From 72660e9a4d683dc6a0c50e9fad96e59b7edd1f71 Mon Sep 17 00:00:00 2001
From: Alexey Suhov
Date: Fri, 12 Apr 2019 18:25:53 +0300
Subject: [PATCH] Publishing 2019 R1 content

---
 README.md | 2 +-
 inference-engine/CMakeLists.txt | 37 +-
 inference-engine/README.md | 78 +-
 inference-engine/cmake/FindlibGNA.cmake | 4 +-
 inference-engine/cmake/check_features.cmake | 23 +-
 inference-engine/cmake/config.cmake.in | 2 +-
 inference-engine/cmake/cpplint.cmake | 162 +
 inference-engine/cmake/cpplint_html.cmake | 30 +
 inference-engine/cmake/cpplint_merge.cmake | 11 +
 inference-engine/cmake/cpplint_run.cmake | 37 +
 .../cmake/cpplint_to_cppcheck_xml.cmake | 12 +
 inference-engine/cmake/debug.cmake | 4 +-
 inference-engine/cmake/dependencies.cmake | 61 +-
 inference-engine/cmake/dependency_solver.cmake | 14 +-
 inference-engine/cmake/download.cmake | 4 +-
 inference-engine/cmake/download_and_apply.cmake | 4 +-
 inference-engine/cmake/download_and_check.cmake | 25 +-
 inference-engine/cmake/download_and_extract.cmake | 13 +-
 inference-engine/cmake/extract.cmake | 8 +-
 inference-engine/cmake/features.cmake | 18 +-
 inference-engine/cmake/ie_parallel.cmake | 8 +-
 inference-engine/cmake/itt.cmake | 2 +-
 inference-engine/cmake/linux_name.cmake | 4 +-
 inference-engine/cmake/mode.cmake | 2 +-
 inference-engine/cmake/options.cmake | 2 +-
 inference-engine/cmake/os_flags.cmake | 11 +-
 inference-engine/cmake/sanitizer.cmake | 8 +-
 inference-engine/cmake/sdl.cmake | 2 +-
 .../share/InferenceEngineConfig-version.cmake.in | 4 +-
 .../cmake/share/InferenceEngineConfig.cmake.in | 67 +-
 inference-engine/cmake/version.cmake | 4 +-
 inference-engine/ie_bridges/python/CMakeLists.txt | 8 +-
 .../ie_bridges/python/cmake/FindCython.cmake | 2 +-
 .../ie_bridges/python/cmake/UseCython.cmake | 2 +-
 .../ie_bridges/python/docs/api_overview.md | 469 +-
 .../python/sample/benchmark_app/README.md | 79 +-
 .../sample/benchmark_app/benchmark/__init__.py | 11 +-
 .../benchmark_app/{ => benchmark}/benchmark.py | 9 +-
 .../benchmark_app/benchmark/utils/__init__.py | 15 +
 .../{ => benchmark}/utils/benchmark_utils.py | 48 +-
 .../{ => benchmark}/utils/constants.py | 24 +-
 .../python/sample/benchmark_app/benchmark_app.py | 37 +
 .../python/sample/classification_sample/README.md | 79 +
 .../classification_sample.py | 63 +-
 .../sample/classification_sample_async/README.md | 89 +
 .../classification_sample_async.py | 58 +-
 .../Greengrass-FaaS-User-Guide.docx | Bin 33240 -> 0 bytes
 .../python/sample/greengrass_samples/README.md | 49 -
 .../greengrass_classification_sample.py | 180 -
 .../greengrass_object_detection_sample_ssd.py | 184 -
 .../classification_demo/classification_demo.ipynb | 463 -
 .../classification_demo/image_net_synset.txt | 1000 -
 .../python/sample/style_transfer_sample/README.md | 74 +
 .../style_transfer_sample.py | 58 +-
 .../ie_bridges/python/sample/voc_labels.txt | 21 -
 inference-engine/ie_bridges/python/setup.py | 10 +-
 .../src/openvino/inference_engine/CMakeLists.txt | 24 +-
 .../inference_engine/dnn_builder/CMakeLists.txt | 37 -
 .../inference_engine/dnn_builder/__init__.py | 2 -
 .../inference_engine/dnn_builder/dnn_builder.pxd | 26 -
 .../inference_engine/dnn_builder/dnn_builder.pyx | 423 -
 .../dnn_builder/dnn_builder_impl.cpp | 330 -
 .../dnn_builder/dnn_builder_impl.hpp | 161 -
 .../dnn_builder/dnn_builder_impl_defs.pxd | 97 -
 .../src/openvino/inference_engine/ie_api.pxd | 1 +
 .../src/openvino/inference_engine/ie_api.pyx | 10 +-
 .../src/openvino/inference_engine/ie_api_impl.cpp | 27 +-
 .../src/openvino/inference_engine/ie_api_impl.hpp | 12 +-
 .../openvino/inference_engine/ie_api_impl_defs.pxd | 9 +-
 .../include/builders/ie_argmax_layer.hpp | 17 +-
 .../builders/ie_batch_normalization_layer.hpp | 36 +-
 .../include/builders/ie_clamp_layer.hpp | 17 +-
 .../include/builders/ie_concat_layer.hpp | 17 +-
 .../include/builders/ie_const_layer.hpp | 23 +-
 .../include/builders/ie_convolution_layer.hpp | 37 +-
 .../include/builders/ie_crop_layer.hpp | 23 +-
 .../builders/ie_ctc_greedy_decoder_layer.hpp | 17 +-
 .../include/builders/ie_deconvolution_layer.hpp | 13 +-
 .../include/builders/ie_detection_output_layer.hpp | 17 +-
 .../include/builders/ie_eltwise_layer.hpp | 23 +-
 inference-engine/include/builders/ie_elu_layer.hpp | 17 +-
 .../include/builders/ie_fully_connected_layer.hpp | 30 +-
 inference-engine/include/builders/ie_grn_layer.hpp | 17 +-
 .../include/builders/ie_gru_sequence_layer.hpp | 87 +
 .../include/builders/ie_input_layer.hpp | 23 +-
 .../include/builders/ie_layer_builder.hpp | 138 +-
 ...e_layer_fragment.hpp => ie_layer_decorator.hpp} | 45 +-
 inference-engine/include/builders/ie_lrn_layer.hpp | 99 +
 .../include/builders/ie_lstm_sequence_layer.hpp | 87 +
 .../include/builders/ie_memory_layer.hpp | 17 +-
 inference-engine/include/builders/ie_mvn_layer.hpp | 17 +-
 .../include/builders/ie_network_builder.hpp | 146 +-
 .../include/builders/ie_norm_layer.hpp | 17 +-
 .../include/builders/ie_normalize_layer.hpp | 17 +-
 .../include/builders/ie_output_layer.hpp | 17 +-
 .../include/builders/ie_permute_layer.hpp | 28 +-
 .../include/builders/ie_pooling_layer.hpp | 23 +-
 .../include/builders/ie_power_layer.hpp | 17 +-
 .../include/builders/ie_prelu_layer.hpp | 23 +-
 .../builders/ie_prior_box_clustered_layer.hpp | 17 +-
 .../include/builders/ie_prior_box_layer.hpp | 17 +-
 .../include/builders/ie_proposal_layer.hpp | 17 +-
 .../include/builders/ie_psroi_pooling_layer.hpp | 17 +-
 .../include/builders/ie_region_yolo_layer.hpp | 17 +-
 .../include/builders/ie_relu6_layer.hpp | 17 +-
 .../include/builders/ie_relu_layer.hpp | 17 +-
 .../include/builders/ie_reorg_yolo_layer.hpp | 17 +-
 .../include/builders/ie_resample_layer.hpp | 126 +
 .../include/builders/ie_reshape_layer.hpp | 17 +-
 .../include/builders/ie_rnn_sequence_layer.hpp | 83 +
 .../include/builders/ie_roi_pooling_layer.hpp | 17 +-
 .../include/builders/ie_scale_shift_layer.hpp | 30 +-
 .../include/builders/ie_sigmoid_layer.hpp | 17 +-
 .../include/builders/ie_simpler_nms_layer.hpp | 17 +-
 .../include/builders/ie_softmax_layer.hpp | 17 +-
 .../include/builders/ie_split_layer.hpp | 17 +-
 .../include/builders/ie_tanh_layer.hpp | 17 +-
 .../include/builders/ie_tile_layer.hpp | 17 +-
 inference-engine/include/cldnn/cldnn_config.hpp | 4 +-
 inference-engine/include/cpp/ie_cnn_net_reader.h | 2 +-
 inference-engine/include/cpp/ie_cnn_network.h | 15 +-
 .../include/cpp/ie_executable_network.hpp | 12 +-
 inference-engine/include/cpp/ie_infer_request.hpp | 2 +-
 inference-engine/include/cpp/ie_memory_state.hpp | 2 +-
 inference-engine/include/cpp/ie_plugin_cpp.hpp | 3 +-
 inference-engine/include/details/caseless.hpp | 2 +-
 .../include/details/ie_blob_iterator.hpp | 2 +-
 .../include/details/ie_cnn_network_iterator.hpp | 2 +-
 .../include/details/ie_cnn_network_tools.h | 2 +-
 inference-engine/include/details/ie_exception.hpp | 17 +-
 .../include/details/ie_exception_conversion.hpp | 2 +-
 .../include/details/ie_inetwork_iterator.hpp | 35 +-
 inference-engine/include/details/ie_irelease.hpp | 2 +-
 inference-engine/include/details/ie_no_copy.hpp | 2 +-
 inference-engine/include/details/ie_no_release.hpp | 2 +-
 .../include/details/ie_pre_allocator.hpp | 2 +-
 inference-engine/include/details/ie_so_loader.h | 2 +-
 inference-engine/include/details/ie_so_pointer.hpp | 2 +-
 .../include/details/os/lin_shared_object_loader.h | 2 +-
 .../include/details/os/win_shared_object_loader.h | 2 +-
 inference-engine/include/gna/gna_config.hpp | 6 +-
 .../include/hetero/hetero_plugin_config.hpp | 4 +-
 inference-engine/include/ie_allocator.hpp | 2 +-
 inference-engine/include/ie_api.h | 2 +-
 inference-engine/include/ie_blob.h | 76 +-
 inference-engine/include/ie_builders.hpp | 7 +-
 inference-engine/include/ie_common.h | 5 +-
 inference-engine/include/ie_context.hpp | 2 +-
 inference-engine/include/ie_data.h | 9 +-
 inference-engine/include/ie_device.hpp | 11 +-
 inference-engine/include/ie_error.hpp | 2 +-
 inference-engine/include/ie_extension.h | 2 +-
 inference-engine/include/ie_icnn_net_reader.h | 2 +-
 inference-engine/include/ie_icnn_network.hpp | 4 +-
 inference-engine/include/ie_icnn_network_stats.hpp | 2 +-
 .../include/ie_iexecutable_network.hpp | 13 +-
 inference-engine/include/ie_iextension.h | 22 +-
 inference-engine/include/ie_ihetero_plugin.hpp | 2 +-
 inference-engine/include/ie_iinfer_request.hpp | 2 +-
 inference-engine/include/ie_imemory_state.hpp | 2 +-
 inference-engine/include/ie_input_info.hpp | 2 +-
 inference-engine/include/ie_layers.h | 588 +-
 inference-engine/include/ie_layers_property.hpp | 9 +-
 inference-engine/include/ie_layouts.h | 7 +-
 inference-engine/include/ie_locked_memory.hpp | 2 +-
 .../include/{ie_inetwork.hpp => ie_network.hpp} | 169 +-
 inference-engine/include/ie_parallel.hpp | 12 +-
 inference-engine/include/ie_parameter.hpp | 429 +-
 inference-engine/include/ie_plugin.hpp | 2 +-
 inference-engine/include/ie_plugin_config.hpp | 2 +-
 inference-engine/include/ie_plugin_dispatcher.hpp | 4 +-
 inference-engine/include/ie_plugin_ptr.hpp | 2 +-
 inference-engine/include/ie_precision.hpp | 22 +-
 inference-engine/include/ie_preprocess.hpp | 2 +-
 inference-engine/include/ie_primitive_info.hpp | 2 +-
 inference-engine/include/ie_tensor_info.hpp | 2 +-
 inference-engine/include/ie_unicode.hpp | 2 +-
 inference-engine/include/ie_utils.hpp | 2 +-
 inference-engine/include/ie_version.hpp | 2 +-
 inference-engine/include/inference_engine.hpp | 2 +-
 inference-engine/include/vpu/vpu_plugin_config.hpp | 213 +
 inference-engine/install_dependencies.sh | 7 +-
 inference-engine/samples/CMakeLists.txt | 105 +-
 .../samples/benchmark_app/CMakeLists.txt | 4 +-
 inference-engine/samples/benchmark_app/README.md | 118 +-
 .../samples/benchmark_app/benchmark_app.h | 133 -
 .../samples/benchmark_app/benchmark_app.hpp | 169 +
 .../samples/benchmark_app/infer_request_wrap.hpp | 64 +
 inference-engine/samples/benchmark_app/main.cpp | 379 +-
 .../samples/benchmark_app/progress_bar.hpp | 41 +
 .../samples/benchmark_app/statistics_report.cpp | 222 +
 .../samples/benchmark_app/statistics_report.hpp | 67 +
 inference-engine/samples/build_samples.sh | 61 -
 .../samples/calibration_tool/CMakeLists.txt | 4 +-
 .../samples/calibration_tool/README.md | 23 +-
 .../calibration_tool/calibrator_processors.cpp | 105 +-
 .../calibration_tool/calibrator_processors.h | 8 +-
 .../samples/calibration_tool/data_stats.cpp | 4 +-
 .../samples/calibration_tool/data_stats.h | 2 +-
 inference-engine/samples/calibration_tool/main.cpp | 37 +-
 .../samples/classification_sample/CMakeLists.txt | 4 +-
 .../samples/classification_sample/README.md | 56 +-
 .../classification_sample/classification_sample.h | 20 +-
 .../samples/classification_sample/main.cpp | 43 +-
 .../classification_sample_async/CMakeLists.txt | 4 +-
 .../samples/classification_sample_async/README.md | 82 +-
 .../classification_sample_async.h | 36 +-
 .../samples/classification_sample_async/main.cpp | 40 +-
 .../samples/common/format_reader/CMakeLists.txt | 18 +-
 .../samples/common/format_reader/MnistUbyte.cpp | 2 +-
 .../samples/common/format_reader/MnistUbyte.h | 4 +-
 .../samples/common/format_reader/bmp.cpp | 2 +-
 .../samples/common/format_reader/bmp.h | 4 +-
 .../samples/common/format_reader/format_reader.cpp | 2 +-
 .../samples/common/format_reader/format_reader.h | 6 +-
 .../common/format_reader/format_reader_ptr.h | 2 +-
 .../samples/common/format_reader/opencv_wraper.cpp | 8 +-
 .../samples/common/format_reader/opencv_wraper.h | 4 +-
 .../samples/common/format_reader/register.h | 2 +-
 .../samples/common/os/windows/w_dirent.h | 6 +-
 .../samples/common/samples/args_helper.hpp | 2 +-
 .../common/samples/classification_results.h | 92 +
 inference-engine/samples/common/samples/common.hpp | 44 +-
 .../samples}/console_progress.hpp | 4 +-
 .../samples}/csv_dumper.hpp | 2 +-
 .../samples/common/samples/ocv_common.hpp | 7 +-
 inference-engine/samples/common/samples/slog.hpp | 2 +-
 .../samples/create_msvc2015_solution.bat | 31 -
 .../samples/create_msvc2017_solution.bat | 31 -
 .../hello_autoresize_classification/CMakeLists.txt | 4 +-
 .../hello_autoresize_classification/README.md | 33 +-
 .../hello_autoresize_classification/main.cpp | 22 +-
 .../samples/hello_classification/CMakeLists.txt | 4 +-
 .../samples/hello_classification/main.cpp | 20 +-
 .../hello_request_classification/CMakeLists.txt | 4 +-
 .../samples/hello_request_classification/README.md | 25 +-
 .../samples/hello_request_classification/main.cpp | 21 +-
 .../samples/hello_shape_infer_ssd/CMakeLists.txt | 3 +-
 .../samples/hello_shape_infer_ssd/README.md | 16 +-
 .../samples/hello_shape_infer_ssd/main.cpp | 7 +-
 .../shape_infer_extension.hpp | 8 +-
 .../lenet_network_graph_builder/CMakeLists.txt | 6 +-
 .../samples/lenet_network_graph_builder/README.md | 38 +-
 .../lenet_network_graph_builder.hpp | 17 +-
 .../samples/lenet_network_graph_builder/main.cpp | 46 +-
 .../object_detection_sample_ssd/CMakeLists.txt | 4 +-
 .../samples/object_detection_sample_ssd/README.md | 53 +-
 .../samples/object_detection_sample_ssd/main.cpp | 43 +-
 .../object_detection_sample_ssd.h | 22 +-
 inference-engine/samples/perfcheck/CMakeLists.txt | 3 +-
 inference-engine/samples/perfcheck/README.md | 18 +-
 inference-engine/samples/perfcheck/main.cpp | 14 +-
 inference-engine/samples/perfcheck/perfcheck.h | 2 +-
 .../samples/speech_sample/CMakeLists.txt | 6 +-
 inference-engine/samples/speech_sample/README.md | 197 +-
 inference-engine/samples/speech_sample/main.cpp | 185 +-
 .../samples/speech_sample/speech_sample.hpp | 22 +-
 .../samples/style_transfer_sample/CMakeLists.txt | 4 +-
 .../samples/style_transfer_sample/README.md | 34 +-
 .../samples/style_transfer_sample/main.cpp | 11 +-
 .../style_transfer_sample/style_transfer_sample.h | 20 +-
 .../samples/validation_app/CMakeLists.txt | 6 +-
 .../validation_app/ClassificationProcessor.cpp | 22 +-
 .../validation_app/ClassificationProcessor.hpp | 2 +-
 .../validation_app/ObjectDetectionProcessor.cpp | 27 +-
 .../validation_app/ObjectDetectionProcessor.hpp | 2 +-
 .../validation_app/PreprocessingOptions.hpp | 2 +-
 .../samples/validation_app/Processor.cpp | 6 +-
 .../samples/validation_app/Processor.hpp | 8 +-
 inference-engine/samples/validation_app/README.md | 13 +-
 .../validation_app/SSDObjectDetectionProcessor.hpp | 13 +-
 .../samples/validation_app/VOCAnnotationParser.cpp | 2 +-
 .../samples/validation_app/VOCAnnotationParser.hpp | 2 +-
 .../YOLOObjectDetectionProcessor.hpp | 13 +-
 .../classification_set_generator.cpp | 5 +-
 .../classification_set_generator.hpp | 2 +-
 .../samples/validation_app/image_decoder.cpp | 8 +-
 .../samples/validation_app/image_decoder.hpp | 2 +-
 inference-engine/samples/validation_app/main.cpp | 11 +-
 .../samples/validation_app/pugixml/pugiconfig.hpp | 2 +-
 .../samples/validation_app/pugixml/pugixml.cpp | 2 +-
 .../samples/validation_app/pugixml/pugixml.hpp | 2 +-
 .../samples/validation_app/user_exception.hpp | 4 +-
 inference-engine/src/CMakeLists.txt | 5 +-
 inference-engine/src/cldnn_engine/CMakeLists.txt | 6 +-
 .../src/cldnn_engine/cldnn_custom_layer.cpp | 7 +-
 .../src/cldnn_engine/cldnn_custom_layer.h | 6 +-
 inference-engine/src/cldnn_engine/cldnn_engine.cpp | 8 +-
 inference-engine/src/cldnn_engine/cldnn_engine.h | 2 +-
 .../ctc_greedy_decoder.cl | 2 +-
 .../cldnn_global_custom_kernels/grn.cl | 2 +-
 .../cldnn_global_custom_kernels/interp.cl | 2 +-
 .../prior_box_clustered.cl | 2 +-
 inference-engine/src/cldnn_engine/cldnn_graph.cpp | 1261 +-
 inference-engine/src/cldnn_engine/cldnn_graph.h | 44 +-
 .../src/cldnn_engine/cldnn_infer_request.cpp | 71 +-
 .../src/cldnn_engine/cldnn_infer_request.h | 5 +-
 .../src/cldnn_engine/debug_options.cpp | 2 +-
 inference-engine/src/cldnn_engine/debug_options.h | 2 +-
 inference-engine/src/cldnn_engine/dllmain.cpp | 2 +-
 inference-engine/src/cldnn_engine/simple_math.cpp | 2 +-
 inference-engine/src/cldnn_engine/simple_math.h | 2 +-
 inference-engine/src/extension/CMakeLists.txt | 19 +-
 inference-engine/src/extension/README.md | 16 +-
 inference-engine/src/extension/cmake/CPUID.cmake | 8 +-
 .../src/extension/cmake/feature_defs.cmake | 4 +-
 inference-engine/src/extension/common/defs.h | 2 +-
 inference-engine/src/extension/common/fast_exp.h | 2 +-
 inference-engine/src/extension/common/matrixmult.h | 31 -
 inference-engine/src/extension/common/opt_exp.h | 2 +-
 inference-engine/src/extension/common/softmax.h | 2 +-
 inference-engine/src/extension/ext_argmax.cpp | 8 +-
 inference-engine/src/extension/ext_base.cpp | 17 +-
 inference-engine/src/extension/ext_base.hpp | 2 +-
 inference-engine/src/extension/ext_ctc_greedy.cpp | 18 +-
 .../src/extension/ext_depth_to_space.cpp | 125 +
 .../src/extension/ext_detectionoutput.cpp | 48 +-
 .../src/extension/ext_detectionoutput_onnx.cpp | 375 +
 inference-engine/src/extension/ext_expand.cpp | 192 +
 inference-engine/src/extension/ext_fill.cpp | 128 +
 inference-engine/src/extension/ext_gather.cpp | 239 +-
 inference-engine/src/extension/ext_grn.cpp | 4 +-
 inference-engine/src/extension/ext_interp.cpp | 2 +-
 inference-engine/src/extension/ext_list.cpp | 6 +-
 inference-engine/src/extension/ext_list.hpp | 2 +-
 inference-engine/src/extension/ext_mvn.cpp | 47 +-
 inference-engine/src/extension/ext_normalize.cpp | 11 +-
 inference-engine/src/extension/ext_pad.cpp | 6 +-
 inference-engine/src/extension/ext_powerfile.cpp | 2 +-
 inference-engine/src/extension/ext_priorbox.cpp | 17 +-
 .../src/extension/ext_priorbox_clustered.cpp | 16 +-
 .../src/extension/ext_priorgridgenerator_onnx.cpp | 97 +
 inference-engine/src/extension/ext_proposal.cpp | 69 +-
 .../src/extension/ext_proposal_onnx.cpp | 442 +
 inference-engine/src/extension/ext_psroi.cpp | 145 +-
 inference-engine/src/extension/ext_range.cpp | 132 +
 inference-engine/src/extension/ext_region_yolo.cpp | 4 +-
 inference-engine/src/extension/ext_reorg_yolo.cpp | 2 +-
 inference-engine/src/extension/ext_resample.cpp | 124 +-
 .../src/extension/ext_reverse_sequence.cpp | 179 +
 .../src/extension/ext_roifeatureextractor_onnx.cpp | 413 +
 .../src/extension/ext_shuffle_channels.cpp | 149 +
 inference-engine/src/extension/ext_simplernms.cpp | 12 +-
 .../src/extension/ext_space_to_depth.cpp | 126 +
 .../src/extension/ext_spatial_transformer.cpp | 155 -
 inference-engine/src/extension/ext_squeeze.cpp | 123 +
 .../src/extension/ext_strided_slice.cpp | 380 +
 .../src/extension/ext_topkrois_onnx.cpp | 78 +
 inference-engine/src/extension/ext_unsqueeze.cpp | 110 +
 inference-engine/src/extension/simple_copy.cpp | 2 +-
 inference-engine/src/extension/simple_copy.h | 2 +-
 inference-engine/src/gna_plugin/CMakeLists.txt | 3 +-
 inference-engine/src/gna_plugin/dnn.cpp | 59 +-
 inference-engine/src/gna_plugin/dnn.h | 5 +-
 inference-engine/src/gna_plugin/dnn_memory.cpp | 4 +-
 inference-engine/src/gna_plugin/dnn_memory.hpp | 3 +-
 inference-engine/src/gna_plugin/dnn_traits.hpp | 4 +-
 inference-engine/src/gna_plugin/floatmath.cpp | 4 +-
 inference-engine/src/gna_plugin/floatmath.h | 2 +-
 inference-engine/src/gna_plugin/gna_allocator.hpp | 2 +-
 .../src/gna_plugin/gna_api_wrapper.hpp | 5 +-
 inference-engine/src/gna_plugin/gna_device.cpp | 2 +-
 inference-engine/src/gna_plugin/gna_device.hpp | 2 +-
 .../src/gna_plugin/gna_executable_network.hpp | 2 +-
 inference-engine/src/gna_plugin/gna_helper.cpp | 4 +-
 .../src/gna_plugin/gna_infer_request.hpp | 7 +-
 inference-engine/src/gna_plugin/gna_layer_info.hpp | 14 +-
 .../src/gna_plugin/gna_mem_requests.hpp | 2 +-
 inference-engine/src/gna_plugin/gna_memory.hpp | 2 +-
 .../src/gna_plugin/gna_memory_state.hpp | 2 +-
 .../src/gna_plugin/gna_model_serial.cpp | 2 +-
 .../src/gna_plugin/gna_model_serial.hpp | 2 +-
 inference-engine/src/gna_plugin/gna_plugin.cpp | 991 +-
 inference-engine/src/gna_plugin/gna_plugin.hpp | 90 +-
 .../src/gna_plugin/gna_plugin_config.hpp | 2 +-
 .../src/gna_plugin/gna_plugin_entry_points.cpp | 4 +-
 .../src/gna_plugin/gna_plugin_internal.hpp | 35 +-
 inference-engine/src/gna_plugin/gna_plugin_log.hpp | 2 +-
 .../src/gna_plugin/gna_plugin_passes.cpp | 298 +-
 .../src/gna_plugin/gna_plugin_policy.hpp | 38 +
 inference-engine/src/gna_plugin/lstm.cpp | 4 +-
 inference-engine/src/gna_plugin/lstm.hpp | 2 +-
 .../src/gna_plugin/polymorh_allocator.hpp | 2 +-
 inference-engine/src/gna_plugin/pwl.h | 2 +-
 inference-engine/src/gna_plugin/pwl_design.cpp | 4 +-
 .../gna_plugin/quantization/layer_quantizer.hpp | 7 +-
 .../gna_plugin/quantization/model_quantizer.hpp | 6 +-
 .../src/gna_plugin/quantization/precision_ex.hpp | 2 +-
 .../src/gna_plugin/quantization/quantization.cpp | 2 +-
 .../src/gna_plugin/quantization/quantization.h | 2 +-
 .../quantization/quantized_layer_params.hpp | 2 +-
 .../gna_plugin/quantization/scale_factor_calc.hpp | 121 +-
 inference-engine/src/gna_plugin/util.cpp | 4 +-
 inference-engine/src/gna_plugin/util.h | 2 +-
 inference-engine/src/hetero_plugin/CMakeLists.txt | 4 +-
 .../src/hetero_plugin/fallback_policy.cpp | 16 +-
 .../src/hetero_plugin/fallback_policy.h | 16 +-
 .../hetero_plugin/hetero_async_infer_request.cpp | 16 +-
 .../src/hetero_plugin/hetero_async_infer_request.h | 16 +-
 .../src/hetero_plugin/hetero_device_loader.cpp | 16 +-
 .../src/hetero_plugin/hetero_device_loader.h | 16 +-
 .../hetero_plugin/hetero_executable_network.cpp | 25 +-
 .../src/hetero_plugin/hetero_executable_network.h | 20 +-
 .../src/hetero_plugin/hetero_infer_request.cpp | 16 +-
 .../src/hetero_plugin/hetero_infer_request.h | 16 +-
 .../src/hetero_plugin/hetero_plugin.cpp | 21 +-
 inference-engine/src/hetero_plugin/hetero_plugin.h | 16 +-
 .../src/hetero_plugin/hetero_plugin_base.hpp | 16 +-
 .../src/inference_engine/CMakeLists.txt | 27 +-
 inference-engine/src/inference_engine/ade_util.cpp | 2 +-
 inference-engine/src/inference_engine/ade_util.hpp | 2 +-
 .../src/inference_engine/blob_factory.cpp | 7 +-
 .../src/inference_engine/blob_factory.hpp | 9 +-
 .../src/inference_engine/blob_transform.cpp | 2 +-
 .../src/inference_engine/blob_transform.hpp | 2 +-
 .../inference_engine/builders/ie_argmax_layer.cpp | 65 +-
 .../builders/ie_batch_normalization_layer.cpp | 67 +-
 .../inference_engine/builders/ie_clamp_layer.cpp | 53 +-
 .../inference_engine/builders/ie_concat_layer.cpp | 80 +-
 .../inference_engine/builders/ie_const_layer.cpp | 41 +-
 .../builders/ie_convolution_layer.cpp | 237 +-
 .../inference_engine/builders/ie_crop_layer.cpp | 87 +-
 .../builders/ie_ctc_greedy_decoder_layer.cpp | 43 +-
 .../builders/ie_deconvolution_layer.cpp | 162 +-
 .../builders/ie_detection_output_layer.cpp | 116 +-
 .../inference_engine/builders/ie_eltwise_layer.cpp | 119 +-
 .../src/inference_engine/builders/ie_elu_layer.cpp | 49 +-
 .../builders/ie_fully_connected_layer.cpp | 52 +-
 .../src/inference_engine/builders/ie_grn_layer.cpp | 35 +-
 .../builders/ie_gru_sequence_layer.cpp | 126 +
 .../builders/ie_input_layer_layer.cpp | 32 +-
 .../inference_engine/builders/ie_layer_builder.cpp | 102 +-
 .../builders/ie_layer_decorator.cpp | 71 +
 .../builders/ie_layer_fragment.cpp | 52 -
 .../src/inference_engine/builders/ie_lrn_layer.cpp | 105 +
 .../builders/ie_lstm_sequence_layer.cpp | 127 +
 .../inference_engine/builders/ie_memory_layer.cpp | 51 +-
 .../src/inference_engine/builders/ie_mvn_layer.cpp | 59 +-
 .../builders/ie_network_builder.cpp | 649 +-
 .../inference_engine/builders/ie_norm_layer.cpp | 69 +-
 .../builders/ie_normalize_layer.cpp | 60 +-
 .../builders/ie_output_layer_layer.cpp | 24 +-
 .../inference_engine/builders/ie_permute_layer.cpp | 41 +-
 .../inference_engine/builders/ie_pooling_layer.cpp | 132 +-
 .../inference_engine/builders/ie_power_layer.cpp | 44 +-
 .../inference_engine/builders/ie_prelu_layer.cpp | 39 +-
 .../builders/ie_prior_box_clustered_layer.cpp | 85 +-
 .../builders/ie_prior_box_layer.cpp | 79 +-
 .../builders/ie_proposal_layer.cpp | 74 +-
 .../builders/ie_psroi_pooling_layer.cpp | 45 +-
 .../builders/ie_region_yolo_layer.cpp | 72 +-
 .../inference_engine/builders/ie_relu6_layer.cpp | 45 +-
 .../inference_engine/builders/ie_relu_layer.cpp | 46 +-
 .../builders/ie_reorg_yolo_layer.cpp | 36 +-
 .../builders/ie_resample_layer.cpp | 95 +
 .../inference_engine/builders/ie_reshape_layer.cpp | 41 +-
 .../builders/ie_rnn_sequence_layer.cpp | 100 +
 .../builders/ie_roi_pooling_layer.cpp | 44 +-
 .../builders/ie_scale_shift_layer.cpp | 39 +-
 .../inference_engine/builders/ie_sigmoid_layer.cpp | 26 +-
 .../builders/ie_simpler_nms_layer.cpp | 65 +-
 .../inference_engine/builders/ie_softmax_layer.cpp | 35 +-
 .../inference_engine/builders/ie_split_layer.cpp | 35 +-
 .../inference_engine/builders/ie_tanh_layer.cpp | 38 +-
 .../inference_engine/builders/ie_tile_layer.cpp | 44 +-
 .../src/inference_engine/cnn_network_impl.cpp | 20 +-
 .../src/inference_engine/cnn_network_impl.hpp | 11 +-
 .../cnn_network_int8_normalizer.cpp | 370 +-
 .../cnn_network_int8_normalizer.hpp | 44 +-
 .../inference_engine/cnn_network_stats_impl.cpp | 2 +-
 .../inference_engine/cnn_network_stats_impl.hpp | 2 +-
 .../base/ie_executable_network_base.hpp | 6 +-
 .../base/ie_infer_async_request_base.hpp | 2 +-
 .../cpp_interfaces/base/ie_memory_state_base.hpp | 2 +-
 .../cpp_interfaces/base/ie_plugin_base.hpp | 2 +-
 .../cpp_interfaces/exception2status.hpp | 2 +-
 .../cpp_interfaces/ie_executor_manager.cpp | 2 +-
 .../cpp_interfaces/ie_executor_manager.hpp | 2 +-
 .../cpp_interfaces/ie_itask_executor.hpp | 2 +-
 .../inference_engine/cpp_interfaces/ie_task.cpp | 2 +-
 .../inference_engine/cpp_interfaces/ie_task.hpp | 2 +-
 .../cpp_interfaces/ie_task_executor.cpp | 2 +-
 .../cpp_interfaces/ie_task_executor.hpp | 2 +-
 .../cpp_interfaces/ie_task_synchronizer.hpp | 2 +-
 .../cpp_interfaces/ie_task_with_stages.cpp | 4 +-
 .../cpp_interfaces/ie_task_with_stages.hpp | 2 +-
 .../impl/ie_executable_network_internal.hpp | 6 +-
 ...e_executable_network_thread_safe_async_only.hpp | 2 +-
 .../ie_executable_network_thread_safe_default.hpp | 2 +-
 .../impl/ie_infer_async_request_internal.hpp | 2 +-
 .../ie_infer_async_request_thread_safe_default.hpp | 2 +-
 ...ie_infer_async_request_thread_safe_internal.hpp | 2 +-
 .../impl/ie_infer_request_internal.hpp | 10 +-
 .../impl/ie_memory_state_internal.hpp | 2 +-
 .../cpp_interfaces/impl/ie_plugin_internal.hpp | 18 +-
 .../interface/ie_iexecutable_network_internal.hpp | 7 +-
 .../interface/ie_iinfer_async_request_internal.hpp | 2 +-
 .../interface/ie_iinfer_request_internal.hpp | 2 +-
 .../interface/ie_imemory_state_internal.hpp | 2 +-
 .../interface/ie_iplugin_internal.hpp | 2 +-
 .../src/inference_engine/cpu_detector.cpp | 2 +-
 .../src/inference_engine/cpu_detector.hpp | 2 +-
 .../cpu_x86_sse42/blob_transform_sse42.cpp | 2 +-
 .../cpu_x86_sse42/blob_transform_sse42.hpp | 2 +-
 .../cpu_x86_sse42/ie_preprocess_data_sse42.cpp | 2 +-
 .../cpu_x86_sse42/ie_preprocess_data_sse42.hpp | 2 +-
 .../ie_preprocess_gapi_kernels_sse42.cpp | 5 +-
 .../ie_preprocess_gapi_kernels_sse42.hpp | 3 +-
 .../src/inference_engine/data_stats.cpp | 2 +-
 inference-engine/src/inference_engine/data_stats.h | 2 +-
 inference-engine/src/inference_engine/debug.h | 3 +-
 .../src/inference_engine/description_buffer.hpp | 2 +-
 inference-engine/src/inference_engine/dll_main.hpp | 4 +-
 .../src/inference_engine/exec_graph_info.hpp | 34 +
 .../src/inference_engine/file_utils.cpp | 2 +-
 inference-engine/src/inference_engine/file_utils.h | 15 +-
 .../src/inference_engine/graph_tools.cpp | 24 +-
 .../src/inference_engine/graph_tools.hpp | 139 +-
 .../src/inference_engine/graph_transformer.cpp | 316 +-
 .../src/inference_engine/graph_transformer.h | 60 +-
 .../src/inference_engine/ie_algorithm.hpp | 10 +-
 .../src/inference_engine/ie_blob_common.cpp | 2 +-
 .../src/inference_engine/ie_blob_proxy.hpp | 2 +-
 .../src/inference_engine/ie_cnn_layer_builder.cpp | 96 +
 .../src/inference_engine/ie_cnn_layer_builder.h | 112 +-
 .../inference_engine/ie_cnn_net_reader_impl.cpp | 4 +-
 .../src/inference_engine/ie_cnn_net_reader_impl.h | 2 +-
 .../src/inference_engine/ie_context.cpp | 2 +-
 inference-engine/src/inference_engine/ie_data.cpp | 12 +-
 .../src/inference_engine/ie_device.cpp | 7 +-
 .../src/inference_engine/ie_format_parser.cpp | 58 +-
 .../src/inference_engine/ie_format_parser.h | 2 +-
 .../src/inference_engine/ie_graph_splitter.cpp | 2 +-
 .../src/inference_engine/ie_graph_splitter.hpp | 2 +-
 .../src/inference_engine/ie_layer_parsers.cpp | 3 +-
 .../src/inference_engine/ie_layer_parsers.h | 2 +-
 .../src/inference_engine/ie_layer_validators.cpp | 1733 +-
 .../src/inference_engine/ie_layer_validators.hpp | 581 +-
 .../src/inference_engine/ie_layers_internal.cpp | 5 +-
 .../src/inference_engine/ie_layers_internal.hpp | 3 +-
 .../src/inference_engine/ie_layers_prv.h | 99 -
 .../src/inference_engine/ie_layouts.cpp | 9 +-
 .../src/inference_engine/ie_memcpy.cpp | 2 +-
 inference-engine/src/inference_engine/ie_memcpy.h | 2 +-
 .../src/inference_engine/ie_network.cpp | 219 +-
 .../src/inference_engine/ie_network.hpp | 160 -
 .../src/inference_engine/ie_preprocess_data.cpp | 40 +-
 .../src/inference_engine/ie_preprocess_data.hpp | 9 +-
 .../src/inference_engine/ie_preprocess_gapi.cpp | 124 +-
 .../src/inference_engine/ie_preprocess_gapi.hpp | 5 +-
 .../ie_preprocess_gapi_kernels.cpp | 3 +-
 .../ie_preprocess_gapi_kernels.hpp | 2 +-
 .../ie_preprocess_gapi_kernels_impl.hpp | 2 +-
 .../src/inference_engine/ie_profiling.hpp | 6 +-
 .../src/inference_engine/ie_util_internal.cpp | 129 +-
 .../src/inference_engine/ie_util_internal.hpp | 3 +-
 inference-engine/src/inference_engine/ie_utils.cpp | 2 +-
 .../src/inference_engine/ie_version.cpp | 4 +-
 .../src/inference_engine/layer_transform.hpp | 19 +-
 .../src/inference_engine/memory_solver.cpp | 22 +-
 .../src/inference_engine/memory_solver.hpp | 20 +-
 inference-engine/src/inference_engine/net_pass.cpp | 1177 +-
 inference-engine/src/inference_engine/net_pass.h | 20 +-
 .../src/inference_engine/network_serializer.cpp | 299 +-
 .../src/inference_engine/network_serializer.h | 8 +-
 inference-engine/src/inference_engine/parsers.h | 2 +-
 .../src/inference_engine/precision_utils.cpp | 34 +-
 .../src/inference_engine/precision_utils.h | 2 +-
 .../src/inference_engine/range_iterator.hpp | 2 +-
 .../shape_infer/built-in/ie_argmax_shape_infer.hpp | 8 +-
 .../built-in/ie_bin_conv_shape_infer.hpp | 80 +
 .../shape_infer/built-in/ie_built_in_holder.cpp | 42 +-
 .../shape_infer/built-in/ie_built_in_holder.hpp | 2 +-
 .../shape_infer/built-in/ie_built_in_impl.hpp | 27 +-
 .../shape_infer/built-in/ie_concat_shape_infer.hpp | 6 +-
 .../shape_infer/built-in/ie_conv_shape_infer.hpp | 72 +-
 .../shape_infer/built-in/ie_crop_shape_infer.hpp | 6 +-
 .../built-in/ie_ctc_greedy_decoder_shape_infer.hpp | 9 +-
 .../shape_infer/built-in/ie_deconv_shape_infer.hpp | 64 +-
 .../built-in/ie_depth_to_space_shape_infer.hpp | 44 +
 .../built-in/ie_detection_output_shape_infer.hpp | 6 +-
 .../built-in/ie_eltwise_shape_infer.hpp | 24 +-
 .../shape_infer/built-in/ie_equal_shape_infer.hpp | 6 +-
 .../shape_infer/built-in/ie_expand_shape_infer.hpp | 39 +
 .../shape_infer/built-in/ie_fill_shape_infer.hpp | 49 +
 .../built-in/ie_flatten_shape_infer.hpp | 12 +-
 .../shape_infer/built-in/ie_gather_shape_infer.hpp | 8 +-
 .../shape_infer/built-in/ie_gemm_shape_infer.hpp | 7 +-
 .../built-in/ie_inner_product_shape_infer.hpp | 6 +-
 .../shape_infer/built-in/ie_interp_shape_infer.hpp | 96 +-
 .../shape_infer/built-in/ie_pad_shape_infer.hpp | 6 +-
 .../built-in/ie_permute_shape_infer.hpp | 6 +-
 .../shape_infer/built-in/ie_pool_shape_infer.hpp | 69 +-
 .../built-in/ie_priorbox_clustered_shape_infer.hpp | 12 +-
 .../built-in/ie_priorbox_shape_infer.hpp | 10 +-
 .../built-in/ie_proposal_shape_infer.hpp | 8 +-
 .../built-in/ie_psroi_pooling_shape_infer.hpp | 6 +-
 ...shape_infer.hpp => ie_quantize_shape_infer.hpp} | 23 +-
 .../shape_infer/built-in/ie_range_shape_infer.hpp | 51 +
 .../built-in/ie_region_yolo_shape_infer.hpp | 43 +-
 .../built-in/ie_reorg_yolo_shape_infer.hpp | 6 +-
 .../built-in/ie_resample_shape_infer.hpp | 26 +-
 .../built-in/ie_reshape_shape_infer.hpp | 50 +-
 .../built-in/ie_reverse_sequence_shape_infer.hpp | 39 +
 .../built-in/ie_rnn_cell_shape_infer.hpp | 46 +
 .../shape_infer/built-in/ie_rnn_shape_infer.hpp | 48 +
 .../built-in/ie_roi_pooling_shape_infer.hpp | 18 +-
 .../shape_infer/built-in/ie_shape_shape_infer.hpp | 36 +
 .../built-in/ie_shuffle_channels_shape_infer.hpp | 39 +
 .../built-in/ie_simpler_nms_shape_infer.hpp | 6 +-
 .../built-in/ie_space_to_depth_shape_infer.hpp | 44 +
 .../shape_infer/built-in/ie_split_shape_infer.hpp | 6 +-
 .../built-in/ie_squeeze_shape_infer.hpp | 115 +
 .../built-in/ie_strided_slice_shape_infer.hpp | 36 +
 .../built-in/ie_tensor_iterator_shape_infer.hpp | 109 +
 .../shape_infer/built-in/ie_tile_shape_infer.hpp | 6 +-
 .../built-in/ie_unsqueeze_shape_infer.hpp | 102 +
 .../built-in/ie_upsampling_shape_infer.hpp | 12 +-
 .../shape_infer/built-in/impl_register.hpp | 2 +-
 .../shape_infer/const_infer/ie_add_const_infer.hpp | 50 +
 .../const_infer/ie_concat_const_infer.hpp | 59 +
 .../const_infer/ie_const_const_infer.hpp | 36 +
 .../const_infer/ie_const_infer_holder.cpp | 80 +
 .../const_infer/ie_const_infer_holder.hpp | 52 +
 .../const_infer/ie_const_infer_impl.cpp | 25 +
 .../const_infer/ie_const_infer_impl.hpp | 64 +
 .../shape_infer/const_infer/ie_div_const_infer.hpp | 51 +
 .../const_infer/ie_fill_const_infer.hpp | 108 +
 .../const_infer/ie_gather_const_infer.hpp | 171 +
 .../const_infer/ie_in_place_const_infer.hpp | 37 +
 .../shape_infer/const_infer/ie_mul_const_infer.hpp | 50 +
 .../const_infer/ie_power_const_infer.hpp | 58 +
 .../const_infer/ie_range_const_infer.hpp | 116 +
 .../const_infer/ie_reshape_const_infer.hpp | 39 +
 .../const_infer/ie_shape_const_infer.hpp | 39 +
 .../const_infer/ie_split_const_infer.hpp | 58 +
 .../const_infer/ie_strided_slice_const_infer.hpp | 384 +
 .../const_infer/ie_tile_const_infer.hpp | 60 +
 .../shape_infer/ie_reshape_io_controllers.cpp | 84 +-
 .../shape_infer/ie_reshape_io_controllers.hpp | 20 +-
 .../shape_infer/ie_reshape_launcher.cpp | 81 +-
 .../shape_infer/ie_reshape_launcher.hpp | 27 +-
 .../inference_engine/shape_infer/ie_reshaper.cpp | 203 +-
 .../inference_engine/shape_infer/ie_reshaper.hpp | 33 +-
 .../src/inference_engine/system_alllocator.cpp | 2 +-
 .../src/inference_engine/system_alllocator.hpp | 2 +-
 .../transform/transform_network.cpp | 353 +
 .../transform/transform_network.hpp | 116 +
 .../inference_engine/transform/transformation.cpp | 20 +
 .../inference_engine/transform/transformation.hpp | 25 +
 .../transformations/eltwise_broadcast.cpp | 68 +
 .../transformations/eltwise_broadcast.hpp | 18 +
 .../transform/transformations/lrn.cpp | 63 +
 .../transform/transformations/lrn.hpp | 18 +
 .../transform/transformations/sub.cpp | 47 +
 .../transform/transformations/sub.hpp | 18 +
 inference-engine/src/inference_engine/w_dirent.h | 2 +-
 inference-engine/src/inference_engine/w_unistd.h | 2 +-
 .../src/inference_engine/xml_parse_utils.cpp | 60 +-
 .../src/inference_engine/xml_parse_utils.h | 3 +-
 inference-engine/src/mkldnn_plugin/CMakeLists.txt | 6 +-
 inference-engine/src/mkldnn_plugin/config.cpp | 2 +-
 inference-engine/src/mkldnn_plugin/config.h | 2 +-
 inference-engine/src/mkldnn_plugin/mean_image.cpp | 21 +-
 inference-engine/src/mkldnn_plugin/mean_image.h | 36 +-
 .../src/mkldnn_plugin/mkldnn/cpu_engine.h | 2 +-
 .../src/mkldnn_plugin/mkldnn/cpu_prim_layer.h | 2 +-
 .../src/mkldnn_plugin/mkldnn/cpu_prim_tensor.h | 2 +-
 .../src/mkldnn_plugin/mkldnn/desc_iterator.hpp | 2 +-
 .../src/mkldnn_plugin/mkldnn/iml_type_mapper.cpp | 2 +-
 .../src/mkldnn_plugin/mkldnn/iml_type_mapper.h | 2 +-
 .../src/mkldnn_plugin/mkldnn/omp_manager.cpp | 2 +-
 .../src/mkldnn_plugin/mkldnn/omp_manager.h | 2 +-
 .../mkldnn/os/lin/lin_omp_manager.cpp | 2 +-
 .../mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.h | 2 +-
 .../mkldnn_plugin/mkldnn_async_infer_request.cpp | 2 +-
 .../src/mkldnn_plugin/mkldnn_async_infer_request.h | 2 +-
 .../src/mkldnn_plugin/mkldnn_descriptor.cpp | 65 +-
 .../src/mkldnn_plugin/mkldnn_descriptor.h | 17 +-
 inference-engine/src/mkldnn_plugin/mkldnn_dims.h | 26 +-
 inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp | 251 +-
 inference-engine/src/mkldnn_plugin/mkldnn_edge.h | 29 +-
 .../src/mkldnn_plugin/mkldnn_extension_mngr.cpp | 2 +-
 .../src/mkldnn_plugin/mkldnn_extension_mngr.h | 2 +-
 .../src/mkldnn_plugin/mkldnn_extension_utils.cpp | 9 +-
 .../src/mkldnn_plugin/mkldnn_extension_utils.h | 2 +-
 .../src/mkldnn_plugin/mkldnn_graph.cpp | 569 +-
 inference-engine/src/mkldnn_plugin/mkldnn_graph.h | 11 +-
 .../src/mkldnn_plugin/mkldnn_graph_dumper.cpp | 63 +-
 .../src/mkldnn_plugin/mkldnn_graph_dumper.h | 16 +-
 .../src/mkldnn_plugin/mkldnn_graph_optimizer.cpp | 308 +-
 .../src/mkldnn_plugin/mkldnn_graph_optimizer.h | 4 +-
 .../src/mkldnn_plugin/mkldnn_infer_request.cpp | 5 +-
 .../src/mkldnn_plugin/mkldnn_infer_request.h | 4 +-
 .../src/mkldnn_plugin/mkldnn_memory.cpp | 38 +-
 inference-engine/src/mkldnn_plugin/mkldnn_memory.h | 9 +-
 inference-engine/src/mkldnn_plugin/mkldnn_node.cpp | 179 +-
 inference-engine/src/mkldnn_plugin/mkldnn_node.h | 43 +-
 .../src/mkldnn_plugin/mkldnn_plugin.cpp | 8 +-
 inference-engine/src/mkldnn_plugin/mkldnn_plugin.h | 2 +-
 .../src/mkldnn_plugin/mkldnn_primitive.cpp | 2 +-
 .../src/mkldnn_plugin/mkldnn_primitive.h | 2 +-
 .../src/mkldnn_plugin/mkldnn_streams.cpp | 7 +-
 .../src/mkldnn_plugin/mkldnn_streams.h | 2 +-
 .../mkldnn_plugin/nodes/mkldnn_activation_node.cpp | 16 +-
 .../mkldnn_plugin/nodes/mkldnn_activation_node.h | 2 +-
 .../mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp | 4 +-
 .../mkldnn_plugin/nodes/mkldnn_batchnorm_node.h | 2 +-
 .../mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp | 461 +
 .../src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.h | 60 +
 .../src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp | 46 +-
 .../src/mkldnn_plugin/nodes/mkldnn_concat_node.h | 3 +-
 .../src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp | 77 +-
 .../src/mkldnn_plugin/nodes/mkldnn_conv_node.h | 20 +-
 .../src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp | 2 +-
 .../src/mkldnn_plugin/nodes/mkldnn_crop_node.h | 2 +-
 .../src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp | 2 +-
 .../src/mkldnn_plugin/nodes/mkldnn_deconv_node.h | 10 +-
 .../mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp | 18 +-
 .../mkldnn_plugin/nodes/mkldnn_depthwise_node.h | 2 +-
 .../mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp | 1865 +-
 .../src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h | 25 +-
 .../nodes/mkldnn_fullyconnected_node.cpp | 131 +-
 .../nodes/mkldnn_fullyconnected_node.h | 7 +-
 .../src/mkldnn_plugin/nodes/mkldnn_gemm_node.cpp | 2 +-
 .../src/mkldnn_plugin/nodes/mkldnn_gemm_node.h | 2 +-
 .../mkldnn_plugin/nodes/mkldnn_generic_node.cpp | 2 +-
 .../src/mkldnn_plugin/nodes/mkldnn_generic_node.h | 2 +-
 .../src/mkldnn_plugin/nodes/mkldnn_input_node.cpp | 2 +-
 .../src/mkldnn_plugin/nodes/mkldnn_input_node.h | 2 +-
 .../src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp | 2 +-
 .../src/mkldnn_plugin/nodes/mkldnn_lrn_node.h | 2 +-
 .../src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp | 2 +-
 .../src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp | 2 +-
 .../mkldnn_plugin/nodes/mkldnn_permute_node.cpp | 38 +-
 .../src/mkldnn_plugin/nodes/mkldnn_permute_node.h | 4 +-
 .../mkldnn_plugin/nodes/mkldnn_pooling_node.cpp | 2 +-
 .../src/mkldnn_plugin/nodes/mkldnn_pooling_node.h | 10 +-
 .../src/mkldnn_plugin/nodes/mkldnn_power_node.cpp | 8 +-
 .../src/mkldnn_plugin/nodes/mkldnn_power_node.h | 2 +-
 .../mkldnn_plugin/nodes/mkldnn_quantize_node.cpp | 229 +
 .../src/mkldnn_plugin/nodes/mkldnn_quantize_node.h | 36 +
 .../mkldnn_plugin/nodes/mkldnn_reorder_node.cpp | 2 +-
 .../src/mkldnn_plugin/nodes/mkldnn_reorder_node.h | 2 +-
 .../mkldnn_plugin/nodes/mkldnn_reshape_node.cpp | 2 +-
 .../src/mkldnn_plugin/nodes/mkldnn_reshape_node.h | 2 +-
 .../src/mkldnn_plugin/nodes/mkldnn_rnn.cpp | 272 +-
 .../src/mkldnn_plugin/nodes/mkldnn_rnn.h | 22 +-
 .../nodes/mkldnn_roi_pooling_node.cpp | 2 +-
 .../mkldnn_plugin/nodes/mkldnn_roi_pooling_node.h | 2 +-
 .../mkldnn_plugin/nodes/mkldnn_softmax_node.cpp | 2 +-
 .../src/mkldnn_plugin/nodes/mkldnn_softmax_node.h | 2 +-
 .../src/mkldnn_plugin/nodes/mkldnn_split_node.cpp | 69 +-
 .../src/mkldnn_plugin/nodes/mkldnn_split_node.h | 2 +-
 .../src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp | 2 +-
 .../src/mkldnn_plugin/nodes/mkldnn_tile_node.h | 2 +-
 inference-engine/src/mkldnn_plugin/perf_count.h | 2 +-
 .../src/mkldnn_plugin/utils/blob_dump.cpp | 16 +-
 .../src/mkldnn_plugin/utils/blob_dump.h | 16 +-
 inference-engine/tests/CMakeLists.txt | 27 +-
 inference-engine/tests/helpers/CMakeLists.txt | 44 +-
 inference-engine/tests/helpers/disable_tests.hpp | 2 +-
 inference-engine/tests/helpers/ir_gen_helper.cpp | 16 +-
 inference-engine/tests/helpers/ir_gen_helper.hpp | 4 +-
 .../tests/helpers/single_layer_common.cpp | 2 +-
 .../tests/helpers/single_layer_common.hpp | 10 +-
 inference-engine/tests/helpers/test_assertions.hpp | 2 +-
 inference-engine/tests/helpers/test_model_path.hpp | 2 +-
 .../tests/helpers/test_model_repo.hpp.in | 16 +-
 .../tests/helpers/test_models_path.cpp | 2 +-
 inference-engine/tests/helpers/tests_common.hpp | 122 +-
 .../tests/helpers/tests_common_func.hpp | 2 +-
 .../tests/helpers/tests_file_utils.cpp | 2 +-
 .../tests/helpers/tests_file_utils.hpp | 2 +-
 inference-engine/tests/helpers/tests_utils.hpp | 2 +-
 inference-engine/tests/helpers/version_printer.cpp | 2 +-
 inference-engine/tests/helpers/xml_father.hpp | 2 +-
 inference-engine/tests/helpers/xml_helper.hpp | 2 +-
 inference-engine/tests/helpers/xml_net_builder.cpp | 4 +-
 inference-engine/tests/helpers/xml_net_builder.hpp | 20 +-
 .../libs/gtest/googlemock/msvc/2005/gmock.sln | 32 +
 .../libs/gtest/googlemock/msvc/2010/gmock.sln | 46 +
 .../libs/gtest/googlemock/msvc/2010/gmock.vcxproj | 145 +
 .../gtest/googlemock/msvc/2010/gmock_main.vcxproj | 151 +
 .../gtest/googlemock/msvc/2010/gmock_test.vcxproj | 176 +
 .../libs/gtest/googlemock/msvc/2015/gmock.sln | 46 +
 .../libs/gtest/googlemock/msvc/2015/gmock.vcxproj | 145 +
 .../gtest/googlemock/msvc/2015/gmock_main.vcxproj | 151 +
 .../gtest/googlemock/msvc/2015/gmock_test.vcxproj | 176 +
 .../libs/gtest/googletest/msvc/2010/gtest-md.sln | 55 +
 .../gtest/googletest/msvc/2010/gtest-md.vcxproj | 149 +
 .../googletest/msvc/2010/gtest-md.vcxproj.filters | 18 +
 .../libs/gtest/googletest/msvc/2010/gtest.sln | 55 +
 .../libs/gtest/googletest/msvc/2010/gtest.vcxproj | 149 +
 .../googletest/msvc/2010/gtest.vcxproj.filters | 18 +
 .../googletest/msvc/2010/gtest_main-md.vcxproj | 154 +
 .../msvc/2010/gtest_main-md.vcxproj.filters | 18 +
 .../gtest/googletest/msvc/2010/gtest_main.vcxproj | 162 +
 .../msvc/2010/gtest_main.vcxproj.filters | 18 +
 .../msvc/2010/gtest_prod_test-md.vcxproj | 199 +
 .../msvc/2010/gtest_prod_test-md.vcxproj.filters | 26 +
 .../googletest/msvc/2010/gtest_prod_test.vcxproj | 191 +
 .../msvc/2010/gtest_prod_test.vcxproj.filters | 26 +
 .../googletest/msvc/2010/gtest_unittest-md.vcxproj | 188 +
 .../msvc/2010/gtest_unittest-md.vcxproj.filters | 18 +
 .../googletest/msvc/2010/gtest_unittest.vcxproj | 180 +
 .../msvc/2010/gtest_unittest.vcxproj.filters | 18 +
 .../googletest/xcode/Config/DebugProject.xcconfig | 30 +
 .../xcode/Config/FrameworkTarget.xcconfig | 17 +
 .../gtest/googletest/xcode/Config/General.xcconfig | 41 +
 .../xcode/Config/ReleaseProject.xcconfig | 32 +
 .../xcode/Config/StaticLibraryTarget.xcconfig | 18 +
 .../googletest/xcode/Config/TestTarget.xcconfig | 8 +
 .../gtest/googletest/xcode/Resources/Info.plist | 30 +
 .../xcode/Samples/FrameworkSample/Info.plist | 28 +
 .../WidgetFramework.xcodeproj/project.pbxproj | 457 +
 .../xcode/Samples/FrameworkSample/runtests.sh | 62 +
 .../xcode/Samples/FrameworkSample/widget.cc | 63 +
 .../xcode/Samples/FrameworkSample/widget.h | 59 +
 .../xcode/Samples/FrameworkSample/widget_test.cc | 68 +
 .../gtest/googletest/xcode/Scripts/runtests.sh | 65 +
 .../googletest/xcode/Scripts/versiongenerate.py | 100 +
 .../xcode/gtest.xcodeproj/project.pbxproj | 1182 +
 inference-engine/tests/mock_engine/CMakeLists.txt | 15 +-
 inference-engine/tests/mock_engine/dllmain.cpp | 2 +-
 inference-engine/tests/mock_engine/mock_plugin.cpp | 2 +-
 inference-engine/tests/mock_engine/mock_plugin.hpp | 2 +-
 .../tests/mock_engine/stub_inference_engine.xpp | 16 +-
 inference-engine/tests/unit/CMakeLists.txt | 99 +-
 .../tests/unit/builders/argmax_layer_test.cpp | 47 +
 .../builders/batch_normalization_layer_test.cpp | 24 +-
 .../tests/unit/builders/builder_test.hpp | 2 +-
 .../tests/unit/builders/clamp_layer_test.cpp | 49 +
 .../tests/unit/builders/concat_layer_test.cpp | 151 +
 .../tests/unit/builders/const_layer_test.cpp | 30 +
 .../tests/unit/builders/convolution_layer_test.cpp | 307 +
 .../tests/unit/builders/crop_layer_test.cpp | 84 +
 .../builders/ctc_greedy_decoder_layer_test.cpp | 42 +
 .../unit/builders/deconvolution_layer_test.cpp | 306 +
 .../unit/builders/detection_output_layer_test.cpp | 117 +
 .../tests/unit/builders/eltwise_layer_test.cpp | 102 +
 .../tests/unit/builders/elu_layer_test.cpp | 41 +
 .../tests/unit/builders/input_layer_test.cpp | 4 +-
 .../tests/unit/builders/mvn_layer_test.cpp | 64 +
 .../tests/unit/builders/network_builder_test.cpp | 469 +-
 .../tests/unit/builders/norm_layer_test.cpp | 64 +
 .../tests/unit/builders/normalize_layer_test.cpp | 89 +
 .../tests/unit/builders/output_layer_test.cpp | 25 +
 .../tests/unit/builders/relu6_layer_test.cpp | 34 +
 .../tests/unit/builders/relu_layer_test.cpp | 41 +
 .../tests/unit/builders/resample_layer_test.cpp | 35 +
 .../tests/unit/builders/split_layer_test.cpp | 83 +
 .../tests/unit/builders/tanh_layer_test.cpp | 31 +
 .../tests/unit/builders/transform_network_test.cpp | 185 +
 .../cnn_network/cnn_layer_validation_tests.cpp | 99 +
 .../unit/cnn_network/cnn_net_reader_impl_test.cpp | 361 +-
 .../unit/cnn_network/cnn_network_impl_test.cpp | 2 +-
 .../tests/unit/cnn_network/layer_builder.h | 150 +
 .../tests/unit/cnn_network/layout_tests.cpp | 2 +-
 .../tests/unit/cnn_network/mean_image.cpp | 2 +-
 .../tests/unit/cnn_network/mean_image.h | 2 +-
 .../tests/unit/cnn_network/parameters.h | 319 +
 .../tests/unit/cnn_network/parser_tests_base.hpp | 6 +-
 inference-engine/tests/unit/cnn_network/shapes.h | 257 +
 .../unit/cnn_network/v2_format_parser_test.cpp | 22 +-
 .../unit/cnn_network/v3_format_parser_test.cpp | 2 +-
 .../tests/unit/cnn_network/xml_father_tests.cpp | 2 +-
 .../unit/engines/gna/I8_quantisation_test.cpp | 16 +-
 .../tests/unit/engines/gna/configuration_test.cpp | 22 +-
 .../unit/engines/gna/fp32_non_quantized_tests.cpp | 208 +
 .../tests/unit/engines/gna/gna_allocator_test.cpp | 16 +-
 .../tests/unit/engines/gna/gna_aminteldnn_test.cpp | 38 +
 .../tests/unit/engines/gna/gna_api_stub.cpp | 22 +-
 .../tests/unit/engines/gna/gna_cppwraper_test.cpp | 27 +
 .../tests/unit/engines/gna/gna_graph_aot_test.cpp | 35 +-
 .../engines/gna/gna_hardware_precision_test.cpp | 16 +-
 .../unit/engines/gna/gna_input_precision_test.cpp | 51 +
 .../tests/unit/engines/gna/gna_matcher.cpp | 129 +-
 .../tests/unit/engines/gna/gna_matcher.hpp | 165 +-
 .../tests/unit/engines/gna/gna_memory_test.cpp | 16 +-
 .../tests/unit/engines/gna/gna_mock_api.hpp | 16 +-
 .../tests/unit/engines/gna/gna_proc_type_test.cpp | 16 +-
 .../tests/unit/engines/gna/gna_pwl_test.cpp | 35 +-
 .../unit/engines/gna/gna_query_state_tests.cpp | 35 +-
 .../unit/engines/gna/i16_quantisation_test.cpp | 196 +-
 .../unit/engines/gna/matchers/conv_matcher.hpp | 34 +-
 .../unit/engines/gna/matchers/copy_matcher.hpp | 34 +-
 .../unit/engines/gna/matchers/diag_matcher.hpp | 34 +-
 .../unit/engines/gna/matchers/fill_with_data.hpp | 74 +
 .../engines/gna/matchers/input_data_matcher.hpp | 69 +
 .../engines/gna/matchers/nnet_base_matcher.hpp | 34 +-
 .../unit/engines/gna/matchers/pool_matcher.hpp | 34 +-
 .../engines/gna/matchers/precision_matcher.hpp | 34 +-
 .../unit/engines/gna/matchers/pwl_matcher.hpp | 34 +-
 .../matchers/pwl_quantization_metrics_matcher.hpp | 34 +-
 .../unit/engines/gna/matchers/weights_matcher.hpp | 212 +
 .../tests/unit/engines/gna/test_irs.cpp | 632 +-
 .../tests/unit/engines/gna/test_irs.hpp | 40 +-
 .../engines/mkldnn/constant_propagation_test.cpp | 2 +-
 .../unit/engines/mkldnn/convert_desc_test.cpp | 2 +-
 .../tests/unit/engines/mkldnn/dump_test.cpp | 16 +-
 .../tests/unit/engines/mkldnn/dumper_test.cpp | 20 +-
 .../layers/extensions/depth_to_space_tests.cpp | 525 +
 .../graph/layers/extensions/expand_tests.cpp | 265 +
 .../mkldnn/graph/layers/extensions/fake_layer.cpp | 6 +-
 .../mkldnn/graph/layers/extensions/fill_tests.cpp | 202 +
 .../graph/layers/extensions/gather_tests.cpp | 31 +-
 .../graph/layers/extensions/graph_generic_test.cpp | 2 +-
 .../graph/layers/extensions/interp_tests.cpp | 4 +-
 .../mkldnn/graph/layers/extensions/mvn_tests.cpp | 2 +-
 .../mkldnn/graph/layers/extensions/range_tests.cpp | 255 +
 .../graph/layers/extensions/resample_tests.cpp | 2 +-
 .../layers/extensions/reverse_sequence_tests.cpp | 273 +
 .../layers/extensions/shuffle_channels_tests.cpp | 213 +
 .../graph/layers/extensions/squeeze_tests.cpp | 244 +
 .../layers/extensions/strided_slice_tests.cpp | 489 +
 .../graph/layers/extensions/unsqueeze_tests.cpp | 235 +
 .../layers/internal/graph_activation_test.cpp | 2 +-
 .../internal/graph_batchnorm_scaleshift_test.cpp | 2 +-
 .../graph/layers/internal/graph_batchnorm_test.cpp | 2 +-
 .../graph/layers/internal/graph_concat_test.cpp | 2 +-
 .../graph/layers/internal/graph_conv_test.cpp | 25 +-
 .../graph/layers/internal/graph_crop_test.cpp | 2 +-
 .../graph/layers/internal/graph_deconv_test.cpp | 195 +-
 .../graph/layers/internal/graph_depthwise_test.cpp | 2 +-
 .../graph/layers/internal/graph_eltwise_test.cpp | 641 +-
 .../layers/internal/graph_fullyconnected_test.cpp | 2 +-
 .../graph/layers/internal/graph_gemm_test.cpp | 2 +-
 .../graph/layers/internal/graph_input_test.cpp | 113 +-
 .../graph/layers/internal/graph_leaks_test.cpp | 2 +-
 .../graph/layers/internal/graph_lrn_test.cpp | 2 +-
 .../graph/layers/internal/graph_permute_test.cpp | 2 +-
 .../graph/layers/internal/graph_pooling_test.cpp | 2 +-
 .../graph/layers/internal/graph_power_test.cpp | 2 +-
 .../graph/layers/internal/graph_relu_test.cpp | 2 +-
 .../graph/layers/internal/graph_reorder_test.cpp | 2 +-
 .../graph/layers/internal/graph_reshape_test.cpp | 2 +-
 .../layers/internal/graph_roi_pooling_test.cpp | 2 +-
 .../layers/internal/graph_simplernms_test.cpp | 2 +-
 .../graph/layers/internal/graph_softmax_test.cpp | 2 +-
 .../graph/layers/internal/graph_split_test.cpp | 2 +-
 .../graph/layers/internal/graph_tile_test.cpp | 2 +-
 .../structure/graph_conv_depthwise_fusing_test.cpp | 2 +-
 .../graph/structure/graph_deconv_concat_tets.cpp | 400 +
 .../graph/structure/graph_dw_conv_fusing_test.cpp | 2 +-
 .../graph/structure/graph_optimization_test.cpp | 2 +-
 .../graph/structure/graph_structure_test.cpp | 334 +-
 .../tests/unit/engines/mkldnn/graph/test_graph.hpp | 11 +-
 .../unit/engines/mkldnn/mkldnn_primitive_test.cpp | 2 +-
 .../tests/unit/engines/mkldnn/test_layers.cpp | 2 +-
 .../tests/unit/graph_tools/graph_copy_tests.cpp | 4 +-
 .../tests/unit/graph_tools/graph_test_base.hpp | 83 +-
 .../tests/unit/graph_tools/graph_tools_test.cpp | 154 +-
 .../unit/inference_engine_tests/alocator_tests.cpp | 8 +-
 .../inference_engine_tests/blob_proxy_test.cpp | 2 +-
 .../unit/inference_engine_tests/blob_test.cpp | 7 +-
 .../inference_engine_tests/caslesseq_tests.cpp | 2 +-
 .../inference_engine_tests/cnn_network_test.cpp | 2 +-
 .../async_infer_request_base_tests.cpp | 2 +-
 .../cpp_interfaces/async_infer_request_tests.cpp | 2 +-
 ...ync_infer_request_thread_safe_default_tests.cpp | 2 +-
 .../async_infer_request_thread_safe_internal.cpp | 2 +-
 .../cpp_interfaces/callback_manager_tests.cpp | 2 +-
 .../executable_network_base_tests.cpp | 2 +-
 .../cpp_interfaces/executable_network_tests.cpp | 2 +-
 ...utable_network_thread_safe_async_only_tests.cpp | 2 +-
 .../executable_network_thread_safe_tests.cpp | 2 +-
 .../cpp_interfaces/executor_manager_tests.cpp | 2 +-
 .../iinference_plugin_internal_tests.cpp | 6 +-
 .../cpp_interfaces/memory_state_tests.cpp | 2 +-
 .../cpp_interfaces/plugin_base_tests.cpp | 6 +-
 .../cpp_interfaces/task_common_tests.cpp | 2 +-
 .../cpp_interfaces/task_executor_tests.cpp | 2 +-
 .../cpp_interfaces/task_synchronizer_tests.cpp | 2 +-
 .../cpp_interfaces/task_tests.cpp | 2 +-
 .../cpp_interfaces/task_tests_utils.hpp | 2 +-
 .../cpp_interfaces/task_with_stages_tests.cpp | 2 +-
 .../unit/inference_engine_tests/data_test.cpp | 2 +-
 .../unit/inference_engine_tests/debug_tests.cpp | 2 +-
 .../unit/inference_engine_tests/device_tests.cpp | 5 +-
 .../unit/inference_engine_tests/exception_test.cpp | 2 +-
 .../inference_engine_plugin_test.cpp | 2 +-
 .../inference_engine_test.cpp | 2 +-
 .../layer_transform_test.cpp | 2 +-
 .../unit/inference_engine_tests/layers_test.cpp | 2 +-
 .../inference_engine_tests/locked_memory_test.cpp | 2 +-
 .../normalization/latest_in_fuse_test.cpp | 163 +
 .../inference_engine_tests/parameter_tests.cpp | 292 +
 .../plugin_dispatcher_tests.cpp | 2 +-
 .../unit/inference_engine_tests/pointer_test.cpp | 2 +-
 .../inference_engine_tests/pre_allocator_test.cpp | 2 +-
 .../unit/inference_engine_tests/precision_test.cpp | 2 +-
 .../inference_engine_tests/preprocess_test.cpp | 2 +-
 .../range_iterator_tests.cpp | 2 +-
 .../response_buffer_test.cpp | 2 +-
 .../shared_object_loader_test.cpp | 2 +-
 .../inference_engine_tests/so_pointer_tests.cpp | 2 +-
 .../inference_engine_tests/tensor_desc_test.cpp | 2 +-
 .../util_const_infer_test.cpp | 830 +
 .../util_const_infer_test.hpp | 86 +
 .../unit/inference_engine_tests/util_test.cpp | 138 +-
 .../unit/inference_engine_tests/util_test.hpp | 121 +
 .../tests/unit/mem_solver/mem_solver_test.cpp | 2 +-
 .../impl/mock_async_infer_request_default.hpp | 2 +-
 .../impl/mock_async_infer_request_internal.hpp | 2 +-
 ...ck_async_infer_request_thread_safe_internal.hpp | 2 +-
 .../impl/mock_executable_network_internal.hpp | 4 +-
 .../mock_executable_thread_safe_async_only.hpp | 2 +-
 .../impl/mock_executable_thread_safe_default.hpp | 2 +-
 .../impl/mock_infer_request_internal.hpp | 2 +-
 .../impl/mock_inference_plugin_internal.hpp | 2 +-
 .../mock_iasync_infer_request_internal.hpp | 2 +-
 .../mock_iexecutable_network_internal.hpp | 3 +-
 .../interface/mock_iinfer_request_internal.hpp | 2 +-
 .../interface/mock_imemory_state_internal.hpp | 2 +-
 .../unit/mocks/cpp_interfaces/mock_plugin_impl.hpp | 2 +-
 .../mocks/cpp_interfaces/mock_task_executor.hpp | 2 +-
 .../cpp_interfaces/mock_task_synchronizer.hpp | 2 +-
 .../tests/unit/mocks/mock_allocator.hpp | 2 +-
 .../tests/unit/mocks/mock_error_listener.hpp | 2 +-
 .../tests/unit/mocks/mock_iasync_infer_request.hpp | 2 +-
 .../tests/unit/mocks/mock_icnn_network.hpp | 2 +-
 .../tests/unit/mocks/mock_iexecutable_network.hpp | 3 +-
 .../tests/unit/mocks/mock_iformat_parser.hpp | 2 +-
 .../tests/unit/mocks/mock_inference_engine.hpp | 2 +-
 .../unit/mocks/mock_not_empty_icnn_network.hpp | 12 +-
 .../tests/unit/mocks/mock_plugin_dispatcher.hpp | 2 +-
 .../mocks/shape_infer/mock_input_controller.hpp | 4 +-
 .../mocks/shape_infer/mock_ishape_infer_impl.hpp | 4 +-
 .../mocks/shape_infer/mock_output_controller.hpp | 2 +-
 .../mocks/shape_infer/mock_reshaper_launcher.hpp | 2 +-
 .../shape_infer/mock_shape_infer_extension.hpp | 2 +-
 .../tests/unit/opencv_test_gapi/CMakeLists.txt | 31 +-
 .../opencv_test_gapi/common/gapi_core_tests.cpp | 2 +-
 .../opencv_test_gapi/common/gapi_core_tests.hpp | 14 +-
 .../common/gapi_core_tests_inl.hpp | 363 +-
 .../opencv_test_gapi/common/gapi_tests_common.hpp | 2 +-
 .../opencv_test_gapi/cpu/gapi_core_tests_fluid.cpp | 48 +-
 .../fluid_test_computations/CMakeLists.txt | 25 +
 .../fluid_test_computations.cpp | 133 +
 .../fluid_test_computations.hpp | 57 +
 .../tests/unit/shape_infer/adult_test.cpp | 648 +
 .../tests/unit/shape_infer/adult_test.hpp | 74 +
 .../tests/unit/shape_infer/adult_test_utils.cpp | 124 +
 .../tests/unit/shape_infer/adult_test_utils.hpp | 137 +
 .../unit/shape_infer/built_in_holder_test.cpp | 2 +-
 .../built_in_shape_infer_batch_test.cpp | 2 +-
 .../shape_infer/built_in_shape_infer_conv_test.cpp | 186 +-
 .../shape_infer/built_in_shape_infer_fake_test.cpp | 2 +-
 .../built_in_shape_infer_general_test.cpp | 300 +-
 .../built_in_shape_infer_general_test.hpp | 190 +-
 .../shape_infer/built_in_shape_infer_pool_test.cpp | 101 +-
 .../cpu_ext_shape_infer_general_test.cpp | 71 -
 .../unit/shape_infer/input_controller_test.cpp | 10 +-
 .../shape_infer/input_reshape_launcher_test.cpp | 2 +-
 .../unit/shape_infer/output_controller_test.cpp | 2 +-
 .../unit/shape_infer/reshape_launcher_test.cpp | 25 +-
 .../tests/unit/shape_infer/reshaper_test.cpp | 2 +-
 .../tests/unit/stress_tests/stress_tests.cpp | 4 +-
 .../v2_topology_verification_test.cpp | 2 +-
 .../transformations/eltwise_broadcast_test.cpp | 63 +
 .../tests/unit/transformations/sub_test.cpp | 39 +
 .../unit/transformations/tranformations_test.hpp | 13 +
 .../tests/validation_app/CMakeLists.txt | 62 +
 inference-engine/thirdparty/CMakeLists.txt | 10 +-
 inference-engine/thirdparty/clDNN/.gitignore | 7 +
 inference-engine/thirdparty/clDNN/CMakeLists.txt | 55 +-
 inference-engine/thirdparty/clDNN/README.md | 141 +-
 .../thirdparty/clDNN/api/C/batch_norm.h | 6 +-
inference-engine/thirdparty/clDNN/api/C/border.h | 11 +- .../thirdparty/clDNN/api/C/broadcast.h | 48 +- inference-engine/thirdparty/clDNN/api/C/cldnn.h | 61 +- .../thirdparty/clDNN/api/C/condition.h | 70 + inference-engine/thirdparty/clDNN/api/C/contract.h | 89 + .../thirdparty/clDNN/api/C/convolution.h | 6 + .../clDNN/api/C/convolution_grad_weights.h | 4 + inference-engine/thirdparty/clDNN/api/C/crop.h | 22 +- .../thirdparty/clDNN/api/C/deconvolution.h | 2 + .../thirdparty/clDNN/api/C/depth_to_space.h | 49 + .../thirdparty/clDNN/api/C/detection_output.h | 8 +- .../thirdparty/clDNN/api/C/detection_output_sort.h | 60 + inference-engine/thirdparty/clDNN/api/C/eltwise.h | 30 +- inference-engine/thirdparty/clDNN/api/C/gather.h | 58 + inference-engine/thirdparty/clDNN/api/C/gemm.h | 9 - .../thirdparty/clDNN/api/C/index_select.h | 18 +- inference-engine/thirdparty/clDNN/api/C/lstm.h | 43 +- inference-engine/thirdparty/clDNN/api/C/one_hot.h | 71 + inference-engine/thirdparty/clDNN/api/C/pooling.h | 2 + inference-engine/thirdparty/clDNN/api/C/proposal.h | 3 + .../C/pyramid_roi_align.h} | 34 +- inference-engine/thirdparty/clDNN/api/C/reorder.h | 2 - .../thirdparty/clDNN/api/C/reverse_sequence.h | 51 + .../thirdparty/clDNN/api/C/roi_pooling.h | 13 +- .../thirdparty/clDNN/api/C/shuffle_channels.h | 51 + .../thirdparty/clDNN/api/C/strided_slice.h | 55 + .../thirdparty/clDNN/api/CPP/batch_norm.hpp | 122 +- .../thirdparty/clDNN/api/CPP/border.hpp | 39 +- .../thirdparty/clDNN/api/CPP/broadcast.hpp | 64 +- .../thirdparty/clDNN/api/CPP/cldnn_defs.h | 32 + .../thirdparty/clDNN/api/CPP/condition.hpp | 119 + .../thirdparty/clDNN/api/CPP/contract.hpp | 119 + .../thirdparty/clDNN/api/CPP/convolution.hpp | 389 +- .../clDNN/api/CPP/convolution_grad_weights.hpp | 46 + inference-engine/thirdparty/clDNN/api/CPP/crop.hpp | 76 +- .../thirdparty/clDNN/api/CPP/deconvolution.hpp | 123 +- .../thirdparty/clDNN/api/CPP/depth_to_space.hpp | 72 + .../thirdparty/clDNN/api/CPP/detection_output.hpp | 94 +- .../thirdparty/clDNN/api/CPP/eltwise.hpp | 75 +- .../thirdparty/clDNN/api/CPP/embed.hpp | 13 + .../thirdparty/clDNN/api/CPP/engine.hpp | 20 +- .../thirdparty/clDNN/api/CPP/gather.hpp | 88 + inference-engine/thirdparty/clDNN/api/CPP/gemm.hpp | 9 +- .../thirdparty/clDNN/api/CPP/index_select.hpp | 63 +- .../thirdparty/clDNN/api/CPP/layout.hpp | 80 +- inference-engine/thirdparty/clDNN/api/CPP/lstm.hpp | 28 +- .../thirdparty/clDNN/api/CPP/one_hot.hpp | 103 + .../thirdparty/clDNN/api/CPP/pooling.hpp | 30 +- .../thirdparty/clDNN/api/CPP/primitive.hpp | 40 +- .../thirdparty/clDNN/api/CPP/prior_box.hpp | 3 + .../thirdparty/clDNN/api/CPP/program.hpp | 27 +- .../thirdparty/clDNN/api/CPP/proposal.hpp | 32 +- .../thirdparty/clDNN/api/CPP/pyramid_roi_align.hpp | 64 + .../thirdparty/clDNN/api/CPP/reorder.hpp | 16 +- .../thirdparty/clDNN/api/CPP/reshape.hpp | 2 + .../thirdparty/clDNN/api/CPP/reverse_sequence.hpp | 100 + .../thirdparty/clDNN/api/CPP/roi_pooling.hpp | 27 +- .../thirdparty/clDNN/api/CPP/shuffle_channels.hpp | 79 + .../thirdparty/clDNN/api/CPP/split.hpp | 29 - .../thirdparty/clDNN/api/CPP/strided_slice.hpp | 99 + .../thirdparty/clDNN/api/CPP/tensor.hpp | 184 +- .../thirdparty/clDNN/api/CPP/topology.hpp | 12 +- .../clDNN/api_extension/C/fused_conv_bn_scale.h | 73 + .../clDNN/api_extension/C/fused_conv_eltwise.h | 104 + .../api_extension/CPP/fused_conv_bn_scale.hpp | 170 + .../clDNN/api_extension/CPP/fused_conv_eltwise.hpp | 262 + .../include/boost-1_64/boost/make_unique.hpp | 13 + .../boost-1_64/boost/smart_ptr/make_unique.hpp | 
 .../thirdparty/clDNN/create_msvc_mscc.bat | 2 +-
 .../clDNN/kernel_selector/CMakeLists.txt | 21 +-
 .../clDNN/kernel_selector/common/common_tools.h | 1 +
 .../clDNN/kernel_selector/common/common_types.h | 70 +-
 .../clDNN/kernel_selector/common/tensor_type.cpp | 137 +-
 .../clDNN/kernel_selector/common/tensor_type.h | 22 +-
 .../activation/activation_kernel_base.cpp | 2 +-
 .../activation/activation_kernel_opt.h | 4 +-
 .../activation/activation_kernel_ref.h | 4 +-
 .../activation/activation_kernel_tutorial.h | 4 +-
 .../arg_max_min/arg_max_min_kernel_axis.h | 4 +-
 .../arg_max_min/arg_max_min_kernel_gpu_ref.h | 4 +-
 .../arg_max_min/arg_max_min_kernel_opt.h | 4 +-
 .../average_unpooling_kernel_gpu_ref.h | 4 +-
 .../batch_norm/batch_norm_kernel_base.cpp | 6 +-
 .../batch_norm/batch_norm_kernel_base.h | 2 +
 .../batch_norm/batch_norm_kernel_ref.h | 4 +-
 .../batch_norm_grad/batch_norm_grad_kernel_ref.h | 4 +-
 .../actual_kernels/border/border_kernel_base.h | 4 +-
 .../actual_kernels/border/border_kernel_ref.cpp | 2 +-
 .../core/actual_kernels/border/border_kernel_ref.h | 2 +
 .../broadcast/broadcast_kernel_base.cpp | 6 +-
 .../broadcast/broadcast_kernel_base.h | 2 +
 .../broadcast/broadcast_kernel_ref.cpp | 10 +-
 .../broadcast/broadcast_kernel_ref.h | 2 +
 .../concatenation/concatenation_kernel_base.h | 4 +-
 .../concatenation_kernel_depth_bfyx_no_pitch.h | 6 +-
 .../concatenation/concatenation_kernel_ref.h | 4 +-
 .../contract/contract_kernel_base.cpp | 138 +
 .../actual_kernels/contract/contract_kernel_base.h | 63 +
 .../contract/contract_kernel_ref.cpp | 53 +
 .../actual_kernels/contract/contract_kernel_ref.h | 30 +
 .../contract/contract_kernel_selector.cpp | 30 +
 .../contract/contract_kernel_selector.h | 34 +
 .../convolution_kernel_1x1_gemm_MMAD.cpp | 5 +-
 .../convolution/convolution_kernel_1x1_gemm_MMAD.h | 4 +-
 .../convolution/convolution_kernel_MMAD.cpp | 7 +-
 .../convolution/convolution_kernel_MMAD.h | 4 +-
 .../convolution/convolution_kernel_MMAD_blocks.cpp | 25 +-
 .../convolution/convolution_kernel_MMAD_blocks.h | 5 +-
 .../convolution/convolution_kernel_base.cpp | 129 +-
 .../convolution/convolution_kernel_base.h | 12 +-
 .../convolution/convolution_kernel_bfyx_1x1.cpp | 3 +-
 .../convolution/convolution_kernel_bfyx_1x1.h | 4 +-
 .../convolution_kernel_bfyx_1x1_gemm_buf.cpp | 3 +-
 .../convolution_kernel_bfyx_1x1_gemm_buf.h | 4 +-
 .../convolution_kernel_bfyx_1x1_opt.cpp | 173 +
 .../convolution/convolution_kernel_bfyx_1x1_opt.h | 40 +
 .../convolution_kernel_bfyx_3x3_dw_opt.cpp | 5 +-
 .../convolution_kernel_bfyx_3x3_dw_opt.h | 6 +-
 ...nvolution_kernel_bfyx_depthwise_weights_lwg.cpp | 11 +-
 ...convolution_kernel_bfyx_depthwise_weights_lwg.h | 6 +-
 .../convolution_kernel_bfyx_direct_10_12_16.cpp | 4 +-
 .../convolution_kernel_bfyx_direct_10_12_16.h | 4 +-
 .../convolution_kernel_bfyx_gemm_like.cpp | 5 +-
 .../convolution_kernel_bfyx_gemm_like.h | 4 +-
 .../convolution_kernel_bfyx_os_iyx_osv16.cpp | 35 +-
 .../convolution_kernel_bfyx_os_iyx_osv16.h | 7 +-
 .../convolution_kernel_bfyx_os_iyx_osv16_2_sg.cpp | 299 +
 .../convolution_kernel_bfyx_os_iyx_osv16_2_sg.h | 54 +
 .../convolution/convolution_kernel_bfyx_ref.cpp | 5 +-
 .../convolution/convolution_kernel_bfyx_ref.h | 5 +-
 ...olution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.cpp | 81 +
 ...nvolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h | 43 +
 .../convolution_kernel_byxf_af32_depthwise.cpp | 5 +-
 .../convolution_kernel_byxf_af32_depthwise.h | 4 +-
 ...convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.cpp | 62 +
 .../convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h | 41 +
 .../convolution/convolution_kernel_imad_1x1.cpp | 37 +
 .../convolution/convolution_kernel_imad_1x1.h | 37 +
 .../convolution/convolution_kernel_imad_3x3.cpp | 305 +
 .../convolution/convolution_kernel_imad_3x3.h | 58 +
 .../convolution/convolution_kernel_imad_7x7.cpp | 37 +
 .../convolution/convolution_kernel_imad_7x7.h | 37 +
 ...tion_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp | 187 +
 ...lution_kernel_mmad_32x32sg_128x128wg_slm_int8.h | 45 +
 ...tion_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp | 187 +
 ...lution_kernel_mmad_32x32sg_224x128wg_slm_int8.h | 45 +
 .../convolution_kernel_mmad_32x32sg_slm_int8.cpp | 184 +
 .../convolution_kernel_mmad_32x32sg_slm_int8.h | 45 +
 .../convolution_kernel_mmad_batched.cpp | 7 +-
 .../convolution/convolution_kernel_mmad_batched.h | 4 +-
 .../convolution_kernel_mmad_batched_block.cpp | 157 +
 .../convolution_kernel_mmad_batched_block.h | 39 +
 .../convolution_kernel_mmad_batched_block_1x1.cpp | 159 +
 .../convolution_kernel_mmad_batched_block_1x1.h | 39 +
 .../convolution_kernel_mmad_slm_2x14_rep4.cpp | 121 +
 .../convolution_kernel_mmad_slm_2x14_rep4.h | 43 +
 .../convolution_kernel_mmad_slm_7x7_rep4.cpp | 129 +
 .../convolution_kernel_mmad_slm_7x7_rep4.h | 43 +
 .../convolution/convolution_kernel_selector.cpp | 37 +-
 .../convolution/convolution_kernel_tutorial.cpp | 3 +-
 .../convolution/convolution_kernel_tutorial.h | 4 +-
 .../convolution_kernel_winograd_2x3_s1.cpp | 4 +-
 .../convolution_kernel_winograd_2x3_s1.h | 4 +-
 .../convolution_kernel_winograd_2x3_s1_fused.cpp | 3 +-
 .../convolution_kernel_winograd_2x3_s1_fused.h | 4 +-
 .../convolution_kernel_winograd_6x3_s1_fused.cpp | 2 +-
 .../convolution_kernel_winograd_6x3_s1_fused.h | 4 +-
 .../convolution/convolution_kernel_yxfb_ref.cpp | 3 +-
 .../convolution/convolution_kernel_yxfb_ref.h | 6 +-
 .../convolution_kernel_yxfb_yxio_b16.cpp | 3 +-
 .../convolution/convolution_kernel_yxfb_yxio_b16.h | 6 +-
 .../convolution_kernel_yxfb_yxio_b1_block.cpp | 4 +-
 .../convolution_kernel_yxfb_yxio_b1_block.h | 6 +-
 ...lution_kernel_yxfb_yxio_b1_block_multiple_x.cpp | 4 +-
 ...volution_kernel_yxfb_yxio_b1_block_multiple_x.h | 6 +-
 .../convolution_kernel_yxfb_yxio_b8.cpp | 4 +-
 .../convolution/convolution_kernel_yxfb_yxio_b8.h | 6 +-
 .../convolution/convolution_params.cpp | 12 +-
 .../convolution/convolution_params.h | 4 +-
 .../convolution_grad_weights_kernel_1x1.h | 4 +-
 .../convolution_grad_weights_kernel_3x3.h | 4 +-
 .../convolution_grad_weights_kernel_7x7.h | 4 +-
 .../convolution_grad_weights_kernel_base.cpp | 5 +-
 .../convolution_grad_weights_kernel_base.h | 5 +-
 .../convolution_grad_weights_kernel_ref.h | 4 +-
 .../convolution_grad_weights_kernel_yxfb.h | 4 +-
 .../deconvolution/deconvolution_kernel_base.cpp | 7 +-
 .../deconvolution/deconvolution_kernel_base.h | 10 +-
 .../deconvolution/deconvolution_kernel_bfyx_opt.h | 5 +-
 .../deconvolution/deconvolution_kernel_ref.cpp | 1 +
 .../deconvolution/deconvolution_kernel_ref.h | 5 +-
 .../depth_to_space/depth_to_space_kernel_ref.cpp | 85 +
 .../depth_to_space/depth_to_space_kernel_ref.h | 56 +
 .../depth_to_space_kernel_selector.cpp | 31 +
 .../depth_to_space_kernel_selector.h | 37 +
 .../detection_output_kernel_base.cpp | 67 +
 .../detection_output_kernel_base.h | 87 +
 .../detection_output_kernel_ref.cpp | 95 +
 .../detection_output/detection_output_kernel_ref.h | 37 +
 .../detection_output_kernel_selector.cpp | 42 +
 .../detection_output_kernel_selector.h | 52 +
 .../detection_output_kernel_sort.cpp | 89 +
 .../detection_output_kernel_sort.h | 37 +
 .../eltwise/eltwise_kernel_b_fs_yx_fsv4.cpp | 301 +
 .../eltwise/eltwise_kernel_b_fs_yx_fsv4.h | 37 +
 .../actual_kernels/eltwise/eltwise_kernel_base.cpp | 131 +-
 .../actual_kernels/eltwise/eltwise_kernel_base.h | 6 +-
 .../eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.cpp | 91 +-
 .../eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.h | 4 +-
 .../actual_kernels/eltwise/eltwise_kernel_ref.cpp | 11 +-
 .../actual_kernels/eltwise/eltwise_kernel_ref.h | 4 +-
 .../eltwise/eltwise_kernel_selector.cpp | 4 +-
 .../eltwise/eltwise_kernel_vload8.cpp | 2 +-
 .../actual_kernels/eltwise/eltwise_kernel_vload8.h | 4 +-
 .../core/actual_kernels/embed/embed_kernel_ref.cpp | 5 +-
 .../core/actual_kernels/embed/embed_kernel_ref.h | 4 +-
 .../core/actual_kernels/embed/embed_params.h | 17 +
 .../fully_connected_block_kernel_base.cpp | 3 +-
 .../fully_connected_kernel_MMAD.cpp | 34 +-
 .../fully_connected/fully_connected_kernel_MMAD.h | 8 +-
 .../fully_connected_kernel_base.cpp | 46 +-
 .../fully_connected/fully_connected_kernel_base.h | 25 +-
 .../fully_connected_kernel_bf_io_gemm.cpp | 33 +-
 .../fully_connected_kernel_bf_io_gemm.h | 10 +-
 .../fully_connected_kernel_bf_io_input_spatial.cpp | 48 +-
 .../fully_connected_kernel_bf_io_input_spatial.h | 6 +-
 .../fully_connected_kernel_bf_io_ref.cpp | 14 +-
 .../fully_connected_kernel_bf_io_ref.h | 4 +-
 .../fully_connected_kernel_bfyx_ref.cpp | 33 +-
 .../fully_connected_kernel_bfyx_ref.h | 12 +-
 .../fully_connected_kernel_bs_f_bsv16_af8.cpp | 27 +-
 .../fully_connected_kernel_bs_f_bsv16_af8.h | 8 +-
 .../fully_connected_kernel_bs_f_bsv16_b1.cpp | 48 +-
 .../fully_connected_kernel_bs_f_bsv16_b1.h | 27 +-
 .../fully_connected_kernel_bs_f_bsv8_af8.cpp | 38 +-
 .../fully_connected_kernel_bs_f_bsv8_af8.h | 8 +-
 .../fully_connected_kernel_fb_io_b8_f8.cpp | 36 +-
 .../fully_connected_kernel_fb_io_b8_f8.h | 6 +-
 .../fully_connected_kernel_fb_io_block.cpp | 71 +-
 .../fully_connected_kernel_fb_io_block.h | 24 +-
 .../fully_connected_kernel_fb_io_ref.cpp | 13 +-
 .../fully_connected_kernel_fb_io_ref.h | 4 +-
 .../fully_connected_kernel_fb_oi_b8_ref.cpp | 26 +-
 .../fully_connected_kernel_fb_oi_b8_ref.h | 6 +-
 .../fully_connected_kernel_fb_oi_ref.cpp | 12 +-
 .../fully_connected_kernel_fb_oi_ref.h | 4 +-
 .../fully_connected_kernel_imad.cpp | 116 +
 .../fully_connected/fully_connected_kernel_imad.h | 37 +
 .../fully_connected_kernel_image_tutorial.cpp | 34 +-
 .../fully_connected_kernel_image_tutorial.h | 12 +-
 .../fully_connected_kernel_mmad_batched.cpp | 34 +-
 .../fully_connected_kernel_mmad_batched.h | 8 +-
 .../fully_connected_kernel_selector.cpp | 6 +-
 .../fully_connected_kernel_yxfb_ref.cpp | 15 +-
 .../fully_connected_kernel_yxfb_ref.h | 4 +-
 .../fully_connected_grad_input_kernel_base.cpp | 2 +-
 .../fully_connected_grad_input_kernel_ref.h | 4 +-
 .../fully_connected_grad_weights_kernel_base.cpp | 2 +-
 .../fully_connected_grad_weights_kernel_ref.h | 3 +-
 .../fused_conv_bn_scale_kernel_base.cpp | 176 +
 .../fused_conv_bn_scale_kernel_base.h | 81 +
 .../fused_conv_bn_scale_kernel_ref.cpp | 74 +
 .../fused_conv_bn_scale_kernel_ref.h | 44 +
 .../fused_conv_bn_scale_kernel_selector.cpp} | 18 +-
 .../fused_conv_bn_scale_kernel_selector.h | 37 +
 .../fused_conv_eltwise_kernel_base.cpp | 464 +
 .../fused_conv_eltwise_kernel_base.h | 138 +
 .../fused_conv_eltwise_kernel_bfyx_1x1_opt.cpp | 194 +
 .../fused_conv_eltwise_kernel_bfyx_1x1_opt.h | 42 +
 ...fused_conv_eltwise_kernel_bfyx_os_iyx_osv16.cpp | 303 +
 .../fused_conv_eltwise_kernel_bfyx_os_iyx_osv16.h | 54 +
 .../fused_conv_eltwise_kernel_gemm.cpp | 164 +
 .../fused_conv_eltwise_kernel_gemm.h | 42 +
 ...wise_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp | 224 +
 ...ltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.h | 45 +
 ...wise_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp | 224 +
 ...ltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h | 45 +
 .../fused_conv_eltwise_kernel_selector.cpp | 41 +
 .../fused_conv_eltwise_kernel_selector.h | 37 +
 .../fused_conv_eltwise_kernel_yxfb_yxio_b16.cpp | 224 +
 .../fused_conv_eltwise_kernel_yxfb_yxio_b16.h | 40 +
 .../actual_kernels/gather/gather_kernel_ref.cpp | 144 +
 .../core/actual_kernels/gather/gather_kernel_ref.h | 56 +
 .../gather/gather_kernel_selector.cpp | 31 +
 .../actual_kernels/gather/gather_kernel_selector.h | 37 +
 .../core/actual_kernels/gemm/gemm_kernel_base.cpp | 2 +-
 .../core/actual_kernels/gemm/gemm_kernel_ref.h | 2 +
 .../index_select/index_select_kernel_base.cpp | 93 +-
 .../index_select/index_select_kernel_base.h | 3 +-
 .../index_select/index_select_kernel_ref.h | 2 +
 .../lookup_table/lookup_table_kernel_axis.h | 4 +-
 .../lookup_table/lookup_table_kernel_ref.h | 4 +-
 .../lrn_kernel_across_channel_multiple_features.h | 4 +-
 .../lrn/lrn_kernel_across_channel_opt_b8.h | 4 +-
 .../lrn/lrn_kernel_across_channel_ref.h | 4 +-
 .../core/actual_kernels/lrn/lrn_kernel_ref.h | 2 +
 .../lrn/lrn_kernel_within_channel_byxf_opt.h | 2 +-
 .../lrn/lrn_kernel_within_channel_ref.h | 4 +-
 .../lrn/lrn_kernel_within_channel_ref_opt.h | 4 +-
 .../actual_kernels/lstm/lstm_elt_kernel_base.cpp | 7 +-
 .../actual_kernels/lstm/lstm_elt_kernel_base.h | 10 +-
 .../core/actual_kernels/lstm/lstm_elt_kernel_ref.h | 4 +-
 .../actual_kernels/lstm/lstm_gemm_kernel_base.cpp | 10 +-
 .../actual_kernels/lstm/lstm_gemm_kernel_base.h | 2 +
 .../actual_kernels/lstm/lstm_gemm_kernel_ref.h | 4 +-
 .../lstm/lstm_gemm_kernel_selector.cpp | 4 +
 .../lstm_gemv_gpu_subgroup1x64_bfyx_ff_SIMD16.cpp | 62 +
 .../lstm_gemv_gpu_subgroup1x64_bfyx_ff_SIMD16.h | 32 +
 .../lstm_gemv_gpu_subgroup1x64_bfyx_hh_SIMD16.cpp | 62 +
 .../lstm_gemv_gpu_subgroup1x64_bfyx_hh_SIMD16.h | 32 +
 .../max_unpooling/max_unpooling_kernel_gpu_ref.h | 4 +-
 .../core/actual_kernels/mvn/mvn_kernel_bfyx_opt.h | 4 +-
 .../core/actual_kernels/mvn/mvn_kernel_ref.h | 2 +-
 .../normalize_kernel_across_spatial_ref.h | 2 +
 .../normalize_kernel_within_spatial_ref.h | 4 +-
 .../actual_kernels/one_hot/one_hot_kernel_base.cpp | 76 +
 .../actual_kernels/one_hot/one_hot_kernel_base.h | 63 +
 .../actual_kernels/one_hot/one_hot_kernel_ref.cpp | 49 +
 .../actual_kernels/one_hot/one_hot_kernel_ref.h | 32 +
 .../one_hot/one_hot_kernel_selector.cpp | 30 +
 .../one_hot/one_hot_kernel_selector.h | 34 +
 .../actual_kernels/permute/permute_kernel_ref.cpp | 26 +-
 .../actual_kernels/permute/permute_kernel_ref.h | 4 +-
 .../pooling/pooling_kernel_gpu_average_opt.cpp | 2 +-
 .../pooling/pooling_kernel_gpu_average_opt.h | 4 +-
 .../pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp | 77 +
 .../pooling/pooling_kernel_gpu_b_fs_yx_fsv4.h | 36 +
 .../pooling/pooling_kernel_gpu_bfyx_block_opt.h | 4 +-
 .../pooling/pooling_kernel_gpu_byxf_af32.h | 6 +-
 .../pooling/pooling_kernel_gpu_byxf_opt.h | 6 +-
 .../pooling/pooling_kernel_gpu_byxf_padding_opt.h | 4 +-
 .../pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h | 4 +-
 .../pooling/pooling_kernel_gpu_int8_ref.h | 4 +-
 .../pooling/pooling_kernel_gpu_ref.h | 4 +-
 .../pooling/pooling_kernel_selector.cpp | 4 +-
 .../pyramid_roi_align_kernel_base.cpp | 67 +
 .../pyramid_roi_align_kernel_base.h | 57 +
 .../pyramid_roi_align_kernel_ref.cpp | 40 +
 .../pyramid_roi_align_kernel_ref.h | 29 +
 .../pyramid_roi_align_kernel_selector.cpp | 28 +
 .../pyramid_roi_align_kernel_selector.h | 31 +
 .../region_yolo/region_yolo_kernel_ref.h | 2 +-
 .../reorder/reorder_from_winograd_2x3_kernel.h | 6 +-
 .../core/actual_kernels/reorder/reorder_kernel.h | 4 +-
 .../actual_kernels/reorder/reorder_kernel_base.cpp | 2 +
 .../reorder_kernel_byxf_f32_to_byx8_f4_i8.cpp | 83 +
 .../reorder_kernel_byxf_f32_to_byx8_f4_i8.h | 37 +
 .../reorder/reorder_kernel_fast_b1.h | 6 +-
 .../reorder/reorder_kernel_selector.cpp | 2 +
 .../reorder/reorder_kernel_to_yxfb_batched.h | 4 +-
 .../reorder/reorder_to_winograd_2x3_kernel.h | 6 +-
 .../reorder/reorder_weights_image_fyx_b_kernel.h | 6 +-
 .../reorder_weights_image_winograd_6x3_kernel.h | 6 +-
 .../reorder/reorder_weights_kernel.h | 4 +-
 .../reorder/reorder_weights_winograd_2x3_kernel.h | 6 +-
 .../reorder/reorder_weights_winograd_6x3_kernel.h | 6 +-
 .../reorg_yolo/reorg_yolo_kernel_ref.h | 3 +-
 .../actual_kernels/reshape/reshape_kernel_ref.cpp | 2 +-
 .../actual_kernels/reshape/reshape_kernel_ref.h | 4 +-
 .../reverse_sequence_kernel_ref.cpp | 87 +
 .../reverse_sequence/reverse_sequence_kernel_ref.h | 57 +
 .../reverse_sequence_kernel_selector.cpp | 31 +
 .../reverse_sequence_kernel_selector.h | 37 +
 .../roi_pooling/roi_pooling_kernel_base.cpp | 83 +
 .../roi_pooling/roi_pooling_kernel_base.h | 75 +
 .../roi_pooling/roi_pooling_kernel_ps_ref.cpp | 55 +
 .../roi_pooling/roi_pooling_kernel_ps_ref.h | 40 +
 .../roi_pooling/roi_pooling_kernel_ref.cpp | 73 +-
 .../roi_pooling/roi_pooling_kernel_ref.h | 39 +-
 .../roi_pooling/roi_pooling_kernel_selector.cpp | 10 +-
 .../scale_grad_weights_kernel_base.cpp | 2 +-
 .../scale_grad_weights_kernel_ref.h | 3 +-
 .../actual_kernels/select/select_kernel_base.cpp | 2 +-
 .../core/actual_kernels/select/select_kernel_ref.h | 4 +-
 .../shuffle_channels_kernel_ref.cpp | 102 +
 .../shuffle_channels/shuffle_channels_kernel_ref.h | 57 +
 .../shuffle_channels_kernel_selector.cpp | 31 +
 .../shuffle_channels_kernel_selector.h | 37 +
 .../actual_kernels/softmax/softmax_kernel_base.cpp | 2 +-
 .../actual_kernels/softmax/softmax_kernel_bf.h | 6 +-
 .../actual_kernels/softmax/softmax_kernel_fb.h | 4 +-
 .../softmax/softmax_kernel_items_class_optimized.h | 4 +-
 .../actual_kernels/softmax/softmax_kernel_ref.h | 4 +-
 .../softmax_loss_grad_kernel_ref.h | 4 +-
 .../strided_slice/strided_slice_kernel_ref.cpp | 104 +
 .../strided_slice/strided_slice_kernel_ref.h | 61 +
 .../strided_slice_kernel_selector.cpp | 31 +
 .../strided_slice/strided_slice_kernel_selector.h | 37 +
 .../core/actual_kernels/tile/tile_kernel_ref.h | 2 +
 .../upsampling/upsampling_kernel_base.cpp | 2 +-
 .../upsampling/upsampling_kernel_ref.h | 4 +-
 .../clDNN/kernel_selector/core/auto_tuner.cpp | 177 +-
 .../clDNN/kernel_selector/core/auto_tuner.h | 16 +-
 .../kernel_selector/core/auto_tuner_offline.cpp | 43 -
 .../kernel_selector/core/auto_tuner_offline.h | 73 -
 .../clDNN/kernel_selector/core/cache/cache.json | 52153 +++++++++++++++++++
 .../clDNN/kernel_selector/core/cache/cache_APL.cpp | 2572 -
 .../kernel_selector/core/cache/cache_GT3_B1.cpp | 1937 -
 .../core/cache/cache_ICL_B1_B16.cpp | 1823 -
 .../core/cache/cache_SKL_GT2_B1_B16.cpp | 3478 --
 .../core/cache/cache_SKL_GT2_B8.cpp | 29 -
 .../kernel_selector/core/cache/cache_SKL_GT4e.cpp | 28 -
 .../core/cache/cache_SKL_GT4e_B1_B16.cpp | 3710 --
 .../core/cache/cache_SKL_GT4e_B32_B64.cpp | 29 -
 .../core/cache/cache_SKL_GT4e_B8.cpp | 169 -
 .../core/cl_kernels/arg_max_min_axis.cl | 8 +-
 .../core/cl_kernels/arg_max_min_gpu_ref.cl | 8 +-
 .../core/cl_kernels/batch_norm_gpu_ref.cl | 32 +-
 .../core/cl_kernels/broadcast_gpu_ref.cl | 22 +-
 .../core/cl_kernels/contract_ref.cl | 64 +
 .../core/cl_kernels/convolution_gpu_bfyx_1x1.cl | 1 +
 .../cl_kernels/convolution_gpu_bfyx_1x1_opt.cl | 238 +
 .../convolution_gpu_bfyx_depthwise_weights_lwg.cl | 2 +-
 .../convolution_gpu_bfyx_gemm_like_fp16.cl | 6 +-
 .../convolution_gpu_bfyx_gemm_like_fp32.cl | 1 +
 .../convolution_gpu_bfyx_os_iyx_osv16.cl | 44 +-
 .../convolution_gpu_bfyx_os_iyx_osv16_2_sg.cl | 254 +
 .../core/cl_kernels/convolution_gpu_bfyx_ref.cl | 26 +-
 ...convolution_gpu_byx8_f4__fs_bs_yx_bsv4_fsv32.cl | 170 +
 .../convolution_gpu_byxf_fs_bs_yx_bsv4_fsv32.cl | 105 +
 .../core/cl_kernels/convolution_gpu_imad.cl | 202 +
 ...volution_gpu_mmad_32x32sg_128x128wg_slm_int8.cl | 396 +
 ...volution_gpu_mmad_32x32sg_224x128wg_slm_int8.cl | 389 +
 .../convolution_gpu_mmad_32x32sg_slm_int8.cl | 430 +
 .../convolution_gpu_mmad_batched_block.cl | 194 +
 .../convolution_gpu_mmad_batched_block_1x1.cl | 241 +
 .../convolution_gpu_mmad_slm_2x14_rep4.cl | 948 +
 .../convolution_gpu_mmad_slm_7x7_rep4.cl | 1044 +
 .../convolution_gpu_winograd_2x3_s1_fused.cl | 2 +-
 .../core/cl_kernels/convolution_gpu_yxfb_ref.cl | 14 +-
 .../convolution_gpu_yxfb_yxio_b16_fp16.cl | 19 +-
 .../convolution_gpu_yxfb_yxio_b16_fp32.cl | 1 +
 .../convolution_gpu_yxfb_yxio_b1_block_fp32.cl | 1 +
 ...ution_gpu_yxfb_yxio_b1_block_multiple_x_fp32.cl | 1 +
 .../convolution_gpu_yxfb_yxio_b8_fp32.cl | 1 +
 .../cl_kernels/convolution_grad_weights_ref.cl | 17 +-
 .../cl_kernels/convolution_grad_weights_yxfb.cl | 16 +-
 .../core/cl_kernels/deconvolution_gpu_bfyx_opt.cl | 16 +-
 .../core/cl_kernels/deconvolution_gpu_ref.cl | 16 +-
 .../core/cl_kernels/depth_to_space_ref.cl | 36 +
 .../core/cl_kernels/detection_output.cl | 217 +
 .../core/cl_kernels/detection_output_sort.cl | 217 +
 .../core/cl_kernels/eltwise_b_fs_yx_fsv4.cl | 100 +
 .../core/cl_kernels/eltwise_fs_bs_yx_bsv4_fsv32.cl | 5 +
 .../kernel_selector/core/cl_kernels/embed_ref.cl | 19 +-
 .../fully_connected_gpu_bs_f_bsv16_af8_vload.cl | 77 +-
 .../fully_connected_gpu_bs_f_bsv8_af8_vload.cl | 4 +-
 .../cl_kernels/fully_connected_gpu_fb_io_b8_f8.cl | 1 +
 .../fully_connected_gpu_fb_io_b8_f8_vload.cl | 1 +
 .../core/cl_kernels/fully_connected_gpu_imad.cl | 95 +
 .../cl_kernels/fully_connected_gpu_yxfb_ref.cl | 1 +
 .../cl_kernels/fused_conv_bn_scale_kernel_ref.cl | 197 +
 .../fused_conv_eltwise_gpu_bfyx_1x1_opt_fp32.cl | 254 +
 .../fused_conv_eltwise_gpu_bfyx_os_iyx_osv16.cl | 252 +
 .../cl_kernels/fused_conv_eltwise_gpu_gemm_fp32.cl | 602 +
 ..._eltwise_gpu_mmad_32x32sg_128x128wg_slm_int8.cl | 509 +
 ..._eltwise_gpu_mmad_32x32sg_224x128wg_slm_int8.cl | 505 +
 .../fused_conv_eltwise_gpu_yxfb_yxio_b16_fp16.cl | 256 +
 .../kernel_selector/core/cl_kernels/gather_ref.cl | 33 +
 .../kernel_selector/core/cl_kernels/gemm_ref.cl | 2 +-
 .../core/cl_kernels/generic_eltwise_ref.cl | 32 +-
 .../include/arg_max_min_common.cl} | 18 +-
 .../core/cl_kernels/include/common.cl | 4 +
 .../core/cl_kernels/include/data_types.cl | 4 -
 .../cl_kernels/include/detection_output_common.cl | 180 +
 .../core/cl_kernels/include/fetch.cl | 149 +
 .../core/cl_kernels/include/imad.cl | 34 +
 .../core/cl_kernels/include/include_all.cl | 2 -
 .../core/cl_kernels/include/mmad.cl | 88 +
 .../core/cl_kernels/include/vec_typedefs.cl | 34 -
 .../core/cl_kernels/index_select_gpu_ref.cl | 91 +-
 .../core/cl_kernels/lstm_elt_gpu_bfyx_ref.cl | 10 +-
 .../core/cl_kernels/lstm_gemm_gpu_bfyx_ref.cl | 4 +-
 .../lstm_gemv_gpu_subgroup1x64_bfyx_ff_SIMD16.cl | 128 +
 .../lstm_gemv_gpu_subgroup1x64_bfyx_hh_SIMD16.cl | 131 +
 .../kernel_selector/core/cl_kernels/one_hot_ref.cl | 39 +
 .../kernel_selector/core/cl_kernels/permute_ref.cl | 27 +-
 .../core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl | 143 +
 .../cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl | 30 +-
 .../core/cl_kernels/pyramid_roi_align_gpu_ref.cl | 159 +
 .../core/cl_kernels/reorder_data.cl | 8 +
 .../reorder_data_byxf_f32_to_byx8_f4_i8.cl | 136 +
 .../core/cl_kernels/reorder_weights.cl | 24 +-
 .../core/cl_kernels/reverse_sequence_ref.cl | 43 +
 .../core/cl_kernels/roi_pooling_ps_ref.cl | 141 +
 .../core/cl_kernels/roi_pooling_ref.cl | 73 +-
 .../core/cl_kernels/shuffle_channels_ref.cl | 43 +
 .../core/cl_kernels/strided_slice_ref.cl | 50 +
 .../core/common/common_kernel_base.cpp | 4 +-
 .../core/common/common_kernel_base.h | 4 +-
 .../clDNN/kernel_selector/core/common/jitter.cpp | 227 +-
 .../clDNN/kernel_selector/core/common/jitter.h | 139 +-
 .../core/common/kernel_selector_utils.cpp | 99 +-
 .../core/common/kernel_selector_utils.h | 14 +-
 .../core/common/primitive_db_gen.py | 4 +-
 .../clDNN/kernel_selector/core/kernel_base.h | 15 +-
 .../clDNN/kernel_selector/core/kernel_selector.cpp | 29 +-
 .../core/kernel_selector_common.cpp | 31 +-
 .../kernel_selector/core/kernel_selector_common.h | 6 +-
 .../core/kernel_selector_params.cpp | 58 +-
 .../kernel_selector/core/kernel_selector_params.h | 80 +-
 .../thirdparty/clDNN/src/CMakeLists.txt | 56 +-
 .../thirdparty/clDNN/src/activation.cpp | 2 +
 .../thirdparty/clDNN/src/activation_grad.cpp | 3 +
 .../thirdparty/clDNN/src/apply_adam.cpp | 9 +-
 .../thirdparty/clDNN/src/arg_max_min.cpp | 5 +-
 .../thirdparty/clDNN/src/average_unpooling.cpp | 3 +
 .../thirdparty/clDNN/src/batch_norm.cpp | 49 +-
 .../thirdparty/clDNN/src/batch_norm_grad.cpp | 3 +
 inference-engine/thirdparty/clDNN/src/border.cpp | 16 +-
 .../thirdparty/clDNN/src/broadcast.cpp | 96 +-
 .../clDNN/src/caps/public/gpu_devices.inc | 63 -
 .../thirdparty/clDNN/src/caps/public/gpu_enums.inc | 30 -
 .../thirdparty/clDNN/src/caps/public/mode.inc | 1 -
 inference-engine/thirdparty/clDNN/src/cldnn.cpp | 256 +-
 .../thirdparty/clDNN/src/concatenation.cpp | 2 +
 .../thirdparty/clDNN/src/condition.cpp | 85 +
 .../thirdparty/clDNN/src/constants_propagator.cpp | 114 -
 inference-engine/thirdparty/clDNN/src/contract.cpp | 130 +
 .../thirdparty/clDNN/src/convolution.cpp | 36 +-
 .../clDNN/src/convolution_grad_weights.cpp | 11 +-
 inference-engine/thirdparty/clDNN/src/crop.cpp | 91 +-
 inference-engine/thirdparty/clDNN/src/data.cpp | 1 +
 .../thirdparty/clDNN/src/deconvolution.cpp | 2 +
 .../thirdparty/clDNN/src/depth_to_space.cpp | 78 +
 .../thirdparty/clDNN/src/detection_output.cpp | 116 +-
 inference-engine/thirdparty/clDNN/src/eltwise.cpp | 148 +-
 inference-engine/thirdparty/clDNN/src/embed.cpp | 9 +-
 inference-engine/thirdparty/clDNN/src/engine.cpp | 35 +-
 .../thirdparty/clDNN/src/error_handler.cpp | 45 +-
 .../thirdparty/clDNN/src/fully_connected.cpp | 7 +-
 .../clDNN/src/fully_connected_grad_input.cpp | 3 +
 .../clDNN/src/fully_connected_grad_weights.cpp | 3 +
 .../thirdparty/clDNN/src/fused_conv_bn_scale.cpp | 131 +
 .../thirdparty/clDNN/src/fused_conv_eltwise.cpp | 196 +
 inference-engine/thirdparty/clDNN/src/gather.cpp | 68 +
 inference-engine/thirdparty/clDNN/src/gemm.cpp | 18 +-
 .../thirdparty/clDNN/src/generic_layer.cpp | 6 +
 .../thirdparty/clDNN/src/gpu/activation_gpu.cpp | 4 +-
 .../clDNN/src/gpu/activation_grad_gpu.cpp | 8 +-
 .../thirdparty/clDNN/src/gpu/arg_max_min_gpu.cpp | 7 -
 .../thirdparty/clDNN/src/gpu/batch_norm_gpu.cpp | 46 +-
 .../thirdparty/clDNN/src/gpu/broadcast_gpu.cpp | 31 +-
 .../clDNN/src/gpu/command_queues_builder.cpp | 151 +
 .../clDNN/src/gpu/command_queues_builder.h | 46 +
 .../thirdparty/clDNN/src/gpu/concatenation_gpu.cpp | 2 +
 .../thirdparty/clDNN/src/gpu/condition_gpu.cpp | 144 +
 .../thirdparty/clDNN/src/gpu/configuration.cpp | 4 +-
 .../thirdparty/clDNN/src/gpu/confiugration.h | 50 +
 .../thirdparty/clDNN/src/gpu/contract_gpu.cpp | 88 +
 .../thirdparty/clDNN/src/gpu/convolution_gpu.cpp | 38 +-
 .../clDNN/src/gpu/convolution_grad_weights_gpu.cpp | 8 +-
 .../thirdparty/clDNN/src/gpu/crop_gpu.cpp | 18 +
 .../clDNN/src/gpu/custom_gpu_primitive_gpu.cpp | 1 +
 .../thirdparty/clDNN/src/gpu/deconvolution_gpu.cpp | 21 +-
 .../clDNN/src/gpu/depth_to_space_gpu.cpp | 72 +
 .../clDNN/src/gpu/detection_output_cpu.cpp | 652 +
 .../clDNN/src/gpu/detection_output_gpu.cpp | 656 +-
 .../thirdparty/clDNN/src/gpu/eltwise_gpu.cpp | 82 +-
 .../thirdparty/clDNN/src/gpu/engine_info.cpp | 91 +-
 .../thirdparty/clDNN/src/gpu/engine_info.h | 63 +-
 .../thirdparty/clDNN/src/gpu/events_pool.h | 139 +
 .../thirdparty/clDNN/src/gpu/events_waiter.h | 9 +-
 .../clDNN/src/gpu/fully_connected_gpu.cpp | 7 +-
 .../src/gpu/fully_connected_grad_weights_gpu.cpp | 4 +-
 .../clDNN/src/gpu/fused_conv_bn_scale_gpu.cpp | 166 +
 .../clDNN/src/gpu/fused_conv_eltwise_gpu.cpp | 214 +
 .../thirdparty/clDNN/src/gpu/gather_gpu.cpp | 86 +
 .../thirdparty/clDNN/src/gpu/index_select_gpu.cpp | 28 +-
 .../thirdparty/clDNN/src/gpu/kernel.cpp | 16 +-
 inference-engine/thirdparty/clDNN/src/gpu/kernel.h | 5 +-
 .../thirdparty/clDNN/src/gpu/kernel_runner.h | 2 +-
 .../thirdparty/clDNN/src/gpu/kernels_cache.cpp | 1 +
 .../thirdparty/clDNN/src/gpu/lookup_table_gpu.cpp | 4 +-
 .../thirdparty/clDNN/src/gpu/lstm_elt_gpu.cpp | 13 +-
 .../thirdparty/clDNN/src/gpu/lstm_gemm_gpu.cpp | 25 +-
 .../thirdparty/clDNN/src/gpu/memory_gpu.cpp | 8 +-
 .../thirdparty/clDNN/src/gpu/ocl_base_event.h | 65 +-
 .../thirdparty/clDNN/src/gpu/ocl_builder.cpp | 178 +
 .../thirdparty/clDNN/src/gpu/ocl_builder.h | 54 +
 .../thirdparty/clDNN/src/gpu/ocl_toolkit.cpp | 260 +-
 .../thirdparty/clDNN/src/gpu/ocl_toolkit.h | 76 +-
 .../thirdparty/clDNN/src/gpu/ocl_user_event.cpp | 20 +-
 .../thirdparty/clDNN/src/gpu/ocl_user_event.h | 39 +-
 .../thirdparty/clDNN/src/gpu/one_hot_gpu.cpp | 72 +
 .../thirdparty/clDNN/src/gpu/permute_gpu.cpp | 9 +-
 .../thirdparty/clDNN/src/gpu/pooling_gpu.cpp | 9 +-
 .../clDNN/src/gpu/primitive_gpu_base.cpp | 10 +-
 .../thirdparty/clDNN/src/gpu/primitive_gpu_base.h | 24 +-
 .../thirdparty/clDNN/src/gpu/proposal_gpu.cpp | 167 +-
 .../clDNN/src/gpu/pyramid_roi_align_gpu.cpp | 76 +
 .../clDNN/src/gpu/reverse_sequence_gpu.cpp | 71 +
 .../thirdparty/clDNN/src/gpu/roi_pooling_gpu.cpp | 24 +-
 .../clDNN/src/gpu/shuffle_channels_gpu.cpp | 75 +
 .../thirdparty/clDNN/src/gpu/strided_slice_gpu.cpp | 99 +
 .../thirdparty/clDNN/src/gpu/upsampling_gpu.cpp | 2 +-
 .../clDNN/src/gpu/wait_for_events_gpu.cpp | 5 +
 .../src/graph_optimizer/add_required_reorders.cpp | 143 +
 .../graph_optimizer/add_reshape_to_primitives.cpp | 120 +
 .../src/graph_optimizer/calculate_prior_boxes.cpp | 47 +
 .../clDNN/src/graph_optimizer/compile_graph.cpp | 39 +
 .../src/graph_optimizer/eltwise_remove_stride.cpp | 105 +
 .../src/graph_optimizer/eltwise_shrinking.cpp | 132 +
 .../src/graph_optimizer/graph_initializations.cpp | 641 +
 .../src/graph_optimizer/handle_input_padding.cpp | 94 +
 .../clDNN/src/graph_optimizer/mark_nodes.cpp | 43 +
 .../src/graph_optimizer/post_optimize_weights.cpp | 131 +
 .../src/graph_optimizer/pre_optimize_bias.cpp | 87 +
 .../prep_opt_depthwise_sep_post.cpp | 100 +
 .../src/graph_optimizer/prepare_buffer_fusing.cpp | 321 +
 .../graph_optimizer/prepare_depthwise_sep_opt.cpp | 70 +
 .../clDNN/src/graph_optimizer/prepare_padding.cpp | 146 +
 .../graph_optimizer/prepare_primitive_fusing.cpp | 542 +
 .../src/graph_optimizer/propagate_constants.cpp | 194 +
 .../graph_optimizer/remove_redundant_reorders.cpp | 92 +
 .../clDNN/src/graph_optimizer/reorder_inputs.cpp | 269 +
 .../clDNN/src/graph_optimizer/trim_to_outputs.cpp | 76 +
 .../thirdparty/clDNN/src/include/activation_inst.h | 1 +
 .../thirdparty/clDNN/src/include/apply_adam_inst.h | 1 +
 .../thirdparty/clDNN/src/include/batch_norm_inst.h | 53 +-
 .../thirdparty/clDNN/src/include/border_inst.h | 3 +-
 .../thirdparty/clDNN/src/include/broadcast_inst.h | 2 +-
 .../clDNN/src/include/concatenation_inst.h | 1 +
 .../thirdparty/clDNN/src/include/condition_inst.h | 127 +
 .../clDNN/src/include/constants_propagator.h | 48 -
 .../thirdparty/clDNN/src/include/contract_inst.h | 53 +
 .../src/include/convolution_grad_weights_inst.h | 10 +
 .../clDNN/src/include/convolution_inst.h | 56 +-
 .../thirdparty/clDNN/src/include/crop_inst.h | 4 +-
 .../clDNN/src/include/custom_gpu_primitive_inst.h | 3 +
 .../clDNN/src/include/deconvolution_inst.h | 41 +-
 .../clDNN/src/include/depth_to_space_inst.h | 51 +
 .../clDNN/src/include/detection_output_inst.h | 35 +
 .../thirdparty/clDNN/src/include/eltwise_inst.h | 3 +-
 .../thirdparty/clDNN/src/include/embed_inst.h | 2 +-
 .../thirdparty/clDNN/src/include/engine_impl.h | 33 +-
 .../thirdparty/clDNN/src/include/error_handler.h | 39 +-
 .../thirdparty/clDNN/src/include/event_impl.h | 5 +-
 .../clDNN/src/include/fused_conv_bn_scale_inst.h | 149 +
 .../clDNN/src/include/fused_conv_eltwise_inst.h | 204 +
 .../thirdparty/clDNN/src/include/gather_inst.h | 51 +
 .../clDNN/src/include/generic_layer_inst.h | 1 +
 .../clDNN/src/include/implementation_map.h | 73 +-
 .../clDNN/src/include/index_select_inst.h | 6 +-
 .../clDNN/src/include/input_layout_inst.h | 4 +-
 .../clDNN/src/include/kernel_selector_helper.h | 118 +-
 .../thirdparty/clDNN/src/include/lstm_elt_inst.h | 2 +
 .../clDNN/src/include/max_unpooling_inst.h | 2 +-
 .../thirdparty/clDNN/src/include/memory_impl.h | 11 +-
 .../thirdparty/clDNN/src/include/memory_pool.h | 4 +-
 .../thirdparty/clDNN/src/include/meta_utils.h | 2 +-
 .../thirdparty/clDNN/src/include/network_impl.h | 11 +-
 .../thirdparty/clDNN/src/include/one_hot_inst.h | 53 +
 .../thirdparty/clDNN/src/include/pass_manager.h | 276 +
 .../thirdparty/clDNN/src/include/permute_inst.h | 1 +
 .../thirdparty/clDNN/src/include/pooling_inst.h | 1 +
 .../thirdparty/clDNN/src/include/primitive_inst.h | 39 +-
 .../thirdparty/clDNN/src/include/primitive_type.h | 2 +
 .../clDNN/src/include/primitive_type_base.h | 14 +
 .../clDNN/src/include/program_dump_graph.h | 5 +-
 .../thirdparty/clDNN/src/include/program_helpers.h | 114 +
 .../thirdparty/clDNN/src/include/program_impl.h | 306 +-
 .../thirdparty/clDNN/src/include/program_node.h | 65 +-
 .../clDNN/src/include/pyramid_roi_align_inst.h | 64 +
 .../thirdparty/clDNN/src/include/reshape_inst.h | 1 +
 .../clDNN/src/include/reverse_sequence_inst.h | 51 +
 .../thirdparty/clDNN/src/include/scale_inst.h | 2 +
 .../clDNN/src/include/shuffle_channels_inst.h | 51 +
 .../clDNN/src/include/strided_slice_inst.h | 51 +
 .../thirdparty/clDNN/src/include/to_string_utils.h | 59 +-
 .../thirdparty/clDNN/src/include/upsampling_inst.h | 1 +
 .../thirdparty/clDNN/src/include/xml_object.h | 129 -
 .../thirdparty/clDNN/src/index_select.cpp | 109 +-
 .../thirdparty/clDNN/src/input_layout.cpp | 6 +
 .../clDNN/src/kernel_selector_helper.cpp | 130 +-
 .../thirdparty/clDNN/src/layout_optimizer.cpp | 11 +-
 .../thirdparty/clDNN/src/lookup_table.cpp | 3 +
 inference-engine/thirdparty/clDNN/src/lrn.cpp | 2 +
 inference-engine/thirdparty/clDNN/src/lstm.cpp | 27 +-
 inference-engine/thirdparty/clDNN/src/lstm_elt.cpp | 6 +-
 .../thirdparty/clDNN/src/lstm_gemm.cpp | 9 +-
 .../thirdparty/clDNN/src/max_unpooling.cpp | 8 +
 .../thirdparty/clDNN/src/memory_pool.cpp | 4 +
 .../thirdparty/clDNN/src/mutable_data.cpp | 1 +
 inference-engine/thirdparty/clDNN/src/mvn.cpp | 2 +
 inference-engine/thirdparty/clDNN/src/network.cpp | 138 +-
 .../thirdparty/clDNN/src/nodes_ordering.cpp | 119 +
 .../thirdparty/clDNN/src/normalize.cpp | 2 +
 inference-engine/thirdparty/clDNN/src/one_hot.cpp | 97 +
 inference-engine/thirdparty/clDNN/src/permute.cpp | 34 +-
 inference-engine/thirdparty/clDNN/src/pooling.cpp | 7 +
 .../thirdparty/clDNN/src/primitive_inst.cpp | 36 +-
 .../thirdparty/clDNN/src/prior_box.cpp | 2 +
 inference-engine/thirdparty/clDNN/src/program.cpp | 3170 +-
 .../thirdparty/clDNN/src/program_dump_graph.cpp | 180 +-
 .../thirdparty/clDNN/src/program_helpers.cpp | 92 +
 .../thirdparty/clDNN/src/program_node.cpp | 79 +-
 inference-engine/thirdparty/clDNN/src/proposal.cpp | 18 +-
 .../thirdparty/clDNN/src/pyramid_roi_align.cpp | 63 +
 .../thirdparty/clDNN/src/region_yolo.cpp | 3 +
 inference-engine/thirdparty/clDNN/src/reorder.cpp | 2 +-
 .../thirdparty/clDNN/src/reorg_yolo.cpp | 3 +
 inference-engine/thirdparty/clDNN/src/reshape.cpp | 29 +-
 .../thirdparty/clDNN/src/reverse_sequence.cpp | 65 +
 .../thirdparty/clDNN/src/roi_pooling.cpp | 27 +-
 inference-engine/thirdparty/clDNN/src/scale.cpp | 2 +
 .../thirdparty/clDNN/src/scale_grad_input.cpp | 3 +
 .../thirdparty/clDNN/src/scale_grad_weights.cpp | 3 +
 inference-engine/thirdparty/clDNN/src/select.cpp | 2 +
 .../thirdparty/clDNN/src/shuffle_channels.cpp | 83 +
 inference-engine/thirdparty/clDNN/src/softmax.cpp | 2 +
 .../thirdparty/clDNN/src/softmax_loss_grad.cpp | 3 +
 inference-engine/thirdparty/clDNN/src/split.cpp | 4 +-
 .../thirdparty/clDNN/src/strided_slice.cpp | 141 +
 inference-engine/thirdparty/clDNN/src/tile.cpp | 2 +
 .../thirdparty/clDNN/src/upsampling.cpp | 2 +
 .../thirdparty/clDNN/tests/CMakeLists.txt | 4 +-
 .../clDNN/tests/module_tests/events_pool_test.cpp | 65 +
 .../clDNN/tests/module_tests/gpu_toolkit_test.cpp | 112 +-
 .../tests/test_cases/activation_grad_gpu_test.cpp | 4 +-
 .../test_cases/activation_simple_gpu_test.cpp | 79 +-
 .../tests/test_cases/add_reorders_gpu_test.cpp | 213 +
 .../clDNN/tests/test_cases/apply_adam_gpu_test.cpp | 2 +-
 .../clDNN/tests/test_cases/arg_max_gpu_test.cpp | 12 +-
 .../test_cases/average_unpooling_gpu_test.cpp | 8 +-
 .../clDNN/tests/test_cases/batch_norm_gpu_test.cpp | 2429 +-
 .../tests/test_cases/batch_norm_grad_gpu_test.cpp | 2 +-
 .../clDNN/tests/test_cases/border_gpu_test.cpp | 18 +-
 .../clDNN/tests/test_cases/broadcast_gpu_test.cpp | 1311 +-
 .../clDNN/tests/test_cases/command_queue_test.cpp | 165 +
 .../clDNN/tests/test_cases/condition_gpu_test.cpp | 617 +
 .../clDNN/tests/test_cases/contract_gpu_test.cpp | 352 +
 .../tests/test_cases/convolution_gpu_test.cpp | 1999 +-
 .../test_cases/convolution_grad_input_gpu_test.cpp | 12 +-
 .../convolution_grad_weights_gpu_test.cpp | 95 +-
 .../clDNN/tests/test_cases/crop_gpu_test.cpp | 348 +-
 .../tests/test_cases/custom_gpu_primitive_test.cpp | 81 +-
 .../tests/test_cases/deconvolution_gpu_test.cpp | 301 +-
 .../test_cases/depth_concatenate_gpu_test.cpp | 289 +-
 .../tests/test_cases/depth_to_space_gpu_test.cpp | 308 +
 .../tests/test_cases/detection_output_test.cpp | 1156 +-
 .../clDNN/tests/test_cases/eltwise_gpu_test.cpp | 2910 +-
 .../clDNN/tests/test_cases/embed_gpu_test.cpp | 8 +-
 .../tests/test_cases/fully_connected_gpu_test.cpp | 221 +-
 .../fully_connected_grad_input_gpu_test.cpp | 2 +-
 .../fully_connected_grad_weights_gpu_test.cpp | 6 +-
 .../test_cases/fused_conv_eltwise_gpu_test.cpp | 112 +
 .../clDNN/tests/test_cases/gather_gpu_test.cpp | 513 +
 .../clDNN/tests/test_cases/gemm_gpu_test.cpp | 239 +-
 .../tests/test_cases/index_select_gpu_test.cpp | 875 +-
 .../clDNN/tests/test_cases/lookup_table_test.cpp | 8 +-
 .../clDNN/tests/test_cases/lstm_gpu_test.cpp | 1398 +-
 .../tests/test_cases/max_unpooling_gpu_test.cpp | 10 +-
 .../clDNN/tests/test_cases/memory_test.cpp | 137 +-
 .../clDNN/tests/test_cases/mvn_gpu_test.cpp | 16 +-
 .../clDNN/tests/test_cases/one_hot_gpu_test.cpp | 193 +
 .../clDNN/tests/test_cases/permute_gpu_test.cpp | 413 +-
 .../clDNN/tests/test_cases/pooling_gpu_test.cpp | 213 +-
 .../test_cases/propagate_constants_gpu_test.cpp | 69 +
 .../clDNN/tests/test_cases/proposal_cpu_test.cpp | 2 +-
 .../test_cases/pyramid_roi_align_gpu_test.cpp | 191 +
 .../clDNN/tests/test_cases/reorder_gpu_test.cpp | 102 +-
 .../clDNN/tests/test_cases/reshape_gpu_test.cpp | 51 +-
 .../tests/test_cases/reverse_sequence_gpu_test.cpp | 580 +
 .../clDNN/tests/test_cases/scale_gpu_test.cpp | 42 +-
 .../tests/test_cases/scale_grad_input_test.cpp | 2 +-
 .../tests/test_cases/scale_grad_weights_test.cpp | 6 +-
 .../clDNN/tests/test_cases/select_gpu_test.cpp | 1788 +-
 .../tests/test_cases/shuffle_channels_test.cpp | 386 +
 .../clDNN/tests/test_cases/softmax_gpu_test.cpp | 13 +-
 .../test_cases/softmax_loss_grad_gpu_test.cpp | 2 +-
 .../clDNN/tests/test_cases/split_gpu_test.cpp | 306 +-
 .../tests/test_cases/strided_slice_gpu_test.cpp | 375 +
 .../clDNN/tests/test_cases/tile_gpu_test.cpp | 10 +-
 .../clDNN/tests/test_cases/topology_test.cpp | 4 +-
 .../tests/test_cases/trim_to_outputs_gpu_test.cpp | 200 +
 .../clDNN/tests/test_cases/upsampling_gpu_test.cpp | 4 +-
 .../clDNN/tests/test_utils/instrumentation.cpp | 27 +-
 .../clDNN/tests/test_utils/test_utils.cpp | 21 +-
 .../thirdparty/clDNN/tests/test_utils/test_utils.h | 43 +-
 .../clDNN/tests_core_internal/CMakeLists.txt | 311 +
 .../cache_ICL.cpp => tests_core_internal/main.cpp} | 15 +-
 .../tests_core_internal/program_impl_wrapper.h | 32 +
 .../test_cases/graph_manipulation_gpu_test.cpp | 203 +
 .../thirdparty/clDNN/utils/rapidjson/allocators.h | 284 +
 .../clDNN/utils/rapidjson/cursorstreamwrapper.h | 78 +
 .../thirdparty/clDNN/utils/rapidjson/document.h | 2643 +
 .../clDNN/utils/rapidjson/encodedstream.h | 299 +
 .../thirdparty/clDNN/utils/rapidjson/encodings.h | 716 +
 .../thirdparty/clDNN/utils/rapidjson/error/en.h | 74 +
 .../thirdparty/clDNN/utils/rapidjson/error/error.h | 161 +
 .../clDNN/utils/rapidjson/filereadstream.h | 99 +
 .../clDNN/utils/rapidjson/filewritestream.h | 104 +
 .../thirdparty/clDNN/utils/rapidjson/fwd.h | 151 +
 .../clDNN/utils/rapidjson/internal/biginteger.h | 290 +
 .../clDNN/utils/rapidjson/internal/diyfp.h | 271 +
 .../clDNN/utils/rapidjson/internal/dtoa.h | 245 +
 .../clDNN/utils/rapidjson/internal/ieee754.h | 78 +
 .../clDNN/utils/rapidjson/internal/itoa.h | 308 +
 .../clDNN/utils/rapidjson/internal/meta.h | 186 +
 .../clDNN/utils/rapidjson/internal/pow10.h | 55 +
 .../clDNN/utils/rapidjson/internal/regex.h | 737 +
 .../clDNN/utils/rapidjson/internal/stack.h | 231 +
 .../clDNN/utils/rapidjson/internal/strfunc.h | 69 +
 .../clDNN/utils/rapidjson/internal/strtod.h | 290 +
 .../clDNN/utils/rapidjson/internal/swap.h | 46 +
 .../clDNN/utils/rapidjson/istreamwrapper.h | 113 +
 .../clDNN/utils/rapidjson/memorybuffer.h | 70 +
 .../clDNN/utils/rapidjson/memorystream.h | 71 +
 .../clDNN/utils/rapidjson/msinttypes/inttypes.h | 316 +
 .../clDNN/utils/rapidjson/msinttypes/stdint.h | 300 +
 .../clDNN/utils/rapidjson/ostreamwrapper.h | 81 +
 .../thirdparty/clDNN/utils/rapidjson/pointer.h | 1357 +
 .../clDNN/utils/rapidjson/prettywriter.h | 277 +
 .../thirdparty/clDNN/utils/rapidjson/rapidjson.h | 654 +
 .../thirdparty/clDNN/utils/rapidjson/reader.h | 2230 +
 .../thirdparty/clDNN/utils/rapidjson/schema.h | 2496 +
 .../thirdparty/clDNN/utils/rapidjson/stream.h | 223 +
 .../clDNN/utils/rapidjson/stringbuffer.h | 121 +
 .../thirdparty/clDNN/utils/rapidjson/writer.h | 709 +
 inference-engine/thirdparty/clDNN/version.json | 2 +-
 inference-engine/thirdparty/fluid/checksum.txt | 2 +-
 .../thirdparty/fluid/modules/gapi/CMakeLists.txt | 11 +-
 .../thirdparty/fluid/modules/gapi/cmake/init.cmake | 6 +
 .../fluid/modules/gapi/include/opencv2/gapi.hpp | 2 +-
 .../modules/gapi/include/opencv2/gapi/core.hpp | 2 +-
 .../modules/gapi/include/opencv2/gapi/cpu/core.hpp | 2 +-
 .../gapi/include/opencv2/gapi/cpu/gcpukernel.hpp | 2 +-
 .../gapi/include/opencv2/gapi/cpu/imgproc.hpp | 2 +-
 .../gapi/include/opencv2/gapi/fluid/core.hpp | 2 +-
 .../include/opencv2/gapi/fluid/gfluidbuffer.hpp | 2 +-
 .../include/opencv2/gapi/fluid/gfluidkernel.hpp | 2 +-
 .../gapi/include/opencv2/gapi/fluid/imgproc.hpp | 2 +-
 .../modules/gapi/include/opencv2/gapi/garg.hpp | 2 +-
 .../modules/gapi/include/opencv2/gapi/garray.hpp | 2 +-
 .../modules/gapi/include/opencv2/gapi/gcall.hpp | 2 +-
 .../modules/gapi/include/opencv2/gapi/gcommon.hpp | 2 +-
 .../gapi/include/opencv2/gapi/gcompiled.hpp | 2 +-
 .../gapi/include/opencv2/gapi/gcompoundkernel.hpp | 2 +-
 .../gapi/include/opencv2/gapi/gcomputation.hpp | 2 +-
 .../modules/gapi/include/opencv2/gapi/gkernel.hpp | 2 +-
 .../modules/gapi/include/opencv2/gapi/gmat.hpp | 4 +-
 .../modules/gapi/include/opencv2/gapi/gmetaarg.hpp | 13 +-
 .../modules/gapi/include/opencv2/gapi/gproto.hpp | 2 +-
 .../modules/gapi/include/opencv2/gapi/gpu/core.hpp | 12 +-
 .../gapi/include/opencv2/gapi/gpu/ggpukernel.hpp | 238 +-
 .../gapi/include/opencv2/gapi/gpu/imgproc.hpp | 13 +-
 .../modules/gapi/include/opencv2/gapi/gscalar.hpp | 2 +-
 .../gapi/include/opencv2/gapi/gtype_traits.hpp | 2 +-
 .../modules/gapi/include/opencv2/gapi/gtyped.hpp | 2 +-
 .../modules/gapi/include/opencv2/gapi/imgproc.hpp | 2 +-
 .../modules/gapi/include/opencv2/gapi/ocl/core.hpp | 27 +
 .../gapi/include/opencv2/gapi/ocl/goclkernel.hpp | 244 +
 .../gapi/include/opencv2/gapi/ocl/imgproc.hpp | 27 +
 .../gapi/include/opencv2/gapi/opencv_includes.hpp | 2 +-
 .../gapi/include/opencv2/gapi/operators.hpp | 2 +-
 .../gapi/include/opencv2/gapi/own/assert.hpp | 8 +-
 .../gapi/include/opencv2/gapi/own/convert.hpp | 2 +-
 .../gapi/include/opencv2/gapi/own/cvdefs.hpp | 2 +-
 .../gapi/include/opencv2/gapi/own/exports.hpp | 2 +-
 .../modules/gapi/include/opencv2/gapi/own/mat.hpp | 2 +-
 .../gapi/include/opencv2/gapi/own/saturate.hpp | 2 +-
 .../gapi/include/opencv2/gapi/own/scalar.hpp | 2 +-
 .../gapi/include/opencv2/gapi/own/types.hpp | 2 +-
 .../modules/gapi/include/opencv2/gapi/util/any.hpp | 2 +-
 .../include/opencv2/gapi/util/compiler_hints.hpp | 4 +-
 .../gapi/include/opencv2/gapi/util/optional.hpp | 2 +-
 .../gapi/include/opencv2/gapi/util/throw.hpp | 2 +-
 .../gapi/include/opencv2/gapi/util/util.hpp | 2 +-
 .../gapi/include/opencv2/gapi/util/variant.hpp | 2 +-
 .../gapi/perf/common/gapi_core_perf_tests.cpp | 2 +-
 .../gapi/perf/common/gapi_core_perf_tests.hpp | 8 +-
 .../gapi/perf/common/gapi_core_perf_tests_inl.hpp | 74 +-
 .../gapi/perf/common/gapi_imgproc_perf_tests.cpp | 2 +-
 .../gapi/perf/common/gapi_imgproc_perf_tests.hpp | 2 +-
 .../perf/common/gapi_imgproc_perf_tests_inl.hpp | 28 +-
 .../gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp | 18 +-
 .../gapi/perf/cpu/gapi_imgproc_perf_tests_cpu.cpp | 2 +-
 .../perf/cpu/gapi_imgproc_perf_tests_fluid.cpp | 182 +-
 .../gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp | 18 +-
 .../gapi/perf/gpu/gapi_imgproc_perf_tests_gpu.cpp | 17 +-
 .../perf/internal/gapi_compiler_perf_tests.cpp | 2 +-
 .../fluid/modules/gapi/perf/perf_main.cpp | 2 +-
 .../fluid/modules/gapi/perf/perf_precomp.hpp | 4 +-
 .../fluid/modules/gapi/src/api/gapi_priv.cpp | 2 +-
 .../fluid/modules/gapi/src/api/gapi_priv.hpp | 2 +-
 .../fluid/modules/gapi/src/api/garray.cpp | 2 +-
 .../fluid/modules/gapi/src/api/gbackend.cpp | 2 +-
 .../fluid/modules/gapi/src/api/gbackend_priv.hpp | 2 +-
 .../fluid/modules/gapi/src/api/gcall.cpp | 9 +-
 .../fluid/modules/gapi/src/api/gcall_priv.hpp | 22 +-
 .../fluid/modules/gapi/src/api/gcomputation.cpp | 2 +-
 .../modules/gapi/src/api/gcomputation_priv.hpp | 2 +-
 .../fluid/modules/gapi/src/api/gkernel.cpp | 2 +-
 .../thirdparty/fluid/modules/gapi/src/api/gmat.cpp | 31 +-
 .../fluid/modules/gapi/src/api/gnode.cpp | 2 +-
 .../fluid/modules/gapi/src/api/gnode.hpp | 2 +-
 .../fluid/modules/gapi/src/api/gnode_priv.hpp | 2 +-
 .../fluid/modules/gapi/src/api/gproto.cpp | 2 +-
 .../fluid/modules/gapi/src/api/gproto_priv.hpp | 2 +-
 .../fluid/modules/gapi/src/api/gscalar.cpp | 2 +-
 .../fluid/modules/gapi/src/api/kernels_core.cpp | 2 +-
 .../fluid/modules/gapi/src/api/kernels_imgproc.cpp | 2 +-
 .../fluid/modules/gapi/src/api/operators.cpp | 2 +-
 .../modules/gapi/src/backends/common/gbackend.hpp | 2 +-
 .../gapi/src/backends/common/gcompoundbackend.cpp | 2 +-
 .../gapi/src/backends/common/gcompoundkernel.cpp | 2 +-
 .../modules/gapi/src/backends/cpu/gcpubackend.cpp | 2 +-
 .../modules/gapi/src/backends/cpu/gcpubackend.hpp | 2 +-
 .../modules/gapi/src/backends/cpu/gcpucore.cpp | 2 +-
 .../modules/gapi/src/backends/cpu/gcpucore.hpp | 2 +-
 .../modules/gapi/src/backends/cpu/gcpuimgproc.cpp | 2 +-
 .../modules/gapi/src/backends/cpu/gcpuimgproc.hpp | 2 +-
 .../modules/gapi/src/backends/cpu/gcpukernel.cpp | 2 +-
 .../gapi/src/backends/fluid/gfluidbackend.cpp | 56 +-
 .../gapi/src/backends/fluid/gfluidbackend.hpp | 2 +-
 .../gapi/src/backends/fluid/gfluidbuffer.cpp | 2 +-
 .../gapi/src/backends/fluid/gfluidbuffer_priv.hpp | 2 +-
 .../modules/gapi/src/backends/fluid/gfluidcore.cpp | 6 +-
 .../gapi/src/backends/fluid/gfluidimgproc.cpp | 472 +-
 .../backends/fluid/gfluidimgproc_func.dispatch.cpp | 112 +-
 .../gapi/src/backends/fluid/gfluidimgproc_func.hpp | 102 +-
 .../src/backends/fluid/gfluidimgproc_func.simd.hpp | 1561 +-
 .../gapi/src/backends/fluid/gfluidutils.hpp | 4 +-
 .../modules/gapi/src/backends/gpu/ggpubackend.cpp | 2 +-
 .../modules/gapi/src/backends/gpu/ggpubackend.hpp | 2 +-
 .../modules/gapi/src/backends/gpu/ggpucore.cpp | 2 +-
 .../modules/gapi/src/backends/gpu/ggpucore.hpp | 2 +-
 .../modules/gapi/src/backends/gpu/ggpuimgproc.cpp | 2 +-
 .../modules/gapi/src/backends/gpu/ggpuimgproc.hpp | 2 +-
 .../modules/gapi/src/backends/gpu/ggpukernel.cpp | 2 +-
 .../modules/gapi/src/backends/ocl/goclbackend.cpp | 226 +
 .../modules/gapi/src/backends/ocl/goclbackend.hpp | 72 +
 .../modules/gapi/src/backends/ocl/goclcore.cpp | 582 +
 .../modules/gapi/src/backends/ocl/goclcore.hpp | 24 +
 .../modules/gapi/src/backends/ocl/goclimgproc.cpp | 277 +
 .../modules/gapi/src/backends/ocl/goclimgproc.hpp | 23 +
 .../modules/gapi/src/backends/ocl/goclkernel.cpp | 50 +
 .../fluid/modules/gapi/src/compiler/gcompiled.cpp | 2 +-
 .../modules/gapi/src/compiler/gcompiled_priv.hpp | 2 +-
 .../fluid/modules/gapi/src/compiler/gcompiler.cpp | 8 +-
 .../fluid/modules/gapi/src/compiler/gcompiler.hpp | 2 +-
 .../modules/gapi/src/compiler/gislandmodel.cpp | 2 +-
 .../modules/gapi/src/compiler/gislandmodel.hpp | 2 +-
 .../fluid/modules/gapi/src/compiler/gmodel.cpp | 4 +-
 .../fluid/modules/gapi/src/compiler/gmodel.hpp | 3 +-
 .../modules/gapi/src/compiler/gmodelbuilder.cpp | 4 +-
 .../modules/gapi/src/compiler/gmodelbuilder.hpp | 2 +-
 .../fluid/modules/gapi/src/compiler/gobjref.hpp | 2 +-
 .../modules/gapi/src/compiler/passes/dump_dot.cpp | 2 +-
 .../modules/gapi/src/compiler/passes/exec.cpp | 2 +-
 .../modules/gapi/src/compiler/passes/helpers.cpp | 2 +-
 .../modules/gapi/src/compiler/passes/helpers.hpp | 2 +-
 .../modules/gapi/src/compiler/passes/islands.cpp | 2 +-
 .../modules/gapi/src/compiler/passes/kernels.cpp | 2 +-
 .../modules/gapi/src/compiler/passes/meta.cpp | 2 +-
 .../modules/gapi/src/compiler/passes/passes.hpp | 2 +-
 .../modules/gapi/src/compiler/transactions.hpp | 2 +-
 .../fluid/modules/gapi/src/executor/gexecutor.cpp | 2 +-
 .../fluid/modules/gapi/src/executor/gexecutor.hpp | 2 +-
 .../thirdparty/fluid/modules/gapi/src/logger.hpp | 2 +-
 .../thirdparty/fluid/modules/gapi/src/precomp.hpp | 2 +-
 .../gapi/test/common/gapi_compoundkernel_tests.cpp | 2 +-
 .../modules/gapi/test/common/gapi_core_tests.cpp | 2 +-
 .../modules/gapi/test/common/gapi_core_tests.hpp | 8 +-
 .../gapi/test/common/gapi_core_tests_inl.hpp | 60 +-
 .../gapi/test/common/gapi_imgproc_tests.cpp | 2 +-
 .../gapi/test/common/gapi_imgproc_tests.hpp | 2 +-
 .../gapi/test/common/gapi_imgproc_tests_inl.hpp | 2 +-
 .../gapi/test/common/gapi_operators_tests.cpp | 2 +-
 .../gapi/test/common/gapi_operators_tests.hpp | 2 +-
 .../gapi/test/common/gapi_operators_tests_inl.hpp | 2 +-
 .../modules/gapi/test/common/gapi_tests_common.hpp | 49 +-
 .../modules/gapi/test/cpu/gapi_core_tests_cpu.cpp | 13 +-
 .../gapi/test/cpu/gapi_core_tests_fluid.cpp | 5 +-
 .../gapi/test/cpu/gapi_imgproc_tests_cpu.cpp | 2 +-
 .../gapi/test/cpu/gapi_imgproc_tests_fluid.cpp | 2 +-
 .../gapi/test/cpu/gapi_operators_tests_cpu.cpp | 2 +-
 .../gapi/test/cpu/gapi_operators_tests_fluid.cpp | 2 +-
 .../fluid/modules/gapi/test/gapi_array_tests.cpp | 2 +-
 .../modules/gapi/test/gapi_basic_hetero_tests.cpp | 2 +-
 .../fluid/modules/gapi/test/gapi_desc_tests.cpp | 39 +-
 .../modules/gapi/test/gapi_fluid_resize_test.cpp | 2 +-
 .../modules/gapi/test/gapi_fluid_roi_test.cpp | 2 +-
 .../fluid/modules/gapi/test/gapi_fluid_test.cpp | 2 +-
 .../modules/gapi/test/gapi_fluid_test_kernels.cpp | 2 +-
 .../modules/gapi/test/gapi_fluid_test_kernels.hpp | 2 +-
 .../modules/gapi/test/gapi_gcompiled_tests.cpp | 2 +-
 .../modules/gapi/test/gapi_gcomputation_tests.cpp | 2 +-
 .../fluid/modules/gapi/test/gapi_gpu_test.cpp | 207 +
 .../fluid/modules/gapi/test/gapi_kernel_tests.cpp | 2 +-
 .../fluid/modules/gapi/test/gapi_mock_kernels.hpp | 2 +-
 .../modules/gapi/test/gapi_sample_pipelines.cpp | 2 +-
 .../fluid/modules/gapi/test/gapi_scalar_tests.cpp | 2 +-
 .../fluid/modules/gapi/test/gapi_smoke_test.cpp | 2 +-
 .../fluid/modules/gapi/test/gapi_typed_tests.cpp | 2 +-
 .../fluid/modules/gapi/test/gapi_util_tests.cpp | 2 +-
 .../modules/gapi/test/gpu/gapi_core_tests_gpu.cpp | 12 +-
 .../gapi/test/gpu/gapi_imgproc_tests_gpu.cpp | 19 +-
 .../gapi/test/gpu/gapi_operators_tests_gpu.cpp | 3 +-
 .../gapi/test/internal/gapi_int_backend_tests.cpp | 2 +-
 .../gapi/test/internal/gapi_int_executor_tests.cpp | 2 +-
 .../gapi/test/internal/gapi_int_garg_test.cpp | 2 +-
 .../gapi/test/internal/gapi_int_gmetaarg_test.cpp | 2 +-
 .../test/internal/gapi_int_gmodel_builder_test.cpp | 2 +-
 .../test/internal/gapi_int_island_fusion_tests.cpp | 2 +-
 .../gapi/test/internal/gapi_int_island_tests.cpp | 2 +-
 .../test/internal/gapi_int_recompilation_test.cpp | 2 +-
 .../test/internal/gapi_int_resolve_kernel_test.cpp | 2 +-
 .../gapi/test/internal/gapi_int_vectorref_test.cpp | 2 +-
 .../gapi/test/internal/gapi_transactions_test.cpp | 2 +-
 .../modules/gapi/test/opencl_kernels_test_gapi.hpp | 260 +
 .../modules/gapi/test/own/gapi_types_tests.cpp | 2 +-
 .../fluid/modules/gapi/test/own/mat_tests.cpp | 2 +-
 .../fluid/modules/gapi/test/own/scalar_tests.cpp | 2 +-
 .../fluid/modules/gapi/test/test_main.cpp | 2 +-
 .../fluid/modules/gapi/test/test_precomp.hpp | 4 +-
 .../fluid/modules/gapi/test/util/any_tests.cpp | 2 +-
 .../modules/gapi/test/util/optional_tests.cpp | 2 +-
 .../fluid/modules/gapi/test/util/variant_tests.cpp | 2 +-
 inference-engine/thirdparty/fluid/revision.txt | 2 +-
 inference-engine/thirdparty/mkl-dnn/CMakeLists.txt | 13 +-
 inference-engine/thirdparty/mkl-dnn/LICENSE | 14 +
 inference-engine/thirdparty/mkl-dnn/README.md | 443 +-
 .../thirdparty/mkl-dnn/cmake/Doxygen.cmake | 6 +-
 .../thirdparty/mkl-dnn/cmake/MKL.cmake | 49 +-
 .../thirdparty/mkl-dnn/cmake/OpenMP.cmake | 58 +-
 .../thirdparty/mkl-dnn/cmake/SDL.cmake | 31 +-
 .../thirdparty/mkl-dnn/cmake/TBB.cmake | 3 +-
 .../thirdparty/mkl-dnn/cmake/config.cmake.in | 6 +
 .../thirdparty/mkl-dnn/cmake/options.cmake | 18 +-
 .../thirdparty/mkl-dnn/cmake/platform.cmake | 12 +-
 .../thirdparty/mkl-dnn/cmake/profiling.cmake | 2 +-
 .../thirdparty/mkl-dnn/cmake/template.vcxproj.user | 7 +
 .../thirdparty/mkl-dnn/cmake/utils.cmake | 61 +-
 .../thirdparty/mkl-dnn/cmake/version.cmake | 46 +
 .../thirdparty/mkl-dnn/doc/Doxyfile.in | 2 +-
 .../thirdparty/mkl-dnn/doc/ex_simplenet.md | 2 +-
 .../thirdparty/mkl-dnn/doc/mainpage.md | 31 +-
 .../thirdparty/mkl-dnn/doc/perf_profile.md | 59 +-
 .../thirdparty/mkl-dnn/doc/winograd_convolution.md | 93 +
 .../thirdparty/mkl-dnn/examples/CMakeLists.txt | 10 +-
 .../thirdparty/mkl-dnn/examples/simple_net.c | 24 +-
 .../thirdparty/mkl-dnn/examples/simple_net.cpp | 16 +-
 .../mkl-dnn/examples/simple_net_int8.cpp | 2 +-
 .../thirdparty/mkl-dnn/examples/simple_rnn.cpp | 272 +-
 .../mkl-dnn/examples/simple_rnn_int8.cpp | 709 +
 .../mkl-dnn/examples/simple_rnn_training.cpp | 30 +-
 .../mkl-dnn/examples/simple_training_net.c | 30 +-
 .../mkl-dnn/examples/simple_training_net.cpp | 4 +-
 .../thirdparty/mkl-dnn/include/mkldnn.h | 742 +-
 .../thirdparty/mkl-dnn/include/mkldnn.hpp | 366 +-
 .../thirdparty/mkl-dnn/include/mkldnn_debug.h | 1 +
 .../thirdparty/mkl-dnn/include/mkldnn_types.h | 354 +-
 .../thirdparty/mkl-dnn/include/mkldnn_version.h.in | 32 +
 .../mkl-dnn/scripts/generate_mkldnn_debug.py | 2 +-
 .../thirdparty/mkl-dnn/scripts/prepare_mkl.bat | 4 +-
 .../thirdparty/mkl-dnn/scripts/prepare_mkl.sh | 4 +-
 .../thirdparty/mkl-dnn/src/CMakeLists.txt | 87 +-
 .../mkl-dnn/src/common/batch_normalization_pd.hpp | 2 +-
 .../thirdparty/mkl-dnn/src/common/binarization.cpp | 66 +
 .../mkl-dnn/src/common/binarization_pd.hpp | 89 +
 .../mkl-dnn/src/common/binary_convolution.cpp | 120 +
 .../mkl-dnn/src/common/binary_convolution_pd.hpp | 153 +
 .../thirdparty/mkl-dnn/src/common/c_types_map.hpp | 64 +-
 .../thirdparty/mkl-dnn/src/common/convolution.cpp | 2 +-
 .../mkl-dnn/src/common/convolution_pd.cpp | 56 +
 .../mkl-dnn/src/common/convolution_pd.hpp | 123 +-
 .../mkl-dnn/src/common/convolution_relu.cpp | 43 -
 .../mkl-dnn/src/common/deconvolution_pd.hpp | 11 +-
 .../thirdparty/mkl-dnn/src/common/depthwise.cpp | 4 +-
 .../thirdparty/mkl-dnn/src/common/eltwise.cpp | 18 +-
 .../thirdparty/mkl-dnn/src/common/eltwise_pd.hpp | 6 +-
 .../mkl-dnn/src/common/format_traits.hpp | 72 +-
 .../thirdparty/mkl-dnn/src/common/math_utils.hpp | 196 +-
 .../thirdparty/mkl-dnn/src/common/memory.cpp | 5 +-
 .../mkl-dnn/src/common/memory_desc_wrapper.cpp | 338 +-
 .../mkl-dnn/src/common/memory_desc_wrapper.hpp | 52 +-
 .../mkl-dnn/src/common/memory_tracking.hpp | 297 +
 .../thirdparty/mkl-dnn/src/common/mkldnn_debug.cpp | 57 +-
 .../mkl-dnn/src/common/mkldnn_thread.hpp | 7 +
 .../src/common/mkldnn_thread_parallel_nd.hpp | 23 +-
 .../mkl-dnn/src/common/mkldnn_traits.hpp | 4 +-
 .../thirdparty/mkl-dnn/src/common/nstl.hpp | 4 +-
 .../thirdparty/mkl-dnn/src/common/primitive.hpp | 6 +-
 .../mkl-dnn/src/common/primitive_attr.cpp | 68 +-
 .../mkl-dnn/src/common/primitive_attr.hpp | 57 +-
 .../mkl-dnn/src/common/primitive_desc.cpp | 3 +
 .../mkl-dnn/src/common/primitive_desc.hpp | 8 +
 .../thirdparty/mkl-dnn/src/common/rnn.cpp | 275 +-
 .../thirdparty/mkl-dnn/src/common/rnn_pd.hpp | 253 +-
 .../thirdparty/mkl-dnn/src/common/roi_pooling.cpp | 2 +-
 .../thirdparty/mkl-dnn/src/common/scratchpad.cpp | 12 +-
 .../thirdparty/mkl-dnn/src/common/softmax_pd.hpp | 6 +-
 .../thirdparty/mkl-dnn/src/common/type_helpers.hpp | 62 +-
 .../thirdparty/mkl-dnn/src/common/utils.cpp | 42 +-
 .../thirdparty/mkl-dnn/src/common/utils.hpp | 36 +-
 .../thirdparty/mkl-dnn/src/common/verbose.cpp | 63 +-
 .../thirdparty/mkl-dnn/src/common/verbose.hpp | 246 +-
 .../thirdparty/mkl-dnn/src/common/z_magic.hpp | 9 +-
 .../src/cpu/cpu_batch_normalization_utils.cpp | 198 +-
 .../src/cpu/cpu_batch_normalization_utils.hpp | 24 +-
 .../mkl-dnn/src/cpu/cpu_binarization_pd.hpp | 86 +
 .../mkl-dnn/src/cpu/cpu_binary_convolution_pd.hpp | 91 +
 .../thirdparty/mkl-dnn/src/cpu/cpu_concat.hpp | 2 +-
 .../mkl-dnn/src/cpu/cpu_convolution_pd.hpp | 55 +-
 .../mkl-dnn/src/cpu/cpu_deconvolution_pd.hpp | 32 +
 .../thirdparty/mkl-dnn/src/cpu/cpu_engine.cpp | 134 +-
 .../thirdparty/mkl-dnn/src/cpu/cpu_memory.cpp | 37 +-
 .../thirdparty/mkl-dnn/src/cpu/cpu_memory.hpp | 23 +-
 .../cpu/cpu_primitive.cpp} | 27 +-
 .../thirdparty/mkl-dnn/src/cpu/cpu_primitive.hpp | 36 +-
 .../thirdparty/mkl-dnn/src/cpu/cpu_reducer.cpp | 230 +-
 .../thirdparty/mkl-dnn/src/cpu/cpu_reducer.hpp | 171 +-
 .../thirdparty/mkl-dnn/src/cpu/cpu_reorder.cpp | 62 +-
 .../thirdparty/mkl-dnn/src/cpu/cpu_reorder_pd.hpp | 2 +-
 .../thirdparty/mkl-dnn/src/cpu/cpu_sum.hpp | 2 +-
 .../{gemm_utils.cpp => f32/gemm_utils_f32.cpp} | 20 +-
 .../{gemm_utils.hpp => f32/gemm_utils_f32.hpp} | 11 +-
 .../gemm/{ => f32}/jit_avx512_common_gemm_f32.cpp | 282 +-
 .../cpu/gemm/f32/jit_avx512_common_gemm_f32.hpp | 36 +
 .../src/cpu/gemm/{ => f32}/jit_avx_gemm_f32.cpp | 286 +-
 .../cpu/gemm/f32/jit_avx_gemm_f32.hpp} | 29 +-
 .../gemm/{ref_gemm.cpp => f32/ref_gemm_f32.cpp} | 105 +-
 .../mkl-dnn/src/cpu/gemm/f32/ref_gemm_f32.hpp | 36 +
 .../thirdparty/mkl-dnn/src/cpu/gemm/gemm.cpp | 265 +-
 .../thirdparty/mkl-dnn/src/cpu/gemm/gemm.hpp | 21 +-
 .../src/cpu/gemm/jit_avx512_common_gemm_f32.hpp | 58 -
 .../mkl-dnn/src/cpu/gemm/jit_avx_gemm_f32.hpp | 58 -
 .../thirdparty/mkl-dnn/src/cpu/gemm/os_blas.hpp | 2 +-
 .../mkl-dnn/src/cpu/gemm/s8x8s32/common.hpp | 206 +
 .../mkl-dnn/src/cpu/gemm/s8x8s32/gemv.hpp | 28 +
 .../gemm/s8x8s32/jit_avx512_core_gemm_s8s8s32.cpp | 155 +
 .../gemm/s8x8s32/jit_avx512_core_gemm_s8s8s32.hpp | 37 +
 .../gemm/s8x8s32/jit_avx512_core_gemm_s8u8s32.cpp | 1409 +
 .../gemm/s8x8s32/jit_avx512_core_gemm_s8u8s32.hpp | 38 +
 .../s8x8s32/jit_avx512_core_gemm_s8u8s32_kern.cpp | 539 +
 .../s8x8s32/jit_avx512_core_gemm_s8u8s32_kern.hpp | 101 +
 .../gemm/s8x8s32/jit_avx512_core_gemv_s8u8s32.cpp | 290 +
 .../jit_avx512_core_kernel_gemv_s8u8s32_kern.cpp | 411 +
 .../jit_avx512_core_kernel_gemv_s8u8s32_kern.hpp | 64 +
 .../s8x8s32/jit_avx512_core_u8_copy_an_kern.cpp | 819 +
 .../s8x8s32/jit_avx512_core_u8_copy_at_kern.cpp | 2209 +
 .../s8x8s32/jit_avx512_core_u8_copy_bn_kern.cpp | 564 +
 .../s8x8s32/jit_avx512_core_u8_copy_bt_kern.cpp | 501 +
 .../jit_avx512_core_u8_copy_sum_an_kern.cpp | 1283 +
 .../jit_avx512_core_u8_copy_sum_at_kern.cpp | 3163 ++
 .../jit_avx512_core_u8_copy_sum_bn_kern.cpp | 821 +
 .../jit_avx512_core_u8_copy_sum_bt_kern.cpp | 647 +
 .../src/cpu/gemm/s8x8s32/ref_gemm_s8x8s32.cpp | 116 +
 .../src/cpu/gemm/s8x8s32/ref_gemm_s8x8s32.hpp | 39 +
 .../mkl-dnn/src/cpu/gemm_convolution.cpp | 161 +-
 .../mkl-dnn/src/cpu/gemm_convolution.hpp | 293 +-
 .../mkl-dnn/src/cpu/gemm_convolution_utils.cpp | 534 +-
 .../mkl-dnn/src/cpu/gemm_convolution_utils.hpp | 45 +-
 .../mkl-dnn/src/cpu/gemm_inner_product.cpp | 36 +-
 .../mkl-dnn/src/cpu/gemm_inner_product.hpp | 30 +-
 .../src/cpu/gemm_u8s8s32x_inner_product.cpp | 100 -
 .../mkl-dnn/src/cpu/gemm_x8s8s32x_convolution.cpp | 710 +-
 .../mkl-dnn/src/cpu/gemm_x8s8s32x_convolution.hpp | 212 +-
 .../src/cpu/gemm_x8s8s32x_inner_product.cpp | 461 +
 ...product.hpp => gemm_x8s8s32x_inner_product.hpp} | 116 +-
 .../src/cpu/jit_avx2_1x1_conv_kernel_f32.cpp | 123 +-
 .../src/cpu/jit_avx2_1x1_conv_kernel_f32.hpp | 26 +-
 .../mkl-dnn/src/cpu/jit_avx2_1x1_convolution.cpp | 284 +-
 .../mkl-dnn/src/cpu/jit_avx2_1x1_convolution.hpp | 274 +-
 .../mkl-dnn/src/cpu/jit_avx2_conv_kernel_f32.cpp | 445 +-
 .../mkl-dnn/src/cpu/jit_avx2_conv_kernel_f32.hpp | 67 +-
 .../mkl-dnn/src/cpu/jit_avx2_convolution.cpp | 293 +-
 .../mkl-dnn/src/cpu/jit_avx2_convolution.hpp | 244 +-
 .../src/cpu/jit_avx512_common_1x1_conv_kernel.cpp | 145 +-
 .../src/cpu/jit_avx512_common_1x1_conv_kernel.hpp | 34 +-
 .../src/cpu/jit_avx512_common_1x1_convolution.cpp | 283 +-
 .../src/cpu/jit_avx512_common_1x1_convolution.hpp | 259 +-
 .../src/cpu/jit_avx512_common_conv_kernel.cpp | 833 +-
 .../src/cpu/jit_avx512_common_conv_kernel.hpp | 136 +-
 .../jit_avx512_common_conv_winograd_kernel_f32.cpp | 431 +-
 .../jit_avx512_common_conv_winograd_kernel_f32.hpp | 3 +-
 .../src/cpu/jit_avx512_common_convolution.cpp | 553 +-
 .../src/cpu/jit_avx512_common_convolution.hpp | 270 +-
 .../cpu/jit_avx512_common_convolution_winograd.cpp | 1193 +-
 .../cpu/jit_avx512_common_convolution_winograd.hpp | 399 +-
 .../mkl-dnn/src/cpu/jit_avx512_common_lrn.cpp | 54 +-
 .../mkl-dnn/src/cpu/jit_avx512_common_lrn.hpp | 16 +-
 .../src/cpu/jit_avx512_core_fp32_wino_conv_2x3.cpp | 193 +-
 .../src/cpu/jit_avx512_core_fp32_wino_conv_2x3.hpp | 99 +-
 .../src/cpu/jit_avx512_core_fp32_wino_conv_4x3.cpp | 144 +-
 .../src/cpu/jit_avx512_core_fp32_wino_conv_4x3.hpp | 348 +-
 .../jit_avx512_core_fp32_wino_conv_4x3_kernel.cpp | 104 +-
 .../jit_avx512_core_fp32_wino_conv_4x3_kernel.hpp | 5 +-
 .../src/cpu/jit_avx512_core_i8i8_pooling.cpp | 582 -
 .../cpu/jit_avx512_core_u8s8s32x_deconvolution.cpp | 602 -
 .../jit_avx512_core_u8s8s32x_wino_convolution.cpp | 409 +-
 .../jit_avx512_core_u8s8s32x_wino_convolution.hpp | 74 +-
 .../jit_avx512_core_x8s8s32x_1x1_conv_kernel.cpp | 224 +-
 .../jit_avx512_core_x8s8s32x_1x1_conv_kernel.hpp | 58 +-
 .../jit_avx512_core_x8s8s32x_1x1_convolution.cpp | 112 +-
 .../jit_avx512_core_x8s8s32x_1x1_convolution.hpp | 160 +-
 .../jit_avx512_core_x8s8s32x_1x1_deconvolution.hpp | 162 +
 .../cpu/jit_avx512_core_x8s8s32x_conv_kernel.cpp | 596 +-
 .../cpu/jit_avx512_core_x8s8s32x_conv_kernel.hpp | 233 +-
 .../cpu/jit_avx512_core_x8s8s32x_convolution.cpp | 107 +-
 .../cpu/jit_avx512_core_x8s8s32x_convolution.hpp | 104 +-
 .../cpu/jit_avx512_core_x8s8s32x_deconvolution.cpp | 928 +
 ... => jit_avx512_core_x8s8s32x_deconvolution.hpp} | 100 +-
 .../thirdparty/mkl-dnn/src/cpu/jit_generator.hpp | 212 +-
 .../mkl-dnn/src/cpu/jit_primitive_conf.hpp | 146 +-
 .../src/cpu/jit_sse42_1x1_conv_kernel_f32.cpp | 107 +-
 .../src/cpu/jit_sse42_1x1_conv_kernel_f32.hpp | 23 +-
 .../mkl-dnn/src/cpu/jit_sse42_1x1_convolution.cpp | 92 +-
 .../mkl-dnn/src/cpu/jit_sse42_1x1_convolution.hpp | 109 +-
 .../mkl-dnn/src/cpu/jit_sse42_conv_kernel_f32.cpp | 103 +-
 .../mkl-dnn/src/cpu/jit_sse42_conv_kernel_f32.hpp | 13 +-
 .../mkl-dnn/src/cpu/jit_sse42_convolution.cpp | 92 +-
 .../mkl-dnn/src/cpu/jit_sse42_convolution.hpp | 111 +-
 .../mkl-dnn/src/cpu/jit_sse42_i8i8_pooling.cpp | 586 +
 ...i8i8_pooling.hpp => jit_sse42_i8i8_pooling.hpp} | 28 +-
 .../mkl-dnn/src/cpu/jit_uni_1x1_conv_utils.hpp | 70 +-
 .../src/cpu/jit_uni_batch_normalization.cpp | 299 +-
 .../src/cpu/jit_uni_batch_normalization.hpp | 106 +-
 .../mkl-dnn/src/cpu/jit_uni_bin_conv_kernel.cpp | 925 +
 .../mkl-dnn/src/cpu/jit_uni_bin_conv_kernel.hpp | 140 +
 .../mkl-dnn/src/cpu/jit_uni_binarization.cpp | 276 +
 .../mkl-dnn/src/cpu/jit_uni_binarization.hpp | 73 +
 .../mkl-dnn/src/cpu/jit_uni_binary_convolution.cpp | 251 +
 .../mkl-dnn/src/cpu/jit_uni_binary_convolution.hpp | 138 +
 .../mkl-dnn/src/cpu/jit_uni_depthwise.cpp | 660 +-
 .../mkl-dnn/src/cpu/jit_uni_depthwise.hpp | 73 +-
 .../mkl-dnn/src/cpu/jit_uni_dw_conv_kernel_f32.cpp | 728 +-
 .../mkl-dnn/src/cpu/jit_uni_dw_conv_kernel_f32.hpp | 129 +-
 .../mkl-dnn/src/cpu/jit_uni_dw_convolution.cpp | 358 +-
 .../mkl-dnn/src/cpu/jit_uni_dw_convolution.hpp | 180 +-
 .../thirdparty/mkl-dnn/src/cpu/jit_uni_eltwise.cpp | 702 +-
 .../thirdparty/mkl-dnn/src/cpu/jit_uni_eltwise.hpp | 100 +-
 .../mkl-dnn/src/cpu/jit_uni_i8i8_pooling.cpp | 939 +-
 .../mkl-dnn/src/cpu/jit_uni_i8i8_pooling.hpp | 24 +-
 .../thirdparty/mkl-dnn/src/cpu/jit_uni_lrn.cpp | 62 +-
 .../thirdparty/mkl-dnn/src/cpu/jit_uni_lrn.hpp | 16 +-
 .../src/cpu/jit_uni_planar_conv_kernel_f32.cpp | 760 +
 .../src/cpu/jit_uni_planar_conv_kernel_f32.hpp | 135 +
 .../mkl-dnn/src/cpu/jit_uni_planar_convolution.cpp | 172 +
 .../mkl-dnn/src/cpu/jit_uni_planar_convolution.hpp | 119 +
 .../thirdparty/mkl-dnn/src/cpu/jit_uni_pooling.cpp | 58 +-
 .../thirdparty/mkl-dnn/src/cpu/jit_uni_pooling.hpp | 32 +-
 .../thirdparty/mkl-dnn/src/cpu/jit_uni_reorder.cpp | 108 +-
 .../mkl-dnn/src/cpu/jit_uni_reorder_utils.cpp | 13 +
 .../mkl-dnn/src/cpu/jit_uni_roi_pooling.cpp | 12 +-
 .../mkl-dnn/src/cpu/jit_uni_roi_pooling.hpp | 12 +-
 .../thirdparty/mkl-dnn/src/cpu/jit_uni_softmax.cpp | 18 +-
 .../thirdparty/mkl-dnn/src/cpu/jit_uni_softmax.hpp | 8 +-
 .../src/cpu/jit_uni_x8s8s32x_1x1_conv_kernel.cpp | 507 -
 .../src/cpu/jit_uni_x8s8s32x_1x1_conv_kernel.hpp | 98 -
 .../src/cpu/jit_uni_x8s8s32x_1x1_convolution.cpp | 147 -
 .../src/cpu/jit_uni_x8s8s32x_1x1_convolution.hpp | 140 -
 .../src/cpu/jit_uni_x8s8s32x_conv_kernel.cpp | 464 +-
 .../src/cpu/jit_uni_x8s8s32x_conv_kernel.hpp | 48 +-
 .../src/cpu/jit_uni_x8s8s32x_convolution.cpp | 278 +-
 .../src/cpu/jit_uni_x8s8s32x_convolution.hpp | 99 +-
 .../src/cpu/jit_uni_x8s8s32x_dw_conv_kernel.cpp | 243 +-
 .../src/cpu/jit_uni_x8s8s32x_dw_conv_kernel.hpp | 28 +-
 .../src/cpu/jit_uni_x8s8s32x_dw_convolution.cpp | 57 +-
 .../src/cpu/jit_uni_x8s8s32x_dw_convolution.hpp | 53 +-
 .../thirdparty/mkl-dnn/src/cpu/nchw_pooling.cpp | 108 +-
 .../thirdparty/mkl-dnn/src/cpu/nchw_pooling.hpp | 20 +-
 .../mkl-dnn/src/cpu/ncsp_batch_normalization.cpp | 115 +-
 .../mkl-dnn/src/cpu/ncsp_batch_normalization.hpp | 102 +-
 .../thirdparty/mkl-dnn/src/cpu/nhwc_pooling.cpp | 110 +-
 .../thirdparty/mkl-dnn/src/cpu/nhwc_pooling.hpp | 28 +-
 .../mkl-dnn/src/cpu/nspc_batch_normalization.cpp | 118 +-
 .../mkl-dnn/src/cpu/nspc_batch_normalization.hpp | 97 +-
 .../mkl-dnn/src/cpu/ref_batch_normalization.cpp | 76 +-
 .../mkl-dnn/src/cpu/ref_batch_normalization.hpp | 20 +-
 .../mkl-dnn/src/cpu/ref_binarization.cpp | 86 +
 .../mkl-dnn/src/cpu/ref_binarization.hpp | 78 +
 .../mkl-dnn/src/cpu/ref_binary_convolution.cpp | 284 +
 .../mkl-dnn/src/cpu/ref_binary_convolution.hpp | 151 +
 .../thirdparty/mkl-dnn/src/cpu/ref_concat.hpp | 12 +-
 .../thirdparty/mkl-dnn/src/cpu/ref_convolution.cpp | 258 +-
 .../thirdparty/mkl-dnn/src/cpu/ref_convolution.hpp | 87 +-
 .../mkl-dnn/src/cpu/ref_deconvolution.cpp | 84 +-
 .../mkl-dnn/src/cpu/ref_deconvolution.hpp | 95 +-
 .../thirdparty/mkl-dnn/src/cpu/ref_depthwise.cpp | 20 +-
 .../thirdparty/mkl-dnn/src/cpu/ref_depthwise.hpp | 10 +-
 .../thirdparty/mkl-dnn/src/cpu/ref_eltwise.cpp | 137 +-
 .../thirdparty/mkl-dnn/src/cpu/ref_eltwise.hpp | 48 +-
 .../mkl-dnn/src/cpu/ref_inner_product.cpp | 111 +-
 .../mkl-dnn/src/cpu/ref_inner_product.hpp | 36 +-
 .../thirdparty/mkl-dnn/src/cpu/ref_lrn.cpp | 66 +-
 .../thirdparty/mkl-dnn/src/cpu/ref_lrn.hpp | 24 +-
 .../thirdparty/mkl-dnn/src/cpu/ref_pooling.cpp | 110 +-
 .../thirdparty/mkl-dnn/src/cpu/ref_pooling.hpp | 20 +-
 .../thirdparty/mkl-dnn/src/cpu/ref_rnn.cpp | 1192 -
 .../thirdparty/mkl-dnn/src/cpu/ref_rnn.hpp | 440 -
 .../thirdparty/mkl-dnn/src/cpu/ref_roi_pooling.cpp | 22 +-
 .../thirdparty/mkl-dnn/src/cpu/ref_roi_pooling.hpp | 12 +-
 .../thirdparty/mkl-dnn/src/cpu/ref_shuffle.cpp | 60 +-
 .../thirdparty/mkl-dnn/src/cpu/ref_shuffle.hpp | 20 +-
 .../thirdparty/mkl-dnn/src/cpu/ref_softmax.cpp | 118 +-
 .../thirdparty/mkl-dnn/src/cpu/ref_softmax.hpp | 99 +-
 .../thirdparty/mkl-dnn/src/cpu/ref_sum.hpp | 12 +-
 .../thirdparty/mkl-dnn/src/cpu/rnn/cell_common.cpp | 90 +
 .../thirdparty/mkl-dnn/src/cpu/rnn/cell_gru.cpp | 180 +
 .../mkl-dnn/src/cpu/rnn/cell_gru_lbr.cpp | 170 +
 .../thirdparty/mkl-dnn/src/cpu/rnn/cell_lstm.cpp | 147 +
 .../thirdparty/mkl-dnn/src/cpu/rnn/cell_rnn.cpp | 113 +
 .../mkl-dnn/src/cpu/{ => rnn}/cpu_rnn_pd.hpp | 115 +-
 .../mkl-dnn/src/cpu/rnn/jit_uni_rnn_postgemm.hpp | 424 +
 .../thirdparty/mkl-dnn/src/cpu/rnn/ref_rnn.cpp | 807 +
 .../thirdparty/mkl-dnn/src/cpu/rnn/ref_rnn.hpp | 335 +
 .../mkl-dnn/src/cpu/rnn/rnn_reorders.hpp | 396 +
 .../thirdparty/mkl-dnn/src/cpu/rnn/rnn_utils.cpp | 400 +
 .../thirdparty/mkl-dnn/src/cpu/rnn/rnn_utils.hpp | 224 +
 .../thirdparty/mkl-dnn/src/cpu/simple_concat.cpp | 93 +-
 .../thirdparty/mkl-dnn/src/cpu/simple_concat.hpp | 178 +-
 .../thirdparty/mkl-dnn/src/cpu/simple_reorder.hpp | 377 +-
 .../thirdparty/mkl-dnn/src/cpu/simple_sum.cpp | 10 +-
 .../thirdparty/mkl-dnn/src/cpu/simple_sum.hpp | 10 +-
 .../thirdparty/mkl-dnn/src/cpu/wino_reorder.hpp | 138 +-
 .../thirdparty/mkl-dnn/src/cpu/xbyak/xbyak.h | 193 +-
 .../mkl-dnn/src/cpu/xbyak/xbyak_bin2hex.h | 2 +-
 .../mkl-dnn/src/cpu/xbyak/xbyak_mnemonic.h | 118 +-
 .../thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_util.h | 216 +-
 .../thirdparty/mkl-dnn/tests/CMakeLists.txt | 4 +-
 inference-engine/thirdparty/mkl-dnn/tests/api.c | 16 +-
 .../mkl-dnn/tests/benchdnn/CMakeLists.txt | 9 +-
 .../thirdparty/mkl-dnn/tests/benchdnn/README.md | 702 +-
 .../mkl-dnn/tests/benchdnn/bnorm/bench_bnorm.cpp | 2 +-
 .../mkl-dnn/tests/benchdnn/bnorm/bnorm.cpp | 17 +-
 .../mkl-dnn/tests/benchdnn/bnorm/perf_report.cpp | 4 +-
 .../mkl-dnn/tests/benchdnn/conv/bench_conv.cpp | 6 +-
 .../mkl-dnn/tests/benchdnn/conv/bench_deconv.cpp | 4 +-
 .../thirdparty/mkl-dnn/tests/benchdnn/conv/cfg.cpp | 23 +
 .../mkl-dnn/tests/benchdnn/conv/conv.cpp | 167 +-
 .../mkl-dnn/tests/benchdnn/conv/conv_aux.cpp | 83 +-
 .../mkl-dnn/tests/benchdnn/conv/conv_common.hpp | 24 +-
 .../mkl-dnn/tests/benchdnn/conv/deconv.cpp | 66 +-
 .../mkl-dnn/tests/benchdnn/conv/ref_conv.cpp | 72 +-
 .../mkl-dnn/tests/benchdnn/conv/ref_wino.cpp | 4 -
 .../mkl-dnn/tests/benchdnn/dnn_types.cpp | 88 +-
 .../mkl-dnn/tests/benchdnn/dnn_types.hpp | 8 +-
 .../mkl-dnn/tests/benchdnn/inputs/conv_auto | 2 +
 .../mkl-dnn/tests/benchdnn/inputs/conv_fastrcnn_p1 | 18 +-
 .../mkl-dnn/tests/benchdnn/inputs/conv_fastrcnn_p2 | 10 +-
 .../tests/benchdnn/inputs/conv_googlenet_v1 | 14 +-
 .../tests/benchdnn/inputs/conv_googlenet_v2 | 32 +-
 .../tests/benchdnn/inputs/conv_googlenet_v3 | 40 +-
 .../mkl-dnn/tests/benchdnn/inputs/conv_maskrcnn_p1 | 24 +-
 .../mkl-dnn/tests/benchdnn/inputs/conv_maskrcnn_p2 | 2 +-
 .../mkl-dnn/tests/benchdnn/inputs/conv_mobilenet | 22 +-
 .../tests/benchdnn/inputs/conv_mobilenet_dw | 1 -
 .../tests/benchdnn/inputs/conv_regression_gemm | 6 +
 .../mkl-dnn/tests/benchdnn/inputs/conv_resnet_50 | 24 +-
 .../tests/benchdnn/inputs/conv_ssd_mobilenet | 11 +
 .../mkl-dnn/tests/benchdnn/inputs/conv_tails | 2 +
 .../mkl-dnn/tests/benchdnn/inputs/conv_vgg_19 | 6 +-
 .../mkl-dnn/tests/benchdnn/inputs/conv_yolov2 | 28 +-
 .../tests/benchdnn/inputs/deconv/deconv_1x1 | 33 +
 .../tests/benchdnn/inputs/{ => deconv}/deconv_2d | 6 +
 .../tests/benchdnn/inputs/{ => deconv}/deconv_3d | 0
 .../tests/benchdnn/inputs/{ => deconv}/deconv_all | 2 +-
 .../{dilated_deconv => deconv/deconv_dilated} | 0
 .../tests/benchdnn/inputs/deconv/test_deconv_1x1 | 24 +
 .../tests/benchdnn/inputs/deconv/test_deconv_all | 30 +
 .../mkl-dnn/tests/benchdnn/inputs/deepbench | 2 +-
 .../mkl-dnn/tests/benchdnn/inputs/ip/ip_all | 11 +-
 .../mkl-dnn/tests/benchdnn/inputs/ip/test_ip_all | 4 +
 .../tests/benchdnn/inputs/reorder/test_default | 3 +
 .../mkl-dnn/tests/benchdnn/inputs/rnn/rnn_gru | 2 +-
 .../tests/benchdnn/inputs/rnn/rnn_inference | 3 +
 .../mkl-dnn/tests/benchdnn/inputs/rnn/rnn_training | 1 +
 .../tests/benchdnn/inputs/rnn/test_rnn_small | 35 +-
 .../mkl-dnn/tests/benchdnn/inputs/test_conv_all | 20 +-
 .../mkl-dnn/tests/benchdnn/inputs/test_conv_attrs | 35 +-
 .../tests/benchdnn/inputs/test_conv_depthwise | 9 +-
 .../tests/benchdnn/inputs/test_conv_regression | 5 +-
 .../benchdnn/inputs/test_conv_regression_general | 10 +
 .../mkl-dnn/tests/benchdnn/inputs/test_deconv_all | 26 -
 .../thirdparty/mkl-dnn/tests/benchdnn/ip/cfg.cpp | 40 +
 .../thirdparty/mkl-dnn/tests/benchdnn/ip/ip.cpp | 7 -
 .../mkl-dnn/tests/benchdnn/mkldnn_debug.cpp | 4 +
 .../mkl-dnn/tests/benchdnn/mkldnn_memory.hpp | 7 +-
 .../mkl-dnn/tests/benchdnn/reorder/reorder.cpp | 19 +-
 .../mkl-dnn/tests/benchdnn/reorder/reorder.hpp | 2 +-
 .../mkl-dnn/tests/benchdnn/reorder/reorder_aux.cpp | 4 +-
 .../mkl-dnn/tests/benchdnn/rnn/bench_rnn.cpp | 47 +-
 .../thirdparty/mkl-dnn/tests/benchdnn/rnn/cfg.cpp | 110 +-
 .../mkl-dnn/tests/benchdnn/rnn/perf_report.cpp | 66 +-
 .../mkl-dnn/tests/benchdnn/rnn/ref_rnn.cpp | 369 +-
 .../thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn.cpp | 157 +-
 .../thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn.hpp | 63 +-
 .../mkl-dnn/tests/benchdnn/rnn/rnn_aux.cpp | 90 +-
 .../mkl-dnn/tests/benchdnn/rnn/rnn_aux.hpp | 2 +
 .../mkl-dnn/tests/benchdnn/self/conv.cpp | 19 +-
 .../mkl-dnn/tests/benchdnn/shuffle/shuffle.cpp | 21 +-
 .../mkl-dnn/tests/generate_c_symbols_refs.sh | 9 +-
 .../thirdparty/mkl-dnn/tests/gtests/CMakeLists.txt | 21 +-
 .../mkl-dnn/tests/gtests/convolution_common.h | 109 +-
 .../tests/gtests/in/convolution_simple_small.h | 8 +-
 .../thirdparty/mkl-dnn/tests/gtests/in/gemm_in.h | 409 +-
 .../mkl-dnn/tests/gtests/mkldnn_test_common.hpp | 76 +-
 .../tests/gtests/test_batch_normalization.cpp | 8 +-
 .../mkl-dnn/tests/gtests/test_binarization.cpp | 160 +
 ...est_binary_convolution_binarization_forward.cpp | 74 +
 .../test_binary_convolution_depthwise_forward.cpp | 75 +
 ...ry_convolution_dw_conv_binarization_forward.cpp | 56 +
 ...inary_convolution_dw_conv_depthwise_forward.cpp | 46 +
 ..._binary_convolution_dw_conv_eltwise_forward.cpp | 55 +
 .../test_binary_convolution_dw_conv_forward.cpp | 61 +
 ...t_binary_convolution_dw_conv_forward_common.hpp | 528 +
 ...test_binary_convolution_dw_conv_sum_forward.cpp | 67 +
 .../test_binary_convolution_eltwise_forward.cpp | 80 +
 .../gtests/test_binary_convolution_forward.cpp | 92 +
 .../test_binary_convolution_forward_common.hpp | 352 +
 .../gtests/test_binary_convolution_sum_forward.cpp | 71 +
 .../mkl-dnn/tests/gtests/test_concat.cpp | 6 +-
 .../test_convolution_backward_data_common.hpp | 2 +-
 .../test_convolution_backward_weights_common.hpp | 2 +-
 .../test_convolution_depthwise_forward_common.hpp | 237 +
 .../test_convolution_depthwise_forward_f32.cpp | 231 +-
 ...st_convolution_depthwise_forward_x8s8f32s32.cpp | 106 +
 .../gtests/test_convolution_dw_conv_common.hpp | 81 +-
 .../tests/gtests/test_convolution_dw_conv_f32.cpp | 7 +-
 .../gtests/test_convolution_dw_conv_u8s8s32.cpp | 89 +
 .../test_convolution_eltwise_forward_common.hpp | 192 +-
 .../test_convolution_eltwise_forward_f32.cpp | 258 +-
 ...test_convolution_eltwise_forward_x8s8f32s32.cpp | 109 +
 .../gtests/test_convolution_forward_common.hpp | 2 +-
 .../gtests/test_convolution_forward_common_3d.hpp | 2 +-
 .../gtests/test_convolution_forward_f32_3d.cpp | 30 +
 .../gtests/test_convolution_forward_u8s8fp.cpp | 1 -
 .../gtests/test_convolution_forward_u8s8s32.cpp | 1 -
 ...f32.cpp => test_convolution_forward_u8s8u8.cpp} | 9 +-
 .../test_convolution_relu_forward_common.hpp | 201 -
 .../mkl-dnn/tests/gtests/test_deconvolution.cpp | 2 +-
 .../mkl-dnn/tests/gtests/test_depthwise.cpp | 16 +-
 .../mkl-dnn/tests/gtests/test_eltwise.cpp | 122 +-
 .../mkl-dnn/tests/gtests/test_gemm_common.hpp | 138 +-
 .../mkl-dnn/tests/gtests/test_memory.cpp | 17 +-
 .../mkl-dnn/tests/gtests/test_pooling_forward.cpp | 4 +-
 .../thirdparty/mkl-dnn/tests/gtests/test_relu.cpp | 249 -
 .../mkl-dnn/tests/gtests/test_reorder.cpp | 8 +-
 .../mkl-dnn/tests/gtests/test_rnn_forward.cpp | 243 +
 .../mkl-dnn/tests/gtests/test_softmax_forward.cpp | 7 +-
 .../mkl-dnn/tests/other/subproject/CMakeLists.txt | 33 +
 .../mkl-dnn/tests/other/subproject/main.c | 26 +
 inference-engine/thirdparty/mkldnn.cmake | 56 +-
 .../tools/accuracy_checker_tool/README.md | 163 +
 .../tools/accuracy_checker_tool/accuracy_check.py | 19 +
 .../accuracy_checker_tool/convert_annotation.py | 20 +
 inference-engine/tools/benchmark_tool/README.md | 16 +
 inference-engine/tools/benchmark_tool/benchmark.py | 22 +
 inference-engine/tools/calibration_tool/README.md | 149 +
 .../tools/calibration_tool/calibrate.py | 23 +
 .../tools/calibration_tool/configs/definitions.yml | 202 +
 .../calibration_tool/configs/inception_v1.yml | 29 +
 .../tools/calibration_tool/configs/ncf_config.yml | 56 +
 .../configs/ssd_mobilenet_v1_coco.yml | 40 +
 .../tools/calibration_tool/configs/unet2d.yml | 54 +
 .../tools/collect_statistics_tool/README.md | 7 +
 .../collect_statistics_tool/collect_statistics.py | 39 +
 .../extensions/back/ConvolutionReshaper.py | 10 +-
 .../extensions/back/CreateConstNodes.py | 84 +
 .../extensions/back/CreateConstNodes_test.py | 138 +
 .../extensions/back/DumpFakeQuantStat.py | 57 +
 .../extensions/back/EltwiseBroadcast.py | 8 +-
 .../extensions/back/EnableConstantStridedSlice.py | 36 +
 .../extensions/back/PackBinaryWeights.py | 58 +
 .../extensions/back/PermuteForReshape.py | 5 +-
 .../extensions/back/PermuteForReshape_test.py | 2 +-
 .../extensions/back/RNNSequenceTypeRename.py | 40 +
 model-optimizer/extensions/back/ReshapeMutation.py | 89 +
 .../extensions/back/ShufflenetReLUReorder.py | 6 +-
 .../extensions/back/ShufflenetReLUReorder_test.py | 2 +-
 model-optimizer/extensions/back/TileReshaper.py | 7 +-
 .../extensions/back/TileReshaper_test.py | 2 +-
 .../back/disable_unsupported_ND_operations.py | 6 +-
 .../back/insert_compatibility_l2normalization.py | 10 +-
 .../insert_compatibility_l2normalization_test.py | 2 +-
 .../extensions/back/kaldi_remove_memory_output.py | 20 +-
 .../back/kaldi_remove_memory_output_test.py | 25 +-
 .../extensions/back/remove_last_softmax_pattern.py | 34 +-
 .../extensions/back/remove_last_softmax_test.py | 11 +-
 model-optimizer/extensions/front/LRNReplacer.py | 5 +-
 model-optimizer/extensions/front/Pack.py | 13 +-
 .../extensions/front/caffe/accum_ext.py | 2 +-
 .../extensions/front/caffe/accum_ext_test.py | 2 +-
 .../extensions/front/caffe/argmax_ext.py | 2 +-
 .../extensions/front/caffe/argmax_ext_test.py | 2 +-
 model-optimizer/extensions/front/caffe/axpy.py | 8 +-
 .../extensions/front/caffe/axpy_test.py | 2 +-
 model-optimizer/extensions/front/caffe/bias_ext.py | 37 +
 .../extensions/front/caffe/bias_ext_test.py | 46 +
 .../extensions/front/caffe/binarization.py | 43 +
 .../extensions/front/caffe/binary_conv_ext.py | 55 +
 model-optimizer/extensions/front/caffe/bn.py | 7 +-
 model-optimizer/extensions/front/caffe/bn_test.py | 2 +-
 model-optimizer/extensions/front/caffe/conv_ext.py | 2 +-
 .../extensions/front/caffe/conv_ext_test.py | 2 +-
 .../extensions/front/caffe/correlation_ext.py | 2 +-
 .../extensions/front/caffe/correlation_ext_test.py | 2 +-
 .../extensions/front/caffe/ctcgreedydecoder_ext.py | 2 +-
 .../front/caffe/ctcgreedydecoder_ext_test.py | 2 +-
 .../front/caffe/data_augmentation_ext.py | 2 +-
 .../front/caffe/data_augmentation_ext_test.py | 2 +-
 .../extensions/front/caffe/detection_output.py | 4 +-
 .../extensions/front/caffe/flatten_ext.py | 2 +-
 model-optimizer/extensions/front/caffe/grn_ext.py | 2 +-
 .../extensions/front/caffe/grn_ext_test.py | 2 +-
 .../extensions/front/caffe/interp_ext.py | 2 +-
 .../extensions/front/caffe/interp_ext_test.py | 2 +-
 model-optimizer/extensions/front/caffe/mvn_ext.py | 2 +-
 .../extensions/front/caffe/normalize_ext.py | 2 +-
 .../extensions/front/caffe/normalize_ext_test.py | 2 +-
 .../extensions/front/caffe/pooling_ext.py | 2 +-
 .../extensions/front/caffe/pooling_ext_test.py | 2 +-
 .../extensions/front/caffe/power_file_ext.py | 2 +-
 .../extensions/front/caffe/power_file_ext_test.py | 2 +-
 .../extensions/front/caffe/prelu_ext.py | 2 +-
 .../extensions/front/caffe/prelu_ext_test.py | 2 +-
 .../front/caffe/priorbox_clustered_ext.py | 2 +-
 .../front/caffe/priorbox_clustered_ext_test.py | 2 +-
 .../extensions/front/caffe/priorbox_ext.py | 2 +-
 .../extensions/front/caffe/priorbox_ext_test.py | 2 +-
 .../extensions/front/caffe/proposal_ext.py | 2 +-
 .../extensions/front/caffe/proposal_ext_test.py | 2 +-
 .../extensions/front/caffe/proposal_python_ext.py | 2 +-
 .../front/caffe/proposal_python_ext_test.py | 2 +-
 .../extensions/front/caffe/psroipooling_ext.py | 2 +-
 .../front/caffe/psroipooling_ext_test.py | 2 +-
 .../extensions/front/caffe/regionyolo_ext.py | 2 +-
 .../extensions/front/caffe/regionyolo_ext_test.py | 2 +-
 .../extensions/front/caffe/reorgyolo_ext.py | 2 +-
 .../extensions/front/caffe/reorgyolo_ext_test.py | 2 +-
 .../extensions/front/caffe/resample_ext.py | 2 +-
 .../extensions/front/caffe/resample_ext_test.py | 2 +-
 .../extensions/front/caffe/shufflechannel_ext.py | 2 +-
 .../extensions/front/caffe/simplernms_ext.py | 2 +-
 .../extensions/front/caffe/simplernms_ext_test.py | 2 +-
 .../extensions/front/caffe/softmax_ext.py | 2 +-
 .../front/caffe/spatial_transformer_ext.py | 2 +-
 .../front/caffe/spatial_transformer_ext_test.py | 2 +-
 .../extensions/front/caffe/split_to_identity.py | 7 +-
 .../extensions/front/create_tensor_nodes.py | 34 +
 .../{mo/ops => extensions/front}/div.py | 23 +-
 model-optimizer/extensions/front/div_test.py | 98 +
 model-optimizer/extensions/front/eltwise_n.py | 6 +-
 model-optimizer/extensions/front/eltwise_n_test.py | 2 +-
 .../extensions/front/freeze_placeholder_value.py | 29 +-
 .../front/freeze_placeholder_value_test.py | 12 +-
 model-optimizer/extensions/front/image_scaler.py | 40 +-
 .../extensions/front/image_scaler_test.py | 208 +-
 model-optimizer/extensions/front/input_cut.py | 33 +
 .../extensions/front/instance_normalization.py | 6 +-
 .../front/instance_normalization_test.py | 2 +-
 model-optimizer/extensions/front/kaldi/__init__.py | 0
 .../front/kaldi/add_permute_after_convolution.py | 111 +
 .../kaldi/add_permute_after_convolution_test.py | 75 +
 .../front/kaldi/add_reshape_around_convolution.py | 7 +-
 .../front/kaldi/add_reshape_around_pooling.py | 10 +-
 .../front/kaldi/eliminate_redundant_reshape.py | 6 +-
 .../front/kaldi/fuse_repeated_reshape.py | 7 +-
 .../front/kaldi/replace_lstm_node_pattern.py | 19 +-
 .../front/kaldi/replace_splice_node_pattern.py | 9 +-
 .../kaldi/replace_splice_node_pattern_test.py | 2 +-
 model-optimizer/extensions/front/mxnet/RNN_ext.py | 48 +-
 .../extensions/front/mxnet/RNN_ext_test.py | 99 +
 .../front/mxnet/add_input_data_to_prior_boxes.py | 62 +
 .../mxnet/add_input_data_to_prior_boxes_test.py} | 11 +-
 .../extensions/front/mxnet/add_n_ext.py | 2 +-
 .../extensions/front/mxnet/block_grad_ext.py | 2 +-
 .../extensions/front/mxnet/broadcast_mul.py | 9 +-
 .../extensions/front/mxnet/broadcast_mul_ext.py | 2 +-
 .../front/mxnet/check_softmax_node_inputs.py | 19 +-
 .../front/mxnet/check_softmax_node_inputs_test.py | 2 +-
 model-optimizer/extensions/front/mxnet/conv_ext.py | 2 +-
 .../extensions/front/mxnet/conv_ext_test.py | 2 +-
 model-optimizer/extensions/front/mxnet/copy_ext.py | 2 +-
 model-optimizer/extensions/front/mxnet/custom.py | 2 +-
 .../extensions/front/mxnet/custom_test.py | 2 +-
 .../extensions/front/mxnet/dropout_ext.py | 2 +-
 .../extensions/front/mxnet/element_wise_sum_ext.py | 2 +-
 model-optimizer/extensions/front/mxnet/exp_ext.py | 28 +
 .../extensions/front/mxnet/flatten_ext.py | 2 +-
 model-optimizer/extensions/front/mxnet/gather.py | 33 +
 .../front/mxnet/gather_ext.py} | 18 +-
 .../extensions/front/mxnet/gather_test.py | 64 +
 .../extensions/front/mxnet/instance_norm_ext.py | 2 +-
 model-optimizer/extensions/front/mxnet/max_ext.py | 6 +-
 .../extensions/front/mxnet/maximum_ext.py | 2 +-
 .../extensions/front/mxnet/minimum_ext.py | 2 +-
 .../extensions/front/mxnet/minus_scalar.py | 6 +-
 .../extensions/front/mxnet/minus_scalar_ext.py | 2 +-
 .../extensions/front/mxnet/mul_scalar.py | 8 +-
 .../extensions/front/mxnet/mul_scalar_ext.py | 2 +-
 model-optimizer/extensions/front/mxnet/pad_ext.py | 2 +-
 .../extensions/front/mxnet/pooling_ext.py | 2 +-
 .../extensions/front/mxnet/pooling_ext_test.py | 2 +-
 .../extensions/front/mxnet/proposal_ext.py | 2 +-
 .../extensions/front/mxnet/reshape_ext.py | 2 +-
 .../extensions/front/mxnet/rnn_param_concat.py | 2 +-
 .../extensions/front/mxnet/roi_pooling_ext.py | 2 +-
 .../extensions/front/mxnet/slice_channel_ext.py | 2 +-
 .../front/mxnet/slice_channel_ext_test.py | 2 +-
 model-optimizer/extensions/front/mxnet/softmax.py | 5 +-
 .../front/mxnet/softmax_activation_ext.py | 2 +-
 .../extensions/front/mxnet/softmax_ext.py | 2 +-
 .../extensions/front/mxnet/softmax_output_ext.py | 2 +-
 .../ssd_pattern_flatten_softmax_activation.py | 10 +-
 .../ssd_pattern_flatten_softmax_activation_test.py | 4 +-
 .../front/mxnet/ssd_pattern_remove_flatten.py | 10 +-
 .../front/mxnet/ssd_pattern_remove_flatten_test.py | 2 +-
 .../front/mxnet/ssd_pattern_remove_reshape.py | 10 +-
 .../front/mxnet/ssd_pattern_remove_reshape_test.py | 2 +-
 .../front/mxnet/ssd_pattern_remove_transpose.py | 10 +-
 .../mxnet/ssd_pattern_remove_transpose_test.py | 2 +-
 .../mxnet/ssd_reorder_detection_out_inputs.py | 12 +-
 .../mxnet/ssd_reorder_detection_out_inputs_test.py | 2 +-
 .../extensions/front/mxnet/stack_ext.py | 2 +-
 .../extensions/front/mxnet/swapaxes_ext.py | 2 +-
 .../extensions/front/mxnet/up_sampling_ext.py | 2 +-
 .../extensions/front/mxnet/zeros_ext.py | 6 +-
 model-optimizer/extensions/front/no_op_eraser.py | 10 +-
 model-optimizer/extensions/front/onnx/add_ext.py | 2 +-
 .../extensions/front/onnx/affine_ext.py | 2 +-
 .../extensions/front/onnx/affine_ext_test.py | 2 +-
 model-optimizer/extensions/front/onnx/argmax.py | 46 +
 .../extensions/front/onnx/argmax_ext.py | 42 +
 model-optimizer/extensions/front/onnx/cast_ext.py | 30 +
 model-optimizer/extensions/front/onnx/clip_ext.py | 33 +
 .../extensions/front/onnx/constant_fill_ext.py | 2 +-
 model-optimizer/extensions/front/onnx/conv_ext.py | 60 +-
 .../extensions/front/onnx/conv_ext_test.py | 16 +-
 model-optimizer/extensions/front/onnx/crop_ext.py | 2 +-
 .../extensions/front/onnx/crop_ext_test.py | 2 +-
 .../extensions/front/onnx/detection_output.py | 112 +
 .../extensions/front/onnx/detection_output_test.py | 102 +
 .../extensions/front/onnx/detectionoutput_ext.py | 42 +
 .../extensions/front/onnx/dropout_ext.py | 36 +
 model-optimizer/extensions/front/onnx/elu_ext.py | 2 +-
 .../extensions/front/onnx/elu_ext_test.py | 2 +-
 model-optimizer/extensions/front/onnx/exp_ext.py | 28 +
 .../extensions/front/onnx/flatten_ext.py | 2 +-
 .../extensions/front/onnx/flatten_ext_test.py | 2 +-
 .../extensions/front/onnx/gather_ext.py | 2 +-
 .../extensions/front/onnx/gather_ext_test.py | 2 +-
 model-optimizer/extensions/front/onnx/gru_ext.py | 59 +
 .../extensions/front/onnx/gru_ext_test.py | 79 +
 .../extensions/front/onnx/image_scaler_ext.py | 2 +-
 .../extensions/front/onnx/image_scaler_ext_test.py | 2 +-
 .../front/onnx/instance_normalization_ext.py | 2 +-
 .../front/onnx/instance_normalization_ext_test.py | 2 +-
 .../extensions/front/onnx/leaky_relu_ext.py | 2 +-
 model-optimizer/extensions/front/onnx/lrn_ext.py | 2 +-
 model-optimizer/extensions/front/onnx/lstm_ext.py | 42 +-
 .../extensions/front/onnx/lstm_ext_test.py | 77 +
 .../extensions/front/onnx/matmul_ext.py | 2 +-
 model-optimizer/extensions/front/onnx/mul_ext.py | 2 +-
 model-optimizer/extensions/front/onnx/neg_ext.py | 2 +-
 model-optimizer/extensions/front/onnx/pad_ext.py | 2 +-
 .../extensions/front/onnx/pad_ext_test.py | 2 +-
 .../extensions/front/onnx/pooling_ext.py | 2 +-
 model-optimizer/extensions/front/onnx/pow_ext.py | 2 +-
 .../extensions/front/onnx/priorbox_ext.py | 51 +
 .../extensions/front/onnx/priorbox_ext_test.py | 89 +
 .../front/onnx/priorgridgenerator_ext.py | 35 +
 .../extensions/front/onnx/proposal_ext.py | 34 +
 .../extensions/front/onnx/quantize_ext.py | 30 +
 .../extensions/front/onnx/reduce_mean_ext.py | 2 +-
 .../extensions/front/onnx/reduce_sum_ext.py | 2 +-
 model-optimizer/extensions/front/onnx/rnn_ext.py | 57 +
 .../extensions/front/onnx/rnn_ext_test.py | 77 +
 .../front/onnx/roifeatureextractor_ext.py | 42 +
 model-optimizer/extensions/front/onnx/scale_ext.py | 35 +
 .../extensions/front/onnx/sigmoid_ext.py | 2 +-
 .../extensions/front/onnx/sigmoid_ext_test.py | 2 +-
 model-optimizer/extensions/front/onnx/slice_ext.py | 2 +-
 .../extensions/front/onnx/slice_ext_test.py | 2 +-
 .../extensions/front/onnx/softmax_ext.py | 2 +-
 model-optimizer/extensions/front/onnx/split_ext.py | 2 +-
 .../extensions/front/onnx/squeeze_ext.py | 2 +-
 .../extensions/front/onnx/squeeze_ext_test.py | 2 +-
 model-optimizer/extensions/front/onnx/tanh_ext.py | 2 +-
 .../extensions/front/onnx/tanh_ext_test.py | 2 +-
 .../extensions/front/onnx/topkrois_ext.py | 30 +
 .../extensions/front/onnx/transpose_ext.py | 2 +-
 .../extensions/front/onnx/transpose_ext_test.py | 2 +-
 .../extensions/front/onnx/unsqueeze_ext.py | 2 +-
 .../extensions/front/onnx/unsqueeze_ext_test.py | 2 +-
 .../extensions/front/onnx/upsample_ext.py | 2 +-
 .../extensions/front/onnx/upsample_ext_test.py | 2 +-
 model-optimizer/extensions/front/output_cut.py | 32 +
 model-optimizer/extensions/front/override_batch.py | 25 +
 model-optimizer/extensions/front/pass_separator.py | 43 +
 model-optimizer/extensions/front/reciprocal.py | 6 +-
 .../extensions/front/reciprocal_test.py | 2 +-
 model-optimizer/extensions/front/restore_ports.py | 42 +
 .../extensions/front/squared_difference.py | 6 +-
 .../extensions/front/standalone_const_eraser.py | 10 +-
 model-optimizer/extensions/front/sub.py | 6 +-
 .../extensions/front/tf/ArgMaxReshape.py | 12 +-
 model-optimizer/extensions/front/tf/BlockLSTM.py | 23 +-
 .../extensions/front/tf/BlockLSTM_ext.py | 2 +-
 .../extensions/front/tf/CTCGreedyDecoder.py | 14 +-
 .../extensions/front/tf/CTCGreedyDecoder_ext.py | 2 +-
 model-optimizer/extensions/front/tf/Cast_ext.py | 30 +
 model-optimizer/extensions/front/tf/ConvFlatten.py | 15 +-
 .../front/tf/CropAndResizeReplacement.py | 26 +-
 .../extensions/front/tf/FlattenToReshape.py | 91 +
 .../extensions/front/tf/ObjectDetectionAPI.py | 399 +-
 .../extensions/front/tf/ObjectDetectionAPI_test.py | 7 +-
 .../tf/RetinaNetFilteredDetectionsReplacement.py | 17 +-
 .../front/tf/SSDToolboxDetectionOutput.py | 21 +-
 .../extensions/front/tf/TensorArrayExtractors.py | 2 +-
 .../extensions/front/tf/TensorArrayGatherV3.py | 2 +-
 model-optimizer/extensions/front/tf/Unpack.py | 12 +-
 model-optimizer/extensions/front/tf/YOLO.py | 14 +-
 model-optimizer/extensions/front/tf/ZerosLike.py | 38 +
 model-optimizer/extensions/front/tf/addn_ext.py | 2 +-
 model-optimizer/extensions/front/tf/argmax_ext.py | 2 +-
 .../extensions/front/tf/assign_elimination.py | 24 +-
 .../extensions/front/tf/basic_lstm_cell.py | 67 +-
 model-optimizer/extensions/front/tf/concat.py | 7 +-
 model-optimizer/extensions/front/tf/concat_ext.py | 2 +-
 .../extensions/front/tf/concat_ext_test.py | 2 +-
 model-optimizer/extensions/front/tf/concat_test.py | 2 +-
 model-optimizer/extensions/front/tf/conv_ext.py | 2 +-
 .../extensions/front/tf/conv_ext_test.py | 2 +-
 .../extensions/front/tf/crop_and_resize_ext.py | 2 +-
 model-optimizer/extensions/front/tf/deconv_ext.py | 2 +-
 .../extensions/front/tf/deconv_ext_test.py | 2 +-
 .../extensions/front/tf/depth_to_space.py | 2 +-
 model-optimizer/extensions/front/tf/exp_ext.py | 28 +
 .../extensions/front/tf/extract_image_patches.py | 2 +-
 model-optimizer/extensions/front/tf/fake_const.py | 11 +-
 .../extensions/front/tf/faster_rcnn_support.json | 14 +-
 .../front/tf/faster_rcnn_support_api_v1.10.json | 113 +
 .../front/tf/faster_rcnn_support_api_v1.7.json | 12 +
 .../extensions/front/tf/fifo_queue_v2_ext.py | 4 +-
 .../extensions/front/tf/fifo_replacer.py | 31 +-
 .../extensions/front/tf/fifo_replacer_test.py | 2 +-
 model-optimizer/extensions/front/tf/gather_ext.py | 2 +-
 .../extensions/front/tf/mask_rcnn_support.json | 14 +-
 .../front/tf/mask_rcnn_support_api_v1.11.json | 12 +
 .../front/tf/mask_rcnn_support_api_v1.7.json | 12 +
 model-optimizer/extensions/front/tf/max_ext.py | 2 +-
 model-optimizer/extensions/front/tf/mvn.py | 9 +-
 .../extensions/front/tf/mvn_unrolled.py | 12 +-
 .../extensions/front/tf/mvn_unrolled_test.py | 2 +-
 .../front/tf/nearest_neighbor_upsampling.py | 10 +-
 .../extensions/front/tf/next_iteration_ext.py | 2 +-
 .../extensions/front/tf/next_iteration_ext_test.py | 2 +-
 model-optimizer/extensions/front/tf/pad_ext.py | 2 +-
 .../extensions/front/tf/pad_ext_test.py | 2 +-
 model-optimizer/extensions/front/tf/pooling_ext.py | 2 +-
 .../extensions/front/tf/pooling_ext_test.py | 2 +-
 model-optimizer/extensions/front/tf/prelu.py | 14 +-
 model-optimizer/extensions/front/tf/rank_ext.py | 2 +-
 .../extensions/front/tf/resize_bilinear.py | 2 +-
 .../extensions/front/tf/resize_nearest_neighbor.py | 2 +-
 .../extensions/front/tf/reverse_sequence.py | 11 +-
 model-optimizer/extensions/front/tf/reverse_v2.py | 7 +-
 .../extensions/front/tf/rfcn_support.json | 8 +-
 .../front/tf/rfcn_support_api_v1.10.json | 145 +
 model-optimizer/extensions/front/tf/shape_ext.py | 31 +
 model-optimizer/extensions/front/tf/slice_ext.py | 2 +-
 model-optimizer/extensions/front/tf/softmax_ext.py | 2 +-
 model-optimizer/extensions/front/tf/split_ext.py | 2 +-
 model-optimizer/extensions/front/tf/sqrt_ext.py | 2 +-
 model-optimizer/extensions/front/tf/square_ext.py | 2 +-
 .../extensions/front/tf/stop_gradient_ext.py | 2 +-
 .../extensions/front/tf/stop_gradient_ext_test.py | 2 +-
 model-optimizer/extensions/front/tf/sum_ext.py | 28 +
 .../tensorflow_custom_operations_config_update.py | 61 +
 .../extensions/front/tf/tensorflow_patterns.py | 51 +
 .../tf/tensorflow_use_custom_operations_config.py | 44 +
 model-optimizer/extensions/front/tf/tile_ext.py | 2 +-
 .../extensions/front/tf/variable_ext.py | 2 +-
 .../front/tf/variables_values_freezing.py | 36 +
 .../extensions/front/tf/yolo_v3_tiny.json | 14 +
 .../extensions/front/user_data_repack.py | 42 +
 .../extensions/middle/AddIsCyclicAttribute.py | 17 +-
 .../extensions/middle/AddIsCyclicAttribute_test.py | 2 +-
 .../extensions/middle/AddMeanScaleValues.py | 122 +
 .../extensions/middle/AddMeanScaleValues_test.py | 252 +
 .../extensions/middle/AddQuantizeFuse.py | 80 +
 .../middle/AddReshapeAfterStridedSlice.py | 124 -
 .../middle/AddReshapeAfterStridedSlice_test.py | 312 -
 .../extensions/middle/BinarizeWeightsM1P1.py | 154 +
 .../extensions/middle/BlockLSTMtoLSTMSequence.py | 67 +-
 model-optimizer/extensions/middle/Cast.py | 41 +
 .../extensions/middle/ChangePlaceholderTypes.py | 94 +
 model-optimizer/extensions/middle/CheckForCycle.py | 39 +
 .../extensions/middle/CheckForCycle_test.py | 77 +
 .../extensions/middle/ConcatOptimization.py | 93 +
 .../extensions/middle/ConstSwitchResolver.py | 12 +-
 .../extensions/middle/ConvToBinaryConv.py | 129 +
 .../middle/ConvertGroupedStridedSlice.py | 147 +-
 .../middle/ConvertGroupedStridedSlice_test.py | 429 +-
 .../middle/ConvertLayoutDependentOperations.py | 11 +-
 .../extensions/middle/ConvertMultiInputConv.py | 75 +
 .../extensions/middle/CustomSubgraphCall.py | 322 +
 .../middle/DecomposeBidirectionalRNNSequence.py | 213 +
 .../extensions/middle/DeleteControlFlowEdges.py | 37 +
 .../extensions/middle/DeleteNotExecutable.py | 42 +
 model-optimizer/extensions/middle/DepthToSpace.py | 38 +-
 .../extensions/middle/DilatedConvolution.py | 89 +
 .../extensions/middle/EltwiseChecker.py | 21 +-
 .../extensions/middle/EltwiseInputNormalization.py | 11 +-
 .../middle/EltwiseInputNormalization_test.py | 2 +-
 .../extensions/middle/EltwiseInputReshape.py | 19 +-
 .../extensions/middle/EltwiseInputReshape_test.py | 2 +-
 .../extensions/middle/FusePermutesSequence.py | 9 +-
 .../extensions/middle/FusePermutesSequence_test.py | 40 +-
 .../extensions/middle/FusedBatchNormNonConstant.py | 16 +-
 .../middle/FusedBatchNormTrainingCatch.py | 15 +-
 .../middle/GRURNNSequenceToTensorIterator.py | 223 +
 .../extensions/middle/GatherNdNormalizer.py | 100 +
 model-optimizer/extensions/middle/GemmResolver.py | 28 +-
 .../extensions/middle/GemmToFullyConnected.py | 88 +
 model-optimizer/extensions/middle/InputCut.py | 34 +
 model-optimizer/extensions/middle/L2NormToNorm.py | 107 +
 ...rator.py => LSTMRNNSequenceToTensorIterator.py} | 67 +-
 .../middle/LayoutChangeForConstantShapePaths.py | 113 +
 .../extensions/middle/MXNetRNNSequenceNormalize.py | 229 +
 .../extensions/middle/MXNetSplitMultiLayers.py | 206 +
 model-optimizer/extensions/middle/MeanToAvgPool.py | 95 +
 .../middle/MeanToAvgPool_test.py} | 24 +-
 .../extensions/middle/MinimumMiddleReplacer.py | 15 +-
 .../middle/MinumumMiddleReplacer_test.py | 2 +-
 .../extensions/middle/MulQuantizeFuse.py | 90 +
 model-optimizer/extensions/middle/NasNet.py | 146 +
 .../extensions/middle/NormalizeFullyConnected.py | 17 +-
 .../middle/NormalizeFullyConnected_test.py | 2 +-
 model-optimizer/extensions/middle/NormalizePad.py | 14 +-
 .../extensions/middle/ONNXRNNSequenceNormalize.py | 234 +
 model-optimizer/extensions/middle/PartialInfer.py | 31 +
 .../extensions/middle/PixelLinkReshape.py | 128 +-
 .../extensions/middle/PixelLinkReshape_test.py | 26 +-
 .../extensions/middle/RNNSequenceNormalizeToIE.py | 215 +
 model-optimizer/extensions/middle/Reduce.py | 14 +-
 model-optimizer/extensions/middle/Reduce_test.py | 2 +-
 .../extensions/middle/ReluQuantizeFuse.py | 90 +
 .../extensions/middle/RemoveIdentity.py | 83 +
 .../RemoveRedundantReshapeAfterCropAndResize.py | 68 +
 .../middle/ReverseV2ToReverseSequence.py | 62 +
 model-optimizer/extensions/middle/ScaleInput.py | 71 +
 .../extensions/middle/ScaleInput_test.py | 91 +
 .../extensions/middle/SharedWeightsDuplication.py | 54 +
 .../middle/SharedWeightsDuplication_test.py} | 34 +-
 .../extensions/middle/ShuffleChannel.py | 13 +-
 .../extensions/middle/ShuffleChannel_test.py | 2 +-
 .../extensions/middle/ShufflenetReshape.py | 18 +-
 .../extensions/middle/ShufflenetReshape_test.py | 2 +-
 .../extensions/middle/SliceConvert_test.py | 34 +-
 .../extensions/middle/SliceConverter.py | 56 +-
 .../extensions/middle/SwapAxesMiddleReplacer.py | 12 +-
 .../extensions/middle/TF_lstm_cell_to_generic.py | 15 +-
 .../extensions/middle/TensorIteratorBackEdge.py | 16 +-
 .../middle/TensorIteratorBackEdge_test.py | 2 +-
 .../extensions/middle/TensorIteratorCondition.py | 82 +-
 .../middle/TensorIteratorConditionChecker.py | 27 +-
 .../middle/TensorIteratorCondition_test.py | 18 +-
 .../extensions/middle/TensorIteratorInput.py | 73 +-
 .../extensions/middle/TensorIteratorInput_test.py | 2 +-
 ...ence.py => TensorIteratorLSTMToLSTMSequence.py} | 58 +-
 .../extensions/middle/TensorIteratorMerge.py | 77 +-
 .../extensions/middle/TensorIteratorOutput.py | 145 +-
 .../extensions/middle/TensorIteratorOutput_test.py | 2 +-
 .../extensions/middle/TensorIterator_utils.py | 13 +-
 model-optimizer/extensions/middle/UselessMerge.py | 11 +-
 .../extensions/middle/UselessSplitEraser.py | 46 +
 .../extensions/middle/UselessSridedSlice_test.py | 4 +-
 .../extensions/middle/UselessStridedSlice.py | 9 +-
 .../extensions/middle/decompose_bi_lstm.py | 188 -
 .../extensions/middle/lstm_sequence_normalize.py | 281 -
 .../middle/lstm_sequence_normalize_test.py | 55 -
 .../middle/mxnet_lstm_sequence_normalize.py | 168 -
 .../extensions/middle/pass_separator.py | 58 +
 .../extensions/middle/permute_tensor_iterator.py | 57 +-
 .../extensions/middle/reverse_tensor_iterator.py | 34 +-
 model-optimizer/extensions/ops/BlockLSTM.py | 9 +-
 model-optimizer/extensions/ops/Cast.py | 40 +
 model-optimizer/extensions/ops/DetectionOutput.py | 13 +-
 model-optimizer/extensions/ops/Enter.py | 7 +-
 model-optimizer/extensions/ops/Exit.py | 9 +-
 model-optimizer/extensions/ops/GRU.py | 81 +
 model-optimizer/extensions/ops/GRUCell.py | 83 +
 model-optimizer/extensions/ops/GatherNd.py | 47 +
 model-optimizer/extensions/ops/LSTM.py | 82 +
 model-optimizer/extensions/ops/NextIteration.py | 8 +-
 model-optimizer/extensions/ops/RNN.py | 154 +
 model-optimizer/extensions/ops/RNNCell.py | 81 +
 model-optimizer/extensions/ops/Reverse.py | 47 +
 .../extensions/ops/SquaredDifference.py | 7 +-
 model-optimizer/extensions/ops/TensorArray.py | 7 +-
 .../extensions/ops/TensorArrayGather.py | 6 +-
 model-optimizer/extensions/ops/TensorArrayRead.py | 6 +-
 .../extensions/ops/TensorArrayScatter.py | 7 +-
 model-optimizer/extensions/ops/TensorArraySize.py | 7 +-
 model-optimizer/extensions/ops/TensorArrayWrite.py | 7 +-
 .../extensions/ops/TensorIterator_ops.py | 30 +-
 model-optimizer/extensions/ops/accum.py | 8 +-
 model-optimizer/extensions/ops/accum_test.py | 34 +-
 model-optimizer/extensions/ops/argmax.py | 11 +-
 model-optimizer/extensions/ops/argmax_test.py | 41 +-
 model-optimizer/extensions/ops/assert_op.py | 8 +-
 model-optimizer/extensions/ops/assert_test.py | 2 +-
 model-optimizer/extensions/ops/axpy.py | 7 +-
 model-optimizer/extensions/ops/binarization.py | 32 +
 model-optimizer/extensions/ops/bn.py | 9 +-
 model-optimizer/extensions/ops/constant_fill.py | 18 +-
 model-optimizer/extensions/ops/correlation.py | 9 +-
 model-optimizer/extensions/ops/correlation_test.py | 11 +-
 .../extensions/ops/ctc_greedy_decoder.py | 9 +-
 .../extensions/ops/ctc_greedy_decoder_test.py | 11 +-
 .../extensions/ops/data_augmentation.py | 8 +-
 .../extensions/ops/data_augmentation_test.py | 11 +-
 model-optimizer/extensions/ops/depth_to_space.py | 12 +-
 .../extensions/ops/depth_to_space_test.py | 2 +-
 .../extensions/ops/detectionoutput_onnx.py | 59 +
 model-optimizer/extensions/ops/exp.py | 47 +
 model-optimizer/extensions/ops/exp_test.py | 76 +
 model-optimizer/extensions/ops/gather.py | 14 +-
 model-optimizer/extensions/ops/gather_test.py | 2 +-
 model-optimizer/extensions/ops/grn.py | 7 +-
 model-optimizer/extensions/ops/grn_test.py | 12 +-
 model-optimizer/extensions/ops/identity.py | 17 +-
 .../extensions/ops/instance_normalization.py | 5 +-
 .../extensions/ops/instance_normalization_test.py | 6 +-
 model-optimizer/extensions/ops/interp.py | 10 +-
 model-optimizer/extensions/ops/interp_test.py | 47 +-
 model-optimizer/extensions/ops/lstm_cell.py | 23 +-
 model-optimizer/extensions/ops/lstm_sequence.py | 42 +-
 model-optimizer/extensions/ops/merge.py | 10 +-
 model-optimizer/extensions/ops/merge_test.py | 2 +-
 model-optimizer/extensions/ops/mvn.py | 7 +-
 model-optimizer/extensions/ops/normalize.py | 9 +-
 model-optimizer/extensions/ops/normalize_test.py | 11 +-
 model-optimizer/extensions/ops/pack.py | 9 +-
 model-optimizer/extensions/ops/power_file.py | 7 +-
 .../extensions/ops/prediction_heatmap.py | 8 +-
 model-optimizer/extensions/ops/prelu.py | 7 +-
 model-optimizer/extensions/ops/priorbox.py | 9 +-
 .../extensions/ops/priorbox_clustered.py | 8 +-
 .../extensions/ops/priorbox_clustered_test.py | 17 +-
 model-optimizer/extensions/ops/priorbox_test.py | 39 +-
 .../extensions/ops/priorgridgenerator_onnx.py | 52 +
 model-optimizer/extensions/ops/proposal.py | 13 +-
 model-optimizer/extensions/ops/proposal_onnx.py | 45 +
 .../extensions/ops/proposal_python_example.py | 5 +-
 model-optimizer/extensions/ops/proposal_test.py | 11 +-
 model-optimizer/extensions/ops/psroipooling.py | 14 +-
 .../extensions/ops/psroipooling_test.py | 23 +-
 model-optimizer/extensions/ops/quantize.py | 98 +
 model-optimizer/extensions/ops/quantize_test.py | 135 +
 model-optimizer/extensions/ops/range.py | 71 +
 model-optimizer/extensions/ops/rank.py | 11 +-
 model-optimizer/extensions/ops/regionyolo.py | 9 +-
 model-optimizer/extensions/ops/regionyolo_test.py | 41 +-
 model-optimizer/extensions/ops/reorgyolo.py | 12 +-
 model-optimizer/extensions/ops/reorgyolo_test.py | 11 +-
 model-optimizer/extensions/ops/resample.py | 10 +-
 model-optimizer/extensions/ops/resample_test.py | 23 +-
 .../extensions/ops/resize_factor_utils.py | 2 +-
 model-optimizer/extensions/ops/reverse_sequence.py | 38 +-
 .../extensions/ops/roifeatureextractor_onnx.py | 53 +
 model-optimizer/extensions/ops/select.py | 8 +-
 model-optimizer/extensions/ops/select_test.py | 2 +-
 model-optimizer/extensions/ops/shufflechannel.py | 7 +-
 model-optimizer/extensions/ops/simplernms.py | 8 +-
 model-optimizer/extensions/ops/simplernms_test.py | 17 +-
 .../extensions/ops/spatial_transformer.py | 10 +-
 .../extensions/ops/spatial_transformer_test.py | 17 +-
 model-optimizer/extensions/ops/splice.py | 8 +-
 model-optimizer/extensions/ops/splitv.py | 6 +-
 model-optimizer/extensions/ops/stop_gradient.py | 9 +-
 model-optimizer/extensions/ops/swapaxes.py | 8 +-
 model-optimizer/extensions/ops/switch.py | 7 +-
 model-optimizer/extensions/ops/switch_test.py | 2 +-
 model-optimizer/extensions/ops/tensor_iterator.py | 16 +-
 model-optimizer/extensions/ops/topkrois_onnx.py | 38 +
 .../install_prerequisites/install_prerequisites.sh | 8 +-
 .../install_prerequisites_caffe.sh | 2 +-
 .../install_prerequisites_kaldi.sh | 2 +-
 .../install_prerequisites_mxnet.sh | 2 +-
 .../install_prerequisites_onnx.sh | 2 +-
 .../install_prerequisites_tf.sh | 2 +-
 model-optimizer/mo.py | 2 +-
 model-optimizer/mo/back/ie_ir_ver_2/emitter.py | 118 +-
 .../mo/back/ie_ir_ver_2/emitter_test.py | 2 +-
 model-optimizer/mo/back/replacement.py | 2 +-
 .../mo/front/caffe/collect_attributes.py | 2 +-
 .../mo/front/caffe/custom_layers_mapping.py | 4 +-
 .../mo/front/caffe/custom_layers_mapping_test.py | 2 +-
 model-optimizer/mo/front/caffe/extractor.py | 2 +-
 model-optimizer/mo/front/caffe/extractor_test.py | 2 +-
 .../mo/front/caffe/extractors/batchnorm.py | 2 +-
 .../mo/front/caffe/extractors/batchnorm_test.py | 2 +-
 .../mo/front/caffe/extractors/concat.py | 2 +-
 .../mo/front/caffe/extractors/concat_test.py | 2 +-
 model-optimizer/mo/front/caffe/extractors/crop.py | 2 +-
 .../mo/front/caffe/extractors/crop_test.py | 2 +-
 .../mo/front/caffe/extractors/eltwise.py | 2 +-
 .../mo/front/caffe/extractors/eltwise_test.py | 2 +-
 model-optimizer/mo/front/caffe/extractors/elu.py | 2 +-
 .../mo/front/caffe/extractors/elu_test.py | 2 +-
 .../mo/front/caffe/extractors/inner_product.py | 2 +-
 .../front/caffe/extractors/inner_product_test.py | 2 +-
 model-optimizer/mo/front/caffe/extractors/input.py | 2 +-
 .../mo/front/caffe/extractors/input_test.py | 2 +-
 model-optimizer/mo/front/caffe/extractors/lrn.py | 2 +-
 .../mo/front/caffe/extractors/lrn_test.py | 2 +-
 .../mo/front/caffe/extractors/native_caffe.py | 2 +-
 .../mo/front/caffe/extractors/permute.py | 2 +-
 .../mo/front/caffe/extractors/permute_test.py | 2 +-
 model-optimizer/mo/front/caffe/extractors/power.py | 2 +-
 .../mo/front/caffe/extractors/power_test.py | 2 +-
 model-optimizer/mo/front/caffe/extractors/relu.py | 2 +-
 model-optimizer/mo/front/caffe/extractors/relu6.py | 2 +-
 .../mo/front/caffe/extractors/relu_test.py | 2 +-
 .../mo/front/caffe/extractors/reshape.py | 2 +-
 .../mo/front/caffe/extractors/reshape_test.py | 2 +-
 .../mo/front/caffe/extractors/roipooling.py | 2 +-
 model-optimizer/mo/front/caffe/extractors/scale.py | 2 +-
 .../mo/front/caffe/extractors/scale_test.py | 2 +-
 .../mo/front/caffe/extractors/sigmoid.py | 2 +-
 model-optimizer/mo/front/caffe/extractors/slice.py | 2 +-
 .../mo/front/caffe/extractors/slice_test.py | 2 +-
 model-optimizer/mo/front/caffe/extractors/tanh.py | 2 +-
 model-optimizer/mo/front/caffe/extractors/tile.py | 2 +-
 model-optimizer/mo/front/caffe/extractors/utils.py | 2 +-
 .../mo/front/caffe/extractors/utils_test.py | 2 +-
 model-optimizer/mo/front/caffe/loader.py | 33 +-
 model-optimizer/mo/front/caffe/loader_test.py | 6 +-
 model-optimizer/mo/front/caffe/proto/caffe_pb2.py | 295 +-
 .../mo/front/caffe/proto/mo_caffe.proto | 2 +
 .../mo/front/caffe/python_layer_extractor.py | 2 +-
 .../mo/front/caffe/python_layer_extractor_test.py | 2 +-
 .../mo/front/caffe/register_custom_ops.py | 14 +-
 .../mo/front/common/custom_replacement_registry.py | 2 +-
 .../mo/front/common/extractors/utils.py | 2 +-
 .../mo/front/common/find_unsupported_ops.py | 61 +-
 model-optimizer/mo/front/common/layout.py | 2 +-
 model-optimizer/mo/front/common/layout_test.py | 2 +-
 .../mo/front/common/partial_infer/batch_norm.py | 2 +-
 .../front/common/partial_infer/caffe_fallback.py | 13 +-
 .../common/partial_infer/caffe_fallback_test.py | 15 +-
 .../mo/front/common/partial_infer/concat.py | 6 +-
 .../mo/front/common/partial_infer/concat_test.py | 29 +-
 .../mo/front/common/partial_infer/const.py | 2 +-
 .../mo/front/common/partial_infer/crop.py | 2 +-
 .../mo/front/common/partial_infer/crop_test.py | 35 +-
 .../mo/front/common/partial_infer/elemental.py | 15 +-
 .../front/common/partial_infer/elemental_test.py | 2 +-
 .../mo/front/common/partial_infer/eltwise.py | 6 +-
 .../mo/front/common/partial_infer/eltwise_test.py | 35 +-
 .../mo/front/common/partial_infer/expand_dims.py | 8 +-
 .../front/common/partial_infer/expand_dims_test.py | 36 +-
 .../mo/front/common/partial_infer/inner_product.py | 2 +-
 .../common/partial_infer/inner_product_test.py | 17 +-
 .../mo/front/common/partial_infer/matmul.py | 2 +-
 .../common/partial_infer/multi_box_detection.py | 2 +-
 .../partial_infer/multi_box_detection_test.py | 2 +-
 .../front/common/partial_infer/multi_box_prior.py | 2 +-
 .../common/partial_infer/multi_box_prior_test.py | 2 +-
 .../front/common/partial_infer/random_uniform.py | 2 +-
 .../mo/front/common/partial_infer/range.py | 2 +-
 .../mo/front/common/partial_infer/range_test.py | 2 +-
 .../mo/front/common/partial_infer/reduce.py | 4 +-
 .../mo/front/common/partial_infer/reshape.py | 10 +-
 .../mo/front/common/partial_infer/roipooling.py | 2 +-
 .../front/common/partial_infer/roipooling_test.py | 23 +-
 .../mo/front/common/partial_infer/slice.py | 113 +-
 .../mo/front/common/partial_infer/slice_test.py | 126 +-
 .../front/common/partial_infer/space_to_batch.py | 2 +-
 .../mo/front/common/partial_infer/split.py | 2 +-
 .../mo/front/common/partial_infer/split_test.py | 2 +-
 .../mo/front/common/partial_infer/squeeze.py | 2 +-
 .../mo/front/common/partial_infer/transpose.py | 2 +-
 .../mo/front/common/partial_infer/utils.py | 23 +-
 .../mo/front/common/register_custom_ops.py | 2 +-
 model-optimizer/mo/front/common/replacement.py | 52 +-
 model-optimizer/mo/front/common/weights.py | 2 +-
 model-optimizer/mo/front/extractor.py | 104 +-
 model-optimizer/mo/front/extractor_test.py | 29 +-
 model-optimizer/mo/front/kaldi/extractor.py | 2 +-
 .../mo/front/kaldi/extractors/add_shift_ext.py | 2 +-
 .../front/kaldi/extractors/add_shift_ext_test.py | 2 +-
 .../front/kaldi/extractors/affine_component_ext.py | 2 +-
 .../kaldi/extractors/affine_component_ext_test.py | 2 +-
 .../affine_component_preconditioned_online_ext.py | 2 +-
 .../front/kaldi/extractors/affine_transform_ext.py | 2 +-
 .../kaldi/extractors/affine_transform_ext_test.py | 2 +-
 .../mo/front/kaldi/extractors/common_ext_test.py | 6 +-
 .../mo/front/kaldi/extractors/concat_ext.py | 2 +-
 .../mo/front/kaldi/extractors/concat_ext_test.py | 2 +-
 .../extractors/convolutional_1d_component_ext.py | 2 +-
 .../extractors/convolutional_component_ext.py | 2 +-
 .../extractors/convolutional_component_ext_test.py | 2 +-
 .../mo/front/kaldi/extractors/copy_ext.py | 2 +-
 .../kaldi/extractors/fixed_affine_component_ext.py | 2 +-
 .../extractors/fixed_affine_component_ext_test.py | 2 +-
 .../kaldi/extractors/lstm_projected_streams_ext.py | 2 +-
 .../mo/front/kaldi/extractors/max_pooling_ext.py | 2 +-
 .../front/kaldi/extractors/max_pooling_ext_test.py | 2 +-
 .../kaldi/extractors/normalize_component_ext.py | 6 +-
 .../extractors/rectified_linear_component_ext.py | 2 +-
 .../mo/front/kaldi/extractors/rescale_ext.py | 2 +-
 .../mo/front/kaldi/extractors/rescale_ext_test.py | 2 +-
 .../mo/front/kaldi/extractors/sigmoid_ext.py | 2 +-
 .../mo/front/kaldi/extractors/sigmoid_ext_test.py | 2 +-
 .../mo/front/kaldi/extractors/slice_ext.py | 2 +-
 .../mo/front/kaldi/extractors/slice_ext_test.py | 2 +-
 .../mo/front/kaldi/extractors/softmax_ext.py | 2 +-
 .../front/kaldi/extractors/splice_component_ext.py | 2 +-
 .../front/kaldi/extractors/tanh_component_ext.py | 2 +-
 .../mo/front/kaldi/extractors/tanh_ext_test.py | 2 +-
 model-optimizer/mo/front/kaldi/loader/loader.py | 26 +-
 model-optimizer/mo/front/kaldi/loader/utils.py | 2 +-
 .../mo/front/kaldi/loader/utils_test.py | 2 +-
 .../mo/front/kaldi/register_custom_ops.py | 14 +-
 model-optimizer/mo/front/kaldi/utils.py | 2 +-
 model-optimizer/mo/front/mxnet/extractor.py | 2 +-
 .../mo/front/mxnet/extractors/activation.py | 2 +-
 .../mo/front/mxnet/extractors/activation_test.py | 2 +-
 model-optimizer/mo/front/mxnet/extractors/add_n.py | 2 +-
 .../mo/front/mxnet/extractors/batchnorm.py | 2 +-
 .../mo/front/mxnet/extractors/concat.py | 2 +-
 model-optimizer/mo/front/mxnet/extractors/crop.py | 2 +-
 .../mo/front/mxnet/extractors/crop_test.py | 2 +-
 .../mo/front/mxnet/extractors/eltwise.py | 2 +-
 .../mo/front/mxnet/extractors/eltwise_test.py | 2 +-
 .../mo/front/mxnet/extractors/fully_connected.py | 2 +-
 .../mo/front/mxnet/extractors/l2_normalization.py | 2 +-
 .../mo/front/mxnet/extractors/leaky_relu.py | 2 +-
 .../mo/front/mxnet/extractors/leaky_relu_test.py | 2 +-
 model-optimizer/mo/front/mxnet/extractors/lrn.py | 2 +-
 .../front/mxnet/extractors/multibox_detection.py | 2 +-
 .../mxnet/extractors/multibox_detection_test.py | 2 +-
 .../mo/front/mxnet/extractors/multibox_prior.py | 2 +-
 .../front/mxnet/extractors/multibox_prior_test.py | 2 +-
 model-optimizer/mo/front/mxnet/extractors/null.py | 2 +-
 model-optimizer/mo/front/mxnet/extractors/relu.py | 2 +-
 .../mo/front/mxnet/extractors/relu_test.py | 2 +-
 .../mo/front/mxnet/extractors/scaleshift.py | 2 +-
 .../mo/front/mxnet/extractors/sigmoid.py | 2 +-
 .../mo/front/mxnet/extractors/sigmoid_test.py | 2 +-
 .../mo/front/mxnet/extractors/slice_axis.py | 2 +-
 .../mo/front/mxnet/extractors/slice_axis_test.py | 2 +-
 .../mo/front/mxnet/extractors/transpose.py | 2 +-
 model-optimizer/mo/front/mxnet/extractors/utils.py | 17 +-
 .../mo/front/mxnet/extractors/utils_test.py | 11 +-
 model-optimizer/mo/front/mxnet/loader.py | 19 +-
 model-optimizer/mo/front/mxnet/loader_test.py | 2 +-
 model-optimizer/mo/front/mxnet/nd_to_params.py | 2 +-
 .../mo/front/mxnet/register_custom_ops.py | 14 +-
 model-optimizer/mo/front/onnx/extractor.py | 6 +-
 model-optimizer/mo/front/onnx/extractors/concat.py | 2 +-
 model-optimizer/mo/front/onnx/extractors/const.py | 2 +-
 .../mo/front/onnx/extractors/constant.py | 2 +-
 .../mo/front/onnx/extractors/constant_test.py | 2 +-
 .../mo/front/onnx/extractors/dropout.py | 32 -
 .../mo/front/onnx/extractors/eltwise.py | 2 +-
 .../mo/front/onnx/extractors/fused_bn.py | 2 +-
 model-optimizer/mo/front/onnx/extractors/matmul.py | 2 +-
 .../mo/front/onnx/extractors/placeholder.py | 2 +-
 .../mo/front/onnx/extractors/reshape.py | 2 +-
 model-optimizer/mo/front/onnx/extractors/utils.py | 26 +-
 model-optimizer/mo/front/onnx/loader.py | 8 +-
 .../mo/front/onnx/register_custom_ops.py | 12 +-
 model-optimizer/mo/front/subgraph_matcher.py | 14 +-
 .../mo/front/tf/change_placeholder_type.py | 80 -
 model-optimizer/mo/front/tf/common.py | 2 +-
 .../mo/front/tf/custom_subgraph_call.py | 311 +-
 model-optimizer/mo/front/tf/extractor.py | 9 +-
 model-optimizer/mo/front/tf/extractors/bias_add.py | 2 +-
 model-optimizer/mo/front/tf/extractors/concat.py | 2 +-
 .../mo/front/tf/extractors/concat_test.py | 2 +-
 model-optimizer/mo/front/tf/extractors/const.py | 2 +-
 .../mo/front/tf/extractors/const_test.py | 2 +-
 model-optimizer/mo/front/tf/extractors/eltwise.py | 2 +-
 .../mo/front/tf/extractors/eltwise_test.py | 2 +-
 model-optimizer/mo/front/tf/extractors/elu.py | 2 +-
 .../mo/front/tf/extractors/expand_dims.py | 2 +-
 .../mo/front/tf/extractors/expand_dims_test.py | 2 +-
 model-optimizer/mo/front/tf/extractors/fused_bn.py | 2 +-
 model-optimizer/mo/front/tf/extractors/identity.py | 2 +-
 .../mo/front/tf/extractors/identity_test.py | 2 +-
 model-optimizer/mo/front/tf/extractors/lrn.py | 2 +-
 model-optimizer/mo/front/tf/extractors/lrn_test.py | 2 +-
 model-optimizer/mo/front/tf/extractors/matmul.py | 2 +-
 .../mo/front/tf/extractors/matmul_test.py | 2 +-
 model-optimizer/mo/front/tf/extractors/mean.py | 2 +-
 .../mo/front/tf/extractors/mean_test.py | 2 +-
 .../mo/front/tf/extractors/native_tf.py | 2 +-
 model-optimizer/mo/front/tf/extractors/pack.py | 2 +-
 .../mo/front/tf/extractors/placeholder.py | 2 +-
 model-optimizer/mo/front/tf/extractors/prod.py | 2 +-
 .../mo/front/tf/extractors/prod_test.py | 2 +-
 .../mo/front/tf/extractors/random_uniform.py | 2 +-
 model-optimizer/mo/front/tf/extractors/range.py | 2 +-
 model-optimizer/mo/front/tf/extractors/reshape.py | 2 +-
 model-optimizer/mo/front/tf/extractors/sigmoid.py | 2 +-
 .../mo/front/tf/extractors/space_to_batch.py | 2 +-
 model-optimizer/mo/front/tf/extractors/split.py | 2 +-
 model-optimizer/mo/front/tf/extractors/squeeze.py | 2 +-
 .../mo/front/tf/extractors/squeeze_test.py | 2 +-
 .../mo/front/tf/extractors/strided_slice.py | 43 +-
 model-optimizer/mo/front/tf/extractors/tanh.py | 2 +-
 .../mo/front/tf/extractors/transpose.py | 2 +-
 model-optimizer/mo/front/tf/extractors/unpack.py | 2 +-
 model-optimizer/mo/front/tf/extractors/utils.py | 2 +-
 .../mo/front/tf/extractors/utils_test.py | 2 +-
 model-optimizer/mo/front/tf/graph_utils.py | 31 +-
 model-optimizer/mo/front/tf/loader.py | 25 +-
 model-optimizer/mo/front/tf/loader_test.py | 2 +-
 model-optimizer/mo/front/tf/partial_infer/tf.py | 10 +-
 model-optimizer/mo/front/tf/register_custom_ops.py | 16 +-
 model-optimizer/mo/front/tf/replacement.py | 22 +-
 model-optimizer/mo/graph/connection.py | 221 +
 model-optimizer/mo/graph/graph.py | 1050 +-
 model-optimizer/mo/graph/graph_test.py | 1213 +-
 model-optimizer/mo/graph/port.py | 275 +
 model-optimizer/mo/main.py | 55 +-
 model-optimizer/mo/main_test.py | 2 +-
 model-optimizer/mo/middle/passes/conv.py | 457 +-
 model-optimizer/mo/middle/passes/conv_test.py | 152 +-
 .../mo/middle/passes/convert_data_type.py | 12 +-
 model-optimizer/mo/middle/passes/debug.py | 4 +-
 model-optimizer/mo/middle/passes/eliminate.py | 120 +-
 model-optimizer/mo/middle/passes/eliminate_test.py | 49 +-
 .../mo/middle/passes/fusing/decomposition.py | 126 +-
 .../mo/middle/passes/fusing/decomposition_test.py | 163 +-
 .../mo/middle/passes/fusing/fuse_grouped_conv.py | 10 +-
 .../mo/middle/passes/fusing/fuse_linear_ops.py | 17 +-
 .../middle/passes/fusing/fuse_linear_ops_test.py | 421 +-
 .../mo/middle/passes/fusing/fuse_linear_seq.py | 9 +-
 .../middle/passes/fusing/fuse_linear_seq_test.py | 82 +-
 model-optimizer/mo/middle/passes/fusing/helpers.py | 2 +-
 .../mo/middle/passes/fusing/helpers_test.py | 55 +-
 .../mo/middle/passes/fusing/mark_unfused_nodes.py | 9 +-
 .../passes/fusing/mark_unfused_nodes_test.py | 18 +-
 .../mo/middle/passes/fusing/resnet_optimization.py | 18 +-
 .../passes/fusing/resnet_optimization_test.py | 2 +-
 model-optimizer/mo/middle/passes/infer.py | 209 +-
 model-optimizer/mo/middle/passes/infer_test.py | 312 +-
 .../mo/middle/passes/l2normalization.py | 25 +-
 model-optimizer/mo/middle/passes/leaky_relu.py | 8 +-
 .../mo/middle/passes/mean_scale_values.py | 6 +-
 .../mo/middle/passes/mean_scale_values_test.py | 53 +-
 model-optimizer/mo/middle/passes/pool.py | 90 -
 model-optimizer/mo/middle/passes/shape.py | 66 +-
 .../mo/middle/passes/shared_weights_duplication.py | 45 -
 model-optimizer/mo/middle/passes/tensor_names.py | 15 +-
 model-optimizer/mo/middle/pattern_match.py | 23 +-
 model-optimizer/mo/middle/replacement.py | 10 +-
 model-optimizer/mo/ops/activation.py | 14 +-
 model-optimizer/mo/ops/activation_test.py | 2 +-
 model-optimizer/mo/ops/clamp.py | 11 +-
 model-optimizer/mo/ops/clamp_test.py | 2 +-
 model-optimizer/mo/ops/concat.py | 3 +-
 model-optimizer/mo/ops/concat_test.py | 2 +-
 model-optimizer/mo/ops/const.py | 5 +-
 model-optimizer/mo/ops/convolution.py | 35 +-
 model-optimizer/mo/ops/convolution_test.py | 41 +-
 model-optimizer/mo/ops/crop.py | 11 +-
 model-optimizer/mo/ops/crop_test.py | 2 +-
 model-optimizer/mo/ops/deconvolution.py | 9 +-
 model-optimizer/mo/ops/eltwise.py | 8 +-
 model-optimizer/mo/ops/eltwise_n.py | 8 +-
 model-optimizer/mo/ops/expand_dims.py | 10 +-
 model-optimizer/mo/ops/flatten.py | 11 +-
 model-optimizer/mo/ops/flatten_onnx.py | 21 +-
 model-optimizer/mo/ops/flatten_onnx_test.py | 2 +-
 model-optimizer/mo/ops/flatten_test.py | 17 +-
 model-optimizer/mo/ops/inner_product.py | 9 +-
 model-optimizer/mo/ops/inner_product_test.py | 2 +-
 model-optimizer/mo/ops/input.py | 8 +-
 model-optimizer/mo/ops/lin_op.py | 16 +-
 model-optimizer/mo/ops/lrn.py | 9 +-
 model-optimizer/mo/ops/memory.py | 10 +-
 model-optimizer/mo/ops/op.py | 66 +-
 model-optimizer/mo/ops/output.py | 12 +-
 model-optimizer/mo/ops/pad.py | 8 +-
 model-optimizer/mo/ops/pad_test.py | 2 +-
 model-optimizer/mo/ops/permute.py | 9 +-
 model-optimizer/mo/ops/permute_test.py | 2 +-
 model-optimizer/mo/ops/pooling.py | 9 +-
 model-optimizer/mo/ops/pooling_test.py | 27 +-
 model-optimizer/mo/ops/power.py | 9 +-
 model-optimizer/mo/ops/power_test.py | 2 +-
 model-optimizer/mo/ops/reduce.py | 11 +-
 model-optimizer/mo/ops/relu.py | 11 +-
 model-optimizer/mo/ops/reshape.py | 14 +-
 model-optimizer/mo/ops/roipooling.py | 6 +-
 model-optimizer/mo/ops/scale_shift.py | 9 +-
 model-optimizer/mo/ops/shape.py | 13 +-
 model-optimizer/mo/ops/slice.py | 18 +-
 model-optimizer/mo/ops/slice_test.py | 2 +-
 model-optimizer/mo/ops/softmax.py | 10 +-
 model-optimizer/mo/ops/split.py | 8 +-
 model-optimizer/mo/ops/squeeze.py | 9 +-
 model-optimizer/mo/ops/strided_slice.py | 114 +
 model-optimizer/mo/ops/strided_slice_test.py | 290 +
 model-optimizer/mo/ops/tile.py | 9 +-
 model-optimizer/mo/ops/tile_test.py | 2 +-
 model-optimizer/mo/ops/unsqueeze.py | 8 +-
 model-optimizer/mo/ops/unsqueeze_test.py | 2 +-
 model-optimizer/mo/pipeline/caffe.py | 131 +-
 model-optimizer/mo/pipeline/common.py | 21 +-
 model-optimizer/mo/pipeline/common_test.py | 2 +-
 model-optimizer/mo/pipeline/kaldi.py | 64 +-
 model-optimizer/mo/pipeline/kaldi_test.py | 41 +-
 model-optimizer/mo/pipeline/mx.py | 125 +-
 model-optimizer/mo/pipeline/onnx.py | 131 +-
 model-optimizer/mo/pipeline/tf.py | 382 +-
 model-optimizer/mo/utils/class_registration.py | 81 +-
 model-optimizer/mo/utils/cli_parser.py | 14 +-
 model-optimizer/mo/utils/cli_parser_test.py | 2 +-
 model-optimizer/mo/utils/convert.py | 2 +-
 .../mo/utils/custom_replacement_config.py | 44 +-
 model-optimizer/mo/utils/dsu.py | 2 +-
 model-optimizer/mo/utils/error.py | 2 +-
 model-optimizer/mo/utils/find_inputs.py | 32 +-
 model-optimizer/mo/utils/graph.py | 22 +-
 model-optimizer/mo/utils/graph_test.py | 32 +-
 model-optimizer/mo/utils/guess_framework.py | 4 +-
 model-optimizer/mo/utils/import_extensions.py | 45 +-
 model-optimizer/mo/utils/logger.py | 2 +-
 model-optimizer/mo/utils/pipeline_config.py | 6 +-
 model-optimizer/mo/utils/pipeline_config_test.py | 2 +-
 model-optimizer/mo/utils/replacement_pattern.py | 5 +-
 model-optimizer/mo/utils/simple_proto_parser.py | 2 +-
 .../mo/utils/simple_proto_parser_test.py | 2 +-
 model-optimizer/mo/utils/str_to.py | 2 +-
 model-optimizer/mo/utils/summarize_graph.py | 2 +-
model-optimizer/mo/utils/summarize_graph_test.py | 2 +-
model-optimizer/mo/utils/tensorboard.py | 2 +-
model-optimizer/mo/utils/unittest/extractors.py | 2 +-
model-optimizer/mo/utils/unittest/graph.py | 41 +-
model-optimizer/mo/utils/unsupported_ops.py | 6 +-
model-optimizer/mo/utils/utils.py | 38 +-
model-optimizer/mo/utils/utils_test.py | 2 +-
model-optimizer/mo/utils/version.py | 2 +-
model-optimizer/mo/utils/version_test.py | 2 +-
model-optimizer/mo/utils/versions_checker.py | 31 +-
model-optimizer/mo_caffe.py | 2 +-
model-optimizer/mo_kaldi.py | 2 +-
model-optimizer/mo_mxnet.py | 2 +-
model-optimizer/mo_onnx.py | 2 +-
model-optimizer/mo_tf.py | 2 +-
model-optimizer/requirements.txt | 2 +
model-optimizer/requirements_caffe.txt | 2 +
model-optimizer/requirements_kaldi.txt | 2 +
model-optimizer/requirements_mxnet.txt | 2 +
model-optimizer/requirements_onnx.txt | 2 +
model-optimizer/requirements_tf.txt | 2 +
model-optimizer/tf_call_ie_layer/build.sh | 8 +-
model-optimizer/version.txt | 3 -
tools/README.md | 69 +
tools/__init__.py | 17 +
tools/accuracy_checker/.pylintrc | 31 +
tools/accuracy_checker/README.md | 60 +
tools/accuracy_checker/__init__.py | 39 +
.../accuracy_checker/accuracy_checker/__init__.py | 17 +
.../accuracy_checker/adapters/README.md | 73 +
.../accuracy_checker/adapters/__init__.py | 79 +
.../adapters/action_recognition.py | 119 +
.../accuracy_checker/adapters/adapter.py | 71 +
.../adapters/attributes_recognition.py | 210 +
.../accuracy_checker/adapters/classification.py | 45 +
.../accuracy_checker/adapters/detection.py | 344 +
.../accuracy_checker/adapters/dummy_adapters.py | 64 +
.../accuracy_checker/adapters/hit_ratio.py | 47 +
.../accuracy_checker/adapters/image_processing.py | 35 +
.../accuracy_checker/adapters/pose_estimation.py | 331 +
.../accuracy_checker/adapters/reidentification.py | 58 +
.../accuracy_checker/adapters/segmentation.py | 83 +
.../accuracy_checker/adapters/text_detection.py | 309 +
.../annotation_converters/README.md | 98 +
.../annotation_converters/__init__.py | 55 +
.../annotation_converters/_reid_common.py | 45 +
.../annotation_converters/brats.py | 53 +
.../annotation_converters/cityscapes.py | 73 +
.../annotation_converters/convert.py | 126 +
.../detection_opencv_storage.py | 114 +
.../annotation_converters/format_converter.py | 108 +
.../annotation_converters/icdar.py | 63 +
.../annotation_converters/imagenet.py | 52 +
.../accuracy_checker/annotation_converters/lfw.py | 111 +
.../annotation_converters/market1501.py | 41 +
.../accuracy_checker/annotation_converters/mars.py | 38 +
.../annotation_converters/ms_coco.py | 129 +
.../annotation_converters/ncf_converter.py | 74 +
.../annotation_converters/pascal_voc.py | 157 +
.../annotation_converters/sample_converter.py | 100 +
.../super_resolution_converter.py | 52 +
.../annotation_converters/vgg_face_regression.py | 64 +
.../annotation_converters/wider.py | 64 +
.../accuracy_checker/config/__init__.py | 48 +
.../accuracy_checker/config/config_reader.py | 281 +
.../accuracy_checker/config/config_validator.py | 339 +
.../accuracy_checker/data_readers/__init__.py | 40 +
.../accuracy_checker/data_readers/data_reader.py | 216 +
tools/accuracy_checker/accuracy_checker/dataset.py | 190 +
.../accuracy_checker/dependency.py | 108 +
.../accuracy_checker/launcher/__init__.py | 34 +
.../launcher/caffe_installation_readme.md | 56 +
.../accuracy_checker/launcher/caffe_launcher.py | 141 +
.../launcher/caffe_launcher_readme.md | 24 +
.../accuracy_checker/launcher/dlsdk_launcher.py | 430 +
.../launcher/dlsdk_launcher_readme.md | 54 +
.../accuracy_checker/launcher/dummy_launcher.py | 69 +
.../accuracy_checker/launcher/input_feeder.py | 138 +
.../accuracy_checker/launcher/launcher.py | 149 +
.../accuracy_checker/launcher/loaders/__init__.py | 26 +
.../accuracy_checker/launcher/loaders/loader.py | 54 +
.../launcher/loaders/pickle_loader.py | 34 +
.../launcher/loaders/xml_loader.py | 29 +
.../accuracy_checker/launcher/model_conversion.py | 196 +
tools/accuracy_checker/accuracy_checker/logging.py | 134 +
tools/accuracy_checker/accuracy_checker/main.py | 216 +
.../accuracy_checker/metrics/README.md | 127 +
.../accuracy_checker/metrics/__init__.py | 92 +
.../accuracy_checker/metrics/average_meter.py | 46 +
.../metrics/character_recognition.py | 36 +
.../accuracy_checker/metrics/classification.py | 107 +
.../accuracy_checker/metrics/coco_metrics.py | 322 +
.../accuracy_checker/metrics/detection.py | 487 +
.../accuracy_checker/metrics/hit_ratio.py | 100 +
.../accuracy_checker/metrics/metric.py | 159 +
.../accuracy_checker/metrics/metric_executor.py | 106 +
.../metrics/multilabel_recognition.py | 189 +
.../accuracy_checker/metrics/overlap.py | 71 +
.../accuracy_checker/metrics/regression.py | 360 +
.../accuracy_checker/metrics/reid.py | 379 +
.../metrics/semantic_segmentation.py | 139 +
.../accuracy_checker/metrics/text_detection.py | 124 +
.../accuracy_checker/model_evaluator.py | 132 +
.../accuracy_checker/postprocessor/README.md | 40 +
.../accuracy_checker/postprocessor/__init__.py | 69 +
.../accuracy_checker/postprocessor/cast_to_int.py | 71 +
.../accuracy_checker/postprocessor/clip_boxes.py | 68 +
.../accuracy_checker/postprocessor/clip_points.py | 68 +
.../postprocessor/clip_segmentation_mask.py | 48 +
.../postprocessor/correct_yolo_v2_boxes.py | 75 +
.../postprocessor/crop_segmentation_mask.py | 49 +
.../postprocessor/encode_segmentation_mask.py | 46 +
.../postprocessor/extend_segmentation_mask.py | 64 +
.../accuracy_checker/postprocessor/filter.py | 319 +
.../accuracy_checker/postprocessor/nms.py | 80 +
.../postprocessor/normalize_landmarks_points.py | 59 +
.../postprocessor/postprocessing_executor.py | 79 +
.../postprocessor/postprocessor.py | 188 +
.../postprocessor/resize_prediction_boxes.py | 40 +
.../postprocessor/resize_segmentation_mask.py | 73 +
.../postprocessor/zoom_segmentation_mask.py | 65 +
.../accuracy_checker/preprocessor/README.md | 51 +
.../accuracy_checker/preprocessor/__init__.py | 51 +
.../preprocessor/preprocessing_executor.py | 52 +
.../accuracy_checker/preprocessor/preprocessors.py | 565 +
.../accuracy_checker/presenters.py | 123 +
.../accuracy_checker/progress_reporters.py | 92 +
.../accuracy_checker/representation/__init__.py | 103 +
.../representation/base_representation.py | 42 +
.../character_recognition_representation.py | 31 +
.../classification_representation.py | 44 +
.../representation/detection_representation.py | 87 +
.../representation/hit_ratio_representation.py | 40 +
.../representation/multilabel_recognition.py | 32 +
.../pose_estimation_representation.py | 63 +
.../representation/regression_representation.py | 72 +
.../representation/reid_representation.py | 42 +
.../representation/representaton_container.py | 78 +
.../representation/segmentation_representation.py | 91 +
.../super_resolution_representation.py | 67 +
.../text_detection_representation.py | 46 +
tools/accuracy_checker/accuracy_checker/utils.py | 361 +
.../configs/face-detection-adas-0001.yml | 94 +
.../configs/face-detection-retail-0004.yml | 98 +
.../configs/face-reidentification-retail-0095.yml | 74 +
.../configs/human-pose-estimation-0001.yml | 114 +
.../configs/landmarks-regression-retail-0009.yml | 82 +
.../person-reidentification-retail-0031.yml | 80 +
.../person-reidentification-retail-0076.yml | 76 +
.../person-reidentification-retail-0079.yml | 76 +
.../configs/text-detection-0002.yml | 110 +
.../configs/text-recognition-0012.yml | 76 +
tools/accuracy_checker/data/test_data/1.jpg | Bin 0 -> 147595 bytes
.../data/test_models/SampLeNet.bin | Bin 0 -> 248024 bytes
.../data/test_models/SampLeNet.caffemodel | Bin 0 -> 248617 bytes
.../data/test_models/SampLeNet.prototxt | 116 +
.../data/test_models/SampLeNet.xml | 239 +
tools/accuracy_checker/pylint_checkers.py | 144 +
tools/accuracy_checker/requirements.txt | 9 +
tools/accuracy_checker/setup.cfg | 8 +
tools/accuracy_checker/tests/__init__.py | 16 +
tools/accuracy_checker/tests/common.py | 139 +
tools/accuracy_checker/tests/conftest.py | 52 +
tools/accuracy_checker/tests/test_adapters.py | 121 +
.../accuracy_checker/tests/test_caffe_launcher.py | 77 +
tools/accuracy_checker/tests/test_config_reader.py | 1014 +
.../tests/test_config_validator.py | 379 +
tools/accuracy_checker/tests/test_dataset.py | 191 +
tools/accuracy_checker/tests/test_dependency.py | 89 +
.../tests/test_detection_metrics.py | 459 +
.../accuracy_checker/tests/test_dlsdk_launcher.py | 980 +
tools/accuracy_checker/tests/test_input_feeder.py | 255 +
.../tests/test_metric_evaluator.py | 549 +
.../tests/test_model_conversion.py | 80 +
.../accuracy_checker/tests/test_model_evaluator.py | 143 +
tools/accuracy_checker/tests/test_postprocessor.py | 1070 +
tools/accuracy_checker/tests/test_preprocessor.py | 610 +
tools/accuracy_checker/tests/test_presenter.py | 348 +
.../tests/test_regression_metrics.py | 338 +
tools/accuracy_checker/tests/test_reid_metrics.py | 77 +
.../tests/test_segmentation_metrics.py | 164 +
tools/accuracy_checker/tests/test_utils.py | 127 +
tools/benchmark/README.md | 31 +
tools/benchmark/__init__.py | 26 +
tools/benchmark/__main__.py | 28 +
tools/benchmark/benchmark.py | 157 +
tools/benchmark/command_line_reader.py | 155 +
tools/benchmark/configuration.py | 64 +
tools/benchmark/logging.py | 125 +
tools/benchmark/requirements.txt | 8 +
tools/calibration/README.md | 33 +
tools/calibration/__init__.py | 34 +
tools/calibration/__main__.py | 79 +
tools/calibration/aggregated_statistics.py | 170 +
tools/calibration/base_calibrator.py | 556 +
tools/calibration/calibration_configuration.py | 150 +
tools/calibration/calibration_metrics.py | 30 +
tools/calibration/calibrator.py | 255 +
tools/calibration/calibrator_configuration.py | 66 +
tools/calibration/calibrator_factory.py | 31 +
tools/calibration/command_line_processor.py | 142 +
tools/calibration/command_line_reader.py | 209 +
tools/calibration/fp16_calibrator.py | 31 +
tools/calibration/infer_raw_results.py | 72 +
tools/calibration/inference_result.py | 85 +
tools/calibration/int8_calibrator.py | 34 +
tools/calibration/layer_accuracy_drop/__init__.py | 21 +
.../layer_accuracy_drop/collector_by_image.py | 128 +
.../layer_accuracy_drop/collector_by_layer.py | 184 +
tools/calibration/layer_accuracy_drop_info.py | 36 +
tools/calibration/layers/__init__.py | 15 +
tools/calibration/logging.py | 159 +
tools/calibration/network_node_stats.py | 26 +
tools/calibration/nrmsd.py | 38 +
tools/calibration/requirements.txt | 8 +
tools/calibration/shape.py | 121 +
tools/calibration/single_layer_network.py | 85 +
tools/calibration/top_results.py | 37 +
tools/network.py | 111 +
tools/utils/__init__.py | 22 +
tools/utils/biases.py | 29 +
tools/utils/building/__init__.py | 17 +
tools/utils/building/layer.py | 157 +
tools/utils/building/network_builder.py | 51 +
tools/utils/building/port.py | 20 +
tools/utils/configuration_filter.py | 74 +
tools/utils/connection.py | 34 +
tools/utils/edge.py | 39 +
tools/utils/layer.py | 99 +
tools/utils/network_info.py | 123 +
tools/utils/path.py | 67 +
tools/utils/port.py | 29 +
tools/utils/tensor_desc.py | 19 +
tools/utils/weights.py | 29 +
3639 files changed, 266956 insertions(+), 64512 deletions(-)
create mode 100644 inference-engine/cmake/cpplint.cmake
create mode 100644 inference-engine/cmake/cpplint_html.cmake
create mode 100644 inference-engine/cmake/cpplint_merge.cmake
create mode 100644 inference-engine/cmake/cpplint_run.cmake
create mode 100644 inference-engine/cmake/cpplint_to_cppcheck_xml.cmake
rename model-optimizer/mo/front/tf/extractors/sum.py => inference-engine/ie_bridges/python/sample/benchmark_app/benchmark/__init__.py (69%)
rename inference-engine/ie_bridges/python/sample/benchmark_app/{ => benchmark}/benchmark.py (98%)
create mode 100644 inference-engine/ie_bridges/python/sample/benchmark_app/benchmark/utils/__init__.py
rename inference-engine/ie_bridges/python/sample/benchmark_app/{ => benchmark}/utils/benchmark_utils.py (62%)
rename inference-engine/ie_bridges/python/sample/benchmark_app/{ => benchmark}/utils/constants.py (64%)
create mode 100644 inference-engine/ie_bridges/python/sample/benchmark_app/benchmark_app.py
create mode 100644 inference-engine/ie_bridges/python/sample/classification_sample/README.md
rename inference-engine/ie_bridges/python/sample/{ => classification_sample}/classification_sample.py (62%)
create mode 100644 inference-engine/ie_bridges/python/sample/classification_sample_async/README.md
rename inference-engine/ie_bridges/python/sample/{ => classification_sample_async}/classification_sample_async.py (65%)
delete mode 100644 inference-engine/ie_bridges/python/sample/greengrass_samples/Greengrass-FaaS-User-Guide.docx
delete mode 100644 inference-engine/ie_bridges/python/sample/greengrass_samples/README.md
delete mode 100644 inference-engine/ie_bridges/python/sample/greengrass_samples/greengrass_classification_sample.py
delete mode 100644 inference-engine/ie_bridges/python/sample/greengrass_samples/greengrass_object_detection_sample_ssd.py
delete mode 100644 inference-engine/ie_bridges/python/sample/jupyter_notebooks/classification_demo/classification_demo.ipynb
delete mode 100644 inference-engine/ie_bridges/python/sample/jupyter_notebooks/classification_demo/image_net_synset.txt
create mode 100644 inference-engine/ie_bridges/python/sample/style_transfer_sample/README.md
rename inference-engine/ie_bridges/python/sample/{ => style_transfer_sample}/style_transfer_sample.py (68%)
delete mode 100644 inference-engine/ie_bridges/python/sample/voc_labels.txt
delete mode 100644 inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/CMakeLists.txt
delete mode 100644 inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/__init__.py
delete mode 100644 inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder.pxd
delete mode 100644 inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder.pyx
delete mode 100644 inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl.cpp
delete mode 100644 inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl.hpp
delete mode 100644 inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl_defs.pxd
create mode 100644 inference-engine/include/builders/ie_gru_sequence_layer.hpp
rename inference-engine/include/builders/{ie_layer_fragment.hpp => ie_layer_decorator.hpp} (50%)
create mode 100644 inference-engine/include/builders/ie_lrn_layer.hpp
create mode 100644 inference-engine/include/builders/ie_lstm_sequence_layer.hpp
create mode 100644 inference-engine/include/builders/ie_resample_layer.hpp
create mode 100644 inference-engine/include/builders/ie_rnn_sequence_layer.hpp
rename inference-engine/include/{ie_inetwork.hpp => ie_network.hpp} (70%)
create mode 100644 inference-engine/include/vpu/vpu_plugin_config.hpp
delete mode 100644 inference-engine/samples/benchmark_app/benchmark_app.h
create mode 100644 inference-engine/samples/benchmark_app/benchmark_app.hpp
create mode 100644 inference-engine/samples/benchmark_app/infer_request_wrap.hpp
create mode 100644 inference-engine/samples/benchmark_app/progress_bar.hpp
create mode 100644 inference-engine/samples/benchmark_app/statistics_report.cpp
create mode 100644 inference-engine/samples/benchmark_app/statistics_report.hpp
delete mode 100644 inference-engine/samples/build_samples.sh
create mode 100644 inference-engine/samples/common/samples/classification_results.h
rename inference-engine/samples/{validation_app => common/samples}/console_progress.hpp (95%)
rename inference-engine/samples/{validation_app => common/samples}/csv_dumper.hpp (98%)
delete mode 100644 inference-engine/samples/create_msvc2015_solution.bat
delete mode 100644 inference-engine/samples/create_msvc2017_solution.bat
delete mode 100644 inference-engine/src/extension/common/matrixmult.h
create mode 100644 inference-engine/src/extension/ext_depth_to_space.cpp
create mode 100644 inference-engine/src/extension/ext_detectionoutput_onnx.cpp
create mode 100644 inference-engine/src/extension/ext_expand.cpp
create mode 100644 inference-engine/src/extension/ext_fill.cpp
create mode 100644 inference-engine/src/extension/ext_priorgridgenerator_onnx.cpp
create mode 100644 inference-engine/src/extension/ext_proposal_onnx.cpp
create mode 100644 inference-engine/src/extension/ext_range.cpp
create mode 100644 inference-engine/src/extension/ext_reverse_sequence.cpp
create mode 100644 inference-engine/src/extension/ext_roifeatureextractor_onnx.cpp
create mode 100644 inference-engine/src/extension/ext_shuffle_channels.cpp
create mode 100644 inference-engine/src/extension/ext_space_to_depth.cpp
delete mode 100644 inference-engine/src/extension/ext_spatial_transformer.cpp
create mode 100644 inference-engine/src/extension/ext_squeeze.cpp
create mode 100644 inference-engine/src/extension/ext_strided_slice.cpp
create mode 100644 inference-engine/src/extension/ext_topkrois_onnx.cpp
create mode 100644 inference-engine/src/extension/ext_unsqueeze.cpp
create mode 100644 inference-engine/src/gna_plugin/gna_plugin_policy.hpp
create mode 100644 inference-engine/src/inference_engine/builders/ie_gru_sequence_layer.cpp
create mode 100644 inference-engine/src/inference_engine/builders/ie_layer_decorator.cpp
delete mode 100644 inference-engine/src/inference_engine/builders/ie_layer_fragment.cpp
create mode 100644 inference-engine/src/inference_engine/builders/ie_lrn_layer.cpp
create mode 100644 inference-engine/src/inference_engine/builders/ie_lstm_sequence_layer.cpp
create mode 100644 inference-engine/src/inference_engine/builders/ie_resample_layer.cpp
create mode 100644 inference-engine/src/inference_engine/builders/ie_rnn_sequence_layer.cpp
create mode 100644 inference-engine/src/inference_engine/exec_graph_info.hpp
create mode 100644 inference-engine/src/inference_engine/ie_cnn_layer_builder.cpp
delete mode 100644 inference-engine/src/inference_engine/ie_layers_prv.h
delete mode 100644 inference-engine/src/inference_engine/ie_network.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/built-in/ie_bin_conv_shape_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/built-in/ie_depth_to_space_shape_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/built-in/ie_expand_shape_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/built-in/ie_fill_shape_infer.hpp
rename inference-engine/src/inference_engine/shape_infer/built-in/{ie_spatial_transformer_shape_infer.hpp => ie_quantize_shape_infer.hpp} (53%)
create mode 100644 inference-engine/src/inference_engine/shape_infer/built-in/ie_range_shape_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/built-in/ie_reverse_sequence_shape_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/built-in/ie_rnn_cell_shape_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/built-in/ie_rnn_shape_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/built-in/ie_shape_shape_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/built-in/ie_shuffle_channels_shape_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/built-in/ie_space_to_depth_shape_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/built-in/ie_squeeze_shape_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/built-in/ie_strided_slice_shape_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/built-in/ie_tensor_iterator_shape_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/built-in/ie_unsqueeze_shape_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/const_infer/ie_add_const_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/const_infer/ie_concat_const_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_const_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_holder.cpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_holder.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_impl.cpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_impl.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/const_infer/ie_div_const_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/const_infer/ie_fill_const_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/const_infer/ie_gather_const_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/const_infer/ie_in_place_const_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/const_infer/ie_mul_const_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/const_infer/ie_power_const_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/const_infer/ie_range_const_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/const_infer/ie_reshape_const_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/const_infer/ie_shape_const_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/const_infer/ie_split_const_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/const_infer/ie_strided_slice_const_infer.hpp
create mode 100644 inference-engine/src/inference_engine/shape_infer/const_infer/ie_tile_const_infer.hpp
create mode 100644 inference-engine/src/inference_engine/transform/transform_network.cpp
create mode 100644 inference-engine/src/inference_engine/transform/transform_network.hpp
create mode 100644 inference-engine/src/inference_engine/transform/transformation.cpp
create mode 100644 inference-engine/src/inference_engine/transform/transformation.hpp
create mode 100644 inference-engine/src/inference_engine/transform/transformations/eltwise_broadcast.cpp
create mode 100644 inference-engine/src/inference_engine/transform/transformations/eltwise_broadcast.hpp
create mode 100644 inference-engine/src/inference_engine/transform/transformations/lrn.cpp
create mode 100644 inference-engine/src/inference_engine/transform/transformations/lrn.hpp
create mode 100644 inference-engine/src/inference_engine/transform/transformations/sub.cpp
create mode 100644 inference-engine/src/inference_engine/transform/transformations/sub.hpp
create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp
create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.h
create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp
create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.h
create mode 100644 inference-engine/tests/libs/gtest/googlemock/msvc/2005/gmock.sln
create mode 100644 inference-engine/tests/libs/gtest/googlemock/msvc/2010/gmock.sln
create mode 100644 inference-engine/tests/libs/gtest/googlemock/msvc/2010/gmock.vcxproj
create mode 100644 inference-engine/tests/libs/gtest/googlemock/msvc/2010/gmock_main.vcxproj
create mode 100644 inference-engine/tests/libs/gtest/googlemock/msvc/2010/gmock_test.vcxproj
create mode 100644 inference-engine/tests/libs/gtest/googlemock/msvc/2015/gmock.sln
create mode 100644 inference-engine/tests/libs/gtest/googlemock/msvc/2015/gmock.vcxproj
create mode 100644 inference-engine/tests/libs/gtest/googlemock/msvc/2015/gmock_main.vcxproj
create mode 100644 inference-engine/tests/libs/gtest/googlemock/msvc/2015/gmock_test.vcxproj
create mode 100644 inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest-md.sln
create mode 100644 inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest-md.vcxproj
create mode 100644 inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest-md.vcxproj.filters
create mode 100644 inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest.sln
create mode 100644 inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest.vcxproj
create mode 100644 inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest.vcxproj.filters
create mode 100644 inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_main-md.vcxproj
create mode 100644 inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_main-md.vcxproj.filters
create mode 100644 inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_main.vcxproj
create mode 100644 inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_main.vcxproj.filters
create mode 100644 inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_prod_test-md.vcxproj
create mode 100644 inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_prod_test-md.vcxproj.filters
create mode 100644 inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_prod_test.vcxproj
create mode 100644 inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_prod_test.vcxproj.filters
create mode 100644 inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_unittest-md.vcxproj
create mode 100644 inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_unittest-md.vcxproj.filters
create mode 100644 inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_unittest.vcxproj
create mode 100644 inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_unittest.vcxproj.filters
create mode 100644 inference-engine/tests/libs/gtest/googletest/xcode/Config/DebugProject.xcconfig
create mode 100644 inference-engine/tests/libs/gtest/googletest/xcode/Config/FrameworkTarget.xcconfig
create mode 100644 inference-engine/tests/libs/gtest/googletest/xcode/Config/General.xcconfig
create mode 100644 inference-engine/tests/libs/gtest/googletest/xcode/Config/ReleaseProject.xcconfig
create mode 100644 inference-engine/tests/libs/gtest/googletest/xcode/Config/StaticLibraryTarget.xcconfig
create mode 100644 inference-engine/tests/libs/gtest/googletest/xcode/Config/TestTarget.xcconfig
create mode 100644 inference-engine/tests/libs/gtest/googletest/xcode/Resources/Info.plist
create mode 100644 inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/Info.plist
create mode 100644 inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/WidgetFramework.xcodeproj/project.pbxproj
create mode 100644 inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/runtests.sh
create mode 100644 inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/widget.cc
create mode 100644 inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/widget.h
create mode 100644 inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/widget_test.cc
create mode 100644 inference-engine/tests/libs/gtest/googletest/xcode/Scripts/runtests.sh
create mode 100644 inference-engine/tests/libs/gtest/googletest/xcode/Scripts/versiongenerate.py
create mode 100644 inference-engine/tests/libs/gtest/googletest/xcode/gtest.xcodeproj/project.pbxproj
create mode 100644 inference-engine/tests/unit/builders/argmax_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/clamp_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/concat_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/const_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/convolution_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/crop_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/ctc_greedy_decoder_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/deconvolution_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/detection_output_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/eltwise_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/elu_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/mvn_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/norm_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/normalize_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/output_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/relu6_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/relu_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/resample_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/split_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/tanh_layer_test.cpp
create mode 100644 inference-engine/tests/unit/builders/transform_network_test.cpp
create mode 100644 inference-engine/tests/unit/cnn_network/cnn_layer_validation_tests.cpp
create mode 100644 inference-engine/tests/unit/cnn_network/layer_builder.h
create mode 100644 inference-engine/tests/unit/cnn_network/parameters.h
create mode 100644 inference-engine/tests/unit/cnn_network/shapes.h
create mode 100644 inference-engine/tests/unit/engines/gna/fp32_non_quantized_tests.cpp
create mode 100644 inference-engine/tests/unit/engines/gna/gna_aminteldnn_test.cpp
create mode 100644 inference-engine/tests/unit/engines/gna/gna_cppwraper_test.cpp
create mode 100644 inference-engine/tests/unit/engines/gna/gna_input_precision_test.cpp
create mode 100644 inference-engine/tests/unit/engines/gna/matchers/fill_with_data.hpp
create mode 100644 inference-engine/tests/unit/engines/gna/matchers/input_data_matcher.hpp
create mode 100644 inference-engine/tests/unit/engines/gna/matchers/weights_matcher.hpp
create mode 100644 inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/depth_to_space_tests.cpp
create mode 100644 inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/expand_tests.cpp
create mode 100644 inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/fill_tests.cpp
create mode 100644 inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/range_tests.cpp
create mode 100644 inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/reverse_sequence_tests.cpp
create mode 100644 inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/shuffle_channels_tests.cpp
create mode 100644 inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/squeeze_tests.cpp
create mode 100644 inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/strided_slice_tests.cpp
create mode 100644 inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/unsqueeze_tests.cpp
create mode 100644 inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_deconv_concat_tets.cpp
create mode 100644 inference-engine/tests/unit/inference_engine_tests/normalization/latest_in_fuse_test.cpp
create mode 100644 inference-engine/tests/unit/inference_engine_tests/parameter_tests.cpp
create mode 100644 inference-engine/tests/unit/inference_engine_tests/util_const_infer_test.cpp
create mode 100644 inference-engine/tests/unit/inference_engine_tests/util_const_infer_test.hpp
create mode 100644 inference-engine/tests/unit/inference_engine_tests/util_test.hpp
create mode 100644 inference-engine/tests/unit/opencv_test_gapi/fluid_test_computations/CMakeLists.txt
create mode 100644 inference-engine/tests/unit/opencv_test_gapi/fluid_test_computations/fluid_test_computations.cpp
create mode 100644 inference-engine/tests/unit/opencv_test_gapi/fluid_test_computations/fluid_test_computations.hpp
create mode 100644 inference-engine/tests/unit/shape_infer/adult_test.cpp create mode 100644 inference-engine/tests/unit/shape_infer/adult_test.hpp create mode 100644 inference-engine/tests/unit/shape_infer/adult_test_utils.cpp create mode 100644 inference-engine/tests/unit/shape_infer/adult_test_utils.hpp delete mode 100644 inference-engine/tests/unit/shape_infer/cpu_ext_shape_infer_general_test.cpp create mode 100644 inference-engine/tests/unit/transformations/eltwise_broadcast_test.cpp create mode 100644 inference-engine/tests/unit/transformations/sub_test.cpp create mode 100644 inference-engine/tests/unit/transformations/tranformations_test.hpp create mode 100644 inference-engine/tests/validation_app/CMakeLists.txt create mode 100644 inference-engine/thirdparty/clDNN/.gitignore create mode 100644 inference-engine/thirdparty/clDNN/api/C/condition.h create mode 100644 inference-engine/thirdparty/clDNN/api/C/contract.h create mode 100644 inference-engine/thirdparty/clDNN/api/C/depth_to_space.h create mode 100644 inference-engine/thirdparty/clDNN/api/C/detection_output_sort.h create mode 100644 inference-engine/thirdparty/clDNN/api/C/gather.h create mode 100644 inference-engine/thirdparty/clDNN/api/C/one_hot.h rename inference-engine/thirdparty/clDNN/{kernel_selector/core/cache/cache_SKL_GT2.cpp => api/C/pyramid_roi_align.h} (68%) create mode 100644 inference-engine/thirdparty/clDNN/api/C/reverse_sequence.h create mode 100644 inference-engine/thirdparty/clDNN/api/C/shuffle_channels.h create mode 100644 inference-engine/thirdparty/clDNN/api/C/strided_slice.h create mode 100644 inference-engine/thirdparty/clDNN/api/CPP/condition.hpp create mode 100644 inference-engine/thirdparty/clDNN/api/CPP/contract.hpp create mode 100644 inference-engine/thirdparty/clDNN/api/CPP/depth_to_space.hpp create mode 100644 inference-engine/thirdparty/clDNN/api/CPP/gather.hpp create mode 100644 inference-engine/thirdparty/clDNN/api/CPP/one_hot.hpp create mode 100644 inference-engine/thirdparty/clDNN/api/CPP/pyramid_roi_align.hpp create mode 100644 inference-engine/thirdparty/clDNN/api/CPP/reverse_sequence.hpp create mode 100644 inference-engine/thirdparty/clDNN/api/CPP/shuffle_channels.hpp create mode 100644 inference-engine/thirdparty/clDNN/api/CPP/strided_slice.hpp create mode 100644 inference-engine/thirdparty/clDNN/api_extension/C/fused_conv_bn_scale.h create mode 100644 inference-engine/thirdparty/clDNN/api_extension/C/fused_conv_eltwise.h create mode 100644 inference-engine/thirdparty/clDNN/api_extension/CPP/fused_conv_bn_scale.hpp create mode 100644 inference-engine/thirdparty/clDNN/api_extension/CPP/fused_conv_eltwise.hpp create mode 100644 inference-engine/thirdparty/clDNN/common/boost/1.64.0/include/boost-1_64/boost/make_unique.hpp create mode 100644 inference-engine/thirdparty/clDNN/common/boost/1.64.0/include/boost-1_64/boost/smart_ptr/make_unique.hpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.cpp create mode 100644 
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_opt.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_opt.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16_2_sg.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16_2_sg.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_1x1.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_1x1.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_3x3.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_3x3.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_7x7.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_7x7.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.h create 
mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_selector.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_selector.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_base.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_base.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_selector.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_selector.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_sort.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_sort.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_imad.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_imad.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.h rename inference-engine/thirdparty/clDNN/kernel_selector/core/{cache/cache_SKL_GT2_B32_B64.cpp => actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.cpp} (58%) create mode 100644 
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_1x1_opt.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_1x1_opt.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_os_iyx_osv16.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_os_iyx_osv16.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_selector.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_selector.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_yxfb_yxio_b16.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_yxfb_yxio_b16.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_selector.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_selector.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemv_gpu_subgroup1x64_bfyx_ff_SIMD16.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemv_gpu_subgroup1x64_bfyx_ff_SIMD16.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemv_gpu_subgroup1x64_bfyx_hh_SIMD16.cpp create mode 100644 
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemv_gpu_subgroup1x64_bfyx_hh_SIMD16.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_base.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_base.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_ref.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_ref.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_selector.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_selector.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_base.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_base.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_ref.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_ref.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_selector.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_selector.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_selector.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_selector.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_base.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_base.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_ps_ref.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_ps_ref.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_ref.cpp create mode 100644 
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_ref.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_selector.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_selector.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_selector.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/auto_tuner_offline.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/auto_tuner_offline.h create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache.json delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_APL.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_GT3_B1.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_ICL_B1_B16.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT2_B1_B16.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT2_B8.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT4e.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT4e_B1_B16.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT4e_B32_B64.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT4e_B8.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/contract_ref.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_1x1_opt.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16_2_sg.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byx8_f4__fs_bs_yx_bsv4_fsv32.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_fs_bs_yx_bsv4_fsv32.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_imad.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_128x128wg_slm_int8.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_224x128wg_slm_int8.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_slm_int8.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block_1x1.cl create mode 100644 
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_2x14_rep4.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_7x7_rep4.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/depth_to_space_ref.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/detection_output.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/detection_output_sort.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_b_fs_yx_fsv4.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_imad.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_bn_scale_kernel_ref.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_bfyx_1x1_opt_fp32.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_bfyx_os_iyx_osv16.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_gemm_fp32.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_128x128wg_slm_int8.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_224x128wg_slm_int8.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_yxfb_yxio_b16_fp16.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gather_ref.cl rename inference-engine/thirdparty/clDNN/kernel_selector/core/{cache/cache_GT3.cpp => cl_kernels/include/arg_max_min_common.cl} (73%) create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/detection_output_common.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/imad.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lstm_gemv_gpu_subgroup1x64_bfyx_ff_SIMD16.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lstm_gemv_gpu_subgroup1x64_bfyx_hh_SIMD16.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/one_hot_ref.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pyramid_roi_align_gpu_ref.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_byxf_f32_to_byx8_f4_i8.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reverse_sequence_ref.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/roi_pooling_ps_ref.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/shuffle_channels_ref.cl create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/strided_slice_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/src/caps/public/gpu_devices.inc delete mode 100644 inference-engine/thirdparty/clDNN/src/caps/public/gpu_enums.inc create mode 100644 inference-engine/thirdparty/clDNN/src/condition.cpp delete mode 100644 
inference-engine/thirdparty/clDNN/src/constants_propagator.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/contract.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/depth_to_space.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/fused_conv_bn_scale.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/fused_conv_eltwise.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/gather.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/gpu/command_queues_builder.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/gpu/command_queues_builder.h create mode 100644 inference-engine/thirdparty/clDNN/src/gpu/condition_gpu.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/gpu/confiugration.h create mode 100644 inference-engine/thirdparty/clDNN/src/gpu/contract_gpu.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/gpu/depth_to_space_gpu.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/gpu/detection_output_cpu.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/gpu/events_pool.h create mode 100644 inference-engine/thirdparty/clDNN/src/gpu/fused_conv_bn_scale_gpu.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/gpu/fused_conv_eltwise_gpu.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/gpu/gather_gpu.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/gpu/ocl_builder.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/gpu/ocl_builder.h create mode 100644 inference-engine/thirdparty/clDNN/src/gpu/one_hot_gpu.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/gpu/pyramid_roi_align_gpu.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/gpu/reverse_sequence_gpu.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/gpu/shuffle_channels_gpu.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/gpu/strided_slice_gpu.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/add_required_reorders.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/add_reshape_to_primitives.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/calculate_prior_boxes.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/compile_graph.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/eltwise_remove_stride.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/eltwise_shrinking.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/graph_initializations.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/handle_input_padding.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/mark_nodes.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/post_optimize_weights.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/pre_optimize_bias.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/prep_opt_depthwise_sep_post.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_buffer_fusing.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_depthwise_sep_opt.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_padding.cpp create mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp create mode 100644 
 create mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/reorder_inputs.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/trim_to_outputs.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/src/include/condition_inst.h
 delete mode 100644 inference-engine/thirdparty/clDNN/src/include/constants_propagator.h
 create mode 100644 inference-engine/thirdparty/clDNN/src/include/contract_inst.h
 create mode 100644 inference-engine/thirdparty/clDNN/src/include/depth_to_space_inst.h
 create mode 100644 inference-engine/thirdparty/clDNN/src/include/fused_conv_bn_scale_inst.h
 create mode 100644 inference-engine/thirdparty/clDNN/src/include/fused_conv_eltwise_inst.h
 create mode 100644 inference-engine/thirdparty/clDNN/src/include/gather_inst.h
 create mode 100644 inference-engine/thirdparty/clDNN/src/include/one_hot_inst.h
 create mode 100644 inference-engine/thirdparty/clDNN/src/include/pass_manager.h
 create mode 100644 inference-engine/thirdparty/clDNN/src/include/program_helpers.h
 create mode 100644 inference-engine/thirdparty/clDNN/src/include/pyramid_roi_align_inst.h
 create mode 100644 inference-engine/thirdparty/clDNN/src/include/reverse_sequence_inst.h
 create mode 100644 inference-engine/thirdparty/clDNN/src/include/shuffle_channels_inst.h
 create mode 100644 inference-engine/thirdparty/clDNN/src/include/strided_slice_inst.h
 delete mode 100644 inference-engine/thirdparty/clDNN/src/include/xml_object.h
 create mode 100644 inference-engine/thirdparty/clDNN/src/nodes_ordering.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/src/one_hot.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/src/program_helpers.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/src/pyramid_roi_align.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/src/reverse_sequence.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/src/shuffle_channels.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/src/strided_slice.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/tests/module_tests/events_pool_test.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/add_reorders_gpu_test.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/command_queue_test.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/condition_gpu_test.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/contract_gpu_test.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/depth_to_space_gpu_test.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/fused_conv_eltwise_gpu_test.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/gather_gpu_test.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/one_hot_gpu_test.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/propagate_constants_gpu_test.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/pyramid_roi_align_gpu_test.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/reverse_sequence_gpu_test.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/shuffle_channels_test.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/strided_slice_gpu_test.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/trim_to_outputs_gpu_test.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/tests_core_internal/CMakeLists.txt
 rename inference-engine/thirdparty/clDNN/{kernel_selector/core/cache/cache_ICL.cpp => tests_core_internal/main.cpp} (70%)
 create mode 100644 inference-engine/thirdparty/clDNN/tests_core_internal/program_impl_wrapper.h
 create mode 100644 inference-engine/thirdparty/clDNN/tests_core_internal/test_cases/graph_manipulation_gpu_test.cpp
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/allocators.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/cursorstreamwrapper.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/document.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/encodedstream.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/encodings.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/error/en.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/error/error.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/filereadstream.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/filewritestream.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/fwd.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/internal/biginteger.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/internal/diyfp.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/internal/dtoa.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/internal/ieee754.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/internal/itoa.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/internal/meta.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/internal/pow10.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/internal/regex.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/internal/stack.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/internal/strfunc.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/internal/strtod.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/internal/swap.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/istreamwrapper.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/memorybuffer.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/memorystream.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/msinttypes/inttypes.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/msinttypes/stdint.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/ostreamwrapper.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/pointer.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/prettywriter.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/rapidjson.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/reader.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/schema.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/stream.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/stringbuffer.h
 create mode 100644 inference-engine/thirdparty/clDNN/utils/rapidjson/writer.h
 create mode 100644 inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/ocl/core.hpp
 create mode 100644 inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/ocl/goclkernel.hpp
 create mode 100644 inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/ocl/imgproc.hpp
 create mode 100644 inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclbackend.cpp
 create mode 100644 inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclbackend.hpp
 create mode 100644 inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclcore.cpp
 create mode 100644 inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclcore.hpp
 create mode 100644 inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclimgproc.cpp
 create mode 100644 inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclimgproc.hpp
 create mode 100644 inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclkernel.cpp
 create mode 100644 inference-engine/thirdparty/fluid/modules/gapi/test/gapi_gpu_test.cpp
 create mode 100644 inference-engine/thirdparty/fluid/modules/gapi/test/opencl_kernels_test_gapi.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/cmake/config.cmake.in
 create mode 100644 inference-engine/thirdparty/mkl-dnn/cmake/template.vcxproj.user
 create mode 100644 inference-engine/thirdparty/mkl-dnn/cmake/version.cmake
 create mode 100644 inference-engine/thirdparty/mkl-dnn/doc/winograd_convolution.md
 create mode 100644 inference-engine/thirdparty/mkl-dnn/examples/simple_rnn_int8.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/include/mkldnn_version.h.in
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/common/binarization.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/common/binarization_pd.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/common/binary_convolution.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/common/binary_convolution_pd.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/common/convolution_pd.cpp
 delete mode 100644 inference-engine/thirdparty/mkl-dnn/src/common/convolution_relu.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/common/memory_tracking.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_binarization_pd.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_binary_convolution_pd.hpp
 rename inference-engine/thirdparty/mkl-dnn/{tests/gtests/test_convolution_relu_forward_f32.cpp => src/cpu/cpu_primitive.cpp} (60%)
 rename inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/{gemm_utils.cpp => f32/gemm_utils_f32.cpp} (95%)
 rename inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/{gemm_utils.hpp => f32/gemm_utils_f32.hpp} (89%)
 rename inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/{ => f32}/jit_avx512_common_gemm_f32.cpp (91%)
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/jit_avx512_common_gemm_f32.hpp
 rename inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/{ => f32}/jit_avx_gemm_f32.cpp (93%)
 rename inference-engine/thirdparty/mkl-dnn/{tests/gtests/test_convolution_relu_forward_s16s16s32.cpp => src/cpu/gemm/f32/jit_avx_gemm_f32.hpp} (64%)
 rename inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/{ref_gemm.cpp => f32/ref_gemm_f32.cpp} (80%)
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/ref_gemm_f32.hpp
 delete mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/jit_avx512_common_gemm_f32.hpp
 delete mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/jit_avx_gemm_f32.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/common.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/gemv.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8s8s32.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8s8s32.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8u8s32.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8u8s32.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8u8s32_kern.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8u8s32_kern.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemv_s8u8s32.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_kernel_gemv_s8u8s32_kern.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_kernel_gemv_s8u8s32_kern.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_an_kern.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_at_kern.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_bn_kern.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_bt_kern.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_sum_an_kern.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_sum_at_kern.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_sum_bn_kern.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_sum_bt_kern.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/ref_gemm_s8x8s32.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/ref_gemm_s8x8s32.hpp
 delete mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_u8s8s32x_inner_product.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_x8s8s32x_inner_product.cpp
 rename inference-engine/thirdparty/mkl-dnn/src/cpu/{gemm_u8s8s32x_inner_product.hpp => gemm_x8s8s32x_inner_product.hpp} (56%)
 delete mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_i8i8_pooling.cpp
 delete mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_deconvolution.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_deconvolution.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_deconvolution.cpp
 rename inference-engine/thirdparty/mkl-dnn/src/cpu/{jit_avx512_core_u8s8s32x_deconvolution.hpp => jit_avx512_core_x8s8s32x_deconvolution.hpp} (63%)
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_i8i8_pooling.cpp
 rename inference-engine/thirdparty/mkl-dnn/src/cpu/{jit_avx512_core_i8i8_pooling.hpp => jit_sse42_i8i8_pooling.hpp} (81%)
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_bin_conv_kernel.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_bin_conv_kernel.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_binarization.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_binarization.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_binary_convolution.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_binary_convolution.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_planar_conv_kernel_f32.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_planar_conv_kernel_f32.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_planar_convolution.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_planar_convolution.hpp
 delete mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_conv_kernel.cpp
 delete mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_conv_kernel.hpp
 delete mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_convolution.cpp
 delete mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_convolution.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/ref_binarization.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/ref_binarization.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/ref_binary_convolution.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/ref_binary_convolution.hpp
 delete mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/ref_rnn.cpp
 delete mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/ref_rnn.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_common.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_gru.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_gru_lbr.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_lstm.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_rnn.cpp
 rename inference-engine/thirdparty/mkl-dnn/src/cpu/{ => rnn}/cpu_rnn_pd.hpp (63%)
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/jit_uni_rnn_postgemm.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/ref_rnn.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/ref_rnn.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/rnn_reorders.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/rnn_utils.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/rnn_utils.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_auto
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_regression_gemm
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_ssd_mobilenet
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/deconv_1x1
 rename inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/{ => deconv}/deconv_2d (92%)
 rename inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/{ => deconv}/deconv_3d (100%)
 rename inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/{ => deconv}/deconv_all (61%)
 rename inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/{dilated_deconv => deconv/deconv_dilated} (100%)
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/test_deconv_1x1
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/test_deconv_all
 delete mode 100644 inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_deconv_all
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binarization.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_binarization_forward.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_depthwise_forward.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_binarization_forward.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_depthwise_forward.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_eltwise_forward.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_forward.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_forward_common.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_sum_forward.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_eltwise_forward.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_forward.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_forward_common.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_sum_forward.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_depthwise_forward_common.hpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_depthwise_forward_x8s8f32s32.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_dw_conv_u8s8s32.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_eltwise_forward_x8s8f32s32.cpp
 rename inference-engine/thirdparty/mkl-dnn/tests/gtests/{test_convolution_relu_forward_neg_slope_f32.cpp => test_convolution_forward_u8s8u8.cpp} (81%)
 delete mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_relu_forward_common.hpp
 delete mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_relu.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/gtests/test_rnn_forward.cpp
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/other/subproject/CMakeLists.txt
 create mode 100644 inference-engine/thirdparty/mkl-dnn/tests/other/subproject/main.c
 create mode 100644 inference-engine/tools/accuracy_checker_tool/README.md
 create mode 100644 inference-engine/tools/accuracy_checker_tool/accuracy_check.py
 create mode 100644 inference-engine/tools/accuracy_checker_tool/convert_annotation.py
 create mode 100644 inference-engine/tools/benchmark_tool/README.md
 create mode 100644 inference-engine/tools/benchmark_tool/benchmark.py
 create mode 100644 inference-engine/tools/calibration_tool/README.md
 create mode 100644 inference-engine/tools/calibration_tool/calibrate.py
 create mode 100644 inference-engine/tools/calibration_tool/configs/definitions.yml
 create mode 100644 inference-engine/tools/calibration_tool/configs/inception_v1.yml
 create mode 100644 inference-engine/tools/calibration_tool/configs/ncf_config.yml
 create mode 100644 inference-engine/tools/calibration_tool/configs/ssd_mobilenet_v1_coco.yml
 create mode 100644 inference-engine/tools/calibration_tool/configs/unet2d.yml
 create mode 100644 inference-engine/tools/collect_statistics_tool/README.md
 create mode 100644 inference-engine/tools/collect_statistics_tool/collect_statistics.py
 create mode 100644 model-optimizer/extensions/back/CreateConstNodes.py
 create mode 100644 model-optimizer/extensions/back/CreateConstNodes_test.py
 create mode 100644 model-optimizer/extensions/back/DumpFakeQuantStat.py
 create mode 100644 model-optimizer/extensions/back/EnableConstantStridedSlice.py
 create mode 100644 model-optimizer/extensions/back/PackBinaryWeights.py
 create mode 100644 model-optimizer/extensions/back/RNNSequenceTypeRename.py
 create mode 100644 model-optimizer/extensions/back/ReshapeMutation.py
 create mode 100644 model-optimizer/extensions/front/caffe/bias_ext.py
 create mode 100644 model-optimizer/extensions/front/caffe/bias_ext_test.py
 create mode 100644 model-optimizer/extensions/front/caffe/binarization.py
 create mode 100644 model-optimizer/extensions/front/caffe/binary_conv_ext.py
 create mode 100644 model-optimizer/extensions/front/create_tensor_nodes.py
 rename model-optimizer/{mo/ops => extensions/front}/div.py (54%)
 create mode 100644 model-optimizer/extensions/front/div_test.py
 create mode 100644 model-optimizer/extensions/front/input_cut.py
 create mode 100644 model-optimizer/extensions/front/kaldi/__init__.py
 create mode 100644 model-optimizer/extensions/front/kaldi/add_permute_after_convolution.py
 create mode 100644 model-optimizer/extensions/front/kaldi/add_permute_after_convolution_test.py
 create mode 100644 model-optimizer/extensions/front/mxnet/RNN_ext_test.py
 create mode 100644 model-optimizer/extensions/front/mxnet/add_input_data_to_prior_boxes.py
 rename model-optimizer/{mo/pipeline/mx_test.py => extensions/front/mxnet/add_input_data_to_prior_boxes_test.py} (85%)
 create mode 100644 model-optimizer/extensions/front/mxnet/exp_ext.py
 create mode 100644 model-optimizer/extensions/front/mxnet/gather.py
 rename model-optimizer/{mo/front/tf/extractors/shape.py => extensions/front/mxnet/gather_ext.py} (64%)
 create mode 100644 model-optimizer/extensions/front/mxnet/gather_test.py
 create mode 100644 model-optimizer/extensions/front/onnx/argmax.py
 create mode 100644 model-optimizer/extensions/front/onnx/argmax_ext.py
 create mode 100644 model-optimizer/extensions/front/onnx/cast_ext.py
 create mode 100644 model-optimizer/extensions/front/onnx/clip_ext.py
 create mode 100644 model-optimizer/extensions/front/onnx/detection_output.py
 create mode 100644 model-optimizer/extensions/front/onnx/detection_output_test.py
 create mode 100644 model-optimizer/extensions/front/onnx/detectionoutput_ext.py
 create mode 100644 model-optimizer/extensions/front/onnx/dropout_ext.py
 create mode 100644 model-optimizer/extensions/front/onnx/exp_ext.py
 create mode 100644 model-optimizer/extensions/front/onnx/gru_ext.py
 create mode 100644 model-optimizer/extensions/front/onnx/gru_ext_test.py
 create mode 100644 model-optimizer/extensions/front/onnx/lstm_ext_test.py
 create mode 100644 model-optimizer/extensions/front/onnx/priorbox_ext.py
 create mode 100644 model-optimizer/extensions/front/onnx/priorbox_ext_test.py
 create mode 100644 model-optimizer/extensions/front/onnx/priorgridgenerator_ext.py
 create mode 100644 model-optimizer/extensions/front/onnx/proposal_ext.py
 create mode 100644 model-optimizer/extensions/front/onnx/quantize_ext.py
 create mode 100644 model-optimizer/extensions/front/onnx/rnn_ext.py
 create mode 100644 model-optimizer/extensions/front/onnx/rnn_ext_test.py
 create mode 100644 model-optimizer/extensions/front/onnx/roifeatureextractor_ext.py
 create mode 100644 model-optimizer/extensions/front/onnx/scale_ext.py
 create mode 100644 model-optimizer/extensions/front/onnx/topkrois_ext.py
 create mode 100644 model-optimizer/extensions/front/output_cut.py
 create mode 100644 model-optimizer/extensions/front/override_batch.py
 create mode 100644 model-optimizer/extensions/front/pass_separator.py
 create mode 100644 model-optimizer/extensions/front/restore_ports.py
 create mode 100644 model-optimizer/extensions/front/tf/Cast_ext.py
 create mode 100644 model-optimizer/extensions/front/tf/FlattenToReshape.py
 create mode 100644 model-optimizer/extensions/front/tf/ZerosLike.py
 create mode 100644 model-optimizer/extensions/front/tf/exp_ext.py
 create mode 100644 model-optimizer/extensions/front/tf/faster_rcnn_support_api_v1.10.json
 create mode 100644 model-optimizer/extensions/front/tf/rfcn_support_api_v1.10.json
 create mode 100644 model-optimizer/extensions/front/tf/shape_ext.py
 create mode 100644 model-optimizer/extensions/front/tf/sum_ext.py
 create mode 100644 model-optimizer/extensions/front/tf/tensorflow_custom_operations_config_update.py
 create mode 100644 model-optimizer/extensions/front/tf/tensorflow_patterns.py
 create mode 100644 model-optimizer/extensions/front/tf/tensorflow_use_custom_operations_config.py
 create mode 100644 model-optimizer/extensions/front/tf/variables_values_freezing.py
 create mode 100644 model-optimizer/extensions/front/tf/yolo_v3_tiny.json
 create mode 100644 model-optimizer/extensions/front/user_data_repack.py
 create mode 100644 model-optimizer/extensions/middle/AddMeanScaleValues.py
 create mode 100644 model-optimizer/extensions/middle/AddMeanScaleValues_test.py
 create mode 100644 model-optimizer/extensions/middle/AddQuantizeFuse.py
 delete mode 100644 model-optimizer/extensions/middle/AddReshapeAfterStridedSlice.py
 delete mode 100644 model-optimizer/extensions/middle/AddReshapeAfterStridedSlice_test.py
 create mode 100644 model-optimizer/extensions/middle/BinarizeWeightsM1P1.py
 create mode 100644 model-optimizer/extensions/middle/Cast.py
 create mode 100644 model-optimizer/extensions/middle/ChangePlaceholderTypes.py
 create mode 100644 model-optimizer/extensions/middle/CheckForCycle.py
 create mode 100644 model-optimizer/extensions/middle/CheckForCycle_test.py
 create mode 100644 model-optimizer/extensions/middle/ConcatOptimization.py
 create mode 100644 model-optimizer/extensions/middle/ConvToBinaryConv.py
 create mode 100644 model-optimizer/extensions/middle/ConvertMultiInputConv.py
 create mode 100644 model-optimizer/extensions/middle/CustomSubgraphCall.py
 create mode 100644 model-optimizer/extensions/middle/DecomposeBidirectionalRNNSequence.py
 create mode 100644 model-optimizer/extensions/middle/DeleteControlFlowEdges.py
 create mode 100644 model-optimizer/extensions/middle/DeleteNotExecutable.py
 create mode 100644 model-optimizer/extensions/middle/DilatedConvolution.py
 create mode 100644 model-optimizer/extensions/middle/GRURNNSequenceToTensorIterator.py
 create mode 100644 model-optimizer/extensions/middle/GatherNdNormalizer.py
 create mode 100644 model-optimizer/extensions/middle/GemmToFullyConnected.py
 create mode 100644 model-optimizer/extensions/middle/InputCut.py
 create mode 100644 model-optimizer/extensions/middle/L2NormToNorm.py
 rename model-optimizer/extensions/middle/{lstm_sequence_tensor_iterator.py => LSTMRNNSequenceToTensorIterator.py} (78%)
 create mode 100644 model-optimizer/extensions/middle/LayoutChangeForConstantShapePaths.py
 create mode 100644 model-optimizer/extensions/middle/MXNetRNNSequenceNormalize.py
 create mode 100644 model-optimizer/extensions/middle/MXNetSplitMultiLayers.py
 create mode 100644 model-optimizer/extensions/middle/MeanToAvgPool.py
 rename model-optimizer/{mo/middle/passes/pool_test.py => extensions/middle/MeanToAvgPool_test.py} (87%)
 create mode 100644 model-optimizer/extensions/middle/MulQuantizeFuse.py
 create mode 100644 model-optimizer/extensions/middle/NasNet.py
 create mode 100644 model-optimizer/extensions/middle/ONNXRNNSequenceNormalize.py
 create mode 100644 model-optimizer/extensions/middle/PartialInfer.py
 create mode 100644 model-optimizer/extensions/middle/RNNSequenceNormalizeToIE.py
 create mode 100644 model-optimizer/extensions/middle/ReluQuantizeFuse.py
 create mode 100644 model-optimizer/extensions/middle/RemoveIdentity.py
 create mode 100644 model-optimizer/extensions/middle/RemoveRedundantReshapeAfterCropAndResize.py
 create mode 100644 model-optimizer/extensions/middle/ReverseV2ToReverseSequence.py
 create mode 100644 model-optimizer/extensions/middle/ScaleInput.py
 create mode 100644 model-optimizer/extensions/middle/ScaleInput_test.py
 create mode 100644 model-optimizer/extensions/middle/SharedWeightsDuplication.py
 rename model-optimizer/{mo/middle/passes/shared_weights_duplication_test.py => extensions/middle/SharedWeightsDuplication_test.py} (72%)
 rename model-optimizer/extensions/middle/{lstm_tensor_iterator_to_lstm_sequence.py => TensorIteratorLSTMToLSTMSequence.py} (67%)
 create mode 100644 model-optimizer/extensions/middle/UselessSplitEraser.py
 delete mode 100644 model-optimizer/extensions/middle/decompose_bi_lstm.py
 delete mode 100644 model-optimizer/extensions/middle/lstm_sequence_normalize.py
 delete mode 100644 model-optimizer/extensions/middle/lstm_sequence_normalize_test.py
 delete mode 100644 model-optimizer/extensions/middle/mxnet_lstm_sequence_normalize.py
 create mode 100644 model-optimizer/extensions/middle/pass_separator.py
 create mode 100644 model-optimizer/extensions/ops/Cast.py
 create mode 100644 model-optimizer/extensions/ops/GRU.py
 create mode 100644 model-optimizer/extensions/ops/GRUCell.py
 create mode 100644 model-optimizer/extensions/ops/GatherNd.py
 create mode 100644 model-optimizer/extensions/ops/LSTM.py
 create mode 100644 model-optimizer/extensions/ops/RNN.py
 create mode 100644 model-optimizer/extensions/ops/RNNCell.py
 create mode 100644 model-optimizer/extensions/ops/Reverse.py
 create mode 100644 model-optimizer/extensions/ops/binarization.py
 create mode 100644 model-optimizer/extensions/ops/detectionoutput_onnx.py
 create mode 100644 model-optimizer/extensions/ops/exp.py
 create mode 100644 model-optimizer/extensions/ops/exp_test.py
 create mode 100644 model-optimizer/extensions/ops/priorgridgenerator_onnx.py
 create mode 100644 model-optimizer/extensions/ops/proposal_onnx.py
 create mode 100644 model-optimizer/extensions/ops/quantize.py
 create mode 100644 model-optimizer/extensions/ops/quantize_test.py
 create mode 100644 model-optimizer/extensions/ops/range.py
 create mode 100644 model-optimizer/extensions/ops/roifeatureextractor_onnx.py
 create mode 100644 model-optimizer/extensions/ops/topkrois_onnx.py
 delete mode 100644 model-optimizer/mo/front/onnx/extractors/dropout.py
 delete mode 100644 model-optimizer/mo/front/tf/change_placeholder_type.py
 create mode 100644 model-optimizer/mo/graph/connection.py
 create mode 100644 model-optimizer/mo/graph/port.py
 delete mode 100644 model-optimizer/mo/middle/passes/pool.py
 delete mode 100644 model-optimizer/mo/middle/passes/shared_weights_duplication.py
 create mode 100644 model-optimizer/mo/ops/strided_slice.py
 create mode 100644 model-optimizer/mo/ops/strided_slice_test.py
 delete mode 100644 model-optimizer/version.txt
 create mode 100644 tools/README.md
 create mode 100644 tools/__init__.py
 create mode 100644 tools/accuracy_checker/.pylintrc
 create mode 100644 tools/accuracy_checker/README.md
 create mode 100644 tools/accuracy_checker/__init__.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/__init__.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/adapters/README.md
 create mode 100644 tools/accuracy_checker/accuracy_checker/adapters/__init__.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/adapters/action_recognition.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/adapters/adapter.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/adapters/attributes_recognition.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/adapters/classification.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/adapters/detection.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/adapters/dummy_adapters.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/adapters/hit_ratio.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/adapters/image_processing.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/adapters/pose_estimation.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/adapters/reidentification.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/adapters/segmentation.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/adapters/text_detection.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/README.md
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/__init__.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/_reid_common.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/brats.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/cityscapes.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/convert.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/detection_opencv_storage.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/format_converter.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/icdar.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/imagenet.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/lfw.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/market1501.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/mars.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/ms_coco.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/ncf_converter.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/pascal_voc.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/sample_converter.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/super_resolution_converter.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/vgg_face_regression.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/annotation_converters/wider.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/config/__init__.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/config/config_reader.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/config/config_validator.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/data_readers/__init__.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/data_readers/data_reader.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/dataset.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/dependency.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/launcher/__init__.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/launcher/caffe_installation_readme.md
 create mode 100644 tools/accuracy_checker/accuracy_checker/launcher/caffe_launcher.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/launcher/caffe_launcher_readme.md
 create mode 100644 tools/accuracy_checker/accuracy_checker/launcher/dlsdk_launcher.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/launcher/dlsdk_launcher_readme.md
 create mode 100644 tools/accuracy_checker/accuracy_checker/launcher/dummy_launcher.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/launcher/input_feeder.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/launcher/launcher.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/launcher/loaders/__init__.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/launcher/loaders/loader.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/launcher/loaders/pickle_loader.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/launcher/loaders/xml_loader.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/launcher/model_conversion.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/logging.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/main.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/metrics/README.md
 create mode 100644 tools/accuracy_checker/accuracy_checker/metrics/__init__.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/metrics/average_meter.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/metrics/character_recognition.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/metrics/classification.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/metrics/coco_metrics.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/metrics/detection.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/metrics/hit_ratio.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/metrics/metric.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/metrics/metric_executor.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/metrics/multilabel_recognition.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/metrics/overlap.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/metrics/regression.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/metrics/reid.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/metrics/semantic_segmentation.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/metrics/text_detection.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/model_evaluator.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/postprocessor/README.md
 create mode 100644 tools/accuracy_checker/accuracy_checker/postprocessor/__init__.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/postprocessor/cast_to_int.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/postprocessor/clip_boxes.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/postprocessor/clip_points.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/postprocessor/clip_segmentation_mask.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/postprocessor/correct_yolo_v2_boxes.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/postprocessor/crop_segmentation_mask.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/postprocessor/encode_segmentation_mask.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/postprocessor/extend_segmentation_mask.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/postprocessor/filter.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/postprocessor/nms.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/postprocessor/normalize_landmarks_points.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/postprocessor/postprocessing_executor.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/postprocessor/postprocessor.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/postprocessor/resize_prediction_boxes.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/postprocessor/resize_segmentation_mask.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/postprocessor/zoom_segmentation_mask.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/preprocessor/README.md
 create mode 100644 tools/accuracy_checker/accuracy_checker/preprocessor/__init__.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/preprocessor/preprocessing_executor.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/preprocessor/preprocessors.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/presenters.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/progress_reporters.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/representation/__init__.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/representation/base_representation.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/representation/character_recognition_representation.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/representation/classification_representation.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/representation/detection_representation.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/representation/hit_ratio_representation.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/representation/multilabel_recognition.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/representation/pose_estimation_representation.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/representation/regression_representation.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/representation/reid_representation.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/representation/representaton_container.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/representation/segmentation_representation.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/representation/super_resolution_representation.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/representation/text_detection_representation.py
 create mode 100644 tools/accuracy_checker/accuracy_checker/utils.py
 create mode 100644 tools/accuracy_checker/configs/face-detection-adas-0001.yml
 create mode 100644 tools/accuracy_checker/configs/face-detection-retail-0004.yml
 create mode 100644 tools/accuracy_checker/configs/face-reidentification-retail-0095.yml
 create mode 100644 tools/accuracy_checker/configs/human-pose-estimation-0001.yml
 create mode 100644 tools/accuracy_checker/configs/landmarks-regression-retail-0009.yml
 create mode 100644 tools/accuracy_checker/configs/person-reidentification-retail-0031.yml
 create mode 100644 tools/accuracy_checker/configs/person-reidentification-retail-0076.yml
 create mode 100644 tools/accuracy_checker/configs/person-reidentification-retail-0079.yml
 create mode 100644 tools/accuracy_checker/configs/text-detection-0002.yml
 create mode 100644 tools/accuracy_checker/configs/text-recognition-0012.yml
 create mode 100644 tools/accuracy_checker/data/test_data/1.jpg
 create mode 100644 tools/accuracy_checker/data/test_models/SampLeNet.bin
 create mode 100644 tools/accuracy_checker/data/test_models/SampLeNet.caffemodel
 create mode 100644 tools/accuracy_checker/data/test_models/SampLeNet.prototxt
 create mode 100644 tools/accuracy_checker/data/test_models/SampLeNet.xml
 create mode 100644 tools/accuracy_checker/pylint_checkers.py
 create mode 100644 tools/accuracy_checker/requirements.txt
 create mode 100644 tools/accuracy_checker/setup.cfg
 create mode 100644 tools/accuracy_checker/tests/__init__.py
 create mode 100644 tools/accuracy_checker/tests/common.py
 create mode 100644 tools/accuracy_checker/tests/conftest.py
 create mode 100644 tools/accuracy_checker/tests/test_adapters.py
 create mode 100644 tools/accuracy_checker/tests/test_caffe_launcher.py
 create mode 100644 tools/accuracy_checker/tests/test_config_reader.py
 create mode 100644 tools/accuracy_checker/tests/test_config_validator.py
 create mode 100644 tools/accuracy_checker/tests/test_dataset.py
 create mode 100644 tools/accuracy_checker/tests/test_dependency.py
 create mode 100644 tools/accuracy_checker/tests/test_detection_metrics.py
 create mode 100644 tools/accuracy_checker/tests/test_dlsdk_launcher.py
 create mode 100644 tools/accuracy_checker/tests/test_input_feeder.py
 create mode 100644 tools/accuracy_checker/tests/test_metric_evaluator.py
 create mode 100644 tools/accuracy_checker/tests/test_model_conversion.py
 create mode 100644 tools/accuracy_checker/tests/test_model_evaluator.py
 create mode 100644 tools/accuracy_checker/tests/test_postprocessor.py
 create mode 100644 tools/accuracy_checker/tests/test_preprocessor.py
 create mode 100644 tools/accuracy_checker/tests/test_presenter.py
 create mode 100644 tools/accuracy_checker/tests/test_regression_metrics.py
 create mode 100644 tools/accuracy_checker/tests/test_reid_metrics.py
 create mode 100644 tools/accuracy_checker/tests/test_segmentation_metrics.py
 create mode 100644 tools/accuracy_checker/tests/test_utils.py
 create mode 100644 tools/benchmark/README.md
 create mode 100644 tools/benchmark/__init__.py
 create mode 100644 tools/benchmark/__main__.py
 create mode 100644 tools/benchmark/benchmark.py
 create mode 100644 tools/benchmark/command_line_reader.py
 create mode 100644 tools/benchmark/configuration.py
 create mode 100644 tools/benchmark/logging.py
 create mode 100644 tools/benchmark/requirements.txt
 create mode 100644 tools/calibration/README.md
 create mode 100644 tools/calibration/__init__.py
 create mode 100644 tools/calibration/__main__.py
 create mode 100644 tools/calibration/aggregated_statistics.py
 create mode 100644 tools/calibration/base_calibrator.py
 create mode 100644 tools/calibration/calibration_configuration.py
 create mode 100644 tools/calibration/calibration_metrics.py
 create mode 100644 tools/calibration/calibrator.py
 create mode 100644 tools/calibration/calibrator_configuration.py
 create mode 100644 tools/calibration/calibrator_factory.py
 create mode 100644 tools/calibration/command_line_processor.py
 create mode 100644 tools/calibration/command_line_reader.py
 create mode 100644 tools/calibration/fp16_calibrator.py
 create mode 100644 tools/calibration/infer_raw_results.py
 create mode 100644 tools/calibration/inference_result.py
 create mode 100644 tools/calibration/int8_calibrator.py
 create mode 100644 tools/calibration/layer_accuracy_drop/__init__.py
 create mode 100644 tools/calibration/layer_accuracy_drop/collector_by_image.py
 create mode 100644 tools/calibration/layer_accuracy_drop/collector_by_layer.py
 create mode 100644 tools/calibration/layer_accuracy_drop_info.py
 create mode 100644 tools/calibration/layers/__init__.py
 create mode 100644 tools/calibration/logging.py
 create mode 100644 tools/calibration/network_node_stats.py
 create mode 100644 tools/calibration/nrmsd.py
 create mode 100644 tools/calibration/requirements.txt
 create mode 100644 tools/calibration/shape.py
 create mode 100644 tools/calibration/single_layer_network.py
 create mode 100644 tools/calibration/top_results.py
 create mode 100644 tools/network.py
 create mode 100644 tools/utils/__init__.py
 create mode 100644 tools/utils/biases.py
 create mode 100644 tools/utils/building/__init__.py
 create mode 100644 tools/utils/building/layer.py
 create mode 100644 tools/utils/building/network_builder.py
 create mode 100644 tools/utils/building/port.py
 create mode 100644 tools/utils/configuration_filter.py
 create mode 100644 tools/utils/connection.py
 create mode 100644 tools/utils/edge.py
 create mode 100644 tools/utils/layer.py
 create mode 100644 tools/utils/network_info.py
 create mode 100644 tools/utils/path.py
 create mode 100644 tools/utils/port.py
 create mode 100644 tools/utils/tensor_desc.py
 create mode 100644 tools/utils/weights.py

diff --git a/README.md b/README.md
index f4fca3a..a082023 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # [OpenVINO™ Toolkit](https://01.org/openvinotoolkit) - Deep Learning Deployment Toolkit repository
-[![Stable release](https://img.shields.io/badge/version-2018.R5-green.svg)](https://github.com/opencv/dldt/releases/tag/2018_R5)
+[![Stable release](https://img.shields.io/badge/version-2019.R1-green.svg)](https://github.com/opencv/dldt/releases/tag/2019_R1)
 [![Apache License Version 2.0](https://img.shields.io/badge/license-Apache_2.0-green.svg)](LICENSE)
 
 This toolkit allows developers to deploy pre-trained deep learning models through a high-level C++ Inference Engine API integrated with application logic.
diff --git a/inference-engine/CMakeLists.txt b/inference-engine/CMakeLists.txt
index 46f821d..9e639ff 100644
--- a/inference-engine/CMakeLists.txt
+++ b/inference-engine/CMakeLists.txt
@@ -1,13 +1,15 @@
-# Copyright (C) 2018 Intel Corporation
+# Copyright (C) 2018-2019 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
 
-cmake_minimum_required (VERSION 3.3)
+cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
 
 project(InferenceEngine)
 
 set(DEV_BUILD TRUE)
 
+include(CTest)
+
 ## WA for problem with gtest submodule. It cannot detect uint32 type.
 ## remove Gtest submodule and this two lines together
 include (CheckTypeSize)
@@ -133,25 +135,28 @@ set (CMAKE_POSITION_INDEPENDENT_CODE ON)
 include (sanitizer)
 
 include(CheckCXXCompilerFlag)
-if(UNIX)
-    CHECK_CXX_COMPILER_FLAG("-fvisibility=hidden" COMPILER_SUPPORTS_VISIBILITY)
-    if (COMPILER_SUPPORTS_VISIBILITY)
-        #add_definitions(-fvisibility=hidden) todo: should be hidden? if so define default visibiliti explicite for each funtion
-        add_definitions(-fvisibility=default)
-    endif(COMPILER_SUPPORTS_VISIBILITY)
-endif(UNIX)
+
+include(cpplint)
 
 add_subdirectory(src)
 add_subdirectory(tests)
 add_subdirectory(thirdparty)
 
-if (ENABLE_SAMPLES_CORE)
-    set(InferenceEngine_DIR "${CMAKE_BINARY_DIR}")
+set(InferenceEngine_DIR "${CMAKE_BINARY_DIR}")
 
-    #to be able to link
-    set (LIB_FOLDER ${IE_MAIN_SOURCE_DIR}/${BIN_FOLDER}/${IE_BUILD_CONFIGURATION}/lib)
-    add_subdirectory(samples)
-endif()
+#to be able to link
+set (LIB_FOLDER ${IE_MAIN_SOURCE_DIR}/${BIN_FOLDER}/${IE_BUILD_CONFIGURATION}/lib)
+
+# gflags and format_reader targets are kept inside of samples directory and
+# they must be built even if samples build is disabled (required for tests and tools).
+add_subdirectory(samples)
+
+file(GLOB_RECURSE SAMPLES_SOURCES samples/*.cpp samples/*.hpp samples/*.h)
+add_cpplint_target(sample_cpplint
+    FOR_SOURCES ${SAMPLES_SOURCES}
+    EXCLUDE_PATTERNS "thirdparty/*" "pugixml/*")
 
 if (ENABLE_PYTHON)
     add_subdirectory(ie_bridges/python)
-endif()
\ No newline at end of file
+endif()
+
+add_cpplint_report_target()
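[Editor's note: the cpplint wiring above is gated by the `ENABLE_CPPLINT` option handled in the new `cmake/cpplint.cmake` module shown at the end of this patch. A minimal sketch of exercising it, assuming `add_cpplint_target` registers an ordinary build target under the name it is given (an assumption, not confirmed by this patch):]

```sh
# Sketch only; ENABLE_CPPLINT and sample_cpplint are the names used in this
# patch, the rest is standard CMake usage. The cpplint check requires a
# Python 2.7 interpreter to be found on the host.
mkdir build && cd build
cmake -DENABLE_CPPLINT=ON -DCMAKE_BUILD_TYPE=Release ..
# Assumed: the helper creates a normal target per check, so the samples
# lint pass can be run on its own.
cmake --build . --target sample_cpplint
```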
diff --git a/inference-engine/README.md b/inference-engine/README.md
index d28782e..36053cd 100644
--- a/inference-engine/README.md
+++ b/inference-engine/README.md
@@ -16,8 +16,8 @@ Inference Engine plugins for Intel® FPGA and Intel® Movidius™ Neural Compute
 ## Build on Linux\* Systems
 
 The software was validated on:
-- Ubuntu\* 16.04 with default GCC\* 5.4.0
-- CentOS\* 7.4 with default GCC\* 4.8.5
+- Ubuntu\* 16.04 (64-bit) with default GCC\* 5.4.0
+- CentOS\* 7.4 (64-bit) with default GCC\* 4.8.5
 - [Intel® Graphics Compute Runtime for OpenCL™ Driver package 18.28.11080](https://github.com/intel/compute-runtime/releases/tag/18.28.11080).
 
 ### Software Requirements
@@ -45,11 +45,19 @@ The software was validated on:
 You can use the following additional build options:
 - Internal JIT GEMM implementation is used by default.
 - To switch to OpenBLAS\* implementation, use `GEMM=OPENBLAS` option and `BLAS_INCLUDE_DIRS` and `BLAS_LIBRARIES` cmake options to specify path to OpenBLAS headers and library, for example use the following options on CentOS\*: `-DGEMM=OPENBLAS -DBLAS_INCLUDE_DIRS=/usr/include/openblas -DBLAS_LIBRARIES=/usr/lib64/libopenblas.so.0`
-- To switch to optimized MKL-ML\* GEMM implementation, use `GEMM=MKL` and `MKLROOT` cmake options to specify path to unpacked MKL-ML with `include` and `lib` folders, for example use the following options: `-DGEMM=MKL -DMKLROOT=`. MKL-ML\* package can be downloaded [here](https://github.com/intel/mkl-dnn/releases/download/v0.17/mklml_lnx_2019.0.1.20180928.tgz)
-- OpenMP threading is used by default. To build Inference Engine with TBB threading, set `-DTHREADING=TBB` option.
+- To switch to the optimized MKL-ML\* GEMM implementation, use `-DGEMM=MKL` and `-DMKLROOT=` cmake options to specify a path to unpacked MKL-ML with the `include` and `lib` folders. MKL-ML\* package can be downloaded [here](https://github.com/intel/mkl-dnn/releases/download/v0.17/mklml_lnx_2019.0.1.20180928.tgz)
-- To build Python API wrapper, use -DENABLE_PYTHON=ON option. To specify exact Python version, use the following options: `-DPYTHON_EXECUTABLE=`which python3.6` -DPYTHON_LIBRARY=/usr/lib/x86_64-linux-gnu/libpython3.6m.so -DPYTHON_INCLUDE_DIR=/usr/include/python3.6`
+- Threading Building Blocks (TBB) is used by default. To build the Inference Engine with OpenMP* threading, set the `-DTHREADING=OMP` option.
+
+- Required versions of TBB and OpenCV packages are downloaded automatically by the CMake-based script. If you already have installed TBB or OpenCV packages configured in your environment, you may need to clean the `TBBROOT` and `OpenCV_DIR` environment variables before running the `cmake` command, otherwise they won't be downloaded and the build may fail if incompatible versions were installed.
+
+- To build the Python API wrapper, use the `-DENABLE_PYTHON=ON` option. To specify an exact Python version, use the following options:
+```sh
+  -DPYTHON_EXECUTABLE=`which python3.7` \
+  -DPYTHON_LIBRARY=/usr/lib/x86_64-linux-gnu/libpython3.7m.so \
+  -DPYTHON_INCLUDE_DIR=/usr/include/python3.7
+```
 
 - To switch on/off the CPU and GPU plugins, use `cmake` options `-DENABLE_MKL_DNN=ON/OFF` and `-DENABLE_CLDNN=ON/OFF`.
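[Editor's note: putting the Linux options above together, a hedged sketch of a single configure line — every flag is taken from the list in this hunk; the OpenBLAS paths are the CentOS examples and the Python paths follow the Ubuntu layout, so adjust both for your system:]

```sh
# Sketch only: combines the documented Linux build options.
# Clear preinstalled TBB/OpenCV locations so compatible packages are fetched.
unset TBBROOT OpenCV_DIR
cmake -DCMAKE_BUILD_TYPE=Release \
      -DGEMM=OPENBLAS \
      -DBLAS_INCLUDE_DIRS=/usr/include/openblas \
      -DBLAS_LIBRARIES=/usr/lib64/libopenblas.so.0 \
      -DTHREADING=OMP \
      -DENABLE_PYTHON=ON \
      -DPYTHON_EXECUTABLE=`which python3.7` \
      -DPYTHON_LIBRARY=/usr/lib/x86_64-linux-gnu/libpython3.7m.so \
      -DPYTHON_INCLUDE_DIR=/usr/include/python3.7 \
      ..
make -j16
```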
@@ -74,7 +82,7 @@ You can use the following additional build options:
 
 ## Build on Windows\* Systems:
 
 The software was validated on:
-- Microsoft\* Windows\* 10 with Visual Studio 2017 and Intel® C++ Compiler 2018 Update 3
+- Microsoft\* Windows\* 10 (64-bit) with Visual Studio 2017 and Intel® C++ Compiler 2018 Update 3
 - [Intel® Graphics Driver for Windows* [24.20] driver package](https://downloadcenter.intel.com/download/27803/Graphics-Intel-Graphics-Driver-for-Windows-10?v=t).
 
 ### Software Requirements
@@ -107,25 +115,75 @@ cmake -G "Visual Studio 15 2017 Win64" -T "Intel C++ Compiler 18.0" ^
 - Internal JIT GEMM implementation is used by default.
 - To switch to OpenBLAS GEMM implementation, use -DGEMM=OPENBLAS cmake option and specify path to OpenBLAS using `-DBLAS_INCLUDE_DIRS=\include` and `-DBLAS_LIBRARIES=\lib\libopenblas.dll.a` options. Prebuilt OpenBLAS\* package can be downloaded [here](https://sourceforge.net/projects/openblas/files/v0.2.14/OpenBLAS-v0.2.14-Win64-int64.zip/download), mingw64* runtime dependencies [here](https://sourceforge.net/projects/openblas/files/v0.2.14/mingw64_dll.zip/download)
-- To switch to optimized MKL-ML GEMM implementation, use `GEMM=MKL` and `MKLROOT` cmake options to specify path to unpacked MKL-ML with `include` and `lib` folders, for example use the following options: `-DGEMM=MKL -DMKLROOT=`. MKL-ML\* package can be downloaded [here](https://github.com/intel/mkl-dnn/releases/download/v0.17/mklml_win_2019.0.1.20180928.zip)
+- To switch to the optimized MKL-ML\* GEMM implementation, use `-DGEMM=MKL` and `-DMKLROOT=` cmake options to specify a path to unpacked MKL-ML with the `include` and `lib` folders. MKL-ML\* package can be downloaded [here](https://github.com/intel/mkl-dnn/releases/download/v0.17/mklml_win_2019.0.1.20180928.zip)
+
+- Threading Building Blocks (TBB) is used by default. To build the Inference Engine with OpenMP* threading, set the `-DTHREADING=OMP` option.
-- OpenMP threading is used by default. To build Inference Engine with TBB threading, set `-DTHREADING=TBB` option.
+- Required versions of TBB and OpenCV packages are downloaded automatically by the CMake-based script. If you already have installed TBB or OpenCV packages configured in your environment, you may need to clean the `TBBROOT` and `OpenCV_DIR` environment variables before running the `cmake` command, otherwise they won't be downloaded and the build may fail if incompatible versions were installed.
-- To build Python API wrapper, use -DENABLE_PYTHON=ON option. To specify exact Python version, use the following options: `-DPYTHON_EXECUTABLE="C:\Program Files\Python36\python.exe" -DPYTHON_INCLUDE_DIR="C:\Program Files\Python36\include" -DPYTHON_LIBRARY="C:\Program Files\Python36\libs\python36.lib"`.
+- To build the Python API wrapper, use the `-DENABLE_PYTHON=ON` option. To specify an exact Python version, use the following options:
+```sh
+  -DPYTHON_EXECUTABLE="C:\Program Files\Python37\python.exe" ^
+  -DPYTHON_LIBRARY="C:\Program Files\Python37\libs\python37.lib" ^
+  -DPYTHON_INCLUDE_DIR="C:\Program Files\Python37\include"
+```
 
 6. Build generated solution in Visual Studio 2017 or run `cmake --build . --config Release` to build from the command line.
 
+7. Before running the samples, add paths to the TBB and OpenCV binaries used for the build to the %PATH% environment variable. By default, TBB binaries are downloaded by the CMake-based script to the `/inference-engine/temp/tbb/lib` folder and OpenCV binaries to the `/inference-engine/temp/opencv_4.1.0/bin` folder (see the sketch after this section).
+
 ### Building Inference Engine with Ninja
 
 ```sh
 call "C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2018\windows\bin\ipsxe-comp-vars.bat" intel64 vs2017
 set CXX=icl
 set CC=icl
+:: clean TBBROOT value set by ipsxe-comp-vars.bat, required TBB package will be downloaded by dldt cmake script
+set TBBROOT=
 cmake -G Ninja -Wno-dev -DCMAKE_BUILD_TYPE=Release ..
 cmake --build . --config Release
 ```
-Before running the samples on Microsoft\* Windows\*, please add path to OpenMP library (/inference-engine/temp/omp/lib) and OpenCV libraries (/inference-engine/temp/opencv_4.0.0/bin) to the %PATH% environment variable.
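[Editor's note: for step 7 above, a sketch of the %PATH% setup before running the samples; `<DLDT_DIR>` is a placeholder for your repository root, and the subfolders are the default download locations named in that step:]

```sh
:: Sketch only; adjust <DLDT_DIR> to where the repository was cloned.
set PATH=<DLDT_DIR>\inference-engine\temp\tbb\lib;%PATH%
set PATH=<DLDT_DIR>\inference-engine\temp\opencv_4.1.0\bin;%PATH%
```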
+- To switch to the optimized MKL-ML\* GEMM implementation, use `-DGEMM=MKL` and `-DMKLROOT=` cmake options to specify a path to unpacked MKL-ML with the `include` and `lib` folders. MKL-ML\* package can be downloaded [here](https://github.com/intel/mkl-dnn/releases/download/v0.17.1/mklml_mac_2019.0.1.20180928.tgz) + +- Threading Building Blocks (TBB) is used by default. To build the Inference Engine with OpenMP* threading, set the `-DTHREADING=OMP` option. + +- To build the Python API wrapper, use the `-DENABLE_PYTHON=ON` option. To specify an exact Python version, use the following options: +```sh + -DPYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3.7 \ + -DPYTHON_LIBRARY=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib \ + -DPYTHON_INCLUDE_DIR=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m +``` + --- \* Other names and brands may be claimed as the property of others. diff --git a/inference-engine/cmake/FindlibGNA.cmake b/inference-engine/cmake/FindlibGNA.cmake index eeb8480..eccf759 100644 --- a/inference-engine/cmake/FindlibGNA.cmake +++ b/inference-engine/cmake/FindlibGNA.cmake @@ -1,12 +1,10 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # #module to locate GNA libraries -cmake_minimum_required(VERSION 2.8) - if (WIN32) set(GNA_PLATFORM_DIR win64) set(GNA_LIB_DIR x64) diff --git a/inference-engine/cmake/check_features.cmake b/inference-engine/cmake/check_features.cmake index 88ff23f..00861fa 100644 --- a/inference-engine/cmake/check_features.cmake +++ b/inference-engine/cmake/check_features.cmake @@ -1,4 +1,4 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # @@ -65,10 +65,6 @@ if (ENABLE_PROFILING_RAW) add_definitions(-DENABLE_PROFILING_RAW=1) endif() -if (ENABLE_GTEST_PATCHES) - add_definitions(-DENABLE_GTEST_PATCHES=1) -endif() - if (ENABLE_CLDNN) add_definitions(-DENABLE_CLDNN=1) endif() @@ -77,22 +73,14 @@ if (ENABLE_MKL_DNN) add_definitions(-DENABLE_MKL_DNN=1) endif() -if (ENABLE_STRESS_UNIT_TESTS) - add_definitions(-DENABLE_STRESS_UNIT_TESTS=1) -endif() - -if (ENABLE_SEGMENTATION_TESTS) - add_definitions(-DENABLE_SEGMENTATION_TESTS=1) -endif() - -if (ENABLE_OBJECT_DETECTION_TESTS) - add_definitions(-DENABLE_OBJECT_DETECTION_TESTS=1) -endif() - if (ENABLE_GNA) add_definitions(-DENABLE_GNA) endif() +if (ENABLE_SAMPLES) + set (ENABLE_SAMPLES_CORE ON) +endif() + if (DEVELOPMENT_PLUGIN_MODE) message (STATUS "Enabled development plugin mode") @@ -112,5 +100,4 @@ if (VERBOSE_BUILD) set(CMAKE_VERBOSE_MAKEFILE ON) endif() - print_enabled_features() diff --git a/inference-engine/cmake/config.cmake.in b/inference-engine/cmake/config.cmake.in index ed3c880..7c3459f 100644 --- a/inference-engine/cmake/config.cmake.in +++ b/inference-engine/cmake/config.cmake.in @@ -1,4 +1,4 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # diff --git a/inference-engine/cmake/cpplint.cmake b/inference-engine/cmake/cpplint.cmake new file mode 100644 index 0000000..f4eca4c --- /dev/null +++ b/inference-engine/cmake/cpplint.cmake @@ -0,0 +1,162 @@ +# Copyright (C) 2018-2019 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +if(ENABLE_CPPLINT) + find_package(PythonInterp 2.7 EXACT) + + if(NOT PYTHONINTERP_FOUND) + message(WARNING "Python was not found (required for cpplint check)") + 
set(ENABLE_CPPLINT OFF) + endif() +endif() + +if(ENABLE_CPPLINT) + add_custom_target(cpplint_all ALL) + set(CPPLINT_ALL_OUTPUT_FILES "" CACHE INTERNAL "All cpplint output files") +endif() + +function(add_cpplint_target TARGET_NAME) + if(NOT ENABLE_CPPLINT) + return() + endif() + + set(options "") + set(oneValueArgs "") + set(multiValueArgs "FOR_TARGETS" "FOR_SOURCES" "EXCLUDE_PATTERNS") + cmake_parse_arguments(CPPLINT "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + foreach(target IN LISTS CPPLINT_FOR_TARGETS) + get_target_property(target_sources "${target}" SOURCES) + list(APPEND CPPLINT_FOR_SOURCES ${target_sources}) + endforeach() + list(REMOVE_DUPLICATES CPPLINT_FOR_SOURCES) + + set(all_output_files "") + foreach(source_file IN LISTS CPPLINT_FOR_SOURCES) + set(exclude FALSE) + foreach(pattern IN LISTS CPPLINT_EXCLUDE_PATTERNS) + if(source_file MATCHES "${pattern}") + set(exclude TRUE) + break() + endif() + endforeach() + + if(exclude) + continue() + endif() + + file(RELATIVE_PATH source_file_relative "${CMAKE_CURRENT_SOURCE_DIR}" "${source_file}") + set(output_file "${CMAKE_CURRENT_BINARY_DIR}/cpplint/${source_file_relative}.cpplint") + string(REPLACE ".." "__" output_file "${output_file}") + get_filename_component(output_dir "${output_file}" DIRECTORY) + file(MAKE_DIRECTORY "${output_dir}") + + add_custom_command( + OUTPUT + "${output_file}" + COMMAND + "${CMAKE_COMMAND}" + -D "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}" + -D "CPPLINT_SCRIPT=${IE_MAIN_SOURCE_DIR}/scripts/cpplint.py" + -D "INPUT_FILE=${source_file}" + -D "OUTPUT_FILE=${output_file}" + -D "WORKING_DIRECTORY=${CMAKE_CURRENT_SOURCE_DIR}" + -D "SKIP_RETURN_CODE=${ENABLE_CPPLINT_REPORT}" + -P "${IE_MAIN_SOURCE_DIR}/cmake/cpplint_run.cmake" + DEPENDS + "${source_file}" + "${IE_MAIN_SOURCE_DIR}/scripts/cpplint.py" + "${IE_MAIN_SOURCE_DIR}/cmake/cpplint_run.cmake" + COMMENT + "[cpplint] ${source_file}" + VERBATIM) + + list(APPEND all_output_files "${output_file}") + endforeach() + + set(CPPLINT_ALL_OUTPUT_FILES + ${CPPLINT_ALL_OUTPUT_FILES} ${all_output_files} + CACHE INTERNAL + "All cpplint output files") + + add_custom_target(${TARGET_NAME} ALL + DEPENDS ${all_output_files} + COMMENT "[cpplint] ${TARGET_NAME}") + + if(CPPLINT_FOR_TARGETS) + foreach(target IN LISTS CPPLINT_FOR_TARGETS) + add_dependencies(${target} ${TARGET_NAME}) + endforeach() + endif() + + add_dependencies(cpplint_all ${TARGET_NAME}) +endfunction() + +function(add_cpplint_report_target) + if(NOT ENABLE_CPPLINT OR NOT ENABLE_CPPLINT_REPORT) + return() + endif() + + set(cpplint_output_file "${CMAKE_BINARY_DIR}/cpplint/final_output.cpplint") + add_custom_command( + OUTPUT + "${cpplint_output_file}" + COMMAND + "${CMAKE_COMMAND}" + -D "FINAL_OUTPUT_FILE=${cpplint_output_file}" + -D "OUTPUT_FILES=${CPPLINT_ALL_OUTPUT_FILES}" + -P "${IE_MAIN_SOURCE_DIR}/cmake/cpplint_merge.cmake" + DEPENDS + ${CPPLINT_ALL_OUTPUT_FILES} + "${IE_MAIN_SOURCE_DIR}/cmake/cpplint_merge.cmake" + COMMENT + "[cpplint] Merge all output files" + VERBATIM) + + set(cppcheck_output_file "${CMAKE_BINARY_DIR}/cpplint/cpplint-cppcheck-result.xml") + add_custom_command( + OUTPUT + "${cppcheck_output_file}" + COMMAND + "${CMAKE_COMMAND}" + -D "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}" + -D "CONVERT_SCRIPT=${IE_MAIN_SOURCE_DIR}/scripts/cpplint_to_cppcheckxml.py" + -D "INPUT_FILE=${cpplint_output_file}" + -D "OUTPUT_FILE=${cppcheck_output_file}" + -P "${IE_MAIN_SOURCE_DIR}/cmake/cpplint_to_cppcheck_xml.cmake" + DEPENDS + ${cpplint_output_file} + 
"${IE_MAIN_SOURCE_DIR}/scripts/cpplint_to_cppcheckxml.py" + "${IE_MAIN_SOURCE_DIR}/cmake/cpplint_to_cppcheck_xml.cmake" + COMMENT + "[cpplint] Convert to cppcheck XML format" + VERBATIM) + + set(report_dir "${IE_MAIN_SOURCE_DIR}/report/cpplint") + set(html_output_file "${report_dir}/index.html") + add_custom_command( + OUTPUT + "${html_output_file}" + COMMAND + "${CMAKE_COMMAND}" + -D "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}" + -D "CONVERT_SCRIPT=${IE_MAIN_SOURCE_DIR}/scripts/cppcheck-htmlreport.py" + -D "INPUT_FILE=${cppcheck_output_file}" + -D "REPORT_DIR=${report_dir}" + -D "SOURCE_DIR=${IE_MAIN_SOURCE_DIR}" + -D "TITLE=${CMAKE_PROJECT_NAME}" + -P "${IE_MAIN_SOURCE_DIR}/cmake/cpplint_html.cmake" + DEPENDS + "${cppcheck_output_file}" + "${IE_MAIN_SOURCE_DIR}/scripts/cppcheck-htmlreport.py" + "${IE_MAIN_SOURCE_DIR}/cmake/cpplint_html.cmake" + COMMENT + "[cpplint] Generate HTML report" + VERBATIM) + + add_custom_target(cpplint_report + DEPENDS "${html_output_file}" + COMMENT "[cpplint] Generate report") +endfunction() diff --git a/inference-engine/cmake/cpplint_html.cmake b/inference-engine/cmake/cpplint_html.cmake new file mode 100644 index 0000000..55992d8 --- /dev/null +++ b/inference-engine/cmake/cpplint_html.cmake @@ -0,0 +1,30 @@ +# Copyright (C) 2018-2019 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +if(EXISTS "${REPORT_DIR}") + file(REMOVE_RECURSE "${REPORT_DIR}") +endif() + +file(MAKE_DIRECTORY "${REPORT_DIR}") + +execute_process( + COMMAND + "${PYTHON_EXECUTABLE}" + "${CONVERT_SCRIPT}" + "--file=${INPUT_FILE}" + "--report-dir=${REPORT_DIR}" + "--source-dir=${SOURCE_DIR}" + "--title=${TITLE}") + +# Change cppcheck things to cpplint + +file(READ "${REPORT_DIR}/index.html" cur_file_content) + +string(REPLACE "Cppcheck" "cpplint" cur_file_content ${cur_file_content}) +string(REPLACE "a tool for static C/C++ code analysis" "an open source lint-like tool from Google" cur_file_content ${cur_file_content}) +string(REPLACE "http://cppcheck.sourceforge.net" "http://google-styleguide.googlecode.com/svn/trunk/cpplint/cpplint.py" cur_file_content ${cur_file_content}) +string(REPLACE "IRC: irc://irc.freenode.net/cppcheck" " " cur_file_content ${cur_file_content}) + +file(WRITE "${REPORT_DIR}/index.html" "${cur_file_content}") diff --git a/inference-engine/cmake/cpplint_merge.cmake b/inference-engine/cmake/cpplint_merge.cmake new file mode 100644 index 0000000..da87157 --- /dev/null +++ b/inference-engine/cmake/cpplint_merge.cmake @@ -0,0 +1,11 @@ +# Copyright (C) 2018-2019 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +file(WRITE "${FINAL_OUTPUT_FILE}" "") + +foreach(output_file IN LISTS OUTPUT_FILES) + file(READ "${output_file}" cur_file_content) + file(APPEND "${FINAL_OUTPUT_FILE}" "${cur_file_content}\n") +endforeach() diff --git a/inference-engine/cmake/cpplint_run.cmake b/inference-engine/cmake/cpplint_run.cmake new file mode 100644 index 0000000..f9c9ec5 --- /dev/null +++ b/inference-engine/cmake/cpplint_run.cmake @@ -0,0 +1,37 @@ +# Copyright (C) 2018-2019 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +file(REMOVE "${OUTPUT_FILE}") + +execute_process( + COMMAND + "${PYTHON_EXECUTABLE}" + "${CPPLINT_SCRIPT}" + "--linelength=160" + "--counting=detailed" + "--filter=-readability/fn_size" + "${INPUT_FILE}" + WORKING_DIRECTORY "${WORKING_DIRECTORY}" + RESULT_VARIABLE result + OUTPUT_VARIABLE output + ERROR_VARIABLE output) + +# Display the cpplint output to console (to parse it form IDE) +message("${output}") + +# Store cpplint output 
to file (replace problematic symbols)
+string(REPLACE "\"" "&quot\;" output ${output})
+string(REPLACE "<" "&lt\;" output ${output})
+string(REPLACE ">" "&gt\;" output ${output})
+string(REPLACE "'" "&apos\;" output ${output})
+string(REPLACE "&" "&amp\;" output ${output})
+file(WRITE "${OUTPUT_FILE}" ${output})
+
+if(NOT SKIP_RETURN_CODE)
+    # Pass through the cpplint return code
+    if(NOT result EQUAL 0)
+        message(FATAL_ERROR "[cpplint] Code style check failed for : ${INPUT_FILE}")
+    endif()
+endif()
diff --git a/inference-engine/cmake/cpplint_to_cppcheck_xml.cmake b/inference-engine/cmake/cpplint_to_cppcheck_xml.cmake
new file mode 100644
index 0000000..6651b93
--- /dev/null
+++ b/inference-engine/cmake/cpplint_to_cppcheck_xml.cmake
@@ -0,0 +1,12 @@
+# Copyright (C) 2018-2019 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+execute_process(
+    COMMAND
+        "${PYTHON_EXECUTABLE}"
+        "${CONVERT_SCRIPT}"
+    INPUT_FILE "${INPUT_FILE}"
+    OUTPUT_FILE "${OUTPUT_FILE}"
+    ERROR_FILE "${OUTPUT_FILE}")
diff --git a/inference-engine/cmake/debug.cmake b/inference-engine/cmake/debug.cmake
index 8d5ad84..9aeb2a5 100644
--- a/inference-engine/cmake/debug.cmake
+++ b/inference-engine/cmake/debug.cmake
@@ -1,10 +1,8 @@
-# Copyright (C) 2018 Intel Corporation
+# Copyright (C) 2018-2019 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#

-cmake_minimum_required (VERSION 2.8)
-
function (debug_message)
    if (VERBOSE_BUILD)
        message(${ARGV})
diff --git a/inference-engine/cmake/dependencies.cmake b/inference-engine/cmake/dependencies.cmake
index cc027bf..a541357 100644
--- a/inference-engine/cmake/dependencies.cmake
+++ b/inference-engine/cmake/dependencies.cmake
@@ -1,9 +1,8 @@
-# Copyright (C) 2018 Intel Corporation
+# Copyright (C) 2018-2019 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#

-cmake_minimum_required(VERSION 2.8)
cmake_policy(SET CMP0054 NEW)

#features trigger supported by build system
@@ -14,7 +13,7 @@ include(debug)
include(dependency_solver)

#prepare temporary folder
-if (DEFINED ENV{${DL_SDK_TEMP}})
+if (DEFINED ENV{${DL_SDK_TEMP}} AND NOT $ENV{${DL_SDK_TEMP}} STREQUAL "")
    if (WIN32)
        string(REPLACE "\\" "\\\\" TEMP $ENV{${DL_SDK_TEMP}})
    else(WIN32)
@@ -38,9 +37,6 @@ else()
    set(MODELS_BRANCH "master")
endif()

-set(MODELS_PATH "${TEMP}/models")
-debug_message(STATUS "MODELS_PATH=" ${MODELS_PATH})
-
## enable cblas_gemm from OpenBLAS package
if (GEMM STREQUAL "OPENBLAS")
    if(NOT BLAS_LIBRARIES OR NOT BLAS_INCLUDE_DIRS)
@@ -77,6 +73,12 @@ elseif(LINUX)
                TARGET_PATH "${TEMP}/omp"
                ENVIRONMENT "OMP"
                VERSION_REGEX ".*_([a-z]*_([a-z0-9]+\\.)*[0-9]+).*")
+else(APPLE)
+    RESOLVE_DEPENDENCY(OMP
+            ARCHIVE_MAC "iomp_20190130_mac.tgz"
+            TARGET_PATH "${TEMP}/omp"
+            ENVIRONMENT "OMP"
+            VERSION_REGEX ".*_([a-z]*_([a-z0-9]+\\.)*[0-9]+).*")
endif()
log_rpath_from_dir(OMP "${OMP}/lib")
debug_message(STATUS "intel_omp=" ${OMP})
@@ -96,6 +98,12 @@ elseif(LINUX)
            ARCHIVE_LIN "tbb2019_20181010_lin.tgz"
            TARGET_PATH "${TEMP}/tbb"
            ENVIRONMENT "TBBROOT")
+else(APPLE)
+    RESOLVE_DEPENDENCY(TBB
+            ARCHIVE_MAC "tbb2019_20190130_mac.tgz"
+            TARGET_PATH "${TEMP}/tbb"
+            ENVIRONMENT "TBBROOT"
+            VERSION_REGEX ".*_([a-z]*_([a-z0-9]+\\.)*[0-9]+).*")
endif()
log_rpath_from_dir(TBB "${TBB}/lib")
debug_message(STATUS "tbb=" ${TBB})
@@ -104,34 +112,51 @@ endif ()
if (ENABLE_OPENCV)
    if (WIN32)
        RESOLVE_DEPENDENCY(OPENCV
-                ARCHIVE_WIN "opencv_4.0.1-0353.zip"
-                TARGET_PATH "${TEMP}/opencv_4.0.0"
+                ARCHIVE_WIN "opencv_4.1.0-0437.zip"
+                TARGET_PATH "${TEMP}/opencv_4.1.0"
+                ENVIRONMENT "OpenCV_DIR"
+                VERSION_REGEX
".*_([0-9]+.[0-9]+.[0-9]+).*") + log_rpath_from_dir(OPENCV "\\opencv_4.1.0\\bin") + set( ENV{OpenCV_DIR} ${OPENCV}/cmake ) +elseif(APPLE) + RESOLVE_DEPENDENCY(OPENCV + ARCHIVE_MAC "opencv_4.1.0-0437_osx.tar.xz" + TARGET_PATH "${TEMP}/opencv_4.1.0_osx" ENVIRONMENT "OpenCV_DIR" VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+).*") - log_rpath_from_dir(OPENCV "\\opencv_4.0.0\\bin") + log_rpath_from_dir(OPENCV "opencv_4.1.0_osx/lib") set( ENV{OpenCV_DIR} ${OPENCV}/cmake ) elseif(LINUX) if (${LINUX_OS_NAME} STREQUAL "Ubuntu 16.04") RESOLVE_DEPENDENCY(OPENCV - ARCHIVE_LIN "opencv_4.0.0-0305_ubuntu16.tgz" - TARGET_PATH "${TEMP}/opencv_4.0.0_ubuntu" + ARCHIVE_LIN "opencv_4.1.0-0437_ubuntu16.tar.xz" + TARGET_PATH "${TEMP}/opencv_4.1.0_ubuntu16" ENVIRONMENT "OpenCV_DIR" VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+).*") - log_rpath_from_dir(OPENCV "opencv_4.0.0_ubuntu/lib") + log_rpath_from_dir(OPENCV "opencv_4.1.0_ubuntu16/lib") elseif (${LINUX_OS_NAME} STREQUAL "Ubuntu 18.04") RESOLVE_DEPENDENCY(OPENCV - ARCHIVE_LIN "opencv_4.0.0-0305_ubuntu18.tgz" - TARGET_PATH "${TEMP}/opencv_4.0.0_ubuntu18" + ARCHIVE_LIN "opencv_4.1.0-0437_ubuntu18.tar.xz" + TARGET_PATH "${TEMP}/opencv_4.1.0_ubuntu18" ENVIRONMENT "OpenCV_DIR" VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+).*") - log_rpath_from_dir(OPENCV "opencv_4.0.0_ubuntu/lib") + log_rpath_from_dir(OPENCV "opencv_4.1.0_ubuntu18/lib") elseif (${LINUX_OS_NAME} STREQUAL "CentOS 7") RESOLVE_DEPENDENCY(OPENCV - ARCHIVE_LIN "opencv_4.0.0-0305_centos.tgz" - TARGET_PATH "${TEMP}/opencv_4.0.0_centos" + ARCHIVE_LIN "opencv_4.1.0-0437_centos7.tar.xz" + TARGET_PATH "${TEMP}/opencv_4.1.0_centos" + ENVIRONMENT "OpenCV_DIR" + VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+).*") + log_rpath_from_dir(OPENCV "opencv_4.1.0_centos/lib") +elseif (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "armv7l" AND + (${LINUX_OS_NAME} STREQUAL "Debian 9" OR + ${LINUX_OS_NAME} STREQUAL "Raspbian 9")) + RESOLVE_DEPENDENCY(OPENCV + ARCHIVE_LIN "opencv_4.1.0-0437_debian9arm.tar.xz" + TARGET_PATH "${TEMP}/opencv_4.1.0_debian9arm" ENVIRONMENT "OpenCV_DIR" VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+).*") - log_rpath_from_dir(OPENCV "opencv_4.0.0_centos/lib") + log_rpath_from_dir(OPENCV "opencv_4.1.0_debian9arm/lib") endif() set( ENV{OpenCV_DIR} ${OPENCV}/cmake ) endif() diff --git a/inference-engine/cmake/dependency_solver.cmake b/inference-engine/cmake/dependency_solver.cmake index 92d2994..178b379 100644 --- a/inference-engine/cmake/dependency_solver.cmake +++ b/inference-engine/cmake/dependency_solver.cmake @@ -1,10 +1,8 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required (VERSION 2.8) - include ("download") function (resolve_archive_dependency VAR COMPONENT ARCHIVE ARCHIVE_UNIFIED ARCHIVE_WIN ARCHIVE_LIN ARCHIVE_MAC TARGET_PATH FOLDER ENVIRONMENT) @@ -15,7 +13,7 @@ function (resolve_archive_dependency VAR COMPONENT ARCHIVE ARCHIVE_UNIFIED ARCHI if (NOT DEFINED HAS_ENV) if (ARCHIVE) - #TODO: check wether this is platform specific binary with same name per or it is in common folder + #TODO: check whether this is platform specific binary with same name per or it is in common folder DownloadAndExtract(${COMPONENT} ${ARCHIVE} ${TARGET_PATH} result_path ${FOLDER}) else() DownloadAndExtractPlatformSpecific(${COMPONENT} ${ARCHIVE_UNIFIED} ${ARCHIVE_WIN} ${ARCHIVE_LIN} ${ARCHIVE_MAC} ${TARGET_PATH} result_path ${FOLDER}) @@ -130,11 +128,3 @@ function (RESOLVE_DEPENDENCY NAME_OF_CMAKE_VAR) endif() endfunction(RESOLVE_DEPENDENCY) - -function 
(resolve_model_dependency network archive network_model_path) - RESOLVE_DEPENDENCY(${network_model_path} - ARCHIVE "models_archives/${archive}" - TARGET_PATH "${MODELS_PATH}/${network}") - string (REPLACE ${MODELS_PATH} "" relative_path ${${network_model_path}}) - set(${network_model_path} ".${relative_path}" PARENT_SCOPE) -endfunction() diff --git a/inference-engine/cmake/download.cmake b/inference-engine/cmake/download.cmake index 6c5ad3f..b5f6bc7 100644 --- a/inference-engine/cmake/download.cmake +++ b/inference-engine/cmake/download.cmake @@ -1,10 +1,8 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required (VERSION 2.8) - function (Download from to fatal result output) if((NOT EXISTS "${to}")) diff --git a/inference-engine/cmake/download_and_apply.cmake b/inference-engine/cmake/download_and_apply.cmake index 4c75c6d..d4869e4 100644 --- a/inference-engine/cmake/download_and_apply.cmake +++ b/inference-engine/cmake/download_and_apply.cmake @@ -1,10 +1,8 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required (VERSION 2.8) - function (DownloadAndApply URL apply_to) if (EXISTS ${apply_to}) diff --git a/inference-engine/cmake/download_and_check.cmake b/inference-engine/cmake/download_and_check.cmake index 6872fe2..5f4e49c 100644 --- a/inference-engine/cmake/download_and_check.cmake +++ b/inference-engine/cmake/download_and_check.cmake @@ -1,23 +1,22 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required (VERSION 2.8) include (FindWget) function (DownloadAndCheck from to fatal result) - set(status_res "ON") - set(output 1) + set(status_res "ON") + set(output 1) - get_filename_component(download_dir ${to} DIRECTORY) - if (NOT EXISTS ${download_dir}) - file(MAKE_DIRECTORY ${download_dir}) - endif() + get_filename_component(download_dir ${to} DIRECTORY) + if (NOT EXISTS ${download_dir}) + file(MAKE_DIRECTORY ${download_dir}) + endif() - if(NOT EXISTS "${to}") + if(NOT EXISTS "${to}") + if (${from} MATCHES "(http:)|(https:)|(ftp:)") message(STATUS "Downloading from ${from} to ${to} ...") - find_program(aria2c "aria2c") if (${aria2c} STREQUAL "aria2c-NOTFOUND") if (NOT ${WGET_FOUND}) @@ -48,9 +47,13 @@ function (DownloadAndCheck from to fatal result) status_code: ${status_code}") endif() endif() + else() + message(STATUS "Copying from local folder ${from} to ${to} ... 
") + file(COPY ${from} DESTINATION ${download_dir}) endif() + endif() file(REMOVE ${to}.md5) set(${result} "${status_res}" PARENT_SCOPE) -endfunction(DownloadAndCheck) \ No newline at end of file +endfunction(DownloadAndCheck) diff --git a/inference-engine/cmake/download_and_extract.cmake b/inference-engine/cmake/download_and_extract.cmake index 513de81..27af8f8 100644 --- a/inference-engine/cmake/download_and_extract.cmake +++ b/inference-engine/cmake/download_and_extract.cmake @@ -1,9 +1,8 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required (VERSION 2.8) include ("extract") include ("download_and_check") @@ -120,12 +119,12 @@ function (DownloadOrExtractInternal URL archive_path unpacked_path folder fattal if (ENABLE_UNSAFE_LOCATIONS) ExtractWithVersion(${URL} ${archive_path} ${unpacked_path} ${folder} result) if(NOT ${result}) - DownloadAndExtractInternal(${URL} ${archive_path} ${unpacked_path} ${folder} ${fattal} result) + DownloadAndExtractInternal(${URL} ${archive_path} ${unpacked_path} ${folder} ${fattal} result) endif() else() debug_message("archive found on FS : ${archive_path}, however we cannot check it's checksum and think that it is invalid") file(REMOVE_RECURSE "${archive_path}") - DownloadAndExtractInternal(${URL} ${archive_path} ${unpacked_path} ${folder} ${fattal} result) + DownloadAndExtractInternal(${URL} ${archive_path} ${unpacked_path} ${folder} ${fattal} result) endif() @@ -144,7 +143,11 @@ function (CheckOrDownloadAndExtract component RELATIVE_URL archive_name unpacked set (status "ON") set (on_master FALSE) - set (URL "https://download.01.org/openvinotoolkit/2018_R5/dldt/inference_engine/${RELATIVE_URL}") + if(DEFINED ENV{IE_PATH_TO_DEPS}) + set(URL "$ENV{IE_PATH_TO_DEPS}/${RELATIVE_URL}") + else() + set(URL "https://download.01.org/opencv/2019/openvinotoolkit/R1/inference_engine/${RELATIVE_URL}") + endif() #no message on recursive calls if (${use_alternatives}) diff --git a/inference-engine/cmake/extract.cmake b/inference-engine/cmake/extract.cmake index 9b8d5a0..2aa6fd4 100644 --- a/inference-engine/cmake/extract.cmake +++ b/inference-engine/cmake/extract.cmake @@ -1,17 +1,15 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required (VERSION 2.8) - function (extract archive_path unpacked_path folder result) # Slurped from a generated extract-TARGET.cmake file. if (NOT EXISTS ${unpacked_path}) get_filename_component(unpacked_dir ${unpacked_path} DIRECTORY) - + file(MAKE_DIRECTORY ${unpacked_path}) - + message(STATUS "extracting... 
src='${archive_path}' dst='${unpacked_path}'") diff --git a/inference-engine/cmake/features.cmake b/inference-engine/cmake/features.cmake index d9ff98b..b6d2266 100644 --- a/inference-engine/cmake/features.cmake +++ b/inference-engine/cmake/features.cmake @@ -1,11 +1,9 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required (VERSION 2.8) - -include ("options") +include (options) #this options are aimed to optimize build time on development system @@ -21,8 +19,6 @@ ie_option (ENABLE_PROFILING_ITT "ITT tracing of IE and plugins internals" ON) ie_option (ENABLE_PROFILING_RAW "Raw counters profiling (just values, no start/stop time or timeline)" OFF) -# - # "MKL-DNN library might use MKL-ML or OpenBLAS for gemm tasks: MKL|OPENBLAS|JIT" if (NOT GEMM STREQUAL "MKL" AND NOT GEMM STREQUAL "OPENBLAS" @@ -30,15 +26,17 @@ if (NOT GEMM STREQUAL "MKL" set (GEMM "JIT") message(STATUS "GEMM should be set to MKL, OPENBLAS or JIT. Default option is " ${GEMM}) endif() +set(GEMM "${GEMM}" CACHE STRING "Gemm implementation" FORCE) list (APPEND IE_OPTIONS GEMM) # "MKL-DNN library based on OMP or TBB or Sequential implementation: TBB|OMP|SEQ" if (NOT THREADING STREQUAL "TBB" AND NOT THREADING STREQUAL "OMP" AND NOT THREADING STREQUAL "SEQ") - set (THREADING "OMP") + set (THREADING "TBB") message(STATUS "THREADING should be set to TBB, OMP or SEQ. Default option is " ${THREADING}) endif() +set(THREADING "${THREADING}" CACHE STRING "Threading" FORCE) list (APPEND IE_OPTIONS THREADING) # Enable postfixes for Debug/Release builds @@ -53,7 +51,9 @@ else() set (IE_DEBUG_POSTFIX ${IE_DEBUG_POSTFIX_LIN}) set (IE_RELEASE_POSTFIX ${IE_RELEASE_POSTFIX_LIN}) endif() +set(IE_DEBUG_POSTFIX "${IE_DEBUG_POSTFIX}" CACHE STRING "Debug postfix" FORCE) list (APPEND IE_OPTIONS IE_DEBUG_POSTFIX) +set(IE_RELEASE_POSTFIX "${IE_RELEASE_POSTFIX}" CACHE STRING "Release postfix" FORCE) list (APPEND IE_OPTIONS IE_RELEASE_POSTFIX) ie_option (ENABLE_TESTS "unit and functional tests" OFF) @@ -62,6 +62,7 @@ ie_option (ENABLE_GAPI_TESTS "unit tests for GAPI kernels" OFF) ie_option (GAPI_TEST_PERF "if GAPI unit tests should examine performance" OFF) +ie_option (ENABLE_SAMPLES "console samples are part of inference engine package" ON) ie_option (ENABLE_SAMPLES_CORE "console samples core library" ON) @@ -93,6 +94,9 @@ ie_option (ENABLE_DEBUG_SYMBOLS "generates symbols for debugging" OFF) ie_option (ENABLE_PYTHON "enables ie python bridge build" OFF) +ie_option(ENABLE_CPPLINT "Enable cpplint checks during the build" OFF) +ie_option(ENABLE_CPPLINT_REPORT "Build cpplint report instead of failing the build" OFF) + #environment variables used #name of environment variable stored path to temp directory" diff --git a/inference-engine/cmake/ie_parallel.cmake b/inference-engine/cmake/ie_parallel.cmake index 7c183b5..18ccdf0 100644 --- a/inference-engine/cmake/ie_parallel.cmake +++ b/inference-engine/cmake/ie_parallel.cmake @@ -1,4 +1,4 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # @@ -94,7 +94,13 @@ function(set_ie_threading_interface_for TARGET_NAME) endif () endif () endif () + endif () target_compile_definitions(${TARGET_NAME} PUBLIC -DIE_THREAD=${IE_THREAD_DEFINE}) + + if (NOT THREADING STREQUAL "SEQ") + find_package(Threads REQUIRED) + target_link_libraries(${TARGET_NAME} PUBLIC ${CMAKE_THREAD_LIBS_INIT}) + endif() endfunction(set_ie_threading_interface_for) diff --git 
a/inference-engine/cmake/itt.cmake b/inference-engine/cmake/itt.cmake index add2811..3ed2394 100644 --- a/inference-engine/cmake/itt.cmake +++ b/inference-engine/cmake/itt.cmake @@ -1,4 +1,4 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # diff --git a/inference-engine/cmake/linux_name.cmake b/inference-engine/cmake/linux_name.cmake index 0dd8dd5..8b07919 100644 --- a/inference-engine/cmake/linux_name.cmake +++ b/inference-engine/cmake/linux_name.cmake @@ -1,10 +1,8 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required(VERSION 2.8) - if (UNIX) function(get_linux_name res_var) if (NOT EXISTS "/etc/lsb-release") diff --git a/inference-engine/cmake/mode.cmake b/inference-engine/cmake/mode.cmake index 6ecdfaa..3e55471 100644 --- a/inference-engine/cmake/mode.cmake +++ b/inference-engine/cmake/mode.cmake @@ -1,4 +1,4 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # diff --git a/inference-engine/cmake/options.cmake b/inference-engine/cmake/options.cmake index 1f44f87..3cc68d6 100644 --- a/inference-engine/cmake/options.cmake +++ b/inference-engine/cmake/options.cmake @@ -1,4 +1,4 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # diff --git a/inference-engine/cmake/os_flags.cmake b/inference-engine/cmake/os_flags.cmake index cb7c6b1..29608ea 100644 --- a/inference-engine/cmake/os_flags.cmake +++ b/inference-engine/cmake/os_flags.cmake @@ -1,4 +1,4 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # @@ -8,10 +8,13 @@ if (WIN32) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_SCL_SECURE_NO_WARNINGS") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc") #no asynchronous structured exception handling set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /LARGEADDRESSAWARE") - + + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /Z7") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Z7") + if(ENABLE_DEBUG_SYMBOLS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Zi") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Z7") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Z7") set(DEBUG_SYMBOLS_LINKER_FLAGS "/DEBUG") if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") diff --git a/inference-engine/cmake/sanitizer.cmake b/inference-engine/cmake/sanitizer.cmake index cdbe108..23814e7 100644 --- a/inference-engine/cmake/sanitizer.cmake +++ b/inference-engine/cmake/sanitizer.cmake @@ -1,4 +1,4 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # @@ -11,7 +11,11 @@ if (ENABLE_SANITIZER) if (SANITIZE_RECOVER_SUPPORTED) set(SANITIZER_COMPILER_FLAGS "${SANITIZER_COMPILER_FLAGS} -fsanitize-recover=address") endif() - set(SANITIZER_LINKER_FLAGS "-fsanitize=address -fuse-ld=gold") + + set(SANITIZER_LINKER_FLAGS "-fsanitize=address") + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + set(SANITIZER_LINKER_FLAGS "${SANITIZER_LINKER_FLAGS} -fuse-ld=gold") + endif() set(CMAKE_CC_FLAGS "${CMAKE_CC_FLAGS} ${SANITIZER_COMPILER_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SANITIZER_COMPILER_FLAGS}") diff --git a/inference-engine/cmake/sdl.cmake b/inference-engine/cmake/sdl.cmake index 26618c6..e6229a7 100644 --- 
a/inference-engine/cmake/sdl.cmake +++ b/inference-engine/cmake/sdl.cmake @@ -1,4 +1,4 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # diff --git a/inference-engine/cmake/share/InferenceEngineConfig-version.cmake.in b/inference-engine/cmake/share/InferenceEngineConfig-version.cmake.in index 506fc54..bc4c3a9 100644 --- a/inference-engine/cmake/share/InferenceEngineConfig-version.cmake.in +++ b/inference-engine/cmake/share/InferenceEngineConfig-version.cmake.in @@ -1,9 +1,9 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # -set(InferenceEngine_VERSION 1.5.0) +set(InferenceEngine_VERSION 1.6.0) set(PACKAGE_VERSION ${InferenceEngine_VERSION}) set(PACKAGE_VERSION_EXACT False) diff --git a/inference-engine/cmake/share/InferenceEngineConfig.cmake.in b/inference-engine/cmake/share/InferenceEngineConfig.cmake.in index 8f806e9..860870b 100644 --- a/inference-engine/cmake/share/InferenceEngineConfig.cmake.in +++ b/inference-engine/cmake/share/InferenceEngineConfig.cmake.in @@ -1,4 +1,4 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # @@ -54,72 +54,27 @@ else() set(THREADING "@THREADING@") # check whether setvars.sh is sourced - if(NOT IE_ROOT_DIR AND (DEFINED ENV{InferenceEngine_DIR} OR InferenceEngine_DIR OR DEFINED ENV{INTEL_CVSDK_DIR})) + if(NOT IE_ROOT_DIR AND (DEFINED ENV{InferenceEngine_DIR} OR InferenceEngine_DIR OR DEFINED ENV{INTEL_OPENVINO_DIR})) if (EXISTS "${InferenceEngine_DIR}") # InferenceEngine_DIR manually set via command line params set(IE_ROOT_DIR "${InferenceEngine_DIR}/..") elseif (EXISTS "$ENV{InferenceEngine_DIR}") # InferenceEngine_DIR manually set via env set(IE_ROOT_DIR "$ENV{InferenceEngine_DIR}/..") - elseif (EXISTS "$ENV{INTEL_CVSDK_DIR}/inference_engine") + elseif (EXISTS "$ENV{INTEL_OPENVINO_DIR}/inference_engine") # if we installed DL SDK - set(IE_ROOT_DIR "$ENV{INTEL_CVSDK_DIR}/inference_engine") - elseif (EXISTS "$ENV{INTEL_CVSDK_DIR}/deployment_tools/inference_engine") + set(IE_ROOT_DIR "$ENV{INTEL_OPENVINO_DIR}/inference_engine") + elseif (EXISTS "$ENV{INTEL_OPENVINO_DIR}/deployment_tools/inference_engine") # CV SDK is installed - set(IE_ROOT_DIR "$ENV{INTEL_CVSDK_DIR}/deployment_tools/inference_engine") + set(IE_ROOT_DIR "$ENV{INTEL_OPENVINO_DIR}/deployment_tools/inference_engine") endif() endif() - if(IE_ROOT_DIR) - if (WIN32) - set(_OS_PATH "") - else() - if (NOT EXISTS "/etc/lsb-release") - execute_process(COMMAND find -L /etc/ -maxdepth 1 -type f -name *-release -exec cat {} \; - OUTPUT_VARIABLE release_data RESULT_VARIABLE result) - set(name_regex "NAME=\"([^ \"\n]*).*\"\n") - set(version_regex "VERSION=\"([0-9]+(\\.[0-9]+)?)[^\n]*\"") - else() - #linux version detection using cat /etc/lsb-release - file(READ "/etc/lsb-release" release_data) - set(name_regex "DISTRIB_ID=([^ \n]*)\n") - set(version_regex "DISTRIB_RELEASE=([0-9]+(\\.[0-9]+)?)") - endif() - - string(REGEX MATCH ${name_regex} name ${release_data}) - set(os_name ${CMAKE_MATCH_1}) - - string(REGEX MATCH ${version_regex} version ${release_data}) - set(os_name "${os_name} ${CMAKE_MATCH_1}") - - if (NOT os_name) - ext_message(FATAL_ERROR "Cannot detect OS via reading /etc/*-release:\n ${release_data}") - endif() - - if (NOT InferenceEngine_FIND_QUIETLY) - message (STATUS "/etc/*-release distrib: ${os_name}") - endif() - - if (${os_name} STREQUAL "Ubuntu 14.04") - 
set(_OS_PATH "ubuntu_14.04/") - elseif (${os_name} STREQUAL "Ubuntu 16.04") - set(_OS_PATH "ubuntu_16.04/") - elseif (${os_name} STREQUAL "Ubuntu 18.04") - set(_OS_PATH "ubuntu_18.04/") - elseif (${os_name} STREQUAL "CentOS 7") - set(_OS_PATH "centos_7.4/") - elseif (${os_name} STREQUAL "poky 2.0") - set(_OS_PATH "ubuntu_16.04/") - elseif (${os_name} STREQUAL "poky 2.5") - set(_OS_PATH "ubuntu_18.04/") - elseif (${os_name} STREQUAL "Raspbian 9") - set(_OS_PATH "raspbian_9/") - else() - ext_message(FATAL_ERROR "${os_name} is not supported. List of supported OS: Ubuntu 14.04, Ubuntu 16.04, Ubuntu 18.04, CentOS 7, poky 2.0, poky 2.5, Raspbian 9") - endif() - endif() + if(NOT IE_ROOT_DIR) + ext_message(FATAL_ERROR "inference_engine directory is not found") endif() + if(IE_INCLUDE_DIR AND NOT "${IE_ROOT_DIR}/include" EQUAL "${IE_INCLUDE_DIR}") unset(IE_INCLUDE_DIR CACHE) endif() @@ -128,13 +83,13 @@ else() unset(IE_SRC_DIR CACHE) endif() - if(IE_LIBRARY AND NOT "${IE_ROOT_DIR}/lib/${_OS_PATH}/${_ARCH}" EQUAL "${IE_LIBRARY}") + if(IE_LIBRARY AND NOT "${IE_ROOT_DIR}/lib/${_ARCH}" EQUAL "${IE_LIBRARY}") unset(IE_LIBRARY CACHE) endif() set(_IE_ROOT_INCLUDE_DIR "${IE_ROOT_DIR}/include") set(_IE_ROOT_SRC_DIR "${IE_ROOT_DIR}/src") - set(_IE_ROOT_LIBRARY "${IE_ROOT_DIR}/lib/${_OS_PATH}/${_ARCH}") + set(_IE_ROOT_LIBRARY "${IE_ROOT_DIR}/lib/${_ARCH}") find_path(IE_INCLUDE_DIR inference_engine.hpp "${_IE_ROOT_INCLUDE_DIR}") find_path(IE_SRC_DIR extension "${_IE_ROOT_SRC_DIR}") diff --git a/inference-engine/cmake/version.cmake b/inference-engine/cmake/version.cmake index 645c257..daf21cd 100644 --- a/inference-engine/cmake/version.cmake +++ b/inference-engine/cmake/version.cmake @@ -1,10 +1,8 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required(VERSION 2.8) - function (branchName VAR) execute_process( COMMAND git rev-parse --abbrev-ref HEAD diff --git a/inference-engine/ie_bridges/python/CMakeLists.txt b/inference-engine/ie_bridges/python/CMakeLists.txt index 2ce462b..6176ccc 100644 --- a/inference-engine/ie_bridges/python/CMakeLists.txt +++ b/inference-engine/ie_bridges/python/CMakeLists.txt @@ -26,6 +26,11 @@ if (NOT(IE_MAIN_SOURCE_DIR)) if(NOT(WIN32)) set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/${CMAKE_BUILD_TYPE}) endif() +else() + if (UNIX OR APPLE) + # cython generated files requires public visibility. Force visibility required. + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fvisibility=default") + endif() endif() include (UseCython) @@ -45,5 +50,4 @@ endif() find_package (InferenceEngine REQUIRED) set (PYTHON_BRIDGE_SRC_ROOT ${CMAKE_CURRENT_SOURCE_DIR}) -add_subdirectory (src/openvino/inference_engine) -add_subdirectory (src/openvino/inference_engine/dnn_builder) \ No newline at end of file +add_subdirectory (src/openvino/inference_engine) \ No newline at end of file diff --git a/inference-engine/ie_bridges/python/cmake/FindCython.cmake b/inference-engine/ie_bridges/python/cmake/FindCython.cmake index 3070950..baadc4d 100644 --- a/inference-engine/ie_bridges/python/cmake/FindCython.cmake +++ b/inference-engine/ie_bridges/python/cmake/FindCython.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2016 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
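For context on the `InferenceEngineConfig.cmake.in` changes above: that script is what a downstream `find_package(InferenceEngine)` call executes once `InferenceEngine_DIR` or `INTEL_OPENVINO_DIR` is set. A minimal consumer `CMakeLists.txt` might look like the following sketch (the project name, target name, and source file are illustrative placeholders; `InferenceEngine_INCLUDE_DIRS` and `InferenceEngine_LIBRARIES` are the variables the config script is expected to populate):

```cmake
cmake_minimum_required(VERSION 3.9)
project(ie_consumer)

# Resolved through InferenceEngineConfig.cmake; relies on InferenceEngine_DIR
# being set or the OpenVINO environment script having been sourced.
find_package(InferenceEngine REQUIRED)

add_executable(ie_app main.cpp)  # placeholder target and sources
target_include_directories(ie_app PRIVATE ${InferenceEngine_INCLUDE_DIRS})
target_link_libraries(ie_app PRIVATE ${InferenceEngine_LIBRARIES})
```
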
diff --git a/inference-engine/ie_bridges/python/cmake/UseCython.cmake b/inference-engine/ie_bridges/python/cmake/UseCython.cmake index 1b9a0a2..373621b 100644 --- a/inference-engine/ie_bridges/python/cmake/UseCython.cmake +++ b/inference-engine/ie_bridges/python/cmake/UseCython.cmake @@ -46,7 +46,7 @@ # # See also FindCython.cmake -# Copyright (c) 2016 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/inference-engine/ie_bridges/python/docs/api_overview.md b/inference-engine/ie_bridges/python/docs/api_overview.md index 3a182ec..8365cc8 100644 --- a/inference-engine/ie_bridges/python/docs/api_overview.md +++ b/inference-engine/ie_bridges/python/docs/api_overview.md @@ -1,7 +1,7 @@ # Overview of Inference Engine Python* API -**NOTE:** It is a preview version of the Inference Engine Python\* API for evaluation purpose only. -Module structure and API itself may be changed in future releases. +> **NOTE:** It is a preview version of the Inference Engine Python\* API for evaluation purpose only. +> Module structure and API itself may be changed in future releases. This API provides a simplified interface for Inference Engine functionality that allows to: @@ -21,24 +21,24 @@ Supported Python* versions: ## Setting Up the Environment To configure the environment for the Inference Engine Python\* API, run: - * On Ubuntu 16.04: `source /bin/setupvars.sh .` + * On Ubuntu 16.04: `source /bin/setupvars.sh .` * On Windows 10: `call \deployment_tools\inference_engine\python_api\setenv.bat` - + The script automatically detects latest installed Python\* version and configures required environment if the version is supported. If you want to use certain version of Python\*, set the environment variable `PYTHONPATH=/deployment_tools/inference_engine/python_api/` after running the environment configuration script. - + ## IENetLayer -This class stores main information about the layer and allow to modify some layer parameters +This class stores main information about the layer and allow to modify some layer parameters ### Class attributes: - -* `name` - Name of the layer + +* `name` - Name of the layer * `type`- Layer type * `precision` - Layer base operating precision. Provides getter and setter interfaces. * `layout` - Returns the layout of shape of the layer. * `shape` - Return the list of the shape of the layer. * `parents` - Returns a list, which contains names of layers preceding this layer. -* `children` - Returns a list, which contains names of layers following this layer. +* `children` - Returns a list, which contains names of layers following this layer. * `affinity` - Layer affinity set by user or a default affinity set by the `IEPlugin.set_initial_affinity()` method. The affinity attribute provides getter and setter interfaces, so the layer affinity can be modified directly. For example: @@ -46,39 +46,39 @@ This class stores main information about the layer and allow to modify some laye >>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file) >>> plugin = IEPlugin(device="HETERO:FPGA,CPU") >>> plugin.set_config({"TARGET_FALLBACK": "HETERO:FPGA,CPU"}) ->>> plugin.set_initial_affinity(net) +>>> plugin.set_initial_affinity(net) >>> for l in net.layers.values(): ... if l.type == "Convolution": ... l.affinity = "CPU" ``` - -To correctly set affinity for the network, you must first initialize and properly configure the HETERO plugin. 
-`set_config({"TARGET_FALLBACK": "HETERO:FPGA,GPU"})` function configures the plugin fallback devices and their order. -`plugin.set_initial_affinity(net)` function sets affinity parameter of model layers according to its support -on specified devices. -After default affinity is set by the plugin, override the default values by setting affinity manually how it's +To correctly set affinity for the network, you must first initialize and properly configure the HETERO plugin. +`set_config({"TARGET_FALLBACK": "HETERO:FPGA,GPU"})` function configures the plugin fallback devices and their order. +`plugin.set_initial_affinity(net)` function sets affinity parameter of model layers according to its support +on specified devices. + +After default affinity is set by the plugin, override the default values by setting affinity manually how it's described in example above -To understand how default and non-default affinities are set: +To understand how default and non-default affinities are set: 1. Call `net.layers` function right after model loading and check that layer affinity parameter is empty. 2. Call `plugin.set_default_affinity(net)`. 3. Call `net.layers` and check layer affinity parameters to see how plugin set a default affinity 4. Set layer affinity how it's described above -5. Call `net.layers` again and check layer affinity parameters to see how it was changed after manual affinity +5. Call `net.layers` again and check layer affinity parameters to see how it was changed after manual affinity setting - + Please refer to `affinity_setting_demo.py` to see the full usage pipeline. - + * `weights`- Dictionary with layer weights, biases or custom blobs if any * `params` - Layer specific parameters. Provides getter and setter interfaces to get and modify layer parameters. - Please note that some modifications can be ignored and\or overwriten by target plugin (e.g. modification of + Please note that some modifications can be ignored and\or overwriten by target plugin (e.g. modification of convolution kernel size will be reflected in layer parameters but finally the plugin will ignore it and will - use initial kernel size) + use initial kernel size) -## IENetwork +## IENetwork This class contains the information about the network model read from IR and allows you to manipulate with some model parameters such as layers affinity and output layers. @@ -86,18 +86,15 @@ layers affinity and output layers. ### Class Constructor * `__init__(model: str, weights: str)` - * Parameters: - * model - Path to `.xml` file of the IR * weights - Path to `.bin` file of the IR ### Class attributes: * `name` - Name of the loaded network -* `inputs` - A dictionary that maps input layer names to InputInfo objects. +* `inputs` - A dictionary that maps input layer names to InputInfo objects. For example, to get a shape of the input layer: - ```py >>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file) >>> net.inputs @@ -105,10 +102,8 @@ layers affinity and output layers. >>> net.inputs['data'].shape [1, 3, 224, 224] ``` - * `outputs` - A dictionary that maps output layer names to OutputInfo objects For example, to get a shape of the output layer: - ```py >>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file) >>> net.inputs @@ -116,10 +111,9 @@ layers affinity and output layers. >>> net.outputs['prob'].shape [1, 1000] ``` - -* `batch_size` - Batch size of the network. Provides getter and setter interfaces to get and modify the + +* `batch_size` - Batch size of the network. 
Provides getter and setter interfaces to get and modify the network batch size. For example: - ```py >>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file) >>> net.batch_size @@ -130,10 +124,8 @@ layers affinity and output layers. >>> net.inputs['data'].shape [4, 3, 224, 224] ``` - -* `layers` - Return dictionary that maps network layer names to `IENetLayer` +* `layers` - Return dictionary that maps network layer names to `IENetLayer` objects containing layer properties in topological order. For example, to list all network layers: - ```py >>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file) >>> net.layers @@ -141,11 +133,10 @@ layers affinity and output layers. ... } ``` - - * `stats` - Returns `LayersStatsMap` object containing dictionary that maps network layer names to calibration statistics + * `stats` - Returns `LayersStatsMap` object containing dictionary that maps network layer names to calibration statistics represented by `LayerStats` objects. `LayersStatsMap` class inherited from built-in python `dict` and overrides default `update()`method to allow - to set or modify layers calibration statistics. + to set or modify layers calibration statistics. ```py >>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file) >>> net.stats.update({ @@ -153,151 +144,104 @@ layers affinity and output layers. "conv2_2d" : LayserStats(min=(-5, -1, 0, 1, -7, 2), max=(63, 124, 70, 174, 99, 106)), }) ``` -For more details about low precision inference please refer to "Low-Precision 8-bit Integer Inference" -section in Inference Engine Developers Guide documentation. +For more details about low precision inference please refer to "Low-Precision 8-bit Integer Inference" +section in Inference Engine Developers Guide documentation. - ### Class Methods -* `from_ir(model: str, weights: str)` - -**Note:** The function is deprecated. Please use `IENetwork()` class constructor to create valid instance of `IENetwork` - - * Description: - +* `from_ir(model: str, weights: str)` +> **NOTE:** The function is deprecated. Please use `IENetwork()` class constructor to create valid instance of `IENetwork` + * Description: The class method serves to read the model from the `.xml` and `.bin` files of the IR. - * Parameters: - * model - Path to `.xml` file of the IR * weights - Path to `.bin` file of the IR - * Return value: - An instance of the `IENetwork` class - * Usage example: - ```py >>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file) >>> net ``` - + ### Instance Methods - -* `add_outputs(outputs)`: - * Description: - - The method serves to mark any intermediate layer as output layer to retrieve the inference results +* `add_outputs(outputs)`: + * Description: + The method serves to mark any intermediate layer as output layer to retrieve the inference results from the specified layers. - * Parameters: - * `outputs` - List of layer names to be set as model outputs. In case of setting one layer as output, string with one layer can be provided. - * Return value: - None - * Usage example: - ```py >>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file) >>> net.add_outputs(["conv5_1/dwise', conv2_1/expand'])] >>> net.outputs ['prob', 'conv5_1/dwise', 'conv2_1/expand'] ``` - -**Note** - -The last layers (nodes without successors in graph representation of the model) are set as output -by default. In the case above, `prob` layer is a default output and `conv5_1/dwise`, `conv2_1/expand` are user-defined -outputs. 
+> **NOTE**: The last layers (nodes without successors in graph representation of the model) are set as output +> by default. In the case above, `prob` layer is a default output and `conv5_1/dwise`, `conv2_1/expand` are user-defined +> outputs. * `reshape(input_shapes: dict)`: - - * Description: - + * Description: The method reshapes the network to change spatial dimensions, batch size, or any dimension. - - **Note:** - - Before using this method, make sure that the target shape is applicable for the network - Changing the network shape to an arbitrary value may lead to unpredictable behaviour. - +> **Note:** Before using this method, make sure that the target shape is applicable for the network. Changing the network shape to an arbitrary value may lead to unpredictable behaviour. * Parameters: - * `input_shapes` - The dictionary that maps input layer names to tuples with the target shape - - * Return value: - - None - + * Return value: + None * Usage example: - ```py >>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file) >>> input_layer = next(iter(net.inputs)) >>> n, c, h, w = net.inputs[input_layer] >>> net.reshape({input_layer: (n, c, h*2, w*2)}] -``` - -* `serialize(path_to_xml, path_to_bin)`: - - * Description: - - The method serializes the network and stores it in files. - - * Parameters: - - * `path_to_xml` - path to a file, where a serialized model will be stored. +``` +* `serialize(path_to_xml, path_to_bin)`: + * Description: + The method serializes the network and stores it in files. + * Parameters: + * `path_to_xml` - path to a file, where a serialized model will be stored. * `path_to_bin` - path to a file, where serialized weights will be stored. - * Return value: - - None - + None * Usage example: - ```py >>> net = IENetwork(model=path_to_model, weights=path_to_weights) >>> net.serialize(path_to_xml, path_to_bin) -``` +``` + ## LayerStats -Layer calibration statistic container + +Layer calibration statistic container. + ### Class Constructor * `__init__(min: tuple = (), max: tuple = ())` - * Parameters: - - * min - Tuple with per-channel minimum layer activation values + * min - Tuple with per-channel minimum layer activation values * max - Tuple with per-channel maximum layer activation values -## InputInfo +## InputInfo This class contains the information about the network input layers ### Class attributes: -* `precision` - Precision of the input data provided by user. Provides setter and getter interfaces +* `precision` - Precision of the input data provided by user. Provides setter and getter interfaces to get and modify input layer precision. - List of applicable precisions: FP32 FP16, I32, I16, I8, U32, U16 - - **Note**: Support of any calculation precision depends on the target plugin - +> **NOTE**: Support of any calculation precision depends on the target plugin. * `layout` - Layout of the input data provided by user. Provides setter and getter interfaces - to get and modify input layer layout. - + to get and modify input layer layout. List of applicable layouts: NCHW, NHWC, OIHW, C, CHW, HW, NC, CN, BLOCKED - * `shape` - input layer data shape - -## OutputInfo +## OutputInfo This class contains the information about the network input layers @@ -305,52 +249,40 @@ This class contains the information about the network input layers * `precision` - Precision of the output data. Provides setter and getter interfaces to get and modify output layer precision. 
- * `layout` - Layout of the output data provided by user - * `shape` - Input layer data shape - + ## IEPlugin Class This class is the main plugin interface and serves to initialize and configure the plugin. - + ### Class Constructor * `__init__(device: str, plugin_dirs=None)` - * Parameters: - * `device` - Target device name. Supported devices: CPU, GPU, FPGA, MYRIAD, HETERO - * `plugin_dirs` - List of paths to plugin directories - + * `plugin_dirs` - List of paths to plugin directories + ### Properties * `device` - a name of the device that was specified to initialize IEPlugin -* `version` - a version of the plugin +* `version` - a version of the plugin ### Instance Methods * ```load(network: IENetwork, num_requests: int=1, config=None)``` - - * Description: - - Loads a network that was read from the IR to the plugin and creates an executable network from a network object. - You can create as many networks as you need and use them simultaneously (up to the limitation of the hardware + * Description: + Loads a network that was read from the IR to the plugin and creates an executable network from a network object. + You can create as many networks as you need and use them simultaneously (up to the limitation of the hardware resources). - * Parameters: - * `network` - A valid `IENetwork` instance - * `num_requests` - A positive integer value of infer requests to be created. Number of infer requests may be limited + * `num_requests` - A positive integer value of infer requests to be created. Number of infer requests may be limited by device capabilities. * `config` - A dictionary of plugin configuration keys and their values - - * Return value: - + * Return value: None - * Usage example: - ```py >>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file) >>> plugin = IEPlugin(device="CPU") @@ -358,89 +290,52 @@ This class is the main plugin interface and serves to initialize and configure t >>> exec_net ``` - * `set_initial_affinity(net: IENetwork)` - * Description: - - Sets initial affinity for model layers according to the HETERO plugin logic. Applicable only if + Sets initial affinity for model layers according to the HETERO plugin logic. Applicable only if IEPlugin was initialized for HETERO device. - * Parameters: - - * `net` - A valid instance of IENetwork - - * Return value: - - None - - * Usage example: - - See `affinity` attribute of the `IENetLayer` class. - + * `net` - A valid instance of IENetwork + * Return value: + None + * Usage example: + See `affinity` attribute of the `IENetLayer` class. * `add_cpu_extension(extension_path: str)` - * Description: - - Loads extensions library to the plugin. Applicable only for CPU device and HETERO device with CPU - + Loads extensions library to the plugin. Applicable only for CPU device and HETERO device with CPU * Parameters: - - * `extension_path` - A full path to CPU extensions library - + * `extension_path` - A full path to CPU extensions library * Return value: - None - * Usage example: - ```py >>> plugin = IEPlugin(device="CPU") >>> plugin.add_cpu_extenstions(ext_lib_path) -``` - - +``` * `set_config(config: dict)` - - * Description: - - Sets a configuration for the plugin. Refer to `SetConfig()` in Inference Engine C++ documentation for acceptable + * Description: + Sets a configuration for the plugin. Refer to `SetConfig()` in Inference Engine C++ documentation for acceptable keys and values list. 
- - * Parameters: - + * Parameters: * `config` - A dictionary of keys and values of acceptable configuration parameters - * Return value: - None - - * Usage examples: - - See `set_affinity` method of the `IENetwork` class. - + * Usage examples: + See `set_affinity` method of the `IENetwork` class. * `get_supported_layers(net: IENetwork)` - * Description: - - Returns the set of layers supported by the plugin. Please note that in case of CPU plugin support of - a layer may depends on extension loaded by `add_cpu_extenstion()` method - + Returns the set of layers supported by the plugin. Please note that in case of CPU plugin support of + a layer may depends on extension loaded by `add_cpu_extenstion()` method * Parameters: - - * `net` - A valid instance of IENetwork - + * `net` - A valid instance of IENetwork * Return value: - Set of layers supported by the plugin - - * Usage example: - - See `affinity` attribute of the `IENetLayer` class. - + * Usage example: + See `affinity` attribute of the `IENetLayer` class. + ## ExecutableNetwork Class -This class represents a network instance loaded to plugin and ready for inference. +This class represents a network instance loaded to plugin and ready for inference. ### Class Constructor @@ -449,37 +344,28 @@ There is no explicit class constructor. To make a valid instance of `ExecutableN ### Class attributes * `requests` - A tuple of InferRequest instances - - * Usage example: - + * Usage example: ```py >>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file) >>> plugin = IEPlugin(device="CPU") >>> exec_net = plugin.load(network=net, num_requsts=3) >>> exec_net.requests -(, -, +(, +, ) ``` - + ### Instance Methods * `infer(inputs=None)` - * Description: - Starts synchronous inference for the first infer request of the executable network and returns output data. Wraps `infer()` method of the `InferRequest` class - * Parameters: * `inputs` - A dictionary that maps input layer names to `numpy.ndarray` objects of proper shape with input data for the layer - * Return value: - A dictionary that maps output layer names to `numpy.ndarray` objects with output data of the layer - * Usage example: - ```py >>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file) >>> plugin = IEPlugin(device="CPU") @@ -493,35 +379,26 @@ There is no explicit class constructor. To make a valid instance of `ExecutableN ...... ]])} ``` -For illustration of input data preparation, please see samples (for example, `classification_sample.py`). - + For illustration of input data preparation, please see samples (for example, `classification_sample.py`). * `start_async(request_id, inputs=None)` - * Description: - Starts asynchronous inference for specified infer request. Wraps `async_infer()` method of the `InferRequest` class - * Parameters: - * `request_id` - Index of infer request to start inference * `inputs` - A dictionary that maps input layer names to `numpy.ndarray` objects of proper shape with input data for the layer - * Return value: - A handler of specified infer request, which is an instance of the `InferRequest` class. 
- * Usage example:
-
```py
>>> infer_request_handle = exec_net.start_async(request_id=0, inputs={input_blob: image})
>>> infer_status = infer_request_handle.wait()
>>> res = infer_request_handle.outputs[out_blob]
```
-
-For more details about infer requests processing, see `classification_sample_async.py` (simplified case) and
+
+For more details about infer request processing, see the `classification_sample_async.py` (simplified case) and
`object_detection_demo_ssd_async.py` (real asynchronous use case) samples.
-
+
## InferRequest Class

This class provides an interface to infer requests of `ExecutableNetwork` and serves to handle infer requests execution
@@ -529,153 +406,107 @@ and to set and get output data.

### Class Constructor

-There is no explicit class constructor. To make a valid `InferRequest` instance, use `load()` method of the `IEPlugin`
-class with specified number of requests to get `ExecutableNetwork` instance which stores infer requests.
+There is no explicit class constructor. To make a valid `InferRequest` instance, use the `load()` method of the `IEPlugin`
+class with a specified number of requests to get an `ExecutableNetwork` instance, which stores the infer requests.

### Class attributes

* `inputs` - A dictionary that maps input layer names to `numpy.ndarray` objects of proper shape with input data for the layer
* `outputs` - A dictionary that maps output layer names to `numpy.ndarray` objects with output data of the layer
-
* Usage example:
-
```py
>>> exec_net.requests[0].inputs['data'][:] = image
>>> exec_net.requests[0].infer()
>>> res = exec_net.requests[0].outputs['prob']
->>> np.flip(np.sort(np.squeeze(res)),0)
+>>> np.flip(np.sort(np.squeeze(res)),0)
array([4.85416055e-01, 1.70385033e-01, 1.21873841e-01, 1.18894853e-01,
5.45198545e-02, 2.44456064e-02, 5.41366823e-03, 3.42589128e-03,
2.26027006e-03, 2.12283316e-03 ...])
-```
-
+```
+
### Instance Methods

-It is not recommended to run inference directly on `InferRequest` instance.
-To run inference, please use simplified methods `infer()` and `start_async()` of `ExecutableNetwork`.
+It is not recommended to run inference directly on an `InferRequest` instance.
+To run inference, please use the simplified `infer()` and `start_async()` methods of `ExecutableNetwork`; a minimal sketch of that flow is shown below.
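
A minimal sketch of this recommended flow, reusing the `net`, `plugin`, `input_blob`, `image`, and `out_blob` names from the examples above (the exact blob names depend on your model and are assumptions here):

```py
>>> exec_net = plugin.load(network=net, num_requests=2)
>>> res = exec_net.infer({input_blob: image})            # synchronous: wraps requests[0].infer()
>>> handle = exec_net.start_async(request_id=1, inputs={input_blob: image})
>>> handle.wait()                                        # block until the asynchronous request completes
>>> res_async = handle.outputs[out_blob]
```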
* `infer(inputs=None)`
-
- * Description:
-
- Starts synchronous inference of the infer request and fill outputs array
-
- * Parameters:
-
- * `inputs` - A dictionary that maps input layer names to `numpy.ndarray` objects of proper shape with input data for the layer
-
- * Return value:
-
- None
-
- * Usage example:
-
+ * Description:
+ Starts synchronous inference of the infer request and fills the outputs array
+ * Parameters:
+ * `inputs` - A dictionary that maps input layer names to `numpy.ndarray` objects of proper shape with input data for the layer
+ * Return value:
+ None
+ * Usage example:
```py
>>> exec_net = plugin.load(network=net, num_requests=2)
>>> exec_net.requests[0].infer({input_blob: image})
>>> res = exec_net.requests[0].outputs['prob']
->>> np.flip(np.sort(np.squeeze(res)),0)
+>>> np.flip(np.sort(np.squeeze(res)),0)
array([4.85416055e-01, 1.70385033e-01, 1.21873841e-01, 1.18894853e-01,
5.45198545e-02, 2.44456064e-02, 5.41366823e-03, 3.42589128e-03,
- 2.26027006e-03, 2.12283316e-03 ...])
-```
-
+ 2.26027006e-03, 2.12283316e-03 ...])
+```
* `async_infer(inputs=None)`
-
- * Description:
-
- Starts asynchronous inference of the infer request and fill outputs array
-
- * Parameters:
-
- * `inputs` - A dictionary that maps input layer names to `numpy.ndarray` objects of proper shape with input data for the layer
-
- * Return value:
-
- None
-
- * Usage example:
-
+ * Description:
+ Starts asynchronous inference of the infer request and fills the outputs array
+ * Parameters:
+ * `inputs` - A dictionary that maps input layer names to `numpy.ndarray` objects of proper shape with input data for the layer
+ * Return value:
+ None
+ * Usage example:
```py
>>> exec_net = plugin.load(network=net, num_requests=2)
>>> exec_net.requests[0].async_infer({input_blob: image})
>>> exec_net.requests[0].wait()
>>> res = exec_net.requests[0].outputs['prob']
->>> np.flip(np.sort(np.squeeze(res)),0)
+>>> np.flip(np.sort(np.squeeze(res)),0)
array([4.85416055e-01, 1.70385033e-01, 1.21873841e-01, 1.18894853e-01,
5.45198545e-02, 2.44456064e-02, 5.41366823e-03, 3.42589128e-03,
- 2.26027006e-03, 2.12283316e-03 ...])
-```
-
+ 2.26027006e-03, 2.12283316e-03 ...])
+```
* `wait(timeout=-1)`
-
- * Description:
-
- Waits for the result to become available. Blocks until specified timeout elapses or the result
- becomes available, whichever comes first.
-
- **Note:**
-
- There are special values of the timeout parameter:
-
- * 0 - Immediately returns the inference status. It does not block or interrupt execution.
+ * Description:
+ Waits for the result to become available. Blocks until the specified timeout elapses or the result
+ becomes available, whichever comes first.
+> **NOTE:** There are special values of the timeout parameter:
+ * 0 - Immediately returns the inference status. It does not block or interrupt execution.
For the meaning of the statuses, refer to InferenceEngine::StatusCode in the Inference Engine C++ documentation
- * -1 - Waits until the inference result becomes available (default value)
- * Parameters:
-
- * `timeout` - Time to wait in milliseconds or the special (0, -1) values described above.
+ * `timeout` - Time to wait in milliseconds or the special (0, -1) values described above.
If not specified, `timeout` value is set to -1 by default.
-
- * Usage example:
-
- See the `async_infer()` method of the `InferRequest` class.
-
-
+ * Usage example:
+ See the `async_infer()` method of the `InferRequest` class.
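
The special `timeout` values above also support a non-blocking poll before a final blocking wait. A short sketch of that pattern (the comparison with `0` assumes `InferenceEngine::StatusCode::OK == 0`, an assumption to check against the C++ documentation):

```py
>>> exec_net.requests[0].async_infer({input_blob: image})
>>> status = exec_net.requests[0].wait(0)        # returns the status immediately without blocking
>>> if status != 0:                              # 0 is assumed to correspond to StatusCode::OK
...     status = exec_net.requests[0].wait(-1)   # block until the result becomes available
>>> res = exec_net.requests[0].outputs[out_blob]
```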
* `get_perf_counts()`
- * Description:
-
- Queries performance measures per layer to get feedback of what is the most time consuming layer. .
-
- **Note**:
-
- Performance counters data and format depends on the plugin
-
+ Queries performance measures per layer to get feedback on the most time-consuming layers.
+> **NOTE**: Performance counters data and format depend on the plugin
* Parameters:
-
None
-
- * Usage example:
-
+ * Usage example:
```py
>>> exec_net = plugin.load(network=net, num_requests=2)
>>> exec_net.requests[0].infer({input_blob: image})
>>> exec_net.requests[0].get_perf_counts()
-{'Conv2D': {'exec_type': 'jit_avx2_1x1',
- 'real_time': 154,
- 'cpu_time': 154,
- 'status': 'EXECUTED',
+{'Conv2D': {'exec_type': 'jit_avx2_1x1',
+ 'real_time': 154,
+ 'cpu_time': 154,
+ 'status': 'EXECUTED',
'layer_type': 'Convolution'},
- 'Relu6': {'exec_type': 'undef',
- 'real_time': 0,
- 'cpu_time': 0,
- 'status': 'NOT_RUN',
+ 'Relu6': {'exec_type': 'undef',
+ 'real_time': 0,
+ 'cpu_time': 0,
+ 'status': 'NOT_RUN',
'layer_type': 'Clamp'}
...
}
```
-
* `set_batch(size)`
* Description:
Sets a new batch size for a certain infer request when dynamic batching is enabled in the executable network that created this request.
-
- **Note:** Support of dynamic batch size depends on the target plugin.
-
+> **NOTE:** Support of dynamic batch size depends on the target plugin.
* Parameters:
* `size` - New batch size to be used by all the following inference calls for this request.
-
* Usage example:
```py
>>> plugin.set_config({"DYN_BATCH_ENABLED": "YES"})
@@ -683,5 +514,3 @@ array([4.85416055e-01, 1.70385033e-01, 1.21873841e-01, 1.18894853e-01,
>>> exec_net.requests[0].set_batch(inputs_count)
```
Please refer to `dynamic_batch_demo.py` to see the full usage example.
-
-
diff --git a/inference-engine/ie_bridges/python/sample/benchmark_app/README.md b/inference-engine/ie_bridges/python/sample/benchmark_app/README.md
index 7a9a526..f4a1f55 100644
--- a/inference-engine/ie_bridges/python/sample/benchmark_app/README.md
+++ b/inference-engine/ie_bridges/python/sample/benchmark_app/README.md
@@ -1,4 +1,4 @@
-# Benchmark Application Demo
+# Benchmark Application Python* Demo

This topic demonstrates how to run the Benchmark Application demo, which performs inference using convolutional networks.

@@ -8,6 +8,7 @@ This topic demonstrates how to run the Benchmark Application demo, which perform

Upon the start-up, the application reads command-line parameters and loads a network and images to the Inference
Engine plugin. The number of infer requests and execution approach depend on a mode defined with the `-api` command-line parameter.

+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).

### Synchronous API
For synchronous mode, the primary metric is latency. The application creates one infer request and executes the `Infer` method. A number of executions is defined by one of the two values:
@@ -30,37 +31,69 @@ The infer requests are executed asynchronously.
`Wait` method is used to wait for a previous execution of an infer request to complete.

## Running
Running the application with the `-h` or `--help` option yields the following usage message:
-```python3 benchmark_app.py -h
+```python3 benchmark_app.py -h```

-benchmark_app [OPTION]
-Options:
+The command yields the following usage message:
+```
+ usage: benchmark_app.py [-h] -i PATH_TO_IMAGES -m PATH_TO_MODEL
+ [-c PATH_TO_CLDNN_CONFIG] [-l PATH_TO_EXTENSION]
+ [-api {sync,async}] [-d TARGET_DEVICE]
+ [-niter NUMBER_ITERATIONS]
+ [-nireq NUMBER_INFER_REQUESTS]
+ [-nthreads NUMBER_THREADS] [-b BATCH_SIZE]
+ [-pin {YES,NO}]

- -h, --help Print a usage message
- -i, --path_to_images "" Required. Path to a folder with images or to image files.
- -m, --path_to_model "" Required. Path to an .xml file with a trained model.
- -pp "" Path to a plugin folder.
- -api, --api_type "" Required. Enable using sync/async API.
- -d, --target_device "" Specify a target device to infer on: CPU, GPU, FPGA or MYRIAD. Use "-d HETERO:" format to specify HETERO plugin. The application looks for a suitable plugin for the specified device.
- -niter, --number_iterations "" Optional. Number of iterations. If not specified, the number of iterations is calculated depending on a device.
- -nireq, --number_infer_requests "" Optional. Number of infer requests (default value is 2).
- -l, --path_to_extension "" Required for CPU custom layers. Absolute path to a shared library with the kernels implementations.
- Or
- -c, --path_to_cldnn_config "" Required for GPU custom kernels. Absolute path to an .xml file with the kernels description.
- -b, --batch_size "" Optional. Batch size value. If not specified, the batch size value is determined from IR.
- -nthreads, --number_threads "" Number of threads to use for inference on the CPU (including Hetero cases).
- -pin {YES,NO}, --infer_threads_pinning {YES,NO} Optional. Enable ("YES" is default value) or disable ("NO")CPU threads pinning for CPU-involved inference.
+Options:
+ -h, --help Show this help message and exit.
+ -i PATH_TO_IMAGES, --path_to_images PATH_TO_IMAGES
+ Required. Path to a folder with images or to image
+ files.
+ -m PATH_TO_MODEL, --path_to_model PATH_TO_MODEL
+ Required. Path to an .xml file with a trained model.
+ -c PATH_TO_CLDNN_CONFIG, --path_to_cldnn_config PATH_TO_CLDNN_CONFIG
+ Optional. Required for GPU custom kernels. Absolute
+ path to an .xml file with the kernels description.
+ -l PATH_TO_EXTENSION, --path_to_extension PATH_TO_EXTENSION
+ Optional. Required for CPU custom layers. Absolute
+ path to a shared library with the kernels
+ implementations.
+ -api {sync,async}, --api_type {sync,async}
+ Optional. Enable using sync/async API. Default value
+ is async
+ -d TARGET_DEVICE, --target_device TARGET_DEVICE
+ Optional. Specify a target device to infer on: CPU,
+ GPU, FPGA, HDDL or MYRIAD. Use "-d
+ HETERO:<comma separated devices list>" format to specify HETERO
+ plugin. The application looks for a suitable plugin
+ for the specified device.
+ -niter NUMBER_ITERATIONS, --number_iterations NUMBER_ITERATIONS
+ Optional. Number of iterations. If not specified, the
+ number of iterations is calculated depending on a
+ device.
+ -nireq NUMBER_INFER_REQUESTS, --number_infer_requests NUMBER_INFER_REQUESTS
+ Optional. Number of infer requests (default value is
+ 2).
+ -nthreads NUMBER_THREADS, --number_threads NUMBER_THREADS
+ Number of threads to use for inference on the CPU
+ (including Hetero cases).
+ -b BATCH_SIZE, --batch_size BATCH_SIZE
+ Optional. Batch size value.
If not specified, the
+ batch size value is determined from IR
+ -pin {YES,NO}, --infer_threads_pinning {YES,NO}
+ Optional. Enable ("YES" is default value) or disable
+ ("NO") CPU threads pinning for CPU-involved inference.
```

Running the application with the empty list of options yields the usage message given above and an error message.

-To run the demo, you can use one-layer public models or one-layer pre-trained and optimized models delivered with the package that support images as input.
+To run the demo, you can use public or pre-trained models. To download the pre-trained models, use the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) or go to [https://download.01.org/opencv/](https://download.01.org/opencv/).
+
+> **NOTE**: Before running the demo with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).

For example, to do inference on an image using a trained network with multiple outputs on CPU, run the following command:

-```python3 benchmark_app.py -i /inputImage.bmp -m /multiple-output.xml -d CPU
```
-
-> **NOTE**: Public models should be first converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
+python3 benchmark_app.py -i <path_to_image>/inputImage.bmp -m <path_to_model>/multiple-output.xml -d CPU
+```

## Demo Output

@@ -79,3 +112,5 @@ For asynchronous API, the application outputs only throughput:

## See Also
* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
+* [Model Optimizer](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md)
+* [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader)
diff --git a/model-optimizer/mo/front/tf/extractors/sum.py b/inference-engine/ie_bridges/python/sample/benchmark_app/benchmark/__init__.py
similarity index 69%
rename from model-optimizer/mo/front/tf/extractors/sum.py
rename to inference-engine/ie_bridges/python/sample/benchmark_app/benchmark/__init__.py
index e7b06f7..86feb30 100644
--- a/model-optimizer/mo/front/tf/extractors/sum.py
+++ b/inference-engine/ie_bridges/python/sample/benchmark_app/benchmark/__init__.py
@@ -1,5 +1,5 @@
"""
- Copyright (c) 2018 Intel Corporation
+ Copyright (C) 2018-2019 Intel Corporation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -13,11 +13,6 @@
See the License for the specific language governing permissions and
limitations under the License.
""" -from mo.front.common.partial_infer.reduce import tf_reduce_infer - -def tf_sum_ext(pb): - return { - 'keep_dims': pb.attr["keep_dims"].b, - 'infer': lambda node: tf_reduce_infer(node) - } +from .benchmark import main +from .utils.constants import HELP_MESSAGES diff --git a/inference-engine/ie_bridges/python/sample/benchmark_app/benchmark.py b/inference-engine/ie_bridges/python/sample/benchmark_app/benchmark/benchmark.py similarity index 98% rename from inference-engine/ie_bridges/python/sample/benchmark_app/benchmark.py rename to inference-engine/ie_bridges/python/sample/benchmark_app/benchmark/benchmark.py index 761b63e..462e030 100644 --- a/inference-engine/ie_bridges/python/sample/benchmark_app/benchmark.py +++ b/inference-engine/ie_bridges/python/sample/benchmark_app/benchmark/benchmark.py @@ -1,6 +1,5 @@ -#!/usr/bin/env python """ - Copyright (c) 2018 Intel Corporation + Copyright (C) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,7 +17,7 @@ from statistics import median from openvino.inference_engine import IENetwork, IEPlugin -from utils.benchmark_utils import * +from .utils.benchmark_utils import * def main(args=None): try: @@ -198,7 +197,3 @@ def main(args=None): except Exception as e: logging.exception(e) - - -if __name__ == "__main__": - main() diff --git a/inference-engine/ie_bridges/python/sample/benchmark_app/benchmark/utils/__init__.py b/inference-engine/ie_bridges/python/sample/benchmark_app/benchmark/utils/__init__.py new file mode 100644 index 0000000..3091761 --- /dev/null +++ b/inference-engine/ie_bridges/python/sample/benchmark_app/benchmark/utils/__init__.py @@ -0,0 +1,15 @@ +""" + Copyright (C) 2018-2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" diff --git a/inference-engine/ie_bridges/python/sample/benchmark_app/utils/benchmark_utils.py b/inference-engine/ie_bridges/python/sample/benchmark_app/benchmark/utils/benchmark_utils.py similarity index 62% rename from inference-engine/ie_bridges/python/sample/benchmark_app/utils/benchmark_utils.py rename to inference-engine/ie_bridges/python/sample/benchmark_app/benchmark/utils/benchmark_utils.py index 4267614..2f6f38b 100644 --- a/inference-engine/ie_bridges/python/sample/benchmark_app/utils/benchmark_utils.py +++ b/inference-engine/ie_bridges/python/sample/benchmark_app/benchmark/utils/benchmark_utils.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (C) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ from random import choice from datetime import datetime from fnmatch import fnmatch -from . 
 constants import *
+from .constants import *

logging.basicConfig(format="[ %(levelname)s ] %(message)s", level=logging.INFO, stream=sys.stdout)
logger = logging.getLogger('BenchmarkApp')
@@ -42,27 +42,29 @@ def validate_args(args):

def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument('-i', '--path_to_images', type=str, required=True, help=HELP_MESSAGES['IMAGE_MESSAGE'])
- parser.add_argument('-m', '--path_to_model', type=str, required=True, help=HELP_MESSAGES['MODEL_MESSAGE'])
- parser.add_argument('-c', '--path_to_cldnn_config', type=str, required=False,
- help=HELP_MESSAGES['CUSTOM_GPU_LIBRARY_MESSAGE'])
- parser.add_argument('-l', '--path_to_extension', type=str, required=False, default=None,
- help=HELP_MESSAGES['CUSTOM_GPU_LIBRARY_MESSAGE'])
- parser.add_argument('-api', '--api_type', type=str, required=False, default='async', choices=['sync', 'async'],
- help=HELP_MESSAGES['API_MESSAGE'])
- parser.add_argument('-d', '--target_device', type=str, required=False, default="CPU",
- help=HELP_MESSAGES['TARGET_DEVICE_MESSAGE'])
- parser.add_argument('-niter', '--number_iterations', type=int, required=False, default=None,
- help=HELP_MESSAGES['ITERATIONS_COUNT_MESSAGE'])
- parser.add_argument('-nireq', '--number_infer_requests', type=int, required=False, default=2,
- help=HELP_MESSAGES['INFER_REQUESTS_COUNT_MESSAGE'])
- parser.add_argument('-nthreads', '--number_threads', type=int, required=False, default=None,
- help=HELP_MESSAGES['INFER_NUM_THREADS_MESSAGE'])
- parser.add_argument('-b', '--batch_size', type=int, required=False, default=None,
- help=HELP_MESSAGES['BATCH_SIZE_MESSAGE'])
- parser.add_argument('-pin', '--infer_threads_pinning', type=str, required=False, default='YES',
- choices=['YES', 'NO'], help=HELP_MESSAGES['INFER_THREADS_PINNING_MESSAGE'])
+ parser = argparse.ArgumentParser(add_help=False)
+ args = parser.add_argument_group('Options')
+ args.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS, help=HELP_MESSAGES["HELP"])
+ args.add_argument('-i', '--path_to_images', type=str, required=True, help=HELP_MESSAGES['IMAGE_MESSAGE'])
+ args.add_argument('-m', '--path_to_model', type=str, required=True, help=HELP_MESSAGES['MODEL_MESSAGE'])
+ args.add_argument('-c', '--path_to_cldnn_config', type=str, required=False,
+ help=HELP_MESSAGES['CUSTOM_GPU_LIBRARY_MESSAGE'])
+ args.add_argument('-l', '--path_to_extension', type=str, required=False, default=None,
+ help=HELP_MESSAGES['CUSTOM_CPU_LIBRARY_MESSAGE'])
+ args.add_argument('-api', '--api_type', type=str, required=False, default='async', choices=['sync', 'async'],
+ help=HELP_MESSAGES['API_MESSAGE'])
+ args.add_argument('-d', '--target_device', type=str, required=False, default="CPU",
+ help=HELP_MESSAGES['TARGET_DEVICE_MESSAGE'])
+ args.add_argument('-niter', '--number_iterations', type=int, required=False, default=None,
+ help=HELP_MESSAGES['ITERATIONS_COUNT_MESSAGE'])
+ args.add_argument('-nireq', '--number_infer_requests', type=int, required=False, default=2,
+ help=HELP_MESSAGES['INFER_REQUESTS_COUNT_MESSAGE'])
+ args.add_argument('-nthreads', '--number_threads', type=int, required=False, default=None,
+ help=HELP_MESSAGES['INFER_NUM_THREADS_MESSAGE'])
+ args.add_argument('-b', '--batch_size', type=int, required=False, default=None,
+ help=HELP_MESSAGES['BATCH_SIZE_MESSAGE'])
+ args.add_argument('-pin', '--infer_threads_pinning', type=str, required=False, default='YES',
+ choices=['YES', 'NO'], help=HELP_MESSAGES['INFER_THREADS_PINNING_MESSAGE'])
 return parser.parse_args()
diff
--git a/inference-engine/ie_bridges/python/sample/benchmark_app/utils/constants.py b/inference-engine/ie_bridges/python/sample/benchmark_app/benchmark/utils/constants.py
similarity index 64%
rename from inference-engine/ie_bridges/python/sample/benchmark_app/utils/constants.py
rename to inference-engine/ie_bridges/python/sample/benchmark_app/benchmark/utils/constants.py
index f68919e..b9770a1 100644
--- a/inference-engine/ie_bridges/python/sample/benchmark_app/utils/constants.py
+++ b/inference-engine/ie_bridges/python/sample/benchmark_app/benchmark/utils/constants.py
@@ -1,5 +1,5 @@
"""
- Copyright (c) 2018 Intel Corporation
+ Copyright (C) 2018-2019 Intel Corporation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -15,22 +15,24 @@
"""

HELP_MESSAGES = {
- 'IMAGE_MESSAGE': "Path to a folder with images or to image files.",
- 'MULTI_INPUT_MESSAGE': "Path to multi input file containing.",
- 'MODEL_MESSAGE': "Path to an .xml file with a trained model.",
- 'PLUGIN_PATH_MESSAGE': "Path to a plugin folder.",
- 'API_MESSAGE': "Enable using sync/async API. Default value is sync",
- 'TARGET_DEVICE_MESSAGE': "Specify a target device to infer on: CPU, GPU, FPGA or MYRIAD. "
+ 'HELP': "Show this help message and exit.",
+ 'IMAGE_MESSAGE': "Required. Path to a folder with images or to image files.",
+ 'MULTI_INPUT_MESSAGE': "Optional. Path to a multi input file.",
+ 'MODEL_MESSAGE': "Required. Path to an .xml file with a trained model.",
+ 'PLUGIN_PATH_MESSAGE': "Optional. Path to a plugin folder.",
+ 'API_MESSAGE': "Optional. Enable using sync/async API. Default value is async",
+ 'TARGET_DEVICE_MESSAGE': "Optional. Specify a target device to infer on: CPU, GPU, FPGA, HDDL or MYRIAD. "
 "Use \"-d HETERO:<comma separated devices list>\" format to specify HETERO plugin. "
 "The application looks for a suitable plugin for the specified device.",
- 'ITERATIONS_COUNT_MESSAGE': "Number of iterations. "
+ 'ITERATIONS_COUNT_MESSAGE': "Optional. Number of iterations. "
 "If not specified, the number of iterations is calculated depending on a device.",
- 'INFER_REQUESTS_COUNT_MESSAGE': "Number of infer requests (default value is 2).",
+ 'INFER_REQUESTS_COUNT_MESSAGE': "Optional. Number of infer requests (default value is 2).",
 'INFER_NUM_THREADS_MESSAGE': "Number of threads to use for inference on the CPU "
 "(including Hetero cases).",
- 'CUSTOM_CPU_LIBRARY_MESSAGE': "Required for CPU custom layers. "
+ 'CUSTOM_CPU_LIBRARY_MESSAGE': "Optional. Required for CPU custom layers. "
 "Absolute path to a shared library with the kernels implementations.",
- 'CUSTOM_GPU_LIBRARY_MESSAGE': "Required for GPU custom kernels. Absolute path to an .xml file with the kernels description.",
+ 'CUSTOM_GPU_LIBRARY_MESSAGE': "Optional. Required for GPU custom kernels. Absolute path to an .xml file with the "
+ "kernels description.",
 'BATCH_SIZE_MESSAGE': "Optional. Batch size value. If not specified, the batch size value is determined from IR",
 'INFER_THREADS_PINNING_MESSAGE': "Optional. Enable (\"YES\" is default value) or disable (\"NO\") "
 "CPU threads pinning for CPU-involved inference."
diff --git a/inference-engine/ie_bridges/python/sample/benchmark_app/benchmark_app.py b/inference-engine/ie_bridges/python/sample/benchmark_app/benchmark_app.py
new file mode 100644
index 0000000..4f587a8
--- /dev/null
+++ b/inference-engine/ie_bridges/python/sample/benchmark_app/benchmark_app.py
@@ -0,0 +1,37 @@
+import benchmark
+
+from argparse import ArgumentParser, SUPPRESS
+
+
+def parse_args():
+ parser = ArgumentParser(add_help=False)
+ args = parser.add_argument_group('Options')
+ args.add_argument('-h', '--help', action='help', default=SUPPRESS, help=benchmark.HELP_MESSAGES["HELP"])
+ args.add_argument('-i', '--path_to_images', type=str, required=True,
+ help=benchmark.HELP_MESSAGES['IMAGE_MESSAGE'])
+ args.add_argument('-m', '--path_to_model', type=str, required=True,
+ help=benchmark.HELP_MESSAGES['MODEL_MESSAGE'])
+ args.add_argument('-c', '--path_to_cldnn_config', type=str, required=False,
+ help=benchmark.HELP_MESSAGES['CUSTOM_GPU_LIBRARY_MESSAGE'])
+ args.add_argument('-l', '--path_to_extension', type=str, required=False, default=None,
+ help=benchmark.HELP_MESSAGES['CUSTOM_CPU_LIBRARY_MESSAGE'])
+ args.add_argument('-api', '--api_type', type=str, required=False, default='async', choices=['sync', 'async'],
+ help=benchmark.HELP_MESSAGES['API_MESSAGE'])
+ args.add_argument('-d', '--target_device', type=str, required=False, default="CPU",
+ help=benchmark.HELP_MESSAGES['TARGET_DEVICE_MESSAGE'])
+ args.add_argument('-niter', '--number_iterations', type=int, required=False, default=None,
+ help=benchmark.HELP_MESSAGES['ITERATIONS_COUNT_MESSAGE'])
+ args.add_argument('-nireq', '--number_infer_requests', type=int, required=False, default=2,
+ help=benchmark.HELP_MESSAGES['INFER_REQUESTS_COUNT_MESSAGE'])
+ args.add_argument('-nthreads', '--number_threads', type=int, required=False, default=None,
+ help=benchmark.HELP_MESSAGES['INFER_NUM_THREADS_MESSAGE'])
+ args.add_argument('-b', '--batch_size', type=int, required=False, default=None,
+ help=benchmark.HELP_MESSAGES['BATCH_SIZE_MESSAGE'])
+ args.add_argument('-pin', '--infer_threads_pinning', type=str, required=False, default='YES',
+ choices=['YES', 'NO'], help=benchmark.HELP_MESSAGES['INFER_THREADS_PINNING_MESSAGE'])
+ return parser.parse_args()
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ benchmark.main(args)
diff --git a/inference-engine/ie_bridges/python/sample/classification_sample/README.md b/inference-engine/ie_bridges/python/sample/classification_sample/README.md
new file mode 100644
index 0000000..a4eec40
--- /dev/null
+++ b/inference-engine/ie_bridges/python/sample/classification_sample/README.md
@@ -0,0 +1,79 @@
+# Image Classification Python* Sample
+
+This topic demonstrates how to run the Image Classification sample application, which performs
+inference using image classification networks such as AlexNet and GoogLeNet.
+
+### How It Works
+
+Upon the start-up, the sample application reads command line parameters and loads a network and an image to the Inference
+Engine plugin. When inference is done, the application outputs data to the standard output stream.
+
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified.
For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+
+## Running
+
+Running the application with the `-h` option yields the usage message:
+```
+python3 classification_sample.py -h
+```
+The command yields the following usage message:
+```
+usage: classification_sample.py [-h] -m MODEL -i INPUT [INPUT ...]
+ [-l CPU_EXTENSION] [-pp PLUGIN_DIR]
+ [-d DEVICE] [--labels LABELS] [-nt NUMBER_TOP]
+ [-ni NUMBER_ITER] [-pc]
+
+Options:
+ -h, --help Show this help message and exit.
+ -m MODEL, --model MODEL
+ Required. Path to an .xml file with a trained model.
+ -i INPUT [INPUT ...], --input INPUT [INPUT ...]
+ Required. Path to a folder with images or path to
+ image files
+ -l CPU_EXTENSION, --cpu_extension CPU_EXTENSION
+ Optional. Required for CPU custom layers. MKLDNN (CPU)-targeted custom layers.
+ Absolute path to a shared library with the kernels
+ implementations.
+ -pp PLUGIN_DIR, --plugin_dir PLUGIN_DIR
+ Optional. Path to a plugin folder
+ -d DEVICE, --device DEVICE
+ Optional. Specify the target device to infer on; CPU,
+ GPU, FPGA, HDDL or MYRIAD is acceptable. The sample
+ will look for a suitable plugin for device specified.
+ Default value is CPU
+ --labels LABELS Optional. Path to a labels mapping file
+ -nt NUMBER_TOP, --number_top NUMBER_TOP
+ Optional. Number of top results
+ -ni NUMBER_ITER, --number_iter NUMBER_ITER
+ Optional. Number of inference iterations
+ -pc, --perf_counts Optional. Report performance counters
+```
+
+Running the application with the empty list of options yields the usage message given above.
+
+To run the sample, you can use AlexNet and GoogLeNet or other image classification models. You can download the pre-trained models with the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) or from [https://download.01.org/opencv/](https://download.01.org/opencv/).
+
+> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
+
+For example, to perform inference of an AlexNet model (previously converted to the Inference Engine format) on CPU, use the following command:
+
+```
+ python3 classification_sample.py -i <path_to_image>/cat.bmp -m <path_to_model>/alexnet_fp32.xml
+```
+
+### Sample Output
+
+By default, the application outputs top-10 inference results.
+Add the `-nt` option to the previous command to modify the number of top output results.
+For example, to get the top-5 results on GPU, run the following command:
+```
+ python3 classification_sample.py -i <path_to_image>/cat.bmp -m <path_to_model>/alexnet_fp32.xml -nt 5 -d GPU
+```
+
+## See Also
+* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
+* [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md)
+* [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader)
+
+
diff --git a/inference-engine/ie_bridges/python/sample/classification_sample.py b/inference-engine/ie_bridges/python/sample/classification_sample/classification_sample.py
similarity index 62%
rename from inference-engine/ie_bridges/python/sample/classification_sample.py
rename to inference-engine/ie_bridges/python/sample/classification_sample/classification_sample.py
index f02459f..ea87429 100644
--- a/inference-engine/ie_bridges/python/sample/classification_sample.py
+++ b/inference-engine/ie_bridges/python/sample/classification_sample/classification_sample.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
"""
- Copyright (c) 2018 Intel Corporation
+ Copyright (C) 2018-2019 Intel Corporation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@ from __future__ import print_function
import sys
import os
-from argparse import ArgumentParser
+from argparse import ArgumentParser, SUPPRESS
import cv2
import numpy as np
import logging as log
@@ -26,22 +26,29 @@ from openvino.inference_engine import IENetwork, IEPlugin


def build_argparser():
- parser = ArgumentParser()
- parser.add_argument("-m", "--model", help="Path to an .xml file with a trained model.", required=True, type=str)
- parser.add_argument("-i", "--input", help="Path to a folder with images or path to an image files", required=True,
- type=str, nargs="+")
- parser.add_argument("-l", "--cpu_extension",
- help="MKLDNN (CPU)-targeted custom layers.Absolute path to a shared library with the kernels "
- "impl.", type=str, default=None)
- parser.add_argument("-pp", "--plugin_dir", help="Path to a plugin folder", type=str, default=None)
- parser.add_argument("-d", "--device",
- help="Specify the target device to infer on; CPU, GPU, FPGA or MYRIAD is acceptable. Sample "
- "will look for a suitable plugin for device specified (CPU by default)", default="CPU",
- type=str)
- parser.add_argument("--labels", help="Labels mapping file", default=None, type=str)
- parser.add_argument("-nt", "--number_top", help="Number of top results", default=10, type=int)
- parser.add_argument("-ni", "--number_iter", help="Number of inference iterations", default=1, type=int)
- parser.add_argument("-pc", "--perf_counts", help="Report performance counters", default=False, action="store_true")
+ parser = ArgumentParser(add_help=False)
+ args = parser.add_argument_group('Options')
+ args.add_argument('-h', '--help', action='help', default=SUPPRESS, help='Show this help message and exit.')
+ args.add_argument("-m", "--model", help="Required. Path to an .xml file with a trained model.", required=True,
+ type=str)
+ args.add_argument("-i", "--input", help="Required. Path to a folder with images or path to image files",
+ required=True,
+ type=str, nargs="+")
+ args.add_argument("-l", "--cpu_extension",
+ help="Optional. Required for CPU custom layers. "
+ "MKLDNN (CPU)-targeted custom layers. Absolute path to a shared library with the"
+ " kernels implementations.", type=str, default=None)
+ args.add_argument("-pp", "--plugin_dir", help="Optional.
Path to a plugin folder", type=str, default=None)
+ args.add_argument("-d", "--device",
+ help="Optional. Specify the target device to infer on; CPU, GPU, FPGA, HDDL, MYRIAD or HETERO: is "
+ "acceptable. The sample will look for a suitable plugin for device specified. Default "
+ "value is CPU",
+ default="CPU", type=str)
+ args.add_argument("--labels", help="Optional. Path to a labels mapping file", default=None, type=str)
+ args.add_argument("-nt", "--number_top", help="Optional. Number of top results", default=10, type=int)
+ args.add_argument("-ni", "--number_iter", help="Optional. Number of inference iterations", default=1, type=int)
+ args.add_argument("-pc", "--perf_counts", help="Optional. Report performance counters", default=False,
+ action="store_true")

 return parser

@@ -93,7 +100,6 @@ def main():
 # Loading model to the plugin
 log.info("Loading model to the plugin")
 exec_net = plugin.load(network=net)
- del net

 # Start sync inference
 log.info("Starting inference ({} iterations)".format(args.number_iter))
@@ -101,7 +107,7 @@ def main():
 for i in range(args.number_iter):
 t0 = time()
 res = exec_net.infer(inputs={input_blob: images})
- infer_time.append((time()-t0)*1000)
+ infer_time.append((time() - t0) * 1000)
 log.info("Average running time of one iteration: {} ms".format(np.average(np.asarray(infer_time))))
 if args.perf_counts:
 perf_counts = exec_net.requests[0].get_perf_counts()
@@ -120,18 +126,25 @@ def main():
 labels_map = [x.split(sep=' ', maxsplit=1)[-1].strip() for x in f]
 else:
 labels_map = None
+ classid_str = "classid"
+ probability_str = "probability"
 for i, probs in enumerate(res):
 probs = np.squeeze(probs)
 top_ind = np.argsort(probs)[-args.number_top:][::-1]
 print("Image {}\n".format(args.input[i]))
+ print(classid_str, probability_str)
+ print("{} {}".format('-' * len(classid_str), '-' * len(probability_str)))
 for id in top_ind:
- det_label = labels_map[id] if labels_map else "#{}".format(id)
- print("{:.7f} label {}".format(probs[id], det_label))
+ det_label = labels_map[id] if labels_map else "{}".format(id)
+ label_length = len(det_label)
+ space_num_before = (len(classid_str) - label_length) // 2
+ space_num_after = len(classid_str) - (space_num_before + label_length) + 2
+ space_num_before_prob = (len(probability_str) - len(str(probs[id]))) // 2
+ print("{}{}{}{}{:.7f}".format(' ' * space_num_before, det_label,
+ ' ' * space_num_after, ' ' * space_num_before_prob,
+ probs[id]))
 print("\n")

- del exec_net
- del plugin
-

if __name__ == '__main__':
 sys.exit(main() or 0)
diff --git a/inference-engine/ie_bridges/python/sample/classification_sample_async/README.md b/inference-engine/ie_bridges/python/sample/classification_sample_async/README.md
new file mode 100644
index 0000000..e121f4a
--- /dev/null
+++ b/inference-engine/ie_bridges/python/sample/classification_sample_async/README.md
@@ -0,0 +1,89 @@
+# Image Classification Python* Sample Async
+
+This sample demonstrates how to build and execute inference in pipelined mode, using classification networks as an example.
+
+The pipelined mode might increase the throughput of picture processing. The latency of one inference is the same as for synchronous execution.
+
+The throughput increases due to the following reasons:
+* Some plugins have heterogeneity inside themselves: data transfer, execution on a remote device, and pre-processing and post-processing on the host.
+* Use of the explicit heterogeneous plugin, which executes different parts of the network on different devices, for example HETERO:CPU,GPU.
+
+When two or more devices process one image, creating several infer requests and starting asynchronous inference allows for using the devices in the most efficient way.
+If two devices are involved in execution, the optimal value for the `-nireq` option is 2.
+To process infer requests more efficiently, Classification Sample Async uses a round-robin algorithm: it starts execution of the current infer request and switches to waiting for the results of the previous one. After the waiting finishes, it swaps the infer requests and repeats the procedure.
+
+Another required aspect of good throughput is the number of iterations. Only with a big number of iterations can you emulate real application work and get good performance.
+
+Batch mode is an attribute independent of the pipelined mode. The pipelined mode works efficiently with any batch size.
+
+### How It Works
+
+Upon the start-up, the sample application reads command line parameters and loads a network and an image to the Inference
+Engine plugin.
+Then the application creates the number of infer requests specified in the `-nireq` parameter and loads images for inference.
+
+Then, in a loop, it starts inference for the current infer request and switches to waiting for the previous one. When results are ready, it swaps the infer requests.
+
+When inference is done, the application outputs data to the standard output stream.
+
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+
+## Running
+
+Running the application with the `-h` option yields the following usage message:
+```
+python3 classification_sample_async.py -h
+```
+The command yields the following usage message:
+```
+usage: classification_sample_async.py [-h] -m MODEL -i INPUT [INPUT ...]
+ [-l CPU_EXTENSION] [-pp PLUGIN_DIR]
+ [-d DEVICE] [--labels LABELS]
+ [-nt NUMBER_TOP] [-ni NUMBER_ITER] [-pc]
+
+Options:
+ -h, --help Show this help message and exit.
+ -m MODEL, --model MODEL
+ Required. Path to an .xml file with a trained model.
+ -i INPUT [INPUT ...], --input INPUT [INPUT ...]
+ Required. Path to a folder with images or path to
+ image files
+ -l CPU_EXTENSION, --cpu_extension CPU_EXTENSION
+ Optional. Required for CPU custom layers. Absolute
+ path to a shared library with the kernels
+ implementations.
+ -pp PLUGIN_DIR, --plugin_dir PLUGIN_DIR
+ Optional. Path to a plugin folder
+ -d DEVICE, --device DEVICE
+ Optional. Specify the target device to infer on; CPU,
+ GPU, FPGA, HDDL or MYRIAD is acceptable. The sample
+ will look for a suitable plugin for device specified.
+ Default value is CPU
+ --labels LABELS Optional. Labels mapping file
+ -nt NUMBER_TOP, --number_top NUMBER_TOP
+ Optional.
Number of top results
+ -ni NUMBER_ITER, --number_iter NUMBER_ITER
+ Optional. Number of inference iterations
+ -pc, --perf_counts Optional. Report performance counters
+
+```
+
+Running the application with the empty list of options yields the usage message given above and an error message.
+
+To run the sample, you can use AlexNet and GoogLeNet or other image classification models. You can download the pre-trained models with the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) or from [https://download.01.org/opencv/](https://download.01.org/opencv/).
+
+> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
+
+
+You can do inference on an image using a trained AlexNet network on FPGA with fallback to CPU using the following command:
+```
+ python3 classification_sample_async.py -i <path_to_image>/cat.bmp -m <path_to_model>/alexnet_fp32.xml -nt 5 -d HETERO:FPGA,CPU -nireq 2 -ni 200
+```
+
+### Sample Output
+
+By default, the application outputs top-10 inference results for each infer request.
+It also provides the throughput value measured in frames per second.
+
+## See Also
+* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
diff --git a/inference-engine/ie_bridges/python/sample/classification_sample_async.py b/inference-engine/ie_bridges/python/sample/classification_sample_async/classification_sample_async.py
similarity index 65%
rename from inference-engine/ie_bridges/python/sample/classification_sample_async.py
rename to inference-engine/ie_bridges/python/sample/classification_sample_async/classification_sample_async.py
index ae86555..601be2d 100644
--- a/inference-engine/ie_bridges/python/sample/classification_sample_async.py
+++ b/inference-engine/ie_bridges/python/sample/classification_sample_async/classification_sample_async.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
"""
- Copyright (c) 2018 Intel Corporation
+ Copyright (C) 2018-2019 Intel Corporation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@ from __future__ import print_function
import sys
import os
-from argparse import ArgumentParser
+from argparse import ArgumentParser, SUPPRESS
import cv2
import numpy as np
import logging as log
@@ -26,22 +26,26 @@ from openvino.inference_engine import IENetwork, IEPlugin


def build_argparser():
- parser = ArgumentParser()
- parser.add_argument("-m", "--model", help="Path to an .xml file with a trained model.", required=True, type=str)
- parser.add_argument("-i", "--input", help="Path to a folder with images or path to an image files", required=True,
- type=str, nargs="+")
- parser.add_argument("-l", "--cpu_extension",
- help="MKLDNN (CPU)-targeted custom layers.Absolute path to a shared library with the kernels "
- "impl.", type=str, default=None)
- parser.add_argument("-pp", "--plugin_dir", help="Path to a plugin folder", type=str, default=None)
- parser.add_argument("-d", "--device",
- help="Specify the target device to infer on; CPU, GPU, FPGA or MYRIAD is acceptable.
Sample " - "will look for a suitable plugin for device specified (CPU by default)", default="CPU", - type=str) - parser.add_argument("--labels", help="Labels mapping file", default=None, type=str) - parser.add_argument("-nt", "--number_top", help="Number of top results", default=10, type=int) - parser.add_argument("-ni", "--number_iter", help="Number of inference iterations", default=1, type=int) - parser.add_argument("-pc", "--perf_counts", help="Report performance counters", default=False, action="store_true") + parser = ArgumentParser(add_help=False) + args = parser.add_argument_group('Options') + args.add_argument('-h', '--help', action='help', default=SUPPRESS, help='Show this help message and exit.') + args.add_argument("-m", "--model", help="Required. Path to an .xml file with a trained model.", + required=True, type=str) + args.add_argument("-i", "--input", help="Required. Path to a folder with images or path to an image files", + required=True, type=str, nargs="+") + args.add_argument("-l", "--cpu_extension", + help="Optional. Required for CPU custom layers. Absolute path to a shared library with the" + " kernels implementations.", type=str, default=None) + args.add_argument("-pp", "--plugin_dir", help="Optional. Path to a plugin folder", type=str, default=None) + args.add_argument("-d", "--device", + help="Optional. Specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD is " + "acceptable. The sample will look for a suitable plugin for device specified. Default value is CPU", + default="CPU", type=str) + args.add_argument("--labels", help="Optional. Labels mapping file", default=None, type=str) + args.add_argument("-nt", "--number_top", help="Optional. Number of top results", default=10, type=int) + args.add_argument("-ni", "--number_iter", help="Optional. Number of inference iterations", default=1, type=int) + args.add_argument("-pc", "--perf_counts", help="Optional. 
Report performance counters",
+ default=False, action="store_true")

 return parser

@@ -92,7 +96,6 @@ def main():
 # Loading model to the plugin
 log.info("Loading model to the plugin")
 exec_net = plugin.load(network=net)
- del net

 # Start sync inference
 log.info("Starting inference ({} iterations)".format(args.number_iter))
@@ -119,18 +122,25 @@ def main():
 labels_map = [x.split(sep=' ', maxsplit=1)[-1].strip() for x in f]
 else:
 labels_map = None
+ classid_str = "classid"
+ probability_str = "probability"
 for i, probs in enumerate(res):
 probs = np.squeeze(probs)
 top_ind = np.argsort(probs)[-args.number_top:][::-1]
 print("Image {}\n".format(args.input[i]))
+ print(classid_str, probability_str)
+ print("{} {}".format('-' * len(classid_str), '-' * len(probability_str)))
 for id in top_ind:
- det_label = labels_map[id] if labels_map else "#{}".format(id)
- print("{:.7f} {}".format(probs[id], det_label))
+ det_label = labels_map[id] if labels_map else "{}".format(id)
+ label_length = len(det_label)
+ space_num_before = (7 - label_length) // 2
+ space_num_after = 7 - (space_num_before + label_length) + 2
+ space_num_before_prob = (11 - len(str(probs[id]))) // 2
+ print("{}{}{}{}{:.7f}".format(' ' * space_num_before, det_label,
+ ' ' * space_num_after, ' ' * space_num_before_prob,
+ probs[id]))
 print("\n")

- del exec_net
- del plugin
-

if __name__ == '__main__':
 sys.exit(main() or 0)
diff --git a/inference-engine/ie_bridges/python/sample/greengrass_samples/Greengrass-FaaS-User-Guide.docx b/inference-engine/ie_bridges/python/sample/greengrass_samples/Greengrass-FaaS-User-Guide.docx
deleted file mode 100644
index 6fedb499e2b3edf06f86742aee8520f866b5630f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
literal 33240
(binary contents of the deleted 33240-byte .docx file omitted)
zLu#NvrmIFW--~Um$cIfeYj_n?3{OJjxJKSQTMyCVg2&+>CQ_(Z4s^E8-m;#u9k$+D zr^v?f6G~83A5QAyi@ffh#p}{rwN~rhN{A#x?Mqmx`|E2p8&G1q8BnEj#2Sh=q|zCZ zg!y{*0%1j<=kmSObQF`H+zd?!DBgm?g|I{O?T)29MZFHV_NOFEAY|jp&-x;YJxk70 z0y;IUZbiN)y9_%I3QTI7IA>*>YIgCHz6@PI_>Sj1r^G&m>pwa)OmQU0YBI_!U@jF( zGg{OaFB>_m@wJ_P=H#2vafDPpC2qioO`+5p!p)Q@3g)_*u;}!OH&r^{9ZzAy#7MpC zPhgrWh_FCW$rNB{4PX+s5++nq|0)^k(3Ij(ep=oi$CPT8?Ro{?)O5lsw{&zgSo{Xh^EkCj_Udk;0waGk0)Ng5B6T;#Bd~{!H;(w4 z;S;Ox%GsbIi!OvyrqGC_jZ4~sU4JqdBj z9fXJa+(H&}vdA@UUS+nrQSO5Pj*@#!JUnMmT1Q%P9c#;ZhjEGMIchetnH`B{q@Klx z4hNt`PRj);XLQ=i%#Z6v2?|Z$_7p`YORYeT%8N9}YO? z-}KlKbqiFE>m`j}r6v!`GpdigDB``L*NnS;@lTSq-vM(;)ikHV9?M&r&h@IH*OvF80|7dpCk_ z&34+8@rR6S%5_p9mJ7BEuBw2&Y1Ag7Z1A$UXpq+Rqb1e_4@>-6?Rsv($Wl1BM{`Pn z{(N-NBMrMcF>lfmgZHMVYIA+VoT(>A2^Ok`W`xcBJl1DfxrZxZgQRD=%K3$sh?)$F z$y3@;^B;}TP71K|&}HZIdslhPCql=^Ikn~P{^vP$7ppR5<_OJZW!)?y$5wmA6FSZI z=eOkKD`=GBlL)a2iu|Oflk=`ik>ux8#?sjLck9|%=@}aJDLI7tmKhZ|aW#ev`1$AO zbJngH2g0K`ijKbFUQYHB(8$4aMlt)b{&`$236~2P!H_#ck|=#h;GSV0CM55J`_@Injd8gWzI8aAF&$&Mb28zK!~h zaVYCk)}0V-bO1ROtDw!5(I6&Eeook{xL37geOq6;{9bj|I67A>nIX)iC>c9v7Ppim z9{p?R|DP1h7p;)6EJOt0RnOPrMc{ zSi|SMUiWS5{ULg+P*%^IA_wM<U7Nr}GdvQ4w&6k>4yPzP&C-NhB>BJPluF8X{c`BM~_KO&QF;2^n7a`zUa zGvlQttBV(1yvdzEBvLhC?+%~}>^}Ma0+~idhN%4n<|&QK=zu`EG@Z(OE~5GEYm#e2 z(~2z6r|YdXK{%1SQ9njsievtw`>$&Eyq&}Y8GYgg6R{KK9nKdWEq#`_G@6yybatC= z$lpX-Ig;-~5F-c%yo59-$@oub@E-f`HbX&y#N*@Ty}z72nA6(wvSSWBKh@efc}}aq z)@^HrbX+;zD=8yNTXJ*y z?s|4`Y+S3qy}nwn|5))@IAXt7yPv~_Z*BzCgJRyeV}=JeXsq=Ma-dIC0=&iEP0u-D zfc7I2*$&^QVJf1P{Xuo3pnxbrVcl6vK82S)Kri_vJnw59KHi7}mf9AFU3%gD($ZO# z<19xMY^vE3vN-btEO`%s+#%Wz5ceExu66zUrm_UaBF7aZ$2R?)-eXbsesFZ1A-4GW z2*m`lg+#^(MawldWFNcqZ!9uphC513!vVS2JG1JnGRpZH+QAX_HUNT?U*pk$<#dtc zu|M`<={tqd4_}$0G=*(YehsA&+Y?jn)ktXKQIIF5gl)IspJ4^!?^Ji?81O(yDwiI} zS>GJNf@N+5#B$wQU6EtD%@4V;Cxalj^szR)n6uhBksDXb#=p}x!{htjv8#ZnEhcTM zWyS;$iejWh`*HU}w}``mE;U&2;!dHwq&*HPBK|Wzr&yT*00eSs`d~O-T*`2~~ zs`fYq^zZw=mJvqHpx?4d=W29M+_MRsgvouNze5IAL;W_R^J8eoo)GYW550hZJYvGo zZ21#w)Wqe326hh@&%lm77msr<`kx!9dq(^B;SlmW$@1|bVtkhJA|@iJ`xJbV@#8%` zD-klyF{-ix&A}!dAw353`Yl|_v0sE}@Kk#2BYNoUrYg}hj#0*}KmSaxG8`5hr4lY&9i(JtO}>;NNbNM;g=)Af^LVoy*|rV#NS2ZJDjwq-r0q zZ9G?EzBQ`3F&@Y>y+FGesHDw|)&P}|!SacAm1ByO)CYk&qe6^pgX2Uk5Dc8;#rR-_ z|I#Bb`%&bqCkEgK@YNrD=j%f9JRYKC1I(9r-Tc5&e*6p@Y&r@*%CRIr2`d;{xFPYq zFMd+SDSjeUYa9-8_1{XK1qXdxyE|QU{yDxGqphr_25aT0|JBOZ}swxrPi zLGEjxqLWf>w5KHY$XWzRRx_2m3OmYMb!3;Se618|hnrXx z5CzwbBSjFbPfU}}93L=V{2mJe+n1FxLqSGxsdC0XYjfUx4>4_a^dZf6)k_)QarwI= z%|GOd{G&UwLzn@0&t>)UxtS$h$#{z+zl` z7{HF2Ys4O(?zr!k=CF$LRz2`>w{$eQWdgslPQA2FjZXUbR5wvX3JWq@vi z1-ujm>ZlpvGd0-3X9SIiu)xg3lffMtGC+}A#g8cjVAbryX6>fB7bdyVNNUvKL)gdh zsM?p?C3fVO)_yjnBULm5%~;3t5W)Iu3|%&*?x(7WN1)V9ifMe6CAxrpHwPo;Iup`y}+F|O3B!=zXvA}evUL@&n`oT6RkD$-3q?Dg17uiu!JXAS;PgS za~hemwt(*Yfk6ZBlUTieX)u&12ILSQGIsd|MQ$wRpbba!gRS!F7DuiCN1iWK;u8|D zOmS8ZGh=|mA*TR%GC#T0mkqH{@R9=x3t*)oU?o*Zu?_8Eu#%HJiOZ~R9SU4kU3w8@ z&44~7QyitF(K7=He9eHNFCVeDDZMCPFL?iTwPGu>K+)jcwfgQ|e|j&v1^8ZDK+ewE z#-3K+(cZz@@(&g1y;jfv)*5+dUTZABc{d#z-?`swP>j1p%4bUn^(mF0M0{1!69&Ch zvo4~%5G#he4b-JH$I6jF&OqAMdx036i|&T`4Cdk(f)6sFfMFpTDDkSII?>WuuvB?P zWCtn`C_Wz*b>KGE{cA+#OQ(<(6ZpZ9ecmIKbXF)f=wy@!!UK}ys<)I<4QuKR)l}Fe z4cm95OfU&&Yp&C$V|tz-L3v!dwBAtE247SQaH<6@U2Gq~c~yxQl;M3AW+kU*1#x=y zWB5SX3B@#~g9b&xqHGxOdoD36k9LfpsQ8=M-Bb?Ko7JTh1y_%C)Ojh_dPfJE&XIw0 z4eET?VwxsN4S*#pBg}wwv?}UT#H;Q0;=s zu<{*TQc$lsjhNWN+w!)CPMcE}9KNj7X$&o!t0#`UMc=7U8%V|%kERwZzbryN(;xCY z7j^P@FFs)ZoBHm+%kI|w&UvzT0mXS2&c9?*)zaecvN)y=hL(RihW~F(oOdq$=P!Js zDzt|V3S{$jbG^&NI&D>M7`{M~+<{X9rWr$2TuOgL)y#CGeJ`KV;eagU?ODF1EkIb4 zzNWw=tH?|a4I7$A2E^<6)$C(vUw;=3(o`oldi~nIsC#dp1o-6%VoA0T3td>Qb=lGw 
z8r7#bx`Yuic2#oy42SA73({F_$F}*$4#2h8A{aj(4>Sz9SMDuNmWUkZMnY$BwC}7T$l@@c;Um_O;9Goj-x+e(i#jYmOGiYUH-V^GuV+i&>u*Y!L)4 zKv&2y_0M^N(wlc=wqw7x<0=QiG8(de_j5}4xU~0`oKmykv~%mVOqg>}0I41^wkFii z(Wa0URvGm!J(xgfe~z6ZHZM@-cGc#AAPU$u+cG3_V9y^yMEyoeBGBnXNUevAuy2Ib zK-_XBxl=dXc3mH9w=f(`os}{*u-xp2 zqnKxP2gCC6sUxAJU1k|tYjlTnv$@+_iE^VY`^P(j{yVaQ*$_=N_njGkRANK?FZTT( zajtYi8p7{A%x25;n==^umuMP|UbGa`Ja)OWru#^&w%!)`H*C&Zxk#_38@N;nyN z78^=3Z+~V?hyx*2fLFP2O%-(baE*?NGJe5bIOZejL^%>FPFMymiB!VVUe%G?9QcZJ zR-X9BQu}{jd<)V}i}yQQVBRGQ{+Hp++RpHQ2e*Hok?ux-xFB^#`yi8mb0*)9n zN>Y2#Rjp&UTKS*|p> zS`m}#9MC~zm64`h)fzour(N0fZ5@&>@Yk!I-MheOq+KBExt610@Siz9^`fb_gY;Pp zATEO@Hp!1ZmZ@#|UP7w_o*^)FLSP||(rbYCWHV>yC|szhy(E>QfJNee1YTCdawPE3 z!$vc-CYg<9R1j~V-6UBCQg<^pgiJ78B6nv#I*z&+IQd@ZWk%*WM+p;5;yWPI3}xag zE7`FPk~9LHE}6F&3f_g|E1RF2z`W>R^216;drw4_LmV|WT#Yg@hVhg1aHhw^3{a+` za%jQosH`UFL0*qv=I?s=7qI?=sZFGo7BFX zPq%=)ORPQ2Pt#UL@6}M1boDI$6x-DV_4m4GNCO+Br<|&-4S)hP5h*Iv=J{6AxG7TZ zPQifY?RYCpBMlo}U+bGH1rn8)ePcSSUhlUCnryrtV;OJ6;_?;3vCn56;oze+6r#h^ zt*?iMCi$0js(Pg$7@X#XI1^EEdDALHs*pqoRcUX6MfFLUNiZEbI3&Oao|KG)I(4X! z2S8|)r?*sJ2Ww)*uAnfRv%*sx2b|a16^9B*C>SD5g?NU@_Jz<|+H=4@3X?lXFt*97 zGs~-s*Nw)72hZC~q{X{oDP9uo0J{sMiAb2Crz$ok4---Xk8m$!*xpf1nv-twf?t?8 zq=FS)_yT%d0Qn@@%TI}xM0a9=7@4p;?`qvTE)T2PQEe|_?Zi=*fOkx zwV=f^q;%Y7WxFluqq~y?fIHq^9Ad=T5CpgC1T6gJ)BPLRWVGa3g2oh!{5WdY(1#%OhG83C7q9eG-q#r~h zdsMz>?v)QP=vqoFnz7{WnJCG=_JnWWgqn~8>5v$Fm8DfR1Y>|$sG9E@q*V`)=bANR zkXSdo(b4#IZAjh4w_&wm!)p?JY(-IpmOy-sgr*h=x<*%f4*KucrtMZ^vJ5KZrq2UM(0^I8Xo@4vf7#b z`0g>%QjQWR{$St=vW4AI9+)H_9ESmeh>2spa}fyhvHj^&O9hS>olE9IWeiYrj<}hb zYh3zW$F6+(LD;`Xi7tL%?2dnb*JHg;_`a7={LAsm*-%gMr+AN(_z{^NIym8bu@}Kz zPfSu%6U1m00Wk0o3~B5iEW|DM@4HLt97J zUIJStCkC{GQf-@Ad&gRfVlk(>+0C9NT9Q$a%)}|h^aL^;qi!Sjj1NiUWXL0dk~4SaCg{pA#3bCnJ$~;z*4Y`$eJto+*3WLI(@)n zjLnYT2iWYpOq#W(D7*HDF0a777T1sKem}PLaarTn$T}+MAMC)aR-gCQMVP|F%J#AV zJ{n8HatmNPIDIyBot-kRyR*4cl<0Y#Y~zaDp$LmLNQ-!G9JhV3eNRF!6+@qAYB#>a zy9+NWE-}LvHnA~~`F*OQ91-;G=I}ib^k0T-!%Y)|%lF%O`~Cui`+3OLwXyl%5&NGJ zJ5906YK0D}4R(xbLgp!52$txCU->hIFu4Vtze8BW>PgD-FXHQyy-f zX<1n!Vk>QR^%H!Oowbm_DN1d30T$JkU$fCw$|xV*uRH-H2f zV!+~!wFaEMHa)J>oF>XCa?A4hmsQGQx~p~@Od>t$3$)a1yETUn<6FpGb8?vh@?W7t zS>0c>rDF3Ew+YXy!fUW^=WhFDd(Zt}xq_%B+8CWTqD zw6nYX(jYQ|wo7bxb!}-Lz6@$AtrT|C3@x=@vjytq#0MZV^Ro<-z`dly-DfoeR>U7- zon0EiVt7);JPH$LRBZ=IOx9Tq-BDnC{|ap=IH`|`Afkp|o6FtLJvw4%L1VCl3dcr$ z;-kwl2K4}AzZPk^lzgR(#D!}Q^6IojI7nt`SF9PaVE1OF+BNW-iWLFR z(nrpzvZa}eP(xStI$>fFG#e-)#S0Fa00E`0LxhR8NV@RiQgj>{~@6$@;8Z} z;}w60|L$!34}3u3Kk)x?J^mg1`|9XFU`4&Z!N07P{*M23?(iQp0Dz-80KmV@B>qm| z*ZG~F2_RSj{AB0J~N5&cf#*Wvdc{Cm= reporting_interval: - res_json["timestamp"] = frame_timestamp.isoformat() - res_json["frame_id"] = int(frameid) - res_json["inference_fps"] = frame_count / inf_seconds - start_time = timeit.default_timer() - report(res_json, frame) - frame_count = 0 - inf_seconds = 0.0 - - client.publish(topic=PARAM_TOPIC_NAME, payload="End of the input, exiting...") - del exec_net - del plugin - - -greengrass_classification_sample_run() - - -def function_handler(event, context): - client.publish(topic=PARAM_TOPIC_NAME, payload='HANDLER_CALLED!') - return diff --git a/inference-engine/ie_bridges/python/sample/greengrass_samples/greengrass_object_detection_sample_ssd.py b/inference-engine/ie_bridges/python/sample/greengrass_samples/greengrass_object_detection_sample_ssd.py deleted file mode 100644 index e6898be..0000000 --- a/inference-engine/ie_bridges/python/sample/greengrass_samples/greengrass_object_detection_sample_ssd.py +++ /dev/null @@ -1,184 +0,0 @@ -""" -BSD 3-clause "New" or "Revised" license - -Copyright (C) 2018 Intel 
Corporation. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -* Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -""" - -import sys -import os -import cv2 -import numpy as np -import greengrasssdk -import boto3 -import timeit -import datetime -import json -from collections import OrderedDict - -from openvino.inference_engine import IENetwork, IEPlugin - -# Specify the delta in seconds between each report -reporting_interval = 1.0 - -# Parameters for IoT Cloud -enable_iot_cloud_output = True - -# Parameters for Kinesis -enable_kinesis_output = False -kinesis_stream_name = "" -kinesis_partition_key = "" -kinesis_region = "" - -# Parameters for S3 -enable_s3_jpeg_output = False -s3_bucket_name = "ssd_test" - -# Parameters for jpeg output on local disk -enable_local_jpeg_output = False - -# Create a Greengrass Core SDK client for publishing messages to AWS Cloud -client = greengrasssdk.client("iot-data") - -# Create an S3 client for uploading files to S3 -if enable_s3_jpeg_output: - s3_client = boto3.client("s3") - -# Create a Kinesis client for putting records to streams -if enable_kinesis_output: - kinesis_client = boto3.client("kinesis", "us-west-2") - -# Read environment variables set by Lambda function configuration -PARAM_MODEL_XML = os.environ.get("PARAM_MODEL_XML") -PARAM_INPUT_SOURCE = os.environ.get("PARAM_INPUT_SOURCE") -PARAM_DEVICE = os.environ.get("PARAM_DEVICE") -PARAM_OUTPUT_DIRECTORY = os.environ.get("PARAM_OUTPUT_DIRECTORY") -PARAM_CPU_EXTENSION_PATH = os.environ.get("PARAM_CPU_EXTENSION_PATH") -PARAM_LABELMAP_FILE = os.environ.get("PARAM_LABELMAP_FILE") -PARAM_TOPIC_NAME = os.environ.get("PARAM_TOPIC_NAME", "intel/faas/ssd") - - -def report(res_json, frame): - now = datetime.datetime.now() - date_prefix = str(now).replace(" ", "_") - if enable_iot_cloud_output: - data = json.dumps(res_json) - client.publish(topic=PARAM_TOPIC_NAME, payload=data) - if enable_kinesis_output: - kinesis_client.put_record(StreamName=kinesis_stream_name, Data=json.dumps(res_json), - PartitionKey=kinesis_partition_key) - if enable_s3_jpeg_output: - temp_image = os.path.join(PARAM_OUTPUT_DIRECTORY, "inference_result.jpeg") - cv2.imwrite(temp_image, frame) - with 
open(temp_image) as file: - image_contents = file.read() - s3_client.put_object(Body=image_contents, Bucket=s3_bucket_name, Key=date_prefix + ".jpeg") - if enable_local_jpeg_output: - cv2.imwrite(os.path.join(PARAM_OUTPUT_DIRECTORY, date_prefix + ".jpeg"), frame) - - -def greengrass_object_detection_sample_ssd_run(): - client.publish(topic=PARAM_TOPIC_NAME, payload="OpenVINO: Initializing...") - model_bin = os.path.splitext(PARAM_MODEL_XML)[0] + ".bin" - - # Plugin initialization for specified device and load extensions library if specified - plugin = IEPlugin(device=PARAM_DEVICE, plugin_dirs="") - if "CPU" in PARAM_DEVICE: - plugin.add_cpu_extension(PARAM_CPU_EXTENSION_PATH) - # Read IR - net = IENetwork(model=PARAM_MODEL_XML, weights=model_bin) - assert len(net.inputs.keys()) == 1, "Sample supports only single input topologies" - assert len(net.outputs) == 1, "Sample supports only single output topologies" - input_blob = next(iter(net.inputs)) - out_blob = next(iter(net.outputs)) - # Read and pre-process input image - n, c, h, w = net.inputs[input_blob] - cap = cv2.VideoCapture(PARAM_INPUT_SOURCE) - exec_net = plugin.load(network=net) - del net - client.publish(topic=PARAM_TOPIC_NAME, payload="Starting inference on %s" % PARAM_INPUT_SOURCE) - start_time = timeit.default_timer() - inf_seconds = 0.0 - frame_count = 0 - labeldata = None - if PARAM_LABELMAP_FILE is not None: - with open(PARAM_LABELMAP_FILE) as labelmap_file: - labeldata = json.load(labelmap_file) - - while (cap.isOpened()): - ret, frame = cap.read() - if not ret: - break - frameid = cap.get(cv2.CAP_PROP_POS_FRAMES) - initial_w = cap.get(3) - initial_h = cap.get(4) - in_frame = cv2.resize(frame, (w, h)) - in_frame = in_frame.transpose((2, 0, 1)) # Change data layout from HWC to CHW - in_frame = in_frame.reshape((n, c, h, w)) - # Start synchronous inference - inf_start_time = timeit.default_timer() - res = exec_net.infer(inputs={input_blob: in_frame}) - inf_seconds += timeit.default_timer() - inf_start_time - # Parse detection results of the current request - res_json = OrderedDict() - frame_timestamp = datetime.datetime.now() - object_id = 0 - for obj in res[out_blob][0][0]: - if obj[2] > 0.5: - xmin = int(obj[3] * initial_w) - ymin = int(obj[4] * initial_h) - xmax = int(obj[5] * initial_w) - ymax = int(obj[6] * initial_h) - cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (255, 165, 20), 4) - obj_id = "Object" + str(object_id) - classlabel = labeldata[str(int(obj[1]))] if labeldata else "" - res_json[obj_id] = {"label": int(obj[1]), "class": classlabel, "confidence": round(obj[2], 2), "xmin": round( - obj[3], 2), "ymin": round(obj[4], 2), "xmax": round(obj[5], 2), "ymax": round(obj[6], 2)} - object_id += 1 - frame_count += 1 - # Measure elapsed seconds since the last report - seconds_elapsed = timeit.default_timer() - start_time - if seconds_elapsed >= reporting_interval: - res_json["timestamp"] = frame_timestamp.isoformat() - res_json["frame_id"] = int(frameid) - res_json["inference_fps"] = frame_count / inf_seconds - start_time = timeit.default_timer() - report(res_json, frame) - frame_count = 0 - inf_seconds = 0.0 - - client.publish(topic=PARAM_TOPIC_NAME, payload="End of the input, exiting...") - del exec_net - del plugin - - -greengrass_object_detection_sample_ssd_run() - - -def function_handler(event, context): - client.publish(topic=PARAM_TOPIC_NAME, payload='HANDLER_CALLED!') - return diff --git a/inference-engine/ie_bridges/python/sample/jupyter_notebooks/classification_demo/classification_demo.ipynb 
b/inference-engine/ie_bridges/python/sample/jupyter_notebooks/classification_demo/classification_demo.ipynb deleted file mode 100644 index 632672f..0000000 --- a/inference-engine/ie_bridges/python/sample/jupyter_notebooks/classification_demo/classification_demo.ipynb +++ /dev/null @@ -1,463 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook demonstrates the workflow of a simple image classification task.\n", - "We will go through all the pipeline steps: downloading the model, generating the Intermediate Representation (IR) using the Model Optimizer, running inference in Python, and parsing and interpreting the output results.\n", - "\n", - "To demonstrate the scenario, we will use the pre-trained SqueezeNet V1.1 Caffe\* model. SqueezeNet is an accurate and at the same time lightweight network. For more information about the model, please visit the GitHub page and refer to the original SqueezeNet paper.\n", - "\n", - "Follow the steps to perform image classification with the SqueezeNet V1.1 model:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**1. Download the model files:** " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%bash\n", - "echo \"Downloading deploy.prototxt ...\"\n", - "if [ -f deploy.prototxt ]; then \n", - " echo \"deploy.prototxt file already exists. Download skipped\"\n", - "else\n", - " wget https://raw.githubusercontent.com/DeepScale/SqueezeNet/a47b6f13d30985279789d08053d37013d67d131b/SqueezeNet_v1.1/deploy.prototxt -q\n", - " echo \"Finished!\"\n", - "fi" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%bash\n", - "echo \"Downloading squeezenet_v1.1.caffemodel ...\"\n", - "if [ -f squeezenet_v1.1.caffemodel ]; then\n", - " echo \"squeezenet_v1.1.caffemodel file already exists. Download skipped\"\n", - "else\n", - " wget https://github.com/DeepScale/SqueezeNet/raw/a47b6f13d30985279789d08053d37013d67d131b/SqueezeNet_v1.1/squeezenet_v1.1.caffemodel -q\n", - " echo \"Finished!\"\n", - "fi" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Run the following command to see the model files:**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!ls -la" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* `deploy.prototxt` contains the network topology description in text format. \n", - "* `squeezenet_v1.1.caffemodel` contains weights for all network layers" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**2. Optimize and convert the model from the initial Caffe representation to the IR representation, which is required for scoring the model using the Inference Engine. To convert and optimize the model, use the Model Optimizer command line tool.**\n", - "\n", - "To locate the Model Optimizer scripts, specify the path to the Model Optimizer root directory in the `MO_ROOT` variable in the cell below and then run it (if you use the installed OpenVINO™ package, you can find the Model Optimizer in `/deployment_tools/model_optimizer`)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%bash\n", - "MO_ROOT=/localdisk/repos/model-optimizer-tensorflow/\n", - "echo $MO_ROOT\n", - "python3 $MO_ROOT/mo.py --input_model squeezenet_v1.1.caffemodel --input_proto deploy.prototxt" - ] - },
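On success, the Model Optimizer cell above writes an IR pair into the working directory; the file names below match the `MODEL_XML`/`MODEL_BIN` constants the notebook defines later. A minimal, illustrative sketch (not part of the original notebook) to confirm both files were produced:

```python
# Sketch: confirm the Model Optimizer produced the IR pair used later
# (file names taken from the notebook's MODEL_XML/MODEL_BIN constants).
import os

for ir_file in ("squeezenet_v1.1.xml", "squeezenet_v1.1.bin"):
    print(ir_file, "found" if os.path.isfile(ir_file) else "MISSING")
```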
- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**3. Now, you have the SqueezeNet model converted to the IR, and you can infer it.**\n", - "\n", - "a. First, import required modules:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from openvino.inference_engine import IENetwork, IEPlugin\n", - "import numpy as np\n", - "import cv2\n", - "import logging as log\n", - "from time import time\n", - "import sys\n", - "import glob\n", - "import os\n", - "from matplotlib import pyplot as plt\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "b. Initialize required constants:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Configure logging format\n", - "log.basicConfig(format=\"[ %(levelname)s ] %(message)s\", level=log.INFO, stream=sys.stdout)\n", - "\n", - "# Path to IR model files\n", - "MODEL_XML = \"./squeezenet_v1.1.xml\"\n", - "MODEL_BIN = \"./squeezenet_v1.1.bin\"\n", - "\n", - "# Target device to run inference\n", - "TARGET_DEVICE = \"CPU\"\n", - "\n", - "# Folder with input images for the model\n", - "IMAGES_FOLDER = \"./images\"\n", - "\n", - "# File containing information about class names \n", - "LABELS_FILE = \"./image_net_synset.txt\"\n", - "\n", - "# Number of top prediction results to parse\n", - "NTOP = 5\n", - "\n", - "# Required batch size - number of images which will be processed in parallel\n", - "BATCH = 4" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "c. Create a plugin instance for the specified target device \n", - "d. Read the IR files and create an `IENetwork` instance" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plugin = IEPlugin(TARGET_DEVICE)\n", - "net = IENetwork(model=MODEL_XML, weights=MODEL_BIN)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "e. Set the network batch size to the constants specified above. \n", - "\n", - "Batch size is the amount of input data that will be inferred in parallel. In this case, it is the number of images that will be classified in parallel. \n", - "\n", - "You can set the network batch size using one of the following options:\n", - "1. At the IR generation stage, run the Model Optimizer with the `-b` command line option. For example, to generate the IR with batch size equal to 4, add `-b 4` to the Model Optimizer command line options. By default, the batch size is taken from the original network in framework representation (usually it is equal to 1, but in this case, the original Caffe model is provided with a batch size equal to 10). \n", - "2. Use the Inference Engine after reading the IR. We will use this option.\n", - "\n", - "To set the batch size with the Inference Engine:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "log.info(\"Current network batch size is {}, will be changed to {}\".format(net.batch_size, BATCH))\n", - "net.batch_size = BATCH" - ] - },
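Option 1 above is described but not demonstrated in the notebook. A sketch of what it would look like, reusing the step 2 invocation from this notebook's own %%bash cell with the `-b` flag added (`MO_ROOT` remains the machine-specific path shown there):

```bash
%%bash
# Sketch of option 1: bake a batch size of 4 into the IR at generation time.
MO_ROOT=/localdisk/repos/model-optimizer-tensorflow/
python3 $MO_ROOT/mo.py --input_model squeezenet_v1.1.caffemodel --input_proto deploy.prototxt -b 4
```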
- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "f. After setting the batch size, you can get the required information about the network input layers.\n", - "To preprocess the input images, you need to know the input layer shape.\n", - "\n", - "The `inputs` property of `IENetwork` returns a dictionary with input layer names and `InputInfo` objects, which contain information about an input layer including its shape.\n", - "\n", - "SqueezeNet is a single-input topology, so to get the input layer name and its shape, you can get the first item from the `inputs` dictionary:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "input_layer = next(iter(net.inputs))\n", - "n,c,h,w = net.inputs[input_layer].shape\n", - "layout = net.inputs[input_layer].layout\n", - "log.info(\"Network input layer {} has shape {} and layout {}\".format(input_layer, (n,c,h,w), layout))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So what do the shape and layout mean? \n", - "The layout helps to interpret the meaning of the shape dimensions. \n", - "\n", - "The `NCHW` input layer layout means:\n", - "* the first dimension of the input data is a batch of **N** images processed in parallel \n", - "* the second dimension is the number of **C**hannels expected in the input images\n", - "* the third and the fourth are spatial dimensions - the **H**eight and **W**idth of an input image\n", - "\n", - "Our shape means that the network expects four 3-channel images of size 227x227 running in parallel." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "g. Read and preprocess the input images.\n", - "\n", - "To do this, go to `IMAGES_FOLDER`, find all `.bmp` files, and take four images for inference:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "search_pattern = os.path.join(IMAGES_FOLDER, \"*.bmp\")\n", - "images = glob.glob(search_pattern)[:BATCH]\n", - "log.info(\"Input images:\\n {}\".format(\"\\n\".join(images)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now you can read and preprocess the image files and create an array with input blob data.\n", - "\n", - "For preprocessing, you must do the following:\n", - "1. Resize the images to fit the HxW input dimensions.\n", - "2. Transpose the data from the HWC layout to CHW.\n", - "\n", - "Transposing is tricky and not really obvious.\n", - "As you already saw above, the network has the `NCHW` layout, so each input image should be in `CHW` format. But by default, OpenCV\\* reads images in the `HWC` format. That is why you have to swap the axes using the `numpy.transpose()` function:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "input_data = np.ndarray(shape=(n, c, h, w))\n", - "orig_images = [] # Will be used to show image in notebook\n", - "for i, img in enumerate(images):\n", - " image = cv2.imread(img)\n", - " orig_images.append(image)\n", - " if image.shape[:-1] != (h, w):\n", - " log.warning(\"Image {} is resized from {} to {}\".format(img, image.shape[:-1], (h, w)))\n", - " image = cv2.resize(image, (w, h))\n", - " image = image.transpose((2, 0, 1)) # Change data layout from HWC to CHW\n", - " input_data[i] = image" - ] - },
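The HWC-to-CHW axis swap performed by the cell above can be seen in isolation on a dummy array. A minimal NumPy sketch (the 227x227x3 shape matches this notebook's input; illustrative only):

```python
# Sketch: transpose((2, 0, 1)) moves the channel axis to the front.
import numpy as np

hwc = np.zeros((227, 227, 3))      # OpenCV-style layout: Height x Width x Channels
chw = hwc.transpose((2, 0, 1))     # CHW layout expected by the network
print(hwc.shape, "->", chw.shape)  # (227, 227, 3) -> (3, 227, 227)
```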
- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "i. Infer the model to classify the input images:\n", - "\n", - "1. Load the `IENetwork` object to the plugin to create an `ExecutableNetwork` object. \n", - "2. Start inference using the `infer()` function, passing a dictionary with the input layer name and the prepared data as an argument. \n", - "3. Measure the inference time in milliseconds and calculate the throughput metric in frames per second (FPS)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "exec_net = plugin.load(net)\n", - "t0 = time()\n", - "res_map = exec_net.infer({input_layer: input_data})\n", - "inf_time = (time() - t0) * 1000 \n", - "fps = BATCH * 1000 / inf_time \n", - "log.info(\"Inference time: {} ms.\".format(inf_time))\n", - "log.info(\"Throughput: {} fps.\".format(fps))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**4. After the inference, you need to parse and interpret the inference results.**\n", - "\n", - "First, you need to see the shape of the network output layer. It can be done in a similar way as for the inputs, but here you need to call the `outputs` property of the `IENetwork` object:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output_layer = next(iter(net.outputs))\n", - "n,c,h,w = net.outputs[output_layer].shape\n", - "layout = net.outputs[output_layer].layout\n", - "log.info(\"Network output layer {} has shape {} and layout {}\".format(output_layer, (n,c,h,w), layout))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It is not common for classification networks to have an output layer with the *NCHW* layout. Usually, it is just *NC*. However, in this case, the last two dimensions are just a feature of the network and do not carry much meaning. Ignore them, as you will remove them at the final parsing stage. \n", - "\n", - "What are the first and second dimensions of the output layer? \n", - "* The first dimension is the batch. We processed four images, and the prediction result for a particular image is stored in the first dimension of the output array. For example, the prediction results for the third image are in `res[2]` (since numbering starts from 0).\n", - "* The second dimension is an array with normalized probabilities (from 0 to 1) for each class. This network is trained using the ImageNet dataset with 1000 classes. Each `n`-th value in the output data for a certain image represents the probability of the image belonging to the `n`-th class. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To parse the output results:\n", - "\n", - "a. Read the `LABELS_FILE`, which maps the class IDs to human-readable class names:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open(LABELS_FILE, 'r') as f:\n", - " labels_map = [x.split(sep=' ', maxsplit=1)[-1].strip() for x in f]\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "b. Parse the output array with the prediction results. The parsing algorithm is as follows:\n", - "0. Squeeze the last two \"extra\" dimensions of the output data.\n", - "1. Iterate over all batches.\n", - "2. Sort the probability vector in descending order to get the `NTOP` classes with the highest probabilities (by default, `numpy.argsort` sorts the data in ascending order, but using the array slice `[::-1]`, you can reverse the order).\n", - "3. Map the `NTOP` probabilities to the corresponding labels in `labels_map`.\n", - "\n", - "For the visualization, you also need to store the top-1 class and probability." - ] - },
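Step 2 of the algorithm relies on the fact that `numpy.argsort` is ascending, so the last `NTOP` indices belong to the largest values and `[::-1]` flips them into descending order. A toy sketch of that trick before reading the full loop below (illustrative only):

```python
# Sketch: pick the top-2 indices of a probability vector, highest first.
import numpy as np

probs = np.array([0.10, 0.70, 0.05, 0.15])
top2 = np.argsort(probs)[-2:][::-1]  # last 2 indices of the ascending sort, reversed
print(top2)  # [1 3] -> probs 0.70 and 0.15
```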
- { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "top1_res = [] # will be used for the visualization\n", - "res = np.squeeze(res_map[output_layer])\n", - "log.info(\"Top {} results: \".format(NTOP))\n", - "for i, probs in enumerate(res):\n", - " top_ind = np.argsort(probs)[-NTOP:][::-1]\n", - " print(\"Image {}\".format(images[i]))\n", - " top1_ind = top_ind[0]\n", - " top1_res.append((labels_map[top1_ind], probs[top1_ind]))\n", - " for id in top_ind:\n", - " print(\"label: {} probability: {:.2f}% \".format(labels_map[id], probs[id] * 100))\n", - " print(\"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The code above prints the results as plain text. \n", - "You can also use OpenCV\\* to visualize the results using the `orig_images` and `top1_res` variables, which you created while reading the images and parsing the results:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.clf()\n", - "for i, img in enumerate(orig_images):\n", - " label_str = \"{}\".format(top1_res[i][0].split(',')[0])\n", - " prob_str = \"{:.2f}%\".format(top1_res[i][1] * 100)\n", - " cv2.putText(img, label_str, (5, 15), cv2.FONT_HERSHEY_COMPLEX, 0.6, (220,100,10), 1)\n", - " cv2.putText(img, prob_str, (5, 35), cv2.FONT_HERSHEY_COMPLEX, 0.6, (220,100,10), 1)\n", - " plt.figure()\n", - " plt.axis(\"off\")\n", - " \n", - " # We have to convert colors, because matplotlib expects an image in RGB color format \n", - " # but by default, OpenCV reads images in BGR format\n", - " im_to_show = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n", - " plt.imshow(im_to_show)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/inference-engine/ie_bridges/python/sample/jupyter_notebooks/classification_demo/image_net_synset.txt b/inference-engine/ie_bridges/python/sample/jupyter_notebooks/classification_demo/image_net_synset.txt deleted file mode 100644 index a9e8c7f..0000000 --- a/inference-engine/ie_bridges/python/sample/jupyter_notebooks/classification_demo/image_net_synset.txt +++ /dev/null @@ -1,1000 +0,0 @@ -n01440764 tench, Tinca tinca -n01443537 goldfish, Carassius auratus -n01484850 great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias -n01491361 tiger shark, Galeocerdo cuvieri -n01494475 hammerhead, hammerhead shark -n01496331 electric ray, crampfish, numbfish, torpedo -n01498041 stingray -n01514668 cock -n01514859 hen -n01518878 ostrich, Struthio camelus -n01530575 brambling, Fringilla montifringilla -n01531178 goldfinch, Carduelis carduelis -n01532829 house finch, linnet, Carpodacus mexicanus -n01534433 junco, snowbird -n01537544 indigo bunting, indigo finch, indigo bird, Passerina cyanea -n01558993 robin, American robin, Turdus migratorius -n01560419 bulbul -n01580077 jay -n01582220 magpie -n01592084 chickadee -n01601694 water ouzel, dipper -n01608432 kite -n01614925 bald eagle, American eagle, Haliaeetus leucocephalus -n01616318 vulture -n01622779 great grey
owl, great gray owl, Strix nebulosa -n01629819 European fire salamander, Salamandra salamandra -n01630670 common newt, Triturus vulgaris -n01631663 eft -n01632458 spotted salamander, Ambystoma maculatum -n01632777 axolotl, mud puppy, Ambystoma mexicanum -n01641577 bullfrog, Rana catesbeiana -n01644373 tree frog, tree-frog -n01644900 tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui -n01664065 loggerhead, loggerhead turtle, Caretta caretta -n01665541 leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea -n01667114 mud turtle -n01667778 terrapin -n01669191 box turtle, box tortoise -n01675722 banded gecko -n01677366 common iguana, iguana, Iguana iguana -n01682714 American chameleon, anole, Anolis carolinensis -n01685808 whiptail, whiptail lizard -n01687978 agama -n01688243 frilled lizard, Chlamydosaurus kingi -n01689811 alligator lizard -n01692333 Gila monster, Heloderma suspectum -n01693334 green lizard, Lacerta viridis -n01694178 African chameleon, Chamaeleo chamaeleon -n01695060 Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis -n01697457 African crocodile, Nile crocodile, Crocodylus niloticus -n01698640 American alligator, Alligator mississipiensis -n01704323 triceratops -n01728572 thunder snake, worm snake, Carphophis amoenus -n01728920 ringneck snake, ring-necked snake, ring snake -n01729322 hognose snake, puff adder, sand viper -n01729977 green snake, grass snake -n01734418 king snake, kingsnake -n01735189 garter snake, grass snake -n01737021 water snake -n01739381 vine snake -n01740131 night snake, Hypsiglena torquata -n01742172 boa constrictor, Constrictor constrictor -n01744401 rock python, rock snake, Python sebae -n01748264 Indian cobra, Naja naja -n01749939 green mamba -n01751748 sea snake -n01753488 horned viper, cerastes, sand viper, horned asp, Cerastes cornutus -n01755581 diamondback, diamondback rattlesnake, Crotalus adamanteus -n01756291 sidewinder, horned rattlesnake, Crotalus cerastes -n01768244 trilobite -n01770081 harvestman, daddy longlegs, Phalangium opilio -n01770393 scorpion -n01773157 black and gold garden spider, Argiope aurantia -n01773549 barn spider, Araneus cavaticus -n01773797 garden spider, Aranea diademata -n01774384 black widow, Latrodectus mactans -n01774750 tarantula -n01775062 wolf spider, hunting spider -n01776313 tick -n01784675 centipede -n01795545 black grouse -n01796340 ptarmigan -n01797886 ruffed grouse, partridge, Bonasa umbellus -n01798484 prairie chicken, prairie grouse, prairie fowl -n01806143 peacock -n01806567 quail -n01807496 partridge -n01817953 African grey, African gray, Psittacus erithacus -n01818515 macaw -n01819313 sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita -n01820546 lorikeet -n01824575 coucal -n01828970 bee eater -n01829413 hornbill -n01833805 hummingbird -n01843065 jacamar -n01843383 toucan -n01847000 drake -n01855032 red-breasted merganser, Mergus serrator -n01855672 goose -n01860187 black swan, Cygnus atratus -n01871265 tusker -n01872401 echidna, spiny anteater, anteater -n01873310 platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus -n01877812 wallaby, brush kangaroo -n01882714 koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus -n01883070 wombat -n01910747 jellyfish -n01914609 sea anemone, anemone -n01917289 brain coral -n01924916 flatworm, platyhelminth -n01930112 nematode, nematode worm, roundworm -n01943899 conch -n01944390 snail -n01945685 slug -n01950731 sea slug, nudibranch -n01955084 chiton, 
coat-of-mail shell, sea cradle, polyplacophore -n01968897 chambered nautilus, pearly nautilus, nautilus -n01978287 Dungeness crab, Cancer magister -n01978455 rock crab, Cancer irroratus -n01980166 fiddler crab -n01981276 king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica -n01983481 American lobster, Northern lobster, Maine lobster, Homarus americanus -n01984695 spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish -n01985128 crayfish, crawfish, crawdad, crawdaddy -n01986214 hermit crab -n01990800 isopod -n02002556 white stork, Ciconia ciconia -n02002724 black stork, Ciconia nigra -n02006656 spoonbill -n02007558 flamingo -n02009229 little blue heron, Egretta caerulea -n02009912 American egret, great white heron, Egretta albus -n02011460 bittern -n02012849 crane -n02013706 limpkin, Aramus pictus -n02017213 European gallinule, Porphyrio porphyrio -n02018207 American coot, marsh hen, mud hen, water hen, Fulica americana -n02018795 bustard -n02025239 ruddy turnstone, Arenaria interpres -n02027492 red-backed sandpiper, dunlin, Erolia alpina -n02028035 redshank, Tringa totanus -n02033041 dowitcher -n02037110 oystercatcher, oyster catcher -n02051845 pelican -n02056570 king penguin, Aptenodytes patagonica -n02058221 albatross, mollymawk -n02066245 grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus -n02071294 killer whale, killer, orca, grampus, sea wolf, Orcinus orca -n02074367 dugong, Dugong dugon -n02077923 sea lion -n02085620 Chihuahua -n02085782 Japanese spaniel -n02085936 Maltese dog, Maltese terrier, Maltese -n02086079 Pekinese, Pekingese, Peke -n02086240 Shih-Tzu -n02086646 Blenheim spaniel -n02086910 papillon -n02087046 toy terrier -n02087394 Rhodesian ridgeback -n02088094 Afghan hound, Afghan -n02088238 basset, basset hound -n02088364 beagle -n02088466 bloodhound, sleuthhound -n02088632 bluetick -n02089078 black-and-tan coonhound -n02089867 Walker hound, Walker foxhound -n02089973 English foxhound -n02090379 redbone -n02090622 borzoi, Russian wolfhound -n02090721 Irish wolfhound -n02091032 Italian greyhound -n02091134 whippet -n02091244 Ibizan hound, Ibizan Podenco -n02091467 Norwegian elkhound, elkhound -n02091635 otterhound, otter hound -n02091831 Saluki, gazelle hound -n02092002 Scottish deerhound, deerhound -n02092339 Weimaraner -n02093256 Staffordshire bullterrier, Staffordshire bull terrier -n02093428 American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier -n02093647 Bedlington terrier -n02093754 Border terrier -n02093859 Kerry blue terrier -n02093991 Irish terrier -n02094114 Norfolk terrier -n02094258 Norwich terrier -n02094433 Yorkshire terrier -n02095314 wire-haired fox terrier -n02095570 Lakeland terrier -n02095889 Sealyham terrier, Sealyham -n02096051 Airedale, Airedale terrier -n02096177 cairn, cairn terrier -n02096294 Australian terrier -n02096437 Dandie Dinmont, Dandie Dinmont terrier -n02096585 Boston bull, Boston terrier -n02097047 miniature schnauzer -n02097130 giant schnauzer -n02097209 standard schnauzer -n02097298 Scotch terrier, Scottish terrier, Scottie -n02097474 Tibetan terrier, chrysanthemum dog -n02097658 silky terrier, Sydney silky -n02098105 soft-coated wheaten terrier -n02098286 West Highland white terrier -n02098413 Lhasa, Lhasa apso -n02099267 flat-coated retriever -n02099429 curly-coated retriever -n02099601 golden retriever -n02099712 Labrador retriever -n02099849 Chesapeake Bay retriever -n02100236 German short-haired pointer 
-n02100583 vizsla, Hungarian pointer -n02100735 English setter -n02100877 Irish setter, red setter -n02101006 Gordon setter -n02101388 Brittany spaniel -n02101556 clumber, clumber spaniel -n02102040 English springer, English springer spaniel -n02102177 Welsh springer spaniel -n02102318 cocker spaniel, English cocker spaniel, cocker -n02102480 Sussex spaniel -n02102973 Irish water spaniel -n02104029 kuvasz -n02104365 schipperke -n02105056 groenendael -n02105162 malinois -n02105251 briard -n02105412 kelpie -n02105505 komondor -n02105641 Old English sheepdog, bobtail -n02105855 Shetland sheepdog, Shetland sheep dog, Shetland -n02106030 collie -n02106166 Border collie -n02106382 Bouvier des Flandres, Bouviers des Flandres -n02106550 Rottweiler -n02106662 German shepherd, German shepherd dog, German police dog, alsatian -n02107142 Doberman, Doberman pinscher -n02107312 miniature pinscher -n02107574 Greater Swiss Mountain dog -n02107683 Bernese mountain dog -n02107908 Appenzeller -n02108000 EntleBucher -n02108089 boxer -n02108422 bull mastiff -n02108551 Tibetan mastiff -n02108915 French bulldog -n02109047 Great Dane -n02109525 Saint Bernard, St Bernard -n02109961 Eskimo dog, husky -n02110063 malamute, malemute, Alaskan malamute -n02110185 Siberian husky -n02110341 dalmatian, coach dog, carriage dog -n02110627 affenpinscher, monkey pinscher, monkey dog -n02110806 basenji -n02110958 pug, pug-dog -n02111129 Leonberg -n02111277 Newfoundland, Newfoundland dog -n02111500 Great Pyrenees -n02111889 Samoyed, Samoyede -n02112018 Pomeranian -n02112137 chow, chow chow -n02112350 keeshond -n02112706 Brabancon griffon -n02113023 Pembroke, Pembroke Welsh corgi -n02113186 Cardigan, Cardigan Welsh corgi -n02113624 toy poodle -n02113712 miniature poodle -n02113799 standard poodle -n02113978 Mexican hairless -n02114367 timber wolf, grey wolf, gray wolf, Canis lupus -n02114548 white wolf, Arctic wolf, Canis lupus tundrarum -n02114712 red wolf, maned wolf, Canis rufus, Canis niger -n02114855 coyote, prairie wolf, brush wolf, Canis latrans -n02115641 dingo, warrigal, warragal, Canis dingo -n02115913 dhole, Cuon alpinus -n02116738 African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus -n02117135 hyena, hyaena -n02119022 red fox, Vulpes vulpes -n02119789 kit fox, Vulpes macrotis -n02120079 Arctic fox, white fox, Alopex lagopus -n02120505 grey fox, gray fox, Urocyon cinereoargenteus -n02123045 tabby, tabby cat -n02123159 tiger cat -n02123394 Persian cat -n02123597 Siamese cat, Siamese -n02124075 Egyptian cat -n02125311 cougar, puma, catamount, mountain lion, painter, panther, Felis concolor -n02127052 lynx, catamount -n02128385 leopard, Panthera pardus -n02128757 snow leopard, ounce, Panthera uncia -n02128925 jaguar, panther, Panthera onca, Felis onca -n02129165 lion, king of beasts, Panthera leo -n02129604 tiger, Panthera tigris -n02130308 cheetah, chetah, Acinonyx jubatus -n02132136 brown bear, bruin, Ursus arctos -n02133161 American black bear, black bear, Ursus americanus, Euarctos americanus -n02134084 ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus -n02134418 sloth bear, Melursus ursinus, Ursus ursinus -n02137549 mongoose -n02138441 meerkat, mierkat -n02165105 tiger beetle -n02165456 ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle -n02167151 ground beetle, carabid beetle -n02168699 long-horned beetle, longicorn, longicorn beetle -n02169497 leaf beetle, chrysomelid -n02172182 dung beetle -n02174001 rhinoceros beetle -n02177972 weevil -n02190166 fly -n02206856 bee -n02219486 ant, 
emmet, pismire -n02226429 grasshopper, hopper -n02229544 cricket -n02231487 walking stick, walkingstick, stick insect -n02233338 cockroach, roach -n02236044 mantis, mantid -n02256656 cicada, cicala -n02259212 leafhopper -n02264363 lacewing, lacewing fly -n02268443 dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk -n02268853 damselfly -n02276258 admiral -n02277742 ringlet, ringlet butterfly -n02279972 monarch, monarch butterfly, milkweed butterfly, Danaus plexippus -n02280649 cabbage butterfly -n02281406 sulphur butterfly, sulfur butterfly -n02281787 lycaenid, lycaenid butterfly -n02317335 starfish, sea star -n02319095 sea urchin -n02321529 sea cucumber, holothurian -n02325366 wood rabbit, cottontail, cottontail rabbit -n02326432 hare -n02328150 Angora, Angora rabbit -n02342885 hamster -n02346627 porcupine, hedgehog -n02356798 fox squirrel, eastern fox squirrel, Sciurus niger -n02361337 marmot -n02363005 beaver -n02364673 guinea pig, Cavia cobaya -n02389026 sorrel -n02391049 zebra -n02395406 hog, pig, grunter, squealer, Sus scrofa -n02396427 wild boar, boar, Sus scrofa -n02397096 warthog -n02398521 hippopotamus, hippo, river horse, Hippopotamus amphibius -n02403003 ox -n02408429 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis -n02410509 bison -n02412080 ram, tup -n02415577 bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis -n02417914 ibex, Capra ibex -n02422106 hartebeest -n02422699 impala, Aepyceros melampus -n02423022 gazelle -n02437312 Arabian camel, dromedary, Camelus dromedarius -n02437616 llama -n02441942 weasel -n02442845 mink -n02443114 polecat, fitch, foulmart, foumart, Mustela putorius -n02443484 black-footed ferret, ferret, Mustela nigripes -n02444819 otter -n02445715 skunk, polecat, wood pussy -n02447366 badger -n02454379 armadillo -n02457408 three-toed sloth, ai, Bradypus tridactylus -n02480495 orangutan, orang, orangutang, Pongo pygmaeus -n02480855 gorilla, Gorilla gorilla -n02481823 chimpanzee, chimp, Pan troglodytes -n02483362 gibbon, Hylobates lar -n02483708 siamang, Hylobates syndactylus, Symphalangus syndactylus -n02484975 guenon, guenon monkey -n02486261 patas, hussar monkey, Erythrocebus patas -n02486410 baboon -n02487347 macaque -n02488291 langur -n02488702 colobus, colobus monkey -n02489166 proboscis monkey, Nasalis larvatus -n02490219 marmoset -n02492035 capuchin, ringtail, Cebus capucinus -n02492660 howler monkey, howler -n02493509 titi, titi monkey -n02493793 spider monkey, Ateles geoffroyi -n02494079 squirrel monkey, Saimiri sciureus -n02497673 Madagascar cat, ring-tailed lemur, Lemur catta -n02500267 indri, indris, Indri indri, Indri brevicaudatus -n02504013 Indian elephant, Elephas maximus -n02504458 African elephant, Loxodonta africana -n02509815 lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens -n02510455 giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca -n02514041 barracouta, snoek -n02526121 eel -n02536864 coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch -n02606052 rock beauty, Holocanthus tricolor -n02607072 anemone fish -n02640242 sturgeon -n02641379 gar, garfish, garpike, billfish, Lepisosteus osseus -n02643566 lionfish -n02655020 puffer, pufferfish, blowfish, globefish -n02666196 abacus -n02667093 abaya -n02669723 academic gown, academic robe, judge's robe -n02672831 accordion, piano accordion, squeeze box -n02676566 acoustic guitar -n02687172 aircraft carrier, carrier, 
flattop, attack aircraft carrier -n02690373 airliner -n02692877 airship, dirigible -n02699494 altar -n02701002 ambulance -n02704792 amphibian, amphibious vehicle -n02708093 analog clock -n02727426 apiary, bee house -n02730930 apron -n02747177 ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin -n02749479 assault rifle, assault gun -n02769748 backpack, back pack, knapsack, packsack, rucksack, haversack -n02776631 bakery, bakeshop, bakehouse -n02777292 balance beam, beam -n02782093 balloon -n02783161 ballpoint, ballpoint pen, ballpen, Biro -n02786058 Band Aid -n02787622 banjo -n02788148 bannister, banister, balustrade, balusters, handrail -n02790996 barbell -n02791124 barber chair -n02791270 barbershop -n02793495 barn -n02794156 barometer -n02795169 barrel, cask -n02797295 barrow, garden cart, lawn cart, wheelbarrow -n02799071 baseball -n02802426 basketball -n02804414 bassinet -n02804610 bassoon -n02807133 bathing cap, swimming cap -n02808304 bath towel -n02808440 bathtub, bathing tub, bath, tub -n02814533 beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon -n02814860 beacon, lighthouse, beacon light, pharos -n02815834 beaker -n02817516 bearskin, busby, shako -n02823428 beer bottle -n02823750 beer glass -n02825657 bell cote, bell cot -n02834397 bib -n02835271 bicycle-built-for-two, tandem bicycle, tandem -n02837789 bikini, two-piece -n02840245 binder, ring-binder -n02841315 binoculars, field glasses, opera glasses -n02843684 birdhouse -n02859443 boathouse -n02860847 bobsled, bobsleigh, bob -n02865351 bolo tie, bolo, bola tie, bola -n02869837 bonnet, poke bonnet -n02870880 bookcase -n02871525 bookshop, bookstore, bookstall -n02877765 bottlecap -n02879718 bow -n02883205 bow tie, bow-tie, bowtie -n02892201 brass, memorial tablet, plaque -n02892767 brassiere, bra, bandeau -n02894605 breakwater, groin, groyne, mole, bulwark, seawall, jetty -n02895154 breastplate, aegis, egis -n02906734 broom -n02909870 bucket, pail -n02910353 buckle -n02916936 bulletproof vest -n02917067 bullet train, bullet -n02927161 butcher shop, meat market -n02930766 cab, hack, taxi, taxicab -n02939185 caldron, cauldron -n02948072 candle, taper, wax light -n02950826 cannon -n02951358 canoe -n02951585 can opener, tin opener -n02963159 cardigan -n02965783 car mirror -n02966193 carousel, carrousel, merry-go-round, roundabout, whirligig -n02966687 carpenter's kit, tool kit -n02971356 carton -n02974003 car wheel -n02977058 cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM -n02978881 cassette -n02979186 cassette player -n02980441 castle -n02981792 catamaran -n02988304 CD player -n02992211 cello, violoncello -n02992529 cellular telephone, cellular phone, cellphone, cell, mobile phone -n02999410 chain -n03000134 chainlink fence -n03000247 chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour -n03000684 chain saw, chainsaw -n03014705 chest -n03016953 chiffonier, commode -n03017168 chime, bell, gong -n03018349 china cabinet, china closet -n03026506 Christmas stocking -n03028079 church, church building -n03032252 cinema, movie theater, movie theatre, movie house, picture palace -n03041632 cleaver, meat cleaver, chopper -n03042490 cliff dwelling -n03045698 cloak -n03047690 clog, geta, patten, sabot -n03062245 cocktail shaker -n03063599 coffee mug -n03063689 coffeepot -n03065424 coil, spiral, volute, whorl, helix -n03075370 combination lock -n03085013 computer 
keyboard, keypad -n03089624 confectionery, confectionary, candy store -n03095699 container ship, containership, container vessel -n03100240 convertible -n03109150 corkscrew, bottle screw -n03110669 cornet, horn, trumpet, trump -n03124043 cowboy boot -n03124170 cowboy hat, ten-gallon hat -n03125729 cradle -n03126707 crane -n03127747 crash helmet -n03127925 crate -n03131574 crib, cot -n03133878 Crock Pot -n03134739 croquet ball -n03141823 crutch -n03146219 cuirass -n03160309 dam, dike, dyke -n03179701 desk -n03180011 desktop computer -n03187595 dial telephone, dial phone -n03188531 diaper, nappy, napkin -n03196217 digital clock -n03197337 digital watch -n03201208 dining table, board -n03207743 dishrag, dishcloth -n03207941 dishwasher, dish washer, dishwashing machine -n03208938 disk brake, disc brake -n03216828 dock, dockage, docking facility -n03218198 dogsled, dog sled, dog sleigh -n03220513 dome -n03223299 doormat, welcome mat -n03240683 drilling platform, offshore rig -n03249569 drum, membranophone, tympan -n03250847 drumstick -n03255030 dumbbell -n03259280 Dutch oven -n03271574 electric fan, blower -n03272010 electric guitar -n03272562 electric locomotive -n03290653 entertainment center -n03291819 envelope -n03297495 espresso maker -n03314780 face powder -n03325584 feather boa, boa -n03337140 file, file cabinet, filing cabinet -n03344393 fireboat -n03345487 fire engine, fire truck -n03347037 fire screen, fireguard -n03355925 flagpole, flagstaff -n03372029 flute, transverse flute -n03376595 folding chair -n03379051 football helmet -n03384352 forklift -n03388043 fountain -n03388183 fountain pen -n03388549 four-poster -n03393912 freight car -n03394916 French horn, horn -n03400231 frying pan, frypan, skillet -n03404251 fur coat -n03417042 garbage truck, dustcart -n03424325 gasmask, respirator, gas helmet -n03425413 gas pump, gasoline pump, petrol pump, island dispenser -n03443371 goblet -n03444034 go-kart -n03445777 golf ball -n03445924 golfcart, golf cart -n03447447 gondola -n03447721 gong, tam-tam -n03450230 gown -n03452741 grand piano, grand -n03457902 greenhouse, nursery, glasshouse -n03459775 grille, radiator grille -n03461385 grocery store, grocery, food market, market -n03467068 guillotine -n03476684 hair slide -n03476991 hair spray -n03478589 half track -n03481172 hammer -n03482405 hamper -n03483316 hand blower, blow dryer, blow drier, hair dryer, hair drier -n03485407 hand-held computer, hand-held microcomputer -n03485794 handkerchief, hankie, hanky, hankey -n03492542 hard disc, hard disk, fixed disk -n03494278 harmonica, mouth organ, harp, mouth harp -n03495258 harp -n03496892 harvester, reaper -n03498962 hatchet -n03527444 holster -n03529860 home theater, home theatre -n03530642 honeycomb -n03532672 hook, claw -n03534580 hoopskirt, crinoline -n03535780 horizontal bar, high bar -n03538406 horse cart, horse-cart -n03544143 hourglass -n03584254 iPod -n03584829 iron, smoothing iron -n03590841 jack-o'-lantern -n03594734 jean, blue jean, denim -n03594945 jeep, landrover -n03595614 jersey, T-shirt, tee shirt -n03598930 jigsaw puzzle -n03599486 jinrikisha, ricksha, rickshaw -n03602883 joystick -n03617480 kimono -n03623198 knee pad -n03627232 knot -n03630383 lab coat, laboratory coat -n03633091 ladle -n03637318 lampshade, lamp shade -n03642806 laptop, laptop computer -n03649909 lawn mower, mower -n03657121 lens cap, lens cover -n03658185 letter opener, paper knife, paperknife -n03661043 library -n03662601 lifeboat -n03666591 lighter, light, igniter, ignitor -n03670208 limousine, limo 
-n03673027 liner, ocean liner -n03676483 lipstick, lip rouge -n03680355 Loafer -n03690938 lotion -n03691459 loudspeaker, speaker, speaker unit, loudspeaker system, speaker system -n03692522 loupe, jeweler's loupe -n03697007 lumbermill, sawmill -n03706229 magnetic compass -n03709823 mailbag, postbag -n03710193 mailbox, letter box -n03710637 maillot -n03710721 maillot, tank suit -n03717622 manhole cover -n03720891 maraca -n03721384 marimba, xylophone -n03724870 mask -n03729826 matchstick -n03733131 maypole -n03733281 maze, labyrinth -n03733805 measuring cup -n03742115 medicine chest, medicine cabinet -n03743016 megalith, megalithic structure -n03759954 microphone, mike -n03761084 microwave, microwave oven -n03763968 military uniform -n03764736 milk can -n03769881 minibus -n03770439 miniskirt, mini -n03770679 minivan -n03773504 missile -n03775071 mitten -n03775546 mixing bowl -n03776460 mobile home, manufactured home -n03777568 Model T -n03777754 modem -n03781244 monastery -n03782006 monitor -n03785016 moped -n03786901 mortar -n03787032 mortarboard -n03788195 mosque -n03788365 mosquito net -n03791053 motor scooter, scooter -n03792782 mountain bike, all-terrain bike, off-roader -n03792972 mountain tent -n03793489 mouse, computer mouse -n03794056 mousetrap -n03796401 moving van -n03803284 muzzle -n03804744 nail -n03814639 neck brace -n03814906 necklace -n03825788 nipple -n03832673 notebook, notebook computer -n03837869 obelisk -n03838899 oboe, hautboy, hautbois -n03840681 ocarina, sweet potato -n03841143 odometer, hodometer, mileometer, milometer -n03843555 oil filter -n03854065 organ, pipe organ -n03857828 oscilloscope, scope, cathode-ray oscilloscope, CRO -n03866082 overskirt -n03868242 oxcart -n03868863 oxygen mask -n03871628 packet -n03873416 paddle, boat paddle -n03874293 paddlewheel, paddle wheel -n03874599 padlock -n03876231 paintbrush -n03877472 pajama, pyjama, pj's, jammies -n03877845 palace -n03884397 panpipe, pandean pipe, syrinx -n03887697 paper towel -n03888257 parachute, chute -n03888605 parallel bars, bars -n03891251 park bench -n03891332 parking meter -n03895866 passenger car, coach, carriage -n03899768 patio, terrace -n03902125 pay-phone, pay-station -n03903868 pedestal, plinth, footstall -n03908618 pencil box, pencil case -n03908714 pencil sharpener -n03916031 perfume, essence -n03920288 Petri dish -n03924679 photocopier -n03929660 pick, plectrum, plectron -n03929855 pickelhaube -n03930313 picket fence, paling -n03930630 pickup, pickup truck -n03933933 pier -n03935335 piggy bank, penny bank -n03937543 pill bottle -n03938244 pillow -n03942813 ping-pong ball -n03944341 pinwheel -n03947888 pirate, pirate ship -n03950228 pitcher, ewer -n03954731 plane, carpenter's plane, woodworking plane -n03956157 planetarium -n03958227 plastic bag -n03961711 plate rack -n03967562 plow, plough -n03970156 plunger, plumber's helper -n03976467 Polaroid camera, Polaroid Land camera -n03976657 pole -n03977966 police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria -n03980874 poncho -n03982430 pool table, billiard table, snooker table -n03983396 pop bottle, soda bottle -n03991062 pot, flowerpot -n03992509 potter's wheel -n03995372 power drill -n03998194 prayer rug, prayer mat -n04004767 printer -n04005630 prison, prison house -n04008634 projectile, missile -n04009552 projector -n04019541 puck, hockey puck -n04023962 punching bag, punch bag, punching ball, punchball -n04026417 purse -n04033901 quill, quill pen -n04033995 quilt, comforter, comfort, puff -n04037443 racer, race car, racing 
car -n04039381 racket, racquet -n04040759 radiator -n04041544 radio, wireless -n04044716 radio telescope, radio reflector -n04049303 rain barrel -n04065272 recreational vehicle, RV, R.V. -n04067472 reel -n04069434 reflex camera -n04070727 refrigerator, icebox -n04074963 remote control, remote -n04081281 restaurant, eating house, eating place, eatery -n04086273 revolver, six-gun, six-shooter -n04090263 rifle -n04099969 rocking chair, rocker -n04111531 rotisserie -n04116512 rubber eraser, rubber, pencil eraser -n04118538 rugby ball -n04118776 rule, ruler -n04120489 running shoe -n04125021 safe -n04127249 safety pin -n04131690 saltshaker, salt shaker -n04133789 sandal -n04136333 sarong -n04141076 sax, saxophone -n04141327 scabbard -n04141975 scale, weighing machine -n04146614 school bus -n04147183 schooner -n04149813 scoreboard -n04152593 screen, CRT screen -n04153751 screw -n04154565 screwdriver -n04162706 seat belt, seatbelt -n04179913 sewing machine -n04192698 shield, buckler -n04200800 shoe shop, shoe-shop, shoe store -n04201297 shoji -n04204238 shopping basket -n04204347 shopping cart -n04208210 shovel -n04209133 shower cap -n04209239 shower curtain -n04228054 ski -n04229816 ski mask -n04235860 sleeping bag -n04238763 slide rule, slipstick -n04239074 sliding door -n04243546 slot, one-armed bandit -n04251144 snorkel -n04252077 snowmobile -n04252225 snowplow, snowplough -n04254120 soap dispenser -n04254680 soccer ball -n04254777 sock -n04258138 solar dish, solar collector, solar furnace -n04259630 sombrero -n04263257 soup bowl -n04264628 space bar -n04265275 space heater -n04266014 space shuttle -n04270147 spatula -n04273569 speedboat -n04275548 spider web, spider's web -n04277352 spindle -n04285008 sports car, sport car -n04286575 spotlight, spot -n04296562 stage -n04310018 steam locomotive -n04311004 steel arch bridge -n04311174 steel drum -n04317175 stethoscope -n04325704 stole -n04326547 stone wall -n04328186 stopwatch, stop watch -n04330267 stove -n04332243 strainer -n04335435 streetcar, tram, tramcar, trolley, trolley car -n04336792 stretcher -n04344873 studio couch, day bed -n04346328 stupa, tope -n04347754 submarine, pigboat, sub, U-boat -n04350905 suit, suit of clothes -n04355338 sundial -n04355933 sunglass -n04356056 sunglasses, dark glasses, shades -n04357314 sunscreen, sunblock, sun blocker -n04366367 suspension bridge -n04367480 swab, swob, mop -n04370456 sweatshirt -n04371430 swimming trunks, bathing trunks -n04371774 swing -n04372370 switch, electric switch, electrical switch -n04376876 syringe -n04380533 table lamp -n04389033 tank, army tank, armored combat vehicle, armoured combat vehicle -n04392985 tape player -n04398044 teapot -n04399382 teddy, teddy bear -n04404412 television, television system -n04409515 tennis ball -n04417672 thatch, thatched roof -n04418357 theater curtain, theatre curtain -n04423845 thimble -n04428191 thresher, thrasher, threshing machine -n04429376 throne -n04435653 tile roof -n04442312 toaster -n04443257 tobacco shop, tobacconist shop, tobacconist -n04447861 toilet seat -n04456115 torch -n04458633 totem pole -n04461696 tow truck, tow car, wrecker -n04462240 toyshop -n04465501 tractor -n04467665 trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi -n04476259 tray -n04479046 trench coat -n04482393 tricycle, trike, velocipede -n04483307 trimaran -n04485082 tripod -n04486054 triumphal arch -n04487081 trolleybus, trolley coach, trackless trolley -n04487394 trombone -n04493381 tub, vat -n04501370 turnstile -n04505470 typewriter 
keyboard -n04507155 umbrella -n04509417 unicycle, monocycle -n04515003 upright, upright piano -n04517823 vacuum, vacuum cleaner -n04522168 vase -n04523525 vault -n04525038 velvet -n04525305 vending machine -n04532106 vestment -n04532670 viaduct -n04536866 violin, fiddle -n04540053 volleyball -n04542943 waffle iron -n04548280 wall clock -n04548362 wallet, billfold, notecase, pocketbook -n04550184 wardrobe, closet, press -n04552348 warplane, military plane -n04553703 washbasin, handbasin, washbowl, lavabo, wash-hand basin -n04554684 washer, automatic washer, washing machine -n04557648 water bottle -n04560804 water jug -n04562935 water tower -n04579145 whiskey jug -n04579432 whistle -n04584207 wig -n04589890 window screen -n04590129 window shade -n04591157 Windsor tie -n04591713 wine bottle -n04592741 wing -n04596742 wok -n04597913 wooden spoon -n04599235 wool, woolen, woollen -n04604644 worm fence, snake fence, snake-rail fence, Virginia fence -n04606251 wreck -n04612504 yawl -n04613696 yurt -n06359193 web site, website, internet site, site -n06596364 comic book -n06785654 crossword puzzle, crossword -n06794110 street sign -n06874185 traffic light, traffic signal, stoplight -n07248320 book jacket, dust cover, dust jacket, dust wrapper -n07565083 menu -n07579787 plate -n07583066 guacamole -n07584110 consomme -n07590611 hot pot, hotpot -n07613480 trifle -n07614500 ice cream, icecream -n07615774 ice lolly, lolly, lollipop, popsicle -n07684084 French loaf -n07693725 bagel, beigel -n07695742 pretzel -n07697313 cheeseburger -n07697537 hotdog, hot dog, red hot -n07711569 mashed potato -n07714571 head cabbage -n07714990 broccoli -n07715103 cauliflower -n07716358 zucchini, courgette -n07716906 spaghetti squash -n07717410 acorn squash -n07717556 butternut squash -n07718472 cucumber, cuke -n07718747 artichoke, globe artichoke -n07720875 bell pepper -n07730033 cardoon -n07734744 mushroom -n07742313 Granny Smith -n07745940 strawberry -n07747607 orange -n07749582 lemon -n07753113 fig -n07753275 pineapple, ananas -n07753592 banana -n07754684 jackfruit, jak, jack -n07760859 custard apple -n07768694 pomegranate -n07802026 hay -n07831146 carbonara -n07836838 chocolate sauce, chocolate syrup -n07860988 dough -n07871810 meat loaf, meatloaf -n07873807 pizza, pizza pie -n07875152 potpie -n07880968 burrito -n07892512 red wine -n07920052 espresso -n07930864 cup -n07932039 eggnog -n09193705 alp -n09229709 bubble -n09246464 cliff, drop, drop-off -n09256479 coral reef -n09288635 geyser -n09332890 lakeside, lakeshore -n09399592 promontory, headland, head, foreland -n09421951 sandbar, sand bar -n09428293 seashore, coast, seacoast, sea-coast -n09468604 valley, vale -n09472597 volcano -n09835506 ballplayer, baseball player -n10148035 groom, bridegroom -n10565667 scuba diver -n11879895 rapeseed -n11939491 daisy -n12057211 yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum -n12144580 corn -n12267677 acorn -n12620546 hip, rose hip, rosehip -n12768682 buckeye, horse chestnut, conker -n12985857 coral fungus -n12998815 agaric -n13037406 gyromitra -n13040303 stinkhorn, carrion fungus -n13044778 earthstar -n13052670 hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa -n13054560 bolete -n13133613 ear, spike, capitulum -n15075141 toilet tissue, toilet paper, bathroom tissue diff --git a/inference-engine/ie_bridges/python/sample/style_transfer_sample/README.md b/inference-engine/ie_bridges/python/sample/style_transfer_sample/README.md new file mode 100644 index 
0000000..2c5fa61
--- /dev/null
+++ b/inference-engine/ie_bridges/python/sample/style_transfer_sample/README.md
@@ -0,0 +1,74 @@
+# Neural Style Transfer Python* Sample
+
+This topic demonstrates how to run the Neural Style Transfer sample application, which performs
+inference of style transfer models.
+
+> **NOTE**: The OpenVINO™ toolkit does not include a pre-trained model to run the Neural Style Transfer sample. A public model from [Zhaw's Neural Style Transfer repository](https://github.com/zhaw/neural_style) can be used. Read the [Converting a Style Transfer Model from MXNet*](./docs/MO_DG/prepare_model/convert_model/mxnet_specific/Convert_Style_Transfer_From_MXNet.md) topic from the [Model Optimizer Developer Guide](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md) to learn how to get the trained model and how to convert it to the Inference Engine format (\*.xml + \*.bin).
+
+## How It Works
+
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application, or reconvert your model using the Model Optimizer tool with the `--reverse_input_channels` argument specified. For more information about the argument, refer to the **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+
+## Running
+
+Run the application with the `-h` option to see the usage message:
+```
+python3 style_transfer_sample.py --help
+```
+The command yields the following usage message:
+```
+usage: style_transfer_sample.py [-h] -m MODEL -i INPUT [INPUT ...]
+                                [-l CPU_EXTENSION] [-pp PLUGIN_DIR]
+                                [-d DEVICE] [-nt NUMBER_TOP] [-ni NUMBER_ITER]
+                                [--mean_val_r MEAN_VAL_R]
+                                [--mean_val_g MEAN_VAL_G]
+                                [--mean_val_b MEAN_VAL_B] [-pc]
+
+Options:
+  -h, --help            Show this help message and exit.
+  -m MODEL, --model MODEL
+                        Path to an .xml file with a trained model.
+  -i INPUT [INPUT ...], --input INPUT [INPUT ...]
+                        Path to a folder with images or path to image files
+  -l CPU_EXTENSION, --cpu_extension CPU_EXTENSION
+                        Optional. Required for CPU custom layers. Absolute
+                        path to a shared library with the kernel
+                        implementations for MKLDNN (CPU)-targeted custom
+                        layers
+  -pp PLUGIN_DIR, --plugin_dir PLUGIN_DIR
+                        Path to a plugin folder
+  -d DEVICE, --device DEVICE
+                        Specify the target device to infer on; CPU, GPU,
+                        FPGA, HDDL or MYRIAD is acceptable. The sample will
+                        look for a suitable plugin for the device specified.
+                        Default value is CPU
+  -nt NUMBER_TOP, --number_top NUMBER_TOP
+                        Number of top results
+  -ni NUMBER_ITER, --number_iter NUMBER_ITER
+                        Number of inference iterations
+  --mean_val_r MEAN_VAL_R, -mean_val_r MEAN_VAL_R
+                        Mean value of red channel for mean value subtraction
+                        in postprocessing
+  --mean_val_g MEAN_VAL_G, -mean_val_g MEAN_VAL_G
+                        Mean value of green channel for mean value
+                        subtraction in postprocessing
+  --mean_val_b MEAN_VAL_B, -mean_val_b MEAN_VAL_B
+                        Mean value of blue channel for mean value subtraction
+                        in postprocessing
+  -pc, --perf_counts    Report performance counters
+
+```
+
+Running the application with an empty list of options yields the usage message above and an error message.
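The three `--mean_val_*` options add the per-channel mean values back onto the network output before the result image is saved. The following is a minimal sketch of that postprocessing step, assuming a float output in CHW layout with the Inference Engine default BGR channel order; the shipped sample may differ in details such as clipping:

```python
import numpy as np

def postprocess(out, mean_val_r=0.0, mean_val_g=0.0, mean_val_b=0.0):
    """out: (3, H, W) float array produced by the style transfer network."""
    out = out.copy()
    out[0] += mean_val_b  # channel 0 is blue in BGR order
    out[1] += mean_val_g
    out[2] += mean_val_r
    out = np.clip(out, 0, 255).astype(np.uint8)
    return out.transpose((1, 2, 0))  # CHW -> HWC, ready for cv2.imwrite
```

Pass the same mean values that were subtracted during training; with the default of 0, the output is only clipped and converted.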
+
+To perform inference on an image using a trained model of the NST network on Intel® CPUs, use the following command:
+```
+    python3 style_transfer_sample.py -i <path_to_image>/cat.bmp -m <path_to_model>/1_decoder_FP32.xml
+```
+
+### Demo Output
+
+The application outputs an image (`out_0.bmp`) or a sequence of images (`out_0.bmp`, ..., `out_<N-1>.bmp`), redrawn in the style of the style transfer model used for the sample.
+
+## See Also
+* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
+
+
diff --git a/inference-engine/ie_bridges/python/sample/style_transfer_sample.py b/inference-engine/ie_bridges/python/sample/style_transfer_sample/style_transfer_sample.py
similarity index 68%
rename from inference-engine/ie_bridges/python/sample/style_transfer_sample.py
rename to inference-engine/ie_bridges/python/sample/style_transfer_sample/style_transfer_sample.py
index 76fcada..fc08b17 100644
--- a/inference-engine/ie_bridges/python/sample/style_transfer_sample.py
+++ b/inference-engine/ie_bridges/python/sample/style_transfer_sample/style_transfer_sample.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 """
- Copyright (c) 2018 Intel Corporation
+ Copyright (C) 2018-2019 Intel Corporation

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 from __future__ import print_function
 import sys
 import os
-from argparse import ArgumentParser
+from argparse import ArgumentParser, SUPPRESS
 import cv2
 import numpy as np
 import logging as log
@@ -26,30 +26,33 @@ from openvino.inference_engine import IENetwork, IEPlugin


 def build_argparser():
-    parser = ArgumentParser()
-    parser.add_argument("-m", "--model", help="Path to an .xml file with a trained model.", required=True, type=str)
-    parser.add_argument("-i", "--input", help="Path to a folder with images or path to an image files", required=True,
-                        type=str, nargs="+")
-    parser.add_argument("-l", "--cpu_extension",
-                        help="MKLDNN (CPU)-targeted custom layers.Absolute path to a shared library with the kernels "
-                             "impl.", type=str, default=None)
-    parser.add_argument("-pp", "--plugin_dir", help="Path to a plugin folder", type=str, default=None)
-    parser.add_argument("-d", "--device",
-                        help="Specify the target device to infer on; CPU, GPU, FPGA or MYRIAD is acceptable. Sample "
-                             "will look for a suitable plugin for device specified (CPU by default)", default="CPU",
-                        type=str)
-    parser.add_argument("-nt", "--number_top", help="Number of top results", default=10, type=int)
-    parser.add_argument("-ni", "--number_iter", help="Number of inference iterations", default=1, type=int)
-    parser.add_argument("--mean_val_r", "-mean_val_r",
-                        help="Mean value of red chanel for mean value subtraction in postprocessing ", default=0,
-                        type=float)
-    parser.add_argument("--mean_val_g", "-mean_val_g",
-                        help="Mean value of green chanel for mean value subtraction in postprocessing ", default=0,
-                        type=float)
-    parser.add_argument("--mean_val_b", "-mean_val_b",
-                        help="Mean value of blue chanel for mean value subtraction in postprocessing ", default=0,
-                        type=float)
-    parser.add_argument("-pc", "--perf_counts", help="Report performance counters", default=False, action="store_true")
+    parser = ArgumentParser(add_help=False)
+    args = parser.add_argument_group('Options')
+    args.add_argument('-h', '--help', action='help', default=SUPPRESS, help='Show this help message and exit.')
+    args.add_argument("-m", "--model", help="Path to an .xml file with a trained model.", required=True, type=str)
+    args.add_argument("-i", "--input", help="Path to a folder with images or path to image files", required=True,
+                      type=str, nargs="+")
+    args.add_argument("-l", "--cpu_extension",
+                      help="Optional. Required for CPU custom layers. "
+                           "Absolute path to a shared library with the kernel "
+                           "implementations for MKLDNN (CPU)-targeted custom layers", type=str, default=None)
+    args.add_argument("-pp", "--plugin_dir", help="Path to a plugin folder", type=str, default=None)
+    args.add_argument("-d", "--device",
+                      help="Specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD is acceptable. "
+                           "The sample will look for a suitable plugin for the device specified. "
+                           "Default value is CPU", default="CPU",
+                      type=str)
+    args.add_argument("-nt", "--number_top", help="Number of top results", default=10, type=int)
+    args.add_argument("-ni", "--number_iter", help="Number of inference iterations", default=1, type=int)
+    args.add_argument("--mean_val_r", "-mean_val_r",
+                      help="Mean value of red channel for mean value subtraction in postprocessing", default=0,
+                      type=float)
+    args.add_argument("--mean_val_g", "-mean_val_g",
+                      help="Mean value of green channel for mean value subtraction in postprocessing", default=0,
+                      type=float)
+    args.add_argument("--mean_val_b", "-mean_val_b",
+                      help="Mean value of blue channel for mean value subtraction in postprocessing", default=0,
+                      type=float)
+    args.add_argument("-pc", "--perf_counts", help="Report performance counters", default=False, action="store_true")
     return parser

@@ -101,7 +104,6 @@ def main():
     # Loading model to the plugin
     log.info("Loading model to the plugin")
     exec_net = plugin.load(network=net)
-    del net

     # Start sync inference
     log.info("Starting inference ({} iterations)".format(args.number_iter))
@@ -133,8 +135,6 @@ def main():
         out_img = os.path.join(os.path.dirname(__file__), "out_{}.bmp".format(batch))
         cv2.imwrite(out_img, data)
         log.info("Result image was saved to {}".format(out_img))
-    del exec_net
-    del plugin


 if __name__ == '__main__':
diff --git a/inference-engine/ie_bridges/python/sample/voc_labels.txt b/inference-engine/ie_bridges/python/sample/voc_labels.txt
deleted file mode 100644
index 008dd5f..0000000
--- a/inference-engine/ie_bridges/python/sample/voc_labels.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-background
-aeroplane
-bicycle
-bird
-boat
-bottle
-bus
-car
-cat
-chair
-cow
-diningtable
-dog
-horse
-motorbike
-person
-pottedplant
-sheep
-sofa
-train
-tvmonitor
\ No newline at end of file
diff --git a/inference-engine/ie_bridges/python/setup.py b/inference-engine/ie_bridges/python/setup.py
index bb9df0e..82ed125 100644
--- a/inference-engine/ie_bridges/python/setup.py
+++ b/inference-engine/ie_bridges/python/setup.py
@@ -167,12 +167,12 @@ except ImportError:

 c_sources = [
-    PACKAGE / 'ie_driver.cpp',
-    PACKAGE / 'ie_driver.hpp',
+    PACKAGE / 'ie_api_impl.cpp',
+    PACKAGE / 'ie_api_impl.hpp',

-    PACKAGE / 'c_ie_driver.pxd',
-    PACKAGE / 'ie_driver.pyx',
-    PACKAGE / 'ie_driver.pxd',
+    PACKAGE / 'ie_api_impl_defs.pxd',
+    PACKAGE / 'ie_api.pyx',
+    PACKAGE / 'ie_api.pxd',
 ]

 extensions = [
diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/CMakeLists.txt b/inference-engine/ie_bridges/python/src/openvino/inference_engine/CMakeLists.txt
index aa8ac74..8e0a91a 100644
--- a/inference-engine/ie_bridges/python/src/openvino/inference_engine/CMakeLists.txt
+++ b/inference-engine/ie_bridges/python/src/openvino/inference_engine/CMakeLists.txt
@@ -5,24 +5,20 @@ set (TARGET_NAME "ie_api")

 set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PYTHON_BRIDGE_OUTPUT_DIRECTORY}/inference_engine)
 set (CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})

-set_source_files_properties(
-    ie_api_impl_defs.pxd
-    ie_api_impl.hpp
-    ie_api_impl.cpp
-    ie_api.pyx
-    ie_api.pxd
+file(GLOB SOURCE
+    ${CMAKE_CURRENT_SOURCE_DIR}/*.pyx
+    ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+    )

-    PROPERTIES CYTHON_IS_CXX TRUE
+set_source_files_properties(${SOURCE} PROPERTIES CYTHON_IS_CXX TRUE
 )

-cython_add_module (
-    ${TARGET_NAME}
+## Compatibility with python 2.7 which has deprecated "register" specifier
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+    add_definitions("-Wno-register")
+endif()

-    ie_api_impl_defs.pxd
-    ie_api_impl.hpp
- ie_api_impl.cpp - ie_api.pyx -) +cython_add_module (${TARGET_NAME} ${SOURCE}) set_target_properties (${TARGET_NAME} PROPERTIES CXX_STANDARD 11 LINKER_LANGUAGE CXX) target_link_libraries (${TARGET_NAME} PRIVATE ${InferenceEngine_LIBRARIES}) diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/CMakeLists.txt b/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/CMakeLists.txt deleted file mode 100644 index 1b25c3e..0000000 --- a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/CMakeLists.txt +++ /dev/null @@ -1,37 +0,0 @@ -# If the pyx file is a C++ file, we should specify that here. -set(CMAKE_INCLUDE_CURRENT_DIR ON) - -set(TARGET_NAME "dnn_builder") - -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PYTHON_BRIDGE_OUTPUT_DIRECTORY}/inference_engine/${TARGET_NAME}) -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) - -set_source_files_properties( - dnn_builder_defs.pxd - dnn_builder_impl.hpp - dnn_builder_impl.cpp - dnn_builder.pyx - dnn_builder.pxd - - PROPERTIES CYTHON_IS_CXX TRUE -) - -cython_add_module( - ${TARGET_NAME} - - dnn_builder_impl_defs.pxd - dnn_builder_impl.hpp - dnn_builder_impl.cpp - dnn_builder.pyx -) - -set_target_properties (${TARGET_NAME} PROPERTIES CXX_STANDARD 11 LINKER_LANGUAGE CXX) -add_dependencies (${TARGET_NAME} ie_api) -target_include_directories (${TARGET_NAME} PRIVATE ${PYTHON_BRIDGE_SRC_ROOT}/src/openvino/inference_engine ) -target_link_libraries (${TARGET_NAME} PRIVATE ${InferenceEngine_LIBRARIES}) - -# perform copy -ADD_CUSTOM_COMMAND (TARGET ${TARGET_NAME} - POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${PYTHON_BRIDGE_SRC_ROOT}/src/openvino/inference_engine/${TARGET_NAME}/__init__.py ${CMAKE_LIBRARY_OUTPUT_DIRECTORY} -) \ No newline at end of file diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/__init__.py b/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/__init__.py deleted file mode 100644 index 79744ab..0000000 --- a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .dnn_builder import * -__all__ = ["NetworkBuilder", "LayerBuilder"] diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder.pxd b/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder.pxd deleted file mode 100644 index 9a56215..0000000 --- a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder.pxd +++ /dev/null @@ -1,26 +0,0 @@ -from .cimport dnn_builder_impl_defs as C -from libcpp.memory cimport shared_ptr - -cdef class NetworkBuilder: - cdef C.NetworkBuilder impl - -cdef class INetwork: - cdef C.INetwork impl - -cdef class ILayer: - cdef C.ILayer impl - -cdef class Port: - cdef C.Port impl - -cdef class PortInfo: - cdef C.PortInfo impl - -cdef class Connection: - cdef C.Connection impl - -cdef class LayerBuilder: - cdef C.LayerBuilder impl - -cdef class LayerConstantData(dict): - cdef shared_ptr[C.LayerBuilder] impl \ No newline at end of file diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder.pyx b/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder.pyx deleted file mode 100644 index b0754cb..0000000 --- a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder.pyx +++ /dev/null @@ -1,423 +0,0 @@ -# #distutils: 
language=c++ -#from cython.operator cimport dereference as deref -from libcpp.vector cimport vector -from libcpp.map cimport map -from libcpp.string cimport string -from ..ie_api cimport IENetwork, BlobBuffer -from .cimport dnn_builder_impl_defs as C -from .dnn_builder_impl_defs cimport Blob -import numpy as np - - -np_precision_map = { - "float32": "FP32", - "float16": "FP16", - "int32": "I32", - "int16": "I16", - "uint16": "U16", - "int8": "I8", - "uint8": "U8", - } -cdef class NetworkBuilder: - def __cinit__(self, name=None, IENetwork ie_net=None): - if name is not None and ie_net is not None: - raise AttributeError("Both name and ie_net arguments are defined") - elif name is not None: - self.impl = C.NetworkBuilder(name.encode()) - elif ie_net is not None: - self.impl = C.NetworkBuilder().from_ie_network(ie_net.impl) - - def build(self): - cdef INetwork i_net = INetwork() - i_net.impl = self.impl.build() - return i_net - - def get_layer(self, id: int): - cdef LayerBuilder py_layer = LayerBuilder() - py_layer.impl = self.impl.getLayer(id) - return py_layer - - @property - def layers(self): - cdef vector[C.LayerBuilder] c_layers = self.impl.getLayers() - cdef LayerBuilder py_layer - py_layers = {} - for l in c_layers: - py_layer = LayerBuilder() - py_layer.impl = l - py_layers[l.getName().decode()] = py_layer - return py_layers - - def remove_layer(self, LayerBuilder layer): - self.impl.removeLayer(layer.impl) - - def get_layer_connection(self, LayerBuilder layer): - cdef vector[C.Connection] c_connections = self.impl.getLayerConnections(layer.impl) - cdef Connection connection - connections = [] - for con in c_connections: - connection = Connection() - connection.impl = con - connections.append(connection) - return connections - - def disconnect(self, Connection connection): - self.impl.disconnect(connection.impl) - - def connect(self, PortInfo input, PortInfo output): - self.impl.connect(input.impl, output.impl) - - def add_layer(self, LayerBuilder layer, input_ports: list = None): - cdef vector[C.PortInfo] c_ports - cdef PortInfo c_port - if not input_ports: - return self.impl.addLayer(layer.impl) - else: - for p in input_ports: - c_port = PortInfo(p.layer_id, p.port_id) - c_ports.push_back(c_port.impl) - return self.impl.addAndConnectLayer(c_ports, layer.impl) - -cdef class INetwork: - def __iter__(self): - cdef ILayer layer - layers = [] - cdef vector[C.ILayer] c_layers = self.impl.layers - for l in c_layers: - layer = ILayer() - layer.impl = l - layers.append(layer) - return iter(layers) - - @property - def layers(self): - cdef ILayer layer - layers = {} - cdef vector[C.ILayer] c_layers = self.impl.layers - for l in c_layers: - layer = ILayer() - layer.impl = l - layers[l.name.decode()] = layer - return layers - - @property - def inputs(self): - cdef ILayer layer - layers = {} - cdef vector[C.ILayer] c_layers = self.impl.inputs - for l in c_layers: - layer = ILayer() - layer.impl = l - layers[l.name.decode()] = layer - return layers - - @property - def outputs(self): - cdef ILayer layer - layers = {} - cdef vector[C.ILayer] c_layers = self.impl.outputs - for l in c_layers: - layer = ILayer() - layer.impl = l - layers[l.name.decode()] = layer - return layers - - @property - def name(self): - return self.impl.name.decode() - - - @property - def size(self): - return self.impl.size - - def get_layer_connection(self, layer: ILayer): - cdef Connection connection - connections = [] - cdef vector[C.Connection] c_connections = self.impl.getLayerConnections(layer.id) - for con in 
c_connections: - connection = Connection() - connection.impl = con - connections.append(connection) - return connections - - def to_ie_network(self): - cdef IENetwork net = IENetwork() - net.impl = self.impl.to_ie_network() - return net - -cdef class ILayer: - @property - def name(self): - return self.impl.name.decode() - - @property - def id(self): - return self.impl.id - - @property - def type(self): - return self.impl.type.decode() - - @property - def params(self): - return {k.decode(): v.decode() for k, v in self.impl.parameters} - - @property - def input_ports(self): - cdef Port port - cdef vector[C.Port] c_ports = self.impl.in_ports - ports = [] - for p in c_ports: - port = Port() - port.impl = p - ports.append(port) - return ports - - @property - def output_ports(self): - cdef Port port - cdef vector[C.Port] c_ports = self.impl.out_ports - ports = [] - for p in c_ports: - port = Port() - port.impl = p - ports.append(port) - return ports - - @property - def constant_data(self): - cdef map[string, Blob.Ptr] c_constant_data - c_constant_data = self.impl.constant_data - constant_data = {} - cdef BlobBuffer weights_buffer - for weights in c_constant_data: - weights_buffer = BlobBuffer() - weights_buffer.reset(weights.second) - constant_data[weights.first.decode()] = weights_buffer.to_numpy() - return constant_data - - -cdef class Port: - def __cinit__(self, shape: list=[]): - cdef vector[size_t] c_shape - for d in shape: - c_shape.push_back(d) - self.impl = C.Port(c_shape) - @property - def shape(self): - return self.impl.shape - -cdef class PortInfo: - def __cinit__(self, layer_id: int = -1, port_id: int = -1): - if layer_id != -1 and port_id != -1: - self.impl = C.PortInfo(layer_id, port_id) - else: - self.impl = C.PortInfo() - @property - def layer_id(self): - return self.impl.layer_id - - @property - def port_id(self): - return self.impl.port_id - - def __eq__(self, other): - return self.layer_id == other.layer_id and self.port_id == other.port_id - - def __ne__(self, other): - return self.layer_id != other.layer_id and self.port_id != other.port_id - -cdef class Connection: - def __cinit__(self, PortInfo input = None, PortInfo output = None): - if input and output: - self.impl = C.Connection(input.impl, output.impl) - else: - self.impl = C.Connection() - @property - def _from(self): - cdef PortInfo port_info = PortInfo() - port_info.impl = self.impl._from - return port_info - - @property - def to(self): - cdef PortInfo port_info = PortInfo() - port_info.impl = self.impl.to - return port_info - - def __eq__(self, other): - return self._from == other._from and self.to == other.to - - def __ne__(self, other): - return self._from != other._from and self.to != other.to - - -def check_constant_data(data): - for k, v in data.items(): - if not all([isinstance(x, type(v[0])) for x in v]): - raise TypeError("Elements of list for key {} have different data types! " - "Please specify list of 'int' or 'float' values.".format(k)) - if isinstance(v, list): - if isinstance(v[0], float): - dtype = np.float32 - elif isinstance(v[0], int): - dtype = np.int32 - else: - raise TypeError("Unsupported precision of the data for key {}! Given {} but 'float or 'int' precision expected". - format(k, str(v.dtype))) - data[k] = np.asanyarray(v, dtype=dtype) - elif isinstance(v, np.ndarray): - pass - else: - raise TypeError("Unsupported data type for key '{}'. {} given but 'list' or 'numpy.ndarray' expected". 
- format(k, type(v))) - return data - - -# TODO: Fix LAyerBuilder object copying - pass by reference -# cdef class LayerConstantData(dict): -# def update(self, other=None, **kwargs): -# if other: -# other = check_constant_data(other) -# cdef vector[size_t] dims -# cdef Blob.Ptr blob_ptr -# cdef BlobBuffer buffer -# for k, v in other.items(): -# if k in self.keys() and (v.shape == self[k].shape and v.dtype == self[k].dtype): -# print("Reuse blob for {}\n".format(k)) -# self[k][:] = v -# else: -# for dim in v.shape: -# dims.push_back(dim) -# ie_precision = np_precision_map.get(str(v.dtype), None) -# if not ie_precision: -# raise BufferError("Unsupported precision of the data for key {}! Given {} but one of the {} precisions expected". -# format(k, str(v.dtype), ", ".join(np_precision_map.keys()))) -# blob_ptr = deref(self.impl).allocateBlob(dims, ie_precision.encode()) -# buffer = BlobBuffer() -# buffer.reset(blob_ptr) -# np_buffer = buffer.to_numpy() -# np_buffer[:] = v -# deref(self.impl).addConstantData(k.encode(), blob_ptr) - -cdef class LayerBuilder: - - def __cinit__(self, type: str=None, name: str=None): - if name and type: - self.impl = C.LayerBuilder(name.encode(), type.encode()) - else: - self.impl = C.LayerBuilder() - - @property - def id(self): - return self.impl.id - @property - def name(self): - return self.impl.getName().decode() - @name.setter - def name(self, name: str): - self.impl.setName(name.encode()) - - @property - def type(self): - return self.impl.getType().decode() - @type.setter - def type(self, type: str): - self.impl.setType(type.encode()) - - @property - def input_ports(self): - cdef Port port - cdef vector[C.Port] c_ports = self.impl.getInputPorts() - py_ports = [] - for p in c_ports: - port = Port() - port.impl = p - py_ports.append(port) - return py_ports - - @input_ports.setter - def input_ports(self, ports: list): - cdef vector[C.Port] c_ports - cdef Port c_port - for p in ports: - c_port = Port(p.shape) - c_ports.push_back(c_port.impl) - self.impl.setInputPorts(c_ports) - - @property - def output_ports(self): - cdef Port port - cdef vector[C.Port] c_ports = self.impl.getOutputPorts() - py_ports = [] - for p in c_ports: - port = Port() - port.impl = p - py_ports.append(port) - return py_ports - - @output_ports.setter - def output_ports(self, ports: list): - cdef vector[C.Port] c_ports - cdef Port c_port - for p in ports: - c_port = Port(p.shape) - c_ports.push_back(c_port.impl) - self.impl.setOutputPorts(c_ports) - - @property - def params(self): - return {k.decode(): v.decode() for k, v in self.impl.getParameters()} - - @params.setter - def params(self, params_map: dict): - cdef map[string, string] c_params_map - for k, v in params_map.items(): - c_params_map[k.encode()] = str(v).encode() - self.impl.setParameters(c_params_map) - - def build(self): - cdef ILayer layer = ILayer() - layer.impl = self.impl.build() - return layer - - @property - def constant_data(self): - cdef map[string, Blob.Ptr] c_constant_data - c_constant_data = self.impl.getConstantData() - constant_data = {} - # TODO: Fix LAyerBuilder object copying - pass by reference - # constant_data = LayerConstantData() - # constant_data.impl = make_shared[C.LayerBuilder](self.impl) - cdef BlobBuffer weights_buffer - for weights in c_constant_data: - weights_buffer = BlobBuffer() - weights_buffer.reset(weights.second) - constant_data[weights.first.decode()] = weights_buffer.to_numpy() - return constant_data - - @constant_data.setter - def constant_data(self, data: dict): - cdef vector[size_t] dims 
- cdef map[string, Blob.Ptr] c_constant_data - cdef Blob.Ptr blob_ptr - cdef BlobBuffer buffer - data = check_constant_data(data) - for k, v in data.items(): - for dim in v.shape: - dims.push_back(dim) - ie_precision = np_precision_map.get(str(v.dtype), None) - if not ie_precision: - raise BufferError("Unsupported precision of the data for key {}! Given {} but one of the {} precisions expected". - format(k, str(v.dtype), ", ".join(np_precision_map.keys()))) - blob_ptr = self.impl.allocateBlob(dims, ie_precision.encode()) - buffer = BlobBuffer() - buffer.reset(blob_ptr) - np_buffer = buffer.to_numpy() - np_buffer[:] = v - c_constant_data[k.encode()] = blob_ptr - - self.impl.setConstantData(c_constant_data) - - # TODO: Implement get\setGraph when will be supported \ No newline at end of file diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl.cpp b/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl.cpp deleted file mode 100644 index fc9ab4e..0000000 --- a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl.cpp +++ /dev/null @@ -1,330 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "dnn_builder_impl.hpp" - -// using namespace InferenceEnginePython; -// using namespace std; - -std::map precision_map = {{"FP32", InferenceEngine::Precision::FP32}, - {"FP16", InferenceEngine::Precision::FP16}, - {"Q78", InferenceEngine::Precision::Q78}, - {"I32", InferenceEngine::Precision::I32}, - {"I16", InferenceEngine::Precision::I16}, - {"I8", InferenceEngine::Precision::I8}, - {"U16", InferenceEngine::Precision::U16}, - {"U8", InferenceEngine::Precision::U8}}; - -InferenceEnginePython::ILayer buildILayer(InferenceEngine::ILayer::CPtr it) { - std::vector in_ports; - std::vector out_ports; - for (const auto &port : it->getInputPorts()) { - in_ports.push_back(InferenceEnginePython::Port(port.shape())); - } - for (const auto &port : it->getOutputPorts()) { - out_ports.push_back(InferenceEnginePython::Port(port.shape())); - } - - std::map params_map; - for (const auto ¶ms : it->getParameters()->getParameters()) { - params_map.emplace(params.first, params.second); - } - std::map data_map; - for (const auto &data : it->getParameters()->getConstantData()) { - data_map.emplace(data.first, std::const_pointer_cast(data.second)); - } - return {it, - it->getName(), - it->getId(), - it->getType(), - params_map, - data_map, - in_ports, - out_ports, - }; -} - -// NetworkBuilder -InferenceEnginePython::NetworkBuilder::NetworkBuilder(const std::string &name) { - // TODO( ): std::move or instance in heap? Please check in other places. 
- InferenceEngine::Builder::Network network(name); - network_ptr = std::make_shared(network); -} - -InferenceEnginePython::NetworkBuilder InferenceEnginePython::NetworkBuilder::from_ie_network( - const InferenceEnginePython::IENetwork &icnn_net) { - InferenceEngine::Builder::Network network((InferenceEngine::ICNNNetwork &) icnn_net.actual); - NetworkBuilder net_builder = NetworkBuilder(); - net_builder.network_ptr = std::make_shared(network); - return net_builder; -} - -InferenceEnginePython::INetwork InferenceEnginePython::NetworkBuilder::build() { - InferenceEngine::INetwork::Ptr i_net = network_ptr->build(); - std::vector layers; - for (const auto &it : *i_net) { - layers.push_back(buildILayer(it)); - } - std::vector inputs; - for (const auto &it : i_net->getInputs()) { - inputs.push_back(buildILayer(it)); - } - std::vector outputs; - for (const auto &it : i_net->getInputs()) { - outputs.push_back(buildILayer(it)); - } - return {i_net, // INetwork ptr - i_net->getName(), // name - i_net->size(), // Number of layers - layers, - inputs, - outputs - }; -} - -std::vector InferenceEnginePython::NetworkBuilder::getLayers() { - std::vector layers; - for (const auto &it : network_ptr->getLayers()) { - LayerBuilder layer; - layer.actual = it; - layer.id = it.getId(); - layers.push_back(layer); - } - return layers; -} - -InferenceEnginePython::LayerBuilder InferenceEnginePython::NetworkBuilder::getLayer(size_t layer_id) { - LayerBuilder layer; - InferenceEngine::Builder::Layer ie_layer = network_ptr->getLayer(layer_id); - layer.actual = ie_layer; - layer.id = ie_layer.getId(); - return layer; -} - -void InferenceEnginePython::NetworkBuilder::removeLayer(const LayerBuilder &layer) { - network_ptr->removeLayer(layer.id); -} - -const std::vector InferenceEnginePython::NetworkBuilder::getLayerConnections( - const LayerBuilder &layer) { - std::vector ie_connections = network_ptr->getLayerConnections(layer.id); - std::vector connections; - for (auto const &it : ie_connections) { - PortInfo input(it.from().layerId(), it.from().portId()); - PortInfo output(it.to().layerId(), it.to().portId()); - connections.push_back(Connection(input, output)); - } - return connections; -} - -void InferenceEnginePython::NetworkBuilder::disconnect(const Connection &connection) { - network_ptr->disconnect(connection.actual); -} - -void InferenceEnginePython::NetworkBuilder::connect(const PortInfo &input, const PortInfo &output) { - network_ptr->connect(input.actual, output.actual); -} - -size_t InferenceEnginePython::NetworkBuilder::addLayer(const LayerBuilder &layer) { - return network_ptr->addLayer(layer.actual); -} - -size_t InferenceEnginePython::NetworkBuilder::addAndConnectLayer(const std::vector &input, - const LayerBuilder &layer) { - std::vector ie_ports; - for (const auto &it : input) { - ie_ports.push_back(it.actual); - } - return network_ptr->addLayer(ie_ports, layer.actual); -} -// NetworkBuilder end -// NetworkBuilder end - -// Port -InferenceEnginePython::Port::Port(const std::vector &shapes) { - actual = InferenceEngine::Port(shapes); - shape = actual.shape(); -} - -InferenceEnginePython::PortInfo::PortInfo(size_t layer_id, size_t port_id) : PortInfo() { - this->actual = InferenceEngine::PortInfo(layer_id, port_id); - this->layer_id = layer_id; - this->port_id = port_id; -} -// Port end - -// INetwork -std::vector InferenceEnginePython::INetwork::getLayerConnections(size_t layer_id) { - std::vector connections; - for (const auto &it : actual->getLayerConnections(layer_id)) { - PortInfo input = 
PortInfo(it.from().layerId(), it.from().portId()); - PortInfo output = PortInfo(it.to().layerId(), it.to().portId()); - connections.push_back(Connection(input, output)); - } - return connections; -} - -InferenceEnginePython::IENetwork InferenceEnginePython::INetwork::to_ie_network() { - std::shared_ptr icnn_net = InferenceEngine::Builder::convertToICNNNetwork(actual); - InferenceEngine::CNNNetwork cnn_net(icnn_net); - IENetwork ie_net = IENetwork(); - ie_net.actual = cnn_net; - ie_net.name = name; - ie_net.batch_size = cnn_net.getBatchSize(); - return ie_net; -} -// INetwork end - -// Connection -InferenceEnginePython::Connection::Connection(PortInfo input, PortInfo output) : Connection() { - this->actual = InferenceEngine::Connection(InferenceEngine::PortInfo(input.layer_id, input.port_id), - InferenceEngine::PortInfo(output.layer_id, output.port_id)); - this->_from = PortInfo(actual.from().layerId(), actual.from().portId()); - this->to = PortInfo(actual.to().layerId(), actual.to().portId()); -} -// Connection end - -// LayerBuilder -InferenceEnginePython::LayerBuilder::LayerBuilder(const std::string &type, const std::string &name) : LayerBuilder() { - InferenceEngine::Builder::Layer layer(type, name); - this->actual = layer; - this->id = layer.getId(); -} - -const std::string &InferenceEnginePython::LayerBuilder::getName() { - return actual.getName(); -} - -const std::string &InferenceEnginePython::LayerBuilder::getType() { - return actual.getType(); -} - -std::vector InferenceEnginePython::LayerBuilder::getInputPorts() { - std::vector ports; - for (const auto &it : actual.getInputPorts()) { - ports.push_back(Port(it.shape())); - } - return ports; -} - -std::vector InferenceEnginePython::LayerBuilder::getOutputPorts() { - std::vector ports; - for (const auto &it : actual.getOutputPorts()) { - ports.push_back(Port(it.shape())); - } - return ports; -} - -std::map InferenceEnginePython::LayerBuilder::getParameters() { - std::map params_map; - for (const auto &it : actual.getParameters()) { - params_map.emplace(it.first, it.second); - } - return params_map; -} - -void InferenceEnginePython::LayerBuilder::setParameters(std::map params_map) { - std::map ie_params_map; - for (const auto &it : params_map) { - InferenceEngine::Parameter ie_param((it.second)); - ie_params_map.emplace(it.first, ie_param); - } - actual = actual.setParameters(ie_params_map); -} - -void InferenceEnginePython::LayerBuilder::setName(const std::string &name) { - actual = actual.setName(name); -} - -void InferenceEnginePython::LayerBuilder::setType(const std::string &type) { - actual = actual.setType(type); -} - -void InferenceEnginePython::LayerBuilder::setInputPorts(const std::vector ports) { - std::vector ie_ports; - for (const auto &it : ports) { - ie_ports.push_back(it.actual); - } - actual = actual.setInputPorts(ie_ports); -} - -void InferenceEnginePython::LayerBuilder::setOutputPorts(const std::vector ports) { - std::vector ie_ports; - for (const auto &it : ports) { - ie_ports.push_back(it.actual); - } - actual = actual.setOutputPorts(ie_ports); -} - -InferenceEnginePython::ILayer InferenceEnginePython::LayerBuilder::build() { - return buildILayer(actual.build()); -} - -std::map InferenceEnginePython::LayerBuilder::getConstantData() { - std::map data_map; - for (const auto &it : actual.getConstantData()) { - data_map.emplace(it.first, std::const_pointer_cast(it.second)); - } - return data_map; -} - -InferenceEngine::Blob::Ptr InferenceEnginePython::LayerBuilder::allocateBlob(std::vector dims, - const std::string 
&precision) { - InferenceEngine::Layout ie_layout; - ie_layout = InferenceEngine::TensorDesc::getLayoutByDims(dims); - InferenceEngine::Precision ie_precision = precision_map.at(precision); - const InferenceEngine::TensorDesc &tdesc = InferenceEngine::TensorDesc(ie_precision, dims, ie_layout); - InferenceEngine::Blob::Ptr blob; - switch (ie_precision) { - case InferenceEngine::Precision::FP32: - blob = InferenceEngine::make_shared_blob(tdesc); - break; - case InferenceEngine::Precision::FP16: - blob = InferenceEngine::make_shared_blob(tdesc); - break; - case InferenceEngine::Precision::I16: - blob = InferenceEngine::make_shared_blob(tdesc); - break; - case InferenceEngine::Precision::U16: - blob = InferenceEngine::make_shared_blob(tdesc); - break; - case InferenceEngine::Precision::U8: - blob = InferenceEngine::make_shared_blob(tdesc); - break; - case InferenceEngine::Precision::I8: - blob = InferenceEngine::make_shared_blob(tdesc); - break; - case InferenceEngine::Precision::I32: - blob = InferenceEngine::make_shared_blob(tdesc); - break; - default: - blob = InferenceEngine::make_shared_blob(tdesc); - break; - } - - blob->allocate(); - return blob; -} - -void InferenceEnginePython::LayerBuilder::setConstantData(const std::map &const_data) { - actual.setConstantData(const_data); -} -// TODO( ): Fix LAyerBuilder object copying - pass by reference -// void LayerBuilder::addConstantData(const std::string & name, InferenceEngine::Blob::Ptr data){ -// InferenceEngine::Blob::CPtr c_data = const_pointer_cast(data); -// actual.addConstantData(name, c_data); -// } - -// LayerBuilder end diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl.hpp b/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl.hpp deleted file mode 100644 index b58994a..0000000 --- a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl.hpp +++ /dev/null @@ -1,161 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include - -#include - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include - - -// namespace IE Python -namespace InferenceEnginePython { -struct LayerBuilder; - -struct Port { - Port() = default; - - explicit Port(const std::vector &shapes); - - InferenceEngine::Port actual; - std::vector shape; -}; - -struct ILayer { - InferenceEngine::ILayer::CPtr layer_ptr; - std::string name; - size_t id; - std::string type; - std::map parameters; - std::map constant_data; - std::vector in_ports; - std::vector out_ports; -}; - -struct PortInfo { - PortInfo(size_t layer_id, size_t port_id); - - PortInfo() : actual(0, 0) {} - - InferenceEngine::PortInfo actual; - size_t layer_id; - size_t port_id; -}; - -struct Connection { - Connection() : actual(InferenceEngine::PortInfo(0), InferenceEngine::PortInfo(0)) {} - - Connection(PortInfo input, PortInfo output); - - InferenceEngine::Connection actual; - PortInfo _from; - PortInfo to; -}; - -struct INetwork { - InferenceEngine::INetwork::Ptr actual; - std::string name; - size_t size; - std::vector layers; - std::vector inputs; - std::vector outputs; - - std::vector getLayerConnections(size_t layer_id); - - IENetwork to_ie_network(); -}; - -struct NetworkBuilder { - InferenceEngine::Builder::Network::Ptr network_ptr; - - explicit NetworkBuilder(const std::string &name); - - NetworkBuilder() = default; - - NetworkBuilder from_ie_network(const InferenceEnginePython::IENetwork &icnn_net); - - INetwork build(); - - std::vector getLayers(); - - LayerBuilder getLayer(size_t layer_id); - - void removeLayer(const LayerBuilder &layer); - - size_t addLayer(const LayerBuilder &layer); - - size_t addAndConnectLayer(const std::vector &input, const LayerBuilder &layer); - - const std::vector getLayerConnections(const LayerBuilder &layer); - - void disconnect(const Connection &connection); - - void connect(const PortInfo &input, const PortInfo &output); -}; - -struct LayerBuilder { - InferenceEngine::Builder::Layer actual; - size_t id; - - LayerBuilder(const std::string &type, const std::string &name); - - LayerBuilder() : actual("", "") {} - - LayerBuilder from_ilayer(const ILayer &ilayer); - - const std::string &getName(); - - void setName(const std::string &name); - - const std::string &getType(); - - void setType(const std::string &type); - - std::vector getInputPorts(); - - void setInputPorts(const std::vector ports); - - std::vector getOutputPorts(); - - void setOutputPorts(const std::vector ports); - - - std::map getParameters(); - - void setParameters(std::map params_map); - - ILayer build(); - - std::map getConstantData(); - - InferenceEngine::Blob::Ptr allocateBlob(std::vector dims, const std::string &precision); - - void setConstantData(const std::map &const_data); - -// TODO( ): Fix LAyerBuilder object copying - pass by reference -// void addConstantData(const std::string & name, InferenceEngine::Blob::Ptr data); -}; -} // namespace InferenceEnginePython diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl_defs.pxd b/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl_defs.pxd deleted file mode 100644 index 29795f2..0000000 --- a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl_defs.pxd +++ /dev/null @@ -1,97 +0,0 @@ -from libcpp.string cimport string -from libcpp.vector cimport vector -from libc.stddef cimport size_t -from libcpp.memory cimport shared_ptr -from 
libcpp.map cimport map -from ..ie_api_impl_defs cimport IENetwork - -cdef extern from "" namespace "InferenceEngine": - ctypedef vector[size_t] SizeVector - - cdef cppclass TensorDesc: - SizeVector& getDims() - const Precision& getPrecision() const - - cdef cppclass Blob: - ctypedef shared_ptr[Blob] Ptr - const TensorDesc& getTensorDesc() const - size_t element_size() const - - cdef cppclass Precision: - const char*name() const - -cdef extern from "dnn_builder_impl.hpp" namespace "InferenceEnginePython": - cdef cppclass ILayer: - const string name - size_t id - string type - map[string, string] parameters - vector[Port] in_ports - vector[Port] out_ports - map[string, Blob.Ptr] constant_data; - - - cdef cppclass INetwork: - string name - size_t size - vector[ILayer] layers - vector[ILayer] inputs - vector[ILayer] outputs - vector[Port] in_ports; - vector[Port] out_ports; - vector[Connection] getLayerConnections(size_t layer_id); - IENetwork to_ie_network(); - - cdef cppclass NetworkBuilder: - NetworkBuilder() except + - NetworkBuilder(string name) except + - NetworkBuilder from_ie_network(IENetwork &icnn_net) except + - INetwork build() except + - vector[LayerBuilder] getLayers() except + - LayerBuilder getLayer(size_t layer_id) except + - void removeLayer(const LayerBuilder& layer) except + - const vector[Connection] getLayerConnections(const LayerBuilder& layer) except + - void disconnect(const Connection& connection) except + - void connect(const PortInfo& input, const PortInfo& output) except + - size_t addLayer(const LayerBuilder& layer) except + - size_t addAndConnectLayer(const vector[PortInfo]& input, const LayerBuilder& layer); - - cdef cppclass Port: - Port() except + - Port(const vector[size_t] & shapes) except + - const vector[size_t] shape - - - cdef cppclass PortInfo: - PortInfo(size_t layer_id, size_t port_id) except + - PortInfo() except + - size_t layer_id - size_t port_id - - cdef cppclass Connection: - Connection(PortInfo input, PortInfo output) except + - Connection() except + - PortInfo _from - PortInfo to - - cdef cppclass LayerBuilder: - LayerBuilder() - LayerBuilder(const string& type, const string& name ) except + - size_t id - LayerBuilder from_ilayer(const ILayer& ilayer) except + - string getName() except + - string getType() except + - vector[Port] getInputPorts() except + - vector[Port] getOutputPorts() except + - map[string, string] getParameters() except + - void setParameters(map[string, string] params_map) except + - void setName(const string & name) except + - void setType(const string & type) except + - void setInputPorts(const vector[Port] ports) except + - void setOutputPorts(const vector[Port] ports) except + - ILayer build() except + - map[string, Blob.Ptr] getConstantData() - void setConstantData(map[string, Blob.Ptr] &const_data) - # TODO: Fix LAyerBuilder object copying - pass by reference - # void addConstantData(const string & name, Blob.Ptr data) - Blob.Ptr allocateBlob(vector[size_t] dims, const string & precision) diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api.pxd b/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api.pxd index 52bb27e..8ee5656 100644 --- a/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api.pxd +++ b/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api.pxd @@ -33,6 +33,7 @@ cdef class IENetwork: cdef class ExecutableNetwork: cdef unique_ptr[C.IEExecNetwork] impl + cdef C.IEPlugin plugin_impl cdef public: _requests, inputs, outputs 
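The `plugin_impl` member added to `ExecutableNetwork` above makes each executable network keep a reference to the `IEPlugin` that produced it, which is why the sample diffs earlier in this patch drop the explicit `del net` / `del exec_net` / `del plugin` cleanup. A minimal usage sketch under that model (the model and image file names are hypothetical placeholders):

```python
import cv2
import numpy as np
from openvino.inference_engine import IENetwork, IEPlugin

plugin = IEPlugin(device="CPU")
net = IENetwork(model="model.xml", weights="model.bin")  # hypothetical paths
input_blob = next(iter(net.inputs))
n, c, h, w = net.inputs[input_blob].shape

image = cv2.imread("cat.bmp")  # hypothetical input image
image = cv2.resize(image, (w, h)).transpose((2, 0, 1))  # HWC -> CHW, BGR order

exec_net = plugin.load(network=net)
result = exec_net.infer({input_blob: image[np.newaxis]})
# No manual `del exec_net` / `del plugin` needed: exec_net keeps the plugin alive.
```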
diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api.pyx b/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api.pyx index 518125e..834f72c 100644 --- a/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api.pyx +++ b/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api.pyx @@ -32,7 +32,7 @@ cdef dict_to_c_map(py_dict): return c_map supported_precisions = ["FP32", "FP16", "Q78", "I32", "I16", "I8", "U32", "U16"] -supported_layouts = ["NCHW", "NHWC", "OIHW", "C", "CHW", "HW", "NC", "CN", "BLOCKED"] +supported_layouts = ["NCHW", "NHWC", "OIHW", "C", "CHW", "HW", "NC", "CN", "BLOCKED", "NCDHW"] known_plugins = ['CPU', 'GPU', 'FPGA', 'MYRIAD', 'HETERO', 'HDDL'] def get_version(): @@ -218,6 +218,10 @@ cdef class InferRequest: outputs[output] = self._get_blob_buffer(output.encode()).to_numpy() return deepcopy(outputs) + @property + def latency(self): + return self.impl.exec_time + def set_batch(self, size): if size <= 0: raise ValueError("Batch size should be positive integer number but {} specified".format(size)) @@ -225,6 +229,7 @@ cdef class InferRequest: def _fill_inputs(self, inputs): for k, v in inputs.items(): + assert k in self._inputs_list, "No input with name {} found in network".format(k) self.inputs[k][:] = v @@ -357,6 +362,7 @@ cdef class IENetwork: cdef vector[size_t] c_shape net_inputs = self.inputs for input, shape in input_shapes.items(): + c_shape = [] if input not in net_inputs: raise AttributeError("Specified {} layer not in network inputs {}! ".format(input, net_inputs)) for v in shape: @@ -396,7 +402,7 @@ cdef class IEPlugin: if config: for k, v in config.items(): c_config[to_std_string(k)] = to_std_string(v) - + exec_net.plugin_impl = self.impl exec_net.impl = move(self.impl.load(network.impl, num_requests, c_config)) exec_net.inputs = network.inputs.keys() exec_net.outputs = list(network.outputs.keys()) diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl.cpp b/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl.cpp index 296b1bf..1bb3e90 100644 --- a/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl.cpp +++ b/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
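In the ie_api.pyx hunk above, resetting `c_shape = []` on every loop iteration matters when several inputs are reshaped in one call: without it, dimensions left over from the previous input would accumulate into the next one. A hedged sketch of the affected call (the `reshape` method name and the input names are assumptions based on the surrounding hunk):

```python
from openvino.inference_engine import IENetwork

net = IENetwork(model="model.xml", weights="model.bin")  # hypothetical paths
# Each entry now starts from an empty c_shape, so multi-input reshape is safe.
net.reshape({"data": (1, 3, 600, 600), "im_info": (1, 3)})  # hypothetical names/shapes
```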
@@ -35,6 +35,7 @@ std::map layout_map = {{"ANY", Inferen {"HW", InferenceEngine::Layout::HW}, {"NC", InferenceEngine::Layout::NC}, {"CN", InferenceEngine::Layout::CN}, + {"NCDHW", InferenceEngine::Layout::NCDHW}, {"BLOCKED", InferenceEngine::Layout::BLOCKED}}; #define stringify(name) # name #define IE_CHECK_CALL(expr) { \ @@ -301,7 +302,6 @@ InferenceEnginePython::IEPlugin::load(const InferenceEnginePython::IENetwork &ne InferenceEngine::ResponseDesc response; auto exec_network = InferenceEnginePython::make_unique(net.name, num_requests); - IE_CHECK_CALL(actual->LoadNetwork(exec_network->actual, net.actual, config, &response)) for (size_t i = 0; i < num_requests; ++i) { @@ -322,9 +322,8 @@ InferenceEnginePython::IEExecNetwork::IEExecNetwork(const std::string &name, siz } void InferenceEnginePython::IEExecNetwork::infer() { - InferenceEngine::ResponseDesc response; InferRequestWrap &request = infer_requests[0]; - request.request_ptr->Infer(&response); + request.infer(); } @@ -340,13 +339,33 @@ void InferenceEnginePython::InferRequestWrap::setBatch(int size) { IE_CHECK_CALL(request_ptr->SetBatch(size, &response)); } +void latency_callback(InferenceEngine::IInferRequest::Ptr request, InferenceEngine::StatusCode code){ + if (code != InferenceEngine::StatusCode::OK) { + THROW_IE_EXCEPTION << "Async Infer Request failed with status code " << code; + } + InferenceEnginePython::InferRequestWrap *requestWrap; + InferenceEngine::ResponseDesc dsc; + request->GetUserData(reinterpret_cast(&requestWrap), &dsc); + auto end_time = Time::now(); + auto execTime = std::chrono::duration_cast(end_time - requestWrap->start_time); + requestWrap->exec_time = static_cast(execTime.count()) * 0.000001; +} + void InferenceEnginePython::InferRequestWrap::infer() { InferenceEngine::ResponseDesc response; + start_time = Time::now(); IE_CHECK_CALL(request_ptr->Infer(&response)); + auto end_time = Time::now(); + auto execTime = std::chrono::duration_cast(end_time - start_time); + exec_time = static_cast(execTime.count()) * 0.000001; } + void InferenceEnginePython::InferRequestWrap::infer_async() { InferenceEngine::ResponseDesc response; + start_time = Time::now(); + IE_CHECK_CALL(request_ptr->SetUserData(this, &response)); + request_ptr->SetCompletionCallback(latency_callback); IE_CHECK_CALL(request_ptr->StartAsync(&response)); } diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl.hpp b/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl.hpp index 7bb2dd3..9297de6 100644 --- a/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl.hpp +++ b/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
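The timing scheme behind the new latency property: infer() brackets the call with Time::now() and converts the nanosecond count to milliseconds by multiplying by 0.000001, while infer_async() stashes "this" via SetUserData() and lets the completion callback compute the same delta. A minimal sketch of the synchronous path, where run_inference() is a placeholder for the actual request_ptr->Infer() call:

    #include <chrono>
    #include <iostream>

    typedef std::chrono::high_resolution_clock Time;
    typedef std::chrono::nanoseconds ns;

    void run_inference() { /* placeholder for request_ptr->Infer(&response) */ }

    int main() {
        Time::time_point start_time = Time::now();   // like InferRequestWrap::start_time
        run_inference();
        auto end_time = Time::now();
        auto execTime = std::chrono::duration_cast<ns>(end_time - start_time);
        double exec_time_ms = static_cast<double>(execTime.count()) * 0.000001;
        std::cout << "latency: " << exec_time_ms << " ms\n";
        return 0;
    }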
@@ -23,11 +23,16 @@ #include #include + #include #include #include -#include +#include +#include "inference_engine.hpp" + +typedef std::chrono::high_resolution_clock Time; +typedef std::chrono::nanoseconds ns; namespace InferenceEnginePython { struct IENetLayer { @@ -111,7 +116,8 @@ struct IENetwork { struct InferRequestWrap { InferenceEngine::IInferRequest::Ptr request_ptr; - + Time::time_point start_time; + double exec_time; void infer(); void infer_async(); diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl_defs.pxd b/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl_defs.pxd index 78f2a62..f5729b6 100644 --- a/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl_defs.pxd +++ b/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl_defs.pxd @@ -45,14 +45,14 @@ cdef extern from "ie_api_impl.hpp" namespace "InferenceEnginePython": vector[size_t] dims string precision string layout - void setPrecision(string precision) - void setLayout(string layout) + void setPrecision(string precision) except + + void setLayout(string layout) except + cdef cppclass OutputInfo: vector[size_t] dims string precision string layout - void setPrecision(string precision) + void setPrecision(string precision) except + cdef cppclass ProfileInfo: string status @@ -100,7 +100,8 @@ cdef extern from "ie_api_impl.hpp" namespace "InferenceEnginePython": string version cdef cppclass InferRequestWrap: - void getBlobPtr(const string &blob_name, Blob.Ptr &blob_ptr) + double exec_time; + void getBlobPtr(const string &blob_name, Blob.Ptr &blob_ptr) except + map[string, ProfileInfo] getPerformanceCounts() except + void infer() except + void infer_async() except + diff --git a/inference-engine/include/builders/ie_argmax_layer.hpp b/inference-engine/include/builders/ie_argmax_layer.hpp index 9ac1b5d..f5a042a 100644 --- a/inference-engine/include/builders/ie_argmax_layer.hpp +++ b/inference-engine/include/builders/ie_argmax_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for ArgMax layer */ -class INFERENCE_ENGINE_API_CLASS(ArgMaxLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(ArgMaxLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit ArgMaxLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit ArgMaxLayer(Layer& genLayer); + explicit ArgMaxLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit ArgMaxLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_batch_normalization_layer.hpp b/inference-engine/include/builders/ie_batch_normalization_layer.hpp index dbdf538..14d0fe2 100644 --- a/inference-engine/include/builders/ie_batch_normalization_layer.hpp +++ b/inference-engine/include/builders/ie_batch_normalization_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright 
(C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for BatchNormalization layer */ -class INFERENCE_ENGINE_API_CLASS(BatchNormalizationLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(BatchNormalizationLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit BatchNormalizationLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit BatchNormalizationLayer(Layer& genLayer); + explicit BatchNormalizationLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit BatchNormalizationLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name @@ -46,19 +51,6 @@ public: BatchNormalizationLayer& setPort(const Port &port); /** - * @brief Sets weights for layer - * @param weights Constant blob with weights - * @return reference to layer builder - */ - BatchNormalizationLayer& setWeights(const Blob::CPtr& weights); - /** - * @brief Sets biases for layer - * @param biases Constant blob with biases - * @return reference to layer builder - */ - BatchNormalizationLayer& setBiases(const Blob::CPtr& biases); - - /** * @brief Returns epsilon * @return Epsilon */ @@ -69,12 +61,6 @@ public: * @return reference to layer builder */ BatchNormalizationLayer& setEpsilon(float eps); - - /** - * @brief Validates layer before creation - * @param layer generic layer builder - */ - static void validate(const Layer& layer); }; } // namespace Builder diff --git a/inference-engine/include/builders/ie_clamp_layer.hpp b/inference-engine/include/builders/ie_clamp_layer.hpp index a575962..642ff7a 100644 --- a/inference-engine/include/builders/ie_clamp_layer.hpp +++ b/inference-engine/include/builders/ie_clamp_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for Clamp layer */ -class INFERENCE_ENGINE_API_CLASS(ClampLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(ClampLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit ClampLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit ClampLayer(Layer& genLayer); + explicit ClampLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit ClampLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_concat_layer.hpp b/inference-engine/include/builders/ie_concat_layer.hpp index 96cd23b..b138d3a 100644 --- a/inference-engine/include/builders/ie_concat_layer.hpp +++ 
b/inference-engine/include/builders/ie_concat_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include #include @@ -15,7 +15,7 @@ namespace Builder { /** * @brief The class represents a builder for Concat layer */ -class INFERENCE_ENGINE_API_CLASS(ConcatLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(ConcatLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -24,9 +24,14 @@ public: explicit ConcatLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit ConcatLayer(Layer& genLayer); + explicit ConcatLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit ConcatLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_const_layer.hpp b/inference-engine/include/builders/ie_const_layer.hpp index db0b31a..54e7069 100644 --- a/inference-engine/include/builders/ie_const_layer.hpp +++ b/inference-engine/include/builders/ie_const_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for Const layer */ -class INFERENCE_ENGINE_API_CLASS(ConstLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(ConstLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit ConstLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit ConstLayer(Layer& genLayer); + explicit ConstLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit ConstLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name @@ -51,6 +56,12 @@ public: * @return reference to layer builder */ ConstLayer& setData(const Blob::CPtr& data); + + /** + * @brief Returns constant data + * @return constant blob with data + */ + const Blob::CPtr& getData() const; }; } // namespace Builder diff --git a/inference-engine/include/builders/ie_convolution_layer.hpp b/inference-engine/include/builders/ie_convolution_layer.hpp index a577d5e..68caf99 100644 --- a/inference-engine/include/builders/ie_convolution_layer.hpp +++ b/inference-engine/include/builders/ie_convolution_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include #include @@ -15,7 +15,7 @@ namespace Builder { /** * @brief The class represents a builder for Convolution layer */ -class INFERENCE_ENGINE_API_CLASS(ConvolutionLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(ConvolutionLayer): public LayerDecorator
{ public: /** * @brief The constructor creates a builder with the name @@ -24,14 +24,14 @@ public: explicit ConvolutionLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit ConvolutionLayer(Layer& genLayer); + explicit ConvolutionLayer(const Layer::Ptr& layer); /** - * @brief Operator creates generic layer builder - * @return Generic layer builder + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder */ - operator Layer() const override; + explicit ConvolutionLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name @@ -40,19 +40,6 @@ public: ConvolutionLayer& setName(const std::string& name); /** - * @brief Sets weights for layer - * @param weights Constant blob with weights - * @return reference to layer builder - */ - ConvolutionLayer& setWeights(const Blob::CPtr& weights); - /** - * @brief Sets biases for layer - * @param biases Constant blob with biases - * @return reference to layer builder - */ - ConvolutionLayer& setBiases(const Blob::CPtr& biases); - - /** * @brief Returns input port * @return Input port */ @@ -151,12 +138,6 @@ public: * @return reference to layer builder */ ConvolutionLayer& setOutDepth(size_t outDepth); - - /** - * @brief Validates layer before creation - * @param layer generic layer builder - */ - static void validate(const Layer& layer); }; } // namespace Builder diff --git a/inference-engine/include/builders/ie_crop_layer.hpp b/inference-engine/include/builders/ie_crop_layer.hpp index 7bfbe94..275c1d2 100644 --- a/inference-engine/include/builders/ie_crop_layer.hpp +++ b/inference-engine/include/builders/ie_crop_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include #include @@ -15,7 +15,7 @@ namespace Builder { /** * @brief The class represents a builder for Crop layer */ -class INFERENCE_ENGINE_API_CLASS(CropLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(CropLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -24,9 +24,14 @@ public: explicit CropLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit CropLayer(Layer& genLayer); + explicit CropLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit CropLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name @@ -78,12 +83,6 @@ public: * @return reference to layer builder */ CropLayer& setOffset(const std::vector& offsets); - - /** - * @brief Validates layer before creation - * @param layer generic layer builder - */ - static void validate(const Layer& layer); }; } // namespace Builder diff --git a/inference-engine/include/builders/ie_ctc_greedy_decoder_layer.hpp b/inference-engine/include/builders/ie_ctc_greedy_decoder_layer.hpp index 78cdbd3..388bd05 100644 --- a/inference-engine/include/builders/ie_ctc_greedy_decoder_layer.hpp +++ b/inference-engine/include/builders/ie_ctc_greedy_decoder_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 
2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include #include @@ -15,7 +15,7 @@ namespace Builder { /** * @brief The class represents a builder for CTCGreedyDecoder layer */ -class INFERENCE_ENGINE_API_CLASS(CTCGreedyDecoderLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(CTCGreedyDecoderLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -24,9 +24,14 @@ public: explicit CTCGreedyDecoderLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit CTCGreedyDecoderLayer(Layer& genLayer); + explicit CTCGreedyDecoderLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit CTCGreedyDecoderLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_deconvolution_layer.hpp b/inference-engine/include/builders/ie_deconvolution_layer.hpp index c8d3925..a1cdfde 100644 --- a/inference-engine/include/builders/ie_deconvolution_layer.hpp +++ b/inference-engine/include/builders/ie_deconvolution_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once #include -#include +#include #include namespace InferenceEngine { @@ -23,9 +23,14 @@ public: explicit DeconvolutionLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit DeconvolutionLayer(Layer& genLayer); + explicit DeconvolutionLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit DeconvolutionLayer(const Layer::CPtr& layer); }; } // namespace Builder diff --git a/inference-engine/include/builders/ie_detection_output_layer.hpp b/inference-engine/include/builders/ie_detection_output_layer.hpp index e4ee542..c15c4f0 100644 --- a/inference-engine/include/builders/ie_detection_output_layer.hpp +++ b/inference-engine/include/builders/ie_detection_output_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include #include @@ -15,7 +15,7 @@ namespace Builder { /** * @brief The class represents a builder for DetectionOutput layer */ -class INFERENCE_ENGINE_API_CLASS(DetectionOutputLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(DetectionOutputLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -24,9 +24,14 @@ public: explicit DetectionOutputLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit DetectionOutputLayer(Layer& genLayer); + explicit DetectionOutputLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + 
explicit DetectionOutputLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_eltwise_layer.hpp b/inference-engine/include/builders/ie_eltwise_layer.hpp index ffdacba..370cd68 100644 --- a/inference-engine/include/builders/ie_eltwise_layer.hpp +++ b/inference-engine/include/builders/ie_eltwise_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include #include @@ -15,7 +15,7 @@ namespace Builder { /** * @brief The class represents a builder for Eltwise layer */ -class INFERENCE_ENGINE_API_CLASS(EltwiseLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(EltwiseLayer): public LayerDecorator { public: /** * @brief The enum defines all Eltwise types @@ -23,7 +23,11 @@ public: enum EltwiseType { SUM = 1, MAX, - MUL + MUL, + SUB, + DIV, + MIN, + SQUARED_DIFF }; /** @@ -33,9 +37,14 @@ public: explicit EltwiseLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit EltwiseLayer(Layer& genLayer); + explicit EltwiseLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit EltwiseLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_elu_layer.hpp b/inference-engine/include/builders/ie_elu_layer.hpp index ad5b3b4..eb62a9e 100644 --- a/inference-engine/include/builders/ie_elu_layer.hpp +++ b/inference-engine/include/builders/ie_elu_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for ELU layer */ -class INFERENCE_ENGINE_API_CLASS(ELULayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(ELULayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit ELULayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit ELULayer(Layer& genLayer); + explicit ELULayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit ELULayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_fully_connected_layer.hpp b/inference-engine/include/builders/ie_fully_connected_layer.hpp index 9b03f7d..f0a448a 100644 --- a/inference-engine/include/builders/ie_fully_connected_layer.hpp +++ b/inference-engine/include/builders/ie_fully_connected_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * 
@brief The class represents a builder for FullyConnected layer */ -class INFERENCE_ENGINE_API_CLASS(FullyConnectedLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(FullyConnectedLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit FullyConnectedLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit FullyConnectedLayer(Layer& genLayer); + explicit FullyConnectedLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit FullyConnectedLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name @@ -34,19 +39,6 @@ public: FullyConnectedLayer& setName(const std::string& name); /** - * @brief Sets weights for layer - * @param weights Constant blob with weights - * @return reference to layer builder - */ - FullyConnectedLayer& setWeights(const Blob::CPtr& weights); - /** - * @brief Sets biases for layer - * @param biases Constant blob with biases - * @return reference to layer builder - */ - FullyConnectedLayer& setBiases(const Blob::CPtr& biases); - - /** * @brief Returns input port * @return Input port */ diff --git a/inference-engine/include/builders/ie_grn_layer.hpp b/inference-engine/include/builders/ie_grn_layer.hpp index f06f903..e544ab6 100644 --- a/inference-engine/include/builders/ie_grn_layer.hpp +++ b/inference-engine/include/builders/ie_grn_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for GRN layer */ -class INFERENCE_ENGINE_API_CLASS(GRNLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(GRNLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit GRNLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit GRNLayer(Layer& genLayer); + explicit GRNLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit GRNLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name
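Taken together, the headers above move every concrete layer builder from LayerFragment to LayerDecorator and pass layers as Layer::Ptr/Layer::CPtr; the EltwiseLayer hunk further extends EltwiseType with SUB, DIV, MIN and SQUARED_DIFF. A hedged usage sketch of this builder API follows. It assumes a Network(name) constructor, addLayer/connect signatures matching those shown elsewhere in this patch, and an EltwiseLayer::setEltwiseType() setter that this excerpt does not show; treat it as an illustration under those assumptions, not a verified example:

    #include <builders/ie_network_builder.hpp>
    #include <builders/ie_input_layer.hpp>
    #include <builders/ie_eltwise_layer.hpp>

    void build_elementwise_sum() {
        InferenceEngine::Builder::Network net("sum_net");

        // InputLayer::setPort and the Port(shape-vector) constructor are shown above
        size_t a = net.addLayer(InferenceEngine::Builder::InputLayer("a")
                                    .setPort(InferenceEngine::Port({1, 3})));
        size_t b = net.addLayer(InferenceEngine::Builder::InputLayer("b")
                                    .setPort(InferenceEngine::Port({1, 3})));
        size_t sum = net.addLayer(InferenceEngine::Builder::EltwiseLayer("sum")
                                      .setEltwiseType(
                                          InferenceEngine::Builder::EltwiseLayer::SUM));

        net.connect({a, 0}, {sum, 0});  // PortInfo is (layer id, port id)
        net.connect({b, 0}, {sum, 1});

        const InferenceEngine::INetwork::CPtr graph = net.build();
        (void)graph;  // in real code this would feed convertToICNNNetwork(graph)
    }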
name = ""); + /** + * @brief The constructor creates a builder from generic builder + * @param layer pointer to generic builder + */ + explicit GRUSequenceLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit GRUSequenceLayer(const Layer::CPtr& layer); + /** + * @brief Sets the name for the layer + * @param name Layer name + * @return reference to layer builder + */ + GRUSequenceLayer& setName(const std::string& name); + + /** + * @brief Returns input ports with shapes for the layer + * @return Vector of ports + */ + const std::vector& getInputPorts() const; + /** + * @brief Sets input ports for the layer + * @param ports vector of input ports + * @return reference to layer builder + */ + GRUSequenceLayer& setInputPorts(const std::vector& ports); + + /** + * @brief Returns output ports with shapes for the layer + * @return Vector of ports + */ + const std::vector& getOutputPorts() const; + /** + * @brief Sets output ports for the layer + * @param ports vector of output ports + * @return reference to layer builder + */ + GRUSequenceLayer& setOutputPorts(const std::vector& ports); + + int getHiddenSize() const; + GRUSequenceLayer& setHiddenSize(int size); + bool getSequenceDim() const; + GRUSequenceLayer& setSqquenceDim(bool flag); + const std::vector& getActivations() const; + GRUSequenceLayer& setActivations(const std::vector& activations); + const std::vector& getActivationsAlpha() const; + GRUSequenceLayer& setActivationsAlpha(const std::vector& activations); + const std::vector& getActivationsBeta() const; + GRUSequenceLayer& setActivationsBeta(const std::vector& activations); + float getClip() const; + GRUSequenceLayer& setClip(float clip); + bool getLinearBeforeReset() const; + GRUSequenceLayer& setLinearBeforeReset(bool flag); + const std::string& getDirection() const; + GRUSequenceLayer& setDirection(const std::string& direction); +}; + +} // namespace Builder +} // namespace InferenceEngine + + diff --git a/inference-engine/include/builders/ie_input_layer.hpp b/inference-engine/include/builders/ie_input_layer.hpp index 5312fcd..f9a436f 100644 --- a/inference-engine/include/builders/ie_input_layer.hpp +++ b/inference-engine/include/builders/ie_input_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for Input layer */ -class INFERENCE_ENGINE_API_CLASS(InputLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(InputLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit InputLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit InputLayer(Layer& genLayer); + explicit InputLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit InputLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name @@ -44,12 +49,6 @@ public: * @return reference to layer builder */ InputLayer& setPort(const Port &port); - - /** - * 
@brief Validates layer before creation - * @param layer generic layer builder - */ - static void validate(const Layer& layer); }; } // namespace Builder diff --git a/inference-engine/include/builders/ie_layer_builder.hpp b/inference-engine/include/builders/ie_layer_builder.hpp index 47620fa..9e4038d 100644 --- a/inference-engine/include/builders/ie_layer_builder.hpp +++ b/inference-engine/include/builders/ie_layer_builder.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -6,7 +6,7 @@ #include
#include -#include +#include #include #include #include @@ -25,26 +25,31 @@ struct ValidatorsHolder { /** * @brief Caseless map connects type with validator */ - details::caseless_map> validators; + details::caseless_map&, bool)>> validators; }; /** * @brief This class implements a builder for IE Layer */ -class INFERENCE_ENGINE_API_CLASS(Layer) { +class INFERENCE_ENGINE_API_CLASS(Layer): public ILayer, + public std::enable_shared_from_this { public: /** + * @brief A shared pointer to the Layer builder + */ + using Ptr = std::shared_ptr; + /** + * @brief A shared pointer to the constant Layer builder + */ + using CPtr = std::shared_ptr; + + /** * @brief The constructor creates a Layer builder with layer type and layer name * @param type Layer type * @param name Layer name */ explicit Layer(const std::string& type, const std::string& name = ""); /** - * @brief The constructor creates a Layer builder from shared pointer to ILayer - * @param layer shared pointer to ILayer - */ - explicit Layer(const ILayer::Ptr& layer); - /** * @brief The constructor creates a Layer builder from shared pointer to constant ILayer * @param layer shared pointer to constant ILayer */ @@ -57,38 +62,25 @@ public: Layer(idx_t id, const Layer& layer); /** - * @brief Returns layer builder ID - * @return ID + * @brief Compares the given Layer builder with the current one + * @param rhs Layer builder to compare with + * @return true if the given Layer builder is equal to the current one, false - otherwise */ - idx_t getId() const; + bool operator==(const Layer& rhs) const { + return params == rhs.params; + } /** - * @brief Returns a reference to layer type - * @return Layer type - */ - std::string& getType(); - /** - * @brief Returns a reference to constant layer type - * @return constant layer type + * @brief Returns layer ID + * @return Layer ID */ - const std::string& getType() const; - /** - * @brief Sets layer type - * @param type Layer type - * @return Reference to Layer builder - */ - Layer& setType(const std::string& type); + idx_t getId() const noexcept override; /** - * @brief Returns a reference to layer name + * @brief Returns a constant reference to layer name * @return Layer name */ - std::string& getName(); - /** - * @brief Returns a reference to constant layer name - * @return constant layer name - */ - const std::string& getName() const; + const std::string& getName() const noexcept override; /** * @brief Sets layer name * @param name Layer name @@ -97,32 +89,27 @@ public: Layer& setName(const std::string& name); /** - * @brief Returns layer subgraph - * @return shared pointer to INetwork - */ - INetwork::Ptr& getGraph(); - /** - * @brief Returns constant layer subgraph - * @return constant shared pointer to INetwork + * @brief Returns a constant reference to layer type + * @return Layer type */ - const INetwork::Ptr& getGraph() const; + const std::string& getType() const noexcept override; /** - * @brief Sets layer subgraph - * @param graph constant shared pointer to INetwork + * @brief Sets layer type + * @param type Layer type * @return Reference to Layer builder */ - Layer& setGraph(const INetwork::Ptr& graph); + Layer& setType(const std::string& type); /** * @brief Returns map of parameters * @return map of parameters */ - std::map& getParameters(); + const std::map& getParameters() const noexcept override; /** - * @brief Returns constant map of parameters - * @return constant map of parameters + * @brief Returns map of parameters + * @return map of parameters */ - const std::map& getParameters() 
const; + std::map& getParameters(); /** * @brief Sets parameters for layer * @param params constant map of parameters @@ -131,46 +118,16 @@ public: Layer& setParameters(const std::map& params); /** - * @brief Returns map of internal blobs - * @return map of internal blobs - */ - std::map& getConstantData(); - /** - * @brief Returns constant map of internal blobs - * @return constant map of internal blobs - */ - const std::map& getConstantData() const; - /** - * @brief Sets constant data for layer - * @param constData constant map of shared pointers to blobs - * @return Reference to Layer builder - */ - Layer& setConstantData(const std::map& constData); - /** - * @brief Sets constant data for layer - * @param constData constant map of shared pointers to constant blobs - * @return Reference to Layer builder - */ - Layer& setConstantData(const std::map& constData); - /** - * @brief Adds constant data for layer by name - * @param name Name of constant data - * @param data shared pointer to constant blob - * @return Reference to Layer builder + * @brief Returns vector of input ports + * @return Vector of input ports */ - Layer& addConstantData(const std::string& name, const Blob::CPtr& data); - + const std::vector& getInputPorts() const noexcept override; /** * @brief Returns vector of input ports * @return Vector of input ports */ std::vector& getInputPorts(); /** - * @brief Returns constant vector of input ports - * @return constant vector of input ports - */ - const std::vector& getInputPorts() const; - /** * @brief Sets input ports * @param ports vector of ports * @return Reference to Layer builder @@ -181,12 +138,12 @@ public: * @brief Returns vector of output ports * @return Vector of output ports */ - std::vector& getOutputPorts(); + const std::vector& getOutputPorts() const noexcept override; /** - * @brief Returns constant vector of output ports - * @return constant vector of output ports + * @brief Returns vector of output ports + * @return Vector of output ports */ - const std::vector& getOutputPorts() const; + std::vector& getOutputPorts(); /** * @brief Sets output ports * @param ports vector of ports @@ -198,30 +155,27 @@ public: * @brief Validates the current builder and generates ILayer object * @return constant shared pointer to ILayer */ - const ILayer::Ptr build() const; + const ILayer::CPtr build() const; /** * @brief Validates layer builder */ - void validate() const; + void validate(bool partial = false) const; /** * @brief Registers a new validator for type * @param type Layer type * @param validator Layer validator */ - static void addValidator(const std::string& type, const std::function& validator); + static void addValidator(const std::string& type, const std::function& validator); private: idx_t id; std::string type; std::string name; - INetwork::Ptr graph; std::vector inPorts; std::vector outPorts; std::map params; - std::map constData; - static std::shared_ptr getValidatorsHolder(); }; @@ -235,7 +189,7 @@ public: * @param type Layer type * @param validator Layer validator */ - explicit ValidatorRegisterBase(const std::string& type, const std::function& validator) { + explicit ValidatorRegisterBase(const std::string& type, const std::function& validator) { InferenceEngine::Builder::Layer::addValidator(type, validator); } }; diff --git a/inference-engine/include/builders/ie_layer_fragment.hpp b/inference-engine/include/builders/ie_layer_decorator.hpp similarity index 50% rename from inference-engine/include/builders/ie_layer_fragment.hpp rename to 
inference-engine/include/builders/ie_layer_decorator.hpp index a9723b3..c3b9c34 100644 --- a/inference-engine/include/builders/ie_layer_fragment.hpp +++ b/inference-engine/include/builders/ie_layer_decorator.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -14,36 +14,41 @@ namespace Builder { /** * @brief This class defines the basic functional for layer builders */ -class INFERENCE_ENGINE_API_CLASS(LayerFragment) { +class INFERENCE_ENGINE_API_CLASS(LayerDecorator) { public: /** * @brief The constructor creates layer builders with layer type and layer name * @param type Layer type * @param name Layer name */ - LayerFragment(const std::string& type, const std::string& name); + LayerDecorator(const std::string& type, const std::string& name); /** * @brief The constructor creates layer builders from reference to generic layer builder - * @param genLayer Generic layer builder + * @param layer pointer to generic layer builder */ - explicit LayerFragment(Layer& genLayer); + explicit LayerDecorator(const Layer::Ptr& layer); + /** + * @brief The constructor creates layer builders from reference to generic layer builder + * @param layer constant pointer to generic layer builder + */ + explicit LayerDecorator(const Layer::CPtr& layer); /** * @brief The copy constructor * @param rval Source builder */ - explicit LayerFragment(const LayerFragment& rval); + LayerDecorator(const LayerDecorator& rval); /** - * @brief Copy operator for LayerFragment + * @brief Copy operator for LayerDecorator * @param rval * @return Layer builder */ - LayerFragment& operator=(const LayerFragment& rval); + LayerDecorator& operator=(const LayerDecorator& rval); /** * @brief Virtual destructor */ - virtual ~LayerFragment() = default; + virtual ~LayerDecorator() = default; /** * @brief The operator creates generic builder @@ -52,6 +57,18 @@ public: virtual operator Layer() const; /** + * @brief The operator creates generic builder + * @return Pointer to generic builder + */ + virtual operator Layer::Ptr(); + + /** + * @brief The operator creates generic builder + * @return Constant pointer to generic builder + */ + virtual operator Layer::CPtr() const; + + /** * @brief Returns layer type * @return Layer type */ @@ -63,12 +80,14 @@ public: const std::string& getName() const; protected: - const std::vector uInts2size_t(const std::vector& vector) const; - Layer& getLayer() const; + Layer::Ptr& getLayer(); + const Layer::CPtr getLayer() const; + void checkType(const std::string& type) const; + + Layer::CPtr cLayer; private: - Layer layer; - Layer& refLayer; + Layer::Ptr layer; }; } // namespace Builder diff --git a/inference-engine/include/builders/ie_lrn_layer.hpp b/inference-engine/include/builders/ie_lrn_layer.hpp new file mode 100644 index 0000000..625de12 --- /dev/null +++ b/inference-engine/include/builders/ie_lrn_layer.hpp @@ -0,0 +1,99 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +namespace InferenceEngine { +namespace Builder { + +/** + * @brief The class represents a builder for LRN layer + */ +class INFERENCE_ENGINE_API_CLASS(LRNLayer): public LayerDecorator { +public: + /** + * @brief The constructor creates a builder with the name + * @param name Layer name + */ + explicit LRNLayer(const std::string& name = ""); + /** + * @brief The constructor creates a builder from generic builder + * @param layer pointer to generic 
builder + */ + explicit LRNLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit LRNLayer(const Layer::CPtr& layer); + /** + * @brief Sets the name for the layer + * @param name Layer name + * @return reference to layer builder + */ + LRNLayer& setName(const std::string& name); + + /** + * @brief Returns port with shapes for the layer + * @return Port with shapes + */ + const Port& getPort() const; + /** + * @brief Sets port shapes for the layer + * @param port Port with shapes + * @return reference to layer builder + */ + LRNLayer& setPort(const Port& port); + /** + * @brief Returns side length of the region + * @return Size + */ + size_t getSize() const; + /** + * @brief Sets side length of the region + * @param size Size + * @return reference to layer builder + */ + LRNLayer& setSize(size_t size); + /** + * @brief Returns scaling parameter for the normalizing sum + * @return Scaling parameter + */ + float getAlpha() const; + /** + * @brief Sets scaling parameter for the normalizing sum + * @param alpha Scaling parameter + * @return reference to layer builder + */ + LRNLayer& setAlpha(float alpha); + /** + * @brief Returns exponent for the normalizing sum + * @return Exponent + */ + float getBeta() const; + /** + * @brief Sets exponent for the normalizing sum + * @param beta Exponent + * @return reference to layer builder + */ + LRNLayer& setBeta(float beta); + /** + * @brief Returns bias for the normalizing sum + * @return Bias + */ + float getBias() const; + /** + * @brief Sets bias for the normalizing sum + * @param bias Bias + * @return reference to layer builder + */ + LRNLayer& setBias(float bias); +}; + +} // namespace Builder +} // namespace InferenceEngine diff --git a/inference-engine/include/builders/ie_lstm_sequence_layer.hpp b/inference-engine/include/builders/ie_lstm_sequence_layer.hpp new file mode 100644 index 0000000..1d01f58 --- /dev/null +++ b/inference-engine/include/builders/ie_lstm_sequence_layer.hpp @@ -0,0 +1,87 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +namespace InferenceEngine { +namespace Builder { + +/** + * @brief The class represents a builder for LSTMSequence layer + */ +class INFERENCE_ENGINE_API_CLASS(LSTMSequenceLayer): public LayerDecorator { +public: + /** + * @brief The constructor creates a builder with the name + * @param name Layer name + */ + explicit LSTMSequenceLayer(const std::string& name = ""); + /** + * @brief The constructor creates a builder from generic builder + * @param layer pointer to generic builder + */ + explicit LSTMSequenceLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit LSTMSequenceLayer(const Layer::CPtr& layer); + /** + * @brief Sets the name for the layer + * @param name Layer name + * @return reference to layer builder + */ + LSTMSequenceLayer& setName(const std::string& name); + + /** + * @brief Returns input ports with shapes for the layer + * @return Vector of ports + */ + const std::vector& getInputPorts() const; + /** + * @brief Sets input ports for the layer + * @param ports vector of input ports + * @return reference to layer builder + */ + LSTMSequenceLayer& setInputPorts(const std::vector& ports); + + /** + * @brief Returns output
ports with shapes for the layer + * @return Vector of ports + */ + const std::vector& getOutputPorts() const; + /** + * @brief Sets output ports for the layer + * @param ports vector of output ports + * @return reference to layer builder + */ + LSTMSequenceLayer& setOutputPorts(const std::vector& ports); + + int getHiddenSize() const; + LSTMSequenceLayer& setHiddenSize(int size); + bool getSequenceDim() const; + LSTMSequenceLayer& setSqquenceDim(bool flag); + const std::vector& getActivations() const; + LSTMSequenceLayer& setActivations(const std::vector& activations); + const std::vector& getActivationsAlpha() const; + LSTMSequenceLayer& setActivationsAlpha(const std::vector& activations); + const std::vector& getActivationsBeta() const; + LSTMSequenceLayer& setActivationsBeta(const std::vector& activations); + float getClip() const; + LSTMSequenceLayer& setClip(float clip); + bool getInputForget() const; + LSTMSequenceLayer& setInputForget(bool flag); + const std::string& getDirection() const; + LSTMSequenceLayer& setDirection(const std::string& direction); +}; + +} // namespace Builder +} // namespace InferenceEngine + + diff --git a/inference-engine/include/builders/ie_memory_layer.hpp b/inference-engine/include/builders/ie_memory_layer.hpp index b399e95..474220b 100644 --- a/inference-engine/include/builders/ie_memory_layer.hpp +++ b/inference-engine/include/builders/ie_memory_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for Memory layer */ -class INFERENCE_ENGINE_API_CLASS(MemoryLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(MemoryLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit MemoryLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit MemoryLayer(Layer& genLayer); + explicit MemoryLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit MemoryLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_mvn_layer.hpp b/inference-engine/include/builders/ie_mvn_layer.hpp index ef92351..4e6f327 100644 --- a/inference-engine/include/builders/ie_mvn_layer.hpp +++ b/inference-engine/include/builders/ie_mvn_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for MVN layer */ -class INFERENCE_ENGINE_API_CLASS(MVNLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(MVNLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit MVNLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic 
builder */ - explicit MVNLayer(Layer& genLayer); + explicit MVNLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit MVNLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_network_builder.hpp b/inference-engine/include/builders/ie_network_builder.hpp index 586a267..9b5000c 100644 --- a/inference-engine/include/builders/ie_network_builder.hpp +++ b/inference-engine/include/builders/ie_network_builder.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include @@ -23,12 +23,43 @@ namespace Builder { /** * @brief This class implements a builder for IE Network */ -class INFERENCE_ENGINE_API_CLASS(Network) { +class INFERENCE_ENGINE_API_CLASS(Network): public INetwork { public: /** * @brief A shared pointer to the Network builder */ using Ptr = std::shared_ptr; + /** + * @brief An iterator for Network builder definition + */ + using iterator = details::INetworkIterator; + /** + * @brief Begin network iterator + * @return Network iterator + */ + iterator begin(); + /** + * @brief Begin network iterator + * @return const INetwork iterator + */ + const_iterator begin() const noexcept override; + + /** + * @brief End network iterator + * @return Network iterator + */ + iterator end(); + /** + * @brief End network iterator + * @return const INetwork iterator + */ + const_iterator end() const noexcept override; + + /** + * @brief Returns a number of layers in the network. + * @return Layers count + */ + size_t size() const noexcept override; /** * @brief The constructor creates a builder based on ICNNNetwork @@ -69,11 +100,6 @@ public: Network(const Context& ieContext, const INetwork& network); /** - * @brief Virtual destructor - */ - virtual ~Network() = default; - - /** * @brief Adds new layer and connects it with previous layers * * @param inputs Vector with PortInfo objects from previous layers @@ -112,64 +138,102 @@ public: void disconnect(const Connection& connection); /** - * @brief Returns layer builder by ID - * - * @param layerId Layer ID + * @brief Returns vector of layer builders * - * @return Layer buider + * @return Vector of layer builders */ - Layer& getLayer(idx_t layerId); + std::vector& getLayers(); /** - * @brief Returns constant layer builder by ID - * - * @param layerId Layer ID + * @brief Returns constant vector of layer builders * - * @return constant layer builder + * @return constant vector of layer builders */ - const Layer& getLayer(idx_t layerId) const; + const std::vector& getLayers() const; /** - * @brief Returns vector of layer builders - * - * @return Vector of layer builders + * @brief Returns a constant smart pointer to a Layer interface. + * If the layer is missing, returns nullptr. + * @param id Id of the Layer + * @return Layer interface smart pointer */ - std::vector& getLayers(); + const ILayer::CPtr getLayer(idx_t id) const noexcept override; + Layer::Ptr getLayer(idx_t layerId); + /** - * @brief Returns constant vector of layer builders - * - * @return constant vector of layer builders + * @brief Returns a constant vector of input layers. 
+ @return Vector of input layers */ - const std::vector& getLayers() const; + const std::vector getInputs() const noexcept override; + /** + * @brief Returns a vector of input layers. + * @return Vector of input layers + */ + std::vector getInputs(); /** - * @brief Returns all connections for layer - * - * @param layerId Layer ID - * - * @return Vector of connections for the current layer + * @brief Returns a constant vector of output layers. + * @return Vector of output layers + */ + const std::vector getOutputs() const noexcept override; + /** + * @brief Returns a vector of output layers. + * @return Vector of output layers + */ + std::vector getOutputs(); + + /** + * @brief Returns a constant vector of connections for specific layer. + * If the layer is missing, returns empty vector. + * @param layerId layer index + * @return Vector of connections */ - const std::vector getLayerConnections(idx_t layerId) const noexcept; /** - * @brief Builds and validate networks + * @brief Returns a constant vector of all connections. + * @return Vector of connections + */ + const std::vector& getConnections() const; + + /** + * @brief Returns a network name. + * @return Network name + */ + const std::string& getName() const noexcept override; + + /** + * @brief Returns a network context + * @return const reference to Context + */ + const Context& getContext() const noexcept override; + /** + * @brief Returns a network context + * @return reference to Context + */ + Context& getContext() noexcept; + + /** + * @brief Builds and validates the network * * @return const shared pointer to INetwork */ - const INetwork::Ptr build() const; + const INetwork::CPtr build(); + + /** + * @brief Validates network + * + */ + void validate(); /** * @brief The operator builds network * * @return const shared pointer to INetwork */ - explicit operator const INetwork::Ptr() const; + explicit operator const INetwork::CPtr(); private: - const Context ctx; - const size_t version; - std::string name; - std::vector layers; - std::vector connections; + std::map parameters; }; /** @@ -178,7 +242,7 @@ private: * @param network constant shared pointer to INetwork object * @return constant shared pointer to ICNNNetwork */ -INFERENCE_ENGINE_API_CPP(const std::shared_ptr) convertToICNNNetwork(const INetwork::Ptr& network); +INFERENCE_ENGINE_API_CPP(const std::shared_ptr) convertToICNNNetwork(const INetwork::CPtr& network); } // namespace Builder diff --git a/inference-engine/include/builders/ie_norm_layer.hpp b/inference-engine/include/builders/ie_norm_layer.hpp index 58d972b..6209057 100644 --- a/inference-engine/include/builders/ie_norm_layer.hpp +++ b/inference-engine/include/builders/ie_norm_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for Norm layer */ -class INFERENCE_ENGINE_API_CLASS(NormLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(NormLayer): public LayerDecorator { public: /** * @brief The enum defines all Norm types @@ -30,9 +30,14 @@ public: explicit NormLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit 
NormLayer(Layer& genLayer); + explicit NormLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit NormLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_normalize_layer.hpp b/inference-engine/include/builders/ie_normalize_layer.hpp index bc05381..b2f2b8e 100644 --- a/inference-engine/include/builders/ie_normalize_layer.hpp +++ b/inference-engine/include/builders/ie_normalize_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for Normalize layer */ -class INFERENCE_ENGINE_API_CLASS(NormalizeLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(NormalizeLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit NormalizeLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit NormalizeLayer(Layer& genLayer); + explicit NormalizeLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit NormalizeLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_output_layer.hpp b/inference-engine/include/builders/ie_output_layer.hpp index 71abd38..d113e60 100644 --- a/inference-engine/include/builders/ie_output_layer.hpp +++ b/inference-engine/include/builders/ie_output_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for Output layer */ -class INFERENCE_ENGINE_API_CLASS(OutputLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(OutputLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit OutputLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit OutputLayer(Layer& genLayer); + explicit OutputLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit OutputLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_permute_layer.hpp b/inference-engine/include/builders/ie_permute_layer.hpp index 54cfcf3..f6cad5b 100644 --- a/inference-engine/include/builders/ie_permute_layer.hpp +++ b/inference-engine/include/builders/ie_permute_layer.hpp @@ -1,10 +1,10 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 
// #pragma once -#include +#include #include #include @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for Permute layer */ -class INFERENCE_ENGINE_API_CLASS(PermuteLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(PermuteLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit PermuteLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit PermuteLayer(Layer& genLayer); + explicit PermuteLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit PermuteLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name @@ -34,19 +39,6 @@ public: PermuteLayer& setName(const std::string& name); /** - * @brief Sets weights for layer - * @param weights Constant blob with weights - * @return reference to layer builder - */ - PermuteLayer& setWeights(const Blob::CPtr& weights); - /** - * @brief Sets biases for layer - * @param biases Constant blob with biases - * @return reference to layer builder - */ - PermuteLayer& setBiases(const Blob::CPtr& biases); - - /** * @brief Returns input port * @return Input port */ diff --git a/inference-engine/include/builders/ie_pooling_layer.hpp b/inference-engine/include/builders/ie_pooling_layer.hpp index 80150ae..b732a49 100644 --- a/inference-engine/include/builders/ie_pooling_layer.hpp +++ b/inference-engine/include/builders/ie_pooling_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include #include @@ -15,7 +15,7 @@ namespace Builder { /** * @brief The class represents a builder for Pooling layer */ -class INFERENCE_ENGINE_API_CLASS(PoolingLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(PoolingLayer): public LayerDecorator { public: /** * @brief The enum defines available pooling types @@ -40,9 +40,14 @@ public: explicit PoolingLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit PoolingLayer(Layer& genLayer); + explicit PoolingLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit PoolingLayer(const Layer::CPtr& layer); /** * @brief Operator creates generic layer builder * @return Generic layer builder @@ -155,12 +160,6 @@ public: */ PoolingLayer& setExcludePad(bool exclude); - /** - * @brief Validates layer before creation - * @param layer generic layer builder - */ - static void validate(const Layer& layer); - private: PoolingType type; RoundingType roundingType; diff --git a/inference-engine/include/builders/ie_power_layer.hpp b/inference-engine/include/builders/ie_power_layer.hpp index 94ef1cc..4db69c0 100644 --- a/inference-engine/include/builders/ie_power_layer.hpp +++ b/inference-engine/include/builders/ie_power_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include 
+#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for Power layer */ -class INFERENCE_ENGINE_API_CLASS(PowerLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(PowerLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit PowerLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit PowerLayer(Layer& genLayer); + explicit PowerLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit PowerLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_prelu_layer.hpp b/inference-engine/include/builders/ie_prelu_layer.hpp index 5e7dedd..d3f7f01 100644 --- a/inference-engine/include/builders/ie_prelu_layer.hpp +++ b/inference-engine/include/builders/ie_prelu_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for PReLU layer */ -class INFERENCE_ENGINE_API_CLASS(PReLULayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(PReLULayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit PReLULayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit PReLULayer(Layer& genLayer); + explicit PReLULayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit PReLULayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name @@ -34,12 +39,6 @@ public: PReLULayer& setName(const std::string& name); /** - * @brief Sets weights for layer - * @param weights Constant blob with weights - * @return reference to layer builder - */ - PReLULayer& setWeights(const Blob::CPtr& weights); - /** * @brief Returns port with shapes for the layer * @return Port with shapes */ diff --git a/inference-engine/include/builders/ie_prior_box_clustered_layer.hpp b/inference-engine/include/builders/ie_prior_box_clustered_layer.hpp index 61d7f16..ff891dc 100644 --- a/inference-engine/include/builders/ie_prior_box_clustered_layer.hpp +++ b/inference-engine/include/builders/ie_prior_box_clustered_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include #include @@ -15,7 +15,7 @@ namespace Builder { /** * @brief The class represents a builder for PriorBoxClustered layer */ -class INFERENCE_ENGINE_API_CLASS(PriorBoxClusteredLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(PriorBoxClusteredLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -24,9 +24,14 
@@ public: explicit PriorBoxClusteredLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit PriorBoxClusteredLayer(Layer& genLayer); + explicit PriorBoxClusteredLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit PriorBoxClusteredLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_prior_box_layer.hpp b/inference-engine/include/builders/ie_prior_box_layer.hpp index 8051d6c..3e36f0d 100644 --- a/inference-engine/include/builders/ie_prior_box_layer.hpp +++ b/inference-engine/include/builders/ie_prior_box_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include #include @@ -15,7 +15,7 @@ namespace Builder { /** * @brief The class represents a builder for PriorBox layer */ -class INFERENCE_ENGINE_API_CLASS(PriorBoxLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(PriorBoxLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -24,9 +24,14 @@ public: explicit PriorBoxLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit PriorBoxLayer(Layer& genLayer); + explicit PriorBoxLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit PriorBoxLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_proposal_layer.hpp b/inference-engine/include/builders/ie_proposal_layer.hpp index e7fcac4..aa14504 100644 --- a/inference-engine/include/builders/ie_proposal_layer.hpp +++ b/inference-engine/include/builders/ie_proposal_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include #include @@ -15,7 +15,7 @@ namespace Builder { /** * @brief The class represents a builder for Proposal layer */ -class INFERENCE_ENGINE_API_CLASS(ProposalLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(ProposalLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -24,9 +24,14 @@ public: explicit ProposalLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit ProposalLayer(Layer& genLayer); + explicit ProposalLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit ProposalLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_psroi_pooling_layer.hpp b/inference-engine/include/builders/ie_psroi_pooling_layer.hpp index 82c9f47..34b5108 100644 --- 
a/inference-engine/include/builders/ie_psroi_pooling_layer.hpp +++ b/inference-engine/include/builders/ie_psroi_pooling_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include #include @@ -15,7 +15,7 @@ namespace Builder { /** * @brief The class represents a builder for PSROIPooling layer */ -class INFERENCE_ENGINE_API_CLASS(PSROIPoolingLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(PSROIPoolingLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -24,9 +24,14 @@ public: explicit PSROIPoolingLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit PSROIPoolingLayer(Layer& genLayer); + explicit PSROIPoolingLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit PSROIPoolingLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_region_yolo_layer.hpp b/inference-engine/include/builders/ie_region_yolo_layer.hpp index 1a2d645..1f2e37c 100644 --- a/inference-engine/include/builders/ie_region_yolo_layer.hpp +++ b/inference-engine/include/builders/ie_region_yolo_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include #include @@ -15,7 +15,7 @@ namespace Builder { /** * @brief The class represents a builder for RegionYolo layer */ -class INFERENCE_ENGINE_API_CLASS(RegionYoloLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(RegionYoloLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -24,9 +24,14 @@ public: explicit RegionYoloLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit RegionYoloLayer(Layer& genLayer); + explicit RegionYoloLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit RegionYoloLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_relu6_layer.hpp b/inference-engine/include/builders/ie_relu6_layer.hpp index 3bc3360..1cc384a 100644 --- a/inference-engine/include/builders/ie_relu6_layer.hpp +++ b/inference-engine/include/builders/ie_relu6_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for ReLU6 layer */ -class INFERENCE_ENGINE_API_CLASS(ReLU6Layer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(ReLU6Layer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit 
ReLU6Layer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit ReLU6Layer(Layer& genLayer); + explicit ReLU6Layer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit ReLU6Layer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_relu_layer.hpp b/inference-engine/include/builders/ie_relu_layer.hpp index 9422e19..2853858 100644 --- a/inference-engine/include/builders/ie_relu_layer.hpp +++ b/inference-engine/include/builders/ie_relu_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for ReLU layer */ -class INFERENCE_ENGINE_API_CLASS(ReLULayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(ReLULayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit ReLULayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit ReLULayer(Layer& genLayer); + explicit ReLULayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit ReLULayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_reorg_yolo_layer.hpp b/inference-engine/include/builders/ie_reorg_yolo_layer.hpp index 4719873..0529ee5 100644 --- a/inference-engine/include/builders/ie_reorg_yolo_layer.hpp +++ b/inference-engine/include/builders/ie_reorg_yolo_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include #include @@ -15,7 +15,7 @@ namespace Builder { /** * @brief The class represents a builder for ReorgYolo layer */ -class INFERENCE_ENGINE_API_CLASS(ReorgYoloLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(ReorgYoloLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -24,9 +24,14 @@ public: explicit ReorgYoloLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit ReorgYoloLayer(Layer& genLayer); + explicit ReorgYoloLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer const pointer to generic builder + */ + explicit ReorgYoloLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_resample_layer.hpp b/inference-engine/include/builders/ie_resample_layer.hpp new file mode 100644 index 0000000..4e343bd --- /dev/null +++ b/inference-engine/include/builders/ie_resample_layer.hpp @@ -0,0 
+1,126 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +namespace InferenceEngine { +namespace Builder { + +/** + * @brief The class represents a builder for Resample layer + */ +class INFERENCE_ENGINE_API_CLASS(ResampleLayer): public LayerDecorator { +public: + /** + * @brief The constructor creates a builder with the name + * @param name Layer name + */ + explicit ResampleLayer(const std::string& name = ""); + /** + * @brief The constructor creates a builder from generic builder + * @param layer pointer to generic builder + */ + explicit ResampleLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer const pointer to generic builder + */ + explicit ResampleLayer(const Layer::CPtr& layer); + /** + * @brief Sets the name for the layer + * @param name Layer name + * @return reference to layer builder + */ + ResampleLayer& setName(const std::string& name); + + /** + * @brief Returns input port + * @return Input port + */ + const Port& getInputPort() const; + /** + * @brief Sets input port + * @param ports Input port + * @return reference to layer builder + */ + ResampleLayer& setInputPort(const Port& ports); + /** + * @brief Returns output port + * @return Output port + */ + const Port& getOutputPort() const; + /** + * @brief Sets output port + * @param port Output port + * @return reference to layer builder + */ + ResampleLayer& setOutputPort(const Port& port); + /** + * @brief Returns resample type + * @return Type + */ + const std::string& getResampleType() const; + /** + * @brief Sets resample type + * @param type Type + * @return reference to layer builder + */ + ResampleLayer& setResampleType(const std::string& type); + /** + * @brief Returns flag that denotes whether to perform anti-aliasing + * @return true if anti-aliasing is performed + */ + bool getAntialias() const; + /** + * @brief Sets flag that denotes whether to perform anti-aliasing + * @param flag antialias + * @return reference to layer builder + */ + ResampleLayer& setAntialias(bool antialias); + /** + * @brief Returns resample factor + * @return Factor + */ + float getFactor() const; + /** + * @brief Sets resample factor + * @param factor Factor + * @return reference to layer builder + */ + ResampleLayer& setFactor(float factor); + /** + * @brief Returns width + * @return Width + */ + size_t getWidth() const; + /** + * @brief Sets width + * @param width Width + * @return reference to layer builder + */ + ResampleLayer& setWidth(size_t width); + /** + * @brief Returns height + * @return Height + */ + size_t getHeight() const; + /** + * @brief Sets height + * @param height Height + * @return reference to layer builder + */ + ResampleLayer& setHeight(size_t height); +}; + +} // namespace Builder +} // namespace InferenceEngine + + + + diff --git a/inference-engine/include/builders/ie_reshape_layer.hpp b/inference-engine/include/builders/ie_reshape_layer.hpp index 42eacea..578e9b7 100644 --- a/inference-engine/include/builders/ie_reshape_layer.hpp +++ b/inference-engine/include/builders/ie_reshape_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include #include @@ -15,7 +15,7 @@ namespace Builder { /** * @brief The class represents a builder for Reshape layer */ -class 
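The Resample builder introduced above is configured entirely through fluent setters, each returning ResampleLayer&, so a layer can be set up in a single chain. A short sketch under that assumption; the type string, factor, and sizes are illustrative values, not defaults from the header:

    InferenceEngine::Builder::ResampleLayer resample("resample1");
    resample.setResampleType("caffe.ResampleParameter.NEAREST")  // illustrative type string
            .setAntialias(false)
            .setFactor(2.0f)
            .setWidth(448)
            .setHeight(448);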
INFERENCE_ENGINE_API_CLASS(ReshapeLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(ReshapeLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -24,9 +24,14 @@ public: explicit ReshapeLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit ReshapeLayer(Layer& genLayer); + explicit ReshapeLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit ReshapeLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_rnn_sequence_layer.hpp b/inference-engine/include/builders/ie_rnn_sequence_layer.hpp new file mode 100644 index 0000000..8851916 --- /dev/null +++ b/inference-engine/include/builders/ie_rnn_sequence_layer.hpp @@ -0,0 +1,83 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +namespace InferenceEngine { +namespace Builder { + +/** + * @brief The class represents a builder for RNNSequence layer + */ +class INFERENCE_ENGINE_API_CLASS(RNNSequenceLayer): public LayerDecorator { +public: + /** + * @brief The constructor creates a builder with the name + * @param name Layer name + */ + explicit RNNSequenceLayer(const std::string& name = ""); + /** + * @brief The constructor creates a builder from generic builder + * @param layer pointer to generic builder + */ + explicit RNNSequenceLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit RNNSequenceLayer(const Layer::CPtr& layer); + /** + * @brief Sets the name for the layer + * @param name Layer name + * @return reference to layer builder + */ + RNNSequenceLayer& setName(const std::string& name); + + /** + * @brief Returns input ports with shapes for the layer + * @return Vector of ports + */ + const std::vector& getInputPorts() const; + /** + * @brief Sets input ports for the layer + * @param ports vector of input ports + * @return reference to layer builder + */ + RNNSequenceLayer& setInputPorts(const std::vector& ports); + + /** + * @brief Returns output ports with shapes for the layer + * @return Vector of ports + */ + const std::vector& getOutputPorts() const; + /** + * @brief Sets output ports for the layer + * @param ports vector of output ports + * @return reference to layer builder + */ + RNNSequenceLayer& setOutputPorts(const std::vector& ports); + + int getHiddenSize() const; + RNNSequenceLayer& setHiddenSize(int size); + bool getSequenceDim() const; + RNNSequenceLayer& setSqquenceDim(bool flag); + const std::vector& getActivations() const; + RNNSequenceLayer& setActivations(const std::vector& activations); + const std::vector& getActivationsAlpha() const; + RNNSequenceLayer& setActivationsAlpha(const std::vector& activations); + const std::vector& getActivationsBeta() const; + RNNSequenceLayer& setActivationsBeta(const std::vector& activations); + float getClip() const; + RNNSequenceLayer& setClip(float clip); +}; + +} // namespace Builder +} // namespace InferenceEngine + + diff --git a/inference-engine/include/builders/ie_roi_pooling_layer.hpp b/inference-engine/include/builders/ie_roi_pooling_layer.hpp 
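The RNNSequence builder added above leaves its accessor block undocumented, and the element types of its vector parameters are elided in this listing. A usage sketch assuming the activation list holds std::string entries and the alpha/beta lists hold floats, as in the companion GRU and LSTM sequence builders; note that the sequence-dimension setter really is spelled setSqquenceDim in the declaration:

    InferenceEngine::Builder::RNNSequenceLayer rnn("rnn1");
    rnn.setHiddenSize(128)
       .setSqquenceDim(true)       // spelled this way in the header above
       .setActivations({"tanh"})   // assumed std::vector of std::string
       .setClip(0.0f);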
index d6bb578..7105d09 100644 --- a/inference-engine/include/builders/ie_roi_pooling_layer.hpp +++ b/inference-engine/include/builders/ie_roi_pooling_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include #include @@ -15,7 +15,7 @@ namespace Builder { /** * @brief The class represents a builder for ROIPooling layer */ -class INFERENCE_ENGINE_API_CLASS(ROIPoolingLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(ROIPoolingLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -24,9 +24,14 @@ public: explicit ROIPoolingLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit ROIPoolingLayer(Layer& genLayer); + explicit ROIPoolingLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit ROIPoolingLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_scale_shift_layer.hpp b/inference-engine/include/builders/ie_scale_shift_layer.hpp index 361664e..9e40572 100644 --- a/inference-engine/include/builders/ie_scale_shift_layer.hpp +++ b/inference-engine/include/builders/ie_scale_shift_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for ScaleShift layer */ -class INFERENCE_ENGINE_API_CLASS(ScaleShiftLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(ScaleShiftLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit ScaleShiftLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit ScaleShiftLayer(Layer& genLayer); + explicit ScaleShiftLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit ScaleShiftLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name @@ -44,19 +49,6 @@ public: * @return reference to layer builder */ ScaleShiftLayer& setPort(const Port &port); - - /** - * @brief Sets weights for layer - * @param weights Constant blob with weights - * @return reference to layer builder - */ - ScaleShiftLayer& setWeights(const Blob::CPtr& weights); - /** - * @brief Sets biases for layer - * @param biases Constant blob with biases - * @return reference to layer builder - */ - ScaleShiftLayer& setBiases(const Blob::CPtr& biases); }; } // namespace Builder diff --git a/inference-engine/include/builders/ie_sigmoid_layer.hpp b/inference-engine/include/builders/ie_sigmoid_layer.hpp index 6c48358..d6f20a6 100644 --- a/inference-engine/include/builders/ie_sigmoid_layer.hpp +++ b/inference-engine/include/builders/ie_sigmoid_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 
Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for Sigmoid layer */ -class INFERENCE_ENGINE_API_CLASS(SigmoidLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(SigmoidLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit SigmoidLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit SigmoidLayer(Layer& genLayer); + explicit SigmoidLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit SigmoidLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_simpler_nms_layer.hpp b/inference-engine/include/builders/ie_simpler_nms_layer.hpp index 28cf6ee..c97e84b 100644 --- a/inference-engine/include/builders/ie_simpler_nms_layer.hpp +++ b/inference-engine/include/builders/ie_simpler_nms_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include #include @@ -15,7 +15,7 @@ namespace Builder { /** * @brief The class represents a builder for SimplerNMS layer */ -class INFERENCE_ENGINE_API_CLASS(SimplerNMSLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(SimplerNMSLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -24,9 +24,14 @@ public: explicit SimplerNMSLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit SimplerNMSLayer(Layer& genLayer); + explicit SimplerNMSLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit SimplerNMSLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_softmax_layer.hpp b/inference-engine/include/builders/ie_softmax_layer.hpp index 1ce13b8..2031a62 100644 --- a/inference-engine/include/builders/ie_softmax_layer.hpp +++ b/inference-engine/include/builders/ie_softmax_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for SoftMax layer */ -class INFERENCE_ENGINE_API_CLASS(SoftMaxLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(SoftMaxLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit SoftMaxLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic 
builder */ - explicit SoftMaxLayer(Layer& genLayer); + explicit SoftMaxLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit SoftMaxLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_split_layer.hpp b/inference-engine/include/builders/ie_split_layer.hpp index 526ed79..f982da0 100644 --- a/inference-engine/include/builders/ie_split_layer.hpp +++ b/inference-engine/include/builders/ie_split_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include #include @@ -15,7 +15,7 @@ namespace Builder { /** * @brief The class represents a builder for Split layer */ -class INFERENCE_ENGINE_API_CLASS(SplitLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(SplitLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -24,9 +24,14 @@ public: explicit SplitLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit SplitLayer(Layer& genLayer); + explicit SplitLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit SplitLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_tanh_layer.hpp b/inference-engine/include/builders/ie_tanh_layer.hpp index acb0002..0caf3d0 100644 --- a/inference-engine/include/builders/ie_tanh_layer.hpp +++ b/inference-engine/include/builders/ie_tanh_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include namespace InferenceEngine { @@ -14,7 +14,7 @@ namespace Builder { /** * @brief The class represents a builder for TanH layer */ -class INFERENCE_ENGINE_API_CLASS(TanHLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(TanHLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -23,9 +23,14 @@ public: explicit TanHLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit TanHLayer(Layer& genLayer); + explicit TanHLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit TanHLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/builders/ie_tile_layer.hpp b/inference-engine/include/builders/ie_tile_layer.hpp index de03ba2..004d9a2 100644 --- a/inference-engine/include/builders/ie_tile_layer.hpp +++ b/inference-engine/include/builders/ie_tile_layer.hpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include +#include #include 
#include @@ -15,7 +15,7 @@ namespace Builder { /** * @brief The class represents a builder for Tile layer */ -class INFERENCE_ENGINE_API_CLASS(TileLayer): public LayerFragment { +class INFERENCE_ENGINE_API_CLASS(TileLayer): public LayerDecorator { public: /** * @brief The constructor creates a builder with the name @@ -24,9 +24,14 @@ public: explicit TileLayer(const std::string& name = ""); /** * @brief The constructor creates a builder from generic builder - * @param genLayer generic builder + * @param layer pointer to generic builder */ - explicit TileLayer(Layer& genLayer); + explicit TileLayer(const Layer::Ptr& layer); + /** + * @brief The constructor creates a builder from generic builder + * @param layer constant pointer to generic builder + */ + explicit TileLayer(const Layer::CPtr& layer); /** * @brief Sets the name for the layer * @param name Layer name diff --git a/inference-engine/include/cldnn/cldnn_config.hpp b/inference-engine/include/cldnn/cldnn_config.hpp index dc440ba..571ff51 100644 --- a/inference-engine/include/cldnn/cldnn_config.hpp +++ b/inference-engine/include/cldnn/cldnn_config.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -11,7 +11,7 @@ #pragma once #include -#include "../ie_plugin_config.hpp" +#include "ie_plugin_config.hpp" namespace InferenceEngine { diff --git a/inference-engine/include/cpp/ie_cnn_net_reader.h b/inference-engine/include/cpp/ie_cnn_net_reader.h index 7bc0b25..149f86a 100644 --- a/inference-engine/include/cpp/ie_cnn_net_reader.h +++ b/inference-engine/include/cpp/ie_cnn_net_reader.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/cpp/ie_cnn_network.h b/inference-engine/include/cpp/ie_cnn_network.h index 82d13cf..4ccccd8 100644 --- a/inference-engine/include/cpp/ie_cnn_network.h +++ b/inference-engine/include/cpp/ie_cnn_network.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -111,6 +111,14 @@ public: /** * @brief Wraps original method + * ICNNNetwork::getName + */ + const std::string& getName() const noexcept { + return actual->getName(); + } + + /** + * @brief Wraps original method * ICNNNetwork::setBatchSize */ virtual void setBatchSize(const size_t size) { @@ -222,9 +230,10 @@ public: /** * @brief Serialize network to IR and weights files. * @param xmlPath Path to output IR file. - * @param binPath Path to output weights file. + * @param binPath Path to output weights file. The parameter is skipped in case + * of executable graph info serialization. 
*/ - void serialize(const std::string &xmlPath, const std::string &binPath) const { + void serialize(const std::string &xmlPath, const std::string &binPath = "") const { CALL_STATUS_FNC(serialize, xmlPath, binPath); } diff --git a/inference-engine/include/cpp/ie_executable_network.hpp b/inference-engine/include/cpp/ie_executable_network.hpp index dd8e942..c9225a1 100644 --- a/inference-engine/include/cpp/ie_executable_network.hpp +++ b/inference-engine/include/cpp/ie_executable_network.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -16,6 +16,7 @@ #include "ie_iexecutable_network.hpp" #include "cpp/ie_infer_request.hpp" #include "cpp/ie_memory_state.hpp" +#include "cpp/ie_cnn_network.h" #include "details/ie_exception_conversion.hpp" namespace InferenceEngine { @@ -107,6 +108,15 @@ public: return actual; } + /** + * @brief Get executable graph information from a plugin represented as CNNNetwork + * @return CNNNetwork containing Executable Graph Info + */ + CNNNetwork GetExecGraphInfo() { + ICNNNetwork::Ptr ptr = nullptr; + CALL_STATUS_FNC(GetExecGraphInfo, ptr); + return CNNNetwork(ptr); + } /** *@brief see original function InferenceEngine::IExecutableNetwork::QueryState diff --git a/inference-engine/include/cpp/ie_infer_request.hpp b/inference-engine/include/cpp/ie_infer_request.hpp index 10317af..1205d3e 100644 --- a/inference-engine/include/cpp/ie_infer_request.hpp +++ b/inference-engine/include/cpp/ie_infer_request.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/cpp/ie_memory_state.hpp b/inference-engine/include/cpp/ie_memory_state.hpp index f9bd90a..d20fcae 100644 --- a/inference-engine/include/cpp/ie_memory_state.hpp +++ b/inference-engine/include/cpp/ie_memory_state.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/cpp/ie_plugin_cpp.hpp b/inference-engine/include/cpp/ie_plugin_cpp.hpp index 5605209..0cef8cf 100644 --- a/inference-engine/include/cpp/ie_plugin_cpp.hpp +++ b/inference-engine/include/cpp/ie_plugin_cpp.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -77,6 +77,7 @@ public: } /** + * @deprecated Loads IExecutableNetwork to create IInferRequest.
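Taken together, the serialize() default argument and GetExecGraphInfo() shown above let a caller dump the graph a plugin actually executed without producing a weights file. A usage sketch, assuming exeNetwork is an ExecutableNetwork previously obtained from a plugin's LoadNetwork call:

    InferenceEngine::CNNNetwork execGraph = exeNetwork.GetExecGraphInfo();
    execGraph.serialize("exec_graph.xml");  // binPath defaults to "", the executable-graph case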
* @brief Wraps original method * IInferencePlugin::Infer(const BlobMap&, BlobMap&, ResponseDesc *resp) */ diff --git a/inference-engine/include/details/caseless.hpp b/inference-engine/include/details/caseless.hpp index f3e0d7a..8f9d3ce 100644 --- a/inference-engine/include/details/caseless.hpp +++ b/inference-engine/include/details/caseless.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/details/ie_blob_iterator.hpp b/inference-engine/include/details/ie_blob_iterator.hpp index 6b083e1..61e7acf 100644 --- a/inference-engine/include/details/ie_blob_iterator.hpp +++ b/inference-engine/include/details/ie_blob_iterator.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/details/ie_cnn_network_iterator.hpp b/inference-engine/include/details/ie_cnn_network_iterator.hpp index 9cc65c9..ff29b5d 100644 --- a/inference-engine/include/details/ie_cnn_network_iterator.hpp +++ b/inference-engine/include/details/ie_cnn_network_iterator.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/details/ie_cnn_network_tools.h b/inference-engine/include/details/ie_cnn_network_tools.h index b80978b..a872fdb 100644 --- a/inference-engine/include/details/ie_cnn_network_tools.h +++ b/inference-engine/include/details/ie_cnn_network_tools.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/details/ie_exception.hpp b/inference-engine/include/details/ie_exception.hpp index 514a639..5285f05 100644 --- a/inference-engine/include/details/ie_exception.hpp +++ b/inference-engine/include/details/ie_exception.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -31,8 +31,21 @@ if (!(EXPRESSION)) throw InferenceEngine::details::InferenceEngineException(__FILE__, __LINE__) << "AssertionFailed: " << #EXPRESSION // NOLINT #else #include + +class NullStream { + public : + template + NullStream & operator << (const T &obj) noexcept { + return *this; + } + + NullStream & operator<< (std::ostream & (*manip)(std::ostream &)) noexcept { + return *this; + } +}; + #define IE_ASSERT(EXPRESSION)\ - assert((EXPRESSION)); std::stringstream() + assert((EXPRESSION)); NullStream() #endif // NDEBUG namespace InferenceEngine { diff --git a/inference-engine/include/details/ie_exception_conversion.hpp b/inference-engine/include/details/ie_exception_conversion.hpp index 3c2b947..1c45d82 100644 --- a/inference-engine/include/details/ie_exception_conversion.hpp +++ b/inference-engine/include/details/ie_exception_conversion.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/details/ie_inetwork_iterator.hpp b/inference-engine/include/details/ie_inetwork_iterator.hpp index 84f8dee..7d77bc8 100644 --- a/inference-engine/include/details/ie_inetwork_iterator.hpp +++ b/inference-engine/include/details/ie_inetwork_iterator.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 
2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -15,7 +15,7 @@ #include #include -#include +#include namespace InferenceEngine { namespace details { @@ -33,23 +33,22 @@ public: allInputs.push_back(std::dynamic_pointer_cast(input)); } - bool res = forestDFS(allInputs, [&](std::shared_ptr current) { + forestDFS(allInputs, [&](std::shared_ptr current) { sortedLayers.push_back(current); }, false); - if (!res) { - THROW_IE_EXCEPTION << "Sorting not possible, due to existed loop."; - } - std::reverse(std::begin(sortedLayers), std::end(sortedLayers)); currentLayer = getNextLayer(); } + bool operator!=(const INetworkIterator& that) const { return !operator==(that); } + bool operator==(const INetworkIterator& that) const { return network == that.network && currentLayer == that.currentLayer; } + typename INetworkIterator::reference operator*() { if (nullptr == currentLayer) { THROW_IE_EXCEPTION << "iterator out of bound"; @@ -79,27 +78,24 @@ private: } template - inline bool forestDFS(const std::vector>& heads, const T &visit, bool bVisitBefore) { + inline void forestDFS(const std::vector>& heads, const T &visit, bool bVisitBefore) { if (heads.empty()) { - return true; + return; } std::unordered_map visited; for (auto & layer : heads) { - if (!DFS(visited, layer, visit, bVisitBefore)) { - return false; - } + DFS(visited, layer, visit, bVisitBefore); } - return true; } template - inline bool DFS(std::unordered_map &visited, + inline void DFS(std::unordered_map &visited, const std::shared_ptr &layer, const T &visit, bool visitBefore) { if (layer == nullptr) { - return true; + return; } if (visitBefore) @@ -111,25 +107,24 @@ private: continue; } const auto outLayer = network->getLayer(connection.to().layerId()); + if (!outLayer) + THROW_IE_EXCEPTION << "Couldn't get layer with id: " << connection.to().layerId(); auto i = visited.find(outLayer->getId()); if (i != visited.end()) { /** * cycle detected we entered still not completed node */ if (!i->second) { - return false; + THROW_IE_EXCEPTION << "Sorting not possible, due to existed loop."; } continue; } - if (!DFS(visited, outLayer, visit, visitBefore)) { - return false; - } + DFS(visited, outLayer, visit, visitBefore); } if (!visitBefore) visit(layer); visited[layer->getId()] = true; - return true; } }; diff --git a/inference-engine/include/details/ie_irelease.hpp b/inference-engine/include/details/ie_irelease.hpp index a1b55dd..8bbf396 100644 --- a/inference-engine/include/details/ie_irelease.hpp +++ b/inference-engine/include/details/ie_irelease.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/details/ie_no_copy.hpp b/inference-engine/include/details/ie_no_copy.hpp index 8d823ad..565835a 100644 --- a/inference-engine/include/details/ie_no_copy.hpp +++ b/inference-engine/include/details/ie_no_copy.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/details/ie_no_release.hpp b/inference-engine/include/details/ie_no_release.hpp index 3afe7c5..7033484 100644 --- a/inference-engine/include/details/ie_no_release.hpp +++ b/inference-engine/include/details/ie_no_release.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git 
a/inference-engine/include/details/ie_pre_allocator.hpp b/inference-engine/include/details/ie_pre_allocator.hpp index b280cc1..d4801ba 100644 --- a/inference-engine/include/details/ie_pre_allocator.hpp +++ b/inference-engine/include/details/ie_pre_allocator.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/details/ie_so_loader.h b/inference-engine/include/details/ie_so_loader.h index 6b93d26..4a5d39f 100644 --- a/inference-engine/include/details/ie_so_loader.h +++ b/inference-engine/include/details/ie_so_loader.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/details/ie_so_pointer.hpp b/inference-engine/include/details/ie_so_pointer.hpp index a4973ff..a6d7372 100644 --- a/inference-engine/include/details/ie_so_pointer.hpp +++ b/inference-engine/include/details/ie_so_pointer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/details/os/lin_shared_object_loader.h b/inference-engine/include/details/os/lin_shared_object_loader.h index 9e883f3..1126e0d 100644 --- a/inference-engine/include/details/os/lin_shared_object_loader.h +++ b/inference-engine/include/details/os/lin_shared_object_loader.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/details/os/win_shared_object_loader.h b/inference-engine/include/details/os/win_shared_object_loader.h index 27be898..269cba2 100644 --- a/inference-engine/include/details/os/win_shared_object_loader.h +++ b/inference-engine/include/details/os/win_shared_object_loader.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/gna/gna_config.hpp b/inference-engine/include/gna/gna_config.hpp index 29b4342..6b9cbe8 100644 --- a/inference-engine/include/gna/gna_config.hpp +++ b/inference-engine/include/gna/gna_config.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -12,7 +12,7 @@ #pragma once #include -#include "../ie_plugin_config.hpp" +#include "ie_plugin_config.hpp" namespace InferenceEngine { @@ -27,6 +27,8 @@ namespace GNAConfigParams { /** * @brief Scale factor that is calculated by user, in order to use static quantisation feature * This option should be used with floating point value serialized to string with decimal separator equals to . 
(dot) +* @details For the multiple input case, individual scale factors can be passed, using KEY_GNA_SCALE_FACTOR[_input_layer_name] +* where input_layer can be obtained from CNNNetwork::GetInputsInfo */ DECLARE_GNA_CONFIG_KEY(SCALE_FACTOR); diff --git a/inference-engine/include/hetero/hetero_plugin_config.hpp b/inference-engine/include/hetero/hetero_plugin_config.hpp index 4330e1e..4f2e166 100644 --- a/inference-engine/include/hetero/hetero_plugin_config.hpp +++ b/inference-engine/include/hetero/hetero_plugin_config.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -12,7 +12,7 @@ #pragma once #include -#include "../ie_plugin_config.hpp" +#include "ie_plugin_config.hpp" namespace InferenceEngine { diff --git a/inference-engine/include/ie_allocator.hpp b/inference-engine/include/ie_allocator.hpp index b9f5f5c..08b6838 100644 --- a/inference-engine/include/ie_allocator.hpp +++ b/inference-engine/include/ie_allocator.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_api.h b/inference-engine/include/ie_api.h index 3a71e75..76bc7e2 100644 --- a/inference-engine/include/ie_api.h +++ b/inference-engine/include/ie_api.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_blob.h b/inference-engine/include/ie_blob.h index 21267a3..c96a01b 100644 --- a/inference-engine/include/ie_blob.h +++ b/inference-engine/include/ie_blob.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -304,6 +304,17 @@ public: } /** + * @brief Creates a TBlob object with the specified TensorDesc and custom memory allocator, but does not allocate the memory. + * @param tensorDesc Tensor description + * @param alloc Allocator to be used + */ + TBlob(const TensorDesc& tensorDesc, const std::shared_ptr& alloc) + : Blob(tensorDesc), _allocator(alloc) { + } + + /** * @deprecated Please use TensorDesc for Blob initialization. */ explicit TBlob(Precision p, Layout l) : Blob(p, l) {} @@ -588,7 +599,9 @@ protected: */ template inline typename TBlob::Ptr make_shared_blob(Precision p, Layout l, const SizeVector &dims) { - IE_ASSERT(p.hasStorageType()); + if (!p.hasStorageType()) + THROW_IE_EXCEPTION << "Cannot make shared blob! " + << "The blob type cannot be used to store objects of current precision"; return std::make_shared>(p, l, dims); } @@ -602,7 +615,9 @@ inline typename TBlob::Ptr make_shared_blob(Precision p, Layout l, const S */ template inline typename TBlob::Ptr make_shared_blob(Precision p, const SizeVector &dims) { - IE_ASSERT(p.hasStorageType()); + if (!p.hasStorageType()) + THROW_IE_EXCEPTION << "Cannot make shared blob! " + << "The blob type cannot be used to store objects of current precision"; return make_shared_blob(p, TensorDesc::getLayoutByDims(dims), dims); } @@ -616,7 +631,9 @@ inline typename TBlob::Ptr make_shared_blob(Precision p, const SizeVector */ template inline typename InferenceEngine::TBlob::Ptr make_shared_blob(Precision p, Layout l, const TArg &arg) { - IE_ASSERT(p.hasStorageType()); + if (!p.hasStorageType()) + THROW_IE_EXCEPTION << "Cannot make shared blob! 
" + << "The blob type cannot be used to store objects of current precision"; return std::make_shared>(p, l, arg); } @@ -630,7 +647,9 @@ inline typename InferenceEngine::TBlob::Ptr make_shared_blob(Precision p, */ template inline typename InferenceEngine::TBlob::Ptr make_shared_blob(Precision p, const TArg &arg) { - IE_ASSERT(p.hasStorageType()); + if (!p.hasStorageType()) + THROW_IE_EXCEPTION << "Cannot make shared blob! " + << "The blob type cannot be used to store objects of current precision"; return make_shared_blob(p, TensorDesc::getLayoutByDims(arg), arg); } @@ -642,7 +661,9 @@ inline typename InferenceEngine::TBlob::Ptr make_shared_blob(Precision p, */ template inline typename InferenceEngine::TBlob::Ptr make_shared_blob(const TensorDesc& tensorDesc) { - IE_ASSERT(tensorDesc.getPrecision().hasStorageType()); + if (!tensorDesc.getPrecision().hasStorageType()) + THROW_IE_EXCEPTION << "Cannot make shared blob! " + << "The blob type cannot be used to store objects of current precision"; return std::make_shared>(tensorDesc); } @@ -656,11 +677,28 @@ inline typename InferenceEngine::TBlob::Ptr make_shared_blob(const TensorD */ template inline typename InferenceEngine::TBlob::Ptr make_shared_blob(const TensorDesc& tensorDesc, Type * ptr, size_t size = 0) { - IE_ASSERT(tensorDesc.getPrecision().hasStorageType()); + if (!tensorDesc.getPrecision().hasStorageType()) + THROW_IE_EXCEPTION << "Cannot make shared blob! " + << "The blob type cannot be used to store objects of current precision"; return std::make_shared>(tensorDesc, ptr, size); } /** + * @brief Creates a blob with the given tensor descriptor and allocator. + * @tparam Type Type of the shared pointer to be created + * @param tensorDesc Tensor descriptor for Blob creation + * @param alloc Shared pointer to IAllocator to use in the blob + * @return A shared pointer to the newly created blob of the given type + */ +template +inline typename InferenceEngine::TBlob::Ptr make_shared_blob(const TensorDesc& tensorDesc, const std::shared_ptr& alloc) { + if (!tensorDesc.getPrecision().hasStorageType()) + THROW_IE_EXCEPTION << "Cannot make shared blob! " + << "The blob type cannot be used to store objects of current precision"; + return std::make_shared>(tensorDesc, alloc); +} + +/** * @deprecated Use TensorDesc in order to create Blob::Ptr. * @brief Gets a shared pointer for the new TBlob instance. * The created instance is based on move semantics from the given TBlob instance. @@ -693,7 +731,9 @@ inline typename InferenceEngine::TBlob::Ptr make_shared_blob(const TBlob */ template inline typename InferenceEngine::TBlob::Ptr make_shared_blob(Precision p, Layout l = NCHW) { - IE_ASSERT(p.hasStorageType()); + if (!p.hasStorageType()) + THROW_IE_EXCEPTION << "Cannot make shared blob! " + << "The blob type cannot be used to store objects of current precision"; return std::make_shared>(p, l); } @@ -709,7 +749,9 @@ inline typename InferenceEngine::TBlob::Ptr make_shared_blob(Precision p */ template inline typename TBlob::Ptr make_shared_blob(Precision p, Layout l, SizeVector dims, const std::vector &arg) { - IE_ASSERT(p.hasStorageType()); + if (!p.hasStorageType()) + THROW_IE_EXCEPTION << "Cannot make shared blob! 
" + << "The blob type cannot be used to store objects of current precision"; auto blob = std::make_shared>(p, l, dims); blob->set(arg); return blob; @@ -726,7 +768,9 @@ inline typename TBlob::Ptr make_shared_blob(Precision p, Layout l, SizeV */ template inline typename TBlob::Ptr make_shared_blob(Precision p, Layout l, const std::vector &arg) { - IE_ASSERT(p.hasStorageType()); + if (!p.hasStorageType()) + THROW_IE_EXCEPTION << "Cannot make shared blob! " + << "The blob type cannot be used to store objects of current precision"; auto blob = std::make_shared>(p, l); blob->set(arg); return blob; @@ -742,7 +786,9 @@ inline typename TBlob::Ptr make_shared_blob(Precision p, Layout l, const */ template inline typename TBlob::Ptr make_shared_blob(Precision p, const std::vector &arg) { - IE_ASSERT(p.hasStorageType()); + if (!p.hasStorageType()) + THROW_IE_EXCEPTION << "Cannot make shared blob! " + << "The blob type cannot be used to store objects of current precision"; return make_shared_blob(p, TensorDesc::getLayoutByDims(arg), arg); } @@ -758,7 +804,9 @@ inline typename TBlob::Ptr make_shared_blob(Precision p, const std::vect */ template inline typename TBlob::Ptr make_shared_blob(Precision p, Layout l, const SizeVector &dims, TypeTo * ptr, size_t size = 0) { - IE_ASSERT(p.hasStorageType()); + if (!p.hasStorageType()) + THROW_IE_EXCEPTION << "Cannot make shared blob! " + << "The blob type cannot be used to store objects of current precision"; auto blob = std::make_shared>(p, l, dims, ptr, size); return blob; } @@ -774,7 +822,9 @@ inline typename TBlob::Ptr make_shared_blob(Precision p, Layout l, const */ template inline typename TBlob::Ptr make_shared_blob(Precision p, const SizeVector &dims, TypeTo * ptr, size_t size = 0) { - IE_ASSERT(p.hasStorageType()); + if (!p.hasStorageType()) + THROW_IE_EXCEPTION << "Cannot make shared blob! 
" + << "The blob type cannot be used to store objects of current precision"; return make_shared_blob(p, TensorDesc::getLayoutByDims(dims), dims, ptr, size); } diff --git a/inference-engine/include/ie_builders.hpp b/inference-engine/include/ie_builders.hpp index ad2543f..6ab7802 100644 --- a/inference-engine/include/ie_builders.hpp +++ b/inference-engine/include/ie_builders.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -20,7 +20,10 @@ #include #include #include +#include #include +#include +#include #include #include #include @@ -38,7 +41,9 @@ #include #include #include +#include #include +#include #include #include #include diff --git a/inference-engine/include/ie_common.h b/inference-engine/include/ie_common.h index e08c265..7d75eee 100644 --- a/inference-engine/include/ie_common.h +++ b/inference-engine/include/ie_common.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -85,6 +85,9 @@ enum Layout : uint8_t { // weight layouts OIHW = 64, + // Scalar + SCALAR = 95, + // bias layouts C = 96, diff --git a/inference-engine/include/ie_context.hpp b/inference-engine/include/ie_context.hpp index d7aca90..22f6f93 100644 --- a/inference-engine/include/ie_context.hpp +++ b/inference-engine/include/ie_context.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_data.h b/inference-engine/include/ie_data.h index 2088919..0ae2073 100644 --- a/inference-engine/include/ie_data.h +++ b/inference-engine/include/ie_data.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -112,6 +112,13 @@ public: void setLayout(Layout layout); /** + * @brief changes dims and layout at same time + * @param dims new dimensions + * @param layout new layout + */ + void reshape(const SizeVector &dims, Layout layout); + + /** * @brief Gets the layout value for this Data instance */ Layout getLayout() const; diff --git a/inference-engine/include/ie_device.hpp b/inference-engine/include/ie_device.hpp index 2cc67cc..6dc7c4e 100644 --- a/inference-engine/include/ie_device.hpp +++ b/inference-engine/include/ie_device.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -29,7 +29,8 @@ enum class TargetDevice : uint8_t { eMYRIAD = 5, eHDDL = 6, eGNA = 7, - eHETERO = 8 + eHETERO = 8, + eKMB = 9, }; /** @@ -53,7 +54,8 @@ class TargetDeviceInfo { DECL_DEVICE(MYRIAD), DECL_DEVICE(HDDL), DECL_DEVICE(GNA), - DECL_DEVICE(HETERO) + DECL_DEVICE(HETERO), + DECL_DEVICE(KMB) }; #undef DECLARE return g_allDeviceInfos; @@ -69,7 +71,8 @@ class TargetDeviceInfo { { "HDDL", InferenceEngine::TargetDevice::eHDDL }, { "GNA", InferenceEngine::TargetDevice::eGNA }, { "BALANCED", InferenceEngine::TargetDevice::eBalanced }, - { "HETERO", InferenceEngine::TargetDevice::eHETERO } + { "HETERO", InferenceEngine::TargetDevice::eHETERO }, + { "KMB", InferenceEngine::TargetDevice::eKMB } }; auto val = deviceFromNameMap.find(deviceName); return val != deviceFromNameMap.end() ? 
val->second : InferenceEngine::TargetDevice::eDefault; diff --git a/inference-engine/include/ie_error.hpp b/inference-engine/include/ie_error.hpp index a934a78..5016a73 100644 --- a/inference-engine/include/ie_error.hpp +++ b/inference-engine/include/ie_error.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_extension.h b/inference-engine/include/ie_extension.h index 926dbd6..534f018 100644 --- a/inference-engine/include/ie_extension.h +++ b/inference-engine/include/ie_extension.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_icnn_net_reader.h b/inference-engine/include/ie_icnn_net_reader.h index 820c2b4..ce791ed 100644 --- a/inference-engine/include/ie_icnn_net_reader.h +++ b/inference-engine/include/ie_icnn_net_reader.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_icnn_network.hpp b/inference-engine/include/ie_icnn_network.hpp index 07b2444..cf6869b 100644 --- a/inference-engine/include/ie_icnn_network.hpp +++ b/inference-engine/include/ie_icnn_network.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -34,6 +34,8 @@ using OutputsDataMap = std::map; */ class ICNNNetwork : public details::IRelease { public: + using Ptr = std::shared_ptr; + /** * @brief Returns the main network operating precision. * This may be MIXED if not homogeneous. 
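As a usage sketch for the allocator-aware blob creation added in ie_blob.h above: a caller can now pass a custom allocator through the new make_shared_blob(const TensorDesc&, const std::shared_ptr<IAllocator>&) overload. This is illustrative only; the MallocAllocator class below is hypothetical (a minimal malloc-based allocator with no alignment or pooling), and it assumes the IAllocator virtuals declared in ie_allocator.hpp:

    #include <cstdlib>
    #include <memory>
    #include <ie_allocator.hpp>
    #include <ie_blob.h>

    using namespace InferenceEngine;

    // Hypothetical malloc-based allocator; a real one might pool or align memory.
    class MallocAllocator : public IAllocator {
    public:
        void* lock(void* handle, LockOp = LOCK_FOR_WRITE) noexcept override { return handle; }
        void unlock(void*) noexcept override {}
        void* alloc(size_t size) noexcept override { return std::malloc(size); }
        bool free(void* handle) noexcept override { std::free(handle); return true; }
        void Release() noexcept override { delete this; }
    };

    // The blob stores the allocator; memory is only reserved on allocate().
    std::shared_ptr<IAllocator> allocator(new MallocAllocator(),
                                          [](IAllocator* a) { a->Release(); });
    auto blob = make_shared_blob<float>(
        TensorDesc(Precision::FP32, {1, 3, 224, 224}, Layout::NCHW), allocator);
    blob->allocate();  // backing memory comes from MallocAllocator::alloc

The dimensions and precision above are arbitrary example values.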
diff --git a/inference-engine/include/ie_icnn_network_stats.hpp b/inference-engine/include/ie_icnn_network_stats.hpp index 440c202..2547fb6 100644 --- a/inference-engine/include/ie_icnn_network_stats.hpp +++ b/inference-engine/include/ie_icnn_network_stats.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_iexecutable_network.hpp b/inference-engine/include/ie_iexecutable_network.hpp index 0b0a915..f3f4221 100644 --- a/inference-engine/include/ie_iexecutable_network.hpp +++ b/inference-engine/include/ie_iexecutable_network.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -11,6 +11,7 @@ #include "ie_common.h" #include "ie_primitive_info.hpp" #include "ie_iinfer_request.hpp" +#include "ie_icnn_network.hpp" #include "ie_imemory_state.hpp" #include "ie_input_info.hpp" #include @@ -73,7 +74,7 @@ public: virtual StatusCode Export(const std::string& modelFileName, ResponseDesc *resp) noexcept = 0; /** - * @brief Gets the mapping of IR layer names to implemented kernels + * @brief Get the mapping of IR layer names to implemented kernels * @param deployedTopology Map of PrimitiveInfo objects that represent the deployed topology * @param resp Optional: pointer to an already allocated object to contain information in case of failure * @return Status code of the operation: OK (0) for success @@ -81,6 +82,14 @@ public: virtual StatusCode GetMappedTopology(std::map> &deployedTopology, ResponseDesc *resp) noexcept = 0; /** + * @brief Get executable graph information from a device + * @param graphPtr network ptr to store executable graph information + * @param resp Optional: pointer to an already allocated object to contain information in case of failure + * @return Status code of the operation: OK (0) for success + */ + virtual StatusCode GetExecGraphInfo(ICNNNetwork::Ptr &graphPtr, ResponseDesc *resp) noexcept = 0; + + /** * @brief Gets state control interface for given executable network, State control essential for recurrent networks * @param pState reference to a pointer that receives internal states * @param idx requested index for receiving memory state diff --git a/inference-engine/include/ie_iextension.h b/inference-engine/include/ie_iextension.h index c0ea3f8..7d529b4 100644 --- a/inference-engine/include/ie_iextension.h +++ b/inference-engine/include/ie_iextension.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -161,11 +161,21 @@ public: /** * @brief check that reshape can be applied, that parameters and shapes are valid */ - virtual StatusCode inferShapes(const std::vector& inShapes, - const std::map& params, - const std::map& blobs, - std::vector& outShapes, - ResponseDesc* resp) noexcept = 0; + virtual StatusCode inferShapes(const std::vector& /*inBlobs*/, + const std::map& /*params*/, + const std::map& /*blobs*/, + std::vector& /*outShapes*/, + ResponseDesc* /*resp*/) noexcept { return NOT_IMPLEMENTED; } // For backward-compatibility + + /** + * @deprecated + * @brief check that reshape can be applied, that parameters and shapes are valid + */ + virtual StatusCode inferShapes(const std::vector& /*inShapes*/, + const std::map& /*params*/, + const std::map& /*blobs*/, + std::vector& /*outShapes*/, + ResponseDesc* /*resp*/) noexcept { return 
NOT_IMPLEMENTED; } // For backward-compatibility }; /** diff --git a/inference-engine/include/ie_ihetero_plugin.hpp b/inference-engine/include/ie_ihetero_plugin.hpp index 326c350..f9f1f23 100644 --- a/inference-engine/include/ie_ihetero_plugin.hpp +++ b/inference-engine/include/ie_ihetero_plugin.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_iinfer_request.hpp b/inference-engine/include/ie_iinfer_request.hpp index fe09be7..d922f5b 100644 --- a/inference-engine/include/ie_iinfer_request.hpp +++ b/inference-engine/include/ie_iinfer_request.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_imemory_state.hpp b/inference-engine/include/ie_imemory_state.hpp index 2c007df..4240025 100644 --- a/inference-engine/include/ie_imemory_state.hpp +++ b/inference-engine/include/ie_imemory_state.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_input_info.hpp b/inference-engine/include/ie_input_info.hpp index 17f6a67..590b491 100644 --- a/inference-engine/include/ie_input_info.hpp +++ b/inference-engine/include/ie_input_info.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_layers.h b/inference-engine/include/ie_layers.h index 4582842..3e1b9bb 100644 --- a/inference-engine/include/ie_layers.h +++ b/inference-engine/include/ie_layers.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -373,7 +373,7 @@ public: * @param def Default value of the parameter if not found * @return A bool value for the specified parameter */ - bool GetParamsAsBool(const char *param, bool def) const { + bool GetParamAsBool(const char *param, bool def) const { std::string val = GetParamAsString(param, std::to_string(def).c_str()); std::string loweredCaseValue; std::transform(val.begin(), val.end(), std::back_inserter(loweredCaseValue), [](char value) { @@ -384,11 +384,17 @@ public: if (!(std::istringstream(loweredCaseValue) >> std::boolalpha >> result)) { // attempting parse using non alpha bool - return static_cast(GetParamAsInt(param, def)); + return (GetParamAsInt(param, def) != 0); } return result; } + /** + * @deprecated Use GetParamAsBool function for that functionality + */ + bool GetParamsAsBool(const char *param, bool def) const { + return GetParamAsBool(param, def); + } /** * @brief Returns a string value for the given parameter or returns the default one @@ -398,13 +404,26 @@ */ std::string GetParamAsString(const char *param, const char *def) const { auto it = params.find(param); - if (it == params.end()) { + if (it == params.end() || it->second.empty()) { return def; } return (*it).second; } /** + * @brief Checks whether the layer has the specified parameter + * @param param Name of the layer parameter + * @return true if the parameter is present, false otherwise + */ + bool CheckParamPresence(const char *param) const { + auto it = params.find(param); + if (it == params.end()) { + return false; + } + return true; + }
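A short usage sketch for the accessors above; the parameter names "bias" and "auto_pad" are illustrative, not fixed API values:

    // Query optional layer parameters, guarding the throwing accessor
    // with the new CheckParamPresence helper.
    void configure(const InferenceEngine::CNNLayer& layer) {
        // Renamed from GetParamsAsBool; falls back to the default if absent.
        bool withBias = layer.GetParamAsBool("bias", false);
        if (layer.CheckParamPresence("auto_pad")) {
            // Single-argument GetParamAsString throws if the parameter is missing.
            std::string pad = layer.GetParamAsString("auto_pad");
        }
    }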
+ + /** * @brief Returns a string value for the given parameter. * Throws exception if parameter was not found. * @param param Name of the layer parameter @@ -418,10 +437,28 @@ public: return (*it).second; } + /** + * @brief Returns a vector of string values for the given parameter or returns the default value + * @param param Name of the layer parameter + * @param def Default vector of values + * @return A vector of string values + */ + std::vector GetParamAsStrings(const char *param, std::vector def) const { + std::string vals = GetParamAsString(param, ""); + std::vector result; + std::istringstream stream(vals); + std::string str; + if (vals.empty()) + return def; + while (getline(stream, str, ',')) { + try { + result.push_back(str); + } catch (...) { + THROW_IE_EXCEPTION << "Cannot parse parameter " << param << " from IR for layer " << name << "."; + } + } + return result; + } + /** * @brief Map of pairs: (parameter name, parameter value) */ std::map params; + /** * @brief Map of pairs: (name, weights/biases blob) */ @@ -638,6 +675,107 @@ public: PoolingLayer(PoolingLayer &&) = default; }; +/** + * @brief This class represents a standard binary convolution layer + */ +class BinaryConvolutionLayer : public WeightableLayer { +public: + /** + * @enum eBinaryConvolutionMode + * @brief Defines possible modes of binary convolution operation + */ + enum eBinaryConvolutionMode { + xnor_popcount = 0 + }; + + /** + * @brief Mode of binary convolution operation + */ + eBinaryConvolutionMode _mode = xnor_popcount; + + /** + * @brief A number of input feature maps (size) generating the 3rd input dimension + */ + unsigned int _in_depth = 0u; + + /** + * @brief The pad value used to fill the padding area + */ + float _pad_value = -1.0f; + + /** + * @brief A convolution kernel array [X, Y, Z, ...] + */ + DEFINE_PROP(_kernel); + /** + * @brief A convolution paddings begin array [X, Y, Z, ...] + */ + DEFINE_PROP(_padding); + /** + * @brief A convolution paddings end array [X, Y, Z, ...] + */ + PropertyVector _pads_end; + /** + * @brief A convolution strides array [X, Y, Z, ...] + */ + DEFINE_PROP(_stride); + /** + * @brief A convolution dilations array [X, Y, Z, ...] + */ + DEFINE_PROP(_dilation); + /** + * @brief A number of output feature maps (size) generating the 3rd output dimension + */ + unsigned int _out_depth = 0u; + /** + * @brief Number of groups + */ + unsigned int _group = 1u; + /** + * @brief Auto padding type + */ + std::string _auto_pad; + + /** + * @brief Creates a new BinaryConvolutionLayer instance. 
+ */ + explicit BinaryConvolutionLayer(const LayerParams &p) : WeightableLayer(p), + _kernel(2, 0u), _padding(2, 0u), _stride(2, 1u), _dilation(2, 1u) {} + /** + * @brief assignment operator + */ + BinaryConvolutionLayer & operator = (const BinaryConvolutionLayer & that) { + if (&that != this) { + WeightableLayer::operator=(that); + _kernel = that._kernel; + _padding = that._padding; + _pads_end = that._pads_end; + _stride = that._stride; + _dilation = that._dilation; + _out_depth = that._out_depth; + _group = that._group; + _mode = that._mode; + _in_depth = that._in_depth; + _pad_value = that._pad_value; + } + return *this; + } + /** + * @brief move assignment operator + */ + BinaryConvolutionLayer& operator = (BinaryConvolutionLayer &&) = default; + /** + * @brief copy constructor + */ + BinaryConvolutionLayer(const BinaryConvolutionLayer & that) : WeightableLayer(that) { + operator = (that); + } + /** + * @brief move constructor + */ + BinaryConvolutionLayer(BinaryConvolutionLayer &&) = default; +}; + #undef DEFINE_PROP /** @@ -816,6 +954,21 @@ public: using CNNLayer::CNNLayer; }; + +/** + * @brief This class represents a ReLU6 activation layer + * Clamps all tensor elements into the range [0, 6.0] + */ +class ReLU6Layer : public ClampLayer { +public: + explicit ReLU6Layer(const LayerParams &prms) : ClampLayer(prms) { + max_value = 6.0f; + } + + using ClampLayer::ClampLayer; +}; + + /** * @brief This class represents an element wise operation layer */ @@ -826,7 +979,9 @@ public: * @brief Defines possible operations that can be used */ enum eOperation { - Sum = 0, Prod, Max + Sum = 0, Prod, Max, Sub, Min, Div, Squared_diff, Floor_mod, Pow, + Equal, Not_equal, Less, Less_equal, Greater, Greater_equal, + Logical_AND, Logical_OR, Logical_XOR }; /** @@ -963,9 +1118,219 @@ public: }; /** -* @class PReLULayer -* @brief This class represents a Layer which performs Scale and Shift -*/ + * @brief Base class for recurrent cell layers + */ +class RNNCellBase : public WeightableLayer { +public: + using WeightableLayer::WeightableLayer; + + /** + * @brief Direct type of recurrent cell (including subtypes) + * Description of particular cell semantics is in LSTMCell, GRUCell, RNNCell. + */ + enum CellType { + LSTM, /**< Original LSTM cell */ + GRU, /**< Original GRU cell */ + RNN, /**< Original RNN cell */ + GRU_LBR, /**< GRU cell modification. "Linear before reset" */ + }; + + /** @copybrief CellType */ + CellType cellType = LSTM; + + /** + * @brief Size of hidden state data + * + * In case of batch output state tensor will have shape [N, hidden_size] + */ + int hidden_size = 0; + + /** + * @brief Clip data into range [-clip, clip] on input of activations + * + * clip==0.0f means no clipping + */ + float clip = 0.0f; + /** + * @brief Activations used inside recurrent cell + * + * Valid values: sigmoid, tanh, relu + */ + std::vector activations; + + /** + * @brief Alpha parameters of activations + * + * Respective to activation list. + */ + std::vector activation_alpha; + + /** + * @brief Beta parameters of activations + * + * Respective to activation list. + */ + std::vector activation_beta; +}; + +/** + * @brief LSTM Cell layer + * + * G - number of gates (=4) + * N - batch size + * S - state size (=hidden_size) + * + * Inputs: + * [N,D] Xt - input data + * [N,S] Ht-1 - initial hidden state + * [N,S] Ct-1 - initial cell state + * + * Outputs: + * [N,S] Ht - out hidden state + * [N,S] Ct - out cell state + * + * Weights: + * - weights [G,S,D+S] + * - biases [G,S] + * NB! 
gates order is FICO {forget, input, candidate, output} + * + * activations is {_f, _g, _h} + * default: {_f=sigm, _g=tanh, _h=tanh} + * + * Equations: + * + * * - matrix mult + * (.) - eltwise mult + * [,] - concatenation + * + * - ft = _f(Wf*[Ht-1, Xt] + Bf) + * - it = _f(Wi*[Ht-1, Xt] + Bi) + * - ct = _g(Wc*[Ht-1, Xt] + Bc) + * - ot = _f(Wo*[Ht-1, Xt] + Bo) + * - Ct = ft (.) Ct-1 + it (.) ct + * - Ht = ot (.) _h(Ct) + */ +using LSTMCell = RNNCellBase; + +/** + * @brief GRU Cell layer + * + * G - number of gates (=3) + * N - batch size + * S - state size (=hidden_size) + * + * Inputs: + * [N,D] Xt - input data + * [N,S] Ht-1 - initial hidden state + * + * Outputs: + * [N,S] Ht - out hidden state + * + * Weights: + * - weights [G,S,D+S] + * - biases [G,S] + * NB! gates order is ZRH {update, reset, output} + * + * activations is {_f, _g} + * default: {_f=sigm, _g=tanh} + * + * Equations: + * + * * - matrix mult + * (.) - eltwise mult + * [,] - concatenation + * + * - zt = _f(Wz*[Ht-1, Xt] + Bz) + * - rt = _f(Wr*[Ht-1, Xt] + Br) + * - ht = _g(Wh*[rt (.) Ht-1, Xt] + Bh) + * - Ht = (1 - zt) (.) ht + zt (.) Ht-1 + */ +using GRUCell = RNNCellBase; + +/** + * @brief RNN Cell layer + * + * G - number of gates (=1) + * N - batch size + * S - state size (=hidden_size) + * + * Inputs: + * [N,D] Xt - input data + * [N,S] Ht-1 - initial hidden state + * + * Outputs: + * [N,S] Ht - out hidden state + * + * Weights: + * - weights [G,S,D+S] + * - biases [G,S] + * + * activations is {_f} + * default: {_f=tanh} + * + * Equations: + * + * * - matrix mult + * [,] - concatenation + * + * - Ht = _f(Wi*[Ht-1, Xt] + Bi) + */ +using RNNCell = RNNCellBase; + +/** + * @brief Sequence of recurrent cells + * + * N - batch size + * T - sequence size + * S - state size (=hidden_size) + * NS - num of state tensors (LSTM=2, GRU/RNN=1) + * ND - num of directions (BDR=2, FWD/BWD=1) + * + * Inputs: + * [N,T,D] Xt - input data + * [ND,N,S] Ht-1 - initial hidden state + * [ND,N,S] Ct-1 - initial cell state // if NS==2 + * + * Outputs: + * [ND,N,T,S] Yt - out data + * [ND,N,S] Ht - out hidden state + * [ND,N,S] Ct - out cell state // if NS==2 + * + * NB! if axis==0 batch and sequence dimensions are swapped (N <-> T) for input and output tensors + * + * Weights: + * - weights [ND,G,S,D+S] + * - biases [ND,G,S] + * NB! if ND==2 weights are concatenated cell weights [forward_cell_weights, backward_cell_weights] + * + */ +class RNNSequenceLayer : public RNNCellBase { +public: + using RNNCellBase::RNNCellBase; + + /** + * @brief An axis by which iteration is performed + * axis=0 means first input/output data blob dimension is sequence + * axis=1 means first input/output data blob dimension is batch + */ + unsigned int axis = 1; + + /** + * @brief Direction of iteration through sequence dimension + */ + enum Direction { + FWD, /**< Forward mode. Iteration starts from index 0 with step 1. */ + BWD, /**< Backward mode. Iteration starts from the last index with step -1. */ + BDR /**< Bidirectional mode. First is forward pass, second is backward. */ + }; + + /** @copybrief Direction */ + Direction direction = FWD; +}; + +/** + * @brief This class represents a layer which performs the Parametric ReLU (PReLU) activation + */ class PReLULayer : public WeightableLayer { public: /** @@ -975,9 +1340,9 @@ public: /** - * @brief A default constructor. 
Creates a new PReLULayer instance and initializes layer parameters with the given values. + * @param prms Initial layer parameters + */ explicit PReLULayer(const LayerParams &prms) : WeightableLayer(prms), _channel_shared(false) {} }; @@ -1101,4 +1466,205 @@ public: */ using CNNLayer::CNNLayer; }; + +/** + * @brief This class represents a standard Strided Slice layer + * Strided Slice picks from the input tensor according to its parameters + */ +class StridedSliceLayer : public CNNLayer { +public: + /** + * @brief The begin_mask is a bitmask where bit i being 0 means + * to ignore the begin value and instead use the default value + */ + std::string begin_mask; + /** + * @brief Analogous to begin_mask + */ + std::string end_mask; + /** + * @brief The ellipsis_mask is a bitmask where bit i being 1 means + * the i-th position is actually an ellipsis + */ + std::string ellipsis_mask; + /** + * @brief The new_axis_mask is a bitmask where bit i being 1 means + * the i-th position creates a new 1 dimension shape + */ + std::string new_axis_mask; + /** + * @brief The shrink_axis_mask is a bitmask where bit i being 1 means + * the i-th position shrinks the dimensionality + */ + std::string shrink_axis_mask; + + /** + * @brief Creates a new StridedSliceLayer instance. + */ + using CNNLayer::CNNLayer; +}; + +/** +* @brief This class represents a standard Shuffle Channels layer +* Shuffle Channels picks from the input tensor according to its parameters +*/ +class ShuffleChannelsLayer : public CNNLayer { +public: + /** + * @brief The axis in tensor to shuffle channels + */ + int axis = 1; + + /** + * @brief The number of groups for channel shuffling + */ + unsigned int group = 1; + + /** + * @brief Creates a new ShuffleChannelsLayer instance. + */ + using CNNLayer::CNNLayer; +}; + + +/** +* @brief This class represents a standard Depth To Space layer +* Depth To Space picks from the input tensor according to its parameters +*/ +class DepthToSpaceLayer : public CNNLayer { +public: + /** + * @brief The block size by which depth data is rearranged into spatial dimensions + */ + unsigned int block_size = 1; + + /** + * @brief Creates a new DepthToSpaceLayer instance. + */ + using CNNLayer::CNNLayer; +}; + + +/** +* @brief This class represents a standard Space To Depth layer +* Space To Depth picks from the input tensor according to its parameters +*/ +class SpaceToDepthLayer : public CNNLayer { +public: + /** + * @brief The block size by which spatial data is rearranged into depth + */ + unsigned int block_size = 1; + + /** + * @brief Creates a new SpaceToDepthLayer instance. + */ + using CNNLayer::CNNLayer; +}; + + +/** +* @brief This class represents a standard Reverse Sequence layer +* Reverse Sequence modifies the input tensor according to its parameters +*/ +class ReverseSequenceLayer : public CNNLayer { +public: + /** + * @brief The seq_axis dimension in tensor which is partially reversed + */ + int seq_axis = 1; + + /** + * @brief The batch_axis dimension in tensor along which reversal is performed + */ + int batch_axis = 0; + + /** + * @brief Creates a new ReverseSequence instance. + */ + using CNNLayer::CNNLayer; +}; + + +/** +* @brief This class represents a standard Squeeze layer +* Squeeze modifies input tensor dimensions according to its parameters +*/ +class SqueezeLayer : public CNNLayer { +public: + /** + * @brief Creates a new Squeeze instance. + */ + using CNNLayer::CNNLayer; +}; + + +/** +* @brief This class represents a standard Unsqueeze layer +* Unsqueeze modifies input tensor dimensions according to its parameters +*/ +class UnsqueezeLayer : public CNNLayer { +public: + /** + * @brief Creates a new Unsqueeze instance. + */ + using CNNLayer::CNNLayer; +}; + + +/** +* @brief This class represents a standard Range layer +* Range modifies input tensor dimensions according to its parameters +*/ +class RangeLayer : public CNNLayer { +public: + /** + * @brief Creates a new RangeLayer instance. + */ + using CNNLayer::CNNLayer; +}; + + +/** +* @brief This class represents a standard Fill layer +* Fill modifies the input tensor according to its parameters +*/ +class FillLayer : public CNNLayer { +public: + /** + * @brief Creates a new Fill instance. + */ + using CNNLayer::CNNLayer; +}; + + +/** +* @brief This class represents a standard Expand layer +* Expand modifies input tensor dimensions according to its parameters +*/ +class ExpandLayer : public CNNLayer { +public: + /** + * @brief Creates a new Expand instance. + */ + using CNNLayer::CNNLayer; +}; + +/** + * @brief This class represents a quantization operation layer + * Element-wise linear quantization of floating point input values into a discrete set of floating point values + */ +class QuantizeLayer : public CNNLayer { +public: + /** + * @brief The number of quantization levels + */ + int levels = 1; + + /** + * @brief Creates a new QuantizeLayer instance. + */ + using CNNLayer::CNNLayer; +}; + } // namespace InferenceEngine
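A minimal sketch of how a plugin or analysis pass might consume the new layer classes added to ie_layers.h above; the describe() helper is illustrative, and it assumes the usual shared-pointer CNNLayerPtr alias for layers parsed from an IR:

    #include <string>
    #include <ie_layers.h>

    using namespace InferenceEngine;

    // Dispatch on the concrete layer type to read type-specific attributes.
    std::string describe(const CNNLayerPtr& layer) {
        if (auto* q = dynamic_cast<QuantizeLayer*>(layer.get()))
            return "Quantize with " + std::to_string(q->levels) + " levels";
        if (auto* sc = dynamic_cast<ShuffleChannelsLayer*>(layer.get()))
            return "ShuffleChannels on axis " + std::to_string(sc->axis) +
                   ", group " + std::to_string(sc->group);
        if (auto* rnn = dynamic_cast<RNNSequenceLayer*>(layer.get()))
            return rnn->direction == RNNSequenceLayer::FWD ? "forward RNN sequence"
                                                           : "backward or bidirectional RNN sequence";
        return layer->type;  // fall back to the IR type string
    }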
diff --git a/inference-engine/include/ie_layers_property.hpp b/inference-engine/include/ie_layers_property.hpp index 52d434c..eeac6b6 100644 --- a/inference-engine/include/ie_layers_property.hpp +++ b/inference-engine/include/ie_layers_property.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -47,6 +47,13 @@ public: } } + PropertyVector(std::initializer_list init_list) { + size_t i = 0; + for (const auto val : init_list) { + insert(i++, val); + } + } + /** * @brief allows access up-to capacity size * @param index diff --git a/inference-engine/include/ie_layouts.h b/inference-engine/include/ie_layouts.h index f4c0e4d..740da27 100644 --- a/inference-engine/include/ie_layouts.h +++ b/inference-engine/include/ie_layouts.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -219,6 +219,9 @@ public: void setLayout(Layout l) { bool inconsistentLayout = true; switch (l) { + case Layout::SCALAR: + inconsistentLayout = !dims.empty(); + break; case Layout::C: inconsistentLayout = dims.size() != 1; break; @@ -246,7 +249,7 @@ public: break; } if (inconsistentLayout) - THROW_IE_EXCEPTION << "Dims(" << std::to_string(dims.size()) << ") and format(" << std::to_string(l) << ") are inconsistent."; + THROW_IE_EXCEPTION << "Size of dims(" << std::to_string(dims.size()) << ") and format(" << l << ") are inconsistent."; layout = l; } diff --git a/inference-engine/include/ie_locked_memory.hpp b/inference-engine/include/ie_locked_memory.hpp index 59e81f0..d0ddb9b 100644 --- a/inference-engine/include/ie_locked_memory.hpp +++ b/inference-engine/include/ie_locked_memory.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_inetwork.hpp b/inference-engine/include/ie_network.hpp similarity index 70% rename from inference-engine/include/ie_inetwork.hpp rename to inference-engine/include/ie_network.hpp index 41c02f0..b33e779 100644 --- a/inference-engine/include/ie_inetwork.hpp +++ b/inference-engine/include/ie_network.hpp @@ -1,4 +1,4 @@ -//
Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -134,77 +134,152 @@ private: }; /** + * This class describes port data + */ +class INFERENCE_ENGINE_API_CLASS(PortData) { +public: + /** + * @brief A shared pointer to the PortData object. + */ + using Ptr = std::shared_ptr; + + /** + * @brief Default constructor + */ + PortData(); + + /** + * Creates port data with precision and shape + * @param shape Dimensions + * @param precision Precision + */ + PortData(const SizeVector& shape, const Precision& precision); + + /** + * @brief virtual destructor + */ + virtual ~PortData() = default; + + /** + * @brief Returns data + * @return Blob with data + */ + const Blob::Ptr& getData() const; + + /** + * @brief Sets data + * @param data Blob with data + */ + void setData(const Blob::Ptr& data); + + /** + * @brief Returns data parameters + * @return Map of parameters + */ + const std::map& getParameters() const noexcept; + + /** + * @brief Sets new shapes for data + * @param shape New shapes + */ + void setShape(const SizeVector& shape); + +private: + Blob::Ptr data; + std::map parameters; + + void createData(const TensorDesc& desc); +}; + +/** * @brief This class is the main object to describe the Inference Engine port. */ -class Port { +class INFERENCE_ENGINE_API_CLASS(Port) { public: /** * @brief Default constructor of a port object. */ - Port() = default; + Port(); /** * @brief Constructor of a port object with shapes. * @param shapes port shapes + * @param precision Port precision */ - explicit Port(const SizeVector& shapes): pShapes(shapes) {} + explicit Port(const SizeVector& shapes, + const Precision& precision = Precision::UNSPECIFIED); /** * @brief Copy constructor. * @param port object to copy */ - Port(const Port& port) { - this->pShapes = port.pShapes; - } + Port(const Port& port); + + /** + * @brief Virtual destructor + */ + virtual ~Port() = default; + + /** + * @brief Compares the given Port with the current one + * @param rhs Port to compare with + * @return true if the given Port is equal to the current one, false - otherwise + */ + bool operator== (const Port& rhs) const; + + /** + * @brief Compares the given Port with the current one + * @param rhs Port to compare with + * @return true if the given Port is NOT equal to the current one, false - otherwise + */ + bool operator!= (const Port& rhs) const; /** * @brief Returns a constant reference to a vector with shapes. * Shapes should be initialized if shape is empty. * @return constant reference to shapes */ - const SizeVector& shape() const noexcept { - return pShapes; - } + const SizeVector& shape() const noexcept; /** - * @brief Returns a reference to a vector with shapes. - * Shapes should be initialized if shape is empty. - * @return reference to shapes + * @brief Sets new shapes for current port + * @param shape New shapes */ - SizeVector& shape() noexcept { - return pShapes; - } + void setShape(const SizeVector& shape); -private: - SizeVector pShapes; -}; + /** + * @brief Returns a constant reference to parameters + * @return Map with parameters + */ + const std::map& getParameters() const noexcept; -/** - * @brief This class is the main interface to describe the Inference Engine layer parameters. - * All methods here are constant and do not throw exceptions. - */ -class IParameters { -public: /** - * @brief A shared pointer to the IParameters object. 
+ * @brief Sets new parameters for current port + * @param params New parameters */ - using Ptr = std::shared_ptr; + void setParameters(const std::map& params) noexcept; /** - * @brief Virtual destructor for the parameters interface + * @brief Sets the new parameter for current port + * @param name Name of parameter + * @param param New value */ - virtual ~IParameters() = default; + void setParameter(const std::string& name, const Parameter& param); /** - * @brief Returns a constant reference to a map with parameters. - * @return Map of parameters + * @brief Returns port data + * @return Port data */ - virtual const std::map& getParameters() const noexcept = 0; + const PortData::Ptr& getData() const noexcept; /** - * @brief Returns a constant reference to a constant pointers to constant data. - * @return Map of constant pointers to constant data + * @brief Sets new port data for current port + * @param data Port data */ - virtual const std::map& getConstantData() const noexcept = 0; + void setData(const PortData::Ptr& data); + +private: + std::map parameters; + PortData::Ptr data; }; class INetwork; @@ -218,10 +293,6 @@ class INetwotkIterator; class ILayer { public: /** - * @brief A shared pointer to the ILayer object - */ - using Ptr = std::shared_ptr; - /** * @brief A shared pointer to the const ILayer object */ using CPtr = std::shared_ptr; @@ -250,16 +321,10 @@ public: virtual const std::string& getType() const noexcept = 0; /** - * @brief Returns a constant smart pointer reference to a Network interface. - * @return Network interface smart pointer - */ - virtual const std::shared_ptr& getGraph() const noexcept = 0; - - /** * @brief Returns a constant smart pointer reference to a Parameters interface. * @return Parameters interface smart pointer */ - virtual const IParameters::Ptr& getParameters() const noexcept = 0; + virtual const std::map& getParameters() const noexcept = 0; /** * @brief Returns a constant reference to a vector with input ports. @@ -289,11 +354,11 @@ class INetworkIterator; class INetwork { public: /** - * @brief A shared pointer to the INetwork object. + * @brief A shared pointer to the constant INetwork object. */ - using Ptr = std::shared_ptr; + using CPtr = std::shared_ptr; /** - * @brief A constant iterator for INetwork objects definition + * @brief A constant iterator for INetwork definition */ using const_iterator = details::INetworkIterator; @@ -326,19 +391,19 @@ public: * @param id Id of the Layer * @return Layer interface smart pointer */ - virtual const ILayer::Ptr getLayer(idx_t id) const noexcept = 0; + virtual const ILayer::CPtr getLayer(idx_t id) const noexcept = 0; /** * @brief Returns a constant vector of input layers. * @return Vector of input layers */ - virtual const std::vector getInputs() const noexcept = 0; + virtual const std::vector getInputs() const noexcept = 0; /** * @brief Returns a constant vector of output layers. * @return Vector of output layers */ - virtual const std::vector getOutputs() const noexcept = 0; + virtual const std::vector getOutputs() const noexcept = 0; /** * @brief Returns a constant vector of connections for specific layer. 
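A minimal sketch of the reworked Port API introduced in ie_network.hpp above; the shape, precision, and parameter values are illustrative only:

    #include <string>
    #include <ie_network.hpp>

    using namespace InferenceEngine;

    // Ports now carry a precision and arbitrary typed parameters in addition to shapes.
    Port input({1, 3, 224, 224}, Precision::FP32);
    input.setParameter("name", std::string("data"));  // Parameter holds typed values

    Port other(input);               // copying keeps shape, precision and parameters
    bool same = (input == other);    // new comparison operators

    input.setShape({1, 3, 299, 299});  // reshaping updates the underlying PortData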
diff --git a/inference-engine/include/ie_parallel.hpp b/inference-engine/include/ie_parallel.hpp index 4dbd3f4..af72214 100644 --- a/inference-engine/include/ie_parallel.hpp +++ b/inference-engine/include/ie_parallel.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -149,8 +149,8 @@ R parallel_sum(const T0 D0, R &input, F func) { #if IE_THREAD == IE_THREAD_OMP #pragma omp parallel for reduction(+ : sum) schedule(static) #endif - for (T0_IT dim1 = 0; dim1 < D0; dim1++) { - sum += func(dim1); + for (T0_IT dim1 = 0; dim1 < static_cast(D0); dim1++) { + sum += static_cast(func(dim1)); } return sum; #endif @@ -230,9 +230,9 @@ R parallel_sum3d(const T0 D0, const T1 D1, const T2 D2, R input, F func) { #if IE_THREAD == IE_THREAD_OMP #pragma omp parallel for collapse(3) reduction(+ : sum) schedule(static) #endif - for (T0_IT dim1 = 0; dim1 < D0; dim1++) { - for (T1_IT dim2 = 0; dim2 < D1; dim2++) { - for (T2_IT dim3 = 0; dim3 < D2; dim3++) { + for (T0_IT dim1 = 0; dim1 < static_cast(D0); dim1++) { + for (T1_IT dim2 = 0; dim2 < static_cast(D1); dim2++) { + for (T2_IT dim3 = 0; dim3 < static_cast(D2); dim3++) { sum += func(dim1, dim2, dim3); } } diff --git a/inference-engine/include/ie_parameter.hpp b/inference-engine/include/ie_parameter.hpp index 59526ad..e30d83d 100644 --- a/inference-engine/include/ie_parameter.hpp +++ b/inference-engine/include/ie_parameter.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -10,10 +10,13 @@ #include
#include +#include #include +#include #include #include #include +#include #include namespace InferenceEngine { @@ -29,337 +32,245 @@ public: Parameter() = default; /** - * @brief The constructor creates a Parameter object with string value - * @param value string value + * @brief Move constructor + * @param parameter Parameter object */ - Parameter(const std::string& value): initialized(true), value(value) {} // NOLINT + Parameter(Parameter &¶meter) noexcept: ptr(std::move(parameter.ptr)) {} /** - * @brief The constructor creates a Parameter object with template value - * @param value template value + * @brief Copy constructor + * @param parameter Parameter object */ - template - Parameter(const T& value): initialized(true), value(std::to_string(value)) {} // NOLINT + Parameter(const Parameter ¶meter) { + *this = parameter; + } /** - * @brief The constructor creates a Parameter object with a vector of template values - * @param values vector of template values + * @brief Constructor creates parameter with object + * @tparam T Parameter type + * @tparam U Identity type-transformation + * @param parameter object */ - template - Parameter(const std::vector& values): initialized(true) { // NOLINT - for (const auto& val : values) { - if (!value.empty()) - value += ","; - value += std::to_string(val); - } + template + Parameter(T&& parameter) { // NOLINT + ptr = new RealData::type>(std::forward(parameter)); } /** - * @brief The cast to string object - * Throws exception if parameter was not found. - * @return string value + * @brief Constructor creates string parameter from char * + * @param str char array */ - operator std::string() const { // NOLINT - return asString(); - } + Parameter(const char *str): Parameter(std::string(str)) {} // NOLINT /** - * @brief Returns a string value for the given parameter or returns the default one - * @param def Default value of the parameter if not found - * @return A string value + * @brief Destructor */ - std::string asString(std::string def) const { - if (!initialized) { - return def; - } - return value; + virtual ~Parameter() { + clear(); } /** - * @brief Returns a string value for the given parameter. - * Throws exception if parameter was not found. - * @return A string value + * Copy operator for Parameter + * @param parameter Parameter object + * @return Parameter */ - std::string asString() const { - if (!initialized) { - THROW_IE_EXCEPTION << "Parameter was not initialized!"; + Parameter& operator=(const Parameter& parameter) { + if (this == ¶meter) { + return *this; } - return value; + clear(); + if (!parameter.empty()) + ptr = parameter.ptr->copy(); + return *this; } /** - * @brief Gets float value for the given parameter - * @param def - default value of the parameter if not found - * @return float value + * Remove a value from parameter */ - float asFloat(float def) const { - std::string val = asString(std::to_string(def)); - try { - return std::stof(val); - } catch (...) { - THROW_IE_EXCEPTION << "Value " << val << " cannot be casted to float."; - } + void clear() { + delete ptr; + ptr = nullptr; } /** - * @brief Returns a float value for the given layer parameter - * @return A float value for the specified parameter + * Checks that parameter contains a value + * @return false if parameter contains a value else false */ - float asFloat() const { - std::string val = asString(); - try { - return std::stof(val); - } catch (...) 
{ - THROW_IE_EXCEPTION << "Value " << val << " cannot be casted to float."; - } + bool empty() const noexcept { + return nullptr == ptr; } /** - * @brief Returns a vector of float values for the given parameter or returns the default value - * @param def Default value of the parameter if not found - * @return vector of float values + * Checks the type of value + * @tparam T Type of value + * @return true if type of value is correct */ - std::vector asFloats(std::vector def) const { - std::string vals = asString(""); - std::vector result; - std::istringstream stream(vals); - std::string str; - if (vals.empty()) - return def; - while (getline(stream, str, ',')) { - try { - result.push_back(std::stof(str)); - } catch (...) { - THROW_IE_EXCEPTION << "Value " << vals << " cannot be casted to floats."; - } - } - return result; + template + bool is() const { + return empty() ? false : ptr->is(typeid(T)); } /** - * @brief Returns a vector of float values for the given parameter - * @return vector of float values + * Dynamic cast to specified type + * @tparam T type + * @return casted object */ - std::vector asFloats() const { - std::string vals = asString(); - std::vector result; - std::istringstream stream(vals); - std::string str; - while (getline(stream, str, ',')) { - try { - result.push_back(std::stof(str)); - } catch (...) { - THROW_IE_EXCEPTION << "Value " << vals << " cannot be casted to floats."; - } - } - return result; + template + T &&as() && { + return std::move(dyn_cast(ptr)); } /** - * @brief Returns an integer value for the given parameter or returns the default value - * @param def Default value of the parameter if not found - * @return An int value for the specified parameter + * Dynamic cast to specified type + * @tparam T type + * @return casted object */ - int asInt(int def) const { - std::string val = asString(std::to_string(def)); - try { - return std::stoi(val); - } catch (...) { - THROW_IE_EXCEPTION << "Value " << val << " cannot be casted to int."; - } + template + T& as() & { + return dyn_cast(ptr); } - /** - * @brief Returns an integer value for the given parameter - * @return An int value for the specified parameter + * Dynamic cast to specified type + * @tparam T type + * @return casted object */ - int asInt() const { - std::string val = asString(); - try { - return std::stoi(val); - } catch (...) { - THROW_IE_EXCEPTION << "Value " << val << " cannot be casted to int."; - } + template + const T& as() const & { + return dyn_cast(ptr); } - /** - * @brief Returns a vector of int values for the given parameter or returns the default value - * @param def Default value of the parameter if not found - * @return vector of int values + * Dynamic cast to specified type + * @tparam T type + * @return casted object */ - std::vector asInts(std::vector def) const { - std::string vals = asString(""); - std::vector result; - std::istringstream stream(vals); - std::string str; - if (vals.empty()) - return def; - while (getline(stream, str, ',')) { - try { - result.push_back(std::stoi(str)); - } catch (...) 
{ - THROW_IE_EXCEPTION << "Value " << vals << " cannot be casted to ints."; - } - } - return result; + template + operator T&&() && { + return std::move(dyn_cast::type>(ptr)); } /** - * @brief Returns a vector of int values for the given parameter - * @return vector of int values + * Dynamic cast to specified type + * @tparam T type + * @return casted object */ - std::vector asInts() const { - std::string vals = asString(); - std::vector result; - std::istringstream stream(vals); - std::string str; - while (getline(stream, str, ',')) { - try { - result.push_back(std::stoi(str)); - } catch (...) { - THROW_IE_EXCEPTION << "Value " << vals << " cannot be casted to ints."; - } - } - return result; + template + operator T&() & { + return dyn_cast::type>(ptr); } + /** - * @brief Returns an unsigned integer value for the given parameter or returns the default value - * @param def Default value of the parameter if not found - * @return An unsigned integer value for the specified parameter + * Dynamic cast to specified type + * @tparam T type + * @return casted object */ - unsigned int asUInt(unsigned int def) const { - std::string val = asString(std::to_string(def)); - std::string message = "Value " + val + " cannot be casted to unsigned int."; - try { - int value = std::stoi(val); - if (value < 0) { - THROW_IE_EXCEPTION << message; - } - return static_cast(value); - } catch (...) { - THROW_IE_EXCEPTION << message; - } + template operator const T&() const & { + return dyn_cast::type>(ptr); } /** - * @brief Returns an unsigned integer value for the given parameter - * @return An unsigned integer value for the specified parameter + * Dynamic cast to specified type + * @tparam T type + * @return casted object */ - unsigned int asUInt() const { - std::string val = asString(); - std::string message = "Value " + val + " cannot be casted to unsigned int."; - try { - int value = std::stoi(val); - if (value < 0) { - THROW_IE_EXCEPTION << message; - } - return static_cast(value); - } catch (...) { - THROW_IE_EXCEPTION << message; - } + template operator T&() const & { + return dyn_cast::type>(ptr); } - /** - * @brief Returns a vector of unsigned int values for the given parameter or returns the default value - * @param def Default value of the parameter if not found - * @return vector of unsigned int values + * @brief The comparison operator for the Parameter + * @param rhs object to compare + * @return true if objects are equal */ - std::vector asUInts(std::vector def) const { - std::string vals = asString(""); - std::vector result; - std::istringstream stream(vals); - std::string str; - std::string message = "Value " + vals + " cannot be casted to unsigned ints."; - if (vals.empty()) - return def; - while (getline(stream, str, ',')) { - try { - int value = std::stoi(str); - if (value < 0) { - THROW_IE_EXCEPTION << message; - } - result.push_back(static_cast(value)); - } catch (...) 
{ - THROW_IE_EXCEPTION << message; - } - } - return result; + bool operator == (const Parameter& rhs) const { + return *ptr == *(rhs.ptr); } - /** - * @brief Returns a vector of unsigned int values for the given parameter - * @return vector of unsigned int values + * @brief The comparison operator for the Parameter + * @param rhs object to compare + * @return true if objects aren't equal */ - std::vector asUInts() const { - std::string vals = asString(); - std::vector result; - std::istringstream stream(vals); - std::string str; - std::string message = "Value " + vals + " cannot be casted to unsigned ints."; - while (getline(stream, str, ',')) { - try { - int value = std::stoi(str); - if (value < 0) { - THROW_IE_EXCEPTION << message; - } - result.push_back(static_cast(value)); - } catch (...) { - THROW_IE_EXCEPTION << message; - } - } - return result; + bool operator != (const Parameter& rhs) const { + return !(*this == rhs); } - /** - * @brief Returns an boolean value for the given parameter. - * The valid values are (true, false, 1, 0). - * @param def Default value of the parameter if not found - * @return An bool value for the specified parameter - */ - bool asBool(bool def) const { - std::string val = asString(std::to_string(def)); - std::string loweredCaseValue; - std::transform(val.begin(), val.end(), std::back_inserter(loweredCaseValue), [](char value) { - return std::tolower(value); - }); - - bool result = false; - - if (!(std::istringstream(loweredCaseValue) >> std::boolalpha >> result)) { - // attempting parse using non alpha bool - return static_cast(asInt(def)); +private: + template + struct CheckOperatorEqual { + template + static auto test(U*) -> decltype(std::declval() == std::declval()) { + return false; } - return result; - } + template + static auto test(...) -> std::false_type { + return {}; + } - /** - * @brief Returns an boolean value for the given parameter. - * The valid values are (true, false, 1, 0). 
- * @return An bool value for the specified parameter - */ - bool asBool() const { - std::string val = asString(); - std::string loweredCaseValue; - std::transform(val.begin(), val.end(), std::back_inserter(loweredCaseValue), [](char value) { - return std::tolower(value); - }); - - bool result = false; - - if (!(std::istringstream(loweredCaseValue) >> std::boolalpha >> result)) { - // attempting parse using non alpha bool - return static_cast(asInt()); + using type = typename std::is_same(nullptr))>::type; + }; + + template + struct HasOperatorEqual : CheckOperatorEqual::type {}; + + struct Any { + virtual ~Any() = default; + virtual bool is(const std::type_info&) const = 0; + virtual Any *copy() const = 0; + virtual bool operator==(const Any& rhs) const = 0; + }; + + template + struct RealData: Any, std::tuple { + using std::tuple::tuple; + + bool is(const std::type_info& id) const override { + return id == typeid(T); + } + Any *copy() const override { + return new RealData{get()}; + } + + T& get() & { + return std::get<0>(*this); } - return result; + const T& get() const & { + return std::get<0>(*this); + } + + template + typename std::enable_if::value, bool>::type + equal(const Any& left, const Any& rhs) const { + THROW_IE_EXCEPTION << "Parameter doesn't contain equal operator"; + } + + template + typename std::enable_if::value, bool>::type + equal(const Any& left, const Any& rhs) const { + return dyn_cast(&left) == dyn_cast(&rhs); + } + + bool operator==(const Any& rhs) const override { + return rhs.is(typeid(T)) && equal(*this, rhs); + } + }; + + template + static T &dyn_cast(Any* obj) { + if (obj == nullptr) + THROW_IE_EXCEPTION << "Parameter is empty!"; + return dynamic_cast&>(*obj).get(); } -private: - bool initialized; - std::string value; + template + static const T &dyn_cast(const Any* obj) { + if (obj == nullptr) + THROW_IE_EXCEPTION << "Parameter is empty!"; + return dynamic_cast &>(*obj).get(); + } + + Any *ptr = nullptr; }; } // namespace InferenceEngine diff --git a/inference-engine/include/ie_plugin.hpp b/inference-engine/include/ie_plugin.hpp index 5623dd6..2712f1f 100644 --- a/inference-engine/include/ie_plugin.hpp +++ b/inference-engine/include/ie_plugin.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_plugin_config.hpp b/inference-engine/include/ie_plugin_config.hpp index 0e3397d..028b404 100644 --- a/inference-engine/include/ie_plugin_config.hpp +++ b/inference-engine/include/ie_plugin_config.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_plugin_dispatcher.hpp b/inference-engine/include/ie_plugin_dispatcher.hpp index 60d729d..b041d07 100644 --- a/inference-engine/include/ie_plugin_dispatcher.hpp +++ b/inference-engine/include/ie_plugin_dispatcher.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -23,7 +23,7 @@ public: * @brief A constructor * @param pp Vector of paths to plugin directories */ - explicit PluginDispatcher(const std::vector &pp) : pluginDirs(pp) {} + explicit PluginDispatcher(const std::vector &pp = {file_name_t()}) : pluginDirs(pp) {} /** * @brief Loads a plugin from plugin directories diff --git a/inference-engine/include/ie_plugin_ptr.hpp 
b/inference-engine/include/ie_plugin_ptr.hpp index 6c10cf5..84f2a20 100644 --- a/inference-engine/include/ie_plugin_ptr.hpp +++ b/inference-engine/include/ie_plugin_ptr.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_precision.hpp b/inference-engine/include/ie_precision.hpp index d50fe5c..8726ae6 100644 --- a/inference-engine/include/ie_precision.hpp +++ b/inference-engine/include/ie_precision.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -30,6 +30,7 @@ public: I8 = 50, /**< 8bit signed integer value */ U16 = 60, /**< 16bit unsigned integer value */ I32 = 70, /**< 32bit signed integer value */ + BIN = 71, /**< 1bit integer value */ CUSTOM = 80 /**< custom precision has its own name and size of elements */ }; @@ -79,11 +80,13 @@ public: return Precision(8 * sizeof(T), typeName == nullptr ? typeid(T).name() : typeName); } - /** @brief checks whether given storage class T can be used for store objects of current precision */ + /** @brief checks whether given storage class T can be used to store objects of current precision */ template bool hasStorageType(const char * typeName = nullptr) const noexcept { - if (sizeof(T) != size()) { - return false; + if (precisionInfo.value != BIN) { + if (sizeof(T) != size()) { + return false; + } } #define CASE(x, y) case x: return std::is_same() #define CASE2(x, y1, y2) case x: return std::is_same() || std::is_same() @@ -97,6 +100,7 @@ public: CASE(U8, uint8_t); CASE(I8, int8_t); CASE2(Q78, int16_t, uint16_t); + CASE2(BIN, int8_t, uint8_t); default : return areSameStrings(name(), typeName == nullptr ? typeid(T).name() : typeName); #undef CASE #undef CASE2 @@ -159,6 +163,7 @@ public: PRECISION_NAME(FP32), PRECISION_NAME(FP16), PRECISION_NAME(MIXED), + PRECISION_NAME(BIN), #undef PRECISION_NAME }; auto i = names.find(str); @@ -210,6 +215,7 @@ public: CASE(I8); CASE(Q78); CASE(MIXED); + CASE(BIN); default : return makePrecisionInfo("UNSPECIFIED"); #undef CASE } @@ -257,6 +263,10 @@ template<> struct PrecisionTrait { using value_type = int32_t; }; +template<> +struct PrecisionTrait { + using value_type = int8_t; +}; template inline uint8_t type_size_or_zero() { @@ -295,7 +305,9 @@ template inline Precision::PrecisionInfo Precision::makePrecisionInfo(const char *name) { Precision::PrecisionInfo info; info.name = name; - info.bitsSize = 8 * type_size_or_zero::value_type>(); + + int nBits = precision == BIN ? 
1 : 8; + info.bitsSize = nBits * type_size_or_zero::value_type>(); info.isFloat = is_floating(); info.value = precision; return info; diff --git a/inference-engine/include/ie_preprocess.hpp b/inference-engine/include/ie_preprocess.hpp index 1b984ff..0a969ee 100644 --- a/inference-engine/include/ie_preprocess.hpp +++ b/inference-engine/include/ie_preprocess.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_primitive_info.hpp b/inference-engine/include/ie_primitive_info.hpp index d4e4fbc..31afb20 100644 --- a/inference-engine/include/ie_primitive_info.hpp +++ b/inference-engine/include/ie_primitive_info.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_tensor_info.hpp b/inference-engine/include/ie_tensor_info.hpp index 5f71dc9..ccbf3e8 100644 --- a/inference-engine/include/ie_tensor_info.hpp +++ b/inference-engine/include/ie_tensor_info.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_unicode.hpp b/inference-engine/include/ie_unicode.hpp index f8231fa..41e2603 100644 --- a/inference-engine/include/ie_unicode.hpp +++ b/inference-engine/include/ie_unicode.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_utils.hpp b/inference-engine/include/ie_utils.hpp index 2ba9f02..545af57 100644 --- a/inference-engine/include/ie_utils.hpp +++ b/inference-engine/include/ie_utils.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/ie_version.hpp b/inference-engine/include/ie_version.hpp index d743115..9228939 100644 --- a/inference-engine/include/ie_version.hpp +++ b/inference-engine/include/ie_version.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/inference_engine.hpp b/inference-engine/include/inference_engine.hpp index 352d943..2df7fda 100644 --- a/inference-engine/include/inference_engine.hpp +++ b/inference-engine/include/inference_engine.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/include/vpu/vpu_plugin_config.hpp b/inference-engine/include/vpu/vpu_plugin_config.hpp new file mode 100644 index 0000000..c6cd1e9 --- /dev/null +++ b/inference-engine/include/vpu/vpu_plugin_config.hpp @@ -0,0 +1,213 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/** + * @brief A header that defines advanced related properties for VPU plugins. 
diff --git a/inference-engine/include/vpu/vpu_plugin_config.hpp b/inference-engine/include/vpu/vpu_plugin_config.hpp
new file mode 100644
index 0000000..c6cd1e9
--- /dev/null
+++ b/inference-engine/include/vpu/vpu_plugin_config.hpp
@@ -0,0 +1,213 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief A header that defines advanced properties for VPU plugins.
+ * These properties should be used in the SetConfig() and LoadNetwork() methods of plugins.
+ *
+ * @file vpu_plugin_config.hpp
+ */
+
+#pragma once
+
+#include <string>
+#include "ie_plugin_config.hpp"
+
+#define VPU_CONFIG_KEY(name) InferenceEngine::VPUConfigParams::_CONFIG_KEY(VPU_##name)
+#define VPU_CONFIG_VALUE(name) InferenceEngine::VPUConfigParams::VPU_##name
+
+#define DECLARE_VPU_CONFIG_KEY(name) DECLARE_CONFIG_KEY(VPU_##name)
+#define DECLARE_VPU_CONFIG_VALUE(name) DECLARE_CONFIG_VALUE(VPU_##name)
+
+#define VPU_HDDL_CONFIG_KEY(name) InferenceEngine::VPUConfigParams::_CONFIG_KEY(VPU_HDDL_##name)
+#define VPU_HDDL_CONFIG_VALUE(name) InferenceEngine::VPUConfigParams::VPU_HDDL_##name
+
+#define DECLARE_VPU_HDDL_CONFIG_KEY(name) DECLARE_CONFIG_KEY(VPU_HDDL_##name)
+#define DECLARE_VPU_HDDL_CONFIG_VALUE(name) DECLARE_CONFIG_VALUE(VPU_HDDL_##name)
+
+namespace InferenceEngine {
+namespace VPUConfigParams {
+
+/**
+ * @brief Turn on HW stages usage (applicable for MyriadX devices only).
+ * This option should be used with values: CONFIG_VALUE(YES) or CONFIG_VALUE(NO) (default)
+ */
+DECLARE_VPU_CONFIG_KEY(HW_STAGES_OPTIMIZATION);
+
+/**
+ * @brief The key to specify the desired log level for devices.
+ * This option should be used with values: CONFIG_VALUE(LOG_NONE) (default),
+ * CONFIG_VALUE(LOG_WARNING), CONFIG_VALUE(LOG_INFO), CONFIG_VALUE(LOG_DEBUG)
+ */
+DECLARE_VPU_CONFIG_KEY(LOG_LEVEL);
+
+/**
+ * @deprecated
+ * @brief The key to define the normalization coefficient for the network input.
+ * This option should be used with a real number. Example: "255.f"
+ */
+DECLARE_VPU_CONFIG_KEY(INPUT_NORM);
+
+/**
+ * @deprecated
+ * @brief The key to specify the bias value that is added to each element of the network input.
+ * This option should be used with a real number. Example: "0.1f"
+ */
+DECLARE_VPU_CONFIG_KEY(INPUT_BIAS);
+
+/**
+ * @brief The flag to add the time of obtaining a tensor to the profiling information.
+ * This option should be used with values: CONFIG_VALUE(YES) or CONFIG_VALUE(NO) (default)
+ */
+DECLARE_VPU_CONFIG_KEY(PRINT_RECEIVE_TENSOR_TIME);
+
+/**
+ * @brief The flag to reset stalled devices: CONFIG_VALUE(YES) or CONFIG_VALUE(NO) (default)
+ * This is a plugin-scope option and must be used with the plugin's SetConfig method.
+ */
+DECLARE_VPU_CONFIG_KEY(FORCE_RESET);
+
+/**
+ * @brief [Only for HDDLPlugin]
+ * Type: Arbitrary non-empty string. If empty (""), the option is treated as unset. Default: "".
+ * This option allows specifying the number of MYX devices used for inference of a specific executable network.
+ * Note: Only one network would be allocated to one device.
+ * The number of devices for the tag is specified in the hddl_service.config file.
+ * Example:
+ * "service_settings":
+ * {
+ *     "graph_tag_map":
+ *     {
+ *         "tagA":3
+ *     }
+ * }
+ * It means that an executable network marked with tagA will be executed on 3 devices.
+ */
+DECLARE_VPU_HDDL_CONFIG_KEY(GRAPH_TAG);
+
+/**
+ * @brief [Only for HDDLPlugin]
+ * Type: Arbitrary non-empty string. If empty (""), the option is treated as unset. Default: "".
+ * This config makes the executable network be allocated on one certain device (instead of multiple devices),
+ * and all inference through this executable network will be done on that device.
+ * Note: Only one network would be allocated to one device.
+ * The number of devices which will be used for stream-affinity must be specified in the hddl_service.config file.
+ * Example:
+ * "service_settings":
+ * {
+ *     "stream_device_number":5
+ * }
+ * It means that 5 devices will be used for stream-affinity.
+ */
+DECLARE_VPU_HDDL_CONFIG_KEY(STREAM_ID);
+
+/**
+ * @brief [Only for HDDLPlugin]
+ * Type: Arbitrary non-empty string. If empty (""), the option is treated as unset. Default: "".
+ * This config allows the user to control devices flexibly. It assigns a "tag" to a certain device while
+ * allocating a network to it. Afterward, the user can allocate/deallocate networks to this device via this "tag".
+ * Devices used for this use case are controlled by the so-called "Bypass Scheduler" in the HDDL backend, and the number
+ * of such devices needs to be specified in the hddl_service.config file.
+ * Example:
+ * "service_settings":
+ * {
+ *     "bypass_device_number": 5
+ * }
+ * It means that 5 devices will be used for the Bypass Scheduler.
+ */
+DECLARE_VPU_HDDL_CONFIG_KEY(DEVICE_TAG);
+
+/**
+ * @brief [Only for HDDLPlugin]
+ * Type: "YES/NO", default is "NO".
+ * This config is a sub-config of DEVICE_TAG, and is only available when "DEVICE_TAG" is set. After a user loads a
+ * network, the user gets a handle to it.
+ * If "YES", the allocated network is bound to the device (with the specified "DEVICE_TAG"), which means all subsequent
+ * inference through this network handle will be executed on this device only.
+ * If "NO", the allocated network is not bound to the device (with the specified "DEVICE_TAG"). If the same network
+ * is also allocated on multiple other devices (with BIND_DEVICE also set to "NO"), then inference through any handle of these
+ * networks may be executed on any of the devices that have the network loaded.
+ */
+DECLARE_VPU_HDDL_CONFIG_KEY(BIND_DEVICE);
+
+/**
+ * @brief [Only for HDDLPlugin]
+ * Type: A signed int wrapped in a string, default is "0".
+ * This config is a sub-config of DEVICE_TAG, and is only available when "DEVICE_TAG" is set and "BIND_DEVICE" is "NO".
+ * When there are multiple devices running a certain network (the same network running on multiple devices under the Bypass Scheduler),
+ * the device with a larger number has a higher priority, and more inference tasks will be fed to it first.
+ */
+DECLARE_VPU_HDDL_CONFIG_KEY(RUNTIME_PRIORITY);
+
+/**
+ * @brief [Only for HDDLPlugin]
+ * Type: "YES/NO", default is "NO". Note: ONLY available when "DEVICE_TAG" is set.
+ * This config should be used only when the network has already been loaded with the same network content, the same
+ * "DEVICE_TAG" as used this time, and "BIND_DEVICE" of the loaded network set to "NO".
+ * This config is only used to update the "RUNTIME_PRIORITY" of the previously loaded network, and the application should keep using
+ * the previously allocated network handle to do inference.
+ * - If "YES": the "RUNTIME_PRIORITY" must be specified with an integer, and it will be set as the new runtime priority for that network on that device.
+ * - If "NO": this network is loaded to the device as a new one.
+ * Note: If "BIND_DEVICE" of the previously loaded network was "YES", the behavior of "update runtime priority" is undefined.
+ */
+DECLARE_VPU_HDDL_CONFIG_KEY(UPDATE_RUNTIME_PRIORITY);
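+
+/**
+ * Usage sketch (illustrative only): all of the keys above travel as plain string
+ * pairs through the plugin API. The device name, the chosen keys, and the
+ * CNNNetwork variable `network` below are assumptions made for the example.
+ *
+ *     InferenceEngine::InferencePlugin plugin =
+ *         InferenceEngine::PluginDispatcher({""}).getPluginByDevice("MYRIAD");
+ *     // Plugin-scope option: applied with SetConfig()
+ *     plugin.SetConfig({{VPU_CONFIG_KEY(FORCE_RESET), CONFIG_VALUE(NO)}});
+ *     // Network-scope options: passed at LoadNetwork() time
+ *     auto executableNetwork = plugin.LoadNetwork(network,
+ *         {{VPU_CONFIG_KEY(HW_STAGES_OPTIMIZATION), CONFIG_VALUE(YES)},
+ *          {VPU_CONFIG_KEY(LOG_LEVEL), CONFIG_VALUE(LOG_INFO)}});
+ */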
+
+/**
+ * @brief This option allows passing extra configuration for the executable network.
+ * By default, it is an empty string, which means no configuration.
+ * String format:
+ * <parameter>=<value>,<parameter>=<value>,...
+ * Supported parameters and options:
+ *   * file : path to XML file with configuration
+ *   * data : options related to data objects (input, output, intermediate); the next parameter describes the option
+ *   * scale : SCALE factor for data range (applicable for input and intermediate data)
+ */
+DECLARE_VPU_CONFIG_KEY(NETWORK_CONFIG);
+
+/**
+ * @brief This option allows specifying input/output layouts for network layers.
+ * By default, this value is set to VPU_CONFIG_VALUE(AUTO).
+ * Supported values:
+ *   VPU_CONFIG_VALUE(AUTO) executable network configured to use optimal layer layout depending on available HW
+ *   VPU_CONFIG_VALUE(NCHW) executable network forced to use NCHW input/output layouts
+ *   VPU_CONFIG_VALUE(NHWC) executable network forced to use NHWC input/output layouts
+ */
+DECLARE_VPU_CONFIG_KEY(COMPUTE_LAYOUT);
+
+/**
+ * @brief This option allows passing an XML file with custom layer bindings.
+ * If a layer is present in such an XML, it will be used during inference even if the layer is natively supported.
+ */
+DECLARE_VPU_CONFIG_KEY(CUSTOM_LAYERS);
+
+/**
+ * @brief Supported values for the VPU_CONFIG_KEY(COMPUTE_LAYOUT) option.
+ */
+DECLARE_VPU_CONFIG_VALUE(AUTO);
+DECLARE_VPU_CONFIG_VALUE(NCHW);
+DECLARE_VPU_CONFIG_VALUE(NHWC);
+
+/**
+ * @brief This option allows specifying the target platform.
+ * If the specified platform is not available, creating an infer request will throw an exception.
+ */
+DECLARE_VPU_CONFIG_KEY(PLATFORM);
+
+/**
+ * @brief Supported values for the VPU_CONFIG_KEY(PLATFORM) option.
+ */
+DECLARE_VPU_CONFIG_VALUE(2450);
+DECLARE_VPU_CONFIG_VALUE(2480);
+
+/**
+ * @brief Makes the plugin ignore statistics present in the IR.
+ * The plugin can use statistics present in the IR to try to improve calculation precision.
+ * Enable this option if you do not want the statistics to be used.
+ * This option should be used with values: CONFIG_VALUE(YES) or CONFIG_VALUE(NO) (default)
+ */
+DECLARE_VPU_CONFIG_KEY(IGNORE_IR_STATISTIC);
+
+}  // namespace VPUConfigParams
+}  // namespace InferenceEngine
diff --git a/inference-engine/install_dependencies.sh b/inference-engine/install_dependencies.sh
index fdb70e2..12dfaca 100755
--- a/inference-engine/install_dependencies.sh
+++ b/inference-engine/install_dependencies.sh
@@ -22,6 +22,7 @@ function yes_or_no {
 # install dependencies
 if [[ -f /etc/lsb-release ]]; then
     # Ubuntu
+    system_ver=`cat /etc/lsb-release | grep -i "DISTRIB_RELEASE" | cut -d "=" -f2`
     sudo -E apt update
     sudo -E apt-get install -y \
             build-essential \
@@ -40,7 +41,6 @@
             automake \
             libtool \
             autoconf \
-            libpng12-dev \
             libcairo2-dev \
             libpango1.0-dev \
             libglib2.0-dev \
@@ -52,6 +52,11 @@
             gstreamer1.0-plugins-base \
             libusb-1.0-0-dev \
             libopenblas-dev
+    if [ "$system_ver" = "18.04" ]; then
+        sudo -E apt-get install -y libpng-dev
+    else
+        sudo -E apt-get install -y libpng12-dev
+    fi
 else
     # CentOS 7.x
     sudo -E yum install -y centos-release-scl epel-release
diff --git a/inference-engine/samples/CMakeLists.txt b/inference-engine/samples/CMakeLists.txt
index 1f7bb9f..da00b43 100644
--- a/inference-engine/samples/CMakeLists.txt
+++ b/inference-engine/samples/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (C) 2018 Intel Corporation
+# Copyright (C) 2018-2019 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -50,16 +50,6 @@ else ()
     set (LIBRARY_OUTPUT_PATH ${LIBRARY_OUTPUT_DIRECTORY}/lib)
 endif()
 
-# use this flag if you need to throw custom message in case if the IE package is not found.
-if (IE_NOT_FOUND_MESSAGE) - find_package(InferenceEngine 1.5 QUIET) - if (NOT(InferenceEngine_FOUND)) - message(FATAL_ERROR ${IE_NOT_FOUND_MESSAGE}) - endif() -else() - find_package(InferenceEngine 1.5 REQUIRED) -endif() - if (WIN32) if (NOT "${CMAKE_SIZEOF_VOID_P}" EQUAL "8") message(FATAL_ERROR "Only 64-bit supported on Windows") @@ -69,13 +59,16 @@ if (WIN32) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_SCL_SECURE_NO_WARNINGS -DNOMINMAX") set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc") #no asynchronous structured exception handling set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /LARGEADDRESSAWARE") + if (${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC) + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4251 /wd4275 /wd4267") #disable some warnings + endif() else() - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Werror=return-type ") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") #treating warnings as errors if (APPLE) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-command-line-argument") elseif(UNIX) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wuninitialized -Winit-self") - if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + if(NOT ${CMAKE_CXX_COMPILER_ID} STREQUAL Clang) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wmaybe-uninitialized") endif() endif() @@ -86,54 +79,70 @@ endif() ## to use C++11 set (CMAKE_CXX_STANDARD 11) set (CMAKE_CXX_STANDARD_REQUIRED ON) -set (CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}") +if (${CMAKE_CXX_COMPILER_ID} STREQUAL GNU) + set (CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}") +endif() #################################### set (GFLAGS_IS_SUBPROJECT TRUE) set (HAVE_SYS_STAT_H 1) set (HAVE_INTTYPES_H 1) -if (WIN32) - # add_compile_options("/WX") -else() - add_compile_options("-Werror") +add_subdirectory(thirdparty/gflags) + +if (${CMAKE_CXX_COMPILER_ID} STREQUAL GNU) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") endif() -# Properties->C/C++->General->Additional Include Directories include_directories ( - ${CMAKE_CURRENT_SOURCE_DIR}/common/format_reader - ${InferenceEngine_INCLUDE_DIRS} - ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}/gflags/include - ${CMAKE_CURRENT_SOURCE_DIR}/common + ${CMAKE_CURRENT_SOURCE_DIR}/common + ${CMAKE_CURRENT_SOURCE_DIR}/common/format_reader ) +add_subdirectory(common/format_reader) -if (UNIX) - set (LIB_DL dl) +# samples build can be switched off during whole IE build +if (IE_MAIN_SOURCE_DIR AND NOT ENABLE_SAMPLES) + return() endif() -add_subdirectory(thirdparty/gflags) -add_subdirectory(common/format_reader) - -# collect all samples subdirectories -file(GLOB subdirs RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *) -# skip building of unnecessary subdirs -list(REMOVE_ITEM subdirs archived common thirdparty) - -foreach (dir ${subdirs}) - if (IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${dir}) - # check if a subdirectory contains CMakeLists.txt. In this case we can build it. - file(GLOB is_sample_dir "${CMAKE_CURRENT_SOURCE_DIR}/${dir}/CMakeLists.txt") - if(is_sample_dir) - # check if specified sample/demo is found. - if (BUILD_SAMPLE_NAME) - list(FIND BUILD_SAMPLE_NAME ${dir} index) - endif() - if (index EQUAL -1) - message(STATUS "${dir} SKIPPED") - else() - # Include subdirectory to the project. - add_subdirectory(${dir}) +function(add_samples_to_build) + # check each passed sample subdirectory + foreach (dir ${ARGN}) + if (IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${dir}) + # check if a subdirectory contains CMakeLists.txt. In this case we can build it. 
+ file(GLOB is_sample_dir "${CMAKE_CURRENT_SOURCE_DIR}/${dir}/CMakeLists.txt") + if(is_sample_dir) + # check if specified sample/demo is found. + if (BUILD_SAMPLE_NAME) + list(FIND BUILD_SAMPLE_NAME ${dir} index) + endif() + if (index EQUAL -1) + message(STATUS "${dir} SKIPPED") + else() + # Include subdirectory to the project. + add_subdirectory(${dir}) + endif() endif() endif() + endforeach() +endfunction(add_samples_to_build) + +# use this flag if you need to throw custom message in case if the IE package is not found. +if (IE_NOT_FOUND_MESSAGE) + find_package(InferenceEngine 1.6 QUIET) + if (NOT(InferenceEngine_FOUND)) + message(FATAL_ERROR ${IE_NOT_FOUND_MESSAGE}) endif() -endforeach() +else() + find_package(InferenceEngine 1.6 REQUIRED) +endif() + +if (UNIX) + set (LIB_DL dl) +endif() + +# collect all samples subdirectories +file(GLOB samples_dirs RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *) +# skip building of unnecessary subdirectories +list(REMOVE_ITEM samples_dirs archived common thirdparty) +add_samples_to_build(${samples_dirs}) diff --git a/inference-engine/samples/benchmark_app/CMakeLists.txt b/inference-engine/samples/benchmark_app/CMakeLists.txt index 87db730..c142ea6 100644 --- a/inference-engine/samples/benchmark_app/CMakeLists.txt +++ b/inference-engine/samples/benchmark_app/CMakeLists.txt @@ -1,9 +1,7 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required(VERSION 2.8) - set (TARGET_NAME "benchmark_app") file (GLOB SRC diff --git a/inference-engine/samples/benchmark_app/README.md b/inference-engine/samples/benchmark_app/README.md index ab0bbd7..23c17e4 100644 --- a/inference-engine/samples/benchmark_app/README.md +++ b/inference-engine/samples/benchmark_app/README.md @@ -1,34 +1,51 @@ -# Benchmark Application Demo +# Benchmark Application C++ Demo -This topic demonstrates how to use the Benchmark Application to estimate deep learning inference performance on supported devices. Performance can be measured for two inference modes: synchronous and asynchronous. +This topic demonstrates how to use the Benchmark Application to estimate deep learning inference performance on +supported devices. Performance can be measured for two inference modes: synchronous and asynchronous. -> **NOTE:** This topic describes usage of C++ implementation of the Benchmark Application. For the Python* implementation, refer to [Benchmark Application (Python*)](./samples/python_samples/benchmark_app/README.md) +> **NOTE:** This topic describes usage of C++ implementation of the Benchmark Application. For the Python* implementation, refer to [Benchmark Application (Python*)](./inference-engine/ie_bridges/python/sample/benchmark_app/README.md). ## How It Works -> **NOTE:** To achieve benchmark results similar to the official published results, set CPU frequency to 2.9GHz and GPU frequency to 1GHz. +> **NOTE:** To achieve benchmark results similar to the official published results, set CPU frequency to 2.9 GHz and GPU frequency to 1 GHz. -Upon the start-up, the application reads command-line parameters and loads a network and images to the Inference Engine plugin. The number of infer requests and execution approach depend on a mode defined with the `-api` command-line parameter. +Upon start-up, the application reads command-line parameters and loads a network and images to the Inference Engine +plugin, which is chosen depending on a specified device. 
The number of infer requests and execution approach depend
+on the mode defined with the `-api` command-line parameter.
+
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with the `--reverse_input_channels` argument specified. For more information about the argument, refer to the **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
 
-### Synchronous API
-For synchronous mode, the primary metric is latency. The application creates one infer request and executes the `Infer` method. A number of executions is defined by one of the two values:
-* Number of iterations defined with the `-niter` command-line argument
-* Predefined duration if `-niter` is skipped. Predefined duration value depends on device.
+If you run the application in the synchronous mode, it creates one infer request and executes the `Infer` method.
+If you run the application in the asynchronous mode, it creates as many infer requests as specified in the `-nireq`
+command-line parameter and executes the `StartAsync` method for each of them.
 
-During the execution, the application collects two types of metrics:
-* Latency for each infer request executed with `Infer` method
-* Duration of all executions
+The `Wait` method is used to wait for a previous execution of an infer request to complete. The number of execution steps
+is defined by one of the two values:
+* Number of iterations specified with the `-niter` command-line argument
+* Predefined duration if `-niter` is not specified. The predefined duration value depends on the device.
 
-Reported latency value is calculated as mean value of all collected latencies. Reported throughput value is a derivative from reported latency and additionally depends on batch size.
+During the execution, the application collects latency for each executed infer request.
 
-### Asynchronous API
-For asynchronous mode, the primary metric is throughput in frames per second (FPS). The application creates a certain number of infer requests and executes the `StartAsync` method. A number of infer is specified with the `-nireq` command-line parameter. A number of executions is defined by one of the two values:
-* Number of iterations defined with the `-niter` command-line argument
-* Predefined duration if `-niter` is skipped. Predefined duration value depends on device.
+The reported latency value is calculated as the median of all collected latencies. Throughput is reported
+in frames per second (FPS) and calculated as a derivative of:
+* The reported latency in the Sync mode
+* The total execution time in the Async mode
+
+The throughput value also depends on the batch size.
+
+The application also collects per-layer Performance Measurement (PM) counters for each executed infer request if you
+enable statistics dumping by setting the `-report_type` parameter to one of the possible values:
+* `no_counters` report includes the specified configuration options, the resulting FPS, and latency.
+* `median_counters` report extends the `no_counters` report and additionally includes median PM counters values for each layer from the network.
+* `detailed_counters` report extends the `median_counters` report and additionally includes per-layer PM counters and latency for each executed infer request.
+
+Depending on the type, the report is stored to the `benchmark_no_counters_report.csv`, `benchmark_median_counters_report.csv`,
+or `benchmark_detailed_counters_report.csv` file located in the path specified in `-report_folder`.
+
+The application also saves executable graph information serialized to an XML file if you specify a path to it with the
+`-exec_graph_path` parameter.
 
-The infer requests are executed asynchronously. `Wait` method is used to wait for previous execution to complete. The application measures all infer requests executions and reports the throughput metric based on batch size and total execution duration.
 
 ## Running
 
@@ -43,30 +60,39 @@ InferenceEngine:
 
 benchmark_app [OPTION]
 Options:
 
-    -h                      Print a usage message
-    -i ""                   Required. Path to a folder with images or to image files.
-    -m ""                   Required. Path to an .xml file with a trained model.
-    -pp ""                  Path to a plugin folder.
-    -api ""                 Required. Enable using sync/async API.
-    -d ""                   Specify a target device to infer on: CPU, GPU, FPGA or MYRIAD. Use "-d HETERO:" format to specify HETERO plugin. The application looks for a suitable plugin for the specified device.
-    -niter ""               Optional. Number of iterations. If not specified, the number of iterations is calculated depending on a device.
-    -nireq ""               Optional. Number of infer requests (default value is 2).
-    -l ""                   Required for CPU custom layers. Absolute path to a shared library with the kernels implementations.
+    -h                      Print a usage message
+    -i ""                   Required. Path to a folder with images or to image files.
+    -m ""                   Required. Path to an .xml file with a trained model.
+    -pp ""                  Optional. Path to a plugin folder.
+    -d ""                   Optional. Specify a target device to infer on: CPU, GPU, FPGA, HDDL or MYRIAD. Default value is CPU. Use "-d HETERO:" format to specify HETERO plugin. The application looks for a suitable plugin for the specified device.
+    -l ""                   Required for CPU custom layers. Absolute path to a shared library with the kernels implementations.
          Or
-    -c ""                   Required for GPU custom kernels. Absolute path to an .xml file with the kernels description.
-    -b ""                   Optional. Batch size value. If not specified, the batch size value is determined from IR.
+    -c ""                   Required for GPU custom kernels. Absolute path to an .xml file with the kernels description.
+    -api ""                 Optional. Enable Sync/Async API. Default value is "async".
+    -niter ""               Optional. Number of iterations. If not specified, the number of iterations is calculated depending on a device.
+    -nireq ""               Optional. Number of infer requests. Default value is 2.
+    -b ""                   Optional. Batch size value. If not specified, the batch size value is determined from Intermediate Representation.
+    -stream_output          Optional. Print progress as a plain text. When specified, an interactive progress bar is replaced with a multiline output.
+
+    CPU-specific performance options:
+    -nthreads ""            Optional. Number of threads to use for inference on the CPU (including HETERO cases).
+    -pin "YES"/"NO"         Optional. Enable ("YES" is default value) or disable ("NO") CPU threads pinning for CPU-involved inference.
+
+    Statistics dumping options:
+    -report_type ""         Optional. Enable collecting statistics report. "no_counters" report contains configuration options specified, resulting FPS and latency. "median_counters" report extends "no_counters" report and additionally includes median PM counters values for each layer from the network. "detailed_counters" report extends "median_counters" report and additionally includes per-layer PM counters and latency for each executed infer request.
+    -report_folder          Optional. Path to a folder where statistics report is stored.
+    -exec_graph_path        Optional. Path to a file where to store executable graph information serialized.
 ```
 
 Running the application with the empty list of options yields the usage message given above and an error message.
 
-You can run the application for one input layer four-dimensional models that support images as input, for example, public
-AlexNet and GoogLeNet models that can be downloaded
-with the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader).
+You can run the application for models with one four-dimensional input layer that support images as input, for example, public
+AlexNet and GoogLeNet models. To download the pre-trained models, use the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) or go to [https://download.01.org/opencv/](https://download.01.org/opencv/).
 
-> **NOTE**: To run the application, the model should be first converted to the Inference Engine format (\*.xml + \*.bin)
-using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
+> **NOTE**: Before running the demo with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
 
-For example, to perform inference on CPU in the synchronous mode and get estimated performance metrics for AlexNet model, run the following command:
+For example, to perform inference on CPU in the synchronous mode and get estimated performance metrics for AlexNet model,
+run the following command:
 
 ```sh
 ./benchmark_app -i /inputImage.bmp -m /alexnet_fp32.xml -d CPU -api sync
@@ -80,21 +106,25 @@ For the asynchronous mode:
 
 ## Demo Output
 
-Application output depends on a used API. For synchronous API, the application outputs latency and throughput:
-```
-[ INFO ] Start inference synchronously (60000 ms duration)
+The application outputs latency and throughput. Additionally, if you set the `-report_type` parameter, the application
+outputs a statistics report. If you set `-exec_graph_path`, the application reports executable graph information serialized.
+The progress bar shows the progress of each execution step:
 
-[ INFO ] Latency: 37.91 ms
-[ INFO ] Throughput: 52.7566 FPS
 ```
+[Step 7/8] Start inference asynchronously (100 async inference executions, 4 inference requests in parallel)
+Progress: [....................] 100.00% done
 
-For asynchronous API, the application outputs only throughput:
-```
-[ INFO ] Start inference asynchronously (60000 ms duration, 2 inference requests in parallel)
+[Step 8/8] Dump statistics report
+[ INFO ] statistics report is stored to benchmark_detailed_counters_report.csv
+Progress: [....................] 100.00% done
 
-[ INFO ] Throughput: 48.2031 FPS
+Latency: 73.33 ms
+Throughput: 53.28 FPS
 ```
 
+All measurements including per-layer PM counters are reported in milliseconds.
+ + ## See Also * [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md) * [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md) diff --git a/inference-engine/samples/benchmark_app/benchmark_app.h b/inference-engine/samples/benchmark_app/benchmark_app.h deleted file mode 100644 index 6ae2ffa..0000000 --- a/inference-engine/samples/benchmark_app/benchmark_app.h +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include -#include -#include -#include - -#ifdef _WIN32 -#include -#else -#include -#include -#endif - -/// @brief message for help argument -static const char help_message[] = "Print a usage message"; - -/// @brief message for images argument -static const char image_message[] = "Required. Path to a folder with images or to image files."; - -/// @brief message for images argument -static const char multi_input_message[] = "Path to multi input file containing."; - -/// @brief message for model argument -static const char model_message[] = "Required. Path to an .xml file with a trained model."; - -/// @brief message for plugin_path argument -static const char plugin_path_message[] = "Path to a plugin folder."; - -/// @brief message for plugin argument -static const char api_message[] = "Required. Enable using sync/async API."; - -/// @brief message for assigning cnn calculation to device -static const char target_device_message[] = "Specify a target device to infer on: CPU, GPU, FPGA or MYRIAD. " \ -"Use \"-d HETERO:\" format to specify HETERO plugin. " \ -"The application looks for a suitable plugin for the specified device."; - -/// @brief message for iterations count -static const char iterations_count_message[] = "Optional. Number of iterations. " \ -"If not specified, the number of iterations is calculated depending on a device."; - -/// @brief message for requests count -static const char infer_requests_count_message[] = "Optional. Number of infer requests (default value is 2)."; - -/// @brief message for #threads for CPU inference -static const char infer_num_threads_message[] = "Optional. Number of threads to use for inference on the CPU " - "(including Hetero cases)."; - -/// @brief message for user library argument -static const char custom_cpu_library_message[] = "Required for CPU custom layers. Absolute path to a shared library with the kernels implementations."; - -/// @brief message for clDNN custom kernels desc -static const char custom_cldnn_message[] = "Required for GPU custom kernels. Absolute path to an .xml file with the kernels description."; - -static const char batch_size_message[] = "Optional. Batch size value. If not specified, the batch size value is determined from IR"; - -// @brief message for CPU threads pinning option -static const char infer_threads_pinning_message[] = "Optional. Enable (\"YES\" is default value) or disable (\"NO\")" \ - "CPU threads pinning for CPU-involved inference."; - -/// @brief Define flag for showing help message
-DEFINE_bool(h, false, help_message); - -/// @brief Define parameter for set image file
-/// i or mif is a required parameter -DEFINE_string(i, "", image_message); - -/// @brief Define parameter for set model file
-/// It is a required parameter -DEFINE_string(m, "", model_message); - -/// @brief Define parameter for set path to plugins
-DEFINE_string(pp, "", plugin_path_message); - -/// @brief Enable per-layer performance report -DEFINE_string(api, "async", api_message); - -/// @brief device the target device to infer on
-DEFINE_string(d, "", target_device_message); - -/// @brief Absolute path to CPU library with user layers
-/// It is a required parameter -DEFINE_string(l, "", custom_cpu_library_message); - -/// @brief Define parameter for clDNN custom kernels path
-/// Default is ./lib -DEFINE_string(c, "", custom_cldnn_message); - -/// @brief Iterations count (default 0) -/// Sync mode: iterations count -/// Async mode: StartAsync counts -DEFINE_int32(niter, 0, iterations_count_message); - -/// @brief Number of infer requests in parallel -DEFINE_int32(nireq, 2, infer_requests_count_message); - -/// @brief Number of threads to use for inference on the CPU (also affects Hetero cases) -DEFINE_int32(nthreads, 0, infer_num_threads_message); - -/// @brief Define parameter for batch size
-/// Default is 0 (that means don't specify) -DEFINE_int32(b, 0, batch_size_message); - -// @brief Enable plugin messages -DEFINE_string(pin, "YES", infer_threads_pinning_message); -/** -* @brief This function show a help message -*/ -static void showUsage() { - std::cout << std::endl; - std::cout << "universal_app [OPTION]" << std::endl; - std::cout << "Options:" << std::endl; - std::cout << std::endl; - std::cout << " -h " << help_message << std::endl; - std::cout << " -i \"\" " << image_message << std::endl; - std::cout << " -m \"\" " << model_message << std::endl; - std::cout << " -pp \"\" " << plugin_path_message << std::endl; - std::cout << " -api \"\" " << api_message << std::endl; - std::cout << " -d \"\" " << target_device_message << std::endl; - std::cout << " -niter \"\" " << iterations_count_message << std::endl; - std::cout << " -l \"\" " << custom_cpu_library_message << std::endl; - std::cout << " Or" << std::endl; - std::cout << " -c \"\" " << custom_cldnn_message << std::endl; - std::cout << " -nireq \"\" " << infer_requests_count_message << std::endl; - std::cout << " -b \"\" " << batch_size_message << std::endl; - std::cout << " Some CPU-specific performance options" << std::endl; - std::cout << " -nthreads \"\" " << infer_num_threads_message << std::endl; - std::cout << " -pin \"YES\"/\"NO\" " << infer_threads_pinning_message << std::endl; -} diff --git a/inference-engine/samples/benchmark_app/benchmark_app.hpp b/inference-engine/samples/benchmark_app/benchmark_app.hpp new file mode 100644 index 0000000..8320fb7 --- /dev/null +++ b/inference-engine/samples/benchmark_app/benchmark_app.hpp @@ -0,0 +1,169 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#else +#include +#include +#endif + +/// @brief message for help argument +static const char help_message[] = "Print a usage message"; + +/// @brief message for images argument +static const char image_message[] = "Required. Path to a folder with images or to image files."; + +/// @brief message for images argument +static const char multi_input_message[] = "Path to multi input file containing."; + +/// @brief message for model argument +static const char model_message[] = "Required. Path to an .xml file with a trained model."; + +/// @brief message for plugin_path argument +static const char plugin_path_message[] = "Optional. Path to a plugin folder."; + +/// @brief message for execution mode +static const char api_message[] = "Optional. Enable Sync/Async API. Default value is \"async\"."; + +/// @brief message for assigning cnn calculation to device +static const char target_device_message[] = "Optional. Specify a target device to infer on: CPU, GPU, FPGA, HDDL or MYRIAD. Default value is CPU. " \ +"Use \"-d HETERO:\" format to specify HETERO plugin. " \ +"The application looks for a suitable plugin for the specified device."; + +/// @brief message for iterations count +static const char iterations_count_message[] = "Optional. Number of iterations. " \ +"If not specified, the number of iterations is calculated depending on a device."; + +/// @brief message for requests count +static const char infer_requests_count_message[] = "Optional. Number of infer requests. Default value is 2."; + +/// @brief message for #threads for CPU inference +static const char infer_num_threads_message[] = "Optional. 
Number of threads to use for inference on the CPU " + "(including HETERO cases)."; + +/// @brief message for user library argument +static const char custom_cpu_library_message[] = "Required for CPU custom layers. Absolute path to a shared library with the kernels implementations."; + +/// @brief message for clDNN custom kernels desc +static const char custom_cldnn_message[] = "Required for GPU custom kernels. Absolute path to an .xml file with the kernels description."; + +static const char batch_size_message[] = "Optional. Batch size value. If not specified, the batch size value is determined from Intermediate Representation."; + +// @brief message for CPU threads pinning option +static const char infer_threads_pinning_message[] = "Optional. Enable (\"YES\" is default value) or disable (\"NO\") " \ + "CPU threads pinning for CPU-involved inference."; + +// @brief message for stream_output option +static const char stream_output_message[] = "Optional. Print progress as a plain text. When specified, an interactive progress bar is replaced with a " + "multiline output."; + +// @brief message for report_type option +static const char report_type_message[] = "Optional. Enable collecting statistics report. \"no_counters\" report contains " + "configuration options specified, resulting FPS and latency. \"median_counters\" " + "report extends \"no_counters\" report and additionally includes median PM " + "counters values for each layer from the network. \"detailed_counters\" report " + "extends \"median_counters\" report and additionally includes per-layer PM " + "counters and latency for each executed infer request."; + +// @brief message for report_folder option +static const char report_folder_message[] = "Optional. Path to a folder where statistics report is stored."; + +// @brief message for exec_graph_path option +static const char exec_graph_path_message[] = "Optional. Path to a file where to store executable graph information serialized."; + +/// @brief Define flag for showing help message
+DEFINE_bool(h, false, help_message);
+
+/// @brief Define parameter to set the image file
+/// -i is a required parameter
+DEFINE_string(i, "", image_message);
+
+/// @brief Define parameter to set the model file
+/// It is a required parameter
+DEFINE_string(m, "", model_message);
+
+/// @brief Define parameter to set the path to plugins
+DEFINE_string(pp, "", plugin_path_message);
+
+/// @brief Define execution mode
+DEFINE_string(api, "async", api_message);
+
+/// @brief Define the target device to infer on
+DEFINE_string(d, "CPU", target_device_message);
+
+/// @brief Absolute path to the CPU library with user layers
+/// Required for CPU custom layers only
+DEFINE_string(l, "", custom_cpu_library_message);
+
+/// @brief Define parameter for the clDNN custom kernels path
+/// Default is ./lib
+DEFINE_string(c, "", custom_cldnn_message);
+
+/// @brief Iterations count (default 0)
+/// Sync mode: iterations count
+/// Async mode: StartAsync counts
+DEFINE_uint32(niter, 0, iterations_count_message);
+
+/// @brief Number of infer requests in parallel
+DEFINE_uint32(nireq, 2, infer_requests_count_message);
+
+/// @brief Number of threads to use for inference on the CPU (also affects HETERO cases)
+DEFINE_uint32(nthreads, 0, infer_num_threads_message);
+
+/// @brief Define parameter for batch size
+/// Default is 0 (that means don't specify)
+DEFINE_uint32(b, 0, batch_size_message);
+
+/// @brief Define parameter for CPU threads pinning
+DEFINE_string(pin, "YES", infer_threads_pinning_message);
+
+/// @brief Enables multiline text output instead of progress bar
+DEFINE_bool(stream_output, false, stream_output_message);
+
+/// @brief Enables statistics report collecting
+DEFINE_string(report_type, "", report_type_message);
+
+/// @brief Path to a folder where statistics report is stored
+DEFINE_string(report_folder, "", report_folder_message);
+
+/// @brief Path to a file where to store serialized executable graph information
+DEFINE_string(exec_graph_path, "", exec_graph_path_message);
+
+/**
+* @brief This function shows a help message
+*/
+static void showUsage() {
+    std::cout << std::endl;
+    std::cout << "benchmark_app [OPTION]" << std::endl;
+    std::cout << "Options:" << std::endl;
+    std::cout << std::endl;
+    std::cout << "    -h                      " << help_message << std::endl;
+    std::cout << "    -i \"\"                 " << image_message << std::endl;
+    std::cout << "    -m \"\"                 " << model_message << std::endl;
+    std::cout << "    -pp \"\"                " << plugin_path_message << std::endl;
+    std::cout << "    -d \"\"                 " << target_device_message << std::endl;
+    std::cout << "    -l \"\"                 " << custom_cpu_library_message << std::endl;
+    std::cout << "          Or" << std::endl;
+    std::cout << "    -c \"\"                 " << custom_cldnn_message << std::endl;
+    std::cout << "    -api \"\"               " << api_message << std::endl;
+    std::cout << "    -niter \"\"             " << iterations_count_message << std::endl;
+    std::cout << "    -nireq \"\"             " << infer_requests_count_message << std::endl;
+    std::cout << "    -b \"\"                 " << batch_size_message << std::endl;
+    std::cout << "    -stream_output          " << stream_output_message << std::endl;
+    std::cout << std::endl << "  CPU-specific performance options:" << std::endl;
+    std::cout << "    -nthreads \"\"          " << infer_num_threads_message << std::endl;
+    std::cout << "    -pin \"YES\"/\"NO\"     " << infer_threads_pinning_message << std::endl;
+    std::cout << std::endl << "  Statistics dumping options:" << std::endl;
+    std::cout << "    -report_type \"\"       " << report_type_message << std::endl;
+    std::cout << "    -report_folder          " << report_folder_message << std::endl;
+    std::cout << "    -exec_graph_path        " << exec_graph_path_message << std::endl;
+}
diff --git a/inference-engine/samples/benchmark_app/infer_request_wrap.hpp b/inference-engine/samples/benchmark_app/infer_request_wrap.hpp
new file mode 100644
index 0000000..741ee19
--- /dev/null
+++ b/inference-engine/samples/benchmark_app/infer_request_wrap.hpp
@@ -0,0 +1,64 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <chrono>
+#include <map>
+#include <memory>
+#include <string>
+
+#include "inference_engine.hpp"
+
+typedef std::chrono::high_resolution_clock Time;
+typedef std::chrono::nanoseconds ns;
+
+/// @brief Wrapper class for InferenceEngine::InferRequest. Handles asynchronous callbacks and calculates execution time.
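+/// The completion callback registered in the constructor records the end timestamp, so for
+/// startAsync() the value returned by getExecTime() spans from the call until the callback
+/// fires, while infer() is timed around the blocking call itself.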
+class InferReqWrap { +public: + using Ptr = std::shared_ptr; + + explicit InferReqWrap(InferenceEngine::ExecutableNetwork& net) : _request(net.CreateInferRequest()) { + _request.SetCompletionCallback( + [&]() { + _endTime = Time::now(); + }); + } + + void startAsync() { + _startTime = Time::now(); + _request.StartAsync(); + } + + void infer() { + _startTime = Time::now(); + _request.Infer(); + _endTime = Time::now(); + } + + std::map getPerformanceCounts() { + return _request.GetPerformanceCounts(); + } + + void wait() { + InferenceEngine::StatusCode code = _request.Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY); + if (code != InferenceEngine::StatusCode::OK) { + throw std::logic_error("Wait"); + } + } + + InferenceEngine::Blob::Ptr getBlob(const std::string &name) { + return _request.GetBlob(name); + } + + double getExecTime() const { + auto execTime = std::chrono::duration_cast(_endTime - _startTime); + return static_cast(execTime.count()) * 0.000001; + } + +private: + InferenceEngine::InferRequest _request; + Time::time_point _startTime; + Time::time_point _endTime; +}; \ No newline at end of file diff --git a/inference-engine/samples/benchmark_app/main.cpp b/inference-engine/samples/benchmark_app/main.cpp index 134287b..3174582 100644 --- a/inference-engine/samples/benchmark_app/main.cpp +++ b/inference-engine/samples/benchmark_app/main.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -13,84 +13,104 @@ #include #include +#include #include #include #include -#include "benchmark_app.h" +#include "benchmark_app.hpp" +#include "infer_request_wrap.hpp" +#include "progress_bar.hpp" +#include "statistics_report.hpp" using namespace InferenceEngine; long long getDurationInNanoseconds(const std::string& device); -double getMedianValue(const std::vector& sortedTimes); - void fillBlobWithImage( Blob::Ptr& inputBlob, const std::vector& filePaths, - const size_t batchSize, + const size_t& batchSize, const InferenceEngine::InputInfo& info); -static const std::vector> deviceDurationsInSeconds{ - { "CPU", 60LL }, - { "GPU", 60LL }, - { "VPU", 60LL }, - { "MYRIAD", 60LL }, - { "FPGA", 120LL }, - { "UNKNOWN", 120LL } -}; +static const size_t progressBarDefaultTotalCount = 1000; -/** -* @brief The entry point the benchmark application -*/ -int main(int argc, char *argv[]) { - try { - slog::info << "InferenceEngine: " << InferenceEngine::GetInferenceEngineVersion() << slog::endl; +bool ParseAndCheckCommandLine(int argc, char *argv[]) { + // ---------------------------Parsing and validation of input args-------------------------------------- + slog::info << "Parsing input parameters" << slog::endl; + gflags::ParseCommandLineNonHelpFlags(&argc, &argv, true); + if (FLAGS_h) { + showUsage(); + return false; + } - slog::info << "Parsing input parameters" << slog::endl; - gflags::ParseCommandLineNonHelpFlags(&argc, &argv, true); - if (FLAGS_h) { - showUsage(); - return 0; - } + if (FLAGS_m.empty()) { + throw std::logic_error("Model required is not set. Please use -h."); + } - if (FLAGS_m.empty()) { - throw std::logic_error("Model required is not set. Please use -h."); - } + if (FLAGS_api.empty()) { + throw std::logic_error("API not selected. Please use -h."); + } - if (FLAGS_api.empty()) { - throw std::logic_error("API not selected. Please use -h."); - } + if (FLAGS_api != "async" && FLAGS_api != "sync") { + throw std::logic_error("Incorrect API. 
Please use -h."); + } - if (FLAGS_api != "async" && FLAGS_api != "sync") { - throw std::logic_error("Incorrect API. Please use -h."); - } + if (FLAGS_i.empty()) { + throw std::logic_error("Input is not set. Please use -h."); + } - if (FLAGS_i.empty()) { - throw std::logic_error("Input is not set. Please use -h."); - } + if (FLAGS_niter < 0) { + throw std::logic_error("Number of iterations should be positive (invalid -niter option value)"); + } - if (FLAGS_niter < 0) { - throw std::logic_error("Number of iterations should be positive (invalid -niter option value)"); - } + if (FLAGS_nireq < 0) { + throw std::logic_error("Number of inference requests should be positive (invalid -nireq option value)"); + } - if (FLAGS_nireq < 0) { - throw std::logic_error("Number of inference requests should be positive (invalid -nireq option value)"); - } + if (FLAGS_b < 0) { + throw std::logic_error("Batch size should be positive (invalid -b option value)"); + } + + if (!FLAGS_report_type.empty() && + FLAGS_report_type != noCntReport && FLAGS_report_type != medianCntReport && FLAGS_report_type != detailedCntReport) { + std::string err = "only " + std::string(noCntReport) + "/" + std::string(medianCntReport) + "/" + std::string(detailedCntReport) + + " report types are supported (invalid -report_type option value)"; + throw std::logic_error(err); + } + + return true; +} - if (FLAGS_b < 0) { - throw std::logic_error("Batch size should be positive (invalid -b option value)"); +/** +* @brief The entry point the benchmark application +*/ +int main(int argc, char *argv[]) { + try { + slog::info << "InferenceEngine: " << InferenceEngine::GetInferenceEngineVersion() << slog::endl; + + // ------------------------------ Parsing and validation of input args --------------------------------- + std::cout << std::endl << "[Step 1/8] Parsing and validation of input args" << std::endl; + ProgressBar progressBar(1, FLAGS_stream_output); + + if (!ParseAndCheckCommandLine(argc, argv)) { + return 0; } - std::vector inputs; - parseInputFilesArguments(inputs); - if (inputs.size() == 0ULL) { + /** This vector stores paths to the processed images **/ + std::vector inputImages; + parseInputFilesArguments(inputImages); + if (inputImages.size() == 0ULL) { throw std::logic_error("no images found"); } + progressBar.addProgress(1); + progressBar.finish(); // --------------------------- 1. Load Plugin for inference engine ------------------------------------- - slog::info << "Loading plugin" << slog::endl; + std::cout << "[Step 2/8] Loading plugin" << std::endl; + progressBar.newBar(1); + InferencePlugin plugin = PluginDispatcher({ FLAGS_pp }).getPluginByDevice(FLAGS_d); if (!FLAGS_l.empty()) { @@ -105,12 +125,21 @@ int main(int argc, char *argv[]) { } InferenceEngine::ResponseDesc resp; + if (FLAGS_d == "MYRIAD") { + plugin.SetConfig({ {CONFIG_KEY(LOG_LEVEL), CONFIG_VALUE(LOG_INFO)}, {VPU_CONFIG_KEY(LOG_LEVEL), CONFIG_VALUE(LOG_INFO)} }); + } const Version *pluginVersion = plugin.GetVersion(); - slog::info << pluginVersion << slog::endl << slog::endl; + slog::info << pluginVersion << slog::endl; + + progressBar.addProgress(1); + progressBar.finish(); // --------------------------- 2. 
Read IR Generated by ModelOptimizer (.xml and .bin files) ------------ + std::cout << "[Step 3/8] Read IR network" << std::endl; + progressBar.newBar(1); + slog::info << "Loading network files" << slog::endl; InferenceEngine::CNNNetReader netBuilder; @@ -125,10 +154,11 @@ int main(int argc, char *argv[]) { } if (inputInfo.size() != 1) { - throw std::logic_error("only one input layer network is supported"); + throw std::logic_error("only networks with one input are supported"); } // --------------------------- 3. Resize network to match image sizes and given batch---------------------- + if (FLAGS_b != 0) { // We support models having only one input layers ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes(); @@ -146,15 +176,21 @@ int main(int argc, char *argv[]) { slog::info << (FLAGS_b != 0 ? "Network batch size was changed to: " : "Network batch size: ") << batchSize << ", precision: " << precision << slog::endl; + progressBar.addProgress(1); + progressBar.finish(); + // --------------------------- 4. Configure input & output --------------------------------------------- + std::cout << "[Step 4/8] Configure input & output of the model" << std::endl; + progressBar.newBar(1); + const InferenceEngine::Precision inputPrecision = InferenceEngine::Precision::U8; for (auto& item : inputInfo) { /** Set the precision of input data provided by the user, should be called before load of the network to the plugin **/ item.second->setInputPrecision(inputPrecision); } - const size_t imagesCount = inputs.size(); + const size_t imagesCount = inputImages.size(); if (batchSize > imagesCount) { slog::warn << "Network batch size " << batchSize << " is greater than images count " << imagesCount << ", some input files will be duplicated" << slog::endl; @@ -182,9 +218,14 @@ int main(int argc, char *argv[]) { outputBlobs[item.first] = output; } + progressBar.addProgress(1); + progressBar.finish(); + // --------------------------- 5. Loading model to the plugin ------------------------------------------ - slog::info << "Loading model to the plugin" << slog::endl; + std::cout << "[Step 5/8] Loading model to the plugin " << std::endl; + progressBar.newBar(1); + std::map networkConfig; if (FLAGS_d.find("CPU") != std::string::npos) { // CPU supports few special performance-oriented keys // limit threading for CPU portion of inference @@ -196,111 +237,154 @@ int main(int argc, char *argv[]) { if (FLAGS_api == "async" && FLAGS_d == "CPU") networkConfig[PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS] = std::to_string(FLAGS_nireq); } + + if (FLAGS_report_type == detailedCntReport || FLAGS_report_type == medianCntReport) { + networkConfig[PluginConfigParams::KEY_PERF_COUNT] = PluginConfigParams::YES; + } + InferenceEngine::ExecutableNetwork exeNetwork = plugin.LoadNetwork(cnnNetwork, networkConfig); - // --------------------------- 6. Performance measurements stuff ------------------------------------------ + progressBar.addProgress(1); + progressBar.finish(); - typedef std::chrono::high_resolution_clock Time; - typedef std::chrono::nanoseconds ns; + // --------------------------- 6. Create infer requests and fill input blobs --------------------------- + + std::cout << "[Step 6/8] Create infer requests and fill input blobs with images" << std::endl; + progressBar.newBar(1); + + std::vector inferRequests; + auto numOfReq = (FLAGS_api == "async") ? 
FLAGS_nireq : 1; + inferRequests.reserve(numOfReq); + + for (size_t i = 0; i < numOfReq; i++) { + inferRequests.push_back(std::make_shared(exeNetwork)); + slog::info << "Infer Request " << i << " created" << slog::endl; + + for (const InputsDataMap::value_type& item : inputInfo) { + Blob::Ptr inputBlob = inferRequests[i]->getBlob(item.first); + fillBlobWithImage(inputBlob, inputImages, batchSize, *item.second); + } + } + + progressBar.addProgress(1); + progressBar.finish(); + + // --------------------------- 7. Performance measurements stuff ------------------------------------------ - std::vector times; long long durationInNanoseconds; if (FLAGS_niter != 0) { durationInNanoseconds = 0LL; - times.reserve(FLAGS_niter); } else { durationInNanoseconds = getDurationInNanoseconds(FLAGS_d); } - if (FLAGS_api == "sync") { - InferRequest inferRequest = exeNetwork.CreateInferRequest(); - slog::info << "Sync request created" << slog::endl; + std::map emptyStat = {}; + StatisticsReport::Config config = { + FLAGS_d, + FLAGS_api, + batchSize, + FLAGS_nireq, + FLAGS_niter, + FLAGS_nthreads, + FLAGS_pin, + FLAGS_report_type, + FLAGS_report_folder + }; + StatisticsReport statistics(config); + double fps; + double totalDuration; + + size_t progressCnt = 0; + size_t progressBarTotalCount; + size_t iteration = 0; - for (const InputsDataMap::value_type& item : inputInfo) { - Blob::Ptr inputBlob = inferRequest.GetBlob(item.first); - fillBlobWithImage(inputBlob, inputs, batchSize, *item.second); - } + if (FLAGS_api == "sync") { + InferReqWrap::Ptr inferRequest = inferRequests[0]; + std::cout << "[Step 7/8] "; if (FLAGS_niter != 0) { - slog::info << "Start inference synchronously (" << FLAGS_niter << " sync inference executions)" << slog::endl << slog::endl; + std::cout << "Start inference synchronously (" << FLAGS_niter << " sync inference executions)" << std::endl; + progressBarTotalCount = FLAGS_niter; } else { - slog::info << "Start inference synchronously (" << durationInNanoseconds * 0.000001 << " ms duration)" << slog::endl << slog::endl; + std::cout << "Start inference synchronously (" << durationInNanoseconds * 0.000001 << " ms duration)" << std::endl; + progressBarTotalCount = progressBarDefaultTotalCount; } // warming up - out of scope - inferRequest.Infer(); - inferRequest.Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY); + inferRequest->infer(); const auto startTime = Time::now(); - auto currentTime = Time::now(); + auto execTime = std::chrono::duration_cast(Time::now() - startTime).count(); - size_t iteration = 0ULL; - while ((iteration < FLAGS_niter) || ((FLAGS_niter == 0LL) && ((currentTime - startTime).count() < durationInNanoseconds))) { - const auto iterationStartTime = Time::now(); - inferRequest.Infer(); - currentTime = Time::now(); - - const auto iterationDurationNs = std::chrono::duration_cast(currentTime - iterationStartTime); - times.push_back(static_cast(iterationDurationNs.count()) * 0.000001); + /** Start inference & calculate performance **/ + progressBar.newBar(progressBarTotalCount); + while ((iteration < FLAGS_niter) || + ((FLAGS_niter == 0) && (execTime < durationInNanoseconds))) { + inferRequest->infer(); + statistics.add((FLAGS_report_type == detailedCntReport || FLAGS_report_type == medianCntReport) ? 
+ inferRequest->getPerformanceCounts() : emptyStat, + inferRequest->getExecTime()); iteration++; - } - - std::sort(times.begin(), times.end()); - const double latency = getMedianValue(times); - slog::info << "Latency: " << latency << " ms" << slog::endl; - - slog::info << "Throughput: " << batchSize * 1000.0 / latency << " FPS" << slog::endl; - } else if (FLAGS_api == "async") { - std::vector inferRequests; - inferRequests.reserve(FLAGS_nireq); - for (size_t i = 0; i < FLAGS_nireq; i++) { - InferRequest inferRequest = exeNetwork.CreateInferRequest(); - inferRequests.push_back(inferRequest); - - for (const InputsDataMap::value_type& item : inputInfo) { - Blob::Ptr inputBlob = inferRequest.GetBlob(item.first); - fillBlobWithImage(inputBlob, inputs, batchSize, *item.second); + if (FLAGS_niter > 0) { + progressBar.addProgress(1); + } else { + execTime = std::chrono::duration_cast(Time::now() - startTime).count(); + // calculate how many progress intervals are covered by current iteration. + // depends on the current iteration time and time of each progress interval. + // Previously covered progress intervals must be skipped. + auto progressIntervalTime = durationInNanoseconds / progressBarTotalCount; + size_t newProgress = execTime / progressIntervalTime - progressCnt; + progressBar.addProgress(newProgress); + progressCnt += newProgress; } } - + fps = batchSize * 1000.0 / statistics.getMedianLatency(); + totalDuration = std::chrono::duration_cast(Time::now() - startTime).count() * 0.000001; + progressBar.finish(); + } else { + std::cout << "[Step 7/8] "; if (FLAGS_niter != 0) { - slog::info << "Start inference asynchronously (" << FLAGS_niter << + std::cout << "Start inference asynchronously (" << FLAGS_niter << " async inference executions, " << FLAGS_nireq << - " inference requests in parallel)" << slog::endl << slog::endl; + " inference requests in parallel)" << std::endl; + progressBarTotalCount = FLAGS_niter + FLAGS_nireq - 1; } else { - slog::info << "Start inference asynchronously (" << durationInNanoseconds * 0.000001 << + std::cout << std::endl << "Start inference asynchronously (" << durationInNanoseconds * 0.000001 << " ms duration, " << FLAGS_nireq << - " inference requests in parallel)" << slog::endl << slog::endl; + " inference requests in parallel)" << std::endl; + progressBarTotalCount = 1000; } + size_t currentInference = 0ULL; bool requiredInferenceRequestsWereExecuted = false; long long previousInference = 1LL - FLAGS_nireq; // warming up - out of scope - inferRequests[0].StartAsync(); - inferRequests[0].Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY); - - const size_t stepsCount = FLAGS_niter + FLAGS_nireq - 1; + inferRequests[0]->startAsync(); + inferRequests[0]->wait(); - /** Start inference & calculate performance **/ const auto startTime = Time::now(); + auto execTime = std::chrono::duration_cast(Time::now() - startTime).count(); - size_t step = 0ULL; + /** Start inference & calculate performance **/ + /** to use FLAGS_niter + FLAGS_nireq - 1 to guarantee that last infer requests are executed in the same conditions **/ + progressBar.newBar(progressBarTotalCount); while ((!requiredInferenceRequestsWereExecuted) || - (step < stepsCount) || - ((FLAGS_niter == 0LL) && ((Time::now() - startTime).count() < durationInNanoseconds))) { + (iteration < FLAGS_niter + FLAGS_nireq - 1) || + ((FLAGS_niter == 0LL) && (execTime < durationInNanoseconds))) { // start new inference - inferRequests[currentInference].StartAsync(); + inferRequests[currentInference]->startAsync(); 
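+
+            // Note: the requests form a ring. While the request just started is running,
+            // the code below waits for the one started FLAGS_nireq - 1 iterations earlier,
+            // so up to FLAGS_nireq inferences stay in flight at any moment.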
// wait the latest inference execution if exists if (previousInference >= 0) { - const StatusCode code = inferRequests[previousInference].Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY); - if (code != StatusCode::OK) { - throw std::logic_error("Wait"); - } + inferRequests[previousInference]->wait(); + // update statistics with PM counters only in case of detailed or median reports + statistics.add((FLAGS_report_type == detailedCntReport || FLAGS_report_type == medianCntReport) ? + inferRequests[previousInference]->getPerformanceCounts() : emptyStat, + inferRequests[previousInference]->getExecTime()); } currentInference++; @@ -314,16 +398,30 @@ int main(int argc, char *argv[]) { previousInference = 0; } - step++; + iteration++; + + if (FLAGS_niter > 0) { + progressBar.addProgress(1); + } else { + execTime = std::chrono::duration_cast(Time::now() - startTime).count(); + // calculate how many progress intervals are covered by current iteration. + // depends on the current iteration time and time of each progress interval. + // Previously covered progress intervals must be skipped. + auto progressIntervalTime = durationInNanoseconds / progressBarTotalCount; + size_t newProgress = execTime / progressIntervalTime - progressCnt; + progressBar.addProgress(newProgress); + progressCnt += newProgress; + } } // wait the latest inference executions for (size_t notCompletedIndex = 0ULL; notCompletedIndex < (FLAGS_nireq - 1); ++notCompletedIndex) { if (previousInference >= 0) { - const StatusCode code = inferRequests[previousInference].Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY); - if (code != StatusCode::OK) { - throw std::logic_error("Wait"); - } + inferRequests[previousInference]->wait(); + // update statistics with PM counters only in case of detailed or median reports + statistics.add((FLAGS_report_type == detailedCntReport || FLAGS_report_type == medianCntReport) ? 
+ inferRequests[previousInference]->getPerformanceCounts() : emptyStat, + inferRequests[previousInference]->getExecTime()); } previousInference++; @@ -331,13 +429,25 @@ int main(int argc, char *argv[]) { previousInference = 0LL; } } + totalDuration = std::chrono::duration_cast(Time::now() - startTime).count() * 0.000001; + fps = batchSize * 1000.0 * iteration / totalDuration; + progressBar.finish(); + } - const double totalDuration = std::chrono::duration_cast(Time::now() - startTime).count() * 0.000001; - const double fps = batchSize * 1000.0 * step / totalDuration; - slog::info << "Throughput: " << fps << " FPS" << slog::endl; - } else { - throw std::logic_error("unknown api command line argument value"); + std::cout << "[Step 8/8] Dump statistics report" << std::endl; + progressBar.newBar(1); + statistics.dump(fps, iteration, totalDuration); + + if (!FLAGS_exec_graph_path.empty()) { + CNNNetwork execGraphInfo = exeNetwork.GetExecGraphInfo(); + execGraphInfo.serialize(FLAGS_exec_graph_path); + slog::info << "executable graph is stored to " << FLAGS_exec_graph_path << slog::endl; } + progressBar.addProgress(1); + progressBar.finish(); + + std::cout << "Latency: " << statistics.getMedianLatency() << " ms" << std::endl; + std::cout << "Throughput: " << fps << " FPS" << std::endl; } catch (const std::exception& ex) { slog::err << ex.what() << slog::endl; return 3; @@ -347,6 +457,16 @@ int main(int argc, char *argv[]) { } long long getDurationInNanoseconds(const std::string& device) { + static const std::vector> deviceDurationsInSeconds{ + { "CPU", 60LL }, + { "GPU", 60LL }, + { "VPU", 60LL }, + { "MYRIAD", 60LL }, + { "HDDL", 60LL }, + { "FPGA", 120LL }, + { "UNKNOWN", 120LL } + }; + auto duration = 0LL; for (const auto& deviceDurationInSeconds : deviceDurationsInSeconds) { if (device.find(deviceDurationInSeconds.first) != std::string::npos) { @@ -370,22 +490,16 @@ long long getDurationInNanoseconds(const std::string& device) { return duration * 1000000000LL; } -double getMedianValue(const std::vector& sortedTimes) { - return (sortedTimes.size() % 2 != 0) ? - sortedTimes[sortedTimes.size() / 2ULL] : - (sortedTimes[sortedTimes.size() / 2ULL] + sortedTimes[sortedTimes.size() / 2ULL - 1ULL]) / 2.0; -} - void fillBlobWithImage( Blob::Ptr& inputBlob, const std::vector& filePaths, - const size_t batchSize, + const size_t& batchSize, const InferenceEngine::InputInfo& info) { - uint8_t* inputBlobData = inputBlob->buffer().as(); + auto inputBlobData = inputBlob->buffer().as(); const SizeVector& inputBlobDims = inputBlob->dims(); - slog::info << "Input dimensions (" << info.getTensorDesc().getLayout() << "): "; + slog::info << "Network Input dimensions (" << info.getTensorDesc().getLayout() << "): "; for (const auto& i : info.getTensorDesc().getDims()) { slog::info << i << " "; } @@ -400,6 +514,7 @@ void fillBlobWithImage( inputIndex = 0ULL; } + slog::info << "Prepare image " << filePaths[inputIndex] << slog::endl; FormatReader::ReaderPtr reader(filePaths[inputIndex].c_str()); if (reader.get() == nullptr) { slog::warn << "Image " << filePaths[inputIndex] << " cannot be read!" 
<< slog::endl << slog::endl;
diff --git a/inference-engine/samples/benchmark_app/progress_bar.hpp b/inference-engine/samples/benchmark_app/progress_bar.hpp
new file mode 100644
index 0000000..bc7e485
--- /dev/null
+++ b/inference-engine/samples/benchmark_app/progress_bar.hpp
@@ -0,0 +1,41 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include
+
+#include
+
+/// @brief Responsible for progress bar handling within the benchmark_app
+class ProgressBar {
+public:
+    ProgressBar(size_t totalNum, bool stream_output) {
+        _bar.reset(new ConsoleProgress(totalNum, stream_output));
+        _isFinished = true;
+    }
+
+    void addProgress(size_t num) {
+        _isFinished = false;
+        _bar->addProgress(num);
+    }
+
+    void finish() {
+        _isFinished = true;
+        _bar->finish();
+        std::cout << std::endl;
+    }
+
+    void newBar(size_t totalNum) {
+        if (_isFinished) {
+            _bar.reset(new ConsoleProgress(totalNum));
+        } else {
+            throw std::logic_error("Can't create new bar. Current progress bar is still in progress");
+        }
+    }
+
+private:
+    std::unique_ptr<ConsoleProgress> _bar;
+    bool _isFinished;
+};
\ No newline at end of file
diff --git a/inference-engine/samples/benchmark_app/statistics_report.cpp b/inference-engine/samples/benchmark_app/statistics_report.cpp
new file mode 100644
index 0000000..3bb0df4
--- /dev/null
+++ b/inference-engine/samples/benchmark_app/statistics_report.cpp
@@ -0,0 +1,222 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include
+#include
+#include
+#include
+#include
+
+#include "statistics_report.hpp"
+
+void StatisticsReport::add(const std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &pmStat, const double &latency) {
+    if (_config.niter > 0 && _config.niter == _performanceCounters.size()) {
+        // do not add elements for the additionally executed requests.
+        return;
+    }
+
+    _latencies.push_back(latency);
+    if (_config.report_type == medianCntReport || _config.report_type == detailedCntReport) {
+        // collect per-iteration statistics only in case of enabled median/detailed statistic collecting
+        _performanceCounters.push_back(pmStat);
+    }
+}
+
+void StatisticsReport::dump(const double &fps, const size_t &numProcessedReq, const double &totalExecTime) {
+    if (_config.report_type.empty()) {
+        slog::info << "Statistics collecting was not requested. No reports are dumped." << slog::endl;
+        return;
+    }
+
+    size_t numMeasuredReq = numProcessedReq;
+    if (_config.api == "async" && _config.niter > 0) {
+        // in this case, the number of processed requests is higher than the value of the -niter option,
+        // but we need to handle statistics for the -niter number of requests only
+        numMeasuredReq = _config.niter;
+    }
+
+    std::string separator =
+#if defined _WIN32 || defined __CYGWIN__
+# if defined UNICODE
+        L"\\";
+# else
+        "\\";
+# endif
+#else
+        "/";
+#endif
+    if (_config.report_folder.empty())
+        separator = "";
+
+    CsvDumper dumper(true, _config.report_folder + separator + "benchmark_" + _config.report_type + "_report.csv");
+
+    // The resulting number of columns in the csv file depends on the report_type. If it's noCntReport, then
+    // no PM data is collected and there are only 3 columns in the file (in the configuration section). If it's
+    // medianCntReport, then median PM values are collected per each layer and the number of columns is 6.
+    // Example from GPU:
+    //
+    // layer name;exec status;layer type;exec type;real time;cpu time;
+    // conv1;EXECUTED;Convolution;convolution_gpu_bfyx_gemm_like;615;3;
+    // Here, all the data are taken from InferenceEngine::InferenceEngineProfileInfo.
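For reference, the call pattern the `ProgressBar` wrapper above is built around: `newBar()` refuses to start while the current bar is unfinished, so every phase must end with `finish()`. A minimal usage sketch (the work inside the loop is a placeholder):

```cpp
#include "progress_bar.hpp"

// E.g. ProgressBar bar(1, false); runPhase(bar, 100);
void runPhase(ProgressBar &bar, size_t iterations) {
    bar.newBar(iterations);   // throws std::logic_error if the previous bar was not finished
    for (size_t i = 0; i < iterations; ++i) {
        // ... one unit of work ...
        bar.addProgress(1);
    }
    bar.finish();             // prints a trailing newline and allows the next newBar()
}
```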
+ // + // In case of detailedCntReport the number of columns is 4 + numMeasuredReq * 2, because first 4 parameters + // are the same but realTime and cpuTime can be different on each iteration (example from 5 GPU requests): + // conv1;EXECUTED;Convolution;convolution_gpu_bfyx_gemm_like;630,3;617,3;616,3;615,3;617,3; + size_t numOfColumns = 0; + if (_config.report_type == noCntReport) { + numOfColumns = 3; + } else if (_config.report_type == medianCntReport) { + numOfColumns = 6; + } else { + // for detailedCntReport + numOfColumns = 4 + numMeasuredReq * 2; + } + + auto completeCsvRow = [](CsvDumper &dumper, size_t numOfColumns, size_t filled) { + for (size_t i = 0; i < numOfColumns - filled; i++) + dumper << ""; + dumper.endLine(); + }; + + // dump execution configuration + dumper << "Configuration setup"; + completeCsvRow(dumper, numOfColumns, 1); + dumper << "config option" << "CLI parameter" << "value"; + completeCsvRow(dumper, numOfColumns, 3); + + dumper << "target device" << " -d" << _config.device; + completeCsvRow(dumper, numOfColumns, 3); + dumper << "execution mode" << " -api" << _config.api; + completeCsvRow(dumper, numOfColumns, 3); + dumper << "batch size" << " -b" << _config.batch; + completeCsvRow(dumper, numOfColumns, 3); + dumper << "number of iterations" << " -niter" << _config.niter; + completeCsvRow(dumper, numOfColumns, 3); + dumper << "number of parallel infer requests" << " -nireq" << _config.nireq; + completeCsvRow(dumper, numOfColumns, 3); + dumper << "number of CPU threads" << " -nthreads" << _config.cpu_nthreads; + completeCsvRow(dumper, numOfColumns, 3); + dumper << "CPU pinning enabled" << " -pin" << _config.cpu_pin; + completeCsvRow(dumper, numOfColumns, 3); + + dumper.endLine(); + + // write PM data from each iteration + if (!_performanceCounters.empty()) { + if (_config.report_type != medianCntReport && _config.report_type != detailedCntReport) { + throw std::logic_error("PM data should only be collected for median or detailed report types"); + } + + // this vector is sorted according to network layers execution order. 
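The column arithmetic described in the comments above, condensed into a standalone helper (a sketch with hypothetical names, using the report-type strings defined in statistics_report.hpp):

```cpp
#include <cstddef>
#include <string>

// no_counters: 3 configuration columns only; median_counters: 4 identification
// columns plus median real/cpu time; detailed_counters: real and cpu time
// repeated once per measured request.
size_t csvColumnCount(const std::string &reportType, size_t numMeasuredReq) {
    if (reportType == "no_counters")     return 3;
    if (reportType == "median_counters") return 6;
    return 4 + numMeasuredReq * 2;       // detailed_counters
}
```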
+ auto performanceMapSorted = preparePmStatistics(); + + dumper << "Performance counters"; + completeCsvRow(dumper, numOfColumns, 1); + dumper << "layer name" << "exec status" << "layer type" << "exec type"; + + if (_config.report_type == medianCntReport) { + dumper << "median real time" << "median cpu time"; + completeCsvRow(dumper, numOfColumns, 6); + } else { + // detailedCntReport case + for (size_t i = 0; i< _performanceCounters.size(); i++) { + dumper << "realTime_iter" + std::to_string(i) << "cpuTime_iter" + std::to_string(i); + } + completeCsvRow(dumper, numOfColumns, 4 + _performanceCounters.size() * 2); + } + + for (const auto &layer : performanceMapSorted) { + dumper << layer.first; // layer name + switch (layer.second.status) { + case InferenceEngine::InferenceEngineProfileInfo::EXECUTED: + dumper << "EXECUTED"; + break; + case InferenceEngine::InferenceEngineProfileInfo::NOT_RUN: + dumper << "NOT_RUN"; + break; + case InferenceEngine::InferenceEngineProfileInfo::OPTIMIZED_OUT: + dumper << "OPTIMIZED_OUT"; + break; + } + dumper << layer.second.layer_type << layer.second.exec_type; + + if (_config.report_type == medianCntReport) { + // write median realTime and cpuTime from each processed request for current layer + dumper << + std::to_string(getMedianValue(_perLayerRealTime[layer.first]) / 1000.0) << + std::to_string(getMedianValue(_perLayerCpuTime[layer.first]) / 1000.0); + } else { + // write all realTime and cpuTime from each processed request for current layer + for (size_t i = 0; i < numMeasuredReq; i++) { + dumper << std::to_string(_perLayerRealTime[layer.first][i] / 1000.0) << std::to_string(_perLayerCpuTime[layer.first][i] / 1000.0); + } + } + dumper.endLine(); + } + dumper.endLine(); + } + + if (_config.report_type == detailedCntReport) { + dumper << "Statistics"; + completeCsvRow(dumper, numOfColumns, 1); + + dumper << "metric"; + for (size_t i = 0; i < _latencies.size(); i++) { + // detailedCntReport case + dumper << "iter" + std::to_string(i); + } + completeCsvRow(dumper, numOfColumns, 4 + _latencies.size()); + dumper << "latencies"; + for (const auto &lat : _latencies) { + dumper << lat; + } + completeCsvRow(dumper, numOfColumns, _latencies.size()); + dumper.endLine(); + } + + dumper << "Execution results"; + completeCsvRow(dumper, numOfColumns, 1); + dumper << "number of measured infer requests" << numMeasuredReq; + completeCsvRow(dumper, numOfColumns, 2); + dumper << "latency" << getMedianValue(_latencies); + completeCsvRow(dumper, numOfColumns, 2); + dumper << "throughput" << fps; + completeCsvRow(dumper, numOfColumns, 2); + dumper << "total execution time" << totalExecTime; + completeCsvRow(dumper, numOfColumns, 2); + + slog::info << "statistics report is stored to " << dumper.getFilename() << slog::endl; +} + +double StatisticsReport::getMedianLatency() { + return getMedianValue(_latencies); +} + +std::vector> StatisticsReport::preparePmStatistics() { + if (_performanceCounters.empty()) { + throw std::logic_error("preparePmStatistics() was called when no PM data was collected"); + } + + // sort PM data of first processed request according to layers execution order + auto performanceMapSorted = perfCountersSorted(_performanceCounters[0]); + + // iterate over each processed infer request and handle its PM data + for (auto &pm : _performanceCounters) { + // iterate over each layer from sorted vector and add required PM data to the per-layer maps + for (const auto & it : performanceMapSorted) { + _perLayerRealTime[it.first].push_back(pm[it.first].realTime_uSec); + 
_perLayerCpuTime[it.first].push_back(pm[it.first].cpu_uSec); + } + } + return performanceMapSorted; +} + +template +T StatisticsReport::getMedianValue(const std::vector &vec) { + std::vector sortedVec(vec); + std::sort(sortedVec.begin(), sortedVec.end()); + return (sortedVec.size() % 2 != 0) ? + sortedVec[sortedVec.size() / 2ULL] : + (sortedVec[sortedVec.size() / 2ULL] + sortedVec[sortedVec.size() / 2ULL - 1ULL]) / static_cast(2.0); +} diff --git a/inference-engine/samples/benchmark_app/statistics_report.hpp b/inference-engine/samples/benchmark_app/statistics_report.hpp new file mode 100644 index 0000000..248d7cd --- /dev/null +++ b/inference-engine/samples/benchmark_app/statistics_report.hpp @@ -0,0 +1,67 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +// @brief statistics reports types +static constexpr char noCntReport[] = "no_counters"; +static constexpr char medianCntReport[] = "median_counters"; +static constexpr char detailedCntReport[] = "detailed_counters"; + +/// @brief Responsible for collecting of statistics and dumping to .csv file +class StatisticsReport { +public: + struct Config { + std::string device; + std::string api; + size_t batch; + size_t nireq; + size_t niter; + size_t cpu_nthreads; + std::string cpu_pin; + std::string report_type; + std::string report_folder; + }; + + explicit StatisticsReport(Config config) : _config(std::move(config)) { + if (_config.niter > 0) { + _performanceCounters.reserve(_config.niter); + } + } + + void add(const std::map &pmStat, const double &latency); + + void dump(const double &fps, const size_t &numProcessedReq, const double &totalExecTime); + + double getMedianLatency(); + +private: + std::vector> preparePmStatistics(); + + template + T getMedianValue(const std::vector &vec); + + // Contains PM data for each processed infer request + std::vector> _performanceCounters; + // Contains latency of each processed infer request + std::vector _latencies; + + // configuration of current benchmark execution + const Config _config; + + // mapping from network layer to a vector of calculated RealTime values from each processed infer request. + std::map> _perLayerRealTime; + // mapping from network layer to a vector of calculated CPU Time values from each processed infer request. + std::map> _perLayerCpuTime; +}; diff --git a/inference-engine/samples/build_samples.sh b/inference-engine/samples/build_samples.sh deleted file mode 100644 index f531f91..0000000 --- a/inference-engine/samples/build_samples.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2018 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
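A reduced model of what `preparePmStatistics()` and `getMedianValue()` above accomplish together: per-request profiling maps are transposed into per-layer time series, and each series is reduced to its median (types simplified to plain integers; a sketch, not code from the patch):

```cpp
#include <algorithm>
#include <map>
#include <string>
#include <vector>

using RequestTimings = std::map<std::string, long long>;  // layer -> time for one request

std::map<std::string, long long> medianPerLayer(const std::vector<RequestTimings> &requests) {
    // transpose: one timing sample per processed request for every layer
    std::map<std::string, std::vector<long long>> series;
    for (const auto &request : requests)
        for (const auto &layer : request)
            series[layer.first].push_back(layer.second);

    // reduce each per-layer series to its median
    std::map<std::string, long long> medians;
    for (auto &entry : series) {
        auto &v = entry.second;
        std::sort(v.begin(), v.end());
        medians[entry.first] = (v.size() % 2 != 0)
            ? v[v.size() / 2]
            : (v[v.size() / 2] + v[v.size() / 2 - 1]) / 2;  // mean of the middle pair
    }
    return medians;
}
```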
-
-error() {
-    local code="${3:-1}"
-    if [[ -n "$2" ]];then
-        echo "Error on or near line $1: $2; exiting with status ${code}"
-    else
-        echo "Error on or near line $1; exiting with status ${code}"
-    fi
-    exit "${code}"
-}
-trap 'error ${LINENO}' ERR
-
-SAMPLES_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-
-if [[ -z "${InferenceEngine_DIR}" ]]; then
-    printf "\nInferenceEngine_DIR environment variable is not set. Trying to find setupvars.sh to set it. \n"
-
-    setvars_path=$SAMPLES_PATH/../..
-    if [ -e "$setvars_path/inference_engine/bin/setvars.sh" ]; then # for Intel Deep Learning Deployment Toolkit package
-        setvars_path="$setvars_path/inference_engine/bin/setvars.sh"
-    elif [ -e "$setvars_path/../bin/setupvars.sh" ]; then # for OpenVINO package
-        setvars_path="$setvars_path/../bin/setupvars.sh"
-    elif [ -e "$setvars_path/../setupvars.sh" ]; then
-        setvars_path="$setvars_path/../setupvars.sh"
-    else
-        printf "Error: setupvars.sh is not found in hardcoded paths. \n\n"
-        exit 1
-    fi
-    if ! source $setvars_path ; then
-        printf "Unable to run ./setupvars.sh. Please check its presence. \n\n"
-        exit 1
-    fi
-fi
-
-if ! command -v cmake &>/dev/null; then
-    printf "\n\nCMAKE is not installed. It is required to build Inference Engine samples. Please install it. \n\n"
-    exit 1
-fi
-
-build_dir=$HOME/inference_engine_samples_build
-mkdir -p $build_dir
-cd $build_dir
-cmake -DCMAKE_BUILD_TYPE=Release $SAMPLES_PATH
-make -j8
-
-printf "\nBuild completed, you can find binaries for all samples in the $HOME/inference_engine_samples_build/intel64/Release subfolder.\n\n"
diff --git a/inference-engine/samples/calibration_tool/CMakeLists.txt b/inference-engine/samples/calibration_tool/CMakeLists.txt
index f69a6e7..c654336 100644
--- a/inference-engine/samples/calibration_tool/CMakeLists.txt
+++ b/inference-engine/samples/calibration_tool/CMakeLists.txt
@@ -1,9 +1,7 @@
-# Copyright (C) 2018 Intel Corporation
+# Copyright (C) 2018-2019 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
-cmake_minimum_required(VERSION 2.8)
-
 set (TARGET_NAME "calibration_tool")
 
 file (GLOB MAIN_SRC
diff --git a/inference-engine/samples/calibration_tool/README.md b/inference-engine/samples/calibration_tool/README.md
index f40c671..6e07559 100644
--- a/inference-engine/samples/calibration_tool/README.md
+++ b/inference-engine/samples/calibration_tool/README.md
@@ -3,12 +3,14 @@
 Inference Engine Calibration Tool calibrates a given FP32 model so that it can be run in low-precision 8-bit integer mode while keeping the input data of this model in the original precision.
 
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+
 ## Calibration Tool Options
 
 The core command-line options for the Calibration Tool are the same as for
-[Validation Application](./samples/validation_app/README.md). However, the Calibration Tool has the following specific options: `-t`, `-subset`, `-output`, and `-threshold`.
+[Validation Application](./inference-engine/samples/validation_app/README.md). However, the Calibration Tool has the following specific options: `-t`, `-subset`, `-output`, and `-threshold`.
 
-Running the Calibration Tool with the `-h` option yields the following usage message with all CLI options listed:
+Running the Calibration Tool with the `-h` option yields the following usage message:
 ```sh
 Usage: calibration_tool [OPTION]
@@ -25,7 +27,7 @@ Available options:
     -lbl                      Labels file path. The labels file contains names of the dataset classes
     -l                        Required for CPU custom layers. Absolute path to a shared library with the kernel implementations.
     -c                        Required for GPU custom kernels. Absolute path to an .xml file with the kernel descriptions.
-    -d                        Target device to infer on: CPU (default), GPU, FPGA, or MYRIAD. The application looks for a suitable plugin for the specified device.
+    -d                        Target device to infer on: CPU (default), GPU, FPGA, HDDL or MYRIAD. The application looks for a suitable plugin for the specified device.
     -b N                      Batch size value. If not specified, the batch size value is taken from IR
     -ppType                   Preprocessing type. Options: "None", "Resize", "ResizeCrop"
     -ppSize N                 Preprocessing size (used with ppType="ResizeCrop")
@@ -35,7 +37,7 @@ Available options:
     -subset                   Number of pictures from the whole validation set to create the calibration dataset. Default value is 0, which stands for the whole provided dataset
     -output                   Output name for calibrated model. Default is _i8.xml|bin
     -threshold                Threshold for a maximum accuracy drop of quantized model. Must be an integer number (percents) without a percent sign. Default value is 1, which stands for accepted accuracy drop in 1%
-    - stream_output           Flag for printing progress as a plain text.When used, interactive progress bar is replaced with multiline output
+    -stream_output            Flag for printing progress as a plain text. When used, interactive progress bar is replaced with multiline output
 
     Classification-specific options:
     -Czb true                 "Zero is a background" flag. Some networks are trained with a modified dataset where the class IDs are enumerated from 1, but 0 is an undefined "background" class (which is never detected)
@@ -53,6 +55,9 @@ The tool options are divided into two categories:
 2. **Network type-specific options** named as an acronym of the network type (C or OD) followed by a letter or a word.
 
+You can run the tool with public or pre-trained models. To download the pre-trained models, use the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) or go to [https://download.01.org/opencv/](https://download.01.org/opencv/).
+
+> **NOTE**: Before running the tool on a trained model, make sure the model is converted to the Inference Engine format (`*.xml` + `*.bin`) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
 
 ## Calibrate a Classification Model
 
@@ -68,7 +73,7 @@
 named as labels that contain all images of this class and ImageNet*-like format, `.txt` file containing list of images and IDs of classes.
 For more information on the structure of the datasets, refer to the **Prepare a Dataset** section of the
-[Validation Application document](./samples/validation_app/README.md).
+[Validation Application document](./inference-engine/samples/validation_app/README.md).
 If you decide to use the subset of the given dataset, use the ImageNet-like format instead of "folder as classes" format.
This brings a more accurate calibration as you are likely to get images @@ -79,11 +84,9 @@ To run the sample you can use classification models that can be downloaded with For example, to calibrate the trained Caffe\* `resnet-50` classification model, run the following command: ```bash -./calibration_tool -t C -m resnet-50.xml -i ILSVRC2012_val.txt -Czb false -ppType "ResizeCrop" -ppSize 342 -b 1 -d CPU -subset 2000 +./calibration_tool -t C -m /resnet-50.xml -i ILSVRC2012_val.txt -Czb false -ppType "ResizeCrop" -ppSize 342 -b 1 -d CPU -subset 2000 ``` -> **NOTE**: To run the tool for a model, the model should be first converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). - ## Calibrate Object Detection Model This topic demonstrates how to run the Calibration Tool on the Object Detection CNN on a set of images. Please @@ -96,7 +99,7 @@ format as the SSD CNN should be supported as well. Before you start calibrating the model, make sure your dataset is in the correct format. For more information, refer to the **Prepare a Dataset** section of the -[Validation Application document](./samples/validation_app/README.md). +[Validation Application document](./inference-engine/samples/validation_app/README.md). Once you have prepared the dataset, you can calibrate the model on it by running the following command: ```bash @@ -106,3 +109,5 @@ Once you have prepared the dataset, you can calibrate the model on it by running ## See Also * [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md) +* [Model Optimizer](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md) +* [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) diff --git a/inference-engine/samples/calibration_tool/calibrator_processors.cpp b/inference-engine/samples/calibration_tool/calibrator_processors.cpp index d4cf7fe..e6a00b8 100644 --- a/inference-engine/samples/calibration_tool/calibrator_processors.cpp +++ b/inference-engine/samples/calibration_tool/calibrator_processors.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -12,6 +12,7 @@ #include #include #include +#include #include "details/ie_cnn_network_tools.h" #include "details/caseless.hpp" @@ -37,7 +38,7 @@ CNNLayerPtr Int8Calibrator::addScaleShiftBeforeLayer(std::string name, CNNLayer: if (scale.size() == 1) { scale.resize(wdims[0]); - for (int i = 1; i < wdims[0]; i++) { + for (size_t i = 1; i < wdims[0]; i++) { scale[i] = scale[0]; } } @@ -53,7 +54,7 @@ CNNLayerPtr Int8Calibrator::addScaleShiftBeforeLayer(std::string name, CNNLayer: if (buffer == nullptr) { THROW_IE_EXCEPTION << "Could not allocate weights buffer"; } - for (size_t i = 0, idx = 0; i < pData->dims[2]; i++) { + for (size_t i = 0; i < pData->dims[2]; i++) { buffer[i] = scale[i]; } pScaleShift->_weights = weights; @@ -64,7 +65,7 @@ CNNLayerPtr Int8Calibrator::addScaleShiftBeforeLayer(std::string name, CNNLayer: biases = make_shared_blob(Precision::FP32, Layout::C, bdims); biases->allocate(); buffer = biases->buffer().as(); - for (size_t i = 0, idx = 0; i < pData->dims[2]; i++) { + for (size_t i = 0; i < pData->dims[2]; i++) { buffer[i] = 0.f; } pScaleShift->_biases = biases; @@ -94,7 +95,6 @@ CNNLayerPtr Int8Calibrator::addScaleShiftBeforeLayer(std::string name, CNNLayer: float Int8Calibrator::compare_NRMSD(InferenceEngine::Blob::Ptr res, InferenceEngine::Blob::Ptr ref) { float 
*res_ptr = res->buffer().as(); - size_t res_size = res->size(); float *ref_ptr = ref->buffer().as(); size_t ref_size = ref->size(); @@ -111,9 +111,12 @@ float Int8Calibrator::compare_NRMSD(InferenceEngine::Blob::Ptr res, InferenceEng mmin = std::min(mmin, ref_ptr[i]); mmax = std::max(mmax, ref_ptr[i]); } + if (std::fabs(ref_size) < std::numeric_limits::epsilon()) { + throw std::logic_error("ref_size can't be equal to zero"); + } sum /= ref_size; - sum = pow(sum, 0.5); + sum = pow(sum, 0.5f); sum /= mmax - mmin; @@ -149,6 +152,9 @@ void Int8Calibrator::collectFP32Statistic() { networkReaderC = InferenceEngine::CNNNetReader(); networkReaderC.ReadNetwork(_modelFileNameI8C); if (!networkReaderC.isParseSuccess()) THROW_IE_EXCEPTION << "cannot load a failed Model"; + /** Extract model name and load weights **/ + std::string binFileName = fileNameNoExt(_modelFileNameI8C) + ".bin"; + networkReaderC.ReadWeights(binFileName.c_str()); if (_cBatch == 0) { // Zero means "take batch value from the IR" _cBatch = networkReaderC.getNetwork().getBatchSize(); @@ -163,10 +169,6 @@ void Int8Calibrator::collectFP32Statistic() { networkReaderC.getNetwork().reshape(input_shapes); } - /** Extract model name and load weights **/ - std::string binFileName = fileNameNoExt(_modelFileNameI8C) + ".bin"; - networkReaderC.ReadWeights(binFileName.c_str()); - auto network = networkReaderC.getNetwork(); @@ -196,10 +198,12 @@ void Int8Calibrator::collectFP32Statistic() { // 1. add all layers as output one for (auto &&layer : network) { std::string layerType = network.getLayerByName(layer->name.c_str())->type; - if (/*layerType != "Split" &&*/layerType != "Input") { - network.addOutput(layer->name); + if (layerType != "Const") { + if (/*layerType != "Split" &&*/layerType != "Input") { + network.addOutput(layer->name); + } + _statData.registerLayer(layer->name); } - _statData.registerLayer(layer->name); } ExecutableNetwork executable_network = _pluginI8C.LoadNetwork(network, { { CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), CONFIG_VALUE(YES) } }); @@ -207,12 +211,16 @@ void Int8Calibrator::collectFP32Statistic() { } void Int8Calibrator::validateInt8Config(const InferenceEngine::NetworkStatsMap &stat, - const std::map &layersToInt8) { + const std::map &layersToInt8, + bool convertFullyConnected) { _collectByLayer = false; _collectStatistic = false; networkReaderC = InferenceEngine::CNNNetReader(); networkReaderC.ReadNetwork(_modelFileNameI8C); if (!networkReaderC.isParseSuccess()) THROW_IE_EXCEPTION << "cannot load a failed Model"; + /** Extract model name and load weights **/ + std::string binFileName = fileNameNoExt(_modelFileNameI8C) + ".bin"; + networkReaderC.ReadWeights(binFileName.c_str()); if (_cBatch == 0) { // Zero means "take batch value from the IR" _cBatch = networkReaderC.getNetwork().getBatchSize(); @@ -227,10 +235,6 @@ void Int8Calibrator::validateInt8Config(const InferenceEngine::NetworkStatsMap & networkReaderC.getNetwork().reshape(input_shapes); } - /** Extract model name and load weights **/ - std::string binFileName = fileNameNoExt(_modelFileNameI8C) + ".bin"; - networkReaderC.ReadWeights(binFileName.c_str()); - // Initialize statistic ICNNNetworkStats *pstats = nullptr; StatusCode s = ((ICNNNetwork&)networkReaderC.getNetwork()).getStats(&pstats, nullptr); @@ -239,6 +243,13 @@ void Int8Calibrator::validateInt8Config(const InferenceEngine::NetworkStatsMap & } auto network = networkReaderC.getNetwork(); + + for (auto l : network) { + if (l->type == "FullyConnected") { + l->params["quantization_level"] = 
(convertFullyConnected == false) ? "FP32" : "I8"; + } + } + for (auto l : layersToInt8) { network.getLayerByName(l.first.c_str())-> params["quantization_level"] = (l.second == false) ? "FP32" : "I8"; @@ -363,6 +374,9 @@ void Int8Calibrator::collectByLayerStatistic(const InferenceEngine::NetworkStats networkReaderC = InferenceEngine::CNNNetReader(); networkReaderC.ReadNetwork(_modelFileNameI8C); if (!networkReaderC.isParseSuccess()) THROW_IE_EXCEPTION << "cannot load a failed Model"; + /** Extract model name and load weights **/ + std::string binFileName = fileNameNoExt(_modelFileNameI8C) + ".bin"; + networkReaderC.ReadWeights(binFileName.c_str()); if (_cBatch != 0) { auto input_shapes = networkReaderC.getNetwork().getInputShapes(); std::string input_name; @@ -373,15 +387,11 @@ void Int8Calibrator::collectByLayerStatistic(const InferenceEngine::NetworkStats networkReaderC.getNetwork().reshape(input_shapes); } - /** Extract model name and load weights **/ - std::string binFileName = fileNameNoExt(_modelFileNameI8C) + ".bin"; - networkReaderC.ReadWeights(binFileName.c_str()); - auto network = networkReaderC.getNetwork(); // 1. add all layers as output one for (auto &&layer : network) { std::string layerType = network.getLayerByName(layer->name.c_str())->type; - if (/*layerType != "Split" &&*/layerType != "Input") { + if (/*layerType != "Split" &&*/layerType != "Input" && layerType != "Const") { network.addOutput(layer->name); } @@ -401,7 +411,6 @@ void Int8Calibrator::collectByLayerStatistic(const InferenceEngine::NetworkStats // currently it is only supported // if only one output from conv and if it is an output to relu - bool quattization = false; if (layerToClone->outData.size() == 1 && layerToClone->outData[0]->inputTo.size() == 1 && CaselessEq()(layerToClone->outData[0]->inputTo.begin()->second->name, "relu")) { @@ -461,16 +470,14 @@ void Int8Calibrator::collectCalibrationStatistic(size_t pics) { outName = _inputsFromLayers[l]; } - size_t N, C, statCount; + size_t N, C; if (outBlob->dims().size() == 4 && outBlob->layout() == Layout::NCHW) { // TODO(amalyshe) cahnge to using of tensor desc N = pics; C = outBlob->dims()[2]; - statCount = C; } else if (outBlob->dims().size() == 2 && outBlob->layout() == Layout::NC) { N = pics; C = outBlob->dims()[0]; - statCount = 1; } else { continue; } @@ -568,10 +575,11 @@ shared_ptr ClassificationCalibrator::Process(bool s generator.readLabels(labelFileName); } catch (InferenceEngine::details::InferenceEngineException& ex) { slog::warn << "Can't read labels file " << labelFileName << slog::endl; + slog::warn << "Error: " << ex.what() << slog::endl; } auto validationMap = generator.getValidationMap(imagesPath); - if (validationMap.size() == 0) { + if (validationMap.empty()) { THROW_IE_EXCEPTION << "The validation dataset in " << imagesPath << "is empty. 
Check the dataset file or folder and the labels file"; } @@ -580,7 +588,6 @@ shared_ptr ClassificationCalibrator::Process(bool s // ----------------------------Do inference------------------------------------------------------------- std::vector expected(batch); std::vector files(batch); - int captured = 0; if (!_nPictures) { _nPictures = validationMap.size(); @@ -599,7 +606,7 @@ shared_ptr ClassificationCalibrator::Process(bool s size_t ipics = 0; auto iter = validationMap.begin(); while (iter != validationMap.end() && ipics < _nPictures) { - int b = 0; + size_t b = 0; int filesWatched = 0; for (; b < batch && iter != validationMap.end() && ipics + b < _nPictures ; b++, iter++, filesWatched++) { expected[b] = iter->first; @@ -608,6 +615,7 @@ shared_ptr ClassificationCalibrator::Process(bool s files[b] = iter->second; } catch (const InferenceEngineException &iex) { slog::warn << "Can't read file " << iter->second << slog::endl; + slog::warn << "Error: " << iex.what() << slog::endl; // Could be some non-image file in directory b--; continue; @@ -619,12 +627,11 @@ shared_ptr ClassificationCalibrator::Process(bool s collectCalibrationStatistic(b); std::vector results; - auto firstOutputData = firstOutputBlob->buffer().as::value_type *>(); InferenceEngine::TopResults(1, *firstOutputBlob, results); - for (int i = 0; i < b; i++) { + for (size_t i = 0; i < b; i++) { int expc = expected[i]; if (zeroBackground) expc++; - bool top1Scored = (results[i] == expc); + bool top1Scored = (static_cast(results[i]) == expc); if (top1Scored) top1Result++; total++; } @@ -633,6 +640,10 @@ shared_ptr ClassificationCalibrator::Process(bool s calculateLayersAccuracyDrop(); + if (total == 0) { + throw std::logic_error("total can't be equal to zero"); + } + im.AccuracyResult = static_cast(top1Result) / static_cast(total); return std::shared_ptr(new CalibrationMetrics(im)); @@ -675,19 +686,14 @@ shared_ptr SSDObjectDetectionCalibrator::Process(bo for (auto &ann : annCollector.annotations()) { std::list dobList; for (auto &obj : ann.objects) { - DetectedObject dob(classes[obj.name], obj.bndbox.xmin, obj.bndbox.ymin, obj.bndbox.xmax, obj.bndbox.ymax, 1.0, obj.difficult != 0); + DetectedObject dob(classes[obj.name], static_cast(obj.bndbox.xmin), static_cast(obj.bndbox.ymin), + static_cast(obj.bndbox.xmax), static_cast(obj.bndbox.ymax), 1.0f, obj.difficult != 0); dobList.push_back(dob); } ImageDescription id(dobList); desiredForFiles.insert(std::pair(ann.folder + "/" + (!subdir.empty() ? subdir + "/" : "") + ann.filename, id)); } - - ImageDecoder decoder; - - const int maxProposalCount = outputDims[1]; - const int objectSize = outputDims[0]; - for (auto &item : outInfo) { DataPtr outputData = item.second; if (!outputData) { @@ -718,18 +724,17 @@ shared_ptr SSDObjectDetectionCalibrator::Process(bo while (iter != annCollector.annotations().end() && ipics < _nPictures) { std::vector files; - int b = 0; + size_t b = 0; int filesWatched = 0; for (; b < batch && iter != annCollector.annotations().end(); b++, iter++, filesWatched++) { expected[b] = *iter; string filename = iter->folder + "/" + (!subdir.empty() ? 
subdir + "/" : "") + iter->filename; try { - Size orig_size = decoder.insertIntoBlob(std::string(imagesPath) + "/" + filename, b, *firstInputBlob, preprocessingOptions); float scale_x, scale_y; - scale_x = 1.0 / iter->size.width; // orig_size.width; - scale_y = 1.0 / iter->size.height; // orig_size.height; + scale_x = 1.0f / iter->size.width; // orig_size.width; + scale_y = 1.0f / iter->size.height; // orig_size.height; if (scaleProposalToInputSize) { scale_x *= firstInputBlob->dims()[0]; @@ -742,6 +747,7 @@ shared_ptr SSDObjectDetectionCalibrator::Process(bo files.push_back(filename); } catch (const InferenceEngineException &iex) { slog::warn << "Can't read file " << this->imagesPath + "/" + filename << slog::endl; + slog::warn << "Error: " << iex.what() << slog::endl; // Could be some non-image file in directory b--; continue; @@ -749,9 +755,6 @@ shared_ptr SSDObjectDetectionCalibrator::Process(bo ipics++; } - InferenceEngine::StatusCode sts; - InferenceEngine::ResponseDesc dsc; - // Infer model Infer(progress, filesWatched, im); collectCalibrationStatistic(b); @@ -761,9 +764,9 @@ shared_ptr SSDObjectDetectionCalibrator::Process(bo // Calculating similarity // - for (int b = 0; b < files.size(); b++) { - ImageDescription result(detectedObjects[files[b]]); - im.apc.consumeImage(result, scaledDesiredForFiles.at(files[b])); + for (size_t j = 0; j < files.size(); j++) { + ImageDescription result(detectedObjects[files[j]]); + im.apc.consumeImage(result, scaledDesiredForFiles.at(files[j])); } } progress.finish(); @@ -779,7 +782,7 @@ shared_ptr SSDObjectDetectionCalibrator::Process(bo for (auto i : appc) { mAP += i.second; } - imCalibration.AccuracyResult = mAP / appc.size(); + imCalibration.AccuracyResult = static_cast(mAP / appc.size()); } return std::shared_ptr(new CalibrationMetrics(imCalibration)); } diff --git a/inference-engine/samples/calibration_tool/calibrator_processors.h b/inference-engine/samples/calibration_tool/calibrator_processors.h index 05e7c1e..fdcfc12 100644 --- a/inference-engine/samples/calibration_tool/calibrator_processors.h +++ b/inference-engine/samples/calibration_tool/calibrator_processors.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -66,9 +66,11 @@ public: * @param stat - The statistic for normalization * @param layersToInt8 - list of layers planned to be executed in int8. 
if layer is absent in this * map, it is assumed that it will be executed in int8 + * @param convertFullyConnected - should the FullyConnected layers be converted into Int8 or not */ void validateInt8Config(const InferenceEngine::NetworkStatsMap &stat, - const std::map& layersToInt8); + const std::map& layersToInt8, + bool convertFullyConnected); /** * Statistic collected in the collectFP32Statistic is processed with threshold passed as a parameter @@ -105,7 +107,7 @@ protected: InferenceEngine::InferRequest _inferRequestI8C; int _cBatch = 0; - int _nPictures; + size_t _nPictures; private: /** diff --git a/inference-engine/samples/calibration_tool/data_stats.cpp b/inference-engine/samples/calibration_tool/data_stats.cpp index ba17e55..ecee50b 100644 --- a/inference-engine/samples/calibration_tool/data_stats.cpp +++ b/inference-engine/samples/calibration_tool/data_stats.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -90,7 +90,7 @@ void AggregatedDataStats::getDataMinMax(const std::string& name, size_t channel, minValues.push_back(tsS.getMinValue()); } // define number of elements to throw out - size_t elementToTake = maxValues.size() * threshold / 100; + size_t elementToTake = static_cast(maxValues.size() * (threshold / 100)); int elementsToThrow = maxValues.size() - elementToTake; std::sort(maxValues.begin(), maxValues.end()); std::sort(minValues.begin(), minValues.end()); diff --git a/inference-engine/samples/calibration_tool/data_stats.h b/inference-engine/samples/calibration_tool/data_stats.h index 0d8b4de..9f2c375 100644 --- a/inference-engine/samples/calibration_tool/data_stats.h +++ b/inference-engine/samples/calibration_tool/data_stats.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/samples/calibration_tool/main.cpp b/inference-engine/samples/calibration_tool/main.cpp index cd01014..90ee2b0 100644 --- a/inference-engine/samples/calibration_tool/main.cpp +++ b/inference-engine/samples/calibration_tool/main.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -38,8 +38,6 @@ using namespace InferenceEngine::details; using InferenceEngine::details::InferenceEngineException; -#define DEFAULT_PATH_P "./lib" - /// @brief Message for help argument static const char help_message[] = "Print a help message"; /// @brief Message for images argument @@ -56,7 +54,7 @@ static const char model_message[] = "Required. Path to an .xml file with a train static const char plugin_message[] = "Plugin name. For example, CPU. If this parameter is passed, " "the sample looks for a specified plugin only."; /// @brief Message for assigning cnn calculation to device -static const char target_device_message[] = "Target device to infer on: CPU (default), GPU, FPGA, or MYRIAD." +static const char target_device_message[] = "Target device to infer on: CPU (default), GPU, FPGA, HDDL or MYRIAD." " The application looks for a suitable plugin for the specified device."; /// @brief Message for label argument static const char label_message[] = "Path to a file with labels for a model"; @@ -99,9 +97,12 @@ static const char zero_background_message[] = "\"Zero is a background\" flag. 
So " are enumerated from 1, but 0 is an undefined \"background\" class" " (which is never detected)"; -static const char stream_output_message[] = "Flag for printing progress as a plain text.When used, interactive progress" +static const char stream_output_message[] = "Flag for printing progress as a plain text. When used, interactive progress" " bar is replaced with multiline output"; +static const char convert_fc_message[] = "Convert FullyConnected layers to Int8 or not (false by default)"; + + /// @brief Network type options and their descriptions static const char* types_descriptions[][2] = { { "C", "calibrate Classification network and write the calibrated network to IR" }, @@ -139,7 +140,7 @@ DEFINE_string(p, "", plugin_message); DEFINE_string(OCl, "", label_message); /// @brief Define parameter for a path to plugins
/// Default is ./lib
-DEFINE_string(pp, DEFAULT_PATH_P, plugin_path_message);
+DEFINE_string(pp, "", plugin_path_message);

/// @brief Define parameter for a target device to infer on
DEFINE_string(d, "CPU", target_device_message); /// @brief Define parameter for batch size
@@ -189,6 +190,8 @@ DEFINE_string(output, "", output_model_name); DEFINE_string(lbl, "", labels_file_message); +DEFINE_bool(convert_fc, false, convert_fc_message); + /** * @brief This function shows a help message */ @@ -250,7 +253,8 @@ std::string strtolower(const std::string& s) { void SaveCalibratedIR(const std::string &originalName, const std::string &outModelName, const std::map& layersToInt8, - const InferenceEngine::NetworkStatsMap& statMap) { + const InferenceEngine::NetworkStatsMap& statMap, + bool convertFullyConnected) { slog::info << "Layers profile for Int8 quantization\n"; CNNNetReader networkReader; networkReader.ReadNetwork(originalName); @@ -271,6 +275,14 @@ void SaveCalibratedIR(const std::string &originalName, layer->params["quantization_level"] = "I8"; std::cout << layer->name << ": " << "I8" << std::endl; } + } else if (CaselessEq()(layer->type, "fullyconnected")) { + if (!convertFullyConnected) { + layer->params["quantization_level"] = "FP32"; + std::cout << layer->name << ": " << "FP32" << std::endl; + } else { + layer->params["quantization_level"] = "I8"; + std::cout << layer->name << ": " << "I8" << std::endl; + } } } @@ -340,7 +352,7 @@ int main(int argc, char *argv[]) { // ---------------------Loading plugin for Inference Engine------------------------------------------------ slog::info << "Loading plugin" << slog::endl; /** Loading the library with extensions if provided**/ - InferencePlugin plugin = PluginDispatcher({ FLAGS_pp, "../../../lib/intel64", "" }).getPluginByDevice(FLAGS_d); + InferencePlugin plugin = PluginDispatcher({ FLAGS_pp }).getPluginByDevice(FLAGS_d); /** Loading default extensions **/ if (FLAGS_d.find("CPU") != std::string::npos) { @@ -436,7 +448,7 @@ int main(int argc, char *argv[]) { for (float threshold = 100.0f; threshold > 95.0f; threshold -= 0.5) { std::cout << "Validate int8 accuracy, threshold for activation statistics = " << threshold << std::endl; InferenceEngine::NetworkStatsMap tmpStatMap = calibrator->getStatistic(threshold); - calibrator->validateInt8Config(tmpStatMap, {}); + calibrator->validateInt8Config(tmpStatMap, {}, FLAGS_convert_fc); shared_ptr pIM_I8 = processor->Process(FLAGS_stream_output); const CalibrationMetrics *mI8 = dynamic_cast(pIM_I8.get()); if (maximalAccuracy < mI8->AccuracyResult) { @@ -472,7 +484,7 @@ int main(int argc, char *argv[]) { while (it != orderedLayersAccuracyDrop.crend() && bAccuracy == false) { slog::info << "Returning of '" << it->second << "' to FP32 precision, start validation\n"; layersToInt8[it->second] = false; - calibrator->validateInt8Config(statMap, layersToInt8); + calibrator->validateInt8Config(statMap, layersToInt8, FLAGS_convert_fc); pIM_I8 = processor->Process(FLAGS_stream_output); mI8 = dynamic_cast(pIM_I8.get()); maximalAccuracy = mI8->AccuracyResult; @@ -494,7 +506,7 @@ int main(int argc, char *argv[]) { "current Int8 configuration accuracy: " << OUTPUT_FLOATING(100.0 * maximalAccuracy) << "% " << "with threshold for activation statistic: " << bestThreshold << "%" << std::endl; std::string outModelName = FLAGS_output.empty() ? 
fileNameNoExt(FLAGS_m) + "_i8" : fileNameNoExt(FLAGS_output); - SaveCalibratedIR(FLAGS_m, outModelName, layersToInt8, statMap); + SaveCalibratedIR(FLAGS_m, outModelName, layersToInt8, statMap, FLAGS_convert_fc); } else { slog::info << "Required threshold of accuracy drop cannot be achieved with any int8 quantization\n"; } @@ -502,7 +514,7 @@ int main(int argc, char *argv[]) { std::cout << "Collected activation statistics, writing maximum values to IR" << std::endl; statMap = calibrator->getStatistic(100.0f); std::string outModelName = FLAGS_output.empty() ? fileNameNoExt(FLAGS_m) + "_i8" : fileNameNoExt(FLAGS_output); - SaveCalibratedIR(FLAGS_m, outModelName, layersToInt8, statMap); + SaveCalibratedIR(FLAGS_m, outModelName, layersToInt8, statMap, FLAGS_convert_fc); } if (dumper.dumpEnabled()) { @@ -521,7 +533,6 @@ int main(int argc, char *argv[]) { showUsage(); return ex.list().begin()->exitCode(); } else { - const char* s = ex.what(); slog::err << "Input problems: \n" << ex.what() << slog::endl; showUsage(); return ex.list().begin()->exitCode(); diff --git a/inference-engine/samples/classification_sample/CMakeLists.txt b/inference-engine/samples/classification_sample/CMakeLists.txt index 4c80190..1dab0c9 100644 --- a/inference-engine/samples/classification_sample/CMakeLists.txt +++ b/inference-engine/samples/classification_sample/CMakeLists.txt @@ -1,9 +1,7 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required(VERSION 2.8) - set (TARGET_NAME "classification_sample") file (GLOB SRC diff --git a/inference-engine/samples/classification_sample/README.md b/inference-engine/samples/classification_sample/README.md index 26e943b..348e90f 100644 --- a/inference-engine/samples/classification_sample/README.md +++ b/inference-engine/samples/classification_sample/README.md @@ -1,14 +1,23 @@ -# Image Classification Sample +# Image Classification C++ Sample -This topic demonstrates how to run the Image Classification sample application, which performs +This topic demonstrates how to run the Image Classification sample application, which performs inference using image classification networks such as AlexNet and GoogLeNet. -## Running +> **NOTE:** This topic describes usage of C++ implementation of the Image Classification Sample. For the Python* implementation, refer to [Image Classification Python* Sample](./inference-engine/ie_bridges/python/sample/classification_sample/README.md). + +## How It Works + +Upon the start-up, the sample application reads command line parameters and loads a network and an image to the Inference +Engine plugin. When inference is done, the application creates an +output image and outputs data to the standard output stream. + +> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md). 
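The note above offers manual channel rearrangement as the alternative to reconverting the model with `--reverse_input_channels`. For a planar NCHW float blob, that comes down to swapping the first and third planes (an illustrative sketch only, not code from the sample):

```cpp
#include <algorithm>
#include <cstddef>

// Swap the B and R planes of one 3-channel image stored plane-by-plane:
// [plane 0 | plane 1 | plane 2], each `spatialSize` = H*W floats long.
void swapBRPlanes(float *image, size_t channels, size_t spatialSize) {
    if (channels != 3) return;  // only meaningful for BGR/RGB data
    std::swap_ranges(image, image + spatialSize, image + 2 * spatialSize);
}
```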
-Running the application with the -h option yields the following usage message: +## Running +Running the application with the `-h` option yields the following usage message: ```sh ./classification_sample -h -InferenceEngine: +InferenceEngine: API version ............ Build .................. @@ -19,13 +28,13 @@ Options: -i "" "" Required. Path to a folder with images or path to an image files: a .ubyte file for LeNet and a .bmp file for the other networks. -m "" Required. Path to an .xml file with a trained model. - -l "" Optional. Absolute path to library with MKL-DNN (CPU) custom layers (*.so). + -l "" Required for CPU custom layers. Absolute path to a shared library with the kernels implementations. Or - -c "" Optional. Absolute path to clDNN (GPU) custom layers config (*.xml). + -c "" Required for GPU custom kernels. Absolute path to the .xml file with the kernels descriptions. -pp "" Path to a plugin folder. - -d "" Specify the target device to infer on; CPU, GPU, FPGA or MYRIAD is acceptable. Sample will look for a suitable plugin for device specified - -nt "" Number of top results (default 10) - -ni "" Number of iterations (default 1) + -d "" Specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD is acceptable. Sample will look for a suitable plugin for device specified + -nt "" Number of top results. Default value is 10 + -ni "" Number of iterations. Default value is 1 -pc Enables per-layer performance report -p_msg Enables messages from a plugin @@ -33,32 +42,27 @@ Options: Running the application with the empty list of options yields the usage message given above. -To run the sample you can use AlexNet and GoogLeNet models that can be downloaded with the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) or other image classification models. +To run the sample, you can use AlexNet and GoogLeNet or other public or pre-trained image classification models. To download the pre-trained models, use the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) or go to [https://download.01.org/opencv/](https://download.01.org/opencv/). -> **IMPORTANT**: To run the sample, the model should be first converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). +> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). -For example, to perform inference of an AlexNet model (previously converted to the Inference Engine format) on CPU, use the following command: +For example, to perform inference of an AlexNet model on CPU, use the following command: ```sh ./classification_sample -i /cat.bmp -m /alexnet_fp32.xml ``` -### Outputs +## Demo Output + +By default the application outputs top-10 inference results. +Add the `-nt` option to the previous command to modify the number of top output results. -By default the application outputs top-10 inference results. -Add the -nt option to the previous command to modify the number of top output results. -
For example, to get the top-5 results on Intel® HD Graphics, use the following commands: +For example, to get the top-5 results on GPU, use the following commands: ```sh ./classification_sample -i /cat.bmp -m /alexnet_fp32.xml -nt 5 -d GPU ``` -### How it works - -Upon the start-up the sample application reads command line parameters and loads a network and an image to the Inference -Engine plugin. When inference is done, the application creates an -output image and outputs data to the standard output stream. - -## See Also +## See Also * [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md) -* [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md) -* [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) \ No newline at end of file +* [Model Optimizer](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md) +* [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) diff --git a/inference-engine/samples/classification_sample/classification_sample.h b/inference-engine/samples/classification_sample/classification_sample.h index 9bf4a61..7b84e6a 100644 --- a/inference-engine/samples/classification_sample/classification_sample.h +++ b/inference-engine/samples/classification_sample/classification_sample.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -29,25 +29,25 @@ static const char plugin_path_message[] = "Path to a plugin folder."; static const char model_message[] = "Required. Path to an .xml file with a trained model."; /// @brief message for assigning cnn calculation to device -static const char target_device_message[] = "Specify the target device to infer on; CPU, GPU, FPGA or MYRIAD is acceptable. " \ +static const char target_device_message[] = "Specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD is acceptable. " \ "Sample will look for a suitable plugin for device specified (CPU by default)"; /// @brief message for performance counters static const char performance_counter_message[] = "Enables per-layer performance report"; /// @brief message for top results number -static const char ntop_message[] = "Number of top results (default 10)"; +static const char ntop_message[] = "Number of top results. Default value is 10"; /// @brief message for iterations count -static const char iterations_count_message[] = "Number of iterations (default 1)"; +static const char iterations_count_message[] = "Number of iterations. Default value is 1"; /// @brief message for clDNN custom kernels desc -static const char custom_cldnn_message[] = "Required for clDNN (GPU)-targeted custom kernels."\ - "Absolute path to the xml file with the kernels desc."; +static const char custom_cldnn_message[] = "Required for GPU custom kernels. "\ + "Absolute path to the .xml file with the kernels descriptions."; /// @brief message for user library argument -static const char custom_cpu_library_message[] = "Required for MKLDNN (CPU)-targeted custom layers." \ - "Absolute path to a shared library with the kernels impl."; +static const char custom_cpu_library_message[] = "Required for CPU custom layers. 
" \ + "Absolute path to a shared library with the kernels implementations."; /// @brief message for plugin messages static const char plugin_message[] = "Enables messages from a plugin"; @@ -70,7 +70,7 @@ DEFINE_string(pp, "", plugin_path_message); DEFINE_string(d, "CPU", target_device_message); /// @brief Top results number (default 10)
-DEFINE_int32(nt, 10, ntop_message); +DEFINE_uint32(nt, 10, ntop_message); /// @brief Enable per-layer performance report DEFINE_bool(pc, false, performance_counter_message); @@ -84,7 +84,7 @@ DEFINE_string(c, "", custom_cldnn_message); DEFINE_string(l, "", custom_cpu_library_message); /// @brief Iterations count (default 1) -DEFINE_int32(ni, 1, iterations_count_message); +DEFINE_uint32(ni, 1, iterations_count_message); /// @brief Enable plugin messages DEFINE_bool(p_msg, false, plugin_message); diff --git a/inference-engine/samples/classification_sample/main.cpp b/inference-engine/samples/classification_sample/main.cpp index bf29415..422e737 100644 --- a/inference-engine/samples/classification_sample/main.cpp +++ b/inference-engine/samples/classification_sample/main.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -15,6 +16,7 @@ #include #include #include +#include #include "classification_sample.h" @@ -68,7 +70,7 @@ int main(int argc, char *argv[]) { // --------------------------- 1. Load Plugin for inference engine ------------------------------------- slog::info << "Loading plugin" << slog::endl; - InferencePlugin plugin = PluginDispatcher({ FLAGS_pp, "../../../lib/intel64" , "" }).getPluginByDevice(FLAGS_d); + InferencePlugin plugin = PluginDispatcher({ FLAGS_pp }).getPluginByDevice(FLAGS_d); if (FLAGS_p_msg) { static_cast(plugin)->SetLogCallback(error_listener); } @@ -242,7 +244,7 @@ int main(int argc, char *argv[]) { double total = 0.0; /** Start inference & calc performance **/ - for (int iter = 0; iter < FLAGS_ni; ++iter) { + for (size_t iter = 0; iter < FLAGS_ni; ++iter) { auto t0 = Time::now(); infer_request.Infer(); auto t1 = Time::now(); @@ -256,24 +258,16 @@ int main(int argc, char *argv[]) { slog::info << "Processing output blobs" << slog::endl; const Blob::Ptr output_blob = infer_request.GetBlob(firstOutputName); - auto output_data = output_blob->buffer().as::value_type*>(); /** Validating -nt value **/ - const int resultsCnt = output_blob->size() / batchSize; + const size_t resultsCnt = output_blob->size() / batchSize; if (FLAGS_nt > resultsCnt || FLAGS_nt < 1) { slog::warn << "-nt " << FLAGS_nt << " is not available for this network (-nt should be less than " \ << resultsCnt+1 << " and more than 0)\n will be used maximal value : " << resultsCnt; FLAGS_nt = resultsCnt; } - /** This vector stores id's of top N results **/ - std::vector results; - TopResults(FLAGS_nt, *output_blob, results); - - std::cout << std::endl << "Top " << FLAGS_nt << " results:" << std::endl << std::endl; - /** Read labels from file (e.x. 
AlexNet.labels) **/ - bool labelsEnabled = false; std::string labelFileName = fileNameNoExt(FLAGS_m) + ".labels"; std::vector labels; @@ -285,26 +279,17 @@ int main(int argc, char *argv[]) { trim(strLine); labels.push_back(strLine); } - labelsEnabled = true; } - /** Print the result iterating over each batch **/ - for (int image_id = 0; image_id < batchSize; ++image_id) { - std::cout << "Image " << imageNames[image_id] << std::endl << std::endl; - for (size_t id = image_id * FLAGS_nt, cnt = 0; cnt < FLAGS_nt; ++cnt, ++id) { - std::cout.precision(7); - /** Getting probability for resulting class **/ - const auto result = output_data[results[id] + image_id*(output_blob->size() / batchSize)]; - std::cout << std::left << std::fixed << results[id] << " " << result; - if (labelsEnabled) { - std::cout << " label " << labels[results[id]] << std::endl; - } else { - std::cout << " label #" << results[id] << std::endl; - } - } - std::cout << std::endl; - } + ClassificationResult classificationResult(output_blob, imageNames, + batchSize, FLAGS_nt, + labels); + classificationResult.print(); + // ----------------------------------------------------------------------------------------------------- + if (std::fabs(total) < std::numeric_limits::epsilon()) { + throw std::logic_error("total can't be equal to zero"); + } std::cout << std::endl << "total inference time: " << total << std::endl; std::cout << "Average running time of one iteration: " << total / static_cast(FLAGS_ni) << " ms" << std::endl; std::cout << std::endl << "Throughput: " << 1000 * static_cast(FLAGS_ni) * batchSize / total << " FPS" << std::endl; diff --git a/inference-engine/samples/classification_sample_async/CMakeLists.txt b/inference-engine/samples/classification_sample_async/CMakeLists.txt index 96e6e41..9e37440 100644 --- a/inference-engine/samples/classification_sample_async/CMakeLists.txt +++ b/inference-engine/samples/classification_sample_async/CMakeLists.txt @@ -1,9 +1,7 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required(VERSION 2.8) - set (TARGET_NAME "classification_sample_async") file (GLOB SRC diff --git a/inference-engine/samples/classification_sample_async/README.md b/inference-engine/samples/classification_sample_async/README.md index 995a5d6..e5feedf 100644 --- a/inference-engine/samples/classification_sample_async/README.md +++ b/inference-engine/samples/classification_sample_async/README.md @@ -1,24 +1,39 @@ -# Image Classification Sample Async +# Image Classification C++ Sample Async This sample demonstrates how to build and execute inference in pipelined mode on example of classifications networks. -The pipelined mode might increase the throghput of the pictures. The latency of one inference will be the same as for syncronious execution. -
-The throughput is increased due to follow reasons:
-* Some plugins have heterogenity inside themselves. Transferring of data, execution on remote device, doigin pre-processing and post-processing on the host
-* Using of explicit heterogenious plugin with execution of different parts of network on differnt devices
+> **NOTE:** This topic describes usage of C++ implementation of the Image Classification Sample Async. For the Python* implementation, refer to [Image Classification Python* Sample Async](./inference-engine/ie_bridges/python/sample/classification_sample_async/README.md).
-When two and more devices are involved in inference process of one picture, creation of several infer requests and starting of asynchronious inference allows to utilize devices the most efficient way.
-If two devices are involved in execution, the most optimal value for -nireq option is 2
-To do this efficiently, Classification Sample Async uses round-robin algorithm for infer requests. It starts execution for the current infer request and swith for the waiting of results for previous one. After finishing of wait, it switches infer requsts and repeat the procedure.
+The pipelined mode might increase the throughput of the pictures. The latency of one inference will be the same as for synchronous execution.
-Another required aspect of seeing good throughput is number of iterations. Only having big number of iterations you can emulate the real application work and see performance
+The throughput increases due to the following reasons:
+* Some plugins have heterogeneity inside themselves: data transfer, execution on a remote device, and pre-processing and post-processing on the host
+* Using an explicit heterogeneous plugin to execute different parts of the network on different devices
+
+When two or more devices process one image, creating several infer requests and starting asynchronous inference allows for using the devices in the most efficient way.
+If two devices are involved in execution, the optimal value for the `-nireq` option is 2.
+
+To process infer requests more efficiently, Classification Sample Async uses a round-robin algorithm. It starts execution of the current infer request and switches to waiting for results of the previous one. After the wait finishes, it switches infer requests and repeats the procedure.
+
+Another required aspect of good throughput is the number of iterations. Only with a big number of iterations can you emulate the real application work and get good performance.
 The batch mode is an independent attribute on the pipelined mode. Pipelined mode works efficiently with any batch size.
+## How It Works
+
+Upon start-up, the sample application reads command line parameters and loads a network and an image to the Inference
+Engine plugin.
+Then the application creates several infer requests, as specified in the `-nireq` parameter, and loads images for inference.
+
+Then, in a loop, it starts inference for the current infer request and switches to waiting for the previous one. When results are ready, it swaps infer requests.
+
+When inference is done, the application outputs data to the standard output stream.
+
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified.
For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md). + ## Running -Running the application with the -h option yields the following usage message: +Running the application with the `-h` option yields the following usage message: ```sh ./classification_sample_async -h InferenceEngine: @@ -36,50 +51,47 @@ Options: -m "" Required. Path to an .xml file with a trained model. -l "" - Optional. Absolute path to library with MKL-DNN (CPU) custom layers (*.so). + Required for CPU. Absolute path to a shared library with the kernel implementations Or -c "" - Optional. Absolute path to clDNN (GPU) custom layers config (*.xml). + Required for GPU custom kernels. Absolute path to the .xml file with kernel descriptions -pp "" - Path to a plugin folder. + Optional. Path to a plugin folder. -d "" - Specify the target device to infer on; CPU, GPU, FPGA or MYRIAD is acceptable. Sample will look for a suitable plugin for device specified + Optional. Specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD is acceptable. Sample will look for a suitable plugin for device specified. Default value is "CPU". -nt "" - Number of top results (default 10) + Optional. Number of top results. Default value is 10. -ni "" - Number of iterations (default 1) + Optional. Number of iterations. Default value is 1. -pc - Enables per-layer performance report + Optional. Enables per-layer performance report -nireq "" - Number of infer request for pipelined mode (default 1) + Optional. Number of infer request for pipelined mode. Default value is 1. -p_msg - Enables messages from a plugin - + Optional. Enables messages from a plugin + -nthreads "" + Optional. Number of threads to use for inference on the CPU (including HETERO cases) + -pin "YES"/"NO" + Optional. Enable ("YES", default) or disable ("NO") CPU threads pinning for CPU-involved inference ``` Running the application with the empty list of options yields the usage message given above and an error message. -You can do inference on an image using a trained AlexNet network on FPGA with fallback to Intel® Processors using the following command: +To run the sample, use AlexNet and GoogLeNet or other public or pre-trained image classification models. To download the pre-trained models, use the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) or go to [https://download.01.org/opencv/](https://download.01.org/opencv/). + +> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). + +You can do inference on an image using a trained AlexNet network on FPGA with fallback to CPU using the following command: ```sh ./classification_sample_async -i /cat.bmp -m /alexnet_fp32.xml -nt 5 -d HETERO:FPGA,CPU -nireq 2 -ni 200 ``` -> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). - -### Outputs +## Sample Output By default the application outputs top-10 inference results for each infer request. In addition to this information it will provide throughput value measured in frames per seconds. 
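For illustration only, not part of the patch: a minimal C++ sketch of the round-robin scheme described above, assuming a hypothetical `executable_network` obtained from `plugin.LoadNetwork()` and `niter` holding the `-ni` value. With two requests, the host always has one request executing on the device while it waits on the other.

```cpp
#include <utility>
#include <vector>
#include <inference_engine.hpp>

// Round-robin over two infer requests: start the current one, wait on the one
// started in the previous iteration, then swap the two indices. Input blobs
// are assumed to be filled before the loop.
void inferPipelined(InferenceEngine::ExecutableNetwork &executable_network, size_t niter) {
    std::vector<InferenceEngine::InferRequest> requests = {
        executable_network.CreateInferRequest(),
        executable_network.CreateInferRequest()
    };
    size_t current = 0, previous = 1;
    for (size_t iter = 0; iter <= niter; ++iter) {
        if (iter < niter) {
            requests[current].StartAsync();  // queue work for the device
        }
        if (iter > 0) {
            // block until the request started one iteration ago is done
            requests[previous].Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
        }
        std::swap(current, previous);
    }
}
```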
-### How it works - -Upon the start-up the sample application reads command line parameters and loads a network and an image to the Inference -Engine plugin. -Then application creates several infer requests pointed in -nireq parameter and loads pictures for inference. - -Then in the loop it starts inference for the current infer request and switch for waiting of another one. When results are ready, infer requests will be swapped. - -When inference is done, the application outputs data to the standard output stream. - ## See Also * [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md) +* [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) +* [Model Optimizer](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md) diff --git a/inference-engine/samples/classification_sample_async/classification_sample_async.h b/inference-engine/samples/classification_sample_async/classification_sample_async.h index c0a202c..2a44ac3 100644 --- a/inference-engine/samples/classification_sample_async/classification_sample_async.h +++ b/inference-engine/samples/classification_sample_async/classification_sample_async.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -23,45 +23,45 @@ static const char image_message[] = "Required. Path to a folder with images or p "and a .bmp file for the other networks."; /// @brief message for plugin_path argument -static const char plugin_path_message[] = "Path to a plugin folder."; +static const char plugin_path_message[] = "Optional. Path to a plugin folder."; /// @brief message for model argument static const char model_message[] = "Required. Path to an .xml file with a trained model."; /// @brief message for assigning cnn calculation to device -static const char target_device_message[] = "Specify the target device to infer on; CPU, GPU, FPGA or MYRIAD is acceptable. " \ - "Sample will look for a suitable plugin for device specified (CPU by default)"; +static const char target_device_message[] = "Optional. Specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD is acceptable. " \ + "Sample will look for a suitable plugin for device specified. Default value is CPU"; /// @brief message for performance counters -static const char performance_counter_message[] = "Enables per-layer performance report"; +static const char performance_counter_message[] = "Optional. Enables per-layer performance report"; /// @brief message for top results number -static const char ntop_message[] = "Number of top results (default 10)"; +static const char ntop_message[] = "Optional. Number of top results. Default value is 10."; /// @brief message for iterations count -static const char iterations_count_message[] = "Number of iterations (default 1)"; +static const char iterations_count_message[] = "Optional. Number of iterations. Default value is 1."; /// @brief message for iterations count -static const char ninfer_request_message[] = "Number of infer request for pipelined mode (default 1)"; +static const char ninfer_request_message[] = "Optional. Number of infer request for pipelined mode. Default value is 1."; /// @brief message for #threads for CPU inference static const char infer_num_threads_message[] = "Optional. 
Number of threads to use for inference on the CPU "
-        "(including Hetero cases).";
+        "(including HETERO cases).";

 /// @brief message for clDNN custom kernels desc
-static const char custom_cldnn_message[] = "Required for clDNN (GPU)-targeted custom kernels."\
-        "Absolute path to the xml file with the kernels desc.";
+static const char custom_cldnn_message[] = "Required for GPU custom kernels. "\
+        "Absolute path to the .xml file with kernels description.";

 /// @brief message for user library argument
-static const char custom_cpu_library_message[] = "Required for MKLDNN (CPU)-targeted custom layers." \
-        "Absolute path to a shared library with the kernels impl.";
+static const char custom_cpu_library_message[] = "Required for CPU custom layers. " \
+        "Absolute path to a shared library with the kernels implementation.";

 // @brief message for CPU threads pinning option
-static const char cpu_threads_pinning_message[] = "Optional. Enable (\"YES\"default) or disable (\"NO\")" \
+static const char cpu_threads_pinning_message[] = "Optional. Enable (\"YES\", default) or disable (\"NO\") " \
        "CPU threads pinning for CPU-involved inference.";

 /// @brief message for plugin messages
-static const char plugin_message[] = "Enables messages from a plugin";
+static const char plugin_message[] = "Optional. Enables messages from a plugin";

 /// @brief Define flag for showing help message
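As context for the `-nthreads` and `-pin` messages above, a hedged sketch (not from the patch) of how such flags typically reach the CPU plugin: both map to standard configuration keys from `ie_plugin_config.hpp`. The helper name and parameters are illustrative.

```cpp
#include <map>
#include <string>
#include <inference_engine.hpp>

// Forward -nthreads and -pin values to the CPU plugin via standard config keys.
void configureCpuPlugin(InferenceEngine::InferencePlugin &plugin,
                        unsigned nthreads, const std::string &pin /* "YES" or "NO" */) {
    std::map<std::string, std::string> config;
    if (nthreads != 0) {
        // limit the number of CPU threads used for inference
        config[InferenceEngine::PluginConfigParams::KEY_CPU_THREADS_NUM] = std::to_string(nthreads);
    }
    // enable or disable pinning of inference threads to physical cores
    config[InferenceEngine::PluginConfigParams::KEY_CPU_BIND_THREAD] = pin;
    plugin.SetConfig(config);
}
```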
@@ -82,7 +82,7 @@ DEFINE_string(pp, "", plugin_path_message); DEFINE_string(d, "CPU", target_device_message); /// @brief Top results number (default 10)
-DEFINE_int32(nt, 10, ntop_message); +DEFINE_uint32(nt, 10, ntop_message); /// @brief Enable per-layer performance report DEFINE_bool(pc, false, performance_counter_message); @@ -96,10 +96,10 @@ DEFINE_string(c, "", custom_cldnn_message); DEFINE_string(l, "", custom_cpu_library_message); /// @brief Iterations count (default 1) -DEFINE_int32(ni, 1, iterations_count_message); +DEFINE_uint32(ni, 1, iterations_count_message); /// @brief Number of infer requests -DEFINE_int32(nireq, 1, ninfer_request_message); +DEFINE_uint32(nireq, 1, ninfer_request_message); /// @brief Enable plugin messages DEFINE_bool(p_msg, false, plugin_message); diff --git a/inference-engine/samples/classification_sample_async/main.cpp b/inference-engine/samples/classification_sample_async/main.cpp index e8428ef..f73f126 100644 --- a/inference-engine/samples/classification_sample_async/main.cpp +++ b/inference-engine/samples/classification_sample_async/main.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -84,7 +85,7 @@ int main(int argc, char *argv[]) { // --------------------------- 1. Load Plugin for inference engine ------------------------------------- slog::info << "Loading plugin" << slog::endl; - InferencePlugin plugin = PluginDispatcher({ FLAGS_pp, "../../../lib/intel64" , "" }).getPluginByDevice(FLAGS_d); + InferencePlugin plugin = PluginDispatcher({ FLAGS_pp }).getPluginByDevice(FLAGS_d); if (FLAGS_p_msg) { static_cast(plugin)->SetLogCallback(error_listener); } @@ -254,7 +255,7 @@ int main(int argc, char *argv[]) { size_t currentInfer = 0; size_t prevInfer = (FLAGS_nireq > 1) ? 1 : 0; - for (int iter = 0; iter < FLAGS_ni + FLAGS_nireq; ++iter) { + for (size_t iter = 0; iter < FLAGS_ni + FLAGS_nireq; ++iter) { if (iter < FLAGS_ni) { inferRequests[currentInfer].StartAsync(); } @@ -280,20 +281,14 @@ int main(int argc, char *argv[]) { for (size_t i = 0; i < FLAGS_nireq; i++) { /** Validating -nt value **/ - const int resultsCnt = outputBlobs[i]->size() / batchSize; + const size_t resultsCnt = outputBlobs[i]->size() / batchSize; if (FLAGS_nt > resultsCnt || FLAGS_nt < 1) { slog::warn << "-nt " << FLAGS_nt << " is not available for this network (-nt should be less than " \ << resultsCnt+1 << " and more than 0)\n will be used maximal value : " << resultsCnt << slog::endl; FLAGS_nt = resultsCnt; } - /** This vector stores id's of top N results **/ - std::vector results; - TopResults(FLAGS_nt, *outputBlobs[i], results); - - std::cout << std::endl << "Top " << FLAGS_nt << " results:" << std::endl << std::endl; /** Read labels from file (e.x. AlexNet.labels) **/ - bool labelsEnabled = false; std::string labelFileName = fileNameNoExt(FLAGS_m) + ".labels"; std::vector labels; @@ -305,26 +300,12 @@ int main(int argc, char *argv[]) { trim(strLine); labels.push_back(strLine); } - labelsEnabled = true; } - /** Print the result iterating over each batch **/ - for (int image_id = 0; image_id < batchSize; ++image_id) { - std::cout << "Image " << imageNames[image_id] << std::endl << std::endl; - for (size_t id = image_id * FLAGS_nt, cnt = 0; cnt < FLAGS_nt; ++cnt, ++id) { - std::cout.precision(7); - /** Getting probability for resulting class **/ - auto result = outputBlobs[i]->buffer(). 
- as::value_type*>()[results[id] + image_id*(outputBlobs[i]->size() / batchSize)]; - std::cout << std::left << std::fixed << results[id] << " " << result; - if (labelsEnabled) { - std::cout << " label " << labels[results[id]] << std::endl; - } else { - std::cout << " label #" << results[id] << std::endl; - } - } - std::cout << std::endl; - } + ClassificationResult classificationResult(outputBlobs[i], imageNames, + batchSize, FLAGS_nt, + labels); + classificationResult.print(); } // ----------------------------------------------------------------------------------------------------- std::cout << std::endl << "total inference time: " << total << std::endl; @@ -335,8 +316,7 @@ int main(int argc, char *argv[]) { std::map performanceMap; if (FLAGS_pc) { for (size_t nireq = 0; nireq < FLAGS_nireq; nireq++) { - performanceMap = inferRequests[nireq].GetPerformanceCounts(); - printPerformanceCounts(performanceMap, std::cout); + printPerformanceCounts(inferRequests[nireq], std::cout); } } } diff --git a/inference-engine/samples/common/format_reader/CMakeLists.txt b/inference-engine/samples/common/format_reader/CMakeLists.txt index 0498e0a..e3ecd58 100644 --- a/inference-engine/samples/common/format_reader/CMakeLists.txt +++ b/inference-engine/samples/common/format_reader/CMakeLists.txt @@ -1,9 +1,7 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required(VERSION 2.8) - set (TARGET_NAME "format_reader") file (GLOB MAIN_SRC @@ -15,7 +13,7 @@ file (GLOB LIBRARY_HEADERS ) # Find OpenCV components if exist -find_package(OpenCV COMPONENTS imgcodecs QUIET) +find_package(OpenCV COMPONENTS imgcodecs videoio imgproc QUIET) if(NOT(OpenCV_FOUND)) message(WARNING "OPENCV is disabled or not found, " ${TARGET_NAME} " is built without OPENCV support") else() @@ -34,13 +32,15 @@ add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_API) source_group("src" FILES ${LIBRARY_SRC}) source_group("include" FILES ${LIBRARY_HEADERS}) -# Properties->C/C++->General->Additional Include Directories -include_directories ( - ${CMAKE_CURRENT_SOURCE_DIR}) # Create library file from sources. 
add_library(${TARGET_NAME} SHARED ${MAIN_SRC} ${LIBRARY_HEADERS}) target_link_libraries(${TARGET_NAME} ${OpenCV_LIBRARIES}) -set_target_properties(${TARGET_NAME} PROPERTIES "CMAKE_CXX_FLAGS" "${CMAKE_CXX_FLAGS} -fPIE" -COMPILE_PDB_NAME ${TARGET_NAME}) +if(CMAKE_VERSION VERSION_LESS "2.8.11") + include_directories (${CMAKE_CURRENT_SOURCE_DIR}) +else() + target_include_directories(${TARGET_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +endif() + +set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}) diff --git a/inference-engine/samples/common/format_reader/MnistUbyte.cpp b/inference-engine/samples/common/format_reader/MnistUbyte.cpp index c1b04c0..6e46f0e 100644 --- a/inference-engine/samples/common/format_reader/MnistUbyte.cpp +++ b/inference-engine/samples/common/format_reader/MnistUbyte.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/samples/common/format_reader/MnistUbyte.h b/inference-engine/samples/common/format_reader/MnistUbyte.h index d9d51c4..fd6ae0f 100644 --- a/inference-engine/samples/common/format_reader/MnistUbyte.h +++ b/inference-engine/samples/common/format_reader/MnistUbyte.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -47,7 +47,7 @@ public: delete this; } - std::shared_ptr getData(int width, int height) override { + std::shared_ptr getData(size_t width, size_t height) override { if ((width * height != 0) && (_width * _height != width * height)) { std::cout << "[ WARNING ] Image won't be resized! Please use OpenCV.\n"; return nullptr; diff --git a/inference-engine/samples/common/format_reader/bmp.cpp b/inference-engine/samples/common/format_reader/bmp.cpp index 56822ff..b52f839 100644 --- a/inference-engine/samples/common/format_reader/bmp.cpp +++ b/inference-engine/samples/common/format_reader/bmp.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/samples/common/format_reader/bmp.h b/inference-engine/samples/common/format_reader/bmp.h index 53ca373..b1b05df 100644 --- a/inference-engine/samples/common/format_reader/bmp.h +++ b/inference-engine/samples/common/format_reader/bmp.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -64,7 +64,7 @@ public: delete this; } - std::shared_ptr getData(int width, int height) override { + std::shared_ptr getData(size_t width, size_t height) override { if ((width * height != 0) && (_width * _height != width * height)) { std::cout << "[ WARNING ] Image won't be resized! 
Please use OpenCV.\n"; return nullptr; diff --git a/inference-engine/samples/common/format_reader/format_reader.cpp b/inference-engine/samples/common/format_reader/format_reader.cpp index a698431..30f3345 100644 --- a/inference-engine/samples/common/format_reader/format_reader.cpp +++ b/inference-engine/samples/common/format_reader/format_reader.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/samples/common/format_reader/format_reader.h b/inference-engine/samples/common/format_reader/format_reader.h index 8a4cfcd..d0c7462 100644 --- a/inference-engine/samples/common/format_reader/format_reader.h +++ b/inference-engine/samples/common/format_reader/format_reader.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -11,7 +11,7 @@ #include #include #include -#include +#include #if defined(_WIN32) # ifdef IMPLEMENT_FORMAT_READER @@ -62,7 +62,7 @@ public: * @return shared pointer with input data * @In case of using OpenCV, parameters width and height will be used for image resizing */ - virtual std::shared_ptr getData(int width = 0, int height = 0) = 0; + virtual std::shared_ptr getData(size_t width = 0, size_t height = 0) = 0; /** * \brief Get size diff --git a/inference-engine/samples/common/format_reader/format_reader_ptr.h b/inference-engine/samples/common/format_reader/format_reader_ptr.h index faba463..0b82d46 100644 --- a/inference-engine/samples/common/format_reader/format_reader_ptr.h +++ b/inference-engine/samples/common/format_reader/format_reader_ptr.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/samples/common/format_reader/opencv_wraper.cpp b/inference-engine/samples/common/format_reader/opencv_wraper.cpp index b29b39b..835402a 100644 --- a/inference-engine/samples/common/format_reader/opencv_wraper.cpp +++ b/inference-engine/samples/common/format_reader/opencv_wraper.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -27,11 +27,11 @@ OCVReader::OCVReader(const string &filename) { _height = img.size().height; } -std::shared_ptr OCVReader::getData(int width = 0, int height = 0) { +std::shared_ptr OCVReader::getData(size_t width = 0, size_t height = 0) { cv::Mat resized(img); if (width != 0 && height != 0) { - int iw = img.size().width; - int ih = img.size().height; + size_t iw = img.size().width; + size_t ih = img.size().height; if (width != iw || height != ih) { slog::warn << "Image is resized from (" << iw << ", " << ih << ") to (" << width << ", " << height << ")" << slog::endl; } diff --git a/inference-engine/samples/common/format_reader/opencv_wraper.h b/inference-engine/samples/common/format_reader/opencv_wraper.h index e4b40b8..5dc0b12 100644 --- a/inference-engine/samples/common/format_reader/opencv_wraper.h +++ b/inference-engine/samples/common/format_reader/opencv_wraper.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -50,7 +50,7 @@ public: delete this; } - std::shared_ptr getData(int width, int height) override; + std::shared_ptr getData(size_t width, size_t height) override; }; } // namespace FormatReader #endif \ No 
newline at end of file
diff --git a/inference-engine/samples/common/format_reader/register.h b/inference-engine/samples/common/format_reader/register.h
index 764b5b4..34cf1f7 100644
--- a/inference-engine/samples/common/format_reader/register.h
+++ b/inference-engine/samples/common/format_reader/register.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 /**
diff --git a/inference-engine/samples/common/os/windows/w_dirent.h b/inference-engine/samples/common/os/windows/w_dirent.h
index 40bcf9e..e9111d9 100644
--- a/inference-engine/samples/common/os/windows/w_dirent.h
+++ b/inference-engine/samples/common/os/windows/w_dirent.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -6,6 +6,10 @@
 #if defined(_WIN32)
+#ifndef NOMINMAX
+# define NOMINMAX
+#endif
+
 #include
 #include
 #include
diff --git a/inference-engine/samples/common/samples/args_helper.hpp b/inference-engine/samples/common/samples/args_helper.hpp
index 9edfb97..a38570b 100644
--- a/inference-engine/samples/common/samples/args_helper.hpp
+++ b/inference-engine/samples/common/samples/args_helper.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/samples/common/samples/classification_results.h b/inference-engine/samples/common/samples/classification_results.h
new file mode 100644
index 0000000..3cf0a2b
--- /dev/null
+++ b/inference-engine/samples/common/samples/classification_results.h
@@ -0,0 +1,92 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief a header file with output classification results
+ * @file classification_results.h
+ */
+#include <string>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+
+#include <samples/common.hpp>
+
+/**
+ * @class ClassificationResult
+ * @brief A ClassificationResult creates an output table with results
+ */
+class ClassificationResult {
+private:
+    const std::string _classidStr = "classid";
+    const std::string _probabilityStr = "probability";
+    const std::string _labelStr = "label";
+    size_t _nTop;
+    InferenceEngine::Blob::Ptr _outBlob;
+    const std::vector<std::string> _labels;
+    const std::vector<std::string> _imageNames;
+    const size_t _batchSize;
+
+    void printHeader() {
+        std::cout << _classidStr << " " << _probabilityStr;
+        if (!_labels.empty())
+            std::cout << " " << _labelStr;
+        std::string classidColumn(_classidStr.length(), '-');
+        std::string probabilityColumn(_probabilityStr.length(), '-');
+        std::string labelColumn(_labelStr.length(), '-');
+        std::cout << std::endl << classidColumn << " " << probabilityColumn;
+        if (!_labels.empty())
+            std::cout << " " << labelColumn;
+        std::cout << std::endl;
+    }
+
+public:
+    explicit ClassificationResult(InferenceEngine::Blob::Ptr output_blob,
+                                  std::vector<std::string> image_names = {},
+                                  size_t batch_size = 1,
+                                  size_t num_of_top = 10,
+                                  std::vector<std::string> labels = {}) :
+            _nTop(num_of_top),
+            _outBlob(std::move(output_blob)),
+            _labels(std::move(labels)),
+            _imageNames(std::move(image_names)),
+            _batchSize(batch_size) {
+        if (_imageNames.size() != _batchSize) {
+            throw std::logic_error("Batch size should be equal to the number of images.");
+        }
+    }
+
+    /**
+     * @brief prints formatted classification results
+     */
+    void print() {
+        /** This vector stores id's of top N results **/
+        std::vector<unsigned> results;
+        TopResults(_nTop, *_outBlob, results);
+
+        /** Print the result
iterating over each batch **/ + std::cout << std::endl << "Top " << _nTop << " results:" << std::endl << std::endl; + for (unsigned int image_id = 0; image_id < _batchSize; ++image_id) { + std::cout << "Image " << _imageNames[image_id] << std::endl << std::endl; + printHeader(); + + for (size_t id = image_id * _nTop, cnt = 0; id < (image_id + 1) * _nTop; ++cnt, ++id) { + std::cout.precision(7); + /** Getting probability for resulting class **/ + const auto result = _outBlob->buffer(). + as::value_type*>() + [results[id] + image_id * (_outBlob->size() / _batchSize)]; + + std::cout << std::setw(static_cast(_classidStr.length())) << std::left << results[id] << " "; + std::cout << std::left << std::setw(static_cast(_probabilityStr.length())) << std::fixed << result; + + if (!_labels.empty()) { + std::cout << " " + _labels[results[id]]; + } + std::cout << std::endl; + } + std::cout << std::endl; + } + } +}; diff --git a/inference-engine/samples/common/samples/common.hpp b/inference-engine/samples/common/samples/common.hpp index 88c87e3..44bcca3 100644 --- a/inference-engine/samples/common/samples/common.hpp +++ b/inference-engine/samples/common/samples/common.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -113,7 +113,7 @@ static UNUSED InferenceEngine::InferenceEnginePluginPtr selectPlugin(const std:: * @param filepath - full file name * @return filename without extension */ -static std::string fileNameNoExt(const std::string &filepath) { +static UNUSED std::string fileNameNoExt(const std::string &filepath) { auto pos = filepath.rfind('.'); if (pos == std::string::npos) return filepath; return filepath.substr(0, pos); @@ -640,6 +640,19 @@ inline double getDurationOf(std::function func) { return std::chrono::duration_cast>>(fs).count(); } +static std::vector> +perfCountersSorted(std::map perfMap) { + using perfItem = std::pair; + std::vector sorted; + for (auto &kvp : perfMap) sorted.push_back(kvp); + + std::stable_sort(sorted.begin(), sorted.end(), + [](const perfItem& l, const perfItem& r) { + return l.second.execution_index < r.second.execution_index; + }); + + return sorted; +} static UNUSED void printPerformanceCounts(const std::map& performanceMap, std::ostream &stream, @@ -649,7 +662,10 @@ static UNUSED void printPerformanceCounts(const std::map perfomanceMap; - plugin->GetPerformanceCounts(perfomanceMap, nullptr); - printPerformanceCounts(perfomanceMap, stream); + std::map performanceMap; + plugin->GetPerformanceCounts(performanceMap, nullptr); + printPerformanceCounts(performanceMap, stream); } /** @@ -883,7 +899,7 @@ public: for (auto desObj = desiredObjects.alist.begin(); desObj != desiredObjects.alist.end(); desObj++, j++) { double iou = DetectedObject::ioU(detObj, *desObj); if (iou > overlap_max) { - overlap_max = iou; + overlap_max = static_cast(iou); jmax = j; desmax = desObj; } @@ -964,7 +980,7 @@ public: break; } else { if (max_precs[j] < prec[i]) { - max_precs[j] = prec[i]; + max_precs[j] = static_cast(prec[i]); } } } @@ -1014,10 +1030,10 @@ static UNUSED void addRectangles(unsigned char *data, size_t height, size_t widt for (size_t i = 0; i < detectedObjects.size(); i++) { int cls = detectedObjects[i].objectType % colors.size(); - int xmin = detectedObjects[i].xmin * width; - int xmax = detectedObjects[i].xmax * width; - int ymin = detectedObjects[i].ymin * height; - int ymax = detectedObjects[i].ymax * height; + int xmin = static_cast(detectedObjects[i].xmin * width); + 
int xmax = static_cast(detectedObjects[i].xmax * width); + int ymin = static_cast(detectedObjects[i].ymin * height); + int ymax = static_cast(detectedObjects[i].ymax * height); size_t shift_first = ymin*width * 3; size_t shift_second = ymax*width * 3; diff --git a/inference-engine/samples/validation_app/console_progress.hpp b/inference-engine/samples/common/samples/console_progress.hpp similarity index 95% rename from inference-engine/samples/validation_app/console_progress.hpp rename to inference-engine/samples/common/samples/console_progress.hpp index 35047a4..89b0d74 100644 --- a/inference-engine/samples/validation_app/console_progress.hpp +++ b/inference-engine/samples/common/samples/console_progress.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -69,7 +69,7 @@ public: * @param add - value to add */ void addProgress(int add) { - if (add < 0 && -add > current) { + if (add < 0 && -add > static_cast(current)) { add = -static_cast(current); } updateProgress(current + add); diff --git a/inference-engine/samples/validation_app/csv_dumper.hpp b/inference-engine/samples/common/samples/csv_dumper.hpp similarity index 98% rename from inference-engine/samples/validation_app/csv_dumper.hpp rename to inference-engine/samples/common/samples/csv_dumper.hpp index 2e0b22f..4dbcfa1 100644 --- a/inference-engine/samples/validation_app/csv_dumper.hpp +++ b/inference-engine/samples/common/samples/csv_dumper.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/samples/common/samples/ocv_common.hpp b/inference-engine/samples/common/samples/ocv_common.hpp index c979cd3..9372503 100644 --- a/inference-engine/samples/common/samples/ocv_common.hpp +++ b/inference-engine/samples/common/samples/ocv_common.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -27,7 +27,8 @@ void matU8ToBlob(const cv::Mat& orig_image, InferenceEngine::Blob::Ptr& blob, in T* blob_data = blob->buffer().as(); cv::Mat resized_image(orig_image); - if (width != orig_image.size().width || height!= orig_image.size().height) { + if (static_cast(width) != orig_image.size().width || + static_cast(height) != orig_image.size().height) { cv::resize(orig_image, resized_image, cv::Size(width, height)); } @@ -50,7 +51,7 @@ void matU8ToBlob(const cv::Mat& orig_image, InferenceEngine::Blob::Ptr& blob, in * @param mat - given cv::Mat object with an image data. * @return resulting Blob pointer. 
*/ -static InferenceEngine::Blob::Ptr wrapMat2Blob(const cv::Mat &mat) { +static UNUSED InferenceEngine::Blob::Ptr wrapMat2Blob(const cv::Mat &mat) { size_t channels = mat.channels(); size_t height = mat.size().height; size_t width = mat.size().width; diff --git a/inference-engine/samples/common/samples/slog.hpp b/inference-engine/samples/common/samples/slog.hpp index 23eb8d3..c50b4c9 100644 --- a/inference-engine/samples/common/samples/slog.hpp +++ b/inference-engine/samples/common/samples/slog.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/samples/create_msvc2015_solution.bat b/inference-engine/samples/create_msvc2015_solution.bat deleted file mode 100644 index b0f67c8..0000000 --- a/inference-engine/samples/create_msvc2015_solution.bat +++ /dev/null @@ -1,31 +0,0 @@ -@echo off - -:: Copyright (c) 2018 Intel Corporation -:: -:: Licensed under the Apache License, Version 2.0 (the "License"); -:: you may not use this file except in compliance with the License. -:: You may obtain a copy of the License at -:: -:: http://www.apache.org/licenses/LICENSE-2.0 -:: -:: Unless required by applicable law or agreed to in writing, software -:: distributed under the License is distributed on an "AS IS" BASIS, -:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -:: See the License for the specific language governing permissions and -:: limitations under the License. - - -@setlocal -set "ROOT_DIR=%~dp0" - -set "SOLUTION_DIR64=%USERPROFILE%\Documents\Intel\OpenVINO\inference_engine_samples_2015" -if exist "%SOLUTION_DIR64%" rd /s /q "%SOLUTION_DIR64%" -if "%InferenceEngine_DIR%"=="" set "InferenceEngine_DIR=%ROOT_DIR%\..\share" -if exist "%ROOT_DIR%\..\..\bin\setupvars.bat" call "%ROOT_DIR%\..\..\bin\setupvars.bat" -if exist "%ROOT_DIR%\..\..\..\bin\setupvars.bat" call "%ROOT_DIR%\..\..\..\bin\setupvars.bat" - -echo Creating Visual Studio 2015 (x64) files in %SOLUTION_DIR64%... && ^ -cd "%ROOT_DIR%" && cmake -E make_directory "%SOLUTION_DIR64%" && cd "%SOLUTION_DIR64%" && cmake -G "Visual Studio 14 2015 Win64" "%ROOT_DIR%" - -echo Done. -pause \ No newline at end of file diff --git a/inference-engine/samples/create_msvc2017_solution.bat b/inference-engine/samples/create_msvc2017_solution.bat deleted file mode 100644 index 6bc3521..0000000 --- a/inference-engine/samples/create_msvc2017_solution.bat +++ /dev/null @@ -1,31 +0,0 @@ -@echo off - -:: Copyright (c) 2018 Intel Corporation -:: -:: Licensed under the Apache License, Version 2.0 (the "License"); -:: you may not use this file except in compliance with the License. -:: You may obtain a copy of the License at -:: -:: http://www.apache.org/licenses/LICENSE-2.0 -:: -:: Unless required by applicable law or agreed to in writing, software -:: distributed under the License is distributed on an "AS IS" BASIS, -:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -:: See the License for the specific language governing permissions and -:: limitations under the License. 
-
-
-@setlocal
-set "ROOT_DIR=%~dp0"
-
-set "SOLUTION_DIR64=%USERPROFILE%\Documents\Intel\OpenVINO\inference_engine_samples_2017"
-if exist "%SOLUTION_DIR64%" rd /s /q "%SOLUTION_DIR64%"
-if "%InferenceEngine_DIR%"=="" set "InferenceEngine_DIR=%ROOT_DIR%\..\share"
-if exist "%ROOT_DIR%\..\..\bin\setupvars.bat" call "%ROOT_DIR%\..\..\bin\setupvars.bat"
-if exist "%ROOT_DIR%\..\..\..\bin\setupvars.bat" call "%ROOT_DIR%\..\..\..\bin\setupvars.bat"
-
-echo Creating Visual Studio 2017 (x64) files in %SOLUTION_DIR64%... && ^
-cd "%ROOT_DIR%" && cmake -E make_directory "%SOLUTION_DIR64%" && cd "%SOLUTION_DIR64%" && cmake -G "Visual Studio 15 2017 Win64" "%ROOT_DIR%"
-
-echo Done.
-pause
\ No newline at end of file
diff --git a/inference-engine/samples/hello_autoresize_classification/CMakeLists.txt b/inference-engine/samples/hello_autoresize_classification/CMakeLists.txt
index d70a974..01deda6 100644
--- a/inference-engine/samples/hello_autoresize_classification/CMakeLists.txt
+++ b/inference-engine/samples/hello_autoresize_classification/CMakeLists.txt
@@ -1,9 +1,7 @@
-# Copyright (C) 2018 Intel Corporation
+# Copyright (C) 2018-2019 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
-cmake_minimum_required(VERSION 2.8)
-
 set (TARGET_NAME "hello_autoresize_classification")
 file (GLOB SRC
diff --git a/inference-engine/samples/hello_autoresize_classification/README.md b/inference-engine/samples/hello_autoresize_classification/README.md
index 524ec22..bb479b7 100644
--- a/inference-engine/samples/hello_autoresize_classification/README.md
+++ b/inference-engine/samples/hello_autoresize_classification/README.md
@@ -1,28 +1,33 @@
-# Hello Autoresize Classification Sample
+# Hello Autoresize Classification C++ Sample
 This topic describes how to run the Hello Autoresize Classification sample application.
-The sample is simplified version of [Image Classification Sample](./samples/classification_sample/README.md).
-It's intended to demonstrate using of new input autoresize API of Inference Engine in applications. Refer to
-[Integrate with customer application New Request API](./docs/IE_DG/Integrate_with_customer_application_new_API.md) for details.
+The sample is a simplified version of [Image Classification Sample](./inference-engine/samples/classification_sample/README.md).
+It demonstrates how to use the new input autoresize API of Inference Engine in applications. Refer to
+[Integrate the Inference Engine New Request API with Your Application](./docs/IE_DG/Integrate_with_customer_application_new_API.md) for details.
 There is also new API introduced to crop a ROI object and set it as input without additional memory re-allocation.
-To properly demonstrate this new API it's required to run several networks in pipeline which is out of scope of this sample.
-Please refer to [Object Detection for SSD Demo app](./samples/object_detection_demo_ssd_async/README.md) or
-[Security Barrier Camera Demo](./samples/security_barrier_camera_demo/README.md) or
-[Crossroad Camera Demo](./samples/crossroad_camera_demo/README.md) with an example of using of new crop ROI API.
+To properly demonstrate this new API, it is required to run several networks in a pipeline, which is out of scope of this sample.
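Since a full ROI pipeline is out of scope for the sample, a minimal sketch of the idea follows (not from the patch; the coordinates and the 2019 R1 `make_shared_blob` ROI overload are assumptions): the crop API wraps a region of an existing blob as a new blob that shares the same memory, so no copy is made.

```cpp
#include <inference_engine.hpp>

// Wrap a sub-region of an existing blob (e.g. a detected face) as a new blob.
InferenceEngine::Blob::Ptr cropToRoi(const InferenceEngine::Blob::Ptr &original) {
    // {id, posX, posY, sizeX, sizeY} of the region of interest
    InferenceEngine::ROI roi = {0, 32, 64, 227, 227};
    // shares memory with `original`; no reallocation or copying happens
    return InferenceEngine::make_shared_blob(original, roi);
}
```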
+Please refer to [Object Detection for SSD Demo](./inference-engine/samples/object_detection_demo_ssd_async/README.md),
+[Security Barrier Camera Demo](./inference-engine/samples/security_barrier_camera_demo/README.md), or
+[Crossroad Camera Demo](./inference-engine/samples/crossroad_camera_demo/README.md) for an example of using the new crop ROI API.
+
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
 ## Running
-You can do inference on an image using a trained AlexNet network on Intel® Processors using the following command:
+To run the sample, you can use public or pre-trained models. To download the pre-trained models, use the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) or go to [https://download.01.org/opencv/](https://download.01.org/opencv/).
+
+> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
+
+You can do inference on an image using a trained AlexNet network on CPU using the following command:
 ```sh
 ./hello_autoresize_classification /alexnet_fp32.xml /cat.bmp CPU
 ```
-> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
-
-### Outputs
+## Sample Output
-The application outputs top-10 inference results.
+The application outputs top-10 inference results.
-## See Also
+## See Also
 * [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
+* [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader)
diff --git a/inference-engine/samples/hello_autoresize_classification/main.cpp b/inference-engine/samples/hello_autoresize_classification/main.cpp
index 2ac9337..9700416 100644
--- a/inference-engine/samples/hello_autoresize_classification/main.cpp
+++ b/inference-engine/samples/hello_autoresize_classification/main.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -10,6 +10,7 @@
 #include
 #include
+#include
 using namespace InferenceEngine;
@@ -28,11 +29,11 @@ int main(int argc, char *argv[]) {
 // -----------------------------------------------------------------------------------------------------
 // --------------------------- 1. Load Plugin for inference engine -------------------------------------
-    InferencePlugin plugin = PluginDispatcher({"../../../lib/intel64", ""}).getPluginByDevice(device_name);
+    InferencePlugin plugin = PluginDispatcher().getPluginByDevice(device_name);
 // -----------------------------------------------------------------------------------------------------
 // --------------------------- 2. 
Read IR Generated by ModelOptimizer (.xml and .bin files) ------------ - int batchSize = 1; + size_t batchSize = 1; CNNNetReader network_reader; network_reader.ReadNetwork(input_model); network_reader.ReadWeights(input_model.substr(0, input_model.size() - 4) + ".bin"); @@ -90,18 +91,9 @@ int main(int argc, char *argv[]) { // --------------------------- 8. Process output ------------------------------------------------------ Blob::Ptr output = infer_request.GetBlob(output_name); - auto output_data = output->buffer().as::value_type*>(); - - std::vector results; - /* This is to sort output probabilities and put it to results vector */ - TopResults(10, *output, results); - - std::cout << std::endl << "Top 10 results:" << std::endl << std::endl; - for (size_t id = 0; id < 10; ++id) { - std::cout.precision(7); - auto result = output_data[results[id]]; - std::cout << std::left << std::fixed << result << " label #" << results[id] << std::endl; - } + // Print classification results + ClassificationResult classificationResult(output, {input_image_path}); + classificationResult.print(); // ----------------------------------------------------------------------------------------------------- std::cout << std::endl << "total inference time: " << total << std::endl; diff --git a/inference-engine/samples/hello_classification/CMakeLists.txt b/inference-engine/samples/hello_classification/CMakeLists.txt index 9531a21..845f7e9 100644 --- a/inference-engine/samples/hello_classification/CMakeLists.txt +++ b/inference-engine/samples/hello_classification/CMakeLists.txt @@ -1,9 +1,7 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required(VERSION 2.8) - set (TARGET_NAME "hello_classification") file (GLOB SRC diff --git a/inference-engine/samples/hello_classification/main.cpp b/inference-engine/samples/hello_classification/main.cpp index d9482e1..b3b5158 100644 --- a/inference-engine/samples/hello_classification/main.cpp +++ b/inference-engine/samples/hello_classification/main.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -14,6 +14,7 @@ #include #include +#include using namespace InferenceEngine; @@ -41,8 +42,7 @@ int wmain(int argc, wchar_t *argv[]) { // ----------------------------------------------------------------------------------------------------- // --------------------------- 1. Load Plugin for inference engine ------------------------------------- - PluginDispatcher dispatcher({_T("../../../lib/intel64"), _T("")}); - InferencePlugin plugin(dispatcher.getSuitablePlugin(TargetDevice::eCPU)); + InferencePlugin plugin(PluginDispatcher().getSuitablePlugin(TargetDevice::eCPU)); // ----------------------------------------------------------------------------------------------------- // --------------------------- 2. Read IR Generated by ModelOptimizer (.xml and .bin files) ------------ @@ -103,18 +103,10 @@ int wmain(int argc, wchar_t *argv[]) { // --------------------------- 8. 
Process output ------------------------------------------------------
     Blob::Ptr output = infer_request.GetBlob(output_name);
-        auto output_data = output->buffer().as::value_type*>();
+    // Print classification results
+    ClassificationResult classificationResult(output, {fileNameToString(input_image_path)});
+    classificationResult.print();
-
-        std::vector results;
-        /* This is to sort output probabilities and put it to results vector */
-        TopResults(10, *output, results);
-
-        std::cout << std::endl << "Top 10 results:" << std::endl << std::endl;
-        for (size_t id = 0; id < 10; ++id) {
-            std::cout.precision(7);
-            auto result = output_data[results[id]];
-            std::cout << std::left << std::fixed << result << " label #" << results[id] << std::endl;
-        }
 // -----------------------------------------------------------------------------------------------------
     } catch (const std::exception & ex) {
         std::cerr << ex.what() << std::endl;
diff --git a/inference-engine/samples/hello_request_classification/CMakeLists.txt b/inference-engine/samples/hello_request_classification/CMakeLists.txt
index 8818453..c7dbb1e 100644
--- a/inference-engine/samples/hello_request_classification/CMakeLists.txt
+++ b/inference-engine/samples/hello_request_classification/CMakeLists.txt
@@ -1,9 +1,7 @@
-# Copyright (C) 2018 Intel Corporation
+# Copyright (C) 2018-2019 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
-cmake_minimum_required(VERSION 2.8)
-
 set (TARGET_NAME "hello_request_classification")
 file (GLOB SRC
diff --git a/inference-engine/samples/hello_request_classification/README.md b/inference-engine/samples/hello_request_classification/README.md
index 708fa81..fd8d35b 100644
--- a/inference-engine/samples/hello_request_classification/README.md
+++ b/inference-engine/samples/hello_request_classification/README.md
@@ -1,23 +1,26 @@
-# Hello Infer Request Classification Sample
+# Hello Infer Request Classification C++ Sample
 This topic describes how to run the Hello Infer Classification sample application.
-The sample is simplified version of [Image Classification Sample](./samples/classification_sample/README.md).
-It's intended to demonstrate using of new Infer Request API of Inference Engine in applications. Refer to
-[Integrate with customer application New Request API](./docs/IE_DG/Integrate_with_customer_application_new_API.md) for details.
+The sample is a simplified version of [Image Classification Sample](./inference-engine/samples/classification_sample/README.md).
+It demonstrates how to use the new Infer Request API of Inference Engine in applications. Refer to
+[Integrate the Inference Engine New Request API with Your Application](./docs/IE_DG/Integrate_with_customer_application_new_API.md) for details.
+
+> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
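To make the flow concrete, here is a condensed sketch (not from the patch) of the Infer Request API the sample demonstrates, written against the 2019 R1 C++ API; the model path and device name are placeholders.

```cpp
#include <string>
#include <inference_engine.hpp>

void runOnce(const std::string &model_xml, const std::string &device) {
    using namespace InferenceEngine;
    InferencePlugin plugin = PluginDispatcher().getPluginByDevice(device);

    CNNNetReader reader;                        // read the IR
    reader.ReadNetwork(model_xml);
    reader.ReadWeights(model_xml.substr(0, model_xml.size() - 4) + ".bin");
    CNNNetwork network = reader.getNetwork();

    ExecutableNetwork executable = plugin.LoadNetwork(network, {});
    InferRequest request = executable.CreateInferRequest();

    std::string input_name = network.getInputsInfo().begin()->first;
    Blob::Ptr input = request.GetBlob(input_name);    // fill with image data here

    request.Infer();                                  // synchronous inference

    std::string output_name = network.getOutputsInfo().begin()->first;
    Blob::Ptr output = request.GetBlob(output_name);  // read results from here
}
```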
 ## Running
-You can do inference on an image using a trained AlexNet network on Intel® Processors using the following command:
-```sh
-./hello_autoresize_classification /alexnet_fp32.xml /cat.bmp CPU
-```
+To run the sample, you can use public or pre-trained models. To download the pre-trained models, use the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) or go to [https://download.01.org/opencv/](https://download.01.org/opencv/).
 > **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
-### Outputs
+You can do inference on an image using a trained AlexNet network on CPU using the following command:
+```sh
+./hello_request_classification /alexnet_fp32.xml /cat.bmp CPU
+```
-The application outputs top-10 inference results.
+## Sample Output
+The application outputs top-10 inference results.
-## See Also
+## See Also
 * [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
diff --git a/inference-engine/samples/hello_request_classification/main.cpp b/inference-engine/samples/hello_request_classification/main.cpp
index d5fabb2..e03142b 100644
--- a/inference-engine/samples/hello_request_classification/main.cpp
+++ b/inference-engine/samples/hello_request_classification/main.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -10,6 +10,7 @@
 #include
 #include
+#include
 using namespace InferenceEngine;
@@ -28,7 +29,7 @@ int main(int argc, char *argv[]) {
 // -----------------------------------------------------------------------------------------------------
 // --------------------------- 1. Load Plugin for inference engine -------------------------------------
-    InferencePlugin plugin = PluginDispatcher({"../../../lib/intel64", ""}).getPluginByDevice(device_name);
+    InferencePlugin plugin = PluginDispatcher().getPluginByDevice(device_name);
 // -----------------------------------------------------------------------------------------------------
 // --------------------------- 2. Read IR Generated by ModelOptimizer (.xml and .bin files) ------------
@@ -123,18 +124,10 @@
 // --------------------------- 8. 
Process output ------------------------------------------------------- for (auto &item : output_info) { auto output_name = item.first; - Blob::Ptr output = async_infer_request.GetBlob(output_name); - auto output_buffer = output->buffer().as<PrecisionTrait<Precision::FP32>::value_type *>(); - std::vector<unsigned> results; - /** This is to sort output probabilities and put it to results vector **/ - TopResults(10, *output, results); - - std::cout << std::endl << "Top 10 results:" << std::endl << std::endl; - for (size_t id = 0; id < 10; ++id) { - std::cout.precision(7); - auto result = output_buffer[results[id]]; - std::cout << std::left << std::fixed << result << " label #" << results[id] << std::endl; - } + Blob::Ptr output = async_infer_request.GetBlob(output_name); + // Print classification results + ClassificationResult classificationResult(output, {input_image_path}); + classificationResult.print(); } // ----------------------------------------------------------------------------------------------------- } catch (const std::exception & ex) { diff --git a/inference-engine/samples/hello_shape_infer_ssd/CMakeLists.txt b/inference-engine/samples/hello_shape_infer_ssd/CMakeLists.txt index ffc9856..b0ef62b 100644 --- a/inference-engine/samples/hello_shape_infer_ssd/CMakeLists.txt +++ b/inference-engine/samples/hello_shape_infer_ssd/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 2.8) set(TARGET_NAME "hello_shape_infer_ssd") diff --git a/inference-engine/samples/hello_shape_infer_ssd/README.md b/inference-engine/samples/hello_shape_infer_ssd/README.md index f275abc..0f3846e 100644 --- a/inference-engine/samples/hello_shape_infer_ssd/README.md +++ b/inference-engine/samples/hello_shape_infer_ssd/README.md @@ -1,18 +1,22 @@ -# Hello Shape Infer Sample +# Hello Shape Infer C++ Sample This topic demonstrates how to run the Hello Shape Infer SSD application, which does inference using object detection networks like SSD-VGG. The sample shows how to use [Shape Inference feature](./docs/IE_DG/ShapeInference.md). +> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md). + ## Running -You can use the following command to do inference on Intel® Processors on an image using a trained SSD network: +To run the sample, you can use public or pre-trained models. To download the pre-trained models, use the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) or go to [https://download.01.org/opencv/](https://download.01.org/opencv/).
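The resize step at the heart of the Shape Inference feature can be sketched as follows; this mirrors the `getInputShapes`/`reshape` pattern of the 2019 R1 `CNNNetwork` API, with illustrative names, and assumes a single NCHW input:

```cpp
#include <inference_engine.hpp>
#include <string>
#include <tuple>

using namespace InferenceEngine;

// Hypothetical sketch: change batch and spatial dimensions, then let
// shape inference propagate the new shapes through the whole network.
void resizeNetwork(CNNNetwork& network, size_t batch, size_t height, size_t width) {
    ICNNNetwork::InputShapes shapes = network.getInputShapes();

    std::string input_name;
    SizeVector shape;
    std::tie(input_name, shape) = *shapes.begin();  // single-input network assumed
    shape[0] = batch;   // N
    shape[2] = height;  // H
    shape[3] = width;   // W
    shapes[input_name] = shape;

    network.reshape(shapes);  // runs shape inference over the graph
}
```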
+ +> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). + +You can use the following command to do inference on CPU on an image using a trained SSD network: ```sh ./hello_shape_infer_ssd /ssd_300.xml /500x500.bmp CPU 3 ``` -> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). - -### Outputs +## Sample Output The application renders an image with detected objects enclosed in rectangles. It outputs the list of classes of the detected objects along with the respective confidence values and the coordinates of the @@ -20,3 +24,5 @@ rectangles to the standard output stream. ## See Also * [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md) +* [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) +* [Model Optimizer](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md) diff --git a/inference-engine/samples/hello_shape_infer_ssd/main.cpp b/inference-engine/samples/hello_shape_infer_ssd/main.cpp index 020b941..ee691e5 100644 --- a/inference-engine/samples/hello_shape_infer_ssd/main.cpp +++ b/inference-engine/samples/hello_shape_infer_ssd/main.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -29,7 +29,7 @@ int main(int argc, char* argv[]) { // ----------------------------------------------------------------------------------------------------- // --------------------------- 1. Load Plugin for inference engine ------------------------------------- - InferencePlugin plugin = PluginDispatcher({"../../../lib/intel64", ""}).getPluginByDevice(device_name); + InferencePlugin plugin = PluginDispatcher().getPluginByDevice(device_name); IExtensionPtr cpuExtension, inPlaceExtension; if (device_name == "CPU") { cpuExtension = std::make_shared(); @@ -53,7 +53,6 @@ int main(int argc, char* argv[]) { // --------------------------- Resize network to match image sizes and given batch---------------------- if (device_name == "CPU") { - // register shape inference functions (SpatialTransformer) from CPU Extension network.AddExtension(cpuExtension); // register sample's custom shape inference (CustomReLU) network.AddExtension(inPlaceExtension); @@ -121,7 +120,7 @@ int main(int argc, char* argv[]) { // --------------------------- 6. 
Prepare input -------------------------------------------------------- Blob::Ptr input = infer_request.GetBlob(input_name); - for (int b = 0; b < batch_size; b++) { + for (size_t b = 0; b < batch_size; b++) { matU8ToBlob(image, input, b); } // ----------------------------------------------------------------------------------------------------- diff --git a/inference-engine/samples/hello_shape_infer_ssd/shape_infer_extension.hpp b/inference-engine/samples/hello_shape_infer_ssd/shape_infer_extension.hpp index 110fa65..e70afd0 100644 --- a/inference-engine/samples/hello_shape_infer_ssd/shape_infer_extension.hpp +++ b/inference-engine/samples/hello_shape_infer_ssd/shape_infer_extension.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -78,7 +78,7 @@ private: class CustomReLUResizeImpl : public InferenceEngine::IShapeInferImpl { public: - InferenceEngine::StatusCode inferShapes(const std::vector& inShapes, + InferenceEngine::StatusCode inferShapes(const std::vector& inBlobs, const std::map& params, const std::map& blobs, std::vector& outShapes, @@ -89,7 +89,9 @@ public: " shape inference for the first time (next messages won't be printed)" << std::endl; wasCalled = true; } - outShapes = inShapes; + for (const auto& blob : inBlobs) { + outShapes.push_back(blob->getTensorDesc().getDims()); + } return InferenceEngine::StatusCode::OK; } }; diff --git a/inference-engine/samples/lenet_network_graph_builder/CMakeLists.txt b/inference-engine/samples/lenet_network_graph_builder/CMakeLists.txt index aab4788..f8960bd 100644 --- a/inference-engine/samples/lenet_network_graph_builder/CMakeLists.txt +++ b/inference-engine/samples/lenet_network_graph_builder/CMakeLists.txt @@ -1,9 +1,7 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required(VERSION 2.8) - set (TARGET_NAME "lenet_network_graph_builder") file (GLOB MAIN_SRC @@ -34,4 +32,4 @@ target_link_libraries(${TARGET_NAME} ${InferenceEngine_LIBRARIES} gflags format_ if(UNIX) target_link_libraries( ${TARGET_NAME} ${LIB_DL} pthread) -endif() \ No newline at end of file +endif() diff --git a/inference-engine/samples/lenet_network_graph_builder/README.md b/inference-engine/samples/lenet_network_graph_builder/README.md index d7fdfb7..6ba3d1b 100644 --- a/inference-engine/samples/lenet_network_graph_builder/README.md +++ b/inference-engine/samples/lenet_network_graph_builder/README.md @@ -1,12 +1,23 @@ -# Lenet Number Classifications Network using Graph Builder API +# LeNet Number Classifications Network Using Graph Builder API This sample demonstrates how to execute inference using Inference Engine Graph Builder API to build a network on example of the LeNet classifications network. -XML file is not required for network building now. Inference Engine Graph Builder API allows building of a network "on the fly" from source code. The sample uses 1-channel ubyte pictures as input. -
+ +With the Inference Engine Graph Builder API, an XML file is no longer required to build a network: the network can be built "on the fly" from source code. The sample uses one-channel `ubyte` pictures as input. + +## How It Works + +Upon the start-up, the sample reads command line parameters and builds a network using the Graph Builder API and the passed weights file. +Then, the application loads the built network and an image to the Inference Engine plugin. + +When inference is done, the application outputs inference results to the standard output stream. + +> **NOTE**: This sample is implemented to support models with FP32 weights only. + +> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md). ## Running -Running the application with the -h option yields the following usage message: +Running the application with the `-h` option yields the following usage message: ```sh ./lenet_network_graph_builder -h InferenceEngine: API version ............ Build .................. Options: -h Print a usage message. -m "" Path to a .bin file with weights for trained model -i "" Required. Path to image or folder with images - -d "" Specify the target device to infer on this. Sample will look for a suitable plugin for device specified(default value is CPU) + -d "" Specify the target device to infer on this. Sample will look for a suitable plugin for device specified. Default value is CPU -pp "" Path to a plugin folder -pc Enables per-layer performance report - -nt "" Number of top results (default 10) - -ni "" Number of iterations (default 1) + -nt "" Number of top results. Default value is 10 + -ni "" Number of iterations. Default value is 1 ``` @@ -34,21 +45,10 @@ For example, to do inference of an ubyte image on a GPU run the following comman ```sh ./lenet_network_graph_builder -i -m -d GPU ``` -### Outputs +## Sample Output By default the application outputs top-10 inference results for each infer request. In addition to this information it will provide throughput value measured in frames per seconds. -### How it works - -Upon the start-up the sample application reads command line parameters and loads a network and an image to the Inference -Engine plugin. When inference is done, the application creates an -output image and outputs data to the standard output stream. - -Upon the start-up the sample reads command line parameters and builds a network using Graph Builder API and passed weights file. -Then, the application loads built network and an image to the Inference Engine plugin. - -When inference is done, the application outputs inference results to the standard output stream.
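For reference, the layer-building pattern of the sample's updated `main.cpp` (shown in the diff below) can be condensed as in this sketch; in this patch, weights and biases are attached through explicit `ConstLayer` inputs rather than `setWeights`/`setBiases`. Blob creation is omitted and `buildToyNet` is an illustrative name:

```cpp
#include <inference_engine.hpp>
#include <ie_builders.hpp>

using namespace InferenceEngine;

// Condensed sketch of the Graph Builder flow used by the sample.
CNNNetwork buildToyNet(const Blob::Ptr& ptrWeights, const Blob::Ptr& ptrBiases) {
    Builder::Network builder("LeNet");
    idx_t in   = builder.addLayer(Builder::InputLayer("data").setPort(Port({1, 1, 28, 28})));
    idx_t wgt  = builder.addLayer(Builder::ConstLayer("weights").setData(ptrWeights));
    idx_t bias = builder.addLayer(Builder::ConstLayer("biases").setData(ptrBiases));
    // The convolution consumes the data, weights, and biases ports explicitly.
    idx_t conv = builder.addLayer({{in}, {wgt}, {bias}}, Builder::ConvolutionLayer("conv1")
            .setKernel({5, 5}).setStrides({1, 1}).setOutDepth(20)
            .setPaddingsBegin({0, 0}).setPaddingsEnd({0, 0}));
    builder.addLayer({PortInfo(conv)}, Builder::OutputLayer("out"));
    return CNNNetwork{Builder::convertToICNNNetwork(builder.build())};
}
```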
- ## See Also * [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md) diff --git a/inference-engine/samples/lenet_network_graph_builder/lenet_network_graph_builder.hpp b/inference-engine/samples/lenet_network_graph_builder/lenet_network_graph_builder.hpp index 7cb59e2..47c6277 100644 --- a/inference-engine/samples/lenet_network_graph_builder/lenet_network_graph_builder.hpp +++ b/inference-engine/samples/lenet_network_graph_builder/lenet_network_graph_builder.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -15,8 +15,6 @@ #include #endif -#define DEFAULT_PATH_P "./lib" - /// @brief message for help argument static const char help_message[] = "Print a usage message"; @@ -28,8 +26,8 @@ static const char model_message[] = "Path to an .bin file with weights for train /// @brief message for assigning cnn calculation to device static const char target_device_message[] = "Specify the target device to infer on this. " \ - "Sample will look for a suitable plugin for device specified" \ - "(default value is CPU)"; + "Sample will look for a suitable plugin for device specified. " \ + "Default value is CPU"; /// @brief message for plugin_path argument static const char plugin_path_message[] = "Path to a plugin folder"; @@ -38,10 +36,10 @@ static const char plugin_path_message[] = "Path to a plugin folder"; static const char performance_counter_message[] = "Enables per-layer performance report"; /// @brief message for top results number -static const char ntop_message[] = "Number of top results (default 10)"; +static const char ntop_message[] = "Number of top results. Default value is 10"; /// @brief message for iterations count -static const char iterations_count_message[] = "Number of iterations (default 1)"; +static const char iterations_count_message[] = "Number of iterations. Default value is 1"; /// \brief Define flag for showing help message
DEFINE_bool(h, false, help_message); @@ -65,10 +63,10 @@ DEFINE_string(pp, "", plugin_path_message); DEFINE_bool(pc, false, performance_counter_message); /// @brief Top results number (default 10)
-DEFINE_int32(nt, 10, ntop_message); +DEFINE_uint32(nt, 10, ntop_message); /// @brief Iterations count (default 1) -DEFINE_int32(ni, 1, iterations_count_message); +DEFINE_uint32(ni, 1, iterations_count_message); /** * \brief This function show a help message @@ -87,4 +85,3 @@ static void showUsage() { std::cout << " -nt \"\" " << ntop_message << std::endl; std::cout << " -ni \"\" " << iterations_count_message << std::endl; } - diff --git a/inference-engine/samples/lenet_network_graph_builder/main.cpp b/inference-engine/samples/lenet_network_graph_builder/main.cpp index cd9031a..ab63bab 100644 --- a/inference-engine/samples/lenet_network_graph_builder/main.cpp +++ b/inference-engine/samples/lenet_network_graph_builder/main.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -95,7 +96,7 @@ int main(int argc, char *argv[]) { // --------------------------- 1. Load Plugin for inference engine ------------------------------------- slog::info << "Loading plugin" << slog::endl; - InferencePlugin plugin = PluginDispatcher({FLAGS_pp, "../../../lib/intel64", ""}).getPluginByDevice(FLAGS_d); + InferencePlugin plugin = PluginDispatcher({ FLAGS_pp }).getPluginByDevice(FLAGS_d); printPluginVersion(plugin, std::cout); /** Per layer metrics **/ @@ -108,14 +109,16 @@ int main(int argc, char *argv[]) { TBlob::CPtr weightsPtr = ReadWeights(FLAGS_m); Builder::Network builder("LeNet"); - size_t layerId = builder.addLayer(Builder::InputLayer("data").setPort(Port({1, 1, 28, 28}))); + idx_t layerId = builder.addLayer(Builder::InputLayer("data").setPort(Port({1, 1, 28, 28}))); auto ptrWeights = make_shared_blob(TensorDesc(Precision::FP32, {500}, Layout::C), weightsPtr->cbuffer().as()); auto ptrBiases = make_shared_blob(TensorDesc(Precision::FP32, {20}, Layout::C), weightsPtr->cbuffer().as() + 500); - layerId = builder.addLayer({{layerId}}, Builder::ConvolutionLayer("conv1").setKernel({5, 5}).setDilation({1, 1}) - .setGroup(1).setStrides({1, 1}).setOutDepth(20).setPaddingsBegin({0, 0}).setPaddingsEnd({0, 0}) - .setWeights(ptrWeights).setBiases(ptrBiases)); + idx_t weightsId = builder.addLayer(Builder::ConstLayer("weights").setData(ptrWeights)); + idx_t biasesId = builder.addLayer(Builder::ConstLayer("biases").setData(ptrBiases)); + layerId = builder.addLayer({{layerId}, {weightsId}, {biasesId}}, Builder::ConvolutionLayer("conv1") + .setKernel({5, 5}).setDilation({1, 1}).setGroup(1).setStrides({1, 1}).setOutDepth(20) + .setPaddingsBegin({0, 0}).setPaddingsEnd({0, 0})); layerId = builder.addLayer({{layerId}}, Builder::PoolingLayer("pool1").setExcludePad(true).setKernel({2, 2}) .setPaddingsBegin({0, 0}).setPaddingsEnd({0, 0}) .setPoolingType(Builder::PoolingLayer::PoolingType::MAX) @@ -124,9 +127,11 @@ int main(int argc, char *argv[]) { weightsPtr->cbuffer().as() + 520); ptrBiases = make_shared_blob(TensorDesc(Precision::FP32, {50}, Layout::C), weightsPtr->cbuffer().as() + 25520); - layerId = builder.addLayer({{layerId}}, Builder::ConvolutionLayer("conv2").setDilation({1, 1}).setGroup(1) - .setKernel({5, 5}).setOutDepth(50).setPaddingsBegin({0, 0}).setPaddingsEnd({0, 0}) - .setStrides({1, 1}).setWeights(ptrWeights).setBiases(ptrBiases)); + weightsId = builder.addLayer(Builder::ConstLayer("weights").setData(ptrWeights)); + biasesId = builder.addLayer(Builder::ConstLayer("biases").setData(ptrBiases)); + layerId = builder.addLayer({{layerId}, 
{weightsId}, {biasesId}}, Builder::ConvolutionLayer("conv2") + .setDilation({1, 1}).setGroup(1).setKernel({5, 5}).setOutDepth(50).setPaddingsBegin({0, 0}) + .setPaddingsEnd({0, 0}).setStrides({1, 1})); layerId = builder.addLayer({{layerId}}, Builder::PoolingLayer("pool2").setExcludePad(true).setKernel({2, 2}) .setPaddingsBegin({0, 0}).setPaddingsEnd({0, 0}).setPoolingType(Builder::PoolingLayer::PoolingType::MAX) .setRoundingType(Builder::PoolingLayer::RoundingType::CEIL).setStrides({2, 2})); @@ -134,17 +139,21 @@ int main(int argc, char *argv[]) { weightsPtr->cbuffer().as() + 102280 / 4); ptrBiases = make_shared_blob(TensorDesc(Precision::FP32, {500}, Layout::C), weightsPtr->cbuffer().as() + 1702280 / 4); - layerId = builder.addLayer({{layerId}}, Builder::FullyConnectedLayer("ip1").setOutputNum(500) - .setWeights(ptrWeights).setBiases(ptrBiases)); + weightsId = builder.addLayer(Builder::ConstLayer("weights").setData(ptrWeights)); + biasesId = builder.addLayer(Builder::ConstLayer("biases").setData(ptrBiases)); + layerId = builder.addLayer({{layerId}, {weightsId}, {biasesId}}, Builder::FullyConnectedLayer("ip1") + .setOutputNum(500)); layerId = builder.addLayer({{layerId}}, Builder::ReLULayer("relu1").setNegativeSlope(0.0f)); ptrWeights = make_shared_blob(TensorDesc(Precision::FP32, {5000}, Layout::C), weightsPtr->cbuffer().as() + 1704280 / 4); ptrBiases = make_shared_blob(TensorDesc(Precision::FP32, {10}, Layout::C), weightsPtr->cbuffer().as() + 1724280 / 4); - layerId = builder.addLayer({{layerId}}, Builder::FullyConnectedLayer("ip2").setOutputNum(10) - .setWeights(ptrWeights).setBiases(ptrBiases)); + weightsId = builder.addLayer(Builder::ConstLayer("weights").setData(ptrWeights)); + biasesId = builder.addLayer(Builder::ConstLayer("biases").setData(ptrBiases)); + layerId = builder.addLayer({{layerId}, {weightsId}, {biasesId}}, Builder::FullyConnectedLayer("ip2") + .setOutputNum(10)); layerId = builder.addLayer({{layerId}}, Builder::SoftMaxLayer("prob").setAxis(1)); - size_t outputId = builder.addLayer({PortInfo(layerId)}, Builder::OutputLayer("sf_out")); + builder.addLayer({PortInfo(layerId)}, Builder::OutputLayer("sf_out")); CNNNetwork network{Builder::convertToICNNNetwork(builder.build())}; // ----------------------------------------------------------------------------------------------------- @@ -272,7 +281,7 @@ int main(int argc, char *argv[]) { double total = 0.0; /** Start inference & calc performance **/ - for (int iter = 0; iter < FLAGS_ni; ++iter) { + for (size_t iter = 0; iter < FLAGS_ni; ++iter) { auto t0 = Time::now(); infer_request.Infer(); auto t1 = Time::now(); @@ -289,7 +298,7 @@ int main(int argc, char *argv[]) { auto outputData = outputBlob->buffer().as::value_type*>(); /** Validating -nt value **/ - const int resultsCnt = outputBlob->size() / batchSize; + const size_t resultsCnt = outputBlob->size() / batchSize; if (FLAGS_nt > resultsCnt || FLAGS_nt < 1) { slog::warn << "-nt " << FLAGS_nt << " is not available for this network (-nt should be less than " \ << resultsCnt+1 << " and more than 0)\n will be used maximal value : " << resultsCnt; @@ -303,7 +312,7 @@ int main(int argc, char *argv[]) { std::cout << std::endl << "Top " << FLAGS_nt << " results:" << std::endl << std::endl; /** Print the result iterating over each batch **/ - for (int image_id = 0; image_id < batchSize; ++image_id) { + for (size_t image_id = 0; image_id < batchSize; ++image_id) { std::cout << "Image " << images[image_id] << std::endl << std::endl; for (size_t id = image_id * FLAGS_nt, cnt = 0; cnt < 
FLAGS_nt; ++cnt, ++id) { std::cout.precision(7); @@ -313,6 +322,9 @@ int main(int argc, char *argv[]) { } std::cout << std::endl; } + if (std::fabs(total) < std::numeric_limits::epsilon()) { + throw std::logic_error("total can't be equal to zero"); + } // ----------------------------------------------------------------------------------------------------- std::cout << std::endl << "total inference time: " << total << std::endl; std::cout << "Average running time of one iteration: " << total / static_cast(FLAGS_ni) << " ms" << std::endl; diff --git a/inference-engine/samples/object_detection_sample_ssd/CMakeLists.txt b/inference-engine/samples/object_detection_sample_ssd/CMakeLists.txt index 60cd38e..436edc2 100644 --- a/inference-engine/samples/object_detection_sample_ssd/CMakeLists.txt +++ b/inference-engine/samples/object_detection_sample_ssd/CMakeLists.txt @@ -1,9 +1,7 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required(VERSION 2.8) - set (TARGET_NAME "object_detection_sample_ssd") file (GLOB MAIN_SRC diff --git a/inference-engine/samples/object_detection_sample_ssd/README.md b/inference-engine/samples/object_detection_sample_ssd/README.md index dc6f477..a8db1a8 100644 --- a/inference-engine/samples/object_detection_sample_ssd/README.md +++ b/inference-engine/samples/object_detection_sample_ssd/README.md @@ -1,14 +1,22 @@ -# Object Detection Sample SSD +# Object Detection C++ Sample SSD -This topic demonstrates how to run the Object Detection sample application, which does inference using object detection +This topic demonstrates how to run the Object Detection sample application, which does inference using object detection networks like SSD-VGG on Intel® Processors and Intel® HD Graphics. +## How It Works + +Upon the start-up the sample application reads command line parameters and loads a network and an image to the Inference +Engine plugin. When inference is done, the application creates an +output image and outputs data to the standard output stream. + +> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md). + ## Running Running the application with the -h option yields the following usage message: ```sh ./object_detection_sample_ssd -h -InferenceEngine: +InferenceEngine: API version ............ Build .................. @@ -18,46 +26,41 @@ Options: -h Print a usage message. -i "" Required. Path to an .bmp image. -m "" Required. Path to an .xml file with a trained model. - -l "" Required for MKLDNN (CPU)-targeted custom layers. Absolute path to a shared library with the kernels impl. + -l "" Required for CPU custom layers. Absolute path to a shared library with the kernel implementations. Or - -c "" Required for clDNN (GPU)-targeted custom kernels. Absolute path to the xml file with the kernels desc. - -pp "" Path to a plugin folder. - -d "" Specify the target device to infer on; CPU, GPU, FPGA or MYRIAD is acceptable. 
Sample will look for a suitable plugin for device specified - -pc Enables per-layer performance report - -ni "" Number of iterations (default 1) - -p_msg Enables messages from a plugin + -c "" Required for GPU custom kernels. Absolute path to the .xml file with the kernel descriptions. + -pp "" Optional. Path to a plugin folder. + -d "" Optional. Specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD is acceptable. Sample will look for a suitable plugin for device specified + -pc Optional. Enables per-layer performance report + -ni "" Optional. Number of iterations. Default value is 1 + -p_msg Optional. Enables messages from a plugin ``` Running the application with the empty list of options yields the usage message given above and an error message. -To run the sample, you can use a set of pre-trained and optimized models delivered with the package or a Caffe* public model. +To run the sample, you can use public or pre-trained models. To download the pre-trained models, use the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) or go to [https://download.01.org/opencv/](https://download.01.org/opencv/). -> **NOTE**: A public model should be converted to the Inference Engine format (`.xml` + `.bin`) using the Model Optimizer tool. For Model Optimizer documentation, see https://software.intel.com/en-us/articles/OpenVINO-ModelOptimizer. +> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). For example, to do inference on a CPU with the OpenVINO™ toolkit person detection SSD models, run one of the following commands: ```sh -./object_detection_sample_ssd -i /inputImage.bmp -m /deployment_tools/intel_models/person-detection-retail-0013/FP32/person-detection-retail-0013.xml -d CPU +./object_detection_sample_ssd -i /inputImage.bmp -m person-detection-retail-0013.xml -d CPU ``` or ```sh -./object_detection_sample_ssd -i /inputImage.jpg -m /deployment_tools/intel_models/person-detection-retail-0002/FP32/person-detection-retail-0002.xml -d CPU +./object_detection_sample_ssd -i /inputImage.jpg -m person-detection-retail-0002.xml -d CPU ``` -> **NOTE**: Before running the sample with another trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). +## Sample Output -### Outputs - -The application outputs an image (out_0.bmp) with detected objects enclosed in rectangles. It outputs the list of classes -of the detected objects along with the respective confidence values and the coordinates of the +The application outputs an image (`out_0.bmp`) with detected objects enclosed in rectangles. It outputs the list of classes +of the detected objects along with the respective confidence values and the coordinates of the rectangles to the standard output stream. -### How it works - -Upon the start-up the sample application reads command line parameters and loads a network and an image to the Inference -Engine plugin. When inference is done, the application creates an -output image and outputs data to the standard output stream. 
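The updated `main.cpp` in the diff below reads detections from the SSD `DetectionOutput` blob, where each detection is a seven-float descriptor. A hypothetical condensed version of that parsing loop (`parseDetections` and its parameters are illustrative names):

```cpp
// Each detection: [image_id, label, confidence, xmin, ymin, xmax, ymax],
// with coordinates normalized to [0, 1]; a negative image_id ends the list.
void parseDetections(const float* detection, int max_proposal_count,
                     int image_width, int image_height) {
    for (int p = 0; p < max_proposal_count; ++p) {
        auto image_id = static_cast<int>(detection[p * 7 + 0]);
        if (image_id < 0) break;
        float confidence = detection[p * 7 + 2];
        auto label = static_cast<int>(detection[p * 7 + 1]);
        auto xmin = static_cast<int>(detection[p * 7 + 3] * image_width);
        auto ymin = static_cast<int>(detection[p * 7 + 4] * image_height);
        auto xmax = static_cast<int>(detection[p * 7 + 5] * image_width);
        auto ymax = static_cast<int>(detection[p * 7 + 6] * image_height);
        if (confidence > 0.5f) {
            // keep (label, xmin, ymin, xmax, ymax) for drawing into out_0.bmp
        }
    }
}
```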
-## See Also +## See Also * [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md) +* [Model Optimizer](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md) +* [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) diff --git a/inference-engine/samples/object_detection_sample_ssd/main.cpp b/inference-engine/samples/object_detection_sample_ssd/main.cpp index 066e9ff..32e41e7 100644 --- a/inference-engine/samples/object_detection_sample_ssd/main.cpp +++ b/inference-engine/samples/object_detection_sample_ssd/main.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -78,7 +78,7 @@ int main(int argc, char *argv[]) { // --------------------------- 3. Load Plugin for inference engine ------------------------------------- slog::info << "Loading plugin" << slog::endl; - InferencePlugin plugin = PluginDispatcher({ FLAGS_pp, "../../../lib/intel64" , "" }).getPluginByDevice(FLAGS_d); + InferencePlugin plugin = PluginDispatcher({ FLAGS_pp }).getPluginByDevice(FLAGS_d); if (FLAGS_p_msg) { static_cast(plugin)->SetLogCallback(error_listener); } @@ -149,7 +149,7 @@ int main(int argc, char *argv[]) { */ std::string imageInputName, imInfoInputName; - InputInfo::Ptr inputInfo = inputsInfo.begin()->second; + InputInfo::Ptr inputInfo = nullptr; SizeVector inputImageDims; /** Stores input image **/ @@ -160,6 +160,8 @@ int main(int argc, char *argv[]) { if (item.second->getInputData()->getTensorDesc().getDims().size() == 4) { imageInputName = item.first; + inputInfo = item.second; + slog::info << "Batch size is " << std::to_string(networkReader.getNetwork().getBatchSize()) << slog::endl; /** Creating first input blob **/ @@ -170,12 +172,15 @@ int main(int argc, char *argv[]) { Precision inputPrecision = Precision::FP32; item.second->setPrecision(inputPrecision); - if ((item.second->getTensorDesc().getDims()[1] != 3 && item.second->getTensorDesc().getDims()[1] != 6) || - item.second->getTensorDesc().getDims()[0] != 1) { + if ((item.second->getTensorDesc().getDims()[1] != 3 && item.second->getTensorDesc().getDims()[1] != 6)) { throw std::logic_error("Invalid input info. Should be 3 or 6 values length"); } } } + + if (inputInfo == nullptr) { + inputInfo = inputsInfo.begin()->second; + } // ----------------------------------------------------------------------------------------------------- // --------------------------- 6. Prepare output blobs ------------------------------------------------- @@ -226,7 +231,7 @@ int main(int argc, char *argv[]) { // --------------------------- 9. 
Prepare input -------------------------------------------------------- /** Collect images data ptrs **/ std::vector> imagesData, originalImagesData; - std::vector imageWidths, imageHeights; + std::vector imageWidths, imageHeights; for (auto & i : images) { FormatReader::ReaderPtr reader(i.c_str()); if (reader.get() == nullptr) { @@ -285,7 +290,7 @@ int main(int argc, char *argv[]) { for (size_t image_id = 0; image_id < std::min(imagesData.size(), batchSize); ++image_id) { p[image_id * imInfoDim + 0] = static_cast(inputsInfo[imageInputName]->getTensorDesc().getDims()[2]); p[image_id * imInfoDim + 1] = static_cast(inputsInfo[imageInputName]->getTensorDesc().getDims()[3]); - for (int k = 2; k < imInfoDim; k++) { + for (size_t k = 2; k < imInfoDim; k++) { p[image_id * imInfoDim + k] = 1.0f; // all scale factors are set to 1.0 } } @@ -301,7 +306,7 @@ int main(int argc, char *argv[]) { double total = 0.0; /** Start inference & calc performance **/ - for (int iter = 0; iter < FLAGS_ni; ++iter) { + for (size_t iter = 0; iter < FLAGS_ni; ++iter) { auto t0 = Time::now(); infer_request.Infer(); auto t1 = Time::now(); @@ -322,28 +327,28 @@ int main(int argc, char *argv[]) { /* Each detection has image_id that denotes processed image */ for (int curProposal = 0; curProposal < maxProposalCount; curProposal++) { - float image_id = detection[curProposal * objectSize + 0]; + auto image_id = static_cast(detection[curProposal * objectSize + 0]); if (image_id < 0) { break; } - float label = detection[curProposal * objectSize + 1]; float confidence = detection[curProposal * objectSize + 2]; - float xmin = detection[curProposal * objectSize + 3] * imageWidths[image_id]; - float ymin = detection[curProposal * objectSize + 4] * imageHeights[image_id]; - float xmax = detection[curProposal * objectSize + 5] * imageWidths[image_id]; - float ymax = detection[curProposal * objectSize + 6] * imageHeights[image_id]; + auto label = static_cast(detection[curProposal * objectSize + 1]); + auto xmin = static_cast(detection[curProposal * objectSize + 3] * imageWidths[image_id]); + auto ymin = static_cast(detection[curProposal * objectSize + 4] * imageHeights[image_id]); + auto xmax = static_cast(detection[curProposal * objectSize + 5] * imageWidths[image_id]); + auto ymax = static_cast(detection[curProposal * objectSize + 6] * imageHeights[image_id]); std::cout << "[" << curProposal << "," << label << "] element, prob = " << confidence << " (" << xmin << "," << ymin << ")-(" << xmax << "," << ymax << ")" << " batch id : " << image_id; if (confidence > 0.5) { /** Drawing only objects with >50% probability **/ - classes[image_id].push_back(static_cast(label)); - boxes[image_id].push_back(static_cast(xmin)); - boxes[image_id].push_back(static_cast(ymin)); - boxes[image_id].push_back(static_cast(xmax - xmin)); - boxes[image_id].push_back(static_cast(ymax - ymin)); + classes[image_id].push_back(label); + boxes[image_id].push_back(xmin); + boxes[image_id].push_back(ymin); + boxes[image_id].push_back(xmax - xmin); + boxes[image_id].push_back(ymax - ymin); std::cout << " WILL BE PRINTED!"; } std::cout << std::endl; diff --git a/inference-engine/samples/object_detection_sample_ssd/object_detection_sample_ssd.h b/inference-engine/samples/object_detection_sample_ssd/object_detection_sample_ssd.h index 1e9f287..540ed59 100644 --- a/inference-engine/samples/object_detection_sample_ssd/object_detection_sample_ssd.h +++ b/inference-engine/samples/object_detection_sample_ssd/object_detection_sample_ssd.h @@ -1,4 +1,4 @@ -// Copyright (C) 
2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -25,7 +25,7 @@ static const char help_message[] = "Print a usage message."; static const char image_message[] = "Required. Path to an .bmp image."; /// @brief message for plugin_path argument -static const char plugin_path_message[] = "Path to a plugin folder."; +static const char plugin_path_message[] = "Optional. Path to a plugin folder."; /// @brief message for model argument static const char model_message[] = "Required. Path to an .xml file with a trained model."; @@ -35,25 +35,25 @@ static const char plugin_message[] = "Plugin name. For example MKLDNNPlugin. If "the sample will look for this plugin only"; /// @brief message for assigning cnn calculation to device -static const char target_device_message[] = "Specify the target device to infer on; CPU, GPU, FPGA or MYRIAD is acceptable. " \ +static const char target_device_message[] = "Optional. Specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD is acceptable. " \ "Sample will look for a suitable plugin for device specified"; /// @brief message for performance counters -static const char performance_counter_message[] = "Enables per-layer performance report"; +static const char performance_counter_message[] = "Optional. Enables per-layer performance report"; /// @brief message for iterations count -static const char iterations_count_message[] = "Number of iterations (default 1)"; +static const char iterations_count_message[] = "Optional. Number of iterations. Default value is 1"; /// @brief message for clDNN custom kernels desc -static const char custom_cldnn_message[] = "Required for clDNN (GPU)-targeted custom kernels. "\ -"Absolute path to the xml file with the kernels desc."; +static const char custom_cldnn_message[] = "Required for GPU custom kernels. "\ +"Absolute path to the .xml file with the kernel descriptions."; /// @brief message for user library argument -static const char custom_cpu_library_message[] = "Required for MKLDNN (CPU)-targeted custom layers. " \ -"Absolute path to a shared library with the kernels impl."; +static const char custom_cpu_library_message[] = "Required for CPU custom layers. " \ +"Absolute path to a shared library with the kernel implementations."; /// @brief message for plugin messages -static const char plugin_err_message[] = "Enables messages from a plugin"; +static const char plugin_err_message[] = "Optional. Enables messages from a plugin"; /// \brief Define flag for showing help message
DEFINE_bool(h, false, help_message); @@ -85,7 +85,7 @@ DEFINE_string(c, "", custom_cldnn_message); DEFINE_string(l, "", custom_cpu_library_message); /// @brief Iterations count (default 1) -DEFINE_int32(ni, 1, iterations_count_message); +DEFINE_uint32(ni, 1, iterations_count_message); /// @brief Enable plugin messages DEFINE_bool(p_msg, false, plugin_err_message); diff --git a/inference-engine/samples/perfcheck/CMakeLists.txt b/inference-engine/samples/perfcheck/CMakeLists.txt index bc08b7d..4a68a8b 100644 --- a/inference-engine/samples/perfcheck/CMakeLists.txt +++ b/inference-engine/samples/perfcheck/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # @@ -25,7 +25,6 @@ endif() if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") target_compile_options(${TARGET_NAME} - PRIVATE "-Weverything" PRIVATE "-Wno-c++98-compat" PRIVATE "-Wno-global-constructors" PRIVATE "-Wno-missing-variable-declarations" diff --git a/inference-engine/samples/perfcheck/README.md b/inference-engine/samples/perfcheck/README.md index daf0448..e38bd29 100644 --- a/inference-engine/samples/perfcheck/README.md +++ b/inference-engine/samples/perfcheck/README.md @@ -10,7 +10,7 @@ After inference stage, Perfcheck sample computes total time of execution, divide ## Running -Running the application with the -h option yields the following usage message: +Running the application with the `-h` option yields the following usage message: ```sh ./perfcheck -h @@ -37,14 +37,16 @@ perfcheck [OPTIONS] Running the application with the empty list of options yields an error message. -You can use the following command to do inference on Intel® Processors on images from a folder using a trained Faster R-CNN network: +To run the sample, you can use public or pre-trained models. To download the pre-trained models, use the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) or go to [https://download.01.org/opencv/](https://download.01.org/opencv/). + +> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). + +You can use the following command to do inference on CPU on images from a folder using a trained Faster R-CNN network: ```sh ./perfcheck -m /faster_rcnn.xml -inputs_dir -d CPU ``` -> **NOTE**: Public models should be first converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](https://software.intel.com/en-us/articles/OpenVINO-ModelOptimizer). - ## Sample Output The application outputs a performance statistics that shows: total execution time (in milliseconds), number of iterations, batch size, minimum, average and maximum FPS. 
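The FPS figures in that report follow from simple interval arithmetic; a sketch of the implied computation (illustrative helper, not the sample's code):

```cpp
#include <cstddef>

// FPS over a measurement interval: frames processed divided by elapsed seconds.
double intervalFPS(std::size_t iterations, std::size_t batch, double seconds) {
    return static_cast<double>(iterations * batch) / seconds;
}
// Min/Avg/Max FPS are the extrema and mean of these per-interval values.
```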
@@ -63,11 +65,13 @@ Example of sample output: Total time: 8954.61 ms Num iterations: 1000 Batch: 1 -Min fps: 110.558 -Avg fps: 111.674 -Max fps: 112.791 +Min FPS: 110.558 +Avg FPS: 111.674 +Max FPS: 112.791 ``` ## See Also * [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md) +* [Model Optimizer](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md) +* [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) diff --git a/inference-engine/samples/perfcheck/main.cpp b/inference-engine/samples/perfcheck/main.cpp index 88d5de9..0c062c6 100644 --- a/inference-engine/samples/perfcheck/main.cpp +++ b/inference-engine/samples/perfcheck/main.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -30,7 +30,7 @@ #include "inference_engine.hpp" #include "ext_list.hpp" -//#include "vpu/vpu_plugin_config.hpp" +#include "vpu/vpu_plugin_config.hpp" #include "samples/common.hpp" #include "samples/slog.hpp" @@ -116,7 +116,7 @@ static std::size_t getNumberRequests(const std::string &plugin) { return num_requests == supported_plugins.end() ? 1 : num_requests->second; } -#if defined(WIN32) +#if defined(WIN32) || defined(__APPLE__) typedef std::chrono::time_point time_point; #else typedef std::chrono::time_point time_point; @@ -168,9 +168,9 @@ static void printFPS(std::size_t num_requests, std::size_t num_intervals, const std::cout << "Num iterations: " << num_iterations << std::endl; std::cout << "Batch: " << FLAGS_batch << std::endl; - std::cout << "Min fps: " << min_fps << std::endl; - std::cout << "Avg fps: " << avg_fps << std::endl; - std::cout << "Max fps: " << max_fps << std::endl; + std::cout << "Min FPS: " << min_fps << std::endl; + std::cout << "Avg FPS: " << avg_fps << std::endl; + std::cout << "Max FPS: " << max_fps << std::endl; } template @@ -417,7 +417,7 @@ int main(int argc, char *argv[]) { } } - auto plugin = InferenceEngine::PluginDispatcher({FLAGS_pp, "../../../lib/intel64", ""}).getPluginByDevice(FLAGS_d); + auto plugin = InferenceEngine::PluginDispatcher({FLAGS_pp}).getPluginByDevice(FLAGS_d); /* If CPU device, load default library with extensions that comes with the product */ if (FLAGS_d.find("CPU") != std::string::npos) { diff --git a/inference-engine/samples/perfcheck/perfcheck.h b/inference-engine/samples/perfcheck/perfcheck.h index 01419f1..facc5f6 100644 --- a/inference-engine/samples/perfcheck/perfcheck.h +++ b/inference-engine/samples/perfcheck/perfcheck.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/samples/speech_sample/CMakeLists.txt b/inference-engine/samples/speech_sample/CMakeLists.txt index 33e7e72..e789f7a 100644 --- a/inference-engine/samples/speech_sample/CMakeLists.txt +++ b/inference-engine/samples/speech_sample/CMakeLists.txt @@ -1,9 +1,7 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required(VERSION 2.8) - set (TARGET_NAME "speech_sample") file (GLOB MAIN_SRC @@ -30,7 +28,7 @@ add_dependencies(${TARGET_NAME} gflags) set_target_properties(${TARGET_NAME} PROPERTIES "CMAKE_CXX_FLAGS" "${CMAKE_CXX_FLAGS} -fPIE" COMPILE_PDB_NAME ${TARGET_NAME}) -target_link_libraries(${TARGET_NAME} ${InferenceEngine_LIBRARIES} gflags) +target_link_libraries(${TARGET_NAME} ${InferenceEngine_LIBRARIES} IE::ie_cpu_extension 
gflags) if(UNIX) target_link_libraries( ${TARGET_NAME} ${LIB_DL} pthread) diff --git a/inference-engine/samples/speech_sample/README.md b/inference-engine/samples/speech_sample/README.md index 31f2b8d..a9ca938 100644 --- a/inference-engine/samples/speech_sample/README.md +++ b/inference-engine/samples/speech_sample/README.md @@ -1,19 +1,87 @@ -# Automatic Speech Recognition Sample +# Automatic Speech Recognition C++ Sample This topic shows how to run the speech sample application, which demonstrates acoustic model inference based on Kaldi\* neural networks and speech feature vectors. -## Running +## How It Works + +Upon the start-up, the application reads command line parameters +and loads a Kaldi-trained neural network along with a Kaldi ARK speech +feature vector file to the Inference Engine plugin. It then performs +inference on all speech utterances stored in the input ARK +file. Context-windowed speech frames are processed in batches of 1-8 +frames according to the `-bs` parameter. Batching across utterances is +not supported by this sample. When inference is done, the application +creates an output ARK file. If the `-r` option is given, error +statistics are provided for each speech utterance, as shown in the sample output below. + +### GNA-Specific Details + +#### Quantization -### Usage +If the GNA device is selected (for example, using the `-d` GNA flag), +the GNA Inference Engine plugin quantizes the model and input feature +vector sequence to integer representation before performing inference. +Several parameters control neural network quantization. The `-q` flag +determines the quantization mode. Three modes are supported: static, +dynamic, and user-defined. In static quantization mode, the first +utterance in the input ARK file is scanned for dynamic range. The +scale factor (floating point scalar multiplier) required to scale the +maximum input value of the first utterance to 16384 (15 bits) is used +for all subsequent inputs. The neural network is quantized to +accommodate the scaled input dynamic range. In user-defined +quantization mode, the user may specify a scale factor via the `-sf` +flag that will be used for static quantization. In dynamic +quantization mode, the scale factor for each input batch is computed +just before inference on that batch. The input and network are +(re)quantized on-the-fly using an efficient procedure. + +The `-qb` flag provides a hint to the GNA plugin regarding the preferred +target weight resolution for all layers. For example, when `-qb 8` is +specified, the plugin will use 8-bit weights wherever possible in the +network. Note that it is not always possible to use 8-bit weights due +to GNA hardware limitations. For example, convolutional layers always +use 16-bit weights (GNA hardware version 1 and 2). This limitation +will be removed in GNA hardware version 3 and higher. + +#### Execution Modes + +Several execution modes are supported via the `-d` flag. If the device +is set to `CPU` and the GNA plugin is selected, the GNA device is +emulated in fast-but-not-bit-exact mode. If the device is set to +`GNA_AUTO`, then the GNA hardware is used if available and the driver is +installed. Otherwise, the GNA device is emulated in +fast-but-not-bit-exact mode. If the device is set to `GNA_HW`, then the +GNA hardware is used if available and the driver is installed. +Otherwise, an error will occur. If the device is set to `GNA_SW`, the +GNA device is emulated in fast-but-not-bit-exact mode.
Finally, if +the device is set to `GNA_SW_EXACT`, the GNA device is emulated in +bit-exact mode. + +#### Loading and Saving Models + +The GNA plugin supports loading and saving of the GNA-optimized model +(non-IR) via the `-rg` and `-wg` flags. Thereby, it is possible to avoid +the cost of full model quantization at run time. The GNA plugin also +supports export of firmware-compatible embedded model images for the +Intel® Speech Enabling Developer Kit and Amazon Alexa* Premium +Far-Field Voice Development Kit via the `-we` flag (save only). + +In addition to performing inference directly from a GNA model file, these options make it possible to: +- Convert from IR format to GNA format model file (`-m`, `-wg`) +- Convert from IR format to embedded format model file (`-m`, `-we`) +- Convert from GNA format to embedded format model file (`-rg`, `-we`) + + +## Running Running the application with the `-h` option yields the following usage message: ```sh $ ./speech_sample -h -InferenceEngine: +InferenceEngine: API version ............ Build .................. @@ -23,21 +91,22 @@ Options: -h Print a usage message. -i "" Required. Path to an .ark file. -m "" Required. Path to an .xml file with a trained model (required if -rg is missing). - -o "" Output file name (default name is scores.ark). - -l "" Required for MKLDNN (CPU)-targeted custom layers.Absolute path to a shared library with the kernels impl. - -d "" Specify the target device to infer on; CPU, GPU, GNA_AUTO, GNA_HW, GNA_SW, GNA_SW_EXACT is acceptable. Sample will look for a suitable plugin for device specified - -p Plugin name. For example MKLDNNPlugin. If this parameter is pointed, the sample will look for this plugin only - -pp Path to a plugin folder. - -pc Enables performance report - -q "" Input quantization mode: static (default), dynamic, or user (use with -sf). - -qb "" Weight bits for quantization: 8 or 16 (default) - -sf "" Optional user-specified input scale factor for quantization (use with -q user). - -bs "" Batch size 1-8 (default 1) - -r "" Read reference score .ark file and compare scores. - -rg "" Read GNA model from file using path/filename provided (required if -m is missing). - -wg "" Write GNA model to file using path/filename provided. - -we "" Write GNA embedded model to file using path/filename provided. + -o "" Optional. Output file name (default name is "scores.ark"). + -l "" Required for CPU custom layers. Absolute path to a shared library with the kernel implementations. + -d "" Optional. Specify a target device to infer on. CPU, GPU, GNA_AUTO, GNA_HW, GNA_SW, GNA_SW_EXACT and HETERO with combination of GNA as the primary device and CPU as a secondary (e.g. HETERO:GNA,CPU) are supported. The sample will look for a suitable plugin for device specified. + -p Optional. Plugin name. For example, GPU. If this parameter is set, the sample will look for this plugin only + -pp Optional. Path to a plugin folder. + -pc Optional. Enables performance report + -q "" Optional. Input quantization mode: "static" (default), "dynamic", or "user" (use with -sf). + -qb "" Optional. Weight bits for quantization: 8 or 16 (default) + -sf "" Optional. Input scale factor for quantization (use with -q user). + -bs "" Optional. Batch size 1-8 (default 1) + -r "" Optional. Read reference score .ark file and compare scores. + -rg "" Optional. Read GNA model from file using path/filename provided (required if -m is missing). + -wg "" Optional. Write GNA model to file using path/filename provided. + -we "" Optional. 
Write GNA embedded model to file using path/filename provided. -nthreads "" Optional. Number of threads to use for concurrent async inference requests on the GNA. + -cw "" Optional. Number of frames for context windows (default is 0). Works only with context window networks. If you use the cw flag, the batch size and nthreads arguments are ignored. ``` @@ -46,8 +115,6 @@ usage message given above and an error message. ### Model Preparation -> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). - You can use the following model optimizer command to convert a Kaldi nnet1 or nnet2 neural network to Intel IR format: @@ -61,13 +128,13 @@ network, `wsj_dnn5b_smbr.nnet`, and Kaldi class counts file, the Intel IR network consisting of `wsj_dnn5b_smbr.xml` and `wsj_dnn5b_smbr.bin`. -The following pretrained models are available: +The following pre-trained models are available: * wsj\_dnn5b\_smbr * rm\_lstm4f * rm\_cnn4a\_smbr -All of them can be downloaded from [https://download.01.org/openvinotoolkit/2018_R3/models_contrib/GNA/](https://download.01.org/openvinotoolkit/2018_R3/models_contrib/GNA/). +All of them can be downloaded from [https://download.01.org/openvinotoolkit/models_contrib/speech/kaldi](https://download.01.org/openvinotoolkit/models_contrib/speech/kaldi) or using the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader). ### Speech Inference @@ -85,7 +152,9 @@ scores (`wsj_dnn5b_smbr_dev93_scores_10.ark`) corresponding to the input feature file (`wsj_dnn5b_smbr_dev93_10.ark`) are assumed to be available for comparison. -### Sample Output +> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). + +## Sample Output
The -scale factor (floating point scalar multiplier) required to scale the -maximum input value of the first utterance to 16384 (15 bits) is used -for all subsequent inputs. The neural network is quantized to -accomodate the scaled input dynamic range. In user-defined -quantization mode, the user may specify a scale factor via the `-sf` -flag that will be used for static quantization. In dynamic -quantization mode, the scale factor for each input batch is computed -just before inference on that batch. The input and network are -(re)quantized on-the-fly using an efficient procedure. - -The `-qb` flag provides a hint to the GNA plugin regarding the preferred -target weight resolution for all layers. For example, when `-qb 8` is -specified, the plugin will use 8-bit weights wherever possible in the -network. Note that it is not always possible to use 8-bit weights due -to GNA hardware limitations. For example, convolutional layers always -use 16-bit weights (GNA harware verison 1 and 2). This limitation -will be removed in GNA hardware version 3 and higher. - -#### Execution Modes - -Several execution modes are supported via the `-d` flag. If the device -is set to `CPU` and the GNA plugin is selected, the GNA device is -emulated in fast-but-not-bit-exact mode. If the device is set to -`GNA_AUTO`, then the GNA hardware is used if available and the driver is -installed. Otherwise, the GNA device is emulated in -fast-but-not-bit-exact mode. If the device is set to `GNA_HW`, then the -GNA hardware is used if available and the driver is installed. -Otherwise, an error will occur. If the device is set to `GNA_SW`, the -GNA device is emulated in fast-but-not-bit-exact mode. Finally, if -the device is set to `GNA_SW_EXACT`, the GNA device is emulated in -bit-exact mode. - -#### Loading and Saving Models - -The GNA plugin supports loading and saving of the GNA-optimized model -(non-IR) via the `-rg` and `-wg` flags. Thereby, it is possible to avoid -the cost of full model quantization at run time. The GNA plugin also -supports export of firmware-compatible embedded model images for the -Intel® Speech Enabling Developer Kit and Amazon Alexa* Premium -Far-Field Voice Development Kit via the `-we` flag (save only). - -In addition to performing inference directly from a GNA model file, these options make it possible to: -- Convert from IR format to GNA format model file (`-m`, `-wg`) -- Convert from IR format to embedded format model file (`-m`, `-we`) -- Convert from GNA format to embedded format model file (`-rg`, `-we`) - ## Use of Sample in Kaldi* Speech Recognition Pipeline The Wall Street Journal DNN model used in this example was prepared using the Kaldi s5 recipe and the Kaldi Nnet (nnet1) framework. It is possible to recognize speech by substituting the `speech_sample` for -Kaldi's nnet-forward command. Since the speech_sample does not yet +Kaldi's nnet-forward command. Since the speech_sample does not yet use pipes, it is necessary to use temporary files for speaker- transformed feature vectors and scores when running the Kaldi speech recognition pipeline. 
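The static quantization mode described in the Quantization section above reduces to one scan of the first utterance; a hypothetical sketch of that scale-factor computation (not the GNA plugin's actual code):

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>

// Scale so the largest absolute input value maps to 16384 (15 bits).
float computeStaticScaleFactor(const float* data, std::size_t n) {
    float max_abs = 0.0f;
    for (std::size_t i = 0; i < n; ++i) {
        max_abs = std::max(max_abs, std::fabs(data[i]));
    }
    return (max_abs > 0.0f) ? 16384.0f / max_abs : 1.0f;
}
```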
The following operations assume that feature @@ -199,10 +199,7 @@ latgen-faster-mapped --max-active=7000 --max-mem=50000000 --beam=13.0 --lattice- cat out.txt | utils/int2sym.pl -f 2- words.txt | sed s:\::g | compute-wer --text --mode=present ark:test_filt.txt ark,p:- ``` -## Links - -- [Main Page](index.html) -- [Use of the Inference Engine](./docs/IE_DG/Integrate_with_customer_application.md) -- [Intel's Deep Learning Model Optimizer Developer Guide](https://software.intel.com/en-us/model-optimizer-devguide) -- [Inference Engine Samples](./docs/IE_DG/Samples_Overview.md) -- [Deep Learning Deployment Toolkit Web Page](https://software.intel.com/en-us/computer-vision-sdk) +## See Also +* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md) +* [Model Optimizer](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md) +* [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) diff --git a/inference-engine/samples/speech_sample/main.cpp b/inference-engine/samples/speech_sample/main.cpp index e0dc005..4b7115a 100644 --- a/inference-engine/samples/speech_sample/main.cpp +++ b/inference-engine/samples/speech_sample/main.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -25,6 +25,7 @@ #include #include #include +#include #ifndef ALIGN #define ALIGN(memSize, pad) ((static_cast((memSize) + pad - 1) / pad) * pad) @@ -51,6 +52,12 @@ typedef struct { float sumSquaredRelError; } score_error_t; +struct InferRequestStruct { + InferRequest inferRequest; + int frameIndex; + uint32_t numFramesThisBatch; +}; + void GetKaldiArkInfo(const char *fileName, uint32_t numArrayToFindSize, uint32_t *ptrNumArrays, @@ -119,7 +126,6 @@ void LoadKaldiArkArray(const char *fileName, uint32_t arrayIndex, std::string &p in_file.read(reinterpret_cast(ptrNumRows), sizeof(uint32_t)); // read number of rows std::getline(in_file, line, '\4'); // read control-D in_file.read(reinterpret_cast(ptrNumColumns), sizeof(uint32_t)); // read number of columns - size_t willWrite = *ptrNumRows * *ptrNumColumns * sizeof(float); in_file.read(reinterpret_cast(&memory.front()), *ptrNumRows * *ptrNumColumns * sizeof(float)); // read array data } @@ -286,7 +292,6 @@ inline void native_cpuid(unsigned int *eax, unsigned int *ebx, // return GNA module frequency in MHz float getGnaFrequencyMHz() { - uint32_t level = 0; uint32_t eax = 1; uint32_t ebx = 0; uint32_t ecx = 0; @@ -353,12 +358,11 @@ void printPerformanceCounters(std::map(it.second.realTime_uSec); float call_units = current_units / callsNum; - float freq = 1.0; // if GNA HW counters // get frequency of GNA module - freq = getGnaFrequencyMHz(); + float freq = getGnaFrequencyMHz(); current_units /= freq * 1000; call_units /= freq; stream << std::setw(30) << std::left << counter_name.substr(4, counter_name.size() - 1); @@ -414,9 +418,20 @@ bool ParseAndCheckCommandLine(int argc, char *argv[]) { throw std::logic_error("Only one of -m and -rg is allowed."); } - if ((FLAGS_d.compare("GPU") != 0) && (FLAGS_d.compare("CPU") != 0) && (FLAGS_d.compare("GNA_AUTO") != 0) && - (FLAGS_d.compare("GNA_HW") != 0) - && (FLAGS_d.compare("GNA_SW") != 0) && (FLAGS_d.compare("GNA_SW_EXACT") != 0)) { + std::vector possibleDeviceTypes = { + "CPU", + "GPU", + "GNA_AUTO", + "GNA_HW", + "GNA_SW_EXACT", + "GNA_SW", + "HETERO:GNA,CPU", + "HETERO:GNA_HW,CPU", + "HETERO:GNA_SW_EXACT,CPU", + "HETERO:GNA_SW,CPU", + }; + + if (std::find(possibleDeviceTypes.begin(), 
possibleDeviceTypes.end(), FLAGS_d) == possibleDeviceTypes.end()) { throw std::logic_error("Specified device is not supported."); } @@ -447,6 +462,10 @@ bool ParseAndCheckCommandLine(int argc, char *argv[]) { throw std::logic_error("Not valid value for 'nthreads' argument. It should be > 0 "); } + if (FLAGS_cw < 0) { + throw std::logic_error("Not valid value for 'cw' argument. It should be >= 0 "); + } + return true; } @@ -468,10 +487,14 @@ int main(int argc, char *argv[]) { slog::info << "No extensions provided" << slog::endl; } - bool useGna = (FLAGS_d.find("GNA") != std::string::npos); - auto deviceStr = FLAGS_d.substr(0, (FLAGS_d.find("_"))); + auto isFeature = [&](const std::string xFeature) { return FLAGS_d.find(xFeature) != std::string::npos; }; + + bool useGna = isFeature("GNA"); + bool useHetero = isFeature("HETERO"); + std::string deviceStr = + useHetero && useGna ? "HETERO:GNA,CPU" : FLAGS_d.substr(0, (FLAGS_d.find("_"))); float scaleFactorInput = static_cast(FLAGS_sf); - uint32_t batchSize = (uint32_t) FLAGS_bs; + uint32_t batchSize = FLAGS_cw > 0 ? 1 : (uint32_t) FLAGS_bs; /** Extract input ark file name **/ std::string inputArkName = fileNameNoExt(FLAGS_i) + ".ark"; @@ -484,7 +507,7 @@ int main(int argc, char *argv[]) { // --------------------------- 1. Load Plugin for inference engine ------------------------------------- slog::info << "Loading plugin" << slog::endl; /** Loading plugin for device **/ - InferencePlugin plugin = PluginDispatcher({FLAGS_pp, "../../../lib/intel64", ""}).getPluginByDevice(deviceStr); + InferencePlugin plugin = PluginDispatcher({FLAGS_pp}).getPluginByDevice(deviceStr); /** Printing plugin version **/ std::cout << plugin.GetVersion() << std::endl << std::endl; @@ -514,9 +537,20 @@ int main(int argc, char *argv[]) { /** Setting plugin parameter for per layer metrics **/ std::map gnaPluginConfig; std::map genericPluginConfig; - if (FLAGS_d.compare("CPU") != 0) { - gnaPluginConfig[GNAConfigParams::KEY_GNA_DEVICE_MODE] = FLAGS_d; + if (useGna) { + std::string gnaDevice = + useHetero ? FLAGS_d.substr(FLAGS_d.find("GNA"), FLAGS_d.find(",") - FLAGS_d.find("GNA")) : FLAGS_d; + gnaPluginConfig[GNAConfigParams::KEY_GNA_DEVICE_MODE] = + gnaDevice.find("_") == std::string::npos ? "GNA_AUTO" : gnaDevice; + } else if (plugin.GetVersion()->description == std::string("MKLDNNPlugin")) { + /** + * cpu_extensions library is compiled from "extension" folder containing + * custom MKLDNNPlugin layer implementations. These layers are not supported + * by mkldnn, but they can be useful for inferring custom topologies. + **/ + plugin.AddExtension(std::make_shared()); } + if (FLAGS_pc) { genericPluginConfig[PluginConfigParams::KEY_PERF_COUNT] = PluginConfigParams::YES; } @@ -550,7 +584,7 @@ int main(int argc, char *argv[]) { gnaPluginConfig[GNAConfigParams::KEY_GNA_PRECISION] = "I16"; } - gnaPluginConfig[GNAConfigParams::KEY_GNA_LIB_N_THREADS] = std::to_string(FLAGS_nthreads); + gnaPluginConfig[GNAConfigParams::KEY_GNA_LIB_N_THREADS] = std::to_string(FLAGS_cw > 0 ?
1 : FLAGS_nthreads); gnaPluginConfig[GNA_CONFIG_KEY(COMPACT_MODE)] = CONFIG_VALUE(NO); // ----------------------------------------------------------------------------------------------------- @@ -568,6 +602,7 @@ int main(int argc, char *argv[]) { } auto t0 = Time::now(); ExecutableNetwork executableNet; + if (!FLAGS_m.empty()) { slog::info << "Loading model to the plugin" << slog::endl; executableNet = plugin.LoadNetwork(netBuilder.getNetwork(), genericPluginConfig); @@ -576,7 +611,6 @@ int main(int argc, char *argv[]) { executableNet = plugin.ImportNetwork(FLAGS_rg.c_str(), genericPluginConfig); } - ms loadTime = std::chrono::duration_cast(Time::now() - t0); slog::info << "Model loading time " << loadTime.count() << " ms" << slog::endl; @@ -595,9 +629,9 @@ int main(int argc, char *argv[]) { return 0; } - std::vector> inferRequests(FLAGS_nthreads); + std::vector inferRequests(FLAGS_cw > 0 ? 1 : FLAGS_nthreads); for (auto& inferRequest : inferRequests) { - inferRequest = {executableNet.CreateInferRequest(), -1}; + inferRequest = {executableNet.CreateInferRequest(), -1, batchSize}; } // ----------------------------------------------------------------------------------------------------- @@ -614,7 +648,7 @@ int main(int argc, char *argv[]) { throw std::logic_error("Sample supports only topologies with 1 input"); } - Blob::Ptr ptrInputBlob = inferRequests[0].first.GetBlob(cInputInfo.begin()->first); + Blob::Ptr ptrInputBlob = inferRequests[0].inferRequest.GetBlob(cInputInfo.begin()->first); /** configure input precision if model loaded from IR **/ for (auto &item : inputInfo) { @@ -632,7 +666,7 @@ int main(int argc, char *argv[]) { outputInfo = netBuilder.getNetwork().getOutputsInfo(); } - Blob::Ptr ptrOutputBlob = inferRequests[0].first.GetBlob(cOutputInfo.begin()->first); + Blob::Ptr ptrOutputBlob = inferRequests[0].inferRequest.GetBlob(cOutputInfo.begin()->first); for (auto &item : outputInfo) { DataPtr outData = item.second; @@ -699,22 +733,20 @@ int main(int argc, char *argv[]) { auto inputFrame = &ptrUtterance.front(); auto outputFrame = &ptrScores.front(); - size_t frameIndex{0}; + std::map callPerfMap; + + size_t frameIndex = 0; + numFrames += 2 * FLAGS_cw; uint32_t numFramesThisBatch{batchSize}; auto t0 = Time::now(); auto t1 = t0; - // Doing inference while (frameIndex <= numFrames) { if (frameIndex == numFrames) { - bool hasRequests = false; - for (auto &inferRequest : inferRequests) { - if (inferRequest.second != -1) { - hasRequests = true; - } - } - if (!hasRequests) { + if (std::find_if(inferRequests.begin(), + inferRequests.end(), + [&](InferRequestStruct x) { return (x.frameIndex != -1); } ) == inferRequests.end()) { break; } } @@ -724,54 +756,79 @@ int main(int argc, char *argv[]) { if (frameIndex == numFrames) { numFramesThisBatch = 1; } else { - numFramesThisBatch = (numFrames - frameIndex < batchSize) ? (numFrames - frameIndex) : batchSize; + numFramesThisBatch = (numFrames - frameIndex < batchSize) ? 
(numFrames - frameIndex) + : batchSize; } - if (inferRequest.second != -1) { - StatusCode code = inferRequest.first.Wait( + if (inferRequest.frameIndex != -1) { + StatusCode code = inferRequest.inferRequest.Wait( InferenceEngine::IInferRequest::WaitMode::RESULT_READY); - if (code != StatusCode::OK) { - continue; + if (!useHetero) continue; + if (code != StatusCode::INFER_NOT_STARTED) continue; } - if (!FLAGS_o.empty()) { - Blob::Ptr outputBlob = inferRequest.first.GetBlob(cOutputInfo.begin()->first); - std::memcpy(outputFrame, - outputBlob->buffer(), - outputBlob->byteSize()); - outputFrame += numScoresPerFrame * sizeof(float); - } - - if (!FLAGS_r.empty()) { - Blob::Ptr outputBlob = inferRequest.first.GetBlob(cOutputInfo.begin()->first); - CompareScores(outputBlob->buffer().as(), - &ptrReferenceScores[inferRequest.second * - numFrameElementsReference * - numBytesPerElementReference], - &frameError, - numFramesThisBatch, - numFrameElementsReference); - UpdateScoreError(&frameError, &totalError); + if (inferRequest.frameIndex >= 0) { + if (!FLAGS_o.empty()) { + outputFrame = + &ptrScores.front() + numScoresPerFrame * sizeof(float) * (inferRequest.frameIndex); + Blob::Ptr outputBlob = inferRequest.inferRequest.GetBlob(cOutputInfo.begin()->first); + auto byteSize = inferRequest.numFramesThisBatch * numScoresPerFrame * sizeof(float); + std::memcpy(outputFrame, + outputBlob->buffer(), + byteSize); + } + + if (!FLAGS_r.empty()) { + Blob::Ptr outputBlob = inferRequest.inferRequest.GetBlob(cOutputInfo.begin()->first); + CompareScores(outputBlob->buffer().as(), + &ptrReferenceScores[inferRequest.frameIndex * + numFrameElementsReference * + numBytesPerElementReference], + &frameError, + inferRequest.numFramesThisBatch, + numFrameElementsReference); + UpdateScoreError(&frameError, &totalError); + } + if (FLAGS_pc) { + // retrieve new counters + getPerformanceCounters(inferRequest.inferRequest, callPerfMap); + // summarize retrieved counters with all previous + sumPerformanceCounters(callPerfMap, utterancePerfMap); + } } } - inferRequest.second = -1; - if (frameIndex == numFrames) { + inferRequest.frameIndex = -1; continue; } - Blob::Ptr inputBlob = inferRequest.first.GetBlob(cInputInfo.begin()->first); + Blob::Ptr inputBlob = inferRequest.inferRequest.GetBlob(cInputInfo.begin()->first); + std::memcpy(inputBlob->buffer(), inputFrame, inputBlob->byteSize()); - inferRequest.first.StartAsync(); + auto index = frameIndex - 2 * FLAGS_cw; + inferRequest.inferRequest.StartAsync(); + inferRequest.frameIndex = index < 0 ?
-2 : index; + inferRequest.numFramesThisBatch = numFramesThisBatch; - inferRequest.second = frameIndex; frameIndex += numFramesThisBatch; - inputFrame += sizeof(float) * numFrameElementsInput * numFramesThisBatch; + + if (FLAGS_cw > 0) { + int i = frameIndex - FLAGS_cw; + if (i > 0 && i < static_cast(numFrames)) { + inputFrame += sizeof(float) * numFrameElementsInput * numFramesThisBatch; + } else if (i >= static_cast(numFrames)) { + inputFrame = &ptrUtterance.front() + + (numFrames - 1) * sizeof(float) * numFrameElementsInput * + numFramesThisBatch; + } + } else { + inputFrame += sizeof(float) * numFrameElementsInput * numFramesThisBatch; + } inferRequestFetched |= true; } @@ -779,16 +836,6 @@ int main(int argc, char *argv[]) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); continue; } - - if (FLAGS_pc) { - std::map callPerfMap; - // retrive new counters - for (auto inferRequest : inferRequests) { - getPerformanceCounters(inferRequest.first, callPerfMap); - // summarize retrived counters with all previous - sumPerformanceCounters(callPerfMap, utterancePerfMap); - } - } } t1 = Time::now(); diff --git a/inference-engine/samples/speech_sample/speech_sample.hpp b/inference-engine/samples/speech_sample/speech_sample.hpp index 37cb88f..7a033f8 100644 --- a/inference-engine/samples/speech_sample/speech_sample.hpp +++ b/inference-engine/samples/speech_sample/speech_sample.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -15,8 +15,6 @@ #include #endif -#define DEFAULT_PATH_P "./lib" - /// @brief message for help argument static const char help_message[] = "Print a usage message."; @@ -34,8 +32,11 @@ static const char plugin_message[] = "Plugin name. For example MKLDNNPlugin. If "the sample will look for this plugin only"; /// @brief message for assigning cnn calculation to device -static const char target_device_message[] = "Specify the target device to infer on; CPU, GPU, GNA_AUTO, GNA_HW, GNA_SW, GNA_SW_EXACT is acceptable. " \ "Sample will look for a suitable plugin for device specified"; +static const char target_device_message[] = "Specify a target device to infer on. CPU, GPU, GNA_AUTO, GNA_HW, GNA_SW, " + "GNA_SW_EXACT and HETERO with a combination of GNA as the primary device and CPU" + " as a secondary (e.g. HETERO:GNA,CPU) are supported. The sample will look " + "for a suitable plugin for device specified."; + /// @brief message for performance counters static const char performance_counter_message[] = "Enables per-layer performance report"; @@ -74,6 +75,11 @@ static const char batch_size_message[] = "Batch size 1-8 (default 1)"; static const char infer_num_threads_message[] = "Optional. Number of threads to use for concurrent async" \ " inference requests on the GNA."; +/// @brief message for context window argument +static const char context_window_message[] = "Optional. Number of frames for context windows (default is 0). " \ + "Works only with context window networks." + " If you use the cw flag, then batch size and nthreads arguments are ignored."; + /// \brief Define flag for showing help message
DEFINE_bool(h, false, help_message); @@ -91,7 +97,7 @@ DEFINE_string(p, "", plugin_message); /// \brief Define parameter for set path to plugins
/// Default is ./lib -DEFINE_string(pp, DEFAULT_PATH_P, plugin_path_message); +DEFINE_string(pp, "", plugin_path_message); /// \brief device the target device to infer on
DEFINE_string(d, "GNA_AUTO", target_device_message); @@ -133,6 +139,9 @@ DEFINE_int32(bs, 1, batch_size_message); /// @brief Number of threads to use for inference on the CPU (also affects Hetero cases) DEFINE_int32(nthreads, 1, infer_num_threads_message); +/// @brief Number of frames for context window (default 0) +DEFINE_int32(cw, 0, context_window_message); + /** * \brief This function show a help message */ @@ -159,5 +168,6 @@ static void showUsage() { std::cout << " -wg \"\" " << write_gna_model_message << std::endl; std::cout << " -we \"\" " << write_embedded_model_message << std::endl; std::cout << " -nthreads \"\" " << infer_num_threads_message << std::endl; + std::cout << " -cw \"\" " << context_window_message << std::endl; } diff --git a/inference-engine/samples/style_transfer_sample/CMakeLists.txt b/inference-engine/samples/style_transfer_sample/CMakeLists.txt index bbc971e..ac2a170 100644 --- a/inference-engine/samples/style_transfer_sample/CMakeLists.txt +++ b/inference-engine/samples/style_transfer_sample/CMakeLists.txt @@ -1,9 +1,7 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required(VERSION 2.8) - set (TARGET_NAME "style_transfer_sample") file (GLOB MAIN_SRC diff --git a/inference-engine/samples/style_transfer_sample/README.md b/inference-engine/samples/style_transfer_sample/README.md index 89bd837..a192a3c 100644 --- a/inference-engine/samples/style_transfer_sample/README.md +++ b/inference-engine/samples/style_transfer_sample/README.md @@ -1,7 +1,11 @@ -# Neural Style Transfer Sample +# Neural Style Transfer C++ Sample -This topic demonstrates how to build and run the Neural Style Transfer sample (NST sample) application, which does -inference using models of style transfer topology. +This topic demonstrates how to run the Neural Style Transfer sample application, which performs +inference of style transfer models. + +> **NOTE**: The OpenVINO™ toolkit does not include a pre-trained model to run the Neural Style Transfer sample. A public model from the [Zhaw's Neural Style Transfer repository](https://github.com/zhaw/neural_style) can be used. Read the [Converting a Style Transfer Model from MXNet*](./docs/MO_DG/prepare_model/convert_model/mxnet_specific/Convert_Style_Transfer_From_MXNet.md) topic from the [Model Optimizer Developer Guide](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md) to learn how to get the trained model and how to convert it to the Inference Engine format (\*.xml + \*.bin). + +> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md). ## Running @@ -15,12 +19,12 @@ InferenceEngine: style_transfer_sample [OPTION] Options: - -h Print a usage message. - -i "" Required. Path to an .bmp image. + -h Print a usage message + -i "" Required. Path to a .bmp image file or a sequence of paths separated by spaces. -m "" Required. Path to an .xml file with a trained model. - -pp "" Path to a plugin folder.
- -d "" Specify the target device to infer on; CPU, GPU, FPGA or MYRIAD is acceptable. Sample will look for a suitable plugin for device specified - -ni "" Number of iterations (default 1) + -pp "" Path to a plugin folder + -d "" The target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD is acceptable. The sample looks for a suitable plugin for the device specified. + -ni "" Number of iterations. Default value is 1 -pc Enables per-layer performance report -mean_val_r, -mean_val_g, @@ -30,18 +34,16 @@ Options: Running the application with the empty list of options yields the usage message given above and an error message. -You can do inference on an image using a trained model of NST network on Intel® Processors using the following command: +To perform inference on an image using a trained model of NST network on Intel® CPUs, use the following command: ```sh ./style_transfer_sample -i /cat.bmp -m /1_decoder_FP32.xml ``` -> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). +## Sample Output -### Outputs +The application outputs an image (`out1.bmp`) or a sequence of images (`out1.bmp`, ..., `out<N>.bmp`) which are redrawn in the style of the style transfer model used in the sample. -The application outputs an styled image(s) (out(1).bmp) which were redrawn in style of model which used for infer. -Style of output images depend on models which use for sample. - -## See Also +## See Also * [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md) - +* [Model Optimizer](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md) +* [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) diff --git a/inference-engine/samples/style_transfer_sample/main.cpp b/inference-engine/samples/style_transfer_sample/main.cpp index 4096335..9e943e3 100644 --- a/inference-engine/samples/style_transfer_sample/main.cpp +++ b/inference-engine/samples/style_transfer_sample/main.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -67,7 +67,7 @@ int main(int argc, char *argv[]) { // --------------------------- 1. Load Plugin for inference engine ------------------------------------- slog::info << "Loading plugin" << slog::endl; - InferencePlugin plugin = PluginDispatcher({FLAGS_pp, "../../../lib/intel64", ""}).getPluginByDevice(FLAGS_d); + InferencePlugin plugin = PluginDispatcher({FLAGS_pp}).getPluginByDevice(FLAGS_d); /** Printing plugin version **/ printPluginVersion(plugin, std::cout); @@ -213,7 +213,7 @@ int main(int argc, char *argv[]) { double total = 0.0; /** Start inference & calc performance **/ - for (int iter = 0; iter < FLAGS_ni; ++iter) { + for (size_t iter = 0; iter < FLAGS_ni; ++iter) { auto t0 = Time::now(); infer_request.Infer(); auto t1 = Time::now(); @@ -274,7 +274,10 @@ int main(int argc, char *argv[]) { if (!outFile.is_open()) { throw new std::runtime_error("Cannot create " + out_img_name); } - std::vector data_img2(data_img.begin(), data_img.end()); + std::vector data_img2; + for (float i : data_img) { + data_img2.push_back(static_cast(i)); + } writeOutputBmp(data_img2.data(), H, W, outFile); outFile.close(); slog::info << "Image " << out_img_name << " created!"
<< slog::endl; diff --git a/inference-engine/samples/style_transfer_sample/style_transfer_sample.h b/inference-engine/samples/style_transfer_sample/style_transfer_sample.h index 4377f39..9af35b1 100644 --- a/inference-engine/samples/style_transfer_sample/style_transfer_sample.h +++ b/inference-engine/samples/style_transfer_sample/style_transfer_sample.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -22,7 +22,7 @@ static const char help_message[] = "Print a usage message."; static const char image_message[] = "Required. Path to an .bmp image."; /// @brief message for plugin_path argument -static const char plugin_path_message[] = "Path to a plugin folder."; +static const char plugin_path_message[] = "Optional. Path to a plugin folder."; /// @brief message for model argument static const char model_message[] = "Required. Path to an .xml file with a trained model.";\ @@ -32,22 +32,22 @@ static const char plugin_message[] = "Plugin name. For example MKLDNNPlugin. If "the sample will look for this plugin only"; /// @brief message for assigning cnn calculation to device -static const char target_device_message[] = "Specify the target device to infer on; CPU, GPU, FPGA or MYRIAD is acceptable. " \ +static const char target_device_message[] = "Optional. Specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD is acceptable. " \ "Sample will look for a suitable plugin for device specified"; /// @brief message for performance counters -static const char performance_counter_message[] = "Enables per-layer performance report"; +static const char performance_counter_message[] = "Optional. Enables per-layer performance report"; /// @brief message for iterations count -static const char iterations_count_message[] = "Number of iterations (default 1)"; +static const char iterations_count_message[] = "Optional. Number of iterations. Default value is 1"; /// @brief message for user library argument -static const char custom_cpu_library_message[] = "Required for MKLDNN (CPU)-targeted custom layers." \ - "Absolute path to a shared library with the kernels impl."; +static const char custom_cpu_library_message[] = "Optional. Required for CPU custom layers. " \ + "Absolute path to a shared library with the kernels implementations."; /// @brief message for clDNN custom kernels desc -static const char custom_cldnn_message[] = "Required for clDNN (GPU)-targeted custom kernels."\ - "Absolute path to the xml file with the kernels desc."; +static const char custom_cldnn_message[] = "Optional. Required for GPU custom kernels. "\ + "Absolute path to the xml file with the kernels descriptions."; /// @brief message for mean values arguments static const char preprocess_data_message[] = "Mean values. Required if the model needs mean values for preprocessing and postprocessing"; @@ -76,7 +76,7 @@ DEFINE_string(d, "CPU", target_device_message); DEFINE_bool(pc, false, performance_counter_message); /// @brief Iterations count (default 1) -DEFINE_int32(ni, 1, iterations_count_message); +DEFINE_uint32(ni, 1, iterations_count_message); /// @brief Absolute path to CPU library with user layers
/// It is a required parameter diff --git a/inference-engine/samples/validation_app/CMakeLists.txt b/inference-engine/samples/validation_app/CMakeLists.txt index 898256e..87b337c 100644 --- a/inference-engine/samples/validation_app/CMakeLists.txt +++ b/inference-engine/samples/validation_app/CMakeLists.txt @@ -1,9 +1,7 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required(VERSION 2.8) - set (TARGET_NAME "validation_app") file (GLOB MAIN_SRC @@ -22,7 +20,7 @@ source_group("src" FILES ${MAIN_SRC}) source_group("include" FILES ${MAIN_HEADERS}) # Find OpenCV components if exist -find_package(OpenCV COMPONENTS imgcodecs QUIET) +find_package(OpenCV COMPONENTS imgcodecs imgproc QUIET) if(NOT(OpenCV_FOUND)) message(WARNING "OPENCV is disabled or not found, " ${TARGET_NAME} " skipped") return() diff --git a/inference-engine/samples/validation_app/ClassificationProcessor.cpp b/inference-engine/samples/validation_app/ClassificationProcessor.cpp index 9c52c1e..7db4b32 100644 --- a/inference-engine/samples/validation_app/ClassificationProcessor.cpp +++ b/inference-engine/samples/validation_app/ClassificationProcessor.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -33,11 +33,12 @@ ClassificationProcessor::ClassificationProcessor(const std::string& flags_m, con std::shared_ptr ClassificationProcessor::Process(bool stream_output) { slog::info << "Collecting labels" << slog::endl; ClassificationSetGenerator generator; - // try { - // generator.readLabels(labelFileName); - // } catch (InferenceEngine::details::InferenceEngineException& ex) { - // slog::warn << "Can't read labels file " << labelFileName << slog::endl; - // } + try { + generator.readLabels(labelFileName); + } catch (InferenceEngine::details::InferenceEngineException& ex) { + slog::warn << "Can't read labels file " << labelFileName << slog::endl; + slog::warn << "Error: " << ex.what() << slog::endl; + } auto validationMap = generator.getValidationMap(imagesPath); ImageDecoder decoder; @@ -59,7 +60,7 @@ std::shared_ptr ClassificationProcessor::Process(bo auto iter = validationMap.begin(); while (iter != validationMap.end()) { - int b = 0; + size_t b = 0; int filesWatched = 0; for (; b < batch && iter != validationMap.end(); b++, iter++, filesWatched++) { expected[b] = iter->first; @@ -68,6 +69,7 @@ std::shared_ptr ClassificationProcessor::Process(bo files[b] = iter->second; } catch (const InferenceEngineException& iex) { slog::warn << "Can't read file " << iter->second << slog::endl; + slog::warn << "Error: " << iex.what() << slog::endl; // Could be some non-image file in directory b--; continue; @@ -80,16 +82,16 @@ std::shared_ptr ClassificationProcessor::Process(bo auto firstOutputData = firstOutputBlob->buffer().as::value_type*>(); InferenceEngine::TopResults(TOP_COUNT, *firstOutputBlob, results); - for (int i = 0; i < b; i++) { + for (size_t i = 0; i < b; i++) { int expc = expected[i]; if (zeroBackground) expc++; - bool top1Scored = (results[0 + TOP_COUNT * i] == expc); + bool top1Scored = (static_cast(results[0 + TOP_COUNT * i]) == expc); dumper << "\"" + files[i] + "\"" << top1Scored; if (top1Scored) im.top1Result++; for (int j = 0; j < TOP_COUNT; j++) { unsigned classId = results[j + TOP_COUNT * i]; - if (classId == expc) { + if (static_cast(classId) == expc) { im.topCountResult++; } dumper << classId << firstOutputData[classId + i * 
(firstOutputBlob->size() / batch)]; diff --git a/inference-engine/samples/validation_app/ClassificationProcessor.hpp b/inference-engine/samples/validation_app/ClassificationProcessor.hpp index 1813ac3..e7a6c94 100644 --- a/inference-engine/samples/validation_app/ClassificationProcessor.hpp +++ b/inference-engine/samples/validation_app/ClassificationProcessor.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/samples/validation_app/ObjectDetectionProcessor.cpp b/inference-engine/samples/validation_app/ObjectDetectionProcessor.cpp index 6109a96..8e3a23e 100644 --- a/inference-engine/samples/validation_app/ObjectDetectionProcessor.cpp +++ b/inference-engine/samples/validation_app/ObjectDetectionProcessor.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -24,7 +24,7 @@ ObjectDetectionProcessor::ObjectDetectionProcessor(const std::string& flags_m, c double threshold, InferenceEngine::InferencePlugin plugin, CsvDumper& dumper, const std::string& flags_a, const std::string& classes_list_file, PreprocessingOptions preprocessingOptions, bool scaleProposalToInputSize) : Processor(flags_m, flags_d, flags_i, flags_b, plugin, dumper, "Object detection network", preprocessingOptions), - threshold(threshold), annotationsPath(flags_a), subdir(subdir), scaleProposalToInputSize(scaleProposalToInputSize) { + annotationsPath(flags_a), subdir(subdir), threshold(threshold), scaleProposalToInputSize(scaleProposalToInputSize) { std::ifstream clf(classes_list_file); if (!clf) { throw UserException(1) << "Classes list file \"" << classes_list_file << "\" not found or inaccessible"; @@ -65,19 +65,15 @@ shared_ptr ObjectDetectionProcessor::Process(bool s for (auto& ann : annCollector.annotations()) { std::list dobList; for (auto& obj : ann.objects) { - DetectedObject dob(classes[obj.name], obj.bndbox.xmin, obj.bndbox.ymin, obj.bndbox.xmax, obj.bndbox.ymax, 1.0, obj.difficult != 0); + DetectedObject dob(classes[obj.name], static_cast(obj.bndbox.xmin), + static_cast(obj.bndbox.ymin), static_cast(obj.bndbox.xmax), + static_cast(obj.bndbox.ymax), 1.0f, obj.difficult != 0); dobList.push_back(dob); } ImageDescription id(dobList); desiredForFiles.insert(std::pair(ann.folder + "/" + (!subdir.empty() ? subdir + "/" : "") + ann.filename, id)); } - - ImageDecoder decoder; - - const int maxProposalCount = outputDims[1]; - const int objectSize = outputDims[0]; - for (auto & item : outInfo) { DataPtr outputData = item.second; if (!outputData) { @@ -104,18 +100,17 @@ shared_ptr ObjectDetectionProcessor::Process(bool s while (iter != annCollector.annotations().end()) { std::vector files; - int b = 0; + size_t b = 0; int filesWatched = 0; for (; b < batch && iter != annCollector.annotations().end(); b++, iter++, filesWatched++) { expected[b] = *iter; string filename = iter->folder + "/" + (!subdir.empty() ? 
subdir + "/" : "") + iter->filename; try { - Size orig_size = decoder.insertIntoBlob(std::string(imagesPath) + "/" + filename, b, *firstInputBlob, preprocessingOptions); float scale_x, scale_y; - scale_x = 1.0 / iter->size.width; // orig_size.width; - scale_y = 1.0 / iter->size.height; // orig_size.height; + scale_x = 1.0f / iter->size.width; // orig_size.width; + scale_y = 1.0f / iter->size.height; // orig_size.height; if (scaleProposalToInputSize) { scale_x *= firstInputBlob->dims()[0]; @@ -128,6 +123,7 @@ shared_ptr ObjectDetectionProcessor::Process(bool s files.push_back(filename); } catch (const InferenceEngineException& iex) { slog::warn << "Can't read file " << this->imagesPath + "/" + filename << slog::endl; + slog::warn << "Error: " << iex.what() << slog::endl; // Could be some non-image file in directory b--; continue; @@ -135,9 +131,6 @@ shared_ptr ObjectDetectionProcessor::Process(bool s } if (files.size() == batch) { - InferenceEngine::StatusCode sts; - InferenceEngine::ResponseDesc dsc; - // Infer model Infer(progress, filesWatched, im); @@ -146,7 +139,7 @@ shared_ptr ObjectDetectionProcessor::Process(bool s // Calculating similarity // - for (int b = 0; b < files.size(); b++) { + for (size_t b = 0; b < files.size(); b++) { ImageDescription result(detectedObjects[files[b]]); im.apc.consumeImage(result, scaledDesiredForFiles.at(files[b])); } diff --git a/inference-engine/samples/validation_app/ObjectDetectionProcessor.hpp b/inference-engine/samples/validation_app/ObjectDetectionProcessor.hpp index 0bb2231..7a27710 100644 --- a/inference-engine/samples/validation_app/ObjectDetectionProcessor.hpp +++ b/inference-engine/samples/validation_app/ObjectDetectionProcessor.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/samples/validation_app/PreprocessingOptions.hpp b/inference-engine/samples/validation_app/PreprocessingOptions.hpp index 0089308..3e5da5e 100644 --- a/inference-engine/samples/validation_app/PreprocessingOptions.hpp +++ b/inference-engine/samples/validation_app/PreprocessingOptions.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/samples/validation_app/Processor.cpp b/inference-engine/samples/validation_app/Processor.cpp index d352331..cf8e73b 100644 --- a/inference-engine/samples/validation_app/Processor.cpp +++ b/inference-engine/samples/validation_app/Processor.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -14,8 +14,8 @@ using namespace InferenceEngine; Processor::Processor(const std::string& flags_m, const std::string& flags_d, const std::string& flags_i, int flags_b, InferencePlugin plugin, CsvDumper& dumper, const std::string& approach, PreprocessingOptions preprocessingOptions) - : targetDevice(flags_d), modelFileName(flags_m), imagesPath(flags_i), batch(flags_b), - plugin(plugin), dumper(dumper), approach(approach), preprocessingOptions(preprocessingOptions) { + : modelFileName(flags_m), targetDevice(flags_d), imagesPath(flags_i), batch(flags_b), + preprocessingOptions(preprocessingOptions), dumper(dumper), plugin(plugin), approach(approach) { // --------------------Load network (Generated xml/bin files)------------------------------------------- slog::info << "Loading network files" << slog::endl; 
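The constructor change just above reorders the initializer list to match the order in which the members are declared. As background for the reordering (this sketch is editorial and not part of the patch): C++ initializes non-static members in declaration order, regardless of initializer-list order, so a mismatched list draws a `-Wreorder` warning and can silently read a member before it is initialized.

```cpp
#include <string>

struct Example {
    std::string modelFileName;  // declared first, so initialized first
    std::string targetDevice;   // declared second, so initialized second

    // Writing ": targetDevice(d), modelFileName(m)" here would not change
    // the actual initialization order; it would only mislead the reader
    // and trigger -Wreorder. Matching declaration order avoids both.
    Example(const std::string& m, const std::string& d)
        : modelFileName(m), targetDevice(d) {}
};
```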
diff --git a/inference-engine/samples/validation_app/Processor.hpp b/inference-engine/samples/validation_app/Processor.hpp index 49d5263..22ce3b6 100644 --- a/inference-engine/samples/validation_app/Processor.hpp +++ b/inference-engine/samples/validation_app/Processor.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -13,9 +13,9 @@ #include "inference_engine.hpp" -#include "csv_dumper.hpp" +#include "samples/csv_dumper.hpp" #include "image_decoder.hpp" -#include "console_progress.hpp" +#include "samples/console_progress.hpp" using namespace std; @@ -36,7 +36,7 @@ protected: std::string modelFileName; std::string targetDevice; std::string imagesPath; - int batch; + size_t batch; InferenceEngine::InferRequest inferRequest; InferenceEngine::InputsDataMap inputInfo; InferenceEngine::OutputsDataMap outInfo; diff --git a/inference-engine/samples/validation_app/README.md b/inference-engine/samples/validation_app/README.md index 4c8af47..11c9ac7 100644 --- a/inference-engine/samples/validation_app/README.md +++ b/inference-engine/samples/validation_app/README.md @@ -15,6 +15,8 @@ Possible use cases of the tool: * Use Validation Application as another sample: although the code is much more complex than in classification and object detection samples, the source code is open and can be re-used. +> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Specify Input Shapes** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md). + ## Validation Application Options The Validation Application provides the following command-line interface (CLI): @@ -31,8 +33,8 @@ Available options: -m Required. Path to an .xml file with a trained model -lbl Labels file path. The labels file contains names of the dataset classes -l Required for CPU custom layers. Absolute path to a shared library with the kernel implementations - -c Required for GPU custom kernels.Absolute path to an .xml file with the kernel descriptions. - -d Target device to infer on: CPU (default), GPU, FPGA, or MYRIAD. The application looks for a suitable plugin for the specified device. + -c Required for GPU custom kernels. Absolute path to an .xml file with the kernel descriptions. + -d Target device to infer on: CPU (default), GPU, FPGA, HDDL or MYRIAD. The application looks for a suitable plugin for the specified device. -b N Batch size value. If not specified, the batch size value is taken from IR -ppType Preprocessing type. Options: "None", "Resize", "ResizeCrop" -ppSize N Preprocessing size (used with ppType="ResizeCrop") @@ -57,6 +59,8 @@ The tool options are divided into two categories: ## General Workflow +> **NOTE**: By default, Inference Engine samples expect input images to have BGR channels order. If you trained your model to work with images in RGB order, you need to manually rearrange the default channels order in the sample application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified.
For more information about the argument, refer to [When to Specify Input Shapes](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md#when_to_reverse_input_channels). + When executed, the Validation Application perform the following steps: 1. Loads a model to an Inference Engine plugin @@ -64,7 +68,6 @@ When executed, the Validation Application perform the following steps: - if you specified a directory, the application tries to load labels first. To do this, it searches for the file with the same name as a model, but with `.labels` extension (instead of `.xml`). Then it searches for the specified folder, detects its sub-folders named as known labels, and adds all images from these sub-folders to the validation set. When there are no such sub-folders, validation set is considered empty. - - if you specified a `.txt` file, the application reads this file expecting every line to be in the correct format. For more information about the format, refer to the Preparing the Dataset section below. @@ -195,6 +198,8 @@ Save this file as `VOC_SSD_Classes.txt`. ## Validate Classification Models +> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). + Once you have prepared the dataset (refer to the Preparing the Dataset section above), run the following command to infer a classification model on the selected dataset: ```bash @@ -206,6 +211,8 @@ run the following command to infer a classification model on the selected datase > **NOTE**: Validation Application was validated with SSD CNN. Any network that can be inferred by the Inference Engine > and has the same input and output format as one of these should be supported as well. +> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). 
+ Once you have prepared the dataset (refer to the Preparing the Dataset section above), run the following command to infer an Object Detection model on the selected dataset: ```bash diff --git a/inference-engine/samples/validation_app/SSDObjectDetectionProcessor.hpp b/inference-engine/samples/validation_app/SSDObjectDetectionProcessor.hpp index 52f3f6b..a8dc30e 100644 --- a/inference-engine/samples/validation_app/SSDObjectDetectionProcessor.hpp +++ b/inference-engine/samples/validation_app/SSDObjectDetectionProcessor.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -26,16 +26,16 @@ protected: const auto detectionOutArray = inferRequest.GetBlob(firstOutputName); const float *box = detectionOutArray->buffer().as(); - const int maxProposalCount = outputDims[1]; - const int objectSize = outputDims[0]; + const size_t maxProposalCount = outputDims[1]; + const size_t objectSize = outputDims[0]; - for (int b = 0; b < batch; b++) { + for (size_t b = 0; b < batch; b++) { string fn = files[b]; std::list dr = std::list(); detectedObjects.insert(std::pair>(fn, dr)); } - for (int i = 0; i < maxProposalCount; i++) { + for (size_t i = 0; i < maxProposalCount; i++) { float image_id = box[i * objectSize + 0]; float label = box[i * objectSize + 1]; float confidence = box[i * objectSize + 2]; @@ -48,7 +48,8 @@ protected: break; // Finish } - detectedObjects[files[image_id]].push_back(DetectedObject(label, xmin, ymin, xmax, ymax, confidence)); + detectedObjects[files[static_cast(image_id)]].push_back( + DetectedObject(static_cast(label), xmin, ymin, xmax, ymax, confidence)); } return detectedObjects; diff --git a/inference-engine/samples/validation_app/VOCAnnotationParser.cpp b/inference-engine/samples/validation_app/VOCAnnotationParser.cpp index 94693db..68e2656 100644 --- a/inference-engine/samples/validation_app/VOCAnnotationParser.cpp +++ b/inference-engine/samples/validation_app/VOCAnnotationParser.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/samples/validation_app/VOCAnnotationParser.hpp b/inference-engine/samples/validation_app/VOCAnnotationParser.hpp index b23363a..a9d2d89 100644 --- a/inference-engine/samples/validation_app/VOCAnnotationParser.hpp +++ b/inference-engine/samples/validation_app/VOCAnnotationParser.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/samples/validation_app/YOLOObjectDetectionProcessor.hpp b/inference-engine/samples/validation_app/YOLOObjectDetectionProcessor.hpp index fe9dad9..816f969 100644 --- a/inference-engine/samples/validation_app/YOLOObjectDetectionProcessor.hpp +++ b/inference-engine/samples/validation_app/YOLOObjectDetectionProcessor.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -46,10 +46,6 @@ private: int row = grid / S; int col = grid % S; for (int b = 0; b < B; b++) { - int index = grid * B + b; - int p_index = SS * C + grid * B + b; - float scale = net_out[p_index]; - int box_index = SS * (C + B) + (grid * B + b) * 4; int objectType = class_num; float conf = confs[(grid * B + b)]; @@ -57,7 +53,6 @@ private: float yc = (cords[(grid * B + b) * 4 + 1] + row) / S; float w = pow(cords[(grid * B + b) * 
4 + 2], 2); float h = pow(cords[(grid * B + b) * 4 + 3], 2); - int class_index = grid * C; float prob = probs[grid * C + class_num] * conf; DetectedObject bx(objectType, xc - w / 2, yc - h / 2, xc + w / 2, @@ -77,12 +72,12 @@ private: // Filtering out overlapping boxes std::vector overlapped(boxes.size(), false); - for (int i = 0; i < boxes.size(); i++) { + for (size_t i = 0; i < boxes.size(); i++) { if (overlapped[i]) continue; DetectedObject box_i = boxes[i]; - for (int j = i + 1; j < boxes.size(); j++) { + for (size_t j = i + 1; j < boxes.size(); j++) { DetectedObject box_j = boxes[j]; if (DetectedObject::ioU(box_i, box_j) >= 0.4) { overlapped[j] = true; @@ -90,7 +85,7 @@ private: } } - for (int i = 0; i < boxes.size(); i++) { + for (size_t i = 0; i < boxes.size(); i++) { if (boxes[i].prob > 0.0f) { boxes_result.push_back(boxes[i]); } diff --git a/inference-engine/samples/validation_app/classification_set_generator.cpp b/inference-engine/samples/validation_app/classification_set_generator.cpp index 2ff731d..051474e 100644 --- a/inference-engine/samples/validation_app/classification_set_generator.cpp +++ b/inference-engine/samples/validation_app/classification_set_generator.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -112,7 +112,8 @@ std::vector> ClassificationSetGenerator::validationM try { classId = std::stoi(line.substr(pos + 1)); } catch (const std::invalid_argument& e) { - THROW_USER_EXCEPTION(1) << "Invalid class id specified at line " << lineNumber << ":\n> " << line; + THROW_USER_EXCEPTION(1) << "Invalid class id specified at line " << lineNumber << ":\n> " << line + << " Error: " << e.what(); } imgPath = line.substr(0, pos); validationMap.push_back({ classId, dir + imgPath }); diff --git a/inference-engine/samples/validation_app/classification_set_generator.hpp b/inference-engine/samples/validation_app/classification_set_generator.hpp index 252717e..764364a 100644 --- a/inference-engine/samples/validation_app/classification_set_generator.hpp +++ b/inference-engine/samples/validation_app/classification_set_generator.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/samples/validation_app/image_decoder.cpp b/inference-engine/samples/validation_app/image_decoder.cpp index 7ca0894..b977b63 100644 --- a/inference-engine/samples/validation_app/image_decoder.cpp +++ b/inference-engine/samples/validation_app/image_decoder.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -40,7 +40,7 @@ cv::Size addToBlob(std::string name, int batch_pos, Blob& blob, PreprocessingOpt // TODO This is a dirty hack to support VOC2007 (where no file extension is put into annotation). // Rewrite. - if (name.find('.') == -1) tryName = name + ".JPEG"; + if (name.find('.') == std::string::npos) tryName = name + ".JPEG"; orig_image = imread(tryName, loadMode); @@ -70,7 +70,7 @@ cv::Size addToBlob(std::string name, int batch_pos, Blob& blob, PreprocessingOpt THROW_IE_EXCEPTION << "Unsupported ResizeCropPolicy value"; } - float scaleFactor = preprocessingOptions.scaleValuesTo01 ? 255.0 : 1.0; + float scaleFactor = preprocessingOptions.scaleValuesTo01 ? 
255.0f : 1.0f; for (int c = 0; c < channels; c++) { for (int h = 0; h < height; h++) { @@ -106,7 +106,7 @@ std::map convertToBlob(std::vector names, in } std::map res; - for (int b = 0; b < names.size(); b++) { + for (size_t b = 0; b < names.size(); b++) { std::string name = names[b]; Size orig_size = add_func(name, batch_pos + b, blob, preprocessingOptions); res.insert(std::pair(name, orig_size)); diff --git a/inference-engine/samples/validation_app/image_decoder.hpp b/inference-engine/samples/validation_app/image_decoder.hpp index 35cca5a..922956e 100644 --- a/inference-engine/samples/validation_app/image_decoder.hpp +++ b/inference-engine/samples/validation_app/image_decoder.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/samples/validation_app/main.cpp b/inference-engine/samples/validation_app/main.cpp index a2c9446..23137de 100644 --- a/inference-engine/samples/validation_app/main.cpp +++ b/inference-engine/samples/validation_app/main.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -35,8 +35,6 @@ using namespace InferenceEngine; using InferenceEngine::details::InferenceEngineException; -#define DEFAULT_PATH_P "./lib" - /// @brief Message for help argument static const char help_message[] = "Print a help message"; /// @brief Message for images argument @@ -53,7 +51,7 @@ static const char model_message[] = "Required. Path to an .xml file with a train static const char plugin_message[] = "Plugin name. For example, CPU. If this parameter is passed, " "the sample looks for a specified plugin only."; /// @brief Message for assigning cnn calculation to device -static const char target_device_message[] = "Target device to infer on: CPU (default), GPU, FPGA, or MYRIAD." +static const char target_device_message[] = "Target device to infer on: CPU (default), GPU, FPGA, HDDL or MYRIAD." " The application looks for a suitable plugin for the specified device."; /// @brief Message for label argument static const char label_message[] = "Path to a file with labels for a model"; @@ -123,7 +121,7 @@ DEFINE_string(p, "", plugin_message); DEFINE_string(OCl, "", label_message); /// @brief Define parameter for a path to plugins
/// Default is ./lib -DEFINE_string(pp, DEFAULT_PATH_P, plugin_path_message); +DEFINE_string(pp, "", plugin_path_message); /// @brief Define parameter for a target device to infer on
DEFINE_string(d, "CPU", target_device_message); /// @brief Define parameter for batch size
@@ -267,7 +265,7 @@ int main(int argc, char *argv[]) { // ---------------------Loading plugin for Inference Engine------------------------------------------------ slog::info << "Loading plugin" << slog::endl; /** Loading the library with extensions if provided**/ - InferencePlugin plugin = PluginDispatcher({ FLAGS_pp, "../../../lib/intel64", "" }).getPluginByDevice(FLAGS_d); + InferencePlugin plugin = PluginDispatcher({ FLAGS_pp }).getPluginByDevice(FLAGS_d); /** Loading default extensions **/ if (FLAGS_d.find("CPU") != std::string::npos) { @@ -358,7 +356,6 @@ int main(int argc, char *argv[]) { showUsage(); return ex.list().begin()->exitCode(); } else { - const char* s = ex.what(); slog::err << "Input problems: \n" << ex.what() << slog::endl; showUsage(); return ex.list().begin()->exitCode(); diff --git a/inference-engine/samples/validation_app/pugixml/pugiconfig.hpp b/inference-engine/samples/validation_app/pugixml/pugiconfig.hpp index 0e976cf..085d6c6 100644 --- a/inference-engine/samples/validation_app/pugixml/pugiconfig.hpp +++ b/inference-engine/samples/validation_app/pugixml/pugiconfig.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/samples/validation_app/pugixml/pugixml.cpp b/inference-engine/samples/validation_app/pugixml/pugixml.cpp index d4db9c4..aa18656 100644 --- a/inference-engine/samples/validation_app/pugixml/pugixml.cpp +++ b/inference-engine/samples/validation_app/pugixml/pugixml.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/samples/validation_app/pugixml/pugixml.hpp b/inference-engine/samples/validation_app/pugixml/pugixml.hpp index 9f609d1..fd3067f 100644 --- a/inference-engine/samples/validation_app/pugixml/pugixml.hpp +++ b/inference-engine/samples/validation_app/pugixml/pugixml.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/samples/validation_app/user_exception.hpp b/inference-engine/samples/validation_app/user_exception.hpp index bdeda3c..dd3f43d 100644 --- a/inference-engine/samples/validation_app/user_exception.hpp +++ b/inference-engine/samples/validation_app/user_exception.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -85,7 +85,7 @@ public: ss << _list.back().what(); } else { auto iter = _list.begin(); - for (int i = 0; i < _list.size() - 1; i++) { + for (size_t i = 0; i < _list.size() - 1; i++) { ss << "\t* " << (*iter++).what() << std::endl; } ss << "\t* " << _list.back().what(); diff --git a/inference-engine/src/CMakeLists.txt b/inference-engine/src/CMakeLists.txt index cabd78b..aad2b5b 100644 --- a/inference-engine/src/CMakeLists.txt +++ b/inference-engine/src/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # @@ -35,3 +35,6 @@ endfunction() add_subdirectory(extension EXCLUDE_FROM_ALL) add_library(IE::ie_cpu_extension ALIAS ie_cpu_extension) + +file(GLOB_RECURSE EXTENSION_SOURCES extension/*.cpp extension/*.hpp extension/*.h) +add_cpplint_target(ie_cpu_extension_cpplint FOR_SOURCES ${EXTENSION_SOURCES}) diff --git a/inference-engine/src/cldnn_engine/CMakeLists.txt 
b/inference-engine/src/cldnn_engine/CMakeLists.txt index 372bae8..a2d81c3 100644 --- a/inference-engine/src/cldnn_engine/CMakeLists.txt +++ b/inference-engine/src/cldnn_engine/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # @@ -67,10 +67,12 @@ set(CLDNN_LIBRARY clDNN_shlib) add_library(${TARGET_NAME} SHARED ${MAIN_SRC} ${LIBRARY_HEADERS}) -target_link_libraries(${TARGET_NAME} pugixml ${INTEL_ITT_LIBS} inference_engine ${CLDNN_LIBRARY}) +target_link_libraries(${TARGET_NAME} ${INTEL_ITT_LIBS} inference_engine ${CLDNN_LIBRARY}) set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}) #copy default global xml file describing the custom kernels and the *.cl files add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND "${CMAKE_COMMAND}" -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/cldnn_global_custom_kernels $/cldnn_global_custom_kernels) + +add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME}) diff --git a/inference-engine/src/cldnn_engine/cldnn_custom_layer.cpp b/inference-engine/src/cldnn_engine/cldnn_custom_layer.cpp index a247d64..32fb414 100644 --- a/inference-engine/src/cldnn_engine/cldnn_custom_layer.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_custom_layer.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -9,6 +9,11 @@ #include #include #include + +#ifdef _WIN32 +# include +#endif + #include "simple_math.h" using namespace InferenceEngine; diff --git a/inference-engine/src/cldnn_engine/cldnn_custom_layer.h b/inference-engine/src/cldnn_engine/cldnn_custom_layer.h index 89a802f..e948f29 100644 --- a/inference-engine/src/cldnn_engine/cldnn_custom_layer.h +++ b/inference-engine/src/cldnn_engine/cldnn_custom_layer.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -57,8 +57,8 @@ public: const int InputDimSourceIndex() { return m_wgDimInputIdx; } protected: - CLDNNCustomLayer() {} - explicit CLDNNCustomLayer(const std::string dirname) : m_configDir(dirname) {} + CLDNNCustomLayer() : m_wgDimInputIdx(0) {} + explicit CLDNNCustomLayer(const std::string dirname) : m_configDir(dirname), m_wgDimInputIdx(0) {} bool Error() const { return m_ErrorMessage.length() > 0; } void LoadSingleLayer(const pugi::xml_node& node); diff --git a/inference-engine/src/cldnn_engine/cldnn_engine.cpp b/inference-engine/src/cldnn_engine/cldnn_engine.cpp index 4b79fe6..fab02d3 100644 --- a/inference-engine/src/cldnn_engine/cldnn_engine.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_engine.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -116,12 +116,8 @@ ExecutableNetworkInternal::Ptr clDNNEngine::LoadExeNetworkImpl(InferenceEngine:: INFERENCE_PLUGIN_API(StatusCode) CreatePluginEngine(IInferencePlugin *&plugin, ResponseDesc *resp) noexcept { try { plugin = make_ie_compatible_plugin( - {1, 5, -#ifdef CLDNN_VERSION - CLDNN_VERSION, -#else + {1, 6, CI_BUILD_NUMBER, -#endif "clDNNPlugin"}, std::make_shared()); return OK; } diff --git a/inference-engine/src/cldnn_engine/cldnn_engine.h b/inference-engine/src/cldnn_engine/cldnn_engine.h index 6de94cf..6241a94 100644 --- a/inference-engine/src/cldnn_engine/cldnn_engine.h +++ 
b/inference-engine/src/cldnn_engine/cldnn_engine.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/cldnn_engine/cldnn_global_custom_kernels/ctc_greedy_decoder.cl b/inference-engine/src/cldnn_engine/cldnn_global_custom_kernels/ctc_greedy_decoder.cl index 40a7107..0467adc 100644 --- a/inference-engine/src/cldnn_engine/cldnn_global_custom_kernels/ctc_greedy_decoder.cl +++ b/inference-engine/src/cldnn_engine/cldnn_global_custom_kernels/ctc_greedy_decoder.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/inference-engine/src/cldnn_engine/cldnn_global_custom_kernels/grn.cl b/inference-engine/src/cldnn_engine/cldnn_global_custom_kernels/grn.cl index 554b8b6..1f37043 100644 --- a/inference-engine/src/cldnn_engine/cldnn_global_custom_kernels/grn.cl +++ b/inference-engine/src/cldnn_engine/cldnn_global_custom_kernels/grn.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/inference-engine/src/cldnn_engine/cldnn_global_custom_kernels/interp.cl b/inference-engine/src/cldnn_engine/cldnn_global_custom_kernels/interp.cl index ef41d13..649667d 100644 --- a/inference-engine/src/cldnn_engine/cldnn_global_custom_kernels/interp.cl +++ b/inference-engine/src/cldnn_engine/cldnn_global_custom_kernels/interp.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/inference-engine/src/cldnn_engine/cldnn_global_custom_kernels/prior_box_clustered.cl b/inference-engine/src/cldnn_engine/cldnn_global_custom_kernels/prior_box_clustered.cl index a61f021..f1fe258 100644 --- a/inference-engine/src/cldnn_engine/cldnn_global_custom_kernels/prior_box_clustered.cl +++ b/inference-engine/src/cldnn_engine/cldnn_global_custom_kernels/prior_box_clustered.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
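A note on the CLDNNCustomLayer hunk above: m_wgDimInputIdx was previously left uninitialized, so InputDimSourceIndex() could return an indeterminate value for layers that never set it; the patch fixes this through the constructor initializer lists. A minimal sketch of the same fix written with a C++11 in-class initializer instead, which covers every constructor at once (the class below is illustrative, not the plugin's actual type):

#include <string>

// Sketch: one in-class initializer gives the member a deterministic
// default in every constructor, so no overload can forget to set it.
class CustomLayerSketch {
public:
    CustomLayerSketch() = default;
    explicit CustomLayerSketch(const std::string& dirname) : m_configDir(dirname) {}

    int InputDimSourceIndex() const { return m_wgDimInputIdx; }

private:
    std::string m_configDir;
    int m_wgDimInputIdx = 0;  // initialized here, for all constructors
};

int main() {
    // Both construction paths now report a well-defined index of 0.
    return CustomLayerSketch().InputDimSourceIndex() + CustomLayerSketch("layers").InputDimSourceIndex();
}

The observable behavior matches the patch either way; the in-class form simply cannot drift out of sync when another constructor is added later.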
diff --git a/inference-engine/src/cldnn_engine/cldnn_graph.cpp b/inference-engine/src/cldnn_engine/cldnn_graph.cpp index fe61da1..9f8f58b 100644 --- a/inference-engine/src/cldnn_engine/cldnn_graph.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_graph.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -42,6 +42,11 @@ #include #include #include +#include +#include +#include +#include +#include #include #include #include @@ -52,7 +57,6 @@ #include #include #include -#include #include "cldnn_infer_request.h" #include #include "details/caseless.hpp" @@ -99,9 +103,6 @@ static void ValidateLayer(const InferenceEngine::CNNLayerPtr& layer, unsigned in } static void ValidateEltwiseLayer(const InferenceEngine::CNNLayerPtr& layer) { - if (layer->insData.size() < 2) { - THROW_CLDNN_EXCEPTION("Invalid number of inputs for layer: " << layer->name << ". Eltwise layer should take at least 2 inputs"); - } if (layer->_fusedWith) { THROW_CLDNN_EXCEPTION("Unsupported fuse in layer: " << layer->name << " with: " << layer->_fusedWith->name); } @@ -287,7 +288,6 @@ bool CLDNNGraph::CanProcessDynBatch(InferenceEngine::ICNNNetwork &network) const CLDNNGraph::CLDNNGraph(InferenceEngine::ICNNNetwork& network, const Config& config, int max_batch) : m_config(config), m_defaultFormat(cldnn::format::bfyx), - m_networkPrecision(cldnn::data_types::f32), m_curBatch(-1) { m_env.engine = std::make_shared(cldnn::engine_configuration( (config.useProfiling || (config.tuningConfig.mode != cldnn::tuning_mode::tuning_disabled)), @@ -309,7 +309,21 @@ CLDNNGraph::CLDNNGraph(InferenceEngine::ICNNNetwork& network, const Config& conf _taskExecutor = executorManager->getExecutor(TargetDeviceInfo::name(TargetDevice::eGPU)); } - bool res = !NetPass::CombineLSTMSeq(network) ? NetPass::UnrollTI(network) : true; + bool res = !NetPass::CombineRNNSeq(network) ? NetPass::UnrollTI(network) : true; + res &= NetPass::UnrollRNN_if(network, [] (RNNCellBase rnn) -> bool { + if (rnn.clip != 0.0f) + return true; + if (rnn.type == "GRUCell" || + rnn.type == "GRUSequence" || + rnn.type == "RNNCell" || + rnn.type == "RNNSequence") + return true; + if (!(rnn.type == "LSTMCell" || rnn.type == "LSTMSequence") || + rnn.activations == std::vector{"sigmoid", "tanh", "tanh"}) + return false; + return true; + }); + if (!res) THROW_CLDNN_EXCEPTION("Plugin doesn't support Tensor Iterator in pure form. 
" "No one TI optimization pattern was not applied successfully"); @@ -372,6 +386,14 @@ CLDNNGraph::CLDNNGraph(InferenceEngine::ICNNNetwork& network, const Config& conf m_env.debugOptions.ClearTimedEvents(); } +inline std::string layer_type_name_ID(InferenceEngine::CNNLayer* layer) { + return layer->type + ":" + layer->name; +} + +inline std::string layer_type_name_ID(InferenceEngine::CNNLayerPtr layer) { + return layer_type_name_ID(layer.get()); +} + std::vector CLDNNGraph::GetNextLayers(const InferenceEngine::DataPtr data) { std::vector nextLayers; if (data == nullptr) { @@ -417,7 +439,6 @@ InferenceEngine::CNNLayerPtr CLDNNGraph::GetNextSingleLayer(const InferenceEngin void CLDNNGraph::InitFormat(InferenceEngine::ICNNNetwork &network) { m_defaultFormat = FormatFromLayout(InferenceEngine::Layout::NCHW); - m_networkPrecision = DataTypeFromPrecision(network.getPrecision()); } void CLDNNGraph::CompileNetwork() { @@ -451,29 +472,30 @@ void CLDNNGraph::Load(InferenceEngine::ICNNNetwork &network) { THROW_CLDNN_EXCEPTION("No inputs detected."); } + using LayerVect = std::vector; std::list layersToHandle; - for (auto input : networkInputs) { - IE_ASSERT(input.first.compare(input.second->name()) == 0); - AddInputPrimitive(input.second); - - auto consumers = input.second->getInputData()->getInputTo(); - // collect next layers to process - for (auto l : consumers) { - layersToHandle.push_back(l.second); + auto push_if = [&](const LayerVect& clist) { + for (auto& l : clist) { + if ( (std::find_if( layersToHandle.begin(), + layersToHandle.end(), + [&](const CNNLayerPtr& x) { return layer_type_name_ID(x) == layer_type_name_ID(l); } )) == layersToHandle.end() ) + layersToHandle.push_back(l); } - } + }; auto allInputs = CNNNetGetAllInputLayers(network); for (auto input : allInputs) { if (LayerTypeFromStr(input->type) == ConstantBlob) { AddConstantBlobInput(input); - - // collect next layers to process - for (auto nl : GetNextLayers(input)) { - layersToHandle.push_back(nl); + } else { + auto iter = networkInputs.find(input->name); // regular input + if (iter != networkInputs.end()) { + AddInputPrimitive(iter->second, input->precision); } } + // collect next layers to process + push_if(GetNextLayers(input)); } // 2. traverse layers @@ -485,7 +507,7 @@ void CLDNNGraph::Load(InferenceEngine::ICNNNetwork &network) { } InferenceEngine::CNNLayerPtr currLayer = layersToHandle.front(); layersToHandle.pop_front(); - auto layerName = currLayer->name; + auto layerName = layer_type_name_ID(currLayer); if (m_env.primitiveIDs.find(layerName) != m_env.primitiveIDs.end()) { infLoopProtection = 0; @@ -496,7 +518,7 @@ void CLDNNGraph::Load(InferenceEngine::ICNNNetwork &network) { try { GetPrevLayersPrimitives(currLayer); } catch (std::exception) { - missingInput = true; + missingInput = true; } if (missingInput) { // some inputs aren't created yet @@ -505,13 +527,10 @@ void CLDNNGraph::Load(InferenceEngine::ICNNNetwork &network) { } infLoopProtection = 0; // found a layer with all inputs already existing - IE_ASSERT(_networkPrecision == currLayer->precision); CreateSingleLayerPrimitive(currLayer); // currLayer will be advanced if layer was skipped or merged - m_env.prevPrimitiveIDs[currLayer->name] = GetPrevLayersPrimitives(currLayer); + m_env.prevPrimitiveIDs[layerName] = GetPrevLayersPrimitives(currLayer); - for (auto nl : GetNextLayers(currLayer)) { - layersToHandle.push_back(nl); - } + push_if(GetNextLayers(currLayer)); } // 3. 
Handle output reordering @@ -536,6 +555,8 @@ CLDNNGraph::LayerType CLDNNGraph::LayerTypeFromStr(const std::string &str) { { "TanH" , TanH }, { "ELU" , ELU }, { "Activation" , Activation }, + { "Exp" , Exp }, + { "Not" , Not }, { "Norm" , LRN }, { "Pooling" , Pooling }, { "FullyConnected" , FullyConnected }, @@ -573,7 +594,13 @@ CLDNNGraph::LayerType CLDNNGraph::LayerTypeFromStr(const std::string &str) { { "Tile" , Tile }, { "Pad" , Pad }, { "LSTMCell" , LSTMCell }, - { "RNN" , RNN }, + { "LSTMSequence" , RNN }, + { "RNNSequence" , RNN }, + { "Gather" , Gather }, + { "DepthToSpace" , DepthToSpace }, + { "ShuffleChannels" , ShuffleChannels }, + { "StridedSlice" , StridedSlice }, + { "ReverseSequence" , ReverseSequence } }; auto it = LayerNameToType.find(str); if (it != LayerNameToType.end()) @@ -604,6 +631,32 @@ cldnn::eltwise_mode CLDNNGraph::EltwiseModeFromIEEltwise(InferenceEngine::Eltwis return cldnn::eltwise_mode::prod; case InferenceEngine::EltwiseLayer::Max: return cldnn::eltwise_mode::max; + case InferenceEngine::EltwiseLayer::Sub: + return cldnn::eltwise_mode::sub; + case InferenceEngine::EltwiseLayer::Min: + return cldnn::eltwise_mode::min; + case InferenceEngine::EltwiseLayer::Div: + return cldnn::eltwise_mode::div; + case InferenceEngine::EltwiseLayer::Squared_diff: + return cldnn::eltwise_mode::squared_diff; + case InferenceEngine::EltwiseLayer::Equal: + return cldnn::eltwise_mode::eq; + case InferenceEngine::EltwiseLayer::Not_equal: + return cldnn::eltwise_mode::ne; + case InferenceEngine::EltwiseLayer::Less: + return cldnn::eltwise_mode::lt; + case InferenceEngine::EltwiseLayer::Less_equal: + return cldnn::eltwise_mode::le; + case InferenceEngine::EltwiseLayer::Greater: + return cldnn::eltwise_mode::gt; + case InferenceEngine::EltwiseLayer::Greater_equal: + return cldnn::eltwise_mode::ge; + case InferenceEngine::EltwiseLayer::Logical_AND: + return cldnn::eltwise_mode::logic_and; + case InferenceEngine::EltwiseLayer::Logical_OR: + return cldnn::eltwise_mode::logic_or; + case InferenceEngine::EltwiseLayer::Logical_XOR: + return cldnn::eltwise_mode::logic_xor; default: THROW_CLDNN_EXCEPTION("Unsupported eltwise operation: " << op); break; } @@ -647,6 +700,7 @@ void CLDNNGraph::CreatePrimitiveFromBlob(cldnn::primitive_id primID, } else if ((pBlob->layout() != InferenceEngine::OIHW) && (pBlob->layout() != InferenceEngine::NCHW) && (pBlob->layout() != InferenceEngine::CHW) && + (pBlob->layout() != InferenceEngine::NC) && (pBlob->layout() != InferenceEngine::C)) { // TODO: support more layouts THROW_CLDNN_EXCEPTION("Unsupported layout (" << DebugOptions::IELayoutToString(pBlob->layout()) << ") in blob: " << primID); @@ -712,13 +766,15 @@ void CLDNNGraph::CreateWeightAndBiasPrimitives(const InferenceEngine::CNNLayerPt switch (LayerTypeFromStr(layer->type)) { case Convolution: { auto convLayer = dynamic_cast (layer.get()); - groupSize = convLayer->_group; if ((inFeatures % groupSize) || (convLayer->_out_depth % groupSize)) { THROW_CLDNN_EXCEPTION("Invalid group size in layer " << convLayer->name); } + groupSize = convLayer->_group; + if (groupSize >= 16) // cldnn optimization for 16 or more groups + groupSize = 1; weightDimsVec = { TensorValue(convLayer->_out_depth / groupSize), - TensorValue(inFeatures / groupSize), + TensorValue(inFeatures / convLayer->_group), TensorValue(convLayer->_kernel[X_AXIS]), TensorValue(convLayer->_kernel[Y_AXIS]) };
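The convolution weights hunk above is easy to misread: clDNN receives one weight primitive per group, except that with 16 or more groups it takes the weights as a single primitive, so the output-feature dimension is divided by the effective group count while the input-feature dimension is always divided by the real one. A small sketch of that dimension arithmetic, with plain ints in place of TensorValue and cldnn::tensor (the helper below is illustrative, not plugin code):

#include <cassert>
#include <cstdio>

// Per-primitive weight shape for a grouped convolution, following the
// logic above: >= 16 groups collapse to a single primitive (groupSize 1).
struct WeightDims { int out, in, kx, ky; };

static WeightDims groupedWeightDims(int outDepth, int inFeatures, int group, int kx, int ky) {
    assert(group > 0 && outDepth % group == 0 && inFeatures % group == 0);
    const int groupSize = (group >= 16) ? 1 : group;  // cldnn optimization for 16 or more groups
    return { outDepth / groupSize,    // output features per weight primitive
             inFeatures / group,      // input features are always per-group
             kx, ky };
}

int main() {
    // A 32-group depthwise-style convolution: one primitive, 32 x 1 x 3 x 3 weights.
    const WeightDims d = groupedWeightDims(32, 32, 32, 3, 3);
    std::printf("%d x %d x %d x %d\n", d.out, d.in, d.kx, d.ky);
}

@@ -729,13 +785,15 @@ void CLDNNGraph::CreateWeightAndBiasPrimitives(const InferenceEngine::CNNLayerPt break; case Deconvolution: { auto 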
deconvLayer = dynamic_cast (layer.get()); - groupSize = deconvLayer->_group; if ((inFeatures % groupSize) || (deconvLayer->_out_depth % groupSize)) { THROW_CLDNN_EXCEPTION("Invalid group size in layer " << deconvLayer->name); } + groupSize = deconvLayer->_group; + if (groupSize >= 16) // cldnn optimization for 16 or more groups + groupSize = 1; weightDimsVec = { TensorValue(deconvLayer->_out_depth / groupSize), - TensorValue(inFeatures / groupSize), + TensorValue(inFeatures / deconvLayer->_group), TensorValue(deconvLayer->_kernel[X_AXIS]), TensorValue(deconvLayer->_kernel[Y_AXIS]) }; @@ -754,13 +812,13 @@ void CLDNNGraph::CreateWeightAndBiasPrimitives(const InferenceEngine::CNNLayerPt // create weights primitive cldnn::layout weightsLayout = cldnn::layout( - m_networkPrecision, + DataTypeFromPrecision(layer->precision), m_defaultFormat, cldnn::tensor(weightDimsVec)); size_t bytesPerGroup = weightsLayout.bytes_count(); for (unsigned g = 0; g < groupSize; g++) { - cldnn::primitive_id weightID = layer->name + m_weightsTag + std::to_string(g); + cldnn::primitive_id weightID = layer_type_name_ID(layer) + m_weightsTag + std::to_string(g); CreatePrimitiveFromBlob( weightID, pWeightsBlob, @@ -773,12 +831,12 @@ void CLDNNGraph::CreateWeightAndBiasPrimitives(const InferenceEngine::CNNLayerPt // create bias primitive if (pBiasBlob != nullptr) { cldnn::layout biasesLayout = cldnn::layout( - m_networkPrecision, + DataTypeFromPrecision(layer->precision), m_defaultFormat, cldnn::spatial(TensorValue(outFeatures / groupSize))); size_t bytesPerGroup = biasesLayout.bytes_count(); for (unsigned g = 0; g < groupSize; g++) { - cldnn::primitive_id biasID = layer->name + m_biasesTag + std::to_string(g); + cldnn::primitive_id biasID = layer_type_name_ID(layer) + m_biasesTag + std::to_string(g); CreatePrimitiveFromBlob( biasID, pBiasBlob, @@ -813,7 +871,7 @@ void CLDNNGraph::CreateScaleWeightsAndBiasesFromBN( THROW_CLDNN_EXCEPTION("Batch normalization input doesn't have 2 or 4 dimensions in " << bnLayer->name); } cldnn::layout blobLayout( - m_networkPrecision, + DataTypeFromPrecision(bnLayer->precision), m_defaultFormat, blobTensor); @@ -875,7 +933,7 @@ void CLDNNGraph::CreateScaleWeightsAndBiasesFromBN( void CLDNNGraph::CreateSingleLayerPrimitive(InferenceEngine::CNNLayerPtr &layer) { // Initialize a profiling entry - InitProfileInfo(layer->name, layer->type, "GPU", InferenceEngine::InferenceEngineProfileInfo::EXECUTED); + InitProfileInfo(layer->name, layer->type); // First check for custom layer auto customLayer = m_config.customLayers.find(layer->type); @@ -895,6 +953,8 @@ void CLDNNGraph::CreateSingleLayerPrimitive(InferenceEngine::CNNLayerPtr &layer) case ELU: case Clamp: case Activation: + case Exp: + case Not: CreateActivationPrimitive(layer, LayerTypeFromStr(layer->type)); break; case LRN: CreateLRNPrimitive(layer); @@ -967,6 +1027,16 @@ void CLDNNGraph::CreateSingleLayerPrimitive(InferenceEngine::CNNLayerPtr &layer) break; case Pad: CreatePadPrimitive(layer); break; + case Gather: CreateGatherPrimitive(layer); + break; + case DepthToSpace: CreateDepthToSpacePrimitive(layer); + break; + case ShuffleChannels: CreateShuffleChannelsPrimitive(layer); + break; + case StridedSlice: CreateStridedSlicePrimitive(layer); + break; + case ReverseSequence: CreateReverseSequencePrimitive(layer); + break; default: THROW_CLDNN_EXCEPTION("Unknown Layer Type: " << layer->type); } }
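A pattern running through these hunks: the graph-wide m_networkPrecision is retired and every layout is now built from DataTypeFromPrecision(layer->precision), so a mixed-precision network gets the right data type per layer rather than one inherited from the network header. A hedged sketch of what such a mapping typically looks like (the enumerators are illustrative stand-ins; the plugin's real DataTypeFromPrecision helper is defined elsewhere in the source and is not part of this patch):

#include <stdexcept>

// Illustrative stand-ins for InferenceEngine::Precision and clDNN data types.
enum class Precision { FP32, FP16, I32, U8 };
enum class DataType  { f32, f16, i32, u8 };

// Per-layer precision -> device data type, in the spirit of the
// DataTypeFromPrecision(layer->precision) calls used throughout the patch.
static DataType dataTypeFromPrecision(Precision p) {
    switch (p) {
        case Precision::FP32: return DataType::f32;
        case Precision::FP16: return DataType::f16;
        case Precision::I32:  return DataType::i32;
        case Precision::U8:   return DataType::u8;
    }
    throw std::invalid_argument("unsupported precision");
}

int main() {
    // An FP16 layer now maps to f16 even when the network header says FP32.
    return dataTypeFromPrecision(Precision::FP16) == DataType::f16 ? 0 : 1;
}

@@ -990,8 +1060,7 @@ void CLDNNGraph::CreateScaleShiftPrimitive(InferenceEngine::CNNLayerPtr &layer) default: THROW_CLDNN_EXCEPTION("Invalid 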
weights dimensions in layer " << layer->name); break; } - - cldnn::layout blobLayout(m_networkPrecision, m_defaultFormat, weightTensor); + cldnn::layout blobLayout(DataTypeFromPrecision(layer->precision), m_defaultFormat, weightTensor); CreatePrimitiveFromBlob(scalePrimID, scaleShiftLayer->_weights, blobLayout); if (scaleShiftLayer->_biases != nullptr) { if (scaleShiftLayer->_biases->dims() != dims) { @@ -1002,21 +1071,20 @@ void CLDNNGraph::CreateScaleShiftPrimitive(InferenceEngine::CNNLayerPtr &layer) biasPrimID = ""; // 0-bias } + std::string scaleShiftLayerName = layer_type_name_ID(layer); auto scaleShiftPrim = cldnn::scale( - scaleShiftLayer->name, + scaleShiftLayerName, inputPrimitives[0], scalePrimID, biasPrimID); - m_env.primitiveIDs[scaleShiftLayer->name] = scaleShiftLayer->name; + m_env.primitiveIDs[scaleShiftLayerName] = scaleShiftLayerName; m_topology->add(scaleShiftPrim); - m_env.profilingIDs.insert(scaleShiftLayer->name); + m_env.profilingIDs.push_back(scaleShiftLayerName); } void CLDNNGraph::CreateProposalPrimitive(InferenceEngine::CNNLayerPtr & layer) { ValidateLayer(layer, 3); - IE_ASSERT(layer->insData[0].lock()->dims[3] == 1); // only handling input batch size 1 - IE_ASSERT(layer->insData[1].lock()->dims[3] == 1); // only handling input batch size 1 auto proposalLayer = dynamic_cast (layer.get()); float nms_thresh = proposalLayer->GetParamAsFloat("nms_thresh", 0.7f); @@ -1031,6 +1099,9 @@ void CLDNNGraph::CreateProposalPrimitive(InferenceEngine::CNNLayerPtr & layer) { int base_size = proposalLayer->GetParamAsInt("base_size", 16); std::string framework = proposalLayer->GetParamAsString("framework", ""); auto inputPrimitives = GetPrevLayersPrimitives(layer); + bool normalize = layer->GetParamsAsBool("normalize", false); + bool clip_before_nms = layer->GetParamsAsBool("clip_before_nms", true); + bool clip_after_nms = layer->GetParamsAsBool("clip_after_nms", false); float coordinates_offset; bool swap_xy; @@ -1052,8 +1123,9 @@ void CLDNNGraph::CreateProposalPrimitive(InferenceEngine::CNNLayerPtr & layer) { swap_xy = false; } + std::string proposalLayerName = layer_type_name_ID(layer); auto proposalPrim = cldnn::proposal( - proposalLayer->name, + proposalLayerName, inputPrimitives[0], // cls_score inputPrimitives[1], // bbox_pred inputPrimitives[2], // im_info @@ -1071,12 +1143,15 @@ void CLDNNGraph::CreateProposalPrimitive(InferenceEngine::CNNLayerPtr & layer) { box_size_scale, swap_xy, initial_clip, + clip_before_nms, + clip_after_nms, round_ratios, - shift_anchors); + shift_anchors, + normalize); - m_env.primitiveIDs[proposalLayer->name] = proposalLayer->name; + m_env.primitiveIDs[proposalLayerName] = proposalLayerName; m_topology->add(proposalPrim); - m_env.profilingIDs.insert(proposalLayer->name); + m_env.profilingIDs.push_back(proposalLayerName); } void CLDNNGraph::CreatePReLUPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -1084,6 +1159,7 @@ void CLDNNGraph::CreatePReLUPrimitive(InferenceEngine::CNNLayerPtr &layer) { auto inputPrimitives = GetPrevLayersPrimitives(layer); auto preluLayer = dynamic_cast (layer.get()); + std::string preluLayerName = layer_type_name_ID(layer); auto inDataPtr = preluLayer->insData[0].lock(); if (!inDataPtr) { THROW_CLDNN_EXCEPTION("Data inserted into PreLu " << preluLayer->name << " is nullptr"); @@ -1115,35 +1191,36 @@ void CLDNNGraph::CreatePReLUPrimitive(InferenceEngine::CNNLayerPtr &layer) { break; default: THROW_CLDNN_EXCEPTION("Invalid PReLU slope blob precision in " << preluLayer->name); } - 
m_topology->add(cldnn::activation(preluLayer->name, inputPrimitives[0], activation_relu_negative_slope, { slope, 0.f })); + m_topology->add(cldnn::activation(preluLayerName, inputPrimitives[0], activation_relu_negative_slope, { slope, 0.f })); } else { CreateGenericLayerBlobPrimitives(preluLayer); - cldnn::primitive_id slopePrimID(preluLayer->name + "_" + blobName + m_weightsTag); - m_topology->add(cldnn::activation(preluLayer->name, inputPrimitives[0], slopePrimID, activation_relu_negative_slope)); + cldnn::primitive_id slopePrimID(preluLayerName + "_" + blobName + m_weightsTag); + m_topology->add(cldnn::activation(preluLayerName, inputPrimitives[0], slopePrimID, activation_relu_negative_slope)); } - m_env.primitiveIDs[preluLayer->name] = preluLayer->name; - m_env.profilingIDs.insert(preluLayer->name); + m_env.primitiveIDs[preluLayerName] = preluLayerName; + m_env.profilingIDs.push_back(preluLayerName); } void CLDNNGraph::CreateBatchNormalizationPrimitive(InferenceEngine::CNNLayerPtr & layer) { ValidateLayer(layer, 1); auto inputPrimitives = GetPrevLayersPrimitives(layer); + std::string bnLayerName = layer_type_name_ID(layer); auto bnLayer = dynamic_cast (layer.get()); - cldnn::primitive_id weightID = bnLayer->name + "_" + m_scalesTag; - cldnn::primitive_id biasID = bnLayer->name + "_" + m_biasesTag; + cldnn::primitive_id weightID = bnLayerName + "_" + m_scalesTag; + cldnn::primitive_id biasID = bnLayerName + "_" + m_biasesTag; #define _SCALE_BN_OPT #ifdef _SCALE_BN_OPT // Using scale as an optimization (1 mad instead of mad+rsq) // create new blobs for scale shift CreateScaleWeightsAndBiasesFromBN(bnLayer, weightID, biasID); - auto scalePrim = cldnn::scale(bnLayer->name, inputPrimitives[0], weightID, biasID); + auto scalePrim = cldnn::scale(bnLayerName, inputPrimitives[0], weightID, biasID); - m_env.primitiveIDs[bnLayer->name] = bnLayer->name; + m_env.primitiveIDs[bnLayerName] = bnLayerName; m_topology->add(scalePrim); - m_env.profilingIDs.insert(bnLayer->name); + m_env.profilingIDs.push_back(bnLayerName); return; #endif // _SCALE_BN_OPT @@ -1159,67 +1236,85 @@ void CLDNNGraph::CreateBatchNormalizationPrimitive(InferenceEngine::CNNLayerPtr THROW_CLDNN_EXCEPTION("Batch normalization input doesn't have 2 or 4 dimensions in " << bnLayer->name); } cldnn::layout blobLayout( - m_networkPrecision, + DataTypeFromPrecision(layer->precision), m_defaultFormat, blobTensor); // Create variance primitive - cldnn::primitive_id varianceID = bnLayer->name + "_" + m_weightsTag; + cldnn::primitive_id varianceID = bnLayerName + "_" + m_weightsTag; CreatePrimitiveFromBlob(varianceID, bnLayer->_weights, blobLayout); // Create mean primitive - cldnn::primitive_id meanID = bnLayer->name + "_" + m_biasesTag; + cldnn::primitive_id meanID = bnLayerName + "_" + m_biasesTag; CreatePrimitiveFromBlob(meanID, bnLayer->_biases, blobLayout); auto bnPrim = cldnn::batch_norm( - bnLayer->name, + bnLayerName, inputPrimitives[0], meanID, varianceID, bnLayer->epsilon); - m_env.primitiveIDs[bnLayer->name] = bnLayer->name; + m_env.primitiveIDs[bnLayerName] = bnLayerName; m_topology->add(bnPrim); - m_env.profilingIDs.insert(bnLayer->name); + m_env.profilingIDs.push_back(bnLayerName); } void CLDNNGraph::CreateFlattenPrimitive(InferenceEngine::CNNLayerPtr &layer) { ValidateLayer(layer, 1); auto inputPrimitives = GetPrevLayersPrimitives(layer); auto flattenLayer = dynamic_cast (layer.get()); + std::string flattenLayerName = layer_type_name_ID(layer); auto flattenPrim = cldnn::reshape( - flattenLayer->name, + flattenLayerName, 
inputPrimitives[0], CldnnTensorFromIEDims(flattenLayer->outData[0]->dims)); - m_env.primitiveIDs[flattenLayer->name] = flattenLayer->name; + m_env.primitiveIDs[flattenLayerName] = flattenLayerName; m_topology->add(flattenPrim); - m_env.profilingIDs.insert(flattenLayer->name); + m_env.profilingIDs.push_back(flattenLayerName); } void CLDNNGraph::CreatePermutePrimitive(InferenceEngine::CNNLayerPtr &layer) { ValidateLayer(layer, 1); auto inputPrimitives = GetPrevLayersPrimitives(layer); auto permuteLayer = dynamic_cast (layer.get()); - std::vector order; + std::vector ie_order; for (auto& a : permuteLayer->GetParamAsInts("order")) - order.push_back(static_cast(a)); + ie_order.push_back(static_cast(a)); // if order size is less than 4 - fill the rest with just copy - for (auto o = order.size(); o < 4; o++) - order.push_back((uint16_t)o); + for (auto o = ie_order.size(); o < 4; o++) + ie_order.push_back((uint16_t)o); - auto outputDims = permuteLayer->outData[0]->dims; + /* + Because of the cldnn ordering: bfxy, and IE ordering: bfyx + we need to adjust the permute order. + */ + std::vector cldnn_permute_order; + // 1. Switch permute order values (x and y) + for (auto const& o : ie_order) { + if (o == 2) + cldnn_permute_order.push_back(3); + else if (o == 3) + cldnn_permute_order.push_back(2); + else + cldnn_permute_order.push_back(o); + } + // 2. Swap x and y positions + std::swap(cldnn_permute_order[2], cldnn_permute_order[3]); + + std::string permuteLayerName = layer_type_name_ID(layer); auto permutePrim = cldnn::permute( - permuteLayer->name, + permuteLayerName, inputPrimitives[0], - order); + cldnn_permute_order); - m_env.primitiveIDs[permuteLayer->name] = permuteLayer->name; + m_env.primitiveIDs[permuteLayerName] = permuteLayerName; m_topology->add(permutePrim); - m_env.profilingIDs.insert(permuteLayer->name); + m_env.profilingIDs.push_back(permuteLayerName); } void CLDNNGraph::CreateReshapePrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -1227,15 +1322,16 @@ void CLDNNGraph::CreateReshapePrimitive(InferenceEngine::CNNLayerPtr &layer) { auto inputPrimitives = GetPrevLayersPrimitives(layer); auto reshapeLayer = dynamic_cast (layer.get()); IE_ASSERT(reshapeLayer->outData.size()); + std::string reshapeLayerName = layer_type_name_ID(layer); auto reshapePrim = cldnn::reshape( - reshapeLayer->name, + reshapeLayerName, inputPrimitives[0], CldnnTensorFromIEDims(reshapeLayer->outData[0]->dims)); - m_env.primitiveIDs[reshapeLayer->name] = reshapeLayer->name; + m_env.primitiveIDs[reshapeLayerName] = reshapeLayerName; m_topology->add(reshapePrim); - m_env.profilingIDs.insert(reshapeLayer->name); + m_env.profilingIDs.push_back(reshapeLayerName); } void CLDNNGraph::CreateNormalizePrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -1254,69 +1350,73 @@ void CLDNNGraph::CreateNormalizePrimitive(InferenceEngine::CNNLayerPtr &layer) { eps = 1e-10f; } + std::string normLayerName = layer_type_name_ID(layer); auto normPrim = cldnn::normalize( - normLayer->name, + normLayerName, inputPrimitives[0], - normLayer->name + "_weights" + m_weightsTag, + normLayerName + "_weights" + m_weightsTag, across_spatial, eps); - m_env.primitiveIDs[normLayer->name] = normLayer->name; + m_env.primitiveIDs[normLayerName] = normLayerName; m_topology->add(normPrim); - m_env.profilingIDs.insert(normLayer->name); + m_env.profilingIDs.push_back(normLayerName); }
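To unpack the comment in CreatePermutePrimitive above: IE enumerates dimensions as bfyx while the cldnn::permute primitive expects bfxy, so the translation first exchanges the values 2 and 3 and then swaps the last two positions. A self-contained sketch of exactly that two-step conversion (standalone function; only the algorithm is taken from the hunk above):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Translate an IE permute "order" attribute (bfyx world) into the order
// the cldnn::permute primitive expects (bfxy world).
static std::vector<uint16_t> toCldnnPermuteOrder(std::vector<uint16_t> ie_order) {
    while (ie_order.size() < 4)                 // pad short orders with identity
        ie_order.push_back(static_cast<uint16_t>(ie_order.size()));

    std::vector<uint16_t> cldnn_order;
    for (uint16_t o : ie_order)                 // 1. switch the x/y *values*
        cldnn_order.push_back(o == 2 ? 3 : o == 3 ? 2 : o);
    std::swap(cldnn_order[2], cldnn_order[3]);  // 2. swap the x/y *positions*
    return cldnn_order;
}

int main() {
    // Sanity check: the identity permutation must survive the translation.
    for (uint16_t v : toCldnnPermuteOrder({0, 1, 2, 3}))
        std::printf("%u ", static_cast<unsigned>(v));  // prints: 0 1 2 3
    std::printf("\n");
}

Applying only one of the two steps would silently produce a permute over the wrong axes, which is why the hunk performs both.

void CLDNNGraph::CreateDetectionOutputPrimitive(InferenceEngine::CNNLayerPtr &layer) { ValidateLayer(layer, 3); auto detectionLayer = dynamic_cast (layer.get()); - uint32_t 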
num_classes = detectionLayer->GetParamAsUInt("num_classes", 1); - bool share_location = detectionLayer->GetParamsAsBool("share_location", true); - int background_label_id = detectionLayer->GetParamAsInt("background_label_id", 0); - float nms_threshold = detectionLayer->GetParamAsFloat("nms_threshold", 0.3f); - int top_k = detectionLayer->GetParamAsInt("top_k", -1); - float confidence_threshold = detectionLayer->GetParamAsFloat("confidence_threshold", -FLT_MAX); - float eta = detectionLayer->GetParamAsFloat("eta", 1.0f); - int keep_top_k = detectionLayer->GetParamAsInt("keep_top_k", -1); + uint32_t num_classes = detectionLayer->GetParamAsUInt("num_classes", 1); + bool share_location = detectionLayer->GetParamsAsBool("share_location", true); + int background_label_id = detectionLayer->GetParamAsInt("background_label_id", 0); + float nms_threshold = detectionLayer->GetParamAsFloat("nms_threshold", 0.3f); + int top_k = detectionLayer->GetParamAsInt("top_k", -1); + float confidence_threshold = detectionLayer->GetParamAsFloat("confidence_threshold", -FLT_MAX); + float eta = detectionLayer->GetParamAsFloat("eta", 1.0f); + int keep_top_k = detectionLayer->GetParamAsInt("keep_top_k", -1); bool variance_encoded_in_target = detectionLayer->GetParamsAsBool("variance_encoded_in_target", false); - int input_width = detectionLayer->GetParamAsInt("input_width", -1); - int input_height = detectionLayer->GetParamAsInt("input_height", -1); - bool normalized = detectionLayer->GetParamsAsBool("normalized", true); - std::string code_type = detectionLayer->GetParamAsString("code_type", "caffe.PriorBoxParameter.CORNER"); - bool clip = detectionLayer->GetParamsAsBool("clip", false); - bool decrease_label_id = detectionLayer->GetParamsAsBool("decrease_label_id", false); - cldnn::prior_box_code_type cldnnCodeType = PriorBoxCodeFromString(code_type); + int input_width = detectionLayer->GetParamAsInt("input_width", -1); + int input_height = detectionLayer->GetParamAsInt("input_height", -1); + bool normalized = detectionLayer->GetParamsAsBool("normalized", true); + std::string code_type = detectionLayer->GetParamAsString("code_type", "caffe.PriorBoxParameter.CORNER"); + bool clip_before_nms = detectionLayer->GetParamsAsBool("clip_before_nms", false) || + detectionLayer->GetParamsAsBool("clip", false); // For backward compatibility + bool clip_after_nms = detectionLayer->GetParamsAsBool("clip_after_nms", false); + bool decrease_label_id = detectionLayer->GetParamsAsBool("decrease_label_id", false); + cldnn::prior_box_code_type cldnnCodeType = PriorBoxCodeFromString(code_type); int32_t prior_info_size = normalized != 0 ? 4 : 5; int32_t prior_coordinates_offset = normalized != 0 ? 
0 : 1; auto inputPrimitives = GetPrevLayersPrimitives(layer); - auto detectionPrim = cldnn::detection_output( - detectionLayer->name, - inputPrimitives[0], - inputPrimitives[1], - inputPrimitives[2], - num_classes, - keep_top_k, - share_location, - background_label_id, - nms_threshold, - top_k, - eta, - cldnnCodeType, - variance_encoded_in_target, - confidence_threshold, - prior_info_size, - prior_coordinates_offset, - normalized, - input_width, - input_height, - decrease_label_id, - clip); - - m_env.primitiveIDs[detectionLayer->name] = detectionLayer->name; + std::string detectionLayerName = layer_type_name_ID(layer); + auto detectionPrim = cldnn::detection_output(detectionLayerName, + inputPrimitives[0], + inputPrimitives[1], + inputPrimitives[2], + num_classes, + keep_top_k, + share_location, + background_label_id, + nms_threshold, + top_k, + eta, + cldnnCodeType, + variance_encoded_in_target, + confidence_threshold, + prior_info_size, + prior_coordinates_offset, + normalized, + input_width, + input_height, + decrease_label_id, + clip_before_nms, + clip_after_nms); + + m_env.primitiveIDs[detectionLayerName] = detectionLayerName; m_topology->add(detectionPrim); - m_env.profilingIDs.insert(detectionLayer->name); + m_env.profilingIDs.push_back(detectionLayerName); } void CLDNNGraph::CreatePriorBoxPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -1367,8 +1467,9 @@ void CLDNNGraph::CreatePriorBoxPrimitive(InferenceEngine::CNNLayerPtr &layer) { _step_h = static_cast(img_h) / static_cast(img_dims[1]); } + std::string priorBoxLayerName = layer_type_name_ID(layer); auto priorBoxPrim = cldnn::prior_box( - priorBoxLayer->name, + priorBoxLayerName, inputPrimitives[0], img_size, min_size, @@ -1382,9 +1483,9 @@ void CLDNNGraph::CreatePriorBoxPrimitive(InferenceEngine::CNNLayerPtr &layer) { offset, scale_all_sizes); - m_env.primitiveIDs[priorBoxLayer->name] = priorBoxLayer->name; + m_env.primitiveIDs[priorBoxLayerName] = priorBoxLayerName; m_topology->add(priorBoxPrim); - m_env.profilingIDs.insert(priorBoxLayer->name); + m_env.profilingIDs.push_back(priorBoxLayerName); } void CLDNNGraph::CreateDeconvolutionPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -1401,22 +1502,38 @@ void CLDNNGraph::CreateDeconvolutionPrimitive(InferenceEngine::CNNLayerPtr &laye CreateWeightAndBiasPrimitives(layer, weightPrimID, biasPrimID); auto allPads = getPaddings(*deconvLayer); cldnn::tensor stride = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), - cldnn::spatial(deconvLayer->_stride[X_AXIS], deconvLayer->_stride[Y_AXIS])); + cldnn::spatial(deconvLayer->_stride[X_AXIS], deconvLayer->_stride[Y_AXIS])); cldnn::tensor padding = cldnn::tensor(cldnn::batch(0), cldnn::feature(0), - cldnn::spatial(-allPads.begin[X_AXIS], -allPads.begin[Y_AXIS])); + cldnn::spatial(-allPads.begin[X_AXIS], -allPads.begin[Y_AXIS])); - auto deconvPrim = cldnn::deconvolution(deconvLayer->name, - inputPrimitives[0], - weightPrimID, - biasPrimID, - stride, - padding, - false, - 0.0f, - CldnnTensorFromIEDims(deconvLayer->outData[0]->dims)); - m_env.primitiveIDs[deconvLayer->name] = deconvLayer->name; - m_topology->add(deconvPrim); - m_env.profilingIDs.insert(deconvLayer->name); + std::string deconvLayerName = layer_type_name_ID(layer); + + if (deconvLayer->_group >= 16) { + auto deconvPrim = cldnn::deconvolution(deconvLayerName, + inputPrimitives[0], + weightPrimID, + biasPrimID, + deconvLayer->_group, + stride, + padding, + false, + 0.0f, + CldnnTensorFromIEDims(deconvLayer->outData[0]->dims)); + m_topology->add(deconvPrim); + } else { + auto 
deconvPrim = cldnn::deconvolution(deconvLayerName, + inputPrimitives[0], + weightPrimID, + biasPrimID, + stride, + padding, + false, + 0.0f, + CldnnTensorFromIEDims(deconvLayer->outData[0]->dims)); + m_topology->add(deconvPrim); + } + m_env.primitiveIDs[deconvLayerName] = deconvLayerName; + m_env.profilingIDs.push_back(deconvLayerName); } void CLDNNGraph::CreateCropPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -1452,14 +1569,15 @@ void CLDNNGraph::CreateCropPrimitive(InferenceEngine::CNNLayerPtr &layer) { TensorValue(offset[3]), TensorValue(offset[2])); + std::string cropLayerName = layer_type_name_ID(layer); auto cropPrim = cldnn::crop( - cropLayer->name, + cropLayerName, inputPrimitives[0], refSize, offSize); - m_env.primitiveIDs[cropLayer->name] = cropLayer->name; + m_env.primitiveIDs[cropLayerName] = cropLayerName; m_topology->add(cropPrim); - m_env.profilingIDs.insert(cropLayer->name); + m_env.profilingIDs.push_back(cropLayerName); } void CLDNNGraph::CreateROIPoolingPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -1471,6 +1589,7 @@ void CLDNNGraph::CreateROIPoolingPrimitive(InferenceEngine::CNNLayerPtr &layer) int pooled_height = roiPoolingLayer->GetParamAsInt("pooled_h", 0); float spatial_scale = roiPoolingLayer->GetParamAsFloat("spatial_scale", 1.0f); std::string method = roiPoolingLayer->GetParamAsString("method", "max"); + bool position_sensitive = false; cldnn::pooling_mode mode = cldnn::pooling_mode::max; if (method == "bilinear") { @@ -1478,17 +1597,18 @@ void CLDNNGraph::CreateROIPoolingPrimitive(InferenceEngine::CNNLayerPtr &layer) } auto inputPrimitives = GetPrevLayersPrimitives(layer); - auto roiPoolingPrim = cldnn::roi_pooling( - roiPoolingLayer->name, - inputPrimitives[0], // input data - inputPrimitives[1], // input rois - mode, - pooled_width, - pooled_height, - spatial_scale); - m_env.primitiveIDs[roiPoolingLayer->name] = roiPoolingLayer->name; + std::string roiPoolingLayerName = layer_type_name_ID(layer); + auto roiPoolingPrim = cldnn::roi_pooling(roiPoolingLayerName, + inputPrimitives[0], // input data + inputPrimitives[1], // input rois + mode, + position_sensitive, + pooled_width, + pooled_height, + spatial_scale); + m_env.primitiveIDs[roiPoolingLayerName] = roiPoolingLayerName; m_topology->add(roiPoolingPrim); - m_env.profilingIDs.insert(roiPoolingLayer->name); + m_env.profilingIDs.push_back(roiPoolingLayerName); } void CLDNNGraph::CreatePSROIPoolingPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -1497,22 +1617,34 @@ void CLDNNGraph::CreatePSROIPoolingPrimitive(InferenceEngine::CNNLayerPtr &layer // params int group_size = psROIPoolingLayer->GetParamAsInt("group_size"); - // todo: assert outputdim*group_size*group_size == input features + int output_dim = psROIPoolingLayer->GetParamAsInt("output_dim"); float spatial_scale = psROIPoolingLayer->GetParamAsFloat("spatial_scale"); + size_t spatial_bins_x = static_cast(psROIPoolingLayer->GetParamAsInt("spatial_bins_x", 1)); + size_t spatial_bins_y = static_cast(psROIPoolingLayer->GetParamAsInt("spatial_bins_y", 1)); + std::string mode_str = psROIPoolingLayer->GetParamAsString("mode", "average"); + bool position_sensitive = true; + + cldnn::pooling_mode mode = mode_str == "average" ? 
cldnn::pooling_mode::average + : cldnn::pooling_mode::bilinear; + auto inputPrimitives = GetPrevLayersPrimitives(layer); - auto psROIPoolingPrim = cldnn::roi_pooling( - psROIPoolingLayer->name, - inputPrimitives[0], // input data - inputPrimitives[1], // input rois - cldnn::pooling_mode::average, - group_size, - group_size, - spatial_scale, - group_size); - m_env.primitiveIDs[psROIPoolingLayer->name] = psROIPoolingLayer->name; + std::string psROIPoolingLayerName = layer_type_name_ID(layer); + auto psROIPoolingPrim = cldnn::roi_pooling(psROIPoolingLayerName, + inputPrimitives[0], // input data + inputPrimitives[1], // input rois + mode, + position_sensitive, + group_size, + group_size, + spatial_scale, + output_dim, + spatial_bins_x, + spatial_bins_y); + + m_env.primitiveIDs[psROIPoolingLayerName] = psROIPoolingLayerName; m_topology->add(psROIPoolingPrim); - m_env.profilingIDs.insert(psROIPoolingLayer->name); + m_env.profilingIDs.push_back(psROIPoolingLayerName); } void CLDNNGraph::CreateCustomLayerPrimitive(InferenceEngine::CNNLayerPtr & layer, CLDNNCustomLayerPtr customLayer) { @@ -1547,7 +1679,7 @@ void CLDNNGraph::CreateCustomLayerPrimitive(InferenceEngine::CNNLayerPtr & layer THROW_CLDNN_EXCEPTION("Invalid dimensions for blob " << blob.first << " in layer " << genericLayer->name); } CreatePrimitiveFromBlob(blobId, blob.second, cldnn::layout( - m_networkPrecision, + DataTypeFromPrecision(blob.second->precision()), m_defaultFormat, cldnn::tensor(1, 1, TensorValue(blob.second->dims()[0]), 1))); // save index in blobIndex @@ -1577,8 +1709,8 @@ void CLDNNGraph::CreateCustomLayerPrimitive(InferenceEngine::CNNLayerPtr & layer param.format, DataTypeFromPrecision(layer->precision)); m_topology->add(preprocessPrim); - m_env.profilingIDs.insert(reorderPrimName); - InitProfileInfo(reorderPrimName, "Reorder", "GPU", InferenceEngine::InferenceEngineProfileInfo::EXECUTED); + m_env.profilingIDs.push_back(reorderPrimName); + InitProfileInfo(reorderPrimName, "Reorder"); reorderedInputs[param.portIndex] = (reorderPrimName); } else { reorderedInputs[param.portIndex] = inputPrimitives[param.portIndex]; @@ -1629,6 +1761,7 @@ void CLDNNGraph::CreateCustomLayerPrimitive(InferenceEngine::CNNLayerPtr & layer int xDim = outputTensor.spatial[0]; int iidx = customLayer->InputDimSourceIndex(); + std::string genericLayerName = layer_type_name_ID(layer); // if input index is greater than -1, take dimension from input if (iidx >= 0) { if (iidx >= genericLayer->insData.size()) @@ -1670,7 +1803,7 @@ void CLDNNGraph::CreateCustomLayerPrimitive(InferenceEngine::CNNLayerPtr & layer } auto customPrim = cldnn::custom_gpu_primitive( - genericLayer->name, + genericLayerName, reorderedInputs, { layerTitle, defineTitle, layerDefines, customLayer->KernelSource() }, customLayer->KernelEntry(), @@ -1681,23 +1814,24 @@ void CLDNNGraph::CreateCustomLayerPrimitive(InferenceEngine::CNNLayerPtr & layer lws); if (outputLayout.format != cldnn::format::any && - p_currentOutputs->find(genericLayer->name) == p_currentOutputs->end()) { + p_currentOutputs->find(genericLayerName) == p_currentOutputs->end()) { // Handle output reorder - auto reorderPrimName = genericLayer->name + m_postCustomLayerTag; + auto reorderPrimName = genericLayerName + m_postCustomLayerTag; m_topology->add( cldnn::reorder( reorderPrimName, - genericLayer->name, + genericLayerName, m_defaultFormat, - m_networkPrecision)); - m_env.primitiveIDs[genericLayer->name] = reorderPrimName; - m_env.profilingIDs.insert(reorderPrimName); - InitProfileInfo(reorderPrimName, "Reorder", 
"GPU", InferenceEngine::InferenceEngineProfileInfo::EXECUTED); + customPrim.output_layout.data_type)); + m_env.primitiveIDs[genericLayerName] = reorderPrimName; + m_env.primitiveIDs[reorderPrimName] = reorderPrimName; + m_env.profilingIDs.push_back(reorderPrimName); + InitProfileInfo(reorderPrimName, "Reorder"); } else { - m_env.primitiveIDs[genericLayer->name] = genericLayer->name; + m_env.primitiveIDs[genericLayerName] = genericLayerName; } m_topology->add(customPrim); - m_env.profilingIDs.insert(genericLayer->name); + m_env.profilingIDs.push_back(genericLayerName); } void CLDNNGraph::CreateSimplerNMSPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -1715,8 +1849,9 @@ void CLDNNGraph::CreateSimplerNMSPrimitive(InferenceEngine::CNNLayerPtr &layer) std::vector scale = simpleNMSLayer->GetParamAsFloats("scale"); auto inputPrimitives = GetPrevLayersPrimitives(layer); + std::string simpleNMSLayerName = layer_type_name_ID(layer); auto simpleNMSPrim = cldnn::proposal( - simpleNMSLayer->name, + simpleNMSLayerName, inputPrimitives[0], // cls_score inputPrimitives[1], // bbox_pred inputPrimitives[2], // im_info @@ -1729,9 +1864,9 @@ void CLDNNGraph::CreateSimplerNMSPrimitive(InferenceEngine::CNNLayerPtr &layer) { 0.5f, 1.0f, 2.0f }, // ratios for the SimplerNMS variant scale); - m_env.primitiveIDs[simpleNMSLayer->name] = simpleNMSLayer->name; + m_env.primitiveIDs[simpleNMSLayerName] = simpleNMSLayerName; m_topology->add(simpleNMSPrim); - m_env.profilingIDs.insert(simpleNMSLayer->name); + m_env.profilingIDs.push_back(simpleNMSLayerName); } void CLDNNGraph::CreateEltwisePrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -1749,27 +1884,29 @@ void CLDNNGraph::CreateEltwisePrimitive(InferenceEngine::CNNLayerPtr &layer) { THROW_IE_EXCEPTION << "Number of provided coefficients is not equal to number of operands"; } + std::string eltwiseLayerName = layer_type_name_ID(layer); auto eltwisePrim = cldnn::eltwise( - eltwiseLayer->name, + eltwiseLayerName, inputPrimitives, EltwiseModeFromIEEltwise(eltwiseLayer->_operation), coefficients); - m_env.primitiveIDs[eltwiseLayer->name] = eltwiseLayer->name; + m_env.primitiveIDs[eltwiseLayerName] = eltwiseLayerName; m_topology->add(eltwisePrim); - m_env.profilingIDs.insert(eltwiseLayer->name); + m_env.profilingIDs.push_back(eltwiseLayerName); } void CLDNNGraph::CreateConcatenatePrimitive(InferenceEngine::CNNLayerPtr &layer) { ValidateLayer(layer, 0); auto concatLayer = dynamic_cast (layer.get()); auto inputPrimitives = GetPrevLayersPrimitives(layer); + std::string concatLayerName = layer_type_name_ID(layer); auto concatPrim = cldnn::concatenation( - concatLayer->name, + concatLayerName, inputPrimitives, ConcatAxisFromIEAxis(concatLayer->_axis)); - m_env.primitiveIDs[concatLayer->name] = concatLayer->name; + m_env.primitiveIDs[concatLayerName] = concatLayerName; m_topology->add(concatPrim); - m_env.profilingIDs.insert(concatLayer->name); + m_env.profilingIDs.push_back(concatLayerName); } void CLDNNGraph::CreateSplitPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -1798,7 +1935,7 @@ std::cout << "Splitting layer: " << layer->name << "\n\tSize:" << CldnnTensorFro auto cropPrim = cldnn::crop(outLayer->name, inputPrimitives[0], outTensor, CldnnTensorFromIEDims(startOffset)); m_topology->add(cropPrim); m_env.primitiveIDs[outLayer->name] = outLayer->name; - m_env.profilingIDs.insert(outLayer->name); + m_env.profilingIDs.push_back(outLayer->name); outputOffsets.push_back({ outLayer->name, CldnnTensorFromIEDims(startOffset) }); for (size_t i = 0; i < inputDims.size(); i++) 
{ if (outLayer->dims[i] != inputDims[i]) { @@ -1838,6 +1975,7 @@ std::cout << "Splitting layer: " << layer->name << "\n\tSize:" << CldnnTensorFro }; for (auto& outLayer : splitLayer->outData) { + std::string outLayerName = splitLayer->type + ":" + outLayer->name; if (outLayer->dims.size() != startOffset.size()) { THROW_CLDNN_EXCEPTION("Invalid dimensions in split layer: " << splitLayer->name << " output: " << outLayer->name); } @@ -1854,11 +1992,11 @@ std::cout << "Splitting layer: " << layer->name << "\n\tSize:" << CldnnTensorFro std::reverse(reverseOffset.begin(), reverseOffset.end()); auto offsetTensor = TensorFromIEDims(reverseOffset, 0); - auto cropPrim = cldnn::crop(outLayer->name, inputPrimitives[0], outTensor, offsetTensor); - m_env.primitiveIDs[outLayer->name] = outLayer->name; + auto cropPrim = cldnn::crop(outLayerName, inputPrimitives[0], outTensor, offsetTensor); + m_env.primitiveIDs[outLayerName] = outLayerName; m_topology->add(cropPrim); - m_env.profilingIDs.insert(outLayer->name); - InitProfileInfo(outLayer->name, "Crop", "GPU", InferenceEngine::InferenceEngineProfileInfo::EXECUTED); + m_env.profilingIDs.push_back(outLayerName); + InitProfileInfo(outLayerName, "Crop"); for (size_t i = 0; i < inputDims.size(); i++) { if (outLayer->dims[i] != inputDims[i]) { @@ -1868,7 +2006,7 @@ std::cout << "Splitting layer: " << layer->name << "\n\tSize:" << CldnnTensorFro } // set split as not_run - InitProfileInfo(layer->name, layer->type, "None", InferenceEngine::InferenceEngineProfileInfo::OPTIMIZED_OUT); // Mark this layer as optimized out + InitProfileInfo(layer->name, layer->type, false, InferenceEngine::InferenceEngineProfileInfo::OPTIMIZED_OUT); // Mark this layer as optimized out #endif // _USE_SPLIT_PRIMITIVE } } @@ -1893,9 +2031,9 @@ void CLDNNGraph::CreateFusedSplitConvMergePrimitive(InferenceEngine::CNNLayerPtr THROW_CLDNN_EXCEPTION("Expected single layer does not exist"); } // Mark these layers as optimized out - InitProfileInfo(convLayer1->name, convLayer1->type, "None", InferenceEngine::InferenceEngineProfileInfo::OPTIMIZED_OUT); - InitProfileInfo(convLayer2->name, convLayer2->type, "None", InferenceEngine::InferenceEngineProfileInfo::OPTIMIZED_OUT); - InitProfileInfo(concatLayer->name, concatLayer->type, "None", InferenceEngine::InferenceEngineProfileInfo::OPTIMIZED_OUT); + InitProfileInfo(convLayer1->name, convLayer1->type, false, InferenceEngine::InferenceEngineProfileInfo::OPTIMIZED_OUT); + InitProfileInfo(convLayer2->name, convLayer2->type, false, InferenceEngine::InferenceEngineProfileInfo::OPTIMIZED_OUT); + InitProfileInfo(concatLayer->name, concatLayer->type, false, InferenceEngine::InferenceEngineProfileInfo::OPTIMIZED_OUT); // build the split conv primitive std::vector weightPrimID; @@ -1913,7 +2051,8 @@ void CLDNNGraph::CreateFusedSplitConvMergePrimitive(InferenceEngine::CNNLayerPtr cldnn::tensor dilation = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(convLayer1->_dilation[X_AXIS], convLayer1->_dilation[Y_AXIS])); - auto splitPrim = cldnn::convolution(splitLayer->name, + std::string splitLayerName = layer_type_name_ID(layer); + auto splitPrim = cldnn::convolution(splitLayerName, inputPrimitives[0], weightPrimID, biasPrimID,
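The renames in this and the neighboring hunks all follow one scheme: primitive IDs switch from the bare layer name to the "Type:name" string built by layer_type_name_ID, so IDs stay unique when entities of different types share a name - the Split outputs above, which now get a Split-prefixed crop ID, are a typical case. A tiny sketch of why the compound key matters (map contents are illustrative):

#include <iostream>
#include <map>
#include <string>

int main() {
    // Keyed by bare name: two graph entities called "branch1" collide,
    // and the second registration silently overwrites the first.
    std::map<std::string, std::string> byName;
    byName["branch1"] = "crop-primitive";
    byName["branch1"] = "split-output";

    // Keyed by "Type:name", as layer_type_name_ID does: both survive.
    std::map<std::string, std::string> byTypeAndName;
    byTypeAndName["Crop:branch1"]  = "crop-primitive";
    byTypeAndName["Split:branch1"] = "split-output";

    std::cout << byName.size() << " vs " << byTypeAndName.size() << '\n';  // 1 vs 2
}

@@ -1926,14 +2065,14 @@ void CLDNNGraph::CreateFusedSplitConvMergePrimitive(InferenceEngine::CNNLayerPtr layer = concatLayerPtr; - 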
m_env.primitiveIDs[concatLayer->name] = splitLayer->name; // pair the last merged layer (concat or relu) with + m_env.primitiveIDs[splitLayerName] = splitLayerName; + m_env.primitiveIDs[layer_type_name_ID(convLayer1)] = splitLayerName; + m_env.primitiveIDs[layer_type_name_ID(convLayer2)] = splitLayerName; + m_env.primitiveIDs[layer_type_name_ID(concatLayer)] = splitLayerName; // pair the last merged layer (concat or relu) with // this primitive name to be used as // input prim for subsequent layers m_topology->add(splitPrim); - m_env.profilingIDs.insert(splitLayer->name); + m_env.profilingIDs.push_back(splitLayerName); } void CLDNNGraph::CreatePowerPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -1944,45 +2083,46 @@ void CLDNNGraph::CreatePowerPrimitive(InferenceEngine::CNNLayerPtr &layer) { THROW_CLDNN_EXCEPTION("Power Layer " << layer->name << "uses unsupported power value"); } + std::string powerLayerName = layer_type_name_ID(layer); if ((powerLayer->scale == 1.0f) && (powerLayer->offset == 0.0f)) { if (powerLayer->power == 0.5f) { - auto activationPrim = cldnn::activation(powerLayer->name, inputPrimitives[0], activation_sqrt); + auto activationPrim = cldnn::activation(powerLayerName, inputPrimitives[0], activation_sqrt); m_topology->add(activationPrim); - m_env.profilingIDs.insert(powerLayer->name); - m_env.primitiveIDs[powerLayer->name] = powerLayer->name; + m_env.profilingIDs.push_back(powerLayerName); + m_env.primitiveIDs[powerLayerName] = powerLayerName; } else { // skip this layer - m_env.primitiveIDs[powerLayer->name] = inputPrimitives[0]; // register the previous primID for this layer too - InitProfileInfo(layer->name, layer->type, "None", InferenceEngine::InferenceEngineProfileInfo::NOT_RUN); // Mark this layer as not run + m_env.primitiveIDs[powerLayerName] = inputPrimitives[0]; // register the previous primID for this layer too + InitProfileInfo(layer->name, layer->type, false, InferenceEngine::InferenceEngineProfileInfo::NOT_RUN); // Mark this layer as not run } } else { // create scale primitive - auto scaleValuePrimName = powerLayer->name + m_scalesTag; + auto scaleValuePrimName = powerLayerName + m_scalesTag; AddSingleValuePrimitive(scaleValuePrimName, DataTypeFromPrecision(powerLayer->precision), powerLayer->scale); cldnn::primitive_id biasValuePrimName = ""; if (powerLayer->offset != 0.0f) { - biasValuePrimName = powerLayer->name + m_biasesTag; + biasValuePrimName = powerLayerName + m_biasesTag; AddSingleValuePrimitive(biasValuePrimName, DataTypeFromPrecision(powerLayer->precision), powerLayer->offset); } auto scalePrim = cldnn::scale( - powerLayer->name, + powerLayerName, inputPrimitives[0], scaleValuePrimName, biasValuePrimName); - m_env.primitiveIDs[powerLayer->name] = powerLayer->name; + m_env.primitiveIDs[powerLayerName] = powerLayerName; m_topology->add(scalePrim); - m_env.profilingIDs.insert(powerLayer->name); + m_env.profilingIDs.push_back(powerLayerName); if (powerLayer->power == 0.5f) { - auto activationPrim = cldnn::activation(powerLayer->name+"_sqrt", powerLayer->name, activation_sqrt); + auto activationPrim = cldnn::activation(powerLayerName+"_sqrt", powerLayerName, activation_sqrt); m_topology->add(activationPrim); - m_env.profilingIDs.insert(powerLayer->name+"_sqrt"); + m_env.profilingIDs.push_back(powerLayerName+"_sqrt"); } } } @@ -2007,10 +2147,11 @@ void CLDNNGraph::CreateSoftMaxPrimitive(InferenceEngine::CNNLayerPtr &layer) { isPrevFC = true; // end of WA - auto softmaxPrim = cldnn::softmax(softmaxLayer->name, inputPrimitives[0], 
SoftmaxDimensionFromIEAxis(softmaxLayer, isPrevFC)); - m_env.primitiveIDs[softmaxLayer->name] = softmaxLayer->name; + std::string softmaxLayerName = layer_type_name_ID(layer); + auto softmaxPrim = cldnn::softmax(softmaxLayerName, inputPrimitives[0], SoftmaxDimensionFromIEAxis(softmaxLayer, isPrevFC)); + m_env.primitiveIDs[softmaxLayerName] = softmaxLayerName; m_topology->add(softmaxPrim); - m_env.profilingIDs.insert(softmaxLayer->name); + m_env.profilingIDs.push_back(softmaxLayerName); } void CLDNNGraph::CreateFullyConnectedPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -2018,13 +2159,14 @@ void CLDNNGraph::CreateFullyConnectedPrimitive(InferenceEngine::CNNLayerPtr &lay auto inputPrimitives = GetPrevLayersPrimitives(layer); auto fcLayer = dynamic_cast (layer.get()); + std::string fcLayerName = layer_type_name_ID(layer); // create bias primitive cldnn::primitive_id biasesPrimID = ""; if (fcLayer->_biases != nullptr) { - biasesPrimID = fcLayer->name + m_biasesTag; + biasesPrimID = fcLayerName + m_biasesTag; CreatePrimitiveFromBlob(biasesPrimID, fcLayer->_biases, - cldnn::layout(m_networkPrecision, m_defaultFormat, + cldnn::layout(DataTypeFromPrecision(fcLayer->precision), m_defaultFormat, cldnn::spatial(TensorValue(fcLayer->_out_num)))); } @@ -2032,7 +2174,7 @@ void CLDNNGraph::CreateFullyConnectedPrimitive(InferenceEngine::CNNLayerPtr &lay // gcc bug to resolve auto, at least for 5.4 version std::shared_ptr insData0 = fcLayer->insData[0].lock(); IE_ASSERT(insData0 != nullptr); - cldnn::primitive_id weightsPrimID = fcLayer->name + m_weightsTag; + cldnn::primitive_id weightsPrimID = fcLayerName + m_weightsTag; cldnn::tensor weightsDims; switch (insData0->dims.size()) { case 4: @@ -2048,18 +2190,18 @@ void CLDNNGraph::CreateFullyConnectedPrimitive(InferenceEngine::CNNLayerPtr &lay } CreatePrimitiveFromBlob(weightsPrimID, fcLayer->_weights, - cldnn::layout(m_networkPrecision, m_defaultFormat, weightsDims)); + cldnn::layout(DataTypeFromPrecision(fcLayer->precision), m_defaultFormat, weightsDims)); - auto fcPrim = cldnn::fully_connected(fcLayer->name, + auto fcPrim = cldnn::fully_connected(fcLayerName, inputPrimitives[0], weightsPrimID, biasesPrimID, false, 0.0f); - m_env.primitiveIDs[fcLayer->name] = fcLayer->name; + m_env.primitiveIDs[fcLayerName] = fcLayerName; m_topology->add(fcPrim); - m_env.profilingIDs.insert(fcLayer->name); + m_env.profilingIDs.push_back(fcLayerName); } void CLDNNGraph::CreatePoolingPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -2067,6 +2209,7 @@ void CLDNNGraph::CreatePoolingPrimitive(InferenceEngine::CNNLayerPtr &layer) { auto inputPrimitives = GetPrevLayersPrimitives(layer); auto poolLayer = dynamic_cast (layer.get()); + std::string poolLayerName = layer_type_name_ID(layer); auto allPads = getPaddings(*poolLayer); if (poolLayer->outData.size() > 1) { // max pooling with argmax @@ -2119,7 +2262,7 @@ void CLDNNGraph::CreatePoolingPrimitive(InferenceEngine::CNNLayerPtr &layer) { m_env.primitiveIDs[argmaxOutputID] = argmaxPrimID; // create pooling primitive itself - auto poolPrim = cldnn::pooling(poolLayer->name, + auto poolPrim = cldnn::pooling(poolLayerName, inputPrimitives[0], argmaxPrimID, cldnn::pooling_mode::max_with_argmax, @@ -2129,10 +2272,10 @@ void CLDNNGraph::CreatePoolingPrimitive(InferenceEngine::CNNLayerPtr &layer) { { 0, 0, -TensorValue(allPads.begin[X_AXIS]), -TensorValue(allPads.begin[Y_AXIS]) }, CldnnTensorFromIEDims(poolLayer->outData[0]->dims)); m_topology->add(poolPrim); - m_env.primitiveIDs[realOutputID] = poolLayer->name; + 
m_env.primitiveIDs[realOutputID] = poolLayerName; } else { // regular pooling - auto poolPrim = cldnn::pooling(poolLayer->name, + auto poolPrim = cldnn::pooling(poolLayerName, inputPrimitives[0], PoolingModeFromIEPooling(poolLayer->_type, poolLayer->_exclude_pad), cldnn::spatial(TensorValue(poolLayer->_kernel[X_AXIS]), TensorValue(poolLayer->_kernel[Y_AXIS])), // size @@ -2141,18 +2284,19 @@ void CLDNNGraph::CreatePoolingPrimitive(InferenceEngine::CNNLayerPtr &layer) { { 0, 0, -TensorValue(allPads.begin[X_AXIS]), -TensorValue(allPads.begin[Y_AXIS]) }, CldnnTensorFromIEDims(poolLayer->outData[0]->dims)); m_topology->add(poolPrim); - m_env.primitiveIDs[poolLayer->name] = poolLayer->name; + m_env.primitiveIDs[poolLayerName] = poolLayerName; } - m_env.profilingIDs.insert(poolLayer->name); + m_env.profilingIDs.push_back(poolLayerName); } void CLDNNGraph::CreateLRNPrimitive(InferenceEngine::CNNLayerPtr &layer) { ValidateLayer(layer, 1); auto inputPrimitives = GetPrevLayersPrimitives(layer); auto lrnLayer = dynamic_cast (layer.get()); + std::string lrnLayerName = layer_type_name_ID(layer); auto lrnPrim = cldnn::lrn( - lrnLayer->name, + lrnLayerName, inputPrimitives[0], lrnLayer->_size, static_cast(lrnLayer->_k), @@ -2160,9 +2304,9 @@ void CLDNNGraph::CreateLRNPrimitive(InferenceEngine::CNNLayerPtr &layer) { lrnLayer->_beta, lrnLayer->_isAcrossMaps ? cldnn_lrn_norm_region_across_channel : cldnn_lrn_norm_region_within_channel); - m_env.primitiveIDs[lrnLayer->name] = lrnLayer->name; + m_env.primitiveIDs[lrnLayerName] = lrnLayerName; m_topology->add(lrnPrim); - m_env.profilingIDs.insert(lrnLayer->name); + m_env.profilingIDs.push_back(lrnLayerName); } void CLDNNGraph::CreateActivationPrimitive(InferenceEngine::CNNLayerPtr &layer, const LayerType type) { @@ -2186,6 +2330,10 @@ void CLDNNGraph::CreateActivationPrimitive(InferenceEngine::CNNLayerPtr &layer, activationType = ReLU6; } else if (activation_type == "clamp") { activationType = Clamp; + } else if (activation_type == "exp") { + activationType = Exp; + } else if (activation_type == "not") { + activationType = Not; } else { THROW_CLDNN_EXCEPTION("Unsupported activation type (" + activation_type + ") in layer " + layer->name); @@ -2230,15 +2378,26 @@ void CLDNNGraph::CreateActivationPrimitive(InferenceEngine::CNNLayerPtr &layer, params.b = layer->GetParamAsFloat("max"); break; } + case Exp: + { + func = cldnn_activation_func_t::activation_exp; + break; + } + case Not: + { + func = cldnn_activation_func_t::activation_not; + break; + } default: THROW_CLDNN_EXCEPTION("Unsupported activation type (" + layer->type + ") in layer " + layer->name); } - auto activationPrimitive = cldnn::activation(layer->name, inputPrimitives[0], func, params); - m_env.primitiveIDs[layer->name] = layer->name; + std::string layerName = layer_type_name_ID(layer); + auto activationPrimitive = cldnn::activation(layerName, inputPrimitives[0], func, params); + m_env.primitiveIDs[layerName] = layerName; m_topology->add(activationPrimitive); - m_env.profilingIDs.insert(layer->name); + m_env.profilingIDs.push_back(layerName); } void CLDNNGraph::CreateCopyPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -2247,8 +2406,9 @@ void CLDNNGraph::CreateCopyPrimitive(InferenceEngine::CNNLayerPtr &layer) { auto copyLayer = dynamic_cast (layer.get()); // Optimize out and just update references - m_env.primitiveIDs[copyLayer->name] = inputPrimitives[0]; - InitProfileInfo(layer->name, layer->type, "None", InferenceEngine::InferenceEngineProfileInfo::OPTIMIZED_OUT); // Mark this layer as 
optimized out + std::string layerName = layer_type_name_ID(layer); + m_env.primitiveIDs[layerName] = inputPrimitives[0]; + InitProfileInfo(layerName, layer->type, false, InferenceEngine::InferenceEngineProfileInfo::OPTIMIZED_OUT); // Mark this layer as optimized out } void CLDNNGraph::CreateUpsamplingPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -2260,16 +2420,17 @@ void CLDNNGraph::CreateUpsamplingPrimitive(InferenceEngine::CNNLayerPtr &layer) uint32_t numFilter = upsamplingLayer->GetParamAsUInt("num_filter"); std::string sampleType = upsamplingLayer->GetParamAsString("sample_type"); + std::string upsamplingLayerName = layer_type_name_ID(layer); auto upsamplingPrim = cldnn::upsampling( - upsamplingLayer->name, + upsamplingLayerName, inputPrimitives[0], scale, numFilter, UpsamplingTypeFromString(sampleType)); - m_env.primitiveIDs[upsamplingLayer->name] = upsamplingLayer->name; + m_env.primitiveIDs[upsamplingLayerName] = upsamplingLayerName; m_topology->add(upsamplingPrim); - m_env.profilingIDs.insert(upsamplingLayer->name); + m_env.profilingIDs.push_back(upsamplingLayerName); } void CLDNNGraph::CreateResamplePrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -2295,16 +2456,17 @@ void CLDNNGraph::CreateResamplePrimitive(InferenceEngine::CNNLayerPtr &layer) { THROW_CLDNN_EXCEPTION("Unsupported resampling type (" + sampleType + ") in layer " + layer->name); } + std::string resampleLayerName = layer_type_name_ID(layer); auto upsamplingPrim = cldnn::upsampling( - resampleLayer->name, + resampleLayerName, inputPrimitives[0], scale, inFeatures, cldnn::upsampling_sample_type::nearest); - m_env.primitiveIDs[resampleLayer->name] = resampleLayer->name; + m_env.primitiveIDs[resampleLayerName] = resampleLayerName; m_topology->add(upsamplingPrim); - m_env.profilingIDs.insert(resampleLayer->name); + m_env.profilingIDs.push_back(resampleLayerName); } void CLDNNGraph::CreateYOLO2RegionPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -2323,8 +2485,9 @@ void CLDNNGraph::CreateYOLO2RegionPrimitive(InferenceEngine::CNNLayerPtr &layer) mask_size = static_cast(mask.size()); } + std::string YOLOregionLayerName = layer_type_name_ID(layer); auto regionPrim = cldnn::region_yolo( - YOLOregionLayer->name, + YOLOregionLayerName, inputPrimitives[0], coords, classes, @@ -2332,9 +2495,9 @@ void CLDNNGraph::CreateYOLO2RegionPrimitive(InferenceEngine::CNNLayerPtr &layer) mask_size, do_softmax); - m_env.primitiveIDs[YOLOregionLayer->name] = YOLOregionLayer->name; + m_env.primitiveIDs[YOLOregionLayerName] = YOLOregionLayerName; m_topology->add(regionPrim); - m_env.profilingIDs.insert(YOLOregionLayer->name); + m_env.profilingIDs.push_back(YOLOregionLayerName); } void CLDNNGraph::CreateYOLO2ReorgPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -2343,14 +2506,15 @@ void CLDNNGraph::CreateYOLO2ReorgPrimitive(InferenceEngine::CNNLayerPtr &layer) auto YOLOreorgLayer = dynamic_cast (layer.get()); uint32_t stride = YOLOreorgLayer->GetParamAsUInt("stride"); + std::string YOLOreorgLayerName = layer_type_name_ID(layer); auto reorgPrim = cldnn::reorg_yolo( - YOLOreorgLayer->name, + YOLOreorgLayerName, inputPrimitives[0], stride); - m_env.primitiveIDs[YOLOreorgLayer->name] = YOLOreorgLayer->name; + m_env.primitiveIDs[YOLOreorgLayerName] = YOLOreorgLayerName; m_topology->add(reorgPrim); - m_env.profilingIDs.insert(YOLOreorgLayer->name); + m_env.profilingIDs.push_back(YOLOreorgLayerName); } void CLDNNGraph::CreateArgMaxPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -2385,16 +2549,17 @@ void 
CLDNNGraph::CreateArgMaxPrimitive(InferenceEngine::CNNLayerPtr &layer) { } } + std::string ArgMaxLayerName = layer_type_name_ID(layer); auto argmaxPrim = cldnn::arg_max_min( - ArgMaxLayer->name, + ArgMaxLayerName, inputPrimitives[0], otype, top_k, chosen_axis); - m_env.primitiveIDs[ArgMaxLayer->name] = ArgMaxLayer->name; + m_env.primitiveIDs[ArgMaxLayerName] = ArgMaxLayerName; m_topology->add(argmaxPrim); - m_env.profilingIDs.insert(ArgMaxLayer->name); + m_env.profilingIDs.push_back(ArgMaxLayerName); } void CLDNNGraph::CreateMaxUnpoolingPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -2429,16 +2594,17 @@ void CLDNNGraph::CreateMaxUnpoolingPrimitive(InferenceEngine::CNNLayerPtr &layer uint32_t stride = UnpoolingLayer->GetParamAsUInt("stride"); uint32_t kernel_size = UnpoolingLayer->GetParamAsUInt("kernel_size"); + std::string UnpoolingLayerName = layer_type_name_ID(layer); auto unpoolingPrim = cldnn::max_unpooling( - UnpoolingLayer->name, + UnpoolingLayerName, real_input, argmax_mutable, cldnn::spatial(kernel_size, kernel_size), // size cldnn::spatial(stride, stride) ); // stride - m_env.primitiveIDs[UnpoolingLayer->name] = UnpoolingLayer->name; + m_env.primitiveIDs[UnpoolingLayerName] = UnpoolingLayerName; m_topology->add(unpoolingPrim); - m_env.profilingIDs.insert(UnpoolingLayer->name); + m_env.profilingIDs.push_back(UnpoolingLayerName); } void CLDNNGraph::CreateMVNPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -2450,16 +2616,17 @@ void CLDNNGraph::CreateMVNPrimitive(InferenceEngine::CNNLayerPtr &layer) { bool normalize_variance = MvnLayer->GetParamsAsBool("normalize_variance", true); float eps = MvnLayer->GetParamAsFloat("eps", 1e-10f); + std::string MvnLayerName = layer_type_name_ID(layer); auto mvnPrim = cldnn::mvn( - MvnLayer->name, + MvnLayerName, inputPrimitives[0], across_channels, normalize_variance, eps); - m_env.primitiveIDs[MvnLayer->name] = MvnLayer->name; + m_env.primitiveIDs[MvnLayerName] = MvnLayerName; m_topology->add(mvnPrim); - m_env.profilingIDs.insert(MvnLayer->name); + m_env.profilingIDs.push_back(MvnLayerName); } void CLDNNGraph::CreateTilePrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -2479,15 +2646,16 @@ void CLDNNGraph::CreateTilePrimitive(InferenceEngine::CNNLayerPtr &layer) { default: THROW_CLDNN_EXCEPTION("Unsupported tile axis: " << axis); } }; + std::string tileLayerName = layer_type_name_ID(layer); auto tilePrim = cldnn::tile( - tileLayer->name, + tileLayerName, inputPrimitives[0], cldnnAxisFromIE(axis), tiles); - m_env.primitiveIDs[tileLayer->name] = tileLayer->name; + m_env.primitiveIDs[tileLayerName] = tileLayerName; m_topology->add(tilePrim); - m_env.profilingIDs.insert(tileLayer->name); + m_env.profilingIDs.push_back(tileLayerName); } void CLDNNGraph::CreatePadPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -2532,17 +2700,18 @@ void CLDNNGraph::CreatePadPrimitive(InferenceEngine::CNNLayerPtr &layer) { else THROW_CLDNN_EXCEPTION("Invalid border mode " << mode << " in layer " << padLayer->name); + std::string padLayerName = layer_type_name_ID(layer); auto tilePrim = cldnn::border( - padLayer->name, + padLayerName, inputPrimitives[0], pads_begin, pads_end, border_mode, pad_value); - m_env.primitiveIDs[padLayer->name] = padLayer->name; + m_env.primitiveIDs[padLayerName] = padLayerName; m_topology->add(tilePrim); - m_env.profilingIDs.insert(padLayer->name); + m_env.profilingIDs.push_back(padLayerName); } std::string get_string_id(size_t i) { @@ -2557,10 +2726,11 @@ void CLDNNGraph::CreateLSTMCellPrimitive(InferenceEngine::CNNLayerPtr 
&layer) { bool hasBias = false; auto inputPrimitives = GetPrevLayersPrimitives(layer); - auto elementSize = cldnn::data_type_traits::size_of(m_networkPrecision); - cldnn::primitive_id weightID = layer->name + m_weightsTag; - cldnn::primitive_id recurrentID = layer->name + "_recurrent" + m_weightsTag; - cldnn::primitive_id biasID = layer->name + m_biasesTag; + auto elementSize = cldnn::data_type_traits::size_of(DataTypeFromPrecision(layer->precision)); + std::string layerName = layer_type_name_ID(layer); + cldnn::primitive_id weightID = layerName + m_weightsTag; + cldnn::primitive_id recurrentID = layerName + "_recurrent" + m_weightsTag; + cldnn::primitive_id biasID = layerName + m_biasesTag; auto cellLayer = dynamic_cast (layer.get()); /* check incoming CNN layer and setup required variables */ @@ -2596,16 +2766,12 @@ void CLDNNGraph::CreateLSTMCellPrimitive(InferenceEngine::CNNLayerPtr &layer) { THROW_IE_EXCEPTION << "Wrong input shapes for LSTMCell Layer " << layer->name; } - /* - * Prepare weight/bias memory primitives: - * - split weight blob into W and R - * - rearrange gate order from FICO layout in IR to IOFC expected by clDNN - */ + /* Prepare weight/bias memory primitives - split weight blob into W and R */ { cldnn::tensor wTensor = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(lstm_input_size, 4 * lstm_hidden_size)); cldnn::tensor rTensor = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(lstm_hidden_size, 4 * lstm_hidden_size)); - cldnn::layout WLayout = cldnn::layout(m_networkPrecision, m_defaultFormat, wTensor); - cldnn::layout RLayout = cldnn::layout(m_networkPrecision, m_defaultFormat, rTensor); + cldnn::layout WLayout = cldnn::layout(DataTypeFromPrecision(layer->precision), m_defaultFormat, wTensor); + cldnn::layout RLayout = cldnn::layout(DataTypeFromPrecision(layer->precision), m_defaultFormat, rTensor); auto wmem = cldnn::memory::allocate(*(m_env.engine), WLayout); auto wtmpPointer = wmem.pointer(); // implicitly maps buffer - unmap in destructor @@ -2613,33 +2779,23 @@ void CLDNNGraph::CreateLSTMCellPrimitive(InferenceEngine::CNNLayerPtr &layer) { auto rmem = cldnn::memory::allocate(*(m_env.engine), RLayout); auto rtmpPointer = rmem.pointer(); - // FICO -> IOFC - const std::vector gate_offs{2, 0, 3, 1}; - auto wLayer = dynamic_cast (layer.get()); auto pWeightsBlob = wLayer->_weights; auto blobBytes = static_cast(pWeightsBlob->buffer()); const size_t WchunkSz = lstm_input_size * elementSize; const size_t RchunkSz = lstm_hidden_size * elementSize; - for (int g = 0; g < 4; g++) { - auto wBytes = wtmpPointer.data() + gate_offs[g] * lstm_hidden_size * WchunkSz; - auto rBytes = rtmpPointer.data() + gate_offs[g] * lstm_hidden_size * RchunkSz; - for (int h = 0; h < lstm_hidden_size; h++) { - // copy "input size" elements to W - for (size_t b = 0; b < WchunkSz; b++) { - wBytes[b] = blobBytes[b]; - } - blobBytes += WchunkSz; - wBytes += WchunkSz; + auto wBytes = wtmpPointer.data(); + auto rBytes = rtmpPointer.data(); - // copy "lstm_hidden_size" elements to R - for (size_t b = 0; b < RchunkSz; b++) { - rBytes[b] = blobBytes[b]; - } - blobBytes += RchunkSz; - rBytes += RchunkSz; - } + for (int h = 0; h < 4 * lstm_hidden_size; h++) { + // copy "input size" elements to W + for (size_t b = 0; b < WchunkSz; b++) + *wBytes++ = *blobBytes++; + + // copy "lstm_hidden_size" elements to R + for (size_t b = 0; b < RchunkSz; b++) + *rBytes++ = *blobBytes++; } m_topology->add(cldnn::data(weightID, wmem)); @@ -2649,71 +2805,63 @@ void 
CLDNNGraph::CreateLSTMCellPrimitive(InferenceEngine::CNNLayerPtr &layer) { auto pBiasBlob = wLayer->_biases; if (pBiasBlob != nullptr) { cldnn::tensor bTensor = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(4 * lstm_hidden_size, 1)); - cldnn::layout BLayout = cldnn::layout(m_networkPrecision, m_defaultFormat, rTensor); + cldnn::layout BLayout = cldnn::layout(DataTypeFromPrecision(layer->precision), m_defaultFormat, rTensor); auto bmem = cldnn::memory::allocate(*(m_env.engine), BLayout); auto btmpPointer = bmem.pointer(); auto blobBytes = static_cast(pBiasBlob->buffer()); const size_t BchunkSz = lstm_hidden_size * elementSize; + auto bBytes = btmpPointer.data(); - for (int g = 0; g < 4; g++) { - auto bBytes = btmpPointer.data() + gate_offs[g] * BchunkSz; - // copy "lstm_hidden_size" elements to B - for (size_t b = 0; b < BchunkSz; b++) { - bBytes[b] = blobBytes[b]; - } - blobBytes += BchunkSz; - } + for (size_t b = 0; b < 4 * BchunkSz; b++) + *bBytes++ = *blobBytes++; m_topology->add(cldnn::data(biasID, bmem)); hasBias = true; } } - cldnn::primitive_id inReshapeID = layer->name + "_inReshape"; - cldnn::primitive_id permuteID = layer->name + "_inputReorder"; - cldnn::primitive_id inHiddenReshapeID = layer->name + "_inHiddenReshape"; + cldnn::primitive_id inReshapeID = layerName + "_inReshape"; + cldnn::primitive_id permuteID = layerName + "_inputReorder"; + cldnn::primitive_id inHiddenReshapeID = layerName + "_inHiddenReshape"; cldnn::tensor inputShape = { lstm_batch_size, 1, lstm_input_size, 1 }; cldnn::tensor hiddenStateShape = { lstm_batch_size, 1, lstm_hidden_size, 1 }; - cldnn::layout inputLayout = cldnn::layout(m_networkPrecision, cldnn::format::bfyx, inputShape); + cldnn::layout inputLayout = cldnn::layout(DataTypeFromPrecision(layer->precision), cldnn::format::bfyx, inputShape); m_topology->add(cldnn::reshape(inReshapeID, inputPrimitives[0], inputShape)); m_topology->add(cldnn::reorder(permuteID, inReshapeID, inputLayout)); - m_topology->add(cldnn::reshape(inHiddenReshapeID+"_1", inputPrimitives[1], hiddenStateShape)); - m_topology->add(cldnn::reshape(inHiddenReshapeID+"_2", inputPrimitives[2], hiddenStateShape)); + std::string hiddenInStr = inHiddenReshapeID + "_1"; + std::string cellInStr = inHiddenReshapeID + "_2"; + m_topology->add(cldnn::reshape(hiddenInStr, inputPrimitives[1], hiddenStateShape)); + m_topology->add(cldnn::reshape(cellInStr, inputPrimitives[2], hiddenStateShape)); - cldnn::tensor hiddenSz = cldnn::tensor{ 1, lstm_batch_size, lstm_hidden_size, 1 }; + cldnn::tensor hiddenSz = cldnn::tensor{ lstm_batch_size, 1, lstm_hidden_size, 1 }; cldnn::tensor cellCropSz = cldnn::tensor{0, 1, 0, 0}; - std::string hiddenInStr = inHiddenReshapeID+"_1"; - std::string cellInStr = inHiddenReshapeID+"_2"; - - std::string lstm_gemm_id = layer->name + "_lstm_gemm"; - std::string lstm_elt_id = layer->name + "_lstm_elt"; - std::string crop_id = layer->name + "_crop"; + std::string lstm_gemm_id = layerName + "_lstm_gemm"; + std::string lstm_elt_id = layerName + "_lstm_elt"; + std::string crop_id = layerName + "_crop"; m_topology->add(cldnn::lstm_gemm(lstm_gemm_id, permuteID, weightID, recurrentID, hasBias ? 
biasID : "", hiddenInStr)); - m_topology->add(cldnn::lstm_elt(lstm_elt_id, lstm_gemm_id, - cellInStr)); - - + m_topology->add(cldnn::lstm_elt(lstm_elt_id, lstm_gemm_id, cellInStr, + 0, 0, {}, {}, cldnn_lstm_offset_order_fizo)); - - cldnn::primitive_id outputHiddenID = layer->name; + cldnn::primitive_id outputHiddenID = layerName; m_topology->add(cldnn::crop(outputHiddenID, lstm_elt_id, hiddenSz, cldnn::tensor{0, 0, 0, 0})); - m_env.primitiveIDs[outputHiddenID] = outputHiddenID; - m_env.primitiveIDs[layer->outData[0]->name] = outputHiddenID; - - cldnn::primitive_id outputCellID = layer->outData[1]->name; + cldnn::primitive_id outputCellID = layer->type + ":" + layer->outData[1]->name; m_topology->add(cldnn::crop(outputCellID, lstm_elt_id, hiddenSz, cellCropSz)); - m_env.primitiveIDs[outputCellID] = outputCellID; - m_env.profilingIDs.insert(layer->name); + // output primitive IDs + m_env.primitiveIDs[outputHiddenID] = outputHiddenID; // LSTMCell:LSTMCell - "concat hidden" + m_env.primitiveIDs[layer->type + ":" + layer->outData[0]->name] = outputHiddenID; // LSTMCell:LSTMCell:0 - hidden state + m_env.primitiveIDs[outputCellID] = outputCellID; // LSTMCell:LSTMCell:1 - cell state + + m_env.profilingIDs.push_back(layerName); } void CLDNNGraph::CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -2722,15 +2870,17 @@ void CLDNNGraph::CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer) { bool hasInitialHidden = false, hasInitialCell = false, hasBias = false, isForward = true; auto inputPrimitives = GetPrevLayersPrimitives(layer); - auto elementSize = cldnn::data_type_traits::size_of(m_networkPrecision); - cldnn::primitive_id weightID = layer->name + m_weightsTag; - cldnn::primitive_id recurrentID = layer->name + "_recurrent" + m_weightsTag; - cldnn::primitive_id biasID = layer->name + m_biasesTag; - auto rnnLayer = dynamic_cast (layer.get()); + auto elementSize = cldnn::data_type_traits::size_of(DataTypeFromPrecision(layer->precision)); + std::string layerName = layer_type_name_ID(layer); + cldnn::primitive_id weightID = layerName + m_weightsTag; + cldnn::primitive_id recurrentID = layerName + "_recurrent" + m_weightsTag; + cldnn::primitive_id biasID = layerName + m_biasesTag; + auto rnnLayer = dynamic_cast (layer.get()); + bool permute_input = (1 != rnnLayer->axis); /* check incoming CNN layer and setup required variables */ { - if (rnnLayer->cellType != "LSTM") + if (rnnLayer->cellType != RNNSequenceLayer::LSTM) THROW_IE_EXCEPTION << "RNN layer supports only LSTM like cell"; auto in_data0 = layer->insData[0].lock(); @@ -2740,7 +2890,7 @@ void CLDNNGraph::CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer) { auto in_dims0 = in_data0->dims; auto out_dims0 = layer->outData[0]->dims; - if (1 == rnnLayer->axis) { + if (!permute_input) { lstm_batch_size = in_dims0[2]; lstm_sequence_len = in_dims0[1]; } else { @@ -2767,24 +2917,20 @@ void CLDNNGraph::CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer) { hasInitialCell = true; } - if (rnnLayer->direction != RNNLayer::RNN_FWD && rnnLayer->direction != RNNLayer::RNN_BWD) + if (rnnLayer->direction != RNNSequenceLayer::FWD && rnnLayer->direction != RNNSequenceLayer::BWD) THROW_IE_EXCEPTION << "Support only forward and backward direction for RNN Layer " << layer->name; - isForward = rnnLayer->direction == RNNLayer::RNN_FWD; + isForward = rnnLayer->direction == RNNSequenceLayer::FWD; if (in_dims0.size() != 3 || in_dims1.size() != 2 || in_dims2.size() != 2) THROW_IE_EXCEPTION << "Wrong input shapes for RNN Layer " << layer->name; } - /* - * 
Prepare weight/bias memory primitives: - * - split weight blob into W and R - * - rearrange gate order from FICO layout in IR to IOFC expected by clDNN - */ + /* Prepare weight/bias memory primitives - split weight blob into W and R */ { cldnn::tensor wTensor = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(lstm_input_size, 4 * lstm_hidden_size)); cldnn::tensor rTensor = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(lstm_hidden_size, 4 * lstm_hidden_size)); - cldnn::layout WLayout = cldnn::layout(m_networkPrecision, m_defaultFormat, wTensor); - cldnn::layout RLayout = cldnn::layout(m_networkPrecision, m_defaultFormat, rTensor); + cldnn::layout WLayout = cldnn::layout(DataTypeFromPrecision(layer->precision), m_defaultFormat, wTensor); + cldnn::layout RLayout = cldnn::layout(DataTypeFromPrecision(layer->precision), m_defaultFormat, rTensor); auto wmem = cldnn::memory::allocate(*(m_env.engine), WLayout); auto wtmpPointer = wmem.pointer(); // implicitly maps buffer - unmap in destructor @@ -2792,33 +2938,23 @@ void CLDNNGraph::CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer) { auto rmem = cldnn::memory::allocate(*(m_env.engine), RLayout); auto rtmpPointer = rmem.pointer(); - // FICO -> IOFC - const std::vector gate_offs{2, 0, 3, 1}; - auto wLayer = dynamic_cast (layer.get()); auto pWeightsBlob = wLayer->_weights; auto blobBytes = static_cast(pWeightsBlob->buffer()); const size_t WchunkSz = lstm_input_size * elementSize; const size_t RchunkSz = lstm_hidden_size * elementSize; - for (int g = 0; g < 4; g++) { - auto wBytes = wtmpPointer.data() + gate_offs[g] * lstm_hidden_size * WchunkSz; - auto rBytes = rtmpPointer.data() + gate_offs[g] * lstm_hidden_size * RchunkSz; - for (int h = 0; h < lstm_hidden_size; h++) { - // copy "input size" elements to W - for (size_t b = 0; b < WchunkSz; b++) { - wBytes[b] = blobBytes[b]; - } - blobBytes += WchunkSz; - wBytes += WchunkSz; + auto wBytes = wtmpPointer.data(); + auto rBytes = rtmpPointer.data(); - // copy "lstm_hidden_size" elements to R - for (size_t b = 0; b < RchunkSz; b++) { - rBytes[b] = blobBytes[b]; - } - blobBytes += RchunkSz; - rBytes += RchunkSz; - } + for (int h = 0; h < 4 * lstm_hidden_size; h++) { + // copy "input size" elements to W + for (size_t b = 0; b < WchunkSz; b++) + *wBytes++ = *blobBytes++; + + // copy "lstm_hidden_size" elements to R + for (size_t b = 0; b < RchunkSz; b++) + *rBytes++ = *blobBytes++; } m_topology->add(cldnn::data(weightID, wmem)); @@ -2828,22 +2964,17 @@ void CLDNNGraph::CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer) { auto pBiasBlob = wLayer->_biases; if (pBiasBlob != nullptr) { cldnn::tensor bTensor = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(4 * lstm_hidden_size, 1)); - cldnn::layout BLayout = cldnn::layout(m_networkPrecision, m_defaultFormat, rTensor); + cldnn::layout BLayout = cldnn::layout(DataTypeFromPrecision(layer->precision), m_defaultFormat, rTensor); auto bmem = cldnn::memory::allocate(*(m_env.engine), BLayout); auto btmpPointer = bmem.pointer(); auto blobBytes = static_cast(pBiasBlob->buffer()); const size_t BchunkSz = lstm_hidden_size * elementSize; + auto bBytes = btmpPointer.data(); - for (int g = 0; g < 4; g++) { - auto bBytes = btmpPointer.data() + gate_offs[g] * BchunkSz; - // copy "lstm_hidden_size" elements to B - for (size_t b = 0; b < BchunkSz; b++) { - bBytes[b] = blobBytes[b]; - } - blobBytes += BchunkSz; - } + for (size_t b = 0; b < 4 * BchunkSz; b++) + *bBytes++ = *blobBytes++; 
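The simplified weight copy above (used identically by CreateLSTMCellPrimitive and CreateRNNPrimitive) is a plain interleaved split: each of the 4 * lstm_hidden_size rows of the packed IR blob carries lstm_input_size weight bytes followed by lstm_hidden_size recurrent bytes, with no gate reshuffling. A standalone C++ sketch of the same split on raw bytes (standard library only, not part of the patch):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch of the simplified LSTM weight split: the IR packs each of the
// 4 * hidden_size gate rows as [input_size W bytes | hidden_size R bytes],
// and the loop copies them out in order instead of scattering gates.
void split_weights(const std::vector<uint8_t>& blob,
                   std::vector<uint8_t>& w, std::vector<uint8_t>& r,
                   size_t input_size, size_t hidden_size, size_t elem_size) {
    const size_t wChunk = input_size * elem_size;
    const size_t rChunk = hidden_size * elem_size;
    const uint8_t* src = blob.data();
    w.clear(); r.clear();
    for (size_t row = 0; row < 4 * hidden_size; ++row) {
        w.insert(w.end(), src, src + wChunk); src += wChunk;  // input-to-gate part
        r.insert(r.end(), src, src + rChunk); src += rChunk;  // recurrent part
    }
}

int main() {
    const size_t in = 3, hid = 2, elem = 1;
    std::vector<uint8_t> blob(4 * hid * (in + hid) * elem, 0);
    std::vector<uint8_t> w, r;
    split_weights(blob, w, r, in, hid, elem);
    // w now holds 4*hid*in bytes, r holds 4*hid*hid bytes.
}
```

The gate reordering that used to happen here has not disappeared, it moved: the deleted `gate_offs{2, 0, 3, 1}` table scattered the IR's F, I, C, O gate blocks into I, O, F, C order, and the new `cldnn_lstm_offset_order_fizo` argument to `lstm_elt` instead tells the kernel which order the now-unshuffled blob uses. A sketch of what the removed table computed (the enum semantics are inferred from the deleted code, not stated in this hunk):

```cpp
#include <array>
#include <iostream>
#include <string>

int main() {
    const std::array<std::string, 4> ir_order = {"f", "i", "c", "o"};
    const std::array<int, 4> gate_offs = {2, 0, 3, 1};  // destination slot per IR gate

    std::array<std::string, 4> shuffled;
    for (int g = 0; g < 4; ++g)
        shuffled[gate_offs[g]] = ir_order[g];

    for (const auto& gate : shuffled) std::cout << gate;  // prints "iofc"
    std::cout << "\n";
}
```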
m_topology->add(cldnn::data(biasID, bmem)); hasBias = true; @@ -2853,13 +2984,19 @@ void CLDNNGraph::CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer) { std::vector> input_ids_offsets; std::vector output_ids_offsets; - cldnn::primitive_id inReshapeID = layer->name + "_inReshape"; - cldnn::primitive_id permuteID = layer->name + "_inputReorder"; - cldnn::primitive_id inHiddenReshapeID = layer->name + "_inHiddenReshape"; + cldnn::primitive_id inReshapeID = layerName + "_inReshape"; + cldnn::primitive_id permuteID = layerName + "_inputReorder"; + cldnn::primitive_id inHiddenReshapeID = layerName + "_inHiddenReshape"; + + cldnn::tensor inputShape; - cldnn::tensor inputShape = { lstm_batch_size, lstm_sequence_len, lstm_input_size, 1 }; + if (permute_input) { + inputShape = { lstm_sequence_len, lstm_batch_size, lstm_input_size, 1 }; + } else { + inputShape = { lstm_batch_size, lstm_sequence_len, lstm_input_size, 1 }; + } cldnn::tensor hiddenStateShape = { lstm_batch_size, 1, lstm_hidden_size, 1 }; - cldnn::layout inputLayout = cldnn::layout(m_networkPrecision, cldnn::format::bfyx, inputShape); + cldnn::layout inputLayout = cldnn::layout(DataTypeFromPrecision(layer->precision), cldnn::format::bfyx, inputShape); m_topology->add(cldnn::reshape(inReshapeID, inputPrimitives[0], inputShape)); m_topology->add(cldnn::reorder(permuteID, inReshapeID, inputLayout)); @@ -2869,18 +3006,24 @@ void CLDNNGraph::CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer) { for (int i = 0; i < lstm_sequence_len; ++i) input_ids_offsets.push_back({ get_string_id(i), {0, i, 0, 0} }); - cldnn::primitive_id inputSplitID = layer->name + "_inputSplit"; - m_topology->add(cldnn::split(inputSplitID, permuteID, input_ids_offsets)); + cldnn::primitive_id inputSplitID = layerName + "_inputSplit"; - cldnn::tensor hiddenSz = cldnn::tensor{ 1, lstm_batch_size, lstm_hidden_size, 1 }; + if (permute_input) { + m_topology->add(cldnn::permute(layerName + "_inputSwap", permuteID, { 1, 0, 2, 3 })); + m_topology->add(cldnn::split(inputSplitID, layerName + "_inputSwap", input_ids_offsets)); + } else { + m_topology->add(cldnn::split(inputSplitID, permuteID, input_ids_offsets)); + } + + cldnn::tensor hiddenSz = cldnn::tensor{ lstm_batch_size, 1, lstm_hidden_size, 1 }; cldnn::tensor cellCropSz = cldnn::tensor{0, 1, 0, 0}; std::string hiddenStr = hasInitialHidden ? inHiddenReshapeID+"_1" : ""; std::string cellStr = hasInitialCell ? inHiddenReshapeID+"_2" : ""; for (int i = 0; i < lstm_sequence_len; ++i) { - std::string lstm_gemm_id = layer->name + "_lstm_gemm" + get_string_id(i); - std::string lstm_elt_id = layer->name + "_lstm_elt" + get_string_id(i); - std::string crop_id = layer->name + "_crop" + get_string_id(i); + std::string lstm_gemm_id = layerName + "_lstm_gemm" + get_string_id(i); + std::string lstm_elt_id = layerName + "_lstm_elt" + get_string_id(i); + std::string crop_id = layerName + "_crop" + get_string_id(i); int seqIdx = isForward ? i : lstm_sequence_len - 1 - i; m_topology->add(cldnn::lstm_gemm(lstm_gemm_id, inputSplitID + ":" + get_string_id(seqIdx), @@ -2888,54 +3031,46 @@ void CLDNNGraph::CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer) { hasBias ? 
biasID : "", hiddenStr)); m_topology->add(cldnn::lstm_elt(lstm_elt_id, lstm_gemm_id, - cellStr)); + cellStr, 0, 0, {}, {}, + cldnn_lstm_offset_order_fizo)); hiddenStr = crop_id + ":hidden"; - m_topology->add(cldnn::crop(hiddenStr, lstm_elt_id, hiddenSz, cldnn::tensor{0, 0, 0, 0})); + cellStr = crop_id + ":cell"; + m_topology->add(cldnn::crop(hiddenStr, lstm_elt_id, hiddenSz, cldnn::tensor{ 0, 0, 0, 0 })); output_ids_offsets.push_back(hiddenStr); if (i < lstm_sequence_len - 1) { - cellStr = crop_id + ":cell"; m_topology->add(cldnn::crop(cellStr, lstm_elt_id, hiddenSz, cellCropSz)); } else { // last hidden state crop (output 2) if (layer->outData.size() > 1) { - cldnn::primitive_id outputHiddenID = layer->outData[1]->name; + cldnn::primitive_id outputHiddenID = layer->type + ":" + layer->outData[1]->name; m_env.primitiveIDs[hiddenStr] = hiddenStr; m_env.primitiveIDs[outputHiddenID] = hiddenStr; } // last cell state crop (output 3) if (layer->outData.size() > 2) { - cldnn::primitive_id outputCellID = layer->outData[2]->name; - auto cropPrim = cldnn::crop(outputCellID, lstm_elt_id, hiddenSz, cellCropSz); - m_topology->add(cropPrim); - m_env.primitiveIDs[outputCellID] = outputCellID; + m_topology->add(cldnn::crop(cellStr, lstm_elt_id, hiddenSz, cellCropSz)); + cldnn::primitive_id outputCellID = layer->type + ":" + layer->outData[2]->name; + m_env.primitiveIDs[cellStr] = cellStr; + m_env.primitiveIDs[outputCellID] = cellStr; } } } if (!isForward) std::reverse(output_ids_offsets.begin(), output_ids_offsets.end()); - // main output (concatenated hidden) - cldnn::primitive_id concatID = layer->name + "_outputConcat"; - m_topology->add(cldnn::concatenation(concatID, output_ids_offsets, cldnn::concatenation::along_f)); - - // permute output to [1, batch, sequence, hidden_size] - cldnn::tensor outputTensor; - if (1 == rnnLayer->axis) { - outputTensor = cldnn::tensor(cldnn::batch(1), cldnn::feature(lstm_batch_size), cldnn::spatial(lstm_hidden_size, lstm_sequence_len)); + if (permute_input) { + m_topology->add(cldnn::concatenation(layerName + "_outputConcat", output_ids_offsets, cldnn::concatenation::along_f)); + m_topology->add(cldnn::permute(layerName, layerName + "_outputConcat", { 1, 0, 2, 3 })); } else { - outputTensor = cldnn::tensor(cldnn::batch(1), cldnn::feature(lstm_sequence_len), cldnn::spatial(lstm_hidden_size, lstm_batch_size)); + m_topology->add(cldnn::concatenation(layerName, output_ids_offsets, cldnn::concatenation::along_f)); } - cldnn::layout outputLayout = cldnn::layout(m_networkPrecision, cldnn::format::bfyx, outputTensor); - cldnn::primitive_id outReshapeID = layer->name + "_outReshape"; - m_topology->add(cldnn::reshape(outReshapeID, concatID, outputTensor)); - m_topology->add(cldnn::reorder(layer->name, outReshapeID, outputLayout)); - m_env.primitiveIDs[layer->name] = layer->name; - m_env.primitiveIDs[layer->outData[0]->name] = layer->name; - m_env.profilingIDs.insert(layer->name); + m_env.primitiveIDs[layerName] = layerName; + m_env.primitiveIDs[layer->type + ":" + layer->outData[0]->name] = layerName; + m_env.profilingIDs.push_back(layerName); } void CLDNNGraph::AddConstantBlobInput(InferenceEngine::CNNLayerPtr &layer) { @@ -2952,7 +3087,8 @@ void CLDNNGraph::AddConstantBlobInput(InferenceEngine::CNNLayerPtr &layer) { break; case 2: constTensor = cldnn::tensor(TensorValue(constDims[1]), TensorValue(constDims[0]), 1, 1); break; - case 1: // not implemented yet. 
+ case 1: constTensor = cldnn::tensor(TensorValue(constDims[0]), 1, 1, 1); + break; default: THROW_CLDNN_EXCEPTION("Invalid constant blob dimensions"); } @@ -2962,23 +3098,10 @@ void CLDNNGraph::AddConstantBlobInput(InferenceEngine::CNNLayerPtr &layer) { constTensor); size_t bytes = constLayout.bytes_count(); - cldnn::primitive_id constPrimID = layer->name; - - /* clDNN Constant Propagator bug WA - use MutableData primitive instead of Data - to prevent FP16 -> FP32 conversion loss and crash */ - // CreatePrimitiveFromBlob(constPrimID, constBlob, constLayout); - auto mem = cldnn::memory::allocate(*(m_env.engine), constLayout); - auto tmpPointer = mem.pointer(); // implicitly maps buffer - unmap in destructor - auto buf = tmpPointer.data(); - auto bufSize = constLayout.bytes_count(); + cldnn::primitive_id constPrimID = layer_type_name_ID(layer); - auto data = static_cast(constBlob->buffer()); - for (size_t i = 0; i < bufSize; i++) { - buf[i] = data[i]; - } - m_topology->add(cldnn::mutable_data(constPrimID, mem)); - - m_env.primitiveIDs[layer->name] = constPrimID; + CreatePrimitiveFromBlob(constPrimID, constBlob, constLayout); + m_env.primitiveIDs[constPrimID] = constPrimID; } void CLDNNGraph::CreateConvolutionPrimitive(InferenceEngine::CNNLayerPtr &layer) { @@ -2998,20 +3121,202 @@ void CLDNNGraph::CreateConvolutionPrimitive(InferenceEngine::CNNLayerPtr &layer) cldnn::tensor dilation = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(convLayer->_dilation[X_AXIS], convLayer->_dilation[Y_AXIS])); - auto convPrim = cldnn::convolution(convLayer->name, - inputPrimitives[0], - weightPrimID, - biasPrimID, - stride, - padding, - dilation, - false, - 0.0f, - CldnnTensorFromIEDims(convLayer->outData[0]->dims)); + std::string convLayerName = layer_type_name_ID(layer); + if (convLayer->_group >= 16) { + auto convPrim = cldnn::convolution(convLayerName, + inputPrimitives[0], + weightPrimID, + biasPrimID, + convLayer->_group, + stride, + padding, + dilation, + false, + 0.0f, + CldnnTensorFromIEDims(convLayer->outData[0]->dims)); + m_topology->add(convPrim); + } else { + auto convPrim = cldnn::convolution(convLayerName, + inputPrimitives[0], + weightPrimID, + biasPrimID, + stride, + padding, + dilation, + false, + 0.0f, + CldnnTensorFromIEDims(convLayer->outData[0]->dims)); + m_topology->add(convPrim); + } + m_env.primitiveIDs[convLayerName] = convLayerName; + m_env.profilingIDs.push_back(convLayerName); +} + +void CLDNNGraph::CreateGatherPrimitive(InferenceEngine::CNNLayerPtr &layer) { + ValidateLayer(layer, 2); + + auto inputPrimitives = GetPrevLayersPrimitives(layer); + auto gatherLayer = dynamic_cast (layer.get()); + + int axis = gatherLayer->GetParamAsInt("axis", 0); + + // Be careful: TensorFlow has a negative-axis interpretation bug.
Here: -3 = b, -2 = f, -1 = y, but must be -3 = f, -2 = y, -1 = x + auto cldnnAxisFromIE = [](int axis) { + switch (axis) { + case 0: return cldnn::gather::gather_axis::along_b; + case 1: return cldnn::gather::gather_axis::along_f; + case 2: return cldnn::gather::gather_axis::along_y; + case 3: return cldnn::gather::gather_axis::along_x; + case -1: return cldnn::gather::gather_axis::along_y; + case -2: return cldnn::gather::gather_axis::along_f; + case -3: return cldnn::gather::gather_axis::along_b; + default: THROW_CLDNN_EXCEPTION("Unsupported gather axis: " << axis); + } + }; - m_env.primitiveIDs[convLayer->name] = convLayer->name; - m_topology->add(convPrim); - m_env.profilingIDs.insert(convLayer->name); + std::string gatherLayerName = layer_type_name_ID(layer); + auto gatherPrim = cldnn::gather( + gatherLayerName, + inputPrimitives[0], + inputPrimitives[1], + cldnnAxisFromIE(axis), + CldnnTensorFromIEDims(gatherLayer->outData[0]->dims)); + + m_env.primitiveIDs[gatherLayerName] = gatherLayerName; + m_topology->add(gatherPrim); + m_env.profilingIDs.push_back(gatherLayerName); +} + +void CLDNNGraph::CreateDepthToSpacePrimitive(InferenceEngine::CNNLayerPtr &layer) { + ValidateLayer(layer, 1); + + auto inputPrimitives = GetPrevLayersPrimitives(layer); + auto depthToSpace = dynamic_cast (layer.get()); + + size_t blockSize = depthToSpace->GetParamAsInt("block_size", 2); + + if (depthToSpace->input().get()->dims.size() != 4) + THROW_CLDNN_EXCEPTION("Unsupported size of tensor " << depthToSpace->input().get()->dims.size()); + + size_t blockSizeSquare = blockSize * blockSize; + + if (depthToSpace->input().get()->dims[2] % blockSizeSquare != 0) + THROW_CLDNN_EXCEPTION("The depth of the input tensor must be divisible by squared block size = " << blockSizeSquare); + + std::string depthToSpaceName = layer_type_name_ID(layer); + auto depthToSpacePrim = cldnn::depth_to_space( + depthToSpaceName, + inputPrimitives[0], + blockSize); + + m_env.primitiveIDs[depthToSpaceName] = depthToSpaceName; + m_topology->add(depthToSpacePrim); + m_env.profilingIDs.push_back(depthToSpaceName); +} + +void CLDNNGraph::CreateShuffleChannelsPrimitive(InferenceEngine::CNNLayerPtr &layer) { + ValidateLayer(layer, 1); + + auto inputPrimitives = GetPrevLayersPrimitives(layer); + auto shuffleChannels = dynamic_cast (layer.get()); + const int32_t numberOfDims = shuffleChannels->input()->getDims().size(); + + int32_t group = shuffleChannels->GetParamAsInt("group", 1); + int32_t axis = shuffleChannels->GetParamAsInt("axis", 1); + + if (axis < 0) + axis += numberOfDims; + + if (axis < 0 || axis >= numberOfDims) + THROW_CLDNN_EXCEPTION("Incorrect axis value! Actual axis is " + std::to_string(axis)); + + if (group < 1) + THROW_CLDNN_EXCEPTION("Invalid group size value (should equal at least one). Actual group size is " + + std::to_string(group)); + + if (shuffleChannels->input().get()->getDims()[axis] % group != 0) + THROW_CLDNN_EXCEPTION("Group parameter must evenly divide the channel dimension. 
Actual group size is " + + std::to_string(group)); + + std::string shuffleChannelsName = layer_type_name_ID(layer); + auto shuffleChannelsPrim = cldnn::shuffle_channels( + shuffleChannelsName, + inputPrimitives[0], + group, + axis); + + m_env.primitiveIDs[shuffleChannelsName] = shuffleChannelsName; + m_topology->add(shuffleChannelsPrim); + m_env.profilingIDs.push_back(shuffleChannelsName); +} + +void CLDNNGraph::CreateStridedSlicePrimitive(InferenceEngine::CNNLayerPtr &layer) { + auto inputPrimitives = GetPrevLayersPrimitives(layer); + auto stridedSliceLayer = dynamic_cast (layer.get()); + + auto tmp = stridedSliceLayer->GetParamAsUInts("end_mask"); + std::vector end_mask(tmp.begin(), tmp.end()); + tmp = stridedSliceLayer->GetParamAsUInts("begin_mask"); + std::vector begin_mask(tmp.begin(), tmp.end()); + tmp = stridedSliceLayer->GetParamAsUInts("new_axis_mask"); + std::vector new_axis_mask(tmp.begin(), tmp.end()); + tmp = stridedSliceLayer->GetParamAsUInts("shrink_axis_mask"); + std::vector shrink_axis_mask(tmp.begin(), tmp.end()); + + std::string stridedSliceLayerName = layer_type_name_ID(layer); + auto stridedSlicePrim = cldnn::strided_slice( + stridedSliceLayerName, + inputPrimitives[0], inputPrimitives[1], inputPrimitives[2], inputPrimitives[3], + begin_mask, end_mask, new_axis_mask, shrink_axis_mask); + + m_env.primitiveIDs[stridedSliceLayerName] = stridedSliceLayerName; + m_topology->add(stridedSlicePrim); + m_env.profilingIDs.push_back(stridedSliceLayerName); +} + +void CLDNNGraph::CreateReverseSequencePrimitive(InferenceEngine::CNNLayerPtr &layer) { + ValidateLayer(layer, 2); + + auto inputPrimitives = GetPrevLayersPrimitives(layer); + auto reverseSequence = dynamic_cast (layer.get()); + const int32_t numberOfDims = reverseSequence->input()->getDims().size(); + + const auto input = reverseSequence->insData[0].lock()->getDims(); + const auto sequence_lengths = reverseSequence->insData[1].lock()->getDims(); + + int32_t batch_axis = reverseSequence->GetParamAsInt("batch_axis", 0); + int32_t seq_axis = reverseSequence->GetParamAsInt("seq_axis", 1); + + if (batch_axis < 0) + batch_axis += input.size(); + + if (seq_axis < 0) + seq_axis += input.size(); + + if (batch_axis == seq_axis) + THROW_CLDNN_EXCEPTION("Batch axis and sequence axis should not be equal\n"); + + if (seq_axis < 0 || seq_axis >= input.size()) + THROW_CLDNN_EXCEPTION("Incorrect Sequence axis value! Actual axis is " + std::to_string(seq_axis)); + + if (batch_axis < 0 || batch_axis >= input.size()) + THROW_CLDNN_EXCEPTION("Incorrect Batch axis value! Actual axis is " + std::to_string(batch_axis)); + + if (sequence_lengths[0] != input[batch_axis]) + THROW_CLDNN_EXCEPTION("Sequence lengths must be a vector of length " + std::to_string(input[batch_axis]) + + "!
Actual axis is " + std::to_string(sequence_lengths[0])); + + std::string reverseSequenceLayerName = layer_type_name_ID(layer); + auto reverseSequencePrim = cldnn::reverse_sequence( + reverseSequenceLayerName, + inputPrimitives[0], + inputPrimitives[1], + seq_axis, + batch_axis); + + m_env.primitiveIDs[reverseSequenceLayerName] = reverseSequenceLayerName; + m_topology->add(reverseSequencePrim); + m_env.profilingIDs.push_back(reverseSequence->name); } bool CLDNNGraph::IsValidSplitConvMerge(const InferenceEngine::SplitLayer *splitLayer) const { @@ -3063,7 +3368,7 @@ bool CLDNNGraph::IsValidSplitConvMerge(const InferenceEngine::SplitLayer *splitL return true; } -void CLDNNGraph::AddInputPrimitive(InferenceEngine::InputInfo::Ptr inputInfo) { +void CLDNNGraph::AddInputPrimitive(InferenceEngine::InputInfo::Ptr inputInfo, Precision inputPrecision) { // first create and add the input layout auto inputDims = inputInfo->getDims(); InferenceEngine::Layout l = inputInfo->getTensorDesc().getLayout(); @@ -3091,7 +3396,7 @@ void CLDNNGraph::AddInputPrimitive(InferenceEngine::InputInfo::Ptr inputInfo) { break; case 3: if (InferenceEngine::Layout::CHW == l) { - dataTensor = cldnn::tensor(TensorValue(inputDims[2]), TensorValue(inputDims[1]), TensorValue(inputDims[0]), 1); + dataTensor = cldnn::tensor(TensorValue(inputDims[2]), TensorValue(inputDims[1]), 1, TensorValue(inputDims[0])); } else { THROW_CLDNN_EXCEPTION("Unsupported layout (" << DebugOptions::IELayoutToString(l) << ") in 3D input " + inputInfo->name()); } @@ -3105,18 +3410,21 @@ void CLDNNGraph::AddInputPrimitive(InferenceEngine::InputInfo::Ptr inputInfo) { THROW_CLDNN_EXCEPTION("Unsupported layout (" << DebugOptions::IELayoutToString(l) << ") in 2D input " + inputInfo->name()); } break; - case 1: // not implemented yet. 
+ case 1: + dataTensor = cldnn::tensor(TensorValue(inputDims[0]), 1, 1, 1); + break; default: THROW_CLDNN_EXCEPTION("Invalid data dimensions"); } cldnn::layout inputLayout(DataTypeFromPrecision(inputInfo->getInputPrecision()), FormatFromLayout(l), dataTensor); - auto inputName = inputInfo->name(); - m_topology->add(cldnn::input_layout(inputName, inputLayout)); // save the input dims - m_env.inputLayouts.insert({ inputName, inputLayout }); + m_env.inputLayouts.insert({ inputInfo->name(), inputLayout }); + + auto inputName = "Input:" + inputInfo->name(); + m_topology->add(cldnn::input_layout(inputName, inputLayout)); // create preprocess primitive for this input auto preProcess = inputInfo->getPreProcess(); @@ -3124,7 +3432,7 @@ void CLDNNGraph::AddInputPrimitive(InferenceEngine::InputInfo::Ptr inputInfo) { size_t meanChannels = preProcess.getNumberOfChannels(); inputLayout.format = m_defaultFormat; inputLayout.size = inputLayout.size.transform(m_defaultFormat, 1); - inputLayout.data_type = m_networkPrecision; + inputLayout.data_type = DataTypeFromPrecision(inputPrecision); auto preprocessPrimID = inputName + m_preProcessTag; if ((meanChannels > 0) && @@ -3144,8 +3452,8 @@ void CLDNNGraph::AddInputPrimitive(InferenceEngine::InputInfo::Ptr inputInfo) { } } m_topology->add(cldnn::reorder(preprocessPrimID, inputName, inputLayout, meanValues)); - m_env.profilingIDs.insert(preprocessPrimID); - InitProfileInfo(preprocessPrimID, "Reorder", "GPU", InferenceEngine::InferenceEngineProfileInfo::EXECUTED); + m_env.profilingIDs.push_back(preprocessPrimID); + InitProfileInfo(preprocessPrimID, "Reorder"); } break; @@ -3189,8 +3497,8 @@ void CLDNNGraph::AddInputPrimitive(InferenceEngine::InputInfo::Ptr inputInfo) { inputName, inputLayout, inputName + m_meanValuesTag)); - m_env.profilingIDs.insert(preprocessPrimID); - InitProfileInfo(preprocessPrimID, "Reorder", "GPU", InferenceEngine::InferenceEngineProfileInfo::EXECUTED); + m_env.profilingIDs.push_back(preprocessPrimID); + InitProfileInfo(preprocessPrimID, "Reorder"); } break; @@ -3212,12 +3520,18 @@ std::vector CLDNNGraph::GetPrevLayersPrimitives(const Infer THROW_CLDNN_EXCEPTION("Nonexistent input for layer: " << layer->name); } auto prevCreator = prevData->creatorLayer.lock(); - auto prevName = prevCreator ? prevCreator->name : prevData->name; - if (prevCreator && prevCreator->outData.size() > 1) { - inputPrimitives.push_back(m_env.primitiveIDs.at(prevData->name)); + std::string prevName; + + if (prevCreator) { + prevName = prevCreator->type + ":"; + if (prevCreator->outData.size() > 1) + prevName += prevData->name; + else + prevName += prevCreator->name; } else { - inputPrimitives.push_back(m_env.primitiveIDs.at(prevName)); + prevName = prevData->name; } + inputPrimitives.push_back(m_env.primitiveIDs.at(prevName)); } return inputPrimitives; } @@ -3230,12 +3544,21 @@ void CLDNNGraph::AddOutputPrimitive(std::string outputName, const InferenceEngin outputData->layout != InferenceEngine::NC) { THROW_CLDNN_EXCEPTION("Unsupported layout (" << DebugOptions::IELayoutToString(outputData->layout) << ") in output: " << outputName); } + + auto outputCreator = outputData->getCreatorLayer().lock(); + std::string outLayerName = outputCreator->type + ":"; + + if (outputCreator->outData.size() > 1) + outLayerName += outputName; + else + outLayerName += outputCreator->name; + auto outputReorderID = outputName + m_postProcessTag; Precision precision = outputPrecision == Precision::UNSPECIFIED ? outputData->getPrecision() : outputPrecision; // Find correct output ID. 
Start with name stored in IR. - std::string outputID = outputName; - std::string finalID = m_env.primitiveIDs.at(outputName); + std::string outputID = outLayerName; + std::string finalID = m_env.primitiveIDs.at(outLayerName); while (outputID != finalID) { auto prim = m_env.primitiveIDs.find(finalID); @@ -3251,8 +3574,8 @@ void CLDNNGraph::AddOutputPrimitive(std::string outputName, const InferenceEngin FormatFromLayout(outputData->getLayout()), DataTypeFromPrecision(precision))); m_env.primitiveIDs[outputName] = outputReorderID; - m_env.profilingIDs.insert(outputReorderID); - InitProfileInfo(outputReorderID, "Reorder", "GPU", InferenceEngine::InferenceEngineProfileInfo::EXECUTED); + m_env.profilingIDs.push_back(outputReorderID); + InitProfileInfo(outputReorderID, "Reorder"); m_env.outputDims[outputName] = outputData->dims; m_env.prevPrimitiveIDs[outputReorderID] = {outputName}; } @@ -3293,6 +3616,8 @@ cldnn::data_types CLDNNGraph::DataTypeFromPrecision(InferenceEngine::Precision p return cldnn::data_types::f16; case Precision::U8: return cldnn::data_types::u8; + case Precision::I32: + return cldnn::data_types::i32; default: THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "The plugin does not support " << p.name() << " precision"; break; @@ -3304,6 +3629,7 @@ cldnn::format CLDNNGraph::FormatFromLayout(InferenceEngine::Layout l) { case InferenceEngine::Layout::NCHW: case InferenceEngine::Layout::NC: case InferenceEngine::Layout::CHW: + case InferenceEngine::Layout::C: return cldnn::format::bfyx; case InferenceEngine::Layout::NHWC: return cldnn::format::byxf; @@ -3371,7 +3697,7 @@ void CLDNNGraph::CreateGenericLayerBlobPrimitives(const InferenceEngine::Generic THROW_CLDNN_EXCEPTION("Unhandled blob dim in layer " + layer->name); } CreatePrimitiveFromBlob( - layer->name + "_" + blob.first + m_weightsTag, + layer->type + ":" + layer->name + "_" + blob.first + m_weightsTag, blob.second, cldnn::layout( DataTypeFromPrecision(blob.second->precision()), @@ -3412,12 +3738,15 @@ CLDNNGraph::CreateInferRequestImpl(InputsDataMap networkInputs, OutputsDataMap n void CLDNNGraph::InitProfileInfo(const std::string& layerName, const std::string& layerType, - const std::string& execType, + bool isCPU, InferenceEngine::InferenceEngineProfileInfo::LayerStatus status) { - m_env.perfMap[layerName].status = status; - m_env.perfMap[layerName].cpu_uSec = m_env.perfMap[layerName].realTime_uSec = 0; - layerType.copy(m_env.perfMap[layerName].layer_type, layerType.length()); - execType.copy(m_env.perfMap[layerName].exec_type, execType.length()); + m_env.perfMap[layerType + ":" + layerName].first = layerName; + auto& perfEntry = m_env.perfMap[layerType + ":" + layerName].second; + perfEntry.layerType = layerType; + perfEntry.status = status; + perfEntry.cpu_uSec = perfEntry.realTime_uSec = 0; + perfEntry.isCPU = isCPU; + perfEntry.status = status; } }; // namespace CLDNNPlugin diff --git a/inference-engine/src/cldnn_engine/cldnn_graph.h b/inference-engine/src/cldnn_engine/cldnn_graph.h index c26b60a..0ea0649 100644 --- a/inference-engine/src/cldnn_engine/cldnn_graph.h +++ b/inference-engine/src/cldnn_engine/cldnn_graph.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -9,6 +9,7 @@ #include #include #include +#include #include "ie_blob.h" #include "ie_plugin.hpp" #include "cpp/ie_cnn_network.h" @@ -29,13 +30,30 @@ namespace CLDNNPlugin { +struct PerfCounter { + InferenceEngine::InferenceEngineProfileInfo::LayerStatus status; 
+ bool isCPU; + uint64_t realTime_uSec; + uint64_t cpu_uSec; + uint32_t num; + std::string layerType; + +public: + PerfCounter() : realTime_uSec(0), cpu_uSec(0), num(0), + status(InferenceEngine::InferenceEngineProfileInfo::NOT_RUN), isCPU(false) {} + + long long realTime_avg() const { return (num == 0) ? 0 : realTime_uSec / num; } + long long cpu_avg() const { return (num == 0) ? 0 : cpu_uSec / num; } +}; + struct InferenceEnv { std::shared_ptr engine; std::shared_ptr network; std::map primitiveIDs; std::map> prevPrimitiveIDs; - std::map perfMap; - std::set profilingIDs; + + std::map> perfMap; + std::vector profilingIDs; DebugOptions debugOptions; @@ -108,6 +126,8 @@ protected: TanH, ELU, Activation, + Exp, + Not, LRN, Pooling, FullyConnected, @@ -145,6 +165,11 @@ protected: Pad, LSTMCell, RNN, + Gather, + DepthToSpace, + ShuffleChannels, + StridedSlice, + ReverseSequence, NO_TYPE }; @@ -155,7 +180,6 @@ protected: }; cldnn::format m_defaultFormat; - cldnn::data_types m_networkPrecision; void InitFormat(InferenceEngine::ICNNNetwork &network); static cldnn::data_types DataTypeFromPrecision(InferenceEngine::Precision p); @@ -181,7 +205,7 @@ protected: cldnn::primitive_id weightsPrimID, cldnn::primitive_id biasesPrimID); void AddPreProcessPrimitive(InferenceEngine::InputInfo::Ptr inputInfo); - void AddInputPrimitive(InferenceEngine::InputInfo::Ptr inputInfo); + void AddInputPrimitive(InferenceEngine::InputInfo::Ptr inputInfo, InferenceEngine::Precision inputPrecision); void AddOutputPrimitive(std::string outputName, const InferenceEngine::DataPtr outputData, InferenceEngine::Precision outputPrecision = InferenceEngine::Precision::UNSPECIFIED); void CreateSingleLayerPrimitive(InferenceEngine::CNNLayerPtr& layer); @@ -204,8 +228,9 @@ protected: void InitProfileInfo(const std::string& layerName, const std::string& layerType, - const std::string& execType, - InferenceEngine::InferenceEngineProfileInfo::LayerStatus status); + bool isCPU = false, + InferenceEngine::InferenceEngineProfileInfo::LayerStatus status + = InferenceEngine::InferenceEngineProfileInfo::EXECUTED); void changeInputBatch(size_t batch); void CompileNetwork(); @@ -250,6 +275,11 @@ protected: void CreateLSTMCellPrimitive(InferenceEngine::CNNLayerPtr &layer); void AddConstantBlobInput(InferenceEngine::CNNLayerPtr &layer); void CreateCustomLayerPrimitive(InferenceEngine::CNNLayerPtr &layer, CLDNNCustomLayerPtr customLayer); + void CreateGatherPrimitive(InferenceEngine::CNNLayerPtr &layer); + void CreateDepthToSpacePrimitive(InferenceEngine::CNNLayerPtr &layer); + void CreateShuffleChannelsPrimitive(InferenceEngine::CNNLayerPtr &layer); + void CreateStridedSlicePrimitive(InferenceEngine::CNNLayerPtr &layer); + void CreateReverseSequencePrimitive(InferenceEngine::CNNLayerPtr &layer); }; }; // namespace CLDNNPlugin diff --git a/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp b/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp index e36578c..c903a4f 100644 --- a/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -14,7 +14,7 @@ using namespace InferenceEngine; namespace CLDNNPlugin { -const std::string CLDNNInferRequest::fp32_suffix = "_fp32"; +const char CLDNNInferRequest::fp32_suffix[] = "_fp32"; Blob::Ptr CLDNNInferRequest::createInputBlob(const TensorDesc& desc, uint8_t* mem_ptr) { const Layout l = 
desc.getLayout(); @@ -156,20 +156,21 @@ void CLDNNInferRequest::copyInputData(std::shared_ptr network, size_t n = (bi == nullptr) ? inputBlob.size() : bi->buf_size; size_t offset = (bi == nullptr) ? 0 : bi->buf_offset; + cldnn::primitive_id internalName = "Input:" + inputName; switch (inputBlob.precision()) { case Precision::FP32: { float* blob_ptr = const_cast(inputBlob.cbuffer().as()) + offset; - network->set_input_data(inputName, cldnn::memory::attach(inputLayout, blob_ptr, n)); + network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n)); break; } case Precision::FP16: { uint16_t* blob_ptr = const_cast(inputBlob.cbuffer().as()) + offset; - network->set_input_data(inputName, cldnn::memory::attach(inputLayout, blob_ptr, n)); + network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n)); break; } case Precision::U8: { uint8_t* blob_ptr = const_cast(inputBlob.cbuffer().as()) + offset; - network->set_input_data(inputName, cldnn::memory::attach(inputLayout, blob_ptr, n)); + network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n)); break; } default: @@ -361,10 +362,10 @@ void CLDNNInferRequest::SetBatch(int new_batch) { CLDNNInferRequest::CLDNNInferRequest(InferenceEnv env, bool useProfiling, InputsDataMap networkInputs, OutputsDataMap networkOutputs) : InferRequestInternal(networkInputs, networkOutputs), - m_curBatch(-1), m_env(env), m_useProfiling(useProfiling) { if (m_env.m_max_batch > 1) { + SetBatch(m_env.m_max_batch); AllocateInputsDyn(); AllocateOutputsDyn(); } else { @@ -440,20 +441,18 @@ void CLDNNInferRequest::execAndParse() { // Get profiling info for all layers for (auto &profiledID : m_env.profilingIDs) { - std::string impl = implementationsMap.at(profiledID); - impl.copy(m_env.perfMap[profiledID].exec_type, impl.length()); - + auto& perfCount = m_env.perfMap[profiledID].second; // Change status if layer wasn't executed by cldnn engine - if (executedPrimitives.find(profiledID) == executedPrimitives.end()) { + if (perfCount.num == 0 && + executedPrimitives.find(profiledID) == executedPrimitives.end()) { if (allPrimitives.find(profiledID) != allPrimitives.end() && allPrimitives.at(profiledID) == "_optimized_") { // Layer was marked as optimized by cldnn - m_env.perfMap[profiledID].status = InferenceEngineProfileInfo::OPTIMIZED_OUT; + perfCount.status = InferenceEngineProfileInfo::OPTIMIZED_OUT; } else { // Layer wasn't run for some reason - m_env.perfMap[profiledID].status = InferenceEngineProfileInfo::NOT_RUN; + perfCount.status = InferenceEngineProfileInfo::NOT_RUN; } - m_env.perfMap[profiledID].cpu_uSec = m_env.perfMap[profiledID].realTime_uSec = 0; continue; } @@ -468,17 +467,17 @@ void CLDNNInferRequest::execAndParse() { auto count = std::chrono::duration_cast(interval.value->value()).count(); if (interval.name == "submission") { - m_env.perfMap[profiledID].cpu_uSec = count; + perfCount.cpu_uSec += count; } else if (interval.name == "executing") { - m_env.perfMap[profiledID].realTime_uSec = count; + perfCount.realTime_uSec += count; } else if (interval.name == "duration") { // "duration" is used for CPU layers - m_env.perfMap[profiledID].cpu_uSec = count; - static const std::string cpuExecType("CPU"); - memset(m_env.perfMap[profiledID].exec_type, 0, sizeof(m_env.perfMap[profiledID].exec_type)); - cpuExecType.copy(m_env.perfMap[profiledID].exec_type, - cpuExecType.length()); // Override execType as CPU + perfCount.cpu_uSec += count; + + if (perfCount.num == 0) + perfCount.isCPU = true; } } + 
perfCount.num++; } } } @@ -543,7 +542,32 @@ void CLDNNInferRequest::GetPerformanceCounts( if (!m_useProfiling) { THROW_IE_EXCEPTION << "Performance counters were not enabled"; } else { - perfMap = m_env.perfMap; + unsigned i = 0; + for (auto& profiledID : m_env.profilingIDs) { + const auto& layerName = m_env.perfMap.at(profiledID).first; + if (layerName.length() == 0) // no layer directly associated + continue; + + const auto& perfCounter = m_env.perfMap.at(profiledID).second; + auto& extPerfEntry = perfMap[layerName]; + + // copy layer implementation + if (perfCounter.isCPU) { + static const std::string cpuExecType("CPU"); + memset(extPerfEntry.exec_type, 0, sizeof(extPerfEntry.exec_type)); + cpuExecType.copy(extPerfEntry.exec_type, cpuExecType.length()); // Override execType as CPU + } else { + std::string impl = implementationsMap.at(profiledID); + impl.copy(extPerfEntry.exec_type, impl.length()); + } + + extPerfEntry.execution_index = i++; + extPerfEntry.status = perfCounter.status; + extPerfEntry.cpu_uSec = perfCounter.cpu_avg(); + extPerfEntry.realTime_uSec = perfCounter.realTime_avg(); + + perfCounter.layerType.copy(extPerfEntry.layer_type, perfCounter.layerType.length()); + } } } @@ -564,20 +588,21 @@ void CLDNNInferRequest::PrepareInput(const cldnn::primitive_id &inputName, const return (blob_ptr == mem_ptr) && (blob.byteSize() == memory.size()); }; + cldnn::primitive_id internalName = "Input:" + inputName; const cldnn::memory& memory = inputsMemory.at(inputName); if (inputBlob.precision() == Precision::I16) { // clDNN doesn't support I16 input precision, so we always have to convert input data to fp32 precision const cldnn::memory& fp32_mem = inputsMemory.at(inputName+fp32_suffix); cldnn::pointer ptr = fp32_mem.pointer(); InferenceEngine::copyToFloat(ptr.data(), &inputBlob); - m_env.network->set_input_data(inputName, fp32_mem); + m_env.network->set_input_data(internalName, fp32_mem); } else if (is_same_buffer(inputBlob, memory)) { // If input memory was allocated by cldnn engine and wasn't overwritten by user set_input_data method won't copy input data. 
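Two conventions underpin the profiling and naming rework in the hunks above. First, primitive IDs are now composed as `<Type>:<name>` (via `layer_type_name_ID`, whose body lies outside these hunks), and `primitiveIDs` may map an ID to a different primitive, for example when an optimized-out Copy layer aliases its input; `AddOutputPrimitive` chases such aliases until an ID maps to itself. A standalone sketch of both, with hypothetical layer names:

```cpp
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

// Assumed composition of layer_type_name_ID: "<Type>:<name>". Prefixing the
// type keeps IDs unique across layer types and lets multi-output layers be
// addressed per output data name.
std::string layer_type_name_id(const std::string& type, const std::string& name) {
    return type + ":" + name;
}

// Follow alias entries until an ID maps to itself (a real primitive),
// mirroring the while loop in AddOutputPrimitive.
std::string resolve_final_id(const std::map<std::string, std::string>& ids,
                             std::string id) {
    std::string final_id = ids.at(id);
    while (id != final_id) {
        auto it = ids.find(final_id);
        if (it == ids.end())
            throw std::runtime_error("Unknown primitive id: " + final_id);
        id = final_id;
        final_id = it->second;
    }
    return final_id;
}

int main() {
    std::map<std::string, std::string> ids;
    ids[layer_type_name_id("ReLU", "r1")] = "ReLU:r1";  // self-map: real primitive
    ids[layer_type_name_id("Copy", "c1")] = "ReLU:r1";  // optimized out: alias
    std::cout << resolve_final_id(ids, "Copy:c1") << "\n";  // ReLU:r1
}
```

Second, `profilingIDs` became an order-preserving `std::vector` and the new `PerfCounter` accumulates interval times across inferences, so `GetPerformanceCounts` can assign `execution_index` from container position and report per-layer averages rather than last-run values. A minimal usage sketch of the averaging:

```cpp
#include <cstdint>
#include <iostream>

struct PerfCounter {
    uint64_t realTime_uSec = 0;
    uint64_t cpu_uSec = 0;
    uint32_t num = 0;  // number of executions folded into the sums

    long long realTime_avg() const { return (num == 0) ? 0 : realTime_uSec / num; }
    long long cpu_avg() const { return (num == 0) ? 0 : cpu_uSec / num; }
};

int main() {
    PerfCounter pc;
    for (uint64_t t : {100, 140, 120}) {  // three profiled inferences
        pc.realTime_uSec += t;
        pc.num++;
    }
    std::cout << pc.realTime_avg() << " us\n";  // 120 us
}
```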
switch (inputBlob.precision()) { case Precision::FP32: case Precision::FP16: case Precision::U8: { - m_env.network->set_input_data(inputName, memory); + m_env.network->set_input_data(internalName, memory); break; } default: diff --git a/inference-engine/src/cldnn_engine/cldnn_infer_request.h b/inference-engine/src/cldnn_engine/cldnn_infer_request.h index f4b9d33..375d707 100644 --- a/inference-engine/src/cldnn_engine/cldnn_infer_request.h +++ b/inference-engine/src/cldnn_engine/cldnn_infer_request.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -44,7 +44,6 @@ protected: InferenceEnv m_env; // dynamic batch stuff - int m_curBatch; std::map> batchInputs; std::map> batchOutputs; @@ -66,7 +65,7 @@ protected: void PrepareInputDyn(const cldnn::primitive_id &inputName, const InferenceEngine::Blob &inputBlob); private: - static const std::string fp32_suffix; + static const char fp32_suffix[]; }; }; // namespace CLDNNPlugin diff --git a/inference-engine/src/cldnn_engine/debug_options.cpp b/inference-engine/src/cldnn_engine/debug_options.cpp index 5a6de15..3c964dc 100644 --- a/inference-engine/src/cldnn_engine/debug_options.cpp +++ b/inference-engine/src/cldnn_engine/debug_options.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/cldnn_engine/debug_options.h b/inference-engine/src/cldnn_engine/debug_options.h index 3001b29..1dad92e 100644 --- a/inference-engine/src/cldnn_engine/debug_options.h +++ b/inference-engine/src/cldnn_engine/debug_options.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/cldnn_engine/dllmain.cpp b/inference-engine/src/cldnn_engine/dllmain.cpp index 31257da..c862ee1 100644 --- a/inference-engine/src/cldnn_engine/dllmain.cpp +++ b/inference-engine/src/cldnn_engine/dllmain.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/cldnn_engine/simple_math.cpp b/inference-engine/src/cldnn_engine/simple_math.cpp index 20b09fb..9ee02b4 100644 --- a/inference-engine/src/cldnn_engine/simple_math.cpp +++ b/inference-engine/src/cldnn_engine/simple_math.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/cldnn_engine/simple_math.h b/inference-engine/src/cldnn_engine/simple_math.h index 445b62a..bf20316 100644 --- a/inference-engine/src/cldnn_engine/simple_math.h +++ b/inference-engine/src/cldnn_engine/simple_math.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/extension/CMakeLists.txt b/inference-engine/src/extension/CMakeLists.txt index ca9cc27..b0078e2 100644 --- a/inference-engine/src/extension/CMakeLists.txt +++ b/inference-engine/src/extension/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # @@ -11,7 +11,22 @@ if (NOT(IE_MAIN_SOURCE_DIR)) # to use C++11 if samples are built outside of IE repo set (CMAKE_CXX_STANDARD 11) set 
(CMAKE_CXX_STANDARD_REQUIRED ON) - set(CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}") + if (${CMAKE_CXX_COMPILER_ID} STREQUAL GNU) + set (CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}") + endif() +endif() + +# treating warnings as errors +if (WIN32) + if (${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC) + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4251 /wd4275 /wd4267") #disable some warnings + endif() +else() + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") +endif() + +if (${CMAKE_CXX_COMPILER_ID} STREQUAL GNU) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") endif() file(GLOB_RECURSE SRC *.cpp) diff --git a/inference-engine/src/extension/README.md b/inference-engine/src/extension/README.md index 94aece3..5b766ea 100644 --- a/inference-engine/src/extension/README.md +++ b/inference-engine/src/extension/README.md @@ -17,7 +17,11 @@ when cross-compiling this library for another platform. * ArgMax * CTCGreedyDecoder + * DepthToSpace * DetectionOutput + * Expand + * Fill + * Gather * GRN * Interp * MVN @@ -28,11 +32,17 @@ when cross-compiling this library for another platform. * PriorBoxClustered * Proposal * PSROIPooling - * Region Yolo - * Reorg Yolo + * Range + * RegionYolo + * ReorgYolo * Resample + * ReverseSequence + * ShuffleChannels * SimplerNMS - * SpatialTransformer + * SpaceToDepth + * Squeeze + * StridedSlice + * Unsqueeze In order to add a new layer, you can use [the extensibility mechanism](./docs/IE_DG/Integrate_your_kernels_into_IE.md). diff --git a/inference-engine/src/extension/cmake/CPUID.cmake b/inference-engine/src/extension/cmake/CPUID.cmake index 7b6c26b..4bf7528 100644 --- a/inference-engine/src/extension/cmake/CPUID.cmake +++ b/inference-engine/src/extension/cmake/CPUID.cmake @@ -7,7 +7,7 @@ include (CheckCXXSourceRuns) -if(NOT WIN32) +if(NOT WIN32 AND NOT APPLE) set(CMAKE_REQUIRED_FLAGS "-std=c++11") endif() @@ -204,14 +204,14 @@ private: } // load bitset with flags for function 0x80000001 - if (nExIds_ >= 0x80000001) + if ((unsigned)nExIds_ >= 0x80000001) { f_81_ECX_ = extdata_[1][2]; f_81_EDX_ = extdata_[1][3]; } // Interpret CPU brand string if reported - if (nExIds_ >= 0x80000004) + if ((unsigned)nExIds_ >= 0x80000004) { memcpy(brand + 0, extdata_[2].data(), sizeof(cpui)); memcpy(brand + 16, extdata_[3].data(), sizeof(cpui)); @@ -248,7 +248,7 @@ const InstructionSet::InstructionSet_Internal InstructionSet::CPU_Rep; // Print out supported instruction set extensions int main() { - std::ofstream fo(\"cpuid.txt\"); + std::ofstream fo(\"${CMAKE_BINARY_DIR}/cpuid.txt\"); auto& outstream = fo;//std::cout; auto support_message = [&outstream](std::string isa_feature, bool is_supported) { diff --git a/inference-engine/src/extension/cmake/feature_defs.cmake b/inference-engine/src/extension/cmake/feature_defs.cmake index 4c07c2d..d40f1d3 100644 --- a/inference-engine/src/extension/cmake/feature_defs.cmake +++ b/inference-engine/src/extension/cmake/feature_defs.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. 
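[Editor's note] The (unsigned) casts added to the CPUID probe above matter once the new -Werror line is in effect: 0x80000001 does not fit in a 32-bit int, so the literal's type is unsigned int, and comparing a signed nExIds_ against it triggers -Wsign-compare (enabled by -Wall for C++). The usual arithmetic conversions already turn this into an unsigned comparison; the cast just makes that explicit and silences the warning. A self-contained illustration, with a made-up value:

#include <cstdio>

int main() {
    int nExIds = 4;   // made-up value; real extended CPUID leaves live near 0x80000008
    // With -Wall -Werror the signed/unsigned mix below breaks the build,
    // because 0x80000001 is too big for int and therefore has type unsigned int:
    //     if (nExIds >= 0x80000001) { ... }     // -Wsign-compare
    if ((unsigned)nExIds >= 0x80000001) {        // cast matches the patched comparison
        std::puts("extended brand string leaves available");
    }
    return 0;
}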
-cmake_minimum_required (VERSION 2.8) - include(CPUID) include(OptimizationFlags) diff --git a/inference-engine/src/extension/common/defs.h b/inference-engine/src/extension/common/defs.h index 9bf0400..a5dc5e8 100644 --- a/inference-engine/src/extension/common/defs.h +++ b/inference-engine/src/extension/common/defs.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/extension/common/fast_exp.h b/inference-engine/src/extension/common/fast_exp.h index 4fcd25c..062198d 100644 --- a/inference-engine/src/extension/common/fast_exp.h +++ b/inference-engine/src/extension/common/fast_exp.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/extension/common/matrixmult.h b/inference-engine/src/extension/common/matrixmult.h deleted file mode 100644 index 9070dda..0000000 --- a/inference-engine/src/extension/common/matrixmult.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -static inline void matrixMult(float *A, float *B, float *C, int m, int n, int k, bool transposeB = false) { - if (transposeB) { - for (int rowA = 0; rowA < m; rowA++) { - for (int rowB = 0; rowB < n; rowB++) { - float sum = 0; - for (int colA = 0; colA < k; colA++) { - sum += A[rowA * k + colA] * B[rowB * k + colA]; - } - - C[rowA * n + rowB] = sum; - } - } - } else { - for (int rowA = 0; rowA < m; rowA++) { - for (int colB = 0; colB < n; colB++) { - float sum = 0; - for (int colA = 0; colA < k; colA++) { - sum += A[rowA * k + colA] * B[colA * n + colB]; - } - - C[rowA * n + colB] = sum; - } - } - } -} \ No newline at end of file diff --git a/inference-engine/src/extension/common/opt_exp.h b/inference-engine/src/extension/common/opt_exp.h index 7fb57a9..04a0a3e 100644 --- a/inference-engine/src/extension/common/opt_exp.h +++ b/inference-engine/src/extension/common/opt_exp.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/extension/common/softmax.h b/inference-engine/src/extension/common/softmax.h index 6aaf634..498bff8 100644 --- a/inference-engine/src/extension/common/softmax.h +++ b/inference-engine/src/extension/common/softmax.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/extension/ext_argmax.cpp b/inference-engine/src/extension/ext_argmax.cpp index c6efa6c..3a8dab3 100644 --- a/inference-engine/src/extension/ext_argmax.cpp +++ b/inference-engine/src/extension/ext_argmax.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -23,7 +23,7 @@ public: if (layer->insData.size() != 1 || layer->outData.empty()) THROW_IE_EXCEPTION << "Incorrect number of input/output edges!"; - out_max_val_ = static_cast(layer->GetParamAsInt("out_max_val")); + out_max_val_ = layer->GetParamAsBool("out_max_val", false); top_k_ = layer->GetParamAsInt("top_k"); has_axis_ = (layer->params.find("axis") != layer->params.end()); @@ -73,12 +73,12 @@ public: dst_data[(i / axis_dist * top_k_ + j) * axis_dist + i % axis_dist] = src_vector[j].first; } else { // Produces 
max_ind and max_val - dst_data[2 * i * top_k_ + j] = src_vector[j].second; + dst_data[2 * i * top_k_ + j] = static_cast(src_vector[j].second); dst_data[2 * i * top_k_ + top_k_ + j] = src_vector[j].first; } } else { // Produces max_ind per axis - dst_data[(i / axis_dist * top_k_ + j) * axis_dist + i % axis_dist] = src_vector[j].second; + dst_data[(i / axis_dist * top_k_ + j) * axis_dist + i % axis_dist] = static_cast(src_vector[j].second); } } } diff --git a/inference-engine/src/extension/ext_base.cpp b/inference-engine/src/extension/ext_base.cpp index cb00fda..dc1339a 100644 --- a/inference-engine/src/extension/ext_base.cpp +++ b/inference-engine/src/extension/ext_base.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -80,6 +80,8 @@ void ExtLayerBase::addConfig(const CNNLayer* layer, std::vector order(blocks.size()); for (size_t i = 0; i < order.size(); i++) order[i] = i; + const bool isInt8 = (data->getPrecision() == Precision::I8 || data->getPrecision() == Precision::U8); + if (conf.layout == ConfLayout::BLK8 || conf.layout == ConfLayout::BLK16) { if (data_dims.size() < 4 && data_dims.size() > 5) THROW_IE_EXCEPTION << "Inapplicable blocking layout." @@ -91,10 +93,17 @@ void ExtLayerBase::addConfig(const CNNLayer* layer, std::vectorinsData[i].lock()); - for (int i = 0; i < out_l.size(); i++) + for (size_t i = 0; i < out_l.size(); i++) fill_port(config.outConfs, out_l[i], layer->outData[i]); config.dynBatchSupport = dynBatchSupport; diff --git a/inference-engine/src/extension/ext_base.hpp b/inference-engine/src/extension/ext_base.hpp index 3fa756a..7914842 100644 --- a/inference-engine/src/extension/ext_base.hpp +++ b/inference-engine/src/extension/ext_base.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/extension/ext_ctc_greedy.cpp b/inference-engine/src/extension/ext_ctc_greedy.cpp index 71c9d71..ae9a099 100644 --- a/inference-engine/src/extension/ext_ctc_greedy.cpp +++ b/inference-engine/src/extension/ext_ctc_greedy.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -21,8 +21,7 @@ public: THROW_IE_EXCEPTION << "Incorrect number of input/output edges!"; std::vector inps; - for (const auto &in : layer->insData) - inps.emplace_back(ConfLayout::PLN); + inps.resize(layer->insData.size(), DataConfigurator(ConfLayout::PLN)); addConfig(layer, inps, {DataConfigurator(ConfLayout::PLN)}); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); @@ -51,7 +50,7 @@ public: output_sequences[ii] = -1; } - for (int n = 0; n < N_; ++n) { + for (size_t n = 0; n < N_; ++n) { int prev_class_idx = -1; size_t output_index = n*T_; @@ -63,21 +62,22 @@ public: float max_prob = probs[0]; ++probs; - for (int c = 1; c < C_; ++c, ++probs) { + for (size_t c = 1; c < C_; ++c, ++probs) { if (*probs > max_prob) { - max_class_idx = c; + max_class_idx = static_cast(c); max_prob = *probs; } } - if (max_class_idx < C_-1 && max_class_idx != prev_class_idx) { - output_sequences[output_index] = max_class_idx; + if (max_class_idx < static_cast(C_) - 1 && + max_class_idx != prev_class_idx) { + output_sequences[output_index] = static_cast(max_class_idx); output_index++; } prev_class_idx = max_class_idx; - if (t + 1 == T_ || sequence_indicators[(t + 1)*N_ + n] == 0) { 
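[Editor's note] The surrounding CTCGreedyDecoder changes are pure type hygiene (size_t loop counters plus explicit casts), so it is worth restating the decoding rule they preserve: per time step take the arg-max class, then drop repeats and the blank label, which this kernel takes to be the last class index. A minimal reference sketch; the function name and signature are illustrative:

#include <vector>

// Greedy CTC decode over a [T x C] probability matrix for one sequence.
std::vector<int> ctc_greedy(const float* probs, int T, int C) {
    std::vector<int> out;
    int prev = -1;
    for (int t = 0; t < T; ++t) {
        const float* p = probs + t * C;
        int best = 0;
        for (int c = 1; c < C; ++c)
            if (p[c] > p[best]) best = c;
        if (best != C - 1 && best != prev)   // skip blank and collapsed repeats
            out.push_back(best);
        prev = best;
    }
    return out;
}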
+ if (t + 1 == static_cast(T_) || sequence_indicators[(t + 1)*N_ + n] == 0) { break; } } diff --git a/inference-engine/src/extension/ext_depth_to_space.cpp b/inference-engine/src/extension/ext_depth_to_space.cpp new file mode 100644 index 0000000..0e20681 --- /dev/null +++ b/inference-engine/src/extension/ext_depth_to_space.cpp @@ -0,0 +1,125 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ext_list.hpp" +#include "ext_base.hpp" + +#include +#include +#include +#include +#include "ie_parallel.hpp" + +namespace InferenceEngine { +namespace Extensions { +namespace Cpu { + +class DepthToSpaceImpl: public ExtLayerBase { +#define CNTR_SIZE 5 + +public: + explicit DepthToSpaceImpl(const CNNLayer* layer) { + try { + if (layer->insData.empty() || layer->outData.empty()) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!"; + + SizeVector src_dims = layer->insData[0].lock()->getTensorDesc().getDims(); + if (src_dims.size() < 3) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input dimensions!"; + if (layer->insData[0].lock()->getTensorDesc().getPrecision() != Precision::FP32) + THROW_IE_EXCEPTION << layer->name << " Incorrect input precision. Only F32 is supported!"; + + SizeVector dst_dims = layer->outData[0]->getTensorDesc().getDims(); + if (dst_dims.size() < 2) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of output dimensions!"; + if (layer->outData[0]->getTensorDesc().getPrecision() != Precision::FP32) + THROW_IE_EXCEPTION << layer->name << " Incorrect output precision. Only F32 is supported!"; + + size_t block_size = layer->GetParamAsUInt("block_size", 1); + if (block_size == 0) + THROW_IE_EXCEPTION << layer->name << " Incorrect block_size parameter is zero!"; + + if (src_dims[src_dims.size() - 3] % (block_size * block_size)) + THROW_IE_EXCEPTION << layer->name << " block_size parameter is incompatible with input tensor Color dimension size!"; + + if (dst_dims.size() > 2 && src_dims[src_dims.size() - 3] != (dst_dims[dst_dims.size() - 3] * block_size * block_size)) + THROW_IE_EXCEPTION << layer->name << " Input/Output tensor Color dimension is incompatible with block_size!"; + + if (dst_dims[dst_dims.size() - 2] != (src_dims[src_dims.size() - 2] * block_size)) + THROW_IE_EXCEPTION << layer->name << " Input/Output tensor Height dimension is incompatible with block_size!"; + + if (dst_dims[dst_dims.size() - 1] != (src_dims[src_dims.size() - 1] * block_size)) + THROW_IE_EXCEPTION << layer->name << " Input/Output tensor Width dimension is incompatible with block_size!"; + + own_dims[0] = 1; + for (size_t i = 0; i < (src_dims.size() - 3); i++) + own_dims[0] *= src_dims[i]; + own_dims[1] = src_dims[src_dims.size() - 2]; + own_dims[2] = src_dims[src_dims.size() - 3] / block_size; + own_dims[3] = src_dims[src_dims.size() - 1]; + own_dims[4] = block_size; + + size_t C = src_dims[src_dims.size() - 2] * src_dims[src_dims.size() - 1]; + ownStrides[0] = src_dims[src_dims.size() - 3] * C; + ownStrides[1] = src_dims[src_dims.size() - 1]; + ownStrides[2] = block_size * C; + ownStrides[3] = 1; + ownStrides[4] = C; + work_amount_dst = ownStrides[0] * own_dims[0]; + + addConfig(layer, { DataConfigurator(ConfLayout::PLN) }, { DataConfigurator(ConfLayout::PLN) }); + } catch (InferenceEngine::details::InferenceEngineException &ex) { + errorMsg = ex.what(); + } + } + + StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { + const float *src_data = 
inputs[0]->cbuffer().as() + + inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + float* dst_data = outputs[0]->cbuffer().as() + + outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + + // Parallel + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0, src_idx = 0; + size_t counters[CNTR_SIZE] = { 0 }; + splitter(work_amount_dst, nthr, ithr, start, end); + for (int j = CNTR_SIZE - 1, i = start; j >= 0; j--) { + counters[j] = i % own_dims[j]; + src_idx += counters[j] * ownStrides[j]; + i /= own_dims[j]; + } + + for (size_t iwork = start, i = 1; iwork < end; ++iwork) { + dst_data[iwork] = src_data[src_idx]; + for (int j = CNTR_SIZE - 1; j >= 0; j--) { + counters[j]++; + if (counters[j] < own_dims[j]) { + src_idx += ownStrides[j]; + break; + } else { + counters[j] = i = 0; + } + } + if (!i) { + for (src_idx = 0; i < CNTR_SIZE; ++i) + src_idx += counters[i] * ownStrides[i]; + } + } + }); + + return OK; + } + +private: + size_t work_amount_dst; + size_t own_dims[CNTR_SIZE]; + size_t ownStrides[CNTR_SIZE]; +}; + +REG_FACTORY_FOR(ImplFactory, DepthToSpace); + +} // namespace Cpu +} // namespace Extensions +} // namespace InferenceEngine diff --git a/inference-engine/src/extension/ext_detectionoutput.cpp b/inference-engine/src/extension/ext_detectionoutput.cpp index acf58fb..1ec523f 100644 --- a/inference-engine/src/extension/ext_detectionoutput.cpp +++ b/inference-engine/src/extension/ext_detectionoutput.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -40,7 +40,9 @@ public: _nms_threshold = layer->GetParamAsFloat("nms_threshold"); _confidence_threshold = layer->GetParamAsFloat("confidence_threshold", -FLT_MAX); _share_location = layer->GetParamsAsBool("share_location", true); - _clip = layer->GetParamsAsBool("clip", false); + _clip_before_nms = layer->GetParamsAsBool("clip_before_nms", false) || + layer->GetParamsAsBool("clip", false); // for backward compatibility + _clip_after_nms = layer->GetParamsAsBool("clip_after_nms", false); _decrease_label_id = layer->GetParamsAsBool("decrease_label_id", false); _normalized = layer->GetParamsAsBool("normalized", true); _image_height = layer->GetParamAsInt("input_height", 1); @@ -53,12 +55,15 @@ public: _code_type = (code_type_str == "caffe.PriorBoxParameter.CENTER_SIZE" ? 
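[Editor's note] The DepthToSpace execute() above walks the destination buffer linearly while a five-digit mixed-radix counter (own_dims) tracks the matching source element through per-digit strides (ownStrides). The same pattern works for any strided permutation; below is a compact generic variant that rewinds the source index incrementally on carry instead of recomputing it from scratch as the kernel does. Names are illustrative:

#include <cstddef>
#include <vector>

// Copy src into dst, where dims/strides describe the source as traversed
// in destination order; the dst index runs 0..total-1 linearly.
void permuted_copy(const float* src, float* dst,
                   const std::vector<size_t>& dims,
                   const std::vector<size_t>& strides, size_t total) {
    std::vector<size_t> cnt(dims.size(), 0);
    size_t src_idx = 0;
    for (size_t i = 0; i < total; ++i) {
        dst[i] = src[src_idx];
        for (size_t j = dims.size(); j-- > 0;) {
            if (++cnt[j] < dims[j]) { src_idx += strides[j]; break; }
            src_idx -= (dims[j] - 1) * strides[j];   // carry: rewind this digit
            cnt[j] = 0;
        }
    }
}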
CodeType::CENTER_SIZE : CodeType::CORNER); - _num_priors = static_cast(layer->insData[idx_priors].lock()->dims[0] / _prior_size); + _num_priors = static_cast(layer->insData[idx_priors].lock()->getDims().back() / _prior_size); + _priors_batches = layer->insData[idx_priors].lock()->getDims().front() != 1; - if (_num_priors * _num_loc_classes * 4 != layer->insData[idx_location].lock()->dims[0]) - THROW_IE_EXCEPTION << "Number of priors must match number of location predictions."; + if (_num_priors * _num_loc_classes * 4 != static_cast(layer->insData[idx_location].lock()->getDims()[1])) + THROW_IE_EXCEPTION << "Number of priors must match number of location predictions (" + << _num_priors * _num_loc_classes * 4 << " vs " + << layer->insData[idx_location].lock()->getDims()[1] << ")"; - if (_num_priors * _num_classes != layer->insData[idx_confidence].lock()->dims[0]) + if (_num_priors * _num_classes != static_cast(layer->insData[idx_confidence].lock()->dims[0])) THROW_IE_EXCEPTION << "Number of priors must match number of confidence predictions."; if (_decrease_label_id && _background_label_id != 0) @@ -131,10 +136,14 @@ public: int *indices_data = _indices->buffer(); int *num_priors_actual = _num_priors_actual->buffer(); - const float *prior_variances = prior_data + _num_priors*_prior_size; - const float *ppriors = prior_data; - for (int n = 0; n < N; ++n) { + const float *ppriors = prior_data; + const float *prior_variances = prior_data + _num_priors*_prior_size; + if (_priors_batches) { + ppriors += _variance_encoded_in_target ? n*_num_priors*_prior_size : 2*n*_num_priors*_prior_size; + prior_variances += _variance_encoded_in_target ? 0 : n*_num_priors*_prior_size; + } + if (_share_location) { const float *ploc = loc_data + n*4*_num_priors; float *pboxes = decoded_bboxes_data + n*4*_num_priors; @@ -227,7 +236,7 @@ public: // Store the new indices. memset(detections_data + n*_num_classes, 0, _num_classes * sizeof(int)); - for (int j = 0; j < conf_index_class_map.size(); ++j) { + for (size_t j = 0; j < conf_index_class_map.size(); ++j) { int label = conf_index_class_map[j].second.first; int idx = conf_index_class_map[j].second.second; int *pindices = indices_data + n * _num_classes * _num_priors + label * _num_priors; @@ -260,8 +269,8 @@ public: for (int i = 0; i < detections_data[n*_num_classes + c]; ++i) { int idx = pindices[c*_num_priors + i]; - dst_data[count * DETECTION_SIZE + 0] = n; - dst_data[count * DETECTION_SIZE + 1] = _decrease_label_id ? c-1 : c; + dst_data[count * DETECTION_SIZE + 0] = static_cast(n); + dst_data[count * DETECTION_SIZE + 1] = static_cast(_decrease_label_id ? c-1 : c); dst_data[count * DETECTION_SIZE + 2] = pconf[c*_num_priors + idx]; float xmin = _share_location ? pboxes[idx*4 + 0] : @@ -273,6 +282,13 @@ public: float ymax = _share_location ? 
pboxes[idx*4 + 3] : pboxes[c*4*_num_priors + idx*4 + 3]; + if (_clip_after_nms) { + xmin = std::max(0.0f, std::min(1.0f, xmin)); + ymin = std::max(0.0f, std::min(1.0f, ymin)); + xmax = std::max(0.0f, std::min(1.0f, xmax)); + ymax = std::max(0.0f, std::min(1.0f, ymax)); + } + dst_data[count * DETECTION_SIZE + 3] = xmin; dst_data[count * DETECTION_SIZE + 4] = ymin; dst_data[count * DETECTION_SIZE + 5] = xmax; @@ -304,8 +320,9 @@ private: int _keep_top_k = 0; int _code_type = 0; - bool _share_location = false; - bool _clip = false; + bool _share_location = false; + bool _clip_before_nms = false; // clip bounding boxes before nms step + bool _clip_after_nms = false; // clip bounding boxes after nms step bool _decrease_label_id = false; int _image_width = 0; @@ -320,6 +337,7 @@ private: int _num = 0; int _num_loc_classes = 0; int _num_priors = 0; + bool _priors_batches = false; enum CodeType { CORNER = 1, @@ -477,7 +495,7 @@ void DetectionOutputImpl::decodeBBoxes(const float *prior_data, new_ymax = decode_bbox_center_y + decode_bbox_height / 2.0f; } - if (_clip) { + if (_clip_before_nms) { new_xmin = std::max(0.0f, std::min(1.0f, new_xmin)); new_ymin = std::max(0.0f, std::min(1.0f, new_ymin)); new_xmax = std::max(0.0f, std::min(1.0f, new_xmax)); diff --git a/inference-engine/src/extension/ext_detectionoutput_onnx.cpp b/inference-engine/src/extension/ext_detectionoutput_onnx.cpp new file mode 100644 index 0000000..39412b3 --- /dev/null +++ b/inference-engine/src/extension/ext_detectionoutput_onnx.cpp @@ -0,0 +1,375 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ext_list.hpp" +#include "ext_base.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include "ie_parallel.hpp" + + +namespace { +struct Indexer { + const std::vector dims_; + int total_{1}; + + explicit Indexer(const std::vector& dims) : dims_(dims) { + total_ = 1; + for (size_t i = 0; i < dims_.size(); ++i) { + total_ *= dims_[i]; + } + } + + const int operator()(const std::vector& idx) const { + int flat_idx = 0; + assert(idx.size() == dims_.size()); + for (size_t i = 0; i < dims_.size(); ++i) { + assert(0 <= idx[i] && idx[i] < dims_[i]); + flat_idx = flat_idx * dims_[i] + idx[i]; + } + assert(flat_idx < total_); + return flat_idx; + } +}; +} // namespace + + +namespace InferenceEngine { +namespace Extensions { +namespace Cpu { + +static +void refine_boxes(const float* boxes, const float* deltas, const float* weights, const float* scores, + float* refined_boxes, float* refined_boxes_areas, float* refined_scores, + const int rois_num, const int classes_num, + const float img_H, const float img_W, + const float max_delta_log_wh, + float coordinates_offset) { + Indexer box_idx({rois_num, 4}); + Indexer delta_idx({rois_num, classes_num, 4}); + Indexer score_idx({rois_num, classes_num}); + + Indexer refined_box_idx({classes_num, rois_num, 4}); + Indexer refined_score_idx({classes_num, rois_num}); + + for (int roi_idx = 0; roi_idx < rois_num; ++roi_idx) { + float x0 = boxes[box_idx({roi_idx, 0})]; + float y0 = boxes[box_idx({roi_idx, 1})]; + float x1 = boxes[box_idx({roi_idx, 2})]; + float y1 = boxes[box_idx({roi_idx, 3})]; + + if (x1 - x0 <= 0 || y1 - y0 <= 0) { + continue; + } + + // width & height of box + const float ww = x1 - x0 + coordinates_offset; + const float hh = y1 - y0 + coordinates_offset; + // center location of box + const float ctr_x = x0 + 0.5f * ww; + const float ctr_y = y0 + 0.5f * hh; + + for (int class_idx = 1; class_idx < classes_num; 
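[Editor's note] The Indexer helper introduced above is just row-major flattening with bounds asserts: dimensions are folded left to right, so flat = ((i0*d1 + i1)*d2 + i2) and so on. The core computation in isolation, with a worked value:

#include <cassert>
#include <vector>

int flatten(const std::vector<int>& dims, const std::vector<int>& idx) {
    assert(dims.size() == idx.size());
    int flat = 0;
    for (size_t i = 0; i < dims.size(); ++i) {
        assert(0 <= idx[i] && idx[i] < dims[i]);
        flat = flat * dims[i] + idx[i];   // fold one dimension per step
    }
    return flat;
}

// flatten({3, 5, 4}, {2, 1, 3}) == (2 * 5 + 1) * 4 + 3 == 47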
++class_idx) { + const float dx = deltas[delta_idx({roi_idx, class_idx, 0})] / weights[0]; + const float dy = deltas[delta_idx({roi_idx, class_idx, 1})] / weights[1]; + const float d_log_w = deltas[delta_idx({roi_idx, class_idx, 2})] / weights[2]; + const float d_log_h = deltas[delta_idx({roi_idx, class_idx, 3})] / weights[3]; + + // new center location according to deltas (dx, dy) + const float pred_ctr_x = dx * ww + ctr_x; + const float pred_ctr_y = dy * hh + ctr_y; + // new width & height according to deltas d(log w), d(log h) + const float pred_w = std::exp(std::min(d_log_w, max_delta_log_wh)) * ww; + const float pred_h = std::exp(std::min(d_log_h, max_delta_log_wh)) * hh; + + // update upper-left corner location + float x0_new = pred_ctr_x - 0.5f * pred_w; + float y0_new = pred_ctr_y - 0.5f * pred_h; + // update lower-right corner location + float x1_new = pred_ctr_x + 0.5f * pred_w - coordinates_offset; + float y1_new = pred_ctr_y + 0.5f * pred_h - coordinates_offset; + + // adjust new corner locations to be within the image region, + x0_new = std::max(0.0f, std::min(x0_new, img_W - coordinates_offset)); + y0_new = std::max(0.0f, std::min(y0_new, img_H - coordinates_offset)); + x1_new = std::max(0.0f, std::min(x1_new, img_W - coordinates_offset)); + y1_new = std::max(0.0f, std::min(y1_new, img_H - coordinates_offset)); + + // recompute new width & height + const float box_w = x1_new - x0_new + coordinates_offset; + const float box_h = y1_new - y0_new + coordinates_offset; + + refined_boxes[refined_box_idx({class_idx, roi_idx, 0})] = x0_new; + refined_boxes[refined_box_idx({class_idx, roi_idx, 1})] = y0_new; + refined_boxes[refined_box_idx({class_idx, roi_idx, 2})] = x1_new; + refined_boxes[refined_box_idx({class_idx, roi_idx, 3})] = y1_new; + + refined_boxes_areas[refined_score_idx({class_idx, roi_idx})] = box_w * box_h; + + refined_scores[refined_score_idx({class_idx, roi_idx})] = scores[score_idx({roi_idx, class_idx})]; + } + } +} + +template +static bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + + +struct ConfidenceComparator { + explicit ConfidenceComparator(const float* conf_data) : _conf_data(conf_data) {} + + bool operator()(int idx1, int idx2) { + if (_conf_data[idx1] > _conf_data[idx2]) return true; + if (_conf_data[idx1] < _conf_data[idx2]) return false; + return idx1 < idx2; + } + + const float* _conf_data; +}; + +static inline float JaccardOverlap(const float *decoded_bbox, + const float *bbox_sizes, + const int idx1, + const int idx2, + const float coordinates_offset = 1) { + float xmin1 = decoded_bbox[idx1 * 4 + 0]; + float ymin1 = decoded_bbox[idx1 * 4 + 1]; + float xmax1 = decoded_bbox[idx1 * 4 + 2]; + float ymax1 = decoded_bbox[idx1 * 4 + 3]; + + float xmin2 = decoded_bbox[idx2 * 4 + 0]; + float ymin2 = decoded_bbox[idx2 * 4 + 1]; + float ymax2 = decoded_bbox[idx2 * 4 + 3]; + float xmax2 = decoded_bbox[idx2 * 4 + 2]; + + if (xmin2 > xmax1 || xmax2 < xmin1 || ymin2 > ymax1 || ymax2 < ymin1) { + return 0.0f; + } + + float intersect_xmin = std::max(xmin1, xmin2); + float intersect_ymin = std::max(ymin1, ymin2); + float intersect_xmax = std::min(xmax1, xmax2); + float intersect_ymax = std::min(ymax1, ymax2); + + float intersect_width = intersect_xmax - intersect_xmin + coordinates_offset; + float intersect_height = intersect_ymax - intersect_ymin + coordinates_offset; + + if (intersect_width <= 0 || intersect_height <= 0) { + return 0.0f; + } + + float intersect_size = intersect_width * 
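[Editor's note] refine_boxes above applies standard Faster R-CNN box regression: the deltas (already divided by their weights in the caller) shift the box center by fractions of its width and height and scale the sides in log space, with d_log_w/d_log_h clamped to max_delta_log_wh before exponentiation so extreme predictions cannot blow up the box. The decoding step in isolation; struct and function names are illustrative, and the clamp to the image rectangle done next by the kernel is omitted:

#include <algorithm>
#include <cmath>

struct Box { float x0, y0, x1, y1; };

Box decode_box(const Box& b, float dx, float dy, float d_log_w, float d_log_h,
               float max_delta_log_wh, float off /* coordinates_offset */) {
    const float ww = b.x1 - b.x0 + off, hh = b.y1 - b.y0 + off;
    const float ctr_x = b.x0 + 0.5f * ww, ctr_y = b.y0 + 0.5f * hh;
    const float pred_ctr_x = dx * ww + ctr_x;
    const float pred_ctr_y = dy * hh + ctr_y;
    const float pred_w = std::exp(std::min(d_log_w, max_delta_log_wh)) * ww;
    const float pred_h = std::exp(std::min(d_log_h, max_delta_log_wh)) * hh;
    return { pred_ctr_x - 0.5f * pred_w, pred_ctr_y - 0.5f * pred_h,
             pred_ctr_x + 0.5f * pred_w - off, pred_ctr_y + 0.5f * pred_h - off };
}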
intersect_height; + float bbox1_size = bbox_sizes[idx1]; + float bbox2_size = bbox_sizes[idx2]; + + return intersect_size / (bbox1_size + bbox2_size - intersect_size); +} + + +static void nms_cf(const float* conf_data, + const float* bboxes, + const float* sizes, + int* buffer, + int* indices, + int& detections, + const int boxes_num, + const int pre_nms_topn, + const int post_nms_topn, + const float confidence_threshold, + const float nms_threshold) { + int count = 0; + for (int i = 0; i < boxes_num; ++i) { + if (conf_data[i] > confidence_threshold) { + indices[count] = i; + count++; + } + } + + int num_output_scores = (pre_nms_topn == -1 ? count : std::min(pre_nms_topn, count)); + + std::partial_sort_copy(indices, indices + count, + buffer, buffer + num_output_scores, + ConfidenceComparator(conf_data)); + + detections = 0; + for (int i = 0; i < num_output_scores; ++i) { + const int idx = buffer[i]; + + bool keep = true; + for (int k = 0; k < detections; ++k) { + const int kept_idx = indices[k]; + float overlap = JaccardOverlap(bboxes, sizes, idx, kept_idx); + if (overlap > nms_threshold) { + keep = false; + break; + } + } + if (keep) { + indices[detections] = idx; + detections++; + } + } + + detections = (post_nms_topn == -1 ? detections : std::min(post_nms_topn, detections)); +} + + +class ExperimentalDetectronDetectionOutputImpl: public ExtLayerBase { +private: + const int INPUT_ROIS {0}; + const int INPUT_DELTAS {1}; + const int INPUT_SCORES {2}; + const int INPUT_IM_INFO {3}; + + const int OUTPUT_BOXES {0}; + const int OUTPUT_CLASSES {1}; + const int OUTPUT_SCORES {2}; + +public: + explicit ExperimentalDetectronDetectionOutputImpl(const CNNLayer* layer) { + try { + score_threshold_ = layer->GetParamAsFloat("score_threshold"); + nms_threshold_ = layer->GetParamAsFloat("nms_threshold"); + max_delta_log_wh_ = layer->GetParamAsFloat("max_delta_log_wh"); + classes_num_ = layer->GetParamAsInt("num_classes"); + max_detections_per_class_ = layer->GetParamAsInt("post_nms_count"); + max_detections_per_image_ = layer->GetParamAsInt("max_detections_per_image"); + class_agnostic_box_regression_ = layer->GetParamAsBool("class_agnostic_box_regression", false); + deltas_weights_ = layer->GetParamAsFloats("deltas_weights"); + + std::vector inputs_layouts(layer->insData.size(), DataConfigurator(ConfLayout::PLN)); + std::vector outputs_layouts(layer->outData.size(), DataConfigurator(ConfLayout::PLN)); + addConfig(layer, inputs_layouts, outputs_layouts); + } catch (InferenceEngine::details::InferenceEngineException &ex) { + errorMsg = ex.what(); + } + } + + StatusCode execute(std::vector& inputs, std::vector& outputs, + ResponseDesc *resp) noexcept override { + const int rois_num = inputs[INPUT_ROIS]->getTensorDesc().getDims()[0]; + assert(classes_num_ == static_cast(inputs[INPUT_SCORES]->getTensorDesc().getDims()[1])); + assert(4 * classes_num_ == static_cast(inputs[INPUT_DELTAS]->getTensorDesc().getDims()[1])); + + const auto* boxes = inputs[INPUT_ROIS]->buffer().as(); + const auto* deltas = inputs[INPUT_DELTAS]->buffer().as(); + const auto* scores = inputs[INPUT_SCORES]->buffer().as(); + const auto* im_info = inputs[INPUT_IM_INFO]->buffer().as(); + + auto* output_boxes = outputs[OUTPUT_BOXES]->buffer().as(); + auto* output_scores = outputs[OUTPUT_SCORES]->buffer().as(); + auto* output_classes = outputs[OUTPUT_CLASSES]->buffer().as(); + + const float img_H = im_info[0]; + const float img_W = im_info[1]; + + // Apply deltas. 
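[Editor's note] JaccardOverlap and nms_cf above implement the usual greedy NMS: keep candidates above confidence_threshold, sort at most pre_nms_topn of them by score, then accept each box only if its IoU with every already-accepted box does not exceed nms_threshold. A worked call against the JaccardOverlap function above, with coordinates_offset left at its default of 1 (integral pixel boxes):

#include <cassert>

int main() {
    //                x0 y0 x1 y1    x0 y0 x1 y1
    float boxes[] = {  0, 0, 3, 3,    2, 2, 5, 5 };
    float sizes[] = { 16.f, 16.f };        // (3-0+1)*(3-0+1) pixels each
    float iou = JaccardOverlap(boxes, sizes, 0, 1);
    // intersection [2,2,3,3] covers 2*2 = 4 pixels, so
    // IoU = 4 / (16 + 16 - 4) = 1/7, roughly 0.1429
    assert(iou > 0.142f && iou < 0.143f);
    return 0;
}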
+ std::vector refined_boxes(classes_num_ * rois_num * 4, 0); + std::vector refined_scores(classes_num_ * rois_num, 0); + std::vector refined_boxes_areas(classes_num_ * rois_num, 0); + Indexer refined_box_idx({classes_num_, rois_num, 4}); + Indexer refined_score_idx({classes_num_, rois_num}); + + refine_boxes(boxes, deltas, &deltas_weights_[0], scores, + &refined_boxes[0], &refined_boxes_areas[0], &refined_scores[0], + rois_num, classes_num_, + img_H, img_W, + max_delta_log_wh_, + 1.0f); + + // Apply NMS class-wise. + std::vector buffer(rois_num, 0); + std::vector indices(classes_num_ * rois_num, 0); + std::vector detections_per_class(classes_num_, 0); + int total_detections_num = 0; + + for (int class_idx = 1; class_idx < classes_num_; ++class_idx) { + nms_cf(&refined_scores[refined_score_idx({class_idx, 0})], + &refined_boxes[refined_box_idx({class_idx, 0, 0})], + &refined_boxes_areas[refined_score_idx({class_idx, 0})], + &buffer[0], + &indices[total_detections_num], + detections_per_class[class_idx], + rois_num, + -1, + max_detections_per_class_, + score_threshold_, + nms_threshold_); + total_detections_num += detections_per_class[class_idx]; + } + + // Leave only max_detections_per_image_ detections. + // confidence, + std::vector>> conf_index_class_map; + + int indices_offset = 0; + for (int c = 0; c < classes_num_; ++c) { + int n = detections_per_class[c]; + for (int i = 0; i < n; ++i) { + int idx = indices[indices_offset + i]; + float score = refined_scores[refined_score_idx({c, idx})]; + conf_index_class_map.push_back(std::make_pair(score, std::make_pair(c, idx))); + } + indices_offset += n; + } + + assert(max_detections_per_image_ > 0); + if (total_detections_num > max_detections_per_image_) { + std::partial_sort(conf_index_class_map.begin(), + conf_index_class_map.begin() + max_detections_per_image_, + conf_index_class_map.end(), + SortScorePairDescend>); + conf_index_class_map.resize(max_detections_per_image_); + total_detections_num = max_detections_per_image_; + } + + // Fill outputs. 
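[Editor's note] Just above, std::partial_sort caps the result at max_detections_per_image_: after the call the first K elements hold the K highest-scoring (score, (class, index)) pairs in descending order, and resize() discards the unspecified tail. The same idiom on plain floats:

#include <algorithm>
#include <vector>

int main() {
    std::vector<float> scores = {0.2f, 0.9f, 0.4f, 0.7f, 0.1f};
    const std::size_t K = 3;
    std::partial_sort(scores.begin(), scores.begin() + K, scores.end(),
                      [](float a, float b) { return a > b; });   // descending
    scores.resize(K);   // scores == {0.9f, 0.7f, 0.4f}; the tail is dropped
    return 0;
}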
+ memset(output_boxes, 0, max_detections_per_image_ * 4 * sizeof(float)); + memset(output_scores, 0, max_detections_per_image_ * sizeof(float)); + memset(output_classes, 0, max_detections_per_image_ * sizeof(float)); + + int i = 0; + for (const auto & detection : conf_index_class_map) { + float score = detection.first; + int cls = detection.second.first; + int idx = detection.second.second; + output_boxes[4 * i + 0] = refined_boxes[refined_box_idx({cls, idx, 0})]; + output_boxes[4 * i + 1] = refined_boxes[refined_box_idx({cls, idx, 1})]; + output_boxes[4 * i + 2] = refined_boxes[refined_box_idx({cls, idx, 2})]; + output_boxes[4 * i + 3] = refined_boxes[refined_box_idx({cls, idx, 3})]; + output_scores[i] = score; + output_classes[i] = static_cast(cls); + ++i; + } + + return OK; + } + +private: + float score_threshold_; + float nms_threshold_; + float max_delta_log_wh_; + int classes_num_; + int max_detections_per_class_; + int max_detections_per_image_; + bool class_agnostic_box_regression_; + std::vector deltas_weights_; +}; + + + +REG_FACTORY_FOR(ImplFactory, ExperimentalDetectronDetectionOutput); + +} // namespace Cpu +} // namespace Extensions +} // namespace InferenceEngine diff --git a/inference-engine/src/extension/ext_expand.cpp b/inference-engine/src/extension/ext_expand.cpp new file mode 100644 index 0000000..297f586 --- /dev/null +++ b/inference-engine/src/extension/ext_expand.cpp @@ -0,0 +1,192 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ext_list.hpp" +#include "ext_base.hpp" + +#include +#include +#include +#include +#include "ie_parallel.hpp" + +namespace InferenceEngine { +namespace Extensions { +namespace Cpu { + +class ExpandImpl: public ExtLayerBase { +public: + explicit ExpandImpl(const CNNLayer* layer) { + try { + if (layer->insData.empty() || layer->outData.empty()) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!"; + + if (layer->insData.size() != 2) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input edges!"; + + SizeVector shape_dims = layer->insData[EXPAND_SHAPE].lock()->getTensorDesc().getDims(); + if (shape_dims.size() > 1) + THROW_IE_EXCEPTION << layer->name << " Shape vector should be 1 dimension"; + + if (layer->insData[EXPAND_SHAPE].lock()->getTensorDesc().getPrecision() != Precision::I32) + THROW_IE_EXCEPTION << layer->name << " Shape vector should be I32!"; + + if (!(layer->insData[EXPAND_INPUT].lock()->getTensorDesc().getPrecision() == Precision::I32 && + layer->outData[0]->getTensorDesc().getPrecision() == Precision::I32) && + !(layer->insData[EXPAND_INPUT].lock()->getTensorDesc().getPrecision() == Precision::FP32 && + layer->outData[0]->getTensorDesc().getPrecision() == Precision::FP32)) { + THROW_IE_EXCEPTION << layer->name << + " Input and output tensors should have same precision and only FP32 and I32 are supported!"; + } + + src_dims = layer->insData[EXPAND_INPUT].lock()->getTensorDesc().getDims(); + srcStrides = layer->insData[EXPAND_INPUT].lock()->getTensorDesc().getBlockingDesc().getStrides(); + addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, + { DataConfigurator(ConfLayout::PLN) }); + } catch (InferenceEngine::details::InferenceEngineException &ex) { + errorMsg = ex.what(); + } + } + + StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { + int32_t* shape_dims = inputs[EXPAND_SHAPE]->cbuffer().as() + + 
inputs[EXPAND_SHAPE]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + size_t shape_size = (inputs[EXPAND_SHAPE]->getTensorDesc().getDims())[0]; + SizeVector dst_dims = outputs[0]->getTensorDesc().getDims(); + + if (dst_dims.size() != shape_size) { + if (resp) { + std::string errorMsg = "Output tensor dimension mismatch"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return PARAMETER_MISMATCH; + } + + if (src_dims.size() > dst_dims.size()) { + if (resp) { + std::string errorMsg = "Output tensor dimension is smaller then input tensor dimension"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return PARAMETER_MISMATCH; + } + + size_t i; + for (i = 0; i < dst_dims.size(); i++) { + if (static_cast(dst_dims[i]) != shape_dims[i]) { + if (resp) { + std::string errorMsg = "Output tensor dimension size mismatch"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return PARAMETER_MISMATCH; + } + } + + size_t prefix_size = dst_dims.size() - src_dims.size(); + for (i = 0; i < src_dims.size(); i++) { + if (src_dims[i] != 1 && + static_cast(src_dims[i]) != shape_dims[i + prefix_size]) { + if (resp) { + std::string errorMsg = "In/Output corresponding dimension must have the same value, or Input dimension is equal to 1"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return PARAMETER_MISMATCH; + } + } + + InferenceEngine::SizeVector dstStrides = outputs[0]->getTensorDesc().getBlockingDesc().getStrides(); + InferenceEngine::SizeVector src_aligned(dst_dims.size()); + InferenceEngine::SizeVector srcStrides_aligned(dst_dims.size()); + for (i = 0; i < dst_dims.size(); i++) { + if (i < prefix_size) { + src_aligned[i] = 1; + srcStrides_aligned[i] = srcStrides[0]; + } else { + src_aligned[i] = src_dims[i - prefix_size]; + srcStrides_aligned[i] = srcStrides[i - prefix_size]; + } + } + + size_t work_amount_dst = dstStrides[0] * dst_dims[0]; + + switch (outputs[0]->precision()) { + case Precision::FP32: { + const float *src_data = inputs[EXPAND_INPUT]->cbuffer().as() + + inputs[EXPAND_INPUT]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + float* dst_data = outputs[0]->cbuffer().as() + + outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t i, src_idx, start = 0, end = 0; + SizeVector counters(dst_dims.size(), 0); + splitter(work_amount_dst, nthr, ithr, start, end); + for (int j = dst_dims.size() - 1, i = start; j >= 0; j--) { + counters[j] = i % dst_dims[j]; + i /= dst_dims[j]; + } + for (size_t iwork = start; iwork < end; ++iwork) { + for (i = 0, src_idx = 0; i < dst_dims.size(); ++i) + src_idx += counters[i] ? 
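[Editor's note] The Expand kernel above realizes numpy-style broadcasting: the source shape is right-aligned against the destination shape (the prefix_size leading dimensions are treated as size 1), and counters[i] % src_aligned[i] makes every size-1 source dimension keep re-reading the same element. A toy version of that rule, broadcasting shape {1, 3} to {2, 3}:

#include <cassert>
#include <vector>

int main() {
    std::vector<float> src = {10, 20, 30};   // shape {1, 3}
    std::vector<float> dst(2 * 3);           // shape {2, 3}
    for (int n = 0; n < 2; ++n)
        for (int c = 0; c < 3; ++c)
            dst[n * 3 + c] = src[(n % 1) * 3 + c];   // n % 1 == 0: size-1 dim repeats
    assert(dst[3] == 10 && dst[5] == 30);    // dst == {10,20,30, 10,20,30}
    return 0;
}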
((counters[i] % src_aligned[i]) * srcStrides_aligned[i]) : 0; + + dst_data[iwork] = src_data[src_idx]; + + for (int j = dst_dims.size() - 1; j >= 0; j--) { + counters[j] = (counters[j] + 1) % dst_dims[j]; + if (counters[j] != 0) break; + } + } + }); + } + break; + case Precision::I32: { + const int32_t *src_data = inputs[EXPAND_INPUT]->cbuffer().as() + + inputs[EXPAND_INPUT]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + int32_t* dst_data = outputs[0]->cbuffer().as() + + outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t i, src_idx, start = 0, end = 0; + SizeVector counters(dst_dims.size(), 0); + splitter(work_amount_dst, nthr, ithr, start, end); + for (int j = dst_dims.size() - 1, i = start; j >= 0; j--) { + counters[j] = i % dst_dims[j]; + i /= dst_dims[j]; + } + for (size_t iwork = start; iwork < end; ++iwork) { + for (i = 0, src_idx = 0; i < dst_dims.size(); ++i) + src_idx += counters[i] ? ((counters[i] % src_aligned[i]) * srcStrides_aligned[i]) : 0; + + dst_data[iwork] = src_data[src_idx]; + + for (int j = dst_dims.size() - 1; j >= 0; j--) { + counters[j] = (counters[j] + 1) % dst_dims[j]; + if (counters[j] != 0) break; + } + } + }); + } + break; + default: + if (resp) { + std::string errorMsg = "Incorrect output precision. Only FP32 and I32 are supported!"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return GENERAL_ERROR; + } + + return OK; + } + +private: + const size_t EXPAND_INPUT = 0; + const size_t EXPAND_SHAPE = 1; + + SizeVector src_dims; + SizeVector srcStrides; +}; + +REG_FACTORY_FOR(ImplFactory, Expand); + +} // namespace Cpu +} // namespace Extensions +} // namespace InferenceEngine diff --git a/inference-engine/src/extension/ext_fill.cpp b/inference-engine/src/extension/ext_fill.cpp new file mode 100644 index 0000000..aea45e9 --- /dev/null +++ b/inference-engine/src/extension/ext_fill.cpp @@ -0,0 +1,128 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ext_list.hpp" +#include "ext_base.hpp" + +#include +#include +#include +#include +#include "ie_parallel.hpp" + +namespace InferenceEngine { +namespace Extensions { +namespace Cpu { + +class FillImpl: public ExtLayerBase { +public: + explicit FillImpl(const CNNLayer* layer) { + try { + if (layer->insData.empty() || layer->outData.empty()) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!"; + + if (layer->insData.size() != 2) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input edges!"; + + SizeVector fill_dims = layer->insData[FILL_DIMS].lock()->getTensorDesc().getDims(); + if (fill_dims.size() > 1) + THROW_IE_EXCEPTION << layer->name << " Fill dimensions vector should be 1 dimension"; + + if (layer->insData[FILL_DIMS].lock()->getTensorDesc().getPrecision() != Precision::I32) + THROW_IE_EXCEPTION << layer->name << " Fill dimensions vector should be I32!"; + + SizeVector value_dims = layer->insData[FILL_VALUE].lock()->getTensorDesc().getDims(); + if (value_dims.size() > 1) + THROW_IE_EXCEPTION << layer->name << " Value scalar should have 1 dimension"; + + if (!(layer->insData[FILL_VALUE].lock()->getTensorDesc().getPrecision() == Precision::I32 && + layer->outData[0]->getTensorDesc().getPrecision() == Precision::I32) && + !(layer->insData[FILL_VALUE].lock()->getTensorDesc().getPrecision() == Precision::FP32 && + layer->outData[0]->getTensorDesc().getPrecision() == Precision::FP32)) { + THROW_IE_EXCEPTION << layer->name << + " 
'Value' input scalars and output tensor should have same precision and only FP32 and I32 are supported!"; + } + + addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, + { DataConfigurator(ConfLayout::PLN) }); + } catch (InferenceEngine::details::InferenceEngineException &ex) { + errorMsg = ex.what(); + } + } + + StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { + int32_t* fill_dims = inputs[FILL_DIMS]->cbuffer().as() + + inputs[FILL_DIMS]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + size_t fill_size = inputs[FILL_DIMS]->getTensorDesc().getDims()[0]; + SizeVector dst_dims = outputs[0]->getTensorDesc().getDims(); + + if (dst_dims.size() != fill_size) { + if (resp) { + std::string errorMsg = "Output tensor dimension mismatch"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return PARAMETER_MISMATCH; + } + + size_t work_amount_dst = 1; + for (size_t i = 0; i < dst_dims.size(); i++) { + work_amount_dst *= fill_dims[i]; + if (static_cast(dst_dims[i]) != fill_dims[i]) { + if (resp) { + std::string errorMsg = "Output tensor dimension size mismatch"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return PARAMETER_MISMATCH; + } + } + + switch (outputs[0]->precision()) { + case Precision::FP32: { + float* dst_data = outputs[0]->cbuffer().as() + + outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + float value = (inputs[FILL_VALUE]->cbuffer().as() + + inputs[FILL_VALUE]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0]; + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0; + splitter(work_amount_dst, nthr, ithr, start, end); + std::fill_n(dst_data + start, end - start, value); + }); + } + break; + case Precision::I32: { + int32_t* dst_data = outputs[0]->cbuffer().as() + + outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + int32_t value = (inputs[FILL_VALUE]->cbuffer().as() + + inputs[FILL_VALUE]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0]; + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0; + splitter(work_amount_dst, nthr, ithr, start, end); + std::fill_n(dst_data + start, end - start, value); + }); + return OK; + } + break; + default: + if (resp) { + std::string errorMsg = "Incorrect output precision. Only FP32 and I32 are supported!"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return GENERAL_ERROR; + } + + return OK; + } + +private: + const size_t FILL_DIMS = 0; + const size_t FILL_VALUE = 1; +}; + +REG_FACTORY_FOR(ImplFactory, Fill); + +} // namespace Cpu +} // namespace Extensions +} // namespace InferenceEngine diff --git a/inference-engine/src/extension/ext_gather.cpp b/inference-engine/src/extension/ext_gather.cpp index 27ae077..03527ce 100644 --- a/inference-engine/src/extension/ext_gather.cpp +++ b/inference-engine/src/extension/ext_gather.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -18,88 +18,19 @@ namespace InferenceEngine { namespace Extensions { namespace Cpu { -inline void clipping(int *idx, const int min, const int max) { - (*idx) = ((*idx) > min) ? (*idx) : min; - (*idx) = ((*idx) < max) ? 
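[Editor's note] Fill's execute() above is the simplest instance of the ie_parallel pattern used throughout these kernels: parallel_nt launches nthr workers, splitter carves [0, work_amount) into near-equal [start, end) chunks, and each worker runs std::fill_n on its slice. A stand-in for splitter under that assumption; the real implementation lives in ie_parallel.hpp and may balance remainders differently:

#include <algorithm>
#include <cstddef>

// Chunk ithr of nthr over [0, total): ceil-divided so every index is covered once.
void splitter_demo(size_t total, int nthr, int ithr, size_t& start, size_t& end) {
    const size_t chunk = (total + nthr - 1) / nthr;
    start = std::min(total, chunk * static_cast<size_t>(ithr));
    end   = std::min(total, start + chunk);
}

// e.g. total = 10, nthr = 4 gives chunks [0,3) [3,6) [6,9) [9,10)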
(*idx) : (max - 1); - return; -} - -class GatherImpl: public ILayerExecImpl { +class GatherImpl: public ExtLayerBase { public: - StatusCode init(LayerConfig& config, ResponseDesc *resp) noexcept override { - for (auto& input : config.inConfs) { - for (auto& offset : input.desc.getBlockingDesc().getOffsetPaddingToData()) { - if (offset) { - return GENERAL_ERROR; - } - } - } - for (auto& output : config.outConfs) { - for (auto& offset : output.desc.getBlockingDesc().getOffsetPaddingToData()) { - if (offset) { - return GENERAL_ERROR; - } - } - } - - // Check for holes in tensors - SizeVector dictionary_dims = config.inConfs[GATHER_DICTIONARY].desc.getDims(); - SizeVector indexes_dims = config.inConfs[GATHER_INDEXES].desc.getDims(); - SizeVector out_dims = config.outConfs[0].desc.getDims(); - size_t idx_size = 1; - for (auto dims : indexes_dims) - idx_size *= dims; - - size_t dct_size = 1; - for (auto dims : dictionary_dims) - dct_size *= dims; - - size_t out_size = 1; - for (auto dims : out_dims) - out_size *= dims; - - size_t dctSV = config.inConfs[GATHER_DICTIONARY].desc.getBlockingDesc().getStrides()[0]; - size_t dctDV = config.inConfs[GATHER_DICTIONARY].desc.getBlockingDesc().getBlockDims()[0]; - size_t idxSV = config.inConfs[GATHER_INDEXES].desc.getBlockingDesc().getStrides()[0]; - size_t idxDV = config.inConfs[GATHER_INDEXES].desc.getBlockingDesc().getBlockDims()[0]; - size_t outSV = config.outConfs[0].desc.getBlockingDesc().getStrides()[0]; - size_t outDV = config.outConfs[0].desc.getBlockingDesc().getBlockDims()[0]; - if (outSV * outDV == out_size && idxSV * idxDV == idx_size && dctSV * dctDV == dct_size) - withHoles = NONE; - else if (outSV * outDV != out_size && idxSV * idxDV == idx_size && dctSV * dctDV == dct_size) - withHoles = OUTPUT; - - return OK; - }; - - StatusCode getSupportedConfigurations(std::vector& conf, ResponseDesc *resp) noexcept override { - if (!errorMsg.empty()) { - if (resp) { - errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); - } - return GENERAL_ERROR; - } - conf = confs; - return OK; - }; - explicit GatherImpl(const CNNLayer* layer) { try { if (layer->insData.size() != 2 || layer->outData.empty()) - THROW_IE_EXCEPTION << "Incorrect number of input/output edges!"; + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!"; Precision inIdxPrecision = layer->insData[GATHER_INDEXES].lock()->getTensorDesc().getPrecision(); - if (inIdxPrecision != Precision::FP32 && - inIdxPrecision != Precision::I32 && - inIdxPrecision != Precision::U16 && - inIdxPrecision != Precision::I16 && - inIdxPrecision != Precision::U8 && - inIdxPrecision != Precision::I8) - THROW_IE_EXCEPTION << "Incorrect input precision. Only FP32|I32|U16|I16|U8|I8 are supported!"; + if (inIdxPrecision != Precision::FP32 && inIdxPrecision != Precision::I32) + THROW_IE_EXCEPTION << layer->name << " Incorrect input precision. 
Only FP32 or I32 are supported!"; // Remove redundant dimensions const SizeVector& dictionary_dims = layer->insData[GATHER_DICTIONARY].lock()->getTensorDesc().getDims(); - size_t actualAxis = 0; SizeVector dims_actual; for (size_t i = 0; i < dictionary_dims.size(); i++) { if (dictionary_dims[i] > 1) { @@ -110,83 +41,42 @@ public: } if (dims_actual.size() == 0) - THROW_IE_EXCEPTION << "Incorrect input parameters dimension!"; + THROW_IE_EXCEPTION << layer->name << " Incorrect input parameters dimension!"; axis = static_cast(layer->GetParamAsInt("axis")); // Dictionary must be at least rank axis + 1 - if (axis > 0 && (dims_actual.size() - axis) < 1) - THROW_IE_EXCEPTION << "Incorrect input parameters dimensions and axis number!"; + if (axis > 0 && static_cast(dims_actual.size()) < (1 + axis)) + THROW_IE_EXCEPTION << layer->name << " Incorrect input parameters dimensions and axis number!"; else if (axis < 0 && (static_cast(dims_actual.size()) + axis) < 0) - THROW_IE_EXCEPTION << "Incorrect input parameters dimensions and axis number!"; + THROW_IE_EXCEPTION << layer->name << " Incorrect input parameters dimensions and axis number!"; if (axis < 0) axis += dims_actual.size(); // Find number of dictionaries, index range and data length - for (size_t i = 0; i < axis; i++) + for (int i = 0; i < axis; i++) numDictionaries *= dims_actual[i]; indexRange = dims_actual[axis]; for (size_t i = axis + 1; i < dims_actual.size(); i++) dataLength *= dims_actual[i]; if (dataLength == 0) - THROW_IE_EXCEPTION << "Incorrect input parameters dimension!"; - - LayerConfig config; - DataConfig dataConfigIdx, dataConfigDct; - const SizeVector& indexes_dims = layer->insData[GATHER_INDEXES].lock()->getTensorDesc().getDims(); - dataConfigDct.desc = TensorDesc(InferenceEngine::Precision(InferenceEngine::Precision::FP32), dictionary_dims, InferenceEngine::Layout::ANY); - dataConfigIdx.desc = TensorDesc(inIdxPrecision, indexes_dims, InferenceEngine::Layout::ANY); - if (GATHER_DICTIONARY == 0) { - config.inConfs.push_back(dataConfigDct); - config.inConfs.push_back(dataConfigIdx); - } else { - config.inConfs.push_back(dataConfigIdx); - config.inConfs.push_back(dataConfigDct); - } + THROW_IE_EXCEPTION << layer->name << " Incorrect input parameters dimension!"; - DataConfig dataConfigOut; - const SizeVector& out_dims = layer->outData[0]->getTensorDesc().getDims(); - SizeVector blocks = out_dims; - SizeVector order(blocks.size()); - SizeVector dimOffsets(blocks.size()); - SizeVector strides(blocks.size()); - size_t offset(std::numeric_limits::max()); - for (size_t i = 0; i < order.size(); i++) { - strides[i] = std::numeric_limits::max(); - dimOffsets[i] = 0; - order[i] = i; - } - dataConfigOut.desc = TensorDesc(InferenceEngine::Precision(InferenceEngine::Precision::FP32), out_dims, - { blocks, order, offset, dimOffsets, strides }); - config.outConfs.push_back(dataConfigOut); - config.dynBatchSupport = false; - confs.push_back(config); + addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, + { DataConfigurator(ConfLayout::PLN) }); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } } - StatusCode execute(std::vector& inputs, std::vector& outputs, - ResponseDesc *resp) noexcept override { + StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { switch (inputs[GATHER_INDEXES]->precision()) { case Precision::FP32: - gather(inputs[GATHER_INDEXES]->cbuffer().as(), inputs[GATHER_INDEXES], inputs[GATHER_DICTIONARY], 
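[Editor's note] The rewritten Gather constructor above reduces the operation to three numbers: the dictionary is viewed as [numDictionaries, indexRange, dataLength], that is, the dims before axis collapsed, the axis itself, and the dims after it collapsed. Every gathered element then copies one dataLength-long slice. A reference loop over that layout, assuming indices are already validated (the kernel below zero-fills out-of-range indices instead):

#include <cstddef>

void gather_ref(const float* dict, const int* idx, float* out,
                size_t numDict, size_t indexRange, size_t dataLen, size_t idxSize) {
    for (size_t j = 0; j < numDict; ++j)
        for (size_t i = 0; i < idxSize; ++i)
            for (size_t k = 0; k < dataLen; ++k)
                out[(j * idxSize + i) * dataLen + k] =
                    dict[(j * indexRange + static_cast<size_t>(idx[i])) * dataLen + k];
}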
outputs[0], withHoles); + gather(inputs[GATHER_INDEXES]->cbuffer().as(), inputs[GATHER_INDEXES], inputs[GATHER_DICTIONARY], outputs[0]); break; case Precision::I32: - gather(inputs[GATHER_INDEXES]->cbuffer().as(), inputs[GATHER_INDEXES], inputs[GATHER_DICTIONARY], outputs[0], withHoles); - break; - case Precision::U16: - gather(inputs[GATHER_INDEXES]->cbuffer().as(), inputs[GATHER_INDEXES], inputs[GATHER_DICTIONARY], outputs[0], withHoles); - break; - case Precision::I16: - gather(inputs[GATHER_INDEXES]->cbuffer().as(), inputs[GATHER_INDEXES], inputs[GATHER_DICTIONARY], outputs[0], withHoles); - break; - case Precision::U8: - gather(inputs[GATHER_INDEXES]->cbuffer().as(), inputs[GATHER_INDEXES], inputs[GATHER_DICTIONARY], outputs[0], withHoles); - break; - case Precision::I8: - gather(inputs[GATHER_INDEXES]->cbuffer().as(), inputs[GATHER_INDEXES], inputs[GATHER_DICTIONARY], outputs[0], withHoles); + gather(inputs[GATHER_INDEXES]->cbuffer().as(), inputs[GATHER_INDEXES], inputs[GATHER_DICTIONARY], outputs[0]); break; default: return GENERAL_ERROR; @@ -195,20 +85,9 @@ public: return OK; } -protected: - enum class ConfLayout { ANY, PLN, BLK8, BLK16 }; - std::string errorMsg; - std::vector confs; - private: - enum HolesMode { - NONE = 0, - OUTPUT = 1, - ALL = 2 - }; - template - void gather(data_t *src_dataIdx, Blob::Ptr indexes, Blob::Ptr dictionary, Blob::Ptr output, bool withHoles); + void gather(data_t *src_dataIdx, Blob::Ptr indexes, Blob::Ptr dictionary, Blob::Ptr output); int axis = 0; size_t numDictionaries = 1; @@ -216,82 +95,46 @@ private: size_t dataLength = 1; const size_t GATHER_DICTIONARY = 0; const size_t GATHER_INDEXES = 1; - HolesMode withHoles = ALL; }; template -void GatherImpl::gather(data_t *src_dataIdx, Blob::Ptr indexes, Blob::Ptr dictionary, Blob::Ptr output, bool withHoles) { +void GatherImpl::gather(data_t *src_dataIdx, Blob::Ptr indexes, Blob::Ptr dictionary, Blob::Ptr output) { size_t src_dataIdxSize = indexes->size(); - size_t dataSize = sizeof(float) * dataLength; - - if (withHoles == GatherImpl::NONE) { // No holes in tensors - const float *src_dataDict = dictionary->cbuffer().as() + dictionary->getTensorDesc().getBlockingDesc().getOffsetPadding(); - float* dst_data = output->cbuffer().as() + output->getTensorDesc().getBlockingDesc().getOffsetPadding(); - src_dataIdx += indexes->getTensorDesc().getBlockingDesc().getOffsetPadding(); + const float *src_dataDict = dictionary->cbuffer().as() + dictionary->getTensorDesc().getBlockingDesc().getOffsetPadding(); + float* dst_data = output->cbuffer().as() + output->getTensorDesc().getBlockingDesc().getOffsetPadding(); + src_dataIdx += indexes->getTensorDesc().getBlockingDesc().getOffsetPadding(); - if (axis == 0) { - parallel_for(src_dataIdxSize, [&](size_t i) { - int idx = static_cast(src_dataIdx[i]); - - // Index clipping - clipping(&idx, 0, indexRange); + if (axis == 0) { + parallel_for(src_dataIdxSize, [&](size_t i) { + unsigned int idx = static_cast(src_dataIdx[i]); + // Index clipping + if (idx < indexRange) { // Copying data to destination from Dictionary - simple_copy(&dst_data[dataLength * i], + simple_copy(&dst_data[i * dataLength], output->byteSize() - (dataLength * i), &src_dataDict[dataLength * idx], - dataSize); - }); - } else { - parallel_for(src_dataIdxSize, [&](size_t i) { - int idx = static_cast(src_dataIdx[i]); - - // Index clipping - clipping(&idx, 0, indexRange); + sizeof(float) * dataLength); + } else { + std::fill_n(&dst_data[i * dataLength], dataLength, 0.f); + } + }); + } else { + 
parallel_for(src_dataIdxSize, [&](size_t i) { + unsigned int idx = static_cast(src_dataIdx[i]); + // Index clipping + if (idx < indexRange) { // Copying data to destination from Dictionary for (size_t j = 0; j < numDictionaries; j++) { simple_copy(&dst_data[dataLength * (i + j * src_dataIdxSize)], output->byteSize() - (dataLength * (i + j * src_dataIdxSize)), &src_dataDict[dataLength * (idx + j * indexRange)], - dataSize); + sizeof(float) * dataLength); } - }); - } - } else if (withHoles == GatherImpl::OUTPUT) { // If only output tensor have holes - const float *src_dataDict = dictionary->cbuffer().as() + dictionary->getTensorDesc().getBlockingDesc().getOffsetPadding(); - float* dst_data = output->cbuffer().as(); - src_dataIdx += indexes->getTensorDesc().getBlockingDesc().getOffsetPadding(); - - parallel_for(src_dataIdxSize, [&](size_t i) { - int idx = static_cast(src_dataIdx[i]); - - // Index clipping - clipping(&idx, 0, indexRange); - - // Copying data to destination from Dictionary - for (size_t j = 0; j < numDictionaries; j++) { - for (size_t k = 0; k < dataLength; k++) { - dst_data[output->getTensorDesc().offset(k + dataLength * (i + j * src_dataIdxSize))] = - src_dataDict[k + dataLength * (idx + j * indexRange)]; - } - } - }); - } else { // If input and oupput tensors have holes - const float *src_dataDict = dictionary->cbuffer().as(); - float* dst_data = output->cbuffer().as(); - - parallel_for(src_dataIdxSize, [&](size_t i) { - int idx = static_cast(src_dataIdx[indexes->getTensorDesc().offset(i)]); - - // Index clipping - clipping(&idx, 0, indexRange); - - // Copying data to destination from Dictionary - for (size_t j = 0; j < numDictionaries; j++) { - for (size_t k = 0; k < dataLength; k++) { - dst_data[output->getTensorDesc().offset(k + dataLength * (i + j * src_dataIdxSize))] = - src_dataDict[dictionary->getTensorDesc().offset(k + dataLength * (idx + j * indexRange))]; + } else { + for (size_t j = 0; j < numDictionaries; j++) { + std::fill_n(&dst_data[dataLength * (i + j * src_dataIdxSize)], dataLength, 0.f); } } }); diff --git a/inference-engine/src/extension/ext_grn.cpp b/inference-engine/src/extension/ext_grn.cpp index 4810d9d..87869f7 100644 --- a/inference-engine/src/extension/ext_grn.cpp +++ b/inference-engine/src/extension/ext_grn.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -48,7 +48,7 @@ public: } variance = std::pow(variance + bias, 0.5f); for (int c = 0; c < C; c++) { - dst_data[b*C*H*W + c*H*W + h*W + w] = src_data[b*C*H*W + c*H*W + h*W + w] / variance; + dst_data[b*C*H*W + c*H*W + h*W + w] = src_data[b*C*H*W + c*H*W + h*W + w] / static_cast(variance); } }); return OK; diff --git a/inference-engine/src/extension/ext_interp.cpp b/inference-engine/src/extension/ext_interp.cpp index 64ff20d..3b3b684 100644 --- a/inference-engine/src/extension/ext_interp.cpp +++ b/inference-engine/src/extension/ext_interp.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/extension/ext_list.cpp b/inference-engine/src/extension/ext_list.cpp index 6aa139d..89058be 100644 --- a/inference-engine/src/extension/ext_list.cpp +++ b/inference-engine/src/extension/ext_list.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -31,8 +31,8 @@ void 
CpuExtensions::AddShapeInferImpl(std::string name, const IShapeInferImpl::P void CpuExtensions::GetVersion(const Version*& versionInfo) const noexcept { static Version ExtensionDescription = { - { 1, 0 }, // extension API version - "1.0", + { 1, 6 }, // extension API version + "1.6", "ie-cpu-ext" // extension description message }; diff --git a/inference-engine/src/extension/ext_list.hpp b/inference-engine/src/extension/ext_list.hpp index 6e83e7e..08f6235 100644 --- a/inference-engine/src/extension/ext_list.hpp +++ b/inference-engine/src/extension/ext_list.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/extension/ext_mvn.cpp b/inference-engine/src/extension/ext_mvn.cpp index 27f8b9f..7c09e53 100644 --- a/inference-engine/src/extension/ext_mvn.cpp +++ b/inference-engine/src/extension/ext_mvn.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -31,8 +31,8 @@ public: if (layer->insData.size() != 1 || layer->outData.empty()) THROW_IE_EXCEPTION << "Incorrect number of input/output edges!"; - across_channels = static_cast(layer->GetParamAsInt("across_channels")); - normalize_variance = static_cast(layer->GetParamAsInt("normalize_variance")); + across_channels = layer->GetParamAsBool("across_channels", false); + normalize_variance = layer->GetParamAsBool("normalize_variance", false); eps = layer->GetParamAsFloat("eps"); #if defined(HAVE_AVX512F) @@ -87,7 +87,7 @@ void MVNImpl::mvn_pln(const float* src_data, float* dst_data, const SizeVector& size_t cb = b * C3; if (across_channels) { double mean = 0.0; - mean = parallel_sum(C, mean, [&](int c)->double { + mean = parallel_sum(C, mean, [&](size_t c)->double { double mean_internal = 0.0; size_t cc = cb + c * C2; for (size_t d = 0lu; d < D; d++) { @@ -111,7 +111,7 @@ void MVNImpl::mvn_pln(const float* src_data, float* dst_data, const SizeVector& size_t ch = cd + h * W; for (size_t w = 0lu; w < W; w++) { size_t cw = ch + w; - dst_data[cw] = src_data[cw] - mean; + dst_data[cw] = src_data[cw] - static_cast(mean); } } } @@ -138,7 +138,7 @@ void MVNImpl::mvn_pln(const float* src_data, float* dst_data, const SizeVector& size_t ch = cd + h * W; for (size_t w = 0lu; w < W; w++) { size_t cw = ch + w; - dst_data[cw] = src_data[cw] - mean; + dst_data[cw] = src_data[cw] - static_cast(mean); } } } @@ -152,7 +152,7 @@ void MVNImpl::mvn_pln(const float* src_data, float* dst_data, const SizeVector& size_t cb = b * C3; if (across_channels) { double variance = 0.0; - variance = parallel_sum(C, variance, [&](int c)->double { + variance = parallel_sum(C, variance, [&](size_t c)->double { double variance_internal = 0.0; size_t cc = cb + c * C2; for (size_t d = 0lu; d < D; d++) { @@ -177,7 +177,7 @@ void MVNImpl::mvn_pln(const float* src_data, float* dst_data, const SizeVector& for (size_t h = 0lu; h < H; h++) { size_t ch = cd + h * W; for (size_t w = 0lu; w < W; w++) { - dst_data[ch + w] /= variance; + dst_data[ch + w] /= static_cast(variance); } } } @@ -204,7 +204,7 @@ void MVNImpl::mvn_pln(const float* src_data, float* dst_data, const SizeVector& for (size_t h = 0lu; h < H; h++) { size_t ch = cd + h * W; for (size_t w = 0lu; w < W; w++) { - dst_data[ch + w] /= variance; + dst_data[ch + w] /= static_cast(variance); } } } @@ -233,13 +233,12 @@ void MVNImpl::mvn_blk(const float* src_data, float* dst_data, const SizeVector& size_t H = 
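// A simplified single-channel sketch of the MVN math above, showing why the
// patch accumulates mean and variance in double and only narrows to float on
// the final write (assumption: eps is added to the variance as in mvn_pln):
#include <cmath>
#include <cstddef>

void mvn_channel_sketch(const float* src, float* dst, size_t n, float eps) {
    double mean = 0.0;
    for (size_t i = 0; i < n; ++i) mean += src[i];
    mean /= static_cast<double>(n);

    double variance = 0.0;
    for (size_t i = 0; i < n; ++i)
        variance += std::pow(static_cast<double>(src[i]) - mean, 2);
    variance = std::pow(variance / static_cast<double>(n) + eps, 0.5);

    for (size_t i = 0; i < n; ++i)
        dst[i] = static_cast<float>((src[i] - mean) / variance);
}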
(dims_size > 3) ? dims[dims_size - 2] : 1lu; size_t W = (dims_size > 2) ? dims[dims_size - 1] : 1lu; - int CB = div_up(C, static_cast(blk_size)); + int CB = div_up(static_cast(C), static_cast(blk_size)); size_t C0 = W * blk_size; size_t C1 = C0 * H; size_t C2 = C1 * D; size_t C3 = C2 * CB; - size_t C4 = D * H * W; size_t C5 = C * D * H * W; if (normalize_variance) { @@ -265,9 +264,8 @@ void MVNImpl::mvn_blk(const float* src_data, float* dst_data, const SizeVector& double variance = 0.0; variance = parallel_sum3d(CB, D, H, variance, [&](size_t cb, size_t d, size_t h)->double { size_t ccbd = ccb + cb * C2 + d * C1 + h * C0; - size_t min_cb = std::min(blk_size, C - cb * blk_size); double variance_internal = 0.0; - for (size_t w = 0lu; w < W; w++) { + for (size_t w = 0lu, min_cb = std::min(blk_size, C - cb * blk_size); w < W; w++) { size_t cw = ccbd + w * blk_size; for (size_t c = 0lu; c < min_cb; c++) { variance_internal += std::pow(static_cast(src_data[cw + c]) - mean, 2); @@ -282,19 +280,17 @@ void MVNImpl::mvn_blk(const float* src_data, float* dst_data, const SizeVector& parallel_for3d(CB, D, H, [&](size_t cb, size_t d, size_t h) { size_t ccbd = ccb + cb * C2 + d * C1 + h * C0; - size_t min_cb = std::min(blk_size, C - cb * blk_size); - for (size_t w = 0lu; w < W; w++) { + for (size_t w = 0lu, min_cb = std::min(blk_size, C - cb * blk_size); w < W; w++) { size_t cw = ccbd + w * blk_size; for (size_t c = 0lu; c < min_cb; c++) { size_t src_offset = cw + c; - dst_data[src_offset] = (static_cast(src_data[src_offset]) - mean) / variance; + dst_data[src_offset] = static_cast((static_cast(src_data[src_offset]) - mean) / variance); } } }); } else { parallel_for(CB, [&](size_t cb) { - size_t min_cb = std::min(blk_size, C - cb * blk_size); size_t src_off = ccb + cb * C2; #if defined(HAVE_AVX2) || defined(HAVE_AVX512F) vec_type vmean = _mm_uni_setzero_ps(); @@ -344,6 +340,7 @@ void MVNImpl::mvn_blk(const float* src_data, float* dst_data, const SizeVector& } } #else + size_t min_cb = std::min(blk_size, C - cb * blk_size); for (size_t c = 0; c < min_cb; c++) { size_t cc = src_off + c; @@ -358,6 +355,7 @@ void MVNImpl::mvn_blk(const float* src_data, float* dst_data, const SizeVector& } } + size_t C4 = D * H * W; mean /= static_cast(C4); double variance = 0.0; @@ -382,7 +380,7 @@ void MVNImpl::mvn_blk(const float* src_data, float* dst_data, const SizeVector& size_t ch = cd + h * C0; for (size_t w = 0lu; w < W; w++) { size_t index = ch + w * blk_size; - dst_data[index] = (src_data[index] - mean) / variance; + dst_data[index] = (src_data[index] - static_cast(mean)) / static_cast(variance); } } } @@ -398,9 +396,8 @@ void MVNImpl::mvn_blk(const float* src_data, float* dst_data, const SizeVector& double mean = 0.0; mean = parallel_sum3d(CB, D, H, mean, [&](size_t cb, size_t d, size_t h)->double { size_t ccbd = ccb + cb * C2 + d * C1 + h * C0; - size_t min_cb = std::min(blk_size, C - cb * blk_size); double mean_internal = 0.f; - for (size_t w = 0lu; w < W; w++) { + for (size_t w = 0lu, min_cb = std::min(blk_size, C - cb * blk_size); w < W; w++) { size_t cw = ccbd + w * blk_size; for (size_t c = 0lu; c < min_cb; c++) { mean_internal += src_data[cw + c]; @@ -413,19 +410,17 @@ void MVNImpl::mvn_blk(const float* src_data, float* dst_data, const SizeVector& parallel_for3d(CB, D, H, [&](size_t cb, size_t d, size_t h) { size_t ccbd = ccb + cb * C2 + d * C1 + h * C0; - size_t min_cb = std::min(blk_size, C - cb * blk_size); - for (size_t w = 0lu; w < W; w++) { + for (size_t w = 0lu, min_cb = std::min(blk_size, C - cb * 
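// Sketch of the blocked-channel iteration pattern mvn_blk relies on: C is
// split into CB = div_up(C, blk_size) blocks and the last block may be
// partial, hence the recurring min_cb = std::min(blk_size, C - cb * blk_size)
// that the patch now computes inside each loop (hypothetical helper names):
#include <algorithm>
#include <cstddef>

inline size_t div_up_sketch(size_t a, size_t b) { return (a + b - 1) / b; }

void for_each_blocked_channel(size_t C, size_t blk_size) {
    const size_t CB = div_up_sketch(C, blk_size);
    for (size_t cb = 0; cb < CB; ++cb) {
        const size_t min_cb = std::min(blk_size, C - cb * blk_size);
        for (size_t c = 0; c < min_cb; ++c) {
            // process channel cb * blk_size + c
        }
    }
}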
blk_size); w < W; w++) { size_t cw = ccbd + w * blk_size; for (size_t c = 0lu; c < min_cb; c++) { size_t src_offset = cw + c; - dst_data[src_offset] = src_data[src_offset] - mean; + dst_data[src_offset] = src_data[src_offset] - static_cast(mean); } } }); } else { parallel_for(CB, [&](size_t cb) { - size_t min_cb = std::min(blk_size, C - cb * blk_size); size_t src_off = ccb + cb * C2; #if defined(HAVE_AVX2) || defined(HAVE_AVX512F) vec_type vmean = _mm_uni_setzero_ps(); @@ -455,6 +450,7 @@ void MVNImpl::mvn_blk(const float* src_data, float* dst_data, const SizeVector& } } #else + size_t min_cb = std::min(blk_size, C - cb * blk_size); for (size_t c = 0lu; c < min_cb; c++) { size_t cc = src_off + c; double mean = 0.0; @@ -468,6 +464,7 @@ void MVNImpl::mvn_blk(const float* src_data, float* dst_data, const SizeVector& } } + size_t C4 = D * H * W; mean /= static_cast(C4); for (size_t d = 0lu; d < D; d++) { @@ -476,7 +473,7 @@ void MVNImpl::mvn_blk(const float* src_data, float* dst_data, const SizeVector& size_t ch = cd + h * C0; for (size_t w = 0lu; w < W; w++) { size_t index = ch + w * blk_size; - dst_data[index] = src_data[index] - mean; + dst_data[index] = src_data[index] - static_cast(mean); } } } diff --git a/inference-engine/src/extension/ext_normalize.cpp b/inference-engine/src/extension/ext_normalize.cpp index 0c77e3e..448d0cb 100644 --- a/inference-engine/src/extension/ext_normalize.cpp +++ b/inference-engine/src/extension/ext_normalize.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -31,8 +31,8 @@ public: weights = std::dynamic_pointer_cast>(layer->blobs.at("weights")); if (!weights) THROW_IE_EXCEPTION << layer->name << " weights is empty!"; - across_spatial = static_cast(layer->GetParamAsInt("across_spatial")); - channel_shared = static_cast(layer->GetParamAsInt("channel_shared")); + across_spatial = layer->GetParamAsBool("across_spatial", false); + channel_shared = layer->GetParamAsBool("channel_shared", false); eps = layer->GetParamAsFloat("eps"); addConfig(layer, {{ConfLayout::PLN, false, 0}}, {{ConfLayout::PLN, false, 0}}, true); @@ -83,9 +83,6 @@ public: const int H = static_cast(dims.size() > 2 ? dims[2] : 1); const int W = static_cast(dims.size() > 3 ? 
dims[3] : 1); - const int HW = H*W; - const int CHW = C*HW; - for (int n = 0; n < N; n++) { const float* psrc = src + n*C*H*W; float* pdst = dst + n*C*H*W; @@ -220,7 +217,7 @@ private: bool across_spatial = true; bool channel_shared = true; - float eps = 1e-10; + float eps = 1e-10f; }; REG_FACTORY_FOR(ImplFactory, Normalize); diff --git a/inference-engine/src/extension/ext_pad.cpp b/inference-engine/src/extension/ext_pad.cpp index 102db13..255e1ad 100644 --- a/inference-engine/src/extension/ext_pad.cpp +++ b/inference-engine/src/extension/ext_pad.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -20,7 +20,7 @@ public: explicit PadImpl(const CNNLayer* layer) { try { if (layer->insData.empty() || layer->outData.empty()) - THROW_IE_EXCEPTION << "Incorrect number of input/output edges!"; + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!"; pads_begin = layer->GetParamAsUInts("pads_begin"); std::vector pads_end = layer->GetParamAsUInts("pads_end"); @@ -28,7 +28,7 @@ public: src_dims = layer->insData[0].lock()->getTensorDesc().getDims(); dst_dims = layer->outData[0]->getTensorDesc().getDims(); if (src_dims.size() != dst_dims.size() || pads_begin.size() != src_dims.size()) - THROW_IE_EXCEPTION << "Incorrect number of input/output dimensions!"; + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output dimensions!"; std::string pad_mode = layer->GetParamAsString("pad_mode"); if (pad_mode == "constant") { diff --git a/inference-engine/src/extension/ext_powerfile.cpp b/inference-engine/src/extension/ext_powerfile.cpp index f3666b2..ff3fe0f 100644 --- a/inference-engine/src/extension/ext_powerfile.cpp +++ b/inference-engine/src/extension/ext_powerfile.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/extension/ext_priorbox.cpp b/inference-engine/src/extension/ext_priorbox.cpp index 8b948ef..d1cb195 100644 --- a/inference-engine/src/extension/ext_priorbox.cpp +++ b/inference-engine/src/extension/ext_priorbox.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -8,6 +8,7 @@ #include #include #include +#include namespace InferenceEngine { namespace Extensions { @@ -28,9 +29,9 @@ public: _step = layer->GetParamAsFloat("step", 0); _min_sizes = layer->GetParamAsFloats("min_size", {}); _max_sizes = layer->GetParamAsFloats("max_size", {}); - _flip = static_cast(layer->GetParamAsInt("flip")); - _clip = static_cast(layer->GetParamAsInt("clip")); - _scale_all_sizes = static_cast(layer->GetParamAsInt("scale_all_sizes", 1)); + _flip = layer->GetParamAsBool("flip", false); + _clip = layer->GetParamAsBool("clip", false); + _scale_all_sizes = layer->GetParamAsBool("scale_all_sizes", true); bool exist; @@ -41,6 +42,10 @@ public: for (float aspect_ratio : aspect_ratios) { exist = false; + if (std::fabs(aspect_ratio) < std::numeric_limits::epsilon()) { + THROW_IE_EXCEPTION << "aspect_ratio param can't be equal to zero"; + } + for (float _aspect_ratio : _aspect_ratios) { if (fabs(aspect_ratio - _aspect_ratio) < 1e-6) { exist = true; @@ -91,6 +96,10 @@ public: } } + StatusCode init(LayerConfig& config, ResponseDesc *resp) noexcept override { + return OK; + } + StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept 
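// The new PriorBox guard above avoids a later division by a zero aspect
// ratio by comparing against machine epsilon rather than == 0.0f; a
// standalone equivalent (std::runtime_error stands in for THROW_IE_EXCEPTION):
#include <cmath>
#include <limits>
#include <stdexcept>

void check_aspect_ratio(float aspect_ratio) {
    if (std::fabs(aspect_ratio) < std::numeric_limits<float>::epsilon())
        throw std::runtime_error("aspect_ratio param can't be equal to zero");
}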
override { if (inputs.size() != 2 || outputs.empty()) { diff --git a/inference-engine/src/extension/ext_priorbox_clustered.cpp b/inference-engine/src/extension/ext_priorbox_clustered.cpp index 69807a9..40fd273 100644 --- a/inference-engine/src/extension/ext_priorbox_clustered.cpp +++ b/inference-engine/src/extension/ext_priorbox_clustered.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -39,12 +39,16 @@ public: } } + StatusCode init(LayerConfig& config, ResponseDesc *resp) noexcept override { + return OK; + } + StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { int num_priors_ = widths_.size(); if (variance_.empty()) - variance_.push_back(0.1); + variance_.push_back(0.1f); // Execute const int layer_width = inputs[0]->getTensorDesc().getDims()[3]; @@ -73,10 +77,10 @@ public: float box_width = widths_[s]; float box_height = heights_[s]; - float xmin = (center_x - box_width / 2.) / img_width; - float ymin = (center_y - box_height / 2.) / img_height; - float xmax = (center_x + box_width / 2.) / img_width; - float ymax = (center_y + box_height / 2.) / img_height; + float xmin = (center_x - box_width / 2.0f) / img_width; + float ymin = (center_y - box_height / 2.0f) / img_height; + float xmax = (center_x + box_width / 2.0f) / img_width; + float ymax = (center_y + box_height / 2.0f) / img_height; if (clip_) { xmin = std::min(std::max(xmin, 0.0f), 1.0f); diff --git a/inference-engine/src/extension/ext_priorgridgenerator_onnx.cpp b/inference-engine/src/extension/ext_priorgridgenerator_onnx.cpp new file mode 100644 index 0000000..a8e668b --- /dev/null +++ b/inference-engine/src/extension/ext_priorgridgenerator_onnx.cpp @@ -0,0 +1,97 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ext_list.hpp" +#include "ext_base.hpp" +#include +#include +#include + +namespace InferenceEngine { +namespace Extensions { +namespace Cpu { + +const int INPUT_PRIORS {0}; +const int INPUT_FEATUREMAP {1}; +const int INPUT_IMAGE {2}; + +const int OUTPUT_ROIS {0}; + +class ExperimentalDetectronPriorGridGeneratorImpl: public ExtLayerBase { +private: + // Inputs: + // priors, shape [n, 4] + // [feature_map], shape [b, c, h, w] + // [im_data], shape [b, 3, im_h, im_w] + // Outputs: + // priors_grid, shape [m, 4] + +public: + explicit ExperimentalDetectronPriorGridGeneratorImpl(const CNNLayer* layer) { + try { + if (layer->insData.size() > 3 || layer->outData.empty()) + THROW_IE_EXCEPTION << "Incorrect number of input/output edges!"; + + if (layer->insData[INPUT_PRIORS].lock()->dims.size() != 2 || + (layer->insData.size() > INPUT_FEATUREMAP && + layer->insData[INPUT_FEATUREMAP].lock()->dims.size() != 4) || + (layer->insData.size() > INPUT_IMAGE && + layer->insData[INPUT_IMAGE].lock()->dims.size() != 4)) + THROW_IE_EXCEPTION << "Unsupported shape of input blobs!"; + + grid_w_ = layer->GetParamAsInt("w", 0); + grid_h_ = layer->GetParamAsInt("h", 0); + stride_h_ = layer->GetParamAsFloat("stride_y", 0); + stride_w_ = layer->GetParamAsFloat("stride_x", 0); + + addConfig(layer, + {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)}, + {DataConfigurator(ConfLayout::PLN)}); + } catch (InferenceEngine::details::InferenceEngineException &ex) { + errorMsg = ex.what(); + } + } + + StatusCode execute(std::vector& inputs, std::vector& outputs, + ResponseDesc *resp) noexcept override { + 
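// Sketch of the clustered prior-box corner math above, with the explicit
// 2.0f float literals the patch introduces (clipping to [0, 1] omitted):
struct BoxSketch { float xmin, ymin, xmax, ymax; };

BoxSketch make_prior(float center_x, float center_y,
                     float box_width, float box_height,
                     float img_width, float img_height) {
    return {(center_x - box_width  / 2.0f) / img_width,
            (center_y - box_height / 2.0f) / img_height,
            (center_x + box_width  / 2.0f) / img_width,
            (center_y + box_height / 2.0f) / img_height};
}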
const int num_priors_ = inputs[INPUT_PRIORS]->getTensorDesc().getDims()[0]; + assert(inputs[INPUT_PRIORS]->getTensorDesc().getDims()[1] == 4); + + // Execute + const int layer_width = grid_w_ ? grid_w_ : inputs[INPUT_FEATUREMAP]->getTensorDesc().getDims()[3]; + const int layer_height = grid_h_ ? grid_h_ : inputs[INPUT_FEATUREMAP]->getTensorDesc().getDims()[2]; + const float step_w = stride_w_ ? stride_w_ : static_cast(inputs[INPUT_IMAGE]->getTensorDesc().getDims()[3]) / layer_width; + const float step_h = stride_h_ ? stride_h_ : static_cast(inputs[INPUT_IMAGE]->getTensorDesc().getDims()[2]) / layer_height; + + const auto *bottom_data_0 = inputs[0]->buffer().as(); + auto *top_data_0 = outputs[OUTPUT_ROIS]->buffer().as(); + + for (int h = 0; h < layer_height; ++h) { + for (int w = 0; w < layer_width; ++w) { + for (int s = 0; s < num_priors_; ++s) { + top_data_0[0] = bottom_data_0[4 * s + 0] + step_w * (w + 0.5f); + top_data_0[1] = bottom_data_0[4 * s + 1] + step_h * (h + 0.5f); + top_data_0[2] = bottom_data_0[4 * s + 2] + step_w * (w + 0.5f); + top_data_0[3] = bottom_data_0[4 * s + 3] + step_h * (h + 0.5f); + top_data_0 += 4; + } + } + } + + return OK; + } + +private: + int grid_w_; + int grid_h_; + float stride_w_; + float stride_h_; +}; + + +REG_FACTORY_FOR(ImplFactory, ExperimentalDetectronPriorGridGenerator); + +} // namespace Cpu +} // namespace Extensions +} // namespace InferenceEngine diff --git a/inference-engine/src/extension/ext_proposal.cpp b/inference-engine/src/extension/ext_proposal.cpp index 2f93b05..e431d49 100644 --- a/inference-engine/src/extension/ext_proposal.cpp +++ b/inference-engine/src/extension/ext_proposal.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -74,7 +74,7 @@ void enumerate_proposals_cpu(const float* bottom4d, const float* d_anchor4d, con const int bottom_W, const float img_H, const float img_W, const float min_box_H, const float min_box_W, const int feat_stride, const float box_coordinate_scale, const float box_size_scale, - float coordinates_offset, bool initial_clip, bool swap_xy) { + float coordinates_offset, bool initial_clip, bool swap_xy, bool clip_before_nms) { const int bottom_area = bottom_H * bottom_W; const float* p_anchors_wm = anchors + 0 * num_anchors; @@ -83,8 +83,8 @@ void enumerate_proposals_cpu(const float* bottom4d, const float* d_anchor4d, con const float* p_anchors_hp = anchors + 3 * num_anchors; parallel_for2d(bottom_H, bottom_W, [&](size_t h, size_t w) { - const float x = (swap_xy ? h : w) * feat_stride; - const float y = (swap_xy ? w : h) * feat_stride; + const float x = static_cast((swap_xy ? h : w) * feat_stride); + const float y = static_cast((swap_xy ? 
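// Standalone sketch of the prior-grid expansion in the execute() above:
// every input prior is replicated at each (w, h) cell of the feature map,
// shifted to the cell center (illustrative only, no blob plumbing):
void expand_prior_grid(const float* priors, int num_priors,
                       int grid_h, int grid_w,
                       float step_w, float step_h, float* out) {
    for (int h = 0; h < grid_h; ++h)
        for (int w = 0; w < grid_w; ++w)
            for (int s = 0; s < num_priors; ++s) {
                out[0] = priors[4 * s + 0] + step_w * (w + 0.5f);
                out[1] = priors[4 * s + 1] + step_h * (h + 0.5f);
                out[2] = priors[4 * s + 2] + step_w * (w + 0.5f);
                out[3] = priors[4 * s + 3] + step_h * (h + 0.5f);
                out += 4;
            }
}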
w : h) * feat_stride); const float* p_box = d_anchor4d + h * bottom_W + w; const float* p_score = bottom4d + h * bottom_W + w; @@ -135,10 +135,12 @@ void enumerate_proposals_cpu(const float* bottom4d, const float* d_anchor4d, con y1 = pred_ctr_y + 0.5f * pred_h; // adjust new corner locations to be within the image region, - x0 = std::max(0.0f, std::min(x0, img_W - coordinates_offset)); - y0 = std::max(0.0f, std::min(y0, img_H - coordinates_offset)); - x1 = std::max(0.0f, std::min(x1, img_W - coordinates_offset)); - y1 = std::max(0.0f, std::min(y1, img_H - coordinates_offset)); + if (clip_before_nms) { + x0 = std::max(0.0f, std::min(x0, img_W - coordinates_offset)); + y0 = std::max(0.0f, std::min(y0, img_H - coordinates_offset)); + x1 = std::max(0.0f, std::min(x1, img_W - coordinates_offset)); + y1 = std::max(0.0f, std::min(y1, img_H - coordinates_offset)); + } // recompute new width & height const float box_w = x1 - x0 + coordinates_offset; @@ -290,7 +292,8 @@ static void retrieve_rois_cpu(const int num_rois, const int item_index, const int num_proposals, const float* proposals, const int roi_indices[], - float* rois, int post_nms_topn_) { + float* rois, int post_nms_topn_, + bool normalize, float img_h, float img_w, bool clip_after_nms) { const float *src_x0 = proposals + 0 * num_proposals; const float *src_y0 = proposals + 1 * num_proposals; const float *src_x1 = proposals + 2 * num_proposals; @@ -299,12 +302,26 @@ void retrieve_rois_cpu(const int num_rois, const int item_index, parallel_for(num_rois, [&](size_t roi) { int index = roi_indices[roi]; - const float x0 = src_x0[index]; - const float y0 = src_y0[index]; - const float x1 = src_x1[index]; - const float y1 = src_y1[index]; + float x0 = src_x0[index]; + float y0 = src_y0[index]; + float x1 = src_x1[index]; + float y1 = src_y1[index]; + + if (clip_after_nms) { + x0 = std::max(0.0f, std::min(x0, img_w)); + y0 = std::max(0.0f, std::min(y0, img_h)); + x1 = std::max(0.0f, std::min(x1, img_w)); + y1 = std::max(0.0f, std::min(y1, img_h)); + } + + if (normalize) { + x0 /= img_w; + y0 /= img_h; + x1 /= img_w; + y1 /= img_h; + } - rois[roi * 5 + 0] = item_index; + rois[roi * 5 + 0] = static_cast(item_index); rois[roi * 5 + 1] = x0; rois[roi * 5 + 2] = y0; rois[roi * 5 + 3] = x1; @@ -341,6 +358,9 @@ public: box_size_scale_ = layer->GetParamAsFloat("box_size_scale", 1.0); scales = layer->GetParamAsFloats("scale", {}); ratios = layer->GetParamAsFloats("ratio", {}); + normalize_ = layer->GetParamsAsBool("normalize", false); + clip_before_nms = layer->GetParamsAsBool("clip_before_nms", true); + clip_after_nms = layer->GetParamsAsBool("clip_after_nms", false); anchors_shape_0 = ratios.size() * scales.size(); anchors_.resize(anchors_shape_0 * 4); @@ -386,10 +406,7 @@ public: const float* p_img_info_cpu = inputs[2]->buffer(); float* p_roi_item = outputs[0]->buffer(); - size_t img_info_size = 1; - for (size_t i = 0; i < inputs[2]->getTensorDesc().getDims().size(); i++) { - img_info_size *= inputs[2]->getTensorDesc().getDims()[i]; - } + size_t img_info_size = inputs[2]->getTensorDesc().getDims()[1]; // No second output so ignoring this // Dtype* p_score_item = (top.size() > 1) ? 
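// Compact sketch of the new per-ROI post-processing in retrieve_rois_cpu
// above: optional clipping to the image after NMS, then optional
// normalization of the coordinates to [0, 1]:
#include <algorithm>

void finish_roi(float& x0, float& y0, float& x1, float& y1,
                float img_w, float img_h, bool clip_after_nms, bool normalize) {
    if (clip_after_nms) {
        x0 = std::max(0.0f, std::min(x0, img_w));
        y0 = std::max(0.0f, std::min(y0, img_h));
        x1 = std::max(0.0f, std::min(x1, img_w));
        y1 = std::max(0.0f, std::min(y1, img_h));
    }
    if (normalize) {
        x0 /= img_w; y0 /= img_h;
        x1 /= img_w; y1 /= img_h;
    }
}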
top[1]->mutable_cpu_data() : NULL; @@ -437,12 +454,12 @@ public: // Execute int nn = inputs[0]->getTensorDesc().getDims()[0]; for (int n = 0; n < nn; ++n) { - enumerate_proposals_cpu(p_bottom_item + num_proposals, p_d_anchor_item, + enumerate_proposals_cpu(p_bottom_item + num_proposals + n*num_proposals*2, p_d_anchor_item + n*num_proposals*4, &anchors_[0], reinterpret_cast(&proposals_[0]), anchors_shape_0, bottom_H, bottom_W, img_H, img_W, min_box_H, min_box_W, feat_stride_, box_coordinate_scale_, box_size_scale_, - coordinates_offset, initial_clip, swap_xy); + coordinates_offset, initial_clip, swap_xy, clip_before_nms); std::partial_sort(proposals_.begin(), proposals_.begin() + pre_nms_topn, proposals_.end(), [](const ProposalBox& struct1, const ProposalBox& struct2) { return (struct1.score > struct2.score); @@ -450,7 +467,8 @@ public: unpack_boxes(reinterpret_cast(&proposals_[0]), &unpacked_boxes[0], pre_nms_topn); nms_cpu(pre_nms_topn, &is_dead[0], &unpacked_boxes[0], &roi_indices_[0], &num_rois, 0, nms_thresh_, post_nms_topn_, coordinates_offset); - retrieve_rois_cpu(num_rois, n, pre_nms_topn, &unpacked_boxes[0], &roi_indices_[0], p_roi_item, post_nms_topn_); + retrieve_rois_cpu(num_rois, n, pre_nms_topn, &unpacked_boxes[0], &roi_indices_[0], p_roi_item + n*post_nms_topn_*5, + post_nms_topn_, normalize_, img_H, img_W, clip_after_nms); } return OK; @@ -467,6 +485,7 @@ private: float box_size_scale_; std::vector scales; std::vector ratios; + bool normalize_; size_t anchors_shape_0; std::vector anchors_; @@ -475,9 +494,11 @@ private: // Framework specific parameters float coordinates_offset; bool swap_xy; - bool initial_clip; // clip initial bounding boxes - bool round_ratios; // round ratios during anchors generation stage - bool shift_anchors; // shift anchors by half size of the box + bool initial_clip; // clip initial bounding boxes + bool clip_before_nms; // clip bounding boxes before nms step + bool clip_after_nms; // clip bounding boxes after nms step + bool round_ratios; // round ratios during anchors generation stage + bool shift_anchors; // shift anchors by half size of the box }; class ProposalFactory : public ImplFactory { diff --git a/inference-engine/src/extension/ext_proposal_onnx.cpp b/inference-engine/src/extension/ext_proposal_onnx.cpp new file mode 100644 index 0000000..43ce9a0 --- /dev/null +++ b/inference-engine/src/extension/ext_proposal_onnx.cpp @@ -0,0 +1,442 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ext_list.hpp" +#include "ext_base.hpp" + +#include +#include +#include +#include +#include +#include +#if defined(HAVE_AVX2) +#include +#endif +#include "ie_parallel.hpp" + + +namespace { +struct Indexer { + const std::vector dims_; + int total_{1}; + + explicit Indexer(const std::vector& dims) : dims_(dims) { + total_ = 1; + for (size_t i = 0; i < dims_.size(); ++i) { + total_ *= dims_[i]; + } + } + + const int operator()(const std::vector& idx) const { + int flat_idx = 0; + assert(idx.size() == dims_.size()); + for (size_t i = 0; i < dims_.size(); ++i) { + assert(0 <= idx[i] && idx[i] < dims_[i]); + flat_idx = flat_idx * dims_[i] + idx[i]; + } + assert(flat_idx < total_); + return flat_idx; + } +}; +} // namespace + + +namespace InferenceEngine { +namespace Extensions { +namespace Cpu { + +static +void refine_anchors(const float* deltas, const float* scores, const float* anchors, + float* proposals, const int anchors_num, const int bottom_H, + const int bottom_W, const float img_H, const float img_W, + const 
float min_box_H, const float min_box_W, + const float max_delta_log_wh, + float coordinates_offset) { + Indexer delta_idx({anchors_num, 4, bottom_H, bottom_W}); + Indexer score_idx({anchors_num, 1, bottom_H, bottom_W}); + Indexer proposal_idx({bottom_H, bottom_W, anchors_num, 5}); + Indexer anchor_idx({bottom_H, bottom_W, anchors_num, 4}); + + parallel_for2d(bottom_H, bottom_W, [&](int h, int w) { + for (int anchor = 0; anchor < anchors_num; ++anchor) { + float x0 = anchors[anchor_idx({h, w, anchor, 0})]; + float y0 = anchors[anchor_idx({h, w, anchor, 1})]; + float x1 = anchors[anchor_idx({h, w, anchor, 2})]; + float y1 = anchors[anchor_idx({h, w, anchor, 3})]; + + const float dx = deltas[delta_idx({anchor, 0, h, w})]; + const float dy = deltas[delta_idx({anchor, 1, h, w})]; + const float d_log_w = deltas[delta_idx({anchor, 2, h, w})]; + const float d_log_h = deltas[delta_idx({anchor, 3, h, w})]; + + const float score = scores[score_idx({anchor, 0, h, w})]; + + // width & height of box + const float ww = x1 - x0 + coordinates_offset; + const float hh = y1 - y0 + coordinates_offset; + // center location of box + const float ctr_x = x0 + 0.5f * ww; + const float ctr_y = y0 + 0.5f * hh; + + // new center location according to deltas (dx, dy) + const float pred_ctr_x = dx * ww + ctr_x; + const float pred_ctr_y = dy * hh + ctr_y; + // new width & height according to deltas d(log w), d(log h) + const float pred_w = std::exp(std::min(d_log_w, max_delta_log_wh)) * ww; + const float pred_h = std::exp(std::min(d_log_h, max_delta_log_wh)) * hh; + + // update upper-left corner location + x0 = pred_ctr_x - 0.5f * pred_w; + y0 = pred_ctr_y - 0.5f * pred_h; + // update lower-right corner location + x1 = pred_ctr_x + 0.5f * pred_w - coordinates_offset; + y1 = pred_ctr_y + 0.5f * pred_h - coordinates_offset; + + // adjust new corner locations to be within the image region, + x0 = std::max(0.0f, std::min(x0, img_W - coordinates_offset)); + y0 = std::max(0.0f, std::min(y0, img_H - coordinates_offset)); + x1 = std::max(0.0f, std::min(x1, img_W - coordinates_offset)); + y1 = std::max(0.0f, std::min(y1, img_H - coordinates_offset)); + + // recompute new width & height + const float box_w = x1 - x0 + coordinates_offset; + const float box_h = y1 - y0 + coordinates_offset; + + proposals[proposal_idx({h, w, anchor, 0})] = x0; + proposals[proposal_idx({h, w, anchor, 1})] = y0; + proposals[proposal_idx({h, w, anchor, 2})] = x1; + proposals[proposal_idx({h, w, anchor, 3})] = y1; + proposals[proposal_idx({h, w, anchor, 4})] = (min_box_W <= box_w) * (min_box_H <= box_h) * score; + } + }); +} + +static void unpack_boxes(const float* p_proposals, float* unpacked_boxes, int pre_nms_topn) { + parallel_for(pre_nms_topn, [&](size_t i) { + unpacked_boxes[0*pre_nms_topn + i] = p_proposals[5*i + 0]; + unpacked_boxes[1*pre_nms_topn + i] = p_proposals[5*i + 1]; + unpacked_boxes[2*pre_nms_topn + i] = p_proposals[5*i + 2]; + unpacked_boxes[3*pre_nms_topn + i] = p_proposals[5*i + 3]; + unpacked_boxes[4*pre_nms_topn + i] = p_proposals[5*i + 4]; + }); +} + +static +void nms_cpu(const int num_boxes, int is_dead[], + const float* boxes, int index_out[], int* const num_out, + const int base_index, const float nms_thresh, const int max_num_out, + float coordinates_offset) { + const int num_proposals = num_boxes; + int count = 0; + + const float* x0 = boxes + 0 * num_proposals; + const float* y0 = boxes + 1 * num_proposals; + const float* x1 = boxes + 2 * num_proposals; + const float* y1 = boxes + 3 * num_proposals; + + memset(is_dead, 0, 
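// refine_anchors above applies the usual box-delta decoding; a scalar sketch
// for one box, assuming coordinates_offset == 0 (which this ONNX layer
// sets): dx/dy shift the center, d_log_w/d_log_h scale the size, capped by
// max_delta_log_wh.
#include <algorithm>
#include <cmath>

void decode_box(float& x0, float& y0, float& x1, float& y1,
                float dx, float dy, float d_log_w, float d_log_h,
                float max_delta_log_wh) {
    const float ww = x1 - x0, hh = y1 - y0;
    const float ctr_x = x0 + 0.5f * ww, ctr_y = y0 + 0.5f * hh;
    const float pred_w = std::exp(std::min(d_log_w, max_delta_log_wh)) * ww;
    const float pred_h = std::exp(std::min(d_log_h, max_delta_log_wh)) * hh;
    x0 = dx * ww + ctr_x - 0.5f * pred_w;
    y0 = dy * hh + ctr_y - 0.5f * pred_h;
    x1 = dx * ww + ctr_x + 0.5f * pred_w;
    y1 = dy * hh + ctr_y + 0.5f * pred_h;
}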
num_boxes * sizeof(int)); + +#if defined(HAVE_AVX2) + __m256 vc_fone = _mm256_set1_ps(coordinates_offset); + __m256i vc_ione = _mm256_set1_epi32(1); + __m256 vc_zero = _mm256_set1_ps(0.0f); + + __m256 vc_nms_thresh = _mm256_set1_ps(nms_thresh); +#endif + + for (int box = 0; box < num_boxes; ++box) { + if (is_dead[box]) + continue; + + index_out[count++] = base_index + box; + if (count == max_num_out) + break; + + int tail = box + 1; + +#if defined(HAVE_AVX2) + __m256 vx0i = _mm256_set1_ps(x0[box]); + __m256 vy0i = _mm256_set1_ps(y0[box]); + __m256 vx1i = _mm256_set1_ps(x1[box]); + __m256 vy1i = _mm256_set1_ps(y1[box]); + + __m256 vA_width = _mm256_sub_ps(vx1i, vx0i); + __m256 vA_height = _mm256_sub_ps(vy1i, vy0i); + __m256 vA_area = _mm256_mul_ps(_mm256_add_ps(vA_width, vc_fone), _mm256_add_ps(vA_height, vc_fone)); + + for (; tail <= num_boxes - 8; tail += 8) { + __m256i *pdst = reinterpret_cast<__m256i*>(is_dead + tail); + __m256i vdst = _mm256_loadu_si256(pdst); + + __m256 vx0j = _mm256_loadu_ps(x0 + tail); + __m256 vy0j = _mm256_loadu_ps(y0 + tail); + __m256 vx1j = _mm256_loadu_ps(x1 + tail); + __m256 vy1j = _mm256_loadu_ps(y1 + tail); + + __m256 vx0 = _mm256_max_ps(vx0i, vx0j); + __m256 vy0 = _mm256_max_ps(vy0i, vy0j); + __m256 vx1 = _mm256_min_ps(vx1i, vx1j); + __m256 vy1 = _mm256_min_ps(vy1i, vy1j); + + __m256 vwidth = _mm256_add_ps(_mm256_sub_ps(vx1, vx0), vc_fone); + __m256 vheight = _mm256_add_ps(_mm256_sub_ps(vy1, vy0), vc_fone); + __m256 varea = _mm256_mul_ps(_mm256_max_ps(vc_zero, vwidth), _mm256_max_ps(vc_zero, vheight)); + + __m256 vB_width = _mm256_sub_ps(vx1j, vx0j); + __m256 vB_height = _mm256_sub_ps(vy1j, vy0j); + __m256 vB_area = _mm256_mul_ps(_mm256_add_ps(vB_width, vc_fone), _mm256_add_ps(vB_height, vc_fone)); + + __m256 vdivisor = _mm256_sub_ps(_mm256_add_ps(vA_area, vB_area), varea); + __m256 vintersection_area = _mm256_div_ps(varea, vdivisor); + + __m256 vcmp_0 = _mm256_cmp_ps(vx0i, vx1j, _CMP_LE_OS); + __m256 vcmp_1 = _mm256_cmp_ps(vy0i, vy1j, _CMP_LE_OS); + __m256 vcmp_2 = _mm256_cmp_ps(vx0j, vx1i, _CMP_LE_OS); + __m256 vcmp_3 = _mm256_cmp_ps(vy0j, vy1i, _CMP_LE_OS); + __m256 vcmp_4 = _mm256_cmp_ps(vc_nms_thresh, vintersection_area, _CMP_LT_OS); + + vcmp_0 = _mm256_and_ps(vcmp_0, vcmp_1); + vcmp_2 = _mm256_and_ps(vcmp_2, vcmp_3); + vcmp_4 = _mm256_and_ps(vcmp_4, vcmp_0); + vcmp_4 = _mm256_and_ps(vcmp_4, vcmp_2); + + _mm256_storeu_si256(pdst, _mm256_blendv_epi8(vdst, vc_ione, _mm256_castps_si256(vcmp_4))); + } +#endif + + for (; tail < num_boxes; ++tail) { + float res = 0.0f; + + const float x0i = x0[box]; + const float y0i = y0[box]; + const float x1i = x1[box]; + const float y1i = y1[box]; + + const float x0j = x0[tail]; + const float y0j = y0[tail]; + const float x1j = x1[tail]; + const float y1j = y1[tail]; + + if (x0i <= x1j && y0i <= y1j && x0j <= x1i && y0j <= y1i) { + // overlapped region (= box) + const float x0 = std::max(x0i, x0j); + const float y0 = std::max(y0i, y0j); + const float x1 = std::min(x1i, x1j); + const float y1 = std::min(y1i, y1j); + + // intersection area + const float width = std::max(0.0f, x1 - x0 + coordinates_offset); + const float height = std::max(0.0f, y1 - y0 + coordinates_offset); + const float area = width * height; + + // area of A, B + const float A_area = (x1i - x0i + coordinates_offset) * (y1i - y0i + coordinates_offset); + const float B_area = (x1j - x0j + coordinates_offset) * (y1j - y0j + coordinates_offset); + + // IoU + res = area / (A_area + B_area - area); + } + + if (nms_thresh < res) + is_dead[tail] = 1; + } + } + + 
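// Scalar IoU as computed in the non-AVX2 tail loop above; the AVX2 branch
// evaluates the same expression for eight candidate boxes at a time.
// coordinates_offset = 1.0f reproduces the legacy "+1" box convention,
// 0.0f the ONNX one used by this layer.
#include <algorithm>

float iou_sketch(float x0i, float y0i, float x1i, float y1i,
                 float x0j, float y0j, float x1j, float y1j, float off) {
    if (x0i > x1j || y0i > y1j || x0j > x1i || y0j > y1i) return 0.0f;
    const float w = std::max(0.0f, std::min(x1i, x1j) - std::max(x0i, x0j) + off);
    const float h = std::max(0.0f, std::min(y1i, y1j) - std::max(y0i, y0j) + off);
    const float inter = w * h;
    const float areaA = (x1i - x0i + off) * (y1i - y0i + off);
    const float areaB = (x1j - x0j + off) * (y1j - y0j + off);
    return inter / (areaA + areaB - inter);
}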
*num_out = count; +} + + +static +void fill_output_blobs(const float* proposals, const int* roi_indices, + float* rois, float* scores, + const int num_proposals, const int num_rois, const int post_nms_topn) { + const float *src_x0 = proposals + 0 * num_proposals; + const float *src_y0 = proposals + 1 * num_proposals; + const float *src_x1 = proposals + 2 * num_proposals; + const float *src_y1 = proposals + 3 * num_proposals; + const float *src_score = proposals + 4 * num_proposals; + + parallel_for(num_rois, [&](size_t i) { + int index = roi_indices[i]; + rois[i * 4 + 0] = src_x0[index]; + rois[i * 4 + 1] = src_y0[index]; + rois[i * 4 + 2] = src_x1[index]; + rois[i * 4 + 3] = src_y1[index]; + scores[i] = src_score[index]; + }); + + if (num_rois < post_nms_topn) { + for (int i = 4 * num_rois; i < 4 * post_nms_topn; i++) { + rois[i] = 0.f; + } + for (int i = num_rois; i < post_nms_topn; i++) { + scores[i] = 0.f; + } + } +} + + +class ONNXCustomProposalImpl : public ExtLayerBase { +private: + const int INPUT_IM_INFO {0}; + const int INPUT_ANCHORS {1}; + const int INPUT_DELTAS {2}; + const int INPUT_SCORES {3}; + const int OUTPUT_ROIS {0}; + const int OUTPUT_SCORES {1}; + +public: + explicit ONNXCustomProposalImpl(const CNNLayer *layer) { + try { + if (layer->insData.size() != 4 || layer->outData.size() != 2) + THROW_IE_EXCEPTION << "Incorrect number of input/output edges!"; + + min_size_ = layer->GetParamAsFloat("min_size"); + nms_thresh_ = layer->GetParamAsFloat("nms_threshold"); + pre_nms_topn_ = layer->GetParamAsInt("pre_nms_count"); + post_nms_topn_ = layer->GetParamAsInt("post_nms_count"); + + coordinates_offset = 0.0f; + + roi_indices_.resize(post_nms_topn_); + addConfig(layer, + {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), + DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)}, + {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)}); + } catch (InferenceEngine::details::InferenceEngineException &ex) { + errorMsg = ex.what(); + } + } + + void print_shape(const Blob::Ptr& b) { + for (size_t i = 0; i < b->getTensorDesc().getDims().size(); ++i) { + std::cout << b->getTensorDesc().getDims()[i] << ", "; + } + std::cout << std::endl; + } + + StatusCode execute(std::vector &inputs, std::vector &outputs, + ResponseDesc *resp) noexcept override { + if (inputs.size() != 4 || outputs.size() != 2) { + if (resp) { + std::string errorMsg = "Incorrect number of input or output edges!"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return GENERAL_ERROR; + } + + // Prepare memory + const float* p_deltas_item = inputs[INPUT_DELTAS]->buffer(); + const float* p_scores_item = inputs[INPUT_SCORES]->buffer(); + const float* p_anchors_item = inputs[INPUT_ANCHORS]->buffer(); + const float* p_img_info_cpu = inputs[INPUT_IM_INFO]->buffer(); + + float* p_roi_item = outputs[OUTPUT_ROIS]->buffer(); + float* p_roi_score_item = outputs[OUTPUT_SCORES]->buffer(); + + + size_t img_info_size = 1; + for (size_t i = 0; i < inputs[INPUT_IM_INFO]->getTensorDesc().getDims().size(); i++) { + img_info_size *= inputs[INPUT_IM_INFO]->getTensorDesc().getDims()[i]; + } + + const int anchors_num = inputs[INPUT_SCORES]->getTensorDesc().getDims()[0]; + + // bottom shape: (num_anchors) x H x W + const int bottom_H = inputs[INPUT_DELTAS]->getTensorDesc().getDims()[1]; + const int bottom_W = inputs[INPUT_DELTAS]->getTensorDesc().getDims()[2]; + + // input image height & width + const float img_H = p_img_info_cpu[0]; + const float img_W = p_img_info_cpu[1]; + + // 
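// fill_output_blobs above zero-pads both outputs when NMS keeps fewer than
// post_nms_topn boxes, so downstream consumers always see fixed-size blobs;
// the padding step in isolation:
#include <algorithm>

void pad_outputs(float* rois, float* scores, int num_rois, int post_nms_topn) {
    if (num_rois < post_nms_topn) {
        std::fill(rois + 4 * num_rois, rois + 4 * post_nms_topn, 0.f);
        std::fill(scores + num_rois, scores + post_nms_topn, 0.f);
    }
}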
scale factor for height & width + + // minimum box width & height + const float min_box_H = min_size_; + const float min_box_W = min_size_; + + // number of all proposals = num_anchors * H * W + const int num_proposals = anchors_num * bottom_H * bottom_W; + + // number of top-n proposals before NMS + const int pre_nms_topn = std::min(num_proposals, pre_nms_topn_); + + // number of final RoIs + int num_rois = 0; + + // enumerate all proposals + // num_proposals = num_anchors * H * W + // (x1, y1, x2, y2, score) for each proposal + // NOTE: for bottom, only foreground scores are passed + struct ProposalBox { + float x0; + float y0; + float x1; + float y1; + float score; + }; + std::vector proposals_(num_proposals); + std::vector unpacked_boxes(5 * pre_nms_topn); + std::vector is_dead(pre_nms_topn); + + // Execute + int batch_size = 1; // inputs[INPUT_DELTAS]->getTensorDesc().getDims()[0]; + for (int n = 0; n < batch_size; ++n) { + refine_anchors(p_deltas_item, p_scores_item, p_anchors_item, + reinterpret_cast(&proposals_[0]), anchors_num, bottom_H, + bottom_W, img_H, img_W, + min_box_H, min_box_W, + static_cast(log(1000. / 16.)), + 1.0f); + std::partial_sort(proposals_.begin(), proposals_.begin() + pre_nms_topn, proposals_.end(), + [](const ProposalBox& struct1, const ProposalBox& struct2) { + return (struct1.score > struct2.score); + }); + + unpack_boxes(reinterpret_cast(&proposals_[0]), &unpacked_boxes[0], pre_nms_topn); + nms_cpu(pre_nms_topn, &is_dead[0], &unpacked_boxes[0], &roi_indices_[0], &num_rois, 0, + nms_thresh_, post_nms_topn_, coordinates_offset); + fill_output_blobs(&unpacked_boxes[0], &roi_indices_[0], p_roi_item, p_roi_score_item, + pre_nms_topn, num_rois, post_nms_topn_); + } + + return OK; + } + +private: + float min_size_; + int pre_nms_topn_; + int post_nms_topn_; + float nms_thresh_; + float coordinates_offset; + + std::vector roi_indices_; +}; + +class ONNXCustomProposalFactory : public ImplFactory { +public: + explicit ONNXCustomProposalFactory(const CNNLayer *layer): ImplFactory(layer) {} + // set output shapes by input shapes. 
+ StatusCode getShapes(const std::vector& inShapes, std::vector& outShapes, + ResponseDesc *resp) noexcept override { + if (inShapes.size() != 1) { + if (resp) { + std::string errorMsg = "Incorrect input shapes!"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return GENERAL_ERROR; + } + outShapes.clear(); + outShapes.emplace_back(cnnLayer.precision, inShapes[0].getDims(), inShapes[0].getLayout()); + return OK; + } +}; + +REG_FACTORY_FOR(ONNXCustomProposalFactory, ExperimentalDetectronGenerateProposalsSingleImage); + +} // namespace Cpu +} // namespace Extensions +} // namespace InferenceEngine diff --git a/inference-engine/src/extension/ext_psroi.cpp b/inference-engine/src/extension/ext_psroi.cpp index 355a3e6..71bd3f6 100644 --- a/inference-engine/src/extension/ext_psroi.cpp +++ b/inference-engine/src/extension/ext_psroi.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -26,6 +26,9 @@ public: spatial_scale_ = layer->GetParamAsFloat("spatial_scale"); pooled_height_ = group_size_; pooled_width_ = group_size_; + spatial_bins_x_ = static_cast(layer->GetParamAsInt("spatial_bins_x", 1)); + spatial_bins_y_ = static_cast(layer->GetParamAsInt("spatial_bins_y", 1)); + mode_ = layer->GetParamAsString("mode", "average"); SizeVector inDims = layer->insData[0].lock()->getTensorDesc().getDims(); channels = static_cast(inDims[1]); @@ -59,51 +62,116 @@ public: } } + size_t num_bins = spatial_bins_x_*spatial_bins_y_; + parallel_for(real_rois, [&](int n) { const float* bottom_rois = bottom_rois_beginning + n * 5; int roi_batch_ind = static_cast(bottom_rois[0]); - float roi_start_w = static_cast(round(bottom_rois[1])) * spatial_scale_; - float roi_start_h = static_cast(round(bottom_rois[2])) * spatial_scale_; - float roi_end_w = static_cast(round(bottom_rois[3]) + 1.0f) * spatial_scale_; - float roi_end_h = static_cast(round(bottom_rois[4]) + 1.0f) * spatial_scale_; - - // Force too small ROIs to be 1x1 - float roi_width = std::max(roi_end_w - roi_start_w, 0.1f); // avoid 0 - float roi_height = std::max(roi_end_h - roi_start_h, 0.1f); - - float bin_size_h = roi_height / static_cast(pooled_height_); - float bin_size_w = roi_width / static_cast(pooled_width_); + float roi_start_w = 0.0f; + float roi_start_h = 0.0f; + float roi_end_w = 0.0f; + float roi_end_h = 0.0f; + float roi_width = 0.0f; + float roi_height = 0.0f; + + if (mode_ == "bilinear") { + roi_start_w = bottom_rois[1] * spatial_scale_; + roi_start_h = bottom_rois[2] * spatial_scale_; + roi_end_w = bottom_rois[3] * spatial_scale_; + roi_end_h = bottom_rois[4] * spatial_scale_; + roi_width = roi_end_w - roi_start_w; + roi_height = roi_end_h - roi_start_h; + } else if (mode_ == "average") { + roi_start_w = static_cast(round(bottom_rois[1])) * spatial_scale_; + roi_start_h = static_cast(round(bottom_rois[2])) * spatial_scale_; + roi_end_w = static_cast(round(bottom_rois[3]) + 1.0f) * spatial_scale_; + roi_end_h = static_cast(round(bottom_rois[4]) + 1.0f) * spatial_scale_; + // Force too small ROIs to be 1x1 + roi_width = std::max(roi_end_w - roi_start_w, 0.1f); // avoid 0 + roi_height = std::max(roi_end_h - roi_start_h, 0.1f); + } for (int c = 0; c < nc; c++) { for (int h = 0; h < nh; h++) { - int hstart = floor(static_cast(h + 0) * bin_size_h + roi_start_h); - int hend = ceil(static_cast(h + 1) * bin_size_h + roi_start_h); - - hstart = std::min(std::max(hstart, 0), height); - hend = std::min(std::max(hend, 0), height); - for (int w = 0; w < 
nw; w++) { - int index = n * nc * nh * nw + c * nh * nw + h * nw + w; + size_t index = n*nc*nh*nw + c*nh*nw + h*nw + w; dst_data[index] = 0.0f; - int wstart = floor(static_cast(w + 0) * bin_size_w + roi_start_w); - int wend = ceil(static_cast(w + 1) * bin_size_w + roi_start_w); - - wstart = std::min(std::max(wstart, 0), width); - wend = std::min(std::max(wend, 0), width); - - float bin_area = (hend - hstart) * (wend - wstart); - if (bin_area) { - int gc = (c * group_size_ + h) * group_size_ + w; - const float *bottom_data = - bottom_data_beginning + ((roi_batch_ind * channels + gc) * height * width); - - float out_sum = 0.0f; - for (int hh = hstart; hh < hend; ++hh) - for (int ww = wstart; ww < wend; ++ww) - out_sum += bottom_data[hh * width + ww]; - - dst_data[index] = out_sum / bin_area; + if (mode_ == "average") { + float bin_size_h = roi_height / static_cast(pooled_height_); + float bin_size_w = roi_width / static_cast(pooled_width_); + + int hstart = static_cast(floor(static_cast(h + 0) * bin_size_h + roi_start_h)); + int hend = static_cast(ceil(static_cast(h + 1) * bin_size_h + roi_start_h)); + + hstart = std::min(std::max(hstart, 0), height); + hend = std::min(std::max(hend, 0), height); + int wstart = static_cast(floor(static_cast(w + 0) * bin_size_w + roi_start_w)); + int wend = static_cast(ceil(static_cast(w + 1) * bin_size_w + roi_start_w)); + + wstart = std::min(std::max(wstart, 0), width); + wend = std::min(std::max(wend, 0), width); + + float bin_area = static_cast((hend - hstart) * (wend - wstart)); + if (bin_area) { + int gc = (c * group_size_ + h) * group_size_ + w; + const float *bottom_data = + bottom_data_beginning + ((roi_batch_ind * channels + gc) * height * width); + + float out_sum = 0.0f; + for (int hh = hstart; hh < hend; ++hh) + for (int ww = wstart; ww < wend; ++ww) + out_sum += bottom_data[hh * width + ww]; + + dst_data[index] = out_sum / bin_area; + } + } else if (mode_ == "bilinear") { + for (size_t bin_y = 0; bin_y < spatial_bins_y_; bin_y++) { + for (size_t bin_x = 0; bin_x < spatial_bins_x_; bin_x++) { + float box_xmin = roi_start_w + (bin_x + 0) * (roi_width / spatial_bins_x_); + float box_xmax = roi_start_w + (bin_x + 1) * (roi_width / spatial_bins_x_); + float box_ymin = roi_start_h + (bin_y + 0) * (roi_height / spatial_bins_y_); + float box_ymax = roi_start_h + (bin_y + 1) * (roi_height / spatial_bins_y_); + + size_t gc = c + (bin_y*spatial_bins_x_ + bin_x)*nc; + size_t src_idx = (roi_batch_ind * channels + gc) * height * width; + const float *bottom_data = bottom_data_beginning + src_idx; + + float height_scale = nh > 1 ? (box_ymax - box_ymin) * (height - 1) / (pooled_height_ - 1) + : 0.0f; + float width_scale = nw > 1 ? (box_xmax - box_xmin) * (width - 1) / (pooled_width_ - 1) + : 0.0f; + + float in_y = nh > 1 ? (h * height_scale + box_ymin * (height - 1)) + : 0.5f * (box_ymin + box_ymax) * (height - 1); + float in_x = nw > 1 ? 
(w * width_scale + box_xmin * (width - 1)) + : 0.5f * (box_xmin + box_xmax) * (width - 1); + + if (!(in_y < 0 || in_y > height - 1 || in_x < 0 || in_x > width - 1)) { + int top_y_index = static_cast(floorf(in_y)); + int bottom_y_index = static_cast(ceilf(in_y)); + int left_x_index = static_cast(floorf(in_x)); + int right_x_index = static_cast(ceilf(in_x)); + + if (right_x_index > width - 1) + right_x_index = width - 1; + + if (bottom_y_index > height - 1) + bottom_y_index = height - 1; + + const float top_left = bottom_data[top_y_index * width + left_x_index]; + const float top_right = bottom_data[top_y_index * width + right_x_index]; + const float bottom_left = bottom_data[bottom_y_index * width + left_x_index]; + const float bottom_right = bottom_data[bottom_y_index * width + right_x_index]; + + const float top = top_left + (top_right - top_left) * (in_x - left_x_index); + const float bottom = bottom_left + (bottom_right - bottom_left) * (in_x - left_x_index); + + dst_data[index] += top + (bottom - top) * (in_y - top_y_index); + } + } + } + dst_data[index] /= num_bins; } } } @@ -126,6 +194,9 @@ private: float spatial_scale_ = 0; size_t pooled_height_ = 0; size_t pooled_width_ = 0; + size_t spatial_bins_x_ = 0; + size_t spatial_bins_y_ = 0; + std::string mode_ = ""; int channels = 0; int height = 0; diff --git a/inference-engine/src/extension/ext_range.cpp b/inference-engine/src/extension/ext_range.cpp new file mode 100644 index 0000000..d438df8 --- /dev/null +++ b/inference-engine/src/extension/ext_range.cpp @@ -0,0 +1,132 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ext_list.hpp" +#include "ext_base.hpp" + +#include +#include +#include +#include +#include "ie_parallel.hpp" + +namespace InferenceEngine { +namespace Extensions { +namespace Cpu { + +class RangeImpl: public ExtLayerBase { +public: + explicit RangeImpl(const CNNLayer* layer) { + try { + if (layer->insData.empty() || layer->outData.empty()) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!"; + + if (layer->insData.size() != 3) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input edges!"; + + SizeVector start_dims = layer->insData[RANGE_START].lock()->getTensorDesc().getDims(); + if (start_dims.size() > 1) + THROW_IE_EXCEPTION << layer->name << " Start scalar should have 1 dimension"; + + SizeVector limit_dims = layer->insData[RANGE_LIMIT].lock()->getTensorDesc().getDims(); + if (limit_dims.size() > 1) + THROW_IE_EXCEPTION << layer->name << " Limit scalar should have 1 dimension"; + + SizeVector delta_dims = layer->insData[RANGE_DELTA].lock()->getTensorDesc().getDims(); + if (delta_dims.size() > 1) + THROW_IE_EXCEPTION << layer->name << " Delta scalar should have 1 dimension"; + + SizeVector dst_dims = layer->outData[0]->getTensorDesc().getDims(); + if (dst_dims.size() > 1) + THROW_IE_EXCEPTION << layer->name << " Output vector should have 1 dimension"; + + if (!(layer->insData[RANGE_START].lock()->getTensorDesc().getPrecision() == Precision::I32 && + layer->insData[RANGE_LIMIT].lock()->getTensorDesc().getPrecision() == Precision::I32 && + layer->insData[RANGE_DELTA].lock()->getTensorDesc().getPrecision() == Precision::I32 && + layer->outData[0]->getTensorDesc().getPrecision() == Precision::I32) && + !(layer->insData[RANGE_START].lock()->getTensorDesc().getPrecision() == Precision::FP32 && + layer->insData[RANGE_LIMIT].lock()->getTensorDesc().getPrecision() == Precision::FP32 && + 
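// The "bilinear" PSROIPooling mode above samples each bin with standard 2-D
// interpolation over the four neighbouring cells; a standalone sketch for
// one (in_y, in_x) sample (samples outside the map contribute nothing):
#include <algorithm>
#include <cmath>

float bilinear_sample(const float* data, int height, int width,
                      float in_y, float in_x) {
    if (in_y < 0 || in_y > height - 1 || in_x < 0 || in_x > width - 1)
        return 0.0f;
    const int top    = static_cast<int>(std::floor(in_y));
    const int left   = static_cast<int>(std::floor(in_x));
    const int bottom = std::min(static_cast<int>(std::ceil(in_y)), height - 1);
    const int right  = std::min(static_cast<int>(std::ceil(in_x)), width - 1);
    const float tl = data[top * width + left],    tr = data[top * width + right];
    const float bl = data[bottom * width + left], br = data[bottom * width + right];
    const float t = tl + (tr - tl) * (in_x - left);
    const float b = bl + (br - bl) * (in_x - left);
    return t + (b - t) * (in_y - top);
}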
layer->insData[RANGE_DELTA].lock()->getTensorDesc().getPrecision() == Precision::FP32 && + layer->outData[0]->getTensorDesc().getPrecision() == Precision::FP32)) { + THROW_IE_EXCEPTION << layer->name << + " 'Start', 'Limit', 'Delta' input scalars and output tensor should have same precision" << + "and only FP32 and I32 are supported!"; + } + + addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, + { DataConfigurator(ConfLayout::PLN) }); + } catch (InferenceEngine::details::InferenceEngineException &ex) { + errorMsg = ex.what(); + } + } + + StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { + StatusCode retcode = OK; + switch (outputs[0]->precision()) { + case Precision::FP32: { + retcode = range((inputs[RANGE_START]->cbuffer().as() + + inputs[RANGE_START]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0], + (inputs[RANGE_LIMIT]->cbuffer().as() + + inputs[RANGE_LIMIT]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0], + (inputs[RANGE_DELTA]->cbuffer().as() + + inputs[RANGE_DELTA]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0], outputs[0]); + } + break; + case Precision::I32: { + retcode = range((inputs[RANGE_START]->cbuffer().as() + + inputs[RANGE_START]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0], + (inputs[RANGE_LIMIT]->cbuffer().as() + + inputs[RANGE_LIMIT]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0], + (inputs[RANGE_DELTA]->cbuffer().as() + + inputs[RANGE_DELTA]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0], outputs[0]); + } + break; + default: + if (resp) { + std::string errorMsg = "Incorrect output precision. Only FP32 and I32 are supported!"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + retcode = GENERAL_ERROR; + } + if (resp && retcode == PARAMETER_MISMATCH) { + std::string errorMsg = "Range indexes exceeds data tensor dimension"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return retcode; + } + +private: + const size_t RANGE_START = 0; + const size_t RANGE_LIMIT = 1; + const size_t RANGE_DELTA = 2; + + template + StatusCode range(data_t start, data_t limit, data_t delta, Blob::Ptr output); +}; + +template +StatusCode RangeImpl::range(data_t start, data_t limit, data_t delta, Blob::Ptr output) { + size_t dst_size = (output->getTensorDesc().getDims())[0]; + data_t* dst_data = output->cbuffer().as() + + output->getTensorDesc().getBlockingDesc().getOffsetPadding(); + size_t work_amount_dst = static_cast(std::floor(std::abs((limit - start) / delta))); + if (work_amount_dst != dst_size) + return PARAMETER_MISMATCH; + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t iwork = 0, end = 0; + splitter(work_amount_dst, nthr, ithr, iwork, end); + data_t dst_value = start + iwork * delta; + + for (; iwork < end; ++iwork, dst_value += delta) { + dst_data[iwork] = dst_value; + } + }); + return OK; +} +REG_FACTORY_FOR(ImplFactory, Range); + +} // namespace Cpu +} // namespace Extensions +} // namespace InferenceEngine diff --git a/inference-engine/src/extension/ext_region_yolo.cpp b/inference-engine/src/extension/ext_region_yolo.cpp index 1cda662..a53869a 100644 --- a/inference-engine/src/extension/ext_region_yolo.cpp +++ b/inference-engine/src/extension/ext_region_yolo.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -23,7 +23,7 @@ public: classes = 
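// Sequential sketch of the Range kernel above: the output length must equal
// floor(|limit - start| / delta) or the layer reports PARAMETER_MISMATCH;
// the parallel_nt/splitter version fills the same values in per-thread
// chunks, each thread starting from start + iwork * delta.
#include <cmath>
#include <cstddef>
#include <cstdlib>

template <typename T>
bool range_sketch(T start, T limit, T delta, T* dst, size_t dst_size) {
    const size_t work =
        static_cast<size_t>(std::floor(std::abs((limit - start) / delta)));
    if (work != dst_size) return false;  // PARAMETER_MISMATCH in the patch
    T value = start;
    for (size_t i = 0; i < dst_size; ++i, value += delta) dst[i] = value;
    return true;
}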
layer->GetParamAsInt("classes"); coords = layer->GetParamAsInt("coords"); num = layer->GetParamAsInt("num"); - do_softmax = static_cast(layer->GetParamAsInt("do_softmax", 1)); + do_softmax = layer->GetParamAsBool("do_softmax", true); mask = layer->GetParamAsInts("mask", {}); addConfig(layer, {DataConfigurator(ConfLayout::PLN)}, {DataConfigurator(ConfLayout::PLN)}); diff --git a/inference-engine/src/extension/ext_reorg_yolo.cpp b/inference-engine/src/extension/ext_reorg_yolo.cpp index ebeecb7..8f0e559 100644 --- a/inference-engine/src/extension/ext_reorg_yolo.cpp +++ b/inference-engine/src/extension/ext_reorg_yolo.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/extension/ext_resample.cpp b/inference-engine/src/extension/ext_resample.cpp index 531158f..5c3492c 100644 --- a/inference-engine/src/extension/ext_resample.cpp +++ b/inference-engine/src/extension/ext_resample.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -35,7 +35,7 @@ public: THROW_IE_EXCEPTION << "Resample supports only 4D blobs!"; type = layer->GetParamAsString("type"); - antialias = static_cast(layer->GetParamAsInt("antialias")); + antialias = layer->GetParamAsBool("antialias", false); #if defined(HAVE_AVX512F) auto blk_layout = ConfLayout::BLK16; @@ -58,6 +58,7 @@ public: #undef IN #endif Layout layout = inputs[0]->layout(); + Precision precision = inputs[0]->precision(); size_t IN = inputs[0]->getTensorDesc().getDims()[0]; size_t IC = inputs[0]->getTensorDesc().getDims()[1]; @@ -68,7 +69,11 @@ public: size_t OW = outputs[0]->getTensorDesc().getDims()[3]; if (IW == OW && IH == OH && type == "caffe.ResampleParameter.LINEAR") { - simple_copy(dst_data, outputs[0]->byteSize(), src_data, IN * IC * IH * IW * sizeof(float)); + size_t size = IN * IC * IH * IW; + if (inputs[0]->getTensorDesc().getPrecision() == Precision::FP32) { + size *= sizeof(float); + } + simple_copy(dst_data, outputs[0]->byteSize(), src_data, size); return OK; } @@ -79,14 +84,24 @@ public: if (type == "caffe.ResampleParameter.NEAREST") { if (!isDownsample && fx == 0.25f && fy == 0.25f) { - if (layout == NCHW) { - Upsample_Nearest_PLN<4>(src_data, dst_data, IN, IC, IH, IW); + if (layout == NCHW || layout == NHWC) { + if (precision == Precision::FP32) { + Upsample_Nearest_PLN(src_data, dst_data, IN, IC, IH, IW, layout); + } else { + Upsample_Nearest_PLN(reinterpret_cast(src_data), + reinterpret_cast(dst_data), IN, IC, IH, IW, layout); + } } else { Upsample_Nearest_BLK<4>(src_data, dst_data, IN, IC, IH, IW); } } else if (!isDownsample && fx == 0.5f && fy == 0.5f) { - if (layout == NCHW) { - Upsample_Nearest_PLN<2>(src_data, dst_data, IN, IC, IH, IW); + if (layout == NCHW || layout == NHWC) { + if (precision == Precision::FP32) { + Upsample_Nearest_PLN(src_data, dst_data, IN, IC, IH, IW, layout); + } else { + Upsample_Nearest_PLN(reinterpret_cast(src_data), + reinterpret_cast(dst_data), IN, IC, IH, IW, layout); + } } else { Upsample_Nearest_BLK<2>(src_data, dst_data, IN, IC, IH, IW); } @@ -143,8 +158,8 @@ private: float ax = 1.0f / (antialias ? fx : 1.0f); float ay = 1.0f / (antialias ? fy : 1.0f); - int rx = (fx < 1.0f) ? 2 : ceil(static_cast(kernel_width) / ax); - int ry = (fy < 1.0f) ? 2 : ceil(static_cast(kernel_width) / ay); + int rx = (fx < 1.0f) ? 
2 : static_cast(ceil(static_cast(kernel_width) / ax)); + int ry = (fy < 1.0f) ? 2 : static_cast(ceil(static_cast(kernel_width) / ay)); for (int y = iy_r - ry; y <= iy_r + ry; y++) { for (int x = ix_r - rx; x <= ix_r + rx; x++) { @@ -169,13 +184,13 @@ private: } static void NearestNeighborKernel_PLN(const float *in_ptr_, float *out_ptr_, int B, int C, int IH, int IW, float fx, float fy, int OH, int OW) { - for (size_t b = 0; b < B; b++) { - for (size_t c = 0; c < C; c++) { + for (int b = 0; b < B; b++) { + for (int c = 0; c < C; c++) { const float *in_ptr = in_ptr_ + IW * IH * C * b + IW * IH * c; float *out_ptr = out_ptr_ + OW * OH * C * b + OW * OH * c; - for (size_t oy = 0; oy < OH; oy++) { - for (size_t ox = 0; ox < OW; ox++) { + for (int oy = 0; oy < OH; oy++) { + for (int ox = 0; ox < OW; ox++) { float ix = ox * fx + fy / 2.0f - 0.5f; float iy = oy * fy + fx / 2.0f - 0.5f; @@ -191,15 +206,15 @@ private: static void NearestNeighborKernel_BLK(const float *in_ptr_, float *out_ptr_, int B, int C, int IH, int IW, float fx, float fy, int OH, int OW) { int blk_size = 8; - size_t CB = (size_t)div_up(C, blk_size); + int CB = div_up(C, blk_size); - for (size_t b = 0; b < B; b++) { - for (size_t cb = 0; cb < CB; cb++) { + for (int b = 0; b < B; b++) { + for (int cb = 0; cb < CB; cb++) { const float *in_ptr = in_ptr_ + IW * IH * CB * blk_size * b + IW * IH * cb * blk_size; float *out_ptr = out_ptr_ + OW * OH * CB * blk_size * b + OW * OH * cb * blk_size; - for (size_t oy = 0; oy < OH; oy++) { - for (size_t ox = 0; ox < OW; ox++) { + for (int oy = 0; oy < OH; oy++) { + for (int ox = 0; ox < OW; ox++) { float ix = ox * fx + fy / 2.0f - 0.5f; float iy = oy * fy + fx / 2.0f - 0.5f; @@ -217,30 +232,67 @@ private: } } - template - static void Upsample_Nearest_PLN(const float *in_ptr_, float *out_ptr_, int B, int C, int IH, int IW) { + template + static void Upsample_Nearest_PLN(const T *in_ptr_, T *out_ptr_, int B, int C, int IH, int IW, Layout layout) { int OH = factor * IH; int OW = factor * IW; - for (size_t b = 0; b < B; b++) { - for (size_t c = 0; c < C; c++) { - const float *in_ptr = in_ptr_ + IW * IH * C * b + IW * IH * c; - float *out_ptr = out_ptr_ + OW * OH * C * b + OW * OH * c; + if (layout == NCHW) { + for (int b = 0; b < B; b++) { + for (int c = 0; c < C; c++) { + const T *in_ptr = in_ptr_ + IW * IH * C * b + IW * IH * c; + T *out_ptr = out_ptr_ + OW * OH * C * b + OW * OH * c; - for (size_t iy = 0; iy < IH; iy++) { - for (size_t ix = 0; ix < IW; ix++) { - size_t oy = factor * iy; - size_t ox = factor * ix; - float value = in_ptr[iy * IW + ix]; + for (int iy = 0; iy < IH; iy++) { + for (int ix = 0; ix < IW; ix++) { + int oy = factor * iy; + int ox = factor * ix; + float value = in_ptr[iy * IW + ix]; - for (int fh = 0; fh < factor; fh++) { - for (int fw = 0; fw < factor; fw++) { - out_ptr[(oy + fh) * OW + ox + fw] = value; + for (int fh = 0; fh < factor; fh++) { + for (int fw = 0; fw < factor; fw++) { + out_ptr[(oy + fh) * OW + ox + fw] = static_cast(value); + } } } } } } + } else { + int block_size = C; + int block_size_bytes = block_size * sizeof(T); + + int ICIWIH = C * IW * IH; + int OWOH = OW * OH; + int OCOWOH = C * OWOH; + + int stepX = factor; + int stepY = factor; + +#ifdef _OPENMP +#pragma omp parallel for collapse(2) +#endif + for (int mb = 0; mb < B; mb++) { + for (int oh = 0; oh < OH; oh += stepY) { + size_t dst_off = mb * OCOWOH + (oh * OW) * block_size; + size_t src_off = mb * ICIWIH + (oh / stepY * IW) * block_size; + + for (int ow = 0; ow < OW; ow += stepX) { + size_t 
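// Sketch of the NCHW nearest-neighbour upsampling above for one plane: each
// input pixel is replicated into a factor x factor output tile (the NHWC
// branch achieves the same effect with row-wise memcpy over C-sized blocks).
template <int factor, typename T>
void upsample_nearest_plane(const T* in, T* out, int IH, int IW) {
    const int OW = factor * IW;
    for (int iy = 0; iy < IH; ++iy)
        for (int ix = 0; ix < IW; ++ix) {
            const T value = in[iy * IW + ix];
            for (int fh = 0; fh < factor; ++fh)
                for (int fw = 0; fw < factor; ++fw)
                    out[(factor * iy + fh) * OW + factor * ix + fw] = value;
        }
}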
dst_off_curr = dst_off + ow * block_size; + size_t src_off_curr = src_off + ow / stepX * block_size; + + memcpy(&out_ptr_[dst_off_curr], &in_ptr_[src_off_curr], block_size_bytes); + + for (int owx = 1; owx < stepX; owx++) { + memcpy(&out_ptr_[dst_off_curr + block_size * owx], &in_ptr_[src_off_curr], block_size_bytes); + } + } + + for (int ohy = 1; ohy < stepY; ohy++) { + memcpy(&out_ptr_[dst_off + OW * block_size * ohy], &out_ptr_[dst_off], block_size_bytes * OW); + } + } + } } } @@ -268,10 +320,10 @@ private: const float *in_ptr = in_ptr_ + IW * IH * CB * blk_size * b + IW * IH * cb * blk_size; float *out_ptr = out_ptr_ + OW * OH * CB * blk_size * b + OW * OH * cb * blk_size; - for (size_t iy = 0; iy < IH; iy++) { - for (size_t ix = 0; ix < IW; ix++) { - size_t oy = factor * iy; - size_t ox = factor * ix; + for (int iy = 0; iy < IH; iy++) { + for (int ix = 0; ix < IW; ix++) { + int oy = factor * iy; + int ox = factor * ix; vec_type vsrc = _mm_uni_loadu_ps(in_ptr + iy * IW * blk_size + ix * blk_size); diff --git a/inference-engine/src/extension/ext_reverse_sequence.cpp b/inference-engine/src/extension/ext_reverse_sequence.cpp new file mode 100644 index 0000000..5780ef2 --- /dev/null +++ b/inference-engine/src/extension/ext_reverse_sequence.cpp @@ -0,0 +1,179 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ext_list.hpp" +#include "ext_base.hpp" + +#include +#include +#include +#include +#include +#include "ie_parallel.hpp" + +namespace InferenceEngine { +namespace Extensions { +namespace Cpu { + +class ReverseSequenceImpl: public ExtLayerBase { +public: + explicit ReverseSequenceImpl(const CNNLayer* layer) { + try { + if (layer->insData.size() != 2 || layer->outData.size() != 1) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!"; + + src_dims = layer->insData[REVERSESEQUENCE_DATA].lock()->getTensorDesc().getDims(); + SizeVector seq_lengths_dims = layer->insData[REVERSESEQUENCE_LENGTHS].lock()->getTensorDesc().getDims(); + if (layer->insData[REVERSESEQUENCE_LENGTHS].lock()->getTensorDesc().getPrecision() != Precision::I32 && + layer->insData[REVERSESEQUENCE_LENGTHS].lock()->getTensorDesc().getPrecision() != Precision::FP32) + THROW_IE_EXCEPTION << layer->name << " Incorrect 'seq_lengths' input precision. 
Only FP32 and I32 are supported!"; + if (seq_lengths_dims.size() > 1) + THROW_IE_EXCEPTION << layer->name << " Seq_lengths vector should be 1 dimension"; + + SizeVector dst_dims = layer->outData[0]->getTensorDesc().getDims(); + if (src_dims.size() != dst_dims.size()) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output sizes!"; + + for (size_t i = 0; i < dst_dims.size(); i++) { + if (src_dims[i] != dst_dims[i]) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output dimension!"; + } + + seq_axis = layer->GetParamAsInt("seq_axis", 1); + if (seq_axis < 0) + seq_axis += src_dims.size(); + + if (seq_axis < 0 || seq_axis >= static_cast(src_dims.size())) + THROW_IE_EXCEPTION << layer->name << " Incorrect 'seq_axis' parameters dimensions and axis number!"; + + batch_axis = layer->GetParamAsInt("batch_axis", 0); + if (batch_axis < 0) + batch_axis += src_dims.size(); + + if (batch_axis < 0 || batch_axis >= static_cast(src_dims.size())) + THROW_IE_EXCEPTION << layer->name << " Incorrect 'batch_axis' parameters dimensions and axis number!"; + + if (seq_lengths_dims[0] != dst_dims[batch_axis]) + THROW_IE_EXCEPTION << layer->name << " Incorrect 'seq_lengths_dims' parameters dimension!"; + + srcStrides = layer->insData[REVERSESEQUENCE_DATA].lock()->getTensorDesc().getBlockingDesc().getStrides(); + work_amount_dst = srcStrides[0] * src_dims[0]; + + addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, { DataConfigurator(ConfLayout::PLN) }); + } catch (InferenceEngine::details::InferenceEngineException &ex) { + errorMsg = ex.what(); + } + } + + StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { + size_t i; + const float *src_data = inputs[REVERSESEQUENCE_DATA]->cbuffer().as() + + inputs[REVERSESEQUENCE_DATA]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + float* dst_data = outputs[0]->cbuffer().as() + + outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + + switch (inputs[REVERSESEQUENCE_LENGTHS]->precision()) { + case Precision::FP32: { + float *seq_lengths_data = inputs[REVERSESEQUENCE_LENGTHS]->cbuffer().as() + + inputs[REVERSESEQUENCE_LENGTHS]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + for (i = 0; i < src_dims[batch_axis]; i++) { + if (static_cast(seq_lengths_data[i]) > static_cast(src_dims[seq_axis])) { + if (resp) { + std::string errorMsg = "Incorrect input 'seq_lengths' values!"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return PARAMETER_MISMATCH; + } + } + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t i, start = 0, end = 0, src_idx = 0; + SizeVector counters(src_dims.size(), 0); + splitter(work_amount_dst, nthr, ithr, start, end); + for (int j = src_dims.size() - 1, i = start; j >= 0; j--) { + counters[j] = i % src_dims[j]; + i /= src_dims[j]; + } + + for (size_t iwork = start; iwork < end; ++iwork) { + for (i = 0, src_idx = 0; i < src_dims.size(); ++i) { + size_t idx = counters[i]; + if (static_cast(i) == seq_axis && + static_cast(idx) < static_cast(seq_lengths_data[counters[batch_axis]])) { + idx = static_cast(seq_lengths_data[counters[batch_axis]]) - idx - 1; + } + src_idx += idx * srcStrides[i]; + } + dst_data[iwork] = src_data[src_idx]; + for (int j = src_dims.size() - 1; j >= 0; j--) { + counters[j] = (counters[j] + 1) % src_dims[j]; + if (counters[j] != 0) break; + } + } + }); + } + break; + case Precision::I32: { + int32_t *seq_lengths_data = inputs[REVERSESEQUENCE_LENGTHS]->cbuffer().as() + + 
inputs[REVERSESEQUENCE_LENGTHS]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + for (i = 0; i < src_dims[batch_axis]; i++) { + if (seq_lengths_data[i] > static_cast(src_dims[seq_axis])) { + if (resp) { + std::string errorMsg = "Incorrect input 'seq_lengths' values!"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return PARAMETER_MISMATCH; + } + } + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t i, start = 0, end = 0, src_idx = 0; + SizeVector counters(src_dims.size(), 0); + splitter(work_amount_dst, nthr, ithr, start, end); + for (int j = src_dims.size() - 1, i = start; j >= 0; j--) { + counters[j] = i % src_dims[j]; + i /= src_dims[j]; + } + + for (size_t iwork = start; iwork < end; ++iwork) { + for (i = 0, src_idx = 0; i < src_dims.size(); ++i) { + size_t idx = counters[i]; + if (static_cast(i) == seq_axis && + static_cast(idx) < seq_lengths_data[counters[batch_axis]]) { + idx = seq_lengths_data[counters[batch_axis]] - idx - 1; + } + src_idx += idx * srcStrides[i]; + } + dst_data[iwork] = src_data[src_idx]; + for (int j = src_dims.size() - 1; j >= 0; j--) { + counters[j] = (counters[j] + 1) % src_dims[j]; + if (counters[j] != 0) break; + } + } + }); + } + break; + default: + return GENERAL_ERROR; + } + + return OK; + } + +private: + const size_t REVERSESEQUENCE_DATA = 0; + const size_t REVERSESEQUENCE_LENGTHS = 1; + + int seq_axis; + int batch_axis; + SizeVector src_dims; + SizeVector srcStrides; + size_t work_amount_dst; +}; + +REG_FACTORY_FOR(ImplFactory, ReverseSequence); + +} // namespace Cpu +} // namespace Extensions +} // namespace InferenceEngine diff --git a/inference-engine/src/extension/ext_roifeatureextractor_onnx.cpp b/inference-engine/src/extension/ext_roifeatureextractor_onnx.cpp new file mode 100644 index 0000000..8c7a096 --- /dev/null +++ b/inference-engine/src/extension/ext_roifeatureextractor_onnx.cpp @@ -0,0 +1,413 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +// There are some code snippets in this file. 
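+// They implement the ROIAlign operation (average pooling with precomputed
+// bilinear-interpolation weights), applied per level of the feature pyramid.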
+// Original source file is available here (Copyright (c) 2018 Facebook, MIT License):
+// https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp
+//
+
+#include "ext_list.hpp"
+#include "ext_base.hpp"
+#include
+#include
+#include
+#include
+#include
+#include "ie_parallel.hpp"
+
+namespace InferenceEngine {
+namespace Extensions {
+namespace Cpu {
+
+// implementation taken from Caffe2
+template <typename T>
+struct PreCalc {
+    int pos1;
+    int pos2;
+    int pos3;
+    int pos4;
+    T w1;
+    T w2;
+    T w3;
+    T w4;
+};
+
+template <typename T>
+void pre_calc_for_bilinear_interpolate(
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int iy_upper,
+    const int ix_upper,
+    T roi_start_h,
+    T roi_start_w,
+    T bin_size_h,
+    T bin_size_w,
+    int roi_bin_grid_h,
+    int roi_bin_grid_w,
+    std::vector<PreCalc<T>>& pre_calc) {
+    int pre_calc_index = 0;
+    for (int ph = 0; ph < pooled_height; ph++) {
+        for (int pw = 0; pw < pooled_width; pw++) {
+            for (int iy = 0; iy < iy_upper; iy++) {
+                const T yy = roi_start_h + ph * bin_size_h +
+                    static_cast<T>(iy + .5f) * bin_size_h /
+                        static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
+                for (int ix = 0; ix < ix_upper; ix++) {
+                    const T xx = roi_start_w + pw * bin_size_w +
+                        static_cast<T>(ix + .5f) * bin_size_w /
+                            static_cast<T>(roi_bin_grid_w);
+
+                    T x = xx;
+                    T y = yy;
+                    // deal with inverse elements that are out of the feature map boundary
+                    if (y < -1.0 || y > height || x < -1.0 || x > width) {
+                        // empty
+                        PreCalc<T> pc;
+                        pc.pos1 = 0;
+                        pc.pos2 = 0;
+                        pc.pos3 = 0;
+                        pc.pos4 = 0;
+                        pc.w1 = 0;
+                        pc.w2 = 0;
+                        pc.w3 = 0;
+                        pc.w4 = 0;
+                        pre_calc.at(pre_calc_index) = pc;
+                        pre_calc_index += 1;
+                        continue;
+                    }
+
+                    if (y <= 0) {
+                        y = 0;
+                    }
+                    if (x <= 0) {
+                        x = 0;
+                    }
+
+                    int y_low = static_cast<int>(y);
+                    int x_low = static_cast<int>(x);
+                    int y_high = 0;
+                    int x_high = 0;
+
+                    if (y_low >= height - 1) {
+                        y_high = y_low = height - 1;
+                        y = (T)y_low;
+                    } else {
+                        y_high = y_low + 1;
+                    }
+
+                    if (x_low >= width - 1) {
+                        x_high = x_low = width - 1;
+                        x = (T)x_low;
+                    } else {
+                        x_high = x_low + 1;
+                    }
+
+                    T ly = y - y_low;
+                    T lx = x - x_low;
+                    T hy = static_cast<T>(1) - ly, hx = static_cast<T>(1) - lx;
+                    T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+                    // save weights and indices
+                    PreCalc<T> pc;
+                    pc.pos1 = y_low * width + x_low;
+                    pc.pos2 = y_low * width + x_high;
+                    pc.pos3 = y_high * width + x_low;
+                    pc.pos4 = y_high * width + x_high;
+                    pc.w1 = w1;
+                    pc.w2 = w2;
+                    pc.w3 = w3;
+                    pc.w4 = w4;
+                    pre_calc[pre_calc_index] = pc;
+
+                    pre_calc_index += 1;
+                }
+            }
+        }
+    }
+}
+
+template <typename T>
+void ROIAlignForward_cpu_kernel(
+    const int nthreads,
+    const T* bottom_data,
+    const T& spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    const T* bottom_rois,
+    T* top_data) {
+    int roi_cols = 4;
+
+    int n_rois = nthreads / channels / pooled_width / pooled_height;
+    // (n, c, ph, pw) is an element in the pooled output
+    parallel_for(n_rois, [&](size_t n) {
+        int index_n = n * channels * pooled_width * pooled_height;
+
+        // roi could have 4 or 5 columns
+        const T* offset_bottom_rois = bottom_rois + n * roi_cols;
+        int roi_batch_ind = 0;
+        if (roi_cols == 5) {
+            roi_batch_ind = static_cast<int>(offset_bottom_rois[0]);
+            offset_bottom_rois++;
+        }
+
+        // Do not use rounding; this implementation detail is critical
+        T roi_start_w = offset_bottom_rois[0] * spatial_scale;
+        T roi_start_h = offset_bottom_rois[1] * spatial_scale;
+        T roi_end_w =
offset_bottom_rois[2] * spatial_scale; + T roi_end_h = offset_bottom_rois[3] * spatial_scale; + + // Force malformed ROIs to be 1x1 + T roi_width = std::max(roi_end_w - roi_start_w, (T)1.); + T roi_height = std::max(roi_end_h - roi_start_h, (T)1.); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceil(roi_height / pooled_height)); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : static_cast(ceil(roi_width / pooled_width)); + + // We do average (integral) pooling inside a bin + const T count = static_cast(roi_bin_grid_h * roi_bin_grid_w); // e.g. = 4 + + // we want to precalculate indeces and weights shared by all chanels, + // this is the key point of optimiation + std::vector> pre_calc( + roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); + pre_calc_for_bilinear_interpolate( + height, + width, + pooled_height, + pooled_width, + roi_bin_grid_h, + roi_bin_grid_w, + roi_start_h, + roi_start_w, + bin_size_h, + bin_size_w, + roi_bin_grid_h, + roi_bin_grid_w, + pre_calc); + + for (int c = 0; c < channels; c++) { + int index_n_c = index_n + c * pooled_width * pooled_height; + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + int index = index_n_c + ph * pooled_width + pw; + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + PreCalc pc = pre_calc[pre_calc_index]; + output_val += pc.w1 * offset_bottom_data[pc.pos1] + + pc.w2 * offset_bottom_data[pc.pos2] + + pc.w3 * offset_bottom_data[pc.pos3] + + pc.w4 * offset_bottom_data[pc.pos4]; + + pre_calc_index += 1; + } + } + output_val /= count; + + top_data[index] = output_val; + } // for pw + } // for ph + } // for c + }); +} + + +void redistribute_rois(const float* rois, int* level_ids, + const int num_rois, const int levels_num) { + const float canonical_scale = 224.0f; + const int canonical_level = 2; + + for (int i = 0; i < num_rois; ++i) { + const float x0 = rois[4 * i + 0]; + const float y0 = rois[4 * i + 1]; + const float x1 = rois[4 * i + 2]; + const float y1 = rois[4 * i + 3]; + + int target_level = levels_num; + float area = (x1 - x0) * (y1 - y0); + if (area > 0) { + area = std::sqrt(area) / canonical_scale; + area = std::log2(area + 1e-6f); + target_level = static_cast(std::floor(area + canonical_level)); + target_level = std::max(0, std::min(levels_num - 1, target_level)); + } + + level_ids[i] = target_level; + } +} + + +void reorder(const float* src_data, const int* ranks, const int n, const int step, float* dst_data, + int* dst_mapping) { + std::iota(dst_mapping, dst_mapping + n, 0); + std::sort(dst_mapping, dst_mapping + n, [&ranks](size_t i1, size_t i2) {return ranks[i1] < ranks[i2];}); + for (int i = 0; i < n; ++i) { + const int j = dst_mapping[i]; + assert(0 <= j && j < n); + std::memcpy(dst_data + i * step, src_data + j * step, sizeof(float) * step); + } +} + +void split_points(const std::vector& ids, std::vector& rois_per_level, const int levels_num) { + rois_per_level.clear(); + rois_per_level.resize(levels_num, 0); + for (size_t i = 0; i < ids.size(); ++i) { + assert(0 <= ids[i] && ids[i] < levels_num); + 
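// histogram pass: count how many ROIs fall into each pyramid level
+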
rois_per_level[ids[i]]++; + } + for (int i = 1; i < levels_num; ++i) { + rois_per_level[i] += rois_per_level[i - 1]; + } + rois_per_level.insert(rois_per_level.begin(), 0); +} + + +void reorder_rois(const float *rois, const int* ids, int* mapping, const int rois_num, + float * reordered_rois, std::vector& rois_per_level, const int levels_num) { + rois_per_level.clear(); + rois_per_level.resize(levels_num, 0); + for (int i = 0; i < rois_num; ++i) { + assert(0 <= ids[i] && ids[i] < levels_num); + rois_per_level[ids[i]]++; + } + for (int i = 1; i < levels_num; ++i) { + rois_per_level[i] += rois_per_level[i - 1]; + } + rois_per_level.insert(rois_per_level.begin(), 0); + + std::vector level_counter = rois_per_level; + + for (int i = 0; i < rois_num; ++i) { + const int level = ids[i]; + assert(level < levels_num); + const int j = level_counter[level]; + assert(0 <= j && j < rois_num); + reordered_rois[j * 4 + 0] = rois[i * 4 + 0]; + reordered_rois[j * 4 + 1] = rois[i * 4 + 1]; + reordered_rois[j * 4 + 2] = rois[i * 4 + 2]; + reordered_rois[j * 4 + 3] = rois[i * 4 + 3]; + level_counter[level]++; + } +} + +class ExperimentalDetectronROIFeatureExtractorImpl: public ExtLayerBase { +private: + const int INPUT_ROIS {0}; + const int INPUT_FEATURES_START {1}; + + const int OUTPUT_ROI_FEATURES {0}; + const int OUTPUT_ROIS {1}; + +public: + explicit ExperimentalDetectronROIFeatureExtractorImpl(const CNNLayer* layer) { + try { + output_dim_ = layer->GetParamAsInt("output_size"); + pyramid_scales_ = layer->GetParamAsInts("pyramid_scales"); + sampling_ratio_ = layer->GetParamAsInt("sampling_ratio"); + pooled_height_ = output_dim_; + pooled_width_ = output_dim_; + + std::vector inputs_layouts(layer->insData.size(), DataConfigurator(ConfLayout::PLN)); + std::vector outputs_layouts(layer->outData.size(), DataConfigurator(ConfLayout::PLN)); + addConfig(layer, inputs_layouts, outputs_layouts); + } catch (InferenceEngine::details::InferenceEngineException &ex) { + errorMsg = ex.what(); + } + } + + StatusCode execute(std::vector& inputs, std::vector& outputs, + ResponseDesc *resp) noexcept override { + const int levels_num = inputs.size() - INPUT_FEATURES_START; + const int num_rois = inputs[INPUT_ROIS]->getTensorDesc().getDims()[0]; + const int channels_num = inputs[INPUT_FEATURES_START]->getTensorDesc().getDims()[1]; + const int feaxels_per_roi = pooled_height_ * pooled_width_ * channels_num; + + auto *input_rois = inputs[INPUT_ROIS]->buffer().as(); + auto *output_rois_features = outputs[OUTPUT_ROI_FEATURES]->buffer().as(); + float *output_rois = nullptr; + if (OUTPUT_ROIS < static_cast(outputs.size())) { + output_rois = outputs[OUTPUT_ROIS]->buffer().as(); + } + + std::vector level_ids(num_rois, 0); + redistribute_rois(input_rois, reinterpret_cast(&level_ids[0]), num_rois, levels_num); + + std::vector reordered_rois(4 * num_rois, 0); + std::vector original_rois_mapping(num_rois, 0); + reorder(input_rois, &level_ids[0], num_rois, 4, &reordered_rois[0], &original_rois_mapping[0]); + + std::vector rois_per_level; + split_points(level_ids, rois_per_level, levels_num + 1); + + std::vector output_rois_features_temp(feaxels_per_roi * num_rois, 0); + for (int i = 0; i < levels_num; ++i) { + const int level_rois_offset = rois_per_level[i]; + const int level_rois_num = rois_per_level[i + 1] - level_rois_offset; + if (level_rois_num > 0) { + auto *featuremap = inputs[INPUT_FEATURES_START + i]->buffer().as(); + const int featuremap_height = inputs[INPUT_FEATURES_START + i]->getTensorDesc().getDims()[2]; + const int 
featuremap_width = inputs[INPUT_FEATURES_START + i]->getTensorDesc().getDims()[3]; + ROIAlignForward_cpu_kernel(feaxels_per_roi * level_rois_num, + featuremap, + 1.0f / pyramid_scales_[i], + channels_num, + featuremap_height, + featuremap_width, + pooled_height_, + pooled_width_, + sampling_ratio_, + &reordered_rois[4 * level_rois_offset], + &output_rois_features_temp[feaxels_per_roi * level_rois_offset]); + } + } + + std::vector dummy_mapping(num_rois, 0); + reorder(&output_rois_features_temp[0], &original_rois_mapping[0], num_rois, feaxels_per_roi, + output_rois_features, &dummy_mapping[0]); + if (output_rois != nullptr) { + std::memcpy(output_rois, input_rois, 4 * num_rois * sizeof(float)); + } + + return OK; + } + +private: + int output_dim_ = 0; + int pooled_height_ = 0; + int pooled_width_ = 0; + std::vector pyramid_scales_; + int sampling_ratio_ = 0; + + int channels = 0; + int height = 0; + int width = 0; + + int nn = 0; + int nc = 0; + int nh = 0; + int nw = 0; +}; + +REG_FACTORY_FOR(ImplFactory, ExperimentalDetectronROIFeatureExtractor); + +} // namespace Cpu +} // namespace Extensions +} // namespace InferenceEngine diff --git a/inference-engine/src/extension/ext_shuffle_channels.cpp b/inference-engine/src/extension/ext_shuffle_channels.cpp new file mode 100644 index 0000000..79b23da --- /dev/null +++ b/inference-engine/src/extension/ext_shuffle_channels.cpp @@ -0,0 +1,149 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ext_list.hpp" +#include "ext_base.hpp" + +#include +#include +#include +#include +#include "ie_parallel.hpp" + +namespace InferenceEngine { +namespace Extensions { +namespace Cpu { + +class ShuffleChannelsImpl: public ExtLayerBase { +#define CNTR_SIZE 3 + +__inline size_t initter(size_t start, size_t size, size_t* counters, size_t* own_dims, size_t* ownStrides) { + size_t i = start; + size_t idx = 0; + for (int j = size - 1; j >= 0; j--) { + counters[j] = i % own_dims[j]; + idx += counters[j] * ownStrides[j]; + i /= own_dims[j]; + } + return idx; +} + +__inline size_t updater(size_t idx, size_t size, size_t* counters, size_t* own_dims, size_t* ownStrides) { + size_t i = 1; + for (int j = size - 1; j >= 0; j--) { + counters[j]++; + if (counters[j] < own_dims[j]) { + idx += ownStrides[j]; + break; + } else { + counters[j] = 0; + i = 0; + } + } + if (!i) { + for (idx = 0; i < CNTR_SIZE; ++i) + idx += counters[i] * ownStrides[i]; + } + return idx; +} + +public: + explicit ShuffleChannelsImpl(const CNNLayer* layer) { + try { + if (layer->insData.empty() || layer->outData.empty()) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!"; + + SizeVector src_dims = layer->insData[0].lock()->getTensorDesc().getDims(); + SizeVector dst_dims = layer->outData[0]->getTensorDesc().getDims(); + if (src_dims.size() != dst_dims.size()) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output dimensions!"; + + if (layer->insData[0].lock()->getTensorDesc().getPrecision() != Precision::FP32) + THROW_IE_EXCEPTION << layer->name << " Incorrect input precision. Only F32 is supported!"; + + if (layer->outData[0]->getTensorDesc().getPrecision() != Precision::FP32) + THROW_IE_EXCEPTION << layer->name << " Incorrect output precision. 
Only F32 is supported!"; + + int axis = layer->GetParamAsInt("axis", 1); + if (axis < 0) + axis += dst_dims.size(); + + if (axis < 0 || axis >= static_cast(dst_dims.size())) + THROW_IE_EXCEPTION << layer->name << " Incorrect input parameters dimensions and axis number!"; + + size_t group = layer->GetParamAsUInt("group", 1); + if (group == 0 || dst_dims[axis] % group) + THROW_IE_EXCEPTION << layer->name << " Group parameter must evenly divide the channel dimension!"; + + // Find number of dictionaries, index range and data length + own_dims[0] = 1; + for (int i = 0; i < axis; i++) + own_dims[0] *= dst_dims[i]; + + for (size_t i = axis + 1; i < dst_dims.size(); i++) + dataLength *= dst_dims[i]; + + if (dataLength == 0) + THROW_IE_EXCEPTION << layer->name << " Incorrect input parameters dimension!"; + + own_dims[1] = dst_dims[axis] / group; + own_dims[2] = group; + ownStrides[0] = dst_dims[axis]; + ownStrides[1] = 1; + ownStrides[2] = own_dims[1]; + work_amount_dst = ownStrides[0] * own_dims[0]; + + addConfig(layer, { DataConfigurator(ConfLayout::PLN) }, { DataConfigurator(ConfLayout::PLN) }); + } catch (InferenceEngine::details::InferenceEngineException &ex) { + errorMsg = ex.what(); + } + } + + StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { + const float *src_data = inputs[0]->cbuffer().as() + + inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + float* dst_data = outputs[0]->cbuffer().as() + + outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + + if (dataLength > 1) { + // Vectorized & Parallel + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0, src_idx = 0; + size_t counters[CNTR_SIZE] = { 0 }; + splitter(work_amount_dst, nthr, ithr, start, end); + src_idx = initter(start, CNTR_SIZE, counters, own_dims, ownStrides); + for (size_t iwork = start, dst_idx = start * dataLength; iwork < end; ++iwork, dst_idx += dataLength) { + memcpy(&dst_data[dst_idx], &src_data[dataLength * src_idx], sizeof(float) * dataLength); + src_idx = updater(src_idx, CNTR_SIZE, counters, own_dims, ownStrides); + } + }); + } else { + // Parallel + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0, src_idx = 0; + size_t counters[CNTR_SIZE] = { 0 }; + splitter(work_amount_dst, nthr, ithr, start, end); + src_idx = initter(start, CNTR_SIZE, counters, own_dims, ownStrides); + for (size_t iwork = start; iwork < end; ++iwork) { + dst_data[iwork] = src_data[src_idx]; + src_idx = updater(src_idx, CNTR_SIZE, counters, own_dims, ownStrides); + } + }); + } + + return OK; + } + +private: + size_t dataLength = 1; + size_t work_amount_dst; + size_t own_dims[CNTR_SIZE]; + size_t ownStrides[CNTR_SIZE]; +}; + +REG_FACTORY_FOR(ImplFactory, ShuffleChannels); + +} // namespace Cpu +} // namespace Extensions +} // namespace InferenceEngine diff --git a/inference-engine/src/extension/ext_simplernms.cpp b/inference-engine/src/extension/ext_simplernms.cpp index 72b004a..cb0e717 100644 --- a/inference-engine/src/extension/ext_simplernms.cpp +++ b/inference-engine/src/extension/ext_simplernms.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -263,9 +263,9 @@ public: const float* delta_pred = src_delta->buffer().as(); const float* im_info = inputs[2]->buffer().as(); - int IW = im_info[1]; - int IH = im_info[0]; - int IS = im_info[2]; + int IW = static_cast(im_info[1]); + int IH = 
static_cast(im_info[0]); + int IS = static_cast(im_info[2]); int scaled_min_bbox_size = min_box_size_ * IS; @@ -293,8 +293,8 @@ public: simpler_nms_roi_t tmp_roi = simpler_nms_gen_bbox(anchors[anchor_index], bbox_delta, anchor_shift_x, anchor_shift_y); simpler_nms_roi_t roi = tmp_roi.clamp({ 0, 0, static_cast(IW - 1), static_cast(IH - 1)}); - int bbox_w = roi.x1 - roi.x0 + 1; - int bbox_h = roi.y1 - roi.y0 + 1; + int bbox_w = static_cast(roi.x1 - roi.x0) + 1; + int bbox_h = static_cast(roi.y1 - roi.y0) + 1; if (bbox_w >= scaled_min_bbox_size && bbox_h >= scaled_min_bbox_size) { simpler_nms_proposal_t proposal { roi, proposal_confidence, sorted_proposals_confidence.size() }; diff --git a/inference-engine/src/extension/ext_space_to_depth.cpp b/inference-engine/src/extension/ext_space_to_depth.cpp new file mode 100644 index 0000000..e00bc0a --- /dev/null +++ b/inference-engine/src/extension/ext_space_to_depth.cpp @@ -0,0 +1,126 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ext_list.hpp" +#include "ext_base.hpp" + +#include +#include +#include +#include +#include "ie_parallel.hpp" + +namespace InferenceEngine { +namespace Extensions { +namespace Cpu { + +class SpaceToDepthImpl: public ExtLayerBase { +#define CNTR_SIZE 5 + +public: + explicit SpaceToDepthImpl(const CNNLayer* layer) { + try { + if (layer->insData.empty() || layer->outData.empty()) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!"; + + SizeVector src_dims = layer->insData[0].lock()->getTensorDesc().getDims(); + if (src_dims.size() < 2) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input dimensions!"; + if (layer->insData[0].lock()->getTensorDesc().getPrecision() != Precision::FP32) + THROW_IE_EXCEPTION << layer->name << " Incorrect input precision. Only F32 is supported!"; + + SizeVector dst_dims = layer->outData[0]->getTensorDesc().getDims(); + if (dst_dims.size() < 3) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of output dimensions!"; + if (layer->outData[0]->getTensorDesc().getPrecision() != Precision::FP32) + THROW_IE_EXCEPTION << layer->name << " Incorrect output precision. 
Only F32 is supported!"; + + size_t block_size = layer->GetParamAsUInt("block_size", 1); + if (block_size == 0) + THROW_IE_EXCEPTION << layer->name << " Incorrect block_size parameter is zero!"; + + if (dst_dims[dst_dims.size() - 3] % (block_size * block_size)) + THROW_IE_EXCEPTION << layer->name << " block_size parameter is incompatible with input tensor Color dimension size!"; + + if (src_dims.size() > 2 && dst_dims[dst_dims.size() - 3] != (src_dims[src_dims.size() - 3] * block_size * block_size)) + THROW_IE_EXCEPTION << layer->name << " Input/Output tensor Color dimension is incompatible with block_size!"; + + if (src_dims[src_dims.size() - 2] != (dst_dims[dst_dims.size() - 2] * block_size)) + THROW_IE_EXCEPTION << layer->name << " Input/Output tensor Height dimension is incompatible with block_size!"; + + if (src_dims[src_dims.size() - 1] != (dst_dims[dst_dims.size() - 1] * block_size)) + THROW_IE_EXCEPTION << layer->name << " Input/Output tensor Width dimension is incompatible with block_size!"; + + own_dims[0] = 1; + for (size_t i = 0; i < (dst_dims.size() - 3); i++) + own_dims[0] *= dst_dims[i]; + own_dims[1] = dst_dims[dst_dims.size() - 2]; + own_dims[2] = dst_dims[dst_dims.size() - 3] / block_size; + own_dims[3] = dst_dims[dst_dims.size() - 1]; + own_dims[4] = block_size; + + size_t C = dst_dims[dst_dims.size() - 2] * dst_dims[dst_dims.size() - 1]; + ownStrides[0] = dst_dims[dst_dims.size() - 3] * C; + ownStrides[1] = dst_dims[dst_dims.size() - 1]; + ownStrides[2] = block_size * C; + ownStrides[3] = 1; + ownStrides[4] = C; + work_amount_dst = ownStrides[0] * own_dims[0]; + + addConfig(layer, { DataConfigurator(ConfLayout::PLN) }, { DataConfigurator(ConfLayout::PLN) }); + } catch (InferenceEngine::details::InferenceEngineException &ex) { + errorMsg = ex.what(); + } + } + + StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { + const float *src_data = inputs[0]->cbuffer().as() + + inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + float* dst_data = outputs[0]->cbuffer().as() + + outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + + // Parallel + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t i, start = 0, end = 0, dst_idx = 0; + size_t counters[CNTR_SIZE] = { 0 }; + splitter(work_amount_dst, nthr, ithr, start, end); + i = start; + for (int j = CNTR_SIZE - 1; j >= 0; j--) { + counters[j] = i % own_dims[j]; + dst_idx += counters[j] * ownStrides[j]; + i /= own_dims[j]; + } + + for (size_t iwork = start, i = 1; iwork < end; ++iwork) { + dst_data[dst_idx] = src_data[iwork]; + for (int j = CNTR_SIZE - 1; j >= 0; j--) { + counters[j]++; + if (counters[j] < own_dims[j]) { + dst_idx += ownStrides[j]; + break; + } else { + counters[j] = i = 0; + } + } + if (!i) { + for (dst_idx = 0; i < CNTR_SIZE; ++i) + dst_idx += counters[i] * ownStrides[i]; + } + } + }); + + return OK; + } + +private: + size_t work_amount_dst; + size_t own_dims[CNTR_SIZE]; + size_t ownStrides[CNTR_SIZE]; +}; + +REG_FACTORY_FOR(ImplFactory, SpaceToDepth); + +} // namespace Cpu +} // namespace Extensions +} // namespace InferenceEngine diff --git a/inference-engine/src/extension/ext_spatial_transformer.cpp b/inference-engine/src/extension/ext_spatial_transformer.cpp deleted file mode 100644 index a63fb69..0000000 --- a/inference-engine/src/extension/ext_spatial_transformer.cpp +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "ext_list.hpp" -#include 
"ext_base.hpp" - -#include "matrixmult.h" - -#include -#include -#include -#include -#include - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -class SpatialTransformerImpl: public ExtLayerBase { -public: - explicit SpatialTransformerImpl(const CNNLayer* layer) { - try { - if (layer->insData.size() != 2 || layer->outData.empty()) - THROW_IE_EXCEPTION << "Incorrect number of input/output edges!"; - - if (layer->insData[0].lock()->dims.size() != 4) - THROW_IE_EXCEPTION << "SpatialTransformer supports only 4D blobs!"; - - addConfig(layer, {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)}, {DataConfigurator(ConfLayout::PLN)}); - } catch (InferenceEngine::details::InferenceEngineException &ex) { - errorMsg = ex.what(); - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, - ResponseDesc *resp) noexcept override { - std::vector real_dims = inputs[0]->getTensorDesc().getDims(); - size_t data_size = inputs[0]->size(); - - const auto *src_data = inputs[0]->cbuffer().as(); - auto *theta = inputs[1]->buffer().as(); - auto *dst_data = outputs[0]->buffer().as(); - - auto N = real_dims[0]; - auto C = real_dims[1]; - auto output_H_ = real_dims[2]; - auto output_W_ = real_dims[3]; - - // Prepare input and output grid - std::vector input_grid_data(N * output_H_ * output_W_ * 2); - std::vector output_grid_data(3 * output_H_ * output_W_); - for (int i = 0; i < output_H_ * output_W_; ++i) { - output_grid_data[3 * i] = (i / output_W_) * 1.0 / output_H_ * 2 - 1; - output_grid_data[3 * i + 1] = (i % output_W_) * 1.0 / output_W_ * 2 - 1; - output_grid_data[3 * i + 2] = 1; - } - - // Actually execute - for (int i = 0; i < N; ++i) { - auto coordinates = input_grid_data.begin() + (output_H_ * output_W_ * 2) * i; - - auto M_size = output_H_ * output_W_; - auto N_size = 2; - auto K_size = 3; - - matrixMult(&output_grid_data[0], theta + 6 * i, &(*coordinates), M_size, N_size, K_size, true); - - int row_idx; - float px, py; - - for (int j = 0; j < C; ++j) { - for (int s = 0; s < output_H_; ++s) { - for (int t = 0; t < output_W_; ++t) { - row_idx = output_W_ * s + t; - - px = coordinates[row_idx * 2]; - py = coordinates[row_idx * 2 + 1]; - - size_t dst_offset = ((i * C + j) * output_H_ + s) * output_W_ + t; - size_t src_offset = ((i * C + j) * output_H_ + 0) * output_W_ + 0; - dst_data[dst_offset] = transform_forward_cpu(src_data + src_offset, px, py); - } - } - } - } - return OK; - } - -private: - float transform_forward_cpu(const float *pic, float px, float py) { - int H = 24; - int W = 94; - - float res = 0.0f; - float x = (px + 1) / 2 * H; - float y = (py + 1) / 2 * W; - - int m, n; - float w; - - m = std::floor(x); - n = std::floor(y); - w = 0; - if (m >= 0 && m < H && n >= 0 && n < W) { - w = std::max(0.0f, 1 - std::abs(x - m)) * std::max(0.0f, 1 - std::abs(y - n)); - res += w * pic[m * W + n]; - } - - m = std::floor(x) + 1; - n = std::floor(y); - w = 0; - if (m >= 0 && m < H && n >= 0 && n < W) { - w = std::max(0.0f, 1 - std::abs(x - m)) * std::max(0.0f, 1 - std::abs(y - n)); - res += w * pic[m * W + n]; - } - - m = std::floor(x); - n = std::floor(y) + 1; - w = 0; - if (m >= 0 && m < H && n >= 0 && n < W) { - w = std::max(0.0f, 1 - std::abs(x - m)) * std::max(0.0f, 1 - std::abs(y - n)); - res += w * pic[m * W + n]; - } - - m = std::floor(x) + 1; - n = std::floor(y) + 1; - w = 0; - if (m >= 0 && m < H && n >= 0 && n < W) { - w = std::max(0.0f, 1 - std::abs(x - m)) * std::max(0.0f, 1 - std::abs(y - n)); - res += w * pic[m * W + n]; - } - - return res; 
- } -}; - -class SpatialTransformerShapeInfer : public IShapeInferImpl { -public: - StatusCode inferShapes(const std::vector& inShapes, - const std::map& params, - const std::map& blobs, - std::vector& outShapes, - ResponseDesc* resp) noexcept override { - outShapes.push_back(inShapes[0]); - return InferenceEngine::OK; - } -}; - -REG_FACTORY_FOR(ImplFactory, SpatialTransformer); -REG_SHAPE_INFER_FOR_TYPE(SpatialTransformerShapeInfer, SpatialTransformer); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/extension/ext_squeeze.cpp b/inference-engine/src/extension/ext_squeeze.cpp new file mode 100644 index 0000000..a745031 --- /dev/null +++ b/inference-engine/src/extension/ext_squeeze.cpp @@ -0,0 +1,123 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ext_list.hpp" +#include "ext_base.hpp" + +#include +#include +#include +#include +#include "ie_parallel.hpp" + +namespace InferenceEngine { +namespace Extensions { +namespace Cpu { + +class SqueezeImpl: public ExtLayerBase { +public: + explicit SqueezeImpl(const CNNLayer* layer) { + try { + if (layer->insData.empty() || layer->outData.empty()) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!"; + + if (layer->insData.size() != 2) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input edges!"; + + idx_dims = layer->insData[SQUEEZE_INDEXES].lock()->getTensorDesc().getDims(); + if (idx_dims.size() > 1) + THROW_IE_EXCEPTION << layer->name << " Index vector should be 1 dimension"; + + if (layer->insData[SQUEEZE_INDEXES].lock()->getTensorDesc().getPrecision() != Precision::I32 && + layer->insData[SQUEEZE_INDEXES].lock()->getTensorDesc().getPrecision() != Precision::FP32) + THROW_IE_EXCEPTION << layer->name << " Incorrect 'indices_to_squeeze' input precision. 
Only FP32 and I32 are supported!"; + + data_dims = layer->insData[SQUEEZE_DATA].lock()->getTensorDesc().getDims(); + SizeVector dst_dims = layer->outData[0]->getTensorDesc().getDims(); + if (data_dims.size() < dst_dims.size()) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output dimensions!"; + + if (data_dims.size() <= idx_dims[0] && !(data_dims.size() == 1 && idx_dims[0] == 1)) + THROW_IE_EXCEPTION << layer->name << " Incompatible number of data dimensions and indexes vector length!"; + + addConfig(layer, { { ConfLayout::PLN, false, 0 }, { ConfLayout::ANY, true } }, { { ConfLayout::PLN, false, 0 } }); + } catch (InferenceEngine::details::InferenceEngineException &ex) { + errorMsg = ex.what(); + } + } + + StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { + switch (inputs[SQUEEZE_INDEXES]->precision()) { + case Precision::FP32: { + float *idx_data = inputs[SQUEEZE_INDEXES]->cbuffer().as() + + inputs[SQUEEZE_INDEXES]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + for (size_t i = 0; i < idx_dims[0]; i++) { + float axis = idx_data[i]; + if (axis < 0) + axis += data_dims.size(); + + if (axis > static_cast(data_dims.size())) { + if (resp) { + std::string errorMsg = "Index to squeeze exceeds data tensor dimension"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return PARAMETER_MISMATCH; + } else if (data_dims[static_cast(axis)] != 1) { + if (resp) { + std::string errorMsg = "Index to squeeze of data tensor dimension is not 1"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return PARAMETER_MISMATCH; + } + } + } + break; + case Precision::I32: { + int32_t *idx_data = inputs[SQUEEZE_INDEXES]->cbuffer().as() + + inputs[SQUEEZE_INDEXES]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + for (size_t i = 0; i < idx_dims[0]; i++) { + int32_t axis = idx_data[i]; + if (axis < 0) + axis += data_dims.size(); + + if (axis > static_cast(data_dims.size())) { + if (resp) { + std::string errorMsg = "Index to squeeze exceeds data tensor dimension"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return PARAMETER_MISMATCH; + } else if (data_dims[axis] != 1) { + if (resp) { + std::string errorMsg = "Index to squeeze of data tensor dimension is not 1"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return PARAMETER_MISMATCH; + } + } + } + break; + default: + if (resp) { + std::string errorMsg = "Incorrect 'indices_to_squeeze' input precision. Only FP32 and I32 are supported!"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return GENERAL_ERROR; + } + + return OK; + } + +private: + const size_t SQUEEZE_DATA = 0; + const size_t SQUEEZE_INDEXES = 1; + + SizeVector data_dims; + SizeVector idx_dims; +}; + +REG_FACTORY_FOR(ImplFactory, Squeeze); + +} // namespace Cpu +} // namespace Extensions +} // namespace InferenceEngine diff --git a/inference-engine/src/extension/ext_strided_slice.cpp b/inference-engine/src/extension/ext_strided_slice.cpp new file mode 100644 index 0000000..4a94059 --- /dev/null +++ b/inference-engine/src/extension/ext_strided_slice.cpp @@ -0,0 +1,380 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ext_list.hpp" +#include "ext_base.hpp" + +#include +#include +#include +#include +#include +#include "ie_parallel.hpp" + +namespace InferenceEngine { +namespace Extensions { +namespace Cpu { + +inline void clipping(int *idx, const int min, const int max) { + (*idx) = ((*idx) > min) ? 
(*idx) : min;
+    (*idx) = ((*idx) < max) ? (*idx) : (max - 1);
+    return;
+}
+
+class StridedSliceImpl: public ExtLayerBase {
+public:
+    explicit StridedSliceImpl(const CNNLayer* layer) {
+        try {
+            if (layer->insData.size() > 4 || layer->outData.size() != 1)
+                THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!";
+
+            src_dims = layer->insData[STRIDEDSLICE_DATA].lock()->getTensorDesc().getDims();
+
+            bounds_size = 0;
+            begin_dims = {};
+            if (layer->insData.size() > 1) {
+                begin_dims = layer->insData[STRIDEDSLICE_BEGIN].lock()->getTensorDesc().getDims();
+                if (layer->insData[STRIDEDSLICE_BEGIN].lock()->getTensorDesc().getPrecision() != Precision::I32)
+                    THROW_IE_EXCEPTION << layer->name << " Incorrect 'begin' input precision. Only I32 is supported!";
+                if (begin_dims.size() > 1)
+                    THROW_IE_EXCEPTION << layer->name << " Begin vector should be 1-dimensional";
+                bounds_size = begin_dims[0];
+            }
+
+            if (layer->insData.size() > 2) {
+                end_dims = layer->insData[STRIDEDSLICE_END].lock()->getTensorDesc().getDims();
+                if (layer->insData[STRIDEDSLICE_END].lock()->getTensorDesc().getPrecision() != Precision::I32)
+                    THROW_IE_EXCEPTION << layer->name << " Incorrect 'end' input precision. Only I32 is supported!";
+                if (end_dims.size() > 1)
+                    THROW_IE_EXCEPTION << layer->name << " End vector should be 1-dimensional";
+                if (begin_dims[0] != end_dims[0])
+                    THROW_IE_EXCEPTION << layer->name << " Begin vector size should be equal to end vector size";
+            }
+
+            if (layer->insData.size() > 3) {
+                stride_dims = layer->insData[STRIDEDSLICE_STRIDE].lock()->getTensorDesc().getDims();
+                if (layer->insData[STRIDEDSLICE_STRIDE].lock()->getTensorDesc().getPrecision() != Precision::I32)
+                    THROW_IE_EXCEPTION << layer->name << " Incorrect 'strides' input precision. Only I32 is supported!";
+                if (stride_dims.size() > 1)
+                    THROW_IE_EXCEPTION << layer->name << " Stride vector should be 1-dimensional";
+                if (begin_dims[0] != stride_dims[0])
+                    THROW_IE_EXCEPTION << layer->name << " Stride vector size should be equal to begin vector size";
+            }
+            dst_dims = layer->outData[0]->getTensorDesc().getDims();
+
+            std::string::size_type i;
+            std::string begin_mask_str = layer->GetParamAsString("begin_mask", "");
+            for (i = 0; i < begin_mask_str.size(); ++i) {
+                if (begin_mask_str[i] == '1') begin_mask.push_back(1);
+                else if (begin_mask_str[i] == '0') begin_mask.push_back(0);
+            }
+            for (; i < src_dims.size(); ++i) begin_mask.push_back(1);
+
+            std::string end_mask_str = layer->GetParamAsString("end_mask", "");
+            for (i = 0; i < end_mask_str.size(); ++i) {
+                if (end_mask_str[i] == '1') end_mask.push_back(1);
+                else if (end_mask_str[i] == '0') end_mask.push_back(0);
+            }
+            for (; i < src_dims.size(); ++i) end_mask.push_back(1);
+
+            std::string ellipsis_mask_str = layer->GetParamAsString("ellipsis_mask", "");
+            size_t ellipsis_mask_counter = 0;
+            for (i = 0; i < ellipsis_mask_str.size(); ++i) {
+                if (ellipsis_mask_str[i] == '1') {
+                    ellipsis_mask_counter++;
+                    ellipsis_mask.push_back(1);
+                } else if (ellipsis_mask_str[i] == '0') {
+                    ellipsis_mask.push_back(0);
+                }
+            }
+            if (ellipsis_mask_counter > 1)
+                THROW_IE_EXCEPTION << layer->name << " 'Ellipsis_mask' must be a power of two (only one ellipsis)!";
+            for (; i < src_dims.size(); ++i) ellipsis_mask.push_back(0);
+
+            std::string new_axis_mask_str = layer->GetParamAsString("new_axis_mask", "");
+            for (i = 0; i < new_axis_mask_str.size(); ++i) {
+                if (new_axis_mask_str[i] == '1') new_axis_mask.push_back(1);
+                else if (new_axis_mask_str[i] == '0') new_axis_mask.push_back(0);
+            }
+            for (; i < src_dims.size(); ++i) new_axis_mask.push_back(0);
+
+            std::string shrink_axis_mask_str = layer->GetParamAsString("shrink_axis_mask", "");
+            for (i = 0; i < shrink_axis_mask_str.size(); ++i) {
+                if (shrink_axis_mask_str[i] == '1') shrink_axis_mask.push_back(1);
+                else if (shrink_axis_mask_str[i] == '0') shrink_axis_mask.push_back(0);
+            }
+            for (; i < src_dims.size(); ++i) shrink_axis_mask.push_back(0);
+
+
+            int new_axis = 0;
+            for (auto& na : new_axis_mask)
+                new_axis += na;
+
+            shrink_axis = 0;
+            for (auto& sa : shrink_axis_mask)
+                shrink_axis += sa;
+            max_dims = src_dims.size() + new_axis;
+
+            // ellipsis_mask must be a power of two (only one ellipsis), so take the first set position
+            ellipsis_pos1 = ellipsis_pos2 = max_dims;
+            for (i = 0; i < ellipsis_mask.size(); i++) {
+                if (ellipsis_mask[i] > 0) {
+                    ellipsis_pos1 = i;
+                    break;
+                }
+            }
+            bounds_size -= ellipsis_pos1;
+            if (bounds_size > 0 && (max_dims - bounds_size) > ellipsis_pos1)
+                ellipsis_pos2 = max_dims - bounds_size;
+
+            begin_dms.assign(max_dims, 0);
+            end_dms.assign(max_dims, -1);
+            stride_dms.assign(max_dims, 1);
+
+            srcStrides = layer->insData[STRIDEDSLICE_DATA].lock()->getTensorDesc().getBlockingDesc().getStrides();
+            dstStrides = layer->outData[0]->getTensorDesc().getBlockingDesc().getStrides();
+            if (layer->insData.size() == 1) {
+                addConfig(layer, { DataConfigurator(ConfLayout::PLN) }, { DataConfigurator(ConfLayout::PLN) });
+            } else if (layer->insData.size() == 2) {
+                addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, { DataConfigurator(ConfLayout::PLN) });
+            } else if (layer->insData.size() == 3) {
+                addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) },
+                          {
DataConfigurator(ConfLayout::PLN) }); + } else { + addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), + DataConfigurator(ConfLayout::PLN) }, { DataConfigurator(ConfLayout::PLN) }); + } + } catch (InferenceEngine::details::InferenceEngineException &ex) { + errorMsg = ex.what(); + } + } + + StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { + const float *src_data = inputs[STRIDEDSLICE_DATA]->cbuffer().as() + + inputs[STRIDEDSLICE_DATA]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + int *begin = nullptr, *end = nullptr, *stride = nullptr; + if (begin_dims.size()) + begin = inputs[STRIDEDSLICE_BEGIN]->cbuffer().as() + inputs[STRIDEDSLICE_BEGIN]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + if (end_dims.size()) + end = inputs[STRIDEDSLICE_END]->cbuffer().as() + inputs[STRIDEDSLICE_END]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + if (stride_dims.size()) + stride = inputs[STRIDEDSLICE_STRIDE]->cbuffer().as() + inputs[STRIDEDSLICE_STRIDE]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + float* dst_data = outputs[0]->cbuffer().as() + + outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + + InferenceEngine::SizeVector src_dims = inputs[STRIDEDSLICE_DATA]->getTensorDesc().getDims(); + InferenceEngine::SizeVector srcStrides = inputs[STRIDEDSLICE_DATA]->getTensorDesc().getBlockingDesc().getStrides(); + InferenceEngine::SizeVector dst_dims = outputs[0]->getTensorDesc().getDims(); + InferenceEngine::SizeVector dstStrides = outputs[0]->getTensorDesc().getBlockingDesc().getStrides(); + + size_t i, j, k, bj, ej, sj; + InferenceEngine::SizeVector our_dims; + InferenceEngine::SizeVector out_dims; + for (i = 0, j = 0, k = 0, bj = 0, ej = 0, sj = 0; static_cast(i) < max_dims; i++) { + if (static_cast(i) >= ellipsis_pos1 && + static_cast(i) < ellipsis_pos2) { + if (new_axis_mask.size() > i && new_axis_mask[i] == 1) + end_dms[i] = 0; + else + end_dms[i] = end_dms[i] >= 0 ? end_dms[i] : src_dims[j++] + end_dms[i]; + + out_dims.push_back(static_cast(ceil(static_cast(abs(end_dms[i] - begin_dms[i]) + 1) / static_cast(abs(stride_dms[i]))))); + our_dims.push_back(static_cast(ceil(static_cast(abs(end_dms[i] - begin_dms[i]) + 1) / static_cast(abs(stride_dms[i]))))); + k = ellipsis_pos1; + } else { + stride_dms[i] = (stride != nullptr && stride_dims[0] > sj && stride[sj] != 0) ? stride[sj++] : 1; + + if (begin_mask.size() > j && begin_mask[j] == 0) + begin_dms[i] = stride_dms[i] > 0 ? 0 : -1; + else + begin_dms[i] = (begin != nullptr && begin_dims[0] > bj) ? begin[bj] : (stride_dms[i] > 0 ? 0 : -1); + bj++; + begin_dms[i] = begin_dms[i] >= 0 ? begin_dms[i] : src_dims[j] + begin_dms[i]; + // Clipping 'begin' + clipping(&begin_dms[i], 0, src_dims[j]); + + if (end_mask.size() > j && end_mask[j] == 0) { + end_dms[i] = stride_dms[i] > 0 ? -1 : 0; + } else { + int end_dms_tmp = (end != nullptr && end_dims[0] > ej) ? (stride_dms[i] > 0 ? end[ej] - 1 : end[ej] + 1) + : end_dms[i]; + end_dms[i] = (end != nullptr && end_dims[0] > ej) ? end_dms_tmp : (stride_dms[i] > 0 ? -1 : 0); + } + ej++; + end_dms[i] = end_dms[i] >= 0 ? 
end_dms[i] : src_dims[j] + end_dms[i]; + // Clipping 'end' + clipping(&end_dms[i], 0, src_dims[j]); + + if (new_axis_mask.size() > i && new_axis_mask[i] == 1) + end_dms[i] = 0; + else + j++; + + if (shrink_axis_mask.size() > k && shrink_axis_mask[k] == 1) + end_dms[i] = begin_dms[i]; + else + out_dims.push_back(static_cast(ceil(static_cast(abs(end_dms[i] - begin_dms[i]) + 1) / + static_cast(abs(stride_dms[i]))))); + + our_dims.push_back(static_cast(ceil(static_cast(abs(end_dms[i] - begin_dms[i]) + 1) / + static_cast(abs(stride_dms[i]))))); + k++; + } + } + + for (i = 0; i < std::min(out_dims.size(), dst_dims.size()); i++) { + if (out_dims[i] != dst_dims[i]) + return PARAMETER_MISMATCH; + } + + if (static_cast(src_dims.size()) == max_dims && shrink_axis == 0 && + stride_dms[stride_dms.size()-1] == 1 && stride_dms.size() > 1) + strided_slice_vp(src_data, dst_data); + else if (static_cast(src_dims.size()) == max_dims && shrink_axis == 0) + strided_slice_p(src_data, dst_data); + else + strided_slice(src_data, dst_data, our_dims); + + return OK; + } + +private: + const size_t STRIDEDSLICE_DATA = 0; + const size_t STRIDEDSLICE_BEGIN = 1; + const size_t STRIDEDSLICE_END = 2; + const size_t STRIDEDSLICE_STRIDE = 3; + + void strided_slice(const float *src_data, float* dst_data, std::vector &dims); + void strided_slice_vp(const float *src_data, float* dst_data); + void strided_slice_p(const float *src_data, float* dst_data); + + SizeVector begin_dims; + SizeVector end_dims; + SizeVector stride_dims; + + SizeVector begin_mask; + SizeVector end_mask; + SizeVector ellipsis_mask; + SizeVector new_axis_mask; + SizeVector shrink_axis_mask; + int shrink_axis; + + SizeVector src_dims; + SizeVector dst_dims; + std::vector begin_dms; + std::vector end_dms; + std::vector stride_dms; + SizeVector srcStrides; + SizeVector dstStrides; + int bounds_size; + int max_dims; + int ellipsis_pos1, ellipsis_pos2; +}; + +void StridedSliceImpl::strided_slice(const float *src_data, float* dst_data, std::vector &dims) { + size_t work_amount_dst = dstStrides[0] * dst_dims[0]; + parallel_nt(0, [&](const int ithr, const int nthr) { + int j; + size_t i, start = 0, end = 0; + SizeVector counters(max_dims, 0); + splitter(work_amount_dst, nthr, ithr, start, end); + for (j = max_dims - 1, i = start; j >= 0; j--) { + counters[j] = i % dims[j]; + i /= dims[j]; + } + for (size_t iwork = start; iwork < end; ++iwork) { + int src_idx = 0; + for (i = 0, j = 0; static_cast(i) < max_dims; ++i) { + if (!(new_axis_mask.size() > i && new_axis_mask[i] == 1)) + src_idx += (begin_dms[i] + counters[i] * stride_dms[i]) * srcStrides[j++]; + } + + dst_data[iwork] = src_data[src_idx]; + + for (j = max_dims - 1; j >= 0; j--) { + counters[j]++; + if (counters[j] < dims[j]) + break; + else + counters[j] = 0; + } + } + }); +} + +void StridedSliceImpl::strided_slice_vp(const float *src_data, float* dst_data) { + // Vectorized copy + size_t dims_size_1 = dst_dims.size() - 1; + size_t dataLength = dst_dims[dims_size_1]; + size_t work_amount_dst = dstStrides[0] * dst_dims[0] / dst_dims[dims_size_1]; + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0; + SizeVector counters(dims_size_1, 0); + splitter(work_amount_dst, nthr, ithr, start, end); + size_t src_idx = begin_dms[dims_size_1]; + for (int j = dims_size_1 - 1, i = start; j >= 0; j--) { + counters[j] = i % dst_dims[j]; + src_idx += (begin_dms[j] + counters[j] * stride_dms[j]) * srcStrides[j]; + i /= dst_dims[j]; + } + + for (size_t iwork = start, dst_idx = start * 
dataLength, i = 1; iwork < end; ++iwork, dst_idx += dataLength) { + memcpy(&dst_data[dst_idx], &src_data[src_idx], sizeof(float) * dataLength); + for (int j = dims_size_1 - 1; j >= 0; j--) { + counters[j]++; + if (counters[j] < dst_dims[j]) { + src_idx += stride_dms[j] * srcStrides[j]; + break; + } else { + counters[j] = i = 0; + } + } + if (!i) { + for (src_idx = begin_dms[dims_size_1]; i < dims_size_1; ++i) + src_idx += (begin_dms[i] + counters[i] * stride_dms[i]) * srcStrides[i]; + } + } + }); +} + +void StridedSliceImpl::strided_slice_p(const float *src_data, float* dst_data) { + size_t dims_size = dst_dims.size(); + size_t work_amount_dst = dstStrides[0] * dst_dims[0]; + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0; + SizeVector counters(dims_size, 0); + splitter(work_amount_dst, nthr, ithr, start, end); + int src_idx = 0; + for (int j = dims_size - 1, i = start; j >= 0; j--) { + counters[j] = i % dst_dims[j]; + src_idx += (begin_dms[j] + counters[j] * stride_dms[j]) * srcStrides[j]; + i /= dst_dims[j]; + } + + for (size_t iwork = start, dst_idx = start, i = 1; iwork < end; ++iwork, dst_idx++) { + dst_data[dst_idx] = src_data[src_idx]; + for (int j = dims_size - 1; j >= 0; j--) { + counters[j]++; + if (counters[j] < dst_dims[j]) { + src_idx += stride_dms[j] * srcStrides[j]; + break; + } else { + counters[j] = i = 0; + } + } + if (!i) { + for (src_idx = 0; i < dims_size; ++i) + src_idx += (begin_dms[i] + counters[i] * stride_dms[i]) * srcStrides[i]; + } + } + }); +} + +REG_FACTORY_FOR(ImplFactory, StridedSlice); + +} // namespace Cpu +} // namespace Extensions +} // namespace InferenceEngine diff --git a/inference-engine/src/extension/ext_topkrois_onnx.cpp b/inference-engine/src/extension/ext_topkrois_onnx.cpp new file mode 100644 index 0000000..0584bd5 --- /dev/null +++ b/inference-engine/src/extension/ext_topkrois_onnx.cpp @@ -0,0 +1,78 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ext_list.hpp" +#include "ext_base.hpp" +#include +#include +#include + + +namespace InferenceEngine { +namespace Extensions { +namespace Cpu { + +class ExperimentalDetectronTopKROIsImpl: public ExtLayerBase { +private: + // Inputs: + // rois, shape [n, 4] + // rois_probs, shape [n] + // Outputs: + // top_rois, shape [max_rois, 4] + + const int INPUT_ROIS {0}; + const int INPUT_PROBS {1}; + + const int OUTPUT_ROIS {0}; + +public: + explicit ExperimentalDetectronTopKROIsImpl(const CNNLayer* layer) { + try { + if (layer->insData.size() != 2 || layer->outData.empty()) + THROW_IE_EXCEPTION << "Incorrect number of input/output edges!"; + + if (layer->insData[INPUT_ROIS].lock()->dims.size() != 2 || + layer->insData[INPUT_PROBS].lock()->dims.size() != 1) + THROW_IE_EXCEPTION << "Unsupported shape of input blobs!"; + + max_rois_num_ = layer->GetParamAsInt("max_rois", 0); + + addConfig(layer, + {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)}, + {DataConfigurator(ConfLayout::PLN)}); + } catch (InferenceEngine::details::InferenceEngineException &ex) { + errorMsg = ex.what(); + } + } + + StatusCode execute(std::vector& inputs, std::vector& outputs, + ResponseDesc *resp) noexcept override { + const int input_rois_num = inputs[INPUT_ROIS]->getTensorDesc().getDims()[0]; + const int top_rois_num = std::min(max_rois_num_, input_rois_num); + + auto *input_rois = inputs[INPUT_ROIS]->buffer().as(); + auto *input_probs = inputs[INPUT_PROBS]->buffer().as(); + auto *output_rois = outputs[OUTPUT_ROIS]->buffer().as(); 
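+
+        // Rank all ROI indices by probability, highest first; the first
+        // top_rois_num entries of idx then select the boxes copied to the output.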
+        std::vector<size_t> idx(input_rois_num);
+        std::iota(idx.begin(), idx.end(), 0);
+        // FIXME. partial_sort is enough here.
+        std::sort(idx.begin(), idx.end(), [&input_probs](size_t i1, size_t i2) {return input_probs[i1] > input_probs[i2];});
+
+        for (int i = 0; i < top_rois_num; ++i) {
+            std::memcpy(output_rois + 4 * i, input_rois + 4 * idx[i], 4 * sizeof(float));
+        }
+
+        return OK;
+    }
+
+private:
+    int max_rois_num_;
+};
+
+REG_FACTORY_FOR(ImplFactory, ExperimentalDetectronTopKROIs);
+
+} // namespace Cpu
+} // namespace Extensions
+} // namespace InferenceEngine
diff --git a/inference-engine/src/extension/ext_unsqueeze.cpp b/inference-engine/src/extension/ext_unsqueeze.cpp
new file mode 100644
index 0000000..0fda31c
--- /dev/null
+++ b/inference-engine/src/extension/ext_unsqueeze.cpp
@@ -0,0 +1,110 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ext_list.hpp"
+#include "ext_base.hpp"
+
+#include
+#include
+#include
+#include
+#include "ie_parallel.hpp"
+
+namespace InferenceEngine {
+namespace Extensions {
+namespace Cpu {
+
+class UnsqueezeImpl: public ExtLayerBase {
+public:
+    explicit UnsqueezeImpl(const CNNLayer* layer) {
+        try {
+            if (layer->insData.empty() || layer->outData.empty())
+                THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!";
+
+            if (layer->insData.size() != 2)
+                THROW_IE_EXCEPTION << layer->name << " Incorrect number of input edges!";
+
+            idx_dims = layer->insData[UNSQUEEZE_INDEXES].lock()->getTensorDesc().getDims();
+            data_dims = layer->insData[UNSQUEEZE_DATA].lock()->getTensorDesc().getDims();
+            if (idx_dims.size() > 1)
+                THROW_IE_EXCEPTION << layer->name << " Index vector should be 1-dimensional";
+
+            if (layer->insData[UNSQUEEZE_INDEXES].lock()->getTensorDesc().getPrecision() != Precision::I32 &&
+                layer->insData[UNSQUEEZE_INDEXES].lock()->getTensorDesc().getPrecision() != Precision::FP32)
+                THROW_IE_EXCEPTION << layer->name << " Incorrect 'indices_to_set' input precision.
Only FP32 and I32 are supported!"; + + addConfig(layer, { { ConfLayout::PLN, false, 0 }, { ConfLayout::ANY, true } }, { { ConfLayout::PLN, false, 0 } }); + } catch (InferenceEngine::details::InferenceEngineException &ex) { + errorMsg = ex.what(); + } + } + + StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { + InferenceEngine::SizeVector data_dims = inputs[UNSQUEEZE_DATA]->getTensorDesc().getDims(); + InferenceEngine::SizeVector idx_dims = inputs[UNSQUEEZE_INDEXES]->getTensorDesc().getDims(); + + switch (inputs[UNSQUEEZE_INDEXES]->precision()) { + case Precision::FP32: { + float *idx_data = inputs[UNSQUEEZE_INDEXES]->cbuffer().as() + + inputs[UNSQUEEZE_INDEXES]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + + size_t max = data_dims.size(); + for (size_t i = 0; i < idx_dims[0]; i++) { + size_t axis = static_cast(idx_data[i]); + if (axis > max) max = axis; + } + max++; + + if ((idx_dims[0] + data_dims.size()) < max) { + if (resp) { + std::string errorMsg = "Indices_to_set for unsqueeze layer is out of tensor dimension"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return PARAMETER_MISMATCH; + } + } + break; + case Precision::I32: { + int32_t *idx_data = inputs[UNSQUEEZE_INDEXES]->cbuffer().as() + + inputs[UNSQUEEZE_INDEXES]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + size_t max = data_dims.size(); + for (size_t i = 0; i < idx_dims[0]; i++) { + size_t axis = static_cast(idx_data[i]); + if (axis > max) max = axis; + } + max++; + + if ((idx_dims[0] + data_dims.size()) < max) { + if (resp) { + std::string errorMsg = "Indices_to_set for unsqueeze layer is out of tensor dimension"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return PARAMETER_MISMATCH; + } + } + break; + default: + if (resp) { + std::string errorMsg = "Incorrect 'indices_to_set' input precision. 
Only FP32 and I32 are supported!"; + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return GENERAL_ERROR; + } + + return OK; + } + +private: + const size_t UNSQUEEZE_DATA = 0; + const size_t UNSQUEEZE_INDEXES = 1; + + SizeVector data_dims; + SizeVector idx_dims; +}; + +REG_FACTORY_FOR(ImplFactory, Unsqueeze); + +} // namespace Cpu +} // namespace Extensions +} // namespace InferenceEngine diff --git a/inference-engine/src/extension/simple_copy.cpp b/inference-engine/src/extension/simple_copy.cpp index 22d6be0..d427328 100644 --- a/inference-engine/src/extension/simple_copy.cpp +++ b/inference-engine/src/extension/simple_copy.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/extension/simple_copy.h b/inference-engine/src/extension/simple_copy.h index aaf7521..42ea6c9 100644 --- a/inference-engine/src/extension/simple_copy.h +++ b/inference-engine/src/extension/simple_copy.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/CMakeLists.txt b/inference-engine/src/gna_plugin/CMakeLists.txt index f6a25b6..4c6b3d6 100644 --- a/inference-engine/src/gna_plugin/CMakeLists.txt +++ b/inference-engine/src/gna_plugin/CMakeLists.txt @@ -1,6 +1,5 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -# set(TARGET_NAME "GNAPlugin") diff --git a/inference-engine/src/gna_plugin/dnn.cpp b/inference-engine/src/gna_plugin/dnn.cpp index 8c94f72..76f94cb 100644 --- a/inference-engine/src/gna_plugin/dnn.cpp +++ b/inference-engine/src/gna_plugin/dnn.cpp @@ -1,7 +1,8 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // - +// dnn.cpp : component based neural network class for ease of use +// extern bool global_debug; #include @@ -1932,6 +1933,8 @@ void AmIntelDnn::InitGNAStruct(intel_nnet_type_t *ptr_nnet) { if (ptr_nnet == nullptr) THROW_GNA_EXCEPTION << "Invalid input parameter"; + if (ptr_nnet->pLayers != nullptr) + THROW_GNA_EXCEPTION << "InitGNAStruct can't work on preallocated layers array"; if (component.empty()) THROW_GNA_EXCEPTION << "empty model in AmIntelDnn::FillGNAStruct()"; @@ -2180,10 +2183,10 @@ void AmIntelDnn::InitGNAStruct(intel_nnet_type_t *ptr_nnet) { pLayer++; break; case kDnnCopyOp: - pLayer->nInputRows = component[i].num_rows_in; - pLayer->nInputColumns = component[i].num_columns_in; - pLayer->nOutputRows = component[i].num_rows_out; - pLayer->nOutputColumns = component[i].num_columns_out; + pLayer->nInputRows = component[i].num_columns_in; + pLayer->nInputColumns = component[i].num_rows_in; + pLayer->nOutputRows = component[i].num_columns_out; + pLayer->nOutputColumns = component[i].num_rows_out; pLayer->nBytesPerInput = component[i].num_bytes_per_input; pLayer->nBytesPerOutput = component[i].num_bytes_per_output; pLayer->nBytesPerIntermediateOutput = sizeof(int32_t); @@ -2198,8 +2201,8 @@ void AmIntelDnn::InitGNAStruct(intel_nnet_type_t *ptr_nnet) { THROW_GNA_EXCEPTION << pLayer->nLayerKind << " could not allocate memory for INTEL_COPY layer structure."; } auto *pCopyLayer = reinterpret_cast(pLayer->pLayerStruct); - pCopyLayer->nCopyRows = component[i].op.copy.num_copy_rows; - pCopyLayer->nCopyCols = component[i].op.copy.num_copy_columns; + pCopyLayer->nCopyRows =
component[i].op.copy.num_copy_columns; + pCopyLayer->nCopyCols = component[i].op.copy.num_copy_rows; } pLayer++; break; @@ -2398,20 +2401,18 @@ void AmIntelDnn::WriteInputAndOutputText() { float floatValue = 0.f; if (component[i].num_bytes_per_output == 4) { if (number_type_ == kDnnInt) { - auto value = (reinterpret_cast(component[i].ptr_outputs)[k * component[i].num_columns_out+ j]); - // out_file << std::setw(8) << value << "\n"; - floatValue = (static_cast(value) / component[i].output_scale_factor); + auto value = reinterpret_cast(component[i].ptr_outputs)[k * component[i].num_columns_out+ j]; + floatValue = static_cast(value); } else { - floatValue = (reinterpret_cast(component[i].ptr_outputs)[ - k * component[i].num_columns_out+ j]) / component[i].output_scale_factor; + floatValue = reinterpret_cast(component[i].ptr_outputs)[k * component[i].num_columns_out+ j]; } } else { auto value = reinterpret_cast(component[i].ptr_outputs)[k * component[i].num_columns_out+ j]; - // out_file << std::setw(8) << value << "\n"; - floatValue = (static_cast(value) / component[i].output_scale_factor); + floatValue = static_cast(value); } - out_file << std::setw(8) << floatValue << "\n"; + out_file << std::setw(8) << floatValue / component[i].output_scale_factor << "\n"; + if (ref_out_file) { float ref_value = 0.f; ref_out_file >> ref_value; @@ -2433,25 +2434,31 @@ void AmIntelDnn::WriteInputAndOutputText() { << " maxD="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << maxD << std::endl; } + float input_scale_factor = component[i].output_scale_factor; + if (component[i].operation == kDnnAffineOp || + component[i].operation == kDnnDiagonalOp) { + input_scale_factor /= component[i].op.affine.weight_scale_factor; + } else if (component[i].operation == kDnnConvolutional1dOp) { + input_scale_factor /= component[i].op.conv1D.weight_scale_factor; + } else if (component[i].operation == kDnnPiecewiselinearOp) { + input_scale_factor = 1.f; + } for (int k = 0; k < component[i].num_rows_in; k++) { for (int j = 0; j < component[i].num_columns_in; j++) { + float floatValue = 0.f; if (component[i].num_bytes_per_input == 4) { if (number_type_ == kDnnInt) { - in_file << std::setw(8) - << (reinterpret_cast(component[i].ptr_inputs)[k * component[i].num_columns_in - + j]); + auto value = reinterpret_cast(component[i].ptr_inputs)[k * component[i].num_columns_in + j]; + floatValue = static_cast(value); } else { - in_file << std::setw(8) - << (reinterpret_cast(component[i].ptr_inputs)[k * component[i].num_columns_in - + j]); + floatValue = reinterpret_cast(component[i].ptr_inputs)[k * component[i].num_columns_in + j]; } } else { - in_file << std::setw(8) - << (reinterpret_cast(component[i].ptr_inputs)[k * component[i].num_columns_in - + j]); + auto value = reinterpret_cast(component[i].ptr_inputs)[k * component[i].num_columns_in+ j]; + floatValue = static_cast(value); } - in_file << "\n"; + in_file << std::setw(8) << floatValue / input_scale_factor << "\n"; } } #endif diff --git a/inference-engine/src/gna_plugin/dnn.h b/inference-engine/src/gna_plugin/dnn.h index 8a1506d..0d89a2d 100644 --- a/inference-engine/src/gna_plugin/dnn.h +++ b/inference-engine/src/gna_plugin/dnn.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -251,7 +251,8 @@ class AmIntelDnn { softmax_type(kSoftmaxNone), ptr_sumgroup_sizes(NULL), num_sumgroup_sizes(0), - ptr_priors(NULL) { + ptr_priors(NULL), + ptr_dnn_memory_(NULL) { } 
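// [editor's note] A minimal sketch (illustrative names, not the plugin's real types) of
// why the ptr_dnn_memory_(NULL) initializer added above matters: a raw pointer member
// omitted from the constructor initializer list holds an indeterminate value, so guards
// like the new pLayers null check in InitGNAStruct would be unreliable without it.
#if 0
#include <cassert>
struct Holder {
    int *ptr_dnn_memory_;
    Holder() : ptr_dnn_memory_(nullptr) {}  // deterministic null: the guard below can trust it
};
void InitOnce(Holder &h) {
    assert(h.ptr_dnn_memory_ == nullptr && "refusing to reinitialize a preallocated buffer");
    h.ptr_dnn_memory_ = new int[8]();  // zero-initialized scratch, released elsewhere
}
#endif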
~AmIntelDnn() { diff --git a/inference-engine/src/gna_plugin/dnn_memory.cpp b/inference-engine/src/gna_plugin/dnn_memory.cpp index 16496b5..dec7907 100644 --- a/inference-engine/src/gna_plugin/dnn_memory.cpp +++ b/inference-engine/src/gna_plugin/dnn_memory.cpp @@ -1,6 +1,8 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +// dnn_memory.cpp : memory manipulation routines +// #include #include diff --git a/inference-engine/src/gna_plugin/dnn_memory.hpp b/inference-engine/src/gna_plugin/dnn_memory.hpp index 5ab2c96..43720f7 100644 --- a/inference-engine/src/gna_plugin/dnn_memory.hpp +++ b/inference-engine/src/gna_plugin/dnn_memory.hpp @@ -1,6 +1,7 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +// dnn_memory.hpp : memory manipulation routines #pragma once diff --git a/inference-engine/src/gna_plugin/dnn_traits.hpp b/inference-engine/src/gna_plugin/dnn_traits.hpp index 0a92bb3..98238df 100644 --- a/inference-engine/src/gna_plugin/dnn_traits.hpp +++ b/inference-engine/src/gna_plugin/dnn_traits.hpp @@ -1,6 +1,8 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +// dnn_traits.hpp : c++ trait approach to define dnn objects +// #pragma once diff --git a/inference-engine/src/gna_plugin/floatmath.cpp b/inference-engine/src/gna_plugin/floatmath.cpp index 3ea4112..72f3b3e 100644 --- a/inference-engine/src/gna_plugin/floatmath.cpp +++ b/inference-engine/src/gna_plugin/floatmath.cpp @@ -1,6 +1,8 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +// floatmath.cpp : unoptimized floating point math routines (for reference) +// #include "floatmath.h" #include "pwl.h" diff --git a/inference-engine/src/gna_plugin/floatmath.h b/inference-engine/src/gna_plugin/floatmath.h index ff9bf99..5ce0db9 100644 --- a/inference-engine/src/gna_plugin/floatmath.h +++ b/inference-engine/src/gna_plugin/floatmath.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/gna_allocator.hpp b/inference-engine/src/gna_plugin/gna_allocator.hpp index ae62b1f..e862efc 100644 --- a/inference-engine/src/gna_plugin/gna_allocator.hpp +++ b/inference-engine/src/gna_plugin/gna_allocator.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/gna_api_wrapper.hpp b/inference-engine/src/gna_plugin/gna_api_wrapper.hpp index fb9d2cc..1328ef5 100644 --- a/inference-engine/src/gna_plugin/gna_api_wrapper.hpp +++ b/inference-engine/src/gna_plugin/gna_api_wrapper.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -32,6 +32,9 @@ class CPPWrapper { * @param n - number of layers */ explicit CPPWrapper(size_t n) { + if (n == 0) { + THROW_GNA_EXCEPTION << "Can't allocate array of intel_nnet_layer_t objects of zero length"; + } obj.pLayers = reinterpret_cast(_mm_malloc(n * sizeof(intel_nnet_layer_t), 64)); if (obj.pLayers == nullptr) { THROW_GNA_EXCEPTION << "out of memory in while allocating "<< n << " GNA layers"; diff --git 
a/inference-engine/src/gna_plugin/gna_device.cpp b/inference-engine/src/gna_plugin/gna_device.cpp index 3936bc8..344d44e 100644 --- a/inference-engine/src/gna_plugin/gna_device.cpp +++ b/inference-engine/src/gna_plugin/gna_device.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/gna_device.hpp b/inference-engine/src/gna_plugin/gna_device.hpp index 7828211..563f3a5 100644 --- a/inference-engine/src/gna_plugin/gna_device.hpp +++ b/inference-engine/src/gna_plugin/gna_device.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/gna_executable_network.hpp b/inference-engine/src/gna_plugin/gna_executable_network.hpp index 1230624..88960ce 100644 --- a/inference-engine/src/gna_plugin/gna_executable_network.hpp +++ b/inference-engine/src/gna_plugin/gna_executable_network.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/gna_helper.cpp b/inference-engine/src/gna_plugin/gna_helper.cpp index 604828c..7d26aaf 100644 --- a/inference-engine/src/gna_plugin/gna_helper.cpp +++ b/inference-engine/src/gna_plugin/gna_helper.cpp @@ -1,6 +1,8 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +// gna_helper.cpp : various GNA-related utility functions +// #include "lstm.hpp" diff --git a/inference-engine/src/gna_plugin/gna_infer_request.hpp b/inference-engine/src/gna_plugin/gna_infer_request.hpp index ba8e99f..00a03a8 100644 --- a/inference-engine/src/gna_plugin/gna_infer_request.hpp +++ b/inference-engine/src/gna_plugin/gna_infer_request.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -32,7 +32,10 @@ class GNAInferRequest : public InferenceEngine::AsyncInferRequestInternal { // copy inputs blobs since we need to have them in separate address space to allow simultaneous infer requests _outputs[_networkOutputs.begin()->first] = plg->GetOutputBlob(networkOutputs.begin()->second->getPrecision()); - _inputs[_networkInputs.begin()->first] = plg->GetInputBlob(networkInputs.begin()->second->getInputPrecision()); + for (auto input : _networkInputs) { + _inputs[input.first] = + plg->GetInputBlob(input.first, networkInputs.begin()->second->getInputPrecision()); + } } /** * @brief Infers specified input(s) in synchronous mode diff --git a/inference-engine/src/gna_plugin/gna_layer_info.hpp b/inference-engine/src/gna_plugin/gna_layer_info.hpp index 7e6da43..5851a86 100644 --- a/inference-engine/src/gna_plugin/gna_layer_info.hpp +++ b/inference-engine/src/gna_plugin/gna_layer_info.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -55,7 +55,7 @@ class LayerInfo { bool has32BOutput() const noexcept { IS_VALID(); static InferenceEngine::details::caseless_set layersWith32BOutputs = - {"FullyConnected", "InnerProduct", "Eltwise", "ScaleShift", "Convolution", "Pooling"}; + {"FullyConnected", "InnerProduct", "AffineFilter", "Eltwise", "ScaleShift", "Convolution", "Pooling"}; return (layersWith32BOutputs.find(layer->type) != 
layersWith32BOutputs.end()) || (isCrop() && isCropAffined()); } @@ -88,6 +88,11 @@ class LayerInfo { IS_VALID(); return InferenceEngine::details::CaselessEq()(layer->type, "input"); } + bool isScaleShift() const noexcept { + IS_VALID(); + return nullptr != as(); + } + bool isEltwise() const noexcept { IS_VALID(); return nullptr != as(); @@ -112,9 +117,6 @@ class LayerInfo { return InferenceEngine::details::CaselessEq()(layer->type, "FullyConnected") || InferenceEngine::details::CaselessEq()(layer->type, "InnerProduct"); } - bool isConvolutional() const noexcept { - return InferenceEngine::details::CaselessEq()(layer->type, "Convolution"); - } bool isSplit() const noexcept { IS_VALID(); return InferenceEngine::details::CaselessEq()(layer->type, "split"); @@ -155,7 +157,7 @@ class LayerInfo { bool isCropAffined() const noexcept { auto cropLayer = dynamic_cast (layer); size_t cropOffset = cropLayer->offset.back() * cropLayer->precision.size(); - return (ALIGN(cropOffset, 8) != cropOffset); + return (ALIGN64(cropOffset) != cropOffset); } bool isCopy() const noexcept { IS_VALID(); diff --git a/inference-engine/src/gna_plugin/gna_mem_requests.hpp b/inference-engine/src/gna_plugin/gna_mem_requests.hpp index 24163dc..99d0731 100644 --- a/inference-engine/src/gna_plugin/gna_mem_requests.hpp +++ b/inference-engine/src/gna_plugin/gna_mem_requests.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/gna_memory.hpp b/inference-engine/src/gna_plugin/gna_memory.hpp index d1c9650..30da318 100644 --- a/inference-engine/src/gna_plugin/gna_memory.hpp +++ b/inference-engine/src/gna_plugin/gna_memory.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/gna_memory_state.hpp b/inference-engine/src/gna_plugin/gna_memory_state.hpp index 7edcb02..90e1f43 100644 --- a/inference-engine/src/gna_plugin/gna_memory_state.hpp +++ b/inference-engine/src/gna_plugin/gna_memory_state.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/gna_model_serial.cpp b/inference-engine/src/gna_plugin/gna_model_serial.cpp index 3b14b8c..84c7d3c 100644 --- a/inference-engine/src/gna_plugin/gna_model_serial.cpp +++ b/inference-engine/src/gna_plugin/gna_model_serial.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/gna_model_serial.hpp b/inference-engine/src/gna_plugin/gna_model_serial.hpp index 0ba5be5..30be460 100644 --- a/inference-engine/src/gna_plugin/gna_model_serial.hpp +++ b/inference-engine/src/gna_plugin/gna_model_serial.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/gna_plugin.cpp b/inference-engine/src/gna_plugin/gna_plugin.cpp index 620aa48..fc57d52 100644 --- a/inference-engine/src/gna_plugin/gna_plugin.cpp +++ b/inference-engine/src/gna_plugin/gna_plugin.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -131,7 +131,7 @@ void 
GNAPlugin::copyInputData(T *dst, for (uint32_t i = 0; i < num_frames; i++) { for (uint32_t j = 0; j < num_vector_elements; j++) { if (!std::is_same::value) { - dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt16(src[i * num_vector_elements + j] * input_scale_factor); + dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt16(src[i * num_vector_elements + j] * get_input_scale_factor()); } else { dst[j * num_group + i] = src[i * num_vector_elements + j]; } @@ -154,7 +154,7 @@ void GNAPlugin::copyInputData(T *dst, U *ptr_src_vec = const_cast(reinterpret_cast(src) + i * num_vector_elements); std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T)); for (int j=0; j < num_vector_elements; j++) { - ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt16(ptr_src_vec[j] * input_scale_factor); + ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt16(ptr_src_vec[j] * get_input_scale_factor()); } } @@ -189,9 +189,13 @@ void GNAPlugin::copyInputDataWithSplit(T *const dst, for (auto&& outputLayer : splitInfo.splitOutputLayers) { uint32_t begin = outputLayer.offset/precision_size; uint32_t end = (outputLayer.offset + outputLayer.pure_size)/precision_size; + if (dst_ptr - dst >= end) { + // output layer with bind pointer as previous one. Skip + continue; + } for (uint32_t i = begin; i < end; ++i) { if (!std::is_same::value) { - *(dst_ptr++) = GNAPluginNS::ConvertFloatToInt16(*(src_ptr++) * input_scale_factor); + *(dst_ptr++) = GNAPluginNS::ConvertFloatToInt16(*(src_ptr++) * get_input_scale_factor()); } else { *(dst_ptr++) = *(src_ptr++); } @@ -285,46 +289,39 @@ void GNAPlugin::ImportFrames( uint32_t num_group, uint32_t num_vector_elements, uint32_t num_vector_stride) { - // special case if split/slice layers connected - // with Input detected - auto it = split_connection.end(); - if (split_connection.size() != 0) { - it = std::find_if(split_connection.begin(), split_connection.end(), [] - (const std::pair &item) -> bool { - return CaselessEq()(item.second.splitInputLayer.name, "Input"); - }); - } if (orientation == kDnnInterleavedOrientation) { // TODO : fix that as well - if (input_precision.size() == 2) { + if (input_precision == Precision::U8) { + int16_t *dst = const_cast(reinterpret_cast(ptr_dst)); + uint8_t *src = const_cast(reinterpret_cast(ptr_src)); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); + } else if (input_precision.size() == 2) { int16_t *dst = const_cast(reinterpret_cast(ptr_dst)); int16_t *src = const_cast(reinterpret_cast(ptr_src)); - if (it != split_connection.end()) { - copyInputDataWithSplit(dst, src, it->second, input_precision.size()); - } else { - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); - } + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); } else if (input_precision.size() == 4) { if (!gnadevice) { float *dst = const_cast(reinterpret_cast(ptr_dst)); float *src = const_cast(reinterpret_cast(ptr_src)); - if (it != split_connection.end()) { - copyInputDataWithSplit(dst, src, it->second, input_precision.size()); - } else { - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); - } + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); } else { int16_t *dst = reinterpret_cast(ptr_dst); const float *src = reinterpret_cast(ptr_src); - if (it != split_connection.end()) { - copyInputDataWithSplit(dst, src, it->second, 
input_precision.size()); - } else { - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); - } + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); } } } else { - if (input_precision.size()== 2) { + if (input_precision == Precision::U8) { + uint8_t *src = const_cast(reinterpret_cast(ptr_src)); + if (!gnadevice) { + float *dst = const_cast(reinterpret_cast(ptr_dst)); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); + } else { + int16_t *dst = const_cast(reinterpret_cast(ptr_dst)); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); + } + + } else if (input_precision.size()== 2) { int16_t *dst = const_cast(reinterpret_cast(ptr_dst)); int16_t *src = const_cast(reinterpret_cast(ptr_src)); copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); @@ -342,9 +339,8 @@ void GNAPlugin::ImportFrames( } } -void GNAPlugin::fillMemoryConnections(std::map>& - memoryPairs) { +void GNAPlugin::fillMemoryConnections(std::unordered_map>& memoryPairs) { for (auto &memory : memoryPairs) { auto inputLayer = memory.second[1]; auto outputLayer = memory.second[0]; @@ -401,7 +397,7 @@ void GNAPlugin::fillSplitConnections(InferenceEngine::CNNLayerPtr layer) { LayerInfo ptrSplitLayerInputLayerInfo(ptrSplitLayerInput); for (size_t i = 0; i < layer->outData.size(); ++i) { size_t padding = 0; - size_t layer_size = 0; + size_t output_layer_size = 0; auto& dataOutput = layer->outData[i]; if (!dataOutput || !dataInput) { @@ -416,16 +412,19 @@ void GNAPlugin::fillSplitConnections(InferenceEngine::CNNLayerPtr layer) { padding = std::max(padding, LayerInfo(ptrSplitLayerOutput).paddingSize()) * dataOutput->precision.size(); - layer_size = + output_layer_size = InferenceEngine::details::product(begin(dataOutput->dims), end(dataOutput->dims)) * dataOutput->precision.size(); - layerInfoItem.splitOutputLayers.emplace_back(ptrSplitLayerOutput->name, split_size, layer_size); + if (ptrSplitLayerOutput->type == "AffineFilter") { + size_t aligned64_offset = ptrSplitLayerOutput->GetParamAsInt("offset"); + layerInfoItem.splitOutputLayers.emplace_back(ptrSplitLayerOutput->name, aligned64_offset, output_layer_size); + } else { + layerInfoItem.splitOutputLayers.emplace_back(ptrSplitLayerOutput->name, split_size, output_layer_size); + } } - split_size += ptrSplitLayerInputLayerInfo.isInput() ? 
- ALIGN64(padding + layer_size): - padding + layer_size; + split_size += padding + output_layer_size; } layerInfoItem.reserved_size = split_size; layerInfoItem.splitInputLayer = @@ -717,9 +716,9 @@ void GNAPlugin::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) { auto ¤tComponent = dnnComponentsForLayer.back().second; dnn.InitCopyComponent(currentComponent, orientation, - num_rows_in + num_padding_in, + ALIGN(num_rows_in, 8), num_columns_in, - num_rows_out + num_padding_out, + ALIGN(num_rows_out, 8), num_columns_out, inputs->precision.size(), outputs->precision.size(), @@ -732,7 +731,7 @@ void GNAPlugin::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) { size_t num_data_bytes_out = ALIGN(InferenceEngine::details::product( begin(outputs->dims), end(outputs->dims)), 8) * outputs->precision.size(); - size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding_in) * inputs->precision.size(); + size_t num_data_bytes_in = num_columns_in * ALIGN(num_rows_in, 8) * inputs->precision.size(); connectInput(layer, ptr_inputs, num_data_bytes_in); connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); @@ -757,13 +756,23 @@ void GNAPlugin::ConcatPrimitive(InferenceEngine::CNNLayerPtr layer) { THROW_GNA_EXCEPTION << "Different precision for Concat input layers are not supported"; } + auto& concatLayerInfo = concat_connection.find(concatLayer->name)->second; for (auto &&outLayer : concatLayer->outData.front()->getInputTo()) { if ( LayerInfo(outLayer.second).isConcat() ) { - auto& concatLayerInfo = concat_connection.find(concatLayer->name)->second; connectOutput(layer, &concatLayerInfo.gna_ptr, &concatLayerInfo.gna_ptr, concatLayerInfo.reserved_size); } } + + size_t idx = 0; + for (auto && inputLayer : concatLayerInfo.concatInputLayers) { + if ( InferenceEngine::details::CaselessEq() + (inputLayer.name, "input") ) { + connectInput(layer, &concatLayerInfo.gna_ptr, + concatLayerInfo.reserved_size-inputLayer.offset, static_cast(-inputLayer.offset), idx); + } + ++idx; + } } void GNAPlugin::CropPrimitive(InferenceEngine::CNNLayerPtr layer) { @@ -780,9 +789,9 @@ void GNAPlugin::CropPrimitive(InferenceEngine::CNNLayerPtr layer) { auto quantized = InferenceEngine::getInjectedData(layer); size_t cropOffset = cropLayer->offset.back() * cropLayer->precision.size(); - size_t cropSize = cropLayer->dim.back() * cropLayer->precision.size(); + size_t cropOutputSize = cropLayer->dim.back() * cropLayer->precision.size(); - if (ALIGN(cropOffset, 8) == cropOffset) { + if (ALIGN64(cropOffset) == cropOffset) { // leave crop as it is GNAPlugin::GNACropLayer cropLayerInfoItem(layer); std::string& id = layer->name; @@ -795,13 +804,13 @@ void GNAPlugin::CropPrimitive(InferenceEngine::CNNLayerPtr layer) { } // calculate index idx for connectInput last parameter - connectInput(layer, &cropLayerInfo->second.gna_ptr, cropSize + cropOffset, cropOffset, 0); + connectInput(layer, &cropLayerInfo->second.gna_ptr, cropOutputSize + cropOffset, cropOffset, 0); // cases for certain output layers for (auto &&outLayer : layer->outData.front()->getInputTo()) { auto& nextLayer = outLayer.second; if ( LayerInfo(nextLayer).isConcat() ) { - connectOutput(layer, &cropLayerInfo->second.gna_ptr, &cropLayerInfo->second.gna_ptr, cropSize); + connectOutput(layer, &cropLayerInfo->second.gna_ptr, &cropLayerInfo->second.gna_ptr, cropOutputSize); } } } else { @@ -842,30 +851,16 @@ void GNAPlugin::CropPrimitive(InferenceEngine::CNNLayerPtr layer) { begin(outputs->dims), end(outputs->dims)) * 4; size_t num_data_bytes_in = num_columns_in * 
- (num_rows_in + num_padding) * inputs->precision.size(); + ALIGN(num_rows_in, 8) * inputs->precision.size(); connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0); connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); - gnamem->readonly().push_initializer(ptr_weights, num_rows_out * (num_rows_in + num_padding)*layer->precision.size(), [=](void * data, size_t size) { - int out = 0; - for (int input = cropLayer->offset.back(); input < num_rows_out + cropLayer->offset.back(); ++input) { - auto mem_ptr = reinterpret_cast(data) + input * layer->precision.size() + out * (num_rows_in+num_padding) * layer->precision.size(); - if (quantized == nullptr) { - auto float_ptr = reinterpret_cast(mem_ptr); - *float_ptr = 1.0f; - } else { - auto int_ptr = reinterpret_cast(mem_ptr); - *int_ptr = 1; - } - ++out; - } - }, 64); - if (quantized == nullptr) { - gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); - } else { + FillWeightOfAligningFilter(layer, ptr_weights, cropLayer->offset.back(), (quantized == nullptr) ? false : true); + + (quantized == nullptr) ? + gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64): gnamem->readonly().push_value(ptr_biases, 0, num_rows_out, 64); - } } } @@ -907,6 +902,7 @@ void GNAPlugin::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { uint32_t num_rows_in = FROM_IR_DIM(inputs4Bytes, 1); uint32_t num_columns_in = FROM_IR_DIM(inputs4Bytes, 2); uint32_t num_rows_out = num_rows_in; + uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in; void *ptr_inputs; void *ptr_outputs; @@ -916,9 +912,9 @@ void GNAPlugin::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t()); auto ¤tComponent = dnnComponentsForLayer.back().second; dnn.InitAffineComponent(currentComponent, - num_rows_in, + num_rows_in + num_padding, num_columns_in, - num_rows_out, + num_rows_out + num_padding, inputs2Bytes->precision.size(), outputs->precision.size(), // TODO: only fp32 and Int16 tested @@ -936,11 +932,11 @@ void GNAPlugin::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { cout << "IR layer : " << std::left << std::setw(20) << layer->name << "diagonal_"<< dnnComponentsForLayer.size() - 1 << "\n"; #endif - size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims)) - * outputs->precision.size(); + size_t num_data_bytes_out = + InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims)) * outputs->precision.size(); - size_t num_data_bytes_in = InferenceEngine::details::product(begin(inputs2Bytes->dims), end(inputs2Bytes->dims)) - * inputs2Bytes->precision.size(); + size_t num_data_bytes_in = + num_columns_in * (num_rows_in + num_padding) * inputs2Bytes->precision.size(); connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 1 - biasesLayerIdx); @@ -955,6 +951,7 @@ void GNAPlugin::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { #define FLOAT_TO_INT16(a) static_cast(((a) < 0)?((a) - 0.5):((a) + 0.5)) auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast(INT16_MAX))); + gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); } connectInput(layer, ptr_biases, num_data_bytes_in, 0, biasesLayerIdx); @@ -1028,19 +1025,25 @@ void GNAPlugin::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool isDiag) auto transpose = false; auto transposedRows = 0; auto transposedCols = 0; - /** - * TODO: enable 
transpose correction between Conv/affine layers implement dedicated pass - * TF topologies have inplace permutes so we dont care - * kaldi topologies did this internally - */ + if (0 && connectionInfo.needTransposeWeights) { - gnalog() << "Transposing weights for layer: " << layer->name << "\n"; // direct order is 0, 1, 2, 3, supported order is only 0,3,2,1 where dim 2 is usually equals to 1 auto permuteOrder = connectionInfo.permute->GetParamAsInts("order"); if (permuteOrder != vector({0, 3, 2, 1})) { THROW_IE_EXCEPTION << "[GNA plugin] Unsupported permute order: was " << layer->GetParamAsString("order") << ", but only support 0, 3, 2, 1"; } + + /** + * TODO: weights transpose happening after quantisation might result in poor quality for int8 - move this to passes + */ + if (weightable._weights->precision() == Precision::I8) { + THROW_IE_EXCEPTION << "[GNA plugin] Unsupported permute operation for 8 bit weights for layer: " << layer->name; + } + + // this affine is connected to convolution via pool or activation + gnalog() << "Transposing weights for layer: " << layer->name << "\n"; + transpose = !isDiag; transposedRows = connectionInfo.permute->input()->getDims()[3]; transposedCols = connectionInfo.permute->input()->getDims()[1]; @@ -1053,7 +1056,6 @@ void GNAPlugin::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool isDiag) weightable._weights->byteSize(), 64); } else { - // ToDO: write unit tests for transpose gnamem->readonly().push_initializer(ptr_weights, weightable._weights->byteSize(), [=](void * data, size_t size) { for (int k = 0; k < (isDiag ? 1 : num_rows_out); k++) { auto rowOffset = k * transposedRows * transposedCols * weightable.precision.size(); auto cbuffer = weightable._weights->cbuffer().as() + rowOffset; auto u8Data = reinterpret_cast(data) + rowOffset; for (int j = 0; j < transposedCols; j++) { for (int i = 0; i < transposedRows; i++) { auto offsetWrite = (transposedRows * j + i) * weightable.precision.size(); auto offsetRead = (i * transposedCols + j) * weightable.precision.size(); - memcpy(u8Data + offsetWrite, cbuffer + offsetRead, weightable.precision.size()); + std::memcpy(u8Data + offsetWrite, cbuffer + offsetRead, weightable.precision.size()); } } } }, 64); } } else { + if (transpose) { + THROW_GNA_EXCEPTION << "transposed weights with non-zero padding not yet supported"; + } auto elementsIn = (num_rows_in + num_padding) * num_columns_in; auto paddedWeights = isDiag ?
elementsIn : elementsIn * num_rows_out; auto paddedWeightsSize = paddedWeights * weightable.precision.size(); @@ -1094,6 +1099,123 @@ void GNAPlugin::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool isDiag) } } +void GNAPlugin::FillWeightOfAligningFilter(InferenceEngine::CNNLayerPtr layer, void* ptrWeights, size_t offset, bool isQuantized) { + auto outputs = *layer->outData.begin(); + auto inputs = layer->insData.begin()->lock(); + + uint32_t num_rows_in = FROM_IR_DIM(inputs, 1); + uint32_t num_rows_out = FROM_IR_DIM(outputs, 1); + + if (!ptrWeights) { + THROW_GNA_EXCEPTION << "Weights memory is not allocated!!!"; + } + + gnamem->readonly().push_initializer(ptrWeights, num_rows_out * ALIGN(num_rows_in, 8) * layer->precision.size(), [=](void * data, size_t size) { + int out = 0; + for (int input = offset; input < num_rows_out + offset; ++input) { + auto mem_ptr = reinterpret_cast(data) + input * layer->precision.size() + out * ALIGN(num_rows_in, 8) * layer->precision.size(); + if (!isQuantized) { + auto float_ptr = reinterpret_cast(mem_ptr); + *float_ptr = 1.0f; + } else { + auto int_ptr = reinterpret_cast(mem_ptr); + *int_ptr = 1; + } + ++out; + } + }, 64); +} + +void GNAPlugin::AffineFilterPrimitive(InferenceEngine::CNNLayerPtr layer) { + auto filterLayer = dynamic_cast (layer.get()); + + if (filterLayer == nullptr) { + return; + } + + std::string& name = filterLayer->name; + auto quantized = InferenceEngine::getInjectedData(layer); + + // we look for this concat layer pointer in extra concat map + auto prevLayer = CNNNetPrevLayer(layer.get(), 0); + if (!LayerInfo(prevLayer).isSplit() && !LayerInfo(prevLayer).isSlice()) { + THROW_GNA_EXCEPTION << "Case with Affine Aligning Filter for not Split/Slice layers is not implemented yet!"; + } + + void *ptr_inputs; + void *ptr_outputs; + void *ptr_weights; + void *ptr_biases; + + auto outputs = *layer->outData.begin(); + auto inputs = layer->insData.begin()->lock(); + + uint32_t num_columns_in = FROM_IR_DIM(inputs, 2); + uint32_t num_rows_out = FROM_IR_DIM(outputs, 1); + uint32_t num_rows_in = filterLayer->_weights->size() / num_rows_out; + + uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in; + + gnalog() << "Filter " << layer->name << " is being inserted...\n"; + auto biasPrecision = filterLayer->_biases ? filterLayer->_biases->precision() : outputs->precision; + dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t()); + auto ¤tComponent = dnnComponentsForLayer.back().second; + dnn.InitAffineComponent(currentComponent, + num_rows_in + num_padding, + num_columns_in, + num_rows_out, + inputs->precision.size(), + outputs->precision.size(), + filterLayer->_weights->precision().size(), + biasPrecision.size(), + quantized == nullptr ? 1 : quantized->_weights_quant.scale, + quantized == nullptr ? 
1 : quantized->_dst_quant.scale, + ptr_inputs, + ptr_outputs, + ptr_weights, + ptr_biases, + false); + + size_t num_data_bytes_out = + InferenceEngine::details::product( + begin(outputs->dims), end(outputs->dims)) * 4; + + size_t num_data_bytes_in = num_columns_in * + ALIGN(num_rows_in, 8) * inputs->precision.size(); + + connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0); + connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); + + if (num_padding == 0) { + gnamem->readonly().push_ptr(ptr_weights, + filterLayer->_weights->cbuffer().as(), + filterLayer->_weights->byteSize(), + 64); + } else { + auto elementsIn = (num_rows_in + num_padding) * num_columns_in; + auto paddedWeights = elementsIn * num_rows_out; + auto paddedWeightsSize = paddedWeights * filterLayer->precision.size(); + + gnamem->readonly().push_initializer(ptr_weights, paddedWeightsSize, [=](void * data, size_t size) { + for (int i = 0; i < num_rows_out; i++) { + std::memcpy(data, + filterLayer->_weights->cbuffer().as() + num_rows_in * i * filterLayer->precision.size(), + num_rows_in * filterLayer->precision.size()); + data = reinterpret_cast(data) + (num_rows_in + num_padding) * filterLayer->precision.size(); + } + }, 64); + } + + if (filterLayer->_biases) { + gnamem->readonly().push_ptr(ptr_biases, + filterLayer->_biases->cbuffer().as(), + filterLayer->_biases->byteSize(), + 64); + } else { + gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); + } +} + void GNAPlugin::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) { auto *generic = dynamic_cast(layer.get()); std::string type; @@ -1269,6 +1391,7 @@ void GNAPlugin::CreateLayerPrimitive(CNNLayerPtr layer) { {{"Input"}, [](GNAPlugin*, CNNLayerPtr l) {}}, // skip input layers they are not used in GNA lib, only as a memory blobs {{"FullyConnected", "InnerProduct"}, CREATE(AffinePrimitive)}, {{"ScaleShift"}, CREATE(DiagonalPrimitive)}, + {{"AffineFilter"}, CREATE(AffineFilterPrimitive)}, {{"Eltwise"}, CREATE(EltwisePrimitive)}, // same as diagonal while weights are not taken from network, rather than from another output {{"Split"}, SKIP}, // skip information about which part of prev layer need to consume handle during layer creation @@ -1293,109 +1416,10 @@ void GNAPlugin::CreateLayerPrimitive(CNNLayerPtr layer) { GNAPlugin::GNAPlugin(const std::map& configMap) { - // holds actual value of a found key - std::string value; - auto if_set = [&](std::string key, const std::function & handler) { - auto keyInMap = configMap.find(key); - if (keyInMap != configMap.end()) { - value = keyInMap->second; - handler(); - } - }; - - if_set(GNA_CONFIG_KEY(SCALE_FACTOR), [&] { - input_scale_factor = std::stod(value); - }); - - if_set(GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), [&] { - dumpXNNPath = value; - }); - - if_set(GNA_CONFIG_KEY(DEVICE_MODE), [&] { - static caseless_unordered_map supported_values = { - {GNAConfigParams::GNA_AUTO, GNA_AUTO}, - {GNAConfigParams::GNA_HW, GNA_HARDWARE}, - {GNAConfigParams::GNA_SW, GNA_SOFTWARE}, - {GNAConfigParams::GNA_SW_EXACT, GNA_SOFTWARE & GNA_HARDWARE} - }; - auto procType = supported_values.find(value); - if (procType == supported_values.end()) { - THROW_GNA_EXCEPTION << "GNA device mode unsupported: " << value; - } - gna_proc_type = static_cast(procType->second); - }); - - if_set(GNA_CONFIG_KEY(COMPACT_MODE), [&] { - if (value == PluginConfigParams::YES) { - compact_mode = true; - } else if (value == PluginConfigParams::NO) { - compact_mode = false; - } else { - THROW_GNA_EXCEPTION << "GNA compact mode should be YES/NO, but 
not" << value; - } - }); - - if_set(CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), [&] { - if (value == PluginConfigParams::YES) { - exclusive_async_requests = true; - } else if (value == PluginConfigParams::NO) { - exclusive_async_requests = false; - } else { - THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value; - } - }); - - if_set(GNA_CONFIG_KEY(PRECISION), [&] { - auto precision = Precision::FromStr(value); - if (precision != Precision::I8 && precision != Precision::I16) { - THROW_GNA_EXCEPTION << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value; - } - gnaPrecision = precision; - }); - - if_set(GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), [&] { - if (value == PluginConfigParams::YES) { - uniformPwlDesign = true; - } else if (value == PluginConfigParams::NO) { - uniformPwlDesign = false; - } else { - THROW_GNA_EXCEPTION << "GNA pwl uniform algorithm parameter " - << "should be equal to YES/NO, but not" << value; - } - }); - - if_set(CONFIG_KEY(PERF_COUNT), [&] { - if (value == PluginConfigParams::YES) { - performance_counting = true; - } else if (value == PluginConfigParams::NO) { - performance_counting = false; - } else { - THROW_GNA_EXCEPTION << "GNA performance counter enabling parameter " - << "should be equal to YES/NO, but not" << value; - } - }); - - if_set(GNA_CONFIG_KEY(LIB_N_THREADS), [&] { - uint64_t lib_threads = std::stoul(value, NULL, 10); - if (lib_threads == 0 || lib_threads > std::numeric_limits::max()/2-1) { - THROW_GNA_EXCEPTION << "Unsupported accelerator lib number of threads: " << value - << ", should be greateer than 0 and less than 127"; - } - gna_lib_async_threads_num = lib_threads; - }); - - if_set(CONFIG_KEY(SINGLE_THREAD), [&] { - if (value == PluginConfigParams::YES) { - gna_openmp_multithreading = false; - } else if (value == PluginConfigParams::NO) { - gna_openmp_multithreading = true; - } else { - THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value; - } - }); + SetConfig(configMap); } -GNAPluginNS::GNAPlugin::LayerType GNAPlugin::LayerTypeFromStr(const std::string &str) { +GNAPluginNS::GNAPlugin::LayerType GNAPlugin::LayerTypeFromStr(const std::string &str) const { static const caseless_map LayerNameToType = { { "Input" , Input }, { "Convolution" , Convolution }, @@ -1433,13 +1457,14 @@ bool GNAPlugin::AreLayersSupported(ICNNNetwork& network, std::string& errMessage auto network_precision = network.getPrecision(); network.getInputsInfo(inputs); auto network_input_precision = inputs.begin()->second->getInputPrecision(); - auto batch_sise = network.getBatchSize(); + auto batch_size = network.getBatchSize(); if (network_precision != Precision::FP32) { errMessage = "The plugin does not support networks with " + std::string(network_precision.name()) + " format.\n"; return false; } if (network_input_precision != Precision::FP32 && - network_input_precision != Precision::I16) { + network_input_precision != Precision::I16 && + network_input_precision != Precision::U8) { errMessage = "The plugin does not support input precision with " + std::string(network_input_precision.name()) + " format.\n"; return false; } @@ -1469,7 +1494,9 @@ bool GNAPlugin::AreLayersSupported(ICNNNetwork& network, std::string& errMessage errMessage = "Layer is unsupported by GNA: " + layer->name + ":" + layer->type + "\n"; check_result = false; } - if (batch_sise != 1 && LayerInfo::isBatchSizeConstrained(layer->type)) { + if (batch_size != 1 && LayerInfo::isBatchSizeConstrained(layer->type)) { + errMessage 
= "topology with layer: " + layer->name + ", type: " + layer->type + + ", and batch size(" + to_string(batch_size) + ") != 1 not supported"; check_result = false; } }, false); @@ -1477,6 +1504,10 @@ bool GNAPlugin::AreLayersSupported(ICNNNetwork& network, std::string& errMessage return check_result; } +float GNAPlugin::get_input_scale_factor() const { + return input_scale_factor.empty() ? 1.0 : input_scale_factor.begin()->second; +} + void GNAPlugin::LoadNetwork(ICNNNetwork &network) { // Check the input network std::string error; @@ -1490,21 +1521,34 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) { substitutePRelu(layers); layers = CNNNetSortTopologically(*network.get()); reorderMaxPool(layers); - applyOrientations(layers); + // ToDo sort if bool flag "changed" + // returned from insertion function + insertAligningFilterLayer(layers); + +#if ENABLE_AUTO_PERMUTE + layers = CNNNetSortTopologically(*network.get()); + reversePermutations(layers); +#endif + layers = CNNNetSortTopologically(*network.get()); insertIdentityLayer(layers); + layers = CNNNetSortTopologically(*network.get()); + insertCopyLayer(layers); + layers = CNNNetSortTopologically(*network.get()); insertDiagonalLayer(layers); + layers = CNNNetSortTopologically(*network.get()); + substituteScaleShiftBroadCast(layers); }; Config supported = Config({ {TargetDevice::eGNA, Precision::FP32, [&](InferenceEngine::ICNNNetwork &network) -> CNNNetworkPtr { if (gnaPrecision == Precision::I16) { ModelQuantizer q; - return q.quantize(network, run_passes, input_scale_factor); + return q.quantize(network, run_passes, get_input_scale_factor()); } if (gnaPrecision == Precision::I8) { ModelQuantizer q; - return q.quantize(network, run_passes, input_scale_factor); + return q.quantize(network, run_passes, get_input_scale_factor()); } THROW_GNA_EXCEPTION << "no mans land for GNA precision"; }}, @@ -1529,24 +1573,13 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) { supported.setDefaultDevice(TargetDevice::eGNA); auto newNet = supported.find_configuration(network).convert(network); - auto networkPrecision = newNet->getPrecision(); - if (!networkPrecision.is_float()) { - gnadevice.reset(new GNADeviceHelper(gna_proc_type, - gna_lib_async_threads_num, - gna_openmp_multithreading, - performance_counting)); - gnamem.reset(new gna_memory_type( - make_polymorph(*gnadevice.get()), PAGE_SIZE_BYTES)); - } else { - gnamem.reset(new gna_memory_type(make_polymorph>())); - } + // creating intel dnn_t structures from network auto sortedNet = CNNNetSortTopologically(*newNet); std::vector sortedNoMem; - std::map> memoryPairs; + std::unordered_map> memoryPairs; // find all memory layers pairs and mark which one used as outputs for (auto &layer : sortedNet) { auto generic = dynamic_cast(layer.get()); @@ -1572,16 +1605,28 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) { // fill in extra storage with memory layers fillMemoryConnections(memoryPairs); + if (memory_connection.size() != 0) { + gna_lib_async_threads_num = 1; + } + + auto networkPrecision = newNet->getPrecision(); + + if (!networkPrecision.is_float()) { + gnadevice.reset(new GNADeviceHelper(gna_proc_type, + gna_lib_async_threads_num, + gna_openmp_multithreading, + performance_counting)); + gnamem.reset(new gna_memory_type( + make_polymorph(*gnadevice.get()), PAGE_SIZE_BYTES)); + } else { + gnamem.reset(new gna_memory_type(make_polymorph>())); + } + // keep inputs information and create input primitives newNet->getInputsInfo(inputsDataMap); if (inputsDataMap.empty()) { THROW_GNA_EXCEPTION << " 
No inputs for the topology"; } - if (inputsDataMap.size() != 1) { - THROW_GNA_EXCEPTION << " cannot infer topologies with more than one inputs"; - } - - inputDims = inputsDataMap.begin()->second->getDims(); // keep output dims newNet->getOutputsInfo(outputsDataMap); @@ -1593,7 +1638,10 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) { } outputDims = outputsDataMap.begin()->second->dims; - ptr_inputs_global.resize(gna_lib_async_threads_num); + for (auto && input : inputsDataMap) { + get_ptr_inputs_global(input.first).resize(gna_lib_async_threads_num); + } + ptr_outputs_global.resize(gna_lib_async_threads_num); // CreatingLayer primitives // TODO: solely gna_example convolution hack @@ -1601,11 +1649,25 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) { for (auto layer = sortedNoMem.begin(); layer != sortedNoMem.end(); ++layer) { CreateLayerPrimitive(*layer); } - gnamem->bind_ptr(&ptr_outputs_global.front(), &dnnComponentsForLayer.back().second.ptr_outputs); + DnnComponentsForLayer::iterator output_component = std::find_if(dnnComponentsForLayer.begin(), + dnnComponentsForLayer.end(), + [&](const std::pair& v) + { return outputsDataMap.begin()->first == v.first; }); + + if (output_component == dnnComponentsForLayer.end()) { + if (dnnComponentsForLayer.empty()) { + THROW_GNA_EXCEPTION << "No outputs found in internal structures"; + } + // likely layer is fused. Take last one + output_component = std::prev(dnnComponentsForLayer.end()); + gnalog() << "Output layer "<< outputsDataMap.begin()->first + << " has not been found in component list. Took " + << output_component->first << " instead \n" << std::flush; + } + gnamem->bind_ptr(&ptr_outputs_global.front(), &output_component->second.ptr_outputs); // make room for active list - auto &last_component = dnnComponentsForLayer.back().second; - gnamem->reserve_ptr(nullptr, ALIGN64(last_component.num_bytes_per_output * last_component.num_rows_out)); + gnamem->reserve_ptr(nullptr, ALIGN64(output_component->second.num_bytes_per_output * output_component->second.num_rows_out)); void *pParallelExecutionData = nullptr; @@ -1630,16 +1692,16 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) { // in fp32 mode last PWL cannot be computed without that dnn.InitActiveList(NULL); - nnets.push_back(std::make_tuple(make_shared>(0), -1, InferenceEngine::BlobMap())); + nnets.push_back(std::make_tuple(make_shared>(), -1, InferenceEngine::BlobMap())); if (!networkPrecision.is_float()) { // number of layer gets calculated inside that InitGNAStruct function dnn.InitGNAStruct(&std::get<0>(nnets.front())->obj); } - // creating same gna RW segment for paralle infer requests + // creating same gna RW segment for parallel infer requests for (int i = 1; i != gna_lib_async_threads_num; i++) { - nnets.push_back(std::make_tuple(make_shared>(0), -1, InferenceEngine::BlobMap())); + nnets.push_back(std::make_tuple(make_shared>(), -1, InferenceEngine::BlobMap())); // this can be improved by just copy all structures, but we are too lazy dnn.InitGNAStruct(&std::get<0>(nnets.back())->obj); @@ -1656,7 +1718,10 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) { } }; - relocate(ptr_inputs_global[i], ptr_inputs_global[0]); + for (auto &&input : ptr_inputs_global_storage) { + relocate(input[i], input[0]); + } + relocate(ptr_outputs_global[i], ptr_outputs_global[0]); for (int j = 0; j != std::get<0>(nnets.front())->obj.nLayers; j++) { auto & layer = std::get<0>(nnets[i])->obj.pLayers[j]; @@ -1666,11 +1731,60 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) { 
relocate(layer.pOutputsIntermediate, layer.pOutputsIntermediate); } } - orientation_in = dnn.component[0].orientation_in; - orientation_out = dnn.component[dnn.num_components()-1].orientation_out; - num_bytes_per_output = dnn.component[dnn.num_components()-1].num_bytes_per_output; - auto quantized = InferenceEngine::getInjectedData(sortedNoMem.back()); + + // calculating input orientation without memory layers, since their orientation is not changed during inference right now + std::unordered_map skippedLayers; + for (auto &layer : sortedNet) { + for (int i = 0; CNNNetHasPrevLayer(layer.get(), i); i++) { + auto prevLayer = CNNNetPrevLayer(layer.get(), i); + if (!skippedLayers.count(prevLayer->name)) { + if (CNNNetHasPrevLayer(prevLayer.get())) { + continue; + } + + // we are in one of the input layers + if (LayerInfo(prevLayer).isMemory()) { + continue; + } + } + + auto dnnLayer = findDnnLayer(layer); + string inputName = prevLayer->name; + if (skippedLayers.count(prevLayer->name)) { + inputName = skippedLayers[prevLayer->name]; + } + + // non-functional layer - skipped by gna + if (nullptr == dnnLayer) { + // storing input name for skipped layer + skippedLayers[layer->name] = inputName; + continue; + } + + // input orientation might be already initialized, thus verify that it matches + if (!orientation_in.count(inputName)) { + orientation_in[inputName] = dnnLayer->orientation_in; + } else { + if (orientation_in[inputName] != dnnLayer->orientation_in) { + THROW_GNA_EXCEPTION << "orientation for input layer: " << inputName << " cannot be calculated"; + } + } + } + } + + orientation_out = output_component->second.orientation_out; + num_bytes_per_output = output_component->second.num_bytes_per_output; + + // find output layer + auto output = std::find_if(sortedNet.begin(), + sortedNet.end(), + [&](const CNNLayerPtr& v) + { return outputsDataMap.begin()->first == v.get()->name; }); + if (output == sortedNet.end()) { + // likely layer is fused. Take last one + output = std::prev(sortedNet.end()); + } + auto quantized = InferenceEngine::getInjectedData(*output); output_scale_factor = quantized != nullptr ?
quantized->_dst_quant.scale : 1.0f; num_rotate_rows = dnn.num_rotate_rows; @@ -1692,7 +1806,7 @@ void GNAPlugin::DumpXNNToFile() const { } auto dump = gnadevice->dumpXnn(&std::get<0>(nnets.front())->obj, ptr_active_indices, num_active_indices); dump.header.rw_region_size = gnamem->getRWBytes(); - dump.header.input_scaling_factor = input_scale_factor; + dump.header.input_scaling_factor = get_input_scale_factor(); dump.header.output_scaling_factor = output_scale_factor; std::ofstream dumpStream(dumpXNNPath, std::ios::out | std::ios::binary); dumpStream.write(reinterpret_cast(&dump.header), sizeof(intel_gna_model_header)); @@ -1726,69 +1840,81 @@ void RotateFeatures(uint8_t *ptr_feat, } } -uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) { - return QueueInference(*input.begin()->second.get(), result); - - /*if (!syncPoints.empty()) { - syncPoints.back().second = result; - }*/ -} - -uint32_t GNAPlugin::QueueInference(const InferenceEngine::Blob &input, InferenceEngine::BlobMap &result) { - auto inputLayout = input.layout(); - if (inputLayout != Layout::NC && inputLayout != Layout::CN && inputLayout != NCHW) { - THROW_GNA_EXCEPTION << "Expected input blob to have Layout::NC or Layout::CN, but was: " << input.layout(); - } - if (inputLayout == NCHW) { - inputLayout = NC; - } - auto is2D = input.layout() == Layout::NC || input.layout() == Layout ::CN; - +uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &inputs, InferenceEngine::BlobMap &result) { auto freeNnet = std::find_if(std::begin(nnets), std::end(nnets), [](decltype(nnets.front()) & item) { return std::get<1>(item) == -1; }); if (freeNnet == nnets.end()) { - THROW_IE_EXCEPTION << as_status << REQUEST_BUSY - << "GNA executable network has max of " << static_cast(gna_lib_async_threads_num) - << " parallel infer requests, please sync one of already running"; + if (memory_connection.size() != 0) { + Wait(0); + freeNnet = nnets.begin(); + } else { + THROW_IE_EXCEPTION << as_status << REQUEST_BUSY + << "GNA executable network has max of " + << static_cast(gna_lib_async_threads_num) + << " parallel infer requests, please sync one of already running"; + } } + auto nnet = std::get<0>(*freeNnet).get(); auto idx = static_cast(std::distance(std::begin(nnets), freeNnet)); - if (ptr_inputs_global[idx] == nullptr) { - // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance - THROW_GNA_EXCEPTION << "network not loaded : global input pointer not set"; - } + for (auto &input : inputs) { + auto inputLayout = input.second->layout(); + if (inputLayout != Layout::NC && inputLayout != Layout::CN && inputLayout != NCHW) { + THROW_GNA_EXCEPTION << "Expected input blob to have Layout::NC or Layout::CN, but was: " + << input.second->layout(); + } + if (inputLayout == NCHW) { + inputLayout = NC; + } + auto is2D = input.second->layout() == Layout::NC || input.second->layout() == Layout::CN; - if (orientation_in == kDnnUnknownOrientation) { - // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance - THROW_GNA_EXCEPTION << "network not loaded : input orientation not set"; - } + if (!ptr_inputs_global_id.count(input.first)) { + // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance + THROW_GNA_EXCEPTION << "network not loaded : input pointer for " << input.first << " not 
set"; + } - if (orientation_out == kDnnUnknownOrientation) { - // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance - THROW_GNA_EXCEPTION << "network not loaded : output orientation not set"; - } + if (get_ptr_inputs_global(input.first)[idx] == nullptr) { + // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance + THROW_GNA_EXCEPTION << "network not loaded : input pointer for (" << input.first << " at inferRequest #" + << idx << " not set"; + } - ImportFrames(ptr_inputs_global[idx], - input.cbuffer().as(), - input.precision(), - orientation_in, - input.dims()[input.dims().size() - 1], - is2D ? input.dims()[1] : input.dims()[input.dims().size() - 1], - is2D ? input.dims()[0] :input.dims()[0]*input.dims()[2], - is2D ? input.dims()[0] :input.dims()[0]*input.dims()[2]); + if (orientation_in[input.first] == kDnnUnknownOrientation) { + // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance + THROW_GNA_EXCEPTION << "network not loaded : input orientation for " << input.first << " not set"; + } + + if (orientation_out == kDnnUnknownOrientation) { + // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance + THROW_GNA_EXCEPTION << "network not loaded : output orientation not set"; + } - if ((inputLayout == Layout::NC || inputLayout == Layout::NCHW) != (orientation_in == kDnnInterleavedOrientation)) { - RotateFeatures(reinterpret_cast(ptr_inputs_global[idx]), - gnadevice ? 2 : 4, - // TODO: only works for cnn4a and google command so far - input.dims()[input.dims().size() - 1], - is2D ? input.dims()[0] :input.dims()[0]*input.dims()[2], // num_feature_vectors looks batch should be there - num_rotate_rows, - num_rotate_columns); + auto dims = input.second->dims(); + + ImportFrames(get_ptr_inputs_global(input.first)[idx], + input.second->cbuffer().as(), + input.second->precision(), + orientation_in[input.first], + dims[dims.size() - 1], + is2D ? dims[1] : dims[dims.size() - 1], + is2D ? dims[0] : dims[0] * dims[1] * dims[2], + is2D ? dims[0] : dims[0] * dims[1] * dims[2]); + bool isOneChannel = input.second->getTensorDesc().getDims()[1] == 1; + if (((inputLayout == Layout::NC || inputLayout == Layout::NCHW) + != (orientation_in[input.first] == kDnnInterleavedOrientation)) + && !isOneChannel) { + RotateFeatures(reinterpret_cast(get_ptr_inputs_global(input.first)[idx]), + gnadevice ? 2 : 4, + // TODO: only works for cnn4a and google command so far + dims[dims.size() - 1], + is2D ? 
dims[0] : dims[0] * dims[2], // num_feature_vectors - looks like batch should be there + num_rotate_rows, + num_rotate_columns); + } } if (!gnadevice) { @@ -1810,7 +1936,7 @@ void GNAPlugin::Wait(uint32_t idx) { } std::get<1>(nnets[idx]) = -1; - auto & output = *std::get<2>(nnets[idx]).begin()->second; + auto & result = std::get<2>(nnets[idx]); #ifdef PLOT dnn.BeginNewWrite(); if (dnn.num_components() != 0) { @@ -1819,18 +1945,38 @@ void GNAPlugin::Wait(uint32_t idx) { } dnn.WriteInputAndOutputTextGNA(&std::get<0>(nnets.front())->obj); #endif + if (result.size() != 1) { + THROW_GNA_EXCEPTION << "Invalid number of outputs for infer request: " << result.size() << ", only 1 supported"; + } + auto & output = *result.begin()->second; if (output.layout() == Layout::NC) { // TODO: rotate can be incorporated with exporting - used only in unit tests so far // TODO: restore: // if (orientation_out != kDnnInterleavedOrientation) { +// if (inputs.size() != 1) { +// THROW_GNA_EXCEPTION << "Invalid number of inputs for deinterleave " << inputs.size() +// << ", only 1 supported"; +// } +// auto dims = inputs.begin()->second->dims(); // RotateFeatures(reinterpret_cast<uint8_t *>(ptr_outputs_global), // gnadevice ? 2 : 4, -// input.dims()[input.dims().size() - 1], -// input.dims()[0], // num_feature_vectors looks batch should be there -// input.dims()[0], -// input.dims()[input.dims().size() - 1]); +// dims[dims.size() - 1], +// dims[0], // num_feature_vectors - looks like batch should be there +// dims[0], +// dims[dims.size() - 1]); // } + // we consider the last layer as output ... + size_t output_layer_index = std::max(0, static_cast<int>(std::get<0>(nnets[idx])->obj.nLayers - 1)); + if (gnadevice && std::get<0>(nnets[idx])->obj.pLayers[output_layer_index].pOutputs != ptr_outputs_global[idx]) { + // ...as this is not always true, we should look for the output layer index + for (int j = 0; j != std::get<0>(nnets[idx])->obj.nLayers; j++) { + if (std::get<0>(nnets[idx])->obj.pLayers[j].pOutputs == ptr_outputs_global[idx]) { + output_layer_index = j; + break; + } + } + } ExportScores(output.buffer(), ptr_outputs_global[idx], @@ -1841,7 +1987,7 @@ void GNAPlugin::Wait(uint32_t idx) { output.dims()[0], output.dims()[0], // TODO: create better getter consider multiple outputs case - gnadevice ? std::get<0>(nnets[idx])->obj.pLayers[std::get<0>(nnets[idx])->obj.nLayers - 1].nBytesPerOutput : sizeof(float), + gnadevice ? std::get<0>(nnets[idx])->obj.pLayers[output_layer_index].nBytesPerOutput : sizeof(float), sizeof(float)); } else if (output.layout() != Layout::CN) { THROW_GNA_EXCEPTION << "Expected output blob to have Layout::NC or Layout::CN. 
But was " << output.layout(); @@ -1884,13 +2030,6 @@ void GNAPlugin::Wait(uint32_t idx) { } } - -void GNAPlugin::Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &output) { - BlobMap result; - result["output"] = std::shared_ptr<Blob>(&output, [](Blob*){}); - Wait(QueueInference(input, result)); -} - void GNAPlugin::Reset() { for (auto && memLayer : memory_connection) { std::memset(memLayer.second.gna_ptr, 0, memLayer.second.reserved_size); @@ -1900,10 +2039,23 @@ void GNAPlugin::Reset() { } } -void GNAPlugin::Infer(const BlobMap &inputs, BlobMap &result) { - auto &input = *inputs.begin()->second.get(); - auto &output = *result.begin()->second.get(); - Infer(input, output); +void GNAPlugin::Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &output) { + BlobMap bmInput; + BlobMap bmOutput; + if (inputsDataMap.size() != 1) { + THROW_GNA_EXCEPTION << "cannot infer using Infer(Blob&, Blob&): "<< "model accepts " << inputsDataMap.size() << " inputs"; + } + if (outputsDataMap.size() != 1) { + THROW_GNA_EXCEPTION << "cannot infer using Infer(Blob&, Blob&): "<< "model accepts " << outputsDataMap.size() << " outputs"; + } + + bmInput[inputsDataMap.begin()->first] = std::shared_ptr<Blob>(const_cast<Blob *>(&input), [](Blob*){}); + bmOutput[outputsDataMap.begin()->first] = std::shared_ptr<Blob>(&output, [](Blob*){}); + Infer(bmInput, bmOutput); +} + +void GNAPlugin::Infer(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) { + Wait(QueueInference(input, result)); } Blob::Ptr GNAPlugin::GetOutputBlob(InferenceEngine::Precision precision) { @@ -1914,10 +2066,11 @@ Blob::Ptr GNAPlugin::GetOutputBlob(InferenceEngine::Precision precision) { return outputBlob; } -Blob::Ptr GNAPlugin::GetInputBlob(InferenceEngine::Precision precision) { +Blob::Ptr GNAPlugin::GetInputBlob(std::string name, InferenceEngine::Precision precision) { InferenceEngine::Blob::Ptr inputBlob; // need to have intermediate blob for interleave conversion // TODO: NCHW format support is experimental: the C++ MO inserts a reshape, while the TF MO does not + auto inputDims = inputsDataMap[name]->getDims(); inputBlob = make_blob_with_precision(precision, inputDims.size() == 2 ? 
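// Note on the Infer(Blob&, Blob&) wrapper above: the BlobMap entries alias caller-owned blobs
// through shared_ptr with a no-op deleter, so the maps never take ownership. The same pattern
// in isolation (a sketch, not plugin API):
//
//     InferenceEngine::Blob::Ptr alias(&output, [](InferenceEngine::Blob *) { /* not owned */ });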
NC : NCHW, inputDims); inputBlob->allocate(); return inputBlob; @@ -1955,7 +2108,8 @@ InferenceEngine::IExecutableNetwork::Ptr GNAPlugin::ImportNetwork(const std::str auto serial = GNAModelSerial(&std::get<0>(nnets.back())->obj, mt); serial.Import(basePtr, header.gnaMemSize, inputStream); - ptr_inputs_global.push_back(reinterpret_cast(reinterpret_cast (basePtr) + header.input.descriptor_offset)); + + get_ptr_inputs_global("input").push_back(reinterpret_cast(reinterpret_cast (basePtr) + header.input.descriptor_offset)); ptr_outputs_global.push_back(reinterpret_cast(reinterpret_cast (basePtr) + header.output.descriptor_offset)); auto getOrientation = [](intel_nnet_layer_t & layer) { @@ -1963,14 +2117,14 @@ InferenceEngine::IExecutableNetwork::Ptr GNAPlugin::ImportNetwork(const std::str kDnnNonInterleavedOrientation : kDnnInterleavedOrientation; }; - orientation_in = getOrientation(std::get<0>(nnets.back())->obj.pLayers[0]); + orientation_in["input"] = getOrientation(std::get<0>(nnets.back())->obj.pLayers[0]); orientation_out = getOrientation(std::get<0>(nnets.back())->obj.pLayers[std::get<0>(nnets.back())->obj.nLayers-1]); num_bytes_per_output = header.output.element_size; outputDims = SizeVector({header.output.elements_count / header.nGroup, header.nGroup}); - inputDims = SizeVector({header.input.elements_count / header.nGroup, header.nGroup}); + auto inputDims = SizeVector({header.input.elements_count / header.nGroup, header.nGroup}); inputsDataMap["input"] = std::make_shared(); inputsDataMap["input"]->setInputData(make_shared("input", @@ -1983,7 +2137,7 @@ InferenceEngine::IExecutableNetwork::Ptr GNAPlugin::ImportNetwork(const std::str Layout::NC); output_scale_factor = header.output.scaleFactor; - input_scale_factor = header.input.scaleFactor; + input_scale_factor["input"] = header.input.scaleFactor; num_rotate_rows = header.nRotateRows; num_rotate_columns = header.nRotateColumns; @@ -2007,20 +2161,25 @@ InferenceEngine::IExecutableNetwork::Ptr GNAPlugin::ImportNetwork(const std::str } void GNAPlugin::Export(const std::string &fileName) { - if (ptr_inputs_global.empty() || ptr_outputs_global.empty()) { + if (ptr_inputs_global_id.empty() || ptr_outputs_global.empty()) { THROW_GNA_EXCEPTION << " network not loaded"; } + if (ptr_inputs_global_id.size() != 1) { + THROW_GNA_EXCEPTION << " exporting network with multiple inputs not supported"; + } + std::fstream outStream(fileName, ios_base::out | ios_base::binary); // TODO: nnet group parameter looks only used in application - so can we move this line into load network. 
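// The header written by Export below (and parsed by ImportNetwork above) carries the input and
// output scale factors together with the descriptor offsets, so an AOT round trip restores
// them. A hedged usage sketch (file name hypothetical, signatures abbreviated):
//
//     plugin.Export("model.gna");        // after LoadNetwork; writes header + GNA blob
//     // a later ImportNetwork("model.gna", ...) rebuilds nnets and restores
//     // input_scale_factor["input"] and output_scale_factor from that header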
+ auto inputDims = inputsDataMap.begin()->second->getDims(); if (inputDims.size() == 2) { std::get<0>(nnets.front())->obj.nGroup = inputDims[1]; } auto serial = GNAModelSerial(&std::get<0>(nnets.front())->obj, - {input_scale_factor, - ptr_inputs_global[0], + {get_input_scale_factor(), + ptr_inputs_global_storage.front()[0], 2, static_cast(InferenceEngine::details::product(inputsDataMap.begin()->second->getDims()))}, {output_scale_factor, @@ -2043,7 +2202,209 @@ void GNAPlugin::GetPerformanceCounts(std::map &config) {} + +void GNAPlugin::SetConfig(const std::map &config) { + std::vector supportedConfigOptions = { + GNA_CONFIG_KEY(SCALE_FACTOR), + GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), + GNA_CONFIG_KEY(DEVICE_MODE), + GNA_CONFIG_KEY(COMPACT_MODE), + CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), + GNA_CONFIG_KEY(PRECISION), + GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), + CONFIG_KEY(PERF_COUNT), + GNA_CONFIG_KEY(LIB_N_THREADS), + CONFIG_KEY(SINGLE_THREAD) + }; + + for (auto& item : config) { + auto keys = std::find_if(supportedConfigOptions.begin(), supportedConfigOptions.end(), [&item](std::string supportedConfigOption) { + return item.first.find(supportedConfigOption) != std::string::npos; + }); + if (keys == supportedConfigOptions.end()) { + THROW_GNA_EXCEPTION << as_status << NOT_FOUND << "Incorrect GNA Plugin config. Key " << item.first << " not supported"; + } + } + + // holds actual value of a found key + std::string key; + std::string value; + auto if_set = [&](std::string keyInput, const std::function & handler) { + auto keyInMap = config.find(keyInput); + if (keyInMap != config.end()) { + value = keyInMap->second; + handler(); + } + }; + + auto if_start = [&](std::string keyInput, const std::function & handler) { + for (auto && c : config) { + if (c.first.find(keyInput) == 0) { + if (c.first.size() > keyInput.size() + 1) { + key = c.first.substr(keyInput.size() + 1); + value = c.second; + handler(); + } + } + } + }; + + auto fp32eq = [](float p1, float p2) -> bool { + return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2))); + }; + + auto & log = gnalog(); + + if_start(GNA_CONFIG_KEY(SCALE_FACTOR), [&, this] { + // only identical scale factors supported so far + auto ref = input_scale_factor.size() ? 
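// GNA_CONFIG_KEY(SCALE_FACTOR) is matched by if_start as a key prefix, so a per-input factor
// can be supplied by suffixing the input name; the text after the separator becomes `key`.
// Usage sketch (instance and input name hypothetical):
//
//     std::map<std::string, std::string> cfg;
//     cfg[std::string(GNA_CONFIG_KEY(SCALE_FACTOR)) + "_Input0"] = "2048";
//     plugin.SetConfig(cfg);   // ends up as input_scale_factor["Input0"] == 2048.0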
input_scale_factor.begin()->second : 1.0; + input_scale_factor[key] = std::stod(value); + if (ref != 1.0 && !fp32eq(input_scale_factor[key], ref)) { + std::string message = "only identical input scale factors supported, but provided: " + + std::to_string(ref) + " and " + std::to_string(input_scale_factor[key]); + log << "only identical input scale factors supported, but provided: " << ref <<" and " << input_scale_factor[key]; + THROW_GNA_EXCEPTION << "only identical input scale factors supported, but provided: " << ref <<" and " << input_scale_factor[key]; + } + }); + + if (input_scale_factor.empty()) { + if_set(GNA_CONFIG_KEY(SCALE_FACTOR), [&] { + input_scale_factor["placeHolder"] = std::stod(value); + }); + } + + if_set(GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), [&] { + dumpXNNPath = value; + }); + + if_set(GNA_CONFIG_KEY(DEVICE_MODE), [&] { + static caseless_unordered_map supported_values = { + {GNAConfigParams::GNA_AUTO, GNA_AUTO}, + {GNAConfigParams::GNA_HW, GNA_HARDWARE}, + {GNAConfigParams::GNA_SW, GNA_SOFTWARE}, + {GNAConfigParams::GNA_SW_EXACT, GNA_SOFTWARE & GNA_HARDWARE} + }; + auto procType = supported_values.find(value); + if (procType == supported_values.end()) { + log << "GNA device mode unsupported: " << value; + THROW_GNA_EXCEPTION << "GNA device mode unsupported: " << value; + } + gna_proc_type = static_cast(procType->second); + }); + + if_set(GNA_CONFIG_KEY(COMPACT_MODE), [&] { + if (value == PluginConfigParams::YES) { + compact_mode = true; + } else if (value == PluginConfigParams::NO) { + compact_mode = false; + } else { + log << "GNA compact mode should be YES/NO, but not" << value; + THROW_GNA_EXCEPTION << "GNA compact mode should be YES/NO, but not" << value; + } + }); + + if_set(CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), [&] { + if (value == PluginConfigParams::YES) { + exclusive_async_requests = true; + } else if (value == PluginConfigParams::NO) { + exclusive_async_requests = false; + } else { + log << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value; + THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value; + } + }); + + if_set(GNA_CONFIG_KEY(PRECISION), [&] { + auto precision = Precision::FromStr(value); + if (precision != Precision::I8 && precision != Precision::I16) { + log << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value; + THROW_GNA_EXCEPTION << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value; + } + gnaPrecision = precision; + }); + + if_set(GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), [&] { + if (value == PluginConfigParams::YES) { + uniformPwlDesign = true; + } else if (value == PluginConfigParams::NO) { + uniformPwlDesign = false; + } else { + log << "GNA pwl uniform algorithm parameter " + << "should be equal to YES/NO, but not" << value; + THROW_GNA_EXCEPTION << "GNA pwl uniform algorithm parameter " + << "should be equal to YES/NO, but not" << value; + } + }); + + if_set(CONFIG_KEY(PERF_COUNT), [&] { + if (value == PluginConfigParams::YES) { + performance_counting = true; + } else if (value == PluginConfigParams::NO) { + performance_counting = false; + } else { + log << "GNA performance counter enabling parameter " + << "should be equal to YES/NO, but not" << value; + THROW_GNA_EXCEPTION << "GNA performance counter enabling parameter " + << "should be equal to YES/NO, but not" << value; + } + }); + + if_set(GNA_CONFIG_KEY(LIB_N_THREADS), [&] { + uint64_t lib_threads = std::stoul(value, NULL, 10); + if (lib_threads == 0 || lib_threads > 
std::numeric_limits<uint8_t>::max()/2-1) { + log << "Unsupported accelerator lib number of threads: " << value << ", should be greater than 0 and less than 127"; + THROW_GNA_EXCEPTION << "Unsupported accelerator lib number of threads: " << value + << ", should be greater than 0 and less than 127"; + } + gna_lib_async_threads_num = lib_threads; + }); + + if_set(CONFIG_KEY(SINGLE_THREAD), [&] { + if (value == PluginConfigParams::YES) { + gna_openmp_multithreading = false; + } else if (value == PluginConfigParams::NO) { + gna_openmp_multithreading = true; + } else { + log << "SINGLE_THREAD should be YES/NO, but not" << value; + THROW_GNA_EXCEPTION << "SINGLE_THREAD should be YES/NO, but not" << value; + } + }); +} + +/** + * @deprecated Use the version with config parameter + */ +void GNAPlugin::QueryNetwork(const InferenceEngine::ICNNNetwork& network, + InferenceEngine::QueryNetworkResult& res) const { + QueryNetwork(network, {}, res); +} + +void GNAPlugin::QueryNetwork(const InferenceEngine::ICNNNetwork& network, + const std::map<std::string, std::string>& config, + InferenceEngine::QueryNetworkResult& res) const { + std::unordered_set<CNNLayer *> allLayers; + InferenceEngine::InputsDataMap inputs; + + network.getInputsInfo(inputs); + std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(network); + + if (inputs.empty()) { + THROW_GNA_EXCEPTION << "Network is empty (GNA)\n"; + } + + auto const & secondLayers = inputs.begin()->second->getInputData()->getInputTo(); + if (secondLayers.empty()) { + THROW_GNA_EXCEPTION << "Network consists of input layer only (GNA)\n"; + } + + InferenceEngine::details::UnorderedDFS(allLayers, + secondLayers.begin()->second, + [&](CNNLayerPtr const layer) { + if (GNAPluginNS::GNAPlugin::LayerTypeFromStr(layer->type) != NO_TYPE) { + res.supportedLayers.insert(layer->name); + } + }, false); + } intel_dnn_component_t * GNAPlugin::find_first_unused_input(InferenceEngine::CNNLayerPtr current) { if (current->insData.empty()) return nullptr; @@ -2076,7 +2437,7 @@ void GNAPlugin::connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr, voi } else { IE_ASSERT(nextMemoryLayer.reserved_size == ALIGN64(num_data_bytes_out)); // same offsets - gnamem->bind_ptr(ptr, &nextMemoryLayer.gna_ptr, nextMemoryLayer.reserved_offset); + gnamem->bind_ptr(ptr, &nextMemoryLayer.gna_ptr, 0); } return; } @@ -2119,6 +2480,13 @@ void GNAPlugin::connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr, voi }); if (included == concat_connection.end()) { gnamem->reserve_ptr(&concatLayerInfoItem.gna_ptr, ALIGN64(concatLayerInfoItem.reserved_size)); + + for (auto && inputLayer : concatLayerInfoItem.concatInputLayers) { + if ( InferenceEngine::details::CaselessEq<std::string>() + (inputLayer.name, "input") ) { + bytes_alllocated_for_input[inputLayer.name] = ALIGN64(concatLayerInfoItem.reserved_size) - inputLayer.offset; + } + } } concatLayerInfo->second.output_allocation_flag = true; } @@ -2158,7 +2526,15 @@ intel_dnn_component_t * GNAPlugin::findDnnLayer(CNNLayerPtr __layer) { return nullptr; } -GNAPlugin::ConnectionDetails GNAPlugin::connectInput(CNNLayerPtr layer, void *ptr, size_t num_data_bytes_in, size_t offset, int idx) { +std::vector<void *>& GNAPlugin::get_ptr_inputs_global(std::string name) { + if (!ptr_inputs_global_id.count(name)) { + ptr_inputs_global_storage.push_front({}); + ptr_inputs_global_id[name] = ptr_inputs_global_storage.begin(); + } + return *ptr_inputs_global_id[name]; +} + +GNAPlugin::ConnectionDetails GNAPlugin::connectInput(CNNLayerPtr layer, void *ptr, size_t num_data_bytes_in, int32_t offset, int idx) { // 
selecting particular input layers auto prevLayer = CNNNetPrevLayer(layer, idx); @@ -2166,15 +2542,24 @@ GNAPlugin::ConnectionDetails GNAPlugin::connectInput(CNNLayerPtr layer, void *pt // real input not a memory input if (LayerInfo(prevLayer).isInput()) { - if (0 == bytes_alllocated_for_input) { - gnamem->push_value(&ptr_inputs_global.front(), static_cast(0), num_data_bytes_in, 64); - bytes_alllocated_for_input = num_data_bytes_in; + if (0 == bytes_alllocated_for_input[prevLayer->name]) { + gnamem->push_value(&get_ptr_inputs_global(prevLayer->name).front(), static_cast(0), num_data_bytes_in, 64); + bytes_alllocated_for_input[prevLayer->name] = num_data_bytes_in; } - if (ALIGN(num_data_bytes_in, 64) > ALIGN(bytes_alllocated_for_input, 64)) { - THROW_IE_EXCEPTION << "Layer: " << layer->name << " Cannot bind pointer to already allocated input, due to size_allocated=" - << bytes_alllocated_for_input << ", and size_requested=" << num_data_bytes_in; + if (ALIGN(num_data_bytes_in, 64) > ALIGN(bytes_alllocated_for_input[prevLayer->name], 64)) { + THROW_GNA_EXCEPTION + << "Layer: " << layer->name + << " Cannot bind pointer to already allocated input(" << prevLayer->name + << "), due to size_allocated=" << bytes_alllocated_for_input[prevLayer->name] + << ", and size_requested=" << num_data_bytes_in; } - gnamem->bind_ptr(ptr, &ptr_inputs_global.front(), offset); + + if (offset >= 0) { + gnamem->bind_ptr(ptr, &get_ptr_inputs_global(prevLayer->name).front(), offset); + } else { + gnamem->bind_ptr(&get_ptr_inputs_global(prevLayer->name).front(), ptr, -offset); + } + return prevLayer; } @@ -2213,7 +2598,7 @@ GNAPlugin::ConnectionDetails GNAPlugin::connectInput(CNNLayerPtr layer, void *pt prevLayer->name); if (concatLayerInfo != concat_connection.end()) { auto & concatLayerInfoItem = concatLayerInfo->second; - // dnnLayer that is input for concat output layer + // dnnLayer that is input for concat layer gnamem->bind_ptr(ptr, &concatLayerInfoItem.gna_ptr, offset); // return layer over concat return CNNNetPrevLayer(prevLayer); diff --git a/inference-engine/src/gna_plugin/gna_plugin.hpp b/inference-engine/src/gna_plugin/gna_plugin.hpp index 53365d7..34bc866 100644 --- a/inference-engine/src/gna_plugin/gna_plugin.hpp +++ b/inference-engine/src/gna_plugin/gna_plugin.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -9,6 +9,7 @@ #include "gna_memory.hpp" #include "gna_device.hpp" #include +#include #include #include #include @@ -23,6 +24,7 @@ #include #include "gna_allocator.hpp" #include "gna_api_wrapper.hpp" +#include "gna_plugin_policy.hpp" namespace GNAPluginNS { @@ -49,9 +51,16 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: */ std::vector> nnets; - intel_dnn_orientation_t orientation_in = kDnnUnknownOrientation; + std::unordered_map orientation_in; intel_dnn_orientation_t orientation_out = kDnnUnknownOrientation; - double input_scale_factor = 1.0; + + /** + * temporary solution to support multiple scale factors + * @return + */ + float get_input_scale_factor() const; + std::unordered_map input_scale_factor; + double output_scale_factor = 1.0; uint32_t num_rotate_rows = 0; uint32_t num_rotate_columns = 0; @@ -60,11 +69,13 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: uint32_t num_feature_maps = 1; uint32_t num_memory_bytes; - std::vector ptr_inputs_global; + std::unordered_map>::iterator> ptr_inputs_global_id; + std::list> 
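// ptr_inputs_global_storage is deliberately a std::list rather than a std::vector: the
// iterators cached in ptr_inputs_global_id must stay valid as further named inputs are added,
// and list insertion never invalidates existing iterators. The idea in isolation (a sketch):
//
//     std::list<std::vector<void *>> storage;
//     std::unordered_map<std::string, std::list<std::vector<void *>>::iterator> byName;
//     storage.push_front({});
//     byName["input"] = storage.begin();   // remains valid after later push_front calls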
ptr_inputs_global_storage; + + std::vector& get_ptr_inputs_global(std::string name); + std::vector ptr_outputs_global; - int16_t *ptr_int_inputs = NULL; - int32_t *ptr_int_outputs = NULL; uint32_t *ptr_active_indices = NULL; uint32_t num_active_indices = 0; uint32_t num_group_in = 0; @@ -81,7 +92,7 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: InferenceEngine::Precision gnaPrecision = InferenceEngine::Precision::I16; bool performance_counting = false; - int bytes_alllocated_for_input = 0; + intel_dnn_number_type_t output_type = kDnnInt; std::string utterance_name; @@ -136,14 +147,13 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: * @deprecated Use the version with config parameter */ void QueryNetwork(const InferenceEngine::ICNNNetwork &network, - InferenceEngine::QueryNetworkResult &res) const override { } + InferenceEngine::QueryNetworkResult &res) const override; void QueryNetwork(const InferenceEngine::ICNNNetwork &network, const std::map& config, - InferenceEngine::QueryNetworkResult &res) const override { } + InferenceEngine::QueryNetworkResult &res) const override; uint32_t QueueInference(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result); void Wait(uint32_t idx = 0); - uint32_t QueueInference(const InferenceEngine::Blob &input, InferenceEngine::BlobMap &result); /** * * @param sync - points to gna sync point @@ -163,7 +173,7 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: /** * utility to provide input and output blobs externally to be used by InferenceEngine request API clients */ - InferenceEngine::Blob::Ptr GetInputBlob(InferenceEngine::Precision precision); + InferenceEngine::Blob::Ptr GetInputBlob(std::string name, InferenceEngine::Precision precision); InferenceEngine::Blob::Ptr GetOutputBlob(InferenceEngine::Precision precision); /** * helpers to provide inputs info on AOT network @@ -176,7 +186,13 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: */ std::vector QueryState(); + /** + * test-wise API + */ + void SetPolicy(Policy p) {policy = p;} + protected: + Policy policy; uint32_t num_cnn_rows_out = 0; bool done = false; std::string dumpXNNPath; @@ -185,6 +201,7 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: void DumpXNNToFile() const; void CreateLayerPrimitive(InferenceEngine::CNNLayerPtr); void AffinePrimitive(InferenceEngine::CNNLayerPtr, bool isDiag = false); + void AffineFilterPrimitive(InferenceEngine::CNNLayerPtr); void DiagonalPrimitive(InferenceEngine::CNNLayerPtr); void ConvolutionPrimitive(InferenceEngine::CNNLayerPtr); void PermutePrimitive(InferenceEngine::CNNLayerPtr); @@ -198,7 +215,7 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: void PWLPrimitive(InferenceEngine::CNNLayerPtr); void CopyPrimitive(InferenceEngine::CNNLayerPtr); bool AreLayersSupported(InferenceEngine::ICNNNetwork& network, std::string& errMessage); - LayerType LayerTypeFromStr(std::string const &str); + LayerType LayerTypeFromStr(std::string const &str) const; /** * maps tpe of connection to input and output layers also stores gna_pointer for alloc request */ @@ -272,7 +289,7 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: InferenceEngine::CNNLayerPtr getSplit() { return splitLayer; } /** - * gna memory of this size is reserved for concat + * gna memory of this size is reserved for split */ size_t reserved_size = 0; bool 
output_allocation_flag = false; @@ -318,16 +335,16 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: void *gna_ptr = nullptr; }; using MemoryConnection = std::list<std::pair<std::string, GNAMemoryLayer>>; - using ConcatConnection = std::map<std::string, GNAConcatLayer>; - using SplitConnection = std::map<std::string, GNASplitLayer>; - using CropConnection = std::map<std::string, GNACropLayer>; + using ConcatConnection = std::unordered_map<std::string, GNAConcatLayer>; + using SplitConnection = std::unordered_map<std::string, GNASplitLayer>; + using CropConnection = std::unordered_map<std::string, GNACropLayer>; // layers with extra storage for connections and additional // non trivial processing MemoryConnection memory_connection; ConcatConnection concat_connection; SplitConnection split_connection; CropConnection crop_connection; - void fillMemoryConnections(std::map> &memoryPairs); void fillConcatConnections(InferenceEngine::CNNLayerPtr layer); @@ -336,7 +353,7 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: * maps layer name to dnn.component, in topological sort prev nodes will be initialized */ using DnnComponentsForLayer = std::list<std::pair<std::string, intel_dnn_component_t>>; - std::list<std::pair<std::string, intel_dnn_component_t>> dnnComponentsForLayer; + DnnComponentsForLayer dnnComponentsForLayer; /** * @brief returns corresponding dnn layer for topology layer @@ -356,6 +373,15 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: std::unique_ptr gnamem; /** + * Fill in the Affine layer weights + * @param layer - affine layer pointer + * @param ptrWeights - pointer to weights memory + * @param offset - memory before offset value will be zeroed + * @param isQuantized - information about layer quantization + */ + void FillWeightOfAligningFilter(InferenceEngine::CNNLayerPtr layer, void* ptrWeights, size_t offset, bool isQuantized = false); + + /** * Connects either memory output, or generic output to a layer * @param layer - layer pointer * @param ptr - pointer to pointer where to store output layer information @@ -387,7 +413,7 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: ConnectionDetails connectInput(InferenceEngine::CNNLayerPtr layer, void *pVoid, size_t num_data_bytes_in, - size_t offset = 0, + int32_t offset = 0, int idx = 0); void ImportFrames(void *ptr_dst, @@ -438,18 +464,26 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: const GNASplitLayer& splitInfo, size_t precision_size); /** - * @brief GNA affine layers are always have activation atatched, while IR not - * @param net - copied net ready for quantisation + * @brief GNA affine layers always have an activation attached, while IR ones do not */ void insertIdentityLayer(std::vector<InferenceEngine::CNNLayerPtr> &layers); /** - * @brief GNA convolution layers have deinterleaved oriantations, while affine one doesn't + * @brief GNA cannot support broadcast - so we will tile weights and biases for scaleshift layer + */ + void substituteScaleShiftBroadCast(std::vector<InferenceEngine::CNNLayerPtr> &layers); + + + /** + * @brief GNA convolution layers have deinterleaved layout, while affine one doesn't so between convolution and affine layers permute layers need to be inserted, - * or removed if they are present in topology + * current MO approach is to insert such permutations + * since GNA-HW already supports conv->affine in permuted form, this pass inverses the MO behavior + * so it removes permutations of a certain form between conv->conv and conv->affine, + * and inserts a permutation between conv->affine if they are missed in the IR * @param layers */ - void applyOrientations(std::vector<InferenceEngine::CNNLayerPtr> &layers); + void reversePermutations(std::vector<InferenceEngine::CNNLayerPtr> &layers); /** @@ -477,9 +511,13 @@ class GNAPlugin : public
InferenceEngine::IInferencePluginInternal, public std:: */ void insertCopyLayer(std::vector<InferenceEngine::CNNLayerPtr> & layers); - intel_dnn_component_t * find_first_unused_input(InferenceEngine::CNNLayerPtr current); + /** + * aligned filter layer insertion required in cases when split/slice have output connections on non-aligned addresses + */ + void insertAligningFilterLayer(std::vector<InferenceEngine::CNNLayerPtr> & layers); - InferenceEngine::SizeVector inputDims; + intel_dnn_component_t * find_first_unused_input(InferenceEngine::CNNLayerPtr current); + std::map bytes_alllocated_for_input; InferenceEngine::InputsDataMap inputsDataMap; InferenceEngine::SizeVector outputDims; diff --git a/inference-engine/src/gna_plugin/gna_plugin_config.hpp b/inference-engine/src/gna_plugin/gna_plugin_config.hpp index f82e443..15a3436 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_config.hpp +++ b/inference-engine/src/gna_plugin/gna_plugin_config.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/gna_plugin_entry_points.cpp b/inference-engine/src/gna_plugin/gna_plugin_entry_points.cpp index d231274..96d4763 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_entry_points.cpp +++ b/inference-engine/src/gna_plugin/gna_plugin_entry_points.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -13,7 +13,7 @@ using namespace GNAPluginNS; INFERENCE_PLUGIN_API(StatusCode) CreatePluginEngine(IInferencePlugin *&plugin, ResponseDesc *resp) noexcept { try { - plugin = make_ie_compatible_plugin({1, 5, "GNAPlugin", "GNAPlugin"}, make_shared<GNAPluginInternal>()); + plugin = make_ie_compatible_plugin({1, 6, "GNAPlugin", "GNAPlugin"}, make_shared<GNAPluginInternal>()); return OK; } catch (std::exception &ex) { diff --git a/inference-engine/src/gna_plugin/gna_plugin_internal.hpp b/inference-engine/src/gna_plugin/gna_plugin_internal.hpp index 3c2dcf0..f23b938 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_internal.hpp +++ b/inference-engine/src/gna_plugin/gna_plugin_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -15,15 +15,38 @@ namespace GNAPluginNS { class GNAPluginInternal : public InferenceEngine::InferencePluginInternal { public: - InferenceEngine::ExecutableNetworkInternal::Ptr LoadExeNetworkImpl(InferenceEngine::ICNNNetwork &network, - const std::map<std::string, std::string> &config) override { + InferenceEngine::ExecutableNetworkInternal::Ptr LoadExeNetworkImpl( + InferenceEngine::ICNNNetwork &network, + const std::map<std::string, std::string> &config) override { return std::make_shared<GNAExecutableNetwork>(network, config); } - void SetConfig(const std::map<std::string, std::string> &config) override {} - InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName, - const std::map<std::string, std::string> &config) override { + void SetConfig(const std::map<std::string, std::string> &config) override { + auto plg = std::make_shared<GNAPlugin>(); + plg->SetConfig(config); + } + InferenceEngine::IExecutableNetwork::Ptr ImportNetwork( + const std::string &modelFileName, + const std::map<std::string, std::string> &config) override { return make_executable_network(std::make_shared<GNAExecutableNetwork>(modelFileName, config)); } + + /** + * @deprecated Use the version with config parameter + */ + void QueryNetwork(const InferenceEngine::ICNNNetwork& network, + InferenceEngine::QueryNetworkResult& res) const override { + auto plg = std::make_shared<GNAPlugin>(); + plg->QueryNetwork(network, {}, res); + } + void 
QueryNetwork(const InferenceEngine::ICNNNetwork& network, + const std::map& config, + InferenceEngine::QueryNetworkResult& res) const override { + auto plg = std::make_shared(); + try { + plg->SetConfig(config); + } catch (InferenceEngine::details::InferenceEngineException& e) {} + plg->QueryNetwork(network, config, res); + } }; } // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/gna_plugin_log.hpp b/inference-engine/src/gna_plugin/gna_plugin_log.hpp index 08f45ad..6905f66 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_log.hpp +++ b/inference-engine/src/gna_plugin/gna_plugin_log.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/gna_plugin_passes.cpp b/inference-engine/src/gna_plugin/gna_plugin_passes.cpp index 79d42d2..22cf3c0 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_passes.cpp +++ b/inference-engine/src/gna_plugin/gna_plugin_passes.cpp @@ -1,11 +1,15 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +#include "gna_plugin_policy.hpp" #include #include #include #include +#include +#include +#include #include #include "gna_plugin.hpp" @@ -13,11 +17,12 @@ using namespace InferenceEngine; -using namespace std; +using namespace InferenceEngine::details; using namespace GNAPluginNS; void GNAPlugin::insertDiagonalLayer(std::vector & layers) { int numOfDiagLayers = 0; + auto quantized = InferenceEngine::getInjectedData(layers.front()); for (auto & l : layers) { if (l->insData.empty()) continue; auto prevLayer = CNNNetPrevLayer(l); @@ -51,18 +56,20 @@ void GNAPlugin::insertDiagonalLayer(std::vector & layers) { #endif // actual insertion auto diagName = std::string("SyntheticScaleShift_") + std::to_string(numOfDiagLayers++); - auto diagLayer = make_shared(LayerParams({diagName, "ScaleShift", Precision::FP32})); + auto diagLayer = std::make_shared(LayerParams({diagName, "ScaleShift", Precision::FP32})); // TODO: diagonal size std::vector arrayOf1(l->outData[0]->dims[0], 1.f); - diagLayer->_weights = make_shared_blob(l->outData[0]->precision, Layout::C, arrayOf1);; + diagLayer->_weights = make_shared_blob(l->outData[0]->precision, Layout::C, arrayOf1); auto newDims = l->outData[0]->dims; auto dataPtr = std::make_shared(diagName, newDims, l->outData[0]->precision, l->outData[0]->layout); - auto diagonalWithQuant = InferenceEngine::injectData(diagLayer); + auto diagonalWithQuant = quantized ? 
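// The ternary below attaches QuantizedLayerParams to the synthetic layer only when the source
// network itself carries quantization data; in the float path the bare layer is used. The
// pattern in isolation (a sketch):
//
//     auto withQuant = quantized
//         ? InferenceEngine::injectData<QuantizedLayerParams>(newLayer)
//         : newLayer;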
+ InferenceEngine::injectData<QuantizedLayerParams>(diagLayer) : + diagLayer; dataPtr->creatorLayer = diagonalWithQuant; diagonalWithQuant->outData.push_back(dataPtr); @@ -93,7 +100,7 @@ void GNAPlugin::reorderMaxPool(std::vector & layer } std::vector<CNNLayerPtr> GNAPlugin::getCandidatesForIdentityInsertion(const CNNLayerPtr l) { - vector prevLayers; + std::vector<CNNLayerPtr> prevLayers; // skipping memory inputs and true inputs layers if (l->insData.empty()) return {}; @@ -199,8 +206,8 @@ void GNAPlugin::substitutePRelu(std::vector &layer auto relu1 = outputLayers.begin()->second; auto neg1 = (++outputLayers.begin())->second; if (second.isRelu()) { - swap(first, second); - swap(relu1, neg1); + std::swap(first, second); + std::swap(relu1, neg1); } if (!first.isRelu()) continue; // now we have relu as first layer, lets check second @@ -254,11 +261,108 @@ void GNAPlugin::substitutePRelu(std::vector &layer } } -void GNAPlugin::applyOrientations(std::vector<CNNLayerPtr> & layers) { +void GNAPlugin::reversePermutations(std::vector<CNNLayerPtr> &layers) { + std::function<CNNLayerPtr(CNNLayerPtr, std::function<bool(CNNLayerPtr)>)> prevLayerSkipCertain + = [&prevLayerSkipCertain](CNNLayerPtr layer, std::function<bool(CNNLayerPtr)> shouldSkip) -> CNNLayerPtr { + if (!CNNNetHasPrevLayer(layer.get())) { + return nullptr; + } + auto prev = CNNNetPrevLayer(layer); + + if (shouldSkip(prev)) return prevLayerSkipCertain(prev, shouldSkip); + + return prev; + }; + + auto prevLayerSkipReshape = [&prevLayerSkipCertain](CNNLayerPtr layer) -> CNNLayerPtr { + return prevLayerSkipCertain(layer, [] (CNNLayerPtr l2) { + return LayerInfo(l2).isReshape(); + }); + }; + + + std::function<CNNLayerPtr(CNNLayerPtr)> nextLayerSkipReshape = [&nextLayerSkipReshape](CNNLayerPtr layer) -> CNNLayerPtr { + if (layer->outData.empty()) { + return nullptr; + } + if (layer->outData.front()->inputTo.size() != 1) { + return nullptr; + } + auto next = layer->outData.front()->inputTo.begin()->second; + + if (LayerInfo(next).isReshape()) return nextLayerSkipReshape(next); + + return next; + }; + + auto prevConv = [&prevLayerSkipCertain](CNNLayerPtr layer) -> CNNLayerPtr { + return prevLayerSkipCertain(layer, [] (CNNLayerPtr l2) { + return + LayerInfo(l2).isReshape() || + LayerInfo(l2).isPooling() || + LayerInfo(l2).isActivation(); + }); + }; + + std::unordered_set<std::string> affineWithPermutedWeights; + std::list<CNNLayerPtr> permutationsToRemove; + + for (auto & l : layers) { + if (!LayerInfo(l).isPermute()) { + continue; + } + + auto layerOrder = l->GetParamAsInts("order"); + + if (layerOrder != std::vector<int>({0, 3, 2, 1})) { + THROW_GNA_EXCEPTION << "Unsupported permute layer: " << l->name << ", order: was " << l->GetParamAsString("order") << + ", but supported order is 0,3,2,1"; + } + + // search for its input convolution + auto prev = prevConv(l); + + // pooling is not used in speech models without convolution + if (!prev) { + THROW_GNA_EXCEPTION << "Unsupported permute layer: " << l->name << ", no valid input to that layer"; + } + + // we can remove that permutation if it is input to ScaleShift or FC layer + auto next = nextLayerSkipReshape(l); + if (!next || !LayerInfo(next).isFullyConnected()) { + THROW_GNA_EXCEPTION << "Unsupported permute layer: " << l->name << ", no valid output of that layer"; + } + + permutationsToRemove.push_back(l); + + // removing that permutation layer and saving information about affine + affineWithPermutedWeights.insert(next->name); + } + + for (auto && toRemove : permutationsToRemove) { + CNNNetworkRemoveLayer(toRemove); + } + + // search for conv->affine sequences + for (auto & l : layers) { + if (!LayerInfo(l).isFullyConnected() || 0 != affineWithPermutedWeights.count(l->name)) { + continue; + } + // 
found an affine layer that is not involved in permutation removal + // searching whether it has direct input from convolution + auto prevConvLayer = prevConv(l); + if (!prevConvLayer) continue; + + auto directPrev = CNNNetPrevLayer(l); + + // TODO : make new permute + CNNNetworkInsertLayer(l, directPrev, CNNLayerPtr(nullptr)); + } } void GNAPlugin::insertIdentityLayer(std::vector<CNNLayerPtr> &layers) { int numOfIdentityLayers = 0; + auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layers.front()); for (auto & l : layers) { for (auto && prev : getCandidatesForIdentityInsertion(l)) { // actual insertion @@ -267,7 +371,7 @@ void GNAPlugin::insertIdentityLayer(std::vector &layers) { gnalog() << "Inserted "<< activationName << " between: " << prev->name << " and " << l->name << "\n" << std::flush; CNNLayerPtr activationLayer = - make_shared<GenericLayer>(LayerParams({activationName, "identity", Precision::FP32})); + std::make_shared<GenericLayer>(LayerParams({activationName, "identity", Precision::FP32})); auto inputData = l->insData[0].lock(); auto newDims = inputData->dims; std::reverse(begin(newDims), end(newDims)); @@ -276,8 +380,9 @@ void GNAPlugin::insertIdentityLayer(std::vector &layers) { TensorDesc(inputData->precision, newDims, inputData->layout)); - - auto activationLayerWithQuant = InferenceEngine::injectData<QuantizedLayerParams>(activationLayer); + auto activationLayerWithQuant = quantized ? + InferenceEngine::injectData<QuantizedLayerParams>(activationLayer) : + activationLayer; dataPtr->creatorLayer = activationLayerWithQuant; activationLayerWithQuant->outData.push_back(dataPtr); // whether 1 identity or all outputs; TODO: possible grouping here, need to implement a special grouped inserter @@ -299,6 +404,7 @@ void GNAPlugin::insertIdentityLayer(std::vector &layers) { void GNAPlugin::insertCopyLayer(std::vector<CNNLayerPtr> & layers) { int numCopyLayers = 0; + auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layers.front()); for (auto & l : layers) { if (l->insData.empty()) continue; auto prevLayer = CNNNetPrevLayer(l); @@ -317,7 +423,7 @@ void GNAPlugin::insertCopyLayer(std::vector & laye gnalog() << "Inserted "<< copyName << " between: " << l->name << " and " << prevLayer->name << "\n" << std::flush; CNNLayerPtr copyLayer = - make_shared<GenericLayer>(LayerParams({copyName, "Copy", Precision::FP32})); + std::make_shared<GenericLayer>(LayerParams({copyName, "Copy", Precision::FP32})); auto inputData = l->insData[0].lock(); auto newDims = inputData->dims; @@ -329,10 +435,174 @@ void GNAPlugin::insertCopyLayer(std::vector & laye newDims, inputData->layout)); - auto copyWithQuant = InferenceEngine::injectData<QuantizedLayerParams>(copyLayer); + auto copyWithQuant = quantized ? + InferenceEngine::injectData<QuantizedLayerParams>(copyLayer) : + copyLayer; dataPtr->creatorLayer = copyWithQuant; copyWithQuant->outData.push_back(dataPtr); CNNNetworkInsertLayer(prevLayer, l, copyWithQuant); } } } + +void GNAPlugin::insertAligningFilterLayer(std::vector<CNNLayerPtr> & layers) { + // currently split layer only supports 2 bytes in int16 and int8 mode. 
In fp32 mode this is not necessary but useful for testing + const int bytesPerSplitElement = 2; + auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layers.front()); + + int numOfFilterLayers = 0; + for (auto &l : layers) { + auto info = LayerInfo(l); + if (!info.isSplit() && !info.isSlice()) { + continue; + } + + size_t currentOffset = 0; + int splitOutIndex = 0; + for (auto &&splitOutput : l->outData) { + auto outputSize = product(++begin(splitOutput->getDims()), end(splitOutput->getDims())); + + if (currentOffset != ALIGN64(currentOffset)) { + // this split output does not begin on a 64-byte aligned boundary - need to correct this by inserting an aligning filter layer +#ifdef PLOT + // getting list of layers attached to current split output + gnalog() << "Inserted Affine Filter Layer between: " << l->name << " and : \n"; + for (auto &&followingLayers : splitOutput->getInputTo()) { + gnalog() << " " << followingLayers.second->name << "\n"; + } + gnalog() << std::flush; +#endif + // insert the filter + auto filterName = std::string("AlignFilter_") + std::to_string(numOfFilterLayers++); + auto filterLayer = + std::make_shared<WeightableLayer>(LayerParams({filterName, "AffineFilter", Precision::FP32})); + + + auto inputData = splitOutput; + auto newDims = splitOutput->dims; + + size_t aligned64_offset = std::max(0, static_cast<int>(ALIGN64(currentOffset) - 64)); + size_t newOutputSize = (currentOffset + ALIGN(outputSize, 8) * bytesPerSplitElement - aligned64_offset) + / bytesPerSplitElement; + + // encodes offset to beginning of split layer input + filterLayer->params["offset"] = std::to_string(aligned64_offset); + + auto &num_rows_out = splitOutput->dims[0]; + + std::vector<float> filterWeights(newOutputSize * num_rows_out, 0.f); + + auto offset = (currentOffset - aligned64_offset) / bytesPerSplitElement; + + for (int i = 0; i != outputSize; i++) { + filterWeights[offset] = 1.0f; + offset += newOutputSize + 1; + } + + filterLayer->_weights = make_shared_blob<float>(inputData->precision, Layout::C, filterWeights); + + std::reverse(begin(newDims), end(newDims)); + + auto outData = std::make_shared<Data>(filterName, + TensorDesc(splitOutput->precision, + newDims, + inputData->layout)); + + auto filterWithQuant = quantized ? 
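// The weight fill above builds a shifted identity matrix: with off = (currentOffset -
// aligned64_offset) / bytesPerSplitElement, row i of the filter selects element off + i of the
// widened, 64-byte-aligned input, so the filter output reproduces this split output starting
// from an aligned address. In flat row-major form (stride newOutputSize) that is:
//
//     // W[i][off + i] = 1.0f  <=>  filterWeights[i * newOutputSize + off + i]
//     //                       ==   filterWeights[off + i * (newOutputSize + 1)]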
+ InferenceEngine::injectData(filterLayer) : + filterLayer; + outData->creatorLayer = filterWithQuant; + filterWithQuant->outData.push_back(outData); + CNNNetworkInsertLayer(l, nullptr, filterWithQuant, splitOutIndex); + } + + + // search data that starts from unaligned location + currentOffset += outputSize * bytesPerSplitElement; + splitOutIndex++; + } + } +} + +void GNAPlugin::substituteScaleShiftBroadCast(std::vector &layers) { + auto quantized = InferenceEngine::getInjectedData(layers.front()); + for (auto & l : layers) { + LayerInfo layerInfo(l); + + if (!layerInfo.isScaleShift()) { + continue; + } + + auto scaleShift = layerInfo.as(); + + auto insData = scaleShift->insData.front().lock(); + if (!insData) { + THROW_GNA_EXCEPTION << "Cannot get inputs data for layer: " << l->name; + } + + if (insData->getDims().size() <= 2) { + // NC or C cannot do broadcast + continue; + } + auto batchSize = insData->getDims()[0]; + auto nElements = details::product(insData->getDims()) / batchSize; + auto weightsElements = scaleShift->_weights->size(); + auto weightsBytes = scaleShift->_weights->byteSize(); + + if (nElements == weightsElements) { + continue; + } + + // only 3d scaleshift supported where number of c is arbitrary + auto lastD = insData->getDims()[insData->getDims().size() - 1]; + if (lastD != weightsElements) { + THROW_GNA_EXCEPTION << "Unsupported layer: " << l->name + << " should have last dim(" << lastD << ") equal to weights(" << weightsElements << ") length"; + } + if (insData->getDims().size() == 2) { + THROW_GNA_EXCEPTION << "For layer: " << l->name + << " weights size(" << weightsElements<< ") invalid: should match input size of(" << lastD << ")"; + } + + gnalog() << "Substitution ScaleShift broadcast for layer: " << l->name << "\n"; + // approach 1 - weights tiling + if (policy.ScaleShiftPolicy == Policy::WEIGHTS_TILING) { + auto tileBlob = [](Blob::Ptr &blob, size_t TileTo){ + auto weightsElements = blob->size(); + auto weightsBytes = blob->byteSize(); + if (weightsElements == 0) { + THROW_IE_EXCEPTION << "Blob size is 0"; + } + if (TileTo % weightsElements) { + return false; + } + + auto tiledBlob = make_plain_blob(blob->getTensorDesc().getPrecision(), {TileTo}); + tiledBlob->allocate(); + + + for (int i=0; i != TileTo / weightsElements; i++) { + ie_memcpy(tiledBlob->buffer().as() + i * weightsBytes, weightsBytes, blob->cbuffer(), weightsBytes); + } + blob = tiledBlob; + return true; + }; + + if (!tileBlob(scaleShift->_weights, nElements)) { + THROW_GNA_EXCEPTION << "Cannot tile weights for layer: " << l->name << ", due to weights size not GCD of dims product"; + } + if (scaleShift->_biases) { + if (!tileBlob(scaleShift->_biases, nElements)) { + THROW_GNA_EXCEPTION << "Cannot tile biases for layer: " << l->name << ", due to biases size not GCD of dims product"; + } + } + + // currently data type no providing reshape method of tensor desc + scaleShift->outData.front()->reshape({batchSize, nElements}, Layout::NC); + insData->reshape({batchSize, nElements}, Layout::NC); + } else { + THROW_GNA_EXCEPTION << "Not implemented substitution of scaleshift broadcast policy of " + << policy.ScaleShiftPolicy << "using layers tiling, layer: " << l->name; + } + } +} \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/gna_plugin_policy.hpp b/inference-engine/src/gna_plugin/gna_plugin_policy.hpp new file mode 100644 index 0000000..1d499c4 --- /dev/null +++ b/inference-engine/src/gna_plugin/gna_plugin_policy.hpp @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2019 Intel 
Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + + +namespace GNAPluginNS { +/** + * @brief policy aggregates various settings that cannot be tweaked using configuration options right now, + * and is essential to keep test coverage for options in both on and off cases + */ +class Policy { + public: + /** + * @brief for scaleshift substitution, weight tiling simplifies the final graph but has extra weights overhead; + * if not set, scaleshift broadcast will result in creating multiple diagonal layers instead of weight tiling + */ + enum { + WEIGHTS_TILING, + /** + * GNA has a limited batch size, so even existing topologies cannot be substituted with batching only; + * this option combines batch and weights tiling + */ + BATCH_AND_WEIGHTS_TILING, + DIAGLAYER_TILING + } ScaleShiftPolicy = WEIGHTS_TILING; + + /** + * Policy on whether to substitute permute layers or not + */ + enum { + DISABLED, + AUTO_PERMUTE + } PermutePolicy = DISABLED; +}; + +} // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/lstm.cpp b/inference-engine/src/gna_plugin/lstm.cpp index 53906e6..e1c0f7e 100644 --- a/inference-engine/src/gna_plugin/lstm.cpp +++ b/inference-engine/src/gna_plugin/lstm.cpp @@ -1,6 +1,8 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +// lstm.cpp : GNA LSTM macro layer definition +// #include "lstm.hpp" diff --git a/inference-engine/src/gna_plugin/lstm.hpp b/inference-engine/src/gna_plugin/lstm.hpp index 6ce8f10..87f96bc 100644 --- a/inference-engine/src/gna_plugin/lstm.hpp +++ b/inference-engine/src/gna_plugin/lstm.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/polymorh_allocator.hpp b/inference-engine/src/gna_plugin/polymorh_allocator.hpp index d50d8a3..6742ba3 100644 --- a/inference-engine/src/gna_plugin/polymorh_allocator.hpp +++ b/inference-engine/src/gna_plugin/polymorh_allocator.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/pwl.h b/inference-engine/src/gna_plugin/pwl.h index fd45903..061dd56 100644 --- a/inference-engine/src/gna_plugin/pwl.h +++ b/inference-engine/src/gna_plugin/pwl.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/pwl_design.cpp b/inference-engine/src/gna_plugin/pwl_design.cpp index 1f325ba..2d150df 100644 --- a/inference-engine/src/gna_plugin/pwl_design.cpp +++ b/inference-engine/src/gna_plugin/pwl_design.cpp @@ -1,6 +1,8 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +// pwl_design.cpp : simple activation function designer +// #include "pwl.h" #include "gna_plugin_log.hpp" diff --git a/inference-engine/src/gna_plugin/quantization/layer_quantizer.hpp b/inference-engine/src/gna_plugin/quantization/layer_quantizer.hpp index 6c42d92..442be42 100644 --- a/inference-engine/src/gna_plugin/quantization/layer_quantizer.hpp +++ b/inference-engine/src/gna_plugin/quantization/layer_quantizer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -199,6 
+199,11 @@ inline void quantizeWeightsBiases(const QuantDesc & quantDesc, uint32_t num_rows = isDiagonal ? 1 : wl->outData[0]->getDims()[1]; uint32_t num_columns = wl->insData[0].lock().get()->getDims()[1]; + if (wl->type == "AffineFilter") { + // for affine filter layer insdata size not equal to actual coded in input layer + num_columns = wl->_weights->size() / num_rows; + } + if (isDiagonal) { std::swap(num_rows, num_columns); } diff --git a/inference-engine/src/gna_plugin/quantization/model_quantizer.hpp b/inference-engine/src/gna_plugin/quantization/model_quantizer.hpp index 797c87c..c0f1852 100644 --- a/inference-engine/src/gna_plugin/quantization/model_quantizer.hpp +++ b/inference-engine/src/gna_plugin/quantization/model_quantizer.hpp @@ -1,10 +1,8 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once - -#pragma once #include #include "gna_plugin_config.hpp" #include "layer_transform.hpp" @@ -49,7 +47,7 @@ class ModelQuantizer { gnalog() << layer->name << std::endl; } - // weights scale is a hint, not all weightable layer preserve it in all possible precisions + // weights scale is a hint, not all weightable layers preserve it in all possible precisions propagateScaleFactor(sortedNewNet, T::mandatory().getWeightsPrecision().size(), scaleFactor); // sorted order gives possibility for propagate quantisation along depended layers diff --git a/inference-engine/src/gna_plugin/quantization/precision_ex.hpp b/inference-engine/src/gna_plugin/quantization/precision_ex.hpp index 798345e..c3782fb 100644 --- a/inference-engine/src/gna_plugin/quantization/precision_ex.hpp +++ b/inference-engine/src/gna_plugin/quantization/precision_ex.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/quantization/quantization.cpp b/inference-engine/src/gna_plugin/quantization/quantization.cpp index 457bff9..1609d5d 100644 --- a/inference-engine/src/gna_plugin/quantization/quantization.cpp +++ b/inference-engine/src/gna_plugin/quantization/quantization.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/quantization/quantization.h b/inference-engine/src/gna_plugin/quantization/quantization.h index bd1ff7b..8e704fd 100644 --- a/inference-engine/src/gna_plugin/quantization/quantization.h +++ b/inference-engine/src/gna_plugin/quantization/quantization.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/quantization/quantized_layer_params.hpp b/inference-engine/src/gna_plugin/quantization/quantized_layer_params.hpp index 347102b..aaa53c9 100644 --- a/inference-engine/src/gna_plugin/quantization/quantized_layer_params.hpp +++ b/inference-engine/src/gna_plugin/quantization/quantized_layer_params.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/gna_plugin/quantization/scale_factor_calc.hpp b/inference-engine/src/gna_plugin/quantization/scale_factor_calc.hpp index a3ba22c..1585463 100644 --- a/inference-engine/src/gna_plugin/quantization/scale_factor_calc.hpp +++ 
b/inference-engine/src/gna_plugin/quantization/scale_factor_calc.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -8,6 +8,7 @@ #include #include #include +#include #include "gna_layer_info.hpp" #include "ie_layers.h" #include "gna_plugin_log.hpp" @@ -53,6 +54,25 @@ class ScaleFactorPerLayer { const float identity_scale_factor = 2049.0f; const float k = 5; const float k_identity = 6; + + protected : + static bool fp32eq(float p1, float p2) { + return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2))); + } + float getActivationScale(GNAPluginNS::LayerInfo const& layer, QuantizedLayerParams const* quantizedParams) { + // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights + // set the initial value + float result = 1.0f; + result = (layer.isIdentity()) ? identity_scale_factor : activation_scale_factor; + // if activation is one from the relu family, we need to apply a heuristic to avoid activation output overflow + if (layer.isRelu() && + static_cast<uint64_t>(result * quantizedParams->_src_quant.scale) + > std::numeric_limits<int32_t>::max()-1) { + result = (result * 0.5); + } + return result; + } + public : bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) { if ( !cnnLayer ) { @@ -62,21 +82,43 @@ class ScaleFactorPerLayer { // TODO: current approach sets input scale factor for true input layer(s) equal to the provided factor, auto quant = getInjectedData<QuantizedLayerParams>(*cnnLayer); if (InferenceEngine::details::CaselessEq<std::string>()(cnnLayer->type, "Memory")) { - // for memory output layer need to verify it's input scale factor - if (CNNNetHasPrevLayer(cnnLayer)) { + if (CNNNetHasPrevLayer(cnnLayer)) { auto prevLayer = CNNNetPrevLayer(cnnLayer); + auto prevInfo = LayerInfo(prevLayer); auto inputQuant = getInjectedData<QuantizedLayerParams>(prevLayer); - if (inputQuant->_dst_quant.scale != activation_scale_factor) { - gnawarn() << "[WARNING] quantization error : input scale factor ( " << inputQuant->_dst_quant.scale <<") " - << " for " << cnnLayer->name << ", that is child of " << prevLayer->name <<" doesnt match : " - << activation_scale_factor << std::endl; - inputQuant->_dst_quant.scale = activation_scale_factor; - // restarting from that activation; - result = ScaleFactorUpdateResult(prevLayer.get()); + // locating corresponding memory layers with the same ID + for (auto && input : CNNNetGetAllInputLayers(cnnLayer)) { + LayerInfo ll(input); + if (!ll.isMemory() || + !InferenceEngine::details::CaselessEq<std::string>()(input->params["id"], cnnLayer->params["id"])) { + continue; + } + + auto quantSibling = getInjectedData<QuantizedLayerParams>(input); + + // after restarting from memory input - quant is fine + if (fp32eq(quantSibling->_dst_quant.scale, inputQuant->_dst_quant.scale)) { + quant->_src_quant.scale = quant->_dst_quant.scale = inputQuant->_dst_quant.scale; + return true; + } + + if (!fp32eq(quantSibling->_dst_quant.scale, 1)) { + // means we already restarted propagation from that memory layer - we cannot do much here + THROW_GNA_EXCEPTION << "quantization error : input scale factor ( " << inputQuant->_dst_quant.scale <<") " + << " for " << cnnLayer->name << ", that is child of " << prevLayer->name <<" doesn't match : " + << activation_scale_factor; + } + + gnawarn() << "[INFO] quantization : input scale factor (" << inputQuant->_dst_quant.scale <<")" + << " for " << cnnLayer->name << ", that is child of " << prevLayer->name <<" doesn't match : 
" + << activation_scale_factor << ", restarting from corresponding memory: "<< input->name << std::endl; + + // try updating memory input layer scale factor and restart from it + quantSibling->_src_quant.scale = quantSibling->_dst_quant.scale = inputQuant->_dst_quant.scale; + result = ScaleFactorUpdateResult(input.get()); return true; } } - quant->_src_quant.scale = quant->_dst_quant.scale = activation_scale_factor; return true; } @@ -93,13 +135,7 @@ class ScaleFactorPerLayer { if (layerInfo.isActivation()) { // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights // set the initial value - quant->_dst_quant.scale = layerInfo.isIdentity() ? identity_scale_factor:activation_scale_factor; - // if activation is one from relu family, we need to apply heuruistic to avoid activation output overflow - if (layerInfo.isRelu() && - static_cast(quant->_dst_quant.scale * quant->_src_quant.scale) - > std::numeric_limits::max()-1) { - quant->_dst_quant.scale = (quant->_dst_quant.scale * 0.5); - } + quant->_dst_quant.scale = getActivationScale(layerInfo, quant); } return true; } @@ -170,7 +206,7 @@ class ScaleFactorPerLayer { } // if we are here it means that we are in the port 1 - if (info.isFullyConnected() || info.isConvolutional()) { + if (info.isFullyConnected() || info.isConvolution()) { auto quantDataForInputLayer = InferenceEngine::getInjectedData(*in); auto newOutputScale = quantParams->_dst_quant.scale * maxValue; auto newWeightScale = newOutputScale / quantDataForInputLayer->_src_quant.scale; @@ -193,6 +229,53 @@ class ScaleFactorPerLayer { }; template<> +class ScaleFactorPerLayer { + public: + bool operator()(InferenceEngine::ConcatLayer* concatLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) { + if ( !concatLayer ) { + THROW_GNA_EXCEPTION << "Incorrect Concat Layer pointer \n"; + } + auto in0 = InferenceEngine::CNNNetPrevLayer(concatLayer, 0); + auto in1 = InferenceEngine::CNNNetPrevLayer(concatLayer, 1); + auto infoIn0 = LayerInfo(in0); + auto infoIn1 = LayerInfo(in1); + auto quantParams0 = InferenceEngine::getInjectedData(in0); + auto quantParams1 = InferenceEngine::getInjectedData(in1); + GNAPluginNS::QuantizedLayerParams* sourceQuantParams = NULL; + auto quantData = InferenceEngine::getInjectedData(*concatLayer); + + if (quantParams0->_dst_quant.scale == quantParams1->_dst_quant.scale) { + return true; + } else if (infoIn0.isInput() && infoIn1.isInput()) { + THROW_GNA_EXCEPTION << "Two Input layers has different scales in concat!!! \n"; + } + + int i = 0; + if (infoIn0.isInput()) { + sourceQuantParams = quantParams0; + } else if (infoIn1.isInput()) { + ++i; + sourceQuantParams = quantParams1; + } + + if (!sourceQuantParams) { + THROW_GNA_EXCEPTION << "Concat quantization for this case need to be implemented!!! 
\n"; + } + auto destinationQuantParams = + InferenceEngine::getInjectedData(InferenceEngine::CNNNetPrevLayer(concatLayer, !i)); + InferenceEngine::CNNLayerPtr in = InferenceEngine::CNNNetPrevLayer(concatLayer, !i); + + quantData->_dst_quant.scale = sourceQuantParams->_dst_quant.scale; + quantData->_src_quant.scale = sourceQuantParams->_dst_quant.scale; + + destinationQuantParams->_dst_quant.scale = sourceQuantParams->_dst_quant.scale; + result = ScaleFactorUpdateResult(in.get()); + + return true; + } +}; + +template<> class ScaleFactorPerLayer { private: float const _scale_reduction_50 = 0.50; diff --git a/inference-engine/src/gna_plugin/util.cpp b/inference-engine/src/gna_plugin/util.cpp index c10e317..e6f5776 100644 --- a/inference-engine/src/gna_plugin/util.cpp +++ b/inference-engine/src/gna_plugin/util.cpp @@ -1,6 +1,8 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +// util.cpp : various utility functions for debugging, file i/o, etc. +// #include #ifndef _WIN32 diff --git a/inference-engine/src/gna_plugin/util.h b/inference-engine/src/gna_plugin/util.h index 0838bd2..523d35e 100644 --- a/inference-engine/src/gna_plugin/util.h +++ b/inference-engine/src/gna_plugin/util.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/hetero_plugin/CMakeLists.txt b/inference-engine/src/hetero_plugin/CMakeLists.txt index 7456834..a073998 100644 --- a/inference-engine/src/hetero_plugin/CMakeLists.txt +++ b/inference-engine/src/hetero_plugin/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # @@ -25,3 +25,5 @@ add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_PLUGIN) add_library(${TARGET_NAME} SHARED ${SOURCES} ${HEADERS}) target_link_libraries(${TARGET_NAME} inference_engine ${INTEL_ITT_LIBS}) set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}) + +add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME}) diff --git a/inference-engine/src/hetero_plugin/fallback_policy.cpp b/inference-engine/src/hetero_plugin/fallback_policy.cpp index bc278f1..9288db7 100644 --- a/inference-engine/src/hetero_plugin/fallback_policy.cpp +++ b/inference-engine/src/hetero_plugin/fallback_policy.cpp @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright (C) 2018-2019 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. 
// #include "fallback_policy.h" diff --git a/inference-engine/src/hetero_plugin/fallback_policy.h b/inference-engine/src/hetero_plugin/fallback_policy.h index 59f112a..5547ee8 100644 --- a/inference-engine/src/hetero_plugin/fallback_policy.h +++ b/inference-engine/src/hetero_plugin/fallback_policy.h @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright (C) 2018-2019 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // #pragma once diff --git a/inference-engine/src/hetero_plugin/hetero_async_infer_request.cpp b/inference-engine/src/hetero_plugin/hetero_async_infer_request.cpp index 3fa1e8e..5aa360b 100644 --- a/inference-engine/src/hetero_plugin/hetero_async_infer_request.cpp +++ b/inference-engine/src/hetero_plugin/hetero_async_infer_request.cpp @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2017-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // #include "hetero_async_infer_request.h" diff --git a/inference-engine/src/hetero_plugin/hetero_async_infer_request.h b/inference-engine/src/hetero_plugin/hetero_async_infer_request.h index 3532765..d09ada9 100644 --- a/inference-engine/src/hetero_plugin/hetero_async_infer_request.h +++ b/inference-engine/src/hetero_plugin/hetero_async_infer_request.h @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2017-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. 
// /** diff --git a/inference-engine/src/hetero_plugin/hetero_device_loader.cpp b/inference-engine/src/hetero_plugin/hetero_device_loader.cpp index 79728a9..589388e 100644 --- a/inference-engine/src/hetero_plugin/hetero_device_loader.cpp +++ b/inference-engine/src/hetero_plugin/hetero_device_loader.cpp @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright (C) 2018-2019 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // #include "hetero_device_loader.h" diff --git a/inference-engine/src/hetero_plugin/hetero_device_loader.h b/inference-engine/src/hetero_plugin/hetero_device_loader.h index e8fbab4..f9b9e4c 100644 --- a/inference-engine/src/hetero_plugin/hetero_device_loader.h +++ b/inference-engine/src/hetero_plugin/hetero_device_loader.h @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright (C) 2018-2019 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // #pragma once diff --git a/inference-engine/src/hetero_plugin/hetero_executable_network.cpp b/inference-engine/src/hetero_plugin/hetero_executable_network.cpp index 1192abb..b6f4286 100644 --- a/inference-engine/src/hetero_plugin/hetero_executable_network.cpp +++ b/inference-engine/src/hetero_plugin/hetero_executable_network.cpp @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright (C) 2018-2019 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. 
// #include "hetero_executable_network.h" @@ -208,10 +220,17 @@ void HeteroExecutableNetwork::load(InferenceEngine::ICNNNetwork &network_, _deviceLoaders[affinity]->SetLogCallback(*listener); } + InferenceEngine::ICNNNetworkStats* networkStats = nullptr; + if (StatusCode::OK != network.getStats(&networkStats, nullptr)) { + networkStats = nullptr; + } + + for (auto &&subgraph : subgraphs) { auto affinity = (*subgraph.begin())->affinity; tempLayers.assign(subgraph.begin(), subgraph.end()); - auto tempNetwork = cloneNet(tempLayers); + auto tempNetwork = cloneNet(tempLayers, networkStats); + tempNetwork->setName(network.getName() + "_" + std::to_string(std::distance(subgraphs.data(), &subgraph))); // restoring some outputs from original net if they are not marked as output automatically // this might happen if output was set manually for origin network and // it doesn't go to next subgraph diff --git a/inference-engine/src/hetero_plugin/hetero_executable_network.h b/inference-engine/src/hetero_plugin/hetero_executable_network.h index 24b59b0..08e4bd7 100644 --- a/inference-engine/src/hetero_plugin/hetero_executable_network.h +++ b/inference-engine/src/hetero_plugin/hetero_executable_network.h @@ -1,7 +1,23 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright (C) 2018-2019 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // +/** + * @brief a header file for ExecutableNetwork + * @file dlia_executable_network.hpp + */ #pragma once #include diff --git a/inference-engine/src/hetero_plugin/hetero_infer_request.cpp b/inference-engine/src/hetero_plugin/hetero_infer_request.cpp index fdf865c..81349f9 100644 --- a/inference-engine/src/hetero_plugin/hetero_infer_request.cpp +++ b/inference-engine/src/hetero_plugin/hetero_infer_request.cpp @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2017-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. 
// #include "hetero_infer_request.h" diff --git a/inference-engine/src/hetero_plugin/hetero_infer_request.h b/inference-engine/src/hetero_plugin/hetero_infer_request.h index 77a6cb2..7633022 100644 --- a/inference-engine/src/hetero_plugin/hetero_infer_request.h +++ b/inference-engine/src/hetero_plugin/hetero_infer_request.h @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2017-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // /** diff --git a/inference-engine/src/hetero_plugin/hetero_plugin.cpp b/inference-engine/src/hetero_plugin/hetero_plugin.cpp index fff3d16..987e703 100644 --- a/inference-engine/src/hetero_plugin/hetero_plugin.cpp +++ b/inference-engine/src/hetero_plugin/hetero_plugin.cpp @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright (C) 2018-2019 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // #include "hetero_plugin.h" @@ -20,7 +32,7 @@ using namespace HeteroPlugin; using namespace std; static Version heteroPluginDescription = { - {1, 4}, // plugin API version + {1, 6}, // plugin API version CI_BUILD_NUMBER, "dliaPlugin" // plugin description message - }; @@ -37,6 +49,7 @@ Engine::Engine() { InferenceEngine::ExecutableNetworkInternal::Ptr Engine::LoadExeNetworkImpl(InferenceEngine::ICNNNetwork &network, const std::map &config) { + // TODO(amalyshe) do we need here verification of input precisions? std::map tconfig; tconfig = config; @@ -83,7 +96,7 @@ INFERENCE_PLUGIN_API(StatusCode) CreatePluginEngine( ResponseDesc *resp) noexcept { try { plugin = new HeteroPluginBase( - {{1, 5}, "heteroPlugin", "heteroPlugin"}, + {{1, 6}, "heteroPlugin", "heteroPlugin"}, std::make_shared()); return OK; } diff --git a/inference-engine/src/hetero_plugin/hetero_plugin.h b/inference-engine/src/hetero_plugin/hetero_plugin.h index 93fa7b3..671463d 100644 --- a/inference-engine/src/hetero_plugin/hetero_plugin.h +++ b/inference-engine/src/hetero_plugin/hetero_plugin.h @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright (C) 2018-2019 Intel Corporation. 
+// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // #pragma once diff --git a/inference-engine/src/hetero_plugin/hetero_plugin_base.hpp b/inference-engine/src/hetero_plugin/hetero_plugin_base.hpp index d38275d..e2e166b 100644 --- a/inference-engine/src/hetero_plugin/hetero_plugin_base.hpp +++ b/inference-engine/src/hetero_plugin/hetero_plugin_base.hpp @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright (C) 2018-2019 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // /** diff --git a/inference-engine/src/inference_engine/CMakeLists.txt b/inference-engine/src/inference_engine/CMakeLists.txt index 41f0e98..b3dc75f 100644 --- a/inference-engine/src/inference_engine/CMakeLists.txt +++ b/inference-engine/src/inference_engine/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # @@ -6,10 +6,13 @@ set (TARGET_NAME "inference_engine") file (GLOB LIBRARY_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/transform/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/transform/transformations/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/builders/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp_interfaces/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/shape_infer/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/shape_infer/built-in/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/shape_infer/const_infer/*.cpp ) file (GLOB LIBRARY_HEADERS @@ -18,6 +21,7 @@ file (GLOB LIBRARY_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/cpp_interfaces/*.hpp ${CMAKE_CURRENT_SOURCE_DIR}/shape_infer/*.hpp ${CMAKE_CURRENT_SOURCE_DIR}/shape_infer/built-in/*.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/shape_infer/const_infer/*.hpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp_interfaces/base/*.hpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp_interfaces/impl/*.hpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp_interfaces/interface/*.hpp @@ -33,9 +37,15 @@ if( (NOT DEFINED ENABLE_SSE42) OR ENABLE_SSE42) ${CMAKE_CURRENT_SOURCE_DIR}/cpu_x86_sse42/*.hpp ) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/cpu_x86_sse42) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/cpu_x86_sse42/blob_transform_sse42.cpp PROPERTIES COMPILE_FLAGS -msse4.2) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/cpu_x86_sse42/ie_preprocess_data_sse42.cpp PROPERTIES 
COMPILE_FLAGS -msse4.2) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp PROPERTIES COMPILE_FLAGS -msse4.2) + if (WIN32) + set_source_files_properties("${CMAKE_CURRENT_SOURCE_DIR}/cpu_x86_sse42/blob_transform_sse42.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/cpu_x86_sse42/ie_preprocess_data_sse42.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp" PROPERTIES COMPILE_FLAGS /arch:SSE2) + else() + set_source_files_properties("${CMAKE_CURRENT_SOURCE_DIR}/cpu_x86_sse42/blob_transform_sse42.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/cpu_x86_sse42/ie_preprocess_data_sse42.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp" PROPERTIES COMPILE_FLAGS -msse4.2) + endif() add_definitions(-DHAVE_SSE=1) endif() @@ -64,7 +74,7 @@ add_library(${TARGET_NAME} SHARED ${PUBLIC_HEADERS}) set_ie_threading_interface_for(${TARGET_NAME}) -target_link_libraries(${TARGET_NAME} PRIVATE pugixml fluid ade ${CMAKE_DL_LIBS} ${INTEL_ITT_LIBS}) +target_link_libraries(${TARGET_NAME} PRIVATE fluid ade ${INTEL_ITT_LIBS} PUBLIC pugixml ${CMAKE_DL_LIBS}) # Properties->C/C++->General->Additional Include Directories target_include_directories(${TARGET_NAME} PUBLIC ${PUBLIC_HEADERS_DIR} @@ -105,7 +115,8 @@ target_compile_definitions(${TARGET_NAME}_s PUBLIC -DUSE_STATIC_IE) set_target_properties(${TARGET_NAME}_s PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}_s) target_link_libraries(${TARGET_NAME}_s PRIVATE fluid - PRIVATE ade) + PRIVATE ade + PRIVATE ${INTEL_ITT_LIBS}) # export targets export(TARGETS ${TARGET_NAME} NAMESPACE IE:: FILE "${CMAKE_BINARY_DIR}/targets.cmake") @@ -118,4 +129,6 @@ configure_file( configure_file( "${CMAKE_SOURCE_DIR}/cmake/share/InferenceEngineConfig-version.cmake.in" "${CMAKE_BINARY_DIR}/InferenceEngineConfig-version.cmake" - COPYONLY) \ No newline at end of file + COPYONLY) + +add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME}) diff --git a/inference-engine/src/inference_engine/ade_util.cpp b/inference-engine/src/inference_engine/ade_util.cpp index 041c565..437d02a 100644 --- a/inference-engine/src/inference_engine/ade_util.cpp +++ b/inference-engine/src/inference_engine/ade_util.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/ade_util.hpp b/inference-engine/src/inference_engine/ade_util.hpp index 7348354..f4b26dd 100644 --- a/inference-engine/src/inference_engine/ade_util.hpp +++ b/inference-engine/src/inference_engine/ade_util.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/blob_factory.cpp b/inference-engine/src/inference_engine/blob_factory.cpp index 8be9ab9..dbd9eec 100644 --- a/inference-engine/src/inference_engine/blob_factory.cpp +++ b/inference-engine/src/inference_engine/blob_factory.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -15,6 +15,11 @@ InferenceEngine::Blob::Ptr make_blob_with_precision(const InferenceEngine::Tenso return make_blob_with_precision(desc.getPrecision(), desc, ptr); } + +InferenceEngine::Blob::Ptr make_blob_with_precision(const InferenceEngine::TensorDesc& desc, const std::shared_ptr& alloc) { + return 
make_blob_with_precision(desc.getPrecision(), desc, alloc); +} + InferenceEngine::Layout plain_layout(InferenceEngine::SizeVector dims) { int n = dims.size(); return n == 1 ? InferenceEngine::C : diff --git a/inference-engine/src/inference_engine/blob_factory.hpp b/inference-engine/src/inference_engine/blob_factory.hpp index a4a5d20..b65f35b 100644 --- a/inference-engine/src/inference_engine/blob_factory.hpp +++ b/inference-engine/src/inference_engine/blob_factory.hpp @@ -1,10 +1,11 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once #include +#include #include "inference_engine.hpp" template @@ -23,6 +24,9 @@ class BlobFactory { static InferenceEngine::Blob::Ptr make(const InferenceEngine::TensorDesc& desc, void* ptr) { return InferenceEngine::make_shared_blob(desc, reinterpret_cast(ptr)); } + static InferenceEngine::Blob::Ptr make(const InferenceEngine::TensorDesc& desc, const std::shared_ptr& alloc) { + return InferenceEngine::make_shared_blob(desc, alloc); + } }; template InferenceEngine::Blob::Ptr make_shared_blob2(Args && ... args) { @@ -35,6 +39,8 @@ template Infe INFERENCE_ENGINE_API_CPP(InferenceEngine::Blob::Ptr) make_blob_with_precision(const InferenceEngine::TensorDesc& desc); INFERENCE_ENGINE_API_CPP(InferenceEngine::Blob::Ptr) make_blob_with_precision(const InferenceEngine::TensorDesc& desc, void* ptr); +INFERENCE_ENGINE_API_CPP(InferenceEngine::Blob::Ptr) make_blob_with_precision(const InferenceEngine::TensorDesc& desc, + const std::shared_ptr& alloc); INFERENCE_ENGINE_API_CPP(InferenceEngine::Blob::Ptr) make_plain_blob(InferenceEngine::Precision prec, const InferenceEngine::SizeVector dims); INFERENCE_ENGINE_API_CPP(InferenceEngine::Layout) plain_layout(InferenceEngine::SizeVector dims); @@ -50,6 +56,7 @@ InferenceEngine::Blob::Ptr make_blob_with_precision(InferenceEngine::Precision p USE_FACTORY(I8); USE_FACTORY(U16); USE_FACTORY(I32); + USE_FACTORY(BIN); default: THROW_IE_EXCEPTION << "cannot locate blob for precision: " << precision; } diff --git a/inference-engine/src/inference_engine/blob_transform.cpp b/inference-engine/src/inference_engine/blob_transform.cpp index bde62a6..f3fc7ea 100644 --- a/inference-engine/src/inference_engine/blob_transform.cpp +++ b/inference-engine/src/inference_engine/blob_transform.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/blob_transform.hpp b/inference-engine/src/inference_engine/blob_transform.hpp index 4d83015..0c6bfe2 100644 --- a/inference-engine/src/inference_engine/blob_transform.hpp +++ b/inference-engine/src/inference_engine/blob_transform.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 //
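The new make_blob_with_precision overload above takes an allocator instead of a raw pointer, so the caller controls where the blob's memory comes from. A hypothetical usage sketch (the helper name and TensorDesc arguments are illustrative, and the allocator type is assumed to be InferenceEngine::IAllocator, which the flattened diff elides):

#include <memory>
#include <ie_blob.h>
#include "blob_factory.hpp"

InferenceEngine::Blob::Ptr makeNchwFp32(const InferenceEngine::SizeVector& dims,
                                        const std::shared_ptr<InferenceEngine::IAllocator>& alloc) {
    InferenceEngine::TensorDesc desc(InferenceEngine::Precision::FP32, dims,
                                     InferenceEngine::Layout::NCHW);
    // dispatches on desc.getPrecision() exactly like the void* overload,
    // but allocates through the supplied allocator
    return make_blob_with_precision(desc, alloc);
}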
diff --git a/inference-engine/src/inference_engine/builders/ie_argmax_layer.cpp b/inference-engine/src/inference_engine/builders/ie_argmax_layer.cpp index 265913f..b666bc9 100644 --- a/inference-engine/src/inference_engine/builders/ie_argmax_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_argmax_layer.cpp @@ -1,61 +1,86 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include +#include #include #include using namespace InferenceEngine; -Builder::ArgMaxLayer::ArgMaxLayer(const std::string& name): LayerFragment("ArgMax", name) { - getLayer().getOutputPorts().resize(1); - getLayer().getInputPorts().resize(1); +Builder::ArgMaxLayer::ArgMaxLayer(const std::string& name): LayerDecorator("ArgMax", name) { + getLayer()->getOutputPorts().resize(1); + getLayer()->getInputPorts().resize(1); } -Builder::ArgMaxLayer::ArgMaxLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "ArgMax")) - THROW_IE_EXCEPTION << "Cannot create ArgMaxLayer decorator for layer " << getLayer().getType(); +Builder::ArgMaxLayer::ArgMaxLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("ArgMax"); +} + +Builder::ArgMaxLayer::ArgMaxLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("ArgMax"); } Builder::ArgMaxLayer& Builder::ArgMaxLayer::setName(const std::string& name) { - getLayer().getName() = name; + getLayer()->setName(name); return *this; } const Port& Builder::ArgMaxLayer::getPort() const { - return getLayer().getInputPorts()[0]; + return getLayer()->getInputPorts()[0]; } Builder::ArgMaxLayer& Builder::ArgMaxLayer::setPort(const Port &port) { - getLayer().getInputPorts()[0] = port; - getLayer().getOutputPorts()[0] = port; + getLayer()->getInputPorts()[0] = port; + getLayer()->getOutputPorts()[0] = port; return *this; } int Builder::ArgMaxLayer::getAxis() const { - return getLayer().getParameters()["axis"].asInt(); + return getLayer()->getParameters().at("axis"); } Builder::ArgMaxLayer& Builder::ArgMaxLayer::setAxis(int axis) { - getLayer().getParameters()["axis"] = axis; + getLayer()->getParameters()["axis"] = axis; return *this; } size_t Builder::ArgMaxLayer::getTopK() const { - return getLayer().getParameters()["top_k"].asUInt(); + return getLayer()->getParameters().at("top_k"); } Builder::ArgMaxLayer& Builder::ArgMaxLayer::setTopK(size_t topK) { - getLayer().getParameters()["top_k"] = topK; + getLayer()->getParameters()["top_k"] = topK; return *this; } size_t Builder::ArgMaxLayer::getOutMaxVal() const { - return getLayer().getParameters()["out_max_val"].asUInt(); + return getLayer()->getParameters().at("out_max_val"); } Builder::ArgMaxLayer& Builder::ArgMaxLayer::setOutMaxVal(size_t outMaxVal) { - if (outMaxVal > 1) - THROW_IE_EXCEPTION << "OutMaxVal supports only 0 and 1 values."; - getLayer().getParameters()["out_max_val"] = outMaxVal; + getLayer()->getParameters()["out_max_val"] = outMaxVal; return *this; } +REG_VALIDATOR_FOR(ArgMax, [] (const InferenceEngine::Builder::Layer::CPtr& input_layer, bool partial) { + if (!input_layer->getInputPorts().empty() && + !input_layer->getOutputPorts().empty() && + !input_layer->getInputPorts()[0].shape().empty() && + !input_layer->getOutputPorts()[0].shape().empty() && + input_layer->getInputPorts()[0].shape() != input_layer->getOutputPorts()[0].shape()) { + THROW_IE_EXCEPTION << "Input and output ports should be equal"; + } + Builder::ArgMaxLayer layer(input_layer); + if (layer.getAxis() > 1) { + THROW_IE_EXCEPTION << "axis supports only 0 and 1 values."; + } + if (layer.getOutMaxVal() > 1) { + THROW_IE_EXCEPTION << "OutMaxVal supports only 0 and 1 values."; + } +}); + +REG_CONVERTER_FOR(ArgMax, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["axis"] = cnnLayer->GetParamAsInt("axis"); + layer.getParameters()["top_k"] = static_cast(cnnLayer->GetParamAsUInt("top_k")); + layer.getParameters()["out_max_val"] = static_cast(cnnLayer->GetParamAsUInt("out_max_val")); }); + +
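The getters above show the builder API change running through all of these files: parameters are now stored as InferenceEngine::Parameter and read back via map::at() plus Parameter's conversion operators, instead of the old asInt()/asUInt() accessors. A small stand-alone sketch of that access pattern (the map here is a stand-in for getParameters(), assuming this release's ie_parameter.hpp header):

#include <map>
#include <string>
#include <ie_parameter.hpp>

int readAxis(const std::map<std::string, InferenceEngine::Parameter>& params) {
    // at() throws std::out_of_range when "axis" was never set, which is why
    // the builder constructors now populate defaults up front
    return params.at("axis");  // implicit Parameter -> int conversion
}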
diff --git a/inference-engine/src/inference_engine/builders/ie_batch_normalization_layer.cpp b/inference-engine/src/inference_engine/builders/ie_batch_normalization_layer.cpp index 1c3d275..329d3f5 100644 --- a/inference-engine/src/inference_engine/builders/ie_batch_normalization_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_batch_normalization_layer.cpp @@ -1,68 +1,63 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include
+#include #include using namespace InferenceEngine; -Builder::BatchNormalizationLayer::BatchNormalizationLayer(const std::string& name): LayerFragment("BatchNormalization", name) { - getLayer().getOutputPorts().resize(1); - getLayer().getInputPorts().resize(1); +Builder::BatchNormalizationLayer::BatchNormalizationLayer(const std::string& name): LayerDecorator("BatchNormalization", name) { + getLayer()->getInputPorts().resize(3); + getLayer()->getInputPorts()[1].setParameter("type", "weights"); + getLayer()->getInputPorts()[2].setParameter("type", "biases"); + getLayer()->getOutputPorts().resize(1); setEpsilon(0.00000001f); } -Builder::BatchNormalizationLayer::BatchNormalizationLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "BatchNormalization")) - THROW_IE_EXCEPTION << "Cannot create BatchNormalizationLayer decorator for layer " << getLayer().getType(); +Builder::BatchNormalizationLayer::BatchNormalizationLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("BatchNormalization"); +} + +Builder::BatchNormalizationLayer::BatchNormalizationLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("BatchNormalization"); } Builder::BatchNormalizationLayer& Builder::BatchNormalizationLayer::setName(const std::string& name) { - getLayer().getName() = name; + getLayer()->setName(name); return *this; } const Port& Builder::BatchNormalizationLayer::getPort() const { - return getLayer().getOutputPorts()[0]; + return getLayer()->getOutputPorts()[0]; } Builder::BatchNormalizationLayer& Builder::BatchNormalizationLayer::setPort(const Port &port) { - getLayer().getOutputPorts()[0] = port; - getLayer().getInputPorts()[0] = port; - return *this; -} - -Builder::BatchNormalizationLayer& Builder::BatchNormalizationLayer::setWeights(const Blob::CPtr& weights) { - getLayer().addConstantData("weights", weights); - return *this; -} -Builder::BatchNormalizationLayer& Builder::BatchNormalizationLayer::setBiases(const Blob::CPtr& biases) { - getLayer().addConstantData("biases", biases); + getLayer()->getOutputPorts()[0] = port; + getLayer()->getInputPorts()[0] = port; return *this; } float Builder::BatchNormalizationLayer::getEpsilon() const { - return getLayer().getParameters()["epsilon"].asFloat(); + return getLayer()->getParameters().at("epsilon"); } Builder::BatchNormalizationLayer& Builder::BatchNormalizationLayer::setEpsilon(float eps) { - getLayer().getParameters()["epsilon"] = eps; + getLayer()->getParameters()["epsilon"] = eps; return *this; } -void Builder::BatchNormalizationLayer::validate(const Layer& layer) { - auto weightsIt = layer.getConstantData().find("weights"); - auto biasesIt = layer.getConstantData().find("biases"); - bool valid = weightsIt != layer.getConstantData().end() && - biasesIt != layer.getConstantData().end() && - weightsIt->second != nullptr && - weightsIt->second->cbuffer() != nullptr && - biasesIt->second != nullptr && - biasesIt->second->cbuffer() != nullptr; - if (!valid) - THROW_IE_EXCEPTION << "Cannot create BatchNotmalization layer! 
Weights and biases are required!"; -} +REG_VALIDATOR_FOR(BatchNormalization, [](const Builder::Layer::CPtr& layer, bool partial) { + Builder::BatchNormalizationLayer batchNormBuilder(layer); + if (partial) + return; + auto weights = layer->getInputPorts()[1].getData()->getData(); + auto biases = layer->getInputPorts()[2].getData()->getData(); + if (!weights || weights->cbuffer() == nullptr || !biases || biases->cbuffer() == nullptr) + THROW_IE_EXCEPTION << "Cannot create BatchNormalization layer! Weights and biases are required!"; +}); -REG_VALIDATOR_FOR(BatchNormalization, Builder::BatchNormalizationLayer::validate); \ No newline at end of file +REG_CONVERTER_FOR(BatchNormalization, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["epsilon"] = cnnLayer->GetParamAsFloat("epsilon"); +}); \ No newline at end of file diff --git a/inference-engine/src/inference_engine/builders/ie_clamp_layer.cpp b/inference-engine/src/inference_engine/builders/ie_clamp_layer.cpp index 0bc1fb9..587b442 100644 --- a/inference-engine/src/inference_engine/builders/ie_clamp_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_clamp_layer.cpp @@ -1,56 +1,77 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include
+#include #include using namespace InferenceEngine; -Builder::ClampLayer::ClampLayer(const std::string& name): LayerFragment("Clamp", name) { - getLayer().getOutputPorts().resize(1); - getLayer().getInputPorts().resize(1); +Builder::ClampLayer::ClampLayer(const std::string& name): LayerDecorator("Clamp", name) { + getLayer()->getOutputPorts().resize(1); + getLayer()->getInputPorts().resize(1); setMinValue(0.0f); setMaxValue(1.0f); } -Builder::ClampLayer::ClampLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "Clamp")) - THROW_IE_EXCEPTION << "Cannot create ClampLayer decorator for layer " << getLayer().getType(); +Builder::ClampLayer::ClampLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("Clamp"); +} + +Builder::ClampLayer::ClampLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("Clamp"); } Builder::ClampLayer& Builder::ClampLayer::setName(const std::string& name) { - getLayer().getName() = name; + getLayer()->setName(name); return *this; } const Port& Builder::ClampLayer::getPort() const { - return getLayer().getOutputPorts()[0]; + return getLayer()->getOutputPorts()[0]; } Builder::ClampLayer& Builder::ClampLayer::setPort(const Port &port) { - getLayer().getOutputPorts()[0] = port; - getLayer().getInputPorts()[0] = port; + getLayer()->getOutputPorts()[0] = port; + getLayer()->getInputPorts()[0] = port; return *this; } float Builder::ClampLayer::getMaxValue() const { - return getLayer().getParameters()["max"].asFloat(); + return getLayer()->getParameters().at("max"); } Builder::ClampLayer& Builder::ClampLayer::setMaxValue(float maxValue) { - getLayer().getParameters()["max"] = maxValue; + getLayer()->getParameters()["max"] = maxValue; return *this; } float Builder::ClampLayer::getMinValue() const { - return getLayer().getParameters()["min"].asFloat(); + return getLayer()->getParameters().at("min"); } Builder::ClampLayer& Builder::ClampLayer::setMinValue(float minValue) { - getLayer().getParameters()["min"] = minValue; + getLayer()->getParameters()["min"] = minValue; return *this; } +REG_VALIDATOR_FOR(Clamp, [](const InferenceEngine::Builder::Layer::CPtr& input_layer, bool partial) { + Builder::ClampLayer layer(input_layer); + if (layer.getMinValue() > layer.getMaxValue()) { + THROW_IE_EXCEPTION << "MinValue should be less than or equal to MaxValue"; + } + if (!input_layer->getInputPorts().empty() && + !input_layer->getOutputPorts().empty() && + !input_layer->getInputPorts()[0].shape().empty() && + !input_layer->getOutputPorts()[0].shape().empty() && + input_layer->getInputPorts()[0].shape() != input_layer->getOutputPorts()[0].shape()) { + THROW_IE_EXCEPTION << "Input and output ports should be equal"; + } +}); + +REG_CONVERTER_FOR(Clamp, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["max"] = cnnLayer->GetParamAsFloat("max", 0); + layer.getParameters()["min"] = cnnLayer->GetParamAsFloat("min", 0); +});
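The Clamp validator above only enforces min <= max on the layer parameters; for reference, the runtime operation itself reduces to the following toy helper (not part of the builder API):

#include <algorithm>
#include <cassert>

float clampValue(float v, float minValue, float maxValue) {
    assert(minValue <= maxValue && "MinValue should be less than or equal to MaxValue");
    // keep v inside [minValue, maxValue]
    return std::min(std::max(v, minValue), maxValue);
}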
diff --git a/inference-engine/src/inference_engine/builders/ie_concat_layer.cpp b/inference-engine/src/inference_engine/builders/ie_concat_layer.cpp index 8ba326f..a5e8d3f 100644 --- a/inference-engine/src/inference_engine/builders/ie_concat_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_concat_layer.cpp @@ -1,53 +1,105 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include +#include #include #include using namespace InferenceEngine; -Builder::ConcatLayer::ConcatLayer(const std::string& name): LayerFragment("Concat", name) { - getLayer().getOutputPorts().resize(1); +Builder::ConcatLayer::ConcatLayer(const std::string& name): LayerDecorator("Concat", name) { + getLayer()->getOutputPorts().resize(1); setAxis(1); } -Builder::ConcatLayer::ConcatLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "Concat")) - THROW_IE_EXCEPTION << "Cannot create ConcatLayer decorator for layer " << getLayer().getType(); +Builder::ConcatLayer::ConcatLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("Concat"); +} + +Builder::ConcatLayer::ConcatLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("Concat"); } Builder::ConcatLayer& Builder::ConcatLayer::setName(const std::string& name) { - getLayer().getName() = name; + getLayer()->setName(name); return *this; } const Port& Builder::ConcatLayer::getOutputPort() const { - return getLayer().getOutputPorts()[0]; + return getLayer()->getOutputPorts()[0]; } Builder::ConcatLayer& Builder::ConcatLayer::setOutputPort(const Port &port) { - getLayer().getOutputPorts()[0] = port; + getLayer()->getOutputPorts()[0] = port; return *this; } const std::vector& Builder::ConcatLayer::getInputPorts() const { - return getLayer().getInputPorts(); + return getLayer()->getInputPorts(); } Builder::ConcatLayer& Builder::ConcatLayer::setInputPorts(const std::vector& ports) { - getLayer().getInputPorts() = ports; + getLayer()->getInputPorts() = ports; return *this; } size_t Builder::ConcatLayer::getAxis() const { - return getLayer().getParameters()["axis"].asUInt(); + return getLayer()->getParameters().at("axis"); } Builder::ConcatLayer& Builder::ConcatLayer::setAxis(size_t axis) { - getLayer().getParameters()["axis"] = axis; + getLayer()->getParameters()["axis"] = axis; return *this; } + +REG_VALIDATOR_FOR(Concat, [] (const InferenceEngine::Builder::Layer::CPtr &input_layer, bool partial) { + if (partial) { + return; + } + Builder::ConcatLayer layer(input_layer); + if (layer.getInputPorts().size() < 2) { + THROW_IE_EXCEPTION << "Layer " << layer.getName() << " contains incorrect input ports. " + << "It takes at least two Blobs"; + } + for (size_t i = 1; i < layer.getInputPorts().size(); ++i) { + if (layer.getInputPorts()[i - 1].shape().size() != layer.getInputPorts()[i].shape().size()) { + THROW_IE_EXCEPTION << "Layer " << layer.getName() << " contains incorrect input ports. " + << "It should have equal number of dimensions"; + } + } + if (layer.getInputPorts()[0].shape().size() != layer.getOutputPort().shape().size()) { + THROW_IE_EXCEPTION << "Layer " << layer.getName() << " contains incorrect input and output ports. " + << "It should have equal number of dimensions"; + } + if (layer.getAxis() >= layer.getOutputPort().shape().size()) { + THROW_IE_EXCEPTION << "Layer " << layer.getName() << " contains incorrect axis. 
" + << "It should be >= 0 and < number of port's dimensions."; + } + for (size_t i = 0; i < layer.getOutputPort().shape().size(); ++i) { + if (i == layer.getAxis()) { + size_t sumInputDimensions = 0; + for (const Port& port : layer.getInputPorts()) { + sumInputDimensions += port.shape()[i]; + } + if (sumInputDimensions != layer.getOutputPort().shape()[i]) { + THROW_IE_EXCEPTION << "Layer " << layer.getName() << " contains incorrect input and output ports " + << "Sum of input port's dimensions in the given axis should be equal to output ports dimension in the same axis."; + } + } else { + for (const Port& port : layer.getInputPorts()) { + if (port.shape()[i] != layer.getOutputPort().shape()[i]) { + THROW_IE_EXCEPTION << "Layer " << layer.getName() << " contains incorrect input and output ports. " + << "It should have equal dimensions in axis different from given"; + } + } + } + } +}); + +REG_CONVERTER_FOR(Concat, [] (const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["axis"] = static_cast(cnnLayer->GetParamAsUInt("axis", 1)); +}); + diff --git a/inference-engine/src/inference_engine/builders/ie_const_layer.cpp b/inference-engine/src/inference_engine/builders/ie_const_layer.cpp index da5d43d..0b0f243 100644 --- a/inference-engine/src/inference_engine/builders/ie_const_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_const_layer.cpp @@ -1,39 +1,58 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include
diff --git a/inference-engine/src/inference_engine/builders/ie_const_layer.cpp b/inference-engine/src/inference_engine/builders/ie_const_layer.cpp index da5d43d..0b0f243 100644 --- a/inference-engine/src/inference_engine/builders/ie_const_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_const_layer.cpp @@ -1,39 +1,58 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include #include using namespace InferenceEngine; -Builder::ConstLayer::ConstLayer(const std::string& name): LayerFragment("Const", name) { - getLayer().getOutputPorts().resize(1); +Builder::ConstLayer::ConstLayer(const std::string& name): LayerDecorator("Const", name) { + getLayer()->getOutputPorts().resize(1); + getLayer()->getParameters()["custom"] = Blob::CPtr(); } -Builder::ConstLayer::ConstLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "Const")) - THROW_IE_EXCEPTION << "Cannot create ConstLayer decorator for layer " << getLayer().getType(); +Builder::ConstLayer::ConstLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("Const"); +} + +Builder::ConstLayer::ConstLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("Const"); } Builder::ConstLayer& Builder::ConstLayer::setName(const std::string& name) { - getLayer().getName() = name; + getLayer()->setName(name); return *this; } const Port& Builder::ConstLayer::getPort() const { - return getLayer().getOutputPorts()[0]; + return getLayer()->getOutputPorts()[0]; } Builder::ConstLayer& Builder::ConstLayer::setPort(const Port &port) { - getLayer().getOutputPorts()[0] = port; + const auto & data = getLayer()->getOutputPorts()[0].getData(); + getLayer()->getOutputPorts()[0] = port; + getLayer()->getOutputPorts()[0].setData(data); return *this; } Builder::ConstLayer& Builder::ConstLayer::setData(const Blob::CPtr& data) { - getLayer().addConstantData("custom", data); + getLayer()->getParameters()["custom"] = data; + getLayer()->getOutputPorts()[0].getData()->setData(std::const_pointer_cast(data)); return *this; } +const Blob::CPtr& Builder::ConstLayer::getData() const { + if (getLayer()->getParameters().at("custom").as().get() != + getLayer()->getOutputPorts()[0].getData()->getData().get()) + THROW_IE_EXCEPTION << "Constant data output port has incorrect data!"; + return getLayer()->getParameters().at("custom").as(); +} + +REG_VALIDATOR_FOR(Const, [] (const InferenceEngine::Builder::Layer::CPtr& layer, bool partial) { + Builder::ConstLayer constBuilder(layer); + const auto& data = constBuilder.getData(); + if (!data || data->cbuffer() == nullptr) + THROW_IE_EXCEPTION << "Cannot create Const layer! Data is required!"; +}); diff --git a/inference-engine/src/inference_engine/builders/ie_convolution_layer.cpp b/inference-engine/src/inference_engine/builders/ie_convolution_layer.cpp index a66e155..3c81b3f 100644 --- a/inference-engine/src/inference_engine/builders/ie_convolution_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_convolution_layer.cpp @@ -1,153 +1,126 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include
+#include #include #include +#include using namespace InferenceEngine; -Builder::ConvolutionLayer::ConvolutionLayer(const std::string& name): LayerFragment("Convolution", name) { - getLayer().getInputPorts().resize(1); - getLayer().getOutputPorts().resize(1); +Builder::ConvolutionLayer::ConvolutionLayer(const std::string& name): LayerDecorator("Convolution", name) { + getLayer()->getInputPorts().resize(3); + getLayer()->getInputPorts()[1].setParameter("type", "weights"); + getLayer()->getInputPorts()[2].setParameter("type", "biases"); + getLayer()->getOutputPorts().resize(1); + setGroup(1); + setKernel({}); + setOutDepth(0); + setStrides({}); + setDilation({}); + setPaddingsEnd({}); + setPaddingsBegin({}); } -Builder::ConvolutionLayer::ConvolutionLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "Convolution")) - THROW_IE_EXCEPTION << "Cannot create ConvolutionLayer decorator for layer " << getLayer().getType(); +Builder::ConvolutionLayer::ConvolutionLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("Convolution"); } -Builder::ConvolutionLayer::operator Builder::Layer() const { - Layer genLayer(getLayer()); - - std::vector l_kernel = getKernel(); - std::vector l_dilation = getDilation(); - std::vector l_paddingBegin = getPaddingsBegin(); - std::vector l_paddingEnd = getPaddingsEnd(); - std::vector l_strides = getStrides(); - - if (l_paddingBegin.empty() && !l_kernel.empty()) - l_paddingBegin.resize(l_kernel.size(), 0); - if (l_paddingEnd.empty() && !l_kernel.empty()) - l_paddingEnd.resize(l_kernel.size(), 0); - if (l_dilation.empty() && !l_kernel.empty()) - l_dilation.resize(l_kernel.size(), 1); - if (l_strides.empty() && !l_kernel.empty()) - l_strides.resize(l_kernel.size(), 1); - - if (!getOutDepth() || l_kernel.empty() || l_kernel.size() != l_paddingBegin.size() || l_kernel.size() != l_paddingEnd.size() || - l_kernel.size() != l_dilation.size() || l_kernel.size() != l_strides.size()) - THROW_IE_EXCEPTION << genLayer.getType() << " node " << genLayer.getName() << " contains incorrect parameters!"; - - genLayer.getParameters()["kernel"] = l_kernel; - genLayer.getParameters()["strides"] = l_strides; - genLayer.getParameters()["pads_begin"] = l_paddingBegin; - genLayer.getParameters()["pads_end"] = l_paddingEnd; - genLayer.getParameters()["dilations"] = l_dilation; - return genLayer; +Builder::ConvolutionLayer::ConvolutionLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("Convolution"); } Builder::ConvolutionLayer &Builder::ConvolutionLayer::setName(const std::string &name) { - getLayer().getName() = name; - return *this; -} - -Builder::ConvolutionLayer& Builder::ConvolutionLayer::setWeights(const Blob::CPtr& weights) { - getLayer().addConstantData("weights", weights); - return *this; -} -Builder::ConvolutionLayer& Builder::ConvolutionLayer::setBiases(const Blob::CPtr& biases) { - getLayer().addConstantData("biases", biases); + getLayer()->setName(name); return *this; } const Port& Builder::ConvolutionLayer::getInputPort() const { - return getLayer().getInputPorts()[0]; + return getLayer()->getInputPorts()[0]; } Builder::ConvolutionLayer& Builder::ConvolutionLayer::setInputPort(const Port& port) { - getLayer().getInputPorts()[0] = port; + getLayer()->getInputPorts()[0] = port; return *this; } const Port& Builder::ConvolutionLayer::getOutputPort() const { - return getLayer().getOutputPorts()[0]; + return getLayer()->getOutputPorts()[0]; } Builder::ConvolutionLayer& 
Builder::ConvolutionLayer::setOutputPort(const Port& port) { - getLayer().getOutputPorts()[0] = port; + getLayer()->getOutputPorts()[0] = port; return *this; } const std::vector Builder::ConvolutionLayer::getKernel() const { - return uInts2size_t(getLayer().getParameters()["kernel"].asUInts({})); + return getLayer()->getParameters().at("kernel"); } Builder::ConvolutionLayer& Builder::ConvolutionLayer::setKernel(const std::vector& kernel) { - getLayer().getParameters()["kernel"] = kernel; + getLayer()->getParameters()["kernel"] = kernel; return *this; } const std::vector Builder::ConvolutionLayer::getStrides() const { - return uInts2size_t(getLayer().getParameters()["strides"].asUInts({})); + return getLayer()->getParameters().at("strides"); } Builder::ConvolutionLayer& Builder::ConvolutionLayer::setStrides(const std::vector& strides) { - getLayer().getParameters()["strides"] = strides; + getLayer()->getParameters()["strides"] = strides; return *this; } const std::vector Builder::ConvolutionLayer::getDilation() const { - return uInts2size_t(getLayer().getParameters()["dilations"].asUInts({})); + return getLayer()->getParameters().at("dilations"); } Builder::ConvolutionLayer& Builder::ConvolutionLayer::setDilation(const std::vector& dilation) { - getLayer().getParameters()["dilations"] = dilation; + getLayer()->getParameters()["dilations"] = dilation; return *this; } const std::vector Builder::ConvolutionLayer::getPaddingsBegin() const { - return uInts2size_t(getLayer().getParameters()["pads_begin"].asUInts({})); + return getLayer()->getParameters().at("pads_begin"); } Builder::ConvolutionLayer& Builder::ConvolutionLayer::setPaddingsBegin(const std::vector& paddings) { - getLayer().getParameters()["pads_begin"] = paddings; + getLayer()->getParameters()["pads_begin"] = paddings; return *this; } const std::vector Builder::ConvolutionLayer::getPaddingsEnd() const { - return uInts2size_t(getLayer().getParameters()["pads_end"].asUInts({})); + return getLayer()->getParameters().at("pads_end"); } Builder::ConvolutionLayer& Builder::ConvolutionLayer::setPaddingsEnd(const std::vector& paddings) { - getLayer().getParameters()["pads_end"] = paddings; + getLayer()->getParameters()["pads_end"] = paddings; return *this; } size_t Builder::ConvolutionLayer::getGroup() const { - return getLayer().getParameters()["group"].asUInt(1); + return getLayer()->getParameters().at("group"); } Builder::ConvolutionLayer& Builder::ConvolutionLayer::setGroup(size_t group) { - getLayer().getParameters()["group"] = group; + getLayer()->getParameters()["group"] = group; return *this; } size_t Builder::ConvolutionLayer::getOutDepth() const { - return getLayer().getParameters()["output"].asUInt(0); + return getLayer()->getParameters().at("output"); } Builder::ConvolutionLayer& Builder::ConvolutionLayer::setOutDepth(size_t outDepth) { - getLayer().getParameters()["output"] = outDepth; + getLayer()->getParameters()["output"] = outDepth; return *this; } -void Builder::ConvolutionLayer::validate(const Layer& layer) { - Layer convLayer = layer; - Builder::ConvolutionLayer convBuilder(convLayer); - std::vector l_kernel = convBuilder.getKernel(); - +REG_VALIDATOR_FOR(Convolution, [] (const InferenceEngine::Builder::Layer::CPtr& layer, bool partial) { // WA for old IRs - if (l_kernel.empty() && layer.getParameters().find("kernel-x") != layer.getParameters().end() && - layer.getParameters().find("kernel-y") != layer.getParameters().end()) + if (layer->getParameters().find("kernel") == layer->getParameters().end() && + 
layer->getParameters().find("kernel-x") != layer->getParameters().end() && + layer->getParameters().find("kernel-y") != layer->getParameters().end()) return; + Builder::ConvolutionLayer convBuilder(layer); + std::vector l_kernel = convBuilder.getKernel(); std::vector l_dilation = convBuilder.getDilation(); std::vector l_paddingBegin = convBuilder.getPaddingsBegin(); std::vector l_paddingEnd = convBuilder.getPaddingsEnd(); @@ -162,9 +135,121 @@ void Builder::ConvolutionLayer::validate(const Layer& layer) { if (l_strides.empty() && !l_kernel.empty()) l_strides.resize(l_kernel.size(), 1); - if (!convBuilder.getOutDepth() || l_kernel.empty() || l_kernel.size() != l_paddingBegin.size() || l_kernel.size() != l_paddingEnd.size() || - l_kernel.size() != l_dilation.size() || l_kernel.size() != l_strides.size()) - THROW_IE_EXCEPTION << layer.getType() << " node " << layer.getName() << " contains incorrect parameters!"; -} + if (l_kernel.empty()) { + THROW_IE_EXCEPTION << "Kernel is empty!"; + } + + if (l_paddingBegin.size() != l_paddingEnd.size()) { + THROW_IE_EXCEPTION << "Padding_begin dimension is not equal to padding_end dimension"; + } + + if (!l_paddingBegin.empty() && l_kernel.size() != l_paddingBegin.size()) { + THROW_IE_EXCEPTION << "Padding dimension is not equal to kernel dimension"; + } + + if (l_kernel.size() != l_strides.size()) { + THROW_IE_EXCEPTION << "Stride dimension is not equal to kernel dimension"; + } + + if (!l_dilation.empty() && l_kernel.size() != l_dilation.size()) { + THROW_IE_EXCEPTION << "Dilation dimension is not equal to kernel dimension"; + } + + if (convBuilder.getOutDepth() == 0) { + THROW_IE_EXCEPTION << "OutDepth parameter should be greater than 0"; + } + + for (size_t kernel_dim : l_kernel) { + if (kernel_dim == 0) { + THROW_IE_EXCEPTION << "Kernel dimensions should be greater than 0"; + } + } + + for (size_t i_stride : l_strides) { + if (i_stride == 0) { + THROW_IE_EXCEPTION << "Strides should be greater than 0"; + } + } + + for (size_t dil : l_dilation) { + if (dil == 0) + THROW_IE_EXCEPTION << "Dilation should be greater than 0"; + } + + if (!convBuilder.getGroup()) + THROW_IE_EXCEPTION << "Group should be greater than 0"; + + if (convBuilder.getInputPort().shape().empty()) + return; + + const size_t IC = convBuilder.getInputPort().shape()[1]; + if (IC % convBuilder.getGroup()) + THROW_IE_EXCEPTION << "Number of input channels (" << IC << + ") is not divisible by the group number (" << convBuilder.getGroup() << ")"; + + size_t weight_size = convBuilder.getOutDepth() * IC / convBuilder.getGroup(); + for (size_t kernel_dim : l_kernel) { + if (static_cast(weight_size) * kernel_dim > std::numeric_limits::max()) { + THROW_IE_EXCEPTION << "Weight size exceeds the size_t max"; + } + weight_size *= kernel_dim; + } + + if (partial) + return; + + const auto weights = layer->getInputPorts()[1].getData()->getData(); + if (weights->size() != weight_size) { + THROW_IE_EXCEPTION << "Weight size is not correct!"; + } + + const auto biases = layer->getInputPorts()[2].getData()->getData(); + if (biases && biases->cbuffer() && biases->size() != convBuilder.getOutDepth()) + THROW_IE_EXCEPTION << "Biases size is incorrect!"; +}); + +REG_CONVERTER_FOR(Convolution, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + // WA for old IRs + if (cnnLayer->params.find("kernel") == cnnLayer->params.end() && + cnnLayer->params.find("kernel-x") != cnnLayer->params.end() && + cnnLayer->params.find("kernel-y") != cnnLayer->params.end()) + return; -REG_VALIDATOR_FOR(Convolution, Builder::ConvolutionLayer::validate); + std::vector tmp = cnnLayer->GetParamAsUInts("kernel"); + std::vector cur(tmp.size()); + for (size_t i = 0; i < tmp.size(); ++i) { + cur[i] = static_cast(tmp[i]); + } + layer.getParameters()["kernel"] = cur; + + tmp = cnnLayer->GetParamAsUInts("strides"); + cur.resize(tmp.size()); + for (size_t i = 0; i < tmp.size(); ++i) { + cur[i] = static_cast(tmp[i]); + } + layer.getParameters()["strides"] = cur; + + tmp = cnnLayer->GetParamAsUInts("dilations"); + cur.resize(tmp.size()); + for (size_t i = 0; i < tmp.size(); ++i) { + cur[i] = static_cast(tmp[i]); + } + layer.getParameters()["dilations"] = cur; + + tmp = cnnLayer->GetParamAsUInts("pads_begin"); + cur.resize(tmp.size()); + for (size_t i = 0; i < tmp.size(); ++i) { + cur[i] = static_cast(tmp[i]); + } + layer.getParameters()["pads_begin"] = cur; + + tmp = cnnLayer->GetParamAsUInts("pads_end"); + cur.resize(tmp.size()); + for (size_t i = 0; i < tmp.size(); ++i) { + cur[i] = static_cast(tmp[i]); + } + layer.getParameters()["pads_end"] = cur; + + layer.getParameters()["group"] = static_cast(cnnLayer->GetParamAsUInt("group")); + layer.getParameters()["output"] = static_cast(cnnLayer->GetParamAsUInt("output")); +});
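The weight-size check in the Convolution validator above is the usual grouped-convolution bookkeeping: the weights blob must hold outDepth * (inputChannels / group) * prod(kernel dims) elements. A worked sketch with illustrative numbers (overflow checking omitted, unlike the validator):

#include <cstddef>
#include <vector>

std::size_t convWeightCount(std::size_t outDepth, std::size_t inChannels,
                            std::size_t group, const std::vector<std::size_t>& kernel) {
    std::size_t count = outDepth * inChannels / group;
    for (std::size_t k : kernel) count *= k;
    return count;
}
// e.g. outDepth = 64, inChannels = 32, group = 1, kernel = {3, 3}
// gives 64 * 32 * 3 * 3 = 18432, the value weights->size() must equal.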
Builder::ConvolutionLayer::validate); + std::vector tmp = cnnLayer->GetParamAsUInts("kernel"); + std::vector cur(tmp.size()); + for (size_t i = 0; i < tmp.size(); ++i) { + cur[i] = static_cast(tmp[i]); + } + layer.getParameters()["kernel"] = cur; + + tmp = cnnLayer->GetParamAsUInts("strides"); + cur.resize(tmp.size()); + for (size_t i = 0; i < tmp.size(); ++i) { + cur[i] = static_cast(tmp[i]); + } + layer.getParameters()["strides"] = cur; + + tmp = cnnLayer->GetParamAsUInts("dilations"); + cur.resize(tmp.size()); + for (size_t i = 0; i < tmp.size(); ++i) { + cur[i] = static_cast(tmp[i]); + } + layer.getParameters()["dilations"] = cur; + + tmp = cnnLayer->GetParamAsUInts("pads_begin"); + cur.resize(tmp.size()); + for (size_t i = 0; i < tmp.size(); ++i) { + cur[i] = static_cast(tmp[i]); + } + layer.getParameters()["pads_begin"] = cur; + + tmp = cnnLayer->GetParamAsUInts("pads_end"); + cur.resize(tmp.size()); + for (size_t i = 0; i < tmp.size(); ++i) { + cur[i] = static_cast(tmp[i]); + } + layer.getParameters()["pads_end"] = cur; + + layer.getParameters()["group"] = static_cast(cnnLayer->GetParamAsUInt("group")); + layer.getParameters()["output"] = static_cast(cnnLayer->GetParamAsUInt("output")); +}); diff --git a/inference-engine/src/inference_engine/builders/ie_crop_layer.cpp b/inference-engine/src/inference_engine/builders/ie_crop_layer.cpp index 7fe2591..239a6f4 100644 --- a/inference-engine/src/inference_engine/builders/ie_crop_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_crop_layer.cpp @@ -1,69 +1,110 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include
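For reference, the reworked Convolution builder above is driven through fluent setters, with validation triggered on conversion. A minimal usage sketch, assuming only the headers this patch touches (the layer name "conv1" and all parameter values are illustrative, not taken from the patch):

#include <builders/ie_convolution_layer.hpp>

void buildConvolutionSketch() {
    InferenceEngine::Builder::ConvolutionLayer conv("conv1");
    conv.setKernel({3, 3})
        .setStrides({1, 1})
        .setDilation({1, 1})
        .setPaddingsBegin({1, 1})
        .setPaddingsEnd({1, 1})
        .setGroup(1)
        .setOutDepth(64);
    // Converting to Layer::Ptr runs validate(true), i.e. the
    // REG_VALIDATOR_FOR(Convolution, ...) lambda with partial = true.
    InferenceEngine::Builder::Layer::Ptr built = conv;
}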
diff --git a/inference-engine/src/inference_engine/builders/ie_crop_layer.cpp b/inference-engine/src/inference_engine/builders/ie_crop_layer.cpp
index 7fe2591..239a6f4 100644
--- a/inference-engine/src/inference_engine/builders/ie_crop_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_crop_layer.cpp
@@ -1,69 +1,110 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_crop_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
 #include <vector>
 #include <string>
 
 using namespace InferenceEngine;
 
-Builder::CropLayer::CropLayer(const std::string& name): LayerFragment("Crop", name) {
-    getLayer().getOutputPorts().resize(1);
-    getLayer().getInputPorts().resize(2);
+Builder::CropLayer::CropLayer(const std::string& name): LayerDecorator("Crop", name) {
+    getLayer()->getOutputPorts().resize(1);
+    getLayer()->getInputPorts().resize(2);
 }
 
-Builder::CropLayer::CropLayer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Crop"))
-        THROW_IE_EXCEPTION << "Cannot create CropLayer decorator for layer " << getLayer().getType();
+Builder::CropLayer::CropLayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("Crop");
+}
+
+Builder::CropLayer::CropLayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("Crop");
 }
 
 Builder::CropLayer& Builder::CropLayer::setName(const std::string& name) {
-    getLayer().getName() = name;
+    getLayer()->setName(name);
     return *this;
 }
 
 const std::vector<Port>& Builder::CropLayer::getInputPorts() const {
-    return getLayer().getInputPorts();
+    return getLayer()->getInputPorts();
 }
 
 Builder::CropLayer& Builder::CropLayer::setInputPorts(const std::vector<Port>& ports) {
-    getLayer().getInputPorts() = ports;
+    getLayer()->getInputPorts() = ports;
     return *this;
 }
 
 const Port& Builder::CropLayer::getOutputPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getOutputPorts()[0];
 }
 
 Builder::CropLayer& Builder::CropLayer::setOutputPort(const Port &port) {
-    getLayer().getOutputPorts()[0] = port;
+    getLayer()->getOutputPorts()[0] = port;
     return *this;
 }
 
 const std::vector<size_t> Builder::CropLayer::getAxis() const {
-    return uInts2size_t(getLayer().getParameters()["axis"].asUInts());
+    return getLayer()->getParameters().at("axis");
 }
 
 Builder::CropLayer& Builder::CropLayer::setAxis(const std::vector<size_t>& axis) {
-    getLayer().getParameters()["axis"] = axis;
+    getLayer()->getParameters()["axis"] = axis;
     return *this;
 }
 
 const std::vector<size_t> Builder::CropLayer::getOffset() const {
-    return uInts2size_t(getLayer().getParameters()["offset"].asUInts());
+    return getLayer()->getParameters().at("offset");
 }
 
 Builder::CropLayer& Builder::CropLayer::setOffset(const std::vector<size_t>& offsets) {
-    getLayer().getParameters()["offset"] = offsets;
+    getLayer()->getParameters()["offset"] = offsets;
     return *this;
 }
 
-void Builder::CropLayer::validate(const Layer& layer) {
-    if (layer.getInputPorts().size() != 2)
-        THROW_IE_EXCEPTION << "Incorrect parameters for layer " << layer.getName() << " should have 2 inputs!";
-}
-
-REG_VALIDATOR_FOR(Crop, Builder::CropLayer::validate);
\ No newline at end of file
+REG_VALIDATOR_FOR(Crop, [] (const InferenceEngine::Builder::Layer::CPtr& input_layer, bool partial) {
+    if (input_layer->getInputPorts().size() != 2) {
+        THROW_IE_EXCEPTION << "Incorrect parameters for layer " << input_layer->getName()
+                           << ". It should have 2 input ports.";
+    }
+    if (input_layer->getOutputPorts().size() != 1) {
+        THROW_IE_EXCEPTION << "Incorrect parameters for layer " << input_layer->getName()
+                           << ". It should have 1 output port.";
+    }
+    Builder::CropLayer layer(input_layer);
+    if (layer.getAxis().size() != layer.getOffset().size()) {
+        THROW_IE_EXCEPTION << "Incorrect parameters for layer " << input_layer->getName()
+                           << ". Axis size must be equal to the size of Offset";
+    }
+    for (size_t i = 0; i < layer.getAxis().size(); ++i) {
+        const size_t index = layer.getAxis()[i];
+        if (index >= layer.getInputPorts()[0].shape().size()) {
+            THROW_IE_EXCEPTION << "Incorrect parameters for layer " << input_layer->getName()
+                               << ". Each element of Axis should be less than input shape length";
+        }
+        if (layer.getOutputPort().shape()[index] != layer.getInputPorts()[1].shape()[index]) {
+            THROW_IE_EXCEPTION << "Incorrect parameters for layer " << input_layer->getName()
+                               << ". The shape of the second input must match the output shape at the indexes contained in Axis";
+        }
+        if (layer.getInputPorts()[0].shape()[index] < layer.getOutputPort().shape()[index] + layer.getOffset()[i]) {
+            THROW_IE_EXCEPTION << "Incorrect parameters for layer " << input_layer->getName()
+                               << ". The sum of offset and output shape in the " << i + 1 << " dimension is bigger than the input shape size";
+        }
+    }
+});
+
+REG_CONVERTER_FOR(Crop, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    std::vector<unsigned int> tmp = cnnLayer->GetParamAsUInts("axis");
+    layer.getParameters()["axis"] = std::vector<size_t>(tmp.size());
+    for (size_t i = 0; i < tmp.size(); ++i) {
+        layer.getParameters()["axis"].as<std::vector<size_t>>()[i] = static_cast<size_t>(tmp[i]);
+    }
+
+    tmp = cnnLayer->GetParamAsUInts("offset");
+    layer.getParameters()["offset"] = std::vector<size_t>(tmp.size());
+    for (size_t i = 0; i < tmp.size(); ++i) {
+        layer.getParameters()["offset"].as<std::vector<size_t>>()[i] = static_cast<size_t>(tmp[i]);
+    }
+});
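The Crop validator above encodes one shape rule per cropped axis. A worked example with made-up shapes (not taken from the patch):

// input[0] shape {1, 3, 224, 224}, output shape {1, 3, 100, 100}
// axis = {2, 3}, offset = {10, 10}; for every i the validator requires
//   input[0].shape[axis[i]] >= output.shape[axis[i]] + offset[i]
//   here: 224 >= 100 + 10 for both spatial axes, so validation passes.
InferenceEngine::Builder::CropLayer crop("crop1");
crop.setAxis({2, 3});        // axis and offset must have the same length
crop.setOffset({10, 10});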
diff --git a/inference-engine/src/inference_engine/builders/ie_ctc_greedy_decoder_layer.cpp b/inference-engine/src/inference_engine/builders/ie_ctc_greedy_decoder_layer.cpp
index c3e017a..c5b8065 100644
--- a/inference-engine/src/inference_engine/builders/ie_ctc_greedy_decoder_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_ctc_greedy_decoder_layer.cpp
@@ -1,46 +1,61 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_ctc_greedy_decoder_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
 #include <vector>
 #include <string>
 
 using namespace InferenceEngine;
 
-Builder::CTCGreedyDecoderLayer::CTCGreedyDecoderLayer(const std::string& name): LayerFragment("CTCGreedyDecoder", name) {
-    getLayer().getOutputPorts().resize(1);
+Builder::CTCGreedyDecoderLayer::CTCGreedyDecoderLayer(const std::string& name): LayerDecorator("CTCGreedyDecoder", name) {
+    getLayer()->getOutputPorts().resize(1);
 }
 
-Builder::CTCGreedyDecoderLayer::CTCGreedyDecoderLayer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "CTCGreedyDecoder"))
-        THROW_IE_EXCEPTION << "Cannot create CTCGreedyDecoderLayer decorator for layer " << getLayer().getType();
+Builder::CTCGreedyDecoderLayer::CTCGreedyDecoderLayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("CTCGreedyDecoder");
+}
+
+Builder::CTCGreedyDecoderLayer::CTCGreedyDecoderLayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("CTCGreedyDecoder");
 }
 
 Builder::CTCGreedyDecoderLayer& Builder::CTCGreedyDecoderLayer::setName(const std::string& name) {
-    getLayer().getName() = name;
+    getLayer()->setName(name);
     return *this;
 }
 
 const std::vector<Port>& Builder::CTCGreedyDecoderLayer::getInputPorts() const {
-    return getLayer().getInputPorts();
+    return getLayer()->getInputPorts();
 }
 
 Builder::CTCGreedyDecoderLayer& Builder::CTCGreedyDecoderLayer::setInputPorts(const std::vector<Port>& ports) {
-    getLayer().getInputPorts() = ports;
+    getLayer()->getInputPorts() = ports;
    return *this;
 }
 
 const Port& Builder::CTCGreedyDecoderLayer::getOutputPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getOutputPorts()[0];
 }
 
 Builder::CTCGreedyDecoderLayer& Builder::CTCGreedyDecoderLayer::setOutputPort(const Port& port) {
-    getLayer().getOutputPorts()[0] = port;
+    getLayer()->getOutputPorts()[0] = port;
     return *this;
 }
 
 bool Builder::CTCGreedyDecoderLayer::getCTCMergeRepeated() const {
-    return getLayer().getParameters()["ctc_merge_repeated"].asBool();
+    return getLayer()->getParameters().at("ctc_merge_repeated");
 }
 Builder::CTCGreedyDecoderLayer& Builder::CTCGreedyDecoderLayer::setCTCMergeRepeated(bool flag) {
-    getLayer().getParameters()["ctc_merge_repeated"] = flag;
+    getLayer()->getParameters()["ctc_merge_repeated"] = flag;
     return *this;
 }
 
+REG_VALIDATOR_FOR(CTCGreedyDecoder, [](const InferenceEngine::Builder::Layer::CPtr& input_layer, bool partial) {
+    Builder::CTCGreedyDecoderLayer layer(input_layer);
+
+    if (layer.getInputPorts().empty() || layer.getInputPorts().size() > 2) {
+        THROW_IE_EXCEPTION << "Input ports are wrong in layer " << layer.getName() <<
+                           ". There should be 1 or 2 input ports";
+    }
+});
+
+REG_CONVERTER_FOR(CTCGreedyDecoder, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    layer.getParameters()["ctc_merge_repeated"] = cnnLayer->GetParamsAsBool("ctc_merge_repeated", false);
+});
diff --git a/inference-engine/src/inference_engine/builders/ie_deconvolution_layer.cpp b/inference-engine/src/inference_engine/builders/ie_deconvolution_layer.cpp
index dfb607a..648cdb5 100644
--- a/inference-engine/src/inference_engine/builders/ie_deconvolution_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_deconvolution_layer.cpp
@@ -1,20 +1,164 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_deconvolution_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
+#include <limits>
+#include <string>
 #include <vector>
 
 using namespace InferenceEngine;
 
 Builder::DeconvolutionLayer::DeconvolutionLayer(const std::string& name): ConvolutionLayer(name) {
-    getLayer().setType("Deconvolution");
+    getLayer()->setType("Deconvolution");
 }
 
-Builder::DeconvolutionLayer::DeconvolutionLayer(Layer& genLayer): ConvolutionLayer(genLayer.getName()) {
-    getLayer().setName("");
-    getLayer().setType("");
-    getLayer() = genLayer;
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Deconvolution"))
-        THROW_IE_EXCEPTION << "Cannot create DeconvolutionLayer decorator for layer " << getLayer().getType();
+Builder::DeconvolutionLayer::DeconvolutionLayer(const Layer::Ptr& layer): ConvolutionLayer(layer->getName()) {
+    this->getLayer() = layer;
+    checkType("Deconvolution");
 }
 
+Builder::DeconvolutionLayer::DeconvolutionLayer(const Layer::CPtr& layer): ConvolutionLayer(layer->getName()) {
+    this->getLayer().reset();
+    cLayer = layer;
+    checkType("Deconvolution");
+}
+
+REG_VALIDATOR_FOR(Deconvolution, [] (const InferenceEngine::Builder::Layer::CPtr& layer, bool partial) {
+    // WA for old IRs
+    if (layer->getParameters().find("kernel") == layer->getParameters().end() &&
+        layer->getParameters().find("kernel-x") != layer->getParameters().end() &&
+        layer->getParameters().find("kernel-y") != layer->getParameters().end())
+        return;
+    Builder::DeconvolutionLayer deconvBuilder(layer);
+    std::vector<size_t> l_kernel = deconvBuilder.getKernel();
+    std::vector<size_t> l_dilation = deconvBuilder.getDilation();
+    std::vector<size_t> l_paddingBegin = deconvBuilder.getPaddingsBegin();
+    std::vector<size_t> l_paddingEnd = deconvBuilder.getPaddingsEnd();
+    std::vector<size_t> l_strides = deconvBuilder.getStrides();
+
+    if (l_paddingBegin.empty() && !l_kernel.empty())
+        l_paddingBegin.resize(l_kernel.size(), 0);
+    if (l_paddingEnd.empty() && !l_kernel.empty())
+        l_paddingEnd.resize(l_kernel.size(), 0);
+    if (l_dilation.empty() && !l_kernel.empty())
+        l_dilation.resize(l_kernel.size(), 1);
+    if (l_strides.empty() && !l_kernel.empty())
+        l_strides.resize(l_kernel.size(), 1);
+
+    if (l_kernel.empty()) {
+        THROW_IE_EXCEPTION << "Kernel is empty!";
+    }
+
+    if (l_paddingBegin.size() != l_paddingEnd.size()) {
+        THROW_IE_EXCEPTION << "Padding_begin dimension is not equal to padding_end dimension";
+    }
+
+    if (!l_paddingBegin.empty() && l_kernel.size() != l_paddingBegin.size()) {
+        THROW_IE_EXCEPTION << "Padding dimension is not equal to kernel dimension";
+    }
+
+    if (l_kernel.size() != l_strides.size()) {
+        THROW_IE_EXCEPTION << "Stride dimension is not equal to kernel dimension";
+    }
+
+    if (!l_dilation.empty() && l_kernel.size() != l_dilation.size()) {
+        THROW_IE_EXCEPTION << "Dilation dimension is not equal to kernel dimension";
+    }
+
+    if (deconvBuilder.getOutDepth() == 0) {
+        THROW_IE_EXCEPTION << "OutDepth parameter should be more than 0";
+    }
+
+    for (size_t kernel_dim : l_kernel) {
+        if (kernel_dim == 0) {
+            THROW_IE_EXCEPTION << "Kernel dimensions should be more than 0";
+        }
+    }
+
+    for (size_t i_stride : l_strides) {
+        if (i_stride == 0) {
+            THROW_IE_EXCEPTION << "Strides should be more than 0";
+        }
+    }
+
+    for (size_t dil : l_dilation) {
+        if (dil == 0)
+            THROW_IE_EXCEPTION << "Dilation should be more than 0";
+    }
+
+    if (!deconvBuilder.getGroup())
+        THROW_IE_EXCEPTION << "Group should be more than 0";
+
+    if (deconvBuilder.getInputPort().shape().empty())
+        return;
+
+    const size_t IC = deconvBuilder.getInputPort().shape()[1];
+    if (IC % deconvBuilder.getGroup())
+        THROW_IE_EXCEPTION << "Number of input channels (" << IC <<
+                           ") is not divided by group number (" << deconvBuilder.getGroup() << ")";
+
+    size_t weight_size = deconvBuilder.getOutDepth() * IC / deconvBuilder.getGroup();
+    for (size_t kernel_dim : l_kernel) {
+        if (static_cast<double>(weight_size) * kernel_dim > std::numeric_limits<size_t>::max()) {
+            THROW_IE_EXCEPTION << "Weight size exceeds the size_t max";
+        }
+        weight_size *= kernel_dim;
+    }
+
+    if (partial)
+        return;
+
+    const auto weights = layer->getInputPorts()[1].getData()->getData();
+    if (weights->size() != weight_size) {
+        THROW_IE_EXCEPTION << "Weight size is not correct!";
+    }
+
+    const auto biases = layer->getInputPorts()[2].getData()->getData();
+    if (biases && biases->cbuffer() && biases->size() != deconvBuilder.getOutDepth())
+        THROW_IE_EXCEPTION << "Biases size is incorrect!";
+});
+
+REG_CONVERTER_FOR(Deconvolution, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    // WA for old IRs
+    if (cnnLayer->params.find("kernel") == cnnLayer->params.end() &&
+        cnnLayer->params.find("kernel-x") != cnnLayer->params.end() &&
+        cnnLayer->params.find("kernel-y") != cnnLayer->params.end())
+        return;
+    std::vector<unsigned int> tmp = cnnLayer->GetParamAsUInts("kernel");
+    std::vector<size_t> cur(tmp.size());
+    for (size_t i = 0; i < tmp.size(); ++i) {
+        cur[i] = static_cast<size_t>(tmp[i]);
+    }
+    layer.getParameters()["kernel"] = cur;
+
+    tmp = cnnLayer->GetParamAsUInts("strides");
+    cur.resize(tmp.size());
+    for (size_t i = 0; i < tmp.size(); ++i) {
+        cur[i] = static_cast<size_t>(tmp[i]);
+    }
+    layer.getParameters()["strides"] = cur;
+
+    tmp = cnnLayer->GetParamAsUInts("dilations");
+    cur.resize(tmp.size());
+    for (size_t i = 0; i < tmp.size(); ++i) {
+        cur[i] = static_cast<size_t>(tmp[i]);
+    }
+    layer.getParameters()["dilations"] = cur;
+
+    tmp = cnnLayer->GetParamAsUInts("pads_begin");
+    cur.resize(tmp.size());
+    for (size_t i = 0; i < tmp.size(); ++i) {
+        cur[i] = static_cast<size_t>(tmp[i]);
+    }
+    layer.getParameters()["pads_begin"] = cur;
+
+    tmp = cnnLayer->GetParamAsUInts("pads_end");
+    cur.resize(tmp.size());
+    for (size_t i = 0; i < tmp.size(); ++i) {
+        cur[i] = static_cast<size_t>(tmp[i]);
+    }
+    layer.getParameters()["pads_end"] = cur;
+
+    layer.getParameters()["group"] = static_cast<size_t>(cnnLayer->GetParamAsUInt("group"));
+    layer.getParameters()["output"] = static_cast<size_t>(cnnLayer->GetParamAsUInt("output"));
+});
\ No newline at end of file
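Both the Convolution and Deconvolution validators above check the weight blob against the same size formula. A small self-contained sketch of that arithmetic (the helper name is hypothetical, not from the patch):

#include <cstddef>
#include <vector>

// expected weights = outDepth * (IC / group) * prod(kernel)
// e.g. outDepth = 64, IC = 3, group = 1, kernel = {3, 3} -> 64 * 3 * 9 = 1728
std::size_t expectedWeightSize(std::size_t outDepth, std::size_t ic, std::size_t group,
                               const std::vector<std::size_t>& kernel) {
    std::size_t size = outDepth * ic / group;
    for (std::size_t dim : kernel)
        size *= dim;   // the validators also guard this product against size_t overflow
    return size;
}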
diff --git a/inference-engine/src/inference_engine/builders/ie_detection_output_layer.cpp b/inference-engine/src/inference_engine/builders/ie_detection_output_layer.cpp
index f836445..42e1a14 100644
--- a/inference-engine/src/inference_engine/builders/ie_detection_output_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_detection_output_layer.cpp
@@ -1,124 +1,168 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_detection_output_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
+#include <cfloat>
 #include <string>
 #include <vector>
 
 using namespace InferenceEngine;
 
-Builder::DetectionOutputLayer::DetectionOutputLayer(const std::string& name): LayerFragment("DetectionOutput", name) {
-    getLayer().getOutputPorts().resize(1);
-    getLayer().getInputPorts().resize(2);
+Builder::DetectionOutputLayer::DetectionOutputLayer(const std::string& name): LayerDecorator("DetectionOutput", name) {
+    getLayer()->getOutputPorts().resize(1);
+    getLayer()->getInputPorts().resize(2);
+    setBackgroudLabelId(-1);
 }
 
-Builder::DetectionOutputLayer::DetectionOutputLayer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "DetectionOutput"))
-        THROW_IE_EXCEPTION << "Cannot create DetectionOutputLayer decorator for layer " << getLayer().getType();
+Builder::DetectionOutputLayer::DetectionOutputLayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("DetectionOutput");
+}
+
+Builder::DetectionOutputLayer::DetectionOutputLayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("DetectionOutput");
 }
 
 Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setName(const std::string& name) {
-    getLayer().getName() = name;
+    getLayer()->setName(name);
     return *this;
 }
 
 const std::vector<Port>& Builder::DetectionOutputLayer::getInputPorts() const {
-    return getLayer().getInputPorts();
+    return getLayer()->getInputPorts();
 }
 
 Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setInputPorts(const std::vector<Port> &ports) {
     if (ports.size() != 3)
         THROW_IE_EXCEPTION << "Incorrect number of inputs for DetectionOutput layer.";
-    getLayer().getInputPorts() = ports;
+    getLayer()->getInputPorts() = ports;
     return *this;
 }
 
 const Port& Builder::DetectionOutputLayer::getOutputPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getOutputPorts()[0];
 }
 
 Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setOutputPort(const Port &port) {
-    getLayer().getOutputPorts()[0] = port;
+    getLayer()->getOutputPorts()[0] = port;
     return *this;
 }
 
 size_t Builder::DetectionOutputLayer::getNumClasses() const {
-    return getLayer().getParameters()["num_classes"].asUInt();
+    return getLayer()->getParameters().at("num_classes");
 }
 Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setNumClasses(size_t num) {
-    getLayer().getParameters()["num_classes"] = num;
+    getLayer()->getParameters()["num_classes"] = num;
     return *this;
 }
 int Builder::DetectionOutputLayer::getBackgroudLabelId() const {
-    return getLayer().getParameters()["background_label_id"].asInt(-1);
+    return getLayer()->getParameters().at("background_label_id");
 }
 Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setBackgroudLabelId(int labelId) {
-    getLayer().getParameters()["background_label_id"] = labelId;
+    getLayer()->getParameters()["background_label_id"] = labelId;
     return *this;
 }
 int Builder::DetectionOutputLayer::getTopK() const {
-    return getLayer().getParameters()["top_k"].asInt();
+    return getLayer()->getParameters().at("top_k");
 }
 Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setTopK(int topK) {
-    getLayer().getParameters()["top_k"] = topK;
+    getLayer()->getParameters()["top_k"] = topK;
     return *this;
 }
 int Builder::DetectionOutputLayer::getKeepTopK() const {
-    return getLayer().getParameters()["keep_top_k"].asInt();
+    return getLayer()->getParameters().at("keep_top_k");
 }
 Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setKeepTopK(int topK) {
-    getLayer().getParameters()["keep_top_k"] = topK;
+    getLayer()->getParameters()["keep_top_k"] = topK;
     return *this;
 }
 int Builder::DetectionOutputLayer::getNumOrientClasses() const {
-    return getLayer().getParameters()["num_orient_classes"].asInt();
+    return getLayer()->getParameters().at("num_orient_classes");
 }
 Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setNumOrientClasses(int numClasses) {
-    getLayer().getParameters()["num_orient_classes"] = numClasses;
+    getLayer()->getParameters()["num_orient_classes"] = numClasses;
     return *this;
 }
 std::string Builder::DetectionOutputLayer::getCodeType() const {
-    return getLayer().getParameters()["code_type"];
+    return getLayer()->getParameters().at("code_type");
 }
 Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setCodeType(std::string type) {
-    getLayer().getParameters()["code_type"] = type;
+    getLayer()->getParameters()["code_type"] = type;
     return *this;
 }
 int Builder::DetectionOutputLayer::getInterpolateOrientation() const {
-    return getLayer().getParameters()["interpolate_orientation"].asInt();
+    return getLayer()->getParameters().at("interpolate_orientation");
 }
 Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setInterpolateOrientation(int orient) {
-    getLayer().getParameters()["interpolate_orientation"] = orient;
+    getLayer()->getParameters()["interpolate_orientation"] = orient;
     return *this;
 }
 float Builder::DetectionOutputLayer::getNMSThreshold() const {
-    return getLayer().getParameters()["nms_threshold"].asFloat();
+    return getLayer()->getParameters().at("nms_threshold");
 }
 Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setNMSThreshold(float threshold) {
-    getLayer().getParameters()["nms_threshold"] = threshold;
+    getLayer()->getParameters()["nms_threshold"] = threshold;
     return *this;
 }
 float Builder::DetectionOutputLayer::getConfidenceThreshold() const {
-    return getLayer().getParameters()["confidence_threshold"].asFloat();
+    return getLayer()->getParameters().at("confidence_threshold");
 }
 Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setConfidenceThreshold(float threshold) {
-    getLayer().getParameters()["confidence_threshold"] = threshold;
+    getLayer()->getParameters()["confidence_threshold"] = threshold;
     return *this;
 }
 bool Builder::DetectionOutputLayer::getShareLocation() const {
-    return getLayer().getParameters()["share_location"].asBool();
+    return getLayer()->getParameters().at("share_location");
 }
 Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setShareLocation(bool flag) {
-    getLayer().getParameters()["share_location"] = flag;
+    getLayer()->getParameters()["share_location"] = flag;
     return *this;
 }
 bool Builder::DetectionOutputLayer::getVariantEncodedInTarget() const {
-    return getLayer().getParameters()["variance_encoded_in_target"].asBool();
+    return getLayer()->getParameters().at("variance_encoded_in_target");
 }
 Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setVariantEncodedInTarget(bool flag) {
-    getLayer().getParameters()["variance_encoded_in_target"] = flag;
+    getLayer()->getParameters()["variance_encoded_in_target"] = flag;
     return *this;
 }
+
+REG_VALIDATOR_FOR(DetectionOutput, [](const InferenceEngine::Builder::Layer::CPtr& input_layer, bool partial) {
+    Builder::DetectionOutputLayer layer(input_layer);
+    if (layer.getNumClasses() == 0) {
+        THROW_IE_EXCEPTION << "NumClasses parameter is wrong in layer " << layer.getName() <<
+                           ". It should be > 0.";
+    }
+    if (layer.getCodeType() != "caffe.PriorBoxParameter.CENTER_SIZE" &&
+        layer.getCodeType() != "caffe.PriorBoxParameter.CORNER") {
+        THROW_IE_EXCEPTION << "CodeType parameter is wrong in layer " << layer.getName() <<
+                           ". It should be equal to 'caffe.PriorBoxParameter.CORNER' or 'caffe.PriorBoxParameter.CENTER_SIZE'";
+    }
+    if (layer.getBackgroudLabelId() < -1) {
+        THROW_IE_EXCEPTION << "BackgroundLabelId parameter is wrong in layer " << layer.getName() <<
+                           ". It should be >= 0 if it is the id of an existing label, or -1 if there is no background label.";
+    }
+    if (layer.getNMSThreshold() <= 0) {
+        THROW_IE_EXCEPTION << "NMSThreshold parameter is wrong in layer " << layer.getName() <<
+                           ". It should be > 0.";
+    }
+    if (layer.getConfidenceThreshold() <= 0) {
+        THROW_IE_EXCEPTION << "ConfidenceThreshold parameter is wrong in layer " << layer.getName() <<
+                           ". It should be > 0.";
+    }
+});
+
+REG_CONVERTER_FOR(DetectionOutput, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    layer.getParameters()["num_classes"] = static_cast<size_t>(cnnLayer->GetParamAsUInt("num_classes"));
+    layer.getParameters()["background_label_id"] = cnnLayer->GetParamAsInt("background_label_id", 0);
+    layer.getParameters()["top_k"] = cnnLayer->GetParamAsInt("top_k", -1);
+    layer.getParameters()["keep_top_k"] = cnnLayer->GetParamAsInt("keep_top_k", -1);
+    layer.getParameters()["num_orient_classes"] = cnnLayer->GetParamAsInt("num_orient_classes", 0);
+    layer.getParameters()["code_type"] = cnnLayer->GetParamAsString("code_type", "caffe.PriorBoxParameter.CORNER");
+    layer.getParameters()["interpolate_orientation"] = cnnLayer->GetParamAsInt("interpolate_orientation", 1);
+    layer.getParameters()["nms_threshold"] = cnnLayer->GetParamAsFloat("nms_threshold");
+    layer.getParameters()["confidence_threshold"] = cnnLayer->GetParamAsFloat("confidence_threshold", -FLT_MAX);
+    layer.getParameters()["share_location"] = cnnLayer->GetParamsAsBool("share_location", true);
+    layer.getParameters()["variance_encoded_in_target"] = cnnLayer->GetParamsAsBool("variance_encoded_in_target", false);
+});
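A DetectionOutput configuration that satisfies the validator above might look as follows (a sketch; all values are illustrative):

InferenceEngine::Builder::DetectionOutputLayer det("detection_out");
det.setNumClasses(21)                                    // must be > 0
   .setCodeType("caffe.PriorBoxParameter.CENTER_SIZE")   // or caffe.PriorBoxParameter.CORNER
   .setNMSThreshold(0.45f)                               // must be > 0
   .setConfidenceThreshold(0.01f)                        // must be > 0
   .setShareLocation(true);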
diff --git a/inference-engine/src/inference_engine/builders/ie_eltwise_layer.cpp b/inference-engine/src/inference_engine/builders/ie_eltwise_layer.cpp
index cffecaa..df51f5e 100644
--- a/inference-engine/src/inference_engine/builders/ie_eltwise_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_eltwise_layer.cpp
@@ -1,64 +1,95 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_eltwise_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
 #include <string>
 #include <vector>
 
 using namespace InferenceEngine;
 
-Builder::EltwiseLayer::EltwiseLayer(const std::string& name): LayerFragment("Eltwise", name) {
-    getLayer().getOutputPorts().resize(1);
+Builder::EltwiseLayer::EltwiseLayer(const std::string& name): LayerDecorator("Eltwise", name) {
+    getLayer()->getOutputPorts().resize(1);
+    getLayer()->getInputPorts().resize(2);
     setEltwiseType(EltwiseType::SUM);
 }
 
-Builder::EltwiseLayer::EltwiseLayer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Eltwise"))
-        THROW_IE_EXCEPTION << "Cannot create EltwiseLayer decorator for layer " << getLayer().getType();
+Builder::EltwiseLayer::EltwiseLayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("Eltwise");
 
-    std::string operatorStr = getLayer().getParameters()["operation"];
+    std::string operatorStr = getLayer()->getParameters()["operation"];
     if (operatorStr == "max") {
         type = MAX;
     } else if (operatorStr == "sum") {
         type = SUM;
     } else if (operatorStr == "mul") {
         type = MUL;
+    } else if (operatorStr == "sub") {
+        type = SUB;
+    } else if (operatorStr == "div") {
+        type = DIV;
+    } else if (operatorStr == "min") {
+        type = MIN;
+    } else if (operatorStr == "squared_diff") {
+        type = SQUARED_DIFF;
+    }
+}
+
+Builder::EltwiseLayer::EltwiseLayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("Eltwise");
+
+    const auto cLayer = static_cast<const EltwiseLayer*>(this)->getLayer();
+
+    std::string operatorStr = cLayer->getParameters().at("operation");
+    if (operatorStr == "max") {
+        type = MAX;
+    } else if (operatorStr == "sum") {
+        type = SUM;
+    } else if (operatorStr == "mul") {
+        type = MUL;
+    } else if (operatorStr == "sub") {
+        type = SUB;
+    } else if (operatorStr == "div") {
+        type = DIV;
+    } else if (operatorStr == "min") {
+        type = MIN;
+    } else if (operatorStr == "squared_diff") {
+        type = SQUARED_DIFF;
     }
 }
 
 Builder::EltwiseLayer& Builder::EltwiseLayer::setName(const std::string& name) {
-    getLayer().getName() = name;
+    getLayer()->setName(name);
     return *this;
 }
 
 const std::vector<Port>& Builder::EltwiseLayer::getInputPorts() const {
-    return getLayer().getInputPorts();
+    return getLayer()->getInputPorts();
 }
 
 Builder::EltwiseLayer& Builder::EltwiseLayer::setInputPorts(const std::vector<Port>& ports) {
-    getLayer().getInputPorts() = ports;
+    getLayer()->getInputPorts() = ports;
     return *this;
 }
 
 const Port& Builder::EltwiseLayer::getOutputPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getOutputPorts()[0];
 }
 
 Builder::EltwiseLayer& Builder::EltwiseLayer::setOutputPort(const Port &port) {
-    getLayer().getOutputPorts()[0] = port;
+    getLayer()->getOutputPorts()[0] = port;
     return *this;
 }
 
 const std::vector<float> Builder::EltwiseLayer::getScales() const {
-    return getLayer().getParameters()["scales"].asFloats({});
+    return getLayer()->getParameters().at("scales");
 }
 
 // TODO: IR doesn't contain Scales!!!
 Builder::EltwiseLayer& Builder::EltwiseLayer::setScales(const std::vector<float>& scales) {
-    getLayer().getParameters()["scales"] = scales;
+    getLayer()->getParameters()["scales"] = scales;
     return *this;
 }
 
@@ -70,17 +101,57 @@ Builder::EltwiseLayer& Builder::EltwiseLayer::setEltwiseType(Builder::EltwiseLay
     this->type = type;
     std::string operatorStr;
     switch (type) {
-    case MAX:
-        operatorStr = "max";
-        break;
-    case SUM:
-        operatorStr = "sum";
-        break;
-    case MUL:
-        operatorStr = "mul";
+        case MAX:
+            operatorStr = "max";
+            break;
+        case SUM:
+            operatorStr = "sum";
+            break;
+        case MUL:
+            operatorStr = "mul";
+            break;
+        case SUB:
+            operatorStr = "sub";
+            break;
+        case DIV:
+            operatorStr = "div";
+            break;
+        case MIN:
+            operatorStr = "min";
+            break;
+        case SQUARED_DIFF:
+            operatorStr = "squared_diff";
+            break;
     }
-    getLayer().getParameters()["operation"] = operatorStr;
+    getLayer()->getParameters()["operation"] = operatorStr;
     return *this;
 }
 
+REG_VALIDATOR_FOR(Eltwise, [](const InferenceEngine::Builder::Layer::CPtr& input_layer, bool partial) {
+    Builder::EltwiseLayer layer(input_layer);
+
+    if (layer.getInputPorts().size() != 2) {
+        THROW_IE_EXCEPTION << "Input ports are incorrect in the layer " << layer.getName()
+                           << ". Number of input ports should be equal to 2.";
+    }
+    if (partial && (layer.getInputPorts()[0].shape().empty() || layer.getInputPorts()[1].shape().empty() ||
+        layer.getOutputPort().shape().empty()))
+        return;
+
+    if (layer.getInputPorts()[0].shape() != layer.getInputPorts()[1].shape()) {
+        THROW_IE_EXCEPTION << "Input ports are incorrect in the layer " << layer.getName()
+                           << ". They should have equal dimensions";
+    }
+
+    if (layer.getInputPorts()[0].shape() != layer.getOutputPort().shape()) {
+        THROW_IE_EXCEPTION << "Layer " << layer.getName() << " has different input and output ports. "
+                           << "They should have equal dimensions.";
+    }
+});
+
+REG_CONVERTER_FOR(Eltwise, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    layer.getParameters()["scales"] = cnnLayer->GetParamAsFloats("scales", {});
+    layer.getParameters()["operation"] = cnnLayer->GetParamAsString("operation");
+});
+
diff --git a/inference-engine/src/inference_engine/builders/ie_elu_layer.cpp b/inference-engine/src/inference_engine/builders/ie_elu_layer.cpp
index 5be0044..eb280a7 100644
--- a/inference-engine/src/inference_engine/builders/ie_elu_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_elu_layer.cpp
@@ -1,46 +1,67 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_elu_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
 #include <string>
 
 using namespace InferenceEngine;
 
-Builder::ELULayer::ELULayer(const std::string& name): LayerFragment("ELU", name) {
-    getLayer().getOutputPorts().resize(1);
-    getLayer().getInputPorts().resize(1);
+Builder::ELULayer::ELULayer(const std::string& name): LayerDecorator("ELU", name) {
+    getLayer()->getOutputPorts().resize(1);
+    getLayer()->getInputPorts().resize(1);
     setAlpha(1);
 }
 
-Builder::ELULayer::ELULayer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "ELU"))
-        THROW_IE_EXCEPTION << "Cannot create ELULayer decorator for layer " << getLayer().getType();
+Builder::ELULayer::ELULayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("ELU");
+}
+
+Builder::ELULayer::ELULayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("ELU");
 }
 
 Builder::ELULayer& Builder::ELULayer::setName(const std::string& name) {
-    getLayer().getName() = name;
+    getLayer()->setName(name);
     return *this;
 }
 
 const Port& Builder::ELULayer::getPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getOutputPorts()[0];
 }
 
 Builder::ELULayer& Builder::ELULayer::setPort(const Port &port) {
-    getLayer().getOutputPorts()[0] = port;
-    getLayer().getInputPorts()[0] = port;
+    getLayer()->getOutputPorts()[0] = port;
+    getLayer()->getInputPorts()[0] = port;
     return *this;
 }
 
 float Builder::ELULayer::getAlpha() const {
-    return getLayer().getParameters()["alpha"].asFloat();
+    return getLayer()->getParameters().at("alpha");
 }
 
 Builder::ELULayer& Builder::ELULayer::setAlpha(float alpha) {
-    getLayer().getParameters()["alpha"] = alpha;
+    getLayer()->getParameters()["alpha"] = alpha;
     return *this;
 }
 
+REG_VALIDATOR_FOR(ELU, [] (const InferenceEngine::Builder::Layer::CPtr& input_layer, bool partial) {
+    if (!input_layer->getInputPorts().empty() &&
+        !input_layer->getOutputPorts().empty() &&
+        !input_layer->getInputPorts()[0].shape().empty() &&
+        !input_layer->getOutputPorts()[0].shape().empty() &&
+        input_layer->getInputPorts()[0].shape() != input_layer->getOutputPorts()[0].shape()) {
+        THROW_IE_EXCEPTION << "Input and output ports should be equal";
+    }
+    Builder::ELULayer layer(input_layer);
+    if (layer.getAlpha() < 0) {
+        THROW_IE_EXCEPTION << "Alpha should be >= 0";
+    }
+});
+
+REG_CONVERTER_FOR(ELU, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    layer.getParameters()["alpha"] = cnnLayer->GetParamAsFloat("alpha", 0);
+});
+
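The extended Eltwise operation set above round-trips through the "operation" string parameter. A sketch (assuming the EltwiseType enum declared in ie_eltwise_layer.hpp; the layer name is illustrative):

InferenceEngine::Builder::EltwiseLayer eltwise("eltwise1");
// 2019 R1 adds sub, div, min and squared_diff to the existing max/sum/mul;
// setEltwiseType() serializes the choice into parameters()["operation"].
eltwise.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::SQUARED_DIFF);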
diff --git a/inference-engine/src/inference_engine/builders/ie_fully_connected_layer.cpp b/inference-engine/src/inference_engine/builders/ie_fully_connected_layer.cpp
index 1abe7b8..cb78799 100644
--- a/inference-engine/src/inference_engine/builders/ie_fully_connected_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_fully_connected_layer.cpp
@@ -1,62 +1,66 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_fully_connected_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
 #include <vector>
 #include <string>
 
 using namespace InferenceEngine;
 
-Builder::FullyConnectedLayer::FullyConnectedLayer(const std::string& name): LayerFragment("FullyConnected", name) {
-    getLayer().getInputPorts().resize(1);
-    getLayer().getOutputPorts().resize(1);
-    getLayer().getParameters()["out-size"] = 0;
+Builder::FullyConnectedLayer::FullyConnectedLayer(const std::string& name): LayerDecorator("FullyConnected", name) {
+    getLayer()->getInputPorts().resize(3);
+    getLayer()->getInputPorts()[1].setParameter("type", "weights");
+    getLayer()->getInputPorts()[2].setParameter("type", "biases");
+    getLayer()->getOutputPorts().resize(1);
+    getLayer()->getParameters()["out-size"] = 0;
 }
 
-Builder::FullyConnectedLayer::FullyConnectedLayer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "FullyConnected"))
-        THROW_IE_EXCEPTION << "Cannot create FullyConnectedLayer decorator for layer " << getLayer().getType();
+Builder::FullyConnectedLayer::FullyConnectedLayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("FullyConnected");
 }
 
-Builder::FullyConnectedLayer &Builder::FullyConnectedLayer::setName(const std::string &name) {
-    getLayer().getName() = name;
-    return *this;
+Builder::FullyConnectedLayer::FullyConnectedLayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("FullyConnected");
 }
 
-Builder::FullyConnectedLayer& Builder::FullyConnectedLayer::setWeights(const Blob::CPtr& weights) {
-    getLayer().addConstantData("weights", weights);
-    return *this;
-}
-Builder::FullyConnectedLayer& Builder::FullyConnectedLayer::setBiases(const Blob::CPtr& biases) {
-    getLayer().addConstantData("biases", biases);
+Builder::FullyConnectedLayer &Builder::FullyConnectedLayer::setName(const std::string &name) {
+    getLayer()->setName(name);
     return *this;
 }
 
 const Port& Builder::FullyConnectedLayer::getInputPort() const {
-    return getLayer().getInputPorts()[0];
+    return getLayer()->getInputPorts()[0];
 }
 
 Builder::FullyConnectedLayer& Builder::FullyConnectedLayer::setInputPort(const Port& port) {
-    getLayer().getInputPorts()[0] = port;
+    getLayer()->getInputPorts()[0] = port;
     return *this;
 }
 
 const Port& Builder::FullyConnectedLayer::getOutputPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getOutputPorts()[0];
 }
 
 Builder::FullyConnectedLayer& Builder::FullyConnectedLayer::setOutputPort(const Port& port) {
-    getLayer().getOutputPorts()[0] = port;
+    getLayer()->getOutputPorts()[0] = port;
     return *this;
 }
 
 size_t Builder::FullyConnectedLayer::getOutputNum() const {
-    return getLayer().getParameters()["out-size"].asUInt();
+    return getLayer()->getParameters().at("out-size");
 }
+
 Builder::FullyConnectedLayer& Builder::FullyConnectedLayer::setOutputNum(size_t outNum) {
-    getLayer().getParameters()["out-size"] = outNum;
+    getLayer()->getParameters()["out-size"] = outNum;
     return *this;
 }
+
+REG_VALIDATOR_FOR(FullyConnected, [](const InferenceEngine::Builder::Layer::CPtr& layer, bool partial) {
+});
+
+REG_CONVERTER_FOR(FullyConnected, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    layer.getParameters()["out-size"] = static_cast<size_t>(cnnLayer->GetParamAsUInt("out-size", 0));
+});
diff --git a/inference-engine/src/inference_engine/builders/ie_grn_layer.cpp b/inference-engine/src/inference_engine/builders/ie_grn_layer.cpp
index 1cc1a7a..afa362c 100644
--- a/inference-engine/src/inference_engine/builders/ie_grn_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_grn_layer.cpp
@@ -1,45 +1,52 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_grn_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
 #include <string>
 
 using namespace InferenceEngine;
 
-Builder::GRNLayer::GRNLayer(const std::string& name): LayerFragment("GRN", name) {
-    getLayer().getOutputPorts().resize(1);
-    getLayer().getInputPorts().resize(1);
+Builder::GRNLayer::GRNLayer(const std::string& name): LayerDecorator("GRN", name) {
+    getLayer()->getOutputPorts().resize(1);
+    getLayer()->getInputPorts().resize(1);
     setBeta(0);
 }
 
-Builder::GRNLayer::GRNLayer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "GRN"))
-        THROW_IE_EXCEPTION << "Cannot create GRNLayer decorator for layer " << getLayer().getType();
+Builder::GRNLayer::GRNLayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("GRN");
+}
+
+Builder::GRNLayer::GRNLayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("GRN");
 }
 
 Builder::GRNLayer& Builder::GRNLayer::setName(const std::string& name) {
-    getLayer().getName() = name;
+    getLayer()->setName(name);
     return *this;
 }
 
 const Port& Builder::GRNLayer::getPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getOutputPorts()[0];
 }
 
 Builder::GRNLayer& Builder::GRNLayer::setPort(const Port &port) {
-    getLayer().getOutputPorts()[0] = port;
-    getLayer().getInputPorts()[0] = port;
+    getLayer()->getOutputPorts()[0] = port;
+    getLayer()->getInputPorts()[0] = port;
     return *this;
 }
 
 float Builder::GRNLayer::getBeta() const {
-    return getLayer().getParameters()["beta"].asFloat();
+    return getLayer()->getParameters().at("beta");
 }
 
 Builder::GRNLayer& Builder::GRNLayer::setBeta(float beta) {
-    getLayer().getParameters()["beta"] = beta;
+    getLayer()->getParameters()["beta"] = beta;
     return *this;
 }
+
+REG_CONVERTER_FOR(GRN, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    layer.getParameters()["beta"] = static_cast<float>(cnnLayer->GetParamAsFloat("beta"));
+});
\ No newline at end of file
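The GRN file above is the smallest complete example of the converter mechanism; the same REG_CONVERTER_FOR / REG_VALIDATOR_FOR pattern recurs in every builder source in this patch. A schematic sketch for a hypothetical layer type Foo (the type and the "bar" parameter are invented for illustration):

REG_CONVERTER_FOR(Foo, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
    // copy IR attributes into the builder's Parameter map
    layer.getParameters()["bar"] = cnnLayer->GetParamAsFloat("bar", 1.0f);
});

REG_VALIDATOR_FOR(Foo, [](const InferenceEngine::Builder::Layer::CPtr& layer, bool partial) {
    if (layer->getParameters().at("bar").as<float>() < 0)
        THROW_IE_EXCEPTION << "bar should be >= 0";
});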
diff --git a/inference-engine/src/inference_engine/builders/ie_gru_sequence_layer.cpp b/inference-engine/src/inference_engine/builders/ie_gru_sequence_layer.cpp
new file mode 100644
index 0000000..3197686
--- /dev/null
+++ b/inference-engine/src/inference_engine/builders/ie_gru_sequence_layer.cpp
@@ -0,0 +1,126 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_gru_sequence_layer.hpp>
+#include <ie_cnn_layer_builder.h>
+
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::GRUSequenceLayer::GRUSequenceLayer(const std::string& name): LayerDecorator("GRUSequence", name) {
+    getLayer()->getOutputPorts().resize(2);
+    getLayer()->getInputPorts().resize(5);
+    getLayer()->getInputPorts()[1].setParameter("type", "weights");
+    getLayer()->getInputPorts()[2].setParameter("type", "biases");
+    getLayer()->getInputPorts()[3].setParameter("type", "optional");
+}
+
+Builder::GRUSequenceLayer::GRUSequenceLayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("GRUSequence");
+}
+
+Builder::GRUSequenceLayer::GRUSequenceLayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("GRUSequence");
+}
+
+Builder::GRUSequenceLayer& Builder::GRUSequenceLayer::setName(const std::string& name) {
+    getLayer()->setName(name);
+    return *this;
+}
+
+const std::vector<Port>& Builder::GRUSequenceLayer::getInputPorts() const {
+    return getLayer()->getInputPorts();
+}
+
+Builder::GRUSequenceLayer& Builder::GRUSequenceLayer::setInputPorts(const std::vector<Port>& ports) {
+    getLayer()->getInputPorts() = ports;
+    return *this;
+}
+
+const std::vector<Port>& Builder::GRUSequenceLayer::getOutputPorts() const {
+    return getLayer()->getOutputPorts();
+}
+
+Builder::GRUSequenceLayer& Builder::GRUSequenceLayer::setOutputPorts(const std::vector<Port>& ports) {
+    getLayer()->getOutputPorts() = ports;
+    return *this;
+}
+int Builder::GRUSequenceLayer::getHiddenSize() const {
+    return getLayer()->getParameters().at("hidden_size");
+}
+Builder::GRUSequenceLayer& Builder::GRUSequenceLayer::setHiddenSize(int size) {
+    getLayer()->getParameters()["hidden_size"] = size;
+    return *this;
+}
+bool Builder::GRUSequenceLayer::getSequenceDim() const {
+    return getLayer()->getParameters().at("sequence_dim");
+}
+Builder::GRUSequenceLayer& Builder::GRUSequenceLayer::setSqquenceDim(bool flag) {
+    getLayer()->getParameters()["sequence_dim"] = flag;
+    return *this;
+}
+const std::vector<std::string>& Builder::GRUSequenceLayer::getActivations() const {
+    return getLayer()->getParameters().at("activations");
+}
+Builder::GRUSequenceLayer& Builder::GRUSequenceLayer::setActivations(const std::vector<std::string>& activations) {
+    getLayer()->getParameters()["activations"] = activations;
+    return *this;
+}
+const std::vector<float>& Builder::GRUSequenceLayer::getActivationsAlpha() const {
+    return getLayer()->getParameters().at("activations_alpha");
+}
+Builder::GRUSequenceLayer& Builder::GRUSequenceLayer::setActivationsAlpha(const std::vector<float>& activations) {
+    getLayer()->getParameters()["activations_alpha"] = activations;
+    return *this;
+}
+const std::vector<float>& Builder::GRUSequenceLayer::getActivationsBeta() const {
+    return getLayer()->getParameters().at("activations_beta");
+}
+Builder::GRUSequenceLayer& Builder::GRUSequenceLayer::setActivationsBeta(const std::vector<float>& activations) {
+    getLayer()->getParameters()["activations_beta"] = activations;
+    return *this;
+}
+float Builder::GRUSequenceLayer::getClip() const {
+    return getLayer()->getParameters().at("clip");
+}
+Builder::GRUSequenceLayer& Builder::GRUSequenceLayer::setClip(float clip) {
+    getLayer()->getParameters()["clip"] = clip;
+    return *this;
+}
+
+bool Builder::GRUSequenceLayer::getLinearBeforeReset() const {
+    return getLayer()->getParameters().at("linear_before_reset");
+}
+Builder::GRUSequenceLayer& Builder::GRUSequenceLayer::setLinearBeforeReset(bool flag) {
+    getLayer()->getParameters()["linear_before_reset"] = flag;
+    return *this;
+}
+const std::string& Builder::GRUSequenceLayer::getDirection() const {
+    return getLayer()->getParameters().at("direction");
+}
+Builder::GRUSequenceLayer& Builder::GRUSequenceLayer::setDirection(const std::string& direction) {
+    getLayer()->getParameters()["direction"] = direction;
+    return *this;
+}
+
+REG_CONVERTER_FOR(GRUSequence, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    layer.getParameters()["hidden_size"] = cnnLayer->GetParamAsInt("hidden_size");
+    layer.getParameters()["sequence_dim"] = cnnLayer->GetParamsAsBool("sequence_dim", true);
+    std::vector<std::string> activations;
+    std::istringstream stream(cnnLayer->GetParamAsString("activations"));
+    std::string str;
+    while (getline(stream, str, ',')) {
+        activations.push_back(str);
+    }
+    layer.getParameters()["activations"] = activations;
+    layer.getParameters()["activations_alpha"] = cnnLayer->GetParamAsFloats("activations_alpha");
+    layer.getParameters()["activations_beta"] = cnnLayer->GetParamAsFloats("activations_beta");
+    layer.getParameters()["clip"] = cnnLayer->GetParamAsFloat("clip");
+    layer.getParameters()["linear_before_reset"] = cnnLayer->GetParamsAsBool("linear_before_reset", true);
+    layer.getParameters()["direction"] = cnnLayer->GetParamAsString("direction", "");
+});
+
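The GRUSequence converter above splits the comma-separated "activations" attribute by hand. The same logic as a standalone, runnable helper (the function name is illustrative):

#include <sstream>
#include <string>
#include <vector>

// "sigmoid,tanh" -> {"sigmoid", "tanh"}
std::vector<std::string> splitActivations(const std::string& csv) {
    std::vector<std::string> out;
    std::istringstream stream(csv);
    std::string item;
    while (std::getline(stream, item, ','))
        out.push_back(item);
    return out;
}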
diff --git a/inference-engine/src/inference_engine/builders/ie_input_layer_layer.cpp b/inference-engine/src/inference_engine/builders/ie_input_layer_layer.cpp
index e7e099f..3b06293 100644
--- a/inference-engine/src/inference_engine/builders/ie_input_layer_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_input_layer_layer.cpp
@@ -1,40 +1,40 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_input_layer.hpp>
-#include <details/caseless.hpp>
 #include <string>
 
 using namespace InferenceEngine;
 
-Builder::InputLayer::InputLayer(const std::string& name): LayerFragment("Input", name) {
-    getLayer().getOutputPorts().resize(1);
+Builder::InputLayer::InputLayer(const std::string& name): LayerDecorator("Input", name) {
+    getLayer()->getOutputPorts().resize(1);
 }
 
-Builder::InputLayer::InputLayer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Input"))
-        THROW_IE_EXCEPTION << "Cannot create InputLayer decorator for layer " << getLayer().getType();
+Builder::InputLayer::InputLayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("Input");
+}
+
+Builder::InputLayer::InputLayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("Input");
 }
 
 Builder::InputLayer& Builder::InputLayer::setName(const std::string& name) {
-    getLayer().getName() = name;
+    getLayer()->setName(name);
     return *this;
 }
 
 const Port& Builder::InputLayer::getPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getOutputPorts()[0];
 }
 
 Builder::InputLayer& Builder::InputLayer::setPort(const Port &port) {
-    getLayer().getOutputPorts()[0] = port;
+    getLayer()->getOutputPorts()[0] = port;
     return *this;
 }
 
-void Builder::InputLayer::validate(const Layer& layer) {
-    if (layer.getOutputPorts()[0].shape().empty())
-        THROW_IE_EXCEPTION << layer.getType() << " node " << layer.getName() << " should have shape!";
-}
-
-REG_VALIDATOR_FOR(Input, Builder::InputLayer::validate);
\ No newline at end of file
+REG_VALIDATOR_FOR(Input, [] (const InferenceEngine::Builder::Layer::CPtr& layer, bool partial) {
+    if (layer->getOutputPorts()[0].shape().empty())
+        THROW_IE_EXCEPTION << layer->getType() << " node " << layer->getName() << " should have shape!";
+});
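The Input validator above illustrates the general migration in this patch: static validate(const Layer&) members become registered lambdas that take a Layer::CPtr plus a partial flag. A usage sketch (assuming Port can be constructed from a shape vector; the layer name is illustrative):

InferenceEngine::Builder::InputLayer input("data");
input.setPort(InferenceEngine::Port({1, 3, 224, 224}));
InferenceEngine::Builder::Layer::Ptr layer = input;  // conversion runs validate(true)
layer->validate(false);                              // full, non-partial check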
diff --git a/inference-engine/src/inference_engine/builders/ie_layer_builder.cpp b/inference-engine/src/inference_engine/builders/ie_layer_builder.cpp
index a65dd7c..99af91c 100644
--- a/inference-engine/src/inference_engine/builders/ie_layer_builder.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_layer_builder.cpp
@@ -1,10 +1,9 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include
 #include
-#include
 #include
 #include
 
@@ -14,71 +13,43 @@
 
 using namespace InferenceEngine;
 
-Builder::Layer::Layer(const std::string& type, const std::string& name): id((std::numeric_limits<idx_t>::max)()), type(type), name(name) {}
+Builder::Layer::Layer(const std::string& type, const std::string& name):
+    name(name), type(type), id((std::numeric_limits<idx_t>::max)()) {}
 
-Builder::Layer::Layer(const ILayer::Ptr& layer) {
-    id = layer->getId();
-    getType() = layer->getType();
-    getName() = layer->getName();
-    getGraph() = layer->getGraph();
-    getParameters() = layer->getParameters()->getParameters();
-    getInputPorts() = layer->getInputPorts();
-    getOutputPorts() = layer->getOutputPorts();
-    getConstantData() = layer->getParameters()->getConstantData();
-}
 
 Builder::Layer::Layer(const ILayer::CPtr& layer) {
     id = layer->getId();
-    getType() = layer->getType();
-    getName() = layer->getName();
-    getGraph() = layer->getGraph();
-    getParameters() = layer->getParameters()->getParameters();
-    getInputPorts() = layer->getInputPorts();
-    getOutputPorts() = layer->getOutputPorts();
-    getConstantData() = layer->getParameters()->getConstantData();
+    name = layer->getName();
+    type = layer->getType();
+    inPorts = layer->getInputPorts();
+    outPorts = layer->getOutputPorts();
+    params = layer->getParameters();
 }
 
 Builder::Layer::Layer(idx_t id, const Builder::Layer& layer): Layer(layer) {
     this->id = id;
 }
 
-idx_t Builder::Layer::getId() const {
+idx_t Builder::Layer::getId() const noexcept {
     return id;
 }
 
-std::string& Builder::Layer::getType() {
-    return type;
-}
-const std::string& Builder::Layer::getType() const {
+const std::string& Builder::Layer::getType() const noexcept {
     return type;
 }
 Builder::Layer& Builder::Layer::setType(const std::string& type) {
-    getType() = type;
+    this->type = type;
     return *this;
 }
 
-std::string& Builder::Layer::getName() {
-    return name;
-}
-const std::string& Builder::Layer::getName() const {
+const std::string& Builder::Layer::getName() const noexcept {
    return name;
 }
 Builder::Layer& Builder::Layer::setName(const std::string& name) {
-    getName() = name;
-    return *this;
-}
-
-INetwork::Ptr& Builder::Layer::getGraph() {
-    return graph;
-}
-const INetwork::Ptr& Builder::Layer::getGraph() const {
-    return graph;
-}
-Builder::Layer& Builder::Layer::setGraph(const INetwork::Ptr& graph) {
-    getGraph() = graph;
+    this->name = name;
     return *this;
 }
 
-const std::map<std::string, Parameter>& Builder::Layer::getParameters() const {
+const std::map<std::string, Parameter>& Builder::Layer::getParameters() const noexcept {
     return params;
 }
 std::map<std::string, Parameter>& Builder::Layer::getParameters() {
@@ -89,30 +60,10 @@ Builder::Layer& Builder::Layer::setParameters(const std::map<std::string, Param
-const std::map<std::string, Blob::CPtr>& Builder::Layer::getConstantData() const {
-    return constData;
-}
-std::map<std::string, Blob::CPtr>& Builder::Layer::getConstantData() {
-    return constData;
-}
-Builder::Layer& Builder::Layer::setConstantData(const std::map<std::string, Blob::Ptr>& constData) {
-    for (const auto& it : constData)
-        addConstantData(it.first, it.second);
-    return *this;
-}
-Builder::Layer& Builder::Layer::setConstantData(const std::map<std::string, Blob::CPtr>& constData) {
-    getConstantData() = constData;
-    return *this;
-}
-Builder::Layer& Builder::Layer::addConstantData(const std::string& name, const Blob::CPtr& data) {
-    getConstantData()[name] = data;
-    return *this;
-}
-
 std::vector<Port>& Builder::Layer::getInputPorts() {
     return inPorts;
 }
-const std::vector<Port>& Builder::Layer::getInputPorts() const {
+const std::vector<Port>& Builder::Layer::getInputPorts() const noexcept {
     return inPorts;
 }
 Builder::Layer& Builder::Layer::setInputPorts(const std::vector<Port> &ports) {
@@ -123,7 +74,7 @@ Builder::Layer& Builder::Layer::setInputPorts(const std::vector<Port> &ports) {
 std::vector<Port>& Builder::Layer::getOutputPorts() {
     return outPorts;
 }
-const std::vector<Port>& Builder::Layer::getOutputPorts() const {
+const std::vector<Port>& Builder::Layer::getOutputPorts() const noexcept {
     return outPorts;
 }
 Builder::Layer& Builder::Layer::setOutputPorts(const std::vector<Port> &ports) {
@@ -131,29 +82,20 @@ Builder::Layer& Builder::Layer::setOutputPorts(const std::vector<Port> &ports) {
     return *this;
 }
 
-const ILayer::Ptr Builder::Layer::build() const {
-    validate();
-    details::Layer::Ptr layer = std::make_shared<details::Layer>(id);
-
-    layer->getName() = name;
-    layer->getType() = type;
-    layer->setGraph(graph);
-    layer->getInputPorts() = inPorts;
-    layer->getOutputPorts() = outPorts;
-    layer->getParameters()->getParameters() = params;
-    layer->getParameters()->getConstantData() = constData;
-    return std::static_pointer_cast<ILayer>(layer);
+const ILayer::CPtr Builder::Layer::build() const {
+    validate(true);
+    return std::static_pointer_cast<const ILayer>(shared_from_this());
 }
 
-void Builder::Layer::addValidator(const std::string &type, const std::function<void(const Layer&)>& validator) {
+void Builder::Layer::addValidator(const std::string &type, const std::function<void(const Layer::CPtr&, bool)>& validator) {
     auto holder = getValidatorsHolder();
     if (holder->validators.find(type) == holder->validators.end())
         holder->validators[type] = validator;
 }
 
-void Builder::Layer::validate() const {
+void Builder::Layer::validate(bool partial) const {
     if (getValidatorsHolder()->validators.find(type) != getValidatorsHolder()->validators.end())
-        getValidatorsHolder()->validators[type](*this);
+        getValidatorsHolder()->validators[type](shared_from_this(), partial);
 }
 
 std::shared_ptr<Builder::ValidatorsHolder> Builder::Layer::getValidatorsHolder() {
diff --git a/inference-engine/src/inference_engine/builders/ie_layer_decorator.cpp b/inference-engine/src/inference_engine/builders/ie_layer_decorator.cpp
new file mode 100644
index 0000000..d01bc97
--- /dev/null
+++ b/inference-engine/src/inference_engine/builders/ie_layer_decorator.cpp
@@ -0,0 +1,71 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_layer_decorator.hpp>
+#include <details/caseless.hpp>
+#include <string>
+#include <vector>
+#include <memory>
+
+using namespace InferenceEngine;
+using namespace details;
+
+Builder::LayerDecorator::LayerDecorator(const std::string& type, const std::string& name) {
+    layer = std::make_shared<Layer>(type, name);
+}
+
+Builder::LayerDecorator::LayerDecorator(const Layer::Ptr& layer): layer(layer) {}
+Builder::LayerDecorator::LayerDecorator(const Layer::CPtr& layer): cLayer(layer) {}
+
+Builder::LayerDecorator::LayerDecorator(const Builder::LayerDecorator & rval) {
+    *this = rval;
+}
+
+Builder::LayerDecorator &Builder::LayerDecorator::operator=(const Builder::LayerDecorator &rval) {
+    layer = rval.layer;
+    cLayer = rval.cLayer;
+    return *this;
+}
+
+Builder::LayerDecorator::operator Builder::Layer() const {
+    getLayer()->validate(true);
+    return *getLayer();
+}
+
+Builder::LayerDecorator::operator Builder::Layer::Ptr() {
+    getLayer()->validate(true);
+    return getLayer();
+}
+
+Builder::LayerDecorator::operator Builder::Layer::CPtr() const {
+    getLayer()->validate(true);
+    return getLayer();
+}
+
+const std::string& Builder::LayerDecorator::getType() const {
+    return getLayer()->getType();
+}
+const std::string& Builder::LayerDecorator::getName() const {
+    return getLayer()->getName();
+}
+
+Builder::Layer::Ptr& Builder::LayerDecorator::getLayer() {
+    if (!layer)
+        THROW_IE_EXCEPTION << "Cannot get Layer::Ptr!";
+    return layer;
+}
+
+const Builder::Layer::CPtr Builder::LayerDecorator::getLayer() const {
+    if (!cLayer) {
+        if (!layer)
+            THROW_IE_EXCEPTION << "Cannot get Layer::CPtr!";
+        return std::static_pointer_cast<const Layer>(layer);
+    }
+    return cLayer;
+}
+
+void Builder::LayerDecorator::checkType(const std::string& type) const {
+    if (!details::CaselessEq<std::string>()(getLayer()->getType(), type))
+        THROW_IE_EXCEPTION << "Cannot create " << type << " decorator for layer " << getLayer()->getType();
+}
diff --git a/inference-engine/src/inference_engine/builders/ie_layer_fragment.cpp b/inference-engine/src/inference_engine/builders/ie_layer_fragment.cpp
deleted file mode 100644
index 8cefe78..0000000
--- a/inference-engine/src/inference_engine/builders/ie_layer_fragment.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-// Copyright (C) 2018 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include <builders/ie_layer_fragment.hpp>
-
-#include <string>
-#include <vector>
-
-using namespace InferenceEngine;
-using namespace details;
-
-Builder::LayerFragment::LayerFragment(const std::string& type, const std::string& name): layer(type, name), refLayer(layer) {}
-
-Builder::LayerFragment::LayerFragment(Layer& genLayer): layer("", ""), refLayer(genLayer) {}
-
-Builder::LayerFragment &Builder::LayerFragment::operator=(const Builder::LayerFragment &rval) {
-    layer = rval.layer;
-    refLayer = rval.refLayer;
-    if (!layer.getType().empty() && !layer.getName().empty())
-        refLayer = layer;
-    return *this;
-}
-
-Builder::LayerFragment::LayerFragment(const Builder::LayerFragment & rval): LayerFragment("", "") {
-    *this = rval;
-}
-
-Builder::LayerFragment::operator Builder::Layer() const {
-    getLayer().validate();
-    return getLayer();
-}
-
-const std::string& Builder::LayerFragment::getType() const {
-    return getLayer().getType();
-}
-const std::string& Builder::LayerFragment::getName() const {
-    return getLayer().getName();
-}
-
-Builder::Layer& Builder::LayerFragment::getLayer() const {
-    return refLayer;
-}
-
-const std::vector<size_t> Builder::LayerFragment::uInts2size_t(const std::vector<unsigned int>& vector) const {
-    std::vector<size_t> newVector;
-    newVector.reserve(vector.size());
-    for (const auto& it : vector) {
-        newVector.push_back(it);
-    }
-    return newVector;
-}
a/inference-engine/src/inference_engine/builders/ie_lrn_layer.cpp b/inference-engine/src/inference_engine/builders/ie_lrn_layer.cpp new file mode 100644 index 0000000..8bd20a7 --- /dev/null +++ b/inference-engine/src/inference_engine/builders/ie_lrn_layer.cpp @@ -0,0 +1,105 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +#include + +using namespace InferenceEngine; + +Builder::LRNLayer::LRNLayer(const std::string& name): LayerDecorator("LRN", name) { + getLayer()->getOutputPorts().resize(1); + getLayer()->getInputPorts().resize(1); + setSize(1); + setAlpha(1e-4); + setBeta(0.75f); + setBias(1.0f); +} + +Builder::LRNLayer::LRNLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("LRN"); +} + +Builder::LRNLayer::LRNLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("LRN"); +} + +Builder::LRNLayer& Builder::LRNLayer::setName(const std::string& name) { + getLayer()->setName(name); + return *this; +} + +const Port& Builder::LRNLayer::getPort() const { + return getLayer()->getOutputPorts()[0]; +} + +Builder::LRNLayer& Builder::LRNLayer::setPort(const Port &port) { + getLayer()->getOutputPorts()[0] = port; + getLayer()->getInputPorts()[0] = port; + return *this; +} + +size_t Builder::LRNLayer::getSize() const { + return getLayer()->getParameters().at("size"); +} + +Builder::LRNLayer& Builder::LRNLayer::setSize(size_t size) { + getLayer()->getParameters()["size"] = size; + return *this; +} + +float Builder::LRNLayer::getAlpha() const { + return getLayer()->getParameters().at("alpha"); +} + +Builder::LRNLayer& Builder::LRNLayer::setAlpha(float alpha) { + getLayer()->getParameters()["alpha"] = alpha; + return *this; +} + +float Builder::LRNLayer::getBeta() const { + return getLayer()->getParameters().at("beta"); +} + +Builder::LRNLayer& Builder::LRNLayer::setBeta(float beta) { + getLayer()->getParameters()["beta"] = beta; + return *this; +} + +float Builder::LRNLayer::getBias() const { + return getLayer()->getParameters().at("bias"); +} + +Builder::LRNLayer& Builder::LRNLayer::setBias(float bias) { + getLayer()->getParameters()["bias"] = bias; + return *this; +} + +REG_VALIDATOR_FOR(LRN, [](const Builder::Layer::CPtr &input_layer, bool partial) { + Builder::LRNLayer layer(input_layer); + if (layer.getAlpha() <= 0) { + THROW_IE_EXCEPTION << "Alpha should be > 0"; + } + if (layer.getBeta() <= 0) { + THROW_IE_EXCEPTION << "Beta should be > 0"; + } + if (layer.getSize() == 0) { + THROW_IE_EXCEPTION << "Size should be > 0"; + } + if (!input_layer->getInputPorts().empty() && + !input_layer->getOutputPorts().empty() && + !input_layer->getInputPorts()[0].shape().empty() && + !input_layer->getOutputPorts()[0].shape().empty() && + input_layer->getInputPorts()[0].shape() != input_layer->getOutputPorts()[0].shape()) { + THROW_IE_EXCEPTION << "Input and output ports should be equal"; + } +}); + +REG_CONVERTER_FOR(LRN, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["bias"] = cnnLayer->GetParamAsFloat("bias", 1.0f); + layer.getParameters()["beta"] = cnnLayer->GetParamAsFloat("beta", 0.75f); + layer.getParameters()["alpha"] = cnnLayer->GetParamAsFloat("alpha", 1e-4f); + layer.getParameters()["size"] = cnnLayer->GetParamAsUInt("size", 1); +}); \ No newline at end of file diff --git a/inference-engine/src/inference_engine/builders/ie_lstm_sequence_layer.cpp b/inference-engine/src/inference_engine/builders/ie_lstm_sequence_layer.cpp new file mode 100644 index 0000000..c856368 --- /dev/null 
+++ b/inference-engine/src/inference_engine/builders/ie_lstm_sequence_layer.cpp @@ -0,0 +1,127 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +#include +#include + +using namespace InferenceEngine; + +Builder::LSTMSequenceLayer::LSTMSequenceLayer(const std::string& name): LayerDecorator("LSTMSequence", name) { + getLayer()->getOutputPorts().resize(3); + getLayer()->getInputPorts().resize(7); + getLayer()->getInputPorts()[1].setParameter("type", "weights"); + getLayer()->getInputPorts()[2].setParameter("type", "biases"); + getLayer()->getInputPorts()[3].setParameter("type", "optional"); + getLayer()->getInputPorts()[6].setParameter("type", "weights"); +} + +Builder::LSTMSequenceLayer::LSTMSequenceLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("LSTMSequence"); +} + +Builder::LSTMSequenceLayer::LSTMSequenceLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("LSTMSequence"); +} + +Builder::LSTMSequenceLayer& Builder::LSTMSequenceLayer::setName(const std::string& name) { + getLayer()->setName(name); + return *this; +} + +const std::vector& Builder::LSTMSequenceLayer::getInputPorts() const { + return getLayer()->getInputPorts(); +} + +Builder::LSTMSequenceLayer& Builder::LSTMSequenceLayer::setInputPorts(const std::vector& ports) { + getLayer()->getInputPorts() = ports; + return *this; +} + +const std::vector& Builder::LSTMSequenceLayer::getOutputPorts() const { + return getLayer()->getOutputPorts(); +} + +Builder::LSTMSequenceLayer& Builder::LSTMSequenceLayer::setOutputPorts(const std::vector& ports) { + getLayer()->getOutputPorts() = ports; + return *this; +} +int Builder::LSTMSequenceLayer::getHiddenSize() const { + return getLayer()->getParameters().at("hidden_size"); +} +Builder::LSTMSequenceLayer& Builder::LSTMSequenceLayer::setHiddenSize(int size) { + getLayer()->getParameters()["hidden_size"] = size; + return *this; +} +bool Builder::LSTMSequenceLayer::getSequenceDim() const { + return getLayer()->getParameters().at("sequence_dim"); +} +Builder::LSTMSequenceLayer& Builder::LSTMSequenceLayer::setSqquenceDim(bool flag) { + getLayer()->getParameters()["sequence_dim"] = flag; + return *this; +} +const std::vector& Builder::LSTMSequenceLayer::getActivations() const { + return getLayer()->getParameters().at("activations"); +} +Builder::LSTMSequenceLayer& Builder::LSTMSequenceLayer::setActivations(const std::vector& activations) { + getLayer()->getParameters()["activations"] = activations; + return *this; +} +const std::vector& Builder::LSTMSequenceLayer::getActivationsAlpha() const { + return getLayer()->getParameters().at("activations_alpha"); +} +Builder::LSTMSequenceLayer& Builder::LSTMSequenceLayer::setActivationsAlpha(const std::vector& activations) { + getLayer()->getParameters()["activations_alpha"] = activations; + return *this; +} +const std::vector& Builder::LSTMSequenceLayer::getActivationsBeta() const { + return getLayer()->getParameters().at("activations_beta"); +} +Builder::LSTMSequenceLayer& Builder::LSTMSequenceLayer::setActivationsBeta(const std::vector& activations) { + getLayer()->getParameters()["activations_beta"] = activations; + return *this; +} +float Builder::LSTMSequenceLayer::getClip() const { + return getLayer()->getParameters().at("clip"); +} +Builder::LSTMSequenceLayer& Builder::LSTMSequenceLayer::setClip(float clip) { + getLayer()->getParameters()["clip"] = clip; + return *this; +} + +bool Builder::LSTMSequenceLayer::getInputForget() const { + return 
getLayer()->getParameters().at("input_forget"); +} +Builder::LSTMSequenceLayer& Builder::LSTMSequenceLayer::setInputForget(bool flag) { + getLayer()->getParameters()["input_forget"] = flag; + return *this; +} +const std::string& Builder::LSTMSequenceLayer::getDirection() const { + return getLayer()->getParameters().at("direction"); +} +Builder::LSTMSequenceLayer& Builder::LSTMSequenceLayer::setDirection(const std::string& direction) { + getLayer()->getParameters()["direction"] = direction; + return *this; +} + +REG_CONVERTER_FOR(LSTMSequence, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["hidden_size"] = cnnLayer->GetParamAsInt("hidden_size"); + layer.getParameters()["sequence_dim"] = cnnLayer->GetParamsAsBool("sequence_dim", true); + std::vector activations; + std::istringstream stream(cnnLayer->GetParamAsString("activations")); + std::string str; + while (getline(stream, str, ',')) { + activations.push_back(str); + } + layer.getParameters()["activations"] = activations; + layer.getParameters()["activations_alpha"] = cnnLayer->GetParamAsFloats("activations_alpha"); + layer.getParameters()["activations_beta"] = cnnLayer->GetParamAsFloats("activations_beta"); + layer.getParameters()["clip"] = cnnLayer->GetParamAsFloat("clip"); + layer.getParameters()["input_forget"] = cnnLayer->GetParamsAsBool("input_forget", true); + layer.getParameters()["direction"] = cnnLayer->GetParamAsString("direction", ""); +}); + + diff --git a/inference-engine/src/inference_engine/builders/ie_memory_layer.cpp b/inference-engine/src/inference_engine/builders/ie_memory_layer.cpp index f987b07..39c0dbf 100644 --- a/inference-engine/src/inference_engine/builders/ie_memory_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_memory_layer.cpp @@ -1,70 +1,83 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include
+#include #include #include using namespace InferenceEngine; -Builder::MemoryLayer::MemoryLayer(const std::string& name): LayerFragment("Memory", name) { - getLayer().getOutputPorts().resize(1); - getLayer().getInputPorts().resize(1); +Builder::MemoryLayer::MemoryLayer(const std::string& name): LayerDecorator("Memory", name) { + getLayer()->getOutputPorts().resize(1); + getLayer()->getInputPorts().resize(1); + setSize(2); } -Builder::MemoryLayer::MemoryLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "Memory")) - THROW_IE_EXCEPTION << "Cannot create MemoryLayer decorator for layer " << getLayer().getType(); +Builder::MemoryLayer::MemoryLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("Memory"); +} + +Builder::MemoryLayer::MemoryLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("Memory"); } Builder::MemoryLayer& Builder::MemoryLayer::setName(const std::string& name) { - getLayer().getName() = name; + getLayer()->setName(name); return *this; } const Port& Builder::MemoryLayer::getInputPort() const { - return getLayer().getInputPorts()[0]; + return getLayer()->getInputPorts()[0]; } Builder::MemoryLayer& Builder::MemoryLayer::setInputPort(const Port &port) { - getLayer().getInputPorts()[0] = port; + getLayer()->getInputPorts()[0] = port; return *this; } const Port& Builder::MemoryLayer::getOutputPort() const { - return getLayer().getOutputPorts()[0]; + return getLayer()->getOutputPorts()[0]; } Builder::MemoryLayer& Builder::MemoryLayer::setOutputPort(const Port &port) { - getLayer().getOutputPorts()[0] = port; + getLayer()->getOutputPorts()[0] = port; return *this; } const std::string Builder::MemoryLayer::getId() const { - return getLayer().getParameters()["id"]; + return getLayer()->getParameters().at("id"); } Builder::MemoryLayer& Builder::MemoryLayer::setId(const std::string& id) { - getLayer().getParameters()["id"] = id; + getLayer()->getParameters()["id"] = id; return *this; } size_t Builder::MemoryLayer::getIndex() const { - return getLayer().getParameters()["index"].asUInt(); + return getLayer()->getParameters().at("index"); } Builder::MemoryLayer& Builder::MemoryLayer::setIndex(size_t index) { if (index > 1) THROW_IE_EXCEPTION << "Index supports only 0 and 1 values."; - getLayer().getParameters()["index"] = index; + getLayer()->getParameters()["index"] = index; return *this; } size_t Builder::MemoryLayer::getSize() const { - return getLayer().getParameters()["size"].asUInt(2); + return getLayer()->getParameters().at("size"); } Builder::MemoryLayer& Builder::MemoryLayer::setSize(size_t size) { if (size != 2) THROW_IE_EXCEPTION << "Only size equal 2 is supported."; - getLayer().getParameters()["size"] = size; + getLayer()->getParameters()["size"] = size; return *this; } +REG_VALIDATOR_FOR(Memory, [](const InferenceEngine::Builder::Layer::CPtr& layer, bool partial) { +}); + +REG_CONVERTER_FOR(Memory, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["id"] = cnnLayer->GetParamAsString("id", 0); + layer.getParameters()["index"] = static_cast(cnnLayer->GetParamAsUInt("index", 0)); + layer.getParameters()["size"] = static_cast(cnnLayer->GetParamAsUInt("size", 0)); +}); + diff --git a/inference-engine/src/inference_engine/builders/ie_mvn_layer.cpp b/inference-engine/src/inference_engine/builders/ie_mvn_layer.cpp index 0211e9f..c81772d 100644 --- a/inference-engine/src/inference_engine/builders/ie_mvn_layer.cpp +++ 
b/inference-engine/src/inference_engine/builders/ie_mvn_layer.cpp @@ -1,60 +1,83 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include
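// (editorial sketch, not part of the patch) Typical use of the reworked
// MVNLayer builder shown in the hunk below: attributes now live in the shared
// Parameter map and are read back with at(), so a missing key throws instead
// of silently yielding a default. The instance name is hypothetical:
Builder::MVNLayer mvn("mvn1");
mvn.setEpsilon(1e-9f).setNormalize(true).setAcrossChannels(false);
float eps = mvn.getEpsilon();  // reads parameters.at("eps")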
+#include #include using namespace InferenceEngine; -Builder::MVNLayer::MVNLayer(const std::string& name): LayerFragment("MVN", name) { - getLayer().getOutputPorts().resize(1); - getLayer().getInputPorts().resize(1); +Builder::MVNLayer::MVNLayer(const std::string& name): LayerDecorator("MVN", name) { + getLayer()->getOutputPorts().resize(1); + getLayer()->getInputPorts().resize(1); setEpsilon(9.999999717180685e-10f); setNormalize(true); setAcrossChannels(true); } -Builder::MVNLayer::MVNLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "MVN")) - THROW_IE_EXCEPTION << "Cannot create MVNLayer decorator for layer " << getLayer().getType(); +Builder::MVNLayer::MVNLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("MVN"); +} + +Builder::MVNLayer::MVNLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("MVN"); } Builder::MVNLayer& Builder::MVNLayer::setName(const std::string& name) { - getLayer().getName() = name; + getLayer()->setName(name); return *this; } const Port& Builder::MVNLayer::getPort() const { - return getLayer().getOutputPorts()[0]; + return getLayer()->getOutputPorts()[0]; } Builder::MVNLayer& Builder::MVNLayer::setPort(const Port &port) { - getLayer().getOutputPorts()[0] = port; - getLayer().getInputPorts()[0] = port; + getLayer()->getOutputPorts()[0] = port; + getLayer()->getInputPorts()[0] = port; return *this; } bool Builder::MVNLayer::getAcrossChannels() const { - return getLayer().getParameters()["across_channels"].asBool(true); + return getLayer()->getParameters().at("across_channels"); } Builder::MVNLayer& Builder::MVNLayer::setAcrossChannels(bool flag) { - getLayer().getParameters()["across_channels"] = flag ? 1 : 0; + getLayer()->getParameters()["across_channels"] = flag ? 1 : 0; return *this; } bool Builder::MVNLayer::getNormalize() const { - return getLayer().getParameters()["normalize_variance"].asBool(true); + return getLayer()->getParameters().at("normalize_variance"); } Builder::MVNLayer& Builder::MVNLayer::setNormalize(bool flag) { - getLayer().getParameters()["normalize_variance"] = flag ? 1 : 0; + getLayer()->getParameters()["normalize_variance"] = flag ? 
1 : 0; return *this; } float Builder::MVNLayer::getEpsilon() const { - return getLayer().getParameters()["eps"].asFloat(); + return getLayer()->getParameters().at("eps"); } Builder::MVNLayer& Builder::MVNLayer::setEpsilon(float eps) { - getLayer().getParameters()["eps"] = eps; + getLayer()->getParameters()["eps"] = eps; return *this; } + +REG_VALIDATOR_FOR(MVN, [](const Builder::Layer::CPtr& input_layer, bool partial) { + Builder::MVNLayer layer(input_layer); + if (layer.getEpsilon() <= 0) { + THROW_IE_EXCEPTION << "Epsilon should be > 0"; + } + if (!input_layer->getInputPorts().empty() && + !input_layer->getOutputPorts().empty() && + !input_layer->getInputPorts()[0].shape().empty() && + !input_layer->getOutputPorts()[0].shape().empty() && + input_layer->getInputPorts()[0].shape() != input_layer->getOutputPorts()[0].shape()) { + THROW_IE_EXCEPTION << "Input and output ports should be equal"; + } +}); + +REG_CONVERTER_FOR(MVN, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["across_channels"] = cnnLayer->GetParamsAsBool("across_channels", 0); + layer.getParameters()["normalize_variance"] = cnnLayer->GetParamsAsBool("normalize_variance", 0); + layer.getParameters()["eps"] = cnnLayer->GetParamAsFloat("eps", 0); +}); \ No newline at end of file diff --git a/inference-engine/src/inference_engine/builders/ie_network_builder.cpp b/inference-engine/src/inference_engine/builders/ie_network_builder.cpp index 70d3cde..2899cfd 100644 --- a/inference-engine/src/inference_engine/builders/ie_network_builder.cpp +++ b/inference-engine/src/inference_engine/builders/ie_network_builder.cpp @@ -1,9 +1,8 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include #include "graph_tools.hpp" #include @@ -33,28 +32,35 @@ Builder::Network::Network(const std::string &name): Builder::Network(Context(), Builder::Network::Network(const INetwork &network): Builder::Network(Context(), network) {} Builder::Network::Network(const ICNNNetwork &network): Builder::Network(Context(), network) {} -Builder::Network::Network(const Context& ieContext, const std::string &name): ctx(ieContext), name(name), version(3) {} +Builder::Network::Network(const Context& ieContext, const std::string &name) { + parameters["name"] = name; + parameters["context"] = ieContext; + parameters["version"] = 3; + parameters["layers"] = std::vector(); + parameters["connections"] = std::vector(); +} -Builder::Network::Network(const Context& ieContext, const INetwork &network): ctx(ieContext), name(network.getName()), version(3) { +Builder::Network::Network(const Context& ieContext, const INetwork &network): Network(ieContext, network.getName()) { for (const auto& layer : network) { - layers.push_back(Layer(layer)); + parameters["layers"].as>().push_back(std::make_shared(layer)); const auto layerConnections = network.getLayerConnections(layer->getId()); for (const auto& connection : layerConnections) { bool found = false; - for (const auto& con : connections) { + for (const auto& con : parameters["connections"].as>()) { if (con == connection) { found = true; break; } } if (!found) { - connections.push_back(connection); + parameters["connections"].as>().push_back(connection); } } } } -Builder::Network::Network(const Context& ieContext, const ICNNNetwork &network): ctx(ieContext), name(network.getName()), version(0) { +Builder::Network::Network(const Context& ieContext, const ICNNNetwork &network): Network(ieContext, 
network.getName()) { + parameters["version"] = 0; auto allInputs = CNNNetGetAllInputLayers(network); InputsDataMap inputs; network.getInputsInfo(inputs); @@ -66,7 +72,6 @@ Builder::Network::Network(const Context& ieContext, const ICNNNetwork &network): std::vector queueLayers; auto createGenericFromCNNLayer = [&](const CNNLayerPtr& cnnLayer) { - std::vector inputPorts; for (const auto& data : cnnLayer->insData) { auto lockedData = data.lock(); if (!lockedData) @@ -74,155 +79,49 @@ Builder::Network::Network(const Context& ieContext, const ICNNNetwork &network): if (dataPtrs.find(lockedData.get()) == dataPtrs.end()) { dataPtrs.insert(lockedData.get()); } - inputPorts.emplace_back(lockedData->getTensorDesc().getDims()); } - std::vector outputPorts; for (const auto& data : cnnLayer->outData) { if (dataPtrs.find(data.get()) == dataPtrs.end()) { dataPtrs.insert(data.get()); } - outputPorts.push_back(Port(data->getTensorDesc().getDims())); } - - std::map params; - for (const auto& it : cnnLayer->params) { - params[it.first] = it.second; + std::map blobs = cnnLayer->blobs; + size_t inputsCount(0); + for (const auto& data : cnnLayer->insData) { + auto lockedData = data.lock(); + if (!lockedData) + continue; + inputsCount++; } - const auto layer = Layer(cnnLayer->type, cnnLayer->name) - .setInputPorts(inputPorts).setOutputPorts(outputPorts) - .setParameters(params).setConstantData(cnnLayer->blobs); + const auto layer = builderFromCNNLayer(cnnLayer); idx_t layerId = addLayer(layer); + + if (blobs.find("weights") != blobs.end()) { + idx_t constLayerId = addLayer(ConstLayer("weights").setData(blobs["weights"])); + connect({constLayerId}, {layerId, inputsCount++}); + } + if (blobs.find("biases") != blobs.end()) { + if (blobs.find("weights") == blobs.end()) ++inputsCount; + + idx_t constLayerId = addLayer(ConstLayer("biases").setData(blobs["biases"])); + connect({constLayerId}, {layerId, inputsCount++}); + } + for (const auto& it : blobs) { + if (it.first == "weights" || it.first == "biases") + continue; + idx_t constLayerId = addLayer(ConstLayer(it.first).setData(it.second)); + connect({constLayerId}, {layerId, inputsCount++}); + } name2id[layer.getName()] = layerId; return layerId; }; auto addPreProcessFor = [&](const InputInfo::Ptr& inputInfo) { auto inputLayer = getLayer(name2id[inputInfo->name()]); - if (inputLayer.getType().empty() && inputLayer.getName().empty()) + if (inputLayer->getType().empty() && inputLayer->getName().empty()) return; - ResizeAlgorithm alg = inputInfo->getPreProcess().getResizeAlgorithm(); - std::string algStr; - switch (alg) { - case RESIZE_BILINEAR: - algStr = "RESIZE_BILINEAR"; - break; - case RESIZE_AREA: - algStr = "RESIZE_AREA"; - break; - default: - break; - } - - if (!algStr.empty()) - inputLayer.getParameters()["resize_alg"] = algStr; - - switch (inputInfo->getPreProcess().getMeanVariant()) { - case MEAN_IMAGE: { - auto meanWidth = inputInfo->getPreProcess()[0]->meanData->dims()[0]; - auto meanHeight = inputInfo->getPreProcess()[0]->meanData->dims()[1]; - - TensorDesc desc(Precision::FP32, inputLayer.getOutputPorts()[0].shape(), Layout::NCHW); - Blob::Ptr meanBuffer = make_blob_with_precision(desc); - meanBuffer->allocate(); - auto *meanData = meanBuffer->buffer().as(); - for (unsigned channel = 0; channel < inputInfo->getPreProcess().getNumberOfChannels(); channel++) { - Blob::Ptr meanBlob = inputInfo->getPreProcess()[channel]->meanData; - if (!meanBlob || meanBlob->precision() != Precision::FP32) - THROW_IE_EXCEPTION << "mean image not provided or not in Float 32"; 
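// (annotation, not part of the patch) The removed MEAN_IMAGE branch around
// this point packed one FP32 mean plane per channel into a single NCHW blob:
// channel c was copied to an offset of c * meanBlob->size() floats, and the
// per-channel planes were then replicated for every batch element, because
// reshape to a new batch size is not supported for models with a mean image.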
- if (meanBlob->size() != meanHeight*meanWidth) { - THROW_IE_EXCEPTION << "mean image size does not match expected network input, expecting " << meanWidth << " x " << meanHeight; - } - ie_memcpy(meanData + channel*meanBlob->size(), - meanBuffer->byteSize() - channel*meanBlob->size() * sizeof(float), - meanBlob->buffer(), - meanBlob->byteSize()); - } - - // WA for batch != 1 - // Reshape for new batch is not supported for models with mean image - size_t noBatchSize = desc.getBlockingDesc().getStrides()[0]; - for (size_t b = 1; b < inputLayer.getOutputPorts()[0].shape()[0]; b++) { - ie_memcpy(meanData + noBatchSize*b, - meanBuffer->byteSize() - noBatchSize * b * sizeof(float), - meanData, - noBatchSize * sizeof(float)); - } - - std::vector outPorts; - std::vector inputConnections = getLayerConnections(inputLayer.getId()); - for (const auto& connection : inputConnections) { - outPorts.push_back(connection.to()); - disconnect(connection); - } - - idx_t constId = addLayer(Builder::ConstLayer(inputLayer.getName() + "_mean_image") - .setPort(inputLayer.getOutputPorts()[0]).setData(meanBuffer)); - idx_t constNegId = addLayer({{constId}}, Builder::PowerLayer(inputLayer.getName() + "_mean_image_neg") - .setPort(inputLayer.getOutputPorts()[0]).setScale(-1)); - - idx_t eltwiseId = addLayer({{inputLayer.getId()}, {constNegId}}, - Builder::EltwiseLayer(inputLayer.getName() + "_mean_image_elt") - .setInputPorts({inputLayer.getOutputPorts()[0], inputLayer.getOutputPorts()[0]}) - .setOutputPort(inputLayer.getOutputPorts()[0]) - .setEltwiseType(Builder::EltwiseLayer::EltwiseType::SUM)); - - for (const auto& port : outPorts) { - connect({eltwiseId}, port); - } - } - break; - case MEAN_VALUE: { - TensorDesc desc(Precision::FP32, {inputInfo->getPreProcess().getNumberOfChannels()}, Layout::C); - Blob::Ptr mean = make_blob_with_precision(desc); - mean->allocate(); - Blob::Ptr scale = make_blob_with_precision(desc); - scale->allocate(); - Blob::Ptr emptyScale = make_blob_with_precision(desc); - emptyScale->allocate(); - auto *meanData = mean->buffer().as(); - auto *scaleData = scale->buffer().as(); - auto *emptyScaleData = emptyScale->buffer().as(); - bool noMean = true; - bool noScale = true; - for (size_t i = 0; i < inputInfo->getPreProcess().getNumberOfChannels(); i++) { - meanData[i] = -inputInfo->getPreProcess()[i]->meanValue; - noMean = noMean && (meanData[i] == 0); - scaleData[i] = inputInfo->getPreProcess()[i]->stdScale; - emptyScaleData[i] = 1; - noScale = noScale && (scaleData[i] == 1); - } - std::vector outPorts; - std::vector inputConnections = getLayerConnections(inputLayer.getId()); - for (const auto& connection : inputConnections) { - outPorts.push_back(connection.to()); - disconnect(connection); - } - - idx_t meanId = inputLayer.getId(); - if (!noMean) { - meanId = addLayer({{inputLayer.getId()}}, - Builder::ScaleShiftLayer(inputLayer.getName() + "_mean_value") - .setPort(inputLayer.getOutputPorts()[0]) - .setBiases(mean).setWeights(emptyScale)); - } - - idx_t scaleId = meanId; - if (!noScale) { - scaleId = addLayer({{meanId}}, - Builder::ScaleShiftLayer(inputLayer.getName() + "_scale_value") - .setPort(inputLayer.getOutputPorts()[0]) - .setWeights(scale)); - } - - for (const auto& port : outPorts) { - connect({scaleId}, port); - } - } - break; - default: - break; - } + inputLayer->getParameters()["preProcess"] = inputInfo->getPreProcess(); }; for (auto input : inputs) { @@ -300,10 +199,10 @@ Builder::Network::Network(const Context& ieContext, const ICNNNetwork &network): THROW_IE_EXCEPTION << 
"Cannot find output layer " << creator->name; auto lastLayer = getLayer(name2id[creator->name]); - if (lastLayer.getName() == "" && lastLayer.getType().empty()) + if (lastLayer->getName() == "" && lastLayer->getType().empty()) THROW_IE_EXCEPTION << "Cannot find output layer " << creator->name; - std::string name = "out_" + lastLayer.getName(); + std::string name = "out_" + lastLayer->getName(); CNNLayerPtr cnnOutLayer(new CNNLayer({name, "Output", creator->outData[0]->getPrecision()})); cnnOutLayer->insData.push_back((*it).second); @@ -318,7 +217,7 @@ Builder::Network::Network(const Context& ieContext, const ICNNNetwork &network): } } - connections.push_back(Connection({lastLayer.getId(), inIdx}, {outLayerId})); + parameters["connections"].as>().push_back(Connection({lastLayer->getId(), inIdx}, {outLayerId})); } for (const auto dataPtr : dataPtrs) { @@ -349,21 +248,21 @@ Builder::Network::Network(const Context& ieContext, const ICNNNetwork &network): break; } } - connections.push_back(Connection({name2id[cnnInputLayer->name], inIdx}, {name2id[it.second->name], outIdx})); + parameters["connections"].as>() + .push_back(Connection({name2id[cnnInputLayer->name], inIdx}, {name2id[it.second->name], outIdx})); } } - for (auto input : inputs) { + for (const auto &input : inputs) { addPreProcessFor(input.second); } } -std::vector& Builder::Network::getLayers() { - return layers; +const std::vector& Builder::Network::getLayers() const { + return parameters.at("layers").as>(); } - -const std::vector& Builder::Network::getLayers() const { - return layers; +std::vector& Builder::Network::getLayers() { + return parameters["layers"].as>(); } idx_t Builder::Network::addLayer(const std::vector &inputs, @@ -380,10 +279,11 @@ idx_t Builder::Network::addLayer(const Layer& layer) { if (defaultId == (std::numeric_limits::max)()) defaultId = 0; - auto it = layers.begin(); - while (it != layers.end()) { - for (it = layers.begin(); it != layers.end(); it++) { - if (it->getId() == defaultId) { + auto it = parameters["layers"].as>().begin(); + while (it != parameters["layers"].as>().end()) { + for (it = parameters["layers"].as>().begin(); + it != parameters["layers"].as>().end(); it++) { + if ((*it)->getId() == defaultId) { defaultId++; break; } @@ -399,8 +299,8 @@ idx_t Builder::Network::addLayer(const Layer& layer) { bool nameIsUnique(false); while (!nameIsUnique) { nameIsUnique = true; - for (const auto& layer : layers) { - if (generatedName == layer.getName()) { + for (const auto& layer : parameters["layers"].as>()) { + if (generatedName == layer->getName()) { nameIsUnique = false; generatedName += "_" + idName; } @@ -410,83 +310,131 @@ idx_t Builder::Network::addLayer(const Layer& layer) { }; idx_t generatedId = getAvailableId(layer.getId()); const auto name = generateAvailableName(layer.getName(), generatedId); - layers.emplace_back(generatedId, layer); - layers[layers.size() - 1].getName() = name; + parameters["layers"].as>().emplace_back(std::make_shared(generatedId, layer)); + parameters["layers"].as>()[parameters["layers"].as>().size() - 1]->setName(name); return generatedId; } void Builder::Network::connect(const PortInfo& input, const PortInfo& output) { - connections.emplace_back(input, output); + const auto mergePortData = [&]() -> bool { + const auto blobEqualOrEmpty = [](const Blob::Ptr& ref, const Blob::Ptr& test) -> bool { + return (ref->size() == test->size() || test->size() == 0) && + (!memcmp(ref->cbuffer(), test->cbuffer(), test->byteSize())) && + (ref->getTensorDesc().getPrecision() == 
test->getTensorDesc().getPrecision() || + test->getTensorDesc().getPrecision() == Precision::UNSPECIFIED) && + (ref->getTensorDesc().getLayout() == test->getTensorDesc().getLayout() || + test->getTensorDesc().getLayout() == Layout::ANY) && + (ref->getTensorDesc().getDims() == test->getTensorDesc().getDims() || + test->getTensorDesc().getDims().empty()) && + (ref->cbuffer().as() == test->cbuffer().as() || + test->cbuffer() == nullptr); + }; + + const auto srcPortData = getLayer(input.layerId())->getOutputPorts()[input.portId()].getData(); + const auto dstPortData = getLayer(output.layerId())->getInputPorts()[output.portId()].getData(); + if (srcPortData == dstPortData) + return true; + + if (srcPortData->getParameters() != dstPortData->getParameters() && + !srcPortData->getParameters().empty() && + !dstPortData->getParameters().empty()) + return false; + + size_t srcDataCount(0), dstDataCount(0); + if (!srcPortData->getParameters().empty()) srcDataCount++; + if (!dstPortData->getParameters().empty()) dstDataCount++; + + const auto srcBlb = srcPortData->getData(); + const auto dstBlb = dstPortData->getData(); + if (srcBlb == dstBlb || (srcBlb->size() == dstBlb->size() && + srcBlb->getTensorDesc() == dstBlb->getTensorDesc() && + ((srcBlb->cbuffer().as() == dstBlb->cbuffer().as()) || + (srcBlb->cbuffer() != nullptr && dstBlb->cbuffer() != nullptr && + !memcmp(srcBlb->cbuffer(), dstBlb->cbuffer(), dstBlb->byteSize()))))) { + srcDataCount++; + dstDataCount++; + } else if (blobEqualOrEmpty(srcBlb, dstBlb)) { + srcDataCount++; + } else if (blobEqualOrEmpty(dstBlb, srcBlb)) { + dstDataCount++; + } else { + return false; + } + + if (dstDataCount > srcDataCount) { + // Change source and all src destination data + for (const auto& connection : getLayerConnections(input.layerId())) { + if (connection.from() != input) + continue; + getLayer(connection.to().layerId())->getInputPorts()[connection.to().portId()].setData(dstPortData); + } + getLayer(input.layerId())->getOutputPorts()[input.portId()].setData(dstPortData); + } else { + // Change destination data + getLayer(output.layerId())->getInputPorts()[output.portId()].setData(srcPortData); + } + + return true; + }; + + if (!mergePortData()) + THROW_IE_EXCEPTION << "Cannot connect two ports with different data!"; + + parameters["connections"].as>().emplace_back(input, output); } void Builder::Network::removeLayer(idx_t layerId) { - auto it = layers.begin(); - for (; it != layers.end(); it++) { - if (it->getId() == layerId) { + auto it = parameters["layers"].as>().begin(); + for (; it != parameters["layers"].as>().end(); it++) { + if ((*it)->getId() == layerId) { break; } } - if (it != layers.end()) - layers.erase(it); + if (it != parameters["layers"].as>().end()) + parameters["layers"].as>().erase(it); } void Builder::Network::disconnect(const Connection& connection) { - auto it = connections.begin(); - for (; it != connections.end(); it++) { + auto it = parameters["connections"].as>().begin(); + for (; it != parameters["connections"].as>().end(); it++) { if (connection == *it) break; } - if (it != connections.end()) - connections.erase(it); -} + if (it != parameters["connections"].as>().end()) + parameters["connections"].as>().erase(it); -const INetwork::Ptr Builder::Network::build() const { - // Check that all ports are connected - for (const auto& layer : layers) { - std::vector existInCon(layer.getInputPorts().size()); - std::vector existOutCon(layer.getOutputPorts().size()); - - const auto layerConnections = getLayerConnections(layer.getId()); - for 
(const auto& connection : layerConnections) { - if (connection.from().layerId() == layer.getId()) { - existOutCon[connection.from().portId()] = true; - getLayer(connection.to().layerId()); - } - if (connection.to().layerId() == layer.getId()) { - existInCon[connection.to().portId()] = true; - getLayer(connection.from().layerId()); - } - } - bool allPortsConnected = true; - for (const auto& cons : {existInCon, existOutCon}) { - for (const auto &existCon : cons) { - allPortsConnected = allPortsConnected && existCon; - } - } - if (!allPortsConnected) - THROW_IE_EXCEPTION << "Not all ports of layer " << layer.getName() << " were connected!"; - } + try { + auto layer = getLayer(connection.to().layerId()); + layer->getInputPorts()[connection.to().portId()].setData(std::make_shared()); + } catch (InferenceEngine::details::InferenceEngineException& ex) {} +} - InferenceEngine::details::Network::Ptr network = std::make_shared(ctx, name); - for (const auto& layer : layers) { - network->addLayer(layer.build()); - } - for (const auto& connection : connections) { - network->addConnection(connection); - } +const INetwork::CPtr Builder::Network::build() { + validate(); + InferenceEngine::Builder::Network::Ptr network = + std::make_shared(static_cast(*this)); + return network; +} +void Builder::Network::validate() { // Check that all ports are connected - for (const auto& layer : *network) { + for (const auto& layer : getLayers()) { std::vector existInCon(layer->getInputPorts().size()); + for (size_t i = 0; i < layer->getInputPorts().size(); i++) { + if (layer->getInputPorts()[i].getParameters().find("type") != layer->getInputPorts()[i].getParameters().end()) + existInCon[i] = true; + } std::vector existOutCon(layer->getOutputPorts().size()); - const auto layerConnections = network->getLayerConnections(layer->getId()); + const auto layerConnections = getLayerConnections(layer->getId()); for (const auto& connection : layerConnections) { if (connection.from().layerId() == layer->getId()) { existOutCon[connection.from().portId()] = true; + getLayer(connection.to().layerId()); } if (connection.to().layerId() == layer->getId()) { existInCon[connection.to().portId()] = true; + getLayer(connection.from().layerId()); } } bool allPortsConnected = true; @@ -499,25 +447,32 @@ const INetwork::Ptr Builder::Network::build() const { THROW_IE_EXCEPTION << "Not all ports of layer " << layer->getName() << " were connected!"; } + // Check all layers + for (const auto& connection : getConnections()) { + if (!getLayer(connection.to().layerId())) + THROW_IE_EXCEPTION << "Cannot find layer with id: " << connection.to().layerId(); + if (!getLayer(connection.from().layerId())) + THROW_IE_EXCEPTION << "Cannot find layer with id: " << connection.from().layerId(); + } + std::map inputShapes; - for (const auto& input : network->getInputs()) + for (const auto& input : getInputs()) inputShapes[input->getName()] = input->getOutputPorts()[0].shape(); - if (version) { - details::BaseCreator::version_ = version; + if (parameters.at("version").as()) { + details::BaseCreator::version_ = parameters.at("version"); } - ShapeInfer::Reshaper reshaper(ctx, network); + ShapeInfer::Reshaper reshaper(this); ResponseDesc resp; StatusCode sts = reshaper.run(inputShapes, &resp); // Not all implementations may be registered if all shapes were read from IR. 
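// (editorial sketch, not part of the patch) The connectivity check in
// validate() above treats an input port carrying a "type" parameter (added by
// the layer builders for "weights"/"biases" inputs) as already satisfied, so
// only genuine data ports must have a Connection. Conceptually:
bool portSatisfied(const Port& port, bool hasConnection) {
    const auto& params = port.getParameters();
    return hasConnection || params.find("type") != params.end();
}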
if (sts == NOT_FOUND) { bool allShapesLooksGood = true; - for (const auto& connection : network->getConnections()) { - if (network->getLayer(connection.from().layerId())-> - getOutputPorts()[connection.from().portId()].shape() != - network->getLayer(connection.to().layerId())-> - getInputPorts()[connection.to().portId()].shape()) { + for (const auto& connection : getConnections()) { + if (getLayer(connection.from().layerId())->getOutputPorts()[connection.from().portId()].shape() != + getLayer(connection.to().layerId())->getInputPorts()[connection.to().portId()].shape() || + getLayer(connection.to().layerId())->getInputPorts()[connection.to().portId()].shape().empty()) { allShapesLooksGood = false; break; } @@ -529,30 +484,19 @@ const INetwork::Ptr Builder::Network::build() const { if (sts != OK) THROW_IE_EXCEPTION << resp.msg; - return std::static_pointer_cast(network); -} - -const std::shared_ptr Builder::convertToICNNNetwork(const INetwork::Ptr& network) { - std::unique_ptr cnnNetworkImpl(new details::CNNNetworkImpl()); - - Precision detectedPrecision = Precision::FP32; - for (const auto& layer : *network) { - const auto& params = layer->getParameters(); - if (!params) - continue; - Precision prc = Precision::UNSPECIFIED; - for (const auto& blobIterator : params->getConstantData()) { - if (blobIterator.second) { - prc = blobIterator.second->precision(); - break; - } - } - if (prc != Precision::UNSPECIFIED) { - detectedPrecision = prc; - break; + // Check all parameters + for (const auto& layer : getLayers()) { + try { + layer->build(); + } catch(InferenceEngine::details::InferenceEngineException& ex) { + THROW_IE_EXCEPTION << "Cannot build layer " << layer->getName() << ": " << ex.what(); + } catch(std::bad_cast& ex) { + THROW_IE_EXCEPTION << "Cannot build layer " << layer->getName() << ": " << ex.what(); } } +} +const std::shared_ptr Builder::convertToICNNNetwork(const INetwork::CPtr& network) { auto createCNNLayer = [](const std::shared_ptr& layer, Precision precision) { static std::vector> convertors = { std::make_shared>("Power"), @@ -578,7 +522,9 @@ const std::shared_ptr Builder::convertToICNNNetwork(const INetwork: std::make_shared>("Reshape"), std::make_shared>("Flatten"), std::make_shared>("Tile"), + std::make_shared>("Pad"), std::make_shared(), + std::make_shared(), std::make_shared>("BatchNormalization"), }; for (auto &convertor : convertors) { @@ -590,11 +536,65 @@ const std::shared_ptr Builder::convertToICNNNetwork(const INetwork: return genericCreator.createLayer(layer, precision); }; + auto keep_input_info = [](std::unique_ptr& network, DataPtr &in_data, + PreProcessInfo preProc) { + InputInfo::Ptr info(new InputInfo()); + info->getPreProcess() = preProc; + info->setInputData(in_data); + Precision prc = info->getInputPrecision(); + + // Convert precision into native format (keep element size) + prc = prc == Precision::Q78 ? Precision::I16 : + prc == Precision::FP16 ? 
Precision::FP32 : + static_cast(prc); + + info->setInputPrecision(prc); + network->setInputInfo(info); + }; + + std::unique_ptr cnnNetworkImpl(new details::CNNNetworkImpl()); + + Precision detectedPrecision = Precision::UNSPECIFIED; + for (const auto& layer : *network) { + for (const auto& port : layer->getInputPorts()) { + Precision prc = port.getData()->getData()->getTensorDesc().getPrecision(); + if (prc != Precision::UNSPECIFIED) { + detectedPrecision = prc; + break; + } + } + for (const auto& port : layer->getOutputPorts()) { + Precision prc = port.getData()->getData()->getTensorDesc().getPrecision(); + if (prc != Precision::UNSPECIFIED) { + detectedPrecision = prc; + break; + } + } + if (detectedPrecision != Precision::UNSPECIFIED) + break; + } + if (detectedPrecision == Precision::UNSPECIFIED) + detectedPrecision = Precision::FP32; + + details::CaselessEq eq; cnnNetworkImpl->setName(network->getName()); cnnNetworkImpl->setPrecision(Precision::UNSPECIFIED); for (const auto& layer : *network) { - if (details::CaselessEq()(layer->getType(), "Output")) + bool isInternalLayer = eq(layer->getType(), "Const"); + for (const auto& connection : network->getLayerConnections(layer->getId())) { + if (!isInternalLayer) + break; + if (connection.from().layerId() != layer->getId()) + continue; + const auto& port = network->getLayer(connection.to().layerId())->getInputPorts()[connection.to().portId()]; + isInternalLayer = isInternalLayer && + port.getParameters().find("type") != port.getParameters().end(); + } + isInternalLayer = isInternalLayer || eq(layer->getType(), "Output"); + + if (isInternalLayer) continue; + CNNLayerPtr cnnLayer = createCNNLayer(layer, detectedPrecision); if (cnnNetworkImpl->getPrecision() == Precision::UNSPECIFIED) { cnnNetworkImpl->setPrecision(cnnLayer->precision); @@ -606,10 +606,13 @@ const std::shared_ptr Builder::convertToICNNNetwork(const INetwork: auto connections = network->getLayerConnections(layer->getId()); std::unordered_set inputNum, outputNum; for (const auto& connection : connections) { - if (connection.from().layerId() != layer->getId()) - inputNum.insert(connection.to().portId()); - else + if (connection.from().layerId() != layer->getId()) { + const auto& port = layer->getInputPorts()[connection.to().portId()]; + if (port.getParameters().find("type") == port.getParameters().end()) + inputNum.insert(connection.to().portId()); + } else { outputNum.insert(connection.from().portId()); + } } cnnLayer->insData.resize(inputNum.size()); cnnLayer->outData.resize(outputNum.size()); @@ -620,8 +623,8 @@ const std::shared_ptr Builder::convertToICNNNetwork(const INetwork: auto connections = network->getLayerConnections(layer->getId()); CNNLayerPtr cnnLayer; StatusCode sts = cnnNetworkImpl->getLayerByName(layer->getName().c_str(), cnnLayer, nullptr); - details::CaselessEq eq; - if (sts != OK && eq(layer->getType(), "Output")) + + if (sts != OK && (eq(layer->getType(), "Output") || eq(layer->getType(), "Const"))) continue; else if (sts != OK) THROW_IE_EXCEPTION << "Cannot find CNNLayer by name " << layer->getName(); @@ -634,24 +637,31 @@ const std::shared_ptr Builder::convertToICNNNetwork(const INetwork: CNNLayerPtr cnnOutLayer; sts = cnnNetworkImpl->getLayerByName(outLayer->getName().c_str(), cnnOutLayer, nullptr); - if (sts != OK && !eq(outLayer->getType(), "Output")) + if (sts != OK && !eq(outLayer->getType(), "Output") && !eq(layer->getType(), "Const")) THROW_IE_EXCEPTION << "Cannot find CNNLayer by name " << outLayer->getName(); std::string dataName = 
layer->getName(); if (cnnLayer->outData.size() > 1) { - dataName += "_" + std::to_string(connection.from().portId()); + dataName += "." + std::to_string(connection.from().portId()); } DataPtr& data = cnnNetworkImpl->getData(dataName); if (!data) { TensorDesc dataDesc(detectedPrecision, layer->getOutputPorts()[connection.from().portId()].shape(), TensorDesc::getLayoutByDims(layer->getOutputPorts()[connection.from().portId()].shape())); - data = std::make_shared(layer->getName(), dataDesc); + data = std::make_shared(dataName, dataDesc); data->creatorLayer = cnnLayer; } cnnLayer->outData[connection.from().portId()] = data; + + idx_t realPortId(0); + const auto inputPorts = outLayer->getInputPorts(); + for (size_t i = 0; i < connection.to().portId() && i < inputPorts.size(); i++) { + if (inputPorts[i].getParameters().find("type") == inputPorts[i].getParameters().end()) + realPortId++; + } if (cnnOutLayer) { data->inputTo[outLayer->getName()] = cnnOutLayer; - cnnOutLayer->insData[connection.to().portId()] = data; + cnnOutLayer->insData[realPortId] = data; } else { cnnNetworkImpl->addOutput(data->getName()); } @@ -659,38 +669,161 @@ const std::shared_ptr Builder::convertToICNNNetwork(const INetwork: cnnLayer->validateLayer(); if (eq(cnnLayer->type, "Input")) { - InputInfo::Ptr inputInfo(new InputInfo()); - inputInfo->setInputData(*cnnLayer->outData.begin()); - cnnNetworkImpl->setInputInfo(inputInfo); + PreProcessInfo preProc; + if (layer->getParameters().find("preProcess") != layer->getParameters().end()) + preProc = layer->getParameters().at("preProcess"); + keep_input_info(cnnNetworkImpl, *cnnLayer->outData.begin(), preProc); + } + } + + // Set default output precision to FP32 (for back-compatibility) + OutputsDataMap outputsInfo; + cnnNetworkImpl->getOutputsInfo(outputsInfo); + for (auto outputInfo : outputsInfo) { + if (outputInfo.second->getPrecision() != Precision::FP32 && + outputInfo.second->getPrecision() != Precision::I32) { + outputInfo.second->setPrecision(Precision::FP32); } } return std::shared_ptr(cnnNetworkImpl.release()); } -Builder::Network::operator const INetwork::Ptr() const { +Builder::Network::operator const INetwork::CPtr() { return build(); } -const Builder::Layer &Builder::Network::getLayer(idx_t layerId) const { +const ILayer::CPtr Builder::Network::getLayer(idx_t layerId) const noexcept { + try { + for (auto& layer : getLayers()) { + if (layer->getId() == layerId) + return layer->build(); + } + } catch(...) {} + + return nullptr; +} + +Builder::Layer::Ptr Builder::Network::getLayer(idx_t layerId) { for (auto& layer : getLayers()) { - if (layer.getId() == layerId) + if (layer->getId() == layerId) return layer; } THROW_IE_EXCEPTION << "Cannot find layer with id: " << layerId; } -Builder::Layer &Builder::Network::getLayer(idx_t layerId) { - for (auto& layer : getLayers()) { - if (layer.getId() == layerId) - return layer; +const std::string& Builder::Network::getName() const noexcept { + return parameters.at("name"); +} + +const Context& Builder::Network::getContext() const noexcept { + return parameters.at("context"); +} + +Context& Builder::Network::getContext() noexcept { + return parameters.at("context"); +} + +Builder::Network::const_iterator Builder::Network::begin() const noexcept { + try { + return Network::const_iterator(this); + } catch (...) 
{ + return Network::const_iterator(this, true); } - THROW_IE_EXCEPTION << "Cannot find layer with id: " << layerId; +} + + +Builder::Network::const_iterator Builder::Network::end() const noexcept { + return Network::const_iterator(this, true); +} + +size_t Builder::Network::size() const noexcept { + return static_cast(std::distance(std::begin(*this), std::end(*this))); +} + +Builder::Network::iterator Builder::Network::begin() { + return Network::iterator(this); +} + +Builder::Network::iterator Builder::Network::end() { + return Network::iterator(this, true); +} + +const std::vector Builder::Network::getInputs() const noexcept { + std::vector inputs; + for (const auto& layer : parameters.at("layers").as>()) { + bool isInputLayer = true; + for (const auto& connection : getLayerConnections(layer->getId())) { + if (connection.to().layerId() == layer->getId()) { + isInputLayer = false; + break; + } + } + if (isInputLayer) { + inputs.push_back(layer->build()); + } + } + return inputs; +} + +std::vector Builder::Network::getInputs() { + std::vector inputs; + for (auto& layer : parameters.at("layers").as>()) { + bool isInputLayer = true; + for (const auto& connection : getLayerConnections(layer->getId())) { + if (connection.to().layerId() == layer->getId()) { + isInputLayer = false; + break; + } + } + if (isInputLayer) { + inputs.push_back(layer); + } + } + return inputs; +} + +const std::vector Builder::Network::getOutputs() const noexcept { + std::vector outputs; + for (const auto& layer : parameters.at("layers").as>()) { + bool isOutputLayer = true; + for (const auto& connection : getLayerConnections(layer->getId())) { + if (connection.from().layerId() == layer->getId()) { + isOutputLayer = false; + break; + } + } + if (isOutputLayer) { + outputs.push_back(layer->build()); + } + } + return outputs; +} + +std::vector Builder::Network::getOutputs() { + std::vector outputs; + for (auto& layer : parameters.at("layers").as>()) { + bool isOutputLayer = true; + for (const auto& connection : getLayerConnections(layer->getId())) { + if (connection.from().layerId() == layer->getId()) { + isOutputLayer = false; + break; + } + } + if (isOutputLayer) { + outputs.push_back(layer); + } + } + return outputs; +} + +const std::vector& Builder::Network::getConnections() const { + return parameters.at("connections").as>(); } const std::vector Builder::Network::getLayerConnections(idx_t layerId) const noexcept { std::vector layerConnections; - for (const auto connection : connections) { + for (const auto connection : parameters.at("connections").as>()) { if (connection.from().layerId() == layerId || connection.to().layerId() == layerId) layerConnections.push_back(connection); } diff --git a/inference-engine/src/inference_engine/builders/ie_norm_layer.cpp b/inference-engine/src/inference_engine/builders/ie_norm_layer.cpp index cb6d47b..16a2b2d 100644 --- a/inference-engine/src/inference_engine/builders/ie_norm_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_norm_layer.cpp @@ -1,77 +1,80 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include
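// (editorial sketch, not part of the patch) getInputs()/getOutputs() above
// classify layers purely by edge direction: a layer is an input if no
// connection points to it, and an output if none starts from it. Compactly:
bool isInputLayer(idx_t id, const std::vector<Connection>& connections) {
    for (const auto& c : connections)
        if (c.to().layerId() == id)
            return false;  // has an incoming edge, so not a network input
    return true;
}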
+#include #include using namespace InferenceEngine; -Builder::NormLayer::NormLayer(const std::string& name): LayerFragment("Norm", name) { - getLayer().getOutputPorts().resize(1); - getLayer().getInputPorts().resize(1); +Builder::NormLayer::NormLayer(const std::string& name): LayerDecorator("Norm", name) { + getLayer()->getOutputPorts().resize(1); + getLayer()->getInputPorts().resize(1); setAcrossMaps(false); setSize(0); setAlpha(0); setBeta(0); } -Builder::NormLayer::NormLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "Norm")) - THROW_IE_EXCEPTION << "Cannot create NormLayer decorator for layer " << getLayer().getType(); +Builder::NormLayer::NormLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("Norm"); +} + +Builder::NormLayer::NormLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("Norm"); } Builder::NormLayer& Builder::NormLayer::setName(const std::string& name) { - getLayer().getName() = name; + getLayer()->setName(name); return *this; } const Port& Builder::NormLayer::getPort() const { - return getLayer().getOutputPorts()[0]; + return getLayer()->getOutputPorts()[0]; } Builder::NormLayer& Builder::NormLayer::setPort(const Port &port) { - getLayer().getOutputPorts()[0] = port; - getLayer().getInputPorts()[0] = port; + getLayer()->getOutputPorts()[0] = port; + getLayer()->getInputPorts()[0] = port; return *this; } size_t Builder::NormLayer::getSize() const { - return getLayer().getParameters()["local-size"].asUInt(); + return getLayer()->getParameters().at("local-size"); } Builder::NormLayer& Builder::NormLayer::setSize(size_t size) { - getLayer().getParameters()["local-size"] = size; + getLayer()->getParameters()["local-size"] = size; return *this; } float Builder::NormLayer::getAlpha() const { - return getLayer().getParameters()["alpha"].asFloat(); + return getLayer()->getParameters().at("alpha"); } Builder::NormLayer& Builder::NormLayer::setAlpha(float alpha) { - getLayer().getParameters()["alpha"] = alpha; + getLayer()->getParameters()["alpha"] = alpha; return *this; } float Builder::NormLayer::getBeta() const { - return getLayer().getParameters()["beta"].asFloat(); + return getLayer()->getParameters().at("beta"); } Builder::NormLayer& Builder::NormLayer::setBeta(float beta) { - getLayer().getParameters()["beta"] = beta; + getLayer()->getParameters()["beta"] = beta; return *this; } bool Builder::NormLayer::getAcrossMaps() const { - return getLayer().getParameters()["region"].asString() == "across"; + return getLayer()->getParameters().at("region").as() == "across"; } Builder::NormLayer& Builder::NormLayer::setAcrossMaps(bool acrossMap) { std::string value = acrossMap ? 
"across" : "same"; - getLayer().getParameters()["region"] = value; + getLayer()->getParameters()["region"] = value; return *this; } @@ -83,3 +86,29 @@ Builder::NormLayer& Builder::NormLayer::setRegion(Builder::NormLayer::NormType t setAcrossMaps(type); return *this; } + +REG_VALIDATOR_FOR(Norm, [] (const InferenceEngine::Builder::Layer::CPtr& input_layer, bool partial) { + Builder::NormLayer layer(input_layer); + if (layer.getAlpha() <= 0) { + THROW_IE_EXCEPTION << "Alpha should be > 0"; + } + if (layer.getBeta() <= 0) { + THROW_IE_EXCEPTION << "Beta should be > 0"; + } + if (layer.getSize() == 0) { + THROW_IE_EXCEPTION << "Size should be > 0"; + } + if (!input_layer->getInputPorts().empty() && + !input_layer->getOutputPorts().empty() && + !input_layer->getInputPorts()[0].shape().empty() && + !input_layer->getOutputPorts()[0].shape().empty() && + input_layer->getInputPorts()[0].shape() != input_layer->getOutputPorts()[0].shape()) { + THROW_IE_EXCEPTION << "Input and output ports should be equal"; + } +}); + +REG_CONVERTER_FOR(Norm, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["local-size"] = (size_t)cnnLayer->GetParamAsUInt("local-size", 0); + layer.getParameters()["alpha"] = cnnLayer->GetParamAsFloat("alpha", 0); + layer.getParameters()["beta"] = cnnLayer->GetParamAsFloat("beta", 0); +}); diff --git a/inference-engine/src/inference_engine/builders/ie_normalize_layer.cpp b/inference-engine/src/inference_engine/builders/ie_normalize_layer.cpp index 699993f..faa54dc 100644 --- a/inference-engine/src/inference_engine/builders/ie_normalize_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_normalize_layer.cpp @@ -1,65 +1,89 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include
+#include #include using namespace InferenceEngine; -Builder::NormalizeLayer::NormalizeLayer(const std::string& name): LayerFragment("Normalize", name) { - getLayer().getOutputPorts().resize(1); - getLayer().getInputPorts().resize(1); +Builder::NormalizeLayer::NormalizeLayer(const std::string& name): LayerDecorator("Normalize", name) { + getLayer()->getOutputPorts().resize(1); + getLayer()->getInputPorts().resize(1); setAcrossMaps(false); setChannelShared(false); setEpsilon(0.0000001f); } -Builder::NormalizeLayer::NormalizeLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "Normalize")) - THROW_IE_EXCEPTION << "Cannot create NormalizeLayer decorator for layer " << getLayer().getType(); +Builder::NormalizeLayer::NormalizeLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("Normalize"); +} + +Builder::NormalizeLayer::NormalizeLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("Normalize"); } Builder::NormalizeLayer& Builder::NormalizeLayer::setName(const std::string& name) { - getLayer().getName() = name; + getLayer()->setName(name); return *this; } const Port& Builder::NormalizeLayer::getPort() const { - return getLayer().getOutputPorts()[0]; + return getLayer()->getOutputPorts()[0]; } Builder::NormalizeLayer& Builder::NormalizeLayer::setPort(const Port &port) { - getLayer().getOutputPorts()[0] = port; - getLayer().getInputPorts()[0] = port; + getLayer()->getOutputPorts()[0] = port; + getLayer()->getInputPorts()[0] = port; return *this; } bool Builder::NormalizeLayer::getAcrossMaps() const { - return getLayer().getParameters()["region"].asBool(); + return getLayer()->getParameters().at("region"); } Builder::NormalizeLayer& Builder::NormalizeLayer::setAcrossMaps(bool acrossMap) { - getLayer().getParameters()["region"] = acrossMap ? 1 : 0; + getLayer()->getParameters()["region"] = acrossMap ? 1 : 0; return *this; } bool Builder::NormalizeLayer::getChannelShared() const { - return getLayer().getParameters()["channel_shared"].asBool(); + return getLayer()->getParameters().at("channel_shared"); } Builder::NormalizeLayer& Builder::NormalizeLayer::setChannelShared(bool channelShared) { - getLayer().getParameters()["channel_shared"] = channelShared ? 1 : 0; + getLayer()->getParameters()["channel_shared"] = channelShared ? 
1 : 0; return *this; } float Builder::NormalizeLayer::getEpsilon() const { - return getLayer().getParameters()["eps"].asFloat(); + return getLayer()->getParameters().at("eps"); } Builder::NormalizeLayer& Builder::NormalizeLayer::setEpsilon(float eps) { - getLayer().getParameters()["eps"] = eps; + getLayer()->getParameters()["eps"] = eps; return *this; } + +REG_VALIDATOR_FOR(Normalize, [] (const InferenceEngine::Builder::Layer::CPtr& input_layer, bool partial) { + Builder::NormalizeLayer layer(input_layer); + if (layer.getEpsilon() <= 0) { + THROW_IE_EXCEPTION << "Epsilon should be > 0"; + } + if (!input_layer->getInputPorts().empty() && + !input_layer->getOutputPorts().empty() && + !input_layer->getInputPorts()[0].shape().empty() && + !input_layer->getOutputPorts()[0].shape().empty() && + input_layer->getInputPorts()[0].shape() != input_layer->getOutputPorts()[0].shape()) { + THROW_IE_EXCEPTION << "Input and output ports should be equal"; + } +}); + +REG_CONVERTER_FOR(Normalize, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["region"] = cnnLayer->GetParamsAsBool("region", 0); + layer.getParameters()["channel_shared"] = cnnLayer->GetParamsAsBool("channel_shared", 0); + layer.getParameters()["eps"] = cnnLayer->GetParamAsFloat("eps", 0); +}); + diff --git a/inference-engine/src/inference_engine/builders/ie_output_layer_layer.cpp b/inference-engine/src/inference_engine/builders/ie_output_layer_layer.cpp index 88dfcf1..9bca83a 100644 --- a/inference-engine/src/inference_engine/builders/ie_output_layer_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_output_layer_layer.cpp @@ -1,33 +1,37 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include
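// (editorial sketch, not part of the patch) Each REG_CONVERTER_FOR above maps
// IR attributes from a CNNLayer into the builder's Parameter map, normalizing
// types up front (flags via GetParamsAsBool, floats via GetParamAsFloat). A
// minimal hypothetical converter for a layer with one float attribute:
REG_CONVERTER_FOR(MyScale, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
    layer.getParameters()["scale"] = cnnLayer->GetParamAsFloat("scale", 1.0f);
});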
diff --git a/inference-engine/src/inference_engine/builders/ie_output_layer_layer.cpp b/inference-engine/src/inference_engine/builders/ie_output_layer_layer.cpp
index 88dfcf1..9bca83a 100644
--- a/inference-engine/src/inference_engine/builders/ie_output_layer_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_output_layer_layer.cpp
@@ -1,33 +1,37 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_output_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
 
 #include <string>
 
 using namespace InferenceEngine;
 
-Builder::OutputLayer::OutputLayer(const std::string& name): LayerFragment("Output", name) {
-    getLayer().getInputPorts().resize(1);
+Builder::OutputLayer::OutputLayer(const std::string& name): LayerDecorator("Output", name) {
+    getLayer()->getInputPorts().resize(1);
 }
 
-Builder::OutputLayer::OutputLayer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Output"))
-        THROW_IE_EXCEPTION << "Cannot create OutputLayer decorator for layer " << getLayer().getType();
+Builder::OutputLayer::OutputLayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("Output");
+}
+
+Builder::OutputLayer::OutputLayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("Output");
 }
 
 Builder::OutputLayer& Builder::OutputLayer::setName(const std::string& name) {
-    getLayer().getName() = name;
+    getLayer()->setName(name);
     return *this;
 }
 
 const Port& Builder::OutputLayer::getPort() const {
-    return getLayer().getInputPorts()[0];
+    return getLayer()->getInputPorts()[0];
 }
 
 Builder::OutputLayer& Builder::OutputLayer::setPort(const Port &port) {
-    getLayer().getInputPorts()[0] = port;
+    getLayer()->getInputPorts()[0] = port;
     return *this;
 }
+
+REG_VALIDATOR_FOR(Output, [] (const InferenceEngine::Builder::Layer::CPtr& input_layer, bool partial) {});
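
The REG_VALIDATOR_FOR and REG_CONVERTER_FOR macros used throughout these files come from ie_cnn_layer_builder.h. The following is an illustrative sketch only, under the assumption of a simple static registry keyed by layer type; the real macros may differ in detail.

#include <functional>
#include <map>
#include <memory>
#include <string>

namespace sketch {

struct Layer {};  // stand-in for Builder::Layer
using Validator = std::function<void(const std::shared_ptr<const Layer>&, bool)>;

// Function-local static, so the registry exists before any registrar runs.
inline std::map<std::string, Validator>& validators() {
    static std::map<std::string, Validator> registry;
    return registry;
}

// A file-scope object whose constructor runs during static initialization
// and stores the callback under the layer-type key.
struct ValidatorRegistrar {
    ValidatorRegistrar(const std::string& type, Validator v) {
        validators()[type] = std::move(v);
    }
};

}  // namespace sketch

#define SKETCH_REG_VALIDATOR_FOR(TYPE, LAMBDA) \
    static sketch::ValidatorRegistrar registrar_##TYPE(#TYPE, LAMBDA)
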
diff --git a/inference-engine/src/inference_engine/builders/ie_permute_layer.cpp b/inference-engine/src/inference_engine/builders/ie_permute_layer.cpp
index 2cfa879..65df2c5 100644
--- a/inference-engine/src/inference_engine/builders/ie_permute_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_permute_layer.cpp
@@ -1,52 +1,63 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_permute_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
 
 #include <string>
 #include <vector>
 
 using namespace InferenceEngine;
 
-Builder::PermuteLayer::PermuteLayer(const std::string& name): LayerFragment("Permute", name) {
-    getLayer().getOutputPorts().resize(1);
-    getLayer().getInputPorts().resize(1);
+Builder::PermuteLayer::PermuteLayer(const std::string& name): LayerDecorator("Permute", name) {
+    getLayer()->getOutputPorts().resize(1);
+    getLayer()->getInputPorts().resize(1);
 }
 
-Builder::PermuteLayer::PermuteLayer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Permute"))
-        THROW_IE_EXCEPTION << "Cannot create PermuteLayer decorator for layer " << getLayer().getType();
+Builder::PermuteLayer::PermuteLayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("Permute");
+}
+
+Builder::PermuteLayer::PermuteLayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("Permute");
 }
 
 Builder::PermuteLayer& Builder::PermuteLayer::setName(const std::string& name) {
-    getLayer().getName() = name;
+    getLayer()->setName(name);
     return *this;
 }
 
 const Port& Builder::PermuteLayer::getOutputPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getOutputPorts()[0];
 }
 
 Builder::PermuteLayer& Builder::PermuteLayer::setOutputPort(const Port &port) {
-    getLayer().getOutputPorts()[0] = port;
+    getLayer()->getOutputPorts()[0] = port;
     return *this;
 }
 
 const Port& Builder::PermuteLayer::getInputPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getInputPorts()[0];
 }
 
 Builder::PermuteLayer& Builder::PermuteLayer::setInputPort(const Port &port) {
-    getLayer().getOutputPorts()[0] = port;
+    getLayer()->getInputPorts()[0] = port;
     return *this;
 }
 
 const std::vector<size_t> Builder::PermuteLayer::getOrder() const {
-    return uInts2size_t(getLayer().getParameters()["order"].asUInts());
+    return getLayer()->getParameters().at("order");
 }
 Builder::PermuteLayer& Builder::PermuteLayer::setOrder(const std::vector<size_t>& ratios) {
-    getLayer().getParameters()["order"] = ratios;
+    getLayer()->getParameters()["order"] = ratios;
     return *this;
 }
+
+REG_CONVERTER_FOR(Permute, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    std::vector<unsigned int> tmp = cnnLayer->GetParamAsUInts("order");
+    layer.getParameters()["order"] = std::vector<size_t>(tmp.size());
+    for (size_t i = 0; i < tmp.size(); ++i) {
+        layer.getParameters()["order"].as<std::vector<size_t>>()[i] = static_cast<size_t>(tmp[i]);
+    }
+});
\ No newline at end of file
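
The element-wise loop in the Permute converter above (and repeated in the Pooling converter below) exists because GetParamAsUInts returns std::vector<unsigned int> while the builder parameters store std::vector<size_t>. A hypothetical helper that factors out the pattern:

#include <cstddef>
#include <vector>

// Widening copy from unsigned int to size_t; never lossy on common platforms.
static std::vector<size_t> toSizeT(const std::vector<unsigned int>& in) {
    std::vector<size_t> out(in.size());
    for (size_t i = 0; i < in.size(); ++i)
        out[i] = static_cast<size_t>(in[i]);
    return out;
}
// e.g. layer.getParameters()["order"] = toSizeT(cnnLayer->GetParamAsUInts("order"));
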
diff --git a/inference-engine/src/inference_engine/builders/ie_pooling_layer.cpp b/inference-engine/src/inference_engine/builders/ie_pooling_layer.cpp
index 41db6c8..67bbcc5 100644
--- a/inference-engine/src/inference_engine/builders/ie_pooling_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_pooling_layer.cpp
@@ -1,42 +1,63 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_pooling_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
 
 #include <string>
 #include <vector>
 
 using namespace InferenceEngine;
 
-Builder::PoolingLayer::PoolingLayer(const std::string& name): LayerFragment("Pooling", name) {
-    getLayer().getInputPorts().resize(1);
-    getLayer().getOutputPorts().resize(1);
+Builder::PoolingLayer::PoolingLayer(const std::string& name): LayerDecorator("Pooling", name) {
+    getLayer()->getInputPorts().resize(1);
+    getLayer()->getOutputPorts().resize(1);
+    setKernel({});
+    setStrides({});
+    setPaddingsEnd({});
+    setPaddingsBegin({});
     setExcludePad(false);
     setPoolingType(PoolingType::MAX);
     setRoundingType(RoundingType::CEIL);
 }
 
-Builder::PoolingLayer::PoolingLayer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Pooling"))
-        THROW_IE_EXCEPTION << "Cannot create PoolingLayer decorator for layer " << getLayer().getType();
+Builder::PoolingLayer::PoolingLayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("Pooling");
 
-    std::string typeStr = getLayer().getParameters()["pool-method"].asString("max");
+    std::string typeStr = getLayer()->getParameters()["pool-method"];
     if (typeStr == "max")
         type = MAX;
     else if (typeStr == "avg")
         type = AVG;
 
-    typeStr = getLayer().getParameters()["rounding_type"].asString("ceil");
-    if (typeStr == "ceil")
+    std::string roundTypeStr = getLayer()->getParameters()["rounding_type"];
+    if (roundTypeStr == "ceil")
         roundingType = CEIL;
+    else if (roundTypeStr == "floor")
+        roundingType = FLOOR;
+}
+
+Builder::PoolingLayer::PoolingLayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("Pooling");
+
+    const auto cLayer = static_cast<const PoolingLayer*>(this)->getLayer();
+
+    std::string typeStr = cLayer->getParameters().at("pool-method");
+    if (typeStr == "max")
+        type = MAX;
     else if (typeStr == "avg")
+        type = AVG;
+
+    std::string roundTypeStr = cLayer->getParameters().at("rounding_type");
+    if (roundTypeStr == "ceil")
+        roundingType = CEIL;
+    else if (roundTypeStr == "floor")
         roundingType = FLOOR;
 }
 
 Builder::PoolingLayer::operator Builder::Layer() const {
-    Layer genLayer(getLayer());
+    Layer genLayer(*getLayer());
 
     std::vector<size_t> l_kernel = getKernel();
     std::vector<size_t> l_paddingBegin = getPaddingsBegin();
@@ -61,57 +82,57 @@ Builder::PoolingLayer::operator Builder::Layer() const {
 }
 
 Builder::PoolingLayer &Builder::PoolingLayer::setName(const std::string &name) {
-    getLayer().getName() = name;
+    getLayer()->setName(name);
     return *this;
 }
 
 const Port& Builder::PoolingLayer::getInputPort() const {
-    return getLayer().getInputPorts()[0];
+    return getLayer()->getInputPorts()[0];
 }
 
 Builder::PoolingLayer& Builder::PoolingLayer::setInputPort(const Port& port) {
-    getLayer().getInputPorts()[0] = port;
+    getLayer()->getInputPorts()[0] = port;
     return *this;
 }
 
 const Port& Builder::PoolingLayer::getOutputPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getOutputPorts()[0];
 }
 
 Builder::PoolingLayer& Builder::PoolingLayer::setOutputPort(const Port& port) {
-    getLayer().getOutputPorts()[0] = port;
+    getLayer()->getOutputPorts()[0] = port;
     return *this;
 }
 
 const std::vector<size_t> Builder::PoolingLayer::getKernel() const {
-    return uInts2size_t(getLayer().getParameters()["kernel"].asUInts({}));
+    return getLayer()->getParameters().at("kernel");
 }
 Builder::PoolingLayer& Builder::PoolingLayer::setKernel(const std::vector<size_t>& kernel) {
-    getLayer().getParameters()["kernel"] = kernel;
+    getLayer()->getParameters()["kernel"] = kernel;
     return *this;
 }
 
 const std::vector<size_t> Builder::PoolingLayer::getStrides() const {
-    return uInts2size_t(getLayer().getParameters()["strides"].asUInts({}));
+    return getLayer()->getParameters().at("strides");
 }
 Builder::PoolingLayer& Builder::PoolingLayer::setStrides(const std::vector<size_t>& strides) {
-    getLayer().getParameters()["strides"] = strides;
+    getLayer()->getParameters()["strides"] = strides;
     return *this;
 }
 
 const std::vector<size_t> Builder::PoolingLayer::getPaddingsBegin() const {
-    return uInts2size_t(getLayer().getParameters()["pads_begin"].asUInts({}));
+    return getLayer()->getParameters().at("pads_begin");
 }
 Builder::PoolingLayer& Builder::PoolingLayer::setPaddingsBegin(const std::vector<size_t>& paddings) {
-    getLayer().getParameters()["pads_begin"] = paddings;
+    getLayer()->getParameters()["pads_begin"] = paddings;
     return *this;
 }
 
 const std::vector<size_t> Builder::PoolingLayer::getPaddingsEnd() const {
-    return uInts2size_t(getLayer().getParameters()["pads_end"].asUInts({}));
+    return getLayer()->getParameters().at("pads_end");
 }
 Builder::PoolingLayer& Builder::PoolingLayer::setPaddingsEnd(const std::vector<size_t>& paddings) {
-    getLayer().getParameters()["pads_end"] = paddings;
+    getLayer()->getParameters()["pads_end"] = paddings;
     return *this;
 }
 
@@ -119,7 +140,6 @@ Builder::PoolingLayer::PoolingType Builder::PoolingLayer::getPoolingType() const
     return type;
 }
 Builder::PoolingLayer& Builder::PoolingLayer::setPoolingType(Builder::PoolingLayer::PoolingType type) {
-    this->type = type;
     std::string typeStr;
     switch (type) {
     case MAX:
@@ -129,7 +149,8 @@ Builder::PoolingLayer& Builder::PoolingLayer::setPoolingType(Builder::PoolingLay
         typeStr = "avg";
         break;
     }
-    getLayer().getParameters()["pool-method"] = typeStr;
+    getLayer()->getParameters()["pool-method"] = typeStr;
+    this->type = type;
     return *this;
 }
 
@@ -147,28 +168,27 @@ Builder::PoolingLayer& Builder::PoolingLayer::setRoundingType(Builder::PoolingLa
         typeStr = "floor";
         break;
     }
-    getLayer().getParameters()["rounding_type"] = typeStr;
+    getLayer()->getParameters()["rounding_type"] = typeStr;
     return *this;
 }
 
 bool Builder::PoolingLayer::getExcludePad() const {
-    return getLayer().getParameters()["exclude-pad"].asBool();
+    return getLayer()->getParameters().at("exclude-pad");
 }
 
 Builder::PoolingLayer& Builder::PoolingLayer::setExcludePad(bool exclude) {
-    getLayer().getParameters()["exclude-pad"] = exclude;
+    getLayer()->getParameters()["exclude-pad"] = exclude;
     return *this;
 }
-
-void Builder::PoolingLayer::validate(const Layer& layer) {
-    Layer poolLayer = layer;
-    Builder::PoolingLayer poolBuilder(poolLayer);
-    std::vector<size_t> l_kernel = poolBuilder.getKernel();
+REG_VALIDATOR_FOR(Pooling, [](const Builder::Layer::CPtr& layer, bool partial) {
     // WA for old IRs
-    if (l_kernel.empty() && layer.getParameters().find("kernel-x") != layer.getParameters().end() &&
-        layer.getParameters().find("kernel-y") != layer.getParameters().end())
+    if (layer->getParameters().find("kernel") == layer->getParameters().end() && layer->getParameters().find("kernel-x") != layer->getParameters().end() &&
+        layer->getParameters().find("kernel-y") != layer->getParameters().end())
         return;
+
+    Builder::PoolingLayer poolBuilder(layer);
+    std::vector<size_t> l_kernel = poolBuilder.getKernel();
     std::vector<size_t> l_paddingBegin = poolBuilder.getPaddingsBegin();
     std::vector<size_t> l_paddingEnd = poolBuilder.getPaddingsEnd();
     std::vector<size_t> l_strides = poolBuilder.getStrides();
@@ -181,7 +201,39 @@ void Builder::PoolingLayer::validate(const Layer& layer) {
     l_strides.resize(l_kernel.size(), 1);
 
     if (l_kernel.empty() || l_kernel.size() != l_paddingBegin.size() || l_kernel.size() != l_paddingEnd.size() || l_kernel.size() != l_strides.size())
-        THROW_IE_EXCEPTION << layer.getType() << " node " << layer.getName() << " contains incorrect parameters!";
-}
+        THROW_IE_EXCEPTION << layer->getType() << " node " << layer->getName() << " contains incorrect parameters!";
+});
+
+REG_CONVERTER_FOR(Pooling, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    if (cnnLayer->params.find("kernel") == cnnLayer->params.end() &&
+        cnnLayer->params.find("kernel-x") != cnnLayer->params.end() &&
+        cnnLayer->params.find("kernel-y") != cnnLayer->params.end())
+        return;
+    std::vector<unsigned int> tmp = cnnLayer->GetParamAsUInts("kernel");
+    layer.getParameters()["kernel"] = std::vector<size_t>(tmp.size());
+    for (size_t i = 0; i < tmp.size(); ++i) {
+        layer.getParameters()["kernel"].as<std::vector<size_t>>()[i] = static_cast<size_t>(tmp[i]);
+    }
+
+    tmp = cnnLayer->GetParamAsUInts("strides");
+    layer.getParameters()["strides"] = std::vector<size_t>(tmp.size());
+    for (size_t i = 0; i < tmp.size(); ++i) {
+        layer.getParameters()["strides"].as<std::vector<size_t>>()[i] = static_cast<size_t>(tmp[i]);
+    }
+
+    tmp = cnnLayer->GetParamAsUInts("pads_begin");
+    layer.getParameters()["pads_begin"] = std::vector<size_t>(tmp.size());
+    for (size_t i = 0; i < tmp.size(); ++i) {
+        layer.getParameters()["pads_begin"].as<std::vector<size_t>>()[i] = static_cast<size_t>(tmp[i]);
+    }
+
+    tmp = cnnLayer->GetParamAsUInts("pads_end");
+    layer.getParameters()["pads_end"] = std::vector<size_t>(tmp.size());
+    for (size_t i = 0; i < tmp.size(); ++i) {
+        layer.getParameters()["pads_end"].as<std::vector<size_t>>()[i] = static_cast<size_t>(tmp[i]);
+    }
-
-REG_VALIDATOR_FOR(Pooling, Builder::PoolingLayer::validate);
+
+    layer.getParameters()["exclude-pad"] = cnnLayer->GetParamAsBool("exclude-pad", false);
+    layer.getParameters()["rounding_type"] = cnnLayer->GetParamAsString("rounding_type", "ceil");
+    layer.getParameters()["pool-method"] = cnnLayer->GetParamAsString("pool-method", "max");
+});
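
A hypothetical usage sketch (not part of the patch): a 2x2 max pooling with stride 2 built through the fluent interface above; the registered Pooling validator then checks that kernel, strides and paddings have matching ranks.

#include <builders/ie_pooling_layer.hpp>

using namespace InferenceEngine;

Builder::PoolingLayer makePool() {
    Builder::PoolingLayer pool("pool1");
    pool.setKernel({2, 2})          // ranks of kernel/strides/paddings must agree
        .setStrides({2, 2})
        .setPaddingsBegin({0, 0})
        .setPaddingsEnd({0, 0})
        .setPoolingType(Builder::PoolingLayer::PoolingType::MAX)
        .setRoundingType(Builder::PoolingLayer::RoundingType::CEIL)
        .setExcludePad(false);
    return pool;
}
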
diff --git a/inference-engine/src/inference_engine/builders/ie_power_layer.cpp b/inference-engine/src/inference_engine/builders/ie_power_layer.cpp
index c3142fa..db04e2b 100644
--- a/inference-engine/src/inference_engine/builders/ie_power_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_power_layer.cpp
@@ -1,66 +1,74 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_power_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
 
 #include <string>
 
 using namespace InferenceEngine;
 
-Builder::PowerLayer::PowerLayer(const std::string& name): LayerFragment("Power", name) {
-    getLayer().getOutputPorts().resize(1);
-    getLayer().getInputPorts().resize(1);
+Builder::PowerLayer::PowerLayer(const std::string& name): LayerDecorator("Power", name) {
+    getLayer()->getOutputPorts().resize(1);
+    getLayer()->getInputPorts().resize(1);
     setPower(1);
     setScale(1);
     setShift(0);
 }
 
-Builder::PowerLayer::PowerLayer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Power"))
-        THROW_IE_EXCEPTION << "Cannot create PowerLayer decorator for layer " << getLayer().getType();
+Builder::PowerLayer::PowerLayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("Power");
+}
+
+Builder::PowerLayer::PowerLayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("Power");
 }
 
 Builder::PowerLayer& Builder::PowerLayer::setName(const std::string& name) {
-    getLayer().getName() = name;
+    getLayer()->setName(name);
     return *this;
 }
 
 const Port& Builder::PowerLayer::getPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getOutputPorts()[0];
 }
 
 Builder::PowerLayer& Builder::PowerLayer::setPort(const Port &port) {
-    getLayer().getOutputPorts()[0] = port;
-    getLayer().getInputPorts()[0] = port;
+    getLayer()->getOutputPorts()[0] = port;
+    getLayer()->getInputPorts()[0] = port;
     return *this;
 }
 
 float Builder::PowerLayer::getPower() const {
-    return getLayer().getParameters()["power"].asFloat();
+    return getLayer()->getParameters().at("power");
 }
 
 Builder::PowerLayer& Builder::PowerLayer::setPower(float power) {
-    getLayer().getParameters()["power"] = power;
+    getLayer()->getParameters()["power"] = power;
     return *this;
 }
 
 float Builder::PowerLayer::getScale() const {
-    return getLayer().getParameters()["scale"].asFloat();
+    return getLayer()->getParameters().at("scale");
 }
 
 Builder::PowerLayer& Builder::PowerLayer::setScale(float scale) {
-    getLayer().getParameters()["scale"] = scale;
+    getLayer()->getParameters()["scale"] = scale;
     return *this;
 }
 
 float Builder::PowerLayer::getShift() const {
-    return getLayer().getParameters()["shift"].asFloat();
+    return getLayer()->getParameters().at("shift");
 }
 
 Builder::PowerLayer& Builder::PowerLayer::setShift(float shift) {
-    getLayer().getParameters()["shift"] = shift;
+    getLayer()->getParameters()["shift"] = shift;
     return *this;
 }
+
+REG_CONVERTER_FOR(Power, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    layer.getParameters()["shift"] = cnnLayer->GetParamAsFloat("shift", 0);
+    layer.getParameters()["scale"] = cnnLayer->GetParamAsFloat("scale", 1);
+    layer.getParameters()["power"] = cnnLayer->GetParamAsFloat("power", 1);
+});
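
A recurring detail in these rewrites: getters switched from getParameters()["key"].asFloat() to getParameters().at("key"), while setters keep operator[]. at() has a const overload and throws on a missing key, so it works on the shared const path (Layer::CPtr); operator[] would insert a default entry and cannot be called on a const map. A minimal sketch of the same contract, with a plain std::map standing in for the type-erased parameter map:

#include <map>
#include <string>

static float readShift(const std::map<std::string, float>& params) {
    return params.at("shift");   // const-safe; throws std::out_of_range if unset
}

static void writeShift(std::map<std::string, float>& params, float shift) {
    params["shift"] = shift;     // inserts or overwrites; requires a mutable map
}
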
diff --git a/inference-engine/src/inference_engine/builders/ie_prelu_layer.cpp b/inference-engine/src/inference_engine/builders/ie_prelu_layer.cpp
index 6263f96..dec276e 100644
--- a/inference-engine/src/inference_engine/builders/ie_prelu_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_prelu_layer.cpp
@@ -1,49 +1,52 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_prelu_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
 
 #include <string>
 
 using namespace InferenceEngine;
 
-Builder::PReLULayer::PReLULayer(const std::string& name): LayerFragment("PReLU", name) {
-    getLayer().getOutputPorts().resize(1);
-    getLayer().getInputPorts().resize(1);
+Builder::PReLULayer::PReLULayer(const std::string& name): LayerDecorator("PReLU", name) {
+    getLayer()->getInputPorts().resize(2);
+    getLayer()->getInputPorts()[1].setParameter("type", "weights");
+    getLayer()->getOutputPorts().resize(1);
     setChannelShared(false);
 }
 
-Builder::PReLULayer::PReLULayer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "PReLU"))
-        THROW_IE_EXCEPTION << "Cannot create PReLULayer decorator for layer " << getLayer().getType();
+Builder::PReLULayer::PReLULayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("PReLU");
+}
+
+Builder::PReLULayer::PReLULayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("PReLU");
 }
 
 Builder::PReLULayer& Builder::PReLULayer::setName(const std::string& name) {
-    getLayer().getName() = name;
+    getLayer()->setName(name);
     return *this;
 }
 
 const Port& Builder::PReLULayer::getPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getOutputPorts()[0];
 }
 
 Builder::PReLULayer& Builder::PReLULayer::setPort(const Port &port) {
-    getLayer().getOutputPorts()[0] = port;
-    getLayer().getInputPorts()[0] = port;
+    getLayer()->getOutputPorts()[0] = port;
+    getLayer()->getInputPorts()[0] = port;
     return *this;
 }
 
 bool Builder::PReLULayer::getChannelShared() const {
-    return getLayer().getParameters()["channel_shared"].asBool();
+    return getLayer()->getParameters().at("channel_shared");
 }
 Builder::PReLULayer& Builder::PReLULayer::setChannelShared(bool flag) {
-    getLayer().getParameters()["channel_shared"] = flag ? 1 : 0;
+    getLayer()->getParameters()["channel_shared"] = flag ? 1 : 0;
     return *this;
 }
 
-Builder::PReLULayer& Builder::PReLULayer::setWeights(const Blob::CPtr& weights) {
-    getLayer().addConstantData("weights", weights);
-    return *this;
-}
+REG_CONVERTER_FOR(PReLU, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    layer.getParameters()["channel_shared"] = cnnLayer->GetParamAsBool("channel_shared", false);
+});
\ No newline at end of file
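
Note the semantic change above: PReLU weights are no longer attached via the removed setWeights(Blob::CPtr) call; they are now a second input port tagged "type" = "weights" and get connected when the network is assembled. A hypothetical sketch of constructing the layer under the new layout:

#include <builders/ie_prelu_layer.hpp>

using namespace InferenceEngine;

Builder::PReLULayer makePRelu() {
    Builder::PReLULayer prelu("prelu1");  // input port 0: data, port 1: weights
    prelu.setChannelShared(false);        // one negative slope per channel
    // The weights tensor is wired to input port 1 during network assembly
    // instead of being stored as constant data on the layer.
    return prelu;
}
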
diff --git a/inference-engine/src/inference_engine/builders/ie_prior_box_clustered_layer.cpp b/inference-engine/src/inference_engine/builders/ie_prior_box_clustered_layer.cpp
index c52b2f4..e4505b6 100644
--- a/inference-engine/src/inference_engine/builders/ie_prior_box_clustered_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_prior_box_clustered_layer.cpp
@@ -1,124 +1,141 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_prior_box_clustered_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
 
 #include <string>
 #include <vector>
 
 using namespace InferenceEngine;
 
-Builder::PriorBoxClusteredLayer::PriorBoxClusteredLayer(const std::string& name): LayerFragment("PriorBoxClustered", name) {
-    getLayer().getOutputPorts().resize(1);
-    getLayer().getInputPorts().resize(2);
+Builder::PriorBoxClusteredLayer::PriorBoxClusteredLayer(const std::string& name): LayerDecorator("PriorBoxClustered", name) {
+    getLayer()->getOutputPorts().resize(1);
+    getLayer()->getInputPorts().resize(2);
 }
 
-Builder::PriorBoxClusteredLayer::PriorBoxClusteredLayer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "PriorBoxClustered"))
-        THROW_IE_EXCEPTION << "Cannot create PriorBoxClusteredLayer decorator for layer " << getLayer().getType();
+Builder::PriorBoxClusteredLayer::PriorBoxClusteredLayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("PriorBoxClustered");
+}
+
+Builder::PriorBoxClusteredLayer::PriorBoxClusteredLayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("PriorBoxClustered");
 }
 
 Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setName(const std::string& name) {
-    getLayer().getName() = name;
+    getLayer()->setName(name);
     return *this;
 }
 
 const std::vector<Port>& Builder::PriorBoxClusteredLayer::getInputPorts() const {
-    return getLayer().getInputPorts();
+    return getLayer()->getInputPorts();
 }
 
 Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setInputPorts(const std::vector<Port> &ports) {
     if (ports.size() != 2)
         THROW_IE_EXCEPTION << "Incorrect number of inputs for PriorBoxClustered layer.";
-    getLayer().getInputPorts() = ports;
+    getLayer()->getInputPorts() = ports;
     return *this;
 }
 
 const Port& Builder::PriorBoxClusteredLayer::getOutputPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getOutputPorts()[0];
 }
 
 Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setOutputPort(const Port &port) {
-    getLayer().getOutputPorts()[0] = port;
+    getLayer()->getOutputPorts()[0] = port;
     return *this;
 }
 
 float Builder::PriorBoxClusteredLayer::getVariance() const {
-    return getLayer().getParameters()["variance"].asFloat();
+    return getLayer()->getParameters().at("variance");
 }
 Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setVariance(float variance) {
-    getLayer().getParameters()["variance"] = variance;
+    getLayer()->getParameters()["variance"] = variance;
     return *this;
 }
 
 float Builder::PriorBoxClusteredLayer::getOffset() const {
-    return getLayer().getParameters()["offset"].asFloat();
+    return getLayer()->getParameters().at("offset");
 }
 Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setOffset(float offset) {
-    getLayer().getParameters()["offset"] = offset;
+    getLayer()->getParameters()["offset"] = offset;
     return *this;
 }
 
 float Builder::PriorBoxClusteredLayer::getWidth() const {
-    return getLayer().getParameters()["width"].asFloat();
+    return getLayer()->getParameters().at("width");
 }
 Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setWidth(float width) {
-    getLayer().getParameters()["width"] = width;
+    getLayer()->getParameters()["width"] = width;
     return *this;
 }
 
 float Builder::PriorBoxClusteredLayer::getHeight() const {
-    return getLayer().getParameters()["height"].asFloat();
+    return getLayer()->getParameters().at("height");
 }
 Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setHeight(float height) {
-    getLayer().getParameters()["height"] = height;
+    getLayer()->getParameters()["height"] = height;
     return *this;
 }
 
 const std::vector<float> Builder::PriorBoxClusteredLayer::getSteps() const {
-    return {getLayer().getParameters()["step_h"].asFloat(), getLayer().getParameters()["step_w"].asFloat()};
+    return {getLayer()->getParameters().at("step_h"), getLayer()->getParameters().at("step_w")};
 }
 Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setSteps(const std::vector<float> steps) {
     if (steps.size() != 2)
         THROW_IE_EXCEPTION << "PriorBoxClusteredLayer supports sizes only for height and width dimensions!";
-    getLayer().getParameters()["step_h"] = steps[0];
-    getLayer().getParameters()["step_w"] = steps[1];
+    getLayer()->getParameters()["step_h"] = steps[0];
+    getLayer()->getParameters()["step_w"] = steps[1];
     return *this;
 }
 
 const std::vector<float> Builder::PriorBoxClusteredLayer::getImgSizes() const {
-    return {getLayer().getParameters()["img_h"].asFloat(), getLayer().getParameters()["img_w"].asFloat()};
+    return {getLayer()->getParameters().at("img_h"), getLayer()->getParameters().at("img_w")};
 }
 Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setImgSizes(const std::vector<float> sizes) {
     if (sizes.size() != 2)
         THROW_IE_EXCEPTION << "PriorBoxClusteredLayer allows to specify only height and width dimensions of an input image!";
-    getLayer().getParameters()["img_h"] = sizes[0];
-    getLayer().getParameters()["img_w"] = sizes[1];
+    getLayer()->getParameters()["img_h"] = sizes[0];
+    getLayer()->getParameters()["img_w"] = sizes[1];
     return *this;
 }
 
 float Builder::PriorBoxClusteredLayer::getStep() const {
-    return getLayer().getParameters()["step"].asFloat();
+    return getLayer()->getParameters().at("step");
 }
 Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setStep(float step) {
-    getLayer().getParameters()["step"] = step;
+    getLayer()->getParameters()["step"] = step;
     return *this;
 }
 
 bool Builder::PriorBoxClusteredLayer::getClip() const {
-    return getLayer().getParameters()["clip"].asBool();
+    return getLayer()->getParameters().at("clip");
 }
 Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setClip(bool flag) {
-    getLayer().getParameters()["clip"] = flag;
+    getLayer()->getParameters()["clip"] = flag;
     return *this;
 }
 
 bool Builder::PriorBoxClusteredLayer::getFlip() const {
-    return getLayer().getParameters()["flip"].asBool();
+    return getLayer()->getParameters().at("flip");
 }
 Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setFlip(bool flag) {
-    getLayer().getParameters()["flip"] = flag;
+    getLayer()->getParameters()["flip"] = flag;
     return *this;
 }
+
+REG_CONVERTER_FOR(PriorBoxClustered, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    layer.getParameters()["flip"] = cnnLayer->GetParamAsBool("flip", false);
+    layer.getParameters()["clip"] = cnnLayer->GetParamAsBool("clip", false);
+    layer.getParameters()["step"] = cnnLayer->GetParamAsFloat("step");
+    layer.getParameters()["img_h"] = cnnLayer->GetParamAsFloat("img_h", 0);
+    layer.getParameters()["img_w"] = cnnLayer->GetParamAsFloat("img_w", 0);
+    layer.getParameters()["step_h"] = cnnLayer->GetParamAsFloat("step_h", 0);
+    layer.getParameters()["step_w"] = cnnLayer->GetParamAsFloat("step_w", 0);
+    layer.getParameters()["height"] = cnnLayer->GetParamAsFloat("height", 0);
+    layer.getParameters()["width"] = cnnLayer->GetParamAsFloat("width", 0);
+    layer.getParameters()["offset"] = cnnLayer->GetParamAsFloat("offset", 0);
+    layer.getParameters()["variance"] = cnnLayer->GetParamAsFloat("variance", 0);
+});
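
A hypothetical usage sketch of the two-element contract enforced by setSteps and setImgSizes above; anything other than a {height, width} pair throws:

#include <builders/ie_prior_box_clustered_layer.hpp>

using namespace InferenceEngine;

void configurePriorBoxClustered(Builder::PriorBoxClusteredLayer& pbc) {
    pbc.setSteps({16.0f, 16.0f});       // {step_h, step_w}: exactly two values
    pbc.setImgSizes({300.0f, 300.0f});  // {img_h, img_w}: exactly two values
    // pbc.setSteps({16.0f});           // would throw: only H and W are supported
}
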
diff --git a/inference-engine/src/inference_engine/builders/ie_prior_box_layer.cpp b/inference-engine/src/inference_engine/builders/ie_prior_box_layer.cpp
index dab36e0..febe397 100644
--- a/inference-engine/src/inference_engine/builders/ie_prior_box_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_prior_box_layer.cpp
@@ -1,118 +1,133 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_prior_box_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
 
 #include <string>
 #include <vector>
 
 using namespace InferenceEngine;
 
-Builder::PriorBoxLayer::PriorBoxLayer(const std::string& name): LayerFragment("PriorBox", name) {
-    getLayer().getOutputPorts().resize(1);
-    getLayer().getInputPorts().resize(2);
+Builder::PriorBoxLayer::PriorBoxLayer(const std::string& name): LayerDecorator("PriorBox", name) {
+    getLayer()->getOutputPorts().resize(1);
+    getLayer()->getInputPorts().resize(2);
     setScaleAllSizes(true);
 }
 
-Builder::PriorBoxLayer::PriorBoxLayer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "PriorBox"))
-        THROW_IE_EXCEPTION << "Cannot create PriorBoxLayer decorator for layer " << getLayer().getType();
+Builder::PriorBoxLayer::PriorBoxLayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("PriorBox");
+}
+
+Builder::PriorBoxLayer::PriorBoxLayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("PriorBox");
 }
 
 Builder::PriorBoxLayer& Builder::PriorBoxLayer::setName(const std::string& name) {
-    getLayer().getName() = name;
+    getLayer()->setName(name);
     return *this;
 }
 
 const std::vector<Port>& Builder::PriorBoxLayer::getInputPorts() const {
-    return getLayer().getInputPorts();
+    return getLayer()->getInputPorts();
 }
 
 Builder::PriorBoxLayer& Builder::PriorBoxLayer::setInputPorts(const std::vector<Port> &ports) {
     if (ports.size() != 2)
         THROW_IE_EXCEPTION << "Incorrect number of inputs for PriorBox layer.";
-    getLayer().getInputPorts() = ports;
+    getLayer()->getInputPorts() = ports;
     return *this;
 }
 
 const Port& Builder::PriorBoxLayer::getOutputPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getOutputPorts()[0];
 }
 
 Builder::PriorBoxLayer& Builder::PriorBoxLayer::setOutputPort(const Port &port) {
-    getLayer().getOutputPorts()[0] = port;
+    getLayer()->getOutputPorts()[0] = port;
     return *this;
 }
 
 float Builder::PriorBoxLayer::getVariance() const {
-    return getLayer().getParameters()["variance"].asFloat();
+    return getLayer()->getParameters().at("variance");
 }
 Builder::PriorBoxLayer& Builder::PriorBoxLayer::setVariance(float variance) {
-    getLayer().getParameters()["variance"] = variance;
+    getLayer()->getParameters()["variance"] = variance;
     return *this;
 }
 
 float Builder::PriorBoxLayer::getOffset() const {
-    return getLayer().getParameters()["offset"].asFloat();
+    return getLayer()->getParameters().at("offset");
 }
 Builder::PriorBoxLayer& Builder::PriorBoxLayer::setOffset(float offset) {
-    getLayer().getParameters()["offset"] = offset;
+    getLayer()->getParameters()["offset"] = offset;
     return *this;
 }
 
 float Builder::PriorBoxLayer::getStep() const {
-    return getLayer().getParameters()["step"].asFloat();
+    return getLayer()->getParameters().at("step");
 }
 Builder::PriorBoxLayer& Builder::PriorBoxLayer::setStep(float step) {
-    getLayer().getParameters()["step"] = step;
+    getLayer()->getParameters()["step"] = step;
     return *this;
 }
 
 size_t Builder::PriorBoxLayer::getMinSize() const {
-    return getLayer().getParameters()["min_size"].asUInt();
+    return getLayer()->getParameters().at("min_size");
 }
 Builder::PriorBoxLayer& Builder::PriorBoxLayer::setMinSize(size_t minSize) {
-    getLayer().getParameters()["min_size"] = minSize;
+    getLayer()->getParameters()["min_size"] = minSize;
     return *this;
 }
 size_t Builder::PriorBoxLayer::getMaxSize() const {
-    return getLayer().getParameters()["max_size"].asUInt();
+    return getLayer()->getParameters().at("max_size");
 }
 Builder::PriorBoxLayer& Builder::PriorBoxLayer::setMaxSize(size_t maxSize) {
-    getLayer().getParameters()["max_size"] = maxSize;
+    getLayer()->getParameters()["max_size"] = maxSize;
     return *this;
 }
 
 bool Builder::PriorBoxLayer::getScaleAllSizes() const {
-    return getLayer().getParameters()["scale_all_sizes"].asBool(true);
+    return getLayer()->getParameters().at("scale_all_sizes");
 }
 Builder::PriorBoxLayer& Builder::PriorBoxLayer::setScaleAllSizes(bool flag) {
-    getLayer().getParameters()["scale_all_sizes"] = flag;
+    getLayer()->getParameters()["scale_all_sizes"] = flag;
     return *this;
 }
 
 bool Builder::PriorBoxLayer::getClip() const {
-    return getLayer().getParameters()["clip"].asBool();
+    return getLayer()->getParameters().at("clip");
 }
 Builder::PriorBoxLayer& Builder::PriorBoxLayer::setClip(bool flag) {
-    getLayer().getParameters()["clip"] = flag;
+    getLayer()->getParameters()["clip"] = flag;
     return *this;
 }
 
 bool Builder::PriorBoxLayer::getFlip() const {
-    return getLayer().getParameters()["flip"].asBool();
+    return getLayer()->getParameters().at("flip");
 }
 Builder::PriorBoxLayer& Builder::PriorBoxLayer::setFlip(bool flag) {
-    getLayer().getParameters()["flip"] = flag;
+    getLayer()->getParameters()["flip"] = flag;
     return *this;
 }
 
 const std::vector<size_t> Builder::PriorBoxLayer::getAspectRatio() const {
-    return uInts2size_t(getLayer().getParameters()["aspect_ratio"].asUInts());
+    return getLayer()->getParameters().at("aspect_ratio");
 }
 Builder::PriorBoxLayer& Builder::PriorBoxLayer::setAspectRatio(const std::vector<size_t>& aspectRatio) {
-    getLayer().getParameters()["aspect_ratio"] = aspectRatio;
+    getLayer()->getParameters()["aspect_ratio"] = aspectRatio;
     return *this;
 }
+
+REG_CONVERTER_FOR(PriorBox, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    layer.getParameters()["flip"] = cnnLayer->GetParamAsBool("flip", false);
+    layer.getParameters()["clip"] = cnnLayer->GetParamAsBool("clip", false);
+    layer.getParameters()["scale_all_sizes"] = cnnLayer->GetParamAsBool("scale_all_sizes", true);
+    layer.getParameters()["step"] = cnnLayer->GetParamAsFloat("step", 0);
+    layer.getParameters()["offset"] = cnnLayer->GetParamAsFloat("offset");
+    layer.getParameters()["variance"] = cnnLayer->GetParamAsFloat("variance", 0);
+    layer.getParameters()["aspect_ratio"] = cnnLayer->GetParamAsFloats("aspect_ratio", {});
+    layer.getParameters()["min_size"] = static_cast<size_t>(cnnLayer->GetParamAsUInt("min_size", 0));
+    layer.getParameters()["max_size"] = static_cast<size_t>(cnnLayer->GetParamAsUInt("max_size", 0));
+});
diff --git a/inference-engine/src/inference_engine/builders/ie_proposal_layer.cpp b/inference-engine/src/inference_engine/builders/ie_proposal_layer.cpp
index 2437b7c..945f59e 100644
--- a/inference-engine/src/inference_engine/builders/ie_proposal_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_proposal_layer.cpp
@@ -1,103 +1,117 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_proposal_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
 
 #include <string>
 #include <vector>
 
 using namespace InferenceEngine;
 
-Builder::ProposalLayer::ProposalLayer(const std::string& name): LayerFragment("Proposal", name) {
-    getLayer().getOutputPorts().resize(1);
-    getLayer().getInputPorts().resize(3);
+Builder::ProposalLayer::ProposalLayer(const std::string& name): LayerDecorator("Proposal", name) {
+    getLayer()->getOutputPorts().resize(1);
+    getLayer()->getInputPorts().resize(3);
 }
 
-Builder::ProposalLayer::ProposalLayer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Proposal"))
-        THROW_IE_EXCEPTION << "Cannot create ProposalLayer decorator for layer " << getLayer().getType();
+Builder::ProposalLayer::ProposalLayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("Proposal");
+}
+
+Builder::ProposalLayer::ProposalLayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("Proposal");
 }
 
 Builder::ProposalLayer& Builder::ProposalLayer::setName(const std::string& name) {
-    getLayer().getName() = name;
+    getLayer()->setName(name);
    return *this;
 }
 
 const std::vector<Port>& Builder::ProposalLayer::getInputPorts() const {
-    return getLayer().getInputPorts();
+    return getLayer()->getInputPorts();
 }
 
 Builder::ProposalLayer& Builder::ProposalLayer::setInputPorts(const std::vector<Port> &ports) {
     if (ports.size() != 3)
         THROW_IE_EXCEPTION << "Incorrect number of inputs for Proposal layer.";
-    getLayer().getInputPorts() = ports;
+    getLayer()->getInputPorts() = ports;
     return *this;
 }
 
 const Port& Builder::ProposalLayer::getOutputPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getOutputPorts()[0];
 }
 
 Builder::ProposalLayer& Builder::ProposalLayer::setOutputPort(const Port &port) {
-    getLayer().getOutputPorts()[0] = port;
+    getLayer()->getOutputPorts()[0] = port;
     return *this;
 }
 
 size_t Builder::ProposalLayer::getPostNMSTopN() const {
-    return getLayer().getParameters()["post_nms_topn"].asUInt();
+    return getLayer()->getParameters().at("post_nms_topn");
 }
 Builder::ProposalLayer& Builder::ProposalLayer::setPostNMSTopN(size_t topN) {
-    getLayer().getParameters()["post_nms_topn"] = topN;
+    getLayer()->getParameters()["post_nms_topn"] = topN;
     return *this;
 }
 
 size_t Builder::ProposalLayer::getPreNMSTopN() const {
-    return getLayer().getParameters()["pre_nms_topn"].asUInt();
+    return getLayer()->getParameters().at("pre_nms_topn");
 }
 Builder::ProposalLayer& Builder::ProposalLayer::setPreNMSTopN(size_t topN) {
-    getLayer().getParameters()["pre_nms_topn"] = topN;
+    getLayer()->getParameters()["pre_nms_topn"] = topN;
     return *this;
 }
 
 float Builder::ProposalLayer::getNMSThresh() const {
-    return getLayer().getParameters()["nms_thresh"].asFloat();
+    return getLayer()->getParameters().at("nms_thresh");
 }
 Builder::ProposalLayer& Builder::ProposalLayer::setNMSThresh(float thresh) {
-    getLayer().getParameters()["nms_thresh"] = thresh;
+    getLayer()->getParameters()["nms_thresh"] = thresh;
     return *this;
 }
 
 size_t Builder::ProposalLayer::getBaseSize() const {
-    return getLayer().getParameters()["base_size"].asUInt();
+    return getLayer()->getParameters().at("base_size");
 }
 Builder::ProposalLayer& Builder::ProposalLayer::setBaseSize(size_t baseSize) {
-    getLayer().getParameters()["base_size"] = baseSize;
+    getLayer()->getParameters()["base_size"] = baseSize;
     return *this;
 }
 
 size_t Builder::ProposalLayer::getMinSize() const {
-    return getLayer().getParameters()["min_size"].asUInt();
+    return getLayer()->getParameters().at("min_size");
 }
 Builder::ProposalLayer& Builder::ProposalLayer::setMinSize(size_t minSize) {
-    getLayer().getParameters()["min_size"] = minSize;
+    getLayer()->getParameters()["min_size"] = minSize;
     return *this;
 }
 
 size_t Builder::ProposalLayer::getFeatStride() const {
-    return getLayer().getParameters()["feat_stride"].asUInt();
+    return getLayer()->getParameters().at("feat_stride");
 }
 Builder::ProposalLayer& Builder::ProposalLayer::setFeatStride(size_t featStride) {
-    getLayer().getParameters()["feat_stride"] = featStride;
+    getLayer()->getParameters()["feat_stride"] = featStride;
     return *this;
 }
 
 const std::vector<float> Builder::ProposalLayer::getScale() const {
-    return getLayer().getParameters()["scale"].asFloats();
+    return getLayer()->getParameters().at("scale");
 }
 Builder::ProposalLayer& Builder::ProposalLayer::setScale(const std::vector<float>& scales) {
-    getLayer().getParameters()["scale"] = scales;
+    getLayer()->getParameters()["scale"] = scales;
     return *this;
 }
 
 const std::vector<float> Builder::ProposalLayer::getRatio() const {
-    return getLayer().getParameters()["ratio"].asFloats();
+    return getLayer()->getParameters().at("ratio");
 }
 Builder::ProposalLayer& Builder::ProposalLayer::setRatio(const std::vector<float>& ratios) {
-    getLayer().getParameters()["ratio"] = ratios;
+    getLayer()->getParameters()["ratio"] = ratios;
     return *this;
 }
+
+REG_CONVERTER_FOR(Proposal, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    layer.getParameters()["post_nms_topn"] = static_cast<size_t>(cnnLayer->GetParamAsUInt("post_nms_topn", 0));
+    layer.getParameters()["pre_nms_topn"] = static_cast<size_t>(cnnLayer->GetParamAsUInt("pre_nms_topn", 0));
+    layer.getParameters()["nms_thresh"] = cnnLayer->GetParamAsFloat("nms_thresh", 0);
+    layer.getParameters()["base_size"] = static_cast<size_t>(cnnLayer->GetParamAsUInt("base_size", 0));
+    layer.getParameters()["min_size"] = static_cast<size_t>(cnnLayer->GetParamAsUInt("min_size", 0));
+    layer.getParameters()["feat_stride"] = static_cast<size_t>(cnnLayer->GetParamAsUInt("feat_stride", 0));
+    layer.getParameters()["scale"] = cnnLayer->GetParamAsFloats("scale");
+    layer.getParameters()["ratio"] = cnnLayer->GetParamAsFloats("ratio");
+});
\ No newline at end of file
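
The converter above must fill exactly the keys the getters read back ("base_size", "min_size", "feat_stride", and so on); a value written under the wrong key is simply never consumed. A hypothetical round-trip check, with a plain map standing in for the parameter storage:

#include <cassert>
#include <cstddef>
#include <map>
#include <string>

int main() {
    std::map<std::string, size_t> params;   // stand-in for getParameters()
    params["base_size"] = 16;               // what the converter writes
    assert(params.at("base_size") == 16);   // what getBaseSize() reads back
    assert(params.count("max_size") == 0);  // ProposalLayer has no getter for this key
    return 0;
}
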
diff --git a/inference-engine/src/inference_engine/builders/ie_psroi_pooling_layer.cpp b/inference-engine/src/inference_engine/builders/ie_psroi_pooling_layer.cpp
index 8a023d3..ac768e2 100644
--- a/inference-engine/src/inference_engine/builders/ie_psroi_pooling_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_psroi_pooling_layer.cpp
@@ -1,61 +1,70 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_psroi_pooling_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
 
 #include <string>
 #include <vector>
 
 using namespace InferenceEngine;
 
-Builder::PSROIPoolingLayer::PSROIPoolingLayer(const std::string& name): LayerFragment("PSROIPooling", name) {
-    getLayer().getOutputPorts().resize(1);
+Builder::PSROIPoolingLayer::PSROIPoolingLayer(const std::string& name): LayerDecorator("PSROIPooling", name) {
+    getLayer()->getOutputPorts().resize(1);
 }
 
-Builder::PSROIPoolingLayer::PSROIPoolingLayer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "PSROIPooling"))
-        THROW_IE_EXCEPTION << "Cannot create PSROIPoolingLayer decorator for layer " << getLayer().getType();
+Builder::PSROIPoolingLayer::PSROIPoolingLayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("PSROIPooling");
+}
+
+Builder::PSROIPoolingLayer::PSROIPoolingLayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("PSROIPooling");
 }
 
 Builder::PSROIPoolingLayer& Builder::PSROIPoolingLayer::setName(const std::string& name) {
-    getLayer().getName() = name;
+    getLayer()->setName(name);
     return *this;
 }
 
 const std::vector<Port>& Builder::PSROIPoolingLayer::getInputPorts() const {
-    return getLayer().getInputPorts();
+    return getLayer()->getInputPorts();
 }
 
 Builder::PSROIPoolingLayer& Builder::PSROIPoolingLayer::setInputPorts(const std::vector<Port>& ports) {
     if (ports.size() != 2)
         THROW_IE_EXCEPTION << "PSROIPoolingLayer should have 2 inputs!";
-    getLayer().getInputPorts() = ports;
+    getLayer()->getInputPorts() = ports;
     return *this;
 }
 
 const Port& Builder::PSROIPoolingLayer::getOutputPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getOutputPorts()[0];
 }
 
 Builder::PSROIPoolingLayer& Builder::PSROIPoolingLayer::setOutputPort(const Port& port) {
-    getLayer().getOutputPorts()[0] = port;
+    getLayer()->getOutputPorts()[0] = port;
     return *this;
 }
 
 float Builder::PSROIPoolingLayer::getSpatialScale() const {
-    return getLayer().getParameters()["spatial_scale"].asFloat();
+    return getLayer()->getParameters().at("spatial_scale");
 }
 Builder::PSROIPoolingLayer& Builder::PSROIPoolingLayer::setSpatialScale(float spatialScale) {
-    getLayer().getParameters()["spatial_scale"] = spatialScale;
+    getLayer()->getParameters()["spatial_scale"] = spatialScale;
     return *this;
 }
 
 size_t Builder::PSROIPoolingLayer::getOutputDim() const {
-    return getLayer().getParameters()["output_dim"].asUInt();
+    return getLayer()->getParameters().at("output_dim");
 }
 Builder::PSROIPoolingLayer& Builder::PSROIPoolingLayer::setOutputDim(size_t outDim) {
-    getLayer().getParameters()["output_dim"] = outDim;
+    getLayer()->getParameters()["output_dim"] = outDim;
     return *this;
 }
 
 size_t Builder::PSROIPoolingLayer::getGroupSize() const {
-    return getLayer().getParameters()["group_size"].asUInt();
+    return getLayer()->getParameters().at("group_size");
 }
 Builder::PSROIPoolingLayer& Builder::PSROIPoolingLayer::setGroupSize(size_t size) {
-    getLayer().getParameters()["group_size"] = size;
+    getLayer()->getParameters()["group_size"] = size;
     return *this;
 }
+
+REG_CONVERTER_FOR(PSROIPooling, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    layer.getParameters()["group_size"] = static_cast<size_t>(cnnLayer->GetParamAsUInt("group_size", 0));
+    layer.getParameters()["output_dim"] = static_cast<size_t>(cnnLayer->GetParamAsUInt("output_dim", 0));
+    layer.getParameters()["spatial_scale"] = cnnLayer->GetParamAsFloat("spatial_scale", 0);
+});
diff --git a/inference-engine/src/inference_engine/builders/ie_region_yolo_layer.cpp b/inference-engine/src/inference_engine/builders/ie_region_yolo_layer.cpp
index bcefcbb..3e4c42e 100644
--- a/inference-engine/src/inference_engine/builders/ie_region_yolo_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_region_yolo_layer.cpp
@@ -1,96 +1,110 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_region_yolo_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
 
 #include <string>
 #include <vector>
 
 using namespace InferenceEngine;
 
-Builder::RegionYoloLayer::RegionYoloLayer(const std::string& name): LayerFragment("RegionYolo", name) {
-    getLayer().getInputPorts().resize(1);
-    getLayer().getOutputPorts().resize(1);
+Builder::RegionYoloLayer::RegionYoloLayer(const std::string& name): LayerDecorator("RegionYolo", name) {
+    getLayer()->getInputPorts().resize(1);
+    getLayer()->getOutputPorts().resize(1);
 }
 
-Builder::RegionYoloLayer::RegionYoloLayer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "RegionYolo"))
-        THROW_IE_EXCEPTION << "Cannot create RegionYoloLayer decorator for layer " << getLayer().getType();
+Builder::RegionYoloLayer::RegionYoloLayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("RegionYolo");
+}
+
+Builder::RegionYoloLayer::RegionYoloLayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("RegionYolo");
 }
 
 Builder::RegionYoloLayer& Builder::RegionYoloLayer::setName(const std::string& name) {
-    getLayer().getName() = name;
+    getLayer()->setName(name);
     return *this;
 }
 
 const Port& Builder::RegionYoloLayer::getInputPort() const {
-    return getLayer().getInputPorts()[0];
+    return getLayer()->getInputPorts()[0];
 }
 Builder::RegionYoloLayer& Builder::RegionYoloLayer::setInputPort(const Port& port) {
-    getLayer().getInputPorts()[0] = port;
+    getLayer()->getInputPorts()[0] = port;
     return *this;
 }
 
 const Port& Builder::RegionYoloLayer::getOutputPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getOutputPorts()[0];
 }
 Builder::RegionYoloLayer& Builder::RegionYoloLayer::setOutputPort(const Port& port) {
-    getLayer().getOutputPorts()[0] = port;
+    getLayer()->getOutputPorts()[0] = port;
     return *this;
 }
 
 int Builder::RegionYoloLayer::getCoords() const {
-    return getLayer().getParameters()["coords"].asInt();
+    return getLayer()->getParameters().at("coords");
 }
 Builder::RegionYoloLayer& Builder::RegionYoloLayer::setCoords(int coords) {
-    getLayer().getParameters()["coords"] = coords;
+    getLayer()->getParameters()["coords"] = coords;
     return *this;
 }
 int Builder::RegionYoloLayer::getClasses() const {
-    return getLayer().getParameters()["classes"].asInt();
+    return getLayer()->getParameters().at("classes");
 }
 Builder::RegionYoloLayer& Builder::RegionYoloLayer::setClasses(int classes) {
-    getLayer().getParameters()["classes"] = classes;
+    getLayer()->getParameters()["classes"] = classes;
     return *this;
 }
 int Builder::RegionYoloLayer::getNum() const {
-    return getLayer().getParameters()["num"].asInt();
+    return getLayer()->getParameters().at("num");
 }
 Builder::RegionYoloLayer& Builder::RegionYoloLayer::setNum(int num) {
-    getLayer().getParameters()["num"] = num;
+    getLayer()->getParameters()["num"] = num;
     return *this;
 }
 bool Builder::RegionYoloLayer::getDoSoftMax() const {
-    return getLayer().getParameters()["do_softmax"].asBool();
+    return getLayer()->getParameters().at("do_softmax");
 }
 Builder::RegionYoloLayer& Builder::RegionYoloLayer::setDoSoftMax(bool flag) {
-    getLayer().getParameters()["do_softmax"] = flag ? 1 : 0;
+    getLayer()->getParameters()["do_softmax"] = flag ? 1 : 0;
     return *this;
 }
 float Builder::RegionYoloLayer::getAnchors() const {
-    return getLayer().getParameters()["anchors"].asFloat();
+    return getLayer()->getParameters().at("anchors");
 }
 Builder::RegionYoloLayer& Builder::RegionYoloLayer::setAnchors(float anchors) {
-    getLayer().getParameters()["anchors"] = anchors;
+    getLayer()->getParameters()["anchors"] = anchors;
     return *this;
 }
 int Builder::RegionYoloLayer::getMask() const {
-    return getLayer().getParameters()["mask"].asInt();
+    return getLayer()->getParameters().at("mask");
 }
 Builder::RegionYoloLayer& Builder::RegionYoloLayer::setMask(int mask) {
-    getLayer().getParameters()["mask"] = mask;
+    getLayer()->getParameters()["mask"] = mask;
     return *this;
 }
 size_t Builder::RegionYoloLayer::getAxis() const {
-    return getLayer().getParameters()["axis"].asUInt();
+    return getLayer()->getParameters().at("axis");
 }
 Builder::RegionYoloLayer& Builder::RegionYoloLayer::setAxis(size_t axis) {
-    getLayer().getParameters()["axis"] = axis;
+    getLayer()->getParameters()["axis"] = axis;
     return *this;
 }
 size_t Builder::RegionYoloLayer::getEndAxis() const {
-    return getLayer().getParameters()["end_axis"].asUInt();
+    return getLayer()->getParameters().at("end_axis");
 }
 Builder::RegionYoloLayer& Builder::RegionYoloLayer::setEndAxis(size_t axis) {
-    getLayer().getParameters()["end_axis"] = axis;
+    getLayer()->getParameters()["end_axis"] = axis;
     return *this;
 }
+
+REG_CONVERTER_FOR(RegionYolo, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    layer.getParameters()["end_axis"] = static_cast<size_t>(cnnLayer->GetParamAsUInt("end_axis", 0));
+    layer.getParameters()["axis"] = static_cast<size_t>(cnnLayer->GetParamAsUInt("axis", 0));
+    layer.getParameters()["num"] = cnnLayer->GetParamAsInt("num", 0);
+    layer.getParameters()["mask"] = cnnLayer->GetParamAsInt("mask", 0);
+    layer.getParameters()["coords"] = cnnLayer->GetParamAsInt("coords", 0);
+    layer.getParameters()["classes"] = cnnLayer->GetParamAsInt("classes", 0);
+    layer.getParameters()["anchors"] = cnnLayer->GetParamAsFloat("anchors", 0);
+    layer.getParameters()["do_softmax"] = cnnLayer->GetParamAsBool("do_softmax", false);
+});
\ No newline at end of file
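
A hypothetical usage sketch (values illustrative only): a YOLOv2-style region layer configured through the fluent setters above.

#include <builders/ie_region_yolo_layer.hpp>

using namespace InferenceEngine;

Builder::RegionYoloLayer makeRegion() {
    Builder::RegionYoloLayer region("region1");
    region.setCoords(4)      // box coordinates per anchor
          .setClasses(20)    // class scores per anchor
          .setNum(5)         // anchors per cell
          .setDoSoftMax(true)
          .setAxis(1)
          .setEndAxis(3);
    return region;
}
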
diff --git a/inference-engine/src/inference_engine/builders/ie_relu6_layer.cpp b/inference-engine/src/inference_engine/builders/ie_relu6_layer.cpp
index d39b2d0..966dcb5 100644
--- a/inference-engine/src/inference_engine/builders/ie_relu6_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_relu6_layer.cpp
@@ -1,47 +1,62 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_relu6_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
 
 #include <string>
 
 using namespace InferenceEngine;
 
-Builder::ReLU6Layer::ReLU6Layer(const std::string& name): LayerFragment("ReLU6", name) {
-    getLayer().getOutputPorts().resize(1);
-    getLayer().getInputPorts().resize(1);
+Builder::ReLU6Layer::ReLU6Layer(const std::string& name): LayerDecorator("ReLU6", name) {
+    getLayer()->getOutputPorts().resize(1);
+    getLayer()->getInputPorts().resize(1);
     setN(6);
 }
 
-Builder::ReLU6Layer::ReLU6Layer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "ReLU6"))
-        THROW_IE_EXCEPTION << "Cannot create ReLU6Layer decorator for layer " << getLayer().getType();
+Builder::ReLU6Layer::ReLU6Layer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("ReLU6");
+}
+
+Builder::ReLU6Layer::ReLU6Layer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("ReLU6");
 }
 
 Builder::ReLU6Layer& Builder::ReLU6Layer::setName(const std::string& name) {
-    getLayer().getName() = name;
+    getLayer()->setName(name);
     return *this;
 }
 
 const Port& Builder::ReLU6Layer::getPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getOutputPorts()[0];
 }
 
 Builder::ReLU6Layer& Builder::ReLU6Layer::setPort(const Port &port) {
-    getLayer().getOutputPorts()[0] = port;
-    getLayer().getInputPorts()[0] = port;
+    getLayer()->getOutputPorts()[0] = port;
+    getLayer()->getInputPorts()[0] = port;
     return *this;
 }
 
 float Builder::ReLU6Layer::getN() const {
-    return getLayer().getParameters()["n"].asFloat();
+    return getLayer()->getParameters().at("n");
 }
 
 Builder::ReLU6Layer& Builder::ReLU6Layer::setN(float n) {
-    getLayer().getParameters()["n"] = n;
+    getLayer()->getParameters()["n"] = n;
     return *this;
 }
-
+REG_VALIDATOR_FOR(ReLU6, [] (const InferenceEngine::Builder::Layer::CPtr& input_layer, bool partial) {
+    if (!input_layer->getInputPorts().empty() &&
+        !input_layer->getOutputPorts().empty() &&
+        !input_layer->getInputPorts()[0].shape().empty() &&
+        !input_layer->getOutputPorts()[0].shape().empty() &&
+        input_layer->getInputPorts()[0].shape() != input_layer->getOutputPorts()[0].shape()) {
+        THROW_IE_EXCEPTION << "Input and output ports should be equal";
+    }
+});
+
+REG_CONVERTER_FOR(ReLU6, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    layer.getParameters()["n"] = cnnLayer->GetParamAsFloat("n", 0);
+});
diff --git a/inference-engine/src/inference_engine/builders/ie_relu_layer.cpp b/inference-engine/src/inference_engine/builders/ie_relu_layer.cpp
index 29793c4..63c221b 100644
--- a/inference-engine/src/inference_engine/builders/ie_relu_layer.cpp
+++ b/inference-engine/src/inference_engine/builders/ie_relu_layer.cpp
@@ -1,45 +1,63 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <builders/ie_relu_layer.hpp>
-#include <details/caseless.hpp>
+#include <ie_cnn_layer_builder.h>
 
 #include <string>
 
 using namespace InferenceEngine;
 
-Builder::ReLULayer::ReLULayer(const std::string& name): LayerFragment("ReLU", name) {
-    getLayer().getOutputPorts().resize(1);
-    getLayer().getInputPorts().resize(1);
+Builder::ReLULayer::ReLULayer(const std::string& name): LayerDecorator("ReLU", name) {
+    getLayer()->getOutputPorts().resize(1);
+    getLayer()->getInputPorts().resize(1);
     setNegativeSlope(0);
 }
 
-Builder::ReLULayer::ReLULayer(Layer& genLayer): LayerFragment(genLayer) {
-    if (!details::CaselessEq<std::string>()(getLayer().getType(), "ReLU"))
-        THROW_IE_EXCEPTION << "Cannot create ReLULayer decorator for layer " << getLayer().getType();
+Builder::ReLULayer::ReLULayer(const Layer::Ptr& layer): LayerDecorator(layer) {
+    checkType("ReLU");
+}
+
+Builder::ReLULayer::ReLULayer(const Layer::CPtr& layer): LayerDecorator(layer) {
+    checkType("ReLU");
 }
 
 Builder::ReLULayer& Builder::ReLULayer::setName(const std::string& name) {
-    getLayer().getName() = name;
+    getLayer()->setName(name);
     return *this;
 }
 
 const Port& Builder::ReLULayer::getPort() const {
-    return getLayer().getOutputPorts()[0];
+    return getLayer()->getOutputPorts()[0];
 }
 
 Builder::ReLULayer& Builder::ReLULayer::setPort(const Port &port) {
-    getLayer().getOutputPorts()[0] = port;
-    getLayer().getInputPorts()[0] = port;
+    getLayer()->getOutputPorts()[0] = port;
+    getLayer()->getInputPorts()[0] = port;
     return *this;
 }
 
 float Builder::ReLULayer::getNegativeSlope() const {
-    return getLayer().getParameters()["negative_slope"].asFloat();
+    return getLayer()->getParameters().at("negative_slope");
 }
 
 Builder::ReLULayer& Builder::ReLULayer::setNegativeSlope(float negativeSlope) {
-    getLayer().getParameters()["negative_slope"] = negativeSlope;
+    getLayer()->getParameters()["negative_slope"] = negativeSlope;
     return *this;
 }
+
+REG_VALIDATOR_FOR(ReLU, [] (const InferenceEngine::Builder::Layer::CPtr& input_layer, bool partial) {
+    Builder::ReLULayer layer(input_layer);
+    if (!input_layer->getInputPorts().empty() &&
+        !input_layer->getOutputPorts().empty() &&
+        !input_layer->getInputPorts()[0].shape().empty() &&
+        !input_layer->getOutputPorts()[0].shape().empty() &&
+        input_layer->getInputPorts()[0].shape() != input_layer->getOutputPorts()[0].shape()) {
+        THROW_IE_EXCEPTION << "Input and output ports should be equal";
+    }
+});
+
+REG_CONVERTER_FOR(ReLU, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) {
+    layer.getParameters()["negative_slope"] = cnnLayer->GetParamAsFloat("negative_slope", 0);
+});
+#include #include #include using namespace InferenceEngine; -Builder::ReorgYoloLayer::ReorgYoloLayer(const std::string& name): LayerFragment("ReorgYolo", name) { - getLayer().getInputPorts().resize(1); - getLayer().getOutputPorts().resize(1); +Builder::ReorgYoloLayer::ReorgYoloLayer(const std::string& name): LayerDecorator("ReorgYolo", name) { + getLayer()->getInputPorts().resize(1); + getLayer()->getOutputPorts().resize(1); } -Builder::ReorgYoloLayer::ReorgYoloLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "ReorgYolo")) - THROW_IE_EXCEPTION << "Cannot create ReorgYoloLayer decorator for layer " << getLayer().getType(); +Builder::ReorgYoloLayer::ReorgYoloLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("ReorgYolo"); +} + +Builder::ReorgYoloLayer::ReorgYoloLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("ReorgYolo"); } Builder::ReorgYoloLayer& Builder::ReorgYoloLayer::setName(const std::string& name) { - getLayer().getName() = name; + getLayer()->setName(name); return *this; } const Port& Builder::ReorgYoloLayer::getInputPort() const { - return getLayer().getInputPorts()[0]; + return getLayer()->getInputPorts()[0]; } Builder::ReorgYoloLayer& Builder::ReorgYoloLayer::setInputPort(const Port& port) { - getLayer().getInputPorts()[0] = port; + getLayer()->getInputPorts()[0] = port; return *this; } const Port& Builder::ReorgYoloLayer::getOutputPort() const { - return getLayer().getOutputPorts()[0]; + return getLayer()->getOutputPorts()[0]; } Builder::ReorgYoloLayer& Builder::ReorgYoloLayer::setOutputPort(const Port& port) { - getLayer().getOutputPorts()[0] = port; + getLayer()->getOutputPorts()[0] = port; return *this; } int Builder::ReorgYoloLayer::getStride() const { - return getLayer().getParameters()["stride"].asInt(); + return getLayer()->getParameters().at("stride"); } Builder::ReorgYoloLayer& Builder::ReorgYoloLayer::setStride(int stride) { - getLayer().getParameters()["stride"] = stride; + getLayer()->getParameters()["stride"] = stride; return *this; } +REG_CONVERTER_FOR(ReorgYolo, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["stride"] = cnnLayer->GetParamAsInt("stride", 0); +}); \ No newline at end of file diff --git a/inference-engine/src/inference_engine/builders/ie_resample_layer.cpp b/inference-engine/src/inference_engine/builders/ie_resample_layer.cpp new file mode 100644 index 0000000..ca2ddda --- /dev/null +++ b/inference-engine/src/inference_engine/builders/ie_resample_layer.cpp @@ -0,0 +1,95 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +using namespace InferenceEngine; + +Builder::ResampleLayer::ResampleLayer(const std::string& name): LayerDecorator("Resample", name) { + getLayer()->getInputPorts().resize(1); + getLayer()->getOutputPorts().resize(1); +} + +Builder::ResampleLayer::ResampleLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("Resample"); +} + +Builder::ResampleLayer::ResampleLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("Resample"); +} + +Builder::ResampleLayer& Builder::ResampleLayer::setName(const std::string& name) { + getLayer()->setName(name); + return *this; +} +const Port& Builder::ResampleLayer::getInputPort() const { + return getLayer()->getInputPorts()[0]; +} +Builder::ResampleLayer& Builder::ResampleLayer::setInputPort(const Port& port) { + getLayer()->getInputPorts()[0] = port; + return *this; 
+} +const Port& Builder::ResampleLayer::getOutputPort() const { + return getLayer()->getOutputPorts()[0]; +} +Builder::ResampleLayer& Builder::ResampleLayer::setOutputPort(const Port& port) { + getLayer()->getOutputPorts()[0] = port; + return *this; +} + +const std::string &Builder::ResampleLayer::getResampleType() const { + return getLayer()->getParameters().at("type"); +} + +Builder::ResampleLayer &Builder::ResampleLayer::setResampleType(const std::string &type) { + getLayer()->getParameters()["type"] = type; + return *this; +} + +bool Builder::ResampleLayer::getAntialias() const { + return getLayer()->getParameters().at("antialias"); +} + +Builder::ResampleLayer &Builder::ResampleLayer::setAntialias(bool antialias) { + getLayer()->getParameters()["antialias"] = antialias; + return *this; +} + +float Builder::ResampleLayer::getFactor() const { + return getLayer()->getParameters().at("factor"); +} + +Builder::ResampleLayer &Builder::ResampleLayer::setFactor(float factor) { + getLayer()->getParameters()["factor"] = factor; + return *this; +} + +size_t Builder::ResampleLayer::getWidth() const { + return getLayer()->getParameters().at("width"); +} + +Builder::ResampleLayer &Builder::ResampleLayer::setWidth(size_t width) { + getLayer()->getParameters()["width"] = width; + return *this; +} + +size_t Builder::ResampleLayer::getHeight() const { + return getLayer()->getParameters().at("height"); +} + +Builder::ResampleLayer &Builder::ResampleLayer::setHeight(size_t height) { + getLayer()->getParameters()["height"] = height; + return *this; +} + +REG_CONVERTER_FOR(Resample, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["height"] = static_cast(cnnLayer->GetParamAsUInt("height", 0)); + layer.getParameters()["width"] = static_cast(cnnLayer->GetParamAsUInt("width", 0)); + layer.getParameters()["factor"] = cnnLayer->GetParamAsFloat("factor", 0); + layer.getParameters()["antialias"] = cnnLayer->GetParamAsBool("antialias", false); + layer.getParameters()["type"] = cnnLayer->GetParamAsString("type"); +}); \ No newline at end of file diff --git a/inference-engine/src/inference_engine/builders/ie_reshape_layer.cpp b/inference-engine/src/inference_engine/builders/ie_reshape_layer.cpp index 9f6c1f9..e72f2fe 100644 --- a/inference-engine/src/inference_engine/builders/ie_reshape_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_reshape_layer.cpp @@ -1,54 +1,65 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include
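The getters in these hunks switch from the old accessor style (getParameters()["key"].asInt() and friends) to getParameters().at("key"). The behavioral point, shown with a plain std::map stand-in rather than the IE Parameter type (which converts implicitly):

    // Stand-in for the new accessor discipline; std::map used for brevity.
    #include <iostream>
    #include <map>
    #include <stdexcept>
    #include <string>

    int main() {
        std::map<std::string, int> params;
        params["factor"] = 2;                      // setters keep operator[]
        std::cout << params.at("factor") << "\n";  // getters use at(): prints 2
        try {
            params.at("missing");                  // unlike operator[], at()
        } catch (const std::out_of_range&) {       // reports an absent key
            std::cout << "no such parameter\n";    // instead of inserting one
        }
        return 0;
    }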
+#include #include #include using namespace InferenceEngine; -Builder::ReshapeLayer::ReshapeLayer(const std::string& name): LayerFragment("Reshape", name) { - getLayer().getOutputPorts().resize(1); - getLayer().getInputPorts().resize(1); +Builder::ReshapeLayer::ReshapeLayer(const std::string& name): LayerDecorator("Reshape", name) { + getLayer()->getOutputPorts().resize(1); + getLayer()->getInputPorts().resize(1); } -Builder::ReshapeLayer::ReshapeLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "Reshape")) - THROW_IE_EXCEPTION << "Cannot create ReshapeLayer decorator for layer " << getLayer().getType(); +Builder::ReshapeLayer::ReshapeLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("Reshape"); +} + +Builder::ReshapeLayer::ReshapeLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("Reshape"); } Builder::ReshapeLayer& Builder::ReshapeLayer::setName(const std::string& name) { - getLayer().getName() = name; + getLayer()->setName(name); return *this; } const Port& Builder::ReshapeLayer::getInputPort() const { - return getLayer().getInputPorts()[0]; + return getLayer()->getInputPorts()[0]; } Builder::ReshapeLayer& Builder::ReshapeLayer::setInputPort(const Port &port) { - getLayer().getInputPorts()[0] = port; + getLayer()->getInputPorts()[0] = port; return *this; } const Port& Builder::ReshapeLayer::getOutputPort() const { - return getLayer().getOutputPorts()[0]; + return getLayer()->getOutputPorts()[0]; } Builder::ReshapeLayer& Builder::ReshapeLayer::setOutputPort(const Port &port) { - getLayer().getOutputPorts()[0] = port; + getLayer()->getOutputPorts()[0] = port; return *this; } const std::vector Builder::ReshapeLayer::getDims() const { - return getLayer().getParameters()["dim"].asInts(); + return getLayer()->getParameters().at("dim"); } Builder::ReshapeLayer& Builder::ReshapeLayer::setDims(const std::vector& dims) { - getLayer().getParameters()["dim"] = dims; + getLayer()->getParameters()["dim"] = dims; return *this; } +REG_CONVERTER_FOR(Flatten, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["axis"] = static_cast(cnnLayer->GetParamAsUInt("axis", 0)); + layer.getParameters()["dim"] = cnnLayer->GetParamAsInts("dim", {}); +}); +REG_CONVERTER_FOR(Reshape, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["axis"] = static_cast(cnnLayer->GetParamAsUInt("axis", 0)); + layer.getParameters()["dim"] = cnnLayer->GetParamAsInts("dim", {}); +}); diff --git a/inference-engine/src/inference_engine/builders/ie_rnn_sequence_layer.cpp b/inference-engine/src/inference_engine/builders/ie_rnn_sequence_layer.cpp new file mode 100644 index 0000000..9382a94 --- /dev/null +++ b/inference-engine/src/inference_engine/builders/ie_rnn_sequence_layer.cpp @@ -0,0 +1,100 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +#include +#include + +using namespace InferenceEngine; + +Builder::RNNSequenceLayer::RNNSequenceLayer(const std::string& name): LayerDecorator("RNNSequence", name) { + getLayer()->getOutputPorts().resize(2); + getLayer()->getInputPorts().resize(5); + getLayer()->getInputPorts()[1].setParameter("type", "weights"); + getLayer()->getInputPorts()[2].setParameter("type", "biases"); + getLayer()->getInputPorts()[3].setParameter("type", "optional"); +} + +Builder::RNNSequenceLayer::RNNSequenceLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("RNNSequence"); +} + 
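The Flatten/Reshape converters above read IR attributes with fallbacks (GetParamAsUInt("axis", 0), GetParamAsInts("dim", {})). A generic sketch of that lookup-with-default idiom; the helper below is hypothetical, not the IE implementation:

    // Hypothetical analogue of GetParamAsUInt(name, default): fall back to a
    // default when the attribute is absent from the layer's parameter strings.
    #include <map>
    #include <string>

    unsigned getParamAsUInt(const std::map<std::string, std::string>& params,
                            const std::string& name, unsigned def) {
        auto it = params.find(name);
        return it == params.end()
                   ? def
                   : static_cast<unsigned>(std::stoul(it->second));
    }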
+Builder::RNNSequenceLayer::RNNSequenceLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("RNNSequence"); +} + +Builder::RNNSequenceLayer& Builder::RNNSequenceLayer::setName(const std::string& name) { + getLayer()->setName(name); + return *this; +} + +const std::vector& Builder::RNNSequenceLayer::getInputPorts() const { + return getLayer()->getInputPorts(); +} + +Builder::RNNSequenceLayer& Builder::RNNSequenceLayer::setInputPorts(const std::vector& ports) { + getLayer()->getInputPorts() = ports; + return *this; +} + +const std::vector& Builder::RNNSequenceLayer::getOutputPorts() const { + return getLayer()->getOutputPorts(); +} + +Builder::RNNSequenceLayer& Builder::RNNSequenceLayer::setOutputPorts(const std::vector& ports) { + getLayer()->getOutputPorts() = ports; + return *this; +} +int Builder::RNNSequenceLayer::getHiddenSize() const { + return getLayer()->getParameters().at("hidden_size"); +} +Builder::RNNSequenceLayer& Builder::RNNSequenceLayer::setHiddenSize(int size) { + getLayer()->getParameters()["hidden_size"] = size; + return *this; +} +bool Builder::RNNSequenceLayer::getSequenceDim() const { + return getLayer()->getParameters().at("sequence_dim"); +} +Builder::RNNSequenceLayer& Builder::RNNSequenceLayer::setSequenceDim(bool flag) { + getLayer()->getParameters()["sequence_dim"] = flag; + return *this; +} +const std::vector& Builder::RNNSequenceLayer::getActivations() const { + return getLayer()->getParameters().at("activations"); +} +Builder::RNNSequenceLayer& Builder::RNNSequenceLayer::setActivations(const std::vector& activations) { + getLayer()->getParameters()["activations"] = activations; + return *this; +} +const std::vector& Builder::RNNSequenceLayer::getActivationsAlpha() const { + return getLayer()->getParameters().at("activations_alpha"); +} +Builder::RNNSequenceLayer& Builder::RNNSequenceLayer::setActivationsAlpha(const std::vector& activations) { + getLayer()->getParameters()["activations_alpha"] = activations; + return *this; +} +const std::vector& Builder::RNNSequenceLayer::getActivationsBeta() const { + return getLayer()->getParameters().at("activations_beta"); +} +Builder::RNNSequenceLayer& Builder::RNNSequenceLayer::setActivationsBeta(const std::vector& activations) { + getLayer()->getParameters()["activations_beta"] = activations; + return *this; +} +REG_CONVERTER_FOR(RNNSequence, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["hidden_size"] = cnnLayer->GetParamAsInt("hidden_size"); + layer.getParameters()["sequence_dim"] = cnnLayer->GetParamAsBool("sequence_dim", true); + std::vector activations; + std::istringstream stream(cnnLayer->GetParamAsString("activations")); + std::string str; + while (getline(stream, str, ',')) { + activations.push_back(str); + } + layer.getParameters()["activations"] = activations; + layer.getParameters()["activations_alpha"] = cnnLayer->GetParamAsFloats("activations_alpha"); + layer.getParameters()["activations_beta"] = cnnLayer->GetParamAsFloats("activations_beta"); +}); + + diff --git a/inference-engine/src/inference_engine/builders/ie_roi_pooling_layer.cpp b/inference-engine/src/inference_engine/builders/ie_roi_pooling_layer.cpp index bd1cf4f..ad0963c 100644 --- a/inference-engine/src/inference_engine/builders/ie_roi_pooling_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_roi_pooling_layer.cpp @@ -1,58 +1,68 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include
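The RNNSequence converter above splits the comma-separated "activations" attribute with std::istringstream and getline. The same split, standalone and runnable:

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    int main() {
        std::vector<std::string> activations;
        std::istringstream stream("sigmoid,tanh");  // e.g. an IR attribute value
        std::string str;
        while (std::getline(stream, str, ',')) {
            activations.push_back(str);
        }
        for (const auto& a : activations) std::cout << a << "\n";
        return 0;
    }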
+#include #include #include using namespace InferenceEngine; -Builder::ROIPoolingLayer::ROIPoolingLayer(const std::string& name): LayerFragment("ROIPooling", name) { - getLayer().getOutputPorts().resize(1); +Builder::ROIPoolingLayer::ROIPoolingLayer(const std::string& name): LayerDecorator("ROIPooling", name) { + getLayer()->getOutputPorts().resize(1); setPooled({0, 0}); } -Builder::ROIPoolingLayer::ROIPoolingLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "ROIPooling")) - THROW_IE_EXCEPTION << "Cannot create ROIPoolingLayer decorator for layer " << getLayer().getType(); +Builder::ROIPoolingLayer::ROIPoolingLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("ROIPooling"); +} + +Builder::ROIPoolingLayer::ROIPoolingLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("ROIPooling"); } Builder::ROIPoolingLayer& Builder::ROIPoolingLayer::setName(const std::string& name) { - getLayer().getName() = name; + getLayer()->setName(name); return *this; } const std::vector& Builder::ROIPoolingLayer::getInputPorts() const { - return getLayer().getInputPorts(); + return getLayer()->getInputPorts(); } Builder::ROIPoolingLayer& Builder::ROIPoolingLayer::setInputPorts(const std::vector& ports) { if (ports.size() != 2) THROW_IE_EXCEPTION << "ROIPoolingLayer should have 2 inputs!"; - getLayer().getInputPorts() = ports; + getLayer()->getInputPorts() = ports; return *this; } const Port& Builder::ROIPoolingLayer::getOutputPort() const { - return getLayer().getOutputPorts()[0]; + return getLayer()->getOutputPorts()[0]; } Builder::ROIPoolingLayer& Builder::ROIPoolingLayer::setOutputPort(const Port& port) { - getLayer().getOutputPorts()[0] = port; + getLayer()->getOutputPorts()[0] = port; return *this; } float Builder::ROIPoolingLayer::getSpatialScale() const { - return getLayer().getParameters()["spatial_scale"].asFloat(); + return getLayer()->getParameters().at("spatial_scale"); } Builder::ROIPoolingLayer& Builder::ROIPoolingLayer::setSpatialScale(float spatialScale) { - getLayer().getParameters()["spatial_scale"] = spatialScale; + getLayer()->getParameters()["spatial_scale"] = spatialScale; return *this; } const std::vector Builder::ROIPoolingLayer::getPooled() const { - return {getLayer().getParameters()["pooled_h"].asInt(0), getLayer().getParameters()["pooled_w"].asInt(0)}; + return {getLayer()->getParameters().at("pooled_h"), + getLayer()->getParameters().at("pooled_w")}; } Builder::ROIPoolingLayer& Builder::ROIPoolingLayer::setPooled(const std::vector& pooled) { if (pooled.size() != 2) THROW_IE_EXCEPTION << "ROIPoolingLayer supports only pooled for height and width dimensions"; - getLayer().getParameters()["pooled_h"] = pooled[0]; - getLayer().getParameters()["pooled_w"] = pooled[1]; + getLayer()->getParameters()["pooled_h"] = pooled[0]; + getLayer()->getParameters()["pooled_w"] = pooled[1]; return *this; } + +REG_CONVERTER_FOR(ROIPooling, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["pooled_h"] = cnnLayer->GetParamAsInt("pooled_h", 0); + layer.getParameters()["pooled_w"] = cnnLayer->GetParamAsInt("pooled_w", 0); + layer.getParameters()["spatial_scale"] = cnnLayer->GetParamAsFloat("spatial_scale"); +}); \ No newline at end of file diff --git a/inference-engine/src/inference_engine/builders/ie_scale_shift_layer.cpp b/inference-engine/src/inference_engine/builders/ie_scale_shift_layer.cpp index 534959b..95ec737 100644 --- 
a/inference-engine/src/inference_engine/builders/ie_scale_shift_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_scale_shift_layer.cpp @@ -1,44 +1,39 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include
#include using namespace InferenceEngine; -Builder::ScaleShiftLayer::ScaleShiftLayer(const std::string& name): LayerFragment("ScaleShift", name) { - getLayer().getOutputPorts().resize(1); - getLayer().getInputPorts().resize(1); +Builder::ScaleShiftLayer::ScaleShiftLayer(const std::string& name): LayerDecorator("ScaleShift", name) { + getLayer()->getInputPorts().resize(3); + getLayer()->getInputPorts()[1].setParameter("type", "weights"); + getLayer()->getInputPorts()[2].setParameter("type", "biases"); + getLayer()->getOutputPorts().resize(1); } -Builder::ScaleShiftLayer::ScaleShiftLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "ScaleShift")) - THROW_IE_EXCEPTION << "Cannot create ScaleShiftLayer decorator for layer " << getLayer().getType(); +Builder::ScaleShiftLayer::ScaleShiftLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("ScaleShift"); +} + +Builder::ScaleShiftLayer::ScaleShiftLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("ScaleShift"); } Builder::ScaleShiftLayer& Builder::ScaleShiftLayer::setName(const std::string& name) { - getLayer().getName() = name; + getLayer()->setName(name); return *this; } const Port& Builder::ScaleShiftLayer::getPort() const { - return getLayer().getOutputPorts()[0]; + return getLayer()->getOutputPorts()[0]; } Builder::ScaleShiftLayer& Builder::ScaleShiftLayer::setPort(const Port &port) { - getLayer().getOutputPorts()[0] = port; - getLayer().getInputPorts()[0] = port; - return *this; -} - -Builder::ScaleShiftLayer& Builder::ScaleShiftLayer::setWeights(const Blob::CPtr& weights) { - getLayer().addConstantData("weights", weights); - return *this; -} -Builder::ScaleShiftLayer& Builder::ScaleShiftLayer::setBiases(const Blob::CPtr& biases) { - getLayer().addConstantData("biases", biases); + getLayer()->getOutputPorts()[0] = port; + getLayer()->getInputPorts()[0] = port; return *this; -} +} \ No newline at end of file diff --git a/inference-engine/src/inference_engine/builders/ie_sigmoid_layer.cpp b/inference-engine/src/inference_engine/builders/ie_sigmoid_layer.cpp index 72ccc80..265ad37 100644 --- a/inference-engine/src/inference_engine/builders/ie_sigmoid_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_sigmoid_layer.cpp @@ -1,35 +1,37 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include
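Note the design change in the ScaleShift hunk above: setWeights/setBiases (backed by addConstantData) are removed, and constants now travel as input ports 1 and 2 tagged via setParameter("type", ...). A toy model of that tagging convention, with toy types rather than the IE Port class:

    #include <map>
    #include <string>
    #include <vector>

    struct Port { std::map<std::string, std::string> params; };

    int main() {
        std::vector<Port> inputs(3);           // 0: data, 1: weights, 2: biases
        inputs[1].params["type"] = "weights";  // mirrors setParameter() above
        inputs[2].params["type"] = "biases";
        return 0;
    }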
#include using namespace InferenceEngine; -Builder::SigmoidLayer::SigmoidLayer(const std::string& name): LayerFragment("Sigmoid", name) { - getLayer().getOutputPorts().resize(1); - getLayer().getInputPorts().resize(1); +Builder::SigmoidLayer::SigmoidLayer(const std::string& name): LayerDecorator("Sigmoid", name) { + getLayer()->getOutputPorts().resize(1); + getLayer()->getInputPorts().resize(1); } -Builder::SigmoidLayer::SigmoidLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "Sigmoid")) - THROW_IE_EXCEPTION << "Cannot create SigmoidLayer decorator for layer " << getLayer().getType(); +Builder::SigmoidLayer::SigmoidLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("Sigmoid"); +} + +Builder::SigmoidLayer::SigmoidLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("Sigmoid"); } Builder::SigmoidLayer& Builder::SigmoidLayer::setName(const std::string& name) { - getLayer().getName() = name; + getLayer()->setName(name); return *this; } const Port& Builder::SigmoidLayer::getPort() const { - return getLayer().getOutputPorts()[0]; + return getLayer()->getOutputPorts()[0]; } Builder::SigmoidLayer& Builder::SigmoidLayer::setPort(const Port &port) { - getLayer().getOutputPorts()[0] = port; - getLayer().getInputPorts()[0] = port; + getLayer()->getOutputPorts()[0] = port; + getLayer()->getInputPorts()[0] = port; return *this; } diff --git a/inference-engine/src/inference_engine/builders/ie_simpler_nms_layer.cpp b/inference-engine/src/inference_engine/builders/ie_simpler_nms_layer.cpp index 1fc3e07..5e33313 100644 --- a/inference-engine/src/inference_engine/builders/ie_simpler_nms_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_simpler_nms_layer.cpp @@ -1,89 +1,102 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include
+#include #include #include using namespace InferenceEngine; -Builder::SimplerNMSLayer::SimplerNMSLayer(const std::string& name): LayerFragment("SimplerNMS", name) { - getLayer().getOutputPorts().resize(1); +Builder::SimplerNMSLayer::SimplerNMSLayer(const std::string& name): LayerDecorator("SimplerNMS", name) { + getLayer()->getOutputPorts().resize(1); } -Builder::SimplerNMSLayer::SimplerNMSLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "SimplerNMS")) - THROW_IE_EXCEPTION << "Cannot create SimplerNMSLayer decorator for layer " << getLayer().getType(); +Builder::SimplerNMSLayer::SimplerNMSLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("SimplerNMS"); +} + +Builder::SimplerNMSLayer::SimplerNMSLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("SimplerNMS"); } Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setName(const std::string& name) { - getLayer().getName() = name; + getLayer()->setName(name); return *this; } const std::vector& Builder::SimplerNMSLayer::getInputPorts() const { - return getLayer().getInputPorts(); + return getLayer()->getInputPorts(); } Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setInputPorts(const std::vector& ports) { - getLayer().getInputPorts() = ports; + getLayer()->getInputPorts() = ports; return *this; } const Port& Builder::SimplerNMSLayer::getOutputPort() const { - return getLayer().getOutputPorts()[0]; + return getLayer()->getOutputPorts()[0]; } Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setOutputPort(const Port& port) { - getLayer().getOutputPorts()[0] = port; + getLayer()->getOutputPorts()[0] = port; return *this; } size_t Builder::SimplerNMSLayer::getPreNMSTopN() const { - return getLayer().getParameters()["pre_nms_topn"].asUInt(); + return getLayer()->getParameters().at("pre_nms_topn"); } Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setPreNMSTopN(size_t topN) { - getLayer().getParameters()["pre_nms_topn"] = topN; + getLayer()->getParameters()["pre_nms_topn"] = topN; return *this; } size_t Builder::SimplerNMSLayer::getPostNMSTopN() const { - return getLayer().getParameters()["post_nms_topn"].asUInt(); + return getLayer()->getParameters().at("post_nms_topn"); } Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setPostNMSTopN(size_t topN) { - getLayer().getParameters()["post_nms_topn"] = topN; + getLayer()->getParameters()["post_nms_topn"] = topN; return *this; } size_t Builder::SimplerNMSLayer::getFeatStride() const { - return getLayer().getParameters()["feat_stride"].asUInt(); + return getLayer()->getParameters().at("feat_stride"); } Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setFeatStride(size_t featStride) { - getLayer().getParameters()["feat_stride"] = featStride; + getLayer()->getParameters()["feat_stride"] = featStride; return *this; } size_t Builder::SimplerNMSLayer::getMinBoxSize() const { - return getLayer().getParameters()["min_bbox_size"].asUInt(); + return getLayer()->getParameters().at("min_bbox_size"); } Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setMinBoxSize(size_t minSize) { - getLayer().getParameters()["min_bbox_size"] = minSize; + getLayer()->getParameters()["min_bbox_size"] = minSize; return *this; } size_t Builder::SimplerNMSLayer::getScale() const { - return getLayer().getParameters()["scale"].asUInt(); + return getLayer()->getParameters().at("scale"); } Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setScale(size_t scale) { - getLayer().getParameters()["scale"] = scale; + 
getLayer()->getParameters()["scale"] = scale; return *this; } float Builder::SimplerNMSLayer::getCLSThreshold() const { - return getLayer().getParameters()["cls_threshold"].asFloat(); + return getLayer()->getParameters().at("cls_threshold"); } Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setCLSThreshold(float threshold) { - getLayer().getParameters()["cls_threshold"] = threshold; + getLayer()->getParameters()["cls_threshold"] = threshold; return *this; } float Builder::SimplerNMSLayer::getIOUThreshold() const { - return getLayer().getParameters()["iou_threshold"].asFloat(); + return getLayer()->getParameters().at("iou_threshold"); } Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setIOUThreshold(float threshold) { - getLayer().getParameters()["iou_threshold"] = threshold; + getLayer()->getParameters()["iou_threshold"] = threshold; return *this; } + +REG_CONVERTER_FOR(SimplerNMS, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["iou_threshold"] = cnnLayer->GetParamAsFloat("iou_threshold"); + layer.getParameters()["cls_threshold"] = cnnLayer->GetParamAsFloat("cls_threshold"); + layer.getParameters()["scale"] = static_cast(cnnLayer->GetParamAsUInt("scale")); + layer.getParameters()["min_bbox_size"] = static_cast(cnnLayer->GetParamAsUInt("min_bbox_size")); + layer.getParameters()["feat_stride"] = static_cast(cnnLayer->GetParamAsUInt("feat_stride")); + layer.getParameters()["pre_nms_topn"] = static_cast(cnnLayer->GetParamAsUInt("pre_nms_topn")); + layer.getParameters()["post_nms_topn"] = static_cast(cnnLayer->GetParamAsUInt("post_nms_topn")); +}); \ No newline at end of file diff --git a/inference-engine/src/inference_engine/builders/ie_softmax_layer.cpp b/inference-engine/src/inference_engine/builders/ie_softmax_layer.cpp index d4ccfa9..32cde38 100644 --- a/inference-engine/src/inference_engine/builders/ie_softmax_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_softmax_layer.cpp @@ -1,45 +1,52 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include
+#include #include using namespace InferenceEngine; -Builder::SoftMaxLayer::SoftMaxLayer(const std::string& name): LayerFragment("SoftMax", name) { - getLayer().getOutputPorts().resize(1); - getLayer().getInputPorts().resize(1); +Builder::SoftMaxLayer::SoftMaxLayer(const std::string& name): LayerDecorator("SoftMax", name) { + getLayer()->getOutputPorts().resize(1); + getLayer()->getInputPorts().resize(1); setAxis(1); } -Builder::SoftMaxLayer::SoftMaxLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "SoftMax")) - THROW_IE_EXCEPTION << "Cannot create SoftMaxLayer decorator for layer " << getLayer().getType(); +Builder::SoftMaxLayer::SoftMaxLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("SoftMax"); +} + +Builder::SoftMaxLayer::SoftMaxLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("SoftMax"); } Builder::SoftMaxLayer& Builder::SoftMaxLayer::setName(const std::string& name) { - getLayer().getName() = name; + getLayer()->setName(name); return *this; } const Port& Builder::SoftMaxLayer::getPort() const { - return getLayer().getOutputPorts()[0]; + return getLayer()->getOutputPorts()[0]; } Builder::SoftMaxLayer& Builder::SoftMaxLayer::setPort(const Port &port) { - getLayer().getOutputPorts()[0] = port; - getLayer().getInputPorts()[0] = port; + getLayer()->getOutputPorts()[0] = port; + getLayer()->getInputPorts()[0] = port; return *this; } size_t Builder::SoftMaxLayer::getAxis() const { - return getLayer().getParameters()["axis"].asUInt(); + return getLayer()->getParameters().at("axis"); } Builder::SoftMaxLayer& Builder::SoftMaxLayer::setAxis(size_t axis) { - getLayer().getParameters()["axis"] = axis; + getLayer()->getParameters()["axis"] = axis; return *this; } + +REG_CONVERTER_FOR(SoftMax, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["axis"] = static_cast(cnnLayer->GetParamAsUInt("axis", 1)); +}); \ No newline at end of file diff --git a/inference-engine/src/inference_engine/builders/ie_split_layer.cpp b/inference-engine/src/inference_engine/builders/ie_split_layer.cpp index 50d04dd..7c8185c 100644 --- a/inference-engine/src/inference_engine/builders/ie_split_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_split_layer.cpp @@ -1,53 +1,60 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include
+#include #include #include using namespace InferenceEngine; -Builder::SplitLayer::SplitLayer(const std::string& name): LayerFragment("Concat", name) { - getLayer().getInputPorts().resize(1); +Builder::SplitLayer::SplitLayer(const std::string& name): LayerDecorator("Split", name) { + getLayer()->getInputPorts().resize(1); setAxis(1); } -Builder::SplitLayer::SplitLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "Concat")) - THROW_IE_EXCEPTION << "Cannot create SplitLayer decorator for layer " << getLayer().getType(); +Builder::SplitLayer::SplitLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("Split"); +} + +Builder::SplitLayer::SplitLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("Split"); } Builder::SplitLayer& Builder::SplitLayer::setName(const std::string& name) { - getLayer().getName() = name; + getLayer()->setName(name); return *this; } const Port& Builder::SplitLayer::getInputPort() const { - return getLayer().getInputPorts()[0]; + return getLayer()->getInputPorts()[0]; } Builder::SplitLayer& Builder::SplitLayer::setInputPort(const Port &port) { - getLayer().getInputPorts()[0] = port; + getLayer()->getInputPorts()[0] = port; return *this; } const std::vector& Builder::SplitLayer::getOutputPorts() const { - return getLayer().getOutputPorts(); + return getLayer()->getOutputPorts(); } Builder::SplitLayer& Builder::SplitLayer::setOutputPorts(const std::vector& ports) { - getLayer().getOutputPorts() = ports; + getLayer()->getOutputPorts() = ports; return *this; } size_t Builder::SplitLayer::getAxis() const { - return getLayer().getParameters()["axis"].asUInt(); + return getLayer()->getParameters().at("axis"); } Builder::SplitLayer& Builder::SplitLayer::setAxis(size_t axis) { - getLayer().getParameters()["axis"] = axis; + getLayer()->getParameters()["axis"] = axis; return *this; } + +REG_CONVERTER_FOR(Split, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["axis"] = static_cast(cnnLayer->GetParamAsUInt("axis", 1)); +}); \ No newline at end of file diff --git a/inference-engine/src/inference_engine/builders/ie_tanh_layer.cpp b/inference-engine/src/inference_engine/builders/ie_tanh_layer.cpp index 37eb7eb..eeb0503 100644 --- a/inference-engine/src/inference_engine/builders/ie_tanh_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_tanh_layer.cpp @@ -1,35 +1,47 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include
#include using namespace InferenceEngine; -Builder::TanHLayer::TanHLayer(const std::string& name): LayerFragment("TanH", name) { - getLayer().getOutputPorts().resize(1); - getLayer().getInputPorts().resize(1); +Builder::TanHLayer::TanHLayer(const std::string& name): LayerDecorator("TanH", name) { + getLayer()->getOutputPorts().resize(1); + getLayer()->getInputPorts().resize(1); } -Builder::TanHLayer::TanHLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "TanH")) - THROW_IE_EXCEPTION << "Cannot create TanHLayer decorator for layer " << getLayer().getType(); +Builder::TanHLayer::TanHLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("TanH"); +} + +Builder::TanHLayer::TanHLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("TanH"); } Builder::TanHLayer& Builder::TanHLayer::setName(const std::string& name) { - getLayer().getName() = name; + getLayer()->setName(name); return *this; } const Port& Builder::TanHLayer::getPort() const { - return getLayer().getOutputPorts()[0]; + return getLayer()->getOutputPorts()[0]; } Builder::TanHLayer& Builder::TanHLayer::setPort(const Port &port) { - getLayer().getOutputPorts()[0] = port; - getLayer().getInputPorts()[0] = port; + getLayer()->getOutputPorts()[0] = port; + getLayer()->getInputPorts()[0] = port; return *this; -} \ No newline at end of file +} + +REG_VALIDATOR_FOR(TanH, [] (const InferenceEngine::Builder::Layer::CPtr& input_layer, bool partial) { + if (!input_layer->getInputPorts().empty() && + !input_layer->getOutputPorts().empty() && + !input_layer->getInputPorts()[0].shape().empty() && + !input_layer->getOutputPorts()[0].shape().empty() && + input_layer->getInputPorts()[0].shape() != input_layer->getOutputPorts()[0].shape()) { + THROW_IE_EXCEPTION << "Input and output ports should be equal"; + } +}); diff --git a/inference-engine/src/inference_engine/builders/ie_tile_layer.cpp b/inference-engine/src/inference_engine/builders/ie_tile_layer.cpp index fade9f3..125c530 100644 --- a/inference-engine/src/inference_engine/builders/ie_tile_layer.cpp +++ b/inference-engine/src/inference_engine/builders/ie_tile_layer.cpp @@ -1,62 +1,70 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include -#include
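The TanH hunk registers the same shape validator as ReLU earlier in this patch. Distilled, the rule both lambdas implement is: once input and output shapes are both known, they must match exactly:

    #include <vector>

    // Returns true when the ports are still acceptable: either shape is not
    // inferred yet (empty), or the two shapes agree exactly.
    bool shapesOk(const std::vector<size_t>& in, const std::vector<size_t>& out) {
        if (in.empty() || out.empty()) return true;
        return in == out;
    }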
+#include #include #include using namespace InferenceEngine; -Builder::TileLayer::TileLayer(const std::string& name): LayerFragment("Tile", name) { - getLayer().getOutputPorts().resize(1); - getLayer().getInputPorts().resize(1); +Builder::TileLayer::TileLayer(const std::string& name): LayerDecorator("Tile", name) { + getLayer()->getOutputPorts().resize(1); + getLayer()->getInputPorts().resize(1); } -Builder::TileLayer::TileLayer(Layer& genLayer): LayerFragment(genLayer) { - if (!details::CaselessEq()(getLayer().getType(), "Tile")) - THROW_IE_EXCEPTION << "Cannot create TileLayer decorator for layer " << getLayer().getType(); +Builder::TileLayer::TileLayer(const Layer::Ptr& layer): LayerDecorator(layer) { + checkType("Tile"); +} + +Builder::TileLayer::TileLayer(const Layer::CPtr& layer): LayerDecorator(layer) { + checkType("Tile"); } Builder::TileLayer& Builder::TileLayer::setName(const std::string& name) { - getLayer().getName() = name; + getLayer()->setName(name); return *this; } const Port& Builder::TileLayer::getInputPort() const { - return getLayer().getInputPorts()[0]; + return getLayer()->getInputPorts()[0]; } Builder::TileLayer& Builder::TileLayer::setInputPort(const Port &port) { - getLayer().getInputPorts()[0] = port; + getLayer()->getInputPorts()[0] = port; return *this; } const Port& Builder::TileLayer::getOutputPort() const { - return getLayer().getOutputPorts()[0]; + return getLayer()->getOutputPorts()[0]; } Builder::TileLayer& Builder::TileLayer::setOutputPort(const Port &port) { - getLayer().getOutputPorts()[0] = port; + getLayer()->getOutputPorts()[0] = port; return *this; } size_t Builder::TileLayer::getTiles() const { - return getLayer().getParameters()["tiles"].asUInt(); + return getLayer()->getParameters().at("tiles"); } Builder::TileLayer& Builder::TileLayer::setTiles(size_t tiles) { - getLayer().getParameters()["tiles"] = tiles; + getLayer()->getParameters()["tiles"] = tiles; return *this; } size_t Builder::TileLayer::getAxis() const { - return getLayer().getParameters()["axis"].asUInt(); + return getLayer()->getParameters().at("axis"); } Builder::TileLayer& Builder::TileLayer::setAxis(size_t axis) { - getLayer().getParameters()["axis"] = axis; + getLayer()->getParameters()["axis"] = axis; return *this; -} \ No newline at end of file +} + +REG_CONVERTER_FOR(Tile, [](const CNNLayerPtr& cnnLayer, Builder::Layer& layer) { + layer.getParameters()["axis"] = static_cast(cnnLayer->GetParamAsUInt("axis")); + layer.getParameters()["tiles"] = static_cast(cnnLayer->GetParamAsUInt("tiles")); +}); \ No newline at end of file diff --git a/inference-engine/src/inference_engine/cnn_network_impl.cpp b/inference-engine/src/inference_engine/cnn_network_impl.cpp index 620fe34..2918da1 100644 --- a/inference-engine/src/inference_engine/cnn_network_impl.cpp +++ b/inference-engine/src/inference_engine/cnn_network_impl.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -22,6 +22,14 @@ using namespace InferenceEngine::details; CNNNetworkImpl::CNNNetworkImpl(): _targetDevice(TargetDevice::eDefault), _stats(new CNNNetworkStatsImpl()) { } +CNNNetworkImpl::~CNNNetworkImpl() { + for (auto& data : _data) { + for (auto& input : data.second->getInputTo()) { + input.second.reset(); + } + } +} + void CNNNetworkImpl::getOutputsInfo(std::map& out) const noexcept { out = _outputData; } @@ -34,6 +42,16 @@ void CNNNetworkImpl::addLayer(const CNNLayerPtr& layer) noexcept { _layers[layer->name] = layer; }
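The new ~CNNNetworkImpl above exists to break shared_ptr ownership cycles: Data objects hold shared_ptrs to their consumer layers in inputTo, while layers hold shared_ptrs back to their Data, so neither refcount can reach zero on its own. A minimal analogue of the cycle and the manual break, with toy types rather than the IE classes:

    #include <map>
    #include <memory>
    #include <string>

    struct Layer;
    struct Data  { std::map<std::string, std::shared_ptr<Layer>> inputTo; };
    struct Layer { std::shared_ptr<Data> out; };

    int main() {
        auto data  = std::make_shared<Data>();
        auto layer = std::make_shared<Layer>();
        layer->out = data;
        data->inputTo["l"] = layer;  // cycle: refcounts can never hit zero
        for (auto& in : data->inputTo) in.second.reset();  // as the dtor does
        return 0;
    }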
+void CNNNetworkImpl::removeLayer(const string& layerName) { + auto it = _layers.find(layerName); + if (it != _layers.end()) { _layers.erase(it); } +} + +void CNNNetworkImpl::removeData(const string& dataName) { + auto it = _data.find(dataName); + if (it != _data.end()) { _data.erase(it); } +} + void CNNNetworkImpl::validate(int version) { if (version != 1) { std::set layerNames; diff --git a/inference-engine/src/inference_engine/cnn_network_impl.hpp b/inference-engine/src/inference_engine/cnn_network_impl.hpp index d2d9ae1..87ac2e5 100644 --- a/inference-engine/src/inference_engine/cnn_network_impl.hpp +++ b/inference-engine/src/inference_engine/cnn_network_impl.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -28,6 +28,7 @@ namespace details { class INFERENCE_ENGINE_API_CLASS(CNNNetworkImpl) : public ICNNNetwork { public: CNNNetworkImpl(); + ~CNNNetworkImpl() override; Precision getPrecision() const noexcept override { return precision; } @@ -52,6 +53,10 @@ public: _inputData[data->name()] = data; } + void removeInputInfo(const std::string& name) { + _inputData.erase(name); + } + void getName(char* pName, size_t len) const noexcept override { // Description buffer will preserve garbage if external pointer not initialized if (len < 1) return; @@ -85,6 +90,10 @@ public: void addLayer(const CNNLayerPtr& layer) noexcept override; + void removeLayer(const std::string& layerName); + + void removeData(const std::string& dataName); + StatusCode getLayerByName(const char* layerName, CNNLayerPtr& out, ResponseDesc* resp) const noexcept override; // deprecated, as there is no ResponseDesc to put error message diff --git a/inference-engine/src/inference_engine/cnn_network_int8_normalizer.cpp b/inference-engine/src/inference_engine/cnn_network_int8_normalizer.cpp index 58dd61f..435c24d 100644 --- a/inference-engine/src/inference_engine/cnn_network_int8_normalizer.cpp +++ b/inference-engine/src/inference_engine/cnn_network_int8_normalizer.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -44,13 +44,18 @@ CNNStatisticHelper::CNNStatisticHelper(CNNNetwork &network, const std::mapinsData) { + if (internalNodesStats_.find(i.lock()->creatorLayer.lock()->name) == internalNodesStats_.end()) { + return false; + } } - return false; + // verification if there is a statistic for output of the layer + if ((layer->outData.size() > 1) && (internalNodesStats_.find(layer->name) == internalNodesStats_.end())) { + return false; + } + return true; } void CNNStatisticHelper::copyStatistics(const std::string& srcName, const std::string& dstName) { @@ -75,13 +80,18 @@ InferenceEngine::Blob::Ptr CNNStatisticHelper::getInputScale(CNNLayer::Ptr layer std::string inputLayerName = previousLayer->name; // for case when we have the only average pooling before, we need to take this - // statistic from input of avg pooloing to compensate work of average pooling + // statistic from input of avg pooling to compensate work of average pooling // and to stay in int8 as much as we can if (previousLayer->type == "Pooling" && (previousLayer->precision == Precision::I8 || previousLayer->precision == Precision::U8)) { // take input name to the pooling inputLayerName = previousLayer->insData[0].lock()->creatorLayer.lock()->name; } size_t inputChannels = layer->insData[0].lock()->getTensorDesc().getDims()[1]; + if 
(getStatistic(previousLayer)->_minOutputs.size() != inputChannels + || getStatistic(previousLayer)->_maxOutputs.size() != inputChannels) { + THROW_IE_EXCEPTION << "min and max sizes should be equal to input channels count for " << previousLayer->name; + } + return calculateScaleFactor(inputChannels, getStatistic(previousLayer), hasNegativeOutput(previousLayer->name) ? maxSign_ : maxUnsign_); } @@ -90,8 +100,13 @@ InferenceEngine::Blob::Ptr CNNStatisticHelper::getOutputScale(CNNLayer::Ptr laye // TODO(amalyshe) for now we are looking to precision on the data node size_t outputChannels = layer->outData[0]->getTensorDesc().getDims()[1]; if (layer->outData.size() != 1) { - THROW_IE_EXCEPTION << "Trying to get scales after layer having multiple ouptut ports"; + THROW_IE_EXCEPTION << "Trying to get scales after layer having multiple output ports"; + } + if (getStatistic(layer)->_minOutputs.size() != outputChannels + || getStatistic(layer)->_maxOutputs.size() != outputChannels) { + THROW_IE_EXCEPTION << "min and max sizes should be equal to output channels count for " << layer->name; } + return calculateScaleFactor(outputChannels, getStatistic(layer), layer->outData[0]->getPrecision() == Precision::I8 ? maxSign_ : maxUnsign_); } @@ -139,7 +154,8 @@ NetworkNodeStatsPtr CNNStatisticHelper::getStatistic(CNNLayer::Ptr layer) const CNNLayer::Ptr CNNStatisticHelper::getLatestInFuse(CNNLayer::Ptr layer) const { if (layer->outData[0]->inputTo.size() == 1 && - CaselessEq()(layer->outData[0]->inputTo.begin()->second->type, "relu")) { + (CaselessEq()(layer->outData[0]->inputTo.begin()->second->type, "relu") || + CNNNetworkInt8Normalizer::isReLULikeClamp(layer->outData[0]->inputTo.begin()->second))) { return layer->outData[0]->inputTo.begin()->second; } // Conv-Sum-ReLU fuse @@ -164,14 +180,16 @@ CNNLayer::Ptr CNNStatisticHelper::getLatestInFuse(CNNLayer::Ptr layer) const { } else { // look to the ports of eltwise if (eltwise->insData[1].lock()->creatorLayer.lock() == layer && - CaselessEq()(eltwise->insData[0].lock()->creatorLayer.lock()->type, "convolution")) { + CaselessEq()(eltwise->insData[0].lock()->creatorLayer.lock()->type, "convolution") && + eltwise->insData[0].lock()->inputTo.size() == 1) { // this is a case when two convolutions come to eltwise, the second one will be selected for fuse, // first will be used as sum operator return layer; } // given layer is a convolution and will be used for fuse, but we need to verify if there is ReLU after eltwise if (eltwise->outData[0]->inputTo.size() == 1 && - CaselessEq()(eltwise->outData[0]->inputTo.begin()->second->type, "relu")) { + (CaselessEq()(eltwise->outData[0]->inputTo.begin()->second->type, "relu") || + CNNNetworkInt8Normalizer::isReLULikeClamp(eltwise->outData[0]->inputTo.begin()->second))) { return eltwise->outData[0]->inputTo.begin()->second; } return eltwise; @@ -202,6 +220,7 @@ void CNNStatisticHelper::NormalizeStatistic() { for (auto i : l->insData) { if (newMap.find(i.lock()->creatorLayer.lock()->name) == newMap.end()) { allInputsHaveStatistics = false; + break; } } // if we do not have statistic - verify who is consumer of this layer @@ -211,12 +230,18 @@ void CNNStatisticHelper::NormalizeStatistic() { if (CaselessEq()(it.second->type, "scaleshift") || CaselessEq()(it.second->type, "convolution")) { isStarterLayer = true; + break; } } } } else { isStarterLayer = true; } + if (CaselessEq()(l->type, "scaleshift") || + CaselessEq()(l->type, "convolution")) { + isStarterLayer = true; + } + if (!isStarterLayer) { continue; } @@ -230,8 +255,11 
@@ void CNNStatisticHelper::NormalizeStatistic() { bool perChannelScale = true; + if (CaselessEq()(l->type, "concat") - && l->outData.size() == 1 && l->outData[0]->getTensorDesc().getDims().size() == 4) { + && l->outData.size() == 1 + && l->outData[0]->getTensorDesc().getDims().size() == 4 + && allInputsHaveStatistics) { size_t concatLayerIdx = 0; for (int k = 0; k < l->insData.size(); k++) { auto prevKLayer = l->insData[k].lock()->creatorLayer.lock(); @@ -246,11 +274,28 @@ void CNNStatisticHelper::NormalizeStatistic() { THROW_IE_EXCEPTION << "We have incomplete statistic for predecessors of concat layer " << l->name; } } + } else if (CaselessEq()(l->type, "resample")) { + if (l->insData.size() == 1) { + CNNLayerPtr creator = l->insData[0].lock()->getCreatorLayer().lock(); + if (CaselessEq()(creator->type, "concat")) { + auto concatStat = newMap[creator->name]; + currentStat->_maxOutputs = concatStat->_maxOutputs; + currentStat->_minOutputs = concatStat->_minOutputs; + newMap[l->name] = currentStat; + } else { + auto itOld = internalNodesStats_.find(l->name); + if (itOld != internalNodesStats_.end()) { + currentStat->_maxOutputs = itOld->second->_maxOutputs; + currentStat->_minOutputs = itOld->second->_minOutputs; + newMap[l->name] = currentStat; + } + } + } } else { // go over all children until we get convoluition, scaleshift, eltwise or unknown layer // layers Pooling and ReLU are passthrough // to understand the granularity of the scaling - // layer concat is a lyer which produce statistics and waterfall it down + // layer concat is a layer which produce statistics and waterfall it down std::vector toAnalyze; for (auto it : l->outData[0]->inputTo) { toAnalyze.push_back(it.second); @@ -264,6 +309,7 @@ void CNNStatisticHelper::NormalizeStatistic() { toAnalyze.pop_back(); if (CaselessEq()(tl->type, "pooling") || CaselessEq()(tl->type, "relu") || + CNNNetworkInt8Normalizer::isReLULikeClamp(tl) || CaselessEq()(tl->type, "concat")) { if (tl->outData.size() == 1) { for (auto it : tl->outData[0]->inputTo) { @@ -282,37 +328,61 @@ void CNNStatisticHelper::NormalizeStatistic() { } auto itOld = internalNodesStats_.find(getLatestInFuse(l)->name); + if (itOld == internalNodesStats_.end()) { + itOld = internalNodesStats_.find(l->name); + } if (itOld != internalNodesStats_.end()) { - currentStat->_maxOutputs = itOld->second->_maxOutputs; - currentStat->_minOutputs = itOld->second->_minOutputs; - if (!perChannelScale) { - float min = FLT_MAX; - float max = FLT_MIN; + currentStat->_maxOutputs.resize(itOld->second->_maxOutputs.size()); if (!itOld->second->_maxOutputs.empty()) { + float max = FLT_MIN; DataStats::GetDataAbsMax(&itOld->second->_maxOutputs[0], itOld->second->_maxOutputs.size(), max); std::fill(currentStat->_maxOutputs.begin(), currentStat->_maxOutputs.end(), max); } + + currentStat->_minOutputs.resize(itOld->second->_minOutputs.size()); if (!itOld->second->_minOutputs.empty()) { + float min = FLT_MAX; DataStats::GetDataMinMax(&itOld->second->_minOutputs[0], itOld->second->_minOutputs.size(), min, dummy); std::fill(currentStat->_minOutputs.begin(), currentStat->_minOutputs.end(), min); } + } else { + currentStat->_maxOutputs = itOld->second->_maxOutputs; + currentStat->_minOutputs = itOld->second->_minOutputs; + } + } + + + if (l->outData.size() == 1) { + size_t outputChannels = l->outData[0]->getTensorDesc().getDims()[1]; + auto oldStat = internalNodesStats_.find(l->name); + if ((oldStat != internalNodesStats_.end()) && outputChannels > 1 && oldStat->second->_minOutputs.size() == 1) { + auto 
min = oldStat->second->_minOutputs[0]; + auto max = oldStat->second->_maxOutputs[0]; + + currentStat->_minOutputs = std::vector(outputChannels); + currentStat->_maxOutputs = std::vector(outputChannels); + std::fill(currentStat->_minOutputs.begin(), currentStat->_minOutputs.end(), min); + std::fill(currentStat->_maxOutputs.begin(), currentStat->_maxOutputs.end(), max); } } } // propagate this statistic to all layers without scale in primitives - std::vector toAnalyze; - toAnalyze.push_back(l); - while (!toAnalyze.empty()) { - CNNLayer::Ptr tl = toAnalyze.back(); - toAnalyze.pop_back(); - newMap[tl->name] = currentStat; - if (tl->outData.size() == 1) { - for (auto it : tl->outData[0]->inputTo) { - if (CaselessEq()(it.second->type, "pooling") || - CaselessEq()(it.second->type, "relu")) { - toAnalyze.push_back(it.second); + if (!currentStat->_maxOutputs.empty() && !currentStat->_minOutputs.empty()) { + std::vector toAnalyze; + toAnalyze.push_back(l); + while (!toAnalyze.empty()) { + CNNLayer::Ptr tl = toAnalyze.back(); + toAnalyze.pop_back(); + newMap[tl->name] = currentStat; + if (tl->outData.size() == 1) { + for (auto it : tl->outData[0]->inputTo) { + if (CaselessEq()(it.second->type, "pooling") || + CaselessEq()(it.second->type, "relu") || + CNNNetworkInt8Normalizer::isReLULikeClamp(it.second)) { + toAnalyze.push_back(it.second); + } } } } @@ -490,8 +560,9 @@ void CNNNetworkInt8Normalizer::AddScaleShifts(CNNNetwork& net, CNNStatisticHelpe for (auto nextIter : iter->outData[l1_out_i]->inputTo) { CNNLayer::Ptr next = nextIter.second; - // Checking for an INT8 convolution with FP32 output - if (iter->type == "Convolution" && + // Checking for an INT8 convolution or fully connected with FP32 output + if ((CaselessEq()(iter->type, "Convolution") || + CaselessEq()(iter->type, "FullyConnected")) && iter->precision == Precision::I8 && next->precision == Precision::FP32 && iter->outData[l1_out_i]->getPrecision() == Precision::FP32) { @@ -511,6 +582,29 @@ void CNNNetworkInt8Normalizer::AddScaleShifts(CNNNetwork& net, CNNStatisticHelpe } } +void CNNNetworkInt8Normalizer::ClampsToReLU(CNNNetwork& net, CNNStatisticHelper& statHelper) { + std::vector sortedLayers = CNNNetSortTopologically(net); + + for (auto iter : sortedLayers) { + if (isReLULikeClamp(iter) && (iter->precision == Precision::I8 || iter->precision == Precision::U8)) { + std::string layerName = iter->name + "_ReLU"; + LayerParams ssCnnLayerParams{ layerName, "ReLU", iter->precision }; + CNNLayerPtr ssCnnLayer(new ReLULayer(ssCnnLayerParams)); + + auto previousLayer = iter->insData[0].lock()->creatorLayer.lock(); + ssCnnLayer->insData.push_back(iter->insData[0]); + ssCnnLayer->insData[0].lock()->inputTo.erase(iter->name); + ssCnnLayer->insData[0].lock()->inputTo[iter->name] = ssCnnLayer; + + ssCnnLayer->outData.push_back(iter->outData[0]); + ssCnnLayer->outData[0]->creatorLayer = ssCnnLayer; + + iter->insData.clear(); + iter->outData.clear(); + } + } +} + void CNNNetworkInt8Normalizer::ScaleDataToInt(const float* srcData, size_t srcSize, Blob::Ptr int8blob, const std::vector& scales) { if (scales.size() == 0 || /*srcblob->size()*/srcSize % scales.size() != 0) { THROW_IE_EXCEPTION << "Wrong number of scale factors"; @@ -659,31 +753,35 @@ void CNNNetworkInt8Normalizer::replaceScaleShiftByDWConvolution(CNNNetwork &net) && layer->insData[0].lock()->creatorLayer.lock() && !CaselessEq()(layer->insData[0].lock()->creatorLayer.lock()->type, "input") && layer->outData[0]->inputTo.size() > 0) { - // verification if this layer does not pass data to 
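Context for the scale plumbing in these hunks: getInputScale/getOutputScale (earlier in this file) now insist that the min/max statistic vectors match the channel count, because one scale is derived per channel. A sketch of that derivation; calculateScaleFactor itself is not shown in this patch, so the formula below is an assumption from context:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Assumed shape of a per-channel scale: the integer range divided by the
    // channel's absolute maximum (maxRange ~ 127 signed, 255 unsigned).
    std::vector<float> perChannelScales(const std::vector<float>& minOutputs,
                                        const std::vector<float>& maxOutputs,
                                        float maxRange) {
        std::vector<float> scales(maxOutputs.size(), 1.f);
        for (size_t c = 0; c < maxOutputs.size(); ++c) {
            float absMax = std::max(std::fabs(minOutputs[c]),
                                    std::fabs(maxOutputs[c]));
            if (absMax > 0.f) scales[c] = maxRange / absMax;
        }
        return scales;
    }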
PriorBox, if it passes, we do not substitute - bool notToPriorBox = true; - for (auto o : layer->outData[0]->inputTo) { - if (CaselessEq()(o.second->type, "priorbox") || - CaselessEq()(o.second->type, "priorboxclustered")) { - notToPriorBox = false; + const auto dims = layer->insData[0].lock()->getTensorDesc().getDims(); + // only four or five dimensions Convolution layers are supported + if ((dims.size() == 4) || (dims.size() == 5)) { + // verification if this layer does not pass data to PriorBox, if it passes, we do not substitute + bool notToPriorBox = true; + for (auto o : layer->outData[0]->inputTo) { + if (CaselessEq()(o.second->type, "priorbox") || + CaselessEq()(o.second->type, "priorboxclustered")) { + notToPriorBox = false; + } + } + if (notToPriorBox) { + ScaleShiftLayer *pSS = dynamic_cast(layer.get()); + float *ssWValues = pSS->_weights->buffer().as(); + float *ssSValues = pSS->_biases->buffer().as(); + CNNLayer::Ptr newLayer = createDWConvolutionForScale(layer->name, layer->outData[0]->getTensorDesc().getDims()[1], ssWValues, ssSValues); + + newLayer->outData = layer->outData; + newLayer->outData[0]->creatorLayer = newLayer; + newLayer->insData = layer->insData; + newLayer->insData[0].lock()->inputTo.erase(layer->name); + newLayer->insData[0].lock()->inputTo[newLayer->name] = newLayer; } - } - if (notToPriorBox) { - ScaleShiftLayer *pSS = dynamic_cast(layer.get()); - float *ssWValues = pSS->_weights->buffer().as(); - float *ssSValues = pSS->_biases->buffer().as(); - CNNLayer::Ptr newLayer = createDWConvolutionForScale(layer->name, layer->outData[0]->getTensorDesc().getDims()[1], ssWValues, ssSValues); - - newLayer->outData = layer->outData; - newLayer->outData[0]->creatorLayer = newLayer; - newLayer->insData = layer->insData; - newLayer->insData[0].lock()->inputTo.erase(layer->name); - newLayer->insData[0].lock()->inputTo[newLayer->name] = newLayer; } } } } -void CNNNetworkInt8Normalizer::QuantizeConvolution(CNNLayer::Ptr convolution, +void CNNNetworkInt8Normalizer::QuantizeConvolutionOrFullyConnected(CNNLayer::Ptr convolution, CNNStatisticHelper& statHelper) { size_t inputChannels = convolution->insData[0].lock()->getTensorDesc().getDims()[1]; size_t outputChannels = convolution->outData[0]->getTensorDesc().getDims()[1]; @@ -725,20 +823,27 @@ void CNNNetworkInt8Normalizer::QuantizeConvolution(CNNLayer::Ptr convolution, if (weights) { const float *weight = static_cast(weights->buffer()); - ConvolutionLayer *pConv = dynamic_cast(convolution.get()); - if (pConv->_group == 0) { + WeightableLayer *pConv = dynamic_cast(convolution.get()); + ConvolutionLayer *pConv1 = dynamic_cast(convolution.get()); + + if (pConv1 != nullptr && pConv1->_group == 0) { THROW_IE_EXCEPTION << "Convolution '" << convolution->name << "'has wrong groups number == 0"; } + int group = 1; + if (pConv1 != nullptr && pConv1->_group != 1) { + group = pConv1->_group; + } + std::vector newWeights; // "new" weights are weights multiplied by i-scale - size_t W_CO = outputChannels / pConv->_group, - W_CI = inputChannels / pConv->_group, - W_HW = weights->size()/ W_CI / W_CO / pConv->_group; + size_t W_CO = outputChannels / group, + W_CI = inputChannels / group, + W_HW = weights->size()/ W_CI / W_CO / group; { float *iScaleMemory = static_cast(iScale->buffer()); - for (size_t g = 0; g < pConv->_group; g++) { + for (size_t g = 0; g < group; g++) { for (size_t co = 0; co < W_CO; co++) { for (size_t ci = 0; ci < W_CI; ci++) { size_t kernelBase = g * W_CO * W_CI * W_HW + co * W_CI * W_HW + ci * W_HW; @@ -749,7 +854,7 @@ 
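The requantization loop above walks grouped weights laid out as [group][out-channel][in-channel][kernel] and, per the comment in the hunk, multiplies each kernel row by its input-channel scale. The same indexing, runnable with toy sizes:

    #include <vector>

    int main() {
        const size_t G = 1, W_CO = 2, W_CI = 3, W_HW = 4;   // toy dimensions
        std::vector<float> weights(G * W_CO * W_CI * W_HW, 1.f);
        std::vector<float> iScale(W_CI, 0.5f);              // per input channel
        for (size_t g = 0; g < G; ++g)
            for (size_t co = 0; co < W_CO; ++co)
                for (size_t ci = 0; ci < W_CI; ++ci) {
                    size_t kernelBase = g * W_CO * W_CI * W_HW
                                      + co * W_CI * W_HW + ci * W_HW;
                    for (size_t k = 0; k < W_HW; ++k)
                        weights[kernelBase + k] *= iScale[ci];
                }
        return 0;
    }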
void CNNNetworkInt8Normalizer::QuantizeConvolution(CNNLayer::Ptr convolution, } } } - size_t outChannelSize = weights->dims()[0] / W_CO / pConv->_group; + size_t outChannelSize = weights->dims()[0] / W_CO / group; // Calculating weights normalization scale factor (w-scale) float *weight_convolution; @@ -790,9 +895,27 @@ void CNNNetworkInt8Normalizer::QuantizeConvolution(CNNLayer::Ptr convolution, } } -void CNNNetworkInt8Normalizer::returnTailToFP32(CNNLayer::Ptr layer) { +bool CNNNetworkInt8Normalizer::layerProducesFloat(const CNNLayer::Ptr layer) { + // currently we support only case of layers which have one output port + if (layer->outData.size() > 1) { + return false; + } + + bool consumersFP32 = true; + for (const auto dOut : layer->outData[0]->inputTo) { + if (dOut.second->precision != Precision::FP32) { + consumersFP32 = false; + } + } + return consumersFP32; +} + +void CNNNetworkInt8Normalizer::returnTailToFP32(const CNNLayer::Ptr layer) { std::set layersToReturn; - layersToReturn.insert(layer); + if (layerProducesFloat(layer)) { + layersToReturn.insert(layer); + } + while (!layersToReturn.empty()) { CNNLayer::Ptr layerA = *layersToReturn.begin(); layersToReturn.erase(layerA); @@ -806,29 +929,31 @@ void CNNNetworkInt8Normalizer::returnTailToFP32(CNNLayer::Ptr layer) { } if ((CaselessEq()(layerA->type, "convolution") - || CaselessEq()(layerA->type, "relu")) && + || CaselessEq()(layerA->type, "fullyconnected") + || CaselessEq()(layerA->type, "relu") + || isReLULikeClamp(layerA)) && layerA->outData.size() == 1) { layerA->outData[0]->setPrecision(Precision::FP32); + if (CaselessEq()(layerA->type, "relu") + && isNextFusionAllowed(layerA->insData[0].lock()->creatorLayer.lock())) { + layerA->precision = Precision::FP32; + layerA->insData[0].lock()->creatorLayer.lock()->outData[0]->setPrecision(Precision::FP32); + } } // adding parents for analysis - if (!CaselessEq()(layerA->type, "convolution")) { - // for all parrents, if they produce data to only FP32 layers + if (!CaselessEq()(layerA->type, "convolution") && + !CaselessEq()(layerA->type, "fullyconnected")) { + // for all parents, if they produce data to only FP32 layers for (auto i : layerA->insData) { DataPtr d = i.lock(); if (d->creatorLayer.lock()->precision != Precision::FP32 && (CaselessEq()(layerA->type, "pooling") || CaselessEq()(layerA->type, "relu") + || isReLULikeClamp(layerA) || CaselessEq()(layerA->type, "concat"))) { - // check if layer produce to only FP32 - bool consumersFP32 = true; - for (auto dOut : d->inputTo) { - if (dOut.second->precision != Precision::FP32) { - consumersFP32 = false; - } - } - if (consumersFP32) { + if (layerProducesFloat(d->creatorLayer.lock())) { layersToReturn.insert(d->creatorLayer.lock()); } } @@ -837,8 +962,8 @@ void CNNNetworkInt8Normalizer::returnTailToFP32(CNNLayer::Ptr layer) { } } -bool CNNNetworkInt8Normalizer::isNextFusionAllowed(CNNLayer::Ptr layer) const { - // fusion can happen only if initial layer supplys data to only one layer +bool CNNNetworkInt8Normalizer::isNextFusionAllowed(const CNNLayer::Ptr& layer) { + // fusion can happen only if initial layer supplies data to only one layer // if it sends to several layers - it is safe to execute initial layer in any precision if (layer->outData[0]->inputTo.size() == 1) { std::string aType = layer->outData[0]->inputTo.begin()->second->type; @@ -847,6 +972,10 @@ bool CNNNetworkInt8Normalizer::isNextFusionAllowed(CNNLayer::Ptr layer) const { if (rL->negative_slope != 0.f) { return false; } + } else if (CaselessEq()(aType, "clamp")) { + if 
(!isReLULikeClamp(layer->outData[0]->inputTo.begin()->second)) { + return false; + } } else { static const InferenceEngine::details::caseless_set nonSuportedActivations = {"elu", "clamp", "tanh", "logistic", "square", "abs", @@ -857,6 +986,17 @@ bool CNNNetworkInt8Normalizer::isNextFusionAllowed(CNNLayer::Ptr layer) const { return true; } +bool CNNNetworkInt8Normalizer::isReLULikeClamp(CNNLayer::Ptr layer) { + if (CaselessEq()(layer->type, "Clamp")) { + ClampLayer *clamp = dynamic_cast(layer.get()); + if (clamp == nullptr) { + THROW_IE_EXCEPTION << "Int8 Normalizer error: cannot cast layer '" << layer->name << "' to Clamp"; + } + return clamp->min_value == 0; + } + return false; +} + void CNNNetworkInt8Normalizer::DefinesExecutionPrecision(CNNNetwork &net, CNNStatisticHelper &statHelper) { std::vector sortedLayers = CNNNetSortTopologically(net); @@ -866,30 +1006,39 @@ void CNNNetworkInt8Normalizer::DefinesExecutionPrecision(CNNNetwork &net, CNNSta continue; } - if (statHelper.canLayerBeQuantized(iter->name)) { + // Legacy: FullyConnected should not be converted to Int8, + // if it isn't explicitly marked to. + if (iter->params.find("quantization_level") == iter->params.end() && CaselessEq()(iter->type, "fullyconnected")) { + continue; + } + + if (!statHelper.canLayerBeQuantized(iter)) { continue; } - if (CaselessEq()(iter->type, "convolution")) { + if (CaselessEq()(iter->type, "convolution") || + CaselessEq()(iter->type, "fullyconnected")) { if (isNextFusionAllowed(iter)) { iter->precision = Precision::I8; // we will override I8 to U8 during analysing of Conv-ReLU and Conv-Sum-ReLU fusions iter->outData[0]->setPrecision(Precision::I8); } - } else if (CaselessEq()(iter->type, "relu")) { + } else if (CaselessEq()(iter->type, "relu") || + isReLULikeClamp(iter)) { // casting to ReLU ReLULayer *rL = dynamic_cast(iter.get()); DataPtr outData = iter->outData.size() ? 
iter->outData[0] : nullptr; if (iter->insData[0].lock()->creatorLayer.lock()->precision != Precision::FP32 && outData->getPrecision() == Precision::FP32) { iter->precision = Precision::I8; - if (rL->negative_slope != 0.0f) { + if (rL != nullptr && rL->negative_slope != 0.0f) { outData->setPrecision(Precision::I8); } else { outData->setPrecision(Precision::U8); // if convolution is a predecessor, change its data to U8 also CNNLayer::Ptr prevLayer = iter->insData[0].lock()->creatorLayer.lock(); - if (prevLayer && CaselessEq()(prevLayer->type, "convolution")) { + if (prevLayer && (CaselessEq()(prevLayer->type, "convolution") || + CaselessEq()(prevLayer->type, "fullyconnected"))) { iter->insData[0].lock()->setPrecision(Precision::U8); } // if there is a patter A0 -> Eltwise -> ReLU and Convolution -> Eltwise -> ReLU, @@ -916,9 +1065,12 @@ void CNNNetworkInt8Normalizer::DefinesExecutionPrecision(CNNNetwork &net, CNNSta } } else if (CaselessEq()(iter->type, "pooling")) { auto pool = dynamic_cast(iter.get()); - if (pool && (pool->_type == PoolingLayer::MAX - || (pool->_type == PoolingLayer::AVG - && pool->outData.size() == 1))) { + if (pool == nullptr) { + THROW_IE_EXCEPTION << "Int8 Normalizer error: cannot cast layer '" << iter->name << "' to pooling"; + } + + if (pool->_type == PoolingLayer::MAX || + (pool->_type == PoolingLayer::AVG && pool->outData.size() == 1)) { auto prevLayer = iter->insData[0].lock()->creatorLayer.lock(); if (prevLayer && (prevLayer->precision == Precision::I8 || prevLayer->precision == Precision::U8)) { iter->precision = Precision::I8; @@ -1041,7 +1193,7 @@ void CNNNetworkInt8Normalizer::DefinesExecutionPrecision(CNNNetwork &net, CNNSta iter->precision = Precision::I8; iter->outData[0]->setPrecision(Precision::I8); // calculate the only scale - Blob::Ptr sumLayerScales = statHelper.getOutputScale(sumLayer); + Blob::Ptr sumLayerScales = statHelper.getOutputScale(statHelper.getLatestInFuse(sumLayer)); Blob::Ptr convLayerScales = statHelper.getOutputScale(statHelper.getLatestInFuse(convLayer)); float *sumScale = sumLayerScales->buffer().as(); float *convScale = convLayerScales->buffer().as(); @@ -1055,20 +1207,27 @@ void CNNNetworkInt8Normalizer::DefinesExecutionPrecision(CNNNetwork &net, CNNSta } else { // if there are convolutions are inputs to this eltwise, we forcedly move them to FP32 for (auto i : iter->insData) { - if (CaselessEq()(i.lock()->creatorLayer.lock()->type, "convolution")) { + auto type = i.lock()->creatorLayer.lock()->type; + if (CaselessEq()(type, "convolution") || + CaselessEq()(type, "fullyconnected")) { i.lock()->creatorLayer.lock()->precision = Precision::FP32; i.lock()->setPrecision(Precision::FP32); } } } + } else if (CaselessEq()(iter->type, "resample")) { + iter->precision = Precision::I8; + iter->outData[0]->setPrecision(iter->insData[0].lock()->getPrecision()); } } // quantization of weights/biases sortedLayers = CNNNetSortTopologically(net); for (auto iter : sortedLayers) { - if (iter->precision == Precision::I8 && CaselessEq()(iter->type, "convolution")) { - QuantizeConvolution(iter, statHelper); + if (iter->precision == Precision::I8 && + (CaselessEq()(iter->type, "convolution") || + CaselessEq()(iter->type, "fullyconnected"))) { + QuantizeConvolutionOrFullyConnected(iter, statHelper); } } @@ -1080,8 +1239,8 @@ void CNNNetworkInt8Normalizer::DefinesExecutionPrecision(CNNNetwork &net, CNNSta if (iter->precision == Precision::I8 && iter->outData.size() == 1) { if ((iter->outData[0]->inputTo.size() == 1 - && 
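(Editorial sketch - not part of the patch.) The ReLU branch above encodes a small but important rule: after a quantized producer, a plain ReLU output is non-negative and can be stored as unsigned U8, while a leaky ReLU (negative_slope != 0) can produce negative values and must stay signed I8. Restated as a toy decision function:

enum class Precision { FP32, I8, U8 };

// Storage precision for a ReLU-like activation that follows an
// already-quantized producer, as chosen in DefinesExecutionPrecision.
Precision reluOutputPrecision(bool isLeaky /* negative_slope != 0 */) {
    // leaky ReLU emits negative values -> signed type needed;
    // plain ReLU and a ReLU-like Clamp (min == 0) are non-negative -> unsigned
    return isLeaky ? Precision::I8 : Precision::U8;
}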
iter->outData[0]->inputTo.begin()->second->precision == Precision::FP32) - || iter->outData[0]->inputTo.size() == 0) { + && iter->outData[0]->inputTo.begin()->second->precision == Precision::FP32) + || iter->outData[0]->inputTo.size() == 0) { returnTailToFP32(iter); } } @@ -1091,8 +1250,6 @@ void CNNNetworkInt8Normalizer::DefinesExecutionPrecision(CNNNetwork &net, CNNSta void CNNNetworkInt8Normalizer::PropagateScaleFactors(CNNNetwork& net, const CNNStatisticHelper& statHelper) { std::vector sortedLayers = CNNNetSortTopologically(net); - std::vector oScaleLayers; - // Moving o-scales down for (auto iter : sortedLayers) { if (iter->type == "Concat" && iter->precision == Precision::I8) { @@ -1143,7 +1300,10 @@ void CNNNetworkInt8Normalizer::PropagateScaleFactors(CNNNetwork& net, const CNNS if (iter->outData.size() == 1) { for (auto l : iter->outData[0]->inputTo) { if (l.second->precision == Precision::I8 || l.second->precision == Precision::U8) { - if (l.second->type == "Pooling" || l.second->type == "ReLU") { + if (CaselessEq()(l.second->type, "Pooling") || + CaselessEq()(l.second->type, "ReLU") || + CNNNetworkInt8Normalizer::isReLULikeClamp(l.second) + ) { l.second->blobs["o-scale"] = iter->blobs["o-scale"]; // debug scales. Need to compare with actual values in FP32 scoring l.second->blobs["ext-scale"] = l.second->blobs["o-scale"]; @@ -1156,6 +1316,25 @@ void CNNNetworkInt8Normalizer::PropagateScaleFactors(CNNNetwork& net, const CNNS l.second->blobs["o-scale"] = iter->blobs["o-scale"]; } int8Consumers++; + } else if ((l.second->precision == Precision::I8 || l.second->precision == Precision::U8) && + CaselessEq()(l.second->type, "Resample")) { + // If Resample has a Concat as its input layer, it should inherit its + // output scale + if (l.second->insData.size() == 1) { + CNNLayerPtr creator = l.second->insData[0].lock()->creatorLayer.lock(); + if (CaselessEq()(creator->type, "Concat")) { + l.second->blobs["o-scale"] = creator->blobs["o-scale"]; + l.second->blobs["i-concat-scale"] = l.second->blobs["o-scale"]; + } + } + + // No Concat found, so let's use statistics + if (l.second->blobs.find("o-scale") == l.second->blobs.end()) { + auto oScale = statHelper.getOutputScale(l.second); + l.second->blobs["o-scale"] = oScale; + l.second->blobs["i-concat-scale"] = l.second->blobs["o-scale"]; + } + int8Consumers++; } else if ((l.second->precision == Precision::I8) && CaselessEq()(l.second->type, "concat")) { // if concat is i8, we can propagate oscale further to concat.
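(Editorial sketch - not part of the patch.) The propagation above boils down to: layers that do not change value ranges (Pooling, ReLU, ReLU-like Clamp, and now Resample) reuse the producer's output scale instead of re-deriving one from statistics. A minimal numeric model with plain floats standing in for the o-scale blobs:

#include <map>
#include <string>

std::map<std::string, float> oScale;  // layer name -> output scale

// Scale-transparent consumers inherit the producer's o-scale,
// as with blobs["o-scale"] in PropagateScaleFactors.
void propagateOScale(const std::string& producer, const std::string& consumer,
                     const std::string& consumerType) {
    if (consumerType == "Pooling" || consumerType == "ReLU" ||
        consumerType == "Resample") {
        oScale[consumer] = oScale[producer];
    }
}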
@@ -1181,7 +1360,8 @@ void CNNNetworkInt8Normalizer::PropagateScaleFactors(CNNNetwork& net, const CNNS fp32Consumers++; } - if (CaselessEq()(iter->type, "Convolution")) { + if (CaselessEq()(iter->type, "Convolution") || + CaselessEq()(iter->type, "FullyConnected")) { if (int8Consumers) { iter->blobs["oi-scale"] = iter->blobs["o-scale"]; } else { @@ -1227,9 +1407,10 @@ void CNNNetworkInt8Normalizer::PropagateScaleFactors(CNNNetwork& net, const CNNS && curLayer->insData[0].lock()->creatorLayer.lock()->outData.size() == 1 && curLayer->insData[0].lock()->inputTo.size() == 1) { curLayer = curLayer->insData[0].lock()->creatorLayer.lock(); - if (curLayer->type != "Pooling" - && curLayer->type != "ReLU" - && curLayer->type != "Convolution") { + if (!CaselessEq()(curLayer->type, "Pooling") + && !CaselessEq()(curLayer->type, "ReLU") + && !isReLULikeClamp(curLayer) + && !CaselessEq()(curLayer->type, "Convolution")) { eliminateOScale = false; } } else { @@ -1309,6 +1490,7 @@ void CNNNetworkInt8Normalizer::NormalizeNetwork(ICNNNetwork& network, ICNNNetwor DefinesExecutionPrecision(cnnn, statHelper); PropagateScaleFactors(cnnn, statHelper); + ClampsToReLU(cnnn, statHelper); AddScaleShifts(cnnn, statHelper); #ifndef NDEBUG std::ofstream file("i8_normalized.dot"); diff --git a/inference-engine/src/inference_engine/cnn_network_int8_normalizer.hpp b/inference-engine/src/inference_engine/cnn_network_int8_normalizer.hpp index 69e94b1..4e0b658 100644 --- a/inference-engine/src/inference_engine/cnn_network_int8_normalizer.hpp +++ b/inference-engine/src/inference_engine/cnn_network_int8_normalizer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -45,7 +45,7 @@ public: * Returns if we can quantize layer basing on information of existing statistic before and after * layers */ - bool canLayerBeQuantized(const std::string &layerName) const; + bool canLayerBeQuantized(CNNLayer::Ptr layer) const; /** * The topology is allowed to be changed, we need to modify statistic accordingly @@ -163,15 +163,15 @@ private: public: /** main function for calling of quantization */ - void NormalizeNetwork(ICNNNetwork& network, ICNNNetworkStats& netStats); + static void NormalizeNetwork(ICNNNetwork& network, ICNNNetworkStats& netStats); protected: /** Helper function to add scaleshifts and other layers for transformatin of topology */ - void AddLayerToCNNNetworkBeforeLayer(CNNLayer::Ptr newLayer, CNNLayer::Ptr successor, size_t port); + static void AddLayerToCNNNetworkBeforeLayer(CNNLayer::Ptr newLayer, CNNLayer::Ptr successor, size_t port); /** Helper function to add scaleshifts and other layers for transformatin of topology */ - void AddLayerToCNNNetworkAfterData(DataPtr pData, CNNLayer::Ptr layer, const std::string& nextLayerName); + static void AddLayerToCNNNetworkAfterData(DataPtr pData, CNNLayer::Ptr layer, const std::string& nextLayerName); /** Adds ScaleShift between two specified layers */ - void AddScaleShiftBetween(CNNNetwork& net, const CNNLayerPtr layer1, const CNNLayerPtr layer2, CNNStatisticHelper& statHelper); + static void AddScaleShiftBetween(CNNNetwork& net, const CNNLayerPtr layer1, const CNNLayerPtr layer2, CNNStatisticHelper& statHelper); /** @@ -181,28 +181,31 @@ protected: * data * o-scale - multiplication on this scale will convert above denormalized fp32 to i8 for next layer */ - void QuantizeConvolution(CNNLayer::Ptr convolution, CNNStatisticHelper& statHelper); + static void 
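(Editorial sketch - not part of the patch.) The oi-scale elimination loop above climbs the single-producer chain and gives up as soon as it meets a layer the int8 path does not understand. A loose paraphrase with stand-in types (the real loop also requires single-consumer links and uses case-insensitive type checks):

#include <memory>
#include <string>

struct ToyLayer {
    std::string type;
    std::shared_ptr<ToyLayer> producer;  // stand-in for insData[0]->creatorLayer
};

// The o-scale may be dropped only if every layer on the chain is
// Pooling, ReLU (incl. ReLU-like Clamp) or Convolution.
bool canEliminateOScale(std::shared_ptr<ToyLayer> cur) {
    while (cur) {
        if (cur->type != "Pooling" && cur->type != "ReLU" &&
            cur->type != "Convolution") {
            return false;
        }
        cur = cur->producer;
    }
    return true;
}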
QuantizeConvolutionOrFullyConnected(CNNLayer::Ptr convolution, CNNStatisticHelper& statHelper); /** Adds ScaleShifts everywhere */ - void AddScaleShifts(CNNNetwork& net, CNNStatisticHelper& statHelper); + static void AddScaleShifts(CNNNetwork& net, CNNStatisticHelper& statHelper); + + /** Convert ReLu-like Clamps to ReLu layers */ + static void ClampsToReLU(CNNNetwork& net, CNNStatisticHelper& statHelper); /** * Goes over all layers and mark which layers will be executed in FP32/I8 and marks data between * layers to I8/U8/FP32 */ - void DefinesExecutionPrecision(CNNNetwork& net, CNNStatisticHelper& statHelper); + static void DefinesExecutionPrecision(CNNNetwork& net, CNNStatisticHelper& statHelper); /** * Since o-scales exist only for convolutins, we need to propagate them down oever concats and * linear layers */ - void PropagateScaleFactors(CNNNetwork& net, const CNNStatisticHelper& statHelper); + static void PropagateScaleFactors(CNNNetwork& net, const CNNStatisticHelper& statHelper); /** * Normalizes and quantizes srcData using scales for normalization and int8blob precision for * quantization */ - void ScaleDataToInt(const float* srcData, size_t srcSize, Blob::Ptr int8blob, const std::vector& scales); + static void ScaleDataToInt(const float* srcData, size_t srcSize, Blob::Ptr int8blob, const std::vector& scales); /** * Replaces all ScaleShifts layers met in the model to the depth-wise convolution with the same @@ -216,23 +219,34 @@ protected: * This conversion allows to avoid introductin one more i8 primitive - ScaleShift accepting i8 input * and producing i8 output */ - void replaceScaleShiftByDWConvolution(CNNNetwork& net); + static void replaceScaleShiftByDWConvolution(CNNNetwork& net); /** Helper function which creates DW/Grouped/regular convolution by passed weights and biases */ - CNNLayer::Ptr createDWConvolutionForScale(const std::string& layerName, size_t channels, float *weights, float *biases); + static CNNLayer::Ptr createDWConvolutionForScale(const std::string& layerName, size_t channels, float *weights, float *biases); + + /** + * Verifies if layer produces data to layers which marked as float + */ + static bool layerProducesFloat(const CNNLayer::Ptr layer); /** * Returns tails from I8 to FP32 until convolution - it is the most performed approach because * convolution can convert to FP32 for free, while adding one more scale will decrease performance */ - void returnTailToFP32(CNNLayer::Ptr layer); + static void returnTailToFP32(const CNNLayer::Ptr layer); /** * Verifies if next layer has type which potentially can be fused with convolution * and if activation is supported for int8 * @return true if layer does not have improper activation for fusion */ - bool isNextFusionAllowed(CNNLayer::Ptr layer) const; + static bool isNextFusionAllowed(const CNNLayer::Ptr& layer); + +public: + /** + * Returns true for a "relu-like" clamp layer i.e. 
a clamp with minimum = 0 + */ + static bool isReLULikeClamp(CNNLayer::Ptr layer); }; typedef std::shared_ptr CNNNetworkNormalizerPtr; diff --git a/inference-engine/src/inference_engine/cnn_network_stats_impl.cpp b/inference-engine/src/inference_engine/cnn_network_stats_impl.cpp index dd89fcb..0a577ab 100644 --- a/inference-engine/src/inference_engine/cnn_network_stats_impl.cpp +++ b/inference-engine/src/inference_engine/cnn_network_stats_impl.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cnn_network_stats_impl.hpp b/inference-engine/src/inference_engine/cnn_network_stats_impl.hpp index f83aca6..f97e1d8 100644 --- a/inference-engine/src/inference_engine/cnn_network_stats_impl.hpp +++ b/inference-engine/src/inference_engine/cnn_network_stats_impl.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/base/ie_executable_network_base.hpp b/inference-engine/src/inference_engine/cpp_interfaces/base/ie_executable_network_base.hpp index fd9bd1b..aceb479 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/base/ie_executable_network_base.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/base/ie_executable_network_base.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -58,6 +58,10 @@ public: TO_STATUS(_impl->GetMappedTopology(deployedTopology)); } + StatusCode GetExecGraphInfo(ICNNNetwork::Ptr &graphPtr, ResponseDesc *resp) noexcept override { + TO_STATUS(_impl->GetExecGraphInfo(graphPtr)); + } + StatusCode QueryState(IMemoryState::Ptr & pState, size_t idx , ResponseDesc *resp) noexcept override { try { diff --git a/inference-engine/src/inference_engine/cpp_interfaces/base/ie_infer_async_request_base.hpp b/inference-engine/src/inference_engine/cpp_interfaces/base/ie_infer_async_request_base.hpp index 916849a..6222b14 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/base/ie_infer_async_request_base.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/base/ie_infer_async_request_base.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/base/ie_memory_state_base.hpp b/inference-engine/src/inference_engine/cpp_interfaces/base/ie_memory_state_base.hpp index 9764b75..2b448e3 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/base/ie_memory_state_base.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/base/ie_memory_state_base.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/base/ie_plugin_base.hpp b/inference-engine/src/inference_engine/cpp_interfaces/base/ie_plugin_base.hpp index 33b3f39..6269dd3 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/base/ie_plugin_base.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/base/ie_plugin_base.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: 
Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/exception2status.hpp b/inference-engine/src/inference_engine/cpp_interfaces/exception2status.hpp index 0f3462a..4015bb1 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/exception2status.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/exception2status.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/ie_executor_manager.cpp b/inference-engine/src/inference_engine/cpp_interfaces/ie_executor_manager.cpp index 1930937..f9533ca 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/ie_executor_manager.cpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/ie_executor_manager.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/ie_executor_manager.hpp b/inference-engine/src/inference_engine/cpp_interfaces/ie_executor_manager.hpp index 8c4d4d0..17e86e0 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/ie_executor_manager.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/ie_executor_manager.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/ie_itask_executor.hpp b/inference-engine/src/inference_engine/cpp_interfaces/ie_itask_executor.hpp index 0f9be30..3b02eff 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/ie_itask_executor.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/ie_itask_executor.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/ie_task.cpp b/inference-engine/src/inference_engine/cpp_interfaces/ie_task.cpp index 89e716c..0df1242 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/ie_task.cpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/ie_task.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/ie_task.hpp b/inference-engine/src/inference_engine/cpp_interfaces/ie_task.hpp index 8646da9..c299be4 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/ie_task.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/ie_task.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/ie_task_executor.cpp b/inference-engine/src/inference_engine/cpp_interfaces/ie_task_executor.cpp index 8e4c693..3868abc 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/ie_task_executor.cpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/ie_task_executor.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/ie_task_executor.hpp 
b/inference-engine/src/inference_engine/cpp_interfaces/ie_task_executor.hpp index ad06a60..c135a82 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/ie_task_executor.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/ie_task_executor.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/ie_task_synchronizer.hpp b/inference-engine/src/inference_engine/cpp_interfaces/ie_task_synchronizer.hpp index 3ac5f9f..1608293 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/ie_task_synchronizer.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/ie_task_synchronizer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/ie_task_with_stages.cpp b/inference-engine/src/inference_engine/cpp_interfaces/ie_task_with_stages.cpp index 48b2790..1e12aca 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/ie_task_with_stages.cpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/ie_task_with_stages.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -14,7 +14,7 @@ namespace InferenceEngine { StagedTask::StagedTask() : Task(), _stages(0) {} -StagedTask::StagedTask(std::function function, size_t stages) : Task(function), _stages(stages) { +StagedTask::StagedTask(std::function function, size_t stages) : Task(function), _stages(stages), _stage(0) { if (!function) THROW_IE_EXCEPTION << "Failed to create StagedTask object with null function"; resetStages(); } diff --git a/inference-engine/src/inference_engine/cpp_interfaces/ie_task_with_stages.hpp b/inference-engine/src/inference_engine/cpp_interfaces/ie_task_with_stages.hpp index f9b3755..fff5e51 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/ie_task_with_stages.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/ie_task_with_stages.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_executable_network_internal.hpp b/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_executable_network_internal.hpp index 01e85a3..ba3efa3 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_executable_network_internal.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_executable_network_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -59,6 +59,10 @@ public: THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str; } + void GetExecGraphInfo(InferenceEngine::ICNNNetwork::Ptr &graphPtr) override { + THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str; + } + void SetPointerToPluginInternal(InferencePluginInternalPtr plugin) { _plugin = plugin; } diff --git a/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_executable_network_thread_safe_async_only.hpp b/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_executable_network_thread_safe_async_only.hpp index 515a283..f92d8da 100644 --- 
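(Editorial sketch - not part of the patch.) GetExecGraphInfo is threaded through the interfaces with a throwing default, so existing plugins keep compiling and only opt in once they can serialize their execution graph. The pattern, with stand-in types:

#include <memory>
#include <stdexcept>

struct ToyGraph {};  // stand-in for ICNNNetwork

struct ToyExecutableNetworkInternal {
    virtual ~ToyExecutableNetworkInternal() = default;
    // Default mirrors the patch: throw NOT_IMPLEMENTED until a concrete
    // plugin overrides this with a real execution-graph dump.
    virtual void GetExecGraphInfo(std::shared_ptr<ToyGraph>& graphPtr) {
        (void)graphPtr;
        throw std::logic_error("[NOT_IMPLEMENTED] GetExecGraphInfo");
    }
};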
a/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_executable_network_thread_safe_async_only.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_executable_network_thread_safe_async_only.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp b/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp index b3c7ad0..88ad125 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_async_request_internal.hpp b/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_async_request_internal.hpp index d194a30..2a8ffe0 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_async_request_internal.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_async_request_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_async_request_thread_safe_default.hpp b/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_async_request_thread_safe_default.hpp index f18a47a..3384164 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_async_request_thread_safe_default.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_async_request_thread_safe_default.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_async_request_thread_safe_internal.hpp b/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_async_request_thread_safe_internal.hpp index 96a905f..04622f0 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_async_request_thread_safe_internal.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_async_request_thread_safe_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_request_internal.hpp b/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_request_internal.hpp index c9afe39..c04a5d9 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_request_internal.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_request_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -30,7 +30,8 @@ class InferRequestInternal : virtual public IInferRequestInternal { public: typedef std::shared_ptr Ptr; - InferRequestInternal(InputsDataMap networkInputs, OutputsDataMap 
networkOutputs) { + InferRequestInternal(InputsDataMap networkInputs, OutputsDataMap networkOutputs) + : m_curBatch(-1) { // We should copy maps in order to avoid modifications in the future. for (const auto &it : networkInputs) { InputInfo::Ptr newPtr; @@ -101,6 +102,7 @@ public: } if (foundInput->getPreProcess().getResizeAlgorithm() != ResizeAlgorithm::NO_RESIZE) { + PreProcessData::isApplicable(data, _inputs[name]); // Stores the given blob as ROI blob. It will be used to fill in network input during pre-processing. _preProcData[name].setRoiBlob(data); } else { @@ -177,7 +179,8 @@ public: if (it != _preProcData.end()) { _preProcData[input.first].execute(input.second, _networkInputs[input.first]->getPreProcess().getResizeAlgorithm(), - serial); + serial, + m_curBatch); } } } @@ -189,6 +192,7 @@ protected: InferenceEngine::BlobMap _outputs; ExecutableNetworkInternalPtr _exeNetwork; std::map _preProcData; // pre-process data per input + int m_curBatch; // current batch value used in dynamic batching protected: /** diff --git a/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_memory_state_internal.hpp b/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_memory_state_internal.hpp index db3659e..7d5a9fd 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_memory_state_internal.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_memory_state_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_plugin_internal.hpp b/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_plugin_internal.hpp index d9bee35..bb261db 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_plugin_internal.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/impl/ie_plugin_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -13,6 +13,7 @@ #include #include #include +#include "graph_transformer.h" #include "cpp_interfaces/interface/ie_iplugin_internal.hpp" #include "cpp_interfaces/base/ie_executable_network_base.hpp" #include "cpp_interfaces/impl/ie_executable_network_internal.hpp" @@ -47,6 +48,19 @@ public: StatusCode sts = _loadedNetwork->CreateInferRequest(_createdInferRequest, &resp); if (sts != OK) THROW_IE_EXCEPTION << resp.msg; } + /** + * @brief most plugins successfully consume unreshapable networks - let's do it in the base class + * WARNING: this function modifies layers in the input network and might affect the application that uses it + */ + virtual ICNNNetwork& RemoveConstLayers(ICNNNetwork &network) { + auto* implNetwork = dynamic_cast(&network); + if (implNetwork) { + // valid for CNNNetworkImpl only, while there's no API in ICNNNetwork to change network + ConstTransformer transformator(implNetwork); + transformator.fullTrim(); + } + return network; + } /** * @brief Creates an executable network from a parsed network object, users can create as many networks as they need and use @@ -101,7 +115,7 @@ public: } _networkOutputs[it.first] = newData; } - auto impl = LoadExeNetworkImpl(network, config); + auto impl = LoadExeNetworkImpl(RemoveConstLayers(network), config); impl->setNetworkInputs(_networkInputs); impl->setNetworkOutputs(_networkOutputs); // skip setting shared ptr to avoid circular dependency: ExecutableNetworkBase ->
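(Editorial sketch - not part of the patch.) Passing m_curBatch into pre-processing presumably lets it honour dynamic batching: when the application shrinks the batch at run time, only the images that will actually be inferred need resizing. That interpretation as a toy loop, where -1 mirrors the constructor's "full batch" initial value:

#include <algorithm>
#include <cstddef>
#include <vector>

struct Image {};

void preprocessBatch(std::vector<Image>& batch, int curBatch) {
    const std::size_t n = (curBatch < 0) ? batch.size()
                                         : std::min(batch.size(), static_cast<std::size_t>(curBatch));
    for (std::size_t i = 0; i < n; ++i) {
        // resize / layout-convert batch[i] here
    }
}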
IExecutableNetworkInternal -> InferencePluginInternal diff --git a/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iexecutable_network_internal.hpp b/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iexecutable_network_internal.hpp index cd8a46a..eafed12 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iexecutable_network_internal.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iexecutable_network_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -60,6 +60,11 @@ public: */ virtual void GetMappedTopology(std::map> &deployedTopology) = 0; + /** + * @brief Get executable graph information from a device + * @param graphPtr network ptr to store executable graph information + */ + virtual void GetExecGraphInfo(ICNNNetwork::Ptr &graphPtr) = 0; virtual std::vector QueryState() = 0; }; diff --git a/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iinfer_async_request_internal.hpp b/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iinfer_async_request_internal.hpp index 844261a..c3162e7 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iinfer_async_request_internal.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iinfer_async_request_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iinfer_request_internal.hpp b/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iinfer_request_internal.hpp index e10e6b0..24776f1 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iinfer_request_internal.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iinfer_request_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_imemory_state_internal.hpp b/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_imemory_state_internal.hpp index a36a91e..387c19b 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_imemory_state_internal.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_imemory_state_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iplugin_internal.hpp b/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iplugin_internal.hpp index 8bac85a..f7645a5 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iplugin_internal.hpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iplugin_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpu_detector.cpp b/inference-engine/src/inference_engine/cpu_detector.cpp index d05c6dd..9373771 100644 --- a/inference-engine/src/inference_engine/cpu_detector.cpp +++ 
b/inference-engine/src/inference_engine/cpu_detector.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpu_detector.hpp b/inference-engine/src/inference_engine/cpu_detector.hpp index c0ac96f..021919f 100644 --- a/inference-engine/src/inference_engine/cpu_detector.hpp +++ b/inference-engine/src/inference_engine/cpu_detector.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpu_x86_sse42/blob_transform_sse42.cpp b/inference-engine/src/inference_engine/cpu_x86_sse42/blob_transform_sse42.cpp index f8c16a4..c426125 100644 --- a/inference-engine/src/inference_engine/cpu_x86_sse42/blob_transform_sse42.cpp +++ b/inference-engine/src/inference_engine/cpu_x86_sse42/blob_transform_sse42.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpu_x86_sse42/blob_transform_sse42.hpp b/inference-engine/src/inference_engine/cpu_x86_sse42/blob_transform_sse42.hpp index 5eeb60d..034ed95 100644 --- a/inference-engine/src/inference_engine/cpu_x86_sse42/blob_transform_sse42.hpp +++ b/inference-engine/src/inference_engine/cpu_x86_sse42/blob_transform_sse42.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_data_sse42.cpp b/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_data_sse42.cpp index 7d40157..220280c 100644 --- a/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_data_sse42.cpp +++ b/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_data_sse42.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_data_sse42.hpp b/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_data_sse42.hpp index 09a5379..4cc5e7e 100644 --- a/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_data_sse42.hpp +++ b/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_data_sse42.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp b/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp index ea37235..573aaa0 100644 --- a/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp +++ b/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -888,12 +888,9 @@ void calcRowLinear_32F(float *dst[], const float alpha[], const int mapsx[], const float beta[], - float tmp[], const Size & inSz, const Size & outSz, int lpi) { - UNUSED(tmp); - bool xRatioEq1 = inSz.width == outSz.width; bool yRatioEq1 = inSz.height == outSz.height; diff --git 
a/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp b/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp index bbb0d6e..8a211e4 100644 --- a/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp +++ b/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -73,7 +73,6 @@ void calcRowLinear_32F(float *dst[], const float alpha[], const int mapsx[], const float beta[], - float tmp[], const Size & inSz, const Size & outSz, int lpi); diff --git a/inference-engine/src/inference_engine/data_stats.cpp b/inference-engine/src/inference_engine/data_stats.cpp index 58e43a1..127be61 100644 --- a/inference-engine/src/inference_engine/data_stats.cpp +++ b/inference-engine/src/inference_engine/data_stats.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/data_stats.h b/inference-engine/src/inference_engine/data_stats.h index b25f1d0..3805156 100644 --- a/inference-engine/src/inference_engine/data_stats.h +++ b/inference-engine/src/inference_engine/data_stats.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/debug.h b/inference-engine/src/inference_engine/debug.h index 8c5df8e..2e9200d 100644 --- a/inference-engine/src/inference_engine/debug.h +++ b/inference-engine/src/inference_engine/debug.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -25,7 +25,6 @@ #include "ie_algorithm.hpp" #ifdef _WIN32 -#include #include #define POSIX_EPOCH_AS_FILETIME 116444736000000000ULL diff --git a/inference-engine/src/inference_engine/description_buffer.hpp b/inference-engine/src/inference_engine/description_buffer.hpp index ae2bf3f..f814aff 100644 --- a/inference-engine/src/inference_engine/description_buffer.hpp +++ b/inference-engine/src/inference_engine/description_buffer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/dll_main.hpp b/inference-engine/src/inference_engine/dll_main.hpp index 2860d03..fa0eefd 100644 --- a/inference-engine/src/inference_engine/dll_main.hpp +++ b/inference-engine/src/inference_engine/dll_main.hpp @@ -1,7 +1,7 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // - +// dllmain.cpp : Defines the entry point for the DLL application. 
#pragma once #ifdef _WIN32 diff --git a/inference-engine/src/inference_engine/exec_graph_info.hpp b/inference-engine/src/inference_engine/exec_graph_info.hpp new file mode 100644 index 0000000..633d27f --- /dev/null +++ b/inference-engine/src/inference_engine/exec_graph_info.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ExecGraphInfoSerialization { +/** +* @brief Executable Graph Info is represented in ICNNNetwork format with general CNNLayer nodes inside +* including connections between the nodes. Each node describes an executable hardware-specific +* primitive and stores its parameters within CNNLayer::params map. +* There is a list of general keys for the parameters map. +*/ + +/** + * @brief A general key for CNNLayer::params map. Used to get a string of layer names separated by a comma + * from the original IR, which were fused/merged to the current executable primitive. + */ +static const char ORIGIN_NAMES[] = "originalFusedLayersNames"; +/** + * @brief A general key for CNNLayer::params map. Used to get a type of the executable primitive. + */ +static const char IMPL_TYPE[] = "primitiveType"; +/** + * @brief A general key for CNNLayer::params map. Used to get a precision of the executable primitive. + */ +static const char PRECISION[] = "precision"; +/** + * @brief A general key for CNNLayer::params map. Used to get value of execution time of the executable primitive. + */ +static const char PERF_COUNTER[] = "execTimeMcs"; +} // namespace ExecGraphInfoSerialization \ No newline at end of file diff --git a/inference-engine/src/inference_engine/file_utils.cpp b/inference-engine/src/inference_engine/file_utils.cpp index 7b38b9f..b76c2b7 100644 --- a/inference-engine/src/inference_engine/file_utils.cpp +++ b/inference-engine/src/inference_engine/file_utils.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/file_utils.h b/inference-engine/src/inference_engine/file_utils.h index a3e2276..ce79a9f 100644 --- a/inference-engine/src/inference_engine/file_utils.h +++ b/inference-engine/src/inference_engine/file_utils.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -10,14 +10,17 @@ #include #ifdef _WIN32 -#define _WINSOCKAPI_ -#include -#include +# ifndef NOMINMAX +# define NOMINMAX +# endif +# define _WINSOCKAPI_ +# include +# include #endif #ifdef __MACH__ -#include -#include +# include +# include #endif #include "ie_api.h" diff --git a/inference-engine/src/inference_engine/graph_tools.cpp b/inference-engine/src/inference_engine/graph_tools.cpp index e123c75..5c20edd 100644 --- a/inference-engine/src/inference_engine/graph_tools.cpp +++ b/inference-engine/src/inference_engine/graph_tools.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -27,4 +27,26 @@ std::vector CNNNetSortTopologically(const ICNNNetwork & network) { } } // namespace details + +void CNNNetSubstituteLayer(InferenceEngine::ICNNNetwork &network, + const InferenceEngine::CNNLayerPtr &layer, + const InferenceEngine::CNNLayerPtr &newLayer) { + IE_ASSERT(layer->name == newLayer->name); + + // Redirect srd data + for (auto& src : layer->insData) { + src.lock()->getInputTo()[layer->name] 
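(Editorial sketch - not part of the patch.) The four keys in exec_graph_info.hpp are plain lookups in each node's params map, so a perf report over the returned graph takes only a few lines. A hypothetical dump loop (the node iteration is schematic; the key strings come from the header above):

#include <iostream>
#include <map>
#include <string>
#include <vector>

struct ExecNode {  // stand-in for a CNNLayer of the executable graph
    std::string name;
    std::map<std::string, std::string> params;
};

void dumpExecGraph(const std::vector<ExecNode>& nodes) {
    for (const auto& n : nodes) {
        auto get = [&n](const char* key) {
            auto it = n.params.find(key);
            return it == n.params.end() ? std::string("n/a") : it->second;
        };
        // keys defined in exec_graph_info.hpp
        std::cout << n.name
                  << " impl=" << get("primitiveType")
                  << " prec=" << get("precision")
                  << " time(mcs)=" << get("execTimeMcs")
                  << " fused=" << get("originalFusedLayersNames") << "\n";
    }
}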
= newLayer; + } + newLayer->insData = layer->insData; + + // Redirect dst data + for (auto& dst : layer->outData) { + dst->creatorLayer = newLayer; + } + newLayer->outData = layer->outData; + + network.addLayer(newLayer); +} + + } // namespace InferenceEngine \ No newline at end of file diff --git a/inference-engine/src/inference_engine/graph_tools.hpp b/inference-engine/src/inference_engine/graph_tools.hpp index bce8a70..2207181 100644 --- a/inference-engine/src/inference_engine/graph_tools.hpp +++ b/inference-engine/src/inference_engine/graph_tools.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -263,14 +263,15 @@ inline std::string CNNNetPrevLayerName(const InferenceEngine::DataWeakPtr & dat * @param idx - index in previous layer collection * @param layer */ - inline bool CNNNetHasPrevLayer(const InferenceEngine::CNNLayer* layer, int idx = 0) { - IE_ASSERT(layer != nullptr); - if (layer->insData.empty() || layer->insData.size() <= idx) { - return false; - } - auto prevData = layer->insData[idx].lock(); - return !!prevData->getCreatorLayer().lock(); +inline bool CNNNetHasPrevLayer(const InferenceEngine::CNNLayer* layer, int idx = 0) { + IE_ASSERT(layer != nullptr); + if (layer->insData.empty() || layer->insData.size() <= idx) { + return false; } + auto prevData = layer->insData[idx].lock(); + return !!prevData->getCreatorLayer().lock(); +} + /** * @brief pointer of previous layers * @param idx - index in previous layer collection @@ -499,14 +500,36 @@ inline CNNLayerSet CNNNetGetAllInputLayers(const ICNNNetwork &network) { if (inputs.empty()) return inputLayers; - auto & secondLayers = inputs.begin()->second->getInputData()->getInputTo(); - if (secondLayers.empty()) - return inputLayers; + for (const auto & input : inputs) { + auto &secondLayers = input.second->getInputData()->getInputTo(); - details::UnorderedDFS(allLayers, secondLayers.begin()->second, [&](CNNLayerPtr layer) { - if (layer->insData.empty()) { - inputLayers.insert(layer); - } + if (secondLayers.empty()) + continue; + + details::UnorderedDFS(allLayers, secondLayers.begin()->second, [&](CNNLayerPtr layer) { + if (layer->insData.empty()) { + inputLayers.insert(layer); + } + }, false); + } + return inputLayers; +} + +/** + * @brief returns all layers that are input or memory , searc started from arbitrary location in network + * @param start layer + * @return set of input layers + */ +inline CNNLayerSet CNNNetGetAllInputLayers(CNNLayer* layer) { + CNNLayerSet inputLayers; + std::unordered_set allLayers; + + CNNLayerPtr layerPtr(layer, [](CNNLayer*){}); + + details::UnorderedDFS(allLayers, layerPtr, [&](CNNLayerPtr layer) { + if (layer->insData.empty()) { + inputLayers.insert(layer); + } }, false); return inputLayers; } @@ -703,8 +726,9 @@ inline CNNNetPtr CNNNetCopy(const ICNNNetwork &input) { * @param after, insertion happened after this layer, if after is nullptr, insertion happened after all inputLayers for before layer * @param before, insertion happened before layer, if before is nullptr, insertion happened before all outputLayers of after layer * @param layerToInsert inserted layer + * @param outDataIndex optional parameter. 
In some cases you can narrow or speed up the layer search by specifying the output data index */ -inline void CNNNetworkInsertLayer(CNNLayerPtr after, CNNLayerPtr before, CNNLayerPtr layerToInsert) { +inline void CNNNetworkInsertLayer(CNNLayerPtr after, CNNLayerPtr before, CNNLayerPtr layerToInsert, size_t outDataIndex = 0) { if (after == nullptr && before == nullptr) { THROW_IE_EXCEPTION << "Cannot Insert Layer: before or after layers should be valid layer pointers"; } @@ -713,6 +737,10 @@ inline void CNNNetworkInsertLayer(CNNLayerPtr after, CNNLayerPtr before, CNNLaye if (after != nullptr) { // TODO: only one output data supported for (auto && data : after->outData) { + if (outDataIndex) { + --outDataIndex; + continue; + } for (auto && input : data->inputTo) { if (before != nullptr && input.second.get() != before.get()) continue; @@ -768,4 +796,83 @@ inline void CNNNetworkInsertLayer(CNNLayerPtr after, CNNLayerPtr before, CNNLaye } } +/** + * @brief removes the given layer from the topology; currently only layers with one input data and one output data are supported + */ +inline void CNNNetworkRemoveLayer(CNNLayerPtr layer) { + if (!layer) { + THROW_IE_EXCEPTION << "Cannot remove layer: pointer is NULL"; + } + if (layer->insData.size() != 1) { + THROW_IE_EXCEPTION << "Cannot remove layer : "<< layer->name <<" that does not have exactly 1 input"; + } + if (layer->outData.size() != 1) { + THROW_IE_EXCEPTION << "Cannot remove layer : "<< layer->name <<" that does not have exactly 1 output"; + } + + auto isp = layer->insData.front().lock(); + if (!isp) { + THROW_IE_EXCEPTION << "Cannot remove layer : "<< layer->name <<" cannot get its input"; + } + // if the input dimensions do not equal the output dimensions, a shape-infer or reshape layer is required, so we skip those cases + auto osp = layer->outData.front(); + if (isp->getDims() != osp->getDims()) { + THROW_IE_EXCEPTION << "Cannot remove layer : "<< layer->name <<" its input layer(" + << isp->getName() << ") and output(" << osp->getName() << ") have incompatible dimensions"; + } + + // remove isp->layer connection + for (auto i = isp->getInputTo().begin(); i != isp->getInputTo().end(); i++) { + if (i->second.get() == layer.get()) { + isp->getInputTo().erase(i); + break; + } + } + + // remove osp->layer connection + for (auto && outData : osp->getInputTo()) { + for (auto i = outData.second->insData.begin(); i != outData.second->insData.end(); i++) { + auto insData = i->lock(); + if (!insData) { + THROW_IE_EXCEPTION << "Cannot remove layer : "<< layer->name <<", its output layer(" << + outData.first << ") has an invalid input configuration"; + } + auto creator = insData->getCreatorLayer().lock(); + if (!creator) { + THROW_IE_EXCEPTION << "Cannot remove layer : "<< layer->name <<", its output layer(" << + outData.first << ") has an invalid input configuration"; + } + + // found the layer that needs to be removed + if (creator.get() == layer.get()) { + outData.second->insData.erase(i); + break; + } + } + } + + // add isp->osp connections + for (auto && outData : osp->getInputTo()) { + // new synthetic name to avoid duplicates in the map + isp->getInputTo()[layer->name + "_" + outData.first] = outData.second; + } + + // add osp->isp connections + for (auto && outData : osp->getInputTo()) { + outData.second->insData.push_back(isp); + } + + // removing the layer->osp and layer->isp connections is not necessary - the layer deletes them itself +} + +/** + * @brief Replaces layer with newLayer in the network + * @param network - graph containing the layer + * @param layer - the layer to be replaced + * @param newLayer - new
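(Editorial sketch - not part of the patch.) The net effect of CNNNetworkRemoveLayer is a splice: the consumers of the removed layer's output are rewired onto its input data. On a toy adjacency map:

#include <cassert>
#include <map>
#include <set>
#include <string>

int main() {
    // before: A -> B -> {C, D}; B has one input and one output (the supported case)
    std::map<std::string, std::set<std::string>> edges = {
        {"A", {"B"}}, {"B", {"C", "D"}}};
    auto consumers = edges["B"];  // splice B out
    edges.erase("B");
    edges["A"] = consumers;
    // after: A -> {C, D}
    assert((edges["A"] == std::set<std::string>{"C", "D"}));
    return 0;
}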
layer instead of layer; it must have same name like a layer for replace + */ +void CNNNetSubstituteLayer(InferenceEngine::ICNNNetwork &network, + const InferenceEngine::CNNLayerPtr &layer, + const InferenceEngine::CNNLayerPtr &newLayer); + } // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/graph_transformer.cpp b/inference-engine/src/inference_engine/graph_transformer.cpp index af0dd63..8c40f08 100644 --- a/inference-engine/src/inference_engine/graph_transformer.cpp +++ b/inference-engine/src/inference_engine/graph_transformer.cpp @@ -1,28 +1,318 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // -#include +#include +#include
+#include
#include "graph_transformer.h" +#include "cnn_network_impl.hpp" +#include "blob_factory.hpp" +#include "graph_tools.hpp" +#include +#include +#include +#include +#include +#include namespace InferenceEngine { -void replaceLayerWithNewLayer(ICNNNetwork &network, const CNNLayerPtr &layer, const CNNLayerPtr &newLayer) { - assert(layer->name == newLayer->name); +std::vector +ConstTransformer::foldConstSubgraphsInternal(const std::map& constLayers, const BlobMap& constData, + const std::vector& sortedLayers) { + std::vector remainingConstLayers; + for (const auto& layer : sortedLayers) { + if (constLayers.find(layer->name) != constLayers.end()) { + // const layer doesn't need parent connections -> erase them + for (const auto& insData : layer->insData) { + auto& inputTo = insData.lock()->getInputTo(); + inputTo.erase(layer->name); + // Notr: to resolve corner case above layers can be marked as const with const data, just to be removed properly.. + // and maybe this logic wouldn't be needed + if (inputTo.empty()) { + auto creator = insData.lock()->creatorLayer.lock(); + auto it = std::find(creator->outData.begin(), creator->outData.end(), insData.lock()); + if (it != creator->outData.end()) { + network->removeData((*it)->name); + creator->outData.erase(it); + } + } + } + layer->insData.clear(); - // Redirect srd data - for (auto& src : layer->insData) { - src.lock()->getInputTo()[layer->name] = newLayer; + if (constLayers.at(layer->name)) { + for (const auto& outData : layer->outData) { + for (const auto& inputTo : outData->getInputTo()) { + CNNLayerPtr inputToLayer; + std::string inputToName; + std::tie(inputToName, inputToLayer) = inputTo; + auto& insData = inputToLayer->insData; + auto insDataIt = std::find_if(insData.begin(), insData.end(), + [&outData](const DataWeakPtr& current) { + return current.lock()->name == outData->name; + }); + // remove connection with const data, because for const child it's not needed, for dynamic - new one will be created + if (insDataIt != insData.end()) { + insDataIt = inputToLayer->insData.erase(insDataIt); + } + } + network->removeData(outData->name); + } + network->removeLayer(layer->name); + } else { + // if only one output data is not const - do nothing, otherwise - run procedure below + // note: multiple const output data requires multiple layers with blob["custom"] to keep const data + bool keepConstData = layer->outData.size() == 1; + if (keepConstData) { + auto outData = layer->outData[0]; + for (const auto& inputTo : outData->getInputTo()) { + if (constLayers.find(inputTo.first) != constLayers.end()) { + keepConstData = false; + } + } + } + if (keepConstData) { + if (!constLayers.at(layer->name)) { + auto outData = layer->outData[0]; + if (layer->blobs.find("custom") == layer->blobs.end()) { + // if there's no const data - set it + const auto it = constData.find(outData->name); + if (it != constData.end()) { + layer->blobs["custom"] = it->second; + } + } + if (layer->type != "Const") { + // layer was calculated during the Const Propagation, need to hide its semantic (type, params) + LayerParams layerParams{layer->name + "__" + outData->name + "__Const", "Const", + layer->precision}; + auto newLayer = std::make_shared(layerParams); + for (const auto& data : layer->outData) { + data->creatorLayer = newLayer; + } + newLayer->outData = layer->outData; + newLayer->blobs["custom"] = layer->blobs["custom"]; + network->removeLayer(layer->name); + network->addLayer(newLayer); + remainingConstLayers.push_back(newLayer->name); + } else { + // Layer with 
`Const` type should be also considered on trimming shape inputs + remainingConstLayers.push_back(layer->name); + } + } + } else { + for (const auto& outData : layer->outData) { + for (const auto& inputTo : outData->getInputTo()) { + CNNLayerPtr inputToLayer; + std::string inputToName; + std::tie(inputToName, inputToLayer) = inputTo; + auto& insData = inputToLayer->insData; + auto insDataIt = std::find_if(insData.begin(), insData.end(), + [&outData](const DataWeakPtr& current) { + return current.lock()->name == outData->name; + }); + // remove connection with const data, because for const child it's not needed, for dynamic - new one will be created + if (insDataIt != insData.end()) { + insDataIt = inputToLayer->insData.erase(insDataIt); + } + if (constLayers.find(inputToName) == constLayers.end()) { + // next layer is not const, need to attach const data to it via blobs["custom"] of new Const layer + LayerParams layerParams{layer->name + "__" + outData->name + "__Const", "Const", + layer->precision}; + auto newLayer = std::make_shared(layerParams); + remainingConstLayers.push_back(newLayer->name); + const auto it = constData.find(outData->name); + if (it != constData.end()) { + newLayer->blobs["custom"] = it->second; + } + auto newData = std::make_shared(outData->name + "__" + inputToName, + outData->getTensorDesc()); + newData->creatorLayer = newLayer; + newData->inputTo[inputToName] = inputToLayer; + newLayer->outData = {newData}; + network->addLayer(newLayer); + network->getData(newData->name) = newData; + inputToLayer->insData.insert(insDataIt, newData); + } + } + } + for (const auto& data : layer->outData) { + network->removeData(data->name); + } + network->removeLayer(layer->name); + } + } + } } - newLayer->insData = layer->insData; + return remainingConstLayers; +} + +const std::map ConstTransformer::getConstLayers(const std::vector& sortedLayers) { + std::map mapConstLayers; + // collect all const layers, which inputs are const layers. 
+ for (const auto& layer : sortedLayers) { + // Layers with "Shape" and "Const" type are Const by definition + if (layer->type == "Shape" || layer->type == "Const") { + mapConstLayers[layer->name] = false; + } else { + bool isAllInputsConst = true; + for (auto const& data : layer->insData) { + auto creatorName = data.lock()->creatorLayer.lock()->name; + if (mapConstLayers.find(creatorName) == mapConstLayers.end()) { + isAllInputsConst = false; + } + } + if (isAllInputsConst && !layer->insData.empty()) mapConstLayers[layer->name] = false; + } + } + // Add mark for const layers, if it's used for shape taking layers as second input + // true - is used and can be deleted from graph, as no influence on data, false - opposite + std::map mapVisitedLayers = mapConstLayers; + for (auto rit = sortedLayers.rbegin(); rit != sortedLayers.rend(); rit++) { + auto currentLayer = (*rit); + std::string currentLayerName = currentLayer->name; + bool isCurrentConst = mapConstLayers.find(currentLayerName) != mapConstLayers.end(); + for (int i = 0; i < currentLayer->insData.size(); i++) { + std::string creatorName; + if (currentLayer->insData[i].lock()) { + auto creator = currentLayer->insData[i].lock()->creatorLayer.lock(); + if (creator) { + creatorName = creator->name; + } + } + bool isCreatorConst = mapConstLayers.find(creatorName) != mapConstLayers.end(); + if (isCreatorConst) { + // mark second const input of shape taking layers (Reshape, Interp..), if they wasn't visited before + if ((i == 1) && (shapeTaking.find(currentLayer->type)) != shapeTaking.end()) { + if (!mapConstLayers[creatorName]) { + if (!mapVisitedLayers.at(creatorName)) { + mapConstLayers[creatorName] = true; + } + } + } else { + if (isCurrentConst) { + if (mapConstLayers.at(currentLayerName)) { + if (!mapConstLayers[creatorName]) { + if (!mapVisitedLayers.at(creatorName)) { + mapConstLayers[creatorName] = true; + } + } + } else { + mapConstLayers[creatorName] = false; + } + } else { + mapConstLayers[creatorName] = false; + } + } + } + mapVisitedLayers[creatorName] = true; + } + mapVisitedLayers[currentLayerName] = true; + } + return mapConstLayers; +} + +const BlobMap ConstTransformer::getConstData(const std::map& constLayers, const std::vector& sortedLayers) { + ShapeInfer::ConstInferHolder holder; + BlobMap constData; + auto getInputBlobs = [&constData](const std::vector& insData, + bool isForShape) -> std::vector { + std::vector inputBlobs; + // special case of Const layers: no inputs, no input blobs + if (insData.empty()) { + return {}; + } + for (const auto& data : insData) { + std::string dataName = data.lock()->name; + if (constData.find(dataName) != constData.end()) { + // get blobs, inferred before + inputBlobs.push_back(constData.at(dataName)); + } else { + // special case of Shape layer: no input data, but blob contains info about dimensions, layout and etc... 
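(Editorial sketch - not part of the patch.) At its core getConstLayers is classic constant-reachability marking: a layer is const if it is a Shape/Const seed or if all of its inputs are const (the second, reverse pass that flags shape-only inputs is omitted here). The first pass over a toy graph:

#include <set>
#include <string>
#include <vector>

struct Node {
    std::string name;
    std::vector<std::string> inputs;
    bool isConstSeed = false;  // like layers of type "Shape" or "Const"
};

// Forward pass over a topologically sorted graph, cf. getConstLayers.
std::set<std::string> markConst(const std::vector<Node>& topoSorted) {
    std::set<std::string> constNodes;
    for (const auto& n : topoSorted) {
        bool allInputsConst = !n.inputs.empty();
        for (const auto& in : n.inputs)
            if (!constNodes.count(in)) allInputsConst = false;
        if (n.isConstSeed || allInputsConst) constNodes.insert(n.name);
    }
    return constNodes;
}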
+                auto blob = make_blob_with_precision(data.lock()->getTensorDesc());
+                inputBlobs.push_back(blob);
+            }
+        }
+        return inputBlobs;
+    };
+
+    auto getOutputBlobs = [](const std::vector<DataPtr>& outData) -> std::vector<Blob::Ptr> {
+        std::vector<Blob::Ptr> outputBlobs;
+        for (const auto& data : outData) {
+            auto blob = make_blob_with_precision(data->getTensorDesc());
+            blob->allocate();
+            outputBlobs.push_back(blob);
+        }
+        return outputBlobs;
+    };
-    // Redirect dst data
-    for (auto& dst : layer->outData) {
-        dst->creatorLayer = newLayer;
+    for (const auto& layer : sortedLayers) {
+        if (constLayers.find(layer->name) != constLayers.end()) {
+            std::string layerName = layer->name;
+            bool isForShape = constLayers.at(layerName);
+            CNNNetwork cnnNetwork(network);
+            auto layer = cnnNetwork.getLayerByName(layerName.c_str());
+            auto implPtr = holder.getConstInferImpl(layer->type);
+            if (!implPtr && !isForShape)
+                THROW_IE_EXCEPTION << "Failed to find reference implementation for `"
+                                   + layer->name + "` Layer with `" + layer->type + "` Type during constant propagation";
+            if (!isForShape) {
+                auto outputBlobs = getOutputBlobs(layer->outData);
+                implPtr->infer(getInputBlobs(layer->insData, isForShape), layer->params, layer->blobs, outputBlobs);
+                for (int i = 0; i < layer->outData.size(); i++) {
+                    std::string dataName = layer->outData[i]->name;
+                    auto shapes = layer->outData[i]->getTensorDesc().getDims();
+                    outputBlobs[i]->Reshape(SizeVector(shapes.rbegin(), shapes.rend()),
+                                            TensorDesc::getLayoutByDims(shapes));
+                    constData[dataName] = outputBlobs[i];
+                }
+            }
+        }
+    }
 }
-    newLayer->outData = layer->outData;
+    return constData;
+}
+
+void ConstTransformer::trimShapeInputs(const std::vector<std::string>& constLayers) {
+    for (const auto& layerName : constLayers) {
+        auto layer = cnnNetwork.getLayerByName(layerName.c_str());
+        if (layer->outData.size() == 1 && layer->type == "Const" && layer->insData.empty()) {
+            auto constData = layer->outData[0];
+            std::map<std::string, CNNLayerPtr> inputToMap = constData->getInputTo();
+            for (const auto& inputTo : inputToMap) {
+                CNNLayerPtr inputToLayer = inputTo.second;
+                if (shapeTaking.find(inputToLayer->type) != shapeTaking.end()) {
+                    auto& insData = inputToLayer->insData;
+                    auto it = std::find_if(insData.begin(), insData.end(),
+                                           [&constData](const DataWeakPtr& current) {
+                                               return current.lock()->name == constData->name;
+                                           });
+                    if (it != insData.end() && std::distance(insData.begin(), it) == 1) {
+                        inputToLayer->insData.erase(it);
+                        constData->getInputTo().erase(inputTo.first);
+                    }
+                }
+            }
+            if (constData->inputTo.empty()) {
+                network->removeData(constData->name);
+                network->removeLayer(layer->name);
+            }
+        }
+    }
+}
+
+void ConstTransformer::foldConstSubgraphs() {
+    auto sortedLayers = details::CNNNetSortTopologically(*network);
+    auto constLayers = getConstLayers(sortedLayers);
+    auto constData = getConstData(constLayers, sortedLayers);
+    foldConstSubgraphsInternal(constLayers, constData, sortedLayers);
+}
-    network.addLayer(newLayer);
+void ConstTransformer::fullTrim() {
+    auto sortedLayers = details::CNNNetSortTopologically(*network);
+    auto constMapLayers = getConstLayers(sortedLayers);
+    auto constData = getConstData(constMapLayers, sortedLayers);
+    auto constLayers = foldConstSubgraphsInternal(constMapLayers, constData, sortedLayers);
+    trimShapeInputs(constLayers);
+}
}  // namespace InferenceEngine
diff --git a/inference-engine/src/inference_engine/graph_transformer.h b/inference-engine/src/inference_engine/graph_transformer.h
index 9d8014d..d984535 100644
--- a/inference-engine/src/inference_engine/graph_transformer.h
+++
b/inference-engine/src/inference_engine/graph_transformer.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -9,16 +9,64 @@ #pragma once +#include +#include +#include #include +#include
+#include "cnn_network_impl.hpp" namespace InferenceEngine { /** - * @brief Replaces layer with newLayer in network - * @param network - graph containing the layer - * @param layer - layer which need to replace - * @param newLayer - new layer instead of layer; it must have same name like a layer for replace + * @brief TBD */ -void replaceLayerWithNewLayer(ICNNNetwork &network, const CNNLayerPtr &layer, const CNNLayerPtr &newLayer); +class INFERENCE_ENGINE_API_CLASS(ConstTransformer) { +public: + explicit ConstTransformer(details::CNNNetworkImpl* _network) { + if (!_network) THROW_IE_EXCEPTION << "[ERROR]: Failed to init ConstTransformer with null pointer of network"; + network = _network; + cnnNetwork = CNNNetwork(network); + } + + /** + * @brief calculates const layers, combines const subgraph into a single const layers + */ + void foldConstSubgraphs(); + + /** + * @brief folds Const Subgraphs and removes second input of Reshape-like layers (Interp, Gather, Resample, ...) + */ + void fullTrim(); + +protected: + /** + * @brief collect all const layers with marking if it defines shape (1 - for shape, 0 - otherwise) + */ + virtual const std::map getConstLayers(const std::vector& sortedLayers); + + /** + * @brief TBD + */ + virtual const BlobMap + getConstData(const std::map& constLayers, const std::vector& sortedLayers); + + /** + * @brief TBD + */ + virtual std::vector + foldConstSubgraphsInternal(const std::map& constLayers, const BlobMap& constData, + const std::vector& sortedLayers); + + /** + * @brief TBD + */ + virtual void trimShapeInputs(const std::vector& constLayers); + +private: + const details::caseless_set shapeTaking = {"Reshape", "Resample", "Interp"}; + details::CNNNetworkImpl* network; + CNNNetwork cnnNetwork; +}; } // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/ie_algorithm.hpp b/inference-engine/src/inference_engine/ie_algorithm.hpp index d0c8750..d5662e1 100644 --- a/inference-engine/src/inference_engine/ie_algorithm.hpp +++ b/inference-engine/src/inference_engine/ie_algorithm.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -38,5 +38,11 @@ auto product(TIterator beg, TIterator en) -> typename std::remove_reference::type>(1), std::multiplies::type>()); } + +inline void clipping(int* idx, const int min, const int max) { + (*idx) = ((*idx) > min) ? (*idx) : min; + (*idx) = ((*idx) < max) ? 
(*idx) : (max - 1); +} + } // namespace details -} // namespace InferenceEngine \ No newline at end of file +} // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/ie_blob_common.cpp b/inference-engine/src/inference_engine/ie_blob_common.cpp index ca991c7..7098ca2 100644 --- a/inference-engine/src/inference_engine/ie_blob_common.cpp +++ b/inference-engine/src/inference_engine/ie_blob_common.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/ie_blob_proxy.hpp b/inference-engine/src/inference_engine/ie_blob_proxy.hpp index b770590..cb0615b 100644 --- a/inference-engine/src/inference_engine/ie_blob_proxy.hpp +++ b/inference-engine/src/inference_engine/ie_blob_proxy.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/ie_cnn_layer_builder.cpp b/inference-engine/src/inference_engine/ie_cnn_layer_builder.cpp new file mode 100644 index 0000000..7e015db --- /dev/null +++ b/inference-engine/src/inference_engine/ie_cnn_layer_builder.cpp @@ -0,0 +1,96 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +using namespace InferenceEngine; + +std::map Builder::convertParameters2Strings(const std::map& parameters) { + std::map oldParams; + for (const auto& param : parameters) { + // skip blobs and ports + if (param.second.is() || param.second.is() || param.second.is>() + || param.second.is()) + continue; + if (param.second.is() || param.second.is>()) { + oldParams[param.first] = Builder::convertParameter2String(param.second); + } else if (param.second.is() || param.second.is>()) { + oldParams[param.first] = Builder::convertParameter2String(param.second); + } else if (param.second.is() || param.second.is>()) { + oldParams[param.first] = Builder::convertParameter2String(param.second); + } else if (param.second.is() || param.second.is>()) { + oldParams[param.first] = Builder::convertParameter2String(param.second); + } else if (param.second.is() || param.second.is>()) { + oldParams[param.first] = Builder::convertParameter2String(param.second); + } else if (param.second.is() || param.second.is>()) { + oldParams[param.first] = Builder::convertParameter2String(param.second); + } else { + THROW_IE_EXCEPTION << "Parameter " << param.first << " has unsupported parameter type!"; + } + } + return oldParams; +} + +Builder::Layer Builder::builderFromCNNLayer(const CNNLayerPtr& cnnLayer) { + Builder::Layer layer(cnnLayer->type, cnnLayer->name); + std::vector inputPorts; + for (const auto& data : cnnLayer->insData) { + auto lockedData = data.lock(); + if (!lockedData) + continue; + inputPorts.emplace_back(lockedData->getTensorDesc().getDims()); + } + + std::vector outputPorts; + for (const auto& data : cnnLayer->outData) { + outputPorts.emplace_back(data->getTensorDesc().getDims()); + } + + size_t inputsCount = inputPorts.size(); + std::map blobs = cnnLayer->blobs; + if (blobs.find("weights") != blobs.end()) { + auto port = Port(); + port.setParameter("type", "weights"); + inputPorts.push_back(port); + } + if (blobs.find("biases") != blobs.end()) { + if (inputsCount == inputPorts.size()) { + auto port = Port(); + port.setParameter("type", "weights"); + inputPorts.push_back(port); + } + + auto port = Port(); + port.setParameter("type", "biases"); + 
inputPorts.push_back(port); + } + for (const auto& it : blobs) { + if (it.first == "weights" || it.first == "biases") + continue; + auto port = Port(); + port.setParameter("type", it.first); + inputPorts.emplace_back(port); + } + + std::map params; + for (const auto& it : cnnLayer->params) { + params[it.first] = it.second; + } + + layer.setInputPorts(inputPorts).setOutputPorts(outputPorts).setParameters(params); + + Builder::ConverterRegister::convert(cnnLayer, layer); + + return layer; +} + +Builder::ConverterRegister::ConverterRegister(const std::string& type, const std::function& converter) { + if (getConvertersHolder().converters.find(type) == getConvertersHolder().converters.end()) + getConvertersHolder().converters[type] = converter; +} + +Builder::ConvertersHolder &Builder::ConverterRegister::getConvertersHolder() { + static Builder::ConvertersHolder holder; + return holder; +} diff --git a/inference-engine/src/inference_engine/ie_cnn_layer_builder.h b/inference-engine/src/inference_engine/ie_cnn_layer_builder.h index 8cad3ca..85d058c 100644 --- a/inference-engine/src/inference_engine/ie_cnn_layer_builder.h +++ b/inference-engine/src/inference_engine/ie_cnn_layer_builder.h @@ -1,20 +1,73 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once #include
-#include
+#include
+#include
 #include
 #include
 #include
 #include
+#include
+#include
 namespace InferenceEngine {
 namespace Builder {
+template <class T>
+inline std::string convertParameter2String(const Parameter& parameter) {
+    if (parameter.is<std::vector<T>>()) {
+        std::vector<T> params = parameter.as<std::vector<T>>();
+        std::string result;
+        for (const auto& param : params) {
+            if (!result.empty())
+                result += ",";
+            result += convertParameter2String<T>(param);
+        }
+        return result;
+    }
+    return std::to_string(parameter.as<T>());
+}
+template<>
+inline std::string convertParameter2String<std::string>(const Parameter& parameter) {
+    return parameter.as<std::string>();
+}
+
+std::map<std::string, std::string> convertParameters2Strings(const std::map<std::string, Parameter>& parameters);
+Layer builderFromCNNLayer(const CNNLayerPtr& cnnLayer);
+
+struct ConvertersHolder {
+    details::caseless_map<std::string, std::function<void(const CNNLayerPtr&, Layer&)>> converters;
+};
+
+/**
+ * @brief This class registers layer converters
+ */
+class ConverterRegister {
+public:
+    /**
+     * @brief The constructor registers a new layer converter
+     * @param type Layer type
+     * @param converter Layer converter
+     */
+    explicit ConverterRegister(const std::string& type, const std::function<void(const CNNLayerPtr&, Layer&)>& converter);
+
+    static void convert(const CNNLayerPtr& cnnLayer, Layer& layer) {
+        if (getConvertersHolder().converters.find(layer.getType()) != getConvertersHolder().converters.end())
+            getConvertersHolder().converters[layer.getType()](cnnLayer, layer);
+    }
+
+private:
+    static ConvertersHolder& getConvertersHolder();
+};
+
+#define REG_CONVERTER_FOR(__type, __converter) \
+static InferenceEngine::Builder::ConverterRegister _reg_converter_##__type(#__type, __converter)
+
 class BaseConverter {
 public:
 explicit BaseConverter(const std::string& type): type(type) {}
@@ -37,20 +90,30 @@ public:
 auto * weightLayerPtr = dynamic_cast<WeightableLayer*>(res.get());
-        for (auto& it : layer->getParameters()->getConstantData()) {
-            res->blobs[it.first] = std::const_pointer_cast(it.second);
+        for (const auto& port : layer->getInputPorts()) {
+            if (port.getParameters().find("type") == port.getParameters().end() ||
+                    port.getData()->getData()->cbuffer() == nullptr)
+                continue;
+            res->blobs[port.getParameters().at("type")] = port.getData()->getData();
             if (weightLayerPtr == nullptr)
                 continue;
-            if (it.first == "weights") {
-                weightLayerPtr->_weights = std::const_pointer_cast(it.second);
-            } else if (it.first == "biases") {
-                weightLayerPtr->_biases = std::const_pointer_cast(it.second);
+            if (port.getParameters().at("type").as<std::string>() == "weights") {
+                weightLayerPtr->_weights = port.getData()->getData();
+            } else if (port.getParameters().at("type").as<std::string>() == "biases") {
+                weightLayerPtr->_biases = port.getData()->getData();
             }
         }
-        for (const auto& it : layer->getParameters()->getParameters()) {
-            res->params[it.first] = it.second;
+        // For constant layers
+        for (auto& it : layer->getParameters()) {
+            if (it.second.is<Blob::CPtr>()) {
+                res->blobs[it.first] = std::const_pointer_cast<Blob>(it.second.as<Blob::CPtr>());
+            } else if (it.second.is<Blob::Ptr>()) {
+                res->blobs[it.first] = it.second.as<Blob::Ptr>();
+            }
         }
+
+        res->params = convertParameters2Strings(layer->getParameters());
         return res;
     }
@@ -75,13 +138,13 @@ public:
 {"tanh", std::make_shared>("TanH")},
 };
-        auto typeIt = layer->getParameters()->getParameters().find("type");
-        if (typeIt == layer->getParameters()->getParameters().end())
+        auto typeIt = layer->getParameters().find("type");
+        if (typeIt == layer->getParameters().end())
             THROW_IE_EXCEPTION << "Unsupported Activation layer.
Type is unknown."; auto activationBuilder = activationCreators.find(typeIt->second); if (activationBuilder == activationCreators.end()) { - THROW_IE_EXCEPTION << "Unsupported Activation layer type: " << typeIt->second.asString(); + THROW_IE_EXCEPTION << "Unsupported Activation layer type: " << typeIt->second.as(); } auto activation = activationBuilder->second->createLayer(layer, precision); @@ -98,5 +161,28 @@ public: } }; +class RNNSequenceConverter: public BaseConverter { +public: + RNNSequenceConverter(): BaseConverter("RNN") {} + + CNNLayer::Ptr createLayer(const std::shared_ptr& layer, Precision precision) override { + auto rnnLayer = LayerConverter("RNN").createLayer(layer, precision); + rnnLayer->type = "RNN"; + std::string type = layer->getType(); + size_t pos = type.find("Sequence"); + if (pos != std::string::npos) + type.erase(pos); + rnnLayer->params["cell_type"] = type; + return rnnLayer; + } + + bool canCreate(const std::string& nodeType) const override { + static const details::caseless_set supportedRnnTypes { + "LSTMSequence", "GRUSequence", "RNNSequence" + }; + return supportedRnnTypes.find(nodeType) != supportedRnnTypes.end(); + } +}; + } // namespace Builder } // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/ie_cnn_net_reader_impl.cpp b/inference-engine/src/inference_engine/ie_cnn_net_reader_impl.cpp index 2db4c2a..12349aa 100644 --- a/inference-engine/src/inference_engine/ie_cnn_net_reader_impl.cpp +++ b/inference-engine/src/inference_engine/ie_cnn_net_reader_impl.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -141,7 +141,7 @@ StatusCode CNNNetReaderImpl::ReadNetwork(pugi::xml_document& xmlDoc) { _version = GetFileVersion(root); if (_version < 1) THROW_IE_EXCEPTION << "deprecated IR version: " << _version; - if (_version > 4) THROW_IE_EXCEPTION << "cannot parse future versions: " << _version; + if (_version > 5) THROW_IE_EXCEPTION << "cannot parse future versions: " << _version; _parser = parserCreator->create(_version); network = _parser->Parse(root); name = network->getName(); diff --git a/inference-engine/src/inference_engine/ie_cnn_net_reader_impl.h b/inference-engine/src/inference_engine/ie_cnn_net_reader_impl.h index fb9bd49..cd92144 100644 --- a/inference-engine/src/inference_engine/ie_cnn_net_reader_impl.h +++ b/inference-engine/src/inference_engine/ie_cnn_net_reader_impl.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/ie_context.cpp b/inference-engine/src/inference_engine/ie_context.cpp index 8f8335b..58d727d 100644 --- a/inference-engine/src/inference_engine/ie_context.cpp +++ b/inference-engine/src/inference_engine/ie_context.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/ie_data.cpp b/inference-engine/src/inference_engine/ie_data.cpp index 7626620..8f91730 100644 --- a/inference-engine/src/inference_engine/ie_data.cpp +++ b/inference-engine/src/inference_engine/ie_data.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -57,7 +57,7 @@ const TensorDesc& Data::getTensorDesc() const { } bool Data::isInitialized() const { - return 
!dims.empty() || !tensorDesc.getDims().empty(); + return !dims.empty() || !tensorDesc.getDims().empty() || layout == SCALAR; } void Data::setDims(const SizeVector &a_dims) { @@ -84,6 +84,14 @@ void Data::setLayout(Layout layout) { this->layout = layout; } +void Data::reshape(const SizeVector &a_dims, Layout a_layout) { + dims = a_dims; + layout = a_layout; + std::reverse(dims.begin(), dims.end()); + + tensorDesc.reshape(a_dims, layout); +} + CNNLayerWeakPtr &Data::getCreatorLayer() { return creatorLayer; } diff --git a/inference-engine/src/inference_engine/ie_device.cpp b/inference-engine/src/inference_engine/ie_device.cpp index 3094414..2090e7f 100644 --- a/inference-engine/src/inference_engine/ie_device.cpp +++ b/inference-engine/src/inference_engine/ie_device.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -57,6 +57,11 @@ FindPluginResponse InferenceEngine::findPlugin(const FindPluginRequest& req) { case TargetDevice::eHETERO: pluginVec.push_back("HeteroPlugin"); break; + case TargetDevice::eKMB: +#ifdef ENABLE_KMB + pluginVec.push_back("kmbPlugin"); +#endif + break; default: THROW_IE_EXCEPTION << "Cannot find plugin for device: " << getDeviceName(req.device); diff --git a/inference-engine/src/inference_engine/ie_format_parser.cpp b/inference-engine/src/inference_engine/ie_format_parser.cpp index 2acd267..57fa00a 100644 --- a/inference-engine/src/inference_engine/ie_format_parser.cpp +++ b/inference-engine/src/inference_engine/ie_format_parser.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -12,7 +12,6 @@ #include #include #include "ie_icnn_network_stats.hpp" -#include "ie_layers_prv.h" using namespace InferenceEngine; using namespace InferenceEngine::details; @@ -335,6 +334,7 @@ CNNNetworkImplPtr FormatParser::Parse(pugi::xml_node& root) { pars_info.inputPorts[i].dims, pars_info.inputPorts[i].precision, TensorDesc::getLayoutByDims(pars_info.inputPorts[i].dims))); + data->setDims(pars_info.inputPorts[i].dims); layer->insData[i] = data; data->inputTo[layer->name] = layer; @@ -354,6 +354,17 @@ CNNNetworkImplPtr FormatParser::Parse(pugi::xml_node& root) { if (!_network->allLayers().size()) THROW_IE_EXCEPTION << "Incorrect model! Network doesn't contain layers."; + size_t inputLayersNum(0); + CaselessEq cmp; + for (const auto& kvp : _network->allLayers()) { + const CNNLayer::Ptr& layer = kvp.second; + if (cmp(layer->type, "Input") || cmp(layer->type, "Const")) + inputLayersNum++; + } + + if (!inputLayersNum && !cmp(root.name(), "body")) + THROW_IE_EXCEPTION << "Incorrect model! 
Network doesn't contain input layers.";
+
 // check all input ports are occupied
 for (const auto& kvp : _network->allLayers()) {
 const CNNLayer::Ptr& layer = kvp.second;
@@ -378,7 +389,10 @@ CNNNetworkImplPtr FormatParser::Parse(pugi::xml_node& root) {
     OutputsDataMap outputsInfo;
     _network->getOutputsInfo(outputsInfo);
     for (auto outputInfo : outputsInfo) {
-        outputInfo.second->setPrecision(Precision::FP32);
+        if (outputInfo.second->getPrecision() != Precision::FP32 &&
+            outputInfo.second->getPrecision() != Precision::I32) {
+            outputInfo.second->setPrecision(Precision::FP32);
+        }
     }
     if (_version == 1) {
@@ -414,11 +428,13 @@ inline Blob::Ptr GetTypedBlobFromSegment(const TBlob<uint8_t>::Ptr& weights, con
 Blob::Ptr FormatParser::GetBlobFromSegment(const TBlob<uint8_t>::Ptr& weights, const WeightSegment& segment) const {
     if (segment.precision == Precision::FP32) {
         return GetTypedBlobFromSegment<float>(weights, segment);
+    } else if (segment.precision == Precision::I32) {
+        return GetTypedBlobFromSegment<int32_t>(weights, segment);
     } else if (segment.precision == Precision::I16 || segment.precision == Precision::Q78 || segment.precision == Precision::FP16) {
         return GetTypedBlobFromSegment<short>(weights, segment);
     } else if (segment.precision == Precision::U8) {
         return GetTypedBlobFromSegment<uint8_t>(weights, segment);
-    } else if (segment.precision == Precision::I8) {
+    } else if (segment.precision == Precision::I8 || segment.precision == Precision::BIN) {
         return GetTypedBlobFromSegment<int8_t>(weights, segment);
     } else {
         THROW_IE_EXCEPTION << "precision " << segment.precision << " is not supported...";
@@ -436,7 +452,18 @@ void FormatParser::SetWeights(const TBlob<uint8_t>::Ptr& weights) {
         WeightableLayer* pWL = dynamic_cast<WeightableLayer*>(kvp.second.get());
         if (pWL != nullptr) {
             if (lprms.blobs.find("weights") != lprms.blobs.end()) {
-                pWL->_weights = GetBlobFromSegment(weights, lprms.blobs["weights"]);
+                if (lprms.prms.type == "BinaryConvolution") {
+                    auto segment = lprms.blobs["weights"];
+                    if (segment.getEnd() > weights->size())
+                        THROW_IE_EXCEPTION << "segment exceeds given buffer limits. Please validate the weights file";
+                    size_t noOfElement = segment.size;
+                    SizeVector w_dims({noOfElement});
+                    typename TBlobProxy<uint8_t>::Ptr binBlob(new TBlobProxy<uint8_t>(Precision::BIN, Layout::C, weights, segment.start, w_dims));
+
+                    pWL->_weights = binBlob;
+                } else {
+                    pWL->_weights = GetBlobFromSegment(weights, lprms.blobs["weights"]);
+                }
                 pWL->blobs["weights"] = pWL->_weights;
             }
             if (lprms.blobs.find("biases") != lprms.blobs.end()) {
@@ -488,10 +515,6 @@ void FormatParser::ParseDims(SizeVector& dims, const pugi::xml_node &parentNode)
         dims.push_back(dim);
     }
-    if (dims.empty()) {
-        THROW_IE_EXCEPTION << "input must have dimensions";
-    }
-
     if (_version == 1) dims.insert(dims.begin(), 1);  // batch dimension: implicit in version 1, already present in version 2
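// Editorial aside (illustrative, not part of the patch): in IR version 1 the batch
// dimension is implicit, so the insert above turns a declared input of
// <dim>3</dim><dim>224</dim><dim>224</dim> into {1, 3, 224, 224}; version 2+ IRs
// already carry the batch dimension explicitly:
//   SizeVector dims = {3, 224, 224};
//   dims.insert(dims.begin(), 1);  // -> {1, 3, 224, 224}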
} @@ -670,6 +693,15 @@ const std::vector >& FormatParser::getCreators() co std::make_shared>("Gemm"), std::make_shared>("Pad"), std::make_shared>("Gather"), + std::make_shared>("StridedSlice"), + std::make_shared>("ShuffleChannels"), + std::make_shared>("DepthToSpace"), + std::make_shared>("SpaceToDepth"), + std::make_shared>("ReverseSequence"), + std::make_shared>("Squeeze"), + std::make_shared>("Unsqueeze"), + std::make_shared>("Range"), + std::make_shared>("Expand"), std::make_shared>("ScaleShift"), std::make_shared>("PReLU"), std::make_shared>("Crop"), @@ -680,7 +712,13 @@ const std::vector >& FormatParser::getCreators() co std::make_shared>("BatchNormalization"), std::make_shared("TensorIterator"), std::make_shared>("LSTMCell"), - std::make_shared>("RNN"), + std::make_shared>("GRUCell"), + std::make_shared>("RNNCell"), + std::make_shared>("RNNSequence"), + std::make_shared>("GRUSequence"), + std::make_shared>("LSTMSequence"), + std::make_shared>("Quantize"), + std::make_shared>("BinaryConvolution"), }; return creators; } diff --git a/inference-engine/src/inference_engine/ie_format_parser.h b/inference-engine/src/inference_engine/ie_format_parser.h index 6820b1e..11e5f26 100644 --- a/inference-engine/src/inference_engine/ie_format_parser.h +++ b/inference-engine/src/inference_engine/ie_format_parser.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/ie_graph_splitter.cpp b/inference-engine/src/inference_engine/ie_graph_splitter.cpp index 630287a..47b5d94 100644 --- a/inference-engine/src/inference_engine/ie_graph_splitter.cpp +++ b/inference-engine/src/inference_engine/ie_graph_splitter.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/ie_graph_splitter.hpp b/inference-engine/src/inference_engine/ie_graph_splitter.hpp index 30e5f37..3252632 100644 --- a/inference-engine/src/inference_engine/ie_graph_splitter.hpp +++ b/inference-engine/src/inference_engine/ie_graph_splitter.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/ie_layer_parsers.cpp b/inference-engine/src/inference_engine/ie_layer_parsers.cpp index 886c759..ca86df6 100644 --- a/inference-engine/src/inference_engine/ie_layer_parsers.cpp +++ b/inference-engine/src/inference_engine/ie_layer_parsers.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -32,6 +32,7 @@ CNNLayer::Ptr ActivationLayerCreator::CreateLayer(pugi::xml_node& node, LayerPar static caseless_map> activationCreators = { {"relu", std::make_shared>("ReLU")}, + {"relu6", std::make_shared>("ReLU6")}, {"prelu", std::make_shared>("PReLU")}, {"clamp", std::make_shared>("Clamp")}, {"elu", std::make_shared>("ELU")}, diff --git a/inference-engine/src/inference_engine/ie_layer_parsers.h b/inference-engine/src/inference_engine/ie_layer_parsers.h index f2a7ce9..5af4a03 100644 --- a/inference-engine/src/inference_engine/ie_layer_parsers.h +++ b/inference-engine/src/inference_engine/ie_layer_parsers.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 
// diff --git a/inference-engine/src/inference_engine/ie_layer_validators.cpp b/inference-engine/src/inference_engine/ie_layer_validators.cpp index b39a054..86248f1 100644 --- a/inference-engine/src/inference_engine/ie_layer_validators.cpp +++ b/inference-engine/src/inference_engine/ie_layer_validators.cpp @@ -1,9 +1,8 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include "ie_layers.h" -#include "ie_layers_prv.h" #include "ie_layer_validators.hpp" #include "debug.h" #include "xml_parse_utils.h" @@ -11,6 +10,8 @@ #include #include #include +#include +#include #include #include @@ -20,6 +21,8 @@ namespace InferenceEngine { using namespace details; using std::vector; +using std::string; +using std::map; template inline bool one_of(T val, P item) { return val == item; } @@ -44,17 +47,21 @@ void CNNLayer::validateLayer() { } struct WeightableParams { - size_t kernel_w, kernel_h, outputs, groups; - bool isKernelFromInput; - - WeightableParams(size_t _outputs, bool _isKernelFromInput, size_t _groups = 0, size_t _kernel_h = 0, - size_t _kernel_w = 0) : outputs(_outputs), isKernelFromInput(_isKernelFromInput), - kernel_h(_kernel_h), kernel_w(_kernel_w), - groups(_groups) {} + std::vector _kernel; + size_t _outputs = 0lu; + size_t _groups = 1lu; + bool _isKernelFromInput = false; + + WeightableParams(size_t outputs, bool isKernelFromInput, size_t groups = 0, const std::vector& kernel = {}) : + _kernel(kernel), + _outputs(outputs), + _groups(groups), + _isKernelFromInput(isKernelFromInput) {} }; void checkWeightable(const std::map& blobs, - const vector& inShapes, WeightableParams params, + const vector& inShapes, + WeightableParams params, const SizeVector& numDims) { if (inShapes.size() != 1) THROW_IE_EXCEPTION << "Number of inputs (" << inShapes.size() << ") is not equal to expected ones (1)"; @@ -75,18 +82,18 @@ void checkWeightable(const std::map& blobs, if (firstInputShape.empty()) THROW_IE_EXCEPTION << "Input shape can't be empty"; - size_t KW = 1, KH = 1, IC, OC; + size_t IC, OC; + std::vector kernel; IC = firstInputShape[1]; - if (params.isKernelFromInput) { - if (firstInputShape.size() == 4) { - KH = firstInputShape[2]; - KW = firstInputShape[3]; - } + if (params._isKernelFromInput) { + for (int i = 1; i <= inputSize - 2; i++) + kernel.push_back(firstInputShape[inputSize - i]); } else { - KH = params.kernel_h; - KW = params.kernel_w; + for (auto k : params._kernel) { + kernel.push_back(k); + } } - OC = params.outputs; + OC = params._outputs; auto it = blobs.find("weights"); if (it != blobs.end()) { // TODO: return with fixing shape infer tests: THROW_IE_EXCEPTION << "Invalid blobs: no weights"; @@ -94,12 +101,22 @@ void checkWeightable(const std::map& blobs, if (weights == nullptr || weights->dims().empty()) THROW_IE_EXCEPTION << "Weights can't be empty"; auto weightsSize = details::product(weights->dims()); - size_t expectedWeightsSize = OC * KW * KH * IC; - if (params.groups) expectedWeightsSize /= params.groups; + size_t expectedWeightsSize = OC * IC; + for (auto k : kernel) { + expectedWeightsSize *= k; + } + if (params._groups) expectedWeightsSize /= params._groups; if (expectedWeightsSize != weightsSize) { - THROW_IE_EXCEPTION << "New shapes " << details::dumpVec(firstInputShape) << " make Kernels(" << KH << "x" - << KW << "), Channels(" << IC << "), Output depth(" << OC << "), Groups(" - << params.groups << ") not matching weights size: " << weightsSize; + std::string ker_str; + for (int i = 0; i < 
params._kernel.size(); i++) { + if (!ker_str.empty()) + ker_str += "x"; + ker_str += std::to_string(kernel[i]); + } + THROW_IE_EXCEPTION << "New shapes " << details::dumpVec(firstInputShape) << " make Kernels(" << ker_str << + "), Channels(" << IC << "), Output depth(" << OC << "), Groups(" + << params._groups << ") not matching weights size: " + << expectedWeightsSize << " vs " << weightsSize; } } @@ -114,6 +131,30 @@ void checkWeightable(const std::map& blobs, } } +void checkDims(const std::vector& shapes, const vector& expected_shape_size) { + for (auto i : shapes) { + if (i.empty()) { + THROW_IE_EXCEPTION << " Failed with invalid shapes: dimension is empty"; + } + auto iter = std::find(expected_shape_size.begin(), expected_shape_size.end(), i.size()); + if (iter == expected_shape_size.end()) { + THROW_IE_EXCEPTION << " Failed with invalid shapes: dimension is invalid"; + } + } +} + +void checkNumOfInput(const std::vector& inShapes, const vector& expected_num_of_shapes) { + bool shape_was_found = false; + for (const auto& i : expected_num_of_shapes) { + if (inShapes.size() == i) { + shape_was_found = true; + } + } + if (!shape_was_found) { + THROW_IE_EXCEPTION << "Number of inputs (" << inShapes.size() << ") is not equal to expected ones"; + } +} + LayerValidators* LayerValidators::getInstance() { if (!_instance) { _instance = new LayerValidators(); @@ -145,19 +186,27 @@ void FullyConnectedValidator::parseParams(CNNLayer* layer) { } void FullyConnectedValidator::checkParams(const CNNLayer* layer) { - // TODO: check that values belong to the scope of the definition according to spec + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << "Layer is not instance of FullyConnectedLayer class"; + } + unsigned int _out_num = casted->GetParamAsUInt("out-size"); } void FullyConnectedValidator::checkCorrespondence(const CNNLayer* layer, const std::map& blobs, const vector& inShapes) const { const auto casted = dynamic_cast(layer); - if (!casted) THROW_IE_EXCEPTION << "Layer is not instance of FullyConnectedLayer class"; - checkWeightable(blobs, inShapes, {casted->_out_num, true, 1}, {4, 2}); + if (!casted) THROW_IE_EXCEPTION << "Layer is not instance of FullyConnected layer class"; + checkWeightable(blobs, inShapes, {casted->_out_num, true, 1}, {2, 4, 5}); } FullyConnectedValidator::FullyConnectedValidator(const std::string& _type) : LayerValidator(_type) {} +void FullyConnectedValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + void CropValidator::parseParams(CNNLayer* layer) { auto casted = dynamic_cast(layer); if (!casted) { @@ -204,9 +253,8 @@ void CropValidator::checkShapes(const CNNLayer* layer, const vector& THROW_IE_EXCEPTION << "Layer is not instance of CropLayer class"; } size_t numInputs = inShapes.size(); - if (numInputs != 1 && numInputs != 2) { - THROW_IE_EXCEPTION << "Crop can take only 1 or 2 inputs, but actually it has: " << numInputs; - } + checkNumOfInput(inShapes, {1, 2}); + auto firstShape = inShapes[0]; size_t shapeSize = firstShape.size(); for (size_t i = 0; i < casted->axis.size(); i++) { @@ -326,21 +374,50 @@ void ConvolutionValidator::checkParams(const CNNLayer* layer) { if (!casted) { THROW_IE_EXCEPTION << "Layer is not instance of ConvolutionLayer class"; } - // TODO: check that values belong to the scope of the definition according to spec + casted->GetParamAsUInt("output"); + + vector kernels = casted->GetParamAsUInts("kernel", {}); + if (kernels.empty()) { + // IR_v == 2 + 
casted->GetParamAsUInt("kernel-x"); + casted->GetParamAsUInt("kernel-y"); + casted->GetParamAsUInt("stride-x", 1u); + casted->GetParamAsUInt("stride-y", 1u); + casted->GetParamAsUInt("pad-x", 0u); + casted->GetParamAsUInt("pad-y", 0u); + casted->GetParamAsUInt("pad-r", casted->_padding[X_AXIS]); + casted->GetParamAsUInt("pad-b", casted->_padding[Y_AXIS]); + casted->GetParamAsUInt("dilation-x", 1u); + casted->GetParamAsUInt("dilation-y", 1u); + } else { + // IR_v > 2 + vector default_0 = vector (casted->_kernel.size(), 0u); + vector default_1 = vector (casted->_kernel.size(), 1u); + casted->GetParamAsUInts("strides", default_1); + casted->GetParamAsUInts("pads_begin", default_0); + casted->GetParamAsUInts("pads_end", default_0); + casted->GetParamAsUInts("dilations", default_1); + } + casted->GetParamAsString("auto_pad", ""); + casted->GetParamAsUInt("group", 1); } void ConvolutionValidator::checkCorrespondence(const CNNLayer* layer, const std::map& blobs, const vector& inShapes) const { auto convLayer = dynamic_cast(layer); - if (!convLayer) THROW_IE_EXCEPTION << "Layer is not instance of ConvolutionLayer class"; - auto version = BaseCreator::version_; - if (version < 3) { - checkWeightable(blobs, inShapes, {convLayer->_out_depth, false, convLayer->_group, convLayer->_kernel[Y_AXIS], convLayer->_kernel[X_AXIS]}, - {4}); - } else if (version == 3) { - // TODO: implement v2 convolution valitation - } + if (!convLayer) + THROW_IE_EXCEPTION << "Layer is not instance of Convolution layer class"; + + std::vector krn; + for (int i = 0; i < convLayer->_kernel.size(); i++) + krn.push_back(convLayer->_kernel[i]); + checkWeightable(blobs, inShapes, {convLayer->_out_depth, false, convLayer->_group, krn}, + {4, 5}); +} + +void ConvolutionValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); } void DeconvolutionValidator::parseParams(CNNLayer* layer) { @@ -352,7 +429,36 @@ void DeconvolutionValidator::parseParams(CNNLayer* layer) { } void DeconvolutionValidator::checkParams(const CNNLayer* layer) { - LayerValidator::checkParams(layer); + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << "Layer is not instance of ConvolutionLayer class"; + } + casted->GetParamAsUInt("output"); + + vector kernels = casted->GetParamAsUInts("kernel", {}); + if (kernels.empty()) { + // IR_v == 2 + casted->GetParamAsUInt("kernel-x"); + casted->GetParamAsUInt("kernel-y"); + casted->GetParamAsUInt("stride-x", 1u); + casted->GetParamAsUInt("stride-y", 1u); + casted->GetParamAsUInt("pad-x", 0u); + casted->GetParamAsUInt("pad-y", 0u); + casted->GetParamAsUInt("pad-r", casted->_padding[X_AXIS]); + casted->GetParamAsUInt("pad-b", casted->_padding[Y_AXIS]); + casted->GetParamAsUInt("dilation-x", 1u); + casted->GetParamAsUInt("dilation-y", 1u); + } else { + // IR_v > 2 + vector default_0 = vector (casted->_kernel.size(), 0u); + vector default_1 = vector (casted->_kernel.size(), 1u); + casted->GetParamAsUInts("strides", default_1); + casted->GetParamAsUInts("pads_begin", default_0); + casted->GetParamAsUInts("pads_end", default_0); + casted->GetParamAsUInts("dilations", default_1); + } + casted->GetParamAsString("auto_pad", ""); + casted->GetParamAsUInt("group", 1); } DeconvolutionValidator::DeconvolutionValidator(const std::string& _type) : ConvolutionValidator(_type) {} @@ -360,10 +466,19 @@ DeconvolutionValidator::DeconvolutionValidator(const std::string& _type) : Convo void DeconvolutionValidator::checkCorrespondence(const CNNLayer* layer, 
const std::map& blobs, const vector& inShapes) const { - auto casted = dynamic_cast(layer); - if (!casted) THROW_IE_EXCEPTION << "Layer is not instance of ConvolutionLayer class"; - checkWeightable(blobs, inShapes, {casted->_out_depth, false, casted->_group, casted->_kernel[Y_AXIS], casted->_kernel[X_AXIS]}, - {4}); + auto deconv_layer = dynamic_cast(layer); + if (!deconv_layer) + THROW_IE_EXCEPTION << "Layer is not instance of Deconvolution layer class"; + + std::vector krn; + for (int i = 0; i < deconv_layer->_kernel.size(); i++) + krn.push_back(deconv_layer->_kernel[i]); + checkWeightable(blobs, inShapes, {deconv_layer->_out_depth, false, deconv_layer->_group, krn}, + {4, 5}); +} + +void DeconvolutionValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); } PoolingValidator::PoolingValidator(const std::string& _type) : LayerValidator(_type) {} @@ -483,6 +598,10 @@ void PoolingValidator::checkParams(const CNNLayer* layer) { // TODO: check that values belong to the scope of the definition according to spec } +void PoolingValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1, 2}); +} + void BatchNormalizationValidator::parseParams(CNNLayer* layer) { auto casted = dynamic_cast(layer); if (!casted) { @@ -492,11 +611,22 @@ void BatchNormalizationValidator::parseParams(CNNLayer* layer) { } void BatchNormalizationValidator::checkParams(const CNNLayer* layer) { - LayerValidator::checkParams(layer); + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << "Layer is not instance of BatchNormalizationLayer class"; + } + float epsilon = casted->GetParamAsFloat("epsilon"); + if (epsilon < 0) { + THROW_IE_EXCEPTION << "The value of BatchNormalization layer epsilon parameter is invalid"; + } } BatchNormalizationValidator::BatchNormalizationValidator(const std::string& _type) : LayerValidator(_type) {} +void BatchNormalizationValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + void PowerValidator::parseParams(CNNLayer* layer) { auto casted = dynamic_cast(layer); if (!casted) { @@ -513,6 +643,10 @@ void PowerValidator::checkParams(const CNNLayer* layer) { PowerValidator::PowerValidator(const std::string& _type) : LayerValidator(_type) {} +void PowerValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + void PReLUValidator::parseParams(CNNLayer* layer) { auto casted = dynamic_cast(layer); if (!casted) { @@ -527,6 +661,10 @@ void PReLUValidator::checkParams(const CNNLayer* layer) { PReLUValidator::PReLUValidator(const std::string& _type) : LayerValidator(_type) {} +void PReLUValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + void ScaleShiftValidator::parseParams(CNNLayer* layer) { auto casted = dynamic_cast(layer); if (!casted) { @@ -543,6 +681,10 @@ void ScaleShiftValidator::checkParams(const CNNLayer* layer) { ScaleShiftValidator::ScaleShiftValidator(const std::string& _type) : LayerValidator(_type) {} +void ScaleShiftValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + void TileValidator::parseParams(CNNLayer* layer) { auto casted = dynamic_cast(layer); if (!casted) { @@ -553,11 +695,23 @@ void TileValidator::parseParams(CNNLayer* layer) { } void TileValidator::checkParams(const CNNLayer* 
layer) { - LayerValidator::checkParams(layer); + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << "Layer is not instance of TileLayer class"; + } + int axis = casted->GetParamAsInt("axis", -1); + int tiles = casted->GetParamAsInt("tiles", -1); + if (axis < 0 && tiles < 0) { + THROW_IE_EXCEPTION << "The value of Tile layer parameters is invalid"; + } } TileValidator::TileValidator(const std::string& _type) : LayerValidator(_type) {} +void TileValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + ReshapeValidator::ReshapeValidator(const std::string& _type) : LayerValidator(_type) {} void ReshapeValidator::parseParams(CNNLayer *layer) { @@ -605,6 +759,36 @@ void EltwiseValidator::parseParams(CNNLayer* layer) { casted->_operation = EltwiseLayer::Prod; } else if (op == "max") { casted->_operation = EltwiseLayer::Max; + } else if (op == "sub") { + casted->_operation = EltwiseLayer::Sub; + } else if (op == "div") { + casted->_operation = EltwiseLayer::Div; + } else if (op == "min") { + casted->_operation = EltwiseLayer::Min; + } else if (op == "squared_diff") { + casted->_operation = EltwiseLayer::Squared_diff; + } else if (op == "equal") { + casted->_operation = EltwiseLayer::Equal; + } else if (op == "not_equal") { + casted->_operation = EltwiseLayer::Not_equal; + } else if (op == "less") { + casted->_operation = EltwiseLayer::Less; + } else if (op == "less_equal") { + casted->_operation = EltwiseLayer::Less_equal; + } else if (op == "greater") { + casted->_operation = EltwiseLayer::Greater; + } else if (op == "greater_equal") { + casted->_operation = EltwiseLayer::Greater_equal; + } else if (op == "logical_and") { + casted->_operation = EltwiseLayer::Logical_AND; + } else if (op == "logical_or") { + casted->_operation = EltwiseLayer::Logical_OR; + } else if (op == "logical_xor") { + casted->_operation = EltwiseLayer::Logical_XOR; + } else if (op == "floor_mod") { + casted->_operation = EltwiseLayer::Floor_mod; + } else if (op == "pow") { + casted->_operation = EltwiseLayer::Pow; } else { THROW_IE_EXCEPTION << "Unsupported element wise operation: " << op; } @@ -621,7 +805,17 @@ void EltwiseValidator::parseParams(CNNLayer* layer) { } void EltwiseValidator::checkParams(const CNNLayer* layer) { - LayerValidator::checkParams(layer); + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << "Layer is not instance of EltwiseLayer class"; + } +} + +void EltwiseValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + if (inShapes.empty()) { + THROW_IE_EXCEPTION << "Number of inputs (" << inShapes.size() << + ") of Eltwise layer is zero"; + } } EltwiseValidator::EltwiseValidator(const std::string& _type) : LayerValidator(_type) {} @@ -635,12 +829,13 @@ void ClampValidator::parseParams(CNNLayer* layer) { casted->max_value = casted->GetParamAsFloat("max"); } -void ClampValidator::checkParams(const CNNLayer* layer) { - LayerValidator::checkParams(layer); -} ClampValidator::ClampValidator(const std::string& _type) : LayerValidator(_type) {} +void ClampValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + void ReLUValidator::parseParams(CNNLayer* layer) { auto casted = dynamic_cast(layer); if (!casted) { @@ -652,11 +847,24 @@ void ReLUValidator::parseParams(CNNLayer* layer) { } void ReLUValidator::checkParams(const CNNLayer* layer) { - LayerValidator::checkParams(layer); + auto casted = 
dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << "Layer is not instance of ReLULayer class"; + } + if (!casted->params.empty()) { + float negative_slope = casted->GetParamAsFloat("negative_slope"); + if (negative_slope < 0) { + THROW_IE_EXCEPTION << "The value of ReLU layer negative_slope parameter is invalid"; + } + } } ReLUValidator::ReLUValidator(const std::string& _type) : LayerValidator(_type) {} +void ReLUValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1, 2}); +} + void MVNValidator::parseParams(CNNLayer* layer) { auto casted = dynamic_cast(layer); if (!casted) { @@ -667,11 +875,14 @@ void MVNValidator::parseParams(CNNLayer* layer) { } void MVNValidator::checkParams(const CNNLayer* layer) { - LayerValidator::checkParams(layer); } MVNValidator::MVNValidator(const std::string& _type) : LayerValidator(_type) {} +void MVNValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + void GRNValidator::parseParams(CNNLayer* layer) { auto casted = dynamic_cast(layer); if (!casted) { @@ -686,6 +897,10 @@ void GRNValidator::checkParams(const CNNLayer* layer) { GRNValidator::GRNValidator(const std::string& _type) : LayerValidator(_type) {} +void GRNValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + void SoftMaxValidator::parseParams(CNNLayer* layer) { auto casted = dynamic_cast(layer); if (!casted) { @@ -695,11 +910,22 @@ void SoftMaxValidator::parseParams(CNNLayer* layer) { } void SoftMaxValidator::checkParams(const CNNLayer* layer) { - LayerValidator::checkParams(layer); + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << "Layer is not instance of SoftMaxLayer class"; + } + int axis = casted->GetParamAsInt("axis", 1); + if (axis < 0) { + THROW_IE_EXCEPTION << "The value of SoftMax layer axis parameter is invalid"; + } } SoftMaxValidator::SoftMaxValidator(const std::string& _type) : LayerValidator(_type) {} +void SoftMaxValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + void NormValidator::parseParams(CNNLayer* layer) { auto casted = dynamic_cast(layer); if (!casted) { @@ -714,11 +940,23 @@ void NormValidator::parseParams(CNNLayer* layer) { } void NormValidator::checkParams(const CNNLayer* layer) { - LayerValidator::checkParams(layer); + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << "Layer is not instance of NormLayer class"; + } + float _alpha = casted->GetParamAsFloat("alpha"); + float _beta = casted->GetParamAsFloat("beta"); + if (_alpha < 0 && _beta < 0) { + THROW_IE_EXCEPTION << "The value of Norm layer alpha or beta parameters is invalid"; + } } NormValidator::NormValidator(const std::string& _type) : LayerValidator(_type) {} +void NormValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + SplitValidator::SplitValidator(const std::string& _type) : LayerValidator(_type) {} void SplitValidator::parseParams(CNNLayer* layer) { @@ -733,7 +971,7 @@ void SplitValidator::parseParams(CNNLayer* layer) { if (!out_sizes.empty()) out_sizes += ","; if (static_cast(i->getTensorDesc().getDims().size()) <= casted->_axis) { - THROW_IE_EXCEPTION << "Internal error - dimensions are emtpy"; + THROW_IE_EXCEPTION << "Internal error - dimensions are empty"; } out_sizes += 
std::to_string(i->getTensorDesc().getDims()[casted->_axis]); } @@ -741,19 +979,6 @@ void SplitValidator::parseParams(CNNLayer* layer) { casted->params["out_sizes"] = out_sizes; } -void checkNumOfInput(const std::vector& inShapes, const vector& expected_num_of_shapes) { - bool shape_was_found = false; - for (const auto& i : expected_num_of_shapes) { - if (inShapes.size() == i) { - shape_was_found = true; - } - } - if (!shape_was_found) { - THROW_IE_EXCEPTION << "Number of inputs (" << inShapes.size() << ") is not equal to expected ones"; - } -} - - void SplitValidator::checkParams(const CNNLayer* layer) { LayerValidator::checkParams(layer); std::vector out_sizes = layer->GetParamAsInts("out_sizes", {}); @@ -768,6 +993,19 @@ void SplitValidator::checkShapes(const CNNLayer* layer, const std::vector 3) { + std::vector out_sizes = layer->GetParamAsInts("out_sizes", {}); + size_t sum(0); + for (const auto& size : out_sizes) + sum += size; + if (inShapes.empty() || inShapes[0].size() <= casted->_axis) + THROW_IE_EXCEPTION << "Layer has incorrect input shapes!"; + if (sum != inShapes[0][casted->_axis]) { + THROW_IE_EXCEPTION << "The sum of the dimensions on the axis(" << casted->_axis + << ") is not equal out_sizes: " << details::dumpVec(out_sizes); + } + } } ConcatValidator::ConcatValidator(const std::string& _type) : LayerValidator(_type) {} @@ -781,11 +1019,9 @@ void ConcatValidator::parseParams(CNNLayer* layer) { } void ConcatValidator::checkParams(const CNNLayer* layer) { - LayerValidator::checkParams(layer); } -void ConcatValidator::checkShapes(const CNNLayer* layer, - const std::vector& inShapes) const { +void ConcatValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { if (inShapes.empty()) THROW_IE_EXCEPTION << "Inputs are empty"; @@ -812,9 +1048,8 @@ void ConcatValidator::checkShapes(const CNNLayer* layer, bool eq_part2 = std::equal(firstShape.begin() + axis + 1, firstShape.end(), shape.begin() + axis + 1); if (!(eq_part1 && eq_part2)) - THROW_IE_EXCEPTION << "Invalid inputs for Concat layer: dimensions should match in all" - << "positions except axis (" << axis << ") one" - << ") should match : [" << dumpVec(firstShape) << "] vs [" + THROW_IE_EXCEPTION << "Invalid inputs for Concat layer: dimensions should match in all " + << "positions except axis (" << axis << ") : [" << dumpVec(firstShape) << "] vs [" << dumpVec(shape) <<"]"; } } @@ -843,8 +1078,7 @@ void GemmValidator::checkShapes(const CNNLayer* layer, const vector& } size_t numInputs = inShapes.size(); - if (numInputs != 2 && numInputs != 3) - THROW_IE_EXCEPTION << "Gemm can take only 2 or 3 inputs, but actually it has: " << numInputs; + checkNumOfInput(inShapes, {2, 3}); auto dims0 = inShapes[0]; auto dims1 = inShapes[1]; @@ -879,7 +1113,7 @@ PadValidator::PadValidator(const std::string& _type) : LayerValidator(_type) {} void PadValidator::parseParams(CNNLayer* layer) { auto casted = dynamic_cast(layer); if (!casted) { - THROW_IE_EXCEPTION << "Layer is not instance of PadLayer class"; + THROW_IE_EXCEPTION << layer->name << " Layer is not instance of PadLayer class"; } std::vector pads_begin = casted->GetParamAsUInts("pads_begin"); std::vector pads_end = casted->GetParamAsUInts("pads_end"); @@ -906,7 +1140,7 @@ void PadValidator::parseParams(CNNLayer* layer) { } else if (mode == "symmetric") { casted->pad_mode = PadLayer::Symmetric; } else { - THROW_IE_EXCEPTION << "Unsupported pad mode operation: " << mode; + THROW_IE_EXCEPTION << layer->name << " Unsupported pad mode operation: " << mode; } } @@ 
-917,30 +1151,29 @@ void PadValidator::checkParams(const CNNLayer* layer) {
 void PadValidator::checkShapes(const CNNLayer* layer, const vector<SizeVector>& inShapes) const {
     auto casted = dynamic_cast<const PadLayer*>(layer);
     if (!casted) {
-        THROW_IE_EXCEPTION << "Layer is not instance of PadLayer class";
+        THROW_IE_EXCEPTION << layer->name << " Layer is not instance of PadLayer class";
     }
     size_t numInputs = inShapes.size();
-    if (numInputs != 1)
-        THROW_IE_EXCEPTION << "Pad can take only 1 input, but actually it has: " << numInputs;
+    checkNumOfInput(inShapes, {1});
     if (inShapes[0].size() != casted->pads_begin.size())
-        THROW_IE_EXCEPTION << "Dimensions count mismatch in layer " << layer->name
+        THROW_IE_EXCEPTION << layer->name << " Dimensions count mismatch"
                            << ". Expected: " << casted->pads_begin.size() << " Got: " << inShapes[0].size();
     if (inShapes[0].size() != casted->pads_end.size())
-        THROW_IE_EXCEPTION << "Dimensions count mismatch in layer " << layer->name
+        THROW_IE_EXCEPTION << layer->name << " Dimensions count mismatch"
                            << ". Expected: " << casted->pads_end.size() << " Got: " << inShapes[0].size();
     if (casted->pad_mode == PadLayer::Symmetric || casted->pad_mode == PadLayer::Reflect) {
         for (size_t i = 0; i < inShapes[0].size(); i++) {
             if (inShapes[0][i] < casted->pads_begin[i]) {
-                THROW_IE_EXCEPTION << "Pad can't be grater than input shape in symmetric and reflect modes."
+                THROW_IE_EXCEPTION << layer->name << " Pad can't be greater than input shape in symmetric and reflect modes."
                                    << " For dimension " << i << " pad_begin=" << casted->pads_begin[i]
                                    << " in_shape="<< inShapes[0][i];
             }
             if (inShapes[0][i] < casted->pads_end[i]) {
-                THROW_IE_EXCEPTION << "Pad can't be grater than input shape in symmetric and reflect modes."
+                THROW_IE_EXCEPTION << layer->name << " Pad can't be greater than input shape in symmetric and reflect modes."
<< " For dimension " << i << " pad_end=" << casted->pads_end[i] << " in_shape="<< inShapes[0][i]; } @@ -953,7 +1186,7 @@ GatherValidator::GatherValidator(const std::string& _type) : LayerValidator(_typ void GatherValidator::parseParams(CNNLayer* layer) { auto casted = dynamic_cast(layer); if (!casted) { - THROW_IE_EXCEPTION << "Layer is not instance of GatherLayer class"; + THROW_IE_EXCEPTION << layer->name << " Layer is not instance of GatherLayer class"; } casted->axis = casted->GetParamAsInt("axis", 0); @@ -966,58 +1199,1322 @@ void GatherValidator::checkParams(const CNNLayer* layer) { void GatherValidator::checkShapes(const CNNLayer* layer, const vector& inShapes) const { auto casted = dynamic_cast(layer); if (!casted) { - THROW_IE_EXCEPTION << "Layer is not instance of GatherLayer class"; + THROW_IE_EXCEPTION << layer->name << " Layer is not instance of GatherLayer class"; } size_t numInputs = inShapes.size(); if (numInputs != 2) - THROW_IE_EXCEPTION << "Gather can take only 2 inputs, but actually it has: " << numInputs; + THROW_IE_EXCEPTION << layer->name << " Gather can take only 2 inputs, but actually it has: " << numInputs; - if (casted->axis > 0 && (inShapes[0].size() - casted->axis) < 1) - THROW_IE_EXCEPTION << "Incorrect input dictionary dimensions " << inShapes[0].size() + if (casted->axis > 0 && inShapes[0].size() < (1 + casted->axis)) + THROW_IE_EXCEPTION << layer->name << " Incorrect input dictionary dimensions " << inShapes[0].size() << " and axis number " << casted->axis; else if (casted->axis < 0 && (static_cast(inShapes[0].size()) + casted->axis) < 0) - THROW_IE_EXCEPTION << "Incorrect input dictionary dimensions " << inShapes[0].size() + THROW_IE_EXCEPTION << layer->name << " Incorrect input dictionary dimensions " << inShapes[0].size() << " and axis number " << casted->axis; } -RNNValidator::RNNValidator(const std::string& _type) : LayerValidator(_type) {} +StridedSliceValidator::StridedSliceValidator(const std::string& _type) : LayerValidator(_type) {} -void RNNValidator::parseParams(CNNLayer* layer) { - auto casted = dynamic_cast(layer); - if (!casted) - THROW_IE_EXCEPTION << "Layer is not instance of RNNLayer class"; +void StridedSliceValidator::parseParams(CNNLayer* layer) { + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << layer->name << " Layer is not instance of StridedSlice class"; + } - std::string cell = layer->GetParamAsString("cell_type"); - std::string direction = layer->GetParamAsString("direction", "Forward"); - int axis = layer->GetParamAsInt("axis", 1); + casted->begin_mask = layer->GetParamAsString("begin_mask", ""); + casted->end_mask = layer->GetParamAsString("end_mask", ""); + casted->ellipsis_mask = layer->GetParamAsString("ellipsis_mask", ""); + casted->new_axis_mask = layer->GetParamAsString("new_axis_mask", ""); + casted->shrink_axis_mask = layer->GetParamAsString("shrink_axis_mask", ""); +} - if (!one_of(cell, "LSTM", "RNN", "GRU")) - THROW_IE_EXCEPTION << "Unknown RNN cell type " << cell << ". " - << "Expected one of [ LSTM | RNN | GRU ]."; +void StridedSliceValidator::checkParams(const CNNLayer* layer) { + LayerValidator::checkParams(layer); +} - if (!one_of(direction, "Forward", "Backward", "Bidirectional")) - THROW_IE_EXCEPTION << "Unknown RNN direction type " << direction << ". 
" - << "Expected one of [ Forward | Backward | Bidirectional ]."; +void StridedSliceValidator::checkShapes(const CNNLayer* layer, const vector& inShapes) const { + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << layer->name << " Layer is not instance of StridedSliceLayer class"; + } - casted->axis = axis; - casted->cellType = cell; - casted->direction = direction == "Forward" ? RNNLayer::RNN_FWD : - direction == "Backward" ? RNNLayer::RNN_BWD : - RNNLayer::RNN_BDR; + size_t numInputs = inShapes.size(); + if (numInputs > 4) + THROW_IE_EXCEPTION << layer->name << " StridedSlice can take up to 4 inputs, but actually it has: " << numInputs; + + size_t ellipsis_mask_counter = 0; + for (size_t i = 0; i < casted->ellipsis_mask.size(); ++i) { + if (casted->ellipsis_mask[i] == '1') + ellipsis_mask_counter++; + } + if (ellipsis_mask_counter > 1) + THROW_IE_EXCEPTION << layer->name << " 'Ellipsis_mask' must be a power of two (only one ellipsis)!"; } -void RNNValidator::checkParams(const InferenceEngine::CNNLayer *layer) { - auto casted = dynamic_cast(layer); - if (!casted) - THROW_IE_EXCEPTION << "Layer is not instance of RNNLayer class"; - if (!one_of(casted->axis, 1, 0)) - THROW_IE_EXCEPTION << "Unsupported axis for RNN layer iterator. Only 0 and 1 axis are supported."; +ShuffleChannelsValidator::ShuffleChannelsValidator(const std::string& _type) : LayerValidator(_type) {} + +void ShuffleChannelsValidator::parseParams(CNNLayer* layer) { + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << layer->name << " Layer is not instance of ShuffleChannels class"; + } + + casted->axis = casted->GetParamAsInt("axis", 1); + casted->group = casted->GetParamAsUInt("group", 1); +} + +void ShuffleChannelsValidator::checkParams(const CNNLayer* layer) { + LayerValidator::checkParams(layer); +} + +void ShuffleChannelsValidator::checkShapes(const CNNLayer* layer, const vector& inShapes) const { + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << layer->name << " Layer is not instance of ShuffleChannels class"; + } + + size_t numInputs = inShapes.size(); + if (numInputs != 1) + THROW_IE_EXCEPTION << layer->name << " ShuffleChannels can take only 1 input, but actually it has: " << numInputs; + + if (casted->axis > 0 && inShapes[0].size() < (1 + casted->axis)) + THROW_IE_EXCEPTION << layer->name << "I ncorrect input tensor dimensions " << inShapes[0].size() + << " and axis number " << casted->axis; + else if (casted->axis < 0 && (static_cast(inShapes[0].size()) + casted->axis) < 0) + THROW_IE_EXCEPTION << layer->name << " Incorrect input dictionary dimensions " << inShapes[0].size() + << " and axis number " << casted->axis; + + int axis = casted->axis; + if (axis < 0) + axis += inShapes[0].size(); + + if (inShapes[0][axis] % casted->group) + THROW_IE_EXCEPTION << layer->name << " Group parameter must evenly divide the channel dimension!"; + + size_t dataLength = 1; + for (size_t i = axis + 1; i < inShapes[0].size(); i++) + dataLength *= inShapes[0][i]; + + if (dataLength == 0) + THROW_IE_EXCEPTION << layer->name << " Incorrect input parameters dimension!"; +} + + +DepthToSpaceValidator::DepthToSpaceValidator(const std::string& _type) : LayerValidator(_type) {} + +void DepthToSpaceValidator::parseParams(CNNLayer* layer) { + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << layer->name << " Layer is not instance of DepthToSpace class"; + } + + casted->block_size = casted->GetParamAsUInt("block_size", 1); +} + +void 
+void DepthToSpaceValidator::checkParams(const CNNLayer* layer) {
+    LayerValidator::checkParams(layer);
+}
+
+void DepthToSpaceValidator::checkShapes(const CNNLayer* layer, const vector<SizeVector>& inShapes) const {
+    auto casted = dynamic_cast<const DepthToSpaceLayer*>(layer);
+    if (!casted) {
+        THROW_IE_EXCEPTION << layer->name << " Layer is not instance of DepthToSpace class";
+    }
+
+    size_t numInputs = inShapes.size();
+    if (numInputs != 1)
+        THROW_IE_EXCEPTION << layer->name << " DepthToSpace can take only 1 input, but actually it has: " << numInputs;
+
+    if (inShapes[0].size() < 3)
+        THROW_IE_EXCEPTION << layer->name << " Incorrect number of input dimensions!";
+
+    if (casted->block_size == 0)
+        THROW_IE_EXCEPTION << layer->name << " Incorrect block_size parameter: it can't be zero!";
+
+    if (inShapes[0][inShapes[0].size() - 3] % (casted->block_size * casted->block_size))
+        THROW_IE_EXCEPTION << layer->name << " block_size parameter is incompatible with input tensor Channel dimension size!";
+}
+
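// Editorial illustration (not part of the original change), assuming an NCHW
// layout: for an input of {1, 9, 8, 8} with block_size = 3, the channel count 9
// is divisible by block_size^2 = 9, so the check above passes and the layer
// would produce {1, 1, 24, 24}.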
+
+SpaceToDepthValidator::SpaceToDepthValidator(const std::string& _type) : LayerValidator(_type) {}
+
+void SpaceToDepthValidator::parseParams(CNNLayer* layer) {
+    auto casted = dynamic_cast<SpaceToDepthLayer*>(layer);
+    if (!casted) {
+        THROW_IE_EXCEPTION << layer->name << " Layer is not instance of SpaceToDepth class";
+    }
+
+    casted->block_size = casted->GetParamAsUInt("block_size", 1);
+}
+
+void SpaceToDepthValidator::checkParams(const CNNLayer* layer) {
+    LayerValidator::checkParams(layer);
+}
+
+void SpaceToDepthValidator::checkShapes(const CNNLayer* layer, const vector<SizeVector>& inShapes) const {
+    auto casted = dynamic_cast<const SpaceToDepthLayer*>(layer);
+    if (!casted) {
+        THROW_IE_EXCEPTION << layer->name << " Layer is not instance of SpaceToDepth class";
+    }
+
+    size_t numInputs = inShapes.size();
+    if (numInputs != 1)
+        THROW_IE_EXCEPTION << layer->name << " SpaceToDepth can take only 1 input, but actually it has: " << numInputs;

-    // TODO: Add more RNN verification..
+    if (inShapes[0].size() < 2)
+        THROW_IE_EXCEPTION << layer->name << " Incorrect number of input dimensions!";
+
+    if (casted->block_size == 0)
+        THROW_IE_EXCEPTION << layer->name << " Incorrect block_size parameter: it can't be zero!";
+
+    if (inShapes[0][inShapes[0].size() - 1] % casted->block_size)
+        THROW_IE_EXCEPTION << layer->name << " block_size parameter is incompatible with input tensor Width dimension size!";
+
+    if (inShapes[0][inShapes[0].size() - 2] % casted->block_size)
+        THROW_IE_EXCEPTION << layer->name << " block_size parameter is incompatible with input tensor Height dimension size!";
+}
+
+
+ReverseSequenceValidator::ReverseSequenceValidator(const std::string& _type) : LayerValidator(_type) {}
+
+void ReverseSequenceValidator::parseParams(CNNLayer* layer) {
+    auto casted = dynamic_cast<ReverseSequenceLayer*>(layer);
+    if (!casted) {
+        THROW_IE_EXCEPTION << layer->name << " Layer is not instance of ReverseSequence class";
+    }
+
+    casted->seq_axis = casted->GetParamAsInt("seq_axis", 1);
+    casted->batch_axis = casted->GetParamAsInt("batch_axis", 0);
+}
+
+void ReverseSequenceValidator::checkParams(const CNNLayer* layer) {
+    LayerValidator::checkParams(layer);
+}
+
+void ReverseSequenceValidator::checkShapes(const CNNLayer* layer, const vector<SizeVector>& inShapes) const {
+    auto casted = dynamic_cast<const ReverseSequenceLayer*>(layer);
+    if (!casted) {
+        THROW_IE_EXCEPTION << layer->name << " Layer is not instance of ReverseSequence class";
+    }
+
+    size_t numInputs = inShapes.size();
+    if (numInputs != 2)
+        THROW_IE_EXCEPTION << layer->name << " ReverseSequence can take 2 inputs, but actually it has: " << numInputs;
+
+    if (inShapes[1].size() != 1)
+        THROW_IE_EXCEPTION << layer->name << " Incorrect number of 'seq_lengths' input dimensions!";
+
+    if (casted->seq_axis > 0 && inShapes[0].size() < (1 + casted->seq_axis))
+        THROW_IE_EXCEPTION << layer->name << " Incorrect input tensor dimensions " << inShapes[0].size()
+                           << " and seq_axis number " << casted->seq_axis;
+    else if (casted->seq_axis < 0 && (static_cast<int>(inShapes[0].size()) + casted->seq_axis) < 0)
+        THROW_IE_EXCEPTION << layer->name << " Incorrect input tensor dimensions " << inShapes[0].size()
+                           << " and seq_axis number " << casted->seq_axis;
+
+    if (casted->batch_axis > 0 && inShapes[0].size() < (1 + casted->batch_axis))
+        THROW_IE_EXCEPTION << layer->name << " Incorrect input tensor dimensions " << inShapes[0].size()
+                           << " and batch_axis number " << casted->batch_axis;
+    else if (casted->batch_axis < 0 && (static_cast<int>(inShapes[0].size()) + casted->batch_axis) < 0)
+        THROW_IE_EXCEPTION << layer->name << " Incorrect input tensor dimensions " << inShapes[0].size()
+                           << " and batch_axis number " << casted->batch_axis;
+
+    int batch_axis = casted->batch_axis;
+    if (batch_axis < 0)
+        batch_axis += inShapes[0].size();
+    if (inShapes[1][0] != inShapes[0][batch_axis])
+        THROW_IE_EXCEPTION << layer->name << " Incorrect 'seq_lengths_dims' parameter dimensions!";
 }

+
+SqueezeValidator::SqueezeValidator(const std::string& _type) : LayerValidator(_type) {}
+
+void SqueezeValidator::parseParams(CNNLayer* layer) {
+    auto casted = dynamic_cast<SqueezeLayer*>(layer);
+    if (!casted) {
+        
THROW_IE_EXCEPTION << layer->name << " Layer is not instance of Squeeze class"; + } + + size_t numInputs = inShapes.size(); + if (numInputs != 2) + THROW_IE_EXCEPTION << layer->name << " Squeeze can take 2 inputs, but actually it has: " << numInputs; + + if (inShapes[1].size() != 1) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of 'indices_to_squeeze' input dimensions!"; +} + + +UnsqueezeValidator::UnsqueezeValidator(const std::string& _type) : LayerValidator(_type) {} + +void UnsqueezeValidator::parseParams(CNNLayer* layer) { + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << layer->name << " Layer is not instance of Unsqueeze class"; + } +} + +void UnsqueezeValidator::checkParams(const CNNLayer* layer) { + LayerValidator::checkParams(layer); +} + +void UnsqueezeValidator::checkShapes(const CNNLayer* layer, const vector& inShapes) const { + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << layer->name << " Layer is not instance of Unsqueeze class"; + } + + size_t numInputs = inShapes.size(); + if (numInputs != 2) + THROW_IE_EXCEPTION << layer->name << " Unsqueeze can take 2 inputs, but actually it has: " << numInputs; + + if (inShapes[1].size() != 1) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of 'indices_to_set' input dimensions!"; +} + + +RangeValidator::RangeValidator(const std::string& _type) : LayerValidator(_type) {} + +void RangeValidator::parseParams(CNNLayer* layer) { + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << layer->name << " Layer is not instance of Range class"; + } +} + +void RangeValidator::checkParams(const CNNLayer* layer) {} + +void RangeValidator::checkShapes(const CNNLayer* layer, const vector& inShapes) const { + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << layer->name << " Layer is not instance of Range class"; + } + + size_t numInputs = inShapes.size(); + if (numInputs != 3) + THROW_IE_EXCEPTION << layer->name << " Range can take 3 inputs, but actually it has: " << numInputs; + + if (inShapes[0].size() != 1) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of 'start' input dimensions!"; + + if (inShapes[1].size() != 1) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of 'limit' input dimensions!"; + + if (inShapes[2].size() != 1) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of 'delta' input dimensions!"; +} + + +FillValidator::FillValidator(const std::string& _type) : LayerValidator(_type) {} + +void FillValidator::parseParams(CNNLayer* layer) {} + +void FillValidator::checkParams(const CNNLayer* layer) {} + +void FillValidator::checkShapes(const CNNLayer* layer, const vector& inShapes) const { + size_t numInputs = inShapes.size(); + if (numInputs != 2) + THROW_IE_EXCEPTION << layer->name << " Fill can take 2 inputs, but actually it has: " << numInputs; + + if (inShapes[0].size() != 1) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of 'fill_dims' input dimensions!"; + + if (inShapes[1].size() != 1) + THROW_IE_EXCEPTION << layer->name << " Incorrect number of 'fill_value' input dimensions!"; +} + + +ExpandValidator::ExpandValidator(const std::string& _type) : LayerValidator(_type) {} + +void ExpandValidator::parseParams(CNNLayer* layer) { + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << layer->name << " Layer is not instance of Expand class"; + } +} + +void ExpandValidator::checkParams(const CNNLayer* layer) { + LayerValidator::checkParams(layer); +} + 
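// Editorial note: every validator in this file follows the same three-phase
// contract, driven by the validator registry declared in ie_layer_validators.hpp.
// A minimal usage sketch; the exact lookup call is shown as an assumption:
//
//     LayerValidator::Ptr v = details::LayerValidators::getInstance()->getValidator(layer.type);
//     v->parseParams(&layer);            // convert string params to typed fields
//     v->checkParams(&layer);            // validate value ranges
//     v->checkShapes(&layer, inShapes);  // validate input arity and ranks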
+void ExpandValidator::checkShapes(const CNNLayer* layer, const vector<SizeVector>& inShapes) const {
+    auto casted = dynamic_cast<const ExpandLayer*>(layer);
+    if (!casted) {
+        THROW_IE_EXCEPTION << layer->name << " Layer is not instance of Expand class";
+    }
+
+    size_t numInputs = inShapes.size();
+    if (numInputs != 2)
+        THROW_IE_EXCEPTION << layer->name << " Expand can take 2 inputs, but actually it has: " << numInputs;
+
+    if (inShapes[1].size() != 1)
+        THROW_IE_EXCEPTION << layer->name << " Incorrect number of 'shape' input dimensions!";
+}
+
+/****************************************/
+/*** RNN specific validators ************/
+/****************************************/
+
+static RNNCellBase::CellType cell_type_from(string type_name) {
+    const vector<string> to_remove {"Cell", "Sequence"};
+    for (auto &sub : to_remove) {
+        auto idx = type_name.find(sub);
+        if (idx != string::npos)
+            type_name.erase(idx);
+    }
+
+    if (!one_of(type_name, "LSTM", "RNN", "GRU"))
+        THROW_IE_EXCEPTION << "Unknown RNN cell type " << type_name << ". "
+                           << "Expected one of [ LSTM | RNN | GRU ].";
+
+    return type_name == "LSTM" ? RNNSequenceLayer::LSTM :
+           type_name == "GRU"  ? RNNSequenceLayer::GRU :
+           type_name == "RNN"  ? RNNSequenceLayer::RNN :
+           RNNSequenceLayer::LSTM;
+}
+
+static RNNSequenceLayer::Direction direction_from(string direction_name) {
+    if (!one_of(direction_name, "Forward", "Backward", "Bidirectional"))
+        THROW_IE_EXCEPTION << "Unknown RNN direction type " << direction_name << ". "
+                           << "Expected one of [ Forward | Backward | Bidirectional ].";
+
+    return direction_name == "Forward"  ? RNNSequenceLayer::FWD :
+           direction_name == "Backward" ? RNNSequenceLayer::BWD :
+           direction_name == "Bidirectional" ? RNNSequenceLayer::BDR :
+           RNNSequenceLayer::FWD;
+}
+
+template<>
+std::vector<std::string> RNNBaseValidator<RNNSequenceLayer::LSTM>::def_acts {"sigmoid", "tanh", "tanh"};
+template<>
+std::vector<float> RNNBaseValidator<RNNSequenceLayer::LSTM>::def_alpha {0, 0, 0};
+template<>
+std::vector<float> RNNBaseValidator<RNNSequenceLayer::LSTM>::def_beta {0, 0, 0};
+template<>
+size_t RNNBaseValidator<RNNSequenceLayer::LSTM>::G = 4;
+template<>
+size_t RNNBaseValidator<RNNSequenceLayer::LSTM>::NS = 2;
+
+template<>
+std::vector<std::string> RNNBaseValidator<RNNSequenceLayer::GRU>::def_acts {"sigmoid", "tanh"};
+template<>
+std::vector<float> RNNBaseValidator<RNNSequenceLayer::GRU>::def_alpha {0, 0};
+template<>
+std::vector<float> RNNBaseValidator<RNNSequenceLayer::GRU>::def_beta {0, 0};
+template<>
+size_t RNNBaseValidator<RNNSequenceLayer::GRU>::G = 3;
+template<>
+size_t RNNBaseValidator<RNNSequenceLayer::GRU>::NS = 1;
+
+template<>
+std::vector<std::string> RNNBaseValidator<RNNSequenceLayer::RNN>::def_acts {"tanh"};
+template<>
+std::vector<float> RNNBaseValidator<RNNSequenceLayer::RNN>::def_alpha {0};
+template<>
+std::vector<float> RNNBaseValidator<RNNSequenceLayer::RNN>::def_beta {0};
+template<>
+size_t RNNBaseValidator<RNNSequenceLayer::RNN>::G = 1;
+template<>
+size_t RNNBaseValidator<RNNSequenceLayer::RNN>::NS = 1;
+
+template <RNNSequenceLayer::CellType CELL>
+RNNBaseValidator<CELL>::RNNBaseValidator(const std::string& _type) : LayerValidator(_type) {}
+
+template <RNNSequenceLayer::CellType CELL>
+void RNNBaseValidator<CELL>::parseParams(CNNLayer* layer) {
+    auto rnn = dynamic_cast<RNNCellBase*>(layer);
+    if (!rnn)
+        THROW_IE_EXCEPTION << "Layer is not instance of RNNLayer class";
+
+    rnn->cellType = cell_type_from(layer->type);
+    rnn->hidden_size = rnn->GetParamAsInt("hidden_size");
+    rnn->clip = rnn->GetParamAsFloat("clip", 0.0f);
+    rnn->activations = rnn->GetParamAsStrings("activations", def_acts);
+    rnn->activation_alpha = rnn->GetParamAsFloats("activation_alpha", def_alpha);
+    rnn->activation_beta = rnn->GetParamAsFloats("activation_beta", def_beta);
+
+    if (rnn->cellType == RNNCellBase::GRU) {
+        auto lbr = rnn->GetParamAsBool("linear_before_reset", false);
+        if (lbr) rnn->cellType = RNNCellBase::GRU_LBR;
+    }
+}
+
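// Editorial illustration: for a GRU cell the specializations above give
// def_acts = {"sigmoid", "tanh"}, G = 3 gates and NS = 1 state. If the IR sets
// linear_before_reset=1, parseParams() upgrades cellType to GRU_LBR, which
// checkCorrespondence() below accounts for with a larger bias blob:
//
//     expected_b_size = (G + 1) * S;   // 4*S instead of 3*S for GRU_LBR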
+template <RNNSequenceLayer::CellType CELL>
+void RNNBaseValidator<CELL>::checkParams(const InferenceEngine::CNNLayer *layer) {
+    auto rnn = dynamic_cast<const RNNCellBase*>(layer);
+    if (!rnn)
+        THROW_IE_EXCEPTION << "Layer is not instance of RNNLayer class";
+
+    if (rnn->clip < 0.0f)
+        THROW_IE_EXCEPTION << "Clip parameter should be positive";
+
+    for (auto &act : rnn->activations)
+        if (!one_of(act, "sigmoid", "tanh", "relu"))
+            THROW_IE_EXCEPTION << "Unsupported activation function (" << act << ") for RNN layer.";
+
+    int act_num_required = def_acts.size();
+    if (rnn->activations.size() != act_num_required)
+        THROW_IE_EXCEPTION << "Expected " << act_num_required << " activations, but provided "
+                           << rnn->activations.size();
+
+    if (rnn->activation_alpha.size() != act_num_required)
+        THROW_IE_EXCEPTION << "Expected " << act_num_required << " activation alpha parameters, "
+                           << "but provided " << rnn->activation_alpha.size();
+    if (rnn->activation_beta.size() != act_num_required)
+        THROW_IE_EXCEPTION << "Expected " << act_num_required << " activation beta parameters, "
+                           << "but provided " << rnn->activation_beta.size();
+}
+
+template <RNNSequenceLayer::CellType CELL>
+void RNNBaseValidator<CELL>::checkCorrespondence(const CNNLayer* layer,
+                                                 const map<string, Blob::Ptr>& blobs,
+                                                 const vector<SizeVector>& inShapes) const {
+    auto rnn = dynamic_cast<const RNNCellBase*>(layer);
+    if (!rnn)
+        THROW_IE_EXCEPTION << "Layer is not instance of RNNLayer class";
+
+    if (blobs.size() != 2)
+        THROW_IE_EXCEPTION << "Expected only 2 blobs with trained parameters (weights and biases), "
+                           << "but provided only " << blobs.size();
+    if (inShapes.empty())
+        THROW_IE_EXCEPTION << "No input tensors.";
+
+    size_t D = inShapes[0].back();
+    size_t S = rnn->hidden_size;
+    size_t expected_w_size = G*S*(D+S);
+    size_t expected_b_size = G*S;
+
+    if (rnn->cellType == RNNCellBase::GRU_LBR)
+        expected_b_size = (G + 1)*S;
+
+    auto w = blobs.find("weights");
+    if (w == blobs.end())
+        THROW_IE_EXCEPTION << "Weights blob is not provided";
+
+    if (w->second->size() != expected_w_size)
+        THROW_IE_EXCEPTION << "Weights blob has wrong size. Expected " << expected_w_size;
+
+    auto b = blobs.find("biases");
+    if (b == blobs.end())
+        THROW_IE_EXCEPTION << "Biases blob is not provided";
+
+    if (b->second->size() != expected_b_size)
+        THROW_IE_EXCEPTION << "Biases blob has wrong size. Expected " << expected_b_size;
+}
+
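// Editorial worked example: for an LSTM cell (G = 4, NS = 2) with input size
// D = 16 and hidden_size S = 32, checkCorrespondence() above expects
//     weights: G*S*(D+S) = 4*32*(16+32) = 6144 elements
//     biases:  G*S       = 4*32        = 128 elements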
+template <RNNSequenceLayer::CellType CELL>
+RNNSequenceValidator<CELL>::RNNSequenceValidator(const std::string& _type) : RNNBaseValidator<CELL>(_type) {}
+
+template <RNNSequenceLayer::CellType CELL>
+void RNNSequenceValidator<CELL>::parseParams(CNNLayer* layer) {
+    RNNBaseValidator<CELL>::parseParams(layer);
+
+    auto casted = dynamic_cast<RNNSequenceLayer*>(layer);
+    if (!casted)
+        THROW_IE_EXCEPTION << "Layer is not instance of RNNLayer class";
+
+    std::string direction = layer->GetParamAsString("direction");
+
+    casted->axis = layer->GetParamAsUInt("axis", 1);
+    casted->direction = direction_from(direction);
+}
+
+template <RNNSequenceLayer::CellType CELL>
+void RNNSequenceValidator<CELL>::checkParams(const InferenceEngine::CNNLayer *layer) {
+    RNNBaseValidator<CELL>::checkParams(layer);
+
+    auto casted = dynamic_cast<const RNNSequenceLayer*>(layer);
+    if (!casted)
+        THROW_IE_EXCEPTION << "Layer is not instance of RNNLayer class";
+
+    if (!one_of(casted->axis, 1, 0))
+        THROW_IE_EXCEPTION << "Unsupported iteration axis for RNNSequence layer. Only axes 0 and 1 are supported.";
+}
+
+template <RNNSequenceLayer::CellType CELL>
+void RNNSequenceValidator<CELL>::checkShapes(const CNNLayer* layer, const vector<SizeVector>& inShapes) const {
+    auto rnn = dynamic_cast<const RNNSequenceLayer*>(layer);
+    if (!rnn)
+        THROW_IE_EXCEPTION << "Layer is not instance of RNNSequenceLayer class";
+
+    if (inShapes.empty())
+        THROW_IE_EXCEPTION << "No input tensors.";
+
+    if (inShapes[0].size() != 3)
+        THROW_IE_EXCEPTION << "First input data tensor should be 3D";
+
+    size_t T_axis = rnn->axis;
+    size_t N_axis = (T_axis + 1)%2;
+    size_t N = inShapes[0][N_axis];
+    size_t T = inShapes[0][T_axis];
+    size_t D = inShapes[0].back();
+    size_t S = rnn->hidden_size;
+    size_t NS = RNNSequenceValidator<CELL>::NS;
+
+    SizeVector expected_state_shape {N, S};
+
+    if (inShapes.size() > 1) {  // has an initial state blobs
+        if (inShapes.size() != 1 + NS)
+            THROW_IE_EXCEPTION << "Wrong number of input tensors. Expected 1 (data) or "
+                               << 1 + NS << " (data and states)";
+        if (inShapes[1] != expected_state_shape)
+            THROW_IE_EXCEPTION << "Wrong shape of first initial state tensors.";
+//                             << " Expected " << expected_state_shape << " but provided " << inShapes[1];
+
+        if (NS == 2 && inShapes[2] != expected_state_shape)
+            THROW_IE_EXCEPTION << "Wrong shape of second initial state tensors.";
+//                             << " Expected " << expected_state_shape << " but provided " << inShapes[2];
+    }
+}
+
+template class details::RNNSequenceValidator<RNNSequenceLayer::LSTM>;
+template class details::RNNSequenceValidator<RNNSequenceLayer::GRU>;
+template class details::RNNSequenceValidator<RNNSequenceLayer::RNN>;
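// Editorial note: these explicit instantiations pair with the
// 'extern template class ...' declarations in ie_layer_validators.hpp, so each
// RNNSequenceValidator<CELL> specialization is compiled once in this file and
// only referenced from other translation units.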
Expected " << NS + 1; + + if (inShapes[0].size() != 2) + THROW_IE_EXCEPTION << "First input data tensor should be 2D"; + + size_t N = inShapes[0][0]; + size_t D = inShapes[0][1]; + size_t S = rnn->hidden_size; + + SizeVector expected_state_shape {N, S}; + + if (inShapes[1] != expected_state_shape) + THROW_IE_EXCEPTION << "Wrong shape of first initial state tensors."; +// << " Expected " << expected_state_shape << " but provided " << inShapes[1]; + + if (NS == 2 && inShapes[2] != expected_state_shape) + THROW_IE_EXCEPTION << "Wrong shape of second initial state tensors."; +// << " Expected " << expected_state_shape << " but provided " << inShapes[2]; +} + +template class details::RNNCellValidator; +template class details::RNNCellValidator; +template class details::RNNCellValidator; + +void ArgMaxValidator::checkParams(const CNNLayer* layer) { + unsigned int top_k_ = layer->GetParamAsUInt("top_k"); +} + +void ArgMaxValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + +ArgMaxValidator::ArgMaxValidator(const std::string& _type) : LayerValidator(_type) { +} + +void CTCGreedyDecoderValidator::checkParams(const CNNLayer* layer) { + int flag = layer->GetParamAsInt("ctc_merge_repeated", 0); + if (flag != 0 && flag != 1) { + THROW_IE_EXCEPTION << "CTCGreedyDecoder layer parameter ctc_merge_repeated is invalid"; + } +} + +void CTCGreedyDecoderValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1, 2}); +} + +CTCGreedyDecoderValidator::CTCGreedyDecoderValidator(const std::string& _type) : LayerValidator(_type) { +} + +void DetectionOutputValidator::parseParams(CNNLayer* layer) { + unsigned int num_classes = layer->GetParamAsUInt("num_classes"); + if (num_classes == 0) { + THROW_IE_EXCEPTION << "num_classes parameter of DetectionOutput layer can't be equal to zero"; + } + float _nms_threshold = layer->GetParamAsFloat("nms_threshold"); + if (_nms_threshold < 0) { + THROW_IE_EXCEPTION << "nms_threshold parameter of DetectionOutput layer can't be less then zero"; + } + int _keep_top_k = layer->GetParamAsUInt("keep_top_k", -1); + + if (layer->CheckParamPresence("background_label_id")) + int _background_label_id = layer->GetParamAsUInt("background_label_id", -1); + if (layer->CheckParamPresence("top_k")) + int _top_k = layer->GetParamAsUInt("top_k", -1); + if (layer->CheckParamPresence("variance_encoded_in_target")) + bool _variance_encoded_in_target = static_cast(layer->GetParamAsUInt("variance_encoded_in_target")); + if (layer->CheckParamPresence("num_orient_classes")) + int _num_orient_classes = layer->GetParamAsUInt("num_orient_classes"); + if (layer->CheckParamPresence("share_location")) + bool _share_location = static_cast(layer->GetParamAsUInt("share_location")); + if (layer->CheckParamPresence("interpolate_orientation")) + int _interpolate_orientation = layer->GetParamAsInt("interpolate_orientation"); + if (layer->CheckParamPresence("confidence_threshold")) { + float _confidence_threshold = layer->GetParamAsFloat("confidence_threshold"); + if (_confidence_threshold < 0) { + THROW_IE_EXCEPTION << "_nms_threshold parameter of DetectionOutput layer can't be less then zero"; + } + } + + if (layer->CheckParamPresence("code_type")) { + std::string _code_type = layer->GetParamAsString("code_type"); + std::vector code_types = {"caffe.PriorBoxParameter.CENTER_SIZE", + "caffe.PriorBoxParameter.CORNER"}; + auto it = std::find(code_types.begin(), code_types.end(), _code_type); + if (it 
+
+void DetectionOutputValidator::parseParams(CNNLayer* layer) {
+    unsigned int num_classes = layer->GetParamAsUInt("num_classes");
+    if (num_classes == 0) {
+        THROW_IE_EXCEPTION << "num_classes parameter of DetectionOutput layer can't be equal to zero";
+    }
+    float _nms_threshold = layer->GetParamAsFloat("nms_threshold");
+    if (_nms_threshold < 0) {
+        THROW_IE_EXCEPTION << "nms_threshold parameter of DetectionOutput layer can't be less than zero";
+    }
+    int _keep_top_k = layer->GetParamAsUInt("keep_top_k", -1);
+
+    if (layer->CheckParamPresence("background_label_id"))
+        int _background_label_id = layer->GetParamAsUInt("background_label_id", -1);
+    if (layer->CheckParamPresence("top_k"))
+        int _top_k = layer->GetParamAsUInt("top_k", -1);
+    if (layer->CheckParamPresence("variance_encoded_in_target"))
+        bool _variance_encoded_in_target = static_cast<bool>(layer->GetParamAsUInt("variance_encoded_in_target"));
+    if (layer->CheckParamPresence("num_orient_classes"))
+        int _num_orient_classes = layer->GetParamAsUInt("num_orient_classes");
+    if (layer->CheckParamPresence("share_location"))
+        bool _share_location = static_cast<bool>(layer->GetParamAsUInt("share_location"));
+    if (layer->CheckParamPresence("interpolate_orientation"))
+        int _interpolate_orientation = layer->GetParamAsInt("interpolate_orientation");
+    if (layer->CheckParamPresence("confidence_threshold")) {
+        float _confidence_threshold = layer->GetParamAsFloat("confidence_threshold");
+        if (_confidence_threshold < 0) {
+            THROW_IE_EXCEPTION << "confidence_threshold parameter of DetectionOutput layer can't be less than zero";
+        }
+    }
+
+    if (layer->CheckParamPresence("code_type")) {
+        std::string _code_type = layer->GetParamAsString("code_type");
+        std::vector<std::string> code_types = {"caffe.PriorBoxParameter.CENTER_SIZE",
+                                               "caffe.PriorBoxParameter.CORNER"};
+        auto it = std::find(code_types.begin(), code_types.end(), _code_type);
+        if (it == code_types.end()) {
+            THROW_IE_EXCEPTION << "Parameter code_type of DetectionOutput layer is invalid";
+        }
+    }
+}
+
+void DetectionOutputValidator::checkParams(const CNNLayer* layer) {
+    unsigned int num_classes = layer->GetParamAsUInt("num_classes");
+    if (num_classes == 0) {
+        THROW_IE_EXCEPTION << "num_classes parameter of DetectionOutput layer can't be equal to zero";
+    }
+    float _nms_threshold = layer->GetParamAsFloat("nms_threshold");
+    if (_nms_threshold < 0) {
+        THROW_IE_EXCEPTION << "nms_threshold parameter of DetectionOutput layer can't be less than zero";
+    }
+    int _keep_top_k = layer->GetParamAsUInt("keep_top_k", -1);
+
+    if (layer->CheckParamPresence("background_label_id"))
+        int _background_label_id = layer->GetParamAsUInt("background_label_id", -1);
+    if (layer->CheckParamPresence("top_k"))
+        int _top_k = layer->GetParamAsUInt("top_k", -1);
+    if (layer->CheckParamPresence("variance_encoded_in_target"))
+        bool _variance_encoded_in_target = static_cast<bool>(layer->GetParamAsUInt("variance_encoded_in_target"));
+    if (layer->CheckParamPresence("num_orient_classes"))
+        int _num_orient_classes = layer->GetParamAsUInt("num_orient_classes");
+    if (layer->CheckParamPresence("share_location"))
+        bool _share_location = static_cast<bool>(layer->GetParamAsUInt("share_location"));
+    if (layer->CheckParamPresence("interpolate_orientation"))
+        int _interpolate_orientation = layer->GetParamAsInt("interpolate_orientation");
+    if (layer->CheckParamPresence("confidence_threshold")) {
+        float _confidence_threshold = layer->GetParamAsFloat("confidence_threshold");
+        if (_confidence_threshold < 0) {
+            THROW_IE_EXCEPTION << "confidence_threshold parameter of DetectionOutput layer can't be less than zero";
+        }
+    }
+    if (layer->CheckParamPresence("code_type")) {
+        std::string _code_type = layer->GetParamAsString("code_type");
+        std::vector<std::string> code_types = {"caffe.PriorBoxParameter.CENTER_SIZE",
+                                               "caffe.PriorBoxParameter.CORNER"};
+        auto it = std::find(code_types.begin(), code_types.end(), _code_type);
+        if (it == code_types.end()) {
+            THROW_IE_EXCEPTION << "Parameter code_type of DetectionOutput layer is invalid";
+        }
+    }
+}
+
+void DetectionOutputValidator::checkShapes(const CNNLayer* layer, const std::vector<SizeVector>& inShapes) const {
+    checkNumOfInput(inShapes, {3, 5});
+}
+
+DetectionOutputValidator::DetectionOutputValidator(const std::string& _type) : LayerValidator(_type) {
+}
+
+void InterpValidator::checkParams(const CNNLayer* layer) {
+}
+
+void InterpValidator::checkShapes(const CNNLayer* layer, const std::vector<SizeVector>& inShapes) const {
+    checkNumOfInput(inShapes, {1, 2});
+    auto IS_ZERO = [](float value) {
+        return std::fabs(value) < std::numeric_limits<float>::epsilon();
+    };
+    if (inShapes.size() != 2) {
+        float factor = layer->GetParamAsFloat("factor", 0);
+        if (factor < 0)
+            THROW_IE_EXCEPTION << "factor parameter of Interp layer can't be less than zero";
+        float shrink_factor = layer->GetParamAsFloat("shrink_factor", 0);
+        if (shrink_factor < 0)
+            THROW_IE_EXCEPTION << "shrink_factor parameter of Interp layer can't be less than zero";
+        float zoom_factor = (layer->GetParamAsFloat("zoom_factor", 0));
+        if (zoom_factor < 0)
+            THROW_IE_EXCEPTION << "zoom_factor parameter of Interp layer can't be less than zero";
+        bool noFactor = IS_ZERO(factor) && IS_ZERO(shrink_factor) && IS_ZERO(zoom_factor);
+
+        auto height = layer->GetParamAsUInt("height", 0);
+        auto width = layer->GetParamAsUInt("width", 0);
+
+        if (noFactor && (height == 0 || width == 0)) {
+            THROW_IE_EXCEPTION
+                << "Can't reshape without factor, or target resolution. 
" + << "Supported attributes: factor, shrink_factor, zoom_factor, height, width"; + } + } +} + +InterpValidator::InterpValidator(const std::string& _type) : LayerValidator(_type) { +} + +void InterpValidator::parseParams(CNNLayer* layer) { + float factor = layer->GetParamAsFloat("factor", 0); + float shrink_factor = layer->GetParamAsFloat("shrink_factor", 0); + float zoom_factor = layer->GetParamAsFloat("zoom_factor", 0); + + auto height = layer->GetParamAsUInt("height", 0); + auto width = layer->GetParamAsUInt("width", 0); +} + + void PermuteValidator::checkParams(const CNNLayer* layer) { + std::vector layerOrder = layer->GetParamAsUInts("order"); +} + +void PermuteValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + +PermuteValidator::PermuteValidator(const std::string& _type) : LayerValidator(_type) { +} + +void PriorBoxValidator::checkParams(const CNNLayer* layer) { + std::vector min_sizes = layer->GetParamAsUInts("min_size", {}); + std::vector max_sizes = layer->GetParamAsUInts("max_size", {}); + bool flip = static_cast(layer->GetParamAsInt("flip")); + if (layer->CheckParamPresence("aspect_ratio")) + const std::vector aspect_ratios = layer->GetParamAsUInts("aspect_ratio", {}); + bool clip_ = static_cast(layer->GetParamAsInt("clip")); + if (layer->CheckParamPresence("variance")) { + float variance_ = layer->GetParamAsFloat("variance", 1.0); + if (variance_ < 0) { + THROW_IE_EXCEPTION << "The value of PriorBox layer variance_ parameter is invalid"; + } + } + float step_ = layer->GetParamAsFloat("step", 0); + if (step_ < 0) { + THROW_IE_EXCEPTION << "The value of PriorBox layer step_ parameter is invalid"; + } + float offset_ = layer->GetParamAsFloat("offset"); + if (offset_ < 0) { + THROW_IE_EXCEPTION << "The value of PriorBox layer offset_ parameter is invalid"; + } +} + +void PriorBoxValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {2}); +} + +PriorBoxValidator::PriorBoxValidator(const std::string& _type) : LayerValidator(_type) { +} + +void PriorBoxClusteredValidator::checkParams(const CNNLayer* layer) { + std::vector widths = layer->GetParamAsFloats("width", {}); + for (auto i : widths) { + if (i < 0) { + THROW_IE_EXCEPTION << "The value of PriorBoxClustered layer width parameter is invalid"; + } + } + std::vector heights = layer->GetParamAsFloats("height", {}); + for (auto i : heights) { + if (i < 0) { + THROW_IE_EXCEPTION << "The value of PriorBoxClustered layer heights parameter is invalid"; + } + } + bool flip = static_cast(layer->GetParamAsInt("flip")); + bool clip_ = static_cast(layer->GetParamAsInt("clip")); + float offset_ = layer->GetParamAsFloat("offset"); + if (offset_ < 0) { + THROW_IE_EXCEPTION << "The value of PriorBox layer offset_ parameter is invalid"; + } + if (layer->CheckParamPresence("variance")) { + float variance_ = layer->GetParamAsFloat("variance"); + if (variance_ < 0) { + THROW_IE_EXCEPTION << "The value of PriorBox layer variance_ parameter is invalid"; + } + } + float step_h_ = layer->GetParamAsFloat("step_h", 0); + if (step_h_ < 0) { + THROW_IE_EXCEPTION << "The value of PriorBox layer step_h_ parameter is invalid"; + } + float step_w_ = layer->GetParamAsFloat("step_w", 0); + if (step_w_ < 0) { + THROW_IE_EXCEPTION << "The value of PriorBox layer step_w_ parameter is invalid"; + } + float img_h_ = layer->GetParamAsFloat("img_h", 0); + if (img_h_ < 0) { + THROW_IE_EXCEPTION << "The value of PriorBox layer img_h_ parameter is 
invalid"; + } + float img_w_ = layer->GetParamAsFloat("img_w", 0); + if (img_w_ < 0) { + THROW_IE_EXCEPTION << "The value of PriorBox layer img_w_ parameter is invalid"; + } +} + +void PriorBoxClusteredValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {2}); +} + +PriorBoxClusteredValidator::PriorBoxClusteredValidator(const std::string& _type) : LayerValidator(_type) { +} + +void ProposalValidator::checkParams(const CNNLayer* layer) { + unsigned int post_nms_topn_ = layer->GetParamAsUInt("post_nms_topn"); + + if (layer->CheckParamPresence("feat_stride")) + unsigned int feat_stride_ = layer->GetParamAsUInt("feat_stride"); + if (layer->CheckParamPresence("base_size")) + unsigned int base_size_ = layer->GetParamAsUInt("base_size"); + if (layer->CheckParamPresence("min_size")) + unsigned int min_size_ = layer->GetParamAsUInt("min_size"); + if (layer->CheckParamPresence("pre_nms_topn")) + unsigned int pre_nms_topn_ = layer->GetParamAsUInt("pre_nms_topn"); + if (layer->CheckParamPresence("nms_thresh")) { + float nms_thresh_ = layer->GetParamAsFloat("nms_thresh"); + if (nms_thresh_ < 0) { + THROW_IE_EXCEPTION << "The value of Proposal layer nms_thresh_ parameter is invalid"; + } + } +} + +void ProposalValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {3}); +} + +ProposalValidator::ProposalValidator(const std::string& _type) : LayerValidator(_type) { +} + +void PSROIPoolingValidator::checkParams(const CNNLayer* layer) { + unsigned int output_dim = layer->GetParamAsUInt("output_dim"); + unsigned int group_size = layer->GetParamAsUInt("group_size"); + if (layer->CheckParamPresence("spatial_scale")) { + float spatial_scale_ = layer->GetParamAsFloat("spatial_scale"); + if (spatial_scale_ < 0) { + THROW_IE_EXCEPTION << "The value of PSROIPooling layer spatial_scale_ parameter is invalid"; + } + } +} + +void PSROIPoolingValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1, 2}); +} + +PSROIPoolingValidator::PSROIPoolingValidator(const std::string& _type) : LayerValidator(_type) { +} + +void RegionYoloValidator::checkParams(const CNNLayer* layer) { + LayerValidator::checkParams(layer); +} + +void RegionYoloValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + +RegionYoloValidator::RegionYoloValidator(const std::string& _type) : LayerValidator(_type) { +} + +void ReorgYoloValidator::checkParams(const CNNLayer* layer) { + LayerValidator::checkParams(layer); +} + +void ReorgYoloValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + +ReorgYoloValidator::ReorgYoloValidator(const std::string& _type) : LayerValidator(_type) { +} + +void ResampleValidator::checkParams(const CNNLayer* layer) { + if (layer->CheckParamPresence("antialias")) { + auto antialias = static_cast(layer->GetParamAsInt("antialias")); + + if (antialias != 0 && antialias != 1) { + THROW_IE_EXCEPTION << "The value of resample layer antialias parameter is invalid"; + } + } + if (layer->CheckParamPresence("type")) { + std::string type = layer->GetParamAsString("type"); + if (type != "caffe.ResampleParameter.NEAREST" && type != "caffe.ResampleParameter.CUBIC" && + type != "caffe.ResampleParameter.LINEAR") { + THROW_IE_EXCEPTION << "The value of resample layer type parameter is invalid"; + } + } +} + +void 
ResampleValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1, 2}); +} + +ResampleValidator::ResampleValidator(const std::string& _type) : LayerValidator(_type) { +} + +void ROIPoolingValidator::checkParams(const CNNLayer* layer) { + unsigned int pooled_h = layer->GetParamAsUInt("pooled_h"); + unsigned int pooled_w = layer->GetParamAsUInt("pooled_w"); + float spatial_scale = layer->GetParamAsFloat("spatial_scale"); + if (spatial_scale < 0) { + THROW_IE_EXCEPTION << "The value of ROIPooling layer spatial_scale parameter is invalid"; + } +} + +void ROIPoolingValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1, 2}); +} + +ROIPoolingValidator::ROIPoolingValidator(const std::string& _type) : LayerValidator(_type) { +} + +void SimplerNMSValidator::checkParams(const CNNLayer* layer) { + unsigned int post_nms_topn_ = layer->GetParamAsUInt("post_nms_topn"); + + if (layer->CheckParamPresence("min_bbox_size")) + unsigned int min_box_size_ = layer->GetParamAsUInt("min_bbox_size"); + if (layer->CheckParamPresence("feat_stride")) + unsigned int feat_stride_ = layer->GetParamAsUInt("feat_stride"); + if (layer->CheckParamPresence("pre_nms_topn")) + unsigned int pre_nms_topn_ = layer->GetParamAsUInt("pre_nms_topn"); + if (layer->CheckParamPresence("iou_threshold")) { + float iou_threshold_ = layer->GetParamAsFloat("iou_threshold"); + if (iou_threshold_ < 0) { + THROW_IE_EXCEPTION << "The value of SimplerNMS layer iou_threshold_ parameter is invalid"; + } + } + if (layer->CheckParamPresence("scale")) + std::vector scale = layer->GetParamAsUInts("scale", {}); + if (layer->CheckParamPresence("cls_threshold")) { + float cls_threshold = layer->GetParamAsFloat("cls_threshold"); + if (cls_threshold < 0) { + THROW_IE_EXCEPTION << "The value of SimplerNMS layer cls_threshold parameter is invalid"; + } + } +} + +void SimplerNMSValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {3}); +} + +SimplerNMSValidator::SimplerNMSValidator(const std::string& _type) : LayerValidator(_type) { +} + +void SpatialTransformerValidator::checkParams(const CNNLayer* layer) { + LayerValidator::checkParams(layer); +} + +void SpatialTransformerValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {2}); +} + +SpatialTransformerValidator::SpatialTransformerValidator(const std::string& _type) : LayerValidator(_type) { +} + +void UpsamplingValidator::checkParams(const CNNLayer* layer) { + LayerValidator::checkParams(layer); +} + +void UpsamplingValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + +UpsamplingValidator::UpsamplingValidator(const std::string& _type) : LayerValidator(_type) { +} + +void UnpoolingValidator::checkParams(const CNNLayer* layer) { + LayerValidator::checkParams(layer); +} + +void UnpoolingValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + +UnpoolingValidator::UnpoolingValidator(const std::string& _type) : LayerValidator(_type) { +} + +ActivationValidator::ActivationValidator(const std::string& _type) : LayerValidator(_type) { +} + +void ActivationValidator::checkParams(const CNNLayer* layer) { + LayerValidator::checkParams(layer); +} + +void ActivationValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + 
checkNumOfInput(inShapes, {1}); +} + +ConstValidator::ConstValidator(const std::string& _type) : LayerValidator(_type) { +} + +void ConstValidator::checkParams(const CNNLayer* layer) { + LayerValidator::checkParams(layer); +} + +void ConstValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {0, 1}); +} + +CopyValidator::CopyValidator(const std::string& _type) : LayerValidator(_type) { +} + +void CopyValidator::checkParams(const CNNLayer* layer) { + LayerValidator::checkParams(layer); +} + +void CopyValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + +ELUValidator::ELUValidator(const std::string& _type) : LayerValidator(_type) { +} + +void ELUValidator::checkParams(const CNNLayer* layer) { + LayerValidator::checkParams(layer); +} + +void ELUValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + +InputValidator::InputValidator(const std::string& _type) : LayerValidator(_type) { +} + +void InputValidator::checkParams(const CNNLayer* layer) { + LayerValidator::checkParams(layer); +} + +void InputValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {0}); +} + +MemoryValidator::MemoryValidator(const std::string& _type) : LayerValidator(_type) { +} + +void MemoryValidator::checkParams(const CNNLayer* layer) { + int size = layer->GetParamAsInt("size"); + if (size != 2) { + THROW_IE_EXCEPTION << "The value of Memory layer size parameter is invalid"; + } +} + +void MemoryValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1, 0}); +} + +NormalizeValidator::NormalizeValidator(const std::string& _type) : LayerValidator(_type) { +} + +void NormalizeValidator::checkParams(const CNNLayer* layer) { + if (layer->CheckParamPresence("eps")) { + float eps = layer->GetParamAsFloat("eps"); + if (eps < 0) { + THROW_IE_EXCEPTION << "The value of Normalize layer eps parameter is invalid"; + } + } +} + +void NormalizeValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + +PowerFileValidator::PowerFileValidator(const std::string& _type) : LayerValidator(_type) { +} + +void PowerFileValidator::checkParams(const CNNLayer* layer) { + LayerValidator::checkParams(layer); +} + +void PowerFileValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + +ReLU6Validator::ReLU6Validator(const std::string& _type) : LayerValidator(_type) { +} + +void ReLU6Validator::checkParams(const CNNLayer* layer) { + LayerValidator::checkParams(layer); +} + +void ReLU6Validator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + +SigmoidValidator::SigmoidValidator(const std::string& _type) : LayerValidator(_type) { +} + +void SigmoidValidator::checkParams(const CNNLayer* layer) { + LayerValidator::checkParams(layer); +} + +void SigmoidValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + +TanHValidator::TanHValidator(const std::string& _type) : LayerValidator(_type) { +} + +void TanHValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} + +QuantizeValidator::QuantizeValidator(const std::string& _type) : 
LayerValidator(_type) {} + +void QuantizeValidator::parseParams(CNNLayer* layer) { + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << "Layer is not instance of QuantizeLayer class"; + } + + casted->levels = casted->GetParamAsInt("levels", 1); + + if (casted->levels <= 1) { + THROW_IE_EXCEPTION << layer->name << ": Incorrect value for parameter levels = " << casted->levels + << ". Expected to be > 1."; + } +} + +void QuantizeValidator::checkParams(const CNNLayer* layer) { + LayerValidator::checkParams(layer); +} + +void QuantizeValidator::checkShapes(const CNNLayer* layer, const vector& inShapes) const { + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << "Layer is not instance of QuantizeLayer class"; + } + + size_t numInputs = inShapes.size(); + if (numInputs != 5) + THROW_IE_EXCEPTION << "Quantize can take only 5 inputs, but actually it has: " << numInputs; + + auto dims0 = inShapes[0]; + if (dims0.size() < 1) { + THROW_IE_EXCEPTION << "Quantize input0 shape must have at least 1 dimension"; + } +} + +BinaryConvolutionValidator::BinaryConvolutionValidator(const std::string& _type) : LayerValidator(_type) {} + +void BinaryConvolutionValidator::parseParams(CNNLayer* layer) { + auto binConvLayer = dynamic_cast(layer); + if (!binConvLayer) { + THROW_IE_EXCEPTION << "Layer is not instance of BinaryConvolutionLayer class"; + } + + binConvLayer->_pad_value = binConvLayer->GetParamAsFloat("pad_value", -1.f); + binConvLayer->_in_depth = binConvLayer->GetParamAsUInt("input"); + binConvLayer->_mode = BinaryConvolutionLayer::xnor_popcount; + std::string mode = binConvLayer->GetParamAsString("mode", "xnor-popcount"); + if (mode != "xnor-popcount") + THROW_IE_EXCEPTION << "Layer with type `" << _type << "` has incorrect mode!"; + + binConvLayer->_out_depth = binConvLayer->GetParamAsUInt("output"); + + binConvLayer->_kernel.clear(); + binConvLayer->_stride.clear(); + binConvLayer->_padding.clear(); + binConvLayer->_pads_end.clear(); + binConvLayer->_dilation.clear(); + + vector kernels = binConvLayer->GetParamAsUInts("kernel", {}); + if (kernels.empty()) { + // IR_v == 2 + binConvLayer->_kernel.insert(X_AXIS, binConvLayer->GetParamAsUInt("kernel-x")); + binConvLayer->_kernel.insert(Y_AXIS, binConvLayer->GetParamAsUInt("kernel-y")); + + binConvLayer->_stride.insert(X_AXIS, binConvLayer->GetParamAsUInt("stride-x", 1u)); + binConvLayer->_stride.insert(Y_AXIS, binConvLayer->GetParamAsUInt("stride-y", 1u)); + // TODO: maybe just throw exception, why do we change IR? + if (0 == binConvLayer->_stride[X_AXIS]) { + binConvLayer->_stride[X_AXIS] = 1u; + LogError("Warning! in layer %s: Stride x is 0, setting to 1 ", binConvLayer->name.c_str()); + } + if (0 == binConvLayer->_stride[Y_AXIS]) { + binConvLayer->_stride[Y_AXIS] = 1u; + LogError("Warning! 
in layer %s: Stride y is 0, setting to 1", binConvLayer->name.c_str()); + } + + binConvLayer->_padding.insert(X_AXIS, binConvLayer->GetParamAsUInt("pad-x", 0u)); + binConvLayer->_padding.insert(Y_AXIS, binConvLayer->GetParamAsUInt("pad-y", 0u)); + + binConvLayer->_pads_end.insert(X_AXIS, binConvLayer->GetParamAsUInt("pad-r", binConvLayer->_padding[X_AXIS])); + binConvLayer->_pads_end.insert(Y_AXIS, binConvLayer->GetParamAsUInt("pad-b", binConvLayer->_padding[Y_AXIS])); + + binConvLayer->_dilation.insert(X_AXIS, binConvLayer->GetParamAsUInt("dilation-x", 1u)); + binConvLayer->_dilation.insert(Y_AXIS, binConvLayer->GetParamAsUInt("dilation-y", 1u)); + } else { + // IR_v > 2 + for (int i = 1; i <= kernels.size(); i++) { + binConvLayer->_kernel.insert(i - 1, kernels[kernels.size() - i]); + } + + vector default_0 = vector (binConvLayer->_kernel.size(), 0u); + vector default_1 = vector (binConvLayer->_kernel.size(), 1u); + + vector strides = binConvLayer->GetParamAsUInts("strides", default_1); + for (int i = 1; i <= strides.size(); i++) { + if (strides[strides.size() - i] == 0) { + THROW_IE_EXCEPTION << "Stride could not be 0.\nIn layer " << binConvLayer->name; + } + binConvLayer->_stride.insert(i - 1, strides[strides.size() - i]); + } + + vector pads_begin = binConvLayer->GetParamAsUInts("pads_begin", default_0); + for (int i = 1; i <= pads_begin.size(); i++) { + binConvLayer->_padding.insert(i - 1, pads_begin[pads_begin.size() - i]); + } + + vector pads_end = binConvLayer->GetParamAsUInts("pads_end", pads_begin); + for (int i = 1; i <= pads_end.size(); i++) { + binConvLayer->_pads_end.insert(i - 1, pads_end[pads_end.size() - i]); + } + + vector dilations = binConvLayer->GetParamAsUInts("dilations", default_1); + for (int i = 1; i <= dilations.size(); i++) { + binConvLayer->_dilation.insert(i - 1, dilations[dilations.size() - i]); + } + } + + binConvLayer->_auto_pad = binConvLayer->GetParamAsString("auto_pad", ""); + binConvLayer->_group = binConvLayer->GetParamAsUInt("group", 1u); +} + +void BinaryConvolutionValidator::checkParams(const CNNLayer* layer) { + auto casted = dynamic_cast(layer); + if (!casted) { + THROW_IE_EXCEPTION << "Layer is not instance of BinaryConvolutionLayer class"; + } +} + +void BinaryConvolutionValidator::checkCorrespondence(const CNNLayer* layer, + const std::map& blobs, + const vector& inShapes) const { + auto binConvLayer = dynamic_cast(layer); + if (!binConvLayer) + THROW_IE_EXCEPTION << "Layer is not instance of BinaryConvolutionLayer class"; +} + +void BinaryConvolutionValidator::checkShapes(const CNNLayer* layer, const std::vector& inShapes) const { + checkNumOfInput(inShapes, {1}); +} } // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/ie_layer_validators.hpp b/inference-engine/src/inference_engine/ie_layer_validators.hpp index 6361b4f..94a0a67 100644 --- a/inference-engine/src/inference_engine/ie_layer_validators.hpp +++ b/inference-engine/src/inference_engine/ie_layer_validators.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -47,8 +47,7 @@ public: * @note: This function doesn't touch ins and out Data of the layer. * Throws exception if the check fails */ - virtual void checkShapes(const CNNLayer* layer, - const std::vector& inShapes) const {} + virtual void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const {} /** * @brief Checks correspondence of all parameters in the aggregate, except output shapes. 
@@ -86,41 +85,6 @@ private: InferenceEngine::details::caseless_unordered_map _validators; }; -static void checkWeakData(const DataWeakPtr& data) { -} - -static void checkData(const DataPtr& data) { -} - - -/** - * @brief Checks that input Data is not empty and pointers are not null, number of inputs correspond number of input shapes, dimensions in Data are not empty - */ -static void checkInputs(const CNNLayer* layer, const std::vector& inShapes) { - // TODO: not finished implementation - if (layer->insData.size() != inShapes.size()) - THROW_IE_EXCEPTION << "Number of layer's inputs don't correspond number of new input shapes"; - - auto inData = layer->insData[0].lock(); - bool isCorrect = false; - SizeVector inDims, inShape; - if (inData) { - inDims = inData->getDims(); - inShape = inShapes[0]; - isCorrect = inShape.size() == inDims.size() && !inShape.empty() && !inDims.empty(); - } - - if (!isCorrect) - THROW_IE_EXCEPTION << " Failed with invalid shapes: shapes are empty" - << "new input shape size=" << inShape.size() << ", input shape size in IR=" - << inDims.size(); -} - -/** - * @brief Checks that output Data is not empty and pointers are not null, number of outputs correspond number of output shapes, dimensions in Data are not empty - */ -static void checkOutputs(const CNNLayer* layer, const std::vector& outShapes) {} - static void getInOutShapes(const CNNLayer* layer, InOutDims& inOutShapes) { inOutShapes.inDims.clear(); inOutShapes.outDims.clear(); @@ -155,6 +119,8 @@ public: void checkCorrespondence(const CNNLayer* layer, const std::map& blobs, const std::vector& inShapes) const override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; }; class INFERENCE_ENGINE_API_CLASS(DeconvolutionValidator) : public ConvolutionValidator { @@ -168,6 +134,8 @@ public: void checkCorrespondence(const CNNLayer* layer, const std::map& blobs, const std::vector& inShapes) const override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; }; @@ -177,6 +145,8 @@ public: void checkParams(const CNNLayer* layer) override; + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; + explicit PoolingValidator(const std::string& _type); }; @@ -191,6 +161,8 @@ public: void checkCorrespondence(const CNNLayer* layer, const std::map& blobs, const std::vector& inShapes) const override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; }; class INFERENCE_ENGINE_API_CLASS(CropValidator) : public LayerValidator { @@ -211,6 +183,8 @@ public: void parseParams(CNNLayer* layer) override; void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; }; class INFERENCE_ENGINE_API_CLASS(BatchNormalizationValidator) : public LayerValidator { @@ -220,6 +194,8 @@ public: void parseParams(CNNLayer* layer) override; void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; }; class INFERENCE_ENGINE_API_CLASS(PowerValidator) : public LayerValidator { @@ -229,6 +205,8 @@ public: void parseParams(CNNLayer* layer) override; void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; }; class INFERENCE_ENGINE_API_CLASS(PReLUValidator) : public LayerValidator { @@ -238,6 +216,8 @@ public: void parseParams(CNNLayer* layer) override; void checkParams(const 
CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; }; class INFERENCE_ENGINE_API_CLASS(ScaleShiftValidator) : public LayerValidator { @@ -247,6 +227,8 @@ public: void parseParams(CNNLayer* layer) override; void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; }; class INFERENCE_ENGINE_API_CLASS(ReshapeValidator) : public LayerValidator { @@ -265,6 +247,8 @@ public: void parseParams(CNNLayer* layer) override; void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; }; class INFERENCE_ENGINE_API_CLASS(ClampValidator) : public LayerValidator { @@ -273,7 +257,7 @@ public: void parseParams(CNNLayer* layer) override; - void checkParams(const CNNLayer* layer) override; + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; }; class INFERENCE_ENGINE_API_CLASS(ReLUValidator) : public LayerValidator { @@ -283,6 +267,8 @@ public: void parseParams(CNNLayer* layer) override; void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; }; class INFERENCE_ENGINE_API_CLASS(MVNValidator) : public LayerValidator { @@ -292,6 +278,8 @@ public: void parseParams(CNNLayer* layer) override; void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; }; class INFERENCE_ENGINE_API_CLASS(GRNValidator) : public LayerValidator { @@ -301,6 +289,8 @@ public: void parseParams(CNNLayer* layer) override; void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; }; class INFERENCE_ENGINE_API_CLASS(SoftMaxValidator) : public LayerValidator { @@ -310,6 +300,8 @@ public: void parseParams(CNNLayer* layer) override; void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; }; class INFERENCE_ENGINE_API_CLASS(NormValidator) : public LayerValidator { @@ -319,6 +311,8 @@ public: void parseParams(CNNLayer* layer) override; void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; }; class INFERENCE_ENGINE_API_CLASS(SplitValidator) : public LayerValidator { @@ -376,9 +370,31 @@ public: void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; }; -class INFERENCE_ENGINE_API_CLASS(RNNValidator) : public LayerValidator { +class INFERENCE_ENGINE_API_CLASS(StridedSliceValidator) : public LayerValidator { +public: + explicit StridedSliceValidator(const std::string& _type); + + void parseParams(CNNLayer* layer) override; + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(ShuffleChannelsValidator) : public LayerValidator { +public: + explicit ShuffleChannelsValidator(const std::string& _type); + + void parseParams(CNNLayer* layer) override; + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(DepthToSpaceValidator) : public LayerValidator { public: - explicit RNNValidator(const std::string& _type); + 
explicit DepthToSpaceValidator(const std::string& _type); void parseParams(CNNLayer* layer) override; @@ -387,6 +403,412 @@ public: void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; }; +class INFERENCE_ENGINE_API_CLASS(SpaceToDepthValidator) : public LayerValidator { +public: + explicit SpaceToDepthValidator(const std::string& _type); + + void parseParams(CNNLayer* layer) override; + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(ReverseSequenceValidator) : public LayerValidator { +public: + explicit ReverseSequenceValidator(const std::string& _type); + + void parseParams(CNNLayer* layer) override; + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(SqueezeValidator) : public LayerValidator { +public: + explicit SqueezeValidator(const std::string& _type); + + void parseParams(CNNLayer* layer) override; + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(UnsqueezeValidator) : public LayerValidator { +public: + explicit UnsqueezeValidator(const std::string& _type); + + void parseParams(CNNLayer* layer) override; + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(RangeValidator) : public LayerValidator { +public: + explicit RangeValidator(const std::string& _type); + + void parseParams(CNNLayer* layer) override; + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(FillValidator) : public LayerValidator { +public: + explicit FillValidator(const std::string& _type); + + void parseParams(CNNLayer* layer) override; + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(ExpandValidator) : public LayerValidator { +public: + explicit ExpandValidator(const std::string& _type); + + void parseParams(CNNLayer* layer) override; + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +template +class INFERENCE_ENGINE_API_CLASS(RNNBaseValidator) : public LayerValidator { +public: + explicit RNNBaseValidator(const std::string& _type); + + void parseParams(CNNLayer* layer) override; + + void checkParams(const CNNLayer* layer) override; + + void checkCorrespondence(const CNNLayer* layer, + const std::map& blobs, + const std::vector& inShapes) const override; + +protected: + static std::vector def_acts; // Default values for cell gate activations + static std::vector def_alpha; // Default activation alpha parameter + static std::vector def_beta; // Default activation beta parameter + static size_t G; // gate number + static size_t NS; // state number +}; + +template +class INFERENCE_ENGINE_API_CLASS(RNNCellValidator) : public RNNBaseValidator { +public: + explicit RNNCellValidator(const std::string& _type); + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +extern 
template class INFERENCE_ENGINE_API_CLASS(RNNCellValidator); +extern template class INFERENCE_ENGINE_API_CLASS(RNNCellValidator); +extern template class INFERENCE_ENGINE_API_CLASS(RNNCellValidator); + +template +class INFERENCE_ENGINE_API_CLASS(RNNSequenceValidator) : public RNNBaseValidator { +public: + explicit RNNSequenceValidator(const std::string& _type); + + void parseParams(CNNLayer* layer) override; + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +extern template class INFERENCE_ENGINE_API_CLASS(RNNSequenceValidator); +extern template class INFERENCE_ENGINE_API_CLASS(RNNSequenceValidator); +extern template class INFERENCE_ENGINE_API_CLASS(RNNSequenceValidator); + +class INFERENCE_ENGINE_API_CLASS(ArgMaxValidator) : public LayerValidator { +public: + explicit ArgMaxValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(CTCGreedyDecoderValidator) : public LayerValidator { +public: + explicit CTCGreedyDecoderValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(DetectionOutputValidator) : public LayerValidator { +public: + explicit DetectionOutputValidator(const std::string& _type); + + void parseParams(CNNLayer* layer) override; + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(InterpValidator) : public LayerValidator { +public: + explicit InterpValidator(const std::string& _type); + + void parseParams(CNNLayer* layer) override; + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(PermuteValidator) : public LayerValidator { +public: + explicit PermuteValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(PriorBoxValidator) : public LayerValidator { +public: + explicit PriorBoxValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(PriorBoxClusteredValidator) : public LayerValidator { +public: + explicit PriorBoxClusteredValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(ProposalValidator) : public LayerValidator { +public: + explicit ProposalValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(PSROIPoolingValidator) : public LayerValidator { +public: + explicit PSROIPoolingValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; 
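// Editorial sketch: the validators in this header all follow the same LayerValidator
// contract: parseParams() fills typed layer fields from CNNLayer::params, checkParams()
// range-checks those fields, and checkShapes() validates the ranks/dimensions of the
// inputs. A minimal, hedged sketch of a typical checkShapes() implementation, assuming
// the shape container is std::vector<SizeVector> (the two-input check is illustrative,
// not the actual rule):
//
//     void PSROIPoolingValidator::checkShapes(const CNNLayer* layer,
//                                             const std::vector<SizeVector>& inShapes) const {
//         if (inShapes.size() != 2)
//             THROW_IE_EXCEPTION << layer->name << ": PSROIPooling expects two inputs";
//     }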
+}; + +class INFERENCE_ENGINE_API_CLASS(RegionYoloValidator) : public LayerValidator { +public: + explicit RegionYoloValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(ReorgYoloValidator) : public LayerValidator { +public: + explicit ReorgYoloValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(ResampleValidator) : public LayerValidator { +public: + explicit ResampleValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(ROIPoolingValidator) : public LayerValidator { +public: + explicit ROIPoolingValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(SimplerNMSValidator) : public LayerValidator { +public: + explicit SimplerNMSValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(SpatialTransformerValidator) : public LayerValidator { +public: + explicit SpatialTransformerValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(UpsamplingValidator) : public LayerValidator { +public: + explicit UpsamplingValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(ActivationValidator) : public LayerValidator { +public: + explicit ActivationValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(ConstValidator) : public LayerValidator { +public: + explicit ConstValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(ELUValidator) : public LayerValidator { +public: + explicit ELUValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(InputValidator) : public LayerValidator { +public: + explicit InputValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(MemoryValidator) : public LayerValidator { +public: + explicit MemoryValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class 
INFERENCE_ENGINE_API_CLASS(NormalizeValidator) : public LayerValidator { +public: + explicit NormalizeValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(CopyValidator) : public LayerValidator { +public: + explicit CopyValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(PowerFileValidator) : public LayerValidator { +public: + explicit PowerFileValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(ReLU6Validator) : public LayerValidator { +public: + explicit ReLU6Validator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(SigmoidValidator) : public LayerValidator { +public: + explicit SigmoidValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(TanHValidator) : public LayerValidator { +public: + explicit TanHValidator(const std::string& _type); + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(UnpoolingValidator) : public LayerValidator { +public: + explicit UnpoolingValidator(const std::string& _type); + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(QuantizeValidator) : public LayerValidator { +public: + explicit QuantizeValidator(const std::string& _type); + + void parseParams(CNNLayer* layer) override; + + void checkParams(const CNNLayer* layer) override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + +class INFERENCE_ENGINE_API_CLASS(BinaryConvolutionValidator) : public LayerValidator { +public: + void parseParams(CNNLayer* layer) override; + + void checkParams(const CNNLayer* layer) override; + + explicit BinaryConvolutionValidator(const std::string& _type); + + void checkCorrespondence(const CNNLayer* layer, + const std::map& blobs, + const std::vector& inShapes) const override; + + void checkShapes(const CNNLayer* layer, const std::vector& inShapes) const override; +}; + template class ValidatorRegisterBase { public: @@ -398,34 +820,79 @@ public: #define REG_LAYER_VALIDATOR_FOR_TYPE(__validator, __type) \ static ValidatorRegisterBase<__validator> __reg__##__type(#__type) +REG_LAYER_VALIDATOR_FOR_TYPE(ActivationValidator, Activation); +REG_LAYER_VALIDATOR_FOR_TYPE(ArgMaxValidator, ArgMax); +REG_LAYER_VALIDATOR_FOR_TYPE(BatchNormalizationValidator, BatchNormalization); +REG_LAYER_VALIDATOR_FOR_TYPE(CTCGreedyDecoderValidator, CTCGreedyDecoder); +REG_LAYER_VALIDATOR_FOR_TYPE(ClampValidator, Clamp); +REG_LAYER_VALIDATOR_FOR_TYPE(ConcatValidator, Concat); +REG_LAYER_VALIDATOR_FOR_TYPE(ConstValidator, Const); REG_LAYER_VALIDATOR_FOR_TYPE(ConvolutionValidator, Convolution); +REG_LAYER_VALIDATOR_FOR_TYPE(CopyValidator, 
Copy); +REG_LAYER_VALIDATOR_FOR_TYPE(CropValidator, Crop); REG_LAYER_VALIDATOR_FOR_TYPE(DeconvolutionValidator, Deconvolution); -REG_LAYER_VALIDATOR_FOR_TYPE(PoolingValidator, Pooling); +REG_LAYER_VALIDATOR_FOR_TYPE(DetectionOutputValidator, DetectionOutput); +REG_LAYER_VALIDATOR_FOR_TYPE(ELUValidator, ELU); +REG_LAYER_VALIDATOR_FOR_TYPE(EltwiseValidator, Eltwise); REG_LAYER_VALIDATOR_FOR_TYPE(FullyConnectedValidator, InnerProduct); REG_LAYER_VALIDATOR_FOR_TYPE(FullyConnectedValidator, FullyConnected); -REG_LAYER_VALIDATOR_FOR_TYPE(CropValidator, Crop); -REG_LAYER_VALIDATOR_FOR_TYPE(BatchNormalizationValidator, BatchNormalization); -REG_LAYER_VALIDATOR_FOR_TYPE(PowerValidator, Power); +REG_LAYER_VALIDATOR_FOR_TYPE(GRNValidator, GRN); +REG_LAYER_VALIDATOR_FOR_TYPE(InputValidator, Input); +REG_LAYER_VALIDATOR_FOR_TYPE(InterpValidator, Interp); +REG_LAYER_VALIDATOR_FOR_TYPE(MVNValidator, MVN); +REG_LAYER_VALIDATOR_FOR_TYPE(MemoryValidator, Memory); +REG_LAYER_VALIDATOR_FOR_TYPE(NormValidator, Norm); +REG_LAYER_VALIDATOR_FOR_TYPE(NormValidator, LRN); +REG_LAYER_VALIDATOR_FOR_TYPE(NormalizeValidator, Normalize); REG_LAYER_VALIDATOR_FOR_TYPE(PReLUValidator, PReLU); -REG_LAYER_VALIDATOR_FOR_TYPE(ScaleShiftValidator, ScaleShift); -REG_LAYER_VALIDATOR_FOR_TYPE(TileValidator, Tile); +REG_LAYER_VALIDATOR_FOR_TYPE(PSROIPoolingValidator, PSROIPooling); +REG_LAYER_VALIDATOR_FOR_TYPE(PermuteValidator, Permute); +REG_LAYER_VALIDATOR_FOR_TYPE(PoolingValidator, Pooling); +REG_LAYER_VALIDATOR_FOR_TYPE(PowerValidator, Power); +REG_LAYER_VALIDATOR_FOR_TYPE(PowerFileValidator, PowerFile); +REG_LAYER_VALIDATOR_FOR_TYPE(PriorBoxClusteredValidator, PriorBoxClustered); +REG_LAYER_VALIDATOR_FOR_TYPE(PriorBoxValidator, PriorBox); +REG_LAYER_VALIDATOR_FOR_TYPE(ProposalValidator, Proposal); +REG_LAYER_VALIDATOR_FOR_TYPE(ROIPoolingValidator, ROIPooling); +REG_LAYER_VALIDATOR_FOR_TYPE(ReLUValidator, ReLU); +REG_LAYER_VALIDATOR_FOR_TYPE(ReLU6Validator, ReLU6); +REG_LAYER_VALIDATOR_FOR_TYPE(RegionYoloValidator, RegionYolo); +REG_LAYER_VALIDATOR_FOR_TYPE(ReorgYoloValidator, ReorgYolo); +REG_LAYER_VALIDATOR_FOR_TYPE(ResampleValidator, Resample); REG_LAYER_VALIDATOR_FOR_TYPE(ReshapeValidator, Reshape); REG_LAYER_VALIDATOR_FOR_TYPE(ReshapeValidator, Flatten); -REG_LAYER_VALIDATOR_FOR_TYPE(EltwiseValidator, Eltwise); -REG_LAYER_VALIDATOR_FOR_TYPE(ClampValidator, Clamp); -REG_LAYER_VALIDATOR_FOR_TYPE(ReLUValidator, ReLU); -REG_LAYER_VALIDATOR_FOR_TYPE(MVNValidator, MVN); -REG_LAYER_VALIDATOR_FOR_TYPE(GRNValidator, GRN); +REG_LAYER_VALIDATOR_FOR_TYPE(ScaleShiftValidator, ScaleShift); +REG_LAYER_VALIDATOR_FOR_TYPE(SigmoidValidator, Sigmoid); +REG_LAYER_VALIDATOR_FOR_TYPE(SigmoidValidator, Logistic); +REG_LAYER_VALIDATOR_FOR_TYPE(SimplerNMSValidator, SimplerNMS); REG_LAYER_VALIDATOR_FOR_TYPE(SoftMaxValidator, SoftMax); -REG_LAYER_VALIDATOR_FOR_TYPE(NormValidator, Norm); -REG_LAYER_VALIDATOR_FOR_TYPE(NormValidator, LRN); +REG_LAYER_VALIDATOR_FOR_TYPE(SpatialTransformerValidator, SpatialTransformer); REG_LAYER_VALIDATOR_FOR_TYPE(SplitValidator, Split); REG_LAYER_VALIDATOR_FOR_TYPE(SplitValidator, Slice); -REG_LAYER_VALIDATOR_FOR_TYPE(ConcatValidator, Concat); REG_LAYER_VALIDATOR_FOR_TYPE(GemmValidator, Gemm); REG_LAYER_VALIDATOR_FOR_TYPE(PadValidator, Pad); REG_LAYER_VALIDATOR_FOR_TYPE(GatherValidator, Gather); -REG_LAYER_VALIDATOR_FOR_TYPE(RNNValidator, RNN); - +REG_LAYER_VALIDATOR_FOR_TYPE(StridedSliceValidator, StridedSlice); +REG_LAYER_VALIDATOR_FOR_TYPE(ShuffleChannelsValidator, ShuffleChannels); 
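// Each REG_LAYER_VALIDATOR_FOR_TYPE entry in this table is a file-scope static whose
// constructor registers the validator for the quoted layer-type name during static
// initialization. Per the macro defined above, the Crop entry, for example, expands to:
//
//     static ValidatorRegisterBase<CropValidator> __reg__Crop("Crop");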
+REG_LAYER_VALIDATOR_FOR_TYPE(DepthToSpaceValidator, DepthToSpace); +REG_LAYER_VALIDATOR_FOR_TYPE(SpaceToDepthValidator, SpaceToDepth); +REG_LAYER_VALIDATOR_FOR_TYPE(ReverseSequenceValidator, ReverseSequence); +REG_LAYER_VALIDATOR_FOR_TYPE(RNNCellValidator, RNNCell); +REG_LAYER_VALIDATOR_FOR_TYPE(RNNCellValidator, GRUCell); +REG_LAYER_VALIDATOR_FOR_TYPE(RNNCellValidator, LSTMCell); +REG_LAYER_VALIDATOR_FOR_TYPE(RNNSequenceValidator, RNNSequence); +REG_LAYER_VALIDATOR_FOR_TYPE(RNNSequenceValidator, GRUSequence); +REG_LAYER_VALIDATOR_FOR_TYPE(RNNSequenceValidator, LSTMSequence); +REG_LAYER_VALIDATOR_FOR_TYPE(SqueezeValidator, Squeeze); +REG_LAYER_VALIDATOR_FOR_TYPE(UnsqueezeValidator, Unsqueeze); +REG_LAYER_VALIDATOR_FOR_TYPE(RangeValidator, Range); +REG_LAYER_VALIDATOR_FOR_TYPE(FillValidator, Fill); +REG_LAYER_VALIDATOR_FOR_TYPE(ExpandValidator, Expand); +REG_LAYER_VALIDATOR_FOR_TYPE(TanHValidator, TanH); +REG_LAYER_VALIDATOR_FOR_TYPE(TileValidator, Tile); +REG_LAYER_VALIDATOR_FOR_TYPE(UnpoolingValidator, Unpooling); +REG_LAYER_VALIDATOR_FOR_TYPE(UpsamplingValidator, Upsampling); +REG_LAYER_VALIDATOR_FOR_TYPE(QuantizeValidator, Quantize); +REG_LAYER_VALIDATOR_FOR_TYPE(BinaryConvolutionValidator, BinaryConvolution); } // namespace details } // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/ie_layers_internal.cpp b/inference-engine/src/inference_engine/ie_layers_internal.cpp index 55fb626..c995966 100644 --- a/inference-engine/src/inference_engine/ie_layers_internal.cpp +++ b/inference-engine/src/inference_engine/ie_layers_internal.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -98,7 +98,8 @@ class PaddingsUpdater { Paddings getPaddingsImpl(const CNNLayer &layer) { Paddings actual; - details::visitActualLayer(std::tuple (), layer, PaddingsUpdater(actual)); + details::visitActualLayer(std::tuple (), layer, PaddingsUpdater(actual)); return actual; } diff --git a/inference-engine/src/inference_engine/ie_layers_internal.hpp b/inference-engine/src/inference_engine/ie_layers_internal.hpp index 296b565..562bacb 100644 --- a/inference-engine/src/inference_engine/ie_layers_internal.hpp +++ b/inference-engine/src/inference_engine/ie_layers_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -28,6 +28,7 @@ template inline typename std::enable_if::value, Paddings>::type getPaddings(const T & layer) { return getPaddingsImpl(layer); diff --git a/inference-engine/src/inference_engine/ie_layers_prv.h b/inference-engine/src/inference_engine/ie_layers_prv.h deleted file mode 100644 index 9ec8c3c..0000000 --- a/inference-engine/src/inference_engine/ie_layers_prv.h +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -/** - * @brief a header file for internal Layers structure - * @file - */ -#pragma once - -#include "ie_layers.h" -#include - -namespace InferenceEngine { - -/** - * LSTM Cell Layer - * - * Inputs: - * Xt {N, D} - * Ht-1 {N, S} - * Ct-1 {N, S} - * - * Outputs: - * Ht {N, S} - * Ct {N, S} - * - * Weights: - * W {G=4, S, D+S} - * B {G=4, S} - * - * G=4 and gate order is [f,i,c,o] - * - * Semantic: - * - * * - matrix mult - * (.) 
- eltwise mult - * [,] - concatenation - * - * f = sigmoid - * h = tanh - * - * - ft = f(Wf*[Ht-1, Xt] + Bf) - * - it = f(Wi*[Ht-1, Xt] + Bi) - * - ct = h(Wc*[Ht-1, Xt] + Bc) - * - ot = f(Wo*[Ht-1, Xt] + Bo) - * - Ct = ft (.) Ct-1 + it (.) ct - * - Ht = ot (.) h(Ct) - */ -class LSTMCell : public WeightableLayer { -public: - using WeightableLayer::WeightableLayer; -}; - -/** - * @brief This class represents RNN-Sequence layer - * - * Date shapes and meaning (cellType = "LSTM", axis = 1): - * input[0] Xt - {N,T,DC} input data sequence - * input[1] H0 - {N,SC} initial hidden state - * input[2] C0 - {N,SC} initial cell state - * - * output[0] Ht - {N,T,SC} out data sequence - * output[1] HT - {N,SC} last hidden state - * output[2] CT - {N,SC} last cell state - * - * Recurrent formula and weight format are same as from - * corresponding Cell primitive. - */ -class RNNLayer : public WeightableLayer { -public: - /** - * @brief Type of RNN cell used sequence layer - * Possible values "RNN", "LSTM", "GRU". - */ - std::string cellType = "LSTM"; - - /** - * @brief An axis by which iteration is performed - * axis=0 means first input/output data blob dimension is sequence - * axis=1 means first input/output data blob dimension is batch - */ - unsigned int axis = 1; - - /** - * @brief Direction of iteration through sequence dimension - */ - enum Direction { - RNN_FWD, /**< Forward mode. Iterate starts from index 0 with step 1. */ - RNN_BWD, /**< Backward mode. Iterate starts from last index with step -1. */ - RNN_BDR /**< Bidirectional mode. First is forward pass, second is backward. */ - }; - - Direction direction = RNN_FWD; - - using WeightableLayer::WeightableLayer; -}; - -} // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/ie_layouts.cpp b/inference-engine/src/inference_engine/ie_layouts.cpp index 63cbc16..a0ecfb0 100644 --- a/inference-engine/src/inference_engine/ie_layouts.cpp +++ b/inference-engine/src/inference_engine/ie_layouts.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -54,6 +54,9 @@ TensorDesc::TensorDesc(const Precision &precision, SizeVector dims, const Blocki layout = Layout::BLOCKED; if (dims.size() == blockingDesc.getBlockDims().size()) { switch (dims.size()) { + case 0: + layout = Layout::SCALAR; + break; case 1: layout = Layout::C; break; @@ -97,6 +100,7 @@ TensorDesc::TensorDesc(const Precision &precision, SizeVector dims, const Blocki TensorDesc::TensorDesc() { this->layout = Layout::ANY; + precision = Precision::UNSPECIFIED; } void TensorDesc::setDims(const SizeVector &dims) { @@ -129,6 +133,8 @@ bool TensorDesc::operator!=(const TensorDesc &rhs) const { Layout TensorDesc::getLayoutByDims(SizeVector dims) { switch (dims.size()) { + case 0: + return Layout::SCALAR; case 1: return Layout::C; case 2: @@ -246,6 +252,7 @@ BlockingDesc::BlockingDesc(const SizeVector& dims, Layout layout): offsetPadding SizeVector l_order; SizeVector l_dims; switch (layout) { + case Layout::SCALAR: case Layout::ANY: return; case Layout::C: diff --git a/inference-engine/src/inference_engine/ie_memcpy.cpp b/inference-engine/src/inference_engine/ie_memcpy.cpp index 330c0f2..d5b1627 100644 --- a/inference-engine/src/inference_engine/ie_memcpy.cpp +++ b/inference-engine/src/inference_engine/ie_memcpy.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git 
a/inference-engine/src/inference_engine/ie_memcpy.h b/inference-engine/src/inference_engine/ie_memcpy.h index ab174de..a91adfa 100644 --- a/inference-engine/src/inference_engine/ie_memcpy.h +++ b/inference-engine/src/inference_engine/ie_memcpy.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/ie_network.cpp b/inference-engine/src/inference_engine/ie_network.cpp index 3c92b99..c2db484 100644 --- a/inference-engine/src/inference_engine/ie_network.cpp +++ b/inference-engine/src/inference_engine/ie_network.cpp @@ -1,161 +1,126 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // -#include "ie_network.hpp" -#include
-#include
-#include -#include -#include +#include #include +#include +#include using namespace InferenceEngine; -details::Network &details::Network::operator=(const details::Network &network) { - if (this == &network) - return *this; - name = network.getName(); - for (const auto& layer : network) { - layers.push_back(Layer::Ptr(new details::Layer(*layer))); - } - for (const auto& connection : network.connections) { - connections.push_back(connection); +PortData::PortData() { + createData({}); +} + +PortData::PortData(const SizeVector& shape, const Precision& precision) { + createData({precision, shape, TensorDesc::getLayoutByDims(shape)}); +} + +const Blob::Ptr& PortData::getData() const { + return data; +} + +void PortData::setData(const Blob::Ptr& data) { + this->data = data; +} + +const std::map& PortData::getParameters() const noexcept { + return parameters; +} + +void PortData::createData(const TensorDesc& desc) { + switch (desc.getPrecision()) { + case Precision::UNSPECIFIED: + data = std::make_shared>(desc); + break; + case Precision::FP32: + data = make_shared_blob::value_type>(desc); + break; + case Precision::FP16: + data = make_shared_blob::value_type>(desc); + break; + case Precision::Q78: + data = make_shared_blob::value_type>(desc); + break; + case Precision::I16: + data = make_shared_blob::value_type>(desc); + break; + case Precision::U8: + data = make_shared_blob::value_type>(desc); + break; + case Precision::I8: + data = make_shared_blob::value_type>(desc); + break; + case Precision::U16: + data = make_shared_blob::value_type>(desc); + break; + case Precision::I32: + data = make_shared_blob::value_type>(desc); + break; + default: + THROW_IE_EXCEPTION << "Unsupported precisions!"; } - return *this; -} - -details::Network &details::Network::operator=(const INetwork &network) { - if (this == &network) - return *this; - name = network.getName(); - for (const auto& layer : network) { - layers.push_back(std::make_shared(*layer)); - for (const auto& newConnection : network.getLayerConnections(layer->getId())) { - bool connectionFound = false; - for (const auto& connection : connections) { - if (connection == newConnection) { - connectionFound = true; - break; - } - } - if (!connectionFound) - connections.push_back(newConnection); - } - } - return *this; -} - -details::Network::Network(const Context& context, const std::string& name): ctx(context), name(name) {} - -details::Network::Network(const Context& context, const details::Network &network): ctx(context) { - *this = network; -} - -details::Network::Network(const Context& context, const INetwork &network): ctx(context) { - *this = network; } -size_t details::Network::size() const noexcept { - return static_cast(std::distance(std::begin(*this), std::end(*this))); +void PortData::setShape(const SizeVector& shape) { + TensorDesc desc = data->getTensorDesc(); + if (desc.getDims() == shape) + return; + if (data->cbuffer() != nullptr) { + THROW_IE_EXCEPTION << "Cannot change shape for allocated data!"; + } + createData({desc.getPrecision(), shape, TensorDesc::getLayoutByDims(shape)}); } -const std::string& details::Network::getName() const noexcept { - return name; +Port::Port() { + data = std::make_shared(); } -std::string& details::Network::getName() noexcept { - return name; +Port::Port(const SizeVector& shapes, const Precision& precision) { + data = std::make_shared(shapes, precision); } - -const Context& details::Network::getContext() const noexcept { - return ctx; +Port::Port(const Port& port) { + parameters = port.parameters; + data = 
port.data; } -const ILayer::Ptr details::Network::getLayer(size_t id) const noexcept { - for (const auto& layer : layers) { - if (layer->getId() == id) - return std::static_pointer_cast(layer); - } - return nullptr; -} - -const std::vector details::Network::getInputs() const noexcept { - std::vector inputs; - for (const auto& layer : layers) { - bool isInputLayer = true; - for (const auto& connection : getLayerConnections(layer->getId())) { - if (connection.to().layerId() == layer->getId()) { - isInputLayer = false; - break; - } - } - if (isInputLayer) { - inputs.push_back(layer); - } - } - return inputs; -} - -const std::vector details::Network::getOutputs() const noexcept { - std::vector outputs; - for (const auto& layer : layers) { - bool isOutputLayer = true; - for (const auto& connection : getLayerConnections(layer->getId())) { - if (connection.from().layerId() == layer->getId()) { - isOutputLayer = false; - break; - } - } - if (isOutputLayer) { - outputs.push_back(layer); - } - } - return outputs; -} -const std::vector& details::Network::getConnections() const noexcept { - return connections; +bool Port::operator==(const Port& rhs) const { + return parameters == rhs.parameters && + data == rhs.data; } -details::Layer::Ptr details::Network::getLayer(size_t id) noexcept { - for (const auto& layer : layers) { - if (layer->getId() == id) - return layer; - } - return nullptr; +bool Port::operator!=(const Port& rhs) const { + return !(rhs == *this); } -const std::vector details::Network::getLayerConnections(idx_t layerId) const noexcept { - std::vector layerConnections; - for (auto& connection : connections) { - if (connection.from().layerId() == layerId || connection.to().layerId() == layerId) - layerConnections.push_back(connection); - } - return layerConnections; +const SizeVector& Port::shape() const noexcept { + return data->getData()->getTensorDesc().getDims(); } -void details::Network::addLayer(const ILayer::Ptr &layer) noexcept { - if (layer) - layers.push_back(std::make_shared(*layer)); +void Port::setShape(const SizeVector& shape) { + data->setShape(shape); } -void details::Network::addConnection(const Connection &connection) noexcept { - connections.push_back(connection); +const std::map& Port::getParameters() const noexcept { + return parameters; } -INetwork::const_iterator details::Network::begin() const noexcept { - return INetwork::const_iterator(this); +void Port::setParameters(const std::map& params) noexcept { + parameters = params; } -INetwork::const_iterator details::Network::end() const noexcept { - return INetwork::const_iterator(this, true); +void Port::setParameter(const std::string& name, const Parameter& param) { + parameters[name] = param; } -details::Network::iterator details::Network::begin() noexcept { - return Network::iterator(this); +const PortData::Ptr& Port::getData() const noexcept { + return data; } -details::Network::iterator details::Network::end() noexcept { - return Network::iterator(this, true); -} +void Port::setData(const PortData::Ptr& data) { + if (!data) + return; + this->data = data; +} \ No newline at end of file diff --git a/inference-engine/src/inference_engine/ie_network.hpp b/inference-engine/src/inference_engine/ie_network.hpp deleted file mode 100644 index 16a80f7..0000000 --- a/inference-engine/src/inference_engine/ie_network.hpp +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace 
InferenceEngine { -namespace details { - -class Network; - -class Parameters: public IParameters { -public: - using Ptr = std::shared_ptr; - - const std::map& getParameters() const noexcept override { - return params; - } - const std::map& getConstantData() const noexcept override { - return constData; - } - - std::map& getParameters() { - return params; - } - std::map& getConstantData() noexcept { - return constData; - } -private: - std::map params; - std::map constData; -}; - -class Layer: public ILayer { -public: - using Ptr = std::shared_ptr; - - explicit Layer(size_t id): id(id), params(new Parameters()) {} - Layer(const Layer& layer) { - this->outputs = layer.getOutputPorts(); - this->inputs = layer.getInputPorts(); - this->params = layer.getParameters(); - this->subGraph = layer.getGraph(); - this->name = layer.getName(); - this->type = layer.getType(); - this->id = layer.getId(); - } - explicit Layer(const ILayer& layer) { - this->outputs = layer.getOutputPorts(); - this->inputs = layer.getInputPorts(); - this->params = layer.getParameters(); - this->subGraph = layer.getGraph(); - this->name = layer.getName(); - this->type = layer.getType(); - this->id = layer.getId(); - } - - size_t getId() const noexcept override { - return id; - } - const std::string& getName() const noexcept override { - return name; - } - const std::string& getType() const noexcept override { - return type; - } - const INetwork::Ptr& getGraph() const noexcept override { - return subGraph; - } - const IParameters::Ptr& getParameters() const noexcept override { - return params; - } - const std::vector& getInputPorts() const noexcept override { - return inputs; - } - const std::vector& getOutputPorts() const noexcept override { - return outputs; - } - - std::string& getName() noexcept { - return name; - } - - std::string& getType() noexcept { - return type; - } - std::shared_ptr getGraph() noexcept { - return std::dynamic_pointer_cast(subGraph); - } - void setGraph(const INetwork::Ptr& graph) noexcept { - subGraph = graph; - } - Parameters::Ptr getParameters() noexcept { - return std::dynamic_pointer_cast(params); - } - std::vector& getInputPorts() noexcept { - return inputs; - } - std::vector& getOutputPorts() noexcept { - return outputs; - } - -private: - idx_t id; - std::string name; - std::string type; - INetwork::Ptr subGraph; - IParameters::Ptr params; - std::vector inputs; - std::vector outputs; -}; - -class Network: public INetwork { -public: - using Ptr = std::shared_ptr; - using iterator = details::INetworkIterator; - - explicit Network(const Context& context, const std::string& name = ""); - Network(const Context& context, const INetwork& network); - Network(const Context& context, const Network& network); - - Network& operator=(const Network& network); - Network& operator=(const INetwork& network); - - const_iterator begin() const noexcept override; - const_iterator end() const noexcept override; - iterator begin() noexcept; - iterator end() noexcept; - - const ILayer::Ptr getLayer(size_t id) const noexcept override; - const std::vector getInputs() const noexcept override; - const std::vector getOutputs() const noexcept override; - const std::vector getLayerConnections(idx_t layerId) const noexcept override; - size_t size() const noexcept override; - const std::string& getName() const noexcept override; - const Context& getContext() const noexcept override; - - const std::vector& getConnections() const noexcept; - Layer::Ptr getLayer(size_t id) noexcept; - std::string& getName() noexcept; - - void 
addLayer(const ILayer::Ptr& layer) noexcept; - void addConnection(const Connection& connection) noexcept; - -private: - const Context ctx; - std::string name; - std::vector layers; - std::vector connections; -}; - -} // namespace details -} // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/ie_preprocess_data.cpp b/inference-engine/src/inference_engine/ie_preprocess_data.cpp index 11c3f9e..ca64d4b 100644 --- a/inference-engine/src/inference_engine/ie_preprocess_data.cpp +++ b/inference-engine/src/inference_engine/ie_preprocess_data.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -9,6 +9,7 @@ #include "ie_preprocess_data_sse42.hpp" #endif #include "ie_preprocess_gapi.hpp" +#include "debug.h" #include @@ -751,7 +752,8 @@ Blob::Ptr PreProcessData::getRoiBlob() const { return _roiBlob; } -void PreProcessData::execute(Blob::Ptr &outBlob, const ResizeAlgorithm &algorithm, bool serial) { +void PreProcessData::execute(Blob::Ptr &outBlob, const ResizeAlgorithm &algorithm, bool serial, + int batchSize) { IE_PROFILING_AUTO_SCOPE_TASK(perf_preprocessing) if (algorithm == NO_RESIZE) { @@ -762,13 +764,28 @@ void PreProcessData::execute(Blob::Ptr &outBlob, const ResizeAlgorithm &algorith THROW_IE_EXCEPTION << "Input pre-processing is called without ROI blob set"; } + if (batchSize == 0) { + THROW_IE_EXCEPTION << "Input pre-processing is called with invalid batch size " + << batchSize; + } + + if (batchSize < 0) { + // if batch_size is unspecified, process the whole input blob + batchSize = static_cast(_roiBlob->getTensorDesc().getDims()[0]); + } + if (!_preproc) { _preproc.reset(new PreprocEngine); } - if (_preproc->preprocessWithGAPI(_roiBlob, outBlob, algorithm, serial)) { + if (_preproc->preprocessWithGAPI(_roiBlob, outBlob, algorithm, serial, batchSize)) { return; } + if (batchSize > 1) { + THROW_IE_EXCEPTION << "Batch pre-processing is unsupported in this mode. " + "Use default pre-processing instead to process batches."; + } + Blob::Ptr res_in, res_out; if (_roiBlob->getTensorDesc().getLayout() == NHWC) { if (!_tmp1 || _tmp1->size() != _roiBlob->size()) { @@ -814,4 +831,21 @@ void PreProcessData::execute(Blob::Ptr &outBlob, const ResizeAlgorithm &algorith } } +void PreProcessData::isApplicable(const Blob::Ptr &src, const Blob::Ptr &dst) { + auto &src_dims = src->getTensorDesc().getDims(); + auto &dst_dims = dst->getTensorDesc().getDims(); + + if (src_dims.size() != dst_dims.size()) + THROW_IE_EXCEPTION << "Preprocessing is not applicable. Source and destination blobs have different " + "number of dimensions"; + + if (src_dims.size() != 4) + THROW_IE_EXCEPTION << "Preprocessing is not applicable. Only 4D tensors are supported."; + + if (src_dims[0] != dst_dims[0] || src_dims[1] != dst_dims[1]) + THROW_IE_EXCEPTION << "Preprocessing is not applicable. Wrong shape. 
Network expected 4D input tensor with " + "shape [" << dst_dims[0] << "," << dst_dims[1] << ",H,W] but provided tensor has " + "shape " << details::dumpVec(src_dims) << "."; +} + } // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/ie_preprocess_data.hpp b/inference-engine/src/inference_engine/ie_preprocess_data.hpp index f5a7730..479e542 100644 --- a/inference-engine/src/inference_engine/ie_preprocess_data.hpp +++ b/inference-engine/src/inference_engine/ie_preprocess_data.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -55,8 +55,13 @@ public: * @brief Executes input pre-processing with a given resize algorithm. * @param outBlob pre-processed output blob to be used for inference. * @param algorithm resize algorithm. + * @param serial disable OpenMP threading if the value is set to true. + * @param batchSize batch size for pre-processing. */ - void execute(Blob::Ptr &outBlob, const ResizeAlgorithm &algorithm, bool serial); + void execute(Blob::Ptr &outBlob, const ResizeAlgorithm &algorithm, bool serial, + int batchSize = -1); + + static void isApplicable(const Blob::Ptr &src, const Blob::Ptr &dst); }; //---------------------------------------------------------------------- diff --git a/inference-engine/src/inference_engine/ie_preprocess_gapi.cpp b/inference-engine/src/inference_engine/ie_preprocess_gapi.cpp index 31f5983..b6624b5 100644 --- a/inference-engine/src/inference_engine/ie_preprocess_gapi.cpp +++ b/inference-engine/src/inference_engine/ie_preprocess_gapi.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -72,27 +72,43 @@ inline int get_cv_depth(const InferenceEngine::TensorDesc &ie_desc) { } } -std::vector bind_to_blob(Blob::Ptr &blob) { +std::vector> bind_to_blob(Blob::Ptr &blob, int batch_size) { + if (batch_size <= 0) { + return {}; + } + const auto& ie_desc = blob->getTensorDesc(); const auto& ie_desc_blk = ie_desc.getBlockingDesc(); const auto desc = G::decompose(blob); const auto cv_depth = get_cv_depth(ie_desc); const auto stride = desc.s.H*blob->element_size(); const auto planeSize = cv::gapi::own::Size(desc.d.W, desc.d.H); - - - uint8_t* ptr = static_cast(blob->buffer()); - ptr += blob->element_size()*ie_desc_blk.getOffsetPadding(); - - std::vector result; - if (blob->layout() == NHWC) { - result.emplace_back(planeSize.height, planeSize.width, CV_MAKETYPE(cv_depth, desc.d.C), ptr, stride); - } else { // NCHW - const auto planeType = CV_MAKETYPE(cv_depth, 1); - for (size_t ch = 0; ch < desc.d.C; ch++) { - cv::gapi::own::Mat plane(planeSize.height, planeSize.width, planeType, ptr + ch*desc.s.C*blob->element_size(), stride); - result.emplace_back(plane); + // Note: operating with strides (desc.s) rather than dimensions (desc.d) which is vital for ROI + // blobs (data buffer is shared but dimensions are different due to ROI != original image) + const auto batch_offset = desc.s.N * blob->element_size(); + + std::vector> result(batch_size); + + uint8_t* blob_ptr = static_cast(blob->buffer()); + blob_ptr += blob->element_size()*ie_desc_blk.getOffsetPadding(); + + for (int i = 0; i < batch_size; ++i) { + uint8_t* curr_data_ptr = blob_ptr + i * batch_offset; + + std::vector planes; + if (blob->layout() == NHWC) { + planes.emplace_back(planeSize.height, planeSize.width, CV_MAKETYPE(cv_depth, desc.d.C), + curr_data_ptr, stride); + } else { // NCHW + const
auto planeType = CV_MAKETYPE(cv_depth, 1); + for (size_t ch = 0; ch < desc.d.C; ch++) { + cv::gapi::own::Mat plane(planeSize.height, planeSize.width, planeType, + curr_data_ptr + ch*desc.s.C*blob->element_size(), stride); + planes.emplace_back(plane); + } } + + result[i] = std::move(planes); } return result; } @@ -203,13 +219,13 @@ InferenceEngine::PreprocEngine::Update InferenceEngine::PreprocEngine::needUpdat BlobDesc last_in; BlobDesc last_out; - ResizeAlgorithm last_algo; + ResizeAlgorithm last_algo = ResizeAlgorithm::NO_RESIZE; std::tie(last_in, last_out, last_algo) = *_lastCall; CallDesc newCall = newCallOrig; BlobDesc new_in; BlobDesc new_out; - ResizeAlgorithm new_algo; + ResizeAlgorithm new_algo = ResizeAlgorithm::NO_RESIZE; std::tie(new_in, new_out, new_algo) = newCall; // Declare two empty vectors per each call @@ -259,7 +275,8 @@ InferenceEngine::PreprocEngine::Update InferenceEngine::PreprocEngine::needUpdat return Update::NOTHING; } -bool InferenceEngine::PreprocEngine::preprocessWithGAPI(Blob::Ptr &inBlob, Blob::Ptr &outBlob, const ResizeAlgorithm &algorithm, bool omp_serial) { +bool InferenceEngine::PreprocEngine::preprocessWithGAPI(Blob::Ptr &inBlob, Blob::Ptr &outBlob, + const ResizeAlgorithm &algorithm, bool omp_serial, int batch_size) { static const bool NO_GAPI = [](const char *str) -> bool { std::string var(str ? str : ""); return var == "N" || var == "NO" || var == "OFF" || var == "0"; @@ -280,6 +297,20 @@ bool InferenceEngine::PreprocEngine::preprocessWithGAPI(Blob::Ptr &inBlob, Blob: in_desc = G::decompose(inBlob), out_desc = G::decompose(outBlob); + // according to the IE's current design, input blob batch size _must_ match the network's expected + // batch size, even if the actual processing batch size (set on infer request) is different. + if (in_desc.d.N != out_desc.d.N) { + THROW_IE_EXCEPTION << "Input blob batch size is invalid: (input blob) " + << in_desc.d.N << " != " << out_desc.d.N << " (expected by network)"; + } + + // sanity check batch_size + if (batch_size > in_desc.d.N || batch_size > out_desc.d.N) { + THROW_IE_EXCEPTION << "Provided batch size is invalid: (provided) " + << batch_size << " > " << out_desc.d.N << " (expected by network)"; + } + + // CallDesc doesn't change within batch CallDesc thisCall = CallDesc{ BlobDesc{ in_desc_ie.getPrecision(), inBlob->layout(), in_desc_ie.getDims() }, @@ -289,9 +320,6 @@ bool InferenceEngine::PreprocEngine::preprocessWithGAPI(Blob::Ptr &inBlob, Blob: algorithm }; const Update update = needUpdate(thisCall); - std::vector input_plane_mats = bind_to_blob(inBlob); - std::vector output_plane_mats = bind_to_blob(outBlob); - Opt _lastComputation; if (Update::REBUILD == update || Update::RESHAPE == update) { _lastCall = cv::util::make_optional(std::move(thisCall)); @@ -307,6 +335,8 @@ bool InferenceEngine::PreprocEngine::preprocessWithGAPI(Blob::Ptr &inBlob, Blob: get_cv_depth(in_desc_ie))); } } + auto batched_input_plane_mats = bind_to_blob(inBlob, batch_size); + auto batched_output_plane_mats = bind_to_blob(outBlob, batch_size); const int thread_num = #if IE_THREAD == IE_THREAD_OMP @@ -323,7 +353,7 @@ bool InferenceEngine::PreprocEngine::preprocessWithGAPI(Blob::Ptr &inBlob, Blob: // that an actual number of threads will be as assumed, so it is // possible that all slices are processed by the same thread. 
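// As a concrete reading of the batched bind_to_blob() above: for batch item i and
// channel ch of an NCHW blob, the plane base pointer works out to
//
//     buffer + element_size * (offset_padding + i * stride_N + ch * stride_C)
//
// with the strides taken from the blocking descriptor (desc.s), not from the
// dimensions (desc.d); that is what keeps ROI blobs correct, since an ROI shares
// its parent's buffer while reporting smaller dimensions.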
// - parallel_nt_static(thread_num , [&, this](int slice_n, const int total_slices){ + parallel_nt_static(thread_num , [&, this](int slice_n, const int total_slices) { IE_PROFILING_AUTO_SCOPE_TASK(_perf_exec_tile); auto& compiled = _lastComp[slice_n]; @@ -331,21 +361,28 @@ bool InferenceEngine::PreprocEngine::preprocessWithGAPI(Blob::Ptr &inBlob, Blob: // need to compile (or reshape) own object for a particular ROI IE_PROFILING_AUTO_SCOPE_TASK(_perf_graph_compiling); - auto meta_of = [](std::vector const& ins){ - std::vector rslt{ins.size()}; rslt.clear(); - for (auto& m : ins) { - rslt.emplace_back(descr_of(m)); - } - return rslt; - }; - using cv::gapi::own::Rect; - const auto lines_per_thread = output_plane_mats[0].rows / total_slices; + // current design implies all images in batch are equal + const auto& input_plane_mats = batched_input_plane_mats[0]; + const auto& output_plane_mats = batched_output_plane_mats[0]; + + auto lines_per_thread = output_plane_mats[0].rows / total_slices; const auto remainder = output_plane_mats[0].rows - total_slices * lines_per_thread; - const auto roi_height = lines_per_thread + ((slice_n == total_slices -1) ? remainder : 0); - auto roi = Rect{0, slice_n * lines_per_thread, output_plane_mats[0].cols, roi_height}; + // remainder shows how many threads must calculate 1 additional row. now these additions + // must also be addressed in rect's Y coordinate: + int roi_y = 0; + if (slice_n < remainder) { + lines_per_thread++; // 1 additional row + roi_y = slice_n * lines_per_thread; // all previous rois have lines+1 rows + } else { + // remainder rois have lines+1 rows, the rest prior to slice_n have lines rows + roi_y = + remainder * (lines_per_thread + 1) + (slice_n - remainder) * lines_per_thread; + } + + auto roi = Rect{0, roi_y, output_plane_mats[0].cols, lines_per_thread}; std::vector rois(output_plane_mats.size(), roi); // TODO: make a ROI a runtime argument to avoid @@ -353,20 +390,25 @@ bool InferenceEngine::PreprocEngine::preprocessWithGAPI(Blob::Ptr &inBlob, Blob: auto args = cv::compile_args(gapi::preprocKernels(), cv::GFluidOutputRois{std::move(rois)}); if (Update::REBUILD == update) { auto& computation = _lastComputation.value(); - compiled = computation.compile(meta_of(input_plane_mats), std::move(args)); + compiled = computation.compile(descr_of(input_plane_mats), std::move(args)); } else { IE_ASSERT(compiled); - compiled.reshape(meta_of(input_plane_mats), std::move(args)); + compiled.reshape(descr_of(input_plane_mats), std::move(args)); } } - cv::GRunArgs call_ins; - cv::GRunArgsP call_outs; - for (const auto & m : input_plane_mats) { call_ins.emplace_back(m);} - for (auto & m : output_plane_mats) { call_outs.emplace_back(&m);} + for (int i = 0; i < batch_size; ++i) { + const std::vector& input_plane_mats = batched_input_plane_mats[i]; + std::vector& output_plane_mats = batched_output_plane_mats[i]; - IE_PROFILING_AUTO_SCOPE_TASK(_perf_exec_graph); - compiled(std::move(call_ins), std::move(call_outs)); + cv::GRunArgs call_ins; + cv::GRunArgsP call_outs; + for (const auto & m : input_plane_mats) { call_ins.emplace_back(m);} + for (auto & m : output_plane_mats) { call_outs.emplace_back(&m);} + + IE_PROFILING_AUTO_SCOPE_TASK(_perf_exec_graph); + compiled(std::move(call_ins), std::move(call_outs)); + } }); return true; diff --git a/inference-engine/src/inference_engine/ie_preprocess_gapi.hpp b/inference-engine/src/inference_engine/ie_preprocess_gapi.hpp index 5d9168a..6ac9db2 100644 --- a/inference-engine/src/inference_engine/ie_preprocess_gapi.hpp 
+++ b/inference-engine/src/inference_engine/ie_preprocess_gapi.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -36,7 +36,8 @@ class PreprocEngine { public: PreprocEngine(); - bool preprocessWithGAPI(Blob::Ptr &inBlob, Blob::Ptr &outBlob, const ResizeAlgorithm &algorithm, bool omp_serial); + bool preprocessWithGAPI(Blob::Ptr &inBlob, Blob::Ptr &outBlob, const ResizeAlgorithm &algorithm, + bool omp_serial, int batch_size = -1); }; } // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/ie_preprocess_gapi_kernels.cpp b/inference-engine/src/inference_engine/ie_preprocess_gapi_kernels.cpp index 4910a2a..5b282d9 100644 --- a/inference-engine/src/inference_engine/ie_preprocess_gapi_kernels.cpp +++ b/inference-engine/src/inference_engine/ie_preprocess_gapi_kernels.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -585,7 +585,6 @@ static void calcRowLinear(const cv::gapi::fluid::View & in, reinterpret_cast(alpha), reinterpret_cast(mapsx), reinterpret_cast(beta), - reinterpret_cast(tmp), inSz, outSz, lpi); return; } diff --git a/inference-engine/src/inference_engine/ie_preprocess_gapi_kernels.hpp b/inference-engine/src/inference_engine/ie_preprocess_gapi_kernels.hpp index f4875e6..6213f6e 100644 --- a/inference-engine/src/inference_engine/ie_preprocess_gapi_kernels.hpp +++ b/inference-engine/src/inference_engine/ie_preprocess_gapi_kernels.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/ie_preprocess_gapi_kernels_impl.hpp b/inference-engine/src/inference_engine/ie_preprocess_gapi_kernels_impl.hpp index 11530dc..be1d985 100644 --- a/inference-engine/src/inference_engine/ie_preprocess_gapi_kernels_impl.hpp +++ b/inference-engine/src/inference_engine/ie_preprocess_gapi_kernels_impl.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/ie_profiling.hpp b/inference-engine/src/inference_engine/ie_profiling.hpp index 540255b..6c75d75 100644 --- a/inference-engine/src/inference_engine/ie_profiling.hpp +++ b/inference-engine/src/inference_engine/ie_profiling.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -209,7 +209,7 @@ inline static void annotateEnd(TimeResultsMap& m, TimeSampler& t) { #define IE_STR(x) IE_STR_(x) #define IE_STR_(x) #x -#define IE_PROFILING_AUTO_SCOPE(NAME) IE_ITT_SCOPE(IE_STR(NAME)); IE_TIMER_SCOPE(IE_STR(NAME)); +#define IE_PROFILING_AUTO_SCOPE(NAME) IE_ITT_SCOPE(IE_STR(NAME)); IE_TIMER_SCOPE(IE_STR(NAME)) struct ProfilingTask { std::string name; @@ -261,7 +261,7 @@ inline static void annotateEnd(IttStatic&, IttProfilingTask& t) { #define IE_ITT_TASK_SCOPE(profiling_task) #endif -#define IE_PROFILING_AUTO_SCOPE_TASK(PROFILING_TASK) IE_ITT_TASK_SCOPE(PROFILING_TASK); IE_TIMER_SCOPE(PROFILING_TASK.name); +#define IE_PROFILING_AUTO_SCOPE_TASK(PROFILING_TASK) IE_ITT_TASK_SCOPE(PROFILING_TASK); IE_TIMER_SCOPE(PROFILING_TASK.name) inline static void anotateSetThreadName(const char* name) { #if ENABLE_PROFILING_ITT diff --git 
a/inference-engine/src/inference_engine/ie_util_internal.cpp b/inference-engine/src/inference_engine/ie_util_internal.cpp index 44be1b5..fd0f772 100644 --- a/inference-engine/src/inference_engine/ie_util_internal.cpp +++ b/inference-engine/src/inference_engine/ie_util_internal.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -137,6 +137,16 @@ CNNLayerPtr clonelayer(const CNNLayer& source) { &layerCloneImpl, &layerCloneImpl, &layerCloneImpl, + &layerCloneImpl, + &layerCloneImpl, + &layerCloneImpl, + &layerCloneImpl, + &layerCloneImpl, + &layerCloneImpl, + &layerCloneImpl, + &layerCloneImpl, + &layerCloneImpl, + &layerCloneImpl, &layerCloneImpl, &layerCloneImpl, &layerCloneImpl, @@ -149,6 +159,11 @@ CNNLayerPtr clonelayer(const CNNLayer& source) { &layerCloneImpl, &layerCloneImpl, &layerCloneImpl, + &layerCloneImpl, + &layerCloneImpl, + &layerCloneImpl, + &layerCloneImpl, + &layerCloneImpl, &layerCloneImpl, &layerCloneImpl }; @@ -169,8 +184,13 @@ details::CNNNetworkImplPtr cloneNet(const ICNNNetwork &network) { layers.push_back(*i); i++; } + + InferenceEngine::ICNNNetworkStats* pstatsSrc = nullptr; + if (StatusCode::OK != network.getStats(&pstatsSrc, nullptr)) { + pstatsSrc = nullptr; + } // copy of the network - details::CNNNetworkImplPtr net = cloneNet(layers); + details::CNNNetworkImplPtr net = cloneNet(layers, pstatsSrc); // going over output layers and duplicating them: OutputsDataMap outputs; network.getOutputsInfo(outputs); @@ -194,21 +214,12 @@ details::CNNNetworkImplPtr cloneNet(const ICNNNetwork &network) { } } - // cloning of statistics - InferenceEngine::ICNNNetworkStats* pstatsSrc = nullptr, *pstatsTarget = nullptr; - StatusCode s = network.getStats(&pstatsSrc, nullptr); - if (s == StatusCode::OK && pstatsSrc && !pstatsSrc->isEmpty()) { - StatusCode st = net->getStats(&pstatsTarget, nullptr); - if (st == StatusCode::OK && pstatsTarget) { - pstatsTarget->setNodesStats(pstatsSrc->getNodesStats()); - } - } - return net; } details::CNNNetworkImplPtr cloneNet(const std::vector& layers, + const ICNNNetworkStats* networkStats, std::function layerCloner) { // TODO layerCloner std::function is heavy and can be replaced with // llvm::function_ref-like lightweight callable when we add one @@ -319,6 +330,15 @@ details::CNNNetworkImplPtr cloneNet(const std::vector& layers, net->resolveOutput(); + // cloning of statistics + InferenceEngine::ICNNNetworkStats* pstatsTarget = nullptr; + if (networkStats != nullptr && !networkStats->isEmpty()) { + StatusCode st = net->getStats(&pstatsTarget, nullptr); + if (st == StatusCode::OK && pstatsTarget) { + pstatsTarget->setNodesStats(networkStats->getNodesStats()); + } + } + return net; } @@ -413,9 +433,10 @@ struct NodePrinter { } string cleanNodeName_(string node_name) const { - // remove dot and dash symbols form node name. It is incorrectly displayed in xdot + // remove dot and dash symbols from node name. 
It is incorrectly displayed in xdot node_name.erase(remove(node_name.begin(), node_name.end(), '.'), node_name.end()); std::replace(node_name.begin(), node_name.end(), '-', '_'); + std::replace(node_name.begin(), node_name.end(), ':', '_'); return node_name; } @@ -462,6 +483,45 @@ struct NodePrinter { if (negative_slope != 0.0f) printed_properties.emplace_back("negative_slope", std::to_string(negative_slope)); + } else if (type == "Eltwise") { + auto* eltwise = dynamic_cast(layer.get()); + + std::string operation; + + if (eltwise->_operation == EltwiseLayer::Sum) + operation = "Sum"; + else if (eltwise->_operation == EltwiseLayer::Prod) + operation = "Prod"; + else if (eltwise->_operation == EltwiseLayer::Max) + operation = "Max"; + else if (eltwise->_operation == EltwiseLayer::Sub) + operation = "Sub"; + else if (eltwise->_operation == EltwiseLayer::Min) + operation = "Min"; + else if (eltwise->_operation == EltwiseLayer::Div) + operation = "Div"; + else if (eltwise->_operation == EltwiseLayer::Squared_diff) + operation = "Squared_diff"; + else if (eltwise->_operation == EltwiseLayer::Equal) + operation = "Equal"; + else if (eltwise->_operation == EltwiseLayer::Not_equal) + operation = "Not_equal"; + else if (eltwise->_operation == EltwiseLayer::Less) + operation = "Less"; + else if (eltwise->_operation == EltwiseLayer::Less_equal) + operation = "Less_equal"; + else if (eltwise->_operation == EltwiseLayer::Greater) + operation = "Greater"; + else if (eltwise->_operation == EltwiseLayer::Greater_equal) + operation = "Greater_equal"; + else if (eltwise->_operation == EltwiseLayer::Logical_AND) + operation = "Logical_AND"; + else if (eltwise->_operation == EltwiseLayer::Logical_OR) + operation = "Logical_OR"; + else if (eltwise->_operation == EltwiseLayer::Logical_XOR) + operation = "Logical_XOR"; + + printed_properties.emplace_back("operation", operation); } if (layer_cb != nullptr) { @@ -483,9 +543,9 @@ struct NodePrinter { }; std::stringstream dims_ss; - size_t idx = data->dims.size(); + size_t idx = data->getTensorDesc().getDims().size(); dims_ss << '['; - for (auto &dim : data->dims) { + for (auto &dim : data->getTensorDesc().getDims()) { dims_ss << dim << ((--idx) != 0u ? 
", " : ""); } dims_ss << ']'; @@ -499,20 +559,20 @@ struct NodePrinter { void printNode(string const &node_name, const string &node_title, ordered_properties const &node_properties, ordered_properties const &printed_properties) { - // normalization of names, removing all prohinited symbols like "/" + // normalization of names, removing all prohibited symbols like "/" string nodeNameN = node_name; std::replace(nodeNameN.begin(), nodeNameN.end(), '/', '_'); string dataNameN = node_title; std::replace(dataNameN.begin(), dataNameN.end(), '/', '_'); out << '\t' << nodeNameN << " ["; - for (auto &node_propertie : node_properties) { - out << node_propertie.first << "=\"" << node_propertie.second << "\", "; + for (auto &node_property : node_properties) { + out << node_property.first << "=\"" << node_property.second << "\", "; } out << "label=\"" << node_title; - for (auto &printed_propertie : printed_properties) { - out << "\\n" << printed_propertie.first << ": " << printed_propertie.second; + for (auto &printed_property : printed_properties) { + out << "\\n" << printed_property.first << ": " << printed_property.second; } out << "\"];\n"; } @@ -539,17 +599,10 @@ void saveGraphToDot(InferenceEngine::ICNNNetwork &network, std::ostream &out, pr } } - std::vector> perf_info; - auto store_perf_info = [&](CNNLayerPtr layer) { - auto perf = layer->params.find("perf"); - if (perf != layer->params.end()) perf_info.push_back({layer, perf->second}); - }; - out << "strict digraph Network {\n"; // Traverse graph and print nodes for (const auto &layer : details::CNNNetSortTopologically(network)) { printer.printLayerNode(layer); - store_perf_info(layer); // Print output Data Object for (auto &dataptr : layer->outData) { @@ -571,28 +624,6 @@ void saveGraphToDot(InferenceEngine::ICNNNetwork &network, std::ostream &out, pr printer.printEdge(layer, dataptr, true); } } - - if (!perf_info.empty()) { - out << "// Performance statistic" << std::endl; - out << "node [shape=plain, fontsize=24]" << std::endl; - - for (auto &p : perf_info) { - auto &perf = p.second; - auto &name = p.first->name; - auto layer_name = "layer_" + name; - auto perf_name = "perf_" + name; - // {rank=same; perf_conv1 [label="133 mcs"]; layer_conv1;} - out << "{rank=same; " << perf_name << " [label=\"" << perf << "\"]; " - << layer_name << ";}" << std::endl; - } - - out << std::endl << "edge[style=invis];" << std::endl; - auto p = perf_info.begin(); - out << "perf_" + p->first->name; - for (; p != perf_info.end(); p++) - out << " -> perf_" + p->first->name; - } - out << "}" << std::endl; } diff --git a/inference-engine/src/inference_engine/ie_util_internal.hpp b/inference-engine/src/inference_engine/ie_util_internal.hpp index 1f6e9f6..61bf95f 100644 --- a/inference-engine/src/inference_engine/ie_util_internal.hpp +++ b/inference-engine/src/inference_engine/ie_util_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -94,6 +94,7 @@ INFERENCE_ENGINE_API_CPP(CNNLayerPtr) clonelayer(const CNNLayer& source); */ INFERENCE_ENGINE_API_CPP(InferenceEngine::details::CNNNetworkImplPtr) cloneNet(const std::vector& layers, + const ICNNNetworkStats* networkStats, std::function layerCloner = clonelayer); /** diff --git a/inference-engine/src/inference_engine/ie_utils.cpp b/inference-engine/src/inference_engine/ie_utils.cpp index aa8e009..fd81632 100644 --- a/inference-engine/src/inference_engine/ie_utils.cpp +++ 
b/inference-engine/src/inference_engine/ie_utils.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/ie_version.cpp b/inference-engine/src/inference_engine/ie_version.cpp index cca54cc..5473e80 100644 --- a/inference-engine/src/inference_engine/ie_version.cpp +++ b/inference-engine/src/inference_engine/ie_version.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -10,7 +10,7 @@ INFERENCE_ENGINE_API(const Version*) GetInferenceEngineVersion() noexcept { // Use local static variable to make sure it is always properly initialized // even if called from global constructor static Version inferenceEngineVersion = { - {1, 4}, // inference engine API version + {1, 6}, // inference engine API version CI_BUILD_NUMBER }; return &inferenceEngineVersion; diff --git a/inference-engine/src/inference_engine/layer_transform.hpp b/inference-engine/src/inference_engine/layer_transform.hpp index fd51793..7301552 100644 --- a/inference-engine/src/inference_engine/layer_transform.hpp +++ b/inference-engine/src/inference_engine/layer_transform.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -8,7 +8,6 @@ #include #include #include "ie_layers.h" -#include "ie_layers_prv.h" namespace InferenceEngine { @@ -31,6 +30,16 @@ using AllLayers = std::tuple < GemmLayer*, PadLayer*, GatherLayer*, + StridedSliceLayer*, + ShuffleChannelsLayer*, + DepthToSpaceLayer*, + SpaceToDepthLayer*, + ReverseSequenceLayer*, + SqueezeLayer*, + UnsqueezeLayer*, + RangeLayer*, + FillLayer*, + ExpandLayer*, ConcatLayer*, SplitLayer*, NormLayer*, @@ -49,7 +58,11 @@ using AllLayers = std::tuple < ClampLayer*, TensorIterator*, LSTMCell*, - RNNLayer*, + GRUCell*, + RNNCell*, + RNNSequenceLayer*, + QuantizeLayer*, + BinaryConvolutionLayer*, WeightableLayer*, CNNLayer* >; diff --git a/inference-engine/src/inference_engine/memory_solver.cpp b/inference-engine/src/inference_engine/memory_solver.cpp index ce31fc1..e70caab 100644 --- a/inference-engine/src/inference_engine/memory_solver.cpp +++ b/inference-engine/src/inference_engine/memory_solver.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -51,7 +51,7 @@ inline bool popupTogetherWith(MemorySolver::Box &box_new, const MemorySolver::Bo } } -int MemorySolver::solve() { +int64_t MemorySolver::solve() { maxTopDepth(); // at first make sure that we no need more for boxes sorted by box.start std::vector> time_slots(_time_duration); for (auto & slot : time_slots) slot.reserve(_top_depth); // 2D array [_time_duration][_top_depth] @@ -61,11 +61,11 @@ int MemorySolver::solve() { std::sort(_boxes.begin(), _boxes.end(), [](const Box& l, const Box& r) { return l.size > r.size; }); - int _min_required = 0; + int64_t _min_required = 0; for (Box& box : _boxes) { // start from bottom and will lift it up if intersect with other present - int id = box.id; + int64_t id = box.id; box.id = 0; // id will be used as a temp offset storage bool popped_up; do { @@ -91,17 +91,17 @@ int MemorySolver::solve() { return _min_required; } -int MemorySolver::maxDepth() { +int64_t MemorySolver::maxDepth() { if (_depth == -1) calcDepth(); return _depth; } -int 
MemorySolver::maxTopDepth() { +int64_t MemorySolver::maxTopDepth() { if (_top_depth == -1) calcDepth(); return _top_depth; } -int MemorySolver::getOffset(int id) const { +int64_t MemorySolver::getOffset(int id) const { auto res = _offsets.find(id); if (res == _offsets.end()) THROW_IE_EXCEPTION << "There are no box for provided ID"; return res->second; @@ -110,12 +110,12 @@ int MemorySolver::getOffset(int id) const { //======== Private =============// void MemorySolver::calcDepth() { - int top_depth = 0; - int depth = 0; - std::map> release_at; + int64_t top_depth = 0; + int64_t depth = 0; + std::map> release_at; for (const Box& box : _boxes) { - int time = box.start; + int64_t time = box.start; depth += box.size; top_depth++; diff --git a/inference-engine/src/inference_engine/memory_solver.hpp b/inference-engine/src/inference_engine/memory_solver.hpp index 04b8e06..0881b85 100644 --- a/inference-engine/src/inference_engine/memory_solver.hpp +++ b/inference-engine/src/inference_engine/memory_solver.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -57,10 +57,10 @@ public: int finish; /** Size of data. In abstract unit of measure (byte, simd, cache line, ...) */ - int size; + int64_t size; /** Box identifier, unique for each box. Will be used to querying calculated offset. */ - int id; + int64_t id; }; explicit MemorySolver(const std::vector& boxes); @@ -69,21 +69,21 @@ public: * @brief Solve memory location with maximal reuse. * @return Size of common memory blob required for storing all */ - int solve(); + int64_t solve(); /** Provides calculated offset for specified box id */ - int getOffset(int id) const; + int64_t getOffset(int id) const; /** Additional info. Max sum of box sizes required for any time stamp. */ - int maxDepth(); + int64_t maxDepth(); /** Additional info. Max num of boxes required for any time stamp. */ - int maxTopDepth(); + int64_t maxTopDepth(); private: std::vector _boxes; - std::map _offsets; - int _top_depth = -1; - int _depth = -1; + std::map _offsets; + int64_t _top_depth = -1; + int64_t _depth = -1; int _time_duration = -1; void calcDepth(); diff --git a/inference-engine/src/inference_engine/net_pass.cpp b/inference-engine/src/inference_engine/net_pass.cpp index 96ceb63..4e7fad2 100644 --- a/inference-engine/src/inference_engine/net_pass.cpp +++ b/inference-engine/src/inference_engine/net_pass.cpp @@ -1,16 +1,25 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include "net_pass.h" -#include "ie_layers_prv.h" +#include "blob_factory.hpp" +#include "ie_memcpy.h" +#include "details/ie_cnn_network_tools.h" #include "graph_tools.hpp" #include #include +#include #include +#include +#include +#include #include +namespace InferenceEngine { +namespace NetPass { + template inline bool one_of(T val, P item) { return val == item; } template @@ -18,8 +27,124 @@ inline bool one_of(T val, P item, Args... 
item_others) { return val == item || one_of(val, item_others...); } -namespace InferenceEngine { -namespace NetPass { +/************************************************************/ +/**** TI Utils ********************************************/ +/************************************************************/ + +static std::vector<DataPtr> getAllInputs(const std::vector<DataPtr> &heads) { + CNNLayerSet inputLayers; + std::unordered_set<CNNLayer*> allLayers; + + // Define all start layers + for (const auto & data : heads) { + auto &secondLayers = data->getInputTo(); + + details::UnorderedDFS(allLayers, secondLayers.begin()->second, [&](CNNLayerPtr layer) { + if (layer->insData.empty()) { + inputLayers.insert(layer); + } + }, false); + } + + std::vector<DataPtr> res = heads; + // Add fake input data to cover layers that are not reachable + // from the heads (like const placeholders) + for (auto &starter : inputLayers) { + DataPtr holder(new Data(starter->name + ":input_holder", starter->precision)); + holder->inputTo[starter->name] = starter; + res.push_back(holder); + } + + return res; +} + +static std::vector<CNNLayerPtr> SortTopologically(const TensorIterator::Body &body) { + std::vector<CNNLayerPtr> all_layers; + + auto all_input_layers = getAllInputs(body.inputs); + CNNNetForestDFS(all_input_layers, [&](CNNLayerPtr current){ + all_layers.push_back(current); + }, false); + std::reverse(all_layers.begin(), all_layers.end()); + return all_layers; +} + +static TensorIterator::Body CopyTIBody(ICNNNetwork &net, const TensorIterator::Body &body, std::string suffix = "") { + struct NoneStruct {}; + auto cp = [&](CNNLayerPtr lp) { + return injectData<NoneStruct>(lp); + }; + + const auto all_orig = SortTopologically(body); + auto num = all_orig.size(); + + std::unordered_map<CNNLayer*, CNNLayerPtr> old2new_l; + for (size_t i = 0; i < num; i++) { + auto &orig = all_orig[i]; + old2new_l[orig.get()] = cp(orig); + } + + std::unordered_map<Data*, DataPtr> old2new_d; + for (auto &in : body.inputs) { + auto new_data = std::make_shared<Data>(*in.get()); + for (auto &to : new_data->getInputTo()) + to.second = old2new_l[to.second.get()]; + + old2new_d[in.get()] = new_data; + } + + for (const auto &old : all_orig) { + auto &new_one = old2new_l[old.get()]; + // remap output data + for (size_t i = 0; i != old->outData.size(); i++) { + auto old_data = old->outData[i]; + auto new_data = new_one->outData[i]; + new_data->getCreatorLayer() = CNNLayerWeakPtr(new_one); + old2new_d[old_data.get()] = new_data; + + for (auto &to : new_data->getInputTo()) + to.second = old2new_l[to.second.get()]; + } + // remap input data + for (size_t i = 0; i != old->insData.size(); i++) { + auto old_data = old->insData[i].lock(); + auto new_data = old2new_d.at(old_data.get()); + new_one->insData[i] = new_data; + } + } + + // Add suffix + if (!suffix.empty()) { + for (auto &kvp : old2new_l) { + auto layer = kvp.second; + auto old_name = layer->name; + layer->name += suffix; + for (auto &ins : layer->insData) { + ins.lock()->inputTo.erase(old_name); + ins.lock()->inputTo[layer->name] = layer; + } + + // And also hold newly created layer in parent network. + // TI body may contain isolated constant placeholder layers + // which are not achievable from body inputs.
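// Context: unrollTI below calls CopyTIBody once per iteration with suffixes
// ":0", ":1", ..., which keeps every clone uniquely named. Data::inputTo is
// keyed by the consumer layer's name, so the rename above must erase the old
// key and re-insert the layer under its new name.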
+ net.addLayer(layer); + } + for (auto &kvp : old2new_d) kvp.second->name += suffix; + } + + TensorIterator::Body res; + for (auto &in : body.inputs) + res.inputs.emplace_back(old2new_d[in.get()]); + + for (auto &out : body.outputs) + res.outputs.emplace_back(old2new_d[out.get()]); + + return res; +} + +/************************************************************/ +/**** TI rule helpers *************************************/ +/************************************************************/ inline bool is_full_ranged(const TensorIterator::PortMap& rule, const DataPtr &data) { if (!data) @@ -39,35 +164,174 @@ inline bool is_full_ranged(const TensorIterator::PortMap& rule, const DataPtr &d : begin == size && end == 0; } -bool convertToLSTMSequence(CNNLayerPtr cur) { - if (cur->type != "TensorIterator") return false; - auto ti = std::dynamic_pointer_cast(cur); +inline int get_num_iteration(const std::shared_ptr &ti) { + int iter_num = 1; // 1 means no iteration + + for (auto & rule : ti->input_port_map) { + if (rule.axis == -1) continue; + + auto data = ti->insData[rule.from].lock(); + IE_ASSERT(data); + auto shape = data->getDims(); + size_t size = shape[rule.axis]; + size_t step = std::abs(rule.stride); + size_t cur_iter_size = size / step; + + if (iter_num == 1) { + iter_num = cur_iter_size; + } else { + if (iter_num != cur_iter_size) + return -1; // TI is inconsistent + } + } + + for (auto & rule : ti->output_port_map) { + if (rule.axis == -1) continue; + + auto data = ti->outData[rule.from]; + auto shape = data->getDims(); + + size_t size = shape[rule.axis]; + size_t step = std::abs(rule.stride); + size_t cur_iter_size = size / step; + + if (iter_num == 1) { + iter_num = cur_iter_size; + } else { + if (iter_num != cur_iter_size) + return -1; // TI is inconsistent + } + } + return iter_num; +} + +using RuleSet = std::vector; + +std::tuple ClassifyInRules(const std::shared_ptr &ti) { + /* + * first_class - which has iteration component + * second_class - which has no iteration and there are no backedge connection to the same port + * third_class - which has no iteration and has corresponding backedge + */ + RuleSet first_class_rules, second_class_rules, third_class_rules; + + std::set ports_with_backedge; + for (const auto &back_edge : ti->back_edges) ports_with_backedge.insert(back_edge.to); + + for (const auto &rule : ti->input_port_map) { + if (rule.axis != -1) + first_class_rules.push_back(rule); + + else if (!ports_with_backedge.count(rule.to)) + second_class_rules.push_back(rule); + + else + third_class_rules.push_back(rule); + } + return std::tuple {first_class_rules, second_class_rules, third_class_rules}; +} + +std::tuple ClassifyOutRules(const std::shared_ptr &ti) { + /* + * first_class - which has iteration component + * second_class - which has no iteration and there are no backedge connection to the same port + * third_class - which has no iteration and has corresponding backedge + */ + RuleSet first_class_rules, second_class_rules, third_class_rules; + + std::set ports_with_backedge; + for (const auto &back_edge : ti->back_edges) ports_with_backedge.insert(back_edge.from); + + for (const auto &rule : ti->output_port_map) { + if (rule.axis != -1) + first_class_rules.push_back(rule); + + else if (!ports_with_backedge.count(rule.to)) + second_class_rules.push_back(rule); + + else + third_class_rules.push_back(rule); + } + return std::tuple {first_class_rules, second_class_rules, third_class_rules}; +} + +/** + * Merge slave connections into master + * @param master + * @param 
slave + */ +void CombineData(DataPtr &master, DataPtr &slave) { + for (auto &kvp : slave->inputTo) { + auto &slave_layer = kvp.second; + for (auto &slv_ins_wptr : slave_layer->insData) { + auto slv_ins = slv_ins_wptr.lock(); + // Replace slave ptr with master + if (slv_ins == slave) slv_ins_wptr = master; + } + master->inputTo[slave_layer->name] = slave_layer; + } +} + +/************************************************************/ +/**** Converter Passes ************************************/ +/************************************************************/ + +static RNNSequenceLayer::CellType cell_type_from_name(std::string &layer_type) { + RNNSequenceLayer::CellType res; + if (layer_type == "LSTMCell") + res = RNNSequenceLayer::LSTM; + else if (layer_type == "GRUCell") + res = RNNSequenceLayer::GRU; + else if (layer_type == "RNNCell") + res = RNNSequenceLayer::RNN; + else + THROW_IE_EXCEPTION << "Unknown Cell type (" << layer_type << "). Expected LSTMCell|GRUCell|RNNCell"; + return res; +} + +static std::string cell_name(RNNSequenceLayer::CellType type) { + std::string res; + if (type == RNNSequenceLayer::LSTM) + res = "LSTM"; + else if (type == RNNSequenceLayer::GRU) + res = "GRU"; + else if (type == RNNSequenceLayer::RNN) + res = "RNN"; + else + THROW_IE_EXCEPTION << "Unknown Cell type (enum index: " << type << "). Expected LSTM|GRU|RNN"; + return res; +} + + +bool convertToRNNSeq(CNNLayerPtr cur, ICNNNetwork &net) { + if (cur->type != "TensorIterator") return true; + + auto ti = std::dynamic_pointer_cast<TensorIterator>(cur); IE_ASSERT(ti) << "Cannot cast object with type TensorIterator to TensorIterator object"; - // Topological order - std::vector<CNNLayerPtr> all_body_layers; - CNNNetForestDFS(ti->body.inputs, [&](CNNLayerPtr current){ - all_body_layers.push_back(current); - }, false); - std::reverse(all_body_layers.begin(), all_body_layers.end()); + auto all_body_layers = SortTopologically(ti->body); // Check if body is: squeeze -> lstm_cell -> unsqueeze if (all_body_layers.size() != 3 || all_body_layers[0]->type != "Reshape" - || all_body_layers[1]->type != "LSTMCell" + || !one_of(all_body_layers[1]->type, "GRUCell", "RNNCell", "LSTMCell") || all_body_layers[2]->type != "Reshape") return false; - auto &rsp1 = all_body_layers[0]; - auto &lstm = all_body_layers[1]; - auto &rsp2 = all_body_layers[2]; + auto rsp1 = std::dynamic_pointer_cast<ReshapeLayer>(all_body_layers[0]); + auto cell = std::dynamic_pointer_cast<RNNCellBase>(all_body_layers[1]); + auto rsp2 = std::dynamic_pointer_cast<ReshapeLayer>(all_body_layers[2]); + + auto cell_type = cell_type_from_name(all_body_layers[1]->type); - IE_ASSERT(lstm->insData.size() == 3); // {data, hidden, cell} - IE_ASSERT(lstm->outData.size() == 2); // {hidden, cell} + int NS = cell_type == RNNSequenceLayer::LSTM ? 2 : 1; // number of states - if (lstm->insData[0].lock()->creatorLayer.lock() != rsp1 || - lstm->outData[0]->inputTo.begin()->second != rsp2) + IE_ASSERT(cell->insData.size() == NS + 1); // {data, state1, [state2]} + IE_ASSERT(cell->outData.size() == NS); // {state1, [state2]} + + if (cell->insData[0].lock()->creatorLayer.lock() != rsp1 || + cell->outData[0]->inputTo.begin()->second != rsp2) return false; // Check port mapping @@ -76,16 +340,17 @@ bool convertToLSTMSequence(CNNLayerPtr cur) { return indx == scope.size() ?
-1 : indx; }; - int in_hs_idx = _indx_in(ti->body.inputs, lstm->insData[1].lock()); - int in_cs_idx = _indx_in(ti->body.inputs, lstm->insData[2].lock()); int in_dt_idx = _indx_in(ti->body.inputs, rsp1->insData[0].lock()); + int in_hs_idx = _indx_in(ti->body.inputs, cell->insData[1].lock()); + int in_cs_idx = NS == 2 ? _indx_in(ti->body.inputs, cell->insData[2].lock()) : -1; - int out_hs_idx = _indx_in(ti->body.outputs, lstm->outData[0]); - int out_cs_idx = _indx_in(ti->body.outputs, lstm->outData[1]); int out_dt_idx = _indx_in(ti->body.outputs, rsp2->outData[0]); + int out_hs_idx = _indx_in(ti->body.outputs, cell->outData[0]); + int out_cs_idx = NS == 2 ? _indx_in(ti->body.outputs, cell->outData[1]) : -1; - // indexes should be [0,1,2] : sum == 3 - if (in_hs_idx + in_cs_idx + in_dt_idx != 3 || out_hs_idx + out_cs_idx + out_dt_idx != 3) + // indexes should be [0,1,2] : sum == 3 or [0,1,-1] : sum == 0 + int sum = (NS - 1) * 3; + if (in_hs_idx + in_cs_idx + in_dt_idx != sum || out_hs_idx + out_cs_idx + out_dt_idx != sum) return false; std::map i2map, o2map, be2map; @@ -93,12 +358,11 @@ bool convertToLSTMSequence(CNNLayerPtr cur) { for (auto &m : ti->output_port_map) o2map[m.to] = m; for (auto &m : ti->back_edges) be2map[m.to] = m; - if (!one_of(i2map.size(), 3, 1) || - !one_of(o2map.size(), 3, 1) || + if (!one_of(i2map.size(), NS + 1, 1) || + !one_of(o2map.size(), NS + 1, 1) || !one_of(be2map.size(), 2)) return false; - auto in_iter_rule = i2map[in_dt_idx]; auto in_iter_data = ti->insData[in_iter_rule.from].lock(); @@ -122,39 +386,47 @@ bool convertToLSTMSequence(CNNLayerPtr cur) { bool no_init_state = i2map.size() == 1; bool no_last_state = o2map.size() == 1; - if (!no_init_state && ( i2map[in_hs_idx].axis != -1 || i2map[in_cs_idx].axis != -1 )) + if (!no_init_state && ( i2map[in_hs_idx].axis != -1 || (NS == 2 && i2map[in_cs_idx].axis != -1) )) return false; - if (!no_last_state && ( o2map[out_hs_idx].axis != -1 || o2map[out_cs_idx].axis != -1 )) + if (!no_last_state && ( o2map[out_hs_idx].axis != -1 || (NS == 2 && o2map[out_cs_idx].axis != -1) )) return false; - auto i_order = no_init_state - ? std::vector{i2map[in_dt_idx].from} - : std::vector{i2map[in_dt_idx].from, - i2map[in_hs_idx].from, - i2map[in_cs_idx].from}; - auto o_order = no_last_state - ? std::vector{o2map[out_dt_idx].from} - : std::vector{o2map[out_dt_idx].from, - o2map[out_hs_idx].from, - o2map[out_cs_idx].from}; + std::vector i_order {i2map[in_dt_idx].from }; + if (!no_init_state) + i_order.push_back(i2map[in_hs_idx].from); + if (!no_init_state && NS == 2) + i_order.push_back(i2map[in_cs_idx].from); + + std::vector o_order {o2map[out_dt_idx].from}; + if (!no_last_state) + o_order.push_back(o2map[out_hs_idx].from); + if (!no_last_state && NS == 2) + o_order.push_back(o2map[out_cs_idx].from); // need swap an i/o ports if it is not in natural order - std::string name = lstm->name + "_sequence"; - auto rnn = std::make_shared(LayerParams{ name, "RNN", Precision::FP32 }); - rnn->cellType = "LSTM"; + std::string name = cell->name + "_sequence"; + auto rnn = std::make_shared(LayerParams{ name, cell_name(cell_type) + "Sequence", cell->precision}); + rnn->cellType = cell_type; rnn->axis = in_iter_rule.axis; rnn->direction = in_iter_rule.stride == 1 - ? RNNLayer::RNN_FWD - : RNNLayer::RNN_BWD; + ? 
RNNSequenceLayer::FWD + : RNNSequenceLayer::BWD; - rnn->_weights = dynamic_cast(lstm.get())->_weights; - rnn->blobs["weights"] = lstm->blobs["weights"]; - rnn->_biases = dynamic_cast(lstm.get())->_biases; - rnn->blobs["biases"] = lstm->blobs["biases"]; + // copy base RNN cell fields + rnn->_weights = cell->_weights; + rnn->_biases = cell->_biases; + rnn->blobs = cell->blobs; + rnn->activations = cell->activations; + rnn->activation_alpha = cell->activation_alpha; + rnn->activation_beta = cell->activation_beta; + rnn->hidden_size = cell->hidden_size; + rnn->clip = cell->clip; for (int i : i_order) { - rnn->insData.push_back(ti->insData[i]); - rnn->insData.back().lock()->inputTo[ti->name] = rnn; + auto in_data = ti->insData[i].lock(); + in_data->inputTo.erase(ti->name); + in_data->inputTo[rnn->name] = rnn; + rnn->insData.push_back(in_data); } for (int i : o_order) { rnn->outData.push_back(ti->outData[i]); @@ -164,16 +436,807 @@ bool convertToLSTMSequence(CNNLayerPtr cur) { return true; } -bool CombineLSTMSeq(const ICNNNetwork &net) { - // Apply action for all nodes - CNNNetForestDFS(CNNNetGetAllInputLayers(net), &convertToLSTMSequence, true); +bool unrollTI(CNNLayerPtr cur, ICNNNetwork &net) { + if (cur->type != "TensorIterator") + return true; + + auto ti = std::dynamic_pointer_cast(cur); + IE_ASSERT(ti) << "Cannot cast object with type TensorIterator to TensorIterator object"; + + int num = get_num_iteration(ti); // -1 means inconsistent TI + if (num == -1) return false; // TODO: better to throw exception + + const auto &body = ti->body; + + std::vector body_list(num); + for (int i = 0; i < num; i++) { + // copy with additional suffix to each object name + body_list[i] = CopyTIBody(net, body, ":" + std::to_string(i)); + } + + RuleSet first_class, second_class, third_class; + std::tie(first_class, second_class, third_class) = ClassifyInRules(ti); + + /** Clean links on TI */ + for (auto &ins : ti->insData) + ins.lock()->inputTo.erase(ti->name); + for (auto &outs : ti->outData) + outs->creatorLayer.reset(); + + /** FIRST class comes */ + for (int i = 0; i < first_class.size(); i++) { + auto &rule = first_class[i]; + auto in_data = ti->insData[rule.from].lock(); + + std::string name = ti->name + ":in_split_" + std::to_string(i); + auto split = std::make_shared(LayerParams{ name, "Split", cur->precision }); + split->_axis = rule.axis; + split->outData.resize(num); + split->insData.emplace_back(in_data); + in_data->inputTo[split->name] = split; + + for (int j = 0; j < num; j++) { + auto body_idx = rule.stride == 1 ? 
j : num - 1 - j; + auto &chunk = body_list[body_idx].inputs[rule.to]; + chunk->creatorLayer = split; + split->outData[j] = chunk; + } + } + + /** SECOND class come on */ + for (const auto &rule : second_class) { + auto in_data = ti->insData[rule.from].lock(); + + for (int j = 0; j < num; j++) { + auto &chunk = body_list[j].inputs[rule.to]; + CombineData(in_data, chunk); + } + } + + /** BACK EDGES that's your time */ + for (const auto &rule : ti->back_edges) { + for (int i = 1; i < num; i++) { + auto &from_data = body_list[i-1].outputs[rule.from]; + auto &to_data = body_list[i].inputs[rule.to]; + CombineData(from_data, to_data); + } + } + + /** THIRD class end up */ + for (const auto &rule : third_class) { + // first iteration + auto from_data = ti->insData[rule.from].lock(); + auto &to_data = body_list[0].inputs[rule.to];; + CombineData(from_data, to_data); + } + + /** And the same actions for outputs connections */ + std::tie(first_class, second_class, third_class) = ClassifyOutRules(ti); + + /** FIRST class comes */ + for (int i = 0; i < first_class.size(); i++) { + auto &rule = first_class[i]; + auto out_data = ti->outData[rule.from]; + + std::string name = ti->name + ":out_concat_" + std::to_string(i); + auto concat = std::make_shared(LayerParams{ name, "Concat", cur->precision }); + concat->_axis = rule.axis; + concat->insData.resize(num); + concat->outData.emplace_back(out_data); + out_data->creatorLayer = concat; + + for (int j = 0; j < num; j++) { + auto body_idx = rule.stride == 1 ? j : num - 1 - j; + auto &chunk = body_list[body_idx].outputs[rule.to]; + chunk->inputTo[concat->name] = concat; + concat->insData[j] = chunk; + } + } + + /** SECOND class come on */ + for (const auto &rule : second_class) { + auto out_data = ti->outData[rule.from]; + + for (int j = 0; j < num; j++) { + auto &chunk = body_list[j].outputs[rule.to]; + CombineData(chunk, out_data); + } + } + + /** THIRD class end up */ + for (const auto &rule : third_class) { + // first iteration + auto &from_data = ti->outData[rule.from]; + auto &to_data = body_list[num-1].outputs[rule.to]; + + auto parent = to_data->creatorLayer.lock(); + std::replace(parent->outData.begin(), parent->outData.end(), to_data, from_data); + from_data->creatorLayer = parent; + + CombineData(from_data, to_data); + } + return true; +} + +/************************************************************/ +/**** Builder helpers ************************************/ +/************************************************************/ + +static CNNLayerPtr _concat(std::string name, Precision prc, SizeVector dims, int num) { + auto res = std::make_shared(LayerParams{name, "Concat", prc}); + res->_axis = 1; + + res->insData.resize(num); + res->outData.resize(1); + + auto out_data = DataPtr(new Data(name, + TensorDesc { prc, dims, TensorDesc::getLayoutByDims(dims) })); + out_data->creatorLayer = res; + + res->outData[0] = out_data; + return res; +} + +static CNNLayerPtr _split(std::string name, Precision prc, SizeVector dims, int num) { + auto res = std::make_shared(LayerParams{name, "Split", prc}); + res->_axis = 1; + res->params["axis"] = res->_axis; + + res->insData.resize(1); + res->outData.resize(num); + + for (int i = 0; i < num; i++) { + auto out_data = DataPtr(new Data(name + "_part_" + std::to_string(i), + TensorDesc { prc, dims, TensorDesc::getLayoutByDims(dims) })); + out_data->creatorLayer = res; + + res->outData[i] = out_data; + } + return res; +} + +static CNNLayerPtr _fc(std::string name, Precision prc, SizeVector dims, Blob::Ptr &W, Blob::Ptr 
&B) { + auto res = std::make_shared(LayerParams{name, "FullyConnected", prc}); + + res->_weights = W; + res->_biases = B; + res->_out_num = dims[1]; + res->blobs["weights"] = W; + res->blobs["biases"] = B; + res->params["out-size"] = std::to_string(dims[1]); + + res->insData.resize(1); + res->outData.resize(1); + + auto out_data = DataPtr(new Data(name, + TensorDesc { prc, dims, TensorDesc::getLayoutByDims(dims) })); + out_data->creatorLayer = res; + + res->outData[0] = out_data; + return res; +} + +static CNNLayerPtr _act(std::string name, Precision prc, SizeVector dims, std::string type) { + auto res = std::make_shared(LayerParams{name, "Activation", prc}); + + res->params["type"] = type; + + res->insData.resize(1); + res->outData.resize(1); + + auto out_data = DataPtr(new Data(name, + TensorDesc { prc, dims, TensorDesc::getLayoutByDims(dims) })); + out_data->creatorLayer = res; + + res->outData[0] = out_data; + return res; +} + +static CNNLayerPtr _pwr(std::string name, Precision prc, SizeVector dims, float scale, float shift) { + auto res = std::make_shared(LayerParams{name, "Power", prc}); + + res->power = 1.0; + res->scale = scale; + res->offset = shift; + res->params["power"] = res->power; + res->params["scale"] = res->scale; + res->params["shift"] = res->offset; + + res->insData.resize(1); + res->outData.resize(1); + + auto out_data = DataPtr(new Data(name, + TensorDesc { prc, dims, TensorDesc::getLayoutByDims(dims) })); + out_data->creatorLayer = res; + + res->outData[0] = out_data; + return res; +} + + +static CNNLayerPtr _eltw(std::string name, Precision prc, SizeVector dims, std::string type) { + auto res = std::make_shared(LayerParams{name, "Eltwise", prc}); + + res->params["operation"] = type; + res->_operation = type == "sum" ? EltwiseLayer::Sum : EltwiseLayer::Prod; + + res->insData.resize(2); + res->outData.resize(1); + + auto out_data = DataPtr(new Data(name, + TensorDesc { prc, dims, TensorDesc::getLayoutByDims(dims) })); + out_data->creatorLayer = res; + + res->outData[0] = out_data; + return res; +} + +static std::shared_ptr _resh(std::string name, Precision prc, SizeVector dims) { + auto res = std::make_shared(LayerParams{name, "Reshape", prc}); + + res->insData.resize(1); + res->outData.resize(1); + + auto out_data = DataPtr(new Data(name, + TensorDesc { prc, dims, TensorDesc::getLayoutByDims(dims) })); + out_data->creatorLayer = res; + + res->outData[0] = out_data; + return res; +} + +static std::shared_ptr _cell(std::string name, Precision prc, SizeVector data_dims, SizeVector state_dims, RNNSequenceLayer::CellType type) { + std::shared_ptr res; + size_t NS = 1; + switch (type) { + case RNNSequenceLayer::LSTM: + res = std::make_shared(LayerParams{name, "LSTMCell", prc}); NS = 2; + break; + case RNNSequenceLayer::GRU: + case RNNSequenceLayer::GRU_LBR: + res = std::make_shared(LayerParams{name, "GRUCell", prc}); + break; + case RNNSequenceLayer::RNN: + res = std::make_shared(LayerParams{name, "RNNCell", prc}); + break; + } + + res->cellType = type; + res->insData.resize(1 + NS); + res->outData.resize(NS); + + auto out_data = DataPtr(new Data(name + ":out_data", + TensorDesc { prc, data_dims, TensorDesc::getLayoutByDims(data_dims) })); + out_data->creatorLayer = res; + res->outData[0] = out_data; + + for (size_t i = 0; i < NS; i++) { + auto out_state = DataPtr(new Data(name + ":out_state_" + std::to_string(i), + TensorDesc { prc, state_dims, TensorDesc::getLayoutByDims(state_dims) })); + out_state->creatorLayer = res; + res->outData[i] = out_state; + } + + return res; 
+} + +static std::shared_ptr _ti(std::string name, Precision prc, size_t NS) { + auto res = std::make_shared(LayerParams{name, "TensorIterator", prc}); + + res->insData.resize(1 + NS); + res->outData.resize(1 + NS); + + return res; +} + +static void _link(CNNLayerPtr src, CNNLayerPtr dst, size_t src_port = 0, size_t dst_port = 0) { + auto data = src->outData[src_port]; + data->inputTo[dst->name] = dst; + dst->insData[dst_port] = data; +} + +static void _link(DataPtr &data, CNNLayerPtr dst, size_t dst_port = 0) { + data->inputTo[dst->name] = dst; + dst->insData[dst_port] = data; +} + +/** Link nodes with clipping data if required (clip_val != 0.0) */ +static void _link_with_clip(CNNLayerPtr src, CNNLayerPtr dst, const float clip_val, + size_t src_port = 0, size_t dst_port = 0) { + if (clip_val == 0.0f) { + _link(src, dst, src_port, dst_port); + } else { + auto clip_name = dst->name + "_clip"; + auto clip_prc = dst->precision; + auto clip_shape = src->outData[src_port]->getTensorDesc().getDims(); + auto clip = _act(clip_name, clip_prc, clip_shape, "clamp"); + clip->params["min"] = std::to_string(-clip_val); + clip->params["max"] = std::to_string(clip_val); + + _link(src, clip, src_port, 0); + _link(clip, dst, 0, dst_port); + } +} + + +static Blob::Ptr make_partial_copy(Blob::Ptr src, size_t off, size_t size) { + auto res = make_plain_blob(src->precision(), {size}); + res->allocate(); + + size_t elem_size = src->precision().size(); + auto src_ptr = src->buffer().as(); + auto dst_ptr = res->buffer().as(); + + ie_memcpy(dst_ptr, res->byteSize(), src_ptr + off * elem_size, size * elem_size); + + return res; +} + +static Blob::Ptr wrap_as_tensor(Blob::Ptr src, SizeVector dims) { + auto res = make_blob_with_precision( + TensorDesc { src->precision(), dims, plain_layout(dims) }, + src->buffer()); + IE_ASSERT(src->size() == res->size()); + return res; +} + +static Blob::Ptr make_region_copy(Blob::Ptr src, SizeVector region, SizeVector offset) { + IE_ASSERT(region.size() == offset.size()); + IE_ASSERT(region.size() == src->dims().size()); + + auto res = make_plain_blob(src->precision(), region); + res->allocate(); + + size_t elem_size = src->precision().size(); + auto src_ptr = src->buffer().as(); + auto dst_ptr = res->buffer().as(); + + auto &dd = src->getTensorDesc().getDims(); + SizeVector src_dims {1, 1, 1}; + std::copy(dd.begin(), dd.end(), src_dims.end() - dd.size()); + + SizeVector dims {1, 1, 1}; + std::copy(region.begin(), region.end(), dims.end() - region.size()); + + SizeVector off {0, 0, 0}; + std::copy(offset.begin(), offset.end(), off.end() - offset.size()); + + const auto D1 = dims[0]; + const auto D2 = dims[1]; + const auto D3 = dims[2]; + const auto off1 = off[0]; + const auto off2 = off[1]; + const auto off3 = off[2]; + const auto str1 = src_dims[1]*src_dims[2]; + const auto str2 = src_dims[2]; + + for (size_t d1 = 0; d1 < D1; d1++) + for (size_t d2 = 0; d2 < D2; d2++) { + auto off_src = (off1 + d1)*str1 + (off2 + d2)*str2 + off3; + auto off_dst = d1*D2*D3 + d2*D3; + ie_memcpy(dst_ptr + off_dst * elem_size, res->byteSize(), src_ptr + off_src * elem_size, D3 * elem_size); + } + + return res; +} + + +static bool unrollRNNCellBody(CNNLayerPtr cur) { + if (cur->type != "RNNCell") + return true; + + auto cell = std::dynamic_pointer_cast(cur); + IE_ASSERT(cell) << "Cannot cast object with type ***Cell to WeightableLayer object"; + + auto name = cell->name; + + auto in_data = cell->insData[0].lock(); + auto in_h_state = cell->insData[1].lock(); + auto out_h_state = cell->outData[0]; + + 
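// The subgraph assembled below computes the vanilla RNN update
//     Ht = f(W * [Xt, Ht-1] + B)
// as Concat(data, state) -> FullyConnected -> optional Clamp -> Activation,
// in the same spirit as the equation comments in the GRU body further down.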
auto d_dims = in_data->getTensorDesc().getDims(); + auto s_dims = in_h_state->getTensorDesc().getDims(); + + size_t N = d_dims[0]; + size_t D = d_dims[1]; + size_t S = s_dims[1]; + + auto prc = cell->precision; + + /** Release links on TI */ + for (auto &ins : cell->insData) + ins.lock()->inputTo.erase(cell->name); + for (auto &outs : cell->outData) + outs->creatorLayer.reset(); + + // operations + auto concat = _concat(name + ":concat", prc, {N, D+S}, 2); + auto fc = _fc(name + ":fc", prc, {N, S}, cell->_weights, cell->_biases); + auto act = _act(name + ":act", prc, {N, S}, cell->activations[0]); + + // Connection + _link(in_data, concat, 0); + _link(in_h_state, concat, 1); + _link(concat, fc); + _link_with_clip(fc, act, cell->clip); + + // Output + act->outData[0] = out_h_state; + out_h_state->creatorLayer = act; + + return true; +} + +static bool unrollLSTMCellBody(CNNLayerPtr cur) { + if (cur->type != "LSTMCell") + return true; + + auto cell = std::dynamic_pointer_cast(cur); + IE_ASSERT(cell) << "Cannot cast object with type ***Cell to WeightableLayer object"; + + auto name = cell->name; + + auto in_data = cell->insData[0].lock(); + auto in_h_state = cell->insData[1].lock(); + auto in_c_state = cell->insData[2].lock(); + auto out_h_state = cell->outData[0]; + auto out_c_state = cell->outData[1]; + + auto d_dims = in_data->getTensorDesc().getDims(); + auto s_dims = in_h_state->getTensorDesc().getDims(); + + size_t N = d_dims[0]; + size_t D = d_dims[1]; + size_t S = s_dims[1]; + size_t G = 4; + + auto prc = cell->precision; + + /** Release links on TI */ + for (auto &ins : cell->insData) + ins.lock()->inputTo.erase(cell->name); + for (auto &outs : cell->outData) + outs->creatorLayer.reset(); + + // operations + auto concat = _concat(name + ":concat", prc, {N, D+S}, 2); + auto split = _split(name + ":split", prc, {N, S}, G); + auto fc = _fc(name + ":fc", prc, {N, S*G}, cell->_weights, cell->_biases); + + const std::string _f = cell->activations[0], _g = cell->activations[1], _h = cell->activations[2]; + + auto act_f = _act(name + ":act_f", prc, {N, S}, _f); + auto act_i = _act(name + ":act_i", prc, {N, S}, _f); + auto act_c = _act(name + ":act_c", prc, {N, S}, _g); + auto act_o = _act(name + ":act_o", prc, {N, S}, _f); + auto act_x = _act(name + ":act_x", prc, {N, S}, _h); + + auto mul_ic = _eltw(name + ":mul_ic", prc, {N, S}, "mul"); + auto mul_f = _eltw(name + ":mul_f" , prc, {N, S}, "mul"); + auto sum = _eltw(name + ":sum" , prc, {N, S}, "sum"); + auto mul = _eltw(name + ":mul" , prc, {N, S}, "mul"); + + // Connection + _link(in_data, concat, 0); + _link(in_h_state, concat, 1); + _link(concat, fc); + + _link_with_clip(fc, split, cell->clip); + + _link(split, act_f, 0, 0); + _link(split, act_i, 1, 0); + _link(split, act_c, 2, 0); + _link(split, act_o, 3, 0); + + _link(act_i, mul_ic, 0, 0); + _link(act_c, mul_ic, 0, 1); + + _link(act_f, mul_f, 0, 0); + _link(in_c_state, mul_f, 1); + + _link(mul_f, sum, 0, 0); + _link(mul_ic, sum, 0, 1); + + _link(sum, act_x); + + _link(act_x, mul, 0, 0); + _link(act_o, mul, 0, 1); + + // Output + mul->outData[0] = out_h_state; + out_h_state->creatorLayer = mul; + + CombineData(out_c_state, sum->outData[0]); + sum->outData[0] = out_c_state; + out_c_state->creatorLayer = sum; + + return true; +} + +static bool unrollGRUCellBody(CNNLayerPtr cur, bool linear_before_reset = false) { + if (cur->type != "GRUCell") + return true; + + auto cell = std::dynamic_pointer_cast(cur); + IE_ASSERT(cell) << "Cannot cast object with type ***Cell to WeightableLayer object"; 
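// Weight layout assumed by the slicing below: GRU weights are packed
// gate-major as U|R|O (update, reset, output), each gate an S x (D+S) matrix
// applied to the concatenated [Xt, Ht-1]; biases carry 3 rows of size S, or
// 4 rows (Bz, Br, Bhw, Bhr) when linear_before_reset is set.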
+ + auto name = cell->name; + + auto in_data = cell->insData[0].lock(); + auto in_h_state = cell->insData[1].lock(); + auto out_h_state = cell->outData[0]; + + auto d_dims = in_data->getTensorDesc().getDims(); + auto s_dims = in_h_state->getTensorDesc().getDims(); + + size_t N = d_dims[0]; + size_t D = d_dims[1]; + size_t S = s_dims[1]; + + // Split weights UR and O gates. Original gates are URO + size_t bG = linear_before_reset ? 4 : 3; + auto orig_W = wrap_as_tensor(cell->_weights, {3, S, D+S}); + auto orig_B = wrap_as_tensor(cell->_biases, {bG, S}); + + auto ur_W = make_region_copy(orig_W, {2, S, D+S}, {0, 0, 0}); + auto o_W = make_region_copy(orig_W, {1, S, D+S}, {2, 0, 0}); + auto ur_B = make_region_copy(orig_B, {2, S}, {0, 0}); + auto o_B = make_region_copy(orig_B, {1, S}, {2, 0}); + + auto prc = cell->precision; + + /** Release links on TI */ + for (auto &ins : cell->insData) + ins.lock()->inputTo.erase(cell->name); + for (auto &outs : cell->outData) + outs->creatorLayer.reset(); + + // operations + auto concat = _concat(name + ":concat", prc, {N, D+S}, 2); + auto split = _split(name + ":split", prc, {N, S}, 2); + auto fc_ur = _fc(name + ":fc_ur", prc, {N, S*2}, ur_W, ur_B); + + const std::string _f = cell->activations[0], _g = cell->activations[1]; + + auto act_ur = _act(name + ":act_ur", prc, {N, 2*S}, _f); + auto act_o = _act(name + ":act_o", prc, {N, S}, _g); + + auto mul_u = _eltw(name + ":mul_u", prc, {N, S}, "mul"); + auto mul_r = _eltw(name + ":mul_r", prc, {N, S}, "mul"); + + auto pwr_m1 = _pwr(name + ":pwr", prc, {N, S}, -1.0, 1.0); + + auto mul = _eltw(name + ":mul" , prc, {N, S}, "mul"); + auto sum = _eltw(name + ":sum" , prc, {N, S}, "sum"); + + /** + * - zt = _f(Wz*[Xt + Ht-1] + Bz) + * - rt = _f(Wr*[Xt + Ht-1] + Br) + * - ht = _g(Wh*[Xt + (rt (.) Ht-1)] + Bh) # default, when linear_before_reset = 0 + * - ht = _g(Whw*Xt + Bhw + (rt (.) (Whr*Ht-1 + Bhr))) # when linear_before_reset != 0 + * - Ht = (1 - zt) (.) ht + zt (.) Ht-1 + */ + _link(in_data, concat, 0); + _link(in_h_state, concat, 1); + _link(concat, fc_ur); + _link_with_clip(fc_ur, act_ur, cell->clip); + _link(act_ur, split); // split[0] - zt, split[1] - rt + + if (linear_before_reset) { + auto lbr_B = wrap_as_tensor(orig_B, {4, S}); + + auto whw_W = make_region_copy(o_W, {1, S, D}, {0, 0, 0}); + auto whr_W = make_region_copy(o_W, {1, S, S}, {0, 0, D}); + auto whw_B = make_region_copy(lbr_B, {1, S}, {2, 0}); + auto whr_B = make_region_copy(lbr_B, {1, S}, {3, 0}); + + auto fc_whr = _fc(name + ":fc_whr", prc, {N, S}, whr_W, whr_B); + auto fc_whw = _fc(name + ":fc_whw", prc, {N, S}, whw_W, whw_B); + auto sum_h = _eltw(name + ":sum_h", prc, {N, S}, "sum"); + + _link(in_h_state, fc_whr); // Whr*Ht-1 + Bhr + _link(fc_whr, mul_r, 0); // + _link(split, mul_r, 1, 1); // rt (.) (Whr*Ht-1 + Bhr) + _link(in_data, fc_whw); // Whw*Xt + Bhw + _link(fc_whw, sum_h, 0, 0); // + _link(mul_r, sum_h, 0, 1); // Whw*Xt + Bhw + (rt (.) (Whr*Ht-1 + Bhr)) + _link_with_clip(sum_h, act_o, cell->clip); // _g(Whw*Xt + Bhw + (rt (.) (Whr*Ht-1 + Bhr))) + } else { + auto fc_wh = _fc(name + ":fc_o", prc, {N, S}, o_W, o_B); + auto concat_h = _concat(name + ":concat_h", prc, {N, D+S}, 2); + + _link(split, mul_r, 1, 0); // + _link(in_h_state, mul_r, 1); // rt (.) Ht-1 + _link(in_data, concat_h, 0); // + _link(mul_r, concat_h, 0, 1); // [Xt + (rt (.) Ht-1)] + _link(concat_h, fc_wh); // Wh*[Xt + (rt (.) Ht-1)] + Bh + _link_with_clip(fc_wh, act_o, cell->clip); // _g(Wh*[Xt + (rt (.) 
Ht-1)] + Bh) + } + + _link(split, pwr_m1, 0, 0); // 1 - zt + _link(act_o, mul, 0, 0); // + _link(pwr_m1, mul, 0, 1); // (1 - zt) (.) ht + _link(split, mul_u, 0, 0); // + _link(in_h_state, mul_u, 1); // zt (.) Ht-1 + _link(mul, sum, 0, 0); // + _link(mul_u, sum, 0, 1); // (1 - zt) (.) ht + zt (.) Ht-1 + + // Output + sum->outData[0] = out_h_state; + out_h_state->creatorLayer = sum; + return true; } -bool UnrollTI(const ICNNNetwork &net) { +static bool unrollCell(CNNLayerPtr cur, ICNNNetwork &net) { + auto cell = std::dynamic_pointer_cast(cur); + switch (cell->cellType) { + case RNNCellBase::LSTM: return unrollLSTMCellBody(cur); + case RNNCellBase::GRU: return unrollGRUCellBody(cur); + case RNNCellBase::GRU_LBR: return unrollGRUCellBody(cur, true); + case RNNCellBase::RNN: return unrollRNNCellBody(cur); + } return false; } +static bool unrollSeq(CNNLayerPtr cur, ICNNNetwork &net) { + if (!one_of(cur->type, "LSTMSequence", "GRUSequence", "RNNSequence")) + return true; + + auto seq = std::dynamic_pointer_cast(cur); + IE_ASSERT(seq) << "Cannot cast object with type ***Sequence to RNNSequenceLayer object"; + + auto name = seq->name; + + auto in_data = seq->insData[0].lock(); + auto in_h_state = seq->insData[1].lock(); + auto out_data = seq->outData[0]; + + auto in_d_dims = in_data->getTensorDesc().getDims(); + auto state_dims = in_h_state->getTensorDesc().getDims(); + auto out_d_dims = out_data->getTensorDesc().getDims(); + + const int axis = seq->axis; + const auto direct = seq->direction; + const auto prc = seq->precision; + + /** Release links on Seq */ + for (auto &ins : seq->insData) + ins.lock()->inputTo.erase(seq->name); + for (auto &outs : seq->outData) + outs->creatorLayer.reset(); + + /** Body subgraph*/ + auto in_d_body_dims = in_d_dims; + in_d_body_dims[axis] = 1; + + auto in_d_body_squeeze_dims = in_d_dims; + in_d_body_squeeze_dims.erase(in_d_body_squeeze_dims.begin() + axis); + + auto out_d_body_dims = out_d_dims; + out_d_body_dims[axis] = 1; + + auto out_d_body_squeeze_dims = out_d_dims; + out_d_body_squeeze_dims.erase(out_d_body_squeeze_dims.begin() + axis); + + auto body_in_data = DataPtr(new Data(name + ":data_in", + TensorDesc { prc, in_d_body_dims, TensorDesc::getLayoutByDims(in_d_body_dims) })); + + auto resh1 = _resh(name + ":resh1", prc, in_d_body_squeeze_dims); + auto cell = _cell(name + ":cell", prc, out_d_body_squeeze_dims, state_dims, seq->cellType); + auto resh2 = _resh(name + ":resh2", prc, out_d_body_dims); + + _link(body_in_data, resh1); + _link(resh1, cell); + _link(cell, resh2); + + cell->_weights = seq->_weights; + cell->_biases = seq->_biases; + cell->hidden_size = seq->hidden_size; + cell->clip = seq->clip; + cell->activations = seq->activations; + cell->activation_alpha = seq->activation_alpha; + cell->activation_beta = seq->activation_beta; + + const size_t NS = cell->outData.size(); // num of state + + /** TI layer */ + auto ti = _ti(name + ":ti", prc, NS); + _link(in_data, ti, 0); + + ti->outData[0] = out_data; + out_data->creatorLayer = ti; + + ti->body.inputs.push_back(body_in_data); + ti->body.outputs.push_back(resh2->outData[0]); + + int start = direct == RNNSequenceLayer::FWD ? 0 : -1; + int end = direct == RNNSequenceLayer::FWD ? -1 : 0; + int step = direct == RNNSequenceLayer::FWD ? 
1 : -1; + ti->input_port_map.push_back({0, 0, axis, step, start, end, 1}); + ti->output_port_map.push_back({0, 0, axis, step, start, end, 1}); + + for (size_t i = 0; i < NS; i++) { + auto in_state = seq->insData[1 + i].lock(); + _link(in_state, ti, 1 + i); + + auto out_state = seq->outData[1 + i]; + ti->outData[1 + i] = out_state; + out_state->creatorLayer = ti; + + auto body_in_state = DataPtr(new Data(name + ":state_in_" + std::to_string(i), + TensorDesc { prc, state_dims, TensorDesc::getLayoutByDims(state_dims) })); + + _link(body_in_state, cell, 1 + i); + + ti->body.inputs.push_back(body_in_state); + ti->body.outputs.push_back(cell->outData[i]); + + const int ii = 1 + static_cast(i); + ti->input_port_map.push_back({ii, ii, -1, 0, 0, 0, 0}); + ti->output_port_map.push_back({ii, ii, -1, 0, 0, 0, 0}); + ti->back_edges.push_back({ii, ii, -1, 0, 0, 0, 0}); + } + + unrollTI(ti, net); + + return true; +} + +/************************************************************/ +/**** Converter API ***************************************/ +/************************************************************/ + +template +bool ApplyForAll(ICNNNetwork &net, T action) { + auto all_layers = details::CNNNetSortTopologically(net); + bool sts = true; + + for (auto &layer : all_layers) + sts &= action(layer, net); + + return sts; +} + +template +bool ApplyForAll_if(ICNNNetwork &net, T action, P pred) { + auto all_layers = details::CNNNetSortTopologically(net); + bool sts = true; + + for (auto &layer : all_layers) + if (pred(layer)) + sts &= action(layer, net); + + return sts; +} + +bool CombineRNNSeq(ICNNNetwork &net) { + return ApplyForAll(net, convertToRNNSeq); +} + +bool UnrollTI(ICNNNetwork &net) { + return ApplyForAll(net, unrollTI); +} + +bool UnrollRNN_if(ICNNNetwork &net, const std::function pred) { + // Filter layers by RNN specific type + auto _seq_pred = [&] (CNNLayerPtr layer) { + auto rnn = std::dynamic_pointer_cast(layer); + if (!rnn) return false; + return pred(*rnn.get()); + }; + auto _cell_pred = [&] (CNNLayerPtr layer) { + auto rnn = std::dynamic_pointer_cast(layer); + if (!rnn || !one_of(rnn->type, "LSTMCell", "GRUCell", "RNNCell")) return false; + return pred(*rnn.get()); + }; + + bool res = true; + res &= ApplyForAll_if(net, unrollSeq, _seq_pred); + res &= ApplyForAll_if(net, unrollCell, _cell_pred); + return res; +} + } // namespace NetPass } // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/net_pass.h b/inference-engine/src/inference_engine/net_pass.h index 8b19286..62e996f 100644 --- a/inference-engine/src/inference_engine/net_pass.h +++ b/inference-engine/src/inference_engine/net_pass.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -15,17 +15,31 @@ namespace NetPass { /** * Try to detect LSTM Sequence pattern inside TI and convert it + * * @param net network to modify * @return true if all Tensor iterator was converted */ -INFERENCE_ENGINE_API_CPP(bool) CombineLSTMSeq(const ICNNNetwork &net); +INFERENCE_ENGINE_API_CPP(bool) CombineRNNSeq(ICNNNetwork &net); /** * Unroll all present Tensor Iterators + * * @param net network to modify * @return true if all Tensor iterator was unrolled successfully */ -INFERENCE_ENGINE_API_CPP(bool) UnrollTI(const ICNNNetwork &net); +INFERENCE_ENGINE_API_CPP(bool) UnrollTI(ICNNNetwork &net); + +/** + * Unroll all RNN specific layers by predicate + * + * Will be applied to all RNNSeq and RNNCell layers + * + * @param net network to 
modify + * @param pred predicate to mark layer to unroll + * @return true if all RNN layers were unrolled successfully + */ +INFERENCE_ENGINE_API_CPP(bool) UnrollRNN_if(ICNNNetwork &net, + std::function<bool(const RNNCellBase &)> pred); } // namespace NetPass } // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/network_serializer.cpp b/inference-engine/src/inference_engine/network_serializer.cpp index f530e35..4ccf4a5 100644 --- a/inference-engine/src/inference_engine/network_serializer.cpp +++ b/inference-engine/src/inference_engine/network_serializer.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +10,7 @@ #include "details/ie_cnn_network_tools.h" #include "details/caseless.hpp" #include "network_serializer.h" +#include "exec_graph_info.hpp" #include "xml_parse_utils.h" using namespace InferenceEngine; @@ -38,22 +39,44 @@ void NetworkSerializer::serialize( const std::string &xmlPath, const std::string &binPath, const InferenceEngine::ICNNNetwork& network) { + const std::vector<CNNLayerPtr> ordered = CNNNetSortTopologically(network); - std::ofstream ofsBin(binPath, std::ofstream::out | std::ofstream::binary); - if (!ofsBin) { - THROW_IE_EXCEPTION << "File '" << binPath << "' is not opened as out file stream"; + // A flag for serializing executable graph information (not complete IR) + bool execGraphInfoSerialization = false; + // If the first layer has the perfCounter parameter set, this is executable graph info serialization. + // All other layers must have this parameter set as well. + if (ordered[0]->params.find(ExecGraphInfoSerialization::PERF_COUNTER) != ordered[0]->params.end()) { + execGraphInfoSerialization = true; + for (const auto &layer : ordered) { + if (layer->params.find(ExecGraphInfoSerialization::PERF_COUNTER) == layer->params.end()) { + THROW_IE_EXCEPTION << "Each node must have " << ExecGraphInfoSerialization::PERF_COUNTER + << " parameter set in case of executable graph info serialization"; + } + } + } + + bool dumpWeights = !execGraphInfoSerialization && !binPath.empty(); + std::ofstream ofsBin; + if (dumpWeights) { + ofsBin.open(binPath, std::ofstream::out | std::ofstream::binary); + if (!ofsBin) { + THROW_IE_EXCEPTION << "File '" << binPath << "' is not opened as out file stream"; + } } pugi::xml_document doc; - pugi::xml_node net = doc.append_child("net"); - net.append_attribute("name").set_value(network.getName().c_str()); - net.append_attribute("version").set_value("3"); - net.append_attribute("batch").set_value(network.getBatchSize()); + pugi::xml_node netXml = doc.append_child("net"); + netXml.append_attribute("name").set_value(network.getName().c_str()); + + // no need to print this information for executable graph information serialization because it is not IR.
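// For comparison: a regular IR dump keeps the full root element, e.g.
//   <net name="model" version="3" batch="1">
// (values illustrative), while the executable-graph dump deliberately stops
// after the name attribute, since version and batch are IR notions that do
// not apply to a runtime graph.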
+ if (!execGraphInfoSerialization) { + netXml.append_attribute("version").set_value("3"); + netXml.append_attribute("batch").set_value(network.getBatchSize()); + } - pugi::xml_node layers = net.append_child("layers"); + pugi::xml_node layers = netXml.append_child("layers"); - const std::vector ordered = CNNNetSortTopologically(network); - std::map matching; + std::map matching; for (size_t i = 0; i < ordered.size(); i++) { matching[ordered[i]] = i; } @@ -70,18 +93,20 @@ void NetworkSerializer::serialize( layer.append_attribute("precision").set_value(precision.name()); layer.append_attribute("id").set_value(i); - updateStdLayerParams(node); + if (!execGraphInfoSerialization) { + updateStdLayerParams(node); + } const auto ¶ms = node->params; - if (params.size()) { + if (!params.empty()) { pugi::xml_node data = layer.append_child(dataName.c_str()); - for (const auto it : params) { + for (const auto &it : params) { data.append_attribute(it.first.c_str()).set_value(it.second.c_str()); } } - if (node->insData.size()) { + if (!node->insData.empty()) { pugi::xml_node input = layer.append_child("input"); for (size_t iport = 0; iport < node->insData.size(); iport++) { @@ -95,7 +120,7 @@ void NetworkSerializer::serialize( } } } - if (node->outData.size()) { + if (!node->outData.empty()) { pugi::xml_node input = layer.append_child("output"); for (size_t oport = 0; oport < node->outData.size(); oport++) { pugi::xml_node port = input.append_child("port"); @@ -107,9 +132,9 @@ void NetworkSerializer::serialize( } } } - if (node->blobs.size()) { + if (dumpWeights && !node->blobs.empty()) { auto blobsNode = layer.append_child("blobs"); - for (const auto dataIt : node->blobs) { + for (const auto &dataIt : node->blobs) { const char *dataPtr = dataIt.second->buffer().as(); size_t dataSize = dataIt.second->byteSize(); @@ -126,31 +151,33 @@ void NetworkSerializer::serialize( } } - ofsBin.close(); - if (!ofsBin.good()) { - THROW_IE_EXCEPTION << "Error during '" << binPath << "' closing"; + if (dumpWeights) { + ofsBin.close(); + if (!ofsBin.good()) { + THROW_IE_EXCEPTION << "Error during '" << binPath << "' closing"; + } } - pugi::xml_node edges = net.append_child("edges"); + pugi::xml_node edges = netXml.append_child("edges"); - for (size_t i = 0; i < ordered.size(); i++) { - const CNNLayer::Ptr node = ordered[i]; + for (const auto &ord : ordered) { + const CNNLayer::Ptr node = ord; - if (node->outData.size()) { + if (!node->outData.empty()) { auto itFrom = matching.find(node); if (itFrom == matching.end()) { THROW_IE_EXCEPTION << "Internal error, cannot find " << node->name << " in matching container during serialization of IR"; } for (size_t oport = 0; oport < node->outData.size(); oport++) { const DataPtr outData = node->outData[oport]; - for (auto inputTo : outData->inputTo) { + for (const auto &inputTo : outData->inputTo) { auto itTo = matching.find(inputTo.second); if (itTo == matching.end()) { THROW_IE_EXCEPTION << "Broken edge form layer " << node->name << " to layer " << inputTo.first<< "during serialization of IR"; } - size_t foundPort = -1; - for (size_t iport = 0; iport < inputTo.second->insData.size(); iport++) { + int foundPort = -1; + for (int iport = 0; iport < inputTo.second->insData.size(); iport++) { if (inputTo.second->insData[iport].lock() == outData) { foundPort = iport; } @@ -171,63 +198,10 @@ void NetworkSerializer::serialize( } } - - InputsDataMap inputInfo; - network.getInputsInfo(inputInfo); - - // assuming that we have preprocess only for one input - for (auto ii : inputInfo) { - const 
PreProcessInfo& pp = ii.second->getPreProcess(); - size_t nInChannels = pp.getNumberOfChannels(); - if (nInChannels) { - pugi::xml_node preproc = net.append_child("pre-process"); - - preproc.append_attribute("reference-layer-name").set_value(ii.first.c_str()); - preproc.append_attribute("mean-precision").set_value(Precision(Precision::FP32).name()); - - for (size_t ch = 0; ch < nInChannels; ch++) { - const PreProcessChannel::Ptr &preProcessChannel = pp[ch]; - auto channel = preproc.append_child("channel"); - channel.append_attribute("id").set_value(ch); - - auto mean = channel.append_child("mean"); - - if (!preProcessChannel->meanData) { - mean.append_attribute("value").set_value(preProcessChannel->meanValue); - } else { - THROW_IE_EXCEPTION << "Mean data is not supported yet for serialization of the model"; - } - } - } - } - - - // adding statistic to the file if statistic exists - ICNNNetworkStats* netNodesStats = nullptr; - auto stats = net.append_child("statistics"); - network.getStats(&netNodesStats, nullptr); - const NetworkStatsMap statsmap = netNodesStats->getNodesStats(); - - auto joinCommas = [&](const std::vector& v) -> std::string { - std::string res; - - for (size_t i = 0; i < v.size(); ++i) { - res += std::to_string(v[i]); - if (i < v.size() - 1) { - res += ", "; - } - } - - return res; - }; - - for (const auto itStats : statsmap) { - auto layer = stats.append_child("layer"); - - layer.append_child("name").text().set(itStats.first.c_str()); - - layer.append_child("min").text().set(joinCommas(itStats.second->_minOutputs).c_str()); - layer.append_child("max").text().set(joinCommas(itStats.second->_maxOutputs).c_str()); + // no need to print this info in case of executable graph info serialization + if (!execGraphInfoSerialization) { + updatePreProcInfo(network, netXml); + updateStatisticsInfo(network, netXml); } if (!doc.save_file(xmlPath.c_str())) { @@ -235,20 +209,19 @@ void NetworkSerializer::serialize( } } - -void NetworkSerializer::updateStdLayerParams(const CNNLayer::Ptr layer) { +void NetworkSerializer::updateStdLayerParams(const CNNLayer::Ptr &layer) { auto layerPtr = layer.get(); auto ¶ms = layer->params; if (CaselessEq()(layer->type, "power")) { - PowerLayer *lr = dynamic_cast(layerPtr); + auto *lr = dynamic_cast(layerPtr); params["scale"] = std::to_string(lr->scale); params["shift"] = std::to_string(lr->offset); params["power"] = std::to_string(lr->power); } else if (CaselessEq()(layer->type, "convolution") || - CaselessEq()(layer->type, "deconvolution")) { - ConvolutionLayer *lr = dynamic_cast(layerPtr); + CaselessEq()(layer->type, "deconvolution")) { + auto *lr = dynamic_cast(layerPtr); params["kernel"] = arrayRevertToIRProperty(lr->_kernel); params["pads_begin"] = arrayRevertToIRProperty(lr->_padding); @@ -258,20 +231,20 @@ void NetworkSerializer::updateStdLayerParams(const CNNLayer::Ptr layer) { params["output"] = std::to_string(lr->_out_depth); params["group"] = std::to_string(lr->_group); } else if (CaselessEq()(layer->type, "relu")) { - ReLULayer *lr = dynamic_cast(layerPtr); + auto *lr = dynamic_cast(layerPtr); if (lr->negative_slope != 0.0f) { params["negative_slope"] = std::to_string(lr->negative_slope); } } else if (CaselessEq()(layer->type, "norm") || - CaselessEq()(layer->type, "lrn")) { - NormLayer *lr = dynamic_cast(layerPtr); + CaselessEq()(layer->type, "lrn")) { + auto *lr = dynamic_cast(layerPtr); params["alpha"] = std::to_string(lr->_alpha); params["beta"] = std::to_string(lr->_beta); params["local-size"] = std::to_string(lr->_size); 
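// For illustration, alpha/beta/local-size together with the region flag set
// just below round-trip a Norm/LRN layer into IR markup of this shape
// (attribute values are made up for the example):
//   <data alpha="0.0001" beta="0.75" local-size="5" region="across"/>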
params["region"] = lr->_isAcrossMaps ? "across" : "same"; } else if (CaselessEq()(layer->type, "pooling")) { - PoolingLayer *lr = dynamic_cast(layerPtr); + auto *lr = dynamic_cast(layerPtr); params["kernel"] = arrayRevertToIRProperty(lr->_kernel); params["pads_begin"] = arrayRevertToIRProperty(lr->_padding); @@ -279,85 +252,85 @@ void NetworkSerializer::updateStdLayerParams(const CNNLayer::Ptr layer) { params["strides"] = arrayRevertToIRProperty(lr->_stride); switch (lr->_type) { - case PoolingLayer::MAX: - params["pool-method"] = "max"; - break; - case PoolingLayer::AVG: - params["pool-method"] = "avg"; - break; - - default: - THROW_IE_EXCEPTION << "Found unsupported pooling method: " << lr->_type; + case PoolingLayer::MAX: + params["pool-method"] = "max"; + break; + case PoolingLayer::AVG: + params["pool-method"] = "avg"; + break; + + default: + THROW_IE_EXCEPTION << "Found unsupported pooling method: " << lr->_type; } } else if (CaselessEq()(layer->type, "split")) { - SplitLayer *lr = dynamic_cast(layerPtr); + auto *lr = dynamic_cast(layerPtr); params["axis"] = std::to_string(lr->_axis); } else if (CaselessEq()(layer->type, "concat")) { - ConcatLayer *lr = dynamic_cast(layerPtr); + auto *lr = dynamic_cast(layerPtr); params["axis"] = std::to_string(lr->_axis); } else if (CaselessEq()(layer->type, "FullyConnected") || - CaselessEq()(layer->type, "InnerProduct")) { - FullyConnectedLayer *lr = dynamic_cast(layerPtr); + CaselessEq()(layer->type, "InnerProduct")) { + auto *lr = dynamic_cast(layerPtr); params["out-size"] = std::to_string(lr->_out_num); } else if (CaselessEq()(layer->type, "softmax")) { - SoftMaxLayer *lr = dynamic_cast(layerPtr); + auto *lr = dynamic_cast(layerPtr); params["axis"] = std::to_string(lr->axis); } else if (CaselessEq()(layer->type, "reshape")) { // need to add here support of flatten layer if it is created from API - ReshapeLayer *lr = dynamic_cast(layerPtr); + auto *lr = dynamic_cast(layerPtr); params["dim"] = arrayToIRProperty(lr->shape); } else if (CaselessEq()(layer->type, "Eltwise")) { - EltwiseLayer *lr = dynamic_cast(layerPtr); + auto *lr = dynamic_cast(layerPtr); std::string op; switch (lr->_operation) { - case EltwiseLayer::Sum: - op = "sum"; - break; - case EltwiseLayer::Prod: - op = "prod"; - break; - case EltwiseLayer::Max: - op = "max"; - break; - default: - break; + case EltwiseLayer::Sum: + op = "sum"; + break; + case EltwiseLayer::Prod: + op = "prod"; + break; + case EltwiseLayer::Max: + op = "max"; + break; + default: + break; } params["operation"] = op; } else if (CaselessEq()(layer->type, "scaleshift")) { - ScaleShiftLayer *lr = dynamic_cast(layerPtr); + auto *lr = dynamic_cast(layerPtr); params["broadcast"] = std::to_string(lr->_broadcast); } else if (CaselessEq()(layer->type, "crop")) { - CropLayer *lr = dynamic_cast(layerPtr); + auto *lr = dynamic_cast(layerPtr); params["axis"] = arrayToIRProperty(lr->axis); params["offset"] = arrayToIRProperty(lr->offset); params["dim"] = arrayToIRProperty(lr->dim); } else if (CaselessEq()(layer->type, "tile")) { - TileLayer *lr = dynamic_cast(layerPtr); + auto *lr = dynamic_cast(layerPtr); params["axis"] = std::to_string(lr->axis); params["tiles"] = std::to_string(lr->tiles); } else if (CaselessEq()(layer->type, "prelu")) { - PReLULayer *lr = dynamic_cast(layerPtr); + auto *lr = dynamic_cast(layerPtr); params["channel_shared"] = std::to_string(lr->_channel_shared); } else if (CaselessEq()(layer->type, "clamp")) { - ClampLayer *lr = dynamic_cast(layerPtr); + auto *lr = dynamic_cast(layerPtr); params["min"] = 
         params["max"] = std::to_string(lr->max_value);
     } else if (CaselessEq<std::string>()(layer->type, "BatchNormalization")) {
-        BatchNormalizationLayer *lr = dynamic_cast<BatchNormalizationLayer *>(layerPtr);
+        auto *lr = dynamic_cast<BatchNormalizationLayer *>(layerPtr);
         params["epsilon"] = std::to_string(lr->epsilon);
     } else if (CaselessEq<std::string>()(layer->type, "grn")) {
-        GRNLayer *lr = dynamic_cast<GRNLayer *>(layerPtr);
+        auto *lr = dynamic_cast<GRNLayer *>(layerPtr);
         params["bias"] = std::to_string(lr->bias);
     } else if (CaselessEq<std::string>()(layer->type, "mvn")) {
-        MVNLayer *lr = dynamic_cast<MVNLayer *>(layerPtr);
+        auto *lr = dynamic_cast<MVNLayer *>(layerPtr);
         params["across_channels"] = std::to_string(lr->across_channels);
         params["normalize_variance"] = std::to_string(lr->normalize);
     } else if (CaselessEq<std::string>()(layer->type, "rnn") ||
-            CaselessEq<std::string>()(layer->type, "TensorIterator") ||
-            CaselessEq<std::string>()(layer->type, "LSTMCell")) {
+               CaselessEq<std::string>()(layer->type, "TensorIterator") ||
+               CaselessEq<std::string>()(layer->type, "LSTMCell")) {
         THROW_IE_EXCEPTION << "Not covered layers for writing to IR";
     }
@@ -365,9 +338,8 @@ void NetworkSerializer::updateStdLayerParams(const CNNLayer::Ptr layer) {
         params["quantization_level"] = layer->params["quantization_level"];
     }
-    // update of weightable layers
-    WeightableLayer *pwlayer = dynamic_cast<WeightableLayer *>(layerPtr);
+    auto *pwlayer = dynamic_cast<WeightableLayer *>(layerPtr);
     if (pwlayer) {
         if (pwlayer->_weights) {
             pwlayer->blobs["weights"] = pwlayer->_weights;
@@ -377,3 +349,64 @@ void NetworkSerializer::updateStdLayerParams(const CNNLayer::Ptr layer) {
         }
     }
 }
+
+void NetworkSerializer::updatePreProcInfo(const InferenceEngine::ICNNNetwork& network, pugi::xml_node &netXml) {
+    InputsDataMap inputInfo;
+    network.getInputsInfo(inputInfo);
+
+    // Assume that you preprocess only one input
+    for (auto ii : inputInfo) {
+        const PreProcessInfo &pp = ii.second->getPreProcess();
+        size_t nInChannels = pp.getNumberOfChannels();
+        if (nInChannels) {
+            pugi::xml_node preproc = netXml.append_child("pre-process");
+
+            preproc.append_attribute("reference-layer-name").set_value(ii.first.c_str());
+            preproc.append_attribute("mean-precision").set_value(Precision(Precision::FP32).name());
+
+            for (size_t ch = 0; ch < nInChannels; ch++) {
+                const PreProcessChannel::Ptr &preProcessChannel = pp[ch];
+                auto channel = preproc.append_child("channel");
+                channel.append_attribute("id").set_value(ch);
+
+                auto mean = channel.append_child("mean");
+
+                if (!preProcessChannel->meanData) {
+                    mean.append_attribute("value").set_value(preProcessChannel->meanValue);
+                } else {
+                    THROW_IE_EXCEPTION << "Mean data is not supported yet for serialization of the model";
+                }
+            }
+        }
+    }
+}
+
+void NetworkSerializer::updateStatisticsInfo(const InferenceEngine::ICNNNetwork& network, pugi::xml_node &netXml) {
+    // If statistics exist, add them to the file
+    ICNNNetworkStats *netNodesStats = nullptr;
+    auto stats = netXml.append_child("statistics");
+    network.getStats(&netNodesStats, nullptr);
+    const NetworkStatsMap statsmap = netNodesStats->getNodesStats();
+
+    auto joinCommas = [&](const std::vector<float> &v) -> std::string {
+        std::string res;
+
+        for (size_t i = 0; i < v.size(); ++i) {
+            res += std::to_string(v[i]);
+            if (i < v.size() - 1) {
+                res += ", ";
+            }
+        }
+
+        return res;
+    };
+
+    for (const auto &itStats : statsmap) {
+        auto layer = stats.append_child("layer");
+
+        layer.append_child("name").text().set(itStats.first.c_str());
+
+        layer.append_child("min").text().set(joinCommas(itStats.second->_minOutputs).c_str());
+        layer.append_child("max").text().set(joinCommas(itStats.second->_maxOutputs).c_str());
+    }
+}
\ No newline at end of file
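For illustration, the pugixml pattern used by updatePreProcInfo() and updateStatisticsInfo() above builds the XML tree node by node. A minimal standalone sketch of the same call sequence (the layer name and ranges are made-up sample values, not data from the patch):

#include <iostream>
#include "pugixml.hpp"

int main() {
    pugi::xml_document doc;
    pugi::xml_node stats = doc.append_child("statistics");

    // Same shape as updateStatisticsInfo() output: one <layer> per node,
    // with comma-joined min/max ranges stored as text children.
    pugi::xml_node layer = stats.append_child("layer");
    layer.append_child("name").text().set("conv1");
    layer.append_child("min").text().set("-1.000000, -0.500000");
    layer.append_child("max").text().set("1.000000, 0.500000");

    doc.save(std::cout);  // <statistics><layer><name>conv1</name>...</layer></statistics>
    return 0;
}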
diff --git a/inference-engine/src/inference_engine/network_serializer.h b/inference-engine/src/inference_engine/network_serializer.h
index a67f4f4..e39ebc0 100644
--- a/inference-engine/src/inference_engine/network_serializer.h
+++ b/inference-engine/src/inference_engine/network_serializer.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -6,6 +6,8 @@
 #include
+#include "xml_parse_utils.h"
+
 namespace InferenceEngine {
 namespace details {
@@ -17,7 +19,9 @@ public:
     static void serialize(const std::string &xmlPath, const std::string &binPath, const InferenceEngine::ICNNNetwork& network);
 private:
-    static void updateStdLayerParams(InferenceEngine::CNNLayer::Ptr layer);
+    static void updateStdLayerParams(const InferenceEngine::CNNLayer::Ptr &layer);
+    static void updatePreProcInfo(const InferenceEngine::ICNNNetwork& network, pugi::xml_node &netXml);
+    static void updateStatisticsInfo(const InferenceEngine::ICNNNetwork& network, pugi::xml_node &netXml);
 };
 } // namespace details
diff --git a/inference-engine/src/inference_engine/parsers.h b/inference-engine/src/inference_engine/parsers.h
index acfe776..0d83099 100644
--- a/inference-engine/src/inference_engine/parsers.h
+++ b/inference-engine/src/inference_engine/parsers.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/src/inference_engine/precision_utils.cpp b/inference-engine/src/inference_engine/precision_utils.cpp
index 9988693..b1d43ec 100644
--- a/inference-engine/src/inference_engine/precision_utils.cpp
+++ b/inference-engine/src/inference_engine/precision_utils.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -41,13 +41,20 @@ INFERENCE_ENGINE_API_CPP(void) f32tof16Arrays(short *dst,
 // small helper function to represent uint32_t value as float32
 inline float asfloat(uint32_t v) {
-    return *reinterpret_cast<float *>(&v);
+    // Both type-punning casts and unions are UB per the C++ spec,
+    // but compilers usually only break the cast-based form
+    union {
+        float f;
+        uint32_t i;
+    };
+    i = v;
+    return f;
 }
 // Function to convert F16 into F32
 INFERENCE_ENGINE_API_CPP(float) f16tof32(ie_fp16 x) {
     // this is storage for output result
-    uint32_t u = x;
+    uint32_t u = static_cast<uint32_t>(x);
     // get sign in 32bit format
     uint32_t s = ((u & 0x8000) << 16);
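A side note on the asfloat() change above: the union trick relies on a widely honored compiler extension, while the form that is actually well-defined in standard C++ is a byte copy. A minimal alternative sketch (illustrative, not part of the patch):

#include <cstdint>
#include <cstring>

inline float asfloat_bytes(uint32_t v) {
    float f;
    std::memcpy(&f, &v, sizeof f);  // well-defined type punning; compiles to a register move
    return f;
}
// C++20 adds std::bit_cast<float>(v) with the same effect.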
@@ -65,8 +72,23 @@ INFERENCE_ENGINE_API_CPP(float) f16tof32(ie_fp16 x) {
         u <<= (23 - 10);
         u |= EXP_MASK_F32;
         u |= s;
-    } else if ((x & EXP_MASK_F16) == 0) {  // check for zero and denormals. both are converted to zero
-        u = s;
+    } else if ((u & EXP_MASK_F16) == 0) {  // check for zero and denormals.
+        uint16_t h_sig = (u & 0x03ffu);
+        if (h_sig == 0) {
+            /* Signed zero */
+            u = s;
+        } else {
+            /* Subnormal */
+            uint16_t h_exp = (u & EXP_MASK_F16);
+            h_sig <<= 1;
+            while ((h_sig & 0x0400u) == 0) {
+                h_sig <<= 1;
+                h_exp++;
+            }
+            uint32_t f_exp = (static_cast<uint32_t>(127 - 15 - h_exp)) << 23;
+            uint32_t f_sig = (static_cast<uint32_t>(h_sig & 0x03ffu)) << 13;
+            u = s + f_exp + f_sig;
+        }
     } else {
         // abs
         u = (u & 0x7FFF);
@@ -82,7 +104,7 @@ INFERENCE_ENGINE_API_CPP(float) f16tof32(ie_fp16 x) {
     }
     // finally represent result as float and return
-    return *reinterpret_cast<float *>(&u);
+    return asfloat(u);
 }
 // This function converts f32 to f16 with rounding to nearest value to minimize error
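To see what the new subnormal branch computes, take the smallest half-precision subnormal, 0x0001, which must map to 2^-24 instead of being flushed to zero as before. A standalone walk-through of the same bit manipulation (an illustrative re-implementation, not patch code):

#include <cstdint>
#include <cstdio>

int main() {
    const uint16_t h = 0x0001;                                 // smallest fp16 subnormal
    uint32_t s = (static_cast<uint32_t>(h) & 0x8000u) << 16;   // sign bit (0 here)
    uint16_t h_sig = h & 0x03ffu;                              // significand = 1, exponent field = 0
    uint16_t h_exp = 0;
    h_sig <<= 1;
    while ((h_sig & 0x0400u) == 0) {                           // normalize: 9 iterations for this input
        h_sig <<= 1;
        h_exp++;
    }
    uint32_t f_exp = static_cast<uint32_t>(127 - 15 - h_exp) << 23;  // biased exponent 103
    uint32_t f_sig = static_cast<uint32_t>(h_sig & 0x03ffu) << 13;   // no significand bits remain
    std::printf("0x%08x\n", s + f_exp + f_sig);  // 0x33800000 == 2^-24 ~= 5.9604645e-08
    return 0;
}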
diff --git a/inference-engine/src/inference_engine/precision_utils.h b/inference-engine/src/inference_engine/precision_utils.h
index 3b824f2..be1c935 100644
--- a/inference-engine/src/inference_engine/precision_utils.h
+++ b/inference-engine/src/inference_engine/precision_utils.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/src/inference_engine/range_iterator.hpp b/inference-engine/src/inference_engine/range_iterator.hpp
index 423bd81..cf4578f 100644
--- a/inference-engine/src/inference_engine/range_iterator.hpp
+++ b/inference-engine/src/inference_engine/range_iterator.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_argmax_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_argmax_shape_infer.hpp
index 96a91fb..8605a88 100644
--- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_argmax_shape_infer.hpp
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_argmax_shape_infer.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -22,7 +22,7 @@ class ArgMaxShapeProp : public BuiltInShapeInferImpl {
 public:
     explicit ArgMaxShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
-    void inferShapesImpl(const std::vector<SizeVector>& inShapes,
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
                          const std::map<std::string, std::string>& params,
                          const std::map<std::string, Blob::Ptr>& blobs,
                          std::vector<SizeVector>& outShapes) override {
@@ -30,7 +30,7 @@ public:
         CNNLayer cnnLayer(lp);
         cnnLayer.params = params;
         cnnLayer.type = _type;
-        validate(&cnnLayer, inShapes, params, blobs);
+        validate(&cnnLayer, inBlobs, params, blobs);
         auto out_max_val = static_cast<size_t>(cnnLayer.GetParamAsInt("out_max_val", 0));
         auto top_k = static_cast<size_t>(cnnLayer.GetParamAsInt("top_k", 0));
         int axis = 0;
@@ -45,7 +45,7 @@ public:
         size_t num_top_axes = firstInputShape.size();
         if (num_top_axes < 3) num_top_axes = 3;
-        SizeVector outputShape(num_top_axes, 1);
+        SizeVector outputShape(num_top_axes, 1lu);
         if (isValidAxis) {
             if (axis < 0) {
                 axis = static_cast<int>(firstInputShape.size() + axis);
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_bin_conv_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_bin_conv_shape_infer.hpp
new file mode 100644
index 0000000..2fd99ef
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_bin_conv_shape_infer.hpp
@@ -0,0 +1,80 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include
+#include "ie_built_in_impl.hpp"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Shape inference for BinaryConvolution layer
+ */
+class BinConvShapeProp : public BuiltInShapeInferImpl {
+public:
+    explicit BinConvShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
+
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
+                         const std::map<std::string, std::string>& params,
+                         const std::map<std::string, Blob::Ptr>& blobs,
+                         std::vector<SizeVector>& outShapes) override {
+        LayerParams lp{};
+        BinaryConvolutionLayer binConvLayer(lp);
+        binConvLayer.params = params;
+        binConvLayer.type = _type;
+        validate(&binConvLayer, inBlobs, params, blobs);
+
+        auto dims = inShapes[0];
+        auto computeSpatialShape = [&](size_t inDim, int axis) {
+            size_t kernel = 0;
+            if (binConvLayer._dilation[axis])
+                kernel = (binConvLayer._kernel[axis] - 1) * binConvLayer._dilation[axis] + 1;
+            else
+                kernel = binConvLayer._kernel[axis];
+            size_t stride = binConvLayer._stride[axis];
+            size_t pad = binConvLayer._padding[axis];
+
+            float outDim;
+            std::string padType = binConvLayer._auto_pad;
+            if (padType == "valid") {
+                outDim = std::ceil((inDim - kernel + 1.f) / stride);
+            } else if (padType == "same_upper") {
+                outDim = std::ceil(1.f * inDim / stride);
+            } else if (padType == "same_lower") {
+                outDim = std::floor(1.f * inDim / stride);
+            } else {
+                int padEnd = binConvLayer._pads_end[axis];
+                outDim = std::floor(1.f * (inDim + pad + padEnd - kernel) / stride) + 1.f;
+            }
+
+            if (outDim < 0)
+                THROW_IE_EXCEPTION << "New shapes " << details::dumpVec(dims) << " make output shape negative";
+
+            return static_cast<size_t>(outDim);
+        };
+
+        size_t inputN = dims[0];
+        size_t OC = binConvLayer._out_depth;
+        SizeVector shapes;
+        shapes.push_back(inputN);
+        shapes.push_back(OC);
+        if (dims.size() == 5)
+            shapes.push_back(computeSpatialShape(dims[dims.size() - 3], Z_AXIS));
+        shapes.push_back(computeSpatialShape(dims[dims.size() - 2], Y_AXIS));
+        shapes.push_back(computeSpatialShape(dims[dims.size() - 1], X_AXIS));
+        outShapes.push_back(shapes);
+    }
+};
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
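The spatial-size computation in computeSpatialShape() above is the usual convolution output arithmetic: with effective kernel k_eff = (k - 1) * dilation + 1, the explicit-padding branch evaluates floor((in + pad_begin + pad_end - k_eff) / stride) + 1. A quick numeric check with made-up values:

#include <cmath>
#include <cstdio>

int main() {
    const float in = 224, k = 3, dilation = 1, stride = 2, padBegin = 1, padEnd = 1;
    const float kEff = (k - 1) * dilation + 1;                               // 3
    const float out = std::floor((in + padBegin + padEnd - kEff) / stride) + 1.f;
    std::printf("%g\n", out);  // 112: a 224-wide input with 3x3 kernel, stride 2, pad 1
    return 0;
}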
"ie_expand_shape_infer.hpp" +#include "ie_rnn_shape_infer.hpp" +#include "ie_tensor_iterator_shape_infer.hpp" +#include "ie_rnn_cell_shape_infer.hpp" +#include "ie_quantize_shape_infer.hpp" +#include "ie_bin_conv_shape_infer.hpp" #include #include #include @@ -132,14 +147,37 @@ REG_SHAPE_INFER_FOR_TYPE(TileShapeProp, Tile); REG_SHAPE_INFER_FOR_TYPE(CropShapeProp, Crop); REG_SHAPE_INFER_FOR_TYPE(ConcatShapeProp, Concat); REG_SHAPE_INFER_FOR_TYPE(EltWiseShapeProp, Eltwise); +REG_SHAPE_INFER_FOR_TYPE(EltWiseShapeProp, Mul); +REG_SHAPE_INFER_FOR_TYPE(EltWiseShapeProp, Add); +REG_SHAPE_INFER_FOR_TYPE(EltWiseShapeProp, Div); REG_SHAPE_INFER_FOR_TYPE(CTCGreedyDecoderShapeProp, CTCGreedyDecoder); REG_SHAPE_INFER_FOR_TYPE(ProposalShapeProp, Proposal); REG_SHAPE_INFER_FOR_TYPE(ReorgYoloShapeProp, ReorgYolo); REG_SHAPE_INFER_FOR_TYPE(RegionYoloShapeProp, RegionYolo); +REG_SHAPE_INFER_FOR_TYPE(RNNShapeProp, RNNSequence); +REG_SHAPE_INFER_FOR_TYPE(RNNShapeProp, GRUSequence); +REG_SHAPE_INFER_FOR_TYPE(RNNShapeProp, LSTMSequence); +REG_SHAPE_INFER_FOR_TYPE(RNNCellShapeProp, RNNCell); +REG_SHAPE_INFER_FOR_TYPE(GRUCellShapeProp, GRUCell); +REG_SHAPE_INFER_FOR_TYPE(LSTMCellShapeProp, LSTMCell); +REG_SHAPE_INFER_FOR_TYPE(TensorIteratorShapeProp, TensorIterator); REG_SHAPE_INFER_FOR_TYPE(ArgMaxShapeProp, ArgMax); REG_SHAPE_INFER_FOR_TYPE(GemmShapeProp, Gemm); REG_SHAPE_INFER_FOR_TYPE(PadShapeProp, Pad); REG_SHAPE_INFER_FOR_TYPE(GatherShapeProp, Gather); +REG_SHAPE_INFER_FOR_TYPE(StridedSliceShapeProp, StridedSlice); +REG_SHAPE_INFER_FOR_TYPE(ShuffleChannelsShapeProp, ShuffleChannels); +REG_SHAPE_INFER_FOR_TYPE(DepthToSpaceShapeProp, DepthToSpace); +REG_SHAPE_INFER_FOR_TYPE(SpaceToDepthShapeProp, SpaceToDepth); +REG_SHAPE_INFER_FOR_TYPE(ReverseSequenceShapeProp, ReverseSequence); +REG_SHAPE_INFER_FOR_TYPE(SqueezeShapeProp, Squeeze); +REG_SHAPE_INFER_FOR_TYPE(UnsqueezeShapeProp, Unsqueeze); +REG_SHAPE_INFER_FOR_TYPE(RangeShapeProp, Range); +REG_SHAPE_INFER_FOR_TYPE(FillShapeProp, Fill); +REG_SHAPE_INFER_FOR_TYPE(ExpandShapeProp, Expand); +REG_SHAPE_INFER_FOR_TYPE(ShapeShapeProp, Shape); +REG_SHAPE_INFER_FOR_TYPE(QuantizeShapeProp, Quantize); +REG_SHAPE_INFER_FOR_TYPE(BinConvShapeProp, BinaryConvolution); } // namespace ShapeInfer } // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_built_in_holder.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_built_in_holder.hpp index 3cb0610..84b3510 100644 --- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_built_in_holder.hpp +++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_built_in_holder.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_built_in_impl.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_built_in_impl.hpp index 9189673..39a6b82 100644 --- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_built_in_impl.hpp +++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_built_in_impl.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -30,7 +30,7 @@ public: THROW_IE_EXCEPTION << "Internal error: failed to find validator for layer with type: " << _type; } - void validate(CNNLayer* layer, const std::vector& inShapes, + void validate(CNNLayer* layer, 
                   const std::map<std::string, std::string>& params,
                   const std::map<std::string, Blob::Ptr>& blobs) {
         _validator->parseParams(layer);
@@ -39,7 +39,7 @@ public:
         _validator->checkCorrespondence(layer, blobs, inShapes);
     }
-    virtual void inferShapesImpl(const std::vector<SizeVector>& inShapes,
+    virtual void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
                                  const std::map<std::string, std::string>& params,
                                  const std::map<std::string, Blob::Ptr>& blobs,
                                  std::vector<SizeVector>& outShapes) = 0;
@@ -49,21 +49,34 @@ public:
                          const std::map<std::string, Blob::Ptr>& blobs,
                          std::vector<SizeVector>& outShapes,
                          ResponseDesc* resp) noexcept override {
+        return DescriptionBuffer(GENERAL_ERROR, resp)
+               << "Unexpected call of deprecated Shape Infer function with input shapes";
+    }
+
+    StatusCode inferShapes(const std::vector<Blob::CPtr>& inBlobs,
+                           const std::map<std::string, std::string>& params,
+                           const std::map<std::string, Blob::Ptr>& blobs,
+                           std::vector<SizeVector>& outShapes,
+                           ResponseDesc* resp) noexcept override {
+        inShapes.clear();
+        for (const auto& blob : inBlobs) {
+            inShapes.push_back(blob->getTensorDesc().getDims());
+        }
         outShapes.clear();
-        std::string errorPrefix = "Failed to infer shapes for " + _type + " layer with error: ";
         try {
-            inferShapesImpl(inShapes, params, blobs, outShapes);
+            inferShapesImpl(inBlobs, params, blobs, outShapes);
             return OK;
         } catch (const std::exception& ex) {
-            return InferenceEngine::DescriptionBuffer(GENERAL_ERROR, resp) << errorPrefix + ex.what();
+            return InferenceEngine::DescriptionBuffer(GENERAL_ERROR, resp) << ex.what();
         } catch (...) {
-            return InferenceEngine::DescriptionBuffer(UNEXPECTED) << errorPrefix + " unknown";
+            return InferenceEngine::DescriptionBuffer(UNEXPECTED) << "Unknown error";
         }
     }
 protected:
     std::string _type;
     details::LayerValidator::Ptr _validator;
+    std::vector<SizeVector> inShapes;
 };
 } // namespace ShapeInfer
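The key behavioural change above: inferShapes() now receives blobs rather than raw shapes, and the base class caches their dimensions in the protected inShapes member before delegating to inferShapesImpl(). The derivation is just the loop shown in the patch; a self-contained restatement (the <ie_blob.h> header name is assumed from the Inference Engine public API):

#include <ie_blob.h>
#include <vector>

std::vector<InferenceEngine::SizeVector> dimsOf(
        const std::vector<InferenceEngine::Blob::CPtr>& inBlobs) {
    std::vector<InferenceEngine::SizeVector> shapes;
    for (const auto& blob : inBlobs)
        shapes.push_back(blob->getTensorDesc().getDims());  // shape recovered from the blob's TensorDesc
    return shapes;
}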
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_concat_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_concat_shape_infer.hpp
index 8d183ea..0e3688b 100644
--- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_concat_shape_infer.hpp
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_concat_shape_infer.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -20,7 +20,7 @@ class ConcatShapeProp : public BuiltInShapeInferImpl {
 public:
     explicit ConcatShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
-    void inferShapesImpl(const std::vector<SizeVector>& inShapes,
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
                          const std::map<std::string, std::string>& params,
                          const std::map<std::string, Blob::Ptr>& blobs,
                          std::vector<SizeVector>& outShapes) override {
@@ -28,7 +28,7 @@ public:
         ConcatLayer concatLayer(lp);
         concatLayer.params = params;
         concatLayer.type = _type;
-        validate(&concatLayer, inShapes, params, blobs);
+        validate(&concatLayer, inBlobs, params, blobs);
         size_t sum(0);
         size_t axis = concatLayer._axis;
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_conv_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_conv_shape_infer.hpp
index 7c1751f..a42f81d 100644
--- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_conv_shape_infer.hpp
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_conv_shape_infer.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -24,7 +24,7 @@ class ConvShapeProp : public BuiltInShapeInferImpl {
 public:
     explicit ConvShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
-    void inferShapesImpl(const std::vector<SizeVector>& inShapes,
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
                          const std::map<std::string, std::string>& params,
                          const std::map<std::string, Blob::Ptr>& blobs,
                          std::vector<SizeVector>& outShapes) override {
@@ -32,49 +32,51 @@ public:
         ConvolutionLayer convLayer(lp);
         convLayer.params = params;
         convLayer.type = _type;
-        validate(&convLayer, inShapes, params, blobs);
+        validate(&convLayer, inBlobs, params, blobs);
-        float OH_temp, OW_temp;
         auto dims = inShapes[0];
+        auto dims_size = dims.size();
+        auto spacial_d_size = dims.size() - 2;
+        float* OD_temp = new float[spacial_d_size];
+        size_t* KDims = new size_t[spacial_d_size];
         size_t inputN = dims[0];
-        size_t IH = dims[2];
-        size_t IW = dims[3];
-        size_t KH = 0, KW = 0;
-        int PR = -1, PB = -1;
-        if (convLayer._dilation[Y_AXIS])
-            KH = (convLayer._kernel[Y_AXIS] - 1) * convLayer._dilation[Y_AXIS] + 1;
-        else
-            KH = convLayer._kernel[Y_AXIS];
-        if (convLayer._dilation[X_AXIS])
-            KW = (convLayer._kernel[X_AXIS] - 1) * convLayer._dilation[X_AXIS] + 1;
-        else
-            KW = convLayer._kernel[X_AXIS];
-        size_t SH = convLayer._stride[Y_AXIS];
-        size_t SW = convLayer._stride[X_AXIS];
-        size_t PH = convLayer._padding[Y_AXIS];
-        size_t PW = convLayer._padding[X_AXIS];
+        for (int i = 0; i < spacial_d_size; i++) {
+            if (convLayer._dilation[i])
+                KDims[i] = (convLayer._kernel[i] - 1) * convLayer._dilation[i] + 1;
+            else
+                KDims[i] = convLayer._kernel[i];
+        }
         size_t OC = convLayer._out_depth;
         std::string padType = convLayer._auto_pad;
         if (padType == "valid") {
-            OH_temp = std::ceil((IH - KH + 1.f) / SH);
-            OW_temp = std::ceil((IW - KW + 1.f) / SW);
+            for (int i = 0; i < spacial_d_size; i++)
+                OD_temp[i] = std::ceil((dims[dims_size - 1 - i] - KDims[i] + 1.f) / convLayer._stride[i]);
         } else if (padType == "same_upper") {
-            OH_temp = std::ceil(1.f * IH / SH);
-            OW_temp = std::ceil(1.f * IW / SW);
+            for (int i = 0; i < spacial_d_size; i++)
+                OD_temp[i] = std::ceil(1.f * dims[dims_size - 1 - i] / convLayer._stride[i]);
         } else if (padType == "same_lower") {
-            OH_temp = std::floor(1.f * IH / SH);
-            OW_temp = std::floor(1.f * IW / SW);
+            for (int i = 0; i < spacial_d_size; i++)
+                OD_temp[i] = std::floor(1.f * dims[dims_size - 1 - i] / convLayer._stride[i]);
         } else {
-            PR = convLayer._pads_end[X_AXIS];
-            PB = convLayer._pads_end[Y_AXIS];
-            OH_temp = std::floor(1.f * (IH + PH + PB - KH) / SH) + 1.f;
-            OW_temp = std::floor(1.f * (IW + PW + PR - KW) / SW) + 1.f;
+            for (int i = 0; i < spacial_d_size; i++) {
+                OD_temp[i] = std::floor(1.f * (dims[dims_size - 1 - i] +
+                             convLayer._padding[i] + convLayer._pads_end[i] - KDims[i]) /
+                             convLayer._stride[i]) + 1.f;
+            }
         }
-        if (OH_temp < 0 || OW_temp < 0)
-            THROW_IE_EXCEPTION << "New shapes " << details::dumpVec(dims) << " make output shape negative";
-        size_t OH = static_cast<size_t>(OH_temp);
-        size_t OW = static_cast<size_t>(OW_temp);
-        outShapes.push_back({inputN, OC, OH, OW});
+
+        for (int i = 0; i < spacial_d_size; i++)
+            if (OD_temp[i] < 0)
+                THROW_IE_EXCEPTION << "New shapes " << details::dumpVec(dims) << " make output shape negative";
+
+        SizeVector outShape = {inputN, OC};
+        for (int i = spacial_d_size - 1; i >= 0; i--)
+            outShape.push_back(static_cast<size_t>(OD_temp[i]));
+
+        outShapes.push_back(outShape);
+
+        delete[] OD_temp;
+        delete[] KDims;
     }
 };
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_crop_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_crop_shape_infer.hpp
index 91b72f2..b0bfa2f 100644
---
+++
b/inference-engine/src/inference_engine/shape_infer/built-in/ie_crop_shape_infer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -20,7 +20,7 @@ class CropShapeProp : public BuiltInShapeInferImpl { public: explicit CropShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {} - void inferShapesImpl(const std::vector& inShapes, + void inferShapesImpl(const std::vector& inBlobs, const std::map& params, const std::map& blobs, std::vector& outShapes) override { @@ -28,7 +28,7 @@ public: CropLayer cropLayer(lp); cropLayer.params = params; cropLayer.type = _type; - validate(&cropLayer, inShapes, params, blobs); + validate(&cropLayer, inBlobs, params, blobs); outShapes.push_back(inShapes[0]); if (inShapes.size() == 2) { diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_ctc_greedy_decoder_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_ctc_greedy_decoder_shape_infer.hpp index 29625ff..c18a597 100644 --- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_ctc_greedy_decoder_shape_infer.hpp +++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_ctc_greedy_decoder_shape_infer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -20,15 +20,16 @@ class CTCGreedyDecoderShapeProp : public BuiltInShapeInferImpl { public: explicit CTCGreedyDecoderShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {} - void inferShapesImpl(const std::vector& inShapes, + void inferShapesImpl(const std::vector& inBlobs, const std::map& params, const std::map& blobs, std::vector& outShapes) override { outShapes.clear(); LayerParams lp{}; CNNLayer cnnLayer(lp); - cnnLayer.params = params; cnnLayer.type = _type; - validate(&cnnLayer, inShapes, params, blobs); + cnnLayer.params = params; + cnnLayer.type = _type; + validate(&cnnLayer, inBlobs, params, blobs); outShapes.push_back({inShapes[0][1], inShapes[0][0], 1, 1}); } diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_deconv_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_deconv_shape_infer.hpp index c4f130a..2ddf5bd 100644 --- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_deconv_shape_infer.hpp +++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_deconv_shape_infer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -22,7 +22,7 @@ class DeconvShapeProp : public BuiltInShapeInferImpl { public: explicit DeconvShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {} - void inferShapesImpl(const std::vector& inShapes, + void inferShapesImpl(const std::vector& inBlobs, const std::map& params, const std::map& blobs, std::vector& outShapes) override { @@ -30,45 +30,45 @@ public: DeconvolutionLayer deconvLayer(lp); deconvLayer.params = params; deconvLayer.type = _type; - validate(&deconvLayer, inShapes, params, blobs); + validate(&deconvLayer, inBlobs, params, blobs); auto dims = inShapes[0]; + auto dims_size = dims.size(); + auto spacial_d_size = dims.size() - 2; + float* OD_temp = new float[spacial_d_size]; + size_t* KDims = new size_t[spacial_d_size]; size_t inputN = dims[0]; - size_t IH = dims[2]; - size_t IW = dims[3]; - int PR = -1, PB = -1; - float OHTemp, OWTemp, KH, KW; - if 
(deconvLayer._dilation[Y_AXIS]) - KH = (deconvLayer._kernel[Y_AXIS] - 1) * deconvLayer._dilation[Y_AXIS] + 1; - else - KH = deconvLayer._kernel[Y_AXIS]; - if (deconvLayer._dilation[X_AXIS]) - KW = (deconvLayer._kernel[X_AXIS] - 1) * deconvLayer._dilation[X_AXIS] + 1; - else - KW = deconvLayer._kernel[X_AXIS]; - size_t SH = deconvLayer._stride[Y_AXIS]; - size_t SW = deconvLayer._stride[X_AXIS]; - size_t PH = deconvLayer._padding[Y_AXIS]; - size_t PW = deconvLayer._padding[X_AXIS]; + for (int i = 0; i < spacial_d_size; i++) { + if (deconvLayer._dilation[i]) + KDims[i] = (deconvLayer._kernel[i] - 1) * deconvLayer._dilation[i] + 1; + else + KDims[i] = deconvLayer._kernel[i]; + } size_t OC = deconvLayer._out_depth; std::string padType = deconvLayer._auto_pad; if (padType == "valid") { - OHTemp = IH * SH + KH - 1; - OWTemp = IW * SW + KW - 1; + for (int i = 0; i < spacial_d_size; i++) + OD_temp[i] = dims[dims_size - 1 - i] * deconvLayer._stride[i] + KDims[i] - 1; } else if ((padType == "same_upper") || (padType == "same_lower")) { - OHTemp = IH * SH; - OWTemp = IW * SW; + for (int i = 0; i < spacial_d_size; i++) + OD_temp[i] = dims[dims_size - 1 - i] * deconvLayer._stride[i]; } else { - PR = deconvLayer._pads_end[X_AXIS]; - PB = deconvLayer._pads_end[Y_AXIS]; - OHTemp = SH * (IH - 1) + KH - PH - PB; - OWTemp = SW * (IW - 1) + KW - PW - PR; + for (int i = 0; i < spacial_d_size; i++) + OD_temp[i] = deconvLayer._stride[i] * (dims[dims_size - 1 - i] - 1) + + KDims[i] - deconvLayer._padding[i] - deconvLayer._pads_end[i]; } - if (OHTemp < 0 || OWTemp < 0) - THROW_IE_EXCEPTION << "New shapes " << details::dumpVec(dims) << " make output shape negative"; - size_t OH = static_cast(OHTemp); - size_t OW = static_cast(OWTemp); - outShapes.emplace_back(std::initializer_list{inputN, OC, OH, OW}); + for (int i = 0; i < spacial_d_size; i++) + if (OD_temp[i] < 0) + THROW_IE_EXCEPTION << "New shapes " << details::dumpVec(dims) << " make output shape negative"; + + SizeVector outShape = {inputN, OC}; + for (int i = spacial_d_size - 1; i >= 0; i--) + outShape.push_back(static_cast(OD_temp[i])); + + outShapes.emplace_back(outShape); + + delete[] OD_temp; + delete[] KDims; } }; diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_depth_to_space_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_depth_to_space_shape_infer.hpp new file mode 100644 index 0000000..9942c05 --- /dev/null +++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_depth_to_space_shape_infer.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ie_built_in_impl.hpp" +#include +#include +#include +#include + +namespace InferenceEngine { +namespace ShapeInfer { + +/** + *@brief Implementation of Shape inference for DepthToSpace layer + */ +class DepthToSpaceShapeProp : public BuiltInShapeInferImpl { +public: + explicit DepthToSpaceShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {} + + void inferShapesImpl(const std::vector& inBlobs, + const std::map& params, + const std::map& blobs, + std::vector& outShapes) override { + LayerParams lp{}; + DepthToSpaceLayer depthToSpaceLayer(lp); + depthToSpaceLayer.params = params; + depthToSpaceLayer.type = _type; + validate(&depthToSpaceLayer, inBlobs, params, blobs); + + unsigned int block_size = depthToSpaceLayer.block_size; + outShapes = {inShapes[0]}; + + outShapes[0][outShapes[0].size() - 1] = inShapes[0][inShapes[0].size() - 1] * block_size; + 
outShapes[0][outShapes[0].size() - 2] = inShapes[0][inShapes[0].size() - 2] * block_size;
+        outShapes[0][outShapes[0].size() - 3] = inShapes[0][inShapes[0].size() - 3] / block_size / block_size;
+    }
+};
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
+
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_detection_output_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_detection_output_shape_infer.hpp
index eff11ed..6055655 100644
--- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_detection_output_shape_infer.hpp
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_detection_output_shape_infer.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -22,7 +22,7 @@ class DetectionOutputShapeProp : public BuiltInShapeInferImpl {
 public:
     explicit DetectionOutputShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
-    void inferShapesImpl(const std::vector<SizeVector>& inShapes,
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
                          const std::map<std::string, std::string>& params,
                          const std::map<std::string, Blob::Ptr>& blobs,
                          std::vector<SizeVector>& outShapes) override {
@@ -30,7 +30,7 @@ public:
         CNNLayer cnnLayer(lp);
         cnnLayer.params = params;
         cnnLayer.type = _type;
-        validate(&cnnLayer, inShapes, params, blobs);
+        validate(&cnnLayer, inBlobs, params, blobs);
         int top_k = cnnLayer.GetParamAsInt("keep_top_k");
         outShapes.push_back({1, 1, static_cast<size_t>(top_k) * inShapes[0][0], 7});
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_eltwise_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_eltwise_shape_infer.hpp
index ce7248c..652f8ab 100644
--- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_eltwise_shape_infer.hpp
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_eltwise_shape_infer.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include
 namespace InferenceEngine {
 namespace ShapeInfer {
@@ -20,7 +21,7 @@ class EltWiseShapeProp : public BuiltInShapeInferImpl {
 public:
     explicit EltWiseShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
-    void inferShapesImpl(const std::vector<SizeVector>& inShapes,
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
                          const std::map<std::string, std::string>& params,
                          const std::map<std::string, Blob::Ptr>& blobs,
                          std::vector<SizeVector>& outShapes) override {
@@ -28,8 +29,23 @@ public:
         EltwiseLayer eltwiseLayer(lp);
         eltwiseLayer.params = params;
         eltwiseLayer.type = _type;
-        validate(&eltwiseLayer, inShapes, params, blobs);
-        outShapes.push_back(inShapes[0]);
+        validate(&eltwiseLayer, inBlobs, params, blobs);
+
+        if (inShapes.size() == 1) {
+            outShapes.push_back(inShapes[0]);
+        } else {
+            // broadcast: per-axis maximum where both inputs define the axis
+            // (axes aligned from the front), otherwise the defined input's extent
+            SizeVector outShape((std::max)(inShapes[0], inShapes[1]));
+            for (size_t ind = 0; ind < outShape.size(); ++ind) {
+                if (ind < inShapes[0].size() && ind < inShapes[1].size()) {
+                    outShape[ind] = (std::max)(inShapes[0][ind], inShapes[1][ind]);
+                } else if (ind >= inShapes[0].size()) {
+                    outShape[ind] = inShapes[1][ind];
+                } else {
+                    outShape[ind] = inShapes[0][ind];
+                }
+            }
+            outShapes.push_back(outShape);
+        }
     }
 };
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_equal_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_equal_shape_infer.hpp
index 9378aba..e21de0e 100644
---
a/inference-engine/src/inference_engine/shape_infer/built-in/ie_equal_shape_infer.hpp +++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_equal_shape_infer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -21,7 +21,7 @@ class EqualShapeProp : public BuiltInShapeInferImpl { public: explicit EqualShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {} - void inferShapesImpl(const std::vector& inShapes, const std::map& params, + void inferShapesImpl(const std::vector& inBlobs, const std::map& params, const std::map& blobs, std::vector& outShapes) override { outShapes = inShapes; } @@ -31,7 +31,7 @@ class DoNothingShapeProp : public BuiltInShapeInferImpl { public: explicit DoNothingShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {} - void inferShapesImpl(const std::vector& inShapes, const std::map& params, + void inferShapesImpl(const std::vector& inBlobs, const std::map& params, const std::map& blobs, std::vector& outShapes) override {} }; diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_expand_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_expand_shape_infer.hpp new file mode 100644 index 0000000..db2d687 --- /dev/null +++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_expand_shape_infer.hpp @@ -0,0 +1,39 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ie_built_in_impl.hpp" +#include +#include +#include +#include + +namespace InferenceEngine { +namespace ShapeInfer { + +/** + *@brief Implementation of Shape inference for Expand layer + */ +class ExpandShapeProp : public BuiltInShapeInferImpl { +public: + explicit ExpandShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {} + + void inferShapesImpl(const std::vector& inBlobs, + const std::map& params, + const std::map& blobs, + std::vector& outShapes) override { + LayerParams lp{}; + ExpandLayer unsqueezeLayer(lp); + unsqueezeLayer.params = params; + unsqueezeLayer.type = _type; + validate(&unsqueezeLayer, inBlobs, params, blobs); + + outShapes = {inShapes[0]}; + } +}; + +} // namespace ShapeInfer +} // namespace InferenceEngine + diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_fill_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_fill_shape_infer.hpp new file mode 100644 index 0000000..504d919 --- /dev/null +++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_fill_shape_infer.hpp @@ -0,0 +1,49 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ie_built_in_impl.hpp" +#include +#include +#include +#include + +namespace InferenceEngine { +namespace ShapeInfer { + +/** + *@brief Implementation of Shape inference for Fill layer + */ +class FillShapeProp : public BuiltInShapeInferImpl { +public: + explicit FillShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {} + + void inferShapesImpl(const std::vector& inBlobs, + const std::map& params, + const std::map& blobs, + std::vector& outShapes) override { + LayerParams lp{}; + FillLayer fillLayer(lp); + fillLayer.params = params; + fillLayer.type = _type; + validate(&fillLayer, inBlobs, params, blobs); + + auto dimsBlob = *inBlobs.begin(); + SizeVector shape; + SizeVector dims = dimsBlob->getTensorDesc().getDims(); + auto* buffer = 
dimsBlob->cbuffer().as(); + if (!buffer || dimsBlob->getTensorDesc().getPrecision() != Precision::I32) + THROW_IE_EXCEPTION << " Fill dimensions vector should be I32!"; + + for (int i = 0; i < dimsBlob->size(); i++) { + shape.push_back(buffer[i]); + } + outShapes = {shape}; + } +}; + +} // namespace ShapeInfer +} // namespace InferenceEngine + diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_flatten_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_flatten_shape_infer.hpp index bdde976..be42a6c 100644 --- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_flatten_shape_infer.hpp +++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_flatten_shape_infer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -24,15 +24,15 @@ class FlattenShapeProp : public BuiltInShapeInferImpl { public: explicit FlattenShapeProp(const std::string &type) : BuiltInShapeInferImpl(type) {} - void inferShapesImpl(const std::vector &inShapes, - const std::map ¶ms, - const std::map &blobs, - std::vector &outShapes) override { + void inferShapesImpl(const std::vector& inBlobs, + const std::map& params, + const std::map& blobs, + std::vector& outShapes) override { LayerParams lp{}; ReshapeLayer reshapeLayer(lp); reshapeLayer.params = params; reshapeLayer.type = _type; - validate(&reshapeLayer, inShapes, params, blobs); + validate(&reshapeLayer, inBlobs, params, blobs); auto inputShape = inShapes[0]; size_t inputShapeTotal = std::accumulate(inputShape.begin(), inputShape.end(), 1lu, std::multiplies()); diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_gather_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_gather_shape_infer.hpp index 41641cb..5a37378 100644 --- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_gather_shape_infer.hpp +++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_gather_shape_infer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -20,7 +20,7 @@ class GatherShapeProp : public BuiltInShapeInferImpl { public: explicit GatherShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {} - void inferShapesImpl(const std::vector& inShapes, + void inferShapesImpl(const std::vector& inBlobs, const std::map& params, const std::map& blobs, std::vector& outShapes) override { @@ -28,7 +28,7 @@ public: GatherLayer gatherLayer(lp); gatherLayer.params = params; gatherLayer.type = _type; - validate(&gatherLayer, inShapes, params, blobs); + validate(&gatherLayer, inBlobs, params, blobs); int axis = gatherLayer.axis; if (axis < 0) @@ -36,7 +36,7 @@ public: outShapes.resize(1); outShapes[0].resize(inShapes[0].size() + inShapes[1].size() - 1); - for (size_t i = 0; i < axis; i++) + for (int i = 0; i < axis; i++) outShapes[0][i] = inShapes[0][i]; for (size_t i = 0; i < inShapes[1].size(); i++) diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_gemm_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_gemm_shape_infer.hpp index 5cac2f5..f3474f1 100644 --- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_gemm_shape_infer.hpp +++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_gemm_shape_infer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 
2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -24,15 +24,16 @@ class GemmShapeProp : public BuiltInShapeInferImpl { public: explicit GemmShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {} - void inferShapesImpl(const std::vector& inShapes, + void inferShapesImpl(const std::vector& inBlobs, const std::map& params, const std::map& blobs, std::vector& outShapes) override { + // TODO: primitive does not support 5D tensor yet LayerParams lp{}; GemmLayer gemmLayer(lp); gemmLayer.params = params; gemmLayer.type = _type; - validate(&gemmLayer, inShapes, params, blobs); + validate(&gemmLayer, inBlobs, params, blobs); auto dims0 = inShapes[0]; auto dims1 = inShapes[1]; diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_inner_product_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_inner_product_shape_infer.hpp index d65a0d3..63160d0 100644 --- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_inner_product_shape_infer.hpp +++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_inner_product_shape_infer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -19,7 +19,7 @@ class InnerProductShapeProp : public BuiltInShapeInferImpl { public: explicit InnerProductShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {} - void inferShapesImpl(const std::vector& inShapes, + void inferShapesImpl(const std::vector& inBlobs, const std::map& params, const std::map& blobs, std::vector& outShapes) override { @@ -27,7 +27,7 @@ public: FullyConnectedLayer fcLayer(lp); fcLayer.params = params; fcLayer.type = _type; - validate(&fcLayer, inShapes, params, blobs); + validate(&fcLayer, inBlobs, params, blobs); size_t OC, ON; ON = inShapes[0][0]; OC = fcLayer._out_num; diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_interp_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_interp_shape_infer.hpp index ebca8ff..a7efae0 100644 --- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_interp_shape_infer.hpp +++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_interp_shape_infer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -11,6 +11,7 @@ #include #include #include +#include namespace InferenceEngine { namespace ShapeInfer { @@ -22,7 +23,7 @@ class InterpShapeProp : public BuiltInShapeInferImpl { public: explicit InterpShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {} - void inferShapesImpl(const std::vector& inShapes, + void inferShapesImpl(const std::vector& inBlobs, const std::map& params, const std::map& blobs, std::vector& outShapes) override { @@ -30,60 +31,67 @@ public: CNNLayer cnnLayer(lp); cnnLayer.params = params; cnnLayer.type = _type; - validate(&cnnLayer, inShapes, params, blobs); - auto factor = static_cast(cnnLayer.GetParamAsInt("factor", 0)); - auto shrink_factor = static_cast(cnnLayer.GetParamAsInt("shrink_factor", 0)); - auto zoom_factor = static_cast(cnnLayer.GetParamAsInt("zoom_factor", 0)); - auto height = static_cast(cnnLayer.GetParamAsInt("height", 0)); - auto width = static_cast(cnnLayer.GetParamAsInt("width", 0)); + validate(&cnnLayer, inBlobs, params, blobs); + SizeVector outShape; + if (inBlobs.size() == 2) { + auto* buffer = inBlobs[1]->cbuffer().as(); + if (buffer 
!= nullptr) { + for (int i = 0; i < inBlobs[1]->size(); i++) { + outShape.push_back(static_cast(buffer[i])); + } + } else { + THROW_IE_EXCEPTION << "Second input must have allocated data"; + } + } else { + auto factor = cnnLayer.GetParamAsFloat("factor", 0); + auto shrink_factor = cnnLayer.GetParamAsFloat("shrink_factor", 0); + auto zoom_factor = cnnLayer.GetParamAsFloat("zoom_factor", 0); + auto height = static_cast(cnnLayer.GetParamAsInt("height", 0)); + auto width = static_cast(cnnLayer.GetParamAsInt("width", 0)); + + auto IS_ZERO = [](float value) { + return std::fabs(value) < std::numeric_limits::epsilon(); + }; + + bool noFactor = IS_ZERO(zoom_factor) && IS_ZERO(shrink_factor) && IS_ZERO(factor); - // TODO: move to validators - if (!zoom_factor && !shrink_factor && !factor && (!height || !width)) { - THROW_IE_EXCEPTION - << "Can't reshape without factor, or target resolution. " - << "Supported attributes: factor, shrink_factor, zoom_factor, height, width"; - } size_t N, C, H, W; - // TODO: validate that only one input N = inShapes[0][0]; C = inShapes[0][1]; H = inShapes[0][2]; W = inShapes[0][3]; + auto SETW = [&width, &W](size_t value) { + if (width) { + W = width; + } else { + W = value; + } + }; - auto SETW = [&width, &W](size_t value) { - if (width) { - W = width; - } else { - W = value; - } - }; + auto SETH = [&height, &H](size_t value) { + if (height) { + H = height; + } else { + H = value; + } + }; - auto SETH = [&height, &H](size_t value) { - if (height) { - H = height; + if (noFactor) { + SETW(width); + SETH(height); } else { - H = value; - } - }; - - if (factor) { - SETH(H * factor); - SETW(W * factor); - } else if (shrink_factor || zoom_factor) { - if (shrink_factor) { - SETH(H / shrink_factor); - SETW(W / shrink_factor); - } - if (zoom_factor) { - SETH(H * zoom_factor); - SETW(W * zoom_factor); + float actualFactor = factor; + if (!IS_ZERO(shrink_factor) || !IS_ZERO(zoom_factor)) { + if (!IS_ZERO(zoom_factor)) actualFactor = zoom_factor; + if (!IS_ZERO(shrink_factor)) actualFactor /= shrink_factor; + } + SETW(W * actualFactor); + SETH(H * actualFactor); } - } else { - SETW(width); - SETH(height); + outShape = {N, C, H, W}; } - outShapes.push_back({N, C, H, W}); + outShapes.push_back(outShape); } }; diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_pad_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_pad_shape_infer.hpp index 2fb1c49..424ab39 100644 --- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_pad_shape_infer.hpp +++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_pad_shape_infer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -20,7 +20,7 @@ class PadShapeProp : public BuiltInShapeInferImpl { public: explicit PadShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {} - void inferShapesImpl(const std::vector& inShapes, + void inferShapesImpl(const std::vector& inBlobs, const std::map& params, const std::map& blobs, std::vector& outShapes) override { @@ -28,7 +28,7 @@ public: PadLayer padLayer(lp); padLayer.params = params; padLayer.type = _type; - validate(&padLayer, inShapes, params, blobs); + validate(&padLayer, inBlobs, params, blobs); outShapes.push_back(inShapes[0]); for (size_t i = 0; i < outShapes[0].size(); i++) { diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_permute_shape_infer.hpp 
b/inference-engine/src/inference_engine/shape_infer/built-in/ie_permute_shape_infer.hpp index 46f1456..cdfa2d7 100644 --- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_permute_shape_infer.hpp +++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_permute_shape_infer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -22,7 +22,7 @@ class PermuteShapeProp : public BuiltInShapeInferImpl { public: explicit PermuteShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {} - void inferShapesImpl(const std::vector& inShapes, + void inferShapesImpl(const std::vector& inBlobs, const std::map& params, const std::map& blobs, std::vector& outShapes) override { @@ -30,7 +30,7 @@ public: CNNLayer permuteLayer(lp); permuteLayer.params = params; permuteLayer.type = _type; - validate(&permuteLayer, inShapes, params, blobs); + validate(&permuteLayer, inBlobs, params, blobs); std::vector order; std::vector layerOrder = permuteLayer.GetParamAsInts("order"); diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_pool_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_pool_shape_infer.hpp index 4850c8a..4344a42 100644 --- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_pool_shape_infer.hpp +++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_pool_shape_infer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -22,7 +22,7 @@ class PoolingShapeProp : public BuiltInShapeInferImpl { public: explicit PoolingShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {} - void inferShapesImpl(const std::vector& inShapes, + void inferShapesImpl(const std::vector& inBlobs, const std::map& params, const std::map& blobs, std::vector& outShapes) override { @@ -30,32 +30,27 @@ public: PoolingLayer poolLayer(lp); poolLayer.params = params; poolLayer.type = _type; - validate(&poolLayer, inShapes, params, blobs); + validate(&poolLayer, inBlobs, params, blobs); - float OHTemp = 1.f, OWTemp = 1.f; auto dims = inShapes[0]; - int PR = -1, PB = -1; + auto dims_size = dims.size(); + auto spacial_d_size = dims.size() - 2; + float* OD_temp = new float[spacial_d_size]; + for (int i = 0; i < spacial_d_size; i++) + OD_temp[i] = 1.f; size_t inputN = dims[0]; size_t IC = dims[1]; - size_t IH = dims[2]; - size_t IW = dims[3]; - size_t KH = poolLayer._kernel[Y_AXIS]; - size_t KW = poolLayer._kernel[X_AXIS]; - size_t SH = poolLayer._stride[Y_AXIS]; - size_t SW = poolLayer._stride[X_AXIS]; - size_t PH = poolLayer._padding[Y_AXIS]; - size_t PW = poolLayer._padding[X_AXIS]; std::string padType = poolLayer._auto_pad; if (padType == "valid") { - OHTemp = std::ceil((IH - KH + 1.f) / SH); - OWTemp = std::ceil((IW - KW + 1.f) / SW); + for (int i = 0; i < spacial_d_size; i++) + OD_temp[i] = std::ceil((dims[dims_size - 1 - i] - poolLayer._kernel[i] + 1.f) / poolLayer._stride[i]); } else if (padType == "same_upper") { - OHTemp = std::ceil(1.f * IH / SH); - OWTemp = std::ceil(1.f * IW / SW); + for (int i = 0; i < spacial_d_size; i++) + OD_temp[i] = std::ceil(1.f * dims[dims_size - 1 - i] / poolLayer._stride[i]); } else if (padType == "same_lower") { - OHTemp = std::floor(1.f * IH / SH); - OWTemp = std::floor(1.f * IW / SW); + for (int i = 0; i < spacial_d_size; i++) + OD_temp[i] = std::floor(1.f * dims[dims_size - 1 - i] / 
poolLayer._stride[i]); } else { auto it = std::find_if( poolLayer.params.begin(), @@ -67,25 +62,31 @@ public: if (it != poolLayer.params.end()) { if (it->second == "floor") isCeil = false; } - PR = poolLayer._pads_end[X_AXIS]; - PB = poolLayer._pads_end[Y_AXIS]; - OHTemp += 1.f * (IH + PH + PB - KH) / SH; - OWTemp += 1.f * (IW + PW + PR - KW) / SW; + for (int i = 0; i < spacial_d_size; i++) + OD_temp[i] += 1.f * (dims[dims_size - 1 - i] + poolLayer._padding[i] + + poolLayer._pads_end[i] - poolLayer._kernel[i]) / poolLayer._stride[i]; if (isCeil) { - OHTemp = std::ceil(OHTemp); - OWTemp = std::ceil(OWTemp); + for (int i = 0; i < spacial_d_size; i++) + OD_temp[i] = std::ceil(OD_temp[i]); } else { - OHTemp = std::floor(OHTemp); - OWTemp = std::floor(OWTemp); + for (int i = 0; i < spacial_d_size; i++) + OD_temp[i] = std::floor(OD_temp[i]); } - if ((OHTemp - 1) * SH >= IH + PH) --OHTemp; - if ((OWTemp - 1) * SW >= IW + PW) --OWTemp; + for (int i = 0; i < spacial_d_size; i++) + if ((OD_temp[i] - 1) * poolLayer._stride[i] >= dims[dims_size - 1 - i] + + poolLayer._padding[i]) --OD_temp[i]; } - if (OHTemp < 0 || OWTemp < 0) - THROW_IE_EXCEPTION << "New shapes " << details::dumpVec(dims) << " make output shape negative"; - size_t OH = static_cast(OHTemp); - size_t OW = static_cast(OWTemp); - outShapes.emplace_back(std::initializer_list{inputN, IC, OH, OW}); + for (int i = 0; i < spacial_d_size; i++) + if (OD_temp[i] < 0) + THROW_IE_EXCEPTION << "New shapes " << details::dumpVec(dims) << " make output shape negative"; + + SizeVector outShape = {inputN, IC}; + for (int i = spacial_d_size - 1; i >= 0; i--) + outShape.push_back(static_cast(OD_temp[i])); + + outShapes.emplace_back(outShape); + + delete[] OD_temp; } }; diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_priorbox_clustered_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_priorbox_clustered_shape_infer.hpp index 1aaf3e4..b716193 100644 --- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_priorbox_clustered_shape_infer.hpp +++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_priorbox_clustered_shape_infer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -22,17 +22,19 @@ class PriorBoxClusteredShapeProp : public BuiltInShapeInferImpl { public: explicit PriorBoxClusteredShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {} - void inferShapesImpl(const std::vector& inShapes, + void inferShapesImpl(const std::vector& inBlobs, const std::map& params, const std::map& blobs, std::vector& outShapes) override { - LayerParams lp{}; + LayerParams lp{}; CNNLayer cnnLayer(lp); cnnLayer.params = params; cnnLayer.type = _type; - validate(&cnnLayer, inShapes, params, blobs); + validate(&cnnLayer, inBlobs, params, blobs); std::vector widths = cnnLayer.GetParamAsFloats("width", {}); - size_t res_prod = widths.size() * inShapes[0][2] * inShapes[0][3] * 4; + size_t res_prod = widths.size() * 4; + for (int i = 2; i < inShapes[0].size(); i++) + res_prod *= inShapes[0][i]; outShapes.push_back({1, 2, res_prod}); } }; diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_priorbox_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_priorbox_shape_infer.hpp index 03a8d9c..867651d 100644 --- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_priorbox_shape_infer.hpp +++ 
b/inference-engine/src/inference_engine/shape_infer/built-in/ie_priorbox_shape_infer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -22,7 +22,7 @@ class PriorBoxShapeProp : public BuiltInShapeInferImpl { public: explicit PriorBoxShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {} - void inferShapesImpl(const std::vector& inShapes, + void inferShapesImpl(const std::vector& inBlobs, const std::map& params, const std::map& blobs, std::vector& outShapes) override { @@ -30,7 +30,7 @@ public: CNNLayer cnnLayer(lp); cnnLayer.params = params; cnnLayer.type = _type; - validate(&cnnLayer, inShapes, params, blobs); + validate(&cnnLayer, inBlobs, params, blobs); std::vector min_sizes = cnnLayer.GetParamAsFloats("min_size", {}); std::vector max_sizes = cnnLayer.GetParamAsFloats("max_size", {}); bool flip = static_cast(cnnLayer.GetParamAsInt("flip")); @@ -45,7 +45,9 @@ public: num_priors = (flip ? 2 : 1) * aspect_ratios.size() + min_sizes.size() - 1; } - size_t res_prod = num_priors * inShapes[0][2] * inShapes[0][3] * 4; + size_t res_prod = num_priors * 4; + for (int i = 2; i < inShapes[0].size(); i++) + res_prod *= inShapes[0][i]; outShapes.push_back({1, 2, res_prod}); } }; diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_proposal_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_proposal_shape_infer.hpp index 8058500..6a09fe5 100644 --- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_proposal_shape_infer.hpp +++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_proposal_shape_infer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -20,7 +20,7 @@ class ProposalShapeProp : public BuiltInShapeInferImpl { public: explicit ProposalShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {} - void inferShapesImpl(const std::vector& inShapes, + void inferShapesImpl(const std::vector& inBlobs, const std::map& params, const std::map& blobs, std::vector& outShapes) override { @@ -28,8 +28,8 @@ public: CNNLayer cnnLayer(lp); cnnLayer.params = params; cnnLayer.type = _type; - validate(&cnnLayer, inShapes, params, blobs); - size_t post_nms_topn = cnnLayer.GetParamAsInt("post_nms_topn"); + validate(&cnnLayer, inBlobs, params, blobs); + size_t post_nms_topn = static_cast(cnnLayer.GetParamAsInt("post_nms_topn")); outShapes.push_back({inShapes[0][0] * post_nms_topn, 5}); } }; diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_psroi_pooling_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_psroi_pooling_shape_infer.hpp index f6ce94e..c53feb2 100644 --- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_psroi_pooling_shape_infer.hpp +++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_psroi_pooling_shape_infer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -22,7 +22,7 @@ class PSRoiPoolingShapeProp : public BuiltInShapeInferImpl { public: explicit PSRoiPoolingShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {} - void inferShapesImpl(const std::vector& inShapes, + void inferShapesImpl(const std::vector& inBlobs, const std::map& params, const std::map& blobs, std::vector& outShapes) override { @@ -30,7 +30,7 @@ 
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_psroi_pooling_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_psroi_pooling_shape_infer.hpp
index f6ce94e..c53feb2 100644
--- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_psroi_pooling_shape_infer.hpp
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_psroi_pooling_shape_infer.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -22,7 +22,7 @@ class PSRoiPoolingShapeProp : public BuiltInShapeInferImpl {
 public:
     explicit PSRoiPoolingShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
 
-    void inferShapesImpl(const std::vector<SizeVector>& inShapes,
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
                          const std::map<std::string, std::string>& params,
                          const std::map<std::string, Blob::Ptr>& blobs,
                          std::vector<SizeVector>& outShapes) override {
@@ -30,7 +30,7 @@ public:
         CNNLayer cnnLayer(lp);
         cnnLayer.params = params;
         cnnLayer.type = _type;
-        validate(&cnnLayer, inShapes, params, blobs);
+        validate(&cnnLayer, inBlobs, params, blobs);
         size_t output_dim = static_cast<size_t>(cnnLayer.GetParamAsInt("output_dim"));
         size_t group_size = static_cast<size_t>(cnnLayer.GetParamAsInt("group_size"));
         outShapes.push_back({inShapes[1][0], output_dim, group_size, group_size});
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_spatial_transformer_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_quantize_shape_infer.hpp
similarity index 53%
rename from inference-engine/src/inference_engine/shape_infer/built-in/ie_spatial_transformer_shape_infer.hpp
rename to inference-engine/src/inference_engine/shape_infer/built-in/ie_quantize_shape_infer.hpp
index 8548c92..5a8ee08 100644
--- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_spatial_transformer_shape_infer.hpp
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_quantize_shape_infer.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -6,31 +6,34 @@
 #include <string>
 #include "ie_built_in_impl.hpp"
-#include <cmath>
 #include <map>
 #include <memory>
 #include <vector>
 #include <functional>
+#include <algorithm>
+#include <ie_layers.h>
+#include <utility>
 
 namespace InferenceEngine {
 namespace ShapeInfer {
 
 /**
- *@brief Implementation of Shape inference for SpatialTransformer layer
+ *@brief Implementation of Shape inference for quantize layer
  */
-class SpatialTransformerShapeProp : public BuiltInShapeInferImpl {
+class QuantizeShapeProp : public BuiltInShapeInferImpl {
 public:
-    explicit SpatialTransformerShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
+    explicit QuantizeShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
 
-    void inferShapesImpl(const std::vector<SizeVector>& inShapes,
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
                          const std::map<std::string, std::string>& params,
                          const std::map<std::string, Blob::Ptr>& blobs,
                          std::vector<SizeVector>& outShapes) override {
         LayerParams lp{};
-        CNNLayer cnnLayer(lp);
-        cnnLayer.params = params;
-        cnnLayer.type = _type;
-        validate(&cnnLayer, inShapes, params, blobs);
+        QuantizeLayer quantizeLayer(lp);
+        quantizeLayer.params = params;
+        quantizeLayer.type = _type;
+        validate(&quantizeLayer, inBlobs, params, blobs);
+
+        outShapes.push_back(inShapes[0]);
     }
 };
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_range_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_range_shape_infer.hpp
new file mode 100644
index 0000000..4719f04
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_range_shape_infer.hpp
@@ -0,0 +1,51 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_built_in_impl.hpp"
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include <cmath>
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Shape inference for Range layer
+ */
+class RangeShapeProp : public BuiltInShapeInferImpl {
+public:
+    explicit RangeShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
+
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
+                         const std::map<std::string, std::string>& params,
+                         const std::map<std::string, Blob::Ptr>& blobs,
+                         std::vector<SizeVector>& outShapes) override {
+        LayerParams lp{};
+        RangeLayer rangeLayer(lp);
+        rangeLayer.params = params;
+        rangeLayer.type = _type;
+        validate(&rangeLayer, inBlobs, params, blobs);
+
+        const size_t RANGE_START = 0;
+        const size_t RANGE_LIMIT = 1;
+        const size_t RANGE_DELTA = 2;
+
+        float start = (inBlobs[RANGE_START]->cbuffer().as<float*>() +
+                       inBlobs[RANGE_START]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0];
+        float limit = (inBlobs[RANGE_LIMIT]->cbuffer().as<float*>() +
+                       inBlobs[RANGE_LIMIT]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0];
+        float delta = (inBlobs[RANGE_DELTA]->cbuffer().as<float*>() +
+                       inBlobs[RANGE_DELTA]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0];
+        size_t work_amount_dst = std::floor(std::abs((limit - start) / delta));
+        outShapes = {{work_amount_dst}};
+    }
+};
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
+
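Range is one of the new shape-infer implementations that must read the actual input blobs, not just their shapes: the output length depends on the start/limit/delta values. A standalone sketch of the rule, with made-up values:

    #include <cmath>
    #include <cstddef>
    #include <iostream>

    // Sketch of the Range length rule above: floor(|limit - start| / |delta|).
    int main() {
        float start = 3.0f, limit = 18.0f, delta = 3.0f;  // hypothetical inputs
        size_t len = static_cast<size_t>(std::floor(std::abs((limit - start) / delta)));
        std::cout << "Range output shape: {" << len << "}\n";  // {5}: 3, 6, 9, 12, 15
    }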
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_region_yolo_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_region_yolo_shape_infer.hpp
index 78847a0..bed8123 100644
--- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_region_yolo_shape_infer.hpp
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_region_yolo_shape_infer.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -22,22 +22,43 @@ class RegionYoloShapeProp : public BuiltInShapeInferImpl {
 public:
     explicit RegionYoloShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
 
-    void inferShapesImpl(const std::vector<SizeVector>& inShapes,
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
                          const std::map<std::string, std::string>& params,
                          const std::map<std::string, Blob::Ptr>& blobs,
                          std::vector<SizeVector>& outShapes) override {
         LayerParams lp{};
-        CNNLayer cnnLayer(lp);
-        cnnLayer.params = params;
-        cnnLayer.type = _type;
-        validate(&cnnLayer, inShapes, params, blobs);
+        CNNLayer layer(lp);
+        layer.params = params;
+        int classes;
+        int coords;
+        int num;
+        bool do_softmax;
+        std::vector<int> mask;
+        classes = layer.GetParamAsInt("classes", 1);
+        coords = layer.GetParamAsInt("coords", 1);
+        num = layer.GetParamAsInt("num", 1);
+        do_softmax = static_cast<bool>(layer.GetParamAsInt("do_softmax", 1));
+        mask = layer.GetParamAsInts("mask", {});
+        unsigned int axis = layer.GetParamAsUInt("axis", 1);
+        int end_axis = layer.GetParamAsInt("end_axis", 1);
+        if (end_axis < 0) end_axis += inShapes[0].size();
+
         SizeVector outShape;
-        outShape.push_back(inShapes[0][0]);
-        size_t mul(1);
-        for (size_t i = 1; i < inShapes[0].size(); i++) {
-            mul *= inShapes[0][i];
+        if (do_softmax) {
+            size_t flat_dim = 1;
+            for (size_t i = 0; i < axis; i++) {
+                outShape.push_back(inShapes[0][i]);
+            }
+            for (size_t i = axis; i < end_axis + 1; i++) {
+                flat_dim *= inShapes[0][i];
+            }
+            outShape.push_back(flat_dim);
+            for (size_t i = end_axis + 1; i < inShapes[0].size(); i++) {
+                outShape.push_back(inShapes[0][i]);
+            }
+        } else {
+            outShape = {inShapes[0][0], (classes + coords + 1) * mask.size(), inShapes[0][2], inShapes[0][3]};
         }
-        outShape.push_back(mul);
         outShapes.push_back({outShape});
     }
 };
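In the do_softmax branch above, the dimensions in [axis, end_axis] collapse into a single flattened dimension while the rest pass through. A small sketch with hypothetical values:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Sketch of the do_softmax flattening: dims [axis, end_axis] become one.
    int main() {
        std::vector<size_t> in = {1, 255, 13, 13};   // hypothetical input
        size_t axis = 1, end_axis = 3;               // after negative-axis wrap
        std::vector<size_t> out(in.begin(), in.begin() + axis);
        size_t flat = 1;
        for (size_t i = axis; i <= end_axis; i++) flat *= in[i];
        out.push_back(flat);
        std::cout << "{" << out[0] << ", " << out[1] << "}\n";  // {1, 43095}
    }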
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_reorg_yolo_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_reorg_yolo_shape_infer.hpp
index 82ffafa..7ae0a80 100644
--- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_reorg_yolo_shape_infer.hpp
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_reorg_yolo_shape_infer.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -22,7 +22,7 @@ class ReorgYoloShapeProp : public BuiltInShapeInferImpl {
 public:
     explicit ReorgYoloShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
 
-    void inferShapesImpl(const std::vector<SizeVector>& inShapes,
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
                          const std::map<std::string, std::string>& params,
                          const std::map<std::string, Blob::Ptr>& blobs,
                          std::vector<SizeVector>& outShapes) override {
@@ -30,7 +30,7 @@ public:
         CNNLayer cnnLayer(lp);
         cnnLayer.params = params;
         cnnLayer.type = _type;
-        validate(&cnnLayer, inShapes, params, blobs);
+        validate(&cnnLayer, inBlobs, params, blobs);
         size_t stride = static_cast<size_t>(cnnLayer.GetParamAsInt("stride"));
         SizeVector outShape;
         for (size_t i = 0; i < inShapes[0].size(); i++) {
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_resample_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_resample_shape_infer.hpp
index 8e67ccf..fe06a46 100644
--- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_resample_shape_infer.hpp
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_resample_shape_infer.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -22,7 +22,7 @@ class ResampleShapeProp : public BuiltInShapeInferImpl {
 public:
     explicit ResampleShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
 
-    void inferShapesImpl(const std::vector<SizeVector>& inShapes,
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
                          const std::map<std::string, std::string>& params,
                          const std::map<std::string, Blob::Ptr>& blobs,
                          std::vector<SizeVector>& outShapes) override {
@@ -30,10 +30,24 @@ public:
         CNNLayer cnnLayer(lp);
         cnnLayer.params = params;
         cnnLayer.type = _type;
-        validate(&cnnLayer, inShapes, params, blobs);
-        // TODO: validate param and number of inputs (1)
-        auto scale = static_cast<size_t>(cnnLayer.GetParamAsInt("factor"));
-        outShapes.push_back({inShapes[0][0], inShapes[0][1], inShapes[0][2] * scale, inShapes[0][3] * scale});
+        validate(&cnnLayer, inBlobs, params, blobs);
+        SizeVector outShape;
+        if (inBlobs.size() == 2) {
+            auto* buffer = inBlobs[1]->cbuffer().as<float*>();
+            if (buffer != nullptr) {
+                for (int i = 0; i < inBlobs[1]->size(); i++) {
+                    outShape.push_back(static_cast<size_t>(buffer[i]));
+                }
+            } else {
+                THROW_IE_EXCEPTION << "Second input must have allocated data";
+            }
+        } else {
+            auto scale = static_cast<size_t>(cnnLayer.GetParamAsInt("factor"));
+            outShape = {inShapes[0][0], inShapes[0][1]};
+            for (int i = 2; i < inShapes[0].size(); i++)
+                outShape.push_back(inShapes[0][i] * scale);
+        }
+        outShapes.push_back(outShape);
     }
 };
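Resample now has two modes: an explicit output shape read from a second input blob, or a "factor" fallback that scales every spatial dimension. A sketch of the fallback arithmetic (shapes are illustrative):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Factor-mode sketch: batch and channels pass through, spatial dims scale.
    int main() {
        std::vector<size_t> in = {1, 3, 60, 80};  // hypothetical input shape
        size_t factor = 2;
        std::vector<size_t> out = {in[0], in[1]};
        for (size_t i = 2; i < in.size(); i++) out.push_back(in[i] * factor);
        for (size_t d : out) std::cout << d << " ";  // 1 3 120 160
    }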
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_reshape_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_reshape_shape_infer.hpp
index 97b6571..d586f3c 100644
--- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_reshape_shape_infer.hpp
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_reshape_shape_infer.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -6,6 +6,7 @@
 #include <string>
 #include "ie_built_in_impl.hpp"
+#include "precision_utils.h"
 #include <map>
 #include <memory>
 #include <vector>
@@ -22,22 +23,48 @@ namespace ShapeInfer {
  */
 class ReshapeShapeProp : public BuiltInShapeInferImpl {
 public:
-    explicit ReshapeShapeProp(const std::string &type) : BuiltInShapeInferImpl(type) {}
+    explicit ReshapeShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
 
-    void inferShapesImpl(const std::vector<SizeVector> &inShapes,
-                         const std::map<std::string, std::string> &params,
-                         const std::map<std::string, Blob::Ptr> &blobs,
-                         std::vector<SizeVector> &outShapes) override {
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
+                         const std::map<std::string, std::string>& params,
+                         const std::map<std::string, Blob::Ptr>& blobs,
+                         std::vector<SizeVector>& outShapes) override {
         LayerParams lp{};
         ReshapeLayer reshapeLayer(lp);
         reshapeLayer.params = params;
         reshapeLayer.type = _type;
-        validate(&reshapeLayer, inShapes, params, blobs);
+        validate(&reshapeLayer, inBlobs, params, blobs);
 
-        auto inputShape = inShapes[0];
-        size_t inputShapeTotal = std::accumulate(inputShape.begin(), inputShape.end(), 1lu, std::multiplies<size_t>());
         SizeVector outShape;
-        std::vector<int> reshapeMask = reshapeLayer.shape;
+        std::vector<int> reshapeMask;
+        if (inBlobs.size() == 2) {
+            if (inBlobs[1]->precision() == Precision::FP32) {
+                auto* buffer = inBlobs[1]->cbuffer().as<float*>();
+                if (buffer != nullptr) {
+                    for (int i = 0; i < inBlobs[1]->size(); i++) {
+                        reshapeMask.push_back(static_cast<int>(buffer[i]));
+                    }
+                } else {
+                    THROW_IE_EXCEPTION << "Second input must have allocated data";
+                }
+            } else if (inBlobs[1]->precision() == Precision::FP16) {
+                auto* buffer = inBlobs[1]->cbuffer().as<ie_fp16*>();
+                if (buffer != nullptr) {
+                    for (int i = 0; i < inBlobs[1]->size(); i++) {
+                        reshapeMask.push_back(static_cast<int>(PrecisionUtils::f16tof32(buffer[i])));
+                    }
+                } else {
+                    THROW_IE_EXCEPTION << "Second input must have allocated data";
+                }
+            } else {
+                THROW_IE_EXCEPTION << "Second input has unsupported precision";
+            }
+        } else {
+            reshapeMask = reshapeLayer.shape;
+        }
+        auto inputShape = inShapes[0];
+        size_t inputShapeTotal = std::accumulate(inputShape.begin(), inputShape.end(), 1lu,
+                                                 std::multiplies<size_t>());
 
         if (reshapeMask.empty()) {
             outShape = {inputShapeTotal};
@@ -60,7 +87,8 @@ public:
                 outShape.push_back(reshapeMask[i]);
             }
         }
-        size_t outputShapeTotal = std::accumulate(outShape.begin(), outShape.end(), 1lu, std::multiplies<size_t>());
+        size_t outputShapeTotal = std::accumulate(outShape.begin(), outShape.end(), 1lu,
+                                                  std::multiplies<size_t>());
         if (inputShapeTotal != outputShapeTotal)
             THROW_IE_EXCEPTION << "Invalid reshape mask (dim attribute): number of elements in input: "
                                << details::dumpVec(inputShape) << " and output: " << details::dumpVec(outShape)
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_reverse_sequence_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_reverse_sequence_shape_infer.hpp
new file mode 100644
index 0000000..858ffa6
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_reverse_sequence_shape_infer.hpp
@@ -0,0 +1,39 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_built_in_impl.hpp"
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Shape inference for ReverseSequence layer
+ */
+class ReverseSequenceShapeProp : public BuiltInShapeInferImpl {
+public:
+    explicit ReverseSequenceShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
+
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
+                         const std::map<std::string, std::string>& params,
+                         const std::map<std::string, Blob::Ptr>& blobs,
+                         std::vector<SizeVector>& outShapes) override {
+        LayerParams lp{};
+        ReverseSequenceLayer reverseSequenceLayer(lp);
+        reverseSequenceLayer.params = params;
+        reverseSequenceLayer.type = _type;
+        validate(&reverseSequenceLayer, inBlobs, params, blobs);
+
+        outShapes = {inShapes[0]};
+    }
+};
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
+
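The reshape mask resolved above follows the usual convention where a -1 entry absorbs whatever element count remains. A minimal sketch of that resolution step (shapes and mask are hypothetical):

    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <numeric>
    #include <vector>

    // Sketch of -1 handling in a reshape mask: the unknown dim is total/known.
    int main() {
        std::vector<size_t> in = {2, 3, 4};
        std::vector<int> mask = {6, -1};
        size_t total = std::accumulate(in.begin(), in.end(), size_t{1}, std::multiplies<size_t>());
        size_t known = 1;
        for (int m : mask) if (m > 0) known *= m;
        std::vector<size_t> out;
        for (int m : mask) out.push_back(m == -1 ? total / known : static_cast<size_t>(m));
        std::cout << out[0] << " " << out[1] << "\n";  // 6 4
    }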
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_rnn_cell_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_rnn_cell_shape_infer.hpp
new file mode 100644
index 0000000..4869b73
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_rnn_cell_shape_infer.hpp
@@ -0,0 +1,46 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <debug.h>
+#include "ie_built_in_impl.hpp"
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Shape inference for RNN/GRU/LSTM cell layers
+ */
+template <class CELL, int S>
+class RNNBaseCellShapeProp : public BuiltInShapeInferImpl {
+public:
+    explicit RNNBaseCellShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
+
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
+                         const std::map<std::string, std::string>& params,
+                         const std::map<std::string, Blob::Ptr>& blobs,
+                         std::vector<SizeVector>& outShapes) override {
+        LayerParams lp{};
+        CELL cnnLayer(lp);
+        cnnLayer.params = params;
+        cnnLayer.type = _type;
+        validate(&cnnLayer, inBlobs, params, blobs);
+
+        auto state_dims = inShapes[1];
+        for (int i = 0; i < S; i++)
+            outShapes.push_back(state_dims);
+    }
+};
+
+using RNNCellShapeProp = RNNBaseCellShapeProp<RNNCell, 1>;
+using GRUCellShapeProp = RNNBaseCellShapeProp<GRUCell, 1>;
+using LSTMCellShapeProp = RNNBaseCellShapeProp<LSTMCell, 2>;
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
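The second template parameter is the number of state tensors the cell carries, so an LSTM emits two state-shaped outputs while RNN and GRU emit one. A toy model of that fan-out (names and shapes are illustrative, not from the patch):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Minimal sketch: a cell with S states emits S copies of the state shape.
    template <int S>
    std::vector<std::vector<size_t>> cellOutShapes(const std::vector<size_t>& state_dims) {
        return std::vector<std::vector<size_t>>(S, state_dims);
    }

    int main() {
        auto lstm = cellOutShapes<2>({1, 128});  // hidden + cell state
        auto gru = cellOutShapes<1>({1, 128});   // hidden state only
        std::cout << lstm.size() << " " << gru.size() << "\n";  // 2 1
    }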
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_rnn_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_rnn_shape_infer.hpp
new file mode 100644
index 0000000..c8763a0
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_rnn_shape_infer.hpp
@@ -0,0 +1,48 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <debug.h>
+#include "ie_built_in_impl.hpp"
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Shape inference for RNN sequence layer
+ */
+class RNNShapeProp : public BuiltInShapeInferImpl {
+public:
+    explicit RNNShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
+
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
+                         const std::map<std::string, std::string>& params,
+                         const std::map<std::string, Blob::Ptr>& blobs,
+                         std::vector<SizeVector>& outShapes) override {
+        LayerParams lp{};
+        RNNSequenceLayer rnn(lp);
+        rnn.params = params;
+        rnn.type = _type;
+        rnn.precision = Precision::FP32;  // FIXME: No ability to discover current precision. Assume fp32
+        validate(&rnn, inBlobs, params, blobs);
+
+        int state_size = rnn.hidden_size;
+
+        auto data_dims = inShapes[0];
+        data_dims[2] = static_cast<size_t>(state_size);
+        outShapes.push_back(data_dims);
+
+        for (int i = 1; i < inShapes.size(); i++) {
+            outShapes.push_back(inShapes[i]);
+        }
+    }
+};
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_roi_pooling_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_roi_pooling_shape_infer.hpp
index b5f6c85..c128469 100644
--- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_roi_pooling_shape_infer.hpp
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_roi_pooling_shape_infer.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -22,7 +22,7 @@ class RoiPoolingShapeProp : public BuiltInShapeInferImpl {
 public:
     explicit RoiPoolingShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
 
-    void inferShapesImpl(const std::vector<SizeVector>& inShapes,
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
                          const std::map<std::string, std::string>& params,
                          const std::map<std::string, Blob::Ptr>& blobs,
                          std::vector<SizeVector>& outShapes) override {
@@ -30,12 +30,16 @@ public:
         CNNLayer cnnLayer(lp);
         cnnLayer.params = params;
         cnnLayer.type = _type;
-        validate(&cnnLayer, inShapes, params, blobs);
+        validate(&cnnLayer, inBlobs, params, blobs);
 
-        int pooled_h = cnnLayer.GetParamAsInt("pooled_h");
-        int pooled_w = cnnLayer.GetParamAsInt("pooled_w");
-        outShapes.push_back(
-                {inShapes[1][0], inShapes[0][1], static_cast<size_t>(pooled_h), static_cast<size_t>(pooled_w)});
+        SizeVector out_shapes = {inShapes[1][0], inShapes[0][1]};
+        for (auto attr : {"pooled_d", "pooled_h", "pooled_w"}) {  // desired IR format: pooled="...,d,h,w"
+            int pooled = cnnLayer.GetParamAsInt(attr, -1);
+            if (pooled >= 0) {
+                out_shapes.push_back(static_cast<size_t>(pooled));
+            }
+        }
+        outShapes.push_back(out_shapes);
     }
 };
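The RoiPooling rework above lets 4-D and 5-D pooling share one code path: attributes that are absent default to -1 and are simply skipped. A standalone sketch with hypothetical parameter values:

    #include <cstddef>
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // Sketch of the pooled_d/h/w handling: missing attributes default to -1.
    int main() {
        std::map<std::string, int> params = {{"pooled_h", 6}, {"pooled_w", 6}};  // no pooled_d
        std::vector<size_t> out = {300, 256};  // {num_rois, channels}, made up
        for (const char* attr : {"pooled_d", "pooled_h", "pooled_w"}) {
            auto it = params.find(attr);
            int pooled = (it == params.end()) ? -1 : it->second;
            if (pooled >= 0) out.push_back(static_cast<size_t>(pooled));
        }
        std::cout << out.size() << " dims\n";  // 4 dims: {300, 256, 6, 6}
    }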
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_shape_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_shape_shape_infer.hpp
new file mode 100644
index 0000000..87fbab9
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_shape_shape_infer.hpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <debug.h>
+#include "ie_built_in_impl.hpp"
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <functional>
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Shape inference for Shape layer
+ */
+class ShapeShapeProp : public BuiltInShapeInferImpl {
+public:
+    explicit ShapeShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
+
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
+                         const std::map<std::string, std::string>& params,
+                         const std::map<std::string, Blob::Ptr>& blobs,
+                         std::vector<SizeVector>& outShapes) override {
+        outShapes.push_back({inShapes[0].size()});
+    }
+};
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_shuffle_channels_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_shuffle_channels_shape_infer.hpp
new file mode 100644
index 0000000..8bcda89
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_shuffle_channels_shape_infer.hpp
@@ -0,0 +1,39 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_built_in_impl.hpp"
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Shape inference for ShuffleChannels layer
+ */
+class ShuffleChannelsShapeProp : public BuiltInShapeInferImpl {
+public:
+    explicit ShuffleChannelsShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
+
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
+                         const std::map<std::string, std::string>& params,
+                         const std::map<std::string, Blob::Ptr>& blobs,
+                         std::vector<SizeVector>& outShapes) override {
+        LayerParams lp{};
+        ShuffleChannelsLayer shuffleChannelsLayer(lp);
+        shuffleChannelsLayer.params = params;
+        shuffleChannelsLayer.type = _type;
+        validate(&shuffleChannelsLayer, inBlobs, params, blobs);
+
+        outShapes = {inShapes[0]};
+    }
+};
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
+
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_simpler_nms_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_simpler_nms_shape_infer.hpp
index c39755f..ddc2eb1 100644
--- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_simpler_nms_shape_infer.hpp
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_simpler_nms_shape_infer.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -22,7 +22,7 @@ class SimplerNMSShapeProp : public BuiltInShapeInferImpl {
 public:
     explicit SimplerNMSShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
 
-    void inferShapesImpl(const std::vector<SizeVector>& inShapes,
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
                          const std::map<std::string, std::string>& params,
                          const std::map<std::string, Blob::Ptr>& blobs,
                          std::vector<SizeVector>& outShapes) override {
@@ -30,7 +30,7 @@ public:
         CNNLayer cnnLayer(lp);
         cnnLayer.params = params;
         cnnLayer.type = _type;
-        validate(&cnnLayer, inShapes, params, blobs);
+        validate(&cnnLayer, inBlobs, params, blobs);
         size_t post_nms_topn = static_cast<size_t>(cnnLayer.GetParamAsInt("post_nms_topn"));
 
         outShapes.push_back({post_nms_topn, 5});
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_space_to_depth_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_space_to_depth_shape_infer.hpp
new file mode 100644
index 0000000..fdc14a1
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_space_to_depth_shape_infer.hpp
@@ -0,0 +1,44 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_built_in_impl.hpp"
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Shape inference for SpaceToDepth layer
+ */
+class SpaceToDepthShapeProp : public BuiltInShapeInferImpl {
+public:
+    explicit SpaceToDepthShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
+
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
+                         const std::map<std::string, std::string>& params,
+                         const std::map<std::string, Blob::Ptr>& blobs,
+                         std::vector<SizeVector>& outShapes) override {
+        LayerParams lp{};
+        SpaceToDepthLayer spaceToDepthLayer(lp);
+        spaceToDepthLayer.params = params;
+        spaceToDepthLayer.type = _type;
+        validate(&spaceToDepthLayer, inBlobs, params, blobs);
+
+        unsigned int block_size = spaceToDepthLayer.block_size;
+        outShapes = {inShapes[0]};
+
+        outShapes[0][outShapes[0].size() - 1] = inShapes[0][inShapes[0].size() - 1] / block_size;
+        outShapes[0][outShapes[0].size() - 2] = inShapes[0][inShapes[0].size() - 2] / block_size;
+        outShapes[0][outShapes[0].size() - 3] = inShapes[0][inShapes[0].size() - 3] * block_size * block_size;
+    }
+};
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
+
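SpaceToDepth above shrinks the last two (spatial) dimensions by block_size and grows the depth axis by block_size squared, keeping the element count constant. A standalone check with made-up dimensions:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Sketch of the SpaceToDepth shape rule: W/bs, H/bs, C*bs*bs.
    int main() {
        std::vector<size_t> in = {1, 16, 64, 64};  // hypothetical NCHW input
        size_t bs = 2;
        std::vector<size_t> out = in;
        size_t n = out.size();
        out[n - 1] = in[n - 1] / bs;
        out[n - 2] = in[n - 2] / bs;
        out[n - 3] = in[n - 3] * bs * bs;
        std::cout << out[1] << " " << out[2] << " " << out[3] << "\n";  // 64 32 32
    }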
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_split_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_split_shape_infer.hpp
index 94b612f..099380b 100644
--- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_split_shape_infer.hpp
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_split_shape_infer.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -21,7 +21,7 @@ class SplitShapeProp : public BuiltInShapeInferImpl {
 public:
     explicit SplitShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
 
-    void inferShapesImpl(const std::vector<SizeVector>& inShapes,
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
                          const std::map<std::string, std::string>& params,
                          const std::map<std::string, Blob::Ptr>& blobs,
                          std::vector<SizeVector>& outShapes) override {
@@ -29,7 +29,7 @@ public:
         SplitLayer splitLayer(lp);
         splitLayer.params = params;
         splitLayer.type = _type;
-        validate(&splitLayer, inShapes, params, blobs);
+        validate(&splitLayer, inBlobs, params, blobs);
         std::vector<int> out_sizes = splitLayer.GetParamAsInts("out_sizes", {});
         if (out_sizes.empty())
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_squeeze_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_squeeze_shape_infer.hpp
new file mode 100644
index 0000000..6e0fe41
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_squeeze_shape_infer.hpp
@@ -0,0 +1,115 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_built_in_impl.hpp"
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Shape inference for Squeeze layer
+ */
+class SqueezeShapeProp : public BuiltInShapeInferImpl {
+public:
+    explicit SqueezeShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
+
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
+                         const std::map<std::string, std::string>& params,
+                         const std::map<std::string, Blob::Ptr>& blobs,
+                         std::vector<SizeVector>& outShapes) override {
+        LayerParams lp{};
+        SqueezeLayer layer(lp);
+        layer.params = params;
+        layer.type = _type;
+        validate(&layer, inBlobs, params, blobs);
+
+        const size_t SQUEEZE_DATA = 0;
+        const size_t SQUEEZE_INDEXES = 1;
+
+        SizeVector data_dims;
+        SizeVector idx_dims;
+
+        idx_dims = inBlobs[SQUEEZE_INDEXES]->getTensorDesc().getDims();
+        if (idx_dims.size() > 1)
+            THROW_IE_EXCEPTION << " Index vector should be 1 dimension";
+
+        if (inBlobs[SQUEEZE_INDEXES]->getTensorDesc().getPrecision() != Precision::I32 &&
+            inBlobs[SQUEEZE_INDEXES]->getTensorDesc().getPrecision() != Precision::FP32)
+            THROW_IE_EXCEPTION << " Incorrect 'indices_to_squeeze' input precision. Only FP32 and I32 are supported!";
+
+        data_dims = inBlobs[SQUEEZE_DATA]->getTensorDesc().getDims();
+
+        if (data_dims.size() <= idx_dims[0] && !(data_dims.size() == 1 && idx_dims[0] == 1))
+            THROW_IE_EXCEPTION << " Incompatible number of data dimensions and indexes vector length!";
+        SizeVector outShape;
+        switch (inBlobs[SQUEEZE_INDEXES]->precision()) {
+            case Precision::FP32: {
+                float* idx_data = inBlobs[SQUEEZE_INDEXES]->cbuffer().as<float*>() +
+                                  inBlobs[SQUEEZE_INDEXES]->getTensorDesc().getBlockingDesc().getOffsetPadding();
+                for (size_t i = 0; i < idx_dims[0]; i++) {
+                    float axis = idx_data[i];
+                    if (axis < 0)
+                        axis += data_dims.size();
+
+                    if (axis > data_dims.size()) {
+                        THROW_IE_EXCEPTION << "Index to squeeze exceeds data tensor dimension";
+                    } else if (data_dims[axis] != 1) {
+                        THROW_IE_EXCEPTION << "Index to squeeze of data tensor dimension is not 1";
+                    }
+                }
+                for (size_t j = 0; j < data_dims.size(); j++) {
+                    bool found = false;
+                    for (size_t i = 0; i < inBlobs[SQUEEZE_INDEXES]->size(); i++) {
+                        int32_t axis = idx_data[i];
+                        if (axis < 0)
+                            axis += data_dims.size();
+                        if (j == static_cast<size_t>(axis)) found = true;
+                    }
+                    if (!found) outShape.push_back(data_dims[j]);
+                }
+            }
+                break;
+            case Precision::I32: {
+                int32_t* idx_data = inBlobs[SQUEEZE_INDEXES]->cbuffer().as<int32_t*>() +
+                                    inBlobs[SQUEEZE_INDEXES]->getTensorDesc().getBlockingDesc().getOffsetPadding();
+                for (size_t i = 0; i < idx_dims[0]; i++) {
+                    int32_t axis = idx_data[i];
+                    if (axis < 0)
+                        axis += data_dims.size();
+
+                    if (axis > data_dims.size()) {
+                        THROW_IE_EXCEPTION << "Index to squeeze exceeds data tensor dimension";
+                    } else if (data_dims[axis] != 1) {
+                        THROW_IE_EXCEPTION << "Index to squeeze of data tensor dimension is not 1";
+                    }
+                }
+                for (size_t j = 0; j < data_dims.size(); j++) {
+                    bool found = false;
+                    for (size_t i = 0; i < inBlobs[SQUEEZE_INDEXES]->size(); i++) {
+                        int32_t axis = idx_data[i];
+                        if (axis < 0)
+                            axis += data_dims.size();
+                        if (j == static_cast<size_t>(axis)) found = true;
+                    }
+                    if (!found) outShape.push_back(data_dims[j]);
+                }
+            }
+                break;
+            default:
+                THROW_IE_EXCEPTION
+                        << "Incorrect 'indices_to_squeeze' input precision. Only FP32 and I32 are supported!";
+        }
+        outShapes.push_back(outShape);
+    }
+};
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
+
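The squeeze logic above normalizes negative axes by adding the rank and refuses to drop any dimension whose size is not 1. A compact standalone version of the same walk (input values are hypothetical):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Sketch of squeeze-axis handling: wrap negatives, keep non-listed dims.
    int main() {
        std::vector<size_t> dims = {1, 3, 1, 5};
        std::vector<int> axes = {0, -2};  // -2 wraps to axis 2
        std::vector<size_t> out;
        for (size_t j = 0; j < dims.size(); j++) {
            bool squeezed = false;
            for (int a : axes) {
                int axis = a < 0 ? a + static_cast<int>(dims.size()) : a;
                if (static_cast<size_t>(axis) == j && dims[j] == 1) squeezed = true;
            }
            if (!squeezed) out.push_back(dims[j]);
        }
        std::cout << out[0] << " " << out[1] << "\n";  // 3 5
    }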
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_strided_slice_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_strided_slice_shape_infer.hpp
new file mode 100644
index 0000000..074010d
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_strided_slice_shape_infer.hpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_built_in_impl.hpp"
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <functional>
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Shape inference for StridedSlice layer
+ */
+class StridedSliceShapeProp : public BuiltInShapeInferImpl {
+public:
+    explicit StridedSliceShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
+
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
+                         const std::map<std::string, std::string>& params,
+                         const std::map<std::string, Blob::Ptr>& blobs,
+                         std::vector<SizeVector>& outShapes) override {
+        StridedSliceHelper helper(inBlobs, params);
+        outShapes.push_back(helper.getOutputShape());
+    }
+};
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
+
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_tensor_iterator_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_tensor_iterator_shape_infer.hpp
new file mode 100644
index 0000000..417bbd4
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_tensor_iterator_shape_infer.hpp
@@ -0,0 +1,109 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <debug.h>
+#include "ie_built_in_impl.hpp"
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include <utility>
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Shape inference for TensorIterator layer
+ */
+class TensorIteratorShapeProp : public BuiltInShapeInferImpl {
+public:
+    explicit TensorIteratorShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
+
+    void setOriginalLayer(const CNNLayer *layer) {
+        auto ti = dynamic_cast<const TensorIterator*>(layer);
+        if (!ti)
+            THROW_IE_EXCEPTION << "Error during shape infer. Original layer is not TensorIterator.";
+        _original_ti = ti;
+    }
+
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
+                         const std::map<std::string, std::string>& params,
+                         const std::map<std::string, Blob::Ptr>& blobs,
+                         std::vector<SizeVector>& outShapes) override {
+        LayerParams lp{};
+        TensorIterator ti(lp);
+        ti.params = params;
+        ti.type = _type;
+        ti.body = _original_ti->body;
+        ti.back_edges = _original_ti->back_edges;
+        ti.input_port_map = _original_ti->input_port_map;
+        ti.output_port_map = _original_ti->output_port_map;
+        validate(&ti, inBlobs, params, blobs);
+
+        // TODO: make util function to calculate num of iteration
+        int num_iteration = 1;
+
+        // Prepare input shapes for internal body
+        std::map<std::string, std::vector<size_t>> newInShapes;
+        for (auto &port_map : ti.input_port_map) {
+            int ext_port = port_map.from;
+            int int_port = port_map.to;
+            auto int_name = ti.body.inputs[int_port]->name;
+
+            auto shape = inShapes[ext_port];
+            if (port_map.axis != -1) {
+                int size = shape[port_map.axis];
+                int start = port_map.start < 0
+                        ? port_map.start + size + 1
+                        : port_map.start;
+                int end = port_map.end < 0
+                        ? port_map.end + size + 1
+                        : port_map.end;
+
+                num_iteration = std::abs(end - start) / std::abs(port_map.stride);
+
+                // port with iterating through. Change dimension with iteration
+                shape[port_map.axis] = port_map.part_size;
+            }
+
+            newInShapes[int_name] = shape;
+        }
+
+        // Body shape infer
+        _body_reshaper = std::make_shared<Reshaper>(_original_ti->body.inputs);
+        _body_reshaper->runNoApply(newInShapes);
+
+        outShapes.resize(ti.output_port_map.size());
+        for (auto &port_map : ti.output_port_map) {
+            int ext_port = port_map.from;
+            int int_port = port_map.to;
+            auto &int_out_data = ti.body.outputs[int_port];
+            auto shape = _body_reshaper->getResultShapeFor(int_out_data);
+
+            if (port_map.axis != -1) {
+                // port with iterating through. Change dimension with iteration
+                shape[port_map.axis] *= num_iteration;
+            }
+
+            outShapes[ext_port] = shape;
+        }
+    }
+
+    void apply() {
+        if (!_body_reshaper)
+            THROW_IE_EXCEPTION << "Request of apply reshape results while shape infer was not finished";
+        _body_reshaper->apply();
+    }
+
+
+private:
+    const TensorIterator* _original_ti;
+    std::shared_ptr<Reshaper> _body_reshaper;
+};
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
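The TensorIterator trip count above follows the familiar slicing convention: negative start/end wrap around the dimension size, then the count is |end - start| / |stride|. A standalone check with hypothetical port-map values:

    #include <cstdlib>
    #include <iostream>

    // Sketch of the num_iteration rule for an iterated axis.
    int main() {
        int size = 25;                       // dim size on the iterated axis
        int start = 0, end = -1, stride = 1; // hypothetical port map
        if (start < 0) start += size + 1;
        if (end < 0) end += size + 1;        // -1 wraps to 25
        int num_iteration = std::abs(end - start) / std::abs(stride);
        std::cout << num_iteration << "\n";  // 25
    }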
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_tile_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_tile_shape_infer.hpp
index ad89d83..c86654e 100644
--- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_tile_shape_infer.hpp
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_tile_shape_infer.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -20,7 +20,7 @@ class TileShapeProp : public BuiltInShapeInferImpl {
 public:
     explicit TileShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
 
-    void inferShapesImpl(const std::vector<SizeVector>& inShapes,
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
                          const std::map<std::string, std::string>& params,
                          const std::map<std::string, Blob::Ptr>& blobs,
                          std::vector<SizeVector>& outShapes) override {
@@ -28,7 +28,7 @@ public:
         TileLayer tileLayer(lp);
         tileLayer.params = params;
         tileLayer.type = _type;
-        validate(&tileLayer, inShapes, params, blobs);
+        validate(&tileLayer, inBlobs, params, blobs);
         outShapes.push_back(inShapes[0]);
         outShapes[0][tileLayer.axis] *= tileLayer.tiles;
     }
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_unsqueeze_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_unsqueeze_shape_infer.hpp
new file mode 100644
index 0000000..36dc367
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_unsqueeze_shape_infer.hpp
@@ -0,0 +1,102 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_built_in_impl.hpp"
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include <algorithm>
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Shape inference for Unsqueeze layer
+ */
+class UnsqueezeShapeProp : public BuiltInShapeInferImpl {
+public:
+    explicit UnsqueezeShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
+
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
+                         const std::map<std::string, std::string>& params,
+                         const std::map<std::string, Blob::Ptr>& blobs,
+                         std::vector<SizeVector>& outShapes) override {
+        LayerParams lp{};
+        UnsqueezeLayer unsqueezeLayer(lp);
+        unsqueezeLayer.params = params;
+        unsqueezeLayer.type = _type;
+        validate(&unsqueezeLayer, inBlobs, params, blobs);
+
+        const size_t UNSQUEEZE_DATA = 0;
+        const size_t UNSQUEEZE_INDEXES = 1;
+
+        SizeVector idx_dims = inBlobs[UNSQUEEZE_INDEXES]->getTensorDesc().getDims();
+        SizeVector data_dims = inBlobs[UNSQUEEZE_DATA]->getTensorDesc().getDims();
+        SizeVector outShape;
+        if (idx_dims.size() > 1)
+            THROW_IE_EXCEPTION << " Index vector should be 1 dimension";
+        if (inBlobs[UNSQUEEZE_INDEXES]->getTensorDesc().getPrecision() != Precision::I32 &&
+            inBlobs[UNSQUEEZE_INDEXES]->getTensorDesc().getPrecision() != Precision::FP32)
+            THROW_IE_EXCEPTION << " Incorrect 'indices_to_squeeze' input precision. Only FP32 and I32 are supported!";
+
+        size_t max = data_dims.size();
+        switch (inBlobs[UNSQUEEZE_INDEXES]->precision()) {
+            case Precision::FP32: {
+                float* idx_data = inBlobs[UNSQUEEZE_INDEXES]->cbuffer().as<float*>() +
+                                  inBlobs[UNSQUEEZE_INDEXES]->getTensorDesc().getBlockingDesc().getOffsetPadding();
+
+                for (size_t i = 0; i < idx_dims[0]; i++) {
+                    auto axis = static_cast<size_t>(idx_data[i]);
+                    if (axis > max) max = axis;
+                }
+                max++;
+                if ((idx_dims[0] + data_dims.size()) < max) {
+                    THROW_IE_EXCEPTION << "Indices_to_set for unsqueeze layer is out of tensor dimension";
+                }
+                max = inBlobs[UNSQUEEZE_INDEXES]->size() + data_dims.size();
+                for (size_t i = 0, j = 0, k = 0; i < max; i++) {
+                    if (k < inBlobs[UNSQUEEZE_INDEXES]->size() && i == idx_data[k]) {
+                        outShape.push_back(1);
+                        k++;
+                    } else {
+                        outShape.push_back(data_dims[j++]);
+                    }
+                }
+            }
+                break;
+            case Precision::I32: {
+                int32_t* idx_data = inBlobs[UNSQUEEZE_INDEXES]->cbuffer().as<int32_t*>() +
+                                    inBlobs[UNSQUEEZE_INDEXES]->getTensorDesc().getBlockingDesc().getOffsetPadding();
+                max = data_dims.size();
+                for (size_t i = 0; i < idx_dims[0]; i++) {
+                    auto axis = static_cast<size_t>(idx_data[i]);
+                    if (axis > max) max = axis;
+                }
+                max++;
+                if ((idx_dims[0] + data_dims.size()) < max) {
+                    THROW_IE_EXCEPTION << "Indices_to_set for unsqueeze layer is out of tensor dimension";
+                }
+                max = inBlobs[UNSQUEEZE_INDEXES]->size() + data_dims.size();
+                for (size_t i = 0, j = 0, k = 0; i < max; i++) {
+                    if (k < inBlobs[UNSQUEEZE_INDEXES]->size() && i == idx_data[k]) {
+                        outShape.push_back(1);
+                        k++;
+                    } else {
+                        outShape.push_back(data_dims[j++]);
+                    }
+                }
+            }
+                break;
+            default:
+                THROW_IE_EXCEPTION << "Incorrect 'indices_to_set' input precision. Only FP32 and I32 are supported!";
+        }
+        outShapes.push_back(outShape);
+    }
+};
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
+
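The unsqueeze merge loop above walks the output rank once, emitting a 1 at each requested index and the next input dimension everywhere else. A standalone sketch with hypothetical inputs:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Sketch of the unsqueeze merge: insert size-1 dims at the given indices.
    int main() {
        std::vector<size_t> data_dims = {3, 4};
        std::vector<size_t> idx = {0, 3};  // sorted insertion points
        size_t out_rank = data_dims.size() + idx.size();
        std::vector<size_t> out;
        for (size_t i = 0, j = 0, k = 0; i < out_rank; i++) {
            if (k < idx.size() && i == idx[k]) { out.push_back(1); k++; }
            else out.push_back(data_dims[j++]);
        }
        for (size_t d : out) std::cout << d << " ";  // 1 3 4 1
    }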
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_upsampling_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_upsampling_shape_infer.hpp
index d74a6b9..d7dc645 100644
--- a/inference-engine/src/inference_engine/shape_infer/built-in/ie_upsampling_shape_infer.hpp
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/ie_upsampling_shape_infer.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -22,7 +22,7 @@ class UpsamplingShapeProp : public BuiltInShapeInferImpl {
 public:
     explicit UpsamplingShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
 
-    void inferShapesImpl(const std::vector<SizeVector>& inShapes,
+    void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs,
                          const std::map<std::string, std::string>& params,
                          const std::map<std::string, Blob::Ptr>& blobs,
                          std::vector<SizeVector>& outShapes) override {
@@ -30,9 +30,13 @@ public:
         CNNLayer cnnLayer(lp);
         cnnLayer.params = params;
         cnnLayer.type = _type;
-        validate(&cnnLayer, inShapes, params, blobs);
+        validate(&cnnLayer, inBlobs, params, blobs);
         size_t scale = static_cast<size_t>(cnnLayer.GetParamAsInt("scale"));
-        outShapes.push_back({inShapes[0][0], inShapes[0][1], inShapes[0][2] * scale, inShapes[0][3] * scale});
+        SizeVector out_shapes = {inShapes[0][0], inShapes[0][1]};
+        for (int i = 2; i < inShapes[0].size(); i++) {
+            out_shapes.push_back(inShapes[0][i] * scale);
+        }
+        outShapes.push_back(out_shapes);
     }
 };
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/impl_register.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/impl_register.hpp
index 0c40bd4..9939c8f 100644
--- a/inference-engine/src/inference_engine/shape_infer/built-in/impl_register.hpp
+++ b/inference-engine/src/inference_engine/shape_infer/built-in/impl_register.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/src/inference_engine/shape_infer/const_infer/ie_add_const_infer.hpp b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_add_const_infer.hpp
new file mode 100644
index 0000000..043b093
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_add_const_infer.hpp
@@ -0,0 +1,50 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ie_blob.h>
+#include <ie_const_infer_impl.hpp>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Const inference for TBD layer
+ */
+class AddConstInfer : public ConstInferImpl {
+public:
+    explicit AddConstInfer(const std::string& type) : ConstInferImpl(type) {}
+
+    void inferImpl(const std::vector<Blob::CPtr>& inData,
+                   const std::map<std::string, std::string>& params,
+                   const std::map<std::string, Blob::Ptr>& blobs,
+                   std::vector<Blob::Ptr>& outData) override {
+        size_t numInputs = inData.size();
+        if (inData.size() != 2)
+            THROW_IE_EXCEPTION << "Unsupported number of inputs: " << numInputs << ". Only 2 inputs are supported";
+        auto* firstBlobBuffer = inData[0]->cbuffer().as<float*>();
+        auto* secondBlobBuffer = inData[1]->cbuffer().as<float*>();
+
+        if (!firstBlobBuffer || !secondBlobBuffer) {
+            THROW_IE_EXCEPTION << "empty input data";
+        }
+        auto outBlob = *outData.begin();
+        auto* outBuffer = outBlob->buffer().as<float*>();
+        if (!outBuffer) THROW_IE_EXCEPTION << "empty output data";
+        if (inData[0]->size() != inData[1]->size()) {
+            THROW_IE_EXCEPTION << "inputs with different shapes are not supported";
+        }
+        for (int i = 0; i < outBlob->size(); i++) {
+            outBuffer[i] = firstBlobBuffer[i] + secondBlobBuffer[i];
+        }
+    }
+};
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
diff --git a/inference-engine/src/inference_engine/shape_infer/const_infer/ie_concat_const_infer.hpp b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_concat_const_infer.hpp
new file mode 100644
index 0000000..d14bdec
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_concat_const_infer.hpp
@@ -0,0 +1,59 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ie_blob.h>
+#include <ie_layers.h>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include <algorithm>
+#include "ie_const_infer_impl.hpp"
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Const inference for Tile layer
+ */
+class ConcatConstInfer : public ConstInferImpl {
+public:
+    explicit ConcatConstInfer(const std::string& type) : ConstInferImpl(type) {}
+
+    void inferImpl(const std::vector<Blob::CPtr>& inData,
+                   const std::map<std::string, std::string>& params,
+                   const std::map<std::string, Blob::Ptr>& blobs,
+                   std::vector<Blob::Ptr>& outData) override {
+        LayerParams lp{};
+        ConcatLayer layer(lp);
+        layer.params = params;
+        layer.type = _type;
+        _validator->parseParams(&layer);
+
+        auto outBlob = *outData.begin();
+        SizeVector outShape = outBlob->getTensorDesc().getDims();
+        auto* outBuffer = outBlob->buffer().as<float*>();
+
+        size_t outerSize = 1;
+        for (int i = 0; i < layer._axis; i++)
+            outerSize *= outShape[i];
+
+        size_t outIdx = 0;
+        for (size_t osIdx = 0; osIdx < outerSize; osIdx++) {
+            for (auto& inBlob : inData) {
+                const auto* inBuffer = inBlob->cbuffer().as<float*>();
+                size_t innerSize = inBlob->size() / outerSize;
+
+                for (size_t j = 0; j < innerSize; j++, outIdx++) {
+                    outBuffer[outIdx] = inBuffer[osIdx * innerSize + j];
+                }
+            }
+        }
+    }
+};
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
diff --git a/inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_const_infer.hpp b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_const_infer.hpp
new file mode 100644
index 0000000..4ea84b8
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_const_infer.hpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ie_blob.h>
+#include <ie_const_infer_impl.hpp>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Const inference for TBD layer
+ */
+class ConstConstInfer : public ConstInferImpl {
+public:
+    explicit ConstConstInfer(const std::string& type) : ConstInferImpl(type) {}
+
+    void inferImpl(const std::vector<Blob::CPtr>& inData,
+                   const std::map<std::string, std::string>& params,
+                   const std::map<std::string, Blob::Ptr>& blobs,
+                   std::vector<Blob::Ptr>& outData) override {
+        auto it = blobs.find("custom");
+        if (it == blobs.end()) THROW_IE_EXCEPTION << "Missed `custom` blob";
+        // TODO: copy instead of putting pointer?
+        outData[0] = (*it).second;
+    }
+};
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
diff --git a/inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_holder.cpp b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_holder.cpp
new file mode 100644
index 0000000..1e491de
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_holder.cpp
@@ -0,0 +1,80 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#ifdef __INTEL_COMPILER
+#pragma warning disable: 2586
+#endif
+
+
+#include "ie_const_infer_holder.hpp"
+#include "ie_mul_const_infer.hpp"
+#include "ie_add_const_infer.hpp"
+#include "ie_div_const_infer.hpp"
+#include "ie_const_const_infer.hpp"
+#include "ie_shape_const_infer.hpp"
+#include "ie_power_const_infer.hpp"
+#include "ie_tile_const_infer.hpp"
+#include "ie_reshape_const_infer.hpp"
+#include "ie_gather_const_infer.hpp"
+#include "ie_split_const_infer.hpp"
+#include "ie_concat_const_infer.hpp"
+#include "ie_in_place_const_infer.hpp"
+#include "ie_strided_slice_const_infer.hpp"
+#include "ie_fill_const_infer.hpp"
+#include "ie_range_const_infer.hpp"
+#include <list>
+#include <memory>
+#include <string>
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+ConstInferHolder::ImplsHolder::Ptr ConstInferHolder::GetImplsHolder() {
+    static ImplsHolder::Ptr localHolder;
+    if (localHolder == nullptr) {
+        localHolder = std::make_shared<ImplsHolder>();
+    }
+    return localHolder;
+}
+
+void ConstInferHolder::AddImpl(const std::string& name, const IConstInferImpl::Ptr& impl) {
+    GetImplsHolder()->list[name] = impl;
+}
+
+std::list<std::string> ConstInferHolder::getConstInferTypes() {
+    std::list<std::string> types;
+    auto& factories = GetImplsHolder()->list;
+    for (const auto& factory : factories) {
+        types.push_back(factory.first);
+    }
+    return types;
+}
+
+IConstInferImpl::Ptr ConstInferHolder::getConstInferImpl(const std::string& type) {
+    auto& impls = ConstInferHolder::GetImplsHolder()->list;
+    if (impls.find(type) != impls.end()) {
+        return impls[type];
+    }
+    return nullptr;
+}
+
+REG_CONST_INFER_FOR_TYPE(MulConstInfer, Mul);
+REG_CONST_INFER_FOR_TYPE(AddConstInfer, Add);
+REG_CONST_INFER_FOR_TYPE(DivConstInfer, Div);
+REG_CONST_INFER_FOR_TYPE(ShapeConstInfer, Shape);
+REG_CONST_INFER_FOR_TYPE(ConstConstInfer, Const);
+REG_CONST_INFER_FOR_TYPE(PowerConstInfer, Power);
+REG_CONST_INFER_FOR_TYPE(TileConstInfer, Tile);
+REG_CONST_INFER_FOR_TYPE(ReshapeConstInfer, Reshape);
+REG_CONST_INFER_FOR_TYPE(GatherConstInfer, Gather);
+REG_CONST_INFER_FOR_TYPE(SplitConstInfer, Split);
+REG_CONST_INFER_FOR_TYPE(ConcatConstInfer, Concat);
+REG_CONST_INFER_FOR_TYPE(InPlaceConstInfer, Unsqueeze);
+REG_CONST_INFER_FOR_TYPE(InPlaceConstInfer, Squeeze);
+REG_CONST_INFER_FOR_TYPE(StridedSliceConstInfer, StridedSlice);
+REG_CONST_INFER_FOR_TYPE(FillConstInfer, Fill);
+REG_CONST_INFER_FOR_TYPE(RangeConstInfer, Range);
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
diff --git a/inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_holder.hpp b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_holder.hpp
new file mode 100644
index 0000000..ab3ed03
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_holder.hpp
@@ -0,0 +1,52 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <list>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <ie_blob.h>
+#include "details/caseless.hpp"
+#include <ie_layers.h>
+#include "ie_const_infer_impl.hpp"
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Holder of const infer implementations for build-in IE layers, that plugins support out-of-the-box
+ */
+class INFERENCE_ENGINE_API_CLASS(ConstInferHolder) {
+    struct ImplsHolder {
+        using Ptr = std::shared_ptr<ImplsHolder>;
+        InferenceEngine::details::caseless_map<std::string, IConstInferImpl::Ptr> list;
+    };
+public:
+    std::list<std::string> getConstInferTypes();
+
+    IConstInferImpl::Ptr getConstInferImpl(const std::string& type);
+
+    static void AddImpl(const std::string& name, const IConstInferImpl::Ptr& impl);
+
+private:
+    static ImplsHolder::Ptr GetImplsHolder();
+};
+
+template <typename IMPL>
+class ImplRegisterBase {
+public:
+    explicit ImplRegisterBase(const std::string& type) {
+        ConstInferHolder::AddImpl(type, std::make_shared<IMPL>(type));
+    }
+};
+
+#define REG_CONST_INFER_FOR_TYPE(__prim, __type) \
+static ImplRegisterBase<__prim> __ci_reg__##__type(#__type)
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
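The holder above keeps a static, case-insensitive map from layer type name to implementation, populated by global registrar objects that the REG_CONST_INFER_FOR_TYPE macro expands into. A toy model of the same pattern (all names here are illustrative, not part of the patch):

    #include <iostream>
    #include <map>
    #include <memory>
    #include <string>

    // Toy registry: a function-local static map filled by global registrars.
    struct Impl { virtual ~Impl() = default; };
    std::map<std::string, std::shared_ptr<Impl>>& holder() {
        static std::map<std::string, std::shared_ptr<Impl>> m;
        return m;
    }
    template <class T> struct Registrar {
        explicit Registrar(const std::string& type) { holder()[type] = std::make_shared<T>(); }
    };
    struct TileImpl : Impl {};
    // Mirrors what REG_CONST_INFER_FOR_TYPE(TileConstInfer, Tile) expands to.
    static Registrar<TileImpl> reg_tile("Tile");

    int main() { std::cout << holder().count("Tile") << "\n"; }  // 1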
diff --git a/inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_impl.cpp b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_impl.cpp
new file mode 100644
index 0000000..224b4ed
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_impl.cpp
@@ -0,0 +1,25 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <map>
+#include <string>
+#include <vector>
+#include "ie_const_infer_impl.hpp"
+
+using namespace InferenceEngine;
+using namespace ShapeInfer;
+
+void ConstInferImpl::infer(const std::vector<Blob::CPtr>& inData,
+                           const std::map<std::string, std::string>& params,
+                           const std::map<std::string, Blob::Ptr>& blobs,
+                           std::vector<Blob::Ptr>& outData) {
+    std::string errorPrefix = "Ref infer error for Layer with `" + _type + "` type: ";
+    if (outData.empty()) THROW_IE_EXCEPTION << errorPrefix + "output data is empty";
+    for (auto const& data : outData) {
+        if (data->buffer() == nullptr) THROW_IE_EXCEPTION << errorPrefix + "output data is not allocated";
+    }
+    // TODO: check for direct (NCHW, NCH, NC) and FP32
+    inferImpl(inData, params, blobs, outData);
+}
+
diff --git a/inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_impl.hpp b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_impl.hpp
new file mode 100644
index 0000000..6ed1cbb
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_const_infer_impl.hpp
@@ -0,0 +1,64 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ie_blob.h>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "ie_layer_validators.hpp"
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ * @experimental
+ * @class IConstInferImpl
+ * @brief This class provides interface for the layer's implementation to propagate const
+ */
+class IConstInferImpl {
+public:
+    using Ptr = std::shared_ptr<IConstInferImpl>;
+
+    virtual ~IConstInferImpl() = default;
+
+
+    /**
+     * @brief all shapes are valid, blobs are allocated
+     *
+     */
+    virtual void infer(const std::vector<Blob::CPtr>& inData,
+                       const std::map<std::string, std::string>& params,
+                       const std::map<std::string, Blob::Ptr>& blobs,
+                       std::vector<Blob::Ptr>& outData) = 0;
+};
+
+class ConstInferImpl : public IConstInferImpl {
+public:
+    explicit ConstInferImpl(const std::string& type) : _type(type) {
+        _validator = details::LayerValidators::getInstance()->getValidator(_type);
+        if (!_validator)
+            THROW_IE_EXCEPTION << "Internal error: failed to find validator for layer with type: " << _type;
+    }
+
+    virtual void inferImpl(const std::vector<Blob::CPtr>& inData,
+                           const std::map<std::string, std::string>& params,
+                           const std::map<std::string, Blob::Ptr>& blobs,
+                           std::vector<Blob::Ptr>& outData) = 0;
+
+    void infer(const std::vector<Blob::CPtr>& inData,
+               const std::map<std::string, std::string>& params,
+               const std::map<std::string, Blob::Ptr>& blobs,
+               std::vector<Blob::Ptr>& outData) override;
+
+protected:
+    std::string _type;
+    // to get parsed descendant CNNLayer from map
+    details::LayerValidator::Ptr _validator;
+};
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
+
diff --git a/inference-engine/src/inference_engine/shape_infer/const_infer/ie_div_const_infer.hpp b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_div_const_infer.hpp
new file mode 100644
index 0000000..e5da597
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_div_const_infer.hpp
@@ -0,0 +1,51 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ie_blob.h>
+#include <ie_const_infer_impl.hpp>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Const inference for TBD layer
+ */
+class DivConstInfer : public ConstInferImpl {
+public:
+    explicit DivConstInfer(const std::string& type) : ConstInferImpl(type) {}
+
+    void inferImpl(const std::vector<Blob::CPtr>& inData,
+                   const std::map<std::string, std::string>& params,
+                   const std::map<std::string, Blob::Ptr>& blobs,
+                   std::vector<Blob::Ptr>& outData) override {
+        size_t numInputs = inData.size();
+        if (inData.size() != 2)
+            THROW_IE_EXCEPTION << "Unsupported number of inputs: " << numInputs << ". Only 2 inputs are supported";
+        auto* firstBlobBuffer = inData[0]->cbuffer().as<float*>();
+        auto* secondBlobBuffer = inData[1]->cbuffer().as<float*>();
+
+        if (!firstBlobBuffer || !secondBlobBuffer) {
+            THROW_IE_EXCEPTION << "empty input data";
+        }
+        auto outBlob = *outData.begin();
+        auto* outBuffer = outBlob->buffer().as<float*>();
+        if (!outBuffer) THROW_IE_EXCEPTION << "empty output data";
+        if (inData[0]->size() != inData[1]->size()) {
+            THROW_IE_EXCEPTION << "inputs with different shapes are not supported";
+        }
+        for (int i = 0; i < outBlob->size(); i++) {
+            if (secondBlobBuffer[i] == 0) THROW_IE_EXCEPTION << "division by zero";
+            outBuffer[i] = firstBlobBuffer[i] / secondBlobBuffer[i];
+        }
+    }
+};
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
diff --git a/inference-engine/src/inference_engine/shape_infer/const_infer/ie_fill_const_infer.hpp b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_fill_const_infer.hpp
new file mode 100644
index 0000000..0d2dd7b
--- /dev/null
+++ b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_fill_const_infer.hpp
@@ -0,0 +1,108 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ie_blob.h>
+#include <ie_parallel.hpp>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include <algorithm>
+#include "ie_const_infer_impl.hpp"
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Const inference for Fill layer
+ */
+class FillConstInfer : public ConstInferImpl {
+public:
+    explicit FillConstInfer(const std::string& type) : ConstInferImpl(type) {}
+
+    void inferImpl(const std::vector<Blob::CPtr>& inData,
+                   const std::map<std::string, std::string>& params,
+                   const std::map<std::string, Blob::Ptr>& blobs,
+                   std::vector<Blob::Ptr>& outData) override {
+        const size_t FILL_DIMS = 0;
+        const size_t FILL_VALUE = 1;
+        if (inData.empty() || outData.empty())
+            THROW_IE_EXCEPTION << " Incorrect number of input/output edges!";
+
+        if (inData.size() != 2)
+            THROW_IE_EXCEPTION << " Incorrect number of input edges!";
+
+        SizeVector dims = inData[FILL_DIMS]->getTensorDesc().getDims();
+        if (dims.size() > 1)
+            THROW_IE_EXCEPTION << " Fill dimensions vector should be 1 dimension";
+
+        if (inData[FILL_DIMS]->getTensorDesc().getPrecision() != Precision::I32)
+            THROW_IE_EXCEPTION << " Fill dimensions vector should be I32!";
+
+        SizeVector value_dims = inData[FILL_VALUE]->getTensorDesc().getDims();
+        if (value_dims.size() > 1)
+            THROW_IE_EXCEPTION << " Value scalar should have 1 dimension";
+
+        if (!(inData[FILL_VALUE]->getTensorDesc().getPrecision() == Precision::I32 &&
+              outData[0]->getTensorDesc().getPrecision() == Precision::I32) &&
+            !(inData[FILL_VALUE]->getTensorDesc().getPrecision() == Precision::FP32 &&
+              outData[0]->getTensorDesc().getPrecision() == Precision::FP32)) {
+            THROW_IE_EXCEPTION <<
+                " 'Value' input scalars and output tensor should have same precision and only FP32 and I32 are supported!";
+        }
+
+        int32_t* fill_dims = inData[FILL_DIMS]->cbuffer().as<int32_t*>() +
+                             inData[FILL_DIMS]->getTensorDesc().getBlockingDesc().getOffsetPadding();
+        size_t fill_size = inData[FILL_DIMS]->getTensorDesc().getDims()[0];
+        SizeVector dst_dims = outData[0]->getTensorDesc().getDims();
+
+        if (dst_dims.size() != fill_size) {
+            THROW_IE_EXCEPTION << "Output tensor dimension mismatch";
+        }
+
+        size_t work_amount_dst = 1;
+        for (size_t i = 0; i < dst_dims.size(); i++) {
+            work_amount_dst *= fill_dims[i];
+            if (static_cast<int32_t>(dst_dims[i]) != fill_dims[i]) {
+                THROW_IE_EXCEPTION << "Output tensor dimension size mismatch";
+            }
+        }
+
+        switch (outData[0]->precision()) {
+            case Precision::FP32: {
+                float* dst_data = outData[0]->cbuffer().as<float*>() +
+                                  outData[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
+                float value = (inData[FILL_VALUE]->cbuffer().as<float*>() +
+                               inData[FILL_VALUE]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0];
+
+                parallel_nt(0, [&](const int ithr, const int nthr) {
+                    size_t start = 0, end = 0;
+                    splitter(work_amount_dst, nthr, ithr, start, end);
+                    std::fill_n(dst_data + start, end - start, value);
+                });
+            }
+            break;
+            case Precision::I32: {
+                int32_t* dst_data = outData[0]->cbuffer().as<int32_t*>() +
+                                    outData[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
+                int32_t value = (inData[FILL_VALUE]->cbuffer().as<int32_t*>() +
+                                 inData[FILL_VALUE]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0];
+
+                parallel_nt(0, [&](const int ithr, const int nthr) {
+                    size_t start = 0, end = 0;
+                    splitter(work_amount_dst, nthr, ithr, start, end);
+                    std::fill_n(dst_data + start, end - start, value);
+                });
+            }
+            break;
+            default:
+                THROW_IE_EXCEPTION << "Incorrect output precision. Only FP32 and I32 are supported!";
+        }
+    }
+};
+
+} // namespace ShapeInfer
+} // namespace InferenceEngine
Only FP32|I32|U16|I16|U8|I8 are supported!"; + + // Remove redundant dimensions + const SizeVector& dictionary_dims = inData[GATHER_DICTIONARY]->getTensorDesc().getDims(); + size_t actualAxis = 0; + SizeVector dims_actual; + for (size_t i = 0; i < dictionary_dims.size(); i++) { + if (dictionary_dims[i] > 1) { + for (size_t j = i; j < dictionary_dims.size(); j++) + dims_actual.push_back(dictionary_dims[j]); + break; + } + } + + if (dims_actual.size() == 0) + THROW_IE_EXCEPTION << " Incorrect input parameters dimension!"; + + GatherParams p; + p.axis = static_cast(layer.GetParamAsInt("axis")); + // Dictionary must be at least rank axis + 1 + if (p.axis > 0 && dims_actual.size() < (1 + p.axis)) + THROW_IE_EXCEPTION << " Incorrect input parameters dimensions and axis number!"; + else if (p.axis < 0 && (static_cast(dims_actual.size()) + p.axis) < 0) + THROW_IE_EXCEPTION << " Incorrect input parameters dimensions and axis number!"; + + if (p.axis < 0) + p.axis += dims_actual.size(); + + // Find number of dictionaries, index range and data length + for (size_t i = 0; i < p.axis; i++) + p.numDictionaries *= dims_actual[i]; + p.indexRange = dims_actual[p.axis]; + for (size_t i = p.axis + 1; i < dims_actual.size(); i++) + p.dataLength *= dims_actual[i]; + + if (p.dataLength == 0) + THROW_IE_EXCEPTION << " Incorrect input parameters dimension!"; + + + switch (inData[GATHER_INDEXES]->precision()) { + case Precision::FP32: + gather(inData[GATHER_INDEXES]->cbuffer().as(), inData[GATHER_INDEXES], + inData[GATHER_DICTIONARY], outData[0], p); + break; + case Precision::I32: + gather(inData[GATHER_INDEXES]->cbuffer().as(), inData[GATHER_INDEXES], + inData[GATHER_DICTIONARY], outData[0], p); + break; + case Precision::U16: + gather(inData[GATHER_INDEXES]->cbuffer().as(), inData[GATHER_INDEXES], + inData[GATHER_DICTIONARY], outData[0], p); + break; + case Precision::I16: + gather(inData[GATHER_INDEXES]->cbuffer().as(), inData[GATHER_INDEXES], + inData[GATHER_DICTIONARY], outData[0], p); + break; + case Precision::U8: + gather(inData[GATHER_INDEXES]->cbuffer().as(), inData[GATHER_INDEXES], + inData[GATHER_DICTIONARY], outData[0], p); + break; + case Precision::I8: + gather(inData[GATHER_INDEXES]->cbuffer().as(), inData[GATHER_INDEXES], + inData[GATHER_DICTIONARY], outData[0], p); + break; + default: + THROW_IE_EXCEPTION << " Unsupported precision!"; + } + } +}; + +} // namespace ShapeInfer +} // namespace InferenceEngine
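For reference, the gather kernel above views the dictionary as a [numDictionaries, indexRange, dataLength] block layout and copies one dataLength-sized slice per index. The following is a minimal single-threaded sketch of the same indexing; gather_sketch and its plain-array signature are illustrative only, not part of the patch, and index clamping here only approximates what details::clipping does:

#include <cstddef>
#include <cstring>

// Sketch: gather along an axis, with the dictionary viewed as
// [numDict, indexRange, dataLength]; out is laid out as [numDict, numIdx, dataLength].
void gather_sketch(const float* dict, const int* indexes, std::size_t numIdx,
                   std::size_t numDict, std::size_t indexRange, std::size_t dataLength,
                   float* out) {
    for (std::size_t j = 0; j < numDict; ++j) {
        for (std::size_t i = 0; i < numIdx; ++i) {
            int idx = indexes[i];
            // Clamp out-of-range indexes, mirroring the details::clipping call above.
            if (idx < 0) idx = 0;
            if (idx >= static_cast<int>(indexRange)) idx = static_cast<int>(indexRange) - 1;
            std::memcpy(out + dataLength * (i + j * numIdx),
                        dict + dataLength * (idx + j * indexRange),
                        dataLength * sizeof(float));
        }
    }
}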
diff --git a/inference-engine/src/inference_engine/shape_infer/const_infer/ie_in_place_const_infer.hpp b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_in_place_const_infer.hpp new file mode 100644 index 0000000..abbcd20 --- /dev/null +++ b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_in_place_const_infer.hpp @@ -0,0 +1,37 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace InferenceEngine { +namespace ShapeInfer { + +/** + *@brief Implementation of Const inference for in-place layers (e.g. Unsqueeze) that pass input data through unchanged + */ +class InPlaceConstInfer : public ConstInferImpl { +public: + explicit InPlaceConstInfer(const std::string& type) : ConstInferImpl(type) {} + + void inferImpl(const std::vector& inData, + const std::map& params, + const std::map& blobs, + std::vector& outData) override { + auto inBlob = inData[0]; + auto outBlob = outData[0]; + auto* inBuffer = inBlob->cbuffer().as(); + auto* outBuffer = outBlob->buffer().as(); + ie_memcpy(outBuffer, outData[0]->byteSize(), inBuffer, inBlob->byteSize()); + } +}; + +} // namespace ShapeInfer +} // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/shape_infer/const_infer/ie_mul_const_infer.hpp b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_mul_const_infer.hpp new file mode 100644 index 0000000..37f398f --- /dev/null +++ b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_mul_const_infer.hpp @@ -0,0 +1,50 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace InferenceEngine { +namespace ShapeInfer { + +/** + *@brief Implementation of Const inference for the Mul layer + */ +class MulConstInfer : public ConstInferImpl { +public: + explicit MulConstInfer(const std::string& type) : ConstInferImpl(type) {} + + void inferImpl(const std::vector& inData, + const std::map& params, + const std::map& blobs, + std::vector& outData) override { + size_t numInputs = inData.size(); + if (inData.size() != 2) + THROW_IE_EXCEPTION << "Unsupported number of inputs: " << numInputs << ". Only 2 inputs are supported"; + auto* firstBlobBuffer = inData[0]->cbuffer().as(); + auto* secondBlobBuffer = inData[1]->cbuffer().as(); + + if (!firstBlobBuffer || !secondBlobBuffer) { + THROW_IE_EXCEPTION << "empty input data"; + } + auto outBlob = *outData.begin(); + auto* outBuffer = outBlob->buffer().as(); + if (!outBuffer) THROW_IE_EXCEPTION << "empty output data"; + if (inData[0]->size() != inData[1]->size()) { + THROW_IE_EXCEPTION << "inputs with different shapes are not supported"; + } + for (int i = 0; i < outBlob->size(); i++) { + outBuffer[i] = firstBlobBuffer[i] * secondBlobBuffer[i]; + } + } +}; + +} // namespace ShapeInfer +} // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/shape_infer/const_infer/ie_power_const_infer.hpp b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_power_const_infer.hpp new file mode 100644 index 0000000..d6ce3df --- /dev/null +++ b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_power_const_infer.hpp @@ -0,0 +1,58 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "ie_const_infer_impl.hpp" + +namespace InferenceEngine { +namespace ShapeInfer { + +/** + *@brief Implementation of Const inference for the Power layer + */ +class PowerConstInfer : public ConstInferImpl { +public: + explicit PowerConstInfer(const std::string& type) : ConstInferImpl(type) {} + + void inferImpl(const std::vector& inData, + const std::map& params, + const std::map& blobs, + std::vector& outData) override { + LayerParams lp{}; + PowerLayer layer(lp); + layer.params = params; + layer.type = _type; + _validator->parseParams(&layer); + + float scale = layer.scale; + float power = layer.power; + float shift = layer.offset; + + // TODO: check for access and sizes + auto* input = inData[0]->cbuffer().as(); + auto* output = outData[0]->buffer().as(); + size_t dataSize = inData[0]->size(); + + if (power == 1.0f) { + for (int i = 0; i < dataSize; i++) { + output[i] = input[i] * scale + shift; + } + } else { + for (int i = 0; i < dataSize; i++) { + output[i] = pow(input[i] * scale + shift, power); + } + } + } +}; + +} // namespace ShapeInfer +} // namespace InferenceEngine
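The Power const inference above computes out[i] = (in[i] * scale + shift) ^ power, with power == 1 reduced to a plain linear transform. A standalone sketch of the same formula on raw arrays (power_sketch is a hypothetical helper, not part of the patch):

#include <cmath>
#include <cstddef>

// Sketch: y = (x * scale + shift) ^ power, with a fast path for power == 1.
void power_sketch(const float* in, float* out, std::size_t n,
                  float scale, float power, float shift) {
    if (power == 1.0f) {
        for (std::size_t i = 0; i < n; ++i)
            out[i] = in[i] * scale + shift;  // purely linear case, no pow() call
    } else {
        for (std::size_t i = 0; i < n; ++i)
            out[i] = std::pow(in[i] * scale + shift, power);
    }
}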
diff --git a/inference-engine/src/inference_engine/shape_infer/const_infer/ie_range_const_infer.hpp b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_range_const_infer.hpp new file mode 100644 index 0000000..dfdd7f8 --- /dev/null +++ b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_range_const_infer.hpp @@ -0,0 +1,116 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "ie_const_infer_impl.hpp" + +namespace InferenceEngine { +namespace ShapeInfer { + +/** + *@brief Implementation of Const inference for the Range layer + */ +class RangeConstInfer : public ConstInferImpl { +public: + explicit RangeConstInfer(const std::string& type) : ConstInferImpl(type) {} + + template + void range(data_t start, data_t limit, data_t delta, const Blob::Ptr& output) { + size_t dst_size = (output->getTensorDesc().getDims())[0]; + data_t* dst_data = output->cbuffer().as() + + output->getTensorDesc().getBlockingDesc().getOffsetPadding(); + size_t work_amount_dst = std::floor(std::abs((limit - start) / delta)); + if (work_amount_dst != dst_size) + THROW_IE_EXCEPTION << "Range indexes exceed data tensor dimension"; + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t iwork = 0, end = 0; + splitter(work_amount_dst, nthr, ithr, iwork, end); + data_t dst_value = start + iwork * delta; + + for (; iwork < end; ++iwork, dst_value += delta) { + dst_data[iwork] = dst_value; + } + }); + } + + void inferImpl(const std::vector& inData, + const std::map& params, + const std::map& blobs, + std::vector& outData) override { + const size_t RANGE_START = 0; + const size_t RANGE_LIMIT = 1; + const size_t RANGE_DELTA = 2; + if (inData.empty() || outData.empty()) + THROW_IE_EXCEPTION << " Incorrect number of input/output edges!"; + + if (inData.size() != 3) + THROW_IE_EXCEPTION << " Incorrect number of input edges!"; + + SizeVector start_dims = inData[RANGE_START]->getTensorDesc().getDims(); + if (start_dims.size() > 1) + THROW_IE_EXCEPTION << " Start scalar should have 1 dimension"; + + SizeVector limit_dims = inData[RANGE_LIMIT]->getTensorDesc().getDims(); + if (limit_dims.size() > 1) + THROW_IE_EXCEPTION << " Limit scalar should have 1 dimension"; + + SizeVector delta_dims = inData[RANGE_DELTA]->getTensorDesc().getDims(); + if (delta_dims.size() > 1) + THROW_IE_EXCEPTION << " Delta scalar should have 1 dimension"; + + SizeVector dst_dims = outData[0]->getTensorDesc().getDims(); + if (dst_dims.size() > 1) + THROW_IE_EXCEPTION << " Output vector should have 1 dimension"; + + if (!(inData[RANGE_START]->getTensorDesc().getPrecision() == Precision::I32 && + inData[RANGE_LIMIT]->getTensorDesc().getPrecision() == Precision::I32 && + inData[RANGE_DELTA]->getTensorDesc().getPrecision() == Precision::I32 && + outData[0]->getTensorDesc().getPrecision() == Precision::I32) && + !(inData[RANGE_START]->getTensorDesc().getPrecision() == Precision::FP32 && + inData[RANGE_LIMIT]->getTensorDesc().getPrecision() == Precision::FP32 && + inData[RANGE_DELTA]->getTensorDesc().getPrecision() == Precision::FP32 && + outData[0]->getTensorDesc().getPrecision() == Precision::FP32)) { + THROW_IE_EXCEPTION << + " 'Start', 'Limit', 'Delta' input scalars and output tensor should have same precision " + << + "and only FP32 and I32 are supported!"; + } + + StatusCode retcode = OK; + switch (outData[0]->precision()) { + case Precision::FP32: { + range((inData[RANGE_START]->cbuffer().as() + +
inData[RANGE_START]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0], + (inData[RANGE_LIMIT]->cbuffer().as() + + inData[RANGE_LIMIT]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0], + (inData[RANGE_DELTA]->cbuffer().as() + + inData[RANGE_DELTA]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0], outData[0]); + } + break; + case Precision::I32: { + range((inData[RANGE_START]->cbuffer().as() + + inData[RANGE_START]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0], + (inData[RANGE_LIMIT]->cbuffer().as() + + inData[RANGE_LIMIT]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0], + (inData[RANGE_DELTA]->cbuffer().as() + + inData[RANGE_DELTA]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0], outData[0]); + } + break; + default: + THROW_IE_EXCEPTION << "Incorrect output precision. Only FP32 and I32 are supported!"; + } + } +}; + +} // namespace ShapeInfer +} // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/shape_infer/const_infer/ie_reshape_const_infer.hpp b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_reshape_const_infer.hpp new file mode 100644 index 0000000..71f470b --- /dev/null +++ b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_reshape_const_infer.hpp @@ -0,0 +1,39 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "ie_const_infer_impl.hpp" + +namespace InferenceEngine { +namespace ShapeInfer { + +/** + *@brief Implementation of Const inference for the Reshape layer + */ +class ReshapeConstInfer : public ConstInferImpl { +public: + explicit ReshapeConstInfer(const std::string& type) : ConstInferImpl(type) {} + + void inferImpl(const std::vector& inData, + const std::map& params, + const std::map& blobs, + std::vector& outData) override { + auto inBlob = *inData.begin(); + const auto* inBuffer = inBlob->cbuffer().as(); + auto outBlob = *outData.begin(); + auto* outBuffer = outBlob->buffer().as(); + ie_memcpy(outBuffer, outBlob->byteSize(), inBuffer, inBlob->byteSize()); + } +}; + +} // namespace ShapeInfer +} // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/shape_infer/const_infer/ie_shape_const_infer.hpp b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_shape_const_infer.hpp new file mode 100644 index 0000000..531104c --- /dev/null +++ b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_shape_const_infer.hpp @@ -0,0 +1,39 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace InferenceEngine { +namespace ShapeInfer { + +/** + *@brief Implementation of Const inference for the Shape layer + */ +class ShapeConstInfer : public ConstInferImpl { +public: + explicit ShapeConstInfer(const std::string& type) : ConstInferImpl(type) {} + + void inferImpl(const std::vector& inData, + const std::map& params, + const std::map& blobs, + std::vector& outData) override { + SizeVector inShape = (*inData.begin())->getTensorDesc().getDims(); + auto outBlob = *outData.begin(); + if (inShape.size() != outBlob->size()) THROW_IE_EXCEPTION << "Number of input dimensions doesn't match the output size"; + auto* outBuffer = outBlob->buffer().as(); + for (int i = 0; i < outBlob->size(); i++) { + outBuffer[i] = inShape[i]; + } + } +}; + +} // namespace ShapeInfer +} // namespace InferenceEngine
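The two implementations above are essentially metadata operations: Reshape const folding copies the raw bytes unchanged (only the tensor descriptor differs), while Shape const folding writes the input dimensions themselves into the output buffer. A plain-array sketch of both, assuming densely packed data (the *_sketch helpers are illustrative, not part of the patch):

#include <cstddef>
#include <cstring>
#include <vector>

// Reshape const folding: same bytes, new descriptor - a flat copy is enough.
void reshape_sketch(const char* in, char* out, std::size_t byteSize) {
    std::memcpy(out, in, byteSize);
}

// Shape const folding: the output tensor holds the input dimensions as data.
void shape_sketch(const std::vector<std::size_t>& inDims, float* out) {
    for (std::size_t i = 0; i < inDims.size(); ++i)
        out[i] = static_cast<float>(inDims[i]);  // one output element per dimension
}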
diff --git a/inference-engine/src/inference_engine/shape_infer/const_infer/ie_split_const_infer.hpp b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_split_const_infer.hpp new file mode 100644 index 0000000..39135b1 --- /dev/null +++ b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_split_const_infer.hpp @@ -0,0 +1,58 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "ie_const_infer_impl.hpp" + +namespace InferenceEngine { +namespace ShapeInfer { + +/** + *@brief Implementation of Const inference for the Split layer + */ +class SplitConstInfer : public ConstInferImpl { +public: + explicit SplitConstInfer(const std::string& type) : ConstInferImpl(type) {} + + void inferImpl(const std::vector& inData, + const std::map& params, + const std::map& blobs, + std::vector& outData) override { + LayerParams lp{}; + SplitLayer layer(lp); + layer.params = params; + layer.type = _type; + _validator->parseParams(&layer); + + auto inBlob = *inData.begin(); + SizeVector inShape = inBlob->getTensorDesc().getDims(); + const auto* inBuffer = inBlob->cbuffer().as(); + + size_t outerSize = 1; + for (int i = 0; i < layer._axis; i++) + outerSize *= inShape[i]; + + for (size_t osIdx = 0; osIdx < outerSize; osIdx++) { + for (auto& outBlob : outData) { + auto* outBuffer = outBlob->buffer().as(); + size_t innerSize = outBlob->size() / outerSize; + + for (size_t j = 0; j < innerSize; j++, inBuffer++) { + outBuffer[osIdx * innerSize + j] = *inBuffer; + } + } + } + } +}; + +} // namespace ShapeInfer +} // namespace InferenceEngine
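SplitConstInfer above makes a single pass over the input: for each outer slice (the product of dimensions before layer._axis) it deals a chunk of innerSize elements to every output blob in turn. A simplified sketch assuming all outputs take equal-sized chunks (the real code above derives each output's innerSize from its own blob size; split_sketch is a hypothetical helper):

#include <cstddef>
#include <vector>

// Sketch: split a flat buffer along an axis; outerSize = product of dims before
// the axis, innerSize = per-output chunk of the remaining dims.
void split_sketch(const float* in, std::vector<float*>& outs,
                  std::size_t outerSize, std::size_t innerSize) {
    for (std::size_t os = 0; os < outerSize; ++os)
        for (float* out : outs)                 // each output consumes the input in turn
            for (std::size_t j = 0; j < innerSize; ++j)
                out[os * innerSize + j] = *in++;
}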
diff --git a/inference-engine/src/inference_engine/shape_infer/const_infer/ie_strided_slice_const_infer.hpp b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_strided_slice_const_infer.hpp new file mode 100644 index 0000000..6aee61d --- /dev/null +++ b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_strided_slice_const_infer.hpp @@ -0,0 +1,384 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#define NOMINMAX + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ie_const_infer_impl.hpp" +#include "ie_parallel.hpp" + +namespace InferenceEngine { +namespace ShapeInfer { + +class StridedSliceHelper { +public: + StridedSliceHelper(const std::vector& inData, + const std::map& params) { + LayerParams lp{}; + CNNLayer layer(lp); + layer.params = params; + + src_data = inData[STRIDEDSLICE_DATA]->cbuffer().as() + + inData[STRIDEDSLICE_DATA]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + + if (inData.size() > 4) + THROW_IE_EXCEPTION << " Incorrect number of input/output edges!"; + + src_dims = inData[STRIDEDSLICE_DATA]->getTensorDesc().getDims(); + + bounds_size = 0; + if (inData.size() > 1) { + begin_dims = inData[STRIDEDSLICE_BEGIN]->getTensorDesc().getDims(); + if (inData[STRIDEDSLICE_BEGIN]->getTensorDesc().getPrecision() != Precision::I32) + THROW_IE_EXCEPTION << " Incorrect 'begin' input precision. Only I32 is supported!"; + if (begin_dims.size() > 1) + THROW_IE_EXCEPTION << " Begin vector should be 1 dimension"; + bounds_size = begin_dims[0]; + } + + if (inData.size() > 2) { + end_dims = inData[STRIDEDSLICE_END]->getTensorDesc().getDims(); + if (inData[STRIDEDSLICE_END]->getTensorDesc().getPrecision() != Precision::I32) + THROW_IE_EXCEPTION << " Incorrect 'end' input precision. Only I32 is supported!"; + if (end_dims.size() > 1) + THROW_IE_EXCEPTION << " End vector should be 1 dimension"; + if (begin_dims[0] != end_dims[0]) + THROW_IE_EXCEPTION << " Begin vector size should be equal to end vector size"; + } + + if (inData.size() > 3) { + stride_dims = inData[STRIDEDSLICE_STRIDE]->getTensorDesc().getDims(); + if (inData[STRIDEDSLICE_STRIDE]->getTensorDesc().getPrecision() != Precision::I32) + THROW_IE_EXCEPTION << " Incorrect 'strides' input precision. Only I32 is supported!"; + if (stride_dims.size() > 1) + THROW_IE_EXCEPTION << " Stride vector should be 1 dimension"; + if (begin_dims[0] != stride_dims[0]) + THROW_IE_EXCEPTION << " Stride vector size should be equal to begin vector size"; + } + + std::string::size_type i; + std::string begin_mask_str = layer.GetParamAsString("begin_mask", ""); + for (i = 0; i < begin_mask_str.size(); ++i) { + if (begin_mask_str[i] == '1') begin_mask.push_back(1); + else if (begin_mask_str[i] == '0') begin_mask.push_back(0); + } + for (; i < src_dims.size(); ++i) begin_mask.push_back(1); + + std::string end_mask_str = layer.GetParamAsString("end_mask", ""); + for (i = 0; i < end_mask_str.size(); ++i) { + if (end_mask_str[i] == '1') end_mask.push_back(1); + else if (end_mask_str[i] == '0') end_mask.push_back(0); + } + for (; i < src_dims.size(); ++i) end_mask.push_back(1); + + std::string ellipsis_mask_str = layer.GetParamAsString("ellipsis_mask", ""); + size_t ellipsis_mask_counter = 0; + for (i = 0; i < ellipsis_mask_str.size(); ++i) { + if (ellipsis_mask_str[i] == '1') { + ellipsis_mask_counter++; + ellipsis_mask.push_back(1); + } else if (ellipsis_mask_str[i] == '0') { + ellipsis_mask.push_back(0); + } + } + if (ellipsis_mask_counter > 1) + THROW_IE_EXCEPTION << " 'Ellipsis_mask' must be a power of two (only one ellipsis)!"; + for (; i < src_dims.size(); ++i) ellipsis_mask.push_back(0); + + std::string new_axis_mask_str = layer.GetParamAsString("new_axis_mask", ""); + for (i = 0; i < new_axis_mask_str.size(); ++i) { + if (new_axis_mask_str[i] == '1') new_axis_mask.push_back(1); + else if (new_axis_mask_str[i] == '0') new_axis_mask.push_back(0); + } + for (; i < src_dims.size(); ++i) new_axis_mask.push_back(0); + + std::string shrink_axis_mask_str = layer.GetParamAsString("shrink_axis_mask", ""); + for (i = 0; i < shrink_axis_mask_str.size(); ++i) { + if (shrink_axis_mask_str[i] == '1') shrink_axis_mask.push_back(1); + else if (shrink_axis_mask_str[i] == '0') shrink_axis_mask.push_back(0); + } + for (; i < src_dims.size(); ++i) shrink_axis_mask.push_back(0); + + int new_axis = 0; + for (auto& na : new_axis_mask) + new_axis += na; + + shrink_axis = 0; + for (auto& sa : shrink_axis_mask) + shrink_axis += sa; + max_dims = src_dims.size() + new_axis; + + // ellipsis_mask must be a power of two (only one ellipsis), so take the first position + ellipsis_pos1 = ellipsis_pos2 = max_dims; + for (i = 0; i < ellipsis_mask.size(); i++) { + if (ellipsis_mask[i] > 0) { + ellipsis_pos1 = i; + break; + } + } + bounds_size -= ellipsis_pos1; + if (bounds_size > 0 && (max_dims - bounds_size) > ellipsis_pos1) + ellipsis_pos2 = max_dims - bounds_size; + + begin_dms.assign(max_dims, 0); + end_dms.assign(max_dims, -1); + stride_dms.assign(max_dims, 1); + + srcStrides = inData[STRIDEDSLICE_DATA]->getTensorDesc().getBlockingDesc().getStrides(); + + int* begin = nullptr, * end = nullptr, * stride = nullptr; + if (begin_dims.size()) + begin = inData[STRIDEDSLICE_BEGIN]->cbuffer().as() + +
inData[STRIDEDSLICE_BEGIN]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + if (end_dims.size()) + end = inData[STRIDEDSLICE_END]->cbuffer().as() + + inData[STRIDEDSLICE_END]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + if (stride_dims.size()) + stride = inData[STRIDEDSLICE_STRIDE]->cbuffer().as() + + inData[STRIDEDSLICE_STRIDE]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + + int j, k, bj, ej, sj; + for (i = 0, j = 0, k = 0, bj = 0, ej = 0, sj = 0; i < max_dims; i++) { + if (i >= ellipsis_pos1 && i < ellipsis_pos2) { + if (new_axis_mask.size() > i && new_axis_mask[i] == 1) + end_dms[i] = 0; + else + end_dms[i] = end_dms[i] >= 0 ? end_dms[i] : src_dims[j++] + end_dms[i]; + + out_dims.push_back(static_cast(ceil(static_cast(abs(end_dms[i] - begin_dms[i]) + 1) / + static_cast(abs(stride_dms[i]))))); + our_dims.push_back(static_cast(ceil(static_cast(abs(end_dms[i] - begin_dms[i]) + 1) / + static_cast(abs(stride_dms[i]))))); + k = ellipsis_pos1; + } else { + stride_dms[i] = (stride != nullptr && stride_dims[0] > sj && stride[sj] != 0) ? stride[sj++] : 1; + + if (begin_mask.size() > j && begin_mask[j] == 0) + begin_dms[i] = stride_dms[i] > 0 ? 0 : -1; + else + begin_dms[i] = (begin != nullptr && begin_dims[0] > bj) ? begin[bj] : (stride_dms[i] > 0 ? 0 : -1); + bj++; + begin_dms[i] = begin_dms[i] >= 0 ? begin_dms[i] : src_dims[j] + begin_dms[i]; + // Clipping 'begin' + details::clipping(&begin_dms[i], 0, src_dims[j]); + + if (end_mask.size() > j && end_mask[j] == 0) { + end_dms[i] = stride_dms[i] > 0 ? -1 : 0; + } else { + int end_dms_tmp = (end != nullptr && end_dims[0] > ej) ? (stride_dms[i] > 0 ? end[ej] - 1 : end[ej] + 1) + : end_dms[i]; + end_dms[i] = (end != nullptr && end_dims[0] > ej) ? end_dms_tmp : (stride_dms[i] > 0 ? -1 : 0); + } + ej++; + end_dms[i] = end_dms[i] >= 0 ? end_dms[i] : src_dims[j] + end_dms[i]; + // Clipping 'end' + details::clipping(&end_dms[i], 0, src_dims[j]); + + if (new_axis_mask.size() > i && new_axis_mask[i] == 1) + end_dms[i] = 0; + else + j++; + + if (shrink_axis_mask.size() > k && shrink_axis_mask[k] == 1) + end_dms[i] = begin_dms[i]; + else + out_dims.push_back(static_cast(ceil(static_cast(abs(end_dms[i] - begin_dms[i]) + 1) / + static_cast(abs(stride_dms[i]))))); + + our_dims.push_back(static_cast(ceil(static_cast(abs(end_dms[i] - begin_dms[i]) + 1) / + static_cast(abs(stride_dms[i]))))); + k++; + } + } + } + + SizeVector getOutputShape() { + return out_dims; + } + + void infer(std::vector& outData) { + dst_dims = outData[0]->getTensorDesc().getDims(); + size_t range = out_dims.size() < dst_dims.size() ? 
out_dims.size() : dst_dims.size(); + for (int i = 0; i < range; i++) { + if (out_dims[i] != dst_dims[i]) + THROW_IE_EXCEPTION << "parameter mismatch"; + } + dstStrides = outData[0]->getTensorDesc().getBlockingDesc().getStrides(); + if (outData.size() != 1) + THROW_IE_EXCEPTION << " Incorrect number of input/output edges!"; + float* dst_data = outData[0]->cbuffer().as() + + outData[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + + if (src_dims.size() == max_dims && shrink_axis == 0 && stride_dms[stride_dms.size() - 1] == 1 && + stride_dms.size() > 1) + strided_slice_vp(src_data, dst_data); + else if (src_dims.size() == max_dims && shrink_axis == 0) + strided_slice_p(src_data, dst_data); + else + strided_slice(src_data, dst_data, our_dims); + } + +private: + void strided_slice(const float* src_data, float* dst_data, std::vector& dims) { + size_t i; + int j; + size_t work_amount_dst = dstStrides[0] * dst_dims[0]; + SizeVector counters(max_dims, 0); + + for (size_t iwork = 0; iwork < work_amount_dst; ++iwork) { + int src_idx = 0; + for (i = 0, j = 0; i < max_dims; ++i) { + src_idx += (begin_dms[i] + counters[i] * stride_dms[i]) * srcStrides[j]; + if (!(new_axis_mask.size() > i && new_axis_mask[i] == 1)) j++; + } + + dst_data[iwork] = src_data[src_idx]; + + for (j = max_dims - 1; j >= 0; j--) { + counters[j]++; + if (counters[j] < dims[j]) + break; + else + counters[j] = 0; + } + } + } + + void strided_slice_vp(const float* src_data, float* dst_data) { + // Vectorized copy + size_t dims_size_1 = dst_dims.size() - 1; + size_t dataLength = dst_dims[dims_size_1]; + size_t work_amount_dst = dstStrides[0] * dst_dims[0] / dst_dims[dims_size_1]; + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t i, start = 0, end = 0; + SizeVector counters(dims_size_1, 0); + splitter(work_amount_dst, nthr, ithr, start, end); + int src_idx = begin_dms[dims_size_1]; + for (int j = dims_size_1 - 1, i = start; j >= 0; j--) { + counters[j] = i % dst_dims[j]; + src_idx += (begin_dms[j] + counters[j] * stride_dms[j]) * srcStrides[j]; + i /= dst_dims[j]; + } + + for (size_t iwork = start, dst_idx = start * dataLength, i = 1; + iwork < end; ++iwork, dst_idx += dataLength) { + memcpy(&dst_data[dst_idx], &src_data[src_idx], sizeof(float) * dataLength); + for (int j = dims_size_1 - 1; j >= 0; j--) { + counters[j]++; + if (counters[j] < dst_dims[j]) { + src_idx += stride_dms[j] * srcStrides[j]; + break; + } else { + counters[j] = i = 0; + } + } + if (!i) { + for (src_idx = begin_dms[dims_size_1]; i < dims_size_1; ++i) + src_idx += (begin_dms[i] + counters[i] * stride_dms[i]) * srcStrides[i]; + } + } + }); + } + + void strided_slice_p(const float* src_data, float* dst_data) { + size_t dims_size = dst_dims.size(); + size_t work_amount_dst = dstStrides[0] * dst_dims[0]; + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t i, start = 0, end = 0; + SizeVector counters(dims_size, 0); + splitter(work_amount_dst, nthr, ithr, start, end); + int src_idx = 0; + for (int j = dims_size - 1, i = start; j >= 0; j--) { + counters[j] = i % dst_dims[j]; + src_idx += (begin_dms[j] + counters[j] * stride_dms[j]) * srcStrides[j]; + i /= dst_dims[j]; + } + + for (size_t iwork = start, dst_idx = start, i = 1; iwork < end; ++iwork, dst_idx++) { + dst_data[dst_idx] = src_data[src_idx]; + for (int j = dims_size - 1; j >= 0; j--) { + counters[j]++; + if (counters[j] < dst_dims[j]) { + src_idx += stride_dms[j] * srcStrides[j]; + break; + } else { + counters[j] = i = 0; + } + } + if (!i) { + for (src_idx = 0; i < 
dims_size; ++i) + src_idx += (begin_dms[i] + counters[i] * stride_dms[i]) * srcStrides[i]; + } + } + }); + } + +private: + const size_t STRIDEDSLICE_DATA = 0; + const size_t STRIDEDSLICE_BEGIN = 1; + const size_t STRIDEDSLICE_END = 2; + const size_t STRIDEDSLICE_STRIDE = 3; + + SizeVector begin_dims; + SizeVector end_dims; + SizeVector stride_dims; + + SizeVector begin_mask; + SizeVector end_mask; + SizeVector ellipsis_mask; + SizeVector new_axis_mask; + SizeVector shrink_axis_mask; + int shrink_axis; + + SizeVector src_dims; + SizeVector dst_dims; + std::vector begin_dms; + std::vector end_dms; + std::vector stride_dms; + SizeVector srcStrides; + SizeVector dstStrides; + size_t bounds_size; + size_t max_dims; + size_t ellipsis_pos1, ellipsis_pos2; + + InferenceEngine::SizeVector out_dims; + InferenceEngine::SizeVector our_dims; + const float* src_data; +}; + +/** + *@brief Implementation of Const inference for the StridedSlice layer + */ +class StridedSliceConstInfer : public ConstInferImpl { +public: + explicit StridedSliceConstInfer(const std::string& type) : ConstInferImpl(type) {} + + void inferImpl(const std::vector& inData, + const std::map& params, + const std::map& blobs, + std::vector& outData) override { + LayerParams lp{}; + StridedSliceLayer layer(lp); + layer.params = params; + layer.type = _type; + _validator->parseParams(&layer); + + StridedSliceHelper helper(inData, params); + helper.infer(outData); + } +}; + +} // namespace ShapeInfer +} // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/shape_infer/const_infer/ie_tile_const_infer.hpp b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_tile_const_infer.hpp new file mode 100644 index 0000000..3147a45 --- /dev/null +++ b/inference-engine/src/inference_engine/shape_infer/const_infer/ie_tile_const_infer.hpp @@ -0,0 +1,60 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "ie_const_infer_impl.hpp" + +namespace InferenceEngine { +namespace ShapeInfer { + +/** + *@brief Implementation of Const inference for the Tile layer + */ +class TileConstInfer : public ConstInferImpl { +public: + explicit TileConstInfer(const std::string& type) : ConstInferImpl(type) {} + + void inferImpl(const std::vector& inData, + const std::map& params, + const std::map& blobs, + std::vector& outData) override { + LayerParams lp{}; + TileLayer layer(lp); + layer.params = params; + layer.type = _type; + _validator->parseParams(&layer); + + auto inBlob = *inData.begin(); + SizeVector inShape = inBlob->getTensorDesc().getDims(); + const auto* inBuffer = inBlob->cbuffer().as(); + + auto outBlob = *outData.begin(); + auto* outBuffer = outBlob->buffer().as(); + + int m_outer_dim = 1; + int m_inner_dim = 1; + + for (int i = 0; i < layer.axis; i++) m_outer_dim *= inShape[i]; + for (int i = layer.axis; i < inShape.size(); i++) m_inner_dim *= inShape[i]; + + for (int i = 0; i < m_outer_dim; ++i) { + for (int t = 0; t < layer.tiles; ++t) { + ie_memcpy(outBuffer, outBlob->byteSize(), inBuffer, m_inner_dim * sizeof(float)); + outBuffer += m_inner_dim; + } + inBuffer += m_inner_dim; + } + } +}; + +} // namespace ShapeInfer +} // namespace InferenceEngine
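TileConstInfer above flattens the tensor into m_outer_dim contiguous slices of m_inner_dim elements and writes each slice layer.tiles times in a row. A standalone sketch of that copy pattern (tile_sketch is a hypothetical helper, not part of the patch):

#include <cstddef>
#include <cstring>

// Sketch: repeat each of 'outer' contiguous slices of 'inner' floats 'tiles' times.
void tile_sketch(const float* in, float* out,
                 std::size_t outer, std::size_t inner, std::size_t tiles) {
    for (std::size_t i = 0; i < outer; ++i) {
        for (std::size_t t = 0; t < tiles; ++t) {
            std::memcpy(out, in, inner * sizeof(float));
            out += inner;                        // advance past the copy just written
        }
        in += inner;                             // move to the next input slice
    }
}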
diff --git a/inference-engine/src/inference_engine/shape_infer/ie_reshape_io_controllers.cpp b/inference-engine/src/inference_engine/shape_infer/ie_reshape_io_controllers.cpp index fafd651..ed1d337 100644 --- a/inference-engine/src/inference_engine/shape_infer/ie_reshape_io_controllers.cpp +++ b/inference-engine/src/inference_engine/shape_infer/ie_reshape_io_controllers.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -7,6 +7,7 @@ #include #include #include +#include #include "shape_infer/ie_reshape_io_controllers.hpp" using namespace InferenceEngine; @@ -28,6 +29,8 @@ InputController::InputController(const std::vector& dataVec, const std: _dataNames.push_back(data->name); SizeVector dims = data->getTensorDesc().getDims(); _irShapes.push_back(dims); + // TODO: blobs should probably be created with dimensions up front, not at the getBlobs stage + _inferedData.push_back(nullptr); } } _shapes = _irShapes; @@ -38,6 +41,11 @@ void InputController::setShapeByName(const SizeVector& shape, const std::string& _shapes[pos] = shape; } +SizeVector InputController::getShapeByName(const std::string& dataName) { + long pos = getPositionByName(dataName); + return _shapes[pos]; +} + std::vector InputController::getShapes(bool check) { if (check) checkCorrespondence(); return _shapes; @@ -57,9 +65,6 @@ void InputController::checkCorrespondence() { << ") doesn't match with number of shapes(" << _shapes.size() << ") for layer '" << _layerName << "'!"; } - for (const auto& shape : _shapes) { - if (shape.empty()) THROW_IE_EXCEPTION << "ReshapeLauncher error: shape is not set"; - } // TODO: iterate and check for emptiness and size matching } @@ -93,6 +98,34 @@ void InputController::setShapeByIndex(const SizeVector& shape, size_t index) { _shapes[index] = shape; } +bool InputController::isDataAvailable() { + if (_inferedData.empty()) return false; + for (const auto& data : _inferedData) { + if (!data) return false; + else if (data->cbuffer() == nullptr) return false; + } + return true; +} + +std::vector InputController::getBlobs(bool check) { + if (check) checkCorrespondence(); + for (int i = 0; i < _dataVec.size(); i++) { + if (_inferedData[i] == nullptr || _inferedData[i]->cbuffer() == nullptr) { + TensorDesc desc = _dataVec[i]->getTensorDesc(); + desc.setDims(_shapes[i]); + // special case of Shape layer: no input data, but the blob still carries info about dimensions, layout, etc.
+ auto blob = make_blob_with_precision(desc); + _inferedData[i] = blob; + } + } + return _inferedData; +} + +void InputController::setBlobByName(const Blob::CPtr& blob, const std::string& dataName) { + long pos = getPositionByName(dataName); + _inferedData[pos] = blob; +} + OutputController::OutputController(const std::vector& data, const std::string& layerName, const DefaultChecker::Ptr& checker) : InputController(data, layerName, checker) {} @@ -120,6 +153,49 @@ void OutputController::propagateShapes(const std::set& lau } } +// Combine with propagate shapes +void OutputController::propagateBlobs(const std::set& launchers) { + unsigned idx = 0; + for (auto const& outData : _dataVec) { + for (auto const& inputTo : outData->inputTo) { + CNNLayerPtr layer = inputTo.second; + if (layer == nullptr) { + THROW_IE_EXCEPTION << "Failed to propagate shapes for layer (" << inputTo.first + << "): connected layer is null"; + } + auto layerName = layer->name; + auto foundLauncher = std::find_if(launchers.begin(), launchers.end(), + [&layerName](const ReshapeLauncher::Ptr& launcher) { + return launcher->getLayerName() == layerName; + }); + if (foundLauncher == launchers.end()) + THROW_IE_EXCEPTION << "Failed to find ReshapeLauncher for layer: '" << layerName << "'"; + (*foundLauncher)->setBlobByName(_inferedData[idx], outData->name); + } + idx++; + } +} + void OutputController::setShapes(const std::vector& shapes) { _shapes = shapes; } + +void OutputController::setBlobs(const std::vector& blobs) { + _inferedData.clear(); + for (const auto& blob : blobs) { + _inferedData.push_back(blob); + } +} + +std::vector OutputController::createBlobs() { + std::vector blobs; + for (int i = 0; i < _dataVec.size(); i++) { + TensorDesc desc = _dataVec[i]->getTensorDesc(); + desc.setDims(_shapes[i]); + auto blob = make_blob_with_precision(desc); + blob->allocate(); + blobs.push_back(blob); + } + return blobs; +} + diff --git a/inference-engine/src/inference_engine/shape_infer/ie_reshape_io_controllers.hpp b/inference-engine/src/inference_engine/shape_infer/ie_reshape_io_controllers.hpp index c553a73..f6d1044 100644 --- a/inference-engine/src/inference_engine/shape_infer/ie_reshape_io_controllers.hpp +++ b/inference-engine/src/inference_engine/shape_infer/ie_reshape_io_controllers.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -54,6 +54,11 @@ public: virtual void setShapeByName(const SizeVector& shape, const std::string& dataName); /** + * @brief Return calculated shape for name. + */ + virtual SizeVector getShapeByName(const std::string& dataName); + + /** * @brief Set shape for current reshape launcher by corresponding index. 
* @param shape - shape to be set * @param index - shape's index @@ -95,6 +100,12 @@ public: virtual void checkCorrespondence(); + virtual bool isDataAvailable(); + + virtual std::vector getBlobs(bool check); + + virtual void setBlobByName(const Blob::CPtr& blob, const std::string& name); + private: long getPositionByName(const std::string& dataName); @@ -104,6 +115,7 @@ protected: std::vector _irShapes; std::vector _dataNames; std::string _layerName; + std::vector _inferedData; }; /** @@ -122,6 +134,12 @@ public: virtual void propagateShapes(const std::set& launchers); virtual void setShapes(const std::vector& shapes); + + virtual void setBlobs(const std::vector& blobs); + + std::vector createBlobs(); + + void propagateBlobs(const std::set& set); }; } // namespace ShapeInfer diff --git a/inference-engine/src/inference_engine/shape_infer/ie_reshape_launcher.cpp b/inference-engine/src/inference_engine/shape_infer/ie_reshape_launcher.cpp index c2651a0..d64c3bb 100644 --- a/inference-engine/src/inference_engine/shape_infer/ie_reshape_launcher.cpp +++ b/inference-engine/src/inference_engine/shape_infer/ie_reshape_launcher.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -10,8 +10,12 @@ #include #include #include
+#include #include "shape_infer/ie_reshape_launcher.hpp" #include "shape_infer/ie_reshape_io_controllers.hpp" +#include "ie_reshape_launcher.hpp" + +#include "built-in/ie_tensor_iterator_shape_infer.hpp" using namespace InferenceEngine; using namespace ShapeInfer; @@ -35,8 +39,10 @@ OutputController* DefaultInitializer::createOutputController(const CNNLayer* lay } ReshapeLauncher::ReshapeLauncher(const CNNLayer* layer, const IShapeInferImpl::Ptr& impl, - const DefaultInitializer::Ptr& initializer) : _layer(layer), _impl(impl) { + const DefaultInitializer::Ptr& initializer) : _layer(layer), _reshapeImpl(impl) { initializer->check(layer, impl); + ConstInferHolder holder; + if (layer) _inferImpl = holder.getConstInferImpl(layer->type); try { _iController = initializer->createInputController(layer); _oController = initializer->createOutputController(layer); @@ -59,13 +65,37 @@ void ReshapeLauncher::setShapeByName(const SizeVector& shape, const std::string& _iController->setShapeByName(shape, dataName); } +void ReshapeLauncher::setBlobByName(const Blob::CPtr& blob, const std::string& dataName) { + _iController->setBlobByName(blob, dataName); +} + +SizeVector ReshapeLauncher::getShapeByName(const std::string& dataName) { + return _oController->getShapeByName(dataName); +} + void ReshapeLauncher::reshape(const std::set& launchers) { ResponseDesc resp; std::vector outShapes; - auto sts = _impl->inferShapes(_iController->getShapes(true), _layer->params, _layer->blobs, outShapes, &resp); + + // TODO: TensorIterator strongly required original layer instance because body is not presented + // in params map. Original subnetwork body is required for internal shape infer + TensorIteratorShapeProp *TI_shaper = dynamic_cast(_reshapeImpl.get()); + if (TI_shaper) { + TI_shaper->setOriginalLayer(_layer); + } + + // try to call new API with input blobs + auto sts = _reshapeImpl->inferShapes(_iController->getBlobs(true), _layer->params, _layer->blobs, outShapes, &resp); + // in case of old custom shape infer function call old API + if (sts == NOT_IMPLEMENTED) { + sts = _reshapeImpl->inferShapes(_iController->getShapes(true), _layer->params, _layer->blobs, outShapes, + &resp); + } _oController->setShapes(outShapes); if (sts != OK) - THROW_IE_EXCEPTION << resp.msg; + THROW_IE_EXCEPTION << + "Failed to infer shapes for " + _layer->type + " layer (" + _layer->name + ") with error: " + + resp.msg; _oController->propagateShapes(launchers); } @@ -73,6 +103,23 @@ void ReshapeLauncher::applyChanges(CNNLayer* layer) { checkLayer(layer); _iController->applyChanges(); _oController->applyChanges(); + + // TODO: Need to finalize result of internal body shape infer and apply + // new shapes to body subnetwork + TensorIteratorShapeProp *TI_shaper = dynamic_cast(_reshapeImpl.get()); + if (TI_shaper) TI_shaper->apply(); +} + +void ReshapeLauncher::constInfer(const std::set& launchers) { + if (_iController->isDataAvailable() || _layer->type == "Const" || _layer->type == "Shape") { + auto outBlobs = _oController->createBlobs(); + _oController->setBlobs(outBlobs); + if (!_inferImpl) + THROW_IE_EXCEPTION << "Failed to find reference implementation for `" + + _layer->name + "` Layer with `" + _layer->type + "` Type on constant propagation"; + _inferImpl->infer(_iController->getBlobs(false), _layer->params, _layer->blobs, outBlobs); + _oController->propagateBlobs(launchers); + } } void ReshapeLauncher::reset() { @@ -106,7 +153,7 @@ void ReshapeLauncher::setIRShapeByName(const std::string& dataName) { } void 
ReshapeLauncher::setShapeInferImpl(const IShapeInferImpl::Ptr& impl) { - _impl = impl; + _reshapeImpl = impl; } const CNNLayer* ReshapeLauncher::getLayer() const { @@ -178,6 +225,10 @@ void OutputOnlyReshapeLauncher::setShapeByName(const SizeVector& shape, const st _oController->setShapeByName(shape, dataName); } +void OutputOnlyReshapeLauncher::setBlobByName(const Blob::CPtr& blob, const std::string& dataName) { + _oController->setBlobByName(blob, dataName); +} + void OutputOnlyReshapeLauncher::setIRShapeByName(const std::string& dataName) { SizeVector foundShape = _oController->getIRShapeByName(dataName); _oController->setShapeByName(foundShape, dataName); @@ -192,6 +243,23 @@ void OutputOnlyReshapeLauncher::reset() { _oController->reset(); } +void OutputOnlyReshapeLauncher::constInfer(const std::set& launchers) { + if (_layer->type == "Const") { + auto outBlobs = _oController->createBlobs(); + _oController->setBlobs(outBlobs); + if (!_inferImpl) + THROW_IE_EXCEPTION << "Failed to find reference implementation for `" + + _layer->name + "` Layer with `" + _layer->type + "` Type on constant propagation"; + _inferImpl->infer({}, _layer->params, _layer->blobs, outBlobs); + auto shapes = _oController->getShapes(true); + for (int i = 0; i < outBlobs.size(); i++) { + outBlobs[i]->Reshape(SizeVector(shapes[i].rbegin(), shapes[i].rend()), TensorDesc::getLayoutByDims(shapes[i])); + } + _oController->setBlobs(outBlobs); + _oController->propagateBlobs(launchers); + } +} + void InputInitializer::check(const CNNLayer* layer, const IShapeInferImpl::Ptr& impl) { OutputOnlyInitializer::check(layer, impl); std::string errorBase = "Failed to init reshape launcher: layer type (`" + layer->type + "`) is not"; @@ -263,9 +331,6 @@ OutMemoryReshapeLauncher::OutMemoryReshapeLauncher(const CNNLayer* layer, const : ReshapeLauncher(layer, impl, std::make_shared()) { } -void OutMemoryReshapeLauncher::reshape(const std::set& launchers) { -} - void OutMemoryReshapeLauncher::applyChanges(CNNLayer* layer) { checkLayer(layer); _iController->applyChanges(); diff --git a/inference-engine/src/inference_engine/shape_infer/ie_reshape_launcher.hpp b/inference-engine/src/inference_engine/shape_infer/ie_reshape_launcher.hpp index 5a9de53..28083c6 100644 --- a/inference-engine/src/inference_engine/shape_infer/ie_reshape_launcher.hpp +++ b/inference-engine/src/inference_engine/shape_infer/ie_reshape_launcher.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -12,6 +12,7 @@ #include #include +#include "shape_infer/const_infer/ie_const_infer_impl.hpp" #include "shape_infer/built-in/ie_built_in_holder.hpp" namespace InferenceEngine { @@ -60,6 +61,14 @@ public: */ virtual void setShapeByName(const SizeVector& shape, const std::string& dataName); + virtual void setBlobByName(const Blob::CPtr& blob, const std::string& dataName); + + /** + * @brief Return calculated shape for data with requested name. + * @return Result shape + */ + virtual SizeVector getShapeByName(const std::string& dataName); + /** * @brief Set input shape from IR by Data name. If there's no Data with given name it throws exception * @param dataName - name of the corresponding Data. @@ -74,6 +83,8 @@ public: */ virtual void reshape(const std::set& launchers); + virtual void constInfer(const std::set& launchers); + /** * @brief Apply new input shapes, calculated output shapes and changed layer's params to CNNLayer and Data. 
* @param layer - pointer to the layer for setting changes in layer's params @@ -86,7 +97,6 @@ public: */ virtual void reset(); - // TODO: use layer instead? virtual std::string getLayerName() const; virtual std::string getLayerType() const; @@ -99,7 +109,8 @@ protected: InputController* _iController = nullptr; OutputController* _oController = nullptr; const CNNLayer* _layer; - IShapeInferImpl::Ptr _impl; + IShapeInferImpl::Ptr _reshapeImpl; + IConstInferImpl::Ptr _inferImpl; protected: /** @@ -134,6 +145,8 @@ public: FakeReshapeLauncher(const CNNLayer* layer, const IShapeInferImpl::Ptr& impl); void reshape(const std::set& launchers) override; + + void constInfer(const std::set& launchers) override {} }; class OutputOnlyInitializer : public DefaultInitializer { @@ -163,6 +176,10 @@ public: void applyChanges(CNNLayer* layer) override; void reset() override; + + void setBlobByName(const Blob::CPtr& blob, const std::string& dataName) override; + + void constInfer(const std::set& launchers) override; }; class InputInitializer : public OutputOnlyInitializer { @@ -222,11 +239,13 @@ public: OutMemoryReshapeLauncher(const CNNLayer* layer1, const IShapeInferImpl::Ptr& impl1); - void reshape(const std::set& launchers) override; + void reshape(const std::set& launchers) override {} void applyChanges(CNNLayer* layer) override; void reset() override; + + void constInfer(const std::set& launchers) override {} }; } // namespace ShapeInfer diff --git a/inference-engine/src/inference_engine/shape_infer/ie_reshaper.cpp b/inference-engine/src/inference_engine/shape_infer/ie_reshaper.cpp index 89dd72e..53e39fc 100644 --- a/inference-engine/src/inference_engine/shape_infer/ie_reshaper.cpp +++ b/inference-engine/src/inference_engine/shape_infer/ie_reshaper.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -10,20 +10,53 @@ #include #include #include +#include +#include #include "shape_infer/built-in/ie_built_in_holder.hpp" #include "shape_infer/ie_reshaper.hpp" #include "details/caseless.hpp" #include "details/ie_cnn_network_tools.h" #include "ie_reshaper.hpp" +#include "ie_cnn_layer_builder.h" using namespace InferenceEngine; using namespace InferenceEngine::details; using namespace ShapeInfer; -Reshaper::Reshaper(const Context &context, Network::Ptr& network): ctx(context), network(network) {} +Reshaper::Reshaper(Builder::Network* network): network(network) {} -Reshaper::Reshaper(ICNNNetwork& network, const LauncherCreator::Ptr& launcherCreator) { +static std::vector SortTopologicallyStartsFrom(const std::vector &inputs) { + std::vector all_layers; + CNNNetForestDFS(inputs, [&](CNNLayerPtr current){ + all_layers.push_back(current); + }, false); + std::reverse(all_layers.begin(), all_layers.end()); + return all_layers; +} + +Reshaper::Reshaper(std::vector insDatas, const LauncherCreator::Ptr& launcherCreator): network(nullptr) { + auto builtIn = std::make_shared(); + _allTypes = getTypeNamesFromExtension(builtIn); + _extensions.push_back(builtIn); + + _allSortedLayers = SortTopologicallyStartsFrom(insDatas); + for (auto &in_data : insDatas) { + for (auto layer : in_data->inputTo) { + _inputLayers.insert(layer.second); + } + } + + if (_inputLayers.empty() || _allSortedLayers.empty()) + THROW_IE_EXCEPTION << "Unsupported model for shape inference: failed to collect inputs and layers"; + + for (auto const& currentLayer : _allSortedLayers) { + auto createdLauncher = 
launcherCreator->createNotInputLauncher(currentLayer.get(), _extensions); + _launchers.insert(createdLauncher); + } +} + +Reshaper::Reshaper(ICNNNetwork& network, const LauncherCreator::Ptr& launcherCreator): network(nullptr) { auto builtIn = std::make_shared(); _allTypes = getTypeNamesFromExtension(builtIn); _extensions.push_back(builtIn); @@ -55,7 +88,7 @@ void Reshaper::AddExtension(const IShapeInferExtensionPtr& extension) { if (!extension) THROW_IE_EXCEPTION << "Failed to add empty shape infer extension"; if (network) { - ctx.addExtension(extension); + network->getContext().addExtension(extension); return; } @@ -139,8 +172,48 @@ StatusCode Reshaper::run(const std::map& inputShapes, R for (auto& layer : _allSortedLayers) { auto foundLauncher = getLauncherByLayerName(layer->name); foundLauncher->reshape(_launchers); + foundLauncher->constInfer(_launchers); + } + + // apply changes + for (auto& layer : _allSortedLayers) { + auto foundLauncher = getLauncherByLayerName(layer->name); + foundLauncher->applyChanges(layer.get()); + } + return OK; +} + +StatusCode Reshaper::runNoApply(const std::map& inputShapes, ResponseDesc* resp) { + // Reset all shapes from previous run + for (const auto& launcher : _launchers) { + launcher->reset(); + } + + // Set new input shapes + for (auto const& input : _inputLayers) { + std::string layerName = input->name; + for (auto const& inData_w : input->insData) { + auto inData = inData_w.lock(); + auto dataName = inData->name; + auto foundShapeIt = inputShapes.find(dataName); + auto foundLauncher = getLauncherByLayerName(layerName); + if (foundShapeIt != inputShapes.end()) { + foundLauncher->setShapeByName(foundShapeIt->second, dataName); + } else { + foundLauncher->setIRShapeByName(dataName); + } + } + } + + // do reshape + for (auto& layer : _allSortedLayers) { + auto foundLauncher = getLauncherByLayerName(layer->name); + foundLauncher->reshape(_launchers); } + return OK; +} +StatusCode Reshaper::apply(ResponseDesc* resp) { // apply changes for (auto& layer : _allSortedLayers) { auto foundLauncher = getLauncherByLayerName(layer->name); @@ -149,11 +222,21 @@ StatusCode Reshaper::run(const std::map& inputShapes, R return OK; } +SizeVector Reshaper::getResultShapeFor(DataPtr &data, ResponseDesc* resp) { + auto creator_layer = data->creatorLayer.lock(); + std::string creator_layer_name; + if (creator_layer) { + creator_layer_name = creator_layer->name; + } + auto foundLauncher = getLauncherByLayerName(creator_layer_name); + return foundLauncher->getShapeByName(data->getName()); +} + StatusCode Reshaper::networkShapeInfer(const std::map& inputShapes, ResponseDesc* resp) { if (!network) return DescriptionBuffer(GENERAL_ERROR, resp) << "Cannot infer shapes! Network is not loaded."; - std::vector propagatedLayers; - Network propagatedNetwork(*network); + std::vector propagatedLayers; + Builder::Network propagatedNetwork(*network); // Set new input shapes for (auto& layer : propagatedNetwork) { @@ -164,12 +247,78 @@ StatusCode Reshaper::networkShapeInfer(const std::map& if (layer->getOutputPorts().size() != 1) return DescriptionBuffer(GENERAL_ERROR, resp) << "Cannot infer shapes! 
Input layers can have only one output port."; - layer->getOutputPorts()[0].shape() = inputShapes.find(layer->getName())->second; + layer->getOutputPorts()[0].setShape(inputShapes.find(layer->getName())->second); + } + + std::map> preparedParams; + // Prepare params for split layer + for (auto& layer : propagatedNetwork) { + if ((layer->getType() == "Reshape" || layer->getType() == "Flatten") && + layer->getInputPorts().size() != 2 && !layer->getInputPorts()[0].shape().empty() && + layer->getParameters().find("axis") != layer->getParameters().end() && + (layer->getParameters().find("dim") == layer->getParameters().end() || + layer->getParameters().at("dim").as>().empty())) { + auto inputShape = layer->getInputPorts()[0].shape(); + size_t inputShapeTotal = std::accumulate(inputShape.begin(), inputShape.end(), 1lu, + std::multiplies()); + std::vector dim; + size_t axis = layer->getParameters().at("axis"); + for (size_t i = 0; i < axis; i++) { + dim.emplace_back(inputShape[i]); + inputShapeTotal /= inputShape[i]; + } + if (dim.size() < inputShape.size()) + dim.emplace_back(inputShapeTotal); + layer->getParameters()["dim"] = dim; + } + + std::map params = InferenceEngine::Builder::convertParameters2Strings(layer->getParameters()); + if (layer->getType() == "Split") { + Builder::SplitLayer splitLayer(layer); + std::vector sizes; + size_t axisSize = splitLayer.getInputPort().shape()[splitLayer.getAxis()]; + size_t uninitOuts(0); + for (const auto& port : layer->getOutputPorts()) { + if (port.shape().empty()) { + sizes.push_back(0); + uninitOuts++; + } else if (port.shape().size() <= splitLayer.getAxis()) { + THROW_IE_EXCEPTION << "Incorrect output shapes in Split layer " << layer->getName(); + } else { + sizes.push_back(port.shape()[splitLayer.getAxis()]); + axisSize -= port.shape()[splitLayer.getAxis()]; + } + } + + if ((axisSize && !uninitOuts) || (axisSize && uninitOuts && axisSize % uninitOuts)) + THROW_IE_EXCEPTION << "Incorrect output shapes in Split layer " << layer->getName(); + + size_t commonSize = uninitOuts != 0 ? axisSize / uninitOuts : 0; + for (size_t i = 0; i < sizes.size() && commonSize; i++) { + if (!sizes[i]) + sizes[i] = commonSize; + } + + std::string out_sizes; + for (const auto& size : sizes) { + if (!out_sizes.empty()) + out_sizes += ","; + out_sizes += std::to_string(size); + } + if (!out_sizes.empty()) + params["out_sizes"] = out_sizes; + } + + preparedParams[layer->getId()] = params; } // Try to propagate shapes for (auto& layer : propagatedNetwork) { - const auto impl = ctx.getShapeInferImpl(layer->getType()); + // constant layer does not change during the shape inference and also the Const blob always has C layout and + // doesn't know its real shape, so don't run shape propagation for it + if (details::CaselessEq()(layer->getType(), "Const")) + continue; + const auto impl = network->getContext().getShapeInferImpl(layer->getType()); if (!impl) return DescriptionBuffer(NOT_FOUND, resp) << "Cannot infer shapes! Shape infer implementation was not found for type " << layer->getType() << "."; @@ -178,33 +327,43 @@ StatusCode Reshaper::networkShapeInfer(const std::map& std::map params; std::map blobs; + std::vector inBlobs; for (const auto& inPort : layer->getInputPorts().empty() ? 
    // Try to propagate shapes
    for (auto& layer : propagatedNetwork) {
-        const auto impl = ctx.getShapeInferImpl(layer->getType());
+        // A Const layer does not change during shape inference, and its blob always has C layout
+        // and does not know its real shape, so shape propagation is not run for it
+        if (details::CaselessEq()(layer->getType(), "Const"))
+            continue;
+        const auto impl = network->getContext().getShapeInferImpl(layer->getType());
         if (!impl)
             return DescriptionBuffer(NOT_FOUND, resp)
                     << "Cannot infer shapes! Shape infer implementation was not found for type " << layer->getType() << ".";
@@ -178,33 +327,43 @@ StatusCode Reshaper::networkShapeInfer(const std::map&
         std::map params;
         std::map blobs;
+        std::vector inBlobs;
         for (const auto& inPort : layer->getInputPorts().empty() ? layer->getOutputPorts() : layer->getInputPorts()) {
-            inShapes.push_back(inPort.shape());
-        }
-        if (layer->getParameters()) {
-            for (const auto& it : layer->getParameters()->getParameters()) {
-                params[it.first] = it.second;
-            }
-            for (const auto& it : layer->getParameters()->getConstantData()) {
-                blobs[it.first] = std::const_pointer_cast(it.second);
+            if (inPort.getParameters().find("type") == inPort.getParameters().end()) {
+                inBlobs.push_back(inPort.getData()->getData());
             }
         }
+        params = preparedParams[layer->getId()];
+
+        for (const auto& port : layer->getInputPorts()) {
+            if (port.getParameters().find("type") == port.getParameters().end() ||
+                port.getData()->getData()->cbuffer() == nullptr)
+                continue;
+            blobs[port.getParameters().at("type")] = port.getData()->getData();
+        }
+        for (const auto& it : layer->getParameters()) {
+            if (!it.second.is())
+                continue;
+            blobs[it.first] = std::const_pointer_cast(it.second.as());
+        }

-        StatusCode sts = impl->inferShapes(inShapes, params, blobs, outShapes, resp);
+        StatusCode sts = impl->inferShapes(inBlobs, params, blobs, outShapes, resp);
         if (sts != OK)
             return sts;

         if (outShapes.size() != layer->getOutputPorts().size())
-            return DescriptionBuffer(GENERAL_ERROR, resp) << "Cannot infer shapes! The number of output shapes is not equal the number of output ports.";
+            return DescriptionBuffer(GENERAL_ERROR, resp) << "Cannot infer shapes! The number of output shapes is not "
+                                                             "equal to the number of output ports for layer "
+                                                          << layer->getName();

         for (size_t i = 0; i < outShapes.size(); i++) {
-            layer->getOutputPorts()[i].shape() = outShapes[i];
+            layer->getOutputPorts()[i].setShape(outShapes[i]);
         }
         for (const auto& connection : propagatedNetwork.getLayerConnections(layer->getId())) {
             if (connection.from().layerId() != layer->getId())
                 continue;
             auto nextLayer = propagatedNetwork.getLayer(connection.to().layerId());
-            nextLayer->getInputPorts()[connection.to().portId()].shape() = outShapes[connection.from().portId()];
+            nextLayer->getInputPorts()[connection.to().portId()].setShape(outShapes[connection.from().portId()]);
         }
     }

@@ -212,10 +371,10 @@ StatusCode Reshaper::networkShapeInfer(const std::map&
     for (auto& layer : *network) {
         const auto& propagatedLayer = propagatedNetwork.getLayer(layer->getId());
         for (size_t i = 0; i < layer->getInputPorts().size(); i++) {
-            layer->getInputPorts()[i].shape() = propagatedLayer->getInputPorts()[i].shape();
+            layer->getInputPorts()[i].setShape(propagatedLayer->getInputPorts()[i].shape());
         }
         for (size_t i = 0; i < layer->getOutputPorts().size(); i++) {
-            layer->getOutputPorts()[i].shape() = propagatedLayer->getOutputPorts()[i].shape();
+            layer->getOutputPorts()[i].setShape(propagatedLayer->getOutputPorts()[i].shape());
         }
     }
     return OK;
diff --git a/inference-engine/src/inference_engine/shape_infer/ie_reshaper.hpp b/inference-engine/src/inference_engine/shape_infer/ie_reshaper.hpp
index 4f18507..834abe3 100644
--- a/inference-engine/src/inference_engine/shape_infer/ie_reshaper.hpp
+++ b/inference-engine/src/inference_engine/shape_infer/ie_reshaper.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -13,7 +13,7 @@
 #include
 #include
-#include "../ie_network.hpp"
+#include
 #include "details/caseless.hpp"
 #include "ie_reshape_launcher.hpp"
 #include "ie_icnn_network.hpp"
@@ -60,9 +63,12 @@ public:
      * @param network - const reference to the ICNNNetwork for performing shape inference
      */
     explicit Reshaper(ICNNNetwork& network,
-                     const LauncherCreator::Ptr& creator = std::make_shared());
+                      const LauncherCreator::Ptr& creator = std::make_shared());

-    Reshaper(const Context& context, details::Network::Ptr& network);
+    explicit Reshaper(std::vector inputs,
+                      const LauncherCreator::Ptr& launcherCreator = std::make_shared());
+
+    Reshaper(Builder::Network* network);

     virtual ~Reshaper() = default;

@@ -78,6 +81,25 @@ public:
      * @param inputShapes - Map of input names (data) to their input shapes.
      */
     StatusCode run(const std::map& inputShapes, ResponseDesc* resp = nullptr);
+
+    /**
+     * @brief Perform shape inference for the given input shapes but do not apply it.
+     * On success, call the apply() method to commit the calculated shapes.
+     * @param inputShapes - Map of input names (data) to their input shapes.
+     * @throws an exception if shape inference fails; the original shapes are not corrupted
+     */
+    StatusCode runNoApply(const std::map& inputShapes, ResponseDesc* resp = nullptr);
+
+    /**
+     * @brief Apply the shapes pre-calculated by the runNoApply() method.
+     */
+    StatusCode apply(ResponseDesc* resp = nullptr);
+
+    /**
+     * @brief Return the newly calculated shape for the provided data.
+     */
+    SizeVector getResultShapeFor(DataPtr &data, ResponseDesc* resp = nullptr);
+
 private:
     ReshapeLauncher::Ptr getLauncherByLayerName(const std::string& layerName) const;

@@ -91,8 +113,7 @@ private:
     std::set _inputLayers{};
    InferenceEngine::details::caseless_set _allTypes;

-    Context ctx;
-    details::Network::Ptr network;
+    Builder::Network* network;
 };

 }  // namespace ShapeInfer
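The runNoApply()/apply() pair declared above splits shape inference into a trial step and a commit step, and getResultShapeFor() exposes the trial result before anything is committed. A minimal usage sketch, assuming an already constructed Reshaper bound to a network with an input named "data" (the reshaper and outputData objects are hypothetical, error handling elided):

    // Trial run: compute new shapes without touching the network.
    std::map<std::string, SizeVector> newShapes = {{"data", {1, 3, 416, 416}}};
    ResponseDesc resp;
    if (reshaper->runNoApply(newShapes, &resp) == OK) {
        // Peek at a result before committing anything.
        SizeVector outShape = reshaper->getResultShapeFor(outputData, &resp);
        reshaper->apply(&resp);   // commit the pre-calculated shapes
    }
    // On failure nothing was applied, so the original shapes stay intact.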
diff --git a/inference-engine/src/inference_engine/system_alllocator.cpp b/inference-engine/src/inference_engine/system_alllocator.cpp
index c5e9f45..e075219 100644
--- a/inference-engine/src/inference_engine/system_alllocator.cpp
+++ b/inference-engine/src/inference_engine/system_alllocator.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/src/inference_engine/system_alllocator.hpp b/inference-engine/src/inference_engine/system_alllocator.hpp
index bc49a2b..b5a3cc7 100644
--- a/inference-engine/src/inference_engine/system_alllocator.hpp
+++ b/inference-engine/src/inference_engine/system_alllocator.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/src/inference_engine/transform/transform_network.cpp b/inference-engine/src/inference_engine/transform/transform_network.cpp
new file mode 100644
index 0000000..5f39833
--- /dev/null
+++ b/inference-engine/src/inference_engine/transform/transform_network.cpp
@@ -0,0 +1,353 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace InferenceEngine;
+
+Transform::Port::Port(Builder::Network& network, PortInfo port, bool isInput)
+    : network(network), port(port), input(isInput) {
+    const auto& layer = network.getLayer(port.layerId());
+    if (isInput) {
+        if (layer->getInputPorts().size() <= port.portId())
+            THROW_IE_EXCEPTION << "Cannot find input port "
+                               << port.portId() << " in layer "
+                               << layer->getName();
+    } else {
+        if (layer->getOutputPorts().size() <= port.portId())
+            THROW_IE_EXCEPTION << "Cannot find output port "
+                               << port.portId() << " in layer "
+                               << layer->getName();
+    }
+}
+
+PortData::Ptr Transform::Port::getData() const {
+    return input ?
+           network.getLayer(port.layerId())->getInputPorts()[port.portId()].getData() :
+           network.getLayer(port.layerId())->getOutputPorts()[port.portId()].getData();
+}
+
+const std::map &Transform::Port::getParameters() const {
+    return input ?
+           network.getLayer(port.layerId())->getInputPorts()[port.portId()].getParameters() :
+           network.getLayer(port.layerId())->getOutputPorts()[port.portId()].getParameters();
+}
+
+Transform::Layer Transform::Port::getLayer() const {
+    return Transform::Network(network).getLayer(getPortInfo().layerId());
+}
+
+Transform::Connection Transform::Port::getConnection() const {
+    return Connection(*this);
+}
+
+void Transform::Port::connect(const Port& port) {
+    if (this->input)
+        this->getConnection().setSource(port);
+    else
+        this->getConnection().addDestination(port);
+}
+
+void Transform::Port::disconnect() {
+    getConnection().remove();
+}
+
+const SizeVector& Transform::Port::shape() const {
+    return this->getData()->getData()->getTensorDesc().getDims();
+}
+
+PortInfo Transform::Port::getPortInfo() const {
+    return port;
+}
+
+bool Transform::Port::operator==(const Port& rObj) const {
+    return &network == &rObj.network &&
+           port == rObj.port &&
+           input == rObj.input;
+}
+
+bool Transform::Port::operator!=(const Port& rObj) const {
+    return !(*this == rObj);
+}
+
+
+Transform::Layer::Layer(Builder::Network& network, idx_t id)
+    : network(network), layerId(id) {}
+
+idx_t Transform::Layer::getId() const {
+    return layerId;
+}
+
+std::string Transform::Layer::getName() const {
+    return getLayer()->getName();
+}
+
+std::string Transform::Layer::getType() const {
+    return getLayer()->getType();
+}
+
+Builder::Layer::Ptr Transform::Layer::getLayer() const {
+    return network.getLayer(layerId);
+}
+
+Transform::Layer::operator Builder::Layer::Ptr() const {
+    return getLayer();
+}
+
+Transform::Port Transform::Layer::getInPort() const {
+    if (getLayer()->getInputPorts().size() != 1)
+        THROW_IE_EXCEPTION << "Layer " << getName()
+                           << " does not have exactly 1 input port.";
+    return Transform::Port(network, {layerId, 0}, true);
+}
+
+Transform::Port Transform::Layer::getInPort(idx_t idx) const {
+    if (getLayer()->getInputPorts().size() <= idx)
+        THROW_IE_EXCEPTION << "Layer " << getName()
+                           << " has no input port with index " << idx << ".";
+    return Transform::Port(network, {layerId, idx}, true);
+}
+
+std::vector Transform::Layer::getInPorts() const {
+    std::vector ports;
+    for (size_t i = 0; i < getLayer()->getInputPorts().size(); i++) {
+        ports.push_back({network, {layerId, i}, true});
+    }
+    return ports;
+}
+
+Transform::Port Transform::Layer::getOutPort() const {
+    if (getLayer()->getOutputPorts().size() != 1)
+        THROW_IE_EXCEPTION << "Layer " << getName()
+                           << " does not have exactly 1 output port.";
+    return Transform::Port(network, {layerId, 0}, false);
+}
+
+Transform::Port Transform::Layer::getOutPort(idx_t idx) const {
+    if (getLayer()->getOutputPorts().size() <= idx)
+        THROW_IE_EXCEPTION << "Layer " << getName()
+                           << " has no output port with index " << idx << ".";
+    return Transform::Port(network, {layerId, idx}, false);
+}
+
+std::vector Transform::Layer::getOutPorts() const {
+    std::vector ports;
+    for (size_t i = 0; i < getLayer()->getOutputPorts().size(); i++) {
+        ports.push_back({network, {layerId, i}, false});
+    }
+    return ports;
+}
+
+void Transform::Layer::setParameter(const std::string& key, const Parameter& value) {
+    auto& params = getLayer()->getParameters();
+    params[key] = value;
+}
+
+Parameter& Transform::Layer::getParameter(const std::string& key) const {
+
auto& params = getLayer()->getParameters(); + if (params.find(key) == params.end()) + THROW_IE_EXCEPTION << "Layer " << getName() << " has no parameter " << key; + return params[key]; +} + +Transform::Connection::Connection(const Transform::Port& port) + : network(port.network), inPort({(std::numeric_limits::max)(), (std::numeric_limits::max)()}) { + if (port.input) { + outPorts = {port.getPortInfo()}; + for (const auto& connection : network.getLayerConnections(port.getPortInfo().layerId())) { + if (connection.to() == port.getPortInfo()) { + inPort = connection.from(); + break; + } + } + } else { + inPort = port.getPortInfo(); + for (const auto& connection : network.getLayerConnections(port.getPortInfo().layerId())) { + if (connection.from() == port.getPortInfo()) { + outPorts.emplace_back(connection.to()); + } + } + } +} +Transform::Connection::Connection(Builder::Network& network, const InferenceEngine::Connection& connection) + : Connection(network, connection.from(), connection.to()) {} +Transform::Connection::Connection(Builder::Network& network, const PortInfo& inPort, const PortInfo& outPort) + : Connection(network, inPort, std::vector({outPort})) {} +Transform::Connection::Connection(Builder::Network& network, const PortInfo& inPort, const std::vector& outPorts) + : network(network), inPort(inPort), outPorts(outPorts) {} + +Transform::Port Transform::Connection::getSource() const { + if (!inPortExist()) + THROW_IE_EXCEPTION << "Connection doesn't have source port!"; + return Port(network, inPort, false); +} + +void Transform::Connection::setSource(const Transform::Port &port) { + if (inPortExist()) { + // disconnect old port + for (const auto& outPort : outPorts) { + network.disconnect({inPort, outPort}); + } + } + inPort = port.getPortInfo(); + for (const auto& outPort : outPorts) { + network.connect(inPort, outPort); + } +} + +Transform::Port Transform::Connection::getDestination() const { + if (outPorts.size() != 1) + THROW_IE_EXCEPTION << "Connection has more than 1 output."; + return Transform::Port(network, outPorts[0], true); +} + +Transform::Port Transform::Connection::getDestination(idx_t idx) { + if (outPorts.size() <= idx) + THROW_IE_EXCEPTION << "Connection has less than " + << idx << " input port(s)."; + return Transform::Port(network, outPorts[idx], true); +} + +std::vector Transform::Connection::getDestinations() const { + std::vector ports; + for (const auto& port : outPorts) { + ports.emplace_back(network, port, true); + } + return ports; +} + +void Transform::Connection::addDestination(const Transform::Port &port) { + for (const auto& outPort : outPorts) { + if (outPort == port.getPortInfo()) { + THROW_IE_EXCEPTION << "Cannot connect twice with one port!"; + } + } + outPorts.emplace_back(port.getPortInfo()); + if (!inPortExist()) + return; + network.connect(inPort, outPorts[outPorts.size() - 1]); +} + +void Transform::Connection::setDestination(const Transform::Port &port) { + if (outPorts.size() > 1) { + THROW_IE_EXCEPTION << "Cannot set destination for connection which has more than 1 consumer." 
+ << "Please use addDestination or setDestinations methods!"; + } + + if (!outPorts.empty()) { + if (inPortExist()) + network.disconnect({inPort, outPorts[0]}); + outPorts.clear(); + } + addDestination(port); +} + +void Transform::Connection::setDestinations(const std::vector &ports) { + if (!outPorts.empty() && outPorts.size() != ports.size()) + THROW_IE_EXCEPTION << "Cannot change number of output connections!"; + + if (inPortExist()) { + for (const auto &port : outPorts) { + network.disconnect({inPort, port}); + } + } + outPorts.clear(); + for (const auto &port : ports) { + addDestination(port); + } +} + +void Transform::Connection::remove() { + if (!inPortExist()) + return; + for (const auto& port : outPorts) { + network.disconnect({inPort, port}); + } +} + +bool Transform::Connection::inPortExist() const { + static PortInfo uninitPort((std::numeric_limits::max)(), (std::numeric_limits::max)()); + return inPort != uninitPort; +} + +Transform::Layer Transform::Network::addLayer(const Builder::Layer &layer) { + idx_t layerId = network.addLayer(layer); + return Transform::Layer(network, layerId); +} + +void Transform::Network::removeLayer(const Transform::Layer &layer) { + for (const auto& connection : network.getLayerConnections(layer.getId())) + network.disconnect(connection); + network.removeLayer(layer.getId()); +} + +Transform::Layer Transform::Network::getLayer(const std::string &name) const { + for (const auto& layer : network) { + if (layer->getName() == name) + return Transform::Layer(network, layer->getId()); + } + THROW_IE_EXCEPTION << "Layer with name: " << name << " was not found!"; +} + +Transform::Layer Transform::Network::getLayer(idx_t id) const { + for (const auto& layer : network) { + if (layer->getId() == id) + return Transform::Layer(network, layer->getId()); + } + THROW_IE_EXCEPTION << "Layer with id: " << id << " was not found!"; +} + +Transform::Connection Transform::Network::connect(const Transform::Layer &src, + const Transform::Layer &dst) { + Port srcPort = src.getOutPort(); + Port dstPort = dst.getInPort(); + + network.connect(srcPort.getPortInfo(), dstPort.getPortInfo()); + return Connection(network, srcPort.getPortInfo(), dstPort.getPortInfo()); +} + +Transform::Connection Transform::Network::connect(const Transform::Port &src, + const Transform::Port &dst) { + network.connect(src.getPortInfo(), dst.getPortInfo()); + return Connection(network, src.getPortInfo(), dst.getPortInfo()); +} + +void Transform::Network::disconnect(const Transform::Layer &src, const Transform::Layer &dst) { + getConnection(src, dst).remove(); +} + +void Transform::Network::disconnect(const Transform::Port &src, const Transform::Port &dst) { + getConnection(src, dst).remove(); +} + +Builder::Network& Transform::Network::getBuilderNetwork() const { + return network; +} + +Transform::Connection Transform::Network::getConnection(const Transform::Layer &src, + const Transform::Layer &dst) const { + Port srcPort = src.getOutPort(); + Port dstPort = dst.getInPort(); + + for (const auto& connection : network.getConnections()) { + if (connection.from() == srcPort.getPortInfo() && connection.to() == dstPort.getPortInfo()) + return Connection(network, srcPort.getPortInfo(), dstPort.getPortInfo()); + } + THROW_IE_EXCEPTION << "Connection " << src.getName() << " -> " << dst.getName() << " was not found!"; +} + +Transform::Connection Transform::Network::getConnection(const Transform::Port &src, + const Transform::Port &dst) const { + for (const auto& connection : network.getConnections()) { + if 
(connection.from() == src.getPortInfo() && connection.to() == dst.getPortInfo()) + return Connection(network, src.getPortInfo(), dst.getPortInfo()); + } + THROW_IE_EXCEPTION << "Connection " << getLayer(src.getPortInfo().layerId()).getName() + << " -> " << getLayer(dst.getPortInfo().layerId()).getName() << " was not found!"; +} diff --git a/inference-engine/src/inference_engine/transform/transform_network.hpp b/inference-engine/src/inference_engine/transform/transform_network.hpp new file mode 100644 index 0000000..fc97c28 --- /dev/null +++ b/inference-engine/src/inference_engine/transform/transform_network.hpp @@ -0,0 +1,116 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace InferenceEngine { +namespace Transform { + +class Connection; +class Layer; + +class INFERENCE_ENGINE_API_CLASS(Port) { +public: + Port(Builder::Network& network, PortInfo port, bool isInput); + PortData::Ptr getData() const; + const std::map& getParameters() const; + Layer getLayer() const; + Connection getConnection() const; + void connect(const Port& port); + void disconnect(); + const SizeVector& shape() const; + PortInfo getPortInfo() const; + bool operator==(const Port& rObj) const; + bool operator!=(const Port& rObj) const; + +private: + Builder::Network& network; + PortInfo port; + bool input; + + friend class Connection; +}; + +class INFERENCE_ENGINE_API_CLASS(Layer) { +public: + Layer(Builder::Network& network, idx_t id); + Port getInPort() const; + Port getInPort(idx_t idx) const; + std::vector getInPorts() const; + Port getOutPort() const; + Port getOutPort(idx_t idx) const; + std::vector getOutPorts() const; + + void setParameter(const std::string& key, const Parameter& value); + Parameter& getParameter(const std::string& value) const; + + idx_t getId() const; + std::string getName() const; + std::string getType() const; + operator Builder::Layer::Ptr() const; + +private: + Builder::Network& network; + idx_t layerId; + + Builder::Layer::Ptr getLayer() const; +}; + +class INFERENCE_ENGINE_API_CLASS(Connection) { +public: + explicit Connection(const Port& port); + Connection(Builder::Network& network, const InferenceEngine::Connection& connection); + Connection(Builder::Network& network, const PortInfo& inPort, const PortInfo& outPort); + Connection(Builder::Network& network, const PortInfo& inPort, const std::vector& outPorts); + + Port getSource() const; + void setSource(const Port& port); + Port getDestination() const; + Port getDestination(idx_t idx); + std::vector getDestinations() const; + void addDestination(const Port& port); + void setDestination(const Port& port); + void setDestinations(const std::vector& ports); + void remove(); + +private: + Builder::Network& network; + PortInfo inPort; + std::vector outPorts; + + bool inPortExist() const; +}; + +class INFERENCE_ENGINE_API_CLASS(Network) { +public: + explicit Network(Builder::Network& network): network(network) {} + virtual ~Network() = default; + + Layer addLayer(const Builder::Layer& layer); + void removeLayer(const Layer& layer); + Layer getLayer(const std::string& name) const; + Layer getLayer(idx_t id) const; + + Builder::Network& getBuilderNetwork() const; + + Connection connect(const Layer& src, const Layer& dst); + Connection connect(const Port& src, const Port& dst); + void disconnect(const Layer& src, const Layer& dst); + void disconnect(const Port& src, const Port& dst); + Connection getConnection(const 
Layer& src, const Layer& dst) const; + Connection getConnection(const Port& src, const Port& dst) const; + +private: + Builder::Network& network; +}; + +} // namespace Transform +} // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/transform/transformation.cpp b/inference-engine/src/inference_engine/transform/transformation.cpp new file mode 100644 index 0000000..6f82e98 --- /dev/null +++ b/inference-engine/src/inference_engine/transform/transformation.cpp @@ -0,0 +1,20 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +namespace InferenceEngine { +namespace Transform { + +std::string Transformation::getName() const { + return name; +} + +void Transformation::setName(const std::string& name) { + this->name = name; +} + +} // namespace Transform +} // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/transform/transformation.hpp b/inference-engine/src/inference_engine/transform/transformation.hpp new file mode 100644 index 0000000..790ad48 --- /dev/null +++ b/inference-engine/src/inference_engine/transform/transformation.hpp @@ -0,0 +1,25 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +namespace InferenceEngine { +namespace Transform { + +class Transformation { + std::string name; +public: + std::string getName() const; + void setName(const std::string& name); + virtual ~Transformation() = default; + virtual void execute(Network& network) = 0; +}; + +} // namespace Transform +} // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/transform/transformations/eltwise_broadcast.cpp b/inference-engine/src/inference_engine/transform/transformations/eltwise_broadcast.cpp new file mode 100644 index 0000000..27f5d62 --- /dev/null +++ b/inference-engine/src/inference_engine/transform/transformations/eltwise_broadcast.cpp @@ -0,0 +1,68 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "eltwise_broadcast.hpp" +#include "builders/ie_network_builder.hpp" +#include "builders/ie_reshape_layer.hpp" +#include "builders/ie_tile_layer.hpp" +#include "debug.h" +#include +#include +#include + +namespace InferenceEngine { +namespace Transform { + +TransformationEltwiseBroadcast::TransformationEltwiseBroadcast() { + this->setName("ie.transform.eltwise_broadcast"); +} + +void insertTileOverDimension(Transform::Network& network, Transform::Port& inputPort, size_t axis, size_t tile) { + auto tileLayerBuilder = Builder::TileLayer("Tile" + std::to_string(axis) + "_" + std::to_string(tile)).setAxis(axis).setTiles(tile); + auto tileLayer = network.addLayer(tileLayerBuilder); + inputPort.getConnection().setDestination(tileLayer.getInPort()); + tileLayer.getOutPort().connect(inputPort); +} + +void TransformationEltwiseBroadcast::execute(Network& network) { + for (auto layer : network.getBuilderNetwork()) { + if (layer->getType() == "Eltwise") { + auto eltwiseLayer = network.getLayer(layer->getName()); + auto outShape = eltwiseLayer.getOutPort(0).shape(); + for (auto& eltwiseInPort : eltwiseLayer.getInPorts()) { + auto inShape = eltwiseInPort.shape(); + // if shape lengths are not equal then insert Reshape with shape prepended with ones + if (inShape.size() < outShape.size()) { + std::vector reshapeDims(inShape.begin(), inShape.end()); + reshapeDims.insert(reshapeDims.begin(), outShape.size() - inShape.size(), 1); + auto 
reshapeLayerBuilder = Builder::ReshapeLayer(eltwiseInPort.getLayer().getName() + "/Reshape").setDims(reshapeDims);
+                    auto reshapeLayer = network.addLayer(reshapeLayerBuilder);
+                    eltwiseInPort.getConnection().setDestination(reshapeLayer.getInPort());
+                    reshapeLayer.getOutPort().connect(eltwiseInPort);
+                    SizeVector newOutShape(reshapeDims.size());
+                    // update the shape of the Port
+                    for (size_t ind = 0; ind < reshapeDims.size(); ++ind)
+                        newOutShape[ind] = reshapeDims[ind];
+                    eltwiseInPort.getData()->setShape(newOutShape);
+                    inShape = newOutShape;
+                }
+                for (size_t axis = 0; axis < inShape.size(); ++axis) {
+                    if (inShape[axis] != outShape[axis]) {
+                        if (inShape[axis] != 1) {
+                            THROW_IE_EXCEPTION << "Layer " << layer->getName()
+                                               << " input has invalid shape "
+                                               << details::dumpVec(inShape)
+                                               << " which cannot be broadcast to the output shape "
+                                               << details::dumpVec(outShape);
+                        }
+                        insertTileOverDimension(network, eltwiseInPort, axis, outShape[axis]);
+                    }
+                }
+            }
+        }
+    }
+}
+
+}  // namespace Transform
+}  // namespace InferenceEngine
diff --git a/inference-engine/src/inference_engine/transform/transformations/eltwise_broadcast.hpp b/inference-engine/src/inference_engine/transform/transformations/eltwise_broadcast.hpp
new file mode 100644
index 0000000..863b34a
--- /dev/null
+++ b/inference-engine/src/inference_engine/transform/transformations/eltwise_broadcast.hpp
@@ -0,0 +1,18 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+#pragma once
+
+#include
+
+namespace InferenceEngine {
+namespace Transform {
+
+class TransformationEltwiseBroadcast: public Transformation {
+public:
+    TransformationEltwiseBroadcast();
+    void execute(Network& network) override;
+};
+
+}  // namespace Transform
+}  // namespace InferenceEngine
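TransformationEltwiseBroadcast, shown above, first pads the rank of a narrower Eltwise input with leading ones through a Reshape and then inserts one Tile per mismatching axis. A shape walk for one input with illustrative sizes:

    // Eltwise output shape {2, 3, 4, 5}, one input of shape {3, 1, 5}:
    //   Reshape:               {3, 1, 5}    -> {1, 3, 1, 5}   (prepend ones to equalize rank)
    //   Tile(axis=0, tiles=2): {1, 3, 1, 5} -> {2, 3, 1, 5}
    //   Tile(axis=2, tiles=4): {2, 3, 1, 5} -> {2, 3, 4, 5}
    // A mismatching dimension that is not 1 (say {2, 2, 4, 5}) cannot be broadcast and throws.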
diff --git a/inference-engine/src/inference_engine/transform/transformations/lrn.cpp b/inference-engine/src/inference_engine/transform/transformations/lrn.cpp
new file mode 100644
index 0000000..710a71e
--- /dev/null
+++ b/inference-engine/src/inference_engine/transform/transformations/lrn.cpp
@@ -0,0 +1,63 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "lrn.hpp"
+#include "builders/ie_network_builder.hpp"
+#include "builders/ie_power_layer.hpp"
+#include "builders/ie_eltwise_layer.hpp"
+#include "builders/ie_norm_layer.hpp"
+#include
+#include
+
+namespace InferenceEngine {
+namespace Transform {
+
+TransformationLRN::TransformationLRN() {
+    this->setName("ie.transform.lrn");
+}
+
+void TransformationLRN::execute(Network& network) {
+    for (auto layer : network.getBuilderNetwork()) {
+        if (layer->getType() == "LRN") {
+            auto lrnLayer = network.getLayer(layer->getName());
+            float scale_value = 1.0f / std::pow(static_cast(lrnLayer.getParameter("bias")),
+                                                static_cast(lrnLayer.getParameter("beta")));
+
+            auto normLayerBuilder = Builder::NormLayer(lrnLayer.getName() + "/Norm").
+                    setAlpha(static_cast(lrnLayer.getParameter("alpha")) / static_cast(lrnLayer.getParameter("bias"))).
+                    setSize(static_cast(lrnLayer.getParameter("size"))).
+                    setBeta(static_cast(lrnLayer.getParameter("beta"))).
+                    setAcrossMaps(true);
+            auto normLayer = network.addLayer(normLayerBuilder);
+
+            auto mulLayerBuilder = Builder::EltwiseLayer(lrnLayer.getName() + "/Mul").setEltwiseType(
+                    Builder::EltwiseLayer::EltwiseType::MUL);
+            auto mulLayer = network.addLayer(mulLayerBuilder);
+
+            auto tensorDesc = TensorDesc(Precision::FP32, SizeVector(4, 1), Layout::NCHW);
+            auto blob = make_shared_blob(tensorDesc);
+            blob->allocate();
+            float *buffer = blob->buffer().as::value_type *>();
+            buffer[0] = scale_value;
+
+            auto constLayerBuilder = Builder::ConstLayer(mulLayerBuilder.getName() + "/Const").setData(blob);
+            auto constLayer = network.addLayer(constLayerBuilder);
+
+            // re-connect the input of the LRN layer to the input of the Norm layer
+            lrnLayer.getInPort().getConnection().setDestination(normLayer.getInPort());
+
+            // multiply the output of Norm by the constant
+            mulLayer.getInPort(0).connect(normLayer.getOutPort());
+            mulLayer.getInPort(1).connect(constLayer.getOutPort());
+
+            // connect the consumers of LRN to Mul
+            lrnLayer.getOutPort().getConnection().setSource(mulLayer.getOutPort());
+
+            network.removeLayer(lrnLayer);
+        }
+    }
+}
+
+}  // namespace Transform
+}  // namespace InferenceEngine
diff --git a/inference-engine/src/inference_engine/transform/transformations/lrn.hpp b/inference-engine/src/inference_engine/transform/transformations/lrn.hpp
new file mode 100644
index 0000000..040180a
--- /dev/null
+++ b/inference-engine/src/inference_engine/transform/transformations/lrn.hpp
@@ -0,0 +1,18 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+#pragma once
+
+#include
+
+namespace InferenceEngine {
+namespace Transform {
+
+class TransformationLRN: public Transformation {
+public:
+    TransformationLRN();
+    void execute(Network& network) override;
+};
+
+}  // namespace Transform
+}  // namespace InferenceEngine
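TransformationLRN rests on a simple factorization of the LRN denominator; under the conventional LRN definition, where s is the local sum of squares over a window of length size and b = bias:

    \mathrm{LRN}(x) \;=\; \frac{x}{(b + \alpha s)^{\beta}}
                    \;=\; \frac{1}{b^{\beta}} \cdot \frac{x}{\left(1 + \frac{\alpha}{b}\, s\right)^{\beta}}

This is why the Norm layer above receives alpha/bias as its alpha and the constant blob is filled with scale_value = 1 / bias^beta: Norm followed by the scalar Mul reproduces the original LRN output exactly.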
diff --git a/inference-engine/src/inference_engine/transform/transformations/sub.cpp b/inference-engine/src/inference_engine/transform/transformations/sub.cpp
new file mode 100644
index 0000000..337bb77
--- /dev/null
+++ b/inference-engine/src/inference_engine/transform/transformations/sub.cpp
@@ -0,0 +1,47 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "sub.hpp"
+#include "builders/ie_network_builder.hpp"
+#include "builders/ie_power_layer.hpp"
+#include "builders/ie_eltwise_layer.hpp"
+#include
+#include
+#include
+
+namespace InferenceEngine {
+namespace Transform {
+
+TransformationSub::TransformationSub() {
+    this->setName("ie.transform.sub");
+}
+
+void TransformationSub::execute(Network& network) {
+    for (auto layer : network.getBuilderNetwork()) {
+        if (layer->getType() == "Eltwise" && layer->getParameters()["operation"].as() == "sub") {
+            auto subLayer = network.getLayer(layer->getName());
+
+            auto powerLayerBuilder = Builder::PowerLayer(subLayer.getName() + "/Power").setPower(1.0f).setScale(-1.0f).setShift(0.0f);
+            auto powerLayer = network.addLayer(powerLayerBuilder);
+
+            auto eltwiseLayerBuilder = Builder::EltwiseLayer(subLayer.getName() + "/Add").setEltwiseType(Builder::EltwiseLayer::EltwiseType::SUM);
+            auto eltwiseLayer = network.addLayer(eltwiseLayerBuilder);
+
+            // negate the second input of the sub layer
+            subLayer.getInPort(1).getConnection().setDestination(powerLayer.getInPort());
+
+            // wire up the new two-input eltwise sum
+            subLayer.getInPort(0).getConnection().setDestination(eltwiseLayer.getInPort(0));
+            eltwiseLayer.getInPort(1).connect(powerLayer.getOutPort());
+
+            // re-connect the consumers of the sub layer to the output of the new eltwise
+            subLayer.getOutPort().getConnection().setSource(eltwiseLayer.getOutPort());
+
+            network.removeLayer(subLayer);
+        }
+    }
+}
+
+}  // namespace Transform
+}  // namespace InferenceEngine
diff --git a/inference-engine/src/inference_engine/transform/transformations/sub.hpp b/inference-engine/src/inference_engine/transform/transformations/sub.hpp
new file mode 100644
index 0000000..c67649d
--- /dev/null
+++ b/inference-engine/src/inference_engine/transform/transformations/sub.hpp
@@ -0,0 +1,18 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+#pragma once
+
+#include
+
+namespace InferenceEngine {
+namespace Transform {
+
+class TransformationSub: public Transformation {
+public:
+    TransformationSub();
+    void execute(Network& network) override;
+};
+
+}  // namespace Transform
+}  // namespace InferenceEngine
diff --git a/inference-engine/src/inference_engine/w_dirent.h b/inference-engine/src/inference_engine/w_dirent.h
index e5243db..d100d51 100644
--- a/inference-engine/src/inference_engine/w_dirent.h
+++ b/inference-engine/src/inference_engine/w_dirent.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/src/inference_engine/w_unistd.h b/inference-engine/src/inference_engine/w_unistd.h
index 5064580..18e4d8d 100644
--- a/inference-engine/src/inference_engine/w_unistd.h
+++ b/inference-engine/src/inference_engine/w_unistd.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/src/inference_engine/xml_parse_utils.cpp b/inference-engine/src/inference_engine/xml_parse_utils.cpp
index 82327e8..7e8c5a6 100644
--- a/inference-engine/src/inference_engine/xml_parse_utils.cpp
+++ b/inference-engine/src/inference_engine/xml_parse_utils.cpp
@@ -1,17 +1,26 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //

 #include "xml_parse_utils.h"
 #include "details/ie_exception.hpp"
 #include "ie_precision.hpp"
+#include
+#include

 int XMLParseUtils::GetIntAttr(const pugi::xml_node &node, const char *str) {
     auto attr = node.attribute(str);
     if (attr.empty())
         THROW_IE_EXCEPTION << "node <" << node.name() << "> is missing mandatory attribute: " << str << " at offset "
                            << node.offset_debug();
-    return atoi(attr.value());
+    std::string str_value = std::string(attr.value());
+    std::size_t idx = 0;
+    int int_value = std::stoi(str_value, &idx, 10);
+    if (idx != str_value.length())
+        THROW_IE_EXCEPTION << "node <" << node.name() << "> has attribute \"" << str << "\" = \"" << str_value
+                           << "\" which is not an integer" << " at offset "
+                           << node.offset_debug();
+    return int_value;
 }

 uint64_t XMLParseUtils::GetUInt64Attr(const pugi::xml_node &node, const char *str) {
@@ -19,11 +28,14 @@ uint64_t XMLParseUtils::GetUInt64Attr(const pugi::xml_node &node, const char *st
     if (attr.empty())
         THROW_IE_EXCEPTION << "node <" << node.name() << "> is missing mandatory attribute: " << str << " at offset "
                            << node.offset_debug();
-    int64_t value = atoll(attr.value());
-    if (value < 0)
-        THROW_IE_EXCEPTION << "node <" << node.name() << "> has incorrect parameter: " << str << " at offset "
+    std::string str_value = std::string(attr.value());
+    std::size_t idx = 0;
+    long long int_value = std::stoll(str_value, &idx, 10);
+    if (idx != str_value.length() ||
int_value < 0 || int_value > (std::numeric_limits::max)())
+        THROW_IE_EXCEPTION << "node <" << node.name() << "> has attribute \"" << str << "\" = \"" << str_value
+                           << "\" which is not an unsigned 64 bit integer" << " at offset "
                            << node.offset_debug();
-    return static_cast(value);
+    return static_cast(int_value);
 }

 unsigned int XMLParseUtils::GetUIntAttr(const pugi::xml_node &node, const char *str) {
@@ -31,11 +43,14 @@ unsigned int XMLParseUtils::GetUIntAttr(const pugi::xml_node &node, const char *
     if (attr.empty())
         THROW_IE_EXCEPTION << "node <" << node.name() << "> is missing mandatory attribute: " << str << " at offset "
                            << node.offset_debug();
-    int value = atoi(attr.value());
-    if (value < 0)
-        THROW_IE_EXCEPTION << "node <" << node.name() << "> has incorrect parameter: " << str << " at offset "
+    std::string str_value = std::string(attr.value());
+    std::size_t idx = 0;
+    long long int_value = std::stoll(str_value, &idx, 10);
+    if (idx != str_value.length() || int_value < 0 || int_value > (std::numeric_limits::max)())
+        THROW_IE_EXCEPTION << "node <" << node.name() << "> has attribute \"" << str << "\" = \"" << str_value
+                           << "\" which is not an unsigned integer" << " at offset "
                            << node.offset_debug();
-    return static_cast(value);
+    return static_cast(int_value);
 }

 std::string XMLParseUtils::GetStrAttr(const pugi::xml_node &node, const char *str) {
@@ -57,7 +72,14 @@ float XMLParseUtils::GetFloatAttr(const pugi::xml_node &node, const char *str) {
     if (attr.empty())
         THROW_IE_EXCEPTION << "node <" << node.name() << "> is missing mandatory attribute: " << str << " at offset "
                            << node.offset_debug();
-    return static_cast(atof(attr.value()));
+    std::string str_value = std::string(attr.value());
+    std::size_t idx = 0;
+    float float_value = std::stof(str_value, &idx);
+    if (idx != str_value.length())
+        THROW_IE_EXCEPTION << "node <" << node.name() << "> has attribute \"" << str << "\" = \"" << str_value
+                           << "\" which is not a floating point number" << " at offset "
+                           << node.offset_debug();
+    return float_value;
 }

 InferenceEngine::Precision XMLParseUtils::GetPrecisionAttr(const pugi::xml_node &node, const char *str) {
@@ -78,33 +100,25 @@ InferenceEngine::Precision XMLParseUtils::GetPrecisionAttr(const pugi::xml_node
 int XMLParseUtils::GetIntAttr(const pugi::xml_node &node, const char *str, int defVal) {
     auto attr = node.attribute(str);
     if (attr.empty()) return defVal;
-    return atoi(attr.value());
+    return GetIntAttr(node, str);
 }

 uint64_t XMLParseUtils::GetUInt64Attr(const pugi::xml_node &node, const char *str, uint64_t defVal) {
     auto attr = node.attribute(str);
     if (attr.empty()) return defVal;
-    int64_t value = atoll(attr.value());
-    if (value < 0)
-        THROW_IE_EXCEPTION << "node <" << node.name() << "> has incorrect parameter: " << str << " at offset "
-                           << node.offset_debug();
-    return static_cast(value);
+    return GetUInt64Attr(node, str);
 }

 unsigned int XMLParseUtils::GetUIntAttr(const pugi::xml_node &node, const char *str, unsigned int defVal) {
     auto attr = node.attribute(str);
     if (attr.empty()) return defVal;
-    int value = atoi(attr.value());
-    if (value < 0)
-        THROW_IE_EXCEPTION << "node <" << node.name() << "> has incorrect parameter: " << str << " at offset "
-                           << node.offset_debug();
-    return static_cast(value);
+    return GetUIntAttr(node, str);
 }

 float XMLParseUtils::GetFloatAttr(const pugi::xml_node &node, const char *str, float defVal) {
     auto attr = node.attribute(str);
     if (attr.empty()) return defVal;
-    return static_cast(atof(attr.value()));
+    return GetFloatAttr(node, str);
 }
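All the getters above now share one validation pattern: parse with std::stoi/std::stoll/std::stof, then require that the whole attribute string was consumed, so inputs such as "42abc", or "-1" for an unsigned attribute, are rejected instead of being silently truncated the way atoi/atof did. The pattern in isolation (a standalone sketch, not the patch code):

    #include <stdexcept>
    #include <string>

    // Returns true only when the entire string is a valid base-10 integer.
    static bool parseStrictInt(const std::string& s, int& out) {
        std::size_t idx = 0;
        try {
            out = std::stoi(s, &idx, 10);
        } catch (const std::exception&) {
            return false;            // no digits at all, or out of int range
        }
        return idx == s.length();    // rejects trailing characters such as "42abc"
    }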
int XMLParseUtils::GetIntChild(const pugi::xml_node &node, const char *str, int defVal) {
diff --git a/inference-engine/src/inference_engine/xml_parse_utils.h b/inference-engine/src/inference_engine/xml_parse_utils.h
index 3d2750b..77aa9c7 100644
--- a/inference-engine/src/inference_engine/xml_parse_utils.h
+++ b/inference-engine/src/inference_engine/xml_parse_utils.h
@@ -1,11 +1,10 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //

 #pragma once

 #include
-#include
 #include "pugixml.hpp"
 #include "ie_common.h"
 #include "ie_api.h"
diff --git a/inference-engine/src/mkldnn_plugin/CMakeLists.txt b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
index 5997f7d..df81a5a 100644
--- a/inference-engine/src/mkldnn_plugin/CMakeLists.txt
+++ b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (C) 2018 Intel Corporation
+# Copyright (C) 2018-2019 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -34,7 +34,9 @@ include_directories(
     ${CMAKE_CURRENT_SOURCE_DIR}
     ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn
     ${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/src/common
+    ${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/src/cpu
     ${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/include
+    ${CMAKE_BINARY_DIR}/include/
 )

 if (GEMM STREQUAL "MKL")
@@ -64,3 +66,5 @@ target_compile_definitions(test_${TARGET_NAME} PUBLIC -DMKLDNN_THR=${MKLDNN_THR}
 target_link_libraries(test_${TARGET_NAME} PRIVATE inference_engine_s mkldnn)

 set_target_properties(test_${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME test_${TARGET_NAME})
+
+add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
diff --git a/inference-engine/src/mkldnn_plugin/config.cpp b/inference-engine/src/mkldnn_plugin/config.cpp
index 4ef10ee..cfbe1a8 100644
--- a/inference-engine/src/mkldnn_plugin/config.cpp
+++ b/inference-engine/src/mkldnn_plugin/config.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/src/mkldnn_plugin/config.h b/inference-engine/src/mkldnn_plugin/config.h
index 558ac87..46610bd 100644
--- a/inference-engine/src/mkldnn_plugin/config.h
+++ b/inference-engine/src/mkldnn_plugin/config.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/src/mkldnn_plugin/mean_image.cpp b/inference-engine/src/mkldnn_plugin/mean_image.cpp
index f1ac17e..dcf11ef 100644
--- a/inference-engine/src/mkldnn_plugin/mean_image.cpp
+++ b/inference-engine/src/mkldnn_plugin/mean_image.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -72,13 +72,17 @@ void MeanImage::Load(const MKLDNNDims& inputDims, InputInfo::Ptr inputInfo) {
     }
 }

-void MeanImage::Subtract(const MKLDNNDims &inputDims, float *input) {
+void MeanImage::Subtract(const MKLDNNDims &inputDims, float *input, InferenceEngine::Layout layout) {
     IE_ASSERT(input != nullptr);

     if (inputDims.ndims() != 4) {
         THROW_IE_EXCEPTION << "Expecting input as 4 dimension blob with format NxCxHxW.";
     }

+    if (layout != NCHW && layout != NHWC) {
+        THROW_IE_EXCEPTION << "Expecting input layout NCHW or NHWC.";
+    }
+
     int MB = inputDims[0];
     int srcSize = inputDims.size() / MB;
@@ -92,8 +96,15 @@ void MeanImage::Subtract(const MKLDNNDims &inputDims, float *input) {
     int C = inputDims[1];
     srcSize /= inputDims[1];
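The hunk that follows selects between two indexing schemes for the same NxCxHxW data, depending on the layout argument. For batch element mb, channel c, and pixel index i in [0, H*W), with srcSize = H*W:

    // NCHW: each channel is a contiguous plane
    //   input[mb * C * srcSize + c * srcSize + i] -= meanValues[c];
    // NHWC: channels are interleaved per pixel
    //   input[mb * srcSize * C + i * C + c] -= meanValues[c];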
- parallel_for3d(MB, C, srcSize, [&](int mb, int c, int i) { - input[srcSize * mb * C + c * srcSize + i] -= meanValues[c]; - }); + if (layout == NCHW) { + parallel_for3d(MB, C, srcSize, [&](int mb, int c, int i) { + input[mb * C * srcSize + c * srcSize + i] -= meanValues[c]; + }); + } else if (layout == NHWC) { + parallel_for2d(MB, srcSize, [&](int mb, int i) { + for (int c = 0; c < C; c++) + input[mb * srcSize * C + i * C + c] -= meanValues[c]; + }); + } } } diff --git a/inference-engine/src/mkldnn_plugin/mean_image.h b/inference-engine/src/mkldnn_plugin/mean_image.h index 24dc816..eba0762 100644 --- a/inference-engine/src/mkldnn_plugin/mean_image.h +++ b/inference-engine/src/mkldnn_plugin/mean_image.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -18,16 +18,20 @@ public: public: void Load(const MKLDNNDims& inputDims, InferenceEngine::InputInfo::Ptr inputInfo); - void Subtract(const MKLDNNDims &inputDims, float *input); + void Subtract(const MKLDNNDims &inputDims, float *input, InferenceEngine::Layout layout); template::value>::type* = nullptr> - void Subtract(const MKLDNNDims &inputDims, T *input) { + void Subtract(const MKLDNNDims &inputDims, T *input, InferenceEngine::Layout layout) { IE_ASSERT(input != nullptr); if (inputDims.ndims() != 4) { THROW_IE_EXCEPTION << "Expecting input as 4 dimension blob with format NxCxHxW."; } + if (layout != InferenceEngine::NCHW && layout != InferenceEngine::NHWC) { + THROW_IE_EXCEPTION << "Expecting input layout NCHW or NHWC."; + } + int MB = inputDims[0]; int srcSize = inputDims.size() / MB; @@ -45,13 +49,25 @@ public: int C = inputDims[1]; srcSize /= inputDims[1]; - InferenceEngine::parallel_for3d(MB, C, srcSize, [&](int mb, int c, int i) { - int buf = input[srcSize * mb * C + c * srcSize + i]; - buf -= meanValues[c]; - if (buf < std::numeric_limits::min()) buf = std::numeric_limits::min(); - if (buf > std::numeric_limits::max()) buf = std::numeric_limits::max(); - input[srcSize * mb * C + c * srcSize + i] = buf; - }); + if (layout == InferenceEngine::NCHW) { + InferenceEngine::parallel_for3d(MB, C, srcSize, [&](int mb, int c, int i) { + int buf = input[srcSize * mb * C + c * srcSize + i]; + buf -= meanValues[c]; + if (buf < std::numeric_limits::min()) buf = std::numeric_limits::min(); + if (buf > std::numeric_limits::max()) buf = std::numeric_limits::max(); + input[srcSize * mb * C + c * srcSize + i] = buf; + }); + } else if (layout == InferenceEngine::NHWC) { + InferenceEngine::parallel_for2d(MB, srcSize, [&](int mb, int i) { + for (int c = 0; c < C; c++) { + int buf = input[mb * srcSize * C + i * C + c]; + buf -= meanValues[c]; + if (buf < std::numeric_limits::min()) buf = std::numeric_limits::min(); + if (buf > std::numeric_limits::max()) buf = std::numeric_limits::max(); + input[mb * srcSize * C + i * C + c] = buf; + } + }); + } } } diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_engine.h b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_engine.h index 09ec76c..e80bf95 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_engine.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_engine.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_layer.h b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_layer.h index b3ad3c0..d6b0997 100644 --- 
a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_layer.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_layer.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_tensor.h b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_tensor.h index 616f517..de42f36 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_tensor.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_tensor.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/desc_iterator.hpp b/inference-engine/src/mkldnn_plugin/mkldnn/desc_iterator.hpp index 57b6edc..271bc56 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn/desc_iterator.hpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn/desc_iterator.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.cpp b/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.cpp index ff3616a..34a6296 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.h b/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.h index 45cca04..91d5bba 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.cpp b/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.cpp index 19bc513..c79d6a8 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.h b/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.h index 65cc216..409e55e 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.cpp b/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.cpp index 14c3e1d..735f819 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.h b/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.h index 
dfd69bb..1c4dc3a 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.cpp index ea463a2..bb5d4cc 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.h b/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.h index 447787f..357b43a 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp index bcb4741..63af551 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -53,19 +53,6 @@ MKLDNNDescriptor::operator std::shared_ptr() return typeDesc->getPtr(); } -MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr desc) { - this->desc.reset(new DescFwdImpl(desc)); -} - -MKLDNNDescriptor::operator std::shared_ptr() { - DescFwdImpl *typeDesc = - dynamic_cast *>(desc.get()); - if (typeDesc == nullptr) { - THROW_IE_EXCEPTION << "Cannot cast descriptor!"; - } - return typeDesc->getPtr(); -} - MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr desc, std::shared_ptr prim) { this->desc.reset( @@ -132,19 +119,6 @@ MKLDNNDescriptor::operator std::shared_ptr() { return typeDesc->getPtr(); } -MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr desc) { - this->desc.reset(new DescFwdImpl(desc)); -} - -MKLDNNDescriptor::operator std::shared_ptr() { - DescFwdImpl *typeDesc = - dynamic_cast *>(desc.get()); - if (typeDesc == nullptr) { - THROW_IE_EXCEPTION << "Cannot cast descriptor!"; - } - return typeDesc->getPtr(); -} - MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr desc) { this->desc.reset(new DescFwdImpl(desc)); } @@ -196,3 +170,40 @@ MKLDNNDescriptor::operator std::shared_ptr() { } return typeDesc->getPtr(); } + +MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr desc) { + this->desc.reset(new DescFwdImpl(desc)); +} + +MKLDNNDescriptor::operator std::shared_ptr() { + DescFwdImpl *typeDesc = + dynamic_cast *>(desc.get()); + if (typeDesc == nullptr) { + THROW_IE_EXCEPTION << "Cannot cast descriptor!"; + } + return typeDesc->getPtr(); +} + +MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr desc) { + this->desc.reset(new DescFwdImpl(desc)); +} + +MKLDNNDescriptor::operator std::shared_ptr() { + auto *typeDesc = dynamic_cast *>(desc.get()); + if (typeDesc == nullptr) { + THROW_IE_EXCEPTION << "Cannot cast descriptor!"; + } + return typeDesc->getPtr(); +} + +MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr desc) { + 
this->desc.reset(new DescFwdImpl(desc)); +} + +MKLDNNDescriptor::operator std::shared_ptr() { + auto *typeDesc = dynamic_cast *>(desc.get()); + if (typeDesc == nullptr) { + THROW_IE_EXCEPTION << "Cannot cast descriptor!"; + } + return typeDesc->getPtr(); +} diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h index dff0720..4a78650 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -17,9 +17,6 @@ public: explicit MKLDNNDescriptor(std::shared_ptr desc); operator std::shared_ptr(); - explicit MKLDNNDescriptor(std::shared_ptr desc); - operator std::shared_ptr(); - MKLDNNDescriptor(std::shared_ptr desc, std::shared_ptr prim); operator std::shared_ptr(); @@ -34,9 +31,6 @@ public: explicit MKLDNNDescriptor(std::shared_ptr desc); operator std::shared_ptr(); - explicit MKLDNNDescriptor(std::shared_ptr desc); - operator std::shared_ptr(); - explicit MKLDNNDescriptor(std::shared_ptr desc); operator std::shared_ptr(); @@ -49,6 +43,15 @@ public: explicit MKLDNNDescriptor(std::shared_ptr desc); operator std::shared_ptr(); + explicit MKLDNNDescriptor(std::shared_ptr desc); + operator std::shared_ptr(); + + explicit MKLDNNDescriptor(std::shared_ptr desc); + operator std::shared_ptr(); + + explicit MKLDNNDescriptor(std::shared_ptr desc); + operator std::shared_ptr(); + mkldnn::primitive_desc_iterator createPrimitiveDescriptorIterator(const mkldnn::engine &engine, const mkldnn::primitive_attr &attr = mkldnn::primitive_attr()) const; diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_dims.h b/inference-engine/src/mkldnn_plugin/mkldnn_dims.h index 06616a8..62cb10f 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_dims.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_dims.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -18,18 +18,18 @@ public: MKLDNNDims() = default; explicit MKLDNNDims(const InferenceEngine::SizeVector& size) { - dims = std::vector(size.begin(), size.end()); + dims = std::vector(size.begin(), size.end()); } - explicit MKLDNNDims(const std::vector& dim) { + explicit MKLDNNDims(const std::vector& dim) { dims = dim; } MKLDNNDims(const mkldnn_dims_t dnn_dims, int dnn_ndims) { - dims = std::vector(dnn_dims, dnn_dims + dnn_ndims); + dims = std::vector(dnn_dims, dnn_dims + dnn_ndims); } - explicit MKLDNNDims(std::initializer_list ilist) : dims(ilist) {} + explicit MKLDNNDims(std::initializer_list ilist) : dims(ilist) {} explicit MKLDNNDims(std::initializer_list ilist) : dims(ilist.begin(), ilist.end()) {} InferenceEngine::SizeVector ToSizeVector() const { @@ -45,12 +45,12 @@ public: return dims.size(); } - int size() const { + ptrdiff_t size() const { return size(0); } - int size(int start) const { - int size = 1; + ptrdiff_t size(int start) const { + ptrdiff_t size = 1; for (int i = start; i < dims.size(); i++) { size *= dims[i]; @@ -67,7 +67,7 @@ public: return dims; } - bool operator == (const MKLDNNDims& rhs) { + bool operator == (const MKLDNNDims& rhs) const { if (dims.size() != rhs.dims.size()) { return false; } @@ -75,20 +75,20 @@ public: return std::equal(rhs.dims.begin(), rhs.dims.end(), dims.begin()); } - bool operator != (const MKLDNNDims& rhs) { + bool operator != (const MKLDNNDims& 
rhs) const { return !(*this == rhs); } - int& operator[](int idx) { + ptrdiff_t& operator[](int idx) { return dims[idx]; } - int operator[](int idx) const { + ptrdiff_t operator[](int idx) const { return dims[idx]; } private: - std::vector dims; + std::vector dims; }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp index 92c8c5a..7d13d01 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -8,120 +8,140 @@ #include using namespace mkldnn; -using namespace MKLDNNPlugin; +namespace MKLDNNPlugin { -MKLDNNPlugin::MKLDNNEdge::MKLDNNEdge(const std::shared_ptr &parent, - const std::shared_ptr &child) { - this->parent = parent; - this->child = child; -} +MKLDNNEdge::MKLDNNEdge(const MKLDNNNodePtr &parent, const MKLDNNNodePtr &child, int pr_port, int ch_port) : + parent(parent), child(child), parent_port(pr_port), child_port(ch_port) {} -const std::shared_ptr MKLDNNPlugin::MKLDNNEdge::getParent() const { +const MKLDNNNodePtr MKLDNNEdge::getParent() const { auto parentPtr = parent.lock(); if (!parentPtr) THROW_IE_EXCEPTION << "Edge contains empty parent node"; return parentPtr; } -const std::shared_ptr MKLDNNPlugin::MKLDNNEdge::getChild() const { +const MKLDNNNodePtr MKLDNNEdge::getChild() const { auto childPtr = child.lock(); if (!childPtr) THROW_IE_EXCEPTION << "Edge contains empty child node"; return childPtr; } -bool MKLDNNPlugin::MKLDNNEdge::isDropped() { - return getInputNum() == -1 && getOutputNum() == -1; +bool MKLDNNEdge::isDropped() { + bool not_in_parent = true; + bool not_in_child = true; + + auto parent_ptr = parent.lock(); + if (parent_ptr) { + for (auto &edge : parent_ptr->childEdges) + if (edge.lock().get() == this) + not_in_parent = false; + } + + auto child_ptr = child.lock(); + if (child_ptr) { + for (auto &edge : child_ptr->parentEdges) + if (edge.lock().get() == this) + not_in_child = false; + } + return not_in_parent && not_in_child; } -bool MKLDNNPlugin::MKLDNNEdge::needReorder() { +void MKLDNNEdge::drop() { + auto _drop_from = [&] (std::vector &list) { + auto myself = std::find_if(list.begin(), list.end(), + [&] (MKLDNNEdgeWeakPtr edge) { return edge.lock().get() == this; }); + + if (myself != list.end()) + list.erase(myself); + }; + + _drop_from(getParent()->childEdges); + _drop_from(getChild()->parentEdges); +} + + +bool MKLDNNEdge::needReorder() { bool canBeInPlaceConflicts = false; auto parentSPD = getParent()->getSelectedPrimitiveDescriptor(); auto childSPD = getChild()->getSelectedPrimitiveDescriptor(); if (!parentSPD || !childSPD) THROW_IE_EXCEPTION << "Cannot make a decision about reorder. 
Primitive descriptors weren't selected."; - int inputNum = getInputNum(); + int outNumber = getOutputNum(); + int inNumber = getInputNum(); bool in_place = inPlace(); - if (in_place && !getParent()->getChildEdges().empty()) { - for (size_t i = 0; i < getParent()->getChildEdges().size(); i++) { - if (i == inputNum) + bool childCanChangeMem = childSPD->getConfig().outConfs.empty(); + for (const auto conf : childSPD->getConfig().outConfs) { + if (conf.inPlace == outNumber && outNumber >= 0) + childCanChangeMem = true; + } + + const auto& detectInPlaceChildsNum = [](const std::vector& edges) -> size_t { + size_t count = 0; + for (const auto& edge : edges) { + auto childSPD = edge->getChild()->getSelectedPrimitiveDescriptor(); + int outNumber = edge->getOutputNum(); + if (childSPD->getConfig().outConfs.empty()) + count++; + for (const auto conf : childSPD->getConfig().outConfs) { + if (conf.inPlace == outNumber) + count++; + } + } + return count; + }; + + const auto portChildEdges = getParent()->getChildEdgesAtPort(inNumber); + if (in_place && detectInPlaceChildsNum(portChildEdges) > 1 && childCanChangeMem) + canBeInPlaceConflicts = true; + if (!canBeInPlaceConflicts && in_place && !getParent()->getChildEdges().empty()) { + for (auto &p_edge_peer : portChildEdges) { + if (p_edge_peer.get() == this) continue; - if (getParent()->getChildEdgeAt(i)->getChild()->getType() != Reorder && getParent()->getChildEdgeAt(i)->inPlace(LOOK_DOWN)) + if (p_edge_peer->getChild()->getType() != Reorder && p_edge_peer->inPlace(LOOK_DOWN)) canBeInPlaceConflicts = true; } } if (in_place) { - int outNumber = getOutputNum(); - int inNumber = getInputNum(); if (inNumber >= 0 && inNumber < parentSPD->getConfig().outConfs.size() && parentSPD->getConfig().outConfs[inNumber].inPlace >= 0 && outNumber >= 0 && outNumber < childSPD->getConfig().inConfs.size() && childSPD->getConfig().inConfs[outNumber].inPlace >= 0) canBeInPlaceConflicts = true; } - return !MKLDNNExtensionUtils::initTensorsAreEqual(getInputDesc(), getOutputDesc()) || canBeInPlaceConflicts; + return canBeInPlaceConflicts || !MKLDNNExtensionUtils::initTensorsAreEqual(getInputDesc(), getOutputDesc()); } -InferenceEngine::TensorDesc MKLDNNPlugin::MKLDNNEdge::getInputDesc() { +InferenceEngine::TensorDesc MKLDNNEdge::getInputDesc() { if (inputDesc.getLayout() == InferenceEngine::Layout::ANY) { inputDesc = getSpecifiedInputDesc({}); } return inputDesc; } -InferenceEngine::TensorDesc MKLDNNPlugin::MKLDNNEdge::getOutputDesc() { +InferenceEngine::TensorDesc MKLDNNEdge::getOutputDesc() { if (outputDesc.getLayout() == InferenceEngine::Layout::ANY) { outputDesc = getSpecifiedOutputDesc({}); } return outputDesc; } -InferenceEngine::TensorDesc MKLDNNPlugin::MKLDNNEdge::getDesc() { +InferenceEngine::TensorDesc MKLDNNEdge::getDesc() { if (!MKLDNNExtensionUtils::initTensorsAreEqual(getInputDesc(), getOutputDesc())) THROW_IE_EXCEPTION << "Cannot get descriptor for edge: " << getParent()->getName() << "->" << getChild()->getName(); return getInputDesc(); } -int MKLDNNPlugin::MKLDNNEdge::getInputNum() { - return getAllInputNums()[0]; -} - -std::vector MKLDNNPlugin::MKLDNNEdge::getAllInputNums() { - auto parentPtr = parent.lock(); - if (!parentPtr) - return {-1}; - - std::vector res; - for (size_t i = 0; i < parentPtr->getChildEdges().size(); i++) { - auto childEdge = parentPtr->getChildEdges()[i].lock(); - if (childEdge && childEdge.get() == this) { - res.push_back(static_cast(i)); - } - } - return res.empty() ? 
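The rewritten needReorder above asks two questions: do the producer and consumer agree on the tensor descriptor, and, when memory is shared in place, could two consumers of the same output port both mutate it. detectInPlaceChildsNum counts consumers whose selected primitive descriptor marks the input as in-place; more than one such consumer on a shared buffer forces a Reorder node to break the aliasing. A toy restatement of that counting rule, with Consumer standing in for a child edge's selected config:

    // Simplified model of the in-place conflict test: a port's consumers
    // conflict when more than one of them would write the shared buffer
    // in place, so a Reorder must be inserted to break the sharing.
    #include <cstddef>
    #include <vector>

    struct Consumer {
        bool writesInPlace;  // consumer reuses its input buffer for output
    };

    static size_t countInPlaceConsumers(const std::vector<Consumer>& consumers) {
        size_t count = 0;
        for (const auto& c : consumers)
            if (c.writesInPlace)
                count++;
        return count;
    }

    // Reorder needed: the producer output is shared by several consumers
    // and at least two of them would mutate it in place.
    static bool needReorderForPort(const std::vector<Consumer>& consumers) {
        return countInPlaceConsumers(consumers) > 1;
    }

    int main() {
        std::vector<Consumer> ok  = {{true}, {false}};
        std::vector<Consumer> bad = {{true}, {true}};
        return (!needReorderForPort(ok) && needReorderForPort(bad)) ? 0 : 1;
    }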
std::vector{-1} : res; +int MKLDNNEdge::getInputNum() { + return parent_port; } -int MKLDNNPlugin::MKLDNNEdge::getOutputNum() { - return getAllOutputNums()[0]; +int MKLDNNEdge::getOutputNum() { + return child_port; } -std::vector MKLDNNPlugin::MKLDNNEdge::getAllOutputNums() { - auto childPtr = child.lock(); - if (!childPtr) - return {-1}; - - std::vector res; - for (size_t i = 0; i < childPtr->getParentEdges().size(); i++) { - auto parentEdge = childPtr->getParentEdges()[i].lock(); - if (parentEdge && parentEdge.get() == this) { - res.push_back(static_cast(i)); - } - } - return res.empty() ? std::vector{-1} : res; -} - -void MKLDNNPlugin::MKLDNNEdge::allocate(const void* mem_ptr) { +void MKLDNNEdge::allocate(const void* mem_ptr) { if (status != Status::NeedAllocation) return; @@ -142,7 +162,7 @@ void MKLDNNPlugin::MKLDNNEdge::allocate(const void* mem_ptr) { status = Status::Allocated; } -void MKLDNNPlugin::MKLDNNEdge::changeStatus(MKLDNNPlugin::MKLDNNEdge::Status state) { +void MKLDNNEdge::changeStatus(MKLDNNEdge::Status state) { if (state == Status::NotAllocated) { THROW_IE_EXCEPTION << "Incorrect behaviour! Use method sharedMemFrom()"; } @@ -156,7 +176,7 @@ void MKLDNNPlugin::MKLDNNEdge::changeStatus(MKLDNNPlugin::MKLDNNEdge::Status sta status = state; } -MKLDNNPlugin::MKLDNNDims &MKLDNNPlugin::MKLDNNEdge::getDims() { +const MKLDNNDims& MKLDNNEdge::getDims() { if (!dims.ndims()) { MKLDNNDims outDims; MKLDNNDims inDims; @@ -196,11 +216,7 @@ MKLDNNPlugin::MKLDNNDims &MKLDNNPlugin::MKLDNNEdge::getDims() { return dims; } -void MKLDNNPlugin::MKLDNNEdge::setDims(MKLDNNPlugin::MKLDNNDims &dims) { - this->dims = dims; -} - -bool MKLDNNPlugin::MKLDNNEdge::nodeCanChangeDesc(const std::shared_ptr &node) const { +bool MKLDNNEdge::nodeCanChangeDesc(const MKLDNNNodePtr &node) const { PrimitiveDescInfo * selectedPd = node->getSelectedPrimitiveDescriptor(); if (selectedPd == nullptr) THROW_IE_EXCEPTION << "Primitive descriptor for node " << node->getName() << " is not selected."; @@ -245,7 +261,7 @@ bool MKLDNNPlugin::MKLDNNEdge::nodeCanChangeDesc(const std::shared_ptr {any} or {any} -> {any, any, any} or {any} -> {any} it means that /// layer doesn't change memory format /// We don't support {any, any, nchw} -> {any} -InferenceEngine::TensorDesc MKLDNNPlugin::MKLDNNEdge::getSpecifiedInputDesc(std::map formats) { +InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedInputDesc(std::map formats) { InferenceEngine::TensorDesc inDesc; static int enterCount = 0; enterCount++; @@ -370,7 +386,7 @@ InferenceEngine::TensorDesc MKLDNNPlugin::MKLDNNEdge::getSpecifiedInputDesc(std: return MKLDNNMemoryDesc(getDims(), inDataType, desc); } -InferenceEngine::TensorDesc MKLDNNPlugin::MKLDNNEdge::getSpecifiedOutputDesc(std::map formats) { +InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedOutputDesc(std::map formats) { static int enterCount = 0; enterCount++; InferenceEngine::TensorDesc outDesc; @@ -510,7 +526,7 @@ InferenceEngine::TensorDesc MKLDNNPlugin::MKLDNNEdge::getSpecifiedOutputDesc(std return childPtr->getSelectedPrimitiveDescriptor()->getConfig().outConfs[outputIdx].desc; } -const MKLDNNPlugin::MKLDNNMemory &MKLDNNPlugin::MKLDNNEdge::getMemory() { +const MKLDNNMemory &MKLDNNEdge::getMemory() { if (status == Status::NotAllocated) { memoryPtr.reset(new MKLDNNMemory(getParent()->getEngine())); memoryPtr->Create(MKLDNNMemoryDesc(getDesc()), getSharedEdge()->getMemoryPtr()->GetData()); @@ -521,7 +537,7 @@ const MKLDNNPlugin::MKLDNNMemory &MKLDNNPlugin::MKLDNNEdge::getMemory() { return *memoryPtr; } 
-MKLDNNPlugin::MKLDNNMemoryPtr &MKLDNNPlugin::MKLDNNEdge::getMemoryPtr() { +MKLDNNMemoryPtr &MKLDNNEdge::getMemoryPtr() { if (status == Status::NotAllocated) { memoryPtr.reset(new MKLDNNMemory(getParent()->getEngine())); memoryPtr->Create(MKLDNNMemoryDesc(getDesc()), getSharedEdge()->getMemoryPtr()->GetData()); @@ -545,12 +561,12 @@ InferenceEngine::Blob::Ptr MKLDNNEdge::getBlob() { return make_blob_with_precision(desc, memoryPtr->GetData()); } -void MKLDNNPlugin::MKLDNNEdge::sharedMemFrom(const MKLDNNPlugin::MKLDNNEdgePtr &edge) { +void MKLDNNEdge::sharedMemFrom(const MKLDNNEdgePtr &edge) { memoryFromEdge = edge; status = Status::NotAllocated; } -void MKLDNNPlugin::MKLDNNEdge::validate() { +void MKLDNNEdge::validate() { if (status == Status::Validated) return; getMemory(); @@ -563,7 +579,7 @@ void MKLDNNPlugin::MKLDNNEdge::validate() { status = Status::Validated; } -MKLDNNPlugin::MKLDNNEdgePtr MKLDNNPlugin::MKLDNNEdge::getSharedEdge() const { +MKLDNNEdgePtr MKLDNNEdge::getSharedEdge() const { auto memoryFromEdgePtr = memoryFromEdge.lock(); if (!memoryFromEdgePtr) { THROW_IE_EXCEPTION << "Cannot get memory ptr for edge(" << getParent()->getName() << "->" @@ -578,44 +594,45 @@ void MKLDNNEdge::init() { MKLDNNEdgePtr edgePtr = getBaseEdge(); if (edgePtr.get() == this) { changeStatus(Status::NeedAllocation); - if (getInputNum() > 0 && getParent()->getSelectedPrimitiveDescriptor() && - getParent()->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size() <= getInputNum() && - edgePtr != getParent()->getChildEdgeAt(0)) { - sharedMemFrom(getParent()->getChildEdgeAt(0)); + auto port = getInputNum(); + if (port < 0) + return; + auto edges_at_same_port = getParent()->getChildEdgesAtPort(static_cast(port)); + if (!edges_at_same_port.empty() && + edgePtr != edges_at_same_port[0]) { + sharedMemFrom(edges_at_same_port[0]); } } else { sharedMemFrom(edgePtr); - if (getInputNum() > 0 && getParent()->getSelectedPrimitiveDescriptor() && - getParent()->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size() <= getInputNum() && - edgePtr != getParent()->getChildEdgeAt(0)) { - if (getParent()->getChildEdgeAt(0)->getStatus() != Status::NeedAllocation && - getParent()->getChildEdgeAt(0)->getStatus() != Status::Uninitialized) { - if (getParent()->getChildEdgeAt(0)->getSharedEdge() != edgePtr) + auto port = getInputNum(); + if (port < 0) + return; + auto edges_at_same_port = getParent()->getChildEdgesAtPort(static_cast(port)); + for (auto edge : edges_at_same_port) { + if (edge->getStatus() != Status::NeedAllocation && edge->getStatus() != Status::Uninitialized) { + if (edge->getSharedEdge() != edgePtr) THROW_IE_EXCEPTION << "Unsupported behavior. Cannot mark edge " << getParent()->getChildEdgeAt(0)->getParent()->getName() << "->" << getParent()->getChildEdgeAt(0)->getChild()->getName() << " as not allocated!"; } else { - getParent()->getChildEdgeAt(0)->sharedMemFrom(edgePtr); + if (edge != edgePtr) + edge->sharedMemFrom(edgePtr); } } } } /** - * Should analize graph node dependensies, inplace node information and return root memory(edge) it view on + * Should analyze graph node dependencies, inplace node information and return root memory(edge) it view on * * @param type some magic enum values... 
description needed * @return root of view-on-memory subgraph */ -MKLDNNEdgePtr MKLDNNEdge::getBaseEdge(LOOK look) { +MKLDNNEdgePtr MKLDNNEdge::getBaseEdge(int look) { auto parentConfig = getParent()->getSelectedPrimitiveDescriptor()->getConfig(); auto childConfig = getChild()->getSelectedPrimitiveDescriptor()->getConfig(); int inputNum = getInputNum(); int outputNum = getOutputNum(); - if (inputNum >= parentConfig.outConfs.size()) - inputNum = 0; - if (outputNum >= childConfig.inConfs.size()) - outputNum = 0; if (childConfig.inConfs[outputNum].inPlace >= 0 && parentConfig.outConfs[inputNum].inPlace >= 0) { inputNum = getInputNum(); @@ -623,37 +640,43 @@ MKLDNNEdgePtr MKLDNNEdge::getBaseEdge(LOOK look) { } if (childConfig.inConfs[outputNum].inPlace >= 0 && (look & LOOK_DOWN)) { - int next_edge_ind = childConfig.inConfs[outputNum].inPlace; - if (childConfig.outConfs[next_edge_ind].inPlace >= 0) { - childConfig.outConfs[next_edge_ind].inPlace = -1; + int next_port_idx = childConfig.inConfs[outputNum].inPlace; + if (childConfig.outConfs[next_port_idx].inPlace >= 0) { + childConfig.outConfs[next_port_idx].inPlace = -1; getChild()->initDescriptor(childConfig); } - // this is a WA ... :-( - if (childConfig.outConfs.size() <= getChild()->getChildEdges().size()) { - // Multiple connection to some out port. - // Will try to find implace consumer. - for (int i = 0; i< getChild()->getChildEdges().size(); i++) { - auto chch_edge = getChild()->getChildEdgeAt(i); - auto chch_conf = chch_edge->getChild()->getSelectedPrimitiveDescriptor()->getConfig(); + auto ch_edges = getChild()->getChildEdgesAtPort(next_port_idx); + auto &next_ch_edge = ch_edges[0]; + // Multiple connection to some out port + // Will try to find inplace consumer + for (auto &ch_edge : ch_edges) { + auto &chch_conf = ch_edge->getChild()->getSelectedPrimitiveDescriptor()->getConfig(); - if (chch_conf.inConfs[chch_edge->getOutputNum()].inPlace >= 0) { - next_edge_ind = i; - } - } + if (chch_conf.inConfs[ch_edge->getOutputNum()].inPlace >= 0) + next_ch_edge = ch_edge; } - return getChild()->getChildEdgeAt(next_edge_ind)->getBaseEdge(LOOK_DOWN); + return next_ch_edge->getBaseEdge(LOOK_DOWN); } else if (parentConfig.outConfs[inputNum].inPlace >= 0 && (look & LOOK_UP)) { - if (parentConfig.inConfs[parentConfig.outConfs[inputNum].inPlace].inPlace >= 0) { - parentConfig.inConfs[parentConfig.outConfs[inputNum].inPlace].inPlace = -1; + int next_port_idx = parentConfig.outConfs[inputNum].inPlace; + if (parentConfig.inConfs[next_port_idx].inPlace >= 0) { + parentConfig.inConfs[next_port_idx].inPlace = -1; getParent()->initDescriptor(parentConfig); } - return getParent()->getParentEdgeAt(parentConfig.outConfs[inputNum].inPlace)->getBaseEdge(LOOK_UP); + return getParent()->getParentEdgesAtPort(next_port_idx)[0]->getBaseEdge(LOOK_UP); } - inputNum = getInputNum(); - return getParent()->getChildEdgeAt(inputNum); + auto edges_for_same_port = getParent()->getChildEdgesAtPort(inputNum); + if (!(look & LOOK_NO_RECURRENT)) { + for (auto edge : edges_for_same_port) { + if (edge.get() != this) { + auto base = edge->getBaseEdge(LOOK_BOTH | LOOK_NO_RECURRENT); + if (base != edge) return base; + } + } + } + return edges_for_same_port[0]; } bool MKLDNNEdge::inPlace(LOOK look) { @@ -671,18 +694,12 @@ bool MKLDNNEdge::inPlace(LOOK look) { if (look & LOOK_UP) { if (parentSPD->getConfig().outConfs[inputNum].inPlace >= 0) return true; - for (const auto &inConf : parentSPD->getConfig().inConfs) { - if (inConf.inPlace == inputNum) - return true; - } } if (look & 
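getBaseEdge now takes its traversal mode as a plain int bit mask rather than the LOOK enum, because combinations such as LOOK_BOTH | LOOK_NO_RECURRENT are not themselves enumerators; LOOK_NO_RECURRENT is the new guard that lets an edge consult its siblings on the same port exactly once without the siblings bouncing the query back. The toy model below shows only the flag mechanics; the depth counter is a simplification of the edge-identity check the real code performs:

    // Toy model of the LOOK bit mask: directions combine with '|' and are
    // tested with '&'; LOOK_NO_RECURRENT suppresses the sibling walk that
    // would otherwise recurse indefinitely between edges on one port.
    #include <iostream>

    enum LOOK {
        LOOK_UP   = 1,
        LOOK_DOWN = 2,
        LOOK_BOTH = LOOK_UP | LOOK_DOWN,
        LOOK_NO_RECURRENT = 4
    };

    static void resolve(int look, int depth = 0) {
        if (look & LOOK_DOWN)
            std::cout << "walk towards consumers" << std::endl;
        if (look & LOOK_UP)
            std::cout << "walk towards producers" << std::endl;
        // Visit siblings once, with the guard set, so they don't bounce back.
        if (!(look & LOOK_NO_RECURRENT) && depth == 0)
            resolve(LOOK_BOTH | LOOK_NO_RECURRENT, depth + 1);
    }

    int main() {
        resolve(LOOK_BOTH);  // walks both ways, then siblings exactly once
        return 0;
    }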
LOOK_DOWN) { if (childSPD->getConfig().inConfs[outputNum].inPlace >= 0) return true; - for (const auto &outConf : childSPD->getConfig().outConfs) { - if (outConf.inPlace == inputNum) - return true; - } } return false; } + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_edge.h b/inference-engine/src/mkldnn_plugin/mkldnn_edge.h index f5364f6..759084b 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_edge.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_edge.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -21,6 +21,10 @@ using MKLDNNEdgeWeakPtr = std::weak_ptr; class MKLDNNEdge : public InferenceEngine::details::no_copy { public: + MKLDNNEdge(const std::shared_ptr& parent, + const std::shared_ptr& child, + int pr_port = 0, int ch_port = 0); + enum class Status { Uninitialized, NeedAllocation, @@ -28,9 +32,8 @@ public: Allocated, Validated }; - MKLDNNEdge(const std::shared_ptr& parent, const std::shared_ptr& child); - inline Status getStatus() noexcept { + inline Status getStatus() const noexcept { return status; } @@ -39,26 +42,23 @@ public: virtual void init(); virtual void allocate(const void* mem_ptr = nullptr); virtual void validate(); + void drop(); const std::shared_ptr getParent() const; const std::shared_ptr getChild() const; - bool needReorder(); - InferenceEngine::Blob::Ptr getBlob(); + InferenceEngine::TensorDesc getDesc(); + + const MKLDNNDims &getDims(); const MKLDNNMemory& getMemory(); MKLDNNMemoryPtr& getMemoryPtr(); + bool needReorder(); bool isDropped(); - InferenceEngine::TensorDesc getDesc(); int getInputNum(); int getOutputNum(); - std::vector getAllOutputNums(); - std::vector getAllInputNums(); - - MKLDNNDims &getDims(); - void setDims(MKLDNNDims &dims); void sharedMemFrom(const MKLDNNEdgePtr& edge); MKLDNNEdgePtr getSharedEdge() const; @@ -66,6 +66,9 @@ public: private: std::weak_ptr parent; std::weak_ptr child; + int parent_port; + int child_port; + MKLDNNEdgeWeakPtr memoryFromEdge; MKLDNNDims dims; MKLDNNMemoryPtr memoryPtr; @@ -81,9 +84,9 @@ private: bool nodeCanChangeDesc(const std::shared_ptr& node) const; - enum LOOK { LOOK_UP = 1, LOOK_DOWN = 2, LOOK_BOTH = LOOK_UP | LOOK_DOWN }; + enum LOOK { LOOK_UP = 1, LOOK_DOWN = 2, LOOK_BOTH = LOOK_UP | LOOK_DOWN, LOOK_NO_RECURRENT = 4 }; - MKLDNNEdgePtr getBaseEdge(LOOK look = LOOK_BOTH); + MKLDNNEdgePtr getBaseEdge(int look = LOOK_BOTH); bool inPlace(LOOK look = LOOK_BOTH); friend class MKLDNNGraph; }; diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.cpp index b362433..de757ee 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.h b/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.h index f3abd8b..5481aa1 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp 
b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp index 3600ee5..7b45731 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -21,8 +21,11 @@ uint8_t MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type dataType) return 1; case mkldnn::memory::data_type::u8: return 1; + case mkldnn::memory::data_type::bin: + return 1; case mkldnn::memory::data_type::data_undef: return 0; + default: THROW_IE_EXCEPTION << "Unsupported data type."; } @@ -40,6 +43,8 @@ memory::data_type MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::P return memory::s8; case InferenceEngine::Precision::U8: return memory::u8; + case InferenceEngine::Precision::BIN: + return memory::bin; default: { THROW_IE_EXCEPTION << "The plugin does not support " << prec.name(); @@ -59,6 +64,8 @@ InferenceEngine::Precision MKLDNNExtensionUtils::DataTypeToIEPrecision(memory::d return InferenceEngine::Precision::I8; case memory::u8: return InferenceEngine::Precision::U8; + case memory::bin: + return InferenceEngine::Precision::BIN; default: { THROW_IE_EXCEPTION << "Unsupported data type."; diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h index 8b2994e..358a1e7 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp index 9c079ef..13b8e5f 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -33,6 +33,7 @@ #include #include #include +#include
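The extension-utils hunk threads the new BIN (binarized) precision through all three helpers: it occupies one byte per addressable element for sizing purposes and maps one-to-one onto mkldnn's bin data type, with throwing defaults keeping any unsupported precision loud. A self-contained sketch of that shape; the enums are local stand-ins for InferenceEngine::Precision and mkldnn::memory::data_type:

    #include <cstdint>
    #include <stdexcept>

    enum class Precision { FP32, I8, U8, BIN };
    enum class DataType  { f32, s8, u8, bin, undef };

    // Sketch of the precision-to-data-type mapping with the new BIN entry;
    // a throwing default keeps unsupported precisions loud, not silent.
    static DataType toDataType(Precision p) {
        switch (p) {
            case Precision::FP32: return DataType::f32;
            case Precision::I8:   return DataType::s8;
            case Precision::U8:   return DataType::u8;
            case Precision::BIN:  return DataType::bin;
            default: throw std::runtime_error("The plugin does not support this precision");
        }
    }

    static uint8_t sizeOfDataType(DataType t) {
        switch (t) {
            case DataType::f32: return 4;
            case DataType::s8:
            case DataType::u8:
            case DataType::bin: return 1;   // BIN is addressed per byte
            case DataType::undef: return 0;
            default: throw std::runtime_error("Unsupported data type.");
        }
    }

    int main() {
        return sizeOfDataType(toDataType(Precision::BIN)) == 1 ? 0 : 1;
    }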
#include @@ -49,10 +50,16 @@ #include "utils/blob_dump.h" /***************************************************** - * Dump capability - * Specify path to dump folder in BLOB_DUMP_PATH + * Debug capability + * - BLOB_DUMP_PATH : Specify with existing folder name + * to dump intermediate blobs into it + * - PRINT_GRAPH_INFO : Define it to enable printing + * additional information to std output. + * *****************************************************/ -// #define BLOB_DUMP_PATH "dump" +// #define BLOB_DUMP_PATH "mkldnn_dump" +// #define PRINT_GRAPH_INFO +// #define DUMP_AS_TEXT #ifdef BLOB_DUMP_PATH # define DUMP_DIR BLOB_DUMP_PATH @@ -69,11 +76,15 @@ using namespace InferenceEngine; using namespace InferenceEngine::details; void MKLDNNGraph::CreateGraph(const ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr) { - if (IsReady()) { + if (IsReady()) ForgetGraphData(); - } - // go over the inputs and create input primitives + Replicate(network, extMgr); + InitGraph(); + status = Ready; +} + +void MKLDNNGraph::Replicate(const ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr) { InputsDataMap inputs; network.getInputsInfo(inputs); if (inputs.empty()) { @@ -86,160 +97,84 @@ void MKLDNNGraph::CreateGraph(const ICNNNetwork &network, const MKLDNNExtensionM if (inputLayer) inputLayer->precision = inputLayer->outData[0]->precision; } - for (const auto& input : inputs) { - auto inputLayer = input.second->getInputData()->getCreatorLayer().lock(); - if (!inputLayer) { - // For v1 parser - inputLayer.reset(new CNNLayer({input.second->getInputData()->getName(), - "Input", - input.second->getInputData()->getPrecision()})); - - inputLayer->outData.push_back(input.second->getInputData()); - } - - const MKLDNNNodePtr inputNode = MKLDNNNodePtr(MKLDNNNode::CreateNode(inputLayer, getEngine(), extMgr)); - - graphNodes.push_back(inputNode); - inputNodes[input.first] = inputNode; - std::vector queueLayers; - - for (const auto &layer : input.second->getInputData()->getInputTo()) { - queueLayers.push_back({inputNode, layer.second, 0}); - } + std::unordered_map layer2node; - while (!queueLayers.empty()) { - ParseNode(queueLayers[0].cnnLayer, queueLayers[0].parent, extMgr, queueLayers[0].outIdx, queueLayers); - queueLayers.erase(queueLayers.begin()); - } + auto _parent_port = [] (const DataPtr &data) -> int { + auto parent = data->creatorLayer.lock(); + for (int i = 0; parent->outData.size(); i++) + if (data == parent->outData[i]) + return i; + return -1; + }; - // Loading mean images - MKLDNNDims outDims(inputNode->getChildEdgeAt(0)->getDims()); - if (inputs.find(input.first) != inputs.end()) { - InputInfo::Ptr ii = inputs[input.first]; - if (ii && ii->getPreProcess().getNumberOfChannels()) { - _meanImages[input.first].Load(outDims, ii); - } + // Replicate All Nodes in topological order + for (const auto layer : CNNNetSortTopologically(network)) { + CNNLayerPtr _layer = layer; + if (layer->type == "Memory" && layer->GetParamAsString("index") == "1") { + auto memoryId = layer->GetParamAsString("id"); + _layer.reset(new CNNLayer({layer->name + "/id=" + memoryId, "MemoryInput", layer->precision})); + _layer->params = layer->params; + _layer->outData = layer->outData; } - } - auto allInputs = CNNNetGetAllInputLayers(network); - for (const auto& input : allInputs) { - auto isRealInput = std::find_if(std::begin(inputs), std::end(inputs), [&](InputsDataMap::value_type& inputInfo){ - return inputInfo.second->getInputData()->getName() == input->name; - }); - if (isRealInput != std::end(inputs)) { - 
continue; - } + const MKLDNNNodePtr node(MKLDNNNode::CreateNode(_layer, getEngine(), extMgr)); + graphNodes.push_back(node); + layer2node[layer] = node; - MKLDNNNodePtr inputNode; - CaselessEq eq; + for (int port = 0; port < layer->insData.size(); port++) { + auto data = layer->insData[port].lock(); + auto parent_layer = data->creatorLayer.lock(); + if (!parent_layer) continue; // no parent means that it is input data node (or memory/const layer) - if (eq(input->type, "Memory")) { - auto memoryId = input->GetParamAsString("id"); - CNNLayerPtr layer(new CNNLayer({input->name + "/id=" + memoryId, "MemoryInput", input->precision})); - layer->params = input->params; - layer->outData = input->outData; + auto parent_node = layer2node[parent_layer]; - inputNode = MKLDNNNodePtr(MKLDNNNode::CreateNode(layer, getEngine(), extMgr)); - } else if (eq(input->type, "Const")) { - inputNode = MKLDNNNodePtr(MKLDNNNode::CreateNode(input, getEngine(), extMgr)); + MKLDNNEdgePtr edge(new MKLDNNEdge(parent_node, node, _parent_port(data), port)); + node->addEdge(edge); + graphEdges.push_back(edge); } - graphNodes.push_back(inputNode); + } - std::vector queueLayers; - size_t count_out = 0; - for (auto &&outData : input->outData) { - for (auto &&layer : outData->getInputTo()) { - queueLayers.push_back({inputNode, layer.second, count_out}); - } - count_out++; - } + std::map outputs; + network.getOutputsInfo(outputs); - while (!queueLayers.empty()) { - ParseNode(queueLayers[0].cnnLayer, queueLayers[0].parent, extMgr, queueLayers[0].outIdx, queueLayers); - queueLayers.erase(queueLayers.begin()); - } - } + for (const auto &output : outputs) { + const auto data = output.second; - std::map output; - network.getOutputsInfo(output); - - for (auto it = output.begin(); it != output.end(); ++it) { - const DataPtr& outputDataPtr = it->second; - - MKLDNNNodePtr node = FindNodeWithName(outputDataPtr->getCreatorLayer().lock()->name); - if (!node) - THROW_IE_EXCEPTION << "Cannot find output layer " << outputDataPtr->getCreatorLayer().lock()->name; - - const std::string name = "out_" + it->first; - - CNNLayerPtr layer(new CNNLayer({name, "Output", outputDataPtr->getCreatorLayer().lock()->outData[0]->getPrecision()})); - layer->insData.push_back(outputDataPtr); - MKLDNNNodePtr outputLayer(new MKLDNNInputNode(layer, getEngine())); - MKLDNNEdgePtr edgePtr(new MKLDNNEdge(node, outputLayer)); - graphEdges.push_back(edgePtr); - - const std::vector& childEdges = node->getChildEdges(); - size_t insertBeforeChildEdgeIndex = childEdges.size(); - if (!childEdges.empty()) { - bool outputDataIndexWasFound = false; - size_t outputDataIndex = 0; - for (size_t i = 0; i < node->getCnnLayer()->outData.size(); ++i) { - const DataPtr& otherOutputDataPtr = node->getCnnLayer()->outData[i]; - if (otherOutputDataPtr->name == it->first) { - outputDataIndexWasFound = true; - outputDataIndex = i; - } - } - IE_ASSERT(outputDataIndexWasFound) << "Node " << node->getName() << " doesn't have output data '" << it->first << "'"; + auto parent_layer = data->creatorLayer.lock(); + auto parent_node = layer2node[parent_layer]; - std::unordered_map nodeOutputDataIndexByData; - const CNNLayerPtr& nodeLayer = node->getCnnLayer(); - for (size_t dataIndex = 0; dataIndex < nodeLayer->outData.size(); ++dataIndex) { - nodeOutputDataIndexByData.emplace(nodeLayer->outData[dataIndex].get(), dataIndex); - } + CNNLayerPtr layer(new CNNLayer({"out_" + output.first, "Output", data->precision})); + layer->insData.push_back(data); - auto getOutputDataIndex = [&](const MKLDNNEdgePtr& 
childEdge) -> size_t { - const InferenceEngine::CNNLayerPtr& childNodeLayer = childEdge->getChild()->getCnnLayer(); - for (const DataWeakPtr& childNodeInsertWeakData : childNodeLayer->insData) { - const DataPtr childNodeInsertData = childNodeInsertWeakData.lock(); - if (!childNodeInsertData) { - continue; - } + const MKLDNNNodePtr node(MKLDNNNode::CreateNode(layer, getEngine(), extMgr)); - const auto indexIt = nodeOutputDataIndexByData.find(childNodeInsertData.get()); - if (indexIt != nodeOutputDataIndexByData.end()) { - return indexIt->second; - } - } + MKLDNNEdgePtr edge(new MKLDNNEdge(parent_node, node, _parent_port(data), 0)); + node->addEdge(edge); + graphEdges.push_back(edge); - IE_ASSERT(false) << "Node has child edge without insert data"; - }; + graphNodes.push_back(node); + outputNodes.push_back(node); + layer2node[layer] = node; + } - for (size_t childEdgeIndex = 0; childEdgeIndex < childEdges.size(); ++childEdgeIndex) { - const MKLDNNEdgePtr childEdge = childEdges[childEdgeIndex].lock(); - if (!childEdge) { - continue; - } + // Replicate input nodes + for (const auto& input : inputs) { + auto inputLayer = input.second->getInputData()->getCreatorLayer().lock(); + inputNodes[input.first] = layer2node[inputLayer]; - const size_t edgeOutputDataIndex = getOutputDataIndex(childEdge); - if (outputDataIndex < edgeOutputDataIndex) { - insertBeforeChildEdgeIndex = childEdgeIndex; - break; - } + // Loading mean images + MKLDNNDims outDims(inputNodes[input.first]->getChildEdgeAt(0)->getDims()); + if (inputs.find(input.first) != inputs.end()) { + InputInfo::Ptr ii = inputs[input.first]; + if (ii && ii->getPreProcess().getNumberOfChannels()) { + _meanImages[input.first].Load(outDims, ii); } } - - if (insertBeforeChildEdgeIndex < childEdges.size()) { - outputLayer->addEdge(edgePtr, 0, insertBeforeChildEdgeIndex, true); - } else { - outputLayer->addEdge(edgePtr, 0, node->getChildEdges().size()); - } - - graphNodes.push_back(outputLayer); - outputNodes.push_back(outputLayer); } +} +void MKLDNNGraph::InitGraph() { + SortTopologically(); MKLDNNGraphOptimizer optimizer; optimizer.ApplyCommonGraphOptimizations(*this); SortTopologically(); @@ -259,37 +194,47 @@ void MKLDNNGraph::CreateGraph(const ICNNNetwork &network, const MKLDNNExtensionM CreatePrimitives(); - // Will do it before cleanup. Because it will lose original layers information - if (!config.dumpToDot.empty()) dumpToDotFile(config.dumpToDot + "_init.dot"); + // Do it before cleanup. 
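The Replicate pass above is the heart of the CreateGraph split: a single walk over the topologically sorted layers creates one node per layer, and every input connection becomes an edge carrying explicit (parent_port, child_port) indices, with the producer found through a layer-to-node map and its output port recovered by scanning outData (the _parent_port lambda). A condensed model of that loop, with the port lookup bounds-checked; Layer, Node, and Edge are simplified stand-ins for CNNLayer, MKLDNNNode, and MKLDNNEdge:

    // Condensed sketch of topological replication: one pass, a layer->node
    // map for parents, and explicit (parent_port, child_port) on every edge.
    #include <memory>
    #include <unordered_map>
    #include <vector>

    struct Layer {
        std::vector<Layer*> inputs;   // producer layer per input port
        std::vector<int>    inPorts;  // producer's output port per input
    };

    struct Node;
    struct Edge {
        Node* parent; Node* child;
        int parent_port; int child_port;
    };

    struct Node {
        std::vector<std::shared_ptr<Edge>> parentEdges, childEdges;
    };

    // 'layers' must already be topologically sorted, as
    // CNNNetSortTopologically guarantees in the real code.
    static std::vector<std::shared_ptr<Node>>
    replicate(const std::vector<Layer*>& layers,
              std::vector<std::shared_ptr<Edge>>& graphEdges) {
        std::unordered_map<const Layer*, std::shared_ptr<Node>> layer2node;
        std::vector<std::shared_ptr<Node>> nodes;
        for (const Layer* l : layers) {
            auto node = std::make_shared<Node>();
            nodes.push_back(node);
            layer2node[l] = node;
            for (size_t port = 0; port < l->inputs.size(); port++) {
                // Producer was created earlier thanks to topological order.
                auto parent = layer2node.at(l->inputs[port]);
                auto edge = std::make_shared<Edge>(
                    Edge{parent.get(), node.get(), l->inPorts[port], (int)port});
                parent->childEdges.push_back(edge);
                node->parentEdges.push_back(edge);
                graphEdges.push_back(edge);
            }
        }
        return nodes;
    }

    int main() {
        Layer a{{}, {}}, b{{&a}, {0}};
        std::vector<std::shared_ptr<Edge>> edges;
        auto nodes = replicate({&a, &b}, edges);
        return edges.size() == 1 ? 0 : 1;
    }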
Because it will lose original layers information + for (auto &graphNode : graphNodes) { + auto nodeType = graphNode->getType(); + if (nodeType == Reorder || nodeType == Output) continue; + + graphNode->addOriginalLayer(graphNode->getCnnLayer()); + if (graphNode->getFusedWith().size() || graphNode->getMergeWith().size()) { + // Original layer names + std::vector internal = graphNode->getFusedWith(); + auto &merged = graphNode->getMergeWith(); + internal.insert(internal.end(), merged.begin(), merged.end()); + + for (auto &sub_node : internal) { + graphNode->addOriginalLayer(sub_node->getCnnLayer()); + } + } + } + if (!config.dumpToDot.empty()) + dumpToDotFile(config.dumpToDot + "_init.dot"); for (auto &graphNode : graphNodes) { graphNode->cleanup(); } +#if !defined(NDEBUG) && defined(PRINT_GRAPH_INFO) for (auto &graphNode : graphNodes) { -#ifndef NDEBUG std::cout << "name: " << graphNode->getName() << " [ "; -#endif if (graphNode->parentEdges.size() > 0) { - auto prnt = graphNode->parentEdges[0].lock(); -#ifndef NDEBUG - std::cout << "in: " << prnt->getOutputDesc().getPrecision().name() << "/l=" - << prnt->getOutputDesc().getLayout() + auto prnt_out_desc = graphNode->parentEdges[0].lock()->getOutputDesc(); + std::cout << "in: " << prnt_out_desc.getPrecision().name() + << "/l=" << prnt_out_desc.getLayout() << "; "; -#endif } if (graphNode->childEdges.size() > 0) { - auto chld = graphNode->childEdges[0].lock(); -#ifndef NDEBUG - std::cout << "out: " << chld->getInputDesc().getPrecision().name() << "/l=" - << chld->getInputDesc().getLayout(); -#endif + auto chld_in_desc = graphNode->childEdges[0].lock()->getInputDesc(); + std::cout << "out: " << chld_in_desc.getPrecision().name() + << "/l=" << chld_in_desc.getLayout(); } -#ifndef NDEBUG std::cout << " ]" << std::endl; -#endif } - +#endif mkldnn::stream stream = mkldnn::stream(stream::kind::eager); for (auto &graphNode : graphNodes) { @@ -297,101 +242,6 @@ void MKLDNNGraph::CreateGraph(const ICNNNetwork &network, const MKLDNNExtensionM continue; graphNode->execute(stream); } - - status = Ready; -} - -void MKLDNNGraph::ParseNode(const CNNLayerPtr& cnnLayer, MKLDNNNodePtr& parent, - const MKLDNNExtensionManager::Ptr& extMgr, size_t outIdx, - std::vector& queuelayers) { - if (cnnLayer->precision != Precision::FP32 && - cnnLayer->precision != Precision::I8 && - cnnLayer->precision != Precision::U8) { - THROW_IE_EXCEPTION << "The plugin does not support " << cnnLayer->precision; - } - - MKLDNNNodePtr node = FindNodeWithName(cnnLayer->name); - bool exists = false; - if (node) { - exists = true; - } else { - node.reset(MKLDNNNode::CreateNode(cnnLayer, getEngine(), extMgr)); - } - - if (parent) { - MKLDNNEdgePtr edgePtr; - size_t shift = 0; - if (outIdx >= parent->getChildEdges().size() || !parent->getChildEdges()[outIdx].lock()) { - edgePtr.reset(new MKLDNNEdge(parent, node)); - graphEdges.push_back(edgePtr); - } else { - edgePtr = parent->getChildEdgeAt(outIdx); - if (edgePtr->getChild() != node) { - edgePtr.reset(new MKLDNNEdge(parent, node)); - graphEdges.push_back(edgePtr); - shift = parent->getChildEdges().size(); - } - } - - - size_t pIndex = node->getParentEdges().size(); - if (parent->getCnnLayer() != nullptr) { - for (size_t idx = 0; idx < cnnLayer->insData.size(); idx++) { - auto cnnLayerIN = cnnLayer->insData[idx].lock(); - if (cnnLayerIN && - parent->getCnnLayer()->outData.size() > outIdx && - cnnLayerIN.get() == parent->getCnnLayer()->outData[outIdx].get()) { - pIndex = idx; - break; - } - } - node->addEdge(edgePtr, pIndex, outIdx + 
shift); - if (cnnLayer->insData.size() > 1) { - for (size_t idx = 1; idx < cnnLayer->insData.size(); idx++) { - if (cnnLayer->insData[idx].lock() == cnnLayer->insData[idx - 1].lock()) { - node->addEdge(edgePtr, pIndex + idx, outIdx + shift + idx); - } - } - } - } else { - for (size_t idx = 0; idx < cnnLayer->insData.size(); idx++) { - if (cnnLayer->insData[idx].lock()->getName() == parent->getName()) { - pIndex = static_cast(idx); - break; - } - } - node->addEdge(edgePtr, pIndex, outIdx + shift); - } - } - - if (exists) - return; - - if (cnnLayer->blobs.find("ext-scale") != cnnLayer->blobs.end()) - node->ext_scales = cnnLayer->blobs["ext-scale"]; - - graphNodes.push_back(node); - - size_t count_out = 0; - std::vector remaining; - for (const auto &layer : cnnLayer->outData) { - bool first = true; - for (const auto &data : layer->getInputTo()) { - if (first) { - queuelayers.push_back({node, data.second, count_out}); - first = false; - } else { - // TODO: Just to hide bug with port ordering. - // At first step we visit only first connection - // at port. As second we will visit all remaining. - // - // Not first connection to the port are stored here - remaining.push_back({node, data.second, count_out}); - } - } - count_out++; - } - queuelayers.insert(queuelayers.end(), remaining.begin(), remaining.end()); } void MKLDNNGraph::InitNodes() { @@ -427,44 +277,44 @@ void MKLDNNGraph::InitEdges() { size_t numberOfEdges = graphEdges.size(); for (auto i = 0; i < numberOfEdges; i++) { if (graphEdges[i]->needReorder()) { - std::string layerName = graphEdges[i]->getParent()->getName() + "_" + - reorderArgs(graphEdges[i]->getInputDesc(), graphEdges[i]->getOutputDesc()) + "_" + - graphEdges[i]->getChild()->getName(); + auto &edge = graphEdges[i]; + std::string layerName = edge->getParent()->getName() + "_" + + reorderArgs(edge->getInputDesc(), edge->getOutputDesc()) + "_" + + edge->getChild()->getName(); CNNLayerPtr layer(new CNNLayer({layerName, "Reorder", - graphEdges[i]->getInputDesc().getPrecision()})); + edge->getInputDesc().getPrecision()})); MKLDNNNodePtr newReorder(new MKLDNNReorderNode(layer, getEngine())); auto *reorderPtr = dynamic_cast(newReorder.get()); if (reorderPtr) { - reorderPtr->setDescs(graphEdges[i]->getInputDesc(), graphEdges[i]->getOutputDesc()); + reorderPtr->setDescs(edge->getInputDesc(), edge->getOutputDesc()); } - MKLDNNEdgePtr beforeNode(new MKLDNNEdge(graphEdges[i]->getParent(), newReorder)); - beforeNode->setDims(graphEdges[i]->getDims()); - MKLDNNEdgePtr afterNode(new MKLDNNEdge(newReorder, graphEdges[i]->getChild())); - afterNode->setDims(graphEdges[i]->getDims()); - - auto oIndexes = graphEdges[i]->getAllOutputNums(); - auto iIndexes = graphEdges[i]->getAllInputNums(); - if (iIndexes[0] < 0 || oIndexes[0] < 0) + + auto oIndex = edge->getOutputNum(); + auto iIndex = edge->getInputNum(); + if (iIndex < 0 || oIndex < 0) THROW_IE_EXCEPTION << "Cannot create reorder for nodes: " - << graphEdges[i]->getParent()->getName() << " and " - << graphEdges[i]->getChild()->getName() << "."; + << edge->getParent()->getName() << " and " + << edge->getChild()->getName() << "."; + + edge->drop(); + + MKLDNNEdgePtr beforeNode(new MKLDNNEdge(edge->getParent(), newReorder, iIndex, 0)); + MKLDNNEdgePtr afterNode(new MKLDNNEdge(newReorder, edge->getChild(), 0, oIndex)); // Add edge for beforeNode beforeNode->getChild()->parentEdges.push_back(beforeNode); - for (auto iIndex : iIndexes) graphEdges[i]->getParent()->childEdges[iIndex] = beforeNode; + 
edge->getParent()->childEdges.push_back(beforeNode); // Add edge for afterNode afterNode->getParent()->childEdges.push_back(afterNode); - for (auto oIndex : oIndexes) graphEdges[i]->getChild()->parentEdges[oIndex] = afterNode; + edge->getChild()->parentEdges.push_back(afterNode); newReorder->getSupportedDescriptors(); newReorder->initSupportedPrimitiveDescriptors(); newReorder->selectOptimalPrimitiveDescriptor(); - beforeNode->getDesc(); graphEdges.push_back(beforeNode); - afterNode->getDesc(); graphEdges.push_back(afterNode); graphNodes.push_back(newReorder); @@ -492,14 +342,15 @@ void MKLDNNGraph::AllocateWithReuse() { for (auto &claster : edge_clasters) { for (auto &element : claster) { if (element == par) { - claster.push_back(edge); + if (std::find(claster.begin(), claster.end(), edge) == claster.end()) + claster.push_back(edge); found = true; break; } } } - if (!found) edge_clasters.push_back({par, edge}); - + if (!found) + edge_clasters.push_back({par, edge}); } else { bool found = false; for (auto &claster : edge_clasters) { @@ -510,7 +361,8 @@ void MKLDNNGraph::AllocateWithReuse() { } } } - if (!found) edge_clasters.push_back({edge}); + if (!found) + edge_clasters.push_back({edge}); } } @@ -535,17 +387,17 @@ void MKLDNNGraph::AllocateWithReuse() { // remove duplicates in merged claster std::sort(base_classter->begin(), base_classter->end()); base_classter->erase(std::unique(base_classter->begin(), base_classter->end()), - base_classter->end() ); + base_classter->end() ); // remove empty clasters edge_clasters.erase(std::remove_if(edge_clasters.begin(), edge_clasters.end(), - [] ( std::vector &cls) { return cls.empty(); }), - edge_clasters.end()); + [] ( std::vector &cls) { return cls.empty(); }), + edge_clasters.end()); } } //======= End of WA ============ - const int alignment = 16; // 64 bytes or 16 floats + const int64_t alignment = 32; // 32 bytes std::vector boxes(edge_clasters.size()); for (int i = 0; i < edge_clasters.size(); i++) { @@ -557,10 +409,12 @@ void MKLDNNGraph::AllocateWithReuse() { const BlockingDesc block_desk = edge->getDesc().getBlockingDesc(); - int e_size = block_desk.getOffsetPadding() + 1; // size in elements (from begin of data to last element) + int64_t e_size = block_desk.getOffsetPadding() + 1; // size in bytes (from begin of data to last element) for (int j = 0; j < block_desk.getBlockDims().size(); j++) e_size += (block_desk.getBlockDims()[j] - 1) * block_desk.getStrides()[j]; + e_size *= edge->getDesc().getPrecision() == Precision::BIN ? 
1 : edge->getDesc().getPrecision().size(); + box.start = std::min(e_start, box.start); box.finish = std::max(e_finish, box.finish); box.size = std::max(e_size, box.size); @@ -587,20 +441,20 @@ void MKLDNNGraph::AllocateWithReuse() { } MemorySolver memSolver(boxes); - size_t total_size = memSolver.solve() * alignment; + size_t total_size = static_cast(memSolver.solve()) * alignment; - memWorkspace.reset(new MKLDNNMemory(eng)); - memWorkspace->Create(MKLDNNMemoryDesc(TensorDesc(Precision::FP32, {total_size}, Layout::C))); - float* workspace_ptr = static_cast(memWorkspace->GetData()); + memWorkspace = std::make_shared(eng); + memWorkspace->Create(MKLDNNMemoryDesc(TensorDesc(Precision::I8, {total_size}, Layout::C))); + auto* workspace_ptr = static_cast(memWorkspace->GetData()); for (int i = 0; i < edge_clasters.size(); i++) { int count = 0; for (auto &edge : edge_clasters[i]) { if (edge->getStatus() == MKLDNNEdge::Status::NeedAllocation) { - int offset = memSolver.getOffset(i); + int64_t offset = memSolver.getOffset(i); // !! Fallback to individual memory allocation !! // if you like to check infer without reuse just call this function without arguments. - edge->allocate(workspace_ptr + offset * alignment); // alignment in float + edge->allocate(workspace_ptr + offset * alignment); // alignment in byte count++; } } @@ -653,7 +507,7 @@ void MKLDNNGraph::PushInputData(const std::string& name, const InferenceEngine:: // todo: make sure 'name' exists in this map... if (_meanImages.find(name) != _meanImages.end()) { if (in->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32) { - _meanImages[name].Subtract(outDims, reinterpret_cast(inter_data_ptr)); + _meanImages[name].Subtract(outDims, reinterpret_cast(inter_data_ptr), in->getTensorDesc().getLayout()); } else { THROW_IE_EXCEPTION << "Mean image of type " << in->getTensorDesc().getPrecision().name() << " is unsupported"; } @@ -734,20 +588,6 @@ void MKLDNNGraph::Infer(int batch) { } } -MKLDNNNodePtr MKLDNNGraph::FindNodeWithName(const std::string& name) const { - if (inputNodes.empty()) { - return std::shared_ptr(); - } - - const auto children = graphNodes; - const auto node = std::find_if(children.begin(), children.end(), - [&name](MKLDNNNodePtr const& item) { - return item->getName() == name; - }); - - return (node == children.end() ? std::shared_ptr() : *node); -} - void MKLDNNGraph::VisitNode(MKLDNNNodePtr node, std::vector& sortedNodes) { if (node->temporary) { return; @@ -793,12 +633,51 @@ void MKLDNNGraph::SortTopologically() { graphNodes.erase(graphNodes.begin(), graphNodes.end()); graphNodes.assign(sorted.begin(), sorted.end()); + + // TODO: Sort in/out edges by port index because of backward compatibility + // A lot of plugin logic are build on top of assumption that index in + // vector childEdges/parentEdges is port number. But that is not + // truth anymore. But to keep old logic correct need to simulate ordering. 
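AllocateWithReuse now sizes everything in bytes rather than floats: an edge's extent is the blocking desc's offset padding plus the strided span of its block dims, scaled by the precision's element size (one byte for BIN), and the solved offsets land in an I8 workspace with 32-byte alignment. A sketch of just the size computation, with BlockingDesc reduced to the three fields the formula touches:

    // Sketch of the per-edge byte-size computation used to build the
    // MemorySolver boxes; BlockingDesc fields are simplified stand-ins.
    #include <cstdint>
    #include <vector>

    struct BlockingDesc {
        int64_t offsetPadding;
        std::vector<int64_t> blockDims;
        std::vector<int64_t> strides;
    };

    // Span in elements from the start of the buffer to one past the last
    // element, then scaled to bytes by the element size (1 byte for BIN).
    static int64_t edgeByteSize(const BlockingDesc& d, int64_t elemSize) {
        int64_t e_size = d.offsetPadding + 1;
        for (size_t j = 0; j < d.blockDims.size(); j++)
            e_size += (d.blockDims[j] - 1) * d.strides[j];
        return e_size * elemSize;
    }

    int main() {
        // A dense 2x3 FP32 tensor: ((2-1)*3 + (3-1)*1 + 1) * 4 = 24 bytes.
        BlockingDesc d{0, {2, 3}, {3, 1}};
        return edgeByteSize(d, 4) == 24 ? 0 : 1;
    }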
+ // + // Make first N (N == port_num) edge indexes are matched with port index + for (auto &node : graphNodes) { + { + int port_num = node->inDims.size(); + std::vector res(port_num); + + for (int i = 0; i < node->parentEdges.size(); i++) { + auto edge = node->getParentEdgeAt(i); + int port = edge->getOutputNum(); + if (!res[port]) + res[port] = edge; + else + res.push_back(edge); + } + node->parentEdges = {res.begin(), res.end()}; + } + { + int port_num = node->outDims.size(); + std::vector res(port_num); + + for (int i = 0; i < node->childEdges.size(); i++) { + auto edge = node->getChildEdgeAt(i); + int port = edge->getInputNum(); + if (!res[port]) + res[port] = edge; + else + res.push_back(edge); + } + node->childEdges = {res.begin(), res.end()}; + } + } } void MKLDNNGraph::GetPerfData(std::map &perfMap) const { + unsigned i = 0; std::function &, const MKLDNNNodePtr&)> getPerfMapFor = [&](std::map &perfMap, const MKLDNNNodePtr& node) { InferenceEngine::InferenceEngineProfileInfo &pc = perfMap[node->getName()]; + pc.execution_index = i++; // TODO: Why time counter is signed? pc.cpu_uSec = pc.realTime_uSec = (long long) node->PerfCounter().avg(); pc.status = pc.cpu_uSec > 0 ? InferenceEngine::InferenceEngineProfileInfo::EXECUTED @@ -863,38 +742,40 @@ void MKLDNNGraph::DropNode(const MKLDNNNodePtr &node) { } } }; - for (size_t i = 0; i < node->parentEdges.size(); i++) { - if (!node->parentEdges[i].lock()) - continue; - auto parent = node->parentEdges[i].lock()->getParent(); - if (!parent) - continue; - for (size_t j = 0; j < node->childEdges.size(); j++) { - if (!node->childEdges[j].lock()) + auto childs = node->childEdges; + auto parents = node->parentEdges; + + for (size_t i = 0; i < parents.size(); i++) { + auto p_edge = parents[i].lock(); + if (!p_edge) continue; + auto parent = p_edge->getParent(); + if (!parent) continue; + + for (size_t j = 0; j < childs.size(); j++) { + if (!childs[j].lock()) continue; - auto child = node->childEdges[j].lock()->getChild(); + auto child = childs[j].lock()->getChild(); if (!child) continue; - MKLDNNEdgePtr remEdge = node->parentEdges[i].lock(); + MKLDNNEdgePtr &remEdge = p_edge; int inNum = 0; if (remEdge) { inNum = remEdge->getInputNum(); - node->removeEdge(remEdge); + remEdge->drop(); removeEdge(*this, remEdge); } - inNum += j; - remEdge = node->childEdges[j].lock(); + remEdge = childs[j].lock(); int outNum = 0; if (remEdge) { outNum = remEdge->getOutputNum(); - node->removeEdge(remEdge); + remEdge->drop(); removeEdge(*this, remEdge); } - MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, child)); - this->GetEdges().push_back(newEdge); - parent->addEdge(newEdge, outNum, inNum); + MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, child, inNum, outNum)); + graphEdges.push_back(newEdge); + parent->addEdge(newEdge); } } } @@ -939,7 +820,10 @@ void MKLDNNGraph::dumpToDotFile(std::string file) const { void MKLDNNGraph::do_before(const std::string &dir, const MKLDNNNodePtr &node) { auto exec_order = std::to_string(node->execIndex); std::string nodeName = node->name; + std::replace(nodeName.begin(), nodeName.end(), '\\', '_'); std::replace(nodeName.begin(), nodeName.end(), '/', '_'); + std::replace(nodeName.begin(), nodeName.end(), ' ', '_'); + std::replace(nodeName.begin(), nodeName.end(), ':', '_'); auto num_ports = node->getSelectedPrimitiveDescriptor()->getConfig().inConfs.size(); for (size_t i = 0; i < num_ports; i++) { @@ -948,18 +832,27 @@ void MKLDNNGraph::do_before(const std::string &dir, const MKLDNNNodePtr &node) { auto dump_file = dir + "/#" + exec_order 
+ "_" + nodeName + "_in" + std::to_string(i) + ".ieb"; TensorDesc desc = prEdge->getDesc(); + if (desc.getPrecision() == Precision::BIN) + return; Blob::Ptr blob = make_blob_with_precision(desc, prEdge->getMemoryPtr()->GetData()); BlobDumper dumper(blob); if (pr->ext_scales) dumper.withScales(pr->ext_scales); +#ifdef DUMP_AS_TEXT + dumper.dumpAsTxt(dump_file); +#else dumper.dump(dump_file); +#endif } } void MKLDNNGraph::do_after(const std::string &dir, const MKLDNNNodePtr &node) { auto exec_order = std::to_string(node->execIndex); auto nodeName = node->name; + std::replace(nodeName.begin(), nodeName.end(), '\\', '_'); std::replace(nodeName.begin(), nodeName.end(), '/', '_'); + std::replace(nodeName.begin(), nodeName.end(), ' ', '_'); + std::replace(nodeName.begin(), nodeName.end(), ':', '_'); auto num_ports = node->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size(); for (size_t i = 0; i < num_ports; i++) { @@ -967,15 +860,25 @@ void MKLDNNGraph::do_after(const std::string &dir, const MKLDNNNodePtr &node) { auto dump_file = dir + "/#" + exec_order + "_" + nodeName + "_out" + std::to_string(i) + ".ieb"; TensorDesc desc = childEdge->getDesc(); + if (desc.getPrecision() == Precision::BIN) + return; Blob::Ptr blob = make_blob_with_precision(desc, childEdge->getMemoryPtr()->GetData()); BlobDumper dumper(blob); if (node->ext_scales) dumper.withScales(node->ext_scales); +#ifdef DUMP_AS_TEXT + dumper.dumpAsTxt(dump_file); +#else dumper.dump(dump_file); +#endif } } +InferenceEngine::ICNNNetwork::Ptr MKLDNNGraph::dump() const { + return dump_graph_as_ie_net(*this); +} + bool MKLDNNExecNetwork::CanProcessDynBatch(const InferenceEngine::ICNNNetwork &network) const { InputsDataMap inputs; network.getInputsInfo(inputs); @@ -1037,16 +940,26 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network const MKLDNNExtensionManager::Ptr& extMgr) : extensionManager(extMgr) { ICNNNetworkStats* pstats = nullptr; StatusCode s = network.getStats(&pstats, nullptr); - // we are cloning network if we have statistics and we can transform network - // in other case we pass original network. Especially because LSTM networks - // are not cloned properly - details::CNNNetworkImplPtr clonedNetwork; + // we are cloning network if we have statistics and we can transform network. + auto clonedNetwork = cloneNet(network); + if (s == StatusCode::OK && pstats && !pstats->isEmpty()) { CNNNetworkInt8Normalizer cnnorm; - clonedNetwork = cloneNet(network); cnnorm.NormalizeNetwork(*clonedNetwork, *pstats); } - bool ti_proc_ok = !NetPass::CombineLSTMSeq(network) ? NetPass::UnrollTI(network) : true; + + bool ti_proc_ok = !NetPass::CombineRNNSeq(*clonedNetwork) ? NetPass::UnrollTI(*clonedNetwork) : true; + ti_proc_ok &= NetPass::UnrollRNN_if(*clonedNetwork, [] (RNNCellBase rnn) -> bool { + if (rnn.clip != 0.0f) + return true; + if ((rnn.cellType == RNNCellBase::GRU || rnn.cellType == RNNCellBase::GRU_LBR) && + rnn.activations != std::vector {"sigmoid", "tanh"}) + return true; + if (rnn.cellType == RNNCellBase::LSTM && + rnn.activations != std::vector {"sigmoid", "tanh", "tanh"}) + return true; + return false; + }); if (!ti_proc_ok) THROW_IE_EXCEPTION << "Plugin doesn't support Tensor Iterator in pure form. " "None TI optimization pattern has been applied successfully"; @@ -1054,7 +967,7 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network if (cfg.batchLimit > 1) { // check topology for applicability - if (!CanProcessDynBatch(clonedNetwork ? 
*clonedNetwork : network)) { + if (!CanProcessDynBatch(*clonedNetwork)) { THROW_IE_EXCEPTION << "MKLDNNGraph::CreateGraph: such topology cannot be compiled for dynamic batch!"; } } @@ -1081,7 +994,7 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network } _graph->setConfig(cfg); - _graph->CreateGraph(clonedNetwork ? *clonedNetwork : network, extensionManager); + _graph->CreateGraph(*clonedNetwork, extensionManager); if (cfg.throughputStreams > 1) // for streams, each worker thread has it's own graph MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph = _graph; }); @@ -1126,3 +1039,7 @@ void MKLDNNExecNetwork::CreateInferRequest(InferenceEngine::IInferRequest::Ptr & mkldnnSyncRequest->SetGraph(graphs[0]); } } + +void MKLDNNExecNetwork::GetExecGraphInfo(InferenceEngine::ICNNNetwork::Ptr &graphPtr) { + graphPtr = graphs[0]->dump(); +} \ No newline at end of file diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph.h index de026b5..7b01c71 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -111,8 +111,9 @@ public: #endif } + InferenceEngine::ICNNNetwork::Ptr dump() const; + protected: - MKLDNNNodePtr FindNodeWithName(const std::string& name) const; void VisitNode(MKLDNNNodePtr node, std::vector& sortedNodes); void SortTopologically(); @@ -144,6 +145,8 @@ protected: #endif mkldnn::engine eng; + void Replicate(const ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr); + void InitGraph(); void InitNodes(); void InitEdges(); void Allocate(); @@ -164,8 +167,6 @@ private: InferenceEngine::CNNLayerPtr cnnLayer; size_t outIdx; }; - void ParseNode(const InferenceEngine::CNNLayerPtr& cnnLayer, MKLDNNNodePtr& parent, - const MKLDNNExtensionManager::Ptr& extMgr, size_t outIdx, std::vector& layers); }; @@ -188,6 +189,8 @@ public: void setProperty(const std::map &properties); + void GetExecGraphInfo(InferenceEngine::ICNNNetwork::Ptr &graphPtr) override; + protected: std::vector graphs; MKLDNNExtensionManager::Ptr extensionManager; diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp index ae24579..8b9bcc8 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp @@ -1,10 +1,23 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2016-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. 
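The load path above first tries NetPass::CombineRNNSeq, falls back to unrolling TensorIterator, and then unrolls any remaining RNN cell whose configuration has no fused kernel: a non-zero clip, or activation sets other than {sigmoid, tanh} for the GRU variants and {sigmoid, tanh, tanh} for LSTM. The predicate reads more clearly on its own; Cell mirrors just the RNNCellBase fields the lambda inspects:

    // Standalone restatement of the UnrollRNN_if predicate from the hunk
    // above; 'Cell' is a stand-in for RNNCellBase.
    #include <string>
    #include <vector>

    struct Cell {
        enum Type { LSTM, GRU, GRU_LBR, RNN } cellType;
        float clip;
        std::vector<std::string> activations;
    };

    // True => the cell must be unrolled (no fused kernel covers it).
    static bool mustUnroll(const Cell& rnn) {
        if (rnn.clip != 0.0f)
            return true;
        if ((rnn.cellType == Cell::GRU || rnn.cellType == Cell::GRU_LBR) &&
            rnn.activations != std::vector<std::string>{"sigmoid", "tanh"})
            return true;
        if (rnn.cellType == Cell::LSTM &&
            rnn.activations != std::vector<std::string>{"sigmoid", "tanh", "tanh"})
            return true;
        return false;
    }

    int main() {
        Cell ok  {Cell::LSTM, 0.0f, {"sigmoid", "tanh", "tanh"}};
        Cell clip{Cell::LSTM, 1.0f, {"sigmoid", "tanh", "tanh"}};
        return (!mustUnroll(ok) && mustUnroll(clip)) ? 0 : 1;
    }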
// #include "mkldnn_graph_dumper.h" #include "cnn_network_impl.hpp" #include "ie_util_internal.hpp" +#include "exec_graph_info.hpp" #include #include @@ -33,7 +46,7 @@ std::shared_ptr dump_graph_as_ie_net(const MKLDNNGraph &graph) { auto net = std::make_shared(); net->setPrecision(Precision::FP32); - net->setName("internal_cpu_graph"); + net->setName("runtime_cpu_graph"); std::map node2layer; // Copy all nodes to network @@ -109,6 +122,7 @@ static std::map type_n2l { {Lrn, "Lrn"}, {Pooling, "Pool"}, {FullyConnected, "FC"}, + {FullyConnected_Activation, "FC_Activ"}, {SoftMax, "SoftMax"}, {Split, "Split"}, {Concatenation, "Concat"}, @@ -122,37 +136,24 @@ static std::map type_n2l { {BatchNormalization, "BatchNorm"}, {Flatten, "Flatten"}, {Permute, "Permute"}, + {Quantize, "Quantize"}, + {BinaryConvolution, "BinaryConvolution"}, {MemoryOutput, "MemoryIn"}, {MemoryInput, "MemoryOut"} }; -static const std::string ORIGIN_NAMES = "origin"; -static const std::string IMPL_TYPE = "impl"; -static const std::string PRECISION = "prec"; -static const std::string PERF_COUNTER = "perf"; - -static const std::string BLUE = "#D8D9F1"; -static const std::string GREEN = "#D9EAD3"; +static const char BLUE[] = "#D8D9F1"; +static const char GREEN[] = "#D9EAD3"; void copy_node_metadata(const MKLDNNNodePtr &node, CNNLayer::Ptr &layer) { layer->type = type_n2l[node->getType()]; layer->name = node->getName(); // Is ID - if (node->getCnnLayer()) { - // Original layer names - std::vector internal = node->getFusedWith(); - auto &merged = node->getMergeWith(); - internal.insert(internal.end(), merged.begin(), merged.end()); - - std::string orig_names = node->getCnnLayer()->name; - for (auto &sub_node : internal) - orig_names += " " + sub_node->getCnnLayer()->name; - - layer->params[ORIGIN_NAMES] = orig_names; - } + // Original layers + layer->params[ExecGraphInfoSerialization::ORIGIN_NAMES] = node->getOriginalLayers(); // Implementation type name - layer->params[IMPL_TYPE] = node->getPrimitiveDescriptorType(); + layer->params[ExecGraphInfoSerialization::IMPL_TYPE] = node->getPrimitiveDescriptorType(); // Precision // TODO: That is not fully correct mapping type to precision. 
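copy_node_metadata now publishes runtime facts through shared key constants from exec_graph_info.hpp instead of file-local strings, so every consumer of the dumped runtime graph reads one vocabulary, and the perf counter is always present: either an average time or the explicit sentinel "not_executed". A sketch of the producer side; the literal key strings here are assumptions, since the hunk only shows the constant names:

    // Sketch of metadata emission keyed by shared constants; the std::map
    // stands in for CNNLayer::params.
    #include <cstdint>
    #include <map>
    #include <string>

    namespace ExecGraphInfoSerialization {
        const char ORIGIN_NAMES[] = "originalLayersNames";  // assumed value
        const char IMPL_TYPE[]    = "primitiveType";        // assumed value
        const char PERF_COUNTER[] = "execTimeMcs";          // assumed value
    }

    static void emitMetadata(std::map<std::string, std::string>& params,
                             const std::string& origins,
                             const std::string& implType,
                             uint64_t avgMicroseconds) {
        params[ExecGraphInfoSerialization::ORIGIN_NAMES] = origins;
        params[ExecGraphInfoSerialization::IMPL_TYPE]    = implType;
        // Zero means the node never ran; the dumper encodes that explicitly.
        params[ExecGraphInfoSerialization::PERF_COUNTER] =
            avgMicroseconds != 0 ? std::to_string(avgMicroseconds)
                                 : "not_executed";
    }

    int main() {
        std::map<std::string, std::string> params;
        emitMetadata(params, "conv1 relu1", "jit_avx2_FP32", 0);
        return params.count("execTimeMcs") == 1 ? 0 : 1;
    }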
@@ -169,11 +170,13 @@ void copy_node_metadata(const MKLDNNNodePtr &node, CNNLayer::Ptr &layer) { if (impl_type & jit && impl_type & avx512 && node->getParentEdgeAt(0)->getDesc().getPrecision() == Precision::U8) precision = "INT8"; - layer->params[PRECISION] = precision; + layer->params[ExecGraphInfoSerialization::PRECISION] = precision; // Performance if (node->PerfCounter().avg() != 0) { - layer->params[PERF_COUNTER] = std::to_string(node->PerfCounter().avg())+ " mcs"; + layer->params[ExecGraphInfoSerialization::PERF_COUNTER] = std::to_string(node->PerfCounter().avg()); + } else { + layer->params[ExecGraphInfoSerialization::PERF_COUNTER] = "not_executed"; // it means it was not calculated yet } } @@ -183,25 +186,29 @@ void drawer_callback(const InferenceEngine::CNNLayerPtr layer, const auto ¶ms = layer->params; // Implementation - auto impl = params.find(IMPL_TYPE); + auto impl = params.find(ExecGraphInfoSerialization::IMPL_TYPE); if (impl != params.end()) { printed_properties.push_back({"impl", impl->second}); } // Original names - auto orig = params.find(ORIGIN_NAMES); + auto orig = params.find(ExecGraphInfoSerialization::ORIGIN_NAMES); if (orig != params.end()) { printed_properties.push_back({"originals", orig->second}); } // Precision - auto prec = params.find(PRECISION); + auto prec = params.find(ExecGraphInfoSerialization::PRECISION); if (prec != params.end()) { printed_properties.push_back({"precision", prec->second}); } // Set color node_properties.push_back({"fillcolor", prec->second == "FP32" ? GREEN : BLUE}); + + // Set xlabel containing PM data if calculated + auto perf = layer->params.find(ExecGraphInfoSerialization::PERF_COUNTER); + node_properties.push_back({"xlabel", (perf != layer->params.end()) ? perf->second : ""}); } } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h index 6ec5ffc..b419109 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2016-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. 
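The drawer callback pairs with that metadata: it copies the implementation type, original layer names, and precision into the printed properties, colors FP32 nodes green and everything else blue, and attaches the perf counter as a Graphviz xlabel. The sketch below follows that flow but guards the precision lookup before dereferencing it, unlike the fillcolor line in the hunk, which reads prec->second outside the corresponding end() check; key strings are again assumed values:

    // Sketch of the dot-drawer callback: copy selected params into printed
    // properties, color by precision, attach perf data as an xlabel.
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    using Props = std::vector<std::pair<std::string, std::string>>;

    static const char BLUE[]  = "#D8D9F1";
    static const char GREEN[] = "#D9EAD3";

    static void drawerCallback(const std::map<std::string, std::string>& params,
                               Props& printed, Props& nodeProps) {
        auto copyIf = [&](const char* key, const char* label) {
            auto it = params.find(key);
            if (it != params.end())
                printed.push_back({label, it->second});
        };
        copyIf("primitiveType", "impl");            // key values assumed
        copyIf("originalLayersNames", "originals");

        auto prec = params.find("precision");
        if (prec != params.end()) {
            printed.push_back({"precision", prec->second});
            nodeProps.push_back({"fillcolor",
                                 prec->second == "FP32" ? GREEN : BLUE});
        }
        // xlabel shows measured time when the node actually executed.
        auto perf = params.find("execTimeMcs");
        nodeProps.push_back({"xlabel",
                             perf != params.end() ? perf->second : ""});
    }

    int main() {
        Props printed, nodeProps;
        drawerCallback({{"precision", "FP32"}, {"execTimeMcs", "42"}},
                       printed, nodeProps);
        return nodeProps.size() == 2 ? 0 : 1;
    }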
// #pragma once diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp index 6c88ebd..4723403 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -17,6 +17,9 @@ #include #include #include +#include +#include +#include "cpu_isa_traits.hpp" using namespace mkldnn; using namespace MKLDNNPlugin; @@ -28,8 +31,8 @@ void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) { MergeGroupConvolution(graph); graph.RemoveDroppedNodes(); -// SLTMTransform(graph); -// RemoveDropped(graph); + FuseConvolutionAndDepthwise(graph); + graph.RemoveDroppedNodes(); FuseConvolutionAndActivation(graph); graph.RemoveDroppedNodes(); @@ -40,9 +43,15 @@ void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) { FuseConvolutionAndDWConvolution(graph); graph.RemoveDroppedNodes(); + FuseBinaryConvolutionAndQuantize(graph); + graph.RemoveDroppedNodes(); + FuseBatchNormWithScale(graph); graph.RemoveDroppedNodes(); + FuseFullyConnectedAndActivation(graph); + graph.RemoveDroppedNodes(); + RemoveIdentityOperator(graph); graph.RemoveDroppedNodes(); @@ -113,6 +122,9 @@ void MKLDNNGraphOptimizer::MergeGroupConvolution(MKLDNNGraph &graph) { conv->inDims[0] = convInDims; conv->outDims[0] = convOutDims; + conv->fuseWith(split); + conv->fuseWith(concat); + graph.DropNode(split); graph.DropNode(concat); } @@ -167,11 +179,12 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) { }; for (int i = 0; i < graphNodes.size(); i++) { - if (graphNodes[i]->getType() == Convolution) { + if (graphNodes[i]->getType() == Convolution || graphNodes[i]->getType() == BinaryConvolution) { auto conv = graphNodes[i]; auto fuse = [&] (MKLDNNNodePtr relu) { - conv->setType(Convolution_Activation); + if (graphNodes[i]->getType() != BinaryConvolution) + conv->setType(Convolution_Activation); conv->fuseWith(relu); }; @@ -215,9 +228,10 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDepthwise(MKLDNNGraph &graph) { auto& graphNodes = graph.GetNodes(); auto isSutableParentNode = [](MKLDNNNodePtr node) { - return (node->getType() == Convolution || node->getType() == Convolution_Activation) && - node->getCnnLayer()->precision == Precision::FP32 && - (node->getChildEdges().size() == 1); + bool isSutableConv = (node->getType() == Convolution || node->getType() == Convolution_Activation) && + node->getCnnLayer()->precision == Precision::FP32; + bool isSutableBinConv = node->getType() == BinaryConvolution; + return (isSutableConv || isSutableBinConv) && node->getChildEdges().size() == 1; }; auto isSutableChildNode = [](MKLDNNNodePtr node) { @@ -240,7 +254,8 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDepthwise(MKLDNNGraph &graph) { if (!isSutableChildNode(depthwise0)) continue; conv->fuseWith(depthwise0); - conv->setType(Convolution_Depthwise); + if (conv->type != BinaryConvolution) + conv->setType(Convolution_Depthwise); if (depthwise0->getChildEdges().size() == 1) { auto depthwise1 = depthwise0->getChildEdgeAt(0)->getChild(); @@ -262,64 +277,163 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) { return node->getType() == Convolution || node->getType() == Convolution_Activation; }; + auto isBinaryConvolutionNode = [](MKLDNNNodePtr node) { + return 
node->getType() == BinaryConvolution; + }; + auto is1x1Convolution = [](ConvolutionLayer* layer) { return layer->_kernel[X_AXIS] == 1 && layer->_kernel[Y_AXIS] == 1; }; auto isSutableParentConvolution = [&](MKLDNNNodePtr node) { - auto* layer = dynamic_cast(node->getCnnLayer().get()); + if (isBinaryConvolutionNode(node)) { + auto *layer = dynamic_cast(node->getCnnLayer().get()); + + bool isSupportedParams = layer->_group == 1; + if (!isSupportedParams) return false; + } else { + auto *layer = dynamic_cast(node->getCnnLayer().get()); - bool isSupportedParams = layer->_group == 1 && - ((is1x1Convolution(layer) && - layer->_stride[X_AXIS] == 1 && layer->_stride[Y_AXIS] == 1) || !is1x1Convolution(layer)) && - layer->precision == Precision::FP32;; - if (!isSupportedParams) return false; + bool isSupportedParams = layer->_group == 1 && + ((is1x1Convolution(layer) && layer->_stride[X_AXIS] == 1 && + layer->_stride[Y_AXIS] == 1) || !is1x1Convolution(layer)) && + (layer->precision == Precision::FP32 || layer->precision == Precision::I8); + if (!isSupportedParams) return false; + } return node->getChildEdges().size() == 1 && isConvolutionNode(node->getChildEdgeAt(0)->getChild()); }; - auto isSutableChildConvolution = [](MKLDNNNodePtr node) { - auto* layer = dynamic_cast(node->getCnnLayer().get()); - auto allPads = getPaddings(*layer); - bool isSupportedParams = layer->_out_depth == layer->_group && + auto isSutableChildConvolution = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) { + auto* childLayer = dynamic_cast(childNode->getCnnLayer().get()); - layer->_out_depth != 1 && - // Depthwise convolution output should be multiple of 8 + if (!isBinaryConvolutionNode(parentNode)) { + auto* parentLayer = dynamic_cast(parentNode->getCnnLayer().get()); + if (parentLayer->precision != childLayer->precision) + return false; + } - layer->_kernel[X_AXIS] == 3 && layer->_kernel[Y_AXIS] == 3 && + auto allPads = getPaddings(*childLayer); + bool isSupportedParams = childLayer->_out_depth == childLayer->_group && + childLayer->_out_depth != 1 && + // Depthwise convolution output should be multiple of 8 + childLayer->_kernel[X_AXIS] == 3 && childLayer->_kernel[Y_AXIS] == 3 && allPads.begin[X_AXIS] == 1 && allPads.begin[Y_AXIS] == 1 && - layer->_dilation[X_AXIS] == 1 && layer->_dilation[Y_AXIS] == 1 && - layer->_biases != nullptr && layer->_biases->size() != 0 && - layer->precision == Precision::FP32; + childLayer->_dilation[X_AXIS] == 1 && childLayer->_dilation[Y_AXIS] == 1 && + childLayer->_biases != nullptr && childLayer->_biases->size() != 0; + return isSupportedParams; }; - auto isFusingWorthwhile = [](MKLDNNNodePtr node) { - auto inDims = node->inDims[0]; - auto outDims = node->outDims[0]; + auto isFusingWorthwhile = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) { + if (isBinaryConvolutionNode(parentNode)) { + return true; + } + + auto* layer = dynamic_cast(childNode->getCnnLayer().get()); + + auto inDims = childNode->inDims[0]; + auto outDims = childNode->outDims[0]; + int elemSize = MKLDNNExtensionUtils::sizeOfDataType(MKLDNNExtensionUtils::IEPrecisionToDataType(layer->precision)); int L3_cache_size = mkldnn_get_cache_size(3, false); - int dw_conv_input_size = inDims[0] * inDims[1] * inDims[2] * inDims[3] * sizeof(float); - int dw_conv_output_size = outDims[0] * outDims[1]* outDims[2] * outDims[3] * sizeof(float); - return (dw_conv_input_size + dw_conv_output_size > L3_cache_size / 2); + int dw_conv_input_size = inDims[0] * inDims[1] * inDims[2] * inDims[3] * elemSize; + int dw_conv_output_size 
= outDims[0] * outDims[1]* outDims[2] * outDims[3] * elemSize; + + bool isInt8 = layer->precision == Precision::I8 || layer->precision == Precision::U8; + bool isAVX512NotSupported = !mkldnn::impl::cpu::mayiuse(impl::cpu::cpu_isa_t::avx512_common); + + return isInt8 ? isAVX512NotSupported : (dw_conv_input_size + dw_conv_output_size > L3_cache_size / 2); }; for (int i = 0; i < graphNodes.size(); i++) { - if (!isConvolutionNode(graphNodes[i])) continue; + if (!isConvolutionNode(graphNodes[i]) && !isBinaryConvolutionNode(graphNodes[i])) continue; auto parentConvNode = graphNodes[i]; if (!isSutableParentConvolution(parentConvNode)) continue; auto childConvNode = parentConvNode->getChildEdgeAt(0)->getChild(); - if (!isSutableChildConvolution(childConvNode)) continue; + if (!isSutableChildConvolution(parentConvNode, childConvNode)) continue; - if (!isFusingWorthwhile(childConvNode)) continue; + if (!isFusingWorthwhile(parentConvNode, childConvNode)) continue; parentConvNode->fuseWith(childConvNode); graph.DropNode(childConvNode); } } +void MKLDNNGraphOptimizer::FuseBinaryConvolutionAndQuantize(MKLDNNGraph &graph) { + auto removeEdge = [](MKLDNNGraph &graph, MKLDNNEdgePtr& edge) { + auto& edges = graph.GetEdges(); + for (auto it = edges.begin(); it != edges.end(); it++) { + if ((*it) == edge) { + edges.erase(it); + return; + } + } + }; + + auto& graphNodes = graph.GetNodes(); + + auto isSutableParentNode = [](MKLDNNNodePtr node) { + bool isSutableBinConv = node->getType() == BinaryConvolution; + return isSutableBinConv && node->getChildEdges().size() == 1; + }; + + auto isSutableChildNode = [](MKLDNNNodePtr node) { + if (!node->getCnnLayer()) + return false; + + auto* quantizeLayer = dynamic_cast(node->getCnnLayer().get()); + bool isSutableQuantize = node->getType() == Quantize && quantizeLayer->levels == 2; + + return isSutableQuantize; + }; + + for (int i = 0; i < graphNodes.size(); i++) { + auto parent = graphNodes[i]; + if (!isSutableParentNode(parent)) continue; + + auto child = parent->getChildEdgeAt(0)->getChild(); + if (!isSutableChildNode(child)) continue; + + parent->fuseWith(child); + + auto* binConvNode = dynamic_cast(parent.get()); + + auto parents = child->parentEdges; + for (size_t i = 0; i < parents.size(); i++) { + auto p_edge = parents[i].lock(); + if (p_edge->getParent()->getType() == Input) { + InferenceEngine::SizeVector dims; + dims.push_back(binConvNode->getChildEdgeAt(0)->getDims()[1]); + + auto InputLowBlob = dynamic_cast*>(p_edge->getParent()->getCnnLayer()->blobs["custom"].get()); + + auto inputLowData = InputLowBlob->buffer().as(); + int inputLowAxis = p_edge->getDims().ndims() == 1 ? 0 : 1; + bool isInputLowBroadcasted = p_edge->getDims()[inputLowAxis] != dims[0]; + + for (int i = 0; i < dims[0]; i++) { + binConvNode->pushBinarizationThreshold(inputLowData[isInputLowBroadcasted ? 
0 : i]); + } + + break; + } + } + + for (size_t i = 0; i < parents.size(); i++) { + auto p_edge = parents[i].lock(); + if (p_edge->getParent()->getType() == BinaryConvolution) + continue; + + removeEdge(graph, p_edge); + } + + graph.DropNode(child); + } +} + /** * Check if there is a data dependency between parent and child * BFS starting from parent and comparing with child @@ -417,18 +531,18 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG if (!std::dynamic_pointer_cast(graphNode)->isSum()) continue; if (!std::dynamic_pointer_cast(graphNode)->isUnitScales()) continue; + auto parent1 = graphNode->getParentEdgeAt(0)->getParent(); + auto parent2 = graphNode->getParentEdgeAt(1)->getParent(); // TODO: Enlarge to several inputs if (graphNode->getParentEdges().size() != 2 || - (graphNode->getParentEdgeAt(0)->getParent()->getType() != Convolution && - graphNode->getParentEdgeAt(1)->getParent()->getType() != Convolution)) + (parent1->getType() != Convolution && parent1->getType() != BinaryConvolution && + parent2->getType() != Convolution && parent2->getType() != BinaryConvolution)) continue; - auto parent1 = graphNode->getParentEdgeAt(0)->getParent(); - auto parent2 = graphNode->getParentEdgeAt(1)->getParent(); - - auto mergedConv = (parent1->getType() == Convolution) ? parent1 : parent2; - auto peerNode = (parent1->getType() == Convolution) ? parent2 : parent1; - if (peerNode->getType() == Convolution && mergedConv->getChildEdges().size() != 1) { + auto mergedConv = (parent1->getType() == Convolution || parent1->getType() == BinaryConvolution) ? parent1 : parent2; + auto peerNode = (parent1->getType() == Convolution || parent1->getType() == BinaryConvolution) ? parent2 : parent1; + if ((peerNode->getType() == Convolution || peerNode->getType() == BinaryConvolution) && + mergedConv->getChildEdges().size() != 1) { mergedConv = parent2; peerNode = parent1; } @@ -455,16 +569,23 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG isFusingSupported(graphNode, graphNode->getChildEdgeAt(0)->getChild())) { auto relu_shared = graphNode->getChildEdgeAt(0)->getChild(); lastNode = relu_shared; - mergedConv->setType(Convolution_Sum_Activation); + if (mergedConv->getType() != BinaryConvolution) + mergedConv->setType(Convolution_Sum_Activation); mergedConv->fuseWith(sum); } else { - mergedConv->setType(Convolution_Sum); + if (mergedConv->getType() != BinaryConvolution) + mergedConv->setType(Convolution_Sum); } mergedConv->fuseWith(lastNode); - MKLDNNEdgePtr edgePtr(new MKLDNNEdge(peerNode, mergedConv)); - graph.GetEdges().push_back(edgePtr); + if (mergedConv->fusedWith.size() > 0 && + (mergedConv->fusedWith[0]->getType() == Convolution || mergedConv->fusedWith[0]->getType() == BinaryConvolution)) { + // Merged with DW_conv. 
Shape may change + mergedConv->inDims.push_back(mergedConv->fusedWith[0]->outDims[0]); + } else { + mergedConv->inDims.push_back(mergedConv->outDims[0]); + } size_t childIdx = 0; for (childIdx = 0; childIdx < peerNode->getChildEdges().size(); childIdx++) { @@ -473,17 +594,29 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG } } - mergedConv->addEdge(edgePtr, mergedConv->getParentEdges().size(), childIdx); + int peer_port = peerNode->getChildEdgeAt(childIdx)->getInputNum(); + peerNode->getChildEdgeAt(childIdx)->drop(); + + MKLDNNEdgePtr edgePtr(new MKLDNNEdge(peerNode, mergedConv, peer_port, 1)); + graph.GetEdges().push_back(edgePtr); + + mergedConv->addEdge(edgePtr); - for (size_t j = 0; j < lastNode->getChildEdges().size(); j++) { - auto child = lastNode->getChildEdgeAt(j)->getChild(); - edgePtr = lastNode->getChildEdgeAt(j); - int idxParent = edgePtr->getOutputNum(); - int idxChild = edgePtr->getInputNum(); + std::vector edges_to_reconnect = lastNode->getChildEdges(); + for (auto &edge_w : edges_to_reconnect) { + auto edge = edge_w.lock(); + auto child = edge->getChild(); + int idxParent = edge->getInputNum(); + int idxChild = edge->getOutputNum(); - MKLDNNEdgePtr newEdge(new MKLDNNEdge(mergedConv, child)); + // reconnect after activation/sum. Port index must be 0 + IE_ASSERT(idxParent == 0); + + edge->drop(); + + MKLDNNEdgePtr newEdge(new MKLDNNEdge(mergedConv, child, idxParent, idxChild)); graph.GetEdges().push_back(newEdge); - child->addEdge(newEdge, idxParent, idxChild); + child->addEdge(newEdge); } if (lastNode != sum) { @@ -493,6 +626,40 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG } } +void MKLDNNGraphOptimizer::FuseFullyConnectedAndActivation(MKLDNNGraph &graph) { + auto& graphNodes = graph.GetNodes(); + + auto isFusingSupported = [&](MKLDNNNodePtr fc, MKLDNNNodePtr activation) { + if (!activation->getCnnLayer()) + return false; + + auto* activationNode = dynamic_cast(activation.get()); + + // TODO: fuse on fp32 not optimized yet in mkl-dnn + return activationNode && fc->getCnnLayer()->precision != Precision::FP32 && + (activationNode->getAlgorithm() == eltwise_relu); + }; + + for (int i = 0; i < graphNodes.size(); i++) { + if (graphNodes[i]->getType() == FullyConnected) { + auto fc = graphNodes[i]; + + auto fuse = [&] (MKLDNNNodePtr relu) { + fc->setType(FullyConnected_Activation); + fc->fuseWith(relu); + }; + + if (fc->getChildEdges().size() == 1) { + auto ch1 = fc->getChildEdgeAt(0)->getChild(); + + if (isFusingSupported(fc, ch1)) { + fuse(ch1); + graph.DropNode(ch1); + } + } + } + } +} void MKLDNNGraphOptimizer::RemoveIdentityOperator(MKLDNNGraph &graph) { for (MKLDNNNodePtr& node : graph.GetNodes()) { @@ -538,6 +705,7 @@ void MKLDNNGraphOptimizer::DropDoubleReorders(MKLDNNGraph &graph) { } MKLDNNNodePtr p = n->getParentEdgeAt(0)->getParent(); + MKLDNNNodePtr c = nn->getChildEdgeAt(0)->getChild(); auto oldEdgeNum = n->getParentEdgeAt(0)->getInputNum(); @@ -547,7 +715,12 @@ void MKLDNNGraphOptimizer::DropDoubleReorders(MKLDNNGraph &graph) { processed.insert(node); processed.insert(nextNode); - auto edge = p->getChildEdgeAt(oldEdgeNum); + MKLDNNEdgePtr edge; + for (auto cur : p->getChildEdgesAtPort(oldEdgeNum)) { + if (cur->getChild() == c) + edge = cur; + } + if (!edge) THROW_IE_EXCEPTION << "Inappropriate graph processing"; std::string layerName = edge->getParent()->getName() + "_ScaleReorder_" + edge->getChild()->getName(); @@ -560,37 +733,38 @@ void MKLDNNGraphOptimizer::DropDoubleReorders(MKLDNNGraph 
&graph) { reorderPtr->setDescs(n->getInput(), nn->getOutput()); reorderPtr->_scales = scales; } - MKLDNNEdgePtr beforeNode(new MKLDNNEdge(edge->getParent(), newReorder)); - beforeNode->setDims(edge->getDims()); - MKLDNNEdgePtr afterNode(new MKLDNNEdge(newReorder, edge->getChild())); - afterNode->setDims(edge->getDims()); - int oIndex = edge->getOutputNum(); - int iIndex = edge->getInputNum(); + // new !!! + auto oIndex = edge->getOutputNum(); + auto iIndex = edge->getInputNum(); if (iIndex < 0 || oIndex < 0) THROW_IE_EXCEPTION << "Cannot create reorder for nodes: " << edge->getParent()->getName() << " and " << edge->getChild()->getName() << "."; + edge->drop(); + + MKLDNNEdgePtr beforeNode(new MKLDNNEdge(edge->getParent(), newReorder, iIndex, 0)); + MKLDNNEdgePtr afterNode(new MKLDNNEdge(newReorder, edge->getChild(), 0, oIndex)); // Add edge for beforeNode - edge->getParent()->childEdges[iIndex].reset(); - edge->getParent()->childEdges[iIndex] = beforeNode; beforeNode->getChild()->parentEdges.push_back(beforeNode); + edge->getParent()->childEdges.push_back(beforeNode); // Add edge for afterNode afterNode->getParent()->childEdges.push_back(afterNode); - edge->getChild()->parentEdges[oIndex].reset(); - edge->getChild()->parentEdges[oIndex] = afterNode; + edge->getChild()->parentEdges.push_back(afterNode); newReorder->getSupportedDescriptors(); newReorder->initSupportedPrimitiveDescriptors(); newReorder->selectOptimalPrimitiveDescriptor(); - beforeNode->getDesc(); graph.GetEdges().push_back(beforeNode); - afterNode->getDesc(); graph.GetEdges().push_back(afterNode); + // Just to check accordance + afterNode->getDesc(); + beforeNode->getDesc(); + newNodes.push_back(newReorder); graph.GetEdges().erase(std::remove(graph.GetEdges().begin(), graph.GetEdges().end(), edge), graph.GetEdges().end()); } diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h index 6818cc9..6a6d7d7 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -23,8 +23,10 @@ private: void FuseConvolutionAndActivation(MKLDNNGraph &graph); void FuseConvolutionAndDepthwise(MKLDNNGraph &graph); void FuseConvolutionAndDWConvolution(MKLDNNGraph &graph); + void FuseBinaryConvolutionAndQuantize(MKLDNNGraph &graph); void FuseBatchNormWithScale(MKLDNNGraph& graph); void FuseConvolutionSumAndConvolutionSumActivation(MKLDNNGraph &graph); + void FuseFullyConnectedAndActivation(MKLDNNGraph &graph); void RemoveIdentityOperator(MKLDNNGraph& graph); void RemoveIOScaleShifts(MKLDNNGraph& graph); diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp index 95e8039..573ab06 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -14,7 +14,7 @@ MKLDNNPlugin::MKLDNNInferRequest::MKLDNNInferRequest(InferenceEngine::InputsDataMap networkInputs, InferenceEngine::OutputsDataMap networkOutputs) - : InferRequestInternal(networkInputs, networkOutputs), m_curBatch(-1) {} + : InferRequestInternal(networkInputs, networkOutputs) {} template void 
MKLDNNPlugin::MKLDNNInferRequest::pushInput(const std::string& inputName, InferenceEngine::Blob::Ptr& inputBlob) { @@ -218,6 +218,7 @@ void MKLDNNPlugin::MKLDNNInferRequest::SetBlob(const char *name, const Inference } if (foundInput->getPreProcess().getResizeAlgorithm() != InferenceEngine::ResizeAlgorithm::NO_RESIZE) { + PreProcessData::isApplicable(data, _inputs[name]); // Stores the given blob as ROI blob. It will be used to fill in network input during pre-processing. _preProcData[name].setRoiBlob(data); } else { diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.h b/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.h index 6d88bc8..47f1191 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -46,7 +46,5 @@ private: void changeDefaultPtr(); MKLDNNGraph::Ptr graph; std::map externalPtr; - - int m_curBatch; }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp index 1821b88..5d9c345 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -81,7 +81,7 @@ void MKLDNNMemory::SetData(memory::data_type dataType, memory::format format, co GetDataType() != dataType) { auto memData = GetDescriptor().data; - std::vector dims(memData.dims, memData.dims + memData.ndims); + std::vector dims(memData.dims, memData.dims + memData.ndims); auto dataType = GetDataType(); @@ -220,7 +220,7 @@ bool MKLDNNMemory::isConsistant(memory::dims dims, memory::format format) { bool MKLDNNMemory::IsPlainFormat(memory::format format) { std::vector plains = {memory::nc, memory::nchw, memory::ncdhw, memory::nhwc, memory::ndhwc, memory::chwn, - memory::oi, memory::io, memory::oihw, memory::oidhw, memory::ihwo, + memory::oi, memory::io, memory::oihw, memory::oidhw, memory::ihwo, memory::tnc, memory::goihw, memory::blocked}; @@ -252,6 +252,7 @@ memory::format MKLDNNMemory::GetPlainFormat(memory::dims dims) { InferenceEngine::Layout MKLDNNMemory::GetPlainLayout(memory::dims dims) { switch (dims.size()) { + case 0: return Layout::SCALAR; case 1: return Layout::C; case 2: return Layout::NC; case 3: return Layout::CHW; @@ -290,7 +291,7 @@ void MKLDNNMemory::CreateBlockingDesc(memory::desc &desc) { const int prev_idx = perm[ndims - d]; const int curr_idx = perm[ndims - 1 - d]; - blk.strides[0][curr_idx] = dims[curr_idx] == 0 ? 1 : blk.strides[0][prev_idx] * (std::max)(1, dims[prev_idx]); + blk.strides[0][curr_idx] = dims[curr_idx] == 0 ? 1 : blk.strides[0][prev_idx] * (std::max)((ptrdiff_t)1, dims[prev_idx]); } } memory::format MKLDNNMemory::Convert(const InferenceEngine::Layout layout) { @@ -457,6 +458,9 @@ MKLDNNMemoryDesc::operator InferenceEngine::TensorDesc() const { case mkldnn_s32: precision = Precision::I32; break; + case mkldnn_bin: + precision = Precision::BIN; + break; default: THROW_IE_EXCEPTION << "Cannot cast to TensorDesc. 
Unsupported precision!"; } @@ -510,10 +514,17 @@ MKLDNNMemoryDesc::operator InferenceEngine::TensorDesc() const { case memory::nhwc: layout = Layout::NHWC; order = {0, 2, 3, 1}; - blkDims = {static_cast(dims[0]), - static_cast(dims[2]), - static_cast(dims[3]), - static_cast(dims[1])}; + if (precision == Precision::BIN) { + blkDims = {static_cast(dims[0]), + static_cast(dims[2]), + static_cast(dims[3]), + static_cast(rnd_up(dims[1], 8))}; + } else { + blkDims = {static_cast(dims[0]), + static_cast(dims[2]), + static_cast(dims[3]), + static_cast(dims[1])}; + } break; case memory::ndhwc: layout = Layout::NDHWC; @@ -621,7 +632,9 @@ MKLDNNMemoryDesc::MKLDNNMemoryDesc(const TensorDesc& tDesc): case Precision::I32: data_type = mkldnn::memory::data_type::s32; break; - + case Precision::BIN: + data_type = mkldnn::memory::data_type::bin; + break; default: THROW_IE_EXCEPTION << "Cannot create MKLDNNMemoryDesc from TensorDesc. Unsupported precision!"; } @@ -651,6 +664,7 @@ MKLDNNMemoryDesc::MKLDNNMemoryDesc(const TensorDesc& tDesc): case OIHW: mkldnnFormat = memory::format::oihw; break; + case SCALAR: case C: mkldnnFormat = memory::format::x; break; @@ -764,7 +778,7 @@ MKLDNNMemoryDesc::MKLDNNMemoryDesc(const TensorDesc& tDesc): const int prev_idx = perm[realDims.ndims() - d]; const int curr_idx = perm[realDims.ndims() - 1 - d]; - blk.strides[0][curr_idx] = realDims[curr_idx] == 0 ? 1 : blk.strides[0][prev_idx] * (std::max)(1, realDims[prev_idx]); + blk.strides[0][curr_idx] = realDims[curr_idx] == 0 ? 1 : blk.strides[0][prev_idx] * (std::max)((ptrdiff_t)1, realDims[prev_idx]); } } else { desc = MKLDNNMemoryDesc(realDims, data_type, mkldnnFormat); @@ -772,12 +786,12 @@ MKLDNNMemoryDesc::MKLDNNMemoryDesc(const TensorDesc& tDesc): desc.data.layout_desc.blocking.offset_padding = tDesc.getBlockingDesc().getOffsetPadding(); for (size_t i = 0; i < tDesc.getBlockingDesc().getOffsetPaddingToData().size() && i < TENSOR_MAX_DIMS; i++) { - desc.data.layout_desc.blocking.offset_padding_to_data[i] = static_cast(offsetsToData[i]); + desc.data.layout_desc.blocking.offset_padding_to_data[i] = static_cast(offsetsToData[i]); } if (notDefault) { for (size_t i = 0; i < strides.size() && i < desc.data.ndims; i++) { - desc.data.layout_desc.blocking.strides[0][i] = static_cast(strides[order[i]]); + desc.data.layout_desc.blocking.strides[0][i] = static_cast(strides[order[i]]); } } } diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_memory.h b/inference-engine/src/mkldnn_plugin/mkldnn_memory.h index 37578e5..0a047dd 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_memory.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_memory.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -76,7 +76,10 @@ public: } void* GetData() const { - return prim->get_data_handle(); + void* data = prim->get_data_handle(); + if (data == nullptr) + THROW_IE_EXCEPTION << "Cannot get memory!"; + return data; } mkldnn::memory::data_type GetDataType() const { @@ -92,7 +95,7 @@ public: mkldnn::memory::dims GetDims() const { auto data = GetDescriptor().data; - return std::vector(data.dims, data.dims + data.ndims); + return std::vector(data.dims, data.dims + data.ndims); } void Create(mkldnn::memory::dims dims, mkldnn::memory::data_type data_type, mkldnn::memory::format format, diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp index 73975b7..5740080 100644 --- 
a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -36,6 +36,8 @@ #include #include #include +#include +#include #include #include "mkldnn_extension_utils.h" #include "mkldnn_plugin.h" @@ -70,6 +72,8 @@ MKLDNNNode::Register MKLDNNSoftMaxNode::reg; MKLDNNNode::Register MKLDNNSplitNode::reg; MKLDNNNode::Register MKLDNNTileNode::reg; MKLDNNNode::Register MKLDNNPermuteNode::reg; +MKLDNNNode::Register MKLDNNQuantizeNode::reg; +MKLDNNNode::Register MKLDNNBinaryConvolutionNode::reg; MKLDNNNode::Register MKLDNNMemoryInputNode::reg; MKLDNNNode::Register MKLDNNMemoryOutputNode::reg; MKLDNNNode::Register MKLDNNRNN::reg; @@ -91,7 +95,6 @@ MKLDNNNode::MKLDNNNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn:: } } - parentEdges.resize(layer->insData.size()); for (const auto& inData : layer->insData) { inDims.emplace_back(inData.lock()->getDims()); } @@ -109,7 +112,7 @@ MKLDNNNode::MKLDNNNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn:: } } -void MKLDNNNode::addEdge(const MKLDNNEdgeWeakPtr& edge, size_t pIndex, size_t cIndex, bool insertChildIndex) { +void MKLDNNNode::addEdge(const MKLDNNEdgeWeakPtr& edge) { auto edgePtr = edge.lock(); if (!edgePtr) return; @@ -117,22 +120,9 @@ void MKLDNNNode::addEdge(const MKLDNNEdgeWeakPtr& edge, size_t pIndex, size_t cI auto childPtr = edgePtr->getChild(); if (!parentPtr || !childPtr) return; - if (cIndex < parentPtr->childEdges.size()) { - if (insertChildIndex) { - parentPtr->childEdges.insert(parentPtr->childEdges.begin() + cIndex, edge); - } else { - removeEdge(parentPtr->childEdges[cIndex]); - parentPtr->childEdges[cIndex] = edge; - } - } else { - parentPtr->childEdges.push_back(edge); - } - if (pIndex < childPtr->parentEdges.size()) { - removeEdge(childPtr->parentEdges[pIndex]); - childPtr->parentEdges[pIndex] = edge; - } else { - childPtr->parentEdges.push_back(edge); - } + + parentPtr->childEdges.push_back(edge); + childPtr->parentEdges.push_back(edge); } void MKLDNNNode::removeEdge(const MKLDNNEdgeWeakPtr& edge) { @@ -146,24 +136,26 @@ void MKLDNNNode::removeEdge(const MKLDNNEdgeWeakPtr& edge) { for (auto it = childPtr->parentEdges.begin(); it != childPtr->parentEdges.end(); it++) { auto parentEdge = (*it).lock(); if (parentEdge && parentEdge->getChild() == childPtr && parentEdge->getParent() == parentPtr) { - (*it).reset(); + childPtr->parentEdges.erase(it); break; } } for (auto it = parentPtr->childEdges.begin(); it != parentPtr->childEdges.end(); it++) { auto childEdge = (*it).lock(); if (childEdge && childEdge->getChild() == childPtr && childEdge->getParent() == parentPtr) { - (*it).reset(); + parentPtr->childEdges.erase(it); break; } } } void MKLDNNNode::remove() { - for (const auto &parentEdge : parentEdges) { + auto parent_edges = parentEdges; + for (const auto &parentEdge : parent_edges) { removeEdge(parentEdge); } - for (const auto &childEdge : childEdges) { + auto child_edges = childEdges; + for (const auto &childEdge : child_edges) { removeEdge(childEdge); } } @@ -355,11 +347,42 @@ const MKLDNNEdgePtr MKLDNNNode::getChildEdgeAt(size_t idx) const { return childEdgePtr; } +const std::vector MKLDNNNode::getParentEdgesAtPort(size_t idx) const { + if (idx >= inDims.size()) + THROW_IE_EXCEPTION << "Node " << getName() << " contains less input ports than " << idx; + + std::vector res; + for (auto &edge_w : parentEdges) { + auto 
edge = edge_w.lock(); + if (!edge) + THROW_IE_EXCEPTION << "Node " << getName() << " contains dead weak ptr"; + if (edge->getOutputNum() == idx) res.push_back(edge); + } + return res; +} + +const std::vector MKLDNNNode::getChildEdgesAtPort(size_t idx) const { + if (idx >= outDims.size()) + THROW_IE_EXCEPTION << "Node " << getName() << " contains less output ports than " << idx; + + std::vector res; + for (auto &edge_w : childEdges) { + auto edge = edge_w.lock(); + if (!edge) + THROW_IE_EXCEPTION << "Node " << getName() << " contains dead weak ptr"; + if (edge->getInputNum() == idx) res.push_back(edge); + } + return res; +} + + std::vector MKLDNNNode::getAvailableFormatsForDims(const MKLDNNDims &dims) const { if (dims.ndims() == 1) return {memory::format::x}; else if (dims.ndims() == 2) return {memory::format::nc}; + else if (dims.ndims() == 3) + return {memory::format::tnc, memory::format::ntc}; else if (dims.ndims() == 4) return {memory::format::nchw, memory::format::nChw8c, memory::format::nChw16c}; else if (dims.ndims() == 5) @@ -379,7 +402,7 @@ void MKLDNNNode::initSupportedPrimitiveDescriptors() { for (auto& desc : descs) { try { - primitive_desc_iterator itpd = desc.createPrimitiveDescriptorIterator(engine); + std::shared_ptr itpd = std::make_shared(desc.createPrimitiveDescriptorIterator(engine)); do { InferenceEngine::LayerConfig config; config.dynBatchSupport = true; @@ -387,7 +410,7 @@ void MKLDNNNode::initSupportedPrimitiveDescriptors() { InferenceEngine::DataConfig dataConfig; dataConfig.inPlace = -1; dataConfig.constant = false; - dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(getSrcMemDesc(itpd, i)); + dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(getSrcMemDesc(*itpd, i)); config.inConfs.push_back(dataConfig); } @@ -395,13 +418,13 @@ void MKLDNNNode::initSupportedPrimitiveDescriptors() { InferenceEngine::DataConfig dataConfig; dataConfig.inPlace = canBeInPlace() ? 0 : -1; dataConfig.constant = false; - dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(getDstMemDesc(itpd, i)); + dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(getDstMemDesc(*itpd, i)); config.outConfs.push_back(dataConfig); } - impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str()); + impl_desc_type impl_type = parse_impl_name(itpd->get_impl_info_str()); supportedPrimitiveDescriptors.emplace_back(config, impl_type); - } while (itpd.next()); + } while (itpd->next()); } catch (std::exception& e) { // it throw exception in case of no implementation found continue; @@ -422,12 +445,19 @@ void MKLDNNNode::initDescriptor(const InferenceEngine::LayerConfig &config) { outDescs.push_back(outConf.desc); createDescriptor({inDescs}, {outDescs}); + std::shared_ptr attr = initPrimitiveAttr(); + InferenceEngine::LayerConfig rightConfig = getSelectedPrimitiveDescriptor()->getConfig(); size_t selected_count = 0; for (size_t j = 0; j < descs.size(); j++) { try { const auto &desc = descs[j]; - primitive_desc_iterator itpd = desc.createPrimitiveDescriptorIterator(engine); + std::shared_ptr itpd; + if (attr == nullptr) { + itpd = std::make_shared(desc.createPrimitiveDescriptorIterator(engine)); + } else { + itpd = std::make_shared(desc.createPrimitiveDescriptorIterator(engine, *(attr.get()))); + } do { InferenceEngine::LayerConfig cfg; cfg.dynBatchSupport = true; @@ -435,7 +465,7 @@ void MKLDNNNode::initDescriptor(const InferenceEngine::LayerConfig &config) { InferenceEngine::DataConfig dataConfig; dataConfig.inPlace = canBeInPlace() ? 
0 : -1; dataConfig.constant = false; - dataConfig.desc = getSrcMemDesc(itpd, i); + dataConfig.desc = getSrcMemDesc(*itpd, i); cfg.inConfs.push_back(dataConfig); } @@ -443,10 +473,10 @@ void MKLDNNNode::initDescriptor(const InferenceEngine::LayerConfig &config) { InferenceEngine::DataConfig dataConfig; dataConfig.inPlace = -1; dataConfig.constant = false; - dataConfig.desc = getDstMemDesc(itpd, i); + dataConfig.desc = getDstMemDesc(*itpd, i); cfg.outConfs.push_back(dataConfig); } - impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str().c_str()); + impl_desc_type impl_type = parse_impl_name(itpd->get_impl_info_str().c_str()); if (selected_count == selectedPrimitiveDescriptorIndex) { if (impl_type != selectedPD->getImplementationType()) { THROW_IE_EXCEPTION << "Cannot get the original layer configuration!"; @@ -459,7 +489,7 @@ void MKLDNNNode::initDescriptor(const InferenceEngine::LayerConfig &config) { } } selected_count++; - } while (itpd.next()); + } while (itpd->next()); } catch(...) {} } @@ -505,31 +535,49 @@ InferenceEngine::Blob::Ptr MKLDNNNode::createInternalBlob(InferenceEngine::SizeV intLayout = InferenceEngine::Layout::OIHW; InferenceEngine::TensorDesc desc(blb->precision(), dims, intLayout); - InferenceEngine::TBlob::Ptr internalBlob = InferenceEngine::make_shared_blob(desc); - internalBlob->allocate(); - char *data = internalBlob->buffer(); - size_t intBuffSize = internalBlob->byteSize(); - - size_t offset = blb->byteSize(); - checkSize(intBuffSize, offset); - ie_memcpy(data, internalBlob->byteSize(), blb->buffer(), blb->byteSize()); - data += blb->byteSize(); - for (const auto &merged : getMergeWith()) { - wLayer = dynamic_cast(merged->getCnnLayer().get()); - if (wLayer == nullptr) - THROW_IE_EXCEPTION << "Cannot convert merged weightable layer for node " - << getName() << "."; - blb = weights ? wLayer->_weights : wLayer->_biases; - - if (blb == nullptr) - THROW_IE_EXCEPTION << "Cannot get internal blob layer for node " << getName() << "."; - offset += blb->byteSize(); + + auto fillInternalBlob = [&](char *data, size_t intBuffSize) { + size_t offset = blb->byteSize(); checkSize(intBuffSize, offset); - ie_memcpy(data, internalBlob->byteSize(), blb->buffer(), blb->byteSize()); + ie_memcpy(data, intBuffSize, blb->buffer(), blb->byteSize()); data += blb->byteSize(); - } + for (const auto &merged : getMergeWith()) { + wLayer = dynamic_cast(merged->getCnnLayer().get()); + if (wLayer == nullptr) + THROW_IE_EXCEPTION << "Cannot convert merged weightable layer for node " + << getName() << "."; + blb = weights ? 
wLayer->_weights : wLayer->_biases; + + if (blb == nullptr) + THROW_IE_EXCEPTION << "Cannot get internal blob layer for node " << getName() << "."; + offset += blb->byteSize(); + checkSize(intBuffSize, offset); + ie_memcpy(data, intBuffSize, blb->buffer(), blb->byteSize()); + data += blb->byteSize(); + } + }; - return internalBlob; + if (blb->precision() == Precision::BIN) { + InferenceEngine::TBlob::Ptr internalBlob = InferenceEngine::make_shared_blob(desc); + + internalBlob->allocate(); + char *data = internalBlob->buffer(); + size_t intBuffSize = internalBlob->byteSize(); + + fillInternalBlob(data, intBuffSize); + + return internalBlob; + } else { + InferenceEngine::TBlob::Ptr internalBlob = InferenceEngine::make_shared_blob(desc); + + internalBlob->allocate(); + char *data = internalBlob->buffer(); + size_t intBuffSize = internalBlob->byteSize(); + + fillInternalBlob(data, intBuffSize); + + return internalBlob; + } } void MKLDNNNode::prepareMemory(const PrimitiveDescInfo *selected_pd, mkldnn::primitive_desc_iterator& itpd) { @@ -632,6 +680,15 @@ MKLDNNNode::ConstantType MKLDNNNode::checkConstant(LOOK look, std::vectorname; + } else { + originalLayers += "," + layer->name; + } +} + void MKLDNNNode::cleanup() { internalBlobs.clear(); cnnLayer.reset(); @@ -673,6 +730,8 @@ std::string MKLDNNNode::typeToStr(Type type) { return "Pooling"; case FullyConnected: return "FullyConnected"; + case FullyConnected_Activation: + return "FullyConnected_Activation"; case Gemm: return "Gemm"; case SoftMax: @@ -707,10 +766,10 @@ std::string MKLDNNNode::typeToStr(Type type) { return "MemoryOutput"; case MemoryInput: return "MemoryInput"; - case RNN: - return "RNN"; - case LSTMCell: - return "LSTMCell"; + case RNNSeq: + return "RNNSeq"; + case RNNCell: + return "RNNCell"; default: return "Unknown"; @@ -877,7 +936,7 @@ void MKLDNNNode::initOptimalPrimitiveDescriptor() { config.outConfs[i].desc = getConfiguredOutputDesc(config, i); } initDescriptor(config); - } else if (getType() != RNN && getType() != LSTMCell) { + } else if (getType() != RNNSeq && getType() != RNNCell) { initDescriptor(config); } } diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.h b/inference-engine/src/mkldnn_plugin/mkldnn_node.h index fe71c66..b3060f8 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -43,6 +43,7 @@ enum Type { Lrn, Pooling, FullyConnected, + FullyConnected_Activation, SoftMax, Split, Concatenation, @@ -60,8 +61,10 @@ enum Type { Copy, MemoryOutput, MemoryInput, - LSTMCell, - RNN + RNNCell, + RNNSeq, + Quantize, + BinaryConvolution }; static Type TypeFromName(const std::string type) { @@ -78,6 +81,8 @@ static Type TypeFromName(const std::string type) { { "Logistic", Activation }, { "TanH", Activation }, { "ReLU6", Activation }, + { "Exp", Activation }, + { "Not", Activation }, { "Activation", Activation }, { "ScaleShift", Depthwise }, { "PReLU", Depthwise }, @@ -105,8 +110,14 @@ static Type TypeFromName(const std::string type) { { "Flatten", Flatten }, { "Permute", Permute }, { "Copy", Copy }, - { "LSTMCell", LSTMCell }, - { "RNN", RNN }, + { "LSTMCell", RNNCell }, + { "GRUCell", RNNCell }, + { "RNNCell", RNNCell }, + { "LSTMSequence", RNNSeq }, + { "GRUSequence", RNNSeq }, + { "RNNSequence", RNNSeq }, + { "Quantize", Quantize }, + { "BinaryConvolution", BinaryConvolution }, { "MemoryInput", 
MemoryInput}, // for construction from name ctor, arbitrary name is used { "Memory", MemoryOutput }, // for construction from layer ctor }; @@ -152,7 +163,7 @@ public: ~MKLDNNNode() override = default; - void addEdge(const MKLDNNEdgeWeakPtr& edge, size_t pIndex, size_t cIndex, bool insertChildIndex = false); + void addEdge(const MKLDNNEdgeWeakPtr& edge); void removeEdge(const MKLDNNEdgeWeakPtr& edge); virtual void cleanup(); @@ -169,6 +180,8 @@ public: const MKLDNNEdgePtr getParentEdgeAt(size_t idx) const; virtual const MKLDNNEdgePtr getChildEdgeAt(size_t idx) const; + const std::vector getParentEdgesAtPort(size_t idx) const; + const std::vector getChildEdgesAtPort(size_t idx) const; bool isDropped() { return (isEdgesEmpty(childEdges) && isEdgesEmpty(parentEdges)); @@ -190,6 +203,8 @@ public: mergedWith.push_back(merge); } + void addOriginalLayer(const InferenceEngine::CNNLayerPtr &layer); + const std::vector &getMergeWith() { return mergedWith; } @@ -202,6 +217,10 @@ public: return name; } + const std::string getOriginalLayers() const { + return originalLayers; + } + Type getType() const { return type; } @@ -309,17 +328,19 @@ public: THROW_IE_EXCEPTION << "Primitive descriptor was not found for node " << getName() << "."; } - static void invertVectorCopyUtoI(const InferenceEngine::PropertyVector& src, std::vector& dst) { + static void invertVectorCopyUtoI(const InferenceEngine::PropertyVector& src, std::vector& dst) { dst.clear(); for (int i = 1; i <= src.size(); i++) { - dst.push_back(static_cast(src[src.size() - i])); + dst.push_back(static_cast(src[src.size() - i])); } } + std::vector inDims; + + protected: // TODO: It is necessary only in order to avoid modifications of cnnLayers and original topology std::vector outDims; - std::vector inDims; void setType(Type type) { this->type = type; } @@ -331,6 +352,8 @@ protected: virtual MKLDNNMemoryDesc getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx); virtual MKLDNNMemoryDesc getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx); + virtual std::shared_ptr initPrimitiveAttr() const { return nullptr; } + typedef std::function GetPrimitiveMemoryFormatFunc; std::vector internalBlobDesc; @@ -339,6 +362,8 @@ protected: std::vector mergedWith; std::vector implPriorities; + std::string originalLayers; // contains names of the original layers separated by comma + MKLDNNNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng); int selectedPrimitiveDescriptorIndex = -1; diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp index 35a965a..d5a48aa 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -92,12 +92,8 @@ void Engine::QueryNetwork(const ICNNNetwork& network, const std::map()); return OK; } diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.h b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.h index 383feaa..6cbed84 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_primitive.cpp 
b/inference-engine/src/mkldnn_plugin/mkldnn_primitive.cpp index f9e59f2..96672cb 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_primitive.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_primitive.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_primitive.h b/inference-engine/src/mkldnn_plugin/mkldnn_primitive.h index 075afff..f960d53 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_primitive.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_primitive.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_streams.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_streams.cpp index a519837..b50552f 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_streams.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_streams.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -34,7 +34,7 @@ bool check_env_variables() { #if !(defined(__APPLE__) || defined(_WIN32)) /* Get the cores affinity mask for the current process */ bool get_process_mask(int& ncpus, cpu_set_t*& mask) { - for (ncpus = sizeof(cpu_set_t) / CHAR_BIT; ncpus < 1024 /* reasonable limit of #cores*/; ncpus <<= 1) { + for (ncpus = sizeof(cpu_set_t) / CHAR_BIT; ncpus < 32768 /* reasonable limit of #cores*/; ncpus <<= 1) { mask = CPU_ALLOC(ncpus); if (!mask) return false; @@ -61,6 +61,8 @@ bool pin_current_thread_by_mask(int ncores, const cpu_set_t* proc_mask) { /* Pin thread to a spare core in the round-robin scheme, while respecting the given process mask. * The function can also handle the hyper-threading (by populating the physical cores first) */ bool pin_thread_to_vacant_core(int thr_idx, int hyperthreads, int ncores, const cpu_set_t* proc_mask) { + if (proc_mask == nullptr) + return false; const size_t size = CPU_ALLOC_SIZE(ncores); const int num_cpus = CPU_COUNT_S(size, proc_mask); thr_idx %= num_cpus; // To limit unique number in [; num_cpus-1] range @@ -337,6 +339,7 @@ void MKLDNNPlugin::MKLDNNGraphlessInferRequest::SetBlob(const char *name, const } if (foundInput->getPreProcess().getResizeAlgorithm() != InferenceEngine::ResizeAlgorithm::NO_RESIZE) { + PreProcessData::isApplicable(data, _inputs[name]); // Stores the given blob as ROI blob. It will be used to fill in network input during pre-processing. 
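// [Illustration, not part of the commit] As in MKLDNNInferRequest::SetBlob
// above, the new PreProcessData::isApplicable() call validates at SetBlob
// time that the given ROI blob can actually be resized into the network
// input, instead of failing later inside Infer(). A client-side sketch of
// the path that reaches this branch; inputInfo, executableNetwork, roiBlob,
// and the input name "data" are hypothetical:
//
//   // before LoadNetwork: inputInfo = network.getInputsInfo();
//   inputInfo["data"]->getPreProcess().setResizeAlgorithm(
//           InferenceEngine::ResizeAlgorithm::RESIZE_BILINEAR);
//   auto request = executableNetwork.CreateInferRequest();
//   request.SetBlob("data", roiBlob);  // stored via setRoiBlob() below
//   request.Infer();                   // resized into the input during pre-processing
//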
_preProcData[name].setRoiBlob(data); } else { diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_streams.h b/inference-engine/src/mkldnn_plugin/mkldnn_streams.h index 31558fe..baa7c8d 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_streams.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_streams.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp index d23b12e..4379d8a 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -76,6 +76,16 @@ caseless_mapGetParamAsFloat("max", 1.0f); beta = activationLayer->GetParamAsFloat("min", 0.0f); algorithm = eltwise_clamp; + }}, + {"exp", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) { + alpha = 0.0f; + beta = 0.0f; + algorithm = eltwise_exp; + }}, + {"not", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) { + alpha = 0.0f; + beta = 0.0f; + algorithm = eltwise_not; }} }; @@ -107,9 +117,9 @@ void MKLDNNActivationNode::createPrimitive() { if (prim) return; - auto prim_desc = createPrimitiveDescriptor(); + auto prim_desc = createPrimitiveDescriptor(); - prim.reset(new relu_forward(prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(), + prim.reset(new eltwise_forward(prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(), getChildEdgeAt(0)->getMemory().GetPrimitive())); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h index 9dac150..3b9cc7e 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp index 173df1c..d1f777f 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -71,6 +71,8 @@ void MKLDNNBatchNormalizationNode::getSupportedDescriptors() { InferenceEngine::TBlob::Ptr internalBlob = InferenceEngine::make_shared_blob(desc); internalBlob->allocate(); float * data = internalBlob->buffer(); + if (data == nullptr) + THROW_IE_EXCEPTION << "Cannot get memory!"; InferenceEngine::Blob::Ptr blb = scshLayer->_weights; if (blb == nullptr) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h index c7d9d3e..b306f5e 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation 
// SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp new file mode 100644 index 0000000..b1e3ac2 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp @@ -0,0 +1,461 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "mkldnn_bin_conv_node.h" +#include "mkldnn_reorder_node.h" +#include "mkldnn_input_node.h" +#include "mkldnn_activation_node.h" +#include "desc_iterator.hpp" +#include "mkldnn_eltwise_node.h" +#include "mkldnn_depthwise_node.h" +#include "mkldnn_quantize_node.h" +#include "mkldnn_conv_node.h" +#include +#include +#include +#include +#include +#include + +using namespace mkldnn; +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +MKLDNNBinaryConvolutionNode::MKLDNNBinaryConvolutionNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) + : MKLDNNNode(layer, eng) { + internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc { + return MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(0).desc()); + }); +} + +void MKLDNNBinaryConvolutionNode::getSupportedDescriptors() { + if (!descs.empty()) + return; + + auto* binConvLayer = dynamic_cast(getCnnLayer().get()); + if (binConvLayer == nullptr) + THROW_IE_EXCEPTION << "Cannot convert convolution layer."; + + if (getChildEdges().empty()) + THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName(); + + if ((getParentEdgeAt(0)->getDims().ndims() < 4) || (getParentEdgeAt(0)->getDims().ndims() > 5)) { + THROW_IE_EXCEPTION << "Convolution layer. Unsupported mode. Only 4D and 5D blobs are supported as input."; + } + + isMerged = (!getMergeWith().empty()); // grouped convolution was constructed from split->concat subgraph + isGrouped = binConvLayer->_group != 1; // group info available from IR + if (isMerged && isGrouped) + THROW_IE_EXCEPTION << "Convolution initialization. Group splitted mode are used together with direct group specification."; + + // default values. 
Can be replaced in next steps + size_t groupNum = binConvLayer->_group; + pad_value = binConvLayer->_pad_value; + size_t groupIC = binConvLayer->_in_depth; + size_t groupOC = binConvLayer->_out_depth; + + isDW = groupNum == groupOC && groupNum == groupIC; + + if (isMerged) { + groupNum = getMergeWith().size() + 1; + } + if (isGrouped) { + groupIC /= groupNum; + groupOC /= groupNum; + } + + weightDims.clear(); + weightDims.push_back(groupOC); + weightDims.push_back(groupIC); + for (int i = 1; i <= binConvLayer->_kernel.size(); i++) { + weightDims.push_back(binConvLayer->_kernel[binConvLayer->_kernel.size() - i]); + } + biasesDims = { groupOC * groupNum }; + + if (isGrouped || isMerged) weightDims.insert(weightDims.begin(), groupNum); + + internalBlobs.push_back(createInternalBlob(weightDims, true)); + + Blob::Ptr weights = this->getCnnLayer()->blobs.find("weights")->second; + + invertVectorCopyUtoI(binConvLayer->_stride, stride); + for (int i = 1; i <= binConvLayer->_dilation.size(); i++) { + dilation.push_back(static_cast(binConvLayer->_dilation[binConvLayer->_dilation.size() - i]) - 1); + } + + auto allPads = getPaddings(*binConvLayer); + invertVectorCopyUtoI(allPads.begin, paddingL); + invertVectorCopyUtoI(allPads.end, paddingR); + + MKLDNNDims weightsDims = MKLDNNDims(weightDims); + + for (int i = 0; i < paddingR.size(); i++) { + int with_group = (isGrouped || isMerged) ? 1 : 0; + int krn = weightsDims[with_group + 2 + i]; + int src = getParentEdgeAt(0)->getDims()[2 + i]; + int dst = getChildEdgeAt(0)->getDims()[2 + i]; + + krn = (krn - 1)*(dilation[i] + 1) + 1; + int calc_dst = (src - krn + paddingL[i]) / stride[i] + 1; + paddingR[i] = (dst - calc_dst) * stride[i]; + } + + withSum = false; + withBinarization = false; + for (auto &node : fusedWith) { + auto* convolutionNode = dynamic_cast(node.get()); + if (convolutionNode) { + auto *convLayer = reinterpret_cast(convolutionNode->getCnnLayer().get()); + dw_conv_ih = convolutionNode->inDims[0][convolutionNode->inDims[0].ndims() - 2]; + dw_conv_iw = convolutionNode->inDims[0][convolutionNode->inDims[0].ndims() - 1]; + dw_conv_oc = convLayer->_out_depth; + for (int i = 0; i < convLayer->_kernel.size(); i++) { + dw_conv_kernel.push_back(convLayer->_kernel[i]); + } + for (int i = 0; i < convLayer->_stride.size(); i++) { + dw_conv_strides.push_back(convLayer->_stride[i]); + } + } + + auto* eltwiseNode = dynamic_cast(node.get()); + if (eltwiseNode) { + withSum = true; + } + + auto* quantizationNode = dynamic_cast(node.get()); + if (quantizationNode) { + withBinarization = true; + } + } + + if ((!withSum && getParentEdges().size() != 1) || (withSum && getParentEdges().size() != 2)) + THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName(); + + auto inputDataType = memory::bin; + auto outputDataType = withBinarization ? 
memory::bin : memory::f32; + + MKLDNNMemoryDesc in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nhwc); + MKLDNNMemoryDesc out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nhwc); + createDescriptor({in_candidate}, {out_candidate}); +} + +void MKLDNNBinaryConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeights = false) { + int blob_idx = 0; + mkldnn::post_ops ops; + + for (auto &node : fusedWith) { + auto* eltwiseNode = dynamic_cast(node.get()); + if (eltwiseNode) { + if (eltwiseNode->getCnnLayer()->precision == Precision::I8) { + auto it = eltwiseNode->getCnnLayer()->blobs.find("eltwise-sum-scale"); + if (it != eltwiseNode->getCnnLayer()->blobs.end()) { + // currently there is the only one scale while we need scale by channel :( + ops.append_sum(it->second->buffer().as()[0]); + } + } else { + ops.append_sum(1.0); + } + continue; + } + + auto* activationNode = dynamic_cast(node.get()); + if (activationNode) { + ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), + activationNode->getBeta()); + continue; + } + + auto* depthwiseNode = dynamic_cast(node.get()); + if (depthwiseNode) { + auto* depthwiseLayer = reinterpret_cast(depthwiseNode->getCnnLayer().get()); + + if (initWeights) { + MKLDNNDims depthwiseDims({static_cast(rnd_up(biasesDims[0], 16))}); + + PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); + PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x); + + PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x, + depthwiseLayer->_weights->buffer(), + depthwiseLayer->_weights->size() * + MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); + + if (depthwiseNode->isBroadcast()) { + float broadcastValue = static_cast(PostOpsIntBlobMemory[blob_idx]->GetData())[0]; + for (int i = 1; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) { + static_cast(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue; + } + } + + if (depthwiseNode->getAlgorithm() == depthwise_scale_shift) { + PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); + PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32, + memory::format::x); + PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x, + depthwiseLayer->_biases->buffer(), + depthwiseLayer->_biases->size() * + MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); + + if (depthwiseNode->isBroadcast()) { + float broadcastValue = static_cast(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0]; + for (int i = 1; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) { + static_cast(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue; + } + } + + ops.append_depthwise(depthwiseNode->getAlgorithm(), + (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(), + (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData()); + + blob_idx += 2; + } else { + ops.append_depthwise(depthwiseNode->getAlgorithm(), + (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(), + nullptr); + + blob_idx += 1; + } + } else { + ops.append_depthwise(depthwiseNode->getAlgorithm(), + nullptr, + nullptr); + } + + continue; + } + + auto* quantizeNode = dynamic_cast(node.get()); + if (quantizeNode) { + if (initWeights) { + MKLDNNDims binarizationDims({static_cast(rnd_up(biasesDims[0], 
16))}); + + PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); + PostOpsIntBlobMemory[blob_idx]->Create(binarizationDims, memory::data_type::f32, memory::format::x); + + PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x, + &binarizationThresholds[0], + binarizationThresholds.size() * + MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); + + ops.append_binarization(binarization_depthwise, (const float*)PostOpsIntBlobMemory[blob_idx]->GetData()); + + blob_idx += 1; + } else { + ops.append_binarization(binarization_depthwise, nullptr); + } + } + + auto* convolutionNode = dynamic_cast(node.get()); + if (convolutionNode) { + auto* convLayer = reinterpret_cast(convolutionNode->getCnnLayer().get()); + + if (initWeights) { + PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); + MKLDNNDims dwWeightsDims({dw_conv_oc, (ptrdiff_t)1, (ptrdiff_t)1, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS]}); + PostOpsIntBlobMemory[blob_idx]->Create(dwWeightsDims, memory::data_type::f32, + memory::format::Goihw8g); + + PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::goihw, + convLayer->_weights->buffer(), + dwWeightsDims.size() * + MKLDNNExtensionUtils::sizeOfDataType( + memory::data_type::f32)); + + PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); + MKLDNNDims dwBiasesDims({dw_conv_oc}); + PostOpsIntBlobMemory[blob_idx + 1]->Create(dwBiasesDims, memory::data_type::f32, + memory::format::x); + PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x, + convLayer->_biases->buffer(), + dwBiasesDims.size() * + MKLDNNExtensionUtils::sizeOfDataType( + memory::data_type::f32)); + ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS], + dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS], + (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(), + (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData()); + + blob_idx += 2; + } else { + ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS], + dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS], + nullptr, + nullptr); + } + for (auto &dwConvFusedNode : convolutionNode->getFusedWith()) { + auto* dwConvActivationNode = dynamic_cast(dwConvFusedNode.get()); + if (dwConvActivationNode) { + ops.append_eltwise(1.0, dwConvActivationNode->getAlgorithm(), dwConvActivationNode->getAlpha(), + dwConvActivationNode->getBeta()); + } + } + + continue; + } + } + + attr.set_post_ops(ops); +} + +void MKLDNNBinaryConvolutionNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + mkldnn::primitive_attr attr; + setPostOps(attr); + + for (auto& desc : descs) { + try { + primitive_desc_iterator itpd = desc.createPrimitiveDescriptorIterator(getEngine(), attr); + do { + InferenceEngine::LayerConfig config; + config.dynBatchSupport = true; + for (size_t i = 0; i < desc.inputNumbers(); i++) { + InferenceEngine::DataConfig dataConfig; + dataConfig.inPlace = -1; + dataConfig.constant = false; + dataConfig.desc = getSrcMemDesc(itpd, i); + if (!isGrouped) + dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(dataConfig.desc); + config.inConfs.push_back(dataConfig); + } + + for (size_t i = 0; i < desc.outputNumbers(); i++) { + InferenceEngine::DataConfig dataConfig; + if (withSum) { + dataConfig.inPlace = 1; + } + + dataConfig.constant = false; + dataConfig.desc = getDstMemDesc(itpd, i); + if (!isGrouped) + 
dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(dataConfig.desc);
+                    config.outConfs.push_back(dataConfig);
+
+                    if (withSum) {
+                        dataConfig.inPlace = -1;
+                        config.inConfs.push_back(dataConfig);
+                    }
+                }
+                impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
+
+                supportedPrimitiveDescriptors.emplace_back(config, impl_type);
+            } while (itpd.next());
+        } catch (std::exception& e) {
+            // an exception is thrown in case no implementation is found
+            continue;
+        }
+    }
+}
+
+
+void MKLDNNBinaryConvolutionNode::createPrimitive() {
+    if (prim)
+        return;
+
+    mkldnn::primitive_attr attr;
+    setPostOps(attr, true);
+
+    auto prim_desc = createPrimitiveDescriptor<binary_convolution_forward::primitive_desc, binary_convolution_forward::desc>(attr);
+
+    prim.reset(new binary_convolution_forward(prim_desc,
+                                              getParentEdgeAt(0)->getMemory().GetPrimitive(),
+                                              internalBlobMemory[0]->GetPrimitive(),
+                                              getChildEdgeAt(0)->getMemory().GetPrimitive()));
+}
+
+bool MKLDNNBinaryConvolutionNode::created() const {
+    return getType() == BinaryConvolution;
+}
+
+void MKLDNNBinaryConvolutionNode::createDescriptor(const std::vector<InferenceEngine::TensorDesc> &inputDesc,
+                                                   const std::vector<InferenceEngine::TensorDesc> &outputDesc) {
+    TensorDesc inDesc = inputDesc[0], outDesc = outputDesc[0];
+    mkldnn::memory::data_type wdt = MKLDNNExtensionUtils::IEPrecisionToDataType(inDesc.getPrecision());
+
+    MKLDNNMemoryDesc in_candidate(inDesc);
+    MKLDNNMemoryDesc out_candidate(outDesc);
+
+    // grouping and autoblocking are not compatible
+    if (((isGrouped && !isDW) || isMerged) && (in_candidate.blocksExtended() || out_candidate.blocksExtended()))
+        return;
+
+    MKLDNNDims blocked_weightDims(weightDims);
+    MKLDNNDims blocked_biasesDims(biasesDims);
+    MKLDNNMemoryDesc wgh_candidate{blocked_weightDims, wdt, memory::any};
+
+    std::shared_ptr<mkldnn::binary_convolution_forward::desc> bin_conv_desc;
+    bin_conv_desc.reset(new binary_convolution_forward::desc(prop_kind::forward_scoring, algorithm::binary_convolution_direct,
+                                                             in_candidate, wgh_candidate, out_candidate, stride, dilation,
+                                                             paddingL, paddingR, pad_value));
+
+    descs.emplace_back(bin_conv_desc);
+}
+
+void MKLDNNBinaryConvolutionNode::initDescriptor(const InferenceEngine::LayerConfig& config) {
+    auto* selectedPD = getSelectedPrimitiveDescriptor();
+    if (!selectedPD) {
+        return;
+    }
+
+    createDescriptor({config.inConfs[0].desc}, {config.outConfs[0].desc});
+
+    mkldnn::primitive_attr attr;
+    setPostOps(attr);
+
+    InferenceEngine::LayerConfig rightConfig = selectedPD->getConfig();
+    size_t selected_count = 0;
+    for (size_t i = 0; i < descs.size(); i++) {
+        const auto& desc = descs[i];
+        try {
+            primitive_desc_iterator itpd = desc.createPrimitiveDescriptorIterator(getEngine(), attr);
+            do {
+                InferenceEngine::LayerConfig cfg;
+                cfg.dynBatchSupport = true;
+                for (size_t j = 0; j < desc.inputNumbers(); j++) {
+                    InferenceEngine::DataConfig dataConfig;
+                    dataConfig.inPlace = -1;
+                    dataConfig.constant = false;
+                    dataConfig.desc = getSrcMemDesc(itpd, j);
+                    cfg.inConfs.push_back(dataConfig);
+                }
+
+                for (size_t j = 0; j < desc.outputNumbers(); j++) {
+                    InferenceEngine::DataConfig dataConfig;
+                    dataConfig.inPlace = -1;
+                    if (withSum) {
+                        cfg.inConfs.push_back(dataConfig);
+                        dataConfig.inPlace = 1;
+                    }
+                    dataConfig.constant = false;
+                    dataConfig.desc = getDstMemDesc(itpd, j);
+
+                    cfg.outConfs.push_back(dataConfig);
+                }
+                impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
+
+                if (selected_count == selectedPrimitiveDescriptorIndex) {
+                    if (impl_type != selectedPD->getImplementationType()) {
+                        THROW_IE_EXCEPTION << "Cannot get the original layer configuration!";
+                    }
+                    rightConfig = cfg;
+                }
+                if (i == descs.size() - 1) {
+                    if
(impl_type == selectedPD->getImplementationType()) { + rightConfig = config; + } + } + selected_count++; + } while (itpd.next()); + } catch (std::exception& e) { + continue; + } + } + selectedPD->getConfig() = rightConfig; +} + +void MKLDNNBinaryConvolutionNode::pushBinarizationThreshold(float value) { + binarizationThresholds.push_back(value); +} \ No newline at end of file diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.h new file mode 100644 index 0000000..659345d --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.h @@ -0,0 +1,60 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNBinaryConvolutionNode : public MKLDNNNode { +public: + MKLDNNBinaryConvolutionNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng); + ~MKLDNNBinaryConvolutionNode() override = default; + + void getSupportedDescriptors() override; + void createDescriptor(const std::vector& inputDesc, + const std::vector& outputDesc) override; + void initDescriptor(const InferenceEngine::LayerConfig& config) override; + void createPrimitive() override; + void initSupportedPrimitiveDescriptors() override; + bool created() const override; + bool canBeInPlace() const override { + return false; + } + void setPostOps(mkldnn::primitive_attr &attr, bool initWeights); + void pushBinarizationThreshold(float value); + +private: + static Register reg; + bool withSum; + bool withBinarization; + bool isDW; + bool isMerged; + bool isGrouped; + std::vector stride; + std::vector dilation; + std::vector paddingL; + std::vector paddingR; + InferenceEngine::SizeVector weightDims; + InferenceEngine::SizeVector biasesDims; + + ptrdiff_t dw_conv_oc; + ptrdiff_t dw_conv_ih; + ptrdiff_t dw_conv_iw; + std::vector dw_conv_kernel; + std::vector dw_conv_strides; + std::vector PostOpsIntBlobMemory; + + float pad_value; + + std::vector binarizationThresholds; +}; + +} // namespace MKLDNNPlugin + diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp index fd2893e..ec370ee 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -16,6 +16,7 @@ #include "mkldnn_dims.h" #include "mkldnn_edge.h" #include "mkldnn_memory.h" +#include "ie_parallel.hpp" #include using namespace mkldnn; @@ -509,3 +510,46 @@ void MKLDNNConcatNode::initOptimalPrimitiveDescriptor() { } initDescriptor(config); } + +void MKLDNNConcatNode::execute(mkldnn::stream strm) { + if (isOptimized()) { + return; + } + + const MKLDNNMemory& dst_memory = getChildEdgeAt(0)->getMemory(); + const mkldnn::memory::data_type data_type = dst_memory.GetDataType(); + + const bool isInt8 = (data_type == mkldnn_s8 || data_type == mkldnn_u8); + + if (isInt8) { + uint8_t* dst_ptr = reinterpret_cast(dst_memory.GetData()); + + const size_t num_src = getParentEdges().size(); + + std::vector channels; + size_t channels_size = 0; + std::vector src_ptrs; + std::vector dst_ptrs; + + for (size_t i = 0; i < num_src; i++) { + const MKLDNNMemory& src_mem = getParentEdgeAt(i)->getMemory(); + const size_t num_channels = 
src_mem.GetDims()[1]; + + channels.push_back(num_channels); + src_ptrs.push_back(reinterpret_cast(src_mem.GetData())); + dst_ptrs.push_back(dst_ptr + channels_size); + channels_size += num_channels; + } + + const size_t iter_count = getParentEdgeAt(0)->getMemory().GetSize() / channels[0]; + + parallel_for(iter_count, [&](int i) { + const size_t dst_off = i * channels_size; + for (int j = 0; j < num_src; j++) { + memcpy(dst_ptrs[j] + dst_off, src_ptrs[j] + i * channels[j], channels[j]); + } + }); + } else { + MKLDNNNode::execute(strm); + } +} diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h index 9aa51d7..5af4a10 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -21,6 +21,7 @@ public: void createPrimitive() override; void selectOptimalPrimitiveDescriptor() override; bool created() const override; + void execute(mkldnn::stream strm) override; bool isOptimized() const; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp index ea1aee8..18e98a7 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -21,7 +21,8 @@ using namespace MKLDNNPlugin; using namespace InferenceEngine; MKLDNNConvolutionNode::MKLDNNConvolutionNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) - : MKLDNNNode(layer, eng), withBiases(false) { + : MKLDNNNode(layer, eng), withBiases(false), withSum(false), dw_conv_iw(0), dw_conv_ih(0), + dw_conv_oc(0), isDW(false), isMerged(false), withActivation(false), convLayer(nullptr), isGrouped(false) { internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc { return MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(0).desc()); }); @@ -41,7 +42,7 @@ MKLDNNConvolutionNode::MKLDNNConvolutionNode(const InferenceEngine::CNNLayerPtr& auto ois = layer->blobs.find("oi-scale"); if ((getCnnLayer()->outData[0]->getPrecision() == Precision::I8 || getCnnLayer()->outData[0]->getPrecision() == Precision::U8) && ois == layer->blobs.end()) { - THROW_IE_EXCEPTION << "Internal error of graph quantization - missmatch of intermediate scales and next layer type for convolution " + THROW_IE_EXCEPTION << "Internal error of graph quantization - mismatch of intermediate scales and next layer type for convolution " << getCnnLayer()->name; } if (ois != layer->blobs.end()) { @@ -262,7 +263,7 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe auto* depthwiseLayer = reinterpret_cast(depthwiseNode->getCnnLayer().get()); if (initWeights) { - MKLDNNDims depthwiseDims({static_cast(rnd_up(biasesDims[0], 16))}); + MKLDNNDims depthwiseDims({static_cast(rnd_up(biasesDims[0], 16))}); PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x); @@ -320,27 +321,25 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe if 
(convolutionNode) { auto* convLayer = reinterpret_cast(convolutionNode->getCnnLayer().get()); + auto weightsPrc = MKLDNNExtensionUtils::IEPrecisionToDataType(convLayer->precision); + auto biasPrc = memory::data_type::s32; + if (initWeights) { PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); - MKLDNNDims dwWeightsDims({dw_conv_oc, 1, 1, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS]}); - PostOpsIntBlobMemory[blob_idx]->Create(dwWeightsDims, memory::data_type::f32, - memory::format::Goihw8g); + MKLDNNDims dwWeightsDims({dw_conv_oc, (ptrdiff_t)1, (ptrdiff_t)1, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS]}); + PostOpsIntBlobMemory[blob_idx]->Create(dwWeightsDims, weightsPrc, memory::format::Goihw8g); + + Blob::Ptr weights = convLayer->blobs.find("weights")->second; + Blob::Ptr biases = convLayer->blobs.find("biases")->second; - PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::goihw, - convLayer->_weights->buffer(), - dwWeightsDims.size() * - MKLDNNExtensionUtils::sizeOfDataType( - memory::data_type::f32)); + PostOpsIntBlobMemory[blob_idx]->SetData(weightsPrc, memory::goihw, weights->buffer(), + dwWeightsDims.size() * MKLDNNExtensionUtils::sizeOfDataType(weightsPrc)); PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); MKLDNNDims dwBiasesDims({dw_conv_oc}); - PostOpsIntBlobMemory[blob_idx + 1]->Create(dwBiasesDims, memory::data_type::f32, - memory::format::x); - PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x, - convLayer->_biases->buffer(), - dwBiasesDims.size() * - MKLDNNExtensionUtils::sizeOfDataType( - memory::data_type::f32)); + PostOpsIntBlobMemory[blob_idx + 1]->Create(dwBiasesDims, biasPrc, memory::format::x); + PostOpsIntBlobMemory[blob_idx + 1]->SetData(biasPrc, memory::x, biases->buffer(), + dwBiasesDims.size() * MKLDNNExtensionUtils::sizeOfDataType(biasPrc)); ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS], dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS], (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(), @@ -353,6 +352,46 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe nullptr, nullptr); } + + if (convolutionNode->wScale != nullptr) { + float* wScaleData = static_cast(convolutionNode->wScale->buffer()); + + std::vector oScaleDataVector; + std::vector oShiftDataVector; + if (convolutionNode->getCnnLayer()->precision == Precision::I8 && + convolutionNode->getCnnLayer()->outData[0]->getPrecision() != Precision::FP32) { + float *oScaleData = static_cast(convolutionNode->oScale->buffer()); + + for (size_t c = 0; c < convolutionNode->wScale->size(); c++) { + oScaleDataVector.push_back(wScaleData[c] / oScaleData[c]); + oShiftDataVector.push_back(0.f); + } + } else { + for (size_t c = 0; c < convolutionNode->wScale->size(); c++) { + oScaleDataVector.push_back(wScaleData[c]); + oShiftDataVector.push_back(0.f); + } + } + + MKLDNNDims oScaleDims({static_cast(rnd_up(biasesDims[0], 16))}); + + PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); + PostOpsIntBlobMemory[blob_idx]->Create(oScaleDims, memory::data_type::f32, memory::format::x); + PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x, &oScaleDataVector[0], + oScaleDataVector.size() * MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); + + PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); + PostOpsIntBlobMemory[blob_idx + 1]->Create(oScaleDims, 
memory::data_type::f32, memory::format::x); + PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x, &oShiftDataVector[0], + oShiftDataVector.size() * MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); + + ops.append_depthwise(depthwise_scale_shift, + (const float *)PostOpsIntBlobMemory[blob_idx]->GetData(), + (const float *)PostOpsIntBlobMemory[blob_idx + 1]->GetData()); + + blob_idx += 2; + } + for (auto &dwConvFusedNode : convolutionNode->fusedWith) { auto* dwConvActivationNode = dynamic_cast(dwConvFusedNode.get()); if (dwConvActivationNode) { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h index 19191ee..45d45e2 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -40,18 +40,18 @@ private: bool isDW; bool isMerged; bool isGrouped; - std::vector stride; - std::vector dilation; - std::vector paddingL; - std::vector paddingR; + std::vector stride; + std::vector dilation; + std::vector paddingL; + std::vector paddingR; InferenceEngine::SizeVector weightDims; InferenceEngine::SizeVector biasesDims; - int dw_conv_oc; - int dw_conv_ih; - int dw_conv_iw; - std::vector dw_conv_kernel; - std::vector dw_conv_strides; + ptrdiff_t dw_conv_oc; + ptrdiff_t dw_conv_ih; + ptrdiff_t dw_conv_iw; + std::vector dw_conv_kernel; + std::vector dw_conv_strides; std::vector PostOpsIntBlobMemory; InferenceEngine::ConvolutionLayer* convLayer; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp index 8b11c29..25fa018 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h index f74ab29..08965b9 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp index 38ca06c..497da39 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.h index e32a66a..aad12ed 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -35,10 +35,10 @@ private: bool withGroups; bool isDW; 
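[Editor's note: illustrative aside, not part of the patch.] The setPostOps hunk just above folds the fused convolution's per-channel weight scales (wScale), divided by the output scales (oScale) when the result is re-quantized to I8, into a depthwise_scale_shift post-op with an all-zero shift vector. A minimal standalone C++ sketch of that scale arithmetic, with hypothetical names:

    #include <cstddef>
    #include <vector>

    // Effective per-channel multipliers for the fused int8 convolution:
    // divide by oScale only when the output is re-quantized (not FP32).
    // The matching shift vector is all zeros in both cases.
    std::vector<float> effective_scales(const std::vector<float>& wScale,
                                        const std::vector<float>& oScale,
                                        bool requantize_output) {
        std::vector<float> scales(wScale.size());
        for (std::size_t c = 0; c < wScale.size(); c++)
            scales[c] = requantize_output ? wScale[c] / oScale[c] : wScale[c];
        return scales;
    }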
size_t groupNum = 1; - std::vector stride; - std::vector paddingL; - std::vector dilation; - std::vector paddingR; + std::vector stride; + std::vector paddingL; + std::vector dilation; + std::vector paddingR; MKLDNNDims weightsDims; static Register reg; InferenceEngine::Blob::Ptr biases; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp index 6b1097a..03e4473 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -35,6 +35,11 @@ void MKLDNNDepthwiseNode::getSupportedDescriptors() { auto parentOutDims = getParentEdgeAt(0)->getDims(); + if (getParentEdges().size() != 1) + THROW_IE_EXCEPTION << "Cannot create layer " << getName() << ": Incorrect number of inputs!"; + if (parentOutDims != getChildEdgeAt(0)->getDims()) + THROW_IE_EXCEPTION << "Cannot create layer " << getName() << ": Incorrect dimensions!"; + SizeVector weightDims = { (long unsigned int)parentOutDims[1] }; MKLDNNDims blocked_weightDims(weightDims); @@ -76,7 +81,7 @@ void MKLDNNDepthwiseNode::createPrimitive() { if (isBroadcast()) { float broadcastValue = static_cast(internalBlobMemory[0]->GetData())[0]; - int blbSize = internalBlobMemory[0]->GetPrimitiveDescriptor().desc().data.dims[0]; + size_t blbSize = internalBlobMemory[0]->GetPrimitiveDescriptor().desc().data.dims[0]; for (int i = 1; i < blbSize && realWeightSize != blbSize; i++) { static_cast(internalBlobMemory[0]->GetData())[i] = broadcastValue; } @@ -88,6 +93,15 @@ void MKLDNNDepthwiseNode::createPrimitive() { static_cast(internalBlobMemory[1]->GetData())[i] = broadcastValue; } } + } else { + size_t blbSize = internalBlobMemory[0]->GetPrimitiveDescriptor().desc().data.dims[0]; + if (realWeightSize != blbSize) + THROW_IE_EXCEPTION << "Cannot create layer " << getName() << ": Incorrect weights!"; + if (isWithBiases()) { + blbSize = internalBlobMemory[1]->GetPrimitiveDescriptor().desc().data.dims[0]; + if (realBiasSize != blbSize) + THROW_IE_EXCEPTION << "Cannot create layer " << getName() << ": Incorrect biases!"; + } } if (isWithBiases()) { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h index 16bd3a5..00b60ab 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp index 1111968..fdb5eeb 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -12,12 +12,15 @@ #include #include #include "ie_parallel.hpp" +#include using namespace mkldnn; using namespace MKLDNNPlugin; using namespace InferenceEngine; -MKLDNNEltwiseNode::MKLDNNEltwiseNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) : MKLDNNNode(layer, 
eng) {}
+MKLDNNEltwiseNode::MKLDNNEltwiseNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) : MKLDNNNode(layer, eng) {
+    op = EltwiseLayer::Sum;
+}
 
 bool MKLDNNEltwiseNode::isSum() {
     auto * eltwiseLayer = dynamic_cast<EltwiseLayer *>(getCnnLayer().get());
@@ -45,16 +48,36 @@ void MKLDNNEltwiseNode::getSupportedDescriptors() {
         THROW_IE_EXCEPTION << "Cannot convert eltwise layer.";
     op = eltwiseLayer->_operation;
 
-    if (getParentEdges().empty())
+    if (getParentEdges().size() < 2)
         THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName();
     if (getChildEdges().empty())
         THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName();
+    if (op == EltwiseLayer::Squared_diff)
+        if (getParentEdges().size() != 2)
+            THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName() << " for operation squared_diff.\n"
+                               << "Expected: 2\n" << "Actual: " << getParentEdges().size();
+
+    auto outDims = getChildEdgeAt(0)->getDims();
+    for (size_t i = 0; i < getParentEdges().size(); i++) {
+        auto inDims = getParentEdgeAt(i)->getDims();
+        for (size_t j = 1; j <= inDims.ndims(); j++) {
+            if (outDims[outDims.ndims() - j] != inDims[inDims.ndims() - j]) {
+                if (inDims[inDims.ndims() - j] == 1) {
+                    broadcast = true;
+                } else {
+                    THROW_IE_EXCEPTION << "Incorrect dimensions for broadcasting for " << eltwiseLayer->name;
+                }
+            }
+        }
+    }
-    auto outDims = getParentEdgeAt(0)->getDims();
-    for (size_t i = 1; i < getParentEdges().size(); i++) {
-        auto oDims = getParentEdgeAt(i)->getDims();
-        if (outDims.size() != oDims.size() || outDims.ndims() != oDims.ndims())
-            THROW_IE_EXCEPTION << "Dimentions of input layers are not equal for " << eltwiseLayer->name;
+    if (broadcast) {
+        auto outDims = getChildEdgeAt(0)->getDims();
+        for (size_t i = 0; i < getParentEdges().size(); i++) {
+            auto inDims = getParentEdgeAt(i)->getDims();
+            if (inDims.ndims() > 5 || outDims.ndims() > 5)
+                THROW_IE_EXCEPTION << "Eltwise node in broadcasting mode doesn't support more than 5 dims for blobs";
+        }
     }
 
     bool with_coeffs = !eltwiseLayer->coeff.empty();
@@ -64,6 +87,9 @@ void MKLDNNEltwiseNode::getSupportedDescriptors() {
     if (with_coeffs && eltwiseLayer->coeff.size() != getParentEdges().size())
         THROW_IE_EXCEPTION << "Number of provided coefficients is not equal to number of operands";
 
+    if (with_coeffs && eltwiseLayer->precision != Precision::FP32)
+        THROW_IE_EXCEPTION << "Sum with coefficients supports only FP32 precision";
+
     sum_scales.clear();
     for (int i = 0; i < getParentEdges().size(); i++)
         sum_scales.push_back(with_coeffs ? eltwiseLayer->coeff[i] : 1.0f);
@@ -73,33 +99,38 @@ void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() {
     if (!supportedPrimitiveDescriptors.empty())
         return;
 
-    auto same = [&] (mkldnn::memory::data_type inputDT, mkldnn::memory::data_type outputDT, memory::format fmt) -> PrimitiveDescInfo {
+    auto initDesc = [&] (mkldnn::memory::data_type inputDT, mkldnn::memory::data_type outputDT, memory::format format) -> PrimitiveDescInfo {
         InferenceEngine::LayerConfig config;
         config.dynBatchSupport = true;
         for (size_t i = 0; i < getParentEdges().size(); i++) {
             InferenceEngine::DataConfig dataConfig;
             dataConfig.inPlace = (!i && canBeInPlace()) ?
0 : -1; dataConfig.constant = false; - dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDT, fmt); - config.inConfs.push_back(dataConfig); + + if (getParentEdgeAt(i)->getDims().ndims() == getChildEdgeAt(0)->getDims().ndims()) { + dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDT, format); + config.inConfs.push_back(dataConfig); + } else { + // Broadcasting support + if (MKLDNNMemory::IsPlainFormat(format)) { + dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDT, MKLDNNMemory::GetPlainFormat(getParentEdgeAt(i)->getDims())); + config.inConfs.push_back(dataConfig); + } + } } InferenceEngine::DataConfig dataConfig; dataConfig.inPlace = -1; dataConfig.constant = false; - dataConfig.desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDT, fmt); + dataConfig.desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDT, format); config.outConfs.push_back(dataConfig); return {config, impl_desc_type::ref}; }; for (const auto& format : getAvailableFormatsForDims(getChildEdgeAt(0)->getDims())) { - if (getCnnLayer()->precision == Precision::FP32) { - mkldnn::memory::data_type inputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(Precision::FP32); - mkldnn::memory::data_type outputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(Precision::FP32); - supportedPrimitiveDescriptors.push_back(same(inputDT, outputDT, format)); - } else { - THROW_IE_EXCEPTION << "Invalid Eltwise layer precision: " << getCnnLayer()->name; - } + mkldnn::memory::data_type inputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(getCnnLayer()->precision); + mkldnn::memory::data_type outputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(getCnnLayer()->precision); + supportedPrimitiveDescriptors.push_back(initDesc(inputDT, outputDT, format)); } } @@ -127,10 +158,10 @@ void MKLDNNEltwiseNode::createPrimitive() { srcs_p.emplace_back(srcMemPtr->GetPrimitive()); } } - if (op == EltwiseLayer::Sum) { + if (op == EltwiseLayer::Sum && !broadcast) { try { - auto primitive_desc = sum::primitive_desc(dstMemPtr->GetDescriptor(), sum_scales, srcs_pd); - prim = std::shared_ptr(new sum(primitive_desc, srcs_p, dstMemPtr->GetPrimitive())); + auto primitive_desc = mkldnn::sum::primitive_desc(dstMemPtr->GetDescriptor(), sum_scales, srcs_pd); + prim = std::shared_ptr(new mkldnn::sum(primitive_desc, srcs_p, dstMemPtr->GetPrimitive())); } catch (...) { std::cerr << "Handle this problem correctly!" 
<< std::endl;
                prim = nullptr;
@@ -158,101 +189,1797 @@
     }
 }
 
-template <typename T0, typename T1> void MKLDNNEltwiseNode::ref_eltwise(int in0, int in1) {
-    IE_ASSERT(getParentEdges().size() > 1);
+void MKLDNNEltwiseNode::dims_calc(int *dims, const MKLDNNDims &edge_dims) {
+    for (int i = 0; i < 5; i++)
+        dims[i] = 1;
+    int ndims = edge_dims.ndims();
+    if (ndims > 5) {
+        THROW_IE_EXCEPTION << "ndims should be less than 5";
+    }
+    for (int i = 0; i < ndims; i++) {
+        dims[4 - i] = edge_dims[ndims - 1 - i];
+    }
+    dims[5 - ndims] = std::min(dims[5 - ndims], batchToProcess());
+}
 
-    auto& srcMemory0 = getParentEdgeAt(in0)->getMemory();
-    auto& srcMemory1 = getParentEdgeAt(in1)->getMemory();
-    const T0 *src0_ptr = reinterpret_cast<const T0 *>(srcMemory0.GetData()) +
-            srcMemory0.GetDescriptor().data.layout_desc.blocking.offset_padding;
-    const T1 *src1_ptr = reinterpret_cast<const T1 *>(srcMemory1.GetData()) +
-            srcMemory1.GetDescriptor().data.layout_desc.blocking.offset_padding;
-    T0 *dst_ptr = reinterpret_cast<T0 *>(getChildEdgeAt(0)->getMemory().GetData()) +
-            getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-    const size_t dst_data_size = srcMemory0.GetSize() / sizeof(T0) / srcMemory0.GetDims()[0] * batchToProcess();
+void MKLDNNEltwiseNode::offset_out_calc(int *offset, int *dims) {
+    int k = 1;
+    for (int i = 4; i >= 0; i--) {
+        offset[i] = k;
+        k *= dims[i];
+    }
+}
+
+void MKLDNNEltwiseNode::offset_in_calc(int *offset, int *dims_in, int *dims_out) {
+    int k = 1;
+    for (int i = 4; i >= 0; i--) {
+        offset[i] = (dims_in[i] == dims_out[i]) ? k : 0;
+        k *= dims_in[i];
+    }
+}
+
+// Intel C++ Compiler 18.0 for Windows contains a bug that doesn't allow using templates to generate eltwise implementations,
+// hence all the copy-paste below
+template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_add(
+        const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
+    if (!broadcast) {
+#ifdef _WIN32
+        for (size_t i = 0; i < dst_data_size; i++) {
+            dst_ptr[i] = src0_ptr[i] + src1_ptr[i];
+        }
+#else
+        parallel_for(dst_data_size, [&](size_t i) {
+            dst_ptr[i] = src0_ptr[i] + src1_ptr[i];
+        });
+#endif
+        for (int j = 2; j < getParentEdges().size(); j++) {
+            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(j)->getMemory().GetData()) +
+                    getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
+#ifdef _WIN32
+            for (size_t i = 0; i < dst_data_size; i++) {
+                dst_ptr[i] = dst_ptr[i] + src_ptr[i];
+            }
+#else
+            parallel_for(dst_data_size, [&](size_t i) {
+                dst_ptr[i] = dst_ptr[i] + src_ptr[i];
+            });
+#endif
+        }
+    } else {
+        int dims_out[5], dims_in0[5], dims_in1[5];
+        int offset_out[5], offset_in0[5], offset_in1[5];
+        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
+        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
+        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
+        dims_calc(dims_out, child_edge_dims);
+        dims_calc(dims_in0, parent0_edge_dims);
+        dims_calc(dims_in1, parent1_edge_dims);
+        offset_out_calc(offset_out, dims_out);
+        offset_in_calc(offset_in0, dims_in0, dims_out);
+        offset_in_calc(offset_in1, dims_in1, dims_out);
+
+#ifdef _WIN32
+        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
+            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
+                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
+                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
+                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
+                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
+                            size_t
index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] + src1_ptr[index_in1]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] + src1_ptr[index_in1]; + }); +#endif + for (size_t n = 2; n < getParentEdges().size(); n++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + + getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + + auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); + dims_calc(dims_in1, parent_edge_dims); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] + src_ptr[index_in]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] + src_ptr[index_in]; + }); +#endif + } + } +} - if (op == EltwiseLayer::Prod) { +template void MKLDNNEltwiseNode::eltwise_prod( + const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { + if (!broadcast) { #ifdef _WIN32 - for (int i = 0; i < dst_data_size; i++) + for (size_t i = 0; i < dst_data_size; i++) { dst_ptr[i] = src0_ptr[i] * src1_ptr[i]; + } #else - parallel_for(dst_data_size, [&](int i) { + parallel_for(dst_data_size, [&](size_t i) { dst_ptr[i] = src0_ptr[i] * src1_ptr[i]; }); #endif - for (int j = 2; j < getParentEdges().size(); j++) { - const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + + getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; #ifdef _WIN32 - for (int i = 0; i < dst_data_size; i++) + for (size_t i = 0; i < dst_data_size; i++) { dst_ptr[i] = dst_ptr[i] * src_ptr[i]; + } #else - parallel_for(dst_data_size, [&](int i) { + parallel_for(dst_data_size, [&](size_t i) { dst_ptr[i] = dst_ptr[i] * 
src_ptr[i]; }); #endif } - } else if (op == EltwiseLayer::Max) { + } else { + int dims_out[5], dims_in0[5], dims_in1[5]; + int offset_out[5], offset_in0[5], offset_in1[5]; + auto& child_edge_dims = getChildEdgeAt(0)->getDims(); + auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); + auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); + dims_calc(dims_out, child_edge_dims); + dims_calc(dims_in0, parent0_edge_dims); + dims_calc(dims_in1, parent1_edge_dims); + offset_out_calc(offset_out, dims_out); + offset_in_calc(offset_in0, dims_in0, dims_out); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] * src1_ptr[index_in1]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] * src1_ptr[index_in1]; + }); +#endif + for (size_t n = 2; n < getParentEdges().size(); n++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + + getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + + auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); + dims_calc(dims_in1, parent_edge_dims); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] * src_ptr[index_in]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] * src_ptr[index_in]; + }); +#endif + } + } +} + +template void MKLDNNEltwiseNode::eltwise_max( + const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { + if (!broadcast) { #ifdef 
_WIN32 - for (int i = 0; i < dst_data_size; i++) + for (size_t i = 0; i < dst_data_size; i++) { dst_ptr[i] = std::max(src0_ptr[i], (T0)src1_ptr[i]); + } #else - parallel_for(dst_data_size, [&](int i) { - dst_ptr[i] = std::max(src0_ptr[i], (T0) src1_ptr[i]); + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = std::max(src0_ptr[i], (T0)src1_ptr[i]); }); #endif for (int j = 2; j < getParentEdges().size(); j++) { const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; #ifdef _WIN32 - for (int i = 0; i < dst_data_size; i++) + for (size_t i = 0; i < dst_data_size; i++) { dst_ptr[i] = std::max(dst_ptr[i], (T0)src_ptr[i]); + } #else - parallel_for(dst_data_size, [&](int i) { - dst_ptr[i] = std::max(dst_ptr[i], (T0) src_ptr[i]); + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = std::max(dst_ptr[i], (T0)src_ptr[i]); }); #endif } - } else if (op == EltwiseLayer::Sum) { + } else { + int dims_out[5], dims_in0[5], dims_in1[5]; + int offset_out[5], offset_in0[5], offset_in1[5]; + auto& child_edge_dims = getChildEdgeAt(0)->getDims(); + auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); + auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); + dims_calc(dims_out, child_edge_dims); + dims_calc(dims_in0, parent0_edge_dims); + dims_calc(dims_in1, parent1_edge_dims); + offset_out_calc(offset_out, dims_out); + offset_in_calc(offset_in0, dims_in0, dims_out); + offset_in_calc(offset_in1, dims_in1, dims_out); + #ifdef _WIN32 - for (int i = 0; i < dst_data_size; i++) - dst_ptr[i] = src0_ptr[i] + src1_ptr[i]; + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = std::max(src0_ptr[index_in0], (T0)src1_ptr[index_in1]); + } + } + } + } + } #else - parallel_for(dst_data_size, [&](int i) { - dst_ptr[i] = src0_ptr[i] + src1_ptr[i]; + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = std::max(src0_ptr[index_in0], (T0)src1_ptr[index_in1]); + }); +#endif + for (size_t n = 2; n < getParentEdges().size(); n++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + + getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + + auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); + dims_calc(dims_in1, parent_edge_dims); + offset_in_calc(offset_in1, dims_in1, dims_out); + 
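[Editor's note: illustrative aside, not part of the patch.] The broadcast branches above never test shapes per element: dims_calc right-aligns each shape into a fixed 5-D array, offset_out_calc derives ordinary row-major strides for the output, and offset_in_calc forces an input's stride to 0 along every dimension it broadcasts over, so the same five-index expression addresses both inputs and the output. A small standalone example of the resulting stride arithmetic (values chosen for illustration):

    #include <cstdio>

    int main() {
        // Right-aligned 5-D shapes: output {1,1,2,2,3}, input {1,1,1,2,3}.
        int dims_out[5] = {1, 1, 2, 2, 3};
        int dims_in[5]  = {1, 1, 1, 2, 3};
        int offset_out[5], offset_in[5];

        // Row-major strides of the output (as in offset_out_calc).
        for (int i = 4, k = 1; i >= 0; i--) {
            offset_out[i] = k;
            k *= dims_out[i];
        }
        // Input strides, zeroed where the input broadcasts (as in offset_in_calc).
        for (int i = 4, k = 1; i >= 0; i--) {
            offset_in[i] = (dims_in[i] == dims_out[i]) ? k : 0;
            k *= dims_in[i];
        }
        // Prints {6,6,0,3,1}: dimension 2 broadcasts, so both output slices
        // along that axis read the same input elements.
        std::printf("{%d,%d,%d,%d,%d}\n", offset_in[0], offset_in[1],
                    offset_in[2], offset_in[3], offset_in[4]);
        return 0;
    }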
+#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = std::max(dst_ptr[index_out], (T0)src_ptr[index_in]); + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = std::max(dst_ptr[index_out], (T0)src_ptr[index_in]); + }); +#endif + } + } +} + +template void MKLDNNEltwiseNode::eltwise_sub( + const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { + if (!broadcast) { +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = src0_ptr[i] - src1_ptr[i]; + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = src0_ptr[i] - src1_ptr[i]; + }); +#endif + for (int j = 2; j < getParentEdges().size(); j++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + + getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = dst_ptr[i] - src_ptr[i]; + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = dst_ptr[i] - src_ptr[i]; + }); +#endif + } + } else { + int dims_out[5], dims_in0[5], dims_in1[5]; + int offset_out[5], offset_in0[5], offset_in1[5]; + auto& child_edge_dims = getChildEdgeAt(0)->getDims(); + auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); + auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); + dims_calc(dims_out, child_edge_dims); + dims_calc(dims_in0, parent0_edge_dims); + dims_calc(dims_in1, parent1_edge_dims); + offset_out_calc(offset_out, dims_out); + offset_in_calc(offset_in0, dims_in0, dims_out); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] - src1_ptr[index_in1]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * 
offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] - src1_ptr[index_in1]; }); #endif + for (size_t n = 2; n < getParentEdges().size(); n++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + + getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + + auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); + dims_calc(dims_in1, parent_edge_dims); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] - src_ptr[index_in]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] - src_ptr[index_in]; + }); +#endif + } + } +} +template void MKLDNNEltwiseNode::eltwise_min( + const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { + if (!broadcast) { +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = std::min(src0_ptr[i], (T0)src1_ptr[i]); + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = std::min(src0_ptr[i], (T0)src1_ptr[i]); + }); +#endif for (int j = 2; j < getParentEdges().size(); j++) { const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + - getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; #ifdef _WIN32 - for (int i = 0; i < dst_data_size; i++) - dst_ptr[i] = dst_ptr[i] + src_ptr[i]; + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = std::min(dst_ptr[i], (T0)src_ptr[i]); + } #else - parallel_for(dst_data_size, [&](int i) { - dst_ptr[i] = dst_ptr[i] + src_ptr[i]; + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = std::min(dst_ptr[i], (T0)src_ptr[i]); + }); +#endif + } + } else { + int dims_out[5], dims_in0[5], dims_in1[5]; + int offset_out[5], offset_in0[5], offset_in1[5]; + auto& child_edge_dims = getChildEdgeAt(0)->getDims(); + auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); + auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); + dims_calc(dims_out, child_edge_dims); + dims_calc(dims_in0, parent0_edge_dims); + dims_calc(dims_in1, parent1_edge_dims); + offset_out_calc(offset_out, dims_out); + offset_in_calc(offset_in0, dims_in0, dims_out); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < 
dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = std::min(src0_ptr[index_in0], (T0)src1_ptr[index_in1]); + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = std::min(src0_ptr[index_in0], (T0)src1_ptr[index_in1]); + }); +#endif + for (size_t n = 2; n < getParentEdges().size(); n++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + + getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + + auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); + dims_calc(dims_in1, parent_edge_dims); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = std::min(dst_ptr[index_out], (T0)src_ptr[index_in]); + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = std::min(dst_ptr[index_out], (T0)src_ptr[index_in]); }); #endif } } } +template void MKLDNNEltwiseNode::eltwise_div( + const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { + if (!broadcast) { +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = src0_ptr[i] / src1_ptr[i]; + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = src0_ptr[i] / src1_ptr[i]; + }); +#endif + for (int j = 2; j < getParentEdges().size(); j++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + + getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = dst_ptr[i] / src_ptr[i]; + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = dst_ptr[i] / src_ptr[i]; + }); +#endif + } + } else { + int dims_out[5], dims_in0[5], dims_in1[5]; + int offset_out[5], offset_in0[5], 
offset_in1[5]; + auto& child_edge_dims = getChildEdgeAt(0)->getDims(); + auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); + auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); + dims_calc(dims_out, child_edge_dims); + dims_calc(dims_in0, parent0_edge_dims); + dims_calc(dims_in1, parent1_edge_dims); + offset_out_calc(offset_out, dims_out); + offset_in_calc(offset_in0, dims_in0, dims_out); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] / src1_ptr[index_in1]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] / src1_ptr[index_in1]; + }); +#endif + for (size_t n = 2; n < getParentEdges().size(); n++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + + getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + + auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); + dims_calc(dims_in1, parent_edge_dims); + offset_in_calc(offset_in1, dims_in1, dims_out); -void MKLDNNEltwiseNode::execute(mkldnn::stream strm) { - if (prim) { - MKLDNNNode::execute(strm); +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] / src_ptr[index_in]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] / src_ptr[index_in]; + }); +#endif + } + } +} + +template void MKLDNNEltwiseNode::eltwise_squared_diff( + const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { + if (!broadcast) { +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; 
+
+template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_squared_diff(
+        const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
+    if (!broadcast) {
+#ifdef _WIN32
+        for (size_t i = 0; i < dst_data_size; i++) {
+            dst_ptr[i] = (src0_ptr[i] - src1_ptr[i]) * (src0_ptr[i] - src1_ptr[i]);
+        }
+#else
+        parallel_for(dst_data_size, [&](size_t i) {
+            dst_ptr[i] = (src0_ptr[i] - src1_ptr[i]) * (src0_ptr[i] - src1_ptr[i]);
+        });
+#endif
+        for (int j = 2; j < getParentEdges().size(); j++) {
+            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(j)->getMemory().GetData()) +
+                    getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
+#ifdef _WIN32
+            for (size_t i = 0; i < dst_data_size; i++) {
+                dst_ptr[i] = (dst_ptr[i] - src_ptr[i]) * (dst_ptr[i] - src_ptr[i]);
+            }
+#else
+            parallel_for(dst_data_size, [&](size_t i) {
+                dst_ptr[i] = (dst_ptr[i] - src_ptr[i]) * (dst_ptr[i] - src_ptr[i]);
+            });
+#endif
+        }
+    } else {
-        if (getParentEdges().size() > 2) {
-            // Only float supported in this case
-            for (int i = 0; i < getParentEdges().size(); i++) {
-                if (getParentEdgeAt(i)->getDesc().getPrecision() != Precision::FP32) {
-                    THROW_IE_EXCEPTION << "If ref eltwise has more than 2 inputs, only FP32 inputs are supported";
+        int dims_out[5], dims_in0[5], dims_in1[5];
+        int offset_out[5], offset_in0[5], offset_in1[5];
+        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
+        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
+        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
+        dims_calc(dims_out, child_edge_dims);
+        dims_calc(dims_in0, parent0_edge_dims);
+        dims_calc(dims_in1, parent1_edge_dims);
+        offset_out_calc(offset_out, dims_out);
+        offset_in_calc(offset_in0, dims_in0, dims_out);
+        offset_in_calc(offset_in1, dims_in1, dims_out);
+
+#ifdef _WIN32
+        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
+            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
+                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
+                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
+                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
+                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
+                            size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
+                            size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
+                            dst_ptr[index_out] = (src0_ptr[index_in0] - src1_ptr[index_in1]) * (src0_ptr[index_in0] - src1_ptr[index_in1]);
+                        }
+                    }
+                }
+            }
+        }
+#else
+        parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) {
+            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
+            size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
+            size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
+            dst_ptr[index_out] = (src0_ptr[index_in0] - src1_ptr[index_in1]) * (src0_ptr[index_in0] - src1_ptr[index_in1]);
+        });
+#endif
+        for (size_t n = 2; n < getParentEdges().size(); n++) {
+            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
+                    getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
-        ref_eltwise(0, 1);
+            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
+            dims_calc(dims_in1, parent_edge_dims);
+            offset_in_calc(offset_in1, dims_in1, dims_out);
+
+#ifdef _WIN32
+            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
+                for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
+                    for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
+                        for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
+                            for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
+                                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
+                                size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
+                                dst_ptr[index_out] = (dst_ptr[index_out] - src_ptr[index_in]) * (dst_ptr[index_out] - src_ptr[index_in]);
+                            }
+                        }
+                    }
+                }
+            }
+#else
+            parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) {
+                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
+                size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
+                dst_ptr[index_out] = (dst_ptr[index_out] - src_ptr[index_in]) * (dst_ptr[index_out] - src_ptr[index_in]);
+            });
+#endif
+        }
+    }
+}
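Note the n-ary semantics here: with more than two inputs the kernel folds pairwise, so the third input is compared against an already-squared accumulator and the result is ((a-b)^2 - c)^2 rather than a sum of pairwise squared differences. A tiny self-contained check of that folding order (hypothetical helper, not plugin code):

    #include <cassert>

    static float sq_diff(float a, float b) { return (a - b) * (a - b); }

    int main() {
        float a = 3.f, b = 1.f, c = 2.f;
        float acc = sq_diff(a, b);  // (3 - 1)^2 = 4
        acc = sq_diff(acc, c);      // (4 - 2)^2 = 4, i.e. ((a - b)^2 - c)^2
        assert(acc == 4.f);
        return 0;
    }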
+
+template <typename T0, typename T1> void MKLDNNEltwiseNode::eltwise_floor_mod(
+        const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) {
+    if (!broadcast) {
+#ifdef _WIN32
+        for (size_t i = 0; i < dst_data_size; i++) {
+            dst_ptr[i] = src0_ptr[i] - src0_ptr[i] / src1_ptr[i] * src1_ptr[i];
+        }
+#else
+        parallel_for(dst_data_size, [&](size_t i) {
+            dst_ptr[i] = src0_ptr[i] - src0_ptr[i] / src1_ptr[i] * src1_ptr[i];
+        });
+#endif
+        for (int j = 2; j < getParentEdges().size(); j++) {
+            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(j)->getMemory().GetData()) +
+                    getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
+#ifdef _WIN32
+            for (size_t i = 0; i < dst_data_size; i++) {
+                dst_ptr[i] = dst_ptr[i] - dst_ptr[i] / src_ptr[i] * src_ptr[i];
+            }
+#else
+            parallel_for(dst_data_size, [&](size_t i) {
+                dst_ptr[i] = dst_ptr[i] - dst_ptr[i] / src_ptr[i] * src_ptr[i];
+            });
+#endif
+        }
+    } else {
+        int dims_out[5], dims_in0[5], dims_in1[5];
+        int offset_out[5], offset_in0[5], offset_in1[5];
+        auto& child_edge_dims = getChildEdgeAt(0)->getDims();
+        auto& parent0_edge_dims = getParentEdgeAt(0)->getDims();
+        auto& parent1_edge_dims = getParentEdgeAt(1)->getDims();
+        dims_calc(dims_out, child_edge_dims);
+        dims_calc(dims_in0, parent0_edge_dims);
+        dims_calc(dims_in1, parent1_edge_dims);
+        offset_out_calc(offset_out, dims_out);
+        offset_in_calc(offset_in0, dims_in0, dims_out);
+        offset_in_calc(offset_in1, dims_in1, dims_out);
+
+#ifdef _WIN32
+        for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
+            for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
+                for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
+                    for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
+                        for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
+                            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
+                            size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
+                            size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
+                            // fixed: the in0/in1 subscripts were swapped in the original patch
+                            dst_ptr[index_out] = src0_ptr[index_in0] - src0_ptr[index_in0] / src1_ptr[index_in1] * src1_ptr[index_in1];
+                        }
+                    }
+                }
+            }
+        }
+#else
+        parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) {
+            size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
+            size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4];
+            size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
+            dst_ptr[index_out] = src0_ptr[index_in0] - src0_ptr[index_in0] / src1_ptr[index_in1] * src1_ptr[index_in1];
+        });
+#endif
+        for (size_t n = 2; n < getParentEdges().size(); n++) {
+            const T1 *src_ptr = reinterpret_cast<const T1 *>(getParentEdgeAt(n)->getMemory().GetData()) +
+                    getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
+
+            auto& parent_edge_dims = getParentEdgeAt(n)->getDims();
+            dims_calc(dims_in1, parent_edge_dims);
+            offset_in_calc(offset_in1, dims_in1, dims_out);
+
+#ifdef _WIN32
+            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
+                for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
+                    for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
+                        for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
+                            for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
+                                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
+                                size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
+                                // fixed: the original mixed index_out/index_in between dst and src
+                                dst_ptr[index_out] = dst_ptr[index_out] - dst_ptr[index_out] / src_ptr[index_in] * src_ptr[index_in];
+                            }
+                        }
+                    }
+                }
+            }
+#else
+            parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) {
+                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
+                size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
+                dst_ptr[index_out] = dst_ptr[index_out] - dst_ptr[index_out] / src_ptr[index_in] * src_ptr[index_in];
+            });
+#endif
+        }
+    }
+}
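The identity a - a/b*b used above relies on C++ integer division truncating toward zero, so it yields the truncated remainder (sign of the dividend), not a true floored modulo; the two differ whenever a and b have opposite signs. That, plus the fact that the same expression collapses to roughly zero under floating-point division, is why execute() below restricts Floor_mod to I32. A comparison sketch:

    #include <cstdio>

    static int trunc_mod(int a, int b) { return a - a / b * b; }  // what the kernel computes
    static int floor_mod(int a, int b) {                          // true floored modulo
        int r = a % b;
        return (r != 0 && (r ^ b) < 0) ? r + b : r;               // opposite signs: shift by b
    }

    int main() {
        std::printf("%d %d\n", trunc_mod(-7, 3), floor_mod(-7, 3));  // prints: -1 2
        return 0;
    }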
offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] - src0_ptr[index_in1] / src1_ptr[index_in0] * src1_ptr[index_in1]; + }); +#endif + for (size_t n = 2; n < getParentEdges().size(); n++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + + getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + + auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); + dims_calc(dims_in1, parent_edge_dims); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] - dst_ptr[index_in] / src_ptr[index_out] * src_ptr[index_in]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] - dst_ptr[index_in] / src_ptr[index_out] * src_ptr[index_in]; + }); +#endif + } + } +} + +template void MKLDNNEltwiseNode::eltwise_pow( + const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { + if (!broadcast) { +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = std::pow(src0_ptr[i], src1_ptr[i]); + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = std::pow(src0_ptr[i], src1_ptr[i]); + }); +#endif + for (int j = 2; j < getParentEdges().size(); j++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + + getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = std::pow(dst_ptr[i], src_ptr[i]); + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = std::pow(dst_ptr[i], src_ptr[i]); + }); +#endif + } + } else { + int dims_out[5], dims_in0[5], dims_in1[5]; + int offset_out[5], offset_in0[5], offset_in1[5]; + auto& child_edge_dims = getChildEdgeAt(0)->getDims(); + auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); + auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); + dims_calc(dims_out, child_edge_dims); + dims_calc(dims_in0, parent0_edge_dims); + dims_calc(dims_in1, parent1_edge_dims); + offset_out_calc(offset_out, dims_out); + offset_in_calc(offset_in0, dims_in0, dims_out); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + 
size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = std::pow(src0_ptr[index_in0], src1_ptr[index_in1]); + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = std::pow(src0_ptr[index_in0], src1_ptr[index_in1]); + }); +#endif + for (size_t n = 2; n < getParentEdges().size(); n++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + + getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + + auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); + dims_calc(dims_in1, parent_edge_dims); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = std::pow(dst_ptr[index_out], src_ptr[index_in]); + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = std::pow(dst_ptr[index_out], src_ptr[index_in]); + }); +#endif + } + } +} + +template void MKLDNNEltwiseNode::eltwise_equal( + const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { + if (!broadcast) { +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = src0_ptr[i] == src1_ptr[i]; + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = src0_ptr[i] == src1_ptr[i]; + }); +#endif + for (int j = 2; j < getParentEdges().size(); j++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + + getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = dst_ptr[i] == src_ptr[i]; + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = dst_ptr[i] == src_ptr[i]; + }); +#endif + } + } else { + int dims_out[5], dims_in0[5], dims_in1[5]; + int offset_out[5], offset_in0[5], offset_in1[5]; + auto& child_edge_dims = getChildEdgeAt(0)->getDims(); + auto& 
parent0_edge_dims = getParentEdgeAt(0)->getDims(); + auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); + dims_calc(dims_out, child_edge_dims); + dims_calc(dims_in0, parent0_edge_dims); + dims_calc(dims_in1, parent1_edge_dims); + offset_out_calc(offset_out, dims_out); + offset_in_calc(offset_in0, dims_in0, dims_out); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] == src1_ptr[index_in1]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] == src1_ptr[index_in1]; + }); +#endif + for (size_t n = 2; n < getParentEdges().size(); n++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + + getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + + auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); + dims_calc(dims_in1, parent_edge_dims); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] == src_ptr[index_in]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] == src_ptr[index_in]; + }); +#endif + } + } +} + +template void MKLDNNEltwiseNode::eltwise_not_equal( + const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { + if (!broadcast) { +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = src0_ptr[i] != src1_ptr[i]; + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = src0_ptr[i] != src1_ptr[i]; + }); +#endif + for (int j 
= 2; j < getParentEdges().size(); j++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + + getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = dst_ptr[i] != src_ptr[i]; + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = dst_ptr[i] != src_ptr[i]; + }); +#endif + } + } else { + int dims_out[5], dims_in0[5], dims_in1[5]; + int offset_out[5], offset_in0[5], offset_in1[5]; + auto& child_edge_dims = getChildEdgeAt(0)->getDims(); + auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); + auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); + dims_calc(dims_out, child_edge_dims); + dims_calc(dims_in0, parent0_edge_dims); + dims_calc(dims_in1, parent1_edge_dims); + offset_out_calc(offset_out, dims_out); + offset_in_calc(offset_in0, dims_in0, dims_out); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] != src1_ptr[index_in1]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] != src1_ptr[index_in1]; + }); +#endif + for (size_t n = 2; n < getParentEdges().size(); n++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + + getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + + auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); + dims_calc(dims_in1, parent_edge_dims); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] != src_ptr[index_in]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * 
offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] != src_ptr[index_in]; + }); +#endif + } + } +} + +template void MKLDNNEltwiseNode::eltwise_less( + const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { + if (!broadcast) { +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = src0_ptr[i] < src1_ptr[i]; + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = src0_ptr[i] < src1_ptr[i]; + }); +#endif + for (int j = 2; j < getParentEdges().size(); j++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + + getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = dst_ptr[i] < src_ptr[i]; + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = dst_ptr[i] < src_ptr[i]; + }); +#endif + } + } else { + int dims_out[5], dims_in0[5], dims_in1[5]; + int offset_out[5], offset_in0[5], offset_in1[5]; + auto& child_edge_dims = getChildEdgeAt(0)->getDims(); + auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); + auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); + dims_calc(dims_out, child_edge_dims); + dims_calc(dims_in0, parent0_edge_dims); + dims_calc(dims_in1, parent1_edge_dims); + offset_out_calc(offset_out, dims_out); + offset_in_calc(offset_in0, dims_in0, dims_out); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] < src1_ptr[index_in1]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] < src1_ptr[index_in1]; + }); +#endif + for (size_t n = 2; n < getParentEdges().size(); n++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + + getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + + auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); + dims_calc(dims_in1, parent_edge_dims); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) 
{ + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] < src_ptr[index_in]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] < src_ptr[index_in]; + }); +#endif + } + } +} + +template void MKLDNNEltwiseNode::eltwise_less_equal( + const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { + if (!broadcast) { +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = src0_ptr[i] <= src1_ptr[i]; + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = src0_ptr[i] <= src1_ptr[i]; + }); +#endif + for (int j = 2; j < getParentEdges().size(); j++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + + getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = dst_ptr[i] <= src_ptr[i]; + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = dst_ptr[i] <= src_ptr[i]; + }); +#endif + } + } else { + int dims_out[5], dims_in0[5], dims_in1[5]; + int offset_out[5], offset_in0[5], offset_in1[5]; + auto& child_edge_dims = getChildEdgeAt(0)->getDims(); + auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); + auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); + dims_calc(dims_out, child_edge_dims); + dims_calc(dims_in0, parent0_edge_dims); + dims_calc(dims_in1, parent1_edge_dims); + offset_out_calc(offset_out, dims_out); + offset_in_calc(offset_in0, dims_in0, dims_out); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] <= src1_ptr[index_in1]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] <= src1_ptr[index_in1]; + }); +#endif + for (size_t n = 2; n < 
getParentEdges().size(); n++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + + getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + + auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); + dims_calc(dims_in1, parent_edge_dims); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] <= src_ptr[index_in]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] <= src_ptr[index_in]; + }); +#endif + } + } +} + +template void MKLDNNEltwiseNode::eltwise_greater( + const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { + if (!broadcast) { +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = src0_ptr[i] > src1_ptr[i]; + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = src0_ptr[i] > src1_ptr[i]; + }); +#endif + for (int j = 2; j < getParentEdges().size(); j++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + + getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = dst_ptr[i] > src_ptr[i]; + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = dst_ptr[i] > src_ptr[i]; + }); +#endif + } + } else { + int dims_out[5], dims_in0[5], dims_in1[5]; + int offset_out[5], offset_in0[5], offset_in1[5]; + auto& child_edge_dims = getChildEdgeAt(0)->getDims(); + auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); + auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); + dims_calc(dims_out, child_edge_dims); + dims_calc(dims_in0, parent0_edge_dims); + dims_calc(dims_in1, parent1_edge_dims); + offset_out_calc(offset_out, dims_out); + offset_in_calc(offset_in0, dims_in0, dims_out); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] > src1_ptr[index_in1]; + } + } + } + } + } +#else + 
parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] > src1_ptr[index_in1]; + }); +#endif + for (size_t n = 2; n < getParentEdges().size(); n++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + + getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + + auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); + dims_calc(dims_in1, parent_edge_dims); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] > src_ptr[index_in]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] > src_ptr[index_in]; + }); +#endif + } + } +} + +template void MKLDNNEltwiseNode::eltwise_greater_equal( + const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { + if (!broadcast) { +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = src0_ptr[i] >= src1_ptr[i]; + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = src0_ptr[i] >= src1_ptr[i]; + }); +#endif + for (int j = 2; j < getParentEdges().size(); j++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + + getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = dst_ptr[i] >= src_ptr[i]; + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = dst_ptr[i] >= src_ptr[i]; + }); +#endif + } + } else { + int dims_out[5], dims_in0[5], dims_in1[5]; + int offset_out[5], offset_in0[5], offset_in1[5]; + auto& child_edge_dims = getChildEdgeAt(0)->getDims(); + auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); + auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); + dims_calc(dims_out, child_edge_dims); + dims_calc(dims_in0, parent0_edge_dims); + dims_calc(dims_in1, parent1_edge_dims); + offset_out_calc(offset_out, dims_out); + offset_in_calc(offset_in0, dims_in0, dims_out); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for 
(size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] >= src1_ptr[index_in1]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] >= src1_ptr[index_in1]; + }); +#endif + for (size_t n = 2; n < getParentEdges().size(); n++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + + getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + + auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); + dims_calc(dims_in1, parent_edge_dims); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] >= src_ptr[index_in]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] >= src_ptr[index_in]; + }); +#endif + } + } +} + +template void MKLDNNEltwiseNode::eltwise_logical_and( + const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { + if (!broadcast) { +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = src0_ptr[i] && src1_ptr[i]; + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = src0_ptr[i] && src1_ptr[i]; + }); +#endif + for (int j = 2; j < getParentEdges().size(); j++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + + getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = dst_ptr[i] && src_ptr[i]; + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = dst_ptr[i] && src_ptr[i]; + }); +#endif + } + } else { + int dims_out[5], dims_in0[5], dims_in1[5]; 
+ int offset_out[5], offset_in0[5], offset_in1[5]; + auto& child_edge_dims = getChildEdgeAt(0)->getDims(); + auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); + auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); + dims_calc(dims_out, child_edge_dims); + dims_calc(dims_in0, parent0_edge_dims); + dims_calc(dims_in1, parent1_edge_dims); + offset_out_calc(offset_out, dims_out); + offset_in_calc(offset_in0, dims_in0, dims_out); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] && src1_ptr[index_in1]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] && src1_ptr[index_in1]; + }); +#endif + for (size_t n = 2; n < getParentEdges().size(); n++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + + getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + + auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); + dims_calc(dims_in1, parent_edge_dims); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] && src_ptr[index_in]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] && src_ptr[index_in]; + }); +#endif + } + } +} + +template void MKLDNNEltwiseNode::eltwise_logical_or( + const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { + if (!broadcast) { +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = src0_ptr[i] || src1_ptr[i]; + } +#else 
+ parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = src0_ptr[i] || src1_ptr[i]; + }); +#endif + for (int j = 2; j < getParentEdges().size(); j++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + + getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = dst_ptr[i] || src_ptr[i]; + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = dst_ptr[i] || src_ptr[i]; + }); +#endif + } + } else { + int dims_out[5], dims_in0[5], dims_in1[5]; + int offset_out[5], offset_in0[5], offset_in1[5]; + auto& child_edge_dims = getChildEdgeAt(0)->getDims(); + auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); + auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); + dims_calc(dims_out, child_edge_dims); + dims_calc(dims_in0, parent0_edge_dims); + dims_calc(dims_in1, parent1_edge_dims); + offset_out_calc(offset_out, dims_out); + offset_in_calc(offset_in0, dims_in0, dims_out); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] || src1_ptr[index_in1]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = src0_ptr[index_in0] || src1_ptr[index_in1]; + }); +#endif + for (size_t n = 2; n < getParentEdges().size(); n++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + + getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + + auto& parent_edge_dims = getParentEdgeAt(n)->getDims(); + dims_calc(dims_in1, parent_edge_dims); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] || src_ptr[index_in]; + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + 
size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = dst_ptr[index_out] || src_ptr[index_in]; + }); +#endif + } + } +} + +template void MKLDNNEltwiseNode::eltwise_logical_xor( + const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, const size_t dst_data_size) { + if (!broadcast) { +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = (src0_ptr[i] || src1_ptr[i]) - (src0_ptr[i] && src1_ptr[i]); + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = (src0_ptr[i] || src1_ptr[i]) - (src0_ptr[i] && src1_ptr[i]); + }); +#endif + for (int j = 2; j < getParentEdges().size(); j++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(j)->getMemory().GetData()) + + getParentEdgeAt(j)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; +#ifdef _WIN32 + for (size_t i = 0; i < dst_data_size; i++) { + dst_ptr[i] = (dst_ptr[i] || src_ptr[i]) - (dst_ptr[i] && src_ptr[i]); + } +#else + parallel_for(dst_data_size, [&](size_t i) { + dst_ptr[i] = (dst_ptr[i] || src_ptr[i]) - (dst_ptr[i] && src_ptr[i]); + }); +#endif + } + } else { + int dims_out[5], dims_in0[5], dims_in1[5]; + int offset_out[5], offset_in0[5], offset_in1[5]; + auto& child_edge_dims = getChildEdgeAt(0)->getDims(); + auto& parent0_edge_dims = getParentEdgeAt(0)->getDims(); + auto& parent1_edge_dims = getParentEdgeAt(1)->getDims(); + dims_calc(dims_out, child_edge_dims); + dims_calc(dims_in0, parent0_edge_dims); + dims_calc(dims_in1, parent1_edge_dims); + offset_out_calc(offset_out, dims_out); + offset_in_calc(offset_in0, dims_in0, dims_out); + offset_in_calc(offset_in1, dims_in1, dims_out); + +#ifdef _WIN32 + for (size_t i0 = 0; i0 < dims_out[0]; i0++) { + for (size_t i1 = 0; i1 < dims_out[1]; i1++) { + for (size_t i2 = 0; i2 < dims_out[2]; i2++) { + for (size_t i3 = 0; i3 < dims_out[3]; i3++) { + for (size_t i4 = 0; i4 < dims_out[4]; i4++) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = (src0_ptr[index_in0] || src1_ptr[index_in1]) - (src0_ptr[index_in0] && src1_ptr[index_in1]); + } + } + } + } + } +#else + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4]; + size_t index_in0 = i0 * offset_in0[0] + i1 * offset_in0[1] + i2 * offset_in0[2] + i3 * offset_in0[3] + i4 * offset_in0[4]; + size_t index_in1 = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4]; + dst_ptr[index_out] = (src0_ptr[index_in0] || src1_ptr[index_in1]) - (src0_ptr[index_in0] && src1_ptr[index_in1]); + }); +#endif + for (size_t n = 2; n < getParentEdges().size(); n++) { + const T1 *src_ptr = reinterpret_cast(getParentEdgeAt(n)->getMemory().GetData()) + + getParentEdgeAt(n)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + + auto& parent_edge_dims = 
getParentEdgeAt(n)->getDims();
+            dims_calc(dims_in1, parent_edge_dims);
+            offset_in_calc(offset_in1, dims_in1, dims_out);
+
+#ifdef _WIN32
+            for (size_t i0 = 0; i0 < dims_out[0]; i0++) {
+                for (size_t i1 = 0; i1 < dims_out[1]; i1++) {
+                    for (size_t i2 = 0; i2 < dims_out[2]; i2++) {
+                        for (size_t i3 = 0; i3 < dims_out[3]; i3++) {
+                            for (size_t i4 = 0; i4 < dims_out[4]; i4++) {
+                                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
+                                size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
+                                dst_ptr[index_out] = (dst_ptr[index_out] || src_ptr[index_in]) - (dst_ptr[index_out] && src_ptr[index_in]);
+                            }
+                        }
+                    }
+                }
+            }
+#else
+            parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) {
+                size_t index_out = i0 * offset_out[0] + i1 * offset_out[1] + i2 * offset_out[2] + i3 * offset_out[3] + i4 * offset_out[4];
+                size_t index_in = i0 * offset_in1[0] + i1 * offset_in1[1] + i2 * offset_in1[2] + i3 * offset_in1[3] + i4 * offset_in1[4];
+                dst_ptr[index_out] = (dst_ptr[index_out] || src_ptr[index_in]) - (dst_ptr[index_out] && src_ptr[index_in]);
+            });
+#endif
+        }
+    }
+}
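One subtlety shared by the comparison and logical kernels (eltwise_equal through eltwise_logical_xor): the result is written into the first input's element type T0, so float inputs yield 0.0f/1.0f, and with more than two inputs those 0/1 values themselves become operands of the next fold, e.g. the chained result is (a == b) == c. A small illustration:

    #include <cassert>

    int main() {
        float a = 2.5f, b = 2.5f, c = 1.0f;
        float eq = (a == b);       // comparisons materialize as 0.0f / 1.0f
        float folded = (eq == c);  // chaining compares the 0/1 result: (a == b) == c
        assert(eq == 1.0f && folded == 1.0f);
        return 0;
    }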
+
+template <typename T0, typename T1> void MKLDNNEltwiseNode::ref_eltwise(int in0, int in1) {
+    IE_ASSERT(getParentEdges().size() > 1);
+
+    auto& srcMemory0 = getParentEdgeAt(in0)->getMemory();
+    auto& srcMemory1 = getParentEdgeAt(in1)->getMemory();
+    const T0 *src0_ptr = reinterpret_cast<const T0 *>(srcMemory0.GetData()) +
+            srcMemory0.GetDescriptor().data.layout_desc.blocking.offset_padding;
+    const T1 *src1_ptr = reinterpret_cast<const T1 *>(srcMemory1.GetData()) +
+            srcMemory1.GetDescriptor().data.layout_desc.blocking.offset_padding;
+    T0 *dst_ptr = reinterpret_cast<T0 *>(getChildEdgeAt(0)->getMemory().GetData()) +
+            getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
+
+    const size_t dst_data_size = srcMemory0.GetSize() / sizeof(T0) / srcMemory0.GetDims()[0] * batchToProcess();
+
+    switch (op) {
+        case EltwiseLayer::eOperation::Sum: eltwise_add(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
+        case EltwiseLayer::eOperation::Prod: eltwise_prod(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
+        case EltwiseLayer::eOperation::Max: eltwise_max(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
+        case EltwiseLayer::eOperation::Sub: eltwise_sub(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
+        case EltwiseLayer::eOperation::Min: eltwise_min(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
+        case EltwiseLayer::eOperation::Div: eltwise_div(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
+        case EltwiseLayer::eOperation::Squared_diff: eltwise_squared_diff(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
+        case EltwiseLayer::eOperation::Floor_mod: eltwise_floor_mod(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
+        case EltwiseLayer::eOperation::Pow: eltwise_pow(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
+        case EltwiseLayer::eOperation::Equal: eltwise_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
+        case EltwiseLayer::eOperation::Not_equal: eltwise_not_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
+        case EltwiseLayer::eOperation::Less: eltwise_less(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
+        case EltwiseLayer::eOperation::Less_equal: eltwise_less_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
+        case EltwiseLayer::eOperation::Greater: eltwise_greater(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
+        case EltwiseLayer::eOperation::Greater_equal: eltwise_greater_equal(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
+        case EltwiseLayer::eOperation::Logical_AND: eltwise_logical_and(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
+        case EltwiseLayer::eOperation::Logical_OR: eltwise_logical_or(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
+        case EltwiseLayer::eOperation::Logical_XOR: eltwise_logical_xor(src0_ptr, src1_ptr, dst_ptr, dst_data_size); break;
+        default: THROW_IE_EXCEPTION << "Unsupported operation type for Eltwise node";
+    }
+}
+
+void MKLDNNEltwiseNode::execute(mkldnn::stream strm) {
+    if (prim) {
+        MKLDNNNode::execute(strm);
+    } else {
+        if (op == EltwiseLayer::Floor_mod) {
+            for (size_t i = 0; i < getParentEdges().size(); i++)
+                if (getParentEdgeAt(i)->getDesc().getPrecision() != Precision::I32)
+                    THROW_IE_EXCEPTION << "Floor_mod supports only I32 precision of inputs";
+            if (getChildEdgeAt(0)->getDesc().getPrecision() != Precision::I32)
+                THROW_IE_EXCEPTION << "Floor_mod supports only I32 precision of output";
+        }
+        if (getParentEdges().size() > 2) {
+            Precision pi = getParentEdgeAt(0)->getDesc().getPrecision();
+            Precision po = getChildEdgeAt(0)->getDesc().getPrecision();
+            for (int i = 1; i < getParentEdges().size(); i++) {
+                if (getParentEdgeAt(i)->getDesc().getPrecision() != pi)
+                    THROW_IE_EXCEPTION << "If Eltwise node has more than 2 inputs, all inputs must have same precision";
+            }
+            if (pi != po) {
+                THROW_IE_EXCEPTION << "If Eltwise node has more than 2 inputs, all inputs and output must have same precision";
+            }
+            if (pi == Precision::FP32)
+                ref_eltwise<float, float>(0, 1);
+            else if (pi == Precision::I32)
+                ref_eltwise<int32_t, int32_t>(0, 1);
+            else if (pi == Precision::I8)
+                ref_eltwise<int8_t, int8_t>(0, 1);
+            else if (pi == Precision::U8)
+                ref_eltwise<uint8_t, uint8_t>(0, 1);
+            else
+                THROW_IE_EXCEPTION << "If Eltwise node has more than 2 inputs, only FP32, I32, I8, U8 are supported";
+            return;
+        }
@@ -278,6 +2005,8 @@ void MKLDNNEltwiseNode::execute(mkldnn::stream strm) {
             ref_eltwise<float, int8_t>(0, 1);
         } else if (po == Precision::I8 && pi1 == po && pi0 == Precision::U8) {
             ref_eltwise<int8_t, uint8_t>(1, 0);
+        } else if (po == Precision::I32 && pi0 == po && pi1 == po) {
+            ref_eltwise<int32_t, int32_t>(0, 1);
         }
     }
 }
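All eighteen kernels above hand-inline the same control flow: compute dst = op(in0, in1), then fold each remaining parent edge into dst. Condensed to a functor (an editorial sketch, not the plugin's code; broadcast handling omitted), the shared scheme is:

    #include <cstddef>
    #include <vector>

    // Pairwise left fold: dst = op(...op(op(in[0], in[1]), in[2])..., in[n-1]).
    template <typename T, typename Op>
    void fold_eltwise(const std::vector<const T*>& in, T* dst, size_t len, Op op) {
        for (size_t i = 0; i < len; i++)
            dst[i] = op(in[0][i], in[1][i]);      // first input pair
        for (size_t k = 2; k < in.size(); k++)    // remaining inputs fold into dst
            for (size_t i = 0; i < len; i++)
                dst[i] = op(dst[i], in[k][i]);
    }

Written this way, the per-operation methods would collapse to one template; the patch instead repeats the loop nests verbatim for every operation.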
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h
index 0395cd4..2a6e3f5 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -31,8 +31,31 @@ private:
     static Register<MKLDNNEltwiseNode> reg;
     InferenceEngine::EltwiseLayer::eOperation op;
    std::vector<float> sum_scales;
+    bool broadcast = false;
 
     template <typename T0, typename T1> void ref_eltwise(int in0, int in1);
+    void dims_calc(int *dims, const MKLDNNDims &edge_dims);
+    void offset_out_calc(int *offset, int *dims);
+    void offset_in_calc(int *offset, int *dims_in, int *dims_out);
+
+    template <typename T0, typename T1> void eltwise_add(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
+    template <typename T0, typename T1> void eltwise_prod(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
+    template <typename T0, typename T1> void eltwise_max(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
+    template <typename T0, typename T1> void eltwise_sub(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
+    template <typename T0, typename T1> void eltwise_min(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
+    template <typename T0, typename T1> void eltwise_div(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
+    template <typename T0, typename T1> void eltwise_squared_diff(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
+    template <typename T0, typename T1> void eltwise_floor_mod(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
+    template <typename T0, typename T1> void eltwise_pow(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
+    template <typename T0, typename T1> void eltwise_equal(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
+    template <typename T0, typename T1> void eltwise_not_equal(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
+    template <typename T0, typename T1> void eltwise_less(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
+    template <typename T0, typename T1> void eltwise_less_equal(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
+    template <typename T0, typename T1> void eltwise_greater(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
+    template <typename T0, typename T1> void eltwise_greater_equal(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
+    template <typename T0, typename T1> void eltwise_logical_and(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
+    template <typename T0, typename T1> void eltwise_logical_or(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
+    template <typename T0, typename T1> void eltwise_logical_xor(const T0 *src0_ptr, const T1 *src1_ptr, T0 *dst_ptr, size_t dst_data_size);
 };
 
 }  // namespace MKLDNNPlugin
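The FullyConnected changes below fold per-channel quantization scales and fused activations into an mkldnn primitive_attr. For orientation, the post-ops mechanism of the mkldnn 0.x API used in this release attaches an element-wise op that runs inside the primitive; a sketch under that API assumption:

    #include <mkldnn.hpp>

    int main() {
        mkldnn::post_ops ops;
        // Fuse a ReLU into the primitive; alpha/beta are unused for plain ReLU.
        ops.append_eltwise(1.0f, mkldnn::algorithm::eltwise_relu, 0.0f, 0.0f);
        mkldnn::primitive_attr attr;
        attr.set_post_ops(ops);
        return 0;
    }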
+ oScale = ois->second; + } + } } void MKLDNNFullyConnectedNode::getSupportedDescriptors() { @@ -29,12 +50,8 @@ return; InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision(); - if (precision != InferenceEngine::Precision::FP32) - precision = InferenceEngine::Precision::FP32; auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); precision = getCnnLayer()->outData[0]->getPrecision(); - if (precision != InferenceEngine::Precision::FP32) - precision = InferenceEngine::Precision::FP32; auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); auto * fcLayer = dynamic_cast<FullyConnectedLayer*>(getCnnLayer().get()); @@ -75,6 +92,27 @@ internalBlobs.push_back(createInternalBlob(biasesDims, false)); } + Blob::Ptr weights = this->getCnnLayer()->blobs.find("weights")->second; + if (weights->precision() == Precision::I8) { + // The weights blob has incorrect dims, so we have to fix it + TensorDesc wdesc = internalBlobs[0]->getTensorDesc(); + wdesc.setPrecision(Precision::I8); + InferenceEngine::TBlob<int8_t>::Ptr reshapedInt8Weights = + InferenceEngine::TBlob<int8_t>::Ptr( + new InferenceEngine::TBlob<int8_t>(wdesc, static_cast<int8_t*>(weights->buffer()), weights->byteSize())); + + internalBlobs[0] = reshapedInt8Weights; + if (withBiases) { + Blob::Ptr biases = this->getCnnLayer()->blobs.find("biases")->second; + TensorDesc bdesc = internalBlobs[1]->getTensorDesc(); + bdesc.setPrecision(Precision::I32); + InferenceEngine::TBlob<int32_t>::Ptr reshapedInt32Biases = + InferenceEngine::TBlob<int32_t>::Ptr( + new InferenceEngine::TBlob<int32_t>(bdesc, static_cast<int32_t*>(biases->buffer()), biases->byteSize())); + internalBlobs[1] = reshapedInt32Biases; + } + } + for (auto format : getAvailableFormatsForDims(getParentEdgeAt(0)->getDims())) { MKLDNNMemoryDesc in_candidate(inDims, inputDataType, format); MKLDNNMemoryDesc out_candidate(getChildEdgeAt(0)->getDims(), outputDataType, memory::any); @@ -87,16 +125,24 @@ void MKLDNNFullyConnectedNode::createPrimitive() { if (prim) return; - auto prim_desc = createPrimitiveDescriptor<inner_product_forward::primitive_desc, inner_product_forward::desc>(); + std::shared_ptr<mkldnn::primitive_attr> attr = initPrimitiveAttr(); + std::shared_ptr<inner_product_forward::primitive_desc> prim_desc; + if (attr == nullptr) { + prim_desc = std::make_shared<inner_product_forward::primitive_desc>( + createPrimitiveDescriptor<inner_product_forward::primitive_desc, inner_product_forward::desc>()); + } else { + prim_desc = std::make_shared<inner_product_forward::primitive_desc>( + createPrimitiveDescriptor<inner_product_forward::primitive_desc, inner_product_forward::desc>(*attr)); + } if (internalBlobs.size() > 1) { - prim.reset(new inner_product_forward(prim_desc, + prim.reset(new inner_product_forward(*prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(), internalBlobMemory[0]->GetPrimitive(), internalBlobMemory[1]->GetPrimitive(), getChildEdgeAt(0)->getMemory().GetPrimitive())); } else { - prim.reset(new inner_product_forward(prim_desc, + prim.reset(new inner_product_forward(*prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(), internalBlobMemory[0]->GetPrimitive(), getChildEdgeAt(0)->getMemory().GetPrimitive())); @@ -104,7 +150,8 @@ } bool MKLDNNFullyConnectedNode::created() const { - return getType() == FullyConnected; + return getType() == FullyConnected || + getType() == FullyConnected_Activation; } memory::format MKLDNNFullyConnectedNode::weightsFormatForSrcFormat(memory::format sourceFormat) { @@ -164,16 +211,74 @@ const std::vector& MKLDNNFullyConnectedNode::getPrimitivesPriori return implPriorities; } +std::shared_ptr<mkldnn::primitive_attr> MKLDNNFullyConnectedNode::initPrimitiveAttr() const { + auto attr = std::make_shared<mkldnn::primitive_attr>(mkldnn::primitive_attr()); + bool
scaled = false; + if (wScale != nullptr) { + float* wScaleData = static_cast<float*>(wScale->buffer()); + + std::vector<float> oScaleDataVector; + if (getCnnLayer()->precision == Precision::I8 && getCnnLayer()->outData[0]->getPrecision() != Precision::FP32) { + float *oScaleData = static_cast<float*>(oScale->buffer()); + + for (size_t c = 0; c < wScale->size(); c++) { + oScaleDataVector.push_back(wScaleData[c] / oScaleData[c]); + } + } else { + for (size_t c = 0; c < wScale->size(); c++) { + oScaleDataVector.push_back(wScaleData[c]); + } + } + + attr->set_int_output_round_mode(mkldnn::round_nearest); + attr->set_output_scales(1 << 1 /*through C dim*/, oScaleDataVector); + } + mkldnn::post_ops ops; + for (auto &node : fusedWith) { + auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get()); + if (activationNode) { + ops.append_eltwise(1.0, activationNode->getAlgorithm(), activationNode->getAlpha(), + activationNode->getBeta()); + } + attr->set_post_ops(ops); + } + return attr; +} + void MKLDNNFullyConnectedNode::createDescriptor(const std::vector<InferenceEngine::TensorDesc> &inputDesc, const std::vector<InferenceEngine::TensorDesc> &outputDesc) { - MKLDNNMemoryDesc in_candidate(inputDesc[0]); - MKLDNNMemoryDesc out_candidate(outputDesc[0]); + TensorDesc inDesc = inputDesc[0], outDesc = outputDesc[0]; + mkldnn::memory::data_type wdt = MKLDNNExtensionUtils::IEPrecisionToDataType(inDesc.getPrecision()); + mkldnn::memory::data_type bdt = MKLDNNExtensionUtils::IEPrecisionToDataType(inDesc.getPrecision()); + + Blob::Ptr weights = this->getCnnLayer()->blobs.find("weights")->second; + + if (weights->precision() == Precision::I8) { + wdt = memory::s8; + bdt = memory::s32; + + Precision outPrec; + if (getCnnLayer()->outData[0]->getPrecision() == Precision::FP32) { + outPrec = Precision::FP32; + } else { + // define precision according to the normalizer + // TODO(amalyshe) do we need to have separate flow for last in int8 chain or not?
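/* Worked example with assumed numbers, not from the patch: when this layer's
   output stays quantized, initPrimitiveAttr() above emits per-channel output
   scales wScaleData[c] / oScaleData[c] — e.g. w-scale {0.5, 0.25} with
   oi-scale {0.1, 0.05} gives mkldnn scales {5.0, 5.0} — so the int32
   accumulator lands directly in the next layer's quantized domain; the mask
   1 << 1 selects per-channel scaling over C. When the layer ends the int8
   chain, the output is FP32 and only wScaleData[c] is applied, which is the
   case this outPrec branch distinguishes. */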
+ outPrec = outDesc.getPrecision(); + } + + inDesc = TensorDesc(inDesc.getPrecision() , inputDesc[0].getDims(), inputDesc[0].getBlockingDesc()); + outDesc = TensorDesc(outPrec, outputDesc[0].getDims(), Layout::NC/*, outputDesc[0].getBlockingDesc()*/); + } + + MKLDNNMemoryDesc in_candidate(inDesc); + MKLDNNMemoryDesc out_candidate(outDesc); + memory::format weights_fmt = weightsFormatForSrcFormat(in_candidate.getFormat()); - MKLDNNMemoryDesc wgh_candidate(MKLDNNDims(weightsDims), in_candidate.getDataType(), weights_fmt); - MKLDNNMemoryDesc bias_candidate(MKLDNNDims(biasesDims), in_candidate.getDataType(), memory::any); + MKLDNNMemoryDesc wgh_candidate(MKLDNNDims(weightsDims), wdt, weights_fmt); if (internalBlobs.size() > 1) { + MKLDNNMemoryDesc bias_candidate(MKLDNNDims(biasesDims), bdt, memory::any); MKLDNNDescriptor desc(std::shared_ptr( new inner_product_forward::desc(prop_kind::forward_scoring, in_candidate, wgh_candidate, bias_candidate, out_candidate))); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h index 73c06f7..3e6c5fb 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -28,11 +28,16 @@ public: void createDescriptor(const std::vector& inputDesc, const std::vector& outputDesc) override; +protected: + std::shared_ptr initPrimitiveAttr() const override; + private: static Register reg; InferenceEngine::SizeVector weightsDims; InferenceEngine::SizeVector biasesDims; mkldnn::memory::format weightsFormatForSrcFormat(mkldnn::memory::format sourceFormat); + + InferenceEngine::Blob::Ptr wScale, oScale; }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.cpp index 2874d9d..2ff862f 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.h index da171a0..94c4e15 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.cpp index b31b491..bc5d6e5 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.h index 7bdd4a0..71f86f0 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.h +++ 
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp index 9b42bee..69ab336 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h index 99b4c86..9640e50 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp index 4b1192b..0675c32 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.h index 9d85dab..52de049 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp index a37a253..09cb566 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp index ebc6774..cca5fb3 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.cpp index c23ce6e..9a25c0f 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -299,7 +299,7 @@ static void permute_to_034152(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPt } } -std::map MKLDNNPermuteNode::OptimizedCases = { +std::multimap 
MKLDNNPermuteNode::OptimizedCases = { {{0, 2, 3, 1}, MKLDNNPermuteNode::PermuteImpl(permute_to_0231, [](MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) { return true; })}, // NCHW -> NHWC case @@ -329,26 +329,28 @@ void MKLDNNPermuteNode::execute(mkldnn::stream strm) { auto src_data = reinterpret_cast(srcMemPtr->GetData()); auto dst_data = reinterpret_cast(dstMemPtr->GetData()); - auto perm = OptimizedCases.find(order); - if (perm != OptimizedCases.end() && perm->second.isValidParams(srcMemPtr, dstMemPtr)) { - perm->second.execute(batchToProcess(), srcMemPtr, dstMemPtr); - } else { - auto srcBlob = getParentEdgeAt(0)->getBlob(); - TensorDesc srcDesc = srcBlob->getTensorDesc(); - - SizeVector& dims = srcDesc.getDims(); - InferenceEngine::SizeVector orderedDims; - for (auto ord : order) { - orderedDims.push_back(dims[ord]); + for (const auto &impl : OptimizedCases) { + if (impl.first == order && impl.second.isValidParams(srcMemPtr, dstMemPtr)) { + impl.second.execute(batchToProcess(), srcMemPtr, dstMemPtr); + return; } - TensorDesc dstDesc(InferenceEngine::Precision::FP32, dims, {orderedDims, order}); + } - int dataSize = srcBlob->size() / srcDesc.getDims()[0] * batchToProcess(); + auto srcBlob = getParentEdgeAt(0)->getBlob(); + TensorDesc srcDesc = srcBlob->getTensorDesc(); - parallel_for(dataSize, [&](int i) { - dst_data[dstDesc.offset(i)] = src_data[srcDesc.offset(i)]; - }); + SizeVector& dims = srcDesc.getDims(); + InferenceEngine::SizeVector orderedDims; + for (auto ord : order) { + orderedDims.push_back(dims[ord]); } + TensorDesc dstDesc(InferenceEngine::Precision::FP32, dims, {orderedDims, order}); + + int dataSize = srcBlob->size() / srcDesc.getDims()[0] * batchToProcess(); + + parallel_for(dataSize, [&](int i) { + dst_data[dstDesc.offset(i)] = src_data[srcDesc.offset(i)]; + }); } bool MKLDNNPermuteNode::created() const { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.h index 9c0ce0d..cad6f90 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -40,7 +40,7 @@ private: isApplicable isValidParams; }; - static std::map OptimizedCases; + static std::multimap OptimizedCases; }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp index 82e3eac..e501bba 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h index e5309f4..cee6404 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -30,10 +30,10 @@ private: static Register reg; InferenceEngine::PoolingLayer::PoolType type; bool exclude_pad; - std::vector stride; - std::vector 
<int> paddingL; - std::vector<int> paddingR; - std::vector<int> kernel; + std::vector<ptrdiff_t> stride; + std::vector<ptrdiff_t> paddingL; + std::vector<ptrdiff_t> paddingR; + std::vector<ptrdiff_t> kernel; }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp index 01ae0e6..974eded 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -89,7 +89,7 @@ void MKLDNNPowerNode::createPrimitive() { void MKLDNNPowerNode::execute(mkldnn::stream strm) { auto& srcMemory = getParentEdgeAt(0)->getMemory(); auto& dstMemory = getChildEdgeAt(0)->getMemory(); - const int data_size = srcMemory.GetSize() / sizeof(float) / srcMemory.GetDims()[0] * batchToProcess(); + const size_t data_size = srcMemory.GetSize() / sizeof(float) / srcMemory.GetDims()[0] * batchToProcess(); const auto *src_ptr = reinterpret_cast<const float *>(srcMemory.GetData()) + srcMemory.GetDescriptor().data.layout_desc.blocking.offset_padding; @@ -97,11 +97,11 @@ void MKLDNNPowerNode::execute(mkldnn::stream strm) { dstMemory.GetDescriptor().data.layout_desc.blocking.offset_padding; if (power == 1.0f) { - parallel_for(data_size, [&](int i) { + parallel_for(data_size, [&](size_t i) { dst_ptr[i] = src_ptr[i] * scale + shift; }); } else { - parallel_for(data_size, [&](int i) { + parallel_for(data_size, [&](size_t i) { dst_ptr[i] = pow(src_ptr[i] * scale + shift, power); }); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h index a6fce5c..0bd33d2 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp new file mode 100644 index 0000000..85e0067 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.cpp @@ -0,0 +1,229 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "mkldnn_quantize_node.h" +#include "desc_iterator.hpp" +#include +#include +#include +#include +#include +#include +#include "details/caseless.hpp" + +using namespace mkldnn; +using namespace MKLDNNPlugin; +using namespace InferenceEngine; +using namespace InferenceEngine::details; + +MKLDNNQuantizeNode::MKLDNNQuantizeNode(InferenceEngine::CNNLayerPtr layer, const mkldnn::engine& eng) : MKLDNNNode(layer, eng) {} + +void MKLDNNQuantizeNode::getSupportedDescriptors() { + InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision(); + if (precision != InferenceEngine::Precision::FP32) + THROW_IE_EXCEPTION << "Quantize layer " << getName() << " supports only FP32 precision"; + + auto* quantizeLayer = dynamic_cast<QuantizeLayer*>(getCnnLayer().get()); + if (quantizeLayer == nullptr) + THROW_IE_EXCEPTION << "Cannot convert Quantize layer " << getName(); + + levels = quantizeLayer->levels; + if (levels <= 1) + THROW_IE_EXCEPTION << "Quantize layer " << getName() << " supports only parameter levels > 1"; + + if (getParentEdges().size() != 5) + THROW_IE_EXCEPTION << 
"Incorrect number of input edges for layer " << getName(); + if (getChildEdges().empty()) + THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName(); + + if (getParentEdgeAt(0)->getDims().ndims() != 4) { + THROW_IE_EXCEPTION << "Quantize layer " << getName() << "supports only 4D input at edge 0"; + } + + for (int i = 1; i < 5; i++) { + if (getParentEdgeAt(i)->getDims().ndims() != 1 && getParentEdgeAt(i)->getDims().ndims() != 4) { + THROW_IE_EXCEPTION << "Quantize layer " << getName() << "supports only 1D or 4D inputs at edge " << i; + } + } + + canStorePacked = getChildEdges().size() == 1 && getChildEdgeAt(0)->getChild()->getType() == BinaryConvolution; + + if (canStorePacked) { + mkldnn::memory::data_type idt = MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32); + mkldnn::memory::data_type ddt = MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::BIN); + mkldnn::memory::data_type wdt = MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32); + + MKLDNNMemoryDesc in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), idt, memory::nhwc); + MKLDNNMemoryDesc out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), ddt, memory::nhwc); + + InferenceEngine::SizeVector weightDims; + weightDims.push_back(getParentEdgeAt(0)->getDims()[1]); + MKLDNNDims blocked_weightDims(weightDims); + MKLDNNMemoryDesc wgh_candidate{blocked_weightDims, wdt, memory::x}; + + + std::shared_ptr bin_conv_desc; + bin_conv_desc.reset(new binarization_forward::desc(prop_kind::forward_scoring, algorithm::binarization_depthwise, + in_candidate, wgh_candidate, out_candidate)); + + descs.emplace_back(bin_conv_desc); + + InferenceEngine::SizeVector dims; + dims.push_back(getParentEdgeAt(0)->getDims()[1]); + + auto InputLowBlob = dynamic_cast*>(getParentEdgeAt(1)->getParent()->getCnnLayer()->blobs["custom"].get()); + + auto inputLowData = InputLowBlob->buffer().as(); + int inputLowAxis = getParentEdgeAt(1)->getDims().ndims() == 1 ? 0 : 1; + bool isInputLowBroadcasted = getParentEdgeAt(1)->getDims()[inputLowAxis] != dims[0]; + + for (int i = 0; i < dims[0]; i++) { + binarizationThresholds.push_back(inputLowData[isInputLowBroadcasted ? 0 : i]); + } + } +} + +void MKLDNNQuantizeNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32); + auto outputDataType = canStorePacked ? MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::BIN) + : MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32); + + + + auto same = [&] (memory::format fmt, impl_desc_type impl) -> PrimitiveDescInfo { + InferenceEngine::LayerConfig config; + config.dynBatchSupport = true; + for (size_t i = 0; i < getParentEdges().size(); i++) { + InferenceEngine::DataConfig dataConfig; + dataConfig.inPlace = -1; + dataConfig.constant = false; + + if (i == 0) { + dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDataType, fmt); + } else { + dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDataType, + getParentEdgeAt(i)->getDims().ndims() == 1 ? 
memory::x : memory::nchw); + } + config.inConfs.push_back(dataConfig); + } + + InferenceEngine::DataConfig dataConfig; + dataConfig.inPlace = -1; + dataConfig.constant = false; + dataConfig.desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, fmt); + config.outConfs.push_back(dataConfig); + return {config, impl}; + }; + + supportedPrimitiveDescriptors.push_back(same(memory::nhwc, ref_any)); + + if (canStorePacked) { + primitive_desc_iterator itpd = descs[0].createPrimitiveDescriptorIterator(getEngine()); + do { + impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str()); + supportedPrimitiveDescriptors.push_back(same(memory::nhwc, impl_type)); + } while (itpd.next()); + } +} + +void MKLDNNQuantizeNode::createPrimitive() { + if (prim) + return; + + auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); + auto& srcMemPtr = getParentEdgeAt(0)->getMemoryPtr(); + if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr()) + THROW_IE_EXCEPTION << "Destination memory isn't allocated."; + if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr()) + THROW_IE_EXCEPTION << "Input memory isn't allocated."; + if (getSelectedPrimitiveDescriptor() == nullptr) + THROW_IE_EXCEPTION << "Preferable primitive descriptor isn't set."; + + if (canStorePacked) { + auto prim_desc = createPrimitiveDescriptor(); + + MKLDNNMemoryDesc binarizationDataDesc = {{getParentEdgeAt(0)->getDims()[1]}, memory::f32, memory::x}; + auto binarizationDataMem = std::make_shared(getEngine()); + binarizationDataMem->Create(binarizationDataDesc, &binarizationThresholds[0]); + internalBlobMemory.push_back(binarizationDataMem); + + prim.reset(new binarization_forward(prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(), + internalBlobMemory[0]->GetPrimitive(), + getChildEdgeAt(0)->getMemory().GetPrimitive())); + } +} + +void MKLDNNQuantizeNode::execute(mkldnn::stream strm) { + if (prim) { + MKLDNNNode::execute(strm); + } else { + auto &srcMemory = getParentEdgeAt(0)->getMemoryPtr(); + auto &inputLowMemory = getParentEdgeAt(1)->getMemoryPtr(); + auto &inputHighMemory = getParentEdgeAt(2)->getMemoryPtr(); + auto &outputLowMemory = getParentEdgeAt(3)->getMemoryPtr(); + auto &outputHighMemory = getParentEdgeAt(4)->getMemoryPtr(); + auto &dstMemory = getChildEdgeAt(0)->getMemoryPtr(); + + auto srcData = reinterpret_cast(srcMemory->GetData()); + auto inputLowData = reinterpret_cast(inputLowMemory->GetData()); + auto inputHighData = reinterpret_cast(inputHighMemory->GetData()); + auto outputLowData = reinterpret_cast(outputLowMemory->GetData()); + auto outputHighData = reinterpret_cast(outputHighMemory->GetData()); + auto dstData = reinterpret_cast(dstMemory->GetData()); + + srcData += srcMemory->GetDescriptor().data.layout_desc.blocking.offset_padding; + inputLowData += inputLowMemory->GetDescriptor().data.layout_desc.blocking.offset_padding; + inputHighData += inputHighMemory->GetDescriptor().data.layout_desc.blocking.offset_padding; + outputLowData += outputLowMemory->GetDescriptor().data.layout_desc.blocking.offset_padding; + outputHighData += outputHighMemory->GetDescriptor().data.layout_desc.blocking.offset_padding; + dstData += dstMemory->GetDescriptor().data.layout_desc.blocking.offset_padding; + + size_t N = static_cast(batchToProcess()); + size_t C = static_cast(srcMemory->GetDims()[1]); + size_t H = static_cast(srcMemory->GetDims()[2]); + size_t W = static_cast(srcMemory->GetDims()[3]); + + int inputLowAxis = inputLowMemory->GetDims().size() == 1 ? 
0 : 1; + bool isInputLowBroadcasted = inputLowMemory->GetDims()[inputLowAxis] != C; + + int inputHighAxis = inputHighMemory->GetDims().size() == 1 ? 0 : 1; + bool isInputHighBroadcasted = inputHighMemory->GetDims()[inputHighAxis] != C; + + int outputLowAxis = outputLowMemory->GetDims().size() == 1 ? 0 : 1; + bool isOutputLowBroadcasted = outputLowMemory->GetDims()[outputLowAxis] != C; + + int outputHighAxis = outputHighMemory->GetDims().size() == 1 ? 0 : 1; + bool isOutputHighBroadcasted = outputHighMemory->GetDims()[outputHighAxis] != C; + + for (int n = 0; n < N; n++) { + for (int h = 0; h < H; h++) { + for (int w = 0; w < W; w++) { + for (int c = 0; c < C; c++) { + size_t idx = n * H * W * C + h * W * C + w * C + c; + + float inputLow = inputLowData[isInputLowBroadcasted ? 0 : c]; + float inputHigh = inputHighData[isInputHighBroadcasted ? 0 : c]; + float outputLow = outputLowData[isOutputLowBroadcasted ? 0 : c]; + float outputHigh = outputHighData[isOutputHighBroadcasted ? 0 : c]; + + if (srcData[idx] <= inputLow) + dstData[idx] = outputLow; + else if (srcData[idx] > inputHigh) + dstData[idx] = outputHigh; + else + dstData[idx] = roundf((srcData[idx] - inputLow) / (inputHigh - inputLow) * (levels - 1)) / + (levels - 1) * (outputHigh - outputLow) + outputLow; + } + } + } + } + } +} + +bool MKLDNNQuantizeNode::created() const { + return getType() == Quantize; +} diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.h new file mode 100644 index 0000000..644926c --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_quantize_node.h @@ -0,0 +1,36 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNQuantizeNode : public MKLDNNNode { +public: + MKLDNNQuantizeNode(InferenceEngine::CNNLayerPtr layer, const mkldnn::engine& eng); + ~MKLDNNQuantizeNode() override = default; + + void initSupportedPrimitiveDescriptors() override; + void getSupportedDescriptors() override; + void createPrimitive() override; + bool created() const override; + void execute(mkldnn::stream strm) override; + + +private: + static Register reg; + + bool canStorePacked; + int levels; + + std::vector binarizationThresholds; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp index 345b215..103f49d 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h index 7a228ec..32c3736 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.cpp index d959aa5..4d2c34b 100644 --- 
a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.h index bb30099..b172ef8 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.cpp index ba32285..af11763 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.cpp @@ -1,11 +1,10 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include "mkldnn_rnn.h" #include "mkldnn_extension_utils.h" #include "desc_iterator.hpp" -#include #include #include @@ -22,19 +21,40 @@ inline bool one_of(T val, P item, Args... item_others) { return val == item || one_of(val, item_others...); } -rnn_direction ie2mkl(RNNLayer::Direction &direction) { - return direction == RNNLayer::RNN_FWD ? unidirectional_left2right - : direction == RNNLayer::RNN_BWD ? unidirectional_right2left - : direction == RNNLayer::RNN_BDR ? bidirectional_concat - : unidirectional; +using _RNN = RNNSequenceLayer; // alias + +static rnn_direction ie2mkl(_RNN::Direction &direction) { + return direction == _RNN::FWD ? unidirectional_left2right + : direction == _RNN::BWD ? unidirectional_right2left + : direction == _RNN::BDR ? bidirectional_concat + : unidirectional; +} + +static algorithm ie2mkl(std::string act_type) { + return act_type == "sigmoid" ? eltwise_logistic + : act_type == "tanh" ? eltwise_tanh + : act_type == "relu" ? eltwise_relu + : algorithm_undef; +} + +static algorithm ie2mkl(RNNCellBase::CellType cell_type) { + switch (cell_type) { + case RNNCellBase::LSTM: return vanilla_lstm; + case RNNCellBase::GRU: return vanilla_gru; + case RNNCellBase::GRU_LBR: return gru_linear_before_reset; + case RNNCellBase::RNN: return vanilla_rnn; + default: + THROW_IE_EXCEPTION << "Unsupported cell type"; + return algorithm_undef; + } } MKLDNNRNN::MKLDNNRNN(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) : MKLDNNNode(layer, eng) { - is_cell = layer->type == "LSTMCell"; + is_cell = one_of(layer->type, "LSTMCell", "GRUCell", "RNNCell"); } bool MKLDNNRNN::created() const { - return getType() == (is_cell ? LSTMCell : RNN); + return getType() == (is_cell ? RNNCell : RNNSeq); } void MKLDNNRNN::getSupportedDescriptors() { @@ -46,48 +66,59 @@ void MKLDNNRNN::fillCellDesc() { if (!descs.empty()) return; - auto cellLayer = std::dynamic_pointer_cast(getCnnLayer()); + auto cellLayer = std::dynamic_pointer_cast(getCnnLayer()); if (!cellLayer) - THROW_IE_EXCEPTION << "Wrong RNN layer representation. 
Cannot cast to RNNLayer."; + THROW_IE_EXCEPTION << "No original layer for RNNCell."; + + algorithm cell_type = ie2mkl(cellLayer->cellType); + algorithm cell_act = ie2mkl(cellLayer->activations[0]); // Works only for RNN with one gate + + cell_desc = {cell_type, cell_act}; + if (cellLayer->clip != 0.0f) + cell_desc.set_clipping(cellLayer->clip); auto &ins = cellLayer->insData; auto &outs = cellLayer->outData; - if (ins.size() != 3) + if (!one_of(ins.size(), 3, 2)) THROW_IE_EXCEPTION << "Incorrect number of input ports for layer " << getName(); - if (outs.size() != 2) + if (!one_of(outs.size(), 2, 1)) THROW_IE_EXCEPTION << "Incorrect number of output ports for layer " << getName(); auto in_data_dims = getParentEdgeAt(0)->getDims(); auto in_h_state_dims = getParentEdgeAt(1)->getDims(); - auto in_c_state_dims = getParentEdgeAt(2)->getDims(); - auto out_h_state_dims = getChildEdgeAt(0)->getDims(); - auto out_c_state_dims = getChildEdgeAt(1)->getDims(); + auto out_h_state_dims = getChildEdgeAt(0)->getDims(); - if (in_data_dims.ndims() != 2 - || in_h_state_dims.ndims() != 2 - || in_c_state_dims.ndims() != 2 - || out_h_state_dims.ndims() != 2 - || out_c_state_dims.ndims() != 2) + if (in_data_dims.ndims() != 2 || in_h_state_dims.ndims() != 2) THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName(); + G = cell_desc.get_gates_count(); + S = cell_desc.get_state_count(); T = 1; N = in_data_dims[0]; DC = in_data_dims[1]; SC = in_h_state_dims[1]; + Gb = (cell_type != gru_linear_before_reset) ? G : G + 1; + // Expected shapes MKLDNNDims D_shape {N, DC}, S_shape {N, SC}; if (in_data_dims != D_shape || in_h_state_dims != S_shape - || in_c_state_dims != S_shape - || out_h_state_dims != S_shape - || out_c_state_dims != S_shape) + || out_h_state_dims != S_shape) THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName(); + if (S == 2) { + auto in_c_state_dims = getParentEdgeAt(2)->getDims(); + auto out_c_state_dims = getChildEdgeAt(1)->getDims(); + + if (in_c_state_dims != S_shape + || out_c_state_dims != S_shape) + THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName(); + } + auto blobs = cellLayer->blobs; Blob::Ptr weights, bias; if (blobs.find("weights") != blobs.end()) weights = blobs["weights"]; @@ -99,7 +130,7 @@ if (weights->size() != G*SC*(SC+DC)) THROW_IE_EXCEPTION << "RNN Layer. Weights size is not correct. Expected size:" << G*SC*(SC+DC); - if (bias && bias->size() != G*SC) + if (bias && bias->size() != Gb*SC) THROW_IE_EXCEPTION << "RNN Layer. Biases size is not correct. Expected size:" << Gb*SC; // Shapes and Attributes are correct. Can start internal stuff initialization. 
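The generalized checks above reduce to simple per-cell-kind arithmetic. A self-contained sketch with assumed DC/SC values, taking the gate and state counts documented in mkldnn_rnn.h further below (LSTM: G=4, GRU: G=3, RNN: G=1; Gb = G+1 only for GRU with linear_before_reset):

    #include <cstddef>
    #include <cstdio>

    int main() {
        // hypothetical GRU_lbr cell: DC input channels, SC state channels
        const std::size_t G = 3, Gb = G + 1;
        const std::size_t DC = 16, SC = 32;
        // weights blob: gates x out_state_size x (in_data_size + in_state_size)
        std::printf("weights: %zu\n", G * SC * (SC + DC));  // 3*32*48 = 4608
        // biases blob: one extra gate row for linear_before_reset
        std::printf("biases: %zu\n", Gb * SC);              // 4*32 = 128
        return 0;
    }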
@@ -114,44 +145,55 @@ void MKLDNNRNN::fillCellDesc() { w_state_d = {{L, D, SC, G, SC}, memory::f32, memory::ldigo}; if (bias) - w_bias_d = {{L, D, G, SC}, memory::f32, memory::ldgo}; + w_bias_d = {{L, D, Gb, SC}, memory::f32, memory::ldgo}; - std::vector in_candidate; + std::vector in_candidate, out_candidate; in_candidate.emplace_back(MKLDNNMemoryDesc {D_shape, memory::f32, memory::nc}); in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); - in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); - - std::vector out_candidate; - out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); + if (S == 2) { + in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); + out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); + } + createDescriptor(in_candidate, out_candidate); } void MKLDNNRNN::fillSeqDesc() { if (!descs.empty()) return; - auto rnnLayer = std::dynamic_pointer_cast(getCnnLayer()); + auto rnnLayer = std::dynamic_pointer_cast(getCnnLayer()); if (!rnnLayer) - THROW_IE_EXCEPTION << "Wrong RNN layer representation. Cannot cast to RNNLayer."; + THROW_IE_EXCEPTION << "Wrong RNN layer representation. Cannot cast to RNNSequenceLayer."; + + if (!one_of(rnnLayer->cellType, _RNN::LSTM, _RNN::GRU, _RNN::GRU_LBR, _RNN::RNN)) + THROW_IE_EXCEPTION << "RNN layer supports only LSTM/GRU/RNN cell"; - if (!one_of(rnnLayer->cellType, "LSTM")) - THROW_IE_EXCEPTION << "RNN layer supports only LSTM like cell"; + algorithm cell_type = ie2mkl(rnnLayer->cellType); + algorithm cell_act = algorithm_undef; + if (!rnnLayer->activations.empty()) + cell_act = ie2mkl(rnnLayer->activations[0]); // Works only for RNN with one gate + + cell_desc = {cell_type, cell_act}; + + if (rnnLayer->clip != 0.0f) + cell_desc.set_clipping(rnnLayer->clip); if (!one_of(rnnLayer->axis, 0, 1)) THROW_IE_EXCEPTION << "RNN layer supports only sequence axis 0 or 1"; nativeOrder = rnnLayer->axis == 0; - if (!one_of(rnnLayer->direction, RNNLayer::RNN_FWD, RNNLayer::RNN_BWD)) + if (!one_of(rnnLayer->direction, _RNN::FWD, _RNN::BWD)) THROW_IE_EXCEPTION << "RNN layer supports only unidirectional RNN layer"; direction = ie2mkl(rnnLayer->direction); auto &ins = rnnLayer->insData; auto &outs = rnnLayer->outData; - if (!one_of(ins.size(), 3, 1)) + if (!one_of(ins.size(), 3, 2, 1)) THROW_IE_EXCEPTION << "Incorrect number of input ports for layer " << getName(); - if (!one_of(outs.size(), 3, 1)) + if (!one_of(outs.size(), 3, 2, 1)) THROW_IE_EXCEPTION << "Incorrect number of output ports for layer " << getName(); auto in_data_dims = getParentEdgeAt(0)->getDims(); @@ -165,32 +207,32 @@ void MKLDNNRNN::fillSeqDesc() { std::swap(out_data_dims[0], out_data_dims[1]); } + G = cell_desc.get_gates_count(); + S = cell_desc.get_state_count(); T = in_data_dims[0]; N = in_data_dims[1]; DC = in_data_dims[2]; SC = out_data_dims[2]; + Gb = (cell_type != gru_linear_before_reset) ? 
G : G + 1; + MKLDNNDims ID_shape {T, N, DC}, OD_shape {T, N, SC}, S_shape {N, SC}; if (out_data_dims != OD_shape) THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName(); - if (ins.size() == 3) { - auto state_dims1 = getParentEdgeAt(1)->getDims(); - auto stats_dims2 = getParentEdgeAt(2)->getDims(); - - if (state_dims1 != S_shape || stats_dims2 != S_shape) - THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName(); + if (ins.size() > 1) { + for (int i = 1; i < ins.size(); i++) + if (getParentEdgeAt(i)->getDims() != S_shape) + THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName(); in_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc}; } - if (outs.size() == 3) { - auto state_dims1 = getChildEdgeAt(1)->getDims(); - auto stats_dims2 = getChildEdgeAt(2)->getDims(); - - if (state_dims1 != S_shape || stats_dims2 != S_shape) - THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName(); + if (outs.size() > 1) { + for (int i = 1; i < outs.size(); i++) + if (getChildEdgeAt(i)->getDims() != S_shape) + THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName(); out_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc}; } @@ -209,11 +251,11 @@ void MKLDNNRNN::fillSeqDesc() { w_data_d = {{L, D, DC, G, SC}, memory::f32, memory::ldigo}; w_state_d = {{L, D, SC, G, SC}, memory::f32, memory::ldigo}; - if (bias && bias->size() != G*SC) + if (bias && bias->size() != Gb*SC) THROW_IE_EXCEPTION << "RNN Layer. Biases size is not correct. Expected size:" << G*SC; if (bias) - w_bias_d = {{L, D, G, SC}, memory::f32, memory::ldgo}; + w_bias_d = {{L, D, Gb, SC}, memory::f32, memory::ldgo}; // Try to create descriptor and corresponding configuration in_data_d = {in_data_dims, memory::f32, memory::tnc}; @@ -225,10 +267,8 @@ void MKLDNNRNN::fillSeqDesc() { else in_candidate.push_back(MKLDNNMemoryDesc{{N, T, DC}, memory::f32, memory::ntc}); - if (ins.size() == 3) { - in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); + for (int i = 1; i < ins.size(); i++) in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); - } std::vector out_candidate; if (nativeOrder) @@ -236,10 +276,8 @@ void MKLDNNRNN::fillSeqDesc() { else out_candidate.push_back(MKLDNNMemoryDesc{{N, T, SC}, memory::f32, memory::ntc}); - if (outs.size() == 3) { - out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); + for (int i = 1; i < outs.size(); i++) out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); - } createDescriptor(in_candidate, out_candidate); } @@ -247,8 +285,7 @@ void MKLDNNRNN::fillSeqDesc() { void MKLDNNRNN::createDescriptor(const std::vector &inputDesc, const std::vector &outputDesc) { MKLDNNDescriptor desc(std::shared_ptr( - new rnn_forward::desc(forward_scoring, - {algorithm::vanilla_lstm, algorithm::eltwise_tanh }, + new rnn_forward::desc(forward_scoring, cell_desc, direction, /* In Data */ in_data_d, /* In State */ in_state_d, @@ -305,7 +342,6 @@ void MKLDNNRNN::createPrimitive() { { /* Copy Weight data - * * IE format: * W - [gates, out_state_size, in_data_size + in_state_size] * B - [gates, out_state_size] @@ -316,11 +352,46 @@ void MKLDNNRNN::createPrimitive() { * B - [gates, out_state_size] * * Gate order + * ====== LSTM ====== * Caffe - IFOC, ONNX - IOFC * IE - FICO, mkldnn - IFCO + * + * ====== GRU ====== + * IE - URO, mkldnn - URO */ - // FICO -> IFCO - const int gate_map[] = {1, 0, 2, 3}; + const int 
gate_map_lstm[] = {1, 0, 2, 3}; // FICO -> IFCO + const int gate_map_gru[] = {0, 1, 2, 3}; + const int gate_map_rnn[] = {0}; + const int *gate_map; + const int gate_map_lstm_size = sizeof(gate_map_lstm) / sizeof(int); + const int gate_map_gru_size = sizeof(gate_map_gru) / sizeof(int); + const int gate_map_rnn_size = sizeof(gate_map_rnn) / sizeof(int); + if (cell_desc.get_cell_kind() == vanilla_lstm) { + gate_map = gate_map_lstm; + if (G > gate_map_lstm_size) { + THROW_IE_EXCEPTION << "G isn't equal to the size of gate_map"; + } + } else if (cell_desc.get_cell_kind() == vanilla_gru) { + gate_map = gate_map_gru; + if (G > gate_map_gru_size) { + THROW_IE_EXCEPTION << "G isn't equal to the size of gate_map"; + } + } else if (cell_desc.get_cell_kind() == gru_linear_before_reset) { + gate_map = gate_map_gru; + if (G > gate_map_gru_size) { + THROW_IE_EXCEPTION << "G isn't equal to the size of gate_map"; + } + } else if (cell_desc.get_cell_kind() == vanilla_rnn) { + gate_map = gate_map_rnn; + if (G > gate_map_rnn_size) { + THROW_IE_EXCEPTION << "G isn't equal to the size of gate_map"; + } + } else { + gate_map = gate_map_gru; + if (G > gate_map_gru_size) { + THROW_IE_EXCEPTION << "G isn't equal to the size of gate_map"; + } + } auto ie_w_ptr = getCnnLayer()->blobs["weights"]->buffer().as(); auto w_ptr = static_cast(w_data_mem->GetData()); @@ -348,7 +419,7 @@ void MKLDNNRNN::createPrimitive() { if (w_bias_d) { auto ie_b_ptr = getCnnLayer()->blobs["biases"]->buffer().as(); auto b_ptr = static_cast(w_bias_mem->GetData()); - for (int g = 0; g < G; g++) { + for (int g = 0; g < Gb; g++) { float *l_b_ptr = b_ptr + gate_map[g]*SC; for (int out_i = 0; out_i < SC; out_i++) { *l_b_ptr = *ie_b_ptr; @@ -363,53 +434,44 @@ void MKLDNNRNN::createPrimitive() { src_state_mem->Create(in_state_d); internalBlobMemory.push_back(src_state_mem); if (in_state_d) { - /* create copy/concat primitive */ - auto src_stat_1 = getParentEdgeAt(1)->getMemory().GetPrimitive(); - auto src_stat_2 = getParentEdgeAt(2)->getMemory().GetPrimitive(); - - auto low_half_state_mem = std::make_shared(getEngine()); - low_half_state_mem->Create( - src_stat_1.get_primitive_desc().desc(), - src_state_mem->GetPrimitive().get_data_handle()); - internalBlobMemory.push_back(low_half_state_mem); - - auto high_half_state_mem = std::make_shared(getEngine()); - high_half_state_mem->Create( - src_stat_2.get_primitive_desc().desc(), - static_cast(src_state_mem->GetPrimitive().get_data_handle()) + - src_stat_1.get_primitive_desc().get_size()); - internalBlobMemory.push_back(high_half_state_mem); - - exec_before.emplace_back(src_stat_1, low_half_state_mem->GetPrimitive()); - exec_before.emplace_back(src_stat_2, high_half_state_mem->GetPrimitive()); + int offset = 0; + for (int i = 0; i < S; i++) { + /* create copy/concat primitive */ + auto src_stat = getParentEdgeAt(i+1)->getMemory().GetPrimitive(); + + auto state_mem = std::make_shared(getEngine()); + state_mem->Create( + src_stat.get_primitive_desc().desc(), + static_cast(src_state_mem->GetPrimitive().get_data_handle()) + offset); + offset += src_stat.get_primitive_desc().get_size(); + + internalBlobMemory.push_back(state_mem); + + exec_before.emplace_back(src_stat, state_mem->GetPrimitive()); + } } auto dst_state_mem = std::make_shared(getEngine()); dst_state_mem->Create(out_state_d); internalBlobMemory.push_back(dst_state_mem); if (out_state_d) { - int idx_H = is_cell ? 0 : 1; - int idx_C = is_cell ? 
1 : 2; - /* create copy/split primitive */ - auto dst_stat_1 = getChildEdgeAt(idx_H)->getMemory().GetPrimitive(); - auto dst_stat_2 = getChildEdgeAt(idx_C)->getMemory().GetPrimitive(); - - auto low_half_state_mem = std::make_shared(getEngine()); - low_half_state_mem->Create( - dst_stat_1.get_primitive_desc().desc(), - dst_state_mem->GetPrimitive().get_data_handle()); - internalBlobMemory.push_back(low_half_state_mem); - - auto high_half_state_mem = std::make_shared(getEngine()); - high_half_state_mem->Create( - dst_stat_2.get_primitive_desc().desc(), - static_cast(dst_state_mem->GetPrimitive().get_data_handle()) + - dst_stat_1.get_primitive_desc().get_size()); - internalBlobMemory.push_back(high_half_state_mem); - - - if (!is_cell) exec_after.emplace_back(low_half_state_mem->GetPrimitive(), dst_stat_1); - exec_after.emplace_back(high_half_state_mem->GetPrimitive(), dst_stat_2); + int offset = 0; + int idx_start = is_cell ? 0 : 1; + for (int i = 0; i < S; i++) { + /* create copy/split primitive */ + auto dst_stat = getChildEdgeAt(idx_start + i)->getMemory().GetPrimitive(); + + auto state_mem = std::make_shared(getEngine()); + state_mem->Create( + dst_stat.get_primitive_desc().desc(), + static_cast(dst_state_mem->GetPrimitive().get_data_handle()) + offset); + offset += dst_stat.get_primitive_desc().get_size(); + + internalBlobMemory.push_back(state_mem); + + if (is_cell && i == 0) continue; + exec_after.emplace_back(state_mem->GetPrimitive(), dst_stat); + } } auto workspace_mem = std::make_shared(getEngine()); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.h index 4399c30..6404596 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -42,15 +42,19 @@ private: /** Direction of iteration through sequence dimension */ mkldnn::rnn_direction direction = mkldnn::unidirectional; + /** RNN Cell desc (type/activation_alg/clip)*/ + mkldnn::rnn_cell::desc cell_desc { mkldnn::algorithm::vanilla_lstm }; + // Internal attributes - int N = 0; /**< Batch value */ - int T = 0; /**< Sequence value */ - int DC = 0; /**< Input data channel size */ - int SC = 0; /**< State channel size value */ - const int G = 4; /**< Gate size. 4 for LSTM */ - const int L = 1; /**< What is it??. Constant for mkldnn impl */ - const int D = 1; /**< Num of direction. 1 or 2 */ - const int S = 2; /**< Num of state. 2 for LSTM (hidden and sell state). */ + ptrdiff_t N = 0; /**< Batch value */ + ptrdiff_t T = 0; /**< Sequence value */ + ptrdiff_t DC = 0; /**< Input data channel size */ + ptrdiff_t SC = 0; /**< State channel size value */ + ptrdiff_t G = 0; /**< Gate size. LSTM - 4, GRU - 3, RNN - 1 */ + ptrdiff_t Gb = 0; /**< Gate size for biases. Gb = GRU_lbr ? G+1 : G */ + ptrdiff_t S = 2; /**< Num of state. LSTM - 2, GRU & RNN - 1 */ + const ptrdiff_t L = 1; /**< What is it??. Constant for mkldnn impl */ + const ptrdiff_t D = 1; /**< Num of direction. 
1 or 2 */ MKLDNNMemoryDesc in_data_d; MKLDNNMemoryDesc out_data_d; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp index 4088a1f..2843a6f 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.h index ca2bafd..34333d5 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp index 7521727..949815c 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.h index 8e199f3..be9a542 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.cpp index 90cf4f4..cce7264 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -31,17 +31,6 @@ void MKLDNNSplitNode::getSupportedDescriptors() { axis = splitLayer->_axis; if (axis >= getParentEdgeAt(0)->getDims().ndims()) THROW_IE_EXCEPTION << "Invalid value of axis parameter in split layer"; - - // WA. Check applicability and limitations - for (size_t i = 1; i < getCnnLayer()->outData.size(); i++) { - int num_port_connection = getCnnLayer()->outData[i]->inputTo.size(); - // limitation. If num of edges more than num of ports, - // we connect it to first port. So check that all ports [1:] - // have only one connection. - if (num_port_connection > 1) - THROW_IE_EXCEPTION << "Unsupported topology. 
Split layer \"" << getCnnLayer()->name << "\" " - << "has output edges more than output ports."; - } } void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() { @@ -65,7 +54,7 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() { config.inConfs[0].inPlace = -1; config.inConfs[0].constant = false; config.inConfs[0].desc = MKLDNNMemoryDesc(srcDims, inputDataType, memory::format::any); - config.outConfs.resize(getChildEdges().size()); + config.outConfs.resize(outDims.size()); if (srcDims.ndims() < 2) THROW_IE_EXCEPTION << "Split " << getName() << " isn't supported 1d blobs"; @@ -114,11 +103,11 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() { } config.inConfs[0].desc = TensorDesc(Precision::FP32, srcDims.ToSizeVector(), {srcDims.ToSizeVector(), order, offset, offsets, strides}); - for (size_t i = 0; i < getChildEdges().size(); i++) { - auto outDims = getChildEdgeAt(i)->getDims(); + for (size_t i = 0; i < outDims.size(); i++) { + auto dims = outDims[i].ToSizeVector(); config.outConfs[i].inPlace = 0; - config.outConfs[i].desc = TensorDesc(Precision::FP32, outDims.ToSizeVector(), - {outDims.ToSizeVector(), order, offset, offsets, strides}); + config.outConfs[i].desc = TensorDesc(Precision::FP32, dims, + {dims, order, offset, offsets, strides}); } supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); @@ -149,9 +138,9 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() { config.inConfs[0].desc = TensorDesc(Precision::FP32, srcDims.ToSizeVector(), {blkDims, order, offset, offsets, strides}); bool canInplace = true; - for (size_t i = 0; i < getChildEdges().size(); i++) { - auto outDims = getChildEdgeAt(i)->getDims().ToSizeVector(); - blkDims = outDims; + for (size_t i = 0; i < outDims.size(); i++) { + auto dims = outDims[i].ToSizeVector(); + blkDims = dims; if (blkDims[1] % sizeS) { canInplace = false; @@ -159,7 +148,7 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() { } blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 
1lu : 0lu); blkDims.push_back(sizeS); - config.outConfs[i].desc = TensorDesc(Precision::FP32, outDims, {blkDims, order, offset, offsets, strides}); + config.outConfs[i].desc = TensorDesc(Precision::FP32, dims, {blkDims, order, offset, offsets, strides}); } if (canInplace) supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); @@ -408,37 +397,19 @@ void MKLDNNSplitNode::initOptimalPrimitiveDescriptor() { const auto& cnnLayer = getCnnLayer(); if (!cnnLayer) THROW_IE_EXCEPTION << "Cannot create Split layer " << getName() << " without CNNLayer!"; - if (config.outConfs.size() != getChildEdges().size()) + if (config.outConfs.size() != outDims.size()) THROW_IE_EXCEPTION << "Invalid config for Split layer " << getName(); size_t offset = 0; for (size_t i = 0; i < cnnLayer->outData.size(); i++) { - size_t confNum(0); - bool found = false; - for (size_t j = i; j < getChildEdges().size(); j++) { - if (cnnLayer->outData[i]->inputTo.find(getChildEdgeAt(j)->getChild()->getName()) == cnnLayer->outData[i]->inputTo.end()) - continue; - confNum = j; - config.outConfs[j].desc = InferenceEngine::TensorDesc(config.outConfs[j].desc.getPrecision(), - config.outConfs[j].desc.getDims(), { - config.outConfs[j].desc.getBlockingDesc().getBlockDims(), - config.outConfs[j].desc.getBlockingDesc().getOrder(), - config.inConfs[0].desc.getBlockingDesc().getOffsetPadding() + offset, - config.inConfs[0].desc.getBlockingDesc().getOffsetPaddingToData(), - config.inConfs[0].desc.getBlockingDesc().getStrides() - }); - found = true; - } - if (!found) { - confNum = i; - config.outConfs[i].desc = InferenceEngine::TensorDesc(config.outConfs[i].desc.getPrecision(), - config.outConfs[i].desc.getDims(), { - config.outConfs[i].desc.getBlockingDesc().getBlockDims(), - config.outConfs[i].desc.getBlockingDesc().getOrder(), - config.inConfs[0].desc.getBlockingDesc().getOffsetPadding() + offset, - config.inConfs[0].desc.getBlockingDesc().getOffsetPaddingToData(), - config.inConfs[0].desc.getBlockingDesc().getStrides() - }); - } + size_t confNum = i; + config.outConfs[i].desc = InferenceEngine::TensorDesc(config.outConfs[i].desc.getPrecision(), + config.outConfs[i].desc.getDims(), { + config.outConfs[i].desc.getBlockingDesc().getBlockDims(), + config.outConfs[i].desc.getBlockingDesc().getOrder(), + config.inConfs[0].desc.getBlockingDesc().getOffsetPadding() + offset, + config.inConfs[0].desc.getBlockingDesc().getOffsetPaddingToData(), + config.inConfs[0].desc.getBlockingDesc().getStrides() + }); size_t axisSize = 1; for (size_t j = axis; j < config.outConfs[confNum].desc.getBlockingDesc().getBlockDims().size(); j++) { axisSize *= config.outConfs[confNum].desc.getBlockingDesc().getBlockDims()[j]; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.h index 905f806..3fca021 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp index 1226716..b7cdd40 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel 
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp
index 1226716..b7cdd40 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.h
index d6a7594..572a98a 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/src/mkldnn_plugin/perf_count.h b/inference-engine/src/mkldnn_plugin/perf_count.h
index 3770a24..988054d 100644
--- a/inference-engine/src/mkldnn_plugin/perf_count.h
+++ b/inference-engine/src/mkldnn_plugin/perf_count.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/src/mkldnn_plugin/utils/blob_dump.cpp b/inference-engine/src/mkldnn_plugin/utils/blob_dump.cpp
index 24d2931..7f61fce 100644
--- a/inference-engine/src/mkldnn_plugin/utils/blob_dump.cpp
+++ b/inference-engine/src/mkldnn_plugin/utils/blob_dump.cpp
@@ -1,5 +1,17 @@
-// Copyright (C) 2018 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
+//
+// Copyright 2016-2018 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you (End User License Agreement for the Intel(R) Software
+// Development Products (Version May 2017)). Unless the License provides
+// otherwise, you may not use, modify, copy, publish, distribute, disclose or
+// transmit this software or the related documents without Intel's prior
+// written permission.
+//
+// This software and the related documents are provided as is, with no
+// express or implied warranties, other than those that are expressly
+// stated in the License.
 //
 
 #include "blob_dump.h"
diff --git a/inference-engine/src/mkldnn_plugin/utils/blob_dump.h b/inference-engine/src/mkldnn_plugin/utils/blob_dump.h
index 4130d53..1390c18 100644
--- a/inference-engine/src/mkldnn_plugin/utils/blob_dump.h
+++ b/inference-engine/src/mkldnn_plugin/utils/blob_dump.h
@@ -1,5 +1,17 @@
-// Copyright (C) 2018 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
+//
+// Copyright 2016-2018 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you (End User License Agreement for the Intel(R) Software
+// Development Products (Version May 2017)). Unless the License provides
+// otherwise, you may not use, modify, copy, publish, distribute, disclose or
+// transmit this software or the related documents without Intel's prior
+// written permission.
+//
+// This software and the related documents are provided as is, with no
+// express or implied warranties, other than those that are expressly
+// stated in the License.
 //
 
 #pragma once
diff --git a/inference-engine/tests/CMakeLists.txt b/inference-engine/tests/CMakeLists.txt
index 4fa0b44..2918415 100644
--- a/inference-engine/tests/CMakeLists.txt
+++ b/inference-engine/tests/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (C) 2018 Intel Corporation
+# Copyright (C) 2018-2019 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -11,35 +11,18 @@ set (CMAKE_CXX_STANDARD_REQUIRED ON)
 
 set (gtest_force_shared_crt ON CACHE BOOL "disable static CRT for google test")
 
-#detecting regex support
-if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9)
-    add_definitions(-DUSE_BOOST_RE)
-    set(USE_BOOST_RE ON)
-else()
-    set(USE_BOOST_RE OFF)
-endif()
-
 add_subdirectory(mock_engine)
 
-add_subdirectory(libs/gtest)
+#####################################################################################################
+#                                      SETUP GOOGLE TESTS                                           #
+#####################################################################################################
 
-include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}
-        ${gmock_SOURCE_DIR}/include
-        ${gmock_SOURCE_DIR}
-        ${IE_MAIN_SOURCE_DIR}/include
-        ${IE_MAIN_SOURCE_DIR}/src
-        ${IE_MAIN_SOURCE_DIR}/thirdparty/pugixml/src
-        ${IE_MAIN_SOURCE_DIR}/tests/helpers
-        ${IE_MAIN_SOURCE_DIR}/samples/common
-        ${IE_MAIN_SOURCE_DIR}/samples/common/format_reader
-        ${MKLDNN}/include)
+add_subdirectory(libs/gtest)
 
 #####################################################################################################
 #                                      SETUP GOOGLE TESTS                                           #
 #####################################################################################################
 
 enable_testing()
 
-link_directories(${LIBRARY_OUTPUT_DIRECTORY})
-
 add_subdirectory(helpers)
 add_subdirectory(unit)
diff --git a/inference-engine/tests/helpers/CMakeLists.txt b/inference-engine/tests/helpers/CMakeLists.txt
index 4ab1278..9c1e197 100644
--- a/inference-engine/tests/helpers/CMakeLists.txt
+++ b/inference-engine/tests/helpers/CMakeLists.txt
@@ -1,8 +1,7 @@
-# Copyright (C) 2018 Intel Corporation
+# Copyright (C) 2018-2019 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
 
-cmake_minimum_required(VERSION 2.8)
 set(TARGET_NAME helpers)
 
 file(GLOB HELPERS_SRC
@@ -15,16 +14,49 @@ file (GLOB HELPERS_INCLUDES
        )
 
 ## Enable Models multiple search paths
-message("configuring file: ${PROJECT_BINARY_DIR}/test_model_repo.h")
-configure_file(test_model_repo.hpp.in ${PROJECT_BINARY_DIR}/test_model_repo.hpp @ONLY)
+message("configuring file: ${CMAKE_CURRENT_BINARY_DIR}/test_model_repo.h")
+configure_file(test_model_repo.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/test_model_repo.hpp @ONLY)
 
 add_library(${TARGET_NAME} STATIC ${HELPERS_SRC} ${HELPERS_HEADERS})
-target_include_directories(${TARGET_NAME} PUBLIC ${PROJECT_BINARY_DIR})
-target_compile_definitions(${TARGET_NAME} PUBLIC -DMODELS_PATH=\"${MODELS_PATH}\")
+# detecting regex support
+if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9)
+    target_compile_definitions(${TARGET_NAME} PUBLIC USE_BOOST_RE)
+
+    debug_message(STATUS "Adding boost dependency")
+    if (VERBOSE_BUILD)
+        set(Boost_DEBUG on)
+    endif ()
+    find_package(Boost REQUIRED COMPONENTS regex)
+    target_link_libraries(${TARGET_NAME} PUBLIC ${Boost_REGEX_LIBRARY})
+    target_include_directories(${TARGET_NAME} PUBLIC ${Boost_INCLUDE_DIRS})
+endif()
+
+if(MSVC)
+    set(PUGI pugixml_mt)
+else()
+    set(PUGI pugixml)
+endif()
+
+if(WIN32)
+    target_include_directories(${TARGET_NAME} PUBLIC "${IE_MAIN_SOURCE_DIR}/samples/common")
+endif()
+
+target_link_libraries(${TARGET_NAME} PUBLIC inference_engine ${PUGI})
+
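
Note that USE_BOOST_RE moves from a global add_definitions to a compile definition attached to the helpers target, so only its consumers see the macro. Code built against the target typically switches regex implementations on that macro; a hedged sketch of the pattern (the namespace alias is illustrative, not taken from the patch):

#include <string>

// On GCC < 4.9 std::regex is incomplete, hence the boost::regex fallback.
#ifdef USE_BOOST_RE
#include <boost/regex.hpp>
namespace re = boost;   // illustrative alias
#else
#include <regex>
namespace re = std;
#endif

bool looksLikeVersion(const std::string& s) {
    // Identical call sites compile against either implementation.
    static const re::regex pattern("[0-9]+\\.[0-9]+");
    return re::regex_search(s, pattern);
}
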
+target_include_directories(${TARGET_NAME} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}" "${gtest_SOURCE_DIR}/include"
+        "${IE_MAIN_SOURCE_DIR}/src" "${IE_MAIN_SOURCE_DIR}/thirdparty/pugixml/src"
+        "${gmock_SOURCE_DIR}/include"
+        PRIVATE "${CMAKE_CURRENT_BINARY_DIR}")
+
+# TODO: eliminate dependency on samples
+target_include_directories(${TARGET_NAME} PUBLIC
+        "${IE_MAIN_SOURCE_DIR}/samples/common")
+
+target_compile_definitions(${TARGET_NAME} PUBLIC MODELS_PATH=\"${MODELS_PATH}\" DATA_PATH=\"${VALIDATION_SET}\")
 
 set_property(TARGET ${TARGET_NAME} PROPERTY CXX_STANDARD 11)
 set_property(TARGET ${TARGET_NAME} PROPERTY CXX_STANDARD_REQUIRED ON)
diff --git a/inference-engine/tests/helpers/disable_tests.hpp b/inference-engine/tests/helpers/disable_tests.hpp
index d0f0949..04c9b68 100644
--- a/inference-engine/tests/helpers/disable_tests.hpp
+++ b/inference-engine/tests/helpers/disable_tests.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
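
The helpers target now bakes both MODELS_PATH and the new DATA_PATH into every consumer as quoted string macros, so tests can compose file locations at compile time. A small hedged sketch; the fallback values and file names below are made up for illustration:

#include <iostream>
#include <string>

// MODELS_PATH and DATA_PATH normally arrive from target_compile_definitions;
// fallbacks keep this sketch standalone.
#ifndef MODELS_PATH
#define MODELS_PATH "/tmp/models"
#endif
#ifndef DATA_PATH
#define DATA_PATH "/tmp/validation_set"
#endif

int main() {
    // Adjacent string literals concatenate, so no runtime joining is needed.
    const std::string model = MODELS_PATH "/some_model.xml";   // hypothetical file
    const std::string image = DATA_PATH "/224x224/cat3.bmp";   // hypothetical file
    std::cout << model << "\n" << image << "\n";
    return 0;
}
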
diff --git a/inference-engine/tests/helpers/ir_gen_helper.cpp b/inference-engine/tests/helpers/ir_gen_helper.cpp
index 40a05c4..3679d29 100644
--- a/inference-engine/tests/helpers/ir_gen_helper.cpp
+++ b/inference-engine/tests/helpers/ir_gen_helper.cpp
@@ -1,5 +1,17 @@
-// Copyright (C) 2018 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
+//
+// Copyright 2017-2018 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you (End User License Agreement for the Intel(R) Software
+// Development Products (Version May 2017)). Unless the License provides
+// otherwise, you may not use, modify, copy, publish, distribute, disclose or
+// transmit this software or the related documents without Intel's prior
+// written permission.
+//
+// This software and the related documents are provided as is, with no
+// express or implied warranties, other than those that are expressly
+// stated in the License.
 //
 
 #include "ir_gen_helper.hpp"
diff --git a/inference-engine/tests/helpers/ir_gen_helper.hpp b/inference-engine/tests/helpers/ir_gen_helper.hpp
index db8bff5..bdb0e16 100644
--- a/inference-engine/tests/helpers/ir_gen_helper.hpp
+++ b/inference-engine/tests/helpers/ir_gen_helper.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -19,7 +19,7 @@ namespace single_layer_tests {
                                            const std::string& precision,
                                            const std::string& layers,
                                            const std::string& edges,
-                                           const unsigned ir_version = 4u);
+                                           const unsigned ir_version = 5u);
 };
 
 } // namespace single_layer_tests
diff --git a/inference-engine/tests/helpers/single_layer_common.cpp b/inference-engine/tests/helpers/single_layer_common.cpp
index 434d3f2..d0310b0 100644
--- a/inference-engine/tests/helpers/single_layer_common.cpp
+++ b/inference-engine/tests/helpers/single_layer_common.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/helpers/single_layer_common.hpp b/inference-engine/tests/helpers/single_layer_common.hpp
index 1354129..a5cc968 100644
--- a/inference-engine/tests/helpers/single_layer_common.hpp
+++ b/inference-engine/tests/helpers/single_layer_common.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -54,6 +54,7 @@ struct conv_common_params {
     std::string auto_pad;
     size_t group;
     size_t out_c;
+    bool with_bias;
 };
 
 struct pool_common_params {
@@ -66,6 +67,11 @@ struct pool_common_params {
     bool exclude_pad;
 };
 
+struct eltwise_common_params {
+    std::string operation;
+    std::vector<float> coeff;
+};
+
 #define PRETTY_PARAM(name, type) \
     class name \
     { \
@@ -104,7 +110,7 @@ template <class T>
 inline InferenceEngine::details::CNNNetworkImplPtr
 buildSingleLayerNetworkCommon(InferenceEngine::details::IFormatParser *parser,
                               const std::string &layerType,
-                              const testing::InOutData &inOutShapes,
+                              const testing::InOutShapes &inOutShapes,
                               std::map<std::string, std::string> *params,
                               const std::string &layerDataName = "data",
                               const InferenceEngine::Precision &precision = InferenceEngine::Precision::FP32,
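
The new eltwise_common_params mirrors the existing conv/pool parameter structs: tests fill it in and flatten the fields into the string map that buildSingleLayerNetworkCommon serializes into the layer's IR attributes. A hedged sketch of that flattening step; the helper name is illustrative, not from the patch:

#include <map>
#include <sstream>
#include <string>
#include <vector>

// Same shape as the struct added in this hunk.
struct eltwise_common_params {
    std::string operation;
    std::vector<float> coeff;
};

// Illustrative helper: turn the params into the <data> attribute map.
std::map<std::string, std::string> toLayerParams(const eltwise_common_params& p) {
    std::ostringstream coeffs;
    for (size_t i = 0; i < p.coeff.size(); ++i)
        coeffs << (i ? "," : "") << p.coeff[i];   // "1.0,0.5" style list
    return {{"operation", p.operation}, {"coeff", coeffs.str()}};
}
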
diff --git a/inference-engine/tests/helpers/test_assertions.hpp b/inference-engine/tests/helpers/test_assertions.hpp
index 5e2ee36..44bbdb0 100644
--- a/inference-engine/tests/helpers/test_assertions.hpp
+++ b/inference-engine/tests/helpers/test_assertions.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/helpers/test_model_path.hpp b/inference-engine/tests/helpers/test_model_path.hpp
index 73f4fc6..a0acd93 100644
--- a/inference-engine/tests/helpers/test_model_path.hpp
+++ b/inference-engine/tests/helpers/test_model_path.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/helpers/test_model_repo.hpp.in b/inference-engine/tests/helpers/test_model_repo.hpp.in
index 5356f98..6c3f3be 100644
--- a/inference-engine/tests/helpers/test_model_repo.hpp.in
+++ b/inference-engine/tests/helpers/test_model_repo.hpp.in
@@ -1,5 +1,17 @@
-// Copyright (C) 2018 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
+//
+// Copyright 2017-2018 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you (End User License Agreement for the Intel(R) Software
+// Development Products (Version May 2017)). Unless the License provides
+// otherwise, you may not use, modify, copy, publish, distribute, disclose or
+// transmit this software or the related documents without Intel's prior
+// written permission.
+//
+// This software and the related documents are provided as is, with no
+// express or implied warranties, other than those that are expressly
+// stated in the License.
 //
 
 #pragma once
diff --git a/inference-engine/tests/helpers/test_models_path.cpp b/inference-engine/tests/helpers/test_models_path.cpp
index 69d97b8..cef7482 100644
--- a/inference-engine/tests/helpers/test_models_path.cpp
+++ b/inference-engine/tests/helpers/test_models_path.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/helpers/tests_common.hpp b/inference-engine/tests/helpers/tests_common.hpp
index d9698ae..08135ac 100644
--- a/inference-engine/tests/helpers/tests_common.hpp
+++ b/inference-engine/tests/helpers/tests_common.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -27,8 +27,83 @@
 #include "Psapi.h"
 #endif
 
+class BaseTestCreator {
+protected:
+    std::string _type;
+public:
+    explicit BaseTestCreator(const std::string& type) : _type(type) {}
+
+    virtual InferenceEngine::CNNLayerPtr create(const std::string& type) = 0;
+
+    virtual bool shouldCreate(const std::string& type) = 0;
+};
+
+template<class T>
+class LayerTestCreator : public BaseTestCreator {
+public:
+    explicit LayerTestCreator(const std::string& type) : BaseTestCreator(type) {}
+
+    InferenceEngine::CNNLayerPtr create(const std::string& type) override {
+        InferenceEngine::LayerParams params;
+        params.type = type;
+        return std::make_shared<T>(params);
+    }
+
+    bool shouldCreate(const std::string& type) override {
+        return type == _type;
+    }
+};
+
 class TestsCommon : public ::testing::Test {
+private:
+    static std::vector<std::shared_ptr<BaseTestCreator>>& getCreators() {
+        // there should be unique_ptr but it cant be used with initializer lists
+        static std::vector<std::shared_ptr<BaseTestCreator> > creators = {
+                std::make_shared<LayerTestCreator<InferenceEngine::PowerLayer>>("Power"),
+                std::make_shared<LayerTestCreator<InferenceEngine::ConvolutionLayer>>("Convolution"),
+                std::make_shared<LayerTestCreator<InferenceEngine::DeconvolutionLayer>>("Deconvolution"),
+                std::make_shared<LayerTestCreator<InferenceEngine::PoolingLayer>>("Pooling"),
+                std::make_shared<LayerTestCreator<InferenceEngine::FullyConnectedLayer>>("InnerProduct"),
+                std::make_shared<LayerTestCreator<InferenceEngine::FullyConnectedLayer>>("FullyConnected"),
+                std::make_shared<LayerTestCreator<InferenceEngine::NormLayer>>("LRN"),
+                std::make_shared<LayerTestCreator<InferenceEngine::NormLayer>>("Norm"),
+                std::make_shared<LayerTestCreator<InferenceEngine::SoftMaxLayer>>("Softmax"),
+                std::make_shared<LayerTestCreator<InferenceEngine::SoftMaxLayer>>("SoftMax"),
+                std::make_shared<LayerTestCreator<InferenceEngine::GRNLayer>>("GRN"),
+                std::make_shared<LayerTestCreator<InferenceEngine::MVNLayer>>("MVN"),
+                std::make_shared<LayerTestCreator<InferenceEngine::ReLULayer>>("ReLU"),
+                std::make_shared<LayerTestCreator<InferenceEngine::ClampLayer>>("Clamp"),
+                std::make_shared<LayerTestCreator<InferenceEngine::SplitLayer>>("Split"),
+                std::make_shared<LayerTestCreator<InferenceEngine::SplitLayer>>("Slice"),
+                std::make_shared<LayerTestCreator<InferenceEngine::ConcatLayer>>("Concat"),
+                std::make_shared<LayerTestCreator<InferenceEngine::EltwiseLayer>>("Eltwise"),
+                std::make_shared<LayerTestCreator<InferenceEngine::ScaleShiftLayer>>("ScaleShift"),
+                std::make_shared<LayerTestCreator<InferenceEngine::PReLULayer>>("PReLU"),
+                std::make_shared<LayerTestCreator<InferenceEngine::CropLayer>>("Crop"),
+                std::make_shared<LayerTestCreator<InferenceEngine::ReshapeLayer>>("Reshape"),
+                std::make_shared<LayerTestCreator<InferenceEngine::TileLayer>>("Tile"),
+                std::make_shared<LayerTestCreator<InferenceEngine::BatchNormalizationLayer>>("BatchNormalization"),
+                std::make_shared<LayerTestCreator<InferenceEngine::GemmLayer>>("Gemm"),
+                std::make_shared<LayerTestCreator<InferenceEngine::PadLayer>>("Pad"),
+                std::make_shared<LayerTestCreator<InferenceEngine::GatherLayer>>("Gather"),
+                std::make_shared<LayerTestCreator<InferenceEngine::StridedSliceLayer>>("StridedSlice"),
+                std::make_shared<LayerTestCreator<InferenceEngine::ShuffleChannelsLayer>>("ShuffleChannels"),
+                std::make_shared<LayerTestCreator<InferenceEngine::DepthToSpaceLayer>>("DepthToSpace"),
+                std::make_shared<LayerTestCreator<InferenceEngine::ReverseSequenceLayer>>("ReverseSequence")
+        };
+        return creators;
+    }
 public:
+    static InferenceEngine::CNNLayer::Ptr createLayer(const std::string& type) {
+        for (auto& creator : getCreators()) {
+            if (!creator->shouldCreate(type))
+                continue;
+            return creator->create(type);
+        }
+        static LayerTestCreator<InferenceEngine::GenericLayer> genericCreator("");
+        return genericCreator.create(type);
+    }
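
The template arguments in the block above were stripped during extraction and are reconstructed here from the layer type strings. createLayer gives every test a one-line factory for a typed CNNLayer stub, with a generic fallback for unregistered type strings. A hedged usage sketch (the attribute shown is illustrative):

#include "tests_common.hpp"

// Hedged usage sketch: the returned pointer is a default-constructed layer of
// the registered subtype, ready for type-specific fields before wiring it up.
void exampleCreateLayer() {
    InferenceEngine::CNNLayer::Ptr relu = TestsCommon::createLayer("ReLU");
    relu->params["negative_slope"] = "0.0";  // illustrative attribute

    // Unregistered strings still succeed via the generic fallback creator.
    InferenceEngine::CNNLayer::Ptr custom = TestsCommon::createLayer("MyCustomOp");
    (void)custom;
}
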
+
     static size_t parseLine(char* line) {
         // This assumes that a digit will be found and the line ends in " Kb".
         size_t i = strlen(line);
@@ -56,12 +131,12 @@ public:
         return result;
     }
 #ifdef _WIN32
-        static size_t getVmSizeInKBWin() {
-            PROCESS_MEMORY_COUNTERS pmc;
-            pmc.cb = sizeof(PROCESS_MEMORY_COUNTERS);
-            GetProcessMemoryInfo(GetCurrentProcess(),&pmc, pmc.cb);
-            return pmc.WorkingSetSize;
-        }
+    static size_t getVmSizeInKBWin() {
+        PROCESS_MEMORY_COUNTERS pmc;
+        pmc.cb = sizeof(PROCESS_MEMORY_COUNTERS);
+        GetProcessMemoryInfo(GetCurrentProcess(),&pmc, pmc.cb);
+        return pmc.WorkingSetSize;
+    }
 #endif
 
 public:
@@ -135,8 +210,8 @@ public:
         return make_so_name(input);
     }
 
-    static void fill_data(InferenceEngine::Blob::Ptr blob) {
-        fill_data(blob->buffer().as<float*>(), blob->size());
+    static void fill_data(InferenceEngine::Blob::Ptr& blob) {
+        fill_data(blob->buffer().as<float*>(), blob->byteSize() / sizeof(float));
     }
 
     static void fill_data(float *data, size_t size, size_t duty_ratio = 10) {
@@ -149,6 +224,25 @@ public:
         }
     }
 
+    static void fill_data_non_zero(int32_t *data, size_t size, int n) {
+        for (size_t i = 0; i < size; i++) {
+            data[i] = n*i%254+1;
+        }
+    }
+
+    static void fill_data_bin(float *data, size_t size) {
+        for (size_t i = 0; i < size; i++) {
+            data[i] = sinf((float)i) > 0.f ? 1.f : -1.f;
+        }
+    }
+
+    static void fill_data_bin_packed(int8_t *data, size_t size) {
+        int nbits = 8;
+        for (size_t i = 0; i < div_up(size, nbits); i++) {
+            data[i] = static_cast<int8_t>(i % 255);
+        }
+    }
+
     static void fill_data_sine(float *data, size_t size, float center, float ampl, float omega) {
         for (size_t i = 0; i < size; i++) {
             data[i] = center + ampl * sin((float)i * omega);
@@ -168,7 +262,6 @@ public:
     }
 
     static void compare(InferenceEngine::Blob &res, InferenceEngine::Blob &ref, float max_diff = 0.01f) {
-
         float *res_ptr = res.buffer().as<float*>();
         size_t res_size = res.size();
@@ -183,7 +276,6 @@ public:
     }
 
     static void compare_NRMSD(InferenceEngine::Blob &res, InferenceEngine::Blob &ref, float max_nrmsd = 0.01f) {
-
         float *res_ptr = res.buffer().as<float*>();
         size_t res_size = res.size();
@@ -224,8 +316,7 @@ public:
         }
     }
 
-    void replace(std::string& str, const std::string& from, const std::string& to)
-    {
+    void replace(std::string& str, const std::string& from, const std::string& to) {
         std::string::size_type pos = 0;
 
         while((pos = str.find(from, pos)) != std::string::npos) {
@@ -326,6 +417,11 @@ public:
         return sts;
     }
+
+    template <typename T, typename U>
+    static inline T div_up(const T a, const U b) {
+        assert(b);
+        return (a + b - 1) / b;
+    }
 };
diff --git a/inference-engine/tests/helpers/tests_common_func.hpp b/inference-engine/tests/helpers/tests_common_func.hpp
index 387d5a6..66e0ed8 100644
--- a/inference-engine/tests/helpers/tests_common_func.hpp
+++ b/inference-engine/tests/helpers/tests_common_func.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/helpers/tests_file_utils.cpp b/inference-engine/tests/helpers/tests_file_utils.cpp
index b23e726..8bc0612 100644
--- a/inference-engine/tests/helpers/tests_file_utils.cpp
+++ b/inference-engine/tests/helpers/tests_file_utils.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
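
div_up is the rounding-up integer division that fill_data_bin_packed relies on: packing size binary values eight per byte touches div_up(size, 8) bytes, so a non-multiple-of-eight count still gets its trailing partial byte. A quick standalone check of the arithmetic:

#include <cassert>
#include <iostream>

// Same shape as the helper added above.
template <typename T, typename U>
static inline T div_up(const T a, const U b) {
    assert(b);
    return (a + b - 1) / b;
}

int main() {
    std::cout << div_up(20, 8) << "\n";  // prints 3: 20 bits need 3 bytes, not 2
    std::cout << div_up(16, 8) << "\n";  // prints 2: exact division is unchanged
    return 0;
}
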
diff --git a/inference-engine/tests/helpers/tests_file_utils.hpp b/inference-engine/tests/helpers/tests_file_utils.hpp
index dbfa50c..3abb891 100644
--- a/inference-engine/tests/helpers/tests_file_utils.hpp
+++ b/inference-engine/tests/helpers/tests_file_utils.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/helpers/tests_utils.hpp b/inference-engine/tests/helpers/tests_utils.hpp
index 3a44889..21351ed 100644
--- a/inference-engine/tests/helpers/tests_utils.hpp
+++ b/inference-engine/tests/helpers/tests_utils.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/helpers/version_printer.cpp b/inference-engine/tests/helpers/version_printer.cpp
index 7448c99..7aa1ba4 100644
--- a/inference-engine/tests/helpers/version_printer.cpp
+++ b/inference-engine/tests/helpers/version_printer.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/helpers/xml_father.hpp b/inference-engine/tests/helpers/xml_father.hpp
index 90b7d73..243a38b 100644
--- a/inference-engine/tests/helpers/xml_father.hpp
+++ b/inference-engine/tests/helpers/xml_father.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/helpers/xml_helper.hpp b/inference-engine/tests/helpers/xml_helper.hpp
index 75cc131..85d389f 100644
--- a/inference-engine/tests/helpers/xml_helper.hpp
+++ b/inference-engine/tests/helpers/xml_helper.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/helpers/xml_net_builder.cpp b/inference-engine/tests/helpers/xml_net_builder.cpp
index 45f9672..e313ba0 100644
--- a/inference-engine/tests/helpers/xml_net_builder.cpp
+++ b/inference-engine/tests/helpers/xml_net_builder.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -21,7 +21,7 @@ void IDManager::reset() {
     portID = layerID = 0;
 }
 
-LayerDesc::LayerDesc(std::string type, InOutData& shapes, IDManager &id_manager) : _type(std::move(type)) {
+LayerDesc::LayerDesc(std::string type, InOutShapes& shapes, IDManager &id_manager) : _type(std::move(type)) {
     _layerID = id_manager.getNextLayerID();
     auto inDims = shapes.inDims;
     auto outDims = shapes.outDims;
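
The InOutData to InOutShapes rename is mechanical but touches every builder signature in the header that follows; the struct itself is just the lists of input and output dimension vectors a generated layer should have. A hedged sketch of constructing one (the shapes are illustrative):

#include <cstddef>
#include <vector>

// Minimal stand-in for the renamed struct; the real one also has operator<<.
struct InOutShapes {
    std::vector<std::vector<std::size_t>> inDims;
    std::vector<std::vector<std::size_t>> outDims;
};

int main() {
    // One 1x3x224x224 input and one 1x64x112x112 output, e.g. a convolution.
    InOutShapes shapes;
    shapes.inDims.push_back({1, 3, 224, 224});
    shapes.outDims.push_back({1, 64, 112, 112});
    return 0;
}
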
diff --git a/inference-engine/tests/helpers/xml_net_builder.hpp b/inference-engine/tests/helpers/xml_net_builder.hpp
index 81fa21d..ba9f1a7 100644
--- a/inference-engine/tests/helpers/xml_net_builder.hpp
+++ b/inference-engine/tests/helpers/xml_net_builder.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -22,11 +22,11 @@ struct CropData {
 
 typedef std::vector<CropData> CropParams;
 
-struct InOutData {
+struct InOutShapes {
     std::vector<std::vector<size_t>> inDims;
     std::vector<std::vector<size_t>> outDims;
 
-    friend std::ostream& operator<<(std::ostream& os, InOutData const& inout) {
+    friend std::ostream& operator<<(std::ostream& os, InOutShapes const& inout) {
         auto dumpVec = [](const std::vector<size_t>& vec) -> std::string {
             if (vec.empty()) return "[]";
             std::stringstream oss;
@@ -137,7 +137,7 @@ public:
      * @param type - string with type of the layer
      * @param shapes - reference to the structure with input and output shapes
      */
-    explicit LayerDesc(std::string type, InOutData& shapes, IDManager &id_manager);
+    explicit LayerDesc(std::string type, InOutShapes& shapes, IDManager &id_manager);
 
     /**
      * @brief Resets current input and output ports to iterate over all input and output ports
@@ -252,7 +252,7 @@ public:
         return EdgesBuilder(exp.node("edges"), layersDesc);
     }
 
-    XmlNetBuilder& cropLayer(CropParams params, const InOutData& inout) {
+    XmlNetBuilder& cropLayer(CropParams params, const InOutShapes& inout) {
         std::map<std::string, std::string> generalParams;
         for (CropData crop : params) {
             generalParams["axis"] = std::to_string(crop.axis);
@@ -262,7 +262,7 @@ public:
         return addLayer("Crop", "", &generalParams, inout, 0, 0, "crop-data");
     }
 
-    XmlNetBuilder& convolutionLayer(const std::string& precision, const InOutData& inout) {
+    XmlNetBuilder& convolutionLayer(const std::string& precision, const InOutShapes& inout) {
         std::map<std::string, std::string> params{
                 {"stride-x", "4"},
                 {"stride-y", "4"},
@@ -275,7 +275,7 @@ public:
         return addLayer("Convolution", precision, &params, inout, 0, 0, "convolution_data");
     }
 
-    XmlNetBuilder& poolingLayer(const InOutData& inout) {
+    XmlNetBuilder& poolingLayer(const InOutShapes& inout) {
         std::map<std::string, std::string> params{
                 {"stride-x", "4"},
                 {"stride-y", "4"},
@@ -289,7 +289,7 @@ public:
 
     struct TIPortMap { int from_l, from_p, to_l, to_p, axis, stride, start, end; };
 
-    XmlNetBuilder& TILayer(InOutData inout,
+    XmlNetBuilder& TILayer(InOutShapes inout,
                            std::string body,
                            std::vector<TIPortMap> inMap,
                            std::vector<TIPortMap> outMap,
@@ -329,7 +329,7 @@ public:
     XmlNetBuilder& addLayer(const std::string& type,
                             const std::string& precision,
                             std::map<std::string, std::string>* params,
-                            InOutData inout,
+                            InOutShapes inout,
                             int weightsSize = 0,
                             int biasesSize = 0,
                             std::string layerDataName = "data",
@@ -361,7 +361,7 @@ public:
     }
 
     XmlNetBuilder& addInputLayer(const std::string& precision, const std::vector<size_t>& out) {
-        InOutData inout{};
+        InOutShapes inout{};
         inout.outDims.push_back(out);
         return addLayer("Input", precision, nullptr, inout);
     }
diff --git a/inference-engine/tests/libs/gtest/googlemock/msvc/2005/gmock.sln b/inference-engine/tests/libs/gtest/googlemock/msvc/2005/gmock.sln
new file mode 100644
index 0000000..0cf57a3
--- /dev/null
+++ b/inference-engine/tests/libs/gtest/googlemock/msvc/2005/gmock.sln
@@ -0,0 +1,32 @@
+
+Microsoft Visual Studio Solution File, Format Version 9.00
+# Visual Studio 2005
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gmock", "gmock.vcproj", "{34681F0D-CE45-415D-B5F2-5C662DFE3BD5}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gmock_test", "gmock_test.vcproj", "{F10D22F8-AC7B-4213-8720-608E7D878CD2}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gmock_main", "gmock_main.vcproj", "{E4EF614B-30DF-4954-8C53-580A0BF6B589}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{34681F0D-CE45-415D-B5F2-5C662DFE3BD5}.Debug|Win32.ActiveCfg = Debug|Win32
+		{34681F0D-CE45-415D-B5F2-5C662DFE3BD5}.Debug|Win32.Build.0 = Debug|Win32
+		{34681F0D-CE45-415D-B5F2-5C662DFE3BD5}.Release|Win32.ActiveCfg = Release|Win32
+		{34681F0D-CE45-415D-B5F2-5C662DFE3BD5}.Release|Win32.Build.0 = Release|Win32
+		{F10D22F8-AC7B-4213-8720-608E7D878CD2}.Debug|Win32.ActiveCfg = Debug|Win32
+		{F10D22F8-AC7B-4213-8720-608E7D878CD2}.Debug|Win32.Build.0 = Debug|Win32
+		{F10D22F8-AC7B-4213-8720-608E7D878CD2}.Release|Win32.ActiveCfg = Release|Win32
+		{F10D22F8-AC7B-4213-8720-608E7D878CD2}.Release|Win32.Build.0 = Release|Win32
+		{E4EF614B-30DF-4954-8C53-580A0BF6B589}.Debug|Win32.ActiveCfg = Debug|Win32
+		{E4EF614B-30DF-4954-8C53-580A0BF6B589}.Debug|Win32.Build.0 = Debug|Win32
+		{E4EF614B-30DF-4954-8C53-580A0BF6B589}.Release|Win32.ActiveCfg = Release|Win32
+		{E4EF614B-30DF-4954-8C53-580A0BF6B589}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
diff --git a/inference-engine/tests/libs/gtest/googlemock/msvc/2010/gmock.sln b/inference-engine/tests/libs/gtest/googlemock/msvc/2010/gmock.sln
new file mode 100644
index 0000000..f192bd2
--- /dev/null
+++ b/inference-engine/tests/libs/gtest/googlemock/msvc/2010/gmock.sln
@@ -0,0 +1,46 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual C++ Express 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gmock", "gmock.vcxproj", "{34681F0D-CE45-415D-B5F2-5C662DFE3BD5}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gmock_test", "gmock_test.vcxproj", "{F10D22F8-AC7B-4213-8720-608E7D878CD2}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gmock_main", "gmock_main.vcxproj", "{E4EF614B-30DF-4954-8C53-580A0BF6B589}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Debug|x64 = Debug|x64
+		Release|Win32 = Release|Win32
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{34681F0D-CE45-415D-B5F2-5C662DFE3BD5}.Debug|Win32.ActiveCfg = Debug|Win32
+		{34681F0D-CE45-415D-B5F2-5C662DFE3BD5}.Debug|Win32.Build.0 = Debug|Win32
+		{34681F0D-CE45-415D-B5F2-5C662DFE3BD5}.Debug|x64.ActiveCfg = Debug|x64
+		{34681F0D-CE45-415D-B5F2-5C662DFE3BD5}.Debug|x64.Build.0 = Debug|x64
+		{34681F0D-CE45-415D-B5F2-5C662DFE3BD5}.Release|Win32.ActiveCfg = Release|Win32
+		{34681F0D-CE45-415D-B5F2-5C662DFE3BD5}.Release|Win32.Build.0 = Release|Win32
+		{34681F0D-CE45-415D-B5F2-5C662DFE3BD5}.Release|x64.ActiveCfg = Release|x64
+		{34681F0D-CE45-415D-B5F2-5C662DFE3BD5}.Release|x64.Build.0 = Release|x64
+		{F10D22F8-AC7B-4213-8720-608E7D878CD2}.Debug|Win32.ActiveCfg = Debug|Win32
+		{F10D22F8-AC7B-4213-8720-608E7D878CD2}.Debug|Win32.Build.0 = Debug|Win32
+		{F10D22F8-AC7B-4213-8720-608E7D878CD2}.Debug|x64.ActiveCfg = Debug|x64
+		{F10D22F8-AC7B-4213-8720-608E7D878CD2}.Debug|x64.Build.0 = Debug|x64
+		{F10D22F8-AC7B-4213-8720-608E7D878CD2}.Release|Win32.ActiveCfg = Release|Win32
+		{F10D22F8-AC7B-4213-8720-608E7D878CD2}.Release|Win32.Build.0 = Release|Win32
+		{F10D22F8-AC7B-4213-8720-608E7D878CD2}.Release|x64.ActiveCfg = Release|x64
+		{F10D22F8-AC7B-4213-8720-608E7D878CD2}.Release|x64.Build.0 = Release|x64
+		{E4EF614B-30DF-4954-8C53-580A0BF6B589}.Debug|Win32.ActiveCfg = Debug|Win32
+		{E4EF614B-30DF-4954-8C53-580A0BF6B589}.Debug|Win32.Build.0 = Debug|Win32
+		{E4EF614B-30DF-4954-8C53-580A0BF6B589}.Debug|x64.ActiveCfg = Debug|x64
+		{E4EF614B-30DF-4954-8C53-580A0BF6B589}.Debug|x64.Build.0 = Debug|x64
+		{E4EF614B-30DF-4954-8C53-580A0BF6B589}.Release|Win32.ActiveCfg = Release|Win32
+		{E4EF614B-30DF-4954-8C53-580A0BF6B589}.Release|Win32.Build.0 = Release|Win32
+		{E4EF614B-30DF-4954-8C53-580A0BF6B589}.Release|x64.ActiveCfg = Release|x64
+		{E4EF614B-30DF-4954-8C53-580A0BF6B589}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal

[The remaining hunks in this stretch add the stock googletest/googlemock Visual Studio build scaffolding under inference-engine/tests/libs/gtest/. The XML bodies of the project files were mangled during extraction (markup tags were stripped, leaving only text content), and the accompanying solution tables are near-duplicates of the 2010 gmock.sln reproduced above, so only the file list is kept here: googlemock/msvc/2010/{gmock.vcxproj, gmock_main.vcxproj, gmock_test.vcxproj}; googlemock/msvc/2015/{gmock.sln, gmock.vcxproj, gmock_main.vcxproj, gmock_test.vcxproj}; googletest/msvc/2010/{gtest-md.sln, gtest-md.vcxproj, gtest-md.vcxproj.filters, gtest.sln, gtest.vcxproj, gtest.vcxproj.filters, gtest_main-md.vcxproj, gtest_main-md.vcxproj.filters, gtest_main.vcxproj}. Each recoverable fragment shows the usual layout: StaticLibrary projects (Application for the test runners) configured for Debug and Release on Win32 and x64, toolset v100 for the 2010 solutions and v140 for the 2015 ones, with the -md variants using the DLL runtime (MultiThreadedDLL) and the plain variants using the static runtime.]
+ v100 + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.40219.1 + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)temp\$(ProjectName)\ + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)temp\$(ProjectName)\ + + + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)temp\$(ProjectName)\ + gtest_maind + + + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)temp\$(ProjectName)\ + gtest_main + + + gtest_maind + + + gtest_main + + + + Disabled + WIN32;_VARIADIC_MAX=10;_DEBUG;_LIB;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + + + Level3 + EditAndContinue + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + $(OutDir)$(ProjectName)d.lib + + + + + Disabled + WIN32;_VARIADIC_MAX=10;_DEBUG;_LIB;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebug + + + Level3 + ProgramDatabase + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + $(OutDir)$(ProjectName)d.lib + + + + + WIN32;_VARIADIC_MAX=10;NDEBUG;_LIB;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + ProgramDatabase + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + $(OutDir)$(ProjectName).lib + + + + + WIN32;_VARIADIC_MAX=10;NDEBUG;_LIB;%(PreprocessorDefinitions) + MultiThreaded + + + Level3 + ProgramDatabase + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + $(OutDir)$(ProjectName).lib + + + + + ..;..\include;%(AdditionalIncludeDirectories) + ..;..\include;%(AdditionalIncludeDirectories) + ..;..\include;%(AdditionalIncludeDirectories) + ..;..\include;%(AdditionalIncludeDirectories) + + + + + {c8f6c172-56f2-4e76-b5fa-c3b423b31be7} + + + + + + \ No newline at end of file diff --git a/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_main.vcxproj.filters b/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_main.vcxproj.filters new file mode 100644 index 0000000..726c773 --- /dev/null +++ b/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_main.vcxproj.filters @@ -0,0 +1,18 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + + + Source Files + + + \ No newline at end of file diff --git a/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_prod_test-md.vcxproj b/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_prod_test-md.vcxproj new file mode 100644 index 0000000..830e5dc --- /dev/null +++ b/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_prod_test-md.vcxproj @@ -0,0 +1,199 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {24848551-EF4F-47E8-9A9D-EA4D49BC3ECB} + Win32Proj + + + + Application + MultiByte + v100 + + + Application + MultiByte + v100 + + + Application + MultiByte + v100 + + + Application + MultiByte + v100 + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.40219.1 + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)$(ProjectName)\ + true + true + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)$(ProjectName)\ + false + false + + + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)$(ProjectName)\ + gtest_prod_test + + + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)$(ProjectName)\ + gtest_prod_test + + + gtest_prod_test + + + gtest_prod_test + + + + Disabled + WIN32;_VARIADIC_MAX=10;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true 
+ EnableFastChecks + MultiThreadedDebugDLL + Use + Level3 + EditAndContinue + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + true + $(OutDir)gtest_prod_test.pdb + Console + MachineX86 + + + + + Disabled + WIN32;_VARIADIC_MAX=10;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + Use + Level3 + ProgramDatabase + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + true + $(OutDir)gtest_prod_test.pdb + Console + + + + + WIN32;_VARIADIC_MAX=10;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + Use + Level3 + ProgramDatabase + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + true + Console + true + true + MachineX86 + + + + + WIN32;_VARIADIC_MAX=10;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + Use + Level3 + ProgramDatabase + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + true + Console + true + true + + + + + ..;..\include;%(AdditionalIncludeDirectories) + ..;..\include;%(AdditionalIncludeDirectories) + + + + + ..;..\include;%(AdditionalIncludeDirectories) + ..;..\include;%(AdditionalIncludeDirectories) + + + + + + + ..;..\include;%(AdditionalIncludeDirectories) + ..;..\include;%(AdditionalIncludeDirectories) + + + + + ..;..\include;%(AdditionalIncludeDirectories) + ..;..\include;%(AdditionalIncludeDirectories) + + + + + + + + + + + + {3af54c8a-10bf-4332-9147-f68ed9862033} + + + + + + \ No newline at end of file diff --git a/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_prod_test-md.vcxproj.filters b/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_prod_test-md.vcxproj.filters new file mode 100644 index 0000000..ac36731 --- /dev/null +++ b/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_prod_test-md.vcxproj.filters @@ -0,0 +1,26 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + + + Source Files + + + Source Files + + + + + Header Files + + + \ No newline at end of file diff --git a/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_prod_test.vcxproj b/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_prod_test.vcxproj new file mode 100644 index 0000000..d42e135 --- /dev/null +++ b/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_prod_test.vcxproj @@ -0,0 +1,191 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {24848551-EF4F-47E8-9A9D-EA4D49BC3ECA} + Win32Proj + + + + Application + MultiByte + v100 + + + Application + MultiByte + v100 + + + Application + MultiByte + v100 + + + Application + MultiByte + v100 + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.40219.1 + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)temp\$(ProjectName)\ + true + true + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)temp\$(ProjectName)\ + false + false + + + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)temp\$(ProjectName)\ + + + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)temp\$(ProjectName)\ + + + + Disabled + WIN32;_VARIADIC_MAX=10;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + Use + Level3 + EditAndContinue + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + true + $(OutDir)gtest_prod_test.pdb + Console + MachineX86 + + + + + Disabled + WIN32;_VARIADIC_MAX=10;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + 
EnableFastChecks + MultiThreadedDebug + Use + Level3 + ProgramDatabase + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + true + $(OutDir)gtest_prod_test.pdb + Console + + + + + WIN32;_VARIADIC_MAX=10;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreaded + Use + Level3 + ProgramDatabase + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + true + Console + true + true + MachineX86 + + + + + WIN32;_VARIADIC_MAX=10;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreaded + Use + Level3 + ProgramDatabase + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + true + Console + true + true + + + + + ..;..\include;%(AdditionalIncludeDirectories) + ..;..\include;%(AdditionalIncludeDirectories) + + + + + ..;..\include;%(AdditionalIncludeDirectories) + ..;..\include;%(AdditionalIncludeDirectories) + + + + + + + ..;..\include;%(AdditionalIncludeDirectories) + ..;..\include;%(AdditionalIncludeDirectories) + + + + + ..;..\include;%(AdditionalIncludeDirectories) + ..;..\include;%(AdditionalIncludeDirectories) + + + + + + + + + + + + {3af54c8a-10bf-4332-9147-f68ed9862032} + + + + + + \ No newline at end of file diff --git a/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_prod_test.vcxproj.filters b/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_prod_test.vcxproj.filters new file mode 100644 index 0000000..ac36731 --- /dev/null +++ b/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_prod_test.vcxproj.filters @@ -0,0 +1,26 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + + + Source Files + + + Source Files + + + + + Header Files + + + \ No newline at end of file diff --git a/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_unittest-md.vcxproj b/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_unittest-md.vcxproj new file mode 100644 index 0000000..93b0dc4 --- /dev/null +++ b/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_unittest-md.vcxproj @@ -0,0 +1,188 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {4D9FDFB5-986A-4139-823C-F4EE0ED481A2} + Win32Proj + + + + Application + MultiByte + v100 + + + Application + MultiByte + v100 + + + Application + MultiByte + v100 + + + Application + MultiByte + v100 + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.40219.1 + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)$(ProjectName)\ + true + true + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)$(ProjectName)\ + false + false + + + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)$(ProjectName)\ + gtest_unittest + + + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)$(ProjectName)\ + gtest_unittest + + + gtest_unittest + + + gtest_unittest + + + + Disabled + WIN32;_VARIADIC_MAX=10;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebugDLL + Use + Level3 + EditAndContinue + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + true + $(OutDir)gtest_unittest.pdb + Console + MachineX86 + + + + + Disabled + WIN32;_VARIADIC_MAX=10;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + Use + Level3 + ProgramDatabase + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + true + $(OutDir)gtest_unittest.pdb + Console + + + + + 
WIN32;_VARIADIC_MAX=10;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + Use + Level3 + ProgramDatabase + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + true + Console + true + true + MachineX86 + + + + + WIN32;_VARIADIC_MAX=10;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + Use + Level3 + ProgramDatabase + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + true + Console + true + true + + + + + MinSpace + MinSpace + ..;..\include;%(AdditionalIncludeDirectories) + ..;..\include;%(AdditionalIncludeDirectories) + Default + Default + + + + + ProgramDatabase + ProgramDatabase + ..;..\include;%(AdditionalIncludeDirectories) + ..;..\include;%(AdditionalIncludeDirectories) + + + + + + + + + {3af54c8a-10bf-4332-9147-f68ed9862033} + + + + + + \ No newline at end of file diff --git a/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_unittest-md.vcxproj.filters b/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_unittest-md.vcxproj.filters new file mode 100644 index 0000000..047dae5 --- /dev/null +++ b/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_unittest-md.vcxproj.filters @@ -0,0 +1,18 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + + + Source Files + + + \ No newline at end of file diff --git a/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_unittest.vcxproj b/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_unittest.vcxproj new file mode 100644 index 0000000..ec6abde --- /dev/null +++ b/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_unittest.vcxproj @@ -0,0 +1,180 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {4D9FDFB5-986A-4139-823C-F4EE0ED481A1} + Win32Proj + + + + Application + MultiByte + v100 + + + Application + MultiByte + v100 + + + Application + MultiByte + v100 + + + Application + MultiByte + v100 + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.40219.1 + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)temp\$(ProjectName)\ + true + true + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)temp\$(ProjectName)\ + false + false + + + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)temp\$(ProjectName)\ + + + $(SolutionDir)$(SolutionName)\$(Platform)-$(Configuration)\ + $(OutDir)temp\$(ProjectName)\ + + + + Disabled + WIN32;_VARIADIC_MAX=10;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebug + Use + Level3 + EditAndContinue + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + true + $(OutDir)gtest_unittest.pdb + Console + MachineX86 + + + + + Disabled + WIN32;_VARIADIC_MAX=10;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebug + Use + Level3 + ProgramDatabase + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + true + $(OutDir)gtest_unittest.pdb + Console + + + + + WIN32;_VARIADIC_MAX=10;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreaded + Use + Level3 + ProgramDatabase + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + true + Console + true + true + MachineX86 + + + + + WIN32;_VARIADIC_MAX=10;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreaded + Use + Level3 + ProgramDatabase + ..\..\include;..\..;%(AdditionalIncludeDirectories) + + + true + Console + true + true + + + + + MinSpace + MinSpace + 
..;..\include;%(AdditionalIncludeDirectories) + ..;..\include;%(AdditionalIncludeDirectories) + Default + Default + + + + + ProgramDatabase + ProgramDatabase + ..;..\include;%(AdditionalIncludeDirectories) + ..;..\include;%(AdditionalIncludeDirectories) + + + + + + + + + {3af54c8a-10bf-4332-9147-f68ed9862032} + + + + + + \ No newline at end of file diff --git a/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_unittest.vcxproj.filters b/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_unittest.vcxproj.filters new file mode 100644 index 0000000..047dae5 --- /dev/null +++ b/inference-engine/tests/libs/gtest/googletest/msvc/2010/gtest_unittest.vcxproj.filters @@ -0,0 +1,18 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + + + Source Files + + + \ No newline at end of file diff --git a/inference-engine/tests/libs/gtest/googletest/xcode/Config/DebugProject.xcconfig b/inference-engine/tests/libs/gtest/googletest/xcode/Config/DebugProject.xcconfig new file mode 100644 index 0000000..3d68157 --- /dev/null +++ b/inference-engine/tests/libs/gtest/googletest/xcode/Config/DebugProject.xcconfig @@ -0,0 +1,30 @@ +// +// DebugProject.xcconfig +// +// These are Debug Configuration project settings for the gtest framework and +// examples. It is set in the "Based On:" dropdown in the "Project" info +// dialog. +// This file is based on the Xcode Configuration files in: +// http://code.google.com/p/google-toolbox-for-mac/ +// + +#include "General.xcconfig" + +// No optimization +GCC_OPTIMIZATION_LEVEL = 0 + +// Deployment postprocessing is what triggers Xcode to strip, turn it off +DEPLOYMENT_POSTPROCESSING = NO + +// Dead code stripping off +DEAD_CODE_STRIPPING = NO + +// Debug symbols should be on obviously +GCC_GENERATE_DEBUGGING_SYMBOLS = YES + +// Define the DEBUG macro in all debug builds +OTHER_CFLAGS = $(OTHER_CFLAGS) -DDEBUG=1 + +// These are turned off to avoid STL incompatibilities with client code +// // Turns on special C++ STL checks to "encourage" good STL use +// GCC_PREPROCESSOR_DEFINITIONS = $(GCC_PREPROCESSOR_DEFINITIONS) _GLIBCXX_DEBUG_PEDANTIC _GLIBCXX_DEBUG _GLIBCPP_CONCEPT_CHECKS diff --git a/inference-engine/tests/libs/gtest/googletest/xcode/Config/FrameworkTarget.xcconfig b/inference-engine/tests/libs/gtest/googletest/xcode/Config/FrameworkTarget.xcconfig new file mode 100644 index 0000000..357b1c8 --- /dev/null +++ b/inference-engine/tests/libs/gtest/googletest/xcode/Config/FrameworkTarget.xcconfig @@ -0,0 +1,17 @@ +// +// FrameworkTarget.xcconfig +// +// These are Framework target settings for the gtest framework and examples. It +// is set in the "Based On:" dropdown in the "Target" info dialog. +// This file is based on the Xcode Configuration files in: +// http://code.google.com/p/google-toolbox-for-mac/ +// + +// Dynamic libs need to be position independent +GCC_DYNAMIC_NO_PIC = NO + +// Dynamic libs should not have their external symbols stripped. 
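+// ("non-global" strips only local symbols and keeps the external ones
+// that clients link against; added note, not in the upstream file.)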
+STRIP_STYLE = non-global
+
+// Let the user install by specifying the $DSTROOT with xcodebuild
+SKIP_INSTALL = NO
diff --git a/inference-engine/tests/libs/gtest/googletest/xcode/Config/General.xcconfig b/inference-engine/tests/libs/gtest/googletest/xcode/Config/General.xcconfig
new file mode 100644
index 0000000..f23e322
--- /dev/null
+++ b/inference-engine/tests/libs/gtest/googletest/xcode/Config/General.xcconfig
@@ -0,0 +1,41 @@
+//
+// General.xcconfig
+//
+// These are General configuration settings for the gtest framework and
+// examples.
+// This file is based on the Xcode Configuration files in:
+// http://code.google.com/p/google-toolbox-for-mac/
+//
+
+// Build for PPC and Intel, 32- and 64-bit
+ARCHS = i386 x86_64 ppc ppc64
+
+// Zerolink prevents link warnings so turn it off
+ZERO_LINK = NO
+
+// Prebinding considered unhelpful in 10.3 and later
+PREBINDING = NO
+
+// Strictest warning policy
+WARNING_CFLAGS = -Wall -Werror -Wendif-labels -Wnewline-eof -Wno-sign-compare -Wshadow
+
+// Work around Xcode bugs by using external strip. See:
+// http://lists.apple.com/archives/Xcode-users/2006/Feb/msg00050.html
+SEPARATE_STRIP = YES
+
+// Force C99 dialect
+GCC_C_LANGUAGE_STANDARD = c99
+
+// Not sure why Apple defaults this on, but it's pretty risky
+ALWAYS_SEARCH_USER_PATHS = NO
+
+// Turn on position dependent code for most cases (overridden where appropriate)
+GCC_DYNAMIC_NO_PIC = YES
+
+// Default SDK and minimum OS version is 10.4
+SDKROOT = $(DEVELOPER_SDK_DIR)/MacOSX10.4u.sdk
+MACOSX_DEPLOYMENT_TARGET = 10.4
+GCC_VERSION = 4.0
+
+// VERSIONING BUILD SETTINGS (used in Info.plist)
+GTEST_VERSIONINFO_ABOUT = © 2008 Google Inc.
diff --git a/inference-engine/tests/libs/gtest/googletest/xcode/Config/ReleaseProject.xcconfig b/inference-engine/tests/libs/gtest/googletest/xcode/Config/ReleaseProject.xcconfig
new file mode 100644
index 0000000..5349f0a
--- /dev/null
+++ b/inference-engine/tests/libs/gtest/googletest/xcode/Config/ReleaseProject.xcconfig
@@ -0,0 +1,32 @@
+//
+// ReleaseProject.xcconfig
+//
+// These are Release Configuration project settings for the gtest framework
+// and examples. It is set in the "Based On:" dropdown in the "Project" info
+// dialog.
+// This file is based on the Xcode Configuration files in:
+// http://code.google.com/p/google-toolbox-for-mac/
+//
+
+#include "General.xcconfig"
+
+// subconfig/Release.xcconfig
+
+// Optimize for space and size (Apple recommendation)
+GCC_OPTIMIZATION_LEVEL = s
+
+// Deployment postprocessing is what triggers Xcode to strip
+DEPLOYMENT_POSTPROCESSING = YES
+
+// No symbols
+GCC_GENERATE_DEBUGGING_SYMBOLS = NO
+
+// Dead code strip does not affect ObjC code but can help for C
+DEAD_CODE_STRIPPING = YES
+
+// NDEBUG is used by things like assert.h, so define it for general compat.
+// ASSERT going away in release tends to create unused vars.
+OTHER_CFLAGS = $(OTHER_CFLAGS) -DNDEBUG=1 -Wno-unused-variable
+
+// When we strip we want to strip all symbols in release, but save externals.
+STRIP_STYLE = all
diff --git a/inference-engine/tests/libs/gtest/googletest/xcode/Config/StaticLibraryTarget.xcconfig b/inference-engine/tests/libs/gtest/googletest/xcode/Config/StaticLibraryTarget.xcconfig
new file mode 100644
index 0000000..3922fa5
--- /dev/null
+++ b/inference-engine/tests/libs/gtest/googletest/xcode/Config/StaticLibraryTarget.xcconfig
@@ -0,0 +1,18 @@
+//
+// StaticLibraryTarget.xcconfig
+//
+// These are static library target settings for libgtest.a.
It +// is set in the "Based On:" dropdown in the "Target" info dialog. +// This file is based on the Xcode Configuration files in: +// http://code.google.com/p/google-toolbox-for-mac/ +// + +// Static libs can be included in bundles so make them position independent +GCC_DYNAMIC_NO_PIC = NO + +// Static libs should not have their internal globals or external symbols +// stripped. +STRIP_STYLE = debugging + +// Let the user install by specifying the $DSTROOT with xcodebuild +SKIP_INSTALL = NO diff --git a/inference-engine/tests/libs/gtest/googletest/xcode/Config/TestTarget.xcconfig b/inference-engine/tests/libs/gtest/googletest/xcode/Config/TestTarget.xcconfig new file mode 100644 index 0000000..e6652ba --- /dev/null +++ b/inference-engine/tests/libs/gtest/googletest/xcode/Config/TestTarget.xcconfig @@ -0,0 +1,8 @@ +// +// TestTarget.xcconfig +// +// These are Test target settings for the gtest framework and examples. It +// is set in the "Based On:" dropdown in the "Target" info dialog. + +PRODUCT_NAME = $(TARGET_NAME) +HEADER_SEARCH_PATHS = ../include diff --git a/inference-engine/tests/libs/gtest/googletest/xcode/Resources/Info.plist b/inference-engine/tests/libs/gtest/googletest/xcode/Resources/Info.plist new file mode 100644 index 0000000..9dd28ea --- /dev/null +++ b/inference-engine/tests/libs/gtest/googletest/xcode/Resources/Info.plist @@ -0,0 +1,30 @@ + + + + + CFBundleDevelopmentRegion + English + CFBundleExecutable + ${EXECUTABLE_NAME} + CFBundleIconFile + + CFBundleIdentifier + com.google.${PRODUCT_NAME} + CFBundleInfoDictionaryVersion + 6.0 + CFBundlePackageType + FMWK + CFBundleSignature + ???? + CFBundleVersion + GTEST_VERSIONINFO_LONG + CFBundleShortVersionString + GTEST_VERSIONINFO_SHORT + CFBundleGetInfoString + ${PRODUCT_NAME} GTEST_VERSIONINFO_LONG, ${GTEST_VERSIONINFO_ABOUT} + NSHumanReadableCopyright + ${GTEST_VERSIONINFO_ABOUT} + CSResourcesFileMapped + + + diff --git a/inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/Info.plist b/inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/Info.plist new file mode 100644 index 0000000..f3852ed --- /dev/null +++ b/inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/Info.plist @@ -0,0 +1,28 @@ + + + + + CFBundleDevelopmentRegion + English + CFBundleExecutable + ${EXECUTABLE_NAME} + CFBundleIconFile + + CFBundleIdentifier + com.google.gtest.${PRODUCT_NAME:identifier} + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + ${PRODUCT_NAME} + CFBundlePackageType + FMWK + CFBundleShortVersionString + 1.0 + CFBundleSignature + ???? + CFBundleVersion + 1.0 + CSResourcesFileMapped + + + diff --git a/inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/WidgetFramework.xcodeproj/project.pbxproj b/inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/WidgetFramework.xcodeproj/project.pbxproj new file mode 100644 index 0000000..497617e --- /dev/null +++ b/inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/WidgetFramework.xcodeproj/project.pbxproj @@ -0,0 +1,457 @@ +// !$*UTF8*$! 
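+// (Editor's note, assuming standard Xcode conventions: project.pbxproj is an
+// OpenStep-style property list; the /* ... */ annotations below are comments
+// that Xcode derives from object names, and parsers ignore them.)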
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 42; + objects = { + +/* Begin PBXAggregateTarget section */ + 4024D162113D7D2400C7059E /* Test */ = { + isa = PBXAggregateTarget; + buildConfigurationList = 4024D169113D7D4600C7059E /* Build configuration list for PBXAggregateTarget "Test" */; + buildPhases = ( + 4024D161113D7D2400C7059E /* ShellScript */, + ); + dependencies = ( + 4024D166113D7D3100C7059E /* PBXTargetDependency */, + ); + name = Test; + productName = TestAndBuild; + }; + 4024D1E9113D83FF00C7059E /* TestAndBuild */ = { + isa = PBXAggregateTarget; + buildConfigurationList = 4024D1F0113D842B00C7059E /* Build configuration list for PBXAggregateTarget "TestAndBuild" */; + buildPhases = ( + ); + dependencies = ( + 4024D1ED113D840900C7059E /* PBXTargetDependency */, + 4024D1EF113D840D00C7059E /* PBXTargetDependency */, + ); + name = TestAndBuild; + productName = TestAndBuild; + }; +/* End PBXAggregateTarget section */ + +/* Begin PBXBuildFile section */ + 3B7EB1250E5AEE3500C7F239 /* widget.cc in Sources */ = {isa = PBXBuildFile; fileRef = 3B7EB1230E5AEE3500C7F239 /* widget.cc */; }; + 3B7EB1260E5AEE3500C7F239 /* widget.h in Headers */ = {isa = PBXBuildFile; fileRef = 3B7EB1240E5AEE3500C7F239 /* widget.h */; settings = {ATTRIBUTES = (Public, ); }; }; + 3B7EB1280E5AEE4600C7F239 /* widget_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 3B7EB1270E5AEE4600C7F239 /* widget_test.cc */; }; + 3B7EB1480E5AF3B400C7F239 /* Widget.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8D07F2C80486CC7A007CD1D0 /* Widget.framework */; }; + 4024D188113D7D7800C7059E /* libgtest.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 4024D185113D7D5500C7059E /* libgtest.a */; }; + 4024D189113D7D7A00C7059E /* libgtest_main.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 4024D183113D7D5500C7059E /* libgtest_main.a */; }; +/* End PBXBuildFile section */ + +/* Begin PBXContainerItemProxy section */ + 3B07BDF00E3F3FAE00647869 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 0867D690FE84028FC02AAC07 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 8D07F2BC0486CC7A007CD1D0; + remoteInfo = gTestExample; + }; + 4024D165113D7D3100C7059E /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 0867D690FE84028FC02AAC07 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 3B07BDE90E3F3F9E00647869; + remoteInfo = WidgetFrameworkTest; + }; + 4024D1EC113D840900C7059E /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 0867D690FE84028FC02AAC07 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 8D07F2BC0486CC7A007CD1D0; + remoteInfo = WidgetFramework; + }; + 4024D1EE113D840D00C7059E /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 0867D690FE84028FC02AAC07 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 4024D162113D7D2400C7059E; + remoteInfo = Test; + }; +/* End PBXContainerItemProxy section */ + +/* Begin PBXFileReference section */ + 3B07BDEA0E3F3F9E00647869 /* WidgetFrameworkTest */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = WidgetFrameworkTest; sourceTree = BUILT_PRODUCTS_DIR; }; + 3B7EB1230E5AEE3500C7F239 /* widget.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = widget.cc; sourceTree = ""; }; + 3B7EB1240E5AEE3500C7F239 /* widget.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.c.h; path = widget.h; sourceTree = ""; }; + 3B7EB1270E5AEE4600C7F239 /* widget_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = widget_test.cc; sourceTree = ""; }; + 4024D183113D7D5500C7059E /* libgtest_main.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libgtest_main.a; path = /usr/local/lib/libgtest_main.a; sourceTree = ""; }; + 4024D185113D7D5500C7059E /* libgtest.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libgtest.a; path = /usr/local/lib/libgtest.a; sourceTree = ""; }; + 4024D1E2113D838200C7059E /* runtests.sh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.sh; path = runtests.sh; sourceTree = ""; }; + 8D07F2C70486CC7A007CD1D0 /* Info.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist; path = Info.plist; sourceTree = ""; }; + 8D07F2C80486CC7A007CD1D0 /* Widget.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Widget.framework; sourceTree = BUILT_PRODUCTS_DIR; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + 3B07BDE80E3F3F9E00647869 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 4024D189113D7D7A00C7059E /* libgtest_main.a in Frameworks */, + 4024D188113D7D7800C7059E /* libgtest.a in Frameworks */, + 3B7EB1480E5AF3B400C7F239 /* Widget.framework in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 8D07F2C30486CC7A007CD1D0 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + 034768DDFF38A45A11DB9C8B /* Products */ = { + isa = PBXGroup; + children = ( + 8D07F2C80486CC7A007CD1D0 /* Widget.framework */, + 3B07BDEA0E3F3F9E00647869 /* WidgetFrameworkTest */, + ); + name = Products; + sourceTree = ""; + }; + 0867D691FE84028FC02AAC07 /* gTestExample */ = { + isa = PBXGroup; + children = ( + 4024D1E1113D836C00C7059E /* Scripts */, + 08FB77ACFE841707C02AAC07 /* Source */, + 089C1665FE841158C02AAC07 /* Resources */, + 3B07BE350E4094E400647869 /* Test */, + 0867D69AFE84028FC02AAC07 /* External Frameworks and Libraries */, + 034768DDFF38A45A11DB9C8B /* Products */, + ); + name = gTestExample; + sourceTree = ""; + }; + 0867D69AFE84028FC02AAC07 /* External Frameworks and Libraries */ = { + isa = PBXGroup; + children = ( + 4024D183113D7D5500C7059E /* libgtest_main.a */, + 4024D185113D7D5500C7059E /* libgtest.a */, + ); + name = "External Frameworks and Libraries"; + sourceTree = ""; + }; + 089C1665FE841158C02AAC07 /* Resources */ = { + isa = PBXGroup; + children = ( + 8D07F2C70486CC7A007CD1D0 /* Info.plist */, + ); + name = Resources; + sourceTree = ""; + }; + 08FB77ACFE841707C02AAC07 /* Source */ = { + isa = PBXGroup; + children = ( + 3B7EB1230E5AEE3500C7F239 /* widget.cc */, + 3B7EB1240E5AEE3500C7F239 /* widget.h */, + ); + name = Source; + sourceTree = ""; + }; + 3B07BE350E4094E400647869 /* Test */ = { + isa = PBXGroup; + children = ( + 3B7EB1270E5AEE4600C7F239 /* widget_test.cc */, + ); + name = Test; + sourceTree = ""; + }; + 4024D1E1113D836C00C7059E /* Scripts */ = { + isa = PBXGroup; + children = ( + 4024D1E2113D838200C7059E /* runtests.sh */, + ); + name = Scripts; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXHeadersBuildPhase section */ + 
8D07F2BD0486CC7A007CD1D0 /* Headers */ = { + isa = PBXHeadersBuildPhase; + buildActionMask = 2147483647; + files = ( + 3B7EB1260E5AEE3500C7F239 /* widget.h in Headers */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXHeadersBuildPhase section */ + +/* Begin PBXNativeTarget section */ + 3B07BDE90E3F3F9E00647869 /* WidgetFrameworkTest */ = { + isa = PBXNativeTarget; + buildConfigurationList = 3B07BDF40E3F3FB600647869 /* Build configuration list for PBXNativeTarget "WidgetFrameworkTest" */; + buildPhases = ( + 3B07BDE70E3F3F9E00647869 /* Sources */, + 3B07BDE80E3F3F9E00647869 /* Frameworks */, + ); + buildRules = ( + ); + dependencies = ( + 3B07BDF10E3F3FAE00647869 /* PBXTargetDependency */, + ); + name = WidgetFrameworkTest; + productName = gTestExampleTest; + productReference = 3B07BDEA0E3F3F9E00647869 /* WidgetFrameworkTest */; + productType = "com.apple.product-type.tool"; + }; + 8D07F2BC0486CC7A007CD1D0 /* WidgetFramework */ = { + isa = PBXNativeTarget; + buildConfigurationList = 4FADC24208B4156D00ABE55E /* Build configuration list for PBXNativeTarget "WidgetFramework" */; + buildPhases = ( + 8D07F2C10486CC7A007CD1D0 /* Sources */, + 8D07F2C30486CC7A007CD1D0 /* Frameworks */, + 8D07F2BD0486CC7A007CD1D0 /* Headers */, + 8D07F2BF0486CC7A007CD1D0 /* Resources */, + 8D07F2C50486CC7A007CD1D0 /* Rez */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = WidgetFramework; + productInstallPath = "$(HOME)/Library/Frameworks"; + productName = gTestExample; + productReference = 8D07F2C80486CC7A007CD1D0 /* Widget.framework */; + productType = "com.apple.product-type.framework"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + 0867D690FE84028FC02AAC07 /* Project object */ = { + isa = PBXProject; + buildConfigurationList = 4FADC24608B4156D00ABE55E /* Build configuration list for PBXProject "WidgetFramework" */; + compatibilityVersion = "Xcode 2.4"; + hasScannedForEncodings = 1; + mainGroup = 0867D691FE84028FC02AAC07 /* gTestExample */; + productRefGroup = 034768DDFF38A45A11DB9C8B /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + 8D07F2BC0486CC7A007CD1D0 /* WidgetFramework */, + 3B07BDE90E3F3F9E00647869 /* WidgetFrameworkTest */, + 4024D162113D7D2400C7059E /* Test */, + 4024D1E9113D83FF00C7059E /* TestAndBuild */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + 8D07F2BF0486CC7A007CD1D0 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXRezBuildPhase section */ + 8D07F2C50486CC7A007CD1D0 /* Rez */ = { + isa = PBXRezBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXRezBuildPhase section */ + +/* Begin PBXShellScriptBuildPhase section */ + 4024D161113D7D2400C7059E /* ShellScript */ = { + isa = PBXShellScriptBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + inputPaths = ( + ); + outputPaths = ( + ); + runOnlyForDeploymentPostprocessing = 0; + shellPath = /bin/sh; + shellScript = "/bin/bash $SRCROOT/runtests.sh $BUILT_PRODUCTS_DIR/WidgetFrameworkTest\n"; + }; +/* End PBXShellScriptBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + 3B07BDE70E3F3F9E00647869 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 3B7EB1280E5AEE4600C7F239 /* widget_test.cc in Sources */, + ); + 
runOnlyForDeploymentPostprocessing = 0; + }; + 8D07F2C10486CC7A007CD1D0 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 3B7EB1250E5AEE3500C7F239 /* widget.cc in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin PBXTargetDependency section */ + 3B07BDF10E3F3FAE00647869 /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 8D07F2BC0486CC7A007CD1D0 /* WidgetFramework */; + targetProxy = 3B07BDF00E3F3FAE00647869 /* PBXContainerItemProxy */; + }; + 4024D166113D7D3100C7059E /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 3B07BDE90E3F3F9E00647869 /* WidgetFrameworkTest */; + targetProxy = 4024D165113D7D3100C7059E /* PBXContainerItemProxy */; + }; + 4024D1ED113D840900C7059E /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 8D07F2BC0486CC7A007CD1D0 /* WidgetFramework */; + targetProxy = 4024D1EC113D840900C7059E /* PBXContainerItemProxy */; + }; + 4024D1EF113D840D00C7059E /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 4024D162113D7D2400C7059E /* Test */; + targetProxy = 4024D1EE113D840D00C7059E /* PBXContainerItemProxy */; + }; +/* End PBXTargetDependency section */ + +/* Begin XCBuildConfiguration section */ + 3B07BDEC0E3F3F9F00647869 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + PRODUCT_NAME = WidgetFrameworkTest; + }; + name = Debug; + }; + 3B07BDED0E3F3F9F00647869 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + PRODUCT_NAME = WidgetFrameworkTest; + }; + name = Release; + }; + 4024D163113D7D2400C7059E /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + PRODUCT_NAME = TestAndBuild; + }; + name = Debug; + }; + 4024D164113D7D2400C7059E /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + PRODUCT_NAME = TestAndBuild; + }; + name = Release; + }; + 4024D1EA113D83FF00C7059E /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + PRODUCT_NAME = TestAndBuild; + }; + name = Debug; + }; + 4024D1EB113D83FF00C7059E /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + PRODUCT_NAME = TestAndBuild; + }; + name = Release; + }; + 4FADC24308B4156D00ABE55E /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + DYLIB_COMPATIBILITY_VERSION = 1; + DYLIB_CURRENT_VERSION = 1; + FRAMEWORK_VERSION = A; + INFOPLIST_FILE = Info.plist; + INSTALL_PATH = "@loader_path/../Frameworks"; + PRODUCT_NAME = Widget; + }; + name = Debug; + }; + 4FADC24408B4156D00ABE55E /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + DYLIB_COMPATIBILITY_VERSION = 1; + DYLIB_CURRENT_VERSION = 1; + FRAMEWORK_VERSION = A; + INFOPLIST_FILE = Info.plist; + INSTALL_PATH = "@loader_path/../Frameworks"; + PRODUCT_NAME = Widget; + }; + name = Release; + }; + 4FADC24708B4156D00ABE55E /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + GCC_VERSION = 4.0; + SDKROOT = /Developer/SDKs/MacOSX10.4u.sdk; + }; + name = Debug; + }; + 4FADC24808B4156D00ABE55E /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + GCC_VERSION = 4.0; + SDKROOT = /Developer/SDKs/MacOSX10.4u.sdk; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + 3B07BDF40E3F3FB600647869 /* Build configuration list for PBXNativeTarget "WidgetFrameworkTest" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 3B07BDEC0E3F3F9F00647869 /* Debug */, + 3B07BDED0E3F3F9F00647869 /* 
Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 4024D169113D7D4600C7059E /* Build configuration list for PBXAggregateTarget "Test" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 4024D163113D7D2400C7059E /* Debug */, + 4024D164113D7D2400C7059E /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 4024D1F0113D842B00C7059E /* Build configuration list for PBXAggregateTarget "TestAndBuild" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 4024D1EA113D83FF00C7059E /* Debug */, + 4024D1EB113D83FF00C7059E /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 4FADC24208B4156D00ABE55E /* Build configuration list for PBXNativeTarget "WidgetFramework" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 4FADC24308B4156D00ABE55E /* Debug */, + 4FADC24408B4156D00ABE55E /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 4FADC24608B4156D00ABE55E /* Build configuration list for PBXProject "WidgetFramework" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 4FADC24708B4156D00ABE55E /* Debug */, + 4FADC24808B4156D00ABE55E /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = 0867D690FE84028FC02AAC07 /* Project object */; +} diff --git a/inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/runtests.sh b/inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/runtests.sh new file mode 100644 index 0000000..4a0d413 --- /dev/null +++ b/inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/runtests.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# +# Copyright 2008, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Executes the samples and tests for the Google Test Framework. + +# Help the dynamic linker find the path to the libraries. 
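+# (DYLD_FRAMEWORK_PATH and DYLD_LIBRARY_PATH tell dyld where to search for
+# frameworks and shared libraries at load time; BUILT_PRODUCTS_DIR is set
+# by Xcode for "Run Script" phases and points at the build output directory.)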
+export DYLD_FRAMEWORK_PATH=$BUILT_PRODUCTS_DIR
+export DYLD_LIBRARY_PATH=$BUILT_PRODUCTS_DIR
+
+# Gather the test executables to run (passed in as arguments).
+test_executables=$@
+
+# Now execute each one in turn keeping track of how many succeeded and failed.
+succeeded=0
+failed=0
+failed_list=()
+for test in ${test_executables[*]}; do
+  "$test"
+  result=$?
+  if [ $result -eq 0 ]; then
+    succeeded=$(( $succeeded + 1 ))
+  else
+    failed=$(( failed + 1 ))
+    failed_list="$failed_list $test"
+  fi
+done
+
+# Report the successes and failures to the console.
+echo "Tests complete with $succeeded successes and $failed failures."
+if [ $failed -ne 0 ]; then
+  echo "The following tests failed:"
+  echo $failed_list
+fi
+exit $failed
diff --git a/inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/widget.cc b/inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/widget.cc
new file mode 100644
index 0000000..bfc4e7f
--- /dev/null
+++ b/inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/widget.cc
@@ -0,0 +1,66 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: preston.a.jackson@gmail.com (Preston Jackson)
+//
+// Google Test - FrameworkSample
+// widget.cc
+//
+
+// Widget is a very simple class used for demonstrating the use of gtest.
+
+#include "widget.h"
+
+Widget::Widget(int number, const std::string& name)
+    : number_(number),
+      name_(name) {}
+
+Widget::~Widget() {}
+
+float Widget::GetFloatValue() const {
+  return number_;
+}
+
+int Widget::GetIntValue() const {
+  return static_cast<int>(number_);
+}
+
+std::string Widget::GetStringValue() const {
+  return name_;
+}
+
+void Widget::GetCharPtrValue(char* buffer, size_t max_size) const {
+  // Copy the char* representation of name_ into buffer, up to max_size.
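+  // (strncpy() copies at most max_size-1 bytes here and does not
+  // NUL-terminate when the source fills the buffer, so the last byte is
+  // set explicitly below.)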
+  strncpy(buffer, name_.c_str(), max_size-1);
+  buffer[max_size-1] = '\0';
+  return;
+}
diff --git a/inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/widget.h b/inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/widget.h
new file mode 100644
index 0000000..0c55cdc
--- /dev/null
+++ b/inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/widget.h
@@ -0,0 +1,59 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: preston.a.jackson@gmail.com (Preston Jackson)
+//
+// Google Test - FrameworkSample
+// widget.h
+//
+
+// Widget is a very simple class used for demonstrating the use of gtest. It
+// simply stores two values, a string and an integer, which are returned via
+// public accessors in multiple forms.
+
+#import <string>
+
+class Widget {
+ public:
+  Widget(int number, const std::string& name);
+  ~Widget();
+
+  // Public accessors to the number data
+  float GetFloatValue() const;
+  int GetIntValue() const;
+
+  // Public accessors to the string data
+  std::string GetStringValue() const;
+  void GetCharPtrValue(char* buffer, size_t max_size) const;
+
+ private:
+  // Data members
+  float number_;
+  std::string name_;
+};
diff --git a/inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/widget_test.cc b/inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/widget_test.cc
new file mode 100644
index 0000000..8725994
--- /dev/null
+++ b/inference-engine/tests/libs/gtest/googletest/xcode/Samples/FrameworkSample/widget_test.cc
@@ -0,0 +1,68 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: preston.a.jackson@gmail.com (Preston Jackson)
+//
+// Google Test - FrameworkSample
+// widget_test.cc
+//
+
+// This is a simple test file for the Widget class in the Widget.framework
+
+#include <string>
+#include "gtest/gtest.h"
+
+#include <Widget/widget.h>
+
+// This test verifies that the constructor sets the internal state of the
+// Widget class correctly.
+TEST(WidgetInitializerTest, TestConstructor) {
+  Widget widget(1.0f, "name");
+  EXPECT_FLOAT_EQ(1.0f, widget.GetFloatValue());
+  EXPECT_EQ(std::string("name"), widget.GetStringValue());
+}
+
+// This test verifies the conversion of the float and string values to int and
+// char*, respectively.
+TEST(WidgetInitializerTest, TestConversion) {
+  Widget widget(1.0f, "name");
+  EXPECT_EQ(1, widget.GetIntValue());
+
+  size_t max_size = 128;
+  char buffer[max_size];
+  widget.GetCharPtrValue(buffer, max_size);
+  EXPECT_STREQ("name", buffer);
+}
+
+// Use the Google Test main that is linked into the framework. It does something
+// like this:
+// int main(int argc, char** argv) {
+//   testing::InitGoogleTest(&argc, argv);
+//   return RUN_ALL_TESTS();
+// }
diff --git a/inference-engine/tests/libs/gtest/googletest/xcode/Scripts/runtests.sh b/inference-engine/tests/libs/gtest/googletest/xcode/Scripts/runtests.sh
new file mode 100644
index 0000000..3fc229f
--- /dev/null
+++ b/inference-engine/tests/libs/gtest/googletest/xcode/Scripts/runtests.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+#
+# Copyright 2008, Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#     * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#     * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Executes the samples and tests for the Google Test Framework. + +# Help the dynamic linker find the path to the libraries. +export DYLD_FRAMEWORK_PATH=$BUILT_PRODUCTS_DIR +export DYLD_LIBRARY_PATH=$BUILT_PRODUCTS_DIR + +# Create some executables. +test_executables=("$BUILT_PRODUCTS_DIR/gtest_unittest-framework" + "$BUILT_PRODUCTS_DIR/gtest_unittest" + "$BUILT_PRODUCTS_DIR/sample1_unittest-framework" + "$BUILT_PRODUCTS_DIR/sample1_unittest-static") + +# Now execute each one in turn keeping track of how many succeeded and failed. +succeeded=0 +failed=0 +failed_list=() +for test in ${test_executables[*]}; do + "$test" + result=$? + if [ $result -eq 0 ]; then + succeeded=$(( $succeeded + 1 )) + else + failed=$(( failed + 1 )) + failed_list="$failed_list $test" + fi +done + +# Report the successes and failures to the console. +echo "Tests complete with $succeeded successes and $failed failures." +if [ $failed -ne 0 ]; then + echo "The following tests failed:" + echo $failed_list +fi +exit $failed diff --git a/inference-engine/tests/libs/gtest/googletest/xcode/Scripts/versiongenerate.py b/inference-engine/tests/libs/gtest/googletest/xcode/Scripts/versiongenerate.py new file mode 100644 index 0000000..16791d2 --- /dev/null +++ b/inference-engine/tests/libs/gtest/googletest/xcode/Scripts/versiongenerate.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +# +# Copyright 2008, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""A script to prepare version information for use in the gtest Info.plist file.
+
+   This script extracts the version information from the configure.ac file and
+   uses it to generate a header file containing the same information. The
+   #defines in this header file will be included during the generation of
+   the Info.plist of the framework, giving the correct value to the version
+   shown in the Finder.
+
+   This script makes the following assumptions (these are faults of the script,
+   not problems with Autoconf):
+     1. The AC_INIT macro will be contained within the first 1024 characters
+        of configure.ac
+     2. The version string will be 3 integers separated by periods and will be
+        surrounded by square brackets, "[" and "]" (e.g. [1.0.1]). The first
+        segment represents the major version, the second represents the minor
+        version and the third represents the fix version.
+     3. No ")" character exists between the opening "(" and closing ")" of
+        AC_INIT, including in comments and character strings.
+"""
+
+import sys
+import re
+
+# Read the command line arguments (the input directory holding configure.ac
+# and the output directory for Version.h)
+if (len(sys.argv) < 3):
+  print("Usage: versiongenerate.py input_dir output_dir")
+  sys.exit(1)
+else:
+  input_dir = sys.argv[1]
+  output_dir = sys.argv[2]
+
+# Read the first 1024 characters of the configure.ac file
+config_file = open("%s/configure.ac" % input_dir, 'r')
+buffer_size = 1024
+opening_string = config_file.read(buffer_size)
+config_file.close()
+
+# Extract the version string from the AC_INIT macro
+# The following version_expression means:
+#   Extract three integers separated by periods and surrounded by square
+#   brackets (e.g. "[1.0.1]") between "AC_INIT(" and ")". Do not be greedy
+#   (*? is the non-greedy flag) since that would pull in everything between
+#   the first "(" and the last ")" in the file.
+version_expression = re.compile(r"AC_INIT\(.*?\[(\d+)\.(\d+)\.(\d+)\].*?\)",
+                                re.DOTALL)
+version_values = version_expression.search(opening_string)
+major_version = version_values.group(1)
+minor_version = version_values.group(2)
+fix_version = version_values.group(3)
+
+# Write the version information to a header file to be included in the
+# Info.plist file.
+file_data = """//
+// DO NOT MODIFY THIS FILE (but you can delete it)
+//
+// This file is autogenerated by the versiongenerate.py script. This script
+// is executed in a "Run Script" build phase when creating gtest.framework. This
+// header file is not used during compilation of C-source. Rather, it simply
+// defines some version strings for substitution in the Info.plist. Because of
+// this, we are not restricted to C-syntax nor are we using include guards.
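To make the extraction step above concrete, here is a standalone sketch applying the same regular expression to a made-up AC_INIT line; the project name and version numbers are illustrative, not taken from gtest's actual configure.ac.

    import re

    # Hypothetical first bytes of a configure.ac file.
    opening_string = ('AC_INIT([Google C++ Testing Framework],\n'
                      '        [1.7.0],\n'
                      '        [googletestframework@googlegroups.com],\n'
                      '        [gtest])')

    # Same pattern as the script: non-greedy so it stops at the first
    # bracketed x.y.z triple after "AC_INIT(".
    version_expression = re.compile(r"AC_INIT\(.*?\[(\d+)\.(\d+)\.(\d+)\].*?\)",
                                    re.DOTALL)
    match = version_expression.search(opening_string)
    print(match.group(1), match.group(2), match.group(3))  # prints: 1 7 0

Note that the sample input also respects assumption 3 from the docstring: there is no stray ")" between the opening and closing parentheses of AC_INIT, which the non-greedy pattern relies on.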
+// + +#define GTEST_VERSIONINFO_SHORT %s.%s +#define GTEST_VERSIONINFO_LONG %s.%s.%s + +""" % (major_version, minor_version, major_version, minor_version, fix_version) +version_file = open("%s/Version.h" % output_dir, 'w') +version_file.write(file_data) +version_file.close() diff --git a/inference-engine/tests/libs/gtest/googletest/xcode/gtest.xcodeproj/project.pbxproj b/inference-engine/tests/libs/gtest/googletest/xcode/gtest.xcodeproj/project.pbxproj new file mode 100644 index 0000000..003bff8 --- /dev/null +++ b/inference-engine/tests/libs/gtest/googletest/xcode/gtest.xcodeproj/project.pbxproj @@ -0,0 +1,1182 @@ +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 46; + objects = { + +/* Begin PBXAggregateTarget section */ + 3B238F5F0E828B5400846E11 /* Check */ = { + isa = PBXAggregateTarget; + buildConfigurationList = 3B238FA30E828BB600846E11 /* Build configuration list for PBXAggregateTarget "Check" */; + buildPhases = ( + 3B238F5E0E828B5400846E11 /* ShellScript */, + ); + dependencies = ( + 40899F9D0FFA740F000B29AE /* PBXTargetDependency */, + 40C849F7101A43440083642A /* PBXTargetDependency */, + 4089A0980FFAD34A000B29AE /* PBXTargetDependency */, + 40C849F9101A43490083642A /* PBXTargetDependency */, + ); + name = Check; + productName = Check; + }; + 40C44ADC0E3798F4008FCC51 /* Version Info */ = { + isa = PBXAggregateTarget; + buildConfigurationList = 40C44AE40E379905008FCC51 /* Build configuration list for PBXAggregateTarget "Version Info" */; + buildPhases = ( + 40C44ADB0E3798F4008FCC51 /* Generate Version.h */, + ); + comments = "The generation of Version.h must be performed in its own target. Since the Info.plist is preprocessed before any of the other build phases in gtest, the Version.h file would not be ready if included as a build phase of that target."; + dependencies = ( + ); + name = "Version Info"; + productName = Version.h; + }; +/* End PBXAggregateTarget section */ + +/* Begin PBXBuildFile section */ + 224A12A30E9EADCC00BD17FD /* gtest-test-part.h in Headers */ = {isa = PBXBuildFile; fileRef = 224A12A20E9EADCC00BD17FD /* gtest-test-part.h */; settings = {ATTRIBUTES = (Public, ); }; }; + 3BF6F2A00E79B5AD000F2EEE /* gtest-type-util.h in Copy Headers Internal */ = {isa = PBXBuildFile; fileRef = 3BF6F29F0E79B5AD000F2EEE /* gtest-type-util.h */; }; + 3BF6F2A50E79B616000F2EEE /* gtest-typed-test.h in Headers */ = {isa = PBXBuildFile; fileRef = 3BF6F2A40E79B616000F2EEE /* gtest-typed-test.h */; settings = {ATTRIBUTES = (Public, ); }; }; + 404884380E2F799B00CF7658 /* gtest-death-test.h in Headers */ = {isa = PBXBuildFile; fileRef = 404883DB0E2F799B00CF7658 /* gtest-death-test.h */; settings = {ATTRIBUTES = (Public, ); }; }; + 404884390E2F799B00CF7658 /* gtest-message.h in Headers */ = {isa = PBXBuildFile; fileRef = 404883DC0E2F799B00CF7658 /* gtest-message.h */; settings = {ATTRIBUTES = (Public, ); }; }; + 4048843A0E2F799B00CF7658 /* gtest-spi.h in Headers */ = {isa = PBXBuildFile; fileRef = 404883DD0E2F799B00CF7658 /* gtest-spi.h */; settings = {ATTRIBUTES = (Public, ); }; }; + 4048843B0E2F799B00CF7658 /* gtest.h in Headers */ = {isa = PBXBuildFile; fileRef = 404883DE0E2F799B00CF7658 /* gtest.h */; settings = {ATTRIBUTES = (Public, ); }; }; + 4048843C0E2F799B00CF7658 /* gtest_pred_impl.h in Headers */ = {isa = PBXBuildFile; fileRef = 404883DF0E2F799B00CF7658 /* gtest_pred_impl.h */; settings = {ATTRIBUTES = (Public, ); }; }; + 4048843D0E2F799B00CF7658 /* gtest_prod.h in Headers */ = {isa = PBXBuildFile; fileRef = 404883E00E2F799B00CF7658 /* 
gtest_prod.h */; settings = {ATTRIBUTES = (Public, ); }; }; + 404884500E2F799B00CF7658 /* README.md in Resources */ = {isa = PBXBuildFile; fileRef = 404883F60E2F799B00CF7658 /* README.md */; }; + 404884A00E2F7BE600CF7658 /* gtest-death-test-internal.h in Copy Headers Internal */ = {isa = PBXBuildFile; fileRef = 404883E20E2F799B00CF7658 /* gtest-death-test-internal.h */; }; + 404884A10E2F7BE600CF7658 /* gtest-filepath.h in Copy Headers Internal */ = {isa = PBXBuildFile; fileRef = 404883E30E2F799B00CF7658 /* gtest-filepath.h */; }; + 404884A20E2F7BE600CF7658 /* gtest-internal.h in Copy Headers Internal */ = {isa = PBXBuildFile; fileRef = 404883E40E2F799B00CF7658 /* gtest-internal.h */; }; + 404884A30E2F7BE600CF7658 /* gtest-port.h in Copy Headers Internal */ = {isa = PBXBuildFile; fileRef = 404883E50E2F799B00CF7658 /* gtest-port.h */; }; + 404884A40E2F7BE600CF7658 /* gtest-string.h in Copy Headers Internal */ = {isa = PBXBuildFile; fileRef = 404883E60E2F799B00CF7658 /* gtest-string.h */; }; + 404884AC0E2F7CD900CF7658 /* CHANGES in Resources */ = {isa = PBXBuildFile; fileRef = 404884A90E2F7CD900CF7658 /* CHANGES */; }; + 404884AD0E2F7CD900CF7658 /* CONTRIBUTORS in Resources */ = {isa = PBXBuildFile; fileRef = 404884AA0E2F7CD900CF7658 /* CONTRIBUTORS */; }; + 404884AE0E2F7CD900CF7658 /* LICENSE in Resources */ = {isa = PBXBuildFile; fileRef = 404884AB0E2F7CD900CF7658 /* LICENSE */; }; + 40899F3A0FFA70D4000B29AE /* gtest-all.cc in Sources */ = {isa = PBXBuildFile; fileRef = 224A12A10E9EADA700BD17FD /* gtest-all.cc */; }; + 40899F500FFA7281000B29AE /* gtest-tuple.h in Copy Headers Internal */ = {isa = PBXBuildFile; fileRef = 40899F4D0FFA7271000B29AE /* gtest-tuple.h */; }; + 40899F530FFA72A0000B29AE /* gtest_unittest.cc in Sources */ = {isa = PBXBuildFile; fileRef = 3B238C120E7FE13C00846E11 /* gtest_unittest.cc */; }; + 4089A0440FFAD1BE000B29AE /* sample1.cc in Sources */ = {isa = PBXBuildFile; fileRef = 4089A02C0FFACF7F000B29AE /* sample1.cc */; }; + 4089A0460FFAD1BE000B29AE /* sample1_unittest.cc in Sources */ = {isa = PBXBuildFile; fileRef = 4089A02E0FFACF7F000B29AE /* sample1_unittest.cc */; }; + 40C848FF101A21150083642A /* gtest-all.cc in Sources */ = {isa = PBXBuildFile; fileRef = 224A12A10E9EADA700BD17FD /* gtest-all.cc */; }; + 40C84915101A21DF0083642A /* gtest_main.cc in Sources */ = {isa = PBXBuildFile; fileRef = 4048840D0E2F799B00CF7658 /* gtest_main.cc */; }; + 40C84916101A235B0083642A /* libgtest_main.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 40C8490B101A217E0083642A /* libgtest_main.a */; }; + 40C84921101A23AD0083642A /* libgtest_main.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 40C8490B101A217E0083642A /* libgtest_main.a */; }; + 40C84978101A36540083642A /* libgtest_main.a in Resources */ = {isa = PBXBuildFile; fileRef = 40C8490B101A217E0083642A /* libgtest_main.a */; }; + 40C84980101A36850083642A /* gtest_unittest.cc in Sources */ = {isa = PBXBuildFile; fileRef = 3B238C120E7FE13C00846E11 /* gtest_unittest.cc */; }; + 40C84982101A36850083642A /* libgtest.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 40C848FA101A209C0083642A /* libgtest.a */; }; + 40C84983101A36850083642A /* libgtest_main.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 40C8490B101A217E0083642A /* libgtest_main.a */; }; + 40C8498F101A36A60083642A /* sample1.cc in Sources */ = {isa = PBXBuildFile; fileRef = 4089A02C0FFACF7F000B29AE /* sample1.cc */; }; + 40C84990101A36A60083642A /* sample1_unittest.cc in Sources */ = {isa = PBXBuildFile; fileRef = 4089A02E0FFACF7F000B29AE /* 
sample1_unittest.cc */; }; + 40C84992101A36A60083642A /* libgtest.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 40C848FA101A209C0083642A /* libgtest.a */; }; + 40C84993101A36A60083642A /* libgtest_main.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 40C8490B101A217E0083642A /* libgtest_main.a */; }; + 40C849A2101A37050083642A /* gtest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4539C8FF0EC27F6400A70F4C /* gtest.framework */; }; + 40C849A4101A37150083642A /* gtest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4539C8FF0EC27F6400A70F4C /* gtest.framework */; }; + 4539C9340EC280AE00A70F4C /* gtest-param-test.h in Headers */ = {isa = PBXBuildFile; fileRef = 4539C9330EC280AE00A70F4C /* gtest-param-test.h */; settings = {ATTRIBUTES = (Public, ); }; }; + 4539C9380EC280E200A70F4C /* gtest-linked_ptr.h in Copy Headers Internal */ = {isa = PBXBuildFile; fileRef = 4539C9350EC280E200A70F4C /* gtest-linked_ptr.h */; }; + 4539C9390EC280E200A70F4C /* gtest-param-util-generated.h in Copy Headers Internal */ = {isa = PBXBuildFile; fileRef = 4539C9360EC280E200A70F4C /* gtest-param-util-generated.h */; }; + 4539C93A0EC280E200A70F4C /* gtest-param-util.h in Copy Headers Internal */ = {isa = PBXBuildFile; fileRef = 4539C9370EC280E200A70F4C /* gtest-param-util.h */; }; + 4567C8181264FF71007740BE /* gtest-printers.h in Headers */ = {isa = PBXBuildFile; fileRef = 4567C8171264FF71007740BE /* gtest-printers.h */; settings = {ATTRIBUTES = (Public, ); }; }; + F67D4F3E1C7F5D8B0017C729 /* gtest-port-arch.h in Headers */ = {isa = PBXBuildFile; fileRef = F67D4F3D1C7F5D8B0017C729 /* gtest-port-arch.h */; }; + F67D4F3F1C7F5DA70017C729 /* gtest-port-arch.h in Copy Headers Internal */ = {isa = PBXBuildFile; fileRef = F67D4F3D1C7F5D8B0017C729 /* gtest-port-arch.h */; }; + F67D4F441C7F5DD00017C729 /* gtest-port.h in Headers */ = {isa = PBXBuildFile; fileRef = F67D4F411C7F5DD00017C729 /* gtest-port.h */; }; + F67D4F451C7F5DD00017C729 /* gtest-printers.h in Headers */ = {isa = PBXBuildFile; fileRef = F67D4F421C7F5DD00017C729 /* gtest-printers.h */; }; + F67D4F461C7F5DD00017C729 /* gtest.h in Headers */ = {isa = PBXBuildFile; fileRef = F67D4F431C7F5DD00017C729 /* gtest.h */; }; + F67D4F481C7F5E160017C729 /* gtest-port.h in Copy Headers Internal Custom */ = {isa = PBXBuildFile; fileRef = F67D4F411C7F5DD00017C729 /* gtest-port.h */; }; + F67D4F491C7F5E260017C729 /* gtest-printers.h in Copy Headers Internal Custom */ = {isa = PBXBuildFile; fileRef = F67D4F421C7F5DD00017C729 /* gtest-printers.h */; }; +/* End PBXBuildFile section */ + +/* Begin PBXContainerItemProxy section */ + 40899F9C0FFA740F000B29AE /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 0867D690FE84028FC02AAC07 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 40899F420FFA7184000B29AE; + remoteInfo = gtest_unittest; + }; + 4089A0970FFAD34A000B29AE /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 0867D690FE84028FC02AAC07 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 4089A0120FFACEFC000B29AE; + remoteInfo = sample1_unittest; + }; + 408BEC0F1046CFE900DEF522 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 0867D690FE84028FC02AAC07 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 40C848F9101A209C0083642A; + remoteInfo = "gtest-static"; + }; + 40C44AE50E379922008FCC51 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 0867D690FE84028FC02AAC07 /* Project 
object */; + proxyType = 1; + remoteGlobalIDString = 40C44ADC0E3798F4008FCC51; + remoteInfo = Version.h; + }; + 40C8497C101A36850083642A /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 0867D690FE84028FC02AAC07 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 40C848F9101A209C0083642A; + remoteInfo = "gtest-static"; + }; + 40C8497E101A36850083642A /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 0867D690FE84028FC02AAC07 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 40C8490A101A217E0083642A; + remoteInfo = "gtest_main-static"; + }; + 40C8498B101A36A60083642A /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 0867D690FE84028FC02AAC07 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 40C848F9101A209C0083642A; + remoteInfo = "gtest-static"; + }; + 40C8498D101A36A60083642A /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 0867D690FE84028FC02AAC07 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 40C8490A101A217E0083642A; + remoteInfo = "gtest_main-static"; + }; + 40C8499B101A36DC0083642A /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 0867D690FE84028FC02AAC07 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 40C8490A101A217E0083642A; + remoteInfo = "gtest_main-static"; + }; + 40C8499D101A36E50083642A /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 0867D690FE84028FC02AAC07 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 8D07F2BC0486CC7A007CD1D0; + remoteInfo = "gtest-framework"; + }; + 40C8499F101A36F10083642A /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 0867D690FE84028FC02AAC07 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 8D07F2BC0486CC7A007CD1D0; + remoteInfo = "gtest-framework"; + }; + 40C849F6101A43440083642A /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 0867D690FE84028FC02AAC07 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 40C8497A101A36850083642A; + remoteInfo = "gtest_unittest-static"; + }; + 40C849F8101A43490083642A /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 0867D690FE84028FC02AAC07 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 40C84989101A36A60083642A; + remoteInfo = "sample1_unittest-static"; + }; +/* End PBXContainerItemProxy section */ + +/* Begin PBXCopyFilesBuildPhase section */ + 404884A50E2F7C0400CF7658 /* Copy Headers Internal */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 2147483647; + dstPath = Headers/internal; + dstSubfolderSpec = 6; + files = ( + F67D4F3F1C7F5DA70017C729 /* gtest-port-arch.h in Copy Headers Internal */, + 404884A00E2F7BE600CF7658 /* gtest-death-test-internal.h in Copy Headers Internal */, + 404884A10E2F7BE600CF7658 /* gtest-filepath.h in Copy Headers Internal */, + 404884A20E2F7BE600CF7658 /* gtest-internal.h in Copy Headers Internal */, + 4539C9380EC280E200A70F4C /* gtest-linked_ptr.h in Copy Headers Internal */, + 4539C9390EC280E200A70F4C /* gtest-param-util-generated.h in Copy Headers Internal */, + 4539C93A0EC280E200A70F4C /* gtest-param-util.h in Copy Headers Internal */, + 404884A30E2F7BE600CF7658 /* gtest-port.h in Copy Headers Internal */, + 404884A40E2F7BE600CF7658 /* gtest-string.h in Copy Headers Internal */, + 40899F500FFA7281000B29AE /* gtest-tuple.h in Copy Headers Internal */, 
+ 3BF6F2A00E79B5AD000F2EEE /* gtest-type-util.h in Copy Headers Internal */, + ); + name = "Copy Headers Internal"; + runOnlyForDeploymentPostprocessing = 0; + }; + F67D4F471C7F5DF60017C729 /* Copy Headers Internal Custom */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 2147483647; + dstPath = Headers/internal/custom; + dstSubfolderSpec = 6; + files = ( + F67D4F491C7F5E260017C729 /* gtest-printers.h in Copy Headers Internal Custom */, + F67D4F481C7F5E160017C729 /* gtest-port.h in Copy Headers Internal Custom */, + ); + name = "Copy Headers Internal Custom"; + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXCopyFilesBuildPhase section */ + +/* Begin PBXFileReference section */ + 224A12A10E9EADA700BD17FD /* gtest-all.cc */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = "gtest-all.cc"; sourceTree = ""; }; + 224A12A20E9EADCC00BD17FD /* gtest-test-part.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = "gtest-test-part.h"; sourceTree = ""; }; + 3B238C120E7FE13C00846E11 /* gtest_unittest.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = gtest_unittest.cc; sourceTree = ""; }; + 3B87D2100E96B92E000D1852 /* runtests.sh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.sh; path = runtests.sh; sourceTree = ""; }; + 3BF6F29F0E79B5AD000F2EEE /* gtest-type-util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "gtest-type-util.h"; sourceTree = ""; }; + 3BF6F2A40E79B616000F2EEE /* gtest-typed-test.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "gtest-typed-test.h"; sourceTree = ""; }; + 403EE37C0E377822004BD1E2 /* versiongenerate.py */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.python; path = versiongenerate.py; sourceTree = ""; }; + 404883DB0E2F799B00CF7658 /* gtest-death-test.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "gtest-death-test.h"; sourceTree = ""; }; + 404883DC0E2F799B00CF7658 /* gtest-message.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "gtest-message.h"; sourceTree = ""; }; + 404883DD0E2F799B00CF7658 /* gtest-spi.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "gtest-spi.h"; sourceTree = ""; }; + 404883DE0E2F799B00CF7658 /* gtest.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = gtest.h; sourceTree = ""; }; + 404883DF0E2F799B00CF7658 /* gtest_pred_impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = gtest_pred_impl.h; sourceTree = ""; }; + 404883E00E2F799B00CF7658 /* gtest_prod.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = gtest_prod.h; sourceTree = ""; }; + 404883E20E2F799B00CF7658 /* gtest-death-test-internal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "gtest-death-test-internal.h"; sourceTree = ""; }; + 404883E30E2F799B00CF7658 /* gtest-filepath.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "gtest-filepath.h"; sourceTree = ""; }; + 404883E40E2F799B00CF7658 /* gtest-internal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "gtest-internal.h"; sourceTree = ""; }; + 
404883E50E2F799B00CF7658 /* gtest-port.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "gtest-port.h"; sourceTree = ""; }; + 404883E60E2F799B00CF7658 /* gtest-string.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "gtest-string.h"; sourceTree = ""; }; + 404883F60E2F799B00CF7658 /* README.md */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; name = README.md; path = ../README.md; sourceTree = SOURCE_ROOT; }; + 4048840D0E2F799B00CF7658 /* gtest_main.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = gtest_main.cc; sourceTree = ""; }; + 404884A90E2F7CD900CF7658 /* CHANGES */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; name = CHANGES; path = ../CHANGES; sourceTree = SOURCE_ROOT; }; + 404884AA0E2F7CD900CF7658 /* CONTRIBUTORS */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; name = CONTRIBUTORS; path = ../CONTRIBUTORS; sourceTree = SOURCE_ROOT; }; + 404884AB0E2F7CD900CF7658 /* LICENSE */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; name = LICENSE; path = ../LICENSE; sourceTree = SOURCE_ROOT; }; + 40899F430FFA7184000B29AE /* gtest_unittest-framework */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "gtest_unittest-framework"; sourceTree = BUILT_PRODUCTS_DIR; }; + 40899F4D0FFA7271000B29AE /* gtest-tuple.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "gtest-tuple.h"; sourceTree = ""; }; + 40899FB30FFA7567000B29AE /* StaticLibraryTarget.xcconfig */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xcconfig; path = StaticLibraryTarget.xcconfig; sourceTree = ""; }; + 4089A0130FFACEFC000B29AE /* sample1_unittest-framework */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "sample1_unittest-framework"; sourceTree = BUILT_PRODUCTS_DIR; }; + 4089A02C0FFACF7F000B29AE /* sample1.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sample1.cc; sourceTree = ""; }; + 4089A02D0FFACF7F000B29AE /* sample1.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sample1.h; sourceTree = ""; }; + 4089A02E0FFACF7F000B29AE /* sample1_unittest.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sample1_unittest.cc; sourceTree = ""; }; + 40C848FA101A209C0083642A /* libgtest.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libgtest.a; sourceTree = BUILT_PRODUCTS_DIR; }; + 40C8490B101A217E0083642A /* libgtest_main.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libgtest_main.a; sourceTree = BUILT_PRODUCTS_DIR; }; + 40C84987101A36850083642A /* gtest_unittest */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = gtest_unittest; sourceTree = BUILT_PRODUCTS_DIR; }; + 40C84997101A36A60083642A /* sample1_unittest-static */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "sample1_unittest-static"; sourceTree = BUILT_PRODUCTS_DIR; }; + 40D4CDF10E30E07400294801 /* DebugProject.xcconfig */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xcconfig; path = DebugProject.xcconfig; sourceTree = ""; 
}; + 40D4CDF20E30E07400294801 /* FrameworkTarget.xcconfig */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xcconfig; path = FrameworkTarget.xcconfig; sourceTree = ""; }; + 40D4CDF30E30E07400294801 /* General.xcconfig */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xcconfig; path = General.xcconfig; sourceTree = ""; }; + 40D4CDF40E30E07400294801 /* ReleaseProject.xcconfig */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xcconfig; path = ReleaseProject.xcconfig; sourceTree = ""; }; + 40D4CF510E30F5E200294801 /* Info.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + 4539C8FF0EC27F6400A70F4C /* gtest.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = gtest.framework; sourceTree = BUILT_PRODUCTS_DIR; }; + 4539C9330EC280AE00A70F4C /* gtest-param-test.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "gtest-param-test.h"; sourceTree = ""; }; + 4539C9350EC280E200A70F4C /* gtest-linked_ptr.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "gtest-linked_ptr.h"; sourceTree = ""; }; + 4539C9360EC280E200A70F4C /* gtest-param-util-generated.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "gtest-param-util-generated.h"; sourceTree = ""; }; + 4539C9370EC280E200A70F4C /* gtest-param-util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "gtest-param-util.h"; sourceTree = ""; }; + 4567C8171264FF71007740BE /* gtest-printers.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "gtest-printers.h"; sourceTree = ""; }; + F67D4F3D1C7F5D8B0017C729 /* gtest-port-arch.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "gtest-port-arch.h"; sourceTree = ""; }; + F67D4F411C7F5DD00017C729 /* gtest-port.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "gtest-port.h"; sourceTree = ""; }; + F67D4F421C7F5DD00017C729 /* gtest-printers.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "gtest-printers.h"; sourceTree = ""; }; + F67D4F431C7F5DD00017C729 /* gtest.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = gtest.h; sourceTree = ""; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + 40899F410FFA7184000B29AE /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 40C849A4101A37150083642A /* gtest.framework in Frameworks */, + 40C84916101A235B0083642A /* libgtest_main.a in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 4089A0110FFACEFC000B29AE /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 40C849A2101A37050083642A /* gtest.framework in Frameworks */, + 40C84921101A23AD0083642A /* libgtest_main.a in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 40C84981101A36850083642A /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 40C84982101A36850083642A /* libgtest.a in Frameworks */, + 40C84983101A36850083642A /* libgtest_main.a in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 40C84991101A36A60083642A /* 
Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 40C84992101A36A60083642A /* libgtest.a in Frameworks */, + 40C84993101A36A60083642A /* libgtest_main.a in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + 034768DDFF38A45A11DB9C8B /* Products */ = { + isa = PBXGroup; + children = ( + 4539C8FF0EC27F6400A70F4C /* gtest.framework */, + 40C848FA101A209C0083642A /* libgtest.a */, + 40C8490B101A217E0083642A /* libgtest_main.a */, + 40899F430FFA7184000B29AE /* gtest_unittest-framework */, + 40C84987101A36850083642A /* gtest_unittest */, + 4089A0130FFACEFC000B29AE /* sample1_unittest-framework */, + 40C84997101A36A60083642A /* sample1_unittest-static */, + ); + name = Products; + sourceTree = ""; + }; + 0867D691FE84028FC02AAC07 /* gtest */ = { + isa = PBXGroup; + children = ( + 40D4CDF00E30E07400294801 /* Config */, + 08FB77ACFE841707C02AAC07 /* Source */, + 40D4CF4E0E30F5E200294801 /* Resources */, + 403EE37B0E377822004BD1E2 /* Scripts */, + 034768DDFF38A45A11DB9C8B /* Products */, + ); + name = gtest; + sourceTree = ""; + }; + 08FB77ACFE841707C02AAC07 /* Source */ = { + isa = PBXGroup; + children = ( + 404884A90E2F7CD900CF7658 /* CHANGES */, + 404884AA0E2F7CD900CF7658 /* CONTRIBUTORS */, + 404884AB0E2F7CD900CF7658 /* LICENSE */, + 404883F60E2F799B00CF7658 /* README.md */, + 404883D90E2F799B00CF7658 /* include */, + 4089A02F0FFACF84000B29AE /* samples */, + 404884070E2F799B00CF7658 /* src */, + 3B238BF00E7FE13B00846E11 /* test */, + ); + name = Source; + sourceTree = ""; + }; + 3B238BF00E7FE13B00846E11 /* test */ = { + isa = PBXGroup; + children = ( + 3B238C120E7FE13C00846E11 /* gtest_unittest.cc */, + ); + name = test; + path = ../test; + sourceTree = SOURCE_ROOT; + }; + 403EE37B0E377822004BD1E2 /* Scripts */ = { + isa = PBXGroup; + children = ( + 403EE37C0E377822004BD1E2 /* versiongenerate.py */, + 3B87D2100E96B92E000D1852 /* runtests.sh */, + ); + path = Scripts; + sourceTree = ""; + }; + 404883D90E2F799B00CF7658 /* include */ = { + isa = PBXGroup; + children = ( + 404883DA0E2F799B00CF7658 /* gtest */, + ); + name = include; + path = ../include; + sourceTree = SOURCE_ROOT; + }; + 404883DA0E2F799B00CF7658 /* gtest */ = { + isa = PBXGroup; + children = ( + 404883E10E2F799B00CF7658 /* internal */, + 224A12A20E9EADCC00BD17FD /* gtest-test-part.h */, + 404883DB0E2F799B00CF7658 /* gtest-death-test.h */, + 404883DC0E2F799B00CF7658 /* gtest-message.h */, + 4539C9330EC280AE00A70F4C /* gtest-param-test.h */, + 4567C8171264FF71007740BE /* gtest-printers.h */, + 404883DD0E2F799B00CF7658 /* gtest-spi.h */, + 404883DE0E2F799B00CF7658 /* gtest.h */, + 404883DF0E2F799B00CF7658 /* gtest_pred_impl.h */, + 404883E00E2F799B00CF7658 /* gtest_prod.h */, + 3BF6F2A40E79B616000F2EEE /* gtest-typed-test.h */, + ); + path = gtest; + sourceTree = ""; + }; + 404883E10E2F799B00CF7658 /* internal */ = { + isa = PBXGroup; + children = ( + F67D4F401C7F5DD00017C729 /* custom */, + 404883E20E2F799B00CF7658 /* gtest-death-test-internal.h */, + 404883E30E2F799B00CF7658 /* gtest-filepath.h */, + 404883E40E2F799B00CF7658 /* gtest-internal.h */, + 4539C9350EC280E200A70F4C /* gtest-linked_ptr.h */, + 4539C9360EC280E200A70F4C /* gtest-param-util-generated.h */, + 4539C9370EC280E200A70F4C /* gtest-param-util.h */, + 404883E50E2F799B00CF7658 /* gtest-port.h */, + F67D4F3D1C7F5D8B0017C729 /* gtest-port-arch.h */, + 404883E60E2F799B00CF7658 /* gtest-string.h */, + 40899F4D0FFA7271000B29AE /* 
gtest-tuple.h */, + 3BF6F29F0E79B5AD000F2EEE /* gtest-type-util.h */, + ); + path = internal; + sourceTree = ""; + }; + 404884070E2F799B00CF7658 /* src */ = { + isa = PBXGroup; + children = ( + 224A12A10E9EADA700BD17FD /* gtest-all.cc */, + 4048840D0E2F799B00CF7658 /* gtest_main.cc */, + ); + name = src; + path = ../src; + sourceTree = SOURCE_ROOT; + }; + 4089A02F0FFACF84000B29AE /* samples */ = { + isa = PBXGroup; + children = ( + 4089A02C0FFACF7F000B29AE /* sample1.cc */, + 4089A02D0FFACF7F000B29AE /* sample1.h */, + 4089A02E0FFACF7F000B29AE /* sample1_unittest.cc */, + ); + name = samples; + path = ../samples; + sourceTree = SOURCE_ROOT; + }; + 40D4CDF00E30E07400294801 /* Config */ = { + isa = PBXGroup; + children = ( + 40D4CDF10E30E07400294801 /* DebugProject.xcconfig */, + 40D4CDF20E30E07400294801 /* FrameworkTarget.xcconfig */, + 40D4CDF30E30E07400294801 /* General.xcconfig */, + 40D4CDF40E30E07400294801 /* ReleaseProject.xcconfig */, + 40899FB30FFA7567000B29AE /* StaticLibraryTarget.xcconfig */, + ); + path = Config; + sourceTree = ""; + }; + 40D4CF4E0E30F5E200294801 /* Resources */ = { + isa = PBXGroup; + children = ( + 40D4CF510E30F5E200294801 /* Info.plist */, + ); + path = Resources; + sourceTree = ""; + }; + F67D4F401C7F5DD00017C729 /* custom */ = { + isa = PBXGroup; + children = ( + F67D4F411C7F5DD00017C729 /* gtest-port.h */, + F67D4F421C7F5DD00017C729 /* gtest-printers.h */, + F67D4F431C7F5DD00017C729 /* gtest.h */, + ); + path = custom; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXHeadersBuildPhase section */ + 8D07F2BD0486CC7A007CD1D0 /* Headers */ = { + isa = PBXHeadersBuildPhase; + buildActionMask = 2147483647; + files = ( + F67D4F451C7F5DD00017C729 /* gtest-printers.h in Headers */, + 404884380E2F799B00CF7658 /* gtest-death-test.h in Headers */, + 404884390E2F799B00CF7658 /* gtest-message.h in Headers */, + 4539C9340EC280AE00A70F4C /* gtest-param-test.h in Headers */, + F67D4F461C7F5DD00017C729 /* gtest.h in Headers */, + F67D4F441C7F5DD00017C729 /* gtest-port.h in Headers */, + 4567C8181264FF71007740BE /* gtest-printers.h in Headers */, + F67D4F3E1C7F5D8B0017C729 /* gtest-port-arch.h in Headers */, + 3BF6F2A50E79B616000F2EEE /* gtest-typed-test.h in Headers */, + 4048843A0E2F799B00CF7658 /* gtest-spi.h in Headers */, + 4048843B0E2F799B00CF7658 /* gtest.h in Headers */, + 4048843C0E2F799B00CF7658 /* gtest_pred_impl.h in Headers */, + 4048843D0E2F799B00CF7658 /* gtest_prod.h in Headers */, + 224A12A30E9EADCC00BD17FD /* gtest-test-part.h in Headers */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXHeadersBuildPhase section */ + +/* Begin PBXNativeTarget section */ + 40899F420FFA7184000B29AE /* gtest_unittest-framework */ = { + isa = PBXNativeTarget; + buildConfigurationList = 40899F4A0FFA71BC000B29AE /* Build configuration list for PBXNativeTarget "gtest_unittest-framework" */; + buildPhases = ( + 40899F400FFA7184000B29AE /* Sources */, + 40899F410FFA7184000B29AE /* Frameworks */, + ); + buildRules = ( + ); + dependencies = ( + 40C849A0101A36F10083642A /* PBXTargetDependency */, + ); + name = "gtest_unittest-framework"; + productName = gtest_unittest; + productReference = 40899F430FFA7184000B29AE /* gtest_unittest-framework */; + productType = "com.apple.product-type.tool"; + }; + 4089A0120FFACEFC000B29AE /* sample1_unittest-framework */ = { + isa = PBXNativeTarget; + buildConfigurationList = 4089A0240FFACF01000B29AE /* Build configuration list for PBXNativeTarget "sample1_unittest-framework" */; + buildPhases = ( + 
4089A0100FFACEFC000B29AE /* Sources */, + 4089A0110FFACEFC000B29AE /* Frameworks */, + ); + buildRules = ( + ); + dependencies = ( + 40C8499E101A36E50083642A /* PBXTargetDependency */, + ); + name = "sample1_unittest-framework"; + productName = sample1_unittest; + productReference = 4089A0130FFACEFC000B29AE /* sample1_unittest-framework */; + productType = "com.apple.product-type.tool"; + }; + 40C848F9101A209C0083642A /* gtest-static */ = { + isa = PBXNativeTarget; + buildConfigurationList = 40C84902101A212E0083642A /* Build configuration list for PBXNativeTarget "gtest-static" */; + buildPhases = ( + 40C848F7101A209C0083642A /* Sources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = "gtest-static"; + productName = "gtest-static"; + productReference = 40C848FA101A209C0083642A /* libgtest.a */; + productType = "com.apple.product-type.library.static"; + }; + 40C8490A101A217E0083642A /* gtest_main-static */ = { + isa = PBXNativeTarget; + buildConfigurationList = 40C84912101A21D20083642A /* Build configuration list for PBXNativeTarget "gtest_main-static" */; + buildPhases = ( + 40C84908101A217E0083642A /* Sources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = "gtest_main-static"; + productName = "gtest_main-static"; + productReference = 40C8490B101A217E0083642A /* libgtest_main.a */; + productType = "com.apple.product-type.library.static"; + }; + 40C8497A101A36850083642A /* gtest_unittest-static */ = { + isa = PBXNativeTarget; + buildConfigurationList = 40C84984101A36850083642A /* Build configuration list for PBXNativeTarget "gtest_unittest-static" */; + buildPhases = ( + 40C8497F101A36850083642A /* Sources */, + 40C84981101A36850083642A /* Frameworks */, + ); + buildRules = ( + ); + dependencies = ( + 40C8497B101A36850083642A /* PBXTargetDependency */, + 40C8497D101A36850083642A /* PBXTargetDependency */, + ); + name = "gtest_unittest-static"; + productName = gtest_unittest; + productReference = 40C84987101A36850083642A /* gtest_unittest */; + productType = "com.apple.product-type.tool"; + }; + 40C84989101A36A60083642A /* sample1_unittest-static */ = { + isa = PBXNativeTarget; + buildConfigurationList = 40C84994101A36A60083642A /* Build configuration list for PBXNativeTarget "sample1_unittest-static" */; + buildPhases = ( + 40C8498E101A36A60083642A /* Sources */, + 40C84991101A36A60083642A /* Frameworks */, + ); + buildRules = ( + ); + dependencies = ( + 40C8498A101A36A60083642A /* PBXTargetDependency */, + 40C8498C101A36A60083642A /* PBXTargetDependency */, + ); + name = "sample1_unittest-static"; + productName = sample1_unittest; + productReference = 40C84997101A36A60083642A /* sample1_unittest-static */; + productType = "com.apple.product-type.tool"; + }; + 8D07F2BC0486CC7A007CD1D0 /* gtest-framework */ = { + isa = PBXNativeTarget; + buildConfigurationList = 4FADC24208B4156D00ABE55E /* Build configuration list for PBXNativeTarget "gtest-framework" */; + buildPhases = ( + 8D07F2C10486CC7A007CD1D0 /* Sources */, + 8D07F2BD0486CC7A007CD1D0 /* Headers */, + 404884A50E2F7C0400CF7658 /* Copy Headers Internal */, + F67D4F471C7F5DF60017C729 /* Copy Headers Internal Custom */, + 8D07F2BF0486CC7A007CD1D0 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + 40C44AE60E379922008FCC51 /* PBXTargetDependency */, + 408BEC101046CFE900DEF522 /* PBXTargetDependency */, + 40C8499C101A36DC0083642A /* PBXTargetDependency */, + ); + name = "gtest-framework"; + productInstallPath = "$(HOME)/Library/Frameworks"; + productName = gtest; + productReference = 
4539C8FF0EC27F6400A70F4C /* gtest.framework */; + productType = "com.apple.product-type.framework"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + 0867D690FE84028FC02AAC07 /* Project object */ = { + isa = PBXProject; + attributes = { + LastUpgradeCheck = 0460; + }; + buildConfigurationList = 4FADC24608B4156D00ABE55E /* Build configuration list for PBXProject "gtest" */; + compatibilityVersion = "Xcode 3.2"; + developmentRegion = English; + hasScannedForEncodings = 1; + knownRegions = ( + English, + Japanese, + French, + German, + en, + ); + mainGroup = 0867D691FE84028FC02AAC07 /* gtest */; + productRefGroup = 034768DDFF38A45A11DB9C8B /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + 8D07F2BC0486CC7A007CD1D0 /* gtest-framework */, + 40C848F9101A209C0083642A /* gtest-static */, + 40C8490A101A217E0083642A /* gtest_main-static */, + 40899F420FFA7184000B29AE /* gtest_unittest-framework */, + 40C8497A101A36850083642A /* gtest_unittest-static */, + 4089A0120FFACEFC000B29AE /* sample1_unittest-framework */, + 40C84989101A36A60083642A /* sample1_unittest-static */, + 3B238F5F0E828B5400846E11 /* Check */, + 40C44ADC0E3798F4008FCC51 /* Version Info */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + 8D07F2BF0486CC7A007CD1D0 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 404884500E2F799B00CF7658 /* README.md in Resources */, + 404884AC0E2F7CD900CF7658 /* CHANGES in Resources */, + 404884AD0E2F7CD900CF7658 /* CONTRIBUTORS in Resources */, + 404884AE0E2F7CD900CF7658 /* LICENSE in Resources */, + 40C84978101A36540083642A /* libgtest_main.a in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXShellScriptBuildPhase section */ + 3B238F5E0E828B5400846E11 /* ShellScript */ = { + isa = PBXShellScriptBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + inputPaths = ( + ); + outputPaths = ( + ); + runOnlyForDeploymentPostprocessing = 0; + shellPath = /bin/sh; + shellScript = "# Remember, this \"Run Script\" build phase will be executed from $SRCROOT\n/bin/bash Scripts/runtests.sh"; + }; + 40C44ADB0E3798F4008FCC51 /* Generate Version.h */ = { + isa = PBXShellScriptBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + inputPaths = ( + "$(SRCROOT)/Scripts/versiongenerate.py", + "$(SRCROOT)/../configure.ac", + ); + name = "Generate Version.h"; + outputPaths = ( + "$(PROJECT_TEMP_DIR)/Version.h", + ); + runOnlyForDeploymentPostprocessing = 0; + shellPath = /bin/sh; + shellScript = "# Remember, this \"Run Script\" build phase will be executed from $SRCROOT\n/usr/bin/python Scripts/versiongenerate.py ../ $PROJECT_TEMP_DIR"; + }; +/* End PBXShellScriptBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + 40899F400FFA7184000B29AE /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 40899F530FFA72A0000B29AE /* gtest_unittest.cc in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 4089A0100FFACEFC000B29AE /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 4089A0440FFAD1BE000B29AE /* sample1.cc in Sources */, + 4089A0460FFAD1BE000B29AE /* sample1_unittest.cc in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 40C848F7101A209C0083642A /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 40C848FF101A21150083642A /* 
gtest-all.cc in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 40C84908101A217E0083642A /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 40C84915101A21DF0083642A /* gtest_main.cc in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 40C8497F101A36850083642A /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 40C84980101A36850083642A /* gtest_unittest.cc in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 40C8498E101A36A60083642A /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 40C8498F101A36A60083642A /* sample1.cc in Sources */, + 40C84990101A36A60083642A /* sample1_unittest.cc in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 8D07F2C10486CC7A007CD1D0 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 40899F3A0FFA70D4000B29AE /* gtest-all.cc in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin PBXTargetDependency section */ + 40899F9D0FFA740F000B29AE /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 40899F420FFA7184000B29AE /* gtest_unittest-framework */; + targetProxy = 40899F9C0FFA740F000B29AE /* PBXContainerItemProxy */; + }; + 4089A0980FFAD34A000B29AE /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 4089A0120FFACEFC000B29AE /* sample1_unittest-framework */; + targetProxy = 4089A0970FFAD34A000B29AE /* PBXContainerItemProxy */; + }; + 408BEC101046CFE900DEF522 /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 40C848F9101A209C0083642A /* gtest-static */; + targetProxy = 408BEC0F1046CFE900DEF522 /* PBXContainerItemProxy */; + }; + 40C44AE60E379922008FCC51 /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 40C44ADC0E3798F4008FCC51 /* Version Info */; + targetProxy = 40C44AE50E379922008FCC51 /* PBXContainerItemProxy */; + }; + 40C8497B101A36850083642A /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 40C848F9101A209C0083642A /* gtest-static */; + targetProxy = 40C8497C101A36850083642A /* PBXContainerItemProxy */; + }; + 40C8497D101A36850083642A /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 40C8490A101A217E0083642A /* gtest_main-static */; + targetProxy = 40C8497E101A36850083642A /* PBXContainerItemProxy */; + }; + 40C8498A101A36A60083642A /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 40C848F9101A209C0083642A /* gtest-static */; + targetProxy = 40C8498B101A36A60083642A /* PBXContainerItemProxy */; + }; + 40C8498C101A36A60083642A /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 40C8490A101A217E0083642A /* gtest_main-static */; + targetProxy = 40C8498D101A36A60083642A /* PBXContainerItemProxy */; + }; + 40C8499C101A36DC0083642A /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 40C8490A101A217E0083642A /* gtest_main-static */; + targetProxy = 40C8499B101A36DC0083642A /* PBXContainerItemProxy */; + }; + 40C8499E101A36E50083642A /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 8D07F2BC0486CC7A007CD1D0 /* gtest-framework */; + targetProxy = 40C8499D101A36E50083642A /* PBXContainerItemProxy */; + }; + 40C849A0101A36F10083642A /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 8D07F2BC0486CC7A007CD1D0 /* gtest-framework */; + targetProxy = 40C8499F101A36F10083642A /* 
PBXContainerItemProxy */; + }; + 40C849F7101A43440083642A /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 40C8497A101A36850083642A /* gtest_unittest-static */; + targetProxy = 40C849F6101A43440083642A /* PBXContainerItemProxy */; + }; + 40C849F9101A43490083642A /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 40C84989101A36A60083642A /* sample1_unittest-static */; + targetProxy = 40C849F8101A43490083642A /* PBXContainerItemProxy */; + }; +/* End PBXTargetDependency section */ + +/* Begin XCBuildConfiguration section */ + 3B238F600E828B5400846E11 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + COMBINE_HIDPI_IMAGES = YES; + COPY_PHASE_STRIP = NO; + GCC_DYNAMIC_NO_PIC = NO; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_VERSION = com.apple.compilers.llvm.clang.1_0; + PRODUCT_NAME = Check; + SDKROOT = macosx; + }; + name = Debug; + }; + 3B238F610E828B5400846E11 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + COMBINE_HIDPI_IMAGES = YES; + COPY_PHASE_STRIP = YES; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + GCC_VERSION = com.apple.compilers.llvm.clang.1_0; + PRODUCT_NAME = Check; + SDKROOT = macosx; + ZERO_LINK = NO; + }; + name = Release; + }; + 40899F450FFA7185000B29AE /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + GCC_VERSION = com.apple.compilers.llvm.clang.1_0; + HEADER_SEARCH_PATHS = ../; + PRODUCT_NAME = "gtest_unittest-framework"; + SDKROOT = macosx; + }; + name = Debug; + }; + 40899F460FFA7185000B29AE /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + GCC_VERSION = com.apple.compilers.llvm.clang.1_0; + HEADER_SEARCH_PATHS = ../; + PRODUCT_NAME = "gtest_unittest-framework"; + SDKROOT = macosx; + }; + name = Release; + }; + 4089A0150FFACEFD000B29AE /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + GCC_VERSION = com.apple.compilers.llvm.clang.1_0; + PRODUCT_NAME = "sample1_unittest-framework"; + SDKROOT = macosx; + }; + name = Debug; + }; + 4089A0160FFACEFD000B29AE /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + GCC_VERSION = com.apple.compilers.llvm.clang.1_0; + PRODUCT_NAME = "sample1_unittest-framework"; + SDKROOT = macosx; + }; + name = Release; + }; + 40C44ADF0E3798F4008FCC51 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + COMBINE_HIDPI_IMAGES = YES; + GCC_VERSION = com.apple.compilers.llvm.clang.1_0; + MACOSX_DEPLOYMENT_TARGET = 10.7; + PRODUCT_NAME = gtest; + SDKROOT = macosx; + TARGET_NAME = gtest; + }; + name = Debug; + }; + 40C44AE00E3798F4008FCC51 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + COMBINE_HIDPI_IMAGES = YES; + GCC_VERSION = com.apple.compilers.llvm.clang.1_0; + MACOSX_DEPLOYMENT_TARGET = 10.7; + PRODUCT_NAME = gtest; + SDKROOT = macosx; + TARGET_NAME = gtest; + }; + name = Release; + }; + 40C848FB101A209D0083642A /* Debug */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 40899FB30FFA7567000B29AE /* StaticLibraryTarget.xcconfig */; + buildSettings = { + COMBINE_HIDPI_IMAGES = YES; + GCC_INLINES_ARE_PRIVATE_EXTERN = YES; + GCC_SYMBOLS_PRIVATE_EXTERN = YES; + GCC_VERSION = com.apple.compilers.llvm.clang.1_0; + HEADER_SEARCH_PATHS = ( + ../, + ../include/, + ); + PRODUCT_NAME = gtest; + SDKROOT = macosx; + }; + name = Debug; + }; + 40C848FC101A209D0083642A /* Release */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 40899FB30FFA7567000B29AE /* StaticLibraryTarget.xcconfig */; + buildSettings = { + COMBINE_HIDPI_IMAGES = YES; + 
GCC_INLINES_ARE_PRIVATE_EXTERN = YES; + GCC_SYMBOLS_PRIVATE_EXTERN = YES; + GCC_VERSION = com.apple.compilers.llvm.clang.1_0; + HEADER_SEARCH_PATHS = ( + ../, + ../include/, + ); + PRODUCT_NAME = gtest; + SDKROOT = macosx; + }; + name = Release; + }; + 40C8490E101A217F0083642A /* Debug */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 40899FB30FFA7567000B29AE /* StaticLibraryTarget.xcconfig */; + buildSettings = { + COMBINE_HIDPI_IMAGES = YES; + GCC_VERSION = com.apple.compilers.llvm.clang.1_0; + HEADER_SEARCH_PATHS = ( + ../, + ../include/, + ); + PRODUCT_NAME = gtest_main; + SDKROOT = macosx; + }; + name = Debug; + }; + 40C8490F101A217F0083642A /* Release */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 40899FB30FFA7567000B29AE /* StaticLibraryTarget.xcconfig */; + buildSettings = { + COMBINE_HIDPI_IMAGES = YES; + GCC_VERSION = com.apple.compilers.llvm.clang.1_0; + HEADER_SEARCH_PATHS = ( + ../, + ../include/, + ); + PRODUCT_NAME = gtest_main; + SDKROOT = macosx; + }; + name = Release; + }; + 40C84985101A36850083642A /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + GCC_VERSION = com.apple.compilers.llvm.clang.1_0; + HEADER_SEARCH_PATHS = ../; + PRODUCT_NAME = gtest_unittest; + SDKROOT = macosx; + }; + name = Debug; + }; + 40C84986101A36850083642A /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + GCC_VERSION = com.apple.compilers.llvm.clang.1_0; + HEADER_SEARCH_PATHS = ../; + PRODUCT_NAME = gtest_unittest; + SDKROOT = macosx; + }; + name = Release; + }; + 40C84995101A36A60083642A /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + GCC_VERSION = com.apple.compilers.llvm.clang.1_0; + PRODUCT_NAME = "sample1_unittest-static"; + SDKROOT = macosx; + }; + name = Debug; + }; + 40C84996101A36A60083642A /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + GCC_VERSION = com.apple.compilers.llvm.clang.1_0; + PRODUCT_NAME = "sample1_unittest-static"; + SDKROOT = macosx; + }; + name = Release; + }; + 4FADC24308B4156D00ABE55E /* Debug */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 40D4CDF20E30E07400294801 /* FrameworkTarget.xcconfig */; + buildSettings = { + COMBINE_HIDPI_IMAGES = YES; + DYLIB_COMPATIBILITY_VERSION = 1; + DYLIB_CURRENT_VERSION = 1; + GCC_VERSION = com.apple.compilers.llvm.clang.1_0; + HEADER_SEARCH_PATHS = ( + ../, + ../include/, + ); + INFOPLIST_FILE = Resources/Info.plist; + INFOPLIST_PREFIX_HEADER = "$(PROJECT_TEMP_DIR)/Version.h"; + INFOPLIST_PREPROCESS = YES; + PRODUCT_NAME = gtest; + SDKROOT = macosx; + VERSIONING_SYSTEM = "apple-generic"; + }; + name = Debug; + }; + 4FADC24408B4156D00ABE55E /* Release */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 40D4CDF20E30E07400294801 /* FrameworkTarget.xcconfig */; + buildSettings = { + COMBINE_HIDPI_IMAGES = YES; + DYLIB_COMPATIBILITY_VERSION = 1; + DYLIB_CURRENT_VERSION = 1; + GCC_VERSION = com.apple.compilers.llvm.clang.1_0; + HEADER_SEARCH_PATHS = ( + ../, + ../include/, + ); + INFOPLIST_FILE = Resources/Info.plist; + INFOPLIST_PREFIX_HEADER = "$(PROJECT_TEMP_DIR)/Version.h"; + INFOPLIST_PREPROCESS = YES; + PRODUCT_NAME = gtest; + SDKROOT = macosx; + VERSIONING_SYSTEM = "apple-generic"; + }; + name = Release; + }; + 4FADC24708B4156D00ABE55E /* Debug */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 40D4CDF10E30E07400294801 /* DebugProject.xcconfig */; + buildSettings = { + CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; + CLANG_CXX_LIBRARY = "libc++"; + MACOSX_DEPLOYMENT_TARGET 
= 10.7; + }; + name = Debug; + }; + 4FADC24808B4156D00ABE55E /* Release */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 40D4CDF40E30E07400294801 /* ReleaseProject.xcconfig */; + buildSettings = { + CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; + CLANG_CXX_LIBRARY = "libc++"; + MACOSX_DEPLOYMENT_TARGET = 10.7; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + 3B238FA30E828BB600846E11 /* Build configuration list for PBXAggregateTarget "Check" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 3B238F600E828B5400846E11 /* Debug */, + 3B238F610E828B5400846E11 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 40899F4A0FFA71BC000B29AE /* Build configuration list for PBXNativeTarget "gtest_unittest-framework" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 40899F450FFA7185000B29AE /* Debug */, + 40899F460FFA7185000B29AE /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 4089A0240FFACF01000B29AE /* Build configuration list for PBXNativeTarget "sample1_unittest-framework" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 4089A0150FFACEFD000B29AE /* Debug */, + 4089A0160FFACEFD000B29AE /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 40C44AE40E379905008FCC51 /* Build configuration list for PBXAggregateTarget "Version Info" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 40C44ADF0E3798F4008FCC51 /* Debug */, + 40C44AE00E3798F4008FCC51 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 40C84902101A212E0083642A /* Build configuration list for PBXNativeTarget "gtest-static" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 40C848FB101A209D0083642A /* Debug */, + 40C848FC101A209D0083642A /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 40C84912101A21D20083642A /* Build configuration list for PBXNativeTarget "gtest_main-static" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 40C8490E101A217F0083642A /* Debug */, + 40C8490F101A217F0083642A /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 40C84984101A36850083642A /* Build configuration list for PBXNativeTarget "gtest_unittest-static" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 40C84985101A36850083642A /* Debug */, + 40C84986101A36850083642A /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 40C84994101A36A60083642A /* Build configuration list for PBXNativeTarget "sample1_unittest-static" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 40C84995101A36A60083642A /* Debug */, + 40C84996101A36A60083642A /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 4FADC24208B4156D00ABE55E /* Build configuration list for PBXNativeTarget "gtest-framework" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 4FADC24308B4156D00ABE55E /* Debug */, + 4FADC24408B4156D00ABE55E /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 4FADC24608B4156D00ABE55E /* Build configuration list for PBXProject "gtest" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 4FADC24708B4156D00ABE55E /* Debug */, + 
4FADC24808B4156D00ABE55E /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = 0867D690FE84028FC02AAC07 /* Project object */; +} diff --git a/inference-engine/tests/mock_engine/CMakeLists.txt b/inference-engine/tests/mock_engine/CMakeLists.txt index dc1edfb..a0f77cf 100644 --- a/inference-engine/tests/mock_engine/CMakeLists.txt +++ b/inference-engine/tests/mock_engine/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # @@ -17,24 +17,23 @@ file (GLOB LIBRARY_HEADERS if(UNIX) list(REMOVE_ITEM LIBRARY_SRC ${CMAKE_CURRENT_SOURCE_DIR}/dllmain.cpp) endif() -add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_API) # Create named folders for the sources within the .vcproj # Empty name lists them directly under the .vcproj source_group("src" FILES ${LIBRARY_SRC}) source_group("include" FILES ${LIBRARY_HEADERS}) -# Properties->C/C++->General->Additional Include Directories -include_directories (${IE_MAIN_SOURCE_DIR}/include - ${IE_MAIN_SOURCE_DIR}/src/inference_engine - ${IE_MAIN_SOURCE_DIR}/include - ${IE_MAIN_SOURCE_DIR}/thirdparty/pugixml/src) - # Create library file from sources. add_library(${TARGET_NAME} SHARED ${LIBRARY_SRC} ${LIBRARY_HEADERS}) +target_include_directories (${TARGET_NAME} PRIVATE + "${IE_MAIN_SOURCE_DIR}/src/inference_engine") + +target_link_libraries(${TARGET_NAME} PRIVATE inference_engine) + +target_compile_definitions(${TARGET_NAME} PRIVATE IMPLEMENT_INFERENCE_ENGINE_API) set_property(TARGET ${TARGET_NAME} PROPERTY CXX_STANDARD 11) set_property(TARGET ${TARGET_NAME} PROPERTY CXX_STANDARD_REQUIRED ON) diff --git a/inference-engine/tests/mock_engine/dllmain.cpp b/inference-engine/tests/mock_engine/dllmain.cpp index a9dd58a..88a8815 100644 --- a/inference-engine/tests/mock_engine/dllmain.cpp +++ b/inference-engine/tests/mock_engine/dllmain.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // // dllmain.cpp : Defines the entry point for the DLL application. diff --git a/inference-engine/tests/mock_engine/mock_plugin.cpp b/inference-engine/tests/mock_engine/mock_plugin.cpp index 0d344c8..587d224 100644 --- a/inference-engine/tests/mock_engine/mock_plugin.cpp +++ b/inference-engine/tests/mock_engine/mock_plugin.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/mock_engine/mock_plugin.hpp b/inference-engine/tests/mock_engine/mock_plugin.hpp index 9706381..3a2c952 100644 --- a/inference-engine/tests/mock_engine/mock_plugin.hpp +++ b/inference-engine/tests/mock_engine/mock_plugin.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/mock_engine/stub_inference_engine.xpp b/inference-engine/tests/mock_engine/stub_inference_engine.xpp index fa2d9de..008e9a0 100644 --- a/inference-engine/tests/mock_engine/stub_inference_engine.xpp +++ b/inference-engine/tests/mock_engine/stub_inference_engine.xpp @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2016-2018 Intel Corporation. 
+// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // #include diff --git a/inference-engine/tests/unit/CMakeLists.txt b/inference-engine/tests/unit/CMakeLists.txt index 4761e83..9c0f539 100644 --- a/inference-engine/tests/unit/CMakeLists.txt +++ b/inference-engine/tests/unit/CMakeLists.txt @@ -1,31 +1,15 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # -cmake_minimum_required(VERSION 2.8) cmake_policy(SET CMP0054 NEW) set(TARGET_NAME InferenceEngineUnitTests) #rpath enabled for unit tests only -SET (CMAKE_SKIP_RPATH FALSE) - -if (UNIX AND NOT APPLE) - set(ARCH_SPECIFIC_FOLDER_TBB /gcc4.4) - set(ARCH_SPECIFIC_FOLDER intel64_lin) - if ("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") - set(ARCH_SPECIFIC_FOLDER intel64_lin) - else ("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") - set(ARCH_SPECIFIC_FOLDER intel32_lin) - endif ("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") -else () - set(ARCH_SPECIFIC_FOLDER_TBB /vc_mt) - if ("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") - set(ARCH_SPECIFIC_FOLDER intel64_win) - else ("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") - set(ARCH_SPECIFIC_FOLDER ia32_win) - endif ("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") -endif () +SET (CMAKE_SKIP_RPATH OFF) + +# collect sources file(GLOB TEST_SRC @@ -34,6 +18,8 @@ file(GLOB inference_engine_tests/cpp_interfaces/*.cpp mem_solver/*.cpp cnn_network/*.cpp + builders/*.cpp + transformations/*.cpp shape_infer/*.cpp shape_infer/built-in/*.cpp topology_verification_tests/*.cpp @@ -57,7 +43,7 @@ endif() if (ENABLE_MKL_DNN) if (GEMM STREQUAL "MKL") add_definitions(-DUSE_MKL) - endif () + endif () file(GLOB MKLDNN_TESTS engines/mkldnn/*.cpp @@ -68,80 +54,49 @@ if (ENABLE_MKL_DNN) file(GLOB MKLDNN_TESTS_INCLUDE engines/mkldnn/graph/*.hpp) - if (USE_BOOST_RE) - debug_message(STATUS "Adding boost dependency") - if (VERBOSE_BUILD) - set(Boost_DEBUG on) - endif () - find_package(Boost REQUIRED COMPONENTS regex) - link_directories(${Boost_LIBRARY_DIRS}) - include_directories(${Boost_INCLUDE_DIRS}) - endif () - include_directories( ${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/include - engines/mkldnn/graph) + engines/mkldnn/graph + ${CMAKE_BINARY_DIR}/include/) source_group("mkldnn" FILES ${MKLDNN_TESTS} ${MKLDNN_TESTS_INCLUDE}) endif () file(GLOB TEST_INCLUDE - ${IE_MAIN_SOURCE_DIR}/tests/helpers/*.hpp shape_infer/*.hpp) source_group("src" FILES ${TEST_SRC}) source_group("include" FILES ${TEST_INCLUDE}) -include_directories( - ${IE_MAIN_SOURCE_DIR}/include - ${IE_MAIN_SOURCE_DIR}/src/inference_engine +# create target + +add_executable(${TARGET_NAME} ${TEST_SRC} ${TEST_INCLUDE} ${MKLDNN_TESTS} ${MKLDNN_TESTS_INCLUDE} ${DLAI_TESTS} transformations/sub_test.cpp transformations/tranformations_test.hpp) +set_ie_threading_interface_for(${TARGET_NAME}) + +target_include_directories(${TARGET_NAME} PRIVATE ${IE_MAIN_SOURCE_DIR}/src/mkldnn_plugin ${IE_MAIN_SOURCE_DIR}/src/gna_plugin + ${IE_MAIN_SOURCE_DIR}/src/inference_engine 
${IE_MAIN_SOURCE_DIR}/src/extension ${IE_MAIN_SOURCE_DIR}/src/extension/common - ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}/gflags/include - mocks) -add_executable(${TARGET_NAME} ${TEST_SRC} ${TEST_INCLUDE} ${MKLDNN_TESTS} ${MKLDNN_TESTS_INCLUDE} ${DLAI_TESTS}) -set_ie_threading_interface_for(${TARGET_NAME}) + "${CMAKE_CURRENT_SOURCE_DIR}/mocks") -set_target_properties(${TARGET_NAME} PROPERTIES "CMAKE_CXX_FLAGS" "${CMAKE_CXX_FLAGS} -fPIE" -COMPILE_PDB_NAME ${TARGET_NAME}) +set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}) ## Mock macros doesn't use "override" specificator target_compile_options(${TARGET_NAME} PRIVATE $<$<CXX_COMPILER_ID:Clang>: -Wno-inconsistent-missing-override >) - -if (FALSE) - add_custom_command( - TARGET ${TARGET_NAME} - POST_BUILD COMMAND cp ${CMAKE_CURRENT_SOURCE_DIR}/data/*.xml ${LIBRARY_OUTPUT_DIRECTORY} - POST_BUILD COMMAND cp -R ${IE_MAIN_SOURCE_DIR}/temp/models ${LIBRARY_OUTPUT_DIRECTORY}/models - ) -endif () - -if (MSVC) - set(PUGI pugixml_mt) -else () - set(PUGI pugixml) -endif () - -add_definitions(-DMODELS_PATH=\"${MODELS_PATH}\" -DDATA_PATH=\"${IE_MAIN_SOURCE_DIR}/tests/data\") +target_compile_options(${TARGET_NAME} PRIVATE $<$<CXX_COMPILER_ID:Clang>: -Wno-inconsistent-missing-override >) target_link_libraries(${TARGET_NAME} PRIVATE - gtest - gmock - gtest_main - inference_engine_s - ie_cpu_extension - helpers - ${PUGI} - ${LIB_DL} - ${MKLDNN_STATIC_ENGINE} - ${INTEL_ITT_LIBS} - ${Boost_REGEX_LIBRARY} - ${TBB_LIBRARY} - ${TBBMALLOC_LIBRARY} - ${GNA_TEST_ENGINE}) + gtest + gtest_main + gmock + gflags + inference_engine_s + helpers + ${CMAKE_DL_LIBS} + ${GNA_TEST_ENGINE}) add_dependencies(${TARGET_NAME} ie_cpu_extension) diff --git a/inference-engine/tests/unit/builders/argmax_layer_test.cpp b/inference-engine/tests/unit/builders/argmax_layer_test.cpp new file mode 100644 index 0000000..40e1595 --- /dev/null +++ b/inference-engine/tests/unit/builders/argmax_layer_test.cpp @@ -0,0 +1,47 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include <gtest/gtest.h> +#include <string.h> +#include <ie_builders.hpp> +#include <builders/ie_argmax_layer.hpp> + +#include "builder_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class ArgMaxLayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(ArgMaxLayerBuilderTest, getExistsLayerFromNetworkBuilder) { + Builder::Network network("network"); + Builder::ArgMaxLayer argMaxLayer("ArgMax layer"); + argMaxLayer.setAxis(1); + argMaxLayer.setOutMaxVal(0); + argMaxLayer.setTopK(20); + size_t ind = 0; + ASSERT_NO_THROW(ind = network.addLayer(argMaxLayer)); + Builder::ArgMaxLayer layerFromNetwork(network.getLayer(ind)); + ASSERT_EQ(argMaxLayer.getAxis(), layerFromNetwork.getAxis()); + ASSERT_EQ(argMaxLayer.getOutMaxVal(), layerFromNetwork.getOutMaxVal()); + ASSERT_EQ(argMaxLayer.getTopK(), layerFromNetwork.getTopK()); +} + +TEST_F(ArgMaxLayerBuilderTest, cannotAddLayerWithWrongAxis) { + Builder::Network network("network"); + Builder::ArgMaxLayer argMaxLayer("ArgMax layer"); + argMaxLayer.setAxis(500); // here + argMaxLayer.setOutMaxVal(0); + argMaxLayer.setTopK(20); + ASSERT_THROW(network.addLayer(argMaxLayer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ArgMaxLayerBuilderTest, cannotAddLayerWithWrongOutMaxVal) { + Builder::Network network("network"); + Builder::ArgMaxLayer argMaxLayer("ArgMax layer"); + argMaxLayer.setAxis(1); + argMaxLayer.setOutMaxVal(500); // here + argMaxLayer.setTopK(20); + ASSERT_THROW(network.addLayer(argMaxLayer), InferenceEngine::details::InferenceEngineException); +} \ No newline at end of file diff --git
a/inference-engine/tests/unit/builders/batch_normalization_layer_test.cpp b/inference-engine/tests/unit/builders/batch_normalization_layer_test.cpp index 5d55c17..1ae7f46 100644 --- a/inference-engine/tests/unit/builders/batch_normalization_layer_test.cpp +++ b/inference-engine/tests/unit/builders/batch_normalization_layer_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -14,23 +14,23 @@ using namespace InferenceEngine; class BatchNormalizationLayerBuilderTest : public BuilderTestCommon {}; -TEST_F(BatchNormalizationLayerBuilderTest, cannotCreateBatchNormalizationWithoutWeightOrBiases) { - ASSERT_THROW(((Builder::Layer)Builder::BatchNormalizationLayer("in1")), InferenceEngine::details::InferenceEngineException); - ASSERT_THROW(((Builder::Layer)Builder::BatchNormalizationLayer("in1") - .setWeights(generateBlob(Precision::FP32, {3}, Layout::C))), InferenceEngine::details::InferenceEngineException); - ASSERT_THROW(((Builder::Layer)Builder::BatchNormalizationLayer("in1") - .setBiases(generateBlob(Precision::FP32, {3}, Layout::C))), InferenceEngine::details::InferenceEngineException); -} +//TEST_F(BatchNormalizationLayerBuilderTest, cannotCreateBatchNormalizationWithoutWeightOrBiases) { +// ASSERT_THROW(((Builder::Layer)Builder::BatchNormalizationLayer("in1")), InferenceEngine::details::InferenceEngineException); +// ASSERT_THROW(((Builder::Layer)Builder::BatchNormalizationLayer("in1") +// .setWeights(generateBlob(Precision::FP32, {3}, Layout::C))), InferenceEngine::details::InferenceEngineException); +// ASSERT_THROW(((Builder::Layer)Builder::BatchNormalizationLayer("in1") +// .setBiases(generateBlob(Precision::FP32, {3}, Layout::C))), InferenceEngine::details::InferenceEngineException); +//} TEST_F(BatchNormalizationLayerBuilderTest, getExistsLayerFromNetworkBuilder) { Builder::Network network("Test"); + idx_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {3}, Layout::C))); + idx_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {3}, Layout::C))); Builder::BatchNormalizationLayer bnBuilder("bn"); - bnBuilder.setWeights(generateBlob(Precision::FP32, {3}, Layout::C)); - bnBuilder.setBiases(generateBlob(Precision::FP32, {3}, Layout::C)); - size_t bnId = network.addLayer(bnBuilder); + idx_t bnId = network.addLayer({{0}, {weightsId}, {biasesId}}, bnBuilder); Builder::BatchNormalizationLayer bnBuilderFromNetwork(network.getLayer(bnId)); ASSERT_EQ(bnBuilderFromNetwork.getEpsilon(), bnBuilder.getEpsilon()); bnBuilderFromNetwork.setEpsilon(2); ASSERT_NE(bnBuilderFromNetwork.getEpsilon(), bnBuilder.getEpsilon()); - ASSERT_EQ(bnBuilderFromNetwork.getEpsilon(), network.getLayer(bnId).getParameters()["epsilon"].asFloat()); + ASSERT_EQ(bnBuilderFromNetwork.getEpsilon(), network.getLayer(bnId)->getParameters()["epsilon"].as<float>()); } \ No newline at end of file diff --git a/inference-engine/tests/unit/builders/builder_test.hpp b/inference-engine/tests/unit/builders/builder_test.hpp index 28ef342..728a346 100644 --- a/inference-engine/tests/unit/builders/builder_test.hpp +++ b/inference-engine/tests/unit/builders/builder_test.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/builders/clamp_layer_test.cpp b/inference-engine/tests/unit/builders/clamp_layer_test.cpp new
file mode 100644 index 0000000..d912b26 --- /dev/null +++ b/inference-engine/tests/unit/builders/clamp_layer_test.cpp @@ -0,0 +1,49 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include <gtest/gtest.h> +#include <string.h> +#include <ie_builders.hpp> +#include <builders/ie_clamp_layer.hpp> + +#include "builder_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class ClampLayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(ClampLayerBuilderTest, getExistsLayerFromNetworkBuilder) { + Builder::Network net("network"); + Builder::ClampLayer clampLayer("clampLayer"); + clampLayer.setMinValue(0.1).setMaxValue(0.2); + size_t ind = net.addLayer(clampLayer); + Builder::ClampLayer layerFromNet(net.getLayer(ind)); + ASSERT_EQ(layerFromNet.getMinValue(), clampLayer.getMinValue()); + ASSERT_EQ(layerFromNet.getMaxValue(), clampLayer.getMaxValue()); +} + +TEST_F(ClampLayerBuilderTest, canCreateLayerWithZeroMinValue) { + Builder::Network net("network"); + Builder::ClampLayer clampLayer("clampLayer"); + clampLayer.setMinValue(0).setMaxValue(0.2); + ASSERT_NO_THROW(net.addLayer(clampLayer)); +} + +TEST_F(ClampLayerBuilderTest, cannotCreateLayerWithWrongMaxValue) { + Builder::Network net("network"); + Builder::ClampLayer clampLayer("clampLayer"); + clampLayer.setMinValue(10).setMaxValue(-0.2); + ASSERT_THROW(net.addLayer(clampLayer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ClampLayerBuilderTest, cannotCreateLayerWithWrongShapes) { + Builder::Network net("network"); + Builder::Layer::Ptr fakeClampLayerPtr = std::make_shared<Builder::Layer>("Clamp", "Clamp layer"); + fakeClampLayerPtr->getInputPorts().push_back(Port({1, 1, 1, 1})); + fakeClampLayerPtr->getOutputPorts().push_back(Port({1, 1, 1, 2})); + Builder::ClampLayer clampLayer(fakeClampLayerPtr); + clampLayer.setMinValue(0.0f).setMaxValue(1.0f); + ASSERT_THROW(net.addLayer(clampLayer), InferenceEngine::details::InferenceEngineException); +} \ No newline at end of file diff --git a/inference-engine/tests/unit/builders/concat_layer_test.cpp b/inference-engine/tests/unit/builders/concat_layer_test.cpp new file mode 100644 index 0000000..3c2ba90 --- /dev/null +++ b/inference-engine/tests/unit/builders/concat_layer_test.cpp @@ -0,0 +1,151 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include <gtest/gtest.h> +#include <string.h> +#include <ie_builders.hpp> +#include <builders/ie_concat_layer.hpp> + +#include "builder_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class ConcatLayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(ConcatLayerBuilderTest, getExistsLayerFromNetworkBuilderAxis) { + Builder::Network network("network"); + Builder::ConcatLayer layer("concat layer"); + + layer.setAxis(0); + layer.setInputPorts({Port({1, 2, 55, 55}), Port({3, 2, 55, 55})}); + layer.setOutputPort(Port({1 + 3, 2, 55, 55})); + + size_t ind = 0; + ASSERT_NO_THROW(ind = network.addLayer(layer)); + network.getLayer(ind)->validate(false); + ASSERT_NO_THROW(network.getLayer(ind)->validate(false)); + Builder::ConcatLayer layerFromNet(network.getLayer(ind)); + + ASSERT_EQ(layer.getAxis(), layerFromNet.getAxis()); + ASSERT_EQ(layer.getInputPorts(), layerFromNet.getInputPorts()); + ASSERT_EQ(layer.getOutputPort(), layerFromNet.getOutputPort()); +} + +TEST_F(ConcatLayerBuilderTest, cannotCreateLayerWithNoInputPorts) { + Builder::Network network("network"); + Builder::ConcatLayer layer("concat layer"); + + layer.setAxis(1); + layer.setOutputPort(Port({1, 2 + 4, 55, 55})); + // here should be layer.setInputPorts(...)
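+ // Note: addLayer() accepts the incomplete layer; the missing input ports only surface when validate(false) runs below.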
+ + size_t ind = 0; + ASSERT_NO_THROW(ind = network.addLayer(layer)); + ASSERT_THROW(network.getLayer(ind)->validate(false), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ConcatLayerBuilderTest, cannotCreateLayerWithOneInputPort) { + Builder::Network network("network"); + Builder::ConcatLayer layer("concat layer"); + + layer.setAxis(1); + layer.setInputPorts({Port({1, 2, 55, 55})}); // here + layer.setOutputPort(Port({1, 2 + 4, 55, 55})); + + size_t ind = 0; + ASSERT_NO_THROW(ind = network.addLayer(layer)); + ASSERT_THROW(network.getLayer(ind)->validate(false), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ConcatLayerBuilderTest, cannotCreateLayerWithWrongAxis) { + Builder::Network network("network"); + Builder::ConcatLayer layer("concat layer"); + + layer.setAxis(50); // here + layer.setInputPorts({Port({1, 2, 55, 55}), Port({3, 2, 55, 55})}); + layer.setOutputPort(Port({1 + 3, 2, 55, 55})); + + size_t ind = 0; + ASSERT_NO_THROW(ind = network.addLayer(layer)); + ASSERT_THROW(network.getLayer(ind)->validate(false), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ConcatLayerBuilderTest, cannotCreateLayerWithUnalignedPorts1) { + Builder::Network network("network"); + Builder::ConcatLayer layer("concat layer"); + + layer.setAxis(0); + layer.setInputPorts({Port({1, 2, 55, 55}), Port({3, 2, 55, 55})}); + layer.setOutputPort(Port({1 + 3, 2, 55, 155})); // should be {1 + 3, 2, 55, 55} + + size_t ind = 0; + ASSERT_NO_THROW(ind = network.addLayer(layer)); + ASSERT_THROW(network.getLayer(ind)->validate(false), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ConcatLayerBuilderTest, cannotCreateLayerWithUnalignedPorts2) { + Builder::Network network("network"); + Builder::ConcatLayer layer("concat layer"); + + layer.setAxis(0); + layer.setInputPorts({Port({1, 2, 55, 55}), Port({3, 2, 55, 55})}); + layer.setOutputPort(Port({1 + 3, 2, 155, 55})); // should be {1 + 3, 2, 55, 55} + + size_t ind = 0; + ASSERT_NO_THROW(ind = network.addLayer(layer)); + ASSERT_THROW(network.getLayer(ind)->validate(false), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ConcatLayerBuilderTest, cannotCreateLayerWithUnalignedPorts3) { + Builder::Network network("network"); + Builder::ConcatLayer layer("concat layer"); + + layer.setAxis(0); + layer.setInputPorts({Port({1, 2, 55, 55}), Port({3, 2, 55, 55})}); + layer.setOutputPort(Port({100, 2, 55, 55})); // should be {1 + 3, 2, 55, 55} + + size_t ind = 0; + ASSERT_NO_THROW(ind = network.addLayer(layer)); + ASSERT_THROW(network.getLayer(ind)->validate(false), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ConcatLayerBuilderTest, cannotCreateLayerWithUnalignedPorts4) { + Builder::Network network("network"); + Builder::ConcatLayer layer("concat layer"); + + layer.setAxis(1); + layer.setInputPorts({Port({1, 2, 55, 55}), Port({3, 2, 55, 55})}); + layer.setOutputPort(Port({1, 100, 55, 55})); // should be {1, 2 + 4, 55, 55} + + size_t ind = 0; + ASSERT_NO_THROW(ind = network.addLayer(layer)); + ASSERT_THROW(network.getLayer(ind)->validate(false), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ConcatLayerBuilderTest, cannotCreateLayerWithDifferentInputPorts1) { + Builder::Network network("network"); + Builder::ConcatLayer layer("concat layer"); + + layer.setAxis(0); + layer.setInputPorts({Port({1, 2, 55, 55}), Port({3, 2, 55, 155})}); // here + layer.setOutputPort(Port({1 + 3, 4, 55, 55})); + + size_t ind = 0; + ASSERT_NO_THROW(ind = 
network.addLayer(layer)); + ASSERT_THROW(network.getLayer(ind)->validate(false), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ConcatLayerBuilderTest, cannotCreateLayerWithDifferentInputPorts2) { + Builder::Network network("network"); + Builder::ConcatLayer layer("concat layer"); + + layer.setAxis(0); + layer.setInputPorts({Port({1, 2, 55, 55}), Port({3, 2, 155, 55})}); // here + layer.setOutputPort(Port({1 + 3, 4, 55, 55})); + + size_t ind = 0; + ASSERT_NO_THROW(ind = network.addLayer(layer)); + ASSERT_THROW(network.getLayer(ind)->validate(false), InferenceEngine::details::InferenceEngineException); +} \ No newline at end of file diff --git a/inference-engine/tests/unit/builders/const_layer_test.cpp b/inference-engine/tests/unit/builders/const_layer_test.cpp new file mode 100644 index 0000000..1905096 --- /dev/null +++ b/inference-engine/tests/unit/builders/const_layer_test.cpp @@ -0,0 +1,30 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include <gtest/gtest.h> +#include <string.h> +#include <ie_builders.hpp> +#include <builders/ie_const_layer.hpp> + +#include "builder_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class ConstLayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(ConstLayerBuilderTest, getExistsLayerFromNetworkBuilder) { + Builder::Network net("network"); + Builder::ConstLayer layer("const layer"); + layer.setData(generateBlob(Precision::FP32, {3}, Layout::C)); + const size_t ind = net.addLayer(layer); + ASSERT_NO_THROW(net.getLayer(ind)->validate(false)); +} + +TEST_F(ConstLayerBuilderTest, cannotCreateLayerWithoutData) { + Builder::Network net("network"); + Builder::ConstLayer layer("const layer"); + ASSERT_THROW(net.addLayer(layer), + InferenceEngine::details::InferenceEngineException); +} \ No newline at end of file diff --git a/inference-engine/tests/unit/builders/convolution_layer_test.cpp b/inference-engine/tests/unit/builders/convolution_layer_test.cpp new file mode 100644 index 0000000..0b1ca8e --- /dev/null +++ b/inference-engine/tests/unit/builders/convolution_layer_test.cpp @@ -0,0 +1,307 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include <gtest/gtest.h> +#include <string.h> +#include <ie_builders.hpp> +#include <builders/ie_convolution_layer.hpp> + +#include "builder_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class ConvolutionLayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(ConvolutionLayerBuilderTest, cannotCreateConvolutionWithoutWeight) { + Builder::Network network("Test"); + + Builder::ConvolutionLayer convBuilder("Convolution"); + convBuilder.setStrides({4, 4}); + convBuilder.setKernel({11, 11}); + convBuilder.setOutDepth(96); + convBuilder.setInputPort(Port({1, 3, 225, 225})); + convBuilder.setDilation({1, 1}); + size_t ind = network.addLayer(convBuilder); + ASSERT_THROW(network.getLayer(ind)->validate(false), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ConvolutionLayerBuilderTest, getExistsLayerFromNetworkBuilderWithInputPort) { + Builder::Network network("Test"); + Builder::ConvolutionLayer convBuilder("Convolution"); + + convBuilder.setStrides({4, 4}); + convBuilder.setKernel({11, 11}); + convBuilder.setOutDepth(96); + convBuilder.setInputPort(Port({1, 3, 225, 225})); + convBuilder.setDilation({1, 1}); + + idx_t convId = network.addLayer(convBuilder); + + idx_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW))); + network.connect({weightsId}, {convId, 1}); + + idx_t biasesId =
network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + network.connect({biasesId}, {convId, 2}); + + Builder::ConvolutionLayer convBuilderFromNetwork(network.getLayer(convId)); + + ASSERT_EQ(convBuilderFromNetwork.getStrides(), convBuilder.getStrides()); + ASSERT_EQ(convBuilderFromNetwork.getKernel(), convBuilder.getKernel()); + ASSERT_EQ(convBuilderFromNetwork.getPaddingsEnd(), convBuilder.getPaddingsEnd()); + ASSERT_EQ(convBuilderFromNetwork.getPaddingsBegin(), convBuilder.getPaddingsBegin()); + ASSERT_EQ(convBuilderFromNetwork.getOutDepth(), convBuilder.getOutDepth()); + ASSERT_EQ(convBuilderFromNetwork.getDilation(), convBuilder.getDilation()); +} + +TEST_F(ConvolutionLayerBuilderTest, getExistsLayerFromNetworkBuilderWithoutInputPort) { + Builder::Network network("Test"); + Builder::ConvolutionLayer convBuilder("Convolution"); + + convBuilder.setStrides({4, 4}); + convBuilder.setKernel({11, 11}); + convBuilder.setOutDepth(96); + convBuilder.setDilation({1, 1}); + + idx_t convId = network.addLayer(convBuilder); + + idx_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW))); + network.connect({weightsId}, {convId, 1}); + + idx_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + network.connect({biasesId}, {convId, 2}); + + Builder::ConvolutionLayer convBuilderFromNetwork(network.getLayer(convId)); + + ASSERT_EQ(convBuilderFromNetwork.getStrides(), convBuilder.getStrides()); + ASSERT_EQ(convBuilderFromNetwork.getKernel(), convBuilder.getKernel()); + ASSERT_EQ(convBuilderFromNetwork.getPaddingsEnd(), convBuilder.getPaddingsEnd()); + ASSERT_EQ(convBuilderFromNetwork.getPaddingsBegin(), convBuilder.getPaddingsBegin()); + ASSERT_EQ(convBuilderFromNetwork.getOutDepth(), convBuilder.getOutDepth()); + ASSERT_EQ(convBuilderFromNetwork.getDilation(), convBuilder.getDilation()); +} + +TEST_F(ConvolutionLayerBuilderTest, cannotCreateConvolutionWithWrongNumberOfInputChannels) { + Builder::Network network("Test"); + Builder::ConvolutionLayer convBuilder("Convolution"); + + convBuilder.setStrides({4, 4}); + convBuilder.setKernel({11, 11}); + convBuilder.setOutDepth(96); + convBuilder.setInputPort(Port({1, 64, 225, 225})); // here + + idx_t convId = network.addLayer(convBuilder); + + idx_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW))); + network.connect({weightsId}, {convId, 1}); + + idx_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + network.connect({biasesId}, {convId, 2}); + + ASSERT_THROW(network.getLayer(convId)->validate(false), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ConvolutionLayerBuilderTest, canCreateCorrcetConvolution) { + Builder::Network network("Test"); + Builder::ConvolutionLayer convBuilder("Convolution"); + + convBuilder.setStrides({4, 4}); + convBuilder.setKernel({11, 11}); + convBuilder.setOutDepth(96); + convBuilder.setInputPort(Port({1, 3, 225, 225})); // here + + idx_t convId = network.addLayer(convBuilder); + + idx_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW))); + network.connect({weightsId}, {convId, 1}); + + idx_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, 
Layout::C))); + network.connect({biasesId}, {convId, 2}); + + ASSERT_NO_THROW(network.getLayer(convId)->validate(false)); +} + +TEST_F(ConvolutionLayerBuilderTest, cannotCreateConvolutionWithGroup) { + Builder::Network network("Test"); + Builder::ConvolutionLayer convBuilder("Convolution"); + + convBuilder.setStrides({4, 4}); + convBuilder.setKernel({11, 11}); + convBuilder.setOutDepth(96); + convBuilder.setGroup(2); + convBuilder.setInputPort(Port({1, 6, 225, 225})); + + idx_t convId = network.addLayer(convBuilder); + + idx_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 6, 11, 11}, Layout::OIHW))); + // should be {96, 6 / 2, 11, 11} + network.connect({weightsId}, {convId, 1}); + + idx_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + network.connect({biasesId}, {convId, 2}); + + ASSERT_THROW(network.getLayer(convId)->validate(false), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ConvolutionLayerBuilderTest, canCreateConvolution) { + Builder::Network network("Test"); + Builder::ConvolutionLayer convBuilder("Convolution"); + + convBuilder.setStrides({4, 4}); + convBuilder.setKernel({11, 11}); + convBuilder.setOutDepth(96); + convBuilder.setGroup(2); + convBuilder.setInputPort(Port({1, 6, 225, 225})); // here + + idx_t convId = network.addLayer(convBuilder); + + idx_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW))); + network.connect({weightsId}, {convId, 1}); + + idx_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + network.connect({biasesId}, {convId, 2}); + + ASSERT_NO_THROW(network.getLayer(convId)->validate(false)); +} + +TEST_F(ConvolutionLayerBuilderTest, cannotCreateConvolutionWithWrongOutDepth) { + Builder::Network network("Test"); + Builder::ConvolutionLayer convBuilder("Convolution"); + + convBuilder.setStrides({4, 4}); + convBuilder.setKernel({11, 11}); + convBuilder.setOutDepth(4); // here + convBuilder.setInputPort(Port({1, 3, 225, 225})); + + idx_t convId = network.addLayer(convBuilder); + + idx_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW))); + network.connect({weightsId}, {convId, 1}); + + idx_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + network.connect({biasesId}, {convId, 2}); + + ASSERT_THROW(network.getLayer(convId)->validate(false), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ConvolutionLayerBuilderTest, cannotCreateConvolutionWithWrongStrides) { + Builder::Network network("Test"); + Builder::ConvolutionLayer convBuilder("Convolution"); + + convBuilder.setStrides({4, 0}); // here + convBuilder.setKernel({11, 11}); + convBuilder.setOutDepth(96); + convBuilder.setInputPort(Port({1, 3, 225, 225})); + convBuilder.setPaddingsEnd({0, 0}); + convBuilder.setPaddingsBegin({0, 0}); + convBuilder.setDilation({0, 0}); + ASSERT_THROW(network.addLayer(convBuilder), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ConvolutionLayerBuilderTest, cannotCreateConvolutionWithWrongKernel1) { + Builder::Network network("Test"); + Builder::ConvolutionLayer convBuilder("Convolution"); + + convBuilder.setStrides({4, 4}); + convBuilder.setKernel({11, 0}); // here + convBuilder.setOutDepth(96); + 
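// A zero kernel dimension is rejected by addLayer() itself, so no validate() call is needed here. +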
convBuilder.setInputPort(Port({1, 3, 225, 225})); + + ASSERT_THROW(network.addLayer(convBuilder), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ConvolutionLayerBuilderTest, cannotCreateConvolutionWithWrongKernel2) { + Builder::Network network("Test"); + Builder::ConvolutionLayer convBuilder("Convolution"); + + convBuilder.setStrides({4, 4}); + convBuilder.setKernel({11, 11, 11}); // here + convBuilder.setOutDepth(96); + convBuilder.setInputPort(Port({1, 3, 225, 225})); + + ASSERT_THROW(network.addLayer(convBuilder), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ConvolutionLayerBuilderTest, cannotCreateConvolutionWithWrongDilation1) { + Builder::Network network("Test"); + Builder::ConvolutionLayer convBuilder("Convolution"); + + convBuilder.setStrides({4, 4}); + convBuilder.setKernel({11, 11}); + convBuilder.setOutDepth(96); + convBuilder.setInputPort(Port({1, 3, 225, 225})); + convBuilder.setDilation({1, 0}); // here + + ASSERT_THROW(network.addLayer(convBuilder), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ConvolutionLayerBuilderTest, cannotCreateConvolutionWithWrongDilation2) { + Builder::Network network("Test"); + Builder::ConvolutionLayer convBuilder("Convolution"); + + convBuilder.setStrides({4, 4}); + convBuilder.setKernel({11, 11}); + convBuilder.setOutDepth(96); + convBuilder.setInputPort(Port({1, 3, 225, 225})); + convBuilder.setDilation({1, 1, 1}); // here + + ASSERT_THROW(network.addLayer(convBuilder), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ConvolutionLayerBuilderTest, canCreateLayerWithNumberOfGroupDividingNumberOfInputChannels) { + Builder::Network network("Test"); + Builder::ConvolutionLayer convLayer("Convolution"); + + size_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 2, 11, 11}, Layout::OIHW))); + size_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + + convLayer.setStrides({4, 4}); + convLayer.setKernel({11, 11}); + convLayer.setOutDepth(96); + convLayer.setInputPort(Port({1, 6, 225, 225})); + convLayer.setDilation({1, 1}); + + convLayer.setGroup(3); + size_t convId = network.addLayer(convLayer); + network.connect({weightsId}, {convId, 1}); + network.connect({biasesId}, {convId, 2}); + ASSERT_NO_THROW(network.getLayer(convId)->validate(false)); +} + +TEST_F(ConvolutionLayerBuilderTest, canCreateLayerWithWeightsNotAvailableForGroup) { + Builder::Network network("Test"); + Builder::ConvolutionLayer convLayer("Convolution"); + + size_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 5, 11, 11}, Layout::OIHW))); + size_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + + convLayer.setStrides({4, 4}); + convLayer.setKernel({11, 11}); + convLayer.setOutDepth(96); + convLayer.setInputPort(Port({1, 6, 225, 225})); + convLayer.setDilation({1, 1}); + + convLayer.setGroup(3); + ASSERT_THROW(network.addLayer({{weightsId}, {biasesId}}, convLayer), + InferenceEngine::details::InferenceEngineException); // 6 / 3 != 5 +} + +TEST_F(ConvolutionLayerBuilderTest, cannotCreateLayerWithNumberOfGroupNotDividingNumberOfInputChannels) { + Builder::Network network("Test"); + Builder::ConvolutionLayer convLayer("Convolution"); + + size_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 2, 11, 11}, 
Layout::OIHW))); + size_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + + convLayer.setStrides({4, 4}); + convLayer.setKernel({11, 11}); + convLayer.setOutDepth(96); + convLayer.setInputPort(Port({1, 6, 225, 225})); + convLayer.setDilation({1, 1}); + + convLayer.setGroup(4); + ASSERT_THROW(network.addLayer({{weightsId}, {biasesId}}, convLayer), + InferenceEngine::details::InferenceEngineException); // 6 % 4 == 2 +} + diff --git a/inference-engine/tests/unit/builders/crop_layer_test.cpp b/inference-engine/tests/unit/builders/crop_layer_test.cpp new file mode 100644 index 0000000..c098bd6 --- /dev/null +++ b/inference-engine/tests/unit/builders/crop_layer_test.cpp @@ -0,0 +1,84 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include <gtest/gtest.h> +#include <string.h> +#include <ie_builders.hpp> +#include <builders/ie_crop_layer.hpp> + +#include "builder_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class CropLayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(CropLayerBuilderTest, getExistsLayerFromNetworkBuilder) { + Builder::Network network("network"); + Builder::CropLayer cropLayer("Crop layer"); + std::vector<Port> input_ports; + input_ports.push_back(Port({1, 21, 44, 44})); + input_ports.push_back(Port({1, 21, 44, 44})); + cropLayer.setInputPorts(input_ports); + cropLayer.setOutputPort(Port({1, 21, 44, 44})); + cropLayer.setAxis({2, 3}); + cropLayer.setOffset({0, 0}); + size_t ind = 0; + ASSERT_NO_THROW(ind = network.addLayer(cropLayer)); + Builder::CropLayer layerFromNet(network.getLayer(ind)); + ASSERT_EQ(layerFromNet.getAxis(), cropLayer.getAxis()); + ASSERT_EQ(layerFromNet.getOffset(), cropLayer.getOffset()); +} + +TEST_F(CropLayerBuilderTest, cannotCreateLayerWithOneInputShape) { + Builder::Network network("network"); + Builder::CropLayer cropLayer("Crop layer"); + std::vector<Port> input_ports; + input_ports.push_back(Port({1, 21, 44, 44})); // here + cropLayer.setInputPorts(input_ports); + cropLayer.setOutputPort(Port({1, 21, 44, 44})); + cropLayer.setAxis({2, 3}); + cropLayer.setOffset({0, 0}); + ASSERT_THROW(network.addLayer(cropLayer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(CropLayerBuilderTest, cannotCreateLayerWithThreeInputShapes) { + Builder::Network network("network"); + Builder::CropLayer cropLayer("Crop layer"); + std::vector<Port> input_ports; + input_ports.push_back(Port({1, 21, 44, 44})); + input_ports.push_back(Port({1, 21, 44, 44})); + input_ports.push_back(Port({1, 21, 44, 44})); // here + cropLayer.setInputPorts(input_ports); + cropLayer.setOutputPort(Port({1, 21, 44, 44})); + cropLayer.setAxis({2, 3}); + cropLayer.setOffset({0, 0}); + ASSERT_THROW(network.addLayer(cropLayer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(CropLayerBuilderTest, cannotCreateLayerWithDifferentSizeOfAxisAndOffset) { + Builder::Network network("network"); + Builder::CropLayer cropLayer("Crop layer"); + std::vector<Port> input_ports; + input_ports.push_back(Port({1, 21, 44, 44})); + input_ports.push_back(Port({1, 21, 44, 44})); + cropLayer.setInputPorts(input_ports); + cropLayer.setOutputPort(Port({1, 21, 44, 44})); + cropLayer.setAxis({2, 3}); + cropLayer.setOffset({0, 0, 0}); // here + ASSERT_THROW(network.addLayer(cropLayer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(CropLayerBuilderTest, cannotCreateLayerWithSoBigOffset) { + Builder::Network network("network"); + Builder::CropLayer cropLayer("Crop layer"); + std::vector<Port> input_ports; +
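// Offset 50 plus the 34-element output exceeds the 44-element input along the cropped axes, so addLayer() must throw. +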
input_ports.push_back(Port({1, 21, 44, 44})); + input_ports.push_back(Port({1, 21, 34, 34})); + cropLayer.setInputPorts(input_ports); + cropLayer.setOutputPort(Port({1, 21, 34, 34})); + cropLayer.setAxis({2, 3}); + cropLayer.setOffset({0, 50}); // here + ASSERT_THROW(network.addLayer(cropLayer), InferenceEngine::details::InferenceEngineException); +} \ No newline at end of file diff --git a/inference-engine/tests/unit/builders/ctc_greedy_decoder_layer_test.cpp b/inference-engine/tests/unit/builders/ctc_greedy_decoder_layer_test.cpp new file mode 100644 index 0000000..a8e7bf5 --- /dev/null +++ b/inference-engine/tests/unit/builders/ctc_greedy_decoder_layer_test.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include <gtest/gtest.h> +#include <string.h> +#include <ie_builders.hpp> +#include <builders/ie_ctc_greedy_decoder_layer.hpp> + +#include "builder_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class CTCGreedyDecoderLayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(CTCGreedyDecoderLayerBuilderTest, getExistsLayerFromNetworkBuilder) { + Builder::Network network("network"); + Builder::CTCGreedyDecoderLayer ctcGreedyDecoderLayer("CTCGreedyDecoder"); + ctcGreedyDecoderLayer.setInputPorts({Port({88, 1, 71}), Port({88, 1})}); + ctcGreedyDecoderLayer.setOutputPort(Port({1, 88, 1, 1})); + size_t ind = 0; + ASSERT_NO_THROW(ind = network.addLayer(ctcGreedyDecoderLayer)); + Builder::CTCGreedyDecoderLayer layerFromNet(network.getLayer(ind)); + ASSERT_EQ(ctcGreedyDecoderLayer.getInputPorts(), layerFromNet.getInputPorts()); + ASSERT_EQ(ctcGreedyDecoderLayer.getOutputPort(), layerFromNet.getOutputPort()); +} + +TEST_F(CTCGreedyDecoderLayerBuilderTest, cannotCreateLayerWithoutInputPorts) { + Builder::Network network("network"); + Builder::CTCGreedyDecoderLayer ctcGreedyDecoderLayer("CTCGreedyDecoder"); + ctcGreedyDecoderLayer.setOutputPort(Port({1, 88, 1, 1})); + ASSERT_THROW(network.addLayer(ctcGreedyDecoderLayer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(CTCGreedyDecoderLayerBuilderTest, cannotCreateLayerWithThreeInputPorts) { + Builder::Network network("network"); + Builder::CTCGreedyDecoderLayer ctcGreedyDecoderLayer("CTCGreedyDecoder"); + ctcGreedyDecoderLayer.setInputPorts({Port({88, 1, 71}), Port({88, 1}), Port({88, 1})}); + ctcGreedyDecoderLayer.setOutputPort(Port({1, 88, 1, 1})); + ASSERT_THROW(network.addLayer(ctcGreedyDecoderLayer), InferenceEngine::details::InferenceEngineException); +} \ No newline at end of file diff --git a/inference-engine/tests/unit/builders/deconvolution_layer_test.cpp b/inference-engine/tests/unit/builders/deconvolution_layer_test.cpp new file mode 100644 index 0000000..73a9657 --- /dev/null +++ b/inference-engine/tests/unit/builders/deconvolution_layer_test.cpp @@ -0,0 +1,306 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include <gtest/gtest.h> +#include <string.h> +#include <ie_builders.hpp> +#include <builders/ie_deconvolution_layer.hpp> + +#include "builder_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class DeconvolutionLayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(DeconvolutionLayerBuilderTest, cannotCreateConvolutionWithoutWeight) { + Builder::Network network("Test"); + + Builder::DeconvolutionLayer deconvBuilder("Deconvolution"); + deconvBuilder.setStrides({4, 4}); + deconvBuilder.setKernel({11, 11}); + deconvBuilder.setOutDepth(96); + deconvBuilder.setInputPort(Port({1, 3, 225, 225})); + deconvBuilder.setDilation({1, 1}); + size_t ind = network.addLayer(deconvBuilder); +
ASSERT_THROW(network.getLayer(ind)->validate(false), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(DeconvolutionLayerBuilderTest, getExistsLayerFromNetworkBuilderWithInputPort) { + Builder::Network network("Test"); + Builder::DeconvolutionLayer deconvBuilder("Deconvolution"); + + deconvBuilder.setStrides({4, 4}); + deconvBuilder.setKernel({11, 11}); + deconvBuilder.setOutDepth(96); + deconvBuilder.setInputPort(Port({1, 3, 225, 225})); + deconvBuilder.setDilation({1, 1}); + + idx_t convId = network.addLayer(deconvBuilder); + + idx_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW))); + network.connect({weightsId}, {convId, 1}); + + idx_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + network.connect({biasesId}, {convId, 2}); + + Builder::DeconvolutionLayer deconvBuilderFromNetwork(network.getLayer(convId)); + + ASSERT_EQ(deconvBuilderFromNetwork.getStrides(), deconvBuilder.getStrides()); + ASSERT_EQ(deconvBuilderFromNetwork.getKernel(), deconvBuilder.getKernel()); + ASSERT_EQ(deconvBuilderFromNetwork.getPaddingsEnd(), deconvBuilder.getPaddingsEnd()); + ASSERT_EQ(deconvBuilderFromNetwork.getPaddingsBegin(), deconvBuilder.getPaddingsBegin()); + ASSERT_EQ(deconvBuilderFromNetwork.getOutDepth(), deconvBuilder.getOutDepth()); + ASSERT_EQ(deconvBuilderFromNetwork.getDilation(), deconvBuilder.getDilation()); +} + +TEST_F(DeconvolutionLayerBuilderTest, getExistsLayerFromNetworkBuilderWithoutInputPort) { + Builder::Network network("Test"); + Builder::DeconvolutionLayer deconvBuilder("Deconvolution"); + + deconvBuilder.setStrides({4, 4}); + deconvBuilder.setKernel({11, 11}); + deconvBuilder.setOutDepth(96); + deconvBuilder.setDilation({1, 1}); + + idx_t convId = network.addLayer(deconvBuilder); + + idx_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW))); + network.connect({weightsId}, {convId, 1}); + + idx_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + network.connect({biasesId}, {convId, 2}); + + Builder::DeconvolutionLayer deconvBuilderFromNetwork(network.getLayer(convId)); + + ASSERT_EQ(deconvBuilderFromNetwork.getStrides(), deconvBuilder.getStrides()); + ASSERT_EQ(deconvBuilderFromNetwork.getKernel(), deconvBuilder.getKernel()); + ASSERT_EQ(deconvBuilderFromNetwork.getPaddingsEnd(), deconvBuilder.getPaddingsEnd()); + ASSERT_EQ(deconvBuilderFromNetwork.getPaddingsBegin(), deconvBuilder.getPaddingsBegin()); + ASSERT_EQ(deconvBuilderFromNetwork.getOutDepth(), deconvBuilder.getOutDepth()); + ASSERT_EQ(deconvBuilderFromNetwork.getDilation(), deconvBuilder.getDilation()); +} + +TEST_F(DeconvolutionLayerBuilderTest, cannotCreateConvolutionWithWrongNumberOfInputChannels) { + Builder::Network network("Test"); + Builder::DeconvolutionLayer deconvBuilder("Deconvolution"); + + deconvBuilder.setStrides({4, 4}); + deconvBuilder.setKernel({11, 11}); + deconvBuilder.setOutDepth(96); + deconvBuilder.setInputPort(Port({1, 64, 225, 225})); // here + + idx_t convId = network.addLayer(deconvBuilder); + + idx_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW))); + network.connect({weightsId}, {convId, 1}); + + idx_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, 
Layout::C))); + network.connect({biasesId}, {convId, 2}); + + ASSERT_THROW(network.getLayer(convId)->validate(false), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(DeconvolutionLayerBuilderTest, canCreateCorrectConvolution) { + Builder::Network network("Test"); + Builder::DeconvolutionLayer deconvBuilder("Deconvolution"); + + deconvBuilder.setStrides({4, 4}); + deconvBuilder.setKernel({11, 11}); + deconvBuilder.setOutDepth(96); + deconvBuilder.setInputPort(Port({1, 3, 225, 225})); // here + + idx_t convId = network.addLayer(deconvBuilder); + + idx_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW))); + network.connect({weightsId}, {convId, 1}); + + idx_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + network.connect({biasesId}, {convId, 2}); + + ASSERT_NO_THROW(network.getLayer(convId)->validate(false)); +} + +TEST_F(DeconvolutionLayerBuilderTest, cannotCreateConvolutionWithGroup) { + Builder::Network network("Test"); + Builder::DeconvolutionLayer deconvBuilder("Deconvolution"); + + deconvBuilder.setStrides({4, 4}); + deconvBuilder.setKernel({11, 11}); + deconvBuilder.setOutDepth(96); + deconvBuilder.setGroup(2); + deconvBuilder.setInputPort(Port({1, 6, 225, 225})); + + idx_t convId = network.addLayer(deconvBuilder); + + idx_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 6, 11, 11}, Layout::OIHW))); + // should be {96, 6 / 2, 11, 11} + network.connect({weightsId}, {convId, 1}); + + idx_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + network.connect({biasesId}, {convId, 2}); + + ASSERT_THROW(network.getLayer(convId)->validate(false), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(DeconvolutionLayerBuilderTest, canCreateConvolution) { + Builder::Network network("Test"); + Builder::DeconvolutionLayer deconvBuilder("Deconvolution"); + + deconvBuilder.setStrides({4, 4}); + deconvBuilder.setKernel({11, 11}); + deconvBuilder.setOutDepth(96); + deconvBuilder.setGroup(2); + deconvBuilder.setInputPort(Port({1, 6, 225, 225})); // here + + idx_t convId = network.addLayer(deconvBuilder); + + idx_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW))); + network.connect({weightsId}, {convId, 1}); + + idx_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + network.connect({biasesId}, {convId, 2}); + + ASSERT_NO_THROW(network.getLayer(convId)->validate(false)); +} + +TEST_F(DeconvolutionLayerBuilderTest, cannotCreateConvolutionWithWrongOutDepth) { + Builder::Network network("Test"); + Builder::DeconvolutionLayer deconvBuilder("Deconvolution"); + + deconvBuilder.setStrides({4, 4}); + deconvBuilder.setKernel({11, 11}); + deconvBuilder.setOutDepth(4); // here + deconvBuilder.setInputPort(Port({1, 3, 225, 225})); + + idx_t convId = network.addLayer(deconvBuilder); + + idx_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW))); + network.connect({weightsId}, {convId, 1}); + + idx_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + network.connect({biasesId}, {convId, 2}); + +
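// The declared OutDepth of 4 contradicts the 96-filter weights, so validation must fail. +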
ASSERT_THROW(network.getLayer(convId)->validate(false), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(DeconvolutionLayerBuilderTest, cannotCreateConvolutionWithWrongStrides) { + Builder::Network network("Test"); + Builder::DeconvolutionLayer deconvBuilder("Deconvolution"); + + deconvBuilder.setStrides({4, 0}); // here + deconvBuilder.setKernel({11, 11}); + deconvBuilder.setOutDepth(96); + deconvBuilder.setInputPort(Port({1, 3, 225, 225})); + deconvBuilder.setPaddingsEnd({0, 0}); + deconvBuilder.setPaddingsBegin({0, 0}); + deconvBuilder.setDilation({0, 0}); + ASSERT_THROW(network.addLayer(deconvBuilder), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(DeconvolutionLayerBuilderTest, cannotCreateConvolutionWithWrongKernel1) { + Builder::Network network("Test"); + Builder::DeconvolutionLayer deconvBuilder("Deconvolution"); + + deconvBuilder.setStrides({4, 4}); + deconvBuilder.setKernel({11, 0}); // here + deconvBuilder.setOutDepth(96); + deconvBuilder.setInputPort(Port({1, 3, 225, 225})); + + ASSERT_THROW(network.addLayer(deconvBuilder), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(DeconvolutionLayerBuilderTest, cannotCreateConvolutionWithWrongKernel2) { + Builder::Network network("Test"); + Builder::DeconvolutionLayer convBuilder("Deconvolution"); + + convBuilder.setStrides({4, 4}); + convBuilder.setKernel({11, 11, 11}); // here + convBuilder.setOutDepth(96); + convBuilder.setInputPort(Port({1, 3, 225, 225})); + + ASSERT_THROW(network.addLayer(convBuilder), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(DeconvolutionLayerBuilderTest, cannotCreateConvolutionWithWrongDilation1) { + Builder::Network network("Test"); + Builder::DeconvolutionLayer deconvBuilder("Deconvolution"); + + deconvBuilder.setStrides({4, 4}); + deconvBuilder.setKernel({11, 11}); + deconvBuilder.setOutDepth(96); + deconvBuilder.setInputPort(Port({1, 3, 225, 225})); + deconvBuilder.setDilation({1, 0}); // here + + ASSERT_THROW(network.addLayer(deconvBuilder), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(DeconvolutionLayerBuilderTest, cannotCreateConvolutionWithWrongDilation2) { + Builder::Network network("Test"); + Builder::DeconvolutionLayer convBuilder("Deconvolution"); + + convBuilder.setStrides({4, 4}); + convBuilder.setKernel({11, 11}); + convBuilder.setOutDepth(96); + convBuilder.setInputPort(Port({1, 3, 225, 225})); + convBuilder.setDilation({1, 1, 1}); // here + + ASSERT_THROW(network.addLayer(convBuilder), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(DeconvolutionLayerBuilderTest, canCreateLayerWithNumberOfGroupDividingNumberOfInputChannels) { + Builder::Network network("Test"); + Builder::DeconvolutionLayer deconvBuilder("Deconvolution"); + + size_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 2, 11, 11}, Layout::OIHW))); + size_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + + deconvBuilder.setStrides({4, 4}); + deconvBuilder.setKernel({11, 11}); + deconvBuilder.setOutDepth(96); + deconvBuilder.setInputPort(Port({1, 6, 225, 225})); + deconvBuilder.setDilation({1, 1}); + + deconvBuilder.setGroup(3); + size_t convId = network.addLayer(deconvBuilder); + network.connect({weightsId}, {convId, 1}); + network.connect({biasesId}, {convId, 2}); + ASSERT_NO_THROW(network.getLayer(convId)->validate(false)); +} + +TEST_F(DeconvolutionLayerBuilderTest, 
canCreateLayerWithWeightsNotAvailableForGroup) { + Builder::Network network("Test"); + Builder::DeconvolutionLayer deconvBuilder("Deconvolution"); + + size_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 5, 11, 11}, Layout::OIHW))); + size_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + + deconvBuilder.setStrides({4, 4}); + deconvBuilder.setKernel({11, 11}); + deconvBuilder.setOutDepth(96); + deconvBuilder.setInputPort(Port({1, 6, 225, 225})); + deconvBuilder.setDilation({1, 1}); + + deconvBuilder.setGroup(3); + ASSERT_THROW(network.addLayer({{weightsId}, {biasesId}}, deconvBuilder), + InferenceEngine::details::InferenceEngineException); // 6 / 3 != 5 +} + +TEST_F(DeconvolutionLayerBuilderTest, cannotCreateLayerWithNumberOfGroupNotDividingNumberOfInputChannels) { + Builder::Network network("Test"); + Builder::DeconvolutionLayer deconvBuilder("Deconvolution"); + + size_t weightsId = network.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 2, 11, 11}, Layout::OIHW))); + size_t biasesId = network.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + + deconvBuilder.setStrides({4, 4}); + deconvBuilder.setKernel({11, 11}); + deconvBuilder.setOutDepth(96); + deconvBuilder.setInputPort(Port({1, 6, 225, 225})); + deconvBuilder.setDilation({1, 1}); + + deconvBuilder.setGroup(4); + ASSERT_THROW(network.addLayer({{weightsId}, {biasesId}}, deconvBuilder), + InferenceEngine::details::InferenceEngineException); // 6 % 4 == 2 +} \ No newline at end of file diff --git a/inference-engine/tests/unit/builders/detection_output_layer_test.cpp b/inference-engine/tests/unit/builders/detection_output_layer_test.cpp new file mode 100644 index 0000000..e636be9 --- /dev/null +++ b/inference-engine/tests/unit/builders/detection_output_layer_test.cpp @@ -0,0 +1,117 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include <gtest/gtest.h> +#include <string.h> +#include <ie_builders.hpp> +#include <builders/ie_detection_output_layer.hpp> + +#include "builder_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class DetectionOutputLayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(DetectionOutputLayerBuilderTest, getExistsLayerFromNetworkBuilder) { + Builder::Network network("network"); + Builder::DetectionOutputLayer layer("detection output layer"); + layer.setNumClasses(2); + layer.setShareLocation(true); + layer.setBackgroudLabelId(-1); + layer.setNMSThreshold(0.45); + layer.setTopK(400); + layer.setCodeType("caffe.PriorBoxParameter.CENTER_SIZE"); + layer.setVariantEncodedInTarget(false); + layer.setKeepTopK(200); + layer.setConfidenceThreshold(0.01); + size_t ind = 0; + ASSERT_NO_THROW(ind = network.addLayer(layer)); + Builder::DetectionOutputLayer layerFromNet(network.getLayer(ind)); + ASSERT_EQ(layerFromNet.getName(), layer.getName()); + ASSERT_EQ(layerFromNet.getNumClasses(), layer.getNumClasses()); + ASSERT_EQ(layerFromNet.getShareLocation(), layer.getShareLocation()); + ASSERT_EQ(layerFromNet.getBackgroudLabelId(), layer.getBackgroudLabelId()); + ASSERT_EQ(layerFromNet.getNMSThreshold(), layer.getNMSThreshold()); + ASSERT_EQ(layerFromNet.getTopK(), layer.getTopK()); + ASSERT_EQ(layerFromNet.getCodeType(), layer.getCodeType()); + ASSERT_EQ(layerFromNet.getVariantEncodedInTarget(), layer.getVariantEncodedInTarget()); + ASSERT_EQ(layerFromNet.getKeepTopK(), layer.getKeepTopK()); +
ASSERT_EQ(layerFromNet.getConfidenceThreshold(), layer.getConfidenceThreshold()); +} + +TEST_F(DetectionOutputLayerBuilderTest, cannotCreateLayerWithWrongNumClasses) { + Builder::Network network("network"); + Builder::DetectionOutputLayer layer("detection output layer"); + layer.setNumClasses(0); // here + layer.setShareLocation(true); + layer.setBackgroudLabelId(-1); + layer.setNMSThreshold(0.45); + layer.setTopK(400); + layer.setCodeType("caffe.PriorBoxParameter.CENTER_SIZE"); + layer.setVariantEncodedInTarget(false); + layer.setKeepTopK(200); + layer.setConfidenceThreshold(0.01); + ASSERT_THROW(network.addLayer(layer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(DetectionOutputLayerBuilderTest, cannotCreateLayerWithWrongCodeType) { + Builder::Network network("network"); + Builder::DetectionOutputLayer layer("detection output layer"); + layer.setNumClasses(2); + layer.setShareLocation(true); + layer.setBackgroudLabelId(-1); + layer.setNMSThreshold(0.45); + layer.setTopK(400); + layer.setCodeType("trololo"); // here + layer.setVariantEncodedInTarget(false); + layer.setKeepTopK(200); + layer.setConfidenceThreshold(0.01); + ASSERT_THROW(network.addLayer(layer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(DetectionOutputLayerBuilderTest, cannotCreateLayerWithWrongBackLabelId) { + Builder::Network network("network"); + Builder::DetectionOutputLayer layer("detection output layer"); + layer.setNumClasses(2); + layer.setShareLocation(true); + layer.setBackgroudLabelId(-100); // here + layer.setNMSThreshold(0.45); + layer.setTopK(400); + layer.setCodeType("caffe.PriorBoxParameter.CENTER_SIZE"); + layer.setVariantEncodedInTarget(false); + layer.setKeepTopK(200); + layer.setConfidenceThreshold(0.01); + ASSERT_THROW(network.addLayer(layer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(DetectionOutputLayerBuilderTest, cannotCreateLayerWithWrongNMSThreshold) { + Builder::Network network("network"); + Builder::DetectionOutputLayer layer("detection output layer"); + layer.setNumClasses(2); + layer.setShareLocation(true); + layer.setBackgroudLabelId(-1); + layer.setNMSThreshold(0); // here + layer.setTopK(400); + layer.setCodeType("caffe.PriorBoxParameter.CENTER_SIZE"); + layer.setVariantEncodedInTarget(false); + layer.setKeepTopK(200); + layer.setConfidenceThreshold(0.01); + ASSERT_THROW(network.addLayer(layer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(DetectionOutputLayerBuilderTest, cannotCreateLayerWithWrongConfidenceThreshold) { + Builder::Network network("network"); + Builder::DetectionOutputLayer layer("detection output layer"); + layer.setNumClasses(2); + layer.setShareLocation(true); + layer.setBackgroudLabelId(-1); + layer.setNMSThreshold(0.45); + layer.setTopK(400); + layer.setCodeType("caffe.PriorBoxParameter.CENTER_SIZE"); + layer.setVariantEncodedInTarget(false); + layer.setKeepTopK(200); + layer.setConfidenceThreshold(0); // here + ASSERT_THROW(network.addLayer(layer), InferenceEngine::details::InferenceEngineException); +} \ No newline at end of file diff --git a/inference-engine/tests/unit/builders/eltwise_layer_test.cpp b/inference-engine/tests/unit/builders/eltwise_layer_test.cpp new file mode 100644 index 0000000..d85595a --- /dev/null +++ b/inference-engine/tests/unit/builders/eltwise_layer_test.cpp @@ -0,0 +1,102 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include <gtest/gtest.h> +#include <string.h> +#include <ie_builders.hpp> +#include <builders/ie_eltwise_layer.hpp> + +#include "builder_test.hpp" + +using namespace
testing; +using namespace InferenceEngine; + +class EltwiseLayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(EltwiseLayerBuilderTest, getExistsLayerFromNetworkBuilder) { + Builder::Network net("network"); + Builder::EltwiseLayer layer("Eltwise layer"); + + layer.setInputPorts({Port({1, 2, 3, 4}), Port({1, 2, 3, 4})}); + layer.setOutputPort(Port({1, 2, 3, 4})); + size_t ind = 0; + ASSERT_NO_THROW(ind = net.addLayer(layer)); + Builder::EltwiseLayer layerFromNet(net.getLayer(ind)); + + ASSERT_EQ(layer.getInputPorts(), layerFromNet.getInputPorts()); + ASSERT_EQ(layer.getOutputPort(), layerFromNet.getOutputPort()); + ASSERT_EQ(layer.getEltwiseType(), layerFromNet.getEltwiseType()); +} + +TEST_F(EltwiseLayerBuilderTest, checkOnlineEltwiseTypeChanging) { + Builder::Network net("network"); + Builder::EltwiseLayer layer("Eltwise layer"); + + layer.setInputPorts({Port({1, 2, 3}), Port({1, 2, 3})}); + layer.setOutputPort(Port({1, 2, 3})); + + layer.setEltwiseType(Builder::EltwiseLayer::EltwiseType::MAX); + ASSERT_EQ(layer.getEltwiseType(), Builder::EltwiseLayer::EltwiseType::MAX); + ASSERT_NO_THROW(net.addLayer(layer)); + + layer.setEltwiseType(Builder::EltwiseLayer::EltwiseType::DIV); + ASSERT_EQ(layer.getEltwiseType(), Builder::EltwiseLayer::EltwiseType::DIV); + ASSERT_NO_THROW(net.addLayer(layer)); + + layer.setEltwiseType(Builder::EltwiseLayer::EltwiseType::MIN); + ASSERT_EQ(layer.getEltwiseType(), Builder::EltwiseLayer::EltwiseType::MIN); + ASSERT_NO_THROW(net.addLayer(layer)); + + layer.setEltwiseType(Builder::EltwiseLayer::EltwiseType::MUL); + ASSERT_EQ(layer.getEltwiseType(), Builder::EltwiseLayer::EltwiseType::MUL); + ASSERT_NO_THROW(net.addLayer(layer)); + + layer.setEltwiseType(Builder::EltwiseLayer::EltwiseType::SQUARED_DIFF); + ASSERT_EQ(layer.getEltwiseType(), Builder::EltwiseLayer::EltwiseType::SQUARED_DIFF); + ASSERT_NO_THROW(net.addLayer(layer)); + + layer.setEltwiseType(Builder::EltwiseLayer::EltwiseType::SUB); + ASSERT_EQ(layer.getEltwiseType(), Builder::EltwiseLayer::EltwiseType::SUB); + ASSERT_NO_THROW(net.addLayer(layer)); + + layer.setEltwiseType(Builder::EltwiseLayer::EltwiseType::SUM); + ASSERT_EQ(layer.getEltwiseType(), Builder::EltwiseLayer::EltwiseType::SUM); + ASSERT_NO_THROW(net.addLayer(layer)); +} + +TEST_F(EltwiseLayerBuilderTest, cannotCreateLayerWithOneInputPort) { + Builder::Network net("network"); + Builder::EltwiseLayer layer("Eltwise layer"); + + layer.setInputPorts({Port({1, 2, 3, 4})}); // here + layer.setOutputPort(Port({1, 2, 3, 4})); + ASSERT_THROW(net.addLayer(layer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(EltwiseLayerBuilderTest, cannotCreateLayerWithThreeInputPort) { + Builder::Network net("network"); + Builder::EltwiseLayer layer("Eltwise layer"); + + layer.setInputPorts({Port({1, 2, 3, 4}), Port({1, 2, 3, 4}), Port({1, 2, 3, 4})}); // here + layer.setOutputPort(Port({1, 2, 3, 4})); + ASSERT_THROW(net.addLayer(layer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(EltwiseLayerBuilderTest, cannotCreateLayerWithDifferentInputPorts) { + Builder::Network net("network"); + Builder::EltwiseLayer layer("Eltwise layer"); + + layer.setInputPorts({Port({1, 2, 3, 4}), Port({1, 2, 3, 1000})}); // here + layer.setOutputPort(Port({1, 2, 3, 4})); + ASSERT_THROW(net.addLayer(layer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(EltwiseLayerBuilderTest, cannotCreateLayerWithDifferentInputAndOutputPorts) { + Builder::Network net("network"); + Builder::EltwiseLayer layer("Eltwise layer"); + + 
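// Eltwise requires the output port to match the (identical) input shapes exactly; {1, 2, 3, 100} below does not. +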
layer.setInputPorts({Port({1, 2, 3, 4}), Port({1, 2, 3, 4})}); + layer.setOutputPort(Port({1, 2, 3, 100})); // here + ASSERT_THROW(net.addLayer(layer), InferenceEngine::details::InferenceEngineException); +} diff --git a/inference-engine/tests/unit/builders/elu_layer_test.cpp b/inference-engine/tests/unit/builders/elu_layer_test.cpp new file mode 100644 index 0000000..4ddbda3 --- /dev/null +++ b/inference-engine/tests/unit/builders/elu_layer_test.cpp @@ -0,0 +1,41 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include "builder_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class ELULayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(ELULayerBuilderTest, getExistsLayerFromNetworkBuilder) { + Builder::Network net("network"); + Builder::ELULayer eluLayer("ELU_layer"); + eluLayer.setAlpha(100); + size_t ind = net.addLayer(eluLayer); + Builder::ELULayer layerFromNet(net.getLayer(ind)); + ASSERT_EQ(eluLayer.getAlpha(), layerFromNet.getAlpha()); +} + +TEST_F(ELULayerBuilderTest, cannotCreateLayerWithWrongShapes) { + Builder::Network net("network"); + Builder::Layer::Ptr fakeELULayerPtr = std::make_shared<Builder::Layer>("ELU", "ELU layer"); + fakeELULayerPtr->getInputPorts().push_back(Port({1, 1, 1, 1})); + fakeELULayerPtr->getOutputPorts().push_back(Port({1, 1, 1, 2})); + Builder::ELULayer eluLayer(fakeELULayerPtr); + eluLayer.setAlpha(100); + ASSERT_THROW(net.addLayer(eluLayer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(ELULayerBuilderTest, cannotCreateLayerWithWrongAlpha) { + Builder::Network net("network"); + Builder::ELULayer eluLayer("ELU_layer"); + eluLayer.setAlpha(-100); + ASSERT_THROW(net.addLayer(eluLayer), InferenceEngine::details::InferenceEngineException); +} \ No newline at end of file diff --git a/inference-engine/tests/unit/builders/input_layer_test.cpp b/inference-engine/tests/unit/builders/input_layer_test.cpp index 6a30fdb..2e840de 100644 --- a/inference-engine/tests/unit/builders/input_layer_test.cpp +++ b/inference-engine/tests/unit/builders/input_layer_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -27,6 +27,6 @@ TEST_F(InputLayerBuilderTest, getExistsLayerFromNetworkBuilder) { ASSERT_EQ(inBuilderFromNetwork.getPort().shape(), Port({1, 3, 3, 3}).shape()); inBuilderFromNetwork.setPort(Port({1, 3, 4, 4})); ASSERT_EQ(inBuilderFromNetwork.getPort().shape(), Port({1, 3, 4, 4}).shape()); - ASSERT_EQ(network.getLayer(inId).getOutputPorts()[0].shape(), Port({1, 3, 4, 4}).shape()); + ASSERT_EQ(network.getLayer(inId)->getOutputPorts()[0].shape(), Port({1, 3, 4, 4}).shape()); ASSERT_EQ(inBuilder.getPort().shape(), Port({1, 3, 3, 3}).shape()); } \ No newline at end of file diff --git a/inference-engine/tests/unit/builders/mvn_layer_test.cpp b/inference-engine/tests/unit/builders/mvn_layer_test.cpp new file mode 100644 index 0000000..01cf448 --- /dev/null +++ b/inference-engine/tests/unit/builders/mvn_layer_test.cpp @@ -0,0 +1,64 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include "builder_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class MVNLayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(MVNLayerBuilderTest, getExistsLayerFromNetworkBuilder1) { + Builder::Network net("network"); + Builder::MVNLayer mvnLayer("MVN_layer"); + 
mvnLayer.setEpsilon(99.9).setAcrossChannels(true).setNormalize(true); + size_t ind = net.addLayer(mvnLayer); + Builder::MVNLayer layerFromNet(net.getLayer(ind)); +} + +TEST_F(MVNLayerBuilderTest, getExistsLayerFromNetworkBuilder2) { + Builder::Network net("network"); + Builder::MVNLayer mvnLayer("MVN_layer"); + mvnLayer.setEpsilon(99.9).setAcrossChannels(true).setNormalize(false); + size_t ind = net.addLayer(mvnLayer); + Builder::MVNLayer layerFromNet(net.getLayer(ind)); +} + +TEST_F(MVNLayerBuilderTest, getExistsLayerFromNetworkBuilder3) { + Builder::Network net("network"); + Builder::MVNLayer mvnLayer("MVN_layer"); + mvnLayer.setEpsilon(99.9).setAcrossChannels(false).setNormalize(true); + size_t ind = net.addLayer(mvnLayer); + Builder::MVNLayer layerFromNet(net.getLayer(ind)); +} + +TEST_F(MVNLayerBuilderTest, getExistsLayerFromNetworkBuilder4) { + Builder::Network net("network"); + Builder::MVNLayer mvnLayer("MVN_layer"); + mvnLayer.setEpsilon(99.9).setAcrossChannels(false).setNormalize(false); + size_t ind = net.addLayer(mvnLayer); + Builder::MVNLayer layerFromNet(net.getLayer(ind)); +} + +TEST_F(MVNLayerBuilderTest, cannotCreateLayerWithWrongEpsilon) { + Builder::Network net("network"); + Builder::MVNLayer mvnLayer("MVN_layer"); + mvnLayer.setEpsilon(-100).setAcrossChannels(true).setNormalize(true); // here + ASSERT_THROW(net.addLayer(mvnLayer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(MVNLayerBuilderTest, cannotCreateLayerWithWrongShapes) { + Builder::Network net("network"); + Builder::Layer::Ptr fakeMVNLayerPtr = std::make_shared<Builder::Layer>("MVN", "MVN layer"); + fakeMVNLayerPtr->getInputPorts().push_back(Port({1, 1, 1, 1})); + fakeMVNLayerPtr->getOutputPorts().push_back(Port({1, 1, 1, 2})); + Builder::MVNLayer mvnLayer(fakeMVNLayerPtr); + mvnLayer.setEpsilon(100).setAcrossChannels(true).setNormalize(true); + ASSERT_THROW(net.addLayer(mvnLayer), InferenceEngine::details::InferenceEngineException); +} \ No newline at end of file diff --git a/inference-engine/tests/unit/builders/network_builder_test.cpp b/inference-engine/tests/unit/builders/network_builder_test.cpp index 3b53f12..45a18a1 100644 --- a/inference-engine/tests/unit/builders/network_builder_test.cpp +++ b/inference-engine/tests/unit/builders/network_builder_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -43,61 +43,65 @@ protected: public: - Builder::Network prepateAlexnetBuilder() { + Builder::Network prepateAlexnetBuilder(Precision precision = Precision::FP32) { Context ctx; Builder::Network builder(ctx, "AlexNet"); + idx_t weightsId, biasesId; idx_t layerId = builder.addLayer(Builder::InputLayer(alexNetNames[0]).setPort(Port({1,3, 227, 227}))); - layerId = builder.addLayer({{layerId}}, Builder::ScaleShiftLayer(alexNetNames[1]).setBiases(generateBlob(Precision::FP32, {3}, Layout::C))); - layerId = builder.addLayer({{layerId}}, Builder::ConvolutionLayer(alexNetNames[2]).setKernel({11, 11}).setStrides({4, 4}).setOutDepth(96) - .setWeights(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW)) - .setBiases(generateBlob(Precision::FP32, {96}, Layout::C))); + biasesId = builder.addLayer(Builder::ConstLayer("biases").setData(generateBlob(precision, {3}, Layout::C))); + layerId = builder.addLayer({{layerId}}, Builder::ScaleShiftLayer(alexNetNames[1])); + builder.connect({biasesId}, {layerId, 2}); + weightsId = builder.addLayer(Builder::ConstLayer("weights").setData(generateBlob(precision, {96, 3, 11, 
11}, Layout::OIHW))); + biasesId = builder.addLayer(Builder::ConstLayer("biases").setData(generateBlob(precision, {96}, Layout::C))); + layerId = builder.addLayer({{layerId}, {weightsId}, {biasesId}}, Builder::ConvolutionLayer(alexNetNames[2]).setKernel({11, 11}) + .setStrides({4, 4}).setOutDepth(96)); layerId = builder.addLayer({{layerId}}, Builder::ReLULayer(alexNetNames[3])); layerId = builder.addLayer({{layerId}}, Builder::NormLayer(alexNetNames[4]).setAlpha(9.999999747378752e-05f).setBeta(0.75f).setSize(5).setAcrossMaps(true)); layerId = builder.addLayer({{layerId}}, Builder::PoolingLayer(alexNetNames[5]).setExcludePad(false).setKernel({3, 3}).setPaddingsBegin({0, 0}) .setPaddingsEnd({0, 0}).setPoolingType(Builder::PoolingLayer::PoolingType::MAX).setStrides({2, 2})); - layerId = builder.addLayer({{layerId}}, Builder::ConvolutionLayer(alexNetNames[6]).setKernel({5, 5}).setStrides({1, 1}).setOutDepth(256) - .setPaddingsBegin({2, 2}).setPaddingsEnd({2, 2}).setGroup(2).setDilation({1, 1}) - .setWeights(generateBlob(Precision::FP32, {96, 256, 5, 5}, Layout::OIHW)) - .setBiases(generateBlob(Precision::FP32, {256}, Layout::C))); + weightsId = builder.addLayer(Builder::ConstLayer("weights").setData(generateBlob(precision, {256, 96 / 2, 5, 5}, Layout::OIHW))); + biasesId = builder.addLayer(Builder::ConstLayer("biases").setData(generateBlob(precision, {256}, Layout::C))); + layerId = builder.addLayer({{layerId}, {weightsId}, {biasesId}}, Builder::ConvolutionLayer(alexNetNames[6]).setKernel({5, 5}).setStrides({1, 1}).setOutDepth(256) + .setPaddingsBegin({2, 2}).setPaddingsEnd({2, 2}).setGroup(2).setDilation({1, 1})); layerId = builder.addLayer({{layerId}}, Builder::ReLULayer(alexNetNames[7])); layerId = builder.addLayer({{layerId}}, Builder::NormLayer(alexNetNames[8]).setAlpha(9.999999747378752e-05f).setBeta(0.75f).setSize(5).setAcrossMaps(true)); layerId = builder.addLayer({{layerId}}, Builder::PoolingLayer(alexNetNames[9]).setExcludePad(false).setKernel({3, 3}).setPaddingsBegin({0, 0}) .setPaddingsEnd({0, 0}).setPoolingType(Builder::PoolingLayer::PoolingType::MAX).setStrides({2, 2})); - layerId = builder.addLayer({{layerId}}, Builder::ConvolutionLayer(alexNetNames[10]).setKernel({3, 3}).setStrides({1, 1}).setOutDepth(384) - .setPaddingsBegin({1, 1}).setPaddingsEnd({1, 1}).setGroup(1).setDilation({1, 1}) - .setWeights(generateBlob(Precision::FP32, {256, 384, 3, 3}, Layout::OIHW)) - .setBiases(generateBlob(Precision::FP32, {384}, Layout::C))); + weightsId = builder.addLayer(Builder::ConstLayer("weights").setData(generateBlob(precision, {256, 384, 3, 3}, Layout::OIHW))); + biasesId = builder.addLayer(Builder::ConstLayer("biases").setData(generateBlob(precision, {384}, Layout::C))); + layerId = builder.addLayer({{layerId}, {weightsId}, {biasesId}}, Builder::ConvolutionLayer(alexNetNames[10]).setKernel({3, 3}) + .setStrides({1, 1}).setOutDepth(384).setPaddingsBegin({1, 1}).setPaddingsEnd({1, 1}).setGroup(1).setDilation({1, 1})); layerId = builder.addLayer({{layerId}}, Builder::ReLULayer(alexNetNames[11])); - layerId = builder.addLayer({{layerId}}, Builder::ConvolutionLayer(alexNetNames[12]).setKernel({3, 3}).setStrides({1, 1}).setOutDepth(384) - .setPaddingsBegin({1, 1}).setPaddingsEnd({1, 1}).setGroup(2).setDilation({1, 1}) - .setWeights(generateBlob(Precision::FP32, {384, 384, 3, 3}, Layout::OIHW)) - .setBiases(generateBlob(Precision::FP32, {384}, Layout::C))); + weightsId = builder.addLayer(Builder::ConstLayer("weights").setData(generateBlob(precision, {384, 384 / 2, 3, 3}, Layout::OIHW))); + 
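+ // In this builder API, weights and biases enter the graph as ConstLayer nodes wired to extra input ports of the consuming layer (replacing the removed setWeights()/setBiases() calls), and grouped convolutions divide the kernel's input-channel dimension by the group count, e.g. 384 / 2 for group 2.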
biasesId = builder.addLayer(Builder::ConstLayer("biases").setData(generateBlob(precision, {384}, Layout::C))); + layerId = builder.addLayer({{layerId}, {weightsId}, {biasesId}}, Builder::ConvolutionLayer(alexNetNames[12]).setKernel({3, 3}) + .setStrides({1, 1}).setOutDepth(384).setPaddingsBegin({1, 1}).setPaddingsEnd({1, 1}).setGroup(2).setDilation({1, 1})); layerId = builder.addLayer({{layerId}}, Builder::ReLULayer(alexNetNames[13])); - layerId = builder.addLayer({{layerId}}, Builder::ConvolutionLayer(alexNetNames[14]).setKernel({3, 3}).setStrides({1, 1}).setOutDepth(256) - .setPaddingsBegin({1, 1}).setPaddingsEnd({1, 1}).setGroup(2).setDilation({1, 1}) - .setWeights(generateBlob(Precision::FP32, {256, 384, 3, 3}, Layout::OIHW)) - .setBiases(generateBlob(Precision::FP32, {384}, Layout::C))); + weightsId = builder.addLayer(Builder::ConstLayer("weights").setData(generateBlob(precision, {256, 384 / 2, 3, 3}, Layout::OIHW))); + biasesId = builder.addLayer(Builder::ConstLayer("biases").setData(generateBlob(precision, {256}, Layout::C))); + layerId = builder.addLayer({{layerId}, {weightsId}, {biasesId}}, Builder::ConvolutionLayer(alexNetNames[14]).setKernel({3, 3}) + .setStrides({1, 1}).setOutDepth(256).setPaddingsBegin({1, 1}).setPaddingsEnd({1, 1}).setGroup(2).setDilation({1, 1})); layerId = builder.addLayer({{layerId}}, Builder::ReLULayer(alexNetNames[15])); layerId = builder.addLayer({{layerId}}, Builder::PoolingLayer(alexNetNames[16]).setExcludePad(false).setKernel({3, 3}).setPaddingsBegin({0, 0}) .setPaddingsEnd({0, 0}).setPoolingType(Builder::PoolingLayer::PoolingType::MAX).setStrides({2, 2})); - layerId = builder.addLayer({{layerId}}, Builder::FullyConnectedLayer(alexNetNames[17]).setOutputNum(4096) - .setWeights(generateBlob(Precision::FP32, {4096, 256, 6, 6}, Layout::OIHW)) - .setBiases(generateBlob(Precision::FP32, {4096}, Layout::C))); + weightsId = builder.addLayer(Builder::ConstLayer("weights").setData(generateBlob(precision, {4096, 256, 6, 6}, Layout::OIHW))); + biasesId = builder.addLayer(Builder::ConstLayer("biases").setData(generateBlob(precision, {4096}, Layout::C))); + layerId = builder.addLayer({{layerId}, {weightsId}, {biasesId}}, Builder::FullyConnectedLayer(alexNetNames[17]).setOutputNum(4096)); layerId = builder.addLayer({{layerId}}, Builder::ReLULayer(alexNetNames[18])); - layerId = builder.addLayer({{layerId}}, Builder::FullyConnectedLayer(alexNetNames[19]).setOutputNum(4096) - .setWeights(generateBlob(Precision::FP32, {4096, 4096}, Layout::NC)) - .setBiases(generateBlob(Precision::FP32, {4096}, Layout::C))); + weightsId = builder.addLayer(Builder::ConstLayer("weights").setData(generateBlob(precision, {4096, 4096}, Layout::NC))); + biasesId = builder.addLayer(Builder::ConstLayer("biases").setData(generateBlob(precision, {4096}, Layout::C))); + layerId = builder.addLayer({{layerId}, {weightsId}, {biasesId}}, Builder::FullyConnectedLayer(alexNetNames[19]).setOutputNum(4096)); layerId = builder.addLayer({{layerId}}, Builder::ReLULayer(alexNetNames[20])); - layerId = builder.addLayer({{layerId}}, Builder::FullyConnectedLayer(alexNetNames[21]).setOutputNum(1000) - .setWeights(generateBlob(Precision::FP32, {1000, 4096}, Layout::NC)) - .setBiases(generateBlob(Precision::FP32, {1000}, Layout::C))); + weightsId = builder.addLayer(Builder::ConstLayer("weights").setData(generateBlob(precision, {1000, 4096}, Layout::NC))); + biasesId = builder.addLayer(Builder::ConstLayer("biases").setData(generateBlob(precision, {1000}, Layout::C))); + layerId = builder.addLayer({{layerId}, 
{weightsId}, {biasesId}}, Builder::FullyConnectedLayer(alexNetNames[21]).setOutputNum(1000)); layerId = builder.addLayer({{layerId}}, Builder::SoftMaxLayer(alexNetNames[22]).setAxis(1)); idx_t outputId = builder.addLayer({PortInfo(layerId)}, Builder::OutputLayer(alexNetNames[23])); return builder; } - const INetwork::Ptr createAlexnet() { + const INetwork::CPtr createAlexnet() { return prepateAlexnetBuilder().build(); } @@ -106,12 +110,11 @@ public: auto connections = network.getLayerConnections(layer->getId()); CNNLayerPtr cnnLayer; StatusCode sts = cnnNetwork.getLayerByName(layer->getName().c_str(), cnnLayer, nullptr); - if (sts != OK && layer->getType() == "Output") + if (sts != OK && (layer->getType() == "Output" || layer->getType() == "Const")) continue; else if (sts != OK) THROW_IE_EXCEPTION << "Cannot find CNNLayer by name: " << layer->getName(); - // Output connections for (size_t i = 0; i < cnnLayer->outData.size(); i++) { for (const auto& it : cnnLayer->outData[i]->inputTo) { @@ -124,9 +127,16 @@ public: } for (auto conIt = connections.begin(); conIt != connections.end(); conIt++) { + const auto& inputPorts = network.getLayer(conIt->to().layerId())->getInputPorts(); + idx_t realPortId(0); + for (size_t q = 0; q < conIt->to().portId() && q < inputPorts.size(); q++) { + if (inputPorts[q].getParameters().find("type") == inputPorts[q].getParameters().end()) + realPortId++; + } + if (conIt->from().layerId() == layer->getId() && conIt->from().portId() == i && - network.getLayer(conIt->to().layerId())->getName() == it.second->name && - conIt->to().portId() == j) { + network.getLayer(conIt->to().layerId())->getName() == it.second->name && + realPortId == j) { connections.erase(conIt); break; } @@ -162,7 +172,20 @@ public: if (connections.size() == 1 && network.getLayer(connections[0].to().layerId())->getType() == "Output") connections.erase(connections.begin()); - if (!connections.empty()) + bool connectionsConnected = true; + for (const auto& connection : connections) { + if (connection.to().layerId() != layer->getId()) { + connectionsConnected = false; + break; + } + const auto& port = layer->getInputPorts()[connection.to().portId()]; + if (port.getParameters().find("type") == port.getParameters().end()) { + connectionsConnected = false; + break; + } + } + + if (!connectionsConnected) THROW_IE_EXCEPTION << "Not all connections were connected."; } } @@ -282,18 +305,22 @@ TEST_F(NetworkBuilderTest, checkReshapeAlexNet) { Builder::Network builder = prepateAlexnetBuilder(); for (const auto &layer : builder.getLayers()) { - if (layer.getType() == "Input") { - ASSERT_EQ(outPorts[layer.getName()][0], layer.getOutputPorts()[0].shape()); - } else { - for (size_t j = 0; j < layer.getOutputPorts().size(); j++) { - ASSERT_TRUE(layer.getOutputPorts()[j].shape().empty()); + if (layer->getType() == "Input") { + ASSERT_EQ(outPorts[layer->getName()][0], layer->getOutputPorts()[0].shape()); + } else if (layer->getType() != "Const") { + for (const auto &port : layer->getOutputPorts()) { + ASSERT_TRUE(port.shape().empty()); } } } - INetwork::Ptr graph; + INetwork::CPtr graph; ASSERT_NO_THROW(graph = builder.build()); for (const auto &layer : *graph) { + if (layer->getType() == "Const") + continue; for (size_t i = 0; i < layer->getInputPorts().size(); i++) { + if (layer->getInputPorts()[i].getParameters().find("type") != layer->getInputPorts()[i].getParameters().end()) + continue; ASSERT_EQ(inPorts[layer->getName()][i], layer->getInputPorts()[i].shape()); } for (size_t i = 0; i < 
layer->getOutputPorts().size(); i++) { @@ -306,10 +333,10 @@ TEST_F(NetworkBuilderTest, checkNoImplWithCorrectPorts) { Context ctx; Builder::Network builder(ctx, "TestAlexNet"); idx_t inId = builder.addLayer(Builder::InputLayer(alexNetNames[0]).setPort(Port({1,3, 227, 227}))); - idx_t convId = builder.addLayer({{inId}}, Builder::ConvolutionLayer(alexNetNames[2]).setKernel({11, 11}).setStrides({4, 4}).setOutDepth(96) - .setInputPort(Port({1,3, 227, 227})).setOutputPort(Port({1, 96, 55, 55})) - .setWeights(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW)) - .setBiases(generateBlob(Precision::FP32, {96}, Layout::C))); + idx_t weightsId = builder.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW))); + idx_t biasesId = builder.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + idx_t convId = builder.addLayer({{inId}, {weightsId}, {biasesId}}, Builder::ConvolutionLayer(alexNetNames[2]).setKernel({11, 11}) + .setStrides({4, 4}).setOutDepth(96).setInputPort(Port({1,3, 227, 227})).setOutputPort(Port({1, 96, 55, 55}))); idx_t testLayerId = builder.addLayer({PortInfo(convId)}, Builder::Layer("TestLayer", "testPort") .setInputPorts({Port({1, 96, 55, 55})}).setOutputPorts({Port({1, 96, 55, 55})})); idx_t outputId = builder.addLayer({PortInfo(testLayerId)}, Builder::OutputLayer("out").setPort({Port({1, 96, 55, 55})})); @@ -321,33 +348,34 @@ TEST_F(NetworkBuilderTest, checkNoImplWithIncorrectPorts) { Context ctx; Builder::Network builder(ctx, "TestAlexNet"); idx_t inId = builder.addLayer(Builder::InputLayer(alexNetNames[0]).setPort(Port({1,3, 227, 227}))); - idx_t convId = builder.addLayer({{inId}}, Builder::ConvolutionLayer(alexNetNames[2]).setKernel({11, 11}).setStrides({4, 4}).setOutDepth(96) - .setInputPort(Port({1,3, 227, 227})).setOutputPort(Port({1, 96, 55, 55})) - .setWeights(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW)) - .setBiases(generateBlob(Precision::FP32, {96}, Layout::C))); - idx_t testLayerId = builder.addLayer({PortInfo(convId)}, Builder::Layer("TestLayer", "testPort") - .setInputPorts({Port({1, 3, 55, 55})}).setOutputPorts({Port({1, 96, 55, 55})})); - - ASSERT_THROW(builder.build(), InferenceEngine::details::InferenceEngineException); + idx_t weightsId = builder.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW))); + idx_t biasesId = builder.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {96}, Layout::C))); + idx_t convId = builder.addLayer({{inId}, {weightsId}, {biasesId}}, Builder::ConvolutionLayer(alexNetNames[2]).setKernel({11, 11}) + .setStrides({4, 4}).setOutDepth(96).setInputPort(Port({1,3, 227, 227})).setOutputPort(Port({1, 96, 55, 55}))); + ASSERT_THROW(builder.addLayer({PortInfo(convId)}, Builder::Layer("TestLayer", "testPort") + .setInputPorts({Port({1, 3, 55, 55})}).setOutputPorts({Port({1, 96, 55, 55})})), + InferenceEngine::details::InferenceEngineException); } TEST_F(NetworkBuilderTest, createNetworkIterator) { - const INetwork::Ptr graph = createAlexnet(); + const INetwork::CPtr graph = createAlexnet(); ASSERT_NO_THROW(graph->begin()); } TEST_F(NetworkBuilderTest, checkNetworkSize) { - const INetwork::Ptr graph = createAlexnet(); + const INetwork::CPtr graph = createAlexnet(); - ASSERT_EQ(24, graph->size()); + ASSERT_EQ(41, graph->size()); } TEST_F(NetworkBuilderTest, iterateNetworkForeach) { - const INetwork::Ptr graph = createAlexnet(); + const 
INetwork::CPtr graph = createAlexnet(); size_t idx = 0; for (const auto& layer : *graph) { + if (layer->getType() == "Const") + continue; ASSERT_NE(idx, alexNetNames.size()); ASSERT_EQ(alexNetNames[idx], layer->getName()); idx++; @@ -355,10 +383,12 @@ TEST_F(NetworkBuilderTest, iterateNetworkForeach) { } TEST_F(NetworkBuilderTest, iterateNetworkFor) { - const INetwork::Ptr graph = createAlexnet(); + const INetwork::CPtr graph = createAlexnet(); size_t idx = 0; for (auto it = graph->begin(); it != graph->end(); it++) { + if ((*it)->getType() == "Const") + continue; ASSERT_EQ(alexNetNames[idx], (*it)->getName()); idx++; } @@ -522,7 +552,7 @@ TEST_F(NetworkBuilderTest, convertFromICNNNetwork) { InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights); net_reader.SetWeights(weights_ptr); - INetwork::Ptr network = Builder::Network(net_reader.getNetwork()).build(); + INetwork::CPtr network = Builder::Network(net_reader.getNetwork()).build(); try { compareWithICNNNetwork(*network, net_reader.getNetwork()); @@ -801,26 +831,35 @@ TEST_F(NetworkBuilderTest, connectTwoNetworks) { // Find output idx_t lastLayerId(0); for (const auto& layer : originalNetwork.getLayers()) { - if (layer.getType() != "Output") + if (layer->getType() != "Output") continue; - const auto connections = originalNetwork.getLayerConnections(layer.getId()); + const auto connections = originalNetwork.getLayerConnections(layer->getId()); ASSERT_EQ(1, connections.size()); - ASSERT_EQ(layer.getId(), connections[0].to().layerId()); + ASSERT_EQ(layer->getId(), connections[0].to().layerId()); ASSERT_EQ(0, connections[0].from().portId()); lastLayerId = connections[0].from().layerId(); originalNetwork.disconnect(connections[0]); - originalNetwork.removeLayer(layer.getId()); + originalNetwork.removeLayer(layer->getId()); break; } std::map<idx_t, idx_t> oldNewId; - for (const auto& layer : addNetwork.getLayers()) { - if (layer.getType() == "Input") { - oldNewId[layer.getId()] = lastLayerId; + for (const auto& layer : addNetwork) { + if (layer->getType() == "Input") { + oldNewId[layer->getId()] = lastLayerId; continue; } - oldNewId[layer.getId()] = originalNetwork.addLayer(layer); - const auto connections = addNetwork.getLayerConnections(layer.getId()); + auto newLayer = layer; + if (newLayer->getType() != "Const") { + for (size_t i = 0; i < newLayer->getInputPorts().size(); i++) { + newLayer->getInputPorts()[i].setData(std::make_shared<PortData>()); + } + for (size_t i = 0; i < newLayer->getOutputPorts().size(); i++) { + newLayer->getOutputPorts()[i].setData(std::make_shared<PortData>()); + } + } + oldNewId[layer->getId()] = originalNetwork.addLayer(*newLayer); + const auto connections = addNetwork.getLayerConnections(layer->getId()); for (const auto& connection : connections) { if (oldNewId.find(connection.from().layerId()) == oldNewId.end() || oldNewId.find(connection.to().layerId()) == oldNewId.end()) @@ -829,8 +868,15 @@ TEST_F(NetworkBuilderTest, connectTwoNetworks) { {oldNewId[connection.to().layerId()], connection.to().portId()}); } - if (layer.getType() == "Convolution") { - Builder::ConvolutionLayer(originalNetwork.getLayer(oldNewId[layer.getId()])).setWeights(generateBlob(Precision::FP32, {16, 32, 7, 7}, Layout::OIHW)); + if (layer->getType() == "Convolution") { + idx_t weightsId = originalNetwork.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {16, 32, 7, 7}, Layout::OIHW))); + for (const auto& connection : originalNetwork.getLayerConnections(oldNewId[layer->getId()])) { + if (connection.to().layerId() != 
oldNewId[layer->getId()] || connection.to().portId() != 1) + continue; + originalNetwork.removeLayer(connection.from().layerId()); + originalNetwork.disconnect(connection); + } + originalNetwork.connect({weightsId}, {oldNewId[layer->getId()], 1}); } } ASSERT_NO_THROW(originalNetwork.build()); @@ -855,29 +901,41 @@ TEST_F(NetworkBuilderTest, createLayersWithTheSameNames) { ieLayer.setPaddingsEnd({0, 0, 0, 0}); ieLayer.setGroup(1); ieLayer.setOutDepth(outCn); - auto convLayerId = netBuilder.addLayer({inpLayerId}, ieLayer); + idx_t weightsId = netBuilder.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {1, 1, 3, 3}, Layout::OIHW))); + auto convLayerId = netBuilder.addLayer({{inpLayerId}, {weightsId}}, ieLayer); // Connect convolution layer with it's output InferenceEngine::Builder::OutputLayer outLayer("conv1"); auto convOutLayerId = netBuilder.addLayer({convLayerId}, outLayer); - ASSERT_NE(netBuilder.getLayer(convLayerId).getName(), netBuilder.getLayer(convOutLayerId).getName()); + ASSERT_NE(netBuilder.getLayer(convLayerId)->getName(), netBuilder.getLayer(convOutLayerId)->getName()); InferenceEngine::Builder::ReLULayer reLULayer("relu1"); reLULayer.setNegativeSlope(0); auto reluLayerId = netBuilder.addLayer({convLayerId}, reLULayer); InferenceEngine::Builder::OutputLayer outReLULayer("relu1"); auto reluOutLayerId = netBuilder.addLayer({reluLayerId}, outReLULayer); - ASSERT_NE(netBuilder.getLayer(reluLayerId).getName(), netBuilder.getLayer(reluOutLayerId).getName()); + ASSERT_NE(netBuilder.getLayer(reluLayerId)->getName(), netBuilder.getLayer(reluOutLayerId)->getName()); ASSERT_NO_THROW(netBuilder.build()); } TEST_F(NetworkBuilderTest, RemoveLayerAndBuild) { auto builder = prepateAlexnetBuilder(); - builder.removeLayer(builder.getLayers()[2].getId()); + builder.removeLayer(builder.getLayers()[2]->getId()); ASSERT_THROW(builder.build(), InferenceEngine::details::InferenceEngineException); } +TEST_F(NetworkBuilderTest, CheckConnectionsData) { + auto builder = prepateAlexnetBuilder(); + + for (const auto& connection : builder.getConnections()) { + const auto srcPort = builder.getLayer(connection.from().layerId())->getOutputPorts()[connection.from().portId()]; + const auto dstPort = builder.getLayer(connection.to().layerId())->getInputPorts()[connection.to().portId()]; + + ASSERT_EQ(srcPort.getData(), dstPort.getData()); + } +} + TEST_F(NetworkBuilderTest, DocumentationExample) { // Create graph with name InferenceEngine::Builder::Network graph("Example1"); @@ -897,11 +955,12 @@ TEST_F(NetworkBuilderTest, DocumentationExample) { data[0] = 1; data[1] = 2; data[2] = 3; - idx_t scaleShiftId = graph.addLayer(Builder::ScaleShiftLayer("scaleShift1").setBiases(blobWithScaleShiftBiases)); + idx_t biasesId = graph.addLayer(Builder::ConstLayer("biases").setData(blobWithScaleShiftBiases)); + idx_t scaleShiftId = graph.addLayer(Builder::ScaleShiftLayer("scaleShift1")); // Connect ScaleShift layer with relu1 graph.connect({relu1Id}, {scaleShiftId}); // Also port indexes could be defined (0 is default value) builder.connect({layerId, outPortIdx}, {scaleShiftId, inPortIdx}); - + graph.connect({biasesId}, {scaleShiftId, 2}); // Create ReLU layer with a negative slope 0.2 using generic layer builder and connect it with scaleShift idx_t relu2Id = graph.addLayer({{scaleShiftId}}, Builder::Layer("ReLU", "relu2").setParameters({{"negative_slope", 0.2f}}).setOutputPorts({Port()}).setInputPorts({Port()})); @@ -909,7 +968,7 @@ TEST_F(NetworkBuilderTest, DocumentationExample) { idx_t outId = 
graph.addLayer({{relu2Id, 0}}, Builder::OutputLayer("out")); // Build original network - InferenceEngine::INetwork::Ptr finalNetwork = graph.build(); + InferenceEngine::INetwork::CPtr finalNetwork = graph.build(); std::shared_ptr<ICNNNetwork> cnnNetwork = InferenceEngine::Builder::convertToICNNNetwork(finalNetwork); // Modify network @@ -923,5 +982,255 @@ TEST_F(NetworkBuilderTest, DocumentationExample) { // Connect scaleShift1 and out graph.connect({scaleShiftId}, {outId}); // Build network without relu2 - InferenceEngine::INetwork::Ptr changedNetwork = graph.build(); + InferenceEngine::INetwork::CPtr changedNetwork = graph.build(); +} + +TEST_F(NetworkBuilderTest, CreateFullyConnectedWithoutBiases) { + Builder::Network builder("network"); + Builder::FullyConnectedLayer fcBuilder("FullyConnected"); + + SizeVector inputDims = {1, 2, 16, 16}; // 1 KB + + idx_t layerId = builder.addLayer(Builder::InputLayer("input").setPort(Port(inputDims))); + + idx_t weightsId = builder.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, + {1024, 2, 16, 16}, Layout::OIHW))); + + layerId = builder.addLayer({{layerId}, {weightsId} }, Builder::FullyConnectedLayer("FullyConnected").setOutputNum(1024 * 1)); + + builder.addLayer({PortInfo(layerId)}, Builder::OutputLayer("output")); + + ASSERT_NO_THROW(std::shared_ptr<ICNNNetwork> cnnNetwork = InferenceEngine::Builder::convertToICNNNetwork(builder.build())); +} + +TEST_F(NetworkBuilderTest, CreateAndConvertNetworkWithoutWeightsWithConst) { + Builder::Network builder("network"); + + idx_t layerId = builder.addLayer(Builder::InputLayer("input").setPort(Port({1, 1, 10, 10}))); + layerId = builder.addLayer({layerId}, Builder::PoolingLayer("pool").setKernel({2, 2}).setStrides({2, 2}) + .setPoolingType(Builder::PoolingLayer::PoolingType::MAX)); + builder.addLayer({layerId}, Builder::OutputLayer("output")); + + + layerId = builder.addLayer(Builder::ConstLayer("constWA").setData(generateBlob(Precision::FP16, {1}, Layout::C))); + builder.addLayer({layerId}, Builder::OutputLayer("output_const")); + + auto cnnNetwork = InferenceEngine::CNNNetwork(InferenceEngine::Builder::convertToICNNNetwork(builder.build())); + ASSERT_EQ(Precision::FP16, cnnNetwork.getPrecision()); +} + +TEST_F(NetworkBuilderTest, CreateAndConvertNetworkWithoutWeights) { + Builder::Network builder("network"); + + idx_t layerId = builder.addLayer(Builder::InputLayer("input").setPort(Port({1, 1, 10, 10}, Precision::FP16))); + layerId = builder.addLayer({layerId}, Builder::PoolingLayer("pool").setKernel({2, 2}).setStrides({2, 2}) + .setPoolingType(Builder::PoolingLayer::PoolingType::MAX)); + builder.addLayer({layerId}, Builder::OutputLayer("output")); + + auto cnnNetwork = InferenceEngine::CNNNetwork(InferenceEngine::Builder::convertToICNNNetwork(builder.build())); + ASSERT_EQ(Precision::FP16, cnnNetwork.getPrecision()); +} + +TEST_F(NetworkBuilderTest, CreateAndNetworkWithPadLayer) { + Builder::Network builder("network"); + + idx_t layerId = builder.addLayer(Builder::InputLayer("input").setPort(Port({1, 2, 3, 4}))); + Builder::Layer padLayer("Pad", "padding"); + padLayer.getParameters()["pads_begin"] = std::vector<int>({0, 0, 1, 1}); + padLayer.getParameters()["pads_end"] = std::vector<int>({0, 0, 1, 1}); + padLayer.getParameters()["pad_mode"] = std::string("constant"); + padLayer.getParameters()["pad_value"] = 0; + padLayer.setInputPorts(std::vector<Port>(1)); + padLayer.setOutputPorts(std::vector<Port>(1)); + layerId = builder.addLayer({layerId}, padLayer); + builder.addLayer({layerId}, Builder::OutputLayer("output")); + + 
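+ // "Pad" has no dedicated builder class, so a generic Builder::Layer is configured through getParameters() and explicit input/output port vectors; the conversion below is expected to accept it and infer the padded shape.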
ASSERT_NO_THROW(InferenceEngine::CNNNetwork(InferenceEngine::Builder::convertToICNNNetwork(builder.build()))); +} + +TEST_F(NetworkBuilderTest, CreateLSTMFromBuilder) { + std::string model = R"V0G0N( + + + + + + 1 + 3 + 10 + + + + + + + 1 + 5 + + + + + + + 1 + 5 + + + + + + + + 1 + 3 + 10 + + + 1 + 5 + + + 1 + 5 + + + + + 1 + 3 + 5 + + + 1 + 5 + + + 1 + 5 + + + + + + + + + + + + + + )V0G0N"; + + InferenceEngine::CNNNetReader net_reader; + ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length())); + + Builder::Network builder("LSTMTINet"); + idx_t in0 = builder.addLayer(Builder::InputLayer("Input0").setPort(Port({1, 3, 10}))); + idx_t in1 = builder.addLayer(Builder::InputLayer("Input1").setPort(Port({1, 5}))); + idx_t in2 = builder.addLayer(Builder::InputLayer("Input2").setPort(Port({1, 5}))); + idx_t weightId = builder.addLayer(Builder::ConstLayer("weights").setData(generateBlob(Precision::FP32, {300}, Layout::C))); + idx_t biasesId = builder.addLayer(Builder::ConstLayer("biases").setData(generateBlob(Precision::FP32, {20}, Layout::C))); + idx_t lstm = builder.addLayer({{in0}, {weightId}, {biasesId}}, + Builder::LSTMSequenceLayer("RNN3") + .setDirection("Backward") + .setHiddenSize(5)); + builder.getLayer(lstm)->getOutputPorts()[0].setShape({1, 3, 5}); + builder.getLayer(lstm)->getOutputPorts()[1].setShape({1, 5}); + builder.getLayer(lstm)->getOutputPorts()[2].setShape({1, 5}); + builder.connect({in1}, {lstm, 4}); + builder.connect({in2}, {lstm, 5}); + + builder.addLayer({{lstm, 0}}, Builder::OutputLayer("output0")); + builder.addLayer({{lstm, 1}}, Builder::OutputLayer("output1")); + builder.addLayer({{lstm, 2}}, Builder::OutputLayer("output2")); + const auto network = Builder::convertToICNNNetwork(builder.build()); + try { + compareICNNNetworks(*network, net_reader.getNetwork()); + } catch (InferenceEngine::details::InferenceEngineException &ex) { + FAIL() << ex.what(); + } } + +TEST_F(NetworkBuilderTest, Fp16AlexNetInputPrecision) { + auto cnnNetwork = Builder::convertToICNNNetwork(prepateAlexnetBuilder(Precision::FP16).build()); + + OutputsDataMap outputs; + InputsDataMap inputs; + + cnnNetwork->getInputsInfo(inputs); + cnnNetwork->getOutputsInfo(outputs); + + auto input = inputs.begin()->second; + auto output = outputs.begin()->second; + ASSERT_EQ(Precision::FP32, input->getPrecision()); + ASSERT_EQ(Precision::FP32, output->getPrecision()); +} + +TEST_F(NetworkBuilderTest, CheckPreProcessAlexNet) { + auto cnnNetwork = Builder::convertToICNNNetwork(createAlexnet()); + + InputsDataMap inputs; + + cnnNetwork->getInputsInfo(inputs); + + auto input = inputs.begin()->second; + ASSERT_NE(input->getPreProcess().getResizeAlgorithm(), ResizeAlgorithm::RESIZE_BILINEAR); + input->getPreProcess().setResizeAlgorithm(ResizeAlgorithm::RESIZE_BILINEAR); + + auto newCnnNetwork = Builder::convertToICNNNetwork(Builder::Network(*cnnNetwork).build()); + newCnnNetwork->getInputsInfo(inputs); + input = inputs.begin()->second; + ASSERT_EQ(input->getPreProcess().getResizeAlgorithm(), ResizeAlgorithm::RESIZE_BILINEAR); +} + +TEST_F(NetworkBuilderTest, ReshapeNetworkTest) { + std::string model = R"V0G0N( + + + + + + 1 + 1000 + 1 + 1 + + + + + + + + 1 + 1000 + 1 + 1 + + + + + 1 + 1000 + + + + + + + +)V0G0N"; + + InferenceEngine::CNNNetReader net_reader; + ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length())); + auto network = Builder::convertToICNNNetwork(Builder::Network(net_reader.getNetwork()).build()); + + CNNLayerPtr layer; + network->getLayerByName("flatten", layer, nullptr); + 
ASSERT_EQ(layer->outData[0]->getDims().size(), 2); + try { + compareICNNNetworks(*network, net_reader.getNetwork()); + } catch (InferenceEngine::details::InferenceEngineException &ex) { + FAIL() << ex.what(); + } +} \ No newline at end of file diff --git a/inference-engine/tests/unit/builders/norm_layer_test.cpp b/inference-engine/tests/unit/builders/norm_layer_test.cpp new file mode 100644 index 0000000..72f2581 --- /dev/null +++ b/inference-engine/tests/unit/builders/norm_layer_test.cpp @@ -0,0 +1,64 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include "builder_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class NormLayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(NormLayerBuilderTest, getExistsLayerFromNetworkBuilderWithAcrossMapsEqualTrue) { + Builder::Network net("Test"); + auto layer = Builder::NormLayer("NormLayer").setAlpha(9.999999747378752e-05f).setBeta(0.75f).setSize(5).setAcrossMaps(true).setPort(Port({10, 10, 100, 100})); + size_t id = net.addLayer(layer); + Builder::NormLayer layerFromNetwork(net.getLayer(id)); + ASSERT_EQ(layer.getAlpha(), layerFromNetwork.getAlpha()); + ASSERT_EQ(layer.getBeta(), layerFromNetwork.getBeta()); + ASSERT_EQ(layer.getAcrossMaps(), layerFromNetwork.getAcrossMaps()); +} + +TEST_F(NormLayerBuilderTest, getExistsLayerFromNetworkBuilderWithAcrossMapsEqualFalse) { + Builder::Network net("Test"); + auto layer = Builder::NormLayer("NormLayer").setAlpha(9.999999747378752e-05f).setBeta(0.75f).setSize(5).setAcrossMaps(false).setPort(Port({10, 10, 100, 100})); + size_t id = net.addLayer(layer); + Builder::NormLayer layerFromNetwork(net.getLayer(id)); + ASSERT_EQ(layer.getAlpha(), layerFromNetwork.getAlpha()); + ASSERT_EQ(layer.getBeta(), layerFromNetwork.getBeta()); + ASSERT_EQ(layer.getAcrossMaps(), layerFromNetwork.getAcrossMaps()); +} + +TEST_F(NormLayerBuilderTest, cannotCreateNormLayerWithWrongAlpha) { + Builder::Network net("Test"); + auto layer = Builder::NormLayer("NormLayer").setAlpha(0).setBeta(0.75f).setSize(5).setAcrossMaps(true).setPort(Port({10, 10, 100, 100})); + ASSERT_THROW(net.addLayer(layer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(NormLayerBuilderTest, cannotCreateNormLayerWithWrongBeta) { + Builder::Network net("Test"); + auto layer = Builder::NormLayer("NormLayer").setAlpha(1).setBeta(0).setSize(5).setAcrossMaps(true).setPort(Port({10, 10, 100, 100})); + ASSERT_THROW(net.addLayer(layer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(NormLayerBuilderTest, cannotCreateNormLayerWithWrongSize) { + Builder::Network net("Test"); + auto layer = Builder::NormLayer("NormLayer").setAlpha(1).setBeta(1).setSize(0).setAcrossMaps(true).setPort(Port({10, 10, 100, 100})); + ASSERT_THROW(net.addLayer(layer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(NormLayerBuilderTest, cannotCreateLayerWithWrongShapes) { + Builder::Network net("network"); + Builder::Layer::Ptr fakeNormLayerPtr = std::make_shared<Builder::Layer>("Norm", "Norm layer"); + fakeNormLayerPtr->getInputPorts().push_back(Port({1, 1, 1, 1})); + fakeNormLayerPtr->getOutputPorts().push_back(Port({1, 1, 1, 2})); + Builder::NormLayer normLayer(fakeNormLayerPtr); + normLayer.setAlpha(1).setBeta(0).setSize(5).setAcrossMaps(true); + ASSERT_THROW(net.addLayer(normLayer), InferenceEngine::details::InferenceEngineException); +} + diff --git a/inference-engine/tests/unit/builders/normalize_layer_test.cpp 
b/inference-engine/tests/unit/builders/normalize_layer_test.cpp new file mode 100644 index 0000000..809f2b1 --- /dev/null +++ b/inference-engine/tests/unit/builders/normalize_layer_test.cpp @@ -0,0 +1,89 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include "builder_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class NormalizeLayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(NormalizeLayerBuilderTest, getExistsLayerFromNetworkBuilder1) { + Builder::Network net("network"); + Builder::NormalizeLayer normalizeLayer("normalizeLayer"); + normalizeLayer.setEpsilon(0.1).setChannelShared(true).setAcrossMaps(true); + size_t ind = net.addLayer(normalizeLayer); + Builder::NormalizeLayer layerFromNet(net.getLayer(ind)); + ASSERT_EQ(normalizeLayer.getEpsilon(), layerFromNet.getEpsilon()); +} + +TEST_F(NormalizeLayerBuilderTest, getExistsLayerFromNetworkBuilder2) { + Builder::Network net("network"); + Builder::NormalizeLayer normalizeLayer("normalizeLayer"); + normalizeLayer.setEpsilon(0.1).setChannelShared(true).setAcrossMaps(false); + size_t ind = net.addLayer(normalizeLayer); + Builder::NormalizeLayer layerFromNet(net.getLayer(ind)); + ASSERT_EQ(normalizeLayer.getEpsilon(), layerFromNet.getEpsilon()); +} + +TEST_F(NormalizeLayerBuilderTest, getExistsLayerFromNetworkBuilder3) { + Builder::Network net("network"); + Builder::NormalizeLayer normalizeLayer("normalizeLayer"); + normalizeLayer.setEpsilon(0.1).setChannelShared(false).setAcrossMaps(true); + size_t ind = net.addLayer(normalizeLayer); + Builder::NormalizeLayer layerFromNet(net.getLayer(ind)); + ASSERT_EQ(normalizeLayer.getEpsilon(), layerFromNet.getEpsilon()); +} + +TEST_F(NormalizeLayerBuilderTest, getExistsLayerFromNetworkBuilder4) { + Builder::Network net("network"); + Builder::NormalizeLayer normalizeLayer("normalizeLayer"); + normalizeLayer.setEpsilon(0.1).setChannelShared(false).setAcrossMaps(false); + size_t ind = net.addLayer(normalizeLayer); + Builder::NormalizeLayer layerFromNet(net.getLayer(ind)); + ASSERT_EQ(normalizeLayer.getEpsilon(), layerFromNet.getEpsilon()); +} + +TEST_F(NormalizeLayerBuilderTest, cannotCreateLayerWithWrongEpsilon1) { + Builder::Network net("network"); + Builder::NormalizeLayer normalizeLayer("normalizeLayer"); + normalizeLayer.setEpsilon(0).setChannelShared(true).setAcrossMaps(true); + ASSERT_THROW(net.addLayer(normalizeLayer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(NormalizeLayerBuilderTest, cannotCreateLayerWithWrongEpsilon2) { + Builder::Network net("network"); + Builder::NormalizeLayer normalizeLayer("normalizeLayer"); + normalizeLayer.setEpsilon(0).setChannelShared(true).setAcrossMaps(false); + ASSERT_THROW(net.addLayer(normalizeLayer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(NormalizeLayerBuilderTest, cannotCreateLayerWithWrongEpsilon3) { + Builder::Network net("network"); + Builder::NormalizeLayer normalizeLayer("normalizeLayer"); + normalizeLayer.setEpsilon(0).setChannelShared(false).setAcrossMaps(true); + ASSERT_THROW(net.addLayer(normalizeLayer), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(NormalizeLayerBuilderTest, cannotCreateLayerWithWrongEpsilon4) { + Builder::Network net("network"); + Builder::NormalizeLayer normalizeLayer("normalizeLayer"); + normalizeLayer.setEpsilon(0).setChannelShared(false).setAcrossMaps(false); + ASSERT_THROW(net.addLayer(normalizeLayer), 
InferenceEngine::details::InferenceEngineException); +} + +TEST_F(NormalizeLayerBuilderTest, cannotCreateLayerWithWrongShapes) { + Builder::Network net("network"); + Builder::Layer::Ptr fakeNormalizeLayerPtr = std::make_shared<Builder::Layer>("Normalize", "Normalize layer"); + fakeNormalizeLayerPtr->getInputPorts().push_back(Port({1, 1, 1, 1})); + fakeNormalizeLayerPtr->getOutputPorts().push_back(Port({1, 1, 1, 2})); + Builder::NormalizeLayer normalizeLayer(fakeNormalizeLayerPtr); + normalizeLayer.setEpsilon(0.1).setChannelShared(true).setAcrossMaps(true); + ASSERT_THROW(net.addLayer(normalizeLayer), InferenceEngine::details::InferenceEngineException); +} diff --git a/inference-engine/tests/unit/builders/output_layer_test.cpp b/inference-engine/tests/unit/builders/output_layer_test.cpp new file mode 100644 index 0000000..dc2b91b --- /dev/null +++ b/inference-engine/tests/unit/builders/output_layer_test.cpp @@ -0,0 +1,25 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include "builder_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class OutputLayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(OutputLayerBuilderTest, getExistsLayerFromNetworkBuilder) { + Builder::Network network("network"); + Builder::OutputLayer layer("output layer"); + layer.setPort(Port({1, 1, 1, 1})); + size_t ind = network.addLayer(layer); + Builder::OutputLayer layerFromNet(network.getLayer(ind)); + ASSERT_EQ(layer.getPort().shape(), layerFromNet.getPort().shape()); + ASSERT_EQ(layer.getPort().shape(), Port({1, 1, 1, 1}).shape()); +} \ No newline at end of file diff --git a/inference-engine/tests/unit/builders/relu6_layer_test.cpp b/inference-engine/tests/unit/builders/relu6_layer_test.cpp new file mode 100644 index 0000000..a0e9340 --- /dev/null +++ b/inference-engine/tests/unit/builders/relu6_layer_test.cpp @@ -0,0 +1,34 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include "builder_test.hpp" + +using namespace testing; +using namespace 
InferenceEngine; + +class ReLU6LayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(ReLU6LayerBuilderTest, getExistsLayerFromNetworkBuilder) { + Builder::Network net("network"); + Builder::ReLU6Layer relu6Layer("relu6layer"); + relu6Layer.setN(100); + size_t ind = net.addLayer(relu6Layer); + Builder::ReLU6Layer layerFromNet(net.getLayer(ind)); + ASSERT_EQ(relu6Layer.getN(), layerFromNet.getN()); +} + +TEST_F(ReLU6LayerBuilderTest, cannotCreateLayerWithWrongShapes) { + Builder::Network net("network"); + Builder::Layer::Ptr fakeReLU6LayerPtr = std::make_shared<Builder::Layer>("ReLU6", "ReLU6 layer"); + fakeReLU6LayerPtr->getInputPorts().push_back(Port({1, 1, 1, 1})); + fakeReLU6LayerPtr->getOutputPorts().push_back(Port({1, 1, 1, 2})); + Builder::ReLU6Layer reLU6Layer(fakeReLU6LayerPtr); + reLU6Layer.setN(10); + ASSERT_THROW(net.addLayer(reLU6Layer), InferenceEngine::details::InferenceEngineException); +} \ No newline at end of file diff --git a/inference-engine/tests/unit/builders/relu_layer_test.cpp b/inference-engine/tests/unit/builders/relu_layer_test.cpp new file mode 100644 index 0000000..a05a5d9 --- /dev/null +++ b/inference-engine/tests/unit/builders/relu_layer_test.cpp @@ -0,0 +1,41 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include "builder_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class ReLULayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(ReLULayerBuilderTest, getExistsLayerFromNetworkBuilder) { + Builder::Network net("network"); + Builder::ReLULayer reluLayer("ReLU_layer"); + reluLayer.setNegativeSlope(100); + size_t ind = net.addLayer(reluLayer); + Builder::ReLULayer layerFromNet(net.getLayer(ind)); + ASSERT_EQ(reluLayer.getNegativeSlope(), layerFromNet.getNegativeSlope()); +} + +TEST_F(ReLULayerBuilderTest, cannotCreateLayerWithWrongNegativeSlope) { + Builder::Network net("network"); + Builder::ReLULayer reluLayer("ReLU_layer"); + reluLayer.setNegativeSlope(-10); + ASSERT_NO_THROW(net.addLayer(reluLayer)); +} + +TEST_F(ReLULayerBuilderTest, cannotCreateLayerWithWrongShapes) { + Builder::Network net("network"); + Builder::Layer::Ptr fakeReLULayerPtr = std::make_shared<Builder::Layer>("ReLU", "ReLU layer"); + fakeReLULayerPtr->getInputPorts().push_back(Port({1, 1, 1, 1})); + fakeReLULayerPtr->getOutputPorts().push_back(Port({1, 1, 1, 2})); + Builder::ReLULayer reluLayer(fakeReLULayerPtr); + reluLayer.setNegativeSlope(100); + ASSERT_THROW(net.addLayer(reluLayer), InferenceEngine::details::InferenceEngineException); +} diff --git a/inference-engine/tests/unit/builders/resample_layer_test.cpp b/inference-engine/tests/unit/builders/resample_layer_test.cpp new file mode 100644 index 0000000..0591080 --- /dev/null +++ b/inference-engine/tests/unit/builders/resample_layer_test.cpp @@ -0,0 +1,35 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include "builder_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class ResampleLayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(ResampleLayerBuilderTest, checkTypeParameter) { + InferenceEngine::Builder::Layer ieLayer("Resample", "upsample"); + ieLayer.getParameters()["type"] = std::string("caffe.ResampleParameter.NEAREST"); + ieLayer.getParameters()["antialias"] = false; + ieLayer.getParameters()["factor"] = 2.0f; + ieLayer.getParameters()["width"] = 10; + ieLayer.getParameters()["height"] = 10; + + ASSERT_EQ("Resample", ieLayer.getType()); + ASSERT_EQ("caffe.ResampleParameter.NEAREST", ieLayer.getParameters()["type"].as<std::string>()); + + InferenceEngine::Builder::ResampleLayer resampleLayer("upsample"); + resampleLayer.setResampleType("caffe.ResampleParameter.NEAREST"); + resampleLayer.setAntialias(false); + resampleLayer.setFactor(2); + resampleLayer.setWidth(10); + resampleLayer.setHeight(10); + ASSERT_EQ("Resample", resampleLayer.getType()); + ASSERT_EQ("caffe.ResampleParameter.NEAREST", resampleLayer.getResampleType()); +} \ No newline at end of file diff --git a/inference-engine/tests/unit/builders/split_layer_test.cpp b/inference-engine/tests/unit/builders/split_layer_test.cpp new file mode 100644 index 0000000..145295e --- /dev/null +++ b/inference-engine/tests/unit/builders/split_layer_test.cpp @@ -0,0 +1,83 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include "builder_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class SplitLayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(SplitLayerBuilderTest, CreateIdentitySplitLayer) { + Builder::Network builder("network"); + SizeVector shape = {1, 4, 3, 4}; + idx_t layerId = builder.addLayer(Builder::InputLayer("input").setPort(Port(shape, Precision::FP16))); + layerId = builder.addLayer({layerId}, 
Builder::SplitLayer("identity").setOutputPorts({Port()})); + builder.addLayer({layerId}, Builder::OutputLayer("output")); + + const auto network = builder.build(); + ASSERT_EQ(shape, network->getLayer(layerId)->getOutputPorts()[0].shape()); +} + +TEST_F(SplitLayerBuilderTest, CreateSplitLayerWithTwoOutputs) { + Builder::Network builder("network"); + SizeVector shape = {1, 4, 3, 4}; + SizeVector outShape = {1, 2, 3, 4}; + idx_t layerId = builder.addLayer(Builder::InputLayer("input").setPort(Port(shape, Precision::FP16))); + layerId = builder.addLayer({layerId}, Builder::SplitLayer("split").setOutputPorts({Port(), Port()})); + builder.addLayer({{layerId}}, Builder::OutputLayer("output1")); + builder.addLayer({{layerId, 1}}, Builder::OutputLayer("output2")); + + const auto network = builder.build(); + ASSERT_EQ(outShape, network->getLayer(layerId)->getOutputPorts()[0].shape()); + ASSERT_EQ(outShape, network->getLayer(layerId)->getOutputPorts()[1].shape()); +} + +TEST_F(SplitLayerBuilderTest, CreateSplitLayerWithTwoOutputsAndOneInitialized) { + Builder::Network builder("network"); + SizeVector shape = {1, 4, 3, 4}; + SizeVector outShape1 = {1, 3, 3, 4}; + SizeVector outShape2 = {1, 1, 3, 4}; + idx_t layerId = builder.addLayer(Builder::InputLayer("input").setPort(Port(shape, Precision::FP16))); + layerId = builder.addLayer({layerId}, Builder::SplitLayer("split").setOutputPorts({Port(outShape1), Port()})); + builder.addLayer({{layerId}}, Builder::OutputLayer("output1")); + builder.addLayer({{layerId, 1}}, Builder::OutputLayer("output2")); + + const auto network = builder.build(); + ASSERT_EQ(outShape1, network->getLayer(layerId)->getOutputPorts()[0].shape()); + ASSERT_EQ(outShape2, network->getLayer(layerId)->getOutputPorts()[1].shape()); +} + +TEST_F(SplitLayerBuilderTest, CreateSplitLayerWithTwoOutputsAxis3) { + Builder::Network builder("network"); + SizeVector shape = {1, 4, 3, 4}; + SizeVector outShape = {1, 4, 3, 2}; + idx_t layerId = builder.addLayer(Builder::InputLayer("input").setPort(Port(shape, Precision::FP16))); + layerId = builder.addLayer({layerId}, Builder::SplitLayer("split").setAxis(3).setOutputPorts({Port(), Port()})); + builder.addLayer({{layerId}}, Builder::OutputLayer("output1")); + builder.addLayer({{layerId, 1}}, Builder::OutputLayer("output2")); + + const auto network = builder.build(); + ASSERT_EQ(outShape, network->getLayer(layerId)->getOutputPorts()[0].shape()); + ASSERT_EQ(outShape, network->getLayer(layerId)->getOutputPorts()[1].shape()); +} + +TEST_F(SplitLayerBuilderTest, CreateSplitLayerWithTwoOutputsAxis3AndOneInitialized) { + Builder::Network builder("network"); + SizeVector shape = {1, 4, 3, 4}; + SizeVector outShape1 = {1, 4, 3, 1}; + SizeVector outShape2 = {1, 4, 3, 3}; + idx_t layerId = builder.addLayer(Builder::InputLayer("input").setPort(Port(shape, Precision::FP16))); + layerId = builder.addLayer({layerId}, Builder::SplitLayer("split").setAxis(3).setOutputPorts({Port(outShape1), Port()})); + builder.addLayer({{layerId}}, Builder::OutputLayer("output1")); + builder.addLayer({{layerId, 1}}, Builder::OutputLayer("output2")); + + const auto network = builder.build(); + ASSERT_EQ(outShape1, network->getLayer(layerId)->getOutputPorts()[0].shape()); + ASSERT_EQ(outShape2, network->getLayer(layerId)->getOutputPorts()[1].shape()); +} \ No newline at end of file diff --git a/inference-engine/tests/unit/builders/tanh_layer_test.cpp b/inference-engine/tests/unit/builders/tanh_layer_test.cpp new file mode 100644 index 0000000..0e37aa5 --- /dev/null +++ 
b/inference-engine/tests/unit/builders/tanh_layer_test.cpp @@ -0,0 +1,31 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include "builder_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class TanHLayerBuilderTest : public BuilderTestCommon {}; + +TEST_F(TanHLayerBuilderTest, getExistsLayerFromNetworkBuilder) { + Builder::Network net("network"); + Builder::TanHLayer tanhLayer("TanH_layer"); + size_t ind = net.addLayer(tanhLayer); + Builder::TanHLayer layerFromNet(net.getLayer(ind)); +} + +TEST_F(TanHLayerBuilderTest, cannotCreateLayerWithWrongShapes) { + Builder::Network net("network"); + Builder::Layer::Ptr fakeTanHLayerPtr = std::make_shared<Builder::Layer>("TanH", "TanH layer"); + fakeTanHLayerPtr->getInputPorts().push_back(Port({1, 1, 1, 1})); + fakeTanHLayerPtr->getOutputPorts().push_back(Port({1, 1, 1, 2})); + Builder::TanHLayer tanhLayer(fakeTanHLayerPtr); + ASSERT_THROW(net.addLayer(tanhLayer), InferenceEngine::details::InferenceEngineException); +} \ No newline at end of file diff --git a/inference-engine/tests/unit/builders/transform_network_test.cpp b/inference-engine/tests/unit/builders/transform_network_test.cpp new file mode 100644 index 0000000..2ae9968 --- /dev/null +++ b/inference-engine/tests/unit/builders/transform_network_test.cpp @@ -0,0 +1,185 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include "builder_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class TransformNetworkTest: public BuilderTestCommon {}; + +TEST_F(TransformNetworkTest, AddNewLayer) { + Builder::Network builder("test"); + Transform::Network network(builder); + ASSERT_EQ(0, builder.size()); + network.addLayer(Builder::InputLayer("in1").setPort(Port({1, 3, 27, 27}))); + ASSERT_EQ(1, builder.size()); +} + +TEST_F(TransformNetworkTest, RemoveLayer) { + Builder::Network builder("test"); + Transform::Network network(builder); + ASSERT_EQ(0, builder.size()); + Transform::Layer layer = network.addLayer(Builder::InputLayer("in1").setPort(Port({1, 3, 27, 27}))); + ASSERT_EQ(1, builder.size()); + + network.removeLayer(layer); + ASSERT_EQ(0, builder.size()); +} + +TEST_F(TransformNetworkTest, GetIncorrectPort) { + Builder::Network builder("test"); + Transform::Network network(builder); + Transform::Layer layer = network.addLayer(Builder::InputLayer("in1").setPort(Port({1, 3, 27, 27}))); + ASSERT_THROW(layer.getInPort(), InferenceEngine::details::InferenceEngineException); + ASSERT_THROW(layer.getOutPort(1), InferenceEngine::details::InferenceEngineException); +} + + +TEST_F(TransformNetworkTest, GetCorrectPort) { + Builder::Network builder("test"); + Transform::Network network(builder); + Transform::Layer layer = network.addLayer(Builder::InputLayer("in1").setPort(Port({1, 3, 27, 27}))); + ASSERT_NO_THROW(layer.getOutPort()); + ASSERT_NO_THROW(layer.getOutPort(0)); +} + +TEST_F(TransformNetworkTest, GetLayerById) { + Builder::Network builder("test"); + Transform::Network network(builder); + Transform::Layer layer = network.addLayer(Builder::InputLayer("in1").setPort(Port({1, 3, 27, 27}))); + ASSERT_NO_THROW(network.getLayer(layer.getId())); +} + +TEST_F(TransformNetworkTest, GetLayerByName) { + Builder::Network builder("test"); + Transform::Network network(builder); + network.addLayer(Builder::InputLayer("in1").setPort(Port({1, 3, 27, 27}))); + ASSERT_NO_THROW(network.getLayer("in1")); +} + 
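+// A minimal usage sketch of the Transform wrapper exercised in this file, kept as a comment and assuming only the API already used by these tests: +// +// Builder::Network builder("example"); +// Transform::Network network(builder); // edits made through network mutate builder +// Transform::Layer in = network.addLayer(Builder::InputLayer("in").setPort(Port({1, 3, 27, 27}))); +// Transform::Layer relu = network.addLayer(Builder::ReLULayer("relu")); +// network.connect(in, relu); // same effect as connecting in.getOutPort() to relu.getInPort() +// const auto graph = builder.build(); // the built network reflects all edits made via the wrapper +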
+TEST_F(TransformNetworkTest, ConnectTwoLayers) { + Builder::Network builder("test"); + Transform::Network network(builder); + Transform::Layer input = network.addLayer(Builder::InputLayer("in1").setPort(Port({1, 3, 27, 27}))); + Transform::Layer relu = network.addLayer(Builder::ReLULayer("relu1")); + ASSERT_EQ(2, builder.size()); + ASSERT_EQ(0, builder.getConnections().size()); + network.connect(input, relu); + ASSERT_EQ(1, builder.getConnections().size()); +} + +TEST_F(TransformNetworkTest, ConnectTwoPorts) { + Builder::Network builder("test"); + Transform::Network network(builder); + Transform::Port inputPort = network.addLayer(Builder::InputLayer("in1").setPort(Port({1, 3, 27, 27}))).getOutPort(); + Transform::Port reluPort = network.addLayer(Builder::ReLULayer("relu1")).getInPort(); + ASSERT_EQ(2, builder.size()); + ASSERT_EQ(0, builder.getConnections().size()); + network.connect(inputPort, reluPort); + ASSERT_EQ(1, builder.getConnections().size()); +} + +TEST_F(TransformNetworkTest, DisconnectTwoLayers) { + Builder::Network builder("test"); + Transform::Network network(builder); + Transform::Layer input = network.addLayer(Builder::InputLayer("in1").setPort(Port({1, 3, 27, 27}))); + Transform::Layer relu = network.addLayer(Builder::ReLULayer("relu1")); + ASSERT_EQ(2, builder.size()); + ASSERT_EQ(0, builder.getConnections().size()); + network.connect(input, relu); + ASSERT_EQ(1, builder.getConnections().size()); + network.disconnect(input, relu); + ASSERT_EQ(0, builder.getConnections().size()); +} + +TEST_F(TransformNetworkTest, DisonnectTwoPorts) { + Builder::Network builder("test"); + Transform::Network network(builder); + Transform::Port inputPort = network.addLayer(Builder::InputLayer("in1").setPort(Port({1, 3, 27, 27}))).getOutPort(); + Transform::Port reluPort = network.addLayer(Builder::ReLULayer("relu1")).getInPort(); + ASSERT_EQ(2, builder.size()); + ASSERT_EQ(0, builder.getConnections().size()); + network.connect(inputPort, reluPort); + ASSERT_EQ(1, builder.getConnections().size()); + network.disconnect(inputPort, reluPort); + ASSERT_EQ(0, builder.getConnections().size()); +} + +TEST_F(TransformNetworkTest, RemoveLayerAndConnection) { + Builder::Network builder("test"); + Transform::Network network(builder); + Transform::Layer input = network.addLayer(Builder::InputLayer("in1").setPort(Port({1, 3, 27, 27}))); + Transform::Layer relu = network.addLayer(Builder::ReLULayer("relu1")); + network.connect(input, relu); + ASSERT_EQ(1, builder.getConnections().size()); + ASSERT_EQ(2, builder.size()); + network.removeLayer(relu); + ASSERT_EQ(0, builder.getConnections().size()); + ASSERT_EQ(1, builder.size()); +} + +TEST_F(TransformNetworkTest, GetInitializedConnection) { + Builder::Network builder("test"); + Transform::Network network(builder); + Transform::Layer input = network.addLayer(Builder::InputLayer("in1").setPort(Port({1, 3, 27, 27}))); + Transform::Layer relu = network.addLayer(Builder::ReLULayer("relu1")); + network.connect(input, relu); + ASSERT_EQ(input.getOutPort(), relu.getInPort().getConnection().getSource()); +} + +TEST_F(TransformNetworkTest, GetIncorrectConnections) { + Builder::Network builder("test"); + Transform::Network network(builder); + Transform::Layer input = network.addLayer(Builder::InputLayer("in1").setPort(Port({1, 3, 27, 27}))); + Transform::Layer relu = network.addLayer(Builder::ReLULayer("relu1")); + ASSERT_THROW(relu.getInPort().getConnection().getSource(), InferenceEngine::details::InferenceEngineException); + 
ASSERT_THROW(input.getOutPort().getConnection().getDestination(), InferenceEngine::details::InferenceEngineException); + ASSERT_NO_THROW(input.getOutPort().getConnection().getSource()); + ASSERT_NO_THROW(relu.getInPort().getConnection().getDestination()); +} + +TEST_F(TransformNetworkTest, ConnectToSourcePortsFromConnection) { + Builder::Network builder("test"); + Transform::Network network(builder); + Transform::Port inputPort = network.addLayer(Builder::InputLayer("in1").setPort(Port({1, 3, 27, 27}))).getOutPort(); + Transform::Port reluPort = network.addLayer(Builder::ReLULayer("relu1")).getInPort(); + ASSERT_EQ(2, builder.size()); + ASSERT_EQ(0, builder.getConnections().size()); + ASSERT_NO_THROW(inputPort.getConnection().setDestination(reluPort)); + ASSERT_EQ(1, builder.getConnections().size()); +} + +TEST_F(TransformNetworkTest, ConnectWithTwoDestinations) { + Builder::Network builder("test"); + Transform::Network network(builder); + Transform::Port inputPort = network.addLayer(Builder::InputLayer("in1").setPort(Port({1, 3, 27, 27}))).getOutPort(); + Transform::Port reluPort1 = network.addLayer(Builder::ReLULayer("relu1")).getInPort(); + Transform::Port reluPort2 = network.addLayer(Builder::ReLULayer("relu2")).getInPort(); + ASSERT_EQ(3, builder.size()); + ASSERT_EQ(0, builder.getConnections().size()); + ASSERT_NO_THROW(inputPort.getConnection().setDestination(reluPort1)); + ASSERT_NO_THROW(inputPort.getConnection().addDestination(reluPort2)); + ASSERT_THROW(inputPort.getConnection().addDestination(reluPort2), InferenceEngine::details::InferenceEngineException); + ASSERT_EQ(2, builder.getConnections().size()); + ASSERT_THROW(inputPort.getConnection().setDestination(reluPort2), InferenceEngine::details::InferenceEngineException); + ASSERT_NO_THROW(inputPort.getConnection().setDestinations({reluPort2, reluPort1})); + ASSERT_EQ(2, builder.getConnections().size()); +} + +TEST_F(TransformNetworkTest, ConnectToDestinationPortsFromConnection) { + Builder::Network builder("test"); + Transform::Network network(builder); + Transform::Port inputPort = network.addLayer(Builder::InputLayer("in1").setPort(Port({1, 3, 27, 27}))).getOutPort(); + Transform::Port reluPort = network.addLayer(Builder::ReLULayer("relu1")).getInPort(); + ASSERT_EQ(2, builder.size()); + ASSERT_EQ(0, builder.getConnections().size()); + reluPort.getConnection().setSource(inputPort); + ASSERT_EQ(1, builder.getConnections().size()); +} \ No newline at end of file diff --git a/inference-engine/tests/unit/cnn_network/cnn_layer_validation_tests.cpp b/inference-engine/tests/unit/cnn_network/cnn_layer_validation_tests.cpp new file mode 100644 index 0000000..d06687e --- /dev/null +++ b/inference-engine/tests/unit/cnn_network/cnn_layer_validation_tests.cpp @@ -0,0 +1,99 @@ +/* +* INTEL CONFIDENTIAL +* Copyright (C) 2018-2019 Intel Corporation. +* +* The source code contained or described herein and all documents +* related to the source code ("Material") are owned by Intel Corporation +* or its suppliers or licensors. Title to the Material remains with +* Intel Corporation or its suppliers and licensors. The Material may +* contain trade secrets and proprietary and confidential information +* of Intel Corporation and its suppliers and licensors, and is protected +* by worldwide copyright and trade secret laws and treaty provisions. +* No part of the Material may be used, copied, reproduced, modified, +* published, uploaded, posted, transmitted, distributed, or disclosed +* in any way without Intel's prior express written permission. 
+* +* No license under any patent, copyright, trade secret or other +* intellectual property right is granted to or conferred upon you by +* disclosure or delivery of the Materials, either expressly, by implication, +* inducement, estoppel or otherwise. Any license under such intellectual +* property rights must be express and approved by Intel in writing. +* +* Include any supplier copyright notices as supplier requires Intel to use. +* +* Include supplier trademarks or logos as supplier requires Intel to use, +* preceded by an asterisk. An asterisked footnote can be added as follows: +* *Third Party trademarks are the property of their respective owners. +* +* Unless otherwise agreed by Intel in writing, you may not remove or alter +* this notice or any other notice embedded in Materials by Intel or Intel's +* suppliers or licensors in any way. +*/ +#include +#include +#include +#include +#include +#include +#include <../shape_infer/built_in_shape_infer_general_test.hpp> +#include +#include <../include/ie_data.h> + +#include "layer_builder.h" +#include "shapes.h" +using namespace InferenceEngine; +using namespace InferenceEngine::details; + +TEST_P(CNNLayerValidationTests, checkValidParams) { + + assertThat(type)->setParams(valid_params); + auto layer = getLayer(); + LayerValidator::Ptr validator = LayerValidators::getInstance()->getValidator(type); + + ASSERT_NO_THROW(validator->parseParams(layer.get())); + ASSERT_NO_THROW(validator->checkParams(layer.get())); +} + +TEST_P(CNNLayerValidationTests, checkInvalidParams) { + + assertThat(type); + int numberOfParams = getNumOfParams(); + LayerValidator::Ptr validator = LayerValidators::getInstance()->getValidator(type); + auto layer_ = getLayer(); + for (int i = 0; i < numberOfParams; ++i) { + layer->setParams(!valid_params); + ASSERT_THROW(validator->parseParams(layer_.get()), InferenceEngineException); + ASSERT_THROW(validator->checkParams(layer_.get()), InferenceEngineException); + } +} + +TEST_P(CNNLayerValidationTests, checkInvalidInputShapes) { + LayerValidator::Ptr validator = LayerValidators::getInstance()->getValidator(type); + std::vector spData; + assertThat(type)->setShapes(spData, !valid_input); + + auto layer_ = getLayer(); + InOutDims shapes; + InferenceEngine::details::getInOutShapes(layer_.get(), shapes); + ASSERT_THROW(validator->checkShapes(layer_.get(), shapes.inDims), InferenceEngineException); +} + +TEST_P(CNNLayerValidationTests, checkValidShapes) { + + std::vector spData; + assertThat(type)->setShapes(spData, valid_input); + auto layer = getLayer(); + LayerValidator::Ptr validator = LayerValidators::getInstance()->getValidator(type); + InOutDims shapes; + InferenceEngine::details::getInOutShapes(layer.get(), shapes); + ASSERT_NO_THROW(validator->checkShapes(layer.get(), shapes.inDims)); +} + +INSTANTIATE_TEST_CASE_P( + InstantiationName, CNNLayerValidationTests, + ::testing::Values( + "Convolution" + ,"Deconvolution" + ,"DetectionOutput" + ) +); diff --git a/inference-engine/tests/unit/cnn_network/cnn_net_reader_impl_test.cpp b/inference-engine/tests/unit/cnn_network/cnn_net_reader_impl_test.cpp index e33362f..d3c96be 100644 --- a/inference-engine/tests/unit/cnn_network/cnn_net_reader_impl_test.cpp +++ b/inference-engine/tests/unit/cnn_network/cnn_net_reader_impl_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +10,8 @@ #include #include "cnn_network_impl.hpp" #include "mock_iformat_parser.hpp" +#include 
+#include using namespace testing; using namespace InferenceEngine; @@ -26,6 +28,7 @@ struct MockFormatParserCreator : public FormatParserCreator { MockFormatParserCreator() { _parser = make_shared(); } + std::shared_ptr create(int version) override { return _parser; } @@ -1697,49 +1700,49 @@ TEST_F(CNNNetReaderImplTest, cycleIsDetectedInReader) { TEST_F(CNNNetReaderImplTest, canRead3DConvolution) { std::string model = - "" - " " - " " - " " - " " - " 1" - " 3" - " 16" - " 112" - " 112" - " " - " " - " " - " " - " " - " " - " " - " 1" - " 3" - " 16" - " 112" - " 112" - " " - " " - " " - " " - " 1" - " 64" - " 16" - " 56" - " 56" - " " - " " - " " - " " - " " - " " - " " - " " - " " - " " - " " - ""; + "" + " " + " " + " " + " " + " 1" + " 3" + " 16" + " 112" + " 112" + " " + " " + " " + " " + " " + " " + " " + " 1" + " 3" + " 16" + " 112" + " 112" + " " + " " + " " + " " + " 1" + " 64" + " 16" + " 56" + " 56" + " " + " " + " " + " " + " " + " " + " " + " " + " " + " " + " " + ""; CNNNetReaderImpl reader(make_shared()); ASSERT_EQ(OK, reader.ReadNetwork(model.data(), model.length(), &resp)); @@ -1748,7 +1751,7 @@ TEST_F(CNNNetReaderImplTest, canRead3DConvolution) { CNNLayerPtr layer; ASSERT_EQ(OK, network->getLayerByName("3D_conv", layer, nullptr)); - auto *conv = dynamic_cast(layer.get()); + auto* conv = dynamic_cast(layer.get()); ASSERT_NE(nullptr, conv); ASSERT_EQ(conv->_kernel[X_AXIS], 5); ASSERT_EQ(conv->_kernel[Y_AXIS], 3); @@ -1769,45 +1772,45 @@ TEST_F(CNNNetReaderImplTest, canRead3DConvolution) { TEST_F(CNNNetReaderImplTest, canRead3DPooling) { std::string model = - "" - " " - " " - " " - " " - " 1" - " 3" - " 16" - " 112" - " 112" - " " - " " - " " - " " - " " - " " - " " - " 1" - " 3" - " 16" - " 112" - " 112" - " " - " " - " " - " " - " 1" - " 64" - " 8" - " 28" - " 28" - " " - " " - " " - " " - " " - " " - " " - ""; + "" + " " + " " + " " + " " + " 1" + " 3" + " 16" + " 112" + " 112" + " " + " " + " " + " " + " " + " " + " " + " 1" + " 3" + " 16" + " 112" + " 112" + " " + " " + " " + " " + " 1" + " 64" + " 8" + " 28" + " 28" + " " + " " + " " + " " + " " + " " + " " + ""; CNNNetReaderImpl reader(make_shared()); ASSERT_EQ(OK, reader.ReadNetwork(model.data(), model.length(), &resp)); @@ -1817,7 +1820,7 @@ TEST_F(CNNNetReaderImplTest, canRead3DPooling) { CNNLayerPtr layer; ASSERT_EQ(OK, network->getLayerByName("3D_pooling", layer, nullptr)); - auto *pool = dynamic_cast(layer.get()); + auto* pool = dynamic_cast(layer.get()); ASSERT_NE(nullptr, pool); ASSERT_EQ(pool->_kernel[X_AXIS], 5); ASSERT_EQ(pool->_kernel[Y_AXIS], 3); @@ -1862,22 +1865,7 @@ TEST_F(CNNNetReaderImplTest, canParseWithoutInput_1to2) { CNNNetReaderImpl reader(make_shared()); sts = reader.ReadNetwork(model.data(), model.length(), &resp); - ASSERT_EQ(OK, sts) << resp.msg; - - auto net = reader.getNetwork(&resp); - ASSERT_NE(nullptr, net ) << resp.msg; - - InputsDataMap in_map; - OutputsDataMap out_map; - net->getInputsInfo(in_map); - net->getOutputsInfo(out_map); - - ASSERT_EQ(in_map.size(), 1); auto i = in_map.begin(); - ASSERT_EQ(i++->second->name(), "Boo"); - - ASSERT_EQ(out_map.size(), 2); auto o = out_map.begin(); - ASSERT_EQ(o++->second->getName(), "Boo.0"); - ASSERT_EQ(o++->second->getName(), "Boo.1"); + ASSERT_EQ(GENERAL_ERROR, sts) << resp.msg; } TEST_F(CNNNetReaderImplTest, canParseWithoutInput_2to1) { @@ -1909,26 +1897,11 @@ TEST_F(CNNNetReaderImplTest, canParseWithoutInput_2to1) { CNNNetReaderImpl reader(make_shared()); sts = reader.ReadNetwork(model.data(), model.length(), &resp); - ASSERT_EQ(OK, sts) << 
resp.msg; - - auto net = reader.getNetwork(&resp); - ASSERT_NE(nullptr, net ) << resp.msg; - - InputsDataMap in_map; - OutputsDataMap out_map; - net->getInputsInfo(in_map); - net->getOutputsInfo(out_map); - - ASSERT_EQ(in_map.size(), 2); auto i = in_map.begin(); - ASSERT_EQ(i++->second->name(), "Foo.0"); - ASSERT_EQ(i++->second->name(), "Foo.1"); - - ASSERT_EQ(out_map.size(), 1); auto o = out_map.begin(); - ASSERT_EQ(o++->second->getName(), "Foo"); + ASSERT_EQ(GENERAL_ERROR, sts) << resp.msg; } TEST_F(CNNNetReaderImplTest, canParseSimpleTI) { - std::string model = R"V0G0N( + std::string model = R"V0G0N( @@ -2046,50 +2019,122 @@ TEST_F(CNNNetReaderImplTest, canParseSimpleTI) { )V0G0N"; - CNNNetReaderImpl reader(make_shared()); - sts = reader.ReadNetwork(model.data(), model.length(), &resp); - ASSERT_EQ(OK, sts) << resp.msg; + CNNNetReaderImpl reader(make_shared()); + sts = reader.ReadNetwork(model.data(), model.length(), &resp); + ASSERT_EQ(OK, sts) << resp.msg; - auto network = reader.getNetwork(&resp); - ASSERT_NE(nullptr, network ) << resp.msg; + auto network = reader.getNetwork(&resp); + ASSERT_NE(nullptr, network) << resp.msg; - CNNLayerPtr layer; - sts = network->getLayerByName("SomeTI", layer, &resp); - ASSERT_EQ(OK, sts) << resp.msg; + CNNLayerPtr layer; + sts = network->getLayerByName("SomeTI", layer, &resp); + ASSERT_EQ(OK, sts) << resp.msg; + + auto* ti = dynamic_cast(layer.get()); + ASSERT_NE(nullptr, ti); + ASSERT_EQ(ti->type, "TensorIterator"); + + // Check Input port mapping + ASSERT_EQ(ti->input_port_map.size(), 2); + int i = ti->input_port_map[0].axis == 1 ? 0 : 1; + ASSERT_EQ(ti->input_port_map[i].axis, 1); + ASSERT_EQ(ti->input_port_map[i].stride, 1); + ASSERT_EQ(ti->input_port_map[i].start, 0); + ASSERT_EQ(ti->input_port_map[i].end, -1); + ASSERT_EQ(ti->input_port_map[i].part_size, 1); + ASSERT_EQ(ti->input_port_map[1 - i].axis, -1); + ASSERT_EQ(ti->input_port_map[1 - i].stride, 1); + ASSERT_EQ(ti->input_port_map[1 - i].start, 0); + ASSERT_EQ(ti->input_port_map[1 - i].end, -1); + ASSERT_EQ(ti->input_port_map[1 - i].part_size, 1); + + // Check Output port mapping + ASSERT_EQ(ti->output_port_map.size(), 1); + ASSERT_EQ(ti->output_port_map[0].axis, 1); + ASSERT_EQ(ti->output_port_map[0].stride, 1); + ASSERT_EQ(ti->output_port_map[0].start, 0); + ASSERT_EQ(ti->output_port_map[0].end, -1); + ASSERT_EQ(ti->output_port_map[0].part_size, 1); + + // No back edges + ASSERT_EQ(ti->back_edges.size(), 1); + ASSERT_EQ(ti->back_edges[0].from, 0); + ASSERT_EQ(ti->back_edges[0].to, 1); + ASSERT_EQ(ti->back_edges[0].axis, -1); + ASSERT_EQ(ti->back_edges[0].stride, 1); + ASSERT_EQ(ti->back_edges[0].start, 0); + ASSERT_EQ(ti->back_edges[0].end, -1); + ASSERT_EQ(ti->back_edges[0].part_size, 1); +} + +TEST_F(CNNNetReaderImplTest, canParseScalar) { + std::string model = R"V0G0N( + + + + + + 1 + 5 + 16 + + + + + + + + + + + + + + + 1 + 5 + 16 + + + + + + 90 + + + + + + + + + + )V0G0N"; - auto *ti = dynamic_cast(layer.get()); - ASSERT_NE(nullptr, ti); - ASSERT_EQ(ti->type, "TensorIterator"); + CNNNetReaderImpl reader(make_shared()); + sts = reader.ReadNetwork(model.data(), model.length(), &resp); + ASSERT_EQ(OK, sts) << resp.msg; + auto blob = make_shared_blob(TensorDesc(Precision::U8, {4}, Layout::C)); + blob->allocate(); + auto buffer = blob->buffer().as(); + float SCALAR_VALUE = 90; + buffer[0] = SCALAR_VALUE; - // Check Input port mapping - ASSERT_EQ(ti->input_port_map.size(), 2); - int i = ti->input_port_map[0].axis == 1 ? 
0 : 1; - ASSERT_EQ(ti->input_port_map[i].axis, 1); - ASSERT_EQ(ti->input_port_map[i].stride, 1); - ASSERT_EQ(ti->input_port_map[i].start, 0); - ASSERT_EQ(ti->input_port_map[i].end, -1); - ASSERT_EQ(ti->input_port_map[i].part_size, 1); - ASSERT_EQ(ti->input_port_map[1-i].axis, -1); - ASSERT_EQ(ti->input_port_map[1-i].stride, 1); - ASSERT_EQ(ti->input_port_map[1-i].start, 0); - ASSERT_EQ(ti->input_port_map[1-i].end, -1); - ASSERT_EQ(ti->input_port_map[1-i].part_size, 1); + sts = reader.SetWeights(blob, &resp); + ASSERT_EQ(OK, sts) << resp.msg; - // Check Output port mapping - ASSERT_EQ(ti->output_port_map.size(), 1); - ASSERT_EQ(ti->output_port_map[0].axis, 1); - ASSERT_EQ(ti->output_port_map[0].stride, 1); - ASSERT_EQ(ti->output_port_map[0].start, 0); - ASSERT_EQ(ti->output_port_map[0].end, -1); - ASSERT_EQ(ti->output_port_map[0].part_size, 1); + auto net = reader.getNetwork(&resp); - // No back edges - ASSERT_EQ(ti->back_edges.size(), 1); - ASSERT_EQ(ti->back_edges[0].from, 0); - ASSERT_EQ(ti->back_edges[0].to, 1); - ASSERT_EQ(ti->back_edges[0].axis, -1); - ASSERT_EQ(ti->back_edges[0].stride, 1); - ASSERT_EQ(ti->back_edges[0].start, 0); - ASSERT_EQ(ti->back_edges[0].end, -1); - ASSERT_EQ(ti->back_edges[0].part_size, 1); + ASSERT_NE(nullptr, net) << resp.msg; + CNNLayerPtr layer; + sts = net->getLayerByName("scalar", layer, &resp); + ASSERT_EQ(OK, sts) << resp.msg; + ASSERT_NE(nullptr, layer.get()); + ASSERT_EQ(layer->type, "Const"); + auto actualBlob = layer->blobs.begin()->second; + ASSERT_EQ(actualBlob->buffer().as()[0], SCALAR_VALUE); + auto scalarDesc = layer->outData[0]->getTensorDesc(); + ASSERT_TRUE(scalarDesc.getDims().empty()); + ASSERT_EQ(scalarDesc.getLayout(), SCALAR); + ASSERT_EQ(scalarDesc.getPrecision(), Precision::FP32); } diff --git a/inference-engine/tests/unit/cnn_network/cnn_network_impl_test.cpp b/inference-engine/tests/unit/cnn_network/cnn_network_impl_test.cpp index 9a5a47a..8963331 100644 --- a/inference-engine/tests/unit/cnn_network/cnn_network_impl_test.cpp +++ b/inference-engine/tests/unit/cnn_network/cnn_network_impl_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/cnn_network/layer_builder.h b/inference-engine/tests/unit/cnn_network/layer_builder.h new file mode 100644 index 0000000..2de6472 --- /dev/null +++ b/inference-engine/tests/unit/cnn_network/layer_builder.h @@ -0,0 +1,150 @@ +#include + +/* +* INTEL CONFIDENTIAL +* Copyright (C) 2018-2019 Intel Corporation. +* +* The source code contained or described herein and all documents +* related to the source code ("Material") are owned by Intel Corporation +* or its suppliers or licensors. Title to the Material remains with +* Intel Corporation or its suppliers and licensors. The Material may +* contain trade secrets and proprietary and confidential information +* of Intel Corporation and its suppliers and licensors, and is protected +* by worldwide copyright and trade secret laws and treaty provisions. +* No part of the Material may be used, copied, reproduced, modified, +* published, uploaded, posted, transmitted, distributed, or disclosed +* in any way without Intel's prior express written permission. +* +* No license under any patent, copyright, trade secret or other +* intellectual property right is granted to or conferred upon you by +* disclosure or delivery of the Materials, either expressly, by implication, +* inducement, estoppel or otherwise. 
Any license under such intellectual +* property rights must be express and approved by Intel in writing. +* +* Include any supplier copyright notices as supplier requires Intel to use. +* +* Include supplier trademarks or logos as supplier requires Intel to use, +* preceded by an asterisk. An asterisked footnote can be added as follows: +* *Third Party trademarks are the property of their respective owners. +* +* Unless otherwise agreed by Intel in writing, you may not remove or alter +* this notice or any other notice embedded in Materials by Intel or Intel's +* suppliers or licensors in any way. +*/ + +#include +#include +#include +#include "parameters.h" +#include "shapes.h" + +using namespace InferenceEngine; +using namespace InferenceEngine::details; + +class LayerBuilder { +private: + CNNLayerPtr layer; + std::string dataName = "data"; + std::unique_ptr parameters; +public: + explicit LayerBuilder (InferenceEngine::CNNLayer::Ptr createdLayer) : layer(std::move(createdLayer)) { + parameters = std::unique_ptr(new Parameters(layer->type)); + } + + LayerBuilder& setParams(bool valid) { + if (valid) { + layer->params = parameters->getValidParameters(); + } else { + layer->params = parameters->getInvalidParameters(); + } + return *this; + } + + LayerBuilder& setShapes(std::vector& spData, bool valid_input) { + testing::InOutShapes shapes; + LayersWithNotEqualIO layersWithNotEqualIO; + LayersWithEqualIO layersWithEqualIO; + LayersWithNIO layersWithNIO; + std::vector layers{&layersWithNotEqualIO, &layersWithEqualIO, &layersWithNIO}; + ShapesHelper* shapesHelper = nullptr; + for(const auto& layer : layers) { + if (layer->containLayer(this->layer->type)) { + shapesHelper = layer->factoryShape(); + break; + } + } + if (valid_input) { + shapes = shapesHelper->getValidShapes(); + } else { + shapes = shapesHelper->getInvalidInputShapes(); + } + for (const auto& inData : shapes.inDims) { + DataPtr data = std::make_shared(dataName, inData, InferenceEngine::Precision::FP32); + spData.push_back(data); + layer->insData.push_back(data); + } + for (const auto& outData : shapes.outDims) { + layer->outData.push_back(std::make_shared(dataName, outData, InferenceEngine::Precision::FP32)); + } + delete shapesHelper; + return *this; + } + + CNNLayerPtr get() { + return layer; + } + + int getNumOfParams() { + return parameters->getNumOfParameters(); + } + + int getNumOfLayerVariant() { + LayersWithNotEqualIO layersWithNotEqualIO; + LayersWithEqualIO layersWithEqualIO; + LayersWithNIO layersWithNIO; + Layers* layers[] = {&layersWithNotEqualIO, &layersWithEqualIO, &layersWithNIO}; + int cnt = 0; + for(const auto& layer : layers) { + if (layer->containLayer(this->layer->type)) { + cnt++; + } + } + return cnt; + } +}; + +class CNNLayerValidationTests : public testing::TestWithParam{ +public: + void SetUp() override { + auto params = GetParam(); + type = params; + } + + std::shared_ptr& createConcreteLayer(const std::string& type) { + layer = std::make_shared(TestsCommon::createLayer(type)); + return layer; + } + + std::shared_ptr& getBuilder() { + return layer; + } + + CNNLayerPtr getLayer() { + return layer.get()->get(); + } + + int getNumOfParams() { + return layer.get()->getNumOfParams(); + } + + int getNumOfLayerVariant() { + return layer.get()->getNumOfLayerVariant(); + } +protected: + std::string type; + bool valid_params = true; + bool valid_input = true; + std::shared_ptr layer; +}; + +#define assertThat(type) SCOPED_TRACE("");createConcreteLayer(type) \ No newline at end of file diff --git 
a/inference-engine/tests/unit/cnn_network/layout_tests.cpp b/inference-engine/tests/unit/cnn_network/layout_tests.cpp index 11ad645..49faf87 100644 --- a/inference-engine/tests/unit/cnn_network/layout_tests.cpp +++ b/inference-engine/tests/unit/cnn_network/layout_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/cnn_network/mean_image.cpp b/inference-engine/tests/unit/cnn_network/mean_image.cpp index 2c31fa1..cd7c922 100644 --- a/inference-engine/tests/unit/cnn_network/mean_image.cpp +++ b/inference-engine/tests/unit/cnn_network/mean_image.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/cnn_network/mean_image.h b/inference-engine/tests/unit/cnn_network/mean_image.h index 3b4ffce..5b85aa8 100644 --- a/inference-engine/tests/unit/cnn_network/mean_image.h +++ b/inference-engine/tests/unit/cnn_network/mean_image.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/cnn_network/parameters.h b/inference-engine/tests/unit/cnn_network/parameters.h new file mode 100644 index 0000000..45420d6 --- /dev/null +++ b/inference-engine/tests/unit/cnn_network/parameters.h @@ -0,0 +1,319 @@ +/* +* INTEL CONFIDENTIAL +* Copyright (C) 2018-2019 Intel Corporation. +* +* The source code contained or described herein and all documents +* related to the source code ("Material") are owned by Intel Corporation +* or its suppliers or licensors. Title to the Material remains with +* Intel Corporation or its suppliers and licensors. The Material may +* contain trade secrets and proprietary and confidential information +* of Intel Corporation and its suppliers and licensors, and is protected +* by worldwide copyright and trade secret laws and treaty provisions. +* No part of the Material may be used, copied, reproduced, modified, +* published, uploaded, posted, transmitted, distributed, or disclosed +* in any way without Intel's prior express written permission. +* +* No license under any patent, copyright, trade secret or other +* intellectual property right is granted to or conferred upon you by +* disclosure or delivery of the Materials, either expressly, by implication, +* inducement, estoppel or otherwise. Any license under such intellectual +* property rights must be express and approved by Intel in writing. +* +* Include any supplier copyright notices as supplier requires Intel to use. +* +* Include supplier trademarks or logos as supplier requires Intel to use, +* preceded by an asterisk. An asterisked footnote can be added as follows: +* *Third Party trademarks are the property of their respective owners. +* +* Unless otherwise agreed by Intel in writing, you may not remove or alter +* this notice or any other notice embedded in Materials by Intel or Intel's +* suppliers or licensors in any way. 
+*/ + +#include +#include +#include +#include +#include +#include + +enum class ParametersValues { + ZERO, + INT_POSITIVE, + INT_NEGATIVE, + FLOAT_POSITIVE, + FLOAT_NEGATIVE, + STRING +}; +enum class ParameterRange { + SET, + SINGLE +}; +using GoodBadParams = std::pair, std::vector>; +using Params = std::map>; + +Params operator + (const Params& val1, const Params& val2) { + Params result; + result.insert(val1.begin(), val1.end()); + result.insert(val2.begin(), val2.end()); + return result; +} + +class Parameters { +private: + // Common for Convolution, Deconvolution, Pooling layers + Params common { + // Parameter name, range, type of good values, type of bad + {"stride-x", {ParameterRange::SINGLE, + {{ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"stride-y", {ParameterRange::SINGLE, + {{ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"kernel-x", {ParameterRange::SINGLE, + {{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"kernel-y", {ParameterRange::SINGLE, {{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"pad-x", {ParameterRange::SINGLE, + {{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"pad-y", {ParameterRange::SINGLE, + {{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}} + }; + Params conv { + // Parameter name, range, type of good values, type of bad + {"dilation-x", {ParameterRange::SINGLE, + {{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"dilation-y", {ParameterRange::SINGLE, + {{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"output", {ParameterRange::SINGLE, + {{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"group", {ParameterRange::SINGLE, + {{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + }; + Params pooling { + // Parameter name, range, type of good values, type of bad + {"pool-method", {ParameterRange::SINGLE, + {{ParametersValues::STRING}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"exclude-pad", {ParameterRange::SINGLE, + {{ParametersValues::STRING}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}} + }; + Params detectionOutput { + // Parameter name, range, type of good values, type of bad + {"num_classes", {ParameterRange::SINGLE, + {{ParametersValues::INT_POSITIVE}, + {ParametersValues::ZERO, ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"background_label_id", {ParameterRange::SINGLE, + {{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"top_k", {ParameterRange::SINGLE, + {{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"variance_encoded_in_target", {ParameterRange::SINGLE, + {{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"keep_top_k", {ParameterRange::SINGLE, + 
{{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"num_orient_classes", {ParameterRange::SINGLE, + {{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"code_type", {ParameterRange::SINGLE, + {{ParametersValues::STRING}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"share_location", {ParameterRange::SINGLE, + {{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"interpolate_orientation", {ParameterRange::SINGLE, + {{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, + {ParametersValues::STRING}}}}, + {"nms_threshold", {ParameterRange::SINGLE, + {{ParametersValues::FLOAT_POSITIVE}, + {ParametersValues::FLOAT_NEGATIVE, ParametersValues::STRING}}}}, + {"confidence_threshold", {ParameterRange::SINGLE, + {{ParametersValues::FLOAT_POSITIVE}, + {ParametersValues::FLOAT_NEGATIVE, ParametersValues::STRING}}}} + }; + Params crop { + {"axis", {ParameterRange::SET, + {{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"offset", {ParameterRange::SET, + {{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"dim", {ParameterRange::SET, + {{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"crop_begin", {ParameterRange::SET, + {{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"crop_end", {ParameterRange::SET, + {{ParametersValues::ZERO, ParametersValues::INT_POSITIVE}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + }; + Params interp { + {"height", {ParameterRange::SINGLE, + {{ParametersValues::INT_POSITIVE, ParametersValues::ZERO}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + {"factor", {ParameterRange::SINGLE, + {{ParametersValues::FLOAT_POSITIVE}, + {ParametersValues::ZERO, ParametersValues::FLOAT_NEGATIVE, ParametersValues::STRING}}}}, + {"shrink_factor", {ParameterRange::SINGLE, + {{ParametersValues::FLOAT_POSITIVE}, + {ParametersValues::ZERO, ParametersValues::FLOAT_NEGATIVE, ParametersValues::STRING}}}}, + {"zoom_factor", {ParameterRange::SINGLE, + {{ParametersValues::FLOAT_POSITIVE}, + {ParametersValues::ZERO, ParametersValues::FLOAT_NEGATIVE, ParametersValues::STRING}}}}, + {"width", {ParameterRange::SINGLE, + {{ParametersValues::INT_POSITIVE, ParametersValues::ZERO}, + {ParametersValues::INT_NEGATIVE, ParametersValues::STRING}}}}, + }; + std::map>> stringParams { + {"Eltwise", {{"operation", {"sum", "max", "mul"}}}}, + {"LRN", {{"region", {"across", "same"}}}}, + {"Activation", {{"type", {"sigmoid", "tanh", "elu", "relu6"}}}}, + {"Pooling", {{"pool_method", {"max", "avg"}}, {"exlude_pad", {"true", "false"}}}}, + {"Resample", {{"type", {"caffe.ResampleParameter.LINEAR", "caffe.ResampleParameter.CUBIC", + "caffe.ResampleParameter.NEAREST"}}}}, + {"DetectionOutput", {{"code_type", {"caffe.PriorBoxParameter.CENTER_SIZE", "caffe.PriorBoxParameter.CORNER"}}}} + }; + std::map layerParamsNames { + // Layer name, Corresponding params names + {"Convolution", common + conv}, + {"Deconvolution", common + conv}, + {"Pooling", common + pooling}, + {"DetectionOutput", detectionOutput}, + {"Crop", crop}, + {"Interp", interp} + 
}; + const int zero = 0; + std::string type; + std::mt19937 gen; + std::uniform_int_distribution distIntPositive; + std::uniform_int_distribution distIntNegative; + std::uniform_real_distribution distFloatNegative; + std::uniform_real_distribution distFloatPositive; + std::queue paramWasInvalid; +public: + Parameters() {} + Parameters(const std::string& type) : gen(static_cast(std::chrono::system_clock::now().time_since_epoch().count())), + distIntPositive(1, 100), + distIntNegative(-100, -1), + distFloatNegative(-10.0, -0.1), + distFloatPositive(0.1, 10.0) { + this->type = type; + Params param = getParametersByLayerName(); + for (auto iter : param) { + paramWasInvalid.push(iter.first); + } + } + Params getParametersByLayerName() { + return layerParamsNames[type]; + } + + std::vector getDifferentParamValues(const std::vector& valuesType, + const std::vector& stringValues) { + int magicNumber = 10; + std::vector paramsValues = {}; + for (auto i : valuesType) { + switch(i) { + case ParametersValues::ZERO: { + paramsValues.push_back("0"); + break; + } + case ParametersValues::INT_POSITIVE: { + for (int j = 0; j < magicNumber; ++j) { + paramsValues.push_back(std::to_string(distIntPositive(gen))); + } + break; + } + case ParametersValues::INT_NEGATIVE: { + for (int j = 0; j < magicNumber; ++j) { + paramsValues.push_back(std::to_string(distIntNegative(gen))); + } + break; + } + case ParametersValues::FLOAT_POSITIVE: { + for (int j = 0; j < magicNumber; ++j) { + paramsValues.push_back(std::to_string(distFloatPositive(gen))); + } + break; + } + case ParametersValues::FLOAT_NEGATIVE: { + for (int j = 0; j < magicNumber; ++j) { + paramsValues.push_back(std::to_string(distFloatNegative(gen))); + } + break; + } + case ParametersValues::STRING: { + paramsValues.insert(paramsValues.begin(), stringValues.begin(), stringValues.end()); + break; + } + } + } + + return paramsValues; + } + + std::map getValidParameters() { + Params param = getParametersByLayerName(); + std::map params; + for (auto i : param) { + params[i.first] = getCorrectParamValue(i.second, i.first); + } + return params; + } + + std::string getCorrectParamValue(const std::pair& values, + const std::string& paramName) { + std::string parameter = ""; + ParameterRange howMany = values.first; + std::vector valuesType = values.second.first; + + std::vector paramsValues = getDifferentParamValues(valuesType, stringParams[type][paramName]); + + std::uniform_int_distribution indexesDist(0, static_cast(paramsValues.size() - 1)); + if (howMany == ParameterRange::SINGLE) { + int index = indexesDist(gen); + parameter = paramsValues[index]; + } else { + int numOfDigits = indexesDist(gen); + for (int i = 0; i < numOfDigits; i++) { + parameter += paramsValues[i] + ", "; + } + } + return parameter; + } + + std::string getIncorrectParamValue(const std::pair& values) { + std::string parameter = ""; + std::vector valuesType = values.second.second; + + std::vector paramsValues = getDifferentParamValues(valuesType, {"foo", "bar"}); + std::uniform_int_distribution indexesDist(0, static_cast(paramsValues.size() - 1)); + int index = indexesDist(gen); + parameter = paramsValues[index]; + + return parameter; + } + + std::map getInvalidParameters() { + std::map params = getValidParameters(); + + std::string paramName = paramWasInvalid.front(); + paramWasInvalid.pop(); + params[paramName] = getIncorrectParamValue(layerParamsNames[type][paramName]); + return params; + } + + int getNumOfParameters() { + return static_cast(layerParamsNames[type].size()); + } +}; diff 
--git a/inference-engine/tests/unit/cnn_network/parser_tests_base.hpp b/inference-engine/tests/unit/cnn_network/parser_tests_base.hpp index 28c4646..866da4d 100644 --- a/inference-engine/tests/unit/cnn_network/parser_tests_base.hpp +++ b/inference-engine/tests/unit/cnn_network/parser_tests_base.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -56,7 +56,7 @@ class FormatParserTest : public TestsCommon { void assertParseFail(const std::string& fileContent) { try { parse(fileContent); - FAIL() << "Parser didn't trow"; + FAIL() << "Parser didn't throw"; } catch (const std::exception& ex) { SUCCEED() << ex.what(); } @@ -69,7 +69,7 @@ class FormatParserTest : public TestsCommon { void assertSetWeightsFail(const InferenceEngine::TBlob::Ptr& binBlob) { try { parser->SetWeights(binBlob); - FAIL() << "Parser didn't trow"; + FAIL() << "Parser didn't throw"; } catch (const std::exception& ex) { SUCCEED() << ex.what(); } diff --git a/inference-engine/tests/unit/cnn_network/shapes.h b/inference-engine/tests/unit/cnn_network/shapes.h new file mode 100644 index 0000000..87198f6 --- /dev/null +++ b/inference-engine/tests/unit/cnn_network/shapes.h @@ -0,0 +1,257 @@ +/* +* INTEL CONFIDENTIAL +* Copyright (C) 2018-2019 Intel Corporation. +* +* The source code contained or described herein and all documents +* related to the source code ("Material") are owned by Intel Corporation +* or its suppliers or licensors. Title to the Material remains with +* Intel Corporation or its suppliers and licensors. The Material may +* contain trade secrets and proprietary and confidential information +* of Intel Corporation and its suppliers and licensors, and is protected +* by worldwide copyright and trade secret laws and treaty provisions. +* No part of the Material may be used, copied, reproduced, modified, +* published, uploaded, posted, transmitted, distributed, or disclosed +* in any way without Intel's prior express written permission. +* +* No license under any patent, copyright, trade secret or other +* intellectual property right is granted to or conferred upon you by +* disclosure or delivery of the Materials, either expressly, by implication, +* inducement, estoppel or otherwise. Any license under such intellectual +* property rights must be express and approved by Intel in writing. +* +* Include any supplier copyright notices as supplier requires Intel to use. +* +* Include supplier trademarks or logos as supplier requires Intel to use, +* preceded by an asterisk. An asterisked footnote can be added as follows: +* *Third Party trademarks are the property of their respective owners. +* +* Unless otherwise agreed by Intel in writing, you may not remove or alter +* this notice or any other notice embedded in Materials by Intel or Intel's +* suppliers or licensors in any way. 
+*/ + +#ifndef SHAPES_H +#define SHAPES_H + +#include +#include +#include +#include +#include +#include +#include + +using namespace testing; + +struct Maps{ + std::map mapOfEqualShapes { + // Layer name, Correct num of input, Correct num of output + { "Convolution", 1}, + { "Deconvolution", 1}, + { "Crop", 1}, + { "Interp", 1} + }; + + std::map> mapOfUnequalShapes { + // Layer name, Correct num of input, Correct num of output + { "Crop", {2, 1}}, + { "DetectionOutput", {3, 1}}, + { "Interp", {2, 1}} + }; + + std::map> mapOfContinuousShapes { + // Layer name, Correct num of input, Correct num of output + { "Slice", {"1", "N"}}, + { "Eltwise", {"N", "1"}} + }; +} maps; + +class ShapesHelper { +protected: + std::string type; +public: + ShapesHelper() = default; + + explicit ShapesHelper(std::string& type) { + this->type = type; + } + + std::string getType() {return type;} + + virtual testing::InOutShapes getValidShapes() = 0; + virtual testing::InOutShapes getInvalidInputShapes() = 0; + + std::vector> generateShapes(const int& numOfShapes) { + std::mt19937 gen(static_cast(std::chrono::high_resolution_clock::now().time_since_epoch().count())); + std::uniform_int_distribution dist(1, 256); + + std::vector> shape; + shape.reserve(static_cast(numOfShapes)); + for (int i = 0; i < numOfShapes; ++i) { + shape.push_back({dist(gen), dist(gen), dist(gen), 7}); + } + return shape; + } + virtual ~ShapesHelper() = default; +}; + +class EqualIOShapesHelper : public ShapesHelper { +public: + explicit EqualIOShapesHelper(std::string& type) : ShapesHelper(type) {}; + + testing::InOutShapes getValidShapes() override { + int numOfInput = {maps.mapOfEqualShapes[type]}; + int numOfOutput = {maps.mapOfEqualShapes[type]}; + std::vector> inputs = generateShapes(numOfInput); + std::vector> outputs = generateShapes(numOfOutput); + return {inputs, outputs}; + } + + testing::InOutShapes getInvalidInputShapes() override { + int numOfOutput = maps.mapOfEqualShapes[type]; + int numOfInput = maps.mapOfEqualShapes[type] + numOfOutput; + std::vector> inputs = generateShapes(numOfInput); + std::vector> outputs = generateShapes(numOfOutput); + return {inputs, outputs}; + } + ~EqualIOShapesHelper() override = default; +}; + +class NotEqualConcreteIOShapesHelper : public ShapesHelper { +public: + explicit NotEqualConcreteIOShapesHelper(std::string& type) : ShapesHelper(type) {}; + + testing::InOutShapes getValidShapes() override { + int numOfInput = maps.mapOfUnequalShapes[type].first; + int numOfOutput = maps.mapOfUnequalShapes[type].second; + std::vector> inputs = generateShapes(numOfInput); + std::vector> outputs = generateShapes(numOfOutput); + return {inputs, outputs}; + } + + testing::InOutShapes getInvalidInputShapes() override { + int numOfOutput = maps.mapOfUnequalShapes[type].second; + int numOfInput = maps. 
mapOfUnequalShapes[type].first + numOfOutput; + + std::vector> inputs = generateShapes(numOfInput); + std::vector> outputs = generateShapes(numOfOutput); + return {inputs, outputs}; + } + ~NotEqualConcreteIOShapesHelper() override = default; +}; + +class NotEqualIOShapesHelper : public ShapesHelper { +private: + bool is_number(const std::string& s) + { + return !s.empty() && std::find_if(s.begin(), + s.end(), [](char c) { return !std::isdigit(c); }) == s.end(); + } + +public: + + explicit NotEqualIOShapesHelper(std::string& type) : ShapesHelper(type) {}; + + testing::InOutShapes getValidShapes() override { + int numOfInput; + int numOfOutput; + std::vector> inputs; + std::vector> outputs; + if (is_number(maps.mapOfContinuousShapes[type].first)) { + numOfInput = std::stoi(maps.mapOfContinuousShapes[type].first); + inputs = generateShapes(numOfInput); + outputs = generateShapes(100); + } else { + numOfOutput = std::stoi(maps.mapOfContinuousShapes[type].second); + outputs = generateShapes(numOfOutput); + inputs = generateShapes(100); + } + + return {inputs, outputs}; + } + + testing::InOutShapes getInvalidInputShapes() override { + int numOfInput; + int numOfOutput; + std::vector> inputs; + std::vector> outputs; + if (is_number(maps.mapOfContinuousShapes[type].first)) { + numOfInput = std::stoi(maps.mapOfContinuousShapes[type].first) * 2; + inputs = generateShapes(numOfInput); + outputs = generateShapes(100); + } else { + numOfOutput = std::stoi(maps.mapOfContinuousShapes[type].second); + outputs = generateShapes(numOfOutput); + inputs = generateShapes(100); + } + return {inputs, outputs}; + } + + ~NotEqualIOShapesHelper() override = default; +}; + +class Layers { +public: + virtual bool containLayer(std::string concrete_layer) = 0; + virtual ShapesHelper* factoryShape() = 0; + virtual ~Layers() = default; +}; + +class LayersWithEqualIO : public Layers { +private: + std::string layer = ""; +public: + bool containLayer(std::string concrete_layer) override { + for (const auto& layer : maps.mapOfEqualShapes) { + if (concrete_layer == layer.first) { + this->layer = concrete_layer; + return true; + } + } + return false; + } + ShapesHelper* factoryShape() override { + return new EqualIOShapesHelper(this->layer); + } + ~LayersWithEqualIO() override = default; +}; + +class LayersWithNotEqualIO : public Layers{ +private: + std::string layer = ""; +public: + bool containLayer(std::string concrete_layer) override { + for (const auto& layer : maps.mapOfUnequalShapes) { + if (concrete_layer == layer.first) { + this->layer = concrete_layer; + return true; + } + } + return false; + } + ShapesHelper* factoryShape() override { + return new NotEqualConcreteIOShapesHelper(this->layer); + } + ~LayersWithNotEqualIO() override = default; +}; + +class LayersWithNIO : public Layers{ +private: + std::string layer = ""; +public: + bool containLayer(std::string concrete_layer) override { + for (const auto& layer : maps.mapOfContinuousShapes) { + if (concrete_layer == layer.first) { + this->layer = concrete_layer; + return true; + } + } + return false; + } + ShapesHelper* factoryShape() override { + return new NotEqualIOShapesHelper(this->layer); + } + ~LayersWithNIO() override = default; +}; + +#endif // SHAPES_H \ No newline at end of file diff --git a/inference-engine/tests/unit/cnn_network/v2_format_parser_test.cpp b/inference-engine/tests/unit/cnn_network/v2_format_parser_test.cpp index 1b9cdc0..36b49dd 100644 --- a/inference-engine/tests/unit/cnn_network/v2_format_parser_test.cpp +++ 
b/inference-engine/tests/unit/cnn_network/v2_format_parser_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -144,6 +144,24 @@ TEST_F(V2FormatParserTest, failIfIdLessThanZero) { ASSERT_NO_FATAL_FAILURE(assertParseFail(content)); } +TEST_F(V2FormatParserTest, failIfIdNotInteger) { + string content = MAKE_ALEXNET_FOR_MEAN_TESTS_V2() + .node("channel").attr("id", "0").node("mean").attr("value", "104.5").close() + .newnode("channel").attr("id", "1").node("mean").attr("value", "117.8").close() + .newnode("channel").attr("id", "2_2").node("mean").attr("value", "123").close(); + + ASSERT_NO_FATAL_FAILURE(assertParseFail(content)); +} + +TEST_F(V2FormatParserTest, failIfValueNotFloat) { + string content = MAKE_ALEXNET_FOR_MEAN_TESTS_V2() + .node("channel").attr("id", "0").node("mean").attr("value", "104,5").close() + .newnode("channel").attr("id", "1").node("mean").attr("value", "117.8").close() + .newnode("channel").attr("id", "2").node("mean").attr("value", "123").close(); + + ASSERT_NO_FATAL_FAILURE(assertParseFail(content)); +} + TEST_F(V2FormatParserTest, failIfIdMoreThanNumChannels) { string content = MAKE_ALEXNET_FOR_MEAN_TESTS_V2() .node("channel").attr("id", "4").node("mean").attr("value", "104.5").close(); @@ -653,4 +671,4 @@ TEST_F(V2FormatParserTest, canConvertActivationLayerAsClamp) { ASSERT_EQ(clamp->min_value, -5); ASSERT_EQ(clamp->max_value, 5); ASSERT_EQ(clamp->params.find("type"), clamp->params.end()); -} \ No newline at end of file +} diff --git a/inference-engine/tests/unit/cnn_network/v3_format_parser_test.cpp b/inference-engine/tests/unit/cnn_network/v3_format_parser_test.cpp index b80d2cb..085c299 100644 --- a/inference-engine/tests/unit/cnn_network/v3_format_parser_test.cpp +++ b/inference-engine/tests/unit/cnn_network/v3_format_parser_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/cnn_network/xml_father_tests.cpp b/inference-engine/tests/unit/cnn_network/xml_father_tests.cpp index 451a15b..f43021d 100644 --- a/inference-engine/tests/unit/cnn_network/xml_father_tests.cpp +++ b/inference-engine/tests/unit/cnn_network/xml_father_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/gna/I8_quantisation_test.cpp b/inference-engine/tests/unit/engines/gna/I8_quantisation_test.cpp index 8e69a3b..cb6e800 100644 --- a/inference-engine/tests/unit/engines/gna/I8_quantisation_test.cpp +++ b/inference-engine/tests/unit/engines/gna/I8_quantisation_test.cpp @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2016-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. 
+// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // #include diff --git a/inference-engine/tests/unit/engines/gna/configuration_test.cpp b/inference-engine/tests/unit/engines/gna/configuration_test.cpp index e17e6db..70229c6 100644 --- a/inference-engine/tests/unit/engines/gna/configuration_test.cpp +++ b/inference-engine/tests/unit/engines/gna/configuration_test.cpp @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2016-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // #include @@ -133,4 +145,10 @@ TEST_F(GNAConfigTest, canMatchWithSingleMultipleOMPThreads) { .inNotCompactMode() .enable_omp_multithreading() .gna().propagate_forward().called_without().pwl_inserted_into_nnet(); +} + +TEST_F(GNAConfigTest, failToCreatePluginWithDifferentInputScaleFactors) { + assert_that().creating().gna_plugin() + .withGNAConfig(std::string(GNA_CONFIG_KEY(SCALE_FACTOR))+"_1", 1000) + .withGNAConfig(std::string(GNA_CONFIG_KEY(SCALE_FACTOR))+"_2", 2000).throws(); } \ No newline at end of file diff --git a/inference-engine/tests/unit/engines/gna/fp32_non_quantized_tests.cpp b/inference-engine/tests/unit/engines/gna/fp32_non_quantized_tests.cpp new file mode 100644 index 0000000..faf574e --- /dev/null +++ b/inference-engine/tests/unit/engines/gna/fp32_non_quantized_tests.cpp @@ -0,0 +1,208 @@ +// +// Copyright 2016-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. 
+// + + +#include +#include +#include +#include "gna_plugin/quantization/model_quantizer.hpp" +#include "gna_plugin/quantization/layer_quantizer.hpp" +#include "gna_matcher.hpp" + +using namespace InferenceEngine; +using namespace GNAPluginNS; +using namespace GNATestIRs; + +class FP32NonQuantizedTest : public GNATest { + protected: + + void SetUp() override { + } +}; + +TEST_F(FP32NonQuantizedTest, SplitFollowedByFCAndEltwiseOnCPU) { + std::vector input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + std::vector expected_result = {12.0, 12.0, 12.0, 12.0, 12.0, + 12.0, 12.0, 12.0, 12.0, 12.0}; + assert_that().onInferModel(FCWithPaddingAfterSplitModel()) + .inNotCompactMode().gna().propagate_forward().onCPU() + .called_with_input_and_expected_output(input_data, expected_result); +} + +TEST_F(FP32NonQuantizedTest, SliceFollowedByFCAndEltwiseOnCPU) { + std::vector input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + std::vector expected_result = {14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0}; + assert_that().onInferModel(FCWithPaddingAfterSliceModel()) + .inNotCompactMode().gna().propagate_forward().onCPU() + .called_with_input_and_expected_output(input_data, expected_result); +} + +TEST_F(FP32NonQuantizedTest, SliceFollowedByAlignedFCAndEltwiseOnCPU) { + std::vector input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + std::vector expected_result = {18.0, 18.0, 18.0, 18.0}; + assert_that().onInferModel(SliceModelWithAlignedOutputs()) + .inNotCompactMode().gna().propagate_forward().onCPU() + .called_with_input_and_expected_output(input_data, expected_result); +} + +TEST_F(FP32NonQuantizedTest, DISABLED_SliceFollowedBy2FCsAnd2EltwisesOnCPU) { + std::vector input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + std::vector expected_result = {27.0, 27.0, 27.0, 27.0, 27.0, 27.0, 27.0, 27.0}; + assert_that().onInferModel(twoFCWithPaddingAfterSliceModel()) + .inNotCompactMode().gna().propagate_forward().onCPU() + .called_with_input_and_expected_output(input_data, expected_result); +} + +TEST_F(FP32NonQuantizedTest, SplitAfterFCFollowedByFCAndEltwiseOnCPU) { + std::vector input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + std::vector expected_result = {232.0, 232.0, 232.0, 232.0, 232.0, + 232.0, 232.0, 232.0, 232.0, 232.0}; + assert_that().onInferModel(FCBeforeSplitModel()) + .inNotCompactMode().gna().propagate_forward().onCPU() + .called_with_input_and_expected_output(input_data, expected_result); +} + + +TEST_F(FP32NonQuantizedTest, ConcatPropagateForwardWithSuccessOnCPU) { + std::vector input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + std::vector expected_result = {121.0, 121.0, 121.0, 121.0, 121.0, + 121.0, 121.0, 121.0, 121.0, 121.0, + 121.0, 121.0, 121.0, 121.0, 121.0, + 121.0, 121.0, 121.0, 121.0, 121.0}; + + assert_that().onInferModel(concatModel()) + .inNotCompactMode().gna().propagate_forward().onCPU() + .called_with_input_and_expected_output(input_data, expected_result); +} + +TEST_F(FP32NonQuantizedTest, DoubleConcatPropagateForwardWithSuccessOnCPU) { + std::vector input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + std::vector expected_result = {141.0, 141.0, 141.0, 141.0, 141.0, + 141.0, 141.0, 141.0, 141.0, 141.0, + 141.0, 141.0, 141.0, 141.0, 141.0, + 141.0, 141.0, 141.0, 141.0, 141.0, + 141.0, 141.0, 141.0, 141.0, 141.0, + 141.0, 141.0, 141.0, 141.0, 141.0, + 141.0, 141.0, 141.0, 141.0, 141.0, + 141.0, 141.0, 141.0, 141.0, 141.0}; + + assert_that().onInferModel(doubleConcatModel()) + .inNotCompactMode().gna().propagate_forward().onCPU() + .called_with_input_and_expected_output(input_data, expected_result); +} + +TEST_F(FP32NonQuantizedTest, multiple_inputs_correct_results) { + std::vector input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + std::vector input2_data = {2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0}; + std::vector result = {30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0}; + + assert_that().onInferModel(two_inputs_to_affine()) + .inNotCompactMode().gna().propagate_forward().onCPU() + .called_with().input("input_1", input_data).And().input("input_2", input2_data).result().equal_to(result); +} + + +TEST_F(FP32NonQuantizedTest, CropWithoutOffsetPropagateForwardWithSuccessOnCPU) { + std::vector input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + std::vector expected_result = {11.0, 11.0, 11.0, 11.0, 11.0, + 11.0, 11.0, 11.0, 11.0, 11.0}; + + assert_that().onInferModel(cropWithoutOffsetModel()) + .inNotCompactMode().gna().propagate_forward().onCPU() + .called_with_input_and_expected_output(input_data, expected_result); +} + +TEST_F(FP32NonQuantizedTest, CropWithAlignedOffsetPropagateForwardWithSuccessOnCPU) { + std::vector input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + std::vector expected_result = {3.0, 3.0, 3.0, 3.0, 3.0, + 3.0, 3.0, 3.0, 3.0, 3.0}; + + assert_that().onInferModel(cropWithAlignedOffsetModel()) + .inNotCompactMode().gna().propagate_forward().onCPU() + .called_with_input_and_expected_output(input_data, expected_result); +} + +TEST_F(FP32NonQuantizedTest, CropWithOffsetPropagateForwardWithSuccessOnCPU) { + std::vector input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + std::vector expected_result = {7.0, 7.0, 7.0, 7.0, 7.0, + 7.0, 7.0, 7.0, 7.0, 7.0}; + + assert_that().onInferModel(cropWithOffsetModel()) + .inNotCompactMode().gna().propagate_forward().onCPU() + .called_with_input_and_expected_output(input_data, expected_result); +} + +TEST_F(FP32NonQuantizedTest, CropWithMaxOffsetPropagateForwardWithSuccessOnCPU) { + std::vector input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + std::vector expected_result = {1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0}; + + assert_that().onInferModel(cropWithMaxOffsetModel()) + .inNotCompactMode().gna().propagate_forward().onCPU() + .called_with_input_and_expected_output(input_data, expected_result); +} + +TEST_F(FP32NonQuantizedTest, CropWithOffsetAfterFCPropagateForwardWithSuccessOnCPU) { + std::vector input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + std::vector expected_result = {111.0, 111.0, 111.0, 111.0, 111.0, + 111.0, 111.0, 111.0, 111.0, 111.0}; + + assert_that().onInferModel(cropWithOffsetExtendedModel()) + .inNotCompactMode().gna().propagate_forward().onCPU() + 
+
+TEST_F(FP32NonQuantizedTest, CopySimpleCasePropagateForwardWithSuccessOnCPU) { + std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + std::vector<float> expected_result = {12.0, 12.0, 12.0, 12.0, 12.0, + 12.0, 12.0, 12.0, 12.0, 12.0, + 11.0, 11.0, 11.0, 11.0, 11.0, + 11.0, 11.0, 11.0, 11.0, 11.0,}; + + assert_that().onInferModel(copyModel()) + .inNotCompactMode().gna().propagate_forward().onCPU() + .called_with_input_and_expected_output(input_data, expected_result); +} + +
+TEST_F(FP32NonQuantizedTest, ScaleShiftWithBroadcastSupported) { + std::vector<float> input_data (40, 1.0); + + std::vector<float> expected_result = {2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, + 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, + 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, + 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, + 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0}; + + assert_that().onInferModel(ScaleShift3DModel()).withWeigthsPattern({1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f}) + .inNotCompactMode().gna().propagate_forward().onCPU() + .called_with_input_and_expected_output(input_data, expected_result); +} \ No newline at end of file
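The ScaleShift expectation just above is consistent with the broadcast pattern being applied to both weights and biases: for an all-ones input every channel computes pattern[i] * 1.0 + pattern[i], which is the 2.0, 4.0, ..., 16.0 sequence repeated across all 40 outputs. A sketch of that reference math, under the assumption that the bias blob is filled from the same withWeigthsPattern source:

    float pattern[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    // expected[i] == pattern[i % 8] * input + pattern[i % 8], with input == 1.0f
    float expected_i = 2.0f * pattern[i % 8];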
diff --git a/inference-engine/tests/unit/engines/gna/gna_allocator_test.cpp b/inference-engine/tests/unit/engines/gna/gna_allocator_test.cpp index 35ddc77..d83c1c3 100644 --- a/inference-engine/tests/unit/engines/gna/gna_allocator_test.cpp +++ b/inference-engine/tests/unit/engines/gna/gna_allocator_test.cpp @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2016-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // #include "gna_plugin/gna_allocator.hpp"
diff --git a/inference-engine/tests/unit/engines/gna/gna_aminteldnn_test.cpp b/inference-engine/tests/unit/engines/gna/gna_aminteldnn_test.cpp new file mode 100644 index 0000000..2dfd288 --- /dev/null +++ b/inference-engine/tests/unit/engines/gna/gna_aminteldnn_test.cpp @@ -0,0 +1,38 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "gna_matcher.hpp" +#include "inference_engine.hpp" +#include "dnn.h" + +using namespace testing; +using namespace InferenceEngine; + +class GNA_AmIntelDnn_test : public GNATest { +protected: + AmIntelDnn amIntelDnn; + intel_nnet_type_t desc = {}; +}; + +TEST_F(GNA_AmIntelDnn_test, intel_nnet_type_tDoesNotFreeHisMemory) { + desc.pLayers = nullptr; + amIntelDnn.component.resize(1); + amIntelDnn.component[0].operation = kDnnAffineOp; + ASSERT_NO_THROW(amIntelDnn.InitGNAStruct(&desc)); // first init is ok + ASSERT_THROW(amIntelDnn.InitGNAStruct(&desc), InferenceEngine::details::InferenceEngineException); // second init would leak memory, so it must throw +} + +TEST_F(GNA_AmIntelDnn_test, intel_nnet_type_t_ptrIsNullptr) { + ASSERT_THROW(amIntelDnn.InitGNAStruct(nullptr), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(GNA_AmIntelDnn_test, intel_nnet_type_t_pLayersIsNotNullptr) { + ASSERT_THROW(amIntelDnn.InitGNAStruct(&desc), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(GNA_AmIntelDnn_test, ComponentIsEmpty) { + desc.pLayers = nullptr; + ASSERT_THROW(amIntelDnn.InitGNAStruct(&desc), InferenceEngine::details::InferenceEngineException); +} \ No newline at end of file
diff --git a/inference-engine/tests/unit/engines/gna/gna_api_stub.cpp b/inference-engine/tests/unit/engines/gna/gna_api_stub.cpp index 5417e52..0223fc0 100644 --- a/inference-engine/tests/unit/engines/gna/gna_api_stub.cpp +++ b/inference-engine/tests/unit/engines/gna/gna_api_stub.cpp @@ -1,6 +1,24 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +//***************************************************************************** // +// INTEL CONFIDENTIAL +// Copyright (C) 2018-2019 Intel Corporation +// +// The source code contained or described herein and all documents related +// to the source code ("Material") are owned by Intel Corporation or its suppliers +// or licensors. Title to the Material remains with Intel Corporation or its suppliers +// and licensors. The Material contains trade secrets and proprietary +// and confidential information of Intel or its suppliers and licensors. +// The Material is protected by worldwide copyright and trade secret laws and treaty +// provisions. No part of the Material may be used, copied, reproduced, modified, +// published, uploaded, posted, transmitted, distributed, or disclosed in any way +// without Intel's prior express written permission. +// +// No license under any patent, copyright, trade secret or other intellectual +// property right is granted to or conferred upon you by disclosure or delivery +// of the Materials, either expressly, by implication, inducement, estoppel +// or otherwise. Any license under such intellectual property rights must +// be express and approved by Intel in writing.
+//***************************************************************************** #define INTEL_GNA_DLLEXPORT 1 #include diff --git a/inference-engine/tests/unit/engines/gna/gna_cppwraper_test.cpp b/inference-engine/tests/unit/engines/gna/gna_cppwraper_test.cpp new file mode 100644 index 0000000..de937d2 --- /dev/null +++ b/inference-engine/tests/unit/engines/gna/gna_cppwraper_test.cpp @@ -0,0 +1,27 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef _WIN32 +#include +#endif +#include "gna_api_wrapper.hpp" +#include + +using namespace testing; +using namespace InferenceEngine; + +class GNA_CPPWrapper_test : public ::testing::Test {}; + +TEST_F(GNA_CPPWrapper_test, CPPWrapperConstructorCannotWorkWithInputEqualToZero) { + ASSERT_THROW(GNAPluginNS::CPPWrapper(0), InferenceEngine::details::InferenceEngineException); +} + +TEST_F(GNA_CPPWrapper_test, CPPWrapperConstructorCanWorkWithInputNotEqualToZero) { + ASSERT_NO_THROW(GNAPluginNS::CPPWrapper(3)); +} + +TEST_F(GNA_CPPWrapper_test, CPPWrapperConstructorCanWorkWithoutAnyInput) { + ASSERT_NO_THROW(GNAPluginNS::CPPWrapper()); +} + diff --git a/inference-engine/tests/unit/engines/gna/gna_graph_aot_test.cpp b/inference-engine/tests/unit/engines/gna/gna_graph_aot_test.cpp index 45385be..0add255 100644 --- a/inference-engine/tests/unit/engines/gna/gna_graph_aot_test.cpp +++ b/inference-engine/tests/unit/engines/gna/gna_graph_aot_test.cpp @@ -1,6 +1,35 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// +/* + * INTEL CONFIDENTIAL + * Copyright (C) 2018-2019 Intel Corporation. + * + * The source code contained or described herein and all documents + * related to the source code ("Material") are owned by Intel Corporation + * or its suppliers or licensors. Title to the Material remains with + * Intel Corporation or its suppliers and licensors. The Material may + * contain trade secrets and proprietary and confidential information + * of Intel Corporation and its suppliers and licensors, and is protected + * by worldwide copyright and trade secret laws and treaty provisions. + * No part of the Material may be used, copied, reproduced, modified, + * published, uploaded, posted, transmitted, distributed, or disclosed + * in any way without Intel's prior express written permission. + * + * No license under any patent, copyright, trade secret or other + * intellectual property right is granted to or conferred upon you by + * disclosure or delivery of the Materials, either expressly, by implication, + * inducement, estoppel or otherwise. Any license under such intellectual + * property rights must be express and approved by Intel in writing. + * + * Include any supplier copyright notices as supplier requires Intel to use. + * + * Include supplier trademarks or logos as supplier requires Intel to use, + * preceded by an asterisk. An asterisked footnote can be added as follows: + * *Third Party trademarks are the property of their respective owners. + * + * Unless otherwise agreed by Intel in writing, you may not remove or alter + * this notice or any other notice embedded in Materials by Intel or Intel's + * suppliers or licensors in any way. 
+ */ + #include #include diff --git a/inference-engine/tests/unit/engines/gna/gna_hardware_precision_test.cpp b/inference-engine/tests/unit/engines/gna/gna_hardware_precision_test.cpp index b7dba21..c9f4bce 100644 --- a/inference-engine/tests/unit/engines/gna/gna_hardware_precision_test.cpp +++ b/inference-engine/tests/unit/engines/gna/gna_hardware_precision_test.cpp @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2016-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // #include diff --git a/inference-engine/tests/unit/engines/gna/gna_input_precision_test.cpp b/inference-engine/tests/unit/engines/gna/gna_input_precision_test.cpp new file mode 100644 index 0000000..d776c03 --- /dev/null +++ b/inference-engine/tests/unit/engines/gna/gna_input_precision_test.cpp @@ -0,0 +1,51 @@ +// +// Copyright 2016-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. 
+// + +#include +#include +#include "gna_matcher.hpp" +#include "matchers/input_data_matcher.hpp" +#include "test_irs.hpp" + +using namespace std; +using namespace InferenceEngine; +using namespace ::testing; +using namespace GNATestIRs; + +class GNAInputPrecisionTest : public GNATest { +}; +
+TEST_F(GNAInputPrecisionTest, CanProcessU8Input) { + std::vector<float> input_init = {128, 128, 128, 128, 128, 128, 128, 128, 128, 128}; + double scale = 1.f / 128; + std::vector<int16_t> input_processed = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + + assert_that().onInferModel(Fc2DOutputModel()) + .inNotCompactMode().gna().propagate_forward().called_with() + .preprocessed_input_data(input_init, input_processed, Precision::U8) + .withGNAConfig(GNA_CONFIG_KEY(SCALE_FACTOR), scale); +} +
+TEST_F(GNAInputPrecisionTest, CanProcessFP32Input) { + std::vector<float> input_init = {1280, 1280, 1280, 1280, 1280, 1280, 1280, 1280, 1280, 1280}; + double scale = 1.f / 1280; + std::vector<int16_t> input_processed = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + + assert_that().onInferModel(Fc2DOutputModel()) + .inNotCompactMode().gna().propagate_forward().called_with() + .preprocessed_input_data(input_init, input_processed, Precision::FP32) + .withGNAConfig(GNA_CONFIG_KEY(SCALE_FACTOR), scale); +}
diff --git a/inference-engine/tests/unit/engines/gna/gna_matcher.cpp b/inference-engine/tests/unit/engines/gna/gna_matcher.cpp index c609e4e..016ae35 100644 --- a/inference-engine/tests/unit/engines/gna/gna_matcher.cpp +++ b/inference-engine/tests/unit/engines/gna/gna_matcher.cpp @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2016-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // #include @@ -16,10 +28,14 @@ #include "matchers/pwl_quantization_metrics_matcher.hpp" #include "matchers/conv_matcher.hpp" #include "matchers/pool_matcher.hpp" +#include "matchers/fill_with_data.hpp" +#include "matchers/weights_matcher.hpp" #include #include #include "gmock/gmock.h" +#include "matchers/input_data_matcher.hpp" +#include using namespace std; using namespace InferenceEngine; @@ -30,10 +46,10 @@ class NullAllocator : public IAllocator { void * ptr = nullptr; public: NullAllocator() { - ptr = malloc(1); + ptr = new char[1]; } ~NullAllocator() { - free(ptr); + delete[] static_cast<char *>(ptr); } void * lock(void * handle, LockOp = LOCK_FOR_WRITE) noexcept override { return ptr;
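+// Sketch (assumption, not part of the patch): the malloc/free -> new[]/delete[]
+// swap above only matters because allocation and release must come from the
+// same family; mixing 'new char[1]' with 'free(ptr)' is undefined behaviour
+// and trips AddressSanitizer builds:
+//     char *p = new char[1];
+//     delete[] p;    // matching release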
@@ -56,8 +72,11 @@ void GNAPropagateMatcher :: match() { try { // matching gna propagate forward call. GNAPlugin plugin(_env.config); + plugin.SetPolicy(_env.policy); size_t inputSize = 10; size_t outputSize = 10; + InputsDataMap inputsInfo; + OutputsDataMap outputsInfo; auto loadNetworkFromIR = [&] () { CNNNetReader net_reader; @@ -90,7 +109,11 @@ auto weights = make_shared_blob<uint8_t>(Precision::U8, C, {weightsSize}); weights->allocate(); - GNATest::fillWeights(weights); + if (!_env.weightsFillPattern.empty()) { + GNATest::fillWeights(weights, _env.weightsFillPattern); + } else { + GNATest::fillWeights(weights); + } net_reader.SetWeights(weights); net_reader.getNetwork().setTargetDevice(_env.target_device); @@ -101,35 +124,60 @@ } plugin.LoadNetwork(net_reader.getNetwork()); + + inputsInfo = net_reader.getNetwork().getInputsInfo(); + outputsInfo = net_reader.getNetwork().getOutputsInfo(); }; auto loadNetworkFromAOT = [&] () { - plugin.ImportNetwork(_env.importedModelFileName); + auto sp = plugin.ImportNetwork(_env.importedModelFileName); + inputsInfo = plugin.GetInputs(); + outputsInfo = plugin.GetOutputs(); }; - TBlob<float>::Ptr input, output; + std::map<std::string, Blob::Ptr> input; + TBlob<float>::Ptr output; size_t in_N = 1; size_t out_N = in_N; size_t in_C; size_t out_C; - auto loadNetwork = [&]() { if (!_env.importedModelFileName.empty()) { ASSERT_NO_FATAL_FAILURE(loadNetworkFromAOT()); } else { ASSERT_NO_FATAL_FAILURE(loadNetworkFromIR()); } - in_C = _env.matchOutput == true ? _env.input_init.size(): inputSize; - out_C = _env.matchOutput == true ? _env.expected_output.size(): outputSize; - - input.reset(new TBlob<float>(Precision::FP32, NC, {in_C, in_N})); - input->allocate(); + const int channel_idx = 0; + bool haveInputs = !_env.input_init.empty(); + for (auto && info : inputsInfo) { + decltype(_env.input_init)::iterator it; + auto & inputBlob = input[info.first]; + if (haveInputs) { + if (inputsInfo.size() != 1) { + ASSERT_NE(it = _env.input_init.find(info.first), _env.input_init.end()); + } else { + ASSERT_NE(0, _env.input_init.size()); + it = _env.input_init.begin(); + } + in_C = it->second.size(); + ASSERT_EQ(in_C, info.second->getDims()[channel_idx]); + } - if(_env.matchOutput == true) { - std::copy_n(_env.input_init.cbegin(), in_N * in_C, input->buffer().as<float *>()); + inputBlob = make_blob_with_precision(_env.input_precision, info.second->getLayout(), info.second->getDims()); + inputBlob->allocate(); + if (haveInputs) { + if (_env.input_precision == Precision::FP32) { + std::copy_n(it->second.cbegin(), in_N * in_C, inputBlob->buffer().as<float *>()); + } else if (_env.input_precision == Precision::U8) { + std::copy_n(it->second.cbegin(), in_N * in_C, inputBlob->buffer().as<uint8_t *>()); + } else { + throw std::logic_error(std::string("Unsupported input precision: ") + _env.input_precision.name()); + } + } } +
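+        // Sketch of what the loop above amounts to (names illustrative):
+        //     for (auto & info : inputsInfo)
+        //         input[info.first] = make_blob_with_precision(prec, layout, dims);
+        // one blob per declared network input, each initializer's length checked
+        // against the input's channel dimension, so multi-input models get a
+        // full BlobMap while single-input models keep using input.begin().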
out_C = _env.matchOutput == true ? _env.expected_output.size(): outputSize; output.reset(new TBlob<float>(Precision::FP32, NC, {out_C, out_N})); output->allocate(); }; @@ -199,6 +247,21 @@ EXPECT_CALL(mockApi, GNAPropagateForward(_, _, _, _, _, _)) .WillOnce(DoAll(SaveArgPointee<1>(savedNet), Return(GNA_NOERROR))); break; + case GnaPluginTestEnvironment::matchInputData : + combined->add(new InputDataMatcher(_env.input_processed)); + break; + case GnaPluginTestEnvironment::fillOutputValues : + combined->add(new OutputFiller(_env.fillValue, _env.fillValue)); + break; + case GnaPluginTestEnvironment::matchAffineWeightsTranspose: + HasWeightsTranspozed(combined, _env.transposedData, _env.transposeArgs); + break; + case GnaPluginTestEnvironment::matchAffineWeights: + HasWeightsEq(combined, _env.transposedData); + break; + case GnaPluginTestEnvironment::saveAffineWeights: + SaveWeights(combined, _env.transposedData, _env.transposedArgsForSaving); + break; default: EXPECT_CALL(mockApi, GNAPropagateForward(_, _, _, _, _, _)) .WillOnce(Return(GNA_NOERROR)); @@ -211,15 +274,39 @@ } loadNetwork(); - plugin.Infer(*input, *output); - if(_env.matchOutput == true) { + + if (!inputsInfo.empty()) { + BlobMap input_blob_map; + BlobMap output_blob_map; + for (auto info : inputsInfo) { + size_t current_size = InferenceEngine::details::product(info.second->getTensorDesc().getDims()); + input_blob_map[info.first] = input[info.first]; + } + size_t offset = 0; + for (auto info : outputsInfo) { + size_t current_size = InferenceEngine::details::product(info.second->getTensorDesc().getDims()); + output_blob_map[info.first] = make_shared_blob<float>( + info.second->getPrecision(), NC, + {1, details::product(info.second->getDims())}, output->data() + offset, current_size * sizeof(float)); + offset += current_size; + } + + plugin.Infer(input_blob_map, output_blob_map); + + } else { + plugin.Infer(*input.begin()->second, *output); + } + + + if (_env.matchOutput) { + std::vector<float> actual_output(output->size()); + std::copy_n(output->cbuffer().as<const float *>(), out_C * out_N, actual_output.begin()); - ASSERT_EQ(true, - std::equal(_env.expected_output.begin(), _env.expected_output.end(), actual_output.begin()) - ); + for (auto ref = _env.expected_output.begin(); ref != _env.expected_output.end(); ref++ ) { + auto idx = std::distance( _env.expected_output.begin(), ref); + ASSERT_FLOAT_EQ(*ref, actual_output[idx]) << "at "<< idx; + } } std::map<std::string, InferenceEngineProfileInfo> perfMap; @@ -437,4 +524,4 @@ void GNAQueryStateMatcher :: match() { catch(...) { FAIL() << "unknown exception thrown"; } -} \ No newline at end of file +}
diff --git a/inference-engine/tests/unit/engines/gna/gna_matcher.hpp b/inference-engine/tests/unit/engines/gna/gna_matcher.hpp index b249aa2..cd3680c 100644 --- a/inference-engine/tests/unit/engines/gna/gna_matcher.hpp +++ b/inference-engine/tests/unit/engines/gna/gna_matcher.hpp @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2016-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission.
+// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // #pragma once @@ -47,7 +59,12 @@ class GnaPluginTestEnvironment { matchPwlQuantizeMetrics, matchCopyInserted, matchDiagonalInserted, - saveArgs + saveArgs, + matchInputData, + fillOutputValues, + matchAffineWeightsTranspose, + matchAffineWeights, + saveAffineWeights }; std::vector whatToMatch; enum { @@ -68,14 +85,22 @@ class GnaPluginTestEnvironment { bool exportNetworkOnly = false; std::function cb; std::map config; + GNAPluginNS::Policy policy; bool matchThrows = false; uint32_t proc_type = static_cast(GNA_SOFTWARE & GNA_HARDWARE); std::string importedModelFileName; bool is_profiling_enabled = false; bool matchOutput = false; bool is_setup_of_omp_theads_expected = false; - std::vector input_init; + std::vector input_processed; + InferenceEngine::Precision input_precision = InferenceEngine::Precision::FP32; + std::map> input_init; std::vector expected_output; + int16_t fillValue = 0; + std::vector weightsFillPattern; + std::pair transposeArgs; + std::pair transposedArgsForSaving; + std::vector* transposedData; }; class GNATestBase { @@ -103,7 +128,7 @@ class GNATestConfigurability : public GNATestBase{ return *dynamic_cast(this); } template - T & withGNAConfig(const std::string keyName, const VType &value) { + T & withGNAConfig(const std::string &keyName, const VType &value) { std::stringstream ss; ss << value; _env.config[keyName] = ss.str(); @@ -153,6 +178,22 @@ class GNAPropagateMatcher : public GNATestConfigurability { return *this; } + GNAPropagateMatcher & returns() { + return *this; + } + + GNAPropagateMatcher & And() { + return *this; + } + + GNAPropagateMatcher & that() { + return *this; + } + + GNAPropagateMatcher & result() { + return *this; + } + GNAPropagateMatcher & called_with() { return *this; } @@ -161,11 +202,35 @@ class GNAPropagateMatcher : public GNATestConfigurability { _env.matchInserted = false; return *this; } + /** + * @brief gna_propagate_forward will fill all output pointers of 16 bits with this value + */ + GNAPropagateMatcher & filledWith(int16_t valueToFill) { + _env.fillValue = valueToFill; + getMatcher() = GnaPluginTestEnvironment::fillOutputValues; + return *this; + } - GNAPropagateMatcher & called_with_input_and_expected_output(std::vector& input_data, - std::vector& expect) { + GNAPropagateMatcher & equal_to(const std::vector& expect) { _env.matchOutput = true; - _env.input_init = input_data; + _env.expected_output = expect; + return *this; + } + + GNAPropagateMatcher & input(const std::string & inputName, const std::vector& inputData) { + _env.input_init[inputName] = inputData; + return *this; + } + + GNAPropagateMatcher & inputScale(const std::string & inputName, float scaleFactor) { + _env.config[std::string(GNA_CONFIG_KEY(SCALE_FACTOR)) + "_" + inputName] = std::to_string(scaleFactor); + return *this; + } + + GNAPropagateMatcher & called_with_input_and_expected_output(const std::vector& input_data, + const std::vector& expect) { + _env.matchOutput = true; + _env.input_init["any_input_name"] = input_data; _env.expected_output = expect; return *this; } @@ -234,11 +299,47 @@ class GNAPropagateMatcher : public GNATestConfigurability { return *this; } + GNAPropagateMatcher &preprocessed_input_data(std::vector input_init, std::vector input_processed, + InferenceEngine::Precision inputPrecision) { + getMatcher() = GnaPluginTestEnvironment::matchInputData; + _env.input_processed = 
std::move(input_processed); + _env.input_init["placeholder"] = std::move(input_init); + _env.input_precision = inputPrecision; + return *this; + } + GNAPropagateMatcher & copy_inserted_into_nnet() { getMatcher() = GnaPluginTestEnvironment::matchCopyInserted; return *this; } + + GNAPropagateMatcher & affine_weights_transpozed(std::pair &&transpozedArgs) { + getMatcher() = GnaPluginTestEnvironment::saveAffineWeights; + _env.transposedArgsForSaving = std::move(transpozedArgs); + + return *this; + } + + GNAPropagateMatcher & affine_weights() { + getMatcher() = GnaPluginTestEnvironment::saveAffineWeights; + return *this; + } + + GNAPropagateMatcher & affine_weights_eq(std::vector & sourceWeights) { + getMatcher() = GnaPluginTestEnvironment::matchAffineWeights; + _env.transposedData = &sourceWeights; + return *this; + } + + + GNAPropagateMatcher & affine_weights_transposed(std::vector & sourceWeights, std::pair transposeData) { + getMatcher() = GnaPluginTestEnvironment::matchAffineWeightsTranspose; + _env.transposeArgs = transposeData; + _env.transposedData = &sourceWeights; + return *this; + } + GNAPropagateMatcher & nnet_input_precision(const InferenceEngine::Precision &precision) { getMatcher() = GnaPluginTestEnvironment::matchPrecision; _env.nnet_precision.input_precision = precision; @@ -271,6 +372,13 @@ class GNAPropagateMatcher : public GNATestConfigurability { return *this; } + GNAPropagateMatcher & to(std::vector & sourceWeights) { + _env.transposedData = &sourceWeights; + return *this; + } + + + GNAPropagateMatcher & onCPU() { _env.target_device = InferenceEngine::TargetDevice::eCPU; return *this; @@ -371,14 +479,29 @@ class GNAQueryStateMatcher : public GNADumpXNNMatcher { class GNATest : public ::testing::Test, public GNATestConfigurability { using base = GNATestConfigurability; using base::_env; - std::list> dataUsedInMatchers; + class XStorage { + public: + std::vector data; + std::function destroyer; + ~XStorage() { + destroyer(&data.front()); + } + }; + std::list dataUsedInMatchers; std::list> returnedMatchers; public: template T & storage () { - dataUsedInMatchers.push_back(std::vector(sizeof(T))); - return *reinterpret_cast (&dataUsedInMatchers.back().front()); + dataUsedInMatchers.push_back({std::vector(sizeof(T)), [](void * toDestroy) { + reinterpret_cast(toDestroy)->~T(); + }}); + + auto ptr = reinterpret_cast (&dataUsedInMatchers.back().data.front()); + // sad to say we are not using destructors here so data might leak + new(ptr) T; + + return *ptr; } GNATest() : base(GnaPluginTestEnvironment()) {} GNATest & as() { @@ -399,6 +522,9 @@ class GNATest : public ::testing::Test, public GNATestConfigurability getMatcher() = GnaPluginTestEnvironment::saveArgs; return *this; } + GNATest & save() { + return *this; + } GNATest & onInfer1AFModel() { _env.model = GNATestIRs::Fc2DOutputModel(); @@ -438,6 +564,10 @@ class GNATest : public ::testing::Test, public GNATestConfigurability _env.cb = _cb; return *this; } + GNATest & withWeigthsPattern(std::vector && initializer) { + _env.weightsFillPattern = std::move(initializer); + return *this; + } GNATest & gna() { return *this; } @@ -484,7 +614,16 @@ class GNATest : public ::testing::Test, public GNATestConfigurability return dynamic_cast(*returnedMatchers.back()); } - static void fillWeights(InferenceEngine::Blob::Ptr weights, float value = 1) { - std::fill_n(weights->buffer().as(), weights->byteSize()/sizeof(float), value); + static void fillWeights(InferenceEngine::Blob::Ptr weights, std::vector pattern = {1.f}) { + float * p = 
weights->buffer().as(); + float * pEnd = p + weights->byteSize() / sizeof(float); + + for(; p!=pEnd ;) { + for (int i = 0; i != (weights->byteSize() / sizeof(float) / 3) + 1; i++) { + for (int j = 0; j != pattern.size() && p != pEnd; j++, p++) { + *p = pattern[j]; + } + } + } } }; diff --git a/inference-engine/tests/unit/engines/gna/gna_memory_test.cpp b/inference-engine/tests/unit/engines/gna/gna_memory_test.cpp index aaf0f57..3c46c50 100644 --- a/inference-engine/tests/unit/engines/gna/gna_memory_test.cpp +++ b/inference-engine/tests/unit/engines/gna/gna_memory_test.cpp @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2016-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // #include diff --git a/inference-engine/tests/unit/engines/gna/gna_mock_api.hpp b/inference-engine/tests/unit/engines/gna/gna_mock_api.hpp index 230c5ab..20a60c7 100644 --- a/inference-engine/tests/unit/engines/gna/gna_mock_api.hpp +++ b/inference-engine/tests/unit/engines/gna/gna_mock_api.hpp @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2017-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // #pragma once diff --git a/inference-engine/tests/unit/engines/gna/gna_proc_type_test.cpp b/inference-engine/tests/unit/engines/gna/gna_proc_type_test.cpp index de17de7..7373c98 100644 --- a/inference-engine/tests/unit/engines/gna/gna_proc_type_test.cpp +++ b/inference-engine/tests/unit/engines/gna/gna_proc_type_test.cpp @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2016-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. 
+// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // #include diff --git a/inference-engine/tests/unit/engines/gna/gna_pwl_test.cpp b/inference-engine/tests/unit/engines/gna/gna_pwl_test.cpp index 408deec..865649f 100644 --- a/inference-engine/tests/unit/engines/gna/gna_pwl_test.cpp +++ b/inference-engine/tests/unit/engines/gna/gna_pwl_test.cpp @@ -1,6 +1,35 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// +/* + * INTEL CONFIDENTIAL + * Copyright (C) 2018-2019 Intel Corporation. + * + * The source code contained or described herein and all documents + * related to the source code ("Material") are owned by Intel Corporation + * or its suppliers or licensors. Title to the Material remains with + * Intel Corporation or its suppliers and licensors. The Material may + * contain trade secrets and proprietary and confidential information + * of Intel Corporation and its suppliers and licensors, and is protected + * by worldwide copyright and trade secret laws and treaty provisions. + * No part of the Material may be used, copied, reproduced, modified, + * published, uploaded, posted, transmitted, distributed, or disclosed + * in any way without Intel's prior express written permission. + * + * No license under any patent, copyright, trade secret or other + * intellectual property right is granted to or conferred upon you by + * disclosure or delivery of the Materials, either expressly, by implication, + * inducement, estoppel or otherwise. Any license under such intellectual + * property rights must be express and approved by Intel in writing. + * + * Include any supplier copyright notices as supplier requires Intel to use. + * + * Include supplier trademarks or logos as supplier requires Intel to use, + * preceded by an asterisk. An asterisked footnote can be added as follows: + * *Third Party trademarks are the property of their respective owners. + * + * Unless otherwise agreed by Intel in writing, you may not remove or alter + * this notice or any other notice embedded in Materials by Intel or Intel's + * suppliers or licensors in any way. + */ + #include #include diff --git a/inference-engine/tests/unit/engines/gna/gna_query_state_tests.cpp b/inference-engine/tests/unit/engines/gna/gna_query_state_tests.cpp index f61aecd..27725d6 100644 --- a/inference-engine/tests/unit/engines/gna/gna_query_state_tests.cpp +++ b/inference-engine/tests/unit/engines/gna/gna_query_state_tests.cpp @@ -1,6 +1,35 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// +/* + * INTEL CONFIDENTIAL + * Copyright (C) 2018-2019 Intel Corporation. + * + * The source code contained or described herein and all documents + * related to the source code ("Material") are owned by Intel Corporation + * or its suppliers or licensors. Title to the Material remains with + * Intel Corporation or its suppliers and licensors. The Material may + * contain trade secrets and proprietary and confidential information + * of Intel Corporation and its suppliers and licensors, and is protected + * by worldwide copyright and trade secret laws and treaty provisions. + * No part of the Material may be used, copied, reproduced, modified, + * published, uploaded, posted, transmitted, distributed, or disclosed + * in any way without Intel's prior express written permission. 
+ * + * No license under any patent, copyright, trade secret or other + * intellectual property right is granted to or conferred upon you by + * disclosure or delivery of the Materials, either expressly, by implication, + * inducement, estoppel or otherwise. Any license under such intellectual + * property rights must be express and approved by Intel in writing. + * + * Include any supplier copyright notices as supplier requires Intel to use. + * + * Include supplier trademarks or logos as supplier requires Intel to use, + * preceded by an asterisk. An asterisked footnote can be added as follows: + * *Third Party trademarks are the property of their respective owners. + * + * Unless otherwise agreed by Intel in writing, you may not remove or alter + * this notice or any other notice embedded in Materials by Intel or Intel's + * suppliers or licensors in any way. + */ + #include #include diff --git a/inference-engine/tests/unit/engines/gna/i16_quantisation_test.cpp b/inference-engine/tests/unit/engines/gna/i16_quantisation_test.cpp index c8767b0..cf42599 100644 --- a/inference-engine/tests/unit/engines/gna/i16_quantisation_test.cpp +++ b/inference-engine/tests/unit/engines/gna/i16_quantisation_test.cpp @@ -1,10 +1,23 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2016-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. 
// #include #include #include +#include #include "gna_plugin/quantization/model_quantizer.hpp" #include "gna_plugin/quantization/layer_quantizer.hpp" #include "gna_matcher.hpp" @@ -123,7 +136,7 @@ TEST_F(I16QuantisationTest, DISABLED_outputScaleFactorForAffineIsCorrect){ auto weights = make_shared_blob<uint8_t>(Precision::U8, C, {440}); weights->allocate(); - fillWeights(weights, 100); + fillWeights(weights, {100}); net_reader.SetWeights(weights); auto newNet = q.quantize(net_reader.getNetwork(), 1000); @@ -190,41 +203,16 @@ TEST_F(I16QuantisationTest, SplitFollowedByActivation_DummyDiagonalAffineInserti .inNotCompactMode().gna().propagate_forward().called_with().diagonal_inserted_into_nnet(); } 
-TEST_F(I16QuantisationTest, SplitFollowedByFCAndEltwiseOnCPU) { - std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - std::vector<float> expected_result = {12.0, 12.0, 12.0, 12.0, 12.0, - 12.0, 12.0, 12.0, 12.0, 12.0}; - assert_that().onInferModel(FCWithPaddingAfterSplitModel()) - .inNotCompactMode().gna().propagate_forward().onCPU() - .called_with_input_and_expected_output(input_data, expected_result); -} - -TEST_F(I16QuantisationTest, SliceFollowedByFCAndEltwiseOnCPU) { - std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - std::vector<float> expected_result = {14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0}; - assert_that().onInferModel(FCWithPaddingAfterSliceModel()) - .inNotCompactMode().gna().propagate_forward().onCPU() - .called_with_input_and_expected_output(input_data, expected_result); -} - -TEST_F(I16QuantisationTest, SliceFollowedByAlignedFCAndEltwiseOnCPU) { - std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - std::vector<float> expected_result = {18.0, 18.0, 18.0, 18.0}; - assert_that().onInferModel(SliceModelWithAlignedOutputs()) - .inNotCompactMode().gna().propagate_forward().onCPU() - .called_with_input_and_expected_output(input_data, expected_result); +TEST_F(I16QuantisationTest, DISABLED_SliceFollowedBy2FCsAnd2Eltwises_AlignedFilterInsertion) { + assert_that().onInferModel(twoFCWithPaddingAfterSliceModel()) + .inNotCompactMode().gna().propagate_forward().called_with().diagonal_inserted_into_nnet(); } 
-TEST_F(I16QuantisationTest, SliceFollowedBy2FCsAnd2EltwisesOnCPU) { - std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - std::vector<float> expected_result = {27.0, 27.0, 27.0, 27.0, 27.0, 27.0, 27.0, 27.0}; - assert_that().onInferModel(twoFCWithPaddingAfterSliceModel()) - .inNotCompactMode().gna().propagate_forward().onCPU() - .called_with_input_and_expected_output(input_data, expected_result); +// ToDo requires implementation of aligning filter for concat inputs and improvement of +// quantization/scaling algorithm for concat +TEST_F(I16QuantisationTest, DISABLED_DoubleConcatPropagateForwardWithSuccess_AlignedFilterInsertion) { + assert_that().onInferModel(doubleConcatModel()) + .inNotCompactMode().gna().propagate_forward().called_with().diagonal_inserted_into_nnet(); } TEST_F(I16QuantisationTest, EltwiseSumm_onlyOneIdentityInsertion) { @@ -253,36 +241,24 @@ TEST_F(I16QuantisationTest, EltwiseMull_willInsertTwoIdentities) { .inNotCompactMode().gna().propagate_forward().called_with().pwl_inserted_into_nnet().twice(); }
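+// Sketch (assumption, not from this patch): the diagonal affine these
+// *_inserted_into_nnet checks look for computes y[i] = w[i] * x[i] + b[i]
+// element-wise; with unit weights and zero bias it is an identity whose only
+// purpose is to host a PWL activation or restore GNA alignment:
+//     for (size_t i = 0; i < n; ++i) y[i] = w[i] * x[i] + b[i];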
-TEST_F(I16QuantisationTest, ConcatPropagateForwardWithSuccessOnCPU) { - std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - std::vector<float> expected_result = {121.0, 121.0, 121.0, 121.0, 121.0, - 121.0, 121.0, 121.0, 121.0, 121.0, - 121.0, 121.0, 121.0, 121.0, 121.0, - 121.0, 121.0, 121.0, 121.0, 121.0}; - - assert_that().onInferModel(concatModel()) - .inNotCompactMode().gna().propagate_forward().onCPU() - .called_with_input_and_expected_output(input_data, expected_result); +TEST_F(I16QuantisationTest, multiple_inputs_supported) { + assert_that().onInferModel(two_inputs_to_affine()) + .inNotCompactMode().gna().propagate_forward().called_with().pwl_inserted_into_nnet().once(); } 
+TEST_F(I16QuantisationTest, multiple_inputs_can_handle_individual_scale_factors) { + std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + std::vector<float> input2_data = {2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0}; + std::vector<float> result = {0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5}; 
-TEST_F(I16QuantisationTest, DoubleConcatPropagateForwardWithSuccessOnCPU) { - std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - std::vector<float> expected_result = {141.0, 141.0, 141.0, 141.0, 141.0, - 141.0, 141.0, 141.0, 141.0, 141.0, - 141.0, 141.0, 141.0, 141.0, 141.0, - 141.0, 141.0, 141.0, 141.0, 141.0, - 141.0, 141.0, 141.0, 141.0, 141.0, - 141.0, 141.0, 141.0, 141.0, 141.0, - 141.0, 141.0, 141.0, 141.0, 141.0, - 141.0, 141.0, 141.0, 141.0, 141.0}; + assert_that().onInferModel(two_inputs_to_affine()) + .inNotCompactMode().gna().propagate_forward() + .called_with().inputScale("input_1", 2).And() + .inputScale("input_2", 2).returns().result().filledWith(16384).that().equal_to(result); +} 
- assert_that().onInferModel(doubleConcatModel()) - .inNotCompactMode().gna().propagate_forward().onCPU() - .called_with_input_and_expected_output(input_data, expected_result); +TEST_F(I16QuantisationTest, DISABLED_multiple_inputs_into_concat_supported) { + assert_that().onInferModel(two_inputs_to_concat()) + .inNotCompactMode().gna().propagate_forward().called_with().pwl_inserted_into_nnet().once(); } TEST_F(I16QuantisationTest, ScaleShift_Affine_WillResultInIdentityInsertion) { @@ -306,76 +282,52 @@ TEST_F(I16QuantisationTest, AffineWith2AffineOutputs_ResultInOnlyOneIdentityInse .inNotCompactMode().gna().propagate_forward().called_with().pwl_inserted_into_nnet().twice(); } 
+TEST_F(I16QuantisationTest, ScaleShiftWithBroadcast_ResultInDiagonalInsertion) { + + auto & affineWeights = storage<std::vector<uint16_t>>(); + + affineWeights = { + 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + }; + + assert_that().onInferModel(ScaleShift3DModel()).withWeigthsPattern({1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f}) + .inNotCompactMode().gna().propagate_forward().called_with().affine_weights_eq(affineWeights); +}
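+// Sketch (assumption): the expected affineWeights are the float pattern
+// {1..8} quantized with a per-tensor weight scale factor of 2048:
+//     int16_t q = static_cast<int16_t>(std::round(w * 2048.f));
+// which maps 1.0f..8.0f onto 2048..16384 exactly as listed above.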
 // TODO: this mode not required in real-life scenarios so far TEST_F(I16QuantisationTest, DISABLED_AffineWithOutputToMemoryAndToAnotherNode_ResultInCopyInsertion) { assert_that().onInferModel(affineToMemoryModel()).inNotCompactMode().gna().propagate_forward(). called_with().copy_inserted_into_nnet(); } 
-TEST_F(I16QuantisationTest, CropWithoutOffsetPropagateForwardWithSuccessOnCPU) { - std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - std::vector<float> expected_result = {11.0, 11.0, 11.0, 11.0, 11.0, - 11.0, 11.0, 11.0, 11.0, 11.0}; 
+TEST_F(I16QuantisationTest, DISABLED_permutationOfWeightsBetweenConvAndAffine) { + auto & affineWeights = storage<std::vector<uint16_t>>(); 
- assert_that().onInferModel(cropWithoutOffsetModel()) - .inNotCompactMode().gna().propagate_forward().onCPU() - .called_with_input_and_expected_output(input_data, expected_result); -} + // least likely that width and height both are multiple of 7 + auto weigthsPattern = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}; 
-TEST_F(I16QuantisationTest, CropWithAlignedOffsetPropagateForwardWithSuccessOnCPU) { - std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - std::vector<float> expected_result = {3.0, 3.0, 3.0, 3.0, 3.0, - 3.0, 3.0, 3.0, 3.0, 3.0}; + // here weights are transposed + save().onInferModel(affineAfterConvNoPermute()).withWeigthsPattern(weigthsPattern) + .inNotCompactMode().from().propagate_forward().affine_weights_transpozed({128, 61}).to(affineWeights); 
- assert_that().onInferModel(cropWithAlignedOffsetModel()) - .inNotCompactMode().gna().propagate_forward().onCPU() - .called_with_input_and_expected_output(input_data, expected_result); + // here weights shouldn't be transposed + assert_that().onInferModel(affineAfterConvWithPermute()).withWeigthsPattern(weigthsPattern) + .inNotCompactMode().gna().propagate_forward().called_with().affine_weights_eq(affineWeights); } 
-TEST_F(I16QuantisationTest, CropWithOffsetPropagateForwardWithSuccessOnCPU) { - std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - std::vector<float> expected_result = {7.0, 7.0, 7.0, 7.0, 7.0, - 7.0, 7.0, 7.0, 7.0, 7.0}; 
+TEST_F(I16QuantisationTest, DISABLED_noPermutationOfWeightsBetweenConvAndAffineIfPermuteLayerWithCorrectArgs) { + auto & affineWeights = storage<std::vector<uint16_t>>(); 
- assert_that().onInferModel(cropWithOffsetModel()) - .inNotCompactMode().gna().propagate_forward().onCPU() - .called_with_input_and_expected_output(input_data, expected_result); -} - -TEST_F(I16QuantisationTest, CropWithMaxOffsetPropagateForwardWithSuccessOnCPU) { - std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - std::vector<float> expected_result = {1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0}; + // least likely that width and height both are multiple of 7 + auto weigthsPattern = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}; 
- assert_that().onInferModel(cropWithMaxOffsetModel()) - .inNotCompactMode().gna().propagate_forward().onCPU() - .called_with_input_and_expected_output(input_data, expected_result); -} + save().onInferModel(affineAfterConvWithPermute()).withWeigthsPattern(weigthsPattern) + .inNotCompactMode().from().propagate_forward().affine_weights().to(affineWeights); 
-TEST_F(I16QuantisationTest, CropWithOffsetAfterFCPropagateForwardWithSuccessOnCPU) { - std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - std::vector<float> expected_result = {111.0, 111.0, 111.0, 111.0, 111.0, - 111.0, 111.0, 111.0, 111.0, 111.0}; - - assert_that().onInferModel(cropWithOffsetExtendedModel()) -
.inNotCompactMode().gna().propagate_forward().onCPU() - .called_with_input_and_expected_output(input_data, expected_result); -} - -TEST_F(I16QuantisationTest, CopySimpleCasePropagateForwardWithSuccessOnCPU) { - std::vector input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - std::vector expected_result = {12.0, 12.0, 12.0, 12.0, 12.0, - 12.0, 12.0, 12.0, 12.0, 12.0, - 11.0, 11.0, 11.0, 11.0, 11.0, - 11.0, 11.0, 11.0, 11.0, 11.0,}; - - assert_that().onInferModel(copyModel()) - .inNotCompactMode().gna().propagate_forward().onCPU() - .called_with_input_and_expected_output(input_data, expected_result); -} + assert_that().onInferModel(affineAfterConvNoPermute()).withWeigthsPattern(weigthsPattern) + .inNotCompactMode().gna().propagate_forward().called_with().affine_weights_transposed(affineWeights, {128, 61}); +} \ No newline at end of file diff --git a/inference-engine/tests/unit/engines/gna/matchers/conv_matcher.hpp b/inference-engine/tests/unit/engines/gna/matchers/conv_matcher.hpp index 4d59470..db64350 100644 --- a/inference-engine/tests/unit/engines/gna/matchers/conv_matcher.hpp +++ b/inference-engine/tests/unit/engines/gna/matchers/conv_matcher.hpp @@ -1,6 +1,34 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// +/* + * INTEL CONFIDENTIAL + * Copyright (C) 2018-2019 Intel Corporation. + * + * The source code contained or described herein and all documents + * related to the source code ("Material") are owned by Intel Corporation + * or its suppliers or licensors. Title to the Material remains with + * Intel Corporation or its suppliers and licensors. The Material may + * contain trade secrets and proprietary and confidential information + * of Intel Corporation and its suppliers and licensors, and is protected + * by worldwide copyright and trade secret laws and treaty provisions. + * No part of the Material may be used, copied, reproduced, modified, + * published, uploaded, posted, transmitted, distributed, or disclosed + * in any way without Intel's prior express written permission. + * + * No license under any patent, copyright, trade secret or other + * intellectual property right is granted to or conferred upon you by + * disclosure or delivery of the Materials, either expressly, by implication, + * inducement, estoppel or otherwise. Any license under such intellectual + * property rights must be express and approved by Intel in writing. + * + * Include any supplier copyright notices as supplier requires Intel to use. + * + * Include supplier trademarks or logos as supplier requires Intel to use, + * preceded by an asterisk. An asterisked footnote can be added as follows: + * *Third Party trademarks are the property of their respective owners. + * + * Unless otherwise agreed by Intel in writing, you may not remove or alter + * this notice or any other notice embedded in Materials by Intel or Intel's + * suppliers or licensors in any way. + */ #pragma once diff --git a/inference-engine/tests/unit/engines/gna/matchers/copy_matcher.hpp b/inference-engine/tests/unit/engines/gna/matchers/copy_matcher.hpp index c947ecd..4c32f33 100644 --- a/inference-engine/tests/unit/engines/gna/matchers/copy_matcher.hpp +++ b/inference-engine/tests/unit/engines/gna/matchers/copy_matcher.hpp @@ -1,6 +1,34 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// +/* + * INTEL CONFIDENTIAL + * Copyright (C) 2018-2019 Intel Corporation. 
+ * + * The source code contained or described herein and all documents + * related to the source code ("Material") are owned by Intel Corporation + * or its suppliers or licensors. Title to the Material remains with + * Intel Corporation or its suppliers and licensors. The Material may + * contain trade secrets and proprietary and confidential information + * of Intel Corporation and its suppliers and licensors, and is protected + * by worldwide copyright and trade secret laws and treaty provisions. + * No part of the Material may be used, copied, reproduced, modified, + * published, uploaded, posted, transmitted, distributed, or disclosed + * in any way without Intel's prior express written permission. + * + * No license under any patent, copyright, trade secret or other + * intellectual property right is granted to or conferred upon you by + * disclosure or delivery of the Materials, either expressly, by implication, + * inducement, estoppel or otherwise. Any license under such intellectual + * property rights must be express and approved by Intel in writing. + * + * Include any supplier copyright notices as supplier requires Intel to use. + * + * Include supplier trademarks or logos as supplier requires Intel to use, + * preceded by an asterisk. An asterisked footnote can be added as follows: + * *Third Party trademarks are the property of their respective owners. + * + * Unless otherwise agreed by Intel in writing, you may not remove or alter + * this notice or any other notice embedded in Materials by Intel or Intel's + * suppliers or licensors in any way. + */ #pragma once #include "nnet_base_matcher.hpp" diff --git a/inference-engine/tests/unit/engines/gna/matchers/diag_matcher.hpp b/inference-engine/tests/unit/engines/gna/matchers/diag_matcher.hpp index cd6c246..e2bb023 100644 --- a/inference-engine/tests/unit/engines/gna/matchers/diag_matcher.hpp +++ b/inference-engine/tests/unit/engines/gna/matchers/diag_matcher.hpp @@ -1,6 +1,34 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// +/* + * INTEL CONFIDENTIAL + * Copyright (C) 2018-2019 Intel Corporation. + * + * The source code contained or described herein and all documents + * related to the source code ("Material") are owned by Intel Corporation + * or its suppliers or licensors. Title to the Material remains with + * Intel Corporation or its suppliers and licensors. The Material may + * contain trade secrets and proprietary and confidential information + * of Intel Corporation and its suppliers and licensors, and is protected + * by worldwide copyright and trade secret laws and treaty provisions. + * No part of the Material may be used, copied, reproduced, modified, + * published, uploaded, posted, transmitted, distributed, or disclosed + * in any way without Intel's prior express written permission. + * + * No license under any patent, copyright, trade secret or other + * intellectual property right is granted to or conferred upon you by + * disclosure or delivery of the Materials, either expressly, by implication, + * inducement, estoppel or otherwise. Any license under such intellectual + * property rights must be express and approved by Intel in writing. + * + * Include any supplier copyright notices as supplier requires Intel to use. + * + * Include supplier trademarks or logos as supplier requires Intel to use, + * preceded by an asterisk. An asterisked footnote can be added as follows: + * *Third Party trademarks are the property of their respective owners. 
+ * + * Unless otherwise agreed by Intel in writing, you may not remove or alter + * this notice or any other notice embedded in Materials by Intel or Intel's + * suppliers or licensors in any way. + */ #pragma once #include"gna-api.h" diff --git a/inference-engine/tests/unit/engines/gna/matchers/fill_with_data.hpp b/inference-engine/tests/unit/engines/gna/matchers/fill_with_data.hpp new file mode 100644 index 0000000..d46ab30 --- /dev/null +++ b/inference-engine/tests/unit/engines/gna/matchers/fill_with_data.hpp @@ -0,0 +1,74 @@ +/* + * INTEL CONFIDENTIAL + * Copyright (C) 2018-2019 Intel Corporation. + * + * The source code contained or described herein and all documents + * related to the source code ("Material") are owned by Intel Corporation + * or its suppliers or licensors. Title to the Material remains with + * Intel Corporation or its suppliers and licensors. The Material may + * contain trade secrets and proprietary and confidential information + * of Intel Corporation and its suppliers and licensors, and is protected + * by worldwide copyright and trade secret laws and treaty provisions. + * No part of the Material may be used, copied, reproduced, modified, + * published, uploaded, posted, transmitted, distributed, or disclosed + * in any way without Intel's prior express written permission. + * + * No license under any patent, copyright, trade secret or other + * intellectual property right is granted to or conferred upon you by + * disclosure or delivery of the Materials, either expressly, by implication, + * inducement, estoppel or otherwise. Any license under such intellectual + * property rights must be express and approved by Intel in writing. + * + * Include any supplier copyright notices as supplier requires Intel to use. + * + * Include supplier trademarks or logos as supplier requires Intel to use, + * preceded by an asterisk. An asterisked footnote can be added as follows: + * *Third Party trademarks are the property of their respective owners. + * + * Unless otherwise agreed by Intel in writing, you may not remove or alter + * this notice or any other notice embedded in Materials by Intel or Intel's + * suppliers or licensors in any way. 
+ */ + + #pragma once + + +class OutputFiller : public ::testing::MatcherInterface<const intel_nnet_type_t*> { + mutable std::stringstream reason; + int32_t fill32BValue; + int16_t fill16BValue; + + public: + OutputFiller(int32_t fill32BValue, int16_t fill16BValue) : fill32BValue(fill32BValue), fill16BValue(fill16BValue) {} + + + bool MatchAndExplain(const intel_nnet_type_t* foo, ::testing::MatchResultListener* listener) const override { + if (foo == nullptr) + return false; + reason.str(""); + // checking pointers are set + for (int i=0; i < foo->nLayers; i++) { + if (nullptr == foo->pLayers[i].pInputs || + nullptr == foo->pLayers[i].pOutputs) { + reason << "input/output pointers in pLayers[" << i << "] must not be NULL"; + return false; + } + auto nElements = foo->pLayers[i].nOutputColumns * foo->pLayers[i].nOutputRows; + if (foo->pLayers[i].nBytesPerOutput == 2) { + std::fill_n((int16_t *) foo->pLayers[i].pOutputs, nElements, fill16BValue); + } else if (foo->pLayers[i].nBytesPerOutput == 4) { + std::fill_n((int32_t *) foo->pLayers[i].pOutputs, nElements, fill32BValue); + } else { + reason << "output width of layer [" << i << "] should be 2 or 4 bytes, but was " << foo->pLayers[i].nBytesPerOutput; + return false; + } + } + return true; + } + + void DescribeTo(::std::ostream *os) const override { + *os << "Not a matcher but an output filler; the error was: " << reason.str(); + } + +}; +
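OutputFiller is what lets the filledWith()/equal_to() chain work without real hardware: GNAPropagateForward is mocked, so nothing would otherwise write the output buffers, and the filler plants a known constant into every layer output instead. The i16 test that calls filledWith(16384) and expects 0.5 is then just checking the plugin's dequantization of that constant; a sketch of the assumed relation (the 32768 output scale is inferred from the test values, not stated in the patch):

    // dequantized = raw / outputScale
    float dequantized = 16384 / 32768.f;   // == 0.5f, the expected result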
diff --git a/inference-engine/tests/unit/engines/gna/matchers/input_data_matcher.hpp b/inference-engine/tests/unit/engines/gna/matchers/input_data_matcher.hpp
new file mode 100644
index 0000000..f45f9ee
--- /dev/null
+++ b/inference-engine/tests/unit/engines/gna/matchers/input_data_matcher.hpp
@@ -0,0 +1,69 @@
+#include
+
+/*
+ * INTEL CONFIDENTIAL
+ * Copyright (C) 2018-2019 Intel Corporation.
+ *
+ * The source code contained or described herein and all documents
+ * related to the source code ("Material") are owned by Intel Corporation
+ * or its suppliers or licensors. Title to the Material remains with
+ * Intel Corporation or its suppliers and licensors. The Material may
+ * contain trade secrets and proprietary and confidential information
+ * of Intel Corporation and its suppliers and licensors, and is protected
+ * by worldwide copyright and trade secret laws and treaty provisions.
+ * No part of the Material may be used, copied, reproduced, modified,
+ * published, uploaded, posted, transmitted, distributed, or disclosed
+ * in any way without Intel's prior express written permission.
+ *
+ * No license under any patent, copyright, trade secret or other
+ * intellectual property right is granted to or conferred upon you by
+ * disclosure or delivery of the Materials, either expressly, by implication,
+ * inducement, estoppel or otherwise. Any license under such intellectual
+ * property rights must be express and approved by Intel in writing.
+ *
+ * Include any supplier copyright notices as supplier requires Intel to use.
+ *
+ * Include supplier trademarks or logos as supplier requires Intel to use,
+ * preceded by an asterisk. An asterisked footnote can be added as follows:
+ * *Third Party trademarks are the property of their respective owners.
+ *
+ * Unless otherwise agreed by Intel in writing, you may not remove or alter
+ * this notice or any other notice embedded in Materials by Intel or Intel's
+ * suppliers or licensors in any way.
+ */
+
+#pragma once
+
+#include
+#include "nnet_base_matcher.hpp"
+
+class InputDataMatcher : public ::testing::MatcherInterface<const intel_nnet_type_t *> {
+    std::vector<int16_t> refInput;
+public:
+
+    explicit InputDataMatcher(const std::vector<int16_t> &_refInput) : refInput(_refInput) {}
+
+    bool MatchAndExplain(const intel_nnet_type_t *foo, ::testing::MatchResultListener *listener) const override {
+        if (foo->pLayers == nullptr) {
+            *listener << "Address of the first layer descriptor is NULL";
+            return false;
+        }
+        auto firstLayer = foo->pLayers[0];
+        auto actualInput = firstLayer.pInputs;
+        if (!actualInput) {
+            *listener << "Input of the first layer is NULL";
+            return false;
+        }
+
+        auto *actualInputI16 = reinterpret_cast<int16_t *>(actualInput);
+        for (size_t i = 0; i < refInput.size(); i++) {
+            if (actualInputI16[i] != refInput[i]) {
+                *listener << "Actual and reference input values don't match: " << actualInputI16[i] << " vs "
+                          << refInput[i];
+                return false;
+            }
+        }
+        return true;
+    }
+
+    void DescribeTo(::std::ostream *os) const override {}
+};
diff --git a/inference-engine/tests/unit/engines/gna/matchers/nnet_base_matcher.hpp b/inference-engine/tests/unit/engines/gna/matchers/nnet_base_matcher.hpp
index 7c1f69b..267777c 100644
--- a/inference-engine/tests/unit/engines/gna/matchers/nnet_base_matcher.hpp
+++ b/inference-engine/tests/unit/engines/gna/matchers/nnet_base_matcher.hpp
@@ -1,6 +1,34 @@
-// Copyright (C) 2018 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
+/*
+ * INTEL CONFIDENTIAL
+ * Copyright (C) 2018-2019 Intel Corporation.
+ *
+ * The source code contained or described herein and all documents
+ * related to the source code ("Material") are owned by Intel Corporation
+ * or its suppliers or licensors. Title to the Material remains with
+ * Intel Corporation or its suppliers and licensors. The Material may
+ * contain trade secrets and proprietary and confidential information
+ * of Intel Corporation and its suppliers and licensors, and is protected
+ * by worldwide copyright and trade secret laws and treaty provisions.
+ * No part of the Material may be used, copied, reproduced, modified,
+ * published, uploaded, posted, transmitted, distributed, or disclosed
+ * in any way without Intel's prior express written permission.
+ *
+ * No license under any patent, copyright, trade secret or other
+ * intellectual property right is granted to or conferred upon you by
+ * disclosure or delivery of the Materials, either expressly, by implication,
+ * inducement, estoppel or otherwise. Any license under such intellectual
+ * property rights must be express and approved by Intel in writing.
+ *
+ * Include any supplier copyright notices as supplier requires Intel to use.
+ *
+ * Include supplier trademarks or logos as supplier requires Intel to use,
+ * preceded by an asterisk. An asterisked footnote can be added as follows:
+ * *Third Party trademarks are the property of their respective owners.
+ *
+ * Unless otherwise agreed by Intel in writing, you may not remove or alter
+ * this notice or any other notice embedded in Materials by Intel or Intel's
+ * suppliers or licensors in any way.
+ */ #pragma once diff --git a/inference-engine/tests/unit/engines/gna/matchers/pool_matcher.hpp b/inference-engine/tests/unit/engines/gna/matchers/pool_matcher.hpp index 009e61c..e9b6ae9 100644 --- a/inference-engine/tests/unit/engines/gna/matchers/pool_matcher.hpp +++ b/inference-engine/tests/unit/engines/gna/matchers/pool_matcher.hpp @@ -1,6 +1,34 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// +/* + * INTEL CONFIDENTIAL + * Copyright (C) 2018-2019 Intel Corporation. + * + * The source code contained or described herein and all documents + * related to the source code ("Material") are owned by Intel Corporation + * or its suppliers or licensors. Title to the Material remains with + * Intel Corporation or its suppliers and licensors. The Material may + * contain trade secrets and proprietary and confidential information + * of Intel Corporation and its suppliers and licensors, and is protected + * by worldwide copyright and trade secret laws and treaty provisions. + * No part of the Material may be used, copied, reproduced, modified, + * published, uploaded, posted, transmitted, distributed, or disclosed + * in any way without Intel's prior express written permission. + * + * No license under any patent, copyright, trade secret or other + * intellectual property right is granted to or conferred upon you by + * disclosure or delivery of the Materials, either expressly, by implication, + * inducement, estoppel or otherwise. Any license under such intellectual + * property rights must be express and approved by Intel in writing. + * + * Include any supplier copyright notices as supplier requires Intel to use. + * + * Include supplier trademarks or logos as supplier requires Intel to use, + * preceded by an asterisk. An asterisked footnote can be added as follows: + * *Third Party trademarks are the property of their respective owners. + * + * Unless otherwise agreed by Intel in writing, you may not remove or alter + * this notice or any other notice embedded in Materials by Intel or Intel's + * suppliers or licensors in any way. + */ #pragma once diff --git a/inference-engine/tests/unit/engines/gna/matchers/precision_matcher.hpp b/inference-engine/tests/unit/engines/gna/matchers/precision_matcher.hpp index 9dfdc87..1d04fad 100644 --- a/inference-engine/tests/unit/engines/gna/matchers/precision_matcher.hpp +++ b/inference-engine/tests/unit/engines/gna/matchers/precision_matcher.hpp @@ -1,6 +1,34 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// +/* + * INTEL CONFIDENTIAL + * Copyright (C) 2018-2019 Intel Corporation. + * + * The source code contained or described herein and all documents + * related to the source code ("Material") are owned by Intel Corporation + * or its suppliers or licensors. Title to the Material remains with + * Intel Corporation or its suppliers and licensors. The Material may + * contain trade secrets and proprietary and confidential information + * of Intel Corporation and its suppliers and licensors, and is protected + * by worldwide copyright and trade secret laws and treaty provisions. + * No part of the Material may be used, copied, reproduced, modified, + * published, uploaded, posted, transmitted, distributed, or disclosed + * in any way without Intel's prior express written permission. 
+ * + * No license under any patent, copyright, trade secret or other + * intellectual property right is granted to or conferred upon you by + * disclosure or delivery of the Materials, either expressly, by implication, + * inducement, estoppel or otherwise. Any license under such intellectual + * property rights must be express and approved by Intel in writing. + * + * Include any supplier copyright notices as supplier requires Intel to use. + * + * Include supplier trademarks or logos as supplier requires Intel to use, + * preceded by an asterisk. An asterisked footnote can be added as follows: + * *Third Party trademarks are the property of their respective owners. + * + * Unless otherwise agreed by Intel in writing, you may not remove or alter + * this notice or any other notice embedded in Materials by Intel or Intel's + * suppliers or licensors in any way. + */ #pragma once #include "nnet_base_matcher.hpp" diff --git a/inference-engine/tests/unit/engines/gna/matchers/pwl_matcher.hpp b/inference-engine/tests/unit/engines/gna/matchers/pwl_matcher.hpp index 9060cd5..1efba3c 100644 --- a/inference-engine/tests/unit/engines/gna/matchers/pwl_matcher.hpp +++ b/inference-engine/tests/unit/engines/gna/matchers/pwl_matcher.hpp @@ -1,6 +1,34 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// +/* + * INTEL CONFIDENTIAL + * Copyright (C) 2018-2019 Intel Corporation. + * + * The source code contained or described herein and all documents + * related to the source code ("Material") are owned by Intel Corporation + * or its suppliers or licensors. Title to the Material remains with + * Intel Corporation or its suppliers and licensors. The Material may + * contain trade secrets and proprietary and confidential information + * of Intel Corporation and its suppliers and licensors, and is protected + * by worldwide copyright and trade secret laws and treaty provisions. + * No part of the Material may be used, copied, reproduced, modified, + * published, uploaded, posted, transmitted, distributed, or disclosed + * in any way without Intel's prior express written permission. + * + * No license under any patent, copyright, trade secret or other + * intellectual property right is granted to or conferred upon you by + * disclosure or delivery of the Materials, either expressly, by implication, + * inducement, estoppel or otherwise. Any license under such intellectual + * property rights must be express and approved by Intel in writing. + * + * Include any supplier copyright notices as supplier requires Intel to use. + * + * Include supplier trademarks or logos as supplier requires Intel to use, + * preceded by an asterisk. An asterisked footnote can be added as follows: + * *Third Party trademarks are the property of their respective owners. + * + * Unless otherwise agreed by Intel in writing, you may not remove or alter + * this notice or any other notice embedded in Materials by Intel or Intel's + * suppliers or licensors in any way. 
+ */ #pragma once #include "nnet_base_matcher.hpp" diff --git a/inference-engine/tests/unit/engines/gna/matchers/pwl_quantization_metrics_matcher.hpp b/inference-engine/tests/unit/engines/gna/matchers/pwl_quantization_metrics_matcher.hpp index cccd940..c55cad8 100644 --- a/inference-engine/tests/unit/engines/gna/matchers/pwl_quantization_metrics_matcher.hpp +++ b/inference-engine/tests/unit/engines/gna/matchers/pwl_quantization_metrics_matcher.hpp @@ -1,6 +1,34 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// +/* + * INTEL CONFIDENTIAL + * Copyright (C) 2018-2019 Intel Corporation. + * + * The source code contained or described herein and all documents + * related to the source code ("Material") are owned by Intel Corporation + * or its suppliers or licensors. Title to the Material remains with + * Intel Corporation or its suppliers and licensors. The Material may + * contain trade secrets and proprietary and confidential information + * of Intel Corporation and its suppliers and licensors, and is protected + * by worldwide copyright and trade secret laws and treaty provisions. + * No part of the Material may be used, copied, reproduced, modified, + * published, uploaded, posted, transmitted, distributed, or disclosed + * in any way without Intel's prior express written permission. + * + * No license under any patent, copyright, trade secret or other + * intellectual property right is granted to or conferred upon you by + * disclosure or delivery of the Materials, either expressly, by implication, + * inducement, estoppel or otherwise. Any license under such intellectual + * property rights must be express and approved by Intel in writing. + * + * Include any supplier copyright notices as supplier requires Intel to use. + * + * Include supplier trademarks or logos as supplier requires Intel to use, + * preceded by an asterisk. An asterisked footnote can be added as follows: + * *Third Party trademarks are the property of their respective owners. + * + * Unless otherwise agreed by Intel in writing, you may not remove or alter + * this notice or any other notice embedded in Materials by Intel or Intel's + * suppliers or licensors in any way. + */ #pragma once #include diff --git a/inference-engine/tests/unit/engines/gna/matchers/weights_matcher.hpp b/inference-engine/tests/unit/engines/gna/matchers/weights_matcher.hpp new file mode 100644 index 0000000..3c50f85 --- /dev/null +++ b/inference-engine/tests/unit/engines/gna/matchers/weights_matcher.hpp @@ -0,0 +1,212 @@ +/* + * INTEL CONFIDENTIAL + * Copyright (C) 2018-2019 Intel Corporation. + * + * The source code contained or described herein and all documents + * related to the source code ("Material") are owned by Intel Corporation + * or its suppliers or licensors. Title to the Material remains with + * Intel Corporation or its suppliers and licensors. The Material may + * contain trade secrets and proprietary and confidential information + * of Intel Corporation and its suppliers and licensors, and is protected + * by worldwide copyright and trade secret laws and treaty provisions. + * No part of the Material may be used, copied, reproduced, modified, + * published, uploaded, posted, transmitted, distributed, or disclosed + * in any way without Intel's prior express written permission. 
+ *
+ * No license under any patent, copyright, trade secret or other
+ * intellectual property right is granted to or conferred upon you by
+ * disclosure or delivery of the Materials, either expressly, by implication,
+ * inducement, estoppel or otherwise. Any license under such intellectual
+ * property rights must be express and approved by Intel in writing.
+ *
+ * Include any supplier copyright notices as supplier requires Intel to use.
+ *
+ * Include supplier trademarks or logos as supplier requires Intel to use,
+ * preceded by an asterisk. An asterisked footnote can be added as follows:
+ * *Third Party trademarks are the property of their respective owners.
+ *
+ * Unless otherwise agreed by Intel in writing, you may not remove or alter
+ * this notice or any other notice embedded in Materials by Intel or Intel's
+ * suppliers or licensors in any way.
+ */
+
+#pragma once
+#include "gna-api.h"
+#include "nnet_base_matcher.hpp"
+#include "quantization/quantization.h"
+
+using TranspozedData = std::tuple<std::vector<int16_t>*, int, int>;
+
+class TranspozeIterator {
+    std::pair<int, int> dims;
+    int _offset = 0;
+    int _row = 0;
+    int _col = 0;
+    int _outputRow = 0;
+ public:
+    TranspozeIterator(const std::pair<int, int> &dims) : dims(dims) {
+    }
+    TranspozeIterator(const TranspozedData &data) : TranspozeIterator({std::get<1>(data), std::get<2>(data)}) {
+    }
+
+    // postfix form
+    TranspozeIterator operator++(int) {
+        TranspozeIterator c(*this);
+        this->operator++();
+        return c;
+    }
+
+    void reset() {
+        _offset = 0;
+        _row = 0;
+        _col = 0;
+        _outputRow = 0;
+    }
+
+    // prefix form
+    TranspozeIterator& operator++() {
+        if (dims.first == 0 || dims.second == 0) {
+            _offset++;
+        } else {
+            // step over whole row length
+            _row++;
+            // once the row index reaches the row count
+            if (_row == dims.second) {
+                // advance to the next column within the row
+                _col++;
+                // restart from the first row
+                _row = 0;
+                // restart from the next output channel
+                if (_col == dims.first) {
+                    _outputRow++;
+                    _col = 0;
+                }
+            }
+            _offset = _col + _row * dims.first + _outputRow * dims.first * dims.second;
+        }
+        return *this;
+    }
+    // getting index
+    operator int() {
+        return _offset;
+    }
+    int row() const noexcept {
+        return _row;
+    }
+    int col() const noexcept {
+        return _col;
+    }
+    int outputRow() const noexcept {
+        return _outputRow;
+    }
+};
+
+class WeightsMatcher : public ::testing::MatcherInterface<const intel_nnet_type_t*> {
+    enum HowMatch {
+        eNone,
+        eEq,
+        eTranspozed
+    } eMatchKind;
+    TranspozedData transpozedData;
+
+    mutable std::stringstream error;
+    mutable TranspozeIterator iterator;
+    mutable int actual;
+ public:
+    explicit WeightsMatcher(const TranspozedData &data) :
+        eMatchKind(eTranspozed),
+        transpozedData(data),
+        iterator(data) {
+        if (0 == std::get<1>(transpozedData) || 0 == std::get<2>(transpozedData)) {
+            eMatchKind = eEq;
+        }
+    }
+    bool MatchAndExplain(const intel_nnet_type_t *foo, ::testing::MatchResultListener *listener) const override {
+        if (foo == nullptr)
+            return false;
+        iterator.reset();
+
+        for (int i = 0; i < foo->nLayers; i++) {
+            if (foo->pLayers[i].nLayerKind != INTEL_AFFINE &&
+                foo->pLayers[i].nLayerKind != INTEL_AFFINE_DIAGONAL) continue;
+
+            auto affine = (intel_affine_func_t*)foo->pLayers[i].pLayerStruct;
+
+            auto affineWeightsSize = foo->pLayers[i].nOutputRows *
+                (foo->pLayers[i].nLayerKind == INTEL_AFFINE_DIAGONAL ? 1 : foo->pLayers[i].nInputRows);
+
+            if (affineWeightsSize != std::get<0>(transpozedData)->size()) {
+                error << "gna-xnn layer(" << i << ") weights size mismatch: expected "
+                      << std::get<0>(transpozedData)->size() << ", but was: " << affineWeightsSize;
+                break;
+            }
+
+            auto pWeights = reinterpret_cast<int16_t*>(affine->pWeights);
+
+            for (int j = 0; j != affineWeightsSize; j++, iterator++) {
+                auto savedVal = (&std::get<0>(transpozedData)->front())[iterator];
+                if (pWeights[j] != savedVal) {
+                    actual = pWeights[j];
+                    return false;
+                }
+            }
+            return true;
+        }
+        return false;
+    }
+    void DescribeTo(::std::ostream *os) const override {
+        *os << error.str() << std::endl;
+        if (eMatchKind == eEq) {
+            *os << "weights of affine layers are not equal, error at: ";
+        } else {
+            *os << "weights of affine layers are not transposed, error at: ";
+        }
+        *os << (int)iterator << ", actual=" << actual << ", expected=" << (&std::get<0>(transpozedData)->front())[iterator];
+    }
+};
+
+
+class WeightsSaver : public ::testing::MatcherInterface<const intel_nnet_type_t*> {
+    mutable TranspozeIterator iterator;
+    std::vector<int16_t>* weights;
+ public:
+    explicit WeightsSaver(TranspozedData data) :
+        iterator(data), weights(std::get<0>(data)) {
+    }
+    bool MatchAndExplain(const intel_nnet_type_t *foo, ::testing::MatchResultListener *listener) const override {
+        if (foo == nullptr)
+            return false;
+        for (int i = 0; i < foo->nLayers; i++) {
+            if (foo->pLayers[i].nLayerKind != INTEL_AFFINE) continue;
+
+            auto affine = (intel_affine_func_t*)foo->pLayers[i].pLayerStruct;
+
+            auto affineWeightsSize = foo->pLayers[i].nOutputRows * foo->pLayers[i].nInputRows;
+            auto pWeights = reinterpret_cast<int16_t*>(affine->pWeights);
+            weights->resize(affineWeightsSize);
+
+            for (int j = 0; j != affineWeightsSize; j++, ++iterator) {
+                (*weights)[j] = pWeights[iterator];
+            }
+
+            return true;
+        }
+        return false;
+    }
+    void DescribeTo(::std::ostream *os) const override {
+        *os << "affine layer not found";
+    }
+};
+
+
+void HasWeightsTranspozed(std::unique_ptr<NNetComponentMatcher>& components, std::vector<int16_t>* data, std::pair<int, int> dims) {
+    components->add(new WeightsMatcher(std::make_tuple(data, dims.first, dims.second)));
+}
+
+void HasWeightsEq(std::unique_ptr<NNetComponentMatcher>& components, std::vector<int16_t>* data) {
+    components->add(new WeightsMatcher(std::make_tuple(data, 0, 0)));
+}
+
+void SaveWeights(std::unique_ptr<NNetComponentMatcher>& components, std::vector<int16_t>* data, std::pair<int, int> dims) {
+    components->add(new WeightsSaver(std::make_tuple(data, dims.first, dims.second)));
+}
+
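The _offset formula above walks a row-major weight buffer in transposed order: for dims = {2, 3} it visits offsets 0, 2, 4, 1, 3, 5, then carries into the next output row at offset 6. A standalone sketch of the same index arithmetic (illustrative only, not part of the patch):

#include <cstdio>

// Reproduces the TranspozeIterator offset formula for dims = {2, 3}:
// prints "0 2 4 1 3 5", i.e. a column-wise walk over a row-major buffer.
int main() {
    const int first = 2, second = 3;  // dims.first, dims.second
    int row = 0, col = 0, outputRow = 0;
    for (int step = 0; step < first * second; ++step) {
        std::printf("%d ", col + row * first + outputRow * first * second);
        if (++row == second) {  // same carry logic as operator++ above
            row = 0;
            if (++col == first) {
                col = 0;
                ++outputRow;
            }
        }
    }
    return 0;
}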
diff --git a/inference-engine/tests/unit/engines/gna/test_irs.cpp b/inference-engine/tests/unit/engines/gna/test_irs.cpp
index f9a0353..0ab9a07 100644
--- a/inference-engine/tests/unit/engines/gna/test_irs.cpp
+++ b/inference-engine/tests/unit/engines/gna/test_irs.cpp
@@ -1,6 +1,34 @@
-// Copyright (C) 2018 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
+/*
+ * INTEL CONFIDENTIAL
+ * Copyright (C) 2018-2019 Intel Corporation.
+ *
+ * The source code contained or described herein and all documents
+ * related to the source code ("Material") are owned by Intel Corporation
+ * or its suppliers or licensors. Title to the Material remains with
+ * Intel Corporation or its suppliers and licensors. The Material may
+ * contain trade secrets and proprietary and confidential information
+ * of Intel Corporation and its suppliers and licensors, and is protected
+ * by worldwide copyright and trade secret laws and treaty provisions.
+ * No part of the Material may be used, copied, reproduced, modified, + * published, uploaded, posted, transmitted, distributed, or disclosed + * in any way without Intel's prior express written permission. + * + * No license under any patent, copyright, trade secret or other + * intellectual property right is granted to or conferred upon you by + * disclosure or delivery of the Materials, either expressly, by implication, + * inducement, estoppel or otherwise. Any license under such intellectual + * property rights must be express and approved by Intel in writing. + * + * Include any supplier copyright notices as supplier requires Intel to use. + * + * Include supplier trademarks or logos as supplier requires Intel to use, + * preceded by an asterisk. An asterisked footnote can be added as follows: + * *Third Party trademarks are the property of their respective owners. + * + * Unless otherwise agreed by Intel in writing, you may not remove or alter + * this notice or any other notice embedded in Materials by Intel or Intel's + * suppliers or licensors in any way. + */ #include "test_irs.hpp" @@ -342,7 +370,7 @@ std::string eltwiseToMemoryModel() { std::string activationAfterSplitModel() { return R"V0G0N( - + @@ -420,7 +448,7 @@ std::string activationAfterSplitModel() { - + )V0G0N"; } @@ -505,6 +533,104 @@ std::string FCWithPaddingAfterSplitModel() { )V0G0N"; } +std::string FCBeforeSplitModel() { + return R"V0G0N( + + + + + + 1 + 20 + + + + + + + + + + 1 + 20 + + + + + 1 + 20 + + + + + + + + + 1 + 20 + + + + + + 1 + 10 + + + + 1 + 10 + + + + + + + + + + 1 + 10 + + + + + 1 + 10 + + + + + + + + 1 + 10 + + + 1 + 10 + + + + + 1 + 10 + + + + + + + + + + + + + )V0G0N"; +} std::string twoFCWithPaddingAfterSliceModel() { return R"V0G0N( @@ -1803,6 +1929,7 @@ std::string TFLeakyReluModel() { )V0G0N"; } + std::string maxpoolAfterRelu() { return R"V0G0N( @@ -2319,6 +2446,7 @@ std::string doubleConcatModel() { )V0G0N"; } + std::string cropWithoutOffsetModel() { return R"V0G0N( @@ -2675,4 +2803,498 @@ std::string copyModel() { )V0G0N"; } + +std::string two_inputs_to_concat() { + return R"V0G0N( + + + + + + + 1 + 600 + + + + + + + 1 + 600 + + + + + + + + 1 + 600 + + + + + 1 + 600 + + + + + 1 + 1200 + + + + + + + + 1 + 600 + + + + + 1 + 600 + + + + + + + + + + + )V0G0N"; + +} + +std::string two_inputs_to_affine() { + return R"V0G0N( + + + + + + + 1 + 10 + + + + + + + 1 + 10 + + + + + + + + 1 + 10 + + + + + 1 + 10 + + + + + + + + + + + 1 + 10 + + + + + 1 + 10 + + + + + + + + + + + 1 + 10 + + + 1 + 10 + + + + + 1 + 10 + + + + + + + + + + + + )V0G0N"; + +} + + +std::string affineAfterConvNoPermute() { + return R"V0G0N( + + + + + + + 1 + 128 + 1 + 126 + + + + + + + + 1 + 128 + 1 + 126 + + + + + 1 + 128 + 1 + 122 + + + + + + + + + + + 1 + 128 + 1 + 122 + + + + + 1 + 128 + 1 + 122 + + + + + + + + 1 + 128 + 1 + 122 + + + + + 1 + 128 + 1 + 61 + + + + + + + + + 1 + 128 + 1 + 61 + + + + + 1 + 7808 + + + + + + + + + + + + + + 1 + 7808 + + + + + 1 + 10 + + + + + + + + + + + + + + )V0G0N"; +} + +std::string affineAfterConvWithPermute() { + return R"V0G0N( + + + + + + + 1 + 128 + 1 + 126 + + + + + + + + 1 + 128 + 1 + 126 + + + + + 1 + 128 + 1 + 122 + + + + + + + + + + + 1 + 128 + 1 + 122 + + + + + 1 + 128 + 1 + 122 + + + + + + + + 1 + 128 + 1 + 122 + + + + + 1 + 128 + 1 + 61 + + + + + + + + + 1 + 128 + 1 + 61 + + + + + 1 + 61 + 1 + 128 + + + + + + + + + 1 + 61 + 1 + 128 + + + + + 1 + 7808 + + + + + + + + + + + + + + 1 + 7808 + + + + + 1 + 10 + + + + + + + + + + + + + + + )V0G0N"; +} + + + +std::string 
ScaleShift3DModel() { + return R"V0G0N( + + + + + + + 1 + 40 + + + + + + + + 1 + 40 + + + + + 1 + 5 + 8 + + + + + + + 1 + 5 + 8 + + + + + 1 + 5 + 8 + + + + + + + + + + + + + + + )V0G0N"; +} + } // namespace GNATestIRs diff --git a/inference-engine/tests/unit/engines/gna/test_irs.hpp b/inference-engine/tests/unit/engines/gna/test_irs.hpp index c7b4b0c..c0194dc 100644 --- a/inference-engine/tests/unit/engines/gna/test_irs.hpp +++ b/inference-engine/tests/unit/engines/gna/test_irs.hpp @@ -1,6 +1,34 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// +/* + * INTEL CONFIDENTIAL + * Copyright (C) 2018-2019 Intel Corporation. + * + * The source code contained or described herein and all documents + * related to the source code ("Material") are owned by Intel Corporation + * or its suppliers or licensors. Title to the Material remains with + * Intel Corporation or its suppliers and licensors. The Material may + * contain trade secrets and proprietary and confidential information + * of Intel Corporation and its suppliers and licensors, and is protected + * by worldwide copyright and trade secret laws and treaty provisions. + * No part of the Material may be used, copied, reproduced, modified, + * published, uploaded, posted, transmitted, distributed, or disclosed + * in any way without Intel's prior express written permission. + * + * No license under any patent, copyright, trade secret or other + * intellectual property right is granted to or conferred upon you by + * disclosure or delivery of the Materials, either expressly, by implication, + * inducement, estoppel or otherwise. Any license under such intellectual + * property rights must be express and approved by Intel in writing. + * + * Include any supplier copyright notices as supplier requires Intel to use. + * + * Include supplier trademarks or logos as supplier requires Intel to use, + * preceded by an asterisk. An asterisked footnote can be added as follows: + * *Third Party trademarks are the property of their respective owners. + * + * Unless otherwise agreed by Intel in writing, you may not remove or alter + * this notice or any other notice embedded in Materials by Intel or Intel's + * suppliers or licensors in any way. 
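The model helpers declared in test_irs.hpp just below return GNA test networks as IR XML wrapped in C++ raw string literals. Tests hand these straight to CNNNetReader; a minimal sketch of that pattern, using FCBeforeSplitModel from this patch (error handling omitted; layers carrying binary weights would additionally need reader.SetWeights):

#include <string>
#include <cpp/ie_cnn_net_reader.h>
#include "test_irs.hpp"

// Parse one of the in-memory IR strings into a CNNNetwork.
InferenceEngine::CNNNetwork loadFCBeforeSplit() {
    std::string model = GNATestIRs::FCBeforeSplitModel();
    InferenceEngine::CNNNetReader reader;
    reader.ReadNetwork(model.data(), model.length());  // IR is parsed from memory
    return reader.getNetwork();
}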
+ */ #pragma once @@ -17,6 +45,7 @@ std::string activationAfterSplitModel(); std::string FCWithPaddingAfterSplitModel(); std::string SliceModelWithAlignedOutputs(); std::string FCWithPaddingAfterSliceModel(); +std::string FCBeforeSplitModel(); std::string twoFCWithPaddingAfterSliceModel(); std::string eltwiseSummModel(); std::string eltwiseMulModel(); @@ -40,4 +69,9 @@ std::string cropWithOffsetModel(); std::string cropWithMaxOffsetModel(); std::string cropWithOffsetExtendedModel(); std::string copyModel(); +std::string two_inputs_to_affine(); +std::string two_inputs_to_concat(); +std::string affineAfterConvNoPermute(); +std::string affineAfterConvWithPermute(); +std::string ScaleShift3DModel(); } // namespace GNATestIRs diff --git a/inference-engine/tests/unit/engines/mkldnn/constant_propagation_test.cpp b/inference-engine/tests/unit/engines/mkldnn/constant_propagation_test.cpp index 5d817f8..c0c25eb 100644 --- a/inference-engine/tests/unit/engines/mkldnn/constant_propagation_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/constant_propagation_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/convert_desc_test.cpp b/inference-engine/tests/unit/engines/mkldnn/convert_desc_test.cpp index ddd2444..e4fa4fc 100644 --- a/inference-engine/tests/unit/engines/mkldnn/convert_desc_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/convert_desc_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/dump_test.cpp b/inference-engine/tests/unit/engines/mkldnn/dump_test.cpp index 042f7ac..25ec76c 100644 --- a/inference-engine/tests/unit/engines/mkldnn/dump_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/dump_test.cpp @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2016-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // #include diff --git a/inference-engine/tests/unit/engines/mkldnn/dumper_test.cpp b/inference-engine/tests/unit/engines/mkldnn/dumper_test.cpp index 383a1e7..0fc2eff 100644 --- a/inference-engine/tests/unit/engines/mkldnn/dumper_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/dumper_test.cpp @@ -1,5 +1,17 @@ -// Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2016-2018 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you (End User License Agreement for the Intel(R) Software +// Development Products (Version May 2017)). 
Unless the License provides +// otherwise, you may not use, modify, copy, publish, distribute, disclose or +// transmit this software or the related documents without Intel's prior +// written permission. +// +// This software and the related documents are provided as is, with no +// express or implied warranties, other than those that are expressly +// stated in the License. // #include @@ -29,7 +41,7 @@ public: "SomeNet", {2,3,16,16}, "FP32")) { using prm_t = map; - testing::InOutData inout = {{{2,3,16,16}},{{2,16,16,16}}}; + testing::InOutShapes inout = {{{2,3,16,16}},{{2,16,16,16}}}; prm_t conv_prm = { {"stride-x", std::to_string(1)}, @@ -96,4 +108,4 @@ TEST(MKLDNNLayersTests, DumpSimpleGraphToDot) { ASSERT_EQ(std::count(dot.begin(), dot.end(), '['), 10); // 4-node 3-data 3-shape ASSERT_EQ(std::count(dot.begin(), dot.end(), ']'), 10); ASSERT_EQ(std::count(dot.begin(), dot.end(), '>'), 6); // connection -} \ No newline at end of file +} diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/depth_to_space_tests.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/depth_to_space_tests.cpp new file mode 100644 index 0000000..d122450 --- /dev/null +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/depth_to_space_tests.cpp @@ -0,0 +1,525 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "mkldnn_plugin/mkldnn_graph.h" + +#include "test_graph.hpp" + +#include "single_layer_common.hpp" +#include +#include +#include "tests_common.hpp" + + +using namespace ::testing; +using namespace std; +using namespace mkldnn; + +struct depth_to_space_test_params { + InferenceEngine::SizeVector in_shape; + size_t block_size; + InferenceEngine::SizeVector out_shape; + + std::vector reference; + std::vector> comp; +}; + +void ref_depth_to_space( + InferenceEngine::TBlob &src, + InferenceEngine::TBlob &dst, + size_t block_size +) { + size_t i; + const float *src_data = src.data(); + InferenceEngine::SizeVector src_dims = src.getTensorDesc().getDims(); + InferenceEngine::SizeVector srcStrides = src.getTensorDesc().getBlockingDesc().getStrides(); + float* dst_data = dst.data(); + InferenceEngine::SizeVector dst_dims = dst.getTensorDesc().getDims(); + InferenceEngine::SizeVector dstStrides = dst.getTensorDesc().getBlockingDesc().getStrides(); + + if (src_dims.size() < 3) + FAIL() << " Incorrect number of input dimensions!"; + + if (dst_dims.size() < 2) + FAIL() << " Incorrect number of output dimensions!"; + + if (block_size == 0) + FAIL() << " Incorrect block_size parameter is zero!"; + + if (src_dims[src_dims.size() - 3] % (block_size * block_size)) + FAIL() << " block_size parameter is incompatible with input tensor Color dimension size!"; + + if (dst_dims.size() > 2 && src_dims[src_dims.size() - 3] != (dst_dims[dst_dims.size() - 3] * block_size * block_size)) + FAIL() << " Input/Output tensor Color dimension is incompatible with block_size!"; + + if (dst_dims[dst_dims.size() - 2] != (src_dims[src_dims.size() - 2] * block_size)) + FAIL() << " Input/Output tensor Height dimension is incompatible with block_size!"; + + if (dst_dims[dst_dims.size() - 1] != (src_dims[src_dims.size() - 1] * block_size)) + FAIL() << " Input/Output tensor Width dimension is incompatible with block_size!"; + + size_t X = 1; + for (i = 0; i < (src_dims.size() - 3); i++) + X *= src_dims[i]; + + size_t C = src_dims[src_dims.size() - 3]; + size_t H = src_dims[src_dims.size() - 2]; + size_t W = 
src_dims[src_dims.size() - 1]; + + for (size_t x = 0, k = 0; x < X; ++x) { + for (size_t h = 0; h < H; ++h) { + for (size_t c = 0; c < C; c += block_size) { + for (size_t w = 0; w < W; ++w) { + for (size_t b = 0; b < block_size; ++b) { + size_t idx = x * C*H*W + (c + b) * H*W + h * W + w; + dst_data[k++] = src_data[idx]; + } + } + } + } + } +} + +void ref_space_to_depth( + InferenceEngine::TBlob &src, + InferenceEngine::TBlob &dst, + size_t block_size +) { + size_t i; + const float *src_data = src.data(); + InferenceEngine::SizeVector src_dims = src.getTensorDesc().getDims(); + InferenceEngine::SizeVector srcStrides = src.getTensorDesc().getBlockingDesc().getStrides(); + float* dst_data = dst.data(); + InferenceEngine::SizeVector dst_dims = dst.getTensorDesc().getDims(); + InferenceEngine::SizeVector dstStrides = dst.getTensorDesc().getBlockingDesc().getStrides(); + + if (dst_dims.size() < 3) + FAIL() << " Incorrect number of output dimensions!"; + + if (src_dims.size() < 2) + FAIL() << " Incorrect number of input dimensions!"; + + if (block_size == 0) + FAIL() << " Incorrect block_size parameter is zero!"; + + if (dst_dims[dst_dims.size() - 3] % (block_size * block_size)) + FAIL() << " block_size parameter is incompatible with input tensor Color dimension size!"; + + if (src_dims.size() > 2 && dst_dims[dst_dims.size() - 3] != (src_dims[dst_dims.size() - 3] * block_size * block_size)) + FAIL() << " Input/Output tensor Color dimension is incompatible with block_size!"; + + if (src_dims[src_dims.size() - 2] != (dst_dims[dst_dims.size() - 2] * block_size)) + FAIL() << " Input/Output tensor Height dimension is incompatible with block_size!"; + + if (src_dims[src_dims.size() - 1] != (dst_dims[dst_dims.size() - 1] * block_size)) + FAIL() << " Input/Output tensor Width dimension is incompatible with block_size!"; + + size_t X = 1; + for (i = 0; i < (dst_dims.size() - 3); i++) + X *= dst_dims[i]; + + size_t C = dst_dims[dst_dims.size() - 3]; + size_t H = dst_dims[dst_dims.size() - 2]; + size_t W = dst_dims[dst_dims.size() - 1]; + + for (size_t x = 0, k = 0; x < X; ++x) { + for (size_t h = 0; h < H; ++h) { + for (size_t c = 0; c < C; c += block_size) { + for (size_t w = 0; w < W; ++w) { + for (size_t b = 0; b < block_size; ++b) { + size_t idx = x * C*H*W + (c + b) * H*W + h * W + w; + dst_data[idx] = src_data[k++]; + } + } + } + } + } +} + +class MKLDNNCPUExtDepthToSpaceTests : public TestsCommon, public WithParamInterface { + std::string model_t = R"V0G0N( + + + + + + _IN_ + + s + + + + + + _IN_ + + + + + _OUT_ + + + + + + + + +)V0G0N"; + + std::string getModel(depth_to_space_test_params p) { + std::string model = model_t; + std::string in_shape, out_shape; + + for (size_t i = 0; i < p.in_shape.size(); i++) { + in_shape += ""; + in_shape += std::to_string(p.in_shape[i]) + "\n"; + } + for (size_t i = 0; i < p.out_shape.size(); i++) { + out_shape += ""; + out_shape += std::to_string(p.out_shape[i]) + "\n"; + } + REPLACE_WITH_STR(model, "_IN_", in_shape); + REPLACE_WITH_STR(model, "_OUT_", out_shape); + REPLACE_WITH_NUM(model, "_BS_", p.block_size); + + return model; + } + +protected: + virtual void TearDown() { + } + + virtual void SetUp() { + try { + TestsCommon::SetUp(); + depth_to_space_test_params p = ::testing::WithParamInterface::GetParam(); + std::string model = getModel(p); + + InferenceEngine::CNNNetReader net_reader; + ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length())); + + InferenceEngine::Extension cpuExt(make_so_name("cpu_extension")); + 
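The ref_depth_to_space loop above moves channel blocks into spatial positions. For in_shape {1, 4, 2, 3}, block_size 2 and input values 0..23 (the pattern fill_data_dbgval writes here), it yields exactly the test0 reference vector used by the instantiations further down. A standalone re-derivation, illustrative only:

#include <cstdio>

// DepthToSpace on shape {1, C=4, H=2, W=3} with block_size = 2 and src[i] = i.
// Prints: 0 6 1 7 2 8 12 18 13 19 14 20 3 9 4 10 5 11 15 21 16 22 17 23
// which matches the `test0` reference vector below.
int main() {
    const int C = 4, H = 2, W = 3, block = 2;
    float src[C * H * W], dst[C * H * W];
    for (int i = 0; i < C * H * W; ++i) src[i] = static_cast<float>(i);
    int k = 0;
    for (int h = 0; h < H; ++h)
        for (int c = 0; c < C; c += block)
            for (int w = 0; w < W; ++w)
                for (int b = 0; b < block; ++b)
                    dst[k++] = src[(c + b) * H * W + h * W + w];  // same index math as ref_depth_to_space
    for (int i = 0; i < C * H * W; ++i) std::printf("%g ", dst[i]);
    return 0;
}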
MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager()); + extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*){})); + + MKLDNNGraphTestClass graph; + graph.CreateGraph(net_reader.getNetwork(), extMgr); + + // Output Data + InferenceEngine::OutputsDataMap out; + out = net_reader.getNetwork().getOutputsInfo(); + InferenceEngine::BlobMap outputBlobs; + + std::pair item = *out.begin(); + + InferenceEngine::TBlob::Ptr output; + output = InferenceEngine::make_shared_blob(item.second->getTensorDesc()); + output->allocate(); + outputBlobs[item.first] = output; + + // Output Reference + InferenceEngine::TBlob dst_ref(item.second->getTensorDesc()); + dst_ref.allocate(); + + // Input Data + InferenceEngine::Blob::Ptr src; + src = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::FP32, p.in_shape, InferenceEngine::TensorDesc::getLayoutByDims(p.in_shape) }); + src->allocate(); + fill_data_dbgval(src->buffer(), src->size()); + auto * srcPtr = dynamic_cast*>(src.get()); + if (srcPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + // Check results + InferenceEngine::SizeVector out_dims; + ref_depth_to_space(*srcPtr, dst_ref, p.block_size); + + // Check results + if(p.reference.size()) + if (memcmp(dst_ref.data(), &p.reference[0], p.reference.size() * sizeof(float)) != 0) + FAIL() << "Wrong result with compare TF reference!"; + + InferenceEngine::BlobMap srcs; + srcs.insert(std::pair("input", src)); + + // Infer + graph.Infer(srcs, outputBlobs); + compare(*output, dst_ref); + } catch (const InferenceEngine::details::InferenceEngineException &e) { + FAIL() << e.what(); + } + } +}; + +class MKLDNNCPUExtSpaceToDepthTests : public TestsCommon, public WithParamInterface { + std::string model_t = R"V0G0N( + + + + + + _IN_ + + s + + + + + + _IN_ + + + + + _OUT_ + + + + + + + + +)V0G0N"; + + std::string getModel(depth_to_space_test_params p) { + std::string model = model_t; + std::string in_shape, out_shape; + + for (size_t i = 0; i < p.out_shape.size(); i++) { + in_shape += ""; + in_shape += std::to_string(p.out_shape[i]) + "\n"; + } + for (size_t i = 0; i < p.in_shape.size(); i++) { + out_shape += ""; + out_shape += std::to_string(p.in_shape[i]) + "\n"; + } + REPLACE_WITH_STR(model, "_IN_", in_shape); + REPLACE_WITH_STR(model, "_OUT_", out_shape); + REPLACE_WITH_NUM(model, "_BS_", p.block_size); + + return model; + } + +protected: + virtual void TearDown() { + } + + virtual void SetUp() { + try { + TestsCommon::SetUp(); + depth_to_space_test_params p = ::testing::WithParamInterface::GetParam(); + std::string model = getModel(p); + //std::cout << model; + InferenceEngine::CNNNetReader net_reader; + ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length())); + + InferenceEngine::Extension cpuExt(make_so_name("cpu_extension")); + MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager()); + extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*) {})); + + MKLDNNGraphTestClass graph; + graph.CreateGraph(net_reader.getNetwork(), extMgr); + + // Output Data + InferenceEngine::OutputsDataMap out; + out = net_reader.getNetwork().getOutputsInfo(); + InferenceEngine::BlobMap outputBlobs; + + std::pair item = *out.begin(); + + InferenceEngine::TBlob::Ptr output; + output = InferenceEngine::make_shared_blob(item.second->getTensorDesc()); + output->allocate(); + outputBlobs[item.first] = output; + + // Output Reference + 
InferenceEngine::TBlob dst_ref(item.second->getTensorDesc()); + dst_ref.allocate(); + + // Input Data + InferenceEngine::Blob::Ptr src; + src = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::FP32, p.out_shape, InferenceEngine::TensorDesc::getLayoutByDims(p.out_shape) }); + src->allocate(); + if (p.reference.size()) + memcpy(static_cast(src->buffer()), &p.reference[0], sizeof(float)*p.reference.size()); + auto * srcPtr = dynamic_cast*>(src.get()); + if (srcPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + // Check results + InferenceEngine::SizeVector out_dims; + ref_space_to_depth(*srcPtr, dst_ref, p.block_size); + + // Check results + if (p.reference.size()) { + // fill_data_dbgval(src->buffer(), src->size()); + // if (memcmp(dst_ref.data(), &p.reference[0], p.reference.size() * sizeof(float)) != 0) + // FAIL() << "Wrong result with compare TF reference!"; + } + + InferenceEngine::BlobMap srcs; + srcs.insert(std::pair("input", src)); + + // Infer + graph.Infer(srcs, outputBlobs); + compare(*output, dst_ref); + } + catch (const InferenceEngine::details::InferenceEngineException &e) { + FAIL() << e.what(); + } + } +}; + + + +class MKLDNNCPUExtDepthToSpaceToDepthTests : public TestsCommon, public WithParamInterface { + std::string model_t = R"V0G0N( + + + + + + _IN_ + + s + + + + + + _IN_ + + + + + _OUT_ + + + + + + + + _OUT_ + + + + + _IN_ + + + + + + + + + +)V0G0N"; + + std::string getModel(depth_to_space_test_params p) { + std::string model = model_t; + std::string in_shape, out_shape; + + for (size_t i = 0; i < p.in_shape.size(); i++) { + in_shape += ""; + in_shape += std::to_string(p.in_shape[i]) + "\n"; + } + for (size_t i = 0; i < p.out_shape.size(); i++) { + out_shape += ""; + out_shape += std::to_string(p.out_shape[i]) + "\n"; + } + REPLACE_WITH_STR(model, "_IN_", in_shape); + REPLACE_WITH_STR(model, "_OUT_", out_shape); + REPLACE_WITH_NUM(model, "_BS_", p.block_size); + + return model; + } + +protected: + virtual void TearDown() { + } + + virtual void SetUp() { + try { + TestsCommon::SetUp(); + depth_to_space_test_params p = ::testing::WithParamInterface::GetParam(); + std::string model = getModel(p); + + InferenceEngine::CNNNetReader net_reader; + ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length())); + + InferenceEngine::Extension cpuExt(make_so_name("cpu_extension")); + MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager()); + extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*) {})); + + MKLDNNGraphTestClass graph; + graph.CreateGraph(net_reader.getNetwork(), extMgr); + + // Output Data + InferenceEngine::OutputsDataMap out; + out = net_reader.getNetwork().getOutputsInfo(); + InferenceEngine::BlobMap outputBlobs; + + std::pair item = *out.begin(); + + InferenceEngine::TBlob::Ptr output; + output = InferenceEngine::make_shared_blob(item.second->getTensorDesc()); + output->allocate(); + outputBlobs[item.first] = output; + + // Input Data + InferenceEngine::Blob::Ptr src; + src = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::FP32, p.in_shape, InferenceEngine::TensorDesc::getLayoutByDims(p.in_shape) }); + src->allocate(); + fill_data_dbgval(src->buffer(), src->size()); + auto * srcPtr = dynamic_cast*>(src.get()); + if (srcPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + InferenceEngine::BlobMap srcs; + srcs.insert(std::pair("input", src)); + + // Infer + graph.Infer(srcs, outputBlobs); + compare(*output, *src); + } + catch (const 
InferenceEngine::details::InferenceEngineException &e) { + FAIL() << e.what(); + } + } +}; + +TEST_P(MKLDNNCPUExtDepthToSpaceTests, TestsDepthToSpace) {} +// Test data vectors +static std::vector test0 = { 0.f, 6.f, 1.f, 7.f, 2.f, 8.f, 12.f, 18.f, 13.f, 19.f, 14.f, 20.f, 3.f, 9.f, 4.f, 10.f, 5.f, 11.f, 15.f, 21.f, 16.f, 22.f, 17.f, 23.f}; +INSTANTIATE_TEST_CASE_P( + TestsDepthToSpace, MKLDNNCPUExtDepthToSpaceTests, + ::testing::Values( +// Params: in_shape, block_size, out_shape, reference + depth_to_space_test_params{ { 1, 4, 2, 3 }, 2, { 1, 1, 4, 6 }, test0 }, + depth_to_space_test_params{ { 4, 2, 3 }, 2, { 1, 1, 4, 6 }, test0 }, + depth_to_space_test_params{ { 1, 4, 2, 3 }, 2, { 4, 6 }, test0 }, + depth_to_space_test_params{ { 4, 2, 3 }, 2, { 4, 6 }, test0 }, + depth_to_space_test_params{ { 5, 4, 2, 3 }, 2, { 5, 1, 4, 6 }, test0 }, + depth_to_space_test_params{ { 2, 3, 5, 4, 2, 3 }, 2, { 2, 3, 5, 1, 4, 6 }, test0 } +)); + + +TEST_P(MKLDNNCPUExtDepthToSpaceToDepthTests, TestsDepthToSpaceToDepth) {} +INSTANTIATE_TEST_CASE_P( + TestsDepthToSpaceToDepth, MKLDNNCPUExtDepthToSpaceToDepthTests, + ::testing::Values( + // Params: in_shape, block_size, out_shape, reference + depth_to_space_test_params{ { 1, 9, 2, 3 }, 3,{ 1, 1, 6, 9 },{} }, + depth_to_space_test_params{ { 16, 2, 3 }, 4,{ 1, 1, 8, 12 },{} }, + depth_to_space_test_params{ { 1, 25, 4, 3 }, 5,{ 20, 15 },{} }, + depth_to_space_test_params{ { 72, 10, 3 }, 6,{ 2, 60, 18 },{} }, + depth_to_space_test_params{ { 5, 8, 2, 3 }, 2,{ 5, 2, 4, 6 },{} }, + depth_to_space_test_params{ { 2, 3, 5, 16, 2, 3 }, 2,{ 2, 3, 5, 4, 4, 6 },{} } +)); diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/expand_tests.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/expand_tests.cpp new file mode 100644 index 0000000..4db82c9 --- /dev/null +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/expand_tests.cpp @@ -0,0 +1,265 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "mkldnn_plugin/mkldnn_graph.h" + +#include "test_graph.hpp" + +#include "single_layer_common.hpp" +#include +#include +#include "tests_common.hpp" + + +using namespace ::testing; +using namespace std; +using namespace mkldnn; + +struct expand_test_params { + std::string precision; + InferenceEngine::SizeVector in_shape; + InferenceEngine::SizeVector out_shape; + + std::vector> comp; +}; + + +template +void ref_expand(InferenceEngine::TBlob &src, InferenceEngine::TBlob &dst) { + size_t i; + const data_t *src_data = src.data(); + InferenceEngine::SizeVector src_dims = src.getTensorDesc().getDims(); + InferenceEngine::SizeVector srcStrides = src.getTensorDesc().getBlockingDesc().getStrides(); + data_t* dst_data = dst.data(); + InferenceEngine::SizeVector dst_dims = dst.getTensorDesc().getDims(); + InferenceEngine::SizeVector dstStrides = dst.getTensorDesc().getBlockingDesc().getStrides(); + + if (src_dims.size() > dst_dims.size()) + FAIL() << "Output tensor dimension is smaller then input tensor dimension"; + + size_t prefix_size = dst_dims.size() - src_dims.size(); + for (i = 0; i < src_dims.size(); i++) { + if (src_dims[i] != 1 && src_dims[i] != dst_dims[i + prefix_size]) + FAIL() << "In/Output corresponding dimension must have the same value, or Input dimension is equal to 1"; + } + + InferenceEngine::SizeVector src_aligned(dst_dims.size()); + InferenceEngine::SizeVector srcStrides_aligned(dst_dims.size()); + for (i = 0; i < dst_dims.size(); i++) { + if 
(i < prefix_size) { + src_aligned[i] = 1; + srcStrides_aligned[i] = srcStrides[0]; + } else { + src_aligned[i] = src_dims[i - prefix_size]; + srcStrides_aligned[i] = srcStrides[i - prefix_size]; + } + } + + size_t src_idx, work_amount_dst = dstStrides[0] * dst_dims[0]; + InferenceEngine::SizeVector counters(dst_dims.size(), 0); + + for (size_t iwork = 0; iwork < work_amount_dst; ++iwork) { + for (i = 0, src_idx = 0; i < dst_dims.size(); ++i) + src_idx += counters[i] ? ((counters[i] % src_aligned[i]) * srcStrides_aligned[i]) : 0; + + dst_data[iwork] = src_data[src_idx]; + + for (int j = dst_dims.size() - 1; j >= 0; j--) { + counters[j] = (counters[j] + 1) % dst_dims[j]; + if (counters[j] != 0) break; + } + } +} + + +class MKLDNNCPUExtExpandTests : public TestsCommon, public WithParamInterface { + std::string model_t = R"V0G0N( + + + + + + _IN_ + + + + + + + _DIM_SIZE_ + + + + + + + + _IN_ + + + _DIM_SIZE_ + + + + + _OUT_ + + + + + + + + + +)V0G0N"; + + std::string getModel(expand_test_params p) { + std::string model = model_t; + std::string in_shape; + std::string out_shape; + + REPLACE_WITH_STR(model, "_IIDXP_", p.precision); + for (size_t i = 0; i < p.in_shape.size(); i++) { + in_shape += ""; + in_shape += std::to_string(p.in_shape[i]) + "\n"; + } + REPLACE_WITH_STR(model, "_IN_", in_shape); + for (size_t i = 0; i < p.out_shape.size(); i++) { + out_shape += ""; + out_shape += std::to_string(p.out_shape[i]) + "\n"; + } + REPLACE_WITH_STR(model, "_OUT_", out_shape); + REPLACE_WITH_NUM(model, "_DIM_SIZE_", p.out_shape.size()); + + return model; + } + +protected: + virtual void TearDown() { + } + + virtual void SetUp() { + try { + TestsCommon::SetUp(); + expand_test_params p = ::testing::WithParamInterface::GetParam(); + std::string model = getModel(p); + + InferenceEngine::CNNNetReader net_reader; + ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length())); + + InferenceEngine::Extension cpuExt(make_so_name("cpu_extension")); + MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager()); + extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*) {})); + + MKLDNNGraphTestClass graph; + graph.CreateGraph(net_reader.getNetwork(), extMgr); + + // Output Data + InferenceEngine::OutputsDataMap out; + out = net_reader.getNetwork().getOutputsInfo(); + InferenceEngine::BlobMap outputBlobs; + + // Input Data + InferenceEngine::Blob::Ptr dims; + InferenceEngine::SizeVector vector_dim(1, p.out_shape.size()); + dims = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::I32, vector_dim, InferenceEngine::TensorDesc::getLayoutByDims(vector_dim) }); + dims->allocate(); + for (size_t i = 0; i < p.out_shape.size(); i++) { + static_cast(dims->buffer())[i] = static_cast(p.out_shape[i]); + } + auto * dimsPtr = dynamic_cast*>(dims.get()); + if (dimsPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + InferenceEngine::BlobMap srcs; + InferenceEngine::Blob::Ptr src; + std::pair item = *out.begin(); + if (p.precision == "I32") { + src = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::I32, p.in_shape, InferenceEngine::TensorDesc::getLayoutByDims(p.in_shape) }); + src->allocate(); + for (size_t i = 0; i < src->size(); i++) + static_cast(src->buffer())[i] = static_cast(i); + auto * srcPtr = dynamic_cast*>(src.get()); + if (srcPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + srcs.insert(std::pair("input", src)); + srcs.insert(std::pair("shape", dims)); + + // Output Blob + 
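ref_expand above implements numpy-style broadcasting: output coordinates are taken modulo the right-aligned input dimensions, so size-1 axes repeat. Reduced to a {2, 1} to {2, 3} case (an illustrative sketch, not code from the patch):

#include <cstdio>
#include <vector>

// Broadcasting a {2, 1} tensor to {2, 3}: the size-1 column axis repeats.
// Prints: 10 10 10 20 20 20
int main() {
    std::vector<float> src = {10.f, 20.f};  // shape {2, 1}
    std::vector<float> dst(2 * 3);          // shape {2, 3}
    for (int r = 0; r < 2; ++r)
        for (int c = 0; c < 3; ++c)
            dst[r * 3 + c] = src[r];        // source column index is always c % 1 == 0
    for (float v : dst) std::printf("%g ", v);
    return 0;
}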
InferenceEngine::TBlob::Ptr output; + output = InferenceEngine::make_shared_blob(item.second->getTensorDesc()); + output->allocate(); + outputBlobs[item.first] = output; + + // Output Reference + InferenceEngine::TBlob dst_ref(item.second->getTensorDesc()); + dst_ref.allocate(); + ref_expand(*srcPtr, dst_ref); + + // Infer + graph.Infer(srcs, outputBlobs); + for (int i = 0; i < dst_ref.size(); i++) { + if (dst_ref.data()[i] != (*output).data()[i]) + FAIL() << "The difference between res_ptr[i] and ref_ptr[i]"; + } + } + else if (p.precision == "FP32") { + src = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::FP32, p.in_shape, InferenceEngine::TensorDesc::getLayoutByDims(p.in_shape) }); + src->allocate(); + fill_data_dbgval(src->buffer(), src->size()); + auto * srcPtr = dynamic_cast*>(src.get()); + if (srcPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + srcs.insert(std::pair("input", src)); + srcs.insert(std::pair("shape", dims)); + + // Output Blob + InferenceEngine::TBlob::Ptr output; + output = InferenceEngine::make_shared_blob(item.second->getTensorDesc()); + output->allocate(); + outputBlobs[item.first] = output; + + // Output Reference + InferenceEngine::TBlob dst_ref(item.second->getTensorDesc()); + dst_ref.allocate(); + ref_expand(*srcPtr, dst_ref); + + // Infer + graph.Infer(srcs, outputBlobs); + compare(*output, dst_ref); + } + else { + return; + } + } + catch (const InferenceEngine::details::InferenceEngineException &e) { + FAIL() << e.what(); + } + } +}; + +TEST_P(MKLDNNCPUExtExpandTests, TestsExpand) {} + +INSTANTIATE_TEST_CASE_P( + TestsExpand, MKLDNNCPUExtExpandTests, + ::testing::Values( + // Params: precision, in_shape, out_shape + expand_test_params{ "I32", { 1 }, { 2, 3, 4 } }, + expand_test_params{ "I32", { 4, 1, 2 }, { 4, 2, 2 } }, + expand_test_params{ "I32", { 4, 2, 1 }, { 4, 2, 2 } }, + expand_test_params{ "I32", { 4, 2 }, { 2, 4, 2 } }, + expand_test_params{ "I32", { 4, 1, 1 }, { 4, 2, 1 } }, + expand_test_params{ "I32", { 2, 1, 3, 1 },{ 2, 2, 2, 3, 1 } }, + expand_test_params{"FP32", { 1 }, { 2, 3, 4 } }, + expand_test_params{"FP32", { 4, 1, 2 }, { 4, 2, 2 } }, + expand_test_params{"FP32", { 4, 2, 1 }, { 4, 2, 2 } }, + expand_test_params{"FP32", { 4, 2 }, { 2, 4, 2 } }, + expand_test_params{"FP32", { 4, 1, 1 }, { 4, 2, 1 } }, + expand_test_params{"FP32", { 2, 1, 3, 1 },{ 2, 2, 2, 3, 1 } } +)); diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/fake_layer.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/fake_layer.cpp index 4e22a72..1b9d936 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/fake_layer.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/fake_layer.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -44,8 +44,8 @@ class FakeExtensions : public IExtension { void GetVersion(const Version *&versionInfo) const noexcept override { static Version ExtensionDescription = { - {1, 0}, // extension API version - "1.0", + {1, 6}, // extension API version + "1.6", "ie-cpu-ext" // extension description message }; diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/fill_tests.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/fill_tests.cpp new file mode 100644 index 0000000..55dc9d3 --- /dev/null +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/fill_tests.cpp 
@@ -0,0 +1,202 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "mkldnn_plugin/mkldnn_graph.h" + +#include "test_graph.hpp" + +#include "single_layer_common.hpp" +#include +#include +#include "tests_common.hpp" + + +using namespace ::testing; +using namespace std; +using namespace mkldnn; + +struct fill_test_params { + std::string precision; + InferenceEngine::SizeVector out_shape; + float value; + + std::vector> comp; +}; + +class MKLDNNCPUExtFillTests : public TestsCommon, public WithParamInterface { + std::string model_t = R"V0G0N( + + + + + + _DIM_SIZE_ + + + + + + + 1 + + + + + + + + _DIM_SIZE_ + + + 1 + + + + + _OUT_ + + + + + + + + + +)V0G0N"; + + std::string getModel(fill_test_params p) { + std::string model = model_t; + std::string out_shape; + + REPLACE_WITH_STR(model, "_IIDXP_", p.precision); + for (size_t i = 0; i < p.out_shape.size(); i++) { + out_shape += ""; + out_shape += std::to_string(p.out_shape[i]) + "\n"; + } + REPLACE_WITH_STR(model, "_OUT_", out_shape); + REPLACE_WITH_NUM(model, "_DIM_SIZE_", p.out_shape.size()); + + return model; + } + +protected: + virtual void TearDown() { + } + + virtual void SetUp() { + try { + TestsCommon::SetUp(); + fill_test_params p = ::testing::WithParamInterface::GetParam(); + std::string model = getModel(p); + + InferenceEngine::CNNNetReader net_reader; + ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length())); + + InferenceEngine::Extension cpuExt(make_so_name("cpu_extension")); + MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager()); + extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*){})); + + MKLDNNGraphTestClass graph; + graph.CreateGraph(net_reader.getNetwork(), extMgr); + + // Output Data + InferenceEngine::OutputsDataMap out; + out = net_reader.getNetwork().getOutputsInfo(); + InferenceEngine::BlobMap outputBlobs; + + // Input Data + InferenceEngine::Blob::Ptr dims; + InferenceEngine::SizeVector vector_dim(1, p.out_shape.size()); + dims = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::I32, vector_dim, InferenceEngine::TensorDesc::getLayoutByDims(vector_dim) }); + dims->allocate(); + for (size_t i = 0; i < p.out_shape.size(); i++) { + static_cast(dims->buffer())[i] = static_cast(p.out_shape[i]); + } + auto * srcPtr = dynamic_cast*>(dims.get()); + if (srcPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + InferenceEngine::BlobMap srcs; + InferenceEngine::Blob::Ptr value_scalar; + InferenceEngine::SizeVector value_scalar_dim(1, 1); + std::pair item = *out.begin(); + if (p.precision == "I32") { + value_scalar = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::I32, value_scalar_dim, InferenceEngine::TensorDesc::getLayoutByDims(value_scalar_dim) }); + value_scalar->allocate(); + static_cast(value_scalar->buffer())[0] = static_cast(p.value); + auto * value_scalarPtr = dynamic_cast*>(value_scalar.get()); + if (value_scalarPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + srcs.insert(std::pair("dims", dims)); + srcs.insert(std::pair("value", value_scalar)); + + // Output Blob + InferenceEngine::TBlob::Ptr output; + output = InferenceEngine::make_shared_blob(item.second->getTensorDesc()); + output->allocate(); + outputBlobs[item.first] = output; + + // Output Reference + InferenceEngine::TBlob dst_ref(item.second->getTensorDesc()); + dst_ref.allocate(); + std::fill_n(static_cast(dst_ref.data()), dst_ref.size(), 
static_cast(p.value)); + + // Infer + graph.Infer(srcs, outputBlobs); + for (int i = 0; i < dst_ref.size(); i++) { + if(dst_ref.data()[i] != (*output).data()[i]) + FAIL() << "The difference between res_ptr[i] and ref_ptr[i]"; + } + } else if (p.precision == "FP32") { + value_scalar = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::FP32, value_scalar_dim, InferenceEngine::TensorDesc::getLayoutByDims(value_scalar_dim) }); + value_scalar->allocate(); + static_cast(value_scalar->buffer())[0] = p.value; + auto * value_scalarPtr = dynamic_cast*>(value_scalar.get()); + if (value_scalarPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + srcs.insert(std::pair("dims", dims)); + srcs.insert(std::pair("value", value_scalar)); + + // Output Blob + InferenceEngine::TBlob::Ptr output; + output = InferenceEngine::make_shared_blob(item.second->getTensorDesc()); + output->allocate(); + outputBlobs[item.first] = output; + + // Output Reference + InferenceEngine::TBlob dst_ref(item.second->getTensorDesc()); + dst_ref.allocate(); + std::fill_n(static_cast(dst_ref.data()), dst_ref.size(), p.value); + + // Infer + graph.Infer(srcs, outputBlobs); + compare(*output, dst_ref); + } else { + return; + } + } catch (const InferenceEngine::details::InferenceEngineException &e) { + FAIL() << e.what(); + } + } +}; + +TEST_P(MKLDNNCPUExtFillTests, TestsFill) {} + +INSTANTIATE_TEST_CASE_P( + TestsFill, MKLDNNCPUExtFillTests, + ::testing::Values( +// Params: precision, value, out_shape + fill_test_params{ "I32", { 1 }, 1.f }, + fill_test_params{ "I32", { 1, 3, 1 }, 1.f }, + fill_test_params{ "I32", { 2, 3, 6 }, -1.f }, + fill_test_params{"FP32", { 2, 3, 6 }, -1.f }, + fill_test_params{"FP32", { 1 }, 1.f }, + fill_test_params{"FP32", { 1, 3, 1, 2 }, .5f }, + fill_test_params{"FP32", { 4, 3, 2, 5, 4, 2 }, .25f } + )); diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/gather_tests.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/gather_tests.cpp index b4300fb..d92a4f2 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/gather_tests.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/gather_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -32,13 +32,6 @@ struct gather_test_params { std::vector> comp; }; - -inline void clipping(int *idx, const int min, const int max) { - (*idx) = ((*idx) > min) ? (*idx) : min; - (*idx) = ((*idx) < max) ? 
(*idx) : (max - 1);
-    return;
-}
-
 template <typename data_t>
 void ref_gather(InferenceEngine::TBlob<data_t> &srcIdx, InferenceEngine::TBlob<float> &srcDct, InferenceEngine::TBlob<float> &dst, size_t axis) {
     size_t i, j;
@@ -70,15 +63,20 @@ void ref_gather(InferenceEngine::TBlob<data_t> &srcIdx, InferenceEngine::TBlob<float> &srcDct,
-        int idx = static_cast<int>(src_dataIdx[i]);
+        unsigned int idx = static_cast<unsigned int>(src_dataIdx[i]);
 
         // Index clipping
-        clipping(&idx, 0, indexRange);
-
-        // Copying data to destination from Dictionary
-        for (j = 0; j < numDictionaries; j++) {
-            memcpy(&dst_data[dataLength * (i + j * src_size)],
-                   &src_dataDict[dataLength * (idx + j * indexRange)], sizeof(float)*dataLength);
+        if (idx < indexRange)
+        {
+            // Copying data to destination from Dictionary
+            for (j = 0; j < numDictionaries; j++) {
+                memcpy(&dst_data[dataLength * (i + j * src_size)],
+                       &src_dataDict[dataLength * (idx + j * indexRange)], sizeof(float) * dataLength);
+            }
+        } else {
+            for (j = 0; j < numDictionaries; j++) {
+                std::fill_n(&dst_data[dataLength * (i + j * src_size)], dataLength, 0.0f);
+            }
         }
     }
 }
@@ -313,9 +311,6 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::Values(
                 gather_test_params{ "FP32", {1, 1, 12, 256}, {1, 1, 71, 16}, 0, {1, 12, 256, 16}, 1, MKLDNNPlugin::impl_desc_type::unknown },
                 gather_test_params{ "I32", {1, 1, 12, 256}, {1, 1, 71, 16}, 0, {1, 12, 256, 16}, 1, MKLDNNPlugin::impl_desc_type::unknown },
-                gather_test_params{ "I16", {1, 1, 12, 256}, {1, 1, 71, 16}, 0, {1, 12, 256, 16}, 1, MKLDNNPlugin::impl_desc_type::unknown },
-                gather_test_params{ "U8", {1, 1, 12, 256}, {1, 1, 71, 16}, 0, {1, 12, 256, 16}, 1, MKLDNNPlugin::impl_desc_type::unknown },
-                gather_test_params{ "I8", {1, 1, 12, 256}, {1, 1, 71, 16}, 0, {1, 12, 256, 16}, 1, MKLDNNPlugin::impl_desc_type::unknown },
                 gather_test_params{ "I32", {12, 256}, {71, 16}, 0, {12, 256, 16}, 1, MKLDNNPlugin::impl_desc_type::unknown },
                 gather_test_params{ "I32", {3, 4}, {2, 5, 6}, 0, {3, 4, 5, 6}, 1, MKLDNNPlugin::impl_desc_type::unknown },
                 gather_test_params{ "I32", {3, 4}, {5, 1}, 0, {3, 4, 1}, 1, MKLDNNPlugin::impl_desc_type::unknown },
diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/graph_generic_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/graph_generic_test.cpp
index 49e62bc..793d43a 100644
--- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/graph_generic_test.cpp
+++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/graph_generic_test.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/interp_tests.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/interp_tests.cpp
index 6bc9b75..94e0d35 100644
--- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/interp_tests.cpp
+++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/interp_tests.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -136,7 +136,7 @@ class MKLDNNCPUExtInterpTests: public TestsCommon, public WithParamInterface<interp_test_params> {
-
+
diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/mvn_tests.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/mvn_tests.cpp
index 84511a1..bb31c09 100644
--- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/mvn_tests.cpp
+++
b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/mvn_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/range_tests.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/range_tests.cpp new file mode 100644 index 0000000..292c99b --- /dev/null +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/range_tests.cpp @@ -0,0 +1,255 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "mkldnn_plugin/mkldnn_graph.h" + +#include "test_graph.hpp" + +#include "single_layer_common.hpp" +#include +#include +#include "tests_common.hpp" + + +using namespace ::testing; +using namespace std; +using namespace mkldnn; + +struct range_test_params { + std::string precision; + float start; + float limit; + float delta; + InferenceEngine::SizeVector out_shape; + + std::vector> comp; +}; + +template +void ref_range( + float start, + float limit, + float delta, + InferenceEngine::TBlob &dst +) { + data_t* dst_data = dst.data(); + size_t work_amount_dst = std::floor(std::abs((limit - start) / delta)); + if (work_amount_dst != dst.size()) + FAIL() << "Range indexes exceeds data tensor dimension"; + + data_t dst_value = static_cast(start); + for (size_t iwork = 0; iwork < work_amount_dst; ++iwork, dst_value += static_cast(delta)) { + dst_data[iwork] = dst_value; + } +} + +class MKLDNNCPUExtRangeTests : public TestsCommon, public WithParamInterface { + std::string model_t = R"V0G0N( + + + + + + 1 + + + + + + + 1 + + + + + + + 1 + + + + + + + + 1 + + + 1 + + + 1 + + + + + _OUT_ + + + + + + + + + + +)V0G0N"; + + std::string getModel(range_test_params p) { + std::string model = model_t; + std::string out_shape; + + REPLACE_WITH_STR(model, "_IIDXP_", p.precision); + for (size_t i = 0; i < p.out_shape.size(); i++) { + out_shape += ""; + out_shape += std::to_string(p.out_shape[i]) + "\n"; + } + REPLACE_WITH_STR(model, "_OUT_", out_shape); + + return model; + } + +protected: + virtual void TearDown() { + } + + virtual void SetUp() { + try { + TestsCommon::SetUp(); + range_test_params p = ::testing::WithParamInterface::GetParam(); + std::string model = getModel(p); + + InferenceEngine::CNNNetReader net_reader; + ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length())); + + InferenceEngine::Extension cpuExt(make_so_name("cpu_extension")); + MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager()); + extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*){})); + + MKLDNNGraphTestClass graph; + graph.CreateGraph(net_reader.getNetwork(), extMgr); + + // Output Data + InferenceEngine::OutputsDataMap out; + out = net_reader.getNetwork().getOutputsInfo(); + InferenceEngine::BlobMap outputBlobs; + + // Input Data + InferenceEngine::Blob::Ptr start_scalar; + InferenceEngine::Blob::Ptr limit_scalar; + InferenceEngine::Blob::Ptr delta_scalar; + std::pair item = *out.begin(); + InferenceEngine::SizeVector scalar_dim(1, 1); + InferenceEngine::BlobMap srcs; + InferenceEngine::SizeVector out_dims; + if (p.precision == "I32") { + start_scalar = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::I32, scalar_dim, InferenceEngine::TensorDesc::getLayoutByDims(scalar_dim) }); + start_scalar->allocate(); + static_cast(start_scalar->buffer())[0] 
= static_cast(p.start); + auto * start_scalarPtr = dynamic_cast*>(start_scalar.get()); + if (start_scalarPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + limit_scalar = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::I32, scalar_dim, InferenceEngine::TensorDesc::getLayoutByDims(scalar_dim) }); + limit_scalar->allocate(); + static_cast(limit_scalar->buffer())[0] = static_cast(p.limit); + auto * limit_scalarPtr = dynamic_cast*>(limit_scalar.get()); + if (limit_scalarPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + delta_scalar = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::I32, scalar_dim, InferenceEngine::TensorDesc::getLayoutByDims(scalar_dim) }); + delta_scalar->allocate(); + static_cast(delta_scalar->buffer())[0] = static_cast(p.delta); + auto * delta_scalarPtr = dynamic_cast*>(delta_scalar.get()); + if (delta_scalarPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + srcs.insert(std::pair("start", start_scalar)); + srcs.insert(std::pair("limit", limit_scalar)); + srcs.insert(std::pair("delta", delta_scalar)); + + // Output Blob + InferenceEngine::TBlob::Ptr output; + output = InferenceEngine::make_shared_blob(item.second->getTensorDesc()); + output->allocate(); + outputBlobs[item.first] = output; + + // Output Reference + InferenceEngine::TBlob dst_ref(item.second->getTensorDesc()); + dst_ref.allocate(); + ref_range(p.start, p.limit, p.delta, dst_ref); + + // Infer + graph.Infer(srcs, outputBlobs); + for (int i = 0; i < dst_ref.size(); i++) { + if (dst_ref.data()[i] != (*output).data()[i]) + FAIL() << "The difference between res_ptr[i] and ref_ptr[i]"; + } + } else if (p.precision == "FP32") { + start_scalar = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::FP32, scalar_dim, InferenceEngine::TensorDesc::getLayoutByDims(scalar_dim) }); + start_scalar->allocate(); + static_cast(start_scalar->buffer())[0] = p.start; + auto * start_scalarPtr = dynamic_cast*>(start_scalar.get()); + if (start_scalarPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + limit_scalar = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::FP32, scalar_dim, InferenceEngine::TensorDesc::getLayoutByDims(scalar_dim) }); + limit_scalar->allocate(); + static_cast(limit_scalar->buffer())[0] = p.limit; + auto * limit_scalarPtr = dynamic_cast*>(limit_scalar.get()); + if (limit_scalarPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + delta_scalar = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::FP32, scalar_dim, InferenceEngine::TensorDesc::getLayoutByDims(scalar_dim) }); + delta_scalar->allocate(); + static_cast(delta_scalar->buffer())[0] = p.delta; + auto * delta_scalarPtr = dynamic_cast*>(delta_scalar.get()); + if (delta_scalarPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + srcs.insert(std::pair("start", start_scalar)); + srcs.insert(std::pair("limit", limit_scalar)); + srcs.insert(std::pair("delta", delta_scalar)); + + // Output Blob + InferenceEngine::Blob::Ptr output; + output = InferenceEngine::make_shared_blob(item.second->getTensorDesc()); + output->allocate(); + outputBlobs[item.first] = output; + + // Output Reference + InferenceEngine::TBlob dst_ref(item.second->getTensorDesc()); + dst_ref.allocate(); + ref_range(p.start, p.limit, p.delta, dst_ref); + + // Infer + graph.Infer(srcs, outputBlobs); + compare(*output, dst_ref); + } else { + return; + } + } catch (const InferenceEngine::details::InferenceEngineException &e) { + FAIL() << e.what(); + } + } +}; + 
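+// ref_range above sizes its output as floor(abs((limit - start) / delta)), so
+// the out_shape entries in the parameter list below can be sanity-checked with
+// a one-liner; a hedged sketch with a hypothetical helper name, assuming only
+// <cmath> and <cstddef>:
+//
+//     inline std::size_t range_size(float start, float limit, float delta) {
+//         return static_cast<std::size_t>(std::floor(std::abs((limit - start) / delta)));
+//     }
+//
+// e.g. range_size(3.f, 18.f, 3.f) == 5 and range_size(3.f, -3.f, -1.f) == 6,
+// matching the { 5 } and { 6 } shapes below.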
+TEST_P(MKLDNNCPUExtRangeTests, TestsRange) {} + +INSTANTIATE_TEST_CASE_P( + TestsRange, MKLDNNCPUExtRangeTests, + ::testing::Values( +// Params: precision, start, limit, delta, out_shape + range_test_params{ "I32", 3.f, 18.f, 3.f, { 5 } }, + range_test_params{ "I32", 3.f, 1.f, -1.f, { 2 } }, + range_test_params{ "I32", 3.f, -3.f, -1.f, { 6 } }, + range_test_params{ "I32", 0.f, 5.f, 1.f, { 5 } }, + range_test_params{"FP32", 3.f, 18.f, 3.f, { 5 } }, + range_test_params{"FP32", 3.f, 1.f, -.5f, { 4 } }, + range_test_params{"FP32", 3.f, -1.f, -.5f, { 8 } }, + range_test_params{"FP32", 0.f, 5.f, 1.f, { 5 } } + )); diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/resample_tests.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/resample_tests.cpp index f3e4bad..1494731 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/resample_tests.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/resample_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/reverse_sequence_tests.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/reverse_sequence_tests.cpp new file mode 100644 index 0000000..66ee38b --- /dev/null +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/reverse_sequence_tests.cpp @@ -0,0 +1,273 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "mkldnn_plugin/mkldnn_graph.h" + +#include "test_graph.hpp" + +#include "single_layer_common.hpp" +#include +#include +#include "tests_common.hpp" + + +using namespace ::testing; +using namespace std; +using namespace mkldnn; + + +struct reverse_sequence_test_params { + std::string inIdxPrecision; + InferenceEngine::SizeVector in_out_shape; + std::vector seq_lengths; + int seq_axis; + int batch_axis; + std::vector reference; + + std::vector> comp; +}; + +template +void ref_reverse_sequence( + InferenceEngine::TBlob &src, + InferenceEngine::TBlob &seq_lengths, + InferenceEngine::TBlob &dst, + int seq_axis, + int batch_axis +) { + size_t i, src_idx; + const float *src_data = src.data(); + InferenceEngine::SizeVector src_dims = src.getTensorDesc().getDims(); + InferenceEngine::SizeVector srcStrides = src.getTensorDesc().getBlockingDesc().getStrides(); + const data_t *seq_lengths_data = seq_lengths.data(); + InferenceEngine::SizeVector seq_lengths_dims = seq_lengths.getTensorDesc().getDims(); + float* dst_data = dst.data(); + + if (seq_axis < 0) + seq_axis += src_dims.size(); + + if (seq_axis < 0 || seq_axis >= src_dims.size()) + FAIL() << "Incorrect 'seq_axis' parameters dimensions and axis number!"; + + if (batch_axis < 0) + batch_axis += src_dims.size(); + + if (batch_axis < 0 || batch_axis >= src_dims.size()) + FAIL() << "Incorrect 'batch_axis' parameters dimensions and axis number!"; + + for (i = 0; i < src_dims[batch_axis]; i++) { + if (static_cast(seq_lengths_data[i]) > src_dims[seq_axis]) + FAIL() << "Incorrect input 'seq_lengths' values!"; + } + + size_t work_amount_dst = srcStrides[0] * src_dims[0]; + InferenceEngine::SizeVector counters(src_dims.size(), 0); + for (size_t iwork = 0; iwork < work_amount_dst; ++iwork) { + for (i = 0, src_idx = 0; i < src_dims.size(); ++i) { + size_t idx = counters[i]; + if (i == seq_axis && idx < 
static_cast(seq_lengths_data[counters[batch_axis]])) { + idx = static_cast(seq_lengths_data[counters[batch_axis]]) - idx - 1; + } + src_idx += idx * srcStrides[i]; + } + + dst_data[iwork] = src_data[src_idx]; + + for (int j = src_dims.size() - 1; j >= 0; j--) { + counters[j] = (counters[j] + 1) % src_dims[j]; + if (counters[j] != 0) break; + } + } +} + +class MKLDNNCPUExtReverseSequenceTests : public TestsCommon, public WithParamInterface { + std::string model_t = R"V0G0N( + + + + + + _IN_OUT_ + + + + + + + _DIM_SIZE_ + + + + + + + + _IN_OUT_ + + + _DIM_SIZE_ + + + + + _IN_OUT_ + + + + + + + + + +)V0G0N"; + + std::string getModel(reverse_sequence_test_params p) { + std::string model = model_t; + std::string in_out_shape; + for (size_t i = 0; i < p.in_out_shape.size(); i++) { + in_out_shape += ""; + in_out_shape += std::to_string(p.in_out_shape[i]) + "\n"; + } + REPLACE_WITH_STR(model, "_IIDXP_", p.inIdxPrecision); + REPLACE_WITH_STR(model, "_IN_OUT_", in_out_shape); + REPLACE_WITH_NUM(model, "_DIM_SIZE_", p.seq_lengths.size()); + REPLACE_WITH_NUM(model, "_SA_", p.seq_axis); + REPLACE_WITH_NUM(model, "_BA_", p.batch_axis); + return model; + } + +protected: + virtual void TearDown() { + } + + virtual void SetUp() { + try { + TestsCommon::SetUp(); + reverse_sequence_test_params p = ::testing::WithParamInterface::GetParam(); + std::string model = getModel(p); + ////std::cout << model; + InferenceEngine::CNNNetReader net_reader; + ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length())); + + InferenceEngine::Extension cpuExt(make_so_name("cpu_extension")); + MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager()); + extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*){})); + + MKLDNNGraphTestClass graph; + graph.CreateGraph(net_reader.getNetwork(), extMgr); + + // Output Data + InferenceEngine::OutputsDataMap out; + out = net_reader.getNetwork().getOutputsInfo(); + InferenceEngine::BlobMap outputBlobs; + + std::pair item = *out.begin(); + + InferenceEngine::TBlob::Ptr output; + output = InferenceEngine::make_shared_blob(item.second->getTensorDesc()); + output->allocate(); + outputBlobs[item.first] = output; + + // Output Reference + InferenceEngine::TBlob dst_ref(item.second->getTensorDesc()); + dst_ref.allocate(); + + // Input Data + InferenceEngine::Blob::Ptr src; + src = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::FP32, p.in_out_shape, InferenceEngine::TensorDesc::getLayoutByDims(p.in_out_shape) }); + src->allocate(); + fill_data_dbgval(src->buffer(), src->size()); + auto * srcPtr = dynamic_cast*>(src.get()); + if (srcPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + InferenceEngine::BlobMap srcs; + srcs.insert(std::pair("input", src)); + + InferenceEngine::Blob::Ptr seq_lengthsIdx; + InferenceEngine::SizeVector seq_lengths_dim(1, p.seq_lengths.size()); + if (p.inIdxPrecision == "I32") { + seq_lengthsIdx = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::I32, seq_lengths_dim, InferenceEngine::TensorDesc::getLayoutByDims(seq_lengths_dim) }); + seq_lengthsIdx->allocate(); + if (p.seq_lengths.size()) + memcpy(static_cast(seq_lengthsIdx->buffer()), &p.seq_lengths[0], sizeof(int32_t)*p.seq_lengths.size()); + auto * seq_lengthsIdxPtr = dynamic_cast*>(seq_lengthsIdx.get()); + if (seq_lengthsIdxPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + // Check results + ref_reverse_sequence(*srcPtr, *seq_lengthsIdxPtr, dst_ref, p.seq_axis, p.batch_axis); + if 
(p.reference.size()) { + if (memcmp(dst_ref.data(), &p.reference[0], p.reference.size() * sizeof(float)) != 0) + FAIL() << "Wrong result with compare TF reference!"; + } + srcs.insert(std::pair("seq_lengths", seq_lengthsIdx)); + } else if (p.inIdxPrecision == "FP32") { + seq_lengthsIdx = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::FP32, seq_lengths_dim, InferenceEngine::TensorDesc::getLayoutByDims(seq_lengths_dim) }); + seq_lengthsIdx->allocate(); + if (p.seq_lengths.size()) + for (size_t i = 0; i < p.seq_lengths.size(); i++) { + static_cast(seq_lengthsIdx->buffer())[i] = static_cast(p.seq_lengths[i]); + } + auto * seq_lengthsIdxPtr = dynamic_cast*>(seq_lengthsIdx.get()); + if (seq_lengthsIdxPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + // Check results + ref_reverse_sequence(*srcPtr, *seq_lengthsIdxPtr, dst_ref, p.seq_axis, p.batch_axis); + if (p.reference.size()) { + if (memcmp(dst_ref.data(), &p.reference[0], p.reference.size() * sizeof(float)) != 0) + FAIL() << "Wrong result with compare TF reference!"; + } + srcs.insert(std::pair("seq_lengths", seq_lengthsIdx)); + } else { + return; + } + + // Infer + graph.Infer(srcs, outputBlobs); + compare(*output, dst_ref); + } catch (const InferenceEngine::details::InferenceEngineException &e) { + FAIL() << e.what(); + } + } +}; + +// Test data vectors +static std::vector test0 = { 9.f,10.f,11.f,12.f,13.f,14.f,15.f,16.f,17.f,0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f,18.f,19.f,20.f,21.f,22.f,23.f,24.f,25.f,26.f }; +static std::vector test2 = { 3.f,4.f,5.f,0.f,1.f,2.f,6.f,7.f,8.f,12.f,13.f,14.f,9.f,10.f,11.f,15.f,16.f,17.f,21.f,22.f,23.f,18.f,19.f,20.f,24.f,25.f,26.f }; +static std::vector test4 = { 1.f,0.f,2.f,4.f,3.f,5.f,7.f,6.f,8.f,10.f,9.f,11.f,13.f,12.f,14.f,16.f,15.f,17.f,19.f,18.f,20.f,22.f,21.f,23.f,25.f,24.f,26.f }; +static std::vector test6 = { 2.f,1.f,0.f,4.f,3.f,5.f }; +static std::vector test7 = { 0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f,12.f,13.f,14.f,9.f,10.f,11.f,15.f,16.f,17.f,24.f,25.f,26.f,21.f,22.f,23.f,18.f,19.f,20.f }; +static std::vector test8 = { 0.f,4.f,8.f,3.f,1.f,5.f,6.f,7.f,2.f,9.f,13.f,17.f,12.f,10.f,14.f,15.f,16.f,11.f,18.f,22.f,26.f,21.f,19.f,23.f,24.f,25.f,20.f }; + +TEST_P(MKLDNNCPUExtReverseSequenceTests, TestsReverseSequence) {} +INSTANTIATE_TEST_CASE_P( + TestsReverseSequence, MKLDNNCPUExtReverseSequenceTests, + ::testing::Values( +// Params: in_out_shape, seq_lengths, seq_axis, batch_axis, reference +/* 0 */ reverse_sequence_test_params{ "I32", { 3, 3, 3 },{ 2, 2, 2 }, 0, 0, test0 }, + reverse_sequence_test_params{ "I32", { 3, 3, 3 },{ 2, 2, 2 }, -3, 0, test0 }, + reverse_sequence_test_params{ "I32", { 3, 3, 3 },{ 2, 2, 2 }, 1, 0, test2 }, + reverse_sequence_test_params{ "I32", { 3, 3, 3 },{ 2, 2, 2 }, -2, 0, test2 }, + reverse_sequence_test_params{ "I32", { 3, 3, 3 },{ 2, 2, 2 }, 2, 1, test4 }, +/* 5 */ reverse_sequence_test_params{ "I32", { 3, 3, 3 },{ 2, 2, 2 }, -1, 1, test4 }, + reverse_sequence_test_params{ "I32", { 2, 3 },{ 3, 2 }, 1, 0, test6 }, + reverse_sequence_test_params{ "I32", { 3, 3, 3 },{ 1, 2, 3 }, 1, 0, test7 }, + reverse_sequence_test_params{ "I32", { 3, 3, 3 },{ 1, 2, 3 }, 1,-3, test7 }, + reverse_sequence_test_params{ "I32", { 3, 3, 3 },{ 1, 2, 3 }, 1, 2, test8 }, + reverse_sequence_test_params{"FP32", { 3, 3, 3 },{ 2, 2, 2 }, 0, 0, test0 }, + reverse_sequence_test_params{"FP32", { 3, 3, 3 },{ 2, 2, 2 }, -3, 0, test0 }, + reverse_sequence_test_params{"FP32", { 3, 3, 3 },{ 2, 2, 2 }, 1, 0, test2 }, + reverse_sequence_test_params{"FP32", { 3, 3, 3 },{ 2, 2, 2 }, -2, 0, 
test2 }, + reverse_sequence_test_params{"FP32", { 3, 3, 3 },{ 2, 2, 2 }, 2, 1, test4 }, + reverse_sequence_test_params{"FP32", { 3, 3, 3 },{ 2, 2, 2 }, -1, 1, test4 }, +/* 15 */ reverse_sequence_test_params{"FP32", { 2, 3 },{ 3, 2 }, 1, 0, test6 }, + reverse_sequence_test_params{"FP32", { 3, 3, 3 },{ 1, 2, 3 }, 1, 0, test7 }, + reverse_sequence_test_params{"FP32", { 3, 3, 3 },{ 1, 2, 3 }, 1,-3, test7 }, + reverse_sequence_test_params{"FP32", { 3, 3, 3 },{ 1, 2, 3 }, 1, 2, test8 } + )); diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/shuffle_channels_tests.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/shuffle_channels_tests.cpp new file mode 100644 index 0000000..9d2310d --- /dev/null +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/shuffle_channels_tests.cpp @@ -0,0 +1,213 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "mkldnn_plugin/mkldnn_graph.h" + +#include "test_graph.hpp" + +#include "single_layer_common.hpp" +#include +#include +#include "tests_common.hpp" + + +using namespace ::testing; +using namespace std; +using namespace mkldnn; + +struct shuffle_channels_test_params { + InferenceEngine::SizeVector in_out_shape; + int axis; + int group; + + std::vector reference; + std::vector> comp; +}; + +void ref_shuffle_channels( + InferenceEngine::TBlob &src, + InferenceEngine::TBlob &dst, + int axis, + int group +) { + size_t i; + const float *src_data = src.data(); + InferenceEngine::SizeVector src_dims = src.getTensorDesc().getDims(); + InferenceEngine::SizeVector srcStrides = src.getTensorDesc().getBlockingDesc().getStrides(); + float* dst_data = dst.data(); + InferenceEngine::SizeVector dst_dims = dst.getTensorDesc().getDims(); + InferenceEngine::SizeVector dstStrides = dst.getTensorDesc().getBlockingDesc().getStrides(); + + if (axis < 0) + axis += dst_dims.size(); + + if (axis < 0 || axis >= dst_dims.size()) + FAIL() << "Incorrect input parameters dimensions and axis number!"; + + if (dst_dims[axis] % group) + FAIL() << "Group parameter must evenly divide the channel dimension!"; + + // Find number of dictionaries, index range and data length + size_t numDictionaries = 1; + for (i = 0; i <= axis; i++) + numDictionaries *= dst_dims[i]; + + size_t channelsNum = dst_dims[axis] / group; + + size_t dataLength = 1; + for (i = axis + 1; i < dst_dims.size(); i++) + dataLength *= dst_dims[i]; + + if (dataLength == 0) + FAIL() << "Incorrect input parameters dimension!"; + + size_t j, k; + for (j = 0, k = 0; j < numDictionaries; j += dst_dims[axis]) { + for (i = 0; i < (dst_dims[axis] * channelsNum); i += channelsNum, k += dataLength) { + int idx = j + i / dst_dims[axis] + i % dst_dims[axis]; + memcpy(&dst_data[k], &src_data[dataLength * idx], sizeof(float) * dataLength); + } + } +} + +class MKLDNNCPUExtShuffleChannelsTests : public TestsCommon, public WithParamInterface { + std::string model_t = R"V0G0N( + + + + + + _IN_OUT_ + + + + + + + + _IN_OUT_ + + + + + _IN_OUT_ + + + + + + + + +)V0G0N"; + + std::string getModel(shuffle_channels_test_params p) { + std::string model = model_t; + std::string in_out_shape; + + for (size_t i = 0; i < p.in_out_shape.size(); i++) { + in_out_shape += ""; + in_out_shape += std::to_string(p.in_out_shape[i]) + "\n"; + } + REPLACE_WITH_STR(model, "_IN_OUT_", in_out_shape); + REPLACE_WITH_NUM(model, "_AX_", p.axis); + REPLACE_WITH_NUM(model, "_GR_", p.group); + + return model; + } + +protected: + virtual void 
TearDown() { + } + + virtual void SetUp() { + try { + TestsCommon::SetUp(); + shuffle_channels_test_params p = ::testing::WithParamInterface::GetParam(); + std::string model = getModel(p); + ////std::cout << model; + InferenceEngine::CNNNetReader net_reader; + ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length())); + + InferenceEngine::Extension cpuExt(make_so_name("cpu_extension")); + MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager()); + extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*){})); + + MKLDNNGraphTestClass graph; + graph.CreateGraph(net_reader.getNetwork(), extMgr); + + // Output Data + InferenceEngine::OutputsDataMap out; + out = net_reader.getNetwork().getOutputsInfo(); + InferenceEngine::BlobMap outputBlobs; + + std::pair item = *out.begin(); + + InferenceEngine::TBlob::Ptr output; + output = InferenceEngine::make_shared_blob(item.second->getTensorDesc()); + output->allocate(); + outputBlobs[item.first] = output; + + // Output Reference + InferenceEngine::TBlob dst_ref(item.second->getTensorDesc()); + dst_ref.allocate(); + + // Input Data + InferenceEngine::Blob::Ptr src; + src = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::FP32, p.in_out_shape, InferenceEngine::TensorDesc::getLayoutByDims(p.in_out_shape) }); + src->allocate(); + fill_data_dbgval(src->buffer(), src->size()); + auto * srcPtr = dynamic_cast*>(src.get()); + if (srcPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + // Check results + InferenceEngine::SizeVector out_dims; + ref_shuffle_channels(*srcPtr, dst_ref, p.axis, p.group); + + // Check results + if (memcmp(dst_ref.data(), &p.reference[0], p.reference.size() * sizeof(float)) != 0) + FAIL() << "Wrong result with compare TF reference!"; + + InferenceEngine::BlobMap srcs; + srcs.insert(std::pair("input", src)); + + // Infer + graph.Infer(srcs, outputBlobs); + compare(*output, dst_ref); + } catch (const InferenceEngine::details::InferenceEngineException &e) { + FAIL() << e.what(); + } + } +}; + + +TEST_P(MKLDNNCPUExtShuffleChannelsTests, TestsShuffleChannels) {} + +// Test data vectors +static std::vector test0 = { 0.f, 1.f, 2.f, 3.f, 12.f, 13.f, 14.f, 15.f, 24.f, 25.f, 26.f, 27.f, 36.f, 37.f, 38.f, 39.f, 48.f, 49.f, 50.f, 51.f, + 4.f, 5.f, 6.f, 7.f, 16.f, 17.f, 18.f, 19.f, 28.f, 29.f, 30.f, 31.f, 40.f, 41.f, 42.f, 43.f, 52.f, 53.f, 54.f, 55.f, + 8.f, 9.f, 10.f, 11.f, 20.f, 21.f, 22.f, 23.f, 32.f, 33.f, 34.f, 35.f, 44.f, 45.f, 46.f, 47.f, 56.f, 57.f, 58.f, 59.f }; +static std::vector test4 = { 0.f, 2.f, 4.f, 1.f, 3.f, 5.f, 6.f, 8.f, 10.f, 7.f, 9.f, 11.f, 12.f, 14.f, 16.f, 13.f, 15.f, 17.f, 18.f, 20.f, 22.f, 19.f, 21.f, 23.f }; +static std::vector test5 = { 0.f, 1.f, 4.f, 5.f, 8.f, 9.f, 2.f, 3.f, 6.f, 7.f, 10.f, 11.f, 12.f, 13.f, 16.f, 17.f, 20.f, 21.f, 14.f, 15.f, 18.f, 19.f, 22.f, 23.f }; +static std::vector test6 = { 0.f, 3.f, 1.f, 4.f, 2.f, 5.f, 6.f, 9.f, 7.f, 10.f, 8.f, 11.f, 12.f, 15.f, 13.f, 16.f, 14.f, 17.f, 18.f, 21.f, 19.f, 22.f, 20.f, 23.f }; +static std::vector test7 = { 0.f, 1.f, 6.f, 7.f, 2.f, 3.f, 8.f, 9.f, 4.f, 5.f, 10.f, 11.f, 12.f, 13.f, 18.f, 19.f, 14.f, 15.f, 20.f, 21.f, 16.f, 17.f, 22.f, 23.f }; +static std::vector test8 = { 0.f, 3.f, 1.f, 4.f, 2.f, 5.f }; + +INSTANTIATE_TEST_CASE_P( + TestsShuffleChannels, MKLDNNCPUExtShuffleChannelsTests, + ::testing::Values( +// Params: in_out_shape, axis, group, reference +/* 0 */ shuffle_channels_test_params{ { 1, 15, 2, 2 }, 1, 5, test0 }, + shuffle_channels_test_params{ { 
1, 15, 2, 2 }, -3, 5, test0 }, + shuffle_channels_test_params{ { 15, 2, 2 }, 0, 5, test0 }, + shuffle_channels_test_params{ { 15, 2, 2 }, -3, 5, test0 }, + shuffle_channels_test_params{ { 2, 2, 6 }, -1, 3, test4 }, +/* 5 */ shuffle_channels_test_params{ { 2, 6, 2 }, -2, 3, test5 }, + shuffle_channels_test_params{ { 2, 2, 6 }, -1, 2, test6 }, + shuffle_channels_test_params{ { 2, 6, 2 }, -2, 2, test7 }, + shuffle_channels_test_params{ { 6 }, 0, 2, test8 } + )); diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/squeeze_tests.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/squeeze_tests.cpp new file mode 100644 index 0000000..fb315cb --- /dev/null +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/squeeze_tests.cpp @@ -0,0 +1,244 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "mkldnn_plugin/mkldnn_graph.h" + +#include "test_graph.hpp" + +#include "single_layer_common.hpp" +#include +#include +#include "tests_common.hpp" + + +using namespace ::testing; +using namespace std; +using namespace mkldnn; + +struct squeeze_test_params { + std::string inIdxPrecision; + InferenceEngine::SizeVector in_shape; + std::vector indices_to_squeeze; + InferenceEngine::SizeVector out_shape; + + std::vector> comp; +}; + +void ref_squeeze( + InferenceEngine::TBlob &src, + InferenceEngine::SizeVector &out_dims, + std::vector indices_to_squeeze +) { + InferenceEngine::SizeVector src_dims = src.getTensorDesc().getDims(); + + if (indices_to_squeeze.size() == 0) + FAIL() << " Index vector should be 1 dimension"; + + for (size_t i = 0; i < indices_to_squeeze.size(); i++) { + int32_t axis = indices_to_squeeze[i]; + if (axis < 0) + axis += src_dims.size(); + + if (axis > src_dims.size()) + FAIL() << " Index to squeeze exceeds data tensor dimension"; + else if (src_dims[axis] != 1) + FAIL() << " Index to squeeze of data tensor dimension is not 1"; + } + + for (size_t j = 0; j < src_dims.size(); j++) { + bool found = false; + for (size_t i = 0; i < indices_to_squeeze.size(); i++) { + int32_t axis = indices_to_squeeze[i]; + if (axis < 0) + axis += src_dims.size(); + if (j == static_cast(axis)) found = true; + } + if(!found) out_dims.push_back(src_dims[j]); + } +} + +class MKLDNNCPUExtSqueezeTests : public TestsCommon, public WithParamInterface { + std::string model_t = R"V0G0N( + + + + + + _IN_ + + + + + + + _DIM_SIZE_ + + + + + + + + _IN_ + + + _DIM_SIZE_ + + + + + _OUT_ + + + + + + + + + +)V0G0N"; + + std::string getModel(squeeze_test_params p) { + std::string model = model_t; + std::string in_shape; + std::string out_shape; + + for (size_t i = 0; i < p.in_shape.size(); i++) { + in_shape += ""; + in_shape += std::to_string(p.in_shape[i]) + "\n"; + } + REPLACE_WITH_STR(model, "_IN_", in_shape); + REPLACE_WITH_STR(model, "_IIDXP_", p.inIdxPrecision); + REPLACE_WITH_NUM(model, "_DIM_SIZE_", p.indices_to_squeeze.size()); + if (p.out_shape.size()) { + for (size_t i = 0; i < p.out_shape.size(); i++) { + out_shape += ""; + out_shape += std::to_string(p.out_shape[i]) + "\n"; + } + } else { + out_shape = "1\n"; + } + REPLACE_WITH_STR(model, "_OUT_", out_shape); + + return model; + } + +protected: + virtual void TearDown() { + } + + virtual void SetUp() { + try { + TestsCommon::SetUp(); + squeeze_test_params p = ::testing::WithParamInterface::GetParam(); + std::string model = getModel(p); + + InferenceEngine::CNNNetReader net_reader; + 
ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length())); + + InferenceEngine::Extension cpuExt(make_so_name("cpu_extension")); + MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager()); + extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*){})); + + MKLDNNGraphTestClass graph; + graph.CreateGraph(net_reader.getNetwork(), extMgr); + + // Output Data + InferenceEngine::OutputsDataMap out; + out = net_reader.getNetwork().getOutputsInfo(); + InferenceEngine::BlobMap outputBlobs; + + std::pair item = *out.begin(); + + InferenceEngine::TBlob::Ptr output; + output = InferenceEngine::make_shared_blob(item.second->getTensorDesc()); + output->allocate(); + outputBlobs[item.first] = output; + + // Output Reference + InferenceEngine::TBlob dst_ref(item.second->getTensorDesc()); + dst_ref.allocate(); + + // Input Data + InferenceEngine::Blob::Ptr src; + src = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::FP32, p.in_shape, InferenceEngine::TensorDesc::getLayoutByDims(p.in_shape) }); + src->allocate(); + fill_data_dbgval(src->buffer(), src->size()); + auto * srcPtr = dynamic_cast*>(src.get()); + if (srcPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + InferenceEngine::BlobMap srcs; + srcs.insert(std::pair("input", src)); + + InferenceEngine::Blob::Ptr seq_lengthsIdx; + InferenceEngine::SizeVector seq_lengths_dim(1, p.indices_to_squeeze.size()); + if (p.inIdxPrecision == "I32") { + seq_lengthsIdx = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::I32, seq_lengths_dim, InferenceEngine::TensorDesc::getLayoutByDims(seq_lengths_dim) }); + seq_lengthsIdx->allocate(); + if (p.indices_to_squeeze.size()) + memcpy(static_cast(seq_lengthsIdx->buffer()), &p.indices_to_squeeze[0], sizeof(int32_t)*p.indices_to_squeeze.size()); + auto * seq_lengthsIdxPtr = dynamic_cast*>(seq_lengthsIdx.get()); + if (seq_lengthsIdxPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + srcs.insert(std::pair("indices_to_squeeze", seq_lengthsIdx)); + } else if (p.inIdxPrecision == "FP32") { + seq_lengthsIdx = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::FP32, seq_lengths_dim, InferenceEngine::TensorDesc::getLayoutByDims(seq_lengths_dim) }); + seq_lengthsIdx->allocate(); + if (p.indices_to_squeeze.size()) + for (size_t i = 0; i < p.indices_to_squeeze.size(); i++) { + static_cast(seq_lengthsIdx->buffer())[i] = static_cast(p.indices_to_squeeze[i]); + } + auto * seq_lengthsIdxPtr = dynamic_cast*>(seq_lengthsIdx.get()); + if (seq_lengthsIdxPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + srcs.insert(std::pair("indices_to_squeeze", seq_lengthsIdx)); + } + else { + return; + } + + // Check results + InferenceEngine::SizeVector out_dims; + ref_squeeze(*srcPtr, out_dims, p.indices_to_squeeze); + if (out_dims.size() != p.out_shape.size()) + FAIL() << "Wrong out_shape size!"; + for (size_t i = 0; i < p.out_shape.size(); i++) { + if (out_dims[i] != p.out_shape[i]) + FAIL() << "Wrong out_shape dimensions!"; + } + + // Infer + graph.Infer(srcs, outputBlobs); + compare(*output, *src); + } catch (const InferenceEngine::details::InferenceEngineException &e) { + FAIL() << e.what(); + } + } +}; + +TEST_P(MKLDNNCPUExtSqueezeTests, TestsSqueeze) {} + +INSTANTIATE_TEST_CASE_P( + TestsSqueeze, MKLDNNCPUExtSqueezeTests, + ::testing::Values( +// Params: inIdxPrecision, in_shape, indices_to_squeeze, out_shape + squeeze_test_params{ "I32",{ 1 },{ 0 },{ } }, + squeeze_test_params{ "I32",{ 1, 3, 
1 },{ 0 },{ 3, 1 } }, + squeeze_test_params{ "I32",{ 1, 3, 1 },{ 2 },{ 1, 3 } }, + squeeze_test_params{ "I32",{ 1, 3, 1 },{ 0, 2 },{ 3 } }, + squeeze_test_params{ "I32",{ 1, 3, 1 },{ -1 },{ 1, 3 } }, + squeeze_test_params{ "I32",{ 1, 3, 1, 2 },{ 0, 2 },{ 3, 2 } }, + squeeze_test_params{"FP32",{ 1 },{ 0 },{} }, + squeeze_test_params{"FP32",{ 1, 3, 1 },{ 0 },{ 3, 1 } }, + squeeze_test_params{"FP32",{ 1, 3, 1 },{ 2 },{ 1, 3 } }, + squeeze_test_params{"FP32",{ 1, 3, 1 },{ 0, 2 },{ 3 } }, + squeeze_test_params{"FP32",{ 1, 3, 1 },{ -1 },{ 1, 3 } }, + squeeze_test_params{"FP32",{ 1, 3, 1, 2 },{ 0, 2 },{ 3, 2 } } + )); diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/strided_slice_tests.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/strided_slice_tests.cpp new file mode 100644 index 0000000..f8a588a --- /dev/null +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/strided_slice_tests.cpp @@ -0,0 +1,489 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "mkldnn_plugin/mkldnn_graph.h" + +#include "test_graph.hpp" + +#include "single_layer_common.hpp" +#include +#include +#include "tests_common.hpp" + + +using namespace ::testing; +using namespace std; +using namespace mkldnn; + + +struct strided_slice_test_params { + InferenceEngine::SizeVector in_shape; + size_t dim_size; + std::vector begin; + std::vector end; + std::vector stride; + + InferenceEngine::SizeVector begin_mask; + InferenceEngine::SizeVector end_mask; + InferenceEngine::SizeVector ellipsis_mask; + InferenceEngine::SizeVector new_axis_mask; + InferenceEngine::SizeVector shrink_axis_mask; + InferenceEngine::SizeVector out_shape; + std::vector reference; + + std::vector> comp; +}; + +inline void clipping(int *idx, const int min, const int max) { + (*idx) = ((*idx) > min) ? (*idx) : min; + (*idx) = ((*idx) < max) ? 
(*idx) : (max - 1);
+    return;
+}
+
+void ref_strided_slice(
+    InferenceEngine::TBlob<float> &src,
+    InferenceEngine::TBlob<float> &dst,
+    InferenceEngine::SizeVector &out_dims,
+    std::vector<int32_t> begin,
+    std::vector<int32_t> end,
+    std::vector<int32_t> stride,
+    InferenceEngine::SizeVector begin_mask,
+    InferenceEngine::SizeVector end_mask,
+    InferenceEngine::SizeVector ellipsis_mask,
+    InferenceEngine::SizeVector new_axis_mask,
+    InferenceEngine::SizeVector shrink_axis_mask
+) {
+    size_t i;
+    const float *src_data = src.data();
+    InferenceEngine::SizeVector src_dims = src.getTensorDesc().getDims();
+    InferenceEngine::SizeVector srcStrides = src.getTensorDesc().getBlockingDesc().getStrides();
+    float* dst_data = dst.data();
+    InferenceEngine::SizeVector dst_dims = dst.getTensorDesc().getDims();
+    InferenceEngine::SizeVector dstStrides = dst.getTensorDesc().getBlockingDesc().getStrides();
+
+    int new_axis = 0;
+    for (auto& na : new_axis_mask)
+        new_axis += na;
+
+    int shrink_axis = 0;
+    for (auto& sa : shrink_axis_mask)
+        shrink_axis += sa;
+    int max_dims = src_dims.size() + new_axis;
+//    if ((max_dims - shrink_axis) != dst_dims.size())
+//        FAIL() << "Destination dims should be equal source dims + new axis - shrink_axis";
+
+    // Check begin/end/stride vector sizes
+    int bounds_size = 0;
+    if (begin.size() && end.size() && begin.size() != end.size()) FAIL() << "Begin vector size should be equal to the end vector size";
+    if (begin.size() && stride.size() && stride.size() != begin.size()) FAIL() << "Stride vector size should be equal to the begin vector size";
+    if (end.size() && stride.size() && stride.size() != end.size()) FAIL() << "Stride vector size should be equal to the end vector size";
+
+    if (begin.size()) bounds_size = begin.size();
+    if (end.size()) bounds_size = end.size();
+    if (stride.size()) bounds_size = stride.size();
+
+    // ellipsis_mask must be a power of two (only one ellipsis), so take the position of its first non-zero element
+    int ellipsis_pos1, ellipsis_pos2;
+    ellipsis_pos1 = ellipsis_pos2 = max_dims;
+    for (i = 0; i < ellipsis_mask.size(); i++) {
+        if (ellipsis_mask[i] > 0) {
+            ellipsis_pos1 = i;
+            break;
+        }
+    }
+    bounds_size -= ellipsis_pos1;
+    if (bounds_size > 0 && (max_dims - bounds_size) > ellipsis_pos1)
+        ellipsis_pos2 = max_dims - bounds_size;
+
+    std::vector<int> begin_dms(max_dims, 0);
+    std::vector<int> end_dms(max_dims, -1);
+    std::vector<int> stride_dms(max_dims, 1);
+
+    int j, k, bj, ej, sj;
+    InferenceEngine::SizeVector our_dims;
+    for (i = 0, j = 0, k = 0, bj = 0, ej = 0, sj = 0; i < max_dims; i++) {
+        if (i >= ellipsis_pos1 && i < ellipsis_pos2) {
+            if (!(new_axis_mask.size() > i && new_axis_mask[i] == 1)) {
+                end_dms[i] = end_dms[i] >= 0 ? end_dms[i] : src_dims[j++] + end_dms[i];
+            } else {
+                //end_dms[i] = 0;
+                end_dms[i] = begin_dms[i];
+            }
+            out_dims.push_back(static_cast<int>(ceil(static_cast<float>(abs(end_dms[i] - begin_dms[i]) + 1) / static_cast<float>(abs(stride_dms[i])))));
+            our_dims.push_back(static_cast<int>(ceil(static_cast<float>(abs(end_dms[i] - begin_dms[i]) + 1) / static_cast<float>(abs(stride_dms[i])))));
+            k = ellipsis_pos1;
+            continue;
+        }
+        stride_dms[i] = (stride.size() > sj && stride[sj] != 0) ? stride[sj++] : 1;
+
+        if (!(begin_mask.size() > j && begin_mask[j] == 0))
+            begin_dms[i] = begin.size() > bj ? begin[bj] : (stride_dms[i] > 0 ? 0 : -1);
+        else
+            begin_dms[i] = stride_dms[i] > 0 ? 0 : -1;
+        bj++;
+        begin_dms[i] = begin_dms[i] >= 0 ? begin_dms[i] : src_dims[j] + begin_dms[i];
+        // Clipping 'begin'
+        clipping(&begin_dms[i], 0, src_dims[j]);
+
+        if (!(end_mask.size() > j && end_mask[j] == 0)) {
+            int end_dms_tmp = end.size() > ej ?
(stride_dms[i] > 0 ? end[ej] - 1 : end[ej] + 1) : end_dms[i]; + end_dms[i] = end.size() > ej ? end_dms_tmp : (stride_dms[i] > 0 ? -1 : 0); + } + else { + end_dms[i] = stride_dms[i] > 0 ? -1 : 0; + } + ej++; + end_dms[i] = end_dms[i] >= 0 ? end_dms[i] : src_dims[j] + end_dms[i]; + // Clipping 'end' + clipping(&end_dms[i], 0, src_dims[j]); + + if (!(new_axis_mask.size() > i && new_axis_mask[i] == 1)) + j++; + else + end_dms[i] = 0; + + if (shrink_axis_mask.size() > k && shrink_axis_mask[k] == 1) + end_dms[i] = begin_dms[i]; + else + out_dims.push_back(static_cast(ceil(static_cast(abs(end_dms[i] - begin_dms[i]) + 1) / static_cast(abs(stride_dms[i]))))); + + our_dims.push_back(static_cast(ceil(static_cast(abs(end_dms[i] - begin_dms[i]) + 1) / static_cast(abs(stride_dms[i]))))); + k++; + } + + size_t work_amount_dst = dstStrides[0] * dst_dims[0]; + InferenceEngine::SizeVector counters(max_dims, 0); + + for (size_t iwork = 0, dst_idx = 0; iwork < work_amount_dst; ++iwork) { + int src_idx = 0; + for (i = 0, j = 0; i < max_dims; ++i) { + src_idx += (begin_dms[i] + counters[i] * stride_dms[i]) * srcStrides[j]; + if (!(new_axis_mask.size() > i && new_axis_mask[i] == 1)) j++; + } + + dst_data[dst_idx++] = src_data[src_idx]; + + for (j = max_dims - 1; j >= 0; j--) { + counters[j] = (counters[j] + 1) % our_dims[j]; + if (counters[j] != 0) break; + } + } +} + +class MKLDNNCPUExtStridedSliceTests : public TestsCommon, public WithParamInterface { + std::string model_t = R"V0G0N( + + + + + + _IN_ + + + + + + + _DIM_SIZE_ + + + + + + + _DIM_SIZE_ + + + + + + + _DIM_SIZE_ + + + + + + + + _IN_ + + + _DIM_SIZE_ + + + _DIM_SIZE_ + + + _DIM_SIZE_ + + + + + _OUT_ + + + + + + + + + + + +)V0G0N"; + + std::string getModel(strided_slice_test_params p) { + std::string model = model_t; + std::string in_shape; + std::string out_shape; + std::string begin; + std::string end; + std::string ellipsis; + std::string new_axis; + std::string shrink_axis; + + for (size_t i = 0; i < p.in_shape.size(); i++) { + in_shape += ""; + in_shape += std::to_string(p.in_shape[i]) + "\n"; + } + in_shape.pop_back(); + REPLACE_WITH_STR(model, "_IN_", in_shape); + REPLACE_WITH_NUM(model, "_DIM_SIZE_", p.dim_size); + + if (p.begin_mask.size()) { + begin = "begin_mask=\""; + for (auto& pb : p.begin_mask) + begin += std::to_string(pb) + ","; + begin.pop_back(); + begin += "\""; + } + REPLACE_WITH_STR(model, "_BEGIN_", begin); + + if (p.end_mask.size()) { + end = "end_mask=\""; + for (auto& pb : p.end_mask) + end += std::to_string(pb) + ","; + end.pop_back(); + end += "\""; + } + REPLACE_WITH_STR(model, "_END_", end); + + if (p.ellipsis_mask.size()) { + ellipsis = "ellipsis_mask=\""; + for (auto& pb : p.ellipsis_mask) + ellipsis += std::to_string(pb) + ","; + ellipsis.pop_back(); + ellipsis += "\""; + } + REPLACE_WITH_STR(model, "_ELLIPSIS_", ellipsis); + + if (p.new_axis_mask.size()) { + new_axis = "new_axis_mask=\""; + for (auto& pb : p.new_axis_mask) + new_axis += std::to_string(pb) + ","; + new_axis.pop_back(); + new_axis += "\""; + } + REPLACE_WITH_STR(model, "_NEW_AXIS_", new_axis); + + if (p.shrink_axis_mask.size()) { + shrink_axis = "shrink_axis_mask=\""; + for (auto& pb : p.shrink_axis_mask) + shrink_axis += std::to_string(pb) + ","; + shrink_axis.pop_back(); + shrink_axis += "\""; + } + REPLACE_WITH_STR(model, "_SHRINK_", shrink_axis); + + for (size_t i = 0; i < p.out_shape.size(); i++) { + out_shape += ""; + out_shape += std::to_string(p.out_shape[i]) + "\n"; + } + out_shape.pop_back(); + REPLACE_WITH_STR(model, "_OUT_", out_shape); + + 
return model; + } + +protected: + virtual void TearDown() { + } + + virtual void SetUp() { + try { + TestsCommon::SetUp(); + strided_slice_test_params p = ::testing::WithParamInterface::GetParam(); + std::string model = getModel(p); + ////std::cout << model; + InferenceEngine::CNNNetReader net_reader; + ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length())); + + InferenceEngine::Extension cpuExt(make_so_name("cpu_extension")); + MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager()); + extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*){})); + + MKLDNNGraphTestClass graph; + graph.CreateGraph(net_reader.getNetwork(), extMgr); + + // Output Data + InferenceEngine::OutputsDataMap out; + out = net_reader.getNetwork().getOutputsInfo(); + InferenceEngine::BlobMap outputBlobs; + + std::pair item = *out.begin(); + + InferenceEngine::TBlob::Ptr output; + output = InferenceEngine::make_shared_blob(item.second->getTensorDesc()); + output->allocate(); + outputBlobs[item.first] = output; + + // Output Reference + InferenceEngine::TBlob dst_ref(item.second->getTensorDesc()); + dst_ref.allocate(); + + // Input Data + InferenceEngine::Blob::Ptr src; + src = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::FP32, p.in_shape, InferenceEngine::TensorDesc::getLayoutByDims(p.in_shape) }); + src->allocate(); + fill_data_dbgval(src->buffer(), src->size()); + auto * srcPtr = dynamic_cast*>(src.get()); + if (srcPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + // Input Begin + InferenceEngine::Blob::Ptr beginIdx; + InferenceEngine::SizeVector begin_dim(1, p.begin.size()); + beginIdx = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::I32, begin_dim, InferenceEngine::TensorDesc::getLayoutByDims(begin_dim) }); + beginIdx->allocate(); + if (p.begin.size()) + memcpy(static_cast(beginIdx->buffer()), &p.begin[0], sizeof(int32_t)*p.begin.size()); + auto * beginIdxPtr = dynamic_cast*>(beginIdx.get()); + if (beginIdxPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + // Input End + InferenceEngine::Blob::Ptr endIdx; + InferenceEngine::SizeVector end_dim(1, p.end.size()); + endIdx = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::I32, end_dim, InferenceEngine::TensorDesc::getLayoutByDims(end_dim) }); + endIdx->allocate(); + if (p.end.size()) + memcpy(static_cast(endIdx->buffer()), &p.end[0], sizeof(int32_t)*p.end.size()); + auto * endIdxPtr = dynamic_cast*>(endIdx.get()); + if (endIdxPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + // Input Stride + InferenceEngine::Blob::Ptr stridesIdx; + InferenceEngine::SizeVector strides_dim(1, p.stride.size()); + stridesIdx = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::I32, strides_dim, InferenceEngine::TensorDesc::getLayoutByDims(strides_dim) }); + stridesIdx->allocate(); + if (p.stride.size()) + memcpy(static_cast(stridesIdx->buffer()), &p.stride[0], sizeof(int32_t)*p.stride.size()); + auto * stridesIdxPtr = dynamic_cast*>(stridesIdx.get()); + if (stridesIdxPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + // Check results + InferenceEngine::SizeVector out_dims; + ref_strided_slice(*srcPtr, dst_ref, out_dims, p.begin, p.end, p.stride, p.begin_mask, p.end_mask, p.ellipsis_mask, p.new_axis_mask, p.shrink_axis_mask); + + // Check results + if(out_dims.size() != p.out_shape.size()) + FAIL() << "Wrong out_shape size!"; + for (size_t i = 0; i < p.out_shape.size(); i++) { + if 
(out_dims[i] != p.out_shape[i]) + FAIL() << "Wrong out_shape dimensions!"; + } + if (memcmp(dst_ref.data(), &p.reference[0], p.reference.size() * sizeof(float)) != 0) + FAIL() << "Wrong result with compare TF reference!"; + + InferenceEngine::BlobMap srcs; + srcs.insert(std::pair("input", src)); + srcs.insert(std::pair("begin", beginIdx)); + srcs.insert(std::pair("end", endIdx)); + srcs.insert(std::pair("strides", stridesIdx)); + + // Infer + graph.Infer(srcs, outputBlobs); + compare(*output, dst_ref); + } catch (const InferenceEngine::details::InferenceEngineException &e) { + FAIL() << e.what(); + } + } +}; + + +// Test data vectors +std::vector test0 = { 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f }; +std::vector test2 = { 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f }; +std::vector test5 = { 5.f, 6.f, 7.f, 8.f }; +std::vector test6 = { 0.f, 1.f, 2.f, 3.f, 4.f, 5.f }; +std::vector test8 = { 5.f, 4.f, 3.f, 2.f, 1.f }; +std::vector test9 = { 5.f, 4.f, 3.f, 2.f, 1.f, 0.f }; +std::vector test10 = { 5.f, 4.f, 3.f }; +std::vector test11 = { 0.f, 2.f, 4.f, 6.f, 8.f }; +std::vector test12 = { 1.f, 3.f, 5.f, 7.f, 9.f }; +std::vector test13 = { 9.f, 8.f, 7.f, 6.f, 5.f, 4.f, 3.f, 2.f, 1.f, 0.f }; +std::vector test14 = { 9.f, 7.f, 5.f, 3.f, 1.f }; +std::vector test16 = { 0.f, 1.f, 3.f, 4.f }; +std::vector test17 = { 1.f, 4.f }; +std::vector test19 = { 0.f, 1.f, 2.f, 3.f }; +std::vector test20 = { 4.f, 5.f, 6.f, 7.f }; +/* +0. [0,1,2,3,4,5,6,7,8,9], shape=[10] +1. [0,1,2,3,4,5,6,7,8,9], shape=[10] +2. [0,1,2,3,4,5,6,7,8], shape=[9] +3. [0,1,2,3,4,5,6,7,8], shape=[9] +4. [0,1,2,3,4,5,6,7,8,9], shape=[10] +5. [5,6,7,8,9], shape=[5] +6. [0,1,2,3,4,5], shape=[6] +7. [5,6,7,8,9], shape=[5] +8. [5,4,3,2,1], shape=[5] +9. [5,4,3,2,1,0], shape=[6] +10. [5,4,3], shape=[3] +11. [0,2,4,6,8], shape=[5] +12. [1,3,5,7,9], shape=[5] +13. [9,8,7,6,5,4,3,2,1,0], shape=[10] +14. [9,7,5,3,1], shape=[5] +15. [[0,1,2,3,4,5,6,7,8,9]], shape=[1,10] +16. [[[0,1,2],[3,4,5]]], shape=[1,2,2] +17. [[[0,1,2],[3,4,5]]], shape=[1,2,1] +18. [[[0,1,2],[3,4,5]]], shape=[1,1,2,1] +19. [[[[0,1],[2,3]],[[4,5],[6,7]]]], shape=[1,2,2] +20. [[[[0,1],[2,3]],[[4,5],[6,7]]]], shape=[1,2,2] +21. 
[[[0,1,2],[3,4,5]]], shape=[1,1,2] +*/ + +TEST_P(MKLDNNCPUExtStridedSliceTests, TestsStridedSlice) {} +INSTANTIATE_TEST_CASE_P( + TestsStridedSlice, MKLDNNCPUExtStridedSliceTests, + ::testing::Values( +// Params: in_shape, dim_size, begin, end, stride, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask, out_shape, reference +/* 0 */ strided_slice_test_params{ { 10 }, 1, {}, {}, {}, {}, {}, {}, {}, {}, { 10 }, test0 }, + strided_slice_test_params{ { 10 }, 1, {0}, {0}, {}, {}, {0}, {}, {}, {}, { 10 }, test0 }, + strided_slice_test_params{ { 10 }, 1,{ -1 },{ -1 },{},{ 0 },{},{},{},{},{ 9 }, test2 }, + strided_slice_test_params{ { 10 }, 1,{ 0 },{ -1 },{},{},{},{},{},{},{ 9 }, test2 }, + strided_slice_test_params{ { 10 }, 1,{ 0 },{ 10 },{},{},{},{},{},{},{ 10 }, test0 }, +/* 5 */ strided_slice_test_params{ { 10 }, 1,{ 5 },{ 10 },{},{},{},{},{},{},{ 5 }, test5 }, + strided_slice_test_params{ { 10 }, 1,{ 0 },{ 6 },{},{},{},{},{},{},{ 6 }, test6 }, + strided_slice_test_params{ { 10 }, 1,{ -5 },{ 10 },{},{},{},{},{},{},{ 5 }, test5 }, + strided_slice_test_params{ { 10 }, 1,{ -5 },{ 0 },{-1},{},{},{},{},{},{ 5 }, test8 }, + strided_slice_test_params{ { 10 }, 1,{ -5 },{ 0 },{ -1 },{},{0},{},{},{},{ 6 }, test9 }, +/* 10 */ strided_slice_test_params{ { 10 }, 1,{ -5 },{ 2 },{ -1 },{},{},{},{},{},{ 3 }, test10 }, + strided_slice_test_params{ { 10 }, 1,{ 0 },{ 0 },{ 2 },{},{0},{},{},{},{ 5 }, test11 }, + strided_slice_test_params{ { 10 }, 1,{ 1 },{ 0 },{ 2 },{},{ 0 },{},{},{},{ 5 }, test12 }, + strided_slice_test_params{ { 10 }, 1,{ -1 },{ 0 },{ -1 },{},{ 0 },{},{},{},{ 10 }, test13 }, + strided_slice_test_params{ { 10 }, 1,{ -1 },{ 0 },{ -2 },{},{ 0 },{},{},{},{ 5 }, test14 }, +/* 15 */ strided_slice_test_params{ { 10 }, 1,{ 0 },{ 10 },{},{},{},{},{1},{},{ 1, 10 }, test0 }, + strided_slice_test_params{ { 1, 2, 3 }, 2,{ 0, 0 },{ 1, 2 },{},{},{},{0, 1},{},{},{ 1, 2, 2 }, test16 }, + strided_slice_test_params{ { 1, 2, 3 }, 4,{ 0, 0, 0, 1 },{ 2, 3, 2, 2 },{},{},{},{},{ 0,0,1,0 },{ 0,0,0,1 },{ 1,2,1 }, test17 }, + strided_slice_test_params{ { 1, 2, 3 }, 3,{ 0, 0, 1 },{ 2, 2, 2 },{},{},{},{ 0, 1 },{ 1 },{},{ 1, 1, 2, 1 }, test17 }, + strided_slice_test_params{ { 1, 2, 2, 2 }, 4,{},{},{},{ 0,1,0,0 },{ 0,1,0,0 },{},{},{ 0,1 },{ 1,2,2 }, test19 }, +/* 20 */ strided_slice_test_params{ { 1, 2, 2, 2 }, 4,{ 0,1,0,0 },{ 1,2,2,2 },{},{ 0,1,0,0 },{ 0,1,0,0 },{},{},{ 0,1,0,0 },{ 1,2,2 }, test20 }, + strided_slice_test_params{ { 1, 2, 3 }, 3,{ 0, 0, 1 },{ 2, 2, 2 },{},{},{},{ 0, 1 },{ 1 },{ 0, 0, 1 },{ 1, 1, 2 }, test17 } + )); diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/unsqueeze_tests.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/unsqueeze_tests.cpp new file mode 100644 index 0000000..1b073be --- /dev/null +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/unsqueeze_tests.cpp @@ -0,0 +1,235 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "mkldnn_plugin/mkldnn_graph.h" + +#include "test_graph.hpp" + +#include "single_layer_common.hpp" +#include +#include +#include "tests_common.hpp" + + +using namespace ::testing; +using namespace std; +using namespace mkldnn; + +struct unsqueeze_test_params { + std::string inIdxPrecision; + InferenceEngine::SizeVector in_shape; + std::vector indices_to_set; + InferenceEngine::SizeVector out_shape; + + std::vector> comp; +}; + +void ref_unsqueeze( + InferenceEngine::TBlob &src, + InferenceEngine::SizeVector 
&out_dims, + std::vector indices_to_set +) { + InferenceEngine::SizeVector src_dims = src.getTensorDesc().getDims(); + + if (indices_to_set.size() == 0) + FAIL() << " Index vector should be 1 dimension"; + + size_t i, j, k, max = src_dims.size(); + for (size_t i = 0; i < indices_to_set.size(); i++) { + if (indices_to_set[i] > max) max = indices_to_set[i]; + } + max++; + + if ((indices_to_set.size() + src_dims.size()) < max) + FAIL() << " Indices_to_set for unsqueeze layer is out of tensor dimension"; + + max = indices_to_set.size() + src_dims.size(); + for (i = 0, j = 0, k = 0; i < max; i++) { + if (k < indices_to_set.size() && i == indices_to_set[k]) { + out_dims.push_back(1); + k++; + } else { + out_dims.push_back(src_dims[j++]); + } + } +} + +class MKLDNNCPUExtUnsqueezeTests : public TestsCommon, public WithParamInterface { + std::string model_t = R"V0G0N( + + + + + + _IN_ + + + + + + + _DIM_SIZE_ + + + + + + + + _IN_ + + + _DIM_SIZE_ + + + + + _OUT_ + + + + + + + + + +)V0G0N"; + + std::string getModel(unsqueeze_test_params p) { + std::string model = model_t; + std::string in_shape; + std::string out_shape; + + for (size_t i = 0; i < p.in_shape.size(); i++) { + in_shape += ""; + in_shape += std::to_string(p.in_shape[i]) + "\n"; + } + REPLACE_WITH_STR(model, "_IN_", in_shape); + REPLACE_WITH_STR(model, "_IIDXP_", p.inIdxPrecision); + REPLACE_WITH_NUM(model, "_DIM_SIZE_", p.indices_to_set.size()); + for (size_t i = 0; i < p.out_shape.size(); i++) { + out_shape += ""; + out_shape += std::to_string(p.out_shape[i]) + "\n"; + } + REPLACE_WITH_STR(model, "_OUT_", out_shape); + + return model; + } + +protected: + virtual void TearDown() { + } + + virtual void SetUp() { + try { + TestsCommon::SetUp(); + unsqueeze_test_params p = ::testing::WithParamInterface::GetParam(); + std::string model = getModel(p); + ////std::cout << model; + InferenceEngine::CNNNetReader net_reader; + ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length())); + + InferenceEngine::Extension cpuExt(make_so_name("cpu_extension")); + MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager()); + extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*){})); + + MKLDNNGraphTestClass graph; + graph.CreateGraph(net_reader.getNetwork(), extMgr); + + // Output Data + InferenceEngine::OutputsDataMap out; + out = net_reader.getNetwork().getOutputsInfo(); + InferenceEngine::BlobMap outputBlobs; + + std::pair item = *out.begin(); + + InferenceEngine::TBlob::Ptr output; + output = InferenceEngine::make_shared_blob(item.second->getTensorDesc()); + output->allocate(); + outputBlobs[item.first] = output; + + // Output Reference + InferenceEngine::TBlob dst_ref(item.second->getTensorDesc()); + dst_ref.allocate(); + + // Input Data + InferenceEngine::Blob::Ptr src; + src = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::FP32, p.in_shape, InferenceEngine::TensorDesc::getLayoutByDims(p.in_shape) }); + src->allocate(); + fill_data_dbgval(src->buffer(), src->size()); + auto * srcPtr = dynamic_cast*>(src.get()); + if (srcPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + InferenceEngine::BlobMap srcs; + srcs.insert(std::pair("input", src)); + + InferenceEngine::Blob::Ptr seq_lengthsIdx; + InferenceEngine::SizeVector seq_lengths_dim(1, p.indices_to_set.size()); + if (p.inIdxPrecision == "I32") { + seq_lengthsIdx = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::I32, seq_lengths_dim, 
InferenceEngine::TensorDesc::getLayoutByDims(seq_lengths_dim) }); + seq_lengthsIdx->allocate(); + if (p.indices_to_set.size()) + memcpy(static_cast(seq_lengthsIdx->buffer()), &p.indices_to_set[0], sizeof(int32_t)*p.indices_to_set.size()); + auto * seq_lengthsIdxPtr = dynamic_cast*>(seq_lengthsIdx.get()); + if (seq_lengthsIdxPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + srcs.insert(std::pair("indices_to_set", seq_lengthsIdx)); + } else if (p.inIdxPrecision == "FP32") { + seq_lengthsIdx = InferenceEngine::make_shared_blob({ InferenceEngine::Precision::FP32, seq_lengths_dim, InferenceEngine::TensorDesc::getLayoutByDims(seq_lengths_dim) }); + seq_lengthsIdx->allocate(); + if (p.indices_to_set.size()) + for (size_t i = 0; i < p.indices_to_set.size(); i++) { + static_cast(seq_lengthsIdx->buffer())[i] = static_cast(p.indices_to_set[i]); + } + auto * seq_lengthsIdxPtr = dynamic_cast*>(seq_lengthsIdx.get()); + if (seq_lengthsIdxPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + srcs.insert(std::pair("indices_to_set", seq_lengthsIdx)); + } + else { + return; + } + + // Check results + InferenceEngine::SizeVector out_dims; + ref_unsqueeze(*srcPtr, out_dims, p.indices_to_set); + if (out_dims.size() != p.out_shape.size()) + FAIL() << "Wrong out_shape size!"; + for (size_t i = 0; i < p.out_shape.size(); i++) { + if (out_dims[i] != p.out_shape[i]) + FAIL() << "Wrong out_shape dimensions!"; + } + + // Infer + graph.Infer(srcs, outputBlobs); + compare(*output, *src); + } catch (const InferenceEngine::details::InferenceEngineException &e) { + FAIL() << e.what(); + } + } +}; + +TEST_P(MKLDNNCPUExtUnsqueezeTests, TestsUnsqueeze) {} + +INSTANTIATE_TEST_CASE_P( + TestsUnsqueeze, MKLDNNCPUExtUnsqueezeTests, + ::testing::Values( +// Params: inIdxPrecision, in_shape, indices_to_set, out_shape + unsqueeze_test_params{ "I32",{ 3 },{ 0 },{ 1, 3 } }, + unsqueeze_test_params{ "I32",{ 3 },{ 0, 1, 2 },{ 1, 1, 1, 3 } }, + unsqueeze_test_params{ "I32",{ 3 },{ 0, 2, 3 },{ 1, 3, 1, 1 } }, + unsqueeze_test_params{ "I32",{ 2, 3 },{ 0, 3 },{ 1, 2, 3, 1 } }, + unsqueeze_test_params{ "I32",{ 2, 3 },{ 1 },{ 2, 1, 3 } }, + unsqueeze_test_params{"FP32",{ 3 },{ 0 },{ 1, 3 } }, + unsqueeze_test_params{"FP32",{ 3 },{ 0, 1, 2 },{ 1, 1, 1, 3 } }, + unsqueeze_test_params{"FP32",{ 3 },{ 0, 2, 3 },{ 1, 3, 1, 1 } }, + unsqueeze_test_params{"FP32",{ 2, 3 },{ 0, 3 },{ 1, 2, 3, 1 } }, + unsqueeze_test_params{"FP32",{ 2, 3 },{ 1 },{ 2, 1, 3 } } + )); diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_activation_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_activation_test.cpp index a0898b5..227f632 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_activation_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_activation_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_batchnorm_scaleshift_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_batchnorm_scaleshift_test.cpp index 544f51a..979796f 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_batchnorm_scaleshift_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_batchnorm_scaleshift_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// 
Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_batchnorm_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_batchnorm_test.cpp index 6920b55..450abbe 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_batchnorm_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_batchnorm_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_concat_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_concat_test.cpp index 7396700..e9c7eec 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_concat_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_concat_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_conv_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_conv_test.cpp index dbfbc06..7eae8c4 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_conv_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_conv_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -356,15 +356,6 @@ INSTANTIATE_TEST_CASE_P( {3, 3}, {1, 2}, {0, 0}, {0, 0}, 20, 1, "", 5, MKLDNNPlugin::impl_desc_type::jit }, conv_test_params{{1, 1, 32, 16}, {2, 4}, {2, 1}, {0, 0}, {0, 0}, 17, 1, "", 5, MKLDNNPlugin::impl_desc_type::jit }, -#ifdef USE_MKL - conv_test_params{{1, 9, 16, 32}, - {1, 1}, {1, 1}, {0, 0}, {0, 0}, 17, 1, "", 6, MKLDNNPlugin::impl_desc_type::gemm, - {MKLDNNPlugin::impl_desc_type::gemm_any, - MKLDNNPlugin::impl_desc_type::gemm_blas, - MKLDNNPlugin::impl_desc_type::gemm_avx512, - MKLDNNPlugin::impl_desc_type::gemm_avx2, - MKLDNNPlugin::impl_desc_type::gemm_sse42} }, -#endif conv_test_params{{1, 9, 32, 16}, {2, 4}, {1, 1}, {0, 0}, {0, 0}, 17, 1, "", 5, MKLDNNPlugin::impl_desc_type::ref_any, {MKLDNNPlugin::impl_desc_type::ref_any} }, @@ -372,7 +363,7 @@ INSTANTIATE_TEST_CASE_P( {3, 3}, {1, 1}, {1, 1}, {0, 0}, 64, 1, "", 3, MKLDNNPlugin::impl_desc_type::ref_any, {MKLDNNPlugin::impl_desc_type::jit_avx512_winograd, MKLDNNPlugin::impl_desc_type::ref_any}}, // 5D - /*9*/ conv_test_params{{1, 3, 15, 20, 20}, + /*8*/ conv_test_params{{1, 3, 15, 20, 20}, {3, 3, 3}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0}, 64, 1, "", 2, MKLDNNPlugin::impl_desc_type::ref_any, {MKLDNNPlugin::impl_desc_type::ref_any} }, conv_test_params{{1, 24, 15, 20, 20}, @@ -385,9 +376,16 @@ INSTANTIATE_TEST_CASE_P( {3, 3, 3}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0}, 64, 1, "", 2, MKLDNNPlugin::impl_desc_type::jit }, conv_test_params{{1, 24, 15, 25, 20}, {3, 3, 3}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0}, 64, 1, "", 2, MKLDNNPlugin::impl_desc_type::jit }, - /*14*/ conv_test_params{{1, 32, 15, 25, 20}, + /*13*/ conv_test_params{{1, 32, 15, 25, 20}, {3, 3, 3}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0}, 64, 1, "", 2, MKLDNNPlugin::impl_desc_type::jit }, #ifdef USE_MKL + conv_test_params{{1, 9, 16, 32}, + {1, 1}, {1, 1}, {0, 0}, {0, 0}, 
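/* Annotation: reading conv_test_params positionally from this table gives
   {in_shape, kernel, strides, pads_begin, pads_end, out_channels, group,
   auto_pad, num_prim_desc, preferred impl, [accepted impls]}; the field
   names are inferred from the values, not quoted from the struct. For this
   1x1 GEMM case the spatial size is preserved, by the usual convolution
   arithmetic:
     OH = (IH + pad_b + pad_e - KH) / SH + 1 = (16 + 0 + 0 - 1) / 1 + 1 = 16
     OW = (32 + 0 + 0 - 1) / 1 + 1 = 32
   so the output blob is {1, 17, 16, 32}. */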
17, 1, "", 6, MKLDNNPlugin::impl_desc_type::gemm, + {MKLDNNPlugin::impl_desc_type::gemm_any, + MKLDNNPlugin::impl_desc_type::gemm_blas, + MKLDNNPlugin::impl_desc_type::gemm_avx512, + MKLDNNPlugin::impl_desc_type::gemm_avx2, + MKLDNNPlugin::impl_desc_type::gemm_sse42} }, conv_test_params{{1, 5, 15, 20, 20}, {3, 3, 3}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}, 64, 1, "", 2, MKLDNNPlugin::impl_desc_type::gemm_blas }, conv_test_params{{1, 5, 15, 20, 20}, @@ -406,7 +404,6 @@ INSTANTIATE_TEST_CASE_P( {5, 5, 5}, {1, 1, 1}, {2, 2, 2}, {2, 2, 2}, 16, 1, "", 2, MKLDNNPlugin::impl_desc_type::ref_any, {MKLDNNPlugin::impl_desc_type::ref_any} })); - class MKLDNNGraphDynBatchConvolutionTests: public MKLDNNGraphConvolutionTests { protected: virtual void SetUp() { @@ -515,6 +512,7 @@ INSTANTIATE_TEST_CASE_P( conv_test_params{{1, 1, 32, 16}, {2, 4}, {2, 1}, {0, 0}, {0, 0}, 17, 1, "", 5, MKLDNNPlugin::impl_desc_type::jit, {MKLDNNPlugin::impl_desc_type::jit_avx512_winograd} }, +#ifdef USE_MKL conv_test_params{{1, 9, 16, 32}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, 17, 1, "", 7, MKLDNNPlugin::impl_desc_type::gemm, {MKLDNNPlugin::impl_desc_type::gemm_any, @@ -523,5 +521,6 @@ INSTANTIATE_TEST_CASE_P( MKLDNNPlugin::impl_desc_type::gemm_avx2, MKLDNNPlugin::impl_desc_type::gemm_sse42} }, +#endif conv_test_params{{1, 9, 32, 16}, {2, 4}, {1, 1}, {0, 0}, {0, 0}, 17, 1, "", 5, MKLDNNPlugin::impl_desc_type::ref_any, {MKLDNNPlugin::impl_desc_type::ref_any} })); diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_crop_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_crop_test.cpp index 545ac15..1371900 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_crop_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_crop_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_deconv_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_deconv_test.cpp index b263511..d416f81 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_deconv_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_deconv_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -11,6 +11,7 @@ #include "single_layer_common.hpp" #include #include +#include "ir_gen_helper.hpp" #include "tests_common.hpp" @@ -18,6 +19,7 @@ using namespace InferenceEngine; using namespace ::testing; using namespace std; using namespace mkldnn; +using namespace single_layer_tests; struct deconv_test_params { @@ -69,8 +71,8 @@ void ref_deconv(const InferenceEngine::TBlob &src, const InferenceEngine size_t OC = prm.out_c; - size_t OW = SW * (IW - 1) + KW - 2 * PW; - size_t OH = SH * (IH - 1) + KH - 2 * PH; + size_t OW = SW * (IW - 1lu) + KW - 2lu * PW; + size_t OH = SH * (IH - 1lu) + KH - 2lu * PH; size_t OD = dims_size == 5 ? 
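/* Annotation: OD/OH/OW follow the transposed-convolution shape rule, the
   inverse of the convolution arithmetic:
     O = S * (I - 1) + K - 2 * P
   e.g. I = 5, K = 4, S = 2, P = 1 gives O = 2 * 4 + 4 - 2 = 10, which is how
   the {2, 8, 5, 5} / kernel {4, 4} / stride {2, 2} / pad {1, 1} cases in the
   parameter tables below arrive at 10x10 outputs. */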
(SD * (ID - 1) + KD - 2 * PD) : 1u; const data_t *src_data = src.readOnly(); @@ -86,61 +88,70 @@ void ref_deconv(const InferenceEngine::TBlob &src, const InferenceEngine size_t CI1 = IH * IW; size_t CI2 = CI1 * ID; size_t CI3 = CI2 * IC; + + size_t OC_G = OC / G; + size_t IC_G = IC / G; size_t CK1 = KH * KW; size_t CK2 = CK1 * KD; - size_t CK3 = CK2 * (OC / G); - size_t CK4 = CK3 * (IC / G); - - for (int g = 0; g < G; ++g) { - for (int mb = 0; mb < MB; ++mb) { - for (int oc = 0; oc < OC / G; ++oc) { - for (int od = 0; od < OD; ++od) { - for (int oh = 0; oh < OH; ++oh) { - for (int ow = 0; ow < OW; ++ow) { - size_t didx = mb * CS3 - + (g * OC / G + oc) * CS2 - + od * CS1 - + oh * OW - + ow; + size_t CK3 = CK2 * OC_G; + size_t CK4 = CK3 * IC_G; + + for (size_t g = 0lu; g < G; ++g) { + size_t g_OC_G = g * OC_G; + size_t g_IC_G = g * IC_G; + size_t g_CK4 = g * CK4; + for (size_t mb = 0lu; mb < MB; ++mb) { + size_t mb_CS3 = mb * CS3; + size_t mb_CI3 = mb * CI3; + for (size_t oc = 0lu; oc < OC_G; ++oc) { + size_t g_OC_G_oc = g_OC_G + oc; + size_t mb_CS3_g_OC_G_oc_CS2 = mb_CS3 + g_OC_G_oc * CS2; + size_t g_CK4_oc_CK2 = g_CK4 + oc * CK2; + for (size_t od = 0lu; od < OD; ++od) { + size_t mb_CS3_g_OC_G_oc_CS2_od_CS1 = mb_CS3_g_OC_G_oc_CS2 + od * CS1; + size_t od_PD = od + PD; + for (size_t oh = 0lu; oh < OH; ++oh) { + size_t mb_CS3_g_OC_G_oc_CS2_od_CS1_oh_OW = mb_CS3_g_OC_G_oc_CS2_od_CS1 + oh * OW; + size_t oh_PH = oh + PH; + for (size_t ow = 0lu; ow < OW; ++ow) { + size_t didx = mb_CS3_g_OC_G_oc_CS2_od_CS1_oh_OW + ow; + size_t ow_PW = ow + PW; dst_data[didx] = data_t(0); - if (prm.with_bias) dst_data[didx] += bias_data[g * OC / G + oc]; - - for (int ic = 0; ic < IC / G; ic++) { - for (int kd = 0; kd < KD; kd++) { - for (int kh = 0; kh < KH; kh++) { - for (int kw = 0; kw < KW; kw++) { - if (ow + PW < kw || oh + PH < kh || od + PD < kd) - continue; + if (prm.with_bias) dst_data[didx] += bias_data[g_OC_G_oc]; + + for (size_t ic = 0lu; ic < IC_G; ic++) { + size_t mb_CI3_g_IC_G_ic_CI2 = mb_CI3 + (g_IC_G + ic) * CI2; + size_t g_CK4_oc_CK2_ic_CK3 = g_CK4_oc_CK2 + ic * CK3; + for (int kd = 0lu; kd < KD; kd++) { + if (od_PD < kd) continue; + size_t id = od_PD - kd; + if (id % SD != 0) continue; + id /= SD; + if (id >= ID) continue; + size_t mb_CI3_g_IC_G_ic_CI2_id_CI1 = mb_CI3_g_IC_G_ic_CI2 + id * CI1; + size_t g_CK4_oc_CK2_ic_CK3_kd_CK1 = g_CK4_oc_CK2_ic_CK3 + kd * CK1; + for (size_t kh = 0lu; kh < KH; kh++) { + if (oh_PH < kh) continue; + size_t ih = oh_PH - kh; + if (ih % SH != 0) continue; + ih /= SH; + if (ih >= IH) continue; + size_t mb_CI3_g_IC_G_ic_CI2_id_CI1_ih_IW = mb_CI3_g_IC_G_ic_CI2_id_CI1 + ih * IW; + size_t g_CK4_oc_CK2_ic_CK3_kd_CK1_kh_KW = g_CK4_oc_CK2_ic_CK3_kd_CK1 + kh * KW; + for (size_t kw = 0lu; kw < KW; kw++) { + if (ow_PW < kw) continue; + size_t iw = ow_PW - kw; + if (iw % SW != 0) continue; + iw /= SW; + if (iw >= IW) continue; - size_t iw = ow - kw + PW; - size_t ih = oh - kh + PH; - size_t id = od - kd + PD; + size_t sidx = mb_CI3_g_IC_G_ic_CI2_id_CI1_ih_IW + iw; - if (iw % SW != 0 || ih % SH != 0 || id % SD != 0) - continue; + size_t widx = g_CK4_oc_CK2_ic_CK3_kd_CK1_kh_KW + kw; - iw /= SW; - ih /= SH; - id /= SD; - - if (ih < IH && iw < IW && id < ID) { - size_t sidx = mb * CI3 - + (g * IC / G + ic) * CI2 - + id * CI1 - + ih * IW - + iw; - - size_t widx = g * CK4 - + ic * CK3 - + oc * CK2 - + kd * CK1 - + kh * KW - + kw; - - dst_data[didx] += src_data[sidx] * weights_data[widx]; - } + dst_data[didx] += src_data[sidx] * weights_data[widx]; } } } @@ -155,15 +166,7 @@ void 
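/* Annotation: the loop nest above hoists the partial products of the
   flattened NCDHW offsets (mb * CS3, oc * CS2, id * CI1, ...) out of the
   inner loops and rejects kernel taps early through the divisibility and
   bounds checks on id/ih/iw, instead of recomputing
     sidx = mb * CI3 + (g * IC_G + ic) * CI2 + id * CI1 + ih * IW + iw
   from scratch for every (kd, kh, kw); the arithmetic is identical, only
   the loop-invariant factors move outward. */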
ref_deconv(const InferenceEngine::TBlob &src, const InferenceEngine class MKLDNNGraphDeconvolutionalTests: public TestsCommon, public WithParamInterface { - std::string model_t_5D = R"V0G0N( - - - - - __SRC_DIMS__ - - - + std::string layers_t = R"V0G0N( - __SRC_DIMS__ + + __SRC_DIMS__ _IN_ - _OC___DST_DIMS__ + _OC_ + __DST_DIMS__ - - +)V0G0N"; + + std::string edges_t = R"V0G0N( - - )V0G0N"; protected: std::string getModel(deconv_test_params p) { - std::string model = model_t_5D; - auto dims_size = p.dims.size(); + std::string model = layers_t; + std::string s_dims; for (auto& dim : p.dims) { s_dims += "\n "; @@ -243,6 +247,8 @@ protected: } REPLACE_WITH_STR(model, "_IMPLS_", impls); + model = IRTemplateGenerator::getIRTemplate("Deconvolution_Only", p.dims, "FP32", model, edges_t); + return model; } @@ -308,16 +314,8 @@ protected: InferenceEngine::SizeVector dims_src = p.dims; - InferenceEngine::Layout layout = ANY; - switch (p.dims.size()) { - case 4: - layout = InferenceEngine::NCHW; - break; - case 5: - layout = InferenceEngine::NCDHW; - break; - } - InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, layout, dims_src); + InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob( + InferenceEngine::Precision::FP32, InferenceEngine::TensorDesc::getLayoutByDims(p.dims), dims_src); src->allocate(); fill_data(src->buffer(), src->size()); @@ -362,32 +360,28 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( /*0*/ deconv_test_params{{1, 3, 3, 3}, {3, 3}, {1, 1}, {0, 0}, {0, 0}, 2, 1, false, "", 2, {MKLDNNPlugin::impl_desc_type::jit} }, deconv_test_params{{3, 3, 3, 3}, {4, 3}, {1, 1}, {0, 0}, {0, 0}, 2, 1, false, "", 2, {MKLDNNPlugin::impl_desc_type::jit} }, - deconv_test_params{{1, 3, 3, 3}, {4, 3}, {1, 2}, {0, 0}, {0, 0}, 2, 1, false, "", 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} }, - deconv_test_params{{1, 3, 3, 3}, {4, 3}, {2, 2}, {0, 0}, {0, 0}, 2, 1, false, "", 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} }, - deconv_test_params{{4, 17, 3, 3}, {4, 3}, {2, 2}, {0, 0}, {0, 0}, 2, 1, false, "", 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} }, deconv_test_params{{2, 8, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 8, 8, false, "", 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}}, deconv_test_params{{2, 8, 5, 5}, {8, 8}, {4, 4}, {1, 1}, {0, 0}, 8, 8, false, "", 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}}, deconv_test_params{{2, 8, 5, 5}, {4, 8}, {2, 4}, {1, 1}, {0, 0}, 8, 8, false, "", 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}}, - /*8*/ deconv_test_params{{1, 3, 3, 3}, {3, 3}, {1, 1}, {0, 0}, {0, 0}, 2, 1, true, "", 2, {MKLDNNPlugin::impl_desc_type::jit} }, + /*5*/ deconv_test_params{{1, 3, 3, 3}, {3, 3}, {1, 1}, {0, 0}, {0, 0}, 2, 1, true, "", 2, {MKLDNNPlugin::impl_desc_type::jit} }, deconv_test_params{{3, 3, 3, 3}, {4, 3}, {1, 1}, {0, 0}, {0, 0}, 2, 1, true, "", 2, {MKLDNNPlugin::impl_desc_type::jit} }, - deconv_test_params{{1, 3, 3, 3}, {4, 3}, {1, 2}, {0, 0}, {0, 0}, 2, 1, true, "", 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} }, - deconv_test_params{{1, 3, 3, 3}, {4, 3}, {2, 2}, {0, 0}, {0, 0}, 2, 1, true, "", 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} }, - deconv_test_params{{4, 17, 3, 3}, {4, 3}, {2, 2}, {0, 0}, {0, 0}, 2, 1, true, "", 2, {MKLDNNPlugin::impl_desc_type::gemm, 
MKLDNNPlugin::impl_desc_type::jit} }, deconv_test_params{{2, 8, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 8, 8, true, "", 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}}, deconv_test_params{{2, 8, 5, 5}, {8, 8}, {4, 4}, {1, 1}, {0, 0}, 8, 8, true, "", 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}}, deconv_test_params{{2, 8, 5, 5}, {4, 8}, {2, 4}, {1, 1}, {0, 0}, 8, 8, true, "", 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}}, deconv_test_params{{1, 3, 3, 3}, {3, 3}, {1, 1}, {0, 0}, {0, 0}, 2, 1, false, "", 2, {MKLDNNPlugin::impl_desc_type::ref_any}, {MKLDNNPlugin::impl_desc_type::ref_any}}, - /*17*/ deconv_test_params{{2, 8, 5, 5}, {1, 3}, {1, 1}, {0, 1}, {0, 1}, 8, 8, true, "", 2, + /*11*/ deconv_test_params{{2, 8, 5, 5}, {1, 3}, {1, 1}, {0, 1}, {0, 1}, 8, 8, true, "", 2, {MKLDNNPlugin::impl_desc_type::ref_any}, {MKLDNNPlugin::impl_desc_type::ref_any}}, deconv_test_params{{1, 6, 6, 5}, {3, 1}, {1, 1}, {1, 0}, {1, 0}, 9, 3, true, "", 2, {MKLDNNPlugin::impl_desc_type::ref_any}, {MKLDNNPlugin::impl_desc_type::ref_any}}, - deconv_test_params{{2, 24, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 24, 3, true, "", 4, {MKLDNNPlugin::impl_desc_type::jit}}, - deconv_test_params{{2, 24, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 24, 1, true, "", 3, {MKLDNNPlugin::impl_desc_type::jit}}, - deconv_test_params{{2, 72, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 72, 3, true, "", 4, {MKLDNNPlugin::impl_desc_type::jit}}, - deconv_test_params{{1, 12, 2, 2}, {4, 4}, {2, 2}, {1, 1}, {1, 1}, 12, 12, true, "", 4, {MKLDNNPlugin::impl_desc_type::jit}}, #ifdef USE_MKL + deconv_test_params{{1, 3, 3, 3}, {4, 3}, {1, 2}, {0, 0}, {0, 0}, 2, 1, false, "", 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} }, + deconv_test_params{{1, 3, 3, 3}, {4, 3}, {2, 2}, {0, 0}, {0, 0}, 2, 1, false, "", 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} }, + deconv_test_params{{4, 17, 3, 3}, {4, 3}, {2, 2}, {0, 0}, {0, 0}, 2, 1, false, "", 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} }, deconv_test_params{{2, 8, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 8, 2, false, "", 3, {MKLDNNPlugin::impl_desc_type::gemm}}, + deconv_test_params{{1, 3, 3, 3}, {4, 3}, {1, 2}, {0, 0}, {0, 0}, 2, 1, true, "", 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} }, + deconv_test_params{{1, 3, 3, 3}, {4, 3}, {2, 2}, {0, 0}, {0, 0}, 2, 1, true, "", 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} }, + deconv_test_params{{4, 17, 3, 3}, {4, 3}, {2, 2}, {0, 0}, {0, 0}, 2, 1, true, "", 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} }, deconv_test_params{{2, 8, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 8, 2, true, "", 3, {MKLDNNPlugin::impl_desc_type::gemm}}, deconv_test_params{{1, 6, 6, 5}, {3, 1}, {1, 1}, {1, 0}, {1, 0}, 9, 3, true, "", 2, {MKLDNNPlugin::impl_desc_type::gemm_blas}}, @@ -396,7 +390,7 @@ INSTANTIATE_TEST_CASE_P( deconv_test_params{{1, 32, 12, 12, 2}, {2, 2, 2}, {2, 2, 2}, {0, 0, 0}, {1, 0, 0}, 16, 1, true, "", 4, {MKLDNNPlugin::impl_desc_type::gemm_blas} }, deconv_test_params{{1, 25, 1, 1, 1}, {4, 4, 4}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}, 64, 1, true, "valid", 3, - {MKLDNNPlugin::impl_desc_type::gemm_blas} }, + {MKLDNNPlugin::impl_desc_type::jit} }, deconv_test_params{{1, 32, 16, 16, 16}, {4, 4, 4}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, 1, 1, true, "same_upper", 3, {MKLDNNPlugin::impl_desc_type::gemm_blas} }, deconv_test_params{{1, 64, 
12, 12, 2}, {2, 2, 2}, {2, 2, 2}, {0, 0, 0}, {1, 0, 0}, 32, 1, true, "same_upper", 3, @@ -404,10 +398,13 @@ INSTANTIATE_TEST_CASE_P( deconv_test_params{{1, 50, 1, 1, 1}, {4, 4, 4}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}, 128, 1, true, "", 3, {MKLDNNPlugin::impl_desc_type::gemm_blas}, {MKLDNNPlugin::impl_desc_type::gemm_blas}}, #endif + deconv_test_params{{2, 24, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 24, 3, true, "", 4, {MKLDNNPlugin::impl_desc_type::jit}}, + deconv_test_params{{2, 24, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 24, 1, true, "", 3, {MKLDNNPlugin::impl_desc_type::jit}}, + deconv_test_params{{2, 72, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 72, 3, true, "", 4, {MKLDNNPlugin::impl_desc_type::jit}}, + deconv_test_params{{1, 12, 2, 2}, {4, 4}, {2, 2}, {1, 1}, {1, 1}, 12, 12, true, "", 4, {MKLDNNPlugin::impl_desc_type::jit}}, // 5D deconv_test_params{{1, 2, 8, 5, 5}, {3, 3, 3}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}, 4, 1, true, "", 4, {MKLDNNPlugin::impl_desc_type::ref_any}, {MKLDNNPlugin::impl_desc_type::ref_any} } - // Blocked, with biases // TODO support on jit // deconv_test_params{{2, 24, 5, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 24, 3, true, "", 4, {MKLDNNPlugin::impl_desc_type::jit}}, @@ -471,18 +468,8 @@ protected: graph.setProperty({{InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_ENABLED, InferenceEngine::PluginConfigParams::YES}}); graph.CreateGraph(net_reader.getNetwork()); - InferenceEngine::SizeVector dims_src = p.dims; - - InferenceEngine::Layout layout = ANY; - switch (p.dims.size()) { - case 4: - layout = InferenceEngine::NCHW; - break; - case 5: - layout = InferenceEngine::NCDHW; - break; - } - InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, layout, dims_src); + InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob( + InferenceEngine::Precision::FP32, InferenceEngine::TensorDesc::getLayoutByDims(p.dims), p.dims); InferenceEngine::TBlob* srcPtr = dynamic_cast*>(src.get()); if (srcPtr == nullptr) FAIL() << "Cannot cast blob to TBlob."; @@ -523,10 +510,12 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( deconv_test_params{{1, 3, 3, 3}, {3, 3}, {1, 1}, {0, 0}, {0, 0}, 2, 1, false, "", 5, {MKLDNNPlugin::impl_desc_type::jit} }, deconv_test_params{{3, 3, 3, 3}, {4, 3}, {1, 1}, {0, 0}, {0, 0}, 2, 1, false, "", 5, {MKLDNNPlugin::impl_desc_type::jit} }, +#ifdef USE_MKL deconv_test_params{{1, 3, 3, 3}, {4, 3}, {1, 2}, {0, 0}, {0, 0}, 2, 1, false, "", 4, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} }, deconv_test_params{{1, 3, 3, 3}, {4, 3}, {2, 2}, {0, 0}, {0, 0}, 2, 1, false, "", 3, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} }, deconv_test_params{{4, 17, 3, 3}, {4, 3}, {2, 2}, {0, 0}, {0, 0}, 2, 1, false, "", 3, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} }, deconv_test_params{{2, 8, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 8, 2, false, "", 3, {MKLDNNPlugin::impl_desc_type::gemm}}, +#endif deconv_test_params{{2, 8, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 8, 8, false, "", 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}}, deconv_test_params{{2, 8, 5, 5}, {8, 8}, {4, 4}, {1, 1}, {0, 0}, 8, 8, false, "", 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}}, deconv_test_params{{2, 8, 5, 5}, {4, 8}, {2, 4}, {1, 1}, {0, 0}, 8, 8, false, "", 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}} diff --git 
a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_depthwise_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_depthwise_test.cpp index 27bd241..f7c1368 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_depthwise_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_depthwise_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_eltwise_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_eltwise_test.cpp index e1d288d..38f95ca 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_eltwise_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_eltwise_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -19,10 +19,14 @@ using namespace mkldnn; struct eltwise_test_params { // Formats: NCHW, NCDHW - vector dims; + vector dims1; + vector dims2; + vector dims3; enum opType { - Sum = 0, Prod = 1, Max = 2 + Sum = 0, Prod, Max, Min, Sub, Div, Squared_diff, Floor_mod, Pow, + Logical_AND, Logical_OR, Logical_XOR, + Less, Less_equal, Greater, Greater_equal, Equal, Not_equal }; opType op; @@ -55,74 +59,235 @@ void ref_eltwise(const std::vector> &src, Inferen data_t *dst_data = dst.data(); const data_t *src_data = src[0].readOnly(); + auto& dims = dst.getTensorDesc().getDims(); + auto& dims0 = src[0].dims(); - for (int i = 0; i < src[0].size(); i++) { - switch (prm.op) { - case eltwise_test_params::Sum: { - dst_data[i] = scales[0]*src_data[i]; + int offset_in[5] = {1, 1, 1, 1, 1}; + int offset_out[5] = {1, 1, 1, 1, 1}; + + for (int i = 0; i < dims0.size(); i++) + offset_in[5 - dims0.size() + i] = dims0[i]; + for (int i = 0; i < dims.size(); i++) + offset_out[5 - dims.size() + i] = dims[i]; + + unsigned long j = 0, k = 0; + + for (int i0 = 0; i0 < offset_out[0]; i0++) { + if (i0 > offset_in[0] - 1) { + k -= offset_in[1]*offset_in[2]*offset_in[3]*offset_in[4]; + } + for (int i1 = 0; i1 < offset_out[1]; i1++) { + if (i1 > offset_in[1] - 1) { + k -= offset_in[2]*offset_in[3]*offset_in[4]; } - break; - default: { - dst_data[i] = src_data[i]; + for (int i2 = 0; i2 < offset_out[2]; i2++) { + if (i2 > offset_in[2] - 1) { + k -= offset_in[3]*offset_in[4]; + } + for (int i3 = 0; i3 < offset_out[3]; i3++) { + if (i3 > offset_in[3] - 1) { + k -= offset_in[4]; + } + for (int i4 = 0; i4 < offset_out[4]; i4++) { + if (i4 > offset_in[4] - 1) { + k -= 1; + } + if (prm.op == eltwise_test_params::Sum) { + dst_data[j++] = scales[0] * src_data[k++]; + } else { + dst_data[j++] = src_data[k++]; + } + } + } } } } for (int n = 1; n < src.size(); n++) { + j = 0; + k = 0; src_data = src[n].readOnly(); - - for (int i = 0; i < src[n].size(); i++) { - switch (prm.op) { - case eltwise_test_params::Sum: { - dst_data[i] += scales[n]*src_data[i]; - } - break; - - case eltwise_test_params::Prod: { - dst_data[i] *= src_data[i]; + auto& dims1 = src[n].dims(); + int offset_in1[5] = {1, 1, 1, 1, 1}; + for (int i = 0; i < dims1.size(); i++) + offset_in1[5 - dims1.size() + i] = dims1[i]; + + for (int i0 = 0; i0 < offset_out[0]; i0++) { + if (i0 > offset_in1[0] - 1) { + k -= offset_in1[1]*offset_in1[2]*offset_in1[3]*offset_in1[4]; + } + for (int i1 = 0; 
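/* Annotation: this five-deep nest implements numpy-style broadcasting over
   right-aligned axes. j walks the output tensor linearly while k walks the
   input; whenever the output coordinate on some axis steps past the input's
   extent (only possible for size-1 input axes in a valid broadcast), k is
   rewound by the product of the remaining input extents so the same input
   elements are reused. Worked example, input {1, 3} against output {2, 3}:
   offset_in = {1,1,1,1,3}, offset_out = {1,1,1,2,3}; after the first output
   row k has advanced to 3, the i3 > 0 step subtracts offset_in[4] == 3, and
   the second row reads input elements 0..2 again. */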
i1 < offset_out[1]; i1++) { + if (i1 > offset_in1[1] - 1) { + k -= offset_in1[2]*offset_in1[3]*offset_in1[4]; } - break; - - case eltwise_test_params::Max: { - dst_data[i] = (std::max)(dst_data[i], src_data[i]); + for (int i2 = 0; i2 < offset_out[2]; i2++) { + if (i2 > offset_in1[2] - 1) { + k -= offset_in1[3]*offset_in1[4]; + } + for (int i3 = 0; i3 < offset_out[3]; i3++) { + if (i3 > offset_in1[3] - 1) { + k -= offset_in1[4]; + } + for (int i4 = 0; i4 < offset_out[4]; i4++, j++, k++) { + if (i4 > offset_in1[4] - 1) { + k -= 1; + } + switch (prm.op) { + case eltwise_test_params::Sum: + dst_data[j] += scales[n] * src_data[k]; + break; + case eltwise_test_params::Sub: + dst_data[j] = dst_data[j] - src_data[k]; + break; + case eltwise_test_params::Min: + dst_data[j] = (std::min)(dst_data[j], src_data[k]); + break; + case eltwise_test_params::Max: + dst_data[j] = (std::max)(dst_data[j], src_data[k]); + break; + case eltwise_test_params::Prod: + dst_data[j] = dst_data[j] * src_data[k]; + break; + case eltwise_test_params::Div: + dst_data[j] = dst_data[j] / src_data[k]; + break; + case eltwise_test_params::Squared_diff: + dst_data[j] = (dst_data[j] - src_data[k]) * (dst_data[j] - src_data[k]); + break; + case eltwise_test_params::Logical_OR: + dst_data[j] = dst_data[j] || src_data[k]; + break; + case eltwise_test_params::Logical_AND: + dst_data[j] = dst_data[j] && src_data[k]; + break; + case eltwise_test_params::Logical_XOR: + dst_data[j] = (dst_data[j] || src_data[k]) - (dst_data[j] && src_data[k]); + break; + case eltwise_test_params::Less: + dst_data[j] = dst_data[j] < src_data[k]; + break; + case eltwise_test_params::Less_equal: + dst_data[j] = dst_data[j] <= src_data[k]; + break; + case eltwise_test_params::Greater: + dst_data[j] = dst_data[j] > src_data[k]; + break; + case eltwise_test_params::Greater_equal: + dst_data[j] = dst_data[j] >= src_data[k]; + break; + case eltwise_test_params::Equal: + dst_data[j] = dst_data[j] == src_data[k]; + break; + case eltwise_test_params::Not_equal: + dst_data[j] = dst_data[j] != src_data[k]; + break; + case eltwise_test_params::Pow: + dst_data[j] = std::pow(dst_data[j], src_data[k]); + break; + case eltwise_test_params::Floor_mod: + dst_data[j] = dst_data[j] - dst_data[j] / src_data[k] * src_data[k]; + break; + } + } + } } - break; } } } } -class MKLDNNGraphEltwiseTests: public TestsCommon, +std::string select_op(eltwise_test_params::opType op) { + std::string str_op; + switch(op){ + case eltwise_test_params::opType::Sum: + str_op = "sum"; + break; + case eltwise_test_params::opType::Prod: + str_op = "prod"; + break; + case eltwise_test_params::opType::Max: + str_op = "max"; + break; + case eltwise_test_params::opType::Min: + str_op = "min"; + break; + case eltwise_test_params::opType::Sub: + str_op = "sub"; + break; + case eltwise_test_params::opType::Div: + str_op = "div"; + break; + case eltwise_test_params::opType::Squared_diff: + str_op = "squared_diff"; + break; + case eltwise_test_params::opType::Logical_AND: + str_op = "logical_and"; + break; + case eltwise_test_params::opType::Logical_OR: + str_op = "logical_or"; + break; + case eltwise_test_params::opType::Logical_XOR: + str_op = "logical_xor"; + break; + case eltwise_test_params::opType ::Less: + str_op = "less"; + break; + case eltwise_test_params::opType::Less_equal: + str_op = "less_equal"; + break; + case eltwise_test_params::opType::Greater: + str_op = "greater"; + break; + case eltwise_test_params::opType::Greater_equal: + str_op = "greater_equal"; + break; + case 
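/* Annotation: select_op() maps the test enum onto the operation name string
   that getModel() substitutes for the _OP_ placeholder in the IR template,
   so each string must match what the eltwise layer parser accepts. Usage
   sketch:
     std::string op = select_op(eltwise_test_params::opType::Squared_diff);
     // op == "squared_diff"
     REPLACE_WITH_STR(model, "_OP_", op);
*/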
eltwise_test_params::opType::Equal: + str_op = "equal"; + break; + case eltwise_test_params::opType::Not_equal: + str_op = "not_equal"; + break; + case eltwise_test_params::opType::Pow: + str_op = "pow"; + break; + case eltwise_test_params::opType::Floor_mod: + str_op = "floor_mod"; + break; + } + return str_op; +} + +class MKLDNNGraphEltwise3InputsTests: public TestsCommon, public WithParamInterface { std::string model_t = R"V0G0N( - __SRC_DIMS__ + __SRC_DIMS_1__ - __SRC_DIMS__ + __SRC_DIMS_2__ - __SRC_DIMS__ + __SRC_DIMS_3__ - __SRC_DIMS__ + __SRC_DIMS_1__ - __SRC_DIMS__ + __SRC_DIMS_2__ - __SRC_DIMS__ + __SRC_DIMS_3__ @@ -142,22 +307,40 @@ class MKLDNNGraphEltwiseTests: public TestsCommon, protected: std::string getModel(eltwise_test_params p) { std::string model = model_t; - std::string op; - - if (p.op == 0) { - op = "sum"; - } else if (p.op == 1) { - op = "mul"; - } else if (p.op == 2) { - op = "max"; + std::string op = select_op(p.op); + + std::string src_dims1; + for (auto &dim : p.dims1) { + src_dims1 += "\n "; + src_dims1 += std::to_string(dim) + ""; + } + REPLACE_WITH_STR(model, "__SRC_DIMS_1__", src_dims1); + + std::string src_dims2; + for (auto &dim : p.dims2) { + src_dims2 += "\n "; + src_dims2 += std::to_string(dim) + ""; } + REPLACE_WITH_STR(model, "__SRC_DIMS_2__", src_dims2); + + std::string src_dims3; + for (auto &dim : p.dims3) { + src_dims3 += "\n "; + src_dims3 += std::to_string(dim) + ""; + } + REPLACE_WITH_STR(model, "__SRC_DIMS_3__", src_dims3); std::string src_dims; - for (auto& dim : p.dims) { - src_dims += "\n "; - src_dims += std::to_string(dim) + ""; + std::vector dims = p.dims1; + for (int i = 0; i < dims.size(); i++) { + dims[i] = std::max(p.dims1[i], p.dims2[i]); + dims[i] = std::max(dims[i], p.dims3[i]); + } + for (auto &dim : dims) { + src_dims += "\n "; + src_dims += std::to_string(dim) + ""; } - REPLACE_WITH_STR(model, "__SRC_DIMS__", src_dims); + REPLACE_WITH_STR(model, "__SRC_DIMS__", src_dims); std::string scale; if (!p.scales.empty()) { @@ -165,6 +348,7 @@ protected: } REPLACE_WITH_STR(model, "_OP_", op); REPLACE_WITH_STR(model, "_COEFF_", scale); + return model; } @@ -194,43 +378,61 @@ protected: ASSERT_EQ(p.selectedType, nodes[i]->getSelectedPrimitiveDescriptor()->getImplementationType()); } } - - InferenceEngine::SizeVector dims_src = p.dims; - InferenceEngine::Layout layout = InferenceEngine::ANY; - switch (p.dims.size()) { + InferenceEngine::SizeVector dims_src1 = p.dims1; + InferenceEngine::Layout layout1 = InferenceEngine::ANY; + switch (p.dims1.size()) { + case 4: + layout1 = InferenceEngine::NCHW; + break; + case 5: + layout1 = InferenceEngine::NCDHW; + break; + } + InferenceEngine::SizeVector dims_src2 = p.dims2; + InferenceEngine::Layout layout2 = InferenceEngine::ANY; + switch (p.dims2.size()) { case 4: - layout = InferenceEngine::NCHW; + layout2 = InferenceEngine::NCHW; break; case 5: - layout = InferenceEngine::NCDHW; + layout2 = InferenceEngine::NCDHW; + break; + } + InferenceEngine::SizeVector dims_src3 = p.dims3; + InferenceEngine::Layout layout3 = InferenceEngine::ANY; + switch (p.dims3.size()) { + case 4: + layout3 = InferenceEngine::NCHW; + break; + case 5: + layout3 = InferenceEngine::NCDHW; break; } - InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, layout, dims_src); + InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, layout1, dims_src1); src1->allocate(); InferenceEngine::TBlob* srcPtr1 = dynamic_cast*>(src1.get()); if 
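/* Annotation: the three inputs are deliberately filled with sine waves that
   share the same range but differ in the last argument (1, 2, 3 below), so
   no two inputs are identical and element-wise references such as Sub or
   Div do not collapse to trivial all-zero or all-one outputs. The reading of
   fill_data_sine(buffer, size, center, amplitude, omega) is inferred from
   the call sites, not from a documented signature. */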
(srcPtr1 == nullptr) FAIL() << "Cannot cast blob to TBlob."; - - fill_data(src1->buffer(), src1->size()); - InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, layout, dims_src); + fill_data_sine(src1->buffer(), src1->size(), 0.1, 0.9, 1); + InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, layout2, dims_src2); src2->allocate(); InferenceEngine::TBlob* srcPtr2 = dynamic_cast*>(src2.get()); if (srcPtr2 == nullptr) FAIL() << "Cannot cast blob to TBlob."; - fill_data(src2->buffer(), src2->size()); - InferenceEngine::Blob::Ptr src3 = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, layout, dims_src); + fill_data_sine(src2->buffer(), src2->size(), 0.1, 0.9, 2); + InferenceEngine::Blob::Ptr src3 = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, layout3, dims_src3); src3->allocate(); InferenceEngine::TBlob* srcPtr3 = dynamic_cast*>(src3.get()); if (srcPtr3 == nullptr) FAIL() << "Cannot cast blob to TBlob."; - fill_data(src3->buffer(), src3->size()); + fill_data_sine(src3->buffer(), src3->size(), 0.1, 0.9, 3); InferenceEngine::BlobMap srcs; srcs.insert(std::pair("in1", src1)); srcs.insert(std::pair("in2", src2)); @@ -263,33 +465,35 @@ protected: } }; -TEST_P(MKLDNNGraphEltwiseTests, TestsEltwise) {} +TEST_P(MKLDNNGraphEltwise3InputsTests, TestsEltwise) {} INSTANTIATE_TEST_CASE_P( - TestsEltwise, MKLDNNGraphEltwiseTests, + TestsEltwise, MKLDNNGraphEltwise3InputsTests, ::testing::Values( - eltwise_test_params{{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref, { + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref, { [](MKLDNNPlugin::PrimitiveDescInfo impl) { ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType()); ASSERT_EQ(3, impl.getConfig().inConfs.size()); ASSERT_EQ(1, impl.getConfig().outConfs.size()); ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout()); ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout()); + ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout()); ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout()); } } }, - eltwise_test_params{{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.0,1.0,1.0", 3, MKLDNNPlugin::impl_desc_type::ref, { + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.0,1.0,1.0", 3, MKLDNNPlugin::impl_desc_type::ref, { [](MKLDNNPlugin::PrimitiveDescInfo impl) { ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType()); ASSERT_EQ(3, impl.getConfig().inConfs.size()); ASSERT_EQ(1, impl.getConfig().outConfs.size()); ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout()); ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout()); + ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout()); ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout()); } } }, - eltwise_test_params{{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.5,0.5,-2.0", 3, MKLDNNPlugin::impl_desc_type::ref, { + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.5,0.5,-2.0", 3, MKLDNNPlugin::impl_desc_type::ref, { 
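/* Annotation: each entry of comp is a callback receiving one supported
   primitive descriptor; SetUp() invokes comp[j] on the j-th descriptor, so
   the lambdas below pin down, per descriptor, the number of input/output
   configs and the expected layout (NCHW or NCDHW) chosen by the ref
   implementation. */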
[](MKLDNNPlugin::PrimitiveDescInfo impl) { ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType()); ASSERT_EQ(3, impl.getConfig().inConfs.size()); @@ -300,7 +504,7 @@ INSTANTIATE_TEST_CASE_P( ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout()); } } }, - eltwise_test_params{{1, 3, 3, 3}, eltwise_test_params::opType::Prod, "", 3, MKLDNNPlugin::impl_desc_type::ref, { + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Prod, "", 3, MKLDNNPlugin::impl_desc_type::ref, { [](MKLDNNPlugin::PrimitiveDescInfo impl) { ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType()); ASSERT_EQ(3, impl.getConfig().inConfs.size()); @@ -311,7 +515,7 @@ INSTANTIATE_TEST_CASE_P( ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout()); } } }, - eltwise_test_params{{1, 3, 3, 3}, eltwise_test_params::opType::Max, "", 3, MKLDNNPlugin::impl_desc_type::ref, { + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Max, "", 3, MKLDNNPlugin::impl_desc_type::ref, { [](MKLDNNPlugin::PrimitiveDescInfo impl) { ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType()); ASSERT_EQ(3, impl.getConfig().inConfs.size()); @@ -322,7 +526,7 @@ INSTANTIATE_TEST_CASE_P( ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout()); } } }, - eltwise_test_params{{1, 32, 16, 16, 16}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref, { + eltwise_test_params{{1, 32, 16, 16, 16},{1, 32, 16, 16, 16},{1, 32, 16, 16, 16}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref, { [](MKLDNNPlugin::PrimitiveDescInfo impl) { ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType()); ASSERT_EQ(3, impl.getConfig().inConfs.size()); @@ -332,17 +536,258 @@ INSTANTIATE_TEST_CASE_P( ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().inConfs.at(2).desc.getLayout()); ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().outConfs.at(0).desc.getLayout()); } - } } + } }, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Min, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sub, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Div, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_AND, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_OR, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_XOR, "", 3, MKLDNNPlugin::impl_desc_type::ref} )); + +class MKLDNNGraphEltwise2InputsTests: public TestsCommon, + public WithParamInterface { + std::string model_t = R"V0G0N( + + + + + __SRC_DIMS_1__ + + + + + + __SRC_DIMS_2__ + + + + + + + __SRC_DIMS_1__ + + __SRC_DIMS_2__ + + + + __SRC_DIMS__ + + + + + + + + + +)V0G0N"; -class MKLDNNGraphDynBatchEltwiseTests: public MKLDNNGraphEltwiseTests { protected: + std::string getModel(eltwise_test_params p) { + std::string model = model_t; + std::string op = select_op(p.op); + + std::string src_dims1; + for (auto &dim : p.dims1) { + src_dims1 += "\n "; + 
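/* Annotation: __SRC_DIMS__, the declared output shape, is computed below by
   right-aligning the two input ranks and taking the per-axis maximum over
   the overlap, i.e. the standard broadcast rule; for example dims1 =
   {1, 3, 3, 3} and dims2 = {1, 3} align as {1, 3, 3, 3} vs {-, -, 1, 3} and
   yield {1, 3, 3, 3}. */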
src_dims1 += std::to_string(dim) + ""; + } + REPLACE_WITH_STR(model, "__SRC_DIMS_1__", src_dims1); + + std::string src_dims2; + for (auto &dim : p.dims2) { + src_dims2 += "\n "; + src_dims2 += std::to_string(dim) + ""; + } + REPLACE_WITH_STR(model, "__SRC_DIMS_2__", src_dims2); + + std::string src_dims; + std::vector dims = (p.dims1.size() >= p.dims2.size()) ? p.dims1 : p.dims2; + int i = dims.size() - 1, j = p.dims1.size() - 1, k = p.dims2.size() - 1; + for (; j >= 0 && k >= 0; i--, j--, k-- ) { + dims[i] = std::max(p.dims1[j], p.dims2[k]); + } + + for (auto &dim : dims) { + src_dims += "\n "; + src_dims += std::to_string(dim) + ""; + } + REPLACE_WITH_STR(model, "__SRC_DIMS__", src_dims); + + std::string scale; + if (!p.scales.empty()) { + scale = std::string("coeff=\"") + p.scales + std::string("\""); + } + REPLACE_WITH_STR(model, "_OP_", op); + REPLACE_WITH_STR(model, "_COEFF_", scale); + + return model; + } + + virtual void TearDown() { + } + virtual void SetUp() { try { TestsCommon::SetUp(); eltwise_test_params p = ::testing::WithParamInterface::GetParam(); std::string model = getModel(p); - size_t MB = p.dims[0]; + + InferenceEngine::CNNNetReader net_reader; + ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length())); + + MKLDNNGraphTestClass graph; + graph.CreateGraph(net_reader.getNetwork()); + + auto& nodes = graph.getNodes(); + for (int i = 0; i < nodes.size(); i++) { + if (nodes[i]->getType() == MKLDNNPlugin::Eltwise) { + ASSERT_EQ(p.num_prim_desc, nodes[i]->getSupportedPrimitiveDescriptors().size()); + for (size_t j = 0; j < p.num_prim_desc && j < p.comp.size(); j++) { + p.comp.at(j)(nodes[i]->getSupportedPrimitiveDescriptors().at(j)); + } + ASSERT_NE(nullptr, nodes[i]->getSelectedPrimitiveDescriptor()); + ASSERT_EQ(p.selectedType, nodes[i]->getSelectedPrimitiveDescriptor()->getImplementationType()); + } + } + InferenceEngine::SizeVector dims_src1 = p.dims1; + InferenceEngine::Layout layout1 = InferenceEngine::ANY; + switch (p.dims1.size()) { + case 4: + layout1 = InferenceEngine::NCHW; + break; + case 5: + layout1 = InferenceEngine::NCDHW; + break; + } + InferenceEngine::SizeVector dims_src2 = p.dims2; + InferenceEngine::Layout layout2 = InferenceEngine::ANY; + switch (p.dims2.size()) { + case 4: + layout2 = InferenceEngine::NCHW; + break; + case 5: + layout2 = InferenceEngine::NCDHW; + break; + } + + InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, layout1, dims_src1); + src1->allocate(); + + InferenceEngine::TBlob* srcPtr1 = dynamic_cast*>(src1.get()); + + if (srcPtr1 == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + fill_data_sine(src1->buffer(), src1->size(), 0.1, 0.9, 1); + InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, layout2, dims_src2); + src2->allocate(); + + InferenceEngine::TBlob* srcPtr2 = dynamic_cast*>(src2.get()); + + if (srcPtr2 == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + fill_data_sine(src2->buffer(), src2->size(), 0.1, 0.9, 2); + + InferenceEngine::BlobMap srcs; + srcs.insert(std::pair("in1", src1)); + srcs.insert(std::pair("in2", src2)); + + InferenceEngine::OutputsDataMap out; + out = net_reader.getNetwork().getOutputsInfo(); + InferenceEngine::BlobMap outputBlobs; + + std::pair item = *out.begin(); + + InferenceEngine::TBlob::Ptr output; + output = InferenceEngine::make_shared_blob(item.second->getTensorDesc()); + output->allocate(); + outputBlobs[item.first] = output; + + graph.Infer(srcs, outputBlobs); + + 
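/* Annotation: verification is infer-then-reference: the graph output is
   produced first, ref_eltwise() then recomputes the expected tensor from the
   raw inputs with the scalar loops above, and compare() checks the two with
   an absolute tolerance of 0.0005f, loose enough to absorb rounding
   differences between the optimized kernels and the reference loop. */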
InferenceEngine::TBlob dst_ref(item.second->getTensorDesc()); + dst_ref.allocate(); + + std::vector> src_vec = {*srcPtr1, *srcPtr2}; + + ref_eltwise(src_vec, dst_ref, p); + + compare(*output, dst_ref, 0.0005f); + } catch (const InferenceEngine::details::InferenceEngineException &e) { + FAIL() << e.what(); + } + } + +}; + +TEST_P(MKLDNNGraphEltwise2InputsTests, TestsEltwise) {} + +INSTANTIATE_TEST_CASE_P( + TestsEltwise, MKLDNNGraphEltwise2InputsTests, + ::testing::Values( + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Prod, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Max, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Min, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sub, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Div, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Squared_diff, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Logical_AND, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Logical_OR, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Logical_XOR, "", 3, MKLDNNPlugin::impl_desc_type::ref} + )); + +INSTANTIATE_TEST_CASE_P( + TestsBroadcasting, MKLDNNGraphEltwise2InputsTests, + ::testing::Values( + eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Prod, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Max, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Min, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Sub, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Div, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Squared_diff, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Logical_AND, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Logical_OR, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 1, 3},{1, 1, 3, 3},{}, eltwise_test_params::opType::Logical_XOR, "", 3, MKLDNNPlugin::impl_desc_type::ref} + )); + +INSTANTIATE_TEST_CASE_P( + TestsDiffDims, MKLDNNGraphEltwise2InputsTests, + ::testing::Values( + eltwise_test_params{{1},{1, 3},{}, eltwise_test_params::opType::Sum, "", 1, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3},{1},{}, eltwise_test_params::opType::Sum, "", 1, 
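/* Annotation: the integer before the implementation type is the expected
   number of supported primitive descriptors; in this table it grows with
   the broadcast output rank (1 for the 2D cases, 2 for 3D, 3 for 4D and
   5D), presumably one descriptor per layout variant the eltwise node can
   offer at that rank. */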
MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1},{1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3},{1},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3},{1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3},{1, 3},{}, eltwise_test_params::opType::Sum, "", 2, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1},{}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3},{}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1},{1, 3, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3, 3},{1},{}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3},{1, 3, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3, 3},{1, 3},{}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3},{1, 3, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3, 3},{1, 3, 3},{}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3, 3},{1, 3, 3, 3},{}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref} + )); + +class MKLDNNGraphEltwiseDynBatchTests: public MKLDNNGraphEltwise3InputsTests { +protected: + virtual void SetUp() { + try { + TestsCommon::SetUp(); + eltwise_test_params p = ::testing::WithParamInterface::GetParam(); + std::string model = getModel(p); + size_t MB = p.dims1[0]; if (MB < 2) MB = 2; @@ -359,18 +804,38 @@ protected: graph.setProperty({{InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_ENABLED, InferenceEngine::PluginConfigParams::YES}}); graph.CreateGraph(net_reader.getNetwork()); - InferenceEngine::SizeVector dims_src = p.dims; - InferenceEngine::Layout layout = InferenceEngine::ANY; - switch (p.dims.size()) { + InferenceEngine::SizeVector dims_src1 = p.dims1; + InferenceEngine::Layout layout1 = InferenceEngine::ANY; + switch (p.dims1.size()) { case 4: - layout = InferenceEngine::NCHW; + layout1 = InferenceEngine::NCHW; break; case 5: - layout = InferenceEngine::NCDHW; + layout1 = InferenceEngine::NCDHW; + break; + } + InferenceEngine::SizeVector dims_src2 = p.dims2; + InferenceEngine::Layout layout2 = InferenceEngine::ANY; + switch (p.dims2.size()) { + case 4: + layout2 = InferenceEngine::NCHW; + break; + case 5: + layout2 = InferenceEngine::NCDHW; + break; + } + InferenceEngine::SizeVector dims_src3 = p.dims3; + InferenceEngine::Layout 
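/* Annotation: the dynamic-batch variant forces the model batch to at least
   2 and creates the graph with KEY_DYN_BATCH_ENABLED=YES; as in the other
   DynBatch suites in this directory, the intent is to check that eltwise
   still produces correct per-sample results when the effective batch is
   changed at run time rather than fixed at graph-build time. */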
layout3 = InferenceEngine::ANY; + switch (p.dims3.size()) { + case 4: + layout3 = InferenceEngine::NCHW; + break; + case 5: + layout3 = InferenceEngine::NCDHW; break; } - InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, layout, dims_src); + InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, layout1, dims_src1); src1->allocate(); InferenceEngine::TBlob* srcPtr1 = dynamic_cast*>(src1.get()); @@ -379,7 +844,7 @@ protected: FAIL() << "Cannot cast blob to TBlob."; fill_data(src1->buffer(), src1->size()); - InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, layout, dims_src); + InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, layout2, dims_src2); src2->allocate(); InferenceEngine::TBlob* srcPtr2 = dynamic_cast*>(src2.get()); @@ -387,7 +852,7 @@ protected: if (srcPtr2 == nullptr) FAIL() << "Cannot cast blob to TBlob."; fill_data(src2->buffer(), src2->size()); - InferenceEngine::Blob::Ptr src3 = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, layout, dims_src); + InferenceEngine::Blob::Ptr src3 = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, layout3, dims_src3); src3->allocate(); InferenceEngine::TBlob* srcPtr3 = dynamic_cast*>(src3.get()); @@ -424,17 +889,24 @@ protected: } }; -TEST_P(MKLDNNGraphDynBatchEltwiseTests, TestsDynBatchEltwise) {} +TEST_P(MKLDNNGraphEltwiseDynBatchTests, TestsDynBatchEltwise) {} INSTANTIATE_TEST_CASE_P( - TestsDynBatchEltwise, MKLDNNGraphDynBatchEltwiseTests, + TestsDynBatchEltwise, MKLDNNGraphEltwiseDynBatchTests, ::testing::Values( - eltwise_test_params{{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref}, - eltwise_test_params{{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.0,1.0,1.0", 3, MKLDNNPlugin::impl_desc_type::ref}, - eltwise_test_params{{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.5,0.5,-2.0", 3, MKLDNNPlugin::impl_desc_type::ref}, - eltwise_test_params{{1, 3, 3, 3}, eltwise_test_params::opType::Prod, "", 3, MKLDNNPlugin::impl_desc_type::ref}, - eltwise_test_params{{1, 3, 3, 3}, eltwise_test_params::opType::Max, "", 3, MKLDNNPlugin::impl_desc_type::ref})); - + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.0,1.0,1.0", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sum, "1.5,0.5,-2.0", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Prod, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Max, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Sub, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Min, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Div, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Pow, 
"", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_AND, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_OR, "", 3, MKLDNNPlugin::impl_desc_type::ref}, + eltwise_test_params{{1, 3, 3, 3},{1, 3, 3, 3},{1, 3, 3, 3}, eltwise_test_params::opType::Logical_XOR, "", 3, MKLDNNPlugin::impl_desc_type::ref} + )); struct precisions_test_2params { struct { @@ -551,4 +1023,3 @@ INSTANTIATE_TEST_CASE_P( precisions_test_2params{ {"FP32", "U8"}, 5, 1 }, precisions_test_2params{ { "U8", "U8"}, 6, 2 } )); - diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_fullyconnected_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_fullyconnected_test.cpp index dcf001f..4b74d64 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_fullyconnected_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_fullyconnected_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_gemm_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_gemm_test.cpp index 8a2acf0..5921872 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_gemm_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_gemm_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_input_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_input_test.cpp index 1c1d76d..e5c1479 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_input_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_input_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -359,3 +359,114 @@ protected: }; TEST_F(MKLDNNGraphConstInputTests, TestsConstInput) {} + + +struct input_layout_test_params { + InferenceEngine::Layout layout; + std::vector reference; + MKLDNNPlugin::impl_desc_type selectedType; + std::vector> comp; +}; + +class MKLDNNGraphInputLayoutTest : public TestsCommon, public WithParamInterface { + std::string model_t = R"V0G0N( + + + + + + 1 + 3 + 2 + 2 + + + + + + + + 1 + 3 + 2 + 2 + + + + + 1 + 3 + 2 + 2 + + + + + + + + + + + + + + + + + + + +)V0G0N"; + +protected: + virtual void TearDown() { + } + + virtual void SetUp() { + try { + TestsCommon::SetUp(); + input_layout_test_params p = ::testing::WithParamInterface::GetParam(); + std::string model = model_t; + + InferenceEngine::CNNNetReader net_reader; + ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length())); + + MKLDNNGraphTestClass graph; + graph.CreateGraph(net_reader.getNetwork()); + + InferenceEngine::TensorDesc desc(InferenceEngine::Precision::FP32, { 1, 3, 2, 2 }, p.layout); + InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob(desc); + src->allocate(); + fill_data_dbgval(src->buffer(), src->size()); + InferenceEngine::BlobMap srcs; + 
srcs.insert(std::pair("input", src)); + + InferenceEngine::OutputsDataMap out = net_reader.getNetwork().getOutputsInfo(); + std::pair item = *out.begin(); + InferenceEngine::TBlob::Ptr output; + output = InferenceEngine::make_shared_blob(item.second->getTensorDesc()); + output->allocate(); + InferenceEngine::BlobMap outputBlobs; + outputBlobs[item.first] = output; + + graph.Infer(srcs, outputBlobs); + // Check results + if (memcmp((*output).data(), &p.reference[0], p.reference.size()) != 0) + FAIL() << "Wrong result with compare reference!"; + } + catch (const InferenceEngine::details::InferenceEngineException &e) { + FAIL() << e.what(); + } + } +}; + +TEST_P(MKLDNNGraphInputLayoutTest, TestsLayoutInput) {} + +INSTANTIATE_TEST_CASE_P( + TestsLayoutInput, MKLDNNGraphInputLayoutTest, + ::testing::Values( + input_layout_test_params{ InferenceEngine::NCHW, { 0,1,2,3,3,4,5,6,6,7,8,9 }, MKLDNNPlugin::impl_desc_type::unknown }, + input_layout_test_params{ InferenceEngine::NHWC, { 0,0,0,3,3,3,6,6,6,9,9,9 }, MKLDNNPlugin::impl_desc_type::unknown } +)); + diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_leaks_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_leaks_test.cpp index 793e3d4..885cea5 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_leaks_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_leaks_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_lrn_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_lrn_test.cpp index 873bae5..a36717c 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_lrn_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_lrn_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_permute_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_permute_test.cpp index a40add1..492f8e5 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_permute_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_permute_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_pooling_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_pooling_test.cpp index a1ee6bd..8286c01 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_pooling_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_pooling_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_power_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_power_test.cpp index 83cde28..1ea16ef 100644 --- 
a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_power_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_power_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_relu_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_relu_test.cpp index ce860c2..a55e731 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_relu_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_relu_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_reorder_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_reorder_test.cpp index c7c13ad..ab915d3 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_reorder_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_reorder_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_reshape_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_reshape_test.cpp index d85aaa5..3304a33 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_reshape_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_reshape_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_roi_pooling_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_roi_pooling_test.cpp index 1706f57..0c61255 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_roi_pooling_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_roi_pooling_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_simplernms_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_simplernms_test.cpp index 7109bdc..8e53244 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_simplernms_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_simplernms_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_softmax_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_softmax_test.cpp index 1675b09..e740124 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_softmax_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_softmax_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 
2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_split_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_split_test.cpp index e253a82..a3fe7d8 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_split_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_split_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_tile_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_tile_test.cpp index 4bb207d..5d7ba11 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_tile_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_tile_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_conv_depthwise_fusing_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_conv_depthwise_fusing_test.cpp index 2974b37..330db7b 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_conv_depthwise_fusing_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_conv_depthwise_fusing_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_deconv_concat_tets.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_deconv_concat_tets.cpp new file mode 100644 index 0000000..b348610 --- /dev/null +++ b/inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_deconv_concat_tets.cpp @@ -0,0 +1,400 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "mkldnn_plugin/mkldnn_graph.h" + +#include "test_graph.hpp" + +#include "single_layer_common.hpp" +#include +#include "tests_common.hpp" +#include "ir_gen_helper.hpp" + +using namespace ::testing; +using namespace std; +using namespace mkldnn; +using namespace single_layer_tests; + +struct concat_params { + size_t axis; +}; + +struct deconv_concat_params { + // Formats: NCHW, NCDHW + std::vector in; + + conv_common_params deconv; + concat_params concat; + + std::vector preferTypes; +}; + +void ref_deconv_common(const InferenceEngine::Blob &src, + InferenceEngine::Blob &dst, + const float *weights_data, + size_t weights_size, + const float *bias_data, + size_t bias_size, + const conv_common_params &prm) { + auto dims_size = src.dims().size(); + + size_t G = prm.group; + size_t KW = prm.kernel[InferenceEngine::X_AXIS]; + size_t KH = prm.kernel[InferenceEngine::Y_AXIS]; + size_t KD = prm.kernel.size() > InferenceEngine::Z_AXIS ? prm.kernel[InferenceEngine::Z_AXIS] : 1u; + + size_t PW = prm.pads_begin[InferenceEngine::X_AXIS]; + size_t PH = prm.pads_begin[InferenceEngine::Y_AXIS]; + size_t PD = prm.pads_begin.size() > InferenceEngine::Z_AXIS ? 
prm.pads_begin[InferenceEngine::Z_AXIS] : 0u; + + size_t SW = prm.stride[InferenceEngine::X_AXIS]; + size_t SH = prm.stride[InferenceEngine::Y_AXIS]; + size_t SD = prm.stride.size() > InferenceEngine::Z_AXIS ? prm.stride[InferenceEngine::Z_AXIS] : 1u; + + size_t IW = src.dims()[dims_size - 1]; + size_t IH = src.dims()[dims_size - 2]; + size_t ID = dims_size == 5 ? src.dims()[dims_size - 3] : 1u; + size_t IC = src.dims()[1]; + size_t MB = src.dims()[0]; + + size_t OC = prm.out_c; + + size_t OW = SW * (IW - 1lu) + KW - 2lu * PW; + size_t OH = SH * (IH - 1lu) + KH - 2lu * PH; + size_t OD = dims_size == 5 ? (SD * (ID - 1) + KD - 2 * PD) : 1u; + + const float *src_data = src.cbuffer().as(); + float *dst_data = dst.buffer().as(); + + size_t CS1 = OH * OW; + size_t CS2 = CS1 * OD; + size_t CS3 = CS2 * OC; + + size_t CI1 = IH * IW; + size_t CI2 = CI1 * ID; + size_t CI3 = CI2 * IC; + + size_t OC_G = OC / G; + size_t IC_G = IC / G; + + size_t CK1 = KH * KW; + size_t CK2 = CK1 * KD; + size_t CK3 = CK2 * OC_G; + size_t CK4 = CK3 * IC_G; + + for (size_t g = 0lu; g < G; ++g) { + size_t g_OC_G = g * OC_G; + size_t g_IC_G = g * IC_G; + size_t g_CK4 = g * CK4; + for (size_t mb = 0lu; mb < MB; ++mb) { + size_t mb_CS3 = mb * CS3; + size_t mb_CI3 = mb * CI3; + for (size_t oc = 0lu; oc < OC_G; ++oc) { + size_t g_OC_G_oc = g_OC_G + oc; + size_t mb_CS3_g_OC_G_oc_CS2 = mb_CS3 + g_OC_G_oc * CS2; + size_t g_CK4_oc_CK2 = g_CK4 + oc * CK2; + for (size_t od = 0lu; od < OD; ++od) { + size_t mb_CS3_g_OC_G_oc_CS2_od_CS1 = mb_CS3_g_OC_G_oc_CS2 + od * CS1; + size_t od_PD = od + PD; + for (size_t oh = 0lu; oh < OH; ++oh) { + size_t mb_CS3_g_OC_G_oc_CS2_od_CS1_oh_OW = mb_CS3_g_OC_G_oc_CS2_od_CS1 + oh * OW; + size_t oh_PH = oh + PH; + for (size_t ow = 0lu; ow < OW; ++ow) { + size_t didx = mb_CS3_g_OC_G_oc_CS2_od_CS1_oh_OW + ow; + size_t ow_PW = ow + PW; + + dst_data[didx] = float(0); + if (prm.with_bias) dst_data[didx] += bias_data[g_OC_G_oc]; + + for (size_t ic = 0lu; ic < IC_G; ic++) { + size_t mb_CI3_g_IC_G_ic_CI2 = mb_CI3 + (g_IC_G + ic) * CI2; + size_t g_CK4_oc_CK2_ic_CK3 = g_CK4_oc_CK2 + ic * CK3; + for (int kd = 0lu; kd < KD; kd++) { + if (od_PD < kd) continue; + size_t id = od_PD - kd; + if (id % SD != 0) continue; + id /= SD; + if (id >= ID) continue; + size_t mb_CI3_g_IC_G_ic_CI2_id_CI1 = mb_CI3_g_IC_G_ic_CI2 + id * CI1; + size_t g_CK4_oc_CK2_ic_CK3_kd_CK1 = g_CK4_oc_CK2_ic_CK3 + kd * CK1; + for (size_t kh = 0lu; kh < KH; kh++) { + if (oh_PH < kh) continue; + size_t ih = oh_PH - kh; + if (ih % SH != 0) continue; + ih /= SH; + if (ih >= IH) continue; + size_t mb_CI3_g_IC_G_ic_CI2_id_CI1_ih_IW = mb_CI3_g_IC_G_ic_CI2_id_CI1 + ih * IW; + size_t g_CK4_oc_CK2_ic_CK3_kd_CK1_kh_KW = g_CK4_oc_CK2_ic_CK3_kd_CK1 + kh * KW; + for (size_t kw = 0lu; kw < KW; kw++) { + if (ow_PW < kw) continue; + size_t iw = ow_PW - kw; + if (iw % SW != 0) continue; + iw /= SW; + if (iw >= IW) continue; + + size_t sidx = mb_CI3_g_IC_G_ic_CI2_id_CI1_ih_IW + iw; + + size_t widx = g_CK4_oc_CK2_ic_CK3_kd_CK1_kh_KW + kw; + + dst_data[didx] += src_data[sidx] * weights_data[widx]; + } + } + } + } + } + } + } + } + } + } +} + +class MKLDNNDeconvConcatTests: public TestsCommon, + public WithParamInterface { + std::string layers_t = R"V0G0N( + + + + + __INP_DIMS__ + + + + + _IN_ + _OC_ + __DECONV_OUT_DIMS__ + + + + + + + + + + + _IN_ + _OC_ + __DECONV_OUT_DIMS__ + + + __INP_DIMS__ + + + + + __CONCAT_OUT_DIMS__ + + + +)V0G0N"; + + std::string edges_t = R"V0G0N( + + + +)V0G0N"; + + std::string getModel(deconv_concat_params p) { + std::string model = layers_t; 
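+    // The substitutions below size the Deconvolution output with the usual
+    // transposed-convolution relation (no output padding is modelled here):
+    //     out = stride * (in - 1) + kernel - 2 * pad
+    // e.g. in = 4, stride = 1, kernel = 1, pad = 0 gives out = 1 * 3 + 1 - 0 = 4,
+    // matching the OW/OH/OD computation in ref_deconv_common above.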
+ + std::string s_dims; + for (auto& dim : p.in) { + s_dims += "\n "; + s_dims += std::to_string(dim) + ""; + } + REPLACE_WITH_STR(model, "__INP_DIMS__", s_dims); + + s_dims = ""; + size_t deconv_axis_val = p.in[p.concat.axis]; + int k_len = p.deconv.kernel.size(); + for (size_t i = 2lu; i < p.in.size(); i++) { + size_t inx = k_len - i + 1; + size_t dim = p.deconv.stride[inx] * (p.in[i] - 1) + p.deconv.kernel[inx] - 2 * p.deconv.pads_begin[inx]; + s_dims += "\n "; + s_dims += std::to_string(dim) + ""; + if (i == p.concat.axis) { + deconv_axis_val = dim; + } + } + REPLACE_WITH_STR(model, "__DECONV_OUT_DIMS__", s_dims); + + s_dims = ""; + for (size_t i = 0lu; i < p.in.size(); i++) { + size_t val = p.in[i]; + if (i == p.concat.axis) { + val += deconv_axis_val; + } + s_dims += "\n "; + s_dims += std::to_string(val) + ""; + } + REPLACE_WITH_STR(model, "__CONCAT_OUT_DIMS__", s_dims); + + REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_K_", p.deconv.kernel); + REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_KS_", p.deconv.stride); + REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_PB_", p.deconv.pads_begin); + REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_PE_", p.deconv.pads_end); + REPLACE_WITH_NUM(model, "_GC_", p.deconv.group); + REPLACE_WITH_NUM(model, "_OC_", p.deconv.out_c); + REPLACE_WITH_NUM(model, "_IN_", p.in[0]); + REPLACE_WITH_NUM(model, "__AXIS__", p.concat.axis); + + std::string impls; + for (const auto& preferType : p.preferTypes) { + if (!impls.empty()) + impls += ","; + impls += "cpu:" + MKLDNNGraphTestClass::getStrPrimitiveDescriptorType(preferType); + } + REPLACE_WITH_STR(model, "_IMPLS_", impls); + + model = IRTemplateGenerator::getIRTemplate("Deconvolution_Concat", p.in, "FP32", model, edges_t); + + return model; + } + +protected: + virtual void TearDown() { + } + + virtual void SetUp() { + try { + TestsCommon::SetUp(); + deconv_concat_params p = ::testing::WithParamInterface::GetParam(); + std::string model = getModel(p); + + InferenceEngine::CNNNetReader net_reader; + ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length())); + + size_t blob_size = p.deconv.out_c * (p.in[1] / p.deconv.group); + for (int i = 0 ; i < p.deconv.kernel.size(); i++) { + blob_size *= p.deconv.kernel[i]; + } + InferenceEngine::SizeVector dims_weights = { blob_size }; + + std::vector blob_to_model; + InferenceEngine::Blob::Ptr weights = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, InferenceEngine::C, dims_weights); + weights->allocate(); + fill_data(weights->buffer().as(), weights->size()); + blob_to_model.push_back(weights); + + InferenceEngine::Blob::Ptr bias = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, InferenceEngine::C, {p.deconv.out_c}); + bias->allocate(); + fill_data(bias->buffer().as(), bias->size()); + blob_to_model.push_back(bias); + + size_t total_size_in_bytes = 0; + for (InferenceEngine::Blob::Ptr blb : blob_to_model) total_size_in_bytes += blb->byteSize(); + + InferenceEngine::TBlob::Ptr model_blob = + InferenceEngine::make_shared_blob(InferenceEngine::Precision::U8, InferenceEngine::C, {total_size_in_bytes}); + model_blob->allocate(); + uint8_t* model_blob_ptr = model_blob->buffer().as(); + for (InferenceEngine::Blob::Ptr blb : blob_to_model) { + memcpy(model_blob_ptr, blb->buffer().as(), blb->byteSize()); + model_blob_ptr += blb->byteSize(); + } + net_reader.SetWeights(model_blob); + + auto network = net_reader.getNetwork(); + MKLDNNGraphTestClass graph; + graph.CreateGraph(network); + + InferenceEngine::SizeVector dims_src = p.in; + + 
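+        // TensorDesc::getLayoutByDims maps a 4-D shape to NCHW and a 5-D shape to
+        // NCDHW, which is what lets one SetUp body drive both the 2-D and 3-D
+        // deconvolution cases below. Note that the weight/bias packing above works
+        // in bytes: byteSize() (not size()) positions each blob in the model blob.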
InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob( + InferenceEngine::Precision::FP32, InferenceEngine::TensorDesc::getLayoutByDims(p.in), dims_src); + src->allocate(); + fill_data(src->buffer(), src->size()); + + InferenceEngine::TBlob* srcPtr = dynamic_cast*>(src.get()); + + if (srcPtr == nullptr) + FAIL() << "Cannot cast blob to TBlob."; + + InferenceEngine::BlobMap srcs; + srcs.insert(std::pair("in1", src)); + + InferenceEngine::OutputsDataMap out; + out = network.getOutputsInfo(); + InferenceEngine::BlobMap outputBlobs; + + std::pair item = *out.begin(); + + InferenceEngine::TBlob::Ptr output; + output = InferenceEngine::make_shared_blob(item.second->getTensorDesc()); + output->allocate(); + outputBlobs[item.first] = output; + + graph.Infer(srcs, outputBlobs); + + // Compare with reference + + auto deconv = network.getLayerByName("Deconvolution_1"); + InferenceEngine::TBlob deconv_ref(deconv->outData[0]->getTensorDesc()); + deconv_ref.allocate(); + + ref_deconv_common(*srcPtr, deconv_ref, weights->buffer().as(), weights->size(), + bias->buffer().as(), bias->size(), p.deconv); + + float *src1_ptr = deconv_ref.buffer(); + size_t src1_size = deconv_ref.size(); + float *src2_ptr = src->buffer(); + size_t src2_size = src->size(); + float *dst_ptr = output->buffer(); + size_t dst_size = output->size(); + + int len1 = 1, len2 = 1; + for (int dim = p.concat.axis; dim < output->dims().size(); dim++) { + len1 *= deconv->outData[0]->getTensorDesc().getDims()[dim]; + len2 *= src->dims()[dim]; + } + + size_t index1 = 0, index2 = 0, index = 0; + float max_diff = 0.0001f; + for (size_t cycle = 0lu; cycle < p.concat.axis; cycle ++) { + for (int i1 = 0; i1 < len1; i1++) { + if (fabs(src1_ptr[index1] - dst_ptr[index]) > max_diff) + { + FAIL() << "index: " << index << " src: " << src1_ptr[index1] << ", dst: " << dst_ptr[index]; + } + index1++; index++; + } + for (int i2 = 0; i2 < len2; i2++) { + if (fabs(src2_ptr[index2] - dst_ptr[index]) > max_diff) + { + FAIL() << "index: " << index << " src: " << src2_ptr[index2] << ", dst: " << dst_ptr[index]; + } + index2++; index++; + } + } + + } catch (const InferenceEngine::details::InferenceEngineException &e) { + FAIL() << e.what(); + } + } +}; + +TEST_P(MKLDNNDeconvConcatTests, TestsDwConvFusing) {} + +INSTANTIATE_TEST_CASE_P( + TestsDwConvFusing, MKLDNNDeconvConcatTests, + ::testing::Values( + deconv_concat_params{{1, 256, 4, 4}, + { {1, 1}, {1, 1}, {0, 0}, {0, 0}, {1, 1}, "", 1, 256, false }, + {1}, {MKLDNNPlugin::impl_desc_type::gemm_blas}}, + deconv_concat_params{{2, 256, 4, 4}, + { {1, 1}, {1, 1}, {0, 0}, {0, 0}, {1, 1}, "", 1, 256, false }, + {1}, {MKLDNNPlugin::impl_desc_type::gemm_blas}}, + deconv_concat_params{{1, 256, 4, 4, 4}, + { {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}, {1, 1, 1}, "", 1, 256, false }, + {1}, {MKLDNNPlugin::impl_desc_type::gemm_blas}}, + deconv_concat_params{{2, 256, 4, 4, 4}, + { {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}, {1, 1, 1}, "", 1, 256, false }, + {1}, {MKLDNNPlugin::impl_desc_type::gemm_blas}} + )); diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_dw_conv_fusing_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_dw_conv_fusing_test.cpp index bc653a1..9078a77 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_dw_conv_fusing_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_dw_conv_fusing_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel 
Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_optimization_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_optimization_test.cpp index 72c0c8e..700bf7a 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_optimization_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_optimization_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_structure_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_structure_test.cpp index 52bcb45..363febb 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_structure_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_structure_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -11,6 +11,7 @@ #include "tests_common.hpp" #include "../test_graph.hpp" #include +#include using namespace ::testing; using namespace std; @@ -3001,24 +3002,16 @@ TEST_F(MKLDNNGraphStructureTests, TestNoRedundantReordersRmnet_SSSSD) { MKLDNNGraphTestClass graph; graph.CreateGraph(net_reader.getNetwork()); - // TODO: WA for ttps://jira01.devtools.intel.com/browse/CVS-10715 - bool isAvx512 = false; - size_t reorders_num = 0; auto& nodes = graph.getNodes(); for (auto &node : nodes) { if (node->getType() == MKLDNNPlugin::Reorder) { reorders_num++; - if (!isAvx512 && node->getChildEdgeAt(0)->getMemory().GetFormat() == memory::nChw16c) - isAvx512 = true; - if (!isAvx512) ASSERT_EQ(MKLDNNPlugin::Output, node->getChildEdgeAt(0)->getChild()->getType()); } } - if (!isAvx512) - ASSERT_EQ(reorders_num, 1); - else - ASSERT_EQ(reorders_num, 3); + + ASSERT_EQ(reorders_num, 1); } TEST_F(MKLDNNGraphStructureTests, TestFailedPartDPN92) { @@ -3806,7 +3799,6 @@ TEST_F(MKLDNNGraphStructureTests, TestNoRedundantReordersForXceptionTopology) { net_reader.SetWeights(weights_ptr); - MKLDNNGraphTestClass graph; graph.CreateGraph(net_reader.getNetwork()); @@ -6391,18 +6383,18 @@ TEST_F(MKLDNNGraphStructureTests, TestCreateGraphWithMultipleData) { const auto& nodes = graph.getNodes(); ASSERT_EQ(nodes.size(), 12); - ASSERT_EQ(nodes[0].get()->getType(), MKLDNNPlugin::Type::Input); - ASSERT_EQ(nodes[1].get()->getType(), MKLDNNPlugin::Type::Split); - ASSERT_EQ(nodes[2].get()->getType(), MKLDNNPlugin::Type::Reorder); - ASSERT_EQ(nodes[3].get()->getType(), MKLDNNPlugin::Type::Reshape); - ASSERT_EQ(nodes[4].get()->getType(), MKLDNNPlugin::Type::Output); - ASSERT_EQ(nodes[5].get()->getType(), MKLDNNPlugin::Type::Reorder); - ASSERT_EQ(nodes[6].get()->getType(), MKLDNNPlugin::Type::Reshape); - ASSERT_EQ(nodes[7].get()->getType(), MKLDNNPlugin::Type::Output); - ASSERT_EQ(nodes[8].get()->getType(), MKLDNNPlugin::Type::Reorder); - ASSERT_EQ(nodes[9].get()->getType(), MKLDNNPlugin::Type::Output); - ASSERT_EQ(nodes[10].get()->getType(), MKLDNNPlugin::Type::Reshape); - ASSERT_EQ(nodes[11].get()->getType(), MKLDNNPlugin::Type::Output); + ASSERT_EQ(nodes[0]->getType(), MKLDNNPlugin::Type::Input); + ASSERT_EQ(nodes[1]->getType(), MKLDNNPlugin::Type::Split); + ASSERT_EQ(nodes[2]->getType(), MKLDNNPlugin::Type::Reorder); + ASSERT_EQ(nodes[3]->getType(), MKLDNNPlugin::Type::Reshape); + ASSERT_EQ(nodes[4]->getType(), 
MKLDNNPlugin::Type::Output); + ASSERT_EQ(nodes[5]->getType(), MKLDNNPlugin::Type::Reorder); + ASSERT_EQ(nodes[6]->getType(), MKLDNNPlugin::Type::Reshape); + ASSERT_EQ(nodes[7]->getType(), MKLDNNPlugin::Type::Output); + ASSERT_EQ(nodes[8]->getType(), MKLDNNPlugin::Type::Reorder); + ASSERT_EQ(nodes[9]->getType(), MKLDNNPlugin::Type::Reshape); + ASSERT_EQ(nodes[10]->getType(), MKLDNNPlugin::Type::Output); + ASSERT_EQ(nodes[11]->getType(), MKLDNNPlugin::Type::Output); InferenceEngine::OutputsDataMap outputs = reader.getNetwork().getOutputsInfo(); std::vector> outputItems = { @@ -6451,3 +6443,297 @@ TEST_F(MKLDNNGraphStructureTests, TestCreateGraphWithMultipleData) { compare(*outputBlobs[i], *expectedOutputBlobs[i]); } } + +TEST_F(MKLDNNGraphStructureTests, TestCreateGraphWithMultipleData_2) { + std::string model = R"V0G0N( + + + + + + 1 + 2 + 8 + 8 + + + + + + + + 1 + 2 + 8 + 8 + + + + + 1 + 1 + 8 + 8 + + + 1 + 1 + 8 + 8 + + + + + + + + 1 + 1 + 8 + 8 + + + + + 1 + 1 + 8 + 8 + + + + + + + + + +)V0G0N"; + using namespace InferenceEngine; + + const size_t H = 8; + const size_t W = 8; + const size_t imgSz = H * W; + const float channel1Value = 1.0; + const float channel2Value = 2.0; + + const auto weights = std::make_shared>(Precision::U8, Layout::C, SizeVector{0}); + + InferenceEngine::CNNNetReader reader; + reader.ReadNetwork(model.data(), model.size()); + reader.SetWeights(weights); + + auto net = reader.getNetwork(); + net.addOutput("split", 0); + + MKLDNNGraphTestClass graph; + graph.CreateGraph(net); + + auto inBlob = make_shared_blob(Precision::FP32, SizeVector{1, 2, H, W}); + auto outBlob1 = make_shared_blob(Precision::FP32, SizeVector{1, 1, H, W}); + auto outBlob2 = make_shared_blob(Precision::FP32, SizeVector{1, 1, H, W}); + auto outBlob3 = make_shared_blob(Precision::FP32, SizeVector{1, 1, H, W}); + + inBlob->allocate(); + outBlob1->allocate(); + outBlob2->allocate(); + outBlob3->allocate(); + + auto in_ptr = inBlob->buffer().as(); + for (int i = 0; i < imgSz; i++) { + in_ptr[i] = channel1Value; + in_ptr[i + imgSz] = channel2Value; + } + + BlobMap inputBlobMap = { {"data" , inBlob } }, + outputBlobMap = { {"split.0", outBlob1}, + {"split.1", outBlob2}, + {"power" , outBlob3} }; + + graph.Infer(inputBlobMap, outputBlobMap); + + auto out_check = [] ( Blob::Ptr blob, float val) { + auto size = blob->size(); + auto ptr = blob->buffer().as(); + bool res = true; + for (int i = 0; i < size; i++) + res &= ( std::abs( ptr[i] - val ) < 0.00001f ); + return res; + }; + + EXPECT_TRUE(out_check(outBlob1, 1)); + EXPECT_TRUE(out_check(outBlob2, 2)); + EXPECT_TRUE(out_check(outBlob3, -1)); +} + +TEST_F(MKLDNNGraphStructureTests, TestCreateGraphAllDataToConcat) { + using namespace InferenceEngine; + // Build the network. 
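+    // The graph assembled below is input -> two 1x1 single-channel convolutions
+    // (scalar weights 2 and 3) -> concat over axis 1, with the raw input as the
+    // first concat branch. For an input ramp 0..19 the three output channels
+    // should therefore read i, 2*i and 3*i, which is exactly the refDst vector
+    // compared at the end of the test.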
+ Builder::Network netBuilder(""); + + // First input layer + idx_t inpId = netBuilder.addLayer(InferenceEngine::Builder::InputLayer("input").setPort(InferenceEngine::Port({1, 1, 4, 5}))); + + std::vector weightsSize = {1, 1, 1, 1}; // OIHW + auto weights = make_shared_blob(Precision::FP32, InferenceEngine::Layout::OIHW, weightsSize); + weights->allocate(); + + std::vector twos(1, 2); + weights->set(twos); + idx_t weightsId = netBuilder.addLayer({}, Builder::ConstLayer("weights").setData(weights)); + + // Convolution layer + idx_t firstConvId = netBuilder.addLayer({{inpId}, {weightsId}}, Builder::ConvolutionLayer("conv").setKernel({1, 1}) + .setStrides({1, 1}).setDilation({1, 1}).setPaddingsBegin({0, 0}).setPaddingsEnd({0, 0}).setGroup(1).setOutDepth(1)); + + weights = make_shared_blob(Precision::FP32, InferenceEngine::Layout::OIHW, weightsSize); + weights->allocate(); + + std::vector threes(1, 3); + weights->set(threes); + + weightsId = netBuilder.addLayer({}, Builder::ConstLayer("weights").setData(weights)); + // Convolution layer + idx_t secondConvId = netBuilder.addLayer({{inpId}, {weightsId}}, Builder::ConvolutionLayer("conv").setKernel({1, 1}) + .setStrides({1, 1}).setDilation({1, 1}).setPaddingsBegin({0, 0}).setPaddingsEnd({0, 0}).setGroup(1).setOutDepth(1)); + + // Concat layer + idx_t concatId = netBuilder.addLayer({{inpId}, {firstConvId}, {secondConvId}}, + InferenceEngine::Builder::ConcatLayer("concat").setAxis(1).setInputPorts(std::vector(3))); + + // Output layer + InferenceEngine::Builder::OutputLayer outLayer("output"); + netBuilder.addLayer({concatId}, outLayer); + + auto cnn = CNNNetwork(Builder::convertToICNNNetwork(netBuilder.build())); + + // Load the network + std::vector inpSize = {5, 4, 1, 1}; + std::vector outSize = {5, 4, 3, 1}; + + InferenceEngine::BlobMap inputBlobs; + InferenceEngine::BlobMap outputBlobs; + + std::vector inpData(4*5, 1); + std::vector outData(3*4*5, 1); + for (int i = 0; i < 4*5; ++i) + { + inpData[i] = i; + } + + inputBlobs["input"] = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, inpSize, &inpData[0]); + outputBlobs["concat"] = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, outSize, &outData[0]); + + + MKLDNNGraphTestClass graph; + graph.CreateGraph(cnn); + graph.Infer(inputBlobs, outputBlobs); + + std::vector refDst = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, + 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57}; + + InferenceEngine::TBlob::Ptr dstOut = InferenceEngine::make_shared_blob(outputBlobs["concat"]->getTensorDesc(), refDst.data()); + + compare(*outputBlobs["concat"], *dstOut); +} + +TEST_F(MKLDNNGraphStructureTests, TestCreateGraphAllDataFromInputToConcat) { + using namespace InferenceEngine; + // Build the network. 
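+    // Unlike the previous test there are no intermediate layers: the same input
+    // port feeds all three concat branches, so the expected output is simply the
+    // 0..19 ramp repeated three times along axis 1.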
+ Builder::Network netBuilder(""); + + // First input layer + idx_t inpId = netBuilder.addLayer(InferenceEngine::Builder::InputLayer("input").setPort(InferenceEngine::Port({1, 1, 4, 5}))); + + // Concat layer + idx_t concatId = netBuilder.addLayer({{inpId}, {inpId}, {inpId}}, + InferenceEngine::Builder::ConcatLayer("concat").setAxis(1).setInputPorts(std::vector(3))); + + // Output layer + InferenceEngine::Builder::OutputLayer outLayer("output"); + netBuilder.addLayer({concatId}, outLayer); + + auto cnn = CNNNetwork(Builder::convertToICNNNetwork(netBuilder.build())); + + // Load the network + std::vector inpSize = {5, 4, 1, 1}; + std::vector outSize = {5, 4, 3, 1}; + + InferenceEngine::BlobMap inputBlobs; + InferenceEngine::BlobMap outputBlobs; + + std::vector inpData(4*5, 1); + std::vector outData(3*4*5, 1); + for (int i = 0; i < 4*5; ++i) + { + inpData[i] = i; + } + + inputBlobs["input"] = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, inpSize, &inpData[0]); + outputBlobs["concat"] = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, outSize, &outData[0]); + + + MKLDNNGraphTestClass graph; + graph.CreateGraph(cnn); + graph.Infer(inputBlobs, outputBlobs); + + std::vector refDst = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,}; + + InferenceEngine::TBlob::Ptr dstOut = InferenceEngine::make_shared_blob(outputBlobs["concat"]->getTensorDesc(), refDst.data()); + + compare(*outputBlobs["concat"], *dstOut); +} + + +TEST_F(MKLDNNGraphStructureTests, TestCheckIncorrectScaleShift) { + std::string model = R"V0G0N( + + + + + + 1 + 1000 + 16 + + + + + + + 1 + 1000 + 16 + + + + + 1 + 100 + 16 + + + + + + + + + + + + +)V0G0N"; + using namespace InferenceEngine; + const auto weights = std::make_shared>(Precision::U8, Layout::C, SizeVector{64}); + + InferenceEngine::CNNNetReader reader; + reader.ReadNetwork(model.data(), model.size()); + reader.SetWeights(weights); + + MKLDNNGraphTestClass graph; + ASSERT_THROW(graph.CreateGraph(reader.getNetwork()), InferenceEngine::details::InferenceEngineException); +} \ No newline at end of file diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/test_graph.hpp b/inference-engine/tests/unit/engines/mkldnn/graph/test_graph.hpp index b0d7bfb..e6ca63e 100644 --- a/inference-engine/tests/unit/engines/mkldnn/graph/test_graph.hpp +++ b/inference-engine/tests/unit/engines/mkldnn/graph/test_graph.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -78,7 +78,7 @@ public: // todo: make sure 'name' exists in this map... 
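// Threading the layout into Subtract (below) matters once inputs may arrive as
// NHWC: the per-channel mean can no longer be applied plane by plane, so
// Subtract presumably needs the layout to index the interleaved data correctly.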
if (_meanImages.find(name) != _meanImages.end()) { if (in->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32) { - _meanImages[name].Subtract(outDims, reinterpret_cast(inter_data_ptr)); + _meanImages[name].Subtract(outDims, reinterpret_cast(inter_data_ptr), in->getTensorDesc().getLayout()); } else { THROW_IE_EXCEPTION << "Mean image of type " << in->getTensorDesc().getPrecision().name() << " is unsupported"; } @@ -89,13 +89,6 @@ public: } void Infer(const InferenceEngine::BlobMap& inputs, InferenceEngine::BlobMap& result, int batch = -1) { - for (auto it = result.begin(); it != result.end(); it++) { - InferenceEngine::TBlob *out = dynamic_cast *>((*it).second.get()); - if (out == nullptr) { - FAIL() << "Output data precision not supported. Expected float."; - } - } - try { // need to retain converted blobs until infer finish std::vector convertedInputs; diff --git a/inference-engine/tests/unit/engines/mkldnn/mkldnn_primitive_test.cpp b/inference-engine/tests/unit/engines/mkldnn/mkldnn_primitive_test.cpp index fd517de..518f0d6 100644 --- a/inference-engine/tests/unit/engines/mkldnn/mkldnn_primitive_test.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/mkldnn_primitive_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/engines/mkldnn/test_layers.cpp b/inference-engine/tests/unit/engines/mkldnn/test_layers.cpp index 7db4174..38164f8 100644 --- a/inference-engine/tests/unit/engines/mkldnn/test_layers.cpp +++ b/inference-engine/tests/unit/engines/mkldnn/test_layers.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/graph_tools/graph_copy_tests.cpp b/inference-engine/tests/unit/graph_tools/graph_copy_tests.cpp index 2971eb7..f3498f3 100644 --- a/inference-engine/tests/unit/graph_tools/graph_copy_tests.cpp +++ b/inference-engine/tests/unit/graph_tools/graph_copy_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -359,4 +359,4 @@ TEST(CNNSpecificGraphCopyTests, copyNetworkWithDeconvolution) { auto layer = std::dynamic_pointer_cast(copied_net.getLayerByName("upsample_merged")); ASSERT_NE(layer, nullptr) << "Could not perform dynamic cast from base pointer to Deconvolution layer pointer. 
" "Net copy could be incorrect."; -} \ No newline at end of file +} diff --git a/inference-engine/tests/unit/graph_tools/graph_test_base.hpp b/inference-engine/tests/unit/graph_tools/graph_test_base.hpp index 94c0876..79a1f4a 100644 --- a/inference-engine/tests/unit/graph_tools/graph_test_base.hpp +++ b/inference-engine/tests/unit/graph_tools/graph_test_base.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -73,8 +73,27 @@ class GraphTestsBase : public ::testing::Test { } return nullptr; } + + + #define ASSERT_N_CONNECTIONS(a, b, n) \ + ASSERT_EQ(countForwardConnections(#a, #b), n);\ + ASSERT_EQ(countBackwardConnections(#a, #b), n); + #define ASSERT_CONNECTION(a, b) \ - ASSERT_TRUE(assertConnection(#a, #b)); + ASSERT_N_CONNECTIONS(a,b,1); + + #define ASSERT_2_CONNECTIONS(a, b) \ + ASSERT_N_CONNECTIONS(a,b,2); + + #define ASSERT_3_CONNECTIONS(a, b) \ + ASSERT_N_CONNECTIONS(a,b,3); + + /** + * @brief check connection without direction + */ + #define ASSERT_NO_CONNECTION(a, b) \ + ASSERT_EQ(countConnections(#a, #b), 0);\ + ASSERT_EQ(countConnections(#b, #a), 0);\ void ASSERT_DIMS(int x, const SizeVector & dims) { @@ -84,30 +103,51 @@ class GraphTestsBase : public ::testing::Test { } } - bool assertConnection(std::string a, std::string b) { + int countForwardConnections(std::string a, std::string b) { + long int nForward = 0; + CNNLayerPtr layerExist; + try { + layerExist = wrap.getLayerByName(a.c_str()); + if (!layerExist) { + return 0; + } + } catch(...) { + return 0; + } - bool bForward = false; - for (auto && outData : wrap.getLayerByName(a.c_str())->outData) { + for (auto && outData : layerExist->outData) { auto &inputMap = outData->inputTo; - auto i = - std::find_if(inputMap.begin(), inputMap.end(), [&](std::map::value_type &vt) { + nForward += + std::count_if(inputMap.begin(), inputMap.end(), [&](std::map::value_type &vt) { return vt.second->name == b; }); - if (i != inputMap.end()) { - bForward = true; - break; - } } - if (!bForward) { - return false; + + return nForward; + } + + int countBackwardConnections(std::string a, std::string b) { + CNNLayerPtr layerExist; + try { + layerExist = wrap.getLayerByName(b.c_str()); + if (!layerExist) { + return 0; + } + } catch(...) { + return 0; } - auto prevData = wrap.getLayerByName(b.c_str())->insData; + auto prevData = layerExist->insData; - auto j = std::find_if(prevData.begin(), prevData.end(), [&](DataWeakPtr wp) { + auto nBackward = std::count_if(prevData.begin(), prevData.end(), [&](DataWeakPtr wp) { return wp.lock()->getCreatorLayer().lock()->name == a; }); - return j != prevData.end(); + + return nBackward; + } + + int countConnections(std::string a, std::string b) { + return countForwardConnections(a, b) + countBackwardConnections(a, b); } int numCreated = 0; @@ -189,6 +229,17 @@ class GraphTestsBase : public ::testing::Test { } } + void TearDown() override { + // Reset shared_pointer circular dependencies to mitigate memory leaks. 
+ for (auto& items : datas) { + for (auto& data : items) { + for (auto& input : data->getInputTo()) { + input.second.reset(); + } + } + } + } + int ID(const CNNLayerPtr &ptr) { for (int i = 0; i < layers.size(); i++) { if (layers[i].get() == ptr.get()) diff --git a/inference-engine/tests/unit/graph_tools/graph_tools_test.cpp b/inference-engine/tests/unit/graph_tools/graph_tools_test.cpp index 94e9c51..8c15ffd 100644 --- a/inference-engine/tests/unit/graph_tools/graph_tools_test.cpp +++ b/inference-engine/tests/unit/graph_tools/graph_tools_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -701,7 +701,7 @@ TEST_F(GraphToolsTest, CNNNetworkInsertAllAfterSplit) { CNNNetworkInsertLayer(wrap.getLayerByName("1"), nullptr, createGenericLayer("5")); - ASSERT_CONNECTION(1, 5); + ASSERT_2_CONNECTIONS(1, 5); ASSERT_CONNECTION(5, 2); ASSERT_CONNECTION(5, 3); } @@ -729,6 +729,156 @@ TEST_F(GraphToolsTest, CNNNetworkInsert1AfterSplit) { ASSERT_CONNECTION(5, 4); } +TEST_F(GraphToolsTest, CNNNetworkRemoveNullPointerLayer) { + + CONNECT_FROM_PORT(1, 0, 2); + CONNECT_FROM_PORT(1, 1, 3); + CONNECT_FROM_PORT(1, 2, 4); + + EXPECT_CALL(mockNet, getInputsInfo(_)).WillRepeatedly(WithArg<0>(Invoke([&](InputsDataMap & maps){ + prepareInputs(maps); + }))); + + EXPECT_CALL(mockNet, getLayerByName(_,_,_)).WillRepeatedly(WithArgs<0, 1>(Invoke([&](const char* name, InferenceEngine::CNNLayerPtr& l){ + l = layerByName(name); + return l== nullptr ? GENERAL_ERROR : OK; + }))); + + ASSERT_ANY_THROW(CNNNetworkRemoveLayer(nullptr)); +} + +TEST_F(GraphToolsTest, CNNNetworkRemoveInputOrOutputLayer) { + + CONNECT_FROM_PORT(1, 0, 2); + CONNECT_FROM_PORT(2, 0, 3); + CONNECT_FROM_PORT(1, 0, 3); + + EXPECT_CALL(mockNet, getInputsInfo(_)).WillRepeatedly(WithArg<0>(Invoke([&](InputsDataMap & maps){ + prepareInputs(maps); + }))); + + EXPECT_CALL(mockNet, getLayerByName(_,_,_)).WillRepeatedly(WithArgs<0, 1>(Invoke([&](const char* name, InferenceEngine::CNNLayerPtr& l){ + l = layerByName(name); + return l== nullptr ? GENERAL_ERROR : OK; + }))); + + ASSERT_ANY_THROW(CNNNetworkRemoveLayer(wrap.getLayerByName("1"))); + ASSERT_ANY_THROW(CNNNetworkRemoveLayer(wrap.getLayerByName("3"))); +} + +TEST_F(GraphToolsTest, CNNNetworkRemoveLayerThaHas2Outputs) { + + CONNECT_FROM_PORT(1, 0, 2); + CONNECT_FROM_PORT(2, 0, 3); + CONNECT_FROM_PORT(2, 0, 4); + CONNECT_FROM_PORT(1, 0, 3); + CONNECT_FROM_PORT(5, 0, 4); + + EXPECT_CALL(mockNet, getInputsInfo(_)).WillRepeatedly(WithArg<0>(Invoke([&](InputsDataMap & maps){ + prepareInputs(maps); + }))); + + EXPECT_CALL(mockNet, getLayerByName(_,_,_)).WillRepeatedly(WithArgs<0, 1>(Invoke([&](const char* name, InferenceEngine::CNNLayerPtr& l){ + l = layerByName(name); + return l== nullptr ? 
GENERAL_ERROR : OK; + }))); + + CNNNetworkRemoveLayer(wrap.getLayerByName("2")); + + ASSERT_2_CONNECTIONS(1, 3); + ASSERT_CONNECTION(1, 4); + ASSERT_CONNECTION(5, 4); + + // means all remained references removed + ASSERT_NO_CONNECTION(1, 2); + ASSERT_NO_CONNECTION(2, 2); + ASSERT_NO_CONNECTION(3, 2); + ASSERT_NO_CONNECTION(4, 2); +} + +TEST_F(GraphToolsTest, CNNNetworkRemoveLayerSplit) { + + CONNECT_FROM_PORT(1, 0, 2); + CONNECT_FROM_PORT(1, 1, 3); + CONNECT_FROM_PORT(2, 0, 3); + + EXPECT_CALL(mockNet, getInputsInfo(_)).WillRepeatedly(WithArg<0>(Invoke([&](InputsDataMap & maps){ + prepareInputs(maps); + }))); + + EXPECT_CALL(mockNet, getLayerByName(_,_,_)).WillRepeatedly(WithArgs<0, 1>(Invoke([&](const char* name, InferenceEngine::CNNLayerPtr& l){ + l = layerByName(name); + return l== nullptr ? GENERAL_ERROR : OK; + }))); + + CNNNetworkRemoveLayer(wrap.getLayerByName("2")); + + ASSERT_2_CONNECTIONS(1, 3); + // means all remained references removed + ASSERT_NO_CONNECTION(1, 2); + ASSERT_NO_CONNECTION(2, 2); + ASSERT_NO_CONNECTION(3, 2); +} + +TEST_F(GraphToolsTest, CNNNetworkRemoveLayerSplit2) { + + CONNECT_FROM_PORT(1, 0, 2); + CONNECT_FROM_PORT(1, 0, 3); + CONNECT_FROM_PORT(1, 0, 4); + CONNECT_FROM_PORT(1, 1, 4); + CONNECT_FROM_PORT(1, 2, 5); + + CONNECT_FROM_PORT(2, 0, 3); + CONNECT_FROM_PORT(2, 0, 4); + CONNECT_FROM_PORT(2, 0, 5); + + EXPECT_CALL(mockNet, getInputsInfo(_)).WillRepeatedly(WithArg<0>(Invoke([&](InputsDataMap & maps){ + prepareInputs(maps); + }))); + + EXPECT_CALL(mockNet, getLayerByName(_,_,_)).WillRepeatedly(WithArgs<0, 1>(Invoke([&](const char* name, InferenceEngine::CNNLayerPtr& l){ + l = layerByName(name); + return l== nullptr ? GENERAL_ERROR : OK; + }))); + + CNNNetworkRemoveLayer(wrap.getLayerByName("2")); + + ASSERT_2_CONNECTIONS(1, 3); + ASSERT_3_CONNECTIONS(1, 4); + ASSERT_2_CONNECTIONS(1, 5); + + // means all remained references removed + ASSERT_NO_CONNECTION(1, 2); + ASSERT_NO_CONNECTION(2, 2); + ASSERT_NO_CONNECTION(3, 2); + ASSERT_NO_CONNECTION(4, 2); + ASSERT_NO_CONNECTION(5, 2); +} + +TEST_F(GraphToolsTest, CNNNetworkRemoveSimpleLayer) { + + CONNECT_FROM_PORT(1, 0, 2); + CONNECT_FROM_PORT(2, 0, 3); + + EXPECT_CALL(mockNet, getInputsInfo(_)).WillRepeatedly(WithArg<0>(Invoke([&](InputsDataMap & maps){ + prepareInputs(maps); + }))); + + EXPECT_CALL(mockNet, getLayerByName(_,_,_)).WillRepeatedly(WithArgs<0, 1>(Invoke([&](const char* name, InferenceEngine::CNNLayerPtr& l){ + l = layerByName(name); + return l== nullptr ? 
GENERAL_ERROR : OK; + }))); + + CNNNetworkRemoveLayer(wrap.getLayerByName("2")); + + ASSERT_CONNECTION(1, 3); + + // means all remained references removed + ASSERT_NO_CONNECTION(1, 2); + ASSERT_NO_CONNECTION(2, 2); + ASSERT_NO_CONNECTION(3, 2); +} + //TEST_F(GraphToolsTest, CNNNetworkInsertLayerBeforeAll) { // CONNECT(1, 2); diff --git a/inference-engine/tests/unit/inference_engine_tests/alocator_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/alocator_tests.cpp index 178f116..5ae1fb5 100644 --- a/inference-engine/tests/unit/inference_engine_tests/alocator_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/alocator_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -25,7 +25,9 @@ public: }; TEST_F(SystemAllocatorTests, canAllocate) { - EXPECT_NO_THROW(allocator->alloc(100)); + void* handle = allocator->alloc(100); + EXPECT_NE(nullptr, handle); + allocator->free(handle); } TEST_F(SystemAllocatorTests, canLockAllocatedMemory) { @@ -34,4 +36,6 @@ TEST_F(SystemAllocatorTests, canLockAllocatedMemory) { char * ptr = (char *)allocator->lock(handle); ptr [9999] = 11; ASSERT_EQ(ptr[9999], 11); + allocator->unlock(ptr); + allocator->free(handle); } diff --git a/inference-engine/tests/unit/inference_engine_tests/blob_proxy_test.cpp b/inference-engine/tests/unit/inference_engine_tests/blob_proxy_test.cpp index 9de222c..cbe0914 100644 --- a/inference-engine/tests/unit/inference_engine_tests/blob_proxy_test.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/blob_proxy_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/blob_test.cpp b/inference-engine/tests/unit/inference_engine_tests/blob_test.cpp index e104c4c..8dbaf4d 100644 --- a/inference-engine/tests/unit/inference_engine_tests/blob_test.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/blob_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -311,6 +311,11 @@ TEST_F(BlobTests, canMakeSharedBlob) { ASSERT_EQ(blob3->size(), 0); } +TEST_F(BlobTests, cannotCreateBlobWithIncorrectPrecision) { + InferenceEngine::TensorDesc desc(InferenceEngine::Precision::FP16, {1, 3, 227, 227}, Layout::NCHW); + ASSERT_THROW(InferenceEngine::make_shared_blob(desc), InferenceEngine::details::InferenceEngineException); +} + TEST_F(BlobTests, canUseBlobInMoveSemantics) { TBlob b(Precision::FP32, C); diff --git a/inference-engine/tests/unit/inference_engine_tests/caslesseq_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/caslesseq_tests.cpp index b0c23e5..ec88bbe 100644 --- a/inference-engine/tests/unit/inference_engine_tests/caslesseq_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/caslesseq_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/cnn_network_test.cpp b/inference-engine/tests/unit/inference_engine_tests/cnn_network_test.cpp index 4a4b3d4..af75edd 100644 --- a/inference-engine/tests/unit/inference_engine_tests/cnn_network_test.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/cnn_network_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 
2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_base_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_base_tests.cpp index 425b062..5cb6e9a 100644 --- a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_base_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_base_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_tests.cpp index b1c9368..8f2baa6 100644 --- a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_thread_safe_default_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_thread_safe_default_tests.cpp index 594ee19..0115944 100644 --- a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_thread_safe_default_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_thread_safe_default_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_thread_safe_internal.cpp b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_thread_safe_internal.cpp index 49cdadc..09e3aa9 100644 --- a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_thread_safe_internal.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_thread_safe_internal.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/callback_manager_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/callback_manager_tests.cpp index 7d10137..06991e4 100644 --- a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/callback_manager_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/callback_manager_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_base_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_base_tests.cpp index f4c472e..815f9b8 100644 --- a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_base_tests.cpp +++ 
b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_base_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_tests.cpp index 399ec7a..19a77fc 100644 --- a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_thread_safe_async_only_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_thread_safe_async_only_tests.cpp index 2542017..d138cc7 100644 --- a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_thread_safe_async_only_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_thread_safe_async_only_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_thread_safe_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_thread_safe_tests.cpp index 3c09801..4dd38b7 100644 --- a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_thread_safe_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_thread_safe_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executor_manager_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executor_manager_tests.cpp index 450bcd3..022cc67 100644 --- a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executor_manager_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executor_manager_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/iinference_plugin_internal_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/iinference_plugin_internal_tests.cpp index a76857b..5a4ae54 100644 --- a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/iinference_plugin_internal_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/iinference_plugin_internal_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -43,7 +43,7 @@ protected: virtual void SetUp() { mock_plugin_impl.reset(new MockInferencePluginInternal()); - plugin = details::shared_from_irelease(make_ie_compatible_plugin({1, 2, "test", "version"}, mock_plugin_impl)); + plugin 
= details::shared_from_irelease(make_ie_compatible_plugin({1, 6, "test", "version"}, mock_plugin_impl)); mockExeNetworkInternal = make_shared(); } @@ -183,7 +183,7 @@ protected: virtual void SetUp() { mockPluginImpl = make_shared(); - plugin = details::shared_from_irelease(make_ie_compatible_plugin({1, 2, "test", "version"}, mockPluginImpl)); + plugin = details::shared_from_irelease(make_ie_compatible_plugin({1, 6, "test", "version"}, mockPluginImpl)); mockExeNetwork = make_shared(); } diff --git a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/memory_state_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/memory_state_tests.cpp index 799f0bd..8f4426b 100644 --- a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/memory_state_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/memory_state_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/plugin_base_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/plugin_base_tests.cpp index 3df6a60..6a1004c 100644 --- a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/plugin_base_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/plugin_base_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -22,7 +22,7 @@ class PluginBaseTests: public ::testing::Test { } virtual void SetUp() { mock_impl.reset(new MockPluginImpl()); - plugin = details::shared_from_irelease(make_ie_compatible_plugin({1,2,"test", "version"}, mock_impl)); + plugin = details::shared_from_irelease(make_ie_compatible_plugin({1,6,"test", "version"}, mock_impl)); } }; @@ -33,7 +33,7 @@ TEST_F(PluginBaseTests, canReportVersion) { EXPECT_STREQ(V->buildNumber, "test"); EXPECT_STREQ(V->description, "version"); EXPECT_EQ(V->apiVersion.major, 1); - EXPECT_EQ(V->apiVersion.minor, 2); + EXPECT_EQ(V->apiVersion.minor, 6); } diff --git a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_common_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_common_tests.cpp index e0918ab..00fa876 100644 --- a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_common_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_common_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_executor_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_executor_tests.cpp index 0cbc516..31aaaab 100644 --- a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_executor_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_executor_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_synchronizer_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_synchronizer_tests.cpp index 47b1ef2..9564ff0 100644 --- 
a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_synchronizer_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_synchronizer_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_tests.cpp index 792e134..94f4910 100644 --- a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_tests_utils.hpp b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_tests_utils.hpp index 5f4238f..9f6e3cc 100644 --- a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_tests_utils.hpp +++ b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_tests_utils.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_with_stages_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_with_stages_tests.cpp index 6f665e6..1f8c445 100644 --- a/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_with_stages_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_with_stages_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/data_test.cpp b/inference-engine/tests/unit/inference_engine_tests/data_test.cpp index 8839861..3d3cea1 100644 --- a/inference-engine/tests/unit/inference_engine_tests/data_test.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/data_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/debug_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/debug_tests.cpp index 5acaeab..6d28b0d 100644 --- a/inference-engine/tests/unit/inference_engine_tests/debug_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/debug_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/device_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/device_tests.cpp index c83d89a..368e8fd 100644 --- a/inference-engine/tests/unit/inference_engine_tests/device_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/device_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -61,7 +61,8 @@ TEST_F(DeviceTests, returnsProperDeviceName) { ASSERT_STREQ(getDeviceName(TargetDevice::eMYRIAD), "MYRIAD"); 
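// The hunk below registers the new KMB device name and moves the "off by one"
// unknown-device probe from one-past-eHETERO to one-past-eKMB, so the guard keeps
// pointing past the last enum value. A minimal sketch of the mapping these asserts
// pin down (reconstructed from the expectations below, not the real Inference
// Engine implementation):
//
//     const char* getDeviceName(TargetDevice device) {
//         switch (device) {
//             case TargetDevice::eMYRIAD: return "MYRIAD";
//             case TargetDevice::eGNA:    return "GNA";
//             case TargetDevice::eHETERO: return "HETERO";
//             case TargetDevice::eKMB:    return "KMB";
//             default:                    return "Unknown device";
//         }
//     }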
ASSERT_STREQ(getDeviceName(TargetDevice::eGNA), "GNA");
     ASSERT_STREQ(getDeviceName(TargetDevice::eHETERO), "HETERO");
+    ASSERT_STREQ(getDeviceName(TargetDevice::eKMB), "KMB");
     ASSERT_STREQ(getDeviceName(static_cast<TargetDevice>(-1)), "Unknown device");
     //off by one test - might not be enough
-    ASSERT_STREQ(getDeviceName(static_cast<TargetDevice>((uint8_t)TargetDevice::eHETERO + 1)), "Unknown device");
+    ASSERT_STREQ(getDeviceName(static_cast<TargetDevice>((uint8_t)TargetDevice::eKMB + 1)), "Unknown device");
 }
diff --git a/inference-engine/tests/unit/inference_engine_tests/exception_test.cpp b/inference-engine/tests/unit/inference_engine_tests/exception_test.cpp
index fc93d48..0ebe0ce 100644
--- a/inference-engine/tests/unit/inference_engine_tests/exception_test.cpp
+++ b/inference-engine/tests/unit/inference_engine_tests/exception_test.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/unit/inference_engine_tests/inference_engine_plugin_test.cpp b/inference-engine/tests/unit/inference_engine_tests/inference_engine_plugin_test.cpp
index a23b74c..c88e9fb 100644
--- a/inference-engine/tests/unit/inference_engine_tests/inference_engine_plugin_test.cpp
+++ b/inference-engine/tests/unit/inference_engine_tests/inference_engine_plugin_test.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/unit/inference_engine_tests/inference_engine_test.cpp b/inference-engine/tests/unit/inference_engine_tests/inference_engine_test.cpp
index ab307cf..451f1ee 100644
--- a/inference-engine/tests/unit/inference_engine_tests/inference_engine_test.cpp
+++ b/inference-engine/tests/unit/inference_engine_tests/inference_engine_test.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/unit/inference_engine_tests/layer_transform_test.cpp b/inference-engine/tests/unit/inference_engine_tests/layer_transform_test.cpp
index fcb5875..229af8f 100644
--- a/inference-engine/tests/unit/inference_engine_tests/layer_transform_test.cpp
+++ b/inference-engine/tests/unit/inference_engine_tests/layer_transform_test.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/unit/inference_engine_tests/layers_test.cpp b/inference-engine/tests/unit/inference_engine_tests/layers_test.cpp
index 6d18b64..0b0409b 100644
--- a/inference-engine/tests/unit/inference_engine_tests/layers_test.cpp
+++ b/inference-engine/tests/unit/inference_engine_tests/layers_test.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/unit/inference_engine_tests/locked_memory_test.cpp b/inference-engine/tests/unit/inference_engine_tests/locked_memory_test.cpp
index 7a7ee5e..7bc0950 100644
--- a/inference-engine/tests/unit/inference_engine_tests/locked_memory_test.cpp
+++ b/inference-engine/tests/unit/inference_engine_tests/locked_memory_test.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
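The new normalization test added next exercises CNNStatisticHelper::getLatestInFuse. Judging by its assertions, the helper answers where int8 statistics should be read once conv+eltwise fusion is applied: a convolution whose output feeds the fused eltwise sum reports the eltwise as its statistics point, while the eltwise's second input keeps reporting itself. A minimal sketch of the query, using only calls that appear in the test (0x7F and 0xFF bound the signed and unsigned int8 ranges):

    InferenceEngine::details::CNNStatisticHelper statHelper(network, {}, 0x7F, 0xFF);
    // conv_1 feeds the eltwise that will be fused into it, so its statistics
    // are taken at the eltwise output; conv_2 stays its own statistics point.
    auto fusePoint = statHelper.getLatestInFuse(network.getLayerByName("conv_1"));

diff --git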
a/inference-engine/tests/unit/inference_engine_tests/normalization/latest_in_fuse_test.cpp b/inference-engine/tests/unit/inference_engine_tests/normalization/latest_in_fuse_test.cpp new file mode 100644 index 0000000..1b8da0d --- /dev/null +++ b/inference-engine/tests/unit/inference_engine_tests/normalization/latest_in_fuse_test.cpp @@ -0,0 +1,163 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +#include +#include "tests_common.hpp" +#include "ir_gen_helper.hpp" + +using namespace ::testing; +using namespace single_layer_tests; + +struct conv_conv_eltwise_params { + // Formats: NCHW, NCDHW + std::vector in; + + conv_common_params conv; + eltwise_common_params eltwise; +}; + +class NormalizationConvConvEltwiseTests: public TestsCommon, + public WithParamInterface { + std::string layers_t = R"V0G0N( + + + + + __INP_DIMS__ + + + + + __CONV_OUT_DIMS__ + + + + + + + + + + + + __INP_DIMS__ + + + + + __CONV_OUT_DIMS__ + + + + + + + + + + + + __CONV_OUT_DIMS__ + + + __CONV_OUT_DIMS__ + + + + + __CONV_OUT_DIMS__ + + + +)V0G0N"; + + std::string edges_t = R"V0G0N( + + + + +)V0G0N"; + + std::string getModel(conv_conv_eltwise_params p) { + std::string model = layers_t; + + std::string s_dims; + for (auto& dim : p.in) { + s_dims += "\n "; + s_dims += std::to_string(dim) + ""; + } + REPLACE_WITH_STR(model, "__INP_DIMS__", s_dims); + + s_dims = "\n "; + s_dims += std::to_string(p.in[0]) + ""; + s_dims += "\n "; + s_dims += std::to_string(p.conv.out_c) + ""; + int k_len = p.conv.kernel.size(); + for (size_t i = 2; i < p.in.size(); i++) { + size_t inx = k_len - i + 1; + size_t dim = (p.in[i] + 2lu * p.conv.pads_begin[inx] - p.conv.kernel[inx]) / p.conv.stride[inx] + 1lu; + s_dims += "\n "; + s_dims += std::to_string(dim) + ""; + } + REPLACE_WITH_STR(model, "__CONV_OUT_DIMS__", s_dims); + + REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_K_", p.conv.kernel); + REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_KS_", p.conv.stride); + REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_PB_", p.conv.pads_begin); + REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_PE_", p.conv.pads_end); + REPLACE_WITH_NUM(model, "_GC_", p.conv.group); + REPLACE_WITH_NUM(model, "_OC_", p.conv.out_c); + + model = IRTemplateGenerator::getIRTemplate("Deconvolution_Concat", p.in, "FP32", model, edges_t); + + return model; + } + +protected: + virtual void TearDown() { + } + + virtual void SetUp() { + try { + TestsCommon::SetUp(); + conv_conv_eltwise_params p = ::testing::WithParamInterface::GetParam(); + std::string model = getModel(p); + + InferenceEngine::CNNNetReader net_reader; + ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length())); + + auto network = net_reader.getNetwork(); + + int maxSign = 0x7F; + int maxUnsign = 0xFF; + + InferenceEngine::details::CNNStatisticHelper statHelper(network, {}, maxSign, maxUnsign); + auto conv_1 = network.getLayerByName("conv_1"); + auto conv_2 = network.getLayerByName("conv_2"); + auto eltwise = network.getLayerByName("eltwise_block"); + + ASSERT_EQ(eltwise, statHelper.getLatestInFuse(conv_1)); + ASSERT_EQ(conv_2, statHelper.getLatestInFuse(conv_2)); + ASSERT_EQ(eltwise, statHelper.getLatestInFuse(eltwise)); + } catch (const InferenceEngine::details::InferenceEngineException &e) { + FAIL() << e.what(); + } + } +}; + +TEST_P(NormalizationConvConvEltwiseTests, TestsConvConvEltwise) {} + +INSTANTIATE_TEST_CASE_P( + TestsConvConvEltwise, NormalizationConvConvEltwiseTests, + ::testing::Values( + conv_conv_eltwise_params{{1, 16, 4, 4}, + { {1, 1}, {1, 1}, 
{0, 0}, {0, 0}, {1, 1}, "", 1, 32, true },
+                                 {"sum", {}} },
+        conv_conv_eltwise_params{{1, 16, 4, 4, 4},
+                                 { {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}, {1, 1, 1}, "", 1, 32, true },
+                                 {"sum", {}} }
+        ));
diff --git a/inference-engine/tests/unit/inference_engine_tests/parameter_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/parameter_tests.cpp
new file mode 100644
index 0000000..673d5c7
--- /dev/null
+++ b/inference-engine/tests/unit/inference_engine_tests/parameter_tests.cpp
@@ -0,0 +1,292 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include
+#include
+#include
+#include
+
+using namespace InferenceEngine;
+
+class DestructorTest {
+public:
+    DestructorTest() {
+        constructorCount++;
+    }
+
+    DestructorTest(const DestructorTest& c) {
+        constructorCount++;
+    }
+
+    DestructorTest(const DestructorTest&& c) {
+        constructorCount++;
+    }
+
+    ~DestructorTest() {
+        destructorCount++;
+    }
+
+    static size_t destructorCount;
+    static size_t constructorCount;
+};
+size_t DestructorTest::destructorCount = 0;
+size_t DestructorTest::constructorCount = 0;
+
+class ParameterTests : public TestsCommon {
+public:
+    void SetUp() override {
+        TestsCommon::SetUp();
+        DestructorTest::destructorCount = 0;
+        DestructorTest::constructorCount = 0;
+    }
+};
+
+TEST_F(ParameterTests, ParameterAsInt) {
+    Parameter p = 4;
+    ASSERT_TRUE(p.is<int>());
+    int test = p;
+    ASSERT_EQ(4, test);
+}
+
+TEST_F(ParameterTests, ParameterAsUInt) {
+    Parameter p = 4u;
+    ASSERT_TRUE(p.is<unsigned int>());
+    ASSERT_FALSE(p.is<int>());
+    unsigned int test = p;
+    ASSERT_EQ(4, test);
+}
+
+TEST_F(ParameterTests, ParameterAsSize_t) {
+    size_t ref = 4;
+    Parameter p = ref;
+    ASSERT_TRUE(p.is<size_t>());
+    size_t test = p;
+    ASSERT_EQ(ref, test);
+}
+
+TEST_F(ParameterTests, ParameterAsFloat) {
+    Parameter p = 4.f;
+    ASSERT_TRUE(p.is<float>());
+    float test = p;
+    ASSERT_EQ(4.f, test);
+}
+
+TEST_F(ParameterTests, ParameterAsString) {
+    std::string ref = "test";
+    Parameter p = ref;
+    std::string test = p;
+    ASSERT_TRUE(p.is<std::string>());
+    ASSERT_EQ(ref, test);
+}
+
+TEST_F(ParameterTests, ParameterAsStringInLine) {
+    Parameter p = "test";
+    std::string test = p;
+    ASSERT_TRUE(p.is<std::string>());
+    ASSERT_EQ("test", test);
+}
+
+TEST_F(ParameterTests, IntParameterAsString) {
+    Parameter p = 4;
+    ASSERT_TRUE(p.is<int>());
+    ASSERT_FALSE(p.is<std::string>());
+    ASSERT_THROW(std::string test = p, std::bad_cast);
+    ASSERT_THROW(std::string test = p.as<std::string>(), std::bad_cast);
+}
+
+TEST_F(ParameterTests, StringParameterAsInt) {
+    Parameter p = "4";
+    ASSERT_FALSE(p.is<int>());
+    ASSERT_TRUE(p.is<std::string>());
+    ASSERT_THROW(int test = p, std::bad_cast);
+    ASSERT_THROW(int test = p.as<int>(), std::bad_cast);
+}
+
+TEST_F(ParameterTests, ParameterAsTensorDesc) {
+    TensorDesc ref(Precision::FP32, {1, 3, 2, 2}, Layout::NCHW);
+    Parameter p = ref;
+    ASSERT_TRUE(p.is<TensorDesc>());
+    TensorDesc test = p;
+    ASSERT_EQ(ref, test);
+}
+
+TEST_F(ParameterTests, ParameterAsInts) {
+    std::vector<int> ref = {1, 2, 3, 4, 5};
+    Parameter p = ref;
+    ASSERT_TRUE(p.is<std::vector<int>>());
+    std::vector<int> test = p;
+    ASSERT_EQ(ref.size(), test.size());
+    for (size_t i = 0; i < ref.size(); i++) {
+        ASSERT_EQ(ref[i], test[i]);
+    }
+}
+
+TEST_F(ParameterTests, ParameterAsUInts) {
+    std::vector<unsigned int> ref = {1, 2, 3, 4, 5};
+    Parameter p = ref;
+    ASSERT_TRUE(p.is<std::vector<unsigned int>>());
+    std::vector<unsigned int> test = p;
+    ASSERT_EQ(ref.size(), test.size());
+    for (size_t i = 0; i < ref.size(); i++) {
+        ASSERT_EQ(ref[i], test[i]);
+    }
+}
+
+TEST_F(ParameterTests, ParameterAsSize_ts) {
+    std::vector<size_t> ref = {1, 2, 3, 4, 5};
+    Parameter p = ref;
+    ASSERT_TRUE(p.is<std::vector<size_t>>());
+    std::vector<size_t>
test = p;
+    ASSERT_EQ(ref.size(), test.size());
+    for (size_t i = 0; i < ref.size(); i++) {
+        ASSERT_EQ(ref[i], test[i]);
+    }
+}
+
+TEST_F(ParameterTests, ParameterAsFloats) {
+    std::vector<float> ref = {1, 2, 3, 4, 5};
+    Parameter p = ref;
+    ASSERT_TRUE(p.is<std::vector<float>>());
+    std::vector<float> test = p;
+    ASSERT_EQ(ref.size(), test.size());
+    for (size_t i = 0; i < ref.size(); i++) {
+        ASSERT_EQ(ref[i], test[i]);
+    }
+}
+
+TEST_F(ParameterTests, ParameterAsStrings) {
+    std::vector<std::string> ref = {"test1", "test2", "test3", "test4", "test1"};
+    Parameter p = ref;
+    ASSERT_TRUE(p.is<std::vector<std::string>>());
+    std::vector<std::string> test = p;
+    ASSERT_EQ(ref.size(), test.size());
+    for (size_t i = 0; i < ref.size(); i++) {
+        ASSERT_EQ(ref[i], test[i]);
+    }
+}
+
+TEST_F(ParameterTests, ParameterAsMapOfParameters) {
+    std::map<std::string, Parameter> refMap;
+    refMap["testParamInt"] = 4;
+    refMap["testParamString"] = "test";
+    Parameter p = refMap;
+    bool isMap = p.is<std::map<std::string, Parameter>>();
+    ASSERT_TRUE(isMap);
+    std::map<std::string, Parameter> testMap = p;
+
+    ASSERT_NE(testMap.find("testParamInt"), testMap.end());
+    ASSERT_NE(testMap.find("testParamString"), testMap.end());
+
+    int testInt = testMap["testParamInt"];
+    std::string testString = testMap["testParamString"];
+
+    ASSERT_EQ(refMap["testParamInt"].as<int>(), testInt);
+    ASSERT_EQ(refMap["testParamString"].as<std::string>(), testString);
+}
+
+TEST_F(ParameterTests, ParameterNotEmpty) {
+    Parameter p = 4;
+    ASSERT_FALSE(p.empty());
+}
+
+TEST_F(ParameterTests, ParameterEmpty) {
+    Parameter p;
+    ASSERT_TRUE(p.empty());
+}
+
+TEST_F(ParameterTests, ParameterClear) {
+    Parameter p = 4;
+    ASSERT_FALSE(p.empty());
+    p.clear();
+    ASSERT_TRUE(p.empty());
+}
+
+TEST_F(ParameterTests, ParametersNotEqualByType) {
+    Parameter p1 = 4;
+    Parameter p2 = "string";
+    ASSERT_TRUE(p1 != p2);
+    ASSERT_FALSE(p1 == p2);
+}
+
+TEST_F(ParameterTests, ParametersNotEqualByValue) {
+    Parameter p1 = 4;
+    Parameter p2 = 5;
+    ASSERT_TRUE(p1 != p2);
+    ASSERT_FALSE(p1 == p2);
+}
+
+TEST_F(ParameterTests, ParametersEqual) {
+    Parameter p1 = 4;
+    Parameter p2 = 4;
+    ASSERT_TRUE(p1 == p2);
+    ASSERT_FALSE(p1 != p2);
+}
+
+TEST_F(ParameterTests, CompareParametersWithoutEqualOperator) {
+    class TestClass {
+    public:
+        TestClass(int test, int* testPtr): test(test), testPtr(testPtr) {}
+
+    private:
+        int test;
+        int* testPtr;
+    };
+
+    TestClass a(2, (int *)0x234);
+    TestClass b(2, (int *)0x234);
+    TestClass c(3, (int *)0x234);
+    Parameter parA = a;
+    Parameter parB = b;
+    Parameter parC = c;
+
+    ASSERT_THROW(bool equal = parA == parB, details::InferenceEngineException);
+    ASSERT_THROW(bool equal = parA != parB, details::InferenceEngineException);
+    ASSERT_THROW(bool equal = parA == parC, details::InferenceEngineException);
+    ASSERT_THROW(bool equal = parA != parC, details::InferenceEngineException);
+}
+
+TEST_F(ParameterTests, ParameterRemovedRealObject) {
+    ASSERT_EQ(0, DestructorTest::constructorCount);
+    ASSERT_EQ(0, DestructorTest::destructorCount);
+    {
+        DestructorTest t;
+        Parameter p1 = t;
+    }
+    ASSERT_EQ(2, DestructorTest::constructorCount);
+    ASSERT_EQ(2, DestructorTest::destructorCount);
+}
+
+TEST_F(ParameterTests, ParameterRemovedRealObjectWithDuplication) {
+    ASSERT_EQ(0, DestructorTest::constructorCount);
+    ASSERT_EQ(0, DestructorTest::destructorCount);
+    {
+        DestructorTest t;
+        Parameter p = t;
+        ASSERT_EQ(0, DestructorTest::destructorCount);
+        p = t;
+        ASSERT_EQ(2, DestructorTest::destructorCount);
+    }
+    ASSERT_EQ(4, DestructorTest::constructorCount);
+    ASSERT_EQ(4, DestructorTest::destructorCount);
+}
+
+TEST_F(ParameterTests, ParameterRemovedRealObjectPointerWithDuplication) {
+    ASSERT_EQ(0,
DestructorTest::constructorCount);
+    ASSERT_EQ(0, DestructorTest::destructorCount);
+    {
+        auto * t = new DestructorTest();
+        Parameter p = t;
+        ASSERT_EQ(1, DestructorTest::constructorCount);
+        ASSERT_EQ(0, DestructorTest::destructorCount);
+        p = t;
+        ASSERT_TRUE(p.is<DestructorTest*>());
+        DestructorTest* t2 = p;
+        ASSERT_EQ(0, DestructorTest::destructorCount);
+        delete t;
+        auto * t3 = p.as<DestructorTest*>();
+        ASSERT_EQ(t2, t3);
+    }
+    ASSERT_EQ(1, DestructorTest::constructorCount);
+    ASSERT_EQ(1, DestructorTest::destructorCount);
+}
\ No newline at end of file
diff --git a/inference-engine/tests/unit/inference_engine_tests/plugin_dispatcher_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/plugin_dispatcher_tests.cpp
index b54aa38..b2d2671 100644
--- a/inference-engine/tests/unit/inference_engine_tests/plugin_dispatcher_tests.cpp
+++ b/inference-engine/tests/unit/inference_engine_tests/plugin_dispatcher_tests.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/unit/inference_engine_tests/pointer_test.cpp b/inference-engine/tests/unit/inference_engine_tests/pointer_test.cpp
index 78985fe..374c1b4 100644
--- a/inference-engine/tests/unit/inference_engine_tests/pointer_test.cpp
+++ b/inference-engine/tests/unit/inference_engine_tests/pointer_test.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/unit/inference_engine_tests/pre_allocator_test.cpp b/inference-engine/tests/unit/inference_engine_tests/pre_allocator_test.cpp
index 42e06a0..15a7db4 100644
--- a/inference-engine/tests/unit/inference_engine_tests/pre_allocator_test.cpp
+++ b/inference-engine/tests/unit/inference_engine_tests/pre_allocator_test.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/unit/inference_engine_tests/precision_test.cpp b/inference-engine/tests/unit/inference_engine_tests/precision_test.cpp
index a044b95..51cb2fb 100644
--- a/inference-engine/tests/unit/inference_engine_tests/precision_test.cpp
+++ b/inference-engine/tests/unit/inference_engine_tests/precision_test.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/unit/inference_engine_tests/preprocess_test.cpp b/inference-engine/tests/unit/inference_engine_tests/preprocess_test.cpp
index 70ef0dc..5a9fe3d 100644
--- a/inference-engine/tests/unit/inference_engine_tests/preprocess_test.cpp
+++ b/inference-engine/tests/unit/inference_engine_tests/preprocess_test.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/tests/unit/inference_engine_tests/range_iterator_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/range_iterator_tests.cpp
index 0a10c8c..367840a 100644
--- a/inference-engine/tests/unit/inference_engine_tests/range_iterator_tests.cpp
+++ b/inference-engine/tests/unit/inference_engine_tests/range_iterator_tests.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
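The parameter tests above pin down the observable contract of InferenceEngine::Parameter, a small type-erased value container: is<T>() matches the stored type exactly (an int is not an unsigned int), implicit conversion and as<T>() throw std::bad_cast on a mismatch, comparing held types without an operator== throws, and a stored object's copy constructor and destructor fire exactly as the DestructorTest counters record. A compressed usage sketch, restricted to behavior those tests assert:

    InferenceEngine::Parameter p = 4;        // stores an int
    bool isInt = p.is<int>();                // true
    bool isUInt = p.is<unsigned int>();      // false: no numeric promotion
    int v = p;                               // converts back to the stored type
    // std::string s = p.as<std::string>();  // would throw std::bad_cast
    p.clear();                               // p.empty() becomes true again

diff --git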
a/inference-engine/tests/unit/inference_engine_tests/response_buffer_test.cpp b/inference-engine/tests/unit/inference_engine_tests/response_buffer_test.cpp index 4087637..992045f 100644 --- a/inference-engine/tests/unit/inference_engine_tests/response_buffer_test.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/response_buffer_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/shared_object_loader_test.cpp b/inference-engine/tests/unit/inference_engine_tests/shared_object_loader_test.cpp index cdea8de..41ca783 100644 --- a/inference-engine/tests/unit/inference_engine_tests/shared_object_loader_test.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/shared_object_loader_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/so_pointer_tests.cpp b/inference-engine/tests/unit/inference_engine_tests/so_pointer_tests.cpp index ed0e352..9398b69 100644 --- a/inference-engine/tests/unit/inference_engine_tests/so_pointer_tests.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/so_pointer_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/tensor_desc_test.cpp b/inference-engine/tests/unit/inference_engine_tests/tensor_desc_test.cpp index 16bd43b..8010db6 100644 --- a/inference-engine/tests/unit/inference_engine_tests/tensor_desc_test.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/tensor_desc_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/inference_engine_tests/util_const_infer_test.cpp b/inference-engine/tests/unit/inference_engine_tests/util_const_infer_test.cpp new file mode 100644 index 0000000..1a3e5bb --- /dev/null +++ b/inference-engine/tests/unit/inference_engine_tests/util_const_infer_test.cpp @@ -0,0 +1,830 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include "ie_utils.hpp" +#include "blob_factory.hpp" +#include "debug.h" +#include "util_test.hpp" +#include "util_const_infer_test.hpp" +#include
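// The fixtures below drive InferenceEngine::ConstTransformer phase by phase:
// getConstLayers marks subgraphs computable at load time, getConstData and
// foldConstSubgraphsInternal replace them with generated Const layers, and
// trimShapeInputs / fullTrim detach inputs that only define output shapes.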
+ +namespace IE = InferenceEngine; + +void RemoveLayerTests::SetUp() { + net = getNetwork(); + originalLayersNum = net->allLayers().size(); + testTransformator.reset(new ConstTransformatorTest(net.get())); +} + +// +// I1-d1-L1-d4 I4 +// / \ \ \ +// | d7 \ d10 +// | | \ / +// I2-d2-L2-d5-L4-d6-L5-d9-L10 +// / / +// / ____d8___/ +// / / +// I3-d3-L3 +// +IE::details::CNNNetworkImplPtr RemoveLayerTests::getNetwork() { + return netBuilder + .data("data1", IE::SizeVector{3, 1, 1}, IE::Precision::FP32, IE::Layout::CHW) + .data("data2", IE::SizeVector{3, 1, 1}, IE::Precision::FP32, IE::Layout::CHW) + .data("data3", IE::SizeVector{3, 1, 1}, IE::Precision::FP32, IE::Layout::CHW) + .data("data4", IE::SizeVector{3, 1, 1}, IE::Precision::FP32, IE::Layout::CHW) + .data("data5", IE::SizeVector{3, 1, 1}, IE::Precision::FP32, IE::Layout::CHW) + .data("data6", IE::SizeVector{3, 1, 1}, IE::Precision::FP32, IE::Layout::CHW) + .data("data7", IE::SizeVector{3, 1, 1}, IE::Precision::FP32, IE::Layout::CHW) + .data("data8", IE::SizeVector{3, 1, 1}, IE::Precision::FP32, IE::Layout::CHW) + .data("data9", IE::SizeVector{3, 1, 1}, IE::Precision::FP32, IE::Layout::CHW) + .data("data10", IE::SizeVector{3, 1, 1}, IE::Precision::FP32, IE::Layout::CHW) + .data("data11", IE::SizeVector{3, 1, 1}, IE::Precision::FP32, IE::Layout::CHW) + .layer(IE::LayerParams{"input1", "input", IE::Precision::FP32}) + .layer(IE::LayerParams{"input2", "Input", IE::Precision::FP32}) + .layer(IE::LayerParams{"input3", "input", IE::Precision::FP32}) + .layer(IE::LayerParams{"input4", "input", IE::Precision::FP32}) + .layer(IE::LayerParams{"layer1", "dummy", IE::Precision::FP32}) + .layer(IE::LayerParams{"layer2", "dummy", IE::Precision::FP32}) + .layer(IE::LayerParams{"layer3", "dummy", IE::Precision::FP32}) + .layer(IE::LayerParams{"layer4", "dummy", IE::Precision::FP32}) + .layer(IE::LayerParams{"layer5", "dummy", IE::Precision::FP32}) + .layer(IE::LayerParams{"layer6", "dummy", IE::Precision::FP32}) + .linkToData("input1", "data1") + .linkToData("input2", "data2") + .linkToData("input3", "data3") + .linkToData("input4", "data10") + + .linkDataTo("data1", "layer1") + .linkDataTo("data2", "layer2") + .linkDataTo("data2", "layer1") + .linkDataTo("data3", "layer3") + .linkDataTo("data3", "layer2") + .linkDataTo("data10", "layer6") + + .linkToData("layer1", "data4") + .linkToData("layer1", "data7") + .linkToData("layer2", "data5") + .linkToData("layer3", "data8") + + .linkDataTo("data4", "layer4") + .linkDataTo("data5", "layer4") + .linkDataTo("data8", "layer5") + .linkDataTo("data7", "layer2") + + .linkToData("layer4", "data6") + + .linkDataTo("data6", "layer5") + + .linkToData("layer5", "data9") + + .linkDataTo("data9", "layer6") + + .linkToData("layer6", "data11") + + .addInput("data1") + .addInput("data2") + .addInput("data3") + .finalize(); +} + +IE::CNNLayerPtr RemoveLayerTests::getLayer(const std::string& name) { + const auto& layers = netBuilder.getLayersMap(); + auto it = layers.find(name); + if (it == layers.end()) throw std::logic_error("Failed to find layer: " + name); + return it->second; +} + +IE::DataPtr RemoveLayerTests::getData(const std::string& name) { + const auto& datas = netBuilder.getDataMap(); + auto it = datas.find(name); + if (it == datas.end()) throw std::logic_error("Failed to find data: " + name); + return it->second; +} + +IE::BlobMap RemoveLayerTests::fillConstData(const std::vector& constLayers) { + IE::BlobMap constData; + for (const auto& name:constLayers) { + auto layer = getLayer(name); + for (const auto& 
outData:layer->outData) { + IE::TensorDesc desc = outData->getTensorDesc(); + IE::Blob::Ptr blob = make_blob_with_precision(desc); + blob->allocate(); + auto* buffer = blob->buffer().as(); + for (int i = 0; i < blob->size(); i++) { + buffer[i] = i + 1; + } + constData[outData->name] = blob; + } + } + return constData; +} + +IE::BlobMap RemoveLayerTests::initConstLayers(const std::vector& constLayers) { + for (const auto& name : constLayers) { + getLayer(name)->type = "Const"; + } + IE::BlobMap customBlobs = fillConstData(constLayers); + for (const auto& layerName: constLayers) { + auto layer = getLayer(layerName); + layer->type = "Const"; + layer->blobs["custom"] = customBlobs[layer->outData[0]->name]; + } + return customBlobs; +} + +TEST_F(RemoveLayerTests, canTrimL2) { + auto layer1 = getLayer("layer1"); + auto layer4 = getLayer("layer4"); + auto data2 = getData("data2"); + auto data3 = getData("data3"); + auto data7 = getData("data7"); + auto data5 = getData("data5"); + std::vector constLayers = {"layer2"}; + std::vector refNewLayers = {constLayers[0] + "__data5__Const"}; + auto constData = fillConstData(constLayers); + auto sortedLayers = IE::details::CNNNetSortTopologically(*net); + + auto newLayers = testTransformator->foldConstSubgraphsInternal({{constLayers[0], false}}, constData, sortedLayers); + + ASSERT_EQ(newLayers, refNewLayers); + IE::CNNNetwork cnnNetwork(net); + ASSERT_THROW(cnnNetwork.getLayerByName("layer2"), IE::NotFound); + auto newLayer = cnnNetwork.getLayerByName(refNewLayers[0].c_str()); + ASSERT_EQ(newLayer->type, "Const"); + ASSERT_EQ(constData["data5"], newLayer->blobs.at("custom")); + ASSERT_EQ(nullptr, net->getData("data7")); + net->removeData("data7"); + ASSERT_EQ(net->allLayers().size(), originalLayersNum); + ASSERT_EQ(data2->inputTo.size(), 1); + ASSERT_EQ(data2->inputTo.find("layer1")->second, layer1); + ASSERT_EQ(data5->creatorLayer.lock(), newLayer); + ASSERT_EQ(layer4->insData.size(), 2); + ASSERT_EQ(layer4->insData[1].lock(), data5); + ASSERT_EQ(layer1->insData.size(), 2); + ASSERT_EQ(layer1->insData[0].lock(), getData("data1")); + ASSERT_EQ(layer1->insData[1].lock(), data2); + ASSERT_EQ(layer1->outData.size(), 1); + ASSERT_EQ(layer1->outData[0], getData("data4")); + ASSERT_EQ(newLayer->outData.size(), 1); + ASSERT_EQ(newLayer->outData[0], data5); + ASSERT_EQ(data3->inputTo.size(), 1); + ASSERT_EQ(data3->inputTo.find("layer3")->second, getLayer("layer3")); +} + +TEST_F(RemoveLayerTests, canTrimI1andL1) { + auto layer4 = getLayer("layer4"); + auto layer2 = getLayer("layer2"); + auto data2 = getData("data2"); + std::vector constLayers = {"input1", "layer1"}; + std::map mapConstLayers; + for (const auto& it : constLayers) { + mapConstLayers[it] = false; + } + std::vector refNewLayers = {(constLayers[1] + "__data4__Const"), (constLayers[1] + "__data7__Const")}; + + auto constData = fillConstData(constLayers); + auto sortedLayers = IE::details::CNNNetSortTopologically(*net); + auto newLayers = testTransformator->foldConstSubgraphsInternal(mapConstLayers, constData, sortedLayers); + + ASSERT_EQ(newLayers, refNewLayers); + IE::CNNNetwork cnnNetwork(net); + ASSERT_THROW(cnnNetwork.getLayerByName("input1"), IE::NotFound); + ASSERT_THROW(cnnNetwork.getLayerByName("layer1"), IE::NotFound); + auto newLayerD4 = cnnNetwork.getLayerByName(refNewLayers[0].c_str()); + auto newLayerD7 = cnnNetwork.getLayerByName(refNewLayers[1].c_str()); + auto newData4 = net->getData("data4__layer4"); + auto newData7 = net->getData("data7__layer2"); + ASSERT_EQ(newLayerD4->type, 
"Const"); + ASSERT_EQ(newLayerD7->type, "Const"); + ASSERT_EQ(constData["data4"], newLayerD4->blobs.at("custom")); + ASSERT_EQ(constData["data7"], newLayerD7->blobs.at("custom")); + ASSERT_EQ(nullptr, net->getData("data1")); + net->removeData("data1"); + ASSERT_EQ(net->allLayers().size(), originalLayersNum); + ASSERT_EQ(data2->inputTo.size(), 1); + ASSERT_EQ(data2->inputTo.find("layer2")->second, layer2); + ASSERT_EQ(newData4->creatorLayer.lock(), newLayerD4); + ASSERT_EQ(newData7->creatorLayer.lock(), newLayerD7); + ASSERT_EQ(newLayerD4->outData.size(), 1); + ASSERT_EQ(newLayerD7->outData.size(), 1); + ASSERT_EQ(newLayerD4->outData[0], newData4); + ASSERT_EQ(newLayerD7->outData[0], newData7); + ASSERT_EQ(layer4->insData.size(), 2); + ASSERT_EQ(layer4->insData[0].lock(), newData4); + ASSERT_EQ(layer4->insData[1].lock(), getData("data5")); + ASSERT_EQ(layer2->insData.size(), 3); + ASSERT_EQ(layer2->insData[0].lock(), data2); + ASSERT_EQ(layer2->insData[1].lock(), getData("data3")); + ASSERT_EQ(layer2->insData[2].lock(), newData7); +} + +TEST_F(RemoveLayerTests, canFindConstLayers) { + getLayer("input1")->type = "Const"; + getLayer("layer2")->type = "Shape"; + + auto sortedLayers = IE::details::CNNNetSortTopologically(*net); + auto constLayers = testTransformator->getConstLayers(sortedLayers); + + ASSERT_EQ(constLayers.size(), 2); + auto begin = constLayers.begin(); + auto end = constLayers.end(); + ASSERT_FALSE(constLayers.at("input1")); + ASSERT_FALSE(constLayers.at("layer2")); +} + +TEST_F(RemoveLayerTests, canFindConstLayers2) { + getLayer("input3")->type = "Const"; + getLayer("input2")->type = "Const"; + getLayer("layer2")->type = "Shape"; + + auto sortedLayers = IE::details::CNNNetSortTopologically(*net); + auto constLayers = testTransformator->getConstLayers(sortedLayers); + + ASSERT_EQ(constLayers.size(), 4); + ASSERT_FALSE(constLayers.at("input3")); + ASSERT_FALSE(constLayers.at("layer2")); + ASSERT_FALSE(constLayers.at("layer3")); + ASSERT_FALSE(constLayers.at("input2")); +} + +TEST_F(RemoveLayerTests, canFindConstLayers3) { + getLayer("input3")->type = "Const"; + getLayer("layer2")->type = "Shape"; + getLayer("layer1")->type = "Shape"; + getLayer("layer4")->type = "Reshape"; + + auto sortedLayers = IE::details::CNNNetSortTopologically(*net); + auto constLayers = testTransformator->getConstLayers(sortedLayers); + + ASSERT_EQ(constLayers.size(), 6); + ASSERT_FALSE(constLayers.at("input3")); + ASSERT_FALSE(constLayers.at("layer1")); + ASSERT_TRUE(constLayers.at("layer2")); + ASSERT_FALSE(constLayers.at("layer3")); + ASSERT_FALSE(constLayers.at("layer4")); + ASSERT_FALSE(constLayers.at("layer5")); +} + +TEST_F(RemoveLayerTests, canFindShapeConstLayers) { + getLayer("input3")->type = "Const"; + getLayer("layer2")->type = "Shape"; + getLayer("layer1")->type = "Shape"; + getLayer("layer6")->type = "Interp"; + + auto sortedLayers = IE::details::CNNNetSortTopologically(*net); + auto constLayers = testTransformator->getConstLayers(sortedLayers); + + ASSERT_EQ(constLayers.size(), 6); + ASSERT_TRUE(constLayers.at("input3")); + ASSERT_TRUE(constLayers.at("layer1")); + ASSERT_TRUE(constLayers.at("layer2")); + ASSERT_TRUE(constLayers.at("layer3")); + ASSERT_TRUE(constLayers.at("layer4")); + ASSERT_TRUE(constLayers.at("layer5")); +} + +TEST_F(RemoveLayerTests, canFindShapeConstLayers2) { + getLayer("input3")->type = "Const"; + getLayer("input2")->type = "Const"; + getLayer("layer2")->type = "Shape"; + getLayer("layer1")->type = "Resample"; + + auto sortedLayers = 
IE::details::CNNNetSortTopologically(*net); + auto constLayers = testTransformator->getConstLayers(sortedLayers); + + ASSERT_EQ(constLayers.size(), 4); + ASSERT_FALSE(constLayers.at("input3")); + ASSERT_FALSE(constLayers.at("layer2")); + ASSERT_FALSE(constLayers.at("layer3")); + ASSERT_FALSE(constLayers.at("input2")); +} + +TEST_F(RemoveLayerTests, canTrimShapeInput) { + std::vector constLayers = {"input3", "layer3", "input2"}; + for (const auto& name : constLayers) { + getLayer(name)->type = "Const"; + } + getLayer("layer2")->type = "Shape"; + getLayer("layer1")->type = "Interp"; + getLayer("layer4")->type = "Reshape"; + getLayer("layer5")->type = "Reshape"; + auto layer1 = getLayer("layer1"); + auto layer4 = getLayer("layer4"); + auto layer5 = getLayer("layer5"); + + auto sortedLayers = IE::details::CNNNetSortTopologically(*net); + auto mapConstLayers = testTransformator->getConstLayers(sortedLayers); + auto newLayers = testTransformator->foldConstSubgraphsInternal(mapConstLayers, {}, sortedLayers); + testTransformator->trimShapeInputs(newLayers); + + ASSERT_EQ(nullptr, net->getData("data5")); + ASSERT_EQ(nullptr, net->getData("data2")); + net->removeData("data5"); + net->removeData("data2"); + ASSERT_EQ(net->allLayers().size(), originalLayersNum - 3); + ASSERT_EQ(layer1->insData.size(), 1); + ASSERT_EQ(layer1->insData[0].lock(), getData("data1")); + ASSERT_EQ(layer4->insData.size(), 1); + ASSERT_EQ(layer4->insData[0].lock(), getData("data4")); + ASSERT_EQ(layer5->insData.size(), 2); + ASSERT_EQ(layer5->insData[0].lock(), getData("data8")); + ASSERT_EQ(layer5->insData[1].lock(), getData("data6")); +} + +TEST_F(RemoveLayerTests, canTrimShapeInput2) { + std::vector constLayers = {"input3", "input2"}; + for (const auto& name : constLayers) { + getLayer(name)->type = "Const"; + } + auto layer1 = getLayer("layer1"); + auto layer2 = getLayer("layer2"); + layer1->type = "Resample"; + layer2->type = "StridedSlice"; + + testTransformator->trimShapeInputs(constLayers); + + auto data6 = net->getData("data6"); + auto data2 = net->getData("data2"); + ASSERT_EQ(data2->inputTo.size(), 1); + ASSERT_EQ(data2->inputTo.at(layer2->name), layer2); + ASSERT_EQ(net->allLayers().size(), originalLayersNum); + ASSERT_EQ(layer1->insData.size(), 1); + ASSERT_EQ(layer1->insData[0].lock(), getData("data1")); + ASSERT_EQ(layer2->insData.size(), 3); + ASSERT_EQ(layer2->insData[0].lock(), getData("data2")); + ASSERT_EQ(layer2->insData[1].lock(), getData("data3")); + ASSERT_EQ(layer2->insData[2].lock(), getData("data7")); +} + +TEST_F(RemoveLayerTests, notTrimFirstConstInput) { + std::vector testLayers = {"Interp", "Reshape", "Pad", "Gather", "Resample"}; + std::string constLayer = "input4"; + getLayer(constLayer)->type = "Const"; + auto layer6 = getLayer("layer6"); + auto data10 = getData("data10"); + for (const auto& name: testLayers) { + layer6->type = name; + + testTransformator->trimShapeInputs({constLayer}); + + ASSERT_EQ(net->allLayers().size(), originalLayersNum); + IE::CNNNetwork cnnNetwork(net); + auto input4 = cnnNetwork.getLayerByName(constLayer.c_str()); + ASSERT_EQ(data10->inputTo.size(), 1); + ASSERT_EQ(data10->creatorLayer.lock(), input4); + ASSERT_EQ(layer6->insData.size(), 2); + ASSERT_EQ(layer6->insData[0].lock(), data10); + ASSERT_EQ(layer6->insData[1].lock(), getData("data9")); + } +} + +TEST_F(RemoveLayerTests, canSaveConstForEltWise) { + auto input2 = getLayer("input2"); + auto layer1 = getLayer("layer1"); + auto data2 = getData("data2"); + input2->type = "Const"; + layer1->type = "Eltwise"; + + 
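// trimShapeInputs only detaches Const layers that feed shape-defining inputs
// (such as the second input of a Reshape); for an Eltwise consumer the Const
// carries real data, so input2 and its data2 edge must survive the pass, which
// is what the asserts below verify.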
testTransformator->trimShapeInputs({input2->name}); + + IE::CNNNetwork cnnNetwork(net); + ASSERT_NO_THROW(input2 = cnnNetwork.getLayerByName(input2->name.c_str())); + ASSERT_EQ(net->allLayers().size(), 10); + ASSERT_EQ(layer1->insData.size(), 2); + ASSERT_EQ(layer1->insData[1].lock(), data2); + ASSERT_EQ(data2->inputTo.size(), 2); + ASSERT_EQ(data2->inputTo.at(layer1->name), layer1); + ASSERT_EQ(data2->creatorLayer.lock(), input2); +} + +TEST_F(RemoveLayerTests, canSaveDataWithMultipleInputTo) { + auto input3 = getLayer("input3"); + auto layer2 = getLayer("layer2"); + auto layer3 = getLayer("layer3"); + auto data3 = getData("data3"); + input3->type = "Const"; + layer2->type = "Reshape"; + + testTransformator->trimShapeInputs({input3->name}); + + IE::CNNNetwork cnnNetwork(net); + ASSERT_NO_THROW(input3 = cnnNetwork.getLayerByName(input3->name.c_str())); + ASSERT_EQ(net->allLayers().size(), originalLayersNum); + ASSERT_EQ(layer2->insData.size(), 2); + ASSERT_EQ(layer2->insData[0].lock(), getData("data2")); + ASSERT_EQ(layer2->insData[1].lock(), getData("data7")); + ASSERT_EQ(data3->inputTo.size(), 1); + ASSERT_EQ(data3->inputTo.at(layer3->name), layer3); + ASSERT_EQ(data3->creatorLayer.lock(), input3); + ASSERT_EQ(layer3->insData.size(), 1); + ASSERT_EQ(layer3->insData[0].lock(), data3); +} + +TEST_F(RemoveLayerTests, canFoldConstSubgraphToConst) { + std::vector constLayers = {"input1", "input2", "input3"}; + std::vector refNewLayers = {"layer5__data9__Const"}; + for (const auto& name : constLayers) { + getLayer(name)->type = "Const"; + } + getLayer("layer2")->type = "Shape"; + + auto sortedLayers = IE::details::CNNNetSortTopologically(*net); + auto mapConstLayers = testTransformator->getConstLayers(sortedLayers); + auto newLayers = testTransformator->foldConstSubgraphsInternal(mapConstLayers, {}, sortedLayers); + + ASSERT_EQ(net->allLayers().size(), originalLayersNum - 7); + ASSERT_EQ(newLayers, refNewLayers); + IE::CNNNetwork cnnNetwork(net); + auto newLayer = cnnNetwork.getLayerByName(refNewLayers[0].c_str()); + ASSERT_EQ(newLayer->type, "Const"); + ASSERT_EQ(newLayer->outData[0], getData("data9")); +} + +TEST_F(RemoveLayerTests, canGetConstData) { + std::vector constLayers = {"input2", "input3", "layer3"}; + IE::BlobMap refBlobs = initConstLayers(constLayers); + std::map mapConstLayers; + for (const auto& it : constLayers) { + mapConstLayers[it] = false; + } + auto sortedLayers = IE::details::CNNNetSortTopologically(*net); + + auto actBlobs = testTransformator->getConstData(mapConstLayers, sortedLayers); + + ASSERT_EQ(actBlobs.size(), refBlobs.size()); + for (const auto& it: refBlobs) { + ASSERT_EQ(it.second, actBlobs[it.first]); + } +} + +TEST_F(RemoveLayerTests, canGetConstDataForUnknownImpl) { + initConstLayers({"input1", "input2", "input3"}); + { + getLayer("layer1")->type = "UNKNOWN"; + getLayer("layer2")->type = "UNKNOWN"; + getLayer("layer3")->type = "Shape"; + getLayer("layer4")->type = "UNKNOWN"; + getLayer("layer5")->type = "Mul"; + getLayer("layer6")->type = "Reshape"; + } + auto sortedLayers = IE::details::CNNNetSortTopologically(*net); + IE::SizeVector refShape = {1, 1, 3}; + + auto mapConstLayers = testTransformator->getConstLayers(sortedLayers); + auto actBlobs = testTransformator->getConstData(mapConstLayers, sortedLayers); + + ASSERT_EQ(getData("data9")->getTensorDesc().getDims(), refShape); +} + +TEST_F(RemoveLayerTests, canFoldConstSubgraphs) { + IE::BlobMap refBlobs = initConstLayers({"input1", "input2", "input3"}); + std::vector refNewLayers = 
{"layer5__data9__Const"}; + { // TODO: method for marking layers + getLayer("layer1")->type = "Mul"; + getLayer("layer2")->type = "Shape"; + getLayer("layer3")->type = "Power"; + getLayer("layer3")->params = {{"power", "1"}, + {"scale", "2"}, + {"shift", "-4"}}; + getLayer("layer4")->type = "Mul"; + getLayer("layer5")->type = "Mul"; + } + float arr[] = {-2.f, 0.f, 54.f}; + auto ref5 = make_blob_with_precision(getData("data9")->getTensorDesc(), arr); + + IE::ConstTransformer transformator(net.get()); + transformator.foldConstSubgraphs(); + + IE::CNNNetwork cnnNetwork(net); + ASSERT_EQ(net->allLayers().size(), originalLayersNum - 7); + auto newLayer = cnnNetwork.getLayerByName(refNewLayers[0].c_str()); + auto actualBlob = newLayer->blobs["custom"]; + ASSERT_NE(actualBlob, nullptr); + ASSERT_FALSE(actualBlob->buffer() == nullptr); + TestsCommon::compare(*actualBlob, *ref5); + ASSERT_EQ(newLayer->type, "Const"); +} + +TEST_F(RemoveLayerTests, canSkipConstCalculation) { + IE::BlobMap refBlobs = initConstLayers({"input1", "input2", "input3"}); + getLayer("layer6")->type = "Reshape"; + + IE::ConstTransformer transformator(net.get()); + transformator.foldConstSubgraphs(); + + IE::CNNNetwork cnnNetwork(net); + ASSERT_EQ(net->allLayers().size(), originalLayersNum - 8); +} + +TEST_F(RemoveLayerTests, canFoldConstWithUnknownImplForShapeDefiningLayers) { + IE::BlobMap refBlobs = initConstLayers({"input1", "input2", "input3"}); + { + getLayer("layer1")->type = "UNKNOWN"; + getLayer("layer2")->type = "UNKNOWN"; + getLayer("layer3")->type = "Shape"; + getLayer("layer4")->type = "Reshape"; + getLayer("layer5")->type = "Mul"; + getLayer("layer6")->type = "Reshape"; + } + + IE::ConstTransformer transformator(net.get()); + transformator.foldConstSubgraphs(); + + IE::CNNNetwork cnnNetwork(net); + ASSERT_EQ(net->allLayers().size(), originalLayersNum - 8); + ASSERT_EQ(getLayer("layer6")->insData.size(), 1); +} + +TEST_F(RemoveLayerTests, throwErrorOnFoldWithUnknownImplForNotShapeDefiningLayers) { + IE::BlobMap refBlobs = initConstLayers({"input1", "input2", "input3"}); + { + getLayer("layer1")->type = "UNKNOWN"; + getLayer("layer2")->type = "Shape"; + getLayer("layer3")->type = "Shape"; + getLayer("layer4")->type = "Mul"; + getLayer("layer5")->type = "Mul"; + getLayer("layer6")->type = "Gather"; + } + + IE::ConstTransformer transformator(net.get()); + ASSERT_THROW(transformator.foldConstSubgraphs(), IE::details::InferenceEngineException); +} + +TEST_F(RemoveLayerTests, canFullTrim) { + IE::BlobMap refBlobs = initConstLayers({"input1", "input2", "input3"}); + auto layer6 = getLayer("layer6"); + { // TODO: method for marking layers + getLayer("layer1")->type = "Mul"; + getLayer("layer2")->type = "Shape"; + getLayer("layer3")->type = "Power"; + getLayer("layer3")->params = {{"power", "1"}, + {"scale", "2"}, + {"shift", "-4"}}; + getLayer("layer4")->type = "Mul"; + getLayer("layer5")->type = "Mul"; + layer6->type = "Reshape"; + } + + IE::ConstTransformer transformator(net.get()); + transformator.fullTrim(); + + IE::CNNNetwork cnnNetwork(net); + std::string newName = "layer5__data9__Const"; + ASSERT_THROW(cnnNetwork.getLayerByName(newName.c_str()), IE::NotFound); + ASSERT_EQ(net->allLayers().size(), 2); + ASSERT_EQ(layer6->insData.size(), 1); + ASSERT_EQ(layer6->insData[0].lock(), getData("data10")); +} + +TEST_F(RemoveLayerTests, canFullTrimConstToReshape) { + IE::BlobMap refBlobs = initConstLayers({"input2"}); + auto layer1 = getLayer("layer1"); + layer1->type = "Reshape"; + + IE::ConstTransformer 
transformator(net.get()); + transformator.fullTrim(); + + IE::CNNNetwork cnnNetwork(net); + ASSERT_EQ(net->allLayers().size(), originalLayersNum); + ASSERT_EQ(layer1->insData.size(), 1); + ASSERT_EQ(layer1->insData[0].lock(), getData("data1")); +} + +TEST_F(AdvancedShapeInferTests, canReshape) { + // + // I2-d2-Shape + // \ + // d3 + // \ + // I1-d1-Reshape-d4 + // + net = netBuilder + .data("data1", IE::SizeVector{1, 1, 3}, IE::Precision::FP32, IE::Layout::CHW) + .data("data2", IE::SizeVector{1, 1, 1}, IE::Precision::FP32, IE::Layout::CHW) + .data("data3", IE::SizeVector{1}, IE::Precision::FP32, IE::Layout::C) + .data("data4", IE::SizeVector{1, 1, 1}, IE::Precision::FP32, IE::Layout::CHW) + .layer(IE::LayerParams{"input1", "input", IE::Precision::FP32}) + .layer(IE::LayerParams{"input2", "Input", IE::Precision::FP32}) + .layer(IE::LayerParams{"layer1", "Reshape", IE::Precision::FP32}) + .layer(IE::LayerParams{"layer2", "Shape", IE::Precision::FP32}) + .linkToData("input1", "data1") + .linkToData("input2", "data2") + .linkDataTo("data1", "layer1") + .linkDataTo("data2", "layer2") + .linkToData("layer2", "data3") + .linkDataTo("data3", "layer1") + .linkToData("layer1", "data4") + .addInput("data1") + .addInput("data2") + .finalize(); + originalLayersNum = net->allLayers().size(); + IE::CNNNetwork cnnNetwork(net); + IE::SizeVector newShape = {1, 3, 1}; + std::map inputShapes = {{"data2", newShape}}; + cnnNetwork.reshape(inputShapes); + + ASSERT_NO_THROW(cnnNetwork.getLayerByName("layer2")); + ASSERT_EQ(getData("data3")->getTensorDesc().getDims(), IE::SizeVector{3}); + ASSERT_EQ(net->allLayers().size(), originalLayersNum); + + IE::ConstTransformer transformator(net.get()); + transformator.fullTrim(); + + ASSERT_THROW(cnnNetwork.getLayerByName("layer2"), IE::NotFound); + ASSERT_EQ(getData("data4")->getTensorDesc().getDims(), newShape); + ASSERT_EQ(net->allLayers().size(), originalLayersNum - 1); +} + +TEST_F(AdvancedShapeInferTests, canReshape2) { + // + // I3-d3-Shape(L3)-d5 + // \ + // I2-d2-Shape(L2)-d4-Power(L4)-d6-Mul(L5)-d7 + // \ + // I1-d1-Reshape(L1)-d8 + // + net = netBuilder + .data("data1", IE::SizeVector{1}, IE::Precision::FP32, IE::Layout::C) + .data("data2", IE::SizeVector{1, 1, 1}, IE::Precision::FP32, IE::Layout::CHW) + .data("data3", IE::SizeVector{1, 1, 1}, IE::Precision::FP32, IE::Layout::CHW) + .data("data4", IE::SizeVector{1}, IE::Precision::FP32, IE::Layout::C) + .data("data5", IE::SizeVector{1}, IE::Precision::FP32, IE::Layout::C) + .data("data6", IE::SizeVector{1}, IE::Precision::FP32, IE::Layout::C) + .data("data7", IE::SizeVector{1}, IE::Precision::FP32, IE::Layout::C) + .data("data8", IE::SizeVector{1, 1, 1}, IE::Precision::FP32, IE::Layout::CHW) + .layer(IE::LayerParams{"input1", "input", IE::Precision::FP32}) + .layer(IE::LayerParams{"input2", "Input", IE::Precision::FP32}) + .layer(IE::LayerParams{"input3", "Input", IE::Precision::FP32}) + .layer(IE::LayerParams{"layer1", "Reshape", IE::Precision::FP32}) + .layer(IE::LayerParams{"layer2", "Shape", IE::Precision::FP32}) + .layer(IE::LayerParams{"layer3", "Shape", IE::Precision::FP32}) + .layer(IE::LayerParams{"layer4", "Power", IE::Precision::FP32}) + .layer(IE::LayerParams{"layer5", "Mul", IE::Precision::FP32}) + .linkToData("input1", "data1") + .linkToData("input2", "data2") + .linkToData("input3", "data3") + + .linkDataTo("data1", "layer1") + .linkDataTo("data2", "layer2") + .linkDataTo("data3", "layer3") + + .linkToData("layer2", "data4") + .linkToData("layer3", "data5") + + .linkDataTo("data4", "layer4") + + 
.linkToData("layer4", "data6") + + .linkDataTo("data5", "layer5") + .linkDataTo("data6", "layer5") + + .linkToData("layer5", "data7") + + .linkDataTo("data7", "layer1") + + .linkToData("layer1", "data8") + + .addInput("data1") + .addInput("data2") + .addInput("data3") + .finalize(); + originalLayersNum = net->allLayers().size(); + IE::CNNNetwork cnnNetwork(net); + IE::SizeVector newShape = {5, 9, 3}; + std::map inputShapes = {{"data1", {135}}, + {"data2", {2, 1, 1}}, + {"data3", {1, 3, 1}}}; + getLayer("layer4")->params = {{"power", "1"}, + {"scale", "2"}, + {"shift", "1"}}; + + cnnNetwork.reshape(inputShapes); + + ASSERT_EQ(getData("data7")->getTensorDesc().getDims(), IE::SizeVector{3}); + ASSERT_EQ(net->allLayers().size(), originalLayersNum); + + IE::ConstTransformer transformator(net.get()); + transformator.fullTrim(); + + ASSERT_EQ(net->allLayers().size(), originalLayersNum - 4); + ASSERT_EQ(getData("data8")->getTensorDesc().getDims(), newShape); +} + +TEST_F(AdvancedShapeInferTests, canReshapeConst) { + // + // Const-d2 + // \ + // I1-d1-Reshape(L1)-d3 + // + net = netBuilder + .data("data1", IE::SizeVector{1}, IE::Precision::FP32, IE::Layout::C) + .data("data2", IE::SizeVector{3}, IE::Precision::FP32, IE::Layout::C) + .data("data3", IE::SizeVector{1, 1, 1}, IE::Precision::FP32, IE::Layout::CHW) + .layer(IE::LayerParams{"input1", "input", IE::Precision::FP32}) + .layer(IE::LayerParams{"const1", "dummy", IE::Precision::FP32}) + .layer(IE::LayerParams{"layer1", "Reshape", IE::Precision::FP32}) + .linkToData("input1", "data1") + .linkToData("const1", "data2") + .linkDataTo("data1", "layer1") + .linkDataTo("data2", "layer1") + .linkToData("layer1", "data3") + .addInput("data1") + .finalize(); + originalLayersNum = net->allLayers().size(); + IE::CNNNetwork cnnNetwork(net); + initConstLayers({"const1"}); + IE::SizeVector newOutShape = {1, 2, 3}; + IE::SizeVector newInShape = {IE::details::product(newOutShape)}; + + std::map inputShapes = {{"data1", newInShape}}; + + cnnNetwork.reshape(inputShapes); + + ASSERT_EQ(net->allLayers().size(), originalLayersNum); + + IE::ConstTransformer transformator(net.get()); + transformator.fullTrim(); + + ASSERT_EQ(net->allLayers().size(), originalLayersNum - 1); + ASSERT_EQ(getData("data1")->getTensorDesc().getDims(), newInShape); + ASSERT_EQ(getData("data3")->getTensorDesc().getDims(), newOutShape); +} + +TEST_F(AdvancedShapeInferTests, canReshapeCHWConst) { + // + // Const-d1-Tile-d2 + // + net = netBuilder + .data("data1", IE::SizeVector{3, 1, 1}, IE::Precision::FP32, IE::Layout::CHW) + .data("data2", IE::SizeVector{1, 1, 1}, IE::Precision::FP32, IE::Layout::CHW) + .layer(IE::LayerParams{"const", "dummy", IE::Precision::FP32}) + .layer(IE::LayerParams{"tile", "Tile", IE::Precision::FP32}) + .linkToData("const", "data1") + .linkDataTo("data1", "tile") + .linkToData("tile", "data2") + .addInput("data1") + .finalize(); + getLayer("tile")->params = {{"axis", "0"}, + {"tiles", "2"}}; + originalLayersNum = net->allLayers().size(); + IE::CNNNetwork cnnNetwork(net); + initConstLayers({"const"}); + + cnnNetwork.reshape({}); + + IE::SizeVector expectedDims = {2, 1, 3}; + ASSERT_EQ(getData("data2")->getTensorDesc().getDims(), expectedDims); +} + +TEST_F(AdvancedShapeInferTests, canReshapeWithScalar) { + // + // Scalar-d2 + // \ + // I1-d1-Reshape(L1)-d3 + // + net = netBuilder + .data("data1", IE::SizeVector{1}, IE::Precision::FP32, IE::Layout::C) + .data("data2", IE::SizeVector{}, IE::Precision::FP32, IE::Layout::SCALAR) + .data("data3", IE::SizeVector{1}, 
IE::Precision::FP32, IE::Layout::C) + .layer(IE::LayerParams{"input1", "input", IE::Precision::FP32}) + .layer(IE::LayerParams{"scalar", "dummy", IE::Precision::FP32}) + .layer(IE::LayerParams{"layer1", "Reshape", IE::Precision::FP32}) + .linkToData("input1", "data1") + .linkToData("scalar", "data2") + .linkDataTo("data1", "layer1") + .linkDataTo("data2", "layer1") + .linkToData("layer1", "data3") + .addInput("data1") + .finalize(); + originalLayersNum = net->allLayers().size(); + IE::CNNNetwork cnnNetwork(net); + initConstLayers({"scalar"}); + IE::SizeVector newOutShape = {1}; + IE::SizeVector newInShape = {IE::details::product(newOutShape)}; + + std::map inputShapes = {{"data1", newInShape}}; + + cnnNetwork.reshape(inputShapes); + + ASSERT_EQ(net->allLayers().size(), originalLayersNum); + + IE::ConstTransformer transformator(net.get()); + transformator.fullTrim(); + + ASSERT_EQ(net->allLayers().size(), originalLayersNum - 1); + ASSERT_EQ(getData("data1")->getTensorDesc().getDims(), newInShape); + ASSERT_EQ(getData("data3")->getTensorDesc().getDims(), newOutShape); +} diff --git a/inference-engine/tests/unit/inference_engine_tests/util_const_infer_test.hpp b/inference-engine/tests/unit/inference_engine_tests/util_const_infer_test.hpp new file mode 100644 index 0000000..b5fe89a --- /dev/null +++ b/inference-engine/tests/unit/inference_engine_tests/util_const_infer_test.hpp @@ -0,0 +1,86 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include "ie_utils.hpp" +#include "blob_factory.hpp" +#include "debug.h" +#include "util_test.hpp" +#include
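// ConstTransformatorTest below widens the protected ConstTransformer hooks so
// the fixtures above can exercise each folding phase in isolation instead of
// going through foldConstSubgraphs() or fullTrim() end to end.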
+ +namespace IE = InferenceEngine; + +class ConstTransformatorTest : public IE::ConstTransformer { +public: + explicit ConstTransformatorTest(IE::details::CNNNetworkImpl* network) : IE::ConstTransformer(network) {} + + const std::map + getConstLayers(const std::vector& sortedLayers) override { + return ConstTransformer::getConstLayers(sortedLayers); + } + + const InferenceEngine::BlobMap getConstData(const std::map& constLayers, + const std::vector& sortedLayers) override { + return ConstTransformer::getConstData(constLayers, sortedLayers); + } + + std::vector + foldConstSubgraphsInternal(const std::map& constLayers, const IE::BlobMap& constData, + const std::vector& sortedLayers) override { + return ConstTransformer::foldConstSubgraphsInternal(constLayers, constData, sortedLayers); + } + + void trimShapeInputs(const std::vector& constLayers) override { + ConstTransformer::trimShapeInputs(constLayers); + } + +}; + +class RemoveLayerTests : public testing::Test { +protected: + void SetUp() override; + + // + // I1-d1-L1-d4 I4 + // / \ \ \ + // | d7 \ d10 + // | | \ / + // I2-d2-L2-d5-L4-d6-L5-d9-L10 + // / / + // / ____d8___/ + // / / + // I3-d3-L3 + // + IE::details::CNNNetworkImplPtr getNetwork(); + + IE::CNNLayerPtr getLayer(const std::string& name); + + IE::DataPtr getData(const std::string& name); + + IE::BlobMap fillConstData(const std::vector& constLayers); + + IE::BlobMap initConstLayers(const std::vector& constLayers); + + NetBuilder netBuilder; + IE::details::CNNNetworkImplPtr net; + size_t originalLayersNum; + std::unique_ptr testTransformator; +}; + +class AdvancedShapeInferTests : public RemoveLayerTests { +protected: + void SetUp() override {}; +}; diff --git a/inference-engine/tests/unit/inference_engine_tests/util_test.cpp b/inference-engine/tests/unit/inference_engine_tests/util_test.cpp index d62e0a1..7c9222e 100644 --- a/inference-engine/tests/unit/inference_engine_tests/util_test.cpp +++ b/inference-engine/tests/unit/inference_engine_tests/util_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -15,124 +15,12 @@ #include #include #include "ie_utils.hpp" +#include "util_test.hpp" +#include "graph_tools.hpp" namespace IE = InferenceEngine; namespace { -class NetBuilder { - using LayersMap = std::unordered_map; - using DataMap = std::unordered_map; - using InputsSet = std::unordered_set; - LayersMap _layers; - DataMap _data; - InputsSet _inputs; -public: - NetBuilder() = default; - NetBuilder(const NetBuilder&) = delete; - - template - NetBuilder& data(Args&&... args) { - auto newData = std::make_shared(std::forward(args)...); - assert(!IE::contains(_data, newData->getName())); - _data[newData->getName()] = newData; - return *this; - } - - template - NetBuilder& layer(Args&&... 
args) { - auto newLayer = std::make_shared(std::forward(args)...); - assert(!IE::contains(_layers, newLayer->name)); - _layers[newLayer->name] = std::static_pointer_cast(newLayer); - return *this; - } - - const LayersMap& getLayersMap() const { - return _layers; - } - - const DataMap& getDataMap() const { - return _data; - } - - NetBuilder& linkDataTo(const std::string& dataName, - const std::string& nextlayerName) { - assert(IE::contains(_layers, nextlayerName)); - assert(IE::contains(_data, dataName)); - - auto nextlayer = _layers[nextlayerName]; - auto data = _data[dataName]; - - nextlayer->insData.push_back(data); - data->getInputTo().insert({nextlayerName, nextlayer}); - return *this; - } - - NetBuilder& linkToData(const std::string& prevlayerName, - const std::string& dataName) { - assert(IE::contains(_layers, prevlayerName)); - assert(IE::contains(_data, dataName)); - - auto prevlayer = _layers[prevlayerName]; - auto data = _data[dataName]; - assert(nullptr == data->getCreatorLayer().lock()); - - prevlayer->outData.push_back(data); - data->getCreatorLayer() = prevlayer; - return *this; - } - - NetBuilder& linkLayers(const std::string& prevlayerName, - const std::string& nextlayerName, - const std::string& dataName) { - linkToData(prevlayerName, dataName); - linkDataTo(dataName, nextlayerName); - return *this; - } - - NetBuilder& linkData(const std::string& prevDataName, - const std::string& nextDataName, - const std::string& layerName) { - linkDataTo(prevDataName, layerName); - linkToData(layerName, nextDataName); - return *this; - } - - template - NetBuilder& addInput(const std::string& dataName, Args&&... args) { - assert(!dataName.empty()); - assert(IE::contains(_data, dataName)); - auto input = std::make_shared( - std::forward(args)...); - input->setInputData(_data[dataName]); - _inputs.insert(std::move(input)); - return *this; - } - - IE::details::CNNNetworkImplPtr finalize() { - auto net = std::make_shared(); - - for (auto&& it: _data) { - auto& data = it.second; - net->getData(it.first) = data; - if (nullptr == data->getCreatorLayer().lock()) { - auto input = std::make_shared(); - input->setInputData(data); - net->setInputInfo(input); - } - } - for (auto&& it: _layers) { - net->addLayer(it.second); - } - for (auto& i : _inputs) { - net->setInputInfo(std::move(i)); - } - - net->resolveOutput(); - - return net; - } -}; - bool checkLayers(const std::vector& layers, std::initializer_list layersToCheck) { if (layers.size() != layersToCheck.size()) { return false; @@ -537,7 +425,7 @@ TEST(UtilTests, cloneNet) { { auto layer = getLayer(net, "layer1"); - auto cloned = IE::cloneNet({layer}); + auto cloned = IE::cloneNet({layer}, nullptr); EXPECT_EQ(2, cloned->layerCount()); auto clonedLayer = getLayer(cloned, "layer1"); ASSERT_NE(nullptr, clonedLayer); @@ -555,7 +443,7 @@ TEST(UtilTests, cloneNet) { { auto layer1 = getLayer(net, "layer1"); auto layer2 = getLayer(net, "layer2"); - auto cloned = IE::cloneNet({layer1,layer2}); + auto cloned = IE::cloneNet({layer1,layer2}, nullptr); EXPECT_EQ(4, cloned->layerCount()); auto clonedLayer1 = getLayer(cloned, "layer1"); auto clonedLayer2 = getLayer(cloned, "layer2"); @@ -576,7 +464,7 @@ TEST(UtilTests, cloneNet) { { auto layer4 = getLayer(net, "layer4"); auto layer5 = getLayer(net, "layer5"); - auto cloned = IE::cloneNet({layer4,layer5}); + auto cloned = IE::cloneNet({layer4,layer5}, nullptr); EXPECT_EQ(4, cloned->layerCount()); auto clonedLayer4 = getLayer(cloned, "layer4"); auto clonedLayer5 = getLayer(cloned, "layer5"); @@ -608,7 +496,7 @@ 
TEST(UtilTests, cloneNet) { } { auto layer3 = getLayer(net, "layer3"); - auto cloned = IE::cloneNet({layer3}); + auto cloned = IE::cloneNet({layer3}, nullptr); EXPECT_EQ(2, cloned->layerCount()); auto clonedLayer3 = getLayer(cloned, "layer3"); ASSERT_NE(nullptr, clonedLayer3); @@ -638,7 +526,7 @@ TEST(UtilTests, cloneNet) { auto layer5 = getLayer(net, "layer5"); auto layer6 = getLayer(net, "layer6"); auto layer7 = getLayer(net, "layer7"); - auto cloned = IE::cloneNet({layer1,layer2,layer3,layer4,layer5,layer6,layer7}); + auto cloned = IE::cloneNet({layer1,layer2,layer3,layer4,layer5,layer6,layer7}, nullptr); EXPECT_EQ(9, cloned->layerCount()); auto clonedLayer1 = getLayer(cloned, "layer1"); auto clonedLayer2 = getLayer(cloned, "layer2"); @@ -771,7 +659,7 @@ TEST(UtilTests, cloneNet_input) { auto cloned = IE::cloneNet({getLayer(net, "layer1"), getLayer(net, "layer2"), - getLayer(net, "layer3")}); + getLayer(net, "layer3")}, nullptr); ASSERT_EQ(6, cloned->layerCount()); ASSERT_NE(nullptr, getLayer(cloned, "input1")); @@ -825,7 +713,7 @@ TEST(UtilTests, cloneNet_const) { auto cloned = IE::cloneNet({getLayer(net, "layer1"), getLayer(net, "layer2"), - getLayer(net, "layer3")}); + getLayer(net, "layer3")}, nullptr); ASSERT_EQ(6, cloned->layerCount()); ASSERT_NE(nullptr, getLayer(cloned, "input1")); @@ -1673,7 +1561,7 @@ TEST(UtilTests, replaceLayerWithNewLayer) { auto newLayer1 = std::make_shared<IE::CNNLayer>(IE::LayerParams{"layer1", "dummy", IE::Precision::UNSPECIFIED}); auto layer1 = layers.find("layer1"); EXPECT_TRUE(layer1 != layers.end()); - IE::replaceLayerWithNewLayer(*net, layer1->second, newLayer1); + CNNNetSubstituteLayer(*net, layer1->second, newLayer1); IE::CNNLayerPtr layer1Check = nullptr; net->getLayerByName("layer1", layer1Check, nullptr); ASSERT_EQ(layer1Check, newLayer1); @@ -1685,7 +1573,7 @@ TEST(UtilTests, replaceLayerWithNewLayer) { auto newLayer2 = std::make_shared<IE::CNNLayer>(IE::LayerParams{"layer2", "dummy", IE::Precision::UNSPECIFIED}); auto layer2 = layers.find("layer2"); EXPECT_TRUE(layer2 != layers.end()); - IE::replaceLayerWithNewLayer(*net, layer2->second, newLayer2); + CNNNetSubstituteLayer(*net, layer2->second, newLayer2); IE::CNNLayerPtr layer2Check = nullptr; net->getLayerByName("layer2", layer2Check, nullptr); ASSERT_EQ(layer2Check, newLayer2); @@ -1697,7 +1585,7 @@ TEST(UtilTests, replaceLayerWithNewLayer) { auto newLayer3 = std::make_shared<IE::CNNLayer>(IE::LayerParams{"layer3", "dummy", IE::Precision::UNSPECIFIED}); auto layer3 = layers.find("layer3"); EXPECT_TRUE(layer3 != layers.end()); - IE::replaceLayerWithNewLayer(*net, layer3->second, newLayer3); + CNNNetSubstituteLayer(*net, layer3->second, newLayer3); IE::CNNLayerPtr layer3Check = nullptr; net->getLayerByName("layer3", layer3Check, nullptr); ASSERT_EQ(layer3Check, newLayer3); diff --git a/inference-engine/tests/unit/inference_engine_tests/util_test.hpp b/inference-engine/tests/unit/inference_engine_tests/util_test.hpp new file mode 100644 index 0000000..76225e0 --- /dev/null +++ b/inference-engine/tests/unit/inference_engine_tests/util_test.hpp @@ -0,0 +1,121 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +namespace IE = InferenceEngine; + +class NetBuilder { + using LayersMap = std::unordered_map<std::string, IE::CNNLayerPtr>; + using DataMap = std::unordered_map<std::string, IE::DataPtr>; + using InputsSet = std::unordered_set<IE::InputInfo::Ptr>; + LayersMap _layers; + DataMap _data; + InputsSet _inputs; +public: + NetBuilder() = default; + + NetBuilder(const NetBuilder&) = delete; + + template<class T, class... Args> + NetBuilder& data(Args&& ...
args) { + auto newData = std::make_shared<T>(std::forward<Args>(args)...); + assert(!IE::contains(_data, newData->getName())); + _data[newData->getName()] = newData; + return *this; + } + + template<class T, class... Args> + NetBuilder& layer(Args&& ... args) { + auto newLayer = std::make_shared<T>(std::forward<Args>(args)...); + assert(!IE::contains(_layers, newLayer->name)); + _layers[newLayer->name] = std::static_pointer_cast<IE::CNNLayer>(newLayer); + return *this; + } + + const LayersMap& getLayersMap() const { + return _layers; + } + + const DataMap& getDataMap() const { + return _data; + } + + NetBuilder& linkDataTo(const std::string& dataName, + const std::string& nextlayerName) { + assert(IE::contains(_layers, nextlayerName)); + assert(IE::contains(_data, dataName)); + + auto nextlayer = _layers[nextlayerName]; + auto data = _data[dataName]; + + nextlayer->insData.push_back(data); + data->getInputTo().insert({nextlayerName, nextlayer}); + return *this; + } + + NetBuilder& linkToData(const std::string& prevlayerName, + const std::string& dataName) { + assert(IE::contains(_layers, prevlayerName)); + assert(IE::contains(_data, dataName)); + + auto prevlayer = _layers[prevlayerName]; + auto data = _data[dataName]; + assert(nullptr == data->getCreatorLayer().lock()); + + prevlayer->outData.push_back(data); + data->getCreatorLayer() = prevlayer; + return *this; + } + + NetBuilder& linkLayers(const std::string& prevlayerName, + const std::string& nextlayerName, + const std::string& dataName) { + linkToData(prevlayerName, dataName); + linkDataTo(dataName, nextlayerName); + return *this; + } + + NetBuilder& linkData(const std::string& prevDataName, + const std::string& nextDataName, + const std::string& layerName) { + linkDataTo(prevDataName, layerName); + linkToData(layerName, nextDataName); + return *this; + } + + template<class... Args> + NetBuilder& addInput(const std::string& dataName, Args&& ...
args) { + assert(!dataName.empty()); + assert(IE::contains(_data, dataName)); + auto input = std::make_shared<IE::InputInfo>( + std::forward<Args>(args)...); + input->setInputData(_data[dataName]); + _inputs.insert(std::move(input)); + return *this; + } + + IE::details::CNNNetworkImplPtr finalize() { + auto net = std::make_shared<IE::details::CNNNetworkImpl>(); + + for (auto&& it: _data) { + auto& data = it.second; + net->getData(it.first) = data; + if (nullptr == data->getCreatorLayer().lock()) { + auto input = std::make_shared<IE::InputInfo>(); + input->setInputData(data); + net->setInputInfo(input); + } + } + for (auto&& it: _layers) { + net->addLayer(it.second); + } + for (auto& i : _inputs) { + net->setInputInfo(std::move(i)); + } + + net->resolveOutput(); + + return net; + } +}; diff --git a/inference-engine/tests/unit/mem_solver/mem_solver_test.cpp b/inference-engine/tests/unit/mem_solver/mem_solver_test.cpp index 8ffab0f..0f430c0 100644 --- a/inference-engine/tests/unit/mem_solver/mem_solver_test.cpp +++ b/inference-engine/tests/unit/mem_solver/mem_solver_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_async_infer_request_default.hpp b/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_async_infer_request_default.hpp index 5141a1f..bd49a13 100644 --- a/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_async_infer_request_default.hpp +++ b/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_async_infer_request_default.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_async_infer_request_internal.hpp b/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_async_infer_request_internal.hpp index a8adfbb..c65279d 100644 --- a/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_async_infer_request_internal.hpp +++ b/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_async_infer_request_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_async_infer_request_thread_safe_internal.hpp b/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_async_infer_request_thread_safe_internal.hpp index 720c584..6fdc1d0 100644 --- a/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_async_infer_request_thread_safe_internal.hpp +++ b/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_async_infer_request_thread_safe_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_executable_network_internal.hpp b/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_executable_network_internal.hpp index 08bd367..8ab78b2 100644 --- a/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_executable_network_internal.hpp +++ b/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_executable_network_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -28,6 +28,6 @@ public: MOCK_METHOD1(CreateInferRequest,
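// [editorial note] A minimal usage sketch for the NetBuilder helper defined in the new
// util_test.hpp above. The names and the IE::Data constructor arguments are illustrative
// assumptions, not part of this patch:
//
//   auto net = NetBuilder()
//       .data<IE::Data>("d0", IE::SizeVector{1, 3}, IE::Precision::FP32, IE::Layout::NC)
//       .data<IE::Data>("d1", IE::SizeVector{1, 3}, IE::Precision::FP32, IE::Layout::NC)
//       .layer<IE::CNNLayer>(IE::LayerParams{"l0", "dummy", IE::Precision::FP32})
//       .linkDataTo("d0", "l0")   // d0 becomes an input of l0
//       .linkToData("l0", "d1")   // d1 becomes an output of l0
//       .finalize();              // builds the CNNNetworkImpl and resolves outputs
//
// Note that finalize() treats every Data object without a creator layer (d0 here) as a
// network input.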
void(IInferRequest::Ptr &)); MOCK_METHOD1(Export, void(const std::string &)); MOCK_METHOD1(GetMappedTopology, void(std::map> &)); - + MOCK_METHOD1(GetExecGraphInfo, void(ICNNNetwork::Ptr &)); }; diff --git a/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_executable_thread_safe_async_only.hpp b/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_executable_thread_safe_async_only.hpp index d6658bc..587dbea 100644 --- a/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_executable_thread_safe_async_only.hpp +++ b/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_executable_thread_safe_async_only.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_executable_thread_safe_default.hpp b/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_executable_thread_safe_default.hpp index f67323b..9e5a254 100644 --- a/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_executable_thread_safe_default.hpp +++ b/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_executable_thread_safe_default.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_infer_request_internal.hpp b/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_infer_request_internal.hpp index 42e299b..fc40d03 100644 --- a/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_infer_request_internal.hpp +++ b/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_infer_request_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_inference_plugin_internal.hpp b/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_inference_plugin_internal.hpp index bf3b540..9898c65 100644 --- a/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_inference_plugin_internal.hpp +++ b/inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_inference_plugin_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_iasync_infer_request_internal.hpp b/inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_iasync_infer_request_internal.hpp index cf37848..9ae7837 100644 --- a/inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_iasync_infer_request_internal.hpp +++ b/inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_iasync_infer_request_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_iexecutable_network_internal.hpp b/inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_iexecutable_network_internal.hpp index c5316bf..e630167 100644 --- a/inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_iexecutable_network_internal.hpp +++ b/inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_iexecutable_network_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel 
Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -28,4 +28,5 @@ public: MOCK_METHOD1(Export, void(const std::string &)); MOCK_METHOD1(GetMappedTopology, void(std::map> &)); MOCK_METHOD0(QueryState, std::vector()); + MOCK_METHOD1(GetExecGraphInfo, void(ICNNNetwork::Ptr &)); }; diff --git a/inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_iinfer_request_internal.hpp b/inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_iinfer_request_internal.hpp index dd1bb49..253a548 100644 --- a/inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_iinfer_request_internal.hpp +++ b/inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_iinfer_request_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_imemory_state_internal.hpp b/inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_imemory_state_internal.hpp index 03a4043..667a794 100644 --- a/inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_imemory_state_internal.hpp +++ b/inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_imemory_state_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/cpp_interfaces/mock_plugin_impl.hpp b/inference-engine/tests/unit/mocks/cpp_interfaces/mock_plugin_impl.hpp index 66c9910..99ba25b 100644 --- a/inference-engine/tests/unit/mocks/cpp_interfaces/mock_plugin_impl.hpp +++ b/inference-engine/tests/unit/mocks/cpp_interfaces/mock_plugin_impl.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/cpp_interfaces/mock_task_executor.hpp b/inference-engine/tests/unit/mocks/cpp_interfaces/mock_task_executor.hpp index 34ccb5c..1d9b79a 100644 --- a/inference-engine/tests/unit/mocks/cpp_interfaces/mock_task_executor.hpp +++ b/inference-engine/tests/unit/mocks/cpp_interfaces/mock_task_executor.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/cpp_interfaces/mock_task_synchronizer.hpp b/inference-engine/tests/unit/mocks/cpp_interfaces/mock_task_synchronizer.hpp index 2d34f1e..e8aedba 100644 --- a/inference-engine/tests/unit/mocks/cpp_interfaces/mock_task_synchronizer.hpp +++ b/inference-engine/tests/unit/mocks/cpp_interfaces/mock_task_synchronizer.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/mock_allocator.hpp b/inference-engine/tests/unit/mocks/mock_allocator.hpp index ad53afb..ce632f9 100644 --- a/inference-engine/tests/unit/mocks/mock_allocator.hpp +++ b/inference-engine/tests/unit/mocks/mock_allocator.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/mock_error_listener.hpp b/inference-engine/tests/unit/mocks/mock_error_listener.hpp index 420fc22..e2b2783 100644 --- 
a/inference-engine/tests/unit/mocks/mock_error_listener.hpp +++ b/inference-engine/tests/unit/mocks/mock_error_listener.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/mock_iasync_infer_request.hpp b/inference-engine/tests/unit/mocks/mock_iasync_infer_request.hpp index 571a7d4..e9f1aad 100644 --- a/inference-engine/tests/unit/mocks/mock_iasync_infer_request.hpp +++ b/inference-engine/tests/unit/mocks/mock_iasync_infer_request.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/mock_icnn_network.hpp b/inference-engine/tests/unit/mocks/mock_icnn_network.hpp index 43337fb..1bdac7d 100644 --- a/inference-engine/tests/unit/mocks/mock_icnn_network.hpp +++ b/inference-engine/tests/unit/mocks/mock_icnn_network.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/mock_iexecutable_network.hpp b/inference-engine/tests/unit/mocks/mock_iexecutable_network.hpp index d28d81c..5ddf2f6 100644 --- a/inference-engine/tests/unit/mocks/mock_iexecutable_network.hpp +++ b/inference-engine/tests/unit/mocks/mock_iexecutable_network.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -22,4 +22,5 @@ public: MOCK_QUALIFIED_METHOD2(GetMappedTopology, noexcept, StatusCode(std::map> &, ResponseDesc*)); MOCK_QUALIFIED_METHOD0(Release, noexcept, void ()); MOCK_QUALIFIED_METHOD3(QueryState, noexcept, StatusCode(IMemoryState::Ptr &, size_t , ResponseDesc*)); + MOCK_QUALIFIED_METHOD2(GetExecGraphInfo, noexcept, StatusCode(ICNNNetwork::Ptr &, ResponseDesc*)); }; diff --git a/inference-engine/tests/unit/mocks/mock_iformat_parser.hpp b/inference-engine/tests/unit/mocks/mock_iformat_parser.hpp index 12b7c2f..750b2f0 100644 --- a/inference-engine/tests/unit/mocks/mock_iformat_parser.hpp +++ b/inference-engine/tests/unit/mocks/mock_iformat_parser.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/mock_inference_engine.hpp b/inference-engine/tests/unit/mocks/mock_inference_engine.hpp index 150629c..dd3a991 100644 --- a/inference-engine/tests/unit/mocks/mock_inference_engine.hpp +++ b/inference-engine/tests/unit/mocks/mock_inference_engine.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/mock_not_empty_icnn_network.hpp b/inference-engine/tests/unit/mocks/mock_not_empty_icnn_network.hpp index bc71bae..1edefb7 100644 --- a/inference-engine/tests/unit/mocks/mock_not_empty_icnn_network.hpp +++ b/inference-engine/tests/unit/mocks/mock_not_empty_icnn_network.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -18,10 +18,16 @@ public: static constexpr const char* OUTPUT_BLOB_NAME = "first_output"; MOCK_QUALIFIED_METHOD0(getPrecision, const noexcept, Precision ()); void getOutputsInfo(OutputsDataMap& out) const noexcept override { 
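// [editorial note] The hunk below stops publishing null objects from the "not empty" mock
// network: each blob name now maps to a real Data object that has a consumer registered via
// getInputTo() (and, for inputs, a real InputInfo wrapper), so code under test can walk the
// graph without dereferencing nullptr. A condensed view of the new output path:
//
//   auto data = std::make_shared<Data>("", Precision::UNSPECIFIED);
//   data->getInputTo()[""] = std::make_shared<CNNLayer>(LayerParams{});
//   out[OUTPUT_BLOB_NAME] = data;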
- out[OUTPUT_BLOB_NAME] = nullptr; + auto data = std::make_shared<Data>("", Precision::UNSPECIFIED); + data->getInputTo()[""] = std::make_shared<CNNLayer>(LayerParams{}); + out[OUTPUT_BLOB_NAME] = data; }; void getInputsInfo(InputsDataMap &inputs) const noexcept override { - inputs[INPUT_BLOB_NAME] = nullptr; + auto inputInfo = std::make_shared<InputInfo>(); + auto data = std::make_shared<Data>("", Precision::UNSPECIFIED); + data->getInputTo()[""] = std::make_shared<CNNLayer>(LayerParams{}); + inputInfo->setInputData(data); + inputs[INPUT_BLOB_NAME] = inputInfo; }; MOCK_QUALIFIED_METHOD1(getInput, const noexcept, InputInfo::Ptr (const std::string &inputName)); MOCK_QUALIFIED_METHOD2(getName, const noexcept, void (char* pName, size_t len)); diff --git a/inference-engine/tests/unit/mocks/mock_plugin_dispatcher.hpp b/inference-engine/tests/unit/mocks/mock_plugin_dispatcher.hpp index 769690f..aaa1658 100644 --- a/inference-engine/tests/unit/mocks/mock_plugin_dispatcher.hpp +++ b/inference-engine/tests/unit/mocks/mock_plugin_dispatcher.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/shape_infer/mock_input_controller.hpp b/inference-engine/tests/unit/mocks/shape_infer/mock_input_controller.hpp index 4e2c2d4..e971ee7 100644 --- a/inference-engine/tests/unit/mocks/shape_infer/mock_input_controller.hpp +++ b/inference-engine/tests/unit/mocks/shape_infer/mock_input_controller.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -23,6 +23,8 @@ public: MOCK_METHOD1(getShapes, std::vector<SizeVector>(bool)); + MOCK_METHOD1(getBlobs, std::vector<Blob::CPtr>(bool)); + MOCK_METHOD0(getIRShapes, std::vector<SizeVector>()); MOCK_METHOD1(getIRShapeByName, SizeVector( diff --git a/inference-engine/tests/unit/mocks/shape_infer/mock_ishape_infer_impl.hpp b/inference-engine/tests/unit/mocks/shape_infer/mock_ishape_infer_impl.hpp index 75e70de..9868310 100644 --- a/inference-engine/tests/unit/mocks/shape_infer/mock_ishape_infer_impl.hpp +++ b/inference-engine/tests/unit/mocks/shape_infer/mock_ishape_infer_impl.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -15,7 +15,7 @@ public: using Ptr = std::shared_ptr<MockIShapeInferImpl>; MOCK_QUALIFIED_METHOD5(inferShapes, noexcept, StatusCode( - const std::vector<SizeVector> &, const std::map<std::string, std::string>&, const std::map<std::string, Blob::Ptr>&, std::vector<SizeVector> &, ResponseDesc *)); + const std::vector<Blob::CPtr> &, const std::map<std::string, std::string>&, const std::map<std::string, Blob::Ptr>&, std::vector<SizeVector> &, ResponseDesc *)); }; diff --git a/inference-engine/tests/unit/mocks/shape_infer/mock_output_controller.hpp b/inference-engine/tests/unit/mocks/shape_infer/mock_output_controller.hpp index a3cc339..b7b1b07 100644 --- a/inference-engine/tests/unit/mocks/shape_infer/mock_output_controller.hpp +++ b/inference-engine/tests/unit/mocks/shape_infer/mock_output_controller.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/shape_infer/mock_reshaper_launcher.hpp b/inference-engine/tests/unit/mocks/shape_infer/mock_reshaper_launcher.hpp index 4604546..7784a09 100644 --- a/inference-engine/tests/unit/mocks/shape_infer/mock_reshaper_launcher.hpp +++ b/inference-engine/tests/unit/mocks/shape_infer/mock_reshaper_launcher.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 
2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/mocks/shape_infer/mock_shape_infer_extension.hpp b/inference-engine/tests/unit/mocks/shape_infer/mock_shape_infer_extension.hpp index f579954..8ef5152 100644 --- a/inference-engine/tests/unit/mocks/shape_infer/mock_shape_infer_extension.hpp +++ b/inference-engine/tests/unit/mocks/shape_infer/mock_shape_infer_extension.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/opencv_test_gapi/CMakeLists.txt b/inference-engine/tests/unit/opencv_test_gapi/CMakeLists.txt index 73d3af5..5a4248f 100644 --- a/inference-engine/tests/unit/opencv_test_gapi/CMakeLists.txt +++ b/inference-engine/tests/unit/opencv_test_gapi/CMakeLists.txt @@ -1,5 +1,17 @@ -# Copyright (C) 2018 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 +# +# Copyright (C) 2018-2019 Intel Corporation. +# +# This software and the related documents are Intel copyrighted materials, +# and your use of them is governed by the express license under which they +# were provided to you (End User License Agreement for the Intel(R) Software +# Development Products (Version May 2017)). Unless the License provides +# otherwise, you may not use, modify, copy, publish, distribute, disclose or +# transmit this software or the related documents without Intel's prior +# written permission. +# +# This software and the related documents are provided as is, with no +# express or implied warranties, other than those that are expressly +# stated in the License. # if(NOT ENABLE_GAPI_TESTS) @@ -8,24 +20,25 @@ if(NOT ENABLE_GAPI_TESTS) endif() find_package(OpenCV COMPONENTS gapi) -if(NOT(OpenCV_FOUND)) +if(NOT OpenCV_FOUND) message(WARNING "No suitable OpenCV version detected, " ${TARGET_NAME} " skipped") return() endif() +add_subdirectory(fluid_test_computations) + file(GLOB SOURCES *.cpp common/*.cpp cpu/*.cpp) file(GLOB HEADERS *.hpp common/*.hpp cpu/*.hpp) set(TARGET opencv_test_gapi) add_executable(${TARGET} ${SOURCES} ${HEADERS}) -target_include_directories(${TARGET} - PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" - PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/common" - PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/cpu" - PRIVATE "${IE_MAIN_SOURCE_DIR}/thirdparty/fluid/modules/gapi/include/") +target_include_directories(${TARGET} PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}" + "${CMAKE_CURRENT_SOURCE_DIR}/common" + "${CMAKE_CURRENT_SOURCE_DIR}/cpu") -target_link_libraries(${TARGET} ${OpenCV_LIBS} inference_engine gtest gtest_main) +target_link_libraries(${TARGET} PRIVATE ${OpenCV_LIBS} inference_engine_s fluid_test_computations gtest gtest_main) if(GAPI_TEST_PERF) target_compile_definitions(${TARGET} PRIVATE -DPERF_TEST=1) diff --git a/inference-engine/tests/unit/opencv_test_gapi/common/gapi_core_tests.cpp b/inference-engine/tests/unit/opencv_test_gapi/common/gapi_core_tests.cpp index fb57725..e46d81a 100644 --- a/inference-engine/tests/unit/opencv_test_gapi/common/gapi_core_tests.cpp +++ b/inference-engine/tests/unit/opencv_test_gapi/common/gapi_core_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/opencv_test_gapi/common/gapi_core_tests.hpp b/inference-engine/tests/unit/opencv_test_gapi/common/gapi_core_tests.hpp index 7a251f9..884554f 100644 --- 
a/inference-engine/tests/unit/opencv_test_gapi/common/gapi_core_tests.hpp +++ b/inference-engine/tests/unit/opencv_test_gapi/common/gapi_core_tests.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -13,15 +13,9 @@ namespace opencv_test { -struct ResizeTestGAPI: public testing::TestWithParam, double, cv::GCompileArgs>> {}; - -struct Split2TestGAPI: public TestParams> {}; -struct Split3TestGAPI: public TestParams> {}; -struct Split4TestGAPI: public TestParams> {}; - -struct Merge2TestGAPI: public TestParams> {}; -struct Merge3TestGAPI: public TestParams> {}; -struct Merge4TestGAPI: public TestParams> {}; +struct ResizeTestGAPI: public testing::TestWithParam, double>> {}; +struct SplitTestGAPI: public TestParams> {}; +struct MergeTestGAPI: public TestParams> {}; //------------------------------------------------------------------------------ diff --git a/inference-engine/tests/unit/opencv_test_gapi/common/gapi_core_tests_inl.hpp b/inference-engine/tests/unit/opencv_test_gapi/common/gapi_core_tests_inl.hpp index 3daaba5..9f92449 100644 --- a/inference-engine/tests/unit/opencv_test_gapi/common/gapi_core_tests_inl.hpp +++ b/inference-engine/tests/unit/opencv_test_gapi/common/gapi_core_tests_inl.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -9,9 +9,7 @@ #include "blob_factory.hpp" #include "blob_transform.hpp" -#include "ie_preprocess.hpp" #include "ie_preprocess_data.hpp" -#include "ie_preprocess_gapi_kernels.hpp" #include #include @@ -23,7 +21,7 @@ #include -#define CV_MAT_CHANNELS(flags) (((flags) >> CV_CN_SHIFT) + 1) +#include // Can be set externally (via CMake) if built with -DGAPI_TEST_PERF=ON #ifndef PERF_TEST @@ -107,14 +105,27 @@ static cv::String typeToString(int type) } #endif // PERF_TEST +namespace { + +test::Mat to_test(cv::Mat& mat) { return {mat.rows, mat.cols, mat.type(), mat.data}; } +std::vector to_test(std::vector& mats) +{ + std::vector test_mats(mats.size()); + for (int i = 0; i < mats.size(); i++) { + test_mats[i] = to_test(mats[i]); + } + return test_mats; +} + +} // anonymous namespace + TEST_P(ResizeTestGAPI, AccuracyTest) { int type = 0, interp = 0; cv::Size sz_in, sz_out; double tolerance = 0.0; - cv::GCompileArgs compile_args; std::pair sizes; - std::tie(type, interp, sizes, tolerance, compile_args) = GetParam(); + std::tie(type, interp, sizes, tolerance) = GetParam(); std::tie(sz_in, sz_out) = sizes; cv::Mat in_mat1 (sz_in, type ); @@ -127,42 +138,12 @@ TEST_P(ResizeTestGAPI, AccuracyTest) cv::Mat out_mat_ocv(sz_out, type); // G-API code ////////////////////////////////////////////////////////////// - cv::GMat in, out; - switch (CV_MAT_CHANNELS(type)) - { - case 1: - out = InferenceEngine::gapi::ScalePlane::on(in, type, sz_in, sz_out, interp); - break; - case 3: - { - int depth = CV_MAT_DEPTH(type); - int type1 = CV_MAKE_TYPE(depth, 1); - cv::GMat in0, in1, in2, out0, out1, out2; - std::tie(in0, in1, in2) = InferenceEngine::gapi::Split3::on(in); - out0 = InferenceEngine::gapi::ScalePlane::on(in0, type1, sz_in, sz_out, interp); - out1 = InferenceEngine::gapi::ScalePlane::on(in1, type1, sz_in, sz_out, interp); - out2 = InferenceEngine::gapi::ScalePlane::on(in2, type1, sz_in, sz_out, interp); - out = InferenceEngine::gapi::Merge3::on(out0, out1, out2); - } - break; - default: CV_Assert(!"ERROR: unsupported number of channels!"); - } - - cv::GComputation c(in, out); 
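// [editorial note] The G-API graph construction removed here does not disappear: it moves
// into the new fluid_test_computations shared library (see buildResizeComputation later in
// this patch), and the test body shrinks to the wrapper calls
//
//   FluidResizeComputation rc(to_test(in_mat1), to_test(out_mat), interp);
//   rc.warmUp();   // compiles the G-API graph and runs it once
//
// apparently so that G-API types no longer leak into the test translation unit, which now
// links against inference_engine_s and fluid_test_computations instead.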
- - // compile graph, and test once - - auto own_in_mat1 = cv::to_own(in_mat1); - auto own_out_mat = cv::to_own(out_mat); - - std::vector v_in = { own_in_mat1 }; - std::vector v_out = { own_out_mat }; - - c.apply(v_in, v_out, std::move(compile_args)); + FluidResizeComputation rc(to_test(in_mat1), to_test(out_mat), interp); + rc.warmUp(); #if PERF_TEST // iterate testing, and print performance - test_ms([&](){ c.apply(v_in, v_out); }, + test_ms([&](){ rc.apply(); }, 100, "Resize GAPI %s %s %dx%d -> %dx%d", interpToString(interp).c_str(), typeToString(type).c_str(), sz_in.width, sz_in.height, sz_out.width, sz_out.height); @@ -180,299 +161,75 @@ TEST_P(ResizeTestGAPI, AccuracyTest) } } -TEST_P(Split2TestGAPI, AccuracyTest) -{ - int depth = std::get<0>(GetParam()); - cv::Size sz_in = std::get<1>(GetParam()); - auto compile_args = std::get<2>(GetParam()); - - int type1 = CV_MAKE_TYPE(depth, 1); - int type2 = CV_MAKE_TYPE(depth, 2); - initMatrixRandU(type2, sz_in, type1); - - cv::Mat out_mat2 = cv::Mat(sz_in, type1); - cv::Mat out_mat_ocv2 = cv::Mat(sz_in, type1); - - // G-API code ////////////////////////////////////////////////////////////// - cv::GMat in1, out1, out2; - std::tie(out1, out2) = InferenceEngine::gapi::Split2::on(in1); - cv::GComputation c(cv::GIn(in1), cv::GOut(out1, out2)); - - // compile graph, and test once - - auto own_in_mat1 = cv::to_own(in_mat1); - auto own_out_mat_gapi = cv::to_own(out_mat_gapi); - auto own_out_mat2 = cv::to_own(out_mat2); - - std::vector v_in = { own_in_mat1 }; - std::vector v_out = { own_out_mat_gapi, own_out_mat2 }; - - c.apply(v_in, v_out, std::move(compile_args)); - -#if PERF_TEST - // iterate testing, and print performance - test_ms([&](){ c.apply(v_in, v_out); }, - 400, "Split GAPI %s %dx%d", typeToString(type2).c_str(), sz_in.width, sz_in.height); -#endif - - // OpenCV code ///////////////////////////////////////////////////////////// - { - std::vector out_mats_ocv = {out_mat_ocv, out_mat_ocv2}; - cv::split(in_mat1, out_mats_ocv); - } - // Comparison ////////////////////////////////////////////////////////////// - { - EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi)); - EXPECT_EQ(0, cv::countNonZero(out_mat_ocv2 != out_mat2)); - } -} - -TEST_P(Split3TestGAPI, AccuracyTest) -{ - int depth = std::get<0>(GetParam()); - cv::Size sz_in = std::get<1>(GetParam()); - auto compile_args = std::get<2>(GetParam()); - - int type1 = CV_MAKE_TYPE(depth, 1); - int type3 = CV_MAKE_TYPE(depth, 3); - initMatrixRandU(type3, sz_in, type1); - - cv::Mat out_mat2 = cv::Mat(sz_in, type1); - cv::Mat out_mat3 = cv::Mat(sz_in, type1); - cv::Mat out_mat_ocv2 = cv::Mat(sz_in, type1); - cv::Mat out_mat_ocv3 = cv::Mat(sz_in, type1); - - // G-API code ////////////////////////////////////////////////////////////// - cv::GMat in1, out1, out2, out3; - std::tie(out1, out2, out3) = InferenceEngine::gapi::Split3::on(in1); - cv::GComputation c(cv::GIn(in1), cv::GOut(out1, out2, out3)); - - // compile graph, and test once - - auto own_in_mat1 = cv::to_own(in_mat1); - auto own_out_mat_gapi = cv::to_own(out_mat_gapi); - auto own_out_mat2 = cv::to_own(out_mat2); - auto own_out_mat3 = cv::to_own(out_mat3); - - std::vector v_in = { own_in_mat1 }; - std::vector v_out = { own_out_mat_gapi, own_out_mat2, own_out_mat3 }; - - c.apply(v_in, v_out, std::move(compile_args)); - -#if PERF_TEST - // iterate testing, and print performance - test_ms([&](){ c.apply(v_in, v_out); }, - 400, "Split GAPI %s %dx%d", typeToString(type3).c_str(), sz_in.width, sz_in.height); -#endif - - // OpenCV code 
///////////////////////////////////////////////////////////// - { - std::vector out_mats_ocv = {out_mat_ocv, out_mat_ocv2, out_mat_ocv3}; - cv::split(in_mat1, out_mats_ocv); - } - // Comparison ////////////////////////////////////////////////////////////// - { - EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi)); - EXPECT_EQ(0, cv::countNonZero(out_mat_ocv2 != out_mat2)); - EXPECT_EQ(0, cv::countNonZero(out_mat_ocv3 != out_mat3)); - } -} - -TEST_P(Split4TestGAPI, AccuracyTest) +TEST_P(SplitTestGAPI, AccuracyTest) { - int depth = std::get<0>(GetParam()); - cv::Size sz_in = std::get<1>(GetParam()); - auto compile_args = std::get<2>(GetParam()); + const auto params = GetParam(); + int planes = std::get<0>(params); + int depth = std::get<1>(params); + cv::Size sz = std::get<2>(params); - int type1 = CV_MAKE_TYPE(depth, 1); - int type4 = CV_MAKE_TYPE(depth, 4); - initMatrixRandU(type4, sz_in, type1); - - cv::Mat out_mat2 = cv::Mat(sz_in, type1); - cv::Mat out_mat3 = cv::Mat(sz_in, type1); - cv::Mat out_mat4 = cv::Mat(sz_in, type1); - cv::Mat out_mat_ocv2 = cv::Mat(sz_in, type1); - cv::Mat out_mat_ocv3 = cv::Mat(sz_in, type1); - cv::Mat out_mat_ocv4 = cv::Mat(sz_in, type1); - - // G-API code ////////////////////////////////////////////////////////////// - cv::GMat in1, out1, out2, out3, out4; - std::tie(out1, out2, out3, out4) = InferenceEngine::gapi::Split4::on(in1); - cv::GComputation c(cv::GIn(in1), cv::GOut(out1, out2, out3, out4)); - - // compile graph, and test once - - auto own_in_mat1 = cv::to_own(in_mat1); - auto own_out_mat_gapi = cv::to_own(out_mat_gapi); - auto own_out_mat2 = cv::to_own(out_mat2); - auto own_out_mat3 = cv::to_own(out_mat3); - auto own_out_mat4 = cv::to_own(out_mat4); - - std::vector v_in = { own_in_mat1 }; - std::vector v_out = { own_out_mat_gapi, own_out_mat2, - own_out_mat3, own_out_mat4 }; - - c.apply(v_in, v_out, std::move(compile_args)); - -#if PERF_TEST - // iterate testing, and print performance - test_ms([&](){ c.apply(v_in, v_out); }, - 400, "Split GAPI %s %dx%d", typeToString(type4).c_str(), sz_in.width, sz_in.height); -#endif - - // OpenCV code ///////////////////////////////////////////////////////////// - { - std::vector out_mats_ocv = {out_mat_ocv, out_mat_ocv2, out_mat_ocv3, out_mat_ocv4}; - cv::split(in_mat1, out_mats_ocv); - } - // Comparison ////////////////////////////////////////////////////////////// - { - EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi)); - EXPECT_EQ(0, cv::countNonZero(out_mat_ocv2 != out_mat2)); - EXPECT_EQ(0, cv::countNonZero(out_mat_ocv3 != out_mat3)); - EXPECT_EQ(0, cv::countNonZero(out_mat_ocv4 != out_mat4)); - } -} + int srcType = CV_MAKE_TYPE(depth, planes); + int dstType = CV_MAKE_TYPE(depth, 1); -TEST_P(Merge2TestGAPI, AccuracyTest) -{ - int depth = std::get<0>(GetParam()); - cv::Size sz_in = std::get<1>(GetParam()); - auto compile_args = std::get<2>(GetParam()); + cv::Mat in_mat(sz, srcType); + cv::randn(in_mat, cv::Scalar::all(127), cv::Scalar::all(40.f)); - int type1 = CV_MAKE_TYPE(depth, 1); - int type2 = CV_MAKE_TYPE(depth, 2); - initMatsRandU(type1, sz_in, type2); + std::vector out_mats_gapi(planes, cv::Mat::zeros(sz, dstType)); + std::vector out_mats_ocv (planes, cv::Mat::zeros(sz, dstType)); // G-API code ////////////////////////////////////////////////////////////// - cv::GMat in1, in2; - auto out = InferenceEngine::gapi::Merge2::on(in1, in2); - cv::GComputation c(cv::GIn(in1, in2), cv::GOut(out)); - - // compile graph, and test once - - auto own_in_mat1 = cv::to_own(in_mat1); - auto 
own_in_mat2 = cv::to_own(in_mat2); - auto own_out_mat_gapi = cv::to_own(out_mat_gapi); - - std::vector v_in = { own_in_mat1, own_in_mat2 }; - std::vector v_out = { own_out_mat_gapi }; - - c.apply(v_in, v_out, std::move(compile_args)); + FluidSplitComputation sc(to_test(in_mat), to_test(out_mats_gapi)); + sc.warmUp(); #if PERF_TEST // iterate testing, and print performance - test_ms([&](){ c.apply(v_in, v_out); }, - 400, "Merge GAPI %s %dx%d", typeToString(type2).c_str(), sz_in.width, sz_in.height); + test_ms([&](){ sc.apply(); }, + 400, "Split GAPI %s %dx%d", typeToString(srcType).c_str(), sz.width, sz.height); #endif // OpenCV code ///////////////////////////////////////////////////////////// { - std::vector in_mats_ocv = {in_mat1, in_mat2}; - cv::merge(in_mats_ocv, out_mat_ocv); + cv::split(in_mat, out_mats_ocv); } // Comparison ////////////////////////////////////////////////////////////// { - EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi)); + for (int p = 0; p < planes; p++) { + EXPECT_EQ(0, cv::countNonZero(out_mats_ocv[p] != out_mats_gapi[p])); + } } } -TEST_P(Merge3TestGAPI, AccuracyTest) +TEST_P(MergeTestGAPI, AccuracyTest) { - int depth = std::get<0>(GetParam()); - cv::Size sz_in = std::get<1>(GetParam()); - auto compile_args = std::get<2>(GetParam()); + const auto params = GetParam(); + int planes = std::get<0>(params); + int depth = std::get<1>(params); + cv::Size sz = std::get<2>(params); - int type1 = CV_MAKE_TYPE(depth, 1); - int type3 = CV_MAKE_TYPE(depth, 3); - initMatsRandU(type1, sz_in, type3); + int srcType = CV_MAKE_TYPE(depth, 1); + int dstType = CV_MAKE_TYPE(depth, planes); - cv::Scalar mean = cv::Scalar::all(127); - cv::Scalar stddev = cv::Scalar::all(40.f); - - cv::Mat in_mat3(sz_in, type1); - cv::randn(in_mat3, mean, stddev); - - // G-API code ////////////////////////////////////////////////////////////// - cv::GMat in1, in2, in3; - auto out = InferenceEngine::gapi::Merge3::on(in1, in2, in3); - cv::GComputation c(cv::GIn(in1, in2, in3), cv::GOut(out)); - - // compile graph, and test once - - auto own_in_mat1 = cv::to_own(in_mat1); - auto own_in_mat2 = cv::to_own(in_mat2); - auto own_in_mat3 = cv::to_own(in_mat3); - auto own_out_mat_gapi = cv::to_own(out_mat_gapi); - - std::vector v_in = { own_in_mat1, own_in_mat2, own_in_mat3 }; - std::vector v_out = { own_out_mat_gapi }; - - c.apply(v_in, v_out, std::move(compile_args)); - -#if PERF_TEST - // iterate testing, and print performance - test_ms([&](){ c.apply(v_in, v_out); }, - 400, "Merge GAPI %s %dx%d", typeToString(type3).c_str(), sz_in.width, sz_in.height); -#endif - - // OpenCV code ///////////////////////////////////////////////////////////// - { - std::vector in_mats_ocv = {in_mat1, in_mat2, in_mat3}; - cv::merge(in_mats_ocv, out_mat_ocv); + std::vector in_mats(planes, cv::Mat(sz, srcType)); + for (int p = 0; p < planes; p++) { + cv::randn(in_mats[p], cv::Scalar::all(127), cv::Scalar::all(40.f)); } - // Comparison ////////////////////////////////////////////////////////////// - { - EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi)); - } -} - -TEST_P(Merge4TestGAPI, AccuracyTest) -{ - int depth = std::get<0>(GetParam()); - cv::Size sz_in = std::get<1>(GetParam()); - auto compile_args = std::get<2>(GetParam()); - int type1 = CV_MAKE_TYPE(depth, 1); - int type4 = CV_MAKE_TYPE(depth, 4); - initMatsRandU(type1, sz_in, type4); - - cv::Scalar mean = cv::Scalar::all(127); - cv::Scalar stddev = cv::Scalar::all(40.f); - - cv::Mat in_mat3(sz_in, type1); - cv::Mat in_mat4(sz_in, type1); - 
cv::randn(in_mat3, mean, stddev); - cv::randn(in_mat4, mean, stddev); + cv::Mat out_mat_ocv = cv::Mat::zeros(sz, dstType); + cv::Mat out_mat_gapi = cv::Mat::zeros(sz, dstType); // G-API code ////////////////////////////////////////////////////////////// - cv::GMat in1, in2, in3, in4; - auto out = InferenceEngine::gapi::Merge4::on(in1, in2, in3, in4); - cv::GComputation c(cv::GIn(in1, in2, in3, in4), cv::GOut(out)); - - // compile graph, and test once - - auto own_in_mat1 = cv::to_own(in_mat1); - auto own_in_mat2 = cv::to_own(in_mat2); - auto own_in_mat3 = cv::to_own(in_mat3); - auto own_in_mat4 = cv::to_own(in_mat4); - auto own_out_mat_gapi = cv::to_own(out_mat_gapi); - - std::vector v_in = { own_in_mat1, own_in_mat2, own_in_mat3, own_in_mat4 }; - std::vector v_out = { own_out_mat_gapi }; - - c.apply(v_in, v_out, std::move(compile_args)); + FluidMergeComputation mc(to_test(in_mats), to_test(out_mat_gapi)); + mc.warmUp(); #if PERF_TEST // iterate testing, and print performance - test_ms([&](){ c.apply(v_in, v_out); }, - 400, "Merge GAPI %s %dx%d", typeToString(type4).c_str(), sz_in.width, sz_in.height); + test_ms([&](){ mc.apply(); }, + 400, "Merge GAPI %s %dx%d", typeToString(dstType).c_str(), sz.width, sz.height); #endif // OpenCV code ///////////////////////////////////////////////////////////// { - std::vector in_mats_ocv = {in_mat1, in_mat2, in_mat3, in_mat4}; - cv::merge(in_mats_ocv, out_mat_ocv); + cv::merge(in_mats, out_mat_ocv); } // Comparison ////////////////////////////////////////////////////////////// { @@ -534,11 +291,11 @@ TEST_P(ResizeTestIE, AccuracyTest) ResizeAlgorithm algorithm = cv::INTER_AREA == interp ? RESIZE_AREA : RESIZE_BILINEAR; // test once to warm-up cache - preprocess.execute(out_blob, algorithm); + preprocess.execute(out_blob, algorithm, false); #if PERF_TEST // iterate testing, and print performance - test_ms([&](){ preprocess.execute(out_blob, algorithm); }, + test_ms([&](){ preprocess.execute(out_blob, algorithm, false); }, 100, "Resize IE %s %s %dx%d -> %dx%d", interpToString(interp).c_str(), typeToString(type).c_str(), sz_in.width, sz_in.height, sz_out.width, sz_out.height); @@ -827,7 +584,7 @@ TEST_P(PreprocTest, Performance) preprocess.setRoiBlob(in_blob); // test once to warm-up cache - preprocess.execute(out_blob, interp); + preprocess.execute(out_blob, interp, false); switch (prec) { @@ -859,7 +616,7 @@ TEST_P(PreprocTest, Performance) const auto in_layout_str = layout_to_str(in_layout); const auto out_layout_str = layout_to_str(out_layout); - test_ms([&]() { preprocess.execute(out_blob, interp); }, + test_ms([&]() { preprocess.execute(out_blob, interp, false); }, 300, "Preproc %s %d %s %s %dx%d %s %dx%d", type_str.c_str(), diff --git a/inference-engine/tests/unit/opencv_test_gapi/common/gapi_tests_common.hpp b/inference-engine/tests/unit/opencv_test_gapi/common/gapi_tests_common.hpp index 27b43e3..f442076 100644 --- a/inference-engine/tests/unit/opencv_test_gapi/common/gapi_tests_common.hpp +++ b/inference-engine/tests/unit/opencv_test_gapi/common/gapi_tests_common.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/opencv_test_gapi/cpu/gapi_core_tests_fluid.cpp b/inference-engine/tests/unit/opencv_test_gapi/cpu/gapi_core_tests_fluid.cpp index 31714b6..040dfe6 100644 --- a/inference-engine/tests/unit/opencv_test_gapi/cpu/gapi_core_tests_fluid.cpp +++ 
b/inference-engine/tests/unit/opencv_test_gapi/cpu/gapi_core_tests_fluid.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -101,45 +101,23 @@ INSTANTIATE_TEST_CASE_P(ResizeTestFluid_U8, ResizeTestGAPI, Combine(Values(CV_8UC1, CV_8UC3), Values(cv::INTER_LINEAR, cv::INTER_AREA), Values(TEST_RESIZE_PAIRS), - Values(1), // error not more than 1 unit - Values(cv::compile_args(CORE_FLUID)))); + Values(1))); // error not more than 1 unit INSTANTIATE_TEST_CASE_P(ResizeTestFluid_F32, ResizeTestGAPI, Combine(Values(CV_32FC1, CV_32FC3), Values(cv::INTER_LINEAR, cv::INTER_AREA), Values(TEST_RESIZE_PAIRS), - Values(0.015), // accuracy like ~1.5% - Values(cv::compile_args(CORE_FLUID)))); - -INSTANTIATE_TEST_CASE_P(Split2TestFluid, Split2TestGAPI, - Combine(Values(CV_8U, CV_32F), - Values(TEST_SIZES), - Values(cv::compile_args(CORE_FLUID)))); - -INSTANTIATE_TEST_CASE_P(Split3TestFluid, Split3TestGAPI, - Combine(Values(CV_8U, CV_32F), - Values(TEST_SIZES), - Values(cv::compile_args(CORE_FLUID)))); - -INSTANTIATE_TEST_CASE_P(Split4TestFluid, Split4TestGAPI, - Combine(Values(CV_8U, CV_32F), - Values(TEST_SIZES), - Values(cv::compile_args(CORE_FLUID)))); - -INSTANTIATE_TEST_CASE_P(Merge2TestFluid, Merge2TestGAPI, - Combine(Values(CV_8U, CV_32F), - Values(TEST_SIZES), - Values(cv::compile_args(CORE_FLUID)))); - -INSTANTIATE_TEST_CASE_P(Merge3TestFluid, Merge3TestGAPI, - Combine(Values(CV_8U, CV_32F), - Values(TEST_SIZES), - Values(cv::compile_args(CORE_FLUID)))); - -INSTANTIATE_TEST_CASE_P(Merge4TestFluid, Merge4TestGAPI, - Combine(Values(CV_8U, CV_32F), - Values(TEST_SIZES), - Values(cv::compile_args(CORE_FLUID)))); + Values(0.015))); // accuracy like ~1.5% + +INSTANTIATE_TEST_CASE_P(SplitTestFluid, SplitTestGAPI, + Combine(Values(2, 3, 4), + Values(CV_8U, CV_32F), + Values(TEST_SIZES))); + +INSTANTIATE_TEST_CASE_P(MergeTestFluid, MergeTestGAPI, + Combine(Values(2, 3, 4), + Values(CV_8U, CV_32F), + Values(TEST_SIZES))); //---------------------------------------------------------------------- diff --git a/inference-engine/tests/unit/opencv_test_gapi/fluid_test_computations/CMakeLists.txt b/inference-engine/tests/unit/opencv_test_gapi/fluid_test_computations/CMakeLists.txt new file mode 100644 index 0000000..5ade83a --- /dev/null +++ b/inference-engine/tests/unit/opencv_test_gapi/fluid_test_computations/CMakeLists.txt @@ -0,0 +1,25 @@ +# +# Copyright 2019 Intel Corporation. +# +# This software and the related documents are Intel copyrighted materials, +# and your use of them is governed by the express license under which they +# were provided to you (End User License Agreement for the Intel(R) Software +# Development Products (Version May 2017)). Unless the License provides +# otherwise, you may not use, modify, copy, publish, distribute, disclose or +# transmit this software or the related documents without Intel's prior +# written permission. +# +# This software and the related documents are provided as is, with no +# express or implied warranties, other than those that are expressly +# stated in the License. 
+# + +file(GLOB SRC *.cpp) +file(GLOB HDR *.hpp) + +add_library(fluid_test_computations SHARED ${SRC} ${HDR}) + +target_include_directories(fluid_test_computations PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}") + +target_link_libraries(fluid_test_computations PRIVATE inference_engine_s + PRIVATE fluid) diff --git a/inference-engine/tests/unit/opencv_test_gapi/fluid_test_computations/fluid_test_computations.cpp b/inference-engine/tests/unit/opencv_test_gapi/fluid_test_computations/fluid_test_computations.cpp new file mode 100644 index 0000000..9efd2ee --- /dev/null +++ b/inference-engine/tests/unit/opencv_test_gapi/fluid_test_computations/fluid_test_computations.cpp @@ -0,0 +1,133 @@ +#include +#include +#include + +#define CV_MAT_CHANNELS(flags) (((flags) >> CV_CN_SHIFT) + 1) + +namespace opencv_test +{ +struct FluidComputation::Priv +{ + cv::GComputation m_c; + std::vector m_v_in; + std::vector m_v_out; +}; + +FluidComputation::FluidComputation(Priv *priv) + : m_priv(priv) +{} + +void FluidComputation::warmUp() +{ + m_priv->m_c.apply(m_priv->m_v_in, m_priv->m_v_out, cv::compile_args(InferenceEngine::gapi::preprocKernels())); +} + +void FluidComputation::apply() +{ + m_priv->m_c.apply(m_priv->m_v_in, m_priv->m_v_out); +} + +namespace +{ +cv::gapi::own::Mat to_own(test::Mat mat) { return {mat.rows, mat.cols, mat.type, mat.data}; } + +std::vector to_own(std::vector mats) +{ + std::vector own_mats(mats.size()); + for (int i = 0; i < mats.size(); i++) { + own_mats[i] = to_own(mats[i]); + } + return own_mats; +} + +template +std::vector to_vec_impl(std::tuple &&gmats, cv::detail::Seq) { + return { std::get(gmats)... }; +} + +template +std::vector to_vec(std::tuple &&gmats) { + return to_vec_impl(std::move(gmats), typename cv::detail::MkSeq::type()); +} +} // anonymous namespace + +static cv::GComputation buildResizeComputation(test::Mat inMat, test::Mat outMat, int interp) +{ + cv::gapi::own::Size sz_in { inMat.cols, inMat.rows}; + cv::gapi::own::Size sz_out {outMat.cols, outMat.rows}; + int type = outMat.type; + cv::GMat in, out; + switch (CV_MAT_CHANNELS(type)) { + case 1: + out = InferenceEngine::gapi::ScalePlane::on(in, type, sz_in, sz_out, interp); + break; + case 3: + { + int depth = CV_MAT_DEPTH(type); + int type1 = CV_MAKE_TYPE(depth, 1); + cv::GMat in0, in1, in2, out0, out1, out2; + std::tie(in0, in1, in2) = InferenceEngine::gapi::Split3::on(in); + out0 = InferenceEngine::gapi::ScalePlane::on(in0, type1, sz_in, sz_out, interp); + out1 = InferenceEngine::gapi::ScalePlane::on(in1, type1, sz_in, sz_out, interp); + out2 = InferenceEngine::gapi::ScalePlane::on(in2, type1, sz_in, sz_out, interp); + out = InferenceEngine::gapi::Merge3::on(out0, out1, out2); + } + break; + default: GAPI_Assert(!"ERROR: unsupported number of channels!"); + } + + return cv::GComputation(in, out); +} + +FluidResizeComputation::FluidResizeComputation(test::Mat inMat, test::Mat outMat, int interp) + : FluidComputation(new Priv{buildResizeComputation(inMat, outMat, interp) + ,{to_own(inMat)} + ,{to_own(outMat)} + }) +{} + +static cv::GComputation buildSplitComputation(int planes) +{ + std::vector ins(1); + std::vector outs(planes); + + switch (planes) { + case 2: outs = to_vec(InferenceEngine::gapi::Split2::on(ins[0])); break; + case 3: outs = to_vec(InferenceEngine::gapi::Split3::on(ins[0])); break; + case 4: outs = to_vec(InferenceEngine::gapi::Split4::on(ins[0])); break; + default: GAPI_Assert(false); + } + + return cv::GComputation(ins, outs); +} + +FluidSplitComputation::FluidSplitComputation(test::Mat inMat, 
std::vector outMats) + : FluidComputation(new Priv{buildSplitComputation(outMats.size()) + ,{to_own(inMat)} + ,to_own(outMats) + }) +{} + +static cv::GComputation buildMergeComputation(int planes) +{ + std::vector ins(planes); + std::vector outs(1); + + switch (planes) { + case 2: outs[0] = InferenceEngine::gapi::Merge2::on(ins[0], ins[1]); break; + case 3: outs[0] = InferenceEngine::gapi::Merge3::on(ins[0], ins[1], ins[2]); break; + case 4: outs[0] = InferenceEngine::gapi::Merge4::on(ins[0], ins[1], ins[2], ins[3]); break; + default: GAPI_Assert(false); + } + + return cv::GComputation(ins, outs); +} + +FluidMergeComputation::FluidMergeComputation(std::vector inMats, test::Mat outMat) + : FluidComputation(new Priv{buildMergeComputation(inMats.size()) + ,to_own(inMats) + ,{to_own(outMat)} + }) +{} + +} // namespace opencv_test diff --git a/inference-engine/tests/unit/opencv_test_gapi/fluid_test_computations/fluid_test_computations.hpp b/inference-engine/tests/unit/opencv_test_gapi/fluid_test_computations/fluid_test_computations.hpp new file mode 100644 index 0000000..52a8bf6 --- /dev/null +++ b/inference-engine/tests/unit/opencv_test_gapi/fluid_test_computations/fluid_test_computations.hpp @@ -0,0 +1,57 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef FLUID_TEST_COMPUTATIONS_HPP +#define FLUID_TEST_COMPUTATIONS_HPP + +#include + +#include +#include + +namespace opencv_test +{ +namespace test +{ +struct Mat +{ + int rows; + int cols; + int type; + void* data; +}; +} + +class __attribute__((visibility("default"))) FluidComputation +{ +protected: + struct Priv; + std::shared_ptr m_priv; +public: + FluidComputation(Priv* priv); + void warmUp(); + void apply(); +}; + +class __attribute__((visibility("default"))) FluidResizeComputation : public FluidComputation +{ +public: + FluidResizeComputation(test::Mat inMat, test::Mat outMat, int interp); +}; + +class __attribute__((visibility("default"))) FluidSplitComputation : public FluidComputation +{ +public: + FluidSplitComputation(test::Mat inMat, std::vector outMats); +}; + +class __attribute__((visibility("default"))) FluidMergeComputation : public FluidComputation +{ +public: + FluidMergeComputation(std::vector inMats, test::Mat outMat); +}; + +} // namespace opencv_test + +#endif // FLUID_TEST_COMPUTATIONS_HPP diff --git a/inference-engine/tests/unit/shape_infer/adult_test.cpp b/inference-engine/tests/unit/shape_infer/adult_test.cpp new file mode 100644 index 0000000..0dd1c49 --- /dev/null +++ b/inference-engine/tests/unit/shape_infer/adult_test.cpp @@ -0,0 +1,648 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "adult_test.hpp" +#include "debug.h" +#include + +using namespace InferenceEngine; +using namespace details; +using namespace ShapeInfer; +using namespace ShapeInferTests; + +void BasicTest::SetUp() { + auto params = GetParam(); + type = std::get<0>(params); + inOutData = std::get<1>(params); +} + +void BlobTest::SetUp() { + auto params = GetParam(); + type = std::get<0>(params); + inOutData = std::get<1>(params); + blobsParam = std::get<2>(params); +} + +void ParamsTest::SetUp() { + auto params = GetParam(); + type = std::get<0>(params); + inOutData = std::get<1>(params); + strParams = std::get<2>(params); +} + +ASITestBuilder CommonTests::assertThat() { + return ASITestBuilder().withType(type).withData(inOutData); +} + +std::vector StridedSliceTest::getPrecisions() { + size_t size = inOutData.inData.size(); + 
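// [editorial note] Convention encoded by getPrecisions() here: the first input of a
// StridedSlice test case is the FP32 data tensor, and every remaining input (presumably the
// begin/end/stride index tensors) is I32.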
std::vector result; + if (!size) THROW_IE_EXCEPTION << "unsupported number of precisions"; + result.emplace_back(Precision::FP32); + for (int i = 1; i < size; i++) { + result.emplace_back(Precision::I32); + } + return result; +} + +std::vector FillTest::refGen(const InOutData& inOutData) { + const size_t FILL_DIMS = 0; + const size_t FILL_VALUE = 1; + float value = inOutData.inData[FILL_VALUE][0]; + auto shape = inOutData.inData[FILL_DIMS]; + return std::vector(product(shape), value); +} + +std::vector RangeTest::refGen(const InOutData& inOutData) { + std::vector result; + float start = inOutData.inData[0][0]; + float limit = inOutData.inData[1][0]; + float delta = inOutData.inData[2][0]; + size_t work_amount_dst = std::floor(std::abs((limit - start) / delta)); + if (work_amount_dst != product(inOutData.inOutShapes.outDims[0])) + THROW_IE_EXCEPTION << "Range indexes exceeds data tensor dimension"; + + float dst_value = start; + for (size_t iwork = 0; iwork < work_amount_dst; ++iwork, dst_value += delta) { + result.push_back(dst_value); + } + return result; +} + +TEST_P(BlobTest, impl) { + assertThat().constInferResultFor().withBlobs(blobsParam).equals().toData(inOutData.outData); +} + +TEST_P(BasicTest, impl) { + assertThat().constInferResultFor().equals().toData(inOutData.outData); +} + +TEST_P(ParamsTest, impl) { + assertThat().constInferResultFor().withParams(strParams.data).equals().toData(inOutData.outData); +} + +TEST_P(StridedSliceTest, impl) { + assertThat().constInferResultFor().withParams(strParams.data) + .withInputPrecisions(getPrecisions()).equals().toData(inOutData.outData); +} + +TEST_P(StridedSliceTest, shapeInfer) { + assertThat().shapeInferResultFor().withParams(strParams.data) + .withInputPrecisions(getPrecisions()) + .equals().toShapes(inOutData.inOutShapes.outDims); +} + +TEST_P(BasicAdultTest, impl) { + assertThat().shapeInferResultFor().equals().toShapes(inOutData.inOutShapes.outDims); +} + +TEST_P(FillTest, impl) { + assertThat().constInferResultFor().withInputPrecisions({Precision::I32, Precision::FP32}) + .equals().toData({refGen(inOutData)}); +} + +TEST_P(FillTest, shapeInfer) { + assertThat().shapeInferResultFor().withInputPrecisions({Precision::I32, Precision::FP32}) + .equals().toShapes(inOutData.inOutShapes.outDims); +} + +TEST_P(RangeTest, impl) { + assertThat().constInferResultFor().equals().toData({refGen(inOutData)}); +} + +TEST_P(RangeTest, shapeInfer) { + assertThat().shapeInferResultFor().equals().toShapes(inOutData.inOutShapes.outDims); +} + +static std::vector singleInputData = {4.f, 8.f, 12.f, 16.f}; + +static testing::InOutShapes singleSmallShapes = {{{1, 3}}, + {{1, 3}}}; +static std::vector singleSmallData = {1.f, 2.f, 4.f}; + +static testing::InOutShapes singleSmall2Shapes = {{{1, 3}, {1, 3}}, + {{1, 3}}}; + +static testing::InOutShapes singleInOutShape = {{{4, 8, 12, 16}}, + {{4}}}; + +static std::vector fourInARow = {1.f, 2.f, 3.f, 4.f}; + +static SizeVector threeDeuces = {2, 2, 2}; + +INSTANTIATE_TEST_CASE_P( + CheckOutputDirectly, BlobTest, + ::testing::Values( + ::testing::make_tuple(LayerType("Const"), InOutDataParam({singleInOutShape, {}, {singleInputData}}), + BlobsParam(FloatMap{{"custom", singleInputData}})) + ) +); + +INSTANTIATE_TEST_CASE_P( + CheckOutputDirectly, ParamsTest, + ::testing::Values( + ::testing::make_tuple(LayerType("Power"), + InOutDataParam({singleSmallShapes, + {singleSmallData}, + {{-2 / 3.f, -2 / 7.f, -2 / 15.f}}}), + MapParams(MapStrStr(std::map{{"power", "-1"}, + {"scale", "-2"}, + {"shift", "0.5"}}))), + 
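// [editorial note] Sanity check of the Power test vectors above and below, assuming the
// usual IE Power layer semantics out = (shift + scale * x) ^ power, with x = {1, 2, 4}:
//   power=-1, scale=-2, shift=0.5 -> (-1.5)^-1, (-3.5)^-1, (-7.5)^-1 = -2/3, -2/7, -2/15
//   power=3,  scale=0.5, shift=-2 -> (-1.5)^3,  (-1)^3,    0^3      = -3.375, -1, 0
// Both match the expected outputs listed in the parameters.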
::testing::make_tuple(LayerType("Power"), + InOutDataParam({singleSmallShapes, + {singleSmallData}, + {{-3.375f, -1.f, 0.f,}}}), + MapParams(MapStrStr(std::map{{"power", "3"}, + {"scale", "0.5"}, + {"shift", "-2"}}))), + ::testing::make_tuple(LayerType("Power"), + InOutDataParam({singleSmallShapes, + {singleSmallData}, + {{10.f, 10.f, 10.f,}}}), + MapParams(MapStrStr(std::map{{"power", "1"}, + {"scale", "0"}, + {"shift", "10"}}))), + ::testing::make_tuple(LayerType("Tile"), + InOutDataParam({{{{2, 1, 2}}, + {threeDeuces}}, + {fourInARow}, + {{1.f, 2.f, 1.f, 2.f, 3.f, 4.f, 3.f, 4.f}}}), + MapParams(MapStrStr(std::map{{"axis", "1"}, + {"tiles", "2"}}))), + ::testing::make_tuple(LayerType("Tile"), + InOutDataParam({{{{2, 2, 1}}, + {threeDeuces}}, + {fourInARow}, + {{1.f, 1.f, 2.f, 2.f, 3.f, 3.f, 4.f, 4.f}}}), + MapParams(MapStrStr(std::map{{"axis", "2"}, + {"tiles", "2"}}))), + ::testing::make_tuple(LayerType("Tile"), + InOutDataParam({{{{1, 2, 2}}, + {threeDeuces}}, + {fourInARow}, + {{1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}}}), + MapParams(MapStrStr(std::map{{"axis", "0"}, + {"tiles", "2"}}))), + ::testing::make_tuple(LayerType("Reshape"), + InOutDataParam({{{{1, 2, 2}}, {{4}}}, + {fourInARow}, + {fourInARow}}), + MapParams(MapStrStr())), + ::testing::make_tuple(LayerType("Split"), + InOutDataParam({{{{2, 1, 2}}, {{2, 1, 1}, {2, 1, 1}}}, + {fourInARow}, + {{1.f, 3.f}, {2.f, 4.f}}}), + MapParams(MapStrStr(std::map{{"axis", "2"}}))), + ::testing::make_tuple(LayerType("Split"), + InOutDataParam({{{{2, 1, 2}}, {{1, 1, 2}, {1, 1, 2}}}, + {fourInARow}, + {{1.f, 2.f}, {3.f, 4.f}}}), + MapParams(MapStrStr(std::map{{"axis", "0"}}))), + ::testing::make_tuple(LayerType("Split"), + InOutDataParam({{{{4, 1, 1}}, {{2, 1, 1}, {1, 1, 1}, {1, 1, 1}}}, + {fourInARow}, + {{1.f, 2.f}, {3.f}, {4.f}}}), + MapParams(MapStrStr(std::map{{"axis", "0"}}))), + ::testing::make_tuple(LayerType("Concat"), + InOutDataParam({{{{2, 1, 1}, {2, 1, 1}}, {{2, 1, 2}}}, + {{1.f, 3.f}, {2.f, 4.f}}, + {fourInARow}}), + MapParams(MapStrStr(std::map{{"axis", "2"}}))), + ::testing::make_tuple(LayerType("Concat"), + InOutDataParam({{{{1, 1, 2}, {1, 1, 2}}, {{2, 1, 2}}}, + {{1.f, 2.f}, {3.f, 4.f}}, + {fourInARow}}), + MapParams(MapStrStr(std::map{{"axis", "0"}}))), + ::testing::make_tuple(LayerType("Concat"), + InOutDataParam({{{{2, 1, 1}, {1, 1, 1}, {1, 1, 1}}, {{4, 1, 1}}}, + {{1.f, 2.f}, {3.f}, {4.f}}, + {fourInARow}}), + MapParams(MapStrStr(std::map{{"axis", "0"}}))) + ) +); + +namespace { +// Test data vectors +std::vector in0 = {0.f, 1.f, 1.f, 0.f}; +std::vector in1 = {0.f, 1.f, 2.f, 1.f}; +std::vector dict = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f}; +std::vector dict2D = {1.f, 2.f, 3.f, 4.f}; // 2x2 +std::vector ref_in0_a0_d223 = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 7.f, 8.f, 9.f, + 10.f, 11.f, 12.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f}; // 2x2x2x3 +std::vector ref_in1_a2_d223 = {1.f, 2.f, 3.f, 2.f, 4.f, 5.f, 6.f, 5.f, 7.f, 8.f, 9.f, 8.f, 10.f, 11.f, 12.f, + 11.f}; // 2x2x2x2 +std::vector ref_in0_a0_d22 = {1.f, 2.f, 3.f, 4.f, 3.f, 4.f, 1.f, 2.f}; // 2x2x2 +} + +INSTANTIATE_TEST_CASE_P( + TestsGather, ParamsTest, + ::testing::Values( + ::testing::make_tuple(LayerType("Gather"), + InOutDataParam({{{{2, 2}, {1, 4}}, {{1, 4, 2}}}, + {dict2D, in0}, + {ref_in0_a0_d22}}), + MapParams(MapStrStr(std::map{{"axis", "0"}}))), + ::testing::make_tuple(LayerType("Gather"), + InOutDataParam({{{{2, 2, 3}, {2, 2}}, {{2, 2, 2, 3}}}, + {dict, in0}, + {ref_in0_a0_d223}}), + 
MapParams(MapStrStr(std::map{{"axis", "0"}}))), + ::testing::make_tuple(LayerType("Gather"), + InOutDataParam({{{{2, 2, 3}, {2, 2}}, {{2, 2, 2, 3}}}, + {dict, in0}, + {ref_in0_a0_d223}}), + MapParams(MapStrStr(std::map{{"axis", "-3"}}))), + ::testing::make_tuple(LayerType("Gather"), + InOutDataParam({{{{2, 2, 3}, {2, 2}}, {{2, 2, 2, 2}}}, + {dict, in1}, + {ref_in1_a2_d223}}), + MapParams(MapStrStr(std::map{{"axis", "2"}}))) + ) +); + +//static testing::InOutShapes eltWiseShapes1 = {{{4}, {1}}, +// {{4}}}; +//static std::vector> eltWiseInputs1 = {singleInputData, +// {4.f}}; +// +//static testing::InOutShapes eltWiseShapes2 = {{{2, 3}, {3}}, +// {{2, 3}}}; +//static std::vector> eltWiseInputs2 = {{4.f, 8.f, 12.f, 4.f, 8.f, 8.f}, +// {4.f, 8.f, 4.f}}; +INSTANTIATE_TEST_CASE_P( + CheckOutputDirectly, BasicTest, + ::testing::Values( + ::testing::make_tuple( + LayerType("Shape"), + InOutDataParam({singleInOutShape, {}, {singleInputData}})), +// ::testing::make_tuple( +// LayerType("Mul"), +// InOutDataParam({eltWiseShapes1, eltWiseInputs1, {{16.f, 32.f, 48.f, 64.f}}})), +// ::testing::make_tuple( +// LayerType("Add"), +// InOutDataParam({eltWiseShapes1, eltWiseInputs1, {{8.f, 12.f, 16.f, 20.f}}})), +// ::testing::make_tuple( +// LayerType("Div"), +// InOutDataParam({eltWiseShapes1, eltWiseInputs1, {{1.f, 2.f, 3.f, 4.f}}})), +// ::testing::make_tuple( +// LayerType("Mul"), +// InOutDataParam({eltWiseShapes2, eltWiseInputs2, {{16.f, 64.f, 48.f, 16.f, 64.f, 32.f}}})), +// ::testing::make_tuple( +// LayerType("Add"), +// InOutDataParam({eltWiseShapes2, eltWiseInputs2, {{8.f, 16.f, 16.f, 8.f, 16.f, 12.f}}})), +// ::testing::make_tuple( +// LayerType("Div"), +// InOutDataParam({eltWiseShapes2, eltWiseInputs2, {{1.f, 1.f, 3.f, 1.f, 1.f, 2.f}}})), + ::testing::make_tuple(LayerType("Mul"), + InOutDataParam({singleSmall2Shapes, {singleSmallData, singleSmallData}, + {{1.f, 4.f, 16.f}}})), + ::testing::make_tuple(LayerType("Add"), + InOutDataParam({singleSmall2Shapes, {singleSmallData, singleSmallData}, + {{2.f, 4.f, 8.f}}})), + ::testing::make_tuple(LayerType("Div"), + InOutDataParam({singleSmall2Shapes, {singleSmallData, singleSmallData}, + {{1.f, 1.f, 1.f}}})) + ) +); + +INSTANTIATE_TEST_CASE_P( + SecondInput, BasicAdultTest, + ::testing::Combine(::testing::Values(LayerType("Reshape"), LayerType("Interp"), LayerType("Resample")), + ::testing::Values(InOutDataParam({{{{2, 3}, {2}}, + {{1, 6}}}, + {{}, {1.f, 6.f}}, + {}}))) +); + +INSTANTIATE_TEST_CASE_P( + DimSemantic, BasicAdultTest, + ::testing::Values( + ::testing::make_tuple(LayerType("Reshape"), + InOutDataParam({{{{2, 3}, {2}}, + {{1, 6}}}, + {{}, {1.f, -1.f}}, + {}})) + ) +); + +INSTANTIATE_TEST_CASE_P( + SqueezeUnsqueeze, BasicAdultTest, + ::testing::Values( + ::testing::make_tuple(LayerType("Unsqueeze"), + InOutDataParam({{{{3}, {1}}, + {{1, 3}}}, + {{}, {0.f}}, + {}})), + ::testing::make_tuple(LayerType("Unsqueeze"), + InOutDataParam({{{{3}, {3}}, + {{1, 1, 1, 3}}}, + {{}, {0.f, 1.f, 2.f}}, + {}})), + ::testing::make_tuple(LayerType("Unsqueeze"), + InOutDataParam({{{{3}, {3}}, + {{1, 3, 1, 1}}}, + {{}, {0.f, 2.f, 3.f}}, + {}})), + ::testing::make_tuple(LayerType("Unsqueeze"), + InOutDataParam({{{{2, 3}, {2}}, + {{1, 2, 3, 1}}}, + {{}, {0.f, 3.f}}, + {}})), + ::testing::make_tuple(LayerType("Unsqueeze"), + InOutDataParam({{{{2, 3}, {1}}, + {{2, 1, 3}}}, + {{}, {1.f}}, + {}})), + ::testing::make_tuple(LayerType("Unsqueeze"), + InOutDataParam({{{{3}, {1}}, + {{1, 3}}}, + {{}, {0.f}}, + {}})), + ::testing::make_tuple(LayerType("Unsqueeze"), + 
InOutDataParam({{{{3}, {3}}, + {{1, 1, 1, 3}}}, + {{}, {0.f, 1.f, 2.f}}, + {}})), + ::testing::make_tuple(LayerType("Unsqueeze"), + InOutDataParam({{{{3}, {3}}, + {{1, 3, 1, 1}}}, + {{}, {0.f, 2.f, 3.f}}, + {}})), + ::testing::make_tuple(LayerType("Unsqueeze"), + InOutDataParam({{{{2, 3}, {2}}, + {{1, 2, 3, 1}}}, + {{}, {0.f, 3.f}}, + {}})), + ::testing::make_tuple(LayerType("Unsqueeze"), + InOutDataParam({{{{2, 3}, {1}}, + {{2, 1, 3}}}, + {{}, {1.f,}}, + {}})), + ::testing::make_tuple(LayerType("Squeeze"), + InOutDataParam({{{{1}, {1}}, + {{}}}, + {{}, {0.f}}, + {}})), + ::testing::make_tuple(LayerType("Squeeze"), + InOutDataParam({{{{1, 3, 1}, {1}}, + {{3, 1}}}, + {{}, {0.f}}, + {}})), + ::testing::make_tuple(LayerType("Squeeze"), + InOutDataParam({{{{1, 3, 1}, {1}}, + {{1, 3}}}, + {{}, {2.f}}, + {}})), + ::testing::make_tuple(LayerType("Squeeze"), + InOutDataParam({{{{1, 3, 1}, {2}}, + {{3}}}, + {{}, {0.f, 2.f}}, + {}})), + ::testing::make_tuple(LayerType("Squeeze"), + InOutDataParam({{{{1, 3, 1}, {1}}, + {{1, 3}}}, + {{}, {-1.f}}, + {}})), + ::testing::make_tuple(LayerType("Squeeze"), + InOutDataParam({{{{1, 3, 1, 2}, {2}}, + {{3, 2}}}, + {{}, {0.f, 2.f}}, + {}})), + ::testing::make_tuple(LayerType("Squeeze"), + InOutDataParam({{{{1}, {1}}, + {{}}}, + {{}, {0.f}}, + {}})), + ::testing::make_tuple(LayerType("Squeeze"), + InOutDataParam({{{{1, 3, 1}, {1}}, + {{1, 3}}}, + {{}, {2.f}}, + {}})), + ::testing::make_tuple(LayerType("Squeeze"), + InOutDataParam({{{{1, 3, 1}, {2}}, + {{3}}}, + {{}, {0.f, 2.f}}, + {}})), + ::testing::make_tuple(LayerType("Squeeze"), + InOutDataParam({{{{1, 3, 1}, {1}}, + {{1, 3}}}, + {{}, {-1.f}}, + {}})), + ::testing::make_tuple(LayerType("Squeeze"), + InOutDataParam({{{{1, 3, 1, 2}, {2}}, + {{3, 2}}}, + {{}, {0.f, 2.f}}, + {}})) + ) +); +namespace { +// Test data vectors +std::vector<float> test0 = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f}; +std::vector<float> test2 = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f}; +std::vector<float> test5 = {5.f, 6.f, 7.f, 8.f}; +std::vector<float> test6 = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f}; +std::vector<float> test8 = {5.f, 4.f, 3.f, 2.f, 1.f}; +std::vector<float> test9 = {5.f, 4.f, 3.f, 2.f, 1.f, 0.f}; +std::vector<float> test10 = {5.f, 4.f, 3.f}; +std::vector<float> test11 = {0.f, 2.f, 4.f, 6.f, 8.f}; +std::vector<float> test12 = {1.f, 3.f, 5.f, 7.f, 9.f}; +std::vector<float> test13 = {9.f, 8.f, 7.f, 6.f, 5.f, 4.f, 3.f, 2.f, 1.f, 0.f}; +std::vector<float> test14 = {9.f, 7.f, 5.f, 3.f, 1.f}; +std::vector<float> test16 = {0.f, 1.f, 3.f, 4.f}; +std::vector<float> test17 = {1.f, 4.f}; +std::vector<float> test19 = {0.f, 1.f, 2.f, 3.f}; +std::vector<float> test20 = {4.f, 5.f, 6.f, 7.f}; +/* +0. [0,1,2,3,4,5,6,7,8,9], shape=[10] +1. [0,1,2,3,4,5,6,7,8,9], shape=[10] +2. [0,1,2,3,4,5,6,7,8], shape=[9] +3. [0,1,2,3,4,5,6,7,8], shape=[9] +4. [0,1,2,3,4,5,6,7,8,9], shape=[10] +5. [5,6,7,8,9], shape=[5] +6. [0,1,2,3,4,5], shape=[6] +7. [5,6,7,8,9], shape=[5] +8. [5,4,3,2,1], shape=[5] +9. [5,4,3,2,1,0], shape=[6] +10. [5,4,3], shape=[3] +11. [0,2,4,6,8], shape=[5] +12. [1,3,5,7,9], shape=[5] +13. [9,8,7,6,5,4,3,2,1,0], shape=[10] +14. [9,7,5,3,1], shape=[5] +15. [[0,1,2,3,4,5,6,7,8,9]], shape=[1,10] +16. [[[0,1,2],[3,4,5]]], shape=[1,2,2] +17. [[[0,1,2],[3,4,5]]], shape=[1,2,1] +18. [[[0,1,2],[3,4,5]]], shape=[1,1,2,1] +19. [[[[0,1],[2,3]],[[4,5],[6,7]]]], shape=[1,2,2] +20. [[[[0,1],[2,3]],[[4,5],[6,7]]]], shape=[1,2,2] +21. 
[[[0,1,2],[3,4,5]]], shape=[1,1,2] +*/ +} + +INSTANTIATE_TEST_CASE_P( + StridedSlice, StridedSliceTest, + ::testing::Values( + /* 0 */ + ::testing::make_tuple(LayerType("StridedSlice"), InOutDataParam({{{{10}, {}, {}, {}}, {{10}}}, + {{test0}, {}, {}, {}}, + {test0}}), + MapParams(MapStrStr())), + ::testing::make_tuple(LayerType("StridedSlice"), InOutDataParam({{{{10}, {1}, {1}, {}}, {{10}}}, + {{test0}, {0.f}, {0.f}, {}}, + {test0}}), + MapParams(MapStrStr(std::map<std::string, std::string>{{"end_mask", "0"}}))), + ::testing::make_tuple(LayerType("StridedSlice"), InOutDataParam({{{{10}, {1}, {1}, {}}, {{9}}}, + {{test0}, {-1.f}, {-1.f}, {}}, + {test2}}), + MapParams(MapStrStr(std::map<std::string, std::string>{{"begin_mask", "0"}}))), + ::testing::make_tuple(LayerType("StridedSlice"), InOutDataParam({{{{10}, {1}, {1}, {}}, {{9}}}, + {{test0}, {0.f}, {-1.f}, {}}, + {test2}}), + MapParams(MapStrStr())), + ::testing::make_tuple(LayerType("StridedSlice"), InOutDataParam({{{{10}, {1}, {1}, {}}, {{10}}}, + {{test0}, {0.f}, {10.f}, {}}, + {test0}}), + MapParams(MapStrStr())), +/* 5 */ + ::testing::make_tuple(LayerType("StridedSlice"), InOutDataParam({{{{10}, {1}, {1}, {}}, {{5}}}, + {{test0}, {5.f}, {10.f}, {}}, + {test5}}), + MapParams(MapStrStr())), + ::testing::make_tuple(LayerType("StridedSlice"), InOutDataParam({{{{10}, {1}, {1}, {}}, {{6}}}, + {{test0}, {0.f}, {6.f}, {}}, + {test6}}), + MapParams(MapStrStr())), + ::testing::make_tuple(LayerType("StridedSlice"), InOutDataParam({{{{10}, {1}, {1}, {}}, {{5}}}, + {{test0}, {-5.f}, {10.f}, {}}, + {test5}}), + MapParams(MapStrStr())), + ::testing::make_tuple(LayerType("StridedSlice"), InOutDataParam({{{{10}, {1}, {1}, {1}}, {{5}}}, + {{test0}, {-5.f}, {0.f}, {-1.f}}, + {test8}}), + MapParams(MapStrStr())), + ::testing::make_tuple(LayerType("StridedSlice"), InOutDataParam({{{{10}, {1}, {1}, {1}}, {{6}}}, + {{test0}, {-5.f}, {0.f}, {-1.f}}, + {test9}}), + MapParams(MapStrStr(std::map<std::string, std::string>{{"end_mask", "0"}})) + ), +/* 10 */ + ::testing::make_tuple(LayerType("StridedSlice"), InOutDataParam({{{{10}, {1}, {1}, {1}}, {{3}}}, + {{test0}, {-5.f}, {2.f}, {-1.f}}, + {test10}}), + MapParams(MapStrStr())), + ::testing::make_tuple(LayerType("StridedSlice"), InOutDataParam({{{{10}, {1}, {1}, {1}}, {{5}}}, + {{test0}, {0.f}, {0.f}, {2.f}}, + {test11}}), + MapParams(MapStrStr(std::map<std::string, std::string>{{"end_mask", "0"}}))), + ::testing::make_tuple(LayerType("StridedSlice"), InOutDataParam({{{{10}, {1}, {1}, {1}}, {{5}}}, + {{test0}, {1.f}, {0.f}, {2.f}}, + {test12}}), + MapParams(MapStrStr(std::map<std::string, std::string>{{"end_mask", "0"}}))), + ::testing::make_tuple(LayerType("StridedSlice"), InOutDataParam({{{{10}, {1}, {1}, {1}}, {{10}}}, + {{test0}, {-1.f}, {0.f}, {-1.f}}, + {test13}}), + MapParams(MapStrStr( + std::map<std::string, std::string>{{"end_mask", "0"}}))), + ::testing::make_tuple(LayerType("StridedSlice"), InOutDataParam({{{{10}, {1}, {1}, {1}}, {{5}}}, + {{test0}, {-1.f}, {0.f}, {-2.f}}, + {test14}}), + MapParams(MapStrStr(std::map<std::string, std::string>{{"end_mask", "0"}}))), +/* 15 */ + ::testing::make_tuple(LayerType("StridedSlice"), InOutDataParam({{{{10}, {1}, {1}, {}}, {{1, 10}}}, + {{test0}, {0.f}, {10.f}, {}}, + {test0}}), + MapParams(MapStrStr(std::map<std::string, std::string>{{"new_axis_mask", "1"}}))), + ::testing::make_tuple(LayerType("StridedSlice"), + InOutDataParam({{{{1, 2, 3}, {2}, {2}, {}}, {{1, 2, 2}}}, + {{test0}, {0.f, 0.f}, {1.f, 2.f}, {}}, + {test16}}), + MapParams( + MapStrStr(std::map<std::string, std::string>{{"ellipsis_mask", "0,1"}}))), + ::testing::make_tuple(LayerType("StridedSlice"), + InOutDataParam({{{{1, 2, 3}, {4}, {4}, {}}, {{1, 2, 1}}}, + {{test0}, {{0.f, 0.f, 0.f, 1.f}}, {2.f, 3.f, 2.f, 2.f}, {}}, + 
{test17}}), + MapParams( + MapStrStr(std::map<std::string, std::string>{{"new_axis_mask", "0,0,1,0"}, + {"shrink_axis_mask", "0,0,0,1"}}))), + ::testing::make_tuple(LayerType("StridedSlice"), + InOutDataParam({{{{1, 2, 3}, {3}, {3}, {}}, {{1, 1, 2, 1}}}, + {{test0}, {0.f, 0.f, 1.f}, {2.f, 2.f, 2.f}, {}}, + {test17}}), + MapParams(MapStrStr( + std::map<std::string, std::string>{{"ellipsis_mask", "0,1"}, + {"new_axis_mask", "1"}}))), + ::testing::make_tuple(LayerType("StridedSlice"), + InOutDataParam({{{{1, 2, 2, 2}, {1}, {1}, {1}}, {{1, 2, 2}}}, + {{test0}, {-1.f}, {0.f}, {-2.f}}, + {test19}}), + MapParams(MapStrStr(std::map<std::string, std::string>{{"begin_mask", "0,1,0,0"}, + {"end_mask", "0,1,0,0"}, + {"shrink_axis_mask", "0,1"}}))), +/* 20 */ + ::testing::make_tuple(LayerType("StridedSlice"), + InOutDataParam({{{{1, 2, 2, 2}, {4}, {4}, {}}, {{1, 2, 2}}}, + {{test0}, {0.f, 1.f, 0.f, 0.f}, {1.f, 2.f, 2.f, 2.f}, {}}, + {test20}}), + MapParams(MapStrStr(std::map<std::string, std::string>{{"begin_mask", "0,1,0,0"}, + {"end_mask", "0,1,0,0"}, + {"shrink_axis_mask", "0,1,0,0"}}))), + ::testing::make_tuple(LayerType("StridedSlice"), + InOutDataParam({{{{1, 2, 3}, {3}, {3}, {}}, {{1, 1, 2}}}, + {{test0}, {0.f, 0.f, 1.f}, {2.f, 2.f, 2.f}, {}}, + {test17}}), + MapParams(MapStrStr(std::map<std::string, std::string>{{"ellipsis_mask", "0,1"}, + {"new_axis_mask", "1"}, + {"shrink_axis_mask", "0,0,1"}}))) + ) +); + +INSTANTIATE_TEST_CASE_P( + Fill, FillTest, + ::testing::Values( + ::testing::make_tuple(LayerType("Fill"), InOutDataParam({{{{1}, {1}}, + {{1}}}, + {{1.f}, {1.f}}, + {}})), + ::testing::make_tuple(LayerType("Fill"), InOutDataParam({{{{3}, {1}}, + {{1, 3, 1}}}, + {{1.f, 3.f, 1.f}, {1.f}}, + {}})), + ::testing::make_tuple(LayerType("Fill"), InOutDataParam({{{{3}, {1}}, + {{2, 3, 6}}}, + {{2.f, 3.f, 6.f}, {-1.f}}, + {}})), + ::testing::make_tuple(LayerType("Fill"), InOutDataParam({{{{4}, {1}}, + {{1, 3, 1, 2}}}, + {{1.f, 3.f, 1.f, 2.f}, {.5f}}, + {}})), + ::testing::make_tuple(LayerType("Fill"), InOutDataParam({{{{6}, {1}}, + {{4, 3, 2, 5, 4, 2}}}, + {{4.f, 3.f, 2.f, 5.f, 4.f, 2.f}, {.25f}}, + {}})) + ) +); + +INSTANTIATE_TEST_CASE_P( + Range, RangeTest, + ::testing::Values( + ::testing::make_tuple(LayerType("Range"), InOutDataParam({{{{1}, {1}, {1}}, + {{5}}}, + {{3.f}, {18.f}, {3.f}}, + {{}}})), + ::testing::make_tuple(LayerType("Range"), InOutDataParam({{{{1}, {1}, {1}}, + {{2}}}, + {{3.f}, {1.f}, {-1.f}}, + {{}}})), + ::testing::make_tuple(LayerType("Range"), InOutDataParam({{{{1}, {1}, {1}}, + {{6}}}, + {{3.f}, {-3.f}, {-1.f}}, + {{}}})), + ::testing::make_tuple(LayerType("Range"), InOutDataParam({{{{1}, {1}, {1}}, + {{5}}}, + {{0.f}, {5.f}, {1.f}}, + {{}}})) + ) +); diff --git a/inference-engine/tests/unit/shape_infer/adult_test.hpp b/inference-engine/tests/unit/shape_infer/adult_test.hpp new file mode 100644 index 0000000..44478ad --- /dev/null +++ b/inference-engine/tests/unit/shape_infer/adult_test.hpp @@ -0,0 +1,74 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include + +#include +#include +#include "built_in_shape_infer_general_test.hpp" +#include "adult_test_utils.hpp" + +namespace IE = InferenceEngine; + +namespace ShapeInferTests { + +class CommonTests : public ::testing::Test { +protected: + ASITestBuilder assertThat(); + +protected: + std::string type; + InOutData inOutData; +}; + +class BasicTest + : public CommonTests, + public testing::WithParamInterface<std::tuple<LayerType, InOutDataParam>> { +protected: + void SetUp() override; +}; + +class BlobTest + : public CommonTests, + public testing::WithParamInterface<std::tuple<LayerType, InOutDataParam, BlobsParam>> { +protected: + void SetUp() override; + 
+protected: + FloatMap blobsParam; +}; + +class ParamsTest + : public CommonTests, + public testing::WithParamInterface<std::tuple<LayerType, InOutDataParam, MapParams>> { +protected: + void SetUp() override; + +protected: + MapStrStr strParams; +}; + +class BasicAdultTest : public BasicTest { +}; + +class StridedSliceTest : public ParamsTest { +public: + std::vector<IE::Precision> getPrecisions(); +}; + +class FillTest : public BasicTest { +protected: + std::vector<float> refGen(const InOutData& inOutData); +}; + +class RangeTest : public BasicTest { +protected: + std::vector<float> refGen(const InOutData& inOutData); +}; + +} // namespace ShapeInferTests diff --git a/inference-engine/tests/unit/shape_infer/adult_test_utils.cpp b/inference-engine/tests/unit/shape_infer/adult_test_utils.cpp new file mode 100644 index 0000000..2088727 --- /dev/null +++ b/inference-engine/tests/unit/shape_infer/adult_test_utils.cpp @@ -0,0 +1,124 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "adult_test.hpp" +#include "adult_test_utils.hpp" + + +using namespace InferenceEngine; +using namespace details; +using namespace ShapeInfer; + +void BaseMatcher::compareWithRef(const std::vector<Blob::Ptr>& outBlobs, + const std::vector<std::vector<float>>& refData, + float tolerance) { + for (int outIdx = 0; outIdx < outBlobs.size(); outIdx++) { + auto* data = outBlobs[outIdx]->buffer().as<float*>(); + for (int elemIdx = 0; elemIdx < refData[outIdx].size(); elemIdx++) { + ASSERT_NEAR(data[elemIdx], refData[outIdx][elemIdx], tolerance); + } + } +} + +std::vector<Blob::Ptr> +BaseMatcher::createBlobs(const std::vector<SizeVector>& shapes, const std::vector<Precision>& precisions) { + if (shapes.size() != precisions.size()) + THROW_IE_EXCEPTION << "Vectors of shapes and precisions can't have different sizes"; + std::vector<Blob::Ptr> blobs; + int i = 0; + for (const auto& dims : shapes) { + // it's assumed that empty dims = empty data = no blob + if (!dims.empty()) { + TensorDesc inDesc(precisions[i++], dims, TensorDesc::getLayoutByDims(dims)); + auto blob = make_blob_with_precision(inDesc); + blob->allocate(); + blobs.push_back(blob); + } + } + return blobs; +} + +void BaseMatcher::fillBlobs(const std::vector<Blob::Ptr>& blobs, const std::vector<std::vector<float>>& data) { + if (!data.empty()) { + for (int blobIdx = 0; blobIdx < blobs.size(); blobIdx++) { + auto blob = blobs[blobIdx]; + // it's assumed that empty dims = empty data = no blob + if (!data[blobIdx].empty()) { + switch (blob->precision()) { + case Precision::FP32: { + auto* buffer = blob->buffer().as<float*>(); + for (int dataIdx = 0; dataIdx < blob->size(); dataIdx++) { + buffer[dataIdx] = data[blobIdx][dataIdx]; + } + } + break; + case Precision::I32: { + auto* buffer = blob->buffer().as<int32_t*>(); + for (int dataIdx = 0; dataIdx < blob->size(); dataIdx++) { + buffer[dataIdx] = static_cast<int32_t>(data[blobIdx][dataIdx]); + } + } + break; + default: + THROW_IE_EXCEPTION << "Unsupported precision " << blob->precision() << " to fill blobs"; + } + } + } + } +} + +void ConstInferMatcher::toData(const std::vector<std::vector<float>>& refData) { + auto impl = holder->getConstInferImpl(config.type); + ASSERT_NE(nullptr, impl); + auto outBlobs = createBlobs(config.inOutData.inOutShapes.outDims, config.outPrecisions); + auto inBlobs = createBlobs(config.inOutData.inOutShapes.inDims, config.inPrecisions); + fillBlobs(inBlobs, config.inOutData.inData); + auto blobs = config.initBlobs(config.floatBlobData); + std::vector<Blob::CPtr> inCBlobs; + std::copy(inBlobs.begin(), inBlobs.end(), back_inserter(inCBlobs)); + ASSERT_NO_THROW(impl->infer(inCBlobs, config.strParams, blobs, outBlobs)); + compareWithRef(outBlobs, refData); +} + 
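+// Editorial note (illustrative sketch only; not part of the original patch): the two +// matchers in this file are reached through the ASITestBuilder fluent chain declared in +// adult_test_utils.hpp. For instance, TEST_P(FillTest, impl) in adult_test.cpp resolves to: +// +//   ASITestBuilder().withType("Fill").withData(inOutData)   // assertThat() +//       .constInferResultFor()                              // -> MatcherConfigurator<ConstInferMatcher> +//       .withInputPrecisions({IE::Precision::I32, IE::Precision::FP32}) +//       .equals()                                           // -> ConstInferMatcher +//       .toData({refGen(inOutData)});                       // runs const inference, compares blob buffers +// + 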
+void ShapeInferMatcher::toShapes(const std::vector<IE::SizeVector>& refShape) { + siHolder.reset(new IE::ShapeInfer::BuiltInShapeInferHolder()); + IE::IShapeInferImpl::Ptr impl; + std::vector<IE::SizeVector> outShapes; + sts = siHolder->getShapeInferImpl(impl, config.type.c_str(), &desc); + ASSERT_NE(nullptr, impl); + auto inBlobs = createBlobs(config.inOutData.inOutShapes.inDims, config.inPrecisions); + fillBlobs(inBlobs, config.inOutData.inData); + std::vector<IE::Blob::CPtr> inCBlobs; + std::copy(inBlobs.begin(), inBlobs.end(), back_inserter(inCBlobs)); + auto blobs = config.initBlobs(config.floatBlobData); + sts = impl->inferShapes(inCBlobs, config.strParams, blobs, outShapes, &desc); + ASSERT_EQ(sts, IE::OK) << desc.msg; + ASSERT_EQ(config.inOutData.inOutShapes.outDims, outShapes); +} + +InitBlobsFunc ASITestBuilder::defaultBlobInit() { + return [](const FloatMap& blobDataMap) -> BlobMap { + BlobMap blobs; + for (const auto& it : blobDataMap) { + std::string blobName; + std::vector<float> data; + std::tie(blobName, data) = it; + SizeVector blobDims = {data.size()}; + auto blob = make_shared_blob<float>(Precision::FP32, TensorDesc::getLayoutByDims(blobDims), blobDims, + data); + blobs[blobName] = blob; + } + return blobs; + }; +} + +MatcherConfigurator<ConstInferMatcher> ASITestBuilder::constInferResultFor() { + return MatcherConfigurator<ConstInferMatcher>(config); +} + +MatcherConfigurator<ShapeInferMatcher> ASITestBuilder::shapeInferResultFor() { + return MatcherConfigurator<ShapeInferMatcher>(config); +} diff --git a/inference-engine/tests/unit/shape_infer/adult_test_utils.hpp b/inference-engine/tests/unit/shape_infer/adult_test_utils.hpp new file mode 100644 index 0000000..451799c --- /dev/null +++ b/inference-engine/tests/unit/shape_infer/adult_test_utils.hpp @@ -0,0 +1,137 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include + +#include +#include +#include "built_in_shape_infer_general_test.hpp" + +namespace IE = InferenceEngine; + +struct InOutData { + testing::InOutShapes inOutShapes; + std::vector<std::vector<float>> inData; + std::vector<std::vector<float>> outData; +}; + +using FloatMap = std::map<std::string, std::vector<float>>; +using InitBlobsFunc = std::function<IE::BlobMap(const FloatMap& floatBlobData)>; + +struct ASIConfig { + InOutData inOutData; + std::string type; + FloatMap floatBlobData; + std::map<std::string, std::string> strParams; + InitBlobsFunc initBlobs; + std::vector<IE::Precision> inPrecisions; + std::vector<IE::Precision> outPrecisions; +}; + +class BaseMatcher { +public: + explicit BaseMatcher(ASIConfig config) : config(std::move(config)) {} + +protected: + void compareWithRef(const std::vector<IE::Blob::Ptr>& outBlobs, + const std::vector<std::vector<float>>& refData, + float tolerance = 0.0001); + + std::vector<IE::Blob::Ptr> + createBlobs(const std::vector<IE::SizeVector>& shapes, const std::vector<IE::Precision>& precisions); + + void fillBlobs(const std::vector<IE::Blob::Ptr>& blobs, const std::vector<std::vector<float>>& data); + + ASIConfig config; +}; + +class ConstInferMatcher : public BaseMatcher { +public: + explicit ConstInferMatcher(const ASIConfig& config) : BaseMatcher(config) {} + + void toData(const std::vector<std::vector<float>>& refData); + +private: + std::shared_ptr<IE::ShapeInfer::ConstInferHolder> holder; +}; + +class ShapeInferMatcher : public BaseMatcher { +public: + explicit ShapeInferMatcher(const ASIConfig& config) : BaseMatcher(config) {} + + void toShapes(const std::vector<IE::SizeVector>& refShape); + +private: + std::unique_ptr<IE::ShapeInfer::BuiltInShapeInferHolder> siHolder; + IE::StatusCode sts; + IE::ResponseDesc desc; +}; + +template<class M> +class MatcherConfigurator { +public: + explicit MatcherConfigurator(ASIConfig config) : config(std::move(config)) {} + + MatcherConfigurator& withParams(const std::map<std::string, std::string>& params) { + config.strParams = params; + return *this; + } + + MatcherConfigurator& withInputPrecisions(const std::vector<IE::Precision>& inputPrecisions) { + 
config.inPrecisions = inputPrecisions; + return *this; + } + + MatcherConfigurator& withOutputPrecisions(const std::vector<IE::Precision>& outputPrecisions) { + config.outPrecisions = outputPrecisions; + return *this; + } + + MatcherConfigurator& withBlobs(const FloatMap& blobDataMap) { + config.floatBlobData = blobDataMap; + return *this; + } + + M equals() { + return M(config); + } + +private: + ASIConfig config; +}; + +class ASITestBuilder { + ASIConfig config; +public: + ASITestBuilder() { + config.initBlobs = defaultBlobInit(); + } + + ASITestBuilder& withData(const InOutData& data) { + config.inOutData = data; + config.inPrecisions = {data.inOutShapes.inDims.size(), IE::Precision::FP32}; + config.outPrecisions = {data.inOutShapes.outDims.size(), IE::Precision::FP32}; + return *this; + } + + ASITestBuilder& withType(const std::string& type) { + config.type = type; + return *this; + } + + MatcherConfigurator<ConstInferMatcher> constInferResultFor(); + + MatcherConfigurator<ShapeInferMatcher> shapeInferResultFor(); + +private: + InitBlobsFunc defaultBlobInit(); +}; + +PRETTY_PARAM(BlobsParam, FloatMap) + +PRETTY_PARAM(InOutDataParam, InOutData) diff --git a/inference-engine/tests/unit/shape_infer/built_in_holder_test.cpp b/inference-engine/tests/unit/shape_infer/built_in_holder_test.cpp index b8661bd..35e16eb 100644 --- a/inference-engine/tests/unit/shape_infer/built_in_holder_test.cpp +++ b/inference-engine/tests/unit/shape_infer/built_in_holder_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/shape_infer/built_in_shape_infer_batch_test.cpp b/inference-engine/tests/unit/shape_infer/built_in_shape_infer_batch_test.cpp index 9f57e35..ebf728a 100644 --- a/inference-engine/tests/unit/shape_infer/built_in_shape_infer_batch_test.cpp +++ b/inference-engine/tests/unit/shape_infer/built_in_shape_infer_batch_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/shape_infer/built_in_shape_infer_conv_test.cpp b/inference-engine/tests/unit/shape_infer/built_in_shape_infer_conv_test.cpp index 07aaf7f..fefdaeb 100644 --- a/inference-engine/tests/unit/shape_infer/built_in_shape_infer_conv_test.cpp +++ b/inference-engine/tests/unit/shape_infer/built_in_shape_infer_conv_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -15,7 +15,7 @@ using namespace InferenceEngine; using namespace ShapeInfer; class BuiltInShapeInferConvImplTest - : public BuiltInShapeInferTestWithParam<std::tuple<InOutShapes, kernel, stride, pad, auto_pad, out_channels, group, dilation_factor, NewInOutShapes, CanInfer, padrb, IsTransposed>> { + : public BuiltInShapeInferTestWithParam<std::tuple<InOutShapes, kernel, stride, pad, auto_pad, out_channels, group, dilation_factor, NewInOutShapes, CanInfer, pad_end, IsTransposed>> { protected: void SetUp() override { BuiltInShapeInferCommon::SetUp(); @@ -30,7 +30,7 @@ protected: dilation_factor = std::get<7>(params); newInOutShapes = std::get<8>(params); canInfer = std::get<9>(params); - padrb = std::get<10>(params); + pad_end = std::get<10>(params); isTransposed = std::get<11>(params); if (isTransposed) { type = "Deconvolution"; @@ -40,25 +40,6 @@ std::map<std::string, std::string> getMapParams() { std::map<std::string, std::string> params = { - {"kernel-x", std::to_string(kernel.x)}, - {"kernel-y", std::to_string(kernel.y)}, - {"stride-x", std::to_string(stride.x)}, - {"stride-y", std::to_string(stride.y)}, - {"pad-x", std::to_string(pad.x)}, - {"pad-y", std::to_string(pad.y)}, - {"output", std::to_string(out_channels)}, - {"group", std::to_string(group)}, - {"dilation-x", 
std::to_string(dilation_factor.x)}, - {"dilation-y", std::to_string(dilation_factor.y)} - }; - if (!auto_pad.empty()) params["auto_pad"] = auto_pad; - if (padrb.x) params["pad-r"] = std::to_string(padrb.x); - if (padrb.y) params["pad-b"] = std::to_string(padrb.y); - return params; - } - - std::map<std::string, std::string> getMapParams_IRv3() { - std::map<std::string, std::string> params = { {"kernel", kernel.toSeparetedRow(",")}, {"strides", stride.toSeparetedRow(",")}, {"pads_begin", pad.toSeparetedRow(",")}, @@ -67,21 +48,19 @@ {"dilations", dilation_factor.toSeparetedRow(",")} }; if (!auto_pad.empty()) params["auto_pad"] = auto_pad; - if (padrb.x != 0 && padrb.y != 0) { - params["pads_end"] = padrb.toSeparetedRow(","); - } + if (!pad_end.empty()) params["pads_end"] = pad_end.toSeparetedRow(","); return params; } protected: std::string type = "Convolution"; std::string dataName = "convolution_data"; - testing::InOutData inOutShapes; - testing::InOutData newInOutShapes; + testing::InOutShapes inOutShapes; + testing::InOutShapes newInOutShapes; param_size kernel{}; param_size stride{}; param_size pad{}; - param_size padrb{}; + param_size pad_end{}; param_size dilation_factor{}; std::string auto_pad; unsigned out_channels{}; @@ -92,20 +71,22 @@ TEST_P(BuiltInShapeInferConvImplTest, impl) { - InferenceEngine::details::BaseCreator::version_ = 2; auto impl = getShapeInferImpl(type); ASSERT_NE(nullptr, impl); if (!group) group = 1; - SizeVector weightsDim{kernel.x * kernel.y * out_channels * inOutShapes.inDims[0][1] / group}; + unsigned w_dim = out_channels * inOutShapes.inDims[0][1] / group; + for (auto k : kernel.dims) + w_dim *= k; + SizeVector weightsDim{w_dim}; blobs["weights"] = make_shared_blob<float>(Precision::fromType<float>(), weightsDim); - ASSERT_NO_THROW(sts = impl->inferShapes(inOutShapes.inDims, getMapParams(), blobs, outShapes, &resp)); + ASSERT_NO_THROW(sts = impl->inferShapes(getBlobs(inOutShapes.inDims), getMapParams(), blobs, outShapes, &resp)); ASSERT_EQ(int(OK), sts) << resp.msg; ASSERT_EQ(inOutShapes.outDims, outShapes); } TEST_P(BuiltInShapeInferConvImplTest, batch) { auto layerParams = getMapParams(); - auto cnnNetworkImplPtr = buildSingleLayerNetwork<2>(type, inOutShapes, &layerParams, dataName); + auto cnnNetworkImplPtr = buildSingleLayerNetwork<4>(type, inOutShapes, &layerParams, dataName); auto reshaper = std::make_shared<Reshaper>(*cnnNetworkImplPtr); sts = cnnNetworkImplPtr->setBatchSizeReshape(BATCH, &resp); ASSERT_EQ((int) OK, sts) << resp.msg; @@ -115,38 +96,7 @@ TEST_P(BuiltInShapeInferConvImplTest, reshaper) { auto layerParams = getMapParams(); - auto cnnNetworkImplPtr = buildSingleLayerNetwork<2>(type, inOutShapes, &layerParams, dataName); - auto reshaper = std::make_shared<Reshaper>(*cnnNetworkImplPtr); - auto inputShapes = setInputShapes(*cnnNetworkImplPtr, newInOutShapes.inDims); - reshaper->run(inputShapes); - checkNetworkInOut(*cnnNetworkImplPtr, newInOutShapes); -} - -TEST_P(BuiltInShapeInferConvImplTest, impl_IRv3) { - InferenceEngine::details::BaseCreator::version_ = 3; - auto impl = getShapeInferImpl(type); - ASSERT_NE(nullptr, impl); - if (!group) group = 1; - SizeVector weightsDim{kernel.x * kernel.y * out_channels * inOutShapes.inDims[0][1] / group}; - blobs["weights"] = make_shared_blob<float>(Precision::fromType<float>(), weightsDim); - ASSERT_NO_THROW(sts = impl->inferShapes(inOutShapes.inDims, getMapParams_IRv3(), blobs, outShapes, &resp)); - ASSERT_EQ(int(OK), sts) << resp.msg; - ASSERT_EQ(inOutShapes.outDims, outShapes); -} - -TEST_P(BuiltInShapeInferConvImplTest, 
batch_IRv3) { - auto layerParams = getMapParams_IRv3(); - auto cnnNetworkImplPtr = buildSingleLayerNetwork<3>(type, inOutShapes, &layerParams, dataName); - auto reshaper = std::make_shared<Reshaper>(*cnnNetworkImplPtr); - sts = cnnNetworkImplPtr->setBatchSizeReshape(BATCH, &resp); - ASSERT_EQ((int) OK, sts) << resp.msg; - inOutShapes.inDims[0][0] = inOutShapes.outDims[0][0] = BATCH; - checkNetworkInOut(*cnnNetworkImplPtr, inOutShapes); -} - -TEST_P(BuiltInShapeInferConvImplTest, reshaper_IRv3) { - auto layerParams = getMapParams_IRv3(); - auto cnnNetworkImplPtr = buildSingleLayerNetwork<3>(type, inOutShapes, &layerParams, dataName); + auto cnnNetworkImplPtr = buildSingleLayerNetwork<4>(type, inOutShapes, &layerParams, dataName); auto reshaper = std::make_shared<Reshaper>(*cnnNetworkImplPtr); auto inputShapes = setInputShapes(*cnnNetworkImplPtr, newInOutShapes.inDims); reshaper->run(inputShapes); checkNetworkInOut(*cnnNetworkImplPtr, newInOutShapes); @@ -162,42 +112,42 @@ INSTANTIATE_TEST_CASE_P( pad({2, 1}), auto_pad(""), out_channels(64), group(1), dilation_factor({0, 0}), NewInOutShapes({{{1, 3, 228, 228}}, {{1, 64, 229, 115}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(false)), + CanInfer(true), pad_end(), IsTransposed(false)), // fixate pad + dilation ::testing::make_tuple(InOutShapes({{{4, 3, 228, 228}}, {{4, 64, 225, 109}}}), kernel({4, 2}), stride({2, 1}), pad({2, 1}), auto_pad(""), out_channels(64), group(1), dilation_factor({5, 5}), NewInOutShapes({{{1, 3, 228, 228}}, {{1, 64, 225, 109}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(false)), + CanInfer(true), pad_end(), IsTransposed(false)), // fixate pad + right/bottom ::testing::make_tuple(InOutShapes({{{4, 3, 228, 228}}, {{4, 64, 230, 115}}}), kernel({4, 2}), stride({2, 1}), pad({2, 1}), auto_pad(""), out_channels(64), group(1), dilation_factor({0, 0}), NewInOutShapes({{{1, 3, 228, 228}}, {{1, 64, 230, 115}}}), - CanInfer(true), padrb({3, 2}), IsTransposed(false)), + CanInfer(true), pad_end({3, 2}), IsTransposed(false)), // valid + empty paddings ::testing::make_tuple(InOutShapes({{{4, 3, 228, 228}}, {{4, 64, 227, 113}}}), kernel({4, 2}), stride({2, 1}), pad({0, 0}), auto_pad("valid"), out_channels(64), group(1), dilation_factor({0, 0}), NewInOutShapes({{{1, 3, 228, 228}}, {{1, 64, 227, 113}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(false)), + CanInfer(true), pad_end(), IsTransposed(false)), // valid + dilation ::testing::make_tuple(InOutShapes({{{4, 3, 228, 228}}, {{4, 64, 223, 107}}}), kernel({4, 2}), stride({2, 1}), pad({0, 0}), auto_pad("valid"), out_channels(64), group(1), dilation_factor({5, 5}), NewInOutShapes({{{1, 3, 228, 228}}, {{1, 64, 223, 107}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(false)), + CanInfer(true), pad_end({0, 0}), IsTransposed(false)), // valid + fixated paddings (shouldn't affect) ::testing::make_tuple(InOutShapes({{{4, 3, 228, 228}}, {{4, 64, 227, 113}}}), kernel({4, 2}), stride({2, 1}), pad({2, 4}), auto_pad("valid"), out_channels(64), group(1), dilation_factor({0, 0}), NewInOutShapes({{{1, 3, 228, 228}}, {{1, 64, 227, 113}}}), - CanInfer(true), padrb({3, 2}), IsTransposed(false)), + CanInfer(true), pad_end({3, 2}), IsTransposed(false)), // same_upper + empty paddings ::testing::make_tuple(InOutShapes({{{4, 3, 227, 227}}, {{4, 64, 227, 114}}}), kernel({4, 2}), stride({2, 1}), @@ -205,7 +155,7 @@ dilation_factor({0, 0}), NewInOutShapes({{{1, 3, 227, 227}}, {{1, 64, 227, 114}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(false)), + CanInfer(true), pad_end(), IsTransposed(false)), // same_upper + dilation paddings 
::testing::make_tuple(InOutShapes({{{4, 3, 227, 227}}, {{4, 64, 227, 114}}}), kernel({4, 2}), stride({2, 1}), @@ -213,7 +163,7 @@ INSTANTIATE_TEST_CASE_P( dilation_factor({5, 5}), NewInOutShapes({{{1, 3, 227, 227}}, {{1, 64, 227, 114}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(false)), + CanInfer(true), pad_end({0, 0}), IsTransposed(false)), // same_upper + fixated paddings (shouldn't affect) ::testing::make_tuple(InOutShapes({{{4, 3, 227, 227}}, {{4, 64, 227, 114}}}), kernel({4, 2}), stride({2, 1}), @@ -221,7 +171,7 @@ INSTANTIATE_TEST_CASE_P( dilation_factor({0, 0}), NewInOutShapes({{{1, 3, 227, 227}}, {{1, 64, 227, 114}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(false)), + CanInfer(true), pad_end({0, 0}), IsTransposed(false)), // same_lower + empty paddings ::testing::make_tuple(InOutShapes({{{4, 3, 227, 227}}, {{4, 64, 227, 113}}}), kernel({4, 2}), stride({2, 1}), @@ -229,7 +179,7 @@ INSTANTIATE_TEST_CASE_P( dilation_factor({0, 0}), NewInOutShapes({{{1, 3, 227, 227}}, {{1, 64, 227, 113}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(false)), + CanInfer(true), pad_end(), IsTransposed(false)), // same_lower + dilation ::testing::make_tuple(InOutShapes({{{4, 3, 227, 227}}, {{4, 64, 227, 113}}}), kernel({4, 2}), stride({2, 1}), @@ -237,7 +187,7 @@ INSTANTIATE_TEST_CASE_P( dilation_factor({0, 0}), NewInOutShapes({{{1, 3, 227, 227}}, {{1, 64, 227, 113}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(false)), + CanInfer(true), pad_end({0, 0}), IsTransposed(false)), // same_lower + fixated paddings (shouldn't affect) ::testing::make_tuple(InOutShapes({{{4, 3, 227, 227}}, {{4, 64, 227, 113}}}), kernel({4, 2}), stride({2, 1}), @@ -245,7 +195,37 @@ INSTANTIATE_TEST_CASE_P( dilation_factor({0, 0}), NewInOutShapes({{{1, 3, 227, 227}}, {{1, 64, 227, 113}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(false)) + CanInfer(true), pad_end({0, 0}), IsTransposed(false)), + // 5D tensors + // fixate pad + ::testing::make_tuple(InOutShapes({{{4, 3, 64, 100, 120}}, + {{4, 64, 66, 101, 61}}}), kernel({4, 2, 1}), stride({2, 1, 1}), + pad({2, 1, 1}), auto_pad(""), out_channels(64), group(1), dilation_factor({0, 0, 0}), + NewInOutShapes({{{1, 3, 64, 100, 120}}, + {{1, 64, 66, 101, 61}}}), + CanInfer(true), pad_end(), IsTransposed(false)), + // fixate pad + right/bottom + ::testing::make_tuple(InOutShapes({{{4, 3, 16, 128, 128}}, + {{4, 64, 18, 130, 65}}}), kernel({4, 2, 2}), stride({2, 1, 1}), + pad({2, 1, 1}), auto_pad(""), out_channels(64), group(1), dilation_factor({0, 0, 0}), + NewInOutShapes({{{1, 3, 16, 128, 128}}, + {{1, 64, 18, 130, 65}}}), + CanInfer(true), pad_end({3, 2, 2}), IsTransposed(false)), + // valid + fixated paddings (shouldn't affect) + ::testing::make_tuple(InOutShapes({{{4, 3, 16, 128, 130}}, + {{4, 64, 15, 127, 64}}}), kernel({4, 2, 2}), stride({2, 1, 1}), + pad({2, 4, 2}), auto_pad("valid"), out_channels(64), group(1), dilation_factor({0, 0, 0}), + NewInOutShapes({{{1, 3, 16, 128, 130}}, + {{1, 64, 15, 127, 64}}}), + CanInfer(true), pad_end({3, 2, 2}), IsTransposed(false)), + // same_lower + empty paddings + ::testing::make_tuple(InOutShapes({{{4, 3, 16, 128, 130}}, + {{4, 64, 16, 128, 65}}}), kernel({4, 2, 1}), stride({2, 1, 1}), + pad({0, 0, 0}), auto_pad("same_lower"), out_channels(64), group(1), + dilation_factor({0, 0, 0}), + NewInOutShapes({{{1, 3, 16, 128, 130}}, + {{1, 64, 16, 128, 65}}}), + CanInfer(true), pad_end(), IsTransposed(false)) ) ); @@ -258,42 +238,42 @@ INSTANTIATE_TEST_CASE_P( pad({2, 1}), auto_pad(""), out_channels(64), group(1), 
dilation_factor({0, 0}), NewInOutShapes({{{1, 3, 228, 228}}, {{1, 64, 227, 454}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(true)), + CanInfer(true), pad_end(), IsTransposed(true)), // fixate pad + dilation ::testing::make_tuple(InOutShapes({{{4, 3, 228, 228}}, {{4, 64, 231, 466}}}), kernel({4, 2}), stride({2, 1}), pad({2, 1}), auto_pad(""), out_channels(64), group(1), dilation_factor({5, 5}), NewInOutShapes({{{1, 3, 228, 228}}, {{1, 64, 231, 466}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(true)), + CanInfer(true), pad_end(), IsTransposed(true)), // fixate pad + right/bottom ::testing::make_tuple(InOutShapes({{{4, 3, 228, 228}}, {{4, 64, 226, 453}}}), kernel({4, 2}), stride({2, 1}), pad({2, 1}), auto_pad(""), out_channels(64), group(1), dilation_factor({0, 0}), NewInOutShapes({{{1, 3, 228, 228}}, {{1, 64, 226, 453}}}), - CanInfer(true), padrb({3, 2}), IsTransposed(true)), + CanInfer(true), pad_end({3, 2}), IsTransposed(true)), // valid + empty paddings ::testing::make_tuple(InOutShapes({{{4, 3, 228, 228}}, {{4, 64, 229, 459}}}), kernel({4, 2}), stride({2, 1}), pad({0, 0}), auto_pad("valid"), out_channels(64), group(1), dilation_factor({0, 0}), NewInOutShapes({{{1, 3, 228, 228}}, {{1, 64, 229, 459}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(true)), + CanInfer(true), pad_end({0, 0}), IsTransposed(true)), // valid + dilation ::testing::make_tuple(InOutShapes({{{4, 3, 228, 228}}, {{4, 64, 233, 471}}}), kernel({4, 2}), stride({2, 1}), pad({0, 0}), auto_pad("valid"), out_channels(64), group(1), dilation_factor({5, 5}), NewInOutShapes({{{1, 3, 228, 228}}, {{1, 64, 233, 471}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(true)), + CanInfer(true), pad_end({0, 0}), IsTransposed(true)), // valid + fixated paddings (shouldn't affect) ::testing::make_tuple(InOutShapes({{{4, 3, 228, 228}}, {{4, 64, 233, 471}}}), kernel({4, 2}), stride({2, 1}), pad({2, 4}), auto_pad("valid"), out_channels(64), group(1), dilation_factor({5, 5}), NewInOutShapes({{{1, 3, 228, 228}}, {{1, 64, 233, 471}}}), - CanInfer(true), padrb({3, 2}), IsTransposed(true)), + CanInfer(true), pad_end({3, 2}), IsTransposed(true)), // same_upper + empty paddings ::testing::make_tuple(InOutShapes({{{4, 3, 227, 227}}, {{4, 64, 227, 454}}}), kernel({4, 2}), stride({2, 1}), @@ -301,7 +281,7 @@ INSTANTIATE_TEST_CASE_P( dilation_factor({0, 0}), NewInOutShapes({{{1, 3, 227, 227}}, {{1, 64, 227, 454}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(true)), + CanInfer(true), pad_end({0, 0}), IsTransposed(true)), // same_upper + dilation paddings ::testing::make_tuple(InOutShapes({{{4, 3, 227, 227}}, {{4, 64, 227, 454}}}), kernel({4, 2}), stride({2, 1}), @@ -309,7 +289,7 @@ INSTANTIATE_TEST_CASE_P( dilation_factor({5, 5}), NewInOutShapes({{{1, 3, 227, 227}}, {{1, 64, 227, 454}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(true)), + CanInfer(true), pad_end({0, 0}), IsTransposed(true)), // same_upper + fixated paddings (shouldn't affect) ::testing::make_tuple(InOutShapes({{{4, 3, 227, 227}}, {{4, 64, 227, 454}}}), kernel({4, 2}), stride({2, 1}), @@ -317,7 +297,7 @@ INSTANTIATE_TEST_CASE_P( dilation_factor({0, 0}), NewInOutShapes({{{1, 3, 227, 227}}, {{1, 64, 227, 454}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(true)), + CanInfer(true), pad_end({0, 0}), IsTransposed(true)), // same_lower + empty paddings ::testing::make_tuple(InOutShapes({{{4, 3, 227, 227}}, {{4, 64, 227, 454}}}), kernel({4, 2}), stride({2, 1}), @@ -325,7 +305,7 @@ INSTANTIATE_TEST_CASE_P( dilation_factor({0, 0}), NewInOutShapes({{{1, 3, 227, 227}}, {{1, 
64, 227, 454}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(true)), + CanInfer(true), pad_end({0, 0}), IsTransposed(true)), // same_lower + dilation ::testing::make_tuple(InOutShapes({{{4, 3, 227, 227}}, {{4, 64, 227, 454}}}), kernel({4, 2}), stride({2, 1}), @@ -333,7 +313,7 @@ dilation_factor({0, 0}), NewInOutShapes({{{1, 3, 227, 227}}, {{1, 64, 227, 454}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(true)), + CanInfer(true), pad_end({0, 0}), IsTransposed(true)), // same_lower + fixated paddings (shouldn't affect) ::testing::make_tuple(InOutShapes({{{4, 3, 227, 227}}, {{4, 64, 227, 454}}}), kernel({4, 2}), stride({2, 1}), @@ -341,6 +321,36 @@ dilation_factor({0, 0}), NewInOutShapes({{{1, 3, 227, 227}}, {{1, 64, 227, 454}}}), - CanInfer(true), padrb({0, 0}), IsTransposed(true)) + CanInfer(true), pad_end({0, 0}), IsTransposed(true)), + // 5D tensors + // fixate pad + ::testing::make_tuple(InOutShapes({{{4, 3, 64, 100, 120}}, + {{4, 64, 66, 101, 61}}}), kernel({4, 2, 1}), stride({2, 1, 1}), + pad({2, 1, 1}), auto_pad(""), out_channels(64), group(1), dilation_factor({0, 0, 0}), + NewInOutShapes({{{1, 3, 64, 100, 120}}, + {{1, 64, 66, 101, 61}}}), + CanInfer(true), pad_end(), IsTransposed(false)), + // fixate pad + right/bottom + ::testing::make_tuple(InOutShapes({{{4, 3, 16, 128, 130}}, + {{4, 64, 14, 126, 257}}}), kernel({4, 2, 2}), stride({2, 1, 1}), + pad({2, 1, 1}), auto_pad(""), out_channels(64), group(1), dilation_factor({0, 0, 0}), + NewInOutShapes({{{1, 3, 16, 128, 130}}, + {{1, 64, 14, 126, 257}}}), + CanInfer(true), pad_end({3, 2, 2}), IsTransposed(true)), + // valid + fixated paddings (shouldn't affect) + ::testing::make_tuple(InOutShapes({{{4, 3, 16, 128, 130}}, + {{4, 64, 15, 127, 64}}}), kernel({4, 2, 2}), stride({2, 1, 1}), + pad({2, 4, 2}), auto_pad("valid"), out_channels(64), group(1), dilation_factor({0, 0, 0}), + NewInOutShapes({{{1, 3, 16, 128, 130}}, + {{1, 64, 15, 127, 64}}}), + CanInfer(true), pad_end({3, 2, 2}), IsTransposed(false)), + // same_lower + empty paddings + ::testing::make_tuple(InOutShapes({{{4, 3, 16, 128, 130}}, + {{4, 64, 16, 128, 65}}}), kernel({4, 2, 1}), stride({2, 1, 1}), + pad({0, 0, 0}), auto_pad("same_lower"), out_channels(64), group(1), + dilation_factor({0, 0, 0}), + NewInOutShapes({{{1, 3, 16, 128, 130}}, + {{1, 64, 16, 128, 65}}}), + CanInfer(true), pad_end(), IsTransposed(false)) ) ); diff --git a/inference-engine/tests/unit/shape_infer/built_in_shape_infer_fake_test.cpp b/inference-engine/tests/unit/shape_infer/built_in_shape_infer_fake_test.cpp index 2b66d59..17f64c0 100644 --- a/inference-engine/tests/unit/shape_infer/built_in_shape_infer_fake_test.cpp +++ b/inference-engine/tests/unit/shape_infer/built_in_shape_infer_fake_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/shape_infer/built_in_shape_infer_general_test.cpp b/inference-engine/tests/unit/shape_infer/built_in_shape_infer_general_test.cpp index a7d3a64..1914365 100644 --- a/inference-engine/tests/unit/shape_infer/built_in_shape_infer_general_test.cpp +++ b/inference-engine/tests/unit/shape_infer/built_in_shape_infer_general_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -19,7 +19,8 @@ using namespace ShapeInfer; TEST_P(BuiltInShapeInferImplTest, impl) { auto impl = 
getShapeInferImpl(type); ASSERT_NE(nullptr, impl); - ASSERT_NO_THROW(sts = impl->inferShapes(newInOutShapes.inDims, layerParams.data, blobs, outShapes, &resp)); + ASSERT_NO_THROW( + sts = impl->inferShapes(getBlobs(newInOutShapes.inDims), layerParams.data, blobs, outShapes, &resp)); if (canInfer) { ASSERT_EQ(int(OK), sts) << resp.msg; @@ -33,7 +34,6 @@ TEST_P(BuiltInShapeInferImplTest, reshaper) { auto cnnNetworkImplPtr = buildSingleLayerNetwork<3>(type, inOutShapes, &layerParams.data, layerDataName); auto reshaper = std::make_shared<Reshaper>(*cnnNetworkImplPtr); auto inputShapes = setInputShapes(*cnnNetworkImplPtr.get(), newInOutShapes.inDims); - if (canInfer) { reshaper->run(inputShapes); checkNetworkInOut(*cnnNetworkImplPtr, newInOutShapes); @@ -63,6 +63,19 @@ ); INSTANTIATE_TEST_CASE_P( + BuiltInMultiImpls, BuiltInShapeInferImplTest, + ::testing::Combine( + ::testing::Values(LayerType("Mul"), LayerType("Eltwise"), LayerType("Add"), LayerType("Div")), + ::testing::Values(InOutShapes({{{1, 1, 1, 1}, {1, 1, 1, 1}}, + {{1, 1, 1, 1}}})), + ::testing::Values(NewInOutShapes({{{1, 3, 228, 228}, {1, 3, 228, 228}}, + {{1, 3, 228, 228}}})), + ::testing::Values(MapParams(MapStrStr())), + ::testing::Values(LayerDataName("data")), + ::testing::Values(CanInfer(true))) +); + +INSTANTIATE_TEST_CASE_P( + BuiltInGeneralImpls, BuiltInShapeInferImplTest, ::testing::Values( ::testing::make_tuple(LayerType("LRN"), @@ -144,9 +157,9 @@ LayerDataName("data"), CanInfer(true)), ::testing::make_tuple(LayerType("Reshape"), - InOutShapes({{{1, 1, 300, 4}}, + InOutShapes({{{1, 1, 300, 4}}, {{300, 4}}}), - NewInOutShapes({{{1, 1, 500, 4}}, + NewInOutShapes({{{1, 1, 500, 4}}, {{500, 4}}}), MapParams(MapStrStr(std::map<std::string, std::string>{{"dim", "-1,4"}})), LayerDataName("data"), @@ -159,11 +172,11 @@ MapParams(MapParams(MapStrStr())), LayerDataName("data"), CanInfer(true)), - ::testing::make_tuple(LayerType("PriorBoxClustered"), - InOutShapes({{{2, 1, 4, 5}}, + ::testing::make_tuple(LayerType("PriorBoxClustered"), // TODO 5D test + InOutShapes({ {{2, 1, 4, 5}, {2, 4, 5, 6}}, {{1, 2, 400}}}), - NewInOutShapes({{{4, 1, 5, 5}}, - {{1, 2, 500}}}), + NewInOutShapes({ {{4, 1, 5, 5}, {3, 5, 6, 3}}, + {{1, 2, 500}} }), MapParams(MapStrStr( std::map<std::string, std::string>{{"width", "86.000000,13.000000,57.000000,39.000000,68.000000"}, {"clip", "0"}, @@ -181,6 +194,7 @@ {"max_size", "315"}, {"clip", "0"}, {"flip", "1"}, + { "offset", "0.5" }, {"aspect_ratio", "2"}})), LayerDataName("data"), CanInfer(true)), @@ -188,16 +202,16 @@ InOutShapes({{{2, 512, 32, 32}, {2, 3, 512, 512}}, {{1, 2, 16384}}}), NewInOutShapes({{{2, 512, 32, 32}, {2, 3, 512, 512}}, - {{1, 2, 16384}}}), + {{1, 2, 16384}}}), MapParams(MapStrStr( - std::map<std::string, std::string>{{"min_size", "35.84,52.46464"}, - {"max_size", ""}, - {"clip", "0"}, - {"step", "16"}, - {"flip", "0"}, - {"offset", "0.5"}, - {"aspect_ratio", "1.0,2.0,0.5"}, - {"scale_all_sizes", "0"}})), + std::map<std::string, std::string>{{"min_size", "35.84,52.46464"}, + {"max_size", ""}, + {"clip", "0"}, + {"step", "16"}, + {"flip", "0"}, + {"offset", "0.5"}, + {"aspect_ratio", "1.0,2.0,0.5"}, + {"scale_all_sizes", "0"}})), LayerDataName("data"), CanInfer(true)), ::testing::make_tuple(LayerType("PriorBox"), @@ -206,20 +220,20 @@ NewInOutShapes({{{2, 512, 32, 32}, {2, 3, 512, 512}}, {{1, 2, 28672}}}), MapParams(MapStrStr( - std::map<std::string, std::string>{{"min_size", "35.84,52.46464"}, - {"max_size", ""}, - {"clip", "0"}, - {"step", "16"}, - {"offset", "0.5"}, - {"flip", 
"1"}, - {"aspect_ratio", "1.0,2.0,0.5"}, - {"scale_all_sizes", "0"}})), + std::map{{"min_size", "35.84,52.46464"}, + {"max_size", ""}, + {"clip", "0"}, + {"step", "16"}, + {"offset", "0.5"}, + {"flip", "1"}, + {"aspect_ratio", "1.0,2.0,0.5"}, + {"scale_all_sizes", "0"}})), LayerDataName("data"), CanInfer(true)), ::testing::make_tuple(LayerType("DetectionOutput"), - InOutShapes({{{2, 1, 4, 5}}, + InOutShapes({{{2, 1, 4, 5}, { 2, 1, 4, 5 }, { 2, 1, 4, 5 }}, {{2, 1, 200, 7}}}), - NewInOutShapes({{{4, 1, 5, 5}}, + NewInOutShapes({{{4, 1, 5, 5}, { 4, 1, 5, 5 }, { 4, 1, 5, 5 }}, {{1, 1, 800, 7}}}), MapParams(MapStrStr(std::map{{"keep_top_k", "200"}, {"num_classes", "21"}, @@ -227,52 +241,41 @@ INSTANTIATE_TEST_CASE_P( LayerDataName("data"), CanInfer(true)), ::testing::make_tuple(LayerType("Interp"), - InOutShapes({{{2, 2, 33, 65}}, + InOutShapes({{{2, 2, 33, 65}}, {{2, 2, 257, 513}}}), - NewInOutShapes({{{2, 2, 33, 65}}, + NewInOutShapes({{{2, 2, 33, 65}}, {{2, 2, 257, 513}}}), MapParams(MapStrStr(std::map{{"align_corners", "1"}, - {"height", "257"}, - {"pad_beg", "0"}, - {"pad_end", "0"}, - {"width", "513"}})), + {"height", "257"}, + {"pad_beg", "0"}, + {"pad_end", "0"}, + {"width", "513"}})), LayerDataName("data"), CanInfer(true)), ::testing::make_tuple(LayerType("Interp"), - InOutShapes({{{2, 2, 33, 65}}, + InOutShapes({{{2, 2, 33, 65}}, {{2, 2, 66, 513}}}), - NewInOutShapes({{{2, 2, 33, 65}}, + NewInOutShapes({{{2, 2, 33, 65}}, {{2, 2, 66, 513}}}), MapParams(MapStrStr(std::map{{"align_corners", "1"}, - {"factor", "2"}, - {"width", "513"}, - {"pad_beg", "0"}, - {"pad_end", "0"}})), + {"factor", "2"}, + {"width", "513"}, + {"pad_beg", "0"}, + {"pad_end", "0"}})), LayerDataName("data"), CanInfer(true)), ::testing::make_tuple(LayerType("Interp"), - InOutShapes({{{2, 2, 33, 65}}, + InOutShapes({{{2, 2, 33, 65}}, {{2, 2, 257, 130}}}), - NewInOutShapes({{{2, 2, 33, 65}}, + NewInOutShapes({{{2, 2, 33, 65}}, {{2, 2, 257, 130}}}), MapParams(MapStrStr(std::map{{"align_corners", "1"}, - {"factor", "2"}, - {"height", "257"}, - {"pad_beg", "0"}, - {"pad_end", "0"}})), + {"factor", "2"}, + {"height", "257"}, + {"pad_beg", "0"}, + {"pad_end", "0"}})), LayerDataName("data"), CanInfer(true)), - ::testing::make_tuple(LayerType("Interp"), - InOutShapes({{{2, 2, 33, 65}}, - {{2, 2, 257, 130}}}), - NewInOutShapes({{{2, 2, 33, 65}}, - {{2, 2, 257, 130}}}), - MapParams(MapStrStr(std::map{{"align_corners", "1"}, - {"width", "513"}, - {"pad_beg", "0"}, - {"pad_end", "0"}})), - LayerDataName("data"), - CanInfer(false)), ::testing::make_tuple(LayerType("ROIPooling"), InOutShapes({{{2, 3, 4, 5}, {150, 5}}, {{150, 3, 6, 6}}}), @@ -292,7 +295,7 @@ INSTANTIATE_TEST_CASE_P( LayerDataName("data"), CanInfer(true)), ::testing::make_tuple(LayerType("PSROIPooling"), - InOutShapes({{{1, 3, 4, 5}, {150, 5}}, + InOutShapes({{{1, 3, 4, 5}, {150, 5}}, {{150, 2, 6, 6}}}), NewInOutShapes({{{2, 1, 5, 5}, {200, 5}}, {{200, 2, 6, 6}}}), @@ -385,14 +388,6 @@ INSTANTIATE_TEST_CASE_P( {"out_sizes", "2,4"}})), LayerDataName("data"), CanInfer(true)), - ::testing::make_tuple(LayerType("CTCGreedyDecoder"), - InOutShapes({{{88, 1, 48, 1}}, - {{1, 88, 1, 1}}}), - NewInOutShapes({{{88, 2, 48, 1}}, - {{2, 88, 1, 1}}}), - MapParams(MapStrStr()), - LayerDataName("data"), - CanInfer(true)), ::testing::make_tuple(LayerType("Proposal"), InOutShapes({{{1, 12, 34, 62}, {1, 24, 34, 62}, {1, 6}}, {{200, 5}}}), @@ -416,7 +411,9 @@ INSTANTIATE_TEST_CASE_P( {{1, 21125}}}), NewInOutShapes({{{20, 125, 16, 13}}, {{20, 26000}}}), - MapParams(MapStrStr()), + 
MapParams(MapStrStr({{"axis", "1"}, + {"end_axis", "-1"}, + {"do_softmax", "1"}})), LayerDataName("data"), CanInfer(true)), ::testing::make_tuple(LayerType("ArgMax"), @@ -535,14 +532,14 @@ INSTANTIATE_TEST_CASE_P( LayerDataName("data"), CanInfer(true)), ::testing::make_tuple(LayerType("Pad"), - InOutShapes({{{3, 3, 15, 10}}, + InOutShapes({{{3, 3, 15, 10}}, {{9, 11, 25, 22}}}), - NewInOutShapes({{{4, 2, 20, 15}}, + NewInOutShapes({{{4, 2, 20, 15}}, {{10, 10, 30, 27}}}), - MapParams(MapStrStr({{"pads_begin", "1,2,3,4"}, - {"pads_end", "5,6,7,8"}, - {"pad_mode", "edge"}, - {"pad_value", "1.0f"}})), + MapParams(MapStrStr({{"pads_begin", "1,2,3,4"}, + {"pads_end", "5,6,7,8"}, + {"pad_mode", "edge"}, + {"pad_value", "1.0f"}})), LayerDataName("data"), CanInfer(true)), ::testing::make_tuple(LayerType("Pad"), @@ -550,33 +547,34 @@ INSTANTIATE_TEST_CASE_P( {{16, 18, 25, 22}}}), NewInOutShapes({{{20, 30, 40, 50}}, {{26, 38, 40, 50}}}), - MapParams(MapStrStr({{"pads_begin", "1,2,0,0"}, - {"pads_end", "5,6,0,0"}, - {"pad_mode", "reflect"}, - {"pad_value", "1.0f"}})), + MapParams(MapStrStr({{"pads_begin", "1,2,0,0"}, + {"pads_end", "5,6,0,0"}, + {"pad_mode", "reflect"}, + {"pad_value", "1.0f"}})), LayerDataName("data"), CanInfer(true)), ::testing::make_tuple(LayerType("Pad"), InOutShapes({{{10, 10, 15, 10}}, {{16, 18, 25, 22}}}), - NewInOutShapes({{{4, 2, 20, 15}}, + NewInOutShapes({{{4, 2, 20, 15}}, {{10, 10, 30, 27}}}), - MapParams(MapStrStr({{"pads_begin", "1,2,3,4"}, - {"pads_end", "5,6,7,8"}, - {"pad_mode", "reflect"}, - {"pad_value", "1.0f"}})), + MapParams(MapStrStr({{"pads_begin", "1,2,3,4"}, + {"pads_end", "5,6,7,8"}, + {"pad_mode", "reflect"}, + {"pad_value", "1.0f"}})), LayerDataName("data"), CanInfer(false)) ) ); +// There are gtest limitation on tests number: 50 INSTANTIATE_TEST_CASE_P( BuiltInGeneralImpls2, BuiltInShapeInferImplTest, ::testing::Values( ::testing::make_tuple(LayerType("Gather"), InOutShapes({{{7, 16}, {1, 25}}, {{1, 25, 16}}}), - NewInOutShapes({{{7, 16}, {12, 25}}, + NewInOutShapes({{{7, 16}, {12, 25}}, {{12, 25, 16}}}), MapParams(MapStrStr(std::map{{"axis", "0"}})), LayerDataName("data"), @@ -597,12 +595,29 @@ INSTANTIATE_TEST_CASE_P( MapParams(MapStrStr(std::map{{"axis", "-1"}})), LayerDataName("data"), CanInfer(true)), + ::testing::make_tuple(LayerType("CTCGreedyDecoder"), + InOutShapes({{{88, 1, 48, 1}}, + {{1, 88, 1, 1}}}), + NewInOutShapes({{{88, 2, 48, 1}}, + {{2, 88, 1, 1}}}), + MapParams(MapStrStr()), + LayerDataName("data"), + CanInfer(true)), + ::testing::make_tuple(LayerType("CTCGreedyDecoder"), + InOutShapes({{{88, 1, 71}, {88, 1}}, + {{1, 88, 1, 1}}}), + NewInOutShapes({{{88, 2, 71}, {88, 2}}, + {{2, 88, 1, 1}}}), + MapParams(MapStrStr()), + LayerDataName("data"), + CanInfer(true)), ::testing::make_tuple(LayerType("Reshape"), InOutShapes({{{1, 2}}, {{1, 1}}}), NewInOutShapes({{{1, 2}}, {{1, 1}}}), - MapParams(MapStrStr(std::map{{"dim", "1,1"}})), // dim doesn't match input + MapParams(MapStrStr( + std::map{{"dim", "1,1"}})), // dim doesn't match input LayerDataName("data"), CanInfer(false)), ::testing::make_tuple(LayerType("Flatten"), @@ -610,7 +625,7 @@ INSTANTIATE_TEST_CASE_P( {{40}}}), NewInOutShapes({{{4, 1, 4, 5}}, {{80}}}), - MapParams(MapParams(MapStrStr(std::map{{"axis", "0"}, + MapParams(MapParams(MapStrStr(std::map{{"axis", "0"}, {"end_axis", "-1"}}))), LayerDataName("data"), CanInfer(true)), @@ -619,7 +634,7 @@ INSTANTIATE_TEST_CASE_P( {{2, 8, 5}}}), NewInOutShapes({{{4, 2, 4, 5}}, {{4, 8, 5}}}), - MapParams(MapParams(MapStrStr(std::map{{"axis", 
"1"}, + MapParams(MapParams(MapStrStr(std::map{{"axis", "1"}, {"end_axis", "2"}}))), LayerDataName("data"), CanInfer(true)), @@ -628,7 +643,8 @@ INSTANTIATE_TEST_CASE_P( {{2, 40}}}), NewInOutShapes({{{4, 2, 4, 5}}, {{4, 40}}}), - MapParams(MapParams(MapStrStr(std::map{{"axis", "1"}}))), + MapParams( + MapParams(MapStrStr(std::map{{"axis", "1"}}))), LayerDataName("data"), CanInfer(true)), ::testing::make_tuple(LayerType("Flatten"), @@ -636,7 +652,114 @@ INSTANTIATE_TEST_CASE_P( {{4, 4, 5}}}), NewInOutShapes({{{4, 2, 4, 5}}, {{8, 4, 5}}}), - MapParams(MapParams(MapStrStr(std::map{{"end_axis", "1"}}))), + MapParams(MapParams( + MapStrStr(std::map{{"end_axis", "1"}}))), + LayerDataName("data"), + CanInfer(true)), + ::testing::make_tuple(LayerType("Interp"), + InOutShapes({{{2, 2, 100, 16}}, + {{2, 2, 25, 4}}}), + NewInOutShapes({{{2, 2, 201, 33}}, + {{2, 2, 50, 8}}}), + MapParams(MapStrStr(std::map{{"align_corners", "1"}, + {"factor", "0.25"}, + {"pad_beg", "0"}, + {"pad_end", "0"}})), + LayerDataName("data"), + CanInfer(true)), + ::testing::make_tuple(LayerType("Interp"), + InOutShapes({{{2, 2, 100, 16}}, + {{2, 2, 100, 16}}}), + NewInOutShapes({{{2, 2, 101, 33}}, + {{2, 2, 101, 33}}}), + MapParams(MapStrStr(std::map{{"align_corners", "1"}, + {"shrink_factor", "1.5"}, + {"zoom_factor", "1.5"}, + {"pad_beg", "0"}, + {"pad_end", "0"}})), + LayerDataName("data"), + CanInfer(true)), + ::testing::make_tuple(LayerType("ShuffleChannels"), + InOutShapes({{{1, 2, 3, 4}}, + {{1, 2, 3, 4}}}), + NewInOutShapes({{{2, 4, 4, 7}}, + {{2, 4, 4, 7}}}), + MapParams(MapStrStr(std::map{{"axis", "1"}, + {"group", "2"}})), + LayerDataName("data"), + CanInfer(true)), + ::testing::make_tuple(LayerType("DepthToSpace"), + InOutShapes({{{4, 2, 3}}, + {{1, 4, 6}}}), + NewInOutShapes({{{8, 3, 4}}, + {{2, 6, 8}}}), + MapParams(MapStrStr(std::map{{"block_size", "2"}})), + LayerDataName("data"), + CanInfer(true)), + ::testing::make_tuple(LayerType("SpaceToDepth"), + InOutShapes({ { { 1, 4, 6 } }, + { { 4, 2, 3 } } }), + NewInOutShapes({ { { 2, 6, 8 } }, + { { 8, 3, 4 } } }), + MapParams(MapStrStr(std::map{ {"block_size", "2"}})), + LayerDataName("data"), + CanInfer(true)), + ::testing::make_tuple(LayerType("ReverseSequence"), + InOutShapes({{{3, 4, 5}, {3}}, + {{3, 4, 5}}}), + NewInOutShapes({{{4, 8, 9}, {4}}, + {{4, 8, 9}}}), + MapParams(MapStrStr(std::map{{"seq_axis", "1"}, + {"batch_axis", "0"}})), + LayerDataName("data"), + CanInfer(true)), + ::testing::make_tuple(LayerType("RegionYolo"), + InOutShapes({{{1, 125, 13, 13}}, + {{1 * 125, 13, 13}}}), + NewInOutShapes({{{20, 125, 16, 13}}, + {{20 * 125, 16, 13}}}), + MapParams(MapStrStr({{"axis", "0"}, + {"end_axis", "1"}, + {"do_softmax", "1"}})), + LayerDataName("data"), + CanInfer(true)), + ::testing::make_tuple(LayerType("RegionYolo"), + InOutShapes({{{1, 125, 13, 13}}, + {{1 * 125 * 13, 13}}}), + NewInOutShapes({{{20, 125, 16, 13}}, + {{20 * 125 * 16, 13}}}), + MapParams(MapStrStr({{"axis", "0"}, + {"end_axis", "2"}, + {"do_softmax", "1"}})), + LayerDataName("data"), + CanInfer(true)), + ::testing::make_tuple(LayerType("RegionYolo"), + InOutShapes({{{1, 125, 13, 13}}, + {{1, (80 + 4 + 1) * 125, 13, 13}}}), + NewInOutShapes({{{20, 125, 16, 13}}, + {{20, (80 + 4 + 1) * 3, 16, 13}}}), + MapParams(MapStrStr({{"axis", "1"}, + {"end_axis", "-1"}, + {"do_softmax", "0"}, + {"classes", "80"}, + {"coords", "4"}, + {"mask", "6,7,8"}})), + LayerDataName("data"), + CanInfer(true)), + ::testing::make_tuple(LayerType("Upsampling"), + InOutShapes({{{1, 3, 4, 5, 6}}, + {{1, 3, 8, 10, 
12}}}), + NewInOutShapes({{{2, 1, 7, 5, 5}}, + {{2, 1, 14, 10, 10}}}), + MapParams(MapStrStr(std::map<std::string, std::string>{{"scale", "2"}})), + LayerDataName("data"), + CanInfer(true)), + ::testing::make_tuple(LayerType("Quantize"), + InOutShapes({{{1, 64, 10, 10}, {1, 64, 1, 1}, {1, 64, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}}, + {{1, 64, 10, 10}}}), + NewInOutShapes({{{2, 128, 10, 10}, {1, 128, 1, 1}, {1, 128, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}}, + {{2, 128, 10, 10}}}), + MapParams(MapStrStr(std::map<std::string, std::string>{{"levels", "2"}})), LayerDataName("data"), CanInfer(true)) ) @@ -668,3 +791,4 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(CanInfer()) ) ); + diff --git a/inference-engine/tests/unit/shape_infer/built_in_shape_infer_general_test.hpp b/inference-engine/tests/unit/shape_infer/built_in_shape_infer_general_test.hpp index 5eac622..89f7b5a 100644 --- a/inference-engine/tests/unit/shape_infer/built_in_shape_infer_general_test.hpp +++ b/inference-engine/tests/unit/shape_infer/built_in_shape_infer_general_test.hpp @@ -1,55 +1,54 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once #include +#include #include #include #include #include #include #include +#include -class BaseTestCreator { -protected: - std::string _type; -public: - explicit BaseTestCreator(const std::string &type) : _type(type) {} - - virtual InferenceEngine::CNNLayerPtr create(const std::string &type) = 0; - - virtual bool shouldCreate(const std::string &type) = 0; -}; - -template -class LayerTestCreator : public BaseTestCreator { -public: - explicit LayerTestCreator(const std::string &type) : BaseTestCreator(type) {} +namespace IE = InferenceEngine; - InferenceEngine::CNNLayerPtr create(const std::string &type) override { - InferenceEngine::LayerParams params; - params.type = type; - return std::make_shared(params); +struct param_size { + // dimensions order: x, y, z, ...
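+ // For example, param_size({4, 2}) prints as "dims[0]=4, dims[1]=2", while
+ // toSeparetedRow(",") below serializes the same value in reverse order as "2,4",
+ // which appears to match the "y,x" attribute ordering that the previous
+ // two-field (x, y) version of this struct produced.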
+ std::vector<unsigned> dims; + param_size() {} +// param_size(const std::vector<unsigned>& dims) { +// this->dims = dims; +// } + param_size(std::initializer_list<unsigned> dims) { + this->dims = dims; } - - bool shouldCreate(const std::string &type) override { - return type == _type; + bool empty() { + return dims.empty(); } -}; - -struct param_size { - unsigned x; - unsigned y; friend std::ostream &operator<<(std::ostream &os, param_size const &paramSize) { - os << "x=" << std::to_string(paramSize.x) << ", y=" << std::to_string(paramSize.y); + auto d_size = paramSize.dims.size(); + if (d_size > 0) { + os << "dims[" << std::to_string(0) << "]=" << std::to_string(paramSize.dims[0]); + for (size_t i = 1; i < paramSize.dims.size(); i++) + os << ", dims[" << std::to_string(i) << "]=" << std::to_string(paramSize.dims[i]); + } return os; }; std::string toSeparetedRow(const char *separator) { - std::string res = std::to_string(y) + separator + std::to_string(x); + auto d_size = dims.size(); + std::string res; + if (d_size > 0) { + res = std::to_string(dims[d_size - 1]); + for (int i = static_cast<int>(d_size) - 2; i >= 0; i--) { + res += separator + std::to_string(dims[i]); + } + } return res; } }; @@ -60,7 +59,7 @@ PRETTY_PARAM(stride, param_size); PRETTY_PARAM(pad, param_size); -PRETTY_PARAM(padrb, param_size); +PRETTY_PARAM(pad_end, param_size); PRETTY_PARAM(auto_pad, std::string); @@ -78,9 +77,9 @@ PRETTY_PARAM(LayerType, std::string) PRETTY_PARAM(LayerDataName, std::string) -PRETTY_PARAM(InOutShapes, testing::InOutData) +PRETTY_PARAM(InOutShapes, testing::InOutShapes) -PRETTY_PARAM(NewInOutShapes, testing::InOutData) +PRETTY_PARAM(NewInOutShapes, testing::InOutShapes) PRETTY_PARAM(MapParams, MapStrStr) @@ -94,107 +93,45 @@ PRETTY_PARAM(ModelPath, std::string); static size_t BATCH = 100; -class BuiltInShapeInferCommon : public ::testing::Test { +class BuiltInShapeInferCommon : public TestsCommon { protected: void SetUp() override { - holder = std::make_shared(); + holder = std::make_shared(); } - InferenceEngine::IShapeInferImpl::Ptr getShapeInferImpl(const std::string &type) { - InferenceEngine::IShapeInferImpl::Ptr impl; + IE::IShapeInferImpl::Ptr getShapeInferImpl(const std::string &type) { + IE::IShapeInferImpl::Ptr impl; sts = holder->getShapeInferImpl(impl, type.c_str(), &resp); - if (sts != InferenceEngine::StatusCode::OK) THROW_IE_EXCEPTION << resp.msg; + if (sts != IE::StatusCode::OK) THROW_IE_EXCEPTION << resp.msg; return impl; } protected: - InferenceEngine::StatusCode sts = InferenceEngine::StatusCode::GENERAL_ERROR; - InferenceEngine::ResponseDesc resp; - std::shared_ptr holder; + IE::StatusCode sts = IE::StatusCode::GENERAL_ERROR; + IE::ResponseDesc resp; + std::shared_ptr holder; }; template class BuiltInShapeInferTestWithParam : public BuiltInShapeInferCommon, public testing::WithParamInterface { - const std::vector> &getCreators() const { - // there should be unique_ptr but it cant be used with initializer lists - static std::vector > creators = { - std::make_shared>("Power"), - std::make_shared>("Convolution"), - std::make_shared>("Deconvolution"), - std::make_shared>("Pooling"), - std::make_shared>("InnerProduct"), - std::make_shared>("FullyConnected"), - std::make_shared>("LRN"), - std::make_shared>("Norm"), - std::make_shared>("Softmax"), - std::make_shared>("SoftMax"), - std::make_shared>("GRN"), - std::make_shared>("MVN"), - std::make_shared>("ReLU"), - std::make_shared>("Clamp"), - std::make_shared>("Split"), - std::make_shared>("Slice"), - std::make_shared>("Concat"), - std::make_shared>("Eltwise"), -
std::make_shared>("ScaleShift"), - std::make_shared>("PReLU"), - std::make_shared>("Crop"), - std::make_shared>("Reshape"), - std::make_shared>("Tile"), - std::make_shared>("BatchNormalization"), - std::make_shared>("Gemm"), - std::make_shared>("Pad"), - std::make_shared>("Gather") - }; - return creators; - } protected: - InferenceEngine::DataPtr - getNotEmptyData(std::string const &name = "", const InferenceEngine::SizeVector &dims = {}) { - InferenceEngine::TensorDesc desc(InferenceEngine::Precision::UNSPECIFIED, dims, - InferenceEngine::TensorDesc::getLayoutByDims(dims)); - return std::make_shared(name, desc); - } - - InferenceEngine::CNNLayer::Ptr createLayer(const std::string &type) const { - for (auto &creator : getCreators()) { - if (!creator->shouldCreate(type)) - continue; - return creator->create(type); - } - static LayerTestCreator genericCreator(""); - return genericCreator.create(type); - } - - void initLayer(const InferenceEngine::CNNLayerPtr &layer, const testing::InOutData &inOutData) { - for (const auto &in:inOutData.inDims) { - auto data = getNotEmptyData("", in); - _savedData.push_back(data); - layer->insData.push_back(data); - } - for (const auto &out:inOutData.outDims) { - layer->outData.push_back(getNotEmptyData("", out)); - } - } - - static testing::InOutData getFakeData(const testing::InOutData &inOutShapes) { - testing::InOutData initial = inOutShapes; - for (auto &dims : initial.inDims) { - std::fill(dims.begin(), dims.end(), 1); - } - for (auto &dims : initial.outDims) { - std::fill(dims.begin(), dims.end(), 1); + static std::vector getBlobs(const std::vector& shapes) { + std::vector inBlobs; + for (auto const& dims : shapes) { + IE::TensorDesc desc(IE::Precision::FP32, dims, IE::TensorDesc::getLayoutByDims(dims)); + auto blob = make_blob_with_precision(desc); + inBlobs.push_back(blob); } - return initial; + return inBlobs; } - static InferenceEngine::ICNNNetwork::InputShapes - setInputShapes(const InferenceEngine::ICNNNetwork &cnnNetwork, - const std::vector &shapesToSet) { - InferenceEngine::ICNNNetwork::InputShapes inputShapes; - InferenceEngine::InputsDataMap inputs; + static IE::ICNNNetwork::InputShapes + setInputShapes(const IE::ICNNNetwork &cnnNetwork, + const std::vector &shapesToSet) { + IE::ICNNNetwork::InputShapes inputShapes; + IE::InputsDataMap inputs; cnnNetwork.getInputsInfo(inputs); for (const auto &pair : inputs) { auto info = pair.second; @@ -212,10 +149,10 @@ protected: return inputShapes; } - static void checkNetworkInOut(const InferenceEngine::ICNNNetwork &network, - const testing::InOutData &inOutData) { - InferenceEngine::InputsDataMap inputsDataMap; - InferenceEngine::OutputsDataMap outputsDataMap; + static void checkNetworkInOut(const IE::ICNNNetwork &network, + const testing::InOutShapes &inOutData) { + IE::InputsDataMap inputsDataMap; + IE::OutputsDataMap outputsDataMap; network.getInputsInfo(inputsDataMap); network.getOutputsInfo(outputsDataMap); int i = 0; @@ -229,20 +166,19 @@ protected: } template - static InferenceEngine::details::CNNNetworkImplPtr + static IE::details::CNNNetworkImplPtr buildSingleLayerNetwork(const std::string &layerType, - const testing::InOutData &inOutShapes, + const testing::InOutShapes &inOutShapes, std::map *params, const std::string &layerDataName = "data") { - auto *parser = new InferenceEngine::details::FormatParser(Version); + auto *parser = new IE::details::FormatParser(Version); return buildSingleLayerNetworkCommon(parser, layerType, inOutShapes, params, layerDataName); } protected: - std::vector 
outShapes; + std::vector outShapes; std::map params; - std::map blobs; - std::vector _savedData; + std::map blobs; }; class BuiltInShapeInferImplTest @@ -261,8 +197,8 @@ protected: protected: std::string type; - testing::InOutData inOutShapes; - testing::InOutData newInOutShapes; + testing::InOutShapes inOutShapes; + testing::InOutShapes newInOutShapes; MapStrStr layerParams; std::string layerDataName; bool canInfer{}; diff --git a/inference-engine/tests/unit/shape_infer/built_in_shape_infer_pool_test.cpp b/inference-engine/tests/unit/shape_infer/built_in_shape_infer_pool_test.cpp index 487ff84..de82eb4 100644 --- a/inference-engine/tests/unit/shape_infer/built_in_shape_infer_pool_test.cpp +++ b/inference-engine/tests/unit/shape_infer/built_in_shape_infer_pool_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -15,7 +15,7 @@ using namespace InferenceEngine; using namespace ShapeInfer; class BuiltInShapeInferPoolImplTest - : public BuiltInShapeInferTestWithParam> { + : public BuiltInShapeInferTestWithParam> { protected: void SetUp() override { BuiltInShapeInferCommon::SetUp(); @@ -28,27 +28,10 @@ protected: exclude_pad = std::get<5>(params); auto_pad = std::get<6>(params); newInOutShapes = std::get<7>(params); - padrb = std::get<8>(params); + pad_end = std::get<8>(params); } std::map getMapParams() { - std::map params{ - {"kernel-x", std::to_string(kernel.x)}, - {"kernel-y", std::to_string(kernel.y)}, - {"stride-x", std::to_string(stride.x)}, - {"stride-y", std::to_string(stride.y)}, - {"pad-x", std::to_string(pad.x)}, - {"pad-y", std::to_string(pad.y)}, - {"pool-method", pool_type}, - {"exclude-pad", exclude_pad ? "false" : "true"}, - }; - if (!auto_pad.empty()) params["auto_pad"] = auto_pad; - if (padrb.x) params["pad-r"] = std::to_string(padrb.x); - if (padrb.y) params["pad-b"] = std::to_string(padrb.y); - return params; - } - - std::map getMapParams_IRv3() { std::map params = { {"kernel", kernel.toSeparetedRow(",")}, {"strides", stride.toSeparetedRow(",")}, @@ -57,36 +40,34 @@ protected: {"exclude-pad", exclude_pad ? 
"false" : "true"} }; if (!auto_pad.empty()) params["auto_pad"] = auto_pad; - if (padrb.x != 0 && padrb.y != 0) { - params["pads_end"] = padrb.toSeparetedRow(","); - } + if (!pad_end.empty()) params["pads_end"] = pad_end.toSeparetedRow(","); return params; } protected: std::string type = "Pooling"; - testing::InOutData inOutShapes; - testing::InOutData newInOutShapes; + testing::InOutShapes inOutShapes; + testing::InOutShapes newInOutShapes; param_size kernel; param_size stride; param_size pad; std::string pool_type; bool exclude_pad; std::string auto_pad; - param_size padrb; + param_size pad_end; }; TEST_P(BuiltInShapeInferPoolImplTest, body) { auto impl = getShapeInferImpl(type); ASSERT_NE(nullptr, impl); - ASSERT_NO_THROW(sts = impl->inferShapes(inOutShapes.inDims, getMapParams(), blobs, outShapes, &resp)); + ASSERT_NO_THROW(sts = impl->inferShapes(getBlobs(inOutShapes.inDims), getMapParams(), blobs, outShapes, &resp)); ASSERT_EQ(int(OK), sts) << resp.msg; ASSERT_EQ(inOutShapes.outDims, outShapes); } TEST_P(BuiltInShapeInferPoolImplTest, reshaper) { auto layerParams = getMapParams(); - auto cnnNetworkImplPtr = buildSingleLayerNetwork<2>(type, inOutShapes, &layerParams, "pooling_data"); + auto cnnNetworkImplPtr = buildSingleLayerNetwork<4>(type, inOutShapes, &layerParams, "pooling_data"); auto reshaper = std::make_shared(*cnnNetworkImplPtr); auto inputShapes = setInputShapes(*cnnNetworkImplPtr, newInOutShapes.inDims); reshaper->run(inputShapes); @@ -95,34 +76,7 @@ TEST_P(BuiltInShapeInferPoolImplTest, reshaper) { TEST_P(BuiltInShapeInferPoolImplTest, batch) { auto layerParams = getMapParams(); - auto cnnNetworkImplPtr = buildSingleLayerNetwork<2>(type, inOutShapes, &layerParams, "pooling_data"); - auto reshaper = std::make_shared(*cnnNetworkImplPtr); - sts = cnnNetworkImplPtr->setBatchSize(BATCH, &resp); - ASSERT_EQ((int)OK, sts) << resp.msg; - inOutShapes.inDims[0][0] = inOutShapes.outDims[0][0] = BATCH; - checkNetworkInOut(*cnnNetworkImplPtr, inOutShapes); -} - -TEST_P(BuiltInShapeInferPoolImplTest, body_IRv3) { - auto impl = getShapeInferImpl(type); - ASSERT_NE(nullptr, impl); - ASSERT_NO_THROW(sts = impl->inferShapes(inOutShapes.inDims, getMapParams_IRv3(), blobs, outShapes, &resp)); - ASSERT_EQ(int(OK), sts) << resp.msg; - ASSERT_EQ(inOutShapes.outDims, outShapes); -} - -TEST_P(BuiltInShapeInferPoolImplTest, reshaper_IRv3) { - auto layerParams = getMapParams_IRv3(); - auto cnnNetworkImplPtr = buildSingleLayerNetwork<3>(type, inOutShapes, &layerParams, "pooling_data"); - auto reshaper = std::make_shared(*cnnNetworkImplPtr); - auto inputShapes = setInputShapes(*cnnNetworkImplPtr, newInOutShapes.inDims); - reshaper->run(inputShapes); - checkNetworkInOut(*cnnNetworkImplPtr, newInOutShapes); -} - -TEST_P(BuiltInShapeInferPoolImplTest, batch_IRv3) { - auto layerParams = getMapParams_IRv3(); - auto cnnNetworkImplPtr = buildSingleLayerNetwork<3>(type, inOutShapes, &layerParams, "pooling_data"); + auto cnnNetworkImplPtr = buildSingleLayerNetwork<4>(type, inOutShapes, &layerParams, "pooling_data"); auto reshaper = std::make_shared(*cnnNetworkImplPtr); sts = cnnNetworkImplPtr->setBatchSize(BATCH, &resp); ASSERT_EQ((int)OK, sts) << resp.msg; @@ -138,48 +92,67 @@ INSTANTIATE_TEST_CASE_P( {{4, 3, 229, 115}}}), kernel({4, 2}), stride({2, 1}), pad({2, 1}), pool_type("max"), exclude_pad(true), auto_pad(""), NewInOutShapes({{{1, 3, 228, 228}}, - {{1, 3, 229, 115}}}), padrb({0, 0})), + {{1, 3, 229, 115}}}), pad_end()), // fixate pad + right/bottom ::testing::make_tuple(InOutShapes({{{4, 3, 228, 228}}, 
{{4, 3, 229, 115}}}), kernel({4, 2}), stride({2, 1}), pad({2, 1}), pool_type("max"), exclude_pad(true), auto_pad(""), NewInOutShapes({{{1, 3, 228, 228}}, - {{1, 3, 229, 115}}}), padrb({3, 2})), + {{1, 3, 229, 115}}}), pad_end({3, 2})), // valid + empty paddings ::testing::make_tuple(InOutShapes({{{4, 3, 228, 228}}, {{4, 3, 227, 113}}}), kernel({4, 2}), stride({2, 1}), pad({0, 0}), pool_type("max"), exclude_pad(true), auto_pad("valid"), NewInOutShapes({{{1, 3, 228, 228}}, - {{1, 3, 227, 113}}}), padrb({0, 0})), + {{1, 3, 227, 113}}}), pad_end()), // valid + fixated paddings (shouldn't affect) ::testing::make_tuple(InOutShapes({{{4, 3, 228, 228}}, {{4, 3, 227, 113}}}), kernel({4, 2}), stride({2, 1}), pad({2, 4}), pool_type("max"), exclude_pad(true), auto_pad("valid"), NewInOutShapes({{{1, 3, 228, 228}}, - {{1, 3, 227, 113}}}), padrb({2, 1})), + {{1, 3, 227, 113}}}), pad_end({2, 1})), // same_upper + empty paddings ::testing::make_tuple(InOutShapes({{{4, 3, 227, 227}}, {{4, 3, 227, 114}}}), kernel({4, 2}), stride({2, 1}), pad({0, 0}), pool_type("max"), exclude_pad(true), auto_pad("same_upper"), NewInOutShapes({{{1, 3, 227, 227}}, - {{1, 3, 227, 114}}}), padrb({0, 0})), + {{1, 3, 227, 114}}}), pad_end()), // same_upper + fixated paddings (shouldn't affect) ::testing::make_tuple(InOutShapes({{{4, 3, 227, 227}}, {{4, 3, 227, 114}}}), kernel({4, 2}), stride({2, 1}), pad({2, 4}), pool_type("max"), exclude_pad(true), auto_pad("same_upper"), NewInOutShapes({{{1, 3, 227, 227}}, - {{1, 3, 227, 114}}}), padrb({0, 0})), + {{1, 3, 227, 114}}}), pad_end({0, 0})), // same_lower + empty paddings ::testing::make_tuple(InOutShapes({{{4, 3, 227, 227}}, {{4, 3, 227, 113}}}), kernel({4, 2}), stride({2, 1}), pad({0, 0}), pool_type("max"), exclude_pad(true), auto_pad("same_lower"), NewInOutShapes({{{1, 3, 227, 227}}, - {{1, 3, 227, 113}}}), padrb({0, 0})), + {{1, 3, 227, 113}}}), pad_end({0, 0})), // same_lower + fixated paddings (shouldn't affect) ::testing::make_tuple(InOutShapes({{{4, 3, 227, 227}}, {{4, 3, 227, 113}}}), kernel({4, 2}), stride({2, 1}), pad({2, 4}), pool_type("max"), exclude_pad(true), auto_pad("same_lower"), NewInOutShapes({{{1, 3, 227, 227}}, - {{1, 3, 227, 113}}}), padrb({0, 0})) + {{1, 3, 227, 113}}}), pad_end({0, 0})), + // 5D tensors + // fixate pad + ::testing::make_tuple(InOutShapes({{{4, 3, 16, 128, 130}}, + {{4, 3, 17, 129, 66}}}), kernel({4, 2, 2}), stride({2, 1, 1}), + pad({2, 1, 1}), pool_type("max"), exclude_pad(true), auto_pad(""), + NewInOutShapes({{{1, 3, 16, 128, 130}}, + {{1, 3, 17, 129, 66}}}), pad_end()), + // valid + empty paddings + ::testing::make_tuple(InOutShapes({{{4, 3, 16, 128, 130}}, + {{4, 3, 15, 127, 64}}}), kernel({4, 2, 2}), stride({2, 1, 1}), + pad({0, 0, 0}), pool_type("max"), exclude_pad(true), auto_pad("valid"), + NewInOutShapes({{{1, 3, 16, 128, 130}}, + {{1, 3, 15, 127, 64}}}), pad_end()), + // same_upper + empty paddings + ::testing::make_tuple(InOutShapes({{{4, 3, 16, 128, 130}}, + {{4, 3, 16, 128, 65}}}), kernel({4, 2, 2}), stride({2, 1, 1}), + pad({0, 0, 0}), pool_type("max"), exclude_pad(true), auto_pad("same_upper"), + NewInOutShapes({{{1, 3, 16, 128, 130}}, + {{1, 3, 16, 128, 65}}}), pad_end()) ) ); diff --git a/inference-engine/tests/unit/shape_infer/cpu_ext_shape_infer_general_test.cpp b/inference-engine/tests/unit/shape_infer/cpu_ext_shape_infer_general_test.cpp deleted file mode 100644 index 4551dd7..0000000 --- a/inference-engine/tests/unit/shape_infer/cpu_ext_shape_infer_general_test.cpp +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (C) 2018 
Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "built_in_shape_infer_general_test.hpp" - -using namespace InferenceEngine; -using namespace InferenceEngine::details; -using namespace ShapeInfer; - -class CPUExtShapeInferTests : public BuiltInShapeInferImplTest { -protected: - InferenceEngine::ShapeInferExtension shapeInferExt; - CPUExtShapeInferTests () : shapeInferExt(TestsCommon::make_so_name("cpu_extension")) {} - - void SetUp() override { - BuiltInShapeInferImplTest::SetUp(); - holder = std::shared_ptr(&shapeInferExt, [](IShapeInferExtension*){}); - } -}; - -TEST_P(CPUExtShapeInferTests, impl) { - auto impl = getShapeInferImpl(type); - ASSERT_NE(nullptr, impl); - ASSERT_NO_THROW(sts = impl->inferShapes(newInOutShapes.inDims, layerParams.data, blobs, outShapes, &resp)); - - if (canInfer) { - ASSERT_EQ(int(OK), sts) << resp.msg; - ASSERT_EQ(newInOutShapes.outDims, outShapes); - } else { - ASSERT_EQ(GENERAL_ERROR, sts) << resp.msg; - } -} - -TEST_P(CPUExtShapeInferTests, reshaper) { - auto cnnNetworkImplPtr = buildSingleLayerNetwork<3>(type, inOutShapes, &layerParams.data, layerDataName); - auto reshaper = std::make_shared(*cnnNetworkImplPtr); - auto inputShapes = setInputShapes(*cnnNetworkImplPtr.get(), newInOutShapes.inDims); - reshaper->AddExtension(holder); - - if (canInfer) { - reshaper->run(inputShapes); - checkNetworkInOut(*cnnNetworkImplPtr, newInOutShapes); - } else { - ASSERT_THROW(reshaper->run(inputShapes), InferenceEngine::details::InferenceEngineException); - } -} - -INSTANTIATE_TEST_CASE_P( - CPUExtGeneralImpls, CPUExtShapeInferTests, - ::testing::Values( - ::testing::make_tuple(LayerType("SpatialTransformer"), - InOutShapes({{{1, 6, 5, 5}, {1, 3}}, - {{1, 6, 5, 5}}}), - NewInOutShapes({{{2, 6, 5, 6}, {1, 3}}, - {{2, 6, 5, 6}}}), - MapParams(MapStrStr()), - LayerDataName("data"), - CanInfer(true)) - ) -); diff --git a/inference-engine/tests/unit/shape_infer/input_controller_test.cpp b/inference-engine/tests/unit/shape_infer/input_controller_test.cpp index c6fc375..80b7e86 100644 --- a/inference-engine/tests/unit/shape_infer/input_controller_test.cpp +++ b/inference-engine/tests/unit/shape_infer/input_controller_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -39,7 +39,7 @@ TEST_F(InputControllerTest, canPushShapes) { ASSERT_NO_THROW(controller.setShapeByName(inDims, TEST_NAME)); } -TEST_F(InputControllerTest, throwOnGetWithNotEnoughShapes) { +TEST_F(InputControllerTest, DISABLED_throwOnGetWithNotEnoughShapes) { InputController controller({notEmptyData, notEmptyData}, TEST_NAME); controller.setShapeByName(inDims, TEST_NAME); ASSERT_THROW(controller.getShapes(true), InferenceEngineException); @@ -57,7 +57,7 @@ TEST_F(InputControllerTest, canGetChanges) { ASSERT_NO_THROW(controller.getShapes(true)); } -TEST_F(InputControllerTest, throwOnApplyWithNotEnoughShapes) { +TEST_F(InputControllerTest, DISABLED_throwOnApplyWithNotEnoughShapes) { InputController controller({notEmptyData, notEmptyData}, TEST_NAME); controller.setShapeByName(inDims, TEST_NAME); ASSERT_THROW(controller.applyChanges(), InferenceEngineException); @@ -72,7 +72,7 @@ TEST_F(InputControllerTest, canApplyChanges) { TEST_F(InputControllerTest, canResetShapes) { InputController controller({notEmptyData}, TEST_NAME); controller.setShapeByName(inDims, TEST_NAME); - 
ASSERT_FALSE(controller.getShapes(true).empty()); + ASSERT_EQ(controller.getShapes(true)[0], inDims); ASSERT_NO_THROW(controller.reset()); - ASSERT_THROW(controller.getShapes(true), InferenceEngineException); + ASSERT_NE(controller.getShapes(true)[0], inDims); } diff --git a/inference-engine/tests/unit/shape_infer/input_reshape_launcher_test.cpp b/inference-engine/tests/unit/shape_infer/input_reshape_launcher_test.cpp index 7d99fcb..e877334 100644 --- a/inference-engine/tests/unit/shape_infer/input_reshape_launcher_test.cpp +++ b/inference-engine/tests/unit/shape_infer/input_reshape_launcher_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/shape_infer/output_controller_test.cpp b/inference-engine/tests/unit/shape_infer/output_controller_test.cpp index 8083875..c9c197a 100644 --- a/inference-engine/tests/unit/shape_infer/output_controller_test.cpp +++ b/inference-engine/tests/unit/shape_infer/output_controller_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/shape_infer/reshape_launcher_test.cpp b/inference-engine/tests/unit/shape_infer/reshape_launcher_test.cpp index 372d3f4..22e49b4 100644 --- a/inference-engine/tests/unit/shape_infer/reshape_launcher_test.cpp +++ b/inference-engine/tests/unit/shape_infer/reshape_launcher_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -20,7 +21,15 @@ protected: notEmptyData = getNotEmptyData(); impl = std::make_shared(); }; - + std::vector getBlobs(const std::vector& shapes) { + std::vector inBlobs; + for (auto const& dims : shapes) { + TensorDesc desc(Precision::FP32, dims, TensorDesc::getLayoutByDims(dims)); + auto blob = make_blob_with_precision(desc); + inBlobs.push_back(blob); + } + return inBlobs; + } public: StatusCode sts = GENERAL_ERROR; ResponseDesc resp; @@ -32,7 +41,7 @@ public: std::map changedParams{{TEST_NAME, TEST_NAME}}; public: DataPtr getNotEmptyData() { - return std::make_shared(TEST_NAME, Precision::UNSPECIFIED, Layout::C); + return std::make_shared(TEST_NAME, Precision::FP32, Layout::C); } }; @@ -92,7 +101,10 @@ TEST_F(ReshapeLauncherTest, throwOnReshapeWihtNotEnoughShapes) { ReshapeLauncher launcher(&layer, impl); launcher.setShapeByName(inDims, TEST_NAME); - ASSERT_THROW(launcher.reshape({}), InferenceEngineException); + try { + launcher.reshape({}); + FAIL() << "Reshape should have failed!"; + } catch (...)
{} } TEST_F(ReshapeLauncherTest, implIsCalledOnReshape) { @@ -103,11 +115,12 @@ TEST_F(ReshapeLauncherTest, implIsCalledOnReshape) { auto inputController = initializer->getInputController(); auto outputController = initializer->getOutputController(); std::vector shapes{inDims}; + auto blobs = getBlobs(shapes); EXPECT_CALL(*inputController, setShapeByName(inDims, TEST_NAME)); - EXPECT_CALL(*inputController, getShapes(true)).WillOnce(Return(shapes)); + EXPECT_CALL(*inputController, getBlobs(true)).WillOnce(Return(blobs)); EXPECT_CALL(*outputController, setShapes(_)); EXPECT_CALL(*outputController, propagateShapes(_)); - EXPECT_CALL(*impl.get(), inferShapes(shapes, _, _, _, _)).WillOnce(Return(OK)); + EXPECT_CALL(*impl.get(), inferShapes(blobs, _, _, _, _)).WillOnce(Return(OK)); launcher.setShapeByName(inDims, TEST_NAME); launcher.reshape({}); } diff --git a/inference-engine/tests/unit/shape_infer/reshaper_test.cpp b/inference-engine/tests/unit/shape_infer/reshaper_test.cpp index 86364ea..0566e14 100644 --- a/inference-engine/tests/unit/shape_infer/reshaper_test.cpp +++ b/inference-engine/tests/unit/shape_infer/reshaper_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/stress_tests/stress_tests.cpp b/inference-engine/tests/unit/stress_tests/stress_tests.cpp index 5bb764f..28bba2f 100644 --- a/inference-engine/tests/unit/stress_tests/stress_tests.cpp +++ b/inference-engine/tests/unit/stress_tests/stress_tests.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -8,7 +8,6 @@ using namespace std; -#ifdef ENABLE_STRESS_UNIT_TESTS class StressTests : public ::testing::Test { protected: const std::string DUMMY_FILE_NAME = "Dummy.txt"; @@ -43,4 +42,3 @@ TEST_F(StressTests, checkBigFileSize) { DummyFileManager::deleteFile(DUMMY_FILE_NAME); ASSERT_EQ(size, BIG_FILE_SIZE); } -#endif //ENABLE_STRESS_UNIT_TESTS diff --git a/inference-engine/tests/unit/topology_verification_tests/v2_topology_verification_test.cpp b/inference-engine/tests/unit/topology_verification_tests/v2_topology_verification_test.cpp index 34ff736..44457b9 100644 --- a/inference-engine/tests/unit/topology_verification_tests/v2_topology_verification_test.cpp +++ b/inference-engine/tests/unit/topology_verification_tests/v2_topology_verification_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/tests/unit/transformations/eltwise_broadcast_test.cpp b/inference-engine/tests/unit/transformations/eltwise_broadcast_test.cpp new file mode 100644 index 0000000..83f48ef --- /dev/null +++ b/inference-engine/tests/unit/transformations/eltwise_broadcast_test.cpp @@ -0,0 +1,63 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include + +#include "tranformations_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class TransformNetworkTest: public TransformationTestCommon {}; + +TEST_F(TransformationTestCommon, EltwiseBroadcastOneDimension) { + Builder::Network builder("eltwiseBroadcast"); + + idx_t firstInputId = builder.addLayer(Builder::InputLayer("FirstInput").setPort(Port({1, 3, 227, 1}))); + idx_t secondInputId = 
builder.addLayer(Builder::InputLayer("SecondInput").setPort(Port({1, 3, 227, 227}))); + idx_t eltwiseSumId = builder.addLayer({firstInputId, secondInputId}, Builder::EltwiseLayer("Sum"). + setEltwiseType(Builder::EltwiseLayer::EltwiseType::SUM). + setOutputPort(Port({1, 3, 227, 227}))); + auto network = Transform::Network(builder); + + Transform::TransformationEltwiseBroadcast transformationEltwiseBroadcast; + transformationEltwiseBroadcast.execute(network); + auto firstInputLayer = network.getLayer(firstInputId); + auto tileLayer = network.getLayer(firstInputId).getOutPort().getConnection().getDestination().getLayer(); + ASSERT_EQ(tileLayer.getType(), "Tile"); + ASSERT_EQ(tileLayer.getParameter("axis").as(), 3); + ASSERT_EQ(tileLayer.getParameter("tiles").as(), 227); + ASSERT_EQ(firstInputLayer.getOutPort().getConnection().getDestination().getLayer().getId(), tileLayer.getId()); + ASSERT_EQ(tileLayer.getOutPort().getConnection().getDestination().getLayer().getId(), eltwiseSumId); +} + +TEST_F(TransformationTestCommon, EltwiseBroadcastTwoDimensions) { + Builder::Network builder("eltwiseBroadcast"); + + idx_t firstInputId = builder.addLayer(Builder::InputLayer("FirstInput").setPort(Port({1, 1, 227, 1}))); + idx_t secondInputId = builder.addLayer(Builder::InputLayer("SecondInput").setPort(Port({1, 3, 227, 227}))); + idx_t eltwiseSumId = builder.addLayer({firstInputId, secondInputId}, Builder::EltwiseLayer("Sum"). + setEltwiseType(Builder::EltwiseLayer::EltwiseType::SUM). + setOutputPort(Port({1, 3, 227, 227}))); + auto network = Transform::Network(builder); + + Transform::TransformationEltwiseBroadcast transformationEltwiseBroadcast; + transformationEltwiseBroadcast.execute(network); + auto firstInputLayer = network.getLayer(firstInputId); + auto tile1Layer = network.getLayer(firstInputId).getOutPort().getConnection().getDestination().getLayer(); + auto tile2Layer = tile1Layer.getOutPort().getConnection().getDestination().getLayer(); + ASSERT_EQ(tile1Layer.getType(), "Tile"); + ASSERT_EQ(tile1Layer.getParameter("axis").as(), 1); + ASSERT_EQ(tile1Layer.getParameter("tiles").as(), 3); + ASSERT_EQ(tile2Layer.getType(), "Tile"); + ASSERT_EQ(tile2Layer.getParameter("axis").as(), 3); + ASSERT_EQ(tile2Layer.getParameter("tiles").as(), 227); + ASSERT_EQ(firstInputLayer.getOutPort().getConnection().getDestination().getLayer().getId(), tile1Layer.getId()); + ASSERT_EQ(tile1Layer.getOutPort().getConnection().getDestination().getLayer().getId(), tile2Layer.getId()); + ASSERT_EQ(tile2Layer.getOutPort().getConnection().getDestination().getLayer().getId(), eltwiseSumId); +} \ No newline at end of file diff --git a/inference-engine/tests/unit/transformations/sub_test.cpp b/inference-engine/tests/unit/transformations/sub_test.cpp new file mode 100644 index 0000000..9e2f935 --- /dev/null +++ b/inference-engine/tests/unit/transformations/sub_test.cpp @@ -0,0 +1,39 @@ +// Copyright (C) 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include + +#include "tranformations_test.hpp" + +using namespace testing; +using namespace InferenceEngine; + +class TransformNetworkTest: public TransformationTestCommon {}; + +TEST_F(TransformationTestCommon, Sub) { + Builder::Network builder("sub"); + + idx_t firstInputId = builder.addLayer(Builder::InputLayer("FirstInput").setPort(Port({1,3, 227, 227}))); + idx_t secondInputId = builder.addLayer(Builder::InputLayer("SecondInput").setPort(Port({1,3, 227, 227}))); + idx_t eltwiseSubId = builder.addLayer({firstInputId, 
secondInputId}, Builder::EltwiseLayer("Sub").setEltwiseType(Builder::EltwiseLayer::EltwiseType::SUB)); + idx_t clampId = builder.addLayer({eltwiseSubId}, Builder::ClampLayer("clamp")); + auto network = Transform::Network(builder); + + Transform::TransformationSub transformationSub; + transformationSub.execute(network); + ASSERT_THROW(network.getLayer("Sub"), InferenceEngine::details::InferenceEngineException); + auto sumLayer = network.getLayer(firstInputId).getOutPort().getConnection().getDestination().getLayer(); + auto powerLayer = network.getLayer(secondInputId).getOutPort().getConnection().getDestination().getLayer(); + ASSERT_EQ(sumLayer.getType(), "Eltwise"); + ASSERT_EQ(sumLayer.getParameter("operation").as(), "sum"); + ASSERT_EQ(powerLayer.getType(), "Power"); + ASSERT_EQ(powerLayer.getParameter("power").as(), 1.0f); + ASSERT_EQ(powerLayer.getParameter("scale").as(), -1.0f); + ASSERT_EQ(powerLayer.getParameter("shift").as(), 0.0f); + ASSERT_EQ(sumLayer.getOutPort().getConnection().getDestination().getLayer().getId(), clampId); +} \ No newline at end of file diff --git a/inference-engine/tests/unit/transformations/tranformations_test.hpp b/inference-engine/tests/unit/transformations/tranformations_test.hpp new file mode 100644 index 0000000..797c298 --- /dev/null +++ b/inference-engine/tests/unit/transformations/tranformations_test.hpp @@ -0,0 +1,13 @@ +// Copyright (C) 2018-2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include "../builders/builder_test.hpp" + +class TransformationTestCommon : public BuilderTestCommon { +public: +}; \ No newline at end of file diff --git a/inference-engine/tests/validation_app/CMakeLists.txt b/inference-engine/tests/validation_app/CMakeLists.txt new file mode 100644 index 0000000..04be08c --- /dev/null +++ b/inference-engine/tests/validation_app/CMakeLists.txt @@ -0,0 +1,62 @@ +# +# Copyright (C) 2018-2019 Intel Corporation. +# +# This software and the related documents are Intel copyrighted materials, +# and your use of them is governed by the express license under which they +# were provided to you (End User License Agreement for the Intel(R) Software +# Development Products (Version May 2017)). Unless the License provides +# otherwise, you may not use, modify, copy, publish, distribute, disclose or +# transmit this software or the related documents without Intel's prior +# written permission. +# +# This software and the related documents are provided as is, with no +# express or implied warranties, other than those that are expressly +# stated in the License. 
+# + set (TARGET_NAME "test_validation_app") + +# Find OpenCV components if they exist +find_package(OpenCV COMPONENTS imgcodecs) +if(NOT(OpenCV_FOUND)) + message(WARNING "No suitable OpenCV version detected, " ${TARGET_NAME} " skipped") + return() +endif() + +set(VALIDATION_APP_SOURCE "${IE_MAIN_SOURCE_DIR}/samples/validation_app") + +file (GLOB MAIN_SRC + ${VALIDATION_APP_SOURCE}/*.cpp + ${VALIDATION_APP_SOURCE}/pugixml/*.cpp + ) + +file (GLOB MAIN_HEADERS + ${VALIDATION_APP_SOURCE}/*.hpp + ${VALIDATION_APP_SOURCE}/pugixml/*.hpp + ) + +# Create named folders for the sources within the .vcproj +# Empty name lists them directly under the .vcproj +source_group("src" FILES ${MAIN_SRC}) +source_group("include" FILES ${MAIN_HEADERS}) + +if (WIN32) + if(NOT "${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + message(FATAL_ERROR "Only 64-bit supported on Windows") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_SCL_SECURE_NO_WARNINGS -DNOMINMAX") +endif() + +# Properties->C/C++->General->Additional Include Directories +include_directories (${VALIDATION_APP_SOURCE}/../classification_sample/core + ${VALIDATION_APP_SOURCE}/../common + ${VALIDATION_APP_SOURCE}/../common/os/windows + ${VALIDATION_APP_SOURCE}/../../include) + +# Create library file from sources. + +list(REMOVE_ITEM MAIN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp) + +add_library(${TARGET_NAME} STATIC ${MAIN_SRC} ${MAIN_HEADERS}) +set_target_properties(${TARGET_NAME} PROPERTIES "COMPILE_PDB_NAME" ${TARGET_NAME}) +target_link_libraries(${TARGET_NAME} gflags ie_cpu_extension ${OpenCV_LIBRARIES}) \ No newline at end of file diff --git a/inference-engine/thirdparty/CMakeLists.txt b/inference-engine/thirdparty/CMakeLists.txt index 8277d6c..f65f38c 100644 --- a/inference-engine/thirdparty/CMakeLists.txt +++ b/inference-engine/thirdparty/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # @@ -8,12 +8,16 @@ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") endif() add_subdirectory(pugixml) +export(TARGETS pugixml NAMESPACE IE:: APPEND FILE "${CMAKE_BINARY_DIR}/targets.cmake") + add_subdirectory(stb_lib) add_subdirectory(ade) if (ENABLE_CLDNN) - set(CLDNN__OUTPUT_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) + set(CLDNN__OUTPUT_BIN_DIR ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) + set(CLDNN__OUTPUT_LIB_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) set(CLDNN__INCLUDE_TESTS OFF CACHE BOOL "" FORCE) + set(CLDNN__INCLUDE_CORE_INTERNAL_TESTS OFF CACHE BOOL "" FORCE) set(CLDNN__INCLUDE_EXAMPLES OFF CACHE BOOL "" FORCE) set(CLDNN__INCLUDE_TUTORIAL OFF CACHE BOOL "" FORCE) if (WIN32) @@ -22,9 +26,7 @@ if (ENABLE_CLDNN) set(CLDNN__ARCHITECTURE_TARGET "Linux64" CACHE STRING "" FORCE) endif() - remove_definitions(-fvisibility=default) add_subdirectory(clDNN) - add_definitions(-fvisibility=default) endif() if(ENABLE_MKL_DNN) diff --git a/inference-engine/thirdparty/clDNN/.gitignore b/inference-engine/thirdparty/clDNN/.gitignore new file mode 100644 index 0000000..8359a8c --- /dev/null +++ b/inference-engine/thirdparty/clDNN/.gitignore @@ -0,0 +1,7 @@ +build/* +*.pyc +*~ +UnixMk +**/.idea/* +src/caps/private/*.inc +/examples/utils/venv diff --git a/inference-engine/thirdparty/clDNN/CMakeLists.txt b/inference-engine/thirdparty/clDNN/CMakeLists.txt index 6ce8119..624d95c 100644 --- a/inference-engine/thirdparty/clDNN/CMakeLists.txt +++ b/inference-engine/thirdparty/clDNN/CMakeLists.txt @@ -75,6 +75,9 @@ set(CLDNN__COMMON_DIR "${CMAKE_CURRENT_SOURCE_DIR}/common") # Path which points to directory
with interface for framework. set(CLDNN__API_DIR "${CMAKE_CURRENT_SOURCE_DIR}/api") +# Path which points to directory with interface extension for framework. +set(CLDNN__API_EXTENSION_DIR "${CMAKE_CURRENT_SOURCE_DIR}/api_extension") + # Path which points to directory with interface for framework. set(CLDNN__KERNEL_SELECTOR_DIR "${CMAKE_CURRENT_SOURCE_DIR}/kernel_selector") @@ -338,7 +341,7 @@ endif() if(DEFINED CLDNN__OUTPUT_DIR) set(CLDNN__OUTPUT_BIN_DIR "${CLDNN__OUTPUT_DIR}" CACHE PATH "Output directory path where the final exetuables, examples and tests will be stored.") set(CLDNN__OUTPUT_LIB_DIR "${CLDNN__OUTPUT_DIR}" CACHE PATH "Output directory path where the final libraries will be stored.") -else() +elseif(NOT DEFINED CLDNN__OUTPUT_BIN_DIR AND NOT DEFINED CLDNN__OUTPUT_LIB_DIR) # Output directory path where the final libraries, examples and tests will be stored. if(CLDNN__MULTI_CFG_GEN) # Multi-configuration generators automatically append build type subdirectory. @@ -382,6 +385,13 @@ mark_as_advanced(CLDNN__INCLUDE_TESTS) # ====================================================================================================== +# Include and build: Core Internal Tests (unit tests and small acceptance tests) for core internal clDNN framework mechanisms. +set(CLDNN__INCLUDE_CORE_INTERNAL_TESTS ON CACHE BOOL "Include and build: clDNN framework's core internal tests.") +mark_as_advanced(CLDNN__INCLUDE_CORE_INTERNAL_TESTS) + +# ====================================================================================================== + + # Include and build: clDNN tutorial. set(CLDNN__INCLUDE_TUTORIAL ON CACHE BOOL "Include and build: clDNN Tutorial.") mark_as_advanced(CLDNN__INCLUDE_TUTORIAL) @@ -394,6 +404,12 @@ mark_as_advanced(CLDNN__RUN_TESTS) # ====================================================================================================== +# Run (requires CLDNN__INCLUDE_CORE_INTERNAL_TESTS to be true): core internal tests (unit tests and small acceptance tests) for the clDNN framework. +set(CLDNN__RUN_CORE_INTERNAL_TESTS OFF CACHE BOOL "Run: clDNN framework's core internal tests.") +mark_as_advanced(CLDNN__RUN_CORE_INTERNAL_TESTS) + +# ====================================================================================================== + # Compile / Link: Use static C++ Runtime library. set(CLDNN__COMPILE_LINK_USE_STATIC_RUNTIME OFF CACHE BOOL "Compile / Link: Use static version of C++ Runtime library instead of shared one.") mark_as_advanced(CLDNN__COMPILE_LINK_USE_STATIC_RUNTIME) @@ -429,6 +445,14 @@ endif() # ====================================================================================================== +# Checking whether tests can be run. +if((NOT CLDNN__INCLUDE_CORE_INTERNAL_TESTS) AND CLDNN__RUN_CORE_INTERNAL_TESTS) + message(WARNING "[clDNN] CLDNN__INCLUDE_CORE_INTERNAL_TESTS: Running of core internal tests was selected, but the tests are not built. The option will be disabled.") + set(CLDNN__RUN_CORE_INTERNAL_TESTS OFF) +endif() + +# ====================================================================================================== + # Check for python 2.7 interpreter (required tool).
find_package(PythonInterp 2.7) if(NOT PYTHONINTERP_FOUND) @@ -534,6 +558,8 @@ unset(__CLDNN_IOclIcdDefaultVersion) unset(__CLDNN_IOclIcdVersionIdx) +# ====================================================================================================== +set(CLDNN_UTILS__RAPIDJSON_INCDIRS "utils/rapidjson" CACHE INTERNAL "Paths to interface headers for rapidjson.") # ====================================== Version Calculation =========================================== if(EXISTS "${CLDNN__VERSION_FILE_NAME}") @@ -619,17 +645,19 @@ message(STATUS "[clDNN]") message(STATUS "[clDNN]") message(STATUS "[clDNN] Advanced:") if (CLDNN__IOCL_ICD_USE_EXTERNAL) - message(STATUS "[clDNN] - ICD version used to build: N/A (installed externally)") + message(STATUS "[clDNN] - ICD version used to build: N/A (installed externally)") else() - message(STATUS "[clDNN] - ICD version used to build: ${CLDNN__IOCL_ICD_VERSION}") + message(STATUS "[clDNN] - ICD version used to build: ${CLDNN__IOCL_ICD_VERSION}") endif() message(STATUS "[clDNN]") -message(STATUS "[clDNN] - Include/Build cldnn core: ${CLDNN__INCLUDE_CORE}") -message(STATUS "[clDNN] - Include/Build kernel selector: ${CLDNN__INCLUDE_KERNEL_SELECTOR}") -message(STATUS "[clDNN] - Include/Build tests: ${CLDNN__INCLUDE_TESTS}") -message(STATUS "[clDNN] - Include/Build tutorial: ${CLDNN__INCLUDE_TUTORIAL}") +message(STATUS "[clDNN] - Include/Build cldnn core: ${CLDNN__INCLUDE_CORE}") +message(STATUS "[clDNN] - Include/Build kernel selector: ${CLDNN__INCLUDE_KERNEL_SELECTOR}") +message(STATUS "[clDNN] - Include/Build tests: ${CLDNN__INCLUDE_TESTS}") +message(STATUS "[clDNN] - Include/Build core internal tests: ${CLDNN__INCLUDE_CORE_INTERNAL_TESTS}") +message(STATUS "[clDNN] - Include/Build tutorial: ${CLDNN__INCLUDE_TUTORIAL}") message(STATUS "[clDNN]") message(STATUS "[clDNN] - Run tests: ${CLDNN__RUN_TESTS}") +message(STATUS "[clDNN] - Run core internal tests: ${CLDNN__RUN_CORE_INTERNAL_TESTS}") message(STATUS "[clDNN]") message(STATUS "[clDNN] - Use static C++ Runtime: ${CLDNN__COMPILE_LINK_USE_STATIC_RUNTIME}") message(STATUS "[clDNN] - Allow unsafe size opts: ${CLDNN__COMPILE_LINK_ALLOW_UNSAFE_SIZE_OPT}") @@ -659,10 +687,10 @@ set(CLDNN_BUILD__PROJ_LABEL__clDNN "clDNN") # Old. set(EXECUTABLE_OUTPUT_PATH "${CLDNN__OUTPUT_BIN_DIR}") -set(LIBRARY_OUTPUT_PATH "${CLDNN__OUTPUT_BIN_DIR}") +set(LIBRARY_OUTPUT_PATH "${CLDNN__OUTPUT_LIB_DIR}") # New. 
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CLDNN__OUTPUT_LIB_DIR}") -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CLDNN__OUTPUT_BIN_DIR}") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CLDNN__OUTPUT_LIB_DIR}") set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CLDNN__OUTPUT_BIN_DIR}") @@ -679,7 +707,7 @@ intel_arch_get_os(__CLDNN_TargetOs "${CLDNN__ARCHITECTURE_TARGET}") if(__CLDNN_TargetOs MATCHES "^Darwin$") set(CMAKE_INSTALL_RPATH "@executable_path") else() - set(CMAKE_INSTALL_RPATH "$ORIGIN") + set(CMAKE_INSTALL_RPATH "$ORIGIN/lib") endif() unset(__CLDNN_TargetOs) @@ -931,6 +959,7 @@ if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX) SET_RAW -Wl,-z,noexecstack,-z,relro,-z,now ) + list(APPEND CLDNN__SYSTEM_LINK_LIBRARIES "dl") endif() if((CMAKE_C_COMPILER_ID MATCHES "^Clang$") OR (CMAKE_CXX_COMPILER_ID MATCHES "^Clang$")) @@ -944,7 +973,7 @@ if((CMAKE_C_COMPILER_ID MATCHES "^Clang$") OR (CMAKE_CXX_COMPILER_ID MATCHES "^C -Wl,-headerpad_max_install_names ) - list(APPEND CLDNN__SYSTEM_LINK_LIBRARIES "c++" "c++abi" "supc++") + list(APPEND CLDNN__SYSTEM_LINK_LIBRARIES "c++" "c++abi" "supc++" "dl") endif() unset(__CLDNN_LinkerFlagName) @@ -995,6 +1024,7 @@ endif() include_directories( ${CLDNN__IOCL_ICD_INCDIRS} + ${CLDNN_UTILS__RAPIDJSON_INCDIRS} "${CLDNN__KHR_CLHPP_DIR}" "${CLDNN__CODEGEN_INCDIR}" ) @@ -1010,6 +1040,9 @@ endif() if(CLDNN__INCLUDE_TESTS) add_subdirectory(tests) endif() +if(CLDNN__INCLUDE_CORE_INTERNAL_TESTS) + add_subdirectory(tests_core_internal) +endif() if(CLDNN__INCLUDE_KERNEL_SELECTOR) add_subdirectory(kernel_selector) endif() diff --git a/inference-engine/thirdparty/clDNN/README.md b/inference-engine/thirdparty/clDNN/README.md index 6a31eb2..fc0d77e 100644 --- a/inference-engine/thirdparty/clDNN/README.md +++ b/inference-engine/thirdparty/clDNN/README.md @@ -6,7 +6,7 @@ *Compute Library for Deep Neural Networks* (*clDNN*) is an open source performance library for Deep Learning (DL) applications intended for acceleration of DL Inference on Intel® Processor Graphics – including HD Graphics and -Iris® Graphics. +Iris® Graphics. *clDNN* includes highly optimized building blocks for implementation of convolutional neural networks (CNN) with C and C++ interfaces. We created this project to enable the DL community to innovate on Intel® processors. @@ -25,6 +25,7 @@ clDNN is licensed is licensed under clDNN uses 3rd-party components licensed under following licenses: - *googletest* under [Google\* License](https://github.com/google/googletest/blob/master/googletest/LICENSE) - *OpenCL™ ICD and C++ Wrapper* under [Khronos™ License](https://github.com/KhronosGroup/OpenCL-CLHPP/blob/master/LICENSE.txt) +- *RapidJSON* under [Tencent\* License](https://github.com/Tencent/rapidjson/blob/master/license.txt) ## Documentation The latest clDNN documentation is at [GitHub pages](https://intel.github.io/clDNN/index.html). @@ -41,8 +42,126 @@ clDNN is released also together with Intel® OpenVino™ Toolkit, which contains You can find more information [here](https://software.intel.com/en-us/openvino-toolkit/deep-learning-cv). 
+## OpenVINO-specific changes + New features: + - added `not` activation type + - added `depth_to_space` layer + - new clip options in `detection_output` (cpu impl) and `proposal` layers + - added eltwise `xor` and `squared_diff` operations + - added `gather` layer + - added `bilinear` mode for position sensitive `roi_pooling` layer + - added `shuffle_channels` layer + - added `strided_slice` layer + - added IE gates ordering for lstm layer + - added `reverse_sequence` layer + Bug fixes: + - fixed unknown bool type error in C API + - fixed non-relu activation fusing with conv_eltwise node + - fixed infinite performance regression on several topologies + - minor internal fixes + - unified the permute order with cldnn's tensor order + Other: + - removed boost + - supported compilation with c++11 only + + ## Changelog +### Drop 13.1 + New features: + - added max mode for contract primitive + - added one_hot primitive + - optional explicit output data type support for all primitives + Bug fixes: + - fix for graph optimizer (crop primitive) + - fix for processing order (deconvolution primitive) + - fix for convolution-eltwise primitive + UX: + - cache.json is searched for in the library directory + Performance: + - optimizations for lstm_gemm primitive + +### Drop 13.0 + New features: + - events pool + - group support in convolution and deconvolution primitives + - broadcastable inputs support for eltwise primitive + - asymmetric padding for convolution primitive + - fused convolution-eltwise primitive (API extension) + - auto-calculated output shape support for reshape primitive + - crop support for i8/s8/i32/i64 types + - broadcast axis support for broadcast primitive + - logic and comparison operations support for eltwise primitive + Bug fixes: + - added required alignment checks for some fc implementations + - added lstm support for f16 (half) type + - reorders for fc moved to graph compiler + - primitive fusing and reorder fixes + UX: + - added internal core tests project + - refactored optimizations pass manager and passes + Performance: + - optimized concatenation during upsampling (unpool) + - IMAD-based optimizations for convolution, fc, eltwise and pooling primitives (i8/s8) + - convolution-eltwise fusing optimizations + - partial writes optimizations for block-based kernels + +### Drop 12.1 + - gtests code refactor + - buildbreak fix + +### Drop 12.0 + New features: + - pyramidRoiAlign primitive + - multiple axes support for reverse mode in index_select + - eltwise min/max/mod support for i8/i32/i64 + - broadcast support for i32/i64 + Bug fixes: + - memory leak fixes + - in-place reshape + - no padding for output primitives + UX: + - RapidJSON library for auto-tune cache + - less dependencies in program.cpp + - do not throw an error when device is not validated + - global pooling in c API + - optimized padding for convolution + +### Drop 11.0 + New features: + - throttle hints + - extended border and tile + - GPU implementation of Detection Output + - More cases for BatchNorm primitive + Bug fixes: + - GEMM fix (align with ONNX) + - memory leak fix in memory pool + - increase FC precision for fp16 (fp32 accumulator) + Performance: + - cache for new topologies and devices + - conv1x1 with stride >1 into eltwise optimization + +### Drop 10.0 + New features: + - condition primitive + - fused convolution with bn and scale (backprop) + - scale/shift and mean/var as outputs in batch norm + - add LSTM output selection + Bug fixes: + - memory pool fixes + UX: + - downgrade to cxx11 + - add support for u8
data type in custom primitive + - library size optimizations + Performance: + - in place concatenation optimization + - conv1x1 with stride >1 into eltwise optimization + +### Drop 9.2 + New features: + - local convolution + - eltwise with stride + ### Drop 9.1 New features: - select index primitive @@ -161,7 +280,7 @@ You can find more information [here](https://software.intel.com/en-us/openvino-t - reorder optimization - concatenation optimization - eltwise optimization - - activation fusing + - activation fusing ### Drop 3.0 Added: @@ -183,7 +302,7 @@ You can find more information [here](https://software.intel.com/en-us/openvino-t - initial drop of clDNN ## Support -Please report issues and suggestions +Please report issues and suggestions [GitHub issues](https://github.com/01org/cldnn/issues). ## How to Contribute @@ -224,7 +343,7 @@ clDNN supports Intel® HD Graphics and Intel® Iris® Graphics and is optimized * Intel® Iris® Graphics 650 (GT3e, *client* market) * Intel® HD Graphics P630 (GT2, *server* market) * Intel® Iris® Pro Graphics 630 (GT2, *server* market) - + clDNN currently uses OpenCL™ with multiple Intel® OpenCL™ extensions and requires Intel® Graphics Driver to run. clDNN requires CPU with Intel® SSE/Intel® AVX support. @@ -232,9 +351,9 @@ clDNN requires CPU with Intel® SSE/Intel® AVX support. --- The software dependencies are: -- [CMake\*](https://cmake.org/download/) 3.5 or later +- [CMake\*](https://cmake.org/download/) 3.5 or later - C++ compiler with C++11 standard support compatible with: - * GNU\* Compiler Collection 4.8 or later + * GNU\* Compiler Collection 4.8 or later * clang 3.5 or later * [Intel® C++ Compiler](https://software.intel.com/en-us/intel-parallel-studio-xe) 17.0 or later * Visual C++ 2015 (MSVC++ 19.0) or later @@ -242,10 +361,10 @@ The software dependencies are: > Intel® CPU intrinsics header (``) must be available during compilation. - [python™](https://www.python.org/downloads/) 2.7 or later (scripts are both compatible with python™ 2.7.x and python™ 3.x) -- *(optional)* [Doxygen\*](http://www.stack.nl/~dimitri/doxygen/download.html) 1.8.13 or later +- *(optional)* [Doxygen\*](http://www.stack.nl/~dimitri/doxygen/download.html) 1.8.13 or later Needed for manual generation of documentation from inline comments or running `docs` custom target which will generate it automatically. -> [GraphViz\*](http://www.graphviz.org/Download..php) (2.38 or later) is also recommended to generate documentation with all embedded diagrams. +> [GraphViz\*](http://www.graphviz.org/Download..php) (2.38 or later) is also recommended to generate documentation with all embedded diagrams. (Make sure that `dot` application is visible in the `PATH` environment variable.) --- @@ -275,14 +394,14 @@ clDNN uses multiple 3rd-party components. They are stored in binary f --- -clDNN uses a CMake-based build system. You can use CMake command-line tool or CMake GUI (`cmake-gui`) to generate required solution. +clDNN uses a CMake-based build system. You can use CMake command-line tool or CMake GUI (`cmake-gui`) to generate required solution. For Windows system, you can call in `cmd` (or `powershell`): ```shellscript @REM Generate 32-bit solution (solution contains multiple build configurations)... cmake -E make_directory build && cd build && cmake -G "Visual Studio 14 2015" .. @REM Generate 64-bit solution (solution contains multiple build configurations)... cmake -E make_directory build && cd build && cmake -G "Visual Studio 14 2015 Win64" ..
-``` +``` Created solution can be opened in Visual Studio 2015 or built using appropriate `msbuild` tool (you can also use `cmake --build .` to select build tool automatically). @@ -324,7 +443,7 @@ CMake solution offers multiple options which you can specify using normal CMake | CLDNN__RUN_TESTS | BOOL | Run tests after building `tests` project. This option requires `CLDNN__INCLUDE_TESTS` option to be `ON`. Default: `OFF` | | | | | | CLDNN__CMAKE_DEBUG | BOOL | Enable extended debug messages in CMake. Default: `OFF` | - + --- clDNN includes unit tests implemented using the googletest framework. To validate your build, run `tests` target, e.g.: diff --git a/inference-engine/thirdparty/clDNN/api/C/batch_norm.h b/inference-engine/thirdparty/clDNN/api/C/batch_norm.h index c35351c..e108a41 100644 --- a/inference-engine/thirdparty/clDNN/api/C/batch_norm.h +++ b/inference-engine/thirdparty/clDNN/api/C/batch_norm.h @@ -37,13 +37,17 @@ extern "C" { /// /// Algorithm: /// @n global stats can be computed as: -/// @n out[i] = in[i] - mean[b] / sqrt(variance[b] + epsilon) +/// @n out[i] = ( (in[i] - mean[b]) / sqrt(variance[b] + epsilon) ) * scale[b] + shift[b] CLDNN_BEGIN_PRIMITIVE_DESC(batch_norm) /// @brief Primitive id containing mean data. cldnn_primitive_id mean; /// @brief Primitive id containing variance. cldnn_primitive_id variance; +/// @brief Primitive id containing scale. +cldnn_primitive_id scale; +/// @brief Primitive id containing shift. +cldnn_primitive_id shift; /// @brief Primitive id containing inverted variance used in future gradient computing. cldnn_primitive_id inv_variance; /// @brief Epsilon. diff --git a/inference-engine/thirdparty/clDNN/api/C/border.h b/inference-engine/thirdparty/clDNN/api/C/border.h index a7b90fb..5537ca7 100644 --- a/inference-engine/thirdparty/clDNN/api/C/border.h +++ b/inference-engine/thirdparty/clDNN/api/C/border.h @@ -36,18 +36,19 @@ typedef enum /*:int32_t*/ { /// @brief All points in the border are set to constant value. cldnn_border_constant, + cldnn_border_zero = cldnn_border_constant, /// keep backward compatibility /// @brief Border is constructed as an mirror of image (edge is also mirrored). /// @details Size of border in any dimension cannot be larger than size of /// input in the same dimension. cldnn_border_mirror, - /// @brief Border is constructed as an replication of edge. - /// @details Size of border in any dimension cannot be larger than size of - /// input in the same dimension. - cldnn_border_edge, /// @brief Border is constructed as an mirror of image (edge is NOT mirrored). /// @details Size of border in any dimension cannot be larger than size of /// input in the same dimension decreased by @c 1. - cldnn_border_mirror_101 + cldnn_border_mirror_101, + /// @brief Border is constructed as a replication of the edge. + /// @details Size of border in any dimension cannot be larger than size of + /// input in the same dimension. + cldnn_border_edge } cldnn_border_type; diff --git a/inference-engine/thirdparty/clDNN/api/C/broadcast.h b/inference-engine/thirdparty/clDNN/api/C/broadcast.h index d431b5c..d820de2 100644 --- a/inference-engine/thirdparty/clDNN/api/C/broadcast.h +++ b/inference-engine/thirdparty/clDNN/api/C/broadcast.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License.
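// A minimal scalar sketch of the global-stats batch_norm formula documented
// above, assuming a single mean/variance/scale/shift value per feature map b;
// the helper name and signature are illustrative only and are not part of the
// clDNN C API.
#include <cmath>
#include <cstddef>
#include <vector>

static std::vector<float> batch_norm_ref(const std::vector<float>& in, float mean,
                                         float variance, float scale, float shift,
                                         float epsilon) {
    std::vector<float> out(in.size());
    for (std::size_t i = 0; i < in.size(); ++i) {
        // out[i] = ((in[i] - mean[b]) / sqrt(variance[b] + epsilon)) * scale[b] + shift[b]
        out[i] = ((in[i] - mean) / std::sqrt(variance + epsilon)) * scale + shift;
    }
    return out;
}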
@@ -30,14 +30,16 @@ extern "C" { #endif -/// @brief Broadcasts input to specified output size (broadcast size). +/// @brief Broadcasts input to the output shape defined by @p broadcast_sizes. @p broadcast_axes are used to +/// reinterpret (reshape) the input inside the algorithm. /// -/// @details Takes input and copies it to output once or multiple times, until output will -/// reach the sizes specified in @p broadcast_sizes. +/// @details Takes the input, reinterprets it according to @p broadcast_axes, +/// and copies it to the output once or multiple times. /// @n -/// @n Lets assume that: +/// @n Simple example with empty @p broadcast_axes. Let's assume that: /// @n input_sizes = (in_b, in_f, in_y, in_x) /// @n broadcast_sizes = (bs_b, bs_f, bs_y, bs_x) +/// @n broadcast_axes = () - empty /// @n The input is broadcasted on each dimension where bs_{dim} > in_{dim} and bs_{dim} /// is dividable by in_{dim} (input is copied bs_{dim} / in_{dim} times). /// The dimensions where bs_{dim} is equal to in_{dim} remain unchanged. @@ -46,22 +48,36 @@ extern "C" { /// @n output[(b, f, y, x)] = input[(b % in_b, f % in_f, y % in_y, x % in_x)] /// @n where (b, f, y, x) is a position of value in a primitive output. /// @n +/// @n More complicated example with non-empty @p broadcast_axes. Let's assume that: +/// @n broadcast_sizes = (bs_b, bs_f, bs_y, bs_x) +/// @n broadcast_axes = (2) +/// @n Taking into account the broadcast_axes size (= 1), the primitive's input rank must be 4 - 1 = 3: +/// @n primitive input = (1, in_b, in_f, in_x) +/// @n Due to broadcast_axes = (2), the primitive will interpret the input as: +/// @n primitive input (internal representation) = (in_b, in_f, 1, in_x) +/// @n Now you can apply the broadcast rules from the previous example to the modified (reinterpreted) +/// input and output: +/// @n input_sizes = (in_b, in_f, 1, in_x) +/// @n output_shape = (bs_b, bs_f, bs_y, bs_x) +/// @n broadcast_axes = () - empty +/// @n /// @n@b Requirements: -/// @n - @p broadcast_sizes must be positive on all dimensions and compatible -/// with size of input (describe the same dimensions). -/// @n - @p broadcast_sizes must be greater than or equal to input sizes on -/// all dimensions. (For any dimension, if @p broadcast_sizes is lower -/// than input size on the dimension then @p broadcast_sizes will be replaced -/// by input size on this dimension.) -/// @n - For any dimension, if @p broadcast_sizes is greater than input size on -/// the dimension then @p broadcast_sizes must be dividable by input size -/// on this dimension. -/// @n Breaking any of these conditions will raise an exeption. +/// @n - @p broadcast_sizes must be positive on all dimensions. +/// @n - @p broadcast_axes size (dimensions count) must be within (inclusive) range +/// 0 - 4. +/// @n - @p broadcast_axes must not have duplicate values. +/// @n - Values of @p broadcast_axes must be within (inclusive) range 0 - 3. +/// @n - @p output_shape must be greater than (and divisible by) or equal to the reinterpreted +/// input on all dimensions. +/// @n Breaking any of these conditions will raise an exception. CLDNN_BEGIN_PRIMITIVE_DESC(broadcast) /// @brief Sizes of broadcast. Output size of current primitive will match broadcast sizes (layout type /// will not change). -/// If @p broadcast_sizes are not specified (all zeros), the input sizes are used as @p broadcast_sizes. cldnn_tensor broadcast_sizes; +/// @brief Array of axes positions from output shape (0-based, from left to right) +/// along which broadcast should happen.
+cldnn_uint16_t_arr broadcast_axes; + CLDNN_END_PRIMITIVE_DESC(broadcast) diff --git a/inference-engine/thirdparty/clDNN/api/C/cldnn.h b/inference-engine/thirdparty/clDNN/api/C/cldnn.h index 6a61b9e..9b705fb 100644 --- a/inference-engine/thirdparty/clDNN/api/C/cldnn.h +++ b/inference-engine/thirdparty/clDNN/api/C/cldnn.h @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2018 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -157,8 +157,10 @@ typedef struct const char* engine_log; ///< Specifies a file to which engine log should be dumped. Null/empty values means no logging. const char* sources_dumps_dir; ///< Specifies a directory where sources of cldnn::program objects should be dumped. Null/empty values means no loggins. /*cldnn_priority_mode_type*/ int16_t priority_mode; ///< Priority mode (support of OpenCL priority hints in command queue). - /*cldnn_throttle_mode_type*/ int16_t throttle_mode; ///< Placeholder for throttle mode (support of throttle hints in command queue). It has no effect for now and should be set to cldnn_throttle_disabled. + /*cldnn_throttle_mode_type*/ int16_t throttle_mode; ///< Throttle mode (support of throttle hints in command queue). uint32_t enable_memory_pool; ///< Enables memory usage optimization. memory objects will be reused when possible. + void* context; + const char* tuning_cache_path; ///< Enables defining other than default path to tuning cache json } cldnn_engine_configuration; /// @brief Information about the engine returned by cldnn_get_engine_info(). @@ -212,7 +214,8 @@ typedef enum /*:int32_t*/ cldnn_build_option_graph_dumps_dir, ///< Specifies a directory to which stages of network compilation should be dumped. cldnn_build_option_serialization, ///< Specifies a name of files to which serialization should be dumped. cldnn_build_option_load_program, ///< Specifies a name of load_program process. - cldnn_build_option_learning_config ///< User defined learning parameters. + cldnn_build_option_learning_config, ///< User defined learning parameters. + cldnn_build_option_detection_output_gpu ///< Run detection output layer always on GPU, regardless performance } cldnn_build_option_type; /// @brief Tuning modes. @@ -275,6 +278,8 @@ typedef enum /*:int32_t*/ cldnn_format_fyxb, ///< format not used inside clDNN, but supported in reorder as extension for user provided formats. cldnn_format_os_iyx_osv16, ///< format used only for convolution weights: os - output feature maps slice, i - input feature maps, yx - spatials, sv16 - 16 values of single slice. ///< \n \image html os_iyx_osv16.jpg + cldnn_format_os_iyx_osv32, ///< format used only for convolution weights: os - output feature maps slice, i - input feature maps, yx - spatials, sv32 - 32 values of single slice. + cldnn_format_os_iyx_osv64, ///< format used only for convolution weights: os - output feature maps slice, i - input feature maps, yx - spatials, sv64 - 64 values of single slice. cldnn_format_bs_xs_xsv8_bsv8, ///< format used only for fully connected weights: bs - batch slice, xs - x slice, bsv8 - 8 values of single slice. ///< \n \image html bs_xs_xsv8_bsv8.jpg cldnn_format_bs_xs_xsv8_bsv16,///< format used only for fully connected weights: bs - batch slice, xs - x slice, bsv16 - 16 values of single slice. 
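With empty @p broadcast_axes, the broadcast rule documented above reduces to modulo indexing. A plain-C++ reference sketch of that rule (illustrative only, not the clDNN kernel; it assumes each bs_* is a positive multiple of the matching in_*, as the requirements demand):

```cpp
#include <cstddef>

// output[(b, f, y, x)] = input[(b % in_b, f % in_f, y % in_y, x % in_x)],
// with both tensors stored densely in b-f-y-x order. Reference semantics only.
void broadcast_ref(const float* in, float* out,
                   std::size_t in_b, std::size_t in_f, std::size_t in_y, std::size_t in_x,
                   std::size_t bs_b, std::size_t bs_f, std::size_t bs_y, std::size_t bs_x)
{
    for (std::size_t b = 0; b < bs_b; ++b)
        for (std::size_t f = 0; f < bs_f; ++f)
            for (std::size_t y = 0; y < bs_y; ++y)
                for (std::size_t x = 0; x < bs_x; ++x)
                    out[((b * bs_f + f) * bs_y + y) * bs_x + x] =
                        in[(((b % in_b) * in_f + (f % in_f)) * in_y + (y % in_y)) * in_x + (x % in_x)];
}
```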
@@ -287,10 +292,23 @@ typedef enum /*:int32_t*/ ///< \n \image html image_2d_weights_c4_fyx_b.jpg cldnn_format_image_2d_weights_c1_b_fyx, ///< image format for weights, image 2d, single channel, width size is b, height is f*y*x ///< \n \image html image_2d_weights_c1_b_fyx.jpg - cldnn_format_byxf_af32, /// < \n format for input for primitives using MMAD - cldnn_format_fs_bs_yx_bs4_fs32, /// < \n format for batched input for primitives using MMAD + cldnn_format_winograd_2x3_s1_data, ///< format used for input for winograd convolution, F(2,3) -- filter 3x3 with stride 1 + cldnn_format_winograd_2x3_s1_weights, ///< format used for weights for winograd non-fused convolution, F(2,3) -- filter 3x3 with stride 1 + cldnn_format_winograd_2x3_s1_fused_weights, ///< format used for weights for winograd fused convolution, F(2,3) -- filter 3x3 with stride 1 + cldnn_format_winograd_6x3_s1_fused_weights, ///< format used for weights for winograd fused convolution, F(6,3) -- filter 3x3 with stride 1 + cldnn_format_image_2d_weights_winograd_6x3_s1_fbxyb, ///< image format used for weights for winograd fused convolution, F(6,3) -- filter 3x3 with stride 1 + cldnn_format_image_2d_weights_winograd_6x3_s1_xfbyb, ///< image format used for weights for winograd fused convolution, F(6,3) -- filter 3x3 with stride 1 + cldnn_format_byxf_af32, /// < \n format for input for primitives using MMAD + cldnn_format_byx8_f4, /// < \n format for input for MMAD convolutions + cldnn_format_fs_bs_yx_bs4_fs32, /// < \n format for batched input for primitives using MMAD cldnn_format_os_is_yx_isa8_osv8_isv4, /// < \n format for weights for MMAD convolutions, stored as ((aligned_to_8(O)/8) * (aligned_to_32(I)/32) * Y * X * ( 8 ) * ( 8 ) * ( 4 ) + cldnn_format_os_is_yx_isa8_osv8_isv4_swizzled_by_4, /// < \n format for weights for MMAD convolutions cldnn_format_is_o_yx_isv32, /// < \n format for weights for 1x1 MMAD convolutions + cldnn_format_is_o32_yx_isv32_swizzled_by_4, /// < \n format for weights for 1x1 MMAD convolutions + cldnn_format_os_is_y_x8_osv8_isv4, /// < n\ format for weights for MMAD convolutions + cldnn_bf_lyx_yx, /// < \n format for local convolution weights + cldnn_format_b_fs_yx_fsv4, /// < \n format for input for IMAD convolutions + cldnn_format_os_is_yx_osv16_isv4, /// < \n format for weights for IMAD convolutions cldnn_format_format_num, ///< number of format types cldnn_format_any = -1 } cldnn_format_type; @@ -301,6 +319,7 @@ typedef enum /*:int32_t*/ #define CLDNN_TENSOR_BATCH_DIM_MAX 1 #define CLDNN_TENSOR_FEATURE_DIM_MAX 1 #define CLDNN_TENSOR_SPATIAL_DIM_MAX 2 +#define CLDNN_TENSOR_LOCAL_DIM_MAX 2 #define CLDNN_TENSOR_DIM_MAX 8 /// @brief N-dimensional vector. Mostly used to represent memory size. @@ -309,6 +328,7 @@ typedef struct size_t batch_num; size_t feature_num; size_t spatial_num; + size_t local_num; int32_t sizes[CLDNN_TENSOR_DIM_MAX]; } cldnn_tensor; @@ -361,6 +381,13 @@ typedef struct size_t size; ///< Size (in uint16_t) of the array. } cldnn_uint16_t_arr; +/// @brief Represents reference to an array of uint8_t. +typedef struct +{ + const uint8_t* data; ///< Pointer to uint8_t array. + size_t size; ///< Size (in uint8_t) of the array. +} cldnn_uint8_t_arr; + /// @brief Represents reference to an array of tensor. typedef struct { @@ -381,6 +408,13 @@ typedef struct size_t size; ///< Number of ids in the array. } cldnn_primitive_id_arr; +typedef struct +{ + cldnn_data_type data_type; + // No bool type available... 
+ char enabled; +} cldnn_optional_data_type; + /// @brief Custom primitive kernel source code typedef const char* cldnn_kernel_code; /// @brief Custom primitive kernel source code array @@ -434,8 +468,9 @@ typedef enum cldnn_activation_func_t activation_acos, // acos(val) activation_cosh, // cosh(val) activation_log, // log(val) - activation_log2, // log2(val) + activation_log2, // log2(val) activation_exp, // exp(val) + activation_not // !(val) } cldnn_activation_func; /// @brief activation gradient functions @@ -452,6 +487,17 @@ typedef struct cldnn_activation_additional_params_t float a, b; } cldnn_activation_additional_params; +/// @brief Axis which index_select primitive will index. +typedef enum index_select_axis_name_t +{ + along_b, + along_f, + along_y, + along_x +} index_select_axis_name; + +/// @brief Array of axes which the index_select primitive will index. +typedef const index_select_axis_name* index_select_axis_name_arr; /// @brief reorder mean operation modes typedef enum cldnn_reorder_mean_mode_t @@ -470,7 +516,8 @@ typedef enum cldnn_reorder_mean_mode_t cldnn_primitive_type_id type; /**< @brief Primitive type identificator. */\ cldnn_primitive_id id; /**< @brief Primitive id unique within a topology. */\ cldnn_primitive_id_arr input; /**< @brief Input primitives ids. */\ - cldnn_padding output_padding; /**< @brief Output padding information. */ + cldnn_padding output_padding; /**< @brief Output padding information. */\ + cldnn_optional_data_type output_data_type; /**< @brief If specified, describes an explicit change of the output precision of the primitive. */ /// @brief Close primitive descriptor definition. #define CLDNN_END_PRIMITIVE_DESC(PType) }; diff --git a/inference-engine/thirdparty/clDNN/api/C/condition.h b/inference-engine/thirdparty/clDNN/api/C/condition.h new file mode 100644 index 0000000..425803e --- /dev/null +++ b/inference-engine/thirdparty/clDNN/api/C/condition.h @@ -0,0 +1,70 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#ifndef CONDITION_H +#define CONDITION_H + +#include "cldnn.h" + + +/// @addtogroup c_api C API +/// @{ +/// @addtogroup c_topology Network Topology +/// @{ +/// @addtogroup c_primitives Primitives +/// @{ + +#ifdef __cplusplus +extern "C" { +#endif + +/// @brief Function which will be used during comparison. +typedef enum /*:int32_t*/ +{ + EQUAL, + GREATER, + LESS +} cldnn_cond_functions; + +/// @brief Adds primitive, which works like "if". +/// +/// @details +/// @n Applies a comparison between 2 inputs. +/// @n Compare data - the sizes of that input specify the range of the comparison. +/// @n Offset - offset in memory used when comparing values. CLDNN_BEGIN_PRIMITIVE_DESC(condition) /// @brief An identifier of topology, which will be executed when comparison returns true.
+cldnn_topology topology_true; /// @brief An identifier of topology, which will be executed when comparison returns false. +cldnn_topology topology_false; /// @brief An identifier of primitive which contains compare values. +cldnn_primitive_id compare_data; /// @brief Function used during comparison. +cldnn_cond_functions function; /// @brief Offset for compare data. +cldnn_tensor offset; + +CLDNN_END_PRIMITIVE_DESC(condition) +CLDNN_DECLARE_PRIMITIVE_TYPE_ID(condition); + + +#ifdef __cplusplus +} +#endif + +/// @} +/// @} +/// @} +#endif // CONDITION_H diff --git a/inference-engine/thirdparty/clDNN/api/C/contract.h b/inference-engine/thirdparty/clDNN/api/C/contract.h new file mode 100644 index 0000000..9e12cb8 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/api/C/contract.h @@ -0,0 +1,89 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#ifndef CONTRACT_H +#define CONTRACT_H + +#include "cldnn.h" + + +/// @addtogroup c_api C API +/// @{ +/// @addtogroup c_topology Network Topology +/// @{ +/// @addtogroup c_primitives Primitives +/// @{ + +#ifdef __cplusplus +extern "C" { +#endif + + /// @brief Select reduction operation for contract layer ( @CLDNN_PRIMITIVE_DESC{contract} ). + typedef enum /*:int32_t*/ + { + /// @brief Sum reduction. + cldnn_contract_sum, + /// @brief Product reduction. + cldnn_contract_product, + /// @brief All reduction. + cldnn_contract_all, + /// @brief Any reduction. + cldnn_contract_any, + /// @brief Max reduction. + cldnn_contract_max + } cldnn_contract_mode; + + /// @brief Reduces input with an operation defined by @p mode along the dimensions defined + /// by @p reduction_axes. + /// + /// @details Reduces the input using the binary operation determined by + /// @p mode. The @p reduction_axes determine the final shape of the + /// output, which is calculated based on the input shape by + /// collapsing the dimensions along which the reduction happens. + /// For example, for the input with + /// @n input_sizes = (in_b, in_f, in_y, in_x) + /// @n a reduction with + /// @n reduction_axes = (2) + /// @n would collapse the Y dimension, producing + /// @n output_shape = (1, in_b, in_f, in_x) + /// @n where every element is a @p mode reduction of the input elements with + /// @n the same B, F and X coordinates. + /// @n + /// @n@b Requirements: + /// @n - @p reduction_axes size (dimensions count) must be within (inclusive) range + /// 1 - 4. + /// @n - @p reduction_axes must not have duplicate values. + /// @n - Values of @p reduction_axes must be within (inclusive) range 0 - 3. + /// @n Breaking any of these conditions will raise an exception. + CLDNN_BEGIN_PRIMITIVE_DESC(contract) + /// @brief Reduction mode. See #cldnn_contract_mode. + int32_t mode; /*cldnn_contract_mode*/ + /// @brief Array of axes positions from input shape (0-based, from left to right) + /// along which reduction should happen.
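For the contract primitive documented above, a plain-C++ sketch of the sum reduction with reduction_axes = (2), collapsing the Y dimension (reference semantics only, not the clDNN kernel; the function name is illustrative):

```cpp
#include <cstddef>

// out[(0, b, f, x)] = sum over y of in[(b, f, y, x)], for mode = cldnn_contract_sum
// and reduction_axes = (2), matching the worked example in the header comment.
void contract_sum_y_ref(const float* in, float* out,
                        std::size_t in_b, std::size_t in_f,
                        std::size_t in_y, std::size_t in_x)
{
    for (std::size_t b = 0; b < in_b; ++b)
        for (std::size_t f = 0; f < in_f; ++f)
            for (std::size_t x = 0; x < in_x; ++x)
            {
                float acc = 0.0f;
                for (std::size_t y = 0; y < in_y; ++y)
                    acc += in[((b * in_f + f) * in_y + y) * in_x + x];
                out[(b * in_f + f) * in_x + x] = acc;
            }
}
```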
+ cldnn_uint16_t_arr reduction_axes; + + CLDNN_END_PRIMITIVE_DESC(contract) + + + CLDNN_DECLARE_PRIMITIVE_TYPE_ID(contract); + +#ifdef __cplusplus +} +#endif + +/// @} +/// @} +/// @} +#endif // CONTRACT_H diff --git a/inference-engine/thirdparty/clDNN/api/C/convolution.h b/inference-engine/thirdparty/clDNN/api/C/convolution.h index 4be5c23..bd79ed2 100644 --- a/inference-engine/thirdparty/clDNN/api/C/convolution.h +++ b/inference-engine/thirdparty/clDNN/api/C/convolution.h @@ -64,6 +64,12 @@ cldnn_primitive_id_arr output_calibration_factors; float input_quantization_factor; /// @brief Output quantization factor float output_quantization_factor; +/// @brief Number of feature groups (grouped convolution). If more than 1 then weights/bias count needs to be 1. +uint32_t groups; +/// @brief Defines a padding added to input image on left (x axis) and top (y axis). +cldnn_tensor padding_above; +/// @brief Defines a padding added to input image on right (x axis) and bottom (y axis). +cldnn_tensor padding_below; CLDNN_END_PRIMITIVE_DESC(convolution) diff --git a/inference-engine/thirdparty/clDNN/api/C/convolution_grad_weights.h b/inference-engine/thirdparty/clDNN/api/C/convolution_grad_weights.h index aacd8ff..ebf783b 100644 --- a/inference-engine/thirdparty/clDNN/api/C/convolution_grad_weights.h +++ b/inference-engine/thirdparty/clDNN/api/C/convolution_grad_weights.h @@ -18,6 +18,7 @@ #ifndef CONVOLUTION_GRAD_WEIGHTS_H #define CONVOLUTION_GRAD_WEIGHTS_H +#include <stdbool.h> #include "cldnn.h" /// @addtogroup c_api C API /// @{ @@ -54,6 +55,9 @@ cldnn_primitive_id conv_grad; cldnn_primitive_id_arr prev_weights_grad; /// @brief Array of primitive ids containing bias gradient data calculated in previous iteration. Amount of primitives and their memory sizes should be same as biases. cldnn_primitive_id_arr prev_bias_grad; +/// @brief Indicates whether the primitive should output the weights gradient (delta). +bool output_grad_w; + CLDNN_END_PRIMITIVE_DESC(convolution_grad_weights) CLDNN_DECLARE_PRIMITIVE_TYPE_ID(convolution_grad_weights); diff --git a/inference-engine/thirdparty/clDNN/api/C/crop.h b/inference-engine/thirdparty/clDNN/api/C/crop.h index fd977f0..caa7bf1 100644 --- a/inference-engine/thirdparty/clDNN/api/C/crop.h +++ b/inference-engine/thirdparty/clDNN/api/C/crop.h @@ -31,7 +31,8 @@ extern "C" { #endif /// @brief Performs crop operation on input. -/// @details Crops the input to the shape of reference_input accross all dimensions taking into account specified input offsets. +/// @details Crops the input to the shape of reference_input across all dimensions taking into account specified input offsets. +/// @n The borders variant calculates the output shape as the input shape minus the specified borders. /// @n /// @n\b Examples /// @n Crop without offset example: @@ -39,17 +40,24 @@ extern "C" { /// @n Crop with offset example: /// \image html crop_w_offset.jpg /// @n -/// @n\b Requirements -/// @n - Input, reference and offset layout (order) has to be the same +/// @n\b Requirements (reference size variant) /// @n - Input size cannot be greater than reference size in any dimension /// @n - All sizes have to have positive numbers /// @n - Reference size plus offset cannot exceed input size -/// @n Breaking any of this conditions will cause exeption throw. - +/// @n +/// @n\b Requirements (borders variant) +/// @n - Borders support batch, feature and spatial dimensions (rest of dimensions ignored).
+/// @n - Input size cannot be greater than reference size in any dimension +/// @n - All sizes specified in borders have to have non-negative values (positive or @c 0). +/// @n - Sum of sizes of opposite borders must be lower than input size (on all non-ignored dimensions). +/// @n +/// @n Breaking any of these conditions will cause an exception. CLDNN_BEGIN_PRIMITIVE_DESC(crop) -/// @brief Reference input tensor with the required dimensions. +/// @brief Reference input tensor with the required dimensions (if positive) or +/// negated value of right/bottom/upper border size (if non-positive). cldnn_tensor reference_input; -/// @brief Input offsets. +/// @brief Input offsets (if reference_input is positive) or left/top/lower border +/// size (if reference_input is negative). cldnn_tensor offsets; CLDNN_END_PRIMITIVE_DESC(crop) diff --git a/inference-engine/thirdparty/clDNN/api/C/deconvolution.h b/inference-engine/thirdparty/clDNN/api/C/deconvolution.h index dd1b8e5..a1f0347 100644 --- a/inference-engine/thirdparty/clDNN/api/C/deconvolution.h +++ b/inference-engine/thirdparty/clDNN/api/C/deconvolution.h @@ -54,6 +54,8 @@ cldnn_primitive_id_arr weights; cldnn_primitive_id_arr bias; /// @brief Indicates that deconvolution is used for convolution backward computation (convolution_grad_input) uint32_t gradient; +/// @brief Number of feature groups (grouped deconvolution). If more than 1 then weights/bias count needs to be 1. +uint32_t groups; CLDNN_END_PRIMITIVE_DESC(deconvolution) CLDNN_DECLARE_PRIMITIVE_TYPE_ID(deconvolution); diff --git a/inference-engine/thirdparty/clDNN/api/C/depth_to_space.h b/inference-engine/thirdparty/clDNN/api/C/depth_to_space.h new file mode 100644 index 0000000..64e579e --- /dev/null +++ b/inference-engine/thirdparty/clDNN/api/C/depth_to_space.h @@ -0,0 +1,49 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#ifndef DEPTH_TO_SPACE_H +#define DEPTH_TO_SPACE_H + +#include "cldnn.h" + + +/// @addtogroup c_api C API +/// @{ +/// @addtogroup c_topology Network Topology +/// @{ +/// @addtogroup c_primitives Primitives +/// @{ + +#ifdef __cplusplus +extern "C" { +#endif + +CLDNN_BEGIN_PRIMITIVE_DESC(depth_to_space) +/// @brief Size of spatial block in the output tensor. Should be >= 2.
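For depth_to_space, the block_size field below determines the shape relation between input and output. A small C++ sketch of that relation under standard DepthToSpace semantics (an assumption on my part; the header itself only states that block_size must be >= 2, and the struct/function names here are illustrative):

```cpp
#include <cassert>
#include <cstddef>

struct Shape { std::size_t b, f, y, x; };

// Standard DepthToSpace shape rule: features shrink by block_size^2,
// both spatial dimensions grow by block_size. Shape check only, not the kernel.
Shape depth_to_space_shape(Shape in, std::size_t block_size)
{
    assert(block_size >= 2 && in.f % (block_size * block_size) == 0);
    return { in.b, in.f / (block_size * block_size),
             in.y * block_size, in.x * block_size };
}
```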
+size_t block_size; +CLDNN_END_PRIMITIVE_DESC(depth_to_space) + +CLDNN_DECLARE_PRIMITIVE_TYPE_ID(depth_to_space); + +#ifdef __cplusplus +} +#endif + +/// @} +/// @} +/// @} +#endif // DEPTH_TO_SPACE_H diff --git a/inference-engine/thirdparty/clDNN/api/C/detection_output.h b/inference-engine/thirdparty/clDNN/api/C/detection_output.h index 38d71d5..82e1d03 100644 --- a/inference-engine/thirdparty/clDNN/api/C/detection_output.h +++ b/inference-engine/thirdparty/clDNN/api/C/detection_output.h @@ -40,7 +40,7 @@ typedef enum /*:int32_t*/ /// @brief Generates a list of detections based on location and confidence predictions by doing non maximum suppression. /// @details Each row is a 7 dimension vector, which stores: [image_id, label, confidence, xmin, ymin, xmax, ymax]. -/// If number of detections per image is lower than keep_top_k, will write dummy results at the end with image_id=-1. +/// If number of detections per image is lower than keep_top_k, will write dummy results at the end with image_id=-1. CLDNN_BEGIN_PRIMITIVE_DESC(detection_output) /// @brief Number of classes to be predicted. uint32_t num_classes; @@ -74,8 +74,10 @@ int32_t input_width; int32_t input_height; /// @brief Decrease label id to skip background label equal to 0. Can't be used simultaneously with background_label_id. int32_t decrease_label_id; -/// @brief Clip decoded boxes -int32_t clip; +/// @brief Clip decoded boxes right after decoding +int32_t clip_before_nms; +/// @brief Clip decoded boxes after NMS step +int32_t clip_after_nms; CLDNN_END_PRIMITIVE_DESC(detection_output) CLDNN_DECLARE_PRIMITIVE_TYPE_ID(detection_output); diff --git a/inference-engine/thirdparty/clDNN/api/C/detection_output_sort.h b/inference-engine/thirdparty/clDNN/api/C/detection_output_sort.h new file mode 100644 index 0000000..b1e5f38 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/api/C/detection_output_sort.h @@ -0,0 +1,60 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#ifndef DETECTION_OUTPUT_SORT_H +#define DETECTION_OUTPUT_SORT_H + +#include "cldnn.h" +/// @addtogroup c_api C API +/// @{ +/// @addtogroup c_topology Network Topology +/// @{ +/// @addtogroup c_primitives Primitives +/// @{ + +#ifdef __cplusplus +extern "C" { +#endif + + /// @brief Generates a list of detections based on location and confidence predictions by doing non maximum suppression. + /// @details Each row is a 7 dimension vector, which stores: [image_id, label, confidence, xmin, ymin, xmax, ymax]. + /// If number of detections per image is lower than keep_top_k, will write dummy results at the end with image_id=-1. + CLDNN_BEGIN_PRIMITIVE_DESC(detection_output_sort) + /// @brief Number of classes to be predicted. + uint32_t num_classes; + /// @brief Number of images to be processed. + uint32_t num_images; + /// @brief Number of total bounding boxes to be kept per image after NMS step.
+ uint32_t keep_top_k; + /// @brief If true, bounding boxes are shared among different classes. + uint32_t share_location; + /// @brief Maximum number of results to be kept in NMS. + int top_k; + /// @brief Background label id (-1 if there is no background class). + int background_label_id; + CLDNN_END_PRIMITIVE_DESC(detection_output_sort) + + CLDNN_DECLARE_PRIMITIVE_TYPE_ID(detection_output_sort); + +#ifdef __cplusplus +} +#endif + +/// @} +/// @} +/// @} +#endif /* DETECTION_OUTPUT_SORT_H */ diff --git a/inference-engine/thirdparty/clDNN/api/C/eltwise.h b/inference-engine/thirdparty/clDNN/api/C/eltwise.h index 1668fdd..e0f8a79 100644 --- a/inference-engine/thirdparty/clDNN/api/C/eltwise.h +++ b/inference-engine/thirdparty/clDNN/api/C/eltwise.h @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -48,13 +48,34 @@ typedef enum /*:int32_t*/ /// @brief Eltwise pow. cldnn_eltwise_pow, /// @brief Eltwise mod. - cldnn_eltwise_mod + cldnn_eltwise_mod, + /// @brief Eltwise equal. + cldnn_eltwise_eq, + /// @brief Eltwise not equal. + cldnn_eltwise_ne, + /// @brief Eltwise less. + cldnn_eltwise_lt, + /// @brief Eltwise less or equal. + cldnn_eltwise_le, + /// @brief Eltwise greater. + cldnn_eltwise_gt, + /// @brief Eltwise greater or equal. + cldnn_eltwise_ge, + /// @brief Eltwise and. + cldnn_eltwise_and, + /// @brief Eltwise or. + cldnn_eltwise_or, + /// @brief Eltwise xor. + cldnn_eltwise_xor, + /// @brief Eltwise squared diff. + cldnn_eltwise_squared_diff } cldnn_eltwise_mode; /// @brief Performs elementwise operations (sum, subtract, max or product) on two input primitives /// Also supports built-in Relu @CLDNN_PRIMITIVE_DESC{activation} available by setting it in arguments. /// @notes -/// - both inputs have to have equal sizes in all dimensions +/// - both inputs have to have equal sizes in all dimensions, or the input tensors are broadcastable +/// to the same shape (in which the size of each dimension is the maximum of the input sizes on that dimension) /// - format of both inputs has to be the same /// - when using integer types, only following eltwise modes are supported: sum, sub, prod, div CLDNN_BEGIN_PRIMITIVE_DESC(eltwise) @@ -70,6 +91,9 @@ cldnn_float_arr coefficients; uint32_t with_activation; /// @brief Relu activation slope. float activation_negative_slope; +/// @brief Defines shift in input buffers between adjacent calculations of output values. +cldnn_tensor_arr stride; + CLDNN_END_PRIMITIVE_DESC(eltwise) CLDNN_DECLARE_PRIMITIVE_TYPE_ID(eltwise); diff --git a/inference-engine/thirdparty/clDNN/api/C/gather.h b/inference-engine/thirdparty/clDNN/api/C/gather.h new file mode 100644 index 0000000..5457b0c --- /dev/null +++ b/inference-engine/thirdparty/clDNN/api/C/gather.h @@ -0,0 +1,58 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#ifndef GATHER_H +#define GATHER_H + +#include "cldnn.h" + + +/// @addtogroup c_api C API +/// @{ +/// @addtogroup c_topology Network Topology +/// @{ +/// @addtogroup c_primitives Primitives +/// @{ + +#ifdef __cplusplus +extern "C" { +#endif +typedef enum +{ + cldnn_gather_along_b = 0, + cldnn_gather_along_f = CLDNN_TENSOR_BATCH_DIM_MAX, + cldnn_gather_along_x = CLDNN_TENSOR_BATCH_DIM_MAX + CLDNN_TENSOR_FEATURE_DIM_MAX, + cldnn_gather_along_y = cldnn_gather_along_x + 1 +} cldnn_gather_axis; + +CLDNN_BEGIN_PRIMITIVE_DESC(gather) +/// @brief Gathering axis; +cldnn_gather_axis axis; +/// @brief Output shape +cldnn_tensor output_shape; +CLDNN_END_PRIMITIVE_DESC(gather) + +CLDNN_DECLARE_PRIMITIVE_TYPE_ID(gather); + +#ifdef __cplusplus +} +#endif + +/// @} +/// @} +/// @} +#endif // GATHER_H diff --git a/inference-engine/thirdparty/clDNN/api/C/gemm.h b/inference-engine/thirdparty/clDNN/api/C/gemm.h index 7169036..f0311db 100644 --- a/inference-engine/thirdparty/clDNN/api/C/gemm.h +++ b/inference-engine/thirdparty/clDNN/api/C/gemm.h @@ -34,12 +34,6 @@ extern "C" { /// @brief Performs forward attention layer. CLDNN_BEGIN_PRIMITIVE_DESC(gemm) -/// @brief Primitive id containing first matrix -cldnn_primitive_id input1; -/// @brief Primitive id containing second matrix -cldnn_primitive_id input2; -/// @brief Primitive id containing output matrix bias -cldnn_primitive_id input3; /// @brief Variable containing ALPHA parameter float alpha; /// @brief Variable containing BETA parameter @@ -48,9 +42,6 @@ float beta; bool transpose_input1; /// @brief Flag for transposing second input matrix bool transpose_input2; -// NOT SUPPORTED YET -// /// @brief The sequence output for the hidden. This is not clearly specified in the ONNX definition. -// uint32_t output_sequence; CLDNN_END_PRIMITIVE_DESC(gemm) CLDNN_DECLARE_PRIMITIVE_TYPE_ID(gemm); diff --git a/inference-engine/thirdparty/clDNN/api/C/index_select.h b/inference-engine/thirdparty/clDNN/api/C/index_select.h index 907217f..d7e1388 100644 --- a/inference-engine/thirdparty/clDNN/api/C/index_select.h +++ b/inference-engine/thirdparty/clDNN/api/C/index_select.h @@ -18,6 +18,7 @@ #include "cldnn.h" +#include /// @addtogroup c_api C API /// @{ @@ -30,15 +31,6 @@ extern "C" { #endif -/// @brief Axis which index_select primitive will index. -typedef enum /*:int32_t*/ -{ - cldnn_along_b, - cldnn_along_f, - cldnn_along_x, - cldnn_along_y, -} cldnn_index_select_axis; - /// @brief Select index, which will be copied to the output.. /// /// @details Applies index selecting along specified dimension. The indices, which will be copied are specifed by @@ -64,8 +56,12 @@ typedef enum /*:int32_t*/ /// @n Breaking any of this conditions will cause exeption throw. CLDNN_BEGIN_PRIMITIVE_DESC(index_select) -/// @brief Axis of index selecting. -cldnn_index_select_axis axis; +/// @brief A list of axes of index selecting. +index_select_axis_name_arr axis; +/// @brief Number of axes of index selecting. +int axis_num; +/// @brief Do index_select in reverse order on axis. 
+bool reverse; CLDNN_END_PRIMITIVE_DESC(index_select) diff --git a/inference-engine/thirdparty/clDNN/api/C/lstm.h b/inference-engine/thirdparty/clDNN/api/C/lstm.h index 10e8eea..fa68f51 100644 --- a/inference-engine/thirdparty/clDNN/api/C/lstm.h +++ b/inference-engine/thirdparty/clDNN/api/C/lstm.h @@ -31,20 +31,43 @@ extern "C" { #endif +/// @brief Weights orders +/// @details Specifies the order in which the weights are concatenated. +/// e.g. [i, o, f, z] : [input, output, forget, block] +/// ONNX order: iofz +/// Caffe order: ifoz +/// pyTorch order: izof +/// IE order: fizo typedef enum /*:int32_t*/ { - cldnn_lstm_offset_order_iofz = 0, // ONNX - cldnn_lstm_offset_order_ifoz // Caffe + cldnn_lstm_offset_order_iofz = 0, + cldnn_lstm_offset_order_ifoz, + cldnn_lstm_offset_order_izof, + cldnn_lstm_offset_order_fizo } cldnn_lstm_offset_order; +/// @brief LSTM Output selection +/// @details The current implementation allows the user to select the output +/// of an LSTM node by specifying any of the following options +typedef enum /*:int32_t*/ +{ + /// output the entire hidden sequence + cldnn_lstm_output_sequence = 0, + /// output just the last hidden value + cldnn_lstm_output_hidden, + /// output the last hidden and last cell values + cldnn_lstm_output_hidden_cell, + /// output the hidden sequence concatenated with the last cell + cldnn_lstm_output_sequence_cell +} cldnn_lstm_output; /// @brief Performs forward Long Short-Term Memory (LSTM) layer. -/// @details The current implementation of LSTM supports Peepholes. -/// it = f(Xt*(Wi^T) + Ht-1*Ri + Pi (.) Ct-1 + Wbi + Rbi) -/// ft = f(Xt*(Wf^T) + Ht-1*Rf + Pf (.) Ct-1 + Wbf + Rbf) -/// ct = g(Xt*(Wc^T) + Ht-1*Rc + Wbc + Rbc) +/// @details The current implementation of LSTM is described by the following equations. +/// it = f(Xt*(Wi^T) + Ht-1*Ri + Wbi) +/// ft = f(Xt*(Wf^T) + Ht-1*Rf + Wbf) +/// ct = g(Xt*(Wc^T) + Ht-1*Rc + Wbc) /// Ct = ft (.) Ct-1 + it (.) ct -/// ot = f(Xt*(Wo^T) + Ht-1*Ro + Po (.) Ct + Wbo + Rbo) +/// ot = f(Xt*(Wo^T) + Ht-1*Ro + Wbo) /// Ht = ot (.) h(Ct) /// Where f = Sigmoid, g = Tanh, and h = Tanh. CLDNN_BEGIN_PRIMITIVE_DESC(lstm) @@ -68,10 +91,11 @@ bool input_forget; cldnn_activation_func activations[3]; /// @brief Optional scaling values used by some activation functions. The values are consumed in the order of activation functions. cldnn_activation_additional_params activation_params[3]; +/// @brief Output selection. By default the entire hidden sequence is returned. +cldnn_lstm_output output_selection; /// @brief Weights, recurrent weights, and biases order. [iofz] : ONNX, [ifoz] : Caffe cldnn_lstm_offset_order offset_order; // NOT SUPPORTED YET -// /// @brief The sequence output for the hidden. This is not clearly specified in the ONNX definition. // uint32_t output_sequence; CLDNN_END_PRIMITIVE_DESC(lstm) @@ -113,8 +137,9 @@ cldnn_activation_func activations[3]; cldnn_activation_additional_params activation_params[3]; /// @brief Weights, recurrent weights, and biases order. [iofz] : ONNX, [ifoz] : Caffe cldnn_lstm_offset_order offset_order; +/// @brief Direction: default = 0, bidirectional = 1. +uint32_t direction; // NOT SUPPORTED YET -// /// @brief The sequence output for the hidden. This is not clearly specified in the ONNX definition.
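The LSTM equations above (with the peephole terms removed) can be summarized by a single cell step. A self-contained C++ sketch for one unit, where the x_* arguments stand for the already-computed affine terms such as Xt*(Wi^T) + Ht-1*Ri + Wbi (illustrative only, not the clDNN implementation):

```cpp
#include <cmath>

// One LSTM cell step, following the header's equations with
// f = sigmoid and g = h = tanh.
struct LstmState { float C, H; };

static float sigmoid(float v) { return 1.0f / (1.0f + std::exp(-v)); }

LstmState lstm_cell_step(LstmState prev, float x_i, float x_f, float x_c, float x_o)
{
    const float it = sigmoid(x_i);            // input gate
    const float ft = sigmoid(x_f);            // forget gate
    const float ct = std::tanh(x_c);          // candidate cell value
    const float Ct = ft * prev.C + it * ct;   // new cell state
    const float ot = sigmoid(x_o);            // output gate
    const float Ht = ot * std::tanh(Ct);      // new hidden state
    return { Ct, Ht };
}
```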
// uint32_t output_sequence; CLDNN_END_PRIMITIVE_DESC(lstm_elt) diff --git a/inference-engine/thirdparty/clDNN/api/C/one_hot.h b/inference-engine/thirdparty/clDNN/api/C/one_hot.h new file mode 100644 index 0000000..d53cc6c --- /dev/null +++ b/inference-engine/thirdparty/clDNN/api/C/one_hot.h @@ -0,0 +1,71 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#ifndef ONE_HOT_H +#define ONE_HOT_H + +#include "cldnn.h" +/// @addtogroup c_api C API +/// @{ +/// @addtogroup c_topology Network Topology +/// @{ +/// @addtogroup c_primitives Primitives +/// @{ + +#ifdef __cplusplus +extern "C" { +#endif + + /// @brief Creates a one-hot encoding of the input. + /// @details Creates a one-hot encoding of the input, putting the new one-hot axis in the position + /// @n specified by the @p one_hot_axis input, using the @p shape tensor as size reference. + /// @n The size of @p shape must be appropriate for adding a one-hot axis to input. For example, + /// @n input_sizes = (1, in_f, in_y, in_x) + /// @n expanded with + /// @n one_hot_axis = 2 + /// @n would insert the one-hot axis in the Y dimension, requiring + /// @n shape = (in_f, in_y, one-hot_limit, in_x) + /// @n The output values would then be determined by input as + /// @n output[f, y, i, x] = (input[0, f, y, x] == i) ? 1 : 0; + /// @n Since determining whether the input is appropriate (that the one-hot axis + /// @n has enough space to fully encode all inputs) requires scanning the whole + /// @n input, the primitive doesn't check for that, instead producing all-zeros + /// @n output axes for inputs below 0 and greater than the limit set by + /// @n @p shape. + /// @n + /// @n\b Requirements + /// @n - @p one_hot_axis must be within (inclusive) range 0 - 3. + /// @n - @p shape must fit input sizes (see example above). + /// @n - input batch size must be equal to 1. + /// @n + /// @n Breaking any of this conditions will cause exception throw. + CLDNN_BEGIN_PRIMITIVE_DESC(one_hot) + /// @brief Output size reference. + cldnn_tensor shape; + /// @brief One-hot axis position in output shape (0-based, from left to right). + uint16_t one_hot_axis; + CLDNN_END_PRIMITIVE_DESC(one_hot) + + CLDNN_DECLARE_PRIMITIVE_TYPE_ID(one_hot); + +#ifdef __cplusplus +} +#endif + +/// @} +/// @} +/// @} +#endif /* ONE_HOT_H */ + diff --git a/inference-engine/thirdparty/clDNN/api/C/pooling.h b/inference-engine/thirdparty/clDNN/api/C/pooling.h index a8148fc..1078a46 100644 --- a/inference-engine/thirdparty/clDNN/api/C/pooling.h +++ b/inference-engine/thirdparty/clDNN/api/C/pooling.h @@ -52,6 +52,8 @@ CLDNN_BEGIN_PRIMITIVE_DESC(pooling) cldnn_primitive_id argmax; /// @brief Pooling method. See #cldnn_pooling_mode. 
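For the one_hot example above (one_hot_axis = 2), a plain-C++ sketch of the documented reference semantics, including the all-zeros behavior for out-of-range class ids (illustrative only, not the clDNN kernel):

```cpp
#include <cstddef>

// output[f, y, i, x] = (input[0, f, y, x] == i) ? 1 : 0, with class ids
// below 0 or at/above one_hot_limit yielding all-zero slices automatically.
void one_hot_axis2_ref(const int* in, float* out,
                       std::size_t in_f, std::size_t in_y, std::size_t in_x,
                       std::size_t one_hot_limit)
{
    for (std::size_t f = 0; f < in_f; ++f)
        for (std::size_t y = 0; y < in_y; ++y)
            for (std::size_t i = 0; i < one_hot_limit; ++i)
                for (std::size_t x = 0; x < in_x; ++x)
                    out[((f * in_y + y) * one_hot_limit + i) * in_x + x] =
                        (in[(f * in_y + y) * in_x + x] == static_cast<int>(i)) ? 1.0f : 0.0f;
}
```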
int32_t mode; +/// @brief Global pooling (kernel size is equal to the spatial dimension of input tensor) +int8_t global_pooling; /// @brief Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the pooling window should start calculations. cldnn_tensor input_offset; /// @brief Defines shift in input buffer between adjacent calculations of output values. diff --git a/inference-engine/thirdparty/clDNN/api/C/proposal.h b/inference-engine/thirdparty/clDNN/api/C/proposal.h index c571759..991cae4 100644 --- a/inference-engine/thirdparty/clDNN/api/C/proposal.h +++ b/inference-engine/thirdparty/clDNN/api/C/proposal.h @@ -47,8 +47,11 @@ CLDNN_BEGIN_PRIMITIVE_DESC(proposal) float box_size_scale; uint32_t swap_xy; uint32_t initial_clip; + uint32_t clip_before_nms; + uint32_t clip_after_nms; uint32_t round_ratios; uint32_t shift_anchors; + uint32_t normalize; CLDNN_END_PRIMITIVE_DESC(proposal) CLDNN_DECLARE_PRIMITIVE_TYPE_ID(proposal); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT2.cpp b/inference-engine/thirdparty/clDNN/api/C/pyramid_roi_align.h similarity index 68% rename from inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT2.cpp rename to inference-engine/thirdparty/clDNN/api/C/pyramid_roi_align.h index 4ebd2fc..e33663a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT2.cpp +++ b/inference-engine/thirdparty/clDNN/api/C/pyramid_roi_align.h @@ -1,4 +1,3 @@ -/* // Copyright (c) 2018 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,17 +11,22 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -*/ - -#include "auto_tuner.h" -#include "auto_tuner_offline.h" -namespace kernel_selector -{ - //SKL GT2 - void tuning_cache_1912(tuning_data& td) - { - tuning_cache_1912_B1_B16(td); - tuning_cache_1912_B8(td); - tuning_cache_1912_B32_B64(td); - } -} \ No newline at end of file + +#pragma once + +#include "cldnn.h" + +#ifdef __cplusplus +extern "C" { +#endif + + CLDNN_BEGIN_PRIMITIVE_DESC(pyramid_roi_align) + + CLDNN_END_PRIMITIVE_DESC(pyramid_roi_align) + + CLDNN_DECLARE_PRIMITIVE_TYPE_ID(pyramid_roi_align); + + +#ifdef __cplusplus +} +#endif diff --git a/inference-engine/thirdparty/clDNN/api/C/reorder.h b/inference-engine/thirdparty/clDNN/api/C/reorder.h index 67c504f..bfe37a4 100644 --- a/inference-engine/thirdparty/clDNN/api/C/reorder.h +++ b/inference-engine/thirdparty/clDNN/api/C/reorder.h @@ -37,8 +37,6 @@ extern "C" { CLDNN_BEGIN_PRIMITIVE_DESC(reorder) /// @brief Requested memory format. cldnn_format_type output_format; -/// @brief Requested memory data type. -cldnn_data_type output_data_type; /// @brief Primitive id to get mean subtract values. Ignored if subtract_per_featrue is set. cldnn_primitive_id mean_subtract; /// @brief Array of mean subtract values. diff --git a/inference-engine/thirdparty/clDNN/api/C/reverse_sequence.h b/inference-engine/thirdparty/clDNN/api/C/reverse_sequence.h new file mode 100644 index 0000000..7a7ec96 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/api/C/reverse_sequence.h @@ -0,0 +1,51 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#ifndef REVERSE_SEQUENCE_H +#define REVERSE_SEQUENCE_H + +#include "cldnn.h" + + +/// @addtogroup c_api C API +/// @{ +/// @addtogroup c_topology Network Topology +/// @{ +/// @addtogroup c_primitives Primitives +/// @{ + +#ifdef __cplusplus +extern "C" { +#endif + +CLDNN_BEGIN_PRIMITIVE_DESC(reverse_sequence) +/// @brief The axis which is partially reversed. +int32_t seq_axis; +/// @brief The axis along which reversal is performed. +int32_t batch_axis; +CLDNN_END_PRIMITIVE_DESC(reverse_sequence) + +CLDNN_DECLARE_PRIMITIVE_TYPE_ID(reverse_sequence); + +#ifdef __cplusplus +} +#endif + +/// @} +/// @} +/// @} +#endif // REVERSE_SEQUENCE_H diff --git a/inference-engine/thirdparty/clDNN/api/C/roi_pooling.h b/inference-engine/thirdparty/clDNN/api/C/roi_pooling.h index 846d1ee..7ada955 100644 --- a/inference-engine/thirdparty/clDNN/api/C/roi_pooling.h +++ b/inference-engine/thirdparty/clDNN/api/C/roi_pooling.h @@ -18,6 +18,7 @@ #ifndef ROI_POOLING_H #define ROI_POOLING_H +#include <stdbool.h> #include "cldnn.h" /// @addtogroup c_api C API /// @{ @@ -34,16 +35,20 @@ extern "C" { CLDNN_BEGIN_PRIMITIVE_DESC(roi_pooling) /// @brief Pooling method. See #cldnn_pooling_mode. int32_t mode; - +/// @brief True if pooling is position sensitive (PSROIPooling). +bool position_sensitive; /// @brief Output width. int pooled_width; /// @brief Output height. int pooled_height; +/// @brief Count of sub-bins in the x spatial dimension. +int spatial_bins_x; +/// @brief Count of sub-bins in the y spatial dimension. +int spatial_bins_y; +/// @brief Output feature count (applies to the position-sensitive case only). +int output_dim; /// @brief Ratio of the coordinates used in RoIs to the width (and height) of the input data. float spatial_scale; - -/// @brief Group size as defined by PSRoIPooling when > 0, else if 0 means regular RoIPooling. -int group_sz; CLDNN_END_PRIMITIVE_DESC(roi_pooling) CLDNN_DECLARE_PRIMITIVE_TYPE_ID(roi_pooling); diff --git a/inference-engine/thirdparty/clDNN/api/C/shuffle_channels.h b/inference-engine/thirdparty/clDNN/api/C/shuffle_channels.h new file mode 100644 index 0000000..a5a4b07 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/api/C/shuffle_channels.h @@ -0,0 +1,51 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#ifndef SHUFFLE_CHANNELS_H +#define SHUFFLE_CHANNELS_H + +#include "cldnn.h" + + +/// @addtogroup c_api C API +/// @{ +/// @addtogroup c_topology Network Topology +/// @{ +/// @addtogroup c_primitives Primitives +/// @{ + +#ifdef __cplusplus +extern "C" { +#endif + +CLDNN_BEGIN_PRIMITIVE_DESC(shuffle_channels) +/// @brief The number of groups to split the channel dimension. This number must evenly divide the channel dimension size. +int32_t group; +/// @brief The index of the channel dimension (default is 1). +int32_t axis; +CLDNN_END_PRIMITIVE_DESC(shuffle_channels) + +CLDNN_DECLARE_PRIMITIVE_TYPE_ID(shuffle_channels); + +#ifdef __cplusplus +} +#endif + +/// @} +/// @} +/// @} +#endif // SHUFFLE_CHANNELS_H diff --git a/inference-engine/thirdparty/clDNN/api/C/strided_slice.h b/inference-engine/thirdparty/clDNN/api/C/strided_slice.h new file mode 100644 index 0000000..9f6f081 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/api/C/strided_slice.h @@ -0,0 +1,55 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#ifndef STRIDED_SLICE_H +#define STRIDED_SLICE_H + +#include "cldnn.h" + + +/// @addtogroup c_api C API +/// @{ +/// @addtogroup c_topology Network Topology +/// @{ +/// @addtogroup c_primitives Primitives +/// @{ + +#ifdef __cplusplus +extern "C" { +#endif + +CLDNN_BEGIN_PRIMITIVE_DESC(strided_slice) +/// @brief Array of bits; a set bit at position i replaces begin[i] with the maximum possible range in that dimension. +cldnn_uint8_t_arr begin_mask; +/// @brief Array of bits; a set bit at position i replaces end[i] with the maximum possible range in that dimension. +cldnn_uint8_t_arr end_mask; +/// @brief Array of bits; a set bit at position i adds a new dimension of length 1 at the i-th position in the output tensor. +cldnn_uint8_t_arr new_axis_mask; +/// @brief Array of bits; a set bit at position i shrinks the dimensionality by 1, taking on the value at index begin[i].
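A compact C++ sketch of how the four strided_slice masks above are conventionally interpreted for a single dimension i (reference semantics under the usual StridedSlice definition; the struct and function names are illustrative):

```cpp
#include <cstddef>

// Per-dimension slice parameters as used by StridedSlice-style operations.
struct DimSlice { std::ptrdiff_t begin, end, stride; };

DimSlice apply_masks(DimSlice s, std::size_t dim_size,
                     bool begin_mask_i, bool end_mask_i)
{
    // begin_mask[i]: ignore begin[i] and start from the lowest index.
    if (begin_mask_i) s.begin = 0;
    // end_mask[i]: ignore end[i] and run to the end of the dimension.
    if (end_mask_i) s.end = static_cast<std::ptrdiff_t>(dim_size);
    // new_axis_mask[i] would instead insert a length-1 dimension at position i;
    // shrink_axis_mask[i] would drop dimension i, keeping the value at begin[i].
    return s;
}
```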
+cldnn_uint8_t_arr shrink_axis_mask; +CLDNN_END_PRIMITIVE_DESC(strided_slice) + +CLDNN_DECLARE_PRIMITIVE_TYPE_ID(strided_slice); + +#ifdef __cplusplus +} +#endif + +/// @} +/// @} +/// @} +#endif // STRIDED_SLICE_H diff --git a/inference-engine/thirdparty/clDNN/api/CPP/batch_norm.hpp b/inference-engine/thirdparty/clDNN/api/CPP/batch_norm.hpp index 3933203..7962d71 100644 --- a/inference-engine/thirdparty/clDNN/api/CPP/batch_norm.hpp +++ b/inference-engine/thirdparty/clDNN/api/CPP/batch_norm.hpp @@ -35,7 +35,7 @@ namespace cldnn /// /// Algorithm: /// @n global stats can be computed as: -/// @n out[i] = (in[i] - mean[b]) / sqrt(variance[b] + epsilon) +/// @n out[i] = ( (in[i] - mean[b]) / sqrt(variance[b] + epsilon) ) * scale[b] + shift[b] struct batch_norm : public primitive_base { @@ -63,6 +63,34 @@ struct batch_norm : public primitive_basemean) , variance(dto->variance) + , scale(dto->scale) + , shift(dto->shift) , inv_variance(dto->inv_variance) , epsilon(dto->epsilon) { @@ -97,20 +183,36 @@ struct batch_norm : public primitive_base> get_dependencies() const override - { - if (!mean.empty() && !variance.empty()) - return{ mean, variance }; - else if (!inv_variance.empty()) - return{ inv_variance }; - else - return{}; + std::vector> get_dependencies() const override + { + std::vector> deps; + + if (!mean.empty() && !variance.empty()) + { + deps.push_back(mean); + deps.push_back(variance); + } + + if (!scale.empty() && !shift.empty()) + { + deps.push_back(scale); + deps.push_back(shift); + } + + if (!inv_variance.empty()) + deps.push_back(inv_variance); + + return deps; } void update_dto(dto& dto) const override @@ -118,6 +220,8 @@ protected: dto.mean = mean.c_str(); dto.variance = variance.c_str(); dto.inv_variance = inv_variance.c_str(); + dto.scale = scale.c_str(); + dto.shift = shift.c_str(); dto.epsilon = epsilon; } }; diff --git a/inference-engine/thirdparty/clDNN/api/CPP/border.hpp b/inference-engine/thirdparty/clDNN/api/CPP/border.hpp index 6171b6c..862421f 100644 --- a/inference-engine/thirdparty/clDNN/api/CPP/border.hpp +++ b/inference-engine/thirdparty/clDNN/api/CPP/border.hpp @@ -33,18 +33,19 @@ enum class border_type : std::int32_t { /// @brief All points in the border are set to constant value. constant = cldnn_border_constant, + zero = cldnn_border_zero, /// @brief Border is constructed as an mirror of image (edge is also mirrored). /// @details Size of border in any dimension cannot be larger than size of /// input in the same dimension. mirror = cldnn_border_mirror, - /// @brief Border is constructed as an replication of edge. - /// @details Size of border in any dimension cannot be larger than size of - /// input in the same dimension. - edge = cldnn_border_edge, /// @brief Border is constructed as an mirror of image (edge is NOT mirrored). /// @details Size of border in any dimension cannot be larger than size of /// input in the same dimension decreased by @c 1. - mirror_101 = cldnn_border_mirror_101 + mirror_101 = cldnn_border_mirror_101, + /// @brief Border is constructed as an replication of edge. + /// @details Size of border in any dimension cannot be larger than size of + /// input in the same dimension. 
+ edge = cldnn_border_edge }; @@ -80,9 +81,9 @@ struct border : public primitive_base<border, CLDNN_PRIMITIVE_DESC(border)> border( const primitive_id& id, const primitive_id& input, - const tensor& left_top_sizes, - const tensor& right_bottom_sizes, - const border_type type, + const tensor& left_top_sizes = { 0, 0, 0, 0 }, + const tensor& right_bottom_sizes = { 0, 0, 0, 0 }, + const border_type type = border_type::constant, const float border_value = 0.0f, const padding& output_padding = padding() ) @@ -94,6 +95,28 @@ struct border : public primitive_base<border, CLDNN_PRIMITIVE_DESC(border)> { } + /// @brief Constructs border primitive / layer. + /// + /// @param id An identifier of new primitive. + /// @param input An identifier of primitive which is an input for newly created + /// border primitive. + /// @param x_y_sizes Sizes of border that needs to be added from left and right + /// (in X dimension) and from top and bottom (in Y dimension). + /// Created border is symmetric (the same size of border applied + /// from both sides of input). + /// @param type Type of added border. + /// @param output_padding Optional padding for output from primitive. + border( + const primitive_id& id, + const primitive_id& input, + const tensor& x_y_sizes, + const border_type type = border_type::constant, + const padding& output_padding = padding() + ) + : border(id, input, x_y_sizes, x_y_sizes, type, 0.0f, output_padding) + { + } + /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{border} border(const dto* dto) : primitive_base(dto), diff --git a/inference-engine/thirdparty/clDNN/api/CPP/broadcast.hpp b/inference-engine/thirdparty/clDNN/api/CPP/broadcast.hpp index 686358f..cc27d7f 100644 --- a/inference-engine/thirdparty/clDNN/api/CPP/broadcast.hpp +++ b/inference-engine/thirdparty/clDNN/api/CPP/broadcast.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -28,14 +28,16 @@ namespace cldnn /// @addtogroup cpp_primitives Primitives /// @{ -/// @brief Broadcasts input to specified output size (broadcast size). +/// @brief Broadcasts input to the output shape defined by @p broadcast_sizes. @p broadcast_axes are used to +/// reinterpret (reshape) the input inside the algorithm. /// -/// @details Takes input and copies it to output once or multiple times, until output will -/// reach the sizes specified in @p broadcast_sizes. +/// @details Takes the input, reinterprets it according to @p broadcast_axes, +/// and copies it to the output once or multiple times. /// @n -/// @n Lets assume that: +/// @n Simple example with empty @p broadcast_axes. Let's assume that: /// @n input_sizes = (in_b, in_f, in_y, in_x) /// @n broadcast_sizes = (bs_b, bs_f, bs_y, bs_x) +/// @n broadcast_axes = () - empty /// @n The input is broadcasted on each dimension where bs_{dim} > in_{dim} and bs_{dim} /// is dividable by in_{dim} (input is copied bs_{dim} / in_{dim} times). /// The dimensions where bs_{dim} is equal to in_{dim} remain unchanged. @@ -44,17 +46,28 @@ namespace cldnn /// @n output[(b, f, y, x)] = input[(b % in_b, f % in_f, y % in_y, x % in_x)] /// @n where (b, f, y, x) is a position of value in a primitive output. /// @n +/// @n More complicated example with non-empty @p broadcast_axes.
Let's assume that: +/// @n broadcast_sizes = (bs_b, bs_f, bs_y, bs_x) +/// @n broadcast_axes = (2) +/// @n Taking into account the broadcast_axes size (= 1), the primitive's input rank must be 4 - 1 = 3: +/// @n primitive input = (1, in_b, in_f, in_x) +/// @n Due to broadcast_axes = (2), the primitive will interpret the input as: +/// @n primitive input (internal representation) = (in_b, in_f, 1, in_x) +/// @n Now you can apply the broadcast rules from the previous example to the modified (reinterpreted) +/// input and output: +/// @n input_sizes = (in_b, in_f, 1, in_x) +/// @n output_shape = (bs_b, bs_f, bs_y, bs_x) +/// @n broadcast_axes = () - empty +/// @n /// @n@b Requirements: -/// @n - @p broadcast_sizes must be positive on all dimensions and compatible -/// with size of input (describe the same dimensions). -/// @n - @p broadcast_sizes must be greater than or equal to input sizes on -/// all dimensions. (For any dimension, if @p broadcast_sizes is lower -/// than input size on the dimension then @p broadcast_sizes will be replaced -/// by input size on this dimension.) -/// @n - For any dimension, if @p broadcast_sizes is greater than input size on -/// the dimension then @p broadcast_sizes must be dividable by input size -/// on this dimension. -/// @n Breaking any of these conditions will raise an exeption. +/// @n - @p broadcast_sizes must be positive on all dimensions. +/// @n - @p broadcast_axes size (dimensions count) must be within (inclusive) range +/// 0 - 4. +/// @n - @p broadcast_axes must not have duplicate values. +/// @n - Values of @p broadcast_axes must be within (inclusive) range 0 - 3. +/// @n - @p output_shape must be greater than (and divisible by) or equal to the reinterpreted +/// input on all dimensions. +/// @n Breaking any of these conditions will raise an exception. struct broadcast : public primitive_base<broadcast, CLDNN_PRIMITIVE_DESC(broadcast)> { CLDNN_DECLARE_PRIMITIVE(broadcast) @@ -66,34 +79,45 @@ struct broadcast : public primitive_base<broadcast, CLDNN_PRIMITIVE_DESC(broadcast)> broadcast( const primitive_id& id, const primitive_id& input, const tensor& broadcast_sizes, + const std::vector<uint16_t>& broadcast_axes = {}, const padding& output_padding = padding() ) : primitive_base(id, {input}, output_padding), - broadcast_sizes(broadcast_sizes) + broadcast_sizes(broadcast_sizes), + broadcast_axes(broadcast_axes) { } /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{broadcast} broadcast(const dto* dto) : primitive_base(dto), - broadcast_sizes(dto->broadcast_sizes) + broadcast_sizes(dto->broadcast_sizes), + broadcast_axes(uint16_t_arr_to_vector(dto->broadcast_axes)) + { } /// @brief Expected sizes of output from broadcast primitive. tensor broadcast_sizes; + /// @brief Array of axes positions from output shape (0-based, from left to right) + /// along which broadcast should happen.
+    std::vector<uint16_t> broadcast_axes;

 protected:
     void update_dto(dto& dto) const override
     {
         dto.broadcast_sizes = broadcast_sizes;
+        dto.broadcast_axes = uint16_t_vector_to_arr(broadcast_axes);
+    }
 };
 /// @}
diff --git a/inference-engine/thirdparty/clDNN/api/CPP/cldnn_defs.h b/inference-engine/thirdparty/clDNN/api/CPP/cldnn_defs.h
index 7e82d2c..7281bd3 100644
--- a/inference-engine/thirdparty/clDNN/api/CPP/cldnn_defs.h
+++ b/inference-engine/thirdparty/clDNN/api/CPP/cldnn_defs.h
@@ -349,6 +349,18 @@ inline std::vector uint16_t_arr_to_vector(const cldnn_uint16_t_arr& ar
     return result;
 }

+///
+/// \brief Converts C API uint8_t array to std::vector<uint8_t>
+///
+inline std::vector<uint8_t> uint8_t_arr_to_vector(const cldnn_uint8_t_arr& arr)
+{
+    std::vector<uint8_t> result(arr.size);
+    for (size_t i = 0; i < arr.size; i++)
+    {
+        result[i] = arr.data[i];
+    }
+    return result;
+}

 ///
 /// \brief Converts std::vector to C API float_array
 ///
@@ -367,6 +379,14 @@ inline cldnn_uint16_t_arr uint16_t_vector_to_arr(const std::vector& st
 }

 ///
+/// \brief Converts std::vector<uint8_t> to C API uint8_t array
+///
+inline cldnn_uint8_t_arr uint8_t_vector_to_arr(const std::vector<uint8_t>& stor)
+{
+    return{ stor.data(), stor.size() };
+}
+
+///
 /// \brief Converts std::vector to C API tensor_array
 ///
 inline cldnn_tensor_arr tensor_vector_to_arr(const std::vector& stor)
@@ -374,6 +394,18 @@
     return cldnn_tensor_arr{ stor.data(), stor.size() };
 }

+///
+/// \brief Converts C API tensor_array to std::vector of C API tensor
+///
+inline std::vector<cldnn_tensor> tensor_arr_to_cldnn_vector(const cldnn_tensor_arr& arr)
+{
+    std::vector<cldnn_tensor> result(arr.size);
+    for (size_t i = 0; i < arr.size; i++)
+        result[i] = arr.data[i];
+
+    return result;
+}
+
 /// @}
 /// @endcond
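[Editorial sketch, not part of the original patch] The new uint8_t helpers mirror the existing uint16_t pair, and the asymmetry is by design: *_vector_to_arr returns a non-owning view over the vector's storage, while *_arr_to_vector makes a deep copy.

    std::vector<uint8_t> bytes = { 1, 2, 3 };
    cldnn_uint8_t_arr view = uint8_t_vector_to_arr(bytes);   // valid only while `bytes` is alive and unmodified
    std::vector<uint8_t> copy = uint8_t_arr_to_vector(view); // independent copy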
diff --git a/inference-engine/thirdparty/clDNN/api/CPP/condition.hpp b/inference-engine/thirdparty/clDNN/api/CPP/condition.hpp
new file mode 100644
index 0000000..0ad6c3e
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/api/CPP/condition.hpp
@@ -0,0 +1,119 @@
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma once
+
+#include "../C/condition.h"
+#include "primitive.hpp"
+#include "topology.hpp"
+
+namespace cldnn
+{
+/// @addtogroup cpp_api C++ API
+/// @{
+/// @addtogroup cpp_topology Network Topology
+/// @{
+/// @addtogroup cpp_primitives Primitives
+/// @{
+/// @brief Function, which will be used during comparison.
+enum cond_functions : int32_t
+{
+    EQUAL,
+    GREATER,
+    LESS
+};
+
+/// @brief Adds primitive, which works like "if".
+///
+/// @details
+/// @n Applies comparison between 2 inputs.
+/// @n Compare data - the size of this input specifies the range of the comparison.
+/// @n Offset - offset in memory used when comparing values.
+struct condition : public primitive_base<condition, CLDNN_PRIMITIVE_DESC(condition)>
+{
+    CLDNN_DECLARE_PRIMITIVE(condition)
+
+    /// @brief Constructs condition primitive / layer.
+    ///
+    /// @param id             An identifier of new primitive.
+    /// @param input          An identifier of primitive which is an input for newly created
+    ///                       condition primitive.
+    /// @param topology_true  Topology containing primitives which will be executed when the comparison
+    ///                       returns true.
+    /// @param topology_false Topology containing primitives which will be executed when the comparison
+    ///                       returns false.
+    /// @param compare_data   An identifier of primitive which contains compare values.
+    /// @param func           Function used during comparison.
+    /// @param offset         Offset for compare data.
+    /// @param output_padding Optional padding for output from primitive.
+    condition(
+        const primitive_id& id,
+        const primitive_id& input,
+        const topology& topology_true,
+        const topology& topology_false,
+        const primitive_id& compare_data,
+        const cond_functions& func,
+        const tensor& offset = { 0, 0, 0, 0 },
+        const padding& output_padding = padding()
+    )
+        : primitive_base(id, { input }, output_padding)
+        , topology_true(topology_true)
+        , topology_false(topology_false)
+        , compare_data(compare_data)
+        , function(func)
+        , offset(offset)
+    {}
+
+
+    /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{condition}
+    condition(const dto* dto)
+        : primitive_base(dto)
+        , topology_true(dto->topology_true)
+        , topology_false(dto->topology_false)
+        , compare_data(dto->compare_data)
+        , function(static_cast<cond_functions>(dto->function))
+        , offset(dto->offset)
+    {}
+
+
+    /// @brief Topology, which will be executed when comparison returns true.
+    topology topology_true;
+    /// @brief Topology, which will be executed when comparison returns false.
+    topology topology_false;
+    /// @brief An identifier of primitive which contains compare values.
+    primitive_id compare_data;
+    /// @brief Function used during comparison.
+    cond_functions function;
+    /// @brief Offset for compare data.
+    tensor offset;
+protected:
+    void update_dto(dto& dto) const override
+    {
+        dto.compare_data = compare_data.c_str();
+        dto.function = static_cast<cldnn_cond_functions>(function);
+        dto.offset = offset;
+        dto.topology_true = topology_true.get();
+        dto.topology_false = topology_false.get();
+    }
+
+    std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override
+    {
+        return { compare_data };
+    }
+};
+}
+/// @}
+/// @}
+/// @}
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/api/CPP/contract.hpp b/inference-engine/thirdparty/clDNN/api/CPP/contract.hpp
new file mode 100644
index 0000000..9ce79ca
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/api/CPP/contract.hpp
@@ -0,0 +1,119 @@
+// Copyright (c) 2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma once
+
+#include "../C/contract.h"
+#include "primitive.hpp"
+
+
+namespace cldnn
+{
+    /// @addtogroup cpp_api C++ API
+    /// @{
+    /// @addtogroup cpp_topology Network Topology
+    /// @{
+    /// @addtogroup cpp_primitives Primitives
+    /// @{

+    /// @brief Select mode for the @ref contract layer.
+    enum class contract_mode : int32_t
+    {
+        /// @brief Sum reduction.
+        sum = cldnn_contract_sum,
+        /// @brief Product reduction.
+        prod = cldnn_contract_product,
+        /// @brief All reduction.
+        all = cldnn_contract_all,
+        /// @brief Any reduction.
+        any = cldnn_contract_any,
+        /// @brief Max reduction.
+        max = cldnn_contract_max
+    };

+    /// @brief Reduces input with an operation defined by @p mode along the dimensions
+    ///        defined by @p reduction_axes.
+    ///
+    /// @details Reduces the input using the binary operation determined by
+    ///          @p mode. The @p reduction_axes determine the final shape of the
+    ///          output, which is calculated based on the input shape by
+    ///          collapsing the dimensions along which the reduction happens.
+    ///          For example, for the input with
+    /// @n      input_sizes = (in_b, in_f, in_y, in_x)
+    /// @n a reduction with
+    /// @n      reduction_axes = (2)
+    /// @n would collapse the Y dimension, producing
+    /// @n      output_shape = (1, in_b, in_f, in_x)
+    /// @n where every element is a @p mode reduction of the input elements with
+    /// @n the same B, F and X coordinates.
+    /// @n
+    /// @n@b Requirements:
+    /// @n - @p reduction_axes size (dimensions count) must be within (inclusive) range
+    ///     1 - 4.
+    /// @n - @p reduction_axes mustn't have duplicate values.
+    /// @n - Values of @p reduction_axes must be within (inclusive) range 0 - 3.
+    /// @n Breaking any of these conditions will raise an exception.
+    struct contract : public primitive_base<contract, CLDNN_PRIMITIVE_DESC(contract)>
+    {
+        CLDNN_DECLARE_PRIMITIVE(contract)

+        /// @brief Constructs contract primitive / layer.
+        ///
+        /// @param id             An identifier of new primitive.
+        /// @param input          An identifier of primitive which is an input for newly created
+        ///                       contract primitive.
+        /// @param mode           Reduction mode.
+        /// @param reduction_axes Axes positions (0-based, from left to right) in input_shape
+        ///                       that are being reduced.
+        /// @param output_padding Optional padding for output from primitive.
+        contract(
+            const primitive_id& id,
+            const primitive_id& input,
+            contract_mode mode,
+            const std::vector<uint16_t>& reduction_axes = {},
+            const padding& output_padding = padding()
+        )
+            : primitive_base(id, { input }, output_padding),
+            mode(mode),
+            reduction_axes(reduction_axes)
+        {
+        }

+        /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{contract}
+        contract(const dto* dto)
+            : primitive_base(dto),
+            mode(static_cast<contract_mode>(dto->mode)),
+            reduction_axes(uint16_t_arr_to_vector(dto->reduction_axes))

+        {
+        }

+        /// @brief Contract mode.
+        contract_mode mode;
+        /// @brief Array of axes positions from input shape (0-based, from left to right)
+        ///        along which reduction should happen.
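+        /// @note [Editorial sketch, not part of the original patch] Summing out the Y
+        ///       axis of a bfyx tensor, as in the example above; ids are illustrative:
+        /// @code
+        /// topology.add(cldnn::contract("reduce_y", "conv_out", cldnn::contract_mode::sum, { 2 }));
+        /// @endcode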
+ std::vector reduction_axes; + + protected: + void update_dto(dto& dto) const override + { + dto.mode = static_cast(mode); + dto.reduction_axes = uint16_t_vector_to_arr(reduction_axes); + } + }; + /// @} + /// @} + /// @} +} diff --git a/inference-engine/thirdparty/clDNN/api/CPP/convolution.hpp b/inference-engine/thirdparty/clDNN/api/CPP/convolution.hpp index 8efecd8..a8ae603 100644 --- a/inference-engine/thirdparty/clDNN/api/CPP/convolution.hpp +++ b/inference-engine/thirdparty/clDNN/api/CPP/convolution.hpp @@ -72,6 +72,9 @@ struct convolution : public primitive_base(0)) @@ -81,6 +84,217 @@ struct convolution : public primitive_base& weights, + const std::vector& bias, + tensor stride, + tensor input_offset, + tensor dilation, + tensor padding_above, + tensor padding_below, + bool with_activation = false, + float activation_slp = 0.0f, + const padding& output_padding = padding() + ) + :primitive_base(id, { input }, output_padding) + , weights(_weights.cpp_ids) + , bias(_bias.cpp_ids) + , weights_quantization_factors(_weights_quantization_factors.cpp_ids) + , output_calibration_factors(_output_calibration_factors.cpp_ids) + , input_quantization_factor(1.0f) + , output_quantization_factor(1.0f) + , input_offset(input_offset) + , stride(stride) + , dilation(dilation) + , with_activation(with_activation) + , activation_negative_slope(activation_slp) + , with_output_size(false) + , groups(1) + , padding_above(padding_above) + , padding_below(padding_below) + , _weights(weights) + , _bias(bias) + , _weights_quantization_factors(std::vector(0)) + , _output_calibration_factors(std::vector(0)) + { + if ((bias.size() != 0) && (weights.size() != bias.size())) + throw std::runtime_error("convolution's weights/bias count does not match"); + } + + /// @brief Constructs convolution primitive. + /// @param id This primitive id. + /// @param input Input primitive id. + /// @param weights List of primitive ids containing weights data. + /// @param groups Number of filter groups. + /// @param bias List of primitive ids containing bias data. + /// @param stride Defines shift in input buffer between adjacent calculations of output values. + /// @param dilation Defines gaps in the input - dilation rate k=1 is normal convolution, k=2 means skipping one pixel per input, k=4 means skipping 3 pixels. + /// As an example in one dimension, a filter w of size 3 would compute over input x the following: w[0]*x[0] + w[1]*x[1] + w[2]*x[2] for dilation of 1. + /// For dilation 2 the filter would instead compute w[0]*x[0] + w[1]*x[2] + w[2]*x[4]. + /// @param padding_above Defines a padding added to input image on left (x axis) and top (y axis). + /// @param padding_below Defines a padding added to input image on right (x axis) and bottom (y axis). + /// @param with_activation Enable Relu activation. + /// @param activation_slp Relu activation slope. 
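+    /// @note [Editorial sketch, not part of the original patch] The constructor below
+    ///       wired up for a 2-group convolution with asymmetric spatial padding; all
+    ///       ids and tensors are illustrative ("w" and "b" would be data primitives
+    ///       added earlier):
+    /// @code
+    /// topology.add(cldnn::convolution("conv_g2", "in", { "w" }, { "b" },
+    ///     /*groups*/ 2,
+    ///     /*stride*/ { 1, 1, 1, 1 }, /*input_offset*/ { 0, 0, 0, 0 },
+    ///     /*dilation*/ { 1, 1, 1, 1 },
+    ///     /*padding_above*/ { 0, 0, 1, 1 }, /*padding_below*/ { 0, 0, 1, 1 }));
+    /// @endcode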
+ convolution( + const primitive_id& id, + const primitive_id& input, + const std::vector& weights, + const std::vector& bias, + uint32_t groups, + tensor stride, + tensor input_offset, + tensor dilation, + tensor padding_above, + tensor padding_below, + bool with_activation = false, + float activation_slp = 0.0f, + const padding& output_padding = padding() + ) + :primitive_base(id, { input }, output_padding) + , weights(_weights.cpp_ids) + , bias(_bias.cpp_ids) + , weights_quantization_factors(_weights_quantization_factors.cpp_ids) + , output_calibration_factors(_output_calibration_factors.cpp_ids) + , input_quantization_factor(1.0f) + , output_quantization_factor(1.0f) + , input_offset(input_offset) + , stride(stride) + , dilation(dilation) + , with_activation(with_activation) + , activation_negative_slope(activation_slp) + , with_output_size(false) + , groups(groups) + , padding_above(padding_above) + , padding_below(padding_below) + , _weights(weights) + , _bias(bias) + , _weights_quantization_factors(std::vector(0)) + , _output_calibration_factors(std::vector(0)) + { + if ((bias.size() != 0) && (weights.size() != bias.size())) + throw std::runtime_error("convolution's weights/bias count does not match"); + } + + /// @brief Constructs convolution primitive. + /// @param id This primitive id. + /// @param input Input primitive id. + /// @param weights List of primitive ids containing weights data. + /// @param groups Number of filter groups. + /// @param bias List of primitive ids containing bias data. + /// @param stride Defines shift in input buffer between adjacent calculations of output values. + /// @param dilation Defines gaps in the input - dilation rate k=1 is normal convolution, k=2 means skipping one pixel per input, k=4 means skipping 3 pixels. + /// As an example in one dimension, a filter w of size 3 would compute over input x the following: w[0]*x[0] + w[1]*x[1] + w[2]*x[2] for dilation of 1. + /// For dilation 2 the filter would instead compute w[0]*x[0] + w[1]*x[2] + w[2]*x[4]. + /// @param with_activation Enable Relu activation. + /// @param activation_slp Relu activation slope. + /// @param output_size User-defined output data size of the primitive (w/o padding). + convolution( + const primitive_id& id, + const primitive_id& input, + const std::vector& weights, + const std::vector& bias, + uint32_t groups, + tensor stride, + tensor input_offset, + tensor dilation, + bool with_activation, + float activation_slp, + tensor output_size, + const padding& output_padding = padding() + ) + :primitive_base(id, { input }, output_padding) + , weights(_weights.cpp_ids) + , bias(_bias.cpp_ids) + , weights_quantization_factors(_weights_quantization_factors.cpp_ids) + , output_calibration_factors(_output_calibration_factors.cpp_ids) + , input_quantization_factor(1.0f) + , output_quantization_factor(1.0f) + , input_offset(input_offset) + , stride(stride) + , dilation(dilation) + , with_activation(with_activation) + , activation_negative_slope(activation_slp) + , with_output_size(true) + , output_size(output_size) + , groups(groups) + , padding_above(tensor(0, 0, 0, 0)) + , padding_below(tensor(0, 0, 0, 0)) + , _weights(weights) + , _bias(bias) + , _weights_quantization_factors(std::vector(0)) + , _output_calibration_factors(std::vector(0)) + { + if ((bias.size() != 0) && (weights.size() != bias.size())) + throw std::runtime_error("convolution's weights/bias count does not match"); + } + + /// @brief Constructs convolution primitive. + /// @param id This primitive id. 
+ /// @param input Input primitive id. + /// @param weights List of primitive ids containing weights data. + /// @param groups Number of filter groups. + /// @param bias List of primitive ids containing bias data. + /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the convolution window should start calculations. + /// @param stride Defines shift in input buffer between adjacent calculations of output values. + /// @param dilation Defines gaps in the input - dilation rate k=1 is normal convolution, k=2 means skipping one pixel per input, k=4 means skipping 3 pixels. + /// As an example in one dimension, a filter w of size 3 would compute over input x the following: w[0]*x[0] + w[1]*x[1] + w[2]*x[2] for dilation of 1. + /// For dilation 2 the filter would instead compute w[0]*x[0] + w[1]*x[2] + w[2]*x[4]. + /// @param with_activation Enable Relu activation. + /// @param activation_slp Relu activation slope. + convolution( + const primitive_id& id, + const primitive_id& input, + const std::vector& weights, + const std::vector& bias, + uint32_t groups, + tensor stride = { 1, 1, 1, 1 }, + tensor input_offset = { 0,0,0,0 }, + tensor dilation = { 1, 1, 1, 1 }, + bool with_activation = false, + float activation_slp = 0.0f, + const padding& output_padding = padding() + ) + :primitive_base(id, { input }, output_padding) + , weights(_weights.cpp_ids) + , bias(_bias.cpp_ids) + , weights_quantization_factors(_weights_quantization_factors.cpp_ids) + , output_calibration_factors(_output_calibration_factors.cpp_ids) + , input_quantization_factor(1.0f) + , output_quantization_factor(1.0f) + , input_offset(input_offset) + , stride(stride) + , dilation(dilation) + , with_activation(with_activation) + , activation_negative_slope(activation_slp) + , with_output_size(false) + , groups(groups) + , padding_above(tensor(0, 0, 0, 0)) + , padding_below(tensor(0, 0, 0, 0)) + , _weights(weights) + , _bias(bias) + , _weights_quantization_factors(std::vector(0)) + , _output_calibration_factors(std::vector(0)) + { + if ((bias.size() != 0) && (weights.size() != bias.size())) + throw std::runtime_error("convolution's weights/bias count does not match"); + if ((groups > 1) && ((weights.size() != 1) || ((bias.size() != 0) && (bias.size() != 1)))) + throw std::runtime_error("grouped convolution's weights/bias count must be 1"); + } /// @brief Constructs convolution primitive. /// @param id This primitive id. @@ -125,6 +339,9 @@ struct convolution : public primitive_base(0)) + , _weights_quantization_factors(std::vector(0)) + , _output_calibration_factors(std::vector(0)) + { + } + + /// @brief Constructs convolution primitive (w/o bias). + /// @param id This primitive id. + /// @param input Input primitive id. + /// @param weights List of primitive ids containing weights data. + /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the convolution window should start calculations. + /// @param stride Defines shift in input buffer between adjacent calculations of output values. + /// @param dilation Defines gaps in the input - dilation rate k=1 is normal convolution, k=2 means skipping one pixel per input, k=4 means skipping 3 pixels. + /// As an example in one dimension, a filter w of size 3 would compute over input x the following: w[0]*x[0] + w[1]*x[1] + w[2]*x[2] for dilation of 1. + /// For dilation 2 the filter would instead compute w[0]*x[0] + w[1]*x[2] + w[2]*x[4]. 
+ /// @param padding_above Defines a padding added to input image on left (x axis) and top (y axis). + /// @param padding_below Defines a padding added to input image on right (x axis) and bottom (y axis). + /// @param with_activation Enable Relu activation. + /// @param activation_slp Relu activation slope. + convolution( + const primitive_id& id, + const primitive_id& input, + const std::vector& weights, + tensor stride, + tensor input_offset, + tensor dilation, + tensor padding_above, + tensor padding_below, + bool with_activation = false, + float activation_slp = 0.0f, + const padding& output_padding = padding() + ) + :primitive_base(id, { input }, output_padding) + , weights(_weights.cpp_ids) + , bias(_bias.cpp_ids) + , weights_quantization_factors(_weights_quantization_factors.cpp_ids) + , output_calibration_factors(_output_calibration_factors.cpp_ids) + , input_quantization_factor(1.0f) + , output_quantization_factor(1.0f) + , input_offset(input_offset) + , stride(stride) + , dilation(dilation) + , with_activation(with_activation) + , activation_negative_slope(activation_slp) + , with_output_size(false) + , groups(1) + , padding_above(padding_above) + , padding_below(padding_below) + , _weights(weights) + , _bias(std::vector(0)) + , _weights_quantization_factors(std::vector(0)) + , _output_calibration_factors(std::vector(0)) + { + } + + /// @brief Constructs convolution primitive (w/o bias). + /// @param id This primitive id. + /// @param input Input primitive id. + /// @param weights List of primitive ids containing weights data. + /// @param groups Number of filter groups. + /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the convolution window should start calculations. + /// @param stride Defines shift in input buffer between adjacent calculations of output values. + /// @param dilation Defines gaps in the input - dilation rate k=1 is normal convolution, k=2 means skipping one pixel per input, k=4 means skipping 3 pixels. + /// As an example in one dimension, a filter w of size 3 would compute over input x the following: w[0]*x[0] + w[1]*x[1] + w[2]*x[2] for dilation of 1. + /// For dilation 2 the filter would instead compute w[0]*x[0] + w[1]*x[2] + w[2]*x[4]. + /// @param padding_above Defines a padding added to input image on left (x axis) and top (y axis). + /// @param padding_below Defines a padding added to input image on right (x axis) and bottom (y axis). + /// @param with_activation Enable Relu activation. + /// @param activation_slp Relu activation slope. 
+ convolution( + const primitive_id& id, + const primitive_id& input, + const std::vector& weights, + uint32_t groups, + tensor stride, + tensor input_offset, + tensor dilation, + tensor padding_above, + tensor padding_below, + bool with_activation = false, + float activation_slp = 0.0f, + const padding& output_padding = padding() + ) + :primitive_base(id, { input }, output_padding) + , weights(_weights.cpp_ids) + , bias(_bias.cpp_ids) + , weights_quantization_factors(_weights_quantization_factors.cpp_ids) + , output_calibration_factors(_output_calibration_factors.cpp_ids) + , input_quantization_factor(1.0f) + , output_quantization_factor(1.0f) + , input_offset(input_offset) + , stride(stride) + , dilation(dilation) + , with_activation(with_activation) + , activation_negative_slope(activation_slp) + , with_output_size(false) + , groups(groups) + , padding_above(padding_above) + , padding_below(padding_below) + , _weights(weights) + , _bias(std::vector(0)) + , _weights_quantization_factors(std::vector(0)) + , _output_calibration_factors(std::vector(0)) + { + } + + /// @brief Constructs convolution primitive (w/o bias). + /// @param id This primitive id. + /// @param input Input primitive id. + /// @param weights List of primitive ids containing weights data. + /// @param groups Number of filter groups. + /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the convolution window should start calculations. + /// @param stride Defines shift in input buffer between adjacent calculations of output values. + /// @param dilation Defines gaps in the input - dilation rate k=1 is normal convolution, k=2 means skipping one pixel per input, k=4 means skipping 3 pixels. + /// As an example in one dimension, a filter w of size 3 would compute over input x the following: w[0]*x[0] + w[1]*x[1] + w[2]*x[2] for dilation of 1. + /// For dilation 2 the filter would instead compute w[0]*x[0] + w[1]*x[2] + w[2]*x[4]. + /// @param with_activation Enable Relu activation. + /// @param activation_slp Relu activation slope. 
+ convolution( + const primitive_id& id, + const primitive_id& input, + const std::vector& weights, + uint32_t groups, + tensor stride = { 1, 1, 1, 1 }, + tensor input_offset = { 0,0,0,0 }, + tensor dilation = { 1, 1, 1, 1 }, + bool with_activation = false, + float activation_slp = 0.0f, + const padding& output_padding = padding() + ) + :primitive_base(id, { input }, output_padding) + , weights(_weights.cpp_ids) + , bias(_bias.cpp_ids) + , weights_quantization_factors(_weights_quantization_factors.cpp_ids) + , output_calibration_factors(_output_calibration_factors.cpp_ids) + , input_quantization_factor(1.0f) + , output_quantization_factor(1.0f) + , input_offset(input_offset) + , stride(stride) + , dilation(dilation) + , with_activation(with_activation) + , activation_negative_slope(activation_slp) + , with_output_size(false) + , groups(groups) + , padding_above(tensor(0, 0, 0, 0)) + , padding_below(tensor(0, 0, 0, 0)) , _weights(weights) , _bias(std::vector(0)) , _weights_quantization_factors(std::vector(0)) @@ -274,6 +644,9 @@ struct convolution : public primitive_base(0)) @@ -321,6 +694,9 @@ struct convolution : public primitive_base(0)) , _weights_quantization_factors(std::vector(0)) @@ -344,6 +720,9 @@ struct convolution : public primitive_baseactivation_negative_slope) , with_output_size(dto->with_output_size != 0) , output_size(dto->output_size) + , groups(dto->groups) + , padding_above(dto->padding_above) + , padding_below(dto->padding_below) , _weights(dto->weights) , _bias(dto->bias) , _weights_quantization_factors(dto->weights_quantization_factors) @@ -443,6 +822,12 @@ struct convolution : public primitive_base(weights.size()); } @@ -484,7 +869,9 @@ protected: dto.dilation = dilation; dto.with_output_size = with_output_size; dto.output_size = output_size; - + dto.groups = groups; + dto.padding_above = padding_above; + dto.padding_below = padding_below; } }; /// @} diff --git a/inference-engine/thirdparty/clDNN/api/CPP/convolution_grad_weights.hpp b/inference-engine/thirdparty/clDNN/api/CPP/convolution_grad_weights.hpp index 54c361c..2b485b8 100644 --- a/inference-engine/thirdparty/clDNN/api/CPP/convolution_grad_weights.hpp +++ b/inference-engine/thirdparty/clDNN/api/CPP/convolution_grad_weights.hpp @@ -66,6 +66,7 @@ struct convolution_grad_weights : public primitive_base(0)) @@ -81,6 +82,7 @@ struct convolution_grad_weights : public primitive_base(0)) + , _prev_weights_grad(std::vector(0)) + , _prev_bias_grad(std::vector(0)) + { + } + + /// @brief Constructs convolution_grad_weights primitive (w/o bias). + /// @param id This primitive id. + /// @param input Input gradient primitive id. + /// @param input Input primitive id from convolution forward pass. + /// @param weights List of primitive ids containing weights data. + /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the convolution_grad_weights window should start calculations. + /// @param dilation Defines dilation size. + /// @param stride Defines shift in input buffer between adjacent calculations of output values. + /// @param conv_grad Id of primitive which uses weights and biases updated in this primitive. This is for correct order of calculating. Leave empty if primitive is last in backward pass. 
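+    /// @note [Editorial sketch, not part of the original patch] A possible call to the
+    ///       bias-free constructor below inside a training graph; the ids are hypothetical
+    ///       ("relu_grad" carries the incoming gradient, "conv_in" is the forward input):
+    /// @code
+    /// topology.add(cldnn::convolution_grad_weights("conv_gw", "relu_grad", "conv_in",
+    ///     { "w" }, /*stride*/ { 1, 1, 1, 1 }, /*input_offset*/ { 0, 0, 0, 0 },
+    ///     /*dilation*/ { 1, 1, 1, 1 }));
+    /// @endcode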
+    convolution_grad_weights(
+        const primitive_id& id,
+        const primitive_id& input_grad,
+        const primitive_id& input,
+        const std::vector<primitive_id>& weights,
+        tensor stride,
+        tensor input_offset,
+        tensor dilation,
+        const primitive_id& conv_grad = "",
+        const padding& output_padding = padding()
+    )
+        :primitive_base(id, { input_grad, input }, output_padding)
+        , weights(_weights.cpp_ids)
+        , bias(_bias.cpp_ids)
+        , prev_weights_grad(_prev_weights_grad.cpp_ids)
+        , prev_bias_grad(_prev_bias_grad.cpp_ids)
+        , conv_grad(conv_grad)
+        , stride(stride)
+        , input_offset(input_offset)
+        , dilation(dilation)
+        , output_grad_w(false)
         , _weights(weights)
         , _bias(std::vector(0))
         , _prev_weights_grad(std::vector(0))
@@ -144,6 +185,7 @@ struct convolution_grad_weights : public primitive_basestride)
         , input_offset(dto->input_offset)
         , dilation(dto->dilation)
+        , output_grad_w(dto->output_grad_w)
         , _weights(dto->weights)
         , _bias(dto->bias)
         , _prev_weights_grad(dto->prev_weights_grad)
@@ -189,6 +232,8 @@ struct convolution_grad_weights : public primitive_base(weights.size()); }
@@ -226,6 +271,7 @@ protected:
         dto.dilation = dilation;
         dto.split = split();
         dto.stride = stride;
+        dto.output_grad_w = output_grad_w;
         dto.conv_grad = conv_grad.c_str();
         dto.prev_bias_grad = _prev_bias_grad.ref();
         dto.prev_weights_grad = _prev_weights_grad.ref();
diff --git a/inference-engine/thirdparty/clDNN/api/CPP/crop.hpp b/inference-engine/thirdparty/clDNN/api/CPP/crop.hpp
index 7395d18..3d74c96 100644
--- a/inference-engine/thirdparty/clDNN/api/CPP/crop.hpp
+++ b/inference-engine/thirdparty/clDNN/api/CPP/crop.hpp
@@ -28,8 +28,20 @@ namespace cldnn
 /// @addtogroup cpp_primitives Primitives
 /// @{

+
+/// @brief Marker type indicating that, instead of a reference input size, the left, top,
+///        right and bottom borders (to cut out) should be specified.
+///
+/// @details Used to differentiate constructors.
+struct crop_borders_t {};
+
+/// @brief Marker indicating that, instead of a reference input size, the left, top,
+///        right and bottom borders (to cut out) should be specified.
+constexpr auto crop_borders = crop_borders_t{};
+
 /// @brief Performs crop operation on input.
-/// @details Crops the input to the shape of reference_input accross all dimensions taking into account specified input offsets.
+/// @details Crops the input to the shape of reference_input across all dimensions taking into account specified input offsets.
+/// @n The borders variant calculates the output shape from the input shape minus the specified borders.
 /// @n
 /// @n\b Examples
 /// @n Crop without offset example:
 /// \image html crop_no_offset.jpg
 /// @n Crop with offset example:
 /// \image html crop_w_offset.jpg
 /// @n
-/// @n\b Requirements
-/// @n - Input and reference format has to be same
-/// @n - Input, reference and offset layout (order) has to be the same
+/// @n\b Requirements (reference size variant)
 /// @n - Input size cannot be greater than reference size in any dimension
 /// @n - All sizes have to have positive numbers
 /// @n - Reference size plus offset cannot exceed input size
-/// @n Breaking any of this conditions will cause exeption throw.
+/// @n
+/// @n\b Requirements (borders variant)
+/// @n - Borders support batch, feature and spatial dimensions (rest of dimensions ignored).
+/// @n - Input size cannot be greater than reference size in any dimension
+/// @n - All sizes specified in borders have to have non-negative values (positive or @c 0).
+/// @n - Sum of sizes of opposite borders must be lower than input size (on all non-ignored dimensions).
+/// @n
+/// @n Breaking any of these conditions will cause an exception to be thrown.
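+/// @note [Editorial sketch, not part of the original patch] Both borders-variant
+///       constructors declared below; the border tensors are illustrative:
+/// @code
+/// // cut one pixel from each spatial side (symmetric variant)
+/// topology.add(cldnn::crop("crop_sym", "in", { 0, 0, 1, 1 }, cldnn::crop_borders));
+/// // asymmetric variant: left/top borders first, then right/bottom borders
+/// topology.add(cldnn::crop("crop_asym", "in", { 0, 0, 1, 1 }, { 0, 0, 2, 2 }, cldnn::crop_borders));
+/// @endcode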
 struct crop : public primitive_base<crop, CLDNN_PRIMITIVE_DESC(crop)>
 {
     CLDNN_DECLARE_PRIMITIVE(crop)
@@ -66,6 +83,55 @@ struct crop : public primitive_base
     {
     }

+    /// @brief Constructs crop primitive (borders variant).
+    ///
+    /// @details Allows specifying the borders on each side that should be cut out
+    ///          by the primitive.
+    /// @n       NOTE: Borders variant supports only up to four dimensions.
+    ///
+    /// @param id         Identifier of newly created primitive.
+    /// @param input      Identifier of input primitive whose dimensions will be cropped.
+    /// @param lt_borders Border sizes (spatial dimensions define left (X) and top (Y)
+    ///                   borders, non-spatial dimensions - lower borders)
+    /// @param rb_borders Border sizes (spatial dimensions define right (X) and bottom (Y)
+    ///                   borders, non-spatial dimensions - upper borders)
+    crop(
+        const primitive_id& id,
+        const primitive_id& input,
+        const tensor& lt_borders,
+        const tensor& rb_borders,
+        const crop_borders_t,
+        const padding& output_padding = padding()
+    )
+        :primitive_base(id, {input}, output_padding)
+        , reference_input(rb_borders.negate())
+        , offsets(lt_borders)
+    {
+    }
+
+    /// @brief Constructs crop primitive (symmetric borders variant).
+    ///
+    /// @details Allows specifying the borders on each side that should be cut out
+    ///          by the primitive.
+    /// @n       NOTE: Borders variant supports only up to four dimensions.
+    ///
+    /// @param id         Identifier of newly created primitive.
+    /// @param input      Identifier of input primitive whose dimensions will be cropped.
+    /// @param xy_borders Border sizes (symmetric; spatial dimensions define left/right (X)
+    ///                   and top/bottom (Y) borders, non-spatial dimensions - lower/upper borders).
+    crop(
+        const primitive_id& id,
+        const primitive_id& input,
+        const tensor& xy_borders,
+        const crop_borders_t,
+        const padding& output_padding = padding()
+    )
+        :primitive_base(id, {input}, output_padding)
+        , reference_input(xy_borders.negate())
+        , offsets(xy_borders)
+    {
+    }
+
     /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{crop}
     crop(const dto* dto)
         :primitive_base(dto)
diff --git a/inference-engine/thirdparty/clDNN/api/CPP/deconvolution.hpp b/inference-engine/thirdparty/clDNN/api/CPP/deconvolution.hpp
index f1de10d..21607b1 100644
--- a/inference-engine/thirdparty/clDNN/api/CPP/deconvolution.hpp
+++ b/inference-engine/thirdparty/clDNN/api/CPP/deconvolution.hpp
@@ -63,16 +63,88 @@ struct deconvolution : public primitive_base
+    deconvolution(
+        const primitive_id& id,
+        const primitive_id& input,
+        const std::vector<primitive_id>& weights,
+        const std::vector<primitive_id>& bias,
+        uint32_t groups,
+        tensor stride = { 1, 1, 1, 1 },
+        tensor input_offset = { 0,0,0,0 },
+        bool with_activation = false,
+        float activation_slp = 0.0f,
+        const padding& output_padding = padding()
+    )
+        :primitive_base(id, { input }, output_padding)
+        , weights(_weights.cpp_ids)
+        , bias(_bias.cpp_ids)
+        , input_offset(input_offset)
+        , stride(stride)
+        , with_activation(with_activation)
+        , activation_negative_slope(activation_slp)
+        , with_output_size(false)
+        , groups(groups)
+        , _weights(weights)
+        , _bias(bias)
+        , _gradient(false)
+    {
+    }
+
+    /// @brief Constructs deconvolution primitive (w/o bias).
+    /// @param id This primitive id.
+    /// @param input Input primitive id.
+    /// @param weights List of primitive ids containing weights data.
+    /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the deconvolution window should start calculations.
+    /// @param stride Defines shift in input buffer between adjacent calculations of output values.
+    /// @param with_activation Enables Relu activation.
+    /// @param activation_slp Relu activation slope.
+    deconvolution(
+        const primitive_id& id,
+        const primitive_id& input,
+        const std::vector<primitive_id>& weights,
+        tensor stride = { 1, 1, 1, 1 },
+        tensor input_offset = { 0,0,0,0 },
+        bool with_activation = false,
+        float activation_slp = 0.0f,
+        const padding& output_padding = padding(),
+        bool gradient = false
+    )
+        :primitive_base(id, { input }, output_padding)
+        , weights(_weights.cpp_ids)
+        , bias(_bias.cpp_ids)
+        , input_offset(input_offset)
+        , stride(stride)
+        , with_activation(with_activation)
+        , activation_negative_slope(activation_slp)
+        , with_output_size(false)
+        , groups(1)
+        , _weights(weights)
+        , _bias(std::vector<primitive_id>(0))
+        , _gradient(gradient)
+    {
+    }

     /// @brief Constructs deconvolution primitive (w/o bias).
     /// @param id This primitive id.
     /// @param input Input primitive id.
     /// @param weights List of primitive ids containing weights data.
+    /// @param groups Number of filter groups.
     /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the deconvolution window should start calculations.
     /// @param stride Defines shift in input buffer between adjacent calculations of output values.
+ /// @param with_activation Enables Relu activation. + /// @param activation_slp Relu activation slope. + deconvolution( + const primitive_id& id, + const primitive_id& input, + const std::vector& weights, + tensor stride = { 1, 1, 1, 1 }, + tensor input_offset = { 0,0,0,0 }, + bool with_activation = false, + float activation_slp = 0.0f, + const padding& output_padding = padding(), + bool gradient = false + ) + :primitive_base(id, { input }, output_padding) + , weights(_weights.cpp_ids) + , bias(_bias.cpp_ids) + , input_offset(input_offset) + , stride(stride) + , with_activation(with_activation) + , activation_negative_slope(activation_slp) + , with_output_size(false) + , groups(1) + , _weights(weights) + , _bias(std::vector(0)) + , _gradient(gradient) + { + } /// @brief Constructs deconvolution primitive (w/o bias). /// @param id This primitive id. /// @param input Input primitive id. /// @param weights List of primitive ids containing weights data. + /// @param groups Number of filter groups. /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the deconvolution window should start calculations. /// @param stride Defines shift in input buffer between adjacent calculations of output values. /// @param with_activation Enables Relu activation. @@ -81,6 +153,7 @@ struct deconvolution : public primitive_base& weights, + uint32_t groups, tensor stride = { 1, 1, 1, 1 }, tensor input_offset = { 0,0,0,0 }, bool with_activation = false, @@ -96,6 +169,7 @@ struct deconvolution : public primitive_base(0)) , _gradient(gradient) @@ -133,12 +207,54 @@ struct deconvolution : public primitive_base& weights, + const std::vector& bias, + uint32_t groups, + tensor stride, + tensor input_offset, + bool with_activation, + float activation_slp, + tensor output_size, + const padding& output_padding = padding() + ) + :primitive_base(id, { input }, output_padding) + , weights(_weights.cpp_ids) + , bias(_bias.cpp_ids) + , input_offset(input_offset) + , stride(stride) + , with_activation(with_activation) + , activation_negative_slope(activation_slp) + , with_output_size(true) + , output_size(output_size) + , groups(groups) , _weights(weights) , _bias(bias) , _gradient(false) { } + /// @brief Constructs deconvolution primitive (w/o bias, computes input paddings to match output size). /// @param id This primitive id. /// @param input Input primitive id. @@ -169,6 +285,7 @@ struct deconvolution : public primitive_base(0)) , _gradient(gradient) @@ -186,6 +303,7 @@ struct deconvolution : public primitive_baseactivation_negative_slope) , with_output_size(dto->with_output_size != 0) , output_size(dto->output_size) + , groups(dto->groups) , _weights(dto->weights) , _bias(dto->bias) , _gradient(dto->gradient != 0) @@ -264,6 +382,8 @@ struct deconvolution : public primitive_base(weights.size()); } @@ -299,9 +419,10 @@ protected: dto.with_output_size = with_output_size; dto.output_size = output_size; dto.gradient = _gradient; + dto.groups = groups; } }; /// @} /// @} /// @} -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/api/CPP/depth_to_space.hpp b/inference-engine/thirdparty/clDNN/api/CPP/depth_to_space.hpp new file mode 100644 index 0000000..d083103 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/api/CPP/depth_to_space.hpp @@ -0,0 +1,72 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma once
+
+#include "../C/depth_to_space.h"
+#include "primitive.hpp"
+
+namespace cldnn
+{
+/// @addtogroup cpp_api C++ API
+/// @{
+/// @addtogroup cpp_topology Network Topology
+/// @{
+/// @addtogroup cpp_primitives Primitives
+/// @{
+
+/// @brief Rearranges data from the depth (feature) dimension into blocks of spatial data.
+/// @details For @p block_size k, an input of shape (b, f, y, x) produces an output of shape (b, f / (k*k), y * k, x * k).
+struct depth_to_space : public primitive_base<depth_to_space, CLDNN_PRIMITIVE_DESC(depth_to_space)>
+{
+    CLDNN_DECLARE_PRIMITIVE(depth_to_space)
+
+    /// @brief Constructs depth_to_space primitive.
+    /// @param id This primitive id.
+    /// @param input Input primitive id.
+    /// @param block_size Block size.
+    depth_to_space(
+        const primitive_id& id,
+        const primitive_id& input,
+        const size_t block_size,
+        const padding& output_padding = padding()
+    )
+        : primitive_base(id, {input}, output_padding)
+        , block_size(block_size)
+    {
+    }
+
+    /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{depth_to_space}
+    depth_to_space(const dto* dto)
+        : primitive_base(dto)
+        , block_size(dto->block_size)
+    {
+    }
+
+    /// @brief Block size.
+    size_t block_size;
+protected:
+
+    void update_dto(dto& dto) const override
+    {
+        dto.block_size = block_size;
+    }
+};
+/// @}
+/// @}
+/// @}
+}
diff --git a/inference-engine/thirdparty/clDNN/api/CPP/detection_output.hpp b/inference-engine/thirdparty/clDNN/api/CPP/detection_output.hpp
index 8d3d75c..87ea568 100644
--- a/inference-engine/thirdparty/clDNN/api/CPP/detection_output.hpp
+++ b/inference-engine/thirdparty/clDNN/api/CPP/detection_output.hpp
@@ -18,6 +18,7 @@
 #pragma once
 #include
 #include "../C/detection_output.h"
+#include "../C/detection_output_sort.h"
 #include "primitive.hpp"

 namespace cldnn
@@ -39,7 +40,7 @@ enum class prior_box_code_type : int32_t

 /// @brief Generates a list of detections based on location and confidence predictions by doing non maximum suppression.
 /// @details Each row is a 7 dimension vector, which stores: [image_id, label, confidence, xmin, ymin, xmax, ymax].
-///          If number of detections per image is lower than keep_top_k, will write dummy results at the end with image_id=-1.
+///          If number of detections per image is lower than keep_top_k, will write dummy results at the end with image_id=-1.
 struct detection_output : public primitive_base
 {
     CLDNN_DECLARE_PRIMITIVE(detection_output)
@@ -80,7 +81,8 @@ struct detection_output : public primitive_base
         , input_width(dto->input_width)
         , input_height(dto->input_height)
         , decrease_label_id(dto->decrease_label_id != 0)
-        , clip(dto->clip != 0)
+        , clip_before_nms(dto->clip_before_nms != 0)
+        , clip_after_nms(dto->clip_after_nms != 0)
     {
         if (decrease_label_id && background_label_id != 0)
             throw std::invalid_argument("Cannot use decrease_label_id and background_label_id parameter simultaneously.");
@@ -163,8 +167,10 @@ struct detection_output : public primitive_base
+struct detection_output_sort : public primitive_base<detection_output_sort, CLDNN_PRIMITIVE_DESC(detection_output_sort)>
+{
+    CLDNN_DECLARE_PRIMITIVE(detection_output_sort)
+
+    /// @brief Constructs detection_output_sort primitive.
+    /// @param id This primitive id.
+    /// @param input_bboxes Input bounding boxes primitive id.
+    /// @param num_images Number of images to be predicted.
+    /// @param num_classes Number of classes to be predicted.
+    /// @param keep_top_k Number of total bounding boxes to be kept per image after NMS step.
+    /// @param share_location If true, bounding boxes are shared among different classes.
+    /// @param top_k Maximum number of results to be kept in NMS.
+    /// @param output_padding Output padding.
+    detection_output_sort(
+        const primitive_id& id,
+        const primitive_id& input_bboxes,
+        const uint32_t num_images,
+        const uint32_t num_classes,
+        const uint32_t keep_top_k,
+        const bool share_location = true,
+        const int top_k = -1,
+        const int background_label_id = -1,
+        const padding& output_padding = padding()
+    )
+        : primitive_base(id, { input_bboxes }, output_padding)
+        , num_images(num_images)
+        , num_classes(num_classes)
+        , keep_top_k(keep_top_k)
+        , share_location(share_location)
+        , top_k(top_k)
+        , background_label_id(background_label_id)
+    {}
+
+    /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{detection_output_sort}
+    detection_output_sort(const dto* dto)
+        : primitive_base(dto)
+        , num_images(dto->num_images)
+        , num_classes(dto->num_classes)
+        , keep_top_k(dto->keep_top_k)
+        , share_location(dto->share_location != 0)
+        , top_k(dto->top_k)
+        , background_label_id(dto->background_label_id)
+    {}
+
+    /// @brief Number of images to be predicted.
+    const uint32_t num_images;
+    /// @brief Number of classes to be predicted.
+    const uint32_t num_classes;
+    /// @brief Number of total bounding boxes to be kept per image after NMS step.
+    const int keep_top_k;
+    /// @brief If true, bounding boxes are shared among different classes.
+    const bool share_location;
+    /// @brief Maximum number of results to be kept in NMS.
+    const int top_k;
+    /// @brief Background label id (-1 if there is no background class).
+    const int background_label_id;
+
+
+protected:
+    void update_dto(dto& dto) const override
+    {
+        dto.num_classes = num_classes;
+        dto.num_images = num_images;
+        dto.keep_top_k = keep_top_k;
+        dto.share_location = share_location;
+        dto.top_k = top_k;
+        dto.background_label_id = background_label_id;
     }
 };
 /// @}
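[Editorial sketch, not part of the original patch] A minimal use of the detection_output_sort primitive added above; the ids and counts are illustrative:

    topology.add(cldnn::detection_output_sort(
        "dets_sorted", /*input_bboxes*/ "dets_raw",
        /*num_images*/ 1, /*num_classes*/ 21, /*keep_top_k*/ 200,
        /*share_location*/ true, /*top_k*/ 400));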
diff --git a/inference-engine/thirdparty/clDNN/api/CPP/eltwise.hpp b/inference-engine/thirdparty/clDNN/api/CPP/eltwise.hpp
index f1b2084..619be49 100644
--- a/inference-engine/thirdparty/clDNN/api/CPP/eltwise.hpp
+++ b/inference-engine/thirdparty/clDNN/api/CPP/eltwise.hpp
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2019 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -45,14 +45,35 @@ enum class eltwise_mode : int32_t
     min = cldnn_eltwise_min,
     /// @brief Eltwise pow.
     pow = cldnn_eltwise_pow,
+    /// @brief Eltwise squared diff.
+    squared_diff = cldnn_eltwise_squared_diff,
     /// @brief Eltwise mod.
     mod = cldnn_eltwise_mod,
+    /// @brief Eltwise equal.
+    eq = cldnn_eltwise_eq,
+    /// @brief Eltwise not equal.
+    ne = cldnn_eltwise_ne,
+    /// @brief Eltwise less.
+    lt = cldnn_eltwise_lt,
+    /// @brief Eltwise less or equal.
+    le = cldnn_eltwise_le,
+    /// @brief Eltwise greater.
+    gt = cldnn_eltwise_gt,
+    /// @brief Eltwise greater or equal.
+    ge = cldnn_eltwise_ge,
+    /// @brief Eltwise and.
+    logic_and = cldnn_eltwise_and,
+    /// @brief Eltwise or.
+    logic_or = cldnn_eltwise_or,
+    /// @brief Eltwise XOR.
+    logic_xor = cldnn_eltwise_xor
 };

 /// @brief Performs elementwise operations (sum, subtract, max or product) on two input primitives
 /// Also supports built-in Relu @ref activation available by setting it in arguments.
 /// @notes
-/// - both inputs have to have equal sizes in all dimensions
+/// - both inputs have to have equal sizes in all dimensions, or the input tensors must be broadcastable
+///   to the same shape (in which the size of each dimension is the max of the input sizes on this dimension)
 /// - format of both inputs has to be the same
 /// - when using integer types, only following eltwise modes are supported: sum, sub, prod, div
 struct eltwise : public primitive_base
@@ -82,6 +103,38 @@ struct eltwise : public primitive_base
         , coefficients(std::vector(0))
         , with_activation(with_activation)
         , activation_negative_slope(activation_slp)
+        , stride(std::vector<tensor>(0))
+        , _stride(tensor_vector_to_cldnn_vector(stride))
+    {
+    }
+
+    /// @brief Constructs eltwise primitive.
+    /// @param id This primitive id.
+    /// @param input Input primitive id.
+    /// @param input2 Second input primitive id with values needed for eltwise computation.
+    /// @param stride Defines shift in input buffers between adjacent calculations of output values.
+    /// @param mode Eltwise mode.
+    /// @param with_activation Enables Relu activation.
+    /// @param activation_slp Relu activation slope.
+    eltwise(
+        const primitive_id& id,
+        const primitive_id& input,
+        const primitive_id& input2,
+        std::vector<tensor> stride,
+        eltwise_mode mode,
+        bool with_activation = false,
+        float activation_slp = 0.0f,
+        const padding& output_padding = padding()
+    )
+        :primitive_base(id, { input, input2 }, output_padding)
+        , output_calibration_factors("")
+        , output_quantization_factor(1.0f)
+        , mode(mode)
+        , coefficients(std::vector(0))
+        , with_activation(with_activation)
+        , activation_negative_slope(activation_slp)
+        , stride(stride)
+        , _stride(tensor_vector_to_cldnn_vector(stride))
     {
     }
@@ -106,6 +159,8 @@ struct eltwise : public primitive_base
         , coefficients(std::vector(0))
         , with_activation(with_activation)
         , activation_negative_slope(activation_slp)
+        , stride(std::vector<tensor>(0))
+        , _stride(tensor_vector_to_cldnn_vector(stride))
     {
     }
@@ -134,6 +189,8 @@ struct eltwise : public primitive_base
         , coefficients(std::vector(0))
         , with_activation(with_activation)
         , activation_negative_slope(activation_slp)
+        , stride(std::vector<tensor>(0))
+        , _stride(tensor_vector_to_cldnn_vector(stride))
     {
     }
@@ -160,6 +217,8 @@ struct eltwise : public primitive_base
         , coefficients(std::vector(0))
         , with_activation(with_activation)
         , activation_negative_slope(activation_slp)
+        , stride(std::vector<tensor>(0))
+        , _stride(tensor_vector_to_cldnn_vector(stride))
     {
     }
@@ -188,6 +247,8 @@ struct eltwise : public primitive_base
         , coefficients(std::vector(0))
         , with_activation(with_activation)
         , activation_negative_slope(activation_slp)
+        , stride(std::vector<tensor>(0))
+        , _stride(tensor_vector_to_cldnn_vector(stride))
     {
     }
@@ -214,6 +275,8 @@ struct eltwise : public primitive_base
         , coefficients(std::vector(0))
         , with_activation(with_activation)
         , activation_negative_slope(activation_slp)
+        , stride(std::vector<tensor>(0))
+        , _stride(tensor_vector_to_cldnn_vector(stride))
     {
     }
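[Editorial sketch, not part of the original patch] The new comparison modes produce element-wise 0/1 masks through the pre-existing two-input constructor; for example, with hypothetical ids:

    // element-wise "greater than" mask over two same-shaped inputs
    topology.add(cldnn::eltwise("gt_mask", "lhs", "rhs", cldnn::eltwise_mode::gt));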
@@ -240,6 +303,8 @@ struct eltwise : public primitive_base
         , coefficients(coefficients)
         , with_activation(with_activation)
         , activation_negative_slope(activation_slp)
+        , stride(std::vector<tensor>(0))
+        , _stride(tensor_vector_to_cldnn_vector(stride))
     {
         if (mode == eltwise_mode::sum && !coefficients.empty() && coefficients.size() != inputs.size())
         {
@@ -260,6 +325,8 @@ struct eltwise : public primitive_base
         , coefficients(float_arr_to_vector(dto->coefficients))
         , with_activation(dto->with_activation != 0)
         , activation_negative_slope(dto->activation_negative_slope)
+        , stride(tensor_arr_to_vector(dto->stride))
+        , _stride(tensor_vector_to_cldnn_vector(stride))
     {
         if (dto->input.size < 2)
             throw std::invalid_argument("eltiwise dto should containt at least two inputs");
@@ -279,8 +346,11 @@ struct eltwise : public primitive_base
     bool with_activation;
     /// @brief Relu activation slope.
     float activation_negative_slope;
+    /// @brief Defines shift in input buffers between adjacent calculations of output values.
+    std::vector<tensor> stride;

 protected:
+    std::vector<cldnn_tensor> _stride;
     std::vector> get_dependencies() const override
     {
         std::vector> ret;
@@ -298,6 +368,7 @@
         dto.coefficients = float_vector_to_arr(coefficients);
         dto.with_activation = with_activation;
         dto.activation_negative_slope = activation_negative_slope;
+        dto.stride = tensor_vector_to_arr(_stride);
     }
 };
 /// @}
diff --git a/inference-engine/thirdparty/clDNN/api/CPP/embed.hpp b/inference-engine/thirdparty/clDNN/api/CPP/embed.hpp
index 8acb967..0c1d492 100644
--- a/inference-engine/thirdparty/clDNN/api/CPP/embed.hpp
+++ b/inference-engine/thirdparty/clDNN/api/CPP/embed.hpp
@@ -56,6 +56,19 @@ namespace cldnn
         , bias(bias)
     {}

+    /// @brief Constructs embed primitive.
+    /// @param id This primitive id.
+    /// @param input Input primitive id.
+    /// @param weights Primitive id containing weights data.
+    embed(
+        const primitive_id& id,
+        const primitive_id& input,
+        const primitive_id& weights
+    )
+        : primitive_base(id, { input })
+        , weights(weights)
+        , bias("")
+    {}
+
     /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{embed}
     embed(const dto* dto)
         :primitive_base(dto)
diff --git a/inference-engine/thirdparty/clDNN/api/CPP/engine.hpp b/inference-engine/thirdparty/clDNN/api/CPP/engine.hpp
index 83090a1..66fbd2b 100644
--- a/inference-engine/thirdparty/clDNN/api/CPP/engine.hpp
+++ b/inference-engine/thirdparty/clDNN/api/CPP/engine.hpp
@@ -63,8 +63,12 @@ struct engine_configuration
     const std::string engine_log;        ///< Specifies a file to which engine log should be dumped. Empty by default (means no logging).
     const std::string sources_dumps_dir; ///< Specifies a directory where sources of cldnn::program objects should be dumped. Empty by default (means no dumping).
     const priority_mode_types priority_mode; ///< Priority mode (support of priority hints in command queue). If cl_khr_priority_hints extension is not supported by current OpenCL implementation, the value must be set to cldnn_priority_disabled.
-    const throttle_mode_types throttle_mode; ///< Placeholder for throttle mode (support of throttle hints in command queue). It has no effect for now and should be set to cldnn_throttle_disabled.
-    bool enable_memory_pool; ///< Enables memory usage optimization. memory objects will be reused when possible (switched off for older drivers then NEO).
+
+    const throttle_mode_types throttle_mode; ///< Throttle mode (support of throttle hints in command queue). If cl_khr_throttle_hints extension is not supported by current OpenCL implementation, the value must be set to cldnn_throttle_disabled.
+
+    bool enable_memory_pool; ///< Enables memory usage optimization. Memory objects will be reused when possible (switched off for drivers older than NEO).
+    void* context; ///< Pointer to user context.
+    const std::string tuning_cache_path; ///< Path to tuning kernel cache.

     /// @brief Constructs engine configuration with specified options.
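    /// @note [Editorial sketch, not part of the original patch] With every argument
    ///       defaulted, the two new fields resolve to a null user context and a
    ///       tuning cache file named "cache.json":
    /// @code
    /// cldnn::engine_configuration cfg; // all defaults
    /// cldnn::engine eng(cfg);
    /// @endcode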
/// @param profiling Enable per-primitive profiling. @@ -83,7 +87,9 @@ struct engine_configuration const std::string& sources_dumps_dir = std::string(), priority_mode_types priority_mode = priority_mode_types::disabled, throttle_mode_types throttle_mode = throttle_mode_types::disabled, - bool memory_pool = true) + bool memory_pool = true, + void* context = nullptr, + const std::string& tuning_cache_path = "cache.json") : enable_profiling(profiling) , meaningful_kernels_names(decorate_kernel_names) , dump_custom_program(dump_custom_program) @@ -95,6 +101,8 @@ struct engine_configuration , priority_mode(priority_mode) , throttle_mode(throttle_mode) , enable_memory_pool(memory_pool) + , context(context) + , tuning_cache_path(tuning_cache_path) {} engine_configuration(const cldnn_engine_configuration& c_conf) @@ -109,6 +117,8 @@ struct engine_configuration , priority_mode(static_cast(c_conf.priority_mode)) , throttle_mode(static_cast(c_conf.throttle_mode)) , enable_memory_pool(c_conf.enable_memory_pool != 0) + , context(c_conf.context) + , tuning_cache_path(c_conf.tuning_cache_path) {} /// @brief Implicit conversion to C API @ref ::cldnn_engine_configuration @@ -125,7 +135,9 @@ struct engine_configuration sources_dumps_dir.c_str(), static_cast(priority_mode), static_cast(throttle_mode), - enable_memory_pool + enable_memory_pool, + context, + tuning_cache_path.c_str() }; } }; diff --git a/inference-engine/thirdparty/clDNN/api/CPP/gather.hpp b/inference-engine/thirdparty/clDNN/api/CPP/gather.hpp new file mode 100644 index 0000000..68669c1 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/api/CPP/gather.hpp @@ -0,0 +1,88 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include "../C/gather.h" +#include "primitive.hpp" + +namespace cldnn +{ +/// @addtogroup cpp_api C++ API +/// @{ +/// @addtogroup cpp_topology Network Topology +/// @{ +/// @addtogroup cpp_primitives Primitives +/// @{ + +/// @brief +/// @details +struct gather : public primitive_base +{ + CLDNN_DECLARE_PRIMITIVE(gather) + + enum gather_axis + { + along_b = cldnn_gather_along_b, + along_f = cldnn_gather_along_f, + along_x = cldnn_gather_along_x, + along_y = cldnn_gather_along_y + }; + + /// @brief Constructs gather primitive. + /// @param id This primitive id. + /// @param dict Input dictionary primitive id. + /// @param idx Input indexes primitive id. + /// @param axis Gathering axis. + /// @param output_shape Output shape. 
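+    /// @note [Editorial sketch, not part of the original patch] Gathering four rows
+    ///       of a dictionary along the batch axis; ids and shapes are illustrative
+    ///       (dict: (8, 2, 3, 3), idx holds 4 indices, output: (4, 2, 3, 3)):
+    /// @code
+    /// topology.add(cldnn::gather("gathered", "dict", "idx", cldnn::gather::along_b, { 4, 2, 3, 3 }));
+    /// @endcode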
+    gather(
+        const primitive_id& id,
+        const primitive_id& dict,
+        const primitive_id& idx,
+        const gather_axis axis,
+        const tensor& output_shape,
+        const padding& output_padding = padding()
+    )
+        : primitive_base(id, {dict, idx}, output_padding)
+        , axis(axis)
+        , output_shape(output_shape)
+    {
+    }
+
+    /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{gather}
+    gather(const dto* dto)
+        : primitive_base(dto)
+        , axis(static_cast<gather_axis>(dto->axis))
+        , output_shape(dto->output_shape)
+    {
+    }
+
+    /// @brief Gathering axis.
+    gather_axis axis;
+    /// @brief Gather output shape.
+    tensor output_shape;
+protected:
+
+    void update_dto(dto& dto) const override
+    {
+        dto.axis = static_cast<cldnn_gather_axis>(axis);
+        dto.output_shape = output_shape;
+    }
+};
+/// @}
+/// @}
+/// @}
+}
diff --git a/inference-engine/thirdparty/clDNN/api/CPP/gemm.hpp b/inference-engine/thirdparty/clDNN/api/CPP/gemm.hpp
index ee25c70..1c3bc11 100644
--- a/inference-engine/thirdparty/clDNN/api/CPP/gemm.hpp
+++ b/inference-engine/thirdparty/clDNN/api/CPP/gemm.hpp
@@ -49,10 +49,10 @@ struct gemm : public primitive_base
     /// @brief Constructs gemm layer.
     /// @brief Primitive id containing first matrix
     /// @brief Primitive id containing second matrix
-    /// @brief Variable containing ALPHA parameter
-    /// @brief Variable containing BETA parameter
     /// @brief Flag for transposing first input matrix
     /// @brief Flag for transposing second input matrix
+    /// @brief Variable containing ALPHA parameter
+    /// @brief Variable containing BETA parameter

     gemm(
         const primitive_id& id,
@@ -75,10 +75,11 @@ struct gemm : public primitive_base
     /// @brief Primitive id containing first matrix
     /// @brief Primitive id containing second matrix
     /// @brief Primitive id containing third matrix
-    /// @brief Variable containing ALPHA parameter
-    /// @brief Variable containing BETA parameter
     /// @brief Flag for transposing first input matrix
     /// @brief Flag for transposing second input matrix
+    /// @brief Variable containing ALPHA parameter
+    /// @brief Variable containing BETA parameter
+
     gemm(
         const primitive_id& id,
         const primitive_id& input,
diff --git a/inference-engine/thirdparty/clDNN/api/CPP/index_select.hpp b/inference-engine/thirdparty/clDNN/api/CPP/index_select.hpp
index 11ff25a..5897533 100644
--- a/inference-engine/thirdparty/clDNN/api/CPP/index_select.hpp
+++ b/inference-engine/thirdparty/clDNN/api/CPP/index_select.hpp
@@ -21,15 +21,6 @@ namespace cldnn
 {

-/// @brief Axis which index_select primitive will index.
-enum class index_select_axis_name : int32_t
-{
-    along_b,
-    along_f,
-    along_y,
-    along_x
-};
-
 /// @brief Select index, which will be copied to the output.
 ///
 /// @details Applies index selecting along specified dimension. The indices, which will be copied are specified by
@@ -63,7 +54,7 @@ struct index_select : public primitive_base
+    index_select(
+        const primitive_id& id,
+        const primitive_id& input,
+        const std::vector<index_select_axis_name>& axis = { index_select_axis_name::along_b },
+        const padding& output_padding = padding()
+    )
+        : primitive_base(id, { input }, output_padding)
         , axis(axis)
+        , reverse(true)
     {}

     /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{broadcast}
     index_select(const dto* dto)
         : primitive_base(dto)
-        , axis(static_cast(dto->axis))
+        , axis(dto->axis, dto->axis + dto->axis_num)
+        , reverse(dto->reverse)
     {}

-    /// @brief Axis of index selecting.
-    index_select_axis_name axis;
+    /// @brief A list of axes of index selecting.
+    std::vector<index_select_axis_name> axis;
+    /// @brief Do index_select in reverse order on axis/axes.
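+    /// @note [Editorial sketch, not part of the original patch] The reverse
+    ///       constructor above flips the data along the selected axes, e.g.
+    ///       reversing the Y axis of an input (ids are hypothetical):
+    /// @code
+    /// topology.add(cldnn::index_select("flip_y", "in", { index_select_axis_name::along_y }));
+    /// @endcode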
+    bool reverse;
 protected:

     void update_dto(dto& dto) const override
     {
-        dto.axis = static_cast(axis);
+        dto.axis = axis.data();
+        dto.axis_num = (int)axis.size();
+        dto.reverse = reverse;
     }
 };
 /// @}
diff --git a/inference-engine/thirdparty/clDNN/api/CPP/layout.hpp b/inference-engine/thirdparty/clDNN/api/CPP/layout.hpp
index 56b1998..1f94384 100644
--- a/inference-engine/thirdparty/clDNN/api/CPP/layout.hpp
+++ b/inference-engine/thirdparty/clDNN/api/CPP/layout.hpp
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2018 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -39,6 +39,40 @@ enum class data_types : size_t
     f32 = cldnn_f32,
 };

+class optional_data_type
+{
+    // Must be the same as the underlying type of `data_types`.
+    using storage_type = size_t;
+
+    // Implicitly assumes that this value is not used in the `data_types`.
+    static constexpr auto non_specified_type =
+        std::numeric_limits<storage_type>::max();
+
+public:
+    optional_data_type()
+        : storage(non_specified_type)
+    {}
+
+    optional_data_type(data_types type)
+        : storage(static_cast<storage_type>(type))
+    {}
+
+    operator bool() const { return storage != non_specified_type; }
+
+    // Similarly to std::optional, does *not* verify that the object has the type
+    // set. Unlike it, though, returns the value instead of pointer/reference.
+    data_types operator*() const { return static_cast<data_types>(storage); }
+
+    optional_data_type& operator=(const data_types new_type)
+    {
+        storage = static_cast<storage_type>(new_type);
+        return *this;
+    }
+
+private:
+    storage_type storage;
+};
+
 /// Converts C++ type to @ref data_types .
 template struct type_to_data_type;
 #ifndef DOXYGEN_SHOULD_SKIP_THIS
@@ -98,6 +132,8 @@ struct data_type_traits
         {
         case data_types::i8:
             return "i8";
+        case data_types::u8:
+            return "u8";
         case data_types::i32:
             return "i32";
         case data_types::i64:
@@ -312,6 +348,11 @@ struct layout
             sizes[3] = align_to(sizes[3], 32);
         }
+        if (format == format::byx8_f4)
+        {
+            sizes[3] = align_to(sizes[3], 4);
+            sizes[2] = align_to(sizes[2], 8);
+        }
         std::vector pitches(sizes.size(), tensor::value_type(1));
         std::partial_sum(sizes.rbegin(), sizes.rend() - 1, pitches.rbegin() + 1, std::multiplies());
         return{ format, pitches };
@@ -352,6 +393,14 @@ struct layout
         {
             sizes[0] = align_to(sizes[0], 16);
         }
+        else if (this->format == cldnn::format::os_iyx_osv32 && !is_aligned_to(sizes[0], 32))
+        {
+            sizes[0] = align_to(sizes[0], 32);
+        }
+        else if (this->format == cldnn::format::os_iyx_osv64 && !is_aligned_to(sizes[0], 64))
+        {
+            sizes[0] = align_to(sizes[0], 64);
+        }
         else if (this->format == cldnn::format::bs_xs_xsv8_bsv8 && !(is_aligned_to(sizes[0], 8) && is_aligned_to(sizes[2], 8)))
         {
             sizes[0] = align_to(sizes[0], 8);
@@ -376,20 +425,49 @@ struct layout
         {
             sizes[1] = align_to(sizes[1], 32);
         }
+        else if (this->format == cldnn::format::byx8_f4 && (!is_aligned_to(sizes[1], 4) || !is_aligned_to(sizes[2], 8)))
+        {
+            // for this case we want to make sure, that with padding we're aligned to 8 in x
+            auto lp = data_padding.lower_size().spatial[0];
+            auto up = data_padding.upper_size().spatial[0];
+            sizes[1] = align_to(sizes[1], 4);
+            sizes[2] = align_to(lp + up + sizes[2], 8);
+            sizes[2] -= lp + up;
+        }
         else if (this->format == cldnn::format::fs_bs_yx_bsv4_fsv32 && (!(is_aligned_to(sizes[1], 32)) || !(is_aligned_to(sizes[0], 4)) ) )
        {
             sizes[1] = align_to(sizes[1], 32);
             sizes[0] = align_to(sizes[0], 4);
         }
+        else if (this->format == cldnn::format::b_fs_yx_fsv4 &&
+            !(is_aligned_to(sizes[1], 4)))
+        {
+            sizes[1] = align_to(sizes[1], 4);
+        }
         else if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4 && !(is_aligned_to(sizes[0], 8)) && !(is_aligned_to(sizes[1], 32)))
         {
             sizes[0] = align_to(sizes[0], 8);
             sizes[1] = align_to(sizes[1], 32);
         }
+        else if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4_swizzled_by_4 && !(is_aligned_to(sizes[0], 32)) && !(is_aligned_to(sizes[1], 32)))
+        {
+            sizes[0] = align_to(sizes[0], 32);
+            sizes[1] = align_to(sizes[1], 32);
+        }
         else if (this->format == cldnn::format::is_o_yx_isv32 && !(is_aligned_to(sizes[1], 32)))
         {
             sizes[1] = align_to(sizes[1], 32);
         }
+        else if (this->format == cldnn::format::is_o32_yx_isv32_swizzled_by_4 && (!is_aligned_to(sizes[1], 32) || !(is_aligned_to(sizes[0], 32))))
+        {
+            sizes[0] = align_to(sizes[0], 32);
+            sizes[1] = align_to(sizes[1], 32);
+        }
+        else if (this->format == cldnn::format::os_is_y_x8_osv8_isv4)
+        {
+            sizes[1] = align_to(sizes[1], 4);
+            sizes[0] = align_to(sizes[0], 8);
+            sizes[2] = align_to(sizes[2], 8);
+        }
         return std::accumulate(
             sizes.begin(),
             sizes.end(),
diff --git a/inference-engine/thirdparty/clDNN/api/CPP/lstm.hpp b/inference-engine/thirdparty/clDNN/api/CPP/lstm.hpp
index dd9e992..2276616 100644
--- a/inference-engine/thirdparty/clDNN/api/CPP/lstm.hpp
+++ b/inference-engine/thirdparty/clDNN/api/CPP/lstm.hpp
@@ -29,14 +29,14 @@ namespace cldnn
 /// @{
 /// @brief Performs forward Long Short-Term Memory (LSTM) layer.
-/// @details The current implementation of LSTM supports Peepholes.
-///     it = f(Xt*(Wi^T) + Ht-1*Ri + Pi (.) Ct-1 + Wbi + Rbi)
-///     ft = f(Xt*(Wf^T) + Ht-1*Rf + Pf (.) Ct-1 + Wbf + Rbf)
-///     ct = g(Xt*(Wc^T) + Ht-1*Rc + Wbc + Rbc)
+/// @details The current implementation of LSTM is described by the following equations.
+///     it = f(Xt*(Wi^T) + Ht-1*Ri + Wbi)
+///     ft = f(Xt*(Wf^T) + Ht-1*Rf + Wbf)
+///     ct = g(Xt*(Wc^T) + Ht-1*Rc + Wbc)
 ///     Ct = ft (.) Ct-1 + it (.) ct
-///     ot = f(Xt*(Wo^T) + Ht-1*Ro + Po (.) Ct + Wbo + Rbo)
+///     ot = f(Xt*(Wo^T) + Ht-1*Ro + Wbo)
 ///     Ht = ot (.) h(Ct)
-/// Where f=Sigmoid, g=Tanh, and h = Tanh.
+/// Where f = Sigmoid, g = Tanh, and h = Tanh.
 struct lstm : public primitive_base
 {
     CLDNN_DECLARE_PRIMITIVE(lstm)
@@ -53,6 +53,7 @@ struct lstm : public primitive_base
     /// @param input_forget Provide 0 if using lstm without coupled input-forget gates.
     /// @param activations Vector of activations. Specify [f, g, h]. Default are [sigmoid, tanh, tanh]
     /// @param activation_params Vector of ativation params. Specify params for each [f, g, h] activation.
+    /// @param output_selection Output selection. By default, the entire hidden sequence is returned.
     /// @param offset_order Order of the concatenated weights, recurrent, and bias. ONNX default is iofz [input, output, forget, block].
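// Illustrative sketch: requesting only the final hidden state through the new
// output_selection argument. The ids are hypothetical, the leading arguments
// follow the constructor below, and the cldnn_lstm_output_hidden enumerator is
// assumed from the C header.
topo.add(cldnn::lstm(
    "rnn", { "embedded" }, "weights", "recurrent",
    /*bias=*/"", /*initial_hidden=*/"", /*initial_cell=*/"", /*peepholes=*/"",
    /*clip=*/0.f, /*input_forget=*/false,
    /*activations=*/{}, /*activation_params=*/{},
    cldnn_lstm_output_hidden, cldnn_lstm_offset_order_iofz));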
     lstm(
         const primitive_id& id,
@@ -67,6 +68,7 @@ struct lstm : public primitive_base
         const bool input_forget = 0,
         const std::vector& activations = {},
         const std::vector activation_params = {},
+        const cldnn_lstm_output output_selection = cldnn_lstm_output_sequence,
         const cldnn_lstm_offset_order offset_order = cldnn_lstm_offset_order_iofz,
         const padding& output_padding = padding()
     )
@@ -81,6 +83,7 @@ struct lstm : public primitive_base
         , input_forget(input_forget)
         , activations(activations)
         , activation_params(activation_params)
+        , output_selection(output_selection)
         , offset_order(offset_order)
     {
     }
@@ -98,6 +101,7 @@ struct lstm : public primitive_base
         , input_forget(dto->input_forget)
         , activations(dto->activations, std::end(dto->activations))
         , activation_params(dto->activation_params, std::end(dto->activation_params))
+        , output_selection(dto->output_selection)
         , offset_order(dto->offset_order)
     {
     }
@@ -122,6 +126,8 @@ struct lstm : public primitive_base
     std::vector activations;
     /// @brief Optional scaling values used by some activation functions. The values are consumed in the order of activation functions.
     std::vector activation_params;
+    /// @brief Output selection. By default, the entire hidden sequence is returned.
+    cldnn_lstm_output output_selection;
     /// @brief Weights, recurrent weights, and biases order. [iofz] : ONNX, [ifoz] : Caffe
     cldnn_lstm_offset_order offset_order;
@@ -129,7 +135,7 @@ struct lstm : public primitive_base
     // /// @brief Optional tensor specifying lengths of the sequences in a batch.
     // /// If not specified - assumed all sequences in the batch to have length `seq_length`. It has shape `[batch_size]`.
     // tensor sequence_lens;
-    // /// @brief The sequence output for the hidden??? This is not clearly specified in the ONNX definition.
+    // /// @brief The sequence output for the hidden.
     // uint32_t output_sequence;
 protected:
     std::vector> get_dependencies() const override
     {
@@ -160,6 +166,7 @@ protected:
         dto.peepholes = peepholes.c_str();
         dto.initial_hidden = initial_hidden.c_str();
         dto.initial_cell = initial_cell.c_str();
+        dto.output_selection = output_selection;
         dto.offset_order = offset_order;
         if (activations.size() == 3) {
             std::copy_n(activations.begin(), 3, dto.activations);
@@ -271,6 +278,7 @@ struct lstm_elt : public primitive_base
         const std::vector activations = {},
         const std::vector activation_params = {},
         const cldnn_lstm_offset_order offset_order = cldnn_lstm_offset_order_iofz,
+        const uint32_t direction = 0,
         const padding& output_padding = padding()
     )
         : primitive_base(id, {input}, output_padding)
@@ -280,6 +288,7 @@ struct lstm_elt : public primitive_base
         , activations(dto->activations, std::end(dto->activations))
         , activation_params(dto->activation_params, std::end(dto->activation_params))
         , offset_order(dto->offset_order)
+        , direction(dto->direction)
     {
     }

@@ -307,6 +317,9 @@ struct lstm_elt : public primitive_base
     activation_params;
     /// @brief Weights, recurrent weights, and biases order. [iofz] : ONNX, [ifoz] : Caffe
     cldnn_lstm_offset_order offset_order;
+    /// @brief Direction: default = 0, bidirectional = 1.
+    uint32_t direction;
+
 protected:
     std::vector> get_dependencies() const override
     {
@@ -328,6 +341,7 @@ protected:
         if (activation_params.size() == 3) {
             std::copy_n(activation_params.begin(), 3, dto.activation_params);
         }
+        dto.direction = direction;
     }
 };

diff --git a/inference-engine/thirdparty/clDNN/api/CPP/one_hot.hpp b/inference-engine/thirdparty/clDNN/api/CPP/one_hot.hpp
new file mode 100644
index 0000000..5f997b2
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/api/CPP/one_hot.hpp
@@ -0,0 +1,103 @@
+// Copyright (c) 2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma once
+
+#include "../C/one_hot.h"
+#include "primitive.hpp"
+
+
+namespace cldnn
+{
+    /// @addtogroup cpp_api C++ API
+    /// @{
+    /// @addtogroup cpp_topology Network Topology
+    /// @{
+    /// @addtogroup cpp_primitives Primitives
+    /// @{
+
+    /// @brief Creates a one-hot encoding of the input.
+    /// @details Creates a one-hot encoding of the input, putting the new one-hot axis in the position
+    /// @n specified by the @p one_hot_axis input, using the @p shape tensor as size reference.
+    /// @n The size of @p shape must be appropriate for adding a one-hot axis to input. For example,
+    /// @n input_sizes = (1, in_f, in_y, in_x)
+    /// @n expanded with
+    /// @n one_hot_axis = 2
+    /// @n would insert the one-hot axis in the Y dimension, requiring
+    /// @n shape = (in_f, in_y, one-hot_limit, in_x)
+    /// @n The output values would then be determined by input as
+    /// @n output[f, y, i, x] = (input[0, f, y, x] == i) ? 1 : 0;
+    /// @n Since determining whether the input is appropriate (that the one-hot axis
+    /// @n has enough space to fully encode all inputs) requires scanning the whole
+    /// @n input, the primitive doesn't check for that, instead producing all-zeros
+    /// @n output axes for inputs below 0 and greater than the limit set by
+    /// @n @p shape.
+    /// @n
+    /// @n@b Requirements
+    /// @n - @p one_hot_axis must be within (inclusive) range 0 - 3.
+    /// @n - @p shape must fit input sizes (see example above).
+    /// @n - input batch size must be equal to 1.
+    /// @n
+    /// @n Breaking any of these conditions will cause an exception to be thrown.
+    struct one_hot : public primitive_base
+    {
+        CLDNN_DECLARE_PRIMITIVE(one_hot)
+
+        /// @brief Constructs one-hot primitive / layer.
+        ///
+        /// @param id An identifier of new primitive.
+        /// @param input An identifier of primitive which is an input for newly created
+        /// one-hot primitive.
+        /// @param shape Size of the output primitive.
+        /// @param one_hot_axis One-hot axis position (0-based, from left to right) in shape.
+        /// @param output_padding Optional padding for output from primitive.
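// Illustrative sketch mirroring the example above: a (1, in_f=2, in_y=2, in_x=3)
// input with the one-hot axis inserted in Y and a limit of 10. Mapping the
// documented (in_f, in_y, one_hot_limit, in_x) listing onto the (b, f, x, y)
// tensor constructor gives tensor(in_f, in_y, in_x, one_hot_limit). Ids are
// hypothetical.
topo.add(cldnn::one_hot("onehot", "labels",
                        /*shape=*/cldnn::tensor(2, 2, 3, 10),
                        /*one_hot_axis=*/2));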
+ one_hot( + const primitive_id& id, + const primitive_id& input, + const tensor& shape, + const uint16_t& one_hot_axis, + const padding& output_padding = padding() + ) + : primitive_base(id, { input }, output_padding), + shape(shape), + one_hot_axis(one_hot_axis) + { + } + + /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{one_hot} + one_hot(const dto* dto) + : primitive_base(dto), + shape(dto->shape), + one_hot_axis(dto->one_hot_axis) + { + } + + /// @brief Output size reference. + tensor shape; + /// @brief One-hot axis position in output shape (0-based, from left to right). + uint16_t one_hot_axis; + + protected: + void update_dto(dto& dto) const override + { + dto.shape = shape; + dto.one_hot_axis = one_hot_axis; + + } + }; + /// @} + /// @} + /// @} +} diff --git a/inference-engine/thirdparty/clDNN/api/CPP/pooling.hpp b/inference-engine/thirdparty/clDNN/api/CPP/pooling.hpp index 1ca6d8f..3e60f79 100644 --- a/inference-engine/thirdparty/clDNN/api/CPP/pooling.hpp +++ b/inference-engine/thirdparty/clDNN/api/CPP/pooling.hpp @@ -68,6 +68,7 @@ struct pooling : public primitive_base : primitive_base(id, {input}, output_padding) , argmax("") , mode(static_cast(mode)) + , global_pooling(false) , input_offset(input_offset) , stride(stride) , size(size) @@ -95,6 +96,7 @@ struct pooling : public primitive_base : primitive_base(id, { input }, output_padding) , argmax(argmax) , mode(static_cast(mode)) + , global_pooling(false) , input_offset(input_offset) , stride(stride) , size(size) @@ -122,6 +124,7 @@ struct pooling : public primitive_base : primitive_base(id, {input}, output_padding) , argmax("") , mode(static_cast(mode)) + , global_pooling(false) , input_offset(input_offset) , stride(stride) , size(size) @@ -152,6 +155,7 @@ struct pooling : public primitive_base : primitive_base(id, { input }, output_padding) , argmax(argmax) , mode(static_cast(mode)) + , global_pooling(false) , input_offset(input_offset) , stride(stride) , size(size) @@ -159,11 +163,32 @@ struct pooling : public primitive_base , output_size(output_size) {} + /// @brief Constructs pooling primitive with kernel size equal to the spatial dimension of input tensor. + /// @param id This primitive id. + /// @param input Input primitive id. + /// @param mode Pooling mode. + pooling( + const primitive_id& id, + const primitive_id& input, + pooling_mode mode, + const padding& output_padding = padding() + ) + : primitive_base(id, { input }, output_padding) + , argmax("") + , mode(static_cast(mode)) + , global_pooling(true) + , input_offset(0, 0, 0, 0) + , stride(1, 1, 1, 1) + , size(0, 0, 0, 0) + , with_output_size(false) + {} + /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{pooling} pooling(const dto* dto) : primitive_base(dto) , argmax(dto->argmax) , mode(static_cast(dto->mode)) + , global_pooling(dto->global_pooling != 0) , input_offset(dto->input_offset) , stride(dto->stride) , size(dto->size) @@ -223,6 +248,8 @@ struct pooling : public primitive_base primitive_id argmax; /// @brief Pooling mode. pooling_mode mode; + /// @brief Global pooling (kernel size is equal to the spatial dimension of input tensor) + bool global_pooling; /// @brief Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the pooling window should start calculations. tensor input_offset; /// @brief Defines shift in input buffer between adjacent calculations of output values. 
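// Illustrative sketch: the new constructor above makes global average pooling a
// one-liner; the window is sized to the input's spatial dimensions at build
// time. Ids are hypothetical.
topo.add(cldnn::pooling("gap", "conv_out", cldnn::pooling_mode::average));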
@@ -251,9 +278,10 @@ protected: dto.size = size; dto.with_output_size = with_output_size; dto.output_size = output_size; + dto.global_pooling = global_pooling; } }; /// @} /// @} /// @} -} +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/api/CPP/primitive.hpp b/inference-engine/thirdparty/clDNN/api/CPP/primitive.hpp index 8314afc..41fa27d 100644 --- a/inference-engine/thirdparty/clDNN/api/CPP/primitive.hpp +++ b/inference-engine/thirdparty/clDNN/api/CPP/primitive.hpp @@ -92,15 +92,30 @@ public: const primitive_type_id& type, const primitive_id& id, const std::vector& input, - const padding& output_padding = padding() + const padding& output_padding = padding(), + const optional_data_type output_data_type = optional_data_type() ) - :type(type), id(id), input(_input.cpp_ids), output_padding(output_padding), _input(input) + : type(type) + , id(id) + , input(_input.cpp_ids) + , output_padding(output_padding) + , output_data_type(output_data_type) + , _input(input) {} /// @brief Constructs a copy from basic C API @CLDNN_PRIMITIVE_DESC{primitive} - primitive(const CLDNN_PRIMITIVE_DESC(primitive)* dto) - :type(dto->type), id(dto->id), input(_input.cpp_ids), output_padding(dto->output_padding), _input(dto->input) - {} + primitive(const CLDNN_PRIMITIVE_DESC(primitive) * dto) + : type(dto->type) + , id(dto->id) + , input(_input.cpp_ids) + , output_padding(dto->output_padding) + , output_data_type(dto->output_data_type.enabled + ? optional_data_type{static_cast( + dto->output_data_type.data_type)} + : optional_data_type{}) + , _input(dto->input) + { + } virtual ~primitive() = default; @@ -114,7 +129,7 @@ public: { std::vector> result; auto&& deps = get_dependencies(); - + result.reserve(_input.size() + deps.size()); for (auto& pid : _input.cpp_ids) result.push_back(std::ref(pid)); @@ -148,6 +163,9 @@ public: /// @brief Requested output padding. padding output_padding; + /// @brief Requested output precision, if any. 
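// Illustrative sketch of optional_data_type semantics (class defined in
// layout.hpp above): default-constructed means "not specified", assignment
// makes it truthy, and operator* reads the stored type back without checking.
cldnn::optional_data_type out_type;        // unset: bool(out_type) == false
out_type = cldnn::data_types::f16;         // now set
if (out_type)
{
    cldnn::data_types dt = *out_type;      // data_types::f16 (unchecked read)
    (void)dt;
}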
+    optional_data_type output_data_type;
+
 protected:
     struct primitive_id_arr
     {
@@ -198,6 +216,9 @@ public:
         _dto.type = type;
         _dto.input = _input.ref();
         _dto.output_padding = output_padding;
+        _dto.output_data_type.enabled = (bool)output_data_type;
+        _dto.output_data_type.data_type =
+            static_cast(*output_data_type);

         //call abstract method to update primitive-specific fields
         update_dto(_dto);
@@ -208,14 +229,15 @@ protected:
     explicit primitive_base(
         const primitive_id& id,
         const std::vector& input,
-        const padding& output_padding = padding())
-        : primitive(PType::type_id(), id, input, output_padding)
+        const padding& output_padding = padding(),
+        optional_data_type output_data_type = optional_data_type())
+        : primitive(PType::type_id(), id, input, output_padding, output_data_type)
     {}

     primitive_base(const DTO* dto)
         : primitive(reinterpret_cast(dto))
     {
-        if (dto->type != PType::type_id())
+        if (dto->type != PType::type_id())
             throw std::invalid_argument("DTO type mismatch");
     }

diff --git a/inference-engine/thirdparty/clDNN/api/CPP/prior_box.hpp b/inference-engine/thirdparty/clDNN/api/CPP/prior_box.hpp
index a21afda..c5ad40a 100644
--- a/inference-engine/thirdparty/clDNN/api/CPP/prior_box.hpp
+++ b/inference-engine/thirdparty/clDNN/api/CPP/prior_box.hpp
@@ -91,6 +91,9 @@ struct prior_box : public primitive_base
         this->aspect_ratios.push_back(new_aspect_ratio);
         if (flip)
         {
+            if (std::fabs(new_aspect_ratio) < std::numeric_limits<float>::epsilon()) {
+                throw std::runtime_error("prior_box aspect ratio can't be zero!");
+            }
             this->aspect_ratios.push_back(1.f / new_aspect_ratio);
         }
     }
diff --git a/inference-engine/thirdparty/clDNN/api/CPP/program.hpp b/inference-engine/thirdparty/clDNN/api/CPP/program.hpp
index 6657765..a8520ad 100644
--- a/inference-engine/thirdparty/clDNN/api/CPP/program.hpp
+++ b/inference-engine/thirdparty/clDNN/api/CPP/program.hpp
@@ -41,6 +41,9 @@ enum class build_option_type
     /// @brief Enable implicit reordering for user inputs (default: false).
     optimize_data = cldnn_build_option_optimize_data,

+    /// @brief Enable running the detection output layer always on the GPU, regardless of performance.
+    detection_output_gpu = cldnn_build_option_detection_output_gpu,
+
     /// @brief Enable debug mode (default: false).
     /// @details This option enforce all program primitives to be accessible as outputs.
     debug = cldnn_build_option_debug,
@@ -112,6 +115,9 @@ struct build_option
     /// @brief Enable implicit reordering for user inputs (default: false).
     static std::shared_ptr optimize_data(bool enable = false);

+    /// @brief Enable running the detection output layer always on the GPU, regardless of performance (default: false).
+    static std::shared_ptr detection_output_gpu(bool enable = false);
+
     /// @brief Enable debug mode (default: false).
     /// @details This option enforce all program primitives to be accessible as outputs.
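// Illustrative sketch: forcing the detection output layer onto the GPU through
// the new build option when compiling a network; ids and objects are
// hypothetical.
cldnn::engine eng;
cldnn::build_options opts;
opts.set_option(cldnn::build_option::optimize_data(true));
opts.set_option(cldnn::build_option::detection_output_gpu(true));
cldnn::network net(eng, topo, opts);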
static std::shared_ptr debug(bool enable = false); @@ -462,6 +468,16 @@ namespace detail return std::make_shared(option); } }; + template<> struct build_option_traits + { + typedef build_option_bool object_type; + static std::shared_ptr make_default() { return build_option::detection_output_gpu(); } + static std::shared_ptr make_option(const cldnn_build_option& option) + { + assert(option.type == cldnn_build_option_detection_output_gpu); + return std::make_shared(option); + } + }; template<> struct build_option_traits { typedef build_option_bool object_type; @@ -547,6 +563,11 @@ inline std::shared_ptr build_option::optimize_data(bool enab return std::make_shared>(enable); } +inline std::shared_ptr build_option::detection_output_gpu(bool enable) +{ + return std::make_shared>(enable); +} + inline std::shared_ptr build_option::debug(bool enable) { return std::make_shared>(enable); @@ -664,10 +685,12 @@ private: { case cldnn_build_option_fusing: return detail::build_option_traits::make_option(option); - case cldnn_build_option_learning_config: - return detail::build_option_traits::make_option(option); + case cldnn_build_option_learning_config: + return detail::build_option_traits::make_option(option); case cldnn_build_option_optimize_data: return detail::build_option_traits::make_option(option); + case cldnn_build_option_detection_output_gpu: + return detail::build_option_traits::make_option(option); case cldnn_build_option_debug: return detail::build_option_traits::make_option(option); case cldnn_build_option_outputs: diff --git a/inference-engine/thirdparty/clDNN/api/CPP/proposal.hpp b/inference-engine/thirdparty/clDNN/api/CPP/proposal.hpp index ab4bb33..8de42da 100644 --- a/inference-engine/thirdparty/clDNN/api/CPP/proposal.hpp +++ b/inference-engine/thirdparty/clDNN/api/CPP/proposal.hpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2017-2018 Intel Corporation +// Copyright (c) 2017-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -34,9 +34,9 @@ namespace cldnn struct proposal : public primitive_base { CLDNN_DECLARE_PRIMITIVE(proposal) - + proposal( - const primitive_id& id, + const primitive_id& id, const primitive_id& cls_scores, const primitive_id& bbox_pred, const primitive_id& image_info, @@ -65,8 +65,11 @@ struct proposal : public primitive_basebox_size_scale), swap_xy(dto->swap_xy != 0), initial_clip(dto->initial_clip != 0), + clip_before_nms(dto->clip_before_nms != 0), + clip_after_nms(dto->clip_after_nms != 0), round_ratios(dto->round_ratios != 0), - shift_anchors(dto->shift_anchors != 0) + shift_anchors(dto->shift_anchors != 0), + normalize(dto->normalize != 0) { } @@ -140,7 +152,7 @@ struct proposal : public primitive_base ratios; std::vector scales; float coordinates_offset; @@ -148,8 +160,11 @@ struct proposal : public primitive_base + { + CLDNN_DECLARE_PRIMITIVE(pyramid_roi_align) + + pyramid_roi_align( + const primitive_id& id, + const primitive_id& input, + const padding& output_padding = padding() + ) + : primitive_base(id, { input }, output_padding) + {} + + pyramid_roi_align( + const primitive_id &id_c, + const primitive_id &base_str, + const primitive_id &meta_str, + const primitive_id &P2_str, + const primitive_id &P3_str, + const primitive_id &P4_str, + const primitive_id &P5_str, + const primitive_id &pool_size_str, + const padding& output_padding = padding() + ) + : primitive_base(std::string(id_c), { + base_str, meta_str, P2_str, P3_str, + P4_str, P5_str, pool_size_str}, + output_padding) + {} + + /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{broadcast} + pyramid_roi_align(const dto* dto) + : primitive_base(dto) + + {} + + protected: + void update_dto(dto &) const override + {} + + }; +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/api/CPP/reorder.hpp b/inference-engine/thirdparty/clDNN/api/CPP/reorder.hpp index cf39f71..78001ef 100644 --- a/inference-engine/thirdparty/clDNN/api/CPP/reorder.hpp +++ b/inference-engine/thirdparty/clDNN/api/CPP/reorder.hpp @@ -49,9 +49,8 @@ struct reorder : public primitive_base const std::vector& values_to_subtract = {}, const cldnn_reorder_mean_mode mode = cldnn_reorder_mean_mode::mean_subtract ) - : primitive_base(id, { input }, output_layout.data_padding) + : primitive_base(id, { input }, output_layout.data_padding, { output_layout.data_type }) , output_format(output_layout.format) - , output_data_type(output_layout.data_type) , mean("") , subtract_per_feature(values_to_subtract) , mean_mode(mode) @@ -70,9 +69,8 @@ struct reorder : public primitive_base primitive_id const& mean, const cldnn_reorder_mean_mode mode = cldnn_reorder_mean_mode::mean_subtract ) - : primitive_base(id, { input }, output_layout.data_padding) + : primitive_base(id, { input }, output_layout.data_padding, { output_layout.data_type }) , output_format(output_layout.format) - , output_data_type(output_layout.data_type) , mean(mean) , subtract_per_feature(0) , mean_mode(mode) @@ -93,9 +91,8 @@ struct reorder : public primitive_base const cldnn_reorder_mean_mode mode = cldnn_reorder_mean_mode::mean_subtract, const padding& output_padding = padding() ) - : primitive_base(id, { input }, output_padding) + : primitive_base(id, { input }, output_padding, { output_data_type }) , output_format(output_format) - , output_data_type(output_data_type) , mean("") , subtract_per_feature(values_to_subtract) , mean_mode(mode) @@ -116,9 +113,8 @@ struct reorder : public primitive_base const cldnn_reorder_mean_mode mode = 
cldnn_reorder_mean_mode::mean_subtract, const padding& output_padding = padding() ) - : primitive_base(id, { input }, output_padding) + : primitive_base(id, { input }, output_padding, { output_data_type }) , output_format(output_format) - , output_data_type(output_data_type) , mean(mean) , subtract_per_feature(0) , mean_mode(mode) @@ -129,7 +125,6 @@ struct reorder : public primitive_base reorder(const dto* dto) : primitive_base(dto) , output_format(dto->output_format) - , output_data_type(static_cast(dto->output_data_type)) , mean(dto->mean_subtract) , subtract_per_feature(float_arr_to_vector(dto->subtract_per_feature)) , mean_mode(dto->mean_mode) @@ -138,8 +133,6 @@ struct reorder : public primitive_base /// @brief Requested memory format. format output_format; - /// @brief Requested memory data type. - data_types output_data_type; /// @brief Primitive id to get mean subtract values. Ignored if subtract_per_featrue is set. primitive_id mean; /// @brief Array of mean subtract values. @@ -158,7 +151,6 @@ protected: void update_dto(dto& dto) const override { dto.output_format = static_cast(output_format.value); - dto.output_data_type = static_cast(output_data_type); dto.mean_subtract = mean.c_str(); dto.subtract_per_feature = float_vector_to_arr(subtract_per_feature); dto.mean_mode = mean_mode; diff --git a/inference-engine/thirdparty/clDNN/api/CPP/reshape.hpp b/inference-engine/thirdparty/clDNN/api/CPP/reshape.hpp index 233ee91..7c834d5 100644 --- a/inference-engine/thirdparty/clDNN/api/CPP/reshape.hpp +++ b/inference-engine/thirdparty/clDNN/api/CPP/reshape.hpp @@ -41,6 +41,8 @@ struct reshape : public primitive_base /// @param id This primitive id. /// @param input Input primitive id. /// @param output_shape Requested memory shape (excluding padding). + /// A dimension could be 0, in this case, the value is taken from the input tensor. + /// At most one dimension of the new shape can be -1. In this case, the value is inferred from the size of the tensor and the remaining dimensions. /// @param output_padding Requested memory padding. reshape( const primitive_id& id, diff --git a/inference-engine/thirdparty/clDNN/api/CPP/reverse_sequence.hpp b/inference-engine/thirdparty/clDNN/api/CPP/reverse_sequence.hpp new file mode 100644 index 0000000..9269e42 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/api/CPP/reverse_sequence.hpp @@ -0,0 +1,100 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once + +#include "../C/reverse_sequence.h" +#include "primitive.hpp" + +namespace cldnn +{ +/// @addtogroup cpp_api C++ API +/// @{ +/// @addtogroup cpp_topology Network Topology +/// @{ +/// @addtogroup cpp_primitives Primitives +/// @{ + +/// @brief +/// @details +struct reverse_sequence : public primitive_base +{ + CLDNN_DECLARE_PRIMITIVE(reverse_sequence) + + /// @brief Constructs reverse_sequence primitive. 
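// Illustrative sketch: with output_data_type now forwarded to primitive_base, a
// precision-converting reorder is still declared exactly as before. Ids are
// hypothetical.
topo.add(cldnn::reorder("to_fp16", "input",
                        cldnn::format::bfyx, cldnn::data_types::f16));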
+    /// @param id This primitive id.
+    /// @param input Input primitive id.
+    /// @param seq_lengths Sequence lengths primitive id.
+    /// @param seq_axis The axis which is partially reversed.
+    /// @param batch_axis The axis along which reversal is performed.
+    reverse_sequence(
+        const primitive_id& id,
+        const primitive_id& input,
+        const primitive_id& seq_lengths,
+        const int32_t seq_axis,
+        const int32_t batch_axis = 0,
+        const padding& output_padding = padding()
+    )
+        : primitive_base(id, {input, seq_lengths}, output_padding)
+        , seq_axis(seq_axis)
+        , batch_axis(batch_axis)
+    {
+        const int32_t number_of_dims = 4;
+
+        int32_t batch_a = batch_axis;
+        int32_t seq_a = seq_axis;
+
+        if (batch_a < 0)
+            batch_a += number_of_dims;
+
+        if (seq_a < 0)
+            seq_a += number_of_dims;
+
+        if (batch_a == seq_a)
+            throw std::runtime_error("Batch axis and sequence axis should not be equal\n");
+
+        if (batch_a < 0 || batch_a >= number_of_dims)
+            throw std::runtime_error("Incorrect batch axis value! Actual axis is " + std::to_string(batch_a));
+
+        if (seq_a < 0 || seq_a >= number_of_dims)
+            throw std::runtime_error("Incorrect sequence axis value! Actual axis is " + std::to_string(seq_a));
+    }
+
+    /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{reverse_sequence}
+    reverse_sequence(const dto* dto)
+        : primitive_base(dto)
+        , seq_axis(dto->seq_axis)
+        , batch_axis(dto->batch_axis)
+    {
+    }
+
+    /// @brief The axis which is partially reversed.
+    int32_t seq_axis;
+    /// @brief The axis along which reversal is performed.
+    int32_t batch_axis;
+protected:
+
+    void update_dto(dto& dto) const override
+    {
+        dto.seq_axis = seq_axis;
+        dto.batch_axis = batch_axis;
+    }
+};
+/// @}
+/// @}
+/// @}
+}
diff --git a/inference-engine/thirdparty/clDNN/api/CPP/roi_pooling.hpp b/inference-engine/thirdparty/clDNN/api/CPP/roi_pooling.hpp
index 3007f8c..1b5afa6 100644
--- a/inference-engine/thirdparty/clDNN/api/CPP/roi_pooling.hpp
+++ b/inference-engine/thirdparty/clDNN/api/CPP/roi_pooling.hpp
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2017 Intel Corporation
+// Copyright (c) 2017-2019 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
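// Illustrative sketch: reversing each sequence along the feature axis while the
// batch stays on axis 0; a negative axis would be normalized exactly as in the
// constructor above. Ids are hypothetical.
topo.add(cldnn::reverse_sequence("rev_seq", "input", "seq_lens",
                                 /*seq_axis=*/1, /*batch_axis=*/0));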
@@ -39,43 +39,58 @@ struct roi_pooling : public primitive_base(dto->mode)) + , position_sensitive(dto->position_sensitive) , pooled_width(dto->pooled_width) , pooled_height(dto->pooled_height) , spatial_scale(dto->spatial_scale) - , group_sz(dto->group_sz) + , output_dim(dto->output_dim) + , spatial_bins_x(dto->spatial_bins_x) + , spatial_bins_y(dto->spatial_bins_y) {} pooling_mode mode; + bool position_sensitive; int pooled_width; int pooled_height; float spatial_scale; - int group_sz; + int output_dim; + int spatial_bins_x; + int spatial_bins_y; protected: void update_dto(dto& dto) const override { dto.mode = static_cast(mode); + dto.position_sensitive = position_sensitive; dto.pooled_width = pooled_width; dto.pooled_height = pooled_height; dto.spatial_scale = spatial_scale; - dto.group_sz = group_sz; + dto.output_dim = output_dim; + dto.spatial_bins_x = spatial_bins_x; + dto.spatial_bins_y = spatial_bins_y; } }; diff --git a/inference-engine/thirdparty/clDNN/api/CPP/shuffle_channels.hpp b/inference-engine/thirdparty/clDNN/api/CPP/shuffle_channels.hpp new file mode 100644 index 0000000..03c974e --- /dev/null +++ b/inference-engine/thirdparty/clDNN/api/CPP/shuffle_channels.hpp @@ -0,0 +1,79 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once + +#include "../C/shuffle_channels.h" +#include "primitive.hpp" + +namespace cldnn +{ +/// @addtogroup cpp_api C++ API +/// @{ +/// @addtogroup cpp_topology Network Topology +/// @{ +/// @addtogroup cpp_primitives Primitives +/// @{ + +/// @brief +/// @details +struct shuffle_channels : public primitive_base +{ + CLDNN_DECLARE_PRIMITIVE(shuffle_channels) + + /// @brief Constructs shuffle_channels primitive. + /// @param id This primitive id. + /// @param input Input dictionary primitive id. + /// @param group The number of groups to split the channel dimension. + /// @param axis The index of the channel dimension. + shuffle_channels( + const primitive_id& id, + const primitive_id& input, + const int32_t group, + const int32_t axis = 1, + const padding& output_padding = padding() + ) + : primitive_base(id, {input}, output_padding) + , group(group) + , axis(axis) + { + } + + /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{shuffle_channels} + shuffle_channels(const dto* dto) + : primitive_base(dto) + , group(dto->group) + , axis(dto->axis) + { + } + + /// @brief The number of groups to split the channel dimension. This number must evenly divide the channel dimension size. + int32_t group; + /// @brief The index of the channel dimension (default is 1). 
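// Illustrative sketch: ShuffleNet-style channel shuffle with 4 groups over the
// channel axis; group must evenly divide the channel dimension. Ids are
// hypothetical.
topo.add(cldnn::shuffle_channels("shuffle", "conv_out", /*group=*/4, /*axis=*/1));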
+ int32_t axis; +protected: + + void update_dto(dto& dto) const override + { + dto.group = group; + dto.axis = axis; + } +}; +/// @} +/// @} +/// @} +} diff --git a/inference-engine/thirdparty/clDNN/api/CPP/split.hpp b/inference-engine/thirdparty/clDNN/api/CPP/split.hpp index 0ed7f22..08e3789 100644 --- a/inference-engine/thirdparty/clDNN/api/CPP/split.hpp +++ b/inference-engine/thirdparty/clDNN/api/CPP/split.hpp @@ -112,35 +112,6 @@ protected: return res; } - - static std::vector tensor_arr_to_vector(const cldnn_tensor_arr& arr) - { - std::vector result(arr.size); - for (size_t i = 0; i < arr.size; i++) - result[i] = arr.data[i]; - - return result; - } - - static std::vector tensor_arr_to_cldnn_vector(const cldnn_tensor_arr& arr) - { - std::vector result(arr.size); - for (size_t i = 0; i < arr.size; i++) - result[i] = arr.data[i]; - - return result; - } - - static std::vector tensor_vector_to_cldnn_vector(const std::vector& stor) - { - std::vector res; - res.resize(stor.size()); - for (size_t i = 0; i < stor.size(); ++i) - res[i] = stor[i]; - - return res; - } - }; /// @} /// @} diff --git a/inference-engine/thirdparty/clDNN/api/CPP/strided_slice.hpp b/inference-engine/thirdparty/clDNN/api/CPP/strided_slice.hpp new file mode 100644 index 0000000..98bcc74 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/api/CPP/strided_slice.hpp @@ -0,0 +1,99 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include "../C/strided_slice.h" +#include "primitive.hpp" + +namespace cldnn +{ +/// @addtogroup cpp_api C++ API +/// @{ +/// @addtogroup cpp_topology Network Topology +/// @{ +/// @addtogroup cpp_primitives Primitives +/// @{ + +/// @brief +/// @details +struct strided_slice : public primitive_base +{ + CLDNN_DECLARE_PRIMITIVE(strided_slice) + + /// @brief Constructs strided_slice primitive. + /// @param id This primitive id. + /// @param input Input data primitive id. + /// @param begin_id Begin position primitive id. + /// @param end_id End position primitive id. + /// @param strides_id Step of slicing primitive id. + /// @param begin_mask Array of bits, that provide replace begin[i] to max possible range in that dimension. + /// @param end_mask Array of bits, that provide replace end[i] to max possible range in that dimension. + /// @param new_axis_mask Array of bits, that provide adding a new length 1 dimension at ith position in the output tensor. + /// @param shrink_axis_mask Array of bits, that provide shrinks the dimensionality by 1, taking on the value at index begin[i]. 
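// Illustrative sketch: begin/end/strides arrive as extra input primitives
// (e.g. data memories) and the four masks are byte vectors; empty masks leave
// the slice bounds untouched. Ids are hypothetical.
topo.add(cldnn::strided_slice("slice", "input", "begin", "end", "strides",
                              /*begin_mask=*/{}, /*end_mask=*/{},
                              /*new_axis_mask=*/{}, /*shrink_axis_mask=*/{}));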
+ strided_slice( + const primitive_id& id, + const primitive_id& input, + const primitive_id& begin_id, + const primitive_id& end_id, + const primitive_id& strides_id, + std::vector begin_mask, + std::vector end_mask, + std::vector new_axis_mask, + std::vector shrink_axis_mask, + const padding& output_padding = padding() + ) + : primitive_base(id, {input, begin_id, end_id, strides_id}, output_padding) + , begin_mask(begin_mask) + , end_mask(end_mask) + , new_axis_mask(new_axis_mask) + , shrink_axis_mask(shrink_axis_mask) + { + } + + /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{strided_slice} + strided_slice(const dto* dto) + : primitive_base(dto) + , begin_mask(uint8_t_arr_to_vector(dto->begin_mask)) + , end_mask(uint8_t_arr_to_vector(dto->end_mask)) + , new_axis_mask(uint8_t_arr_to_vector(dto->new_axis_mask)) + , shrink_axis_mask(uint8_t_arr_to_vector(dto->shrink_axis_mask)) + { + } + + /// @param begin_mask Array of bits, that provide replace begin[i] to max possible range in that dimension. + std::vector begin_mask; + /// @param end_mask Array of bits, that provide replace end[i] to max possible range in that dimension. + std::vector end_mask; + /// @param new_axis_mask Array of bits, that provide adding a new length 1 dimension at ith position in the output tensor. + std::vector new_axis_mask; + /// @param shrink_axis_mask Array of bits, that provide shrinks the dimensionality by 1, taking on the value at index begin[i]. + std::vector shrink_axis_mask; + +protected: + + void update_dto(dto& dto) const override + { + dto.begin_mask = uint8_t_vector_to_arr(begin_mask); + dto.end_mask = uint8_t_vector_to_arr(end_mask); + dto.new_axis_mask = uint8_t_vector_to_arr(new_axis_mask); + dto.shrink_axis_mask = uint8_t_vector_to_arr(shrink_axis_mask); + } +}; +/// @} +/// @} +/// @} +} diff --git a/inference-engine/thirdparty/clDNN/api/CPP/tensor.hpp b/inference-engine/thirdparty/clDNN/api/CPP/tensor.hpp index 2a5439e..9528f01 100644 --- a/inference-engine/thirdparty/clDNN/api/CPP/tensor.hpp +++ b/inference-engine/thirdparty/clDNN/api/CPP/tensor.hpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2018 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -45,6 +45,8 @@ struct format_traits size_t feature_num; /// @brief Number of spatial (x,y) dimensions in a format. size_t spatial_num; + /// @brief Number of local (x,y) dimensions in a format. + size_t local_num; /// @brief Dimensions changing order from rare to often. std::string order; /// @brief Dimensions order for internal storage. @@ -55,12 +57,16 @@ struct format_traits static const char* feature_chars() { return "fioc"; } /// @brief Characters representing spatial dimensions in an order. static const char* spatial_chars() { return "xyzhsw"; } + /// @brief Characters representing local dimensions in an order. + static const char* local_chars() { return "kl"; } /// @brief Checks if @p c represents batch dimension. static bool is_batch_char(char c) { return std::string(batch_chars()).find_first_of(c) != std::string::npos; } /// @brief Checks if @p c represents feature map/channel dimension. static bool is_feature_char(char c) { return std::string(feature_chars()).find_first_of(c) != std::string::npos; } /// @brief Checks if @p c represents spatial dimension. 
static bool is_spatial_char(char c) { return std::string(spatial_chars()).find_first_of(c) != std::string::npos; } + /// @brief Checks if @p c represents local dimensions. + static bool is_local_char(char c) { return std::string(local_chars()).find_first_of(c) != std::string::npos; } }; /// @brief Represents memory formats (orders). @@ -82,6 +88,8 @@ struct format fyxb = cldnn_format_fyxb, ///< format not used inside clDNN, but supported in reorder as extension for user provided formats. os_iyx_osv16 = cldnn_format_os_iyx_osv16, ///< format used only for convolution weights: os - output feature maps slice, i - input feature maps, yx - spatials, sv16 - 16 values of single slice. ///< \n \image html os_iyx_osv16.jpg + os_iyx_osv32 = cldnn_format_os_iyx_osv32, ///< format used only for convolution weights: os - output feature maps slice, i - input feature maps, yx - spatials, sv32 - 32 values of single slice. + os_iyx_osv64 = cldnn_format_os_iyx_osv64, ///< format used only for convolution weights: os - output feature maps slice, i - input feature maps, yx - spatials, sv64 - 64 values of single slice. bs_xs_xsv8_bsv8 = cldnn_format_bs_xs_xsv8_bsv8, ///< format used only for fully connected weights: bs - batch slice, xs - x slice, bsv8 - 8 values of single slice. ///< \n \image html bs_xs_xsv8_bsv8.jpg bs_xs_xsv8_bsv16 = cldnn_format_bs_xs_xsv8_bsv16,///< format used only for fully connected weights: bs - batch slice, xs - x slice, bsv16 - 16 values of single slice. @@ -101,9 +109,16 @@ struct format image_2d_weights_winograd_6x3_s1_fbxyb, ///< image format used for weights for winograd fused convolution, F(6,3) -- filter 3x3 with stride 1 image_2d_weights_winograd_6x3_s1_xfbyb, ///< image format used for weights for winograd fused convolution, F(6,3) -- filter 3x3 with stride 1 os_is_yx_isa8_osv8_isv4, /// format for weights for MMAD convolution + os_is_yx_isa8_osv8_isv4_swizzled_by_4, /// format for weights for MMAD convolution is_o_yx_isv32, /// format for weights for 1x1 MMAD convolutions + is_o32_yx_isv32_swizzled_by_4, /// format for weights for 1x1 MMAD convolutions + os_is_y_x8_osv8_isv4, /// format for weights for 1x1 MMAD convolutions byxf_af32, /// < \n format for input for primitives using MMAD + byx8_f4, /// < \n format for input for MMAD convolutions fs_bs_yx_bsv4_fsv32, /// < \n format for batched input for primitives using MMAD + bf_lyx_yx = cldnn_bf_lyx_yx, /// < \n format for local convolution weights + b_fs_yx_fsv4, /// < \n format for input for IMAD convolutions + os_is_yx_osv16_isv4, /// < \n format for weights for IMAD convolutions format_num = cldnn_format_format_num, ///< number of format types any = cldnn_format_any }; @@ -113,27 +128,36 @@ struct format { static const std::map traits { - { yxfb,{ 1, 1, 2, "yxfb", "bfxy" } }, - { byxf,{ 1, 1, 2, "byxf", "bfxy" } }, - { bfyx,{ 1, 1, 2, "bfyx", "bfxy" } }, - { fyxb,{ 1, 1, 2, "fyxb", "bfxy" } }, - { os_iyx_osv16, { 1, 1, 2, "bfyx", "bfxy" } }, - { bs_xs_xsv8_bsv8, { 1, 1, 1, "bx", "b?x?" } }, - { bs_xs_xsv8_bsv16,{ 1, 1, 1, "bx", "b?x?" } }, - { bs_x_bsv16, { 1, 1, 1, "bx", "b?x?" 
} }, - { bf8_xy16, { 1, 1, 2, "bfyx", "bfxy" }}, - { image_2d_weights_c4_fyx_b, { 1, 1, 2, "bfyx", "bfxy" } }, - { image_2d_weights_c1_b_fyx, { 1, 1, 2, "bfyx", "bfxy" } }, - { winograd_2x3_s1_data, { 1, 1, 2, "bxyf", "bfxy" } }, - { winograd_2x3_s1_weights, { 1, 1, 2, "bfyx", "bfxy" } }, - { winograd_2x3_s1_fused_weights, { 1, 1, 2, "xyfb", "bfxy" } }, - { winograd_6x3_s1_fused_weights,{ 1, 1, 2, "xyfb", "bfxy" } }, - { image_2d_weights_winograd_6x3_s1_fbxyb,{ 1, 1, 2, "xyfb", "bfxy" } }, - { image_2d_weights_winograd_6x3_s1_xfbyb,{ 1, 1, 2, "xyfb", "bfxy" } }, - { os_is_yx_isa8_osv8_isv4, { 1, 1, 2, "bfyx", "bfxy" } }, - { is_o_yx_isv32 , {1, 1, 2, "byxf", "bfxy" } }, - { byxf_af32, { 1, 1, 2, "byxf", "bfxy" } }, - { fs_bs_yx_bsv4_fsv32 , { 1, 1, 2, "fbyx", "bfxy" }} + { yxfb,{ 1, 1, 2, 0, "yxfb", "bfxy" } }, + { byxf,{ 1, 1, 2, 0, "byxf", "bfxy" } }, + { bfyx,{ 1, 1, 2, 0, "bfyx", "bfxy" } }, + { fyxb,{ 1, 1, 2, 0, "fyxb", "bfxy" } }, + { os_iyx_osv16, { 1, 1, 2, 0, "bfyx", "bfxy" } }, + { os_iyx_osv32,{ 1, 1, 2, 0, "bfyx", "bfxy" } }, + { os_iyx_osv64,{ 1, 1, 2, 0, "bfyx", "bfxy" } }, + { bs_xs_xsv8_bsv8, { 1, 1, 1, 0, "bx", "b?x?" } }, + { bs_xs_xsv8_bsv16,{ 1, 1, 1, 0, "bx", "b?x?" } }, + { bs_x_bsv16, { 1, 1, 1, 0, "bx", "b?x?" } }, + { bf8_xy16, { 1, 1, 2, 0, "bfyx", "bfxy" }}, + { image_2d_weights_c4_fyx_b, { 1, 1, 2, 0, "bfyx", "bfxy" } }, + { image_2d_weights_c1_b_fyx, { 1, 1, 2, 0, "bfyx", "bfxy" } }, + { winograd_2x3_s1_data, { 1, 1, 2, 0, "bxyf", "bfxy" } }, + { winograd_2x3_s1_weights, { 1, 1, 2, 0, "bfyx", "bfxy" } }, + { winograd_2x3_s1_fused_weights, { 1, 1, 2, 0, "xyfb", "bfxy" } }, + { winograd_6x3_s1_fused_weights,{ 1, 1, 2, 0, "xyfb", "bfxy" } }, + { image_2d_weights_winograd_6x3_s1_fbxyb,{ 1, 1, 2, 0, "xyfb", "bfxy" } }, + { image_2d_weights_winograd_6x3_s1_xfbyb,{ 1, 1, 2, 0, "xyfb", "bfxy" } }, + { os_is_yx_isa8_osv8_isv4, { 1, 1, 2, 0, "bfyx", "bfxy" } }, + { os_is_yx_isa8_osv8_isv4_swizzled_by_4,{ 1, 1, 2, 0, "bfyx", "bfxy" } }, + { byxf_af32, { 1, 1, 2, 0, "byxf", "bfxy" } }, + { byx8_f4 , { 1, 1, 2, 0, "byxf", "bfyx"} }, + { fs_bs_yx_bsv4_fsv32 , { 1, 1, 2, 0, "fbyx", "bfxy" }}, + { is_o_yx_isv32 , {1, 1, 2, 0, "byxf", "bfxy" } }, + { is_o32_yx_isv32_swizzled_by_4 , {1,1,2,0,"byxf", "bfxy" } }, + { os_is_y_x8_osv8_isv4 , { 1, 1, 2, 0, "byxf", "bfxy" } }, + { bf_lyx_yx,{ 1, 1, 2, 2, "bfklyx", "bfklxy" } }, + { b_fs_yx_fsv4,{ 1, 1, 1, 0, "bfyx", "bfxy" } }, + { os_is_yx_osv16_isv4,{ 1, 1, 1, 0, "bfxy", "bfxy?" } }, }; return traits.at(fmt); } @@ -144,6 +168,8 @@ struct format static size_t feature_num(type fmt) { return traits(fmt).feature_num; } /// @brief Returns number of spatial dimensions for a @p format. static size_t spatial_num(type fmt) { return traits(fmt).spatial_num; } + /// @brief Returns number of local dimensions for a @p format. + static size_t local_num(type fmt) { return traits(fmt).local_num; } /// @brief Returns an order of dimensions for a @ format. static const std::string& order(type fmt) { return traits(fmt).order; } /// @brief Returns an internal orders of dimensions for a @p format. @@ -163,6 +189,8 @@ struct format size_t feature_num() const { return traits(value).feature_num; } /// @brief Returns number of spatial dimensions. size_t spatial_num() const { return traits(value).spatial_num; } + /// @brief Returns number of local dimensions. + size_t local_num() const { return traits(value).local_num; } /// @brief Returns an order of dimensions in form of string. 
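// Illustrative sketch: querying the new local-dimension metadata. Only the
// local convolution weights format bf_lyx_yx declares local dimensions (k, l)
// in the traits table above; every other format reports zero.
cldnn::format fmt = cldnn::format::bf_lyx_yx;
size_t locals = fmt.local_num();   // 2 for bf_lyx_yx, 0 for e.g. format::bfyx
(void)locals;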
     const std::string& order() const { return traits(value).order; }
     /// @brief Returns an internal orders of dimensions form of string.
@@ -197,7 +225,8 @@ enum class dim_vec_kind
 {
     batch,
     feature,
-    spatial
+    spatial,
+    local
 };

 /// @brief template class with max_dimensionalities and dimension offset for dimension kinds
@@ -228,6 +257,13 @@ struct dim_vec_limits
     static constexpr int32_t dim_offset = CLDNN_TENSOR_BATCH_DIM_MAX + CLDNN_TENSOR_FEATURE_DIM_MAX;
 };

+template <>
+struct dim_vec_limits
+{
+    static constexpr int32_t max_dimentionality = CLDNN_TENSOR_LOCAL_DIM_MAX;
+    static constexpr int32_t dim_offset = CLDNN_TENSOR_BATCH_DIM_MAX + CLDNN_TENSOR_FEATURE_DIM_MAX + CLDNN_TENSOR_SPATIAL_DIM_MAX;
+};
+
 /// @brief Template class used in tensor constructor using dim_vec_kinds
 template
 class dim_vec_kind_init
@@ -267,12 +303,19 @@ details::dim_vec_kind_init spatial(InitTys&& ...
 {
     return details::dim_vec_kind_init(std::forward(inits) ...);
 }

+template
+details::dim_vec_kind_init local(InitTys&& ... inits)
+{
+    return details::dim_vec_kind_init(std::forward(inits) ...);
+}
+
 /// @brief N-dimensional vector. Mostly used to represent memory size.
 struct tensor
 {
     friend class details::dim_vec_kind_init;
     friend class details::dim_vec_kind_init;
     friend class details::dim_vec_kind_init;
+    friend class details::dim_vec_kind_init;

     typedef int32_t value_type;     ///< Values type stored in tensor.
     //TODO find the way to prevent direct change of following fields.
@@ -280,6 +323,7 @@ struct tensor
     mutable_array_ref batch;    ///< Batch dimensions.
     mutable_array_ref feature;  ///< Feature maps.
     mutable_array_ref spatial;  ///< Spatial dimensions.
+    mutable_array_ref local;    ///< Local dimensions.

 private:
     value_type _sizes[CLDNN_TENSOR_DIM_MAX];
@@ -292,6 +336,8 @@ public:
         , batch(_sizes, CLDNN_TENSOR_BATCH_DIM_MAX)
         , feature(_sizes + CLDNN_TENSOR_BATCH_DIM_MAX, CLDNN_TENSOR_FEATURE_DIM_MAX)
         , spatial(_sizes + CLDNN_TENSOR_BATCH_DIM_MAX + CLDNN_TENSOR_FEATURE_DIM_MAX, CLDNN_TENSOR_SPATIAL_DIM_MAX)
+        , local(_sizes + CLDNN_TENSOR_BATCH_DIM_MAX + CLDNN_TENSOR_FEATURE_DIM_MAX +
+            CLDNN_TENSOR_SPATIAL_DIM_MAX, CLDNN_TENSOR_LOCAL_DIM_MAX)
     {
         std::fill_n(_sizes, CLDNN_TENSOR_DIM_MAX, default_size);
     }
@@ -345,6 +391,32 @@ public:
         _sizes[CLDNN_TENSOR_BATCH_DIM_MAX + CLDNN_TENSOR_FEATURE_DIM_MAX + 1] = height;
     }

+    /// @brief Constructs @p tensor.
+    /// @details Example:
+    /*! @code
+    *
+    tensor my_tensor( 2, 3, 4, 5, 6, 7 );   // b=2, f=3, x=4, y=5, lx=6, ly=7
+    cout << my_tensor.batch[0] << endl;     // 2
+    cout << my_tensor.feature[0] << endl;   // 3
+    cout << "x=" << my_tensor.spatial[0] << endl;       // x=4
+    cout << "y=" << my_tensor.spatial[1] << endl;       // y=5
+    cout << "local x=" << my_tensor.local[0] << endl;   // local x=6
+    cout << "local y=" << my_tensor.local[1] << endl;   // local y=7
+    *
+    * @endcode
+    */
+    tensor(value_type batch_num, value_type feature_num, value_type width,
+        value_type height, value_type local_x, value_type local_y)
+        : tensor(1)
+    {
+        _sizes[0] = batch_num;
+        _sizes[CLDNN_TENSOR_BATCH_DIM_MAX] = feature_num;
+        _sizes[CLDNN_TENSOR_BATCH_DIM_MAX + CLDNN_TENSOR_FEATURE_DIM_MAX] = width;
+        _sizes[CLDNN_TENSOR_BATCH_DIM_MAX + CLDNN_TENSOR_FEATURE_DIM_MAX + 1] = height;
+        _sizes[CLDNN_TENSOR_BATCH_DIM_MAX + CLDNN_TENSOR_FEATURE_DIM_MAX + CLDNN_TENSOR_SPATIAL_DIM_MAX] = local_x;
+        _sizes[CLDNN_TENSOR_BATCH_DIM_MAX + CLDNN_TENSOR_FEATURE_DIM_MAX + CLDNN_TENSOR_SPATIAL_DIM_MAX + 1] = local_y;
+    }
+
+    /// @brief Constructs @p tensor using vector of sizes.
/// @param[in] sizes dimensions need to be provided in the following order {batch, feature, spatial_x, spatial_y}. /// @param[in] default_size default_size for tensor dimensions. @@ -366,6 +438,13 @@ public: _sizes[CLDNN_TENSOR_BATCH_DIM_MAX] = sizes[CLDNN_TENSOR_BATCH_DIM_MAX]; _sizes[CLDNN_TENSOR_BATCH_DIM_MAX + CLDNN_TENSOR_FEATURE_DIM_MAX] = sizes[CLDNN_TENSOR_BATCH_DIM_MAX + CLDNN_TENSOR_FEATURE_DIM_MAX]; _sizes[CLDNN_TENSOR_BATCH_DIM_MAX + CLDNN_TENSOR_FEATURE_DIM_MAX + 1] = sizes[CLDNN_TENSOR_BATCH_DIM_MAX + CLDNN_TENSOR_FEATURE_DIM_MAX + 1]; + if (sizes.size() == 6) + { + _sizes[CLDNN_TENSOR_BATCH_DIM_MAX + CLDNN_TENSOR_FEATURE_DIM_MAX + CLDNN_TENSOR_SPATIAL_DIM_MAX] = + sizes[CLDNN_TENSOR_BATCH_DIM_MAX + CLDNN_TENSOR_FEATURE_DIM_MAX + CLDNN_TENSOR_SPATIAL_DIM_MAX]; + _sizes[CLDNN_TENSOR_BATCH_DIM_MAX + CLDNN_TENSOR_FEATURE_DIM_MAX + CLDNN_TENSOR_SPATIAL_DIM_MAX + 1] = + sizes[CLDNN_TENSOR_BATCH_DIM_MAX + CLDNN_TENSOR_FEATURE_DIM_MAX + CLDNN_TENSOR_SPATIAL_DIM_MAX + 1]; + } } tensor(format fmt, const std::vector& sizes, value_type default_size = 1) @@ -404,6 +483,7 @@ public: result.batch_num = batch.size(); result.feature_num = feature.size(); result.spatial_num = spatial.size(); + result.local_num = local.size(); std::copy_n(_sizes, CLDNN_TENSOR_DIM_MAX, result.sizes); return result; } @@ -664,6 +744,16 @@ public: my_sizes[0] = align_to(my_sizes[0], 16); adjusted_coords[0] = align_to(adjusted_coords[0], 16); } + else if (fmt == cldnn::format::os_iyx_osv32 && !is_aligned_to(my_sizes[0], 32)) + { + my_sizes[0] = align_to(my_sizes[0], 32); + adjusted_coords[0] = align_to(adjusted_coords[0], 32); + } + else if (fmt == cldnn::format::os_iyx_osv64 && !is_aligned_to(my_sizes[0], 64)) + { + my_sizes[0] = align_to(my_sizes[0], 64); + adjusted_coords[0] = align_to(adjusted_coords[0], 64); + } else if (fmt == cldnn::format::bs_xs_xsv8_bsv8 && !(is_aligned_to(my_sizes[0], 8) && is_aligned_to(my_sizes[1], 8))) { my_sizes[0] = align_to(my_sizes[0], 8); @@ -699,16 +789,43 @@ public: adjusted_coords[0] = align_to(adjusted_coords[0], 8); adjusted_coords[1] = align_to(adjusted_coords[1], 32); } + else if (fmt == cldnn::format::os_is_yx_isa8_osv8_isv4_swizzled_by_4 && !(is_aligned_to(my_sizes[0], 32)) && !(is_aligned_to(my_sizes[1], 32))) + { + my_sizes[0] = align_to(my_sizes[0], 32); + my_sizes[1] = align_to(my_sizes[1], 32); + adjusted_coords[0] = align_to(adjusted_coords[0], 32); + adjusted_coords[1] = align_to(adjusted_coords[1], 32); + } else if (fmt == cldnn::format::is_o_yx_isv32 && !(is_aligned_to(my_sizes[1], 32))) { my_sizes[1] = align_to(my_sizes[1], 32); adjusted_coords[1] = align_to(adjusted_coords[1], 32); } + else if (fmt == cldnn::format::is_o32_yx_isv32_swizzled_by_4 && (!is_aligned_to(my_sizes[1], 32) || !is_aligned_to(my_sizes[0], 32))) + { + my_sizes[0] = align_to(my_sizes[0], 32); + my_sizes[1] = align_to(my_sizes[1], 32); + adjusted_coords[0] = align_to(adjusted_coords[0], 32); + adjusted_coords[1] = align_to(adjusted_coords[1], 32); + } + else if (fmt == cldnn::format::os_is_y_x8_osv8_isv4) + { + my_sizes[1] = align_to(my_sizes[1], 4); + my_sizes[0] = align_to(my_sizes[0], 8); + my_sizes[2] = align_to(my_sizes[2], 8); + } else if (fmt == cldnn::format::byxf_af32 && !(is_aligned_to(my_sizes[1], 32))) { my_sizes[1] = align_to(my_sizes[1], 32); adjusted_coords[1] = align_to(adjusted_coords[1], 32); } + else if (fmt == cldnn::format::byx8_f4 && (!(is_aligned_to(my_sizes[1], 4)) || !(is_aligned_to(my_sizes[2], 8)))) + { + my_sizes[1] = align_to(my_sizes[1], 4); + my_sizes[2] = 
align_to(my_sizes[2], 8); + adjusted_coords[1] = align_to(adjusted_coords[1], 4); + adjusted_coords[2] = align_to(adjusted_coords[2], 8); + } else if (fmt == cldnn::format::fs_bs_yx_bsv4_fsv32 && (!is_aligned_to(my_sizes[1], 32) || !is_aligned_to(my_sizes[0], 4) )) { my_sizes[1] = align_to(my_sizes[1], 32); @@ -764,6 +881,7 @@ private: } }; +#define TensorValue(val) static_cast(val) template inline void details::dim_vec_kind_init::init_tensor_values(cldnn::tensor & t) @@ -781,6 +899,26 @@ inline tensor operator*(const tensor& lhs, tensor::value_type rhs) { return lhs. /// @brief Divides a @p tensor by a @p scalar inline tensor operator/(const tensor& lhs, tensor::value_type rhs) { return lhs.div(rhs); } +/// +/// \brief Converts C API tensor_array to std::vector +/// +inline std::vector tensor_arr_to_vector(const cldnn_tensor_arr& arr) +{ + std::vector result(arr.size); + for (size_t i = 0; i < arr.size; i++) + result[i] = arr.data[i]; + + return result; +} + +/// +/// \brief Converts std::vector to std::vector of C API tensor +/// +inline std::vector tensor_vector_to_cldnn_vector(const std::vector& stor) +{ + return std::vector(stor.begin(), stor.end()); +} + /// @} /// @} } diff --git a/inference-engine/thirdparty/clDNN/api/CPP/topology.hpp b/inference-engine/thirdparty/clDNN/api/CPP/topology.hpp index e5a44e4..37481ab 100644 --- a/inference-engine/thirdparty/clDNN/api/CPP/topology.hpp +++ b/inference-engine/thirdparty/clDNN/api/CPP/topology.hpp @@ -61,6 +61,13 @@ struct topology return *this; } + /// Construct C++ topology based on C API @p cldnn_topology + topology(const cldnn_topology& other) + :_impl(other) + { + if (_impl == nullptr) throw std::invalid_argument("implementation pointer should not be null"); + } + /// @brief Releases wrapped C API @ref cldnn_topology. ~topology() { @@ -124,11 +131,6 @@ private: friend struct network; cldnn_topology _impl; - topology(cldnn_topology impl) :_impl(impl) - { - if (_impl == nullptr) throw std::invalid_argument("implementation pointer should not be null"); - } - void retain() { check_status("retain topology failed", [=](status_t* status) { cldnn_retain_topology(_impl, status); }); diff --git a/inference-engine/thirdparty/clDNN/api_extension/C/fused_conv_bn_scale.h b/inference-engine/thirdparty/clDNN/api_extension/C/fused_conv_bn_scale.h new file mode 100644 index 0000000..a57d752 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/api_extension/C/fused_conv_bn_scale.h @@ -0,0 +1,73 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/inference-engine/thirdparty/clDNN/api_extension/C/fused_conv_bn_scale.h b/inference-engine/thirdparty/clDNN/api_extension/C/fused_conv_bn_scale.h
new file mode 100644
index 0000000..a57d752
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/api_extension/C/fused_conv_bn_scale.h
@@ -0,0 +1,73 @@
+/*
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#ifndef FUSED_CONV_BN_SCALE_H
+#define FUSED_CONV_BN_SCALE_H
+
+#include "api/C/cldnn.h"
+/// @addtogroup c_api C API
+/// @{
+/// @addtogroup c_topology Network Topology
+/// @{
+/// @addtogroup c_primitives Primitives
+/// @{
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @brief Primitive that fuses convolution, batch norm, scale, and optionally Relu.
+CLDNN_BEGIN_PRIMITIVE_DESC(fused_conv_bn_scale)
+/// @brief Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the convolution window should start calculations.
+cldnn_tensor input_offset;
+/// @brief Defines shift in input buffer between adjacent calculations of output values.
+cldnn_tensor stride;
+/// @brief Defines gaps in the input - dilation rate k=1 is normal convolution, k=2 means skipping one pixel per input, k=4 means skipping 3 pixels.
+/// As an example in one dimension, a filter w of size 3 would compute over input x the following: w[0]*x[0] + w[1]*x[1] + w[2]*x[2] for dilation of 1.
+/// For dilation 2 the filter would instead compute w[0]*x[0] + w[1]*x[2] + w[2]*x[4].
+cldnn_tensor dilation;
+/// @brief Enable Relu activation.
+uint32_t with_activation;
+/// @brief Relu activation slope.
+float activation_negative_slope;
+/// @brief Number of parts the computation is split into.
+uint32_t split;
+/// @brief Array of primitive ids containing weights data. Size of array should be equivalent to @p split.
+cldnn_primitive_id_arr weights;
+/// @brief Array of primitive ids containing bias data. Size of array should be equivalent to @p split.
+cldnn_primitive_id_arr bias;
+/// @brief Primitive id containing scale bias data for fused convolution.
+cldnn_primitive_id scale_bias;
+/// @brief Primitive id containing inverted variance used in future gradient computing for fused convolution.
+cldnn_primitive_id inv_variance;
+/// @brief Epsilon for fused convolution.
+float epsilon;
+/// @brief Indicates that primitive is fused with batch norm and scale.
+uint32_t fused_batch_norm_scale;
+CLDNN_END_PRIMITIVE_DESC(fused_conv_bn_scale)
+
+CLDNN_DECLARE_PRIMITIVE_TYPE_ID(fused_conv_bn_scale);
+
+#ifdef __cplusplus
+}
+#endif
+
+/// @}
+/// @}
+/// @}
+#endif /* FUSED_CONV_BN_SCALE_H */
+
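[Editor's note] The dilation comment in the descriptor above is easiest to verify with a toy 1-D version of the computation it describes. This is illustrative code written for this note, not part of clDNN:

#include <cstdio>

// 1-D illustration of the dilation semantics documented above:
// for dilation d, tap i of a 3-tap filter w reads input x[i * d].
static float dilated_dot3(const float* x, const float* w, int d)
{
    return w[0] * x[0] + w[1] * x[1 * d] + w[2] * x[2 * d];
}

int main()
{
    const float x[5] = { 1, 2, 3, 4, 5 };
    const float w[3] = { 1, 1, 1 };
    std::printf("dilation 1: %g\n", dilated_dot3(x, w, 1)); // w[0]*x[0] + w[1]*x[1] + w[2]*x[2] = 6
    std::printf("dilation 2: %g\n", dilated_dot3(x, w, 2)); // w[0]*x[0] + w[1]*x[2] + w[2]*x[4] = 9
    return 0;
}

So a dilation of k leaves the filter size unchanged but widens its receptive field, skipping k-1 input elements between taps.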
diff --git a/inference-engine/thirdparty/clDNN/api_extension/C/fused_conv_eltwise.h b/inference-engine/thirdparty/clDNN/api_extension/C/fused_conv_eltwise.h
new file mode 100644
index 0000000..4586487
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/api_extension/C/fused_conv_eltwise.h
@@ -0,0 +1,104 @@
+/*
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#ifndef FUSED_CONV_ELTWISE_H
+#define FUSED_CONV_ELTWISE_H
+
+#include "api/C/cldnn.h"
+/// @addtogroup c_api C API
+/// @{
+/// @addtogroup c_topology Network Topology
+/// @{
+/// @addtogroup c_primitives Primitives
+/// @{
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @brief Performs forward spatial convolution with weight sharing, fused with eltwise.
+/// Also supports built-in Relu @CLDNN_PRIMITIVE_DESC{activation}, set separately for the convolution and the eltwise parts via the arguments.
+CLDNN_BEGIN_PRIMITIVE_DESC(fused_conv_eltwise)
+
+struct conv_data
+{
+    /// @brief Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the convolution window should start calculations.
+    cldnn_tensor input_offset;
+    /// @brief Defines shift in input buffer between adjacent calculations of output values.
+    cldnn_tensor stride;
+    /// @brief Defines gaps in the input - dilation rate k=1 is normal convolution, k=2 means skipping one pixel per input, k=4 means skipping 3 pixels.
+    /// As an example in one dimension, a filter w of size 3 would compute over input x the following: w[0]*x[0] + w[1]*x[1] + w[2]*x[2] for dilation of 1.
+    /// For dilation 2 the filter would instead compute w[0]*x[0] + w[1]*x[2] + w[2]*x[4].
+    cldnn_tensor dilation;
+    /// @brief Enable Relu activation.
+    uint32_t with_activation;
+    /// @brief Relu activation slope.
+    float activation_negative_slope;
+    /// @brief Number of parts the computation is split into.
+    uint32_t split;
+    /// @brief Indicates that the primitive has user-defined output size (non-zero value).
+    uint32_t with_output_size;
+    /// @brief User-defined output data size of the primitive (w/o padding).
+    cldnn_tensor output_size;
+    /// @brief Array of primitive ids containing weights data. Size of array should be equivalent to @p split.
+    cldnn_primitive_id_arr weights;
+    /// @brief Array of primitive ids containing bias data. Size of array should be equivalent to @p split.
+    cldnn_primitive_id_arr bias;
+    /// @brief List of primitive ids containing weights quantization factors per output feature map.
+    cldnn_primitive_id_arr weights_quantization_factors;
+    /// @brief List of primitive ids containing output calibration factors per output feature map.
+    cldnn_primitive_id_arr output_calibration_factors;
+    /// @brief Input quantization factor
+    float input_quantization_factor;
+    /// @brief Output quantization factor
+    float output_quantization_factor;
+} conv;
+
+struct eltw_data
+{
+    /// @brief Primitive id containing output quantization factors per output feature map.
+    cldnn_primitive_id output_calibration_factors;
+    /// @brief Output quantization factor
+    float output_quantization_factor;
+    /// @brief Eltwise mode. See #cldnn_eltwise_mode.
+    int32_t mode; /*cldnn_eltwise_mode*/
+    /// @brief Blob-wise coefficient for SUM operation
+    cldnn_float_arr coefficients;
+    /// @brief Enables Relu activation.
+    uint32_t with_activation;
+    /// @brief Relu activation slope.
+    float activation_negative_slope;
+    /// @brief Defines shift in input buffers between adjacent calculations of output values.
+    cldnn_tensor_arr stride;
+} eltw;
+
+/// @brief Indicates whether the optimization that makes the output contain the second input's data is enabled.
+bool second_input_in_output = false; + +CLDNN_END_PRIMITIVE_DESC(fused_conv_eltwise) + +CLDNN_DECLARE_PRIMITIVE_TYPE_ID(fused_conv_eltwise); + +#ifdef __cplusplus +} +#endif + +/// @} +/// @} +/// @} +#endif /* FUSED_CONV_ELTWISE_H */ + diff --git a/inference-engine/thirdparty/clDNN/api_extension/CPP/fused_conv_bn_scale.hpp b/inference-engine/thirdparty/clDNN/api_extension/CPP/fused_conv_bn_scale.hpp new file mode 100644 index 0000000..117e4ac --- /dev/null +++ b/inference-engine/thirdparty/clDNN/api_extension/CPP/fused_conv_bn_scale.hpp @@ -0,0 +1,170 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include "../C/fused_conv_bn_scale.h" +#include "api/CPP/primitive.hpp" + +namespace cldnn +{ +/// @addtogroup cpp_api C++ API +/// @{ +/// @addtogroup cpp_topology Network Topology +/// @{ +/// @addtogroup cpp_primitives Primitives +/// @{ + +/// @brief Primitives that fuses convolution, batch norm, scale and optionally Relu. +struct fused_conv_bn_scale : public primitive_base +{ + CLDNN_DECLARE_PRIMITIVE(fused_conv_bn_scale) + + /// @brief Constructs convolution primitive fused with batch norm and scale. + /// @param id This primitive id. + /// @param input Input primitive id. + /// @param weights List of primitive ids containing weights data. + /// @param bias List of primitive ids containing bias data. + /// @param epsilon Small number to protect from 0 dividing. + /// @param scale_input Scale input primitive id with values needed for product computation. Used in fused scale part. + /// @param scale_bias Primitive id containing bias data for fused scale part. + /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the convolution window should start calculations. + /// @param stride Defines shift in input buffer between adjacent calculations of output values. + /// @param inv_variance Primitive id containing inverted variance calculated in this primitive. Used in fused batch norm part. + /// @param with_activation Enable Relu activation. + /// @param activation_slp Relu activation slope. 
+ fused_conv_bn_scale( + const primitive_id& id, + const primitive_id& input, + const std::vector& weights, + const std::vector& bias, + float epsilon, + const primitive_id& scale_input, + const primitive_id& scale_bias = "", + tensor stride = { 1, 1, 1, 1 }, + tensor dilation = { 1, 1, 1, 1 }, + tensor input_offset = { 0,0,0,0 }, + const primitive_id& inv_variance = "", + bool with_activation = false, + float activation_slp = 0.0f, + const padding& output_padding = padding() + ) + :primitive_base(id, { input, scale_input }, output_padding) + , weights(_weights.cpp_ids) + , bias(_bias.cpp_ids) + , input_offset(input_offset) + , stride(stride) + , dilation(dilation) + , with_activation(with_activation) + , activation_negative_slope(activation_slp) + , with_output_size(false) + , scale_bias(scale_bias) + , inv_variance(inv_variance) + , epsilon(epsilon) + , _weights(weights) + , _bias(bias) + { + if ((bias.size() != 0) && (weights.size() != bias.size())) + throw std::runtime_error("convolution's weights/bias count does not match"); + } + + /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{fused_conv_bn_scale} + fused_conv_bn_scale(const dto* dto) + :primitive_base(dto) + , weights(_weights.cpp_ids) + , bias(_bias.cpp_ids) + , input_offset(dto->input_offset) + , stride(dto->stride) + , dilation(dto->dilation) + , with_activation(dto->with_activation != 0) + , activation_negative_slope(dto->activation_negative_slope) + , scale_bias(dto->scale_bias) + , inv_variance(dto->inv_variance) + , epsilon(dto->epsilon) + , _weights(dto->weights) + , _bias(dto->bias) + { + if (!dto->split || (weights.size() != bias.size() && bias.size() != 0) || dto->split != weights.size()) + throw std::invalid_argument("Invalid convolution dto: bad split value"); + } + + /// @brief List of primitive ids containing weights data. + fixed_size_vector_ref weights; + /// @brief List of primitive ids containing bias data. + fixed_size_vector_ref bias; + /// @brief Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the convolution window should start calculations. + tensor input_offset; + /// @brief Defines shift in input buffer between adjacent calculations of output values. + tensor stride; + /// @brief Defines gaps in the input - dilation rate k=1 is normal convolution, k=2 means skipping one pixel per input, k=4 means skipping 3 pixels. + /// As an example in one dimension, a filter w of size 3 would compute over input x the following: w[0]*x[0] + w[1]*x[1] + w[2]*x[2] for dilation of 1. + /// For dilation 2 the filter would instead compute w[0]*x[0] + w[1]*x[2] + w[2]*x[4]. + tensor dilation; + /// @brief Enable Relu activation. + bool with_activation; + /// @brief Relu activation slope. + float activation_negative_slope; + /// @brief Indicates that the primitive has user-defined output size (non-zero value). + bool with_output_size; + /// @brief User-defined output data size of the primitive (w/o padding). + tensor output_size; + /// @brief Primitive id containing scale bias data for fused convolution. + primitive_id scale_bias; + /// @brief Primitive id containing inverted variance used in future gradient computing for fused convolution. + primitive_id inv_variance; + /// @brief Epsilon for fused convolution. + float epsilon; + /// @brief On how many cards split the computation to. 
+    int32_t split() const { return static_cast<int32_t>(weights.size()); }
+
+protected:
+    primitive_id_arr _weights;
+    primitive_id_arr _bias;
+
+    std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override
+    {
+        std::vector<std::reference_wrapper<const primitive_id>> ret;
+        ret.reserve(weights.size() + bias.size() + !scale_bias.empty() + !inv_variance.empty());
+        for (auto& w : weights)
+            ret.push_back(w);
+        for (auto& b : bias)
+            ret.push_back(b);
+        if (!scale_bias.empty())
+            ret.push_back(scale_bias);
+        if (!inv_variance.empty())
+            ret.push_back(inv_variance);
+        return ret;
+    }
+
+    void update_dto(dto& dto) const override
+    {
+        dto.weights = _weights.ref();
+        dto.bias = _bias.ref();
+        dto.input_offset = input_offset;
+        dto.stride = stride;
+        dto.dilation = dilation;
+        dto.split = split();
+        dto.with_activation = with_activation;
+        dto.activation_negative_slope = activation_negative_slope;
+        dto.epsilon = epsilon;
+        dto.inv_variance = inv_variance.c_str();
+        dto.scale_bias = scale_bias.c_str();
+    }
+};
+/// @}
+/// @}
+/// @}
+}
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/api_extension/CPP/fused_conv_eltwise.hpp b/inference-engine/thirdparty/clDNN/api_extension/CPP/fused_conv_eltwise.hpp
new file mode 100644
index 0000000..bc3a278
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/api_extension/CPP/fused_conv_eltwise.hpp
@@ -0,0 +1,262 @@
+/*
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma once
+#include "../C/fused_conv_eltwise.h"
+#include "api/CPP/primitive.hpp"
+#include "api/CPP/eltwise.hpp"
+
+namespace cldnn
+{
+/// @addtogroup cpp_api C++ API
+/// @{
+/// @addtogroup cpp_topology Network Topology
+/// @{
+/// @addtogroup cpp_primitives Primitives
+/// @{
+
+/// @brief Performs forward spatial convolution with fused eltwise and optionally Relu.
+struct fused_conv_eltwise : public primitive_base<fused_conv_eltwise, CLDNN_PRIMITIVE_DESC(fused_conv_eltwise)>
+{
+    CLDNN_DECLARE_PRIMITIVE(fused_conv_eltwise)
+
+    /// @brief Constructs fused_conv_eltwise primitive.
+    /// @param id This primitive id.
+    /// @param input Input primitive id.
+    /// @param weights List of primitive ids containing weights data.
+    /// @param bias List of primitive ids containing bias data.
+    /// @param w_quantization_factor List of primitive ids containing weights quantization factors per output feature map.
+    /// @param output_calibration_factors List of primitive ids containing output calibration factors per output feature map.
+    /// @param i_quantization_factor Input quantization factor
+    /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the convolution window should start calculations.
+    /// @param stride Defines shift in input buffer between adjacent calculations of output values.
+    /// @param dilation Defines gaps in the input - dilation rate k=1 is normal convolution, k=2 means skipping one pixel per input, k=4 means skipping 3 pixels.
+ /// As an example in one dimension, a filter w of size 3 would compute over input x the following: w[0]*x[0] + w[1]*x[1] + w[2]*x[2] for dilation of 1. + /// For dilation 2 the filter would instead compute w[0]*x[0] + w[1]*x[2] + w[2]*x[4]. + /// @param with_activation Enable Relu activation. + /// @param activation_slp Relu activation slope. + fused_conv_eltwise( + const primitive_id& id, + const primitive_id& input, + const primitive_id& input2, + eltwise_mode mode, + const std::vector& weights, + const std::vector& bias, + const std::vector& conv_w_quantization_factor, + const std::vector& conv_output_calibration_factors, + const float conv_i_quantization_factor, + const primitive_id& eltw_output_calibration_factors, + const std::vector& eltw_stride, + tensor stride = { 1, 1, 1, 1 }, + tensor input_offset = { 0,0,0,0 }, + tensor dilation = { 1, 1, 1, 1 }, + bool conv_with_activation = false, + float conv_activation_slp = 0.0f, + bool eltw_with_activation = false, + float eltw_activation_slp = 0.0f, + const padding& output_padding = padding() + ) + :primitive_base(id, { input, input2 }, output_padding) + , conv(_conv_weights.cpp_ids, _conv_bias.cpp_ids, _conv_weights_quantization_factors.cpp_ids, _conv_output_calibration_factors.cpp_ids) + , eltw(eltw_output_calibration_factors) + , _conv_weights(weights) + , _conv_bias(bias) + , _conv_weights_quantization_factors(conv_w_quantization_factor) + , _conv_output_calibration_factors(conv_output_calibration_factors) + { + + conv.input_quantization_factor = conv_i_quantization_factor; + conv.output_quantization_factor = 1.0f; + + conv.input_offset = input_offset; + conv.stride = stride; + conv.dilation = dilation; + conv.with_activation = conv_with_activation; + conv.activation_negative_slope = conv_activation_slp; + conv.with_output_size = false; + + eltw.mode = mode; + eltw.with_activation = eltw_with_activation; + eltw.activation_negative_slope = eltw_activation_slp; + eltw.stride = eltw_stride; + + if ((bias.size() != 0) && (weights.size() != bias.size())) + throw std::runtime_error("convolution's weights/bias count does not match"); + if (conv.output_calibration_factors.size()) + { + if ((weights.size() != 0) && (weights.size() != conv.weights_quantization_factors.size())) + throw std::runtime_error("convolution's weights count does not match quantization factors count"); + } + } + + /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{convolution} + fused_conv_eltwise(const dto* dto) + :primitive_base(dto) + , conv(_conv_weights.cpp_ids, _conv_bias.cpp_ids, _conv_weights_quantization_factors.cpp_ids, _conv_output_calibration_factors.cpp_ids) + , eltw(dto->eltw.output_calibration_factors) + , _conv_weights(dto->conv.weights) + , _conv_bias(dto->conv.bias) + , _conv_weights_quantization_factors(dto->conv.weights_quantization_factors) + , _conv_output_calibration_factors(dto->conv.output_calibration_factors) + , _eltw_stride(tensor_vector_to_cldnn_vector(eltw.stride)) + { + conv.input_quantization_factor = dto->conv.input_quantization_factor; + conv.output_quantization_factor = dto->conv.output_quantization_factor; + conv.input_offset = dto->conv.input_offset; + conv.stride = dto->conv.stride; + conv.dilation = dto->conv.dilation; + conv.with_activation = dto->conv.with_activation != 0; + conv.activation_negative_slope = dto->conv.activation_negative_slope; + conv.with_output_size = dto->conv.with_output_size != 0; + conv.output_size = dto->conv.output_size; + + second_input_in_output = dto->second_input_in_output; + + if 
(!dto->conv.split || (conv.weights.size() != conv.bias.size() && conv.bias.size() != 0) || dto->conv.split != conv.weights.size())
+            throw std::invalid_argument("Invalid convolution dto: bad split value");
+    }
+
+    struct conv_data
+    {
+        /// @brief List of primitive ids containing weights data.
+        fixed_size_vector_ref weights;
+        /// @brief List of primitive ids containing bias data.
+        fixed_size_vector_ref bias;
+        /// @brief List of primitive ids containing weights quantization factors per output feature map.
+        fixed_size_vector_ref weights_quantization_factors;
+        /// @brief List of primitive ids containing output quantization factors per output feature map for convolution.
+        fixed_size_vector_ref output_calibration_factors;
+        /// @brief Input quantization factor for convolution
+        float input_quantization_factor;
+        /// @brief Output quantization factor for convolution
+        float output_quantization_factor;
+        /// @brief Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the convolution window should start calculations.
+        tensor input_offset;
+        /// @brief Defines shift in input buffer between adjacent calculations of output values.
+        tensor stride;
+        /// @brief Defines gaps in the input - dilation rate k=1 is normal convolution, k=2 means skipping one pixel per input, k=4 means skipping 3 pixels.
+        /// As an example in one dimension, a filter w of size 3 would compute over input x the following: w[0]*x[0] + w[1]*x[1] + w[2]*x[2] for dilation of 1.
+        /// For dilation 2 the filter would instead compute w[0]*x[0] + w[1]*x[2] + w[2]*x[4].
+        tensor dilation;
+        /// @brief Enable Relu activation.
+        bool with_activation;
+        /// @brief Relu activation slope.
+        float activation_negative_slope;
+        /// @brief Indicates that the primitive has user-defined output size (non-zero value).
+        bool with_output_size;
+        /// @brief User-defined output data size of the primitive (w/o padding).
+        tensor output_size;
+
+        conv_data(const fixed_size_vector_ref& weights,
+                  const fixed_size_vector_ref& bias,
+                  const fixed_size_vector_ref& weights_quantization_factors,
+                  const fixed_size_vector_ref& output_calibration_factors
+        ) : weights(weights),
+            bias(bias),
+            weights_quantization_factors(weights_quantization_factors),
+            output_calibration_factors(output_calibration_factors)
+        {}
+    } conv;
+
+    struct eltw_data
+    {
+        /// @brief Primitive id containing output quantization factors per output feature map.
+        primitive_id output_calibration_factors;
+        /// @brief Output quantization factor for eltwise
+        float output_quantization_factor;
+        /// @brief Eltwise mode.
+        eltwise_mode mode;
+        /// @brief Enable Relu activation.
+        bool with_activation;
+        /// @brief Relu activation slope.
+        float activation_negative_slope;
+        /// @brief Defines shift in input buffers between adjacent calculations of output values.
+        std::vector<tensor> stride;
+
+        eltw_data(const primitive_id& output_calibration_factors)
+            : output_calibration_factors(output_calibration_factors)
+        {}
+    } eltw;
+
+    /// @brief Number of parts the computation is split into.
+    int32_t split() const { return static_cast<int32_t>(conv.weights.size()); }
+
+    /// @brief Indicates whether the optimization that makes the output contain the second input's data is enabled.
+ bool second_input_in_output = false; +protected: + primitive_id_arr _conv_weights; + primitive_id_arr _conv_bias; + primitive_id_arr _conv_weights_quantization_factors; + primitive_id_arr _conv_output_calibration_factors; + + std::vector _eltw_stride; + + std::vector> get_dependencies() const override + { + std::vector> ret; + ret.reserve(conv.weights.size() + + conv.bias.size() + + conv.weights_quantization_factors.size() + + conv.output_calibration_factors.size() + + (eltw.output_calibration_factors.empty() ? 0 : 1)); + + for (auto& w : conv.weights) + ret.push_back(w); + for (auto& b : conv.bias) + ret.push_back(b); + for (auto& q : conv.weights_quantization_factors) + ret.push_back(q); + for (auto& q : conv.output_calibration_factors) + ret.push_back(q); + + if (!eltw.output_calibration_factors.empty()) + ret.push_back(eltw.output_calibration_factors); + + return ret; + } + + void update_dto(dto& dto) const override + { + dto.conv.weights = _conv_weights.ref(); + dto.conv.bias = _conv_bias.ref(); + dto.conv.weights_quantization_factors = _conv_weights_quantization_factors.ref(); + dto.conv.output_calibration_factors = _conv_output_calibration_factors.ref(); + dto.conv.input_quantization_factor = conv.input_quantization_factor; + dto.conv.output_quantization_factor = conv.output_quantization_factor; + dto.conv.input_offset = conv.input_offset; + dto.conv.stride = conv.stride; + dto.conv.split = split(); + dto.conv.with_activation = conv.with_activation; + dto.conv.activation_negative_slope = conv.activation_negative_slope; + dto.conv.dilation = conv.dilation; + dto.conv.with_output_size = conv.with_output_size; + dto.conv.output_size = conv.output_size; + + dto.eltw.output_calibration_factors = eltw.output_calibration_factors.c_str(); + dto.eltw.output_quantization_factor = eltw.output_quantization_factor; + dto.eltw.mode = static_cast(eltw.mode); + dto.eltw.with_activation = eltw.with_activation; + dto.eltw.activation_negative_slope = eltw.activation_negative_slope; + dto.eltw.stride = tensor_vector_to_arr(_eltw_stride); + + dto.second_input_in_output = second_input_in_output; + } +}; +/// @} +/// @} +/// @} +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/common/boost/1.64.0/include/boost-1_64/boost/make_unique.hpp b/inference-engine/thirdparty/clDNN/common/boost/1.64.0/include/boost-1_64/boost/make_unique.hpp new file mode 100644 index 0000000..7189d6e --- /dev/null +++ b/inference-engine/thirdparty/clDNN/common/boost/1.64.0/include/boost-1_64/boost/make_unique.hpp @@ -0,0 +1,13 @@ +/* +Copyright 2014 Glen Joseph Fernandes +(glenjofe@gmail.com) + +Distributed under the Boost Software License, Version 1.0. +(http://www.boost.org/LICENSE_1_0.txt) +*/ +#ifndef BOOST_MAKE_UNIQUE_HPP_INCLUDED +#define BOOST_MAKE_UNIQUE_HPP_INCLUDED + +#include + +#endif diff --git a/inference-engine/thirdparty/clDNN/common/boost/1.64.0/include/boost-1_64/boost/smart_ptr/make_unique.hpp b/inference-engine/thirdparty/clDNN/common/boost/1.64.0/include/boost-1_64/boost/smart_ptr/make_unique.hpp new file mode 100644 index 0000000..eed5033 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/common/boost/1.64.0/include/boost-1_64/boost/smart_ptr/make_unique.hpp @@ -0,0 +1,110 @@ +/* +Copyright 2012-2015 Glen Joseph Fernandes +(glenjofe@gmail.com) + +Distributed under the Boost Software License, Version 1.0. 
+(http://www.boost.org/LICENSE_1_0.txt)
+*/
+#ifndef BOOST_SMART_PTR_MAKE_UNIQUE_HPP
+#define BOOST_SMART_PTR_MAKE_UNIQUE_HPP
+
+#include <boost/config.hpp>
+#include <memory>
+#include <utility>
+
+namespace boost {
+namespace detail {
+
+template<class T>
+struct up_if_object {
+    typedef std::unique_ptr<T> type;
+};
+
+template<class T>
+struct up_if_object<T[]> { };
+
+template<class T, std::size_t N>
+struct up_if_object<T[N]> { };
+
+template<class T>
+struct up_if_array { };
+
+template<class T>
+struct up_if_array<T[]> {
+    typedef std::unique_ptr<T[]> type;
+};
+
+template<class T>
+struct up_remove_reference {
+    typedef T type;
+};
+
+template<class T>
+struct up_remove_reference<T&> {
+    typedef T type;
+};
+
+template<class T>
+struct up_remove_reference<T&&> {
+    typedef T type;
+};
+
+template<class T>
+struct up_element { };
+
+template<class T>
+struct up_element<T[]> {
+    typedef T type;
+};
+
+} /* detail */
+
+template<class T>
+inline typename detail::up_if_object<T>::type
+make_unique()
+{
+    return std::unique_ptr<T>(new T());
+}
+
+#if !defined(BOOST_NO_CXX11_VARIADIC_TEMPLATES)
+template<class T, class... Args>
+inline typename detail::up_if_object<T>::type
+make_unique(Args&&... args)
+{
+    return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+#endif
+
+template<class T>
+inline typename detail::up_if_object<T>::type
+make_unique(typename detail::up_remove_reference<T>::type&& value)
+{
+    return std::unique_ptr<T>(new T(std::move(value)));
+}
+
+template<class T>
+inline typename detail::up_if_object<T>::type
+make_unique_noinit()
+{
+    return std::unique_ptr<T>(new T);
+}
+
+template<class T>
+inline typename detail::up_if_array<T>::type
+make_unique(std::size_t size)
+{
+    return std::unique_ptr<T>(new typename
+        detail::up_element<T>::type[size]());
+}
+
+template<class T>
+inline typename detail::up_if_array<T>::type
+make_unique_noinit(std::size_t size)
+{
+    return std::unique_ptr<T>(new typename
+        detail::up_element<T>::type[size]);
+}
+
+} /* boost */
+
+#endif
diff --git a/inference-engine/thirdparty/clDNN/create_msvc_mscc.bat b/inference-engine/thirdparty/clDNN/create_msvc_mscc.bat
index 9149792..156bc08 100644
--- a/inference-engine/thirdparty/clDNN/create_msvc_mscc.bat
+++ b/inference-engine/thirdparty/clDNN/create_msvc_mscc.bat
@@ -31,7 +31,7 @@ rmdir /S /Q %SOLUTION_DIR64%\codegen
 echo Creating Visual Studio 2015 (Win32) files in %SOLUTION_DIR32%... && ^
 cd "%ROOT_DIR%" && cmake -E make_directory "%SOLUTION_DIR32%" && cd "%SOLUTION_DIR32%" && cmake -G "Visual Studio 14 2015" "-DCLDNN__ARCHITECTURE_TARGET=%SOLUTION_TARGET32%" "%ROOT_DIR%"
 echo Creating Visual Studio 2015 (x64) files in %SOLUTION_DIR64%... && ^
-cd "%ROOT_DIR%" && cmake -E make_directory "%SOLUTION_DIR64%" && cd "%SOLUTION_DIR64%" && cmake -G "Visual Studio 14 2015 Win64" "-DCLDNN__ARCHITECTURE_TARGET=%SOLUTION_TARGET64%""%ROOT_DIR%"
+cd "%ROOT_DIR%" && cmake -E make_directory "%SOLUTION_DIR64%" && cd "%SOLUTION_DIR64%" && cmake -G "Visual Studio 14 2015 Win64" "-DCLDNN__ARCHITECTURE_TARGET=%SOLUTION_TARGET64%" "%ROOT_DIR%"
 echo Done.
pause diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/CMakeLists.txt b/inference-engine/thirdparty/clDNN/kernel_selector/CMakeLists.txt index f8f6837..ecaede5 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/CMakeLists.txt +++ b/inference-engine/thirdparty/clDNN/kernel_selector/CMakeLists.txt @@ -43,7 +43,7 @@ file(GLOB __CLDNN_Sources__main "${__CLDNN_Directory__main}/*.hpp" "${__CLDNN_Directory__main}/*.cpp" ) - + set(__CLDNN_Directory__core "${__CLDNN_Directory__main}/core") set(__CLDNN_Label__core "core") file(GLOB __CLDNN_Sources__core @@ -59,7 +59,7 @@ file(GLOB __CLDNN_Sources__common "${__CLDNN_Directory__common}/*.hpp" "${__CLDNN_Directory__common}/*.cpp" ) - + set(__CLDNN_Directory__core_common "${__CLDNN_Directory__core}/common") set(__CLDNN_Label__core_common "${__CLDNN_Label__core}\\common") file(GLOB __CLDNN_Sources__core_common @@ -87,7 +87,7 @@ foreach(__CLDNN_FilePath ${__CLDNN_Sources__actual_kernels}) string(REPLACE ";" "\;" __CLDNN_FilePath "${__CLDNN_FilePath}") # [WA#1] Must escape ; again if occurred in item. get_filename_component(__CLDNN_FileDir "${__CLDNN_FilePath}" DIRECTORY) get_filename_component(__CLDNN_DirName "${__CLDNN_FileDir}" NAME) - + set(__CLDNN_FileLabel "${__CLDNN_Label__actual_kernels}\\${__CLDNN_DirName}") source_group("${__CLDNN_FileLabel}" FILES ${__CLDNN_FilePath}) endforeach() @@ -137,7 +137,7 @@ include_directories( "${__CLDNN_Directory__main}" "${__CLDNN_Directory__core}" "${__CLDNN_Directory__core}/common" - "${__CLDNN_Directory__core}/cache" + "${__CLDNN_Directory__core}/cache" "${__CLDNN_Directory__actual_kernels}" "${__CLDNN_Directory__common}" ) @@ -165,7 +165,6 @@ endif() target_link_libraries("${CLDNN_BUILD__PROJ}" ${CLDNN__SYSTEM_LINK_LIBRARIES}) # =================================== Custom pre- and post-steps ======================================= - add_custom_command(OUTPUT "${__CLDNN_CGDirectory__cg_cache}/${__CLDNN_File__cg_cache__prim_db}" COMMAND "${CMAKE_COMMAND}" -E make_directory "${__CLDNN_CGDirectory__cg_cache}" COMMAND "${PYTHON_EXECUTABLE}" "${__CLDNN_Directory__core_common}/primitive_db_gen.py" -out_path "${__CLDNN_CGDirectory__cg_cache}" -out_file_name "${__CLDNN_File__cg_cache__prim_db}" -kernels "${__CLDNN_Directory__cl_kernels}" @@ -177,5 +176,17 @@ add_custom_command(OUTPUT "${__CLDNN_Directory__cg_cache}/${__CLDNN_File__cg_cac DEPENDS "${__CLDNN_CGDirectory__cg_cache}/${__CLDNN_File__cg_cache__prim_db}" ${__CLDNN_Sources__cl_kernels} "${__CLDNN_Directory__core_common}/primitive_db_gen.py" COMMENT "Updating file if the file changed (${__CLDNN_File__cg_cache__prim_db}) ..." 
) +if(WIN32) + set(CLDNN_CACHE_PATH "${CLDNN__OUTPUT_BIN_DIR}/$") +else((NOT ANDROID) AND (UNIX)) + set(CLDNN_CACHE_PATH "${CLDNN__OUTPUT_LIB_DIR}/") +endif() + +message(STATUS "[CACHE COMMAND]: " "${CMAKE_COMMAND} -E copy_if_different ${__CLDNN_Directory__core}/cache/cache.json ${CLDNN_CACHE_PATH}") + +add_custom_command( + TARGET "${CLDNN_BUILD__PROJ}" POST_BUILD + COMMAND "${CMAKE_COMMAND}" -E copy_if_different ${__CLDNN_Directory__core}/cache/cache.json ${CLDNN_CACHE_PATH}) + # ====================================================================================================== diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/common/common_tools.h b/inference-engine/thirdparty/clDNN/kernel_selector/common/common_tools.h index 0f23cbe..509ead6 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/common/common_tools.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/common/common_tools.h @@ -50,6 +50,7 @@ namespace kernel_selector switch (wt) { case WeightsType::INT8: + case WeightsType::UINT8: return 1; case WeightsType::F16: return 2; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h b/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h index c244209..e923c78 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h @@ -1,4 +1,4 @@ -// Copyright (c) 2016-2018 Intel Corporation +// Copyright (c) 2016-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -40,6 +40,8 @@ namespace kernel_selector ACTIVATION, SOFT_MAX, ELTWISE, + FUSED_CONV_BN_SCALE, + FUSED_CONV_ELTWISE, TABLE_LOOKUP, REORDER, RESHAPE, @@ -63,7 +65,16 @@ namespace kernel_selector SELECT, BROADCAST, GEMM, - INDEX_SELECT + INDEX_SELECT, + PYRAMID_ROI_ALIGN, + CONTRACT, + ONE_HOT, + DETECTION_OUTPUT, + GATHER, + DEPTH_TO_SPACE, + SHUFFLE_CHANNELS, + STRIDED_SLICE, + REVERSE_SEQUENCE }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -92,6 +103,7 @@ namespace kernel_selector F16, F32, INT8, + UINT8, }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -119,9 +131,10 @@ namespace kernel_selector ACOS, COSH, LOG, - LOG2, + LOG2, EXP, NONE, + NOT, NONE_GRAD }; @@ -243,7 +256,17 @@ namespace kernel_selector MODULU, SQRT, RSQRT, - ASSIGN + ASSIGN, + EQ, + NE, + LT, + LE, + GT, + GE, + LOGIC_AND, + LOGIC_OR, + LOGIC_XOR, + SQUARED_DIFF }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -288,7 +311,7 @@ namespace kernel_selector enum class MeanSubtractMode { NONE, - INSIDE_PARAMS, // the index is feature id (modulu size) + INSIDE_PARAMS, // the index is feature id (modulu size) IN_BUFFER, }; @@ -299,7 +322,7 @@ namespace kernel_selector { NONE, SUB, - MUL, + MUL, DIV, }; @@ -357,18 +380,6 @@ namespace kernel_selector }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // NonLinearParams - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - struct NonLinearParams - { - float m = 1.f; - float n = 0.f; - - NonLinearParams() = default; - NonLinearParams(const float m, const 
float n) : m(m), n(n) {} - }; - - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Size //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template @@ -409,4 +420,27 @@ namespace kernel_selector //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// using uSize = Size; using stSize = Size; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // ContractMode + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + enum class ContractMode + { + SUM, + PRODUCT, + ALL, + ANY, + MAX, + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // GatherAxis + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + enum class GatherAxis + { + X, + Y, + FEATURE, + BATCH, + }; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.cpp index 555ca1e..4773448 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -30,45 +30,54 @@ namespace kernel_selector //X, Y, F, R, B {-1,-1, 0,-1, 1 }, // DataLayout::bf {-1,-1, 1,-1, 0 }, // DataLayout::fb - { 0, 1, 2,-1, 3 }, // DataLayout::bfyx - { 2, 3, 1,-1, 0 }, // DataLayout::yxfb - { 1, 2, 0,-1, 3 }, // DataLayout::byxf - { 1, 2, 3,-1, 0 }, // DataLayout::fyxb - {-1,-1, 0,-1, 1 }, // DataLayout::bs_f_bsv8__af8 - {-1,-1, 0,-1, 1 }, // DataLayout::bs_f_bsv16__af8 - { 0, 1, 2,-1, 3 }, // DataLayout::bf8_xy16 - { 0, 1, 2, 3, 4 }, // DataLayout::brfyx - { 2, 1, 0,-1, 3 }, // DataLayout::winograd_2x3_s1_data - { 1, 2, 0,-1, 3 }, // DataLayout::byxf_af32 - { 0, 1, 3,-1, 2 }, // DataLayout::fs_bs_yx_bsv4_fsv32 + { 0, 1, 2,-1, 3 }, // DataLayout::bfyx + { 2, 3, 1,-1, 0 }, // DataLayout::yxfb + { 1, 2, 0,-1, 3 }, // DataLayout::byxf + { 1, 2, 3,-1, 0 }, // DataLayout::fyxb + {-1,-1, 0,-1, 1 }, // DataLayout::bs_f_bsv8__af8 + {-1,-1, 0,-1, 1 }, // DataLayout::bs_f_bsv16__af8 + { 0, 1, 2,-1, 3 }, // DataLayout::bf8_xy16 + { 0, 1, 2, 3, 4 }, // DataLayout::brfyx + { 2, 1, 0,-1, 3 }, // DataLayout::winograd_2x3_s1_data + { 1, 2, 0,-1, 3 }, // DataLayout::byxf_af32 + { 1, 2, 0,-1, 3 }, // DataLayout::byx8_f8 + { 0, 1, 3,-1, 2 }, // DataLayout::fs_bs_yx_bsv4_fsv32 + { 0, 1, 2, -1, 3 },// DataLayout::b_fs_yx_fsv4 } }; - std::array, WeightsLayout::WeightsLayoutCount> WeightsTensor::weightsChannelArray + std::array, WeightsLayout::WeightsLayoutCount> WeightsTensor::weightsChannelArray { { - //X, Y, I, O - {-1,-1, 0, 1 }, // WeightsLayout::oi - {-1,-1, 1, 0 }, // WeightsLayout::io - { 0, 1, 2, 3 }, // WeightsLayout::oiyx - { 1, 2, 0, 3 }, // WeightsLayout::oyxi - { 1, 2, 3, 0 }, // WeightsLayout::iyxo - { 2, 3, 1, 0 }, // WeightsLayout::yxio - { 0, 1, 2, 3 }, // WeightsLayout::os_iyx_osv16 - { 0, 1, 2, 3 }, // 
WeightsLayout::os_iyx_osv16_rotate_180 - {-1,-1, 0, 1 }, // WeightsLayout::os_i_osv8__ai8 - {-1,-1, 0, 1 }, // WeightsLayout::os_i_osv16__ai8 - {-1,-1, 0, 1 }, // WeightsLayout::os_i_osv16 - { 1, 2, 3, 0 }, // WeightsLayout::i_yxs_os_yxsv2_osv16 - { 1, 2, 3, 0 }, // WeightsLayout::iy_xs_os_xsv2_osv16__ao32 - { 1, 2, 3, 0 }, // WeightsLayout::iy_xs_os_xsv2_osv8__ao32 - { 0, 1, 2, 3 }, // WeightsLayout::image_2d_weights_c4_fyx_b - { 0, 1, 2, 3 }, // WeightsLayout::image_2d_weights_c1_b_fyx - { 3, 2, 1, 0 }, // WeightsLayout::winograd_2x3_s1_weights - { 0, 1, 2, 3 }, // WeightsLayout::winograd_2x3_s1_fused_weights - { 0, 1, 2, 3 }, // WeightsLayout::winograd_6x3_s1_fused_weights - { 0, 1, 2, 3 }, // WeightsLayout::image_2d_weights_winograd_6x3_s1_fbxyb - { 0, 1, 2, 3 }, // WeightsLayout::image_2d_weights_winograd_6x3_s1_xfbyb - { 0, 1, 2, 3 }, // WeightsLayout::os_is_yx_isa8_osv8_isv4 - { 1, 2, 0, 3 }, // WeightsLayout::is_o_yx_isv32 + // X, Y, I, O, LX, LY, + { -1, -1, 0, 1, -1, -1 }, // WeightsLayout::oi + { -1, -1, 1, 0, -1, -1 }, // WeightsLayout::io + { 0, 1, 2, 3, -1, -1 }, // WeightsLayout::oiyx + { 1, 2, 0, 3, -1, -1 }, // WeightsLayout::oyxi + { 1, 2, 3, 0, -1, -1 }, // WeightsLayout::iyxo + { 2, 3, 1, 0, -1, -1 }, // WeightsLayout::yxio + { 0, 1, 2, 3, -1, -1 }, // WeightsLayout::os_iyx_osv16 + { 0, 1, 2, 3, -1, -1 }, // WeightsLayout::os_iyx_osv32 + { 0, 1, 2, 3, -1, -1 }, // WeightsLayout::os_iyx_osv64 + { 0, 1, 2, 3, -1, -1 }, // WeightsLayout::os_iyx_osv16_rotate_180 + { -1, -1, 0, 1, -1, -1 }, // WeightsLayout::os_i_osv8__ai8 + { -1, -1, 0, 1, -1, -1 }, // WeightsLayout::os_i_osv16__ai8 + { -1, -1, 0, 1, -1, -1 }, // WeightsLayout::os_i_osv16 + { 1, 2, 3, 0, -1, -1 }, // WeightsLayout::i_yxs_os_yxsv2_osv16 + { 1, 2, 3, 0, -1, -1 }, // WeightsLayout::iy_xs_os_xsv2_osv16__ao32 + { 1, 2, 3, 0, -1, -1 }, // WeightsLayout::iy_xs_os_xsv2_osv8__ao32 + { 0, 1, 2, 3, -1, -1 }, // WeightsLayout::image_2d_weights_c4_fyx_b + { 0, 1, 2, 3, -1, -1 }, // WeightsLayout::image_2d_weights_c1_b_fyx + { 3, 2, 1, 0, -1, -1 }, // WeightsLayout::winograd_2x3_s1_weights + { 0, 1, 2, 3, -1, -1 }, // WeightsLayout::winograd_2x3_s1_fused_weights + { 0, 1, 2, 3, -1, -1 }, // WeightsLayout::winograd_6x3_s1_fused_weights + { 0, 1, 2, 3, -1, -1 }, // WeightsLayout::image_2d_weights_winograd_6x3_s1_fbxyb + { 0, 1, 2, 3, -1, -1 }, // WeightsLayout::image_2d_weights_winograd_6x3_s1_xfbyb + { 0, 1, 2, 3, -1, -1 }, // WeightsLayout::os_is_yx_isa8_osv8_isv4 + { 0, 1, 2, 3, -1, -1 }, // WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4 + { 1, 2, 0, 3, -1, -1 }, // WeightsLayout::is_o_yx_isv32 + { 1, 2, 0, 3, -1, -1 }, // WeightsLayout::is_o32_yx_isv32_swizzled_by_4 + { 0, 1, 2, 3, -1, -1 }, // WeightsLayout::os_is_y_x8_osv8_isv4 + { 0, 1, 2, 3, 4, 5 }, // WeightsLayout::bf_lyx_yx + { 0, 1, 2, 3, -1, -1 }, // WeightsLayout::os_is_yx_osv16_isv4 } }; NDims DataTensor::GetSimpleDims(const std::vector& d, DataLayout l) @@ -98,6 +107,11 @@ namespace kernel_selector assert(newDims.size() == 4); newDims[0] = RoundUp(newDims[0], 32); break; + case byx8_f4: + assert(newDims.size() == 4); + newDims[0] = RoundUp(newDims[0], 4); + newDims[1] = RoundUp(newDims[1], 8); + break; case fs_bs_yx_bsv4_fsv32: assert(newDims.size() == 4); newDims[3] = RoundUp(newDims[3], 32); @@ -117,7 +131,7 @@ namespace kernel_selector pitch *= newDims[i]; } - if (l == byxf_af32 || l == fs_bs_yx_bsv4_fsv32) + if (l == byxf_af32 || l == fs_bs_yx_bsv4_fsv32 || l == byx8_f4) { ret[0].pitch = 1; ret[1].pitch = ret[0].pitch * newDims[0]; @@ -266,6 
+280,14 @@ namespace kernel_selector assert(newDims.size() == 4); newDims[3] = RoundUp(newDims[3], 16); break; + case os_iyx_osv32: + assert(newDims.size() == 4); + newDims[3] = RoundUp(newDims[3], 32); + break; + case os_iyx_osv64: + assert(newDims.size() == 4); + newDims[3] = RoundUp(newDims[3], 64); + break; case os_i_osv8__ai8: assert(newDims.size() == 2); newDims[0] = RoundUp(newDims[0], 8); @@ -294,10 +316,31 @@ namespace kernel_selector newDims[3] = RoundUp(newDims[3], 8); newDims[2] = RoundUp(newDims[2], 32); break; + case os_is_yx_isa8_osv8_isv4_swizzled_by_4: + assert(newDims.size() == 4); + newDims[3] = RoundUp(newDims[3], 32); + newDims[2] = RoundUp(newDims[2], 32); + break; case is_o_yx_isv32: assert(newDims.size() == 4); newDims[0] = RoundUp(newDims[0], 32); break; + case is_o32_yx_isv32_swizzled_by_4: + assert(newDims.size() == 4); + newDims[0] = RoundUp(newDims[0], 32); + newDims[3] = RoundUp(newDims[3], 32); + break; + case os_is_y_x8_osv8_isv4: + assert(newDims.size() == 4); + newDims[2] = RoundUp(newDims[2], 4); + newDims[3] = RoundUp(newDims[3], 8); + newDims[0] = RoundUp(newDims[0], 8); + break; + case os_is_yx_osv16_isv4: + assert(newDims.size() == 4); + newDims[2] = RoundUp(newDims[2], 4); + newDims[3] = RoundUp(newDims[3], 16); + break; default: break; } @@ -322,15 +365,20 @@ namespace kernel_selector { ret[2].pitch = RoundUp(ret[1].v, 2) * ret[1].pitch; ret[1].pad.after = newDims[1] - ret[1].v; - + ret[3].pitch = ret[2].v * ret[2].pitch; ret[2].pad.after = newDims[2] - ret[2].v; } - else if (l == os_is_yx_isa8_osv8_isv4) + else if (l == os_is_yx_isa8_osv8_isv4 || l == os_is_yx_isa8_osv8_isv4_swizzled_by_4) { ret[0].pitch = 256; ret[1].pitch = ret[0].pitch * ret[0].v; } + else if (l == bf_lyx_yx) + { + ret[2].pitch = ret[0].v * ret[1].v * ret[2].v * ret[3].v; + ret[3].pitch = ret[2].pitch * ret[5].v; + } return ret; } @@ -385,6 +433,15 @@ namespace kernel_selector vec[Channelndex(l, WeightsChannelName::IFM)] = dst_ifm; vec[Channelndex(l, WeightsChannelName::OFM)] = OFM().v; } + else if (src_channels == 6 && dst_channels == 6) + { + vec[Channelndex(l, WeightsChannelName::X)] = IFM().v; + vec[Channelndex(l, WeightsChannelName::Y)] = OFM().v; + vec[Channelndex(l, WeightsChannelName::IFM)] = LX().v; + vec[Channelndex(l, WeightsChannelName::OFM)] = LY().v; + vec[Channelndex(l, WeightsChannelName::LX)] = X().v; + vec[Channelndex(l, WeightsChannelName::LY)] = Y().v; + } else { assert(0); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h b/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h index 8331ab0..cb3d3e9 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2018 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
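[Editor's note] A compact way to read the GetSimpleDims cases above: each weights layout pads selected logical dimensions before pitches are accumulated, so the padded product gives the allocation size. A minimal sketch follows; RoundUp here mirrors the helper that common_tools.h appears to provide (the new #include in the next hunk suggests as much), and the pitch chain is the simple dense case, not the special-cased layouts:

#include <cassert>
#include <cstddef>
#include <vector>

static std::size_t RoundUp(std::size_t v, std::size_t m) { return (v + m - 1) / m * m; }

int main()
{
    // os_iyx_osv64, dims ordered { X, Y, IFM, OFM }: OFM is padded to a multiple of 64.
    std::vector<std::size_t> dims = { 3, 3, 17, 50 };
    dims[3] = RoundUp(dims[3], 64);           // 50 -> 64, as in the os_iyx_osv64 case above
    assert(dims[3] == 64);

    // Dense pitch accumulation over the padded dims (innermost first).
    std::size_t pitch = 1;
    std::vector<std::size_t> pitches;
    for (std::size_t d : dims) { pitches.push_back(pitch); pitch *= d; }
    assert(pitch == 3 * 3 * 17 * 64);         // total padded element count
    return 0;
}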
@@ -17,6 +17,7 @@ #pragma once #include "common_types.h" +#include "common_tools.h" #include #include #include @@ -48,7 +49,9 @@ namespace kernel_selector brfyx, // 4D+batch winograd_2x3_s1_data, //winograd convolution input, F(2,3) -- filter 3x3 with stride 1 byxf_af32, // for MMAD convolution + byx8_f4, // for MMAD convolution fs_bs_yx_bsv4_fsv32, // for batched MMAD + b_fs_yx_fsv4, // reordering format for swizzled input for convolution using IMAD DataLayoutCount // NMBER OF ELEMENTS IN ENUM }; @@ -64,6 +67,8 @@ namespace kernel_selector iyxo, yxio, os_iyx_osv16, + os_iyx_osv32, + os_iyx_osv64, os_iyx_osv16_rotate_180, os_i_osv16, os_i_osv8__ai8, // TODO can we drop the alignment form layout name? @@ -79,8 +84,13 @@ namespace kernel_selector image_2d_weights_winograd_6x3_s1_fbxyb, // image 2d winograd convolution weights for fused kernel, F(2, 3) --filter 3x3 with stride 1 image_2d_weights_winograd_6x3_s1_xfbyb, // image 2d winograd convolution weights for fused kernel, F(2, 3) --filter 3x3 with stride 1 os_is_yx_isa8_osv8_isv4, // for MMAD convolution - is_o_yx_isv32, // for MMAD 1x1 convolutions - WeightsLayoutCount // NMBER OF ELEMENTS IN ENUM + os_is_yx_isa8_osv8_isv4_swizzled_by_4, // for MMAD convolution swizzled from ofm 0..7 to 0,4,8,12,16,20,24,28, 1,5... + is_o_yx_isv32, // for MMAD 1x1 convolutions + is_o32_yx_isv32_swizzled_by_4, // for MMAD 1x1 convolutions swizzled from ofm 0..7 to 0,4,8,12,16,20,24,28, 1,5... + os_is_y_x8_osv8_isv4, // for MMAD convolutions + bf_lyx_yx, // local convolution + os_is_yx_osv16_isv4, // swizzled weights for convolution using IMAD + WeightsLayoutCount // NMBER OF ELEMENTS IN ENUM }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -126,6 +136,8 @@ namespace kernel_selector Y = 1, IFM = 2, OFM = 3, + LX = 4, + LY = 5, }; inline bool SimpleLayout(WeightsLayout l) @@ -495,6 +507,8 @@ namespace kernel_selector Dim Y() const { return Extract(layout, WeightsChannelName::Y, dims); } Dim IFM() const { return Extract(layout, WeightsChannelName::IFM, dims); } Dim OFM() const { return Extract(layout, WeightsChannelName::OFM, dims); } + Dim LX() const { return Extract(layout, WeightsChannelName::LX, dims); } + Dim LY() const { return Extract(layout, WeightsChannelName::LY, dims); } static inline Dim Extract(WeightsLayout l, WeightsChannelName channel, const NDims& d) { @@ -512,7 +526,7 @@ namespace kernel_selector } private: static NDims GetSimpleDims(const std::vector& d, WeightsLayout l); - static std::array, WeightsLayout::WeightsLayoutCount> weightsChannelArray; + static std::array, WeightsLayout::WeightsLayoutCount> weightsChannelArray; }; } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_base.cpp index 358b66d..caca728 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_base.cpp @@ -53,7 +53,7 @@ namespace kernel_selector const auto& inputNlParams = params.inputActivationParams; jit.AddConstants({ - MakeJitConstant("PARAMS_NUM", GetActivationAdditionalParamsNumber(params.activationFunc)), + MakeJitConstant("PARAMS_NUM", GetActivationAdditionalParamsNumber(params.activation.function)), }); if 
(!inputNlParams.empty()) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.h index e2c6092..51d1f90 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.h @@ -28,12 +28,12 @@ namespace kernel_selector virtual ~ActivationKernelOpt() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; static const int NUM_COLS_WI = 4; virtual DispatchData SetDefault(const activation_params& arg) const override; virtual bool Validate(const Params& p, const optional_params& o) const override; virtual JitConstants GetJitConstants(const activation_params& params, DispatchData kd) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.h index c07a449..7dcccfa 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.h @@ -27,6 +27,8 @@ namespace kernel_selector virtual ~ActivationKernelRef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_tutorial.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_tutorial.h index 8fef335..c479a8e 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_tutorial.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_tutorial.h @@ -38,13 +38,13 @@ namespace kernel_selector { virtual ~ActivationKernel_Tutorial() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; #ifdef ADVANCED_TUTORIAL virtual DispatchData SetDefault(const activation_params& arg) const override; virtual bool Validate(const Params& p, const optional_params& o) const override; virtual JitConstants GetJitConstants(const activation_params& params, DispatchData) const override; #endif }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/arg_max_min/arg_max_min_kernel_axis.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/arg_max_min/arg_max_min_kernel_axis.h index a3b2623..a3bd109 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/arg_max_min/arg_max_min_kernel_axis.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/arg_max_min/arg_max_min_kernel_axis.h @@ -27,6 +27,8 @@ 
namespace kernel_selector virtual ~ArgMaxMinKernelAxis() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/arg_max_min/arg_max_min_kernel_gpu_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/arg_max_min/arg_max_min_kernel_gpu_ref.h index c492e77..080b912 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/arg_max_min/arg_max_min_kernel_gpu_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/arg_max_min/arg_max_min_kernel_gpu_ref.h @@ -27,6 +27,8 @@ namespace kernel_selector virtual ~ArgMaxMinKernelGPURef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/arg_max_min/arg_max_min_kernel_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/arg_max_min/arg_max_min_kernel_opt.h index 0b12923..5f06f44 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/arg_max_min/arg_max_min_kernel_opt.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/arg_max_min/arg_max_min_kernel_opt.h @@ -27,6 +27,8 @@ namespace kernel_selector virtual ~ArgMaxMinKernelOpt() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/average_unpooling/average_unpooling_kernel_gpu_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/average_unpooling/average_unpooling_kernel_gpu_ref.h index a8ce320..a632c56 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/average_unpooling/average_unpooling_kernel_gpu_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/average_unpooling/average_unpooling_kernel_gpu_ref.h @@ -27,6 +27,8 @@ namespace kernel_selector virtual ~AverageUnpoolingKernelGPURef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.cpp index ebf881f..064d8a5 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.cpp @@ -36,6 +36,10 @@ namespace kernel_selector jit.AddConstant(MakeJitConstant("EPSILON", params.batchNormParams.epsilon)); if (params.batchNormParams.with_inv_var) jit.AddConstant(MakeJitConstant("FORWARD", 1)); + if (params.batchNormParams.with_scale_shift) + jit.AddConstant(MakeJitConstant("SCALE_SHIFT", 1)); + if (params.batchNormParams.with_mean_var_out) + 
jit.AddConstant(MakeJitConstant("MEAN_VAR_OUT", 1)); return jit; } @@ -79,7 +83,7 @@ namespace kernel_selector auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; - int inputs_num = 1 + orgParams.batchNormParams.with_inv_var; + int inputs_num = 1 + orgParams.batchNormParams.with_inv_var + 2*orgParams.batchNormParams.with_scale_shift + 2 * orgParams.batchNormParams.with_mean_var_out; FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, "", false, false, inputs_num); kd.estimatedTime = estimatedTime; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.h index 30855ef..ebc4c05 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.h @@ -32,6 +32,8 @@ namespace kernel_selector { float epsilon; bool with_inv_var; + bool with_scale_shift; + bool with_mean_var_out = false; }; DedicatedParams batchNormParams; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.h index ccf8008..6a7da91 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.h @@ -27,6 +27,8 @@ namespace kernel_selector virtual ~BatchNormKernelRef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.h index 5c36e1c..5c85871 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.h @@ -27,6 +27,8 @@ namespace kernel_selector virtual ~BatchNormGradKernelRef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/border/border_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/border/border_kernel_base.h index fbce8a6..4e778a4 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/border/border_kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/border/border_kernel_base.h @@ -32,7 +32,9 @@ namespace kernel_selector border_params() - : base_params(KernelType::BORDER) + : base_params(KernelType::BORDER), + b_type(BorderType::CONSTANT), + border_value(0.0f) { } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/border/border_kernel_ref.cpp 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/border/border_kernel_ref.cpp index 9e42901..3e51384 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/border/border_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/border/border_kernel_ref.cpp @@ -39,9 +39,9 @@ namespace kernel_selector k.EnableOutputLayout(DataLayout::yxfb); k.EnableOutputLayout(DataLayout::byxf); - k.EnableBatching(); k.EnableTensorOffset(); k.EnableTensorPitches(); + k.EnableBatching(); return k; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/border/border_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/border/border_kernel_ref.h index 0862ed1..f5f4c81 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/border/border_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/border/border_kernel_ref.h @@ -25,6 +25,8 @@ namespace kernel_selector BorderKernelRef() : BorderKernelBase("border_gpu_ref") {} KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: ParamsKey GetSupportedKey() const override; }; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_base.cpp index 3d3b2f4..795871b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_base.cpp @@ -23,6 +23,11 @@ namespace kernel_selector JitConstants BroadcastKernelBase::GetJitConstants(const broadcast_params& params) { JitConstants jit = MakeBaseParamsJitConstants(params); + + jit.AddConstants({ + MakeJitConstant("BROADCAST_ORDER", params.input_order) + }); + return jit; } @@ -63,7 +68,6 @@ namespace kernel_selector auto& kernel = k_data.kernels[0]; FillCLKernelData(kernel, run_info, params.engineInfo, kernelName, jit, entry_point); - k_data.estimatedTime = estimated_time; return {k_data}; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_base.h index cf4865e..f13192a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_base.h @@ -29,6 +29,8 @@ namespace kernel_selector : base_params(KernelType::BROADCAST) { } + std::vector input_order; + }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_ref.cpp index 0be42a5..f7fe764 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_ref.cpp @@ -25,20 +25,22 @@ namespace kernel_selector k.EnableInputDataType(Datatype::F32); 
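The new BROADCAST_ORDER constant passes the kernel a per-axis input order alongside the widened data-type support. The exact semantics live in the broadcast_gpu_ref OpenCL source, but a minimal host-side sketch of one plausible mapping (illustrative names only, not the real kernel; singleton input axes broadcast by clamping their coordinate to 0) looks like this:

#include <array>
#include <cstddef>
#include <iostream>

// order[axis] names which output coordinate feeds input axis `axis`;
// axes whose input extent is 1 are broadcast by reading index 0.
static std::size_t broadcast_src_index(const std::array<std::size_t, 4>& out_coord,
                                       const std::array<std::size_t, 4>& in_dims,
                                       const std::array<std::size_t, 4>& order)
{
    std::array<std::size_t, 4> in_coord{};
    for (std::size_t axis = 0; axis < 4; ++axis)
    {
        const std::size_t c = out_coord[order[axis]];
        in_coord[axis] = (in_dims[axis] == 1) ? 0 : c;
    }
    // Linearize assuming a dense bfyx layout (pitches and offsets omitted).
    return ((in_coord[0] * in_dims[1] + in_coord[1]) * in_dims[2] + in_coord[2]) * in_dims[3] + in_coord[3];
}

int main()
{
    // 1x3x1x1 input broadcast to a larger output: every (b, y, x) reads feature f = 2.
    std::cout << broadcast_src_index({{1, 2, 1, 1}}, {{1, 3, 1, 1}}, {{0, 1, 2, 3}}) << "\n"; // prints 2
}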
k.EnableInputDataType(Datatype::INT8); k.EnableInputDataType(Datatype::UINT8); + k.EnableInputDataType(Datatype::INT32); + k.EnableInputDataType(Datatype::INT64); k.EnableOutputDataType(Datatype::F32); k.EnableOutputDataType(Datatype::F16); k.EnableOutputDataType(Datatype::INT8); k.EnableOutputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::INT32); + k.EnableOutputDataType(Datatype::INT64); k.EnableInputLayout(DataLayout::bfyx); - k.EnableInputLayout(DataLayout::yxfb); - k.EnableInputLayout(DataLayout::byxf); k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::byxf); + k.EnableTensorOffset(); + k.EnableTensorPitches(); k.EnableBatching(); return k; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_ref.h index ccca397..3f6fee8 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_ref.h @@ -25,6 +25,8 @@ namespace kernel_selector BroadcastKernelRef() : BroadcastKernelBase("broadcast_gpu_ref") {} KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: ParamsKey GetSupportedKey() const override; }; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_base.h index 70cba27..c9e577a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_base.h @@ -46,6 +46,8 @@ namespace kernel_selector concatenation_optional_params() : optional_params(KernelType::CONCATENATION) {} bool kernelPerInput = true; + + protected: virtual ParamsKey GetSupportedKey() const { ParamsKey k = optional_params::GetSupportedKey(); @@ -80,4 +82,4 @@ namespace kernel_selector virtual DispatchData SetDefault(const concatenation_params& params) const; KernelsData GetCommonKernelsData(const Params& params, const optional_params&) const; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_depth_bfyx_no_pitch.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_depth_bfyx_no_pitch.h index 2b40366..f21e56b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_depth_bfyx_no_pitch.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_depth_bfyx_no_pitch.h @@ -27,8 +27,10 @@ namespace kernel_selector { virtual ~ConcatenationKernel_depth_bfyx_no_pitch() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; virtual DispatchData SetDefault(const concatenation_params& params) const override; virtual bool Validate(const Params& p, const optional_params& o) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; }; -} \ No 
newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_ref.h index 2b7379c..3020b17 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_ref.h @@ -27,8 +27,8 @@ namespace kernel_selector { virtual ~ConcatenationKernelRef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; JitConstants GetJitConstants(const concatenation_params& params) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.cpp new file mode 100644 index 0000000..d5ac28e --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.cpp @@ -0,0 +1,138 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
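A pattern worth noting across the headers above: GetSupportedKey() moves from the public interface into a protected section, so callers can only query capabilities through the base class rather than on a concrete kernel. A minimal sketch of that shape (illustrative types, not the real kernel_selector classes):

#include <iostream>

struct ParamsKeySketch { bool fp16 = false; };

class KernelBaseSketch
{
public:
    virtual ~KernelBaseSketch() = default;
    // The public query goes through the base class...
    bool Supports(bool wants_fp16) const { return !wants_fp16 || GetSupportedKey().fp16; }
protected:
    // ...while the key itself is no longer publicly reachable.
    virtual ParamsKeySketch GetSupportedKey() const = 0;
};

class RefKernelSketch : public KernelBaseSketch
{
protected:
    ParamsKeySketch GetSupportedKey() const override { return {true}; }
};

int main()
{
    RefKernelSketch k;
    std::cout << std::boolalpha << k.Supports(true) << "\n"; // true
}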
+
+
+#include "contract_kernel_base.h"
+
+#include "kernel_selector_utils.h"
+
+
+namespace kernel_selector
+{
+    JitConstants ContractKernelBase::GetJitConstants(const contract_params& params)
+    {
+        JitConstants jit = MakeBaseParamsJitConstants(params);
+
+        const size_t no_dim_flag = 6;
+        std::vector<size_t> output_dims(4, no_dim_flag);
+        int out_dim = 2;
+        for (int i = 3; i >= 0; --i)
+        {
+            if (std::find(params.reduction_axes.begin(), params.reduction_axes.end(), i) == params.reduction_axes.end())
+                output_dims.at(i) = out_dim--;
+        }
+
+        if (output_dims[3] != no_dim_flag)
+            jit.AddConstants({
+                MakeJitConstant("DIM_X", output_dims.at(3))
+            });
+        if (output_dims[2] != no_dim_flag)
+            jit.AddConstants({
+                MakeJitConstant("DIM_Y", output_dims.at(2))
+            });
+        if (output_dims[1] != no_dim_flag)
+            jit.AddConstants({
+                MakeJitConstant("DIM_F", output_dims.at(1))
+            });
+        if (output_dims[0] != no_dim_flag)
+            jit.AddConstants({
+                MakeJitConstant("DIM_B", output_dims.at(0))
+            });
+
+        jit.AddConstants({
+            MakeJitConstant("REDUCE_X", output_dims.at(3) == no_dim_flag),
+            MakeJitConstant("REDUCE_Y", output_dims.at(2) == no_dim_flag),
+            MakeJitConstant("REDUCE_F", output_dims.at(1) == no_dim_flag),
+            MakeJitConstant("REDUCE_B", output_dims.at(0) == no_dim_flag)
+        });
+
+        switch (params.mode)
+        {
+        case ContractMode::SUM:
+            jit.AddConstants({
+                MakeJitConstant("REDUCE_SEED", "0"),
+                MakeJitConstant("REDUCE_OPERATION(a, b)", "a + b")
+            });
+            break;
+        case ContractMode::PRODUCT:
+            jit.AddConstants({
+                MakeJitConstant("REDUCE_SEED", "1"),
+                MakeJitConstant("REDUCE_OPERATION(a, b)", "a * b")
+            });
+            break;
+        case ContractMode::ALL:
+            jit.AddConstants({
+                MakeJitConstant("REDUCE_SEED", "1"),
+                MakeJitConstant("REDUCE_OPERATION(a, b)", "a && b")
+            });
+            break;
+        case ContractMode::ANY:
+            jit.AddConstants({
+                MakeJitConstant("REDUCE_SEED", "0"),
+                MakeJitConstant("REDUCE_OPERATION(a, b)", "a || b")
+            });
+            break;
+        case ContractMode::MAX:
+            jit.AddConstants({
+                MakeJitConstant("REDUCE_SEED", "UNIT_VAL_MIN"),
+                MakeJitConstant("REDUCE_OPERATION(a, b)", "UNIT_MAX_FUNC(a,b)")
+            });
+            break;
+        }
+
+        return jit;
+    }
+
+    ContractKernelBase::DispatchData ContractKernelBase::SetDefault(const contract_params& params)
+    {
+        const auto& output = params.output;
+
+        DispatchData kd;
+
+        kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
+
+        std::vector<size_t> global{ output.Feature().v, output.Y().v, output.X().v };
+        const auto& local = GetOptimalLocalWorkGroupSizes(global);
+
+        kd.gws0 = global[0];
+        kd.gws1 = global[1];
+        kd.gws2 = global[2];
+
+        kd.lws0 = local[0];
+        kd.lws1 = local[1];
+        kd.lws2 = local[2];
+
+        return kd;
+    }
+
+    KernelsData ContractKernelBase::GetCommonKernelsData(const Params& params, const optional_params& options, float estimated_time) const
+    {
+        assert(params.GetType() == KernelType::CONTRACT);
+
+        const auto& prim_params = static_cast<const contract_params&>(params); // NOLINT(cppcoreguidelines-pro-type-static-cast-downcast)
+
+        auto run_info = SetDefault(prim_params);
+        KernelData k_data = KernelData::Default<contract_params>(params);
+
+        auto cldnn_jit = GetJitConstants(prim_params);
+        auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, options);
+        auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
+
+        auto& kernel = k_data.kernels[0];
+        FillCLKernelData(kernel, run_info, params.engineInfo, kernelName, jit, entry_point);
+        k_data.estimatedTime = estimated_time;
+
+        return{ k_data };
+    }
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.h
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.h new file mode 100644 index 0000000..22e308c --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.h @@ -0,0 +1,63 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "common_kernel_base.h" +#include "kernel_selector_params.h" + + +namespace kernel_selector +{ + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // contract_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct contract_params : public base_params + { + contract_params() + : base_params(KernelType::CONTRACT) + { + } + ContractMode mode; + std::vector reduction_axes; + + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // contract_optional_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct contract_optional_params : optional_params + { + contract_optional_params() + : optional_params(KernelType::CONTRACT) + { + } + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // ContractKernelBase + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + class ContractKernelBase : public common_kernel_base + { + public: + using common_kernel_base::common_kernel_base; + + using DispatchData = CommonDispatchData; + + protected: + static JitConstants GetJitConstants(const contract_params& params); + static DispatchData SetDefault(const contract_params& params); + KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimated_time) const; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.cpp new file mode 100644 index 0000000..ba42e28 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.cpp @@ -0,0 +1,53 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "contract_kernel_ref.h" + + +namespace kernel_selector +{ + ParamsKey ContractKernelRef::GetSupportedKey() const + { + ParamsKey k; + + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableInputDataType(Datatype::INT8); + k.EnableInputDataType(Datatype::UINT8); + k.EnableInputDataType(Datatype::INT32); + k.EnableInputDataType(Datatype::INT64); + + k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::INT32); + k.EnableOutputDataType(Datatype::INT64); + + k.EnableInputLayout(DataLayout::bfyx); + + k.EnableOutputLayout(DataLayout::bfyx); + + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + + return k; + } + + KernelsData ContractKernelRef::GetKernelsData(const Params& params, const optional_params& options) const + { + return GetCommonKernelsData(params, options, FORCE_PRIORITY_9); + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.h new file mode 100644 index 0000000..eb8a6cc --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.h @@ -0,0 +1,30 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "contract_kernel_base.h" + + +namespace kernel_selector +{ + class ContractKernelRef : public ContractKernelBase + { + public: + ContractKernelRef() : ContractKernelBase("contract_ref") {} + + KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + ParamsKey GetSupportedKey() const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.cpp new file mode 100644 index 0000000..06d7569 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.cpp @@ -0,0 +1,30 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
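The REDUCE_SEED / REDUCE_OPERATION constants that ContractKernelBase::GetJitConstants emits pair an identity element with a fold operation for each ContractMode. A host-side C++ analogue of the fold the generated OpenCL performs per output element over the reduced axes (assumed semantics, shown here only to make the seed/operation pairs concrete):

#include <algorithm>
#include <functional>
#include <iostream>
#include <limits>
#include <vector>

// Fold `values` starting from the identity element `seed`, mirroring
// REDUCE_SEED and REDUCE_OPERATION(a, b) from the JIT constants above.
static float contract_reduce(const std::vector<float>& values, float seed,
                             const std::function<float(float, float)>& op)
{
    float acc = seed;
    for (float v : values)
        acc = op(acc, v);
    return acc;
}

int main()
{
    const std::vector<float> axis_values{1.f, 2.f, 3.f};
    std::cout << contract_reduce(axis_values, 0.f, std::plus<float>{}) << "\n";       // SUM: seed 0, a + b -> 6
    std::cout << contract_reduce(axis_values, 1.f, std::multiplies<float>{}) << "\n"; // PRODUCT: seed 1, a * b -> 6
    std::cout << contract_reduce(axis_values, std::numeric_limits<float>::lowest(),
                                 [](float a, float b) { return std::max(a, b); })
              << "\n";                                                                // MAX: seed UNIT_VAL_MIN -> 3
}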
+ + +#include "contract_kernel_selector.h" +#include "contract_kernel_ref.h" + +namespace kernel_selector +{ + contract_kernel_selector::contract_kernel_selector() + { + Attach(); + } + + KernelsData contract_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const + { + return GetNaiveBestKernel(params, options, KernelType::CONTRACT); + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.h new file mode 100644 index 0000000..3c9e87a --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.h @@ -0,0 +1,34 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "kernel_selector.h" + + +namespace kernel_selector +{ + class contract_kernel_selector : public kernel_selector_base + { + public: + static contract_kernel_selector &Instance() { + static contract_kernel_selector instance; + return instance; + } + + contract_kernel_selector(); + + KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_1x1_gemm_MMAD.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_1x1_gemm_MMAD.cpp index ab76906..67f0a04 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_1x1_gemm_MMAD.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_1x1_gemm_MMAD.cpp @@ -15,7 +15,6 @@ */ #include "convolution_kernel_1x1_gemm_MMAD.h" -#include "kernel_selector_utils.h" namespace kernel_selector { @@ -82,7 +81,7 @@ namespace kernel_selector { const auto of_maps = arg.output.Feature().v; const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size); - runInfo.effiency = FORCE_PRIORITY_1; + runInfo.effiency = FORCE_PRIORITY_2; runInfo.gws0 = RoundUp(arg.output.X().v * arg.output.Y().v, 8) / 8; runInfo.gws1 = of_threads_per_batch * arg.output.Batch().v; @@ -111,6 +110,6 @@ namespace kernel_selector { KernelsData ConvolutionKernel_1x1_gemm_MMAD::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options); + return GetTunedKernelsDataByIndex(params, options); } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_1x1_gemm_MMAD.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_1x1_gemm_MMAD.h index 5c664f6..7596f86 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_1x1_gemm_MMAD.h 
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_1x1_gemm_MMAD.h @@ -28,9 +28,9 @@ namespace kernel_selector { virtual ~ConvolutionKernel_1x1_gemm_MMAD() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; bool Validate(const Params& p, const optional_params& o) const override; @@ -41,4 +41,4 @@ namespace kernel_selector { }; } }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_MMAD.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_MMAD.cpp index 0963f0b..f4a3863 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_MMAD.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_MMAD.cpp @@ -15,7 +15,6 @@ */ #include "convolution_kernel_MMAD.h" -#include "kernel_selector_utils.h" namespace kernel_selector { @@ -50,7 +49,7 @@ namespace kernel_selector { const auto of_maps = arg.output.Feature().v; const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size); - runInfo.effiency = FORCE_PRIORITY_3; + runInfo.effiency = FORCE_PRIORITY_4; runInfo.gws0 = arg.output.X().v; runInfo.gws1 = arg.output.Y().v; @@ -79,9 +78,9 @@ namespace kernel_selector { KernelsData ConvolutionKernel_MMAD::GetKernelsData(const Params& params, const optional_params& options) const { - KernelsData kd = GetCommonKernelsData(params, options); + KernelsData kd = GetTunedKernelsDataByIndex(params, options); if(!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_3; + kd[0].estimatedTime = FORCE_PRIORITY_4; return kd; } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_MMAD.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_MMAD.h index 824fcf7..1b2bbab 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_MMAD.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_MMAD.h @@ -28,9 +28,9 @@ namespace kernel_selector { virtual ~ConvolutionKernel_MMAD() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; virtual std::vector GetSupportedWeightLayouts(const convolution_params&) const override @@ -40,4 +40,4 @@ namespace kernel_selector { }; } }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_MMAD_blocks.cpp 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_MMAD_blocks.cpp index dd2a03c..6c892e8 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_MMAD_blocks.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_MMAD_blocks.cpp @@ -15,7 +15,6 @@ */ #include "convolution_kernel_MMAD_blocks.h" -#include "kernel_selector_utils.h" namespace kernel_selector { @@ -25,21 +24,21 @@ namespace kernel_selector std::vector blockWidthSizes = { 1,2,4,5,6,8,10,12,14,16,18,20,22,24,26,28,30,32 }; std::vector blockHeightSizes = { 1,2,3,4,5,6,7,8,9,10 }; std::vector prefetchSizes = { 1,2,3,4,5,6,8,10 }; - std::vector executionModes = { /*AGE_BASED ,*/ ROUND_ROBIN }; + std::vector executionModes = ConvolutionKernelBase::autoTuneOptions; const size_t maxBlockSize = 240; - - for (auto blockWidth : blockWidthSizes) + for (auto executionMode : executionModes) { - for (auto blockHeight : blockHeightSizes) + for (auto blockWidth : blockWidthSizes) { - for (auto prefetch : prefetchSizes) + for (auto blockHeight : blockHeightSizes) { - for (auto executionMode : executionModes) + for (auto prefetch : prefetchSizes) { if (blockWidth * blockHeight <= maxBlockSize) { autoTuneOptions.emplace_back(AutoTuneOption{ blockWidth, blockHeight, prefetch, executionMode }); } + } } } @@ -110,7 +109,7 @@ namespace kernel_selector // Sub-group size used by "convolution_gpu_mmad_blocks" kernel. constexpr size_t sub_group_size = 16; - AutoTuneOption option = { 0, 0, 0, ROUND_ROBIN }; + AutoTuneOption option = { 0, 0, 0, DEFAULT }; const convolution_params& cp = static_cast(p); @@ -255,14 +254,9 @@ namespace kernel_selector return jit; } - KernelsData ConvolutionKernel_MMAD_blocks::GetTunedKernelsDataByIndex(const Params& params, const optional_params& options, const int autoTuneIndex) const - { - return GetCommonKernelsData(params, options, GetAutoTuneOptions(params, autoTuneIndex).exeMode, autoTuneIndex); - } - KernelsData ConvolutionKernel_MMAD_blocks::GetKernelsData(const Params& params, const optional_params& options) const { - KernelsData kd = GetCommonKernelsData(params, options); + KernelsData kd = GetTunedKernelsDataByIndex(params, options); if (!kd.empty()) kd[0].estimatedTime = FORCE_PRIORITY_2; @@ -287,9 +281,6 @@ namespace kernel_selector } } - KernelsData defaultKds = GetKernelsData(params, options); - res.insert(res.end(), defaultKds.begin(), defaultKds.end()); - return res; } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_MMAD_blocks.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_MMAD_blocks.h index 03137b4..a495613 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_MMAD_blocks.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_MMAD_blocks.h @@ -29,10 +29,9 @@ namespace kernel_selector { virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; virtual KernelsData GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const override; - virtual KernelsData GetTunedKernelsDataByIndex(const Params& params, const optional_params& options, int autoTuneIndex) const override; - virtual ParamsKey GetSupportedKey() const 
override; protected: + virtual ParamsKey GetSupportedKey() const override; bool Validate(const Params& p, const optional_params& o) const override; JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; @@ -54,4 +53,4 @@ namespace kernel_selector { AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const; std::vector autoTuneOptions = {}; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_base.cpp index 86bfe93..d40c8ab 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_base.cpp @@ -61,10 +61,11 @@ namespace kernel_selector MakeJitConstant("STRIDE", params.stride), MakeJitConstant("PADDING", params.padding), MakeJitConstant("DILATION", params.dilation), - MakeJitConstant("FILTER_ARRAY_NUM", params.split), + MakeJitConstant("FILTER_ARRAY_NUM", params.split * params.groups), MakeJitConstant("INPUT0_OFFSET_WITH_PADDING", input_offset_with_padding), - MakeJitConstant("DEPTHWISE_SEPARABLE_OPT", params.depthwiseSeparableOpt), + MakeJitConstant("DEPTHWISE_SEPARABLE_OPT", params.depthwise_separable_opt), MakeJitConstant("QUANTIZATION_TERM", params.int8_quantization), + MakeJitConstant("GROUPED", (params.groups > 1) ? 1 : 0), }); if (params.int8_quantization) @@ -82,6 +83,11 @@ namespace kernel_selector mem_consts.AddConstants({ MakeJitConstant("O_QF", params.output_quantization_factor) }); } + if (params.local_convolution) + { + mem_consts.AddConstants({ MakeJitConstant("LOCAL_CONVOLUTION", params.local_convolution) }); + } + std::vector unrollLoopParams{ params.filterSize.x, params.filterSize.y, @@ -249,4 +255,123 @@ namespace kernel_selector return{ kd }; } + + bool CheckConvolutionPaddedInputDesc(const convolution_params& params, const DataTensor& reqDesc) + { + assert(params.inputs.size() == 1); + + bool properPadding = + reqDesc.X().pad.before <= params.inputs[0].X().pad.before && + reqDesc.Y().pad.before <= params.inputs[0].Y().pad.before && + reqDesc.Feature().pad.before <= params.inputs[0].Feature().pad.before && + reqDesc.Batch().pad.before <= params.inputs[0].Batch().pad.before; + + properPadding &= + reqDesc.X().pad.after <= params.inputs[0].X().pad.after && + reqDesc.Y().pad.after <= params.inputs[0].Y().pad.after && + reqDesc.Feature().pad.after <= params.inputs[0].Feature().pad.after && + reqDesc.Batch().pad.after <= params.inputs[0].Batch().pad.after; + + properPadding &= ((params.padding.x == 0 && params.padding.y == 0) || params.inputs[0].GetPaddedVal() == 0.f); + + return properPadding; + } + + static DataTensor GetConvolutionBFYXPaddedTensor(const convolution_params& cp) + { + assert(cp.inputs.size() == 1); + assert(cp.inputs[0].GetDims().size() == 4U); + + DataTensor t = cp.inputs[0]; + std::vector pad{ { 0,0 },{ 0,0 },{ 0,0 },{ 0,0 } }; + + pad[0].before = cp.padding.x; + pad[1].before = cp.padding.y; + + const auto inputLimitX = (cp.output.X().v - 1) * cp.stride.x + (cp.filterSize.x - 1) * cp.dilation.x + 1; + const auto inputLimitY = (cp.output.Y().v - 1) * cp.stride.y + (cp.filterSize.y - 1) * cp.dilation.y + 1; + + pad[0].after 
= (size_t)std::max((int)inputLimitX - (int)t.X().v - (int)pad[0].before, (int)0); + pad[1].after = (size_t)std::max((int)inputLimitY - (int)t.Y().v - (int)pad[1].before, (int)0); + + Tensor::NDims dims(4); + const Tensor::NDims& orgDims = cp.inputs[0].GetDims(); + size_t pitch = 1; + for (size_t i = 0; i < dims.size(); i++) + { + dims[i].pad = pad[i]; + dims[i].v = orgDims[i].v; + dims[i].pitch = pitch; + pitch *= dims[i].LogicalDimPadded(); + } + + return{ dims, t.GetDType(), t.GetLayout() }; + } + + bool CovolutionCheckInput(const Params& p, const optional_params& o) + { + const convolution_params& params = static_cast(p); + const convolution_optional_params& optParams = static_cast(o); + + const auto req_input = GetConvolutionBFYXPaddedTensor(params); + const bool bProperInputDesc = CheckConvolutionPaddedInputDesc(params, req_input); + const bool bInputPadded = optParams.allowInputReordering || bProperInputDesc; + + if (!bInputPadded) + { + return false; + } + + return true; + } + + bool CovolutionUpdateInputParams(convolution_params& params) + { + const auto req_input = GetConvolutionBFYXPaddedTensor(params); + const bool bProperInputDesc = CheckConvolutionPaddedInputDesc(params, req_input); + + if (!bProperInputDesc) + { + params.inputs[0] = req_input; + return true; + } + + return false; + } + + std::string ConvolutionKernelBase::GetAutoTuneOptions(int autoTuneIndex) const + { + if ((autoTuneIndex >= 0) && (autoTuneIndex < (int)autoTuneOptions.size())) + { + return autoTuneOptions[autoTuneIndex]; + } + + return DEFAULT; + } + + KernelsData ConvolutionKernelBase::GetTunedKernelsDataByIndex(const Params& params, const optional_params& options, const int autoTuneIndex) const + { + return GetCommonKernelsData(params, options, GetAutoTuneOptions(autoTuneIndex), autoTuneIndex); + } + + KernelsData ConvolutionKernelBase::GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const + { + if (!Validate(params, options)) + { + return{}; + } + + KernelsData res = {}; + + for (size_t i = 0; i < autoTuneOptions.size(); i++) + { + KernelsData kd = GetTunedKernelsDataByIndex(params, options, (int)i); + if (!kd.empty()) + { + res.emplace_back(kd[0]); + } + } + + return res; + } } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_base.h index 4e7c82f..d6dc476 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_base.h @@ -56,6 +56,11 @@ namespace kernel_selector GEMMStyle gemmStyle; }; }; + + std::string GetAutoTuneOptions(int autoTuneIndex) const; + std::vector autoTuneOptions = { DEFAULT, NO_PRERA_SCH, AGE_BASED }; + virtual KernelsData GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const override; + virtual KernelsData GetTunedKernelsDataByIndex(const Params& params, const optional_params& options, int autoTuneIndex = -1) const override; protected: virtual std::vector GetSupportedWeightLayouts(const convolution_params&) const = 0; @@ -66,6 +71,11 @@ namespace kernel_selector virtual DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const; static bool CheckWorkGroups(const DispatchData&); static bool CheckPitchForSplitOnly(const convolution_params& params); - 
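The inputLimit arithmetic in GetConvolutionBFYXPaddedTensor computes how far the dilated filter reaches past the last output position: the last output element starts at (out - 1) * stride and the dilated filter spans (filter - 1) * dilation + 1 inputs. A small standalone check of that formula with made-up sizes:

#include <algorithm>
#include <cstdio>

// Returns the extra trailing padding required so the dilated filter window
// of the last output element stays inside the (padded) input.
static int required_after_pad(int out, int stride, int filter, int dilation,
                              int input, int before_pad)
{
    const int input_limit = (out - 1) * stride + (filter - 1) * dilation + 1;
    return std::max(input_limit - input - before_pad, 0);
}

int main()
{
    // 3x3 conv, stride 1, dilation 1, 224 -> 224 with pad.before = 1:
    // input_limit = 223 + 2 + 1 = 226, so one more padded element is needed after.
    std::printf("%d\n", required_after_pad(224, 1, 3, 1, 224, 1)); // prints 1
}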
KernelsData GetCommonKernelsData(const Params& params, const optional_params& options, const std::string exeMode = ROUND_ROBIN, int autoTuneIndex = -1) const; + KernelsData GetCommonKernelsData(const Params& params, const optional_params& options, const std::string exeMode = DEFAULT, int autoTuneIndex = -1) const; }; + + bool CovolutionCheckInput(const Params& p, const optional_params& o); + bool CheckConvolutionPaddedInputDesc(const convolution_params& params, const DataTensor& reqDesc); + bool CovolutionUpdateInputParams(convolution_params& params); + } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1.cpp index cba3ba3..e0fab58 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1.cpp @@ -15,7 +15,6 @@ */ #include "convolution_kernel_bfyx_1x1.h" -#include "kernel_selector_utils.h" namespace kernel_selector { @@ -107,6 +106,6 @@ namespace kernel_selector { KernelsData ConvolutionKernel_bfyx_1x1::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options); + return GetTunedKernelsDataByIndex(params, options); } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1.h index 7ea7456..0f11ddd 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1.h @@ -29,9 +29,9 @@ namespace kernel_selector { virtual ~ConvolutionKernel_bfyx_1x1() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; virtual std::vector GetSupportedWeightLayouts(const convolution_params&) const override { return{ @@ -42,4 +42,4 @@ namespace kernel_selector { DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_gemm_buf.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_gemm_buf.cpp index a34add2..b1c15ae 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_gemm_buf.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_gemm_buf.cpp @@ -15,7 +15,6 @@ */ #include "convolution_kernel_bfyx_1x1_gemm_buf.h" -#include "kernel_selector_utils.h" namespace kernel_selector { @@ -112,6 +111,6 @@ namespace kernel_selector { KernelsData ConvolutionKernel_bfyx_1x1_gemm_buf::GetKernelsData(const Params& params, const optional_params& options) const { - 
return GetCommonKernelsData(params, options); + return GetTunedKernelsDataByIndex(params, options); } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_gemm_buf.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_gemm_buf.h index 61eb826..55ecfbd 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_gemm_buf.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_gemm_buf.h @@ -29,9 +29,9 @@ namespace kernel_selector { virtual ~ConvolutionKernel_bfyx_1x1_gemm_buf() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; virtual std::vector GetSupportedWeightLayouts(const convolution_params&) const override { return{ @@ -42,4 +42,4 @@ namespace kernel_selector { DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_opt.cpp new file mode 100644 index 0000000..1c08d2a --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_opt.cpp @@ -0,0 +1,173 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "convolution_kernel_bfyx_1x1_opt.h" + +namespace kernel_selector +{ + + convolution_kernel_bfyx_1x1_opt::convolution_kernel_bfyx_1x1_opt() : ConvolutionKernelBase("convolution_gpu_bfyx_1x1_opt") + { + } + + ParamsKey convolution_kernel_bfyx_1x1_opt::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::F32); + k.EnableInputWeightsType(WeightsType::F32); + k.EnableOutputDataType(Datatype::F32); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableSubGroup(); + k.EnableBiasPerFeature(); + k.EnableBiasPerOutput(); + k.EnableNonBiasTerm(); + k.EnableBatching(); + return k; + } + + struct block_params + { + int32_t out_width; + int32_t out_height; + int32_t out_depth; + }; + + static block_params get_out_block_size(const convolution_params& p) + { + auto out_depth = 8; + + if (p.output.X().v == 7) + { + auto gws0 = p.output.X().v / 7; + auto gws1 = p.output.Y().v / 1; + auto gws2 = 2*(p.output.Feature().v * p.output.Batch().v) / 8 ; // process 8 output channels per Workitem + + auto compute_units = p.engineInfo.computeUnitsCount; + auto total_threads = (gws0 * gws1 * gws2) / 64; + if (total_threads < compute_units) + { + out_depth /= 2; + total_threads *= 2; + } + if (total_threads < compute_units) + { + out_depth /= 2; + total_threads *= 2; + } + return { 7,1,out_depth }; + } + else if (p.output.X().v == 14) + return { 7,1,8 }; + else if (p.output.X().v == 28) + return { 7,2,4 }; + else if (p.output.X().v == 56) + return { 8,1,8 }; + + return { 1,1,1 }; + } + + + ConvolutionKernelBase::DispatchData convolution_kernel_bfyx_1x1_opt::SetDefault(const convolution_params& cp, int) const + { + DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp); + + constexpr size_t sub_group_size = 8; + + runInfo.effiency = FORCE_PRIORITY_3; + + auto block = get_out_block_size(cp); + + runInfo.gws0 = cp.output.X().v / block.out_width; + runInfo.gws1 = cp.output.Y().v / block.out_height; + runInfo.gws2 = 2*(cp.output.Feature().v * cp.output.Batch().v) / block.out_depth; // process 8 output channels per Workitem + + runInfo.lws0 = 1; + runInfo.lws1 = 1; + runInfo.lws2 = 2*sub_group_size; + + return runInfo; + } + + bool convolution_kernel_bfyx_1x1_opt::Validate(const Params& p, const optional_params& o) const + { + if (!ConvolutionKernelBase::Validate(p, o)) + { + return false; + } + const convolution_params& cp = static_cast(p); + + if (cp.stride.x != 1 || cp.stride.y != 1) + return false; + + if (cp.filterSize.x != 1 || cp.filterSize.y != 1) + return false; + + if (cp.output.Feature().v % 64 != 0) + return false; + + if (cp.padding.x != 0 || cp.padding.y != 0) + return false; + + // if block sizes are 1x1, then this algorithm is probably not the best + auto block = get_out_block_size(cp); + if (block.out_width == 1 && block.out_height == 1) + return false; + + if (cp.output.X().v % block.out_width != 0) + return false; + if (cp.output.Y().v % block.out_height != 0) + return false; + + return true; + } + + JitConstants convolution_kernel_bfyx_1x1_opt::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const + { + auto jit = Parent::GetJitConstants(params, runInfo); + + auto block = get_out_block_size(params); + jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block.out_width)); + jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block.out_height)); + jit.AddConstant(MakeJitConstant("OUT_BLOCK_DEPTH", block.out_depth)); + + return jit; + } + + 
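The get_out_block_size() heuristic above trades output-channel blocking against occupancy: when the launch would underfill the device, it halves the depth block (doubling the thread count) at most twice. A standalone sketch of just that adjustment, with made-up sizes:

#include <cstddef>
#include <cstdio>

// Starting from 8 output channels per work-item, halve the depth block
// (doubling the thread count) up to twice while the launch would leave
// compute units idle.
static int pick_out_depth(std::size_t total_threads, std::size_t compute_units)
{
    int out_depth = 8;
    for (int i = 0; i < 2 && total_threads < compute_units; ++i)
    {
        out_depth /= 2;
        total_threads *= 2;
    }
    return out_depth;
}

int main()
{
    std::printf("%d\n", pick_out_depth(12, 24)); // 4: one halving reaches 24 threads
    std::printf("%d\n", pick_out_depth(48, 24)); // 8: already enough threads
}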
std::vector convolution_kernel_bfyx_1x1_opt::GetSupportedWeightLayouts(const convolution_params& cp) const + { + auto block = get_out_block_size(cp); + if (block.out_depth == 8) + return { WeightsLayout::os_iyx_osv64 }; + if (block.out_depth == 4) + return { WeightsLayout::os_iyx_osv32 }; + if (block.out_depth == 2) + return { WeightsLayout::os_iyx_osv16 }; + else + return{ WeightsLayout::yxio }; + } + + KernelsData convolution_kernel_bfyx_1x1_opt::GetKernelsData(const Params& params, const optional_params& options) const + { + KernelsData kd = GetCommonKernelsData(params, options); + if (!kd.empty()) + kd[0].estimatedTime = FORCE_PRIORITY_1; + return kd; + } + +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_opt.h new file mode 100644 index 0000000..969dadb --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_opt.h @@ -0,0 +1,40 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "convolution_kernel_base.h" + +namespace kernel_selector { + + class convolution_kernel_bfyx_1x1_opt : public ConvolutionKernelBase + { + public: + using Parent = ConvolutionKernelBase; + convolution_kernel_bfyx_1x1_opt(); + virtual ~convolution_kernel_bfyx_1x1_opt() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; + std::vector GetSupportedWeightLayouts(const convolution_params&) const override; + JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; + bool Validate(const Params& p, const optional_params& o) const override; + bool NeedPaddedInput() const override { return true; } + DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_3x3_dw_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_3x3_dw_opt.cpp index b92df30..9bbfdcb 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_3x3_dw_opt.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_3x3_dw_opt.cpp @@ -15,7 +15,6 @@ */ #include "convolution_kernel_bfyx_3x3_dw_opt.h" -#include "kernel_selector_utils.h" namespace kernel_selector { @@ -24,7 +23,7 @@ namespace kernel_selector // Generate the dispatch options to the auto-tuner. 
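The auto-tune generators in these kernels enumerate a Cartesian search space, now with execution modes as the outermost loop, and discard block shapes that exceed the register budget (maxBlockSize). A compilable miniature of that enumeration with made-up sizes:

#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

struct TuneOptionSketch { std::size_t w, h, prefetch; std::string mode; };

int main()
{
    const std::vector<std::size_t> widths{1, 2, 4}, heights{1, 2, 3}, prefetches{1, 2};
    const std::vector<std::string> modes{"DEFAULT", "NO_PRERA_SCH", "AGE_BASED"};
    const std::size_t max_block_size = 6;

    std::vector<TuneOptionSketch> options;
    for (const auto& mode : modes)               // execution modes outermost, as in the reordered loops
        for (std::size_t w : widths)
            for (std::size_t h : heights)
                for (std::size_t p : prefetches)
                    if (w * h <= max_block_size) // drop block shapes over the budget
                        options.push_back({w, h, p, mode});

    std::printf("%zu candidate configurations\n", options.size()); // 3 modes * 7 shapes * 2 prefetches = 42
}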
std::vector tileXDimSizes = { 1,2,4,5,6,8,10,12,14 }; std::vector tileYDimSizes = { 1,2,3,4,5,6,7 }; - std::vector executionModes = { /*AGE_BASED ,*/ ROUND_ROBIN }; + std::vector executionModes = ConvolutionKernelBase::autoTuneOptions; for (auto tileXDim : tileXDimSizes) { @@ -95,7 +94,7 @@ namespace kernel_selector constexpr int simdSize = 16; - return AutoTuneOption{ { simdSize - 2, 7 }, ROUND_ROBIN }; + return AutoTuneOption{ { simdSize - 2, 7 }, DEFAULT }; } ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_3x3_dw_opt::SetDefault(const convolution_params& params, int autoTuneIndex) const diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_3x3_dw_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_3x3_dw_opt.h index 0c9cf0e..9606b4e 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_3x3_dw_opt.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_3x3_dw_opt.h @@ -30,9 +30,9 @@ namespace kernel_selector virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; virtual KernelsData GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const override; virtual KernelsData GetTunedKernelsDataByIndex(const Params& params, const optional_params& options, int autoTuneIndex) const override; - virtual ParamsKey GetSupportedKey() const override; - + protected: + virtual ParamsKey GetSupportedKey() const override; bool Validate(const Params&, const optional_params&) const override; std::vector GetSupportedWeightLayouts(const convolution_params&) const override { return{ WeightsLayout::oiyx }; } JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; @@ -47,4 +47,4 @@ namespace kernel_selector AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const; std::vector autoTuneOptions = {}; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_depthwise_weights_lwg.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_depthwise_weights_lwg.cpp index f6841db..12478e8 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_depthwise_weights_lwg.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_depthwise_weights_lwg.cpp @@ -15,8 +15,7 @@ */ #include "convolution_kernel_bfyx_depthwise_weights_lwg.h" -#include "kernel_selector_utils.h" - + namespace kernel_selector { ParamsKey ConvolutionKernel_bfyx_depthwise_weights_lwg::GetSupportedKey() const @@ -39,6 +38,7 @@ namespace kernel_selector k.EnableSubGroup(); k.EnableSubGroupShort(); k.EnableDepthwiseSeparableOpt(); + k.EnableDilation(); return k; } @@ -51,12 +51,11 @@ namespace kernel_selector } const convolution_params& cp = static_cast(p); - if (!cp.depthwiseSeparableOpt) + if (!cp.depthwise_separable_opt) return false; - if ((cp.filterSize.x > 4) || (cp.filterSize.y > 4) || - (cp.inputs[0].Feature().v != cp.split)) + ((cp.inputs[0].Feature().v != cp.split) && (cp.inputs[0].Feature().v != cp.groups))) { return false; } @@ -95,6 +94,6 @@ namespace kernel_selector KernelsData 
ConvolutionKernel_bfyx_depthwise_weights_lwg::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options); + return GetTunedKernelsDataByIndex(params, options); } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_depthwise_weights_lwg.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_depthwise_weights_lwg.h index b578f8f..96a79d2 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_depthwise_weights_lwg.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_depthwise_weights_lwg.h @@ -28,12 +28,12 @@ namespace kernel_selector virtual ~ConvolutionKernel_bfyx_depthwise_weights_lwg() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; - + protected: + virtual ParamsKey GetSupportedKey() const override; bool Validate(const Params&, const optional_params&) const override; std::vector GetSupportedWeightLayouts(const convolution_params&) const override { return{ WeightsLayout::oiyx }; } JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_direct_10_12_16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_direct_10_12_16.cpp index 95d012b..17c9cab 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_direct_10_12_16.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_direct_10_12_16.cpp @@ -15,8 +15,6 @@ */ #include "convolution_kernel_bfyx_direct_10_12_16.h" -#include "kernel_selector_utils.h" -#include "common_tools.h" namespace kernel_selector { @@ -111,6 +109,6 @@ namespace kernel_selector { KernelsData ConvolutionKernel_bfyx_Direct_10_10_12::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options, AGE_BASED); + return GetTunedKernelsDataByIndex(params, options); } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_direct_10_12_16.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_direct_10_12_16.h index 68ae13a..a337825 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_direct_10_12_16.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_direct_10_12_16.h @@ -28,9 +28,9 @@ namespace kernel_selector { virtual ~ConvolutionKernel_bfyx_Direct_10_10_12() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; virtual std::vector 
GetSupportedWeightLayouts(const convolution_params&) const override { return{ WeightsLayout::i_yxs_os_yxsv2_osv16 }; } JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; @@ -38,4 +38,4 @@ namespace kernel_selector { bool NeedPaddedInput() const override { return true; } DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_gemm_like.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_gemm_like.cpp index c712748..e44b521 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_gemm_like.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_gemm_like.cpp @@ -14,10 +14,7 @@ // limitations under the License. */ -#include #include "convolution_kernel_bfyx_gemm_like.h" -#include "kernel_selector_utils.h" -#include "common_tools.h" namespace kernel_selector { @@ -137,6 +134,6 @@ namespace kernel_selector KernelsData ConvolutionKernel_bfyx_GEMMLike::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options, AGE_BASED); + return GetTunedKernelsDataByIndex(params, options); } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_gemm_like.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_gemm_like.h index 693687d..4074f8b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_gemm_like.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_gemm_like.h @@ -28,9 +28,9 @@ namespace kernel_selector { virtual ~ConvolutionKernel_bfyx_GEMMLike() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; std::vector GetSupportedWeightLayouts(const convolution_params&) const override; std::string GetKernelName(const convolution_params& params) const override; bool NeedPaddedInput() const override { return true; } @@ -38,4 +38,4 @@ namespace kernel_selector { bool Validate(const Params& p, const optional_params& o) const override; DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.cpp index 854c12e..730f88f 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.cpp @@ -15,8 +15,6 @@ */ #include "convolution_kernel_bfyx_os_iyx_osv16.h" -#include "kernel_selector_utils.h" -#include "common_tools.h" namespace kernel_selector { @@ -29,21 +27,21 @@ 
namespace kernel_selector std::vector blockWidthSizes = { 1,2,4,5,6,8,10,12,14,16 }; std::vector blockHeightSizes = { 1,2,3,4,5 }; std::vector prefetchSizes = { 1,2,3,4,5,6,8,10 }; - std::vector executionModes = { /*AGE_BASED ,*/ ROUND_ROBIN }; + std::vector executionModes = ConvolutionKernelBase::autoTuneOptions; const size_t maxBlockSize = 60; - for (auto blockWidth : blockWidthSizes) + for (auto executionMode : executionModes) { - for (auto blockHeight : blockHeightSizes) + for (auto blockWidth : blockWidthSizes) { - for (auto prefetch : prefetchSizes) + for (auto blockHeight : blockHeightSizes) { - for (auto executionMode : executionModes) + for (auto prefetch : prefetchSizes) { - if (blockWidth * blockHeight <= maxBlockSize) - { - autoTuneOptions.emplace_back(AutoTuneOption{ blockWidth, blockHeight, prefetch, executionMode }); - } + if (blockWidth * blockHeight <= maxBlockSize) + { + autoTuneOptions.emplace_back(AutoTuneOption{ blockWidth, blockHeight, prefetch, executionMode }); + } } } } @@ -124,7 +122,7 @@ namespace kernel_selector return autoTuneOptions[autoTuneIndex]; } - AutoTuneOption option = { 0, 0, 0, ROUND_ROBIN }; + AutoTuneOption option = { 0, 0, 0, DEFAULT }; const convolution_params& cp = static_cast(p); @@ -252,11 +250,6 @@ namespace kernel_selector return jit; } - KernelsData ConvolutionKernel_bfyx_os_iyx_osv16::GetTunedKernelsDataByIndex(const Params& params, const optional_params& options, const int autoTuneIndex) const - { - return GetCommonKernelsData(params, options, GetAutoTuneOptions(params, autoTuneIndex).exeMode, autoTuneIndex); - } - std::vector ConvolutionKernel_bfyx_os_iyx_osv16::GetSupportedWeightLayouts(const convolution_params& params) const { if (!params.transposed) @@ -271,7 +264,7 @@ namespace kernel_selector KernelsData ConvolutionKernel_bfyx_os_iyx_osv16::GetKernelsData(const Params& params, const optional_params& options) const { - return GetTunedKernelsDataByIndex(params, options, -1); + return GetTunedKernelsDataByIndex(params, options); } KernelsData ConvolutionKernel_bfyx_os_iyx_osv16::GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const @@ -283,7 +276,7 @@ namespace kernel_selector KernelsData res = {}; - for (size_t i = 0 ; i < autoTuneOptions.size(); i++) + for (size_t i = 0; i < autoTuneOptions.size(); i++) { KernelsData kd = GetTunedKernelsDataByIndex(params, options, (int)i); if (!kd.empty()) @@ -292,9 +285,7 @@ namespace kernel_selector } } - KernelsData defaultKds = GetKernelsData(params, options); - res.insert(res.end(), defaultKds.begin(), defaultKds.end()); - return res; } + } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.h index 0b0ebc8..4f82540 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.h @@ -29,10 +29,9 @@ namespace kernel_selector { virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; virtual KernelsData GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const override; - virtual KernelsData GetTunedKernelsDataByIndex(const Params& params, const optional_params& options, 
int autoTuneIndex) const override; - virtual ParamsKey GetSupportedKey() const override; - + protected: + virtual ParamsKey GetSupportedKey() const override; std::vector GetSupportedWeightLayouts(const convolution_params&) const override; JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; bool Validate(const Params& p, const optional_params& o) const override; @@ -52,4 +51,4 @@ namespace kernel_selector { std::vector autoTuneOptions = {}; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16_2_sg.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16_2_sg.cpp new file mode 100644 index 0000000..3eac169 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16_2_sg.cpp @@ -0,0 +1,299 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "convolution_kernel_bfyx_os_iyx_osv16_2_sg.h" + +namespace kernel_selector +{ + // Sub-group size used by "kernel_name_bfyx_os_iyx_osv16" kernel. + constexpr size_t sub_group_size = 16; + + ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::ConvolutionKernel_bfyx_os_iyx_osv16_2_sg() : ConvolutionKernelBase("convolution_gpu_bfyx_os_iyx_osv16_2_sg") + { + // Generate the dispatch options to the auto-tuner. 
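+ // Illustrative count, assuming the lists below stay as written: 10 widths x 5 heights x 8 prefetch + // values are enumerated per execution mode, and only combinations with blockWidth * blockHeight <= + // maxBlockSize (60) are kept, so e.g. 16x5 and 14x5 are skipped while 12x5 (= 60) is the largest + // surviving block.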
+ std::vector blockWidthSizes = { 1,2,4,5,6,8,10,12,14,16 }; + std::vector blockHeightSizes = { 1,2,3,4,5 }; + std::vector prefetchSizes = { 1,2,3,4,5,6,8,10 }; + std::vector executionModes = ConvolutionKernelBase::autoTuneOptions; + const size_t maxBlockSize = 60; + + for (auto executionMode : executionModes) + { + for (auto blockWidth : blockWidthSizes) + { + for (auto blockHeight : blockHeightSizes) + { + for (auto prefetch : prefetchSizes) + { + if (blockWidth * blockHeight <= maxBlockSize) + { + autoTuneOptions.emplace_back(AutoTuneOption{ blockWidth, blockHeight, prefetch, executionMode }); + } + } + } + } + } + } + + ParamsKey ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableInputWeightsType(WeightsType::F16); + k.EnableInputWeightsType(WeightsType::F32); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableSubGroup(); + k.EnableBiasPerFeature(); + k.EnableBiasPerOutput(); + k.EnableNonBiasTerm(); + k.EnableBatching(); + k.EnableSplitSupport(); + k.EnableDilation(); + k.EnableTranspose(); + return k; + } + + static std::pair get_bfyx_req_input_block_dims( + size_t output_block_width, + size_t output_block_height, + const uSize& filter_size, + const uSize& stride, + const uSize& dilation, + size_t sg_size = 16, + size_t read_chunk_size = 8, + size_t min_read_size = 16) + { + assert(output_block_width > 0 && output_block_height > 0); + assert(stride.x > 0 && stride.y > 0); + assert(filter_size.x > 0 && filter_size.y > 0); + + // Number of elements in X dimension needed from input to compute output block without re-reading input. + size_t input_block_req_width = (output_block_width - 1) * stride.x + (filter_size.x - 1)*dilation.x + 1; + // Number of elements in Y dimension needed from input to compute output block without re-reading input. + size_t input_block_req_height = (output_block_height - 1) * stride.y + (filter_size.y - 1)*dilation.y + 1; + + // Required number of elements in X dimension rounded to nearest >= read chunk size. + size_t input_block_read_width = std::max(RoundUp(input_block_req_width, read_chunk_size), min_read_size); + // Number of sub-group-sized vectors of unit type needed to store input block. 
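+ // Worked example (illustrative only): an output block of 7x1 with a 3x3 filter, stride 1 and + // dilation 1 needs (7-1)*1 + (3-1)*1 + 1 = 9 input columns and (1-1)*1 + (3-1)*1 + 1 = 3 input rows; + // with the default read_chunk_size = 8 and min_read_size = 16 each row is read as + // max(RoundUp(9, 8), 16) = 16 elements, so CeilDiv(3 * 16, 16) = 3 sub-group-sized vectors are needed.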
+ size_t input_block_array_size = CeilDiv(input_block_req_height * input_block_read_width, sg_size); + + return std::make_pair(input_block_array_size, input_block_read_width); + } + + static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_t &block_x, size_t &block_y) + { + // how many elements we will compute in each dimension + size_t computed_x = Align(output_x, block_x); + size_t computed_y = Align(output_y, block_y); + // how many simds we need in each dimension + size_t simds_x = computed_x / block_x; + size_t simds_y = computed_y / block_y; + // how many unused values we have in each dimension + size_t unused_x = computed_x - output_x; + size_t unused_y = computed_y - output_y; + + block_x -= unused_x / simds_x; + block_y -= unused_y / simds_y; + } + + ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::AutoTuneOption ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetAutoTuneOptions(const Params& p, int autoTuneIndex) const + { + if ((autoTuneIndex >= 0) && (autoTuneIndex < (int)autoTuneOptions.size())) + { + return autoTuneOptions[autoTuneIndex]; + } + + AutoTuneOption option = { 0, 0, 0, DEFAULT }; + + const convolution_params& cp = static_cast(p); + + if (cp.stride.x == 1 && cp.stride.y == 1) + { + if (cp.filterSize.x == 1 && cp.filterSize.y == 1) + { + option.blockWidth = 16; + option.blockHeight = 1; + option.prefetch = 4; + } + // if fewer than 16 values are required to compute one single row of output + // then each WI shall compute one single row to maximize reuse within the SIMD subgroup (this gives very nice performance results) + else if (cp.output.X().v + (cp.filterSize.x - 1)*cp.dilation.x < sub_group_size) + { + option.blockWidth = cp.output.X().v; + option.blockHeight = 1; + option.prefetch = 4; + } + else if (cp.filterSize.x < 5 && cp.filterSize.y < 5) + { + option.blockWidth = sub_group_size - cp.filterSize.x + 1; + option.blockHeight = 2; + option.prefetch = 4; + } + else + { + option.blockWidth = 4; + option.blockHeight = 3; + option.prefetch = 4; + } + } + else if (cp.stride.x == 2 && cp.stride.y == 2) + { + option.blockWidth = 5; + option.blockHeight = 4; + option.prefetch = 4; + } + else + { + option.blockWidth = 4; + option.blockHeight = 3; + option.prefetch = 5; + //run_info.effiency = FORCE_PRIORITY_7; // GEMM is better + } + + // if this is not a 1x1, batch 1 case then shrink the blocks to the output size, otherwise we're memory bound and it's best to use 16x1 block sizes + if (cp.filterSize.x != 1 || cp.filterSize.y != 1 || cp.output.Batch().v != 1) + { + shrink_blocks_to_output_size(cp.output.X().v, cp.output.Y().v, + option.blockWidth, option.blockHeight); + } + + return option; + } + + ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::SetDefault(const convolution_params& cp, int autoTuneIndex) const + { + DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp); + + const auto of_maps = cp.output.Feature().v; + const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size); + + runInfo.effiency = FORCE_PRIORITY_3; + + auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex); + runInfo.cldnnStyle.blockWidth = tuneOptions.blockWidth; + runInfo.cldnnStyle.blockHeight = tuneOptions.blockHeight; + runInfo.cldnnStyle.prefetch = tuneOptions.prefetch; + + auto input_block_dims = get_bfyx_req_input_block_dims( + runInfo.cldnnStyle.blockWidth, + runInfo.cldnnStyle.blockHeight, + cp.filterSize, + cp.stride, + cp.dilation, + sub_group_size, + runInfo.fp16UnitUsed ?
sub_group_size : sub_group_size / 2, + sub_group_size); + runInfo.cldnnStyle.inputBlockArraySize = input_block_dims.first; + runInfo.cldnnStyle.inputBlockWidth = input_block_dims.second; + + runInfo.gws0 = CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth); + runInfo.gws1 = CeilDiv(cp.output.Y().v, runInfo.cldnnStyle.blockHeight); + runInfo.gws2 = 2 * of_threads_per_batch * cp.output.Batch().v; + + runInfo.lws0 = 1; + runInfo.lws1 = 1; + runInfo.lws2 = 2*sub_group_size; + + return runInfo; + } + + bool ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::Validate(const Params& p, const optional_params& o) const + { + if (!ConvolutionKernelBase::Validate(p, o) || + !CovolutionCheckInput(p, o)) + { + return false; + } + + const convolution_params& cp = static_cast(p); + + if (cp.inputs[0].Feature().v % 2 != 0 || cp.inputs[0].Feature().v < 64) + return false; + + if (cp.output.Feature().v % 64 != 0) + return false; + + return true; + } + + JitConstants ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const + { + const auto of_maps = params.output.Feature().v; + const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size); + size_t leftovers = of_threads_per_batch - of_maps; + + auto jit = Parent::GetJitConstants(params, runInfo); + + jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", 16)); + jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", runInfo.cldnnStyle.blockWidth)); + jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", runInfo.cldnnStyle.blockHeight)); + jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", runInfo.cldnnStyle.inputBlockArraySize)); + jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", runInfo.cldnnStyle.inputBlockWidth)); + jit.AddConstant(MakeJitConstant("PREFETCH", runInfo.cldnnStyle.prefetch)); + + if (leftovers) + { + jit.AddConstant(MakeJitConstant("LEFTOVERS", leftovers)); + } + + return jit; + } + + std::vector ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetSupportedWeightLayouts(const convolution_params& params) const + { + if (!params.transposed) + { + return{ WeightsLayout::os_iyx_osv16 }; + } + else + { + return{ WeightsLayout::os_iyx_osv16_rotate_180 }; + } + } + + KernelsData ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetKernelsData(const Params& params, const optional_params& options) const + { + return GetTunedKernelsDataByIndex(params, options); + } + + KernelsData ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const + { + if (!Validate(params, options)) + { + return{}; + } + + KernelsData res = {}; + + for (size_t i = 0; i < autoTuneOptions.size(); i++) + { + KernelsData kd = GetTunedKernelsDataByIndex(params, options, (int)i); + if (!kd.empty()) + { + res.emplace_back(kd[0]); + } + } + + return res; + } + +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16_2_sg.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16_2_sg.h new file mode 100644 index 0000000..02af557 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16_2_sg.h @@ -0,0 +1,54 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "convolution_kernel_base.h" + +namespace kernel_selector { + + class ConvolutionKernel_bfyx_os_iyx_osv16_2_sg : public ConvolutionKernelBase + { + public: + using Parent = ConvolutionKernelBase; + ConvolutionKernel_bfyx_os_iyx_osv16_2_sg(); + virtual ~ConvolutionKernel_bfyx_os_iyx_osv16_2_sg() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + virtual KernelsData GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; + std::vector GetSupportedWeightLayouts(const convolution_params&) const override; + JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; + bool Validate(const Params& p, const optional_params& o) const override; + bool NeedPaddedInput() const override { return true; } + DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; + + private: + struct AutoTuneOption + { + size_t blockWidth; + size_t blockHeight; + size_t prefetch; + std::string exeMode; + }; + + AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const; + + std::vector autoTuneOptions = {}; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_ref.cpp index a625c64..a4e83b0 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_ref.cpp @@ -15,7 +15,6 @@ */ #include "convolution_kernel_bfyx_ref.h" -#include "kernel_selector_utils.h" namespace kernel_selector { @@ -47,11 +46,13 @@ namespace kernel_selector { k.EnableInt8Quantization(); k.EnableOutputCalibration(); k.DisableTuning(); + k.EnableLocalConvolution(); + k.EnableGroupedConvolution(); return k; } KernelsData ConvolutionKernel_bfyx_Ref::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options); + return GetTunedKernelsDataByIndex(params, options); } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_ref.h index f005457..0835bab 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_ref.h @@ -27,9 +27,9 @@ namespace kernel_selector { virtual ~ConvolutionKernel_bfyx_Ref() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() 
const override; protected: + virtual ParamsKey GetSupportedKey() const override; virtual std::vector GetSupportedWeightLayouts(const convolution_params&) const override { return{ @@ -37,7 +37,8 @@ namespace kernel_selector { WeightsLayout::yxio, WeightsLayout::iyxo, WeightsLayout::oyxi, + WeightsLayout::bf_lyx_yx, }; } }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.cpp new file mode 100644 index 0000000..7bbf435 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.cpp @@ -0,0 +1,81 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + + ParamsKey ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableInputWeightsType(WeightsType::INT8); + k.EnableInputLayout(DataLayout::byx8_f4); + k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableDilation(); + k.EnableBiasPerFeature(); + k.EnableBatching(); + k.EnableInt8Quantization(); + k.EnableOutputCalibration(); + k.DisableTuning(); + return k; + } + + bool ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32::Validate(const Params& p, const optional_params& o) const + { + if (!Parent::Validate(p, o)) + { + return false; + } + + const convolution_params& params = static_cast(p); + + // this kernel is designed for quantization use case + if (!params.int8_quantization) + return false; + + return true; + } + + ConvolutionKernelBase::DispatchData ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32::SetDefault(const convolution_params& arg, int) const + { + DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); + + runInfo.effiency = FORCE_PRIORITY_1; + + runInfo.gws0 = (arg.output.Batch().v * arg.output.Feature().v) / 4; + runInfo.gws1 = arg.output.X().v / 8; + runInfo.gws2 = arg.output.Y().v / 4; + + runInfo.lws0 = 8; + runInfo.lws1 = 1; + runInfo.lws2 = 1; + + return runInfo; + } + + KernelsData ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32::GetKernelsData(const Params& params, const optional_params& options) const + { + KernelsData kd = GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char"); + if (!kd.empty()) + kd[0].estimatedTime = FORCE_PRIORITY_3; + return kd; + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h new file mode 100644 index 0000000..312310b --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h @@ -0,0 +1,43 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "convolution_kernel_base.h" + +namespace kernel_selector { + + class ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32 : public ConvolutionKernelBase + { + public: + using Parent = ConvolutionKernelBase; + ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32() : ConvolutionKernelBase("convolution_gpu_byx8_f4__fs_bs_yx_bsv4_fsv32") {} + virtual ~ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; + bool Validate(const Params& p, const optional_params& o) const override; + ConvolutionKernelBase::DispatchData SetDefault(const convolution_params& arg, int) const override; + virtual std::vector GetSupportedWeightLayouts(const convolution_params&) const override + { + return{ + WeightsLayout::os_is_y_x8_osv8_isv4, + }; + } + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_af32_depthwise.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_af32_depthwise.cpp index 154b4e5..7c5fe4b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_af32_depthwise.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_af32_depthwise.cpp @@ -15,7 +15,6 @@ */ #include "convolution_kernel_byxf_af32_depthwise.h" -#include "kernel_selector_utils.h" namespace kernel_selector { @@ -52,7 +51,7 @@ namespace kernel_selector { const convolution_params& params = static_cast(p); // this kernel is designed for quantization use case - if (!params.depthwiseSeparableOpt) + if (!params.depthwise_separable_opt) return false; return true; @@ -60,7 +59,7 @@ namespace kernel_selector { KernelsData ConvolutionKernel_byxf_af32_depthiwise::GetKernelsData(const Params& params, const optional_params& options) const { - KernelsData kd = GetCommonKernelsData(params, options); + KernelsData kd = GetTunedKernelsDataByIndex(params, options); if(!kd.empty()) kd[0].estimatedTime = FORCE_PRIORITY_3; return kd; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_af32_depthwise.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_af32_depthwise.h index b71b629..2b4fdef 100644 --- 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_af32_depthwise.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_af32_depthwise.h @@ -28,9 +28,9 @@ namespace kernel_selector { virtual ~ConvolutionKernel_byxf_af32_depthiwise() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; bool Validate(const Params& p, const optional_params& o) const override; virtual std::vector GetSupportedWeightLayouts(const convolution_params&) const override { @@ -42,4 +42,4 @@ namespace kernel_selector { }; } }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.cpp new file mode 100644 index 0000000..728ab58 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.cpp @@ -0,0 +1,62 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + + ParamsKey ConvolutionKernel_byxf_fs_bs_yx_bsv4_fsv32::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableInputWeightsType(WeightsType::INT8); + k.EnableInputLayout(DataLayout::byxf); + k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBiasPerFeature(); + k.EnableNonBiasTerm(); + k.EnableBatching(); + k.EnableInt8Quantization(); + k.EnableOutputCalibration(); + k.DisableTuning(); + return k; + } + + ConvolutionKernelBase::DispatchData ConvolutionKernel_byxf_fs_bs_yx_bsv4_fsv32::SetDefault(const convolution_params& arg, int) const + { + DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); + + runInfo.effiency = FORCE_PRIORITY_1; + + runInfo.gws0 = (arg.output.Batch().v * arg.output.Feature().v) / 4; + runInfo.gws1 = arg.output.X().v / 8; + runInfo.gws2 = arg.output.Y().v; + + runInfo.lws0 = 8; + runInfo.lws1 = 1; + runInfo.lws2 = 1; + + return runInfo; + } + + KernelsData ConvolutionKernel_byxf_fs_bs_yx_bsv4_fsv32::GetKernelsData(const Params& params, const optional_params& options) const + { + return GetCommonKernelsData(params, options); + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h new file mode 100644 index 0000000..18cf868 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h @@ -0,0 +1,41 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#pragma once + +#include "convolution_kernel_base.h" + +namespace kernel_selector { + + class ConvolutionKernel_byxf_fs_bs_yx_bsv4_fsv32 : public ConvolutionKernelBase + { + public: + ConvolutionKernel_byxf_fs_bs_yx_bsv4_fsv32() : ConvolutionKernelBase("convolution_gpu_byxf_fs_bs_yx_bsv4_fsv32") {} + virtual ~ConvolutionKernel_byxf_fs_bs_yx_bsv4_fsv32() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; + ConvolutionKernelBase::DispatchData SetDefault(const convolution_params& arg, int) const override; + virtual std::vector GetSupportedWeightLayouts(const convolution_params&) const override + { + return{ + WeightsLayout::yxio + }; + } + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_1x1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_1x1.cpp new file mode 100644 index 0000000..91a42e5 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_1x1.cpp @@ -0,0 +1,37 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "convolution_kernel_imad_1x1.h" +#include "kernel_selector_utils.h" +#include "common_tools.h" + +namespace kernel_selector { + + JitConstants + ConvolutionKernel_imad_1x1::GetJitConstants( + const convolution_params& params, + const DispatchData& kd) const + { + auto mem_consts = Parent::GetJitConstants(params, kd); + + mem_consts.AddConstants({ + // Block reading optimization is implemented for 3x3 only. + // For 1x1 it should be disabled. + MakeJitConstant("NON_BLOCK_LOAD", 1), + }); + return mem_consts; + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_1x1.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_1x1.h new file mode 100644 index 0000000..11c4e06 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_1x1.h @@ -0,0 +1,37 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#pragma once + +#include "convolution_kernel_imad_3x3.h" + +namespace kernel_selector { + + // TODO Currently the best 1x1 IMAD convolution kernel is not completely done. + // Temporary solution to implement 1x1 using 3x3 IMAD convolution kernel with a + // few modifications. + class ConvolutionKernel_imad_1x1 : public ConvolutionKernel_imad_3x3 + { + public: + using Parent = ConvolutionKernel_imad_3x3; + ConvolutionKernel_imad_1x1() : ConvolutionKernel_imad_3x3(1, 1) {} + virtual ~ConvolutionKernel_imad_1x1() {} + + protected: + // For 3x3 based IMAD convolution only 'GetJitConstants' method is required + JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_3x3.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_3x3.cpp new file mode 100644 index 0000000..980e001 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_3x3.cpp @@ -0,0 +1,305 @@ +/* +// Copyright (c) 2018-2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "convolution_kernel_imad_3x3.h" +#include "kernel_selector_utils.h" +#include "common_tools.h" + +// +// Kernel specific constants +// +#define SIMD_SIZE 16 +// Threshold value to calculate the block size. +#define OUT_BLOCK_THRESHOLD 7 +// For images 7x7 it's 7 (default), for 14x14 and above it's 14. +#define OUT_BLOCK_WIDTH 7 +// For images 7x7 it's 1 (default), for 14x14 and above it's 2.
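+// (Note on getOutBlock_WH below: it first tries the doubled block, 14x2, falls back to these 7x1 +// defaults when the input is small (inW <= OUT_BLOCK_THRESHOLD) or the doubled width does not fit +// into SIMD_SIZE lanes, and uses a 4x4 block as a last resort.)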
+#define OUT_BLOCK_HEIGHT 1 + +static void getOutBlock_WH(size_t inW, size_t Stride, size_t Pad, size_t& outW, size_t& outH) +{ + outW = OUT_BLOCK_WIDTH * 2; + outH = OUT_BLOCK_HEIGHT * 2; + + if ((inW <= OUT_BLOCK_THRESHOLD) || + (outW * Stride + Pad > SIMD_SIZE)) { + outW = OUT_BLOCK_WIDTH; + outH = OUT_BLOCK_HEIGHT; + } + if (outW * Stride + Pad > SIMD_SIZE) { + outW = outH = 4; + } + + assert(outW * Stride + Pad <= SIMD_SIZE); +} // getOutBlock_WH + +namespace kernel_selector { + + ParamsKey ConvolutionKernel_imad_3x3::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::INT8); + k.EnableInputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableInputWeightsType(WeightsType::INT8); + k.EnableInputWeightsType(WeightsType::UINT8); + k.EnableInputLayout(DataLayout::b_fs_yx_fsv4); + k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4); + k.EnableDifferentInputWeightsTypes(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableDilation(); + k.EnableBiasPerFeature(); + k.EnableNonBiasTerm(); + k.EnableBatching(); + k.EnableInt8Quantization(); + k.EnableOutputCalibration(); + k.DisableTuning(); + return k; + } + + KernelsData + ConvolutionKernel_imad_3x3::GetKernelsData( + const Params& params, + const optional_params& options) const + { + return GetCommonKernelsData(params, options); + } + + JitConstants + ConvolutionKernel_imad_3x3::GetJitConstants( + const convolution_params& params, + const DispatchData& kd) const + { + auto mem_consts = Parent::GetJitConstants(params, kd); + + const auto& input = params.inputs[0]; + const auto& output = params.output; + + const auto& iDims = input.GetDims(); + const auto& oDims = output.GetDims(); + const auto& weights = params.weights; + const auto& wDims = weights.GetDims(); + const int iX = DataTensor::Channelndex( + input.GetLayout(), Tensor::DataChannelName::X); + const int iY = DataTensor::Channelndex( + input.GetLayout(), Tensor::DataChannelName::Y); + const int iB = DataTensor::Channelndex( + input.GetLayout(), Tensor::DataChannelName::BATCH); + const int iF = DataTensor::Channelndex( + input.GetLayout(), Tensor::DataChannelName::FEATURE); + const int wOD = WeightsTensor::Channelndex( + weights.GetLayout(), Tensor::WeightsChannelName::OFM); + const int oX = DataTensor::Channelndex( + output.GetLayout(), Tensor::DataChannelName::X); + const int oY = DataTensor::Channelndex( + output.GetLayout(), Tensor::DataChannelName::Y); + mem_consts.AddConstants({ + MakeJitConstant("_IMAD_DEFINES", 1), + //MakeJitConstant("SCALE_FACTOR", m_ScaleFactor), //(255.0f / 700000.0f); + MakeJitConstant("_IW", iDims[iX].v), + MakeJitConstant("_IH", iDims[iY].v), + MakeJitConstant("_ID", RoundUp(iDims[iF].v, 4)), + MakeJitConstant("IWPAD", iDims[iX].pad.before + iDims[iX].pad.after), + MakeJitConstant("IHPAD", iDims[iY].pad.before + iDims[iY].pad.after), + MakeJitConstant("_OW", oDims[oX].v), + MakeJitConstant("_OH", oDims[oY].v), + MakeJitConstant("_OD", wDims[wOD].v), + MakeJitConstant("OWPAD", oDims[oX].pad.before + oDims[oX].pad.after), + MakeJitConstant("OHPAD", oDims[oY].pad.before + oDims[oY].pad.after), + MakeJitConstant("SIMD_SIZE", SIMD_SIZE), + MakeJitConstant("K_HEIGHT", wDims[iY].v), + MakeJitConstant("K_WIDTH", wDims[iX].v), + MakeJitConstant("K_STRIDE", params.stride.x), // X and Y must be equal + MakeJitConstant("BATCH_SIZE", iDims[iB].v), + MakeJitConstant("WORKGROUP_SIZE", "SIMD_SIZE"), + }); + + size_t obw, obh; + getOutBlock_WH(iDims[iX].v, 
params.stride.x, iDims[iX].pad.before + iDims[iX].pad.after, + obw, obh); + mem_consts.AddConstants({ + MakeJitConstant("OUT_BLOCK_WIDTH", obw), + MakeJitConstant("OUT_BLOCK_HEIGHT", obh) + }); + + // FM_TILE definition + mem_consts.AddConstants({ + MakeJitConstant("IMAD_LENGTH", 4), + MakeJitConstant("SYSTOLIC_DEPTH", 1), + MakeJitConstant("FM_TILE", "(IMAD_LENGTH * SYSTOLIC_DEPTH)") + }); + + if (input.GetDType() == Datatype::UINT8) { + // For unsigned types IMAD convolution kernel should skip + // all negative values. + mem_consts.AddConstants({ + MakeJitConstant("CONVO_UNSIGNED", 1) + }); + } + + if (params.output.GetLayout() != DataLayout::b_fs_yx_fsv4) { + mem_consts.AddConstants({ + // Produce unswizzled results. + MakeJitConstant("TO_UNSWIZZLE", 1), + }); + } + + return mem_consts; + + } // GetJitConstants + + + ConvolutionKernelBase::DispatchData ConvolutionKernel_imad_3x3::SetDefault( + const convolution_params& params, + int) const + { + DispatchData kd; + + const auto& in = params.inputs[0]; + const auto& weights = params.weights; + const auto& iDims = in.GetDims(); + const auto& wDims = weights.GetDims(); + const int iX = DataTensor::Channelndex( + in.GetLayout(), Tensor::DataChannelName::X); + const int iY = DataTensor::Channelndex( + in.GetLayout(), Tensor::DataChannelName::Y); + const int iB = DataTensor::Channelndex( + in.GetLayout(), Tensor::DataChannelName::BATCH); + const int wOD = WeightsTensor::Channelndex( + weights.GetLayout(), Tensor::WeightsChannelName::OFM); + + size_t otw, oth; + getOutBlock_WH(iDims[iX].v, params.stride.x, iDims[iX].pad.before + iDims[iX].pad.after, + otw, oth); + + std::vector global = { + //globalRange[0] = ((_IW / K_STRIDE) + (OTW - 1)) / OTW; + // number of tiles needed to cover output width + (((iDims[iX].v / params.stride.x) + (otw - 1)) / otw), + + //globalRange[1] = ((_IH / K_STRIDE) + (OTH - 1)) / OTH; + // number of tiles needed to cover output height + (((iDims[iY].v / params.stride.y) + (oth - 1)) / oth), + + // globalRange[2] = (_OD * _B) + ((_B *_OD) % __WORKGROUP_SIZE); + // round depth range up + ((wDims[wOD].v * iDims[iB].v) + ((wDims[wOD].v * iDims[iB].v) % SIMD_SIZE)) + }; + + std::vector local = {1, 1, SIMD_SIZE}; + + kd.gws0 = global[0]; + kd.gws1 = global[1]; + kd.gws2 = global[2]; + + kd.lws0 = local[0]; + kd.lws1 = local[1]; + kd.lws2 = local[2]; + + kd.cldnnStyle = { 0 }; + kd.gemmStyle = { 0 }; + kd.effiency = FORCE_PRIORITY_1; + + return kd; + + } // SetDefault + + bool + ConvolutionKernel_imad_3x3::Validate( + const Params& params, + const optional_params& options) const + { + if (!Parent::Validate(params, options)) + { + return false; + } + + KernelData kd = KernelData::Default(params); + convolution_params& newParams = *static_cast(kd.params.get()); + + if (newParams.stride.x != newParams.stride.y) { + // Strides must be equal + return false; + } + else if ((newParams.filterSize.x != m_FilterSizeX) || + (newParams.filterSize.y != m_FilterSizeY)) { + // Kernel does not support such filter size + return false; + } + else { + const auto& in = newParams.inputs[0]; + const auto& iDims = in.GetDims(); + const int iX = DataTensor::Channelndex( + in.GetLayout(), Tensor::DataChannelName::X); + if (iDims[iX].v % OUT_BLOCK_THRESHOLD != 0) { + // Input size must be a multiple of OUT_BLOCK_THRESHOLD + return false; + } + } + + return true; + } + + KernelsData + ConvolutionKernel_imad_3x3::GetCommonKernelsData( + const Params& params, + const optional_params& options, + const std::string exeMode, + int autoTuneIndex) const + {
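+ // Flow of this helper: validate the params, compute the dispatch data, check the + // work-group sizes, reorder weights into a supported layout, build the JIT constants + // and fill in a single CL kernel; any failure along the way returns an empty list.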
+ if (!Validate(params, options)) + { + return{}; + } + + KernelData kd = KernelData::Default(params); + convolution_params& newParams = *static_cast(kd.params.get()); + DispatchData runInfo = SetDefault(newParams, autoTuneIndex); + if (!CheckWorkGroups(runInfo)) + { + // Internal Error - wrong calculation of global/local work group sizes + return{}; + } + + bool succeed = UpdateWeightsParams( + newParams, + options, + GetSupportedWeightLayouts(newParams), + kd.weightsReorderParams, + GetSupportedKey()); + + if (!succeed) + { + return{}; + } + + auto finalKernelName = GetKernelName(newParams); + auto cldnnJit = GetJitConstants(newParams, runInfo); + auto entryPoint = GetEntryPoint(finalKernelName, newParams.layerID, options); + auto jit = CreateJit(finalKernelName, cldnnJit, entryPoint); + + auto& kernel = kd.kernels[0]; + FillCLKernelData(kernel, runInfo, params.engineInfo, finalKernelName, jit, entryPoint, exeMode, true, !newParams.bias.empty(), 1, newParams.int8_quantization, newParams.output_calibration); + + kd.estimatedTime = runInfo.effiency; + kd.autoTuneIndex = autoTuneIndex; + + return{ kd }; + + } // GetCommonKernelsData +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_3x3.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_3x3.h new file mode 100644 index 0000000..a255883 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_3x3.h @@ -0,0 +1,58 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#pragma once + +#include "convolution_kernel_base.h" + +namespace kernel_selector { + + class ConvolutionKernel_imad_3x3 : public ConvolutionKernelBase + { + public: + using Parent = ConvolutionKernelBase; + ConvolutionKernel_imad_3x3() : ConvolutionKernelBase("convolution_gpu_imad") {} + ConvolutionKernel_imad_3x3(size_t FilterSizeX, size_t FilterSizeY) + : ConvolutionKernelBase("convolution_gpu_imad"), + m_FilterSizeX(FilterSizeX), + m_FilterSizeY(FilterSizeY) {} + virtual ~ConvolutionKernel_imad_3x3() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + KernelsData GetCommonKernelsData(const Params& params, const optional_params& options, const std::string exeMode = DEFAULT, int autoTuneIndex = -1) const; + + protected: + virtual ParamsKey GetSupportedKey() const override; + virtual bool Validate(const Params& params, const optional_params& options) const override; + JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; + DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override; + + std::vector GetSupportedWeightLayouts(const convolution_params&) const override + { + return{ + WeightsLayout::os_is_yx_osv16_isv4 + }; + } + + protected: + // This class is the base one for several similar classes with different + // filter sizes. That's why the actual filter sizes must be explicitly + // specified. + size_t m_FilterSizeX = 3; + size_t m_FilterSizeY = 3; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_7x7.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_7x7.cpp new file mode 100644 index 0000000..9a6da69 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_7x7.cpp @@ -0,0 +1,37 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "convolution_kernel_imad_7x7.h" +#include "kernel_selector_utils.h" +#include "common_tools.h" + +namespace kernel_selector { + + JitConstants + ConvolutionKernel_imad_7x7::GetJitConstants( + const convolution_params& params, + const DispatchData& kd) const + { + auto mem_consts = Parent::GetJitConstants(params, kd); + + mem_consts.AddConstants({ + // Block reading optimization is implemented for 3x3 only. + // For 7x7 it should be disabled.
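+ // (Same switch as in the 1x1 variant above: NON_BLOCK_LOAD = 1 presumably makes the + // generated kernel fall back to non-block reads.)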
+ MakeJitConstant("NON_BLOCK_LOAD", 1), + }); + return mem_consts; + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_7x7.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_7x7.h new file mode 100644 index 0000000..0e268ff --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_7x7.h @@ -0,0 +1,37 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "convolution_kernel_imad_3x3.h" + +namespace kernel_selector { + + // TODO Currently the best 7x7 IMAD convolution kernel is not completely done. + // Temporary solution to implement 7x7 using 3x3 IMAD convolution kernel with a + // few modifications. + class ConvolutionKernel_imad_7x7 : public ConvolutionKernel_imad_3x3 + { + public: + using Parent = ConvolutionKernel_imad_3x3; + ConvolutionKernel_imad_7x7() : ConvolutionKernel_imad_3x3(7, 7) {} + virtual ~ConvolutionKernel_imad_7x7() {} + + protected: + // For 3x3 based IMAD convolution only 'GetJitConstants' method is required + JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp new file mode 100644 index 0000000..9b141ec --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp @@ -0,0 +1,187 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+*/ + +#include "convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + + static const size_t _SG_TILE_M = 32; + static const size_t _SG_TILE_N = 32; + static const size_t _SG_SIZE = 8; // sub group size + static const size_t _TILES_PER_SG_X = 1; // Persistent threads + static const size_t _TILES_PER_SG_Y = 1; // Persistent threads + + ParamsKey ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableInputWeightsType(WeightsType::INT8); + k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBiasPerFeature(); + k.EnableBatching(); + k.EnableInt8Quantization(); + k.EnableOutputCalibration(); + k.DisableTuning(); + return k; + } + + bool ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::Validate(const Params& p, const optional_params& o) const + { + if (!ConvolutionKernelBase::Validate(p, o) || + !CovolutionCheckInput(p, o)) + { + return false; + } + + const convolution_params& cp = static_cast(p); + + // make sure it's 1x1 conv + if (cp.filterSize.x != 1 || cp.filterSize.y != 1) + return false; + + // make sure stride is 1x1 + if (cp.stride.x != 1 || cp.stride.y != 1) + return false; + + // input padding not supported + if (cp.inputs[0].X().pad.Total() != 0 || + cp.inputs[0].Y().pad.Total() != 0 || + cp.inputs[0].Feature().pad.Total() != 0 || + cp.inputs[0].Batch().pad.Total() != 0) + return false; + + // input and output spatial sizes must match + if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v)) + return false; + + const auto m = cp.output.X().v * cp.output.Y().v * cp.output.Batch().v ; + const auto k = cp.inputs[0].Feature().v; + const auto n = cp.output.Feature().v ; + + if (m % 32 != 0 && m % 128 != 0) // Matrix size M, Must be a multiple of 32 and a multiple of WG_TILE_M=128 + return false; + + if (k % 32 != 0) // Matrix size K, Must be a multiple of 32 + return false; + + if (n % 32 != 0 && n % 128 != 0) // Matrix size N, Must be a multiple of 32 and a multiple of WG_TILE_N=128 + return false; + + return true; + } + + + ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::SetDefault(const convolution_params& arg, int) const + { + DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); + + runInfo.effiency = FORCE_PRIORITY_1; + + size_t mat_m = arg.output.X().v * arg.output.Y().v * arg.output.Batch().v; + size_t mat_n = arg.output.Feature().v; + + size_t _MATRIX_M = mat_m; + size_t _MATRIX_N = mat_n; + + size_t _WG_TILE_M = 128; + size_t _WG_TILE_N = 128; + + // Calculate number of threads needed + const size_t threadsX = (_MATRIX_N / (_SG_TILE_N / _SG_SIZE)) / _TILES_PER_SG_X; + const size_t threadsY = (_MATRIX_M / _SG_TILE_M) / _TILES_PER_SG_Y ; + + // Define execution setup for kernel: + size_t globalWorkSize[3] = { threadsX, threadsY, 1 }; + size_t localWorkSize[3] = { _SG_SIZE * _WG_TILE_N / _SG_TILE_N, _WG_TILE_M / _SG_TILE_M, 1 }; + + runInfo.gws0 = globalWorkSize[0]; + runInfo.gws1 = globalWorkSize[1]; + runInfo.gws2 = globalWorkSize[2]; + + runInfo.lws0 = localWorkSize[0]; + runInfo.lws1 = localWorkSize[1]; + runInfo.lws2 = localWorkSize[2]; + + return runInfo; + } + + JitConstants ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::GetJitConstants(const convolution_params& params, const
DispatchData& runInfo) const + { + auto jit = Parent::GetJitConstants(params, runInfo); + + jit.AddConstant(MakeJitConstant("WG_TILE_M", 128)); // Work-Group tile size M, must be a multiple of 32 + jit.AddConstant(MakeJitConstant("WG_TILE_N", 128)); // Work-Group tile size N, must be a multiple of 32 + jit.AddConstant(MakeJitConstant("TILES_PER_SG_X", 1)); // Persistent threads + jit.AddConstant(MakeJitConstant("TILES_PER_SG_Y", 1)); // Persistent threads + + // Do not change values below + jit.AddConstant(MakeJitConstant("DIM_X", 0)); + jit.AddConstant(MakeJitConstant("DIM_Y", 1)); + jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K", 32)); + jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K_BFLOAT", 16)); + jit.AddConstant(MakeJitConstant("SG_TILE_M", _SG_TILE_M)); + jit.AddConstant(MakeJitConstant("SG_TILE_N", _SG_TILE_N)); + jit.AddConstant(MakeJitConstant("SG_SIZE", _SG_SIZE)); + jit.AddConstant(MakeJitConstant("SIMD_LANE_M", "SG_TILE_M")); + jit.AddConstant(MakeJitConstant("SIMD_LANE_N", "(SG_TILE_N / SG_SIZE)")); + jit.AddConstant(MakeJitConstant("WG_SIZE", "(SG_SIZE * WG_TILE_N / SG_TILE_N) * (WG_TILE_M / SG_TILE_M)")); + + jit.AddConstant(MakeJitConstant("COMPILE_KERNELS", "")); + jit.AddConstant(MakeJitConstant("TILED_GLOBAL_LAYOUT", "")); + jit.AddConstant(MakeJitConstant("OUTPUT_TILED_GLOBAL_LAYOUT", "")); + + const auto& input = params.inputs[0]; + const auto& output = params.output; + + auto m = output.X().v * output.Y().v * output.Batch().v; + auto k = input.Feature().v; + auto n = output.Feature().v; + + jit.AddConstant(MakeJitConstant("MATRIX_M", m)); + jit.AddConstant(MakeJitConstant("MATRIX_K", k)); + jit.AddConstant(MakeJitConstant("MATRIX_N", n)); + + const size_t out_x_pitch = 32 * 4; + const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded(); + const size_t out_b_block_pitch = out_y_pitch * params.output.Y().LogicalDimPadded(); + const size_t out_f_block_pitch = out_b_block_pitch * ((params.output.Batch().v + 3) / 4); + const size_t out_offset = out_x_pitch * params.output.X().pad.before + out_y_pitch * params.output.Y().pad.before; + + jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch)); + jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch)); + jit.AddConstant(MakeJitConstant("OUT_B_BLOCK_PITCH", out_b_block_pitch)); + jit.AddConstant(MakeJitConstant("OUT_F_BLOCK_PITCH", out_f_block_pitch)); + jit.AddConstant(MakeJitConstant("OUT_OFFSET", out_offset)); + + bool out_padding = output.X().pad.Total() != 0 || output.Y().pad.Total() != 0; + jit.AddConstant(MakeJitConstant("OUT_WITH_PADDING", out_padding)); + + return jit; + } + + KernelsData ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::GetKernelsData(const Params& params, const optional_params& options) const + { + KernelsData kd = GetCommonKernelsData(params, options); + if (!kd.empty()) + kd[0].estimatedTime = FORCE_PRIORITY_1; + return kd; + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.h new file mode 100644 index 0000000..6be4748 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.h @@ -0,0 +1,45 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "convolution_kernel_base.h" + +namespace kernel_selector { + + class ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8 : public ConvolutionKernelBase + { + public: + using Parent = ConvolutionKernelBase; + ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8() : ConvolutionKernelBase("convolution_gpu_mmad_32x32sg_128x128wg_slm_int8") {} + + virtual ~ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; + bool Validate(const Params& p, const optional_params& o) const override; + JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; + DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; + virtual std::vector<WeightsLayout> GetSupportedWeightLayouts(const convolution_params&) const override + { + return{ + WeightsLayout::is_o32_yx_isv32_swizzled_by_4, + }; + } + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp new file mode 100644 index 0000000..5e84d7f --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp @@ -0,0 +1,187 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+*/ + +#include "convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + + static const size_t _SG_TILE_M = 32; + static const size_t _SG_TILE_N = 32; + static const size_t _SG_SIZE = 8; // sub group size + static const size_t _TILES_PER_SG_X = 1; // Persistent threads + static const size_t _TILES_PER_SG_Y = 1; // Persistent threads + + ParamsKey ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableInputWeightsType(WeightsType::INT8); + k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBiasPerFeature(); + k.EnableBatching(); + k.EnableInt8Quantization(); + k.EnableOutputCalibration(); + k.DisableTuning(); + return k; + } + + bool ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::Validate(const Params& p, const optional_params& o) const + { + if (!ConvolutionKernelBase::Validate(p, o) || + !CovolutionCheckInput(p, o)) + { + return false; + } + + const convolution_params& cp = static_cast<const convolution_params&>(p); + + // make sure it's 1x1 conv + if (cp.filterSize.x != 1 || cp.filterSize.y != 1) + return false; + + // make sure stride is 1x1 + if (cp.stride.x != 1 || cp.stride.y != 1) + return false; + + // input padding not supported + if (cp.inputs[0].X().pad.Total() != 0 || + cp.inputs[0].Y().pad.Total() != 0 || + cp.inputs[0].Feature().pad.Total() != 0 || + cp.inputs[0].Batch().pad.Total() != 0) + return false; + + // input and output spatial sizes must match + if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v)) + return false; + + const auto m = cp.output.X().v * cp.output.Y().v * cp.output.Batch().v; + const auto k = cp.inputs[0].Feature().v; + const auto n = cp.output.Feature().v; + + if (m % 32 != 0 || m % 224 != 0) // Matrix size M must be a multiple of 32 and of WG_TILE_M=224 + return false; + + if (k % 32 != 0) // Matrix size K must be a multiple of 32 + return false; + + if (n % 32 != 0 || n % 128 != 0) // Matrix size N must be a multiple of 32 and of WG_TILE_N=128 + return false; + + return true; + } + + + ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::SetDefault(const convolution_params& arg, int) const + { + DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); + + runInfo.effiency = FORCE_PRIORITY_1; + + size_t mat_m = arg.output.X().v * arg.output.Y().v * arg.output.Batch().v; + size_t mat_n = arg.output.Feature().v; + + size_t _MATRIX_M = mat_m; + size_t _MATRIX_N = mat_n; + + size_t _WG_TILE_M = 224; + size_t _WG_TILE_N = 128; + + // Calculate number of threads needed + const size_t threadsX = (_MATRIX_N / (_SG_TILE_N / _SG_SIZE)) / _TILES_PER_SG_X; + const size_t threadsY = (_MATRIX_M / _SG_TILE_M) / _TILES_PER_SG_Y; + + // Define execution setup for kernel: + size_t globalWorkSize[3] = { threadsX, threadsY, 1 }; + size_t localWorkSize[3] = { _SG_SIZE * _WG_TILE_N / _SG_TILE_N, _WG_TILE_M / _SG_TILE_M, 1 }; + + runInfo.gws0 = globalWorkSize[0]; + runInfo.gws1 = globalWorkSize[1]; + runInfo.gws2 = globalWorkSize[2]; + + runInfo.lws0 = localWorkSize[0]; + runInfo.lws1 = localWorkSize[1]; + runInfo.lws2 = localWorkSize[2]; + + return runInfo; + } + + JitConstants ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::GetJitConstants(const convolution_params& params, const
DispatchData& runInfo) const + { + auto jit = Parent::GetJitConstants(params, runInfo); + + jit.AddConstant(MakeJitConstant("WG_TILE_M", 224)); // Work-Group tile size M, must be a multiple of 32 + jit.AddConstant(MakeJitConstant("WG_TILE_N", 128)); // Work-Group tile size N, must be a multiple of 32 + jit.AddConstant(MakeJitConstant("TILES_PER_SG_X", _TILES_PER_SG_X)); + jit.AddConstant(MakeJitConstant("TILES_PER_SG_Y", _TILES_PER_SG_Y)); + + // Do not change values below + jit.AddConstant(MakeJitConstant("DIM_X", 0)); + jit.AddConstant(MakeJitConstant("DIM_Y", 1)); + jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K", 32)); + jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K_BFLOAT", 16)); + jit.AddConstant(MakeJitConstant("SG_TILE_M", _SG_TILE_M)); + jit.AddConstant(MakeJitConstant("SG_TILE_N", _SG_TILE_N)); + jit.AddConstant(MakeJitConstant("SG_SIZE", _SG_SIZE)); + jit.AddConstant(MakeJitConstant("SIMD_LANE_M", "SG_TILE_M")); + jit.AddConstant(MakeJitConstant("SIMD_LANE_N", "(SG_TILE_N / SG_SIZE)")); + jit.AddConstant(MakeJitConstant("WG_SIZE", "(SG_SIZE * WG_TILE_N / SG_TILE_N) * (WG_TILE_M / SG_TILE_M)")); + + jit.AddConstant(MakeJitConstant("COMPILE_KERNELS", "")); + jit.AddConstant(MakeJitConstant("TILED_GLOBAL_LAYOUT", "")); + jit.AddConstant(MakeJitConstant("OUTPUT_TILED_GLOBAL_LAYOUT", "")); + + const auto& input = params.inputs[0]; + const auto& output = params.output; + + auto m = output.X().v * output.Y().v * output.Batch().v; + auto k = input.Feature().v; + auto n = output.Feature().v; + + jit.AddConstant(MakeJitConstant("MATRIX_M", m)); // Matrix size M must be a multiple of 32 and of WG_TILE_M + jit.AddConstant(MakeJitConstant("MATRIX_K", k)); // Matrix size K must be a multiple of 32 + jit.AddConstant(MakeJitConstant("MATRIX_N", n)); // Matrix size N must be a multiple of 32 and of WG_TILE_N + + const size_t out_x_pitch = 32 * 4; + const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded(); + const size_t out_b_block_pitch = out_y_pitch * params.output.Y().LogicalDimPadded(); + const size_t out_f_block_pitch = out_b_block_pitch * ((params.output.Batch().v + 3) / 4); + const size_t out_offset = out_x_pitch * params.output.X().pad.before + out_y_pitch * params.output.Y().pad.before; + + jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch)); + jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch)); + jit.AddConstant(MakeJitConstant("OUT_B_BLOCK_PITCH", out_b_block_pitch)); + jit.AddConstant(MakeJitConstant("OUT_F_BLOCK_PITCH", out_f_block_pitch)); + jit.AddConstant(MakeJitConstant("OUT_OFFSET", out_offset)); + + bool out_padding = output.X().pad.Total() != 0 || output.Y().pad.Total() != 0; + jit.AddConstant(MakeJitConstant("OUT_WITH_PADDING", out_padding)); + + return jit; + } + + KernelsData ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::GetKernelsData(const Params& params, const optional_params& options) const + { + KernelsData kd = GetCommonKernelsData(params, options); + if (!kd.empty()) + kd[0].estimatedTime = FORCE_PRIORITY_1; + return kd; + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.h new file mode 100644 index 0000000..dd57937 --- /dev/null +++
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.h @@ -0,0 +1,45 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "convolution_kernel_base.h" + +namespace kernel_selector { + + class ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8 : public ConvolutionKernelBase + { + public: + using Parent = ConvolutionKernelBase; + ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8() : ConvolutionKernelBase("convolution_gpu_mmad_32x32sg_224x128wg_slm_int8") {} + + virtual ~ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; + bool Validate(const Params& p, const optional_params& o) const override; + JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; + DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; + virtual std::vector<WeightsLayout> GetSupportedWeightLayouts(const convolution_params&) const override + { + return{ + WeightsLayout::is_o32_yx_isv32_swizzled_by_4, + }; + } + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.cpp new file mode 100644 index 0000000..b9a6e18 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.cpp @@ -0,0 +1,184 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+*/ + +#include "convolution_kernel_mmad_32x32sg_slm_int8.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + + static const size_t _SG_TILE_M = 32; + static const size_t _SG_TILE_N = 32; + static const size_t _SG_SIZE = 8; // sub group size + static const size_t _TILES_PER_SG_X = 1; // Persistent threads + static const size_t _TILES_PER_SG_Y = 1; // Persistent threads + + ParamsKey ConvolutionKernel_mmad_32x32sg_slm_int8::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableInputWeightsType(WeightsType::INT8); + k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBiasPerFeature(); + k.EnableBatching(); + k.EnableInt8Quantization(); + k.EnableOutputCalibration(); + k.DisableTuning(); + return k; + } + + bool ConvolutionKernel_mmad_32x32sg_slm_int8::Validate(const Params& p, const optional_params& o) const + { + if (!ConvolutionKernelBase::Validate(p, o) || + !CovolutionCheckInput(p, o)) + { + return false; + } + + const convolution_params& cp = static_cast<const convolution_params&>(p); + + // make sure it's 1x1 conv + if (cp.filterSize.x != 1 || cp.filterSize.y != 1) + return false; + + // make sure stride is 1x1 + if (cp.stride.x != 1 || cp.stride.y != 1) + return false; + + // input padding not supported + if (cp.inputs[0].X().pad.Total() != 0 || + cp.inputs[0].Y().pad.Total() != 0 || + cp.inputs[0].Feature().pad.Total() != 0 || + cp.inputs[0].Batch().pad.Total() != 0) + return false; + + // input and output spatial sizes must match + if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v)) + return false; + + const auto m = cp.output.X().v * cp.output.Y().v * cp.output.Batch().v; + const auto k = cp.inputs[0].Feature().v; + const auto n = cp.output.Feature().v; + + if (m % 32 != 0) // Matrix size M must be a multiple of 32 + return false; + + if (k % 32 != 0) // Matrix size K must be a multiple of 32 + return false; + + if (n % 32 != 0) // Matrix size N must be a multiple of 32 + return false; + + return true; + } + + + ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_32x32sg_slm_int8::SetDefault(const convolution_params& arg, int) const + { + DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); + + runInfo.effiency = FORCE_PRIORITY_2; + + size_t mat_m = arg.output.X().v * arg.output.Y().v * arg.output.Batch().v; + size_t mat_n = arg.output.Feature().v; + + size_t _MATRIX_M = mat_m; + size_t _MATRIX_N = mat_n; + + size_t _WG_TILE_M = 32; + size_t _WG_TILE_N = 32; + + // Calculate number of threads needed + const size_t threadsX = (_MATRIX_N / (_SG_TILE_N / _SG_SIZE)) / _TILES_PER_SG_X; + const size_t threadsY = (_MATRIX_M / _SG_TILE_M) / _TILES_PER_SG_Y; + + // Define execution setup for kernel: + size_t globalWorkSize[3] = { threadsX, threadsY, 1 }; + size_t localWorkSize[3] = { _SG_SIZE * _WG_TILE_N / _SG_TILE_N, _WG_TILE_M / _SG_TILE_M, 1 }; + + runInfo.gws0 = globalWorkSize[0]; + runInfo.gws1 = globalWorkSize[1]; + runInfo.gws2 = globalWorkSize[2]; + + runInfo.lws0 = localWorkSize[0]; + runInfo.lws1 = localWorkSize[1]; + runInfo.lws2 = localWorkSize[2]; + + return runInfo; + } + + JitConstants ConvolutionKernel_mmad_32x32sg_slm_int8::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const + { + auto jit = Parent::GetJitConstants(params, runInfo); + + jit.AddConstant(MakeJitConstant("WG_TILE_M", 32)); //
Work-Group tile size M, must be a multiple of 32 + jit.AddConstant(MakeJitConstant("WG_TILE_N", 32)); // Work-Group tile size N, must be a multiple of 32 + jit.AddConstant(MakeJitConstant("TILES_PER_SG_X", _TILES_PER_SG_X)); + jit.AddConstant(MakeJitConstant("TILES_PER_SG_Y", _TILES_PER_SG_Y)); + + // Do not change values below + jit.AddConstant(MakeJitConstant("DIM_X", 0)); + jit.AddConstant(MakeJitConstant("DIM_Y", 1)); + jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K", 32)); + jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K_BFLOAT", 16)); + jit.AddConstant(MakeJitConstant("SG_TILE_M", _SG_TILE_M)); + jit.AddConstant(MakeJitConstant("SG_TILE_N", _SG_TILE_N)); + jit.AddConstant(MakeJitConstant("SG_SIZE", _SG_SIZE)); + jit.AddConstant(MakeJitConstant("SIMD_LANE_M", "SG_TILE_M")); + jit.AddConstant(MakeJitConstant("SIMD_LANE_N", "(SG_TILE_N / SG_SIZE)")); + jit.AddConstant(MakeJitConstant("WG_SIZE", "(SG_SIZE * WG_TILE_N / SG_TILE_N) * (WG_TILE_M / SG_TILE_M)")); + + jit.AddConstant(MakeJitConstant("COMPILE_KERNELS", "")); + jit.AddConstant(MakeJitConstant("TILED_GLOBAL_LAYOUT", "")); + jit.AddConstant(MakeJitConstant("OUTPUT_TILED_GLOBAL_LAYOUT", "")); + + const auto& input = params.inputs[0]; + const auto& output = params.output; + + auto m = output.X().v * output.Y().v * output.Batch().v; + auto k = input.Feature().v; + auto n = output.Feature().v; + + jit.AddConstant(MakeJitConstant("MATRIX_M", m)); // Matrix size M must be a multiple of 32 and of WG_TILE_M + jit.AddConstant(MakeJitConstant("MATRIX_K", k)); // Matrix size K must be a multiple of 32 + jit.AddConstant(MakeJitConstant("MATRIX_N", n)); // Matrix size N must be a multiple of 32 and of WG_TILE_N + + const size_t out_x_pitch = 32 * 4; + const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded(); + const size_t out_b_block_pitch = out_y_pitch * params.output.Y().LogicalDimPadded(); + const size_t out_f_block_pitch = out_b_block_pitch * ((params.output.Batch().v + 3) / 4); + const size_t out_offset = out_x_pitch * params.output.X().pad.before + out_y_pitch * params.output.Y().pad.before; + + jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch)); + jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch)); + jit.AddConstant(MakeJitConstant("OUT_B_BLOCK_PITCH", out_b_block_pitch)); + jit.AddConstant(MakeJitConstant("OUT_F_BLOCK_PITCH", out_f_block_pitch)); + jit.AddConstant(MakeJitConstant("OUT_OFFSET", out_offset)); + + return jit; + } + + KernelsData ConvolutionKernel_mmad_32x32sg_slm_int8::GetKernelsData(const Params& params, const optional_params& options) const + { + KernelsData kd = GetCommonKernelsData(params, options); + if (!kd.empty()) + kd[0].estimatedTime = FORCE_PRIORITY_2; + return kd; + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.h new file mode 100644 index 0000000..448a657 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.h @@ -0,0 +1,45 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "convolution_kernel_base.h" + +namespace kernel_selector { + + class ConvolutionKernel_mmad_32x32sg_slm_int8 : public ConvolutionKernelBase + { + public: + using Parent = ConvolutionKernelBase; + ConvolutionKernel_mmad_32x32sg_slm_int8() : ConvolutionKernelBase("convolution_gpu_mmad_32x32sg_slm_int8") {} + + virtual ~ConvolutionKernel_mmad_32x32sg_slm_int8() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; + bool Validate(const Params& p, const optional_params& o) const override; + JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; + DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; + virtual std::vector<WeightsLayout> GetSupportedWeightLayouts(const convolution_params&) const override + { + return{ + WeightsLayout::is_o_yx_isv32, + }; + } + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.cpp index ce73392..178c078 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.cpp @@ -15,7 +15,6 @@ */ #include "convolution_kernel_mmad_batched.h" -#include "kernel_selector_utils.h" namespace kernel_selector { @@ -49,7 +48,7 @@ namespace kernel_selector { const auto of_maps = arg.output.Feature().v; const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size); - runInfo.effiency = FORCE_PRIORITY_3; + runInfo.effiency = FORCE_PRIORITY_6; runInfo.gws0 = arg.output.X().v; runInfo.gws1 = arg.output.Y().v; @@ -89,9 +88,9 @@ namespace kernel_selector { KernelsData ConvolutionKernel_mmad_batched::GetKernelsData(const Params& params, const optional_params& options) const { - KernelsData kd = GetCommonKernelsData(params, options); + KernelsData kd = GetTunedKernelsDataByIndex(params, options); if(!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_3; + kd[0].estimatedTime = FORCE_PRIORITY_6; return kd; } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.h index 8a3dda4..366ceb4 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched.h @@ -28,9 +28,9 @@ namespace kernel_selector { virtual ~ConvolutionKernel_mmad_batched() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual
ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; virtual std::vector<WeightsLayout> GetSupportedWeightLayouts(const convolution_params&) const override @@ -40,4 +40,4 @@ namespace kernel_selector { }; } }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.cpp new file mode 100644 index 0000000..716b895 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.cpp @@ -0,0 +1,157 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "convolution_kernel_mmad_batched_block.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + + ParamsKey ConvolutionKernel_mmad_batched_block::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableInputWeightsType(WeightsType::INT8); + k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBiasPerFeature(); + k.EnableBatching(); + k.EnableInt8Quantization(); + k.EnableOutputCalibration(); + k.DisableTuning(); + return k; + } + + struct block_params + { + int32_t out_width; + int32_t out_height; + int32_t out_depth; + }; + + static block_params get_out_block_size(const convolution_params& p) + { + if (p.filterSize.x == 3 && p.filterSize.y == 3) + { + if (p.output.X().v == 7) + return { 7, 1, 4 }; + else if (p.output.X().v == 14) + return { 7, 1, 4 }; + else if (p.output.X().v == 28) + return { 7, 1, 4 }; + else if (p.output.X().v == 56) + return { 8, 1, 4 }; + } + + return { 1,1,1 }; + } + + std::vector<WeightsLayout> ConvolutionKernel_mmad_batched_block::GetSupportedWeightLayouts(const convolution_params& cp) const + { + auto block = get_out_block_size(cp); + if (block.out_depth == 4) + return { WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4 }; + else + return { WeightsLayout::os_is_yx_isa8_osv8_isv4 }; + } + + bool ConvolutionKernel_mmad_batched_block::Validate(const Params& p, const optional_params& o) const + { + if (!ConvolutionKernelBase::Validate(p, o) || + !CovolutionCheckInput(p, o)) + { + return false; + } + const convolution_params& cp = static_cast<const convolution_params&>(p); + + // if block sizes are 1x1, then this algorithm is probably not the best + auto block = get_out_block_size(cp); + if (block.out_width == 1 && block.out_height == 1) + return false; + + if (cp.output.X().v % block.out_width != 0) + return false; + if (cp.output.Y().v %
block.out_height != 0) + return false; + + if (cp.filterSize.x == 1) + return false; + + return true; + } + + ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_batched_block::SetDefault(const convolution_params& arg, int) const + { + DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); + + constexpr size_t sub_group_size = 8; + + runInfo.effiency = FORCE_PRIORITY_5; + + auto block = get_out_block_size(arg); + + runInfo.gws0 = arg.output.X().v / block.out_width; + runInfo.gws1 = arg.output.Y().v / block.out_height; + runInfo.gws2 = (arg.output.Feature().v) * ((arg.output.Batch().v+3) / 4) / block.out_depth; // process 4 output channels per work-item + + runInfo.lws0 = 1; + runInfo.lws1 = 1; + runInfo.lws2 = sub_group_size; + + return runInfo; + } + + JitConstants ConvolutionKernel_mmad_batched_block::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const + { + auto jit = Parent::GetJitConstants(params, runInfo); + + jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2)); + + // pitch for special block format used in this kernel + const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32); + const size_t filter_ofm_block_pitch = (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8; + jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch)); + + const size_t in_x_pitch = 32 * 4; + const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded(); + const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded(); + const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4); + const size_t in_offset = in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before; + + jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch)); + jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch)); + jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch)); + jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch)); + jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset)); + + auto block = get_out_block_size(params); + jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block.out_width)); + jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block.out_height)); + jit.AddConstant(MakeJitConstant("WEIGHTS_PER_WORKITEM", block.out_depth)); + + return jit; + } + + KernelsData ConvolutionKernel_mmad_batched_block::GetKernelsData(const Params& params, const optional_params& options) const + { + KernelsData kd = GetCommonKernelsData(params, options); + if(!kd.empty()) + kd[0].estimatedTime = FORCE_PRIORITY_5; + return kd; + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.h new file mode 100644 index 0000000..5902878 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block.h @@ -0,0 +1,39 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "convolution_kernel_base.h" + +namespace kernel_selector { + + class ConvolutionKernel_mmad_batched_block : public ConvolutionKernelBase + { + public: + using Parent = ConvolutionKernelBase; + ConvolutionKernel_mmad_batched_block() : ConvolutionKernelBase("convolution_gpu_mmad_batched_block") {} + virtual ~ConvolutionKernel_mmad_batched_block() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; + bool Validate(const Params& p, const optional_params& o) const override; + JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; + DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; + virtual std::vector<WeightsLayout> GetSupportedWeightLayouts(const convolution_params&) const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.cpp new file mode 100644 index 0000000..1d79872 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.cpp @@ -0,0 +1,159 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+*/ + +#include "convolution_kernel_mmad_batched_block_1x1.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + + ParamsKey ConvolutionKernel_mmad_batched_block_1x1::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableInputWeightsType(WeightsType::INT8); + k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBiasPerFeature(); + k.EnableBatching(); + k.EnableInt8Quantization(); + k.EnableOutputCalibration(); + k.DisableTuning(); + return k; + } + + struct block_params + { + int32_t out_width; + int32_t out_height; + int32_t out_depth; + }; + + static block_params get_out_block_size(const convolution_params& p) + { + if (p.output.X().v == 7) + return { 7,1,4 }; + else if (p.output.X().v == 14) + return { 7,1,4 }; + else if (p.output.X().v == 28) + return { 4,2,4 }; + else if (p.output.X().v == 56) + return { 8,1,4 }; + + return { 1,1,1 }; + } + + std::vector<WeightsLayout> ConvolutionKernel_mmad_batched_block_1x1::GetSupportedWeightLayouts(const convolution_params& cp) const + { + auto block = get_out_block_size(cp); + if (block.out_depth == 4) + return { WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4 }; + else + return { WeightsLayout::os_is_yx_isa8_osv8_isv4 }; + } + + bool ConvolutionKernel_mmad_batched_block_1x1::Validate(const Params& p, const optional_params& o) const + { + if (!ConvolutionKernelBase::Validate(p, o) || + !CovolutionCheckInput(p, o)) + { + return false; + } + const convolution_params& cp = static_cast<const convolution_params&>(p); + + // only for conv 1x1 + if (cp.filterSize.x != 1 || cp.filterSize.y != 1) + return false; + + // only for stride 1x1 + if (cp.stride.x != 1 || cp.stride.y != 1) + return false; + + // if block sizes are 1x1, then this algorithm is probably not the best + auto block = get_out_block_size(cp); + if (block.out_depth != 4) + return false; + + if (cp.output.X().v % block.out_width != 0) + return false; + if (cp.output.Y().v % block.out_height != 0) + return false; + + return true; + } + + ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_batched_block_1x1::SetDefault(const convolution_params& arg, int) const + { + DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); + + constexpr size_t sub_group_size = 8; + + runInfo.effiency = FORCE_PRIORITY_3; + + auto block = get_out_block_size(arg); + + runInfo.gws0 = arg.output.X().v / block.out_width; + runInfo.gws1 = arg.output.Y().v / block.out_height; + runInfo.gws2 = (arg.output.Feature().v) * ((arg.output.Batch().v + 3) / 4) / block.out_depth; // process 4 output channels per work-item + + runInfo.lws0 = 1; + runInfo.lws1 = 1; + runInfo.lws2 = sub_group_size; + + return runInfo; + } + + JitConstants ConvolutionKernel_mmad_batched_block_1x1::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const + { + auto jit = Parent::GetJitConstants(params, runInfo); + + jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2)); + + // pitch for special block format used in this kernel + const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32); + const size_t filter_ofm_block_pitch = (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8; + jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch)); + + const size_t in_x_pitch = 32 * 4; + const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded(); + const
size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded(); + const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4); + const size_t in_offset = in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before; + + jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch)); + jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch)); + jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch)); + jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch)); + jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset)); + + auto block = get_out_block_size(params); + jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block.out_width)); + jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block.out_height)); + jit.AddConstant(MakeJitConstant("WEIGHTS_PER_WORKITEM", block.out_depth)); + + return jit; + } + + KernelsData ConvolutionKernel_mmad_batched_block_1x1::GetKernelsData(const Params& params, const optional_params& options) const + { + KernelsData kd = GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char"); + if (!kd.empty()) + kd[0].estimatedTime = FORCE_PRIORITY_3; + return kd; + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.h new file mode 100644 index 0000000..5d3c11a --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_batched_block_1x1.h @@ -0,0 +1,39 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#pragma once + +#include "convolution_kernel_base.h" + +namespace kernel_selector { + + class ConvolutionKernel_mmad_batched_block_1x1 : public ConvolutionKernelBase + { + public: + using Parent = ConvolutionKernelBase; + ConvolutionKernel_mmad_batched_block_1x1() : ConvolutionKernelBase("convolution_gpu_mmad_batched_block_1x1") {} + virtual ~ConvolutionKernel_mmad_batched_block_1x1() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; + bool Validate(const Params& p, const optional_params& o) const override; + JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; + DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; + virtual std::vector<WeightsLayout> GetSupportedWeightLayouts(const convolution_params&) const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.cpp new file mode 100644 index 0000000..7c66076 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.cpp @@ -0,0 +1,121 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+*/ + +#include "convolution_kernel_mmad_slm_2x14_rep4.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + + ParamsKey ConvolutionKernel_mmad_slm_2x14_rep4::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableInputWeightsType(WeightsType::INT8); + k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBiasPerFeature(); + k.EnableBiasPerOutput(); + k.EnableNonBiasTerm(); + k.EnableBatching(); + k.EnableInt8Quantization(); + k.EnableOutputCalibration(); + k.DisableTuning(); + return k; + } + + bool ConvolutionKernel_mmad_slm_2x14_rep4::Validate(const Params& p, const optional_params& o) const + { + if (!ConvolutionKernelBase::Validate(p, o) || + !CovolutionCheckInput(p, o)) + { + return false; + } + + const convolution_params& cp = static_cast<const convolution_params&>(p); + + if (cp.filterSize.x != 3 || cp.filterSize.y != 3) + return false; + + if (cp.inputs[0].X().v != 56 || cp.inputs[0].Y().v != 56) + return false; + + if (cp.stride.x != 1 || cp.stride.y != 1) + return false; + + return true; + } + + ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_slm_2x14_rep4::SetDefault(const convolution_params& arg, int) const + { + DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); + + runInfo.effiency = FORCE_PRIORITY_1; + + const size_t rep_count = 4; + const size_t batch_per_wi = 1; + const size_t out_block_width = 14; + const size_t out_block_height = 2; + runInfo.gws0 = arg.output.Feature().v * (arg.output.Batch().v / (rep_count * batch_per_wi)); // output features times groups of (rep_count * batch_per_wi) batches + runInfo.gws1 = ((arg.inputs[0].X().v / arg.stride.x) + (out_block_width - 1)) / out_block_width; // number of tiles needed to cover the output width + runInfo.gws2 = ((arg.inputs[0].Y().v / arg.stride.y) + (out_block_height - 1)) / out_block_height; // number of tiles needed to cover the output height + + runInfo.lws0 = 32; // depth + runInfo.lws1 = 1; // width + runInfo.lws2 = 4; // height + + return runInfo; + } + + JitConstants ConvolutionKernel_mmad_slm_2x14_rep4::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const + { + auto jit = ConvolutionKernelBase::GetJitConstants(params, runInfo); + + jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", 8)); + + // pitch for special block format used in this kernel + const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32); + const size_t filter_ofm_block_pitch = (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8; + jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch)); + + const size_t in_x_pitch = 32 * 4; + const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded(); + const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded(); + const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4); + const size_t in_offset = in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before; + + jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch)); + jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch)); + jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch)); + jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch)); + jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset)); + + jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", 14)); + jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", 2)); +
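// Illustrative note (a sketch derived from the lws values chosen in
+ // SetDefault above, not an additional constraint): LOCAL_SIZE_X/Y/Z map to
+ // lws0/lws1/lws2 = 32/1/4, i.e. 32 * 1 * 4 = 128 work-items per work-group,
+ // where X spans the output depth (features), Y spans the 14-wide output
+ // tiles and Z spans the 2-high output tiles.
+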
jit.AddConstant(MakeJitConstant("LOCAL_SIZE_X", runInfo.lws0)); + jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Y", runInfo.lws1)); + jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Z", runInfo.lws2)); + + return jit; + } + + KernelsData ConvolutionKernel_mmad_slm_2x14_rep4::GetKernelsData(const Params& params, const optional_params& options) const + { + return GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char"); + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.h new file mode 100644 index 0000000..b158a98 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_2x14_rep4.h @@ -0,0 +1,43 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "convolution_kernel_base.h" + +namespace kernel_selector { + + class ConvolutionKernel_mmad_slm_2x14_rep4 : public ConvolutionKernelBase + { + public: + ConvolutionKernel_mmad_slm_2x14_rep4() : ConvolutionKernelBase("convolution_gpu_mmad_slm_2x14_rep4") {} + virtual ~ConvolutionKernel_mmad_slm_2x14_rep4() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; + JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; + DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; + bool Validate(const Params& p, const optional_params& o) const override; + virtual std::vector GetSupportedWeightLayouts(const convolution_params&) const override + { + return{ + WeightsLayout::os_is_yx_isa8_osv8_isv4, + }; + } + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.cpp new file mode 100644 index 0000000..bf6863f --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.cpp @@ -0,0 +1,129 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "convolution_kernel_mmad_slm_7x7_rep4.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + + ParamsKey ConvolutionKernel_mmad_slm_7x7_rep4::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableInputWeightsType(WeightsType::INT8); + k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBiasPerFeature(); + k.EnableBiasPerOutput(); + k.EnableNonBiasTerm(); + k.EnableBatching(); + k.EnableInt8Quantization(); + k.EnableOutputCalibration(); + k.DisableTuning(); + return k; + } + + bool ConvolutionKernel_mmad_slm_7x7_rep4::Validate(const Params& p, const optional_params& o) const + { + if (!ConvolutionKernelBase::Validate(p, o) || + !CovolutionCheckInput(p, o)) + { + return false; + } + + const convolution_params& cp = static_cast<const convolution_params&>(p); + + if (cp.filterSize.x != 3 || cp.filterSize.y != 3) + return false; + + if (cp.stride.x != 1 || cp.stride.y != 1) + return false; + + if (cp.inputs[0].X().v == 7 && cp.inputs[0].Y().v == 7) + return true; + + if (cp.inputs[0].X().v == 14 && cp.inputs[0].Y().v == 14) + return true; + + return false; + } + + ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_slm_7x7_rep4::SetDefault(const convolution_params& arg, int) const + { + DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); + + runInfo.effiency = FORCE_PRIORITY_1; + + const size_t rep_count = 4; + const size_t batch_per_wi = 4; + const size_t out_block_width = 7; + runInfo.gws0 = (arg.output.Feature().v * arg.output.Batch().v) / (rep_count * batch_per_wi); // output features times batches, (rep_count * batch_per_wi) per work-item + runInfo.gws1 = ((arg.inputs[0].X().v / arg.stride.x) + (out_block_width - 1)) / out_block_width; // number of tiles needed to cover the output width + // since this kernel only applies to 7x7 and 14x14 inputs, gws2 is set manually, with Y aligned up to 8 + runInfo.gws2 = Align(arg.inputs[0].Y().v, 8); + + runInfo.lws0 = 16; // depth + runInfo.lws1 = 1; // width + runInfo.lws2 = 8; // height + + return runInfo; + } + + JitConstants ConvolutionKernel_mmad_slm_7x7_rep4::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const + { + auto jit = ConvolutionKernelBase::GetJitConstants(params, runInfo); + + jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", 8)); + + // pitch for special block format used in this kernel + const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32); + const size_t filter_ofm_block_pitch = (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8; + jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch)); + + const size_t in_x_pitch = 32 * 4; + const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded(); + const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded(); + const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4); + const size_t in_offset = in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before; + + const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded(); + + jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch)); + jit.AddConstant(MakeJitConstant("IN_Y_PITCH",
in_y_pitch)); + jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch)); + jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch)); + jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset)); + + jit.AddConstant(MakeJitConstant("OUT_X_PITCH", in_x_pitch)); + jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch)); + jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", 7)); + jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", 1)); + jit.AddConstant(MakeJitConstant("LOCAL_SIZE_X", runInfo.lws0)); + jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Y", runInfo.lws1)); + jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Z", 7)); // must be 7 since we process 7 in Y per workgroup + + return jit; + } + + KernelsData ConvolutionKernel_mmad_slm_7x7_rep4::GetKernelsData(const Params& params, const optional_params& options) const + { + return GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char"); + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.h new file mode 100644 index 0000000..0bfe238 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_slm_7x7_rep4.h @@ -0,0 +1,43 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#pragma once + +#include "convolution_kernel_base.h" + +namespace kernel_selector { + + class ConvolutionKernel_mmad_slm_7x7_rep4 : public ConvolutionKernelBase + { + public: + ConvolutionKernel_mmad_slm_7x7_rep4() : ConvolutionKernelBase("convolution_gpu_mmad_slm_7x7_rep4") {} + virtual ~ConvolutionKernel_mmad_slm_7x7_rep4() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; + JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; + DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; + bool Validate(const Params& p, const optional_params& o) const override; + virtual std::vector<WeightsLayout> GetSupportedWeightLayouts(const convolution_params&) const override + { + return{ + WeightsLayout::os_is_yx_isa8_osv8_isv4, + }; + } + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp index aa58505..c87b2b4 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp @@ -16,16 +16,18 @@ #include "convolution_kernel_selector.h" #include "convolution_kernel_bfyx_ref.h" +#include "convolution_kernel_bfyx_1x1_opt.h" #include "convolution_kernel_bfyx_gemm_like.h" #include "convolution_kernel_bfyx_direct_10_12_16.h" #include "convolution_kernel_bfyx_os_iyx_osv16.h" +#include "convolution_kernel_bfyx_os_iyx_osv16_2_sg.h" #include "convolution_kernel_yxfb_ref.h" #include "convolution_kernel_yxfb_yxio_b16.h" #include "convolution_kernel_yxfb_yxio_b8.h" #include "convolution_kernel_yxfb_yxio_b1_block.h" #include "convolution_kernel_yxfb_yxio_b1_block_multiple_x.h" #include "convolution_kernel_tutorial.h" -#include "convolution_kernel_bfyx_3x3_dw_opt.h" +//#include "convolution_kernel_bfyx_3x3_dw_opt.h" #include "convolution_kernel_winograd_2x3_s1.h" #include "convolution_kernel_bfyx_1x1.h" #include "convolution_kernel_bfyx_1x1_gemm_buf.h" @@ -37,23 +39,36 @@ #include "convolution_kernel_byxf_af32_depthwise.h" #include "convolution_kernel_mmad_batched.h" #include "convolution_kernel_bfyx_depthwise_weights_lwg.h" +#include "convolution_kernel_mmad_slm_2x14_rep4.h" +#include "convolution_kernel_mmad_slm_7x7_rep4.h" +#include "convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h" +#include "convolution_kernel_mmad_batched_block.h" +#include "convolution_kernel_mmad_batched_block_1x1.h" +#include "convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.h" +#include "convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.h" +#include "convolution_kernel_mmad_32x32sg_slm_int8.h" +#include "convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h" +#include "convolution_kernel_imad_3x3.h" +#include "convolution_kernel_imad_1x1.h" +#include "convolution_kernel_imad_7x7.h" -#include <iostream> - namespace kernel_selector { convolution_kernel_selector::convolution_kernel_selector() { Attach(); + Attach(); Attach(); Attach(); Attach(); + // commented out to not get in our way, will enable in future after autotuning +// Attach<ConvolutionKernel_bfyx_os_iyx_osv16_2_sg>(); Attach(); Attach(); Attach(); //Attach(); // TODO: need to finish integration Attach(); - Attach<ConvolutionKernel_bfyx_3x3_dw_opt>(); + //Attach<ConvolutionKernel_bfyx_3x3_dw_opt>(); Attach(); Attach();
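+ // Illustrative note: each Attach() call registers one kernel
+ // implementation with the selector; GetBestKernels() below then lets every
+ // registered kernel Validate() the given convolution parameters and ranks
+ // the viable ones by their estimated efficiency.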
Attach<ConvolutionKernel_bfyx_1x1_gemm_buf>(); @@ -65,13 +80,23 @@ namespace kernel_selector Attach<ConvolutionKernel_byxf_af32_depthiwise>(); Attach<ConvolutionKernel_mmad_batched>(); Attach<ConvolutionKernel_bfyx_depthwise_weights_lwg>(); + Attach<ConvolutionKernel_mmad_slm_2x14_rep4>(); + Attach<ConvolutionKernel_mmad_slm_7x7_rep4>(); + Attach<ConvolutionKernel_byxf_fs_bs_yx_bsv4_fsv32>(); + Attach<ConvolutionKernel_mmad_batched_block>(); + Attach<ConvolutionKernel_mmad_batched_block_1x1>(); + Attach<ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8>(); + Attach<ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8>(); + Attach<ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32>(); +// Attach<ConvolutionKernel_mmad_32x32sg_slm_int8>(); //Attach<ConvolutionKernel_Tutorial>(); //In order to use this implementation for tutorial purposes please uncomment this line + Attach<ConvolutionKernel_imad_3x3>(); + Attach<ConvolutionKernel_imad_1x1>(); + Attach<ConvolutionKernel_imad_7x7>(); } KernelsData convolution_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const { - //const ConvolutionParams& orgParams = static_cast<const ConvolutionParams&>(params); - //std::cout << orgParams.to_string() << std::endl; return GetAutoTuneBestKernel(params, options, KernelType::CONVOLUTION); } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_tutorial.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_tutorial.cpp index 7d6bda1..fa8ac6e 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_tutorial.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_tutorial.cpp @@ -15,7 +15,6 @@ */ #include "convolution_kernel_tutorial.h" -#include "kernel_selector_utils.h" namespace kernel_selector { @@ -181,7 +180,7 @@ namespace kernel_selector { KernelsData ConvolutionKernel_Tutorial::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options); + return GetTunedKernelsDataByIndex(params, options); } #endif diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_tutorial.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_tutorial.h index e2cbdfe..77f7135 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_tutorial.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_tutorial.h @@ -38,9 +38,9 @@ namespace kernel_selector { virtual ~ConvolutionKernel_Tutorial() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; virtual std::vector<WeightsLayout> GetSupportedWeightLayouts(const convolution_params&) const override { return{ @@ -57,4 +57,4 @@ namespace kernel_selector { DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; #endif }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_2x3_s1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_2x3_s1.cpp index 98876df..e019051 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_2x3_s1.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_2x3_s1.cpp @@ -15,8 +15,6 @@ */ #include "convolution_kernel_winograd_2x3_s1.h" -#include "kernel_selector_utils.h" -#include "common_tools.h" namespace kernel_selector { @@ -120,6 +118,6 @@ namespace kernel_selector { KernelsData ConvolutionKernel_Winograd_2x3_s1::GetKernelsData(const Params& params, const optional_params& options) const
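// The recurring GetCommonKernelsData -> GetTunedKernelsDataByIndex change in this patch
// routes kernel creation through the auto-tuner. A sketch of the likely shape of the
// helper, assuming it is declared on ConvolutionKernelBase (the exact signature and
// default execution mode may differ):
//
//     KernelsData ConvolutionKernelBase::GetTunedKernelsDataByIndex(
//         const Params& params, const optional_params& options, int autoTuneIndex /*= -1*/) const
//     {
//         // -1 keeps the default configuration; any other index picks the configuration
//         // the auto-tuner selected for this kernel/shape pair, then reuses the common path.
//         return GetCommonKernelsData(params, options, DEFAULT, autoTuneIndex);
//     }
//
// This is also why the SetDefault overloads in these kernels take an autoTuneIndex parameter.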
{ - return GetCommonKernelsData(params, options); + return GetTunedKernelsDataByIndex(params, options); } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_2x3_s1.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_2x3_s1.h index 04f61ac..491eeb3 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_2x3_s1.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_2x3_s1.h @@ -28,13 +28,13 @@ namespace kernel_selector { virtual ~ConvolutionKernel_Winograd_2x3_s1() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; virtual std::vector<WeightsLayout> GetSupportedWeightLayouts(const convolution_params&) const override { return{ WeightsLayout::winograd_2x3_s1_weights }; } JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; bool Validate(const Params& p, const optional_params& o) const override; DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_2x3_s1_fused.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_2x3_s1_fused.cpp index f2d5999..f26abb8 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_2x3_s1_fused.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_2x3_s1_fused.cpp @@ -15,7 +15,6 @@ */ #include "convolution_kernel_winograd_2x3_s1_fused.h" -#include "kernel_selector_utils.h" namespace kernel_selector { @@ -148,6 +147,6 @@ namespace kernel_selector { KernelsData ConvolutionKernel_Winograd_2x3_s1_fused::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options); + return GetTunedKernelsDataByIndex(params, options); } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_2x3_s1_fused.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_2x3_s1_fused.h index bb520a0..770f0fa 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_2x3_s1_fused.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_2x3_s1_fused.h @@ -28,13 +28,13 @@ namespace kernel_selector { virtual ~ConvolutionKernel_Winograd_2x3_s1_fused() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; virtual std::vector<WeightsLayout> GetSupportedWeightLayouts(const convolution_params&) const override { return{ WeightsLayout::winograd_2x3_s1_fused_weights }; } JitConstants GetJitConstants(const convolution_params& params, const
DispatchData& kd) const override; bool Validate(const Params& p, const optional_params& o) const override; DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_6x3_s1_fused.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_6x3_s1_fused.cpp index 1a06f04..a93a4b6 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_6x3_s1_fused.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_6x3_s1_fused.cpp @@ -159,6 +159,6 @@ namespace kernel_selector { KernelsData ConvolutionKernel_Winograd_6x3_s1_fused::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options, AGE_BASED); + return GetTunedKernelsDataByIndex(params, options); } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_6x3_s1_fused.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_6x3_s1_fused.h index 39b9fd8..665e5a8 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_6x3_s1_fused.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_winograd_6x3_s1_fused.h @@ -28,12 +28,12 @@ namespace kernel_selector { virtual ~ConvolutionKernel_Winograd_6x3_s1_fused() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; bool Validate(const Params& p, const optional_params& o) const override; DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; std::vector<WeightsLayout> GetSupportedWeightLayouts(const convolution_params&) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_ref.cpp index 6726433..584d343 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_ref.cpp @@ -39,11 +39,12 @@ namespace kernel_selector k.EnableDilation(); k.EnableDepthwiseSeparableOpt(); k.DisableTuning(); + k.EnableGroupedConvolution(); return k; } KernelsData ConvolutionKernel_yxfb_Ref::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options); + return GetTunedKernelsDataByIndex(params, options); } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_ref.h
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_ref.h index 1d6a7df..1f2239f 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_ref.h @@ -27,9 +27,9 @@ namespace kernel_selector { virtual ~ConvolutionKernel_yxfb_Ref() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; - + protected: + virtual ParamsKey GetSupportedKey() const override; virtual std::vector<WeightsLayout> GetSupportedWeightLayouts(const convolution_params&) const override { return{ @@ -40,4 +40,4 @@ namespace kernel_selector { }; } }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b16.cpp index 04508ef..7dae5c2 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b16.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b16.cpp @@ -15,7 +15,6 @@ */ #include "convolution_kernel_yxfb_yxio_b16.h" -#include "convolution_params.h" namespace kernel_selector { @@ -210,6 +209,6 @@ namespace kernel_selector KernelsData ConvolutionKernel_yxfb_yxio_b16::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options); + return GetTunedKernelsDataByIndex(params, options); } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b16.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b16.h index 9a4c2fc..e60ceae 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b16.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b16.h @@ -28,13 +28,13 @@ namespace kernel_selector { virtual ~ConvolutionKernel_yxfb_yxio_b16() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; - + protected: + virtual ParamsKey GetSupportedKey() const override; std::vector<WeightsLayout> GetSupportedWeightLayouts(const convolution_params&) const override { return{ WeightsLayout::yxio }; } std::string GetKernelName(const convolution_params&) const override; bool Validate(const Params& p, const optional_params& o) const override; JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block.cpp index 431cfe1..3600917 100644 ---
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block.cpp @@ -15,8 +15,6 @@ */ #include "convolution_kernel_yxfb_yxio_b1_block.h" -#include "kernel_selector_utils.h" -#include "common_tools.h" namespace kernel_selector { @@ -58,6 +56,6 @@ namespace kernel_selector KernelsData ConvolutionKernel_yxfb_yxio_b1_block::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options); + return GetTunedKernelsDataByIndex(params, options); } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block.h index 8d19b7c..6b170c4 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block.h @@ -27,11 +27,11 @@ namespace kernel_selector { virtual ~ConvolutionKernel_yxfb_yxio_b1_block() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; - + protected: + virtual ParamsKey GetSupportedKey() const override; JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; virtual std::vector<WeightsLayout> GetSupportedWeightLayouts(const convolution_params&) const override { return{ WeightsLayout::yxio }; } DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block_multiple_x.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block_multiple_x.cpp index 81646f4..9097311 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block_multiple_x.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block_multiple_x.cpp @@ -15,8 +15,6 @@ */ #include "convolution_kernel_yxfb_yxio_b1_block_multiple_x.h" -#include "kernel_selector_utils.h" -#include "common_tools.h" namespace kernel_selector { @@ -155,6 +153,6 @@ namespace kernel_selector KernelsData ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options); + return GetTunedKernelsDataByIndex(params, options); } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block_multiple_x.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block_multiple_x.h index 8571eb5..2b77f70 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block_multiple_x.h +++
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block_multiple_x.h @@ -27,12 +27,12 @@ namespace kernel_selector { virtual ~ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; - + protected: + virtual ParamsKey GetSupportedKey() const override; std::vector<WeightsLayout> GetSupportedWeightLayouts(const convolution_params&) const override { return{ WeightsLayout::yxio }; } bool Validate(const Params& p, const optional_params& o) const override; JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b8.cpp index ccee6e6..84dba18 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b8.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b8.cpp @@ -15,8 +15,6 @@ */ #include "convolution_kernel_yxfb_yxio_b8.h" -#include "kernel_selector_utils.h" -#include "common_tools.h" namespace kernel_selector { @@ -130,6 +128,6 @@ namespace kernel_selector KernelsData ConvolutionKernel_yxfb_yxio_b8::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options); + return GetTunedKernelsDataByIndex(params, options); } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b8.h index dd7f8c5..4659e2d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b8.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b8.h @@ -27,12 +27,12 @@ namespace kernel_selector { virtual ~ConvolutionKernel_yxfb_yxio_b8() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; - + protected: + virtual ParamsKey GetSupportedKey() const override; virtual JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; virtual std::vector<WeightsLayout> GetSupportedWeightLayouts(const convolution_params&) const override { return{ WeightsLayout::yxio }; } bool Validate(const Params& p, const optional_params& o) const override; DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_params.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_params.cpp index 9b76961..16bbaf2 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_params.cpp +++
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_params.cpp @@ -56,7 +56,7 @@ namespace kernel_selector k.EnableDilation(); } - if (depthwiseSeparableOpt) + if (depthwise_separable_opt) { k.EnableDepthwiseSeparableOpt(); } @@ -76,6 +76,16 @@ namespace kernel_selector k.EnableOutputCalibration(); } + if (local_convolution) + { + k.EnableLocalConvolution(); + } + + if (groups > 1 && !depthwise_separable_opt) + { + k.EnableGroupedConvolution(); + } + return k; } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_params.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_params.h index 91ab419..5188498 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_params.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_params.h @@ -33,12 +33,14 @@ namespace kernel_selector uSize dilation; uSize padding; uint32_t split = 1; - bool depthwiseSeparableOpt = false; + bool depthwise_separable_opt = false; bool transposed = false; bool int8_quantization = false; bool output_calibration = false; + bool local_convolution = false; float input_quantization_factor = 1.0f; float output_quantization_factor = 1.0f; + uint32_t groups = 1; MultiDataTensor weights_quantization_factors; MultiDataTensor output_calibration_factors; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.h index aee5a6f..28d7828 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.h @@ -28,6 +28,8 @@ namespace kernel_selector { virtual DispatchData SetDefault(const convolution_grad_weights_params& params) const override; virtual bool Validate(const Params& p, const optional_params& o) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.h index 39fcb7e..c4051e6 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.h @@ -28,6 +28,8 @@ namespace kernel_selector { virtual DispatchData SetDefault(const convolution_grad_weights_params& params) const override; virtual bool Validate(const Params& p, const optional_params& o) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.h 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.h index 286caf5..2c9a134 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.h @@ -28,6 +28,8 @@ namespace kernel_selector { virtual DispatchData SetDefault(const convolution_grad_weights_params& params) const override; virtual bool Validate(const Params& p, const optional_params& o) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.cpp index 1e2cd30..e24f696 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.cpp @@ -56,7 +56,8 @@ namespace kernel_selector MakeJitConstant("DILATION", cp.dilation), MakeJitConstant("FILTER_ARRAY_NUM", cp.split), MakeJitConstant("INPUT0_OFFSET_WITH_PADDING", input_offset_with_padding), - MakeJitConstant("DEPTHWISE_SEPARABLE_OPT", cp.depthwiseSeparableOpt), + MakeJitConstant("DEPTHWISE_SEPARABLE_OPT", cp.depthwise_separable_opt), + MakeJitConstant("OUTPUT_GRAD_W", cp.output_grad_w), }); return jit; @@ -124,7 +125,7 @@ namespace kernel_selector auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; - FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, ROUND_ROBIN, true, !orgParams.bias.empty()); + FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, DEFAULT, true, !orgParams.bias.empty()); if (newParams.use_momentum) { kernel.arguments.push_back({ ArgumentDescriptor::Types::PREV_WEIGHTS_GRADIENT, 0 }); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.h index bf5100f..1331afd 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.h @@ -33,7 +33,8 @@ namespace kernel_selector uSize dilation; uSize padding; uint32_t split = 1; - bool depthwiseSeparableOpt = false; + bool depthwise_separable_opt = false; + bool output_grad_w = false; virtual std::string to_string() const override; @@ -52,7 +53,7 @@ namespace kernel_selector k.EnableDilation(); } - if (depthwiseSeparableOpt) + if (depthwise_separable_opt) { k.EnableDepthwiseSeparableOpt(); } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.h 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.h index 3c95c4d..be09a66 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.h @@ -26,6 +26,8 @@ namespace kernel_selector { ConvolutionGradWeightsKernelRef() : ConvolutionGradWeightsKernelBase("convolution_grad_weights_ref") {} virtual ~ConvolutionGradWeightsKernelRef() {} + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.h index 23a149b..904884b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.h @@ -28,6 +28,8 @@ namespace kernel_selector { virtual DispatchData SetDefault(const convolution_grad_weights_params& params) const override; virtual bool Validate(const Params& p, const optional_params& o) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_base.cpp index cbc0bd7..242cc9a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_base.cpp @@ -56,8 +56,9 @@ namespace kernel_selector MakeJitConstant("DILATION", dp.dilation), MakeJitConstant("FILTER_ARRAY_NUM", dp.split), MakeJitConstant("INPUT0_OFFSET_WITH_PADDING", input_offset_with_padding), - MakeJitConstant("DEPTHWISE_SEPARABLE_OPT", dp.depthwiseSeparableOpt), - MakeJitConstant("FUSED_ELTWISE", dp.fused_eltwise) + MakeJitConstant("DEPTHWISE_SEPARABLE_OPT", dp.depthwise_separable_opt), + MakeJitConstant("FUSED_ELTWISE", dp.fused_eltwise), + MakeJitConstant("GROUPED", (dp.groups > 1) ? 
1 : 0) }); return jit; @@ -120,7 +121,7 @@ namespace kernel_selector auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; - FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, ROUND_ROBIN, true, !newParams.bias.empty()); + FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, DEFAULT, true, !newParams.bias.empty()); kernel.arguments.push_back({ ArgumentDescriptor::Types::SPLIT, 0 }); if (orgParams.fused_eltwise) kernel.arguments.push_back({ ArgumentDescriptor::Types::INPUT, 1 }); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_base.h index 206614a..46a1527 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_base.h @@ -33,7 +33,8 @@ namespace kernel_selector uSize dilation; uSize padding; uint32_t split = 1; - bool depthwiseSeparableOpt = false; + uint32_t groups = 1; + bool depthwise_separable_opt = false; bool fused_eltwise = false; virtual std::string to_string() const override; @@ -53,11 +54,16 @@ namespace kernel_selector k.EnableDilation(); } - if (depthwiseSeparableOpt) + if (depthwise_separable_opt) { k.EnableDepthwiseSeparableOpt(); } + if (groups > 1 && !depthwise_separable_opt) + { + k.EnableGroupedConvolution(); + } + return k; } }; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_bfyx_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_bfyx_opt.h index 1c18e15..178cb34 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_bfyx_opt.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_bfyx_opt.h @@ -26,9 +26,8 @@ namespace kernel_selector { DeconvolutionKernel_bfyx_opt() : DeconvolutionKernelBase("deconvolution_gpu_bfyx_opt") {} virtual ~DeconvolutionKernel_bfyx_opt() {} - virtual ParamsKey GetSupportedKey() const override; - protected: + virtual ParamsKey GetSupportedKey() const override; CommonDispatchData SetDefault(const deconvolution_params& params) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_ref.cpp index fd5c28f..73e25b2 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_ref.cpp @@ -42,6 +42,7 @@ namespace kernel_selector k.EnableSplitSupport(); k.EnableDepthwiseSeparableOpt(); k.EnableGradient(); + k.EnableGroupedConvolution(); return k; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_ref.h index 2d2c89d..ae6a172 100644 --- 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_ref.h @@ -26,10 +26,9 @@ namespace kernel_selector { DeconvolutionKernelRef() : DeconvolutionKernelBase("deconvolution_gpu_ref") {} virtual ~DeconvolutionKernelRef() {} - virtual ParamsKey GetSupportedKey() const override; - protected: + virtual ParamsKey GetSupportedKey() const override; CommonDispatchData SetDefault(const deconvolution_params& params) const override; JitConstants GetJitConstants(const deconvolution_params& params) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.cpp new file mode 100644 index 0000000..2f6f338 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.cpp @@ -0,0 +1,85 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "depth_to_space_kernel_ref.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector +{ + ParamsKey DepthToSpaceKernelRef::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + return k; + } + + CommonDispatchData DepthToSpaceKernelRef::SetDefault(const depth_to_space_params& params, const optional_params&) const + { + CommonDispatchData runInfo; + + std::vector<size_t> global = { params.output.Batch().v, params.output.Feature().v, params.output.Y().v * params.output.X().v }; + + auto local = GetOptimalLocalWorkGroupSizes(global); + + runInfo.gws0 = global[0]; + runInfo.gws1 = global[1]; + runInfo.gws2 = global[2]; + + runInfo.lws0 = local[0]; + runInfo.lws1 = local[1]; + runInfo.lws2 = local[2]; + + return runInfo; + } + + JitConstants DepthToSpaceKernelRef::GetJitConstants(const depth_to_space_params& params) const + { + JitConstants jit = MakeBaseParamsJitConstants(params); + + jit.AddConstant(MakeJitConstant("BLOCK_SIZE", params.block_size)); + + return jit; + } + + KernelsData DepthToSpaceKernelRef::GetKernelsData(const Params& params, const optional_params& options) const + { + KernelData kd = KernelData::Default<depth_to_space_params>(params); + depth_to_space_params& newParams = *static_cast<depth_to_space_params*>(kd.params.get()); + + assert(params.GetType() == KernelType::DEPTH_TO_SPACE); + + auto runInfo = SetDefault(newParams, options); + auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); + auto cldnn_jit = GetJitConstants(newParams); + std::string jit =
CreateJit(kernelName, cldnn_jit, entry_point); + + auto& kernel = kd.kernels[0]; + + FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point); + + kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE; + + return{ kd }; + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.h new file mode 100644 index 0000000..9db06c0 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.h @@ -0,0 +1,56 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "common_kernel_base.h" + +namespace kernel_selector +{ + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // depth_to_space_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct depth_to_space_params : public base_params + { + depth_to_space_params() : base_params(KernelType::DEPTH_TO_SPACE) {} + + size_t block_size; + + virtual ParamsKey GetParamsKey() const + { + return base_params::GetParamsKey(); + } + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // depth_to_space_optional_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct depth_to_space_optional_params : optional_params + { + depth_to_space_optional_params() : optional_params(KernelType::DEPTH_TO_SPACE) {} + }; + + class DepthToSpaceKernelRef : public common_kernel_base + { + public: + DepthToSpaceKernelRef() : common_kernel_base("depth_to_space_ref") {} + virtual ~DepthToSpaceKernelRef() {} + virtual JitConstants GetJitConstants(const depth_to_space_params& params) const; + virtual CommonDispatchData SetDefault(const depth_to_space_params& params, const optional_params&) const; + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + virtual ParamsKey GetSupportedKey() const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_selector.cpp new file mode 100644 index 0000000..f50ba40 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_selector.cpp @@ -0,0 +1,31 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "depth_to_space_kernel_selector.h" +#include "depth_to_space_kernel_ref.h" + +namespace kernel_selector { + + depth_to_space_kernel_selector::depth_to_space_kernel_selector() + { + Attach<DepthToSpaceKernelRef>(); + } + + KernelsData depth_to_space_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const + { + return GetNaiveBestKernel(params, options, KernelType::DEPTH_TO_SPACE); + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_selector.h new file mode 100644 index 0000000..1ddb54d --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_selector.h @@ -0,0 +1,37 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector +{ + class depth_to_space_kernel_selector : public kernel_selector_base + { + public: + static depth_to_space_kernel_selector &Instance() { + static depth_to_space_kernel_selector instance_; + return instance_; + } + + depth_to_space_kernel_selector(); + + virtual ~depth_to_space_kernel_selector() {} + + virtual KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_base.cpp new file mode 100644 index 0000000..3e6a053 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_base.cpp @@ -0,0 +1,67 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+*/ + +#include "detection_output_kernel_base.h" + +namespace kernel_selector +{ + JitConstants DetectionOutputKernelBase::GetJitConstants(const detection_output_params & params) const + { + JitConstants jit = MakeBaseParamsJitConstants(params); + + const auto& detectOutParams = params.detectOutParams; + + jit.AddConstants({ + MakeJitConstant("NUM_IMAGES", detectOutParams.num_images), + MakeJitConstant("NUM_CLASSES", detectOutParams.num_classes), + MakeJitConstant("KEEP_TOP_K", detectOutParams.keep_top_k), + MakeJitConstant("TOP_K", detectOutParams.top_k), + MakeJitConstant("BACKGROUND_LABEL_ID", detectOutParams.background_label_id), + MakeJitConstant("CODE_TYPE", detectOutParams.code_type), + MakeJitConstant("CONF_SIZE_X", detectOutParams.conf_size_x), + MakeJitConstant("CONF_SIZE_Y", detectOutParams.conf_size_y), + MakeJitConstant("CONF_PADDING_X", detectOutParams.conf_padding_x), + MakeJitConstant("CONF_PADDING_Y", detectOutParams.conf_padding_y), + MakeJitConstant("SHARE_LOCATION", detectOutParams.share_location), + MakeJitConstant("VARIANCE_ENCODED_IN_TARGET", detectOutParams.variance_encoded_in_target), + MakeJitConstant("NMS_THRESHOLD", detectOutParams.nms_threshold), + MakeJitConstant("ETA", detectOutParams.eta), + MakeJitConstant("CONFIDENCE_THRESHOLD", detectOutParams.confidence_threshold), + MakeJitConstant("IMAGE_WIDTH", detectOutParams.input_width), + MakeJitConstant("IMAGE_HEIGH", detectOutParams.input_heigh), + MakeJitConstant("ELEMENTS_PER_THREAD", detectOutParams.elements_per_thread), + MakeJitConstant("PRIOR_COORD_OFFSET", detectOutParams.prior_coordinates_offset), + MakeJitConstant("PRIOR_INFO_SIZE", detectOutParams.prior_info_size), + MakeJitConstant("PRIOR_IS_NORMALIZED", detectOutParams.prior_is_normalized), + }); + + return jit; + } + + DetectionOutputKernelBase::DispatchData DetectionOutputKernelBase::SetDefault(const detection_output_params& params) const + { + DispatchData kd; + + kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16; + kd.gws0 = 0; + kd.gws1 = 0; + kd.gws2 = 0; + kd.lws0 = 0; + kd.lws1 = 0; + kd.lws2 = 0; + return kd; + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_base.h new file mode 100644 index 0000000..8d267d1 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_base.h @@ -0,0 +1,87 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#pragma once + +#include "common_kernel_base.h" +#include "kernel_selector_params.h" + +namespace kernel_selector +{ + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // detection_output_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct detection_output_params : public base_params + { + detection_output_params() : base_params(KernelType::DETECTION_OUTPUT), detectOutParams() {} + + struct DedicatedParams + { + uint32_t num_images; + uint32_t num_classes; + int32_t keep_top_k; + int32_t top_k; + int32_t background_label_id; + int32_t code_type; + int32_t conf_size_x; + int32_t conf_size_y; + int32_t conf_padding_x; + int32_t conf_padding_y; + int32_t elements_per_thread; + int32_t input_width; + int32_t input_heigh; + int32_t prior_coordinates_offset; + int32_t prior_info_size; + bool prior_is_normalized; + bool share_location; + bool variance_encoded_in_target; + float nms_threshold; + float eta; + float confidence_threshold; + }; + + DedicatedParams detectOutParams; + + virtual ParamsKey GetParamsKey() const + { + return base_params::GetParamsKey(); + } + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // detection_output_optional_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct detection_output_optional_params : optional_params + { + detection_output_optional_params() : optional_params(KernelType::DETECTION_OUTPUT) {} + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // DetectionOutputKernelBase + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + class DetectionOutputKernelBase : public common_kernel_base + { + public: + using common_kernel_base :: common_kernel_base; + virtual ~DetectionOutputKernelBase() {} + + using DispatchData = CommonDispatchData; + + protected: + JitConstants GetJitConstants(const detection_output_params& params) const; + virtual DispatchData SetDefault(const detection_output_params& params) const; + }; +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp new file mode 100644 index 0000000..b9e3463 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp @@ -0,0 +1,95 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "detection_output_kernel_ref.h" +#include "kernel_selector_utils.h" + +#define PRIOR_BOX_SIZE 4 // Each prior-box consists of [xmin, ymin, xmax, ymax]. + +namespace kernel_selector +{ + + ParamsKey DetectionOutputKernel::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + return k; + } + + CommonDispatchData DetectionOutputKernel::SetDefault(const detection_output_params& params) const + { + CommonDispatchData runInfo = DetectionOutputKernelBase::SetDefault(params); + + // Number of all work items is set to total number of bounding boxes - + // one bounding box is processed by one work item + size_t num_classes = (params.detectOutParams.share_location)? 1 : params.detectOutParams.num_classes; + + // Size of input0 (input location): if location is shared it is equal to the size of one class, + // else it has the size of all items for all classes + size_t bboxesNum = params.inputs[0].LogicalSize() / PRIOR_BOX_SIZE / num_classes; + // Work group size is set to number of bounding boxes per image for sorting purpose + // (access to one table with sorted values) + size_t work_group_size = bboxesNum / params.inputs[0].Batch().v; + + if (work_group_size > 256) + { + work_group_size = work_group_size / ((work_group_size / 256) + 1) + 1; + } + + bboxesNum = work_group_size * params.inputs[0].Batch().v; + + runInfo.gws0 = Align(bboxesNum, work_group_size); + runInfo.gws1 = 1; + runInfo.gws2 = 1; + + runInfo.lws0 = work_group_size; + runInfo.lws1 = 1; + runInfo.lws2 = 1; + + return runInfo; + } + + KernelsData DetectionOutputKernel::GetKernelsData(const Params& params, const optional_params& options) const + { + assert(params.GetType() == KernelType::DETECTION_OUTPUT && + options.GetType() == KernelType::DETECTION_OUTPUT); + + KernelData kd = KernelData::Default<detection_output_params>(params); + const detection_output_params& detectOutParams = static_cast<const detection_output_params&>(params); + DispatchData runInfo = SetDefault(detectOutParams); + + auto cldnnJit = GetJitConstants(detectOutParams); + auto entryPoint = GetEntryPoint(kernelName, detectOutParams.layerID, options); + auto jit = CreateJit(kernelName, cldnnJit, entryPoint); + + auto& kernel = kd.kernels[0]; + FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entryPoint); + kernel.arguments.push_back({ ArgumentDescriptor::Types::INPUT, 1 }); + kernel.arguments.push_back({ ArgumentDescriptor::Types::INPUT, 2 }); + + kd.estimatedTime = FORCE_PRIORITY_8; + + return{ kd }; + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.h new file mode 100644 index 0000000..42d342a --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.h @@ -0,0 +1,37 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "detection_output_kernel_base.h" + +namespace kernel_selector { + + class DetectionOutputKernel : public DetectionOutputKernelBase + { + public: + DetectionOutputKernel() : DetectionOutputKernelBase("detection_output") {} + virtual ~DetectionOutputKernel() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; + + private: + CommonDispatchData SetDefault(const detection_output_params& params) const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_selector.cpp new file mode 100644 index 0000000..19fe97b --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_selector.cpp @@ -0,0 +1,42 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "detection_output_kernel_selector.h" +#include "detection_output_kernel_ref.h" +#include "detection_output_kernel_sort.h" + +namespace kernel_selector +{ + detection_output_kernel_selector::detection_output_kernel_selector() + { + Attach<DetectionOutputKernel>(); + } + + KernelsData detection_output_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const + { + return GetNaiveBestKernel(params, options, KernelType::DETECTION_OUTPUT); + } + + detection_output_sort_kernel_selector::detection_output_sort_kernel_selector() + { + Attach<DetectionOutputKernel_sort>(); + } + + KernelsData detection_output_sort_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const + { + return GetNaiveBestKernel(params, options, KernelType::DETECTION_OUTPUT); + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_selector.h new file mode 100644 index 0000000..f2c8db7 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_selector.h @@ -0,0 +1,52 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector +{ + class detection_output_kernel_selector : public kernel_selector_base + { + public: + static detection_output_kernel_selector &Instance() { + static detection_output_kernel_selector instance_; + return instance_; + } + + detection_output_kernel_selector(); + + virtual ~detection_output_kernel_selector() {} + + virtual KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; + }; + + class detection_output_sort_kernel_selector : public kernel_selector_base + { + public: + static detection_output_sort_kernel_selector &Instance() { + static detection_output_sort_kernel_selector instance_; + return instance_; + } + + detection_output_sort_kernel_selector(); + + virtual ~detection_output_sort_kernel_selector() {} + + virtual KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; + }; +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_sort.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_sort.cpp new file mode 100644 index 0000000..b1d8fa9 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_sort.cpp @@ -0,0 +1,89 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "detection_output_kernel_sort.h" +#include "kernel_selector_utils.h" + +#define DETECTION_OUTPUT_ROW_SIZE 7 // Each detection consists of [image_id, label, confidence, xmin, ymin, xmax, ymax]. 
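+// An example row under this layout (illustrative values, not taken from these
+// sources): a detection in image 0 of class 15 with confidence 0.92 and a
+// normalized box spanning (0.10, 0.20) to (0.55, 0.80) occupies one such row:
+//   { 0.0f, 15.0f, 0.92f, 0.10f, 0.20f, 0.55f, 0.80f }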
+
+namespace kernel_selector
+{
+
+    ParamsKey DetectionOutputKernel_sort::GetSupportedKey() const
+    {
+        ParamsKey k;
+        k.EnableInputDataType(Datatype::F16);
+        k.EnableInputDataType(Datatype::F32);
+        k.EnableOutputDataType(Datatype::F16);
+        k.EnableOutputDataType(Datatype::F32);
+        k.EnableInputLayout(DataLayout::bfyx);
+        k.EnableOutputLayout(DataLayout::bfyx);
+        k.EnableTensorOffset();
+        k.EnableTensorPitches();
+        k.EnableBatching();
+        return k;
+    }
+
+    CommonDispatchData DetectionOutputKernel_sort::SetDefault(const detection_output_params& params) const
+    {
+        CommonDispatchData runInfo = DetectionOutputKernelBase::SetDefault(params);
+
+        unsigned class_num = params.detectOutParams.num_classes;
+        if (params.detectOutParams.share_location && params.detectOutParams.background_label_id == 0)
+        {
+            class_num -= 1;
+        }
+        const size_t bboxesNum = class_num * params.detectOutParams.num_images;
+        // Work group size is set to the number of bounding boxes per image
+        size_t work_group_size = class_num;
+
+        if (work_group_size > 256)
+        {
+            work_group_size = (work_group_size + work_group_size % 2) / (work_group_size / 256 + 1);
+        }
+
+        runInfo.gws0 = Align(bboxesNum, work_group_size);
+        runInfo.gws1 = 1;
+        runInfo.gws2 = 1;
+
+        runInfo.lws0 = work_group_size;
+        runInfo.lws1 = 1;
+        runInfo.lws2 = 1;
+
+        return runInfo;
+    }
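+    // A worked example of the clamping above (illustrative only; the class count
+    // is assumed): with 1000 classes and num_images = 1,
+    //   work_group_size = (1000 + 1000 % 2) / (1000 / 256 + 1) = 1000 / 4 = 250,
+    // so gws0 = Align(1000, 250) = 1000 and lws0 = 250.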
+
+    KernelsData DetectionOutputKernel_sort::GetKernelsData(const Params& params, const optional_params& options) const
+    {
+        assert(params.GetType() == KernelType::DETECTION_OUTPUT &&
+               options.GetType() == KernelType::DETECTION_OUTPUT);
+
+        KernelData kd = KernelData::Default(params);
+        const detection_output_params& detectOutParams = static_cast<const detection_output_params&>(params);
+        DispatchData runInfo = SetDefault(detectOutParams);
+
+        auto cldnnJit = GetJitConstants(detectOutParams);
+        auto entryPoint = GetEntryPoint(kernelName, detectOutParams.layerID, options);
+        auto jit = CreateJit(kernelName, cldnnJit, entryPoint);
+
+        auto& kernel = kd.kernels[0];
+        FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entryPoint);
+
+        kd.estimatedTime = FORCE_PRIORITY_8;
+
+        return{ kd };
+    }
+}
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_sort.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_sort.h
new file mode 100644
index 0000000..b06ea1c
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_sort.h
@@ -0,0 +1,37 @@
+/*
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include "detection_output_kernel_base.h"
+
+namespace kernel_selector {
+
+    class DetectionOutputKernel_sort : public DetectionOutputKernelBase
+    {
+    public:
+        DetectionOutputKernel_sort() : DetectionOutputKernelBase("detection_output_sort") {}
+        virtual ~DetectionOutputKernel_sort() {}
+
+        virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+
+    protected:
+        virtual ParamsKey GetSupportedKey() const override;
+
+    private:
+        CommonDispatchData SetDefault(const detection_output_params& params) const override;
+    };
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.cpp
new file mode 100644
index 0000000..28758b2
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.cpp
@@ -0,0 +1,301 @@
+/*
+// Copyright (c) 2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "eltwise_kernel_b_fs_yx_fsv4.h"
+#include "kernel_selector_utils.h"
+
+namespace kernel_selector {
+
+    ParamsKey EltwiseKernel_b_fs_yx_fsv4::GetSupportedKey() const
+    {
+        ParamsKey k;
+        k.EnableInputDataType(Datatype::INT8);
+        k.EnableInputDataType(Datatype::UINT8);
+        k.EnableOutputDataType(Datatype::INT8);
+        k.EnableOutputDataType(Datatype::UINT8);
+        k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
+        k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
+        k.EnableTensorOffset();
+        k.EnableTensorPitches();
+        k.EnableBatching();
+        k.EnableInt8Quantization();
+        k.EnableOutputCalibration();
+        k.EnableEltwiseStride();
+        return k;
+    }
+
+    EltwiseKernelBase::DispatchData EltwiseKernel_b_fs_yx_fsv4::SetDefault(const eltwise_params& params) const
+    {
+        DispatchData kd;
+
+        // Because of the very specific data requirements, we may linearize the data,
+        // i.e. use only one dimension, e.g. 'X'.
+
+        // GWS:
+        // we process 4*4 features per work item (4 int8 bytes per block_read4 read)
+        kd.gws0 = params.output.X().v * params.output.Y().v *
+                  params.output.Batch().v * params.output.Feature().v / (4*4);
+        kd.gws1 = 1;
+        kd.gws2 = 1;
+        // LWS:
+        kd.lws0 = 8;
+        kd.lws1 = 1;
+        kd.lws2 = 1;
+
+        kd.effiency = FORCE_PRIORITY_1;
+        return kd;
+    }
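+    // A worked example of the linearized GWS above (illustrative only; the shape
+    // is assumed): an output of b = 1, f = 32, y = 56, x = 56 holds
+    // 1 * 32 * 56 * 56 = 100352 int8 values. One work item handles 4 * 4 = 16 of
+    // them, so gws0 = 100352 / 16 = 6272, dispatched as 784 work groups of lws0 = 8.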
+
+    bool EltwiseKernel_b_fs_yx_fsv4::Validate(const Params& params, const optional_params& options) const
+    {
+        // The requirements for using the 'eltwise_b_fs_yx_fsv4' kernel are listed below:
+        // 1. No stride
+        // 2. All dimensions for all inputs are the same
+        // 3. No padding
+        // So, it can be linearized
+
+        if (!Parent::Validate(params, options)) {
+            return false;
+        }
+
+        KernelData kd = KernelData::Default(params);
+        eltwise_params& newParams = *static_cast<eltwise_params*>(kd.params.get());
+
+        // 1. No stride
+        if (!newParams.stride.empty()) {
+            return false;
+        }
+
+        for (size_t i = 0; i < newParams.inputs.size() - 1; i++)
+        {
+            // 2. All dimensions for all inputs are the same
+            if (!(newParams.inputs[i] == newParams.inputs[i + 1])) {
+                return false;
+            }
+        }
+
+        const auto& in = newParams.inputs[0];
+        for (size_t i = 0; i < in.Dimentions(); i++)
+        {
+            // 3. No padding
+            if ((in.GetDims()[i].pad.before != 0) ||
+                (in.GetDims()[i].pad.after != 0)) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    JitConstants EltwiseKernel_b_fs_yx_fsv4::GetJitConstants(const eltwise_params& params) const
+    {
+        JitConstants jit = MakeBaseParamsJitConstants(params);
+
+        if (params.inputs[0].GetDType() == Datatype::UINT8) {
+            // Special handler for unsigned types
+            jit.AddConstants({
+                MakeJitConstant("ELTW_UNSIGNED", 1)
+            });
+        }
+
+        ///////////////
+        jit.AddConstants({
+            MakeJitConstant("ELTWISE_LAYOUT_BASED", params.layoutBased),
+            MakeJitConstant("QUANTIZATION_TERM", params.int8_quantization),
+        });
+
+        if (params.int8_quantization)
+        {
+            if (params.output_calibration)
+            {
+                jit.AddConstant(MakeJitConstant("CALIBRATION_TERM", params.output_calibration));
+                jit.AddConstant(MakeJitConstant("O_QF", params.output_calibration_factors[0]));
+
+            }
+            else
+                jit.AddConstants({ MakeJitConstant("O_QF", params.output_quantization_factor) });
+        }
+
+        std::string inputs_decls;
+        auto& updateInputs = params.updateInputIds;
+
+        for (size_t i = 0; i < params.inputs.size(); i++)
+        {
+            // const should be added only to inputs which will not be updated
+            std::string const_str = "const";
+            for (size_t update_input_idx = 0; update_input_idx < updateInputs.size(); update_input_idx++)
+            {
+                if (updateInputs[update_input_idx].inputId == i)
+                {
+                    const_str = "";
+                    break;
+                }
+            }
+
+            inputs_decls += const_str + " __global " + toCLType(params.inputs[i].GetDType()) + "* input" + std::to_string(i) + ", ";
+        }
+
+        jit.AddConstant(MakeJitConstant("INPUTS_DECLS", inputs_decls));
+        jit.AddConstant(MakeJitConstant("ELTWISE_NO_PITCH_SAME_DIMS", CheckInputsOutputNoPitchSameDims(params)));
+
+        std::string do_eltwise;
+
+        auto& operations = params.operations;
+        auto& coefficients = params.coefficients;
+
+        for (size_t op_num = 0; op_num < operations.size(); op_num++)
+        {
+            const std::string op_num_str = std::to_string(op_num);
+            const auto& ew = operations[op_num];
+
+            for (size_t input_idx = 0; input_idx < ew.inputs.size(); input_idx++)
+            {
+                const auto& input = ew.inputs[input_idx];
+                const std::string name = "INPUT_" + op_num_str + "_" + std::to_string(input_idx);
+                switch (input.mode)
+                {
+                case EltwiseInputMode::SCALAR:
+                    jit.AddConstant(MakeJitConstant(name, input.scalar));
+                    break;
+                case EltwiseInputMode::INPUT_BUFFER:
+                    jit.AddConstant(MakeJitConstant(name, "GET_INPUT(input" + std::to_string(input.index) + ", INPUT" + std::to_string(input.index) + ")"));
+                    break;
+                case EltwiseInputMode::OUTPUT_BUFFER:
+                    jit.AddConstant(MakeJitConstant(name, "output[GET_INDEX(OUTPUT, )]"));
+                    break;
+                case EltwiseInputMode::UNORDERED_ACCESS_INPUT_BUFFER:
+                    jit.AddConstant(MakeJitConstant(name, "input" + std::to_string(input.index) + "[(size_t)tmp" + std::to_string(input.tmpIndex) + "]"));
+                    break;
+                case EltwiseInputMode::INTERMEDIATE_RESULTS_INDEX:
+                    jit.AddConstant(MakeJitConstant(name, "tmp" + std::to_string(input.tmpIndex)));
+                    break;
+                default:
+                    break;
+                }
+            }
+
+            std::string input0_str, input1_str, cast_type, op;
+
+            cast_type = "(int16)";
+            op = "const int16 tmp" + op_num_str + " = ";
+
+            input0_str = cast_type + "INPUT_" + op_num_str + "_0";
+            input1_str = cast_type + "INPUT_" + op_num_str + "_1";
+
+            if (ew.mode == EltwiseMode::ADD)
+            {
+                std::vector<std::string>
coeff_strings(ew.inputs.size(), ""); + for (size_t input_idx = 0; input_idx < ew.inputs.size(); input_idx++) + { + const auto& input = ew.inputs[input_idx]; + if (input.mode == EltwiseInputMode::INPUT_BUFFER && input.index < coefficients.size()) + { + const float c = coefficients[input.index]; + if (c != 1.0f) + coeff_strings[input_idx] = cast_type + "(" + std::to_string(c) + ")*"; + } + } + + input0_str = coeff_strings[0] + input0_str; + input1_str = coeff_strings[1] + input1_str; + } + + + switch (ew.mode) + { + case EltwiseMode::ADD: op += input0_str + " + " + input1_str; break; + case EltwiseMode::SUB: op += input0_str + " - " + input1_str; break; + case EltwiseMode::MUL: op += input0_str + " * " + input1_str; break; + case EltwiseMode::DIV: op += input0_str + " / " + input1_str; break; + case EltwiseMode::MODULU: + case EltwiseMode::MIN: + case EltwiseMode::MAX: + { + auto mode = (ew.mode == EltwiseMode::MODULU ? "mod" : (ew.mode == EltwiseMode::MIN ? "min" : "max")); + auto input_0_type = params.inputs[0].GetDType(); + auto input_1_type = params.inputs[1].GetDType(); + + // input_0 == int + if (input_0_type == kernel_selector::Datatype::INT8 || + input_0_type == kernel_selector::Datatype::UINT8) + { + // input_0 == int && input_1 == int + if (input_1_type == kernel_selector::Datatype::INT8 || + input_1_type == kernel_selector::Datatype::UINT8) + { + if (ew.mode == EltwiseMode::MODULU) + op += input0_str + " % " + input1_str; + else + op += cast_type + mode + "(" + input0_str + ", " + input1_str + ")"; + } + // input_0 == int && input_1 != int + else + { + op += cast_type + "f" + mode + "(convert_float(" + input0_str + "), " + input1_str + ")"; + } + } + // input_0 != int && input_1 == int + else if (input_1_type == kernel_selector::Datatype::INT8 || + input_1_type == kernel_selector::Datatype::UINT8) + { + op += cast_type + "f" + mode + "(" + input0_str + ", convert_float(" + input1_str + "))"; + } + // input_0 != int && input_1 != int + else + { + op += cast_type + "f" + mode + "(" + input0_str + ", " + input1_str + ")"; + } + } break; + case EltwiseMode::POW: op += cast_type + "pow(" + input0_str + ", " + input1_str + ")"; break; + case EltwiseMode::SQRT: op += cast_type + "sqrt(" + input0_str + ")"; break; + case EltwiseMode::RSQRT: op += cast_type + "1/sqrt(" + input0_str + ")"; break; + case EltwiseMode::ASSIGN: op += input0_str; break; + default: + break; + } + + std::string opname = "OPERATION" + op_num_str; + jit.AddConstant(MakeJitConstant(opname, op)); + do_eltwise += "\\\n\t" + opname + ";"; + } + + for (size_t update_input_idx = 0; update_input_idx < updateInputs.size(); update_input_idx++) + do_eltwise += "\\\n\tinput" + std::to_string(updateInputs[update_input_idx].inputId) + + "[GET_INDEX(INPUT, " + std::to_string(updateInputs[update_input_idx].inputId) + + ")] = tmp" + std::to_string(updateInputs[update_input_idx].tmpId) + ";"; + + do_eltwise += "\\\n\tres = tmp" + std::to_string(operations.size() - 1) + ";"; + + jit.AddConstant(MakeJitConstant("DO_ELTWISE", do_eltwise)); + + if (params.layoutBased || params.int8_quantization) + { + jit.Merge(GetTensorFriendlyWorkGroupsJit(params.inputs[0])); + } + + if (!params.stride.empty()) + { + jit.AddConstant(MakeJitConstant("INPUT_STRIDED", 1)); + } + + /////////////// + return jit; + } + + KernelsData EltwiseKernel_b_fs_yx_fsv4::GetKernelsData(const Params& params, const optional_params& options) const + { + return GetCommonKernelsData(params, options); + } +} diff --git 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.h new file mode 100644 index 0000000..1032b68 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.h @@ -0,0 +1,37 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "eltwise_kernel_base.h" + +namespace kernel_selector +{ + class EltwiseKernel_b_fs_yx_fsv4 : public EltwiseKernelBase + { + public: + using Parent = EltwiseKernelBase; + EltwiseKernel_b_fs_yx_fsv4() : EltwiseKernelBase("eltwise_b_fs_yx_fsv4") {} + virtual ~EltwiseKernel_b_fs_yx_fsv4() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + protected: + virtual ParamsKey GetSupportedKey() const override; + virtual bool Validate(const Params& params, const optional_params& options) const override; + JitConstants GetJitConstants(const eltwise_params& params) const override; + virtual DispatchData SetDefault(const eltwise_params& params) const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp index 5feac0c..85cedc3 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -15,7 +15,7 @@ */ #include "eltwise_kernel_base.h" -#include "kernel_selector_utils.h" +#include "kernel_selector_utils.h" namespace kernel_selector { @@ -31,6 +31,16 @@ namespace kernel_selector case EltwiseMode::MAX: case EltwiseMode::POW: case EltwiseMode::MODULU: + case EltwiseMode::EQ: + case EltwiseMode::NE: + case EltwiseMode::LT: + case EltwiseMode::LE: + case EltwiseMode::GT: + case EltwiseMode::GE: + case EltwiseMode::LOGIC_AND: + case EltwiseMode::LOGIC_OR: + case EltwiseMode::LOGIC_XOR: + case EltwiseMode::SQUARED_DIFF: return 2; case EltwiseMode::SQRT: case EltwiseMode::RSQRT: @@ -54,6 +64,16 @@ namespace kernel_selector k.EnableOutputCalibration(); } + if (!stride.empty()) + { + k.EnableEltwiseStride(); + } + + if (broadcast) + { + k.EnableEltwiseBroadcast(); + } + return k; } @@ -109,6 +129,7 @@ namespace kernel_selector jit.AddConstants({ MakeJitConstant("ELTWISE_LAYOUT_BASED", params.layoutBased), MakeJitConstant("QUANTIZATION_TERM", params.int8_quantization), + MakeJitConstant("ELTWISE_BROADCAST", params.broadcast), }); if (params.int8_quantization) @@ -140,6 +161,11 @@ namespace kernel_selector } inputs_decls += const_str + " __global " + toCLType(params.inputs[i].GetDType()) + "* input" + std::to_string(i) + ", "; + if (!params.stride.empty()) + { + jit.AddConstant(MakeJitConstant("INPUT" + std::to_string(i) + "_STRIDE_X", params.stride[i].x)); + jit.AddConstant(MakeJitConstant("INPUT" + std::to_string(i) + "_STRIDE_Y", params.stride[i].y)); + } if (useVload8) { vload_decls += "\\\n\tconst " + toCLType(params.inputs[i].GetDType()) + "8 in" + std::to_string(i); @@ -196,7 +222,7 @@ namespace kernel_selector } } - std::string input0_str, input1_str, cast_type, op; + std::string input0_str, input1_str, cast_type, output_cast, op; if (useVload8) { @@ -214,6 +240,11 @@ namespace kernel_selector op = "const UNIT_TYPE tmp" + op_num_str + " = "; } + if (params.output.GetDType() == Datatype::INT8 && !params.int8_quantization) { + output_cast = "(char)"; + cast_type = "(" + toCLType(params.inputs[op_num].GetDType()) + ")"; + } + input0_str = cast_type + "INPUT_" + op_num_str + "_0"; input1_str = cast_type + "INPUT_" + op_num_str + "_1"; @@ -238,17 +269,67 @@ namespace kernel_selector switch (ew.mode) { - case EltwiseMode::ADD: op += input0_str + " + " + input1_str; break; - case EltwiseMode::SUB: op += input0_str + " - " + input1_str; break; - case EltwiseMode::MUL: op += input0_str + " * " + input1_str; break; - case EltwiseMode::DIV: op += input0_str + " / " + input1_str; break; - case EltwiseMode::MODULU: op += cast_type + "fmod(" + input0_str + ", " + input1_str + ")"; break; - case EltwiseMode::MIN: op += cast_type + "fmin(" + input0_str + ", " + input1_str + ")"; break; - case EltwiseMode::MAX: op += cast_type + "fmax(" + input0_str + ", " + input1_str + ")"; break; - case EltwiseMode::POW: op += cast_type + "pow(" + input0_str + ", " + input1_str + ")"; break; - case EltwiseMode::SQRT: op += cast_type + "sqrt(" + input0_str + ")"; break; - case EltwiseMode::RSQRT: op += cast_type + "1/sqrt(" + input0_str + ")"; break; - case EltwiseMode::ASSIGN: op += input0_str; break; + case EltwiseMode::ADD: op += input0_str + " + " + input1_str; break; + case EltwiseMode::SUB: op += input0_str + " - " + input1_str; break; + case EltwiseMode::MUL: op += input0_str + " * " + input1_str; break; + case EltwiseMode::DIV: op += input0_str + " / " + input1_str; break; + case EltwiseMode::MODULU: + case EltwiseMode::MIN: + case EltwiseMode::MAX: + { + auto mode = (ew.mode == 
EltwiseMode::MODULU ? "mod" : (ew.mode == EltwiseMode::MIN ? "min" : "max" )); + auto input_0_type = params.inputs[0].GetDType(); + auto input_1_type = params.inputs[1].GetDType(); + + // input_0 == int + if (input_0_type == kernel_selector::Datatype::INT8 || + input_0_type == kernel_selector::Datatype::INT32 || + input_0_type == kernel_selector::Datatype::INT64) + { + // input_0 == int && input_1 == int + if (input_1_type == kernel_selector::Datatype::INT8 || + input_1_type == kernel_selector::Datatype::INT32 || + input_1_type == kernel_selector::Datatype::INT64) + { + if (ew.mode == EltwiseMode::MODULU) + op += input0_str + " % " + input1_str; + else + op += cast_type + mode + "(" + input0_str + ", " + input1_str + ")"; + } + // input_0 == int && input_1 != int + else + { + op += cast_type + "f" + mode + "(convert_float(" + input0_str + "), " + input1_str + ")"; + } + } + // input_0 != int && input_1 == int + else if ( input_1_type == kernel_selector::Datatype::INT8 || + input_1_type == kernel_selector::Datatype::INT32 || + input_1_type == kernel_selector::Datatype::INT64) + { + op += cast_type + "f" + mode + "(" + input0_str + ", convert_float(" + input1_str + "))"; + } + // input_0 != int && input_1 != int + else + { + op += cast_type + "f" + mode + "(" + input0_str + ", " + input1_str + ")"; + } + } break; + case EltwiseMode::POW: op += cast_type + "pow(" + input0_str + ", " + input1_str + ")"; break; + case EltwiseMode::SQRT: op += cast_type + "sqrt(" + input0_str + ")"; break; + case EltwiseMode::RSQRT: op += cast_type + "1/sqrt(" + input0_str + ")"; break; + case EltwiseMode::SQUARED_DIFF: op += cast_type + "((" + input0_str + " - " + input1_str + ")" + " * (" + input0_str + " - " + input1_str + "))"; break; + case EltwiseMode::EQ: op += output_cast + "(" + input0_str + " == " + input1_str + ")"; break; + case EltwiseMode::NE: op += output_cast + "(" + input0_str + " != " + input1_str + ")"; break; + case EltwiseMode::LT: op += output_cast + "(" + input0_str + " < " + input1_str + ")"; break; + case EltwiseMode::LE: op += output_cast + "(" + input0_str + " <= " + input1_str + ")"; break; + case EltwiseMode::GT: op += output_cast + "(" + input0_str + " > " + input1_str + ")"; break; + case EltwiseMode::GE: op += output_cast + "(" + input0_str + " >= " + input1_str + ")"; break; + case EltwiseMode::LOGIC_AND: op += output_cast + "(" + input0_str + " && " + input1_str + ")"; break; + case EltwiseMode::LOGIC_OR: op += output_cast + "(" + input0_str + " || " + input1_str + ")"; break; + case EltwiseMode::LOGIC_XOR: op += output_cast + "(!" + input0_str + " != !" 
+ input1_str + ")"; break; + case EltwiseMode::ASSIGN: op += input0_str; break; default: break; } @@ -259,7 +340,7 @@ namespace kernel_selector } for (size_t update_input_idx = 0; update_input_idx < updateInputs.size(); update_input_idx++) - do_eltwise += "\\\n\tinput" + std::to_string(updateInputs[update_input_idx].inputId) + + do_eltwise += "\\\n\tinput" + std::to_string(updateInputs[update_input_idx].inputId) + "[GET_INDEX(INPUT, " + std::to_string(updateInputs[update_input_idx].inputId) + ")] = tmp" + std::to_string(updateInputs[update_input_idx].tmpId) + ";"; @@ -267,9 +348,14 @@ namespace kernel_selector jit.AddConstant(MakeJitConstant("DO_ELTWISE", do_eltwise)); - if (params.layoutBased || params.int8_quantization) + if (params.layoutBased || params.int8_quantization || params.broadcast) + { + jit.Merge(GetTensorFriendlyWorkGroupsJit(params.output)); + } + + if (!params.stride.empty()) { - jit.Merge(GetTensorFriendlyWorkGroupsJit(params.inputs[0])); + jit.AddConstant(MakeJitConstant("INPUT_STRIDED", 1)); } return jit; @@ -284,12 +370,17 @@ namespace kernel_selector { DispatchData kd; - if (params.layoutBased || params.int8_quantization) + if (params.layoutBased || params.int8_quantization || params.broadcast) { - auto global = GetTensorFriendlyWorkGroups(params.inputs[0]); + auto global = GetTensorFriendlyWorkGroups(params.output); kd.gws0 = global[0]; kd.gws1 = global[1]; kd.gws2 = global[2]; + if (!params.stride.empty()) + { + kd.gws0 /= params.stride[0].x; + kd.gws0 /= params.stride[0].y; + } } else if (CheckInputsOutputNoPitchSameDims(params)) { @@ -346,7 +437,7 @@ namespace kernel_selector kernel.workGroups.global = { runInfo.gws0, runInfo.gws1, runInfo.gws2 }; kernel.workGroups.local = { runInfo.lws0, runInfo.lws1, runInfo.lws2 }; - kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, ROUND_ROBIN); + kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT); kernel.arguments = GetArgsDesc((uint32_t)newParams.inputs.size(), false, false, newParams.int8_quantization, newParams.output_calibration); kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.h index 1611408..458f3b9 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.h @@ -90,12 +90,14 @@ namespace kernel_selector std::vector operations; std::vector coefficients; std::vector updateInputIds; - + std::vector stride; + bool layoutBased = false; bool int8_quantization = false; bool output_calibration = false; float output_quantization_factor = 1.0f; - + bool broadcast = false; + MultiDataTensor output_calibration_factors; virtual ParamsKey GetParamsKey() const; }; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.cpp index 571a013..e644505 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.cpp @@ 
-1,5 +1,5 @@ /* -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2018-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ */ #include "eltwise_kernel_fs_bs_yx_bsv4_fsv32.h" -#include "kernel_selector_utils.h" +#include "kernel_selector_utils.h" namespace kernel_selector { @@ -31,6 +31,7 @@ namespace kernel_selector { k.EnableBatching(); k.EnableInt8Quantization(); k.EnableOutputCalibration(); + k.EnableEltwiseStride(); return k; } @@ -46,6 +47,7 @@ namespace kernel_selector { kd.lws1 = 1; kd.lws2 = 8; + kd.effiency = FORCE_PRIORITY_3; return kd; } @@ -100,6 +102,12 @@ namespace kernel_selector { } inputs_decls += const_str + " __global " + toCLType(params.inputs[i].GetDType()) + "* input" + std::to_string(i) + ", "; + + if (!params.stride.empty()) + { + jit.AddConstant(MakeJitConstant("INPUT" + std::to_string(i) + "_STRIDE_X", params.stride[i].x)); + jit.AddConstant(MakeJitConstant("INPUT" + std::to_string(i) + "_STRIDE_Y", params.stride[i].y)); + } } jit.AddConstant(MakeJitConstant("INPUTS_DECLS", inputs_decls)); @@ -177,17 +185,67 @@ namespace kernel_selector { switch (ew.mode) { - case EltwiseMode::ADD: op += input0_str + " + " + input1_str; break; - case EltwiseMode::SUB: op += input0_str + " - " + input1_str; break; - case EltwiseMode::MUL: op += input0_str + " * " + input1_str; break; - case EltwiseMode::DIV: op += input0_str + " / " + input1_str; break; - case EltwiseMode::MODULU: op += cast_type + "fmod(" + input0_str + ", " + input1_str + ")"; break; - case EltwiseMode::MIN: op += cast_type + "fmin(" + input0_str + ", " + input1_str + ")"; break; - case EltwiseMode::MAX: op += cast_type + "fmax(" + input0_str + ", " + input1_str + ")"; break; - case EltwiseMode::POW: op += cast_type + "pow(" + input0_str + ", " + input1_str + ")"; break; - case EltwiseMode::SQRT: op += cast_type + "sqrt(" + input0_str + ")"; break; - case EltwiseMode::RSQRT: op += cast_type + "1/sqrt(" + input0_str + ")"; break; - case EltwiseMode::ASSIGN: op += input0_str; break; + case EltwiseMode::ADD: op += input0_str + " + " + input1_str; break; + case EltwiseMode::SUB: op += input0_str + " - " + input1_str; break; + case EltwiseMode::MUL: op += input0_str + " * " + input1_str; break; + case EltwiseMode::DIV: op += input0_str + " / " + input1_str; break; + case EltwiseMode::MODULU: + case EltwiseMode::MIN: + case EltwiseMode::MAX: + { + auto mode = (ew.mode == EltwiseMode::MODULU ? "mod" : (ew.mode == EltwiseMode::MIN ? 
"min" : "max")); + auto input_0_type = params.inputs[0].GetDType(); + auto input_1_type = params.inputs[1].GetDType(); + + // input_0 == int + if (input_0_type == kernel_selector::Datatype::INT8 || + input_0_type == kernel_selector::Datatype::INT32 || + input_0_type == kernel_selector::Datatype::INT64) + { + // input_0 == int && input_1 == int + if (input_1_type == kernel_selector::Datatype::INT8 || + input_1_type == kernel_selector::Datatype::INT32 || + input_1_type == kernel_selector::Datatype::INT64) + { + if (ew.mode == EltwiseMode::MODULU) + op += input0_str + " % " + input1_str; + else + op += cast_type + mode + "(" + input0_str + ", " + input1_str + ")"; + } + // input_0 == int && input_1 != int + else + { + op += cast_type + "f" + mode + "(convert_float(" + input0_str + "), " + input1_str + ")"; + } + } + // input_0 != int && input_1 == int + else if (input_1_type == kernel_selector::Datatype::INT8 || + input_1_type == kernel_selector::Datatype::INT32 || + input_1_type == kernel_selector::Datatype::INT64) + { + op += cast_type + "f" + mode + "(" + input0_str + ", convert_float(" + input1_str + "))"; + } + // input_0 != int && input_1 != int + else + { + op += cast_type + "f" + mode + "(" + input0_str + ", " + input1_str + ")"; + } + } break; + case EltwiseMode::POW: op += cast_type + "pow(" + input0_str + ", " + input1_str + ")"; break; + case EltwiseMode::SQRT: op += cast_type + "sqrt(" + input0_str + ")"; break; + case EltwiseMode::RSQRT: op += cast_type + "1/sqrt(" + input0_str + ")"; break; + case EltwiseMode::SQUARED_DIFF: op += cast_type + "((" + input0_str + " - " + input1_str + ")" + " * (" + input0_str + " - " + input1_str + "))"; break; + case EltwiseMode::EQ: op += cast_type + "(" + input0_str + " == " + input1_str + ")"; break; + case EltwiseMode::NE: op += cast_type + "(" + input0_str + " != " + input1_str + ")"; break; + case EltwiseMode::LT: op += cast_type + "(" + input0_str + " < " + input1_str + ")"; break; + case EltwiseMode::LE: op += cast_type + "(" + input0_str + " <= " + input1_str + ")"; break; + case EltwiseMode::GT: op += cast_type + "(" + input0_str + " > " + input1_str + ")"; break; + case EltwiseMode::GE: op += cast_type + "(" + input0_str + " >= " + input1_str + ")"; break; + case EltwiseMode::LOGIC_AND: op += cast_type + "(" + input0_str + " && " + input1_str + ")"; break; + case EltwiseMode::LOGIC_OR: op += cast_type + "(" + input0_str + " || " + input1_str + ")"; break; + case EltwiseMode::LOGIC_XOR: op += cast_type + "(!" + input0_str + " != !" 
+ input1_str + ")"; break; + case EltwiseMode::ASSIGN: op += input0_str; break; default: break; } @@ -211,6 +269,11 @@ namespace kernel_selector { jit.Merge(GetTensorFriendlyWorkGroupsJit(params.inputs[0])); } + if (!params.stride.empty()) + { + jit.AddConstant(MakeJitConstant("INPUT_STRIDED", 1)); + } + /////////////// return jit; } @@ -219,4 +282,4 @@ namespace kernel_selector { { return GetCommonKernelsData(params, options); } -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.h index b1fb3e9..7cd0fe6 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.h @@ -27,9 +27,9 @@ namespace kernel_selector virtual ~EltwiseKernel_fs_bs_yx_bsv4_fsv32() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; JitConstants GetJitConstants(const eltwise_params& params) const override; virtual DispatchData SetDefault(const eltwise_params& params) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_ref.cpp index 3a77765..6b1e6ea 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_ref.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -15,7 +15,7 @@ */ #include "eltwise_kernel_ref.h" -#include "kernel_selector_utils.h" +#include "kernel_selector_utils.h" namespace kernel_selector { @@ -40,6 +40,8 @@ namespace kernel_selector { k.EnableBatching(); k.EnableInt8Quantization(); k.EnableOutputCalibration(); + k.EnableEltwiseStride(); + k.EnableEltwiseBroadcast(); return k; } @@ -56,7 +58,8 @@ namespace kernel_selector { if (params.inputs[i].GetLayout() == DataLayout::fs_bs_yx_bsv4_fsv32) return false; } - if (params.output.GetLayout() == DataLayout::fs_bs_yx_bsv4_fsv32) + if (params.output.GetLayout() == DataLayout::fs_bs_yx_bsv4_fsv32 || + params.output.GetLayout() == DataLayout::b_fs_yx_fsv4) return false; return true; @@ -66,4 +69,4 @@ namespace kernel_selector { { return GetCommonKernelsData(params, options); } -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_ref.h index c2ccf05..4f89ba4 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_ref.h @@ -27,9 +27,9 @@ namespace kernel_selector virtual ~EltwiseKernelRef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; bool Validate(const Params& p, const optional_params& o) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_selector.cpp index cf75652..1f0e01e 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_selector.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_selector.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@
 #include "eltwise_kernel_ref.h"
 #include "eltwise_kernel_vload8.h"
 #include "eltwise_kernel_fs_bs_yx_bsv4_fsv32.h"
+#include "eltwise_kernel_b_fs_yx_fsv4.h"
 
 namespace kernel_selector
 {
@@ -26,6 +27,7 @@
         Attach<EltwiseKernelRef>();
         Attach<EltwiseKernel_vload8>();
         Attach<EltwiseKernel_fs_bs_yx_bsv4_fsv32>();
+        Attach<EltwiseKernel_b_fs_yx_fsv4>();
     }
 
     KernelsData eltwise_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_vload8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_vload8.cpp
index 5ceb750..cd5285e 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_vload8.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_vload8.cpp
@@ -123,7 +123,7 @@
         auto& kernel = kd.kernels[0];
         kernel.workGroups.global = { std::max(newParams.inputs[0].LogicalSize()/8, (size_t)1), 1, 1 };
         kernel.workGroups.local = GetOptimalLocalWorkGroupSizes(kernel.workGroups.global);
-        kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, ROUND_ROBIN);
+        kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT);
         kernel.arguments = GetArgsDesc((uint32_t)newParams.inputs.size(), false, false);
 
         kd.estimatedTime = FORCE_PRIORITY_8;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_vload8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_vload8.h
index 8f716ae..a369b22 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_vload8.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_vload8.h
@@ -27,10 +27,10 @@
         virtual ~EltwiseKernel_vload8() {}
 
         virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-        virtual ParamsKey GetSupportedKey() const override;
 
     protected:
+        virtual ParamsKey GetSupportedKey() const override;
         virtual bool Validate(const Params& p, const optional_params& o) const override;
         virtual JitConstants GetJitConstants(const eltwise_params& params) const override;
     };
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.cpp
index f126daa..57091fb 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.cpp
@@ -40,6 +40,7 @@
         k.EnableTensorOffset();
         k.EnableTensorPitches();
         k.EnableBatching();
+        k.EnableNonBiasTerm();
         return k;
     }
 
@@ -58,7 +59,7 @@
     EmbedKernelRef::DispatchData EmbedKernelRef::SetDefault(const embed_params& params) const
     {
         DispatchData kd;
-        std::vector<size_t> global = { params.inputs[0].Y().v , params.weights.OFM().v, params.inputs[0].Batch().v };
+        std::vector<size_t> global = { params.inputs[0].X().v , params.weights.OFM().v, params.inputs[0].Batch().v };
         std::vector<size_t> local = GetOptimalLocalWorkGroupSizes(global);
 
         kd.gws0 = global[0];
@@ -103,7 +104,7 @@
         auto& kernel = kd.kernels[0];
 
-
FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, ROUND_ROBIN, true, !newParams.bias.empty()); + FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, DEFAULT, true, !newParams.bias.empty()); kd.estimatedTime = runInfo.effiency; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.h index 6ff98b0..2df8446 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.h @@ -36,10 +36,10 @@ namespace kernel_selector { }; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; virtual JitConstants GetJitConstants(const embed_params& params) const; virtual DispatchData SetDefault(const embed_params& params) const; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_params.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_params.h index bb2a109..f4b6b4f 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_params.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_params.h @@ -17,6 +17,7 @@ #pragma once #include "weight_bias_params.h" +#include namespace kernel_selector { @@ -28,6 +29,22 @@ namespace kernel_selector { embed_params() : weight_bias_params(KernelType::EMBED) {} + + std::string to_string() const + { + std::stringstream s; + + s << base_params::to_string() << "_"; + if (bias.empty()) + { + s << "no_bias" << "_"; + } + else + { + s << "bias_" << bias[0].PhysicalSize() << "_"; + } + return s.str(); + } virtual ParamsKey GetParamsKey() const { return weight_bias_params::GetParamsKey(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_block_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_block_kernel_base.cpp index baed45d..24cc0ba 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_block_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_block_kernel_base.cpp @@ -15,8 +15,6 @@ */ #include "fully_connected_block_kernel_base.h" -#include "kernel_selector_utils.h" -#include "common_tools.h" namespace kernel_selector { @@ -32,4 +30,5 @@ namespace kernel_selector return cldnnJit; } + } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_MMAD.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_MMAD.cpp index fd2f617..dd07347 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_MMAD.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_MMAD.cpp @@ -15,7 +15,6 @@ */ #include "fully_connected_kernel_MMAD.h" -#include 
"kernel_selector_utils.h" namespace kernel_selector { @@ -39,7 +38,7 @@ namespace kernel_selector return k; } - std::unique_ptr FullyConnectedKernelMMAD::SetDefault(const fully_connected_params& params) const + FullyConnectedKernelMMAD::DispatchData FullyConnectedKernelMMAD::SetDefault(const fully_connected_params& params, int) const { auto runInfo = Parent::SetDefault(params); @@ -47,15 +46,15 @@ namespace kernel_selector const auto of_maps = params.output.Feature().v; const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size); - runInfo->gws0 = 1; - runInfo->gws1 = 1; - runInfo->gws2 = of_threads_per_batch * params.output.Batch().v; + runInfo.gws0 = 1; + runInfo.gws1 = 1; + runInfo.gws2 = of_threads_per_batch * params.output.Batch().v; - runInfo->lws0 = 1; - runInfo->lws1 = 1; - runInfo->lws2 = sub_group_size; + runInfo.lws0 = 1; + runInfo.lws1 = 1; + runInfo.lws2 = sub_group_size; - return std::move(runInfo); + return runInfo; } JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_params& params, const DispatchData& runInfo) const @@ -74,8 +73,17 @@ namespace kernel_selector KernelsData FullyConnectedKernelMMAD::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options, DataLayout::byxf_af32, - { WeightsLayout::os_is_yx_isa8_osv8_isv4 } - ); + + KernelsData res = {}; + for (size_t i = 0; i < autoTuneOptions.size(); i++) + { + KernelsData kd = GetTunedKernelsDataByIndex(params, options, DataLayout::byxf_af32, + { WeightsLayout::os_is_yx_isa8_osv8_isv4 }, DONT_USE_IF_HAVE_SOMETHING_ELSE, (int)i); + if (!kd.empty()) + { + res.emplace_back(kd[0]); + } + } + return res; } -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_MMAD.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_MMAD.h index 048ed23..5004c40 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_MMAD.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_MMAD.h @@ -28,10 +28,10 @@ namespace kernel_selector { FullyConnectedKernelMMAD() : Parent("fully_connected_gpu_MMAD") {} KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - + protected: + ParamsKey GetSupportedKey() const override; JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override; - std::unique_ptr SetDefault(const fully_connected_params& params) const override; + DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.cpp index 20e6e8d..9b4cbb7 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.cpp @@ -47,27 +47,27 @@ namespace kernel_selector return jit; } - std::unique_ptr FullyConnectedKernelBase::SetDefault(const 
fully_connected_params& params) const + FullyConnectedKernelBase::DispatchData FullyConnectedKernelBase::SetDefault(const fully_connected_params& params, int) const { - std::unique_ptr dispatchData = std::unique_ptr(new DispatchData()); - dispatchData->fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16; + DispatchData dispatchData; + dispatchData.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16; // Determine global work sizes. - dispatchData->gws0 = params.output.LogicalSize(); - dispatchData->gws1 = dispatchData->gws2 = 1; + dispatchData.gws0 = params.output.LogicalSize(); + dispatchData.gws1 = dispatchData.gws2 = 1; // Find largest positive local work size that is divider for global work size. - dispatchData->lws0 = std::min(std::max(dispatchData->gws0, static_cast(1)), static_cast(32)); - while (dispatchData->gws0 % dispatchData->lws0 != 0) + dispatchData.lws0 = std::min(std::max(dispatchData.gws0, static_cast(1)), static_cast(32)); + while (dispatchData.gws0 % dispatchData.lws0 != 0) { - --dispatchData->lws0; + --dispatchData.lws0; } - dispatchData->lws1 = dispatchData->lws2 = 1; + dispatchData.lws1 = dispatchData.lws2 = 1; - return std::move(dispatchData); + return dispatchData; } - KernelsData FullyConnectedKernelBase::GetCommonKernelsData(const Params& params, const optional_params& options, DataLayout dl, std::vector wl, float estimated_time) const + KernelsData FullyConnectedKernelBase::GetCommonKernelsData(const Params& params, const optional_params& options, DataLayout dl, std::vector wl, float estimated_time, const std::string exeMode, int autoTuneIndex) const { if (!Validate(params, options) || wl.empty()) @@ -117,15 +117,31 @@ namespace kernel_selector auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options); - const std::unique_ptr runInfo = SetDefault(newParams); - auto cldnn_jit = GetJitConstants(newParams, *runInfo.get()); + const DispatchData runInfo = SetDefault(newParams, autoTuneIndex); + auto cldnn_jit = GetJitConstants(newParams, runInfo); std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; - FillCLKernelData(kernel, *runInfo.get(), params.engineInfo, kernelName, jit, entry_point, ROUND_ROBIN, true, !orgParams.bias.empty(), 1, newParams.int8_quantization, newParams.output_calibration); + FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, exeMode, true, !orgParams.bias.empty(), 1, newParams.int8_quantization, newParams.output_calibration); kd.estimatedTime = estimated_time; - kd.autoTuneIndex = -1; + kd.autoTuneIndex = autoTuneIndex; return{ kd }; } + + std::string FullyConnectedKernelBase::GetAutoTuneOptions(int autoTuneIndex) const + { + if ((autoTuneIndex >= 0) && (autoTuneIndex < (int)autoTuneOptions.size())) + { + return autoTuneOptions[autoTuneIndex]; + } + + return DEFAULT; +} + + KernelsData FullyConnectedKernelBase::GetTunedKernelsDataByIndex(const Params& params, const optional_params& options, DataLayout dl, std::vector wl, float estimated_time, const int autoTuneIndex) const + { + return GetCommonKernelsData(params, options, dl, wl, estimated_time, GetAutoTuneOptions(autoTuneIndex), autoTuneIndex); + } + } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.h index d7d47e6..a4f32c2 100644 --- 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.h @@ -31,12 +31,29 @@ namespace kernel_selector virtual ~FullyConnectedKernelBase() {} struct DispatchData : public CommonDispatchData - {}; + { + uint32_t unit_byte_size; + const char* chunk_type; + uint32_t chunk_byte_size; + uint32_t units_per_chunk; + uint32_t bytes_per_sg_read; + uint32_t units_per_sg_read; + uint32_t responses_per_sg_exec; + uint32_t in_chunk_prefetch_size; + uint32_t filter_chunk_prefetch_size; + + uint32_t last_rg_size; + uint32_t rg_count; + }; + std::string GetAutoTuneOptions(int autoTuneIndex) const; + std::vector autoTuneOptions = { DEFAULT, NO_PRERA_SCH, AGE_BASED }; + virtual KernelsData GetTunedKernelsDataByIndex(const Params& params, const optional_params& options, DataLayout dl, std::vector wl, float estimated_time = DONT_USE_IF_HAVE_SOMETHING_ELSE, int autoTuneIndex = -1) const ; + protected: virtual JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const; - virtual std::unique_ptr SetDefault(const fully_connected_params& params) const; - KernelsData GetCommonKernelsData(const Params& params, const optional_params& optParams, DataLayout dl, std::vector wl, float estimated_time = DONT_USE_IF_HAVE_SOMETHING_ELSE) const; + virtual DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const; + KernelsData GetCommonKernelsData(const Params& params, const optional_params& optParams, DataLayout dl, std::vector wl, float estimated_time = DONT_USE_IF_HAVE_SOMETHING_ELSE, const std::string exeMode = DEFAULT, int autoTuneIndex = -1) const; bool Validate(const Params& p, const optional_params&) const override { @@ -48,4 +65,4 @@ namespace kernel_selector return true; } }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_gemm.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_gemm.cpp index 61d5edc..8b762f9 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_gemm.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_gemm.cpp @@ -15,7 +15,6 @@ */ #include "fully_connected_kernel_bf_io_gemm.h" -#include "kernel_selector_utils.h" namespace kernel_selector { @@ -38,9 +37,9 @@ namespace kernel_selector { return k; } - std::unique_ptr FullyConnected_bf_io_GEMM::SetDefault(const fully_connected_params& params) const + FullyConnected_bf_io_GEMM::DispatchData FullyConnected_bf_io_GEMM::SetDefault(const fully_connected_params& params, int autoTuneIndex) const { - auto runInfo = Parent::SetDefault(params); + auto runInfo = Parent::SetDefault(params, autoTuneIndex); const uint32_t localWorkSizeX = 64; const uint32_t globalWorkSizeX = localWorkSizeX; @@ -48,17 +47,17 @@ namespace kernel_selector { std::vector global = { globalWorkSizeX, params.output.Feature().v, params.output.Batch().v }; std::vector local = { localWorkSizeX, 1, 1 }; - runInfo->gws0 = global[0]; - runInfo->gws1 = global[1]; - runInfo->gws2 = 1; + runInfo.gws0 = global[0]; + runInfo.gws1 = global[1]; + runInfo.gws2 = 1; - runInfo->lws0 = local[0]; - runInfo->lws1 = local[1]; - 
runInfo->lws2 = 1; + runInfo.lws0 = local[0]; + runInfo.lws1 = local[1]; + runInfo.lws2 = 1; - runInfo->effiency = FORCE_PRIORITY_6; + runInfo.effiency = FORCE_PRIORITY_6; - return std::move(runInfo); + return runInfo; } JitConstants FullyConnected_bf_io_GEMM::GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const @@ -89,6 +88,16 @@ namespace kernel_selector { KernelsData FullyConnected_bf_io_GEMM::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options, DataLayout::bf, { WeightsLayout::oiyx }, FORCE_PRIORITY_6); + KernelsData res = {}; + for (size_t i = 0; i < autoTuneOptions.size(); i++) + { + KernelsData kd = GetTunedKernelsDataByIndex(params, options, DataLayout::bf, { WeightsLayout::oiyx }, FORCE_PRIORITY_6, (int)i); + if (!kd.empty()) + { + res.emplace_back(kd[0]); + } + } + + return res; } } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_gemm.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_gemm.h index 80b799b..fa56bb3 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_gemm.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_gemm.h @@ -17,9 +17,9 @@ #pragma once #include "fully_connected_kernel_base.h" - + namespace kernel_selector { - + class FullyConnected_bf_io_GEMM : public FullyConnectedKernelBase { public: @@ -27,10 +27,10 @@ namespace kernel_selector { FullyConnected_bf_io_GEMM() : Parent("fully_connected_gpu_bf_io_gemm") {} KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; protected: - std::unique_ptr SetDefault(const fully_connected_params& params) const override; + ParamsKey GetSupportedKey() const override; + DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override; JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_input_spatial.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_input_spatial.cpp index b19a923..383e1b5 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_input_spatial.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_input_spatial.cpp @@ -15,7 +15,6 @@ */ #include "fully_connected_kernel_bf_io_input_spatial.h" -#include "kernel_selector_utils.h" namespace kernel_selector { @@ -36,18 +35,18 @@ namespace kernel_selector return k; } - std::unique_ptr FullyConnected_bf_io_input_spatial::SetDefault(const fully_connected_params& arg) const + FullyConnected_bf_io_input_spatial::DispatchData FullyConnected_bf_io_input_spatial::SetDefault(const fully_connected_params& arg, int ) const { auto kd = FullyConnectedKernelBase::SetDefault(arg); - kd->gws0 = Align(arg.output.LogicalSize() / arg.inputs[0].Batch().v, 16); - kd->gws1 = arg.inputs[0].Batch().v; - kd->gws2 = 1; - kd->lws0 = 16; - kd->lws1 = 1; - 
-        kd->lws2 = 1;
+        kd.gws0 = Align(arg.output.LogicalSize() / arg.inputs[0].Batch().v, 16);
+        kd.gws1 = arg.inputs[0].Batch().v;
+        kd.gws2 = 1;
+        kd.lws0 = 16;
+        kd.lws1 = 1;
+        kd.lws2 = 1;
 
-        kd->effiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
+        kd.effiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
 
         const auto &input = arg.inputs[0];
         const auto &output = arg.output;
@@ -56,11 +55,11 @@
             {
                 if ((input.LogicalSize() / output.Batch().v >= 9216) && (output.Feature().v >= 4096))
                 {
-                    kd->effiency = FORCE_PRIORITY_1;
+                    kd.effiency = FORCE_PRIORITY_1;
                 }
             }
 
-        return std::move(kd);
+        return kd;
     }
 
     bool FullyConnected_bf_io_input_spatial::Validate(const Params& p, const optional_params& o) const
@@ -85,21 +84,42 @@ namespace kernel_selector
 
     KernelsData FullyConnected_bf_io_input_spatial::GetKernelsData(const Params& params, const optional_params& optParams) const
     {
+        KernelsData res = {};
         const auto& orgParams = static_cast<const fully_connected_params&>(params);
 
         const auto& input = orgParams.inputs[0];
         const auto& output = orgParams.output;
 
+        for (size_t i = 0; i < autoTuneOptions.size(); i++)
+        {
+
+            KernelsData kd = GetTunedKernelsDataByIndex(params, optParams, DataLayout::bf, { WeightsLayout::io }, DONT_USE_IF_HAVE_SOMETHING_ELSE, (int)i);
+            if (!kd.empty())
+            {
+                res.emplace_back(kd[0]);
+            }
+        }
+
         if (input.GetLayout() == DataLayout::bfyx)
         {
             if (input.Batch().v == 1 && output.Batch().v == 1)
             {
                 if ((input.LogicalSize() / output.Batch().v >= 9216) && (output.Feature().v >= 4096))
                 {
-                    return GetCommonKernelsData(params, optParams, DataLayout::bf, { WeightsLayout::io }, FORCE_PRIORITY_1);
+                    for (size_t i = 0; i < autoTuneOptions.size(); i++)
+                    {
+                        KernelsData kd = GetTunedKernelsDataByIndex(params, optParams, DataLayout::bf, { WeightsLayout::io }, FORCE_PRIORITY_1, (int)i+3);
+                        if (!kd.empty())
+                        {
+                            res.emplace_back(kd[0]);
+                        }
+                    }
                 }
             }
         }
 
-        return GetCommonKernelsData(params, optParams, DataLayout::bf, { WeightsLayout::io });
+
+
+
+        return res;
     }
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_input_spatial.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_input_spatial.h
index 5c6fddb..9d81bd8 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_input_spatial.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_input_spatial.h
@@ -26,10 +26,10 @@ namespace kernel_selector
         FullyConnected_bf_io_input_spatial() : FullyConnectedKernelBase("fully_connected_gpu_bf_io_input_spatial") {}
 
         KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-        ParamsKey GetSupportedKey() const override;
 
     protected:
+        ParamsKey GetSupportedKey() const override;
         bool Validate(const Params& p, const optional_params& o) const override;
-        std::unique_ptr<DispatchData> SetDefault(const fully_connected_params& arg) const override;
+        DispatchData SetDefault(const fully_connected_params& arg, int autoTuneIndex = -1) const override;
     };
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_ref.cpp
index 0c4efe2..3ede922 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_ref.cpp
@@ -15,7 +15,6 @@
  */
 
 #include "fully_connected_kernel_bf_io_ref.h"
-#include "kernel_selector_utils.h"
 
 namespace kernel_selector
 {
@@ -38,6 +37,17 @@ namespace kernel_selector
 
     KernelsData FullyConnected_bf_io_ref::GetKernelsData(const Params& params, const optional_params& optParams) const
     {
-        return GetCommonKernelsData(params, optParams, DataLayout::bf, { WeightsLayout::io });
+        KernelsData res = {};
+        for (size_t i = 0; i < autoTuneOptions.size(); i++)
+        {
+            KernelsData kd = GetTunedKernelsDataByIndex(params, optParams, DataLayout::bf, { WeightsLayout::io },
+                DONT_USE_IF_HAVE_SOMETHING_ELSE, (int)i);
+            if (!kd.empty())
+            {
+                res.emplace_back(kd[0]);
+            }
+        }
+
+        return res;
     }
 }
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_ref.h
index 8d708fd..2a08938 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_ref.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bf_io_ref.h
@@ -26,6 +26,8 @@ namespace kernel_selector
         FullyConnected_bf_io_ref() : FullyConnectedKernelBase("fully_connected_gpu_bf_io_ref") {}
 
         KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+
+    protected:
         ParamsKey GetSupportedKey() const override;
     };
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bfyx_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bfyx_ref.cpp
index 0c50aec..4c4ddd2 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bfyx_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bfyx_ref.cpp
@@ -46,27 +46,38 @@ namespace kernel_selector
         return k;
     }
 
-    std::unique_ptr<FullyConnectedKernelBase::DispatchData> FullyConnected_bfyx_Ref::SetDefault(const fully_connected_params& params) const
+    FullyConnected_bfyx_Ref::DispatchData FullyConnected_bfyx_Ref::SetDefault(const fully_connected_params& params, int ) const
     {
         auto runInfo = Parent::SetDefault(params);
 
         std::vector<size_t> global = { params.output.Feature().v, params.output.Batch().v };
         std::vector<size_t> local = GetOptimalLocalWorkGroupSizes(global);
 
-        runInfo->gws0 = global[0];
-        runInfo->gws1 = global[1];
-        runInfo->gws2 = 1;
+        runInfo.gws0 = global[0];
+        runInfo.gws1 = global[1];
+        runInfo.gws2 = 1;
 
-        runInfo->lws0 = local[0];
-        runInfo->lws1 = local[1];
-        runInfo->lws2 = 1;
+        runInfo.lws0 = local[0];
+        runInfo.lws1 = local[1];
+        runInfo.lws2 = 1;
 
-        return std::move(runInfo);
+        return runInfo;
     }
 
     KernelsData FullyConnected_bfyx_Ref::GetKernelsData(const Params& params, const optional_params& options) const
     {
-        return GetCommonKernelsData(params, options, DataLayout::bfyx,
-            { WeightsLayout::oiyx, WeightsLayout::oyxi, WeightsLayout::iyxo, WeightsLayout::yxio });
+        KernelsData res = {};
+        for (size_t i = 0; i < autoTuneOptions.size(); i++)
+        {
+            KernelsData kd = GetTunedKernelsDataByIndex(params, options, DataLayout::bfyx,
+                { WeightsLayout::oiyx, WeightsLayout::oyxi, WeightsLayout::iyxo, WeightsLayout::yxio },
+                DONT_USE_IF_HAVE_SOMETHING_ELSE, (int)i);
+            if (!kd.empty())
+            {
+                res.emplace_back(kd[0]);
+            }
+        }
+
+        return res;
     }
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bfyx_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bfyx_ref.h
index 8ea52d5..65dc611 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bfyx_ref.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bfyx_ref.h
@@ -17,9 +17,9 @@
 #pragma once
 
 #include "fully_connected_kernel_base.h"
- 
+
 namespace kernel_selector {
- 
+
     class FullyConnected_bfyx_Ref : public FullyConnectedKernelBase
     {
     public:
@@ -28,9 +28,9 @@ namespace kernel_selector {
         FullyConnected_bfyx_Ref() : Parent("fully_connected_gpu_bfyx_ref") {}
 
         KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-        ParamsKey GetSupportedKey() const override;
- 
+
     protected:
-        std::unique_ptr<DispatchData> SetDefault(const fully_connected_params& params) const override;
+        ParamsKey GetSupportedKey() const override;
+        DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override;
     };
-}
\ No newline at end of file
+}
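FullyConnected_bfyx_Ref::SetDefault above derives the local work-group sizes from the global ones via GetOptimalLocalWorkGroupSizes, which is defined elsewhere in kernel_selector. The snippet below only illustrates the invariant the dispatch depends on; the concrete values are an assumption:

    // Illustration only: each chosen local size must evenly divide the matching
    // global size, since gws0/gws1 are enqueued with lws0/lws1 as the work-group shape.
    std::vector<size_t> global = { 1000, 8 };   // { output features, batch }
    std::vector<size_t> local = GetOptimalLocalWorkGroupSizes(global);
    // e.g. local == { 8, 8 } would satisfy 1000 % 8 == 0 and 8 % 8 == 0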
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv16_af8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv16_af8.cpp
index a7b77e0..08562dc 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv16_af8.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv16_af8.cpp
@@ -15,7 +15,6 @@
  */
 
 #include "fully_connected_kernel_bs_f_bsv16_af8.h"
-#include "kernel_selector_utils.h"
 
 namespace kernel_selector
{
@@ -36,17 +35,17 @@ namespace kernel_selector
         return k;
     }
 
-    std::unique_ptr<FullyConnectedKernelBase::DispatchData> FullyConnected_bs_f_bsv16_af8::SetDefault(const fully_connected_params& arg) const
+    FullyConnected_bs_f_bsv16_af8::DispatchData FullyConnected_bs_f_bsv16_af8::SetDefault(const fully_connected_params& arg, int ) const
     {
         auto kd = FullyConnectedBlockKernelBase::SetDefault(arg);
 
         size_t groups_per_batches = GetLocalGroupsSize(arg);
-        kd->gws0 = Align(arg.output.LogicalSize() / (GetBatchesPerWorkItem(arg) * groups_per_batches), 16);
-        kd->gws1 = groups_per_batches;
-        kd->lws0 = 16;
-        kd->lws1 = 1;
+        kd.gws0 = Align(arg.output.LogicalSize() / (GetBatchesPerWorkItem(arg) * groups_per_batches), 16);
+        kd.gws1 = groups_per_batches;
+        kd.lws0 = 16;
+        kd.lws1 = 1;
 
-        return std::move(kd);
+        return kd;
     }
 
     static bool check_input_layout(const DataTensor& t)
@@ -86,6 +85,16 @@ namespace kernel_selector
 
     KernelsData FullyConnected_bs_f_bsv16_af8::GetKernelsData(const Params& params, const optional_params& optParams) const
     {
-        return GetCommonKernelsData(params, optParams, DataLayout::bs_f_bsv16__af8, { WeightsLayout::os_i_osv16__ai8 }, FORCE_PRIORITY_2);
+        KernelsData res = {};
+        for (size_t i = 0; i < autoTuneOptions.size(); i++)
+        {
+            KernelsData kd = GetTunedKernelsDataByIndex(params, optParams, DataLayout::bs_f_bsv16__af8, { WeightsLayout::os_i_osv16__ai8 }, FORCE_PRIORITY_2, (int)i);
+            if (!kd.empty())
+            {
+                res.emplace_back(kd[0]);
+            }
+        }
+
+        return res;
     }
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv16_af8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv16_af8.h
index 63a5075..57bdef5 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv16_af8.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv16_af8.h
@@ -26,10 +26,10 @@ namespace kernel_selector
         FullyConnected_bs_f_bsv16_af8() : FullyConnectedBlockKernelBase("fully_connected_gpu_bs_f_bsv16_af8_vload") {}
 
         KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-        ParamsKey GetSupportedKey() const override;
- 
+
     protected:
+        ParamsKey GetSupportedKey() const override;
         bool Validate(const Params& p, const optional_params& o) const override;
-        std::unique_ptr<DispatchData> SetDefault(const fully_connected_params& arg) const override;
+        DispatchData SetDefault(const fully_connected_params& arg, int autoTuneIndex = -1) const override;
     };
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv16_b1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv16_b1.cpp
index b98b528..eec40eb 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv16_b1.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv16_b1.cpp
@@ -15,7 +15,6 @@
  */
 
 #include "fully_connected_kernel_bs_f_bsv16_b1.h"
-#include "kernel_selector_utils.h"
 
 namespace kernel_selector
 {
@@ -58,13 +57,13 @@ namespace kernel_selector
         return cldnn_jit;
     }
 
-    std::unique_ptr<FullyConnectedKernelBase::DispatchData> FullyConnected_bs_f_bsv16_b1::SetDefault(const fully_connected_params& arg) const
+    FullyConnected_bs_f_bsv16_b1::DispatchData FullyConnected_bs_f_bsv16_b1::SetDefault(const fully_connected_params& arg, int ) const
     {
-        auto run_info = std::unique_ptr<DispatchData>(new DispatchData(*FullyConnectedKernelBase::SetDefault(arg)));
+        DispatchData run_info = FullyConnectedKernelBase::SetDefault(arg);
 
         // Properties of chunk and unit.
         const char* chunk_type = "uint";
-        const uint32_t unit_byte_size = run_info->fp16UnitUsed ? sizeof(short) : sizeof(float);
+        const uint32_t unit_byte_size = run_info.fp16UnitUsed ? sizeof(short) : sizeof(float);
         constexpr uint32_t chunk_byte_size = sizeof(uint32_t);
         constexpr uint32_t sub_group_size = 16;
         const uint32_t units_per_chunk = chunk_byte_size / unit_byte_size;
@@ -76,28 +75,37 @@ namespace kernel_selector
         const auto response_size = arg.output.Feature().v;
         auto rg_count = CeilDiv(response_size, responses_per_sg_exec);
 
-        run_info->lws0 = sub_group_size;
+        run_info.lws0 = sub_group_size;
         // Number of work items needed to process all response groups.
-        run_info->gws0 = rg_count * sub_group_size;
-        run_info->lws1 = run_info->lws2 = 1;
-        run_info->gws1 = run_info->gws2 = 1;
+        run_info.gws0 = rg_count * sub_group_size;
+        run_info.lws1 = run_info.lws2 = 1;
+        run_info.gws1 = run_info.gws2 = 1;
 
-        auto& kd = run_info;
-        kd->unit_byte_size = unit_byte_size;
-        kd->chunk_type = chunk_type;
-        kd->chunk_byte_size = chunk_byte_size;
-        kd->units_per_chunk = units_per_chunk;
-        kd->bytes_per_sg_read = sub_group_size * chunk_byte_size;
-        kd->units_per_sg_read = units_per_sg_read;
-        kd->responses_per_sg_exec = responses_per_sg_exec;
-        kd->in_chunk_prefetch_size = 2;
-        kd->filter_chunk_prefetch_size = responses_per_sg_exec;
+        run_info.unit_byte_size = unit_byte_size;
+        run_info.chunk_type = chunk_type;
+        run_info.chunk_byte_size = chunk_byte_size;
+        run_info.units_per_chunk = units_per_chunk;
+        run_info.bytes_per_sg_read = sub_group_size * chunk_byte_size;
+        run_info.units_per_sg_read = units_per_sg_read;
+        run_info.responses_per_sg_exec = responses_per_sg_exec;
+        run_info.in_chunk_prefetch_size = 2;
+        run_info.filter_chunk_prefetch_size = responses_per_sg_exec;
 
-        return std::move(run_info);
+        return run_info;
     }
 
     KernelsData FullyConnected_bs_f_bsv16_b1::GetKernelsData(const Params& params, const optional_params& optParams) const
     {
-        return GetCommonKernelsData(params, optParams, DataLayout::bf, {WeightsLayout::os_i_osv16}, FORCE_PRIORITY_5);
+        KernelsData res = {};
+        for (size_t i = 0; i < autoTuneOptions.size(); i++)
+        {
+            KernelsData kd = GetTunedKernelsDataByIndex(params, optParams, DataLayout::bf, { WeightsLayout::os_i_osv16 }, FORCE_PRIORITY_5, (int)i);
+            if (!kd.empty())
+            {
+                res.emplace_back(kd[0]);
+            }
+        }
+
+        return res;
     }
 }
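For the chunk/unit sizing in FullyConnected_bs_f_bsv16_b1::SetDefault above, the arithmetic works out as below. The computation of units_per_sg_read sits in context lines elided from the hunk, so the sub_group_size * units_per_chunk formula is an assumption carried over from the analogous fb_io_block kernel:

    // Worked example (chunk type "uint", sub-group of 16 work items):
    constexpr uint32_t sub_group_size  = 16;
    constexpr uint32_t chunk_byte_size = sizeof(uint32_t);                    // 4
    constexpr uint32_t unit_byte_size  = sizeof(short);                       // FP16 unit: 2
    constexpr uint32_t units_per_chunk   = chunk_byte_size / unit_byte_size;  // 2
    constexpr uint32_t bytes_per_sg_read = sub_group_size * chunk_byte_size;  // 64
    constexpr uint32_t units_per_sg_read = sub_group_size * units_per_chunk;  // 32 (assumed formula)
    // For FP32 units (4 bytes): units_per_chunk = 1 and units_per_sg_read = 16.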
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv16_b1.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv16_b1.h
index d440e60..4d453dc 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv16_b1.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv16_b1.h
@@ -26,29 +26,10 @@ namespace kernel_selector
         FullyConnected_bs_f_bsv16_b1() : FullyConnectedKernelBase("fully_connected_gpu_bs_f_bsv16_b1") {}
 
         KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-        ParamsKey GetSupportedKey() const override;
-
-    protected:
-        struct DispatchData : public FullyConnectedKernelBase::DispatchData
-        {
-            DispatchData(const FullyConnectedKernelBase::DispatchData& base_dispatch_data)
-                : FullyConnectedKernelBase::DispatchData(base_dispatch_data),
-                unit_byte_size(0), chunk_type(nullptr), chunk_byte_size(0), units_per_chunk(0), bytes_per_sg_read(0),
-                units_per_sg_read(0), responses_per_sg_exec(0), in_chunk_prefetch_size(0), filter_chunk_prefetch_size(0)
-            {}
-
-            uint32_t unit_byte_size;
-            const char* chunk_type;
-            uint32_t chunk_byte_size;
-            uint32_t units_per_chunk;
-            uint32_t bytes_per_sg_read;
-            uint32_t units_per_sg_read;
-            uint32_t responses_per_sg_exec;
-            uint32_t in_chunk_prefetch_size;
-            uint32_t filter_chunk_prefetch_size;
-        };
-
+    protected:
+        ParamsKey GetSupportedKey() const override;
         JitConstants GetJitConstants(const fully_connected_params& params, const FullyConnectedKernelBase::DispatchData& kd) const override;
-        std::unique_ptr<DispatchData> SetDefault(const fully_connected_params& arg) const override;
+        DispatchData SetDefault(const fully_connected_params& arg, int autoTuneIndex = -1) const override;
     };
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv8_af8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv8_af8.cpp
index 6b8fbfa..234a941 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv8_af8.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv8_af8.cpp
@@ -15,7 +15,6 @@
  */
 
 #include "fully_connected_kernel_bs_f_bsv8_af8.h"
-#include "kernel_selector_utils.h"
 
 namespace kernel_selector
 {
@@ -38,32 +37,32 @@ namespace kernel_selector
         return k;
     }
 
-    std::unique_ptr<FullyConnectedKernelBase::DispatchData> FullyConnected_bs_f_bsv8_af8::SetDefault(const fully_connected_params& arg) const
+    FullyConnected_bs_f_bsv8_af8::DispatchData FullyConnected_bs_f_bsv8_af8::SetDefault(const fully_connected_params& arg, int ) const
     {
         auto kd = FullyConnectedBlockKernelBase::SetDefault(arg);
 
         size_t groups_per_batches = GetLocalGroupsSize(arg);
-        kd->gws0 = Align(arg.output.LogicalSize() / (GetNeuronsPerWorkItem(arg) * GetBatchesPerWorkItem(arg) * groups_per_batches), 8);
-        kd->gws1 = groups_per_batches;
-        kd->lws0 = 8;
-        kd->lws1 = 1;
+        kd.gws0 = Align(arg.output.LogicalSize() / (GetNeuronsPerWorkItem(arg) * GetBatchesPerWorkItem(arg) * groups_per_batches), 8);
+        kd.gws1 = groups_per_batches;
+        kd.lws0 = 8;
+        kd.lws1 = 1;
 
-        return std::move(kd);
+        return kd;
     }
 
     static bool check_input_layout(const DataTensor& t)
     {
         bool b16_layout = false;
         b16_layout |= t.GetLayout() == DataLayout::bs_f_bsv8__af8;
-        b16_layout |= DataTensor::Channelndex(t.GetLayout(), Tensor::DataChannelName::BATCH) == 0 && (t.Batch().v == 8); // TODO - check f alignment to 8
+        b16_layout |= DataTensor::Channelndex(t.GetLayout(), Tensor::DataChannelName::BATCH) == 0 && (t.Batch().v == 8);
         return b16_layout;
     }
 
     static bool check_output_layout(const DataTensor& t)
     {
         bool b16_layout = false;
-        b16_layout |= (t.GetLayout() == DataLayout::fb);
-        b16_layout |= (t.GetLayout() == DataLayout::bs_f_bsv8__af8) && (t.Batch().v == 8);
+        b16_layout |= (t.GetLayout() == DataLayout::fb) && (t.Batch().v == 8);
+        b16_layout |= (t.GetLayout() == DataLayout::bs_f_bsv8__af8);
         return b16_layout;
     }
 
@@ -85,11 +84,14 @@ namespace kernel_selector
         const bool bProperBatch =
             params.inputs[0].Batch().v >= 8 &&
             params.inputs[0].Batch().v % 8 == 0;
+        const bool bProperFeature =
+            params.inputs[0].Feature().v >= 8 &&
+            params.inputs[0].Feature().v % 8 == 0;
         const bool bProperInput = check_input_layout(params.inputs[0]);
         const bool bProperOutput = check_output_layout(params.output);
         const bool bSupportedLayout = optParams.allowInputReordering || bProperInput;
 
-        if (!bProperBatch || !bSupportedLayout || !bProperOutput)
+        if (!bProperBatch || !bProperFeature || !bSupportedLayout || !bProperOutput)
         {
             return false;
         }
@@ -99,6 +101,16 @@ namespace kernel_selector
 
     KernelsData FullyConnected_bs_f_bsv8_af8::GetKernelsData(const Params& params, const optional_params& optParams) const
     {
-        return GetCommonKernelsData(params, optParams, DataLayout::bs_f_bsv8__af8, { WeightsLayout::os_i_osv8__ai8 }, FORCE_PRIORITY_4);
+        KernelsData res = {};
+        for (size_t i = 0; i < autoTuneOptions.size(); i++)
+        {
+            KernelsData kd = GetTunedKernelsDataByIndex(params, optParams, DataLayout::bs_f_bsv8__af8, { WeightsLayout::os_i_osv8__ai8 }, FORCE_PRIORITY_4, (int)i);
+            if (!kd.empty())
+            {
+                res.emplace_back(kd[0]);
+            }
+        }
+
+        return res;
     }
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv8_af8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv8_af8.h
index 666df90..13799e2 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv8_af8.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv8_af8.h
@@ -26,10 +26,10 @@ namespace kernel_selector
         FullyConnected_bs_f_bsv8_af8() : FullyConnectedBlockKernelBase("fully_connected_gpu_bs_f_bsv8_af8_vload") {}
 
         KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-        ParamsKey GetSupportedKey() const override;
- 
+
     protected:
+        ParamsKey GetSupportedKey() const override;
         bool Validate(const Params& p, const optional_params& o) const override;
-        std::unique_ptr<DispatchData> SetDefault(const fully_connected_params& arg) const override;
+        DispatchData SetDefault(const fully_connected_params& arg, int autoTuneIndex = -1) const override;
     };
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_b8_f8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_b8_f8.cpp
index 1a3e98d..839f940 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_b8_f8.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_b8_f8.cpp
@@ -15,7 +15,6 @@
  */
 
 #include "fully_connected_kernel_fb_io_b8_f8.h"
-#include "kernel_selector_utils.h"
 
 namespace kernel_selector
 {
@@ -37,19 +36,19 @@ namespace kernel_selector
         return k;
     }
 
-    std::unique_ptr<FullyConnectedKernelBase::DispatchData> FullyConnected_fb_io_b8_f8::SetDefault(const fully_connected_params& arg) const
+    FullyConnected_fb_io_b8_f8::DispatchData FullyConnected_fb_io_b8_f8::SetDefault(const fully_connected_params& arg, int ) const
     {
         auto kd = FullyConnectedBlockKernelBase::SetDefault(arg);
 
         const auto& output = arg.output;
 
         size_t groups_per_batches = GetLocalGroupsSize(arg);
-        kd->gws0 = output.LogicalSize() / (GetNeuronsPerWorkItem(arg) * GetBatchesPerWorkItem(arg) * groups_per_batches);
-        kd->gws1 = groups_per_batches;
-        kd->lws0 = 8;
-        kd->lws1 = 1;
+        kd.gws0 = Align(output.LogicalSize() / (GetNeuronsPerWorkItem(arg) * GetBatchesPerWorkItem(arg) * groups_per_batches), 8);
+        kd.gws1 = groups_per_batches;
+        kd.lws0 = 8;
+        kd.lws1 = 1;
 
-        return std::move(kd);
+        return kd;
     }
 
     bool FullyConnected_fb_io_b8_f8::Validate(const Params& p, const optional_params& o) const
@@ -65,11 +64,17 @@ namespace kernel_selector
         const auto batches = output.Batch().v;
         const auto x_size = output.LogicalSize() / batches;
 
+        const auto& input = params.inputs[0];
+        const auto input_x_size = input.LogicalSize() / input.Batch().v;
+        const bool proper_input_aligment = (input_x_size % 8) == 0;
+        const bool proper_output_aligment = (output.LogicalSize() / (GetNeuronsPerWorkItem(params) * GetBatchesPerWorkItem(params) * GetLocalGroupsSize(params)) % 8) == 0;
 
         const bool bSupportedBatch = (batches % 8) == 0;
         const bool bSupportedFeature = (x_size % 8) == 0;
 
         if (!bSupportedBatch ||
-            !bSupportedFeature)
+            !bSupportedFeature ||
+            !proper_input_aligment ||
+            !proper_output_aligment)
         {
             return false;
         }
@@ -80,13 +85,22 @@ namespace kernel_selector
 
     KernelsData FullyConnected_fb_io_b8_f8::GetKernelsData(const Params& params, const optional_params& optParams) const
     {
         assert(params.GetType() == KernelType::FULLY_CONNECTED);
- 
+        KernelsData res = {};
         const auto& orgParams = static_cast<const fully_connected_params&>(params);
 
         float estimated_time =
             orgParams.inputs[0].GetDType() == Datatype::F16 && orgParams.output.Batch().v >= 16 ?
             FORCE_PRIORITY_3 : FORCE_PRIORITY_5;
 
-        return GetCommonKernelsData(params, optParams, DataLayout::fb, { WeightsLayout::io }, estimated_time);
+        for (size_t i = 0; i < autoTuneOptions.size(); i++)
+        {
+            KernelsData kd = GetTunedKernelsDataByIndex(params, optParams, DataLayout::fb, { WeightsLayout::io }, estimated_time, (int)i);
+            if (!kd.empty())
+            {
+                res.emplace_back(kd[0]);
+            }
+        }
+
+        return res;
     }
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_b8_f8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_b8_f8.h
index d380862..2bb0117 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_b8_f8.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_b8_f8.h
@@ -26,10 +26,10 @@ namespace kernel_selector
         FullyConnected_fb_io_b8_f8() : FullyConnectedBlockKernelBase("fully_connected_gpu_fb_io_b8_f8_vload") {}
 
         KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-        ParamsKey GetSupportedKey() const override;
 
     protected:
+        ParamsKey GetSupportedKey() const override;
         bool Validate(const Params& p, const optional_params& o) const override;
-        std::unique_ptr<DispatchData> SetDefault(const fully_connected_params& arg) const override;
+        DispatchData SetDefault(const fully_connected_params& arg, int autoTuneIndex = -1) const override;
     };
-}
\ No newline at end of file
+}
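The two checks added to FullyConnected_fb_io_b8_f8::Validate above extend the existing divisibility rules from the output (batches and x_size as multiples of 8) to the flattened input size and to the per-work-item output partitioning. A small worked example with illustrative shapes:

    // input.LogicalSize() / input.Batch().v == 1024: 1024 % 8 == 0, accepted.
    const size_t input_x_size = 1024;
    const bool proper_input_aligment = (input_x_size % 8) == 0;   // true
    // 1020 elements per batch would fail (1020 % 8 == 4), so Validate() now
    // skips this kernel for such shapes instead of selecting it.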
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_block.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_block.cpp
index b32c8a5..01a7061 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_block.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_block.cpp
@@ -15,7 +15,6 @@
  */
 
 #include "fully_connected_kernel_fb_io_block.h"
-#include "kernel_selector_utils.h"
 
 namespace kernel_selector
 {
@@ -35,9 +34,10 @@ namespace kernel_selector
         return k;
     }
 
-    std::unique_ptr<FullyConnectedKernelBase::DispatchData> FullyConnected_fb_io_block::SetDefault(const fully_connected_params& arg) const
+
+    FullyConnected_fb_io_block::DispatchData FullyConnected_fb_io_block::SetDefault(const fully_connected_params& arg, int ) const
     {
-        auto kd = std::unique_ptr<DispatchData>(new DispatchData(*FullyConnectedKernelBase::SetDefault(arg)));
+        auto kd = FullyConnectedKernelBase::SetDefault(arg);
 
         const auto& output = arg.output;
         auto batch_size = output.Batch().v;
@@ -55,38 +55,37 @@ namespace kernel_selector
         // for at least one input data set from batch.
         auto rg_count = CeilDiv(response_size, units_per_sg_read);
 
-        kd->lws0 = sub_group_size;
+        kd.lws0 = sub_group_size;
         // Number of work items needed to process all response groups.
-        kd->gws0 = rg_count * sub_group_size;
-        kd->lws1 = 1;
-        kd->gws1 = batch_size / units_per_sg_read;
-
-        kd->unit_byte_size = unit_byte_size;
-        kd->chunk_type = chunk_type;
-        kd->chunk_byte_size = chunk_byte_size;
-        kd->units_per_chunk = units_per_chunk;
-        kd->bytes_per_sg_read = sub_group_size * chunk_byte_size;
-        kd->units_per_sg_read = units_per_sg_read;
-        kd->rg_count = (uint32_t)rg_count;
-        kd->last_rg_size = response_size % units_per_sg_read;
-        return std::move(kd);
+        kd.gws0 = rg_count * sub_group_size;
+        kd.lws1 = 1;
+        kd.gws1 = batch_size / units_per_sg_read;
+
+        kd.unit_byte_size = unit_byte_size;
+        kd.chunk_type = chunk_type;
+        kd.chunk_byte_size = chunk_byte_size;
+        kd.units_per_chunk = units_per_chunk;
+        kd.bytes_per_sg_read = sub_group_size * chunk_byte_size;
+        kd.units_per_sg_read = units_per_sg_read;
+        kd.rg_count = (uint32_t)rg_count;
+        kd.last_rg_size = response_size % units_per_sg_read;
+        return kd;
     }
 
     JitConstants FullyConnected_fb_io_block::GetJitConstants(const fully_connected_params& params, const FullyConnectedKernelBase::DispatchData& run_info) const
     {
-        auto &d = static_cast<const DispatchData&>(run_info);
         auto cldnn_jit = FullyConnectedKernelBase::GetJitConstants(params, run_info);
         cldnn_jit.AddConstants({
-            MakeJitConstant("SUB_GROUP_SIZE", d.lws0),
-            MakeJitConstant("WORK_ITEMS_PER_BATCH", d.gws1),
-            MakeJitConstant("UNIT_BYTE_SIZE", d.unit_byte_size),
-            MakeJitConstant("CHUNK_TYPE", d.chunk_type),
-            MakeJitConstant("CHUNK_BYTE_SIZE", d.chunk_byte_size),
-            MakeJitConstant("UNITS_PER_CHUNK", d.units_per_chunk),
-            MakeJitConstant("BYTES_PER_SG_READ", d.bytes_per_sg_read),
-            MakeJitConstant("UNITS_PER_SG_READ", d.units_per_sg_read),
-            MakeJitConstant("RG_COUNT", d.rg_count),
-            MakeJitConstant("LAST_RG_SIZE", d.last_rg_size),
+            MakeJitConstant("SUB_GROUP_SIZE", run_info.lws0),
+            MakeJitConstant("WORK_ITEMS_PER_BATCH", run_info.gws1),
+            MakeJitConstant("UNIT_BYTE_SIZE", run_info.unit_byte_size),
+            MakeJitConstant("CHUNK_TYPE", run_info.chunk_type),
+            MakeJitConstant("CHUNK_BYTE_SIZE", run_info.chunk_byte_size),
+            MakeJitConstant("UNITS_PER_CHUNK", run_info.units_per_chunk),
+            MakeJitConstant("BYTES_PER_SG_READ", run_info.bytes_per_sg_read),
+            MakeJitConstant("UNITS_PER_SG_READ", run_info.units_per_sg_read),
+            MakeJitConstant("RG_COUNT", run_info.rg_count),
+            MakeJitConstant("LAST_RG_SIZE", run_info.last_rg_size),
         });
         return cldnn_jit;
     }
@@ -144,6 +143,18 @@ namespace kernel_selector
         // (fb == fyxb flatten fyx, not yxfb flatten yxf).
         // the order of the add operation causes some numeric changes. in order to avoid them right now we use yxfb/oiyx instead.
         // return GetCommonKernelsData(params, optParams, DataLayout::fb, WeightsLayout::io, estimated_time);
-        return GetCommonKernelsData(params, optParams, DataLayout::yxfb, { WeightsLayout::yxio }, estimated_time);
-    }
+        //return GetCommonKernelsData(params, optParams, DataLayout::yxfb, { WeightsLayout::yxio }, estimated_time);
+
+        KernelsData res = {};
+        for (size_t i = 0; i < autoTuneOptions.size(); i++)
+        {
+            KernelsData kd = GetTunedKernelsDataByIndex(params, optParams, DataLayout::yxfb, { WeightsLayout::yxio }, estimated_time, (int)i);
+            if (!kd.empty())
+            {
+                res.emplace_back(kd[0]);
+            }
+        }
+
+        return res;
+    }
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_block.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_block.h
index c3c433c..98ced4a 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_block.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_block.h
@@ -26,29 +26,11 @@ namespace kernel_selector
         FullyConnected_fb_io_block() : FullyConnectedKernelBase("fully_connected_gpu_fb_io_block_fp16") {}
 
         KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-        ParamsKey GetSupportedKey() const override;
 
     protected:
-        struct DispatchData : public FullyConnectedKernelBase::DispatchData
-        {
-            DispatchData(const FullyConnectedKernelBase::DispatchData& base_dispatch_data)
-                : FullyConnectedKernelBase::DispatchData(base_dispatch_data),
-                unit_byte_size(0), chunk_type(nullptr), chunk_byte_size(0), units_per_chunk(0),
-                bytes_per_sg_read(0), units_per_sg_read(0), last_rg_size(0), rg_count(0)
-            {}
-
-            uint32_t unit_byte_size;
-            const char *chunk_type;
-            uint32_t chunk_byte_size;
-            uint32_t units_per_chunk;
-            uint32_t bytes_per_sg_read;
-            uint32_t units_per_sg_read;
-            uint32_t last_rg_size;
-            uint32_t rg_count;
-        };
-
+        ParamsKey GetSupportedKey() const override;
         bool Validate(const Params& p, const optional_params& o) const override;
         JitConstants GetJitConstants(const fully_connected_params& params, const FullyConnectedKernelBase::DispatchData& kd) const override;
-        std::unique_ptr<DispatchData> SetDefault(const fully_connected_params& arg) const override;
+        DispatchData SetDefault(const fully_connected_params& arg, int autoTuneIndex = -1) const override;
     };
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_ref.cpp
index f91078a..84e3c80 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_ref.cpp
@@ -15,7 +15,6 @@
  */
 
 #include "fully_connected_kernel_fb_io_ref.h"
-#include "kernel_selector_utils.h"
 
 namespace kernel_selector
 {
@@ -42,7 +41,15 @@ namespace kernel_selector
         // (fb == fyxb flatten fyx, not yxfb flatten yxf).
         // the order of the add operation causes some numeric changes. in order to avoid them right now we use yxfb/oiyx instead.
         // return GetCommonKernelsData(params, optParams, DataLayout::fb, WeightsLayout::io, FORCE_PRIORITY_6);
-
-        return GetCommonKernelsData(params, optParams, DataLayout::yxfb, { WeightsLayout::yxio }, FORCE_PRIORITY_6);
+        KernelsData res = {};
+        for (size_t i = 0; i < autoTuneOptions.size(); i++)
+        {
+            KernelsData kd = GetTunedKernelsDataByIndex(params, optParams, DataLayout::yxfb, { WeightsLayout::yxio }, FORCE_PRIORITY_6, (int)i);
+            if (!kd.empty())
+            {
+                res.emplace_back(kd[0]);
+            }
+        }
+        return res;
     }
 }
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_ref.h
index 9d5e5b5..46ee639 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_ref.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_ref.h
@@ -26,6 +26,8 @@ namespace kernel_selector
         FullyConnected_fb_io_ref() : FullyConnectedKernelBase("fully_connected_gpu_fb_io_ref") {}
 
         KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+
+    protected:
         ParamsKey GetSupportedKey() const override;
     };
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_oi_b8_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_oi_b8_ref.cpp
index 5d1c8aa..8232e5e 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_oi_b8_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_oi_b8_ref.cpp
@@ -15,7 +15,6 @@
  */
 
 #include "fully_connected_kernel_fb_oi_b8_ref.h"
-#include "kernel_selector_utils.h"
 
 namespace kernel_selector
 {
@@ -35,17 +34,17 @@ namespace kernel_selector
         return k;
     }
 
-    std::unique_ptr<FullyConnectedKernelBase::DispatchData> FullyConnected_fb_oi_b8_ref::SetDefault(const fully_connected_params& arg) const
+    FullyConnected_fb_oi_b8_ref::DispatchData FullyConnected_fb_oi_b8_ref::SetDefault(const fully_connected_params& arg, int ) const
     {
         auto kd = FullyConnectedKernelBase::SetDefault(arg);
 
         const auto& output = arg.output;
-        kd->gws0 = output.Batch().v;
-        kd->gws1 = output.LogicalSize() / kd->gws0;
-        kd->lws0 = 8;
-        kd->lws1 = 1;
+        kd.gws0 = output.Batch().v;
+        kd.gws1 = output.LogicalSize() / kd.gws0;
+        kd.lws0 = 8;
+        kd.lws1 = 1;
 
-        return std::move(kd);
+        return kd;
     }
 
     bool FullyConnected_fb_oi_b8_ref::Validate(const Params& p, const optional_params& o) const
@@ -67,6 +66,15 @@ namespace kernel_selector
 
     KernelsData FullyConnected_fb_oi_b8_ref::GetKernelsData(const Params& params, const optional_params& optParams) const
     {
-        return GetCommonKernelsData(params, optParams, DataLayout::fb, { WeightsLayout::oi }, FORCE_PRIORITY_6);
+        KernelsData res = {};
+        for (size_t i = 0; i < autoTuneOptions.size(); i++)
+        {
+            KernelsData kd = GetTunedKernelsDataByIndex(params, optParams, DataLayout::fb, { WeightsLayout::oi }, FORCE_PRIORITY_6, (int)i);
+            if (!kd.empty())
+            {
+                res.emplace_back(kd[0]);
+            }
+        }
+        return res;
     }
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_oi_b8_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_oi_b8_ref.h
index 0c063e2..f7a3785 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_oi_b8_ref.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_oi_b8_ref.h
@@ -26,10 +26,10 @@ namespace kernel_selector
         FullyConnected_fb_oi_b8_ref() : FullyConnectedKernelBase("fully_connected_gpu_fb_oi_b8_fp32_ref") {}
 
         KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-        ParamsKey GetSupportedKey() const override;
 
     protected:
+        ParamsKey GetSupportedKey() const override;
         bool Validate(const Params& p, const optional_params& o) const override;
-        std::unique_ptr<DispatchData> SetDefault(const fully_connected_params& arg) const override;
+        DispatchData SetDefault(const fully_connected_params& arg, int autoTuneIndex = -1) const override;
     };
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_oi_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_oi_ref.cpp
index 6d16701..8ace812 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_oi_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_oi_ref.cpp
@@ -15,7 +15,6 @@
  */
 
 #include "fully_connected_kernel_fb_oi_ref.h"
-#include "kernel_selector_utils.h"
 
 namespace kernel_selector
 {
@@ -38,6 +37,15 @@ namespace kernel_selector
 
     KernelsData FullyConnected_fb_oi_ref::GetKernelsData(const Params& params, const optional_params& optParams) const
     {
-        return GetCommonKernelsData(params, optParams, DataLayout::fb, { WeightsLayout::oi });
+        KernelsData res = {};
+        for (size_t i = 0; i < autoTuneOptions.size(); i++)
+        {
+            KernelsData kd = GetTunedKernelsDataByIndex(params, optParams, DataLayout::fb, { WeightsLayout::oi }, DONT_USE_IF_HAVE_SOMETHING_ELSE, (int)i);
+            if (!kd.empty())
+            {
+                res.emplace_back(kd[0]);
+            }
+        }
+        return res;
     }
 }
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_oi_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_oi_ref.h
index 814ad60..4f74e77 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_oi_ref.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_oi_ref.h
@@ -26,6 +26,8 @@ namespace kernel_selector
         FullyConnected_fb_oi_ref() : FullyConnectedKernelBase("fully_connected_gpu_fb_oi_ref") {}
 
         KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+
+    protected:
         ParamsKey GetSupportedKey() const override;
     };
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_imad.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_imad.cpp
new file mode 100644
index 0000000..28f6052
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_imad.cpp
@@ -0,0 +1,116 @@
+/*
+// Copyright (c) 2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "fully_connected_kernel_imad.h"
+
+// IMAD Fully_Connected primitive implementation.
+// Limitations are:
+// 1. Input=Fx1x1 with Filter=1x1
+// 2. No data padding
+
+namespace kernel_selector
+{
+    ParamsKey FullyConnectedKernelIMAD::GetSupportedKey() const
+    {
+        ParamsKey k;
+        k.EnableInputDataType(Datatype::INT8);
+        k.EnableInputDataType(Datatype::UINT8);
+        k.EnableOutputDataType(Datatype::INT8);
+        k.EnableOutputDataType(Datatype::UINT8);
+        k.EnableInputWeightsType(WeightsType::INT8);
+        k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
+        k.EnableOutputLayout(DataLayout::bf);
+        k.EnableBiasPerOutput();
+        k.EnableBiasPerFeature();
+        k.EnableNonBiasTerm();
+        k.EnableTensorOffset();
+        k.EnableTensorPitches();
+        k.EnableBatching();
+        k.EnableInt8Quantization();
+        k.EnableOutputCalibration();
+        return k;
+    }
+
+    FullyConnectedKernelIMAD::Parent::DispatchData
+    FullyConnectedKernelIMAD::SetDefault(const fully_connected_params& params, int) const
+    {
+        const int simdSize = 16;
+
+        auto runInfo = Parent::SetDefault(params);
+
+        runInfo.gws0 = RoundUp(params.output.Feature().v, simdSize);
+        runInfo.gws1 = params.output.Batch().v;
+        runInfo.gws2 = 1;
+
+        runInfo.lws0 = simdSize;
+        runInfo.lws1 = 1;
+        runInfo.lws2 = 1;
+
+        return runInfo;
+    } // SetDefault
+
+    bool FullyConnectedKernelIMAD::Validate(const Params& params, const optional_params& options) const
+    {
+        if (!Parent::Validate(params, options)) {
+            return false;
+        }
+
+        const auto& newParams = static_cast<const fully_connected_params&>(params);
+        const auto& in = newParams.inputs[0];
+        const auto& weights = newParams.weights;
+
+        if ((in.X().v != 1) ||
+            (in.Y().v != 1) ||
+            (weights.X().v != 1) ||
+            (weights.Y().v != 1)) {
+            // Currently only Input=Fx1x1 with Filter=1x1 is supported
+            return false;
+        }
+        if ((in.X().pad.before != 0) ||
+            (in.X().pad.after != 0) ||
+            (in.Y().pad.before != 0) ||
+            (in.Y().pad.after != 0)) {
+            // Padding is not supported
+            return false;
+        }
+        if (in.Feature().v % (4 * 8)) {
+            // Algorithm requires 4 bytes read as one int
+            // with specific weight format os_is_yx_osv16_isv4
+            // which will read 8 elements per reading
+            return false;
+        }
+
+        return true;
+    } // Validate
+
+    KernelsData FullyConnectedKernelIMAD::GetKernelsData(const Params& params, const optional_params& options) const
+    {
+
+        KernelsData res = {};
+        for (size_t i = 0; i < autoTuneOptions.size(); i++)
+        {
+            KernelsData kd = GetTunedKernelsDataByIndex(
+                params, options, DataLayout::b_fs_yx_fsv4,
+                { WeightsLayout::os_is_yx_osv16_isv4 },
+                FORCE_PRIORITY_1, (int)i);
+            if (!kd.empty())
+            {
+                res.emplace_back(kd[0]);
+            }
+        }
+        return res;
+    }
+}
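The feature-count test in Validate above encodes the data alignment the IMAD kernel needs: four int8 values are fetched as one 32-bit int, and the os_is_yx_osv16_isv4 weight format is consumed eight elements per read, so the input feature count must be a multiple of 4 * 8 = 32. Restated as a standalone helper (illustrative only, not part of the patch):

    // Mirrors the in.Feature().v % (4 * 8) check: 4 int8 lanes per 32-bit read,
    // 8 elements per weight read => feature count must be a multiple of 32.
    inline bool imad_feature_count_ok(size_t features)
    {
        return (features % (4 * 8)) == 0;   // e.g. 512 -> true, 100 -> false
    }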
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_imad.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_imad.h
new file mode 100644
index 0000000..e6c3bf8
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_imad.h
@@ -0,0 +1,37 @@
+/*
+// Copyright (c) 2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include "fully_connected_kernel_base.h"
+
+namespace kernel_selector {
+
+    class FullyConnectedKernelIMAD : public FullyConnectedKernelBase
+    {
+    public:
+        using Parent = FullyConnectedKernelBase;
+
+        FullyConnectedKernelIMAD() : Parent("fully_connected_gpu_imad") {}
+
+        KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+
+    protected:
+        ParamsKey GetSupportedKey() const override;
+        virtual bool Validate(const Params& params, const optional_params& options) const override;
+        DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override;
+    };
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_image_tutorial.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_image_tutorial.cpp
index 78bc497..b7942a9 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_image_tutorial.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_image_tutorial.cpp
@@ -40,30 +40,38 @@ namespace kernel_selector
         return k;
     }
 
-    std::unique_ptr<FullyConnectedKernelBase::DispatchData> FullyConnected_image_tutorial::SetDefault(const fully_connected_params& params) const
+    FullyConnected_image_tutorial::DispatchData FullyConnected_image_tutorial::SetDefault(const fully_connected_params& params, int ) const
     {
         auto runInfo = Parent::SetDefault(params);
 
         std::vector<size_t> global = { params.output.Feature().v, params.output.Batch().v };
         std::vector<size_t> local = GetOptimalLocalWorkGroupSizes(global);
 
-        runInfo->gws0 = global[0];
-        runInfo->gws1 = global[1];
-        runInfo->gws2 = 1;
+        runInfo.gws0 = global[0];
+        runInfo.gws1 = global[1];
+        runInfo.gws2 = 1;
 
-        runInfo->lws0 = local[0];
-        runInfo->lws1 = local[1];
-        runInfo->lws2 = 1;
+        runInfo.lws0 = local[0];
+        runInfo.lws1 = local[1];
+        runInfo.lws2 = 1;
 
-        runInfo->effiency = TUTORIAL_PRIORITY;
+        runInfo.effiency = TUTORIAL_PRIORITY;
 
-        return std::move(runInfo);
+        return runInfo;
     }
 
     KernelsData FullyConnected_image_tutorial::GetKernelsData(const Params& params, const optional_params& options) const
     {
-        return GetCommonKernelsData(params, options, DataLayout::bfyx,
-            { WeightsLayout::image_2d_weights_c4_fyx_b }
-        );
+        KernelsData res = {};
+        for (size_t i = 0; i < autoTuneOptions.size(); i++)
+        {
+            KernelsData kd = GetTunedKernelsDataByIndex(params, options, DataLayout::bfyx,
+                { WeightsLayout::image_2d_weights_c4_fyx_b }, DONT_USE_IF_HAVE_SOMETHING_ELSE, (int)i);
+            if (!kd.empty())
+            {
+                res.emplace_back(kd[0]);
+            }
+        }
+        return res;
     }
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_image_tutorial.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_image_tutorial.h
index 95adf3a..12b35d0 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_image_tutorial.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_image_tutorial.h
@@ -17,9 +17,9 @@
 #pragma once
 
 #include "fully_connected_kernel_base.h"
- 
+
 namespace kernel_selector {
- 
+
     class FullyConnected_image_tutorial : public FullyConnectedKernelBase
     {
     public:
@@ -28,9 +28,9 @@ namespace kernel_selector {
         FullyConnected_image_tutorial() : Parent("fully_connected_gpu_image_tutorial") {}
 
         KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-        ParamsKey GetSupportedKey() const override;
- 
+
     protected:
-        std::unique_ptr<DispatchData> SetDefault(const fully_connected_params& params) const override;
+        ParamsKey GetSupportedKey() const override;
+        DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override;
     };
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.cpp
index 46e4dea..ad57397 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.cpp
@@ -15,7 +15,6 @@
  */
 
 #include "fully_connected_kernel_mmad_batched.h"
-#include "kernel_selector_utils.h"
 
 namespace kernel_selector
 {
@@ -88,7 +87,7 @@ namespace kernel_selector
         return jit;
     }
 
-    std::unique_ptr<FullyConnectedKernelBase::DispatchData> FullyConnected_mmad_batched::SetDefault(const fully_connected_params& params) const
+    FullyConnected_mmad_batched::DispatchData FullyConnected_mmad_batched::SetDefault(const fully_connected_params& params, int) const
     {
         auto runInfo = Parent::SetDefault(params);
 
@@ -97,21 +96,30 @@ namespace kernel_selector
         const auto of_maps = params.output.Feature().v;
         const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
 
-        runInfo->gws0 = params.output.Batch().v / 8; // we process 8 batches in a single WG
-        runInfo->gws1 = of_threads_per_batch;
-        runInfo->gws2 = 1;
+        runInfo.gws0 = params.output.Batch().v / 8; // we process 8 batches in a single WG
+        runInfo.gws1 = of_threads_per_batch;
+        runInfo.gws2 = 1;
 
-        runInfo->lws0 = 1;
-        runInfo->lws1 = sub_group_size;
-        runInfo->lws2 = 1;
+        runInfo.lws0 = 1;
+        runInfo.lws1 = sub_group_size;
+        runInfo.lws2 = 1;
 
-        runInfo->effiency = FORCE_PRIORITY_1;
-        return std::move(runInfo);
+        runInfo.effiency = FORCE_PRIORITY_1;
+        return runInfo;
     }
 
     KernelsData FullyConnected_mmad_batched::GetKernelsData(const Params& params, const optional_params& options) const
     {
-        return GetCommonKernelsData(params, options, DataLayout::fs_bs_yx_bsv4_fsv32,
-            { WeightsLayout::os_is_yx_isa8_osv8_isv4 }, FORCE_PRIORITY_1);
+        KernelsData res = {};
+        for (size_t i = 0; i < autoTuneOptions.size(); i++)
+        {
+            KernelsData kd = GetTunedKernelsDataByIndex(params, options, DataLayout::fs_bs_yx_bsv4_fsv32,
+                { WeightsLayout::os_is_yx_isa8_osv8_isv4 }, FORCE_PRIORITY_1, (int)i);
+            if (!kd.empty())
+            {
+                res.emplace_back(kd[0]);
+            }
+        }
+        return res;
     }
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.h
index 61af89f..b08fe32 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.h
@@ -28,11 +28,11 @@ namespace kernel_selector
         FullyConnected_mmad_batched() : Parent("fully_connected_gpu_mmad_batched") {}
 
         KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-        ParamsKey GetSupportedKey() const override;
- 
+
     protected:
+        ParamsKey GetSupportedKey() const override;
         bool Validate(const Params& p, const optional_params& o) const override;
         JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override;
-        std::unique_ptr<DispatchData> SetDefault(const fully_connected_params& params) const override;
+        DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override;
     };
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_selector.cpp
index 529e1ca..80a345e 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_selector.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_selector.cpp
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2019 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -31,6 +31,7 @@
 #include "fully_connected_kernel_image_tutorial.h"
 #include "fully_connected_kernel_MMAD.h"
 #include "fully_connected_kernel_mmad_batched.h"
+#include "fully_connected_kernel_imad.h"
 
 namespace kernel_selector
 {
@@ -51,10 +52,11 @@ namespace kernel_selector {
         Attach<FullyConnected_image_tutorial>();
         Attach<FullyConnectedKernelMMAD>();
         Attach<FullyConnected_mmad_batched>();
+        Attach<FullyConnectedKernelIMAD>();
     }
 
     KernelsData fully_connected_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const
     {
         return GetAutoTuneBestKernel(params, options, KernelType::FULLY_CONNECTED);
     }
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_yxfb_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_yxfb_ref.cpp
index 5afb9ca..9a5d2de 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_yxfb_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_yxfb_ref.cpp
@@ -15,7 +15,6 @@
  */
 
 #include "fully_connected_kernel_yxfb_ref.h"
-#include "kernel_selector_utils.h"
 
 namespace kernel_selector
 {
@@ -40,8 +39,16 @@ namespace kernel_selector
 
     KernelsData FullyConnected_yxfb_ref::GetKernelsData(const Params& params, const optional_params& options) const
     {
-        return GetCommonKernelsData(params, options, DataLayout::yxfb,
-            { WeightsLayout::io, WeightsLayout::oi, WeightsLayout::oiyx, WeightsLayout::oyxi, WeightsLayout::iyxo, WeightsLayout::yxio }
-        );
+        KernelsData res = {};
+        for (size_t i = 0; i < autoTuneOptions.size(); i++)
+        {
+            KernelsData kd = GetTunedKernelsDataByIndex(params, options, DataLayout::yxfb,
+                { WeightsLayout::io, WeightsLayout::oi, WeightsLayout::oiyx, WeightsLayout::oyxi, WeightsLayout::iyxo, WeightsLayout::yxio }, DONT_USE_IF_HAVE_SOMETHING_ELSE, (int)i);
+            if (!kd.empty())
+            {
+                res.emplace_back(kd[0]);
+            }
+        }
+        return res;
     }
 }
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_yxfb_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_yxfb_ref.h
index c76e50b..1dcc5d0 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_yxfb_ref.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_yxfb_ref.h
@@ -26,6 +26,8 @@ namespace kernel_selector
         FullyConnected_yxfb_ref() : FullyConnectedKernelBase("fully_connected_gpu_yxfb_ref") {}
 
         KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+
+    protected:
         ParamsKey GetSupportedKey() const override;
     };
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.cpp
index e40848a..39d4817 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.cpp
@@ -80,7 +80,7 @@ namespace kernel_selector
jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; - FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, ROUND_ROBIN, true, !orgParams.bias.empty()); + FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, DEFAULT, true, !orgParams.bias.empty()); kernel.arguments.push_back({ ArgumentDescriptor::Types::INPUT, 1 }); kd.estimatedTime = runInfo.effiency; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.h index 7d1068b..c70293c 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.h @@ -26,6 +26,8 @@ namespace kernel_selector { FullyConnectedGradInputKernelRef() : FullyConnectedGradInputKernelBase("fully_connected_grad_input_gpu_ref") {} virtual ~FullyConnectedGradInputKernelRef() {} + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.cpp index 67328ac..3af05f9 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.cpp @@ -82,7 +82,7 @@ namespace kernel_selector auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; - FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, ROUND_ROBIN, true, !orgParams.bias.empty()); + FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, DEFAULT, true, !orgParams.bias.empty()); if (orgParams.use_momentum) { kernel.arguments.push_back({ ArgumentDescriptor::Types::PREV_WEIGHTS_GRADIENT, 0 }); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.h index 9291287..78bba9c 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.h @@ -26,6 +26,7 @@ namespace kernel_selector { FullyConnectedGradWeightsKernelRef() : FullyConnectedGradWeightsKernelBase("fully_connected_grad_weights_gpu_ref") {} virtual ~FullyConnectedGradWeightsKernelRef() {} + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.cpp new file mode 100644 index 0000000..e74eb7f --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.cpp @@ -0,0 +1,176 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "fused_conv_bn_scale_kernel_base.h" +#include "kernel_selector_utils.h" +#include "common_tools.h" + +namespace kernel_selector +{ + bool fused_conv_bn_scale_kernel_base::Validate(const Params& p, const optional_params& o) const + { + if (p.GetType() != KernelType::FUSED_CONV_BN_SCALE || + o.GetType() != KernelType::FUSED_CONV_BN_SCALE) + { + return false; + } + + const fused_conv_bn_scale_params& params = static_cast<const fused_conv_bn_scale_params&>(p); + const fused_conv_bn_scale_optional_params& optParams = static_cast<const fused_conv_bn_scale_optional_params&>(o); + + bool bSupportedWeightsLayout = false; + + for (WeightsLayout l : GetSupportedWeightLayouts(params)) + { + bSupportedWeightsLayout |= params.weights.GetLayout() == l; + } + + const bool bWeightsOK = bSupportedWeightsLayout || optParams.allowStaticInputReordering; + + return bWeightsOK; + } + + JitConstants fused_conv_bn_scale_kernel_base::GetJitConstants(const fused_conv_bn_scale_params& params, const DispatchData&) const + { + JitConstants mem_consts = WeightBiasKernelBase::GetJitConstants(params); + const auto& padding = params.padding; + const auto& input = params.inputs[0]; + + int64_t input_offset_with_padding = (int64_t)input.GetFirstElementOffset() - padding.x*input.X().pitch - input.Y().pitch*padding.y; + input_offset_with_padding = std::max(input_offset_with_padding, (int64_t)0); + + mem_consts.AddConstants({ + MakeJitConstant("STRIDE", params.stride), + MakeJitConstant("PADDING", params.padding), + MakeJitConstant("FILTER_ARRAY_NUM", params.split), + MakeJitConstant("DILATION", params.dilation), + MakeJitConstant("INPUT0_OFFSET_WITH_PADDING", input_offset_with_padding), + MakeJitConstant("EPSILON", params.epsilon) + }); + + if (params.fused_in_training) + mem_consts.AddConstant(MakeJitConstant("FUSED_TRAINING", 1)); + if (params.scale_bias) + mem_consts.AddConstant(MakeJitConstant("SCALE_BIAS_TERM", 1)); + + return mem_consts; + } + + bool fused_conv_bn_scale_kernel_base::CheckWorkGroups(const DispatchData& kd) + { + if (kd.gws0 == 0 || + kd.gws1 == 0 || + kd.gws2 == 0 || + kd.lws0 == 0 || + kd.lws1 == 0 || + kd.lws2 == 0) + { + return false; + } + + if ((kd.gws0 % kd.lws0) != 0 || + (kd.gws1 % kd.lws1) != 0 || + (kd.gws2 % kd.lws2) != 0) + { + return false; + } + + return true; + } + + fused_conv_bn_scale_kernel_base::DispatchData fused_conv_bn_scale_kernel_base::SetDefault(const fused_conv_bn_scale_params& params) const + { + DispatchData kd; + + const auto& out = params.output; + kd.fp16UnitUsed = out.GetDType() ==
Datatype::F16; + std::vector<size_t> global; + if (params.output.GetLayout() == DataLayout::bfyx || params.output.GetLayout() == DataLayout::byxf) + { + global = { out.X().v, out.Y().v, out.Feature().v*out.Batch().v }; + } + else + { + global = { out.Feature().v*out.Batch().v, out.X().v, out.Y().v }; + } + + auto local = GetOptimalLocalWorkGroupSizes(global); + + kd.gws0 = global[0]; + kd.gws1 = global[1]; + kd.gws2 = global[2]; + + kd.lws0 = local[0]; + kd.lws1 = local[1]; + kd.lws2 = local[2]; + + kd.effiency = DONT_USE_IF_HAVE_SOMETHING_ELSE; + return kd; + } + + KernelsData fused_conv_bn_scale_kernel_base::GetCommonKernelsData(const Params& params, const optional_params& options, float estimated_time) const + { + if (!Validate(params, options)) + { + return{}; + } + + KernelData kd = KernelData::Default<fused_conv_bn_scale_params>(params); + fused_conv_bn_scale_params& newParams = *static_cast<fused_conv_bn_scale_params*>(kd.params.get()); + + DispatchData runInfo = SetDefault(newParams); + + if (!CheckWorkGroups(runInfo)) + { + // Internal Error - wrong calculation of global/local work group sizes + return{}; + } + + bool succeed = UpdateWeightsParams( + newParams, + options, + GetSupportedWeightLayouts(newParams), + kd.weightsReorderParams); + + if (!succeed) + { + return{}; + } + + auto finalKernelName = GetKernelName(newParams); + auto cldnnJit = GetJitConstants(newParams, runInfo); + auto entryPoint = GetEntryPoint(finalKernelName, newParams.layerID, options); + auto jit = CreateJit(finalKernelName, cldnnJit, entryPoint); + + auto& kernel = kd.kernels[0]; + FillCLKernelData(kernel, runInfo, params.engineInfo, finalKernelName, jit, entryPoint, "", true, !newParams.bias.empty(), 1); + kernel.arguments.push_back({ ArgumentDescriptor::Types::SPLIT, 0 }); + uint32_t idx = 1; + kernel.arguments.push_back({ ArgumentDescriptor::Types::INPUT, idx++ }); + if (newParams.scale_bias) + kernel.arguments.push_back({ ArgumentDescriptor::Types::INPUT, idx++ }); + if (newParams.fused_in_training) + { + kernel.arguments.push_back({ ArgumentDescriptor::Types::INPUT, idx++ }); + kernel.arguments.push_back({ ArgumentDescriptor::Types::INPUT, idx++ }); + kernel.arguments.push_back({ ArgumentDescriptor::Types::INPUT, idx }); + } + + kd.estimatedTime = estimated_time; + + return{ kd }; + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.h new file mode 100644 index 0000000..cdd8878 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.h @@ -0,0 +1,81 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
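GetCommonKernelsData() above appends the fused batch-norm/scale buffers after the fixed convolution arguments, so the OpenCL argument indices shift depending on which optional features are enabled. A sketch of that index bookkeeping with simplified types (ArgType, BuildExtraArgs and the per-buffer meanings in the comments are assumptions, not the real clDNN ArgumentDescriptor):

    // Hypothetical mock of the argument-index assembly shown above.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    enum class ArgType { Split, Input };
    struct Arg { ArgType type; uint32_t index; };

    std::vector<Arg> BuildExtraArgs(bool scale_bias, bool fused_in_training)
    {
        std::vector<Arg> args;
        args.push_back({ ArgType::Split, 0 });
        uint32_t idx = 1;                           // input 0 is the conv input
        args.push_back({ ArgType::Input, idx++ });  // scale (presumably)
        if (scale_bias)
            args.push_back({ ArgType::Input, idx++ });  // scale bias
        if (fused_in_training)
        {
            // three extra training-only buffers, indices keep advancing
            args.push_back({ ArgType::Input, idx++ });
            args.push_back({ ArgType::Input, idx++ });
            args.push_back({ ArgType::Input, idx });
        }
        return args;
    }

    int main()
    {
        auto args = BuildExtraArgs(true, true);
        std::printf("extra args: %zu\n", args.size()); // extra args: 6
    }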
+*/ + +#pragma once + +#include "weight_bias_kernel_base.h" +#include "actual_kernels/convolution/convolution_params.h" + +namespace kernel_selector +{ + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // fused_conv_bn_scale_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct fused_conv_bn_scale_params : public weight_bias_params + { + fused_conv_bn_scale_params() : weight_bias_params(KernelType::FUSED_CONV_BN_SCALE) {} + + uSize filterSize; + uSize stride; + uSize dilation; + uSize padding; + uint32_t split = 1; + bool fused_in_training = false; + bool scale_bias = false; + float epsilon = 0.00001f; + + ParamsKey GetParamsKey() const override + { + ParamsKey k = weight_bias_params::GetParamsKey(); + + if (split > 1) + { + k.EnableSplitSupport(); + } + + return k; + } + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // fused_conv_bn_scale_optional_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct fused_conv_bn_scale_optional_params : weight_bias_optional_params + { + fused_conv_bn_scale_optional_params() : weight_bias_optional_params(KernelType::FUSED_CONV_BN_SCALE) {} + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // fused_conv_bn_scale_kernel_base + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + class fused_conv_bn_scale_kernel_base : public WeightBiasKernelBase + { + public: + using WeightBiasKernelBase::WeightBiasKernelBase; + virtual ~fused_conv_bn_scale_kernel_base() {} + + using DispatchData = CommonDispatchData; + + protected: + virtual std::vector GetSupportedWeightLayouts(const fused_conv_bn_scale_params&) const = 0; + virtual std::string GetKernelName(const fused_conv_bn_scale_params&) const { return kernelName; } + virtual bool Validate(const Params& p, const optional_params& o) const override; + virtual JitConstants GetJitConstants(const fused_conv_bn_scale_params& params, const DispatchData& kd) const; + virtual DispatchData SetDefault(const fused_conv_bn_scale_params& params) const; + static bool CheckWorkGroups(const DispatchData&); + KernelsData GetCommonKernelsData(const Params& params, const optional_params& options, float estimated_time) const; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.cpp new file mode 100644 index 0000000..e3317bf --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.cpp @@ -0,0 +1,74 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "fused_conv_bn_scale_kernel_ref.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + + ParamsKey fused_conv_bn_scale_kernel_ref::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::F32); + k.EnableInputWeightsType(WeightsType::F32); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBiasPerFeature(); + k.EnableNonBiasTerm(); + k.EnableSplitSupport(); + k.EnableBatching(); + k.DisableTuning(); + return k; + } + + fused_conv_bn_scale_kernel_base::DispatchData fused_conv_bn_scale_kernel_ref::SetDefault(const fused_conv_bn_scale_params& arg) const + { + DispatchData runInfo = fused_conv_bn_scale_kernel_base::SetDefault(arg); + + runInfo.effiency = DONT_USE_IF_HAVE_SOMETHING_ELSE; + + runInfo.gws0 = arg.output.Batch().v; + runInfo.gws1 = arg.output.Feature().v; + runInfo.gws2 = 1; + + runInfo.lws0 = std::min(std::max(runInfo.gws0, static_cast<size_t>(1)), static_cast<size_t>(32)); + while (runInfo.gws0 % runInfo.lws0 != 0) + { + --runInfo.lws0; + } + runInfo.lws1 = 1; + runInfo.lws2 = 1; + + return runInfo; + } + + JitConstants fused_conv_bn_scale_kernel_ref::GetJitConstants(const fused_conv_bn_scale_params& params, const DispatchData& runInfo) const + { + auto jit = Parent::GetJitConstants(params, runInfo); + + return jit; + } + + KernelsData fused_conv_bn_scale_kernel_ref::GetKernelsData(const Params& params, const optional_params& options) const + { + KernelsData kd = GetCommonKernelsData(params, options, DONT_USE_IF_HAVE_SOMETHING_ELSE); + + return kd; + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.h new file mode 100644 index 0000000..fc36068 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.h @@ -0,0 +1,44 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
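The reference kernel's SetDefault() above picks the batch-axis local size by clamping gws0 into [1, 32] and then decrementing until it divides gws0 evenly, i.e. it finds the largest divisor of gws0 not exceeding 32. The same search as a tiny self-contained helper (LargestDivisorUpTo is a hypothetical name):

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    size_t LargestDivisorUpTo(size_t gws, size_t cap)
    {
        size_t lws = std::min(std::max(gws, (size_t)1), cap);
        while (gws % lws != 0)
            --lws;          // terminates: 1 always divides gws
        return lws;
    }

    int main()
    {
        std::printf("%zu\n", LargestDivisorUpTo(50, 32)); // 25
        std::printf("%zu\n", LargestDivisorUpTo(64, 32)); // 32
    }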
+*/ + +#pragma once + +#include "fused_conv_bn_scale_kernel_base.h" + +namespace kernel_selector { + + class fused_conv_bn_scale_kernel_ref : public fused_conv_bn_scale_kernel_base + { + public: + using Parent = fused_conv_bn_scale_kernel_base; + + fused_conv_bn_scale_kernel_ref() : fused_conv_bn_scale_kernel_base("fused_conv_bn_scale_kernel_ref") {} + virtual ~fused_conv_bn_scale_kernel_ref() {} + + KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: + ParamsKey GetSupportedKey() const override; + std::vector GetSupportedWeightLayouts(const fused_conv_bn_scale_params&) const override + { + return{ + WeightsLayout::oiyx, + }; + } + DispatchData SetDefault(const fused_conv_bn_scale_params& arg) const override; + JitConstants GetJitConstants(const fused_conv_bn_scale_params& params, const DispatchData& kd) const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT2_B32_B64.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.cpp similarity index 58% rename from inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT2_B32_B64.cpp rename to inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.cpp index 08b1953..f51cdc1 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT2_B32_B64.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.cpp @@ -14,16 +14,18 @@ // limitations under the License. */ -#include "auto_tuner.h" -#include "auto_tuner_offline.h" +#include "fused_conv_bn_scale_kernel_selector.h" +#include "fused_conv_bn_scale_kernel_ref.h" + namespace kernel_selector { - //SKL GT2 - void tuning_cache_1912_B32_B64(tuning_data& td) + fused_conv_bn_scale_kernel_selector::fused_conv_bn_scale_kernel_selector() { - td.td.insert({ - - { "9500850790449116723", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 16) }, - }); + Attach(); + } + + KernelsData fused_conv_bn_scale_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const + { + return GetNaiveBestKernel(params, options, KernelType::FUSED_CONV_BN_SCALE); } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.h new file mode 100644 index 0000000..2b63db7 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.h @@ -0,0 +1,37 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector +{ + class fused_conv_bn_scale_kernel_selector : public kernel_selector_base + { + public: + static fused_conv_bn_scale_kernel_selector &Instance() { + static fused_conv_bn_scale_kernel_selector instance_; + return instance_; + } + + fused_conv_bn_scale_kernel_selector(); + + virtual ~fused_conv_bn_scale_kernel_selector() {} + + KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; + }; +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.cpp new file mode 100644 index 0000000..3ac4e9e --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.cpp @@ -0,0 +1,464 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "fused_conv_eltwise_kernel_base.h" +#include "kernel_selector_utils.h" +#include "common_tools.h" + +namespace kernel_selector +{ + std::string fused_conv_eltwise_params::to_string() const + { + std::stringstream s; + + s << base_params::to_string() << "_"; + if (bias.empty()) + { + s << "no_bias" << "_"; + } + else + { + s << "bias_" << bias[0].PhysicalSize() << "_"; + } + + s << conv.filterSize.x << "_" << conv.filterSize.y << "_"; + s << conv.stride.x << "_" << conv.stride.y << "_"; + s << conv.dilation.x << "_" << conv.dilation.y << "_"; + s << conv.padding.x << "_" << conv.padding.y << "_"; + s << conv.split; + + return s.str(); + } + + ParamsKey fused_conv_eltwise_params::GetParamsKey() const + { + ParamsKey k = weight_bias_params::GetParamsKey(); + + if (conv.split > 1) + { + k.EnableFusedConvEltwSplitSupport(); + } + + if (conv.dilation.x != 1 || + conv.dilation.y != 1) + { + k.EnableFusedConvEltwDilation(); + } + + if (conv.depthwise_separable_opt) + { + k.EnableFusedConvEltwDepthwiseSeparableOpt(); + } + + if (conv.transposed) + { + k.EnableFusedConvEltwTranspose(); + } + + if (conv.int8_quantization) + { + k.EnableFusedConvEltwInt8Quantization(); + } + + if (conv.output_calibration) + { + k.EnableFusedConvEltwOutputCalibration(); + } + + if (conv.local_convolution) + { + k.EnableFusedConvEltwLocalConvolution(); + } + + if (second_input_in_output) + { + k.EnableFusedConvEltwiseRWOutOpt(); + } + + return k; + } + + bool fused_conv_eltwise_kernel_base::Validate(const Params& p, const optional_params& o) const + { + if (p.GetType() != KernelType::FUSED_CONV_ELTWISE || + o.GetType() != KernelType::FUSED_CONV_ELTWISE) + { + return false; + } + + const fused_conv_eltwise_params& params = static_cast(p); + const fused_conv_eltwise_optional_params& optParams = static_cast(o); + + bool bSupportedWeightsLayout = false; + + for (WeightsLayout l : GetSupportedWeightLayouts(params)) 
+ { + bSupportedWeightsLayout |= params.weights.GetLayout() == l; + } + + const bool bWeightsOK = bSupportedWeightsLayout || optParams.allowStaticInputReordering; + + if (!bWeightsOK) + { + return false; + } + + return true; + } + + JitConstants fused_conv_eltwise_kernel_base::GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const + { + JitConstants mem_consts = WeightBiasKernelBase::GetJitConstants(params); + const auto& padding = params.conv.padding; + const auto& input = params.inputs[0]; + + int64_t input_offset_with_padding = (int64_t)input.GetFirstElementOffset() - padding.x*input.X().pitch - input.Y().pitch*padding.y; + input_offset_with_padding = std::max(input_offset_with_padding, (int64_t)0); + + mem_consts.AddConstants({ + MakeJitConstant("STRIDE", params.conv.stride), + MakeJitConstant("PADDING", params.conv.padding), + MakeJitConstant("DILATION", params.conv.dilation), + MakeJitConstant("FILTER_ARRAY_NUM", params.conv.split), + MakeJitConstant("INPUT0_OFFSET_WITH_PADDING", input_offset_with_padding), + MakeJitConstant("DEPTHWISE_SEPARABLE_OPT", params.conv.depthwise_separable_opt), + MakeJitConstant("QUANTIZATION_TERM", params.conv.int8_quantization), + }); + + if (params.conv.int8_quantization) + { + mem_consts.AddConstants({ MakeJitConstant("W_QF", params.conv.weights_quantization_factors[0]) }); + mem_consts.AddConstants({ MakeJitConstant("I_QF",params.conv.input_quantization_factor) }); + + if (params.conv.output_calibration) + { + mem_consts.AddConstant(MakeJitConstant("CALIBRATION_TERM", params.conv.output_calibration)); + mem_consts.AddConstant(MakeJitConstant("O_QF", params.conv.output_calibration_factors[0])); + + } + else + mem_consts.AddConstants({ MakeJitConstant("O_QF", params.conv.output_quantization_factor) }); + } + + if (params.conv.local_convolution) + { + mem_consts.AddConstants({ MakeJitConstant("LOCAL_CONVOLUTION", params.conv.local_convolution) }); + } + + JitConstants eltw_activations = MakeActivationJitConstants(params.eltw.activation, "_ELTW"); + mem_consts.Merge(eltw_activations); + + mem_consts.AddConstant(MakeJitConstant("IN_OUT_OPT", params.second_input_in_output ? 
1 : 0)); + + std::vector<uint32_t> unrollLoopParams{ + params.conv.filterSize.x, + params.conv.filterSize.y, + (uint32_t)kd.gemmStyle.globalWorkSizeDX, + (uint32_t)kd.gemmStyle.globalWorkSizeDY, + (uint32_t)kd.gemmStyle.globalWorkSizeDZ, + (uint32_t)kd.gemmStyle.subBlockDimM, + (uint32_t)kd.gemmStyle.subBlockDimK, + (uint32_t)kd.gemmStyle.subBlockDimN + }; + + auto loopCount = *std::max_element(unrollLoopParams.begin(), unrollLoopParams.end()); + + JitConstants mem_consts_loop = MakeLoopUnrollParamsJitConstants(loopCount); + mem_consts.Merge(mem_consts_loop); + + return mem_consts; + } + + bool fused_conv_eltwise_kernel_base::CheckWorkGroups(const fused_conv_eltwise_kernel_base::DispatchData& kd) + { + if (kd.gws0 == 0 || + kd.gws1 == 0 || + kd.gws2 == 0 || + kd.lws0 == 0 || + kd.lws1 == 0 || + kd.lws2 == 0) + { + return false; + } + + if ((kd.gws0 % kd.lws0) != 0 || + (kd.gws1 % kd.lws1) != 0 || + (kd.gws2 % kd.lws2) != 0) + { + return false; + } + + return true; + } + + namespace + { + bool CheckTensorForSplit(const DataTensor& t, uint32_t split) + { + if (t.PitchesDifferFromLogicalDims()) + { + auto feature = t.Feature(); + auto featureIndex = DataTensor::Channelndex(t.GetLayout(), Tensor::DataChannelName::FEATURE); + if (featureIndex >= 0 && featureIndex+1 < (int)DataTensor::ChannelsCount(t.GetLayout())) + { + if (feature.v*split <= t.GetDims()[featureIndex+1].pitch) + { + Tensor::NDims newDims = t.GetDims(); + newDims[featureIndex].v = feature.v*split; + + DataTensor newTensor{ newDims, t.GetDType(), t.GetLayout(), t.GetViewOffset(), t.PhysicalSize(), t.GetPaddedVal()}; + + if (newTensor.PitchesDifferFromLogicalDims() == false) + { + return true; + } + } + } + + return false; + } + + return true; + } + } + + bool fused_conv_eltwise_kernel_base::CheckPitchForSplitOnly(const fused_conv_eltwise_params& params) + { + // TODO: it's better to add pitch+offset support than handle this case + return CheckTensorForSplit(params.inputs[0], params.conv.split); + } + + fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_base::SetDefault(const fused_conv_eltwise_params& params, int) const + { + DispatchData kd; + + const auto& out = params.output; + kd.fp16UnitUsed = out.GetDType() == Datatype::F16; + std::vector<size_t> global; + if (params.output.GetLayout() == DataLayout::bfyx || params.output.GetLayout() == DataLayout::byxf) + { + global = { out.X().v, out.Y().v, out.Feature().v*out.Batch().v }; + } + else + { + global = { out.Feature().v*out.Batch().v, out.X().v, out.Y().v }; + } + + auto local = GetOptimalLocalWorkGroupSizes(global); + + kd.gws0 = global[0]; + kd.gws1 = global[1]; + kd.gws2 = global[2]; + + kd.lws0 = local[0]; + kd.lws1 = local[1]; + kd.lws2 = local[2]; + + kd.cldnnStyle.blockWidth = 1; + kd.cldnnStyle.blockHeight = 1; + kd.cldnnStyle.prefetch = 0; + kd.cldnnStyle.inputBlockArraySize = 0; + kd.cldnnStyle.inputBlockWidth = 0; + + kd.gemmStyle.globalWorkSizeDX = 1; + kd.gemmStyle.globalWorkSizeDY = 1; + kd.gemmStyle.globalWorkSizeDZ = 1; + kd.gemmStyle.subBlockDimK = 1; + kd.gemmStyle.subBlockDimM = 0; + kd.gemmStyle.subBlockDimN = 0; + kd.effiency = DONT_USE_IF_HAVE_SOMETHING_ELSE; + return kd; + } + + KernelsData fused_conv_eltwise_kernel_base::GetCommonKernelsData(const Params& params, const optional_params& options, const std::string exeMode, int autoTuneIndex) const + { + if (!Validate(params, options)) + { + return{}; + } + + KernelData kd = KernelData::Default<fused_conv_eltwise_params>(params); + fused_conv_eltwise_params& newParams = *static_cast<fused_conv_eltwise_params*>(kd.params.get()); + + if (NeedPaddedInput()) + {
+ kd.reorderInput = CovolutionUpdateInputParams(newParams); + } + DispatchData runInfo = SetDefault(newParams, autoTuneIndex); + + if (!CheckWorkGroups(runInfo)) + { + // Internal Error - wrong calculation of global/local work group sizes + return{}; + } + + bool succeed = UpdateWeightsParams( + newParams, + options, + GetSupportedWeightLayouts(newParams), + kd.weightsReorderParams); + + if (!succeed) + { + return{}; + } + + auto finalKernelName = GetKernelName(newParams); + auto cldnnJit = GetJitConstants(newParams, runInfo); + auto entryPoint = GetEntryPoint(finalKernelName, newParams.layerID, options); + auto jit = CreateJit(finalKernelName, cldnnJit, entryPoint); + + auto& kernel = kd.kernels[0]; + FillCLKernelData(kernel, runInfo, params.engineInfo, finalKernelName, jit, entryPoint, exeMode, true, !newParams.bias.empty(), 1, newParams.conv.int8_quantization, newParams.conv.output_calibration); + kernel.arguments.push_back({ ArgumentDescriptor::Types::SPLIT, 0 }); + // eltwise's second input + if(newParams.second_input_in_output) + { + kernel.arguments.push_back({ ArgumentDescriptor::Types::OUTPUT, 0 }); + } + else + { + kernel.arguments.push_back({ ArgumentDescriptor::Types::INPUT, 1 }); + } + if (!newParams.eltw.output_calibration_factors.empty()) + kernel.arguments.push_back({ArgumentDescriptor::Types::OUTPUT_CALIBRATION_FACTORS, 1}); + + kd.estimatedTime = runInfo.effiency; + kd.autoTuneIndex = autoTuneIndex; + + return{ kd }; + } + + std::string fused_conv_eltwise_kernel_base::GetAutoTuneOptions(int autoTuneIndex) const + { + if ((autoTuneIndex >= 0) && (autoTuneIndex < (int)autoTuneOptions.size())) + { + return autoTuneOptions[autoTuneIndex]; + } + + return DEFAULT; + } + + KernelsData fused_conv_eltwise_kernel_base::GetTunedKernelsDataByIndex(const Params& params, const optional_params& options, const int autoTuneIndex) const + { + return GetCommonKernelsData(params, options, GetAutoTuneOptions(autoTuneIndex), autoTuneIndex); + } + + KernelsData fused_conv_eltwise_kernel_base::GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const + { + if (!Validate(params, options)) + { + return{}; + } + + KernelsData res = {}; + + for (size_t i = 0; i < autoTuneOptions.size(); i++) + { + KernelsData kd = GetTunedKernelsDataByIndex(params, options, (int)i); + if (!kd.empty()) + { + res.emplace_back(kd[0]); + } + } + + return res; + } + + static DataTensor GetConvolutionBFYXPaddedTensor(const fused_conv_eltwise_params& cp) + { + assert(cp.inputs[0].GetDims().size() == 4U); + + DataTensor t = cp.inputs[0]; + std::vector<Tensor::Pad> pad{ { 0,0 },{ 0,0 },{ 0,0 },{ 0,0 } }; + + auto& conv = cp.conv; + + pad[0].before = conv.padding.x; + pad[1].before = conv.padding.y; + + + const auto inputLimitX = (cp.output.X().v - 1) * conv.stride.x + (conv.filterSize.x - 1) * conv.dilation.x + 1; + const auto inputLimitY = (cp.output.Y().v - 1) * conv.stride.y + (conv.filterSize.y - 1) * conv.dilation.y + 1; + + pad[0].after = (size_t)std::max((int)inputLimitX - (int)t.X().v - (int)pad[0].before, (int)0); + pad[1].after = (size_t)std::max((int)inputLimitY - (int)t.Y().v - (int)pad[1].before, (int)0); + + Tensor::NDims dims(4); + const Tensor::NDims& orgDims = cp.inputs[0].GetDims(); + size_t pitch = 1; + for (size_t i = 0; i < dims.size(); i++) + { + dims[i].pad = pad[i]; + dims[i].v = orgDims[i].v; + dims[i].pitch = pitch; + pitch *= dims[i].LogicalDimPadded(); + } + + return{ dims, t.GetDType(), t.GetLayout() }; + } + + bool CheckConvolutionPaddedInputDesc(const fused_conv_eltwise_params&
params, const DataTensor& reqDesc) + { + bool properPadding = + reqDesc.X().pad.before <= params.inputs[0].X().pad.before && + reqDesc.Y().pad.before <= params.inputs[0].Y().pad.before && + reqDesc.Feature().pad.before <= params.inputs[0].Feature().pad.before && + reqDesc.Batch().pad.before <= params.inputs[0].Batch().pad.before; + + properPadding &= + reqDesc.X().pad.after <= params.inputs[0].X().pad.after && + reqDesc.Y().pad.after <= params.inputs[0].Y().pad.after && + reqDesc.Feature().pad.after <= params.inputs[0].Feature().pad.after && + reqDesc.Batch().pad.after <= params.inputs[0].Batch().pad.after; + + properPadding &= ((params.conv.padding.x == 0 && params.conv.padding.y == 0) || params.inputs[0].GetPaddedVal() == 0.f); + + return properPadding; + } + + bool CovolutionUpdateInputParams(fused_conv_eltwise_params& params) + { + const auto req_input = GetConvolutionBFYXPaddedTensor(params); + const bool bProperInputDesc = CheckConvolutionPaddedInputDesc(params, req_input); + + if (!bProperInputDesc) + { + params.inputs[0] = req_input; + return true; + } + + return false; + } + + bool FusedConvolutionEltwiseCheckInput(const Params& p, const optional_params& o) + { + const fused_conv_eltwise_params& params = static_cast<const fused_conv_eltwise_params&>(p); + const fused_conv_eltwise_optional_params& optParams = static_cast<const fused_conv_eltwise_optional_params&>(o); + + const auto req_input = GetConvolutionBFYXPaddedTensor(params); + const bool bProperInputDesc = CheckConvolutionPaddedInputDesc(params, req_input); + const bool bInputPadded = optParams.allowInputReordering || bProperInputDesc; + + if (!bInputPadded) + { + return false; + } + + return true; + } + +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.h new file mode 100644 index 0000000..1bdebab --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.h @@ -0,0 +1,138 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
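GetConvolutionBFYXPaddedTensor() above (and get_bfyx_req_input_block_dims() later in this patch) lean on the same arithmetic: a convolution producing `out` elements along an axis reads (out - 1) * stride + (filter - 1) * dilation + 1 input elements, and whatever the existing tensor plus its before-padding does not cover becomes after-padding. A small numeric sketch (function names are hypothetical):

    #include <algorithm>
    #include <cstdio>

    int RequiredInputExtent(int out, int stride, int filter, int dilation)
    {
        // last read = (out-1)*stride + (filter-1)*dilation, plus one element
        return (out - 1) * stride + (filter - 1) * dilation + 1;
    }

    int AfterPad(int limit, int inputSize, int beforePad)
    {
        return std::max(limit - inputSize - beforePad, 0);
    }

    int main()
    {
        // 112 outputs, stride 2, 3x3 filter, dilation 1, 224 inputs, pad 1
        int limit = RequiredInputExtent(112, 2, 3, 1);           // 225
        std::printf("after-pad: %d\n", AfterPad(limit, 224, 1)); // 0
    }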
+*/ + +#pragma once + +#include "weight_bias_kernel_base.h" +#include "actual_kernels/convolution/convolution_params.h" +#include "actual_kernels/eltwise/eltwise_kernel_base.h" + +namespace kernel_selector +{ + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // convolution_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct fused_conv_eltwise_params : public weight_bias_params + { + fused_conv_eltwise_params() : weight_bias_params(KernelType::FUSED_CONV_ELTWISE) {} + + struct conv_data + { + uSize filterSize; + uSize stride; + uSize dilation; + uSize padding; + uint32_t split = 1; + bool depthwise_separable_opt = false; + bool transposed = false; + bool int8_quantization = false; + bool output_calibration = false; + bool local_convolution = false; + float input_quantization_factor = 1.0f; + float output_quantization_factor = 1.0f; + MultiDataTensor weights_quantization_factors; + MultiDataTensor output_calibration_factors; + } conv; + + struct eltw_data + { + std::vector operations; + std::vector coefficients; + std::vector updateInputIds; + std::vector stride; + + bool layoutBased = false; + bool int8_quantization = false; + bool output_calibration = false; + float output_quantization_factor = 1.0f; + + MultiDataTensor output_calibration_factors; + + base_activation_params activation; + } eltw; + + bool second_input_in_output = false; + + virtual std::string to_string() const override; + virtual ParamsKey GetParamsKey() const override; + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // convolution_optional_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct fused_conv_eltwise_optional_params : weight_bias_optional_params + { + fused_conv_eltwise_optional_params() : weight_bias_optional_params(KernelType::FUSED_CONV_ELTWISE) {} + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // ConvolutionKernelBase + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + class fused_conv_eltwise_kernel_base : public WeightBiasKernelBase + { + public: + using WeightBiasKernelBase::WeightBiasKernelBase; + virtual ~fused_conv_eltwise_kernel_base() {} + + struct DispatchData : public CommonDispatchData + { + struct CLDNNStyle + { + size_t blockWidth, blockHeight; // used for kernels processing blocks + size_t prefetch; + size_t inputBlockArraySize; // Number of elements in array of UNIT_TYPE that must be specified in kernel to store/cache input block. + size_t inputBlockWidth; // Number of elements in X dimension stored/cached in input block. 
+ }; + + struct GEMMStyle + { + size_t subBlockDimM; + size_t subBlockDimK; + size_t subBlockDimN; + size_t globalWorkSizeDX; + size_t globalWorkSizeDY; + size_t globalWorkSizeDZ; + }; + + union + { + CLDNNStyle cldnnStyle; + GEMMStyle gemmStyle; + }; + }; + + std::string GetAutoTuneOptions(int autoTuneIndex) const; + std::vector<std::string> autoTuneOptions = { DEFAULT, NO_PRERA_SCH, AGE_BASED }; + virtual KernelsData GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const override; + virtual KernelsData GetTunedKernelsDataByIndex(const Params& params, const optional_params& options, int autoTuneIndex = -1) const override; + + protected: + virtual std::vector<WeightsLayout> GetSupportedWeightLayouts(const fused_conv_eltwise_params&) const = 0; + virtual std::string GetKernelName(const fused_conv_eltwise_params&) const { return kernelName; } + virtual bool NeedPaddedInput() const { return false; } + virtual bool Validate(const Params& p, const optional_params& o) const override; + virtual JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const; + virtual DispatchData SetDefault(const fused_conv_eltwise_params& params, int autoTuneIndex = -1) const; + static bool CheckWorkGroups(const DispatchData&); + static bool CheckPitchForSplitOnly(const fused_conv_eltwise_params& params); + KernelsData GetCommonKernelsData(const Params& params, const optional_params& options, const std::string exeMode = DEFAULT, int autoTuneIndex = -1) const; + }; + + bool FusedConvolutionEltwiseCheckInput(const Params& p, const optional_params& o); + bool CheckConvolutionPaddedInputDesc(const fused_conv_eltwise_params& params, const DataTensor& reqDesc); + bool CovolutionUpdateInputParams(fused_conv_eltwise_params& params); + +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_1x1_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_1x1_opt.cpp new file mode 100644 index 0000000..8c68a9b --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_1x1_opt.cpp @@ -0,0 +1,194 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "fused_conv_eltwise_kernel_bfyx_1x1_opt.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + + ParamsKey fused_conv_eltwise_kernel_bfyx_1x1_opt::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::F32); + k.EnableInputWeightsType(WeightsType::F32); + k.EnableOutputDataType(Datatype::F32); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableSubGroup(); + //k.EnableSubGroupShort(); // we need it for FP16 only.
we check it on the Validate phase + k.EnableBiasPerFeature(); + k.EnableNonBiasTerm(); + k.EnableBatching(); + k.EnableFusedConvEltwSplitSupport(); + k.EnableFusedConvEltwiseRWOutOpt(); // data for second input are already in output + return k; + } + + struct block_params + { + int32_t out_width; + int32_t out_height; + int32_t out_depth; + }; + + static block_params get_out_block_size(const fused_conv_eltwise_params& p) + { + auto out_depth = 8; + + if (p.output.X().v == 7) + { + auto gws0 = p.output.X().v / 7; + auto gws1 = p.output.Y().v / 1; + auto gws2 = 2 * (p.output.Feature().v * p.output.Batch().v) / 8; // process 8 output channels per Workitem + + auto compute_units = p.engineInfo.computeUnitsCount; + auto total_threads = (gws0 * gws1 * gws2) / 64; + if (total_threads < compute_units) + { + out_depth /= 2; + total_threads *= 2; + } + if (total_threads < compute_units) + { + out_depth /= 2; + total_threads *= 2; + } + return { 7,1,out_depth }; + } + else if (p.output.X().v == 14) + return { 7,1,8 }; + else if (p.output.X().v == 28) + return { 7,2,4 }; + else if (p.output.X().v == 56) + return { 8,1,8 }; + + return { 1,1,1 }; + } + + std::string fused_conv_eltwise_kernel_bfyx_1x1_opt::GetKernelName(const fused_conv_eltwise_params& params) const + { + if (params.inputs[0].GetDType() == Datatype::F32) + { + return kernelName + "_fp32"; + } + else + { + return kernelName + "_fp16"; + } + } + + bool fused_conv_eltwise_kernel_bfyx_1x1_opt::Validate(const Params& p, const optional_params& o) const + { + if (!fused_conv_eltwise_kernel_base::Validate(p, o) || + !FusedConvolutionEltwiseCheckInput(p, o)) + { + return false; + } + + const fused_conv_eltwise_params& cp = static_cast<const fused_conv_eltwise_params&>(p); + + if (cp.conv.stride.x != 1 || cp.conv.stride.y != 1) + return false; + + if (cp.conv.filterSize.x != 1 || cp.conv.filterSize.y != 1) + return false; + + if (cp.output.Feature().v % 64 != 0) + return false; + + if (cp.conv.padding.x != 0 || cp.conv.padding.y != 0) + return false; + + // if block sizes are 1x1, then this algorithm is probably not the best + auto block = get_out_block_size(cp); + if (block.out_width == 1 && block.out_height == 1) + return false; + + if (cp.output.X().v % block.out_width != 0) + return false; + if (cp.output.Y().v % block.out_height != 0) + return false; + + return true; + } + + std::vector<WeightsLayout> fused_conv_eltwise_kernel_bfyx_1x1_opt::GetSupportedWeightLayouts(const fused_conv_eltwise_params& p) const + { + auto block = get_out_block_size(p); + if (block.out_depth == 8) + return { WeightsLayout::os_iyx_osv64 }; + if (block.out_depth == 4) + return { WeightsLayout::os_iyx_osv32 }; + if (block.out_depth == 2) + return { WeightsLayout::os_iyx_osv16 }; + else + return{ WeightsLayout::yxio }; + } + + fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_bfyx_1x1_opt::SetDefault(const fused_conv_eltwise_params& arg, int) const + { + DispatchData runInfo = Parent::SetDefault(arg); + + constexpr size_t sub_group_size = 8; + + runInfo.effiency = FORCE_PRIORITY_3; + + auto block = get_out_block_size(arg); + + runInfo.gws0 = arg.output.X().v / block.out_width; + runInfo.gws1 = arg.output.Y().v / block.out_height; + runInfo.gws2 = 2 * (arg.output.Feature().v * arg.output.Batch().v) / block.out_depth; // process 8 output channels per Workitem + + runInfo.lws0 = 1; + runInfo.lws1 = 1; + runInfo.lws2 = 2 * sub_group_size; + + return runInfo; + } + + JitConstants fused_conv_eltwise_kernel_bfyx_1x1_opt::GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData&
runInfo) const + { + auto jit = Parent::GetJitConstants(params, runInfo); + + auto block = get_out_block_size(params); + jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block.out_width)); + jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block.out_height)); + jit.AddConstant(MakeJitConstant("OUT_BLOCK_DEPTH", block.out_depth)); + + if (!params.eltw.stride.empty()) + { + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", params.eltw.stride[0].x)); + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", params.eltw.stride[0].y)); + } + else + { + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", 1)); + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", 1)); + } + + return jit; + } + + KernelsData fused_conv_eltwise_kernel_bfyx_1x1_opt::GetKernelsData(const Params& params, const optional_params& options) const + { + KernelsData kd = GetCommonKernelsData(params, options); + if (!kd.empty()) + kd[0].estimatedTime = FORCE_PRIORITY_1; + return kd; + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_1x1_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_1x1_opt.h new file mode 100644 index 0000000..688c8ed --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_1x1_opt.h @@ -0,0 +1,42 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
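get_out_block_size() in the 1x1-opt hunk above sizes the per-workitem output block from the output width, and for 7-wide outputs it halves the output depth (at most twice) whenever the launch would produce fewer thread groups than the device has compute units. A simplified sketch of that occupancy trade-off (PickOutDepth is a hypothetical condensation of the two unrolled if-blocks into a loop):

    #include <cstdio>

    int PickOutDepth(int totalThreads, int computeUnits)
    {
        int outDepth = 8;
        // halving the block depth doubles the number of workitems
        while (outDepth > 2 && totalThreads < computeUnits)
        {
            outDepth /= 2;
            totalThreads *= 2;
        }
        return outDepth;
    }

    int main()
    {
        std::printf("%d\n", PickOutDepth(12, 24)); // 4 (one halving)
        std::printf("%d\n", PickOutDepth(48, 24)); // 8 (already saturated)
    }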
+*/ + +#pragma once + +#include "fused_conv_eltwise_kernel_base.h" + +namespace kernel_selector { + + class fused_conv_eltwise_kernel_bfyx_1x1_opt : public fused_conv_eltwise_kernel_base + { + public: + using Parent = fused_conv_eltwise_kernel_base; + fused_conv_eltwise_kernel_bfyx_1x1_opt() : fused_conv_eltwise_kernel_base("fused_conv_eltwise_gpu_bfyx_1x1_opt") {} + + virtual ~fused_conv_eltwise_kernel_bfyx_1x1_opt() {} + + + protected: + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + virtual ParamsKey GetSupportedKey() const override; + std::vector GetSupportedWeightLayouts(const fused_conv_eltwise_params&) const override; + std::string GetKernelName(const fused_conv_eltwise_params& params) const override; + bool NeedPaddedInput() const override { return true; } + JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override; + bool Validate(const Params& p, const optional_params& o) const override; + DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_os_iyx_osv16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_os_iyx_osv16.cpp new file mode 100644 index 0000000..99a8c12 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_os_iyx_osv16.cpp @@ -0,0 +1,303 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "fused_conv_eltwise_kernel_bfyx_os_iyx_osv16.h" + +namespace kernel_selector +{ + // Sub-group size used by "kernel_name_bfyx_os_iyx_osv16" kernel. + constexpr size_t sub_group_size = 16; + + fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::fused_conv_eltwise_kernel_bfyx_os_iyx_osv16() : fused_conv_eltwise_kernel_base("fused_conv_eltwise_gpu_bfyx_os_iyx_osv16") + { + // Generate the dispatch options to the auto-tuner. 
+ std::vector<size_t> blockWidthSizes = { 1,2,4,5,6,8,10,12,14,16 }; + std::vector<size_t> blockHeightSizes = { 1,2,3,4,5 }; + std::vector<size_t> prefetchSizes = { 1,2,3,4,5,6,8,10 }; + std::vector<std::string> executionModes = fused_conv_eltwise_kernel_base::autoTuneOptions; + const size_t maxBlockSize = 60; + + for (auto executionMode : executionModes) + { + for (auto blockWidth : blockWidthSizes) + { + for (auto blockHeight : blockHeightSizes) + { + for (auto prefetch : prefetchSizes) + { + if (blockWidth * blockHeight <= maxBlockSize) + { + autoTuneOptions.emplace_back(AutoTuneOption{ blockWidth, blockHeight, prefetch, executionMode }); + } + } + } + } + } + } + + ParamsKey fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableInputWeightsType(WeightsType::F16); + k.EnableInputWeightsType(WeightsType::F32); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableSubGroup(); + k.EnableBiasPerFeature(); + k.EnableBiasPerOutput(); + k.EnableNonBiasTerm(); + k.EnableBatching(); + k.EnableFusedConvEltwSplitSupport(); + k.EnableFusedConvEltwDilation(); + k.EnableFusedConvEltwTranspose(); + k.EnableFusedConvEltwiseRWOutOpt(); // data for second input are already in output + return k; + } + + static std::pair<size_t, size_t> get_bfyx_req_input_block_dims( + size_t output_block_width, + size_t output_block_height, + const uSize& filter_size, + const uSize& stride, + const uSize& dilation, + size_t sg_size = 16, + size_t read_chunk_size = 8, + size_t min_read_size = 16) + { + assert(output_block_width > 0 && output_block_height > 0); + assert(stride.x > 0 && stride.y > 0); + assert(filter_size.x > 0 && filter_size.y > 0); + + // Number of elements in X dimension needed from input to compute output block without re-reading input. + size_t input_block_req_width = (output_block_width - 1) * stride.x + (filter_size.x - 1)*dilation.x + 1; + // Number of elements in Y dimension needed from input to compute output block without re-reading input. + size_t input_block_req_height = (output_block_height - 1) * stride.y + (filter_size.y - 1)*dilation.y + 1; + + // Required number of elements in X dimension rounded to nearest >= read chunk size. + size_t input_block_read_width = std::max(RoundUp(input_block_req_width, read_chunk_size), min_read_size); + // Number of sub-group-sized vectors of unit type needed to store input block.
+ size_t input_block_array_size = CeilDiv(input_block_req_height * input_block_read_width, sg_size); + + return std::make_pair(input_block_array_size, input_block_read_width); + } + + static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_t &block_x, size_t &block_y) + { + // how many elements we will compute in each dimension + size_t computed_x = Align(output_x, block_x); + size_t computed_y = Align(output_y, block_y); + // how many simds we need in each dimension + size_t simds_x = computed_x / block_x; + size_t simds_y = computed_y / block_y; + // how many unused values we have in each dimension + size_t unused_x = computed_x - output_x; + size_t unused_y = computed_y - output_y; + + block_x -= unused_x / simds_x; + block_y -= unused_y / simds_y; + } + + fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::AutoTuneOption fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::GetAutoTuneOptions(const Params& p, int autoTuneIndex) const + { + if ((autoTuneIndex >= 0) && (autoTuneIndex < (int)autoTuneOptions.size())) + { + return autoTuneOptions[autoTuneIndex]; + } + + AutoTuneOption option = { 0, 0, 0, DEFAULT }; + + const convolution_params& cp = static_cast<const convolution_params&>(p); + + if (cp.stride.x == 1 && cp.stride.y == 1) + { + if (cp.filterSize.x == 1 && cp.filterSize.y == 1) + { + option.blockWidth = 16; + option.blockHeight = 1; + option.prefetch = 4; + } + //if fewer than 16 values are required to compute one single row of output + //then each WI shall compute one single row to maximize reuse within SIMD subgroup (this gives very nice performance results) + else if (cp.output.X().v + (cp.filterSize.x - 1)*cp.dilation.x < sub_group_size) + { + option.blockWidth = cp.output.X().v; + option.blockHeight = 1; + option.prefetch = 4; + } + else if (cp.filterSize.x < 5 && cp.filterSize.y < 5) + { + option.blockWidth = sub_group_size - cp.filterSize.x + 1; + option.blockHeight = 2; + option.prefetch = 4; + } + else + { + option.blockWidth = 4; + option.blockHeight = 3; + option.prefetch = 4; + } + } + else if (cp.stride.x == 2 && cp.stride.y == 2) + { + option.blockWidth = 5; + option.blockHeight = 4; + option.prefetch = 4; + } + else + { + option.blockWidth = 4; + option.blockHeight = 3; + option.prefetch = 5; + //run_info.effiency = FORCE_PRIORITY_7; // GEMM is better + } + + // if this is not a 1x1 batch-1 case then shrink the filters; otherwise we're memory bound and it's best to use 16x1 block sizes + if (cp.filterSize.x != 1 || cp.filterSize.y != 1 || cp.output.Batch().v != 1) + { + shrink_blocks_to_output_size(cp.output.X().v, cp.output.Y().v, + option.blockWidth, option.blockHeight); + } + + return option; + } + + fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::SetDefault(const fused_conv_eltwise_params& cp, int autoTuneIndex) const + { + DispatchData runInfo = fused_conv_eltwise_kernel_base::SetDefault(cp); + + const auto of_maps = cp.output.Feature().v; + const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size); + + runInfo.effiency = FORCE_PRIORITY_3; + + auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex); + runInfo.cldnnStyle.blockWidth = tuneOptions.blockWidth; + runInfo.cldnnStyle.blockHeight = tuneOptions.blockHeight; + runInfo.cldnnStyle.prefetch = tuneOptions.prefetch; + + auto input_block_dims = get_bfyx_req_input_block_dims( + runInfo.cldnnStyle.blockWidth, + runInfo.cldnnStyle.blockHeight, + cp.conv.filterSize, + cp.conv.stride, + cp.conv.dilation, + sub_group_size, + runInfo.fp16UnitUsed ?
sub_group_size : sub_group_size / 2, + sub_group_size); + runInfo.cldnnStyle.inputBlockArraySize = input_block_dims.first; + runInfo.cldnnStyle.inputBlockWidth = input_block_dims.second; + + runInfo.gws0 = CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth); + runInfo.gws1 = CeilDiv(cp.output.Y().v, runInfo.cldnnStyle.blockHeight); + runInfo.gws2 = of_threads_per_batch * cp.output.Batch().v; + + runInfo.lws0 = 1; + runInfo.lws1 = 1; + runInfo.lws2 = sub_group_size; + + return runInfo; + } + + bool fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::Validate(const Params& p, const optional_params& o) const + { + if (!fused_conv_eltwise_kernel_base::Validate(p, o) || + !FusedConvolutionEltwiseCheckInput(p, o)) + { + return false; + } + + return true; + } + + JitConstants fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& runInfo) const + { + const auto of_maps = params.output.Feature().v; + const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size); + size_t leftovers = of_threads_per_batch - of_maps; + + auto jit = Parent::GetJitConstants(params, runInfo); + + jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2)); + jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", runInfo.cldnnStyle.blockWidth)); + jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", runInfo.cldnnStyle.blockHeight)); + jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", runInfo.cldnnStyle.inputBlockArraySize)); + jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", runInfo.cldnnStyle.inputBlockWidth)); + jit.AddConstant(MakeJitConstant("PREFETCH", runInfo.cldnnStyle.prefetch)); + + if (leftovers) + { + jit.AddConstant(MakeJitConstant("LEFTOVERS", leftovers)); + } + + if (!params.eltw.stride.empty()) + { + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", params.eltw.stride[0].x)); + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", params.eltw.stride[0].y)); + } + else + { + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", 1)); + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", 1)); + } + + return jit; + } + + std::vector<WeightsLayout> fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::GetSupportedWeightLayouts(const fused_conv_eltwise_params& params) const + { + if (!params.conv.transposed) + { + return{ WeightsLayout::os_iyx_osv16 }; + } + else + { + return{ WeightsLayout::os_iyx_osv16_rotate_180 }; + } + } + + KernelsData fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::GetKernelsData(const Params& params, const optional_params& options) const + { + return GetTunedKernelsDataByIndex(params, options); + } + + KernelsData fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const + { + if (!Validate(params, options)) + { + return{}; + } + + KernelsData res = {}; + + for (size_t i = 0; i < autoTuneOptions.size(); i++) + { + KernelsData kd = GetTunedKernelsDataByIndex(params, options, (int)i); + if (!kd.empty()) + { + res.emplace_back(kd[0]); + } + } + + return res; + } + +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_os_iyx_osv16.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_os_iyx_osv16.h new file mode 100644 index 0000000..9ded5dd --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_os_iyx_osv16.h @@ -0,0
+1,54 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "fused_conv_eltwise_kernel_base.h" + +namespace kernel_selector { + + class fused_conv_eltwise_kernel_bfyx_os_iyx_osv16 : public fused_conv_eltwise_kernel_base + { + public: + using Parent = fused_conv_eltwise_kernel_base; + fused_conv_eltwise_kernel_bfyx_os_iyx_osv16(); + virtual ~fused_conv_eltwise_kernel_bfyx_os_iyx_osv16() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + virtual KernelsData GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; + std::vector GetSupportedWeightLayouts(const fused_conv_eltwise_params&) const override; + JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override; + bool Validate(const Params& p, const optional_params& o) const override; + bool NeedPaddedInput() const override { return true; } + DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override; + + private: + struct AutoTuneOption + { + size_t blockWidth; + size_t blockHeight; + size_t prefetch; + std::string exeMode; + }; + + AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const; + + std::vector autoTuneOptions = {}; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.cpp new file mode 100644 index 0000000..fefe82b --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.cpp @@ -0,0 +1,164 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "fused_conv_eltwise_kernel_gemm.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + + ParamsKey fused_conv_eltwise_kernel_gemm::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableInputWeightsType(WeightsType::F16); + k.EnableInputWeightsType(WeightsType::F32); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableSubGroup(); + //k.EnableSubGroupShort(); // we need it for FP16 only. we check it on the Validate phase + k.EnableBiasPerFeature(); + k.EnableNonBiasTerm(); + k.EnableBatching(); + k.EnableFusedConvEltwSplitSupport(); + return k; + } + + std::string fused_conv_eltwise_kernel_gemm::GetKernelName(const fused_conv_eltwise_params& params) const + { + if (params.inputs[0].GetDType() == Datatype::F32) + { + return kernelName + "_fp32"; + } + else + { + return kernelName + "_fp16"; + } + } + + bool fused_conv_eltwise_kernel_gemm::Validate(const Params& p, const optional_params& o) const + { + if (!fused_conv_eltwise_kernel_base::Validate(p, o) || + !FusedConvolutionEltwiseCheckInput(p, o)) + { + return false; + } + + const convolution_params& cp = static_cast(p); + + // make sure it's 1x1 conv + if (cp.filterSize.x != 1 || cp.filterSize.y != 1) + return false; + + // make sure stride is 1x1 + if (cp.stride.x != 1 || cp.stride.y != 1) + return false; + + // input padding not supported + if (cp.inputs[0].X().pad.Total() != 0 || + cp.inputs[0].Y().pad.Total() != 0 || + cp.inputs[0].Feature().pad.Total() != 0 || + cp.inputs[0].Batch().pad.Total() != 0) + return false; + + // input and output spatial sizes must match + if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v)) + return false; + + return true; + } + + std::vector fused_conv_eltwise_kernel_gemm::GetSupportedWeightLayouts(const fused_conv_eltwise_params& params) const + { + if (params.inputs[0].GetDType() == Datatype::F16) + { + return{ WeightsLayout::iy_xs_os_xsv2_osv16__ao32 }; + } + else + { + return{ WeightsLayout::iy_xs_os_xsv2_osv8__ao32 }; + } + } + + fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_gemm::SetDefault(const fused_conv_eltwise_params& arg, int) const + { + DispatchData runInfo = Parent::SetDefault(arg); + + runInfo.lws0 = 1; + runInfo.lws2 = 1; + + if (arg.inputs[0].GetDType() == Datatype::F16) + { + runInfo.gemmStyle = { 1, arg.conv.filterSize.x, 32, 32, 1, 1 }; + runInfo.lws1 = 16; + runInfo.effiency = FORCE_PRIORITY_6; + } + else + { + runInfo.gemmStyle = { 2, arg.conv.filterSize.x, 32, 32, 2, 1 }; + runInfo.lws1 = 8; + runInfo.effiency = FORCE_PRIORITY_8; + } + + size_t sgemm_m = RoundUp(arg.output.X().v * arg.output.Y().v, runInfo.gemmStyle.subBlockDimM); + size_t sgemm_n = RoundUp(arg.output.Feature().v, runInfo.gemmStyle.subBlockDimN); + + runInfo.gws0 = RoundUp(CeilDiv(sgemm_n, runInfo.gemmStyle.globalWorkSizeDX), runInfo.lws0); + runInfo.gws1 = RoundUp(CeilDiv(sgemm_m, runInfo.gemmStyle.globalWorkSizeDY), runInfo.lws1); + runInfo.gws2 = arg.output.Batch().v; + + return runInfo; + } + + JitConstants fused_conv_eltwise_kernel_gemm::GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& runInfo) const + { + auto jit = Parent::GetJitConstants(params, runInfo); + + jit.AddConstants({ + MakeJitConstant("ALIGNED_OFM", 
RoundUp(params.output.Feature().v, runInfo.gemmStyle.subBlockDimN)), + MakeJitConstant("DX", runInfo.gemmStyle.globalWorkSizeDX), + MakeJitConstant("DY", runInfo.gemmStyle.globalWorkSizeDY), + MakeJitConstant("FILTER_SIZE_X_DIV2", params.conv.filterSize.x / 2), + MakeJitConstant("INPUT_BUFFER_WIDTH_PADDED", ""), // TODO: enable non padding path again + MakeJitConstant("INPUT_BUFFER_HEIGHT_PADDED", ""), + }); + + if (CeilDiv(RoundUp(params.output.X().v * params.output.Y().v, runInfo.gemmStyle.subBlockDimM), runInfo.gemmStyle.globalWorkSizeDY) % runInfo.lws1 != 0) + jit.AddConstant(MakeJitConstant("LEFTOVERS", 1)); + + if (!params.eltw.stride.empty()) + { + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", params.eltw.stride[0].x)); + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", params.eltw.stride[0].y)); + } + else + { + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", 1)); + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", 1)); + } + + return jit; + } + + KernelsData fused_conv_eltwise_kernel_gemm::GetKernelsData(const Params& params, const optional_params& options) const + { + return GetTunedKernelsDataByIndex(params, options); + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.h new file mode 100644 index 0000000..476d875 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.h @@ -0,0 +1,42 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#pragma once + +#include "fused_conv_eltwise_kernel_base.h" + +namespace kernel_selector { + + class fused_conv_eltwise_kernel_gemm : public fused_conv_eltwise_kernel_base + { + public: + using Parent = fused_conv_eltwise_kernel_base; + fused_conv_eltwise_kernel_gemm() : fused_conv_eltwise_kernel_base("fused_conv_eltwise_gpu_gemm") {} + + virtual ~fused_conv_eltwise_kernel_gemm() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; + std::vector GetSupportedWeightLayouts(const fused_conv_eltwise_params&) const override; + std::string GetKernelName(const fused_conv_eltwise_params& params) const override; + bool NeedPaddedInput() const override { return true; } + JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override; + bool Validate(const Params& p, const optional_params& o) const override; + DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp new file mode 100644 index 0000000..dd21850 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp @@ -0,0 +1,224 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + + static const size_t _SG_TILE_M = 32; + static const size_t _SG_TILE_N = 32; + static const size_t _SG_SIZE = 8; // sub group size + static const size_t _TILES_PER_SG_X = 1; // Persistent threads + static const size_t _TILES_PER_SG_Y = 1; // Persistent threads + + ParamsKey fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableInputWeightsType(WeightsType::INT8); + k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBiasPerFeature(); + k.EnableBatching(); + k.EnableFusedConvEltwInt8Quantization(); + k.EnableFusedConvEltwOutputCalibration(); + k.DisableTuning(); + k.EnableFusedConvEltwiseRWOutOpt(); + return k; + } + + bool fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8::Validate(const Params& p, const optional_params& o) const + { + if (!fused_conv_eltwise_kernel_base::Validate(p, o) || + !FusedConvolutionEltwiseCheckInput(p, o)) + { + return false; + } + + const fused_conv_eltwise_params& cp = static_cast(p); + + // make sure it's 1x1 conv + if (cp.conv.filterSize.x != 1 || cp.conv.filterSize.y != 1) + return false; + + // make sure stride is 1x1 + if (cp.conv.stride.x != 1 || cp.conv.stride.y != 1) + return false; + + // input padding not supported + if (cp.inputs[0].X().pad.Total() != 0 || + cp.inputs[0].Y().pad.Total() != 0 || + cp.inputs[0].Feature().pad.Total() != 0 || + cp.inputs[0].Batch().pad.Total() != 0) + return false; + + // input and output spatial sizes must match + if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v)) + return false; + + const auto m = cp.output.X().v * cp.output.Y().v * cp.output.Batch().v ; + const auto k = cp.inputs[0].Feature().v; + const auto n = cp.output.Feature().v ; + + if (m % 32 != 0 && m % 128 != 0) // Matrix size M, Must be mutliple of 32 and multiple of WG_TILE_M=128 + return false; + + if (k % 32 != 0) // Matrix size K, Must be mutliple of 32 + return false; + + if (n % 32 != 0 && n % 128 != 0) // Matrix size N, Must be mutliple of 32 and multiple of WG_TILE_N=128 + return false; + + return true; + } + + + fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8::SetDefault(const fused_conv_eltwise_params& arg, int) const + { + DispatchData runInfo = fused_conv_eltwise_kernel_base::SetDefault(arg); + + runInfo.effiency = FORCE_PRIORITY_1; + + size_t mat_m = arg.output.X().v * arg.output.Y().v * arg.output.Batch().v; + size_t mat_n = arg.output.Feature().v; + + size_t _MATRIX_M = mat_m; + size_t _MATRIX_N = mat_n; + + size_t _WG_TILE_M = 128; + size_t _WG_TILE_N = 128; + + // Calculate number of threads needed + const size_t threadsX = (_MATRIX_N / (_SG_TILE_N / _SG_SIZE)) / _TILES_PER_SG_X; + const size_t threadsY = (_MATRIX_M / _SG_TILE_M) / _TILES_PER_SG_Y ; + + // Define execution setup for kernel: + size_t globalWorkSize[3] = { threadsX, threadsY, 1 }; + size_t localWorkSize[3] = { _SG_SIZE * _WG_TILE_N / _SG_TILE_N, _WG_TILE_M / _SG_TILE_M, 1 }; + + runInfo.gws0 = globalWorkSize[0]; + runInfo.gws1 = globalWorkSize[1]; + runInfo.gws2 = globalWorkSize[2]; + + runInfo.lws0 = localWorkSize[0]; + runInfo.lws1 = localWorkSize[1]; + runInfo.lws2 = 
JitConstants fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8::GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& runInfo) const + { + auto jit = Parent::GetJitConstants(params, runInfo); + + jit.AddConstant(MakeJitConstant("WG_TILE_M", 128)); // Work-Group tile size M, must be a multiple of 32 + jit.AddConstant(MakeJitConstant("WG_TILE_N", 128)); // Work-Group tile size N, must be a multiple of 32 + jit.AddConstant(MakeJitConstant("TILES_PER_SG_X", 1)); // Persistent threads + jit.AddConstant(MakeJitConstant("TILES_PER_SG_Y", 1)); // Persistent threads + + // Do not change values below + jit.AddConstant(MakeJitConstant("DIM_X", 0)); + jit.AddConstant(MakeJitConstant("DIM_Y", 1)); + jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K", 32)); + jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K_BFLOAT", 16)); + jit.AddConstant(MakeJitConstant("SG_TILE_M", _SG_TILE_M)); + jit.AddConstant(MakeJitConstant("SG_TILE_N", _SG_TILE_N)); + jit.AddConstant(MakeJitConstant("SG_SIZE", _SG_SIZE)); + jit.AddConstant(MakeJitConstant("SIMD_LANE_M", "SG_TILE_M")); + jit.AddConstant(MakeJitConstant("SIMD_LANE_N", "(SG_TILE_N / SG_SIZE)")); + jit.AddConstant(MakeJitConstant("WG_SIZE", "(SG_SIZE * WG_TILE_N / SG_TILE_N) * (WG_TILE_M / SG_TILE_M)")); + + jit.AddConstant(MakeJitConstant("COMPILE_KERNELS", "")); + jit.AddConstant(MakeJitConstant("TILED_GLOBAL_LAYOUT", "")); + jit.AddConstant(MakeJitConstant("OUTPUT_TILED_GLOBAL_LAYOUT", "")); + + const auto& input = params.inputs[0]; + const auto& output = params.output; + + auto m = output.X().v * output.Y().v * output.Batch().v; + auto k = input.Feature().v; + auto n = output.Feature().v; + + jit.AddConstant(MakeJitConstant("MATRIX_M", m)); + jit.AddConstant(MakeJitConstant("MATRIX_K", k)); + jit.AddConstant(MakeJitConstant("MATRIX_N", n)); + + const size_t out_x_pitch = 32 * 4; + const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded(); + const size_t out_b_block_pitch = out_y_pitch * params.output.Y().LogicalDimPadded(); + const size_t out_f_block_pitch = out_b_block_pitch * ((params.output.Batch().v + 3) / 4); + const size_t out_offset = out_x_pitch * params.output.X().pad.before + out_y_pitch * params.output.Y().pad.before; + + jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch)); + jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch)); + jit.AddConstant(MakeJitConstant("OUT_B_BLOCK_PITCH", out_b_block_pitch)); + jit.AddConstant(MakeJitConstant("OUT_F_BLOCK_PITCH", out_f_block_pitch)); + jit.AddConstant(MakeJitConstant("OUT_OFFSET", out_offset)); + + bool out_padding = output.X().pad.Total() != 0 || output.Y().pad.Total() != 0; + jit.AddConstant(MakeJitConstant("OUT_WITH_PADDING", out_padding)); + + bool eltw_padding = false; + if (!params.second_input_in_output) + { + // for second input + const size_t in2_x_pitch = 32 * 4; + const size_t in2_y_pitch = 32 * 4 * params.inputs[1].X().LogicalDimPadded(); + const size_t in2_b_block_pitch = in2_y_pitch * params.inputs[1].Y().LogicalDimPadded(); + const size_t in2_f_block_pitch = in2_b_block_pitch * ((params.inputs[1].Batch().v + 3) / 4); + const size_t in2_offset = in2_x_pitch * params.inputs[1].X().pad.before + in2_y_pitch * params.inputs[1].Y().pad.before; + + jit.AddConstant(MakeJitConstant("IN2_X_PITCH", in2_x_pitch)); + jit.AddConstant(MakeJitConstant("IN2_Y_PITCH", in2_y_pitch)); + jit.AddConstant(MakeJitConstant("IN2_B_BLOCK_PITCH", in2_b_block_pitch)); + jit.AddConstant(MakeJitConstant("IN2_F_BLOCK_PITCH", in2_f_block_pitch)); + jit.AddConstant(MakeJitConstant("IN2_OFFSET", in2_offset)); + + eltw_padding = params.inputs[1].X().pad.Total() != 0 || params.inputs[1].Y().pad.Total() != 0; + } + else + { + eltw_padding = out_padding; + } + + jit.AddConstant(MakeJitConstant("ELTW_WITH_PADDING", eltw_padding)); + + if (!params.eltw.stride.empty()) + { + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", params.eltw.stride[0].x)); + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", params.eltw.stride[0].y)); + } + else + { + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", 1)); + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", 1)); + } + + return jit; + }
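The constants assembled above are the kernel's only view of shapes and layout; they typically reach the OpenCL source as preprocessor defines. A rough, hypothetical rendering step (not the actual MakeJitConstant implementation):

#include <sstream>
#include <string>
#include <utility>
#include <vector>

// Turns ("WG_TILE_M", "128")-style pairs into "#define" lines for the CL build.
std::string BuildDefines(const std::vector<std::pair<std::string, std::string>>& consts) {
    std::ostringstream os;
    for (const auto& c : consts)
        os << "#define " << c.first << ' ' << c.second << '\n';
    return os.str();
}
// e.g. BuildDefines({{"WG_TILE_M", "128"}, {"SG_SIZE", "8"}}) yields two #define lines.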
jit.AddConstant(MakeJitConstant("IN2_F_BLOCK_PITCH", in2_f_block_pitch)); + jit.AddConstant(MakeJitConstant("IN2_OFFSET", in2_offset)); + + eltw_padding = params.inputs[1].X().pad.Total() != 0 || params.inputs[1].Y().pad.Total() != 0;; + } + else + { + eltw_padding = out_padding; + } + + jit.AddConstant(MakeJitConstant("ELTW_WITH_PADDING", eltw_padding)); + + if (!params.eltw.stride.empty()) + { + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", params.eltw.stride[0].x)); + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", params.eltw.stride[0].y)); + } + else + { + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", 1)); + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", 1)); + } + + return jit; + } + + KernelsData fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8::GetKernelsData(const Params& params, const optional_params& options) const + { + KernelsData kd = GetCommonKernelsData(params, options); + if (!kd.empty()) + kd[0].estimatedTime = FORCE_PRIORITY_1; //_3 + return kd; + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.h new file mode 100644 index 0000000..331a50c --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.h @@ -0,0 +1,45 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp new file mode 100644 index 0000000..f3052eb --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp @@ -0,0 +1,224 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + + static const size_t _SG_TILE_M = 32; + static const size_t _SG_TILE_N = 32; + static const size_t _SG_SIZE = 8; // sub group size + static const size_t _TILES_PER_SG_X = 1; // Persistent threads + static const size_t _TILES_PER_SG_Y = 1; // Persistent threads + + ParamsKey fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableInputWeightsType(WeightsType::INT8); + k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBiasPerFeature(); + k.EnableBatching(); + k.EnableFusedConvEltwInt8Quantization(); + k.EnableFusedConvEltwOutputCalibration(); + k.DisableTuning(); + k.EnableFusedConvEltwiseRWOutOpt(); + return k; + } + + bool fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8::Validate(const Params& p, const optional_params& o) const + { + if (!fused_conv_eltwise_kernel_base::Validate(p, o) || + !FusedConvolutionEltwiseCheckInput(p, o)) + { + return false; + } + + const convolution_params& cp = static_cast<const convolution_params&>(p); + + // make sure it's 1x1 conv + if (cp.filterSize.x != 1 || cp.filterSize.y != 1) + return false; + + // make sure stride is 1x1 + if (cp.stride.x != 1 || cp.stride.y != 1) + return false; + + // input padding not supported + if (cp.inputs[0].X().pad.Total() != 0 || + cp.inputs[0].Y().pad.Total() != 0 || + cp.inputs[0].Feature().pad.Total() != 0 || + cp.inputs[0].Batch().pad.Total() != 0) + return false; + + // input and output spatial sizes must match + if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v)) + return false; + + const auto m = cp.output.X().v * cp.output.Y().v * cp.output.Batch().v; + const auto k = cp.inputs[0].Feature().v; + const auto n = cp.output.Feature().v; + + if (m % 32 != 0 && m % 224 != 0) // Matrix size M, must be a multiple of 32 or of WG_TILE_M=224 + return false; + + if (k % 32 != 0) // Matrix size K, must be a multiple of 32 + return false; + + if (n % 32 != 0 && n % 128 != 0) // Matrix size N, must be a multiple of 32 or of WG_TILE_N=128 + return false; + + return true; + }
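A worked example of the guards above (illustrative shapes): a 56x56 output with batch 4 and 256 features gives m = 56*56*4 = 12544, a multiple of both 32 and 224 (12544 = 224 * 56), so it passes; a 28x28 output with batch 1 gives m = 784, a multiple of neither, so Validate rejects it. Note the tests use &&, so a size satisfying either divisibility condition gets through.

#include <cstddef>

// Mirrors the M/K/N eligibility tests above (WG_TILE_M = 224, WG_TILE_N = 128).
bool SizesEligible(std::size_t m, std::size_t k, std::size_t n) {
    if (m % 32 != 0 && m % 224 != 0) return false; // multiple of neither -> reject
    if (k % 32 != 0) return false;
    if (n % 32 != 0 && n % 128 != 0) return false;
    return true;
}
// SizesEligible(56 * 56 * 4, 256, 256) -> true; SizesEligible(28 * 28, 256, 256) -> false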
+*/ + +#include "fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + + static const size_t _SG_TILE_M = 32; + static const size_t _SG_TILE_N = 32; + static const size_t _SG_SIZE = 8; // sub group size + static const size_t _TILES_PER_SG_X = 1; // Persistent threads + static const size_t _TILES_PER_SG_Y = 1; // Persistent threads + + ParamsKey fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableInputWeightsType(WeightsType::INT8); + k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBiasPerFeature(); + k.EnableBatching(); + k.EnableFusedConvEltwInt8Quantization(); + k.EnableFusedConvEltwOutputCalibration(); + k.DisableTuning(); + k.EnableFusedConvEltwiseRWOutOpt(); + return k; + } + + bool fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8::Validate(const Params& p, const optional_params& o) const + { + if (!fused_conv_eltwise_kernel_base::Validate(p, o) || + !FusedConvolutionEltwiseCheckInput(p, o)) + { + return false; + } + + const convolution_params& cp = static_cast(p); + + // make sure it's 1x1 conv + if (cp.filterSize.x != 1 || cp.filterSize.y != 1) + return false; + + // make sure stride is 1x1 + if (cp.stride.x != 1 || cp.stride.y != 1) + return false; + + // input padding not supported + if (cp.inputs[0].X().pad.Total() != 0 || + cp.inputs[0].Y().pad.Total() != 0 || + cp.inputs[0].Feature().pad.Total() != 0 || + cp.inputs[0].Batch().pad.Total() != 0) + return false; + + // input and output spatial sizes must match + if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v)) + return false; + + const auto m = cp.output.X().v * cp.output.Y().v * cp.output.Batch().v ; + const auto k = cp.inputs[0].Feature().v; + const auto n = cp.output.Feature().v ; + + if (m % 32 != 0 && m % 224 != 0) // Matrix size M, Must be mutliple of 32 and multiple of WG_TILE_M=128 + return false; + + if (k % 32 != 0) // Matrix size K, Must be mutliple of 32 + return false; + + if (n % 32 != 0 && n % 128 != 0) // Matrix size N, Must be mutliple of 32 and multiple of WG_TILE_N=128 + return false; + + return true; + } + + + fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8::SetDefault(const fused_conv_eltwise_params& arg, int) const + { + DispatchData runInfo = fused_conv_eltwise_kernel_base::SetDefault(arg); + + runInfo.effiency = FORCE_PRIORITY_1; + + size_t mat_m = arg.output.X().v * arg.output.Y().v * arg.output.Batch().v; + size_t mat_n = arg.output.Feature().v; + + size_t _MATRIX_M = mat_m; + size_t _MATRIX_N = mat_n; + + size_t _WG_TILE_M = 224; + size_t _WG_TILE_N = 128; + + // Calculate number of threads needed + const size_t threadsX = (_MATRIX_N / (_SG_TILE_N / _SG_SIZE)) / _TILES_PER_SG_X; + const size_t threadsY = (_MATRIX_M / _SG_TILE_M) / _TILES_PER_SG_Y ; + + // Define execution setup for kernel: + size_t globalWorkSize[3] = { threadsX, threadsY, 1 }; + size_t localWorkSize[3] = { _SG_SIZE * _WG_TILE_N / _SG_TILE_N, _WG_TILE_M / _SG_TILE_M, 1 }; + + runInfo.gws0 = globalWorkSize[0]; + runInfo.gws1 = globalWorkSize[1]; + runInfo.gws2 = globalWorkSize[2]; + + runInfo.lws0 = localWorkSize[0]; + runInfo.lws1 = localWorkSize[1]; + runInfo.lws2 = localWorkSize[2]; + + return 
JitConstants fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8::GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& runInfo) const + { + auto jit = Parent::GetJitConstants(params, runInfo); + + jit.AddConstant(MakeJitConstant("WG_TILE_M", 224)); // Work-Group tile size M, must be a multiple of 32 + jit.AddConstant(MakeJitConstant("WG_TILE_N", 128)); // Work-Group tile size N, must be a multiple of 32 + jit.AddConstant(MakeJitConstant("TILES_PER_SG_X", _TILES_PER_SG_X)); + jit.AddConstant(MakeJitConstant("TILES_PER_SG_Y", _TILES_PER_SG_Y)); + + // Do not change values below + jit.AddConstant(MakeJitConstant("DIM_X", 0)); + jit.AddConstant(MakeJitConstant("DIM_Y", 1)); + jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K", 32)); + jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K_BFLOAT", 16)); + jit.AddConstant(MakeJitConstant("SG_TILE_M", _SG_TILE_M)); + jit.AddConstant(MakeJitConstant("SG_TILE_N", _SG_TILE_N)); + jit.AddConstant(MakeJitConstant("SG_SIZE", _SG_SIZE)); + jit.AddConstant(MakeJitConstant("SIMD_LANE_M", "SG_TILE_M")); + jit.AddConstant(MakeJitConstant("SIMD_LANE_N", "(SG_TILE_N / SG_SIZE)")); + jit.AddConstant(MakeJitConstant("WG_SIZE", "(SG_SIZE * WG_TILE_N / SG_TILE_N) * (WG_TILE_M / SG_TILE_M)")); + + jit.AddConstant(MakeJitConstant("COMPILE_KERNELS", "")); + jit.AddConstant(MakeJitConstant("TILED_GLOBAL_LAYOUT", "")); + jit.AddConstant(MakeJitConstant("OUTPUT_TILED_GLOBAL_LAYOUT", "")); + + const auto& input = params.inputs[0]; + const auto& output = params.output; + + auto m = output.X().v * output.Y().v * output.Batch().v; + auto k = input.Feature().v; + auto n = output.Feature().v; + + jit.AddConstant(MakeJitConstant("MATRIX_M", m)); // Matrix size M, must be a multiple of 32 or of WG_TILE_M + jit.AddConstant(MakeJitConstant("MATRIX_K", k)); // Matrix size K, must be a multiple of 32 + jit.AddConstant(MakeJitConstant("MATRIX_N", n)); // Matrix size N, must be a multiple of 32 or of WG_TILE_N + + const size_t out_x_pitch = 32 * 4; + const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded(); + const size_t out_b_block_pitch = out_y_pitch * params.output.Y().LogicalDimPadded(); + const size_t out_f_block_pitch = out_b_block_pitch * ((params.output.Batch().v + 3) / 4); + const size_t out_offset = out_x_pitch * params.output.X().pad.before + out_y_pitch * params.output.Y().pad.before; + + jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch)); + jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch)); + jit.AddConstant(MakeJitConstant("OUT_B_BLOCK_PITCH", out_b_block_pitch)); + jit.AddConstant(MakeJitConstant("OUT_F_BLOCK_PITCH", out_f_block_pitch)); + jit.AddConstant(MakeJitConstant("OUT_OFFSET", out_offset)); + + bool out_padding = output.X().pad.Total() != 0 || output.Y().pad.Total() != 0; + jit.AddConstant(MakeJitConstant("OUT_WITH_PADDING", out_padding)); + + bool eltw_padding = false; + if (!params.second_input_in_output) + { + // for second input + const size_t in2_x_pitch = 32 * 4; + const size_t in2_y_pitch = 32 * 4 * params.inputs[1].X().LogicalDimPadded(); + const size_t in2_b_block_pitch = in2_y_pitch * params.inputs[1].Y().LogicalDimPadded(); + const size_t in2_f_block_pitch = in2_b_block_pitch * ((params.inputs[1].Batch().v + 3) / 4); + const size_t in2_offset = in2_x_pitch * params.inputs[1].X().pad.before + in2_y_pitch * params.inputs[1].Y().pad.before; + + jit.AddConstant(MakeJitConstant("IN2_X_PITCH", in2_x_pitch)); + jit.AddConstant(MakeJitConstant("IN2_Y_PITCH", in2_y_pitch)); + jit.AddConstant(MakeJitConstant("IN2_B_BLOCK_PITCH", in2_b_block_pitch)); + jit.AddConstant(MakeJitConstant("IN2_F_BLOCK_PITCH", in2_f_block_pitch)); + jit.AddConstant(MakeJitConstant("IN2_OFFSET", in2_offset)); + + eltw_padding = params.inputs[1].X().pad.Total() != 0 || params.inputs[1].Y().pad.Total() != 0; + } + else + { + eltw_padding = out_padding; + } + + jit.AddConstant(MakeJitConstant("ELTW_WITH_PADDING", eltw_padding)); + + if (!params.eltw.stride.empty()) + { + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", params.eltw.stride[0].x)); + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", params.eltw.stride[0].y)); + } + else + { + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", 1)); + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", 1)); + } + + return jit; + }
KernelsData fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8::GetKernelsData(const Params& params, const optional_params& options) const + { + KernelsData kd = GetCommonKernelsData(params, options); + if (!kd.empty()) + kd[0].estimatedTime = FORCE_PRIORITY_1; //_3 + return kd; + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h new file mode 100644 index 0000000..a5ca36c --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h @@ -0,0 +1,45 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#pragma once + +#include "fused_conv_eltwise_kernel_base.h" + +namespace kernel_selector { + + class fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8 : public fused_conv_eltwise_kernel_base + { + public: + using Parent = fused_conv_eltwise_kernel_base; + fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8() : fused_conv_eltwise_kernel_base("fused_conv_eltwise_gpu_mmad_32x32sg_224x128wg_slm_int8") {} + + virtual ~fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; + bool Validate(const Params& p, const optional_params& o) const override; + JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override; + DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override; + virtual std::vector GetSupportedWeightLayouts(const fused_conv_eltwise_params&) const override + { + return{ + WeightsLayout::is_o32_yx_isv32_swizzled_by_4, + }; + } + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_selector.cpp new file mode 100644 index 0000000..670fae8 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_selector.cpp @@ -0,0 +1,41 @@ +/* +// Copyright (c) 2016-2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "fused_conv_eltwise_kernel_selector.h" +#include "fused_conv_eltwise_kernel_gemm.h" +#include "fused_conv_eltwise_kernel_bfyx_1x1_opt.h" +#include "fused_conv_eltwise_kernel_bfyx_os_iyx_osv16.h" +#include "fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.h" +#include "fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h" +#include "fused_conv_eltwise_kernel_yxfb_yxio_b16.h" + +namespace kernel_selector +{ + fused_conv_eltwise_kernel_selector::fused_conv_eltwise_kernel_selector() + { +// Attach(); + Attach(); + Attach(); + Attach(); + Attach(); + Attach(); + } + + KernelsData fused_conv_eltwise_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const + { + return GetAutoTuneBestKernel(params, options, KernelType::FUSED_CONV_ELTWISE); + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_selector.h new file mode 100644 index 0000000..94225b8 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_selector.h @@ -0,0 +1,37 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector +{ + class fused_conv_eltwise_kernel_selector : public kernel_selector_base + { + public: + static fused_conv_eltwise_kernel_selector &Instance() { + static fused_conv_eltwise_kernel_selector instance_; + return instance_; + } + + fused_conv_eltwise_kernel_selector(); + + virtual ~fused_conv_eltwise_kernel_selector() {} + + virtual KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; + }; +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_yxfb_yxio_b16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_yxfb_yxio_b16.cpp new file mode 100644 index 0000000..77b2093 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_yxfb_yxio_b16.cpp @@ -0,0 +1,224 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_yxfb_yxio_b16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_yxfb_yxio_b16.cpp new file mode 100644 index 0000000..77b2093 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_yxfb_yxio_b16.cpp @@ -0,0 +1,224 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "fused_conv_eltwise_kernel_yxfb_yxio_b16.h" + +namespace kernel_selector +{ + + ParamsKey fused_conv_eltwise_kernel_yxfb_yxio_b16::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputWeightsType(WeightsType::F16); + k.EnableInputWeightsType(WeightsType::F32); + k.EnableOutputDataType(Datatype::F16); + k.EnableInputLayout(DataLayout::yxfb); + k.EnableOutputLayout(DataLayout::yxfb); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBiasPerFeature(); + k.EnableNonBiasTerm(); + k.EnableBatching(); + k.EnableSplitSupport(); + k.EnableDilation(); + k.EnableSubGroup(); + k.EnableFusedConvEltwiseRWOutOpt(); + return k; + } + + std::string fused_conv_eltwise_kernel_yxfb_yxio_b16::GetKernelName(const fused_conv_eltwise_params& params) const + { + if (params.inputs[0].GetDType() == Datatype::F32) + { + return kernelName + "_fp32"; + } + else + { + return kernelName + "_fp16"; + } + } + + namespace { + // how many batches will a single work item compute + size_t GetBatchesPerWorkItem(size_t batch_size, Datatype dataType) + { + if (dataType == Datatype::F16) + { + const uint32_t min_batches_per_wi = 1; + const uint32_t min_lws = 16; + + if (batch_size % (4 * min_batches_per_wi * min_lws) == 0) + { + return 4 * min_batches_per_wi; // USE_BLOCK_READ_2 + as_half4 + } + else if (batch_size % (2 * min_batches_per_wi * min_lws) == 0) + { + return 2 * min_batches_per_wi; // USE_BLOCK_READ_1 + as_half2 + } + else + { + return min_batches_per_wi; + } + } + else + { + return 2; + } + } + + size_t GetOfmPerWorkitem(Datatype dataType) + { + if (dataType == Datatype::F16) + return 16; + return 8; + } + }
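To see what the helper above decides for FP16 (min_lws = 16, min_batches_per_wi = 1): a batch of 64 is divisible by 4 * 16, so each work item handles 4 batches and the kernel can use the widest block read; 32 only clears the 2 * 16 test, giving 2 batches per item; anything else falls back to 1. A small harness restating that logic (illustrative only):

#include <cassert>
#include <cstddef>

// Restates GetBatchesPerWorkItem's FP16 branch.
std::size_t BatchesPerWorkItemFp16(std::size_t batch_size) {
    const std::size_t min_batches_per_wi = 1, min_lws = 16;
    if (batch_size % (4 * min_batches_per_wi * min_lws) == 0) return 4; // USE_BLOCK_READ_2
    if (batch_size % (2 * min_batches_per_wi * min_lws) == 0) return 2; // USE_BLOCK_READ_1
    return min_batches_per_wi;
}

int main() {
    assert(BatchesPerWorkItemFp16(64) == 4);
    assert(BatchesPerWorkItemFp16(32) == 2);
    assert(BatchesPerWorkItemFp16(48) == 1); // 48 divides neither 64 nor 32 evenly
    return 0;
}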
fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_yxfb_yxio_b16::SetDefault(const fused_conv_eltwise_params& arg, int) const + { + DispatchData runInfo = fused_conv_eltwise_kernel_base::SetDefault(arg); + + const auto filter_ofm_num = arg.weights.OFM().v; + const auto batch_size = arg.output.Batch().v; + const uint32_t min_lws = 16; + + const size_t batchesPerWorkItem = GetBatchesPerWorkItem(batch_size, arg.inputs[0].GetDType()); + const size_t ofmPerWorkItem = GetOfmPerWorkitem(arg.inputs[0].GetDType()); + + if (arg.inputs[0].GetDType() == Datatype::F16) + { + runInfo.effiency = FORCE_PRIORITY_7; + } + else + { + runInfo.effiency = FORCE_PRIORITY_9; + } + + runInfo.lws0 = min_lws; + runInfo.gws0 = filter_ofm_num * batch_size / (ofmPerWorkItem * batchesPerWorkItem); + + return runInfo; + } + + bool fused_conv_eltwise_kernel_yxfb_yxio_b16::Validate(const Params& p, const optional_params& o) const + { + if (!fused_conv_eltwise_kernel_base::Validate(p, o)) + { + return false; + } + const convolution_params& params = static_cast<const convolution_params&>(p); + + const auto filter_ofm_num = params.weights.OFM().v; + const auto batch_size = params.output.Batch().v; + const uint32_t min_lws = 16; + + const bool bInputValidated = + (filter_ofm_num > 0) && + (batch_size > 0) && + (params.output.Feature().v == filter_ofm_num); + + if (!bInputValidated) + { + return false; + } + + if (params.inputs[0].GetDType() == Datatype::F16) + { + const uint32_t min_ofm_per_wi = 16; + const uint32_t min_batches_per_wi = 1; + + const bool bFilterOK = filter_ofm_num % min_ofm_per_wi == 0; // Number of output features divisible by the minimum number of output features processed inside a work item. + const bool bBatchOK = batch_size % (min_batches_per_wi * min_lws) == 0; // Batch size divisible by the minimum number of batches processed when the smallest local work size is used. + + if (!bFilterOK || !bBatchOK) + { + return false; + } + } + else + { + if ((filter_ofm_num * batch_size) % min_lws != 0 || + batch_size < 32) // TODO: check why it's not supported + { + return false; + } + } + + return true; + } + + JitConstants fused_conv_eltwise_kernel_yxfb_yxio_b16::GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const + { + auto jit = Parent::GetJitConstants(params, kd); + + const auto local_work_group_size = kd.lws0; + const auto batch_size = params.output.Batch().v; + + if (params.inputs[0].GetDType() == Datatype::F32) + { + // A little hack: for convolutions with a low number of input features, skipping block reads speeds things up by ~25% + // TODO: investigate why this happens + if (params.inputs[0].Feature().v > 4) + { + jit.AddConstant(MakeJitConstant("USE_BLOCK_READ_2", "")); + } + } + else + { + const auto batch_pad_before = params.output.Batch().pad.before; + const auto feature_pitch = params.output.Feature().pitch; + + if (batch_size >= 64 && (feature_pitch % 2 == 0) && (batch_pad_before % 2 == 0)) + { + jit.AddConstant(MakeJitConstant("USE_BLOCK_READ_2", "")); + } + else if (batch_size >= 32 && (feature_pitch % 2 == 0) && (batch_pad_before % 2 == 0)) + { + jit.AddConstant(MakeJitConstant("USE_BLOCK_READ_1", "")); + } + } + + const size_t batchesPerWorkItem = GetBatchesPerWorkItem(batch_size, params.inputs[0].GetDType()); + const size_t ofmPerWorkItem = GetOfmPerWorkitem(params.inputs[0].GetDType()); + + jit.AddConstants({ + MakeJitConstant("LOCAL_WORK_GROUP_SIZE", kd.lws0), + MakeJitConstant("OFM_PER_WORK_ITEM", ofmPerWorkItem), + MakeJitConstant("BATCHES_PER_WORK_ITEM", batchesPerWorkItem), // how many batches will a single work item compute + MakeJitConstant("LOCAL_WORK_GROUPS_PER_SINGLE_BATCHES_ELEMENTS", std::max(batch_size / batchesPerWorkItem / local_work_group_size, static_cast<size_t>(1))), // how many local work groups we need to compute single element for each batch + MakeJitConstant("WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS", batch_size / batchesPerWorkItem), // how many work items we need to compute single element for each batch + }); + + if (!params.eltw.stride.empty()) + { + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", params.eltw.stride[0].x)); + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", params.eltw.stride[0].y)); + } + else + { + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", 1)); + jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", 1)); + } + + return jit; + } + + KernelsData fused_conv_eltwise_kernel_yxfb_yxio_b16::GetKernelsData(const Params& params, const optional_params& options) const + { + return GetTunedKernelsDataByIndex(params, options); + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_yxfb_yxio_b16.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_yxfb_yxio_b16.h new file mode 100644 index 0000000..91d22d1 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_yxfb_yxio_b16.h @@ -0,0 +1,40 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in 
compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "fused_conv_eltwise_kernel_base.h" + +namespace kernel_selector { + + class fused_conv_eltwise_kernel_yxfb_yxio_b16 : public fused_conv_eltwise_kernel_base + { + public: + using Parent = fused_conv_eltwise_kernel_base; + fused_conv_eltwise_kernel_yxfb_yxio_b16() : fused_conv_eltwise_kernel_base("fused_conv_eltwise_gpu_yxfb_yxio_b16") {} + virtual ~fused_conv_eltwise_kernel_yxfb_yxio_b16() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; + std::vector GetSupportedWeightLayouts(const fused_conv_eltwise_params&) const override { return{ WeightsLayout::yxio }; } + std::string GetKernelName(const fused_conv_eltwise_params&) const override; + bool Validate(const Params& p, const optional_params& o) const override; + JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override; + DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.cpp new file mode 100644 index 0000000..5a9d50b --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.cpp @@ -0,0 +1,144 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "gather_kernel_ref.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector +{ + static int32_t GetGatherChannelIndex(const gather_params& params) + { + Tensor::DataChannelName name = Tensor::DataChannelName::X; + + switch (params.axis) + { + case GatherAxis::X: + return 3; + case GatherAxis::Y: + return 2; + case GatherAxis::FEATURE: + return 1; + case GatherAxis::BATCH: + return 0; + default: break; + } + + return DataTensor::Channelndex(params.output.GetLayout(), name); + } + + ParamsKey GatherKernelRef::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDifferentTypes(); + k.EnableLookUpTableIndicesFormat(Datatype::F32); + return k; + } + + static size_t getPartSize(const gather_params& params, int32_t axis) + { + size_t partSize = 1; + for (size_t i = params.inputs[0].Dimentions() - axis; i > 0; --i) + partSize *= params.inputs[0].GetDims()[i-1].v; + return partSize; + } + + static size_t getNumberOfParts(const gather_params& params, size_t partSize) + { + return params.inputs[0].LogicalSize() / partSize; + } + + static size_t getSliceSize(const gather_params& params, int32_t axis) + { + size_t numberOfItemsInSlice = 1; + for (size_t i = params.inputs[0].Dimentions() - axis - 1; i > 0; --i) + numberOfItemsInSlice *= params.inputs[0].GetDims()[i-1].v; + return numberOfItemsInSlice; + } + + CommonDispatchData GatherKernelRef::SetDefault(const gather_params& params, const optional_params&) const + { + CommonDispatchData runInfo; + + const int32_t axis = GetGatherChannelIndex(params); + + const size_t numberOfParts = params.inputs[0].LogicalSize() / getPartSize(params, axis); + + size_t gws = numberOfParts * params.inputs[1].LogicalSize(); + + const size_t vectorSize = 16; + + runInfo.gws0 = Align(gws, vectorSize); + runInfo.gws1 = 1; + runInfo.gws2 = 1; + + runInfo.lws0 = vectorSize; + runInfo.lws1 = 1; + runInfo.lws2 = 1; + + runInfo.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16; + + return runInfo; + } + + JitConstants GatherKernelRef::GetJitConstants(const gather_params& params) const + { + JitConstants jit = MakeBaseParamsJitConstants(params); + + int32_t axis = GetGatherChannelIndex(params); + size_t partSize = getPartSize(params, axis); + size_t sliceSize = getSliceSize(params, axis); + size_t numberOfParts = getNumberOfParts(params, partSize); + size_t numberOfIndexes = params.inputs[1].LogicalSize(); + + jit.AddConstant(MakeJitConstant("AXIS", axis)); + jit.AddConstant(MakeJitConstant("PART_SIZE", partSize)); + jit.AddConstant(MakeJitConstant("SLICE_SIZE", sliceSize)); + jit.AddConstant(MakeJitConstant("PARTS_NUMBER", numberOfParts)); + jit.AddConstant(MakeJitConstant("COMPUTATIONAL_OPERATIONS_NUMBER", numberOfParts * numberOfIndexes)); + + return jit; + } + + KernelsData GatherKernelRef::GetKernelsData(const Params& params, const optional_params& options) const + { + KernelData kd = KernelData::Default(params); + gather_params& newParams = *static_cast(kd.params.get()); + + assert(params.GetType() == KernelType::GATHER); + + auto runInfo = SetDefault(newParams, options); + auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); + auto cldnn_jit = GetJitConstants(newParams); + std::string jit = CreateJit(kernelName, cldnn_jit, 
entry_point); + + auto& kernel = kd.kernels[0]; + + FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, "", false, false, 2); + + kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE; + + return{ kd }; + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.h new file mode 100644 index 0000000..630cf14 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.h @@ -0,0 +1,56 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "common_kernel_base.h" + +namespace kernel_selector +{ + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // gather_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct gather_params : public base_params + { + gather_params() : base_params(KernelType::GATHER) {} + + GatherAxis axis; + + virtual ParamsKey GetParamsKey() const + { + return base_params::GetParamsKey(); + } + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // gather_optional_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct gather_optional_params : optional_params + { + gather_optional_params() : optional_params(KernelType::GATHER) {} + }; + + class GatherKernelRef : public common_kernel_base + { + public: + GatherKernelRef() : common_kernel_base("gather_ref") {} + virtual ~GatherKernelRef() {} + virtual JitConstants GetJitConstants(const gather_params& params) const; + virtual CommonDispatchData SetDefault(const gather_params& params, const optional_params&) const; + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + virtual ParamsKey GetSupportedKey() const override; + }; +}
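The part/slice arithmetic in gather_kernel_ref.cpp above decomposes the input around the gather axis: partSize is the product of dimensions from the axis inward, sliceSize the product past the axis, and numberOfParts the product of dimensions before the axis. A plain restatement with example bfyx dims (the values are illustrative):

#include <cstddef>
#include <iostream>

int main() {
    const std::size_t dims[4] = {2, 3, 4, 5}; // B, F, Y, X
    const int axis = 1;                       // GatherAxis::FEATURE
    std::size_t logicalSize = 1, partSize = 1, sliceSize = 1;
    for (int i = 0; i < 4; ++i) logicalSize *= dims[i];
    for (int i = axis; i < 4; ++i) partSize *= dims[i];       // 3*4*5 = 60
    for (int i = axis + 1; i < 4; ++i) sliceSize *= dims[i];  // 4*5  = 20
    const std::size_t numberOfParts = logicalSize / partSize; // 120/60 = 2
    std::cout << partSize << ' ' << sliceSize << ' ' << numberOfParts << '\n';
    return 0;
}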
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_selector.cpp new file mode 100644 index 0000000..3f7962a --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_selector.cpp @@ -0,0 +1,31 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "gather_kernel_selector.h" +#include "gather_kernel_ref.h" + +namespace kernel_selector { + + gather_kernel_selector::gather_kernel_selector() + { + Attach<GatherKernelRef>(); + } + + KernelsData gather_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const + { + return GetNaiveBestKernel(params, options, KernelType::GATHER); + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_selector.h new file mode 100644 index 0000000..630c1ef --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_selector.h @@ -0,0 +1,37 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector +{ + class gather_kernel_selector : public kernel_selector_base + { + public: + static gather_kernel_selector &Instance() { + static gather_kernel_selector instance_; + return instance_; + } + + gather_kernel_selector(); + + virtual ~gather_kernel_selector() {} + + virtual KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gemm/gemm_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gemm/gemm_kernel_base.cpp index 12af8a1..4d5e5d7 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gemm/gemm_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gemm/gemm_kernel_base.cpp @@ -89,7 +89,7 @@ namespace kernel_selector auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = k_data.kernels[0]; - FillCLKernelData(kernel, run_info, params.engineInfo, kernelName, jit, entry_point, ROUND_ROBIN, false, false, (uint32_t)prim_params.inputs.size()); + FillCLKernelData(kernel, run_info, params.engineInfo, kernelName, jit, entry_point, DEFAULT, false, false, (uint32_t)prim_params.inputs.size()); k_data.estimatedTime = estimated_time; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gemm/gemm_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gemm/gemm_kernel_ref.h index 8972759..8b7410a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gemm/gemm_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gemm/gemm_kernel_ref.h @@ -25,6 +25,8 @@ namespace kernel_selector GemmKernelRef() : GemmKernelBase("gemm_ref") {} KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: ParamsKey GetSupportedKey() const override; }; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.cpp index c0dc085..0aa05d3 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.cpp @@ -24,7 +24,42 @@ namespace kernel_selector { JitConstants jit = MakeBaseParamsJitConstants(params); - jit.AddConstant(MakeJitConstant(toString(params.axis), "")); + jit.AddConstant(MakeJitConstant("AXES_NUMBER", params.axes.size())); + + if (params.reverse) { + jit.AddConstant(MakeJitConstant("REVERSE", 1)); + } + + for (size_t i = 0; i < params.axes.size(); i++) + { + std::string size_name = "REVERSE_AXIS_SIZE"; + size_t size_value = 0; + if (params.axes.size() > 1) { + std::stringstream ss; + ss << "REVERSE_" << toString(params.axes[i]) << "_SIZE"; + size_name = ss.str(); + } + jit.AddConstant(MakeJitConstant(toString(params.axes[i]), "")); + if (params.reverse) { + if (params.axes[i] == IndexSelectAxis::BATCH) + { + size_value = params.inputs.at(0).Batch().v; + } + else if (params.axes[i] == IndexSelectAxis::X) + { + size_value = params.inputs.at(0).X().v; + } + else if (params.axes[i] == IndexSelectAxis::Y) + { + size_value = params.inputs.at(0).Y().v; + } + else if 
(params.axes[i] == IndexSelectAxis::FEATURE) + { + size_value = params.inputs.at(0).Feature().v; + } + } + jit.AddConstant(MakeJitConstant(size_name, size_value)); + } return jit; } @@ -32,24 +67,58 @@ namespace kernel_selector IndexSelectKernelBase::DispatchData IndexSelectKernelBase::SetDefault(const index_select_params& params) { const auto& output = params.output; - const auto& indices = params.inputs.at(1); DispatchData kd; kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16; std::vector<size_t> global; - if (params.axis == IndexSelectAxis::BATCH) - { - global = { 1, indices.X().v, output.Feature().v }; - } - else if (params.axis == IndexSelectAxis::X || params.axis == IndexSelectAxis::Y) - { - global = { output.Batch().v, indices.X().v, output.Feature().v }; + + if(params.axes.size() == 1) { + if (params.reverse) + { + if (params.axes[0] == IndexSelectAxis::BATCH) + { + global = { 1, params.inputs.at(0).Batch().v, output.Feature().v }; + } + else if (params.axes[0] == IndexSelectAxis::X) + { + global = { output.Batch().v, params.inputs.at(0).X().v, output.Feature().v }; + } + else if (params.axes[0] == IndexSelectAxis::Y) + { + global = { output.Batch().v, params.inputs.at(0).Y().v, output.Feature().v }; + } + else if (params.axes[0] == IndexSelectAxis::FEATURE) + { + global = { output.Batch().v, params.inputs.at(0).Feature().v, output.Y().v }; + } + } + else + { + const auto indices = params.inputs.at(1).X().v; + + if (params.axes[0] == IndexSelectAxis::BATCH) + { + global = { 1, indices, output.Feature().v }; + } + else if (params.axes[0] == IndexSelectAxis::X || params.axes[0] == IndexSelectAxis::Y) + { + global = { output.Batch().v, indices, output.Feature().v }; + } + else if (params.axes[0] == IndexSelectAxis::FEATURE) + { + global = { output.Batch().v, indices, output.Y().v }; + } + } } - else if(params.axis == IndexSelectAxis::FEATURE) + else { - global = { output.Batch().v, indices.X().v, output.Y().v }; + if (params.reverse) + { + global = { output.Batch().v, output.Y().v, output.Feature().v }; + } } + const auto& local = GetOptimalLocalWorkGroupSizes(global); kd.gws0 = global[0]; @@ -77,7 +146,7 @@ namespace kernel_selector auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = k_data.kernels[0]; - FillCLKernelData(kernel, run_info, params.engineInfo, kernelName, jit, entry_point, ROUND_ROBIN, false, false, (uint32_t)prim_params.inputs.size()); + FillCLKernelData(kernel, run_info, params.engineInfo, kernelName, jit, entry_point, DEFAULT, false, false, (uint32_t)prim_params.inputs.size()); k_data.estimatedTime = estimated_time; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.h index c7abe43..2142c60 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.h @@ -29,7 +29,8 @@ namespace kernel_selector : base_params(KernelType::INDEX_SELECT) {} - IndexSelectAxis axis = IndexSelectAxis::BATCH; + std::vector<IndexSelectAxis> axes = { IndexSelectAxis::BATCH }; + bool reverse = false; }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.h index 3dd1619..e3a339b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.h @@ -25,6 +25,8 @@ namespace kernel_selector IndexSelectKernelRef() : IndexSelectKernelBase("index_select_gpu_ref") {} KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: ParamsKey GetSupportedKey() const override; }; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.h index 358aa8a..555531b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.h @@ -27,6 +27,8 @@ namespace kernel_selector virtual ~LookUpTableKernelAxis() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.h index 8d33d23..45385b2 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.h @@ -27,6 +27,8 @@ namespace kernel_selector virtual ~LookUpTableKernelRef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_multiple_features.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_multiple_features.h index edaba0a..0586ce1 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_multiple_features.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_multiple_features.h @@ -26,6 +26,8 @@ namespace kernel_selector LRNKernelAcrossChannelMultipleFeatures() : LRNKernelBase("lrn_gpu_across_channel_multiple_features") {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; private: @@ -33,4 +35,4 @@ namespace kernel_selector JitConstants GetJitConstants(const lrn_params& params, DispatchData kd) const override; CommonDispatchData SetDefault(const lrn_params& params) const override; }; -} \ No newline at end of file +} diff --git 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_opt_b8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_opt_b8.h index 629cc3a..17d336c 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_opt_b8.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_opt_b8.h @@ -27,6 +27,8 @@ namespace kernel_selector virtual ~LRNKernelAcrossChannel_b8() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; private: @@ -34,4 +36,4 @@ namespace kernel_selector JitConstants GetJitConstants(const lrn_params& params, DispatchData kd) const override; CommonDispatchData SetDefault(const lrn_params& params) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_ref.h index 20146ac..7d9e775 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_ref.h @@ -27,9 +27,11 @@ namespace kernel_selector virtual ~LRNKernelAcrossChannelRef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; private: CommonDispatchData SetDefault(const lrn_params& params) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_ref.h index f15d493..9eb4d37 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_ref.h @@ -28,6 +28,8 @@ namespace kernel_selector virtual ~LRNKernelRef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; protected: diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_within_channel_byxf_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_within_channel_byxf_opt.h index f8eb027..51dc718 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_within_channel_byxf_opt.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_within_channel_byxf_opt.h @@ -28,9 +28,9 @@ namespace kernel_selector virtual ~LRNKernelWithinChannelByxfOpt() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; bool Validate(const Params&, const optional_params&) const override; virtual JitConstants GetJitConstants(const lrn_params& params, DispatchData kd) const override; virtual DispatchData SetDefault(const lrn_params& params) 
const override; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_within_channel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_within_channel_ref.h index 0545a1d..4ae8eab 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_within_channel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_within_channel_ref.h @@ -27,9 +27,11 @@ namespace kernel_selector virtual ~LRNKernelWithinChannel() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; private: CommonDispatchData SetDefault(const lrn_params& params) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_within_channel_ref_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_within_channel_ref_opt.h index 0fd00b4..ad4221e 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_within_channel_ref_opt.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_within_channel_ref_opt.h @@ -27,9 +27,11 @@ namespace kernel_selector virtual ~LRNKernelWithinChannelOpt() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; private: CommonDispatchData SetDefault(const lrn_params& params) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_elt_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_elt_kernel_base.cpp index 6170abd..26fdb93 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_elt_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_elt_kernel_base.cpp @@ -26,7 +26,11 @@ namespace kernel_selector if (params.has_cell) { const auto& cell = params.cell; - jit.AddConstants({ MakeJitConstant("CELL_TERM", true), MakeJitConstant("CELL", cell) }); + jit.AddConstants({ + MakeJitConstant("CELL_TERM", true), + MakeJitConstant("CELL", cell), + MakeJitConstant("CELL_DIRECTION", params.cell_direction) + }); } if (params.clip > 0) { std::string psclip = toCodeString(params.clip); @@ -40,6 +44,7 @@ namespace kernel_selector if (params.input_forget) { jit.AddConstants({ MakeJitConstant("INPUT_FORGET", true) }); } + jit.AddConstants({ MakeJitConstant("DIRECTION", params.direction) }); const auto& GEMMInput = params.inputs[0]; size_t size = GEMMInput.X().v / 4; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_elt_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_elt_kernel_base.h index c9082ce..c6d16e7 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_elt_kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_elt_kernel_base.h @@ -29,6 +29,8 @@ namespace kernel_selector enum order_type : int32_t { offset_iofz, // ONNX default offset_ifoz, // caffe + offset_izof, // pyTorch + offset_fizo // IE default }; lstm_elt_params() @@ 
-40,11 +42,15 @@ namespace kernel_selector order_type gate_order = offset_iofz; float clip = 0; bool input_forget = false; + uint32_t direction = 0; + uint32_t cell_direction = 0; size_t GetOffsetIndex(order_type type, size_t idx) const { static const std::map<order_type, std::vector<size_t>> offset_map { - {offset_iofz, {0, 1, 2, 3}}, - {offset_ifoz, {0, 2, 1, 3}} + {offset_iofz, { 0, 1, 2, 3}}, + {offset_ifoz, { 0, 2, 1, 3}}, + {offset_izof, { 0, 3, 1, 2}}, + {offset_fizo, { 1, 3, 0, 2}} }; return offset_map.at(type)[idx]; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_elt_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_elt_kernel_ref.h index 8213167..356d9e6 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_elt_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_elt_kernel_ref.h @@ -27,6 +27,8 @@ namespace kernel_selector virtual ~LSTMEltKernelRef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemm_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemm_kernel_base.cpp index 7030085..a684643 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemm_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemm_kernel_base.cpp @@ -31,11 +31,15 @@ namespace kernel_selector jit.AddConstants({ MakeJitConstant("BIAS", bias), MakeJitConstant("BIAS_TERM", true) }); } if (params.hasHidden) { - jit.AddConstants({ MakeJitConstant("HIDDEN", hidden), MakeJitConstant("HIDDEN_TERM", true) , MakeJitConstant("RECURRENT", recurrent) }); + jit.AddConstants({ MakeJitConstant("HIDDEN", hidden), + MakeJitConstant("HIDDEN_TERM", true), + MakeJitConstant("RECURRENT", recurrent), + MakeJitConstant("HIDDEN_DIRECTION", params.hidden_direction) + }); } - jit.AddConstants({ MakeJitConstant("WEIGHTS", weights)}); jit.AddConstants({ MakeJitConstant("DIRECTION", params.direction)}); + jit.AddConstants({ MakeJitConstant("INPUT_DIRECTION", params.input_direction)}); return jit; } @@ -51,7 +55,7 @@ namespace kernel_selector KernelData kd = KernelData::Default(params, orgParams.inputs.size()); - float effiency = FORCE_PRIORITY_1; + float effiency = FORCE_PRIORITY_9; const auto& input = orgParams.inputs[0]; auto newParams = orgParams; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemm_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemm_kernel_base.h index e766120..261b8e2 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemm_kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemm_kernel_base.h @@ -35,6 +35,8 @@ namespace kernel_selector bool hasBias = false; bool hasHidden = false; uint32_t direction = 0; + uint32_t input_direction = 0; // for bidirectional node fusion in stacked LSTMs + uint32_t hidden_direction = 0; void SetBias(const DataTensor& v) { bias = v; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemm_kernel_ref.h 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemm_kernel_ref.h index 15488ac..b382309 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemm_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemm_kernel_ref.h @@ -27,6 +27,8 @@ namespace kernel_selector virtual ~LSTMGemmKernelRef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemm_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemm_kernel_selector.cpp index 79296da..b372bb7 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemm_kernel_selector.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemm_kernel_selector.cpp @@ -16,12 +16,16 @@ #include "lstm_gemm_kernel_selector.h" #include "lstm_gemm_kernel_ref.h" +#include "lstm_gemv_gpu_subgroup1x64_bfyx_ff_SIMD16.h" +#include "lstm_gemv_gpu_subgroup1x64_bfyx_hh_SIMD16.h" namespace kernel_selector { lstm_gemm_kernel_selector::lstm_gemm_kernel_selector() { Attach<LSTMGemmKernelRef>(); + Attach<LSTMGemvKernel_subgroup1x64_bfyx_ff_SIMD16>(); + Attach<LSTMGemvKernel_subgroup1x64_bfyx_hh_SIMD16>(); } KernelsData lstm_gemm_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemv_gpu_subgroup1x64_bfyx_ff_SIMD16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemv_gpu_subgroup1x64_bfyx_ff_SIMD16.cpp new file mode 100644 index 0000000..fcea587 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemv_gpu_subgroup1x64_bfyx_ff_SIMD16.cpp @@ -0,0 +1,62 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "lstm_gemv_gpu_subgroup1x64_bfyx_ff_SIMD16.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + + ParamsKey LSTMGemvKernel_subgroup1x64_bfyx_ff_SIMD16::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::F32); + k.EnableDifferentTypes(); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableLSTMGEMMBias(); + k.EnableLSTMGEMMHidden(); + k.EnableSubGroup(); + return k; + } + + KernelsData LSTMGemvKernel_subgroup1x64_bfyx_ff_SIMD16::GetKernelsData(const Params& params, const optional_params& options) const + { + KernelsData kernelsData = GetCommonKernelsData(params, options); + auto &kernel = kernelsData[0].kernels[0]; + + // This kernel is good if + // 1) Batch size is 1 + // 2) The input size y-x size is 64x1 + const lstm_gemm_params& orgParams = static_cast(params); + const auto& input = orgParams.inputs[0]; + + if ( (input.Batch().v == 1) + && (input.X().v >= 64) + && (input.Y().v == 1)) + { + auto out = orgParams.output; + + kernel.workGroups.global = { 16, out.X().v, out.Batch().v }; + kernelsData[0].estimatedTime = FORCE_PRIORITY_1; + } + + return kernelsData; + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemv_gpu_subgroup1x64_bfyx_ff_SIMD16.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemv_gpu_subgroup1x64_bfyx_ff_SIMD16.h new file mode 100644 index 0000000..e0ee836 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemv_gpu_subgroup1x64_bfyx_ff_SIMD16.h @@ -0,0 +1,32 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#pragma once + +#include "lstm_gemm_kernel_base.h" + +namespace kernel_selector +{ + class LSTMGemvKernel_subgroup1x64_bfyx_ff_SIMD16 : public LSTMGemmKernelBase + { + public: + LSTMGemvKernel_subgroup1x64_bfyx_ff_SIMD16() : LSTMGemmKernelBase("lstm_gemv_gpu_subgroup1x64_bfyx_ff_SIMD16") {} + virtual ~LSTMGemvKernel_subgroup1x64_bfyx_ff_SIMD16() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + virtual ParamsKey GetSupportedKey() const override; + }; +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemv_gpu_subgroup1x64_bfyx_hh_SIMD16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemv_gpu_subgroup1x64_bfyx_hh_SIMD16.cpp new file mode 100644 index 0000000..7d34a10 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemv_gpu_subgroup1x64_bfyx_hh_SIMD16.cpp @@ -0,0 +1,62 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "lstm_gemv_gpu_subgroup1x64_bfyx_hh_SIMD16.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + + ParamsKey LSTMGemvKernel_subgroup1x64_bfyx_hh_SIMD16::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F16); + k.EnableDifferentTypes(); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableLSTMGEMMBias(); + k.EnableLSTMGEMMHidden(); + k.EnableSubGroup(); + return k; + } + + KernelsData LSTMGemvKernel_subgroup1x64_bfyx_hh_SIMD16::GetKernelsData(const Params& params, const optional_params& options) const + { + KernelsData kernelsData = GetCommonKernelsData(params, options); + auto &kernel = kernelsData[0].kernels[0]; + + // This kernel is preferred when + // 1) the batch size is 1, and + // 2) the input y-x size is at least 64x1 (X >= 64, Y == 1) + const lstm_gemm_params& orgParams = static_cast<const lstm_gemm_params&>(params); + const auto& input = orgParams.inputs[0]; + + if ( (input.Batch().v == 1) + && (input.X().v >= 64) + && (input.Y().v == 1)) + { + auto out = orgParams.output; + + kernel.workGroups.global = { 16, out.X().v, out.Batch().v }; + kernelsData[0].estimatedTime = FORCE_PRIORITY_1; + } + + return kernelsData; + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemv_gpu_subgroup1x64_bfyx_hh_SIMD16.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemv_gpu_subgroup1x64_bfyx_hh_SIMD16.h new file mode 100644 index 0000000..c315a41 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm/lstm_gemv_gpu_subgroup1x64_bfyx_hh_SIMD16.h @@ -0,0 +1,32 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this 
file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "lstm_gemm_kernel_base.h" + +namespace kernel_selector +{ + class LSTMGemvKernel_subgroup1x64_bfyx_hh_SIMD16 : public LSTMGemmKernelBase + { + public: + LSTMGemvKernel_subgroup1x64_bfyx_hh_SIMD16() : LSTMGemmKernelBase("lstm_gemv_gpu_subgroup1x64_bfyx_hh_SIMD16") {} + virtual ~LSTMGemvKernel_subgroup1x64_bfyx_hh_SIMD16() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + virtual ParamsKey GetSupportedKey() const override; + }; +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/max_unpooling/max_unpooling_kernel_gpu_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/max_unpooling/max_unpooling_kernel_gpu_ref.h index eae5976..e0ba99b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/max_unpooling/max_unpooling_kernel_gpu_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/max_unpooling/max_unpooling_kernel_gpu_ref.h @@ -27,6 +27,8 @@ namespace kernel_selector virtual ~MaxUnpoolingKernelGPURef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_bfyx_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_bfyx_opt.h index 9127187..bd9c3fe 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_bfyx_opt.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_bfyx_opt.h @@ -27,9 +27,11 @@ namespace kernel_selector virtual ~MVNKernelBfyxOpt() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; using Parent = MVNKernelBase; + protected: + virtual ParamsKey GetSupportedKey() const override; + private: DispatchData SetDefault(const mvn_params& params) const override; JitConstants GetJitConstants(const mvn_params& params, MVNKernelBase::DispatchData kd) const override; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_ref.h index 9a88c8d..cd0a4fb 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_ref.h @@ -27,9 +27,9 @@ namespace kernel_selector virtual ~MVNKernelRef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; std::string GetKernelName(const mvn_params&) const 
override; }; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/normalize/normalize_kernel_across_spatial_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/normalize/normalize_kernel_across_spatial_ref.h index cc202b7..b7243f7 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/normalize/normalize_kernel_across_spatial_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/normalize/normalize_kernel_across_spatial_ref.h @@ -27,6 +27,8 @@ namespace kernel_selector virtual ~NormalizeKernelAcrossSpatialRef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/normalize/normalize_kernel_within_spatial_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/normalize/normalize_kernel_within_spatial_ref.h index 20f0860..fd4d36a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/normalize/normalize_kernel_within_spatial_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/normalize/normalize_kernel_within_spatial_ref.h @@ -27,6 +27,8 @@ namespace kernel_selector virtual ~NormalizeKernelWithinSpatialRef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_base.cpp new file mode 100644 index 0000000..c36456c --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_base.cpp @@ -0,0 +1,76 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
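The header diffs above all apply one mechanical refactor: GetSupportedKey() moves out of the public interface, so only the selection machinery reaches it through the base class. A minimal sketch of the before/after pattern, with placeholder names (SomeKernelRef and SomeKernelBase are illustrative, not files from this patch):

    class SomeKernelRef : public SomeKernelBase
    {
    public:
        KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;

    protected:
        // Previously declared under public:. Callers can no longer query support
        // directly; they must go through the kernel selector instead.
        ParamsKey GetSupportedKey() const override;
    };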
+ + +#include "one_hot_kernel_base.h" + +#include "kernel_selector_utils.h" + + +namespace kernel_selector +{ + JitConstants OneHotKernelBase::GetJitConstants(const one_hot_params& params) + { + JitConstants jit = MakeBaseParamsJitConstants(params); + + jit.AddConstants({ + MakeJitConstant("ONE_HOT_AXIS", params.one_hot_axis), + MakeJitConstant("ONE_HOT_LIMIT", params.one_hot_limit) + }); + + return jit; + } + + OneHotKernelBase::DispatchData OneHotKernelBase::SetDefault(const one_hot_params& params) + { + const auto& input = params.inputs[0]; + + DispatchData kd; + + kd.fp16UnitUsed = input.GetDType() == Datatype::F16; + + std::vector global{ input.Feature().v, input.Y().v, input.X().v }; + const auto& local = GetOptimalLocalWorkGroupSizes(global); + + kd.gws0 = global[0]; + kd.gws1 = global[1]; + kd.gws2 = global[2]; + + kd.lws0 = local[0]; + kd.lws1 = local[1]; + kd.lws2 = local[2]; + + return kd; + } + + KernelsData OneHotKernelBase::GetCommonKernelsData(const Params& params, const optional_params& options, float estimated_time) const + { + assert(params.GetType() == KernelType::ONE_HOT); + + const auto& prim_params = static_cast(params); // NOLINT(cppcoreguidelines-pro-type-static-cast-downcast) + + auto run_info = SetDefault(prim_params); + KernelData k_data = KernelData::Default(params); + + auto cldnn_jit = GetJitConstants(prim_params); + auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, options); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); + + auto& kernel = k_data.kernels[0]; + FillCLKernelData(kernel, run_info, params.engineInfo, kernelName, jit, entry_point); + k_data.estimatedTime = estimated_time; + + return{ k_data }; + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_base.h new file mode 100644 index 0000000..ab387ea --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_base.h @@ -0,0 +1,63 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "common_kernel_base.h" +#include "kernel_selector_params.h" + + +namespace kernel_selector +{ + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // one_hot_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct one_hot_params : public base_params + { + one_hot_params() + : base_params(KernelType::ONE_HOT) + { + } + uint16_t one_hot_axis; + int32_t one_hot_limit; + + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // one_hot_optional_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct one_hot_optional_params : optional_params + { + one_hot_optional_params() + : optional_params(KernelType::ONE_HOT) + { + } + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // OneHotKernelBase + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + class OneHotKernelBase : public common_kernel_base + { + public: + using common_kernel_base::common_kernel_base; + + using DispatchData = CommonDispatchData; + + protected: + static JitConstants GetJitConstants(const one_hot_params& params); + static DispatchData SetDefault(const one_hot_params& params); + KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimated_time) const; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_ref.cpp new file mode 100644 index 0000000..712422e --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_ref.cpp @@ -0,0 +1,49 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
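The params/optional-params pair defined above is what a caller hands to the selector. A hedged usage sketch (tensor and engine setup is elided, since it comes from the surrounding clDNN code; the field values are illustrative):

    using namespace kernel_selector;

    one_hot_params params;     // KernelType::ONE_HOT is set by the constructor
    params.one_hot_axis = 1;   // e.g. expand along the feature axis
    params.one_hot_limit = 10; // number of classes
    // params.inputs / params.output / params.engineInfo filled in by the caller

    one_hot_optional_params options;
    KernelsData best = one_hot_kernel_selector::Instance().GetBestKernels(params, options);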
+ +#include "one_hot_kernel_ref.h" + + +namespace kernel_selector +{ + ParamsKey OneHotKernelRef::GetSupportedKey() const + { + ParamsKey k; + + k.EnableInputDataType(Datatype::INT8); + k.EnableInputDataType(Datatype::UINT8); + k.EnableInputDataType(Datatype::INT32); + k.EnableInputDataType(Datatype::INT64); + + k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::INT32); + k.EnableOutputDataType(Datatype::INT64); + + k.EnableInputLayout(DataLayout::bfyx); + + k.EnableOutputLayout(DataLayout::bfyx); + + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + + return k; + } + + KernelsData OneHotKernelRef::GetKernelsData(const Params& params, const optional_params& options) const + { + return GetCommonKernelsData(params, options, FORCE_PRIORITY_9); + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_ref.h new file mode 100644 index 0000000..972b7ae --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_ref.h @@ -0,0 +1,32 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "one_hot_kernel_base.h" + + +namespace kernel_selector +{ + class OneHotKernelRef : public OneHotKernelBase + { + public: + OneHotKernelRef() : OneHotKernelBase("one_hot_ref") {} + + KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: + ParamsKey GetSupportedKey() const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_selector.cpp new file mode 100644 index 0000000..230dd32 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_selector.cpp @@ -0,0 +1,30 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +#include "one_hot_kernel_selector.h" +#include "one_hot_kernel_ref.h" + +namespace kernel_selector +{ + one_hot_kernel_selector::one_hot_kernel_selector() + { + Attach(); + } + + KernelsData one_hot_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const + { + return GetNaiveBestKernel(params, options, KernelType::ONE_HOT); + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_selector.h new file mode 100644 index 0000000..79c8c34 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_selector.h @@ -0,0 +1,34 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "kernel_selector.h" + + +namespace kernel_selector +{ + class one_hot_kernel_selector : public kernel_selector_base + { + public: + static one_hot_kernel_selector &Instance() { + static one_hot_kernel_selector instance; + return instance; + } + + one_hot_kernel_selector(); + + KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.cpp index ca69779..14a9c95 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -42,12 +42,8 @@ namespace kernel_selector inline JitConstants MakePermuteJitConstants(const permute_params& params) { - JitConstants jit = MakeBaseParamsJitConstants(params); - - jit.AddConstants({ - MakeJitConstant("PERMUTE_ORDER", params.order) - }); - + JitConstants jit = MakeBaseParamsJitConstants(params); + jit.AddConstant(MakeJitConstant("PERMUTE_ORDER", params.order)); return jit; } @@ -65,24 +61,14 @@ namespace kernel_selector const auto& in = newParams.inputs[0]; auto& kernel = kd.kernels[0]; - std::vector<size_t> gws; - for (const auto& o : in.GetDims()) - { - gws.push_back(o.v); - } - - for (size_t i = gws.size(); i < 4; i++) - { - gws.push_back(1U); - } - kernel.workGroups.global = { gws[0], gws[1], gws[2] * gws[3] }; + kernel.workGroups.global = { in.Y().v, in.X().v, in.Feature().v * in.Batch().v}; kernel.workGroups.local = GetOptimalLocalWorkGroupSizes(kernel.workGroups.global); - kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, ROUND_ROBIN); + kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT); kernel.arguments = GetArgsDesc(1, false, false); kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE; return{ kd }; } -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.h index 978717c..83e4e8b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.h @@ -53,6 +53,8 @@ namespace kernel_selector virtual ~PermuteKernelRef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp index 29822a9..aa17455 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp @@ -41,7 +41,7 @@ namespace kernel_selector const pooling_params& params = static_cast<const pooling_params&>(p); - if (params.activationFunc != ActivationFunction::NONE) + if (params.activation.function != ActivationFunction::NONE) { return{}; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h index 5c46d65..b343d10 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h @@ -27,10 +27,10 @@ namespace kernel_selector virtual ~PoolingKernelGPUAverageOpt() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; bool 
Validate(const Params&, const optional_params&) const override; JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override; DispatchData SetDefault(const pooling_params& params) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp new file mode 100644 index 0000000..3a50ee3 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp @@ -0,0 +1,77 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "pooling_kernel_gpu_b_fs_yx_fsv4.h" + +namespace kernel_selector +{ + ParamsKey PoolingKerneGPU_b_fs_yx_fsv4::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::INT8); + k.EnableInputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableInputLayout(DataLayout::b_fs_yx_fsv4); + k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4); + k.EnableOutputLayout(DataLayout::bfyx); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnablePoolType(PoolType::MAX); + k.EnablePoolType(PoolType::AVG); + k.EnablePoolRemainder(PoolRemainder::FLOOR); + k.EnablePoolRemainder(PoolRemainder::CEIL); + k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED); + k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC); + k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC_WITH_PADDING); + k.EnableDifferentTypes(); + return k; + } + + PoolingKernelBase::DispatchData PoolingKerneGPU_b_fs_yx_fsv4::SetDefault(const pooling_params& params) const + { + DispatchData runInfo = PoolingKernelBase::SetDefault(params); + + runInfo.gws0 = params.output.X().v; // X + runInfo.gws1 = params.output.Y().v; // Y + // b_fs_yx_fsv4 format: each work item processes 4 features + runInfo.gws2 = (params.output.Feature().v * params.output.Batch().v) / 4; + + runInfo.lws0 = 1; + runInfo.lws1 = 1; + runInfo.lws2 = 1; + + return runInfo; + } + + JitConstants PoolingKerneGPU_b_fs_yx_fsv4::GetJitConstants(const pooling_params& params, DispatchData kd) const + { + auto jit = PoolingKernelBase::GetJitConstants(params, kd); + + const size_t in_x_pitch = 4; + const size_t in_y_pitch = 4 * params.inputs[0].X().LogicalDimPadded(); + jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch)); + jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch)); + + return jit; + } + + KernelsData PoolingKerneGPU_b_fs_yx_fsv4::GetKernelsData(const Params& params, const optional_params& options) const + { + return GetCommonKernelsData(params, options, FORCE_PRIORITY_1); + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.h 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.h new file mode 100644 index 0000000..43d1f8a --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.h @@ -0,0 +1,36 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "pooling_kernel_base.h" + +namespace kernel_selector +{ + class PoolingKerneGPU_b_fs_yx_fsv4 : public PoolingKernelBase + { + public: + PoolingKerneGPU_b_fs_yx_fsv4() : PoolingKernelBase("pooling_gpu_b_fs_yx_fsv4") {} + virtual ~PoolingKerneGPU_b_fs_yx_fsv4() {} + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + DispatchData SetDefault(const pooling_params& params) const override; + protected: + virtual ParamsKey GetSupportedKey() const override; + JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override; + + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.h index b9831b9..2dddbf5 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.h @@ -27,10 +27,10 @@ namespace kernel_selector virtual ~PoolingKernelGPUBfyxBlockOpt() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; bool Validate(const Params&, const optional_params&) const override; JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override; DispatchData SetDefault(const pooling_params& params) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.h index c515282..a250495 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.h @@ -27,7 +27,9 @@ namespace kernel_selector virtual ~PoolingKerneGPU_byxf_af32() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; DispatchData SetDefault(const pooling_params& params) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline 
at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.h index 9b3ad11..7537280 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.h @@ -25,12 +25,12 @@ namespace kernel_selector public: PoolingKernelGPUByxfOpt() : PoolingKernelBase("pooling_gpu_byxf_opt") {} virtual ~PoolingKernelGPUByxfOpt() {} - virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; + protected: + virtual ParamsKey GetSupportedKey() const override; bool Validate(const Params&, const optional_params&) const override; JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override; DispatchData SetDefault(const pooling_params& params) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.h index eb0f0d1..dfe6ddc 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.h @@ -27,10 +27,10 @@ namespace kernel_selector virtual ~PoolingKernelGPUByxfPaddingOpt() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; bool Validate(const Params&, const optional_params&) const override; JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override; DispatchData SetDefault(const pooling_params& params) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h index efb5c67..034392d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h @@ -27,10 +27,10 @@ namespace kernel_selector virtual ~PoolingKerneGPU_fs_bs_yx_bsv4_fsv32() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; DispatchData SetDefault(const pooling_params& params) const override; protected: + virtual ParamsKey GetSupportedKey() const override; JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.h 
index ec05c08..b50fad8 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.h @@ -27,6 +27,8 @@ namespace kernel_selector virtual ~PoolingKernelGPUInt8Ref() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.h index 9bfd687..3d39e99 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.h @@ -27,6 +27,8 @@ namespace kernel_selector virtual ~PoolingKernelGPURef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp index 91ec4d2..6538212 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include "pooling_kernel_gpu_byxf_af32.h" #include "pooling_kernel_gpu_int8_ref.h" #include "pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h" +#include "pooling_kernel_gpu_b_fs_yx_fsv4.h" namespace kernel_selector { @@ -36,6 +37,7 @@ namespace kernel_selector { Attach(); Attach(); Attach(); + Attach(); } KernelsData pooling_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_base.cpp new file mode 100644 index 0000000..c7bbdbb --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_base.cpp @@ -0,0 +1,67 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
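A note on the recurring header change in the hunks above: GetSupportedKey() moves out of each kernel's public interface into a protected section, so the capability key is consulted only by the kernel-selector machinery and no longer by arbitrary callers. A minimal before/after sketch of the pattern (the class and base names here are illustrative placeholders, not taken from the patch):

    // Before: the capability key leaks into the public API.
    class SomeKernel : public SomeKernelBase {
    public:
        KernelsData GetKernelsData(const Params&, const optional_params&) const override;
        ParamsKey GetSupportedKey() const override;
    };

    // After: callers keep GetKernelsData(); the key becomes an implementation
    // detail reachable only through the base class and the selector.
    class SomeKernel : public SomeKernelBase {
    public:
        KernelsData GetKernelsData(const Params&, const optional_params&) const override;
    protected:
        ParamsKey GetSupportedKey() const override;
    };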
+ +#include "pyramid_roi_align_kernel_base.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + + JitConstants PyramidROIAlignKernelBase::GetJitConstants(const PyramidROIAlign_params& params) + { + JitConstants jit = MakeBaseParamsJitConstants(params); + return jit; + } + + PyramidROIAlignKernelBase::DispatchData PyramidROIAlignKernelBase::SetDefault(const PyramidROIAlign_params& params) + { + const auto& boxes = params.inputs.at(0); + DispatchData kd; + + kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16; + + std::vector global; + global = { boxes.Y().v, 1, 1 }; + + const auto& local = GetOptimalLocalWorkGroupSizes(global); + + kd.gws0 = global[0]; + kd.gws1 = global[1]; + kd.gws2 = global[2]; + + kd.lws0 = local[0]; + kd.lws1 = local[1]; + kd.lws2 = local[2]; + + return kd; + } + + KernelsData PyramidROIAlignKernelBase::GetCommonKernelsData(const Params& params, const optional_params& options, float estimated_time) const + { + assert(params.GetType() == KernelType::PYRAMID_ROI_ALIGN); + + const auto& prim_params = static_cast(params); // NOLINT(cppcoreguidelines-pro-type-static-cast-downcast) + auto run_info = SetDefault(prim_params); + KernelData k_data = KernelData::Default(params); + auto cldnn_jit = GetJitConstants(prim_params); + auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, options); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); + + auto& kernel = k_data.kernels[0]; + FillCLKernelData(kernel, run_info, params.engineInfo, kernelName, jit, entry_point, "", false, false, (uint32_t)prim_params.inputs.size()); + + k_data.estimatedTime = estimated_time; + + return { k_data }; + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_base.h new file mode 100644 index 0000000..1d7a0f3 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_base.h @@ -0,0 +1,57 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "common_kernel_base.h" +#include "kernel_selector_params.h" + +namespace kernel_selector { + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // PyramidROIAlign_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct PyramidROIAlign_params : public base_params + { + PyramidROIAlign_params() + : base_params(KernelType::PYRAMID_ROI_ALIGN) + {} + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // PyramidROIAlign_optional_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct PyramidROIAlign_optional_params : optional_params + { + PyramidROIAlign_optional_params() + : optional_params(KernelType::PYRAMID_ROI_ALIGN) + {} + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // PyramidROIAlignKernelBase + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + class PyramidROIAlignKernelBase : public common_kernel_base + { + public: + using common_kernel_base::common_kernel_base; + virtual ~PyramidROIAlignKernelBase() {} + + using DispatchData = CommonDispatchData; + + protected: + static JitConstants GetJitConstants(const PyramidROIAlign_params& params); + static DispatchData SetDefault(const PyramidROIAlign_params& params); + KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimated_time) const; + }; +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_ref.cpp new file mode 100644 index 0000000..4de5ec1 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_ref.cpp @@ -0,0 +1,40 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "pyramid_roi_align_kernel_ref.h" + +namespace kernel_selector { + ParamsKey PyramidROIAlignKernelRef::GetSupportedKey() const + { + ParamsKey k; + + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + + k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::F16); + + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + k.EnableBatching(); + k.EnableDifferentTypes(); + + return k; + } + + KernelsData PyramidROIAlignKernelRef::GetKernelsData(const Params& params, const optional_params& options) const + { + return GetCommonKernelsData(params, options, FORCE_PRIORITY_9); + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_ref.h new file mode 100644 index 0000000..8194d88 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_ref.h @@ -0,0 +1,29 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pyramid_roi_align_kernel_base.h" + +namespace kernel_selector { + class PyramidROIAlignKernelRef : public PyramidROIAlignKernelBase + { + public: + PyramidROIAlignKernelRef() : PyramidROIAlignKernelBase("pyramid_roi_align_gpu_ref") {} + KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: + ParamsKey GetSupportedKey() const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_selector.cpp new file mode 100644 index 0000000..90e5912 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_selector.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "pyramid_roi_align_kernel_selector.h" +#include "pyramid_roi_align_kernel_ref.h" + +namespace kernel_selector { + PyramidROIAlign_kernel_selector::PyramidROIAlign_kernel_selector() + { + Attach(); + } + + KernelsData PyramidROIAlign_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const + { + return GetNaiveBestKernel(params, options, KernelType::PYRAMID_ROI_ALIGN); + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_selector.h new file mode 100644 index 0000000..82c4f01 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_selector.h @@ -0,0 +1,31 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector { + class PyramidROIAlign_kernel_selector : public kernel_selector_base + { + public: + static PyramidROIAlign_kernel_selector &Instance() { + static PyramidROIAlign_kernel_selector instance; + return instance; + } + + PyramidROIAlign_kernel_selector(); + KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; + }; +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/region_yolo/region_yolo_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/region_yolo/region_yolo_kernel_ref.h index 53eb762..27fae9b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/region_yolo/region_yolo_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/region_yolo/region_yolo_kernel_ref.h @@ -60,10 +60,10 @@ namespace kernel_selector using DispatchData = CommonDispatchData; virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; virtual JitConstants GetJitConstants(const region_yolo_params& params) const; }; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_from_winograd_2x3_kernel.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_from_winograd_2x3_kernel.h index a3081de..79913b2 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_from_winograd_2x3_kernel.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_from_winograd_2x3_kernel.h @@ -26,8 +26,10 @@ namespace kernel_selector ReorderFromWinograd2x3Kernel() : ReorderKernelBase("reorder_from_winograd_2x3_s1") {} virtual KernelsData GetKernelsData(const 
Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; virtual JitConstants GetJitConstants(const reorder_params& params) const override; virtual DispatchData SetDefault(const reorder_params& arg) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel.h index 08d78f4..88a6bde 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel.h @@ -27,7 +27,9 @@ namespace kernel_selector virtual ~ReorderKernelRef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; virtual JitConstants GetJitConstants(const reorder_params& params) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; }; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_base.cpp index 867a3c8..8d0edcb 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_base.cpp @@ -26,6 +26,8 @@ namespace kernel_selector switch (l) { case WeightsLayout::os_iyx_osv16: + case WeightsLayout::os_iyx_osv32: + case WeightsLayout::os_iyx_osv64: case WeightsLayout::os_iyx_osv16_rotate_180: case WeightsLayout::os_i_osv16: case WeightsLayout::os_i_osv16__ai8: diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.cpp new file mode 100644 index 0000000..e3562ea --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.cpp @@ -0,0 +1,83 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "reorder_kernel_byxf_f32_to_byx8_f4_i8.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector +{ + ParamsKey reorder_kernel_byxf_f32_to_byx8_f4_i8::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::INT8); + k.EnableDifferentTypes(); + k.EnableInputLayout(DataLayout::byxf); + k.EnableOutputLayout(DataLayout::byx8_f4); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + return k; + } + + bool reorder_kernel_byxf_f32_to_byx8_f4_i8::Validate(const Params& p, const optional_params& o) const + { + if (!ReorderKernelBase::Validate(p, o)) + { + return false; + } + + const reorder_params& params = static_cast(p); + + if (params.output.X().v % 16 != 0) + return false; + + if (params.inputs[0].Feature().v != 3) + return false; + + return true; + } + + reorder_kernel_byxf_f32_to_byx8_f4_i8::DispatchData reorder_kernel_byxf_f32_to_byx8_f4_i8::SetDefault(const reorder_params& params) const + { + DispatchData kd; + + const auto& input = params.inputs[0]; + + kd.gws0 = input.X().v; + kd.gws1 = input.Y().v; + kd.gws2 = input.Batch().v; + + kd.lws0 = 16; + kd.lws1 = 1; + kd.lws2 = 1; + + return kd; + } + + JitConstants reorder_kernel_byxf_f32_to_byx8_f4_i8::GetJitConstants(const reorder_params& params) const + { + auto jit = ReorderKernelBase::GetJitConstants(params); + jit.Merge(GetTensorFriendlyWorkGroupsJit(params.inputs[0])); + return jit; + } + + KernelsData reorder_kernel_byxf_f32_to_byx8_f4_i8::GetKernelsData(const Params& params, const optional_params& options) const + { + const reorder_params& orgParams = static_cast(params); + return GetCommonKernelsData(orgParams, options, FORCE_PRIORITY_5); + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.h new file mode 100644 index 0000000..1a8882d --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_byxf_f32_to_byx8_f4_i8.h @@ -0,0 +1,37 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#pragma once + +#include "reorder_kernel_base.h" + +namespace kernel_selector +{ + class reorder_kernel_byxf_f32_to_byx8_f4_i8 : public ReorderKernelBase + { + public: + reorder_kernel_byxf_f32_to_byx8_f4_i8() : ReorderKernelBase("reorder_data_byxf_f32_to_byx8_f4_i8") {} + virtual ~reorder_kernel_byxf_f32_to_byx8_f4_i8() {} + + virtual bool Validate(const Params& p, const optional_params& o) const override; + virtual DispatchData SetDefault(const reorder_params& params) const override; + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + virtual JitConstants GetJitConstants(const reorder_params& params) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_fast_b1.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_fast_b1.h index ea1a828..4a6105f 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_fast_b1.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_fast_b1.h @@ -26,8 +26,10 @@ namespace kernel_selector ReorderKernelFastBatch1() : ReorderKernelBase("reorder_data_fast_b1") {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; virtual JitConstants GetJitConstants(const reorder_params& params) const override; virtual DispatchData SetDefault(const reorder_params& arg) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_selector.cpp index db2b538..0cad960 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_selector.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_selector.cpp @@ -20,6 +20,7 @@ #include "reorder_from_winograd_2x3_kernel.h" #include "reorder_to_winograd_2x3_kernel.h" #include "reorder_kernel_to_yxfb_batched.h" +#include "reorder_kernel_byxf_f32_to_byx8_f4_i8.h" namespace kernel_selector { @@ -30,6 +31,7 @@ namespace kernel_selector { Attach(); Attach(); Attach(); + //Attach<reorder_kernel_byxf_f32_to_byx8_f4_i8>(); // Slower than default!
} KernelsData reorder_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_to_yxfb_batched.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_to_yxfb_batched.h index 8bea6ef..82dd844 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_to_yxfb_batched.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_to_yxfb_batched.h @@ -24,12 +24,12 @@ namespace kernel_selector { public: ReorderKernel_to_yxfb_batched() : ReorderKernelBase("reorder_data_to_yxfb_batched") {} - virtual ParamsKey GetSupportedKey() const override; virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; protected: + virtual ParamsKey GetSupportedKey() const override; virtual JitConstants GetJitConstants(const reorder_params& params) const override; virtual DispatchData SetDefault(const reorder_params& arg) const override; bool Validate(const Params& p, const optional_params& o) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_to_winograd_2x3_kernel.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_to_winograd_2x3_kernel.h index 1c07f9e..1a43824 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_to_winograd_2x3_kernel.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_to_winograd_2x3_kernel.h @@ -26,8 +26,10 @@ namespace kernel_selector ReorderToWinograd2x3Kernel() : ReorderKernelBase("reorder_to_winograd_2x3_s1") {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; virtual JitConstants GetJitConstants(const reorder_params& params) const override; virtual DispatchData SetDefault(const reorder_params& arg) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_image_fyx_b_kernel.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_image_fyx_b_kernel.h index a3c021d..22de3a4 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_image_fyx_b_kernel.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_image_fyx_b_kernel.h @@ -26,7 +26,9 @@ namespace kernel_selector ReorderWeightsImage_fyx_b_Kernel() : ReorderKernelBase("reorder_weights_image_2d_c4_fyx_b") {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; virtual DispatchData SetDefault(const reorder_weights_params& arg) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_image_winograd_6x3_kernel.h 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_image_winograd_6x3_kernel.h index 6cb1c84..48940da 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_image_winograd_6x3_kernel.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_image_winograd_6x3_kernel.h @@ -26,7 +26,9 @@ namespace kernel_selector ReorderWeightsImageWinograd6x3Kernel() : ReorderKernelBase("reorder_weights_image_winograd_6x3_s1") {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; virtual DispatchData SetDefault(const reorder_weights_params& arg) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_kernel.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_kernel.h index 635b346..f769c11 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_kernel.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_kernel.h @@ -27,6 +27,8 @@ namespace kernel_selector virtual ~ReorderWeightsKernel() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_winograd_2x3_kernel.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_winograd_2x3_kernel.h index c77354a..6e4b75d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_winograd_2x3_kernel.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_winograd_2x3_kernel.h @@ -26,7 +26,9 @@ namespace kernel_selector ReorderWeightsWinograd2x3Kernel() : ReorderKernelBase("reorder_weights_winograd_2x3_s1") {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; virtual DispatchData SetDefault(const reorder_weights_params& arg) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_winograd_6x3_kernel.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_winograd_6x3_kernel.h index 51f86fe..9ffd3ab 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_winograd_6x3_kernel.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_weights_winograd_6x3_kernel.h @@ -26,7 +26,9 @@ namespace kernel_selector ReorderWeightsWinograd6x3Kernel() : ReorderKernelBase("reorder_weights_winograd_6x3_s1") {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; virtual DispatchData SetDefault(const reorder_weights_params& arg) 
const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorg_yolo/reorg_yolo_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorg_yolo/reorg_yolo_kernel_ref.h index 9f5b6db..05c6fd4 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorg_yolo/reorg_yolo_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorg_yolo/reorg_yolo_kernel_ref.h @@ -56,10 +56,9 @@ namespace kernel_selector using DispatchData = CommonDispatchData; virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; - protected: + virtual ParamsKey GetSupportedKey() const override; virtual JitConstants GetJitConstants(const reorg_yolo_params& params) const; }; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reshape/reshape_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reshape/reshape_kernel_ref.cpp index 9c9c760..e0efa9c 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reshape/reshape_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reshape/reshape_kernel_ref.cpp @@ -66,7 +66,7 @@ namespace kernel_selector kernel.workGroups.global = { gws[0], gws[1], gws[2] * gws[3] }; kernel.workGroups.local = GetOptimalLocalWorkGroupSizes(kernel.workGroups.global); - kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, ROUND_ROBIN); + kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT); kernel.arguments = GetArgsDesc(1, false, false); kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reshape/reshape_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reshape/reshape_kernel_ref.h index 86595fc..b4d8757 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reshape/reshape_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reshape/reshape_kernel_ref.h @@ -48,6 +48,8 @@ namespace kernel_selector virtual ~ReshapeKernelRef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.cpp new file mode 100644 index 0000000..7a12119 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.cpp @@ -0,0 +1,87 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "reverse_sequence_kernel_ref.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector +{ + ParamsKey ReverseSequenceKernelRef::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDifferentTypes(); + return k; + } + + CommonDispatchData ReverseSequenceKernelRef::SetDefault(const reverse_sequence_params& params, const optional_params&) const + { + CommonDispatchData runInfo; + + std::vector<size_t> global = { params.output.Batch().v, params.output.Feature().v, params.output.Y().v * params.output.X().v }; + + auto local = GetOptimalLocalWorkGroupSizes(global); + + runInfo.gws0 = global[0]; + runInfo.gws1 = global[1]; + runInfo.gws2 = global[2]; + + runInfo.lws0 = local[0]; + runInfo.lws1 = local[1]; + runInfo.lws2 = local[2]; + + return runInfo; + } + + JitConstants ReverseSequenceKernelRef::GetJitConstants(const reverse_sequence_params& params) const + { + JitConstants jit = MakeBaseParamsJitConstants(params); + + jit.AddConstant(MakeJitConstant("SEQ_AXIS", params.seq_axis)); + jit.AddConstant(MakeJitConstant("BATCH_AXIS", params.batch_axis)); + + return jit; + } + + KernelsData ReverseSequenceKernelRef::GetKernelsData(const Params& params, const optional_params& options) const + { + KernelData kd = KernelData::Default<reverse_sequence_params>(params); + reverse_sequence_params& newParams = *static_cast<reverse_sequence_params*>(kd.params.get()); + + assert(params.GetType() == KernelType::REVERSE_SEQUENCE); + + auto runInfo = SetDefault(newParams, options); + auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); + auto cldnn_jit = GetJitConstants(newParams); + std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + + auto& kernel = kd.kernels[0]; + + FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, "", false, false, 2); + + kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE; + + return{ kd }; + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.h new file mode 100644 index 0000000..c12a5f9 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.h @@ -0,0 +1,57 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "common_kernel_base.h" + +namespace kernel_selector +{ + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // reverse_sequence_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct reverse_sequence_params : public base_params + { + reverse_sequence_params() : base_params(KernelType::REVERSE_SEQUENCE) {} + + int32_t seq_axis; + int32_t batch_axis; + + virtual ParamsKey GetParamsKey() const + { + return base_params::GetParamsKey(); + } + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // reverse_sequence_optional_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct reverse_sequence_optional_params : optional_params + { + reverse_sequence_optional_params() : optional_params(KernelType::REVERSE_SEQUENCE) {} + }; + + class ReverseSequenceKernelRef : public common_kernel_base + { + public: + ReverseSequenceKernelRef() : common_kernel_base("reverse_sequence_ref") {} + virtual ~ReverseSequenceKernelRef() {} + virtual JitConstants GetJitConstants(const reverse_sequence_params& params) const; + virtual CommonDispatchData SetDefault(const reverse_sequence_params& params, const optional_params&) const; + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + virtual ParamsKey GetSupportedKey() const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_selector.cpp new file mode 100644 index 0000000..490406b --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_selector.cpp @@ -0,0 +1,31 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
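For context on the SEQ_AXIS and BATCH_AXIS constants emitted above: for each index b along the batch axis, the first seq_lengths[b] elements along the sequence axis are reversed and the tail is copied through unchanged. A host-side reference of the 2D case (illustrative only; the OpenCL kernel generalizes this over the full bfyx tensor and receives the lengths as the second input):

    #include <utility>
    #include <vector>

    // data is rows x cols, batch along rows, sequence along cols.
    // Row b has its first len[b] elements reversed in place.
    void ReverseSequence2D(std::vector<float>& data, int rows, int cols,
                           const std::vector<int>& len) {
        for (int b = 0; b < rows; ++b) {
            const int n = len[b];
            for (int i = 0; i < n / 2; ++i)
                std::swap(data[b * cols + i], data[b * cols + (n - 1 - i)]);
        }
    }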
+*/ + +#include "reverse_sequence_kernel_selector.h" +#include "reverse_sequence_kernel_ref.h" + +namespace kernel_selector { + + reverse_sequence_kernel_selector::reverse_sequence_kernel_selector() + { + Attach(); + } + + KernelsData reverse_sequence_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const + { + return GetNaiveBestKernel(params, options, KernelType::REVERSE_SEQUENCE); + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_selector.h new file mode 100644 index 0000000..18067f2 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_selector.h @@ -0,0 +1,37 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector +{ + class reverse_sequence_kernel_selector : public kernel_selector_base + { + public: + static reverse_sequence_kernel_selector &Instance() { + static reverse_sequence_kernel_selector instance_; + return instance_; + } + + reverse_sequence_kernel_selector(); + + virtual ~reverse_sequence_kernel_selector() {} + + virtual KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_base.cpp new file mode 100644 index 0000000..14523fa --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_base.cpp @@ -0,0 +1,83 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "roi_pooling_kernel_base.h" + +namespace kernel_selector { + + static ROIPoolingKernelBase::DispatchData SetDefault(const roi_pooling_params& params) + { + ROIPoolingKernelBase::DispatchData kd; + + kd.fp16UnitUsed = (params.inputs[0].GetDType() == Datatype::F16); + + // Determine global work sizes. + kd.gws0 = params.output.LogicalSize(); + kd.gws1 = 1; + kd.gws2 = 1; + + // Find largest positive local work size that is divider for global work size. 
+ kd.lws0 = std::min(std::max(kd.gws0, static_cast<size_t>(1)), static_cast<size_t>(32)); + while (kd.gws0 % kd.lws0 != 0) + { + --kd.lws0; + } + kd.lws1 = 1; + kd.lws2 = 1; + + return kd; + } + + JitConstants ROIPoolingKernelBase::GetJitConstants(const roi_pooling_params& rp) const + { + JitConstants jit = MakeBaseParamsJitConstants(rp); + + jit.AddConstants({ + MakeJitConstant("POOLED_HEIGHT", rp.pooledHeight), + MakeJitConstant("POOLED_WIDTH", rp.pooledWidth), + MakeJitConstant("SPATIAL_SCALE", rp.spatialScale), + MakeJitConstant(toString(rp.mode) + "_POOLING", 1), + }); + + return jit; + } + + KernelsData ROIPoolingKernelBase::GetCommonKernelsData(const Params& params, const optional_params& options, float estimatedTime) const + { + assert(params.GetType() == KernelType::ROI_POOLING); + const roi_pooling_params& orgParams = static_cast<const roi_pooling_params&>(params); + + if (orgParams.activation.function != ActivationFunction::NONE) + { + return{}; + } + + DispatchData runInfo = SetDefault(orgParams); + KernelData kd = KernelData::Default<roi_pooling_params>(params); + + auto cldnn_jit = GetJitConstants(orgParams); + auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); + + auto& kernel = kd.kernels[0]; + FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point); + kernel.arguments.push_back({ ArgumentDescriptor::Types::INPUT, 1 }); + + kd.estimatedTime = estimatedTime; + + return{ kd }; + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_base.h new file mode 100644 index 0000000..ca27f47 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_base.h @@ -0,0 +1,75 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
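The local-size search in SetDefault() above flattens the output to a single dimension, then walks downward from 32 to the largest value that divides the global size, since OpenCL rejects launches where the local size does not divide the global size. The same logic extracted as a stand-alone helper, purely for readability:

    #include <cstddef>

    // Largest lws in [1, 32] that divides gws; always terminates because
    // 1 divides everything. This mirrors the loop in SetDefault().
    std::size_t LargestLocalSize(std::size_t gws) {
        std::size_t lws = gws < 32 ? (gws == 0 ? 1 : gws) : 32;
        while (gws % lws != 0)
            --lws;
        return lws;
    }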
+*/ + +#pragma once + +#include +#include "common_kernel_base.h" + +namespace kernel_selector +{ + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // roi_pooling_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct roi_pooling_params : public base_params + { + roi_pooling_params() : base_params(KernelType::ROI_POOLING) {} + + PoolType mode = PoolType::MAX; + bool position_sensitive = false; + int pooledWidth = 0; + int pooledHeight = 0; + int spatial_bins_x = 1; + int spatial_bins_y = 1; + float spatialScale = 1.f; + + virtual ParamsKey GetParamsKey() const + { + auto k = base_params::GetParamsKey(); + if (position_sensitive) + { + k.EnablePositionSensitivePooling(); + } + k.EnablePoolType(mode); + + return k; + } + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // roi_pooling_optional_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct roi_pooling_optional_params : optional_params + { + roi_pooling_optional_params() : optional_params(KernelType::ROI_POOLING) {} + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // ROIPoolingKernelBase + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + class ROIPoolingKernelBase : public common_kernel_base + { + public: + using common_kernel_base::common_kernel_base; + virtual ~ROIPoolingKernelBase() {}; + + using DispatchData = CommonDispatchData; + + KernelsData GetCommonKernelsData(const Params& params, const optional_params& options, float estimatedTime) const; + protected: + virtual JitConstants GetJitConstants(const roi_pooling_params& params) const; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_ps_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_ps_ref.cpp new file mode 100644 index 0000000..ba22e21 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_ps_ref.cpp @@ -0,0 +1,55 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "roi_pooling_kernel_ps_ref.h" + +namespace kernel_selector { + + ParamsKey PSROIPoolingKernelRef::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::brfyx); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDifferentTypes(); + k.EnablePoolType(PoolType::AVG); + k.EnablePoolType(PoolType::BILINEAR); + k.EnablePositionSensitivePooling(); + return k; + } + + JitConstants PSROIPoolingKernelRef::GetJitConstants(const roi_pooling_params& rp) const + { + JitConstants jit = ROIPoolingKernelBase::GetJitConstants(rp); + + jit.AddConstants({ MakeJitConstant("SPATIAL_BINS_X", rp.spatial_bins_x), + MakeJitConstant("SPATIAL_BINS_Y", rp.spatial_bins_y), + }); + + return jit; + } + + KernelsData PSROIPoolingKernelRef::GetKernelsData(const Params& params, const optional_params& options) const + { + return GetCommonKernelsData(params, options, FORCE_PRIORITY_9); + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_ps_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_ps_ref.h new file mode 100644 index 0000000..280f950 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_ps_ref.h @@ -0,0 +1,40 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#pragma once + +#include "roi_pooling_kernel_base.h" + +namespace kernel_selector +{ + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // PSROIPoolingKernelRef + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + class PSROIPoolingKernelRef : public ROIPoolingKernelBase + { + public: + PSROIPoolingKernelRef() : ROIPoolingKernelBase("roi_pooling_ps_ref") {} + virtual ~PSROIPoolingKernelRef() {} + + using DispatchData = CommonDispatchData; + + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + virtual ParamsKey GetSupportedKey() const override; + protected: + JitConstants GetJitConstants(const roi_pooling_params& params) const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_ref.cpp index 1e5a0f5..375db2d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_ref.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016-2018 Intel Corporation +// Copyright (c) 2016-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -27,81 +27,18 @@ namespace kernel_selector { k.EnableOutputDataType(Datatype::F32); k.EnableInputLayout(DataLayout::bfyx); k.EnableOutputLayout(DataLayout::brfyx); - k.EnablePoolType(PoolType::MAX); - k.EnablePoolType(PoolType::AVG); - k.EnablePoolType(PoolType::BILINEAR); k.EnableTensorOffset(); k.EnableTensorPitches(); k.EnableBatching(); k.EnableDifferentTypes(); + k.EnablePoolType(PoolType::MAX); + k.EnablePoolType(PoolType::AVG); + k.EnablePoolType(PoolType::BILINEAR); return k; } - static ROIPoolingKernelRef::DispatchData SetDefault(const roi_pooling_params& params) - { - ROIPoolingKernelRef::DispatchData kd; - - kd.fp16UnitUsed = (params.inputs[0].GetDType() == Datatype::F16); - - // Determine global work sizes. - kd.gws0 = params.output.LogicalSize(); - kd.gws1 = 1; - kd.gws2 = 1; - - // Find largest positive local work size that is divider for global work size. 
- kd.lws0 = std::min(std::max(kd.gws0, static_cast<size_t>(1)), static_cast<size_t>(32)); - while (kd.gws0 % kd.lws0 != 0) - { - --kd.lws0; - } - kd.lws1 = 1; - kd.lws2 = 1; - - return kd; - } - - JitConstants ROIPoolingKernelRef::GetJitConstants(const roi_pooling_params& rp) const - { - JitConstants jit = MakeBaseParamsJitConstants(rp); - - jit.AddConstants({ - MakeJitConstant("POOLED_HEIGHT", rp.pooledHeight), - MakeJitConstant("POOLED_WIDTH", rp.pooledWidth), - MakeJitConstant("SPATIAL_SCALE", rp.spatialScale), - MakeJitConstant("GROUP_SIZE", rp.groupSize), - MakeJitConstant(toString(rp.mode) + "_POOLING", 1), - }); - - jit.AddConstants({ - MakeJitConstant("USE_OLD_SCALE_AND_ROUNDING", rp.groupSize == 0) - }); - - return jit; - } - KernelsData ROIPoolingKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { - assert(params.GetType() == KernelType::ROI_POOLING); - const roi_pooling_params& orgParams = static_cast<const roi_pooling_params&>(params); - - if (orgParams.activationFunc != ActivationFunction::NONE) - { - return{}; - } - - DispatchData runInfo = SetDefault(orgParams); - KernelData kd = KernelData::Default<roi_pooling_params>(params); - - auto cldnn_jit = GetJitConstants(orgParams); - auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options); - auto jit = CreateJit(kernelName, cldnn_jit, entry_point); - - auto& kernel = kd.kernels[0]; - FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point); - kernel.arguments.push_back({ ArgumentDescriptor::Types::INPUT, 1 }); - - kd.estimatedTime = FORCE_PRIORITY_9; - - return{ kd }; + return GetCommonKernelsData(params, options, FORCE_PRIORITY_9); } } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_ref.h index 1bc3c60..e4c8934 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_ref.h @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License.
@@ -16,52 +16,25 @@ #pragma once -#include "common_kernel_base.h" +#include "roi_pooling_kernel_base.h" namespace kernel_selector { - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // roi_pooling_params - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - struct roi_pooling_params : public base_params - { - roi_pooling_params() : base_params(KernelType::ROI_POOLING) {} - - PoolType mode = PoolType::MAX; - size_t pooledWidth = 0; - size_t pooledHeight = 0; - size_t groupSize = 0; - float spatialScale = 1.f; - - virtual ParamsKey GetParamsKey() const - { - return base_params::GetParamsKey(); - } - }; - - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // roi_pooling_optional_params - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - struct roi_pooling_optional_params : optional_params - { - roi_pooling_optional_params() : optional_params(KernelType::ROI_POOLING) {} - }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // ROIPoolingKernelRef //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - class ROIPoolingKernelRef : public common_kernel_base + class ROIPoolingKernelRef : public ROIPoolingKernelBase { public: - ROIPoolingKernelRef() : common_kernel_base("roi_pooling_ref") {} + ROIPoolingKernelRef() : ROIPoolingKernelBase("roi_pooling_ref") {} virtual ~ROIPoolingKernelRef() {} using DispatchData = CommonDispatchData; virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: - JitConstants GetJitConstants(const roi_pooling_params& params) const; + virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_selector.cpp index 9dbb71c..969362d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_selector.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_selector.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -16,16 +16,18 @@ #include "roi_pooling_kernel_selector.h" #include "roi_pooling_kernel_ref.h" - -namespace kernel_selector +#include "roi_pooling_kernel_ps_ref.h" + +namespace kernel_selector { roi_pooling_kernel_selector::roi_pooling_kernel_selector() { Attach(); + Attach(); } KernelsData roi_pooling_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const { return GetNaiveBestKernel(params, options, KernelType::ROI_POOLING); } -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.cpp index 61eddda..e90f591 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.cpp @@ -57,7 +57,7 @@ namespace kernel_selector auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; - FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, ROUND_ROBIN, true, !orgParams.bias.empty(), 2); + FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, DEFAULT, true, !orgParams.bias.empty(), 2); if (orgParams.use_momentum) { diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.h index 59ed5d7..3538572 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.h @@ -26,6 +26,7 @@ namespace kernel_selector { ScaleGradWeightsKernelRef() : ScaleGradWeightsKernelBase("scale_grad_weights_gpu_ref") {} virtual ~ScaleGradWeightsKernelRef() {} + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/select/select_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/select/select_kernel_base.cpp index 09b3a01..43d2e11 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/select/select_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/select/select_kernel_base.cpp @@ -167,7 +167,7 @@ namespace kernel_selector kernel.workGroups.global = { runInfo.gws0, runInfo.gws1, runInfo.gws2 }; kernel.workGroups.local = { runInfo.lws0, runInfo.lws1, runInfo.lws2 }; - kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, ROUND_ROBIN); + kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT); kernel.arguments = GetArgsDesc((uint32_t)newParams.inputs.size(), false, false); kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/select/select_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/select/select_kernel_ref.h index a72c0e9..4663a38 100644 --- 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/select/select_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/select/select_kernel_ref.h @@ -27,9 +27,9 @@ namespace kernel_selector virtual ~SelectKernelRef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; bool Validate(const Params& p, const optional_params& o) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_ref.cpp new file mode 100644 index 0000000..fd5528f --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_ref.cpp @@ -0,0 +1,102 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "shuffle_channels_kernel_ref.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector +{ + ParamsKey ShuffleChannelsKernelRef::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + return k; + } + + CommonDispatchData ShuffleChannelsKernelRef::SetDefault(const shuffle_channels_params& params, const optional_params&) const + { + CommonDispatchData runInfo; + + std::vector<size_t> global = { params.output.Batch().v, params.output.Feature().v, params.output.Y().v * params.output.X().v }; + + auto local = GetOptimalLocalWorkGroupSizes(global); + + runInfo.gws0 = global[0]; + runInfo.gws1 = global[1]; + runInfo.gws2 = global[2]; + + runInfo.lws0 = local[0]; + runInfo.lws1 = local[1]; + runInfo.lws2 = local[2]; + + return runInfo; + } + + JitConstants ShuffleChannelsKernelRef::GetJitConstants(const shuffle_channels_params& params) const + { + JitConstants jit = MakeBaseParamsJitConstants(params); + + jit.AddConstant(MakeJitConstant("GROUPS_NUMBER", params.group)); + + auto getDimSizeByAxis = [](const shuffle_channels_params& params) -> size_t { + switch (params.axis) { + case 0: + return params.inputs[0].Batch().v; + case 1: + return params.inputs[0].Feature().v; + case 2: + return params.inputs[0].Y().v; + case 3: + return params.inputs[0].X().v; + } + return 0; + }; + + jit.AddConstant(MakeJitConstant("GROUP_SIZE", getDimSizeByAxis(params) / params.group)); + jit.AddConstant(MakeJitConstant("AXIS", params.axis)); + + return jit; + } + + KernelsData ShuffleChannelsKernelRef::GetKernelsData(const Params& params, const
optional_params& options) const + { + KernelData kd = KernelData::Default(params); + shuffle_channels_params& newParams = *static_cast(kd.params.get()); + + assert(params.GetType() == KernelType::SHUFFLE_CHANNELS); + + auto runInfo = SetDefault(newParams, options); + auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); + auto cldnn_jit = GetJitConstants(newParams); + std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + + auto& kernel = kd.kernels[0]; + + FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point); + + kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE; + + return{ kd }; + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_ref.h new file mode 100644 index 0000000..6f6f3d0 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_ref.h @@ -0,0 +1,57 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "common_kernel_base.h" + +namespace kernel_selector +{ + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // shuffle_channels_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct shuffle_channels_params : public base_params + { + shuffle_channels_params() : base_params(KernelType::SHUFFLE_CHANNELS) {} + + int32_t group; + int32_t axis; + + virtual ParamsKey GetParamsKey() const + { + return base_params::GetParamsKey(); + } + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // shuffle_channels_optional_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct shuffle_channels_optional_params : optional_params + { + shuffle_channels_optional_params() : optional_params(KernelType::SHUFFLE_CHANNELS) {} + }; + + class ShuffleChannelsKernelRef : public common_kernel_base + { + public: + ShuffleChannelsKernelRef() : common_kernel_base("shuffle_channels_ref") {} + virtual ~ShuffleChannelsKernelRef() {} + virtual JitConstants GetJitConstants(const shuffle_channels_params& params) const; + virtual CommonDispatchData SetDefault(const shuffle_channels_params& params, const optional_params&) const; + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + virtual ParamsKey GetSupportedKey() const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_selector.cpp 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_selector.cpp new file mode 100644 index 0000000..41088de --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_selector.cpp @@ -0,0 +1,31 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "shuffle_channels_kernel_selector.h" +#include "shuffle_channels_kernel_ref.h" + +namespace kernel_selector { + + shuffle_channels_kernel_selector::shuffle_channels_kernel_selector() + { + Attach<ShuffleChannelsKernelRef>(); + } + + KernelsData shuffle_channels_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const + { + return GetNaiveBestKernel(params, options, KernelType::SHUFFLE_CHANNELS); + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_selector.h new file mode 100644 index 0000000..dadc63f --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_selector.h @@ -0,0 +1,37 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+*/ + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector +{ + class shuffle_channels_kernel_selector : public kernel_selector_base + { + public: + static shuffle_channels_kernel_selector &Instance() { + static shuffle_channels_kernel_selector instance_; + return instance_; + } + + shuffle_channels_kernel_selector(); + + virtual ~shuffle_channels_kernel_selector() {} + + virtual KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_base.cpp index 4d2c36d..e0f93f0 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_base.cpp @@ -105,7 +105,7 @@ namespace kernel_selector const softmax_params& params = static_cast(p); const auto& input = params.inputs[0]; - if (params.activationFunc != ActivationFunction::NONE) + if (params.activation.function != ActivationFunction::NONE) { return false; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_bf.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_bf.h index 52a30f5..5f96abf 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_bf.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_bf.h @@ -28,7 +28,9 @@ namespace kernel_selector virtual ~SoftmaxKernel_bf() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; DispatchData SetDefault(const softmax_params& params, const optional_params& optParams) const override; + + protected: + virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_fb.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_fb.h index 461670a..bb9c45c 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_fb.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_fb.h @@ -28,10 +28,10 @@ namespace kernel_selector virtual ~SoftmaxKernel_fb() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual ParamsKey GetSupportedKey() const override; protected: + virtual ParamsKey GetSupportedKey() const override; virtual bool Validate(const Params& p, const optional_params& o) const override; DispatchData SetDefault(const softmax_params& params, const optional_params& optParams) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_items_class_optimized.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_items_class_optimized.h index 3f3bf6f..51afb84 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_items_class_optimized.h +++ 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_items_class_optimized.h @@ -28,10 +28,10 @@ namespace kernel_selector virtual ~SoftmaxKerneItemsClassOptimized() {} KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; protected: + ParamsKey GetSupportedKey() const override; JitConstants GetJitConstants(const softmax_params& params, DispatchData kd) const override; DispatchData SetDefault(const softmax_params& params, const optional_params& optParams) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_ref.h index f517a42..4c6fd10 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax/softmax_kernel_ref.h @@ -28,9 +28,9 @@ namespace kernel_selector virtual ~SoftmaxKernelRef() {} KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; protected: + ParamsKey GetSupportedKey() const override; DispatchData SetDefault(const softmax_params& params, const optional_params& optParams) const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.h index a30be9b..b9494e6 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.h @@ -28,6 +28,8 @@ namespace kernel_selector virtual ~SoftmaxLossGradKernelRef() {} KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp new file mode 100644 index 0000000..c34d554 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp @@ -0,0 +1,104 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "strided_slice_kernel_ref.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector +{ + ParamsKey StridedSliceKernelRef::GetSupportedKey() const + { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + return k; + } + + CommonDispatchData StridedSliceKernelRef::SetDefault(const strided_slice_params& params, const optional_params&) const + { + CommonDispatchData runInfo; + std::vector gws; + + // If the new_axis_mask is set, then begin, end, and stride are ignored + // and a new length 1 dimension is adding. Input data just copying to output + // TODO: remove data copying in case where only shape size changing + if (params.new_axis_mask.size() != 0) + gws = { params.inputs[0].Batch().v, params.inputs[0].Feature().v, params.inputs[0].Y().v * params.inputs[0].X().v }; + else + gws = { params.output.Batch().v, params.output.Feature().v, params.output.Y().v * params.output.X().v }; + + auto lws = GetOptimalLocalWorkGroupSizes(gws); + + runInfo.gws0 = gws[0]; + runInfo.gws1 = gws[1]; + runInfo.gws2 = gws[2]; + + runInfo.lws0 = lws[0]; + runInfo.lws1 = lws[1]; + runInfo.lws2 = lws[2]; + + return runInfo; + } + + JitConstants StridedSliceKernelRef::GetJitConstants(const strided_slice_params& params) const + { + JitConstants jit = MakeBaseParamsJitConstants(params); + + auto makeJitConstForParam = [](JitConstants& jit, const std::string name, const std::vector vec) { + jit.AddConstant(MakeJitConstant(name + "_SIZES", vec)); + jit.AddConstant(MakeJitConstant(name + "_BATCH", vec[0])); + jit.AddConstant(MakeJitConstant(name + "_FEATURE", vec[1])); + jit.AddConstant(MakeJitConstant(name + "_Y", vec[2])); + jit.AddConstant(MakeJitConstant(name + "_X", vec[3])); + }; + + makeJitConstForParam(jit, "SLICE_BEGIN", params.striding_params[0]); + makeJitConstForParam(jit, "SLICE_END", params.striding_params[1]); + makeJitConstForParam(jit, "SLICE_STEPS", params.striding_params[2]); + + jit.AddConstant(MakeJitConstant("NEW_AXIS_MODE", std::find(params.new_axis_mask.begin(), params.new_axis_mask.end(), 1) != params.new_axis_mask.end())); + + return jit; + } + + KernelsData StridedSliceKernelRef::GetKernelsData(const Params& params, const optional_params& options) const + { + KernelData kd = KernelData::Default(params); + strided_slice_params& newParams = *static_cast(kd.params.get()); + + assert(params.GetType() == KernelType::STRIDED_SLICE); + + auto runInfo = SetDefault(newParams, options); + auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); + auto cldnn_jit = GetJitConstants(newParams); + std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + + auto& kernel = kd.kernels[0]; + + FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point); + + kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE; + + return{ kd }; + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.h new file mode 100644 index 0000000..159e658 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.h @@ -0,0 +1,61 @@ +/* +// Copyright (c) 2019 Intel 
Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "common_kernel_base.h" + +namespace kernel_selector +{ + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // strided_slice_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct strided_slice_params : public base_params + { + strided_slice_params() : base_params(KernelType::STRIDED_SLICE) {} + + std::vector<std::vector<int32_t>> striding_params; + std::vector<uint8_t> begin_mask; + std::vector<uint8_t> end_mask; + std::vector<uint8_t> ellipsis_mask; + std::vector<uint8_t> new_axis_mask; + std::vector<uint8_t> shrink_axis_mask; + + virtual ParamsKey GetParamsKey() const + { + return base_params::GetParamsKey(); + } + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // strided_slice_optional_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct strided_slice_optional_params : optional_params + { + strided_slice_optional_params() : optional_params(KernelType::STRIDED_SLICE) {} + }; + + class StridedSliceKernelRef : public common_kernel_base + { + public: + StridedSliceKernelRef() : common_kernel_base("strided_slice_ref") {} + virtual ~StridedSliceKernelRef() {} + virtual JitConstants GetJitConstants(const strided_slice_params& params) const; + virtual CommonDispatchData SetDefault(const strided_slice_params& params, const optional_params&) const; + virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + virtual ParamsKey GetSupportedKey() const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_selector.cpp new file mode 100644 index 0000000..7dfba71 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_selector.cpp @@ -0,0 +1,31 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+*/ + +#include "strided_slice_kernel_selector.h" +#include "strided_slice_kernel_ref.h" + +namespace kernel_selector { + + strided_slice_kernel_selector::strided_slice_kernel_selector() + { + Attach(); + } + + KernelsData strided_slice_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const + { + return GetNaiveBestKernel(params, options, KernelType::STRIDED_SLICE); + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_selector.h new file mode 100644 index 0000000..6f983b1 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_selector.h @@ -0,0 +1,37 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector +{ + class strided_slice_kernel_selector : public kernel_selector_base + { + public: + static strided_slice_kernel_selector &Instance() { + static strided_slice_kernel_selector instance_; + return instance_; + } + + strided_slice_kernel_selector(); + + virtual ~strided_slice_kernel_selector() {} + + virtual KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; + }; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/tile/tile_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/tile/tile_kernel_ref.h index 967dab8..4f08d7a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/tile/tile_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/tile/tile_kernel_ref.h @@ -53,6 +53,8 @@ namespace kernel_selector virtual JitConstants GetJitConstants(const tile_params& params) const; virtual CommonDispatchData SetDefault(const tile_params& params, const optional_params&) const; virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/upsampling/upsampling_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/upsampling/upsampling_kernel_base.cpp index 889daf8..ae696f8 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/upsampling/upsampling_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/upsampling/upsampling_kernel_base.cpp @@ -76,7 +76,7 @@ namespace kernel_selector kernel.workGroups.global = { out.X().v, out.Y().v, out.Feature().v * out.Batch().v }; kernel.workGroups.local = GetOptimalLocalWorkGroupSizes(kernel.workGroups.global); - kernel.kernelString = GetKernelString(kernelName, jit, entry_point, 
params.engineInfo, ROUND_ROBIN); + kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT); kernel.arguments = GetArgsDesc((uint32_t)newParams.inputs.size(), false, false); kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/upsampling/upsampling_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/upsampling/upsampling_kernel_ref.h index b7b5596..de27559 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/upsampling/upsampling_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/upsampling/upsampling_kernel_ref.h @@ -27,6 +27,8 @@ namespace kernel_selector virtual ~UpSamplingKernelRef() {} virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + + protected: virtual ParamsKey GetSupportedKey() const override; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/auto_tuner.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/auto_tuner.cpp index d9ccd15..307390d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/auto_tuner.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/auto_tuner.cpp @@ -15,143 +15,106 @@ */ #include "auto_tuner.h" -#include "auto_tuner_offline.h" #include <iostream> #include <fstream> #include <sstream> +#include <memory> +#include "istreamwrapper.h" +#include "stringbuffer.h" +#include "prettywriter.h" - -namespace kernel_selector + +namespace kernel_selector { - std::tuple<std::string, int> AutoTuner::LoadKernelOnline(const TuningMode tuningMode, const std::string& tuningFilePath, const std::string& deviceID, const std::string& driverVersion, const std::string& hostVersion, const std::string& hash) + std::tuple<std::string, int> AutoTuner::LoadKernelOnline(const TuningMode tuningMode, const std::string& cacheFilePath, const uint32_t computeUnitsCount, const std::string& hash) { std::lock_guard<std::mutex> lock(mutex); - - //First, check if the tuning file has been already loaded to cache - auto const& tuningFileCache = onlineCache.find(tuningFilePath); - if (tuningFileCache == onlineCache.end()) + rapidjson::Document cacheData; + std::ifstream tuningFile(cacheFilePath); + if (tuningFile && tuningFile.good()) { - // Load tuning file to cache - onlineCache[tuningFilePath] = {}; - - std::ifstream tuningFile(tuningFilePath); - std::string cachedDeviceId; - std::string cachedDriverVersion; - std::string cachedHostVersion; - std::string cachedhash; - std::string cachedkernelName; - int cachedIndex; - std::string line; - - if (tuningFile) // Tuning file exists + rapidjson::IStreamWrapper isw{ tuningFile }; + cacheData.ParseStream(isw); + } + else // Tuning file doesn't exist + { + if (tuningMode == TuningMode::TUNING_USE_CACHE) { - // Read device ID - tuningFile >> cachedDeviceId; - if (!tuningFile.good() || (cachedDeviceId.compare(deviceID) != 0)) - { - throw std::runtime_error("Tuning file bad structure or wrong device ID. Re-generate cache in TUNE_AND_CACHE mode."); - } - - // Read driver version - tuningFile >> cachedDriverVersion; - if (!tuningFile.good() || (cachedDriverVersion.compare(driverVersion) != 0)) - { - throw std::runtime_error("Tuning file bad structure or wrong driver version. Re-generate cache in TUNE_AND_CACHE mode."); - } + throw std::runtime_error("Tuning file: " + cacheFilePath + " could not be read!
Must provide a valid cache file in USE_CACHE mode."); + } - // Read host version - tuningFile >> cachedHostVersion; - if (!tuningFile.good() || (cachedHostVersion.compare(hostVersion) != 0)) - { - throw std::runtime_error("Tuning file bad structure or wrong host version. Re-generate cache in TUNE_AND_CACHE mode."); - } + // Create a new tuning file and write the versions + std::ofstream newTuningFile(cacheFilePath, std::ofstream::out); - // Read optimal kernel/config data - while (std::getline(tuningFile, line)) - { - if (line.empty()) - { - continue; - } - std::istringstream iss(line); - iss >> cachedhash >> cachedkernelName >> cachedIndex; - if (iss.fail()) - { - throw std::runtime_error("Tuning file bad structure. Re-generate cache in TUNE_AND_CACHE mode."); - } + } + tuningFile.close(); - // Update tuning cache - onlineCache[tuningFilePath].td[cachedhash] = std::make_tuple(cachedkernelName, cachedIndex); - } + onlineCache = std::make_shared(std::move(cacheData)); - tuningFile.close(); - } - else // Tuning file doesn't exist + // Tuning file is loaded + auto computeUnitsStr = std::to_string(computeUnitsCount); + if (!onlineCache->IsNull()) + { + auto cacheObject = onlineCache->GetObject(); + if (onlineCache->HasMember(computeUnitsStr.c_str())) { - if (tuningMode == TuningMode::TUNING_USE_CACHE) + if (cacheObject[computeUnitsStr.c_str()].HasMember(hash.c_str())) { - throw std::runtime_error("Tuning file: " + tuningFilePath + " could not be read! Must provide a valid cache file in USE_CACHE mode."); + const rapidjson::Value& prog = cacheObject[computeUnitsStr.c_str()][hash.c_str()]; + return std::make_tuple(prog[0].GetString(), prog[1].GetInt()); } - - // Create a new tuning file and write the versions - std::ofstream newTuningFile(tuningFilePath, std::ofstream::out); - - newTuningFile << deviceID << "\n"; - newTuningFile << driverVersion << "\n"; - newTuningFile << hostVersion << "\n"; } } + return std::make_pair("", 0); + + } - // Tuning file is loaded - auto const& tuningFileData = onlineCache[tuningFilePath]; - auto const& hashData = tuningFileData.td.find(hash); - if (hashData != tuningFileData.td.end()) + void AutoTuner::StoreKernel(const std::string& cacheFilePath, const std::string& hash, std::string implementationName, const int tuneIndex, const uint32_t computeUnitsCount) + { + std::lock_guard lock(mutex); + auto computeUnitsStr = std::to_string(computeUnitsCount); + rapidjson::Document::AllocatorType& allocator = onlineCache->GetAllocator(); + rapidjson::Value dataArray(rapidjson::kArrayType); + rapidjson::Value hashStr(rapidjson::kStringType); + hashStr.Set(hash.c_str(), allocator); + dataArray.PushBack(rapidjson::Value().Set(implementationName.c_str(),allocator) , allocator); + dataArray.PushBack(rapidjson::Value().SetInt(tuneIndex), allocator); + + rapidjson::Value newVal(rapidjson::kObjectType); + newVal.SetObject(); + if (onlineCache->IsNull()) { - // Tuning data exists for this hash. - return hashData->second; + onlineCache->Parse("{}"); } - else + if (!onlineCache->HasMember(computeUnitsStr.c_str())) { - // Tuning data doesn't exists for this hash - on-line tuning is needed. 
- return std::make_pair("", 0); + onlineCache->AddMember(rapidjson::Value(computeUnitsStr.c_str(), allocator), newVal, allocator); } - } - - void AutoTuner::StoreKernel(const std::string& tuningFilePath, const std::string& hash, const std::string& implementationName, const int tuneIndex) - { - std::lock_guard lock(mutex); - // Add the new tuning data to cache - onlineCache[tuningFilePath].td[hash] = std::make_tuple(implementationName, tuneIndex); + auto cache = onlineCache->GetObject(); + cache[computeUnitsStr.c_str()].AddMember(hashStr, dataArray, allocator); - // Add the new tuning data to tuning file - std::ofstream cachedKernelsFile(tuningFilePath, std::ofstream::out | std::ofstream::app); - if (!cachedKernelsFile.good()) - { - throw std::runtime_error("Tuning file: " + tuningFilePath + " could not be written!"); - } - cachedKernelsFile << hash << " "; - cachedKernelsFile << implementationName << " "; - cachedKernelsFile << tuneIndex << "\n"; + std::ofstream cachedKernelsFile(cacheFilePath); + rapidjson::StringBuffer buffer(0, 1024); + rapidjson::PrettyWriter writer(buffer); + onlineCache->Accept(writer); + auto temp = buffer.GetString(); + cachedKernelsFile << temp; cachedKernelsFile.close(); } - std::tuple AutoTuner::LoadKernelOffline(const std::string& deviceID, const std::string& hash) + + std::tuple AutoTuner::LoadKernelOffline(std::shared_ptr deviceCache, const std::string& hash) { - auto const& deviceCache = auto_tuner_offline::get_instance(deviceID)->get_tuning_data(); - if (deviceCache.td.empty()) - { - return std::make_pair("", 0); - } - auto const& deviceCacheData = deviceCache.td; - auto const& hashData = deviceCacheData.find(hash); - if (hashData == deviceCacheData.end()) + if (!deviceCache->IsNull()) { - return std::make_pair("", 0); - } - else - { - return hashData->second; + auto cache = deviceCache->GetObject(); + if (deviceCache->HasMember(hash.c_str())) + { + const rapidjson::Value& prog = cache[hash.c_str()]; + return std::make_tuple(prog[0].GetString(), prog[1].GetInt()); + } } + return std::make_tuple("", 0); } } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/auto_tuner.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/auto_tuner.h index 864e1ce..7e4a7cd 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/auto_tuner.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/auto_tuner.h @@ -19,25 +19,23 @@ #include #include #include -#include "kernel_selector_common.h" +#include "kernel_selector_common.h" +#include "document.h" + namespace kernel_selector { - struct tuning_data // this could be replaced with - { - std::map> td; - }; class AutoTuner { public: AutoTuner() = default; - std::tuple LoadKernelOnline(const TuningMode tuningMode, const std::string& tuningFilePath, const std::string& deviceID, const std::string& driverVersion, const std::string& hostVersion, const std::string& hash); - void StoreKernel(const std::string& tuningFilePath, const std::string& hash, const std::string& implementationName, const int tuneIndex); - std::tuple LoadKernelOffline(const std::string& deviceID, const std::string& hash); + std::tuple LoadKernelOnline(const TuningMode tuningMode, const std::string& tuningFilePath, const uint32_t computeUnitsCount, const std::string& hash); + void StoreKernel(const std::string& tuningFilePath, const std::string& hash, std::string implementationName, const int tuneIndex, const uint32_t computeUnitsCount); + std::tuple LoadKernelOffline(std::shared_ptr cache, const std::string& hash); private: 
- std::map<std::string, tuning_data> onlineCache; // Tuning file name -> kernel/config per hash (hash -> [implementation name, tuning index]) + std::shared_ptr<rapidjson::Document> onlineCache; // Tuning file name -> kernel/config per hash (hash -> [implementation name, tuning index]) std::mutex mutex; // Mutex to synchronize cache updates /* diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/auto_tuner_offline.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/auto_tuner_offline.cpp deleted file mode 100644 index 062138a..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/auto_tuner_offline.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "auto_tuner.h" -#include "auto_tuner_offline.h" -namespace kernel_selector -{ - std::shared_ptr<auto_tuner_offline> auto_tuner_offline::instance = 0; - std::mutex auto_tuner_offline::mutex; - - auto_tuner_offline::auto_tuner_offline(const std::string& hw_id) - { - std::string temp_hw_id = hw_id; - // TODO: this is temporary solution of cases where user has non-tuned configuration. needs to implement better logic - // i.e. create table with number of eu's configuration that will point to common cache. - if (sku_cache_fillers.count(hw_id) == 0) - temp_hw_id = "0x1912"; - sku_cache_fillers.at(temp_hw_id)(t_data); - } - - std::shared_ptr<auto_tuner_offline> auto_tuner_offline::get_instance(const std::string& hw_id) - { - std::lock_guard<std::mutex> lock(mutex); - if (instance == nullptr) - { - instance = std::make_shared<auto_tuner_offline>(auto_tuner_offline(hw_id)); - } - return instance; - } -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/auto_tuner_offline.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/auto_tuner_offline.h deleted file mode 100644 index b7008d6..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/auto_tuner_offline.h +++ /dev/null @@ -1,73 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
-*/ - -#pragma once - -#include <memory> -#include <mutex> -#include "auto_tuner.h" -#include "kernel_selector_common.h" - -namespace kernel_selector -{ - // SKL GT4e - void tuning_cache_193B(tuning_data&); - void tuning_cache_193B_B1_B16(tuning_data&); - void tuning_cache_193B_B8(tuning_data&); - void tuning_cache_193B_B32_B64(tuning_data&); - //SKL GT2 - void tuning_cache_1912(tuning_data&); - void tuning_cache_1912_B1_B16(tuning_data&); - void tuning_cache_1912_B8(tuning_data&); - void tuning_cache_1912_B32_B64(tuning_data&); - //KBL GT3e - void tuning_cache_5927(tuning_data&); - void tuning_cache_5927_B1(tuning_data&); - //ICL GT2 - void tuning_cache_8A52(tuning_data&); - void tuning_cache_8A52_B1_B16(tuning_data&); - //APL 10W - void tuning_cache_5A84(tuning_data&); - // Device ID for APL E3930. - void tuning_cache_5A85(tuning_data&); - - class auto_tuner_offline - { - private: - static std::shared_ptr<auto_tuner_offline> instance; - static std::mutex mutex; - auto_tuner_offline() = delete; - // this is singleton implementation, if called twice with different parameter, - // second call param will be ignored - auto_tuner_offline(const std::string& hw_id); - tuning_data t_data; - - const std::map<std::string, void(*)(tuning_data&)> sku_cache_fillers - { - { "0x193B" , tuning_cache_193B }, - { "0x1912" , tuning_cache_1912 }, - { "0x5927" , tuning_cache_5927 }, - { "0x8A52" , tuning_cache_8A52 }, - { "0x5A84" , tuning_cache_5A84 }, - { "0x5A85" , tuning_cache_5A84 }, - { "0x3184" , tuning_cache_5A84 }, - }; - - public: - static std::shared_ptr<auto_tuner_offline> get_instance(const std::string& hw_id); - tuning_data get_tuning_data() const { return t_data; } - }; -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache.json b/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache.json new file mode 100644 index 0000000..29cd72a --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache.json @@ -0,0 +1,52153 @@ +{ + "24": { + "1447947330145817080": ["convolution_gpu_bfyx_gemm_like", 2], + "7822260665195993699": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "8834376889372261135": ["convolution_gpu_bfyx_gemm_like",2], + "13198642774931141302": ["convolution_gpu_bfyx_gemm_like",1], + "14147966687151087307": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "12416108838449201073": ["convolution_gpu_bfyx_gemm_like",2], + "2981613830919028333": ["convolution_gpu_bfyx_gemm_like",2], + "8083720773671701257": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2088791910163600059": ["convolution_gpu_bfyx_gemm_like",2], + "10501842258923285952": ["convolution_gpu_bfyx_gemm_like",2], + "18377151309967754698": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "11463423774446158264": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "1907052741356343855": ["convolution_gpu_bfyx_gemm_like",2], + "17107836795750250005": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12392243022666304830": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "7210665245866922495": ["convolution_gpu_bfyx_gemm_like",1], + "15377692880620850674": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "2235284465019694961": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "17891191718277641356": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "11506567689103579136": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "13566452591890409921": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "10984167927862279982": ["convolution_gpu_bfyx_gemm_like",2], + "11442013495763732580": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "15472674298322946992":
["convolution_gpu_bfyx_gemm_like",2], + "1814045892909314674": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "10888799142381813035": ["convolution_gpu_bfyx_gemm_like",2], + "2121110886540804293": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "1544283806060575584": ["convolution_gpu_bfyx_gemm_like",2], + "3773802352282967589": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "3665566135022890729": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "2714391204826997965": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "683383121058719452": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "3963106895592011725": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4915831715914920982": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "3759057398165607194": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "6670327979947471550": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "17793292063552633023": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "18271689282126907793": ["convolution_gpu_bfyx_os_iyx_osv16",1063], + "5951936376654416075": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "956022649859563080": ["convolution_gpu_bfyx_gemm_like",1], + "8460847842045253466": ["convolution_gpu_bfyx_os_iyx_osv16",13], + "6585223640997887253": ["convolution_gpu_bfyx_gemm_like",2], + "10784073615329190425": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "3621930417735246405": ["convolution_gpu_bfyx_os_iyx_osv16",1016], + "2623687018437195679": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9562291747339451180": ["convolution_gpu_bfyx_os_iyx_osv16",53], + "9152451371616153112": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17377293745073971167": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3689722043202617487": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "15522785615618973614": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "15351724241036614758": ["convolution_gpu_bfyx_os_iyx_osv16",166], + "16626226341188424071": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "4542143431130171516": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "11430400968543668873": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "16312223896859176991": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1779941298820543013": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "1096671695414716274": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "10173283505468233128": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "9367157746678824712": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11932770338770247767": ["convolution_gpu_bfyx_os_iyx_osv16",435], + "14738573151275130683": ["convolution_gpu_bfyx_os_iyx_osv16",997], + "1938086876393565238": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "6772239376357727149": ["convolution_gpu_bfyx_os_iyx_osv16",470], + "13550435052563656432": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "9323825370872655346": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "8819268903800581706": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10058165874008941852": ["convolution_gpu_bfyx_os_iyx_osv16",948], + "12277470820821378855": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7394217382008802567": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11066913713501760080": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14947798627499698329": ["convolution_gpu_bfyx_gemm_like",2], + "18245935804520236353": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "11703557271443535142": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "4408600136502382976": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "7524311370696987092": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + 
"1143214652021653634": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "9101018613418825655": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "381149736509958403": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "6767245864232675168": ["convolution_gpu_bfyx_gemm_like",2], + "8260130048649729185": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "16234606052818596502": ["convolution_gpu_bfyx_os_iyx_osv16",466], + "1207026216972160297": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "6553736978928374036": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "9514210061704584354": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8376077531098664520": ["convolution_gpu_bfyx_gemm_like",1], + "7474639594232203854": ["convolution_gpu_bfyx_os_iyx_osv16",739], + "3041752019114501584": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "9780938731831129283": ["convolution_gpu_bfyx_gemm_like",2], + "11115684531624462986": ["convolution_gpu_bfyx_os_iyx_osv16",806], + "15591167992985613695": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6531171505861182429": ["convolution_gpu_bfyx_os_iyx_osv16",758], + "17353894529222574441": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "4381329435655511217": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "14418429155823196539": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "13597240991532942069": ["convolution_gpu_bfyx_os_iyx_osv16",1051], + "9979259596137305973": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "877436308867220589": ["convolution_gpu_bfyx_gemm_like",2], + "6407471972820516685": ["fully_connected_gpu_fb_oi_ref",2], + "12929981792125924963": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "13484950419220835364": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "16474284418841532356": ["convolution_gpu_bfyx_gemm_like",2], + "4750755523645265967": ["convolution_gpu_bfyx_gemm_like",1], + "12181607120522804433": ["convolution_gpu_bfyx_os_iyx_osv16",311], + "59739211822469868": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "13973179950424276578": ["convolution_gpu_bfyx_os_iyx_osv16",809], + "4013707396889204359": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "15860915170591763391": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "2973436171295280783": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "18202222342562516071": ["convolution_gpu_bfyx_os_iyx_osv16",510], + "1306339989221885682": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16117738994809548007": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "2324120381399737261": ["convolution_gpu_bfyx_os_iyx_osv16",522], + "16683089431066989909": ["convolution_gpu_bfyx_gemm_like",2], + "12874626654611400042": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "18424400171776141118": ["convolution_gpu_bfyx_gemm_like",2], + "15334195300678132907": ["fully_connected_gpu_bf_io_gemm",2], + "8866736221671835567": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "16294825599850364701": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "797387385159110695": ["convolution_gpu_bfyx_gemm_like",1], + "1478419046264331178": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "14872992823083730615": ["convolution_gpu_bfyx_gemm_like",2], + "593712935037568960": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "10290107543739998181": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "17306482303091342504": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "14006248791647711759": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "2108296560864415762": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11275109735493317886": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "15908673392788376468": 
["convolution_gpu_bfyx_os_iyx_osv16",627], + "1403617451623027879": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "5876880412336151866": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "15315327794058441258": ["convolution_gpu_bfyx_gemm_like",2], + "6821855018718422278": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "16190949264253468961": ["convolution_gpu_bfyx_gemm_like",1], + "15661322183507404821": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1742897526168249500": ["convolution_gpu_bfyx_gemm_like",2], + "875296362957469305": ["convolution_gpu_bfyx_gemm_like",1], + "9287404618748313247": ["convolution_gpu_bfyx_gemm_like",2], + "12164298124869114517": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "5172712078329324967": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "13842309033760176194": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15823825508128158158": ["convolution_gpu_bfyx_gemm_like",2], + "15451919862187018297": ["convolution_gpu_winograd_6x3_s1_fused",2], + "1604661321386793876": ["convolution_gpu_winograd_6x3_s1_fused",2], + "7549378486471456156": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11267742746905371769": ["convolution_gpu_bfyx_gemm_like",1], + "12675313398314286884": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "12380856644683171627": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "1643241486250690844": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "10377729875228238588": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "13002363400738122017": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "5326247361632903583": ["convolution_gpu_bfyx_gemm_like",2], + "17252449599613270108": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "17087740929472936216": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "14898892437285105327": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "11878734040194151073": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5245308722062496788": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "7271236108345900406": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14359530849521980269": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "12112853999307505628": ["convolution_gpu_bfyx_gemm_like",2], + "2543041530639980505": ["convolution_gpu_bfyx_gemm_like",1], + "11318913630213187720": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "10237524128771958432": ["convolution_gpu_bfyx_gemm_like",2], + "2908249767551054613": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "38736266675995457": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "4652136280940317116": ["convolution_gpu_bfyx_gemm_like",2], + "1103204698908514224": ["convolution_gpu_bfyx_os_iyx_osv16",510], + "11806402239500046867": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "11077876432364512822": ["fully_connected_gpu_bf_io_input_spatial",1], + "875142032423622622": ["convolution_gpu_bfyx_os_iyx_osv16",542], + "9553032671453999824": ["convolution_gpu_bfyx_os_iyx_osv16",941], + "1559798212423183813": ["convolution_gpu_bfyx_os_iyx_osv16",928], + "12242618640422208652": ["convolution_gpu_bfyx_gemm_like",2], + "10971070835319242371": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "4738743763536059708": ["convolution_gpu_bfyx_os_iyx_osv16",511], + "8971115542951085891": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "7279393739634103483": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13926122593957480821": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16383540667048742064": ["convolution_gpu_bfyx_gemm_like",2], + "9399994156762372761": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "5629373398445592781": ["convolution_gpu_bfyx_gemm_like",2], + 
"3835286851569826052": ["convolution_gpu_bfyx_gemm_like",2], + "11619548409913646265": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10112032316939871435": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "1003101267609305257": ["convolution_gpu_bfyx_gemm_like",2], + "16206791915939407806": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "4236174000795439083": ["convolution_gpu_bfyx_gemm_like",2], + "3524531620118359828": ["convolution_gpu_bfyx_os_iyx_osv16",194], + "1154228007901031779": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "7570346182940928159": ["convolution_gpu_bfyx_gemm_like",2], + "8275277322582733101": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "16783619135298589974": ["convolution_gpu_bfyx_os_iyx_osv16",807], + "13898821685774165645": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "3273748387141431306": ["convolution_gpu_bfyx_os_iyx_osv16",808], + "11507538232733291666": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "8104309105061227444": ["convolution_gpu_bfyx_os_iyx_osv16",514], + "13132804928635689780": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13384754476437374504": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "9933958860597451711": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "12643423612381102003": ["convolution_gpu_bfyx_os_iyx_osv16",831], + "16816222375242496370": ["convolution_gpu_winograd_6x3_s1_fused",2], + "265124365266629363": ["convolution_gpu_bfyx_os_iyx_osv16",459], + "10554266898346470422": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14578291812739325465": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "123251351612308092": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "9285566577169147378": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3109104171383198425": ["convolution_gpu_winograd_6x3_s1_fused",2], + "296142385116663420": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "10990741293315393791": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "14026570177552137240": ["convolution_gpu_bfyx_gemm_like",2], + "14462438074931673266": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "6996376303337512293": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "7998930863626763670": ["convolution_gpu_bfyx_gemm_like",2], + "11327228813412934262": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "11850332373794932468": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "15374625876485618845": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6981537186704688907": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15851356529373376076": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "3291180926381314705": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "6204725118764552662": ["convolution_gpu_bfyx_gemm_like",2], + "10293186062391000719": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "709835724029986012": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "2659712601063515059": ["convolution_gpu_winograd_6x3_s1_fused",2], + "10791067159964399241": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "8965747921518186477": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "11022847760121601465": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "17839839336294937155": ["convolution_gpu_bfyx_gemm_like",1], + "3199841714087553410": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "13046322179198317310": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "15308578014507211237": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "8124736388338424498": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "14990645740260870030": ["convolution_gpu_bfyx_os_iyx_osv16",941], + "13224814158106791463": ["convolution_gpu_bfyx_gemm_like",2], + "8241070786700614317": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "6623182990939010641": ["convolution_gpu_bfyx_gemm_like",2], + "17796310681498690253": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9410978119783758141": ["convolution_gpu_bfyx_gemm_like",2], + "6635217802203685464": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "15381833359831622179": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "12680339228267704518": ["convolution_gpu_bfyx_os_iyx_osv16",540], + "8071957466247137919": ["convolution_gpu_bfyx_os_iyx_osv16",616], + "10292243973236220688": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "2423754482456771339": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17994361454416813294": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "9173631510896381179": ["convolution_gpu_bfyx_gemm_like",2], + "423221712829930726": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "397770940444464146": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "14152716242882609401": ["convolution_gpu_bfyx_gemm_like",2], + "12297371032753209816": ["convolution_gpu_bfyx_os_iyx_osv16",234], + "9263063714383940562": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "879939701282942121": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "12014527187730671229": ["convolution_gpu_bfyx_os_iyx_osv16",549], + "13248567106128518549": ["convolution_gpu_bfyx_gemm_like",2], + "14923692894655929923": ["fully_connected_gpu_bf_io_gemm",2], + "10783981060353445280": ["convolution_gpu_bfyx_os_iyx_osv16",57], + "4099859307693687554": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "11897113890115321056": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "939718260623752240": ["convolution_gpu_bfyx_gemm_like",1], + "13119479079474639169": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "1056009037551688122": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8264178890341675354": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "11754316727756881612": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "10968768803038046390": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "1081962464388501987": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13739257060165119132": ["convolution_gpu_bfyx_os_iyx_osv16",183], + "12985650543127289023": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "14974730512607138726": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "16236397968499692493": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11855070245618904113": ["convolution_gpu_bfyx_os_iyx_osv16",525], + "17829148383265978140": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "10279778381617181802": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "7688176479120305539": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7009735776703529573": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "12516911293946682547": ["convolution_gpu_bfyx_os_iyx_osv16",246], + "509781001842353609": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "9182897385081081193": ["convolution_gpu_winograd_6x3_s1_fused",2], + "761169277744593430": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "7472330881076141262": ["convolution_gpu_bfyx_gemm_like",2], + "17222005830854879661": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "3480732841490521799": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "18400379759523099542": ["convolution_gpu_bfyx_gemm_like",2], + "4085450203909854919": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "583303098958523195": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "12843671306854567956": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "4991419288164762786": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "12557015880639217508": 
["convolution_gpu_bfyx_os_iyx_osv16",335], + "12990341489637414845": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "4640028527711211109": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "16692569816843207989": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "10987953316324712538": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "9660812093766156608": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4456004887590847716": ["convolution_gpu_bfyx_gemm_like",2], + "3063055767192991776": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "13642146548740074992": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "14650567822254940018": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3930314908786112883": ["convolution_gpu_bfyx_gemm_like",2], + "11031625790234068916": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "14057348639391787117": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "9259437778054905599": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "15790005937034794347": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "4958222070605478947": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "15392077168521832549": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "14491949194619001237": ["convolution_gpu_bfyx_os_iyx_osv16",880], + "659150305191479097": ["convolution_gpu_bfyx_gemm_like",2], + "6820284286806022849": ["convolution_gpu_bfyx_gemm_like",2], + "10702234389482091891": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "10196332102593337214": ["convolution_gpu_bfyx_gemm_like",2], + "10989937450490049763": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "12517838703662330663": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "15488550074426713959": ["convolution_gpu_bfyx_os_iyx_osv16",1099], + "8133587696326295326": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "11147816119060617810": ["convolution_gpu_bfyx_os_iyx_osv16",875], + "10264913782610095832": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "7824524940405130010": ["convolution_gpu_winograd_6x3_s1_fused",2], + "5311718276151327830": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9213563311267466388": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5103094815475470596": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16352331970945217438": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "17094948685292534952": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "7975810844103449438": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "4479117540570599742": ["convolution_gpu_bfyx_gemm_like",2], + "5057534502588100071": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "6048964584602891448": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "1077773457856682663": ["convolution_gpu_bfyx_gemm_like",2], + "4992668316921598993": ["convolution_gpu_bfyx_gemm_like",2], + "16833854122884184025": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "17006655627343469372": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6458124573210430792": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "16945184617367657570": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "18133334552107213128": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "6323026044750482867": ["convolution_gpu_bfyx_gemm_like",2], + "16969463538496570528": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "12427258337646070422": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16589607587365212240": ["convolution_gpu_bfyx_gemm_like",1], + "8881150100883636392": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "10573920781439771673": ["convolution_gpu_bfyx_os_iyx_osv16",680], + "6025872155179042054": ["convolution_gpu_bfyx_gemm_like",2], + "75742659105146536": 
["convolution_gpu_bfyx_gemm_like",1], + "16711955423531846725": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "8560635685184432720": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8906588133431586825": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1051506168926530904": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "6232363902828992968": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5584432943673435454": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "6040286126398028933": ["convolution_gpu_winograd_6x3_s1_fused",2], + "5592556538784745960": ["convolution_gpu_bfyx_gemm_like",2], + "17106086048442658788": ["convolution_gpu_bfyx_gemm_like",2], + "3140230065585683313": ["convolution_gpu_bfyx_os_iyx_osv16",997], + "9404953235624894187": ["convolution_gpu_bfyx_os_iyx_osv16",845], + "5336120047683197088": ["convolution_gpu_bfyx_gemm_like",2], + "1908809004094565452": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "4865023158176874622": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "18084635102736402756": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "15026219694198820614": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10642327923162019888": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3895088069642140043": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "3277243911383750280": ["convolution_gpu_bfyx_gemm_like",1], + "1500571771538985941": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "11873734271080160669": ["convolution_gpu_bfyx_os_iyx_osv16",563], + "13816104794723484993": ["convolution_gpu_winograd_6x3_s1_fused",2], + "13189392239349392492": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "8708643228914766202": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "2651385050387738902": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "10890975553758439233": ["convolution_gpu_bfyx_gemm_like",2], + "10728212277329722684": ["convolution_gpu_bfyx_gemm_like",2], + "1036010477232750453": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "9604982746455852556": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "12090536142661253835": ["fully_connected_gpu_bf_io_gemm",1], + "4437258459981739942": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "13312514874803986753": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "13583166868754499339": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "6263019986730305851": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "10384537928514123040": ["convolution_gpu_bfyx_gemm_like",2], + "5688478347124565305": ["convolution_gpu_bfyx_gemm_like",1], + "12809199739984715013": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "17423645390621980919": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "1616603916015535857": ["fully_connected_gpu_bf_io_input_spatial",1], + "15947699374684516369": ["convolution_gpu_bfyx_gemm_like",2], + "4073467095502162430": ["convolution_gpu_bfyx_gemm_like",1], + "4465781406991476376": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "12672995204641007004": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14559308665571750465": ["convolution_gpu_bfyx_gemm_like",2], + "15466940145773097237": ["convolution_gpu_bfyx_gemm_like",1], + "17790026124881397912": ["fully_connected_gpu_yxfb_ref",2], + "9389555743403158574": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16336482874764861478": ["convolution_gpu_bfyx_gemm_like",2], + "18136135457402651842": ["convolution_gpu_winograd_6x3_s1_fused",2], + "15962137123591591534": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "16025442470600124062": ["convolution_gpu_bfyx_gemm_like",2], + "13436376034548670107": ["convolution_gpu_bfyx_os_iyx_osv16",373], + 
"17281202179589913619": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "10109431802089940590": ["convolution_gpu_bfyx_os_iyx_osv16",1001], + "16833026567865627676": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "11716771904412649891": ["convolution_gpu_bfyx_os_iyx_osv16",435], + "16998508915819714690": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "4660288622381620227": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "17316626950179740845": ["convolution_gpu_bfyx_os_iyx_osv16",948], + "878892264408839067": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "4897991181236908768": ["convolution_gpu_bfyx_gemm_like",2], + "11113256687741667688": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "11956435900037329302": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "6233612563637601101": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "18232278892738147217": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "3272017687600371031": ["convolution_gpu_bfyx_gemm_like",2], + "13890118723041457532": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "5953754321266570854": ["convolution_gpu_bfyx_gemm_like",1], + "15118142492742177336": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "10302338806536775954": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "13990028451169604107": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12956726277674279950": ["convolution_gpu_bfyx_os_iyx_osv16",807], + "4999505377862312410": ["fully_connected_gpu_bf_io_gemm",2], + "5524218746051008792": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "14670068483447729857": ["convolution_gpu_winograd_6x3_s1_fused",2], + "8707189142909022305": ["convolution_gpu_bfyx_os_iyx_osv16",986], + "14532844474906286088": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "913496537924971856": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "10492056481694320580": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "7076937538747704750": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "2669822154816760632": ["convolution_gpu_bfyx_os_iyx_osv16",620], + "8107447526839063293": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "2044363708106765326": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17808913959977434594": ["convolution_gpu_bfyx_gemm_like",1], + "8762901342272872498": ["convolution_gpu_bfyx_os_iyx_osv16",58], + "17658152048177750315": ["convolution_gpu_bfyx_os_iyx_osv16",737], + "13853056718266488510": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "13300022131572486202": ["convolution_gpu_bfyx_gemm_like",2], + "16403423801823379909": ["convolution_gpu_bfyx_os_iyx_osv16",94], + "7009873605945341897": ["convolution_gpu_bfyx_gemm_like",2], + "9191832520273617003": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "5079055505117153635": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "11810221946429451169": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "9810904714798127155": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4086556132337751931": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "13059207969254830451": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "17343050785312683560": ["convolution_gpu_bfyx_gemm_like",2], + "13723543003759101485": ["convolution_gpu_bfyx_gemm_like",2], + "6854611304056079417": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "14281201038135286621": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "12860222041026638681": ["convolution_gpu_bfyx_gemm_like",2], + "14363654136811880073": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8176012042686275874": ["convolution_gpu_bfyx_os_iyx_osv16",265], + "3304589333915676807": ["convolution_gpu_bfyx_gemm_like",2], + "13464226348405628455": 
["convolution_gpu_bfyx_os_iyx_osv16",625], + "3432296808755992670": ["convolution_gpu_bfyx_gemm_like",2], + "3782315919331102574": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11706378390483804857": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "4026686872534942904": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "15322019609805777935": ["convolution_gpu_bfyx_gemm_like",2], + "17881905640473324965": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9824678205469832038": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "1316444335300814745": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "6391847213494189692": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11609821372586026178": ["convolution_gpu_bfyx_gemm_like",2], + "852092858392507925": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "16108573960501496757": ["convolution_gpu_bfyx_gemm_like",2], + "13713406612642090169": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "8153567933591966877": ["convolution_gpu_bfyx_gemm_like",1], + "2732519635571994212": ["convolution_gpu_bfyx_gemm_like",2], + "4161001033681779582": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "2909728331855309274": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "17854578307286932628": ["convolution_gpu_bfyx_gemm_like",2], + "15511138074959300404": ["convolution_gpu_bfyx_gemm_like",2], + "3430266954211750407": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "9642229389394495047": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "18140951659547259039": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "6711878663358611849": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "6149673627320838019": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "13951717514084457087": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "12232696287029987946": ["convolution_gpu_bfyx_os_iyx_osv16",165], + "8061914949376516780": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "3526580286148537369": ["convolution_gpu_bfyx_os_iyx_osv16",1098], + "5912451559447635837": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "10025839973092358719": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "12214162812589030126": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "17050675313067213312": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "6329618009202266591": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1838534101161814609": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4806571630436601566": ["fully_connected_gpu_bf_io_input_spatial",0], + "10178951466584845110": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "13139625572508441980": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "17208186152576814861": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "16266491618150971928": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "18173314625562011976": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17024388383581997032": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "5353552956675518468": ["convolution_gpu_bfyx_os_iyx_osv16",84], + "13698389420396031586": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "2710485608298356329": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5485749317130402302": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "13468713306678453952": ["convolution_gpu_bfyx_direct_10_12_16",2], + "360872770877634346": ["convolution_gpu_bfyx_gemm_like",2], + "2242915551775617989": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "5749536453225343663": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "2497756607567197523": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "4355933224673863178": ["convolution_gpu_bfyx_gemm_like",2], + "3438116423688595487": 
["convolution_gpu_bfyx_os_iyx_osv16",629], + "3682813162987778705": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "13709111882513486557": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "17556238490521153146": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "272730229972987861": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "6642767323474835034": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "11284755586130392759": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "1361159591875955678": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "5763440554939527411": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "8295126647635181949": ["convolution_gpu_bfyx_gemm_like",2], + "8306337702797456793": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "5115661026367632863": ["convolution_gpu_bfyx_os_iyx_osv16",765], + "17001502418583498926": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "2572395498687401679": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "2052712465925238009": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "17638692805430115529": ["convolution_gpu_bfyx_gemm_like",2], + "4085907608404305515": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "2727219457659794468": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12773693193167844110": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "14630499010941056793": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "16744011463988595802": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6942049339361951275": ["fully_connected_gpu_bf_io_input_spatial",2], + "9275303306340702111": ["convolution_gpu_bfyx_gemm_like",2], + "3033264172690274208": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "13102754309439605192": ["convolution_gpu_bfyx_gemm_like",2], + "14695781272831602408": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "9798585825695496550": ["convolution_gpu_bfyx_gemm_like",2], + "11443268857010762276": ["convolution_gpu_bfyx_os_iyx_osv16",235], + "4674416595144505741": ["convolution_gpu_bfyx_gemm_like",2], + "18269685060032395235": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "8751016391945753900": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13237050834496100264": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "9522661528867955338": ["convolution_gpu_bfyx_gemm_like",2], + "12696412964119109465": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "388828310152538138": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "8656468860180713379": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "4867937397499803072": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "18377298651236993830": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "16108759090923335184": ["convolution_gpu_bfyx_gemm_like",2], + "6578908625437515675": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "5504757952698692953": ["convolution_gpu_bfyx_os_iyx_osv16",658], + "11461581290174106570": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "4112696777811320312": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "17015791782274123780": ["convolution_gpu_bfyx_gemm_like",1], + "11465965972527519631": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17977676737774695825": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "4163359403543480821": ["fully_connected_gpu_bf_io_input_spatial",2], + "13328449155966085543": ["convolution_gpu_bfyx_gemm_like",2], + "5390559917122707732": ["convolution_gpu_bfyx_os_iyx_osv16",57], + "153117141968471446": ["convolution_gpu_bfyx_gemm_like",1], + "9257078583742821465": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "10084794570892043447": ["convolution_gpu_bfyx_gemm_like",2], + "6171845068913882721": 
["convolution_gpu_bfyx_os_iyx_osv16",577], + "16076153317792960383": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12962552332511702682": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "10000618285883395700": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "18172711677056449158": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "6062246008880097669": ["fully_connected_gpu_bf_io_input_spatial",1], + "178353385245384751": ["convolution_gpu_bfyx_gemm_like",2], + "13192808619929896995": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "2451712485584835395": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12523676912856063091": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8818070832398055086": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "16131448347558322280": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "13054405729329143152": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "3138374672801504481": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "13702692566238948173": ["convolution_gpu_bfyx_gemm_like",1], + "17301887391757619741": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "12782191856884962803": ["convolution_gpu_bfyx_gemm_like",2], + "4670443882075998209": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "17515573322312447679": ["convolution_gpu_bfyx_gemm_like",2], + "2968031010495399536": ["convolution_gpu_bfyx_gemm_like",2], + "10323345824599612614": ["convolution_gpu_bfyx_gemm_like",2], + "10555597973766215754": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "598390166442977699": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "5659168916726488798": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "14811022197918391667": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6882621854468565774": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "6146876760962332928": ["convolution_gpu_bfyx_gemm_like",2], + "16779678846332091086": ["convolution_gpu_bfyx_os_iyx_osv16",194], + "1152693503778768433": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "9594594523961285945": ["convolution_gpu_bfyx_os_iyx_osv16",927], + "13115589642140732066": ["convolution_gpu_bfyx_gemm_like",1], + "6093575518270471235": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "16426179645101678763": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "5419041493176804960": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "9192665896782282996": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "18233660940545931789": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "16129296588866116913": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "8108843303778211282": ["convolution_gpu_bfyx_gemm_like",2], + "3892679716763161057": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6644418194983229139": ["convolution_gpu_bfyx_gemm_like",2], + "11086699387784339943": ["convolution_gpu_bfyx_gemm_like",2], + "7843498978148810586": ["convolution_gpu_bfyx_gemm_like",2], + "3007637520820789085": ["convolution_gpu_bfyx_os_iyx_osv16",641], + "16865879032845300007": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "10482582307328548806": ["convolution_gpu_bfyx_os_iyx_osv16",379], + "14841539539334726292": ["convolution_gpu_bfyx_os_iyx_osv16",171], + "13713501506522022845": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11992353959766718397": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "5522698342845820411": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "6688522645556262131": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "4818231379191523896": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "1364546124782880196": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "2440366541074371090": 
["convolution_gpu_bfyx_os_iyx_osv16",839], + "15485701086886851362": ["convolution_gpu_bfyx_os_iyx_osv16",923], + "54019631544204590": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "15847413004526420496": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "4366168099274266975": ["convolution_gpu_bfyx_gemm_like",1], + "4424217045094988504": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "13951781924205611716": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "11582534256623549131": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "1832310305089212990": ["convolution_gpu_bfyx_os_iyx_osv16",362], + "11599932445375240727": ["convolution_gpu_bfyx_gemm_like",2], + "13459514533473657102": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "425744529089575241": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "7024495439434892956": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "5415319660821122528": ["fully_connected_gpu_bf_io_input_spatial",1], + "3653156933813711765": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "5740738339752793113": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12545558125736154584": ["convolution_gpu_bfyx_os_iyx_osv16",571], + "805221045541170643": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "8006738296385794413": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "17377315194963069204": ["fully_connected_gpu_fb_oi_ref",0], + "276407276027553756": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16720108310653948550": ["convolution_gpu_winograd_6x3_s1_fused",2], + "1819720745131968914": ["convolution_gpu_bfyx_gemm_like",2], + "7264756313770306662": ["convolution_gpu_bfyx_gemm_like",2], + "16467987800266816984": ["convolution_gpu_bfyx_os_iyx_osv16",216], + "9529614587861271730": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7130694811424715594": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "7869916853707978306": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "11636129433022017868": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "713121569924250372": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "5334566325056222430": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "1138439260035360722": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "953306082374100275": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "14431607479949498164": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "6830387121684699972": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "6355395905401306995": ["convolution_gpu_bfyx_gemm_like",2], + "2128376438627103433": ["convolution_gpu_bfyx_gemm_like",2], + "1116274074896622552": ["convolution_gpu_bfyx_os_iyx_osv16",240], + "11169292427557543138": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "12935563359569230797": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "13761566845514364807": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "11723735945517472199": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "17847109385592002207": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "4957638663977636791": ["convolution_gpu_bfyx_gemm_like",2], + "6133592828563353516": ["convolution_gpu_bfyx_os_iyx_osv16",191], + "7311120574972466702": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "16053585286807864356": ["convolution_gpu_bfyx_gemm_like",2], + "846088275031979661": ["convolution_gpu_winograd_6x3_s1_fused",2], + "5495063314176654751": ["convolution_gpu_bfyx_gemm_like",2], + "7962991673727743706": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "13200151444914751729": ["convolution_gpu_bfyx_os_iyx_osv16",547], + "16370218798911151331": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "13538051178827008933": ["convolution_gpu_bfyx_direct_10_12_16",2], 
+ "18077281411861416889": ["convolution_gpu_bfyx_gemm_like",1], + "5448537627319798272": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "282274448389888221": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "18203935818408469865": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "522181557896569275": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "11052732052072367261": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17154337492545826355": ["convolution_gpu_bfyx_os_iyx_osv16",1001], + "5843679089588930933": ["convolution_gpu_bfyx_os_iyx_osv16",146], + "17877776363798202236": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5381578460674280089": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10320711719466983961": ["convolution_gpu_bfyx_gemm_like",2], + "3491333679577961640": ["convolution_gpu_bfyx_gemm_like",2], + "15490478608105402679": ["convolution_gpu_bfyx_gemm_like",2], + "15123868617509445149": ["convolution_gpu_winograd_6x3_s1_fused",2], + "17775705003104146872": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "16789135236017252073": ["convolution_gpu_bfyx_gemm_like",2], + "10509933181132310969": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16955653765071712611": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5275016494706355806": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8860815977851486767": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "861419637283812778": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "14353390922580547467": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "9028970753877215614": ["convolution_gpu_bfyx_os_iyx_osv16",853], + "6341197991729122563": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "12707946849050970702": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "158222105675022402": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "17833517350994024381": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "6681818065741882453": ["convolution_gpu_bfyx_gemm_like",2], + "7727001441358508665": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "15579919505002150556": ["convolution_gpu_bfyx_gemm_like",2], + "11077503608116183709": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "12247991248100147706": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "7875724726741958520": ["fully_connected_gpu_fb_oi_ref",1], + "9513032457323269513": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "288853243482418538": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18062849937960759210": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "16863960779539003201": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "16491532291908469567": ["convolution_gpu_bfyx_gemm_like",1], + "5179760459095053114": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "1155389358857780776": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "12365282242489300092": ["convolution_gpu_bfyx_os_iyx_osv16",45], + "15619086801947147359": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "14749947225382670869": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "4800587664660105589": ["fully_connected_gpu_bf_io_input_spatial",0], + "17382660912493284320": ["convolution_gpu_bfyx_direct_10_12_16",0], + "9100044555742394133": ["convolution_gpu_bfyx_gemm_like",1], + "4488336106517889531": ["convolution_gpu_bfyx_os_iyx_osv16",80], + "3003526572122876385": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "2525260242689556544": ["convolution_gpu_bfyx_gemm_like",2], + "17465517455679097501": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "10892706534058849825": ["convolution_gpu_bfyx_os_iyx_osv16",288], + "3218248162832023196": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "13404888565084206853": 
["convolution_gpu_bfyx_os_iyx_osv16",697], + "15126660425728872065": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7025975403069487257": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "1471837664358450291": ["convolution_gpu_bfyx_gemm_like",2], + "2438374917504708831": ["convolution_gpu_bfyx_gemm_like",1], + "52089503050497755": ["convolution_gpu_bfyx_gemm_like",2], + "2597453794298356435": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "17525531790109748810": ["convolution_gpu_bfyx_os_iyx_osv16",85], + "2915165824085219545": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "17050143605017295447": ["convolution_gpu_bfyx_gemm_like",2], + "5047419871737940985": ["convolution_gpu_bfyx_direct_10_12_16",2], + "751912075185318190": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "16362139250976572928": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "3831261590121101287": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "8961138963663532667": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "4014667229872705228": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12796777049340516563": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11254635684957519432": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "156456996459945842": ["convolution_gpu_bfyx_os_iyx_osv16",435], + "8881135571874888085": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "14133958262039763609": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "17243648226968859637": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "11185156002426041243": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "9354818521586974021": ["convolution_gpu_bfyx_gemm_like",2], + "14878347463243157447": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "5115134711994944288": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "3934290309368153435": ["fully_connected_gpu_bf_io_gemm",2], + "1473214668483422172": ["convolution_gpu_bfyx_gemm_like",2], + "2502125887857336825": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "15669490019428002270": ["convolution_gpu_bfyx_os_iyx_osv16",1090], + "1334070221835422461": ["convolution_gpu_bfyx_gemm_like",2], + "1996860183441418841": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14599780481362761532": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5831419373611158773": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "5141753233513623264": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "15217183882858251099": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "2809950092498355574": ["convolution_gpu_bfyx_os_iyx_osv16",1056], + "10436819182310112786": ["convolution_gpu_bfyx_os_iyx_osv16",807], + "13497225521878034159": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14038261392627717712": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "4135003545872878882": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "18273537339378756543": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "6067904130482758510": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "5170245731599664670": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "12022152681602871455": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "5801429077171542466": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "11686670048744589243": ["convolution_gpu_bfyx_gemm_like",2], + "14025496192869856801": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "10880081193716628051": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "15767973630744679517": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2862999234347597091": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "8258382025812748961": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13954144830230671601": 
["convolution_gpu_bfyx_os_iyx_osv16",328], + "14763982961176216679": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "3974589991022739479": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "8618835732380720921": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3599823735065658574": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "14484890926084856480": ["convolution_gpu_bfyx_gemm_like",1], + "7565221050911842393": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "5848293219267886434": ["convolution_gpu_bfyx_os_iyx_osv16",536], + "9321208819255762521": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "3192332625020432602": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "2040762223425679479": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "1939140810847988694": ["convolution_gpu_bfyx_gemm_like",2], + "6614833247756539341": ["convolution_gpu_bfyx_os_iyx_osv16",54], + "16103943009195163681": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "13320828013530046693": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "13613399861925108148": ["convolution_gpu_bfyx_os_iyx_osv16",123], + "15628121900226431719": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5608133987357542077": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "15884763176333003771": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "9882204352209412039": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "2622434279674583815": ["convolution_gpu_bfyx_gemm_like",1], + "2065752819810364738": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "14420809655798184553": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "789359733867650915": ["convolution_gpu_bfyx_gemm_like",1], + "2100891581797371600": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "11311859068168414878": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "4703107905652287491": ["convolution_gpu_bfyx_gemm_like",2], + "15378025640603637387": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "15192024816519005250": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "16683485007140805060": ["fully_connected_gpu_yxfb_ref",2], + "16541722316343690197": ["convolution_gpu_bfyx_os_iyx_osv16",504], + "7179714714302073459": ["convolution_gpu_bfyx_os_iyx_osv16",458], + "16430562172386510259": ["convolution_gpu_bfyx_gemm_like",2], + "10294185397756053636": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "14068780861332616363": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "10414903047695486119": ["convolution_gpu_bfyx_os_iyx_osv16",680], + "13800387305792597325": ["convolution_gpu_bfyx_os_iyx_osv16",1035], + "6129602738379919488": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6438522646185979880": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14077148976508649021": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12081835728078383819": ["fully_connected_gpu_bf_io_input_spatial",1], + "5524215233998361104": ["convolution_gpu_winograd_6x3_s1_fused",2], + "10647227605517025377": ["convolution_gpu_bfyx_os_iyx_osv16",508], + "18005721959893562716": ["convolution_gpu_bfyx_os_iyx_osv16",171], + "1545105800386716684": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "16582132711225619740": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "17599383258252980421": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "3392693938352572136": ["convolution_gpu_bfyx_gemm_like",2], + "13410850301164057911": ["convolution_gpu_bfyx_gemm_like",1], + "15993427814066246646": ["convolution_gpu_bfyx_gemm_like",2], + "6181308879301978465": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "16758697697363920520": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "2969389503332309296": 
["convolution_gpu_bfyx_os_iyx_osv16",134], + "2893564501191050837": ["convolution_gpu_bfyx_gemm_like",1], + "5673972310424776040": ["convolution_gpu_bfyx_gemm_like",2], + "2986189945936592561": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "5629670679897666607": ["convolution_gpu_bfyx_os_iyx_osv16",1028], + "5656623709782744241": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "16761856644242716357": ["convolution_gpu_bfyx_os_iyx_osv16",843], + "7000486794832106857": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "18137106379929135901": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "11814740669468421049": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "641417817126876622": ["convolution_gpu_bfyx_gemm_like",2], + "12190841837604350271": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "13814086981499638596": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "16547425454653232058": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1663285216972929652": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "12812685418923919055": ["convolution_gpu_bfyx_os_iyx_osv16",11], + "4082229510324076196": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15650839696475698676": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "1594612401422787491": ["convolution_gpu_bfyx_gemm_like",2], + "5597908143491399643": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "13540002981450186147": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "12098146032672599222": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "12976499206227689731": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "6418327009347170687": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "16067605128297748820": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16036386660666696362": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "291868903926685441": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "4499586349553581439": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "5637480705139132901": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "17089801601582809764": ["convolution_gpu_bfyx_gemm_like",1], + "4474697990228400564": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13681462437496627948": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "10556089809203693400": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "1882052795393187384": ["convolution_gpu_bfyx_gemm_like",1], + "6471563320494376693": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "6904130543085920483": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13439896617880328331": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "18043340998699622388": ["convolution_gpu_bfyx_gemm_like",2], + "1653274345637156919": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "941626985322260281": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "54975980454651672": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "4553409514380460123": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "4750894407873652809": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "11459784003592366395": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12085348936192462321": ["convolution_gpu_bfyx_gemm_like",1], + "18268811652302076976": ["convolution_gpu_bfyx_gemm_like",2], + "4161612746310931789": ["convolution_gpu_bfyx_gemm_like",2], + "1701609125136907870": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10967218651864700933": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "12421707187947291166": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "12040626513219974957": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "11559360678008060513": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "706049518431331645": 
["convolution_gpu_bfyx_gemm_like",2], + "2294026590516781945": ["convolution_gpu_bfyx_gemm_like",2], + "13176385389367548697": ["convolution_gpu_bfyx_gemm_like",2], + "244921290040927639": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "12617625046664709483": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "11149782181562145291": ["convolution_gpu_bfyx_gemm_like",2], + "16230621843665445228": ["convolution_gpu_bfyx_gemm_like",2], + "16461809076899645037": ["convolution_gpu_bfyx_os_iyx_osv16",9], + "5933743119393822386": ["convolution_gpu_bfyx_gemm_like",1], + "2171768477223405739": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "13455881643467418059": ["convolution_gpu_bfyx_gemm_like",1], + "7351401242363888463": ["convolution_gpu_bfyx_gemm_like",2], + "6087091876057515304": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9133263538092913983": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13933912937625580405": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "2597523728660247862": ["convolution_gpu_bfyx_os_iyx_osv16",538], + "8788703258318141635": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "8140094412609934765": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "12668149981216388765": ["convolution_gpu_bfyx_os_iyx_osv16",54], + "3349519148124496343": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "6443517114667332732": ["convolution_gpu_bfyx_os_iyx_osv16",548], + "13898284586432291433": ["convolution_gpu_bfyx_gemm_like",2], + "3215659303601163167": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "12278364834477923930": ["convolution_gpu_bfyx_gemm_like",2], + "9840495023131952174": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12971822824884826169": ["convolution_gpu_bfyx_gemm_like",2], + "15287650965861631130": ["convolution_gpu_bfyx_gemm_like",2], + "16172528828198474326": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11913020016435860608": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "17700958439420868719": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "9438739171104456179": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "8990561333549136048": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "6075691042233712335": ["convolution_gpu_bfyx_gemm_like",2], + "11951606039079763598": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2937907409658060025": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "3102816736961785641": ["convolution_gpu_bfyx_os_iyx_osv16",997], + "15962533525948221648": ["convolution_gpu_bfyx_os_iyx_osv16",1024], + "1451466106918423837": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "13607830451968188080": ["convolution_gpu_bfyx_os_iyx_osv16",172], + "1774158624592967937": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "16208488491972128275": ["convolution_gpu_bfyx_os_iyx_osv16",809], + "14224121742920800990": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "5912303851874077576": ["convolution_gpu_bfyx_gemm_like",2], + "13972357557211413688": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "14230385851791760020": ["convolution_gpu_bfyx_os_iyx_osv16",58], + "9274179337770060652": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "10085059621136526248": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "4569338575782832784": ["convolution_gpu_bfyx_gemm_like",1], + "15163327502374403643": ["convolution_gpu_bfyx_os_iyx_osv16",175], + "2526832080529662683": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "8141428150264829362": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "3032921857841371728": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16706121580364790904": ["convolution_gpu_bfyx_gemm_like",2], + 
"10591379189397010097": ["convolution_gpu_bfyx_os_iyx_osv16",613], + "6942622405269419082": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "3750338655074082587": ["fully_connected_gpu_fb_io_ref",1], + "7777333052643961206": ["convolution_gpu_bfyx_os_iyx_osv16",1028], + "5159470523468873105": ["convolution_gpu_bfyx_os_iyx_osv16",175], + "9216608098626790565": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "17856816245251319111": ["convolution_gpu_bfyx_os_iyx_osv16",939], + "10295330953350618042": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "5219399418946822456": ["convolution_gpu_bfyx_gemm_like",2], + "3120553928584920777": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "18012549942299450620": ["convolution_gpu_bfyx_gemm_like",2], + "2140514316203117958": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3001615302961701154": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1028160614515220430": ["convolution_gpu_bfyx_os_iyx_osv16",111], + "8025053805734757314": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2251029128552117936": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14104238386345631681": ["convolution_gpu_winograd_6x3_s1_fused",2], + "10670103699537731664": ["convolution_gpu_bfyx_os_iyx_osv16",235], + "14711697456265712456": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "5622089373755094139": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2730604806511016352": ["convolution_gpu_bfyx_os_iyx_osv16",524], + "15640202505592598653": ["convolution_gpu_bfyx_gemm_like",2], + "14046114605615338907": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "14103112843209793966": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "16853250891250756537": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "5132761922124425835": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "13865227850818392065": ["convolution_gpu_bfyx_os_iyx_osv16",429], + "18092842590142527927": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "18423051691107460439": ["convolution_gpu_bfyx_os_iyx_osv16",713], + "6982733543386888622": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "11883485911218628865": ["convolution_gpu_bfyx_os_iyx_osv16",1041], + "14335074487552883436": ["convolution_gpu_bfyx_gemm_like",2], + "2940027113687311893": ["convolution_gpu_bfyx_gemm_like",2], + "3217246278485567748": ["convolution_gpu_bfyx_gemm_like",2], + "8739347545059610410": ["convolution_gpu_bfyx_gemm_like",2], + "17824431042110985323": ["convolution_gpu_bfyx_gemm_like",1], + "13093429681061786539": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "10424278617647597641": ["convolution_gpu_bfyx_gemm_like",2], + "1652781065871883392": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9988801796928462423": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "5221320470007950766": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "13184662326021747000": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "12069726772532946193": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "10046663998164493552": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2571882179292959757": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17309326904418811234": ["convolution_gpu_bfyx_os_iyx_osv16",948], + "17791773192152464021": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17427036330773218054": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "7918742312252115870": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "2797723586312707948": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "4505008254511324231": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "5609922876429907954": ["convolution_gpu_bfyx_gemm_like",2], + "14546281065004619074": 
["convolution_gpu_bfyx_os_iyx_osv16",576], + "16043683538361975370": ["convolution_gpu_bfyx_gemm_like",2], + "822162932339827810": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "5805383505505929391": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "18432787283148809023": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17832542092610191859": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "1706927777850488363": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "8709632541892447149": ["convolution_gpu_bfyx_gemm_like",2], + "14381420852659789698": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14792528369891965810": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16489624657475712467": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "11709992724966310174": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "17542414935564676110": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "8253823502854784432": ["convolution_gpu_bfyx_gemm_like",2], + "8268533335852735248": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "4795705973706796563": ["fully_connected_gpu_bf_io_input_spatial",1], + "9525853014023664813": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "5319668297345215520": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "4229105529069729944": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14686278683380845546": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "4544242784357021697": ["convolution_gpu_bfyx_gemm_like",2], + "3087801652564627458": ["convolution_gpu_bfyx_os_iyx_osv16",60], + "12218337369633748663": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "2857337999074313592": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "9928406318940388716": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "17596685300497748803": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17035903590837750750": ["convolution_gpu_bfyx_direct_10_12_16",2], + "89439319782574517": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "318377908569897093": ["convolution_gpu_bfyx_gemm_like",2], + "12011982029561277581": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "7941729567451949422": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "8728178019712933221": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "6290317420155851465": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "15702382940521972117": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "5240181393417899912": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "16245760498096322525": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "3067806959725855130": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "3633858263279042265": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "6491244517639245276": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "2096779676054335057": ["convolution_gpu_bfyx_gemm_like",2], + "18174857480705846286": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "6302958994152837045": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "7915318733663535312": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "7104756264011682902": ["convolution_gpu_bfyx_gemm_like",2], + "9707630588260222630": ["convolution_gpu_bfyx_gemm_like",2], + "4282198629458668761": ["convolution_gpu_bfyx_gemm_like",2], + "12309132521191764927": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10157866834809927320": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "12609361477548272638": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "17522452942286240233": ["convolution_gpu_bfyx_gemm_like",2], + "14994322266840011040": ["convolution_gpu_bfyx_gemm_like",2], + "2261453441277654139": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "1573498199681662714": 
["convolution_gpu_bfyx_os_iyx_osv16",1088], + "17542176922797334839": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7744787957569714828": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14733510474010040334": ["convolution_gpu_bfyx_gemm_like",2], + "10408322429232132983": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "14088382963493477342": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "12515465135362865565": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "3796274347773622633": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10679760989906275129": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "14719421757340260468": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "6181651715051152713": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "8701248964531180496": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "14691372262153587653": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "5120466856097219243": ["convolution_gpu_bfyx_gemm_like",2], + "9481675228591993785": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "11682323163346544125": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "17006133396401462698": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "16609136488331186895": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4265693151382066296": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "1474271081523145413": ["convolution_gpu_bfyx_gemm_like",2], + "8300655194765375060": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "7875272450497189442": ["convolution_gpu_bfyx_os_iyx_osv16",58], + "15470013032930986062": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "1540041682425757361": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "13708979487306970634": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "17912189681971987483": ["convolution_gpu_bfyx_gemm_like",2], + "15641537661939240413": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5219048275475447369": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "10690972785852373520": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "2355214244972870639": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "6008613375871089139": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "16446533347502650316": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "16037141448095945650": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "2281119269283845320": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "8929453032482114162": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "12892693137085610062": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "6631816968511312100": ["convolution_gpu_bfyx_os_iyx_osv16",510], + "17193614571243427089": ["convolution_gpu_bfyx_gemm_like",1], + "8843585527713905568": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "9213886570531053949": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "7881187047171099732": ["convolution_gpu_bfyx_gemm_like",2], + "8127190765748950828": ["convolution_gpu_bfyx_os_iyx_osv16",265], + "5419775002149092646": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15997754881872769378": ["convolution_gpu_bfyx_gemm_like",2], + "700717277178942679": ["convolution_gpu_bfyx_gemm_like",2], + "14553856088069405595": ["convolution_gpu_bfyx_os_iyx_osv16",360], + "8616686489737649890": ["convolution_gpu_bfyx_os_iyx_osv16",942], + "7807983899017500046": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "2038505773698938555": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "3889519976910355277": ["fully_connected_gpu_bf_io_input_spatial",1], + "17825280904760131680": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7370273921473161914": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "16559140502701231107": 
["convolution_gpu_bfyx_direct_10_12_16",1], + "11473442921040533207": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "11215217005872946038": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "3706994659266083979": ["convolution_gpu_bfyx_os_iyx_osv16",547], + "3797957937905580811": ["convolution_gpu_bfyx_os_iyx_osv16",987], + "482564204402769504": ["convolution_gpu_bfyx_gemm_like",2], + "1089679781525023551": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2307310127637739872": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "12793908914872030220": ["convolution_gpu_bfyx_gemm_like",1], + "7272538316511343863": ["convolution_gpu_bfyx_gemm_like",2], + "4642234334824303290": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "15882969506682501496": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "16884396694505987920": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "17713034180977313726": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11670430946096342056": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "17421991623849671076": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7903891232234389925": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14907097142953816744": ["convolution_gpu_bfyx_gemm_like",2], + "17599396373608265826": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "4142978475842207311": ["convolution_gpu_bfyx_gemm_like",2], + "2721793280965260548": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "12245096462203481681": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "12988961529988078346": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "7430073011895298582": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2842103889477438816": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "17104611871050967957": ["convolution_gpu_winograd_6x3_s1_fused",2], + "706370730287471796": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "879896719155824868": ["convolution_gpu_bfyx_gemm_like",2], + "5157249499936659040": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "40704767167309552": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "15494543914974994991": ["convolution_gpu_bfyx_gemm_like",1], + "13489318651148001664": ["convolution_gpu_bfyx_gemm_like",2], + "1760690277175249985": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7000524935770116969": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "17310332946322628458": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13191096881934434519": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "13855438905855887272": ["convolution_gpu_bfyx_os_iyx_osv16",511], + "12478309735214802531": ["convolution_gpu_bfyx_os_iyx_osv16",567], + "2235210915304938149": ["convolution_gpu_bfyx_gemm_like",2], + "9373353053843326128": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "13654816209891478730": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "11207257238719531888": ["convolution_gpu_bfyx_gemm_like",2], + "7808544677773370430": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12134712464763856064": ["convolution_gpu_winograd_6x3_s1_fused",2], + "18136765667969393174": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13546876216568825877": ["convolution_gpu_bfyx_os_iyx_osv16",622], + "8434794604559592624": ["convolution_gpu_bfyx_gemm_like",1], + "14046990030104971367": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "3499109651698979012": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "16431165572426232677": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "18251360413872841969": ["convolution_gpu_bfyx_os_iyx_osv16",1062], + "1103228955716492167": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "3336303478756453360": 
["convolution_gpu_bfyx_gemm_like",2], + "994489782629179836": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "8207349115037232863": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "10267260789603562117": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "12129572274423886770": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "481328129206881674": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "1561225943337590599": ["convolution_gpu_bfyx_os_iyx_osv16",506], + "8100595788531468781": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "4232250144427804891": ["fully_connected_gpu_bf_io_input_spatial",2], + "192209423643075326": ["convolution_gpu_bfyx_gemm_like",2], + "2534408579674556441": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8203550467004532364": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "5796500397424307442": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "9462315044265139531": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5643908654122573882": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4840004190985490064": ["convolution_gpu_bfyx_gemm_like",2], + "2738256633362038820": ["convolution_gpu_bfyx_gemm_like",2], + "10118395047539851751": ["convolution_gpu_bfyx_gemm_like",2], + "792684262493086891": ["convolution_gpu_bfyx_gemm_like",1], + "13131740479277027362": ["fully_connected_gpu_bf_io_gemm",2], + "16896833230469488924": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "13477416097954638887": ["fully_connected_gpu_bf_io_gemm",1], + "12952980509662451384": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "4764776977138392550": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "13753473508578037346": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "3012566432840424198": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "14759179293743468995": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "10869005786136023160": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "18386376129938707290": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11327097771110264965": ["convolution_gpu_bfyx_os_iyx_osv16",549], + "8916983923551808409": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "15331103261044247142": ["convolution_gpu_bfyx_os_iyx_osv16",940], + "10309504812060596568": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "2490155559809645659": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "13603318842632052764": ["convolution_gpu_bfyx_os_iyx_osv16",390], + "1059505639883914386": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3106710091841093202": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "17802514063213000148": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "7958459862276998225": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "4747159205186229582": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "4196367396954155354": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15897477855246170861": ["convolution_gpu_bfyx_gemm_like",2], + "1999979442136861875": ["convolution_gpu_bfyx_os_iyx_osv16",59], + "13644681270630373984": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "17723621158215826108": ["convolution_gpu_bfyx_gemm_like",2], + "8501145642605270365": ["convolution_gpu_bfyx_gemm_like",2], + "6324565723045697080": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "4220826666482500445": ["convolution_gpu_bfyx_gemm_like",2], + "989564341557094953": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "5927467766675317093": ["fully_connected_gpu_bf_io_input_spatial",2], + "7545013298074733778": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "16947969669087411530": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "1107027047188366075": ["convolution_gpu_bfyx_os_iyx_osv16",796], + 
"4716188972902735458": ["convolution_gpu_bfyx_gemm_like",2], + "2007192658799516915": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "11772741918108731396": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "17392594284473856393": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12461575861709234385": ["convolution_gpu_bfyx_gemm_like",2], + "4766071144928072260": ["convolution_gpu_bfyx_gemm_like",1], + "4561874206785244358": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "4673127824919879657": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3332334993503432420": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "5893940382830835820": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11198908896401597838": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "6419580456182610836": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "18384657372655350144": ["convolution_gpu_bfyx_gemm_like",2], + "18150429561058646714": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "10917498758625273194": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "2345023488044002149": ["convolution_gpu_bfyx_gemm_like",1], + "15778834188130183853": ["convolution_gpu_bfyx_os_iyx_osv16",548], + "2638131332283395057": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7575634241190730697": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "4282661608732125403": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "14100870590396726248": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "6634330132674952638": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "2133849627845285277": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "2653651564133701304": ["convolution_gpu_bfyx_os_iyx_osv16",1098], + "13071545223094862275": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "3062101811226530720": ["convolution_gpu_bfyx_os_iyx_osv16",677], + "7211355951470869591": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12644942072153919043": ["convolution_gpu_bfyx_direct_10_12_16",2], + "577182964135927041": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "9868561386826862471": ["convolution_gpu_winograd_6x3_s1_fused",2], + "8479958930889587809": ["fully_connected_gpu_yxfb_ref",2], + "8578747191812631883": ["convolution_gpu_bfyx_gemm_like",2], + "13283842370311517843": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "10900880512948479338": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "949330876419581703": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "18210370419559876426": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "5835634465164771899": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "8671491767142900139": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "1791615587935799399": ["convolution_gpu_bfyx_os_iyx_osv16",85], + "863952266514375915": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "5091558853871982858": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "4885944395876887711": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4239415134522959352": ["convolution_gpu_bfyx_gemm_like",2], + "17995371099806008878": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "13272818502368975319": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "969746749329671447": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "11243840588602365090": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "16884228931101540030": ["convolution_gpu_bfyx_gemm_like",2], + "9631481972809246378": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "6181272224000872375": ["convolution_gpu_bfyx_gemm_like",2], + "8780604510524622314": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "9819596940685093690": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "9762182215179534181": 
["convolution_gpu_bfyx_os_iyx_osv16",248], + "6860503758000008398": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "654821507679356726": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "18259656768460999562": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "3880189981766119529": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "12693511427898130707": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6204183474669103812": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "13775529405693629438": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18419183012101393192": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "17285815901490707654": ["convolution_gpu_winograd_6x3_s1_fused",2], + "1201692134690347847": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "5485971317082563152": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "3737552767159920174": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "3746573775462003750": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14150012830816329527": ["convolution_gpu_bfyx_gemm_like",2], + "11632275875447013409": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5824801192141531089": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "5409924335138540834": ["convolution_gpu_bfyx_gemm_like",2], + "18125732229366977468": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9723314434598141024": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "13835908664998757647": ["fully_connected_gpu_fb_oi_ref",0], + "16731107540370927220": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8482147530539941792": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "17829047941256922307": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "11829442945690098558": ["convolution_gpu_bfyx_gemm_like",1], + "2816353973187452604": ["convolution_gpu_bfyx_gemm_like",2], + "10727592780669452048": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "12355112948013108181": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "18118237182023167949": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "2968094709908141988": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "7969848911698660033": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "7590767013583950613": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "6345550009198921347": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "15881381297320383917": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9421643783312790618": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3285520504090196295": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "5334291640387922287": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "3134489458855347772": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "5687802882700097624": ["convolution_gpu_bfyx_gemm_like",2], + "7605139219344415117": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "10548792624072794724": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "5303170164698694791": ["fully_connected_gpu_bf_io_gemm",2], + "6980201892073961793": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "954796765467489259": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "10892456883214928095": ["convolution_gpu_bfyx_os_iyx_osv16",943], + "15759530339367380982": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "7500192998744460131": ["fully_connected_gpu_bf_io_input_spatial",1], + "3755253206085028904": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3498490999014554104": ["convolution_gpu_bfyx_os_iyx_osv16",509], + "16561224775421968533": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "16264774056719724826": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14912119584313592912": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + 
"13611054146745413536": ["convolution_gpu_bfyx_gemm_like",0], + "1622880009460832832": ["convolution_gpu_bfyx_os_iyx_osv16",299], + "15688186132508213638": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "407189201971322683": ["convolution_gpu_bfyx_os_iyx_osv16",23], + "16985912104363932350": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "10499265278415026816": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8285478622349266483": ["convolution_gpu_bfyx_os_iyx_osv16",137], + "9714764457768279762": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "3909551222373722085": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "13379165253894817165": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "14136097914489095982": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "15781622938833984014": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12160764253455777655": ["convolution_gpu_bfyx_gemm_like",2], + "941829593638869991": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "8127570953237266335": ["fully_connected_gpu_bf_io_input_spatial",5], + "5865480930796299143": ["convolution_gpu_bfyx_os_iyx_osv16",572], + "7072606962946873975": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "287386909600391846": ["convolution_gpu_bfyx_direct_10_12_16",2], + "755414184406250882": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "1076005730007872492": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "3037042229494600258": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "5594180958505308003": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "17101789600628162503": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17303408650780384587": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "5766507688771440170": ["convolution_gpu_bfyx_os_iyx_osv16",878], + "16504962609450876148": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "11175955260573469979": ["convolution_gpu_bfyx_os_iyx_osv16",490], + "15595549493819416194": ["convolution_gpu_bfyx_os_iyx_osv16",106], + "2231648183489019418": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "18253784177599134876": ["convolution_gpu_bfyx_os_iyx_osv16",938], + "2344498602308448450": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "15197248015210313435": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12601126285773042005": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "10607904718265020949": ["convolution_gpu_bfyx_gemm_like",2], + "18277685132620834972": ["convolution_gpu_bfyx_os_iyx_osv16",278], + "5582896843095691256": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "5782934278345953016": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "6139574161497189424": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15450609897480659306": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "9454954846682513038": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2305461098719675735": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4185398348055518182": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "659846949368492111": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17798636687709019154": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "5754844816339228920": ["convolution_gpu_bfyx_gemm_like",1], + "12223993560805441284": ["convolution_gpu_bfyx_gemm_like",2], + "12655099960717366198": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8132521728369930959": ["convolution_gpu_bfyx_gemm_like",2], + "4481903208484313806": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "7143510787416483146": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "4285475880886685878": ["convolution_gpu_bfyx_gemm_like",2], + "3325727286860556323": ["convolution_gpu_bfyx_os_iyx_osv16",974], + 
"12136029303893296753": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "15349944413643626251": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "7199295899520406795": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8321769923556905957": ["convolution_gpu_bfyx_gemm_like",2], + "6090625728451718945": ["convolution_gpu_winograd_6x3_s1_fused",2], + "181006047500375768": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "13472532612464340803": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "2806529556090896246": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "4801117903303888658": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "9524303276541517389": ["convolution_gpu_bfyx_gemm_like",2], + "10398572248321217585": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6410682026872155392": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "17163158934005653629": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "17419874083634480896": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "12541834857357563605": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "18337160891834020517": ["convolution_gpu_bfyx_gemm_like",2], + "5585398540591396124": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "1372939511728986224": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16601702334097258697": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7134654288295280046": ["convolution_gpu_bfyx_os_iyx_osv16",438], + "3715177305271762194": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "13358283026528078900": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "17882819773586674851": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "17900257435531434807": ["convolution_gpu_bfyx_gemm_like",1], + "11455843788148231615": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "14744368497944610864": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "7334966010680206302": ["convolution_gpu_bfyx_gemm_like",2], + "5601435819039968726": ["convolution_gpu_winograd_6x3_s1_fused",2], + "15891662883560480723": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "2095802691829304676": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "8296551195150971668": ["convolution_gpu_winograd_6x3_s1_fused",2], + "15948383678216076358": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "15430549683839591544": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "2995134938466176198": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "4408772370026995920": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "1452597292381229708": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12788968383428254917": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9065894438656900887": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "10792503079194374004": ["convolution_gpu_bfyx_gemm_like",1], + "14335423820860953927": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5093049998173715787": ["convolution_gpu_bfyx_gemm_like",2], + "11771014003680394135": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6656593119788274992": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "10433541468308381909": ["convolution_gpu_bfyx_gemm_like",2], + "16348402367953880206": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "9057036344533510776": ["convolution_gpu_bfyx_gemm_like",2], + "14217181622713951411": ["convolution_gpu_bfyx_gemm_like",2], + "15293727142789007900": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "11469881811044037340": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "17422822627612865758": ["convolution_gpu_winograd_6x3_s1_fused",2], + "7164580481046523192": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "16818714747882774917": ["convolution_gpu_bfyx_gemm_like",2], + "13268525255152984893": 
["convolution_gpu_bfyx_os_iyx_osv16",940], + "3221221905804708596": ["convolution_gpu_bfyx_gemm_like",2], + "14472187692485966933": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "2659031931257084418": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "16469788155263456039": ["convolution_gpu_bfyx_gemm_like",2], + "15447513376965243034": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "12914986936318857086": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "13210604117940125947": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "11192356850081328892": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "8451212914744825089": ["convolution_gpu_bfyx_os_iyx_osv16",732], + "4640696923527766618": ["convolution_gpu_bfyx_gemm_like",2], + "12141300895511301068": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "16611452077660879545": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11622925573287101001": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4983880246908724272": ["convolution_gpu_bfyx_os_iyx_osv16",348], + "3621424752591567930": ["convolution_gpu_bfyx_gemm_like",2], + "13253775441326432265": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "14762599606783897222": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "3448477246688526708": ["convolution_gpu_bfyx_gemm_like",1], + "11830297960718214360": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "3800011935243649447": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "14716719350966652036": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10848277915422577656": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "11726298758004767743": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "1332624116953483870": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "15831600396403741571": ["convolution_gpu_bfyx_gemm_like",2], + "9796621763733208035": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "6902644989079870993": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "490233152678323691": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "7354234812009979811": ["convolution_gpu_bfyx_os_iyx_osv16",860], + "77073286362822723": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "3723613341885592267": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "12353956380178079089": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8995598177504756805": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "9270950131920019932": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "5287076386757143976": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17434429579652310107": ["convolution_gpu_bfyx_gemm_like",2], + "5211191663202250117": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "9849272539053219052": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "14389915292223442327": ["convolution_gpu_bfyx_os_iyx_osv16",164], + "14821616804286068969": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16011429608661242565": ["convolution_gpu_bfyx_gemm_like",2], + "11728824117049687850": ["convolution_gpu_bfyx_gemm_like",2], + "17025268985366223779": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "17947818179123182001": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "16758962840329202004": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7817036102984218692": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "916389941321470163": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "14729854278671832528": ["convolution_gpu_bfyx_os_iyx_osv16",234], + "10861525139715322534": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "10412588668458621135": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "3234107167862677811": ["convolution_gpu_bfyx_os_iyx_osv16",111], + "4239133538073498792": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "7786866732196451977": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "17546566148752689536": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "12992194515157698316": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "16509472637458153234": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "13558618754911056302": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "17009318615658405230": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "15984885011101717258": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "5040730152867713388": ["convolution_gpu_bfyx_gemm_like",2], + "7431849514656037251": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "12730339458081890990": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11062100629646715785": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10328182165125764988": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "17370051888730874220": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "7460672405409009037": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "8913950860101596091": ["convolution_gpu_bfyx_os_iyx_osv16",362], + "11868551452004726281": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "8939683514448064461": ["convolution_gpu_bfyx_gemm_like",2], + "6483208845600234755": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7881579844586294503": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "8650948093564284852": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "9519623751582710696": ["convolution_gpu_bfyx_os_iyx_osv16",439], + "10795104632256101599": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "15352245788978088971": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "9226443907548972870": ["convolution_gpu_bfyx_gemm_like",2], + "15114370307779942381": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "2543995971214089085": ["convolution_gpu_bfyx_os_iyx_osv16",622], + "15417738436777481469": ["convolution_gpu_bfyx_gemm_like",2], + "8057302050645780813": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10308175009371219583": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "14681717813022425567": ["convolution_gpu_bfyx_gemm_like",2], + "10650698451740924172": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "12635265188475834607": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "8032685176029570383": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6205240287062600210": ["convolution_gpu_bfyx_gemm_like",2], + "2012181953284568566": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "4197617702037834389": ["convolution_gpu_bfyx_os_iyx_osv16",1091], + "380316849107383484": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6769243149577568817": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "1104489643524273315": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13787436604877398090": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "419783127503173016": ["convolution_gpu_bfyx_os_iyx_osv16",572], + "1450888744802985214": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "5873257164958285393": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "15354185859262170540": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "13699740641705514374": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "8507854696766492454": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "1071007164550012186": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "8769060267707904998": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12626014184575881530": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "14221578799010900252": ["convolution_gpu_bfyx_gemm_like",2], + "14902389080201926109": ["convolution_gpu_bfyx_direct_10_12_16",2], + 
"15110359240685619357": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "15548971488532746290": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2322559721899919275": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "2683507674615735878": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11292995457386147494": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "157805434489791310": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "3644282167178264526": ["convolution_gpu_bfyx_gemm_like",0], + "3643250372952944907": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "13026555349791486777": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "580936360000782237": ["fully_connected_gpu_bf_io_input_spatial",1], + "16881283637687482989": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "11529876081402974396": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "9058996149754556268": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "18037918102910297531": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "2802810524370514276": ["convolution_gpu_bfyx_gemm_like",0], + "1509728225855233852": ["convolution_gpu_bfyx_gemm_like",2], + "14885109535362957947": ["convolution_gpu_bfyx_os_iyx_osv16",592], + "5078905972285278557": ["convolution_gpu_bfyx_gemm_like",2], + "5312140481706133684": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "6214194654733781771": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "9381304526221508530": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "2704063557078535883": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "14621327324047759584": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "12790788016297794214": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "15426960908024585800": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11528417522960871233": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4342360467977736802": ["convolution_gpu_bfyx_gemm_like",2], + "15094664469997373662": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "8790625191540101806": ["convolution_gpu_bfyx_gemm_like",2], + "3362190082518348071": ["convolution_gpu_bfyx_gemm_like",2], + "534032316469702287": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "17738299860390552088": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17006095064160484022": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "9631545863582097486": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "12534001599784153836": ["convolution_gpu_bfyx_gemm_like",1], + "18180820925685532104": ["convolution_gpu_bfyx_os_iyx_osv16",1100], + "14258499419905714808": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "7852144838267007144": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "16894871557229780934": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "5469227748156438008": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "16710651492402564794": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18375125668176498051": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "724953082687879224": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "11901740241052104941": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "8101977280003030465": ["convolution_gpu_bfyx_os_iyx_osv16",510], + "15399245700982979379": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "530491406341772040": ["convolution_gpu_bfyx_gemm_like",2], + "7026575758396092435": ["convolution_gpu_bfyx_gemm_like",1], + "5197105253412476591": ["convolution_gpu_bfyx_gemm_like",2], + "16463823433924519300": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "8464582977975377118": ["convolution_gpu_winograd_6x3_s1_fused",2], + "15129834325410878425": ["convolution_gpu_bfyx_direct_10_12_16",2], + "628191607060767879": 
["convolution_gpu_bfyx_os_iyx_osv16",91], + "9763310312421884308": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "15011504472108164173": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15739274921308457528": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "1033385936344875354": ["convolution_gpu_bfyx_gemm_like",2], + "11398019086259011063": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "15720507574336564201": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "8121179472578287280": ["convolution_gpu_bfyx_os_iyx_osv16",379], + "8787816339967963727": ["convolution_gpu_bfyx_os_iyx_osv16",997], + "15320845027635796583": ["convolution_gpu_bfyx_gemm_like",2], + "2618108630886857741": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13374993751390784382": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "9311802150474489673": ["convolution_gpu_bfyx_os_iyx_osv16",548], + "2150326211917340956": ["convolution_gpu_bfyx_gemm_like",2], + "11107930597263802755": ["convolution_gpu_bfyx_gemm_like",2], + "11936419502418995274": ["convolution_gpu_bfyx_os_iyx_osv16",844], + "13204120207726209723": ["fully_connected_gpu_bf_io_gemm",1], + "17225578855755054959": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "8146945902795164796": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "6678796313875454849": ["convolution_gpu_bfyx_gemm_like",2], + "16811402686462277562": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "13025323039227543550": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "8797843396807284399": ["convolution_gpu_bfyx_gemm_like",2], + "1890739204389692970": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "17318287523550546026": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2129726780118554358": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7852745450437172519": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "3603706453982734995": ["convolution_gpu_bfyx_os_iyx_osv16",553], + "578703329577922869": ["convolution_gpu_bfyx_os_iyx_osv16",190], + "10309083227104422150": ["convolution_gpu_bfyx_os_iyx_osv16",892], + "1941341635794709702": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "6988674007771237080": ["convolution_gpu_bfyx_gemm_like",1], + "14515066741400300669": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "7606728651572102823": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "12319073009094248232": ["convolution_gpu_bfyx_gemm_like",2], + "4347816192417741558": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "2124033349728954551": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "3159147743553063163": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "11583017348580874022": ["convolution_gpu_bfyx_os_iyx_osv16",1019], + "10049571207493913006": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "15856268902838573812": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "9040046051053703359": ["convolution_gpu_bfyx_gemm_like",2], + "14352303529756685990": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "9447458159095730492": ["convolution_gpu_bfyx_gemm_like",2], + "6343396486660315308": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "386749666417295495": ["convolution_gpu_bfyx_os_iyx_osv16",510], + "4560479630843098090": ["convolution_gpu_bfyx_gemm_like",2], + "17809920600993699808": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "15824189967727245909": ["convolution_gpu_bfyx_gemm_like",1], + "7947870656736319919": ["convolution_gpu_bfyx_os_iyx_osv16",59], + "872401732136570312": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "12494969618927201911": ["fully_connected_gpu_yxfb_ref",1], + "6370189612027110022": ["convolution_gpu_bfyx_gemm_like",2], + 
"12308359047798183133": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "14089893422771228191": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "4492673409319122180": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "2194607895573544953": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "13786357802945430475": ["convolution_gpu_bfyx_os_iyx_osv16",1099], + "6065819201836017182": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15675968397825708285": ["convolution_gpu_bfyx_os_iyx_osv16",194], + "14823616678465136590": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3286629188347536485": ["fully_connected_gpu_bf_io_input_spatial",0], + "3281207855459771997": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "8402692278765063674": ["convolution_gpu_bfyx_os_iyx_osv16",58], + "14487682847898298214": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "17788367809717898285": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "654122557966242717": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "18128162750557822655": ["convolution_gpu_bfyx_os_iyx_osv16",851], + "17243576882981097341": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "9366201112659847392": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "7162575953766465459": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "15890473622821659630": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "1934379409955686502": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4304041922043496030": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5406129421969383274": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "14746359019867963124": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2349007644347065353": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "16749148369456398030": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "14667209474639064623": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4701832665603867798": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "8083672466967374860": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "12408889192918919210": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "2371412124305478965": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10522649794540845800": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "7966454753124154534": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8079376692609682448": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "12667014405537239093": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "4237276338897143680": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "3159681096461848644": ["convolution_gpu_bfyx_os_iyx_osv16",311], + "472454322186482185": ["convolution_gpu_bfyx_os_iyx_osv16",729], + "13893789954946953427": ["convolution_gpu_bfyx_os_iyx_osv16",610], + "6714886136800883594": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "11655994466278963438": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1698321314111848001": ["convolution_gpu_bfyx_os_iyx_osv16",714], + "428659495445490820": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "1411786954276574458": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "16295660312557315941": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "4428101657497677982": ["convolution_gpu_bfyx_os_iyx_osv16",873], + "4584970211859494304": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12179581684777023804": ["convolution_gpu_bfyx_gemm_like",2], + "12949204491386872217": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "13178480813522103091": ["fully_connected_gpu_bf_io_gemm",1], + "15101680837342453931": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "5644068493155655611": ["convolution_gpu_bfyx_gemm_like",2], + "12553441041059632729": 
["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5019077257951332016": ["convolution_gpu_bfyx_gemm_like",2], + "7351733901977025859": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "4479979951990338510": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13991205023798493715": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "17108987360340581555": ["fully_connected_gpu_bf_io_input_spatial",1], + "17915846724151945664": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "6114147683777615071": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "11530101016435264783": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "14352796912241296357": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "12725647706191463348": ["convolution_gpu_bfyx_gemm_like",1], + "13608239208821071914": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "6303682540621797774": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "529543453251381109": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1410630713443793537": ["convolution_gpu_bfyx_gemm_like",1], + "11462462742322068863": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "7546586420552408243": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "5589350202160007768": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5570311824197099845": ["convolution_gpu_winograd_6x3_s1_fused",2], + "14174888981602932979": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "5420766967862917815": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "1541754036637209097": ["convolution_gpu_bfyx_gemm_like",2], + "14211903923555028634": ["convolution_gpu_bfyx_os_iyx_osv16",1056], + "879005904827468163": ["convolution_gpu_bfyx_os_iyx_osv16",13], + "7862815466573236157": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "14289048840489035546": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4862529593282936100": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "7304346312452588844": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "9942726414918759892": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2379484884827231127": ["fully_connected_gpu_bf_io_input_spatial",1], + "3928266232090746643": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "5095827462645341808": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7732899312577293959": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "12972634653821069685": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "15471470494305051299": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "7802311886554362782": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "1375156980278317418": ["convolution_gpu_bfyx_gemm_like",2], + "1249137685908951501": ["convolution_gpu_bfyx_gemm_like",1], + "8757900457181374694": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9226912483632588371": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3691705516240577130": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "17564338309805484464": ["convolution_gpu_bfyx_os_iyx_osv16",898], + "16936366288366370882": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "72444706264681262": ["convolution_gpu_bfyx_gemm_like",2], + "9220830217525628783": ["convolution_gpu_bfyx_gemm_like",2], + "16705621644424684055": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "15411474884532403722": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "11718418772370938734": ["convolution_gpu_bfyx_os_iyx_osv16",90], + "10429613013253088132": ["convolution_gpu_bfyx_gemm_like",2], + "10110395703775498948": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "7720939595094113814": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "2458592904274981909": ["fully_connected_gpu_bf_io_input_spatial",0], + "6571438978296387721": 
["convolution_gpu_bfyx_gemm_like",2], + "14991602704357959545": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "9437794960375526230": ["convolution_gpu_bfyx_os_iyx_osv16",656], + "7353563160591978243": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "7457899998356343871": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "7678457226823073886": ["convolution_gpu_bfyx_os_iyx_osv16",191], + "5385316497510064491": ["fully_connected_gpu_fb_oi_ref",1], + "7757331094141318304": ["convolution_gpu_bfyx_os_iyx_osv16",969], + "5192552432194195116": ["convolution_gpu_bfyx_gemm_like",2], + "4299492266819967844": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "905780459938651623": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "6928835003016610382": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "16781127329510211966": ["convolution_gpu_bfyx_gemm_like",2], + "10405183426600618231": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "12894240573737168362": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "15277856047844308598": ["convolution_gpu_bfyx_gemm_like",2], + "10136369729388564720": ["convolution_gpu_bfyx_gemm_like",2], + "12087141795291232248": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "4161141078006269526": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7662200927459001757": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9269175963143039426": ["convolution_gpu_bfyx_os_iyx_osv16",362], + "6719302427415173754": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "3255465741612432300": ["convolution_gpu_bfyx_os_iyx_osv16",58], + "13314092088416047551": ["fully_connected_gpu_yxfb_ref",1], + "18356980026934328781": ["convolution_gpu_bfyx_os_iyx_osv16",658], + "4191326605459754690": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "6522575549211855712": ["convolution_gpu_bfyx_gemm_like",2], + "689445825453914111": ["convolution_gpu_bfyx_gemm_like",2], + "16522364268583242080": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "4550028191070279999": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4274801141127703532": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "15609627722687211129": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "14406070210216948643": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "16442107352245114876": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "15649927926091502215": ["convolution_gpu_bfyx_os_iyx_osv16",435], + "12415368596357091523": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "13649894122307008732": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "18372284940315010254": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "11929531534620071758": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "15975964562807570772": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "17774424004510360936": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "18068050257421269408": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "3128856679264648666": ["convolution_gpu_bfyx_gemm_like",2], + "4160656836528944651": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "5235375820995365354": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "4084106758501882407": ["fully_connected_gpu_bf_io_input_spatial",1], + "9475130054420979752": ["convolution_gpu_bfyx_gemm_like",2], + "18310667924071639899": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "5592428580503282095": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "929378940515745198": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "4476928353532757380": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "12174571114411168588": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "3830703844770425343": 
["convolution_gpu_bfyx_os_iyx_osv16",83], + "2632535010129224704": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "12348135936862667024": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "14397348576352573007": ["convolution_gpu_bfyx_gemm_like",2], + "8500148569566077929": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "11806105193035393795": ["convolution_gpu_bfyx_gemm_like",2], + "1172103288112689821": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "10916647716124396856": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "9761573038170759563": ["convolution_gpu_bfyx_os_iyx_osv16",299], + "11856266545854830143": ["convolution_gpu_bfyx_gemm_like",2], + "4967444801764057340": ["convolution_gpu_bfyx_os_iyx_osv16",985], + "2287356884312581209": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5277400567128489977": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "5627834277145735283": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "4062706195708729345": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "8329846097322076175": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "15239764240622554314": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "5864250949922222051": ["convolution_gpu_bfyx_os_iyx_osv16",1043], + "10090923790949378407": ["convolution_gpu_bfyx_gemm_like",1], + "16402312692470500253": ["convolution_gpu_bfyx_gemm_like",0], + "15901675909820977223": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "5041111302824362529": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "5183231560876991543": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "9194788897910888066": ["convolution_gpu_bfyx_os_iyx_osv16",338], + "69439315851965666": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "11455518069358829249": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "14429081455612806819": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "12493863403516600413": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "7937870623766562191": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12558716383635737426": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "7575675354187625951": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16146350476627599543": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "13503608041359512": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "14213516751025324346": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1435153323458789173": ["convolution_gpu_bfyx_gemm_like",2], + "787363431787954804": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "18109284647478027063": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11970881115757095265": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "11717348577195224554": ["convolution_gpu_bfyx_gemm_like",2], + "10995907213890714701": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "9860570706348640782": ["convolution_gpu_bfyx_gemm_like",2], + "6574971185849732667": ["convolution_gpu_bfyx_os_iyx_osv16",304], + "3511588484597779204": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "17443356777503458523": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "10628725059172743408": ["convolution_gpu_bfyx_gemm_like",2], + "11992625045241269569": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "13663893159182636270": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2418288192668085805": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4720851194954041037": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "6580334406272192111": ["fully_connected_gpu_fb_io_ref",1], + "17739868787095417856": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "5795073619189010837": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9148379585489720669": 
["convolution_gpu_bfyx_os_iyx_osv16",484], + "14962768577232034246": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "5077214229434392730": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "13324157125165576832": ["convolution_gpu_bfyx_os_iyx_osv16",514], + "2702144517025248597": ["convolution_gpu_bfyx_gemm_like",2], + "4994591211723226974": ["convolution_gpu_bfyx_os_iyx_osv16",724], + "7806129039150321333": ["convolution_gpu_bfyx_gemm_like",2], + "5352861363832390974": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "11070620435959083971": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "4217179485243909459": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12255528292506999241": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "14603590053512154268": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "8526484907799590618": ["convolution_gpu_bfyx_os_iyx_osv16",987], + "16431857516454692096": ["convolution_gpu_bfyx_os_iyx_osv16",194], + "13810995219720233595": ["convolution_gpu_bfyx_gemm_like",2], + "7172604084103519563": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "7148542290597073512": ["convolution_gpu_bfyx_gemm_like",2], + "14011124615649605281": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "15689502054035168040": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "2527189070714658176": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "13869716373706247686": ["convolution_gpu_bfyx_gemm_like",2], + "10887835418423052188": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14767888121198814523": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "12581879452540858313": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "2903605246599054308": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "18218631037214746168": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "16173557782125372935": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3106922888635965020": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11834683513280095384": ["convolution_gpu_winograd_6x3_s1_fused",2], + "5994204139128667921": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "14251848023416168295": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "13447028922679236865": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11060822686394981344": ["convolution_gpu_bfyx_gemm_like",2], + "7638626850074132214": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "5334190564423375247": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1434535531617424039": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "17123463568694499533": ["convolution_gpu_bfyx_gemm_like",2], + "4623542918584461522": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "2370837049876630969": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "2406816735581074778": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "7712831597869354170": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8048617952947915835": ["convolution_gpu_bfyx_gemm_like",2], + "17123153447808465303": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "6780215829176686721": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "649203303142950236": ["convolution_gpu_bfyx_os_iyx_osv16",3], + "220326805056361171": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "11092828091552833150": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "475043738497218394": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "9428176632140441528": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "13327653786981478088": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "8075180350084516696": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6729785110495533200": ["convolution_gpu_bfyx_os_iyx_osv16",1063], + "15464327246951632247": 
["convolution_gpu_bfyx_os_iyx_osv16",698], + "9759380701896779097": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6213386558868267629": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "12246408434917478929": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "2582625260054352916": ["convolution_gpu_bfyx_gemm_like",2], + "2103882464623009432": ["convolution_gpu_winograd_6x3_s1_fused",2], + "11490143853656040028": ["convolution_gpu_bfyx_gemm_like",2], + "13852065717057446998": ["convolution_gpu_bfyx_gemm_like",2], + "18199526506796726885": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "6293403765897901528": ["convolution_gpu_bfyx_gemm_like",2], + "17248756229500447131": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "10978173291465325823": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "13104509059416300615": ["convolution_gpu_bfyx_os_iyx_osv16",679], + "2172121470071868949": ["convolution_gpu_bfyx_gemm_like",2], + "15916505622570323098": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "9048522050692986204": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "8873614802459592665": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "10632020369698615114": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "2198278382394812839": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "1697248235682953135": ["convolution_gpu_bfyx_gemm_like",2], + "9622546530872848323": ["convolution_gpu_bfyx_os_iyx_osv16",852], + "6351347283201596793": ["convolution_gpu_bfyx_os_iyx_osv16",849], + "16094174852600023296": ["convolution_gpu_bfyx_os_iyx_osv16",1090], + "2566302789609970663": ["convolution_gpu_bfyx_os_iyx_osv16",58], + "570683988452622223": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "17729546848373991614": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "8594644182487917002": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3150231129728961455": ["convolution_gpu_bfyx_gemm_like",1], + "12159582810513550491": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2552187713769926425": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "3036808833459559381": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13105192484434299621": ["convolution_gpu_bfyx_gemm_like",2], + "3985659568982275663": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "3499243120652875549": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "10058614204420018541": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "17870874477143985774": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "3002986032379998259": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "3122997634505472500": ["convolution_gpu_bfyx_os_iyx_osv16",987], + "11649407835105973949": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "142486914279119363": ["convolution_gpu_bfyx_os_iyx_osv16",188], + "5381354625969068789": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5779388310240896974": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "10483664832302187567": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "10947686124973711385": ["convolution_gpu_bfyx_os_iyx_osv16",435], + "13793441296561946357": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "18034648276860485300": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8906185843274300447": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5762290464889692462": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "1436052878894538927": ["convolution_gpu_bfyx_gemm_like",2], + "3752171257634205726": ["convolution_gpu_bfyx_os_iyx_osv16",807], + "16290626406346691996": ["convolution_gpu_bfyx_os_iyx_osv16",53], + "5482851829165191681": ["convolution_gpu_bfyx_os_iyx_osv16",641], + "13418701036204748812": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "435888248913413834": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "14070988879848388270": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "13317417676446624018": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "14385185911482960528": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "12894625941923144893": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "4226968857681929488": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "15602863681196390535": ["convolution_gpu_bfyx_os_iyx_osv16",246], + "14810839157236175179": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7866128397931438774": ["convolution_gpu_bfyx_os_iyx_osv16",240], + "7398196853452900099": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "4793007249026943006": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5519535335798045279": ["convolution_gpu_bfyx_gemm_like",2], + "116291934148608396": ["convolution_gpu_bfyx_os_iyx_osv16",714], + "13973028408397200796": ["convolution_gpu_bfyx_os_iyx_osv16",806], + "11446745541571732900": ["convolution_gpu_winograd_6x3_s1_fused",2], + "13163146272900339330": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "13932662890258900896": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "5331173521406046122": ["convolution_gpu_bfyx_os_iyx_osv16",641], + "6767159196241633301": ["convolution_gpu_bfyx_os_iyx_osv16",986], + "2452226948562393335": ["convolution_gpu_bfyx_os_iyx_osv16",54], + "13503688893307029975": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11163107409437069532": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8272823732258536202": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12293786134765875615": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "15284262113150488297": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "3792945601873900927": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "10700011669103135203": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "3883845471211207871": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "12813978452097969536": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "1954052357826969119": ["convolution_gpu_bfyx_gemm_like",1], + "8655883535274781128": ["convolution_gpu_bfyx_gemm_like",2], + "5948701218437980356": ["convolution_gpu_bfyx_gemm_like",2], + "12388375914105990324": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "1383899865465106141": ["convolution_gpu_bfyx_gemm_like",2], + "11324851661119942609": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "10682918518101379579": ["fully_connected_gpu_bf_io_input_spatial",2], + "1801731858063091191": ["convolution_gpu_bfyx_os_iyx_osv16",559], + "8195881973746570408": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10797908931694274013": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "9785114056964539323": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "8323445733669842657": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17517495652165026573": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "6343888265369366589": ["convolution_gpu_bfyx_os_iyx_osv16",693], + "15247381586316467097": ["convolution_gpu_bfyx_gemm_like",2], + "1095495157025479260": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "4149728557142033774": ["convolution_gpu_bfyx_os_iyx_osv16",946], + "7368916076070115064": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "488298169768725160": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "190530884420224257": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "6848989271874647093": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "3860603464276263676": ["convolution_gpu_bfyx_gemm_like",2], + "9019388470685749691": 
["convolution_gpu_bfyx_os_iyx_osv16",729], + "9954050478761346921": ["convolution_gpu_bfyx_gemm_like",2], + "8337820318779061494": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "3180320769716158201": ["convolution_gpu_bfyx_os_iyx_osv16",862], + "16071723603031305677": ["convolution_gpu_bfyx_gemm_like",2], + "7187734276051878356": ["convolution_gpu_bfyx_gemm_like",2], + "7877332346656934022": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "11661208196482963286": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "14906458674793172507": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "3167336012388169649": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "1532263118203058517": ["convolution_gpu_bfyx_os_iyx_osv16",987], + "3388752887767453958": ["convolution_gpu_bfyx_gemm_like",2], + "12954154886708228545": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "16286085532892593349": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1907439276166837309": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1889171157980977747": ["convolution_gpu_bfyx_gemm_like",2], + "15857087373591747006": ["convolution_gpu_bfyx_gemm_like",2], + "4894227264080887361": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "16044646335477470657": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5389189982064081933": ["convolution_gpu_bfyx_os_iyx_osv16",943], + "4202645222013675478": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5807196005360653656": ["convolution_gpu_bfyx_gemm_like",2], + "10001963042016663554": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3036512701943687724": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "804195263636995800": ["convolution_gpu_bfyx_gemm_like",2], + "14435120971846098308": ["convolution_gpu_bfyx_os_iyx_osv16",568], + "15156525717629023944": ["convolution_gpu_bfyx_gemm_like",2], + "4892959859293355837": ["convolution_gpu_bfyx_gemm_like",1], + "9404677451270692749": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12962558681443556219": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "30229601562833524": ["convolution_gpu_bfyx_gemm_like",2], + "9839670675413379092": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "6949539207944972855": ["convolution_gpu_bfyx_gemm_like",2], + "9416186718345824095": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "16706244336960642883": ["convolution_gpu_bfyx_gemm_like",2], + "16710010075465723498": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "1572991986657256775": ["convolution_gpu_bfyx_os_iyx_osv16",338], + "5495776091407365966": ["convolution_gpu_bfyx_gemm_like",2], + "10023279637210292010": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "14131851237755716991": ["convolution_gpu_bfyx_gemm_like",2], + "3179874645565098825": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5088898934670078153": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3355259926747524578": ["convolution_gpu_bfyx_gemm_like",2], + "14554225625951128811": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "14959566236432790882": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "994182747184593564": ["convolution_gpu_winograd_6x3_s1_fused",2], + "11756881293845417212": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "2030309697153345387": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "3429844423226609965": ["convolution_gpu_bfyx_gemm_like",2], + "18424912460022156378": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "3926585856863002495": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "15963038745470172423": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "9069334144391048686": ["convolution_gpu_bfyx_direct_10_12_16",1], + 
"17928043901784474130": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9421927854269492263": ["convolution_gpu_bfyx_os_iyx_osv16",620], + "7924408980408826942": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "5020788604681810984": ["convolution_gpu_bfyx_os_iyx_osv16",511], + "15914512645931208899": ["convolution_gpu_bfyx_gemm_like",2], + "13762042713029963144": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11919129623429545762": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4871907623235871050": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "16820082917500285799": ["convolution_gpu_bfyx_gemm_like",2], + "3563614453014995411": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "11825293922127550847": ["convolution_gpu_bfyx_gemm_like",2], + "1628593159980574595": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "11428599290755097395": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "8528750110601691390": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "15696910741835640150": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "14910223536998380801": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "15410074937424854348": ["convolution_gpu_bfyx_os_iyx_osv16",846], + "2627779045483019709": ["convolution_gpu_bfyx_os_iyx_osv16",93], + "2265784112305305260": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9454512817077883797": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "385046297070779752": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "16995873636564597028": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "11528310408333718862": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "12394049027081208902": ["convolution_gpu_bfyx_gemm_like",1], + "17026284168840448378": ["convolution_gpu_bfyx_os_iyx_osv16",510], + "15713964605078748923": ["convolution_gpu_bfyx_gemm_like",2], + "13733327241591630239": ["convolution_gpu_bfyx_os_iyx_osv16",761], + "10774528268153772208": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "15160738482264643601": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "10169992769527680821": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "6620782733027313312": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "10930115765550856328": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "12228963567837353733": ["convolution_gpu_bfyx_os_iyx_osv16",1001], + "3727142736386026852": ["convolution_gpu_bfyx_os_iyx_osv16",679], + "3041612155708729812": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11239754372812258455": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "4338023436590582323": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "3106591708459602370": ["convolution_gpu_bfyx_os_iyx_osv16",564], + "12238674883388043717": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "14043770215999952932": ["convolution_gpu_bfyx_gemm_like",2], + "14274685812676150168": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "14826791706471872785": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12987636957813312667": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10416622008071151225": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "10729288973933590396": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "3600066510593746268": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "10306542963828398049": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2912858944747613525": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "9751582946441607796": ["convolution_gpu_bfyx_os_iyx_osv16",739], + "17224181038411430675": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "5097818987523855112": ["convolution_gpu_bfyx_gemm_like",2], + "6777045876155144709": ["convolution_gpu_bfyx_os_iyx_osv16",462], + 
"7177837234452118325": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "16327433707667075261": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7994179151788368291": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "15154700439767512396": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "738850098651678143": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "4356817283284529593": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4278280309700908015": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "14916625550370402883": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "12107262410635772120": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "4021558014531645922": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "3236003754884728510": ["convolution_gpu_bfyx_os_iyx_osv16",1048], + "12908594497114706897": ["convolution_gpu_bfyx_1x1",2], + "2057158988261512114": ["convolution_gpu_bfyx_1x1",2], + "4165036357594592683": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4927360358387344983": ["convolution_gpu_bfyx_gemm_like",1], + "5658664813683907476": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "9562527071055150197": ["convolution_gpu_bfyx_1x1",2], + "2162882863309264684": ["convolution_gpu_bfyx_os_iyx_osv16",1045], + "13078401519973360182": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17778091287904736965": ["convolution_gpu_bfyx_gemm_like",2], + "12375919467924385618": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "12577421746159122264": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "6288489890578212082": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10864011008000364415": ["convolution_gpu_bfyx_1x1",2], + "3160543867929843861": ["convolution_gpu_bfyx_1x1",2], + "517997325935712670": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "13830605041347009953": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "5295693108687178880": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8059328623525062913": ["convolution_gpu_bfyx_gemm_like",2], + "9208964785762052001": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "1587501521145162454": ["convolution_gpu_bfyx_gemm_like",2], + "14544219140091420262": ["convolution_gpu_bfyx_gemm_like",1], + "3141886504884887200": ["convolution_gpu_bfyx_gemm_like",2], + "18142462471803295391": ["convolution_gpu_bfyx_1x1",2], + "4056971751486746551": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "18270587701371596297": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "14316077757957132678": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "18417288692814472127": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "8975333906619899020": ["convolution_gpu_bfyx_gemm_like",2], + "16033512206711124104": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7292351660229751817": ["convolution_gpu_bfyx_os_iyx_osv16",234], + "8943913562339525413": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "11706446082856895571": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "5572956736535433608": ["convolution_gpu_bfyx_1x1",2], + "15899192375330393731": ["convolution_gpu_bfyx_os_iyx_osv16",427], + "1419073145594317633": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "8651641584737798174": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "1778345646142852816": ["convolution_gpu_bfyx_gemm_like",2], + "5941298590926032148": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "2929190644951986399": ["convolution_gpu_bfyx_gemm_like",2], + "8421388456873652700": ["convolution_gpu_bfyx_gemm_like",2], + "2394023805427701338": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "8303211644727914658": ["convolution_gpu_bfyx_1x1",2], + "9423854233835016530": 
["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15069906408448814772": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "13450061819089402572": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2242829490403202087": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9277176009071334860": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "1843555260471832708": ["convolution_gpu_bfyx_gemm_like",2], + "13352000946213986936": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "187352687850707150": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7307271009495440764": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "12900949103593247293": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "12992061224471212714": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10930640103080573253": ["convolution_gpu_bfyx_1x1",0], + "3872151366780051246": ["convolution_gpu_bfyx_gemm_like",1], + "4006884370026272807": ["convolution_gpu_bfyx_gemm_like",2], + "18017913952946745878": ["convolution_gpu_bfyx_gemm_like",1], + "10753540518493641553": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "16182470664818268848": ["convolution_gpu_bfyx_gemm_like",1], + "9076758673133996959": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "6726099352298108756": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "2339864165283480961": ["convolution_gpu_bfyx_1x1",2], + "1760391741350091665": ["convolution_gpu_bfyx_gemm_like",2], + "11627532066884923848": ["convolution_gpu_bfyx_1x1",2], + "13183380647506951324": ["convolution_gpu_bfyx_gemm_like",0], + "13190888313721073437": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "12194037100109755112": ["convolution_gpu_bfyx_gemm_like",2], + "12134858519320245809": ["convolution_gpu_bfyx_1x1",2], + "9426665763007611385": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "10179916356323479080": ["convolution_gpu_bfyx_gemm_like",2], + "13483175684542464385": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "12118387933632797428": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13853630125050609175": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "11848462434662954749": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "7953255701516490034": ["convolution_gpu_bfyx_os_iyx_osv16",774], + "6071668124835539929": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5649082203775427830": ["convolution_gpu_bfyx_gemm_like",2], + "10294610483561043024": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "16800575429414554907": ["convolution_gpu_bfyx_os_iyx_osv16",882], + "4803370483104261655": ["convolution_gpu_bfyx_gemm_like",1], + "17649961873981897621": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "15322609677356616580": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "13328911884191551889": ["convolution_gpu_bfyx_1x1",2], + "15155676074658242659": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "9378269524012289175": ["convolution_gpu_bfyx_gemm_like",2], + "15914058104244750036": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "584086621952390547": ["convolution_gpu_bfyx_gemm_like",2], + "12370729327673204804": ["convolution_gpu_bfyx_gemm_like",2], + "11356842300444410831": ["convolution_gpu_bfyx_os_iyx_osv16",659], + "7104309382120208659": ["convolution_gpu_bfyx_gemm_like",2], + "3239033622277917802": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "18180655791734632264": ["convolution_gpu_bfyx_gemm_like",2], + "17635171685500922207": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "12207503176295152756": ["convolution_gpu_bfyx_1x1",2], + "14349625788399542568": ["convolution_gpu_bfyx_gemm_like",1], + "726985753660756762": 
["convolution_gpu_bfyx_os_iyx_osv16",1002], + "7870154008378361670": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "4133424990380177132": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "6556424924189200804": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "7706714181281908433": ["convolution_gpu_bfyx_gemm_like",2], + "7995820969034996638": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "8045367391487213749": ["convolution_gpu_bfyx_1x1",2], + "8689206546467098603": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "10404725818204494388": ["convolution_gpu_bfyx_gemm_like",2], + "3216793152416217495": ["convolution_gpu_bfyx_gemm_like",2], + "8791285622784082122": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "7454366978268164047": ["convolution_gpu_bfyx_gemm_like",2], + "1569043950563130463": ["convolution_gpu_bfyx_gemm_like",1], + "16839741351990811959": ["convolution_gpu_bfyx_gemm_like",2], + "5012013738970489338": ["convolution_gpu_bfyx_1x1",2], + "15289152041466330689": ["convolution_gpu_bfyx_gemm_like",2], + "4447065688824381344": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "4861982518177129729": ["convolution_gpu_bfyx_os_iyx_osv16",572], + "1718634913016284523": ["convolution_gpu_bfyx_1x1",2], + "1470933384474984858": ["convolution_gpu_bfyx_1x1",2], + "17515064188391421150": ["convolution_gpu_bfyx_gemm_like",1], + "3463959257726925426": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "6664432489777052771": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "3870539490799697188": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "13471752029049484143": ["convolution_gpu_bfyx_gemm_like",2], + "7531346828150129063": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15065925414996398951": ["convolution_gpu_bfyx_1x1",2], + "1040030752340209480": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "15047676717402283805": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "3806131437010910920": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12198263593657033426": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "11939914680143672459": ["fully_connected_gpu_fb_oi_ref",1], + "7106362077449435105": ["convolution_gpu_bfyx_gemm_like",0], + "5600807544955072308": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "16924006268301179157": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "13754408679115174221": ["convolution_gpu_bfyx_gemm_like",2], + "2939605281692583169": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "5770286476124511234": ["convolution_gpu_bfyx_gemm_like",1], + "2296581485980163665": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15133468875250992696": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "11031569203645035546": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "13809330759308309353": ["convolution_gpu_bfyx_gemm_like",2], + "4792351255949877935": ["convolution_gpu_bfyx_gemm_like",2], + "11690334177981352452": ["convolution_gpu_bfyx_os_iyx_osv16",700], + "3646228701104397128": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "12776081190690731910": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "1345101751956733589": ["convolution_gpu_bfyx_gemm_like",2], + "84595904778810418": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "4834446692898125871": ["convolution_gpu_bfyx_gemm_like",2], + "11565861421381730304": ["convolution_gpu_bfyx_os_iyx_osv16",1048], + "3118602494449249177": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "16913004986170202203": ["convolution_gpu_bfyx_gemm_like",2], + "16120988958246503683": ["convolution_gpu_bfyx_os_iyx_osv16",1023], + "13883044928774243663": ["convolution_gpu_bfyx_os_iyx_osv16",727], + 
"16437124655147660375": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "1018687388655376483": ["convolution_gpu_bfyx_gemm_like",1], + "3501667344669686338": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6407471972820516685": ["fully_connected_gpu_fb_oi_ref",2], + "13553263424160050064": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "17975017633455909321": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "12026482841341343242": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "11893541520830049036": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "383721620126444793": ["convolution_gpu_bfyx_gemm_like",1], + "14050124896329573468": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5649150695527000655": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "11666250400445971335": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "16577611471466452776": ["convolution_gpu_bfyx_gemm_like",2], + "49948277487706148": ["convolution_gpu_bfyx_1x1",2], + "17214254645087272557": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "11759426200341586247": ["convolution_gpu_bfyx_os_iyx_osv16",1], + "7578177053220150569": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "12978370505631031751": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "9175450649281374948": ["convolution_gpu_bfyx_os_iyx_osv16",106], + "12676167240795292217": ["convolution_gpu_bfyx_gemm_like",1], + "4279062247055842367": ["convolution_gpu_bfyx_gemm_like",1], + "17252589865292797082": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "7181154048972884375": ["convolution_gpu_bfyx_gemm_like",2], + "4091702228990140696": ["convolution_gpu_bfyx_gemm_like",1], + "7913076120244203725": ["convolution_gpu_bfyx_gemm_like",2], + "9407646138658641974": ["convolution_gpu_bfyx_gemm_like",2], + "1632416005093914709": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "8614534946699754256": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "12972798847556569913": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16694984452720336415": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "6791806088355877039": ["convolution_gpu_bfyx_gemm_like",2], + "10488269059469838160": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "12831123539633580270": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "11421180829679625737": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "16925721317097534009": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12864558900883069118": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "18218755616248669884": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "2983038203471784211": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "4678607855896512523": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12977678792503377525": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8809017515482311843": ["convolution_gpu_bfyx_os_iyx_osv16",1036], + "17442105631503326136": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "5385316497510064491": ["fully_connected_gpu_fb_oi_ref",0], + "17342198739672369885": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7015738038963065110": ["convolution_gpu_bfyx_gemm_like",2], + "11657946392097042544": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "18059267466971880386": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "6310724136390087834": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "11132679855317294753": ["convolution_gpu_bfyx_gemm_like",1], + "4889188980319017094": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "338716975932676215": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "2226745622763268469": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "7432142107544210174": 
["convolution_gpu_bfyx_os_iyx_osv16",367], + "5242271874488296527": ["convolution_gpu_bfyx_gemm_like",1], + "2530317332900569142": ["convolution_gpu_bfyx_os_iyx_osv16",430], + "2936333406928424760": ["convolution_gpu_bfyx_1x1",2], + "4129722446574108695": ["convolution_gpu_bfyx_1x1",2], + "16986610822918634530": ["convolution_gpu_bfyx_1x1",2], + "14115742296883450319": ["convolution_gpu_bfyx_gemm_like",1], + "597650904461183283": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "4461989328775275994": ["convolution_gpu_bfyx_gemm_like",2], + "12397280593466519809": ["convolution_gpu_bfyx_os_iyx_osv16",340], + "17672785701483179117": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "59356084516953804": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "4615708568396290002": ["convolution_gpu_bfyx_1x1",2], + "17705807503894740726": ["convolution_gpu_bfyx_gemm_like",2], + "15746620724134970969": ["convolution_gpu_bfyx_1x1",2], + "6254141935545262078": ["convolution_gpu_bfyx_gemm_like",1], + "17854208422879910606": ["convolution_gpu_bfyx_gemm_like",1], + "12495003066477974474": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "5352061583962489055": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1218323229202187514": ["convolution_gpu_bfyx_gemm_like",2], + "7481256533438761028": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "12319073009094248232": ["convolution_gpu_bfyx_gemm_like",2], + "10865695385270390803": ["convolution_gpu_bfyx_os_iyx_osv16",266], + "3711525118850629466": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7693459946348737411": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "1680468564927032670": ["convolution_gpu_bfyx_gemm_like",1], + "6323083153920795679": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "946479876892100082": ["convolution_gpu_bfyx_gemm_like",1], + "13472577372534605883": ["convolution_gpu_bfyx_gemm_like",1], + "3244675355773468991": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "16992405636352406660": ["convolution_gpu_bfyx_gemm_like",2], + "7689320135952025041": ["convolution_gpu_bfyx_gemm_like",0], + "18381791065890314250": ["convolution_gpu_bfyx_gemm_like",0], + "1841155673858789206": ["fully_connected_gpu_fb_oi_ref",1], + "6942016672941874829": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "9440117898128288296": ["convolution_gpu_bfyx_gemm_like",2], + "11324651029379152442": ["convolution_gpu_bfyx_1x1",2], + "12531580106484042446": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "394778201589371681": ["convolution_gpu_bfyx_gemm_like",2], + "17830290099875088207": ["convolution_gpu_bfyx_gemm_like",1], + "16384186388687043048": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "1359720957005310113": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "7843508201826629532": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "5720964268093705079": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "16588325081458426169": ["convolution_gpu_bfyx_gemm_like",2], + "3782239800777370325": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "6391201577234440562": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "378801963103874857": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "15817443774186015593": ["convolution_gpu_bfyx_1x1",2], + "12589440296742583335": ["convolution_gpu_bfyx_1x1",2], + "4156384238797998294": ["convolution_gpu_bfyx_os_iyx_osv16",176], + "10787747981914307179": ["convolution_gpu_bfyx_1x1",0], + "2964705957088952872": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "2283157145557154450": ["convolution_gpu_bfyx_1x1",2], + "14483314305369207554": ["convolution_gpu_bfyx_1x1",2], + "9695024256541464964": 
["convolution_gpu_bfyx_os_iyx_osv16",635], + "3017411837779243878": ["convolution_gpu_bfyx_gemm_like",0], + "15914342421266687768": ["convolution_gpu_bfyx_gemm_like",2], + "1920070013712913772": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "4141005390823981166": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "1186545671730357033": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "15820359925623438341": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "708452703070938673": ["convolution_gpu_bfyx_os_iyx_osv16",713], + "15065019229949449623": ["convolution_gpu_bfyx_gemm_like",1], + "7977195117668583981": ["convolution_gpu_bfyx_gemm_like",2], + "7770000755097925765": ["convolution_gpu_bfyx_1x1",2], + "16729849855476690294": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "9390478179772073718": ["convolution_gpu_bfyx_gemm_like",1], + "14151747022287993729": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12822126914959112382": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "12802517759474139810": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "9741607635826869269": ["convolution_gpu_bfyx_gemm_like",1], + "11799179287124317845": ["convolution_gpu_bfyx_gemm_like",1], + "17025324057045572535": ["convolution_gpu_bfyx_gemm_like",1], + "12990527753120735255": ["convolution_gpu_bfyx_gemm_like",0], + "4046830923427667342": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "7940369586324090841": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "2934519615045138808": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "18067291256808591467": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "3833510944499257797": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "18035673326929466074": ["convolution_gpu_bfyx_gemm_like",1], + "2273992727647793692": ["convolution_gpu_bfyx_gemm_like",1], + "677249604491773387": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13357365044448426880": ["convolution_gpu_bfyx_1x1",2], + "10090923790949378407": ["convolution_gpu_bfyx_gemm_like",1], + "6142707387281700290": ["convolution_gpu_bfyx_gemm_like",2], + "10486000767830001094": ["convolution_gpu_bfyx_1x1",2], + "14045927407431718832": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "17742192339816511494": ["convolution_gpu_bfyx_gemm_like",2], + "1152691534728260611": ["convolution_gpu_bfyx_1x1",2], + "6217542346826403576": ["convolution_gpu_bfyx_1x1",2], + "2242602888499888844": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "5680236635030250712": ["convolution_gpu_bfyx_1x1",2], + "5159738930501638535": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "4664983769199548480": ["convolution_gpu_bfyx_1x1",2], + "15675903059949404837": ["convolution_gpu_bfyx_1x1",2], + "18431306649860116380": ["convolution_gpu_bfyx_gemm_like",1], + "10894058425957901202": ["convolution_gpu_bfyx_1x1",2], + "15271783562528081169": ["convolution_gpu_bfyx_gemm_like",2], + "7581174843529024536": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "17084977396231597605": ["convolution_gpu_bfyx_gemm_like",1], + "8541982562061181756": ["convolution_gpu_bfyx_gemm_like",1], + "13735180250757239202": ["convolution_gpu_bfyx_gemm_like",2], + "7375461241315602473": ["convolution_gpu_bfyx_gemm_like",2], + "9062774198518904260": ["convolution_gpu_bfyx_gemm_like",2], + "8640150341228170279": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "10437367877444543776": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "9954050478761346921": ["convolution_gpu_bfyx_gemm_like",2], + "13960388312976163971": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "12510951219501865365": ["convolution_gpu_bfyx_os_iyx_osv16",221], + 
"13804221028705631415": ["convolution_gpu_bfyx_gemm_like",2], + "1208161922424418734": ["convolution_gpu_bfyx_os_iyx_osv16",987], + "14230493618724018658": ["convolution_gpu_bfyx_gemm_like",2], + "12339692995143159283": ["convolution_gpu_bfyx_gemm_like",2], + "17329287216741045059": ["convolution_gpu_bfyx_gemm_like",2], + "3499645386058307669": ["convolution_gpu_bfyx_gemm_like",1], + "16542318967217020315": ["convolution_gpu_bfyx_gemm_like",2], + "2421404763191415191": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8712136292276123857": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10576856554114055028": ["convolution_gpu_bfyx_gemm_like",2], + "3240102173773280414": ["convolution_gpu_bfyx_1x1",2], + "9099720270958987421": ["convolution_gpu_bfyx_1x1",2], + "11277866878590984477": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "17640725195881101275": ["convolution_gpu_bfyx_gemm_like",2], + "3563872903821081702": ["convolution_gpu_bfyx_gemm_like",1], + "8155268141318893606": ["convolution_gpu_bfyx_gemm_like",1], + "13038533272699602337": ["convolution_gpu_bfyx_gemm_like",2], + "8317673282128335201": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "1822096761703761792": ["convolution_gpu_bfyx_1x1",2], + "15031155621982459860": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "9601412379897937608": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "4533786844080178561": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3816674884393241704": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "5040095338370816349": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "11857037689248685487": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "16765994345605657100": ["convolution_gpu_bfyx_1x1",2], + "16958329690837977102": ["convolution_gpu_bfyx_gemm_like",2], + "875400109066360897": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "14098811155652990436": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "15363606233048272809": ["convolution_gpu_bfyx_1x1",2], + "3220280315905987373": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "15548847099740441551": ["convolution_gpu_bfyx_1x1",2], + "7056293586529818253": ["convolution_gpu_bfyx_gemm_like",2], + "1788455099959676873": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "8857763129101380288": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "12051595062513871723": ["convolution_gpu_bfyx_1x1",2], + "17025182465337728023": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2967481531952454828": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "4135068756462147853": ["convolution_gpu_bfyx_gemm_like",1], + "9692654253261175490": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "10837496380266058422": ["convolution_gpu_bfyx_gemm_like",2], + "17174919737114915467": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13025361884606488732": ["convolution_gpu_bfyx_gemm_like",2], + "1920042803083729276": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "2527276292172180386": ["convolution_gpu_bfyx_gemm_like",2], + "13006774775034887171": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "16587061389996963349": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "12071914115316550349": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "17318287523550546026": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2762489653422414995": ["convolution_gpu_bfyx_gemm_like",2], + "5955575949957198434": ["convolution_gpu_bfyx_gemm_like",1], + "12024817951074673335": ["convolution_gpu_bfyx_1x1",2], + "4865102850562917067": ["convolution_gpu_bfyx_os_iyx_osv16",478], + "16158139166784964096": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "6300691162962736560": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "632116056424249698": ["convolution_gpu_bfyx_gemm_like",1], + "12522495848240087966": ["convolution_gpu_bfyx_gemm_like",2], + "4754967381316623440": ["convolution_gpu_bfyx_gemm_like",2], + "7565867291827884997": ["convolution_gpu_bfyx_gemm_like",2], + "17891499682354369344": ["convolution_gpu_bfyx_gemm_like",2], + "2111669705686676421": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "17364712285968437405": ["convolution_gpu_bfyx_os_iyx_osv16",271], + "12700372241799686527": ["convolution_gpu_bfyx_gemm_like",2], + "11083993858285515074": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "12501619443242354860": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "18215430801133520364": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "7338229552985076723": ["convolution_gpu_bfyx_gemm_like",2], + "1458615259705605525": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "12068797674575015662": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "2832268621630415376": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "17961702508543961900": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "273242667845386507": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "12015336418727455195": ["convolution_gpu_bfyx_1x1",2], + "5600128039063009632": ["convolution_gpu_bfyx_gemm_like",1], + "9737565171095493297": ["convolution_gpu_bfyx_gemm_like",1], + "3350601287664242323": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "7005509036795164602": ["convolution_gpu_bfyx_1x1",2], + "16768797136991242472": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "11560634267092054110": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "16949056117405140365": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "2114599010013594942": ["convolution_gpu_bfyx_gemm_like",1], + "1230262279011217327": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2369451367723962073": ["convolution_gpu_bfyx_1x1",2], + "10899110544832584656": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "15412447128995361859": ["convolution_gpu_bfyx_gemm_like",0], + "12259844988981080505": ["convolution_gpu_bfyx_gemm_like",2], + "11120846960057008937": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "9580986168276580598": ["convolution_gpu_bfyx_gemm_like",2], + "3538679039078582272": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "3622409603053918029": ["convolution_gpu_bfyx_gemm_like",1], + "4672441137336208890": ["convolution_gpu_bfyx_gemm_like",2], + "3868149953087814447": ["convolution_gpu_bfyx_gemm_like",2], + "4571404165794634411": ["convolution_gpu_bfyx_1x1",2], + "4974320417566990034": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "12946531140050029900": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13565691057064774487": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "15106614232165315070": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "3774285301357006334": ["convolution_gpu_bfyx_gemm_like",2], + "2930898141522848681": ["convolution_gpu_bfyx_1x1",2], + "9144487908815767824": ["convolution_gpu_bfyx_1x1",2], + "6114241186364821679": ["convolution_gpu_bfyx_gemm_like",2], + "2878824076934639346": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "4773123925616969670": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10019470094545733255": ["convolution_gpu_bfyx_gemm_like",1], + "11359409533744011242": ["convolution_gpu_bfyx_gemm_like",2], + "9999425239167488495": ["convolution_gpu_bfyx_gemm_like",2], + "9525535670799618110": ["convolution_gpu_bfyx_os_iyx_osv16",270], + "17724604495865223459": ["convolution_gpu_bfyx_gemm_like",2], + "345043289576587800": 
["convolution_gpu_bfyx_1x1",2], + "14466032674083938714": ["convolution_gpu_bfyx_gemm_like",1], + "2438261005924916746": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "12024143207855886580": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "7322472892320910654": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "5738835498104275267": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "14956246091163580499": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "14808895254077106198": ["convolution_gpu_bfyx_gemm_like",2], + "8519354640245415816": ["convolution_gpu_bfyx_gemm_like",1], + "8761283252495354972": ["convolution_gpu_bfyx_gemm_like",2], + "15489746763312425915": ["convolution_gpu_bfyx_gemm_like",2], + "8002233052700666718": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "7264274394359484318": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "15161053469199826008": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "4325081100430903742": ["convolution_gpu_bfyx_gemm_like",2], + "5083163738120585821": ["fully_connected_gpu_fb_oi_ref",1], + "17490471699618303993": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "4387041763614917736": ["convolution_gpu_bfyx_gemm_like",1], + "3101087806792514129": ["convolution_gpu_bfyx_1x1",2], + "9043982883185435219": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "18094205332383644037": ["convolution_gpu_bfyx_os_iyx_osv16",183], + "973966345068677905": ["convolution_gpu_bfyx_1x1",2], + "14973431782875808802": ["convolution_gpu_bfyx_gemm_like",2], + "10280619408766255552": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "10171373375072694210": ["convolution_gpu_bfyx_1x1",2], + "3571959174116404960": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "12727541507197887360": ["convolution_gpu_bfyx_os_iyx_osv16",1023], + "10811837819834149164": ["convolution_gpu_bfyx_gemm_like",1], + "721174714308243785": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "2321148334382088982": ["convolution_gpu_bfyx_gemm_like",2], + "10100237101982273901": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "9803492989444302959": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "17889864541794448203": ["convolution_gpu_bfyx_1x1",2], + "6307939332939714967": ["convolution_gpu_bfyx_1x1",2], + "16888412539296862194": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "13621339501067135142": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "7119182041840303390": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4644580321919256401": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "3568514382399560386": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "1885075753696445410": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11845189428639322474": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "2173720698351153121": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "8690196189594920365": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "4444730303823507621": ["convolution_gpu_bfyx_gemm_like",0], + "1485662490111767875": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "10128390168715530898": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "18186615266760475767": ["convolution_gpu_bfyx_os_iyx_osv16",176], + "13161997040644039778": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "17021925795809437171": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10816637153861630723": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "11727227430687227444": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16293465561256937726": ["convolution_gpu_bfyx_gemm_like",2], + "9119618606914671839": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "7840653268996892538": 
["convolution_gpu_bfyx_gemm_like",2], + "12003323477818208825": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "9131235538209388787": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "8921636651939679647": ["convolution_gpu_bfyx_1x1",1], + "5592526760253524303": ["convolution_gpu_bfyx_os_iyx_osv16",48], + "3337625924046561031": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "9835739612255048978": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "7900926714874404219": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "883436333317162926": ["convolution_gpu_bfyx_1x1",2], + "12348602762263193288": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "1318571118468536310": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "15602218079503030465": ["convolution_gpu_bfyx_gemm_like",2], + "6195916781434462809": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15035800097152337587": ["convolution_gpu_bfyx_gemm_like",2], + "150132162949295379": ["convolution_gpu_bfyx_1x1",2], + "15488340031228619748": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "11999246609107242706": ["convolution_gpu_bfyx_gemm_like",2], + "1914964404168211864": ["convolution_gpu_bfyx_gemm_like",2], + "8398910340371320955": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7474592508575297101": ["convolution_gpu_bfyx_1x1",2], + "16487774205195979355": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "16027853590391209100": ["convolution_gpu_bfyx_gemm_like",2], + "13602140021189675477": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "18393312550272875456": ["convolution_gpu_bfyx_1x1",2], + "12352923639732112511": ["convolution_gpu_bfyx_os_iyx_osv16",79], + "742689192890486807": ["convolution_gpu_bfyx_gemm_like",2], + "11604794601689380990": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "5754396201681434378": ["convolution_gpu_bfyx_1x1",2], + "11241838709529552265": ["convolution_gpu_bfyx_gemm_like",2], + "7603319690872333930": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "15943141845766932879": ["convolution_gpu_bfyx_1x1",2], + "4933831571091731212": ["convolution_gpu_bfyx_gemm_like",1], + "3429844423226609965": ["convolution_gpu_bfyx_gemm_like",2], + "11807282628372660280": ["convolution_gpu_bfyx_1x1",2], + "15822546325822628634": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "4916769804113823482": ["convolution_gpu_bfyx_1x1",1], + "10626341369865893888": ["convolution_gpu_bfyx_gemm_like",2], + "6156831095718536092": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "952318454591754214": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "14147460733160099960": ["convolution_gpu_bfyx_gemm_like",1], + "14578867494693499627": ["convolution_gpu_bfyx_gemm_like",2], + "3541538046227217664": ["convolution_gpu_bfyx_gemm_like",1], + "5393510569127725391": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "17948637243158994878": ["convolution_gpu_bfyx_gemm_like",2], + "7465681710653503161": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "9452470718398027950": ["convolution_gpu_bfyx_os_iyx_osv16",928], + "17344974951998490453": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "14031009077471784948": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10771803503544737080": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "3854114166348568039": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "10292349730148518173": ["convolution_gpu_bfyx_os_iyx_osv16",694], + "12757611260347801001": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "17921973525603585874": ["convolution_gpu_bfyx_gemm_like",2], + "10292585962794261197": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "592245952014430043": 
["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "10706267011822108376": ["convolution_gpu_bfyx_1x1",2], + "9809458159478958866": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "11872943152839631823": ["convolution_gpu_bfyx_os_iyx_osv16",270], + "16364494883229084045": ["convolution_gpu_bfyx_os_iyx_osv16",1068], + "3266557807508325807": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "15924916465272239832": ["convolution_gpu_bfyx_gemm_like",2], + "10783630257421062891": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4759671642533786591": ["convolution_gpu_bfyx_gemm_like",2], + "1497560475414454618": ["convolution_gpu_bfyx_gemm_like",0], + "5374969798377773063": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "4079026972040047969": ["convolution_gpu_bfyx_gemm_like",2], + "9947449295659685973": ["convolution_gpu_bfyx_gemm_like",2], + "13710319251108632115": ["convolution_gpu_bfyx_1x1",2], + "4914435717288687793": ["convolution_gpu_bfyx_1x1",2], + "12309955719964788034": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "15078168059698267650": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "16117448559783537844": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "14614844213016502202": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "11031358859656806724": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "16386955278777720573": ["convolution_gpu_bfyx_os_iyx_osv16",477], + "18302892230881285207": ["convolution_gpu_bfyx_gemm_like",1], + "12514693341682532560": ["convolution_gpu_bfyx_os_iyx_osv16",152], + "9120377367517042357": ["convolution_gpu_bfyx_1x1",2], + "6318228858846223186": ["convolution_gpu_bfyx_1x1",2], + "9372916528346260712": ["convolution_gpu_bfyx_gemm_like",0], + "16425665058951535484": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "6911215749850066204": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "9942099207256025216": ["convolution_gpu_bfyx_gemm_like",1], + "14447191095937730964": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "5568753513029409478": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "9714508918051740792": ["convolution_gpu_bfyx_gemm_like",1], + "17370158297470557151": ["convolution_gpu_bfyx_1x1",2], + "17711453305763476458": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "7880845322716481548": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4264284648458489052": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "8540111719936129376": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "11007944497812650617": ["convolution_gpu_bfyx_gemm_like",2], + "9406763539724266157": ["convolution_gpu_bfyx_1x1",2], + "4290840152278060614": ["convolution_gpu_bfyx_gemm_like",2], + "8787438180071123604": ["convolution_gpu_bfyx_gemm_like",1], + "1704404203639481753": ["convolution_gpu_bfyx_gemm_like",2], + "2008424849669196225": ["convolution_gpu_bfyx_1x1",2], + "10471519687597963116": ["convolution_gpu_bfyx_gemm_like",1], + "15690161340392005765": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "13488495920546871271": ["convolution_gpu_bfyx_os_iyx_osv16",851], + "959260710517842876": ["convolution_gpu_bfyx_gemm_like",2], + "15078590909693331731": ["convolution_gpu_bfyx_gemm_like",2], + "10425889533411573166": ["convolution_gpu_bfyx_gemm_like",2], + "6928136130626403937": ["convolution_gpu_bfyx_gemm_like",2], + "3141773224039276177": ["convolution_gpu_bfyx_1x1",2], + "12494969618927201911": ["fully_connected_gpu_yxfb_ref",0], + "15117880293418979489": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "6964383468476265892": ["convolution_gpu_bfyx_1x1",2], + "16409729623371222748": 
["convolution_gpu_bfyx_os_iyx_osv16",983], + "7969441643457570812": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "938222258370511187": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "5922142661777925178": ["convolution_gpu_bfyx_gemm_like",2], + "12712071520541638451": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "994842991399671507": ["convolution_gpu_bfyx_gemm_like",1], + "17806712457019493207": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "10730222715353420212": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "5321698540631249776": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "1596353239542510685": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "4124478505694604763": ["convolution_gpu_bfyx_1x1",2], + "3398322619007806698": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "4429109491655891299": ["convolution_gpu_bfyx_gemm_like",1], + "6297802534570892679": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "11265472910579659280": ["convolution_gpu_bfyx_gemm_like",1], + "15678385128478075284": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5538883245745495145": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "8860443174052454332": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13575423234109624706": ["fully_connected_gpu_bs_f_bsv16_af8_vload",0], + "9533360488591027707": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "15809639778580769565": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "3374410641320310726": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "2920840796593281126": ["convolution_gpu_bfyx_gemm_like",2], + "16065744898134487748": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "3409043224171087168": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "14114380593731243715": ["convolution_gpu_bfyx_os_iyx_osv16",1038], + "1418595171949196661": ["convolution_gpu_bfyx_gemm_like",2], + "15187035463799513424": ["convolution_gpu_bfyx_1x1",2], + "12480527132372884168": ["convolution_gpu_bfyx_1x1",0], + "9152433123828445089": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1202292109713947702": ["convolution_gpu_bfyx_gemm_like",2], + "16772854836230971016": ["convolution_gpu_bfyx_os_iyx_osv16",674], + "13646974121952099172": ["convolution_gpu_bfyx_gemm_like",2], + "14424566003632608852": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1905758333157310570": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "15048584393463312977": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "18132952464279667664": ["convolution_gpu_bfyx_1x1",2], + "60267878504897170": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "8236018377815149638": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "863057075064640334": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "9305861997313663528": ["convolution_gpu_bfyx_gemm_like",1], + "13820498543284008286": ["convolution_gpu_bfyx_gemm_like",2], + "1351033666248868977": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "1867337342417952506": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "1497127399271219422": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "11942736969933408358": ["convolution_gpu_bfyx_gemm_like",1], + "18184621367843960190": ["convolution_gpu_bfyx_gemm_like",2], + "4734389463002799056": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "6733731409232284409": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "5911282942658469852": ["convolution_gpu_bfyx_gemm_like",1], + "3017891343734146267": ["convolution_gpu_bfyx_os_iyx_osv16",852], + "13092232276822302626": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "14123081378489325832": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "3154539627593235077": 
["convolution_gpu_bfyx_os_iyx_osv16",748], + "12801481303602178879": ["convolution_gpu_bfyx_gemm_like",1], + "579781312141502576": ["convolution_gpu_bfyx_1x1",2], + "5214654427283761256": ["convolution_gpu_bfyx_gemm_like",0], + "12625112690264223217": ["convolution_gpu_bfyx_gemm_like",2], + "9441060601228656341": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "7548031489690889629": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "14159596290442764023": ["convolution_gpu_bfyx_gemm_like",1], + "3070859615622845671": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "14054116974002669018": ["convolution_gpu_bfyx_1x1",1], + "14487842225000203929": ["convolution_gpu_bfyx_gemm_like",2], + "17585206779958265260": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "4819131094439732065": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "331661172067077796": ["convolution_gpu_bfyx_1x1",2], + "11988546375476924356": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "5195511638783481084": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "7692849839965441330": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "15859493313686060349": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "15726902746983125797": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "7274179284676568361": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "15493488989417521388": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "1008476023750261156": ["convolution_gpu_bfyx_1x1",2], + "10923480230259977438": ["convolution_gpu_bfyx_1x1",2], + "13425251102263428554": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "18103534417093702556": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "8036474422877454869": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7208008921815475393": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9616636708366808604": ["convolution_gpu_bfyx_gemm_like",2], + "1040650352205493707": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "18135307303959376082": ["convolution_gpu_bfyx_gemm_like",2], + "14082448162400225052": ["convolution_gpu_bfyx_1x1",2], + "11800783548769329949": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "10308431308942416781": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "8251544171504007740": ["convolution_gpu_bfyx_gemm_like",2], + "12177387334053203378": ["convolution_gpu_bfyx_gemm_like",2], + "2321767794934000238": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "13144385730409574259": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "11756650366229979428": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6931953332823066530": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "13124342334495538095": ["convolution_gpu_bfyx_gemm_like",2], + "8444259010311137762": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "7397341452130124383": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "13815395589135469450": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5840254078917931433": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15799159401545270696": ["convolution_gpu_bfyx_gemm_like",1], + "4435224497850514394": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "17636500109629107732": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "15961487889420208188": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "15530407024531326375": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "16770615142634470903": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "9101903304994333336": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "18275601715050791851": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "787203599734115483": ["convolution_gpu_bfyx_1x1",0], + "6856130385095139346": 
["convolution_gpu_bfyx_os_iyx_osv16",748], + "16767392067294252396": ["convolution_gpu_bfyx_gemm_like",2], + "4903592553439092472": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "13369603621524676979": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9468684953949274635": ["convolution_gpu_bfyx_gemm_like",0], + "16698547937652264447": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "4118073384938355655": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "5308128387928804050": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "11055049031355432623": ["convolution_gpu_bfyx_gemm_like",2], + "10682300249493137042": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "13781423818051299677": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "17224104246148265328": ["convolution_gpu_bfyx_gemm_like",2], + "3308770992373192529": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "7650375560336513366": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "8543619733732987550": ["convolution_gpu_bfyx_gemm_like",1], + "15223164574152266895": ["convolution_gpu_bfyx_1x1",2], + "9195732599757736182": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "9056038338958199256": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "16667887002111125871": ["convolution_gpu_bfyx_gemm_like",2], + "16717713360264747483": ["convolution_gpu_bfyx_gemm_like",2], + "13738760763969959522": ["convolution_gpu_bfyx_gemm_like",1], + "9205978149692979955": ["convolution_gpu_bfyx_gemm_like",2], + "14362876471450307424": ["convolution_gpu_bfyx_1x1",2], + "7868973874302246233": ["convolution_gpu_bfyx_gemm_like",2], + "142329025839464842": ["convolution_gpu_bfyx_1x1",2], + "12253049204822930675": ["convolution_gpu_bfyx_gemm_like",2], + "5349415632630235233": ["convolution_gpu_bfyx_1x1",2], + "2727175120437582536": ["convolution_gpu_bfyx_gemm_like",1], + "631489011812924153": ["convolution_gpu_bfyx_1x1",2], + "14331658870024759698": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "10808909442136736629": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "16081386644309102158": ["convolution_gpu_bfyx_gemm_like",2], + "5044721291675005144": ["convolution_gpu_bfyx_1x1",2], + "15914107501176673997": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "9314293064351558241": ["convolution_gpu_bfyx_gemm_like",2], + "9299299311101549958": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "142270860894725256": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "13483088320871913126": ["convolution_gpu_bfyx_gemm_like",2], + "3699344686791530101": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9767294641786972359": ["convolution_gpu_bfyx_gemm_like",2], + "4104562704039821482": ["convolution_gpu_bfyx_1x1",2], + "2204178900998688268": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13833960927635646899": ["convolution_gpu_bfyx_gemm_like",1], + "1082586642383386489": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "73865742350616903": ["convolution_gpu_bfyx_gemm_like",1], + "11451740938287179908": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "13893808009363736870": ["convolution_gpu_bfyx_gemm_like",0], + "5364060938737428149": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "8532217744217419503": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "1982176363226079588": ["convolution_gpu_bfyx_gemm_like",2], + "12641170321047008726": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "7669403041163460089": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "13734043898517059207": ["convolution_gpu_bfyx_gemm_like",2], + "6817494598328071314": ["convolution_gpu_bfyx_gemm_like",2], + "13800760323805415740": 
["convolution_gpu_bfyx_gemm_like",2], + "7585184325339753737": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "9243949750444156746": ["convolution_gpu_bfyx_gemm_like",1], + "9545968464906009869": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16748662918272106932": ["convolution_gpu_bfyx_gemm_like",1], + "11828175723996627443": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "1390379098099686972": ["convolution_gpu_bfyx_1x1",2], + "5339985303398206057": ["convolution_gpu_bfyx_os_iyx_osv16",179], + "669771152920944125": ["convolution_gpu_bfyx_gemm_like",2], + "6673966852801136416": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "70580716590540876": ["convolution_gpu_bfyx_gemm_like",1], + "9429695343610239088": ["convolution_gpu_bfyx_os_iyx_osv16",152], + "15201438563802430490": ["fully_connected_gpu_fb_oi_ref",2], + "9488453013746383896": ["convolution_gpu_bfyx_gemm_like",0], + "314054598858070952": ["convolution_gpu_bfyx_gemm_like",2], + "6203765709597125063": ["convolution_gpu_bfyx_gemm_like",2], + "7060804814325505165": ["convolution_gpu_bfyx_gemm_like",2], + "2581414750854621875": ["convolution_gpu_bfyx_gemm_like",2], + "16882092367103683293": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16911464046178654033": ["convolution_gpu_bfyx_1x1",2], + "16341722570340169855": ["convolution_gpu_bfyx_1x1",2], + "17791024851737594885": ["convolution_gpu_bfyx_1x1",2], + "4190912926126844643": ["convolution_gpu_bfyx_1x1",2], + "17037416417174266088": ["convolution_gpu_bfyx_gemm_like",0], + "4880150897829846031": ["convolution_gpu_bfyx_1x1",2], + "3976736548270395981": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "7235358742317442134": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "3105425187506203551": ["convolution_gpu_bfyx_1x1",2], + "9111988592015450418": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "3806761527342944195": ["convolution_gpu_bfyx_gemm_like",2], + "8490260671996115530": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "14122213471825630433": ["convolution_gpu_bfyx_gemm_like",1], + "11305232900158601613": ["convolution_gpu_bfyx_1x1",2], + "6603778920476932267": ["convolution_gpu_bfyx_gemm_like",1], + "11640225461345567929": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "9480653639044390919": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7349880498513046830": ["convolution_gpu_bfyx_1x1",2], + "17037462814585846902": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "13011676362747785816": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16566214123371867456": ["convolution_gpu_bfyx_gemm_like",2], + "10218763091060511457": ["convolution_gpu_bfyx_os_iyx_osv16",853], + "1074748462756364699": ["fully_connected_gpu_fb_oi_ref",1], + "11418379777288974452": ["convolution_gpu_bfyx_gemm_like",2], + "2783577080556699089": ["convolution_gpu_bfyx_gemm_like",1], + "8913823292181409151": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "16075006181495932250": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13468081302022888489": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "12473600360154597915": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "2920322372993101148": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8922929126299811091": ["convolution_gpu_bfyx_1x1",2], + "15814015810740458605": ["convolution_gpu_bfyx_1x1",2], + "9869959062341950047": ["convolution_gpu_bfyx_1x1",2], + "2294800960010879540": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "13485300684443803732": ["convolution_gpu_bfyx_os_iyx_osv16",985], + "15991460001131903561": ["convolution_gpu_bfyx_gemm_like",2], + "6648876837655776653": 
["convolution_gpu_bfyx_1x1",2], + "17281826959243966826": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "135072053401934228": ["convolution_gpu_bfyx_1x1",2], + "2477849395789783501": ["convolution_gpu_bfyx_gemm_like",2], + "17109520309574369561": ["convolution_gpu_bfyx_gemm_like",2], + "10128120599276549920": ["convolution_gpu_bfyx_1x1",2], + "8431759922045602848": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10751536136794650334": ["convolution_gpu_bfyx_gemm_like",2], + "14502856487639608696": ["convolution_gpu_bfyx_gemm_like",2], + "4959403414256988744": ["convolution_gpu_bfyx_gemm_like",0], + "6423785822515265784": ["convolution_gpu_bfyx_gemm_like",2], + "826850797666395121": ["convolution_gpu_bfyx_gemm_like",1], + "8700574100180128776": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12068974703657294908": ["convolution_gpu_bfyx_1x1",2], + "1122856374602590533": ["convolution_gpu_bfyx_1x1",2], + "13308187548669026714": ["convolution_gpu_bfyx_1x1",2], + "15192230303376521834": ["convolution_gpu_bfyx_os_iyx_osv16",846], + "11955992313739654625": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "15188570678726970998": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "4718716595177056289": ["convolution_gpu_bfyx_os_iyx_osv16",234], + "2728938624042183713": ["convolution_gpu_bfyx_gemm_like",2], + "15190508870639648203": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6398819277350155011": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "8170998059688907013": ["convolution_gpu_bfyx_1x1",2], + "17377315194963069204": ["fully_connected_gpu_fb_oi_ref",0], + "7974670633697926450": ["convolution_gpu_bfyx_1x1",2], + "6193161166790398003": ["convolution_gpu_bfyx_gemm_like",2], + "15773157615731010456": ["convolution_gpu_bfyx_gemm_like",2], + "4772696293208603817": ["convolution_gpu_bfyx_gemm_like",1], + "9423958333298993923": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "15829095120243431195": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "856877003890134554": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "778476198101178556": ["convolution_gpu_bfyx_gemm_like",1], + "11130439225010714550": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "9794456440994218671": ["convolution_gpu_bfyx_os_iyx_osv16",264], + "1563987925712579649": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15178921033274918199": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10879218241103462088": ["convolution_gpu_bfyx_gemm_like",1], + "852015206582470545": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "3779229442395464456": ["convolution_gpu_bfyx_gemm_like",1], + "16393176054374397767": ["convolution_gpu_bfyx_gemm_like",0], + "9193880745263317167": ["convolution_gpu_bfyx_gemm_like",2], + "4436244774193918646": ["fully_connected_gpu_fb_oi_ref",1], + "16789245987103323406": ["convolution_gpu_bfyx_gemm_like",2], + "12076058470574246054": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "8505040075968411726": ["convolution_gpu_bfyx_gemm_like",1], + "18122858611264877646": ["convolution_gpu_bfyx_gemm_like",1], + "7715649642603303319": ["convolution_gpu_bfyx_1x1",2], + "4138968242532400395": ["convolution_gpu_bfyx_gemm_like",1], + "9939234037869927090": ["convolution_gpu_bfyx_os_iyx_osv16",514], + "6104380778870471127": ["convolution_gpu_bfyx_1x1",2], + "9383182168277796969": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "6584960721513702502": ["convolution_gpu_bfyx_gemm_like",1], + "12947341728489226671": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10130171279527667782": ["convolution_gpu_bfyx_gemm_like",2], + "10861769381993948050": 
["convolution_gpu_bfyx_os_iyx_osv16",1080], + "17726079670612220433": ["convolution_gpu_bfyx_gemm_like",0], + "10785966734346479177": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "18416908414174464784": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "10316451248440741901": ["convolution_gpu_bfyx_gemm_like",2], + "2817919813339364130": ["convolution_gpu_bfyx_gemm_like",1], + "13330734840729670622": ["convolution_gpu_bfyx_gemm_like",0], + "17216583849049249733": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "17907223570737272640": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "9530116228032101908": ["convolution_gpu_bfyx_1x1",2], + "3603187029740446600": ["convolution_gpu_bfyx_gemm_like",2], + "5758133252959371492": ["convolution_gpu_bfyx_gemm_like",2], + "875296362957469305": ["convolution_gpu_bfyx_gemm_like",1], + "537074122417021898": ["convolution_gpu_bfyx_gemm_like",2], + "3202085450628781999": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "15231987838322151865": ["convolution_gpu_bfyx_1x1",2], + "6988492019664525206": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "7107677063657303327": ["convolution_gpu_bfyx_1x1",2], + "18221867262301937903": ["convolution_gpu_bfyx_1x1",2], + "5269172622193124300": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "2921118493468368908": ["convolution_gpu_bfyx_gemm_like",1], + "3501882025888946886": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "8792010676469476740": ["convolution_gpu_bfyx_gemm_like",2], + "15879172437519876393": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7171904645566467208": ["convolution_gpu_bfyx_gemm_like",2], + "16522364268583242080": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "11669828823444745889": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "16293101831324587788": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12868739680413736657": ["convolution_gpu_bfyx_os_iyx_osv16",970], + "3221469860582147955": ["convolution_gpu_bfyx_gemm_like",2], + "16395067736440127496": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "1251525426317284548": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "5941092474669713339": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "4465701487417893814": ["convolution_gpu_bfyx_gemm_like",1], + "7800015766976654402": ["convolution_gpu_bfyx_gemm_like",0], + "1752185056297124917": ["convolution_gpu_bfyx_1x1",1], + "13486084204140096478": ["convolution_gpu_bfyx_gemm_like",1], + "9502195532658935521": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14680730265621679042": ["convolution_gpu_bfyx_os_iyx_osv16",758], + "11198301748997371475": ["convolution_gpu_bfyx_gemm_like",1], + "7393601059996816014": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "16610284927818475574": ["convolution_gpu_bfyx_gemm_like",2], + "3573490922300056520": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "16944335478353845609": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "6548949901446632697": ["convolution_gpu_bfyx_1x1",2], + "13856271274572142709": ["convolution_gpu_bfyx_gemm_like",1], + "13526488884846845330": ["convolution_gpu_bfyx_gemm_like",2], + "16616945998593626851": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4403753181729432604": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "3820661057776133570": ["convolution_gpu_bfyx_1x1",2], + "14985236276429954162": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8096131027165540886": ["convolution_gpu_bfyx_gemm_like",2], + "9232653317479846765": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "5042176052323856983": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + 
"5303970743736042689": ["convolution_gpu_bfyx_gemm_like",2], + "1984152634309440563": ["convolution_gpu_bfyx_gemm_like",2], + "6577505360421510286": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "14025235562200209723": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "10722782762733112118": ["convolution_gpu_bfyx_1x1",2], + "3116068331849795558": ["convolution_gpu_bfyx_gemm_like",2], + "60509335250891515": ["convolution_gpu_bfyx_gemm_like",2], + "14389719202147508599": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "10170577772376890221": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "11020315012951440351": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "8767817856303586064": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "17219920118109316867": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "10893432143734884603": ["convolution_gpu_bfyx_gemm_like",2], + "4353842547963164546": ["convolution_gpu_bfyx_1x1",2], + "10090036431487700311": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "9328223957245552723": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "2173867324489962689": ["convolution_gpu_bfyx_gemm_like",1], + "2912098199463107173": ["convolution_gpu_bfyx_1x1",2], + "7232326270078161768": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "14805540705424073865": ["convolution_gpu_bfyx_gemm_like",2], + "13590444711975157776": ["convolution_gpu_bfyx_gemm_like",1], + "543472136359161929": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1044978617045366709": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "11337525286386930242": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "5440983284868981549": ["convolution_gpu_bfyx_gemm_like",2], + "10724501418439612080": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "14795618530175274538": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "8792202318168046223": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4056979460327024961": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "3477539135137665170": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "938848188161536107": ["convolution_gpu_bfyx_1x1",0], + "4455369117448405874": ["convolution_gpu_bfyx_1x1",2], + "16871004845988227014": ["convolution_gpu_bfyx_1x1",2], + "18235209540858013173": ["convolution_gpu_bfyx_1x1",2], + "6428098122005804378": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "11834361584875491425": ["convolution_gpu_bfyx_1x1",1], + "7082007579524697455": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "3522383297921565178": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "10135458965276110244": ["convolution_gpu_bfyx_1x1",2], + "10544034939133448916": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "5291011077679733990": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "7056030150365552588": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "16235115911229280717": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "14026537760442360645": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "101401523793806394": ["convolution_gpu_bfyx_gemm_like",2], + "14204609663091442879": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "13902214851539825156": ["convolution_gpu_bfyx_gemm_like",0], + "16361932270527364507": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10055549084854766170": ["convolution_gpu_bfyx_os_iyx_osv16",970], + "10151922632636937118": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9891428775774615719": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "14206076551739831333": ["convolution_gpu_bfyx_gemm_like",1], + "818998169319147148": ["convolution_gpu_bfyx_gemm_like",1], + "5582450255753679095": ["convolution_gpu_bfyx_1x1",2], + 
"16139615240471264488": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "15003778740401601065": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17039993918927377002": ["convolution_gpu_bfyx_os_iyx_osv16",429], + "2362092095402043749": ["convolution_gpu_bfyx_gemm_like",1], + "10991423760161409883": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "3860667078458481972": ["convolution_gpu_bfyx_gemm_like",1], + "4849343880559509889": ["convolution_gpu_bfyx_1x1",2], + "14668725050395069435": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "4523064418696274869": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "15728009639807698634": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "2968439898708528834": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "6713985030102340818": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "7369903937189508744": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "6509758095668864050": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "8859895010324601937": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "12782932626966309185": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "13839116996827687373": ["convolution_gpu_bfyx_gemm_like",1], + "1287490919205560806": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "528295119724008711": ["convolution_gpu_bfyx_os_iyx_osv16",430], + "15209909241815414156": ["convolution_gpu_bfyx_os_iyx_osv16",552], + "8561261337239934159": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "7585785802379042424": ["convolution_gpu_bfyx_1x1",2], + "4848143712599565301": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "8270591002934311024": ["convolution_gpu_bfyx_1x1",2], + "12995903177757437362": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "5963901433137582265": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7602222004475424358": ["convolution_gpu_bfyx_gemm_like",1], + "4228437925117070319": ["convolution_gpu_bfyx_1x1",2], + "2856601829807186494": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16000753982895054944": ["convolution_gpu_bfyx_gemm_like",1], + "16934879647229234163": ["convolution_gpu_bfyx_gemm_like",2], + "11724225282274130518": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "4238885454989272754": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "14999920879568237166": ["convolution_gpu_bfyx_1x1",2], + "6334639534663495263": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "6863331059471727622": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "13448845356783404653": ["convolution_gpu_bfyx_gemm_like",1], + "12946540633035976364": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "17764033613416389758": ["convolution_gpu_bfyx_gemm_like",1], + "1192279884248226739": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "10883992248631603006": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "15959543980008442942": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "3873183249402084406": ["convolution_gpu_bfyx_gemm_like",0], + "13809898858049445969": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "13434576226708227155": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "14885031472057965707": ["convolution_gpu_bfyx_direct_10_12_16",2], + "216603198215625772": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "7532088618116521936": ["convolution_gpu_bfyx_gemm_like",2], + "11666226259183201584": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7075659071934895087": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "15757308772667178999": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17053671692908867872": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "7649413902932043811": ["convolution_gpu_bfyx_gemm_like",2], + 
"12421204749289937399": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "2041212737963974230": ["convolution_gpu_bfyx_gemm_like",2], + "3950738240651133849": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "9999553425206328238": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "17515847111676784130": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "8130920994920685157": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7755177205197405275": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "17228810554159747400": ["convolution_gpu_bfyx_gemm_like",2], + "3503893875515897267": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "2294318010381635693": ["convolution_gpu_bfyx_gemm_like",1], + "6843617687528352801": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "4856470441452830056": ["convolution_gpu_bfyx_gemm_like",2], + "15968821946892330559": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "11450378244355788918": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "13145474177271090694": ["convolution_gpu_bfyx_gemm_like",2], + "15967614281807823696": ["convolution_gpu_bfyx_gemm_like",2], + "13474805373264874144": ["convolution_gpu_bfyx_1x1",2], + "7372956570616880244": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "15839295895890205274": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "2373860353284525265": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "17917978116807564183": ["convolution_gpu_bfyx_gemm_like",0], + "1902656726461670148": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "13512863534076172940": ["convolution_gpu_bfyx_gemm_like",2], + "11164519756679631743": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1551596771935253711": ["convolution_gpu_bfyx_gemm_like",1], + "2066731703492755469": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "3190494353583341446": ["convolution_gpu_bfyx_gemm_like",1], + "7243917162812988891": ["convolution_gpu_bfyx_gemm_like",2], + "16574710115918192418": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "11147573971701279689": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "4491380839102267034": ["convolution_gpu_bfyx_gemm_like",1], + "6651389480007764007": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "13296242326766100583": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "16084700435355748612": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "2866656294663853474": ["convolution_gpu_bfyx_1x1",2], + "10308113903347312964": ["convolution_gpu_bfyx_gemm_like",2], + "1711220333751274603": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "6484375582324852109": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "14001406016806064079": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "6101196122606108273": ["convolution_gpu_bfyx_gemm_like",1], + "7314288062932060863": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "15112599407339712681": ["convolution_gpu_bfyx_1x1",2], + "12489973984967168447": ["convolution_gpu_bfyx_1x1",2], + "12790570304622911607": ["convolution_gpu_bfyx_os_iyx_osv16",928], + "7875724726741958520": ["fully_connected_gpu_fb_oi_ref",0], + "1353170363915443814": ["convolution_gpu_bfyx_direct_10_12_16",2], + "745009493367761775": ["convolution_gpu_bfyx_gemm_like",2], + "3499106702307464480": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "3056212889689424946": ["convolution_gpu_bfyx_1x1",2], + "13251091004269229867": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4362304842016958728": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17823133607491820214": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "16911450336605071390": ["convolution_gpu_bfyx_1x1",2], + "1520529227443340435": ["convolution_gpu_bfyx_gemm_like",2], + 
"9500850790449116723": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "15428591250165788477": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "15727611564408173858": ["convolution_gpu_bfyx_gemm_like",2], + "9671459469252116568": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "17011363406405852347": ["convolution_gpu_bfyx_gemm_like",2], + "6669808855737023569": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "182115051096556835": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "6328802691680458752": ["convolution_gpu_bfyx_gemm_like",2], + "12590922530749026871": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "17512961503976896701": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "4804533178560338520": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "8108933468437926367": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10187930930336324253": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "16988275131627316108": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "3399406641489305996": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "9918371346247634545": ["convolution_gpu_bfyx_gemm_like",2], + "10486348549691280032": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "348058686961206025": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "18427056032084727710": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "3316798708399098230": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "760687670112194844": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "5003718302026277632": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "9702618600245321109": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "8803037667261582905": ["convolution_gpu_bfyx_gemm_like",2], + "11931568365395665142": ["convolution_gpu_bfyx_gemm_like",2], + "1120455113299469776": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "10765280349477640969": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3286330985102373533": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "4815047491742617397": ["convolution_gpu_bfyx_gemm_like",2], + "16247399911710810038": ["convolution_gpu_bfyx_gemm_like",1], + "2863465257341735941": ["convolution_gpu_bfyx_1x1",2], + "8527193566719173253": ["convolution_gpu_bfyx_gemm_like",2], + "5074273865983613482": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "1930929857644673460": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "7624476043779763605": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "2814805887448339818": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "226601879759378771": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "998876398773540321": ["convolution_gpu_bfyx_1x1",2], + "5926747396493954633": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13454265023861566476": ["convolution_gpu_bfyx_gemm_like",2], + "5008350851224686853": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "1425953627379976115": ["convolution_gpu_bfyx_gemm_like",1], + "17634966178519099371": ["convolution_gpu_bfyx_1x1",2], + "9485825829394109934": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "15486917753097743853": ["convolution_gpu_bfyx_1x1",2], + "15739278428190392018": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "10480527638577674825": ["convolution_gpu_bfyx_1x1",2], + "467975197394411990": ["convolution_gpu_bfyx_gemm_like",1], + "3438296636411972401": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1045854873741563331": ["convolution_gpu_bfyx_gemm_like",2], + "17408275657360833363": ["convolution_gpu_bfyx_1x1",2], + "10709828018763273371": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "12794369485239257709": ["convolution_gpu_bfyx_gemm_like",2], + "17147293671640396193": 
["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "10546430708947911124": ["convolution_gpu_bfyx_gemm_like",2], + "7878605163588288309": ["convolution_gpu_bfyx_os_iyx_osv16",89], + "10512507780534402341": ["convolution_gpu_bfyx_os_iyx_osv16",271], + "3814584042139408454": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "16723478941106779069": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "12391792381149655331": ["convolution_gpu_bfyx_gemm_like",2], + "13954821927253849036": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "490931535580183607": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "14835309921389262864": ["convolution_gpu_bfyx_1x1",2], + "14343008518525689150": ["convolution_gpu_bfyx_1x1",2], + "14289082888174784976": ["convolution_gpu_bfyx_gemm_like",2], + "13247725847475539658": ["convolution_gpu_bfyx_1x1",2], + "16238415425814188039": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "6294240435687565243": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "3534971503826416049": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "2613462626256090659": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "5648658688155716974": ["convolution_gpu_bfyx_1x1",2], + "8984436655107983227": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "2777318471329665162": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "7799984350284425885": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "4635570915184713874": ["convolution_gpu_bfyx_gemm_like",0], + "4531222427159927606": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "1644335606100150388": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "621915374938805401": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "17015328096102652908": ["convolution_gpu_bfyx_gemm_like",1], + "7780140599533242850": ["convolution_gpu_bfyx_gemm_like",1], + "15800447082078291243": ["convolution_gpu_bfyx_os_iyx_osv16",665], + "12467673564660108244": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "7201521533301617290": ["convolution_gpu_bfyx_gemm_like",1], + "10509933181132310969": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3391032227732782982": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "17908444616754154471": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5898740235388207878": ["convolution_gpu_bfyx_1x1",2], + "4466647043226271996": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "13938466156916423478": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "9340159617983543624": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "11499219760597131534": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "15529757761327002288": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "12932635875905153141": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "15109847707903824859": ["convolution_gpu_bfyx_1x1",2], + "17525564757769958678": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "10883341041912056319": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "1245259979364728404": ["convolution_gpu_bfyx_1x1",2], + "7869779894480025247": ["convolution_gpu_bfyx_gemm_like",2], + "14034525799882831106": ["convolution_gpu_bfyx_gemm_like",2], + "12564687330941036772": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "9728611486592854529": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5118467701668427545": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "9987415314864002460": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "13585163747565192884": ["convolution_gpu_bfyx_gemm_like",2], + "3385797925880519845": ["convolution_gpu_bfyx_1x1",2], + "14985755375924972050": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "3106922888635965020": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "14126906427006602775": ["convolution_gpu_bfyx_1x1",2], + "4914474312076193952": ["convolution_gpu_bfyx_gemm_like",1], + "8203171222962341018": ["convolution_gpu_bfyx_gemm_like",2], + "12871555773123368130": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "7407975398526425554": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "16511749893955141055": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "739676584505475609": ["convolution_gpu_bfyx_gemm_like",2], + "5498839261395459224": ["convolution_gpu_bfyx_gemm_like",1], + "2567046336192437734": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "7139714914586273766": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "10014448860206587805": ["convolution_gpu_bfyx_gemm_like",2], + "17951403431757222177": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "10330180429524641331": ["convolution_gpu_bfyx_gemm_like",2], + "15586047342916704364": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "2770397466252831892": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "11795826875463204296": ["convolution_gpu_bfyx_1x1",2], + "14540578324750869319": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15989894214714907271": ["convolution_gpu_bfyx_gemm_like",2], + "474139120607442270": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15589007878875898942": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "2826762745628486040": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "7132328255408635227": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "3441335188113424896": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "11806402239500046867": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "8439950151963452285": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "10432365444137108781": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "13121297281694293907": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "7708321360699824256": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13477548641580029772": ["convolution_gpu_bfyx_gemm_like",1], + "13961773444580398856": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10178145641713631806": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8463615810239412362": ["convolution_gpu_bfyx_1x1",2], + "1270307036687208396": ["convolution_gpu_bfyx_gemm_like",1], + "14813178380338948912": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "16567638487719493784": ["convolution_gpu_bfyx_os_iyx_osv16",609], + "18357544235608006954": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "17444003685761357480": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "586947787345351152": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7683334381958571864": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "15193403354218116460": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8931169575495985034": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "12166852830214895457": ["convolution_gpu_bfyx_1x1",2], + "8907982643256296667": ["convolution_gpu_bfyx_1x1",2], + "9920155432685318259": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "12279771749366327372": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "6997971129340865650": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "2116913943188857359": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "14248239982355212178": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "12937333118472722002": ["convolution_gpu_bfyx_gemm_like",2], + "3224352307778512793": ["convolution_gpu_bfyx_gemm_like",1], + "4378422094110940766": ["convolution_gpu_bfyx_gemm_like",1], + "2649192407401044065": ["convolution_gpu_bfyx_gemm_like",1], + "12864204111424196179": 
["convolution_gpu_bfyx_1x1",2], + "13760645810144930270": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16681690088928624738": ["convolution_gpu_bfyx_gemm_like",2], + "2149582237161177965": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "16677044352793659175": ["convolution_gpu_bfyx_gemm_like",2], + "4708035980731751007": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "2715447739580688669": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "15235409162483701027": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "6557428245898292304": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "13364676690016875118": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "5056859994174498686": ["convolution_gpu_bfyx_gemm_like",2], + "5834245904292669645": ["convolution_gpu_bfyx_os_iyx_osv16",179], + "12165079289914715018": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "9955939178447682108": ["convolution_gpu_bfyx_1x1",2], + "11705756153433897198": ["convolution_gpu_bfyx_1x1",2], + "12806934028210472719": ["convolution_gpu_bfyx_gemm_like",2], + "10681768474583067517": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "597073780328219388": ["convolution_gpu_bfyx_gemm_like",2], + "11104393974242049153": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "13609660900720370993": ["convolution_gpu_bfyx_1x1",2], + "16432425079146486467": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "12004552919019936392": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "4084026445911476156": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "5657471280535146301": ["convolution_gpu_bfyx_gemm_like",1], + "13754540732991287617": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "11744368351982723504": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "2668729552208169959": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15838113905712517735": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16666792471632326054": ["convolution_gpu_bfyx_gemm_like",2], + "10387844339156517393": ["convolution_gpu_bfyx_1x1",2], + "2984726467649419856": ["convolution_gpu_bfyx_gemm_like",2], + "14799579913711096584": ["convolution_gpu_bfyx_gemm_like",2], + "3515437649977762166": ["convolution_gpu_bfyx_gemm_like",1], + "2188101366183302888": ["convolution_gpu_bfyx_gemm_like",1], + "17092525789052598917": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "3725013268198063198": ["convolution_gpu_bfyx_1x1",2], + "10544411879329675593": ["convolution_gpu_bfyx_os_iyx_osv16",765], + "3635446784873718932": ["convolution_gpu_bfyx_gemm_like",2], + "15052577143485630617": ["convolution_gpu_bfyx_1x1",1], + "87031578643428011": ["convolution_gpu_bfyx_1x1",2], + "14616969385577243225": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "9280431727790048190": ["convolution_gpu_bfyx_1x1",2], + "9277610800970567810": ["convolution_gpu_bfyx_gemm_like",1], + "16105073808368936420": ["convolution_gpu_bfyx_gemm_like",2], + "4121109463284708890": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "15796677813117622429": ["convolution_gpu_bfyx_gemm_like",2], + "18436249934780056991": ["convolution_gpu_bfyx_gemm_like",2], + "5965451243366505522": ["convolution_gpu_bfyx_gemm_like",1], + "1170380397764345558": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "15805087418686802636": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "12058759356433220258": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "693883892843558363": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "6800893510381991731": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "8747430148550634190": ["convolution_gpu_bfyx_gemm_like",2], + "13170441257780067955": 
["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17207560805775399864": ["convolution_gpu_bfyx_gemm_like",1], + "9584652777232392944": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "13338594271376045657": ["convolution_gpu_bfyx_gemm_like",2], + "10128143628088846123": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "13403161389559730": ["convolution_gpu_bfyx_gemm_like",2], + "7223801044761006523": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8837721075413149240": ["convolution_gpu_bfyx_gemm_like",1], + "12741457056869452536": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1900375942069325499": ["convolution_gpu_bfyx_1x1",2], + "8609939102588915855": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "10914921540144371519": ["convolution_gpu_bfyx_gemm_like",1], + "7242013296950669829": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "403634422724914329": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "16781187505186394353": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12725675221990905186": ["convolution_gpu_bfyx_gemm_like",2], + "15555083739490354527": ["convolution_gpu_bfyx_gemm_like",2], + "11110173861174257158": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7330202944390548890": ["convolution_gpu_bfyx_gemm_like",2], + "951747146164097188": ["convolution_gpu_bfyx_1x1",2], + "16728762255357411770": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "18253784177599134876": ["convolution_gpu_bfyx_os_iyx_osv16",562], + "6845814820599174031": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "15225354446874994535": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "15295951849706930711": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "4450409744922989123": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "5184121466994451498": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "12055647521556218046": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "3863816884636503247": ["convolution_gpu_bfyx_gemm_like",1], + "10415046594066474634": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2609454334520044465": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "12065769091972094756": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "16441830491664937048": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "1126499865206906037": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "743941460026466526": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "10722677916294015259": ["convolution_gpu_bfyx_gemm_like",2], + "15249442550355454201": ["convolution_gpu_bfyx_gemm_like",2], + "5740745357953479527": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13993548620104010490": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "138379779469699309": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9657324846330221372": ["convolution_gpu_bfyx_1x1",2], + "5497751772699578150": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10071449674652717890": ["convolution_gpu_bfyx_gemm_like",2], + "12936220888307335332": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "11910735867274493498": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "2439993891369206440": ["convolution_gpu_bfyx_1x1",2], + "13302687772426736346": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "14283458015244508428": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "3011188207492335920": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14764715930784496165": ["convolution_gpu_bfyx_gemm_like",2], + "6650607472019166205": ["convolution_gpu_bfyx_1x1",2], + "17647962002015093887": ["convolution_gpu_bfyx_gemm_like",2], + "17381516856910544374": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "11988285441493553006": 
["convolution_gpu_bfyx_gemm_like",2], + "4897448054295474302": ["convolution_gpu_bfyx_gemm_like",2], + "15059549186302099880": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "11158789938857558596": ["convolution_gpu_bfyx_1x1",2], + "18008552719153887303": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "11330591026581463934": ["convolution_gpu_bfyx_gemm_like",2], + "7916244303189113815": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4890043345392707202": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "9813748068195103720": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "17746215841755337461": ["convolution_gpu_bfyx_direct_10_12_16",2], + "438528596970898721": ["convolution_gpu_bfyx_gemm_like",2], + "9131183544020825260": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "12229574562535756991": ["convolution_gpu_bfyx_gemm_like",1], + "11975047184326016230": ["convolution_gpu_bfyx_gemm_like",2], + "15677717057398875599": ["convolution_gpu_bfyx_gemm_like",2], + "12512751736409465214": ["convolution_gpu_bfyx_gemm_like",1], + "708747442142592697": ["convolution_gpu_bfyx_gemm_like",2], + "2161052921317193579": ["convolution_gpu_bfyx_gemm_like",2], + "8409488188696700816": ["convolution_gpu_bfyx_gemm_like",2], + "9700808806849459216": ["convolution_gpu_bfyx_1x1",2], + "14263790627243107300": ["convolution_gpu_bfyx_gemm_like",1], + "3383222668132648804": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "2575631797904040925": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "6143200133853000387": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "11872464450773754851": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "16436006771518788093": ["fully_connected_gpu_bs_f_bsv16_af8_vload",0], + "9101334153142718004": ["convolution_gpu_bfyx_gemm_like",2], + "5145853681977610916": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "10570285542015420072": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "15216108478837665623": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "5374664689223295796": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "768720470104458759": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "4850497746076450913": ["convolution_gpu_bfyx_gemm_like",1], + "11025471731438443683": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "3212789693085089063": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "9250410390663336388": ["convolution_gpu_bfyx_gemm_like",1], + "5509395737020858006": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "731825454731954517": ["convolution_gpu_bfyx_gemm_like",2], + "7700321970687976931": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "5853697372844744672": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "11936530628363072904": ["convolution_gpu_bfyx_gemm_like",2], + "13264617841270329349": ["convolution_gpu_bfyx_1x1",2], + "13676654389512816868": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4398371999113956082": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "603883331897298932": ["convolution_gpu_bfyx_direct_10_12_16",2], + "991586070509079617": ["convolution_gpu_bfyx_gemm_like",2], + "14532519639619315651": ["convolution_gpu_bfyx_gemm_like",2], + "4513063773753763458": ["convolution_gpu_bfyx_os_iyx_osv16",198], + "16773645387243701837": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "13320675959188615441": ["convolution_gpu_bfyx_gemm_like",2], + "15980348884716629349": ["convolution_gpu_bfyx_gemm_like",1], + "11583985978586657985": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "13094402291968806996": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "8040001390872143271": ["convolution_gpu_bfyx_gemm_like",2], + 
"1075027491444288875": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "12028665820838352309": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1997392406402548974": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "6020017927557041768": ["convolution_gpu_bfyx_gemm_like",1], + "9731370183088819573": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "12213354854947437262": ["convolution_gpu_bfyx_1x1",2], + "11604111639041106489": ["convolution_gpu_bfyx_os_iyx_osv16",726], + "3341302541468955849": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5550969016335082071": ["convolution_gpu_bfyx_gemm_like",2], + "1591199515536783245": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "3177304125602972370": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "9795194069954915563": ["convolution_gpu_bfyx_gemm_like",2], + "9999955037598579164": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "6870942166356599956": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "14116800584981026541": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "13912843078550000960": ["convolution_gpu_bfyx_os_iyx_osv16",641], + "11637325834858582585": ["convolution_gpu_bfyx_gemm_like",2], + "16683169947375504066": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "5245526691775741296": ["convolution_gpu_bfyx_gemm_like",1], + "18267428053198215471": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "14171139920084409181": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3780320160034246719": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "4717620775314557374": ["convolution_gpu_bfyx_gemm_like",2], + "6222595759158615206": ["convolution_gpu_bfyx_gemm_like",1], + "14398854364550406668": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "3662747857062156477": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11820789223587555410": ["convolution_gpu_bfyx_1x1",2], + "4980217316169616839": ["convolution_gpu_bfyx_1x1",2], + "14079654309452583394": ["convolution_gpu_bfyx_gemm_like",1], + "6708349666663292171": ["fully_connected_gpu_fb_oi_ref",1], + "12193395770362986433": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "14757749560543979231": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "6403698142681887543": ["convolution_gpu_bfyx_gemm_like",2], + "16347412180100581330": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4628748977913534701": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "9316082753126682958": ["convolution_gpu_bfyx_gemm_like",2], + "17170858505976681742": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "7818381040882768404": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "7121708962074176240": ["convolution_gpu_bfyx_1x1",2], + "2722124265986526212": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "3788462090984291082": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "2683304757433993300": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "7469127846325904854": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "16135569134646688251": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "905526102343710614": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "4098191685457418125": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "11690533591656807605": ["convolution_gpu_bfyx_gemm_like",2], + "712165731154577189": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "3226193790517362610": ["convolution_gpu_bfyx_1x1",2], + "14403132596827435096": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "8655315308767111198": ["convolution_gpu_bfyx_1x1",2], + "6340128090694375876": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "16014822406751503249": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "16218339663410630711": 
["convolution_gpu_bfyx_gemm_like",2], + "10256831975351722184": ["convolution_gpu_bfyx_gemm_like",2], + "10133054058562198093": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12417253210787537988": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "3336076058264596420": ["convolution_gpu_bfyx_gemm_like",2], + "12181889163404078773": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "6109013751635776331": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "11862259122805366807": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "18299254635579957284": ["convolution_gpu_bfyx_1x1",2], + "1963081583851864291": ["convolution_gpu_bfyx_gemm_like",1], + "16566128345135114558": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "10645625090439446714": ["convolution_gpu_bfyx_gemm_like",2], + "18082422341304348326": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "1212319037405620223": ["convolution_gpu_bfyx_gemm_like",2], + "9589942627115344216": ["convolution_gpu_bfyx_os_iyx_osv16",476], + "12755692101476964677": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5156033406916344703": ["convolution_gpu_bfyx_gemm_like",1], + "5479761740065152589": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "12054200116003751590": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "7333511810266504718": ["convolution_gpu_bfyx_os_iyx_osv16",970], + "4400247897123856252": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "17010172246526353957": ["convolution_gpu_bfyx_1x1",2], + "7603872175048237237": ["convolution_gpu_bfyx_1x1",2], + "13478984039708550410": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "14117801387057507639": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "17983556812075120553": ["convolution_gpu_bfyx_1x1",2], + "10532183096485321729": ["convolution_gpu_bfyx_1x1",2], + "16567486018945740036": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "970768445746568749": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "16292848987976256449": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "14421898375873029115": ["convolution_gpu_bfyx_1x1",2], + "8069537351442302814": ["convolution_gpu_bfyx_os_iyx_osv16",266], + "10447947790216991304": ["convolution_gpu_bfyx_gemm_like",2], + "7667898603371717971": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "18121198117765854866": ["convolution_gpu_bfyx_1x1",1], + "7209217811135076623": ["convolution_gpu_bfyx_gemm_like",2], + "6712698149192186833": ["convolution_gpu_bfyx_gemm_like",1], + "11334122788337402526": ["convolution_gpu_bfyx_1x1",1], + "16910952799476896905": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "4492332228252010118": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "7183578232279711009": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "17238880534517721334": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "11263540528012919947": ["convolution_gpu_bfyx_1x1",2], + "5115007207028125638": ["convolution_gpu_bfyx_gemm_like",2], + "9153779186876518773": ["convolution_gpu_bfyx_gemm_like",2], + "18126685473408206840": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "17209528805596238905": ["convolution_gpu_bfyx_gemm_like",2], + "4252157815622916471": ["convolution_gpu_bfyx_1x1",2], + "13814086981499638596": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "17536308070854915513": ["convolution_gpu_bfyx_1x1",2], + "16579057939215877904": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "5735608687257018419": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "3737576893817599311": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "16531824466148265247": ["convolution_gpu_bfyx_os_iyx_osv16",139], + "15643135666029727865": 
["convolution_gpu_bfyx_gemm_like",2], + "14558572801374416278": ["convolution_gpu_bfyx_gemm_like",2], + "8642107585829380438": ["convolution_gpu_bfyx_gemm_like",0], + "15082818876354718849": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "12675840135830047968": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "851057218719456209": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "17179609670678746034": ["convolution_gpu_bfyx_gemm_like",0], + "16661843849495077745": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "5459463503840817402": ["convolution_gpu_bfyx_1x1",2], + "5682190700442712936": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "4958835037528182801": ["convolution_gpu_bfyx_1x1",2], + "1875764913306932583": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "10912495395422146386": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "17310409067211414565": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "2780423409483867058": ["convolution_gpu_bfyx_1x1",2], + "17759505449240263390": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "12866217660635921034": ["convolution_gpu_bfyx_gemm_like",1], + "11645116728396933125": ["convolution_gpu_bfyx_gemm_like",2], + "1089944493540593798": ["convolution_gpu_bfyx_os_iyx_osv16",1092], + "7650862961269327235": ["convolution_gpu_bfyx_1x1",2], + "12353956380178079089": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13776178598632392721": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "8866716292621164810": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "10782611933832492335": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "17770104464900126615": ["convolution_gpu_bfyx_1x1",2], + "14912119584313592912": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "937159502066696999": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "5124080536266387783": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8746621720912032145": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "14091610802555875119": ["convolution_gpu_bfyx_gemm_like",2], + "17922279129043570176": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "7084646429975006971": ["convolution_gpu_bfyx_1x1",2], + "12031180482028822765": ["convolution_gpu_bfyx_gemm_like",1], + "2891736961665476908": ["convolution_gpu_bfyx_os_iyx_osv16",430], + "1643122514049603104": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "2221145174704245189": ["convolution_gpu_bfyx_gemm_like",1], + "7964396197946740183": ["convolution_gpu_bfyx_os_iyx_osv16",904], + "11315238071192463859": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3673781117412048086": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "16152775342222431281": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "5751283221740229986": ["convolution_gpu_bfyx_gemm_like",2], + "8615481457481938667": ["convolution_gpu_bfyx_os_iyx_osv16",803], + "8619526128410675593": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "5762878778443755104": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "14444475853714164129": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "840202264034382558": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "775538461106687677": ["fully_connected_gpu_fb_oi_ref",2], + "8257103926661643451": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15386715291503303766": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "5941852872160795604": ["convolution_gpu_bfyx_gemm_like",2], + "11308583200952256245": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13970935346154374605": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "10016815108730511683": ["convolution_gpu_bfyx_gemm_like",2], + "9585113116232600562": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "14811603003184578943": ["convolution_gpu_bfyx_gemm_like",2], + "9541630719145326121": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "7113777272518482528": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "5429130923188159806": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "3114869763557037270": ["fully_connected_gpu_fb_oi_ref",2], + "577844026691991089": ["convolution_gpu_bfyx_gemm_like",1], + "2524029454785583409": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "13558656230312558247": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "6084775920382972735": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "11239541755868028928": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "9090828337597312855": ["convolution_gpu_bfyx_gemm_like",2], + "3219408878901707426": ["convolution_gpu_bfyx_gemm_like",1], + "6003409324516527726": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17549411807772646930": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "11069983292783104310": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "10635659193402005820": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "12151068022697708126": ["convolution_gpu_bfyx_gemm_like",2], + "14896875712028630045": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "2089730611490367290": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5940337324384948573": ["convolution_gpu_bfyx_gemm_like",2], + "15695415285791951018": ["convolution_gpu_bfyx_gemm_like",2], + "10979362792894404338": ["convolution_gpu_bfyx_gemm_like",2], + "18426893729833771809": ["convolution_gpu_bfyx_1x1",2], + "1354647381212852890": ["convolution_gpu_bfyx_1x1",2], + "7958443549125799229": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "1237262535285717993": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "9222744127882324405": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "4346591404756288097": ["convolution_gpu_bfyx_gemm_like",2], + "1173986078589662704": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "2114232149447438823": ["convolution_gpu_bfyx_gemm_like",2], + "8159303545761286685": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "4216958486055161753": ["convolution_gpu_bfyx_gemm_like",2], + "9319254979377483709": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "17046662043776372746": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "15528692642731712121": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "15924583510704449214": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "17446505012657609153": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16362857896338778056": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "10565371760124443824": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "11948858355027908365": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4622514167765722873": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "794499287296495726": ["convolution_gpu_bfyx_1x1",2], + "4773077837537775324": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "16986358655784856534": ["convolution_gpu_bfyx_os_iyx_osv16",724], + "18313088176414428990": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "1336940384521633733": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "8794896449397768269": ["convolution_gpu_bfyx_gemm_like",2], + "8856888761246057127": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "9643408025778914022": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "4126895998426674411": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "8567667881970262923": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "1398177377739338750": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "2608363732937932266": 
["convolution_gpu_bfyx_gemm_like",2], + "17651821953342321913": ["convolution_gpu_bfyx_1x1",2], + "5714365398623475983": ["convolution_gpu_bfyx_1x1",2], + "6362428985273506890": ["convolution_gpu_bfyx_1x1",2], + "15979956159651515122": ["convolution_gpu_bfyx_gemm_like",2], + "18255227391100087860": ["convolution_gpu_bfyx_1x1",2], + "11587239927319376658": ["convolution_gpu_bfyx_os_iyx_osv16",301], + "10141927023849730720": ["convolution_gpu_bfyx_1x1",2], + "13835908664998757647": ["fully_connected_gpu_fb_oi_ref",0], + "733956743303342862": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "9737833587413114584": ["convolution_gpu_bfyx_gemm_like",1], + "17152614235879767116": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "14930789530046665855": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "2625969259447793593": ["convolution_gpu_bfyx_1x1",2], + "5539793555189956907": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "1309867416606346543": ["convolution_gpu_bfyx_os_iyx_osv16",194], + "1367483816197881270": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9477562342190423343": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "11087413527078604815": ["convolution_gpu_bfyx_gemm_like",2], + "16243196137456624852": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "142650579335909103": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "503369896500284129": ["convolution_gpu_bfyx_1x1",2], + "7561096442572829049": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2877521658768725103": ["convolution_gpu_bfyx_gemm_like",1], + "4769003637955328938": ["convolution_gpu_bfyx_gemm_like",1], + "7604075520418038662": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6131481289104111211": ["convolution_gpu_bfyx_gemm_like",2], + "8220168481755031959": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "17704040183891532914": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "16862145184923128012": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "10022487076451608714": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "6290584630172122012": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "15078262396281327048": ["convolution_gpu_bfyx_gemm_like",1], + "15636128989267984459": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "15770767768674603174": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "17490188677223978661": ["convolution_gpu_bfyx_gemm_like",1], + "12917241193304093727": ["convolution_gpu_bfyx_gemm_like",2], + "17489680436564779197": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "15329680728165965773": ["convolution_gpu_bfyx_gemm_like",2], + "5060012838564094182": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "1171681987783013074": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "17906607354577138153": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "17477062954520561609": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "11823205954749139338": ["convolution_gpu_bfyx_gemm_like",2], + "13851240591038949807": ["convolution_gpu_bfyx_gemm_like",2], + "4137755981477177003": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "8497468192424557348": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14184895905338394239": ["convolution_gpu_bfyx_gemm_like",2], + "1237920404306733800": ["convolution_gpu_bfyx_gemm_like",2], + "12046017161414846599": ["convolution_gpu_bfyx_1x1",2], + "10106454449619141260": ["convolution_gpu_bfyx_1x1",2], + "9641089659148164809": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6450532136308941035": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "11626398907755088688": ["convolution_gpu_bfyx_gemm_like",1], + "10747988576436391912": 
["convolution_gpu_bfyx_os_iyx_osv16",1013], + "3988024997010367546": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "16921939234324970069": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "15901724303713479611": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "10037086825900566930": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "8951040603784899163": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "6727930402459775131": ["convolution_gpu_bfyx_gemm_like",2], + "1952863937205473292": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "13919204232414535363": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "4809191606466167229": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1518270620354036926": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "6664482192233202590": ["convolution_gpu_bfyx_gemm_like",2], + "3759515057574218101": ["convolution_gpu_bfyx_gemm_like",1], + "13558687084677943158": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "12985942652866621579": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "2781309272856442321": ["convolution_gpu_bfyx_1x1",2], + "9726913113016874092": ["convolution_gpu_bfyx_gemm_like",2], + "15497797842820949408": ["convolution_gpu_bfyx_gemm_like",1], + "14412158605670555579": ["convolution_gpu_bfyx_os_iyx_osv16",153], + "548663565933738403": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "14883438809987378616": ["convolution_gpu_bfyx_1x1",2], + "17201365233492366678": ["convolution_gpu_bfyx_gemm_like",2], + "5211831143687501130": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "13282951481330978659": ["convolution_gpu_bfyx_os_iyx_osv16",726], + "9091110033424983286": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "12604104383683210104": ["convolution_gpu_bfyx_gemm_like",2], + "17264608538692763688": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6788311046557489996": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "16267682394077585279": ["convolution_gpu_bfyx_os_iyx_osv16",754], + "13569941893504840630": ["convolution_gpu_bfyx_os_iyx_osv16",1124], + "5039037192630609823": ["convolution_gpu_bfyx_gemm_like",2], + "1082574490068006980": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "2598267743388306204": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "3819990462129075757": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "18041177945345031826": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "12351866693978844266": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "9967101735808367971": ["convolution_gpu_bfyx_1x1",2], + "17264010982688979937": ["convolution_gpu_bfyx_1x1",2], + "13104509059416300615": ["convolution_gpu_bfyx_os_iyx_osv16",301], + "14671212883301405408": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14571022040013651253": ["convolution_gpu_bfyx_gemm_like",0], + "3039528482572243879": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "8458082326743351141": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "2128612971571865547": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "290134020607738418": ["convolution_gpu_bfyx_gemm_like",1], + "7846384623429362522": ["convolution_gpu_bfyx_1x1",2], + "2929715823970060874": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "16475247464223458061": ["convolution_gpu_bfyx_gemm_like",2], + "3265415000818832667": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15779837958180258409": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "10672380526821947133": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "1299545313185409227": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "11597391933877736800": ["convolution_gpu_bfyx_gemm_like",2], + "6219075471508685758": 
["convolution_gpu_bfyx_gemm_like",0], + "9542325095876448686": ["convolution_gpu_bfyx_gemm_like",1], + "13596876807637507229": ["convolution_gpu_bfyx_1x1",2], + "3216877571075556066": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "5558136691773431495": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "13611054146745413536": ["convolution_gpu_bfyx_gemm_like",1], + "8354579049246302728": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "8576733135863336233": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16490405739040977260": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "12450814729547235386": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "2800949804770763798": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3217574161785059951": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "5581428998642936688": ["convolution_gpu_bfyx_1x1",2], + "7450417963648518926": ["convolution_gpu_bfyx_gemm_like",2], + "9748307611165615848": ["convolution_gpu_bfyx_gemm_like",2], + "1972879521448306536": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "14387756025635589673": ["convolution_gpu_bfyx_1x1",2], + "16027456210394993913": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "12566041126392848976": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "14458851250685872417": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "6192955702438301372": ["convolution_gpu_bfyx_os_iyx_osv16",1023], + "7447163906170805189": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3106911159524421371": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "13680926356824317761": ["convolution_gpu_bfyx_os_iyx_osv16",804], + "9040145293899470160": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "14312549767853703411": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "11768117585574496387": ["convolution_gpu_bfyx_gemm_like",2], + "14766477690417085350": ["convolution_gpu_bfyx_1x1",2], + "15006321421735686121": ["convolution_gpu_bfyx_gemm_like",2], + "18154019240019929225": ["convolution_gpu_bfyx_gemm_like",1], + "3034482898462686729": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "11841034668170849494": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "10536316961655703500": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4617809377006148936": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "16397733032387984819": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "9439431829175743345": ["convolution_gpu_bfyx_gemm_like",1], + "4625107584562815965": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "12052207771201936228": ["convolution_gpu_bfyx_gemm_like",2], + "1626430741965136732": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "8106738346643994005": ["convolution_gpu_bfyx_gemm_like",2], + "12393385058735194260": ["convolution_gpu_bfyx_gemm_like",2], + "3024402899381804809": ["convolution_gpu_bfyx_1x1",2], + "10702465758376061967": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2554991397391195611": ["convolution_gpu_bfyx_gemm_like",2], + "11634932044447867039": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "487214150851213303": ["convolution_gpu_bfyx_gemm_like",0], + "1157947252370351851": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5010119207726811326": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "11626402549863483301": ["convolution_gpu_bfyx_gemm_like",2], + "7954972694876158422": ["convolution_gpu_bfyx_1x1",2], + "12228610148087508521": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "671453551040072499": ["convolution_gpu_bfyx_gemm_like",2], + "4313392430539923574": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "5240706676373148280": ["convolution_gpu_bfyx_direct_10_12_16",2], + 
"3372770576629463160": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "16307464696265537356": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "805131056816361237": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "5346898505346646714": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "3017824560305532066": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "85050336704401597": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4282668574670785584": ["convolution_gpu_bfyx_gemm_like",2], + "10572945270796129630": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "6312971928547466668": ["convolution_gpu_bfyx_os_iyx_osv16",880], + "1617135706549276688": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "16511393582666965704": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "3509487327001107638": ["convolution_gpu_bfyx_gemm_like",1], + "541817615957967731": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "4750513665628842598": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "15578456771467281881": ["convolution_gpu_bfyx_gemm_like",1], + "2495655464941634884": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "9606639214735570069": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "13851025202247070979": ["convolution_gpu_yxfb_yxio_b16",2], + "13781423818051299677": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "11604794601689380990": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "4531222427159927606": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "12590922530749026871": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "18381791065890314250": ["convolution_gpu_bfyx_gemm_like",0], + "4282668574670785584": ["convolution_gpu_bfyx_gemm_like",2], + "18161786710055240343": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6832967250168141428": ["convolution_gpu_yxfb_yxio_b16",2], + "11705756153433897198": ["convolution_gpu_bfyx_1x1",2], + "10544034939133448916": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "994842991399671507": ["convolution_gpu_bfyx_gemm_like",1], + "9657324846330221372": ["convolution_gpu_bfyx_1x1",2], + "10480527638577674825": ["convolution_gpu_bfyx_1x1",2], + "11679235499894668689": ["convolution_gpu_yxfb_yxio_b16",2], + "12617736879671137111": ["convolution_gpu_yxfb_yxio_b16",2], + "16768797136991242472": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "5738835498104275267": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "14916625550370402883": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "4723919313760470311": ["convolution_gpu_yxfb_yxio_b16",2], + "16105073808368936420": ["convolution_gpu_bfyx_gemm_like",2], + "17906607354577138153": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "8686733586982652897": ["convolution_gpu_yxfb_yxio_b16",2], + "17436550598696178210": ["convolution_gpu_yxfb_yxio_b16",2], + "348058686961206025": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "18393312550272875456": ["convolution_gpu_bfyx_1x1",2], + "9328585005923667676": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "12469992822259989528": ["convolution_gpu_yxfb_yxio_b16",2], + "17397600088595751782": ["convolution_gpu_yxfb_yxio_b16",2], + "17085927772068621152": ["convolution_gpu_yxfb_yxio_b16",2], + "12024143207855886580": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "1281190653081960886": ["convolution_gpu_yxfb_yxio_b16",2], + "15548854462657362014": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "10751536136794650334": ["convolution_gpu_bfyx_gemm_like",2], + "16432425079146486467": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "775538461106687677": ["fully_connected_gpu_fb_oi_ref",2], + "4815047491742617397": ["convolution_gpu_bfyx_gemm_like",2], + 
"16711142379173254655": ["convolution_gpu_yxfb_yxio_b16",2], + "12703696322769371912": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "13830605041347009953": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "11690334177981352452": ["convolution_gpu_bfyx_os_iyx_osv16",700], + "15646081020506130125": ["convolution_gpu_yxfb_yxio_b16",2], + "11055049031355432623": ["convolution_gpu_bfyx_gemm_like",2], + "1208161922424418734": ["convolution_gpu_bfyx_os_iyx_osv16",987], + "17408275657360833363": ["convolution_gpu_bfyx_1x1",2], + "10151922632636937118": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6193161166790398003": ["convolution_gpu_bfyx_gemm_like",2], + "5754396201681434378": ["convolution_gpu_bfyx_1x1",2], + "6776601719651959634": ["convolution_gpu_yxfb_yxio_b16",2], + "760687670112194844": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "18333355024265557430": ["convolution_gpu_yxfb_yxio_b16",2], + "17921973525603585874": ["convolution_gpu_bfyx_gemm_like",2], + "11828175723996627443": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "3217574161785059951": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "12351866693978844266": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "3536359641225772698": ["convolution_gpu_yxfb_yxio_b16",2], + "15363606233048272809": ["convolution_gpu_bfyx_1x1",2], + "6859143702528475520": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "8458082326743351141": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "3477539135137665170": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "12546446257192651407": ["convolution_gpu_yxfb_yxio_b16",2], + "4856470441452830056": ["convolution_gpu_bfyx_gemm_like",2], + "1425953627379976115": ["convolution_gpu_bfyx_gemm_like",1], + "12913866095318048752": ["convolution_gpu_bfyx_gemm_like",2], + "3582256192870592087": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "8585205898894363799": ["convolution_gpu_yxfb_yxio_b16",2], + "16270745071180354612": ["convolution_gpu_bfyx_gemm_like",2], + "4916769804113823482": ["convolution_gpu_bfyx_1x1",1], + "6962062962411903140": ["convolution_gpu_yxfb_yxio_b16",2], + "4492332228252010118": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "10005348255972308430": ["convolution_gpu_yxfb_yxio_b16",2], + "9974905660671605427": ["convolution_gpu_yxfb_yxio_b16",2], + "17344974951998490453": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "8250212706222997384": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "3541538046227217664": ["convolution_gpu_bfyx_gemm_like",1], + "6438721407426283362": ["convolution_gpu_yxfb_yxio_b16",2], + "18433141005552346566": ["convolution_gpu_yxfb_yxio_b16",1], + "215512025430490450": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "5977875644245993099": ["convolution_gpu_yxfb_yxio_b16",2], + "7602222004475424358": ["convolution_gpu_bfyx_gemm_like",1], + "4780291919667721265": ["convolution_gpu_yxfb_yxio_b16",2], + "15135644084742750702": ["convolution_gpu_bfyx_gemm_like",2], + "16214394186337220006": ["convolution_gpu_yxfb_yxio_b16",2], + "15488340031228619748": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "403634422724914329": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "3463959257726925426": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "18416908414174464784": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "5727758374304309350": ["convolution_gpu_yxfb_yxio_b16",2], + "9832505855130134649": ["convolution_gpu_yxfb_yxio_b16",2], + "15690161340392005765": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "4734389463002799056": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "7949069388917479511": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "8974851555526896131": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "14307705501349750896": ["convolution_gpu_yxfb_yxio_b16",2], + "15871357525719630224": ["convolution_gpu_bfyx_1x1",2], + "9674248159643501374": ["convolution_gpu_yxfb_yxio_b16",2], + "11807282628372660280": ["convolution_gpu_bfyx_1x1",2], + "4056979460327024961": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "3731224822876468602": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "11893541520830049036": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "1843555260471832708": ["convolution_gpu_bfyx_gemm_like",2], + "13009381943944182288": ["convolution_gpu_yxfb_yxio_b16",2], + "17908444616754154471": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "3648713169465596196": ["convolution_gpu_yxfb_yxio_b16",2], + "3779229442395464456": ["convolution_gpu_bfyx_gemm_like",1], + "7561096442572829049": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9891428775774615719": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "4261192887643002603": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "17045386022302353268": ["convolution_gpu_yxfb_yxio_b16",2], + "2290965424106255219": ["convolution_gpu_yxfb_yxio_b16",2], + "11537166370263116277": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "6418500550523945192": ["convolution_gpu_yxfb_yxio_b16",2], + "577844026691991089": ["convolution_gpu_bfyx_gemm_like",1], + "11087413527078604815": ["convolution_gpu_bfyx_gemm_like",2], + "8450272092307894299": ["convolution_gpu_yxfb_yxio_b16",2], + "5802466130040230797": ["convolution_gpu_yxfb_yxio_b16",2], + "6950586691727980329": ["convolution_gpu_yxfb_yxio_b16",2], + "6445721440921372329": ["convolution_gpu_yxfb_yxio_b16",2], + "16901594465545439334": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "18033349045324117723": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "13851240591038949807": ["convolution_gpu_bfyx_gemm_like",2], + "10996596479775375564": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "15188273255634848057": ["convolution_gpu_yxfb_yxio_b16",2], + "1135062632388082485": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "142329025839464842": ["convolution_gpu_bfyx_1x1",2], + "4863644213728386734": ["convolution_gpu_yxfb_yxio_b16",2], + "5941298590926032148": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "13426254939418471242": ["convolution_gpu_yxfb_yxio_b16",2], + "7132328255408635227": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "6208201398783088425": ["convolution_gpu_bfyx_gemm_like",0], + "16362857896338778056": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "16385915289511951113": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9532499374173117612": ["fully_connected_gpu_fb_oi_ref",2], + "10816637153861630723": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "11857037689248685487": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "15989894214714907271": ["convolution_gpu_bfyx_gemm_like",2], + "9899211365930959346": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "402932154499003993": ["convolution_gpu_yxfb_yxio_b16",2], + "7393551951402219833": ["convolution_gpu_yxfb_yxio_b16",2], + "3814584042139408454": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "13954821927253849036": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "14242202444788213591": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "5155616842071169667": ["convolution_gpu_yxfb_yxio_b16",2], + "13810735868750326592": ["convolution_gpu_bfyx_os_iyx_osv16",925], + "4021045600853993587": ["convolution_gpu_yxfb_yxio_b16",2], + "12397280593466519809": 
["convolution_gpu_bfyx_os_iyx_osv16",340], + "9101334153142718004": ["convolution_gpu_bfyx_gemm_like",2], + "417352773179383568": ["convolution_gpu_yxfb_yxio_b16",2], + "4137738705782981426": ["convolution_gpu_bfyx_gemm_like",2], + "6726099352298108756": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "14263605862840500474": ["convolution_gpu_yxfb_yxio_b16",2], + "10387844339156517393": ["convolution_gpu_bfyx_1x1",2], + "5680236635030250712": ["convolution_gpu_bfyx_1x1",2], + "1841155673858789206": ["fully_connected_gpu_fb_oi_ref",1], + "6403698142681887543": ["convolution_gpu_bfyx_gemm_like",2], + "11787674847611032323": ["convolution_gpu_yxfb_yxio_b16",2], + "18239740525818575112": ["convolution_gpu_yxfb_yxio_b16",2], + "9999955037598579164": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "13170441257780067955": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17536591931934691648": ["convolution_gpu_yxfb_yxio_b16",2], + "17051718450741106678": ["convolution_gpu_yxfb_yxio_b16",2], + "10681768474583067517": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "18215430801133520364": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1786105567361070086": ["convolution_gpu_yxfb_yxio_b16",2], + "14044732537191084187": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14165325329016075285": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "7897973318803646560": ["convolution_gpu_yxfb_yxio_b16",2], + "6097086855988597139": ["convolution_gpu_bfyx_1x1",2], + "3713558537660711857": ["convolution_gpu_yxfb_yxio_b16",2], + "5355283113999405036": ["convolution_gpu_yxfb_yxio_b16",2], + "2457671437276780303": ["convolution_gpu_yxfb_yxio_b16",2], + "7822463130304602936": ["convolution_gpu_yxfb_yxio_b16",2], + "13111122805945249561": ["convolution_gpu_yxfb_yxio_b16",2], + "5938850739683493929": ["convolution_gpu_yxfb_yxio_b16",2], + "2835909063063272102": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "4261215727469154244": ["convolution_gpu_yxfb_yxio_b16",2], + "8096131027165540886": ["convolution_gpu_bfyx_gemm_like",2], + "6490907666077364481": ["convolution_gpu_yxfb_yxio_b16",2], + "16161974964662774501": ["convolution_gpu_yxfb_yxio_b16",2], + "6772954924703365345": ["convolution_gpu_bfyx_gemm_like",2], + "11921652085115182024": ["convolution_gpu_yxfb_yxio_b16",2], + "6788311046557489996": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "9737565171095493297": ["convolution_gpu_bfyx_gemm_like",1], + "15059549186302099880": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "1920070013712913772": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "7338932272767555117": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17152614235879767116": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "3212789693085089063": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "5668538167635622474": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "8857763129101380288": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "9099056013518879466": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "8459380583159325597": ["convolution_gpu_yxfb_yxio_b16",2], + "3220280315905987373": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "14771341796915983228": ["convolution_gpu_yxfb_yxio_b16",1], + "1520529227443340435": ["convolution_gpu_bfyx_gemm_like",2], + "3106911159524421371": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "4574541202890196191": ["convolution_gpu_yxfb_yxio_b16",2], + "9603926867418680768": ["convolution_gpu_yxfb_yxio_b16",2], + "12864558900883069118": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11241838709529552265": ["convolution_gpu_bfyx_gemm_like",2], + 
"15799159401545270696": ["convolution_gpu_bfyx_gemm_like",1], + "4142555169083069413": ["convolution_gpu_bfyx_gemm_like",0], + "11152334947349565403": ["convolution_gpu_yxfb_yxio_b16",1], + "9835739612255048978": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "3358616456137155015": ["convolution_gpu_yxfb_yxio_b16",2], + "5922142661777925178": ["convolution_gpu_bfyx_gemm_like",2], + "10278515360013727367": ["convolution_gpu_yxfb_yxio_b16",1], + "7571716782558859443": ["convolution_gpu_yxfb_yxio_b16",2], + "7540655869186258692": ["convolution_gpu_yxfb_yxio_b16",2], + "6115915509370042166": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "12676167240795292217": ["convolution_gpu_bfyx_gemm_like",1], + "13575423234109624706": ["fully_connected_gpu_bs_f_bsv16_af8_vload",0], + "16015963261509760799": ["convolution_gpu_bfyx_os_iyx_osv16",476], + "4436244774193918646": ["fully_connected_gpu_fb_oi_ref",1], + "1237262535285717993": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "7894230717547658326": ["convolution_gpu_yxfb_yxio_b16",2], + "12933785392937626017": ["convolution_gpu_yxfb_yxio_b16",1], + "13247725847475539658": ["convolution_gpu_bfyx_1x1",2], + "712495040970043706": ["convolution_gpu_yxfb_yxio_b16",2], + "11719957578496407410": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "6538526180355194359": ["convolution_gpu_yxfb_yxio_b16",2], + "15295951849706930711": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "6427979320488981912": ["convolution_gpu_yxfb_yxio_b16",2], + "12721294268595880422": ["convolution_gpu_yxfb_yxio_b16",2], + "16293465561256937726": ["convolution_gpu_bfyx_gemm_like",2], + "3806806400778685133": ["convolution_gpu_yxfb_yxio_b16",2], + "11104393974242049153": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "7532088618116521936": ["convolution_gpu_bfyx_gemm_like",2], + "10570285542015420072": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "14528180674573671874": ["convolution_gpu_yxfb_yxio_b16",2], + "6344600111737335616": ["convolution_gpu_yxfb_yxio_b16",2], + "6331794802915121861": ["convolution_gpu_yxfb_yxio_b16",2], + "5600128039063009632": ["convolution_gpu_bfyx_gemm_like",1], + "16158139166784964096": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "8879618489623984140": ["convolution_gpu_yxfb_yxio_b16",2], + "3216793152416217495": ["convolution_gpu_bfyx_gemm_like",2], + "14263790627243107300": ["convolution_gpu_bfyx_gemm_like",1], + "467975197394411990": ["convolution_gpu_bfyx_gemm_like",1], + "18186615266760475767": ["convolution_gpu_bfyx_os_iyx_osv16",176], + "4252157815622916471": ["convolution_gpu_bfyx_1x1",2], + "17676344219475515993": ["convolution_gpu_yxfb_yxio_b16",2], + "2283020548041814543": ["convolution_gpu_yxfb_yxio_b16",2], + "8203171222962341018": ["convolution_gpu_bfyx_gemm_like",2], + "8004244584949995244": ["convolution_gpu_yxfb_yxio_b16",2], + "7027962921778599989": ["convolution_gpu_yxfb_yxio_b16",2], + "2727175120437582536": ["convolution_gpu_bfyx_gemm_like",1], + "9195732599757736182": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "6712698149192186833": ["convolution_gpu_bfyx_gemm_like",1], + "14288463473159113326": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10762489947656697207": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15956352026642286295": ["convolution_gpu_yxfb_yxio_b16",2], + "15678385128478075284": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9787359208094141129": ["fully_connected_gpu_fb_oi_ref",1], + "12370729327673204804": ["convolution_gpu_bfyx_gemm_like",2], + "7977195117668583981": ["convolution_gpu_bfyx_gemm_like",2], + 
"9378269524012289175": ["convolution_gpu_bfyx_gemm_like",2], + "14206076551739831333": ["convolution_gpu_bfyx_gemm_like",1], + "16996895381161031110": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "15412447128995361859": ["convolution_gpu_bfyx_gemm_like",0], + "4466647043226271996": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "888110783182849535": ["convolution_gpu_yxfb_yxio_b16",2], + "8527193566719173253": ["convolution_gpu_bfyx_gemm_like",2], + "11806402239500046867": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "12977678792503377525": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5519781859090160931": ["convolution_gpu_bfyx_os_iyx_osv16",1033], + "5317076157086789437": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "18249888571553409563": ["convolution_gpu_yxfb_yxio_b16",2], + "17549411807772646930": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5584145249514762750": ["convolution_gpu_yxfb_yxio_b16",2], + "17975017633455909321": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "6703148006012061136": ["convolution_gpu_yxfb_yxio_b16",2], + "10463632805036507382": ["convolution_gpu_yxfb_yxio_b16",2], + "2608363732937932266": ["convolution_gpu_bfyx_gemm_like",2], + "4039483032571506874": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "671453551040072499": ["convolution_gpu_bfyx_gemm_like",2], + "9401409770128851474": ["convolution_gpu_bfyx_gemm_like",0], + "13912728810446567016": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "15449774545834423274": ["convolution_gpu_yxfb_yxio_b16",1], + "12051595062513871723": ["convolution_gpu_bfyx_1x1",2], + "16455941573984854254": ["convolution_gpu_yxfb_yxio_b16",2], + "6469277112054008613": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "7624476043779763605": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "10706267011822108376": ["convolution_gpu_bfyx_1x1",2], + "2907572047024872990": ["convolution_gpu_yxfb_yxio_b16",2], + "8700574100180128776": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16531824466148265247": ["convolution_gpu_bfyx_os_iyx_osv16",139], + "5224252360611200472": ["convolution_gpu_bfyx_gemm_like",2], + "5042176052323856983": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6862489207967519978": ["convolution_gpu_bfyx_gemm_like",2], + "5149303626508247520": ["convolution_gpu_yxfb_yxio_b16",2], + "17082268616134506581": ["convolution_gpu_yxfb_yxio_b16",2], + "5242271874488296527": ["convolution_gpu_bfyx_gemm_like",1], + "8463615810239412362": ["convolution_gpu_bfyx_1x1",2], + "12566041126392848976": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "3217674729821898463": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "17546090415334871175": ["convolution_gpu_yxfb_yxio_b16",2], + "937159502066696999": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "9375272277044782377": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "17891499682354369344": ["convolution_gpu_bfyx_gemm_like",2], + "5257134257307295031": ["convolution_gpu_yxfb_yxio_b16",2], + "15817443774186015593": ["convolution_gpu_bfyx_1x1",2], + "9293682866734263821": ["convolution_gpu_yxfb_yxio_b16",2], + "5735608687257018419": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "9105127035114339269": ["convolution_gpu_yxfb_yxio_b16",2], + "17037416417174266088": ["convolution_gpu_bfyx_gemm_like",0], + "2915777749501772828": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3286250915720444467": ["convolution_gpu_yxfb_yxio_b16",2], + "6557428245898292304": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "5041922366297242362": ["convolution_gpu_yxfb_yxio_b16",2], + "17489680436564779197": 
["convolution_gpu_bfyx_os_iyx_osv16",1127], + "15199604820473713622": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "5955575949957198434": ["convolution_gpu_bfyx_gemm_like",1], + "998876398773540321": ["convolution_gpu_bfyx_1x1",2], + "9869959062341950047": ["convolution_gpu_bfyx_1x1",2], + "6400660469217490279": ["convolution_gpu_yxfb_yxio_b16",2], + "6902485831441844789": ["convolution_gpu_yxfb_yxio_b16",2], + "6156831095718536092": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "10706180189726741161": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17996535939348094624": ["convolution_gpu_yxfb_yxio_b16",2], + "17945600479510493949": ["convolution_gpu_bfyx_os_iyx_osv16",108], + "12015336418727455195": ["convolution_gpu_bfyx_1x1",2], + "15190508870639648203": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6713985030102340818": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16573597215928075233": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "378292944207609677": ["convolution_gpu_yxfb_yxio_b16",2], + "14807466024030301968": ["convolution_gpu_yxfb_yxio_b16",2], + "16800575429414554907": ["convolution_gpu_bfyx_os_iyx_osv16",882], + "6820224292713065232": ["convolution_gpu_yxfb_yxio_b16",2], + "17025324057045572535": ["convolution_gpu_bfyx_gemm_like",1], + "2973337989445169388": ["convolution_gpu_yxfb_yxio_b16",2], + "2135164671985938807": ["convolution_gpu_yxfb_yxio_b16",2], + "6822432085522584060": ["convolution_gpu_yxfb_yxio_b16",2], + "8469874583725132145": ["fully_connected_gpu_fb_oi_ref",2], + "1930929857644673460": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "4429109491655891299": ["convolution_gpu_bfyx_gemm_like",1], + "15971340431600153619": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "13993548620104010490": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "13077012961563218195": ["convolution_gpu_yxfb_yxio_b16",2], + "12259611546528256409": ["convolution_gpu_yxfb_yxio_b16",2], + "3154539627593235077": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "10883992248631603006": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "13680926356824317761": ["convolution_gpu_bfyx_os_iyx_osv16",804], + "9987415314864002460": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "7719954202744123391": ["convolution_gpu_bfyx_os_iyx_osv16",270], + "5079381702867378605": ["convolution_gpu_yxfb_yxio_b16",2], + "10528894716283673051": ["convolution_gpu_yxfb_yxio_b16",2], + "12052207771201936228": ["convolution_gpu_bfyx_gemm_like",2], + "18120079746729314878": ["convolution_gpu_yxfb_yxio_b16",2], + "13705072264927031658": ["convolution_gpu_yxfb_yxio_b16",2], + "12450814729547235386": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "8865700182878875593": ["convolution_gpu_yxfb_yxio_b16",2], + "5104519293341299859": ["convolution_gpu_yxfb_yxio_b16",2], + "4672441137336208890": ["convolution_gpu_bfyx_gemm_like",2], + "11910735867274493498": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "8860443174052454332": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "740260423018155343": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "3346891393420268502": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "3935750066315595083": ["convolution_gpu_yxfb_yxio_b16",2], + "14149210193687890597": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17965267346493659374": ["convolution_gpu_yxfb_yxio_b16",2], + "4740585760177040164": ["convolution_gpu_yxfb_yxio_b16",2], + "9584652777232392944": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "11342135956789192833": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13338594271376045657": 
["convolution_gpu_bfyx_gemm_like",2], + "9319064434175105168": ["convolution_gpu_yxfb_yxio_b16",2], + "17705992851440826353": ["convolution_gpu_yxfb_yxio_b16",2], + "8108933468437926367": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1042605521041579458": ["convolution_gpu_yxfb_yxio_b16",2], + "975943900172381326": ["convolution_gpu_yxfb_yxio_b16",2], + "1345101751956733589": ["convolution_gpu_bfyx_gemm_like",2], + "5576296603250158603": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4491380839102267034": ["convolution_gpu_bfyx_gemm_like",1], + "1353170363915443814": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10090923790949378407": ["convolution_gpu_bfyx_gemm_like",1], + "5214654427283761256": ["convolution_gpu_bfyx_gemm_like",0], + "18218755616248669884": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "5145853681977610916": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "3226193790517362610": ["convolution_gpu_bfyx_1x1",2], + "8614375489387596119": ["convolution_gpu_yxfb_yxio_b16",2], + "2780423409483867058": ["convolution_gpu_bfyx_1x1",2], + "3615052707933370958": ["convolution_gpu_yxfb_yxio_b16",2], + "15865753975271064117": ["convolution_gpu_yxfb_yxio_b16",2], + "2673903488704336606": ["convolution_gpu_bfyx_gemm_like",2], + "6260684231055362504": ["convolution_gpu_yxfb_yxio_b16",2], + "13325762052023866627": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10745099399736462076": ["convolution_gpu_yxfb_yxio_b16",2], + "17766628441954343001": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "4513063773753763458": ["convolution_gpu_bfyx_os_iyx_osv16",198], + "14126906427006602775": ["convolution_gpu_bfyx_1x1",2], + "4805402210873641704": ["convolution_gpu_yxfb_yxio_b16",2], + "7869779894480025247": ["convolution_gpu_bfyx_gemm_like",2], + "968105804060326332": ["convolution_gpu_yxfb_yxio_b16",2], + "1697260854781788314": ["convolution_gpu_yxfb_yxio_b16",1], + "9017605508157213607": ["convolution_gpu_yxfb_yxio_b16",2], + "14883438809987378616": ["convolution_gpu_bfyx_1x1",2], + "10882719585803523032": ["convolution_gpu_yxfb_yxio_b16",2], + "10425889533411573166": ["convolution_gpu_bfyx_gemm_like",2], + "12878346173547852969": ["convolution_gpu_yxfb_yxio_b16",2], + "7650862961269327235": ["convolution_gpu_bfyx_1x1",2], + "12680688623162482255": ["convolution_gpu_bfyx_1x1",2], + "7715649642603303319": ["convolution_gpu_bfyx_1x1",2], + "15201438563802430490": ["fully_connected_gpu_fb_oi_ref",2], + "6450532136308941035": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "8638074773026771425": ["convolution_gpu_yxfb_yxio_b16",2], + "17966898762317477857": ["convolution_gpu_yxfb_yxio_b16",2], + "5924271203978892761": ["convolution_gpu_yxfb_yxio_b16",2], + "2119566651547512543": ["convolution_gpu_yxfb_yxio_b16",2], + "5568753513029409478": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "16386955278777720573": ["convolution_gpu_bfyx_os_iyx_osv16",477], + "465567788283624320": ["convolution_gpu_yxfb_yxio_b16",2], + "10538010212480716275": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "3135889221160961020": ["convolution_gpu_yxfb_yxio_b16",2], + "16238415425814188039": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "144634005596305959": ["fully_connected_gpu_fb_io_block_fp16",2], + "13962325395021860937": ["convolution_gpu_yxfb_yxio_b16",2], + "2219693989290882970": ["convolution_gpu_yxfb_yxio_b16",2], + "14942858162799632403": ["convolution_gpu_yxfb_yxio_b16",2], + "11184290482439221741": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "13130001092233798285": ["convolution_gpu_yxfb_yxio_b16",2], + 
"7992077349568239994": ["convolution_gpu_yxfb_yxio_b16",2], + "16883372966656079608": ["convolution_gpu_yxfb_yxio_b16",2], + "16839741351990811959": ["convolution_gpu_bfyx_gemm_like",2], + "1367483816197881270": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "17228615388053183744": ["convolution_gpu_yxfb_yxio_b16",2], + "17264671167892237524": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "1920042803083729276": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "5596408142536691534": ["convolution_gpu_yxfb_yxio_b16",2], + "16033512206711124104": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "9277610800970567810": ["convolution_gpu_bfyx_gemm_like",1], + "6931062623510631425": ["convolution_gpu_yxfb_yxio_b16",2], + "17647962002015093887": ["convolution_gpu_bfyx_gemm_like",2], + "9162862507585693061": ["convolution_gpu_yxfb_yxio_b16",2], + "4773077837537775324": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "7305582749708309904": ["convolution_gpu_yxfb_yxio_b16",2], + "15800554162607246964": ["convolution_gpu_bfyx_gemm_like",2], + "10187930930336324253": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "10183537720515608": ["convolution_gpu_yxfb_yxio_b16",2], + "8541982562061181756": ["convolution_gpu_bfyx_gemm_like",1], + "8890400423799565844": ["convolution_gpu_yxfb_yxio_b16",2], + "5895417825685090256": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "6791806088355877039": ["convolution_gpu_bfyx_gemm_like",2], + "4804533178560338520": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "10850369799801518638": ["convolution_gpu_yxfb_yxio_b16",2], + "8113660920207936963": ["convolution_gpu_yxfb_yxio_b16",2], + "2814805887448339818": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "15275978123703636572": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "17214254645087272557": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "12421204749289937399": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "12327057172281102984": ["convolution_gpu_yxfb_yxio_b16",2], + "11910900938442124765": ["convolution_gpu_bfyx_gemm_like",1], + "15497263259976427714": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "18357544235608006954": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "4325081100430903742": ["convolution_gpu_bfyx_gemm_like",2], + "7916244303189113815": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15757308772667178999": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2945245652128285151": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "9062781751511609244": ["convolution_gpu_bfyx_os_iyx_osv16",805], + "9079676771143357396": ["convolution_gpu_yxfb_yxio_b16",2], + "12714892326998505133": ["convolution_gpu_yxfb_yxio_b16",2], + "7171904645566467208": ["convolution_gpu_bfyx_gemm_like",2], + "6318214731544748245": ["convolution_gpu_bfyx_os_iyx_osv16",265], + "9280431727790048190": ["convolution_gpu_bfyx_1x1",2], + "8803037667261582905": ["convolution_gpu_bfyx_gemm_like",2], + "12185561188335760786": ["convolution_gpu_yxfb_yxio_b16",1], + "16566128345135114558": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "5115298857582076692": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "7242013296950669829": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "7649413902932043811": ["convolution_gpu_bfyx_gemm_like",2], + "17682152011630274259": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "1170380397764345558": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "794499287296495726": ["convolution_gpu_bfyx_1x1",2], + "18279416225045612845": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "2506424495656099512": ["convolution_gpu_yxfb_yxio_b16",2], + 
"14466032674083938714": ["convolution_gpu_bfyx_gemm_like",1], + "1531349457115735845": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "2909347733581487795": ["convolution_gpu_yxfb_yxio_b16",2], + "805131056816361237": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "10546430708947911124": ["convolution_gpu_bfyx_gemm_like",2], + "16709930291825881111": ["convolution_gpu_yxfb_yxio_b16",2], + "7946262362930618714": ["convolution_gpu_yxfb_yxio_b16",2], + "2431241169199693527": ["convolution_gpu_yxfb_yxio_b16",2], + "2598267743388306204": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "7878605163588288309": ["convolution_gpu_bfyx_os_iyx_osv16",89], + "4759671642533786591": ["convolution_gpu_bfyx_gemm_like",2], + "8575833423399668525": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "2685061316482503878": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "15035800097152337587": ["convolution_gpu_bfyx_gemm_like",2], + "7105279481103494151": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "6744692937598310090": ["convolution_gpu_yxfb_yxio_b16",2], + "15656706773401161497": ["convolution_gpu_yxfb_yxio_b16",2], + "18126685473408206840": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "5465400164581117113": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "12494969618927201911": ["fully_connected_gpu_yxfb_ref",0], + "10783630257421062891": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "13308187548669026714": ["convolution_gpu_bfyx_1x1",2], + "12755692101476964677": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16742058312847401360": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "5284456216115118110": ["convolution_gpu_yxfb_yxio_b16",2], + "18101509783610609787": ["convolution_gpu_yxfb_yxio_b16",2], + "6464050901421037006": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6509271384550125629": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3374410641320310726": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "15249442550355454201": ["convolution_gpu_bfyx_gemm_like",2], + "11624226818593966530": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "1788455099959676873": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "16677044352793659175": ["convolution_gpu_bfyx_gemm_like",2], + "6081038474197004540": ["convolution_gpu_yxfb_yxio_b16",2], + "3220756134650041028": ["convolution_gpu_yxfb_yxio_b16",2], + "11196245220967135443": ["convolution_gpu_yxfb_yxio_b16",2], + "12512751736409465214": ["convolution_gpu_bfyx_gemm_like",1], + "13467831091041327178": ["convolution_gpu_yxfb_yxio_b16",2], + "18253299978538051201": ["convolution_gpu_yxfb_yxio_b16",2], + "16339187733937346919": ["convolution_gpu_yxfb_yxio_b16",2], + "5940337324384948573": ["convolution_gpu_bfyx_gemm_like",2], + "1040650352205493707": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "15838113905712517735": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13120262386070281193": ["convolution_gpu_yxfb_yxio_b16",2], + "14571022040013651253": ["convolution_gpu_bfyx_gemm_like",0], + "2129742884686884642": ["convolution_gpu_yxfb_yxio_b16",2], + "2934519615045138808": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "9312974578711092131": ["convolution_gpu_yxfb_yxio_b16",2], + "5312269140190538942": ["convolution_gpu_yxfb_yxio_b16",2], + "3571330754519284334": ["convolution_gpu_yxfb_yxio_b16",2], + "13842149852156451845": ["convolution_gpu_yxfb_yxio_b16",2], + "1868805550246252143": ["convolution_gpu_yxfb_yxio_b16",2], + "7954972694876158422": ["convolution_gpu_bfyx_1x1",2], + "9692654253261175490": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "15529757761327002288": 
["convolution_gpu_bfyx_os_iyx_osv16",1083], + "4635570915184713874": ["convolution_gpu_bfyx_gemm_like",0], + "14248622935809594779": ["convolution_gpu_yxfb_yxio_b16",2], + "9178915201681884122": ["convolution_gpu_yxfb_yxio_b16",2], + "6143200133853000387": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "2781309272856442321": ["convolution_gpu_bfyx_1x1",2], + "12567935463143860469": ["convolution_gpu_yxfb_yxio_b16",1], + "17386047378634216634": ["convolution_gpu_yxfb_yxio_b16",2], + "3980835859526174461": ["convolution_gpu_yxfb_yxio_b16",2], + "8075261051536686307": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "11031358859656806724": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "7243917162812988891": ["convolution_gpu_bfyx_gemm_like",2], + "12054200116003751590": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "14985755375924972050": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "14712972289919865502": ["convolution_gpu_bfyx_gemm_like",2], + "3074436655804078403": ["convolution_gpu_yxfb_yxio_b16",2], + "16013560489115457872": ["convolution_gpu_yxfb_yxio_b16",2], + "1470933384474984858": ["convolution_gpu_bfyx_1x1",2], + "4533786844080178561": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "1704404203639481753": ["convolution_gpu_bfyx_gemm_like",2], + "4850497746076450913": ["convolution_gpu_bfyx_gemm_like",1], + "9485825829394109934": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "15115780248032030963": ["convolution_gpu_yxfb_yxio_b16",2], + "6631103268546309714": ["convolution_gpu_yxfb_yxio_b16",2], + "3724572174214794659": ["convolution_gpu_yxfb_yxio_b16",2], + "3364141707903132298": ["convolution_gpu_yxfb_yxio_b16",2], + "6328802691680458752": ["convolution_gpu_bfyx_gemm_like",2], + "5593329151028712439": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15112599407339712681": ["convolution_gpu_bfyx_1x1",2], + "497488185553682238": ["convolution_gpu_bfyx_gemm_like",1], + "16341131728764501904": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "8768300687476117215": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "249639220178603842": ["convolution_gpu_bfyx_gemm_like",0], + "11305232900158601613": ["convolution_gpu_bfyx_1x1",2], + "1786732163438555728": ["convolution_gpu_yxfb_yxio_b16",2], + "9153779186876518773": ["convolution_gpu_bfyx_gemm_like",2], + "7667898603371717971": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "490931535580183607": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "6051877311645456194": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "10617442099961865960": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12362290144183018227": ["convolution_gpu_yxfb_yxio_b16",2], + "5150256051921098637": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "16335738565228204503": ["convolution_gpu_yxfb_yxio_b16",2], + "3287181725010492879": ["convolution_gpu_yxfb_yxio_b16",2], + "16075006181495932250": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14805540705424073865": ["convolution_gpu_bfyx_gemm_like",2], + "18103534417093702556": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "3427691447288240419": ["convolution_gpu_yxfb_yxio_b16",2], + "10294610483561043024": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "16393176054374397767": ["convolution_gpu_bfyx_gemm_like",0], + "18187345248160481425": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "16661843849495077745": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "6439316331231400868": ["convolution_gpu_yxfb_yxio_b16",2], + "8444259010311137762": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "14731054961557547253": ["convolution_gpu_yxfb_yxio_b16",2], + 
"990199360818917334": ["convolution_gpu_yxfb_yxio_b16",2], + "18035673326929466074": ["convolution_gpu_bfyx_gemm_like",1], + "3286330985102373533": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "6654167459904026563": ["convolution_gpu_yxfb_yxio_b16",2], + "18235209540858013173": ["convolution_gpu_bfyx_1x1",2], + "1902656726461670148": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "11545529736818363243": ["convolution_gpu_yxfb_yxio_b16",2], + "5854093367753757010": ["convolution_gpu_yxfb_yxio_b16",2], + "3398322619007806698": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "7958443549125799229": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "12867177334690636800": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "1309867416606346543": ["convolution_gpu_bfyx_os_iyx_osv16",194], + "1463649546800120847": ["convolution_gpu_yxfb_yxio_b16",2], + "12600479027568241746": ["convolution_gpu_yxfb_yxio_b16",2], + "7974670633697926450": ["convolution_gpu_bfyx_1x1",2], + "15589007878875898942": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "778476198101178556": ["convolution_gpu_bfyx_gemm_like",1], + "14387756025635589673": ["convolution_gpu_bfyx_1x1",2], + "13079058582191027406": ["convolution_gpu_yxfb_yxio_b16",1], + "6214677989814002369": ["convolution_gpu_yxfb_yxio_b16",2], + "7307271009495440764": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "1082586642383386489": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "8321204816277460837": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "15488532485794545310": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "16052741298509954954": ["convolution_gpu_yxfb_yxio_b16",2], + "16541970206584576833": ["convolution_gpu_bfyx_gemm_like",2], + "15739756988784344130": ["convolution_gpu_yxfb_yxio_b16",2], + "15914058104244750036": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "7880845322716481548": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9277176009071334860": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "3438296636411972401": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16293101831324587788": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17764795635957985989": ["convolution_gpu_yxfb_yxio_b16",2], + "2936333406928424760": ["convolution_gpu_bfyx_1x1",2], + "17636500109629107732": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "10812324504777808014": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "302694026179841870": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "16135569134646688251": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "621915374938805401": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "13488495920546871271": ["convolution_gpu_bfyx_os_iyx_osv16",851], + "13042938686374926241": ["convolution_gpu_yxfb_yxio_b16",2], + "13158449455164143947": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "12221101678609734421": ["convolution_gpu_yxfb_yxio_b16",2], + "8099100633390626027": ["convolution_gpu_yxfb_yxio_b16",2], + "7780336054545552428": ["convolution_gpu_yxfb_yxio_b16",2], + "10432365444137108781": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "2800949804770763798": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8174833187387604731": ["convolution_gpu_yxfb_yxio_b16",2], + "10560559646371329711": ["convolution_gpu_bfyx_os_iyx_osv16",377], + "16589848737162195829": ["convolution_gpu_yxfb_yxio_b16",2], + "18279927175542031567": ["convolution_gpu_yxfb_yxio_b16",2], + "14363025045807200040": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "4880150897829846031": ["convolution_gpu_bfyx_1x1",2], + "7581174843529024536": ["convolution_gpu_bfyx_os_iyx_osv16",257], + 
"14808079119439455357": ["convolution_gpu_yxfb_yxio_b16",2], + "15101834579076569231": ["convolution_gpu_yxfb_yxio_b16",1], + "7400937639903461446": ["convolution_gpu_yxfb_yxio_b16",2], + "3914143598803149415": ["convolution_gpu_yxfb_yxio_b16",2], + "12550985938092975889": ["convolution_gpu_bfyx_1x1",2], + "10128143628088846123": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "8767817856303586064": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "2670216237572554944": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "973966345068677905": ["convolution_gpu_bfyx_1x1",2], + "13618411266808159341": ["convolution_gpu_yxfb_yxio_b16",2], + "13472577372534605883": ["convolution_gpu_bfyx_gemm_like",1], + "12745552951204330052": ["convolution_gpu_yxfb_yxio_b16",2], + "17361714725103230834": ["convolution_gpu_bfyx_os_iyx_osv16",528], + "9996142812492415452": ["convolution_gpu_yxfb_yxio_b16",2], + "4353583636655606632": ["convolution_gpu_yxfb_yxio_b16",2], + "14614844213016502202": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "18359731130169236059": ["convolution_gpu_yxfb_yxio_b16",2], + "1644335606100150388": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "743941460026466526": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "9942099207256025216": ["convolution_gpu_bfyx_gemm_like",1], + "9726913113016874092": ["convolution_gpu_bfyx_gemm_like",2], + "16361932270527364507": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14517191894006411358": ["convolution_gpu_yxfb_yxio_b16",2], + "9285202897230250613": ["convolution_gpu_yxfb_yxio_b16",2], + "294153950488131608": ["convolution_gpu_yxfb_yxio_b16",2], + "8846314870152404018": ["convolution_gpu_bfyx_gemm_like",2], + "15438470456977849772": ["convolution_gpu_yxfb_yxio_b16",2], + "1089944493540593798": ["convolution_gpu_bfyx_os_iyx_osv16",1092], + "11031569203645035546": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "10069896554844445748": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "6104380778870471127": ["convolution_gpu_bfyx_1x1",2], + "15052577143485630617": ["convolution_gpu_bfyx_1x1",1], + "13636407347458845915": ["convolution_gpu_yxfb_yxio_b16",2], + "15811723176266128065": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "10572380563704942622": ["convolution_gpu_yxfb_yxio_b16",0], + "11669828823444745889": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "16341700680310033430": ["fully_connected_gpu_fb_io_block_fp16",1], + "13621339501067135142": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "6769524481210107636": ["convolution_gpu_yxfb_yxio_b16",1], + "18431306649860116380": ["convolution_gpu_bfyx_gemm_like",1], + "12788611449571149037": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "9306120768594851497": ["convolution_gpu_yxfb_yxio_b16",2], + "13609660900720370993": ["convolution_gpu_bfyx_1x1",2], + "9263784636194609884": ["convolution_gpu_yxfb_yxio_b16",2], + "10693837788817206459": ["convolution_gpu_yxfb_yxio_b16",2], + "15886016297043613632": ["convolution_gpu_yxfb_yxio_b16",2], + "15178921033274918199": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2034811390140488812": ["convolution_gpu_yxfb_yxio_b16",2], + "10232809153913700925": ["convolution_gpu_yxfb_yxio_b16",2], + "17039993918927377002": ["convolution_gpu_bfyx_os_iyx_osv16",429], + "10704906466618081803": ["convolution_gpu_yxfb_yxio_b16",2], + "12309955719964788034": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "9463256538942644563": ["convolution_gpu_yxfb_yxio_b16",2], + "11888011890096886932": ["convolution_gpu_yxfb_yxio_b16",2], + "1898243736289257252": ["convolution_gpu_yxfb_yxio_b16",2], + 
"5319459637051859849": ["convolution_gpu_yxfb_yxio_b16",2], + "12637509262827320678": ["convolution_gpu_yxfb_yxio_b16",2], + "5853697372844744672": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "14568618538516685994": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "5770286476124511234": ["convolution_gpu_bfyx_gemm_like",1], + "10330180429524641331": ["convolution_gpu_bfyx_gemm_like",2], + "6334639534663495263": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "2328919599530851492": ["convolution_gpu_yxfb_yxio_b16",2], + "3349468433721705582": ["convolution_gpu_yxfb_yxio_b16",2], + "5714365398623475983": ["convolution_gpu_bfyx_1x1",2], + "16934879647229234163": ["convolution_gpu_bfyx_gemm_like",2], + "4098191685457418125": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "3940619509778739158": ["convolution_gpu_yxfb_yxio_b16",2], + "13022797264172398260": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "17010172246526353957": ["convolution_gpu_bfyx_1x1",2], + "15522099459864628246": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "3531786338249174486": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "6845814820599174031": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "15796677813117622429": ["convolution_gpu_bfyx_gemm_like",2], + "4342446399224806160": ["convolution_gpu_yxfb_yxio_b16",2], + "1325669650629605592": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15678768217453692725": ["convolution_gpu_yxfb_yxio_b16",2], + "15227189929676013024": ["convolution_gpu_yxfb_yxio_b16",2], + "9237587440336828595": ["convolution_gpu_yxfb_yxio_b16",2], + "1680468564927032670": ["convolution_gpu_bfyx_gemm_like",1], + "15267084369543546013": ["convolution_gpu_yxfb_yxio_b16",2], + "4740864135937875560": ["convolution_gpu_yxfb_yxio_b16",2], + "4839205075057964902": ["convolution_gpu_yxfb_yxio_b16",2], + "738850098651678143": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "17266121859044814533": ["convolution_gpu_yxfb_yxio_b16",2], + "9526266653688168429": ["convolution_gpu_yxfb_yxio_b16",2], + "9062774198518904260": ["convolution_gpu_bfyx_gemm_like",2], + "4400247897123856252": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "10893628699015898230": ["convolution_gpu_yxfb_yxio_b16",2], + "11845189428639322474": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "7669403041163460089": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "3056212889689424946": ["convolution_gpu_bfyx_1x1",2], + "5840254078917931433": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17826868890632814593": ["convolution_gpu_yxfb_yxio_b16",2], + "8466986812935642059": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "12264240305528403865": ["convolution_gpu_yxfb_yxio_b16",2], + "13914239937595549448": ["convolution_gpu_yxfb_yxio_b16",2], + "4416793079965040181": ["convolution_gpu_yxfb_yxio_b16",2], + "462240909302334133": ["convolution_gpu_yxfb_yxio_b16",2], + "9162469583721135043": ["convolution_gpu_yxfb_yxio_b16",1], + "13101474064130881526": ["convolution_gpu_yxfb_yxio_b16",2], + "12081698011407453832": ["convolution_gpu_yxfb_yxio_b16",2], + "4282756088824939292": ["convolution_gpu_yxfb_yxio_b16",2], + "11315238071192463859": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16511749893955141055": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "6669808855737023569": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "10782611933832492335": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "16384186388687043048": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "17490188677223978661": ["convolution_gpu_bfyx_gemm_like",1], + "1198893312653197535": 
["convolution_gpu_yxfb_yxio_b16",2], + "13075579052866074866": ["convolution_gpu_bfyx_gemm_like",2], + "3370082268529091875": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "14887465694301281952": ["convolution_gpu_yxfb_yxio_b16",2], + "10782169939706303899": ["convolution_gpu_yxfb_yxio_b16",1], + "11606895513516475339": ["convolution_gpu_yxfb_yxio_b16",2], + "4628748977913534701": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "12473600360154597915": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "16768497046700403748": ["convolution_gpu_yxfb_yxio_b16",2], + "430132942408244070": ["convolution_gpu_bfyx_gemm_like",2], + "18121198117765854866": ["convolution_gpu_bfyx_1x1",1], + "7203620615363933078": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "7826406759309418010": ["convolution_gpu_yxfb_yxio_b16",2], + "1919535500129437217": ["convolution_gpu_yxfb_yxio_b16",2], + "167635075964111628": ["convolution_gpu_yxfb_yxio_b16",2], + "8652128863605749877": ["convolution_gpu_yxfb_yxio_b16",2], + "8115522418294960470": ["convolution_gpu_yxfb_yxio_b16",2], + "3830842631023415233": ["convolution_gpu_yxfb_yxio_b16",2], + "17811558714592064184": ["convolution_gpu_yxfb_yxio_b16",2], + "12003323477818208825": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "1626430741965136732": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "14699357144600604190": ["convolution_gpu_yxfb_yxio_b16",2], + "13408839571805750778": ["convolution_gpu_yxfb_yxio_b16",2], + "3211829722778368758": ["convolution_gpu_yxfb_yxio_b16",2], + "9406763539724266157": ["convolution_gpu_bfyx_1x1",2], + "16870036853278751563": ["convolution_gpu_yxfb_yxio_b16",2], + "3391032227732782982": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "12256193738921380409": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "17053671692908867872": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "7578177053220150569": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "4280250278457269231": ["convolution_gpu_yxfb_yxio_b16",2], + "2931988747601319855": ["convolution_gpu_bfyx_1x1",2], + "8898095926967052382": ["convolution_gpu_yxfb_yxio_b16",2], + "3118602494449249177": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "15833461718320604065": ["convolution_gpu_bfyx_gemm_like",2], + "15078168059698267650": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "2269140636553245446": ["convolution_gpu_yxfb_yxio_b16",1], + "4290840152278060614": ["convolution_gpu_bfyx_gemm_like",2], + "9363988379673156863": ["convolution_gpu_yxfb_yxio_b16",2], + "14808759315730413993": ["convolution_gpu_yxfb_yxio_b16",2], + "12992061224471212714": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16582761411084080015": ["convolution_gpu_yxfb_yxio_b16",2], + "4461989328775275994": ["convolution_gpu_bfyx_gemm_like",2], + "15493488989417521388": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "17970424536559595893": ["convolution_gpu_yxfb_yxio_b16",2], + "12725675221990905186": ["convolution_gpu_bfyx_gemm_like",2], + "10701208905236219083": ["convolution_gpu_yxfb_yxio_b16",2], + "4752129805031267391": ["convolution_gpu_yxfb_yxio_b16",2], + "10100237101982273901": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "6131481289104111211": ["convolution_gpu_bfyx_gemm_like",2], + "378801963103874857": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "5422432655714154738": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "12978370505631031751": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "12681408370704556588": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13302687772426736346": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "9073757008455674094": 
["convolution_gpu_yxfb_yxio_b16",2], + "10014448860206587805": ["convolution_gpu_bfyx_gemm_like",2], + "762634810164167963": ["convolution_gpu_yxfb_yxio_b16",2], + "8353259929933281349": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "69832608384091511": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "14020956765444878761": ["convolution_gpu_bfyx_gemm_like",2], + "9120377367517042357": ["convolution_gpu_bfyx_1x1",2], + "15968821946892330559": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "1752185056297124917": ["convolution_gpu_bfyx_1x1",1], + "6863331059471727622": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "3265415000818832667": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15959543980008442942": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "9091110033424983286": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "4124478505694604763": ["convolution_gpu_bfyx_1x1",2], + "12137340921829511472": ["convolution_gpu_yxfb_yxio_b16",2], + "16620268338434572068": ["convolution_gpu_yxfb_yxio_b16",2], + "6102330514901613158": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9737833587413114584": ["convolution_gpu_bfyx_gemm_like",1], + "10006197783106691106": ["convolution_gpu_bfyx_os_iyx_osv16",270], + "15726902746983125797": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "14337168375989245254": ["convolution_gpu_yxfb_yxio_b16",2], + "708201295462256406": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "7407975398526425554": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "16312739695844838884": ["convolution_gpu_yxfb_yxio_b16",2], + "17188004018198554470": ["convolution_gpu_yxfb_yxio_b16",2], + "142650579335909103": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "9101903304994333336": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "7121708962074176240": ["convolution_gpu_bfyx_1x1",2], + "12268912077694742671": ["convolution_gpu_yxfb_yxio_b16",2], + "17536482873064844308": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "17109520309574369561": ["convolution_gpu_bfyx_gemm_like",2], + "10997156099709436375": ["convolution_gpu_yxfb_yxio_b16",2], + "16441830491664937048": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "16944335478353845609": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "5526223938481098693": ["convolution_gpu_yxfb_yxio_b16",2], + "16666792471632326054": ["convolution_gpu_bfyx_gemm_like",2], + "17209528805596238905": ["convolution_gpu_bfyx_gemm_like",2], + "8094920912208664820": ["convolution_gpu_yxfb_yxio_b16",2], + "15879385408480411034": ["convolution_gpu_yxfb_yxio_b16",2], + "2530317332900569142": ["convolution_gpu_bfyx_os_iyx_osv16",430], + "17724604495865223459": ["convolution_gpu_bfyx_gemm_like",2], + "17479773641824222843": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "14264584839702225855": ["convolution_gpu_yxfb_yxio_b16",1], + "7082007579524697455": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "4633923265089466898": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "1933147648540963732": ["convolution_gpu_yxfb_yxio_b16",2], + "6926590672771069689": ["convolution_gpu_yxfb_yxio_b16",2], + "17446505012657609153": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1237920404306733800": ["convolution_gpu_bfyx_gemm_like",2], + "10899110544832584656": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "7469107606686458209": ["convolution_gpu_yxfb_yxio_b16",2], + "7720153213673170931": ["convolution_gpu_yxfb_yxio_b16",2], + "12864204111424196179": ["convolution_gpu_bfyx_1x1",2], + "15814015810740458605": ["convolution_gpu_bfyx_1x1",2], + "16988275131627316108": ["convolution_gpu_bfyx_os_iyx_osv16",584], + 
"9767294641786972359": ["convolution_gpu_bfyx_gemm_like",2], + "13809046727894108358": ["convolution_gpu_yxfb_yxio_b16",2], + "14944590179685661287": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "13883044928774243663": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "11298854310398101852": ["convolution_gpu_yxfb_yxio_b16",2], + "17672785701483179117": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "6921081008428242060": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "8170998059688907013": ["convolution_gpu_bfyx_1x1",2], + "13289438471364352634": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "9456645866001656225": ["convolution_gpu_yxfb_yxio_b16",2], + "1718634913016284523": ["convolution_gpu_bfyx_1x1",2], + "6022695488769618639": ["convolution_gpu_yxfb_yxio_b16",2], + "12213908871711628660": ["convolution_gpu_yxfb_yxio_b16",2], + "12577421746159122264": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "3574585436812909168": ["convolution_gpu_yxfb_yxio_b16",2], + "11666250400445971335": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "4772696293208603817": ["convolution_gpu_bfyx_gemm_like",1], + "12514693341682532560": ["convolution_gpu_bfyx_os_iyx_osv16",152], + "7868973874302246233": ["convolution_gpu_bfyx_gemm_like",2], + "14791575777969587370": ["convolution_gpu_yxfb_yxio_b16",2], + "1173986078589662704": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "11455055202624479980": ["convolution_gpu_yxfb_yxio_b16",2], + "14120569486714455490": ["convolution_gpu_yxfb_yxio_b16",1], + "13636859714649629789": ["convolution_gpu_yxfb_yxio_b16",2], + "101401523793806394": ["convolution_gpu_bfyx_gemm_like",2], + "7498614018449036163": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "15597317305719116351": ["convolution_gpu_yxfb_yxio_b16",2], + "16117448559783537844": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "9477562342190423343": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "9694701402170070080": ["convolution_gpu_yxfb_yxio_b16",2], + "13943983517468412332": ["convolution_gpu_yxfb_yxio_b16",2], + "13183380647506951324": ["convolution_gpu_bfyx_gemm_like",0], + "18270587701371596297": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "15924583510704449214": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "9120374653477510318": ["convolution_gpu_yxfb_yxio_b16",2], + "1044978617045366709": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "16027853590391209100": ["convolution_gpu_bfyx_gemm_like",2], + "13576010631084066792": ["convolution_gpu_yxfb_yxio_b16",2], + "7465681710653503161": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "11153522012082333137": ["convolution_gpu_yxfb_yxio_b16",2], + "17248329632819747646": ["convolution_gpu_yxfb_yxio_b16",2], + "12334522314915706512": ["convolution_gpu_yxfb_yxio_b16",2], + "14065215389112262561": ["convolution_gpu_yxfb_yxio_b16",2], + "5401380444992462053": ["convolution_gpu_yxfb_yxio_b16",2], + "8300290944865904942": ["convolution_gpu_yxfb_yxio_b16",2], + "10996429218747311159": ["convolution_gpu_yxfb_yxio_b16",2], + "9059418187274548462": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "15530407024531326375": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "9542325095876448686": ["convolution_gpu_bfyx_gemm_like",1], + "15428591250165788477": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "18122858611264877646": ["convolution_gpu_bfyx_gemm_like",1], + "9588943054777767098": ["convolution_gpu_yxfb_yxio_b16",0], + "12353956380178079089": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7332664632757815486": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "15231987838322151865": 
["convolution_gpu_bfyx_1x1",2], + "11626402549863483301": ["convolution_gpu_bfyx_gemm_like",2], + "6673966852801136416": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "10722677916294015259": ["convolution_gpu_bfyx_gemm_like",2], + "9951951467222189282": ["convolution_gpu_yxfb_yxio_b16",2], + "1387945708447092123": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "5671289201458690944": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "7450417963648518926": ["convolution_gpu_bfyx_gemm_like",2], + "6312971928547466668": ["convolution_gpu_bfyx_os_iyx_osv16",880], + "1173136780324694038": ["convolution_gpu_yxfb_yxio_b16",2], + "13387545865482261974": ["convolution_gpu_bfyx_os_iyx_osv16",805], + "11871319147579477936": ["convolution_gpu_yxfb_yxio_b16",2], + "2173720698351153121": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "12213354854947437262": ["convolution_gpu_bfyx_1x1",2], + "16588325081458426169": ["convolution_gpu_bfyx_gemm_like",2], + "16469493066700118274": ["convolution_gpu_yxfb_yxio_b16",2], + "13119040261291835298": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12616205756849913359": ["convolution_gpu_yxfb_yxio_b16",2], + "4010419602093863685": ["convolution_gpu_yxfb_yxio_b16",2], + "8163000689380461611": ["convolution_gpu_yxfb_yxio_b16",2], + "17158401628206867933": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "14365232561737454031": ["convolution_gpu_bfyx_os_iyx_osv16",804], + "4500107195684703428": ["convolution_gpu_yxfb_yxio_b16",2], + "14343008518525689150": ["convolution_gpu_bfyx_1x1",2], + "1075027491444288875": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "970768445746568749": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "2321767794934000238": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "7211179360844946434": ["convolution_gpu_bfyx_os_iyx_osv16",905], + "9468684953949274635": ["convolution_gpu_bfyx_gemm_like",0], + "2930898141522848681": ["convolution_gpu_bfyx_1x1",2], + "1142968634734769401": ["convolution_gpu_yxfb_yxio_b16",2], + "15159534367247036982": ["convolution_gpu_yxfb_yxio_b16",2], + "18131954418490925431": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5602377914578322577": ["convolution_gpu_yxfb_yxio_b16",2], + "16781187505186394353": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15695415285791951018": ["convolution_gpu_bfyx_gemm_like",2], + "16772854836230971016": ["convolution_gpu_bfyx_os_iyx_osv16",674], + "16409729623371222748": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "4121109463284708890": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "8267783192628619295": ["convolution_gpu_yxfb_yxio_b16",2], + "2111669705686676421": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "17077815973022307612": ["convolution_gpu_yxfb_yxio_b16",2], + "9775648000771985077": ["convolution_gpu_yxfb_yxio_b16",2], + "3321251856445833973": ["convolution_gpu_yxfb_yxio_b16",2], + "13161997040644039778": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "12757611260347801001": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "7986797517722531256": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6109013751635776331": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "8092673566670222445": ["convolution_gpu_yxfb_yxio_b16",2], + "4141005390823981166": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "18243724217479803107": ["convolution_gpu_yxfb_yxio_b16",2], + "16986358655784856534": ["convolution_gpu_bfyx_os_iyx_osv16",724], + "12248852114219058572": ["convolution_gpu_bfyx_os_iyx_osv16",905], + "18132952464279667664": ["convolution_gpu_bfyx_1x1",2], + "2162882863309264684": 
["convolution_gpu_bfyx_os_iyx_osv16",1045], + "13282951481330978659": ["convolution_gpu_bfyx_os_iyx_osv16",726], + "14675165976583799157": ["convolution_gpu_yxfb_yxio_b16",2], + "3837190939606792435": ["fully_connected_gpu_fb_io_block_fp16",1], + "11975047184326016230": ["convolution_gpu_bfyx_gemm_like",2], + "10785252006948647963": ["convolution_gpu_yxfb_yxio_b16",2], + "15958886009743157242": ["convolution_gpu_bfyx_gemm_like",2], + "1218323229202187514": ["convolution_gpu_bfyx_gemm_like",2], + "10720769054729185991": ["convolution_gpu_yxfb_yxio_b16",2], + "9423854233835016530": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "3141886504884887200": ["convolution_gpu_bfyx_gemm_like",2], + "6558436237075337721": ["convolution_gpu_yxfb_yxio_b16",1], + "5010119207726811326": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "10939522663236304689": ["convolution_gpu_yxfb_yxio_b16",2], + "14362876471450307424": ["convolution_gpu_bfyx_1x1",2], + "16949056117405140365": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "13456967132681889167": ["convolution_gpu_yxfb_yxio_b16",2], + "11612044653200304877": ["convolution_gpu_yxfb_yxio_b16",2], + "4776685525963461501": ["convolution_gpu_yxfb_yxio_b16",2], + "15728009639807698634": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "1895560603400089814": ["convolution_gpu_yxfb_yxio_b16",2], + "11948858355027908365": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7541325258238317885": ["convolution_gpu_yxfb_yxio_b16",2], + "688897645422834994": ["convolution_gpu_yxfb_yxio_b16",1], + "7333511810266504718": ["convolution_gpu_bfyx_os_iyx_osv16",970], + "8125500765566111746": ["convolution_gpu_yxfb_yxio_b16",2], + "863057075064640334": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "73865742350616903": ["convolution_gpu_bfyx_gemm_like",1], + "9250030880535336888": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11908169713247209976": ["convolution_gpu_yxfb_yxio_b16",2], + "7493567975736494003": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "13207134083675064956": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "952318454591754214": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "16292848987976256449": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "1527126728636583082": ["convolution_gpu_yxfb_yxio_b16",2], + "4391695940614024479": ["convolution_gpu_yxfb_yxio_b16",2], + "17012832508134584917": ["convolution_gpu_yxfb_yxio_b16",2], + "14697908554930995949": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "14258941821319200170": ["convolution_gpu_yxfb_yxio_b16",2], + "867868384380428650": ["convolution_gpu_yxfb_yxio_b16",2], + "3647203315640064927": ["convolution_gpu_yxfb_yxio_b16",2], + "11587239927319376658": ["convolution_gpu_bfyx_os_iyx_osv16",301], + "2367452220382767844": ["convolution_gpu_yxfb_yxio_b16",2], + "2770397466252831892": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "8614534946699754256": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2807516818436584831": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "3515437649977762166": ["convolution_gpu_bfyx_gemm_like",1], + "17764033613416389758": ["convolution_gpu_bfyx_gemm_like",1], + "17777248703109395158": ["convolution_gpu_yxfb_yxio_b16",2], + "5592526760253524303": ["convolution_gpu_bfyx_os_iyx_osv16",48], + "13633048912926365931": ["convolution_gpu_yxfb_yxio_b16",1], + "9205978149692979955": ["convolution_gpu_bfyx_gemm_like",2], + "16925721317097534009": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3364467044587904559": ["convolution_gpu_yxfb_yxio_b16",2], + "9452470718398027950": 
["convolution_gpu_bfyx_os_iyx_osv16",928], + "18148431787172327554": ["convolution_gpu_yxfb_yxio_b16",2], + "1485662490111767875": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "959260710517842876": ["convolution_gpu_bfyx_gemm_like",2], + "14171139920084409181": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "6114241186364821679": ["convolution_gpu_bfyx_gemm_like",2], + "14764715930784496165": ["convolution_gpu_bfyx_gemm_like",2], + "15003778740401601065": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15666720796968090760": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "13862199647000195451": ["convolution_gpu_yxfb_yxio_b16",2], + "14689423748560749566": ["fully_connected_gpu_fb_oi_ref",2], + "9190054801124577726": ["convolution_gpu_yxfb_yxio_b16",2], + "4818598834950786080": ["convolution_gpu_yxfb_yxio_b16",2], + "3894121333485095575": ["convolution_gpu_yxfb_yxio_b16",2], + "5436553435132026991": ["convolution_gpu_yxfb_yxio_b16",2], + "4444730303823507621": ["convolution_gpu_bfyx_gemm_like",0], + "12526988667216482085": ["convolution_gpu_yxfb_yxio_b16",2], + "3870539490799697188": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "4588117321438490483": ["convolution_gpu_yxfb_yxio_b16",2], + "4834446692898125871": ["convolution_gpu_bfyx_gemm_like",2], + "14058311587429063829": ["convolution_gpu_yxfb_yxio_b16",2], + "3423717644513543253": ["convolution_gpu_yxfb_yxio_b16",2], + "10524079700393212963": ["convolution_gpu_yxfb_yxio_b16",2], + "6875055157295709098": ["convolution_gpu_yxfb_yxio_b16",2], + "14749758365915995876": ["convolution_gpu_yxfb_yxio_b16",2], + "3573490922300056520": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "11226912053840621089": ["convolution_gpu_yxfb_yxio_b16",2], + "16958329690837977102": ["convolution_gpu_bfyx_gemm_like",2], + "11179211757115972103": ["convolution_gpu_bfyx_os_iyx_osv16",482], + "2968439898708528834": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "708747442142592697": ["convolution_gpu_bfyx_gemm_like",2], + "5931972000452008090": ["convolution_gpu_yxfb_yxio_b16",2], + "1336940384521633733": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "9542795021683486547": ["convolution_gpu_yxfb_yxio_b16",2], + "15578456771467281881": ["convolution_gpu_bfyx_gemm_like",1], + "14742998604680438008": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "11612209645710419427": ["convolution_gpu_yxfb_yxio_b16",2], + "11759426200341586247": ["convolution_gpu_bfyx_os_iyx_osv16",1], + "8456185296386225533": ["convolution_gpu_yxfb_yxio_b16",2], + "7178866013527118649": ["convolution_gpu_yxfb_yxio_b16",2], + "3503893875515897267": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "4023281997496669037": ["convolution_gpu_yxfb_yxio_b16",2], + "12411075288896909468": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6286349307417232815": ["convolution_gpu_yxfb_yxio_b16",2], + "10531218595816974659": ["convolution_gpu_bfyx_gemm_like",2], + "9654944848074437064": ["convolution_gpu_bfyx_os_iyx_osv16",1089], + "17868834743037242721": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "17207560805775399864": ["convolution_gpu_bfyx_gemm_like",1], + "10070051133200561606": ["convolution_gpu_yxfb_yxio_b16",2], + "18027243127893440568": ["convolution_gpu_yxfb_yxio_b16",2], + "13598062803968442253": ["convolution_gpu_yxfb_yxio_b16",2], + "12417253210787537988": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "7208008921815475393": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7397376454528841634": ["convolution_gpu_yxfb_yxio_b16",2], + "3924212595662208655": 
["convolution_gpu_yxfb_yxio_b16",2], + "2939605281692583169": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "16653412888821076903": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15805087418686802636": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "1972879521448306536": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "10055549084854766170": ["convolution_gpu_bfyx_os_iyx_osv16",970], + "14311888412221174224": ["convolution_gpu_yxfb_yxio_b16",2], + "1498389965422474930": ["convolution_gpu_yxfb_yxio_b16",2], + "12339692995143159283": ["convolution_gpu_bfyx_gemm_like",2], + "4849343880559509889": ["convolution_gpu_bfyx_1x1",2], + "2296581485980163665": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15438530452161762045": ["convolution_gpu_yxfb_yxio_b16",0], + "8837721075413149240": ["convolution_gpu_bfyx_gemm_like",1], + "6126579157025017808": ["convolution_gpu_yxfb_yxio_b16",2], + "10995886682834858002": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "11685571068419983048": ["convolution_gpu_bfyx_1x1",2], + "18186612931984342471": ["convolution_gpu_yxfb_yxio_b16",2], + "632116056424249698": ["convolution_gpu_bfyx_gemm_like",1], + "10486348549691280032": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "530973311459168543": ["convolution_gpu_yxfb_yxio_b16",2], + "11277866878590984477": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "7206226541369793931": ["convolution_gpu_yxfb_yxio_b16",2], + "9562527071055150197": ["convolution_gpu_bfyx_1x1",2], + "5852569526295779497": ["convolution_gpu_yxfb_yxio_b16",2], + "9433162648796382333": ["convolution_gpu_yxfb_yxio_b16",2], + "6988492019664525206": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "15078262396281327048": ["convolution_gpu_bfyx_gemm_like",1], + "16111630594575598044": ["convolution_gpu_yxfb_yxio_b16",2], + "13200834963067135502": ["fully_connected_gpu_fb_oi_ref",1], + "741727668385951462": ["convolution_gpu_yxfb_yxio_b16",2], + "11359409533744011242": ["convolution_gpu_bfyx_gemm_like",2], + "17092525789052598917": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "16535378085465418910": ["convolution_gpu_yxfb_yxio_b16",2], + "15770767768674603174": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "7247475218645942682": ["convolution_gpu_yxfb_yxio_b16",2], + "1141277975467180549": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "12181889163404078773": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "3934090072734175564": ["convolution_gpu_yxfb_yxio_b16",2], + "14769111376729628572": ["convolution_gpu_yxfb_yxio_b16",2], + "13124342334495538095": ["convolution_gpu_bfyx_gemm_like",2], + "10184417796355593956": ["convolution_gpu_yxfb_yxio_b16",2], + "4079026972040047969": ["convolution_gpu_bfyx_gemm_like",2], + "17848582668902427291": ["convolution_gpu_yxfb_yxio_b16",2], + "12455871938978342189": ["convolution_gpu_yxfb_yxio_b16",2], + "16585502133291740543": ["convolution_gpu_yxfb_yxio_b16",2], + "223412492545617963": ["convolution_gpu_yxfb_yxio_b16",2], + "17640725195881101275": ["convolution_gpu_bfyx_gemm_like",2], + "18322435770607273817": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "5349415632630235233": ["convolution_gpu_bfyx_1x1",2], + "11596971301790598405": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "17195293614280872622": ["convolution_gpu_yxfb_yxio_b16",2], + "4982549855424649217": ["convolution_gpu_yxfb_yxio_b16",0], + "2191416057399400794": ["convolution_gpu_yxfb_yxio_b16",2], + "5539793555189956907": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "8931469268093714938": ["convolution_gpu_yxfb_yxio_b16",2], + "951747146164097188": 
["convolution_gpu_bfyx_1x1",2], + "17983556812075120553": ["convolution_gpu_bfyx_1x1",2], + "2542112741645712811": ["fully_connected_gpu_fb_io_block_fp16",1], + "5497751772699578150": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8317673282128335201": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "1662588605309237309": ["convolution_gpu_yxfb_yxio_b16",2], + "14646141746558153748": ["convolution_gpu_yxfb_yxio_b16",2], + "3221469860582147955": ["convolution_gpu_bfyx_gemm_like",2], + "252188028702250668": ["convolution_gpu_yxfb_yxio_b16",2], + "17635171685500922207": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "10322427853063201289": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "18432421400879260832": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "7612288596055048389": ["convolution_gpu_yxfb_yxio_b16",2], + "16992405636352406660": ["convolution_gpu_bfyx_gemm_like",2], + "6014752258124559691": ["convolution_gpu_yxfb_yxio_b16",2], + "11822555173696078282": ["convolution_gpu_bfyx_gemm_like",1], + "2531597468539205600": ["convolution_gpu_yxfb_yxio_b16",2], + "4708035980731751007": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "4006884370026272807": ["convolution_gpu_bfyx_gemm_like",2], + "9165817820007469505": ["convolution_gpu_yxfb_yxio_b16",2], + "18093895673012393740": ["convolution_gpu_yxfb_yxio_b16",2], + "1171681987783013074": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "42935035304560876": ["convolution_gpu_yxfb_yxio_b16",2], + "2744566213784972700": ["convolution_gpu_yxfb_yxio_b16",2], + "9815961128076948768": ["fully_connected_gpu_fb_io_block_fp16",0], + "5834825835421819800": ["convolution_gpu_yxfb_yxio_b16",2], + "12727541507197887360": ["convolution_gpu_bfyx_os_iyx_osv16",1023], + "5328004363712610999": ["convolution_gpu_yxfb_yxio_b16",2], + "11824946481875102910": ["convolution_gpu_yxfb_yxio_b16",2], + "9523941899498458600": ["convolution_gpu_yxfb_yxio_b16",2], + "9999553425206328238": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "10914921540144371519": ["convolution_gpu_bfyx_gemm_like",1], + "9545968464906009869": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8780671766122887951": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "9659837320293869285": ["convolution_gpu_yxfb_yxio_b16",2], + "7600034850149968684": ["convolution_gpu_yxfb_yxio_b16",2], + "15581997249051127645": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "1787152688807233651": ["convolution_gpu_yxfb_yxio_b16",2], + "9309173544512377803": ["convolution_gpu_yxfb_yxio_b16",2], + "8257103926661643451": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2715447739580688669": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "8260073247636023575": ["convolution_gpu_yxfb_yxio_b16",2], + "13124659308711651699": ["convolution_gpu_bfyx_gemm_like",2], + "15612334131144235342": ["convolution_gpu_yxfb_yxio_b16",2], + "3635446784873718932": ["convolution_gpu_bfyx_gemm_like",2], + "269167598200943915": ["convolution_gpu_yxfb_yxio_b16",2], + "314054598858070952": ["convolution_gpu_bfyx_gemm_like",2], + "1802510952374368682": ["convolution_gpu_yxfb_yxio_b16",2], + "7469127846325904854": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "8106738346643994005": ["convolution_gpu_bfyx_gemm_like",2], + "15337841577110104431": ["convolution_gpu_yxfb_yxio_b16",2], + "7590390572139249734": ["convolution_gpu_yxfb_yxio_b16",2], + "14918482938530107806": ["convolution_gpu_bfyx_gemm_like",1], + "2783577080556699089": ["convolution_gpu_bfyx_gemm_like",1], + "16888412539296862194": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "5449117614287394433": 
["convolution_gpu_yxfb_yxio_b16",2], + "18017913952946745878": ["convolution_gpu_bfyx_gemm_like",1], + "8999570321113443117": ["convolution_gpu_yxfb_yxio_b16",2], + "16347412180100581330": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3463206409786541741": ["convolution_gpu_yxfb_yxio_b16",2], + "1354199155380786906": ["convolution_gpu_yxfb_yxio_b16",2], + "3894130445933963911": ["convolution_gpu_yxfb_yxio_b16",2], + "1634884284544380004": ["convolution_gpu_yxfb_yxio_b16",2], + "12818786388125465101": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14847662630748580880": ["convolution_gpu_yxfb_yxio_b16",2], + "10961696014697611547": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "13493119419114659706": ["convolution_gpu_yxfb_yxio_b16",2], + "12776081190690731910": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "7440953406601377619": ["convolution_gpu_yxfb_yxio_b16",2], + "7650375560336513366": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "14403132596827435096": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "18135307303959376082": ["convolution_gpu_bfyx_gemm_like",2], + "2116913943188857359": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "11120846960057008937": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "3019864917236424168": ["convolution_gpu_yxfb_yxio_b16",2], + "3806761527342944195": ["convolution_gpu_bfyx_gemm_like",2], + "11254744277059719812": ["convolution_gpu_yxfb_yxio_b16",2], + "3011188207492335920": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16120120950870908964": ["convolution_gpu_yxfb_yxio_b16",2], + "7349880498513046830": ["convolution_gpu_bfyx_1x1",2], + "13144385730409574259": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "16548491024653039967": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "15188570678726970998": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "8609939102588915855": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "12926382190254407283": ["convolution_gpu_yxfb_yxio_b16",2], + "3859139031732555228": ["convolution_gpu_yxfb_yxio_b16",2], + "8792202318168046223": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1518270620354036926": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "447943521999310356": ["convolution_gpu_yxfb_yxio_b16",2], + "14043064718932538557": ["convolution_gpu_yxfb_yxio_b16",2], + "16475247464223458061": ["convolution_gpu_bfyx_gemm_like",2], + "13006774775034887171": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "60267878504897170": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "7565867291827884997": ["convolution_gpu_bfyx_gemm_like",2], + "15106614232165315070": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "2369451367723962073": ["convolution_gpu_bfyx_1x1",2], + "2114599010013594942": ["convolution_gpu_bfyx_gemm_like",1], + "7815650257256675477": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "17617204422090117691": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5003718302026277632": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "12896226291465522304": ["convolution_gpu_yxfb_yxio_b16",2], + "11317843493537672866": ["convolution_gpu_yxfb_yxio_b16",2], + "11972290239275366299": ["convolution_gpu_yxfb_yxio_b16",2], + "9735280865199145311": ["convolution_gpu_yxfb_yxio_b16",2], + "3172518362830684966": ["convolution_gpu_yxfb_yxio_b16",2], + "12933253554354951910": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "8527055001340219573": ["convolution_gpu_yxfb_yxio_b16",2], + "10565371760124443824": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "637115537820955017": ["convolution_gpu_yxfb_yxio_b16",2], + "15303251546207338960": 
["convolution_gpu_yxfb_yxio_b16",2], + "15961487889420208188": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "6511742759171254447": ["convolution_gpu_yxfb_yxio_b16",2], + "16924006268301179157": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "3699344686791530101": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4138968242532400395": ["convolution_gpu_bfyx_gemm_like",1], + "5330130011321223525": ["convolution_gpu_yxfb_yxio_b16",2], + "2477849395789783501": ["convolution_gpu_bfyx_gemm_like",2], + "16667887002111125871": ["convolution_gpu_bfyx_gemm_like",2], + "8532217744217419503": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "12319073009094248232": ["convolution_gpu_bfyx_gemm_like",2], + "13833960927635646899": ["convolution_gpu_bfyx_gemm_like",1], + "12825407709419526493": ["convolution_gpu_yxfb_yxio_b16",2], + "6294240435687565243": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "12379166764490359144": ["convolution_gpu_yxfb_yxio_b16",2], + "13676654389512816868": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6118737381591369532": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12793347723828876280": ["convolution_gpu_yxfb_yxio_b16",2], + "11263540528012919947": ["convolution_gpu_bfyx_1x1",2], + "14544219140091420262": ["convolution_gpu_bfyx_gemm_like",1], + "1157069349112113377": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13861223834466385546": ["convolution_gpu_bfyx_gemm_like",1], + "3385797925880519845": ["convolution_gpu_bfyx_1x1",2], + "7432142107544210174": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "548663565933738403": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "15398380328746287438": ["convolution_gpu_bfyx_os_iyx_osv16",641], + "5291817530552764387": ["convolution_gpu_yxfb_yxio_b16",2], + "3231651468686543808": ["convolution_gpu_bfyx_os_iyx_osv16",528], + "1088710562928089772": ["convolution_gpu_yxfb_yxio_b16",2], + "9414927552739380436": ["convolution_gpu_yxfb_yxio_b16",2], + "7565006185780806333": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "1497560475414454618": ["convolution_gpu_bfyx_gemm_like",0], + "768820004084041271": ["convolution_gpu_yxfb_yxio_b16",2], + "14346466672686303107": ["convolution_gpu_yxfb_yxio_b16",2], + "14416897092729861207": ["convolution_gpu_yxfb_yxio_b16",2], + "18209930746627816139": ["convolution_gpu_yxfb_yxio_b16",2], + "13531892014108749846": ["convolution_gpu_yxfb_yxio_b16",2], + "7463517383354309469": ["convolution_gpu_bfyx_gemm_like",0], + "17030051116023319382": ["convolution_gpu_yxfb_yxio_b16",2], + "11497761673211348612": ["convolution_gpu_yxfb_yxio_b16",2], + "10555835101752189454": ["convolution_gpu_yxfb_yxio_b16",2], + "4644580321919256401": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "11971736882960844905": ["convolution_gpu_yxfb_yxio_b16",2], + "14175962333785791005": ["convolution_gpu_yxfb_yxio_b16",2], + "12131461096501477069": ["convolution_gpu_yxfb_yxio_b16",2], + "8640150341228170279": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "10488269059469838160": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "7264274394359484318": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "12771841901357553928": ["convolution_gpu_yxfb_yxio_b16",2], + "14813178380338948912": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "2576773809294607971": ["convolution_gpu_yxfb_yxio_b16",2], + "6685985905221810743": ["convolution_gpu_yxfb_yxio_b16",2], + "2487679091192300910": ["convolution_gpu_yxfb_yxio_b16",2], + "2863465257341735941": ["convolution_gpu_bfyx_1x1",2], + "5374664689223295796": ["convolution_gpu_bfyx_os_iyx_osv16",595], + 
"6141193842171342687": ["convolution_gpu_yxfb_yxio_b16",2], + "844576097677576405": ["convolution_gpu_yxfb_yxio_b16",2], + "290134020607738418": ["convolution_gpu_bfyx_gemm_like",1], + "6692085187697087807": ["convolution_gpu_bfyx_os_iyx_osv16",41], + "17634966178519099371": ["convolution_gpu_bfyx_1x1",2], + "6101196122606108273": ["convolution_gpu_bfyx_gemm_like",1], + "17224655686568797096": ["convolution_gpu_yxfb_yxio_b16",2], + "14116800584981026541": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "9500850790449116723": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "17651477639302255490": ["convolution_gpu_yxfb_yxio_b16",2], + "3017824560305532066": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "4651261398203912503": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "1638858323987412931": ["convolution_gpu_yxfb_yxio_b16",2], + "577842450575835175": ["convolution_gpu_yxfb_yxio_b16",2], + "3156783219125679946": ["convolution_gpu_bfyx_1x1",2], + "9131183544020825260": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "17587625589456309495": ["convolution_gpu_yxfb_yxio_b16",2], + "17133376737554844449": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "10055247339012492459": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "18164706399147697716": ["convolution_gpu_yxfb_yxio_b16",1], + "3819990462129075757": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "7927587739463421727": ["convolution_gpu_yxfb_yxio_b16",2], + "11732321796147239597": ["convolution_gpu_yxfb_yxio_b16",2], + "11795826875463204296": ["convolution_gpu_bfyx_1x1",2], + "15747571668131081693": ["convolution_gpu_yxfb_yxio_b16",0], + "574869992355132069": ["convolution_gpu_bfyx_gemm_like",2], + "11724225282274130518": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "3061372669831947873": ["convolution_gpu_yxfb_yxio_b16",2], + "4190912926126844643": ["convolution_gpu_bfyx_1x1",2], + "11759322316883943989": ["convolution_gpu_yxfb_yxio_b16",2], + "14280056365441354869": ["convolution_gpu_yxfb_yxio_b16",2], + "6123707371654753818": ["convolution_gpu_yxfb_yxio_b16",2], + "8039645104667120991": ["convolution_gpu_yxfb_yxio_b16",2], + "3863816884636503247": ["convolution_gpu_bfyx_gemm_like",1], + "2295659951331099829": ["convolution_gpu_yxfb_yxio_b16",2], + "11585430081839020501": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11079061135559995449": ["convolution_gpu_yxfb_yxio_b16",2], + "15192022454507415969": ["convolution_gpu_yxfb_yxio_b16",2], + "2014114949154914483": ["convolution_gpu_yxfb_yxio_b16",1], + "18427056032084727710": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "9434143681116089888": ["convolution_gpu_bfyx_gemm_like",2], + "16096353398003405565": ["convolution_gpu_yxfb_yxio_b16",2], + "2789901295967374316": ["convolution_gpu_yxfb_yxio_b16",2], + "9439431829175743345": ["convolution_gpu_bfyx_gemm_like",1], + "11694428890484758107": ["convolution_gpu_yxfb_yxio_b16",2], + "8542782888102516498": ["convolution_gpu_yxfb_yxio_b16",2], + "946479876892100082": ["convolution_gpu_bfyx_gemm_like",1], + "13734043898517059207": ["convolution_gpu_bfyx_gemm_like",2], + "6664482192233202590": ["convolution_gpu_bfyx_gemm_like",2], + "6267138247577676996": ["convolution_gpu_yxfb_yxio_b16",2], + "13199672084171648305": ["convolution_gpu_yxfb_yxio_b16",2], + "10923480230259977438": ["convolution_gpu_bfyx_1x1",2], + "15181987458871339815": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "8710469645764612897": ["convolution_gpu_bfyx_os_iyx_osv16",371], + "10672380526821947133": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "10626018319543075871": 
["convolution_gpu_yxfb_yxio_b16",2], + "11086464266772450142": ["convolution_gpu_yxfb_yxio_b16",2], + "2085467192625870436": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "12287667143602938393": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "8155268141318893606": ["convolution_gpu_bfyx_gemm_like",1], + "2668729552208169959": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17763347648779573375": ["convolution_gpu_yxfb_yxio_b16",2], + "11356842300444410831": ["convolution_gpu_bfyx_os_iyx_osv16",659], + "12712071520541638451": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "18244966393978155130": ["convolution_gpu_yxfb_yxio_b16",2], + "3114210363452108737": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "14339479547451422762": ["convolution_gpu_yxfb_yxio_b16",2], + "16674633029045714564": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "17054207561525574617": ["convolution_gpu_yxfb_yxio_b16",2], + "900243696733233996": ["convolution_gpu_yxfb_yxio_b16",2], + "16084700435355748612": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "4298242568890525997": ["convolution_gpu_yxfb_yxio_b16",2], + "11823205954749139338": ["convolution_gpu_bfyx_gemm_like",2], + "15421280195211166867": ["convolution_gpu_yxfb_yxio_b16",2], + "13957350536347764705": ["convolution_gpu_bfyx_gemm_like",2], + "2803569867265035123": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "6340128090694375876": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "4999171487916568471": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "7748514992101811029": ["convolution_gpu_yxfb_yxio_b16",2], + "11892455357792445192": ["convolution_gpu_yxfb_yxio_b16",2], + "7762916621662364082": ["convolution_gpu_yxfb_yxio_b16",2], + "17462996923473002801": ["convolution_gpu_yxfb_yxio_b16",2], + "7201521533301617290": ["convolution_gpu_bfyx_gemm_like",1], + "6288489890578212082": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10645625090439446714": ["convolution_gpu_bfyx_gemm_like",2], + "4169042131399110713": ["convolution_gpu_yxfb_yxio_b16",2], + "14884315147107686805": ["convolution_gpu_bfyx_gemm_like",2], + "6307939332939714967": ["convolution_gpu_bfyx_1x1",2], + "17277917672233464304": ["convolution_gpu_yxfb_yxio_b16",2], + "2204178900998688268": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16789245987103323406": ["convolution_gpu_bfyx_gemm_like",2], + "15727611564408173858": ["convolution_gpu_bfyx_gemm_like",2], + "5159738930501638535": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "10179916356323479080": ["convolution_gpu_bfyx_gemm_like",2], + "15757351352532908153": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "14283458015244508428": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "486816652607164926": ["convolution_gpu_yxfb_yxio_b16",2], + "1310498917952637709": ["convolution_gpu_yxfb_yxio_b16",2], + "14553577436929219470": ["convolution_gpu_yxfb_yxio_b16",2], + "7233783054884565746": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "5124080536266387783": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6548949901446632697": ["convolution_gpu_bfyx_1x1",2], + "3534971503826416049": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "11800783548769329949": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "9814647153117279415": ["convolution_gpu_yxfb_yxio_b16",2], + "14122213471825630433": ["convolution_gpu_bfyx_gemm_like",1], + "7755177205197405275": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "15317946705199574301": ["convolution_gpu_yxfb_yxio_b16",2], + "8210092359850191682": ["convolution_gpu_yxfb_yxio_b16",2], + "1784892318069674949": 
["convolution_gpu_yxfb_yxio_b16",2], + "8398910340371320955": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5965451243366505522": ["convolution_gpu_bfyx_gemm_like",1], + "3160543867929843861": ["convolution_gpu_bfyx_1x1",2], + "14770895149190975433": ["convolution_gpu_yxfb_yxio_b16",2], + "6816632607384969096": ["convolution_gpu_yxfb_yxio_b16",2], + "2817919813339364130": ["convolution_gpu_bfyx_gemm_like",1], + "10128120599276549920": ["convolution_gpu_bfyx_1x1",2], + "6253009218981124949": ["convolution_gpu_yxfb_yxio_b16",2], + "5720964268093705079": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "1379758215293949563": ["convolution_gpu_yxfb_yxio_b16",2], + "15209909241815414156": ["convolution_gpu_bfyx_os_iyx_osv16",552], + "12686330321897091505": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11337525286386930242": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "9169935203300589222": ["convolution_gpu_yxfb_yxio_b16",2], + "10019470094545733255": ["convolution_gpu_bfyx_gemm_like",1], + "5507373575763339429": ["convolution_gpu_yxfb_yxio_b16",1], + "6910589963488897537": ["convolution_gpu_yxfb_yxio_b16",2], + "16567486018945740036": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4889188980319017094": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "10292349730148518173": ["convolution_gpu_bfyx_os_iyx_osv16",694], + "3034482898462686729": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "1161304401293419103": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11856815095538913065": ["convolution_gpu_yxfb_yxio_b16",2], + "7995820969034996638": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "1497127399271219422": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4445913285957791409": ["convolution_gpu_yxfb_yxio_b16",2], + "1643122514049603104": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "75120034961995929": ["convolution_gpu_yxfb_yxio_b16",2], + "123026136670202868": ["convolution_gpu_yxfb_yxio_b16",2], + "3444250649099578792": ["convolution_gpu_yxfb_yxio_b16",2], + "15863531785836309247": ["convolution_gpu_yxfb_yxio_b16",2], + "17015328096102652908": ["convolution_gpu_bfyx_gemm_like",1], + "16681690088928624738": ["convolution_gpu_bfyx_gemm_like",2], + "7979265448683159733": ["convolution_gpu_yxfb_yxio_b16",2], + "8792010676469476740": ["convolution_gpu_bfyx_gemm_like",2], + "8302886228681027388": ["convolution_gpu_yxfb_yxio_b16",2], + "6479042072492268780": ["convolution_gpu_yxfb_yxio_b16",2], + "15531306520021286502": ["convolution_gpu_bfyx_gemm_like",2], + "15065925414996398951": ["convolution_gpu_bfyx_1x1",2], + "15813044197987178947": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "12430677767405883160": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12177387334053203378": ["convolution_gpu_bfyx_gemm_like",2], + "742689192890486807": ["convolution_gpu_bfyx_gemm_like",2], + "787203599734115483": ["convolution_gpu_bfyx_1x1",0], + "12768933181342249823": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "14963614790718019676": ["convolution_gpu_yxfb_yxio_b16",2], + "12275528180752359999": ["convolution_gpu_yxfb_yxio_b16",2], + "10576856554114055028": ["convolution_gpu_bfyx_gemm_like",2], + "2226745622763268469": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "7603319690872333930": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "12867038076564517306": ["convolution_gpu_yxfb_yxio_b16",2], + "2728938624042183713": ["convolution_gpu_bfyx_gemm_like",2], + "14262482011051329729": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1798440805196304745": 
["convolution_gpu_yxfb_yxio_b16",2], + "17377315194963069204": ["fully_connected_gpu_fb_oi_ref",0], + "16513038896689318072": ["convolution_gpu_yxfb_yxio_b16",2], + "15464554714318666871": ["convolution_gpu_yxfb_yxio_b16",2], + "5751283221740229986": ["convolution_gpu_bfyx_gemm_like",2], + "4792351255949877935": ["convolution_gpu_bfyx_gemm_like",2], + "4242173940230902960": ["convolution_gpu_yxfb_yxio_b16",2], + "5525691792821548743": ["convolution_gpu_yxfb_yxio_b16",2], + "3988024997010367546": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "7972861956906521660": ["convolution_gpu_yxfb_yxio_b16",2], + "16728762255357411770": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "14670339865153970893": ["convolution_gpu_yxfb_yxio_b16",1], + "7861119251077361882": ["convolution_gpu_yxfb_yxio_b16",2], + "11421180829679625737": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "12004628115138530335": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "14526262781657292025": ["convolution_gpu_yxfb_yxio_b16",2], + "13051390418571971928": ["convolution_gpu_yxfb_yxio_b16",2], + "15487730714504758208": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "1711220333751274603": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "12393385058735194260": ["convolution_gpu_bfyx_gemm_like",2], + "9871407256481442790": ["convolution_gpu_yxfb_yxio_b16",2], + "6808843088626121909": ["convolution_gpu_bfyx_gemm_like",2], + "4251673416603443503": ["convolution_gpu_yxfb_yxio_b16",2], + "6250785177115691293": ["convolution_gpu_yxfb_yxio_b16",2], + "10532183096485321729": ["convolution_gpu_bfyx_1x1",2], + "6484375582324852109": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "563440246018637010": ["convolution_gpu_yxfb_yxio_b16",2], + "9909564412554801760": ["convolution_gpu_yxfb_yxio_b16",2], + "14085753024976995311": ["convolution_gpu_yxfb_yxio_b16",2], + "16182470664818268848": ["convolution_gpu_bfyx_gemm_like",1], + "5266313052389515491": ["convolution_gpu_yxfb_yxio_b16",2], + "3114869763557037270": ["fully_connected_gpu_fb_oi_ref",2], + "7792512829747836997": ["convolution_gpu_yxfb_yxio_b16",2], + "8505040075968411726": ["convolution_gpu_bfyx_gemm_like",1], + "12850195004093999773": ["convolution_gpu_yxfb_yxio_b16",2], + "11279789373735965856": ["convolution_gpu_yxfb_yxio_b16",2], + "16561618767117193109": ["convolution_gpu_bfyx_1x1",2], + "11451740938287179908": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "14025235562200209723": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "3571959174116404960": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "1427040855295681285": ["convolution_gpu_yxfb_yxio_b16",2], + "7614673554809134631": ["convolution_gpu_yxfb_yxio_b16",2], + "7628077869220463202": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "5245526691775741296": ["convolution_gpu_bfyx_gemm_like",1], + "15859493313686060349": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "15551338663759394064": ["convolution_gpu_yxfb_yxio_b16",2], + "7056293586529818253": ["convolution_gpu_bfyx_gemm_like",2], + "12871555773123368130": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "17876939980356283351": ["convolution_gpu_yxfb_yxio_b16",2], + "14968401410355925289": ["convolution_gpu_yxfb_yxio_b16",2], + "12407890437443790515": ["convolution_gpu_bfyx_gemm_like",0], + "15271783562528081169": ["convolution_gpu_bfyx_gemm_like",2], + "7861234698413147249": ["convolution_gpu_yxfb_yxio_b16",2], + "14034525799882831106": ["convolution_gpu_bfyx_gemm_like",2], + "13434576226708227155": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "14031009077471784948": 
["convolution_gpu_bfyx_os_iyx_osv16",1127], + "15967614281807823696": ["convolution_gpu_bfyx_gemm_like",2], + "11738360883999461965": ["convolution_gpu_yxfb_yxio_b16",2], + "1658174263018326745": ["convolution_gpu_yxfb_yxio_b16",2], + "697333686114567307": ["convolution_gpu_bfyx_gemm_like",2], + "13025361884606488732": ["convolution_gpu_bfyx_gemm_like",2], + "522313477023837056": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14273849038400888518": ["convolution_gpu_yxfb_yxio_b16",2], + "3101087806792514129": ["convolution_gpu_bfyx_1x1",2], + "17737878867906137388": ["convolution_gpu_yxfb_yxio_b16",1], + "14757749560543979231": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "15858356755924943957": ["convolution_gpu_yxfb_yxio_b16",2], + "5705056256080522960": ["convolution_gpu_yxfb_yxio_b16",2], + "636447309806530300": ["convolution_gpu_yxfb_yxio_b16",2], + "10309986238001994183": ["convolution_gpu_yxfb_yxio_b16",2], + "15693204620575485046": ["convolution_gpu_yxfb_yxio_b16",2], + "4995051972576749717": ["convolution_gpu_yxfb_yxio_b16",2], + "9412392168031560549": ["convolution_gpu_yxfb_yxio_b16",2], + "5649150695527000655": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "13082313288887957490": ["convolution_gpu_yxfb_yxio_b16",2], + "16610284927818475574": ["convolution_gpu_bfyx_gemm_like",2], + "17917978116807564183": ["convolution_gpu_bfyx_gemm_like",0], + "7106362077449435105": ["convolution_gpu_bfyx_gemm_like",0], + "11334122788337402526": ["convolution_gpu_bfyx_1x1",1], + "12065769091972094756": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "17580363505072477558": ["convolution_gpu_yxfb_yxio_b16",1], + "6934915634718835911": ["convolution_gpu_yxfb_yxio_b16",2], + "15594673952484539994": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "6055793483770886264": ["convolution_gpu_yxfb_yxio_b16",2], + "8519354640245415816": ["convolution_gpu_bfyx_gemm_like",1], + "108442764389420633": ["convolution_gpu_yxfb_yxio_b16",1], + "13187657215288939912": ["convolution_gpu_yxfb_yxio_b16",2], + "8619526128410675593": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2447893458816856522": ["convolution_gpu_bfyx_gemm_like",2], + "2644054989263429508": ["convolution_gpu_yxfb_yxio_b16",2], + "9521715904587435700": ["convolution_gpu_yxfb_yxio_b16",2], + "4936961129835214448": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16209868158768307271": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "14193777296032212476": ["convolution_gpu_yxfb_yxio_b16",2], + "6664432489777052771": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "12179968379663737450": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "16014822406751503249": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "17264608538692763688": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9314293064351558241": ["convolution_gpu_bfyx_gemm_like",2], + "4090512597925170883": ["convolution_gpu_yxfb_yxio_b16",2], + "2777318471329665162": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "8613740762403897614": ["convolution_gpu_yxfb_yxio_b16",2], + "14808895254077106198": ["convolution_gpu_bfyx_gemm_like",2], + "10465119306486335226": ["convolution_gpu_yxfb_yxio_b16",2], + "4116610956045302817": ["convolution_gpu_yxfb_yxio_b16",2], + "12960590161485806657": ["convolution_gpu_bfyx_gemm_like",2], + "6723804327185132790": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "10811224523636009881": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7552544688541855979": ["convolution_gpu_bfyx_gemm_like",2], + "8561261337239934159": ["convolution_gpu_bfyx_os_iyx_osv16",979], + 
"5184121466994451498": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "16307464696265537356": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "2819475920524949313": ["convolution_gpu_yxfb_yxio_b16",2], + "14501815053459103515": ["convolution_gpu_yxfb_yxio_b16",2], + "18161971781834208343": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "9518071423184197213": ["convolution_gpu_bfyx_gemm_like",0], + "17487594336237597163": ["convolution_gpu_yxfb_yxio_b16",2], + "10113696658040720628": ["convolution_gpu_yxfb_yxio_b16",2], + "6624079551747071383": ["convolution_gpu_yxfb_yxio_b16",2], + "3449007266907948591": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5656320098721954644": ["convolution_gpu_yxfb_yxio_b16",2], + "5109636469531439569": ["convolution_gpu_yxfb_yxio_b16",2], + "3658599312236344017": ["convolution_gpu_yxfb_yxio_b16",2], + "9305861997313663528": ["convolution_gpu_bfyx_gemm_like",1], + "16579057939215877904": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "16428789154716792138": ["convolution_gpu_yxfb_yxio_b16",2], + "1390379098099686972": ["convolution_gpu_bfyx_1x1",2], + "9065137335863605013": ["convolution_gpu_yxfb_yxio_b16",2], + "17019474731460049248": ["convolution_gpu_yxfb_yxio_b16",2], + "6808980404170272597": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "15457040168177954463": ["convolution_gpu_yxfb_yxio_b16",2], + "13398986810666238552": ["convolution_gpu_yxfb_yxio_b16",2], + "8976966933427522253": ["convolution_gpu_bfyx_gemm_like",0], + "187352687850707150": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11988546375476924356": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "3509487327001107638": ["convolution_gpu_bfyx_gemm_like",1], + "151851883170419907": ["convolution_gpu_yxfb_yxio_b16",2], + "3080612075440389053": ["convolution_gpu_yxfb_yxio_b16",2], + "13526488884846845330": ["convolution_gpu_bfyx_gemm_like",2], + "16617569629839911513": ["convolution_gpu_yxfb_yxio_b16",2], + "2527018855890902975": ["convolution_gpu_bfyx_gemm_like",1], + "12641170321047008726": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "9280279544075738476": ["convolution_gpu_yxfb_yxio_b16",1], + "7213383384662748578": ["convolution_gpu_yxfb_yxio_b16",2], + "568191462231494113": ["convolution_gpu_yxfb_yxio_b16",1], + "13182623473102074079": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "7683334381958571864": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "13835859040765465258": ["convolution_gpu_bfyx_gemm_like",0], + "9184275066167601343": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "12866217660635921034": ["convolution_gpu_bfyx_gemm_like",1], + "14184895905338394239": ["convolution_gpu_bfyx_gemm_like",2], + "13058026769607428653": ["convolution_gpu_yxfb_yxio_b16",1], + "7226002258982605405": ["convolution_gpu_yxfb_yxio_b16",1], + "12375919467924385618": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "17893696934478535385": ["convolution_gpu_yxfb_yxio_b16",2], + "9955816463820554626": ["convolution_gpu_yxfb_yxio_b16",2], + "9606639214735570069": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "6715523440337925186": ["convolution_gpu_yxfb_yxio_b16",2], + "5179013491581036103": ["convolution_gpu_yxfb_yxio_b16",2], + "12782932626966309185": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "14230493618724018658": ["convolution_gpu_bfyx_gemm_like",2], + "17491825380936802930": ["convolution_gpu_yxfb_yxio_b16",2], + "1117729599102132243": ["convolution_gpu_yxfb_yxio_b16",1], + "16210934187492210542": ["convolution_gpu_yxfb_yxio_b16",2], + "7274647463152753603": ["convolution_gpu_yxfb_yxio_b16",2], + "7900926714874404219": 
["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11010673493295430801": ["convolution_gpu_yxfb_yxio_b16",2], + "7364084475361144967": ["convolution_gpu_yxfb_yxio_b16",2], + "5751553671208192963": ["convolution_gpu_yxfb_yxio_b16",2], + "5269172622193124300": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "10982526068861394162": ["convolution_gpu_yxfb_yxio_b16",2], + "14359026450472189405": ["convolution_gpu_yxfb_yxio_b16",1], + "3107655421406621915": ["convolution_gpu_yxfb_yxio_b16",2], + "12004552919019936392": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "12311849904266608701": ["convolution_gpu_yxfb_yxio_b16",2], + "11280672272221124024": ["convolution_gpu_yxfb_yxio_b16",2], + "9541630719145326121": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "3383222668132648804": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "18008552719153887303": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "5106072383853469966": ["convolution_gpu_yxfb_yxio_b16",2], + "16563030700888982979": ["convolution_gpu_yxfb_yxio_b16",2], + "17538518333907257868": ["convolution_gpu_bfyx_gemm_like",2], + "13902214851539825156": ["convolution_gpu_bfyx_gemm_like",0], + "1594829714229111215": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "14461365896122393071": ["convolution_gpu_yxfb_yxio_b16",2], + "14445031303145992349": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "16912738776771289379": ["convolution_gpu_yxfb_yxio_b16",2], + "860852602930021016": ["convolution_gpu_yxfb_yxio_b16",2], + "15800447082078291243": ["convolution_gpu_bfyx_os_iyx_osv16",665], + "8611873585228858719": ["convolution_gpu_yxfb_yxio_b16",2], + "3366647240745174769": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "14331658870024759698": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "2575631797904040925": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "12989677691575632174": ["convolution_gpu_yxfb_yxio_b16",2], + "3117673619907511009": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "959666756751640874": ["convolution_gpu_yxfb_yxio_b16",2], + "731825454731954517": ["convolution_gpu_bfyx_gemm_like",2], + "9736684300833719045": ["convolution_gpu_yxfb_yxio_b16",2], + "7425369489110576363": ["convolution_gpu_yxfb_yxio_b16",2], + "5926747396493954633": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12850044341631872743": ["convolution_gpu_yxfb_yxio_b16",2], + "10864011008000364415": ["convolution_gpu_bfyx_1x1",2], + "2242602888499888844": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "17036482252028102703": ["convolution_gpu_bfyx_os_iyx_osv16",51], + "15586047342916704364": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "9366100787108468082": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6666210546769702280": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "2753393184265405425": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "60509335250891515": ["convolution_gpu_bfyx_gemm_like",2], + "15669242195570440840": ["convolution_gpu_yxfb_yxio_b16",2], + "4121535611334103359": ["convolution_gpu_yxfb_yxio_b16",2], + "2450251936650841836": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "14963449045970262346": ["convolution_gpu_yxfb_yxio_b16",2], + "5464801565268066541": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7075659071934895087": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "15138641310139776109": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "13155570698198686211": ["convolution_gpu_yxfb_yxio_b16",2], + "13590444711975157776": ["convolution_gpu_bfyx_gemm_like",1], + "264466528528245004": ["convolution_gpu_yxfb_yxio_b16",2], + "537074122417021898": 
["convolution_gpu_bfyx_gemm_like",2], + "13637537549252005181": ["convolution_gpu_yxfb_yxio_b16",2], + "8117638644045799192": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "12829916847670789556": ["convolution_gpu_yxfb_yxio_b16",2], + "11299021927882809469": ["convolution_gpu_yxfb_yxio_b16",2], + "11096750581455917678": ["convolution_gpu_yxfb_yxio_b16",2], + "14483314305369207554": ["convolution_gpu_bfyx_1x1",2], + "15949311219856917559": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "16129682385980878760": ["convolution_gpu_yxfb_yxio_b16",2], + "592245952014430043": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "2147896649835170790": ["convolution_gpu_yxfb_yxio_b16",2], + "17480277135590489472": ["convolution_gpu_yxfb_yxio_b16",2], + "15223779293313750042": ["convolution_gpu_yxfb_yxio_b16",2], + "13264617841270329349": ["convolution_gpu_bfyx_1x1",2], + "2060161076370553192": ["convolution_gpu_yxfb_yxio_b16",2], + "7953255701516490034": ["convolution_gpu_bfyx_os_iyx_osv16",774], + "6846760451124717672": ["convolution_gpu_yxfb_yxio_b16",2], + "3872151366780051246": ["convolution_gpu_bfyx_gemm_like",1], + "6556424924189200804": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "6713554643048248003": ["convolution_gpu_yxfb_yxio_b16",2], + "12984970933638742657": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "5898740235388207878": ["convolution_gpu_bfyx_1x1",2], + "3797986765970777456": ["convolution_gpu_yxfb_yxio_b16",2], + "17342758321852264926": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "8431759922045602848": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "2581014920570427861": ["convolution_gpu_yxfb_yxio_b16",2], + "10635659193402005820": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "6677367803113594603": ["convolution_gpu_yxfb_yxio_b16",2], + "11939914680143672459": ["fully_connected_gpu_fb_oi_ref",1], + "14885031472057965707": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12165079289914715018": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "1330337530094825121": ["convolution_gpu_yxfb_yxio_b16",2], + "9906138392975645747": ["convolution_gpu_yxfb_yxio_b16",2], + "16955829428734830876": ["convolution_gpu_yxfb_yxio_b16",2], + "15974208269240775349": ["convolution_gpu_yxfb_yxio_b16",2], + "1235864574444794315": ["convolution_gpu_yxfb_yxio_b16",2], + "8921636651939679647": ["convolution_gpu_bfyx_1x1",1], + "2706024586717944825": ["convolution_gpu_yxfb_yxio_b16",2], + "6911215749850066204": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "13598984763955239116": ["convolution_gpu_bfyx_os_iyx_osv16",1098], + "6880746917399866285": ["convolution_gpu_bfyx_gemm_like",2], + "3622778166646258015": ["convolution_gpu_yxfb_yxio_b16",2], + "15998609626878578708": ["convolution_gpu_yxfb_yxio_b16",1], + "10117092543913369513": ["convolution_gpu_yxfb_yxio_b16",2], + "15187035463799513424": ["convolution_gpu_bfyx_1x1",2], + "4133424990380177132": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "5393510569127725391": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "9641089659148164809": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "12071914115316550349": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "4727628999533330347": ["convolution_gpu_yxfb_yxio_b16",2], + "6128157319666849074": ["convolution_gpu_yxfb_yxio_b16",2], + "848735117501914374": ["convolution_gpu_yxfb_yxio_b16",2], + "816527348871309530": ["convolution_gpu_yxfb_yxio_b16",2], + "6800893510381991731": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "6210866413385292851": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "12624762527234542946": 
["convolution_gpu_yxfb_yxio_b16",2], + "518733575377143679": ["convolution_gpu_yxfb_yxio_b16",2], + "1270307036687208396": ["convolution_gpu_bfyx_gemm_like",1], + "5541365322085427177": ["convolution_gpu_yxfb_yxio_b16",1], + "4633763257197651352": ["convolution_gpu_yxfb_yxio_b16",2], + "501138469231848694": ["convolution_gpu_yxfb_yxio_b16",2], + "3567607339495161307": ["convolution_gpu_yxfb_yxio_b16",2], + "8794896449397768269": ["convolution_gpu_bfyx_gemm_like",2], + "12531880391016521628": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11682041005124075890": ["convolution_gpu_yxfb_yxio_b16",2], + "9778670810863940690": ["convolution_gpu_yxfb_yxio_b16",2], + "9116620473576064051": ["convolution_gpu_yxfb_yxio_b16",2], + "1734769856106746136": ["convolution_gpu_yxfb_yxio_b16",2], + "938848188161536107": ["convolution_gpu_bfyx_1x1",0], + "7104309382120208659": ["convolution_gpu_bfyx_gemm_like",2], + "8409488188696700816": ["convolution_gpu_bfyx_gemm_like",2], + "14304497513584420080": ["convolution_gpu_yxfb_yxio_b16",2], + "15617599138946168772": ["convolution_gpu_yxfb_yxio_b16",2], + "13962189339706230770": ["convolution_gpu_yxfb_yxio_b16",2], + "16404362308829952450": ["convolution_gpu_yxfb_yxio_b16",2], + "16128152634974034731": ["convolution_gpu_yxfb_yxio_b16",2], + "1982176363226079588": ["convolution_gpu_bfyx_gemm_like",2], + "361497145093734608": ["convolution_gpu_bfyx_gemm_like",2], + "1596353239542510685": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "11693134363909241514": ["convolution_gpu_yxfb_yxio_b16",2], + "14819324687394700033": ["convolution_gpu_bfyx_1x1",2], + "13145474177271090694": ["convolution_gpu_bfyx_gemm_like",2], + "15489746763312425915": ["convolution_gpu_bfyx_gemm_like",2], + "3285968426413869315": ["convolution_gpu_yxfb_yxio_b16",2], + "16522364268583242080": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "9488453013746383896": ["convolution_gpu_bfyx_gemm_like",0], + "2527276292172180386": ["convolution_gpu_bfyx_gemm_like",2], + "677249604491773387": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4291531885506213180": ["convolution_gpu_yxfb_yxio_b16",2], + "2048528188026477374": ["convolution_gpu_yxfb_yxio_b16",2], + "14151747022287993729": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4240975186599864955": ["convolution_gpu_yxfb_yxio_b16",2], + "5538883245745495145": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "13530377297525480029": ["convolution_gpu_yxfb_yxio_b16",2], + "11988285441493553006": ["convolution_gpu_bfyx_gemm_like",2], + "4683320313995550908": ["convolution_gpu_yxfb_yxio_b16",2], + "4165926748138587705": ["convolution_gpu_yxfb_yxio_b16",2], + "16851716501872033211": ["fully_connected_gpu_fb_io_block_fp16",2], + "11148428797294511280": ["convolution_gpu_yxfb_yxio_b16",1], + "5050273611519516510": ["convolution_gpu_bfyx_gemm_like",2], + "13668940862847596363": ["convolution_gpu_yxfb_yxio_b16",1], + "11563334365673075610": ["convolution_gpu_yxfb_yxio_b16",1], + "13040213971461407125": ["convolution_gpu_yxfb_yxio_b16",2], + "7800015766976654402": ["convolution_gpu_bfyx_gemm_like",0], + "10544411879329675593": ["convolution_gpu_bfyx_os_iyx_osv16",765], + "8941904405273405481": ["fully_connected_gpu_fb_io_b8_f8_vload",0], + "14349625788399542568": ["convolution_gpu_bfyx_gemm_like",1], + "13507437548205340054": ["convolution_gpu_yxfb_yxio_b16",2], + "9383182168277796969": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "2609454334520044465": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "3244675355773468991": ["convolution_gpu_bfyx_os_iyx_osv16",974], + 
"14616413139039308367": ["fully_connected_gpu_fb_oi_ref",1], + "13854845390344305906": ["convolution_gpu_yxfb_yxio_b16",2], + "8956566633622104099": ["convolution_gpu_yxfb_yxio_b16",1], + "10404725818204494388": ["convolution_gpu_bfyx_gemm_like",2], + "6232452664016831516": ["convolution_gpu_yxfb_yxio_b16",2], + "4398371999113956082": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "1122856374602590533": ["convolution_gpu_bfyx_1x1",2], + "5924698731432597368": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "17616719165728687438": ["convolution_gpu_yxfb_yxio_b16",2], + "5498839261395459224": ["convolution_gpu_bfyx_gemm_like",1], + "9372916528346260712": ["convolution_gpu_bfyx_gemm_like",0], + "7113777272518482528": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "7375461241315602473": ["convolution_gpu_bfyx_gemm_like",2], + "16516262096533373158": ["convolution_gpu_yxfb_yxio_b16",2], + "2133236128630074068": ["convolution_gpu_yxfb_yxio_b16",2], + "4278280309700908015": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "12476381811279163147": ["convolution_gpu_yxfb_yxio_b16",2], + "4883588237027084166": ["convolution_gpu_yxfb_yxio_b16",2], + "10626341369865893888": ["convolution_gpu_bfyx_gemm_like",2], + "12816344078518706065": ["convolution_gpu_yxfb_yxio_b16",2], + "16910952799476896905": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1354647381212852890": ["convolution_gpu_bfyx_1x1",2], + "8567667881970262923": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "8655739705298627602": ["convolution_gpu_bfyx_gemm_like",2], + "18180655791734632264": ["convolution_gpu_bfyx_gemm_like",2], + "17399728556634171321": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3738514326459749974": ["convolution_gpu_yxfb_yxio_b16",2], + "7454366978268164047": ["convolution_gpu_bfyx_gemm_like",2], + "12118387933632797428": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6550549654706796887": ["convolution_gpu_yxfb_yxio_b16",2], + "3224352307778512793": ["convolution_gpu_bfyx_gemm_like",1], + "15449715596597016714": ["convolution_gpu_bfyx_gemm_like",1], + "11775265110573621330": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "13723434004563378589": ["convolution_gpu_yxfb_yxio_b16",2], + "4021558014531645922": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "1018687388655376483": ["convolution_gpu_bfyx_gemm_like",1], + "17310409067211414565": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "17987739992848266169": ["convolution_gpu_yxfb_yxio_b16",2], + "6792281830591233968": ["convolution_gpu_yxfb_yxio_b16",2], + "13585916416233680276": ["convolution_gpu_yxfb_yxio_b16",2], + "5942742563827424666": ["convolution_gpu_yxfb_yxio_b16",2], + "12068974703657294908": ["convolution_gpu_bfyx_1x1",2], + "13486084204140096478": ["convolution_gpu_bfyx_gemm_like",1], + "6870942166356599956": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "10308113903347312964": ["convolution_gpu_bfyx_gemm_like",2], + "10717031088082350652": ["convolution_gpu_yxfb_yxio_b16",2], + "8260024340787818709": ["convolution_gpu_yxfb_yxio_b16",2], + "12134858519320245809": ["convolution_gpu_bfyx_1x1",2], + "8494725779002762049": ["convolution_gpu_bfyx_gemm_like",0], + "8177017967170389275": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15483343060578660278": ["convolution_gpu_yxfb_yxio_b16",2], + "10174346112533671798": ["convolution_gpu_yxfb_yxio_b16",2], + "11727227430687227444": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4885504197789468842": ["convolution_gpu_yxfb_yxio_b16",2], + "5020763861388859254": ["convolution_gpu_bfyx_gemm_like",1], + "12026482841341343242": 
["convolution_gpu_bfyx_os_iyx_osv16",335], + "12384317536636082264": ["convolution_gpu_bfyx_os_iyx_osv16",49], + "14054116974002669018": ["convolution_gpu_bfyx_1x1",1], + "2625969259447793593": ["convolution_gpu_bfyx_1x1",2], + "3835387982926010630": ["convolution_gpu_yxfb_yxio_b16",2], + "7369109502608631066": ["convolution_gpu_yxfb_yxio_b16",2], + "10979362792894404338": ["convolution_gpu_bfyx_gemm_like",2], + "7824075236081312706": ["convolution_gpu_yxfb_yxio_b16",2], + "17638753020411096694": ["convolution_gpu_yxfb_yxio_b16",2], + "2759142157812694203": ["convolution_gpu_yxfb_yxio_b16",2], + "10071449674652717890": ["convolution_gpu_bfyx_gemm_like",2], + "5649082203775427830": ["convolution_gpu_bfyx_gemm_like",2], + "14215394208930955062": ["convolution_gpu_yxfb_yxio_b16",2], + "16161112020028389294": ["convolution_gpu_yxfb_yxio_b16",2], + "14091610802555875119": ["convolution_gpu_bfyx_gemm_like",2], + "17830290099875088207": ["convolution_gpu_bfyx_gemm_like",1], + "7818381040882768404": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "14469011068777098822": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "8069537351442302814": ["convolution_gpu_bfyx_os_iyx_osv16",266], + "10577357333308653027": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "597073780328219388": ["convolution_gpu_bfyx_gemm_like",2], + "15879172437519876393": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "9207799012657103903": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "2149299205144202701": ["convolution_gpu_yxfb_yxio_b16",2], + "6942606834115081953": ["convolution_gpu_yxfb_yxio_b16",2], + "5421397731090158382": ["convolution_gpu_yxfb_yxio_b16",2], + "10309586646776223605": ["convolution_gpu_yxfb_yxio_b16",2], + "3820661057776133570": ["convolution_gpu_bfyx_1x1",2], + "16290551573997593168": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "11051684565403294370": ["convolution_gpu_yxfb_yxio_b16",2], + "11208625628954179200": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "5516518048239364231": ["convolution_gpu_bfyx_os_iyx_osv16",854], + "16950925976172895196": ["convolution_gpu_yxfb_yxio_b16",2], + "16995444341569389342": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "10588059104387338398": ["convolution_gpu_bfyx_os_iyx_osv16",57], + "4046830923427667342": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "8497468192424557348": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2199167704280374654": ["convolution_gpu_yxfb_yxio_b16",2], + "9999543693712389402": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10572945270796129630": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "7051238664181857633": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "9436893310034662243": ["convolution_gpu_bfyx_gemm_like",1], + "12995903177757437362": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "13767500791267563349": ["convolution_gpu_yxfb_yxio_b16",2], + "15310474203328198827": ["convolution_gpu_yxfb_yxio_b16",2], + "11872943152839631823": ["convolution_gpu_bfyx_os_iyx_osv16",270], + "15048584393463312977": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "11070968498963106073": ["fully_connected_gpu_fb_io_block_fp16",0], + "7168028033666253263": ["convolution_gpu_bfyx_gemm_like",2], + "7412772553395852003": ["convolution_gpu_yxfb_yxio_b16",2], + "8021915447462898777": ["convolution_gpu_bfyx_gemm_like",0], + "10833423331830484028": ["convolution_gpu_yxfb_yxio_b16",2], + "8717456809499914445": ["convolution_gpu_yxfb_yxio_b16",1], + "4897448054295474302": ["convolution_gpu_bfyx_gemm_like",2], + "18384215264061386089": 
["convolution_gpu_bfyx_os_iyx_osv16",1015], + "10771803503544737080": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "17364712285968437405": ["convolution_gpu_bfyx_os_iyx_osv16",271], + "5721096633060535553": ["convolution_gpu_yxfb_yxio_b16",1], + "1458615259705605525": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "16027456210394993913": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "2882493407831196579": ["fully_connected_gpu_fb_io_block_fp16",0], + "2149582237161177965": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "12480527132372884168": ["convolution_gpu_bfyx_1x1",0], + "14117801387057507639": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "2583562092192709891": ["convolution_gpu_yxfb_yxio_b16",2], + "4848143712599565301": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "4126895998426674411": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "4571404165794634411": ["convolution_gpu_bfyx_1x1",2], + "17713666626443142908": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6467251764899975676": ["convolution_gpu_bfyx_direct_10_12_16",2], + "230697511447695268": ["convolution_gpu_yxfb_yxio_b16",2], + "6959692641873234850": ["convolution_gpu_yxfb_yxio_b16",2], + "6469003096932778978": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "5339985303398206057": ["convolution_gpu_bfyx_os_iyx_osv16",179], + "10803929517111130153": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "7084646429975006971": ["convolution_gpu_bfyx_1x1",2], + "4914435717288687793": ["convolution_gpu_bfyx_1x1",2], + "6203765709597125063": ["convolution_gpu_bfyx_gemm_like",2], + "11361202190524990711": ["convolution_gpu_bfyx_os_iyx_osv16",882], + "7104266560248570112": ["convolution_gpu_yxfb_yxio_b16",2], + "9504349455215835807": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "1454014148777456006": ["convolution_gpu_yxfb_yxio_b16",2], + "14689812157592240007": ["convolution_gpu_yxfb_yxio_b16",2], + "5008541841892687897": ["convolution_gpu_yxfb_yxio_b16",2], + "6391201577234440562": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "3780320160034246719": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "13731797251725972855": ["convolution_gpu_yxfb_yxio_b16",2], + "7107677063657303327": ["convolution_gpu_bfyx_1x1",2], + "5632958791318880428": ["convolution_gpu_yxfb_yxio_b16",2], + "8439950151963452285": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "4387041763614917736": ["convolution_gpu_bfyx_gemm_like",1], + "149810021216592597": ["convolution_gpu_yxfb_yxio_b16",1], + "10865695385270390803": ["convolution_gpu_bfyx_os_iyx_osv16",266], + "3372770576629463160": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "18059267466971880386": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "8506271633579173639": ["convolution_gpu_yxfb_yxio_b16",2], + "12762301414049772746": ["convolution_gpu_yxfb_yxio_b16",2], + "15418732002117930760": ["convolution_gpu_yxfb_yxio_b16",2], + "603883331897298932": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1338581414403268264": ["convolution_gpu_yxfb_yxio_b16",2], + "2603233376890892194": ["convolution_gpu_yxfb_yxio_b16",2], + "12656228464579497510": ["convolution_gpu_yxfb_yxio_b16",2], + "3986429358782189117": ["convolution_gpu_yxfb_yxio_b16",2], + "3499106702307464480": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "7223737889890738294": ["convolution_gpu_yxfb_yxio_b16",2], + "6388086351909447495": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11291868421122092629": ["convolution_gpu_yxfb_yxio_b16",2], + "13485300684443803732": ["convolution_gpu_bfyx_os_iyx_osv16",985], + "6992073477131490452": 
["convolution_gpu_bfyx_os_iyx_osv16",639], + "18184621367843960190": ["convolution_gpu_bfyx_gemm_like",2], + "6613116267521819997": ["convolution_gpu_yxfb_yxio_b16",1], + "10525462454857911293": ["convolution_gpu_yxfb_yxio_b16",2], + "15235409162483701027": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "13326339730522937517": ["convolution_gpu_yxfb_yxio_b16",2], + "10090036431487700311": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "345043289576587800": ["convolution_gpu_bfyx_1x1",2], + "10406201782146034797": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "11070446574652704629": ["convolution_gpu_yxfb_yxio_b16",2], + "6018481198468872040": ["convolution_gpu_yxfb_yxio_b16",2], + "16247399911710810038": ["convolution_gpu_bfyx_gemm_like",1], + "13368203360773949292": ["convolution_gpu_yxfb_yxio_b16",2], + "4455369117448405874": ["convolution_gpu_bfyx_1x1",2], + "14236681916032484600": ["convolution_gpu_yxfb_yxio_b16",2], + "4678607855896512523": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3350601287664242323": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "14147460733160099960": ["convolution_gpu_bfyx_gemm_like",1], + "3965327578193694832": ["convolution_gpu_yxfb_yxio_b16",2], + "5429130923188159806": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "16616945998593626851": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16626502801066228405": ["convolution_gpu_yxfb_yxio_b16",2], + "12625112690264223217": ["convolution_gpu_bfyx_gemm_like",2], + "10722782762733112118": ["convolution_gpu_bfyx_1x1",2], + "14930789530046665855": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "1186545671730357033": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "1875764913306932583": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "2554991397391195611": ["convolution_gpu_bfyx_gemm_like",2], + "2282123636764935353": ["convolution_gpu_yxfb_yxio_b16",2], + "6371463287631658789": ["convolution_gpu_bfyx_gemm_like",2], + "12193395770362986433": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "2737064424879246276": ["convolution_gpu_bfyx_gemm_like",2], + "5012013738970489338": ["convolution_gpu_bfyx_1x1",2], + "15710826363434377015": ["convolution_gpu_yxfb_yxio_b16",2], + "16889886654893884746": ["convolution_gpu_bfyx_1x1",2], + "4329042569031331949": ["convolution_gpu_yxfb_yxio_b16",2], + "5420215220876162902": ["convolution_gpu_yxfb_yxio_b16",1], + "824911124897042617": ["convolution_gpu_yxfb_yxio_b16",2], + "4208702365182336507": ["convolution_gpu_yxfb_yxio_b16",2], + "13092232276822302626": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "5963901433137582265": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16527840366172690992": ["convolution_gpu_yxfb_yxio_b16",2], + "17147293671640396193": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "415232223198122046": ["convolution_gpu_yxfb_yxio_b16",2], + "768720470104458759": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "1984152634309440563": ["convolution_gpu_bfyx_gemm_like",2], + "5546447512898130524": ["convolution_gpu_yxfb_yxio_b16",2], + "14671212883301405408": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6964383468476265892": ["convolution_gpu_bfyx_1x1",2], + "15643135666029727865": ["convolution_gpu_bfyx_gemm_like",2], + "7100056605355325582": ["convolution_gpu_yxfb_yxio_b16",2], + "8611710048909301596": ["convolution_gpu_yxfb_yxio_b16",2], + "16895523130717954500": ["convolution_gpu_yxfb_yxio_b16",2], + "70580716590540876": ["convolution_gpu_bfyx_gemm_like",1], + "9222744127882324405": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "5250257911846706612": 
["convolution_gpu_yxfb_yxio_b16",2], + "9096495972770198040": ["convolution_gpu_yxfb_yxio_b16",2], + "16936968151775497887": ["convolution_gpu_bfyx_gemm_like",2], + "11666226259183201584": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15980348884716629349": ["convolution_gpu_bfyx_gemm_like",1], + "14082448162400225052": ["convolution_gpu_bfyx_1x1",2], + "14385148066232093878": ["convolution_gpu_yxfb_yxio_b16",2], + "9714508918051740792": ["convolution_gpu_bfyx_gemm_like",1], + "7964396197946740183": ["convolution_gpu_bfyx_os_iyx_osv16",904], + "8655315308767111198": ["convolution_gpu_bfyx_1x1",2], + "4732226322522411018": ["fully_connected_gpu_fb_io_block_fp16",0], + "4723643671527109645": ["convolution_gpu_yxfb_yxio_b16",1], + "4615766471724791034": ["convolution_gpu_yxfb_yxio_b16",2], + "1398177377739338750": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "10034575179959785704": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "1584906448442153128": ["convolution_gpu_yxfb_yxio_b16",2], + "17264010982688979937": ["convolution_gpu_bfyx_1x1",2], + "10514865654990433040": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "11411413051626428349": ["convolution_gpu_yxfb_yxio_b16",2], + "2412846055735335136": ["convolution_gpu_bfyx_os_iyx_osv16",806], + "14686272582436109012": ["convolution_gpu_yxfb_yxio_b16",2], + "1973819632224480598": ["convolution_gpu_yxfb_yxio_b16",2], + "12011606174372081253": ["convolution_gpu_yxfb_yxio_b16",2], + "12287827551127082597": ["convolution_gpu_yxfb_yxio_b16",2], + "10308431308942416781": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "1775515808301276388": ["convolution_gpu_yxfb_yxio_b16",2], + "2356785927637873692": ["convolution_gpu_bfyx_gemm_like",2], + "3976736548270395981": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "331661172067077796": ["convolution_gpu_bfyx_1x1",2], + "8809438390805488749": ["convolution_gpu_yxfb_yxio_b16",2], + "7678226048807568024": ["convolution_gpu_yxfb_yxio_b16",2], + "17430994325635361377": ["convolution_gpu_yxfb_yxio_b16",2], + "15641322340289892344": ["convolution_gpu_yxfb_yxio_b16",2], + "3225866261943242708": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2102169562353089558": ["convolution_gpu_yxfb_yxio_b16",2], + "6997971129340865650": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "726985753660756762": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "6764038061921866053": ["convolution_gpu_yxfb_yxio_b16",2], + "13809330759308309353": ["convolution_gpu_bfyx_gemm_like",2], + "13357365044448426880": ["convolution_gpu_bfyx_1x1",2], + "7692849839965441330": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "16252420150239789472": ["convolution_gpu_yxfb_yxio_b16",2], + "14558572801374416278": ["convolution_gpu_bfyx_gemm_like",2], + "9695024256541464964": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "8645965165922150743": ["convolution_gpu_yxfb_yxio_b16",2], + "12198263593657033426": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "3904383357046705799": ["convolution_gpu_yxfb_yxio_b16",2], + "11313025178951972247": ["convolution_gpu_bfyx_gemm_like",2], + "2095245727814188300": ["convolution_gpu_bfyx_gemm_like",2], + "11979032916453246611": ["convolution_gpu_yxfb_yxio_b16",2], + "4805194563120934409": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18094592431313771787": ["convolution_gpu_yxfb_yxio_b16",2], + "8010456208258134834": ["convolution_gpu_yxfb_yxio_b16",2], + "12093737479877309006": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "11565861421381730304": ["convolution_gpu_bfyx_os_iyx_osv16",1048], + "6843617687528352801": 
["convolution_gpu_bfyx_os_iyx_osv16",704], + "3950738240651133849": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "818998169319147148": ["convolution_gpu_bfyx_gemm_like",1], + "4664983769199548480": ["convolution_gpu_bfyx_1x1",2], + "16016396784190934729": ["convolution_gpu_yxfb_yxio_b16",2], + "16723478941106779069": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "16921939234324970069": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "3833510944499257797": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "4113061482402915179": ["convolution_gpu_yxfb_yxio_b16",2], + "10811837819834149164": ["convolution_gpu_bfyx_gemm_like",1], + "17021925795809437171": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5445584581720919223": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "11516184047320372729": ["convolution_gpu_yxfb_yxio_b16",2], + "17559750858236255044": ["convolution_gpu_yxfb_yxio_b16",2], + "4171374172427814762": ["convolution_gpu_yxfb_yxio_b16",2], + "9794456440994218671": ["convolution_gpu_bfyx_os_iyx_osv16",264], + "85050336704401597": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "1563987925712579649": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8372855367097191197": ["convolution_gpu_yxfb_yxio_b16",2], + "8866716292621164810": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "9272405129875537865": ["convolution_gpu_yxfb_yxio_b16",2], + "3216877571075556066": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "1045854873741563331": ["convolution_gpu_bfyx_gemm_like",2], + "10512507780534402341": ["convolution_gpu_bfyx_os_iyx_osv16",271], + "1591199515536783245": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "2613462626256090659": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "14281801257982447624": ["convolution_gpu_yxfb_yxio_b16",2], + "16247799703932868151": ["convolution_gpu_yxfb_yxio_b16",2], + "7287107719392705356": ["convolution_gpu_bfyx_os_iyx_osv16",756], + "9492026326463873766": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "7830644361525332797": ["convolution_gpu_yxfb_yxio_b16",2], + "12536364199388193516": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "5074273865983613482": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "12058759356433220258": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "8015885733173521367": ["convolution_gpu_yxfb_yxio_b16",2], + "10262850086265676378": ["convolution_gpu_yxfb_yxio_b16",2], + "16644952765107909604": ["convolution_gpu_yxfb_yxio_b16",2], + "8219179055259247644": ["convolution_gpu_yxfb_yxio_b16",2], + "2844794465598309010": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12831123539633580270": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4903592553439092472": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "3024402899381804809": ["convolution_gpu_bfyx_1x1",2], + "17585206779958265260": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "8726274320876550785": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "182115051096556835": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "579781312141502576": ["convolution_gpu_bfyx_1x1",2], + "10613621801998459768": ["convolution_gpu_yxfb_yxio_b16",2], + "560996739186313493": ["convolution_gpu_yxfb_yxio_b16",2], + "47872288115972996": ["convolution_gpu_yxfb_yxio_b16",2], + "712165731154577189": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "15281554100135159550": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "11583985978586657985": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "9193880745263317167": ["convolution_gpu_bfyx_gemm_like",2], + "11239541755868028928": ["convolution_gpu_bfyx_os_iyx_osv16",557], + 
"3793265335909270748": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11002875874008272679": ["convolution_gpu_bfyx_os_iyx_osv16",427], + "17201365233492366678": ["convolution_gpu_bfyx_gemm_like",2], + "3337625924046561031": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "9870432551513415176": ["convolution_gpu_yxfb_yxio_b16",2], + "13970935346154374605": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "967141158966448909": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5385316497510064491": ["fully_connected_gpu_fb_oi_ref",0], + "14612206111651511130": ["convolution_gpu_yxfb_yxio_b16",2], + "13009612703754510124": ["convolution_gpu_yxfb_yxio_b16",2], + "2884499360870038648": ["convolution_gpu_yxfb_yxio_b16",2], + "16966477504105790279": ["convolution_gpu_yxfb_yxio_b16",2], + "3039528482572243879": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "4129722446574108695": ["convolution_gpu_bfyx_1x1",2], + "13218298785325404589": ["convolution_gpu_yxfb_yxio_b16",2], + "14114380593731243715": ["convolution_gpu_bfyx_os_iyx_osv16",1038], + "7615563770941714046": ["convolution_gpu_yxfb_yxio_b16",2], + "15322609677356616580": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "18029396837690671545": ["convolution_gpu_yxfb_yxio_b16",2], + "3889456478817717702": ["convolution_gpu_yxfb_yxio_b16",2], + "7241156141838776126": ["convolution_gpu_bfyx_gemm_like",2], + "10437367877444543776": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "17961793197503317952": ["convolution_gpu_yxfb_yxio_b16",2], + "9418041909134721047": ["convolution_gpu_bfyx_gemm_like",2], + "12439827609628473238": ["convolution_gpu_yxfb_yxio_b16",2], + "4346591404756288097": ["convolution_gpu_bfyx_gemm_like",2], + "17969195175890497912": ["convolution_gpu_yxfb_yxio_b16",2], + "4683575221310726091": ["convolution_gpu_yxfb_yxio_b16",2], + "15555083739490354527": ["convolution_gpu_bfyx_gemm_like",2], + "14098811155652990436": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "18426893729833771809": ["convolution_gpu_bfyx_1x1",2], + "1617135706549276688": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "17252589865292797082": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "10232429887105708502": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "11560634267092054110": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "10016815108730511683": ["convolution_gpu_bfyx_gemm_like",2], + "17734480671864478402": ["convolution_gpu_yxfb_yxio_b16",2], + "18041177945345031826": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "9502195532658935521": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16729849855476690294": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "16003914811215141863": ["convolution_gpu_yxfb_yxio_b16",2], + "7378840969627751667": ["convolution_gpu_yxfb_yxio_b16",2], + "7706714181281908433": ["convolution_gpu_bfyx_gemm_like",2], + "8159303545761286685": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "11411580529501121244": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "18152894191323920027": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "7693459946348737411": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "11880337915508207160": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2314579504260247470": ["convolution_gpu_yxfb_yxio_b16",2], + "12669783714916998842": ["convolution_gpu_yxfb_yxio_b16",2], + "9748307611165615848": ["convolution_gpu_bfyx_gemm_like",2], + "15193403354218116460": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15979956159651515122": ["convolution_gpu_bfyx_gemm_like",2], + "15774073623451382326": 
["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13621771094745539509": ["convolution_gpu_yxfb_yxio_b16",2], + "576164857039495839": ["convolution_gpu_yxfb_yxio_b16",0], + "757225477250808939": ["convolution_gpu_yxfb_yxio_b16",2], + "10271261715175176019": ["convolution_gpu_yxfb_yxio_b16",2], + "17711453305763476458": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "16974981142389546385": ["convolution_gpu_yxfb_yxio_b16",2], + "10991423760161409883": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "1208665743495618456": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17618727959983224888": ["convolution_gpu_yxfb_yxio_b16",2], + "2562815925396318565": ["convolution_gpu_yxfb_yxio_b16",2], + "14795618530175274538": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "13815395589135469450": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5479761740065152589": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "11999246609107242706": ["convolution_gpu_bfyx_gemm_like",2], + "15924916465272239832": ["convolution_gpu_bfyx_gemm_like",2], + "2629918844315184499": ["convolution_gpu_yxfb_yxio_b16",2], + "3442845193734599342": ["convolution_gpu_yxfb_yxio_b16",2], + "2920322372993101148": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13011676362747785816": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16188473537674428539": ["convolution_gpu_yxfb_yxio_b16",2], + "7210854698870587826": ["convolution_gpu_yxfb_yxio_b16",2], + "3336076058264596420": ["convolution_gpu_bfyx_gemm_like",2], + "4988480452582288323": ["convolution_gpu_yxfb_yxio_b16",2], + "9947449295659685973": ["convolution_gpu_bfyx_gemm_like",2], + "1646638859396929303": ["convolution_gpu_yxfb_yxio_b16",2], + "9079203986633151014": ["convolution_gpu_bfyx_1x1",2], + "12489973984967168447": ["convolution_gpu_bfyx_1x1",2], + "3737576893817599311": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "2008424849669196225": ["convolution_gpu_bfyx_1x1",2], + "17255805293355120219": ["convolution_gpu_yxfb_yxio_b16",2], + "2321773209766424929": ["convolution_gpu_yxfb_yxio_b16",2], + "9495192057713157041": ["convolution_gpu_yxfb_yxio_b16",2], + "6428098122005804378": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "15991460001131903561": ["convolution_gpu_bfyx_gemm_like",2], + "745009493367761775": ["convolution_gpu_bfyx_gemm_like",2], + "3406812365298442897": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "14066675688397331406": ["convolution_gpu_yxfb_yxio_b16",2], + "7447163906170805189": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3396731547696204011": ["convolution_gpu_yxfb_yxio_b16",2], + "10217182484138821482": ["convolution_gpu_yxfb_yxio_b16",2], + "883436333317162926": ["convolution_gpu_bfyx_1x1",2], + "12184235281888559274": ["convolution_gpu_yxfb_yxio_b16",2], + "14025678657541870252": ["convolution_gpu_yxfb_yxio_b16",2], + "4356817283284529593": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1068155851494601726": ["convolution_gpu_yxfb_yxio_b16",2], + "10598099730944525581": ["fully_connected_gpu_fb_io_b8_f8_vload",0], + "5056859994174498686": ["convolution_gpu_bfyx_gemm_like",2], + "17490471699618303993": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "1120455113299469776": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "10396788403466463989": ["convolution_gpu_yxfb_yxio_b16",2], + "10447947790216991304": ["convolution_gpu_bfyx_gemm_like",2], + "12972798847556569913": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13814086981499638596": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "708452703070938673": ["convolution_gpu_bfyx_os_iyx_osv16",713], + 
"17081449111821382308": ["convolution_gpu_yxfb_yxio_b16",2], + "7708321360699824256": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3725013268198063198": ["convolution_gpu_bfyx_1x1",2], + "6142707387281700290": ["convolution_gpu_bfyx_gemm_like",2], + "5312413491828906254": ["convolution_gpu_yxfb_yxio_b16",1], + "13766070202060785219": ["convolution_gpu_yxfb_yxio_b16",2], + "10318417166945621015": ["convolution_gpu_yxfb_yxio_b16",2], + "16184142990117192433": ["convolution_gpu_yxfb_yxio_b16",2], + "16577611471466452776": ["convolution_gpu_bfyx_gemm_like",2], + "1095959046309466012": ["convolution_gpu_yxfb_yxio_b16",2], + "16871004845988227014": ["convolution_gpu_bfyx_1x1",2], + "17179609670678746034": ["convolution_gpu_bfyx_gemm_like",0], + "875296362957469305": ["convolution_gpu_bfyx_gemm_like",1], + "14651159827389223108": ["convolution_gpu_bfyx_gemm_like",1], + "9576962489937466093": ["convolution_gpu_yxfb_yxio_b16",1], + "11418379777288974452": ["convolution_gpu_bfyx_gemm_like",2], + "9099720270958987421": ["convolution_gpu_bfyx_1x1",2], + "8040001390872143271": ["convolution_gpu_bfyx_gemm_like",2], + "11319799002723299753": ["convolution_gpu_yxfb_yxio_b16",2], + "13094402291968806996": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "541817615957967731": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "932195814187889636": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "14799012895945855878": ["convolution_gpu_yxfb_yxio_b16",2], + "3673781117412048086": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "2283157145557154450": ["convolution_gpu_bfyx_1x1",2], + "7330202944390548890": ["convolution_gpu_bfyx_gemm_like",2], + "10868287582480518153": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "14956246091163580499": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "14199158130218117084": ["convolution_gpu_bfyx_gemm_like",2], + "8618627241234406784": ["convolution_gpu_yxfb_yxio_b16",2], + "7232326270078161768": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "3932617680771387232": ["convolution_gpu_yxfb_yxio_b16",2], + "18080788888293706149": ["convolution_gpu_yxfb_yxio_b16",2], + "15154700439767512396": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "7843508201826629532": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "12903015669020591018": ["convolution_gpu_yxfb_yxio_b16",2], + "1963081583851864291": ["convolution_gpu_bfyx_gemm_like",1], + "2683304757433993300": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "16911464046178654033": ["convolution_gpu_bfyx_1x1",2], + "5313382805395362669": ["convolution_gpu_yxfb_yxio_b16",2], + "3680396164645753224": ["convolution_gpu_yxfb_yxio_b16",2], + "10656486867659934705": ["convolution_gpu_bfyx_os_iyx_osv16",854], + "7274179284676568361": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "15497797842820949408": ["convolution_gpu_bfyx_gemm_like",1], + "9601412379897937608": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "13509275050322423832": ["convolution_gpu_yxfb_yxio_b16",2], + "4802009650745059499": ["convolution_gpu_yxfb_yxio_b16",2], + "7604075520418038662": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11175936010605958812": ["convolution_gpu_yxfb_yxio_b16",2], + "6126073246053235472": ["convolution_gpu_yxfb_yxio_b16",2], + "16271675466919087248": ["convolution_gpu_yxfb_yxio_b16",2], + "18199824206329982249": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "8059328623525062913": ["convolution_gpu_bfyx_gemm_like",2], + "5550969016335082071": ["convolution_gpu_bfyx_gemm_like",2], + "8615481457481938667": ["convolution_gpu_bfyx_os_iyx_osv16",803], + 
"11498084465186986412": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12867590715338247144": ["convolution_gpu_yxfb_yxio_b16",2], + "11614353411428360211": ["convolution_gpu_yxfb_yxio_b16",2], + "14912119584313592912": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "16397733032387984819": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "6214624887470295152": ["convolution_gpu_bfyx_1x1",2], + "15534876725099279666": ["convolution_gpu_yxfb_yxio_b16",2], + "4353842547963164546": ["convolution_gpu_bfyx_1x1",2], + "10135458965276110244": ["convolution_gpu_bfyx_1x1",2], + "14026537760442360645": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "1450861513159359637": ["convolution_gpu_yxfb_yxio_b16",2], + "7837876599690110056": ["convolution_gpu_bfyx_gemm_like",2], + "1703738105910059846": ["convolution_gpu_yxfb_yxio_b16",2], + "5046089607609787258": ["convolution_gpu_yxfb_yxio_b16",2], + "3211956138512889433": ["convolution_gpu_yxfb_yxio_b16",2], + "4313392430539923574": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "12675840135830047968": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13668072006310741601": ["convolution_gpu_yxfb_yxio_b16",2], + "7840653268996892538": ["convolution_gpu_bfyx_gemm_like",2], + "7799984350284425885": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "8856888761246057127": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "2740885908397449753": ["convolution_gpu_yxfb_yxio_b16",2], + "12510951219501865365": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "1854612313463195535": ["convolution_gpu_yxfb_yxio_b16",1], + "3711525118850629466": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1318571118468536310": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "12600707101000510621": ["convolution_gpu_yxfb_yxio_b16",2], + "5940007433515335594": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5509395737020858006": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "16081386644309102158": ["convolution_gpu_bfyx_gemm_like",2], + "3603187029740446600": ["convolution_gpu_bfyx_gemm_like",2], + "9533360488591027707": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "15136770992109675092": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "12604104383683210104": ["convolution_gpu_bfyx_gemm_like",2], + "12305397676800089268": ["convolution_gpu_yxfb_yxio_b16",2], + "14487842225000203929": ["convolution_gpu_bfyx_gemm_like",2], + "2826762745628486040": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "13483175684542464385": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "11841034668170849494": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "11872464450773754851": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "4216366893358625960": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "1944461047787586724": ["convolution_gpu_yxfb_yxio_b16",2], + "5352061583962489055": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6512987867462549101": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "8975333906619899020": ["convolution_gpu_bfyx_gemm_like",2], + "15006321421735686121": ["convolution_gpu_bfyx_gemm_like",2], + "8540111719936129376": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "12031180482028822765": ["convolution_gpu_bfyx_gemm_like",1], + "5886032409392368342": ["convolution_gpu_yxfb_yxio_b16",2], + "5364060938737428149": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "5440983284868981549": ["convolution_gpu_bfyx_gemm_like",2], + "9043982883185435219": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "7155796826953849982": ["convolution_gpu_yxfb_yxio_b16",2], + "1635121016109328853": 
["convolution_gpu_bfyx_os_iyx_osv16",636], + "5558136691773431495": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "3239033622277917802": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "9299299311101549958": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "8651641584737798174": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "12051398350382954787": ["convolution_gpu_yxfb_yxio_b16",2], + "15528692642731712121": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "6107700818115209289": ["convolution_gpu_yxfb_yxio_b16",2], + "18337762134908554532": ["convolution_gpu_yxfb_yxio_b16",2], + "6784853321527374515": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "12024817951074673335": ["convolution_gpu_bfyx_1x1",2], + "9763754389347695094": ["convolution_gpu_yxfb_yxio_b16",2], + "11265472910579659280": ["convolution_gpu_bfyx_gemm_like",1], + "6300691162962736560": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7441188930428385142": ["convolution_gpu_yxfb_yxio_b16",2], + "3662747857062156477": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2180039710632160943": ["convolution_gpu_yxfb_yxio_b16",2], + "4803370483104261655": ["convolution_gpu_bfyx_gemm_like",1], + "8185193068790365354": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "12937333118472722002": ["convolution_gpu_bfyx_gemm_like",2], + "11658751382892761740": ["convolution_gpu_yxfb_yxio_b16",2], + "11324651029379152442": ["convolution_gpu_bfyx_1x1",2], + "693883892843558363": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "7056030150365552588": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "487214150851213303": ["convolution_gpu_bfyx_gemm_like",0], + "10894058425957901202": ["convolution_gpu_bfyx_1x1",2], + "10509933181132310969": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2495655464941634884": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "789202969657820559": ["convolution_gpu_yxfb_yxio_b16",2], + "5099947445888268507": ["convolution_gpu_yxfb_yxio_b16",1], + "5657471280535146301": ["convolution_gpu_bfyx_gemm_like",1], + "6931953332823066530": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "10702465758376061967": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16436006771518788093": ["fully_connected_gpu_bs_f_bsv16_af8_vload",0], + "155962454315573087": ["convolution_gpu_yxfb_yxio_b16",2], + "11657946392097042544": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "13553263424160050064": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "16101625311127899143": ["convolution_gpu_bfyx_gemm_like",2], + "10130171279527667782": ["convolution_gpu_bfyx_gemm_like",2], + "2877521658768725103": ["convolution_gpu_bfyx_gemm_like",1], + "838726445796308454": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "905526102343710614": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "2042946928570163140": ["convolution_gpu_yxfb_yxio_b16",2], + "14680730265621679042": ["convolution_gpu_bfyx_os_iyx_osv16",758], + "13512863534076172940": ["convolution_gpu_bfyx_gemm_like",2], + "16243196137456624852": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "4438526427135833402": ["convolution_gpu_yxfb_yxio_b16",2], + "14585000863294748739": ["convolution_gpu_bfyx_gemm_like",0], + "5950285227163574810": ["convolution_gpu_bfyx_os_iyx_osv16",528], + "2762489653422414995": ["convolution_gpu_bfyx_gemm_like",2], + "15308196586729169691": ["convolution_gpu_yxfb_yxio_b16",2], + "8913823292181409151": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "142270860894725256": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "4937688558707451907": ["convolution_gpu_yxfb_yxio_b16",2], + "10956668791040094584": 
["convolution_gpu_yxfb_yxio_b16",2], + "6578517057140155080": ["convolution_gpu_yxfb_yxio_b16",2], + "2215570184121152738": ["convolution_gpu_bfyx_gemm_like",1], + "14458851250685872417": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "14398854364550406668": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10316451248440741901": ["convolution_gpu_bfyx_gemm_like",2], + "584086621952390547": ["convolution_gpu_bfyx_gemm_like",2], + "6367371992814643260": ["convolution_gpu_yxfb_yxio_b16",2], + "4802014352392262053": ["convolution_gpu_yxfb_yxio_b16",2], + "16139615240471264488": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "7405315582091905378": ["convolution_gpu_yxfb_yxio_b16",2], + "16882092367103683293": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12166852830214895457": ["convolution_gpu_bfyx_1x1",2], + "8954488655859677891": ["convolution_gpu_yxfb_yxio_b16",2], + "17084977396231597605": ["convolution_gpu_bfyx_gemm_like",1], + "5346898505346646714": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "10789133352712755945": ["convolution_gpu_yxfb_yxio_b16",2], + "18253784177599134876": ["convolution_gpu_bfyx_os_iyx_osv16",562], + "11731277083374465361": ["convolution_gpu_yxfb_yxio_b16",1], + "9127827617126714860": ["fully_connected_gpu_fb_io_b8_f8_vload",1], + "9811086682271990794": ["convolution_gpu_yxfb_yxio_b16",2], + "10486000767830001094": ["convolution_gpu_bfyx_1x1",2], + "12686015414958770329": ["convolution_gpu_bfyx_gemm_like",2], + "17649961873981897621": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "8952733400567254769": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5295693108687178880": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "1771347579022727189": ["convolution_gpu_yxfb_yxio_b16",2], + "8325903548627432": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9999425239167488495": ["convolution_gpu_bfyx_gemm_like",2], + "17958575161092859465": ["convolution_gpu_yxfb_yxio_b16",2], + "2161052921317193579": ["convolution_gpu_bfyx_gemm_like",2], + "216603198215625772": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "2912098199463107173": ["convolution_gpu_bfyx_1x1",2], + "1587501521145162454": ["convolution_gpu_bfyx_gemm_like",2], + "15822546325822628634": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "9649445293567537596": ["convolution_gpu_yxfb_yxio_b16",2], + "6744044115114192916": ["convolution_gpu_yxfb_yxio_b16",2], + "17219920118109316867": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "14123081378489325832": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "9482749589540764069": ["convolution_gpu_yxfb_yxio_b16",2], + "6603778920476932267": ["convolution_gpu_bfyx_gemm_like",1], + "8976238022515713641": ["convolution_gpu_bfyx_gemm_like",2], + "14840851809642905875": ["convolution_gpu_yxfb_yxio_b16",2], + "15131258379753113816": ["convolution_gpu_yxfb_yxio_b16",2], + "4118073384938355655": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "10292585962794261197": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "5941095082097535176": ["convolution_gpu_bfyx_gemm_like",2], + "11936530628363072904": ["convolution_gpu_bfyx_gemm_like",2], + "3631332752661975859": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "3219408878901707426": ["convolution_gpu_bfyx_gemm_like",1], + "18091349188280218186": ["convolution_gpu_yxfb_yxio_b16",2], + "10820312036555742020": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "16490405739040977260": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "17377204616846724192": ["convolution_gpu_bfyx_gemm_like",2], + "7998455776901877973": ["convolution_gpu_yxfb_yxio_b16",2], + 
"4562591438007476419": ["convolution_gpu_bfyx_gemm_like",2], + "11113125355390956764": ["convolution_gpu_yxfb_yxio_b16",2], + "18040183500393090505": ["convolution_gpu_yxfb_yxio_b16",2], + "16316483048621486077": ["convolution_gpu_bfyx_gemm_like",2], + "10280619408766255552": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "4279062247055842367": ["convolution_gpu_bfyx_gemm_like",1], + "16267531927647687641": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "8036474422877454869": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6699877220571254719": ["convolution_gpu_yxfb_yxio_b16",1], + "9834941975457910988": ["convolution_gpu_yxfb_yxio_b16",2], + "15449650271741732512": ["convolution_gpu_yxfb_yxio_b16",2], + "17170858505976681742": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "16364494883229084045": ["convolution_gpu_bfyx_os_iyx_osv16",1068], + "8690196189594920365": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "15225354446874994535": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "14789782064157699768": ["convolution_gpu_yxfb_yxio_b16",2], + "6290584630172122012": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "208915399644127739": ["convolution_gpu_bfyx_gemm_like",1], + "14985236276429954162": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5156033406916344703": ["convolution_gpu_bfyx_gemm_like",1], + "8859895010324601937": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "14135593723444205032": ["convolution_gpu_bfyx_gemm_like",0], + "7105219760750474587": ["convolution_gpu_yxfb_yxio_b16",2], + "4656068024153891922": ["convolution_gpu_yxfb_yxio_b16",2], + "16567638487719493784": ["convolution_gpu_bfyx_os_iyx_osv16",609], + "11637325834858582585": ["convolution_gpu_bfyx_gemm_like",2], + "401304652492444430": ["convolution_gpu_bfyx_gemm_like",2], + "2920840796593281126": ["convolution_gpu_bfyx_gemm_like",2], + "7700321970687976931": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "18226737525116147628": ["convolution_gpu_yxfb_yxio_b16",2], + "10358170616931426647": ["convolution_gpu_yxfb_yxio_b16",2], + "17040537179740138304": ["convolution_gpu_yxfb_yxio_b16",2], + "10912495395422146386": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "225809055928705881": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "15206249797344242666": ["convolution_gpu_yxfb_yxio_b16",1], + "13646974121952099172": ["convolution_gpu_bfyx_gemm_like",2], + "84595904778810418": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "12713821004129672990": ["convolution_gpu_yxfb_yxio_b16",2], + "11706446082856895571": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "15223164574152266895": ["convolution_gpu_bfyx_1x1",2], + "8787438180071123604": ["convolution_gpu_bfyx_gemm_like",1], + "1880137091477870982": ["convolution_gpu_yxfb_yxio_b16",2], + "4717620775314557374": ["convolution_gpu_bfyx_gemm_like",2], + "7913076120244203725": ["convolution_gpu_bfyx_gemm_like",2], + "13326492157370934949": ["convolution_gpu_bfyx_gemm_like",2], + "9182260316973872633": ["convolution_gpu_yxfb_yxio_b16",2], + "6888842613779488104": ["convolution_gpu_bfyx_1x1",2], + "12267555886404772991": ["convolution_gpu_yxfb_yxio_b16",2], + "14421898375873029115": ["convolution_gpu_bfyx_1x1",2], + "13291402786934990349": ["convolution_gpu_yxfb_yxio_b16",2], + "8900977003907025003": ["convolution_gpu_yxfb_yxio_b16",2], + "2581414750854621875": ["convolution_gpu_bfyx_gemm_like",2], + "1044889231088602677": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "14312549767853703411": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "6817494598328071314": ["convolution_gpu_bfyx_gemm_like",2], + 
"8809017515482311843": ["convolution_gpu_bfyx_os_iyx_osv16",1036], + "14041970415787494000": ["convolution_gpu_yxfb_yxio_b16",2], + "12985942652866621579": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "3022939690177474442": ["convolution_gpu_yxfb_yxio_b16",2], + "2031558560788449957": ["convolution_gpu_yxfb_yxio_b16",2], + "12487879163561616870": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13821224753538037982": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "11007175027950132719": ["convolution_gpu_bfyx_os_iyx_osv16",7], + "17321934232458063571": ["convolution_gpu_yxfb_yxio_b16",2], + "2609346307827449622": ["convolution_gpu_yxfb_yxio_b16",2], + "14502856487639608696": ["convolution_gpu_bfyx_gemm_like",2], + "1905758333157310570": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2247717767819293683": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13960388312976163971": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "17041468169694105561": ["convolution_gpu_yxfb_yxio_b16",2], + "57372993988016244": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "4731836216299455047": ["convolution_gpu_yxfb_yxio_b16",2], + "5047972486012090625": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "8746621720912032145": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "10041205516209288381": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "1298596164164324360": ["convolution_gpu_yxfb_yxio_b16",2], + "12501619443242354860": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "17329287216741045059": ["convolution_gpu_bfyx_gemm_like",2], + "12259844988981080505": ["convolution_gpu_bfyx_gemm_like",2], + "13527018660229167386": ["convolution_gpu_yxfb_yxio_b16",2], + "1580344438642032807": ["convolution_gpu_bfyx_gemm_like",1], + "12868739680413736657": ["convolution_gpu_bfyx_os_iyx_osv16",970], + "7280502812960451465": ["convolution_gpu_yxfb_yxio_b16",2], + "17258128299721452811": ["convolution_gpu_yxfb_yxio_b16",2], + "14248239982355212178": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "7870154008378361670": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "4216958486055161753": ["convolution_gpu_bfyx_gemm_like",2], + "1867337342417952506": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "15155676074658242659": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "15409184364121627414": ["convolution_gpu_yxfb_yxio_b16",2], + "12879367655655932174": ["convolution_gpu_yxfb_yxio_b16",2], + "528295119724008711": ["convolution_gpu_bfyx_os_iyx_osv16",430], + "13077917010686381919": ["convolution_gpu_yxfb_yxio_b16",2], + "13839116996827687373": ["convolution_gpu_bfyx_gemm_like",1], + "2967481531952454828": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "3117175697326325371": ["convolution_gpu_bfyx_os_iyx_osv16",505], + "8362179886017398479": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "11931568365395665142": ["convolution_gpu_bfyx_gemm_like",2], + "14412158605670555579": ["convolution_gpu_bfyx_os_iyx_osv16",153], + "7008873036126556197": ["convolution_gpu_yxfb_yxio_b16",2], + "14896875712028630045": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "2761862049452027986": ["convolution_gpu_yxfb_yxio_b16",2], + "17854208422879910606": ["convolution_gpu_bfyx_gemm_like",1], + "8470959792634864749": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8907982643256296667": ["convolution_gpu_bfyx_1x1",2], + "6934241437968723825": ["convolution_gpu_yxfb_yxio_b16",1], + "14132543442791497311": ["convolution_gpu_yxfb_yxio_b16",2], + "10471519687597963116": ["convolution_gpu_bfyx_gemm_like",1], + "10148067979123062638": ["convolution_gpu_yxfb_yxio_b16",2], + 
"18087356517015630281": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "6856130385095139346": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "9440117898128288296": ["convolution_gpu_bfyx_gemm_like",2], + "6071668124835539929": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2949545414911764346": ["convolution_gpu_yxfb_yxio_b16",2], + "12643643553436503069": ["convolution_gpu_yxfb_yxio_b16",2], + "4154830034576950123": ["convolution_gpu_yxfb_yxio_b16",2], + "15675903059949404837": ["convolution_gpu_bfyx_1x1",2], + "16851082749395991194": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "3049097498155857895": ["convolution_gpu_yxfb_yxio_b16",2], + "7585785802379042424": ["convolution_gpu_bfyx_1x1",2], + "12822126914959112382": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "5578991261564497604": ["convolution_gpu_yxfb_yxio_b16",2], + "11069983292783104310": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "7119182041840303390": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15997145184054496085": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10141558851476164734": ["convolution_gpu_yxfb_yxio_b16",2], + "11942424927004660476": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "7590734607006912544": ["convolution_gpu_yxfb_yxio_b16",2], + "8791285622784082122": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "11376953876369788199": ["convolution_gpu_yxfb_yxio_b16",1], + "8731079912830889828": ["convolution_gpu_yxfb_yxio_b16",2], + "17990326690659802090": ["convolution_gpu_yxfb_yxio_b16",2], + "13856271274572142709": ["convolution_gpu_bfyx_gemm_like",1], + "8733371726903473932": ["convolution_gpu_yxfb_yxio_b16",2], + "12194037100109755112": ["convolution_gpu_bfyx_gemm_like",2], + "1251525426317284548": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "11007944497812650617": ["convolution_gpu_bfyx_gemm_like",2], + "4135975804549022456": ["convolution_gpu_yxfb_yxio_b16",2], + "9076758673133996959": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "3001162215282339268": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "6981294059746462667": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "17381516856910544374": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "17762455138615317884": ["convolution_gpu_yxfb_yxio_b16",2], + "9813748068195103720": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "7603872175048237237": ["convolution_gpu_bfyx_1x1",2], + "13468081302022888489": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "3962138884698789654": ["convolution_gpu_yxfb_yxio_b16",2], + "7814543122045448412": ["convolution_gpu_bfyx_gemm_like",2], + "7548031489690889629": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "12818012741490629493": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "13961773444580398856": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16000753982895054944": ["convolution_gpu_bfyx_gemm_like",1], + "9170163372548895531": ["convolution_gpu_yxfb_yxio_b16",2], + "3171354702636014224": ["convolution_gpu_yxfb_yxio_b16",2], + "5672464491301994292": ["convolution_gpu_bfyx_gemm_like",1], + "6825390996679224270": ["convolution_gpu_yxfb_yxio_b16",2], + "897253033961107413": ["convolution_gpu_yxfb_yxio_b16",2], + "512446355173752600": ["convolution_gpu_yxfb_yxio_b16",1], + "13648761167622654288": ["fully_connected_gpu_fb_oi_ref",2], + "10760094119259477688": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "4165019140664090799": ["convolution_gpu_yxfb_yxio_b16",2], + "17935612508319394087": ["convolution_gpu_yxfb_yxio_b16",2], + "877901260688090160": ["convolution_gpu_yxfb_yxio_b16",2], + "17525564757769958678": 
["convolution_gpu_bfyx_os_iyx_osv16",585], + "17800115051456107658": ["convolution_gpu_yxfb_yxio_b16",2], + "4615708568396290002": ["convolution_gpu_bfyx_1x1",2], + "6509758095668864050": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "13611054146745413536": ["convolution_gpu_bfyx_gemm_like",1], + "4974320417566990034": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "12348602762263193288": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "7349168847581850619": ["convolution_gpu_yxfb_yxio_b16",2], + "8161520217142313996": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "12068797674575015662": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "12107262410635772120": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "10747101719272611563": ["convolution_gpu_yxfb_yxio_b16",2], + "5832851215142537445": ["convolution_gpu_yxfb_yxio_b16",2], + "9332701118402940384": ["convolution_gpu_yxfb_yxio_b16",2], + "5795524493577277985": ["convolution_gpu_yxfb_yxio_b16",2], + "14973431782875808802": ["convolution_gpu_bfyx_gemm_like",2], + "16426655160932259558": ["convolution_gpu_yxfb_yxio_b16",2], + "14331554754171207866": ["convolution_gpu_bfyx_gemm_like",2], + "3928356751040028375": ["convolution_gpu_bfyx_gemm_like",2], + "3451309062150982886": ["convolution_gpu_yxfb_yxio_b16",2], + "12389854459474697184": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "13078401519973360182": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13565691057064774487": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "3815222814331650224": ["convolution_gpu_yxfb_yxio_b16",2], + "4201057957682777280": ["convolution_gpu_yxfb_yxio_b16",2], + "14835309921389262864": ["convolution_gpu_bfyx_1x1",2], + "14943031375539993004": ["convolution_gpu_yxfb_yxio_b16",2], + "14204609663091442879": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "5968129546023764583": ["convolution_gpu_yxfb_yxio_b16",2], + "15109847707903824859": ["convolution_gpu_bfyx_1x1",2], + "1287490919205560806": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "5321698540631249776": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "17759505449240263390": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "16898785030254336705": ["convolution_gpu_yxfb_yxio_b16",2], + "18385086614524985975": ["convolution_gpu_yxfb_yxio_b16",2], + "17444003685761357480": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "18302892230881285207": ["convolution_gpu_bfyx_gemm_like",1], + "1952863937205473292": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "5595779343671478945": ["convolution_gpu_yxfb_yxio_b16",2], + "9967101735808367971": ["convolution_gpu_bfyx_1x1",2], + "14324166291904435508": ["convolution_gpu_yxfb_yxio_b16",2], + "7689320135952025041": ["convolution_gpu_bfyx_gemm_like",0], + "5682190700442712936": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "9955939178447682108": ["convolution_gpu_bfyx_1x1",2], + "11158789938857558596": ["convolution_gpu_bfyx_1x1",2], + "14447191095937730964": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "18080848057281093190": ["convolution_gpu_yxfb_yxio_b16",2], + "18071280811713424504": ["convolution_gpu_yxfb_yxio_b16",2], + "2394023805427701338": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "1154763947184432124": ["convolution_gpu_yxfb_yxio_b16",2], + "9827177798112814604": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "4089043893927493060": ["convolution_gpu_yxfb_yxio_b16",2], + "4433497906256257606": ["convolution_gpu_yxfb_yxio_b16",1], + "10751633292301177132": ["convolution_gpu_yxfb_yxio_b16",2], + "13809898858049445969": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + 
"12229574562535756991": ["convolution_gpu_bfyx_gemm_like",1], + "14345755557418971954": ["convolution_gpu_yxfb_yxio_b16",2], + "3202085450628781999": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "4156384238797998294": ["convolution_gpu_bfyx_os_iyx_osv16",176], + "5834245904292669645": ["convolution_gpu_bfyx_os_iyx_osv16",179], + "5008350851224686853": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "13760645810144930270": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15486917753097743853": ["convolution_gpu_bfyx_1x1",2], + "7099035779223341587": ["convolution_gpu_yxfb_yxio_b16",2], + "10879218241103462088": ["convolution_gpu_bfyx_gemm_like",1], + "15635250842093678965": ["convolution_gpu_yxfb_yxio_b16",2], + "10419440621736450993": ["convolution_gpu_yxfb_yxio_b16",2], + "12961109385388101976": ["convolution_gpu_yxfb_yxio_b16",0], + "13558656230312558247": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "2847490224869294354": ["convolution_gpu_bfyx_gemm_like",2], + "9954050478761346921": ["convolution_gpu_bfyx_gemm_like",2], + "7600296832974673294": ["convolution_gpu_yxfb_yxio_b16",2], + "15082818876354718849": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "4894469114343061704": ["convolution_gpu_yxfb_yxio_b16",2], + "16989896550094613437": ["convolution_gpu_yxfb_yxio_b16",1], + "2984726467649419856": ["convolution_gpu_bfyx_gemm_like",2], + "1778345646142852816": ["convolution_gpu_bfyx_gemm_like",2], + "13121297281694293907": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "1008476023750261156": ["convolution_gpu_bfyx_1x1",2], + "6388117241933586388": ["convolution_gpu_bfyx_gemm_like",2], + "10645057595080511813": ["convolution_gpu_yxfb_yxio_b16",2], + "9702618600245321109": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "16065744898134487748": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "5062815196458225737": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "426827405952656362": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "2917999294360728537": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5648658688155716974": ["convolution_gpu_bfyx_1x1",2], + "11761545976388416063": ["convolution_gpu_yxfb_yxio_b16",2], + "12352923639732112511": ["convolution_gpu_bfyx_os_iyx_osv16",79], + "10178145641713631806": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17228810554159747400": ["convolution_gpu_bfyx_gemm_like",2], + "14754849694687093032": ["convolution_gpu_yxfb_yxio_b16",2], + "10002044609138970243": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "3102538312627892960": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "4435224497850514394": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8078028207842958010": ["convolution_gpu_yxfb_yxio_b16",2], + "17907223570737272640": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "2964705957088952872": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "6341363789473021047": ["convolution_gpu_yxfb_yxio_b16",2], + "16765994345605657100": ["convolution_gpu_bfyx_1x1",2], + "10022487076451608714": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "7902473777019759045": ["convolution_gpu_bfyx_gemm_like",1], + "17096735128393723245": ["convolution_gpu_yxfb_yxio_b16",2], + "17498483343394902796": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "6260115080574637314": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "3010520839193613803": ["convolution_gpu_yxfb_yxio_b16",2], + "7451154080124553318": ["convolution_gpu_yxfb_yxio_b16",2], + "1040411949730118556": ["convolution_gpu_yxfb_yxio_b16",2], + "3141773224039276177": ["convolution_gpu_bfyx_1x1",2], + "15820005010263193043": 
["convolution_gpu_yxfb_yxio_b16",2], + "4980217316169616839": ["convolution_gpu_bfyx_1x1",2], + "7923576965630818418": ["convolution_gpu_yxfb_yxio_b16",2], + "2321148334382088982": ["convolution_gpu_bfyx_gemm_like",2], + "10128390168715530898": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "1641111108888949123": ["convolution_gpu_yxfb_yxio_b16",2], + "13234055353608734080": ["convolution_gpu_yxfb_yxio_b16",2], + "1223196405651730260": ["convolution_gpu_yxfb_yxio_b16",2], + "13478984039708550410": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "3217295012596892181": ["convolution_gpu_yxfb_yxio_b16",2], + "16606674008248299103": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "9731370183088819573": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "17025182465337728023": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7172357320005702833": ["convolution_gpu_yxfb_yxio_b16",2], + "2066731703492755469": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "13585163747565192884": ["convolution_gpu_bfyx_gemm_like",2], + "5112480593385320005": ["convolution_gpu_yxfb_yxio_b16",2], + "9711184878666366204": ["convolution_gpu_yxfb_yxio_b16",2], + "2578325663193624576": ["convolution_gpu_yxfb_yxio_b16",2], + "2524029454785583409": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "17806712457019493207": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "13020331397245585657": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "15047676717402283805": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "13325287783358291692": ["convolution_gpu_yxfb_yxio_b16",2], + "15866935886105967122": ["convolution_gpu_yxfb_yxio_b16",1], + "15474155528481683394": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "10753540518493641553": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "9078447949109922472": ["convolution_gpu_yxfb_yxio_b16",2], + "4450409744922989123": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "7372956570616880244": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "3868149953087814447": ["convolution_gpu_bfyx_gemm_like",2], + "14540578324750869319": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17052161869014993719": ["convolution_gpu_yxfb_yxio_b16",2], + "721174714308243785": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "14817801788424046035": ["convolution_gpu_yxfb_yxio_b16",2], + "3286496836813087881": ["convolution_gpu_yxfb_yxio_b16",2], + "7482459536338668149": ["convolution_gpu_yxfb_yxio_b16",2], + "16794854619854992714": ["convolution_gpu_yxfb_yxio_b16",2], + "10558609844937234631": ["convolution_gpu_yxfb_yxio_b16",2], + "16235115911229280717": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "13193898459027972719": ["convolution_gpu_yxfb_yxio_b16",2], + "1148949417144436507": ["convolution_gpu_yxfb_yxio_b16",2], + "5602328731722824868": ["convolution_gpu_yxfb_yxio_b16",2], + "10015368609444108372": ["convolution_gpu_yxfb_yxio_b16",2], + "11799180632798787251": ["convolution_gpu_yxfb_yxio_b16",2], + "14766694310604777253": ["convolution_gpu_yxfb_yxio_b16",2], + "2832268621630415376": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "2273992727647793692": ["convolution_gpu_bfyx_gemm_like",1], + "7338229552985076723": ["convolution_gpu_bfyx_gemm_like",2], + "2439993891369206440": ["convolution_gpu_bfyx_1x1",2], + "101387140804297623": ["convolution_gpu_yxfb_yxio_b16",1], + "9530116228032101908": ["convolution_gpu_bfyx_1x1",2], + "15031155621982459860": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "16304192736281226143": ["convolution_gpu_yxfb_yxio_b16",1], + "15104727000375811836": ["convolution_gpu_yxfb_yxio_b16",2], + "2058364830449635556": 
["convolution_gpu_yxfb_yxio_b16",2], + "6648876837655776653": ["convolution_gpu_bfyx_1x1",2], + "1914964404168211864": ["convolution_gpu_bfyx_gemm_like",2], + "13454265023861566476": ["convolution_gpu_bfyx_gemm_like",2], + "6799631962511042762": ["convolution_gpu_yxfb_yxio_b16",2], + "9162359935098885411": ["convolution_gpu_yxfb_yxio_b16",2], + "15148442194461613102": ["fully_connected_gpu_fb_io_block_fp16",2], + "913861052717410566": ["convolution_gpu_yxfb_yxio_b16",2], + "5958300749101873980": ["convolution_gpu_yxfb_yxio_b16",2], + "13369603621524676979": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18436249934780056991": ["convolution_gpu_bfyx_gemm_like",2], + "3429844423226609965": ["convolution_gpu_bfyx_gemm_like",2], + "17318287523550546026": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7933217973342728190": ["convolution_gpu_yxfb_yxio_b16",2], + "4861982518177129729": ["convolution_gpu_bfyx_os_iyx_osv16",572], + "9659814105483633858": ["convolution_gpu_yxfb_yxio_b16",2], + "10724501418439612080": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "16218339663410630711": ["convolution_gpu_bfyx_gemm_like",2], + "2114232149447438823": ["convolution_gpu_bfyx_gemm_like",2], + "11564071490267241224": ["convolution_gpu_yxfb_yxio_b16",2], + "13820498543284008286": ["convolution_gpu_bfyx_gemm_like",2], + "10981374120597916521": ["convolution_gpu_yxfb_yxio_b16",2], + "1551596771935253711": ["convolution_gpu_bfyx_gemm_like",1], + "1418595171949196661": ["convolution_gpu_bfyx_gemm_like",2], + "2089730611490367290": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12741457056869452536": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4670487436469119872": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9169324504353459004": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10325138269934303618": ["convolution_gpu_yxfb_yxio_b16",2], + "8490260671996115530": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "1192279884248226739": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "17011363406405852347": ["convolution_gpu_bfyx_gemm_like",2], + "7126667413990834481": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "9452094307760005150": ["convolution_gpu_bfyx_gemm_like",2], + "2840794055129352139": ["convolution_gpu_yxfb_yxio_b16",2], + "8251544171504007740": ["convolution_gpu_bfyx_gemm_like",2], + "474139120607442270": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3987482581128838173": ["convolution_gpu_yxfb_yxio_b16",2], + "3501882025888946886": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "1117836569328440439": ["convolution_gpu_yxfb_yxio_b16",2], + "13702254392810961772": ["convolution_gpu_yxfb_yxio_b16",2], + "12780116250427776647": ["convolution_gpu_yxfb_yxio_b16",2], + "7737977992444172757": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "7095629088416100928": ["convolution_gpu_bfyx_gemm_like",2], + "3155353791103196186": ["convolution_gpu_yxfb_yxio_b16",2], + "7992444232916226938": ["convolution_gpu_yxfb_yxio_b16",2], + "15932838442166411183": ["convolution_gpu_yxfb_yxio_b16",2], + "17889864541794448203": ["convolution_gpu_bfyx_1x1",2], + "12028665820838352309": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13328583512713703122": ["convolution_gpu_yxfb_yxio_b16",2], + "9580986168276580598": ["convolution_gpu_bfyx_gemm_like",2], + "7926989875988735079": ["convolution_gpu_yxfb_yxio_b16",2], + "12877601016766418505": ["convolution_gpu_bfyx_gemm_like",0], + "17961702508543961900": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "14050124896329573468": ["convolution_gpu_bfyx_direct_10_12_16",2], + 
"16574710115918192418": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "12107079280128343726": ["convolution_gpu_yxfb_yxio_b16",2], + "994252691216116396": ["convolution_gpu_yxfb_yxio_b16",2], + "4084026445911476156": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "10785966734346479177": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "7777279468029216688": ["convolution_gpu_yxfb_yxio_b16",2], + "11109044986816563101": ["convolution_gpu_yxfb_yxio_b16",2], + "396580837423299119": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6756771670011959646": ["convolution_gpu_bfyx_gemm_like",2], + "981803877097233095": ["convolution_gpu_yxfb_yxio_b16",1], + "12477315042623518609": ["convolution_gpu_yxfb_yxio_b16",2], + "12892265081710606252": ["convolution_gpu_yxfb_yxio_b16",2], + "3782308167335660154": ["convolution_gpu_yxfb_yxio_b16",2], + "87031578643428011": ["convolution_gpu_bfyx_1x1",2], + "12308956927236847009": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6458189051305803360": ["convolution_gpu_yxfb_yxio_b16",2], + "6323083153920795679": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "11330591026581463934": ["convolution_gpu_bfyx_gemm_like",2], + "6159729136505378486": ["convolution_gpu_yxfb_yxio_b16",2], + "2856601829807186494": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3409043224171087168": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "5209144536543011657": ["convolution_gpu_yxfb_yxio_b16",2], + "14281154151197472605": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "143667964449473415": ["convolution_gpu_yxfb_yxio_b16",2], + "6721354194352192662": ["convolution_gpu_yxfb_yxio_b16",2], + "16271970578584267980": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8079914471491171372": ["convolution_gpu_yxfb_yxio_b16",2], + "1539677456611270609": ["convolution_gpu_yxfb_yxio_b16",2], + "10861769381993948050": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "10009796094612770326": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "3856976081672275637": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "17192352762166764393": ["convolution_gpu_yxfb_yxio_b16",2], + "14931590390643373866": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12561177248542630652": ["convolution_gpu_yxfb_yxio_b16",2], + "14971506154649368216": ["convolution_gpu_yxfb_yxio_b16",0], + "12917241193304093727": ["convolution_gpu_bfyx_gemm_like",2], + "8584375748627260395": ["convolution_gpu_yxfb_yxio_b16",2], + "1419073145594317633": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "3646228701104397128": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "15783429395177379897": ["convolution_gpu_yxfb_yxio_b16",2], + "11782188262748842182": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "17726079670612220433": ["convolution_gpu_bfyx_gemm_like",0], + "6318228858846223186": ["convolution_gpu_bfyx_1x1",2], + "9152433123828445089": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "15128816312559638985": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "18067291256808591467": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "5337351591182109481": ["convolution_gpu_bfyx_os_iyx_osv16",131], + "8931169575495985034": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "14216513246096503793": ["convolution_gpu_yxfb_yxio_b16",2], + "13123709697607309884": ["convolution_gpu_yxfb_yxio_b16",2], + "11073090858361674041": ["convolution_gpu_yxfb_yxio_b16",2], + "9119618606914671839": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10747688146893187959": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11563892089503603030": ["convolution_gpu_yxfb_yxio_b16",1], + "5551484040302194648": 
["convolution_gpu_yxfb_yxio_b16",2], + "7183578232279711009": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "503369896500284129": ["convolution_gpu_bfyx_1x1",2], + "3419536918610303807": ["convolution_gpu_yxfb_yxio_b16",1], + "1212319037405620223": ["convolution_gpu_bfyx_gemm_like",2], + "15596408854298291433": ["convolution_gpu_yxfb_yxio_b16",2], + "2016932800158392200": ["convolution_gpu_yxfb_yxio_b16",2], + "15705908639736679687": ["convolution_gpu_yxfb_yxio_b16",2], + "5919454297699648428": ["convolution_gpu_yxfb_yxio_b16",2], + "11500205299047837289": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "5643920882179676695": ["convolution_gpu_yxfb_yxio_b16",2], + "6398819277350155011": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "17542035367134614728": ["convolution_gpu_yxfb_yxio_b16",2], + "9426665763007611385": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "8712136292276123857": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16828388628569377322": ["convolution_gpu_yxfb_yxio_b16",2], + "14801210545983960599": ["convolution_gpu_yxfb_yxio_b16",2], + "3341302541468955849": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16267682394077585279": ["convolution_gpu_bfyx_os_iyx_osv16",754], + "2052010432187897741": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "835053793432636355": ["convolution_gpu_yxfb_yxio_b16",2], + "3266557807508325807": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "13512059751838488458": ["convolution_gpu_yxfb_yxio_b16",2], + "12002302929446578025": ["convolution_gpu_yxfb_yxio_b16",2], + "5735703235236456131": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "9728611486592854529": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5040095338370816349": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "13803790014241837327": ["convolution_gpu_yxfb_yxio_b16",2], + "6254141935545262078": ["convolution_gpu_bfyx_gemm_like",1], + "104765009188090817": ["convolution_gpu_yxfb_yxio_b16",1], + "856877003890134554": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "10504318542015227515": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "14667793472412360981": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15773157615731010456": ["convolution_gpu_bfyx_gemm_like",2], + "15060535689318007173": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "2206771663823062080": ["convolution_gpu_bfyx_os_iyx_osv16",1096], + "3501667344669686338": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15070618248849566698": ["convolution_gpu_yxfb_yxio_b16",2], + "4744578087509837185": ["convolution_gpu_yxfb_yxio_b16",2], + "18417830391649460864": ["convolution_gpu_yxfb_yxio_b16",2], + "9441060601228656341": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5774841809066688068": ["fully_connected_gpu_fb_io_b8_f8_vload",1], + "12790570304622911607": ["convolution_gpu_bfyx_os_iyx_osv16",928], + "5115007207028125638": ["convolution_gpu_bfyx_gemm_like",2], + "12947341728489226671": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13412516623201653283": ["convolution_gpu_yxfb_yxio_b16",2], + "3782239800777370325": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "17224104246148265328": ["convolution_gpu_bfyx_gemm_like",2], + "14827882251752394500": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7223801044761006523": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15367649112776077240": ["convolution_gpu_yxfb_yxio_b16",2], + "15216108478837665623": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "5567628205735744449": ["convolution_gpu_yxfb_yxio_b16",2], + "359617184733439511": ["convolution_gpu_yxfb_yxio_b16",2], + "6948455759869670955": 
["convolution_gpu_yxfb_yxio_b16",2], + "11110173861174257158": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1692473411043262397": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "11164519756679631743": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17439276474731842060": ["convolution_gpu_yxfb_yxio_b16",2], + "17770104464900126615": ["convolution_gpu_bfyx_1x1",2], + "6727930402459775131": ["convolution_gpu_bfyx_gemm_like",2], + "9616636708366808604": ["convolution_gpu_bfyx_gemm_like",2], + "17822988909419777692": ["convolution_gpu_yxfb_yxio_b16",2], + "13541382855330226000": ["convolution_gpu_yxfb_yxio_b16",2], + "7585184325339753737": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "12801481303602178879": ["convolution_gpu_bfyx_gemm_like",1], + "2878824076934639346": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "2705031521944165712": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "16773645387243701837": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "9589942627115344216": ["convolution_gpu_bfyx_os_iyx_osv16",476], + "7273427309587902237": ["convolution_gpu_bfyx_gemm_like",1], + "1126499865206906037": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "1114679698826953542": ["convolution_gpu_yxfb_yxio_b16",2], + "4264284648458489052": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "13776178598632392721": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "3106922888635965020": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10774872391768741315": ["convolution_gpu_yxfb_yxio_b16",2], + "543472136359161929": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "14811603003184578943": ["convolution_gpu_bfyx_gemm_like",2], + "8527069404111265568": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7209217811135076623": ["convolution_gpu_bfyx_gemm_like",2], + "12585864429067596351": ["convolution_gpu_yxfb_yxio_b16",2], + "16425665058951535484": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "12279771749366327372": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "12589440296742583335": ["convolution_gpu_bfyx_1x1",2], + "12146979849998627283": ["convolution_gpu_bfyx_gemm_like",2], + "3163833930628348446": ["convolution_gpu_yxfb_yxio_b16",2], + "12495003066477974474": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "6310724136390087834": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "5032841266226405428": ["convolution_gpu_yxfb_yxio_b16",2], + "12810833895438895155": ["convolution_gpu_yxfb_yxio_b16",2], + "14999920879568237166": ["convolution_gpu_bfyx_1x1",2], + "9939234037869927090": ["convolution_gpu_bfyx_os_iyx_osv16",514], + "276313536076170391": ["convolution_gpu_bfyx_gemm_like",2], + "2920017342405650206": ["convolution_gpu_yxfb_yxio_b16",2], + "15117880293418979489": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "15746620724134970969": ["convolution_gpu_bfyx_1x1",2], + "1054954263090546905": ["convolution_gpu_yxfb_yxio_b16",2], + "968092788032627444": ["convolution_gpu_yxfb_yxio_b16",2], + "1157947252370351851": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "14444475853714164129": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "14289082888174784976": ["convolution_gpu_bfyx_gemm_like",2], + "14446688005815492020": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16770615142634470903": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "8556125699591344922": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "4104562704039821482": ["convolution_gpu_bfyx_1x1",2], + "18275601715050791851": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "875400109066360897": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "6733731409232284409": 
["convolution_gpu_bfyx_os_iyx_osv16",366], + "14389719202147508599": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "15329680728165965773": ["convolution_gpu_bfyx_gemm_like",2], + "15739278428190392018": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "9208964785762052001": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "6883767567034259453": ["convolution_gpu_yxfb_yxio_b16",2], + "12946540633035976364": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "14074996784220709246": ["convolution_gpu_yxfb_yxio_b16",2], + "3748621266324665764": ["convolution_gpu_yxfb_yxio_b16",1], + "2469579114592379040": ["convolution_gpu_bfyx_gemm_like",2], + "3742751561273931407": ["convolution_gpu_yxfb_yxio_b16",0], + "2561508262445368003": ["convolution_gpu_yxfb_yxio_b16",2], + "14316077757957132678": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "7134419022268272901": ["convolution_gpu_yxfb_yxio_b16",2], + "12308895602001600327": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1359720957005310113": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "16549854027697846882": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "4056971751486746551": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "15295172519920136220": ["convolution_gpu_yxfb_yxio_b16",2], + "13477548641580029772": ["convolution_gpu_bfyx_gemm_like",1], + "15820359925623438341": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12241130380766920378": ["convolution_gpu_yxfb_yxio_b16",2], + "8951040603784899163": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "11083993858285515074": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "17769159396346490074": ["convolution_gpu_yxfb_yxio_b16",2], + "2905979727479716212": ["convolution_gpu_yxfb_yxio_b16",2], + "3244803973821375252": ["convolution_gpu_yxfb_yxio_b16",2], + "1997392406402548974": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "4137755981477177003": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "8095675456938934982": ["convolution_gpu_yxfb_yxio_b16",2], + "10171373375072694210": ["convolution_gpu_bfyx_1x1",2], + "16911450336605071390": ["convolution_gpu_bfyx_1x1",2], + "15112393534380347357": ["convolution_gpu_yxfb_yxio_b16",2], + "7605652809856543211": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "7940369586324090841": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "12923298574715329852": ["convolution_gpu_yxfb_yxio_b16",2], + "8407012082034007985": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10808909442136736629": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "9119268982510599778": ["convolution_gpu_yxfb_yxio_b16",2], + "13569453018083742128": ["convolution_gpu_yxfb_yxio_b16",2], + "12234313962656804631": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "14677968346503677769": ["convolution_gpu_yxfb_yxio_b16",2], + "8543619733732987550": ["convolution_gpu_bfyx_gemm_like",1], + "11020315012951440351": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "17725637691681205907": ["convolution_gpu_bfyx_gemm_like",2], + "4617809377006148936": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "10194187012252949909": ["convolution_gpu_yxfb_yxio_b16",2], + "4521622755195947253": ["convolution_gpu_yxfb_yxio_b16",2], + "394778201589371681": ["convolution_gpu_bfyx_gemm_like",2], + "12391792381149655331": ["convolution_gpu_bfyx_gemm_like",2], + "10178171262128338408": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3308770992373192529": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "5740745357953479527": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10133054058562198093": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "2242829490403202087": 
["convolution_gpu_bfyx_direct_10_12_16",1], + "3325575565536567070": ["convolution_gpu_yxfb_yxio_b16",2], + "15222260213708019662": ["convolution_gpu_yxfb_yxio_b16",2], + "6167369758442930886": ["convolution_gpu_bfyx_gemm_like",2], + "13425251102263428554": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "9480653639044390919": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3911736807429733938": ["convolution_gpu_yxfb_yxio_b16",2], + "7369903937189508744": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "7770000755097925765": ["convolution_gpu_bfyx_1x1",2], + "11155444222714959508": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "10076885835791159907": ["convolution_gpu_yxfb_yxio_b16",0], + "16788162879714733906": ["convolution_gpu_yxfb_yxio_b16",2], + "17216583849049249733": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "2221145174704245189": ["convolution_gpu_bfyx_gemm_like",1], + "5762631094740444698": ["convolution_gpu_yxfb_yxio_b16",2], + "14366395926517590797": ["convolution_gpu_yxfb_yxio_b16",2], + "5017701748886087836": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "2722124265986526212": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "7742126547476513275": ["convolution_gpu_yxfb_yxio_b16",2], + "18129795023552968695": ["convolution_gpu_yxfb_yxio_b16",2], + "11698754846673268046": ["convolution_gpu_yxfb_yxio_b16",2], + "1152691534728260611": ["convolution_gpu_bfyx_1x1",2], + "4925720860007127584": ["convolution_gpu_yxfb_yxio_b16",2], + "9827201026276954165": ["convolution_gpu_yxfb_yxio_b16",2], + "3805667660217578518": ["convolution_gpu_yxfb_yxio_b16",2], + "14868677663932902695": ["convolution_gpu_bfyx_gemm_like",2], + "824380206255396866": ["convolution_gpu_yxfb_yxio_b16",2], + "13390197134230598693": ["convolution_gpu_yxfb_yxio_b16",2], + "17833304859352483840": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "1099404514975797315": ["convolution_gpu_yxfb_yxio_b16",2], + "13104509059416300615": ["convolution_gpu_bfyx_os_iyx_osv16",301], + "2147962310424425158": ["convolution_gpu_yxfb_yxio_b16",2], + "3343020946662226400": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "11730276873446857018": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8553491894663686698": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "15206381185687737007": ["convolution_gpu_bfyx_gemm_like",2], + "9920155432685318259": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "13471752029049484143": ["convolution_gpu_bfyx_gemm_like",2], + "15525903155475629518": ["convolution_gpu_bfyx_gemm_like",2], + "13754408679115174221": ["convolution_gpu_bfyx_gemm_like",2], + "2567046336192437734": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "18408107772851888061": ["convolution_gpu_bfyx_gemm_like",0], + "5118467701668427545": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "14318347197994059448": ["convolution_gpu_yxfb_yxio_b16",2], + "13991572769793610416": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "4439786737038041995": ["convolution_gpu_yxfb_yxio_b16",2], + "6284333183047854748": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8131879590716437354": ["convolution_gpu_yxfb_yxio_b16",2], + "6805188858008657978": ["convolution_gpu_bfyx_gemm_like",2], + "17746215841755337461": ["convolution_gpu_bfyx_direct_10_12_16",2], + "135072053401934228": ["convolution_gpu_bfyx_1x1",2], + "15006204461468698734": ["convolution_gpu_yxfb_yxio_b16",1], + "15636128989267984459": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "1157388265135592238": ["convolution_gpu_yxfb_yxio_b16",2], + "5941852872160795604": ["convolution_gpu_bfyx_gemm_like",2], + 
"1466455001976212160": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11436473937404565094": ["convolution_gpu_yxfb_yxio_b16",2], + "15681189418847392587": ["convolution_gpu_bfyx_gemm_like",1], + "6525496212688896740": ["convolution_gpu_yxfb_yxio_b16",2], + "17948637243158994878": ["convolution_gpu_bfyx_gemm_like",2], + "7535571298845832061": ["convolution_gpu_yxfb_yxio_b16",2], + "8537824547722216155": ["convolution_gpu_yxfb_yxio_b16",2], + "18417288692814472127": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "4830454154838353056": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "12887076860522920405": ["convolution_gpu_yxfb_yxio_b16",2], + "10547134120307382906": ["convolution_gpu_yxfb_yxio_b16",2], + "4283886984540574108": ["convolution_gpu_yxfb_yxio_b16",2], + "16532743776403877084": ["convolution_gpu_yxfb_yxio_b16",2], + "11086471945045031067": ["convolution_gpu_yxfb_yxio_b16",2], + "16739031949237426992": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "9232653317479846765": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "4465701487417893814": ["convolution_gpu_bfyx_gemm_like",1], + "9423958333298993923": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "5581428998642936688": ["convolution_gpu_bfyx_1x1",2], + "16788715253205076219": ["fully_connected_gpu_fb_oi_ref",1], + "9617316303048974588": ["convolution_gpu_yxfb_yxio_b16",2], + "16683169947375504066": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "15599983560500910839": ["convolution_gpu_yxfb_yxio_b16",2], + "4299773714254046691": ["convolution_gpu_yxfb_yxio_b16",2], + "130427456111826171": ["convolution_gpu_yxfb_yxio_b16",2], + "3766048787611884529": ["convolution_gpu_yxfb_yxio_b16",2], + "2895819653081408358": ["convolution_gpu_yxfb_yxio_b16",2], + "10254566865260697753": ["convolution_gpu_yxfb_yxio_b16",2], + "6059368508708501002": ["convolution_gpu_bfyx_os_iyx_osv16",371], + "17167229341919111718": ["convolution_gpu_bfyx_gemm_like",2], + "10007925729029867733": ["convolution_gpu_yxfb_yxio_b16",2], + "2173867324489962689": ["convolution_gpu_bfyx_gemm_like",1], + "8978764053524288494": ["convolution_gpu_bfyx_gemm_like",2], + "9175450649281374948": ["convolution_gpu_bfyx_os_iyx_osv16",106], + "6970636030494405299": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "9900658671239107502": ["convolution_gpu_bfyx_1x1",2], + "17342198739672369885": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17651821953342321913": ["convolution_gpu_bfyx_1x1",2], + "9453100135791813000": ["convolution_gpu_yxfb_yxio_b16",2], + "17433340097721474017": ["convolution_gpu_yxfb_yxio_b16",2], + "7235358742317442134": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "12564687330941036772": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "5240706676373148280": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16566214123371867456": ["convolution_gpu_bfyx_gemm_like",2], + "17583785768334531086": ["convolution_gpu_yxfb_yxio_b16",2], + "3499645386058307669": ["convolution_gpu_bfyx_gemm_like",1], + "12531580106484042446": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "17292751972745231011": ["convolution_gpu_yxfb_yxio_b16",2], + "1123577455191848310": ["convolution_gpu_bfyx_gemm_like",2], + "3928596145340765666": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "18082422341304348326": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "10099598062509781441": ["convolution_gpu_bfyx_os_iyx_osv16",361], + "13738760763969959522": ["convolution_gpu_bfyx_gemm_like",1], + "3759515057574218101": ["convolution_gpu_bfyx_gemm_like",1], + "11862259122805366807": 
["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "12700372241799686527": ["convolution_gpu_bfyx_gemm_like",2], + "10626281431800814406": ["convolution_gpu_yxfb_yxio_b16",2], + "8881906040469243354": ["convolution_gpu_yxfb_yxio_b16",2], + "16837963510205857013": ["convolution_gpu_yxfb_yxio_b16",2], + "1413558157882728476": ["convolution_gpu_yxfb_yxio_b16",2], + "2923543983518895756": ["convolution_gpu_yxfb_yxio_b16",2], + "14578867494693499627": ["convolution_gpu_bfyx_gemm_like",2], + "10816702874143297564": ["convolution_gpu_yxfb_yxio_b16",2], + "11756650366229979428": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5924341622384096919": ["convolution_gpu_bfyx_gemm_like",1], + "10324485383646920518": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "3957253946857103590": ["convolution_gpu_yxfb_yxio_b16",2], + "8943651590146149679": ["convolution_gpu_yxfb_yxio_b16",2], + "17043601935017365442": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "13767985623872409391": ["convolution_gpu_yxfb_yxio_b16",2], + "2983038203471784211": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "4773123925616969670": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4890043345392707202": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "11942019076226205097": ["convolution_gpu_yxfb_yxio_b16",2], + "383721620126444793": ["convolution_gpu_bfyx_gemm_like",1], + "16223356735957394429": ["convolution_gpu_bfyx_gemm_like",0], + "1040030752340209480": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "241656278218999298": ["convolution_gpu_yxfb_yxio_b16",2], + "6070612528095353265": ["convolution_gpu_yxfb_yxio_b16",2], + "10783046011829953095": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14244689429217411113": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6944031900067948180": ["convolution_gpu_yxfb_yxio_b16",2], + "3177304125602972370": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "16717713360264747483": ["convolution_gpu_bfyx_gemm_like",2], + "231083216612056805": ["convolution_gpu_yxfb_yxio_b16",2], + "9429695343610239088": ["convolution_gpu_bfyx_os_iyx_osv16",152], + "7839141505912665157": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "12076058470574246054": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "10930640103080573253": ["convolution_gpu_bfyx_1x1",0], + "11417406326478154077": ["convolution_gpu_yxfb_yxio_b16",2], + "13365950526881732374": ["convolution_gpu_yxfb_yxio_b16",2], + "7474592508575297101": ["convolution_gpu_bfyx_1x1",2], + "1108229954015380813": ["convolution_gpu_yxfb_yxio_b16",2], + "11724732387425614709": ["convolution_gpu_yxfb_yxio_b16",2], + "6651389480007764007": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "16870110185980402237": ["convolution_gpu_yxfb_yxio_b16",2], + "8220168481755031959": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "13938466156916423478": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "517997325935712670": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "17567012866823126402": ["convolution_gpu_yxfb_yxio_b16",2], + "14417401878572618236": ["convolution_gpu_yxfb_yxio_b16",2], + "13223232888554043645": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10709828018763273371": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "14424566003632608852": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3096280563014331836": ["convolution_gpu_yxfb_yxio_b16",2], + "3017891343734146267": ["convolution_gpu_bfyx_os_iyx_osv16",852], + "14774814395786139876": ["convolution_gpu_yxfb_yxio_b16",2], + "15050884844653850678": ["convolution_gpu_yxfb_yxio_b16",2], + "6584960721513702502": 
["convolution_gpu_bfyx_gemm_like",1], + "9918371346247634545": ["convolution_gpu_bfyx_gemm_like",2], + "15670767419106537809": ["convolution_gpu_yxfb_yxio_b16",2], + "3948843501884284998": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13352000946213986936": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "10787747981914307179": ["convolution_gpu_bfyx_1x1",0], + "6362428985273506890": ["convolution_gpu_bfyx_1x1",2], + "14616969385577243225": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "880603384896315783": ["convolution_gpu_yxfb_yxio_b16",2], + "9795194069954915563": ["convolution_gpu_bfyx_gemm_like",2], + "6730447536124542965": ["convolution_gpu_yxfb_yxio_b16",1], + "4927360358387344983": ["convolution_gpu_bfyx_gemm_like",1], + "8183383667948205424": ["convolution_gpu_yxfb_yxio_b16",2], + "10815244730103375973": ["convolution_gpu_yxfb_yxio_b16",2], + "16597170760061556882": ["convolution_gpu_yxfb_yxio_b16",2], + "16569637518948306471": ["convolution_gpu_bfyx_gemm_like",2], + "3602929955785812025": ["convolution_gpu_yxfb_yxio_b16",2], + "3622409603053918029": ["convolution_gpu_bfyx_gemm_like",1], + "10429104188258277773": ["convolution_gpu_yxfb_yxio_b16",2], + "3399406641489305996": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "6818140422066151642": ["convolution_gpu_yxfb_yxio_b16",2], + "15311930929656759371": ["convolution_gpu_yxfb_yxio_b16",2], + "15915715422308762909": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "5884951148427535208": ["convolution_gpu_yxfb_yxio_b16",2], + "8093401822846123153": ["convolution_gpu_yxfb_yxio_b16",2], + "9111988592015450418": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "9655242408142699694": ["convolution_gpu_yxfb_yxio_b16",2], + "10226095100825845185": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "15985980444340490463": ["convolution_gpu_yxfb_yxio_b16",2], + "12207503176295152756": ["convolution_gpu_bfyx_1x1",2], + "13602140021189675477": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "2343310394723780653": ["convolution_gpu_yxfb_yxio_b16",2], + "11861634536583463947": ["convolution_gpu_bfyx_os_iyx_osv16",804], + "15602218079503030465": ["convolution_gpu_bfyx_gemm_like",2], + "8933701347987963693": ["convolution_gpu_yxfb_yxio_b16",2], + "7393601059996816014": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "1304921846760027440": ["convolution_gpu_yxfb_yxio_b16",2], + "1299545313185409227": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "2621495864635590903": ["convolution_gpu_yxfb_yxio_b16",2], + "8494385862885499798": ["convolution_gpu_yxfb_yxio_b16",2], + "13701870576531008278": ["convolution_gpu_yxfb_yxio_b16",2], + "5638640164891118162": ["convolution_gpu_yxfb_yxio_b16",2], + "13520876347177213888": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "13853630125050609175": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "4306052436602921234": ["convolution_gpu_yxfb_yxio_b16",2], + "11289650463922092775": ["convolution_gpu_bfyx_direct_10_12_16",2], + "991586070509079617": ["convolution_gpu_bfyx_gemm_like",2], + "377219085802486361": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17211590259060346125": ["convolution_gpu_yxfb_yxio_b16",2], + "4367991456894497706": ["convolution_gpu_bfyx_os_iyx_osv16",99], + "13448845356783404653": ["convolution_gpu_bfyx_gemm_like",1], + "7767103488808670253": ["convolution_gpu_yxfb_yxio_b16",2], + "11450378244355788918": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "9671459469252116568": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "852015206582470545": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + 
"8509024280905303927": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13966416504547680082": ["convolution_gpu_yxfb_yxio_b16",2], + "16913004986170202203": ["convolution_gpu_bfyx_gemm_like",2], + "8130920994920685157": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3058716597925544041": ["convolution_gpu_yxfb_yxio_b16",2], + "11690533591656807605": ["convolution_gpu_bfyx_gemm_like",2], + "9144487908815767824": ["convolution_gpu_bfyx_1x1",2], + "16587061389996963349": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "14001406016806064079": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "15012885932988454455": ["convolution_gpu_yxfb_yxio_b16",2], + "13683623172740048376": ["convolution_gpu_bfyx_gemm_like",2], + "2438261005924916746": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "4754967381316623440": ["convolution_gpu_bfyx_gemm_like",2], + "6461637373691101671": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15720012960520885263": ["convolution_gpu_yxfb_yxio_b16",2], + "4779919236230154165": ["convolution_gpu_bfyx_gemm_like",0], + "7969441643457570812": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "12151068022697708126": ["convolution_gpu_bfyx_gemm_like",2], + "17515847111676784130": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "5195511638783481084": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "3788462090984291082": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "3369689552455141157": ["convolution_gpu_yxfb_yxio_b16",2], + "4718716595177056289": ["convolution_gpu_bfyx_os_iyx_osv16",234], + "5274929595362413625": ["convolution_gpu_yxfb_yxio_b16",2], + "9696168324381001582": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12055647521556218046": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "9738776059655610885": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "15677717057398875599": ["convolution_gpu_bfyx_gemm_like",2], + "17718424965214606218": ["convolution_gpu_yxfb_yxio_b16",2], + "7060804814325505165": ["convolution_gpu_bfyx_gemm_like",2], + "4622514167765722873": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "6084775920382972735": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "921209976738626097": ["convolution_gpu_yxfb_yxio_b16",2], + "840202264034382558": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "13296242326766100583": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "16341722570340169855": ["convolution_gpu_bfyx_1x1",2], + "3176785355296130660": ["convolution_gpu_bfyx_gemm_like",0], + "11300415556407923335": ["convolution_gpu_yxfb_yxio_b16",2], + "4072951883124129646": ["convolution_gpu_yxfb_yxio_b16",2], + "8000679297338683619": ["convolution_gpu_yxfb_yxio_b16",2], + "15897457705071738591": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "6945787904293959477": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "727216855315869048": ["convolution_gpu_yxfb_yxio_b16",2], + "669771152920944125": ["convolution_gpu_bfyx_gemm_like",2], + "7260204889552803221": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "360064276184684693": ["convolution_gpu_yxfb_yxio_b16",2], + "5758133252959371492": ["convolution_gpu_bfyx_gemm_like",2], + "886880682650879171": ["convolution_gpu_bfyx_gemm_like",2], + "15269988216002549857": ["convolution_gpu_yxfb_yxio_b16",2], + "10272016038525930672": ["convolution_gpu_bfyx_gemm_like",2], + "11799179287124317845": ["convolution_gpu_bfyx_gemm_like",1], + "5596441339918073261": ["convolution_gpu_bfyx_os_iyx_osv16",427], + "10106454449619141260": ["convolution_gpu_bfyx_1x1",2], + "5291011077679733990": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "3701838669605585798": 
["convolution_gpu_yxfb_yxio_b16",2], + "9940761514291929473": ["convolution_gpu_yxfb_yxio_b16",2], + "14115742296883450319": ["convolution_gpu_bfyx_gemm_like",1], + "12741762570001404232": ["convolution_gpu_yxfb_yxio_b16",2], + "14034402827496819479": ["convolution_gpu_bfyx_gemm_like",2], + "12294364015803004575": ["fully_connected_gpu_fb_io_block_fp16",0], + "150132162949295379": ["convolution_gpu_bfyx_1x1",2], + "7863319552895863063": ["convolution_gpu_yxfb_yxio_b16",2], + "1250095876638711647": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "18373951194274306895": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11088324811742486481": ["convolution_gpu_bfyx_gemm_like",2], + "9243949750444156746": ["convolution_gpu_bfyx_gemm_like",1], + "8689206546467098603": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "13403161389559730": ["convolution_gpu_bfyx_gemm_like",2], + "4833749391314748606": ["convolution_gpu_yxfb_yxio_b16",0], + "13409744191227471760": ["convolution_gpu_bfyx_gemm_like",0], + "17817043205731836063": ["convolution_gpu_yxfb_yxio_b16",2], + "13919204232414535363": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "3116068331849795558": ["convolution_gpu_bfyx_gemm_like",2], + "10256831975351722184": ["convolution_gpu_bfyx_gemm_like",2], + "16120988958246503683": ["convolution_gpu_bfyx_os_iyx_osv16",1023], + "9541996065561509160": ["convolution_gpu_yxfb_yxio_b16",2], + "273242667845386507": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "586947787345351152": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3121704239277217273": ["convolution_gpu_yxfb_yxio_b16",2], + "12990527753120735255": ["convolution_gpu_bfyx_gemm_like",0], + "7395419333138772074": ["convolution_gpu_yxfb_yxio_b16",2], + "14910911338105922048": ["convolution_gpu_yxfb_yxio_b16",2], + "15325852281951905610": ["convolution_gpu_bfyx_os_iyx_osv16",803], + "1299760574827253811": ["convolution_gpu_yxfb_yxio_b16",2], + "5303970743736042689": ["convolution_gpu_bfyx_gemm_like",2], + "1569043950563130463": ["convolution_gpu_bfyx_gemm_like",1], + "8555049634736330391": ["convolution_gpu_yxfb_yxio_b16",2], + "16502045034098739466": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "2737352811173555281": ["convolution_gpu_yxfb_yxio_b16",1], + "4030004320208162301": ["convolution_gpu_yxfb_yxio_b16",2], + "4403753181729432604": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "7444165397413360181": ["convolution_gpu_yxfb_yxio_b16",2], + "12467673564660108244": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "12253049204822930675": ["convolution_gpu_bfyx_gemm_like",2], + "12714814165247623529": ["convolution_gpu_yxfb_yxio_b16",2], + "3792276488551864121": ["convolution_gpu_yxfb_yxio_b16",2], + "16425374300157280628": ["convolution_gpu_yxfb_yxio_b16",2], + "15298221796479574600": ["convolution_gpu_yxfb_yxio_b16",2], + "49948277487706148": ["convolution_gpu_bfyx_1x1",2], + "4098581145478965082": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "4750513665628842598": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "4914474312076193952": ["convolution_gpu_bfyx_gemm_like",1], + "13586735166545634506": ["convolution_gpu_yxfb_yxio_b16",2], + "2782970766870172398": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17140702790441856730": ["convolution_gpu_bfyx_os_iyx_osv16",722], + "17037462814585846902": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "18142462471803295391": ["convolution_gpu_bfyx_1x1",2], + "3534874664568214253": ["convolution_gpu_bfyx_1x1",2], + "7823257556787476006": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "12374775091628199854": 
["convolution_gpu_bfyx_1x1",2], + "7846384623429362522": ["convolution_gpu_bfyx_1x1",2], + "5788018146987909930": ["convolution_gpu_yxfb_yxio_b16",2], + "13364676690016875118": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "17742192339816511494": ["convolution_gpu_bfyx_gemm_like",2], + "166091609652531090": ["convolution_gpu_yxfb_yxio_b16",2], + "7843180034077880658": ["convolution_gpu_yxfb_yxio_b16",2], + "382811963722907674": ["convolution_gpu_bfyx_gemm_like",2], + "2188101366183302888": ["convolution_gpu_bfyx_gemm_like",1], + "8642107585829380438": ["convolution_gpu_bfyx_gemm_like",0], + "10009559358571629502": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10399620940700804517": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5308128387928804050": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9643408025778914022": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "8323669961818535927": ["convolution_gpu_yxfb_yxio_b16",2], + "2058172559199858297": ["convolution_gpu_bfyx_os_iyx_osv16",752], + "13717351126657739994": ["convolution_gpu_yxfb_yxio_b16",2], + "7322472892320910654": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "2173163618947713953": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "6219075471508685758": ["convolution_gpu_bfyx_gemm_like",0], + "7779562434199107586": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1375084615110147615": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11194372303922533529": ["convolution_gpu_yxfb_yxio_b16",1], + "13754540732991287617": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "5359510718430377298": ["convolution_gpu_yxfb_yxio_b16",2], + "5285172225938230524": ["convolution_gpu_yxfb_yxio_b16",2], + "826850797666395121": ["convolution_gpu_bfyx_gemm_like",1], + "3316798708399098230": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "8732952254407298868": ["convolution_gpu_bfyx_gemm_like",2], + "1245259979364728404": ["convolution_gpu_bfyx_1x1",2], + "7139714914586273766": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "13038533272699602337": ["convolution_gpu_bfyx_gemm_like",2], + "17181874388601550941": ["convolution_gpu_yxfb_yxio_b16",2], + "9090828337597312855": ["convolution_gpu_bfyx_gemm_like",2], + "10049294964307823692": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4561778392194061215": ["convolution_gpu_yxfb_yxio_b16",2], + "1448440012428740463": ["convolution_gpu_yxfb_yxio_b16",2], + "4824040283449153298": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "2148877522799179369": ["convolution_gpu_yxfb_yxio_b16",2], + "13320675959188615441": ["convolution_gpu_bfyx_gemm_like",2], + "3101885395179993708": ["convolution_gpu_yxfb_yxio_b16",2], + "9399511839804500548": ["convolution_gpu_yxfb_yxio_b16",2], + "10775271979871646995": ["convolution_gpu_yxfb_yxio_b16",2], + "9161616741940575576": ["convolution_gpu_yxfb_yxio_b16",2], + "5578850952665051661": ["convolution_gpu_yxfb_yxio_b16",2], + "9319254979377483709": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "1336739931702966228": ["convolution_gpu_yxfb_yxio_b16",2], + "415826393421796195": ["convolution_gpu_yxfb_yxio_b16",2], + "5374969798377773063": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "15640466585550013905": ["convolution_gpu_bfyx_gemm_like",1], + "17955326503130437346": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13330734840729670622": ["convolution_gpu_bfyx_gemm_like",0], + "748236447365453504": ["convolution_gpu_yxfb_yxio_b16",2], + "3735753364888836383": ["convolution_gpu_yxfb_yxio_b16",2], + "7329924387620542330": ["convolution_gpu_bfyx_os_iyx_osv16",637], + 
"3067930325929862490": ["convolution_gpu_yxfb_yxio_b16",2], + "1202292109713947702": ["convolution_gpu_bfyx_gemm_like",2], + "4773482308451190487": ["convolution_gpu_yxfb_yxio_b16",2], + "12032580551021546487": ["convolution_gpu_yxfb_yxio_b16",2], + "11645116728396933125": ["convolution_gpu_bfyx_gemm_like",2], + "8265982881100325775": ["convolution_gpu_yxfb_yxio_b16",2], + "16395067736440127496": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "3112648799276134590": ["convolution_gpu_yxfb_yxio_b16",2], + "14830991971271385876": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "17823133607491820214": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "3105425187506203551": ["convolution_gpu_bfyx_1x1",2], + "15526021915035861514": ["convolution_gpu_bfyx_gemm_like",2], + "14173531787508017136": ["convolution_gpu_yxfb_yxio_b16",0], + "7565348337952384040": ["convolution_gpu_yxfb_yxio_b16",2], + "16195893521207315456": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "4099828484175044842": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "8348997431940166878": ["convolution_gpu_yxfb_yxio_b16",1], + "7346046748383284270": ["convolution_gpu_yxfb_yxio_b16",2], + "2057158988261512114": ["convolution_gpu_bfyx_1x1",2], + "7397341452130124383": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "11198301748997371475": ["convolution_gpu_bfyx_gemm_like",1], + "9569522500959727054": ["convolution_gpu_yxfb_yxio_b16",2], + "15829095120243431195": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "7481256533438761028": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "13700014916680753395": ["convolution_gpu_bfyx_gemm_like",2], + "14263055580023018733": ["convolution_gpu_yxfb_yxio_b16",2], + "14916236722843741326": ["convolution_gpu_yxfb_yxio_b16",2], + "9390478179772073718": ["convolution_gpu_bfyx_gemm_like",1], + "3308955824300750921": ["convolution_gpu_yxfb_yxio_b16",2], + "10612739622648878242": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "11744368351982723504": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10878198256414940305": ["convolution_gpu_yxfb_yxio_b16",2], + "2459018025887933198": ["convolution_gpu_yxfb_yxio_b16",2], + "12745631396795162505": ["convolution_gpu_yxfb_yxio_b16",2], + "8876704486585503280": ["convolution_gpu_yxfb_yxio_b16",2], + "15973363403733281926": ["convolution_gpu_yxfb_yxio_b16",2], + "5083163738120585821": ["fully_connected_gpu_fb_oi_ref",1], + "16374675547140209181": ["convolution_gpu_yxfb_yxio_b16",2], + "6136232084354304563": ["convolution_gpu_yxfb_yxio_b16",1], + "6195916781434462809": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11557032521956761994": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "17712558058168648648": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "11015074526119891710": ["convolution_gpu_yxfb_yxio_b16",2], + "7005509036795164602": ["convolution_gpu_bfyx_1x1",2], + "1900375942069325499": ["convolution_gpu_bfyx_1x1",2], + "14807357397951247957": ["convolution_gpu_yxfb_yxio_b16",2], + "5899560521070338192": ["convolution_gpu_yxfb_yxio_b16",2], + "1822096761703761792": ["convolution_gpu_bfyx_1x1",2], + "15161053469199826008": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "7314288062932060863": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "4625107584562815965": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "11634932044447867039": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "17778091287904736965": ["convolution_gpu_bfyx_gemm_like",2], + "3522383297921565178": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "3684792790546138809": ["convolution_gpu_yxfb_yxio_b16",2], + 
"12046017161414846599": ["convolution_gpu_bfyx_1x1",2], + "6546440095044731932": ["convolution_gpu_yxfb_yxio_b16",2], + "3735605582512535278": ["convolution_gpu_yxfb_yxio_b16",2], + "17088011073114549679": ["convolution_gpu_yxfb_yxio_b16",2], + "5600807544955072308": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "2314805462821790774": ["convolution_gpu_yxfb_yxio_b16",2], + "17439102502195540957": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "9590161922224578217": ["convolution_gpu_yxfb_yxio_b16",2], + "15133468875250992696": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2319519208813614116": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "2553539191926275121": ["convolution_gpu_yxfb_yxio_b16",2], + "12900949103593247293": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "9700808806849459216": ["convolution_gpu_bfyx_1x1",2], + "3563872903821081702": ["convolution_gpu_bfyx_gemm_like",1], + "15600841108426475615": ["convolution_gpu_yxfb_yxio_b16",2], + "16242136888057221574": ["convolution_gpu_yxfb_yxio_b16",2], + "15765592038173567297": ["convolution_gpu_yxfb_yxio_b16",2], + "16601230690171340432": ["convolution_gpu_yxfb_yxio_b16",2], + "13634686998599681086": ["convolution_gpu_yxfb_yxio_b16",2], + "11499219760597131534": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "6577505360421510286": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "467070383257529689": ["convolution_gpu_yxfb_yxio_b16",2], + "2421404763191415191": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16748662918272106932": ["convolution_gpu_bfyx_gemm_like",1], + "3365786526859737112": ["convolution_gpu_yxfb_yxio_b16",2], + "16738951239219589307": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "4769003637955328938": ["convolution_gpu_bfyx_gemm_like",1], + "597650904461183283": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "15065019229949449623": ["convolution_gpu_bfyx_gemm_like",1], + "11267495078361954131": ["convolution_gpu_yxfb_yxio_b16",2], + "15914107501176673997": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "16961326251624610778": ["convolution_gpu_yxfb_yxio_b16",2], + "11626398907755088688": ["convolution_gpu_bfyx_gemm_like",1], + "12522495848240087966": ["convolution_gpu_bfyx_gemm_like",2], + "9316082753126682958": ["convolution_gpu_bfyx_gemm_like",2], + "480310470450900836": ["convolution_gpu_bfyx_gemm_like",2], + "2373860353284525265": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "17107083637007906184": ["convolution_gpu_bfyx_gemm_like",1], + "3873183249402084406": ["convolution_gpu_bfyx_gemm_like",0], + "3242391637018676328": ["convolution_gpu_yxfb_yxio_b16",2], + "11834361584875491425": ["convolution_gpu_bfyx_1x1",1], + "9534041402131086717": ["convolution_gpu_bfyx_direct_10_12_16",2], + "856949500975232838": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "13912843078550000960": ["convolution_gpu_bfyx_os_iyx_osv16",641], + "9647916259092117712": ["convolution_gpu_bfyx_gemm_like",2], + "8002233052700666718": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "12619739385084492771": ["convolution_gpu_yxfb_yxio_b16",2], + "12210280332071091209": ["fully_connected_gpu_fb_oi_ref",1], + "6317575981520135028": ["convolution_gpu_bfyx_gemm_like",0], + "15293835051273372438": ["convolution_gpu_yxfb_yxio_b16",2], + "7788374869410867297": ["convolution_gpu_bfyx_gemm_like",2], + "10477588607457125173": ["convolution_gpu_bfyx_gemm_like",2], + "18313088176414428990": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "7817691489550523328": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6182829358839578529": 
["convolution_gpu_bfyx_gemm_like",2], + "13320473279945887641": ["convolution_gpu_yxfb_yxio_b16",2], + "2866656294663853474": ["convolution_gpu_bfyx_1x1",2], + "3538679039078582272": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "7889602687414497280": ["convolution_gpu_bfyx_os_iyx_osv16",427], + "7724125714360985807": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10879171754021534649": ["convolution_gpu_yxfb_yxio_b16",2], + "3190494353583341446": ["convolution_gpu_bfyx_gemm_like",1], + "2501411300945696806": ["convolution_gpu_yxfb_yxio_b16",2], + "15217573782563469232": ["convolution_gpu_yxfb_yxio_b16",2], + "2026622899016787854": ["convolution_gpu_yxfb_yxio_b16",0], + "17442105631503326136": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "15943141845766932879": ["convolution_gpu_bfyx_1x1",2], + "13464697394408238115": ["convolution_gpu_yxfb_yxio_b16",2], + "488798544312719183": ["convolution_gpu_yxfb_yxio_b16",2], + "12710794174926396540": ["convolution_gpu_yxfb_yxio_b16",2], + "13156052826121673994": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "6423785822515265784": ["convolution_gpu_bfyx_gemm_like",2], + "6423354409210936959": ["convolution_gpu_yxfb_yxio_b16",2], + "13569941893504840630": ["convolution_gpu_bfyx_os_iyx_osv16",1124], + "18221867262301937903": ["convolution_gpu_bfyx_1x1",2], + "9291397338108903174": ["convolution_gpu_yxfb_yxio_b16",2], + "1786821683911142459": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "16511393582666965704": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "16253244737884854313": ["convolution_gpu_yxfb_yxio_b16",2], + "13323186744342557015": ["convolution_gpu_yxfb_yxio_b16",2], + "15078590909693331731": ["convolution_gpu_bfyx_gemm_like",2], + "13777174566683935109": ["convolution_gpu_yxfb_yxio_b16",2], + "9589361786336650748": ["convolution_gpu_yxfb_yxio_b16",2], + "15625374380046476173": ["convolution_gpu_yxfb_yxio_b16",2], + "4897690791599638716": ["convolution_gpu_yxfb_yxio_b16",2], + "9177211394807412309": ["convolution_gpu_yxfb_yxio_b16",2], + "8723078862651154959": ["convolution_gpu_yxfb_yxio_b16",2], + "739676584505475609": ["convolution_gpu_bfyx_gemm_like",2], + "4523064418696274869": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "4701235352806075765": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "6363788325163726004": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2172999245833525797": ["convolution_gpu_yxfb_yxio_b16",2], + "6820134899097582639": ["convolution_gpu_yxfb_yxio_b16",0], + "7292351660229751817": ["convolution_gpu_bfyx_os_iyx_osv16",234], + "16725049805030712400": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "1308980444055174254": ["convolution_gpu_bfyx_gemm_like",2], + "11597391933877736800": ["convolution_gpu_bfyx_gemm_like",2], + "3017411837779243878": ["convolution_gpu_bfyx_gemm_like",0], + "12722153168975105360": ["convolution_gpu_yxfb_yxio_b16",2], + "12121204870979363096": ["convolution_gpu_yxfb_yxio_b16",2], + "17272600601478967434": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "1351033666248868977": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "14553813154800569861": ["convolution_gpu_yxfb_yxio_b16",1], + "2722062599746670336": ["convolution_gpu_yxfb_yxio_b16",2], + "12331134162344797761": ["convolution_gpu_yxfb_yxio_b16",2], + "2816339200381598722": ["convolution_gpu_yxfb_yxio_b16",2], + "6650607472019166205": ["convolution_gpu_bfyx_1x1",2], + "17512961503976896701": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "10632933069865171963": ["convolution_gpu_yxfb_yxio_b16",2], + "12228610148087508521": 
["convolution_gpu_bfyx_os_iyx_osv16",983], + "15897300973213364823": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "8339704352841356825": ["convolution_gpu_yxfb_yxio_b16",2], + "13835908664998757647": ["fully_connected_gpu_fb_oi_ref",0], + "13558687084677943158": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "12623375499927200341": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "10765280349477640969": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13234872695521811652": ["convolution_gpu_yxfb_yxio_b16",2], + "12303905514885913537": ["convolution_gpu_yxfb_yxio_b16",2], + "6407471972820516685": ["fully_connected_gpu_fb_oi_ref",2], + "4755225554035527185": ["convolution_gpu_yxfb_yxio_b16",2], + "10133398220120888583": ["convolution_gpu_yxfb_yxio_b16",2], + "13474805373264874144": ["convolution_gpu_bfyx_1x1",2], + "5941092474669713339": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "2921118493468368908": ["convolution_gpu_bfyx_gemm_like",1], + "6756679359093569015": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "14188157670969097508": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "18299254635579957284": ["convolution_gpu_bfyx_1x1",2], + "16542318967217020315": ["convolution_gpu_bfyx_gemm_like",2], + "12053562297742437099": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12806934028210472719": ["convolution_gpu_bfyx_gemm_like",2], + "17704040183891532914": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "10837496380266058422": ["convolution_gpu_bfyx_gemm_like",2], + "6942016672941874829": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "14799579913711096584": ["convolution_gpu_bfyx_gemm_like",2], + "13735180250757239202": ["convolution_gpu_bfyx_gemm_like",2], + "10065714384927707796": ["convolution_gpu_yxfb_yxio_b16",2], + "9589718307719207394": ["convolution_gpu_yxfb_yxio_b16",2], + "11175353869874626110": ["convolution_gpu_yxfb_yxio_b16",2], + "10718764522366711114": ["convolution_gpu_yxfb_yxio_b16",1], + "4430932059574900921": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2041212737963974230": ["convolution_gpu_bfyx_gemm_like",2], + "6963293142152132518": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "5582450255753679095": ["convolution_gpu_bfyx_1x1",2], + "15542520725696027828": ["convolution_gpu_yxfb_yxio_b16",2], + "13596876807637507229": ["convolution_gpu_bfyx_1x1",2], + "8045367391487213749": ["convolution_gpu_bfyx_1x1",2], + "11270855425262923989": ["convolution_gpu_yxfb_yxio_b16",2], + "4135068756462147853": ["convolution_gpu_bfyx_gemm_like",1], + "10730222715353420212": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "1973051991518953158": ["convolution_gpu_yxfb_yxio_b16",2], + "5723759573058003971": ["convolution_gpu_yxfb_yxio_b16",2], + "9525535670799618110": ["convolution_gpu_bfyx_os_iyx_osv16",270], + "6003409324516527726": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15192230303376521834": ["convolution_gpu_bfyx_os_iyx_osv16",846], + "11262989876326061679": ["convolution_gpu_yxfb_yxio_b16",2], + "1507504848332592003": ["convolution_gpu_yxfb_yxio_b16",2], + "16698547937652264447": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8577875628223148806": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "11627532066884923848": ["convolution_gpu_bfyx_1x1",2], + "3236003754884728510": ["convolution_gpu_bfyx_os_iyx_osv16",1048], + "12458921031453334451": ["convolution_gpu_yxfb_yxio_b16",2], + "15604634351310647589": ["convolution_gpu_yxfb_yxio_b16",2], + "17046662043776372746": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "8984436655107983227": ["convolution_gpu_bfyx_os_iyx_osv16",670], 
+ "13369751385866224286": ["convolution_gpu_yxfb_yxio_b16",2], + "12908594497114706897": ["convolution_gpu_bfyx_1x1",2], + "7139719632093090046": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "466744273945239777": ["convolution_gpu_yxfb_yxio_b16",2], + "5788323787676797805": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "13483088320871913126": ["convolution_gpu_bfyx_gemm_like",2], + "11971853138084108953": ["convolution_gpu_bfyx_os_iyx_osv16",1001], + "10682300249493137042": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "2917735110073643952": ["convolution_gpu_bfyx_gemm_like",2], + "10141927023849730720": ["convolution_gpu_bfyx_1x1",2], + "362823013207940830": ["convolution_gpu_yxfb_yxio_b16",2], + "10893432143734884603": ["convolution_gpu_bfyx_gemm_like",2], + "16437093737761968743": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "3101748967012684440": ["convolution_gpu_yxfb_yxio_b16",2], + "1582751548472076534": ["convolution_gpu_yxfb_yxio_b16",2], + "15809639778580769565": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "9741607635826869269": ["convolution_gpu_bfyx_gemm_like",1], + "16767392067294252396": ["convolution_gpu_bfyx_gemm_like",2], + "13501352378461071771": ["convolution_gpu_yxfb_yxio_b16",2], + "4685236901551256966": ["convolution_gpu_yxfb_yxio_b16",2], + "4378422094110940766": ["convolution_gpu_bfyx_gemm_like",1], + "14532519639619315651": ["convolution_gpu_bfyx_gemm_like",2], + "138379779469699309": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2929715823970060874": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15289152041466330689": ["convolution_gpu_bfyx_gemm_like",2], + "9967611023372430532": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "2929190644951986399": ["convolution_gpu_bfyx_gemm_like",2], + "3441335188113424896": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4256155212405177844": ["convolution_gpu_yxfb_yxio_b16",2], + "18075395502550596586": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "6887205509732544213": ["convolution_gpu_yxfb_yxio_b16",2], + "12013818650853034767": ["convolution_gpu_yxfb_yxio_b16",2], + "8935522915553126640": ["convolution_gpu_bfyx_gemm_like",1], + "5044721291675005144": ["convolution_gpu_bfyx_1x1",2], + "4422642146063042868": ["convolution_gpu_yxfb_yxio_b16",2], + "14835641172229643545": ["convolution_gpu_bfyx_gemm_like",2], + "7289535479247584635": ["convolution_gpu_bfyx_1x1",2], + "5258372022038629529": ["convolution_gpu_yxfb_yxio_b16",2], + "13149617013851130587": ["convolution_gpu_yxfb_yxio_b16",2], + "9538863363710651909": ["convolution_gpu_yxfb_yxio_b16",2], + "8065866013404161366": ["convolution_gpu_yxfb_yxio_b16",2], + "14116275901314596944": ["convolution_gpu_yxfb_yxio_b16",2], + "16862145184923128012": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "4228437925117070319": ["convolution_gpu_bfyx_1x1",2], + "16072525303202287969": ["convolution_gpu_yxfb_yxio_b16",2], + "9629460794894999510": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "9340159617983543624": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5576305720733717044": ["convolution_gpu_yxfb_yxio_b16",2], + "16780457022162749898": ["convolution_gpu_bfyx_gemm_like",2], + "411016281538345537": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18083803358410976976": ["convolution_gpu_yxfb_yxio_b16",2], + "5479590921345335946": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "17536308070854915513": ["convolution_gpu_bfyx_1x1",2], + "5568728266639058524": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "14091543526898531200": ["convolution_gpu_yxfb_yxio_b16",2], + "11820789223587555410": 
["convolution_gpu_bfyx_1x1",2], + "8193369947544085921": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "7065244994574625911": ["convolution_gpu_yxfb_yxio_b16",2], + "8922929126299811091": ["convolution_gpu_bfyx_1x1",2], + "15988378956341507229": ["convolution_gpu_yxfb_yxio_b16",2], + "8943913562339525413": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "7008509833947166548": ["convolution_gpu_yxfb_yxio_b16",2], + "13893808009363736870": ["convolution_gpu_bfyx_gemm_like",0], + "136349424199140459": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16152775342222431281": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "10952045211444638649": ["convolution_gpu_yxfb_yxio_b16",2], + "13123561937554734618": ["convolution_gpu_yxfb_yxio_b16",2], + "12344689711325644622": ["convolution_gpu_yxfb_yxio_b16",2], + "12023260267201191955": ["convolution_gpu_yxfb_yxio_b16",2], + "601430670855155006": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "11130439225010714550": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "14045927407431718832": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "15901724303713479611": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "5564881878876582769": ["convolution_gpu_yxfb_yxio_b16",2], + "1197281505560782577": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "10424643336435622408": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "8421388456873652700": ["convolution_gpu_bfyx_gemm_like",2], + "12181310683533105454": ["fully_connected_gpu_fb_oi_ref",2], + "17806747473167329833": ["convolution_gpu_yxfb_yxio_b16",2], + "3784684114139223050": ["convolution_gpu_yxfb_yxio_b16",2], + "16437124655147660375": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10431728173806991521": ["convolution_gpu_yxfb_yxio_b16",2], + "16633540487930201533": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "9642965664913867675": ["convolution_gpu_yxfb_yxio_b16",2], + "3533556385636018581": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "14897384423894125457": ["convolution_gpu_yxfb_yxio_b16",2], + "4091702228990140696": ["convolution_gpu_bfyx_gemm_like",1], + "10218763091060511457": ["convolution_gpu_bfyx_os_iyx_osv16",853], + "12936220888307335332": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "12787837386653002743": ["convolution_gpu_yxfb_yxio_b16",2], + "3047407458812880288": ["convolution_gpu_yxfb_yxio_b16",2], + "6249875772709398338": ["convolution_gpu_yxfb_yxio_b16",2], + "2399313178951511557": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "11224051407822914513": ["convolution_gpu_yxfb_yxio_b16",2], + "18255227391100087860": ["convolution_gpu_bfyx_1x1",2], + "5552699731399195573": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4104679489383377966": ["convolution_gpu_yxfb_yxio_b16",2], + "6871131333562410117": ["convolution_gpu_yxfb_yxio_b16",2], + "7263796835299019284": ["convolution_gpu_bfyx_gemm_like",2], + "7015738038963065110": ["convolution_gpu_bfyx_gemm_like",2], + "8616584380583931648": ["convolution_gpu_yxfb_yxio_b16",2], + "7343590049199309046": ["convolution_gpu_yxfb_yxio_b16",2], + "9144269202766996508": ["convolution_gpu_yxfb_yxio_b16",2], + "17791024851737594885": ["convolution_gpu_bfyx_1x1",2], + "13328911884191551889": ["convolution_gpu_bfyx_1x1",2], + "16434358667865869005": ["convolution_gpu_yxfb_yxio_b16",2], + "18094205332383644037": ["convolution_gpu_bfyx_os_iyx_osv16",183], + "1129349074674368869": ["convolution_gpu_yxfb_yxio_b16",2], + "6602394091385112575": ["convolution_gpu_yxfb_yxio_b16",2], + "6839795451275143093": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "7162155897369277782": 
["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15848096609835347542": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "5995121118186531621": ["convolution_gpu_yxfb_yxio_b16",1], + "11025471731438443683": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "18214716801063702171": ["convolution_gpu_yxfb_yxio_b16",2], + "2339864165283480961": ["convolution_gpu_bfyx_1x1",2], + "11052275099129482401": ["convolution_gpu_yxfb_yxio_b16",2], + "4865102850562917067": ["convolution_gpu_bfyx_os_iyx_osv16",478], + "393387269914864557": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "4714289593698160876": ["convolution_gpu_yxfb_yxio_b16",2], + "14892045745899927762": ["convolution_gpu_yxfb_yxio_b16",2], + "18029395208219861440": ["convolution_gpu_yxfb_yxio_b16",2], + "15779837958180258409": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "11012427206693842637": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "13450061819089402572": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12802517759474139810": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "14758040027936817208": ["convolution_gpu_yxfb_yxio_b16",2], + "14097394936362526559": ["convolution_gpu_yxfb_yxio_b16",2], + "10747988576436391912": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "8747430148550634190": ["convolution_gpu_bfyx_gemm_like",2], + "12388894315292201102": ["convolution_gpu_yxfb_yxio_b16",2], + "3240102173773280414": ["convolution_gpu_bfyx_1x1",2], + "5857101685300045443": ["convolution_gpu_yxfb_yxio_b16",1], + "16632786413927045192": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5884802375772043861": ["convolution_gpu_yxfb_yxio_b16",2], + "2955459120402821540": ["convolution_gpu_yxfb_yxio_b16",2], + "12027202455592387086": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "15839295895890205274": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "15899192375330393731": ["convolution_gpu_bfyx_os_iyx_osv16",427], + "9026883911202247185": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16814025114202322376": ["convolution_gpu_yxfb_yxio_b16",2], + "3319827933068341610": ["convolution_gpu_yxfb_yxio_b16",2], + "17477062954520561609": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "8784358107340738205": ["convolution_gpu_yxfb_yxio_b16",2], + "4674504221851042542": ["convolution_gpu_yxfb_yxio_b16",2], + "12932635875905153141": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "17413191440314817117": ["convolution_gpu_yxfb_yxio_b16",2], + "5658664813683907476": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "10896935976330351144": ["convolution_gpu_yxfb_yxio_b16",2], + "14159596290442764023": ["convolution_gpu_bfyx_gemm_like",1], + "6709883527730513363": ["convolution_gpu_yxfb_yxio_b16",2], + "4165036357594592683": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15141893564826036993": ["convolution_gpu_yxfb_yxio_b16",2], + "9135116285263927211": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "2908156087871187676": ["convolution_gpu_yxfb_yxio_b16",1], + "698274493570551388": ["convolution_gpu_yxfb_yxio_b16",2], + "9407646138658641974": ["convolution_gpu_bfyx_gemm_like",2], + "9922764846020092836": ["convolution_gpu_yxfb_yxio_b16",2], + "8365255170846178102": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "338716975932676215": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "3070859615622845671": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "14616801816838734032": ["convolution_gpu_yxfb_yxio_b16",2], + "7531346828150129063": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10170577772376890221": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "15228390729175722409": 
["convolution_gpu_yxfb_yxio_b16",2], + "18435632962969462312": ["convolution_gpu_yxfb_yxio_b16",2], + "13978649386370395620": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "15397084091361096354": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "18154019240019929225": ["convolution_gpu_bfyx_gemm_like",1], + "14909506411483112959": ["convolution_gpu_yxfb_yxio_b16",2], + "7715937239456300593": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "1921500066107090648": ["convolution_gpu_yxfb_yxio_b16",2], + "12767065362702304803": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "7859659993155959174": ["convolution_gpu_yxfb_yxio_b16",2], + "8576733135863336233": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15386715291503303766": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "13571587312517912280": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "6928136130626403937": ["convolution_gpu_bfyx_gemm_like",2], + "6192955702438301372": ["convolution_gpu_bfyx_os_iyx_osv16",1023], + "1290180607037086383": ["convolution_gpu_yxfb_yxio_b16",2], + "15504618703544589723": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11311839946200066200": ["convolution_gpu_yxfb_yxio_b16",2], + "17370158297470557151": ["convolution_gpu_bfyx_1x1",2], + "6222595759158615206": ["convolution_gpu_bfyx_gemm_like",1], + "8262469434265124590": ["convolution_gpu_yxfb_yxio_b16",2], + "166437837813304707": ["convolution_gpu_yxfb_yxio_b16",2], + "15352064186447212862": ["convolution_gpu_yxfb_yxio_b16",2], + "3816674884393241704": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "9585113116232600562": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14908477489231326997": ["convolution_gpu_yxfb_yxio_b16",2], + "11768117585574496387": ["convolution_gpu_bfyx_gemm_like",2], + "1230262279011217327": ["convolution_gpu_bfyx_direct_10_12_16",2], + "657356383636782030": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "17705807503894740726": ["convolution_gpu_bfyx_gemm_like",2], + "2891736961665476908": ["convolution_gpu_bfyx_os_iyx_osv16",430], + "16094455700371652312": ["convolution_gpu_yxfb_yxio_b16",2], + "4447065688824381344": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "11002165738333323413": ["convolution_gpu_yxfb_yxio_b16",2], + "875146113874776902": ["convolution_gpu_yxfb_yxio_b16",2], + "6020017927557041768": ["convolution_gpu_bfyx_gemm_like",1], + "7875724726741958520": ["fully_connected_gpu_fb_oi_ref",0], + "3861351835305151926": ["convolution_gpu_yxfb_yxio_b16",1], + "9890700023578477203": ["convolution_gpu_bfyx_gemm_like",2], + "16828961272295386615": ["convolution_gpu_bfyx_os_iyx_osv16",854], + "3774285301357006334": ["convolution_gpu_bfyx_gemm_like",2], + "938222258370511187": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "2227700097134029783": ["convolution_gpu_yxfb_yxio_b16",2], + "132437164570900392": ["convolution_gpu_yxfb_yxio_b16",2], + "17789969008677638142": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "11604111639041106489": ["convolution_gpu_bfyx_os_iyx_osv16",726], + "8205640825965213946": ["convolution_gpu_yxfb_yxio_b16",1], + "11848462434662954749": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "4306881509708040723": ["convolution_gpu_yxfb_yxio_b16",2], + "15612797125081819500": ["convolution_gpu_yxfb_yxio_b16",2], + "13797759143769042759": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "15217077412685024074": ["convolution_gpu_yxfb_yxio_b16",2], + "17178308105985812083": ["convolution_gpu_yxfb_yxio_b16",2], + "11132679855317294753": ["convolution_gpu_bfyx_gemm_like",1], + "13076935351221777993": ["convolution_gpu_bfyx_os_iyx_osv16",2], + 
"3301356450249305137": ["convolution_gpu_yxfb_yxio_b16",2], + "4947788161154370784": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "3854114166348568039": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "2064464435352777854": ["convolution_gpu_bfyx_gemm_like",2], + "4049224463072418218": ["convolution_gpu_yxfb_yxio_b16",2], + "5642822685234782052": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "2208765794404376467": ["convolution_gpu_yxfb_yxio_b16",2], + "6280726148869856021": ["convolution_gpu_yxfb_yxio_b16",2], + "733956743303342862": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "10415046594066474634": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3976197003067656339": ["convolution_gpu_yxfb_yxio_b16",2], + "2714322766616035858": ["convolution_gpu_yxfb_yxio_b16",2], + "4362304842016958728": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1074748462756364699": ["fully_connected_gpu_fb_oi_ref",1], + "5060012838564094182": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "17174919737114915467": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14001048251986195179": ["convolution_gpu_bfyx_gemm_like",2], + "13927671398099556854": ["convolution_gpu_yxfb_yxio_b16",2], + "7181154048972884375": ["convolution_gpu_bfyx_gemm_like",2], + "119047044057950958": ["convolution_gpu_bfyx_gemm_like",1], + "1885075753696445410": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11640225461345567929": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "5572956736535433608": ["convolution_gpu_bfyx_1x1",2], + "17466963970980708210": ["convolution_gpu_yxfb_yxio_b16",2], + "15167962750603978874": ["convolution_gpu_yxfb_yxio_b16",2], + "5293502980575652171": ["convolution_gpu_yxfb_yxio_b16",2], + "3059575629482816852": ["convolution_gpu_bfyx_os_iyx_osv16",905], + "334703311738467111": ["convolution_gpu_bfyx_gemm_like",2], + "4972952621622984792": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "11308583200952256245": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1556975727728498645": ["convolution_gpu_yxfb_yxio_b16",2], + "15609860394182767048": ["convolution_gpu_yxfb_yxio_b16",1], + "15069906408448814772": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "4597873630741623918": ["convolution_gpu_yxfb_yxio_b16",1], + "904355798061005466": ["convolution_gpu_yxfb_yxio_b16",2], + "13094289895577333088": ["convolution_gpu_yxfb_yxio_b16",2], + "234288286732396704": ["convolution_gpu_yxfb_yxio_b16",2], + "10880830033700542216": ["convolution_gpu_yxfb_yxio_b16",2], + "15576534481170615301": ["convolution_gpu_yxfb_yxio_b16",2], + "8303211644727914658": ["convolution_gpu_bfyx_1x1",2], + "11955992313739654625": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "7804715870037416579": ["convolution_gpu_bfyx_gemm_like",0], + "13367787254519749641": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "14823789570149356458": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13800760323805415740": ["convolution_gpu_bfyx_gemm_like",2], + "8779960552750034544": ["convolution_gpu_yxfb_yxio_b16",2], + "17281826959243966826": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "14752182392048929103": ["convolution_gpu_yxfb_yxio_b16",2], + "6217542346826403576": ["convolution_gpu_bfyx_1x1",2], + "14113320831418478396": ["convolution_gpu_yxfb_yxio_b16",2], + "9435086287598656868": ["convolution_gpu_yxfb_yxio_b16",2], + "3067001341355453846": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "15424646499666127616": ["convolution_gpu_yxfb_yxio_b16",2], + "16953093098789113080": ["convolution_gpu_yxfb_yxio_b16",2], + "9803492989444302959": ["convolution_gpu_bfyx_os_iyx_osv16",334], + 
"14510495923021693109": ["convolution_gpu_yxfb_yxio_b16",2], + "5459463503840817402": ["convolution_gpu_bfyx_1x1",2], + "17893181511546734799": ["convolution_gpu_yxfb_yxio_b16",2], + "4958835037528182801": ["convolution_gpu_bfyx_1x1",2], + "7752913515036871482": ["convolution_gpu_bfyx_gemm_like",0], + "14079654309452583394": ["convolution_gpu_bfyx_gemm_like",1], + "15548847099740441551": ["convolution_gpu_bfyx_1x1",2], + "2263637493894079492": ["convolution_gpu_yxfb_yxio_b16",2], + "5211831143687501130": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "18267428053198215471": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "16487774205195979355": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "12794369485239257709": ["convolution_gpu_bfyx_gemm_like",2], + "9131235538209388787": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "17951403431757222177": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "7585777271711713778": ["convolution_gpu_yxfb_yxio_b16",2], + "5897564616927353003": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "11738780323979052397": ["convolution_gpu_bfyx_direct_10_12_16",2], + "598214270378842167": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "4959403414256988744": ["convolution_gpu_bfyx_gemm_like",0], + "10883341041912056319": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "720558977788683564": ["convolution_gpu_yxfb_yxio_b16",2], + "1216021647922150199": ["convolution_gpu_yxfb_yxio_b16",2], + "5606914392662771013": ["convolution_gpu_yxfb_yxio_b16",2], + "13710319251108632115": ["convolution_gpu_bfyx_1x1",2], + "15715029280006557222": ["convolution_gpu_yxfb_yxio_b16",2], + "14315760630997175346": ["convolution_gpu_yxfb_yxio_b16",2], + "10536316961655703500": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5583453364991774426": ["convolution_gpu_yxfb_yxio_b16",2], + "248133885018839814": ["convolution_gpu_yxfb_yxio_b16",2], + "12808456612606675259": ["convolution_gpu_yxfb_yxio_b16",2], + "9809458159478958866": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "5039037192630609823": ["convolution_gpu_bfyx_gemm_like",2], + "3571030800252732358": ["convolution_gpu_yxfb_yxio_b16",2], + "4238885454989272754": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "6709083009339039603": ["convolution_gpu_yxfb_yxio_b16",2], + "17515064188391421150": ["convolution_gpu_bfyx_gemm_like",1], + "16986610822918634530": ["convolution_gpu_bfyx_1x1",2], + "3568514382399560386": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "4933831571091731212": ["convolution_gpu_bfyx_gemm_like",1], + "8399668174006528237": ["convolution_gpu_bfyx_gemm_like",2], + "7398158542592530232": ["convolution_gpu_yxfb_yxio_b16",2], + "9250410390663336388": ["convolution_gpu_bfyx_gemm_like",1], + "16000428520749664687": ["convolution_gpu_yxfb_yxio_b16",2], + "14971270053929063630": ["convolution_gpu_yxfb_yxio_b16",2], + "3652414035262499383": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "7780140599533242850": ["convolution_gpu_bfyx_gemm_like",1], + "5911282942658469852": ["convolution_gpu_bfyx_gemm_like",1], + "2362092095402043749": ["convolution_gpu_bfyx_gemm_like",1], + "5157949342388119167": ["convolution_gpu_bfyx_gemm_like",2], + "11891319657803057127": ["convolution_gpu_yxfb_yxio_b16",1], + "8638227907054657946": ["convolution_gpu_yxfb_yxio_b16",2], + "6297802534570892679": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "10718639465064821919": ["convolution_gpu_yxfb_yxio_b16",2], + "16371608027363202992": ["convolution_gpu_yxfb_yxio_b16",2], + "226601879759378771": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "2797436491596125131": 
["convolution_gpu_yxfb_yxio_b16",2], + "1632416005093914709": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "17238880534517721334": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "10037086825900566930": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "15914342421266687768": ["convolution_gpu_bfyx_gemm_like",2], + "12946531140050029900": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5762878778443755104": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "4819131094439732065": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "7317391511452227268": ["convolution_gpu_bfyx_direct_10_12_16",2], + "438528596970898721": ["convolution_gpu_bfyx_gemm_like",2], + "9056038338958199256": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "8063236641629084352": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "4241055784642339756": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "9812438080378091263": ["convolution_gpu_yxfb_yxio_b16",2], + "11157773554806649837": ["convolution_gpu_yxfb_yxio_b16",2], + "16459072408799224894": ["convolution_gpu_yxfb_yxio_b16",2], + "8236018377815149638": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "9982350570959875159": ["convolution_gpu_yxfb_yxio_b16",2], + "16956263773967652552": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "2164314506903530487": ["convolution_gpu_yxfb_yxio_b16",2], + "14766477690417085350": ["convolution_gpu_bfyx_1x1",2], + "8270591002934311024": ["convolution_gpu_bfyx_1x1",2], + "6708349666663292171": ["fully_connected_gpu_fb_oi_ref",1], + "18424611729838147994": ["convolution_gpu_yxfb_yxio_b16",2], + "15476491807306982382": ["fully_connected_gpu_fb_io_block_fp16",0], + "3806131437010910920": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "11942736969933408358": ["convolution_gpu_bfyx_gemm_like",1], + "2571289358202565251": ["convolution_gpu_yxfb_yxio_b16",2], + "14668725050395069435": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "188830358699960789": ["convolution_gpu_yxfb_yxio_b16",2], + "16813995580382709489": ["convolution_gpu_yxfb_yxio_b16",2], + "12988253829685880778": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "5577571901049952658": ["convolution_gpu_yxfb_yxio_b16",2], + "9663847096617096629": ["convolution_gpu_yxfb_yxio_b16",2], + "6935581283700404601": ["convolution_gpu_yxfb_yxio_b16",2], + "11973034261101454380": ["convolution_gpu_yxfb_yxio_b16",2], + "185782385623159958": ["convolution_gpu_bfyx_gemm_like",2], + "8354579049246302728": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "9040145293899470160": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "2294318010381635693": ["convolution_gpu_bfyx_gemm_like",1], + "17270057383792994793": ["convolution_gpu_yxfb_yxio_b16",2], + "5928392400230917930": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "4627958043707973483": ["convolution_gpu_yxfb_yxio_b16",1], + "8735534480653818425": ["convolution_gpu_yxfb_yxio_b16",2], + "15226556774612169126": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "1082574490068006980": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "1760391741350091665": ["convolution_gpu_bfyx_gemm_like",2], + "2128612971571865547": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4809191606466167229": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15816807118780455948": ["convolution_gpu_yxfb_yxio_b16",2], + "10225565543636007389": ["convolution_gpu_yxfb_yxio_b16",2], + "2649192407401044065": ["convolution_gpu_bfyx_gemm_like",1], + "2294800960010879540": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "631489011812924153": ["convolution_gpu_bfyx_1x1",2], + "17434141039341226796": 
["convolution_gpu_yxfb_yxio_b16",2], + "9328223957245552723": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15750539817895707253": ["convolution_gpu_yxfb_yxio_b16",0], + "9515771738501683": ["convolution_gpu_yxfb_yxio_b16",2], + "13251091004269229867": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3860667078458481972": ["convolution_gpu_bfyx_gemm_like",1], + "14497254583210965214": ["convolution_gpu_yxfb_yxio_b16",2], + "3377052601059116318": ["convolution_gpu_yxfb_yxio_b16",2], + "17922279129043570176": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "16836088134347394854": ["convolution_gpu_yxfb_yxio_b16",2], + "2449586975250543578": ["convolution_gpu_yxfb_yxio_b16",2], + "10211403590176354415": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "4602232889230956461": ["convolution_gpu_yxfb_yxio_b16",2], + "13804221028705631415": ["convolution_gpu_bfyx_gemm_like",2], + "8761283252495354972": ["convolution_gpu_bfyx_gemm_like",2], + "13190888313721073437": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "11147573971701279689": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "4776446300552810228": ["convolution_gpu_bfyx_gemm_like",2], + "2412069259085234287": ["convolution_gpu_yxfb_yxio_b16",2], + "59356084516953804": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "851057218719456209": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "16694984452720336415": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "5327803911898085293": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "1265277707626014051": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "16408015571155576773": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "3652749152621176846": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12968458217519563011": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "10861525139715322534": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "17066417894262330033": ["convolution_gpu_bfyx_gemm_like",1], + "10129351141713628942": ["convolution_gpu_bfyx_gemm_like",2], + "18020588962875998441": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "2857337999074313592": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "10114123606924808948": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "2618108630886857741": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12138556002719602750": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "192209423643075326": ["convolution_gpu_bfyx_gemm_like",2], + "7308442824625238429": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "17993865017392965282": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "12047878068525808907": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "16237775310369180101": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16565784556269819846": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "9274179337770060652": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "13858485871773319706": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9298483238271063853": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6810243879781619546": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17740553615487239243": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "2690771087990667627": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "12175796957622122377": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "15619086801947147359": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "13343968006718934574": ["convolution_gpu_bfyx_gemm_like",2], + "5197105253412476591": ["convolution_gpu_bfyx_gemm_like",2], + "14010642743400284761": ["convolution_gpu_bfyx_direct_10_12_16",2], + "460780635491857522": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9269175963143039426": 
["convolution_gpu_bfyx_os_iyx_osv16",362], + "12515465135362865565": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "4185398348055518182": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "14650567822254940018": ["convolution_gpu_bfyx_direct_10_12_16",2], + "684240994243755872": ["convolution_gpu_bfyx_gemm_like",1], + "11257892554921100776": ["convolution_gpu_bfyx_gemm_like",2], + "7419216766190700536": ["convolution_gpu_bfyx_gemm_like",1], + "5033753554611312392": ["convolution_gpu_bfyx_os_iyx_osv16",552], + "7185832253431234935": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15289017003172341090": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "331390460560782085": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "683530182479794259": ["convolution_gpu_bfyx_os_iyx_osv16",363], + "1081969835308672753": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "4494583230309471319": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "3755253206085028904": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18422772756265807456": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "4436265026202671742": ["convolution_gpu_bfyx_gemm_like",0], + "9008848676120441863": ["convolution_gpu_bfyx_gemm_like",1], + "8986253016099337778": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "6661117204204077150": ["convolution_gpu_bfyx_gemm_like",2], + "17220204850799701232": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "17520777331163825810": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13326233188936584240": ["convolution_gpu_bfyx_gemm_like",2], + "9361149482291015906": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "14157776769026046014": ["fully_connected_gpu_fb_oi_ref",0], + "6031307393395339699": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "9104710269725948935": ["convolution_gpu_bfyx_gemm_like",2], + "15132518566122695317": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "11642972419456492482": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "9402935157379983392": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10073936467467965122": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1233021176530240722": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "1764398518968720486": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "4338023436590582323": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "9325064517683111898": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "17248756229500447131": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13753670205703732353": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "1155389358857780776": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "12063837066704136739": ["convolution_gpu_bfyx_gemm_like",2], + "9546990560009724329": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "386749666417295495": ["convolution_gpu_bfyx_os_iyx_osv16",510], + "12134712464763856064": ["convolution_gpu_winograd_6x3_s1_fused",2], + "11879484013890539145": ["convolution_gpu_bfyx_gemm_like",1], + "13301652037182491495": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "15706410484838871362": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "12815588500303820284": ["convolution_gpu_bfyx_gemm_like",1], + "1629816265162728770": ["convolution_gpu_bfyx_gemm_like",2], + "10574694721257478408": ["convolution_gpu_bfyx_gemm_like",0], + "6178519342290638130": ["convolution_gpu_bfyx_gemm_like",2], + "10273183900108661041": ["convolution_gpu_bfyx_gemm_like",0], + "11379252854859166206": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "13439272015824246074": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10853161782230763798": ["convolution_gpu_bfyx_os_iyx_osv16",605], + 
"15379873910046172004": ["convolution_gpu_bfyx_gemm_like",1], + "9419334015760594582": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "1096929244128185929": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "13804435767468730732": ["convolution_gpu_bfyx_gemm_like",2], + "8174734104495927379": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "7806129039150321333": ["convolution_gpu_bfyx_gemm_like",2], + "4903043177313730317": ["convolution_gpu_bfyx_os_iyx_osv16",1063], + "3927333491885837374": ["fully_connected_gpu_fb_oi_ref",2], + "5920614348521143999": ["convolution_gpu_bfyx_os_iyx_osv16",476], + "2305461098719675735": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13193571607788569533": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18375944751155613159": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "5282780697382984776": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "14611470203914805229": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "11632275875447013409": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2605525859754242318": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "1835975757316320402": ["convolution_gpu_bfyx_gemm_like",1], + "404419072921281472": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "8127190765748950828": ["convolution_gpu_bfyx_os_iyx_osv16",265], + "1643241486250690844": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "15291457825664605611": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "263575476655527355": ["convolution_gpu_bfyx_gemm_like",2], + "10898684230183205955": ["convolution_gpu_bfyx_gemm_like",2], + "7954822934649213505": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14214141488645257351": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "2502125887857336825": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "16580523689587532278": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "5865480930796299143": ["convolution_gpu_bfyx_os_iyx_osv16",572], + "1107027047188366075": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "16896833230469488924": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "7744644472305197412": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13394233139064923018": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "6220616397859143111": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "9535474159134436170": ["convolution_gpu_bfyx_gemm_like",1], + "12493863403516600413": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "12793814016409887162": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9058996149754556268": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "17796867588410764794": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "11359020774437470164": ["convolution_gpu_bfyx_os_iyx_osv16",718], + "4254313567858225805": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "13065517911798224579": ["convolution_gpu_bfyx_os_iyx_osv16",7], + "13342769641176584743": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "14266210014132784194": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "17193614571243427089": ["convolution_gpu_bfyx_gemm_like",1], + "7558864177789582540": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "13115589642140732066": ["convolution_gpu_bfyx_gemm_like",1], + "8008513163448840421": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17794162443307839614": ["convolution_gpu_bfyx_gemm_like",1], + "7174790971918109163": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "7368916076070115064": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "13974740392602492680": ["convolution_gpu_bfyx_gemm_like",2], + "10414903047695486119": ["convolution_gpu_bfyx_os_iyx_osv16",680], + "11190259822407791373": 
["convolution_gpu_bfyx_os_iyx_osv16",227], + "1996317479484023889": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "8854234880878427078": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "10114186450910665716": ["convolution_gpu_bfyx_gemm_like",2], + "906587812125311288": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "1640358227345963848": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "16036386660666696362": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "6210074450403696110": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "841243068178925457": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "17900440115872409689": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "3469963495451100978": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "5275016494706355806": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14524678598440880756": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "15715775011639091549": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "14898829474012181950": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "9643671820560131959": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "14825587275976212624": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "18280672126778847258": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "12382761700262813898": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "5762290464889692462": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "1093840152689636371": ["convolution_gpu_bfyx_gemm_like",1], + "12700051513124813499": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "6214312494103149808": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "15385836287435319028": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "14639233649574991406": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "10169992769527680821": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "14491949194619001237": ["convolution_gpu_bfyx_os_iyx_osv16",880], + "7995002764260542332": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "18420783889227814721": ["convolution_gpu_bfyx_gemm_like",1], + "11883632480024839484": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "4488336106517889531": ["convolution_gpu_bfyx_os_iyx_osv16",80], + "2194607895573544953": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "9069334144391048686": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8751016391945753900": ["convolution_gpu_bfyx_direct_10_12_16",2], + "846088275031979661": ["convolution_gpu_winograd_6x3_s1_fused",2], + "14418429155823196539": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "13379165253894817165": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "4722824701199486161": ["convolution_gpu_bfyx_gemm_like",1], + "1653438360841004980": ["fully_connected_gpu_fb_oi_ref",1], + "11267742746905371769": ["convolution_gpu_bfyx_gemm_like",1], + "14746359019867963124": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14995412997472381785": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "7544565739420583104": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "8159367017950578067": ["convolution_gpu_bfyx_gemm_like",0], + "7000524935770116969": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "9609257787066002999": ["convolution_gpu_bfyx_gemm_like",0], + "7962991673727743706": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "11985789598994479652": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "12085348936192462321": ["convolution_gpu_bfyx_gemm_like",1], + "851140387756761667": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "12713087335581316946": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "8358425189419823078": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "13893789954946953427": 
["convolution_gpu_bfyx_os_iyx_osv16",610], + "9484428757321765863": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "7714783879762659458": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "3612493075378459996": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "17765244777397448823": ["convolution_gpu_bfyx_gemm_like",2], + "4915831715914920982": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "6740545361286720494": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "9194441947620820715": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "17422822627612865758": ["convolution_gpu_winograd_6x3_s1_fused",2], + "15898888434295644774": ["convolution_gpu_bfyx_gemm_like",2], + "17821196374523699955": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "18136765667969393174": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8420176522157084802": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17221173795372066030": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "15576932271488848457": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "1961348920992050029": ["convolution_gpu_bfyx_gemm_like",1], + "12076322142162382598": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7109332037985838172": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8133587696326295326": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "14854353557342075292": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "7086554406050778468": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "253337639942573142": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "1152693503778768433": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "3926585856863002495": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "1211404528755199615": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "6213353364768643062": ["convolution_gpu_bfyx_gemm_like",1], + "9796347091019799053": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "5597908143491399643": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "15316782593191029443": ["convolution_gpu_bfyx_gemm_like",1], + "1421879144542252228": ["convolution_gpu_bfyx_gemm_like",1], + "14271777022638592600": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "11077876432364512822": ["fully_connected_gpu_bf_io_input_spatial",1], + "6478247863479663432": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "9172699707430374863": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "5375957124102705020": ["convolution_gpu_bfyx_gemm_like",2], + "7009873605945341897": ["convolution_gpu_bfyx_gemm_like",2], + "8323445733669842657": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1231806423322813287": ["convolution_gpu_bfyx_gemm_like",2], + "4646795194660982475": ["convolution_gpu_bfyx_gemm_like",2], + "1207026216972160297": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "11070696274716018686": ["convolution_gpu_bfyx_os_iyx_osv16",196], + "5334190564423375247": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2155348872565175553": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14600700464602327710": ["convolution_gpu_bfyx_gemm_like",2], + "13558618754911056302": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "10842505566649585090": ["convolution_gpu_bfyx_gemm_like",1], + "89439319782574517": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "17130630712943165823": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "11703557271443535142": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "10173382130572498594": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "954347958041231578": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "14669219788000023965": ["fully_connected_gpu_fb_oi_ref",1], + "15997231252708686870": ["convolution_gpu_bfyx_gemm_like",2], + 
"3963106895592011725": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17692144048680858991": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "14389915292223442327": ["convolution_gpu_bfyx_os_iyx_osv16",164], + "10835684445936063871": ["convolution_gpu_bfyx_gemm_like",1], + "5057534502588100071": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "2543995971214089085": ["convolution_gpu_bfyx_os_iyx_osv16",622], + "779633618375662086": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4720851194954041037": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "4046513842327685203": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7722090560547236852": ["convolution_gpu_bfyx_gemm_like",2], + "17864395500488861670": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13161798453564436688": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "8557939065994799094": ["convolution_gpu_bfyx_gemm_like",2], + "17749857812061795980": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "11152834864013527469": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "14946519992043402896": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "15247381586316467097": ["convolution_gpu_bfyx_gemm_like",2], + "10660230104888153758": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "16061176355133391199": ["convolution_gpu_bfyx_os_iyx_osv16",893], + "10252133892687581839": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "7026575758396092435": ["convolution_gpu_bfyx_gemm_like",1], + "8422808932256100230": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11021014846012559932": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "1197101651805223230": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16998662249038174039": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "13231291236739587033": ["convolution_gpu_bfyx_gemm_like",2], + "11129224786768161139": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "5406129421969383274": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "13002363400738122017": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "7601006550805536675": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "4783866236592802336": ["convolution_gpu_bfyx_gemm_like",2], + "7575675354187625951": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14361697687217060995": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1865187811299838654": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "12651215303242591871": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "13890118723041457532": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "2497756607567197523": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "2103882464623009432": ["convolution_gpu_winograd_6x3_s1_fused",2], + "641798291578647186": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9522661528867955338": ["convolution_gpu_bfyx_gemm_like",2], + "13093429681061786539": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "16549498607618849252": ["convolution_gpu_bfyx_os_iyx_osv16",340], + "3349108500387301004": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "8501145642605270365": ["convolution_gpu_bfyx_gemm_like",2], + "14213516751025324346": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6104567430127604601": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "15733030371524967129": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "4765132143483233538": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "14253275166085865948": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6040286126398028933": ["convolution_gpu_winograd_6x3_s1_fused",2], + "5045339651649581926": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "9468314291932574827": ["convolution_gpu_bfyx_os_iyx_osv16",728], + 
"4703107905652287491": ["convolution_gpu_bfyx_gemm_like",2], + "18372284940315010254": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "4213330047036138895": ["convolution_gpu_bfyx_gemm_like",2], + "17116130466596594359": ["convolution_gpu_bfyx_os_iyx_osv16",723], + "14233388108948021331": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "12522364636280164681": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10992999157318221164": ["convolution_gpu_bfyx_gemm_like",0], + "1781189282179491198": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15033864286535250007": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "13110173649734084688": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "17946191056428828467": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "8260689555974656662": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "16053585286807864356": ["convolution_gpu_bfyx_gemm_like",2], + "4588420324030315321": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4424217045094988504": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "4369346833875105372": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5385637020152792781": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "13982221711075598070": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8281411537393664160": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12070592804878487941": ["convolution_gpu_bfyx_gemm_like",1], + "8528750110601691390": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "17761681290527373180": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "7092429446071184360": ["convolution_gpu_bfyx_direct_10_12_16",0], + "14776308019009874809": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "11012846743944132853": ["convolution_gpu_bfyx_gemm_like",2], + "7315740838189400004": ["convolution_gpu_bfyx_gemm_like",2], + "6293403765897901528": ["convolution_gpu_bfyx_gemm_like",2], + "14141983383097250411": ["convolution_gpu_bfyx_gemm_like",1], + "8925796987351708085": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "9875319892082750080": ["convolution_gpu_bfyx_gemm_like",2], + "13787118639037730152": ["convolution_gpu_bfyx_gemm_like",2], + "2527189070714658176": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "8730097760819044515": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "6373173636869473046": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4026686872534942904": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "9318652504803279936": ["convolution_gpu_bfyx_gemm_like",2], + "16306284020664131647": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "12494969618927201911": ["fully_connected_gpu_yxfb_ref",1], + "37061093840513038": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12635265188475834607": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "11828522357351010810": ["convolution_gpu_bfyx_os_iyx_osv16",806], + "16710651492402564794": ["convolution_gpu_bfyx_direct_10_12_16",2], + "94012300876418257": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "12956726277674279950": ["convolution_gpu_bfyx_os_iyx_osv16",807], + "10362906912545982002": ["convolution_gpu_bfyx_gemm_like",2], + "9318550032135064372": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7398196853452900099": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "2678815609451494274": ["convolution_gpu_bfyx_1x1",2], + "11704394720448242086": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "8071652278387309042": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13046322179198317310": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "4282198629458668761": ["convolution_gpu_bfyx_gemm_like",2], + "9390919808369333231": ["convolution_gpu_bfyx_gemm_like",2], + 
"11528310408333718862": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "1996860183441418841": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6098207667540641715": ["convolution_gpu_bfyx_gemm_like",2], + "10159790066948852390": ["convolution_gpu_bfyx_gemm_like",1], + "11873734271080160669": ["convolution_gpu_bfyx_os_iyx_osv16",563], + "772794189370544860": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "7080501503636539396": ["convolution_gpu_bfyx_gemm_like",2], + "15851356529373376076": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "5635500901926740475": ["convolution_gpu_bfyx_os_iyx_osv16",48], + "8470783908138180217": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6542486391263861823": ["convolution_gpu_bfyx_os_iyx_osv16",1115], + "11970881115757095265": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "7404732699742965436": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6027350558532160900": ["convolution_gpu_bfyx_gemm_like",2], + "7228139313323996640": ["convolution_gpu_bfyx_os_iyx_osv16",511], + "12020033193997292057": ["convolution_gpu_bfyx_gemm_like",0], + "17523255657410563512": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "5536424274663702901": ["convolution_gpu_bfyx_gemm_like",2], + "3547275591884493445": ["convolution_gpu_bfyx_gemm_like",1], + "10178951466584845110": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "9216695884134021401": ["convolution_gpu_bfyx_gemm_like",2], + "3234263189133106948": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "7807983899017500046": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "7678730081652720605": ["convolution_gpu_bfyx_os_iyx_osv16",881], + "2691043943297793735": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "17182558720652199559": ["fully_connected_gpu_fb_io_ref",0], + "7533669599936874355": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "7852144838267007144": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "5036963191507722541": ["convolution_gpu_bfyx_os_iyx_osv16",1070], + "6984620248108632462": ["convolution_gpu_bfyx_os_iyx_osv16",52], + "14560435854055940143": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "9988347141056982336": ["convolution_gpu_bfyx_gemm_like",2], + "11769511287553067221": ["convolution_gpu_bfyx_os_iyx_osv16",717], + "3033264172690274208": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "1100681675092122613": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "5150467145740542480": ["convolution_gpu_bfyx_gemm_like",2], + "14332388011233886083": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "792684262493086891": ["convolution_gpu_bfyx_gemm_like",1], + "2055914145961691571": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "17833517350994024381": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "7304346312452588844": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "11928926429060828408": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "18250076003231973692": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "2647922515901529845": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "14167086447992316314": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6474882514032493642": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "14546281065004619074": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "6983544541444063131": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "17308907916370632622": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "9863856393759813897": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "876164657126345894": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "12692563384795319282": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "14599150265057284139": 
["convolution_gpu_bfyx_os_iyx_osv16",138], + "4941660917457387098": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "16852207712205172744": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "7969848911698660033": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "15759530339367380982": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "1269703478898366518": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "80038800201815976": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10589803022753839539": ["convolution_gpu_bfyx_gemm_like",2], + "4586246090279043149": ["convolution_gpu_bfyx_gemm_like",1], + "17377315194963069204": ["fully_connected_gpu_fb_oi_ref",0], + "2052712465925238009": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "16805562203348924108": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "2534408579674556441": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8176012042686275874": ["convolution_gpu_bfyx_os_iyx_osv16",265], + "15781220232431782560": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "15743075522781198932": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9519623751582710696": ["convolution_gpu_bfyx_os_iyx_osv16",439], + "8039045580314824307": ["convolution_gpu_bfyx_gemm_like",2], + "9377779605078400305": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12365282242489300092": ["convolution_gpu_bfyx_os_iyx_osv16",45], + "1795659014508380077": ["convolution_gpu_bfyx_gemm_like",2], + "8228641750970480948": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "4184442166820068862": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "522181557896569275": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "11318913630213187720": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "8321148793275220552": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1450888744802985214": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "17828453493113919756": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "4016652650196255483": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12201437677145858979": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "3686062608868674589": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6512006285490280576": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5559417017584278927": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "7843833033404155302": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "4395247494007025604": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4674416595144505741": ["convolution_gpu_bfyx_gemm_like",2], + "17754836801944078461": ["convolution_gpu_bfyx_gemm_like",0], + "17376180096577763039": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17542176922797334839": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "1629280013296592298": ["convolution_gpu_bfyx_gemm_like",2], + "16561224775421968533": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "5876880412336151866": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "13327653786981478088": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "3169696741777363811": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "13423515205322319913": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "12643423612381102003": ["convolution_gpu_bfyx_os_iyx_osv16",831], + "12696412964119109465": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "1436052878894538927": ["convolution_gpu_bfyx_gemm_like",2], + "12225119940380026093": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "15891529662801690234": ["convolution_gpu_bfyx_gemm_like",2], + "13613948678997524330": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7130694811424715594": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "5342657840254586591": 
["convolution_gpu_bfyx_os_iyx_osv16",1015], + "3539764293444807886": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "12388375914105990324": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "15391215077224693736": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "2148648022160178995": ["convolution_gpu_bfyx_gemm_like",2], + "3378088934862423864": ["convolution_gpu_bfyx_gemm_like",1], + "3965871278597751318": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9522947878591994913": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "4965629769516591986": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11588201241814594642": ["convolution_gpu_bfyx_os_iyx_osv16",1025], + "2651385050387738902": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "3827177373408316820": ["convolution_gpu_bfyx_gemm_like",1], + "16701880594348935298": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "16091195788712971747": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "13932662890258900896": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "6729785110495533200": ["convolution_gpu_bfyx_os_iyx_osv16",1063], + "12283317230112506089": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7706467560568261104": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "4077290190620885361": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "7975810844103449438": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "11430400968543668873": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "14133958262039763609": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "16661248688859994717": ["convolution_gpu_bfyx_gemm_like",2], + "381149736509958403": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "15052127817178941719": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "11855137287698046529": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "12561852932488001568": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "16802487456370986847": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "6090625728451718945": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12465913523583743669": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "1652781065871883392": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2335783507270234825": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "16936366288366370882": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "801486567558674495": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "7512702933193596918": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "12671153706040443724": ["convolution_gpu_bfyx_os_iyx_osv16",801], + "8236792121585073064": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "3273748387141431306": ["convolution_gpu_bfyx_os_iyx_osv16",808], + "15743461017318513847": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "4750894407873652809": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "9882204352209412039": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "11287863182337672053": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "17882819773586674851": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "5503904988517480229": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "17224181038411430675": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "7552144047474664265": ["convolution_gpu_bfyx_os_iyx_osv16",894], + "2070909131301595402": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "16076153317792960383": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13090596133852586482": ["fully_connected_gpu_fb_io_ref",0], + "15156805695359911457": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12150109996250730485": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "17342603054992556378": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "6897348673467297407": 
["convolution_gpu_bfyx_os_iyx_osv16",155], + "4862869094913223247": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "12888823040206007493": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "16327433707667075261": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7148542290597073512": ["convolution_gpu_bfyx_gemm_like",2], + "5119087113905313336": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10628989973647855390": ["convolution_gpu_bfyx_gemm_like",2], + "13012283016751495099": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "3239100076064406977": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "7174804306958128658": ["convolution_gpu_bfyx_gemm_like",1], + "13948512795148364852": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "14177187878748170225": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10600040563032392126": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "13776186230202020053": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "16424490086911928793": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "9308999849183405794": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "11913020016435860608": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "6626716013917662606": ["convolution_gpu_bfyx_gemm_like",2], + "5164372816534616260": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2836903620603494117": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "13731964100893109797": ["convolution_gpu_bfyx_gemm_like",2], + "12813978452097969536": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "12318898203127226615": ["convolution_gpu_bfyx_gemm_like",2], + "14848351491062336554": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5934211962000091180": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "3621930417735246405": ["convolution_gpu_bfyx_os_iyx_osv16",1016], + "1950057741678433412": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "5481293245081340756": ["convolution_gpu_bfyx_gemm_like",1], + "8094836777153039013": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "11044223289209000460": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13276959978962672952": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "17575293085957492821": ["convolution_gpu_bfyx_gemm_like",1], + "4974435385259831818": ["convolution_gpu_bfyx_gemm_like",2], + "10209532888121442060": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "6306539529168638031": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "10416622008071151225": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "2590380836212070761": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "14168946412009689868": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "3455720400625598790": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "1410630713443793537": ["convolution_gpu_bfyx_gemm_like",1], + "12211848608269437730": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6129602738379919488": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13073917160317338455": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "11913865086932469909": ["convolution_gpu_bfyx_gemm_like",2], + "15199289022783178329": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1697248235682953135": ["convolution_gpu_bfyx_gemm_like",2], + "10429613013253088132": ["convolution_gpu_bfyx_gemm_like",2], + "3855151839445505918": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "9669968379760494342": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14006248791647711759": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "6678796313875454849": ["convolution_gpu_bfyx_gemm_like",2], + "4623542918584461522": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "1299452063079314341": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "1617362484243823916": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "6561864486643226753": ["fully_connected_gpu_fb_io_ref",1], + "10213461713478260558": ["convolution_gpu_bfyx_gemm_like",2], + "11463162527165083478": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "2705394837952559308": ["convolution_gpu_bfyx_gemm_like",2], + "1676419079398771261": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "11075875009517060583": ["convolution_gpu_bfyx_gemm_like",2], + "3423392897831164719": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5996261744926399743": ["convolution_gpu_bfyx_gemm_like",2], + "12711558966638028352": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12458305535453345462": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17040970955448750876": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "14176233347574275776": ["convolution_gpu_bfyx_gemm_like",2], + "12669547093826826335": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "9831195630506601660": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14661447197300866468": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "9823752892549805496": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "12627697289412631340": ["convolution_gpu_bfyx_gemm_like",1], + "3839690227347352846": ["convolution_gpu_bfyx_gemm_like",2], + "3018306533413795559": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "15615172858007002100": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "5031342439443897167": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "4689190485668249985": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2419819939573989749": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "18384657372655350144": ["convolution_gpu_bfyx_gemm_like",2], + "12685978195521469707": ["convolution_gpu_bfyx_os_iyx_osv16",567], + "868177350337221377": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7341140956759424033": ["convolution_gpu_bfyx_gemm_like",2], + "15656843575192319040": ["convolution_gpu_bfyx_gemm_like",2], + "7024495439434892956": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "4407683781177409314": ["convolution_gpu_bfyx_gemm_like",2], + "2722965005012667650": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6140789642561898454": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "18037918102910297531": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "15667487381692577290": ["convolution_gpu_bfyx_os_iyx_osv16",4], + "4244790495090049295": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "14609655423082082099": ["convolution_gpu_bfyx_gemm_like",2], + "14221578799010900252": ["convolution_gpu_bfyx_gemm_like",2], + "10607904718265020949": ["convolution_gpu_bfyx_gemm_like",2], + "9140953654075340568": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "14683086376707577764": ["convolution_gpu_bfyx_gemm_like",1], + "6747799061507191246": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17771487895874668302": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "1143558550529121379": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "4652136280940317116": ["convolution_gpu_bfyx_gemm_like",2], + "2105482100745329286": ["convolution_gpu_bfyx_gemm_like",2], + "16285256723517297210": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11796671083187280457": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "16507285966998102421": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "7807704275483318300": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15597522934012485452": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9640773327221702885": ["convolution_gpu_bfyx_os_iyx_osv16",959], + 
"6080989915764831447": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11630475290242283451": ["convolution_gpu_bfyx_gemm_like",2], + "3291180926381314705": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "7636001038842031672": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "17586562074575968095": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "13769943652297353544": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "6019638262018414923": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10463896120685306944": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10928995765778560784": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "8757900457181374694": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15163327502374403643": ["convolution_gpu_bfyx_os_iyx_osv16",175], + "15197400201857680173": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "15552287544878243347": ["convolution_gpu_bfyx_gemm_like",1], + "8734220847509054149": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5033665285977853779": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "708347829794105085": ["convolution_gpu_bfyx_gemm_like",1], + "13721983823460534294": ["convolution_gpu_bfyx_gemm_like",2], + "13328449155966085543": ["convolution_gpu_bfyx_gemm_like",2], + "10568883265991969648": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "6519443541076418301": ["convolution_gpu_bfyx_os_iyx_osv16",1070], + "4812064663748033253": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "15947699374684516369": ["convolution_gpu_bfyx_gemm_like",2], + "4011704860949525864": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "6335402359295811260": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "16367495521884864886": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "17839839336294937155": ["convolution_gpu_bfyx_gemm_like",1], + "15295261978800289225": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "13296566345005640760": ["convolution_gpu_bfyx_gemm_like",1], + "1505929048307200803": ["convolution_gpu_bfyx_gemm_like",2], + "2108296560864415762": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17011927973643184196": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8708643228914766202": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "15072402334212221980": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "13713406612642090169": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "15962533525948221648": ["convolution_gpu_bfyx_os_iyx_osv16",1024], + "12895496994338720556": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "8124166677361481618": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "11407554707582995190": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5339358831190803597": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12584870629297848143": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16833026567865627676": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "7203545612536771243": ["convolution_gpu_bfyx_gemm_like",2], + "13850920989756588064": ["convolution_gpu_bfyx_gemm_like",2], + "8525704362451630717": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "2809463221123384600": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "5254115874873721374": ["convolution_gpu_bfyx_os_iyx_osv16",844], + "18308541794729223940": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "14578291812739325465": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "17464785726466943638": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "13146231972557134419": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "7334966010680206302": ["convolution_gpu_bfyx_gemm_like",2], + "14998412675237613013": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + 
"7031342689301066532": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "1724222702460860833": ["convolution_gpu_bfyx_gemm_like",2], + "4800208854712166990": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6610054713068442549": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "287386909600391846": ["convolution_gpu_bfyx_direct_10_12_16",2], + "700717277178942679": ["convolution_gpu_bfyx_gemm_like",2], + "3280795516668356985": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "7472330881076141262": ["convolution_gpu_bfyx_gemm_like",2], + "949330876419581703": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "7023033151960653752": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "17387764798693150143": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "18273922178875123753": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "204378699575356398": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7712831597869354170": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16541722316343690197": ["convolution_gpu_bfyx_os_iyx_osv16",504], + "12992163255353386581": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "9945721344229922405": ["convolution_gpu_bfyx_os_iyx_osv16",485], + "8578774826625315147": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "6555440973226014216": ["convolution_gpu_bfyx_gemm_like",2], + "6750003965952674453": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "6351347283201596793": ["convolution_gpu_bfyx_os_iyx_osv16",849], + "203639177311791127": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "9890252170749328138": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "10155417869639270818": ["convolution_gpu_bfyx_gemm_like",2], + "5245308722062496788": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "2585176064846114298": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13546898787965086743": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "16666383605403885590": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16881283637687482989": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "6364288463529107554": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "9723314434598141024": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "2940027113687311893": ["convolution_gpu_bfyx_gemm_like",2], + "10391152927913101404": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "5290935680520661218": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "16229324496308453344": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "8873424072104563382": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "7530197659550301431": ["convolution_gpu_bfyx_gemm_like",2], + "13649894122307008732": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "12925156865008155065": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "14862938122758223157": ["convolution_gpu_bfyx_gemm_like",1], + "813347941036099284": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "2070429718533716882": ["convolution_gpu_bfyx_gemm_like",2], + "11883941040326858829": ["convolution_gpu_bfyx_gemm_like",2], + "4477250064118514397": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13898284586432291433": ["convolution_gpu_bfyx_gemm_like",2], + "17543094050285028967": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11778866470635184668": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "11388177266504804841": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "16243813701829982936": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "13727643349589056375": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "3752171257634205726": ["convolution_gpu_bfyx_os_iyx_osv16",807], + "12174571114411168588": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "1313038182637545943": 
["convolution_gpu_bfyx_gemm_like",2], + "6780215829176686721": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "5341876404211768451": ["convolution_gpu_bfyx_gemm_like",2], + "16195252193236429176": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "4503960445974334415": ["convolution_gpu_bfyx_os_iyx_osv16",48], + "4307817040832953223": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6896806672575430025": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "11369389082421346630": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "8325686349100774855": ["convolution_gpu_bfyx_gemm_like",1], + "13237050834496100264": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "4607428643002808173": ["convolution_gpu_bfyx_os_iyx_osv16",246], + "475079717987185580": ["convolution_gpu_bfyx_os_iyx_osv16",949], + "15890749658785957481": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "9849272539053219052": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "11511221956203704038": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "12896164738668798380": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "17835134875461003221": ["convolution_gpu_bfyx_gemm_like",2], + "9256308629247511374": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16392283136103456949": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "15459849799278480779": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2969389503332309296": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "8995892222116060827": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "13112861120841066430": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12952160708294444403": ["convolution_gpu_bfyx_gemm_like",2], + "9070474871526366492": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "17775705003104146872": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "7958459862276998225": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "17802261444972408048": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "15581678976147496970": ["convolution_gpu_bfyx_gemm_like",0], + "5934841294975212773": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "1204089510255285420": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "15365776263895633531": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4887564143681507924": ["convolution_gpu_bfyx_gemm_like",2], + "8663545677000846511": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4063525218682664832": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "802853291842159625": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "2595273700611743351": ["convolution_gpu_bfyx_gemm_like",2], + "18060514966005474708": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "13459514533473657102": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "1413598669014941757": ["convolution_gpu_bfyx_os_iyx_osv16",271], + "5756918912614763074": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "6649759230117795192": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "11583791752668920812": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15993427814066246646": ["convolution_gpu_bfyx_gemm_like",2], + "4684985181211883028": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "598745924736700294": ["convolution_gpu_bfyx_gemm_like",2], + "18382226420077875582": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2598910952085172410": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2261453441277654139": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "10338444429123971258": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "12411228585189337571": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "8701248964531180496": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "5733530388090903847": 
["convolution_gpu_bfyx_gemm_like",2], + "13019190248083899887": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4202645222013675478": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13345599888287912619": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12944449254981328284": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "3329610414149222728": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17442035600389810700": ["convolution_gpu_bfyx_gemm_like",0], + "5047419871737940985": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13122637768866153753": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8818070832398055086": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "6898793319624390153": ["convolution_gpu_bfyx_gemm_like",2], + "15679696422603106163": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13991205023798493715": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "10136369729388564720": ["convolution_gpu_bfyx_gemm_like",2], + "1824009696938637196": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "6670327979947471550": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "777107147173214189": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2915952195141872726": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "14795626641169374231": ["convolution_gpu_bfyx_os_iyx_osv16",892], + "11312481316584327495": ["convolution_gpu_bfyx_gemm_like",0], + "3783485901378896953": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "6404731509766519779": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10356951625481502476": ["convolution_gpu_bfyx_gemm_like",2], + "8254412626112343365": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13792918179373942640": ["convolution_gpu_bfyx_gemm_like",2], + "16507216630035678597": ["convolution_gpu_bfyx_gemm_like",2], + "2592242929641774198": ["convolution_gpu_bfyx_gemm_like",0], + "59739211822469868": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "7596423139159263456": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "13276867073526485069": ["convolution_gpu_bfyx_gemm_like",1], + "11772741918108731396": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "18076018773227225156": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "14799589725341253463": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "4553508439536472227": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "9974986004361966590": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "9880591864624136517": ["convolution_gpu_bfyx_gemm_like",2], + "14885519273643841492": ["convolution_gpu_bfyx_gemm_like",0], + "9513545197321447870": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "4165920860392215245": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "9235762655002034553": ["convolution_gpu_bfyx_gemm_like",2], + "2684971093531227585": ["convolution_gpu_bfyx_gemm_like",2], + "10629681722649771498": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15592321818359223008": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "1753515740487760297": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4305170667287274371": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "12523676912856063091": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16462602383546733062": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "17900257435531434807": ["convolution_gpu_bfyx_gemm_like",1], + "2772704069752888874": ["convolution_gpu_bfyx_gemm_like",2], + "17211272113483906944": ["convolution_gpu_bfyx_gemm_like",2], + "11858246418724176452": ["convolution_gpu_bfyx_gemm_like",1], + "10263861857115868555": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "3202034075645193740": ["convolution_gpu_bfyx_os_iyx_osv16",725], + 
"8734483136584351066": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "7860086755625626604": ["convolution_gpu_bfyx_os_iyx_osv16",271], + "3688864365328401568": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "14900099988131599740": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "15463873588896650327": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "2888587871912905870": ["convolution_gpu_bfyx_os_iyx_osv16",419], + "12821282158186877473": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14512311371993445906": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "5176939691838030517": ["convolution_gpu_bfyx_os_iyx_osv16",1001], + "6307840223437204536": ["convolution_gpu_bfyx_os_iyx_osv16",970], + "12894240573737168362": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "14484890926084856480": ["convolution_gpu_bfyx_gemm_like",1], + "10153070641942936648": ["convolution_gpu_bfyx_gemm_like",1], + "8873614802459592665": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "443863053598769137": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "10302338806536775954": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "6513705142577622089": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "8545063312289220869": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "6587817876244206939": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "5159470523468873105": ["convolution_gpu_bfyx_os_iyx_osv16",175], + "4673618329986777239": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "17089801601582809764": ["convolution_gpu_bfyx_gemm_like",1], + "1908733355560815063": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5369464352361405510": ["convolution_gpu_bfyx_gemm_like",0], + "17646394278957547470": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "6412452556355382032": ["convolution_gpu_bfyx_1x1",2], + "8468092944055919238": ["convolution_gpu_bfyx_gemm_like",2], + "2355214244972870639": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "17734437318941312627": ["convolution_gpu_bfyx_os_iyx_osv16",694], + "7930154826818165796": ["convolution_gpu_bfyx_gemm_like",1], + "8054562515577756499": ["convolution_gpu_bfyx_os_iyx_osv16",479], + "14652719560551657529": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "7786866732196451977": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "7924408980408826942": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "2341006744107937832": ["convolution_gpu_bfyx_os_iyx_osv16",376], + "13337315872184544686": ["convolution_gpu_bfyx_os_iyx_osv16",929], + "4161612746310931789": ["convolution_gpu_bfyx_gemm_like",2], + "1615155632991337496": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1607916839270914773": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "8075180350084516696": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9806689250758752070": ["convolution_gpu_bfyx_gemm_like",2], + "4099859307693687554": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "69884424286147709": ["convolution_gpu_bfyx_gemm_like",2], + "3623866842874047894": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "11191005013126286532": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "939718260623752240": ["convolution_gpu_bfyx_gemm_like",1], + "15331103261044247142": ["convolution_gpu_bfyx_os_iyx_osv16",940], + "13108356579957761944": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "11185041745377164894": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "14959281374959998609": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11834683513280095384": ["convolution_gpu_winograd_6x3_s1_fused",2], + "8751967016877067287": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "9266375177690276615": 
["convolution_gpu_bfyx_gemm_like",2], + "5149553691611520515": ["convolution_gpu_bfyx_gemm_like",2], + "75742659105146536": ["convolution_gpu_bfyx_gemm_like",1], + "11845013061234102293": ["convolution_gpu_bfyx_gemm_like",2], + "15786313441300512560": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "15857087373591747006": ["convolution_gpu_bfyx_gemm_like",2], + "2255387202504703562": ["convolution_gpu_bfyx_gemm_like",1], + "11640468046947233335": ["convolution_gpu_bfyx_gemm_like",2], + "12278364834477923930": ["convolution_gpu_bfyx_gemm_like",2], + "16078334558348380858": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "15778834188130183853": ["convolution_gpu_bfyx_os_iyx_osv16",548], + "13728914881583145008": ["convolution_gpu_bfyx_gemm_like",1], + "5084402281339667158": ["convolution_gpu_bfyx_gemm_like",0], + "2881475011209167644": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "3813463368918975003": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13163146272900339330": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "6443517114667332732": ["convolution_gpu_bfyx_os_iyx_osv16",548], + "5097818987523855112": ["convolution_gpu_bfyx_gemm_like",2], + "15677832333607749130": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "12547252593506448096": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "14086074948200412805": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "13388004363210658650": ["convolution_gpu_bfyx_gemm_like",2], + "9968496035529786888": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8648848365873958010": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "13898821685774165645": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "15984885011101717258": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "14366861063858001106": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7844764086278702374": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4239277257640567966": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "17195491464960153261": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "10432687907685994204": ["convolution_gpu_bfyx_gemm_like",1], + "7864880361674128748": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "1444256562477852389": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "557926911473978758": ["convolution_gpu_bfyx_gemm_like",1], + "4967444801764057340": ["convolution_gpu_bfyx_os_iyx_osv16",985], + "11757919563609176713": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "9761573038170759563": ["convolution_gpu_bfyx_os_iyx_osv16",299], + "17140704838989242732": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "16744011463988595802": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13429534778879474114": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "16706121580364790904": ["convolution_gpu_bfyx_gemm_like",2], + "8489998884193999354": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "1404523328737649536": ["convolution_gpu_bfyx_gemm_like",1], + "9692949270906064580": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15824189967727245909": ["convolution_gpu_bfyx_gemm_like",1], + "4131038864155440038": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1553825475921110392": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "4927139127938739019": ["convolution_gpu_bfyx_gemm_like",2], + "913496537924971856": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "11462394098346770463": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "11981887712163064333": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "8132803057215688544": ["convolution_gpu_bfyx_gemm_like",2], + "9810703513111623136": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "15598527290222497283": 
["convolution_gpu_bfyx_os_iyx_osv16",2], + "3409255127071376537": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "6877976003072165363": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15975964562807570772": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "3433877094202077256": ["convolution_gpu_bfyx_direct_10_12_16",1], + "388828310152538138": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "14797994820826922836": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "8806330242319534440": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "14353390922580547467": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "9447458159095730492": ["convolution_gpu_bfyx_gemm_like",2], + "4920194716156732643": ["convolution_gpu_bfyx_gemm_like",2], + "286393043958202995": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "14249346934748369643": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "13654408396081513312": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "10168317560306247723": ["convolution_gpu_bfyx_gemm_like",2], + "2479282650381163888": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2715131647421221125": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "2808205041095636198": ["convolution_gpu_bfyx_gemm_like",2], + "3697631094971930011": ["convolution_gpu_bfyx_gemm_like",2], + "1760779615705074283": ["convolution_gpu_bfyx_os_iyx_osv16",552], + "8707189142909022305": ["convolution_gpu_bfyx_os_iyx_osv16",986], + "2084855707532555969": ["convolution_gpu_yxfb_yxio_b16",2], + "12248119734016401633": ["fully_connected_gpu_fb_io_ref",2], + "4801117903303888658": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "2014911634432127630": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3192518239721798250": ["convolution_gpu_bfyx_gemm_like",2], + "13064477237937322246": ["convolution_gpu_bfyx_gemm_like",1], + "12040626513219974957": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "7289907211627391947": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "2917248122493101477": ["fully_connected_gpu_fb_io_block_fp16",1], + "3509811595028801757": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "16496066467505445971": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6728889146307098720": ["convolution_gpu_bfyx_gemm_like",2], + "3939805316470672966": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "12675858428585873471": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5983610157873969708": ["convolution_gpu_bfyx_gemm_like",2], + "13226478376552374040": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "3392693938352572136": ["convolution_gpu_bfyx_gemm_like",2], + "4317173590203436940": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "7076937538747704750": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "3568749741838926204": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "5049534591553232781": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5754301693527535975": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "13503688893307029975": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10400727836871462348": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "12965800692507042874": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "6040623414692799116": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10987953316324712538": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "1394872024856809266": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "11798081355131440794": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "17723621158215826108": ["convolution_gpu_bfyx_gemm_like",2], + "9454954846682513038": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3201851883430682391": 
["convolution_gpu_bfyx_os_iyx_osv16",254], + "3572202652824023801": ["convolution_gpu_bfyx_gemm_like",2], + "15921072201288695017": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "15045861858500584001": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "16091165907421819456": ["convolution_gpu_bfyx_gemm_like",2], + "17050143605017295447": ["convolution_gpu_bfyx_gemm_like",2], + "1509728225855233852": ["convolution_gpu_bfyx_gemm_like",2], + "16462029188795652848": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "10286228358844791913": ["convolution_gpu_bfyx_os_iyx_osv16",456], + "6402941068107243403": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "18159049252673770569": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "10972033292930619311": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "14906458674793172507": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "14634044133573461949": ["convolution_gpu_bfyx_gemm_like",0], + "2171768477223405739": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "3325727286860556323": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "17774424004510360936": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "7105622384646913935": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "1106762955109168526": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "8050798452111667069": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "3727142736386026852": ["convolution_gpu_bfyx_os_iyx_osv16",679], + "6020570210392850503": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "1149548328523286475": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "6225447513745282621": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4960466075321426984": ["convolution_gpu_bfyx_gemm_like",2], + "941626985322260281": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "2659712601063515059": ["convolution_gpu_winograd_6x3_s1_fused",2], + "2832331506191733785": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "7808544677773370430": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17622515300258231642": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "17838473675663772639": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "8300655194765375060": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "5488168361113140102": ["convolution_gpu_bfyx_gemm_like",2], + "4665029580355133140": ["convolution_gpu_bfyx_os_iyx_osv16",197], + "9213886570531053949": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "15395497315929884637": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "15374625876485618845": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11897113890115321056": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "18286006396667126860": ["convolution_gpu_bfyx_gemm_like",1], + "14535007186125575064": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "17881905640473324965": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15490478608105402679": ["convolution_gpu_bfyx_gemm_like",2], + "642256034968512602": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "649203303142950236": ["convolution_gpu_bfyx_os_iyx_osv16",3], + "10412902860958663054": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "169973842603492802": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "2937907409658060025": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "15661055655577513377": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7177701509002270324": ["convolution_gpu_bfyx_gemm_like",0], + "11207578758583923357": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7671440804202996063": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "10445587307296180364": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2616828683870391718": 
["convolution_gpu_bfyx_gemm_like",2], + "14471867575610362464": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7606277451240586967": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6642767323474835034": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "9270950131920019932": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "11430797372848621790": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "14189775376370027482": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "9448537968809630184": ["convolution_gpu_bfyx_direct_10_12_16",2], + "814227839929688672": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2376239021851907962": ["convolution_gpu_bfyx_gemm_like",2], + "4021097865391343020": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13253775441326432265": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "13845827017732177448": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "523055954326631884": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15184258464890250739": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "16703049240941366828": ["convolution_gpu_bfyx_gemm_like",2], + "13410850301164057911": ["convolution_gpu_bfyx_gemm_like",1], + "7616752360105602320": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "11825209936640729550": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "8733109144496806085": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "16483792160297698151": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "5966963943739041502": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "10856527039674342926": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "5246955189449281709": ["convolution_gpu_bfyx_gemm_like",2], + "15392077168521832549": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "9126242742012768166": ["convolution_gpu_bfyx_gemm_like",2], + "17052596472114345717": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "1818433662409886324": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "3643250372952944907": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "14762599606783897222": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "14352796912241296357": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "336151670657372877": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "14100870590396726248": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "13066019581499650377": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "4172485608495372888": ["convolution_gpu_bfyx_gemm_like",1], + "14142812374094816721": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "9988801796928462423": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "11559360678008060513": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "12451602623042934613": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6246148818627951104": ["convolution_gpu_bfyx_os_iyx_osv16",318], + "15280273795883244074": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10134411551190003359": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "3311449696894745049": ["convolution_gpu_bfyx_os_iyx_osv16",399], + "6701235077433821331": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "11148502358361704423": ["convolution_gpu_bfyx_gemm_like",1], + "17390307025967314108": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "9527075413813342687": ["convolution_gpu_bfyx_gemm_like",2], + "1036010477232750453": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "17962578815194404362": ["convolution_gpu_bfyx_gemm_like",1], + "13436376034548670107": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "1176958491218281154": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "14705457019471647279": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "11810221946429451169": 
["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "17621284804179990612": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4437258459981739942": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "15911352758031362713": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "342174683264941351": ["convolution_gpu_bfyx_gemm_like",2], + "4764776977138392550": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "10683839359385393536": ["convolution_gpu_bfyx_gemm_like",1], + "15143544451530667222": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "498239903908845198": ["convolution_gpu_bfyx_gemm_like",2], + "2552187713769926425": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "15596913527233792996": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "12757564215386697460": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "3272776991539782834": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "14300946078988784221": ["convolution_gpu_bfyx_gemm_like",1], + "2802810524370514276": ["convolution_gpu_bfyx_gemm_like",0], + "393884269158067083": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "3725060015826635697": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "5595802790436774398": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13537323999534292650": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "15682441855379046778": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7946776740333736799": ["convolution_gpu_bfyx_gemm_like",1], + "8501760360687221821": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "6515141738021465336": ["convolution_gpu_bfyx_gemm_like",2], + "11446181888102710561": ["convolution_gpu_bfyx_os_iyx_osv16",949], + "15284262113150488297": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "6478054912653910426": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "12673168008792254171": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "6275903692904946376": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "17503210896556316294": ["convolution_gpu_bfyx_gemm_like",1], + "7557446085365037177": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "6114147683777615071": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "17924819398394001587": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "1187817806204244044": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "17054734441457769665": ["convolution_gpu_bfyx_gemm_like",2], + "1081962464388501987": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9861846661532177405": ["convolution_gpu_bfyx_gemm_like",2], + "12714194906146827658": ["convolution_gpu_bfyx_gemm_like",1], + "10010921697596131761": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "7154364270315480182": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14810839157236175179": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18386376129938707290": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7595481705069674721": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "18005721959893562716": ["convolution_gpu_bfyx_os_iyx_osv16",171], + "16218121706393504358": ["convolution_gpu_bfyx_os_iyx_osv16",310], + "18348301285923584995": ["convolution_gpu_bfyx_gemm_like",2], + "4011606166408526342": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "15688186132508213638": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "2822531372171708171": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "10612049417873776481": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2451603338483395600": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "11649407835105973949": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "15263499602817313477": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "9796621763733208035": 
["convolution_gpu_bfyx_os_iyx_osv16",1119], + "14872992823083730615": ["convolution_gpu_bfyx_gemm_like",2], + "1540041682425757361": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "17489420766684604600": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "5120466856097219243": ["convolution_gpu_bfyx_gemm_like",2], + "5367634698951188749": ["convolution_gpu_bfyx_gemm_like",2], + "17846007967411480006": ["convolution_gpu_bfyx_gemm_like",1], + "5389189982064081933": ["convolution_gpu_bfyx_os_iyx_osv16",943], + "11244704751123402754": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "4456004887590847716": ["convolution_gpu_bfyx_gemm_like",2], + "18125732229366977468": ["convolution_gpu_winograd_6x3_s1_fused",2], + "10205696100164492716": ["convolution_gpu_bfyx_gemm_like",2], + "17049054004246292085": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "15602863681196390535": ["convolution_gpu_bfyx_os_iyx_osv16",246], + "13398875754083902831": ["fully_connected_gpu_fb_oi_ref",1], + "17465517455679097501": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "16162899163122139501": ["fully_connected_gpu_fb_io_ref",1], + "12889351859522118935": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9373353053843326128": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "9805748332775912215": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "14905705901815863508": ["convolution_gpu_bfyx_direct_10_12_16",0], + "15893297349596399716": ["convolution_gpu_bfyx_gemm_like",2], + "2805931700404492624": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "11673506380927771816": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "17014952568021457244": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "8550133332738529361": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "17854138024884397413": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "714397516895317906": ["convolution_gpu_bfyx_os_iyx_osv16",363], + "14719421757340260468": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "1240102354814495870": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "6948696390129114563": ["convolution_gpu_bfyx_gemm_like",2], + "6895664772793074050": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "885661562948597780": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6534932244936310237": ["convolution_gpu_bfyx_gemm_like",2], + "11530101016435264783": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "15151957983054148973": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "8442368383427915597": ["convolution_gpu_bfyx_gemm_like",2], + "15881381297320383917": ["convolution_gpu_winograd_6x3_s1_fused",2], + "6324565723045697080": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "8035784732695264817": ["convolution_gpu_bfyx_os_iyx_osv16",882], + "1529658068204046700": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "12247991248100147706": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "4186957909762095019": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "3727796815945431654": ["convolution_gpu_bfyx_gemm_like",2], + "11051434650031832658": ["convolution_gpu_bfyx_gemm_like",1], + "7247891577022043949": ["convolution_gpu_bfyx_gemm_like",2], + "16303870101043861053": ["convolution_gpu_bfyx_gemm_like",2], + "8141428150264829362": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "1555841293175143289": ["convolution_gpu_bfyx_gemm_like",1], + "11047759270093007856": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "9335016444137172241": ["convolution_gpu_bfyx_gemm_like",2], + "6069028745615910182": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "8006738296385794413": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "9164584153555521506": 
["convolution_gpu_bfyx_os_iyx_osv16",1125], + "5085190482265319015": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "10928764471719815519": ["convolution_gpu_bfyx_os_iyx_osv16",271], + "15628121900226431719": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "1147744092130296563": ["convolution_gpu_bfyx_gemm_like",1], + "6876164425008541018": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8560635685184432720": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2047041720569246861": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "4149728557142033774": ["convolution_gpu_bfyx_os_iyx_osv16",946], + "13583166868754499339": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "4682428771166816734": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "13262749073059058405": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "16165264024659208580": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "5635504912415420460": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "16912035321030511639": ["convolution_gpu_bfyx_gemm_like",1], + "5409924335138540834": ["convolution_gpu_bfyx_gemm_like",2], + "6133592828563353516": ["convolution_gpu_bfyx_os_iyx_osv16",191], + "659150305191479097": ["convolution_gpu_bfyx_gemm_like",2], + "3811462129131022619": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "8291770994531919371": ["convolution_gpu_bfyx_os_iyx_osv16",52], + "15974241934088373021": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "6625355663340809894": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14270450799210365812": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "10377729875228238588": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "8682149821028981871": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "12906669887096343446": ["convolution_gpu_bfyx_gemm_like",1], + "14346703182362139650": ["convolution_gpu_bfyx_gemm_like",0], + "11254635684957519432": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "5536595882075097311": ["convolution_gpu_bfyx_gemm_like",2], + "11510063368067539341": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11901740241052104941": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "12794030011655906930": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "10157866834809927320": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "8913950860101596091": ["convolution_gpu_bfyx_os_iyx_osv16",362], + "5229688072405810569": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "7921388663815287395": ["convolution_gpu_bfyx_gemm_like",1], + "14667209474639064623": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6437820621340256996": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "15630712601053635938": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "17059095074211347838": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "16053383948025511837": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "9552615241912277692": ["convolution_gpu_bfyx_gemm_like",2], + "7179714714302073459": ["convolution_gpu_bfyx_os_iyx_osv16",458], + "12024416333474523686": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2727219457659794468": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9513218905938141296": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1071007164550012186": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "13842309033760176194": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12319165874575782715": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "16304963156448605623": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "153117141968471446": ["convolution_gpu_bfyx_gemm_like",1], + "572265264921910408": ["convolution_gpu_bfyx_gemm_like",2], + "10736892779278378335": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "9907053348268964966": ["convolution_gpu_bfyx_gemm_like",2], + "13116746433291181712": ["convolution_gpu_bfyx_direct_10_12_16",0], + "12793908914872030220": ["convolution_gpu_bfyx_gemm_like",1], + "6942622405269419082": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "4600698444492242585": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "7480968533463196410": ["convolution_gpu_bfyx_gemm_like",2], + "12843671306854567956": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "11726298758004767743": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "4010329161090285019": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "15406324750533549980": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "14164778301660100413": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14953809073272885651": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "12129572274423886770": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "10648806188852074159": ["convolution_gpu_bfyx_gemm_like",2], + "15551453802011405101": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "6762862978340755053": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15733883474006568340": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "12442273255786121651": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "5019077257951332016": ["convolution_gpu_bfyx_gemm_like",2], + "10978173291465325823": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "13212959214376905822": ["convolution_gpu_bfyx_gemm_like",2], + "16582132711225619740": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "1183774022668948480": ["convolution_gpu_bfyx_gemm_like",2], + "12694001580800313954": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15025260753866131193": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "3704618172730076978": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "5488296540132936296": ["convolution_gpu_bfyx_gemm_like",2], + "9738285774864435144": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "4788094685976850847": ["convolution_gpu_bfyx_gemm_like",1], + "5181206680937070543": ["convolution_gpu_bfyx_1x1",2], + "14216698267977999547": ["convolution_gpu_bfyx_os_iyx_osv16",310], + "5695368162557483073": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "16688894228380134416": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "1882912836250239503": ["convolution_gpu_bfyx_gemm_like",1], + "10373791029573299582": ["convolution_gpu_bfyx_gemm_like",0], + "5872553335123308034": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5381578460674280089": ["convolution_gpu_bfyx_direct_10_12_16",2], + "213518984547400496": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "2387389473399444503": ["convolution_gpu_bfyx_gemm_like",2], + "14296771090926462138": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "15199659885055090985": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3701795558556637835": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13975759856997443246": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "8898449752724034655": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "15278336216464964580": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10701231567226563098": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "14691372262153587653": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "1735849969339696694": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "17006655627343469372": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2268291720177538378": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "16857192626139882429": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15160738482264643601": 
["convolution_gpu_bfyx_os_iyx_osv16",1075], + "4544147798324802817": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "13352151930345854198": ["convolution_gpu_bfyx_os_iyx_osv16",928], + "15444345793124210505": ["convolution_gpu_bfyx_gemm_like",1], + "1213958002895787672": ["convolution_gpu_bfyx_gemm_like",2], + "6631816968511312100": ["convolution_gpu_bfyx_os_iyx_osv16",510], + "15095146351334328804": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "5407778324198159962": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "14808831640065476291": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "12341291953192305346": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "13224814158106791463": ["convolution_gpu_bfyx_gemm_like",2], + "2096167792705935744": ["convolution_gpu_bfyx_gemm_like",0], + "18203935818408469865": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "16614092873294424156": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3355259926747524578": ["convolution_gpu_bfyx_gemm_like",2], + "3296059171653513862": ["convolution_gpu_bfyx_gemm_like",1], + "4574242607119408140": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "16067605128297748820": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9760847838439331960": ["convolution_gpu_bfyx_os_iyx_osv16",1098], + "17231014023477377001": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "12305383126483033452": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13489318651148001664": ["convolution_gpu_bfyx_gemm_like",2], + "15088940149962496972": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "8158000313391713522": ["convolution_gpu_bfyx_gemm_like",2], + "8471867907212890827": ["convolution_gpu_bfyx_gemm_like",0], + "10134863884423338495": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "8732106543033226791": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "9632178829095307219": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "11922163303962372849": ["convolution_gpu_bfyx_gemm_like",1], + "447152944190888653": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12445292008737311977": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5338109154207406041": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "2518919454830671073": ["convolution_gpu_bfyx_gemm_like",1], + "10158890414412187141": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "14445981111412755844": ["convolution_gpu_bfyx_gemm_like",2], + "11007100272494557520": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "1663732107639157701": ["convolution_gpu_bfyx_gemm_like",1], + "15522785615618973614": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "2100891581797371600": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "17680403286850504499": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "871656942964602772": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "412314676462573090": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1944067639361309743": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "12709406234969954619": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "7715520469947900684": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1646362346584649954": ["fully_connected_gpu_fb_io_b8_f8_vload",2], + "12654574135415748217": ["convolution_gpu_bfyx_os_iyx_osv16",569], + "4691552892932405676": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "10267260789603562117": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "2168955429090043259": ["convolution_gpu_bfyx_os_iyx_osv16",371], + "18269382610859905921": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3498490999014554104": ["convolution_gpu_bfyx_os_iyx_osv16",509], + "3062101811226530720": 
["convolution_gpu_bfyx_os_iyx_osv16",677], + "6369089883691693453": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "10795104632256101599": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "5601435819039968726": ["convolution_gpu_winograd_6x3_s1_fused",2], + "15636407980943172317": ["convolution_gpu_bfyx_gemm_like",1], + "15149336254307320187": ["convolution_gpu_bfyx_gemm_like",1], + "2287331417346465035": ["convolution_gpu_bfyx_gemm_like",2], + "5191016422297403500": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "2407509127927738079": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "13874754478479442212": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "16822728519529055454": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "8897786294680986991": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "7801270668419570665": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "3432296808755992670": ["convolution_gpu_bfyx_gemm_like",2], + "2128062528433088944": ["convolution_gpu_bfyx_gemm_like",1], + "17845905249343189063": ["convolution_gpu_bfyx_gemm_like",1], + "17877776363798202236": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16067821671414842756": ["convolution_gpu_bfyx_gemm_like",2], + "5824801192141531089": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "2248754661513284642": ["convolution_gpu_bfyx_gemm_like",2], + "16336482874764861478": ["convolution_gpu_bfyx_gemm_like",2], + "13985989113434682460": ["convolution_gpu_bfyx_gemm_like",1], + "17354626928258309128": ["convolution_gpu_bfyx_direct_10_12_16",1], + "18431307741997030842": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "6580334406272192111": ["fully_connected_gpu_fb_io_ref",1], + "1473214668483422172": ["convolution_gpu_bfyx_gemm_like",2], + "8562093724840063781": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "9548658329589481069": ["convolution_gpu_bfyx_gemm_like",1], + "12570087709404311189": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "6204183474669103812": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "10702234389482091891": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "7256947320128669983": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "1414092714405352435": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13007534905441600782": ["convolution_gpu_bfyx_gemm_like",2], + "4273605292522062969": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "16578265652036967656": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4157063588837576075": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "7291920886894073603": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "13478922504367374201": ["convolution_gpu_bfyx_os_iyx_osv16",686], + "5353552956675518468": ["convolution_gpu_bfyx_os_iyx_osv16",84], + "10265955847846166394": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "6621371075123542816": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "17118569850095586049": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10643373404881648498": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13683563727561197895": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "13800387305792597325": ["convolution_gpu_bfyx_os_iyx_osv16",1035], + "15451919862187018297": ["convolution_gpu_winograd_6x3_s1_fused",2], + "17732250360268013336": ["convolution_gpu_bfyx_gemm_like",2], + "6278892144796112655": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "18233660940545931789": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "12874626654611400042": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "12879205642236526041": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15952399564161253450": ["convolution_gpu_bfyx_os_iyx_osv16",375], + 
"17870874477143985774": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "14077148976508649021": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3128856679264648666": ["convolution_gpu_bfyx_gemm_like",2], + "9069245927173134634": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16494403731659808258": ["convolution_gpu_bfyx_os_iyx_osv16",844], + "8324250071425605671": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "1015184966858657992": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "12541764833974378504": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10499265278415026816": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1587220602242157814": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7471714472577512044": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "3664842151999943": ["convolution_gpu_bfyx_gemm_like",0], + "15594091060902767607": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3239779684432082106": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "7612252849133077309": ["fully_connected_gpu_fb_oi_ref",1], + "15641049130597645936": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "11851526665791263153": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "7152107839144357830": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "10472893418729915556": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "11579025491409526679": ["convolution_gpu_bfyx_os_iyx_osv16",892], + "16431165572426232677": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "142162982878269165": ["convolution_gpu_bfyx_gemm_like",1], + "13471241383850968329": ["convolution_gpu_bfyx_os_iyx_osv16",970], + "3800864312883193560": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "5649838591590266046": ["convolution_gpu_bfyx_gemm_like",2], + "12962558681443556219": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "13839590781642269381": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "3179296883398083696": ["convolution_gpu_bfyx_gemm_like",2], + "13492216433886201174": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "16184979150665364486": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5758223108250439377": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "8127570953237266335": ["fully_connected_gpu_bf_io_input_spatial",5], + "15379595951542162189": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "482564204402769504": ["convolution_gpu_bfyx_gemm_like",2], + "15588841557002049726": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "12329302439548900551": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "5779388310240896974": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "5523778675167321193": ["fully_connected_gpu_fb_oi_ref",0], + "4054010905884346287": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "18417880214901227799": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "3475222563515381706": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "5605603969528988532": ["convolution_gpu_bfyx_1x1",2], + "8435953773852854494": ["convolution_gpu_bfyx_os_iyx_osv16",398], + "10147140488258047779": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18229087521018116863": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "8474585711383508493": ["convolution_gpu_bfyx_os_iyx_osv16",641], + "26434141991791193": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "5749536453225343663": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "3120553928584920777": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "18043745678739016406": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "2317476796706098254": ["convolution_gpu_bfyx_gemm_like",2], + "13709111882513486557": ["convolution_gpu_bfyx_os_iyx_osv16",247], + 
"5135353986081664933": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "10089588313551601914": ["convolution_gpu_bfyx_gemm_like",2], + "17855733925989425515": ["convolution_gpu_bfyx_gemm_like",2], + "2935787827649981367": ["convolution_gpu_bfyx_gemm_like",1], + "6302958994152837045": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "17494823614269622175": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "3985659568982275663": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "3255465741612432300": ["convolution_gpu_bfyx_os_iyx_osv16",58], + "17784357412228522825": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "654821507679356726": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "12390011660072693092": ["convolution_gpu_bfyx_gemm_like",2], + "190530884420224257": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "9639014900668946045": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1464276409229103946": ["convolution_gpu_bfyx_gemm_like",2], + "3664532426561688336": ["convolution_gpu_bfyx_gemm_like",2], + "15529767675448574617": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12318427976031000768": ["convolution_gpu_bfyx_gemm_like",2], + "15438623619938843299": ["convolution_gpu_bfyx_gemm_like",2], + "2549584578485278083": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6672808203620992802": ["convolution_gpu_bfyx_os_iyx_osv16",518], + "2464201299319518869": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18398231411109020099": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "14970517289345999487": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "9180575279116075400": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "14975859027256879948": ["convolution_gpu_bfyx_gemm_like",1], + "10207459870439759692": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "5424159498790442193": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12024318713420323349": ["convolution_gpu_bfyx_gemm_like",2], + "16684378382033936005": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "12379734005351960619": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "707449835235490641": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16361249849376112433": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2088422904562849807": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "4890932609897686394": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "17556238490521153146": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "5132761922124425835": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "1565612286723277822": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "12112853999307505628": ["convolution_gpu_bfyx_gemm_like",2], + "7004953121070642766": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "14460931972510023382": ["convolution_gpu_bfyx_gemm_like",2], + "16536775289334717044": ["convolution_gpu_bfyx_os_iyx_osv16",801], + "14897935118679731283": ["convolution_gpu_bfyx_gemm_like",1], + "7630776235327261710": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "18026754720065676632": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "17934338042329576850": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "12828115278384825394": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "14350963106032411355": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "10772763339005937717": ["convolution_gpu_bfyx_gemm_like",1], + "6505706083205285176": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "15294692035670155801": ["convolution_gpu_bfyx_os_iyx_osv16",948], + "9751582946441607796": ["convolution_gpu_bfyx_os_iyx_osv16",739], + "16264774056719724826": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10395191003166536655": 
["convolution_gpu_bfyx_os_iyx_osv16",261], + "10202794960937110471": ["convolution_gpu_bfyx_gemm_like",2], + "8788703258318141635": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "5120274680151325194": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "3603706453982734995": ["convolution_gpu_bfyx_os_iyx_osv16",553], + "4342360467977736802": ["convolution_gpu_bfyx_gemm_like",2], + "13297691763391637265": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "15335516948540868535": ["convolution_gpu_bfyx_gemm_like",2], + "7431469348791099474": ["convolution_gpu_bfyx_gemm_like",2], + "4086556132337751931": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "5141753233513623264": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "9798585825695496550": ["convolution_gpu_bfyx_gemm_like",2], + "12262273765279224456": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "3879520363526481335": ["convolution_gpu_bfyx_gemm_like",2], + "11398019086259011063": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "13340998273773542342": ["convolution_gpu_bfyx_gemm_like",1], + "1312322903335525510": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8292979162428130363": ["convolution_gpu_bfyx_gemm_like",1], + "6819846227498139601": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "4165515078945360525": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "13926122593957480821": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16463823433924519300": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "12823842409678756966": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "13616241450266119966": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "17585210048585855482": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "15924146956535930192": ["convolution_gpu_bfyx_os_iyx_osv16",1017], + "6233612563637601101": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "14376192291828307385": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "4477135619420651110": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15513894336778253285": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9056812077282494074": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "12136458184046915563": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18166732758694978380": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9053383117071470496": ["convolution_gpu_bfyx_direct_10_12_16",1], + "609926704263171728": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "18117954008112578376": ["convolution_gpu_bfyx_gemm_like",2], + "7088331918128954410": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "13356152596085257346": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "311101627084421734": ["convolution_gpu_bfyx_gemm_like",2], + "1462775202780029067": ["convolution_gpu_bfyx_gemm_like",2], + "15830721134654889992": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "9631481972809246378": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "2743892624333411461": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "13833252058258614175": ["convolution_gpu_bfyx_gemm_like",2], + "2007192658799516915": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "1894591633696862066": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "13869716373706247686": ["convolution_gpu_bfyx_gemm_like",2], + "11354523117287453982": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1299160913578942012": ["convolution_gpu_bfyx_os_iyx_osv16",1048], + "14277843123789500234": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "13680502636898130714": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "16837749846151508824": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "6075691042233712335": ["convolution_gpu_bfyx_gemm_like",2], + 
"14008438372661779490": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "12553441041059632729": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "870448505006560377": ["convolution_gpu_bfyx_gemm_like",0], + "5087812112020408781": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "6656668362090313451": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "1941341635794709702": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "18216392915308276053": ["convolution_gpu_bfyx_direct_10_12_16",2], + "59384288121901543": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "9996590003462421281": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "6364765994481977132": ["convolution_gpu_bfyx_gemm_like",2], + "10110359677546019738": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11830297960718214360": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "15579919505002150556": ["convolution_gpu_bfyx_gemm_like",2], + "17966517080605659454": ["convolution_gpu_bfyx_os_iyx_osv16",53], + "158222105675022402": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "3234567405788241673": ["convolution_gpu_bfyx_os_iyx_osv16",315], + "6889498170947481097": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "9427999492792081454": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "16122815225820081176": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "17396226612787250663": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "12478041902013146137": ["convolution_gpu_bfyx_os_iyx_osv16",89], + "17361849627958781572": ["convolution_gpu_bfyx_gemm_like",1], + "13915749401892931804": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "26773921190137993": ["convolution_gpu_bfyx_gemm_like",1], + "11386443944172875185": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6233455595448276342": ["convolution_gpu_bfyx_direct_10_12_16",0], + "11490143853656040028": ["convolution_gpu_bfyx_gemm_like",2], + "5691889055008878111": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "17101789600628162503": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4465781406991476376": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "15928746165235747659": ["convolution_gpu_bfyx_gemm_like",1], + "11868551452004726281": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "16683909937519981313": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "3565702695809105495": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "17499047811775012205": ["convolution_gpu_bfyx_gemm_like",1], + "15265621959560796543": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "10112032316939871435": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "3286629188347536485": ["fully_connected_gpu_bf_io_input_spatial",0], + "13855910108498240870": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "17947818179123182001": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "6505035828719376225": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "4700147248198305671": ["convolution_gpu_bfyx_gemm_like",2], + "4879523846205649729": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "3661361503342294227": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "4160065196876225262": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "16159055229009077435": ["convolution_gpu_bfyx_gemm_like",2], + "14130300861965892020": ["convolution_gpu_bfyx_gemm_like",2], + "2740287492529009109": ["convolution_gpu_bfyx_os_iyx_osv16",363], + "4737347018334654530": ["convolution_gpu_bfyx_1x1",2], + "8332688858465419317": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16474284418841532356": ["convolution_gpu_bfyx_gemm_like",2], + "17750329428766282997": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "4316519748653705692": 
["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4892959859293355837": ["convolution_gpu_bfyx_gemm_like",1], + "3167336012388169649": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "2184670359551186734": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "8571662320744858201": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "13453226687921450129": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8906185843274300447": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4181049793451733466": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "4958222070605478947": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "8403560033589747065": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "6831045740006076251": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "15336590103518398224": ["convolution_gpu_bfyx_gemm_like",1], + "6041249121715337066": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "16583563382485459718": ["convolution_gpu_bfyx_gemm_like",2], + "4725009116734166168": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "4885944395876887711": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "10890538764006500546": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12394049027081208902": ["convolution_gpu_bfyx_gemm_like",1], + "8374345306483326015": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "11775667915453535428": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11078289776590382448": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "4316278502963439894": ["convolution_gpu_bfyx_gemm_like",2], + "3214253333840552610": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "10295330953350618042": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "16504962609450876148": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8670512344429807851": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "13636129806349817264": ["convolution_gpu_bfyx_gemm_like",1], + "15315327794058441258": ["convolution_gpu_bfyx_gemm_like",2], + "7282751412088726760": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "2384942244346844027": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "2683507674615735878": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "6410694203929640959": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "8155752116518841384": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "8054599744123820194": ["convolution_gpu_bfyx_gemm_like",1], + "3349519148124496343": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "8395521198680584245": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "11404331488962230130": ["convolution_gpu_bfyx_gemm_like",2], + "2415478259408761142": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "11173744709088359283": ["fully_connected_gpu_fb_oi_ref",1], + "16419903786705052849": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "5970516037710024187": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "16022858814676339910": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "5629373398445592781": ["convolution_gpu_bfyx_gemm_like",2], + "10791067159964399241": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "10679760989906275129": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "4212194737559719449": ["convolution_gpu_bfyx_gemm_like",2], + "9557728221162137067": ["convolution_gpu_bfyx_os_iyx_osv16",1017], + "1403617451623027879": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "15012744672096562609": ["convolution_gpu_bfyx_gemm_like",0], + "7117825897866941983": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "10884966210360699082": ["convolution_gpu_bfyx_gemm_like",2], + "11642941943446484202": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "12219239604684537521": ["convolution_gpu_bfyx_gemm_like",1], + 
"4036143655651874318": ["convolution_gpu_bfyx_gemm_like",1], + "12341247287556387988": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "8262441556572334783": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "16159309494101203811": ["convolution_gpu_bfyx_gemm_like",1], + "17325362379118492558": ["convolution_gpu_bfyx_gemm_like",1], + "13745327504866194229": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "6508892940062336667": ["convolution_gpu_bfyx_gemm_like",2], + "11079710960007068860": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6578908625437515675": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "16245852986663960440": ["convolution_gpu_bfyx_os_iyx_osv16",484], + "5893940382830835820": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "5982637097503543357": ["convolution_gpu_bfyx_gemm_like",2], + "218070270815606832": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "4455497237293642238": ["convolution_gpu_bfyx_gemm_like",0], + "237302155033013557": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18232459663207612727": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4062706195708729345": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "13234170505677988638": ["convolution_gpu_bfyx_os_iyx_osv16",1115], + "4238163995861108694": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "14685573786743639408": ["convolution_gpu_bfyx_gemm_like",2], + "17854578307286932628": ["convolution_gpu_bfyx_gemm_like",2], + "16632447105476661928": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "13727585908419292912": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9762182215179534181": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "1127844465496534455": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "452869991150713968": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "981733129438741439": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "9457038545823436137": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "5211191663202250117": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "12345000525470836335": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "1346716334208025932": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "11465965972527519631": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12883021432082543848": ["convolution_gpu_bfyx_gemm_like",1], + "7883469783245625654": ["convolution_gpu_bfyx_gemm_like",2], + "7431849514656037251": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "14959566236432790882": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "5089359404080552270": ["convolution_gpu_bfyx_os_iyx_osv16",655], + "11782525502250249483": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "17119834538806653818": ["convolution_gpu_bfyx_gemm_like",1], + "18180820925685532104": ["convolution_gpu_bfyx_os_iyx_osv16",1100], + "53692441535283176": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "5566145479615299930": ["convolution_gpu_bfyx_direct_10_12_16",1], + "56327004269432885": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "4265991006340418914": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "7338578624767544128": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "3094541981461578435": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "9381304526221508530": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "12956535344568057480": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "7941729567451949422": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11431776034512615562": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1051506168926530904": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "3956185868703826254": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "1127598752149871162": 
["convolution_gpu_bfyx_os_iyx_osv16",437], + "17829148383265978140": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "9040046051053703359": ["convolution_gpu_bfyx_gemm_like",2], + "12169148580322697755": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "5461649843950745696": ["convolution_gpu_bfyx_gemm_like",2], + "7405835196787288054": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "2801984749519758568": ["convolution_gpu_bfyx_gemm_like",2], + "4013707396889204359": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "9980945809859857871": ["convolution_gpu_bfyx_gemm_like",1], + "14749947225382670869": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "13435416060730279243": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "15838058479520696173": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "9999963747832102729": ["convolution_gpu_bfyx_1x1",2], + "9803306661531470015": ["fully_connected_gpu_fb_oi_ref",0], + "6129884455218252024": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "269334626439013799": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "11002656253983635383": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "5245087746877459629": ["convolution_gpu_bfyx_os_iyx_osv16",679], + "14762859593402798050": ["convolution_gpu_bfyx_gemm_like",1], + "14707855908416908375": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "16747069131271457481": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "3438116423688595487": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "3790881125495367946": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "1774158624592967937": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "5073623316666025204": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "14152716242882609401": ["convolution_gpu_bfyx_gemm_like",2], + "2632535010129224704": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "5673972310424776040": ["convolution_gpu_bfyx_gemm_like",2], + "8007491455800395118": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "541744773413565297": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "11086699387784339943": ["convolution_gpu_bfyx_gemm_like",2], + "3830703844770425343": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "14763015336626099830": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "3928266232090746643": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "12218337369633748663": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "14492935486352505845": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "17188170051014066220": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "18034648276860485300": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "12482312825666761192": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "3179874645565098825": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14807299286266923693": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "18414480146618201609": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7455983063685796863": ["convolution_gpu_bfyx_os_iyx_osv16",925], + "9378419102254633989": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "2028119808899845451": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "16359282790151128772": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "8451901619003558199": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "12771805545455650546": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "4565037760028957581": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "16112835627818488034": ["convolution_gpu_bfyx_gemm_like",2], + "8885012252853227025": ["convolution_gpu_bfyx_gemm_like",1], + "17479614483340719566": ["convolution_gpu_bfyx_gemm_like",2], + "9486447779233331380": ["convolution_gpu_bfyx_os_iyx_osv16",998], + 
"591445875836641836": ["convolution_gpu_bfyx_gemm_like",1], + "6263019986730305851": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "6489645404977288242": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "15693851280141842140": ["convolution_gpu_bfyx_gemm_like",2], + "390943380079040179": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "16692569816843207989": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "14903430454784452446": ["convolution_gpu_bfyx_gemm_like",2], + "3885931890288969926": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14004618842373739106": ["convolution_gpu_bfyx_gemm_like",2], + "18377298651236993830": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "13017541921351620667": ["convolution_gpu_bfyx_gemm_like",1], + "14994322266840011040": ["convolution_gpu_bfyx_gemm_like",2], + "4682062886371423209": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "6982733543386888622": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "17281202179589913619": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "12057000101434512661": ["convolution_gpu_bfyx_gemm_like",2], + "14184440545916228597": ["convolution_gpu_bfyx_gemm_like",1], + "16745988677098035122": ["convolution_gpu_bfyx_gemm_like",2], + "1276881030620698911": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "11756881293845417212": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "14998779987429927952": ["convolution_gpu_bfyx_gemm_like",2], + "18113235498360281695": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "17400844732252600825": ["convolution_gpu_bfyx_gemm_like",1], + "2452226948562393335": ["convolution_gpu_bfyx_os_iyx_osv16",54], + "17338623890209792485": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11000064679911527524": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "18259018980049662870": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "3685556976073096544": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "16482763280295827563": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "3800011935243649447": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "6057433908801727873": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13785621878621289403": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "17223169013008075474": ["convolution_gpu_bfyx_gemm_like",2], + "15285236716284874711": ["convolution_gpu_bfyx_gemm_like",1], + "16181974394948732584": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8990561333549136048": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "8701639906504450534": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1403373982815401451": ["convolution_gpu_bfyx_gemm_like",1], + "11927673108508931485": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "14467326533329852095": ["convolution_gpu_bfyx_gemm_like",1], + "12191056298847752438": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "7313000297447719088": ["convolution_gpu_bfyx_gemm_like",2], + "15329647206594763271": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "14733510474010040334": ["convolution_gpu_bfyx_gemm_like",2], + "9150686862263626364": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "14713376061469695024": ["convolution_gpu_bfyx_gemm_like",2], + "13779700363254765602": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "7606097739225472283": ["convolution_gpu_bfyx_gemm_like",2], + "9307683865422702618": ["convolution_gpu_bfyx_gemm_like",2], + "1109243878358317937": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "3218248162832023196": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "7020655100877544328": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4191326605459754690": ["convolution_gpu_bfyx_os_iyx_osv16",887], + 
"12392988351482826871": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "962311766200741205": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8512711227383782401": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "7033442247935655919": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15936513690378208182": ["convolution_gpu_bfyx_gemm_like",0], + "12610854610554906160": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "8140242320379485952": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "5381354625969068789": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15891746043846062984": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "997155336931700015": ["convolution_gpu_bfyx_gemm_like",2], + "6491244517639245276": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "5065071428884648135": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "4770478662275293849": ["convolution_gpu_bfyx_gemm_like",2], + "2832311883163804015": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "712420402191459810": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "3855859061709004677": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "16687701987371294908": ["convolution_gpu_bfyx_gemm_like",2], + "18418073826375395057": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13484950419220835364": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "15156525717629023944": ["convolution_gpu_bfyx_gemm_like",2], + "6571438978296387721": ["convolution_gpu_bfyx_gemm_like",2], + "3994033185122319003": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "5949713204609055571": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "16039372573821594566": ["convolution_gpu_bfyx_gemm_like",0], + "8779947213821605681": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "5524215233998361104": ["convolution_gpu_winograd_6x3_s1_fused",2], + "14425082589599804235": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16629493658542781988": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "4084106758501882407": ["fully_connected_gpu_bf_io_input_spatial",1], + "5594180958505308003": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "2903605246599054308": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "16589607587365212240": ["convolution_gpu_bfyx_gemm_like",1], + "10880081193716628051": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "5510336500642744696": ["convolution_gpu_bfyx_gemm_like",2], + "9860570706348640782": ["convolution_gpu_bfyx_gemm_like",2], + "13189392239349392492": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "2816353973187452604": ["convolution_gpu_bfyx_gemm_like",2], + "15862793522143880668": ["convolution_gpu_bfyx_os_iyx_osv16",123], + "2346992541638145615": ["convolution_gpu_bfyx_gemm_like",2], + "2580909693815921167": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "15452906059667613512": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10993061520709478334": ["convolution_gpu_bfyx_gemm_like",1], + "989564341557094953": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "1601512693620510391": ["convolution_gpu_bfyx_gemm_like",2], + "12478496773222604204": ["convolution_gpu_bfyx_os_iyx_osv16",510], + "13772598362521854438": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "9165275903833498932": ["convolution_gpu_bfyx_gemm_like",1], + "7678457226823073886": ["convolution_gpu_bfyx_os_iyx_osv16",191], + "3609233164979051271": ["convolution_gpu_bfyx_gemm_like",2], + "8971115542951085891": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "18066249200906113142": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "15767973630744679517": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4160656836528944651": 
["convolution_gpu_bfyx_os_iyx_osv16",743], + "937200116534179904": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "15720507574336564201": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "16448023768045157448": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "12274965963922410259": ["convolution_gpu_bfyx_gemm_like",1], + "2424832456352484524": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "5519535335798045279": ["convolution_gpu_bfyx_gemm_like",2], + "1594612401422787491": ["convolution_gpu_bfyx_gemm_like",2], + "2072252610120557179": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11619548409913646265": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10973647655853229395": ["convolution_gpu_bfyx_gemm_like",1], + "8507854696766492454": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "15688260390755491480": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2520734476651273971": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "17824431042110985323": ["convolution_gpu_bfyx_gemm_like",1], + "2858694223939965231": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "8357109553923988018": ["convolution_gpu_bfyx_gemm_like",2], + "14849708746319190277": ["convolution_gpu_bfyx_gemm_like",2], + "16969463538496570528": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "14647949921048404551": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "14839051765301295219": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "6223991300587768990": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7272538316511343863": ["convolution_gpu_bfyx_gemm_like",2], + "12996812489446605594": ["convolution_gpu_bfyx_gemm_like",2], + "5011190083565902614": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "16995919898822376726": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "937763627727362899": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "17658152048177750315": ["convolution_gpu_bfyx_os_iyx_osv16",737], + "4554398307153171456": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4220826666482500445": ["convolution_gpu_bfyx_gemm_like",2], + "9824678205469832038": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "981276017776678882": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "5648099611567577611": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "7881579844586294503": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "12421464739243825246": ["convolution_gpu_bfyx_gemm_like",2], + "10186866999254188246": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "17716151880660804743": ["convolution_gpu_bfyx_gemm_like",0], + "8886676435675463412": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13766538247146238357": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "5515216528474382598": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4279694886527244747": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "142345353315012903": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "9092949297095391463": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5550000568272972532": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "1617907811128880383": ["convolution_gpu_bfyx_gemm_like",2], + "14962768577232034246": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "12364947728685604753": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "815847426244665239": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7253709516917901897": ["convolution_gpu_bfyx_gemm_like",2], + "8481272193490654884": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "17829047941256922307": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "2128376438627103433": ["convolution_gpu_bfyx_gemm_like",2], + "11178675492112714513": ["convolution_gpu_bfyx_os_iyx_osv16",981], + 
"6494837659483504443": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "52089503050497755": ["convolution_gpu_bfyx_gemm_like",2], + "6830643729780599672": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "13538051178827008933": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9650737941239265593": ["convolution_gpu_bfyx_os_iyx_osv16",1065], + "8281212003098870446": ["convolution_gpu_bfyx_gemm_like",2], + "14104238386345631681": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16923874271029636508": ["convolution_gpu_bfyx_gemm_like",2], + "6205240287062600210": ["convolution_gpu_bfyx_gemm_like",2], + "9516102312850256675": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "2888315406857606108": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8728178019712933221": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "6410682026872155392": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "12932174902085755507": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10428477376571919905": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "18251360413872841969": ["convolution_gpu_bfyx_os_iyx_osv16",1062], + "9753436607600877081": ["convolution_gpu_bfyx_os_iyx_osv16",929], + "5124645583449732785": ["convolution_gpu_bfyx_gemm_like",2], + "7453661005436415653": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "13132804928635689780": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8104509697376352086": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "6355819766289051977": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12582624102297726596": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "3511588484597779204": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "5195515230960933214": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "11529521968552409482": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15360511165237335684": ["convolution_gpu_bfyx_direct_10_12_16",0], + "3799171258564824874": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11732742421854164761": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "17486925527036786359": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "17122338330334998991": ["convolution_gpu_bfyx_gemm_like",1], + "7107313154723472157": ["convolution_gpu_bfyx_gemm_like",2], + "5519835581976587401": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "11239754372812258455": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "12425310792514818973": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "10892053822730512072": ["convolution_gpu_bfyx_os_iyx_osv16",235], + "13448159575961515854": ["convolution_gpu_bfyx_gemm_like",0], + "6953478877896677022": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "2307310127637739872": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "15076307524263378967": ["convolution_gpu_bfyx_gemm_like",2], + "18369396029431709828": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "3281207855459771997": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "6695336381467406810": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "2827850900421982274": ["convolution_gpu_bfyx_gemm_like",1], + "8149815705026829258": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1154228007901031779": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "15783329079045263237": ["convolution_gpu_bfyx_gemm_like",1], + "16540183777173974162": ["convolution_gpu_bfyx_gemm_like",1], + "10908411570889102154": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14203061085285979556": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "10880656082867082647": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "2850279308978256234": ["convolution_gpu_bfyx_gemm_like",2], + "16611452077660879545": 
["convolution_gpu_bfyx_os_iyx_osv16",367], + "9343876424591024597": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "13711710595263882397": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "447683677378974131": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "6853844061175773603": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "3741411131962514208": ["convolution_gpu_bfyx_gemm_like",0], + "11191071895289217783": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11437885274663749440": ["convolution_gpu_bfyx_os_iyx_osv16",53], + "14112695611389738149": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "875142032423622622": ["convolution_gpu_bfyx_os_iyx_osv16",542], + "6296371382672640627": ["convolution_gpu_bfyx_gemm_like",1], + "1743672154424707483": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4056723579347929559": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "2102507337684140674": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1713947356482032411": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1226681724476075216": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5235375820995365354": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "1362239912535573615": ["convolution_gpu_bfyx_gemm_like",1], + "9146427497025645310": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "17342868362584820356": ["convolution_gpu_bfyx_gemm_like",1], + "38736266675995457": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "11696231285411686761": ["convolution_gpu_bfyx_gemm_like",1], + "5186963188234940985": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9001645663675631429": ["fully_connected_gpu_fb_oi_ref",2], + "9055254157155243850": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11421235118459218209": ["convolution_gpu_bfyx_gemm_like",1], + "17991368786018745231": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13583272198088247606": ["convolution_gpu_bfyx_gemm_like",2], + "17377293745073971167": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12951069548510783681": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "2267942216745157485": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "12797434473085560369": ["convolution_gpu_bfyx_gemm_like",2], + "6155686980102491192": ["convolution_gpu_bfyx_gemm_like",2], + "11246470701714560770": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "14559599508798500518": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "3617433210865054182": ["convolution_gpu_bfyx_direct_10_12_16",1], + "536646811796032046": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "6879801583428507100": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "6924316691569831424": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "11435397993598981900": ["convolution_gpu_bfyx_gemm_like",1], + "6322831233548420761": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "12182468247297592907": ["convolution_gpu_bfyx_gemm_like",2], + "10305912614137623024": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16244270858428653037": ["convolution_gpu_bfyx_gemm_like",2], + "2086001721804797157": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "13952295742818866246": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "9579316322704307175": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10983344268706058114": ["convolution_gpu_bfyx_gemm_like",2], + "7199295899520406795": ["convolution_gpu_bfyx_direct_10_12_16",2], + "54019631544204590": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "4131527916449986086": ["convolution_gpu_bfyx_gemm_like",1], + "12659539044474018256": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "9771430089730856496": ["convolution_gpu_bfyx_os_iyx_osv16",598], + 
"15834666915651997510": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "10045446802759419956": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "8124881451525075977": ["convolution_gpu_bfyx_gemm_like",2], + "2430404993947067949": ["convolution_gpu_bfyx_os_iyx_osv16",51], + "3668065353749623655": ["convolution_gpu_bfyx_os_iyx_osv16",723], + "9358401110755269308": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "7771969115805231266": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "5831305777612569716": ["convolution_gpu_bfyx_gemm_like",2], + "14174805457643822445": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "12176879951537921518": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "9255337426504113924": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "16608940349080184786": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14046217730873620907": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "6438522646185979880": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10708706979952421150": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14912119584313592912": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "4994591211723226974": ["convolution_gpu_bfyx_os_iyx_osv16",724], + "4127717437639868970": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "11761085899600261002": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8104609318998060422": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "2367791050032803116": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "4790960977352818689": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "15775917744517770768": ["convolution_gpu_bfyx_gemm_like",1], + "16131671779145781667": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "13485431068391184236": ["convolution_gpu_bfyx_direct_10_12_16",1], + "18260147016899103633": ["convolution_gpu_bfyx_gemm_like",2], + "7002547494442875680": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "17093159649157277089": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16569200335969311660": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "15160322051545035612": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "5514520264534847093": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "3643056883397245235": ["convolution_gpu_bfyx_gemm_like",2], + "6423120553520000795": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "13385026134633096129": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "12159582810513550491": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11376522803174788945": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "9694891301950867606": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "3102693432769248723": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "15434536162164591656": ["convolution_gpu_bfyx_gemm_like",1], + "3593665238922509290": ["convolution_gpu_bfyx_gemm_like",1], + "6235132681081375078": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "3910733479592621526": ["convolution_gpu_bfyx_gemm_like",2], + "6603489144277795818": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "16467987800266816984": ["convolution_gpu_bfyx_os_iyx_osv16",216], + "5763440554939527411": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "9561367273233389233": ["convolution_gpu_bfyx_gemm_like",1], + "7890098956860637458": ["convolution_gpu_bfyx_os_iyx_osv16",344], + "5094600092408024387": ["convolution_gpu_bfyx_gemm_like",2], + "2215194389847256545": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "11425187789506600967": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "16687215861591748162": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "18331981707436752260": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "6778781361481531516": 
["convolution_gpu_bfyx_os_iyx_osv16",697], + "11739629316219263056": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "13661880440426932218": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "11341771589317480665": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "16348402367953880206": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "3646069704724135633": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "3803179179802002296": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5458310740719324710": ["convolution_gpu_bfyx_os_iyx_osv16",7], + "2654793073145467058": ["convolution_gpu_bfyx_gemm_like",2], + "17236135174912837061": ["convolution_gpu_bfyx_gemm_like",2], + "11962382064404466630": ["convolution_gpu_bfyx_gemm_like",1], + "3170274732463232729": ["convolution_gpu_bfyx_gemm_like",2], + "1137647382605909133": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "2335428826699999827": ["convolution_gpu_bfyx_os_iyx_osv16",884], + "18337160891834020517": ["convolution_gpu_bfyx_gemm_like",2], + "17908636589626460288": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5948701218437980356": ["convolution_gpu_bfyx_gemm_like",2], + "9424928280483728754": ["convolution_gpu_bfyx_gemm_like",2], + "143255828863957128": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16254257590403370542": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3816774953143987171": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "2854124603710900850": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "14981122123483756686": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "5912303851874077576": ["convolution_gpu_bfyx_gemm_like",2], + "1698847067049584068": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "8618835732380720921": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1096671695414716274": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "4795705973706796563": ["fully_connected_gpu_bf_io_input_spatial",1], + "17851024468934906318": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "2893564501191050837": ["convolution_gpu_bfyx_gemm_like",1], + "2968094709908141988": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "3442073007560756473": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "472454322186482185": ["convolution_gpu_bfyx_os_iyx_osv16",729], + "10615252189597863928": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "5311718276151327830": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13076725905503922540": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "15911434513425038508": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5890683283363730941": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "7320142714269929201": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "9562291747339451180": ["convolution_gpu_bfyx_os_iyx_osv16",53], + "2844746478867668588": ["convolution_gpu_bfyx_gemm_like",1], + "11188849626443657384": ["convolution_gpu_bfyx_gemm_like",2], + "13273455049742872922": ["convolution_gpu_bfyx_gemm_like",1], + "8575296926578119953": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "16302630993799781492": ["convolution_gpu_bfyx_gemm_like",2], + "7218689869635572700": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "8647850242104327366": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1474271081523145413": ["convolution_gpu_bfyx_gemm_like",2], + "8036592210244553232": ["convolution_gpu_bfyx_gemm_like",2], + "1242366856673194709": ["convolution_gpu_bfyx_gemm_like",1], + "15841489476316341204": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "6737332058785771073": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "11599932445375240727": ["convolution_gpu_bfyx_gemm_like",2], + "9209450984098528310": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "5448537627319798272": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "11185156002426041243": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "444533022549215983": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "10869005786136023160": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "18277685132620834972": ["convolution_gpu_bfyx_os_iyx_osv16",278], + "13199524367893035805": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "3177915003579216846": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "755849895494634465": ["convolution_gpu_bfyx_gemm_like",2], + "12246408434917478929": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "5552958912776013600": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "10058614204420018541": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "7071991799972799089": ["convolution_gpu_bfyx_gemm_like",1], + "3747518910079195578": ["convolution_gpu_bfyx_os_iyx_osv16",475], + "17228877915053571642": ["convolution_gpu_bfyx_gemm_like",1], + "1486768204660092247": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "6946815194102787268": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "3063055767192991776": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "1469048759583678106": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "1628593159980574595": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "5495776091407365966": ["convolution_gpu_bfyx_gemm_like",2], + "5024113153979057835": ["convolution_gpu_bfyx_gemm_like",2], + "17772882818194611202": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "5658491804782285708": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17425725917335895000": ["convolution_gpu_bfyx_gemm_like",2], + "2525260242689556544": ["convolution_gpu_bfyx_gemm_like",2], + "2257384183256237750": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "3430266954211750407": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "15566108481408840783": ["convolution_gpu_bfyx_gemm_like",2], + "1806154107556234": ["convolution_gpu_bfyx_os_iyx_osv16",801], + "8449108317864057899": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "5947492124433175601": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "13853056718266488510": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "2862999234347597091": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "17920083826450150627": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "15447513376965243034": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "5219048275475447369": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "5475537064464968733": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "5368419079251107469": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3416636940668221406": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "13831458435772917577": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17774979615691038302": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "1395293354112586043": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "15752695063119223631": ["convolution_gpu_bfyx_os_iyx_osv16",430], + "5816730482014477109": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "2198100074518629980": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "6139574161497189424": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14115040663093081148": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "14759179293743468995": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "9513032457323269513": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "17087143277789116317": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3754411063032102107": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "9753894415895178843": 
["convolution_gpu_bfyx_os_iyx_osv16",1103], + "4692951005189464579": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "15180747404865201068": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "3297036980627776719": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3651651926851660222": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9816834679089152140": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "7059729537732609153": ["convolution_gpu_bfyx_os_iyx_osv16",487], + "15635018081312614614": ["convolution_gpu_bfyx_gemm_like",1], + "16440598510199834213": ["convolution_gpu_bfyx_os_iyx_osv16",997], + "17370051888730874220": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "13405310261845268772": ["convolution_gpu_bfyx_gemm_like",1], + "7173828525834910425": ["convolution_gpu_bfyx_gemm_like",2], + "17907732260451873185": ["convolution_gpu_bfyx_gemm_like",1], + "7351443601143314161": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "6213444978855892717": ["convolution_gpu_bfyx_os_iyx_osv16",265], + "9434761058126895612": ["convolution_gpu_bfyx_gemm_like",2], + "6149261133858739754": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16510194749934323304": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "16978447917682236120": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14070988879848388270": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "6431225873891612234": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "14502746747899017937": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "11534123522633460320": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10961049607808752432": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "13054706902087663592": ["convolution_gpu_bfyx_gemm_like",2], + "5385316497510064491": ["fully_connected_gpu_fb_oi_ref",1], + "7918742312252115870": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "5040730152867713388": ["convolution_gpu_bfyx_gemm_like",2], + "14218701503304823803": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "16179959997108523051": ["convolution_gpu_bfyx_gemm_like",1], + "1945630503883822822": ["convolution_gpu_bfyx_gemm_like",2], + "6261584163347634965": ["convolution_gpu_bfyx_gemm_like",2], + "890897381495317874": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "16862485519640051995": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17015421289522369423": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "16674897846232931666": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "7796037793136254198": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14394427817253242611": ["convolution_gpu_bfyx_gemm_like",2], + "15381833359831622179": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "3436770797199367854": ["convolution_gpu_bfyx_gemm_like",1], + "9100044555742394133": ["convolution_gpu_bfyx_gemm_like",1], + "912423125050985716": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "7059809764116926828": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "5172712078329324967": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "14066219153422011272": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "459936950868112292": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14284223645235602230": ["fully_connected_gpu_fb_io_ref",1], + "6659313690133629176": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "17370560568464798319": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "11210961619302975072": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "1557549837620967530": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2710485608298356329": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5953754321266570854": 
["convolution_gpu_bfyx_gemm_like",1], + "5334291640387922287": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "14878347463243157447": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8258382025812748961": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17464465663391774069": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7431069335622070596": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2410828969408182980": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15038779174806415801": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "2070351447898375901": ["convolution_gpu_bfyx_os_iyx_osv16",1114], + "12352083215873760290": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "8269543491844451750": ["convolution_gpu_bfyx_os_iyx_osv16",189], + "5906083739416582743": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "18184154104081850641": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "2111049986724040641": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "13366059704398720237": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4040607776348275579": ["convolution_gpu_bfyx_gemm_like",2], + "5835634465164771899": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "4766071144928072260": ["convolution_gpu_bfyx_gemm_like",1], + "18423051691107460439": ["convolution_gpu_bfyx_os_iyx_osv16",713], + "3480732841490521799": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "2081318772333460627": ["convolution_gpu_bfyx_direct_10_12_16",1], + "596528462327775677": ["convolution_gpu_bfyx_os_iyx_osv16",1035], + "2730604806511016352": ["convolution_gpu_bfyx_os_iyx_osv16",524], + "13772209672418897120": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7156300614592977977": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "17598441149165536737": ["convolution_gpu_bfyx_gemm_like",2], + "11093147488085506266": ["convolution_gpu_bfyx_os_iyx_osv16",371], + "17347387929692736001": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "820777941033224662": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "13045206675957093567": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "4652308622880770983": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6615830390513317821": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11353671464383068485": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "2510093757258898215": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "11292995457386147494": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "10534355502345993326": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "16590893345666612869": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "1339402691552717009": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6323504675912413145": ["convolution_gpu_bfyx_gemm_like",2], + "5533829915176762003": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12361909180687647792": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "8948718883406304307": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "5066247088968357726": ["convolution_gpu_bfyx_gemm_like",2], + "1005880016096298476": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "15568690152071176945": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "14188045559946481097": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "15085980226773631346": ["convolution_gpu_bfyx_os_iyx_osv16",894], + "9993925424761661218": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "5334566325056222430": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "13607830451968188080": ["convolution_gpu_bfyx_os_iyx_osv16",172], + "10447427622114317323": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "12312291300513951124": ["convolution_gpu_bfyx_os_iyx_osv16",710], + 
"60749853744407778": ["convolution_gpu_bfyx_gemm_like",0], + "16932090423428476170": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "9996196793804333253": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "5887877259873928726": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "2722601800398376127": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "13524128602135083081": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "123283730755186382": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10922353028117588062": ["convolution_gpu_bfyx_gemm_like",2], + "1791615587935799399": ["convolution_gpu_bfyx_os_iyx_osv16",85], + "5845969526791988973": ["convolution_gpu_bfyx_direct_10_12_16",1], + "737706555781027628": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "9631545863582097486": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "14845194064376163156": ["convolution_gpu_bfyx_direct_10_12_16",2], + "202304354656398848": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2534285363781495903": ["convolution_gpu_bfyx_gemm_like",2], + "7883108394284369445": ["convolution_gpu_bfyx_gemm_like",1], + "3292554262586950764": ["convolution_gpu_bfyx_gemm_like",2], + "10205929431600082124": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12440561123106715688": ["convolution_gpu_bfyx_gemm_like",2], + "14602509614865844486": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "16403423801823379909": ["convolution_gpu_bfyx_os_iyx_osv16",94], + "16646144748089558351": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14725765847498813247": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5795940144756238917": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "9133224739401155411": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12443662237620745732": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "15349944413643626251": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "10173283505468233128": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "16884396694505987920": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "4572185168237245759": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8271034912009744989": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "5116633474932727191": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5113313241198299504": ["convolution_gpu_bfyx_gemm_like",2], + "7247414730479113619": ["convolution_gpu_bfyx_gemm_like",1], + "14247451223653900488": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9321208819255762521": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "13047793996728441528": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "6420851258772300332": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "751912075185318190": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "8451212914744825089": ["convolution_gpu_bfyx_os_iyx_osv16",732], + "3909551222373722085": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "16511261203374835334": ["convolution_gpu_bfyx_gemm_like",2], + "13387766889016280910": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "5637480705139132901": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "17997314629342774968": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "6695224851008237679": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "1208534686657112759": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "14566544143931267758": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9198073694219066216": ["convolution_gpu_bfyx_os_iyx_osv16",361], + "7217405970420485152": ["convolution_gpu_bfyx_gemm_like",2], + "1659851931406041285": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "7431237779891953779": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "17942120824047252501": 
["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16031140952379208074": ["convolution_gpu_bfyx_gemm_like",2], + "8195881973746570408": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8464582977975377118": ["convolution_gpu_winograd_6x3_s1_fused",2], + "2010255131587843361": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "18206785126134139000": ["convolution_gpu_bfyx_gemm_like",2], + "2870715678422088243": ["convolution_gpu_bfyx_os_iyx_osv16",6], + "805221045541170643": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "15757254795151275190": ["convolution_gpu_bfyx_gemm_like",2], + "3683201905077543598": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "4911398420005278258": ["convolution_gpu_bfyx_gemm_like",1], + "1076005730007872492": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "13285123703712436126": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "970596838400633278": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "6362453779168658462": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "11115684531624462986": ["convolution_gpu_bfyx_os_iyx_osv16",806], + "13015379405020620466": ["convolution_gpu_bfyx_gemm_like",2], + "157852787707383962": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "7083152697366621236": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "14559552090809408184": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "10995424394152951534": ["convolution_gpu_bfyx_gemm_like",2], + "1028160614515220430": ["convolution_gpu_bfyx_os_iyx_osv16",111], + "11455843788148231615": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "17005088865778247367": ["convolution_gpu_bfyx_gemm_like",2], + "11428599290755097395": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "17651949893303962955": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "9217386935739152562": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "16307719105384538170": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "5797545757863100286": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "17675227620234837075": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "9004823715680825977": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10317038568333963064": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "868488930567226694": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "2789386984431816449": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13674246753382740056": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7569785094993085356": ["convolution_gpu_bfyx_gemm_like",1], + "13088023076667575514": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "5834006438103071406": ["convolution_gpu_bfyx_gemm_like",1], + "1603703756241612948": ["convolution_gpu_bfyx_gemm_like",2], + "10073779356457603252": ["convolution_gpu_bfyx_gemm_like",2], + "5659168916726488798": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "13980058444317683376": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "1980887257657896260": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "6447172410311223671": ["convolution_gpu_bfyx_gemm_like",2], + "5490683510357615963": ["convolution_gpu_bfyx_gemm_like",2], + "1281814301909101836": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "18068050257421269408": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "3048753162882302153": ["convolution_gpu_bfyx_gemm_like",1], + "9667762333290150436": ["convolution_gpu_bfyx_gemm_like",1], + "13141069720428059461": ["convolution_gpu_bfyx_gemm_like",1], + "18114029275806885644": ["convolution_gpu_bfyx_1x1",2], + "16767657090925788431": ["convolution_gpu_bfyx_gemm_like",2], + "5091558853871982858": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "2732519635571994212": 
["convolution_gpu_bfyx_gemm_like",2], + "5796500397424307442": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "5115051214738974496": ["convolution_gpu_bfyx_gemm_like",2], + "17258278942367320412": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "9746964858035717775": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "2460361970017706505": ["convolution_gpu_bfyx_os_iyx_osv16",1114], + "7450915928720828406": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "9770300588867836071": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "9628702542543622433": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "5230871884758163940": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "3002986032379998259": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "7500192998744460131": ["fully_connected_gpu_bf_io_input_spatial",1], + "294103776081392899": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2986309211691835971": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "1564774057733793087": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "3199841714087553410": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "9246213432501129631": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16446533347502650316": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "11964639701912187118": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "7863886351122918972": ["convolution_gpu_bfyx_os_iyx_osv16",948], + "4282661608732125403": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "277151219694781348": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "9352866803638271156": ["convolution_gpu_bfyx_os_iyx_osv16",344], + "13358754652597677285": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "3809343305878998617": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "861944552852043171": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7043547563530810431": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "6146876760962332928": ["convolution_gpu_bfyx_gemm_like",2], + "4679163800360809315": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "3746573775462003750": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15943174060386142134": ["convolution_gpu_bfyx_os_iyx_osv16",552], + "7369471926167902143": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "2467766894778630615": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "6954257882806659594": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "14990645740260870030": ["convolution_gpu_bfyx_os_iyx_osv16",941], + "11607736973932389832": ["convolution_gpu_bfyx_gemm_like",1], + "4145496852718466030": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "3138374672801504481": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "14431607479949498164": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "17912189681971987483": ["convolution_gpu_bfyx_gemm_like",2], + "17073183514200378702": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "15591167992985613695": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13540002981450186147": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "3689722043202617487": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "18957204268374834": ["convolution_gpu_bfyx_gemm_like",2], + "12478421208861550581": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "12609361477548272638": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11215217005872946038": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "15308578014507211237": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "16956980254113285457": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "1832310305089212990": ["convolution_gpu_bfyx_os_iyx_osv16",362], + "13855438905855887272": ["convolution_gpu_bfyx_os_iyx_osv16",511], + "10348660503952680688": 
["convolution_gpu_bfyx_os_iyx_osv16",606], + "3510837206834640871": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "15507430010796753396": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "17965825642065048619": ["fully_connected_gpu_yxfb_ref",0], + "16811402686462277562": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "16025442470600124062": ["convolution_gpu_bfyx_gemm_like",2], + "18031896952099861060": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "14788817017267716113": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "15031089621161080026": ["convolution_gpu_bfyx_direct_10_12_16",0], + "879896719155824868": ["convolution_gpu_bfyx_gemm_like",2], + "2806529556090896246": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "11205571992835612111": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12309226514391994607": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "12608839247035566137": ["convolution_gpu_bfyx_gemm_like",2], + "13602299412525111348": ["convolution_gpu_bfyx_os_iyx_osv16",805], + "8143125165478395106": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "3182329375739242693": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14044495589185586465": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "14581447673401303181": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14172081523880352608": ["convolution_gpu_bfyx_os_iyx_osv16",569], + "1173321935056172683": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "1314612539156304342": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "8707484843981694525": ["convolution_gpu_bfyx_os_iyx_osv16",1096], + "4073467095502162430": ["convolution_gpu_bfyx_gemm_like",1], + "10073439287681954518": ["convolution_gpu_bfyx_gemm_like",2], + "13267743753217317315": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6324194607665787911": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "6157727013102138824": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "9947693652506812817": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4738743763536059708": ["convolution_gpu_bfyx_os_iyx_osv16",511], + "17790026124881397912": ["fully_connected_gpu_yxfb_ref",2], + "1757047061843709948": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "12571532345206950176": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "14848732804958314374": ["fully_connected_gpu_yxfb_ref",1], + "7476503420928065329": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "16779678846332091086": ["convolution_gpu_bfyx_os_iyx_osv16",194], + "3457676694935264283": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "11210371874006224582": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "12715500118796263683": ["convolution_gpu_bfyx_gemm_like",2], + "3555204322491340337": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "4865023158176874622": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11620960210789252617": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "15896132602902277133": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "14936045362442728963": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "15410074937424854348": ["convolution_gpu_bfyx_os_iyx_osv16",846], + "8394944698739627742": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "1938627662342504660": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "17932475157983250382": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "5688623850477433571": ["convolution_gpu_bfyx_gemm_like",2], + "15868648764972133201": ["fully_connected_gpu_fb_oi_ref",1], + "9896765610231507042": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "10323345824599612614": ["convolution_gpu_bfyx_gemm_like",2], + "6381439938385141423": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "1471017943056596406": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "9152451371616153112": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10398572248321217585": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13597240991532942069": ["convolution_gpu_bfyx_os_iyx_osv16",1051], + "2691406689892290663": ["convolution_gpu_bfyx_gemm_like",1], + "14864150409380754546": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "1532263118203058517": ["convolution_gpu_bfyx_os_iyx_osv16",987], + "13896429056884108617": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "16230621843665445228": ["convolution_gpu_bfyx_gemm_like",2], + "6639715607290389968": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9323825370872655346": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "2371412124305478965": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5187613930764630394": ["convolution_gpu_bfyx_gemm_like",2], + "9028970753877215614": ["convolution_gpu_bfyx_os_iyx_osv16",853], + "17212292336626940406": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12850610175882424919": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "12511186263003392018": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "9594594523961285945": ["convolution_gpu_bfyx_os_iyx_osv16",927], + "18347915312427917189": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "3665837617379468265": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17646712050658428055": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "884923290083082187": ["convolution_gpu_bfyx_gemm_like",2], + "2180753144963020203": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "727203296169504486": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "17739868787095417856": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "4032516698162311723": ["convolution_gpu_bfyx_gemm_like",2], + "265124365266629363": ["convolution_gpu_bfyx_os_iyx_osv16",459], + "5750277248295796439": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6586833064055001967": ["convolution_gpu_bfyx_os_iyx_osv16",274], + "10572208209982879914": ["convolution_gpu_bfyx_gemm_like",1], + "1305434952341925041": ["convolution_gpu_bfyx_gemm_like",2], + "11544455862638831851": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "5240181393417899912": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "1364546124782880196": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "14336344152455180534": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "9151597254187513724": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "218477594596081189": ["convolution_gpu_bfyx_gemm_like",1], + "9794439339209980030": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5109770354438894645": ["convolution_gpu_bfyx_gemm_like",2], + "16431857516454692096": ["convolution_gpu_bfyx_os_iyx_osv16",194], + "11646035413147246650": ["convolution_gpu_bfyx_gemm_like",2], + "2597435203284675496": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7312862821818362095": ["convolution_gpu_bfyx_gemm_like",2], + "2080397907007737054": ["convolution_gpu_bfyx_os_iyx_osv16",337], + "13596494923128445274": ["convolution_gpu_bfyx_gemm_like",2], + "3001615302961701154": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10831460252334010668": ["convolution_gpu_bfyx_gemm_like",2], + "12081835728078383819": ["fully_connected_gpu_bf_io_input_spatial",1], + "7549378486471456156": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "4259929195364411411": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "10016243001407196485": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "17044070592136685322": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "12691733869577147545": ["convolution_gpu_bfyx_gemm_like",0], + "3003526572122876385": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "2481005139798378616": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "10973267399508186283": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "15925338073584559984": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "13459568779083836506": ["convolution_gpu_bfyx_gemm_like",1], + "9380980604821454646": ["convolution_gpu_bfyx_gemm_like",2], + "2225233951957105071": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "2702566744272427570": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13058929683986290038": ["convolution_gpu_bfyx_gemm_like",2], + "14463983770858421738": ["convolution_gpu_bfyx_gemm_like",1], + "5951936376654416075": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "8881135571874888085": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "8204962103567653154": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "12161602271403760008": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "4590784654677429162": ["convolution_gpu_bfyx_gemm_like",2], + "14696479950182046016": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "1917986916390093536": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "16985912104363932350": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "16644809154210062742": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "6556795059657533200": ["convolution_gpu_bfyx_gemm_like",2], + "8100051552977329013": ["convolution_gpu_bfyx_gemm_like",1], + "11095908837221722097": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "7143510787416483146": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "9802832901508552733": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "13502487084912428404": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "364197229238830807": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "15669490019428002270": ["convolution_gpu_bfyx_os_iyx_osv16",1090], + "10195952041746407559": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "3787897045202294227": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "15383553612351941890": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "5280450544965361875": ["convolution_gpu_bfyx_gemm_like",1], + "2968031010495399536": ["convolution_gpu_bfyx_gemm_like",2], + "4753055238892504599": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "2908856453997530641": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14865708345458193472": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "8790625191540101806": ["convolution_gpu_bfyx_gemm_like",2], + "15325302411038679750": ["convolution_gpu_bfyx_gemm_like",2], + "17700958439420868719": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "14674266217397415571": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "9267417754412894234": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "4515798403196565084": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "4428101657497677982": ["convolution_gpu_bfyx_os_iyx_osv16",873], + "17713011656078651": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "9963020556968031682": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "2892571961726771633": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3691705516240577130": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "11805311302922325617": ["convolution_gpu_bfyx_gemm_like",1], + "18271689282126907793": ["convolution_gpu_bfyx_os_iyx_osv16",1063], + "1882052795393187384": ["convolution_gpu_bfyx_gemm_like",1], + "12792454713887439830": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "12170874893413205000": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + 
"1375156980278317418": ["convolution_gpu_bfyx_gemm_like",2], + "12297371032753209816": ["convolution_gpu_bfyx_os_iyx_osv16",234], + "12631324498619207834": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "15300588247579013966": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "5116562847410288642": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "14046990030104971367": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "16985565646738638215": ["convolution_gpu_bfyx_gemm_like",0], + "17902799955139047426": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12460004417430913427": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5352896995050401444": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "296142385116663420": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "6551173574001309451": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8108843303778211282": ["convolution_gpu_bfyx_gemm_like",2], + "1306339989221885682": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6905249031401202060": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "17798626036576472760": ["convolution_gpu_bfyx_os_iyx_osv16",153], + "6990161783770805523": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9454028594043242985": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "8769060267707904998": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12756296523829594388": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "17472252137354770318": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "10786022075687454490": ["convolution_gpu_bfyx_os_iyx_osv16",1098], + "7820430581748383571": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "16897485136352617189": ["convolution_gpu_bfyx_gemm_like",1], + "17664704673433112966": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "7316825051569394089": ["convolution_gpu_bfyx_gemm_like",2], + "4563773888811395621": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16149924641081427062": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9213563311267466388": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6124219814856247918": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "16499919609457089685": ["convolution_gpu_bfyx_direct_10_12_16",0], + "7720939595094113814": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "10616832946298118456": ["convolution_gpu_bfyx_gemm_like",2], + "97332433783610027": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "3075961585045028347": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "3276455911598591170": ["convolution_gpu_bfyx_1x1",2], + "10591379189397010097": ["convolution_gpu_bfyx_os_iyx_osv16",613], + "499739705596245675": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17372520271370779917": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "6585223640997887253": ["convolution_gpu_bfyx_gemm_like",2], + "14017025411515888007": ["convolution_gpu_bfyx_os_iyx_osv16",1070], + "17580970614129952250": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "15399245700982979379": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "3087801652564627458": ["convolution_gpu_bfyx_os_iyx_osv16",60], + "6973621625148257910": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "16705941191876956548": ["convolution_gpu_bfyx_os_iyx_osv16",197], + "15197248015210313435": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12823080103951853168": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17128723415461475388": ["convolution_gpu_bfyx_gemm_like",2], + "3826083535442459719": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13411431109933021193": ["convolution_gpu_bfyx_gemm_like",1], + "8203550467004532364": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + 
"16053441017037949431": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "7344363094493575878": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10090923790949378407": ["convolution_gpu_bfyx_gemm_like",1], + "3106591708459602370": ["convolution_gpu_bfyx_os_iyx_osv16",564], + "15918017311798856029": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "7658318862249823838": ["convolution_gpu_bfyx_gemm_like",2], + "4136736579788862192": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10900880512948479338": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "2540513729176799897": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "2239948568632407776": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "8382509515623938786": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "1801066876009461857": ["convolution_gpu_bfyx_gemm_like",1], + "973402921452083017": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "1021364163511049664": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "10756831914332769026": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "14969813450703071948": ["convolution_gpu_bfyx_gemm_like",1], + "1781619247831135285": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "156456996459945842": ["convolution_gpu_bfyx_os_iyx_osv16",435], + "2062195022363480864": ["convolution_gpu_bfyx_gemm_like",2], + "16587078304821304948": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "12461575861709234385": ["convolution_gpu_bfyx_gemm_like",2], + "9753702905908744910": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "2665148871393634012": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "16953502084939981636": ["convolution_gpu_bfyx_os_iyx_osv16",234], + "9835338452418388180": ["convolution_gpu_bfyx_gemm_like",2], + "13400559817638330692": ["convolution_gpu_bfyx_gemm_like",1], + "3782315919331102574": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7974614031099580856": ["convolution_gpu_bfyx_gemm_like",1], + "15803050672115583478": ["convolution_gpu_bfyx_gemm_like",2], + "10437861085319472289": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "11006325877486632502": ["convolution_gpu_bfyx_gemm_like",2], + "11284755586130392759": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "9593975471009029134": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "7072606962946873975": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "10455850115486014344": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "15863083575228705763": ["fully_connected_gpu_fb_oi_ref",1], + "5733701901687257088": ["convolution_gpu_bfyx_gemm_like",2], + "7490524380333929773": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "9263063714383940562": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "8295126647635181949": ["convolution_gpu_bfyx_gemm_like",2], + "13198480749588992978": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17097621900023182992": ["convolution_gpu_bfyx_gemm_like",1], + "17800494747865760215": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "5280182001774668876": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3318430113631867573": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "13155901262605819372": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "15860915170591763391": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "5414285637221358737": ["convolution_gpu_bfyx_gemm_like",2], + "11878200328276635385": ["convolution_gpu_bfyx_gemm_like",2], + "8775336277634573074": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "8166976803757624321": ["convolution_gpu_bfyx_gemm_like",1], + "1825914669961085928": ["convolution_gpu_bfyx_gemm_like",2], + "8354812222032899427": ["convolution_gpu_bfyx_os_iyx_osv16",637], + 
"7052552351421332490": ["convolution_gpu_bfyx_gemm_like",1], + "9780938731831129283": ["convolution_gpu_bfyx_gemm_like",2], + "7019316994558628633": ["convolution_gpu_bfyx_gemm_like",2], + "10408322429232132983": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "15285660674737231657": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "10492401059875127091": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4640028527711211109": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "14645023135017806432": ["convolution_gpu_bfyx_gemm_like",2], + "1334070221835422461": ["convolution_gpu_bfyx_gemm_like",2], + "2133849627845285277": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "16590030963319267708": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "11192914853196766423": ["convolution_gpu_bfyx_gemm_like",2], + "2841749330967314053": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "4412343276595791077": ["convolution_gpu_bfyx_gemm_like",2], + "15404352708246779967": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11327678075247102542": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "6767159196241633301": ["convolution_gpu_bfyx_os_iyx_osv16",986], + "2004691166378443418": ["convolution_gpu_bfyx_gemm_like",2], + "17050675313067213312": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "710166379854475667": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "8012414839721814470": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "2072246877651869428": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11128727891847758901": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14026570177552137240": ["convolution_gpu_bfyx_gemm_like",2], + "16354698991868048871": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "17340789730321673934": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "5093049998173715787": ["convolution_gpu_bfyx_gemm_like",2], + "15232673324549539143": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "9383222411929463824": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16935619230235600309": ["convolution_gpu_bfyx_gemm_like",2], + "7390896672639655716": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "10523106317496576486": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6641684310751726510": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "6882621854468565774": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "5831419373611158773": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "12621528958448913800": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14743760934522111296": ["convolution_gpu_bfyx_gemm_like",2], + "17798636687709019154": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "7576873892262851401": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2638131332283395057": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11047327014045909812": ["convolution_gpu_bfyx_gemm_like",2], + "14724862072414829490": ["convolution_gpu_bfyx_gemm_like",1], + "8017024160145338317": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "11504777464995699839": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "13430897815414587336": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "919788620883613958": ["convolution_gpu_bfyx_gemm_like",2], + "6714886136800883594": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "18199526506796726885": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "13178480813522103091": ["fully_connected_gpu_bf_io_gemm",1], + "14174888981602932979": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "3234107167862677811": ["convolution_gpu_bfyx_os_iyx_osv16",111], + "2001464747481073870": ["convolution_gpu_bfyx_gemm_like",2], + "172303227623890951": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "15872143905824807656": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "3644282167178264526": ["convolution_gpu_bfyx_gemm_like",0], + "8850600236849718709": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "5397783260083330774": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "5706423911886410117": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "12293705794290797805": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10468562355439385073": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7536267099632318821": ["convolution_gpu_bfyx_gemm_like",1], + "13455881643467418059": ["convolution_gpu_bfyx_gemm_like",1], + "1500571771538985941": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "7768680313873061531": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "1294871956977733262": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "7917673216808705075": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "13470016086265528105": ["convolution_gpu_bfyx_gemm_like",2], + "10628725059172743408": ["convolution_gpu_bfyx_gemm_like",2], + "17163158934005653629": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "8525631489886320841": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12963601040302529291": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "12324580272733221544": ["convolution_gpu_bfyx_gemm_like",2], + "15948383678216076358": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "9714764457768279762": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "15011507454681836178": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "11297512843662536362": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "1592619919721912789": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "10294185397756053636": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "13480393611172760874": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "17888721282811720634": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "15299926486228458704": ["convolution_gpu_bfyx_os_iyx_osv16",574], + "17575578027095664417": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15411603884973340468": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "7137632495125292608": ["convolution_gpu_bfyx_gemm_like",2], + "13367043015761260275": ["convolution_gpu_bfyx_gemm_like",1], + "4225955829811705872": ["convolution_gpu_bfyx_gemm_like",1], + "4082218299236753259": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "13372079273473545269": ["convolution_gpu_bfyx_gemm_like",2], + "3805854200552708060": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "4693778191222244259": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "13288357587089816620": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "3415589023848700079": ["convolution_gpu_bfyx_gemm_like",2], + "11111488580071749965": ["convolution_gpu_bfyx_direct_10_12_16",2], + "654122557966242717": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "2431923918345445420": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "8703051983346886620": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "480374950802530618": ["convolution_gpu_bfyx_gemm_like",0], + "6569793510829850291": ["convolution_gpu_bfyx_gemm_like",1], + "14898892437285105327": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "16614678178197571772": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "1249137685908951501": ["convolution_gpu_bfyx_gemm_like",1], + "14517120053341144411": ["convolution_gpu_bfyx_gemm_like",0], + "14733291836016183044": ["convolution_gpu_bfyx_gemm_like",2], + "10968768803038046390": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "17769940507971546305": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + 
"13500369101462555447": ["convolution_gpu_bfyx_gemm_like",2], + "10995849055789490935": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "15972830392998437739": ["convolution_gpu_bfyx_gemm_like",1], + "11984095218733350838": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "6996376303337512293": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "10322586483496198615": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15818237122613168508": ["convolution_gpu_bfyx_gemm_like",2], + "6313048719388952335": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15417738436777481469": ["convolution_gpu_bfyx_gemm_like",2], + "7390201584703727318": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "16114623916610925741": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "17974200478864274127": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "3237680963342495368": ["convolution_gpu_bfyx_gemm_like",2], + "6848989271874647093": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "12971833748980664090": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "17423645390621980919": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "3831261590121101287": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "17128760774072077101": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "3913951712614107871": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13282612510005390816": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4604220876945646096": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "2056597791109604534": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "5419775002149092646": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4713580645061462578": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "12927339938362960563": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14132860735060026066": ["convolution_gpu_bfyx_gemm_like",2], + "12642701787250074691": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "3448477246688526708": ["convolution_gpu_bfyx_gemm_like",1], + "5751627653496545003": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "8526484907799590618": ["convolution_gpu_bfyx_os_iyx_osv16",987], + "11033824757086203326": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "10600884986702650404": ["convolution_gpu_bfyx_gemm_like",1], + "9853089109234784643": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "11568162864377479487": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "5469227748156438008": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "18372277746801271292": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15173187675372221634": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "17434429579652310107": ["convolution_gpu_bfyx_gemm_like",2], + "18218631037214746168": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "13676670925355487305": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1061595672605627170": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "4718705504966715203": ["convolution_gpu_bfyx_gemm_like",2], + "8007667797556094444": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "17995371099806008878": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "601591624187191068": ["convolution_gpu_bfyx_gemm_like",2], + "6375149408738336520": ["convolution_gpu_bfyx_gemm_like",2], + "16273414163942580140": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "15075932061614449973": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "15661322183507404821": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2758256770667070477": ["convolution_gpu_bfyx_gemm_like",1], + "15603710070700542017": ["convolution_gpu_bfyx_gemm_like",2], + "8901432555239515645": ["convolution_gpu_bfyx_os_iyx_osv16",695], + "4347816192417741558": 
["convolution_gpu_bfyx_os_iyx_osv16",577], + "3610579553304450107": ["convolution_gpu_bfyx_os_iyx_osv16",475], + "5162737590442940024": ["convolution_gpu_bfyx_gemm_like",1], + "8981229334098733320": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "3828988304073539836": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "14945451027055549800": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "10084794570892043447": ["convolution_gpu_bfyx_gemm_like",2], + "8363432163596927598": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6046380638013542109": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2850803473613487020": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "3895088069642140043": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "11004350075893421731": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "659846949368492111": ["convolution_gpu_bfyx_direct_10_12_16",2], + "768765852586619095": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14421061973479991516": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "15856268902838573812": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "846485116335195633": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "13696782397412896129": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15071888879264671307": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "878892264408839067": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "13479754018079206598": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "237384442106085756": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "1879844536951785808": ["convolution_gpu_bfyx_gemm_like",2], + "9776332064497085361": ["convolution_gpu_bfyx_gemm_like",2], + "4897991181236908768": ["convolution_gpu_bfyx_gemm_like",2], + "15959241441689395955": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "16168891366331544806": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "3109104171383198425": ["convolution_gpu_winograd_6x3_s1_fused",2], + "6370189612027110022": ["convolution_gpu_bfyx_gemm_like",2], + "14595102366207856448": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12184558469694708819": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "1522591417942130702": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10650698451740924172": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "11447737411040418462": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "6137405768481559638": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3502889736327580141": ["convolution_gpu_bfyx_gemm_like",1], + "5632101951796129342": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16140133852987111783": ["convolution_gpu_bfyx_os_iyx_osv16",755], + "6203602270552179462": ["convolution_gpu_bfyx_gemm_like",1], + "2124458313471852768": ["convolution_gpu_bfyx_gemm_like",1], + "10033076377998157101": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5782934278345953016": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "11623764266322172086": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8569122574675372789": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "9883682535839267422": ["convolution_gpu_bfyx_gemm_like",2], + "11773726534842908728": ["convolution_gpu_bfyx_gemm_like",2], + "16614170159588864300": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "11723735945517472199": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "1168311873250200110": ["convolution_gpu_bfyx_gemm_like",2], + "3615203440895591147": ["convolution_gpu_bfyx_gemm_like",1], + "5435560857659377132": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "1027438463802481676": ["convolution_gpu_bfyx_gemm_like",2], + "14230385851791760020": ["convolution_gpu_bfyx_os_iyx_osv16",58], + 
"7303492518741737111": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17631458041591681785": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "17691748026963003695": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "17580933462801685507": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15826150125827529199": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "17163595630291422874": ["convolution_gpu_bfyx_gemm_like",1], + "2737840613867456953": ["convolution_gpu_bfyx_gemm_like",0], + "9177395776408296291": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "17923035110851963413": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13447028922679236865": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1423297940282476513": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "4296524295134959042": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "11807945822985245634": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "16070611944881238498": ["convolution_gpu_bfyx_os_iyx_osv16",893], + "17515573322312447679": ["convolution_gpu_bfyx_gemm_like",2], + "15308667224953963012": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "14559308665571750465": ["convolution_gpu_bfyx_gemm_like",2], + "11561352430430157770": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "890679620691833367": ["convolution_gpu_bfyx_gemm_like",2], + "13194245601015251743": ["fully_connected_gpu_fb_io_ref",0], + "488298169768725160": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "2542984219353153495": ["convolution_gpu_bfyx_gemm_like",2], + "8642397690605957294": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "15451193085395494344": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12698546873263218041": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14446441689031758543": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "12169920104076167571": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17309224746854446222": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "8703758535351908295": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "4030835922805418609": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "12916369918132790013": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "17087740929472936216": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "6707221689266688389": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "4890442595203749341": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "2294026590516781945": ["convolution_gpu_bfyx_gemm_like",2], + "8422748157997350873": ["convolution_gpu_bfyx_gemm_like",2], + "6495132856471482043": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "1285313118947640320": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1429370139030130929": ["convolution_gpu_bfyx_gemm_like",1], + "4999505377862312410": ["fully_connected_gpu_bf_io_gemm",2], + "16957170318200599740": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10134708781744282286": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12466721526829931923": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15470323769252511904": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "15348127927851026409": ["convolution_gpu_bfyx_gemm_like",2], + "7926301289570686825": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15221712686851573528": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8790992468693685188": ["fully_connected_gpu_fb_io_ref",1], + "6266336185072196699": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "5115661026367632863": ["convolution_gpu_bfyx_os_iyx_osv16",765], + "7957927312958744432": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "7852745450437172519": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "17810119189318801197": 
["convolution_gpu_bfyx_os_iyx_osv16",804], + "534032316469702287": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "13192808619929896995": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "8317140711232187781": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "14132290154676895976": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "9185109795156451440": ["convolution_gpu_bfyx_os_iyx_osv16",1063], + "12949204491386872217": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "5969899876159536205": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18308661808437079996": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5461980510262646821": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "9614300332487270888": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17511724795386380064": ["convolution_gpu_bfyx_gemm_like",0], + "9666426531743983113": ["convolution_gpu_bfyx_os_iyx_osv16",1043], + "5519244962044894877": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "18140951659547259039": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "9740466267717175474": ["convolution_gpu_bfyx_gemm_like",2], + "7562624810837784407": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "10271474583233390474": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "10050254009828302053": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "988812830514150932": ["convolution_gpu_bfyx_gemm_like",2], + "12063854963434677046": ["convolution_gpu_bfyx_gemm_like",2], + "12418390364502912036": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "426267761240826769": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15464327246951632247": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "12848303763972625729": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "11269720109905550213": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7440546908141206022": ["convolution_gpu_bfyx_gemm_like",1], + "360872770877634346": ["convolution_gpu_bfyx_gemm_like",2], + "6578804773136886939": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "11589555938436186313": ["convolution_gpu_bfyx_os_iyx_osv16",552], + "2809950092498355574": ["convolution_gpu_bfyx_os_iyx_osv16",1056], + "17225578855755054959": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "8374232727884943288": ["convolution_gpu_bfyx_gemm_like",1], + "10591159235183381823": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6290317420155851465": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "9216608098626790565": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "13739257060165119132": ["convolution_gpu_bfyx_os_iyx_osv16",183], + "7819934200255007163": ["fully_connected_gpu_fb_oi_ref",2], + "14630499010941056793": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "13892202459701213504": ["convolution_gpu_bfyx_gemm_like",0], + "9181826459972753268": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "8612114608666892632": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8083672466967374860": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "1827410519323879183": ["convolution_gpu_bfyx_1x1",2], + "12190841837604350271": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "16568662638983972991": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "13979227237506927267": ["convolution_gpu_bfyx_os_iyx_osv16",853], + "11823106525249133834": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "1254745727978231148": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "533820672115442982": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "18146068930296529306": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17790622334577372736": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "13388424034634316547": 
["convolution_gpu_bfyx_os_iyx_osv16",353], + "13814086981499638596": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "17171513366028235799": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "10031973538398542700": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14869125900405603130": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "11522488904021243956": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "11338906515425639970": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "1228256819256996416": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "12253987037990618484": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1208243889917809864": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9104236539185546468": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "5419041493176804960": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "14681705641267917886": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "13695012630130671371": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "17353894529222574441": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "11932768899981458741": ["convolution_gpu_bfyx_gemm_like",2], + "15282806587681892519": ["convolution_gpu_bfyx_gemm_like",1], + "16851949759898002809": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "3037042229494600258": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "15908673392788376468": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "1338534626640014074": ["convolution_gpu_bfyx_gemm_like",2], + "7496699438957793920": ["convolution_gpu_bfyx_gemm_like",2], + "123251351612308092": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "14234117003504517946": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "15412690778572403180": ["convolution_gpu_bfyx_1x1",2], + "16832083703120717402": ["convolution_gpu_bfyx_gemm_like",1], + "12507525913398812998": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "1095495157025479260": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "13143747549517987032": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "12013883366396753346": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5504757952698692953": ["convolution_gpu_bfyx_os_iyx_osv16",658], + "8275277322582733101": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "5718747983756317198": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "1771153051233437607": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3853598651573655548": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "13713501506522022845": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11686716391002981733": ["convolution_gpu_bfyx_gemm_like",1], + "9513403717116039597": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "10713207196920878995": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "466868648178437688": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12396552020665536506": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "17281198415161259885": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "1596472719837608525": ["convolution_gpu_bfyx_gemm_like",2], + "3036512701943687724": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "11976258954756052550": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "6985970932645412773": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "498439373962299687": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "17303408650780384587": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "9850711648349010674": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "8375778282166369933": ["convolution_gpu_bfyx_gemm_like",1], + "11207257238719531888": ["convolution_gpu_bfyx_gemm_like",2], + "10359995612603125965": ["convolution_gpu_bfyx_gemm_like",2], + "4381329435655511217": 
["convolution_gpu_bfyx_os_iyx_osv16",367], + "16820082917500285799": ["convolution_gpu_bfyx_gemm_like",2], + "16360543923316690540": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "9714770878761308566": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "4122312805832663323": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2096021095904820251": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1677118421195120152": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9285566577169147378": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10168217053882274702": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "16322719022997791344": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "16741985699154392565": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "16429816273405099453": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3596159214965874273": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "4212697578665550281": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "15646774522467486699": ["convolution_gpu_bfyx_os_iyx_osv16",287], + "7617123358753247310": ["fully_connected_gpu_fb_io_ref",1], + "1142725391726703078": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "5843291595446603376": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "15972805725107234322": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "4859271780094116779": ["convolution_gpu_bfyx_gemm_like",2], + "14811022197918391667": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2438221595194783178": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "5957444113623953990": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "7941359635463232326": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "10522649794540845800": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "10732225577823701543": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16063854283763838910": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4292467512797995948": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "16628180201355989101": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "1745930004673880589": ["convolution_gpu_bfyx_gemm_like",2], + "6377828127090689238": ["convolution_gpu_bfyx_os_iyx_osv16",845], + "1787598049938821496": ["convolution_gpu_bfyx_gemm_like",1], + "7380979920013545867": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "12978004383198641522": ["convolution_gpu_bfyx_gemm_like",1], + "3215659303601163167": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "14815498807515058447": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "16789135236017252073": ["convolution_gpu_bfyx_gemm_like",2], + "1434535531617424039": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "18419183012101393192": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "5889635603816026293": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "17615365894230830516": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "9739077580693165062": ["convolution_gpu_bfyx_gemm_like",2], + "17429692714456679999": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1924673125135960260": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "12981316015058930198": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "12573987322091254072": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17026284168840448378": ["convolution_gpu_bfyx_os_iyx_osv16",510], + "3446991010350155849": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "3865480446980740412": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "10788148990012795028": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "6150043972317126583": ["convolution_gpu_bfyx_gemm_like",1], + "15995056067568652754": ["convolution_gpu_bfyx_gemm_like",1], + "16463454447642623848": 
["convolution_gpu_bfyx_gemm_like",2], + "18128162750557822655": ["convolution_gpu_bfyx_os_iyx_osv16",851], + "16781127329510211966": ["convolution_gpu_bfyx_gemm_like",2], + "14601912265050074833": ["convolution_gpu_bfyx_gemm_like",2], + "2415883693527779570": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "9788704336046308724": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "2599817012641445801": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "4476218615403440835": ["convolution_gpu_bfyx_gemm_like",2], + "14463841899941062548": ["convolution_gpu_bfyx_direct_10_12_16",1], + "82249723699159955": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "13973179950424276578": ["convolution_gpu_bfyx_os_iyx_osv16",809], + "7546586420552408243": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "14463173937397982331": ["convolution_gpu_bfyx_os_iyx_osv16",482], + "8797661560676476245": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "3021451990778420603": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8866164762286856139": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "3216604922889072404": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15024023281204917061": ["convolution_gpu_bfyx_gemm_like",2], + "5233164031954315264": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16362139250976572928": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "14326748416648598247": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "259085394007031207": ["convolution_gpu_bfyx_gemm_like",2], + "7877637636782924097": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "13775683667344570223": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "13835908664998757647": ["fully_connected_gpu_fb_oi_ref",0], + "531020979837645217": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13771196685227797262": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "2546472090573813082": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2399812257701033542": ["convolution_gpu_bfyx_gemm_like",2], + "11306782565667740785": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "9454146598828084176": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "7486133596762640215": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "12609790757824750429": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "17142080999569154649": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "8684867236134349888": ["convolution_gpu_bfyx_os_iyx_osv16",574], + "3491333679577961640": ["convolution_gpu_bfyx_gemm_like",2], + "7999747927804607567": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "6522575549211855712": ["convolution_gpu_bfyx_gemm_like",2], + "9596656797750683465": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9929060811766882316": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "2042821994795163366": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "6181651715051152713": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "14365699621119565405": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "5054574917425211132": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "18085089358509617299": ["convolution_gpu_bfyx_gemm_like",2], + "13675314612031135613": ["convolution_gpu_bfyx_gemm_like",1], + "8047078039937885319": ["convolution_gpu_bfyx_gemm_like",2], + "10897008852059401902": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "17599396373608265826": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "12323840136934980793": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "11710299944796838170": ["convolution_gpu_bfyx_gemm_like",2], + "16172528828198474326": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16026019808764920641": ["convolution_gpu_bfyx_gemm_like",2], + 
"9275303306340702111": ["convolution_gpu_bfyx_gemm_like",2], + "704262295684441748": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "3227725087355827716": ["convolution_gpu_bfyx_gemm_like",2], + "9207413252274439059": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "7666505529539001492": ["convolution_gpu_bfyx_gemm_like",2], + "2451627421465368826": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "14385995236701277049": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "7371498023669344385": ["convolution_gpu_bfyx_os_iyx_osv16",54], + "978154682881866623": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "6210051945051792519": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2078717472711037103": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13105192484434299621": ["convolution_gpu_bfyx_gemm_like",2], + "15741938682483664203": ["convolution_gpu_bfyx_1x1",2], + "9226443907548972870": ["convolution_gpu_bfyx_gemm_like",2], + "15322019609805777935": ["convolution_gpu_bfyx_gemm_like",2], + "4132087699110753428": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7259373400504003467": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "13381441263790184121": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "10409424254454997557": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "360764089318153518": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "10792503079194374004": ["convolution_gpu_bfyx_gemm_like",1], + "16153434096698006308": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "4085907608404305515": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "1411786954276574458": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "13925839061045347955": ["convolution_gpu_bfyx_gemm_like",1], + "2295643314299482773": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "6456426339461437148": ["convolution_gpu_bfyx_gemm_like",2], + "16454286604955135655": ["convolution_gpu_bfyx_gemm_like",2], + "17721709435558297965": ["convolution_gpu_bfyx_gemm_like",1], + "8230144305844912369": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5083776511235413204": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "5002362836567498954": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12825029449351875037": ["convolution_gpu_bfyx_gemm_like",1], + "4409539711630405776": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "13026555349791486777": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4714858252066253834": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11679869968143173159": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "3291900073868076610": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "13708979487306970634": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "5961488595080209440": ["convolution_gpu_bfyx_gemm_like",2], + "984472462878596435": ["convolution_gpu_bfyx_os_iyx_osv16",428], + "5601320732740276692": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "1090447867763814054": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "16081023484008718887": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "10879183694331631189": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "8549465639583777774": ["convolution_gpu_bfyx_gemm_like",2], + "17500224380474287862": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "14568560907026487922": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11716771904412649891": ["convolution_gpu_bfyx_os_iyx_osv16",435], + "4871907623235871050": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "4729855738455185191": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "11135894989941122115": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9194788897910888066": ["convolution_gpu_bfyx_os_iyx_osv16",338], 
+ "15094664469997373662": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "6983900601570231321": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "4239133538073498792": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10771178773821148370": ["convolution_gpu_bfyx_gemm_like",2], + "8594644182487917002": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9924213107024674692": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "7370273921473161914": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "905780459938651623": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "16932172538978111342": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "14249486431781112226": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "13987250743654950733": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7777333052643961206": ["convolution_gpu_bfyx_os_iyx_osv16",1028], + "8817624284607822971": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "6692408578556372014": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "15595549493819416194": ["convolution_gpu_bfyx_os_iyx_osv16",106], + "14291113322487568376": ["convolution_gpu_bfyx_gemm_like",2], + "4690935789908896751": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "10930115765550856328": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "9349890134436171288": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10720782649044333851": ["convolution_gpu_bfyx_gemm_like",2], + "8159489372517869446": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "16916632481840858091": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5854267518455107328": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4287441125635022306": ["convolution_gpu_bfyx_direct_10_12_16",2], + "784988240891749445": ["convolution_gpu_bfyx_gemm_like",2], + "11163107409437069532": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "1855527356709753100": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "14387663434151374245": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "9455406830371528486": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2482449683288477640": ["convolution_gpu_bfyx_gemm_like",2], + "656536921219262336": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "17427036330773218054": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "11918018989601427118": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2440366541074371090": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "861813331533609605": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "8855801044538137828": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "8045393243176844621": ["convolution_gpu_bfyx_os_iyx_osv16",951], + "10935410906182995784": ["convolution_gpu_bfyx_gemm_like",1], + "16642535448111764945": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "3743573500773847162": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "14213127286928643795": ["convolution_gpu_bfyx_os_iyx_osv16",250], + "6750269489578112382": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "6784146431605417954": ["convolution_gpu_bfyx_gemm_like",1], + "8656468860180713379": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "15838114628203742383": ["convolution_gpu_bfyx_gemm_like",2], + "17123463568694499533": ["convolution_gpu_bfyx_gemm_like",2], + "2669822154816760632": ["convolution_gpu_bfyx_os_iyx_osv16",620], + "8140094412609934765": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "7565221050911842393": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "8316848551837633169": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "7981376447277193852": ["convolution_gpu_bfyx_direct_10_12_16",2], + "702096475436365058": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15786328370300803713": 
["convolution_gpu_bfyx_os_iyx_osv16",983], + "4224423702382859092": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "8061914949376516780": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "13412296930014397060": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11473442921040533207": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "893885204484374577": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "13174363822969694054": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "7200893702912130808": ["convolution_gpu_bfyx_gemm_like",1], + "1608378717397996752": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "8272823732258536202": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2946926779445063554": ["convolution_gpu_bfyx_gemm_like",2], + "18010600104565458874": ["convolution_gpu_bfyx_gemm_like",1], + "1364905900191854779": ["convolution_gpu_bfyx_gemm_like",2], + "3509027370372599394": ["fully_connected_gpu_fb_io_ref",2], + "1889773840456761365": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8054185159612481260": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12831670701606794888": ["convolution_gpu_bfyx_direct_10_12_16",1], + "18006581941186887676": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "1303304215797905198": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17480519865636248903": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8619380242063264016": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12242618640422208652": ["convolution_gpu_bfyx_gemm_like",2], + "12360796145248339074": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "724953082687879224": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "509781001842353609": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "10670829898588047148": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "7982784766505903515": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "13989803206226593565": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "1103204698908514224": ["convolution_gpu_bfyx_os_iyx_osv16",510], + "15024130918582332928": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15026219694198820614": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17827762625385383658": ["convolution_gpu_bfyx_gemm_like",2], + "13939772608127902428": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10729288973933590396": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "5848293219267886434": ["convolution_gpu_bfyx_os_iyx_osv16",536], + "15953607231296296913": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10890975553758439233": ["convolution_gpu_bfyx_gemm_like",2], + "2777614869053822003": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "9884646296875511696": ["convolution_gpu_bfyx_gemm_like",2], + "3989707993712888760": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "4082623789007884063": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "1160579996766519752": ["convolution_gpu_bfyx_gemm_like",2], + "4142978475842207311": ["convolution_gpu_bfyx_gemm_like",2], + "3574679673239756551": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "10117376369841171716": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "13289306769823703069": ["convolution_gpu_bfyx_os_iyx_osv16",1113], + "2079476232214121671": ["convolution_gpu_bfyx_gemm_like",2], + "3831257753143317802": ["convolution_gpu_bfyx_gemm_like",2], + "18074320074700491416": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "9005351264094503686": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "1201692134690347847": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "5440622601084846974": ["convolution_gpu_bfyx_gemm_like",1], + "9833242806281729759": 
["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2411809718611709031": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "5584432943673435454": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "5516343490635816913": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8735735614506773179": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "10308175009371219583": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "2032438743863827309": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "16691293834516280510": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "4466552246808462897": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "6547565989244888354": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "14974730512607138726": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "6204725118764552662": ["convolution_gpu_bfyx_gemm_like",2], + "7367814057959247537": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16668140522258646445": ["convolution_gpu_bfyx_gemm_like",2], + "4631772220201098020": ["convolution_gpu_bfyx_gemm_like",2], + "12929981792125924963": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "14136097914489095982": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "2909728331855309274": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "4417341352109525283": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17955654518744592086": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "7146559117784312265": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "699127221549844251": ["convolution_gpu_bfyx_gemm_like",1], + "2712946943923358377": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "2662628817605495834": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "13625877249040282040": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "8146559042269976123": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "8057302050645780813": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13476976389397273052": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "6777045876155144709": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "13765632280570725774": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "13723543003759101485": ["convolution_gpu_bfyx_gemm_like",2], + "5643908654122573882": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17254775053427612466": ["fully_connected_gpu_fb_oi_ref",2], + "17471843449888763571": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "17707294419513060769": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "7494124707566708728": ["convolution_gpu_bfyx_os_iyx_osv16",622], + "1589338074286085915": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "18167956836333309556": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "587350550384936211": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "2104529100867065546": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "10848277915422577656": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "8709180250014055873": ["convolution_gpu_bfyx_os_iyx_osv16",695], + "14599780481362761532": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6048964584602891448": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "10700011669103135203": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "15392592805235453180": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "10285802605410795788": ["convolution_gpu_bfyx_gemm_like",2], + "9794061741834174000": ["convolution_gpu_bfyx_gemm_like",2], + "15594387862678649962": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "15715522462313302642": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "2752322006160986801": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15156015174611610705": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "17850932752450917677": 
["convolution_gpu_bfyx_os_iyx_osv16",1046], + "498420237272375425": ["convolution_gpu_bfyx_direct_10_12_16",2], + "875296362957469305": ["convolution_gpu_bfyx_gemm_like",1], + "1884327428051733366": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "11455732989503244360": ["convolution_gpu_bfyx_gemm_like",1], + "14103112843209793966": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "832976844701988460": ["convolution_gpu_bfyx_gemm_like",2], + "1054159213127890689": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "6293500642319778096": ["convolution_gpu_bfyx_gemm_like",2], + "13255006150107668739": ["convolution_gpu_bfyx_gemm_like",1], + "1994707002538257258": ["convolution_gpu_yxfb_yxio_b16",2], + "7395593936948809439": ["convolution_gpu_bfyx_os_iyx_osv16",948], + "12009524797137164943": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "11151426820269138585": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "7762778382848852790": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "15963358868537664345": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "11461581290174106570": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "7277156316894715321": ["convolution_gpu_bfyx_os_iyx_osv16",510], + "4817953977830392054": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "7925721388119083644": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "555153826947872383": ["convolution_gpu_bfyx_gemm_like",2], + "8797843396807284399": ["convolution_gpu_bfyx_gemm_like",2], + "574359978358296617": ["convolution_gpu_bfyx_gemm_like",2], + "7419990519344756626": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "5786828339670204894": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "10545749454895857995": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "7923602459997389254": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "13580438297062687335": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "7054270030260701612": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "1265107284215037966": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16352331970945217438": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "10573920781439771673": ["convolution_gpu_bfyx_os_iyx_osv16",680], + "9197931868200777891": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "7996470545015324613": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "12914986936318857086": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "16117738994809548007": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "17543625777838573622": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "17361319565503258506": ["convolution_gpu_bfyx_gemm_like",1], + "11883485911218628865": ["convolution_gpu_bfyx_os_iyx_osv16",1041], + "2039909180006215069": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "14292252222828824305": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "8994777547915132466": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "2521072060867896298": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17550795608527501180": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7439918590741058820": ["convolution_gpu_bfyx_gemm_like",2], + "14113510820933411052": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "4949865765880884373": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "12361848206190267821": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "6128534975733321186": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "11932770338770247767": ["convolution_gpu_bfyx_os_iyx_osv16",435], + "14400339764883906933": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1957975992563882145": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "7963120178142346699": ["convolution_gpu_bfyx_os_iyx_osv16",977], + 
"16731107540370927220": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13941251104772804303": ["convolution_gpu_bfyx_os_iyx_osv16",1028], + "7385295618478993079": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "14219526370377548492": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "3693042354944382600": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9416285845239621878": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15361186788588226064": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "5933743119393822386": ["convolution_gpu_bfyx_gemm_like",1], + "994489782629179836": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "3995098494991567714": ["convolution_gpu_bfyx_gemm_like",0], + "8268533335852735248": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "17160724961832795383": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "12478309735214802531": ["convolution_gpu_bfyx_os_iyx_osv16",567], + "6145197915306632859": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1336477297334930004": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6719956770229212208": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "8848042913869254179": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "3762117189312286955": ["convolution_gpu_bfyx_gemm_like",2], + "3142706898070129318": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "6681818065741882453": ["convolution_gpu_bfyx_gemm_like",2], + "16122033101591094139": ["fully_connected_gpu_fb_oi_ref",1], + "4992668316921598993": ["convolution_gpu_bfyx_gemm_like",2], + "12174729877807876787": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15352245788978088971": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "12706645084970410965": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8375465895534833097": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "2384310584901598995": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "1499841226042523429": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "11050239499079842408": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "2325807459008347256": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "9899897639161550704": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "16443833779968719790": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "1592994755823247500": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "18219755699990183812": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17197868427757781334": ["convolution_gpu_bfyx_os_iyx_osv16",623], + "7573223193924678686": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "6858245954375015939": ["convolution_gpu_bfyx_gemm_like",2], + "16705621644424684055": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "16683089431066989909": ["convolution_gpu_bfyx_gemm_like",2], + "10396343030099602596": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "11680829908738480957": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11033758130987285174": ["convolution_gpu_bfyx_gemm_like",2], + "17332395907621747512": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "8221243069068316492": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13468713306678453952": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3039050517419021849": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "18052322665755789573": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15790005937034794347": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "8774613863662947205": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "18429276095695345973": ["convolution_gpu_bfyx_direct_10_12_16",1], + "800262759663182290": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "2299440282267661763": ["convolution_gpu_bfyx_gemm_like",2], + 
"7353255713834431471": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "17294244481988344762": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "9953329530402569669": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "11311859068168414878": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "3623695848220673001": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4104803308438043557": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16149794106807509790": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5326891298755303584": ["convolution_gpu_bfyx_gemm_like",2], + "3626743386403140330": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "2260718905219541967": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "15668791697154389130": ["convolution_gpu_bfyx_gemm_like",2], + "14034487492239603874": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "5020605371834958647": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "11341287517759485930": ["convolution_gpu_bfyx_gemm_like",2], + "12940491379482292807": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "11767263058642131204": ["convolution_gpu_bfyx_gemm_like",0], + "5603409300903611279": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "13373912451448693522": ["convolution_gpu_bfyx_gemm_like",1], + "16001665772103476029": ["convolution_gpu_bfyx_gemm_like",2], + "15107740124884150777": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "11324851661119942609": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "11215297942420903101": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "10710426249911063154": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "15047163348308549816": ["convolution_gpu_bfyx_gemm_like",2], + "15962137123591591534": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "12434799432980627966": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "11507538232733291666": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "12380856644683171627": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "7550660458541314838": ["convolution_gpu_bfyx_gemm_like",2], + "749424160149709131": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "14606504543906913119": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "7084794834886364709": ["convolution_gpu_bfyx_gemm_like",0], + "7071864660784255328": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3495464175121035222": ["convolution_gpu_bfyx_gemm_like",1], + "2388209402010617408": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1114661658519542600": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "9743549865786050651": ["convolution_gpu_bfyx_gemm_like",1], + "13646026173083209094": ["convolution_gpu_bfyx_gemm_like",1], + "4440261013093281358": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7589346100701197023": ["convolution_gpu_bfyx_gemm_like",2], + "16037141448095945650": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "9839670675413379092": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "13102754309439605192": ["convolution_gpu_bfyx_gemm_like",2], + "14094981198645015124": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "7966454753124154534": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "17829854042305231384": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "397770940444464146": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "13753473508578037346": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "1673458534805854479": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "9767355861002822967": ["convolution_gpu_bfyx_gemm_like",2], + "10040774301055885786": ["convolution_gpu_bfyx_gemm_like",2], + "8422541638844255768": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10392013312924273545": 
["convolution_gpu_bfyx_os_iyx_osv16",365], + "6236173564220169058": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "15487686565734149288": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "12044635257539223503": ["convolution_gpu_bfyx_gemm_like",1], + "17716065235878633691": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "11878734040194151073": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3217555855036660482": ["fully_connected_gpu_fb_io_ref",1], + "10000918095695585210": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "18021893665721597443": ["convolution_gpu_bfyx_gemm_like",2], + "1540459344569916165": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "2460365527384422680": ["convolution_gpu_bfyx_1x1",2], + "13477416097954638887": ["fully_connected_gpu_bf_io_gemm",1], + "2820364088001594654": ["convolution_gpu_bfyx_os_iyx_osv16",193], + "6536333665377249409": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "5329218407413679209": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "2425177545256374371": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "2944333966072327932": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "3007505068107685147": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "14727155647330710270": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13600579723542095577": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "8611417708673038653": ["convolution_gpu_bfyx_gemm_like",1], + "6876300000441081789": ["convolution_gpu_bfyx_gemm_like",1], + "642695492431061226": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "7451956047774945675": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1954255299238402738": ["convolution_gpu_bfyx_os_iyx_osv16",1048], + "10231289519907741812": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7919434905719674781": ["convolution_gpu_bfyx_os_iyx_osv16",686], + "16928564394848059094": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12274268980330855890": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "9148379585489720669": ["convolution_gpu_bfyx_os_iyx_osv16",484], + "12270548292992377827": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "8181704316455400709": ["convolution_gpu_bfyx_gemm_like",2], + "465434718088281598": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "2986189945936592561": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "18009765676050504407": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3892679716763161057": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5409329687010951601": ["convolution_gpu_bfyx_gemm_like",2], + "2344498602308448450": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "13133323947490009546": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2172121470071868949": ["convolution_gpu_bfyx_gemm_like",2], + "6719302427415173754": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "14600118619533737293": ["fully_connected_gpu_fb_oi_ref",1], + "13872507386032159320": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "3223726179820717808": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "16626226341188424071": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "12407002532205454767": ["convolution_gpu_bfyx_os_iyx_osv16",1020], + "18214405165366931407": ["convolution_gpu_bfyx_gemm_like",2], + "13661225837036677371": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "505102470055903237": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6232363902828992968": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9173631510896381179": ["convolution_gpu_bfyx_gemm_like",2], + "3393657180338401174": ["convolution_gpu_bfyx_gemm_like",2], + "3013359852055354405": ["convolution_gpu_bfyx_os_iyx_osv16",659], + 
"7662818300983256668": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "14206328165498357760": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "7869916853707978306": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "2688060699200137048": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12397493112115605421": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "3095800485689583188": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "14890705803637193714": ["fully_connected_gpu_fb_oi_ref",1], + "12942776337163777730": ["convolution_gpu_bfyx_gemm_like",2], + "4550028191070279999": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3060709449176556770": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "4085450203909854919": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "7878546319081647695": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "12309132521191764927": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13648462079765466923": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "10178462061836778766": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "16206791915939407806": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "323234725943768094": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "4940950742383121943": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "8500148569566077929": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "15630324874714927821": ["convolution_gpu_bfyx_gemm_like",2], + "14355612297330229277": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13499476832444042458": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "3910579267273061669": ["convolution_gpu_bfyx_os_iyx_osv16",1062], + "1188428190761098784": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8880141633878776982": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2253443114793765536": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "14512407261081843554": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "15101680837342453931": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "9067207838429479363": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3119045125726216156": ["convolution_gpu_bfyx_gemm_like",2], + "374917621051549930": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14923692894655929923": ["fully_connected_gpu_bf_io_gemm",2], + "6717243674054760598": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "11501291170503766805": ["convolution_gpu_bfyx_1x1",2], + "7210729932836957540": ["convolution_gpu_bfyx_gemm_like",1], + "7202348866484870042": ["convolution_gpu_bfyx_gemm_like",1], + "9622546530872848323": ["convolution_gpu_bfyx_os_iyx_osv16",852], + "14579050468883613611": ["convolution_gpu_bfyx_gemm_like",2], + "2511318920505993508": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "12606196670791209919": ["convolution_gpu_bfyx_gemm_like",2], + "10857567623940140266": ["fully_connected_gpu_fb_oi_ref",0], + "17309326904418811234": ["convolution_gpu_bfyx_os_iyx_osv16",948], + "3420595282107277905": ["convolution_gpu_bfyx_gemm_like",2], + "10967218651864700933": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "7978370756654787278": ["convolution_gpu_bfyx_gemm_like",2], + "10196332102593337214": ["convolution_gpu_bfyx_gemm_like",2], + "11855777686733253894": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "1708527842474979709": ["convolution_gpu_bfyx_gemm_like",2], + "11352094952907979172": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14695781272831602408": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "11875516764635427358": ["convolution_gpu_bfyx_os_iyx_osv16",893], + "8954139494467782298": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1071663904249509302": ["convolution_gpu_bfyx_gemm_like",1], 
+ "14002149958562285929": ["convolution_gpu_bfyx_os_iyx_osv16",722], + "10726604761650410429": ["fully_connected_gpu_fb_io_block_fp16",1], + "2491079452377917458": ["convolution_gpu_bfyx_gemm_like",2], + "15831600396403741571": ["convolution_gpu_bfyx_gemm_like",2], + "18062849937960759210": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "6577754887650563753": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "12878858391355259417": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6545814945227676265": ["convolution_gpu_bfyx_gemm_like",1], + "2183193161596798350": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "14204028212129440429": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "4551182180668229945": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "14386256118128644729": ["convolution_gpu_bfyx_gemm_like",2], + "11901687795497708884": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "5627834277145735283": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "15214779483545052950": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "16125965158927145599": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "13119479079474639169": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "8799427328659766574": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "1431307776181554710": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "12260041857695743504": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "5301394322453453489": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "568023964685613279": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "11845504142528424662": ["convolution_gpu_bfyx_gemm_like",2], + "15110359240685619357": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "11206468937763516689": ["convolution_gpu_bfyx_gemm_like",2], + "3134642518413656360": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "15718011075217705480": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9454512817077883797": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "13539754964691689955": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "6108475838757986889": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13505239531682993049": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "10775785602937893911": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14640909901379728455": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "4152919461079296700": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "14462438074931673266": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "14443599718173185176": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4438055737691342460": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "2315979511894958580": ["convolution_gpu_bfyx_gemm_like",2], + "6673345869874137667": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1166351402218387037": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "17495070522944546801": ["convolution_gpu_bfyx_os_iyx_osv16",1035], + "14749290801006453098": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "4950144098898276785": ["convolution_gpu_bfyx_gemm_like",2], + "13268525255152984893": ["convolution_gpu_bfyx_os_iyx_osv16",940], + "8460847842045253466": ["convolution_gpu_bfyx_os_iyx_osv16",13], + "17818587793483875865": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "18400137500031567479": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "6497227130861473497": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "5821887901198535792": ["convolution_gpu_bfyx_gemm_like",1], + "16768470780681544910": ["convolution_gpu_bfyx_direct_10_12_16",2], + "804195263636995800": ["convolution_gpu_bfyx_gemm_like",2], + "10105539975183207700": ["convolution_gpu_bfyx_gemm_like",1], + 
"4834591210311380436": ["convolution_gpu_bfyx_gemm_like",2], + "9191832520273617003": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "14793709237400480942": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "11322451605795727486": ["convolution_gpu_bfyx_os_iyx_osv16",4], + "16671217333627463205": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "8252948921459286528": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10098661517988566506": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12156683064218448087": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "13919146899409616452": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "6366477005383470532": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "11066538564303243604": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "14004715832115880216": ["convolution_gpu_bfyx_gemm_like",2], + "14156264942337528284": ["convolution_gpu_bfyx_gemm_like",1], + "16489624657475712467": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "11855070245618904113": ["convolution_gpu_bfyx_os_iyx_osv16",525], + "1640247336720128805": ["convolution_gpu_bfyx_os_iyx_osv16",302], + "11299275869800089824": ["convolution_gpu_bfyx_gemm_like",1], + "6549150139619174585": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "17715478364817621621": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "12864338805958186191": ["convolution_gpu_bfyx_gemm_like",1], + "2721793280965260548": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "1014934490175718598": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "3889688816787688160": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10131754493574658838": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "14185215566042478462": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "868827643007921561": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3007637520820789085": ["convolution_gpu_bfyx_os_iyx_osv16",641], + "14373201903743002596": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "16947456984272008059": ["convolution_gpu_bfyx_gemm_like",2], + "8676627474831455650": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "528618206870447012": ["convolution_gpu_bfyx_gemm_like",1], + "4091785563304559606": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "15225331270926229394": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "15136557970717196814": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7824524940405130010": ["convolution_gpu_winograd_6x3_s1_fused",2], + "7958595516465029682": ["convolution_gpu_bfyx_gemm_like",2], + "11696708134796103802": ["convolution_gpu_bfyx_gemm_like",2], + "13054405729329143152": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "5766507688771440170": ["convolution_gpu_bfyx_os_iyx_osv16",878], + "9938569017948413183": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "18245807830790717634": ["convolution_gpu_bfyx_gemm_like",2], + "15914512645931208899": ["convolution_gpu_bfyx_gemm_like",2], + "5953847130949209741": ["convolution_gpu_bfyx_gemm_like",2], + "8881150100883636392": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "4705082468295108028": ["convolution_gpu_bfyx_gemm_like",2], + "16955653765071712611": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1300605032840412845": ["fully_connected_gpu_fb_io_block_fp16",0], + "436514945529747349": ["convolution_gpu_bfyx_gemm_like",1], + "13225520357177380691": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "1817929353109443200": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1146419220317481042": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18215260982292770252": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "14066660382918185188": 
["convolution_gpu_bfyx_os_iyx_osv16",602], + "14381420852659789698": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "13140527131098422428": ["convolution_gpu_bfyx_gemm_like",2], + "5179760459095053114": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "17796784393519192261": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "2524233418633897945": ["convolution_gpu_bfyx_gemm_like",2], + "14548629377527143409": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7345632855842905966": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "15487538714246568015": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10569290125322858127": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7570078010521452080": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "8321769923556905957": ["convolution_gpu_bfyx_gemm_like",2], + "15861253904810475842": ["convolution_gpu_bfyx_gemm_like",2], + "953306082374100275": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "6616869272699525153": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2246205611561147645": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "18120169120088482114": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "10931533380146553429": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3524702814173574637": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "6026065914078520895": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "14011124615649605281": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "12169896916690963726": ["convolution_gpu_bfyx_gemm_like",2], + "4014667229872705228": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7688176479120305539": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3715177305271762194": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "7065121716452374910": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "10971971008143485353": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "13748207123919546925": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "14944798586094927774": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "872401732136570312": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "11333068902248367382": ["convolution_gpu_bfyx_gemm_like",2], + "5687802882700097624": ["convolution_gpu_bfyx_gemm_like",2], + "5157249499936659040": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11345101652477732928": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "1103228955716492167": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "11006013403687198405": ["convolution_gpu_bfyx_gemm_like",1], + "17337689605705740533": ["convolution_gpu_bfyx_gemm_like",1], + "17035903590837750750": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2379484884827231127": ["fully_connected_gpu_bf_io_input_spatial",1], + "6774610647537858980": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9127066823698894015": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "17261237809202428783": ["convolution_gpu_bfyx_gemm_like",2], + "16207793515276299964": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "6096189754478965440": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9224223997975166038": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "16763947298003094797": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "7777462936697576463": ["convolution_gpu_bfyx_gemm_like",2], + "16349083818768061549": ["convolution_gpu_bfyx_gemm_like",2], + "17969061908734583627": ["convolution_gpu_bfyx_gemm_like",2], + "5901470393936541758": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14173867073407110501": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "11709992724966310174": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "17602686382249457351": 
["convolution_gpu_bfyx_os_iyx_osv16",711], + "12680339228267704518": ["convolution_gpu_bfyx_os_iyx_osv16",540], + "801943727169437597": ["convolution_gpu_bfyx_gemm_like",1], + "5261762234237034874": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6258191734224827354": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "1138439260035360722": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "4239273649303286078": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "17002053020454970509": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9305957796037500628": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "5547961548101779135": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5756395349044790327": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "765085235448596225": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "12664952811642406457": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5688478347124565305": ["convolution_gpu_bfyx_gemm_like",1], + "18325123280144403295": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12491350649215984657": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16192971634546462244": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "7962383460496540840": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6383465957427680176": ["convolution_gpu_bfyx_gemm_like",2], + "13846039323711897088": ["convolution_gpu_bfyx_gemm_like",2], + "10380031655567712558": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "3336303478756453360": ["convolution_gpu_bfyx_gemm_like",2], + "4424258528650299664": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "15467064540951151390": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10036998353100219512": ["convolution_gpu_bfyx_os_iyx_osv16",928], + "6851536988434597530": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "15764181772410734606": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6218328594667952152": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "1289009275012699560": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "12725647706191463348": ["convolution_gpu_bfyx_gemm_like",1], + "12976499206227689731": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "7670176887560273910": ["convolution_gpu_bfyx_1x1",2], + "2687781952021151359": ["convolution_gpu_bfyx_gemm_like",1], + "142486914279119363": ["convolution_gpu_bfyx_os_iyx_osv16",188], + "6500666367043862023": ["convolution_gpu_bfyx_gemm_like",2], + "17970855913877771858": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "5011769546010018777": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "17106086048442658788": ["convolution_gpu_bfyx_gemm_like",2], + "7833495651619250213": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9839216696114127569": ["convolution_gpu_bfyx_gemm_like",2], + "3221221905804708596": ["convolution_gpu_bfyx_gemm_like",2], + "18268811652302076976": ["convolution_gpu_bfyx_gemm_like",2], + "15695275881213623746": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "1089679781525023551": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14211549589070739656": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "10422138282116598013": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "16981010901052181199": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "18083041911869525296": ["convolution_gpu_bfyx_gemm_like",0], + "7458923250983373160": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "14711697456265712456": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4184357870886924038": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "7727871584058599163": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9105388853296359769": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + 
"846177346130290194": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "1760830986937165861": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "2345023488044002149": ["convolution_gpu_bfyx_gemm_like",1], + "13104509059416300615": ["convolution_gpu_bfyx_os_iyx_osv16",679], + "5175845410753897614": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "10743138314323119696": ["convolution_gpu_bfyx_gemm_like",2], + "4560479630843098090": ["convolution_gpu_bfyx_gemm_like",2], + "7578465277886568471": ["convolution_gpu_bfyx_gemm_like",2], + "11080118408282076423": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13210604117940125947": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "13691555384698806010": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18132981365225439999": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "4854802313728023001": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "9833540739021310892": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "18133334552107213128": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "6640926908025731367": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "12311901617815857033": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "11642345039270524373": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "11609821372586026178": ["convolution_gpu_bfyx_gemm_like",2], + "18259656768460999562": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "16547425454653232058": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16940359862475871276": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "3420064118559852968": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "15123868617509445149": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12061391584831995030": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "5401946420641519048": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8921169563466511475": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10476627457539425144": ["convolution_gpu_bfyx_gemm_like",2], + "5570191330195573102": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "13182965457868586949": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "8525389694584008001": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "2390769652732034937": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3935174650108042053": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "8709632541892447149": ["convolution_gpu_bfyx_gemm_like",2], + "12591586661644753936": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "3810356382905059819": ["convolution_gpu_bfyx_gemm_like",1], + "14707884854112495064": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13383524675055536682": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "16896434896068867157": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "496948821475405395": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "10509933181132310969": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13698389420396031586": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "3604379857905625467": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "3923715765392385764": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "11746829511394166662": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9604982746455852556": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "5013936351898884291": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "17633445715900116866": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12179581684777023804": ["convolution_gpu_bfyx_gemm_like",2], + "1422402723172447295": ["convolution_gpu_bfyx_gemm_like",2], + "17778706153204631930": ["convolution_gpu_bfyx_gemm_like",2], + "12930435393720466720": ["convolution_gpu_bfyx_os_iyx_osv16",367], + 
"12990341489637414845": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "17807033661138518449": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "16146350476627599543": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "11962541545116807979": ["convolution_gpu_bfyx_os_iyx_osv16",3], + "10844622369472649330": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "3499109651698979012": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "3598116387801985039": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "385046297070779752": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "14515066741400300669": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "4607013085883384144": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8739570656208259296": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "1077773457856682663": ["convolution_gpu_bfyx_gemm_like",2], + "3666268650646000870": ["convolution_gpu_bfyx_direct_10_12_16",0], + "7379959915507694400": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "7573459699367415551": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "11582534256623549131": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "16103653667647559851": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3429844423226609965": ["convolution_gpu_bfyx_gemm_like",2], + "11140613052840033128": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "11033507346101404633": ["fully_connected_gpu_fb_oi_ref",0], + "4586266886779200588": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "18191573176587760698": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "7873648177300629037": ["convolution_gpu_bfyx_gemm_like",2], + "7386836350136973872": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4476037346005841003": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13758938418512211194": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "3154903035376733831": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2526832080529662683": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "2995957440356398418": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17829983167337875463": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15989164585998175871": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "4569338575782832784": ["convolution_gpu_bfyx_gemm_like",1], + "8289989008260635006": ["convolution_gpu_bfyx_gemm_like",2], + "948917645960296825": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "3499243120652875549": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "2538377242539785672": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "17182839667242694171": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "8116504545035982006": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "374553246608550876": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "9010159579786049147": ["convolution_gpu_yxfb_yxio_b16",2], + "11856266545854830143": ["convolution_gpu_bfyx_gemm_like",2], + "9003196270667188479": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3006979228759768702": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "3041752019114501584": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "7717602860943327535": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "12716923819769400487": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "15426960908024585800": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "17516369849823844076": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6919081291036849635": ["convolution_gpu_bfyx_gemm_like",1], + "14108091242461324109": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "11113256687741667688": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "7032373341094904961": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15452996816194024433": 
["convolution_gpu_bfyx_os_iyx_osv16",226], + "18044455700176500102": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5983808817108775912": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "12971822824884826169": ["convolution_gpu_bfyx_gemm_like",2], + "17443356777503458523": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15178327647765537565": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "4766447533088048613": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "18193831330827252971": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13348329768178411596": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14746516289087513444": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "10250378821078082800": ["convolution_gpu_bfyx_gemm_like",2], + "8329846097322076175": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "16797936364395702812": ["convolution_gpu_bfyx_gemm_like",2], + "9559533345689069514": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "4396653960950462197": ["convolution_gpu_bfyx_gemm_like",2], + "16968664807495872526": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2044363708106765326": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15168098632351740923": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "16688500506096347178": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "9410978119783758141": ["convolution_gpu_bfyx_gemm_like",2], + "17521647426452186921": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "12836639380579091509": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "4747159205186229582": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "4820628266094118650": ["convolution_gpu_bfyx_gemm_like",0], + "18232408112396439386": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "15739274921308457528": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "17325129240374428839": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "15678329601718218341": ["convolution_gpu_bfyx_os_iyx_osv16",641], + "11863623794400366834": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "6948606378949354116": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "10436819182310112786": ["convolution_gpu_bfyx_os_iyx_osv16",807], + "9605161323000741578": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "13272818502368975319": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13787155972060672772": ["convolution_gpu_bfyx_gemm_like",2], + "15051114821536746998": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8713776440298790672": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "6290180140047520382": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18153597620760635012": ["convolution_gpu_bfyx_os_iyx_osv16",717], + "2506154888542197909": ["convolution_gpu_bfyx_gemm_like",1], + "311255514995417672": ["convolution_gpu_bfyx_gemm_like",2], + "8124736388338424498": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "3119235799568225015": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "17108987360340581555": ["fully_connected_gpu_bf_io_input_spatial",1], + "12985650543127289023": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "276407276027553756": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16404059675217592817": ["fully_connected_gpu_fb_oi_ref",2], + "13185831669530779595": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "9524303276541517389": ["convolution_gpu_bfyx_gemm_like",2], + "16073578125651112218": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "9905716283229191208": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9481675228591993785": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "2423754482456771339": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "8390953788659916133": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "11529876081402974396": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "2830019939638455400": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "178353385245384751": ["convolution_gpu_bfyx_gemm_like",2], + "10049571207493913006": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "16228026045292341333": ["convolution_gpu_bfyx_gemm_like",1], + "12046638414686283134": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1354230973143520455": ["convolution_gpu_bfyx_gemm_like",0], + "13699740641705514374": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "2349007644347065353": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "1889171157980977747": ["convolution_gpu_bfyx_gemm_like",2], + "14996839491874598555": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "13931470674812510958": ["convolution_gpu_bfyx_gemm_like",1], + "9404677451270692749": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17176310030469904708": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10797908931694274013": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "4049386115353229125": ["convolution_gpu_bfyx_gemm_like",2], + "12960666483922103702": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "2212821435607151031": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "17693518538833606792": ["convolution_gpu_bfyx_os_iyx_osv16",338], + "14486903620614795721": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "12495153386758666911": ["convolution_gpu_bfyx_gemm_like",2], + "3856394004079548211": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "7974918595373182037": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "14605107834931199380": ["convolution_gpu_bfyx_gemm_like",2], + "12705054744767500423": ["fully_connected_gpu_fb_oi_ref",2], + "9707630588260222630": ["convolution_gpu_bfyx_gemm_like",2], + "17674340174982758744": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "10982693252072682414": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "15115440616185035720": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "3504421925108785018": ["convolution_gpu_bfyx_gemm_like",1], + "977617597166653416": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8146906136296114696": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13439896617880328331": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "12427258337646070422": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10174752213614931877": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4184940877670248246": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5060817429317741254": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "6341197991729122563": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "11992625045241269569": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "11897886369869427808": ["convolution_gpu_bfyx_gemm_like",2], + "13522405005274414664": ["convolution_gpu_bfyx_gemm_like",2], + "16062811901668074268": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "9019451572520595738": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "9259437778054905599": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "6483208845600234755": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "6074997181157712886": ["convolution_gpu_bfyx_gemm_like",2], + "10898210758890334465": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "12381377111003298809": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7877256119877423528": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "9454457647272059910": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "15713964605078748923": ["convolution_gpu_bfyx_gemm_like",2], + "2387628682187438903": ["convolution_gpu_bfyx_os_iyx_osv16",376], + 
"7089077910858800239": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "18432787283148809023": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "689445825453914111": ["convolution_gpu_bfyx_gemm_like",2], + "6964180083696019970": ["convolution_gpu_bfyx_gemm_like",2], + "2038505773698938555": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "1588995902283491029": ["convolution_gpu_bfyx_gemm_like",2], + "12421707187947291166": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "1772363899841601255": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "1350402181555441235": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "12809199739984715013": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "9899242398980336120": ["convolution_gpu_bfyx_gemm_like",1], + "3177362994630209421": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "15378025640603637387": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "17006133396401462698": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "7527175223662342321": ["convolution_gpu_bfyx_gemm_like",1], + "13503608041359512": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "3621424752591567930": ["convolution_gpu_bfyx_gemm_like",2], + "5675497261720118479": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7356440848422235031": ["convolution_gpu_bfyx_gemm_like",1], + "6557338279391882446": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "4186140878816408491": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "1062464852330435815": ["convolution_gpu_bfyx_gemm_like",2], + "6161072079255825074": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7881187047171099732": ["convolution_gpu_bfyx_gemm_like",2], + "4197617702037834389": ["convolution_gpu_bfyx_os_iyx_osv16",1091], + "2012181953284568566": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "232917916392453671": ["convolution_gpu_bfyx_gemm_like",2], + "2281043373250691228": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "6213386558868267629": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "9181466280310872332": ["convolution_gpu_bfyx_gemm_like",2], + "11860902750907076009": ["convolution_gpu_bfyx_gemm_like",1], + "17094948685292534952": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "15641537661939240413": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "9680288044487406977": ["convolution_gpu_bfyx_os_iyx_osv16",1054], + "17433689016343629925": ["convolution_gpu_bfyx_os_iyx_osv16",346], + "3259455156773630257": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "8751367574402839332": ["convolution_gpu_bfyx_os_iyx_osv16",281], + "4578587579993676820": ["convolution_gpu_bfyx_gemm_like",1], + "16415344078703911571": ["convolution_gpu_bfyx_gemm_like",2], + "17178808153714023980": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4399656162365214694": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "15778476379845872053": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4407164552309929507": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "14915908231779912828": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "8316011587868622301": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "14621327324047759584": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "13353123037511986804": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "15958017891397409552": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7782443708015375487": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "18245935804520236353": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "14487682847898298214": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "6571473790090353005": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "17753585752923130911": 
["convolution_gpu_bfyx_os_iyx_osv16",340], + "5172823024549700279": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13336576524443897680": ["convolution_gpu_bfyx_direct_10_12_16",2], + "582386337144876096": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "104165137500939902": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3980754726678047241": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "2146633923143071497": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "18383733736250135501": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7104756264011682902": ["convolution_gpu_bfyx_gemm_like",2], + "17915846724151945664": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "5073980187181521102": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "3561366509539440079": ["convolution_gpu_bfyx_gemm_like",2], + "7399775379344444344": ["convolution_gpu_bfyx_os_iyx_osv16",1045], + "16190949264253468961": ["convolution_gpu_bfyx_gemm_like",1], + "11117529413698667591": ["convolution_gpu_bfyx_gemm_like",2], + "11164600098693999456": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12293786134765875615": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "1116274074896622552": ["convolution_gpu_bfyx_os_iyx_osv16",240], + "2947060249866633912": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "8789802900075401620": ["convolution_gpu_bfyx_os_iyx_osv16",176], + "6391847213494189692": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2653651564133701304": ["convolution_gpu_bfyx_os_iyx_osv16",1098], + "2198278382394812839": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "7282595712912388754": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "9553032671453999824": ["convolution_gpu_bfyx_os_iyx_osv16",941], + "4519609440668743423": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "6458124573210430792": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "9300767936311837876": ["convolution_gpu_bfyx_gemm_like",0], + "621272125402238670": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "11704369548723383645": ["convolution_gpu_bfyx_gemm_like",0], + "7838176322738051195": ["convolution_gpu_bfyx_gemm_like",1], + "4883106423598271822": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "6103824715103416420": ["convolution_gpu_bfyx_gemm_like",2], + "15649927926091502215": ["convolution_gpu_bfyx_os_iyx_osv16",435], + "2789137853864057385": ["convolution_gpu_bfyx_gemm_like",1], + "17659601542171299562": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "8104309105061227444": ["convolution_gpu_bfyx_os_iyx_osv16",514], + "6789547098653828902": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "16132186023443894579": ["convolution_gpu_bfyx_gemm_like",2], + "2172636954267255416": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "6612243861034102250": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "8048617952947915835": ["convolution_gpu_bfyx_gemm_like",2], + "2096779676054335057": ["convolution_gpu_bfyx_gemm_like",2], + "10025839973092358719": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "17080372737840346243": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "18146184020578260553": ["convolution_gpu_bfyx_gemm_like",2], + "6904130543085920483": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13756024658546934803": ["convolution_gpu_bfyx_gemm_like",2], + "14537109978413728476": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "2235888904701517631": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "17641033958594901664": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "9110265526128628472": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "5911310036550570440": ["convolution_gpu_bfyx_gemm_like",0], + 
"288825580282908143": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "2129726780118554358": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "18092842590142527927": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "13972357557211413688": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "16342158355942808662": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "7570346182940928159": ["convolution_gpu_bfyx_gemm_like",2], + "11083777913844441475": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "18109284647478027063": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4366043672240989175": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "3718980061704064547": ["convolution_gpu_bfyx_gemm_like",1], + "7287802938269404923": ["convolution_gpu_bfyx_gemm_like",1], + "8611997227481032137": ["convolution_gpu_bfyx_os_iyx_osv16",103], + "1154469970162137785": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "16124818805329568431": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "3883845471211207871": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "4310557764929939942": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "10506079835013332412": ["convolution_gpu_bfyx_gemm_like",2], + "4163359403543480821": ["fully_connected_gpu_bf_io_input_spatial",2], + "12363462562375148101": ["convolution_gpu_bfyx_gemm_like",2], + "5408469943982199754": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8616175124735896626": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "7545013298074733778": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "12802376937099168127": ["convolution_gpu_bfyx_gemm_like",2], + "17392594284473856393": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17366007551797367227": ["convolution_gpu_bfyx_gemm_like",1], + "15247278167909654073": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "6902644989079870993": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "14523905821262502926": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "11668043528929060706": ["convolution_gpu_bfyx_gemm_like",2], + "577182964135927041": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "8906588133431586825": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11071972036962275632": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "2797723586312707948": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "14532844474906286088": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "16997897512818072938": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "10293186062391000719": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "419783127503173016": ["convolution_gpu_bfyx_os_iyx_osv16",572], + "8484380699802533068": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "16758697697363920520": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "4113935675071480884": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "7289633911925073088": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "6835280231174703662": ["convolution_gpu_bfyx_gemm_like",1], + "13603318842632052764": ["convolution_gpu_bfyx_os_iyx_osv16",390], + "11977806053733461574": ["convolution_gpu_bfyx_gemm_like",2], + "15191864907092681849": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12667014405537239093": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "8909239203149651260": ["convolution_gpu_bfyx_direct_10_12_16",0], + "14446344744130895614": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "15683344003370367509": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "17795358440179122086": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16766706479910720794": ["convolution_gpu_bfyx_gemm_like",2], + "5701438170070600512": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "3217246278485567748": 
["convolution_gpu_bfyx_gemm_like",2], + "13324157125165576832": ["convolution_gpu_bfyx_os_iyx_osv16",514], + "15897477855246170861": ["convolution_gpu_bfyx_gemm_like",2], + "7336911146060959485": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "4701832665603867798": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "17243576882981097341": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "15520716279021654196": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "17382660912493284320": ["convolution_gpu_bfyx_direct_10_12_16",0], + "10838972820886273680": ["convolution_gpu_bfyx_gemm_like",2], + "11872894645888259277": ["convolution_gpu_bfyx_os_iyx_osv16",862], + "18337975902615310907": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "1019936903773818652": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "17744551201434706388": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "13642146548740074992": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "4269447138276727632": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "2030309697153345387": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "17981604038340576961": ["convolution_gpu_bfyx_gemm_like",2], + "1018319414633271980": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "12601126285773042005": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "6722358544720547260": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "13852065717057446998": ["convolution_gpu_bfyx_gemm_like",2], + "16723949803487501587": ["convolution_gpu_bfyx_gemm_like",1], + "4858337483345561292": ["convolution_gpu_bfyx_gemm_like",2], + "5842284971563375197": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "11670430946096342056": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "10424278617647597641": ["convolution_gpu_bfyx_gemm_like",2], + "13201854669827561901": ["convolution_gpu_bfyx_gemm_like",2], + "17809920600993699808": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "12319073009094248232": ["convolution_gpu_bfyx_gemm_like",2], + "9676055912997166605": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "18402875771862490280": ["convolution_gpu_bfyx_os_iyx_osv16",318], + "1966540437574889257": ["convolution_gpu_bfyx_gemm_like",1], + "12967849866710811070": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "16352438188558979362": ["convolution_gpu_bfyx_gemm_like",1], + "6471563320494376693": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "17009318615658405230": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "14257548530334193336": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17774902969414949042": ["convolution_gpu_bfyx_gemm_like",1], + "7032409836645019505": ["convolution_gpu_bfyx_gemm_like",2], + "4499586349553581439": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "16853250891250756537": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "7520300815632157008": ["convolution_gpu_bfyx_direct_10_12_16",2], + "529543453251381109": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17641726060706984007": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "1962479636209947761": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "1179906398014559042": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "7271236108345900406": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13654816209891478730": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "517802466588815950": ["convolution_gpu_bfyx_gemm_like",2], + "17592646937716566803": ["convolution_gpu_bfyx_os_iyx_osv16",672], + "14068780861332616363": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "1755021778097194246": ["convolution_gpu_bfyx_gemm_like",2], + "2057345549105608748": ["convolution_gpu_bfyx_os_iyx_osv16",712], + 
"17259951372033727587": ["convolution_gpu_bfyx_gemm_like",2], + "12668149981216388765": ["convolution_gpu_bfyx_os_iyx_osv16",54], + "13418701036204748812": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10001963042016663554": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4355933224673863178": ["convolution_gpu_bfyx_gemm_like",2], + "10648332321840733110": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "10892706534058849825": ["convolution_gpu_bfyx_os_iyx_osv16",288], + "17024388383581997032": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "6955820760012983739": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "17897500485405386991": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "13205973783895006074": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "12366546292695084543": ["convolution_gpu_bfyx_os_iyx_osv16",417], + "13565027847255501776": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "16576300898841314587": ["convolution_gpu_bfyx_gemm_like",2], + "12315068368597230211": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "2230884858122788172": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "1006828591724642933": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "18356980026934328781": ["convolution_gpu_bfyx_os_iyx_osv16",658], + "16884228931101540030": ["convolution_gpu_bfyx_gemm_like",2], + "3750338655074082587": ["fully_connected_gpu_fb_io_ref",1], + "5336120047683197088": ["convolution_gpu_bfyx_gemm_like",2], + "8334832698020211623": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "17243648226968859637": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "9981938305144461962": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "4673127824919879657": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8241070786700614317": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9525853014023664813": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "17224820843490443805": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "15989730594386153813": ["convolution_gpu_bfyx_gemm_like",1], + "15603643151057665338": ["convolution_gpu_bfyx_os_iyx_osv16",1023], + "12255528292506999241": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "5077214229434392730": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "3034466284781235431": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "484412270668341493": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17489255290900178723": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "1149585571789157695": ["convolution_gpu_bfyx_gemm_like",2], + "9182897385081081193": ["convolution_gpu_winograd_6x3_s1_fused",2], + "14746900092090885770": ["convolution_gpu_bfyx_gemm_like",1], + "14242742178240625833": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "6929786386716045077": ["convolution_gpu_bfyx_gemm_like",2], + "5215755301612973095": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "6343396486660315308": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11389000759226546186": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "2564518461717467683": ["convolution_gpu_bfyx_gemm_like",1], + "17316626950179740845": ["convolution_gpu_bfyx_os_iyx_osv16",948], + "14322754320861242412": ["convolution_gpu_bfyx_os_iyx_osv16",853], + "6085098225080533278": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "5358925179582853152": ["convolution_gpu_bfyx_gemm_like",2], + "15492793021506324472": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "582360460084115077": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "15322989486222859378": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "10274587614581350261": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "3363675939515208883": 
["convolution_gpu_bfyx_os_iyx_osv16",974], + "15887938842582811165": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "4082229510324076196": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "18259787991864449280": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "1139581213977408268": ["fully_connected_gpu_fb_io_ref",0], + "7703363154993904399": ["convolution_gpu_bfyx_gemm_like",2], + "5183231560876991543": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "13762042713029963144": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12881836161162762524": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "7843498978148810586": ["convolution_gpu_bfyx_gemm_like",2], + "9451273689649467046": ["convolution_gpu_bfyx_gemm_like",2], + "10358359789382196576": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "761183183078910587": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "9501165931845934084": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "9497934813418221769": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "5485749317130402302": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "5622089373755094139": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10191238133281607150": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "14230197617570499447": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "14244966672894707129": ["convolution_gpu_bfyx_os_iyx_osv16",371], + "4577872082734403187": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "2305345466244887603": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "5222741986856655072": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "13244693761392741931": ["fully_connected_gpu_fb_io_b8_f8_vload",1], + "13941188114382863776": ["fully_connected_gpu_fb_oi_ref",1], + "14738573151275130683": ["convolution_gpu_bfyx_os_iyx_osv16",997], + "3935404533406270186": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17559685912375493682": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12541834857357563605": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "17343050785312683560": ["convolution_gpu_bfyx_gemm_like",2], + "17034122796081495259": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1660279112011537957": ["convolution_gpu_bfyx_os_iyx_osv16",1099], + "6761884403006803451": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "3621070130367713395": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2801141274570069180": ["convolution_gpu_bfyx_os_iyx_osv16",892], + "10565789595834959047": ["convolution_gpu_bfyx_gemm_like",2], + "244921290040927639": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "6680398880450269343": ["convolution_gpu_bfyx_gemm_like",2], + "583303098958523195": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "1572991986657256775": ["convolution_gpu_bfyx_os_iyx_osv16",338], + "3730238135300250205": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "2930545263523345204": ["convolution_gpu_bfyx_os_iyx_osv16",174], + "11147816119060617810": ["convolution_gpu_bfyx_os_iyx_osv16",875], + "16650590194585316886": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "1091511312740979158": ["convolution_gpu_bfyx_gemm_like",2], + "3661305534604931936": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "2763902728396558645": ["convolution_gpu_bfyx_gemm_like",2], + "13733327241591630239": ["convolution_gpu_bfyx_os_iyx_osv16",761], + "5890599002797783437": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "8882042369902399339": ["convolution_gpu_bfyx_gemm_like",2], + "18375125668176498051": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "15488550074426713959": ["convolution_gpu_bfyx_os_iyx_osv16",1099], + "12756432707088842236": 
["convolution_gpu_bfyx_direct_10_12_16",1], + "2973436171295280783": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "11192356850081328892": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "1999892441424036372": ["convolution_gpu_bfyx_os_iyx_osv16",152], + "3089303702413279458": ["convolution_gpu_bfyx_gemm_like",2], + "5973242004448142604": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "2265784112305305260": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5850218300545888277": ["convolution_gpu_bfyx_gemm_like",1], + "7915318733663535312": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "11622925573287101001": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3653156933813711765": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "10727592780669452048": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "6298422182853095672": ["convolution_gpu_bfyx_gemm_like",2], + "1561225943337590599": ["convolution_gpu_bfyx_os_iyx_osv16",506], + "11553355518677163509": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2528245911029869890": ["convolution_gpu_bfyx_gemm_like",2], + "2100387626452428743": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "7129623351507828661": ["convolution_gpu_bfyx_gemm_like",2], + "4702017956226464806": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4840004190985490064": ["convolution_gpu_bfyx_gemm_like",2], + "4356806313729405658": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "13204120207726209723": ["fully_connected_gpu_bf_io_gemm",1], + "1541754036637209097": ["convolution_gpu_bfyx_gemm_like",2], + "9134203155715293387": ["convolution_gpu_bfyx_gemm_like",2], + "4601800315090684242": ["convolution_gpu_bfyx_gemm_like",0], + "16975382270657256942": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "14151249542292579535": ["convolution_gpu_bfyx_os_iyx_osv16",970], + "14757855448502485216": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "9057158661097863887": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13787436604877398090": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "11851216776536423298": ["convolution_gpu_bfyx_os_iyx_osv16",265], + "1686420552593340731": ["convolution_gpu_bfyx_os_iyx_osv16",7], + "1852269248476496933": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10008202802779981732": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "17126714253919198029": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "4245229655273611845": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "8939520209266902800": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "14629433964319883917": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "16622402936526588344": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "3055842046969432235": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "3200047546714112402": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "9386678255270055573": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2283707846991978126": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "2627779045483019709": ["convolution_gpu_bfyx_os_iyx_osv16",93], + "8426489532875918560": ["convolution_gpu_bfyx_gemm_like",2], + "994182747184593564": ["convolution_gpu_winograd_6x3_s1_fused",2], + "13660573428614001128": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "10862735194945768250": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "12581879452540858313": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "15696910741835640150": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "17867620992288101450": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "10156210866362845661": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "8819268903800581706": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "8583431477863678969": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "15578217564714846277": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "5158468772356420379": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "16328232350072955252": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5589350202160007768": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6008613375871089139": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "7134654288295280046": ["convolution_gpu_bfyx_os_iyx_osv16",438], + "15043469350539759410": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "11327228813412934262": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "7102173884859438914": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "5379608399492828685": ["convolution_gpu_bfyx_gemm_like",1], + "17242820574559628535": ["convolution_gpu_bfyx_gemm_like",2], + "6745633232989303110": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "10885752780697269323": ["convolution_gpu_bfyx_gemm_like",1], + "7997955859883990923": ["convolution_gpu_bfyx_os_iyx_osv16",540], + "641417817126876622": ["convolution_gpu_bfyx_gemm_like",2], + "7044087204529042819": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "11066913713501760080": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16761867442537880229": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "10527256963399838405": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "3164513064874019611": ["convolution_gpu_bfyx_gemm_like",1], + "15219830328945680713": ["convolution_gpu_bfyx_gemm_like",0], + "879461985074219072": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4867937397499803072": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "16628679902327485435": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8146945902795164796": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "12409554044517232554": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "15402502830461368746": ["convolution_gpu_bfyx_gemm_like",2], + "15192024816519005250": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "1187224156936080964": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "14025496192869856801": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "5962764672151728219": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "10107951904294860034": ["convolution_gpu_bfyx_gemm_like",2], + "2649948006897488504": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "1355462205983418380": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "9419803870518687519": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "13993045680928507594": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "6754359635395225555": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "7640517221915599813": ["convolution_gpu_bfyx_gemm_like",2], + "4460838234035901102": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15846416859925768761": ["convolution_gpu_bfyx_gemm_like",2], + "10842828403850880541": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "9942726414918759892": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17846557385112426504": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "14826791706471872785": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11318404975804457466": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "12894625941923144893": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "11914756126771310827": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "6365510146855048488": ["convolution_gpu_bfyx_os_iyx_osv16",346], + "2659031931257084418": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "13291816522762326802": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14211903923555028634": 
["convolution_gpu_bfyx_os_iyx_osv16",1056], + "11060822686394981344": ["convolution_gpu_bfyx_gemm_like",2], + "6448710747704334053": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "7172604084103519563": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "5270599940168849812": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "2559310381697374321": ["convolution_gpu_bfyx_gemm_like",1], + "7904735292914337507": ["convolution_gpu_bfyx_gemm_like",2], + "17368161816774674256": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "8306337702797456793": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "14681717813022425567": ["convolution_gpu_bfyx_gemm_like",2], + "6355395905401306995": ["convolution_gpu_bfyx_gemm_like",2], + "10252930102508743294": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9519113693008246391": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "2159503178414447904": ["convolution_gpu_bfyx_gemm_like",2], + "2094546483928406874": ["convolution_gpu_bfyx_gemm_like",1], + "11273554217552152172": ["convolution_gpu_bfyx_gemm_like",1], + "12266072789949082198": ["convolution_gpu_bfyx_gemm_like",2], + "1569111625440278287": ["convolution_gpu_bfyx_gemm_like",2], + "411914986559525749": ["convolution_gpu_bfyx_direct_10_12_16",2], + "710656784939783221": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "2602209853120236226": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8632281866212611140": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "11718418772370938734": ["convolution_gpu_bfyx_os_iyx_osv16",90], + "261021128656714770": ["convolution_gpu_bfyx_os_iyx_osv16",659], + "12644942072153919043": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10483664832302187567": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "2740834366358352617": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14682537852514419239": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "18278174626712547691": ["convolution_gpu_bfyx_1x1",2], + "7817036102984218692": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7657964685067862984": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "3198726093355425150": ["convolution_gpu_bfyx_gemm_like",2], + "9322808125154719434": ["convolution_gpu_bfyx_gemm_like",0], + "13850807749756445264": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "16113766751106329485": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "10462144647439624978": ["convolution_gpu_bfyx_gemm_like",2], + "5424164608102708333": ["convolution_gpu_bfyx_gemm_like",2], + "9895036366054127607": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6730474465453860479": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "4107088111454348836": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "9339038855869763548": ["convolution_gpu_bfyx_gemm_like",1], + "15875968032394961531": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "5244441996055494170": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "7335403151694644211": ["convolution_gpu_bfyx_gemm_like",1], + "15737542477498282367": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9660587580162063066": ["convolution_gpu_bfyx_gemm_like",1], + "123132396286232401": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1682486914760867977": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "10838721873837128971": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "8145385916241200820": ["convolution_gpu_bfyx_gemm_like",2], + "17994361454416813294": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11864780937861562358": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "17172842643607718498": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "2754879558245728361": 
["convolution_gpu_yxfb_yxio_b16",2], + "15260010680436431377": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3242468066266096173": ["fully_connected_gpu_fb_oi_ref",2], + "8413117662038329068": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3723082283919334922": ["convolution_gpu_bfyx_gemm_like",1], + "11258182961445417799": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10340099951904598712": ["convolution_gpu_bfyx_gemm_like",1], + "9198752981132674942": ["convolution_gpu_bfyx_gemm_like",2], + "2467535554409643460": ["convolution_gpu_bfyx_gemm_like",1], + "2748579123295571094": ["convolution_gpu_bfyx_gemm_like",2], + "7875724726741958520": ["fully_connected_gpu_fb_oi_ref",1], + "6768451741770053089": ["convolution_gpu_bfyx_gemm_like",2], + "3046878786712386934": ["convolution_gpu_bfyx_gemm_like",2], + "9351428703239678614": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11290558687608213321": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1502236537645808646": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "12767115494378788592": ["convolution_gpu_bfyx_os_iyx_osv16",43], + "15432337846778101995": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "10049329759351957685": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17006095064160484022": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "13139625572508441980": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "1482319750326346549": ["convolution_gpu_bfyx_gemm_like",2], + "14744368497944610864": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "17602810216393274602": ["convolution_gpu_bfyx_gemm_like",2], + "8529571293598502239": ["convolution_gpu_bfyx_gemm_like",2], + "11726125778063855770": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "6275318358833298854": ["convolution_gpu_bfyx_gemm_like",2], + "10175721494218314250": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11868789283464117390": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "3527012447011885981": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "13891498649894490342": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "12568071362640409835": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "3436576388124386308": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "18369668865072009928": ["convolution_gpu_bfyx_gemm_like",1], + "10279778381617181802": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "12965552570525926289": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "318377908569897093": ["convolution_gpu_bfyx_gemm_like",2], + "889943986793446284": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "15129201859573664210": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "7198242727502284570": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7826714904736870517": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5702807185231177394": ["convolution_gpu_bfyx_gemm_like",2], + "4585891362157592384": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "8503207028307570404": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "2788116002380533417": ["convolution_gpu_bfyx_gemm_like",1], + "9514210061704584354": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15660316437768312006": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "9763310312421884308": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17357800564047774826": ["convolution_gpu_bfyx_gemm_like",2], + "15696133206063951076": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "12814676907278614920": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "28534640470354264": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "10190532901392055501": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "13659291428095454839": 
["convolution_gpu_bfyx_os_iyx_osv16",214], + "14385181780082014495": ["convolution_gpu_bfyx_gemm_like",2], + "13483407708449667171": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "7994179151788368291": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "1720791539242542292": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "14673890892774965970": ["convolution_gpu_bfyx_gemm_like",2], + "3041612155708729812": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "1641881628032037384": ["convolution_gpu_bfyx_gemm_like",1], + "12962552332511702682": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "8153567933591966877": ["convolution_gpu_bfyx_gemm_like",1], + "5912451559447635837": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "10783981060353445280": ["convolution_gpu_bfyx_os_iyx_osv16",57], + "830199932582554906": ["convolution_gpu_bfyx_gemm_like",1], + "17144223055397369799": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "12089505956882731481": ["convolution_gpu_bfyx_gemm_like",2], + "8655883535274781128": ["convolution_gpu_bfyx_gemm_like",2], + "4408772370026995920": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "13599555566632152241": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10599639229366933472": ["convolution_gpu_bfyx_os_iyx_osv16",273], + "16761856644242716357": ["convolution_gpu_bfyx_os_iyx_osv16",843], + "152263592822875549": ["convolution_gpu_bfyx_gemm_like",2], + "352808518345312040": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "11674630830833831209": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13184662326021747000": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "12854110364457722483": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "4887402175773881313": ["convolution_gpu_bfyx_gemm_like",2], + "4524347845016978037": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "4034250407843183678": ["convolution_gpu_bfyx_gemm_like",1], + "4209610989252810404": ["convolution_gpu_bfyx_gemm_like",1], + "17221958812979739319": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "2008064690158516711": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10451904743064959757": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "13728180355108851541": ["convolution_gpu_bfyx_gemm_like",2], + "3098585338129539028": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "3892873577927627992": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "14463506867389575739": ["convolution_gpu_bfyx_gemm_like",1], + "7527121935101118719": ["convolution_gpu_bfyx_gemm_like",2], + "13764532551476584909": ["convolution_gpu_bfyx_gemm_like",1], + "15287650965861631130": ["convolution_gpu_bfyx_gemm_like",2], + "5981885264666023260": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12992194515157698316": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "13926730608213207277": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13226254161087770253": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15916505622570323098": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "12166710900466116000": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6210483922262161762": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7509199936979430017": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11979910991788695837": ["convolution_gpu_bfyx_os_iyx_osv16",279], + "7833280896841707248": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6812025576584060234": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "12181607120522804433": ["convolution_gpu_bfyx_os_iyx_osv16",311], + "1878953827218615252": ["convolution_gpu_bfyx_direct_10_12_16",0], + "15227034948424983496": ["convolution_gpu_bfyx_gemm_like",1], + 
"2054895351334936744": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "15765198153800696060": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "16071723603031305677": ["convolution_gpu_bfyx_gemm_like",2], + "14281201038135286621": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "14652791434312888296": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "5553779954745929430": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "8616686489737649890": ["convolution_gpu_bfyx_os_iyx_osv16",942], + "16907043223873231356": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "9421927854269492263": ["convolution_gpu_bfyx_os_iyx_osv16",620], + "2959008804873881193": ["convolution_gpu_bfyx_gemm_like",1], + "331490096600171689": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "16202841384048331166": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "5922243230245842969": ["convolution_gpu_bfyx_gemm_like",1], + "8488789346759658706": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4810979456269693700": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "13770716520774847938": ["convolution_gpu_bfyx_gemm_like",1], + "9399994156762372761": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "4862529593282936100": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "9600125229193280365": ["convolution_gpu_bfyx_gemm_like",2], + "13337122303005980542": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "6740385846687754849": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "11682323163346544125": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "1504867045084152953": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "4995510103045767117": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14243609293683870669": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "11491172180673411322": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "8376077531098664520": ["convolution_gpu_bfyx_gemm_like",1], + "7035625231891242247": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7678168522030142454": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8841627473398015595": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "16266491618150971928": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "9493034132406318197": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "16197538586133639338": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "7505608160068471520": ["fully_connected_gpu_fb_io_ref",2], + "5482851829165191681": ["convolution_gpu_bfyx_os_iyx_osv16",641], + "14352303529756685990": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "3106922888635965020": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4886289616235149731": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14755869345266103764": ["fully_connected_gpu_fb_oi_ref",2], + "3277243911383750280": ["convolution_gpu_bfyx_gemm_like",1], + "12630173933512965589": ["convolution_gpu_bfyx_gemm_like",2], + "15485701086886851362": ["convolution_gpu_bfyx_os_iyx_osv16",923], + "11439519952236570490": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "14017106221778585861": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "2816982827037092536": ["convolution_gpu_bfyx_os_iyx_osv16",729], + "6763373100985812924": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "8737417433314100353": ["convolution_gpu_bfyx_gemm_like",2], + "2303141161423252932": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "10412748832841674068": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15083602050538795803": ["convolution_gpu_bfyx_gemm_like",2], + "11733721371402545268": ["fully_connected_gpu_fb_io_ref",1], + "2572395498687401679": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "11462462742322068863": 
["convolution_gpu_bfyx_os_iyx_osv16",707], + "11640865562390693266": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "12819626280531787705": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "2504018828500488106": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "1362540464632328798": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12791525533856308302": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "13993319023992950944": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10485534959656860449": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "9552312946391901745": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "6673690359191617215": ["fully_connected_gpu_fb_oi_ref",2], + "3120885087070223590": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6277198010392189880": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "4434505319447395291": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "13821388909343378606": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "3682813162987778705": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "10237524128771958432": ["convolution_gpu_bfyx_gemm_like",2], + "11882388384272635526": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5251771557248725731": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15929361440504489924": ["convolution_gpu_bfyx_os_iyx_osv16",424], + "9940908487812223059": ["convolution_gpu_bfyx_gemm_like",2], + "2085738943081638802": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "7111620180131341264": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "7748357850995979651": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "4830121683809417143": ["convolution_gpu_bfyx_os_iyx_osv16",929], + "16720108310653948550": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16477108783154865570": ["convolution_gpu_bfyx_gemm_like",2], + "6121673167888047110": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "12279591818557049086": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "13300022131572486202": ["convolution_gpu_bfyx_gemm_like",2], + "9492331996847106233": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17358006976602795707": ["convolution_gpu_bfyx_gemm_like",2], + "10900962238463588974": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "11717348577195224554": ["convolution_gpu_bfyx_gemm_like",2], + "5398895598407183682": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15096978026328154490": ["convolution_gpu_bfyx_gemm_like",2], + "15213473731205734586": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "15092483859565823523": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "16488426854651696706": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "14682894856346977838": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "6025872155179042054": ["convolution_gpu_bfyx_gemm_like",2], + "11452661262277158611": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3266638956600784732": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2423162087154134021": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "8827683910847407160": ["convolution_gpu_bfyx_gemm_like",2], + "13353269683286187221": ["convolution_gpu_bfyx_gemm_like",0], + "11852328241822224147": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "4593862318851730430": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "14823616678465136590": ["convolution_gpu_winograd_6x3_s1_fused",2], + "5055133356846736609": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "12972406304361050136": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "8965747921518186477": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "107527758399960384": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "1316444335300814745": 
["convolution_gpu_bfyx_os_iyx_osv16",957], + "425744529089575241": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "5303170164698694791": ["fully_connected_gpu_bf_io_gemm",2], + "16071030448801649281": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "3087295384028350107": ["convolution_gpu_yxfb_yxio_b16",2], + "3558174319433648829": ["convolution_gpu_bfyx_gemm_like",2], + "277410555520090949": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15117830538655814853": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "3991584206721185508": ["fully_connected_gpu_fb_oi_ref",1], + "12310462218432530363": ["convolution_gpu_bfyx_gemm_like",2], + "7879588938300868891": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "10869059995205753062": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "17279975778400757791": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "11831092915967558428": ["convolution_gpu_bfyx_os_iyx_osv16",344], + "14097319816812992451": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "11025455960289445816": ["convolution_gpu_bfyx_os_iyx_osv16",482], + "8215519118071138614": ["convolution_gpu_bfyx_gemm_like",2], + "91915122883128106": ["convolution_gpu_bfyx_os_iyx_osv16",364], + "15929970324703663357": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "13232269620066140073": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "11523864029587161089": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8090497202997192142": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "17113350507039887381": ["convolution_gpu_bfyx_gemm_like",2], + "13990028451169604107": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1034911525083515252": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "4217179485243909459": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8793779433658187978": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "10754321688472707825": ["convolution_gpu_bfyx_gemm_like",1], + "15217183882858251099": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "8320522112821700316": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "1396516976059964423": ["convolution_gpu_bfyx_os_iyx_osv16",752], + "8398760317387811024": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "17540928447332229457": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "13077961697656030315": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6613282637922219205": ["convolution_gpu_bfyx_gemm_like",1], + "7460672405409009037": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "1573498199681662714": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "6447357750120537934": ["convolution_gpu_bfyx_gemm_like",2], + "12014527187730671229": ["convolution_gpu_bfyx_os_iyx_osv16",549], + "6413565827738894970": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "4344644499804057502": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "8027062545185940933": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16958661630307271135": ["convolution_gpu_bfyx_gemm_like",1], + "18187262802267413585": ["fully_connected_gpu_fb_oi_ref",2], + "615833743936753727": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16894871557229780934": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "2020044486043617858": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "7264756313770306662": ["convolution_gpu_bfyx_gemm_like",2], + "4479979951990338510": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7730305811644972643": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "9350073350568836719": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "5734909305243135224": ["convolution_gpu_bfyx_gemm_like",1], + "14520482703619969447": ["fully_connected_gpu_fb_io_block_fp16",0], + 
"5103094815475470596": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10809330882739297269": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17802514063213000148": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "2947753291378607664": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4945845875046545967": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "1112828128944231163": ["convolution_gpu_bfyx_gemm_like",2], + "17025268985366223779": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "15592248516895826924": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14173804995472477932": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4624363818743696582": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "8689463522180659045": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "11907507085694711513": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "6553736978928374036": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "16566714514564722975": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "11393439616752806572": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3789890554711038921": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10682918518101379579": ["fully_connected_gpu_bf_io_input_spatial",2], + "61390148213644186": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "8234878941966364642": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15372944709956866587": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "6062246008880097669": ["fully_connected_gpu_bf_io_input_spatial",1], + "13311560756985319232": ["convolution_gpu_bfyx_gemm_like",2], + "16835545111241063900": ["convolution_gpu_bfyx_gemm_like",1], + "3853138649112340419": ["convolution_gpu_bfyx_gemm_like",2], + "7802311886554362782": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "10647227605517025377": ["convolution_gpu_bfyx_os_iyx_osv16",508], + "7606716827635769887": ["convolution_gpu_bfyx_gemm_like",1], + "789359733867650915": ["convolution_gpu_bfyx_gemm_like",1], + "6578239603654034233": ["convolution_gpu_bfyx_os_iyx_osv16",122], + "6012477132351580695": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "155988420513611659": ["convolution_gpu_bfyx_gemm_like",2], + "10025893052937028511": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "475665035119038846": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12307446289692143781": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13410178186827874638": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "16896863928108200897": ["convolution_gpu_bfyx_gemm_like",2], + "2103507679502667581": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "8295066904650070896": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "10110395703775498948": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "4774186037059137781": ["convolution_gpu_bfyx_1x1",2], + "7187734276051878356": ["convolution_gpu_bfyx_gemm_like",2], + "15858485865603722138": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "14131851237755716991": ["convolution_gpu_bfyx_gemm_like",2], + "14911763273270477925": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17089332981370803321": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "12615462894236933223": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "11026432639515866259": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "16992620579546408448": ["convolution_gpu_bfyx_os_iyx_osv16",128], + "15310138877321331399": ["convolution_gpu_bfyx_os_iyx_osv16",271], + "10111038481447198008": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7323343770209750835": ["convolution_gpu_bfyx_gemm_like",1], + "13401926003864565026": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "4161001033681779582": 
["convolution_gpu_bfyx_os_iyx_osv16",328], + "14046114605615338907": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "8403919905230540356": ["fully_connected_gpu_fb_io_ref",2], + "11757953304204716753": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "4783126652984096700": ["convolution_gpu_bfyx_os_iyx_osv16",1048], + "2912858944747613525": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "13599438824699346708": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "18057258413318190788": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7211355951470869591": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8566695253227825439": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7638626850074132214": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "8697631439739291302": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "15088446688058274991": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "1849035883815257432": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "13817553830305981296": ["convolution_gpu_bfyx_gemm_like",2], + "1605295763358374504": ["convolution_gpu_bfyx_gemm_like",2], + "3600066510593746268": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "1258881146411114485": ["convolution_gpu_bfyx_gemm_like",2], + "13797057152042581440": ["convolution_gpu_bfyx_gemm_like",1], + "14541063954080306476": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "12608653044712562811": ["convolution_gpu_bfyx_direct_10_12_16",0], + "18076121920579110076": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "4831224999851230245": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "6531171505861182429": ["convolution_gpu_bfyx_os_iyx_osv16",758], + "2660620513253264815": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "11264412030568042996": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "6093575518270471235": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "3317498303952226642": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "4264078972561407296": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "5983162283897982344": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "2838789360952219092": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "2839370555757225469": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "12465040766199807760": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "11549611099429682170": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "17869928048344193660": ["fully_connected_gpu_fb_io_ref",2], + "10462203417605590793": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "4569416043426963318": ["convolution_gpu_bfyx_gemm_like",1], + "12818953631784587919": ["convolution_gpu_bfyx_gemm_like",1], + "15622339218175336908": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "4669204329917622837": ["convolution_gpu_bfyx_gemm_like",1], + "7215460815798365056": ["convolution_gpu_bfyx_gemm_like",2], + "10650242500904186542": ["convolution_gpu_bfyx_os_iyx_osv16",371], + "5378151578014945610": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6355678392953568007": ["convolution_gpu_bfyx_gemm_like",2], + "7627882727285402176": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "4542143431130171516": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "4947961640303581107": ["convolution_gpu_bfyx_gemm_like",2], + "14670952132900619664": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "390219891876240081": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5230406405159608187": ["convolution_gpu_bfyx_os_iyx_osv16",187], + "12711366212612147422": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "3704271978133986620": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "9367157746678824712": ["convolution_gpu_bfyx_direct_10_12_16",2], 
+ "11213667690594303395": ["fully_connected_gpu_fb_io_ref",1], + "8104522072297740079": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9391102514951576629": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8109572327736409899": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9035867067423437834": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "6232596685071671579": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "1334121138243951086": ["convolution_gpu_bfyx_gemm_like",2], + "8482147530539941792": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "9287404618748313247": ["convolution_gpu_bfyx_gemm_like",2], + "8307147375351882939": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10554266898346470422": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14719871224178118299": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "9391425117463100557": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "5192552432194195116": ["convolution_gpu_bfyx_gemm_like",2], + "14335074487552883436": ["convolution_gpu_bfyx_gemm_like",2], + "8860815977851486767": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "1501328995320618233": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "11725629762660987217": ["convolution_gpu_bfyx_gemm_like",1], + "8531836171622495872": ["convolution_gpu_bfyx_gemm_like",1], + "14045907210413991971": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "8171897258557801015": ["convolution_gpu_bfyx_gemm_like",2], + "11092828091552833150": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "13142183299783041623": ["convolution_gpu_bfyx_gemm_like",1], + "8578747191812631883": ["convolution_gpu_bfyx_gemm_like",2], + "7606728651572102823": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "9052153145556623933": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "12163456975896925619": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "77073286362822723": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "755157892988514864": ["convolution_gpu_bfyx_os_iyx_osv16",199], + "7770438611007743835": ["fully_connected_gpu_fb_io_block_fp16",1], + "4347494599650425733": ["convolution_gpu_bfyx_gemm_like",1], + "10433456687054381828": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10182490653383265979": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "13948873105076070952": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "5797243082477551421": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "16358588755272162237": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "10412588668458621135": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "14089893422771228191": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "7508931961595339477": ["convolution_gpu_bfyx_gemm_like",1], + "11263725357444590346": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "15650839696475698676": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "4185477435943946730": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "12141880589558027223": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "11145411572841972268": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13473730516782884152": ["convolution_gpu_bfyx_gemm_like",0], + "14951164724050668856": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "13359643347682243944": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "2932953010695506533": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "3159147743553063163": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "18116824232149703772": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "4658091014944825771": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "15016406041863758148": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "17921616427936768657": 
["convolution_gpu_bfyx_os_iyx_osv16",634], + "14407614314124529121": ["convolution_gpu_bfyx_gemm_like",1], + "13624106485902414324": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "8870736106637803783": ["convolution_gpu_bfyx_os_iyx_osv16",385], + "384240534894352154": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "12650986929262866534": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "17514082938765137629": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1892198178635468999": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "363330365598760149": ["convolution_gpu_bfyx_gemm_like",1], + "12569856169024791306": ["convolution_gpu_bfyx_gemm_like",2], + "11091004452522208782": ["convolution_gpu_bfyx_gemm_like",2], + "17242442529374722270": ["fully_connected_gpu_fb_oi_ref",1], + "17546566148752689536": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "9649533822873928984": ["convolution_gpu_bfyx_gemm_like",1], + "7463657272687673896": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "16108759090923335184": ["convolution_gpu_bfyx_gemm_like",2], + "54975980454651672": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "1751540546502480266": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2438374917504708831": ["convolution_gpu_bfyx_gemm_like",1], + "2705534741438659581": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "4072967257556128157": ["convolution_gpu_bfyx_gemm_like",2], + "8961544327690568390": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8473962320928461448": ["convolution_gpu_bfyx_gemm_like",2], + "16170708786673864371": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "18424400171776141118": ["convolution_gpu_bfyx_gemm_like",2], + "7000326048755427076": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11298638173197050575": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "11175955260573469979": ["convolution_gpu_bfyx_os_iyx_osv16",490], + "2338535084014610258": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "346832567535597247": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "9031338938030715616": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "1200162031019105686": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "10320711719466983961": ["convolution_gpu_bfyx_gemm_like",2], + "13614921331048223116": ["convolution_gpu_bfyx_gemm_like",1], + "3526580286148537369": ["convolution_gpu_bfyx_os_iyx_osv16",1098], + "7086574330273897976": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "11062100629646715785": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14885109535362957947": ["convolution_gpu_bfyx_os_iyx_osv16",592], + "6148794431848761670": ["convolution_gpu_bfyx_gemm_like",2], + "2150326211917340956": ["convolution_gpu_bfyx_gemm_like",2], + "17967188184891337660": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "11234976958917093838": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "17419610762909854340": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2704063557078535883": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "7877332346656934022": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "13836867092941506302": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "282581251783414872": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "2053428297205345660": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "14742909697076926475": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "15882969506682501496": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "11715731071598552513": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1967810052096853804": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "340606466693982406": ["convolution_gpu_bfyx_os_iyx_osv16",143], + 
"3088402690095697589": ["convolution_gpu_bfyx_gemm_like",1], + "5294364781478821403": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "844278648549884313": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17421991623849671076": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "11686670048744589243": ["convolution_gpu_bfyx_gemm_like",2], + "11560441698542238940": ["convolution_gpu_bfyx_os_iyx_osv16",480], + "15890473622821659630": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7020743056013297476": ["convolution_gpu_bfyx_gemm_like",2], + "598390166442977699": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "879005904827468163": ["convolution_gpu_bfyx_os_iyx_osv16",13], + "11862162783632998191": ["convolution_gpu_bfyx_gemm_like",2], + "2248628426797793532": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "11868419561534906809": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "6739799137687789012": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "4463585976112702040": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "15929295825192449880": ["convolution_gpu_bfyx_gemm_like",2], + "17665874097707161453": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "4872433441839808585": ["convolution_gpu_bfyx_direct_10_12_16",0], + "9657585348407617520": ["convolution_gpu_bfyx_os_iyx_osv16",1089], + "3917482908041199389": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "18341524156838963264": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "12526417587678222534": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "10914336346597505098": ["convolution_gpu_yxfb_yxio_b16",2], + "16461809076899645037": ["convolution_gpu_bfyx_os_iyx_osv16",9], + "5509852360472061267": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17269318621094624075": ["convolution_gpu_bfyx_gemm_like",1], + "5088898934670078153": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13624969243174329965": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4634475069086874260": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "9305758766575321575": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "697609699740088622": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "11885660439698926227": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "10462797712860969072": ["convolution_gpu_bfyx_gemm_like",2], + "13551767519605460627": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4868400250190558111": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "13051342120933385671": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "16945184617367657570": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "7162701010394257343": ["convolution_gpu_bfyx_gemm_like",2], + "13660015013041074867": ["convolution_gpu_bfyx_gemm_like",2], + "5644068493155655611": ["convolution_gpu_bfyx_gemm_like",2], + "4206637285289830669": ["convolution_gpu_bfyx_gemm_like",2], + "7157499157310356912": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5930451476167223501": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2881769839926594784": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "11362244289696496732": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8938942439963723596": ["convolution_gpu_bfyx_gemm_like",1], + "220326805056361171": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "13286723666743148654": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "11992353959766718397": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "11469881811044037340": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "15112118829970177073": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "11754316727756881612": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "17683350638672326642": 
["convolution_gpu_bfyx_os_iyx_osv16",335], + "8479958930889587809": ["fully_connected_gpu_yxfb_ref",2], + "3173044753177123454": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "9220830217525628783": ["convolution_gpu_bfyx_gemm_like",2], + "7693556065684619275": ["convolution_gpu_bfyx_os_iyx_osv16",177], + "10160082844961863335": ["convolution_gpu_bfyx_os_iyx_osv16",574], + "1828547823690389920": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "13336847303794450665": ["convolution_gpu_bfyx_gemm_like",2], + "2054100643811117871": ["convolution_gpu_bfyx_gemm_like",2], + "6899658518070473523": ["convolution_gpu_bfyx_gemm_like",2], + "11931909191490706784": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "1383899865465106141": ["convolution_gpu_bfyx_gemm_like",2], + "16996022503617157059": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "3138712043201001156": ["convolution_gpu_bfyx_gemm_like",1], + "17790954200356837750": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "14786904599410885158": ["convolution_gpu_bfyx_os_iyx_osv16",92], + "1435153323458789173": ["convolution_gpu_bfyx_gemm_like",2], + "875552069535001284": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "17420288204511371476": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1650519167046658780": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "17939745299931100048": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "18137301493811026488": ["convolution_gpu_bfyx_gemm_like",1], + "2281119269283845320": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "1436830013293669148": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "2479856511929768548": ["convolution_gpu_bfyx_gemm_like",2], + "8067518815436853042": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "14098084847097251914": ["convolution_gpu_bfyx_os_iyx_osv16",176], + "15922076723067110929": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "1204640737451377030": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "1521992965089360209": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "7441139786825555264": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "12924910330295852704": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "10446500827044060319": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "2406816735581074778": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "5219399418946822456": ["convolution_gpu_bfyx_gemm_like",2], + "15891662883560480723": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "1056494963618130644": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "5825664545247017348": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6680219899975628258": ["convolution_gpu_bfyx_os_iyx_osv16",569], + "1922168904767469999": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10482582307328548806": ["convolution_gpu_bfyx_os_iyx_osv16",379], + "7671016314869993705": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1827273736951105482": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "13647773816638053437": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "17331582127656317117": ["convolution_gpu_bfyx_gemm_like",2], + "11988463489006787939": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "4457404272076798129": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3816139128011494515": ["convolution_gpu_bfyx_gemm_like",2], + "7524311370696987092": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "14704939880642470064": ["convolution_gpu_bfyx_gemm_like",1], + "12534755422857294243": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "369250798206414410": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "15675968397825708285": ["convolution_gpu_bfyx_os_iyx_osv16",194], + 
"5656623709782744241": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "8360628955300060520": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "12150384018379393131": ["convolution_gpu_bfyx_gemm_like",2], + "3930526618478171342": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "7410220112400588068": ["convolution_gpu_bfyx_gemm_like",2], + "4563529605364580848": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "13613399861925108148": ["convolution_gpu_bfyx_os_iyx_osv16",123], + "11910060331768652144": ["convolution_gpu_bfyx_gemm_like",2], + "11327867170377736609": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "7053070767227498983": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "15257886319670476581": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "68637843533109734": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1145700078649932035": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "10109431802089940590": ["convolution_gpu_bfyx_os_iyx_osv16",1001], + "9876098429582714576": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "14454927839795553295": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "8129414331584785189": ["convolution_gpu_bfyx_gemm_like",2], + "2904162348196990593": ["convolution_gpu_bfyx_gemm_like",1], + "5420766967862917815": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "1077224320045437593": ["convolution_gpu_bfyx_os_iyx_osv16",469], + "13761566845514364807": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "581553908799266285": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "13869279315296163696": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "13004055504657277105": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13140254055376365092": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "16906866971084527970": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "18156747282906367814": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "18355551625040856531": ["convolution_gpu_bfyx_gemm_like",1], + "6256217572152039230": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "10087048842366891699": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "16801553481899627402": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "4554343896877444783": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "8510044123592842725": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "9639125104707961956": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "5136111979773513341": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "5079055505117153635": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "16886045176231683312": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "12526627889432649075": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "5163641718529821203": ["convolution_gpu_bfyx_gemm_like",2], + "12707946849050970702": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "5511347850693802982": ["fully_connected_gpu_fb_io_b8_f8_vload",0], + "3806791682244402910": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "12676139447729343679": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "16450345154125804290": ["convolution_gpu_bfyx_os_iyx_osv16",197], + "1616603916015535857": ["fully_connected_gpu_bf_io_input_spatial",1], + "2322559721899919275": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "16103943009195163681": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "5312140481706133684": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "15777551868644801538": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "12796777049340516563": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5385395378424322451": ["convolution_gpu_bfyx_gemm_like",2], + "4917595053453614536": ["convolution_gpu_bfyx_os_iyx_osv16",229], + 
"6537576410448334203": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "3768977479127609228": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "15640202505592598653": ["convolution_gpu_bfyx_gemm_like",2], + "10857084376518292379": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10551742525038893508": ["convolution_gpu_bfyx_os_iyx_osv16",1113], + "270573524496930135": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "578703329577922869": ["convolution_gpu_bfyx_os_iyx_osv16",190], + "1104098779103065492": ["convolution_gpu_bfyx_gemm_like",1], + "8243230863677884952": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "17302671258991071440": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "3304589333915676807": ["convolution_gpu_bfyx_gemm_like",2], + "8655525088525612583": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "15711618559677233865": ["convolution_gpu_bfyx_gemm_like",2], + "7813041847979170166": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17184638213817814424": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "6432519735121751346": ["convolution_gpu_bfyx_gemm_like",1], + "11546295514640813785": ["convolution_gpu_bfyx_gemm_like",2], + "13959998803881264899": ["convolution_gpu_bfyx_gemm_like",2], + "481328129206881674": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "14670068483447729857": ["convolution_gpu_winograd_6x3_s1_fused",2], + "18174857480705846286": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "11409066626289209846": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1128944012801956636": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "9397711809671506538": ["convolution_gpu_bfyx_os_iyx_osv16",106], + "12386930130408773521": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "3027775502561362722": ["convolution_gpu_bfyx_gemm_like",1], + "2778141440914991349": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "18379763351534914922": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "15781622938833984014": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8127853538569353431": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "11728824117049687850": ["convolution_gpu_bfyx_gemm_like",2], + "1766961036311612128": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "6261121070004228939": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "3191047205441946466": ["convolution_gpu_bfyx_gemm_like",2], + "5629582391075745771": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "10392297152843428925": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "14741012384358891350": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "7953340333870774815": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5805383505505929391": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "6262190151863459214": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "8108939799996498955": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "11223947043157461994": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2352142833866194508": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "10784073615329190425": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "4370628494554426971": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "11539652577193034099": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "17517495652165026573": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "11622271315873664622": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "6525052296614701517": ["convolution_gpu_bfyx_gemm_like",1], + "3300655231758263066": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "13802834658447955377": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "16170237673140354764": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "11054953301882177295": 
["convolution_gpu_bfyx_os_iyx_osv16",606], + "14159293183840880884": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "7311120574972466702": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "16865879032845300007": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "3889519976910355277": ["fully_connected_gpu_bf_io_input_spatial",1], + "5609922876429907954": ["convolution_gpu_bfyx_gemm_like",2], + "8415763978601237333": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "3638987901025418036": ["convolution_gpu_bfyx_gemm_like",1], + "6674643031068271417": ["convolution_gpu_bfyx_direct_10_12_16",2], + "470101933740495567": ["convolution_gpu_bfyx_gemm_like",2], + "5219818570070061892": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16453041919970581620": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "4660288622381620227": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "1452597292381229708": ["convolution_gpu_winograd_6x3_s1_fused",2], + "18171940644650760608": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "310584224049735004": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "1006721963560645335": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17843570854284772921": ["convolution_gpu_bfyx_gemm_like",2], + "10632020369698615114": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "8863731258634577277": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "6418748992581951435": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "17160915544701715607": ["convolution_gpu_bfyx_os_iyx_osv16",456], + "2215533237231530097": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11208787273440167590": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "5906712613621491207": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "7289594989625385620": ["convolution_gpu_bfyx_gemm_like",0], + "2191939052196737757": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "5343186686923330871": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "15228614030349540878": ["convolution_gpu_bfyx_gemm_like",2], + "13083412418930786217": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15381551674482810230": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "5955810688179557560": ["convolution_gpu_bfyx_gemm_like",2], + "8509882139595784161": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "11357813056434049302": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "17230103497915224469": ["convolution_gpu_bfyx_os_iyx_osv16",362], + "8337457116169698090": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10222020393925339442": ["convolution_gpu_bfyx_gemm_like",2], + "11446745541571732900": ["convolution_gpu_winograd_6x3_s1_fused",2], + "11070620435959083971": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "18190085718345933756": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "2043990557089419633": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9679023228597590356": ["convolution_gpu_bfyx_gemm_like",2], + "16748743818537812349": ["convolution_gpu_bfyx_gemm_like",2], + "18167100055915766856": ["convolution_gpu_bfyx_gemm_like",2], + "9400558994532871122": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "713121569924250372": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "14659204578478669831": ["convolution_gpu_yxfb_yxio_b16",2], + "10784905418636316601": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "2270733937722366926": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "15563546888345388359": ["convolution_gpu_bfyx_gemm_like",2], + "16935426150666181858": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "1545105800386716684": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "3240428557350945267": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "4272417312859966238": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12018398218876712811": ["convolution_gpu_bfyx_gemm_like",1], + "2324120381399737261": ["convolution_gpu_bfyx_os_iyx_osv16",522], + "13947140171097868740": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "709835724029986012": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "5864250949922222051": ["convolution_gpu_bfyx_os_iyx_osv16",1043], + "3811325657214369711": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "16559140502701231107": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9608148784787572220": ["convolution_gpu_bfyx_gemm_like",1], + "9810904714798127155": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4208026832369242882": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2008999755215725290": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "6878922067845522655": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "3012566432840424198": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "13297875917250935192": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "7848121247546147821": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "5353170440534073482": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "16582080251500644069": ["convolution_gpu_bfyx_gemm_like",2], + "16715151641337602113": ["convolution_gpu_bfyx_gemm_like",2], + "9289375071420565548": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "70244312667395170": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "3332334993503432420": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "6854611304056079417": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "16385712633367611786": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "17626938391567407401": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "5933483880333895572": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "8942548644169090240": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "13810995219720233595": ["convolution_gpu_bfyx_gemm_like",2], + "9883719542550391149": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "5170245731599664670": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "11098189888598804624": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "5688161172644782612": ["convolution_gpu_bfyx_gemm_like",2], + "9979259596137305973": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "4597954342704466825": ["convolution_gpu_bfyx_gemm_like",1], + "12325592439309417414": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "6303682540621797774": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "6491772898618671653": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3069726952591207961": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "8101977280003030465": ["convolution_gpu_bfyx_os_iyx_osv16",510], + "17355826643208208691": ["convolution_gpu_bfyx_gemm_like",2], + "10046663998164493552": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "10117784802089387496": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "15334195300678132907": ["fully_connected_gpu_bf_io_gemm",2], + "9937387440035377216": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "9954853231955573552": ["convolution_gpu_bfyx_1x1",2], + "3594327736281012643": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "5433618404351968121": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13441117085490814804": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "10168272404395268951": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "17016846635668370921": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17508987219281192918": ["convolution_gpu_bfyx_gemm_like",2], + "8964252048679144533": ["convolution_gpu_bfyx_gemm_like",2], + 
"8260130048649729185": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "17190698921280188790": ["convolution_gpu_bfyx_gemm_like",2], + "13973028408397200796": ["convolution_gpu_bfyx_os_iyx_osv16",806], + "116291934148608396": ["convolution_gpu_bfyx_os_iyx_osv16",714], + "10906417366145323499": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "3953213564511738847": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "13358283026528078900": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "2173649669339714890": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "12517838703662330663": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "15509845164085518352": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "10397253349562394184": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "1552088062654417187": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "17286180622990393912": ["convolution_gpu_bfyx_gemm_like",2], + "15126660425728872065": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "8035084960535483680": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "4239415134522959352": ["convolution_gpu_bfyx_gemm_like",2], + "844742962836593299": ["convolution_gpu_bfyx_os_iyx_osv16",250], + "7505966294864890221": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "8306931146242110738": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "11674725184029885494": ["convolution_gpu_bfyx_gemm_like",1], + "6245361626768537926": ["convolution_gpu_bfyx_gemm_like",2], + "2235210915304938149": ["convolution_gpu_bfyx_gemm_like",2], + "10691347880912431064": ["convolution_gpu_bfyx_gemm_like",2], + "17610648476343170476": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "10309083227104422150": ["convolution_gpu_bfyx_os_iyx_osv16",892], + "15609627722687211129": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "8549811622247170014": ["fully_connected_gpu_fb_oi_ref",2], + "11362615856022848825": ["convolution_gpu_yxfb_yxio_b16",2], + "16323870023648254366": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "15118142492742177336": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "8860685325047463026": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "14545322358931928911": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "7483972013701858698": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7584912988728072414": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "139367204458861048": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "8264178890341675354": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "8650948093564284852": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "9358320688298379206": ["convolution_gpu_bfyx_gemm_like",1], + "6638696743420807294": ["convolution_gpu_bfyx_gemm_like",2], + "10433541468308381909": ["convolution_gpu_bfyx_gemm_like",2], + "2507750416500565780": ["convolution_gpu_bfyx_1x1",2], + "18202222342562516071": ["convolution_gpu_bfyx_os_iyx_osv16",510], + "9068406831482072377": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "41250455178236256": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "9245770108138984525": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "863952266514375915": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "10603542859148554015": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "1372939511728986224": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5040944983588288886": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "1655841524658081889": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "7875272450497189442": ["convolution_gpu_bfyx_os_iyx_osv16",58], + "6335628260431943016": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7905503566052181015": ["convolution_gpu_bfyx_os_iyx_osv16",276], + 
"16283197954769879909": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "12015922610963701033": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "5457559128595532093": ["convolution_gpu_bfyx_gemm_like",1], + "16613907066461513431": ["convolution_gpu_bfyx_gemm_like",2], + "5902427784683046762": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "6418327009347170687": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "5989664002046950385": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "9101571410887509600": ["convolution_gpu_bfyx_gemm_like",1], + "13839075443229327158": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "6224167817672480442": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "14812010622304650503": ["convolution_gpu_bfyx_gemm_like",2], + "16934386540875904239": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "13121196588092064246": ["convolution_gpu_bfyx_gemm_like",2], + "6352796762984487375": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "4800587664660105589": ["fully_connected_gpu_bf_io_input_spatial",0], + "15963038745470172423": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "9429586951778813053": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "14729854278671832528": ["convolution_gpu_bfyx_os_iyx_osv16",234], + "1811357700607919311": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6419580456182610836": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "18434406492564982566": ["convolution_gpu_bfyx_gemm_like",2], + "8942221095468681112": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "6636049821584137799": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "8641167903508739082": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "6753857156025715321": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "2873387231297790075": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "17123153447808465303": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "5485971317082563152": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "10689303050557631712": ["convolution_gpu_bfyx_gemm_like",2], + "3416059550012678486": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "18395970344992997862": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "706049518431331645": ["convolution_gpu_bfyx_gemm_like",2], + "5556023021504556658": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2382194958531920812": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "18310667924071639899": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "15825993019555657125": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "4818231379191523896": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "907233163535348999": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "330278641539729021": ["convolution_gpu_bfyx_gemm_like",2], + "14301661367597749567": ["convolution_gpu_bfyx_gemm_like",2], + "6351924049625723579": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "10726830507311062380": ["fully_connected_gpu_fb_io_ref",2], + "2094213523530180653": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "2664944425727769475": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "16954232936536653281": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "3187628264815974849": ["convolution_gpu_bfyx_os_iyx_osv16",1032], + "999907268780362316": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "3649980610274946512": ["fully_connected_gpu_fb_io_ref",1], + "12675313398314286884": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "6089202061701179659": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "13085261987388297912": ["convolution_gpu_bfyx_gemm_like",2], + "4650645000018045553": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "12277537216735931250": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "3859314295530377028": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "578940134826172063": ["convolution_gpu_bfyx_gemm_like",2], + "433161293684647032": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "6660221471357497741": ["convolution_gpu_bfyx_gemm_like",2], + "14625389915334622267": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "15883541155556528149": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "7400370437512056636": ["convolution_gpu_bfyx_gemm_like",2], + "8665233719288454405": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "9199198661789368378": ["convolution_gpu_bfyx_gemm_like",2], + "16620032793356620588": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "12085208566397959149": ["convolution_gpu_bfyx_gemm_like",2], + "4718568664715549075": ["convolution_gpu_bfyx_gemm_like",2], + "3563614453014995411": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "4304041922043496030": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "832830374368320801": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "12672995204641007004": ["convolution_gpu_bfyx_direct_10_12_16",2], + "969746749329671447": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "16072242340501555867": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "3816979903860227798": ["convolution_gpu_bfyx_os_iyx_osv16",641], + "16256970928603738516": ["convolution_gpu_bfyx_direct_10_12_16",2], + "32035190068479388": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "16042236932298055236": ["convolution_gpu_bfyx_gemm_like",1], + "11806105193035393795": ["convolution_gpu_bfyx_gemm_like",2], + "3180320769716158201": ["convolution_gpu_bfyx_os_iyx_osv16",862], + "16037225955601275305": ["convolution_gpu_bfyx_gemm_like",1], + "3034947396960425753": ["convolution_gpu_bfyx_os_iyx_osv16",1023], + "1206646015768146562": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "17301887391757619741": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "13094313253457422444": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "3896848534552901221": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15365628642332393565": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "16131448347558322280": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "7103345484511147373": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7857909522677175325": ["convolution_gpu_bfyx_gemm_like",2], + "17542414935564676110": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "14429081455612806819": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "14039055710777697188": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "18210370419559876426": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "5617115485659763469": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "15751445344585167275": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "7025975403069487257": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "6439778526899109398": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "17287404861045114619": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "4983880246908724272": ["convolution_gpu_bfyx_os_iyx_osv16",348], + "7947870656736319919": ["convolution_gpu_bfyx_os_iyx_osv16",59], + "10716913534741102635": ["convolution_gpu_bfyx_gemm_like",1], + "14784115394395151055": ["convolution_gpu_bfyx_gemm_like",2], + "9261867808456596636": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "14642845734482478360": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "12626014184575881530": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "4178614913813882037": ["convolution_gpu_bfyx_gemm_like",2], + "8282940696864401735": ["convolution_gpu_bfyx_direct_10_12_16",2], + 
"13472532612464340803": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "10970459222330057357": ["convolution_gpu_bfyx_os_iyx_osv16",1063], + "16430562172386510259": ["convolution_gpu_bfyx_gemm_like",2], + "5524218746051008792": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "7126601602274920416": ["convolution_gpu_bfyx_gemm_like",0], + "13946367911927964830": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4108707041101687664": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "8779987507326777359": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "1059505639883914386": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3377472614945731801": ["convolution_gpu_bfyx_gemm_like",2], + "1720057192283799086": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10174616678364842740": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "2780358937598873103": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "1410512481031922864": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "13082713280504953535": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6827316954140278736": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "4243114942173293897": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "13593258537178247801": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "10917498758625273194": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "16383540667048742064": ["convolution_gpu_bfyx_gemm_like",2], + "5023609284081684300": ["convolution_gpu_bfyx_gemm_like",2], + "8837079302496539409": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "8178825467227185946": ["convolution_gpu_bfyx_gemm_like",2], + "2028273519579688266": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "15381014522874131924": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "15156836293519486753": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "14571528890474602715": ["convolution_gpu_bfyx_gemm_like",2], + "17285815901490707654": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12921171323911432795": ["convolution_gpu_bfyx_gemm_like",0], + "5018845267269043034": ["convolution_gpu_bfyx_os_iyx_osv16",1124], + "9475130054420979752": ["convolution_gpu_bfyx_gemm_like",2], + "13491221531603384511": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "16522364268583242080": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "568114041320772862": ["convolution_gpu_bfyx_gemm_like",2], + "11413890625163220846": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "2887152687927903549": ["convolution_gpu_bfyx_os_iyx_osv16",153], + "5573639264204952559": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "3961000444895975975": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17699579394941627848": ["convolution_gpu_bfyx_gemm_like",2], + "1465692634334679413": ["convolution_gpu_bfyx_gemm_like",0], + "3513523165606656242": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "13906695412889750672": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "8361403425124294653": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12022152681602871455": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10946069941293798874": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "8309889975288645282": ["convolution_gpu_bfyx_1x1",2], + "13865227850818392065": ["convolution_gpu_bfyx_os_iyx_osv16",429], + "4679070030774970232": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "11229587372764249222": ["convolution_gpu_bfyx_gemm_like",2], + "1361159591875955678": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "12277470820821378855": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "17608288706234084973": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "8671491767142900139": 
["convolution_gpu_bfyx_os_iyx_osv16",599], + "12700008320838073774": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "15822975685755664152": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "2930702812469156271": ["fully_connected_gpu_fb_io_ref",2], + "14907038741687299621": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15342520770460205985": ["convolution_gpu_bfyx_gemm_like",2], + "11795686089670429481": ["convolution_gpu_bfyx_gemm_like",2], + "17018377589252417538": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2065752819810364738": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "17039095054151625163": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2446435710311724460": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "10835598123347764626": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "304721598975479337": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "7713736987017889212": ["convolution_gpu_bfyx_os_iyx_osv16",240], + "12427490329663434604": ["convolution_gpu_bfyx_gemm_like",2], + "3436433254188539886": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "10728212277329722684": ["convolution_gpu_bfyx_gemm_like",2], + "6865406633958213363": ["convolution_gpu_bfyx_gemm_like",2], + "13550435052563656432": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "9130971535185609293": ["convolution_gpu_bfyx_gemm_like",2], + "14088382963493477342": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "7908036427091174081": ["convolution_gpu_bfyx_gemm_like",2], + "12972634653821069685": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "12245096462203481681": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "5041111302824362529": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "13681462437496627948": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "14545094765855515974": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3148053731303748054": ["convolution_gpu_bfyx_gemm_like",2], + "3036808833459559381": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3930314908786112883": ["convolution_gpu_bfyx_gemm_like",2], + "15293727142789007900": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "5585398540591396124": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "4082046235109198108": ["convolution_gpu_bfyx_gemm_like",2], + "16027853591907232537": ["convolution_gpu_bfyx_gemm_like",1], + "3166885953206195915": ["convolution_gpu_bfyx_gemm_like",2], + "4242438539626727158": ["convolution_gpu_bfyx_gemm_like",1], + "6769243149577568817": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "5724069285122500749": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "9239048433297419320": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "14269161473352876138": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "170594581804738255": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10175150090660795910": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11804035561861841621": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "13312401790608349463": ["convolution_gpu_bfyx_gemm_like",1], + "1904461959474455864": ["convolution_gpu_bfyx_gemm_like",2], + "1847170421455825520": ["convolution_gpu_bfyx_gemm_like",1], + "9660812093766156608": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16587387608532583713": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "10328182165125764988": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "11782514629636023633": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "14513925709624513868": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11455518069358829249": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "8713639086785023623": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "15652392678782222737": 
["convolution_gpu_bfyx_os_iyx_osv16",895], + "8929453032482114162": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "3191417938329385213": ["convolution_gpu_yxfb_yxio_b16",2], + "10642327923162019888": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15888454525088587794": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "10736915975072972467": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "17267132595546153629": ["convolution_gpu_bfyx_gemm_like",2], + "5078905972285278557": ["convolution_gpu_bfyx_gemm_like",2], + "5743482411668939203": ["convolution_gpu_bfyx_gemm_like",2], + "4251588408225461731": ["convolution_gpu_bfyx_gemm_like",1], + "16286085532892593349": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3625906783784771100": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "4402303539054523204": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "4642234334824303290": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "14767888121198814523": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "1760690277175249985": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "17001502418583498926": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "8749468546606972791": ["convolution_gpu_bfyx_gemm_like",2], + "3714179297375678368": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "4272784935990323993": ["convolution_gpu_bfyx_gemm_like",1], + "10205576142280465189": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17332230377845694888": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "17977676737774695825": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "13663612869789682704": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "16497757978901707098": ["convolution_gpu_bfyx_gemm_like",1], + "14853629175426765699": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "6078344073564209080": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "17287487062245049466": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "4554218761970822728": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "15531908897773912572": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6706802683366112205": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "13387602037439694372": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2490155559809645659": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "5934532691347082124": ["convolution_gpu_bfyx_gemm_like",1], + "4491694127072416122": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "8122815203088327658": ["convolution_gpu_bfyx_gemm_like",2], + "770376597027620107": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "14309292105974991733": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "685140170576742460": ["convolution_gpu_bfyx_direct_10_12_16",1], + "18169371857833455144": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "18373068999874730591": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "16491532291908469567": ["convolution_gpu_bfyx_gemm_like",1], + "17628984504073918701": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "10384537928514123040": ["convolution_gpu_bfyx_gemm_like",2], + "5646139101524964833": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "9758907700230386910": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2049445812114632861": ["convolution_gpu_bfyx_os_iyx_osv16",529], + "14910223536998380801": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "18084635102736402756": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "9968478753009937857": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "2668985670745598382": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "14433662482531248989": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8169762955969255618": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "17875492671709861777": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15011504472108164173": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13066055561434178894": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7318929661124340248": ["convolution_gpu_bfyx_gemm_like",1], + "16706244336960642883": ["convolution_gpu_bfyx_gemm_like",2], + "11636129433022017868": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "13368477378531148593": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10082542799898846504": ["convolution_gpu_bfyx_gemm_like",2], + "265378250397648692": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "12228963567837353733": ["convolution_gpu_bfyx_os_iyx_osv16",1001], + "14554225625951128811": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "15571801737237063594": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "17825953644228876369": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "16355518852513270001": ["convolution_gpu_bfyx_gemm_like",2], + "13610246822402943068": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "15939309688773899430": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4017163133829149027": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3518981281605476136": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "786418751322581924": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13002723770137829128": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "5522698342845820411": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "13671635457689276237": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1131384986902172221": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "17392347485675658099": ["convolution_gpu_bfyx_gemm_like",1], + "9241243727411869340": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16364899406120840449": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "9057036344533510776": ["convolution_gpu_bfyx_gemm_like",2], + "9559550404190168365": ["convolution_gpu_bfyx_gemm_like",2], + "18012549942299450620": ["convolution_gpu_bfyx_gemm_like",2], + "5513667102916409932": ["convolution_gpu_bfyx_gemm_like",1], + "12144421857685107073": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10526411638069090068": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "16290626406346691996": ["convolution_gpu_bfyx_os_iyx_osv16",53], + "3752278444736105763": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16998508915819714690": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "6181272224000872375": ["convolution_gpu_bfyx_gemm_like",2], + "5167141379778311462": ["convolution_gpu_bfyx_gemm_like",1], + "9599667132406949054": ["convolution_gpu_bfyx_gemm_like",2], + "2251029128552117936": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "954796765467489259": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "14298701404596322580": ["convolution_gpu_bfyx_gemm_like",2], + "1584529435111149552": ["convolution_gpu_bfyx_gemm_like",1], + "14261214737408786954": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "4366168099274266975": ["convolution_gpu_bfyx_gemm_like",1], + "14436334357815544497": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "9464448984918455020": ["fully_connected_gpu_fb_io_ref",1], + "16462862831307415504": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11758765408733113291": ["convolution_gpu_bfyx_gemm_like",1], + "2140514316203117958": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17310332946322628458": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14792528369891965810": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15465799788109255561": ["convolution_gpu_bfyx_direct_10_12_16",2], 
+ "6181308879301978465": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "3659996017773078064": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15808629700189777056": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12584692605608021657": ["fully_connected_gpu_fb_oi_ref",2], + "9042812985530274425": ["convolution_gpu_bfyx_gemm_like",2], + "7575634241190730697": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "4298629909621573311": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "12516911293946682547": ["convolution_gpu_bfyx_os_iyx_osv16",246], + "10578656188786691161": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "1086052166358768751": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8961138963663532667": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "17459500507201824299": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "12022980249970038824": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "14118838785256822389": ["convolution_gpu_bfyx_gemm_like",2], + "15411474884532403722": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "10972882561062503097": ["convolution_gpu_bfyx_os_iyx_osv16",806], + "916389941321470163": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "6100453836448514115": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "1388093734262707746": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "4349976387188497685": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "2817383483458239293": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "11929531534620071758": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "8734419426540206087": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "18026468427978643933": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "6493920223660825755": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "262113403359175565": ["convolution_gpu_bfyx_os_iyx_osv16",417], + "3086110559166474482": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "4614700272179482173": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10158184435144178161": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15515233599783472078": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "5320623021116851093": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4790599496008369129": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14592395793778583608": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "17500857407975308984": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "17078700948595127028": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "12782191856884962803": ["convolution_gpu_bfyx_gemm_like",2], + "788516646345239698": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14244541340756841557": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "13939763360217628282": ["convolution_gpu_bfyx_gemm_like",1], + "4957638663977636791": ["convolution_gpu_bfyx_gemm_like",2], + "3792945601873900927": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "7157064096682175957": ["convolution_gpu_bfyx_gemm_like",1], + "10548792624072794724": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "580936360000782237": ["fully_connected_gpu_bf_io_input_spatial",1], + "7846532542186702987": ["convolution_gpu_bfyx_gemm_like",1], + "1452841775482537260": ["convolution_gpu_bfyx_gemm_like",2], + "3882955134902442387": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "10831204282620894983": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "12090536142661253835": ["fully_connected_gpu_bf_io_gemm",1], + "2004120786408087671": ["convolution_gpu_bfyx_gemm_like",1], + "3362190082518348071": ["convolution_gpu_bfyx_gemm_like",2], + "9696588462876533517": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "14792711236336832808": 
["convolution_gpu_bfyx_os_iyx_osv16",585], + "8485845304380573432": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8399477322910720113": ["convolution_gpu_bfyx_gemm_like",0], + "879939701282942121": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "2968144776497288135": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "6345550009198921347": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "6343888265369366589": ["convolution_gpu_bfyx_os_iyx_osv16",693], + "4839357013731987873": ["convolution_gpu_bfyx_direct_10_12_16",2], + "435261825003875448": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "530491406341772040": ["convolution_gpu_bfyx_gemm_like",2], + "14566257978356851712": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "11892210755884128272": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "11327237143350479466": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "9547451431091729288": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17252689774572814142": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "7070374681687005676": ["convolution_gpu_bfyx_gemm_like",2], + "3509502334639215181": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "3547854341779526869": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "17303981366934280174": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "7246177123265734169": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "4091001168041745125": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "10118395047539851751": ["convolution_gpu_bfyx_gemm_like",2], + "11951606039079763598": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8154794217037682993": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12761366575293006784": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "941829593638869991": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "12052225815821079044": ["fully_connected_gpu_fb_io_ref",1], + "5115134711994944288": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "16482301217529090205": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "7753336153932360422": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "11561790484526369917": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "4660214425505918397": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2903075619523363020": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "40684756725622867": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "10902108166827340970": ["convolution_gpu_bfyx_gemm_like",2], + "14251848023416168295": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "10269005969451576527": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "14712137616211915593": ["convolution_gpu_bfyx_os_iyx_osv16",310], + "6821855018718422278": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "17405865057155583042": ["convolution_gpu_bfyx_gemm_like",2], + "14908665013877276517": ["convolution_gpu_bfyx_os_iyx_osv16",720], + "13775529405693629438": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8240616667079698459": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "8402396502992483524": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "201277063146140086": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "16094174852600023296": ["convolution_gpu_bfyx_os_iyx_osv16",1090], + "6067904130482758510": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "1187622888238643867": ["convolution_gpu_bfyx_gemm_like",2], + "12634802060661668222": ["convolution_gpu_bfyx_1x1",1], + "4563407231964979217": ["convolution_gpu_bfyx_gemm_like",1], + "15890492401334524258": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16164111348549092216": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "14043770215999952932": ["convolution_gpu_bfyx_gemm_like",2], + 
"2908249767551054613": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "17433037267999205350": ["convolution_gpu_bfyx_os_iyx_osv16",832], + "1890739204389692970": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "7371339724529362579": ["convolution_gpu_bfyx_gemm_like",2], + "18259001228411909210": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8296551195150971668": ["convolution_gpu_winograd_6x3_s1_fused",2], + "475043738497218394": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "12987636957813312667": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1653274345637156919": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "12994819742376207273": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "1630585964216121575": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11576182324195008022": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "14539163960605215528": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17609882667499000436": ["convolution_gpu_bfyx_gemm_like",0], + "14251403312385260177": ["convolution_gpu_bfyx_os_iyx_osv16",7], + "17868294056467093895": ["convolution_gpu_bfyx_gemm_like",2], + "5754844816339228920": ["convolution_gpu_bfyx_gemm_like",1], + "16124702296533772526": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "428659495445490820": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "14907097142953816744": ["convolution_gpu_bfyx_gemm_like",2], + "10624567684389583173": ["convolution_gpu_bfyx_os_iyx_osv16",758], + "18269685060032395235": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "8762901342272872498": ["convolution_gpu_bfyx_os_iyx_osv16",58], + "14146157492452859667": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "18118237182023167949": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "13074593348097634731": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "5335250793358473555": ["convolution_gpu_bfyx_gemm_like",1], + "10104091044601583658": ["convolution_gpu_bfyx_gemm_like",1], + "3020115657931277672": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "9549667332801021099": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15816540550252147706": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14484004336536993120": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "7122950455826378169": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "6928835003016610382": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "17208186152576814861": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "5291944277945000781": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "11433534680781300610": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "1925626127045202964": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "15421166985948480394": ["convolution_gpu_bfyx_gemm_like",1], + "4640696923527766618": ["convolution_gpu_bfyx_gemm_like",2], + "3064765745900772872": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "7744787957569714828": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10500029207807372735": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2894138412746654795": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "11599990834682830362": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "11706378390483804857": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "17873182129275583020": ["convolution_gpu_bfyx_gemm_like",2], + "7862815466573236157": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "2585767464396438954": ["convolution_gpu_bfyx_gemm_like",0], + "11275109735493317886": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "166267183356660549": ["convolution_gpu_bfyx_gemm_like",2], + "13657522194775317201": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "18404344881797725263": 
["convolution_gpu_bfyx_os_iyx_osv16",734], + "12647099325257717945": ["convolution_gpu_bfyx_gemm_like",2], + "12408889192918919210": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "15718782218800307385": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "6397841935795796056": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "3140230065585683313": ["convolution_gpu_bfyx_os_iyx_osv16",997], + "1050570995635673400": ["convolution_gpu_bfyx_gemm_like",2], + "282274448389888221": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "15354185859262170540": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "4642402648038764246": ["convolution_gpu_bfyx_gemm_like",2], + "1197184887743937394": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "5357531127711906072": ["convolution_gpu_bfyx_gemm_like",1], + "16816222375242496370": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9468542963649996822": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "14930745998253392722": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11327097771110264965": ["convolution_gpu_bfyx_os_iyx_osv16",549], + "12896159402462325805": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "17525531790109748810": ["convolution_gpu_bfyx_os_iyx_osv16",85], + "241860795253927746": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "16509472637458153234": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "1484007449719260391": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "1891216794223363114": ["convolution_gpu_bfyx_gemm_like",2], + "15847413004526420496": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "17096175733187202673": ["convolution_gpu_bfyx_gemm_like",2], + "7391591731082133842": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "9928406318940388716": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "8643089982608103149": ["convolution_gpu_bfyx_1x1",2], + "4479117540570599742": ["convolution_gpu_bfyx_gemm_like",2], + "2728956755635458379": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "4716188972902735458": ["convolution_gpu_bfyx_gemm_like",2], + "10093554313775878065": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "2597523728660247862": ["convolution_gpu_bfyx_os_iyx_osv16",538], + "4495774394017823312": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "13071545223094862275": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "6644418194983229139": ["convolution_gpu_bfyx_gemm_like",2], + "3389739049224815652": ["convolution_gpu_bfyx_gemm_like",2], + "13027039165868458729": ["convolution_gpu_bfyx_gemm_like",1], + "16831114690704826637": ["convolution_gpu_bfyx_gemm_like",1], + "8863398172720091880": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "7108596712012465804": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1907439276166837309": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10743628077362128751": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11374410888638324212": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "2786925522916317149": ["convolution_gpu_bfyx_os_iyx_osv16",761], + "4797026040899499511": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "17257466221539644081": ["convolution_gpu_bfyx_os_iyx_osv16",1115], + "16468779692009938330": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "10608496431404827757": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12138341287265949399": ["convolution_gpu_bfyx_gemm_like",1], + "1682776041247037802": ["convolution_gpu_bfyx_gemm_like",2], + "14716719350966652036": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15260448822338206631": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17509205154057032109": ["convolution_gpu_bfyx_os_iyx_osv16",861], + 
"16294825599850364701": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "12840204133991239572": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "14472187692485966933": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "3522455279376021211": ["convolution_gpu_bfyx_os_iyx_osv16",1017], + "16245760498096322525": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "12753199606413122334": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "15183511809138557392": ["convolution_gpu_bfyx_os_iyx_osv16",363], + "6527268791835193134": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "10885831773581103653": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1954052357826969119": ["convolution_gpu_bfyx_gemm_like",1], + "10916647716124396856": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "2746052215199129520": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "11583017348580874022": ["convolution_gpu_bfyx_os_iyx_osv16",1019], + "6943519872561469460": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "10774528268153772208": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "490233152678323691": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "8367989677286805427": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "12534001599784153836": ["convolution_gpu_bfyx_gemm_like",1], + "11632948358256249708": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "7982628452987720190": ["convolution_gpu_bfyx_gemm_like",2], + "7353563160591978243": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "2929980913168445753": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7727001441358508665": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "14603590053512154268": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "17308063122516317342": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "4202116155711873525": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17947613081555491099": ["fully_connected_gpu_fb_oi_ref",1], + "8963262014498730146": ["convolution_gpu_bfyx_gemm_like",1], + "3134489458855347772": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "8740196547852036537": ["convolution_gpu_bfyx_gemm_like",1], + "3503236715353689942": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "8402692278765063674": ["convolution_gpu_bfyx_os_iyx_osv16",58], + "77240414396225397": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "16081988990653666386": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "15668060723417155782": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "2460415719642436412": ["convolution_gpu_bfyx_gemm_like",1], + "8335501317577461610": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "17729546848373991614": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "7811861756798601201": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9366201112659847392": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "4196367396954155354": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3910549475873353422": ["convolution_gpu_bfyx_os_iyx_osv16",754], + "3438852523146175580": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12700957546822808929": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "3588791913550955553": ["fully_connected_gpu_fb_oi_ref",1], + "6303003639592032299": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "8467771025017377254": ["convolution_gpu_bfyx_gemm_like",2], + "14258499419905714808": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "9854440591497995284": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "9459869325970475576": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "451787079167744428": ["convolution_gpu_bfyx_os_iyx_osv16",376], + "4369680877112803848": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "10425622870001886240": 
["convolution_gpu_bfyx_gemm_like",2], + "9785114056964539323": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "17856816245251319111": ["convolution_gpu_bfyx_os_iyx_osv16",939], + "11595387512434355394": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "2995134938466176198": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "17021953651379372973": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7853648744637103420": ["convolution_gpu_bfyx_os_iyx_osv16",894], + "16181623411787179429": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11231597775940542830": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8739347545059610410": ["convolution_gpu_bfyx_gemm_like",2], + "14384062335728088286": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "4860861645314518892": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "11149782181562145291": ["convolution_gpu_bfyx_gemm_like",2], + "14178934083928811388": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "8509941319309380587": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "3102816736961785641": ["convolution_gpu_bfyx_os_iyx_osv16",997], + "6857064389795419021": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "4991419288164762786": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "12314918602191412697": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "14349335089732252796": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4793007249026943006": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "12923653434892323603": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "10625675062556386448": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "17504669611941355931": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "4248427635083216412": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "11815825155082424936": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "15325810055037682679": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "11857403052583858392": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "2328698995040390396": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "11107930597263802755": ["convolution_gpu_bfyx_gemm_like",2], + "18253784177599134876": ["convolution_gpu_bfyx_os_iyx_osv16",938], + "10127626701775288565": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1579733029852052699": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "4860019935631927113": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "3434842614653335826": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "2915165824085219545": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "12882754981683858333": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "9642229389394495047": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "17001492460236540325": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "10902747200305475466": ["convolution_gpu_bfyx_gemm_like",0], + "9987939079053625302": ["convolution_gpu_bfyx_gemm_like",2], + "850343942782057099": ["convolution_gpu_bfyx_direct_10_12_16",0], + "12371817808483211497": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12977141272959735649": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "10295400862890021635": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "13191096881934434519": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "9868561386826862471": ["convolution_gpu_winograd_6x3_s1_fused",2], + "8962502004422485576": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1691020960118022320": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "14397348576352573007": ["convolution_gpu_bfyx_gemm_like",2], + "5053369963163583573": ["convolution_gpu_bfyx_gemm_like",1], + "13447226378200557777": ["convolution_gpu_bfyx_os_iyx_osv16",576], + 
"13564654155363057485": ["convolution_gpu_bfyx_gemm_like",2], + "15496355513574200965": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "14022116362268035779": ["convolution_gpu_bfyx_gemm_like",2], + "7164580481046523192": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "17923260699148240081": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "9940300152880498818": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "5284132464580556804": ["convolution_gpu_bfyx_gemm_like",1], + "11169292427557543138": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "14841539539334726292": ["convolution_gpu_bfyx_os_iyx_osv16",171], + "2830742500858558621": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16596028606733932975": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17392732266843821039": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "10806992251978564302": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "13716836930727272782": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "13126786259906598018": ["convolution_gpu_bfyx_os_iyx_osv16",929], + "368147139706197757": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "14973411884734235059": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "269829518575229806": ["convolution_gpu_bfyx_direct_10_12_16",2], + "425222358618423500": ["convolution_gpu_bfyx_gemm_like",2], + "8212533074856783509": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "1158407843601379115": ["convolution_gpu_bfyx_gemm_like",0], + "17026348860895225619": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "13702914647519703599": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9647713236241614167": ["convolution_gpu_bfyx_gemm_like",2], + "9065894438656900887": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "15982499072593548907": ["convolution_gpu_bfyx_os_iyx_osv16",182], + "15158468970890089465": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "2116524516810466877": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "4600261954762222519": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "14116682822622440033": ["convolution_gpu_bfyx_gemm_like",2], + "13611054146745413536": ["convolution_gpu_bfyx_gemm_like",0], + "13071064509662090710": ["convolution_gpu_bfyx_gemm_like",2], + "15178012823756517910": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14702670413549232065": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1818234431954731769": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "4104945759139088078": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "17252449599613270108": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "17306482303091342504": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "13420802275377435086": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "84858894896261863": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "14893822644567136435": ["convolution_gpu_bfyx_gemm_like",2], + "10365519690439054710": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "16852690434396099861": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "1034716660124798032": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7561761907958081895": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "835367600773871252": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "17847109385592002207": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "4151997155802743451": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "5699637716202391188": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "238804705672659503": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "5340016094501559693": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "922541506531537121": ["convolution_gpu_bfyx_os_iyx_osv16",531], + 
"8529170838214082841": ["convolution_gpu_bfyx_gemm_like",2], + "6323026044750482867": ["convolution_gpu_bfyx_gemm_like",2], + "11825293922127550847": ["convolution_gpu_bfyx_gemm_like",2], + "6087091876057515304": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4398254363079659976": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "3340594153142636962": ["convolution_gpu_bfyx_gemm_like",2], + "3430998232987873998": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "17526891234501366023": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "15078379507314446744": ["convolution_gpu_bfyx_gemm_like",2], + "585914943085061885": ["convolution_gpu_bfyx_gemm_like",2], + "1172103288112689821": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "17453621319901961773": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "16995873636564597028": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "13657774210341324470": ["convolution_gpu_bfyx_gemm_like",1], + "14237815472706635543": ["convolution_gpu_bfyx_gemm_like",2], + "16767564582561837873": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "9539616823548370185": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "176148486634277377": ["convolution_gpu_bfyx_gemm_like",2], + "1891073256003809934": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "8800251965243080024": ["convolution_gpu_bfyx_gemm_like",2], + "10555597973766215754": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "10892456883214928095": ["convolution_gpu_bfyx_os_iyx_osv16",943], + "3015996171698570561": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "3219239043521617253": ["convolution_gpu_bfyx_gemm_like",2], + "6630020506382714373": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "10384416235770656262": ["convolution_gpu_bfyx_gemm_like",1], + "4732699611696731044": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "8680545947510235993": ["convolution_gpu_bfyx_os_iyx_osv16",667], + "13940433448128376511": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "7822148442995976259": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1550689033020233966": ["convolution_gpu_bfyx_gemm_like",2], + "10848407542826653699": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "6517802281521111563": ["convolution_gpu_bfyx_gemm_like",1], + "9144136375141111897": ["convolution_gpu_bfyx_gemm_like",2], + "11307721164906705899": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "1854265455057352782": ["convolution_gpu_bfyx_direct_10_12_16",2], + "604467633591545941": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "16863960779539003201": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "11240189248024145687": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "505027953105355818": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4211445170027080823": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "4406157095142118884": ["convolution_gpu_bfyx_os_iyx_osv16",1033], + "12329909110827539139": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "10062957707721107508": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15971924211584724882": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "4447895709141687848": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9192665896782282996": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "6171845068913882721": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "6641348239674215714": ["convolution_gpu_bfyx_gemm_like",2], + "544003022213487787": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "7881314798558018337": ["convolution_gpu_bfyx_gemm_like",2], + "14718143989976451689": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "2305706332728008948": ["convolution_gpu_bfyx_gemm_like",1], + 
"12451592945087000191": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "7877872008801536537": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "7354234812009979811": ["convolution_gpu_bfyx_os_iyx_osv16",860], + "1460916897832302487": ["convolution_gpu_bfyx_gemm_like",1], + "14156845527754813253": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "15659671804906879034": ["convolution_gpu_bfyx_gemm_like",2], + "14838067105091112485": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "4451257789691974239": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "1801731858063091191": ["convolution_gpu_bfyx_os_iyx_osv16",559], + "7410628771323937530": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12098146032672599222": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "3285688984628545255": ["fully_connected_gpu_fb_io_ref",1], + "6768322540857745605": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "16126210124715599267": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "15911508155433936727": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "10794662801660960189": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "8985531644129639832": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "12232696287029987946": ["convolution_gpu_bfyx_os_iyx_osv16",165], + "7396823789595001064": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15320845027635796583": ["convolution_gpu_bfyx_gemm_like",2], + "10533367671706069274": ["convolution_gpu_bfyx_gemm_like",2], + "4007319206075386920": ["convolution_gpu_bfyx_gemm_like",2], + "4614042998549572181": ["convolution_gpu_bfyx_gemm_like",2], + "13694766887442024878": ["fully_connected_gpu_fb_io_ref",2], + "9556219639756304369": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12935563359569230797": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "14269654271903961430": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7757331094141318304": ["convolution_gpu_bfyx_os_iyx_osv16",969], + "13319880343534837963": ["convolution_gpu_bfyx_gemm_like",1], + "15643053402284856082": ["convolution_gpu_bfyx_gemm_like",1], + "15101986369567160956": ["convolution_gpu_bfyx_gemm_like",2], + "10309504812060596568": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "12238674883388043717": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "7726714223809300966": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14600034178934274457": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "14248587383098743406": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "3935883681780676157": ["convolution_gpu_bfyx_os_iyx_osv16",279], + "5582896843095691256": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "11800958516083095340": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "3860603464276263676": ["convolution_gpu_bfyx_gemm_like",2], + "1117787205894124896": ["convolution_gpu_bfyx_os_iyx_osv16",720], + "3031115694124492679": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "12194352995334529714": ["convolution_gpu_bfyx_gemm_like",2], + "11058082057683584650": ["convolution_gpu_bfyx_gemm_like",2], + "5485050451156514865": ["convolution_gpu_bfyx_gemm_like",2], + "17886436103211436626": ["convolution_gpu_bfyx_gemm_like",2], + "13170031087212196468": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "12308359047798183133": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "14433939319502072879": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5134857932624749530": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "15737508945513376813": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15809072026388479729": ["convolution_gpu_bfyx_os_iyx_osv16",283], + "16601702334097258697": 
["convolution_gpu_bfyx_os_iyx_osv16",576], + "11367813096511965002": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "13546876216568825877": ["convolution_gpu_bfyx_os_iyx_osv16",622], + "3919577663893354177": ["convolution_gpu_bfyx_gemm_like",1], + "16108573960501496757": ["convolution_gpu_bfyx_gemm_like",2], + "272730229972987861": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "12136803297132972709": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "852092858392507925": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "3526198034974948081": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "7831542641855749925": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7254869458810021127": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "830147122986411443": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "2033072905537284499": ["convolution_gpu_bfyx_gemm_like",2], + "5020788604681810984": ["convolution_gpu_bfyx_os_iyx_osv16",511], + "15464714725848277081": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5185125307593023170": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "13550337096609413041": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "41672385434660942": ["convolution_gpu_bfyx_os_iyx_osv16",344], + "2933183897022161826": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "1999979442136861875": ["convolution_gpu_bfyx_os_iyx_osv16",59], + "13886526360627032217": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "4047806462440750215": ["convolution_gpu_bfyx_gemm_like",1], + "17713034180977313726": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9025790715924779508": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "12281346074445607180": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "13414375996946350733": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "17466025028296506313": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "316225690176910392": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "3005178737729927131": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "5011273172385428756": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "17715553891959228879": ["convolution_gpu_bfyx_os_iyx_osv16",99], + "10990741293315393791": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "3192332625020432602": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "2301409406426420354": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "7281661441196896385": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "5714538749435744920": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "10888435127006141874": ["convolution_gpu_bfyx_os_iyx_osv16",1099], + "18136135457402651842": ["convolution_gpu_winograd_6x3_s1_fused",2], + "11312797737791604596": ["convolution_gpu_bfyx_gemm_like",2], + "1062508357634542606": ["convolution_gpu_bfyx_direct_10_12_16",0], + "18009083375897554008": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "13036499105391951007": ["convolution_gpu_bfyx_gemm_like",2], + "13132550921538397546": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "10290107543739998181": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "16484600784717969318": ["convolution_gpu_bfyx_gemm_like",1], + "6086336348849756671": ["fully_connected_gpu_fb_io_block_fp16",0], + "10690972785852373520": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "572155668587252712": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "12576360049619146496": ["convolution_gpu_bfyx_gemm_like",2], + "16683485007140805060": ["fully_connected_gpu_yxfb_ref",2], + "9933958860597451711": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "3256940792095638732": ["convolution_gpu_bfyx_gemm_like",1], + "3032921857841371728": 
["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4428125859693766145": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9655550151067451233": ["convolution_gpu_bfyx_gemm_like",2], + "1480287432874335824": ["convolution_gpu_bfyx_os_iyx_osv16",280], + "14366252780310630703": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8333743604646422982": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2932914865200583326": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16295660312557315941": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "4561874206785244358": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "9970142663470031403": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15259825477604482502": ["convolution_gpu_bfyx_gemm_like",0], + "8939683514448064461": ["convolution_gpu_bfyx_gemm_like",2], + "11612998433409522582": ["convolution_gpu_bfyx_gemm_like",2], + "6914536960012332706": ["convolution_gpu_bfyx_gemm_like",2], + "13076343553185159307": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18086782289842715645": ["convolution_gpu_bfyx_gemm_like",1], + "2702144517025248597": ["convolution_gpu_bfyx_gemm_like",2], + "15561518067918160695": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "4805958162773855302": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "14122018505646948996": ["convolution_gpu_bfyx_gemm_like",2], + "17407904982433770732": ["convolution_gpu_bfyx_gemm_like",1], + "2623687018437195679": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5127769906401798990": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16689318540732157754": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "14038261392627717712": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "13176385389367548697": ["convolution_gpu_bfyx_gemm_like",2], + "6647969101146756031": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "8812763803467512830": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "7000486794832106857": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "7499082230554771515": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "13510598063226540077": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "9963817056423168830": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "17791773192152464021": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4171848506399696854": ["convolution_gpu_bfyx_os_iyx_osv16",371], + "7780366826820540504": ["convolution_gpu_bfyx_gemm_like",2], + "13206826317378863148": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "8787816339967963727": ["convolution_gpu_bfyx_os_iyx_osv16",997], + "8175595372513695437": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "8224143262995973449": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "17796310681498690253": ["convolution_gpu_winograd_6x3_s1_fused",2], + "4999210721703970274": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "9937641338455246118": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "8161047856682416508": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "10023279637210292010": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "13443130482173929700": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "15548971488532746290": ["convolution_gpu_bfyx_direct_10_12_16",2], + "281287280558289393": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "10413043556440687328": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "12890207857767896504": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "18157442326218165947": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "15384168056682476462": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "557778263661655803": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "14650273075211365393": 
["convolution_gpu_bfyx_gemm_like",2], + "5570311824197099845": ["convolution_gpu_winograd_6x3_s1_fused",2], + "7559892774312756176": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "14203217958874365062": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "7230623964042057933": ["convolution_gpu_bfyx_gemm_like",2], + "3761770343527826418": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "13317417676446624018": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "9416236213942870134": ["convolution_gpu_bfyx_gemm_like",2], + "12096396455109952715": ["convolution_gpu_bfyx_gemm_like",2], + "5795073619189010837": ["convolution_gpu_winograd_6x3_s1_fused",2], + "15380105196319354141": ["convolution_gpu_bfyx_gemm_like",1], + "4584970211859494304": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9423239651872522813": ["convolution_gpu_bfyx_gemm_like",1], + "6326191473779365124": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "17430593168191424639": ["convolution_gpu_bfyx_gemm_like",1], + "9516288831713776693": ["convolution_gpu_bfyx_os_iyx_osv16",272], + "15364374265752682266": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "5807196005360653656": ["convolution_gpu_bfyx_gemm_like",2], + "14037325204801680738": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "13793441296561946357": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "3170336071769787200": ["convolution_gpu_bfyx_gemm_like",2], + "13644681270630373984": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "11532872181912525509": ["convolution_gpu_bfyx_gemm_like",2], + "2652267888871336297": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "2470579932413307757": ["convolution_gpu_bfyx_gemm_like",2], + "8866736221671835567": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "14904665242518014005": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "4986977887030495943": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14537994197428038805": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "12655099960717366198": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6013434489252641471": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "1132353580998754406": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14824758036755713701": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "11198908896401597838": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "9133263538092913983": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9080269503597463911": ["convolution_gpu_bfyx_gemm_like",2], + "8620072463881015653": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "12788968383428254917": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14084855778741260863": ["convolution_gpu_bfyx_gemm_like",2], + "5185895996350118172": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2532962442388536022": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "4545501713797069587": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "2152903140704848574": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "5871082277006078841": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "5050495757462452653": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3830091089824446164": ["convolution_gpu_bfyx_gemm_like",1], + "6981537186704688907": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7005371843527735283": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "13786357802945430475": ["convolution_gpu_bfyx_os_iyx_osv16",1099], + "15314178289202641916": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "9531730330306606343": ["convolution_gpu_bfyx_os_iyx_osv16",153], + "11825205449232126827": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "2724007091383127418": ["convolution_gpu_bfyx_os_iyx_osv16",744], + 
"18242682488017822077": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "4806571630436601566": ["fully_connected_gpu_bf_io_input_spatial",0], + "6962268765187856246": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "929378940515745198": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "12610004507393467447": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "5448665190811365701": ["convolution_gpu_bfyx_os_iyx_osv16",1032], + "628191607060767879": ["convolution_gpu_bfyx_os_iyx_osv16",91], + "10236258478395201152": ["convolution_gpu_bfyx_direct_10_12_16",1], + "69439315851965666": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "12011982029561277581": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "1895945774251432343": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11102920976866402928": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13277308739029064167": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "15901675909820977223": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "4834743410195700260": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "397445657349822499": ["convolution_gpu_bfyx_direct_10_12_16",2], + "570683988452622223": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "16494581774051338901": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "3796274347773622633": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "5608447459568229694": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3819763245853861272": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "12323418436121785375": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8200094670006738584": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15550722997950669458": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15689502054035168040": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "16801078648431425148": ["convolution_gpu_bfyx_gemm_like",0], + "1995546197385478214": ["convolution_gpu_bfyx_gemm_like",2], + "9601849246293120347": ["convolution_gpu_bfyx_gemm_like",2], + "6638761803107874904": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "7351401242363888463": ["convolution_gpu_bfyx_gemm_like",2], + "1622731194539871461": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13192885349640152576": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "4129586781834275070": ["convolution_gpu_bfyx_os_iyx_osv16",655], + "15456771485750114116": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "7394217382008802567": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "4327450388326573746": ["convolution_gpu_bfyx_gemm_like",2], + "12503605837910457108": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "4445912157712391517": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "3971456598769336038": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5454796925594082324": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4229105529069729944": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15924144379094505874": ["fully_connected_gpu_fb_oi_ref",1], + "17015791782274123780": ["convolution_gpu_bfyx_gemm_like",1], + "18139055731468596187": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11806402239500046867": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "12008952324872799824": ["convolution_gpu_bfyx_gemm_like",2], + "5939121107940759940": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "10897622326486559468": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "10609980283092655115": ["convolution_gpu_bfyx_gemm_like",1], + "3557182643072772598": ["convolution_gpu_bfyx_gemm_like",1], + "5589785455223385189": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "9410125656044318792": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "18232278892738147217": 
["convolution_gpu_bfyx_os_iyx_osv16",303], + "5927467766675317093": ["fully_connected_gpu_bf_io_input_spatial",2], + "1375259485223819020": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "2101721234597882962": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10858234923346500323": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "18112958483003382733": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16129296588866116913": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "1330842758352650583": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8898910394425958745": ["convolution_gpu_bfyx_gemm_like",1], + "3314459110790355757": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15976399554094563736": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "16389826434776949524": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "12251901229904154127": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "2264520082689779253": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "8093154215631195896": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "9236621881488650027": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10965563190266380694": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "5587539329568150667": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6651097363666320726": ["convolution_gpu_bfyx_gemm_like",2], + "3831201505512446456": ["convolution_gpu_bfyx_gemm_like",1], + "17086887873464601732": ["convolution_gpu_bfyx_gemm_like",1], + "13292923826380958700": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "2511072616914149110": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10598995451755327159": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "6925829066248055368": ["convolution_gpu_bfyx_gemm_like",2], + "517601465150912854": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "251300311986835571": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "1241188741090538769": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8285478622349266483": ["convolution_gpu_bfyx_os_iyx_osv16",137], + "13384754476437374504": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "18133614045401867449": ["convolution_gpu_bfyx_gemm_like",2], + "10933247456003592661": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "5495063314176654751": ["convolution_gpu_bfyx_gemm_like",2], + "17666004363345457085": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "17771447090715962298": ["convolution_gpu_yxfb_yxio_b16",2], + "9608917563823863132": ["convolution_gpu_bfyx_gemm_like",2], + "10405183426600618231": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "1006527610094211417": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "15927212142469570269": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "8235002440285527553": ["convolution_gpu_bfyx_gemm_like",1], + "4192716493303517040": ["convolution_gpu_bfyx_gemm_like",2], + "3380653500106294036": ["convolution_gpu_bfyx_os_iyx_osv16",279], + "11453044274130869816": ["convolution_gpu_bfyx_gemm_like",2], + "9547404823672679740": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12862797248089361992": ["convolution_gpu_bfyx_gemm_like",2], + "17179123144975837983": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "2543041530639980505": ["convolution_gpu_bfyx_gemm_like",1], + "7439340221097179208": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4101449235783342476": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "5032195346490064156": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15754688305730191542": ["convolution_gpu_bfyx_gemm_like",2], + "6914775146138105785": ["convolution_gpu_bfyx_gemm_like",2], + "10085059621136526248": ["convolution_gpu_bfyx_os_iyx_osv16",886], + 
"15245792492785141641": ["convolution_gpu_bfyx_gemm_like",2], + "12892693137085610062": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "3005276417937854742": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9636232825599826837": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "4646176801168621136": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "1104489643524273315": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8131682691875884781": ["convolution_gpu_bfyx_gemm_like",1], + "3730207439375250056": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17399103575103078835": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "2310549887200001260": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "9438739171104456179": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "13491655481292956895": ["convolution_gpu_bfyx_gemm_like",1], + "11569367085498045793": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "13729951531199985382": ["convolution_gpu_bfyx_gemm_like",2], + "9421643783312790618": ["convolution_gpu_winograd_6x3_s1_fused",2], + "4792657031481471098": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "296202142406900242": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "11140864132614066113": ["convolution_gpu_bfyx_gemm_like",2], + "12072890225919159372": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17738299860390552088": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14840301687056551916": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11850332373794932468": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "16211466749116679534": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16304402386608713955": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18332090297993015499": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "3285520504090196295": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "11777373751892075391": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "95993272253183796": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "3292879092145281224": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "15642549417953837059": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "10354305663463607086": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "18160969423211875528": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "4593261844817210660": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "15240660399630429406": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3643466095681664346": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "12847879935060092791": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "17769703068450272262": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "232382233865868417": ["convolution_gpu_bfyx_gemm_like",2], + "4265693151382066296": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "12860222041026638681": ["convolution_gpu_bfyx_gemm_like",2], + "3244402155461139559": ["convolution_gpu_bfyx_gemm_like",2], + "7903891232234389925": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8479047101064948298": ["convolution_gpu_bfyx_gemm_like",0], + "7430073011895298582": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16033144151193421543": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4304943753428518690": ["convolution_gpu_bfyx_gemm_like",2], + "9756049510998074315": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "592364460086746355": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "8922463054055280800": ["convolution_gpu_bfyx_gemm_like",2], + "17116941326889312928": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "9040986180016264906": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17185089684685480638": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "12386437738920143482": 
["convolution_gpu_bfyx_direct_10_12_16",1], + "10804406975968573869": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "2338707843044884352": ["convolution_gpu_bfyx_gemm_like",2], + "10773411423039491193": ["convolution_gpu_bfyx_os_iyx_osv16",195], + "1908809004094565452": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "15097371415144491976": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "8107597524360102037": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8740183428702591218": ["convolution_gpu_bfyx_os_iyx_osv16",845], + "596934040273798962": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18067353229273804720": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "8469338060514215816": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "1202020283576886284": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "11261619081095309088": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2842103889477438816": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "9573589861499897842": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "11459784003592366395": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17517541283617012275": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "1559798212423183813": ["convolution_gpu_bfyx_os_iyx_osv16",928], + "9019388470685749691": ["convolution_gpu_bfyx_os_iyx_osv16",729], + "2418288192668085805": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17406383217119217230": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2916077416184925232": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "937050062571228573": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "15450609897480659306": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "7806837641999814363": ["convolution_gpu_bfyx_gemm_like",2], + "3939977982577786175": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "5635449856699664273": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "4846216894450341698": ["convolution_gpu_bfyx_gemm_like",1], + "801864263975761712": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "18349087959351486710": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "9475812329914836280": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "8107447526839063293": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "16710010075465723498": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "14991602704357959545": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "17784882947271841103": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12348135936862667024": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "530825424084837479": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "16900305050319129555": ["convolution_gpu_bfyx_gemm_like",2], + "13951781924205611716": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "8913451832923806760": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5287076386757143976": ["convolution_gpu_bfyx_direct_10_12_16",2], + "88592091379585141": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "1838534101161814609": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11970466555294072275": ["convolution_gpu_bfyx_gemm_like",2], + "14335423820860953927": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7441199361135503715": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9819596940685093690": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "14731393773801790100": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "3012268657922581268": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10835321391911234206": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "10532500300200244159": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "16947969669087411530": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "8028456017016080468": 
["convolution_gpu_bfyx_gemm_like",1], + "14289048840489035546": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8653024334982611044": ["convolution_gpu_bfyx_os_iyx_osv16",687], + "4557272439632791722": ["convolution_gpu_bfyx_gemm_like",1], + "18134140047840716203": ["convolution_gpu_bfyx_os_iyx_osv16",1028], + "8436644625511258721": ["convolution_gpu_bfyx_gemm_like",2], + "15516674573659704770": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "4492673409319122180": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "2521821959816944292": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17703907155485973486": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "2839767407547705101": ["convolution_gpu_bfyx_gemm_like",2], + "11829442945690098558": ["convolution_gpu_bfyx_gemm_like",1], + "7132441144511706824": ["convolution_gpu_bfyx_gemm_like",0], + "5745481082184931194": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "2287356884312581209": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "4860779741225078946": ["convolution_gpu_bfyx_gemm_like",1], + "5296506025538423220": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17601171646153308079": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "14553738887970260308": ["convolution_gpu_bfyx_gemm_like",2], + "10947686124973711385": ["convolution_gpu_bfyx_os_iyx_osv16",435], + "12353956380178079089": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14763982961176216679": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "3150231129728961455": ["convolution_gpu_bfyx_gemm_like",1], + "1332624116953483870": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "10604830376938742429": ["convolution_gpu_bfyx_gemm_like",2], + "3170785962566427770": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "11809236497308682596": ["convolution_gpu_bfyx_gemm_like",2], + "8997120235555587461": ["convolution_gpu_bfyx_gemm_like",2], + "8100595788531468781": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "11067412830219638639": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "5812274221348979687": ["convolution_gpu_bfyx_os_iyx_osv16",650], + "545425355231744794": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "15239764240622554314": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "18445243511250094011": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "7351733901977025859": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "18424912460022156378": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "9311802150474489673": ["convolution_gpu_bfyx_os_iyx_osv16",548], + "796900095669815456": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "12492763342322011136": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "13767795972414139958": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "9083686317073801642": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10068502639160680134": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "7504663136669214601": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3968994333196289265": ["convolution_gpu_bfyx_gemm_like",2], + "13314092088416047551": ["fully_connected_gpu_yxfb_ref",1], + "11673569290324764842": ["convolution_gpu_bfyx_gemm_like",0], + "10306169610486701545": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "1474719104479956715": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "6496839689453807726": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "9199174367023202640": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "7959969582538910953": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13744951984978188201": ["fully_connected_gpu_fb_io_ref",0], + "6643161848623134458": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "18235067315439611192": 
["convolution_gpu_bfyx_os_iyx_osv16",754], + "12028030221272546172": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "12223993560805441284": ["convolution_gpu_bfyx_gemm_like",2], + "2451712485584835395": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13818587810073749596": ["convolution_gpu_bfyx_gemm_like",1], + "9673176853197584682": ["convolution_gpu_bfyx_gemm_like",1], + "18076129452098771655": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9765339420071627045": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "17762040448815681058": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "8995598177504756805": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "8892991171111842341": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10662239532841666965": ["convolution_gpu_bfyx_gemm_like",1], + "14420809655798184553": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "3164422950831542784": ["convolution_gpu_bfyx_gemm_like",2], + "17075150439662364176": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "11361013180071053597": ["convolution_gpu_bfyx_gemm_like",1], + "14821616804286068969": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "2150284597332493904": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "1961296939362567851": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2124033349728954551": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "17419874083634480896": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "9743806043658380623": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "7243161613448507792": ["convolution_gpu_bfyx_gemm_like",1], + "12557015880639217508": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "5401523175111660554": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "5298952273692538291": ["convolution_gpu_bfyx_gemm_like",1], + "7085416207166146240": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "3830787224073518842": ["convolution_gpu_bfyx_os_iyx_osv16",893], + "8305500373806058745": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2577413012740709678": ["convolution_gpu_bfyx_gemm_like",2], + "15134268179029323647": ["convolution_gpu_bfyx_gemm_like",2], + "4553409514380460123": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "18173314625562011976": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4055753250105853003": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "3974589991022739479": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "6860503758000008398": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "6158514925486943212": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "8131617570786904723": ["convolution_gpu_bfyx_gemm_like",2], + "7510055418609679364": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "11655994466278963438": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15912553971677187913": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "16485921493309285440": ["convolution_gpu_bfyx_gemm_like",2], + "9758759365463492505": ["convolution_gpu_bfyx_os_iyx_osv16",732], + "3290503865540626256": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "10681304359334525584": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "8631194673451861459": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "12707748441880165396": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4318632837402329958": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "3475757648408068589": ["convolution_gpu_bfyx_gemm_like",2], + "16522546805419218429": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "14634279730953549909": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "17446388159565719362": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2251572761614039612": ["convolution_gpu_bfyx_os_iyx_osv16",365], 
+ "14287890401250603057": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "4987922194420804256": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "17793292063552633023": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "5608133987357542077": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "3382494956350224120": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9747165558500755104": ["convolution_gpu_bfyx_gemm_like",2], + "15434706304418357961": ["convolution_gpu_bfyx_gemm_like",2], + "10624246057883518638": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "18106333667377667797": ["convolution_gpu_bfyx_gemm_like",2], + "6329618009202266591": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14435120971846098308": ["convolution_gpu_bfyx_os_iyx_osv16",568], + "16120159001372711511": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "1143214652021653634": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "5124241485043124110": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "14108113294744119367": ["convolution_gpu_bfyx_os_iyx_osv16",516], + "6809026385816665583": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "2174528711050181972": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4274801141127703532": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "15212317205888563836": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "380316849107383484": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14408266407898585602": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "17993337310288098038": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4586633477264151844": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "15180406256083730261": ["convolution_gpu_bfyx_os_iyx_osv16",686], + "9145357433824567384": ["convolution_gpu_bfyx_os_iyx_osv16",694], + "791937929163665770": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "5214678408335388758": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "15653223776766070604": ["convolution_gpu_bfyx_os_iyx_osv16",718], + "962676948282027870": ["fully_connected_gpu_fb_io_ref",1], + "12843856637642525155": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "5769404877199637961": ["convolution_gpu_bfyx_gemm_like",2], + "2056766012044921101": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "8780604510524622314": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "7662200927459001757": ["convolution_gpu_winograd_6x3_s1_fused",2], + "17358462939783262207": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "3658149289395969504": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "17638692805430115529": ["convolution_gpu_bfyx_gemm_like",2], + "13462726136352103466": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "4538102435488584866": ["convolution_gpu_bfyx_gemm_like",1], + "8809794528993445200": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1939140810847988694": ["convolution_gpu_bfyx_gemm_like",2], + "6133854782246597175": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "10686870945055880185": ["convolution_gpu_bfyx_gemm_like",0], + "13816104794723484993": ["convolution_gpu_winograd_6x3_s1_fused",2], + "13044020050176766314": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17152100243867367458": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "15511138074959300404": ["convolution_gpu_bfyx_gemm_like",2], + "14001920054473316909": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "14233219774448115529": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4377137812917082153": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "11494395549955384747": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "5843679089588930933": ["convolution_gpu_bfyx_os_iyx_osv16",146], + "7866128397931438774": 
["convolution_gpu_bfyx_os_iyx_osv16",240], + "5582107298039488951": ["convolution_gpu_bfyx_os_iyx_osv16",278], + "13464226348405628455": ["convolution_gpu_bfyx_os_iyx_osv16",625], + "7394848434332739139": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "8515479970005301094": ["convolution_gpu_bfyx_gemm_like",2], + "3524531620118359828": ["convolution_gpu_bfyx_os_iyx_osv16",194], + "3759057398165607194": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "6771637612965430926": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "16150934538381572916": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "40704767167309552": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "12083217714727863832": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "4368522743441422202": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "2238901105639912692": ["convolution_gpu_bfyx_os_iyx_osv16",110], + "14522844693999581518": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "12173409033330010794": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16815680874311765189": ["convolution_gpu_bfyx_gemm_like",1], + "9852052796465340830": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "12773693193167844110": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "9628735886189157469": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "8121179472578287280": ["convolution_gpu_bfyx_os_iyx_osv16",379], + "2348721939771018658": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "4833761011498696645": ["convolution_gpu_bfyx_os_iyx_osv16",41], + "11088128828863596806": ["convolution_gpu_bfyx_gemm_like",1], + "14322392426975869640": ["convolution_gpu_bfyx_gemm_like",1], + "6056291179600370019": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "11528417522960871233": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1934379409955686502": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17235360775064303316": ["convolution_gpu_bfyx_gemm_like",2], + "4568839461523224811": ["convolution_gpu_bfyx_gemm_like",2], + "5680888227752935228": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "13954144830230671601": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "3223787640285180270": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15160192060731796225": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "18125075313255528454": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11487565672628286526": ["convolution_gpu_bfyx_gemm_like",0], + "9038991914155436715": ["convolution_gpu_bfyx_gemm_like",1], + "17274625805315816028": ["convolution_gpu_bfyx_gemm_like",1], + "12278842522836720245": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "2912984501615111849": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "2363414141971004557": ["convolution_gpu_bfyx_gemm_like",1], + "10665697051755790682": ["convolution_gpu_bfyx_gemm_like",2], + "8162762980597497749": ["convolution_gpu_bfyx_gemm_like",2], + "288853243482418538": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11661214901264500438": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "17285699593273891901": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4780830855450408093": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "7504074736798125353": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11114015660322254541": ["convolution_gpu_bfyx_gemm_like",2], + "12421288552109066791": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "6623182990939010641": ["convolution_gpu_bfyx_gemm_like",2], + "5122639094068865656": ["convolution_gpu_bfyx_direct_10_12_16",2], + "423221712829930726": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "15936869458531244961": ["convolution_gpu_bfyx_os_iyx_osv16",639], + 
"11331539079347079374": ["convolution_gpu_bfyx_os_iyx_osv16",893], + "9154705094446538279": ["fully_connected_gpu_fb_oi_ref",0], + "13312514874803986753": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "9914440875772341708": ["convolution_gpu_bfyx_gemm_like",2], + "6660077021779164371": ["convolution_gpu_bfyx_gemm_like",2], + "9729771183572950642": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5740738339752793113": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5361028467247182860": ["convolution_gpu_bfyx_gemm_like",2], + "6820284286806022849": ["convolution_gpu_bfyx_gemm_like",2], + "8045697952241865861": ["convolution_gpu_bfyx_gemm_like",2], + "10876578967419315028": ["convolution_gpu_bfyx_os_iyx_osv16",756], + "1650080413259413393": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "2281832083123936555": ["convolution_gpu_bfyx_gemm_like",2], + "914589847837601900": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6400671582981760192": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18186437875509712500": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "12582321591799165205": ["convolution_gpu_bfyx_os_iyx_osv16",43], + "12518571127411736885": ["convolution_gpu_bfyx_gemm_like",1], + "9354818521586974021": ["convolution_gpu_bfyx_gemm_like",2], + "16169024543367503806": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "15993651594402422200": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "3447774474841314860": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "3805991105758534542": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4408600136502382976": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "14601915376467155290": ["convolution_gpu_bfyx_gemm_like",0], + "15140592697506341614": ["convolution_gpu_bfyx_gemm_like",1], + "6767245864232675168": ["convolution_gpu_bfyx_gemm_like",2], + "7426788519998680898": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "5321807316257768": ["convolution_gpu_bfyx_gemm_like",1], + "13629962867123974535": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "15470979879166640563": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "6065819201836017182": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9219978118417391687": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "10253092389452603623": ["convolution_gpu_bfyx_gemm_like",1], + "2950917846016525392": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "6489448536745533209": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14406070210216948643": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "15863633107759120207": ["convolution_gpu_bfyx_gemm_like",0], + "15816980369722540994": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "11000413508839562976": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "16131386739027190836": ["convolution_gpu_bfyx_gemm_like",2], + "6553565990795990748": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5352861363832390974": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "4695182996147218495": ["convolution_gpu_bfyx_os_iyx_osv16",514], + "11812216902426327523": ["convolution_gpu_yxfb_yxio_b16",2], + "5592556538784745960": ["convolution_gpu_bfyx_gemm_like",2], + "15158997684077722015": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "10672816826126184746": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9437794960375526230": ["convolution_gpu_bfyx_os_iyx_osv16",656], + "11906319144823550582": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "9781830607177020570": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "4534480875955599254": ["convolution_gpu_bfyx_direct_10_12_16",2], + "682912708716537431": ["convolution_gpu_bfyx_os_iyx_osv16",725], + 
"2346855978590136528": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "6095158932103797740": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "11997615422168828775": ["convolution_gpu_bfyx_gemm_like",2], + "14423094456821270228": ["convolution_gpu_bfyx_gemm_like",2], + "10997029728191881587": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "7279393739634103483": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7998930863626763670": ["convolution_gpu_bfyx_gemm_like",2], + "7937870623766562191": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "6526586547926160627": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "11797589297451289242": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "5271530745426214211": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "1876286132660871464": ["convolution_gpu_bfyx_gemm_like",0], + "1033385936344875354": ["convolution_gpu_bfyx_gemm_like",2], + "18043340998699622388": ["convolution_gpu_bfyx_gemm_like",2], + "3671753639665974938": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "16035563519857925932": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "6683090495189325653": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "5319668297345215520": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "7142195383189497127": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "10381752670329683275": ["convolution_gpu_bfyx_os_iyx_osv16",479], + "6634330132674952638": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "6614833247756539341": ["convolution_gpu_bfyx_os_iyx_osv16",54], + "7390751298966198773": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "13608239208821071914": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "16783619135298589974": ["convolution_gpu_bfyx_os_iyx_osv16",807], + "4474697990228400564": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8377593240579657721": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "17104611871050967957": ["convolution_gpu_winograd_6x3_s1_fused",2], + "2498920887656279332": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14577775579978745344": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10054253863699485503": ["convolution_gpu_bfyx_os_iyx_osv16",653], + "4476928353532757380": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "928757863265393904": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "17845195044080380488": ["convolution_gpu_bfyx_direct_10_12_16",0], + "9444953530704856016": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "13221156296791499146": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "1239861345413267621": ["convolution_gpu_bfyx_gemm_like",2], + "13219865669259079983": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "18172711677056449158": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "16402312692470500253": ["convolution_gpu_bfyx_gemm_like",0], + "8779164026828163571": ["convolution_gpu_bfyx_gemm_like",1], + "5094419710576598497": ["convolution_gpu_bfyx_gemm_like",2], + "14585370009659482450": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "2794704364476462562": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "562221645849170027": ["convolution_gpu_bfyx_gemm_like",2], + "1878679922772738648": ["convolution_gpu_bfyx_gemm_like",2], + "5759260743809103651": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "7878217536124016199": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "2213068950786625268": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "9875997976286355123": ["convolution_gpu_bfyx_gemm_like",1], + "13059207969254830451": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "2713481951804190325": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "17162489604305127396": 
["convolution_gpu_bfyx_os_iyx_osv16",358], + "9322011063845207679": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "17025997656996518171": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6251247460381059571": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2930848604606590505": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "4141616050120443260": ["convolution_gpu_bfyx_gemm_like",1], + "3797957937905580811": ["convolution_gpu_bfyx_os_iyx_osv16",987], + "13248567106128518549": ["convolution_gpu_bfyx_gemm_like",2], + "11555678098290364758": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "14827538610133799379": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3286476039871096924": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "3047710665820732705": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "7014674808417899328": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2656076513222828369": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "17381682740282686038": ["convolution_gpu_bfyx_gemm_like",1], + "2510919738337557939": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "14121939808880396150": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "8253823502854784432": ["convolution_gpu_bfyx_gemm_like",2], + "1373904073013943690": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "17808913959977434594": ["convolution_gpu_bfyx_gemm_like",1], + "13103537372248097713": ["convolution_gpu_bfyx_gemm_like",1], + "3706994659266083979": ["convolution_gpu_bfyx_os_iyx_osv16",547], + "9207334433308148635": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "838825600917352376": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15148625184033310404": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14705509109623500235": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "12730339458081890990": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "18443643871208996500": ["convolution_gpu_bfyx_gemm_like",2], + "12338760476079493547": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "1819720745131968914": ["convolution_gpu_bfyx_gemm_like",2], + "172584114180442549": ["convolution_gpu_bfyx_gemm_like",1], + "1923745286075356181": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "17844743590995529463": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "15241191584896579183": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "15351724241036614758": ["convolution_gpu_bfyx_os_iyx_osv16",166], + "4107186383182650542": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "15662207751131195569": ["convolution_gpu_bfyx_os_iyx_osv16",1115], + "1478419046264331178": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "3124997104810767514": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "13095408117538194584": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "7807168142899312025": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "18265901700619296616": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11213283109763090897": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2420425134749678611": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "4195847890935259046": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "4108579755980014185": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "17023103136234805388": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9823997593704517392": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "10989937450490049763": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "1663285216972929652": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "10133406610245448421": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6953499208425592115": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "6431838057506760173": 
["convolution_gpu_bfyx_os_iyx_osv16",247], + "12355112948013108181": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17551915565459110848": ["convolution_gpu_bfyx_gemm_like",2], + "3590316457726550768": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "14128599551956588603": ["convolution_gpu_bfyx_os_iyx_osv16",479], + "11049130623091275457": ["convolution_gpu_bfyx_gemm_like",2], + "2458592904274981909": ["fully_connected_gpu_bf_io_input_spatial",0], + "17825280904760131680": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2582625260054352916": ["convolution_gpu_bfyx_gemm_like",2], + "18180491232489548313": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "14980327142253281498": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6980201892073961793": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "861419637283812778": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "8529647257749011908": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13289721141799196039": ["convolution_gpu_bfyx_gemm_like",2], + "14200479385082007529": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "11533151357949131860": ["convolution_gpu_bfyx_gemm_like",2], + "7761195307416102494": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "1622880009460832832": ["convolution_gpu_bfyx_os_iyx_osv16",299], + "10966081583785531511": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "3464774409833295689": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "6430450975098624706": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11378458002317912396": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9910414853336797922": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "7617773507561261623": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "17546650302679801134": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "4301372734564127254": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10540323786245205242": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "2929690114697368478": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "12554532636938441328": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "8114928396876060694": ["convolution_gpu_bfyx_direct_10_12_16",0], + "4573547058027867538": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "1146282291269334070": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1192709652314183388": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "7552049239568474944": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "10720525166362537653": ["convolution_gpu_bfyx_gemm_like",2], + "3332444589775844154": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "5977248663249062384": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11215862132334892351": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "10362264665270226136": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "761169277744593430": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "6999530153839596796": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "5390559917122707732": ["convolution_gpu_bfyx_os_iyx_osv16",57], + "6823494099194746145": ["convolution_gpu_bfyx_gemm_like",1], + "9141802671320572984": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "8032685176029570383": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11260588538207111217": ["convolution_gpu_bfyx_gemm_like",1], + "11892088065638996743": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13014443130752087867": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "17177353407003831190": ["convolution_gpu_bfyx_gemm_like",2], + "16351593165006175213": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "10254790628108678637": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "2431427502927207912": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "16808618754363181939": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4695273549696315193": ["convolution_gpu_bfyx_gemm_like",2], + "10289725524396556967": ["convolution_gpu_bfyx_gemm_like",2], + "5348059680010171141": ["convolution_gpu_bfyx_os_iyx_osv16",339], + "3125577147662589592": ["convolution_gpu_bfyx_gemm_like",2], + "3400775107143248024": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "9172445047535982729": ["convolution_gpu_bfyx_gemm_like",1], + "7162575953766465459": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "16234606052818596502": ["convolution_gpu_bfyx_os_iyx_osv16",466], + "6796998865297819946": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "7654445730724243959": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "11052732052072367261": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17300963371220857043": ["convolution_gpu_bfyx_direct_10_12_16",1], + "291868903926685441": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "12812685418923919055": ["convolution_gpu_bfyx_os_iyx_osv16",11], + "8079376692609682448": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "14276876004054588508": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2566302789609970663": ["convolution_gpu_bfyx_os_iyx_osv16",58], + "8369833730195120673": ["convolution_gpu_bfyx_gemm_like",2], + "181006047500375768": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "8434794604559592624": ["convolution_gpu_bfyx_gemm_like",1], + "7942294816235384071": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "5553176511624221429": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "17869697579874327192": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18273537339378756543": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "10754450245035836188": ["convolution_gpu_bfyx_gemm_like",2], + "9583760104223104233": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8104715661182291749": ["convolution_gpu_bfyx_gemm_like",1], + "6577240413312348523": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "16440449399643706863": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "16261543808418336089": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "797387385159110695": ["convolution_gpu_bfyx_gemm_like",1], + "4544242784357021697": ["convolution_gpu_bfyx_gemm_like",2], + "2328951328483718941": ["convolution_gpu_bfyx_gemm_like",1], + "5095827462645341808": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14524011013133838054": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "15091825614924466766": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "18194662560696168435": ["convolution_gpu_bfyx_gemm_like",1], + "15329084374930297871": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16044646335477470657": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12854272540346358832": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "13773898185415904435": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "2737738314051715813": ["convolution_gpu_bfyx_gemm_like",2], + "9428176632140441528": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "18150429561058646714": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "4104062066031480003": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "3355824730785179775": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "3069396488274616770": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "7473012539094225392": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "12141300895511301068": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "13497225521878034159": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11878217002671373638": ["convolution_gpu_bfyx_os_iyx_osv16",202], + 
"20037669704517227": ["convolution_gpu_bfyx_direct_10_12_16",0], + "13403617010417893318": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15378707205730840765": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10904228118889057467": ["convolution_gpu_bfyx_gemm_like",2], + "10660722770448981436": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "911927861489659568": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4226968857681929488": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "4648739521905300372": ["convolution_gpu_bfyx_gemm_like",2], + "14223878376624781235": ["convolution_gpu_bfyx_gemm_like",2], + "10131771849139346986": ["fully_connected_gpu_fb_io_ref",1], + "7157531901512507924": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "4894227264080887361": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "15317510501392280831": ["convolution_gpu_bfyx_gemm_like",2], + "3745433390861789238": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "4505008254511324231": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "1671347101986657824": ["convolution_gpu_bfyx_gemm_like",2], + "16201999154635899927": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "2738256633362038820": ["convolution_gpu_bfyx_gemm_like",2], + "7457951266863598199": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "6755802278188792577": ["convolution_gpu_bfyx_gemm_like",2], + "9782864129820122469": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "14034029872538173432": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "16861900412880466222": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "1743572310914695413": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "1056009037551688122": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "33889407315234685": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "16339114929185730551": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8843585527713905568": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "1648021476477101532": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8749399240948437294": ["convolution_gpu_bfyx_gemm_like",2], + "15598570851049411521": ["convolution_gpu_bfyx_gemm_like",2], + "9888097487468905169": ["convolution_gpu_bfyx_gemm_like",2], + "17599383258252980421": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "14071393823183565145": ["convolution_gpu_bfyx_gemm_like",2], + "11936419502418995274": ["convolution_gpu_bfyx_os_iyx_osv16",844], + "14330281759626724494": ["convolution_gpu_bfyx_gemm_like",1], + "8139461711635049443": ["convolution_gpu_bfyx_os_iyx_osv16",610], + "17372326727957287976": ["convolution_gpu_bfyx_gemm_like",2], + "12636120902231094700": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "16396393355098283060": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "18114814167694102037": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "10177466042250039828": ["convolution_gpu_bfyx_gemm_like",2], + "755414184406250882": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "17061233750738578337": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "3654489958995965359": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "12884622643701027202": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "9324602658580246084": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "13762814538289753428": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "3518605747492037670": ["convolution_gpu_bfyx_direct_10_12_16",1], + "877436308867220589": ["convolution_gpu_bfyx_gemm_like",2], + "8430177853357865174": ["convolution_gpu_bfyx_gemm_like",2], + "12988961529988078346": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "13288543822410746011": 
["convolution_gpu_bfyx_gemm_like",1], + "14800933038795670868": ["convolution_gpu_bfyx_os_iyx_osv16",1040], + "6476480727582657308": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "7196214243890296121": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14133509766683767462": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15019050434475217267": ["convolution_gpu_bfyx_gemm_like",1], + "6538694526777067399": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1119928633562250911": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "10979317886451847755": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "15749335301736571135": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6688522645556262131": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "6973224830546378808": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "670951751279091662": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "2542506456395240890": ["convolution_gpu_bfyx_gemm_like",2], + "4745007371868123765": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1810943242998123550": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "7474639594232203854": ["convolution_gpu_bfyx_os_iyx_osv16",739], + "6678101356115372537": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "2040762223425679479": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "2317409971670298599": ["convolution_gpu_bfyx_os_iyx_osv16",877], + "6296118677770264276": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "18072663736237323230": ["convolution_gpu_bfyx_gemm_like",2], + "10002942280571012447": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "6476949395889340429": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "8263822658108674162": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3167115892101501516": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "706370730287471796": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "10744779302034526105": ["convolution_gpu_bfyx_gemm_like",2], + "12010294231983179604": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "8374409021681741916": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "6149673627320838019": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "9503908816088325966": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "14740550583313186369": ["convolution_gpu_bfyx_gemm_like",2], + "10614918790075146626": ["convolution_gpu_bfyx_os_iyx_osv16",194], + "15119063070382146368": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3072535365860940873": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13836645410780461434": ["convolution_gpu_bfyx_gemm_like",2], + "16946947983339327902": ["convolution_gpu_bfyx_gemm_like",2], + "6744583842563891546": ["convolution_gpu_bfyx_gemm_like",1], + "8484526109354576450": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "8528886126454874796": ["convolution_gpu_bfyx_gemm_like",2], + "15471470494305051299": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "15466940145773097237": ["convolution_gpu_bfyx_gemm_like",1], + "12755991236707113150": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "16774186226654475036": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "15430549683839591544": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "9277633677927827724": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "6996679663761370444": ["convolution_gpu_bfyx_gemm_like",0], + "1724898827344855006": ["convolution_gpu_bfyx_gemm_like",2], + "17758354062670710364": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "407189201971322683": ["convolution_gpu_bfyx_os_iyx_osv16",23], + "17620801628577659506": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5776920093461427179": ["convolution_gpu_bfyx_gemm_like",1], + 
"9404953235624894187": ["convolution_gpu_bfyx_os_iyx_osv16",845], + "8858009650512312226": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "2850118175701764737": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "5685381761573686628": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "4942131377140353094": ["convolution_gpu_bfyx_gemm_like",1], + "6214194654733781771": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "15741360654354155504": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "16462033126494826292": ["convolution_gpu_bfyx_gemm_like",1], + "16312223896859176991": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15637565679147396649": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "16442107352245114876": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "17596685300497748803": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15489882561480858974": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "6620782733027313312": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "12558716383635737426": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "16951050796024922417": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11273168411455998347": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "11919129623429545762": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4936968239673204144": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "2469138375598281399": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "1593086572473375988": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "10982382214349160582": ["convolution_gpu_bfyx_gemm_like",2], + "15271492161940795681": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4670443882075998209": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "6772239376357727149": ["convolution_gpu_bfyx_os_iyx_osv16",470], + "17564338309805484464": ["convolution_gpu_bfyx_os_iyx_osv16",898], + "17222005830854879661": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "11077503608116183709": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "8465142022921853516": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "3541828356667081528": ["convolution_gpu_bfyx_gemm_like",2], + "13283842370311517843": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "14306044182355683449": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17788367809717898285": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8312903198090907576": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "6997121306455110286": ["convolution_gpu_bfyx_direct_10_12_16",1], + "18305785425659656349": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "6579950270997373448": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "5828768432282043413": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8483234129545181544": ["convolution_gpu_bfyx_gemm_like",2], + "2946518372087114752": ["convolution_gpu_bfyx_gemm_like",2], + "9954050478761346921": ["convolution_gpu_bfyx_gemm_like",2], + "15939740070666326125": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "13663893159182636270": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "15154934905173371714": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "4003468969524607815": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "13816748148836642416": ["convolution_gpu_bfyx_gemm_like",0], + "11047625525388102466": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "18178391985193947355": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "11291881629276762730": ["convolution_gpu_bfyx_gemm_like",1], + "4678945085654662665": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "1655427025346068673": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8382355932367801226": ["convolution_gpu_bfyx_os_iyx_osv16",277], + 
"1836277956961261472": ["convolution_gpu_bfyx_gemm_like",2], + "427362429809315581": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3880189981766119529": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "625469553102754234": ["convolution_gpu_bfyx_gemm_like",2], + "5267143428977695208": ["convolution_gpu_bfyx_gemm_like",1], + "9724624621108712962": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "15532419087060587119": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "13091799752362714688": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "2095802691829304676": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "15485011864326008444": ["fully_connected_gpu_fb_io_ref",2], + "1370827524176794227": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "11062005455602919062": ["convolution_gpu_bfyx_gemm_like",2], + "15494543914974994991": ["convolution_gpu_bfyx_gemm_like",1], + "4750755523645265967": ["convolution_gpu_bfyx_gemm_like",1], + "2694529308199677811": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "3723613341885592267": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "5801429077171542466": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "1081287304647703427": ["convolution_gpu_bfyx_gemm_like",2], + "11783851440679657276": ["convolution_gpu_bfyx_gemm_like",2], + "10127598593949337541": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "1071090704302849258": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "7363788553442810299": ["convolution_gpu_bfyx_gemm_like",2], + "3497946462254198388": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "6621483425195088869": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "14503814672536990561": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "6794427012971589670": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "11307531462784240962": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5032866547826271476": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "1698321314111848001": ["convolution_gpu_bfyx_os_iyx_osv16",714], + "5246229312484886433": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5670530004773188380": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4827354455626446376": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "17149185480630228380": ["convolution_gpu_bfyx_gemm_like",2], + "7866867237563799289": ["convolution_gpu_yxfb_yxio_b16",0], + "13395562320893799513": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "13041981853634484809": ["convolution_gpu_bfyx_gemm_like",2], + "16173557782125372935": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2477866283402053371": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "4286652913945761799": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "4112696777811320312": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "11022847760121601465": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "15114370307779942381": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "16749148369456398030": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "11091771531609585709": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15409755591665753258": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "1168589063110524328": ["convolution_gpu_bfyx_os_iyx_osv16",1048], + "17251021943762069083": ["convolution_gpu_bfyx_gemm_like",1], + "104321144590863458": ["convolution_gpu_bfyx_gemm_like",1], + "8916983923551808409": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "10068872968385049754": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "12811104880512633036": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "9352385417006844121": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "16818714747882774917": 
["convolution_gpu_bfyx_gemm_like",2], + "13592532173351964111": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12120302918788959150": ["convolution_gpu_bfyx_gemm_like",2], + "15470013032930986062": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "7590767013583950613": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "8202626341817892707": ["convolution_gpu_bfyx_gemm_like",0], + "2966185891283165994": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "9759380701896779097": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7781809277449433812": ["convolution_gpu_bfyx_gemm_like",2], + "5221320470007950766": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "191374388179598660": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "10652512666086843369": ["convolution_gpu_bfyx_gemm_like",2], + "16843976559933040107": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "16609136488331186895": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "5415319660821122528": ["fully_connected_gpu_bf_io_input_spatial",1], + "6673753637296082820": ["convolution_gpu_bfyx_gemm_like",2], + "15823825508128158158": ["convolution_gpu_bfyx_gemm_like",2], + "8431845338648284548": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "12175297963550750804": ["convolution_gpu_bfyx_os_iyx_osv16",894], + "12312934163571823042": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "16208488491972128275": ["convolution_gpu_bfyx_os_iyx_osv16",809], + "13432509006553485205": ["convolution_gpu_bfyx_gemm_like",2], + "7989188632557972153": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17079309368548171402": ["convolution_gpu_bfyx_gemm_like",1], + "10971070835319242371": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "3392632422002516166": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "5313528120127506058": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1784095455470808903": ["convolution_gpu_bfyx_gemm_like",2], + "2110090486638190463": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "14257161696605459633": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "11933283931932057859": ["convolution_gpu_bfyx_gemm_like",1], + "15585700465988560560": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "6235096928786525260": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "9389555743403158574": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11648841195768568983": ["convolution_gpu_bfyx_gemm_like",1], + "17522452942286240233": ["convolution_gpu_bfyx_gemm_like",2], + "10058165874008941852": ["convolution_gpu_bfyx_os_iyx_osv16",948], + "5627351109775149477": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "16167185344265573939": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "11521288355888665606": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "17555040035075346152": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "7331552952865138030": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14902389080201926109": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8158983334404475382": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7127306913758514626": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "17065380294456704620": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "5331173521406046122": ["convolution_gpu_bfyx_os_iyx_osv16",641], + "3107611675766875160": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2821441037530057414": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "3633858263279042265": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "14359530849521980269": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "8550783999616052522": ["convolution_gpu_bfyx_gemm_like",2], + "16627410412068117729": ["convolution_gpu_bfyx_1x1",2], + 
"15210302033167762581": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "3861084063403560668": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "11443268857010762276": ["convolution_gpu_bfyx_os_iyx_osv16",235], + "10849780273184392468": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "17365039759826870533": ["convolution_gpu_bfyx_os_iyx_osv16",1070], + "8453402620168400406": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "15394217414267195999": ["convolution_gpu_bfyx_os_iyx_osv16",386], + "4059085986365258440": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "4135003545872878882": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "9968686603153440164": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "11919579121199894437": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "8104331313502492541": ["convolution_gpu_bfyx_gemm_like",2], + "2679903779216253668": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "6564126728704461285": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "15018685799485128700": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "16320454719906370247": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "13702692566238948173": ["convolution_gpu_bfyx_gemm_like",1], + "9626028243479089234": ["convolution_gpu_bfyx_gemm_like",2], + "4864384537857484286": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "6427724955844538652": ["convolution_gpu_bfyx_os_iyx_osv16",806], + "5000147505578625898": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "11243840588602365090": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "148355059345569721": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "1471837664358450291": ["convolution_gpu_bfyx_gemm_like",2], + "778175413671462719": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "2213990183618003353": ["convolution_gpu_bfyx_os_iyx_osv16",311], + "6724516766412732606": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12018506264719915873": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "17959539037614502049": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12790788016297794214": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "14217181622713951411": ["convolution_gpu_bfyx_gemm_like",2], + "2984236836610169934": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "13020929028222837402": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "393130776826919699": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3934290309368153435": ["fully_connected_gpu_bf_io_gemm",2], + "2571186327837339204": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "3574733745204419723": ["convolution_gpu_bfyx_gemm_like",2], + "14363654136811880073": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "10386584706491193379": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "4161141078006269526": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3488828327160968117": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "14668529234172928874": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1190134214210434381": ["convolution_gpu_bfyx_gemm_like",1], + "15241636061003642501": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "4620230702710590164": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "8058623285594809047": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "3621449131285713809": ["convolution_gpu_bfyx_gemm_like",0], + "3435773540391994106": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "6830387121684699972": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "11934033658708880765": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "10000618285883395700": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "10841786394951910408": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "15670841106242481912": 
["convolution_gpu_bfyx_os_iyx_osv16",334], + "16881320590336043120": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16426179645101678763": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "8069865332677721685": ["convolution_gpu_bfyx_gemm_like",1], + "2597453794298356435": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "14706510405720911492": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "11994423635588727210": ["convolution_gpu_bfyx_gemm_like",2], + "13200151444914751729": ["convolution_gpu_bfyx_os_iyx_osv16",547], + "11955762239379054277": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "1691554843141984381": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "7203566080268546556": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "9101018613418825655": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "12077176094606956613": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3709364270141803019": ["convolution_gpu_yxfb_yxio_b16",2], + "10995907213890714701": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "7385225716957197459": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10136297272678091418": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "3296080624478711270": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6236857636305802170": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "15534517308430424624": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7518734167761579102": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "5951228846460391670": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "5301440603380967612": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "12160764253455777655": ["convolution_gpu_bfyx_gemm_like",2], + "956022649859563080": ["convolution_gpu_bfyx_gemm_like",1], + "6612643056203714506": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "10944997349682267106": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "173772845058977237": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "4198666727524342442": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "2571882179292959757": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3122997634505472500": ["convolution_gpu_bfyx_os_iyx_osv16",987], + "16370218798911151331": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "16892873598489732462": ["convolution_gpu_bfyx_gemm_like",0], + "5849577829817109757": ["convolution_gpu_bfyx_os_iyx_osv16",723], + "12983461576274227638": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "15705195224249560587": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10670104149348964875": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "8394337033015371278": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "1898912620350738645": ["convolution_gpu_bfyx_gemm_like",2], + "13933912937625580405": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "14852990574796128305": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "2571778193407799664": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8672860483905060438": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "2124776616364429517": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "17832542092610191859": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "118898027441804310": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "6172851296465788161": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "6696330836969622824": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "11926378988530133568": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "16398511553605808939": ["convolution_gpu_bfyx_gemm_like",2], + "4510003738155830628": ["convolution_gpu_bfyx_gemm_like",1], + "6706491729783125139": ["convolution_gpu_bfyx_gemm_like",1], + "14212924711992025243": 
["convolution_gpu_bfyx_os_iyx_osv16",907], + "9034951536385533818": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "6066347819693426556": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "12153119102645240327": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10264913782610095832": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "16381344499660251151": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "17878271352732707544": ["convolution_gpu_bfyx_gemm_like",2], + "12617625046664709483": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "829667328391742224": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "2079353700062014100": ["fully_connected_gpu_fb_io_block_fp16",2], + "5994204139128667921": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "8454760437961964894": ["convolution_gpu_bfyx_gemm_like",2], + "1397214434971745171": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "8378690770140438511": ["convolution_gpu_bfyx_os_iyx_osv16",58], + "1200058627526593421": ["convolution_gpu_bfyx_gemm_like",2], + "15702382940521972117": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "14547907449418439737": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "6635217802203685464": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "14417033368952865805": ["convolution_gpu_bfyx_gemm_like",1], + "17928043901784474130": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13381833588713493653": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "9644723852089512961": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "1013207188944763398": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11031625790234068916": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "16758962840329202004": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16872172036344096583": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "5124291229936820926": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "1233962450359295141": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "12602356791053445447": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16469788155263456039": ["convolution_gpu_bfyx_gemm_like",2], + "8207349115037232863": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "16434635675895599016": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "3362829461757548683": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "5163965164859517893": ["convolution_gpu_bfyx_gemm_like",2], + "11641605357868918146": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4674296632914491946": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "3320392060021963536": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "10747768416582634270": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "13404888565084206853": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "8025053805734757314": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17350963651826443169": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8255732638278792698": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14256842018830898376": ["convolution_gpu_bfyx_os_iyx_osv16",376], + "12954154886708228545": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "707979507145930311": ["convolution_gpu_bfyx_gemm_like",2], + "14880517974968280393": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9840495023131952174": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3359547327521773367": ["convolution_gpu_bfyx_gemm_like",2], + "3272017687600371031": ["convolution_gpu_bfyx_gemm_like",2], + "332090597573908506": ["convolution_gpu_bfyx_gemm_like",1], + "4134729533276761488": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "4483155585853926891": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "11493371521058673700": 
["convolution_gpu_bfyx_os_iyx_osv16",251], + "8717393423378690149": ["convolution_gpu_bfyx_os_iyx_osv16",269], + "7937517564893685647": ["convolution_gpu_bfyx_os_iyx_osv16",1124], + "11141999085710526242": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "5748047690737461635": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "12136029303893296753": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "4285475880886685878": ["convolution_gpu_bfyx_gemm_like",2], + "14150012830816329527": ["convolution_gpu_bfyx_gemm_like",2], + "4299492266819967844": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "11314436000791223218": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "13073788277284969422": ["convolution_gpu_bfyx_gemm_like",1], + "6522974911083412812": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "9599099244072080863": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "6574971185849732667": ["convolution_gpu_bfyx_os_iyx_osv16",304], + "9462315044265139531": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "3703292222363446463": ["convolution_gpu_bfyx_os_iyx_osv16",286], + "16728826595086368897": ["convolution_gpu_bfyx_os_iyx_osv16",737], + "11311890411536750673": ["convolution_gpu_bfyx_gemm_like",2], + "9574931298183748343": ["convolution_gpu_bfyx_gemm_like",2], + "17604747523124060652": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "11571049833132558023": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "12590495767805868405": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "12087141795291232248": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "2370837049876630969": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "10800323158234163234": ["fully_connected_gpu_fb_oi_ref",2], + "1208483520611545642": ["convolution_gpu_bfyx_gemm_like",2], + "4200340674281276565": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "5781431860747226742": ["convolution_gpu_bfyx_gemm_like",1], + "2588106330058954614": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "18077281411861416889": ["convolution_gpu_bfyx_gemm_like",1], + "15887484617041779814": ["convolution_gpu_bfyx_gemm_like",2], + "8954957191824520301": ["convolution_gpu_bfyx_gemm_like",2], + "8500612796090968552": ["convolution_gpu_bfyx_gemm_like",1], + "11740474593275702888": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3141554560840195766": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "10250778203413648582": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "10712251675747436685": ["convolution_gpu_bfyx_gemm_like",2], + "18243018097656671503": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "9529614587861271730": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16495435651959280198": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "6849874726361751307": ["convolution_gpu_bfyx_gemm_like",2], + "16937207522545573792": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15277856047844308598": ["convolution_gpu_bfyx_gemm_like",2], + "7145194061073256844": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "10683462376964742177": ["convolution_gpu_bfyx_1x1",2], + "14561847633011875566": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "18137106379929135901": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "5911574919905523294": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "14686278683380845546": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "9625931001541723278": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "759904421452233375": ["convolution_gpu_bfyx_gemm_like",0], + "2656031443043933969": ["convolution_gpu_bfyx_gemm_like",2], + "8484176982872847423": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + 
"17260550967427796490": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "9048522050692986204": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "2841943277631596989": ["convolution_gpu_bfyx_gemm_like",2], + "16295742665642026049": ["convolution_gpu_bfyx_gemm_like",0], + "3599823735065658574": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "18196676408993954972": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17154337492545826355": ["convolution_gpu_bfyx_os_iyx_osv16",1001], + "730498656295487620": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "981197653890885407": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15334769670416409064": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "11771014003680394135": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14802650433258854647": ["convolution_gpu_bfyx_gemm_like",2], + "13248218293365141596": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3419335618146360217": ["convolution_gpu_bfyx_gemm_like",2], + "16011429608661242565": ["convolution_gpu_bfyx_gemm_like",2], + "9700592037514669700": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "10721811813682112908": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "12012860334670244716": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "5169676188205309169": ["convolution_gpu_bfyx_gemm_like",2], + "7041670015280138712": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15696864960068112631": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "4332002982390788477": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "16113302464937833403": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "5425221744593278983": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "15984373369388044924": ["convolution_gpu_bfyx_gemm_like",2], + "7177837234452118325": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "8976474887968287066": ["convolution_gpu_bfyx_gemm_like",1], + "17585852525746136080": ["convolution_gpu_bfyx_direct_10_12_16",0], + "15124985846197662243": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14057348639391787117": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "1771663698943903325": ["convolution_gpu_bfyx_gemm_like",2], + "4878084041222897879": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "17795554443343871443": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "7732899312577293959": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "9750510172185801133": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "16833854122884184025": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "9257078583742821465": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "10994887986667360638": ["convolution_gpu_bfyx_gemm_like",2], + "13320828013530046693": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "1938086876393565238": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "3807725810350819929": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1118106412799660613": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1898776014554946000": ["convolution_gpu_bfyx_gemm_like",2], + "16461300997058854554": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "2354885756165078342": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "6948147789605707774": ["fully_connected_gpu_fb_io_ref",1], + "16847817828600381030": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "10714306166715959794": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "9212091835906796243": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "2343921093633784755": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "6942049339361951275": ["fully_connected_gpu_bf_io_input_spatial",2], + "17356122476662104613": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "12478914547444399288": 
["convolution_gpu_bfyx_direct_10_12_16",1], + "13395074742046717601": ["convolution_gpu_bfyx_os_iyx_osv16",878], + "8451179695288093195": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "15356995665520295246": ["convolution_gpu_bfyx_gemm_like",1], + "13283018618260255620": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "12136625628940225638": ["convolution_gpu_bfyx_gemm_like",2], + "15461879919099373703": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "13031027103925431505": ["convolution_gpu_bfyx_gemm_like",0], + "13387804712929042302": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "30229601562833524": ["convolution_gpu_bfyx_gemm_like",2], + "2999825793036702585": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "593712935037568960": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "14558850297291634005": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "5180223624868784700": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "9353412605649860251": ["convolution_gpu_bfyx_gemm_like",2], + "13131740479277027362": ["fully_connected_gpu_bf_io_gemm",2], + "2231648183489019418": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "11612908466465510939": ["convolution_gpu_bfyx_gemm_like",2], + "10076578838853982233": ["convolution_gpu_bfyx_gemm_like",1], + "17947097500350250352": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "14805212478405698245": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "5896089609470353090": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12752101288912456176": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "9524663472084054050": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11684927349056930189": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3404911902272307873": ["convolution_gpu_bfyx_gemm_like",2], + "10730856574108806045": ["convolution_gpu_bfyx_os_iyx_osv16",99], + "13809218391763818477": ["convolution_gpu_bfyx_gemm_like",2], + "4237276338897143680": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "8708323717539569536": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "6717268005860715462": ["convolution_gpu_bfyx_gemm_like",2], + "8700953648388124963": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13572134043095673708": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "14459249705747952583": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "17608082492919905570": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "5351705572686943348": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "8347537383976709519": ["convolution_gpu_bfyx_os_iyx_osv16",48], + "6604223938357238686": ["convolution_gpu_bfyx_direct_10_12_16",1], + "389822325870173489": ["convolution_gpu_bfyx_gemm_like",2], + "10292243973236220688": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "14083279273292567319": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "5326247361632903583": ["convolution_gpu_bfyx_gemm_like",2], + "1701609125136907870": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13683797097980916261": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "6995472847770703647": ["convolution_gpu_bfyx_gemm_like",2], + "12069726772532946193": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "6735135795253013220": ["convolution_gpu_bfyx_gemm_like",1], + "9340606088243696490": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "11541706477255587105": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "14277432520333139165": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "4680261350523889008": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2753702428731469792": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "12415368596357091523": ["convolution_gpu_bfyx_os_iyx_osv16",590], + 
"8319405652132127420": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "11215766166462244180": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "13881505737488515065": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "3113016029551460773": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "8336494030011542852": ["convolution_gpu_bfyx_gemm_like",2], + "18439017855540532958": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "2945414822360653904": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14385185911482960528": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "3106710091841093202": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "414342067295883061": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "7289940394271052757": ["convolution_gpu_bfyx_gemm_like",1], + "6148022455516485135": ["convolution_gpu_bfyx_gemm_like",2], + "2481473548445286504": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "8509748651922589684": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "15230961192722285950": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14514450640485628836": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12878631058803628679": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5061053593616346116": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "17393241435373906917": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "15385506288692289568": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10918743320372308981": ["convolution_gpu_bfyx_gemm_like",1], + "11661208196482963286": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "16615858951735101760": ["fully_connected_gpu_fb_oi_ref",2], + "6603817696964851209": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "5507708258753405429": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "14445520478857662586": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "787363431787954804": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "5297273225749803700": ["convolution_gpu_bfyx_gemm_like",0], + "11195875185591819437": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "13810716860158972470": ["convolution_gpu_bfyx_os_iyx_osv16",928], + "12942085219027232135": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "12529210672030682764": ["convolution_gpu_bfyx_gemm_like",1], + "8176520928011006903": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10613156984920928792": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9105431502075531641": ["convolution_gpu_bfyx_gemm_like",2], + "2220961811760955456": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9105949910901552052": ["convolution_gpu_bfyx_gemm_like",1], + "9116206094279111365": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "810244829776621501": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "10270203686708782941": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "15479071839425218367": ["convolution_gpu_bfyx_gemm_like",2], + "17835592722977214177": ["convolution_gpu_bfyx_gemm_like",2], + "13994738382469480124": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "11915835787294686201": ["fully_connected_gpu_fb_io_ref",2], + "13535031376667778809": ["convolution_gpu_bfyx_gemm_like",1], + "15905812449037427213": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "11992158790035075804": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "6748628505489041229": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "14274685812676150168": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "15222823942088272038": ["convolution_gpu_bfyx_gemm_like",2], + "8337820318779061494": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "8844619836383523698": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "12440883214879663043": 
["convolution_gpu_bfyx_os_iyx_osv16",1102], + "10887835418423052188": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "822162932339827810": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "676641023579624117": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "17303584953298149285": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "10191980053492569024": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "16125365972873290572": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13816380312874384117": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10340626080611300806": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "16763335832616216769": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7876355212013100281": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "14579042972443651846": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "16917495876041966553": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6489074577147494118": ["convolution_gpu_bfyx_gemm_like",1], + "761984225415608773": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "621927597604688551": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14074914477149374595": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "2297846338452062425": ["convolution_gpu_bfyx_gemm_like",2], + "14234254258925470171": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2768512766772748723": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "9758033083211570158": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11802527991096689252": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "1919460437053604108": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "8132521728369930959": ["convolution_gpu_bfyx_gemm_like",2], + "12952980509662451384": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "11198378813600875939": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "12301464827222654105": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "12565318283493666631": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "4370027682980493159": ["convolution_gpu_bfyx_gemm_like",1], + "14880029436467076847": ["convolution_gpu_yxfb_yxio_b16",2], + "14555366228958374512": ["convolution_gpu_bfyx_os_iyx_osv16",892], + "4232250144427804891": ["fully_connected_gpu_bf_io_input_spatial",2], + "774981050284188673": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "4004333174619528327": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2567809041240246707": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "14013561425708390846": ["convolution_gpu_bfyx_gemm_like",2], + "15687441275464931484": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6988674007771237080": ["convolution_gpu_bfyx_gemm_like",1], + "2590143768280076032": ["convolution_gpu_bfyx_gemm_like",2], + "6656593119788274992": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "726898338396698172": ["convolution_gpu_bfyx_gemm_like",2], + "2307629242354292362": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "4536811685836767511": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14122647818827599984": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "10394041365384258612": ["convolution_gpu_bfyx_gemm_like",2], + "11882021989615795558": ["convolution_gpu_bfyx_os_iyx_osv16",378], + "14786800939708939361": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "4959718589070770515": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "18136968124686255108": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "7630342538679060038": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "2242915551775617989": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "17614929666625976544": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15911644545988936270": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "11734299455885510243": ["convolution_gpu_bfyx_os_iyx_osv16",663], + "1706927777850488363": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "4334698056820320220": ["convolution_gpu_bfyx_gemm_like",1], + "10420516636613025222": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "13558603350852076889": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "10492056481694320580": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "4276712095427918904": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "11857822504978122919": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "13025323039227543550": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "15198419554644505600": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "7536287105029319189": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "5698743977411325127": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "16134637021630473012": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "2098357709530580176": ["convolution_gpu_bfyx_gemm_like",1], + "1779941298820543013": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "13348855287761849180": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "10011668671963948912": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "17523210737277743952": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "8065408380801722040": ["convolution_gpu_bfyx_gemm_like",1], + "140463250258747810": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8394085742794617896": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "1400089266180918877": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8725673763972618034": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "3828569468687251275": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17318287523550546026": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13498795599230228492": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "9061025737181218101": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "4236174000795439083": ["convolution_gpu_bfyx_gemm_like",2], + "4445257000541366640": ["convolution_gpu_bfyx_os_iyx_osv16",805], + "3126316723202463622": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "17264554677210911187": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "12214162812589030126": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "859377216693940737": ["convolution_gpu_bfyx_gemm_like",2], + "1742897526168249500": ["convolution_gpu_bfyx_gemm_like",2], + "17399542571019639128": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "16711955423531846725": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "10186942318345695432": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "4010650902230520983": ["convolution_gpu_bfyx_gemm_like",1], + "3324979924867461126": ["convolution_gpu_bfyx_gemm_like",0], + "13083981648347252910": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "16159852373972174245": ["convolution_gpu_bfyx_gemm_like",1], + "9226912483632588371": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16794102497779310636": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "6581494673640781863": ["convolution_gpu_bfyx_direct_10_12_16",0], + "7517800202981394755": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "12631385844456089132": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6638154580507569953": ["convolution_gpu_yxfb_yxio_b16",2], + "7557439160429040689": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "8069829594586311016": ["convolution_gpu_bfyx_gemm_like",2], + "3281411665507625899": ["convolution_gpu_bfyx_gemm_like",2], + "9368244029111057323": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15692223101958737604": 
["convolution_gpu_bfyx_gemm_like",2], + "6893451271566946459": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "6370629727707634189": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "6873973504717201270": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "12164298124869114517": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "16043683538361975370": ["convolution_gpu_bfyx_gemm_like",2], + "3388752887767453958": ["convolution_gpu_bfyx_gemm_like",2], + "15121608487896365221": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "4319047524534407016": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15677062663215157168": ["convolution_gpu_bfyx_gemm_like",2], + "498221230041656321": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "460346381952024719": ["convolution_gpu_bfyx_gemm_like",2], + "13267438341255312172": ["convolution_gpu_bfyx_gemm_like",2], + "6053594232298534345": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "4750897775273897282": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "18204971481718743856": ["convolution_gpu_bfyx_gemm_like",2], + "5629670679897666607": ["convolution_gpu_bfyx_os_iyx_osv16",1028], + "18096803908321982720": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "141166664952282933": ["convolution_gpu_bfyx_gemm_like",2], + "13506060627438652817": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "3835286851569826052": ["convolution_gpu_bfyx_gemm_like",2], + "15616954046484566002": ["convolution_gpu_bfyx_gemm_like",1], + "264371219192743152": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "15258215535586455016": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1604661321386793876": ["convolution_gpu_winograd_6x3_s1_fused",2], + "13374993751390784382": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "11814740669468421049": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "15262493122847269333": ["convolution_gpu_bfyx_gemm_like",2], + "18265020664540913473": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12602193792076781600": ["convolution_gpu_bfyx_gemm_like",2], + "9700098364581157575": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7945923871349397386": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "6453222793515233963": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2446257282140830646": ["convolution_gpu_bfyx_gemm_like",2], + "7009735776703529573": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "8951503172834790833": ["convolution_gpu_bfyx_gemm_like",2], + "11986642867827682648": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2999633429402781278": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "9357359875134299131": ["convolution_gpu_bfyx_gemm_like",1], + "18400379759523099542": ["convolution_gpu_bfyx_gemm_like",2], + "8075453526439606224": ["convolution_gpu_bfyx_gemm_like",2], + "14403780921831769097": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "16605697831520435304": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "4024491643929554510": ["convolution_gpu_bfyx_gemm_like",2], + "16815373779430857324": ["convolution_gpu_bfyx_gemm_like",1], + "16541535256432192398": ["convolution_gpu_bfyx_gemm_like",2], + "1194267934213722567": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "10071611039987219440": ["convolution_gpu_bfyx_gemm_like",1], + "7227174766917523481": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "15884763176333003771": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "7457899998356343871": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "12808154347573074859": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "12412224630798427948": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "7058458405375602606": 
["convolution_gpu_bfyx_os_iyx_osv16",367], + "13186342942242476803": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "15747538142554815480": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "1350953652678789564": ["convolution_gpu_bfyx_os_iyx_osv16",271], + "15783558375979538895": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5873257164958285393": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "11232261979256657934": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "8997817508830449863": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "1003101267609305257": ["convolution_gpu_bfyx_gemm_like",2], + "7017157908391870084": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "1896394898744191046": ["convolution_gpu_bfyx_gemm_like",2], + "16865271154583564899": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "7605139219344415117": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "14224121742920800990": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "9416186718345824095": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "72444706264681262": ["convolution_gpu_bfyx_gemm_like",2], + "13139953964389811410": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6949539207944972855": ["convolution_gpu_bfyx_gemm_like",2], + "11602830611894444581": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8511244943596227719": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15220874718853723626": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "10093371683053539916": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "15671873744670386067": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "8576229375621297412": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "8071957466247137919": ["convolution_gpu_bfyx_os_iyx_osv16",616], + "3737552767159920174": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11956435900037329302": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "2686152083115758704": ["convolution_gpu_bfyx_gemm_like",1], + "18415227597391874233": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "7650874310714729923": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "17508515605648584094": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "11665313746896806563": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11275526584835606578": ["convolution_gpu_bfyx_gemm_like",1], + "13150876648527896999": ["convolution_gpu_bfyx_gemm_like",1], + "17767784103977797843": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15618891972122000521": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4049276089777687996": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "1451466106918423837": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "2772149704821395618": ["convolution_gpu_bfyx_direct_10_12_16",2], + "435888248913413834": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "3067806959725855130": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "2069311169819696343": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "45545661884854912": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "6227066883925046010": ["convolution_gpu_bfyx_gemm_like",2], + "5277400567128489977": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "6711878663358611849": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "15387047026300787039": ["convolution_gpu_bfyx_gemm_like",2], + "5567670507334783760": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9569446666675696513": ["convolution_gpu_bfyx_gemm_like",1], + "5509631031571317557": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "5384134329664434112": ["convolution_gpu_bfyx_os_iyx_osv16",318], + "1617993599154234262": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "13951717514084457087": 
["convolution_gpu_bfyx_os_iyx_osv16",696], + "157805434489791310": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "14585144905582599299": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "16695020005258780885": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "12038525298168664305": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9654726486719966937": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10942743767167283370": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17243953172314194409": ["convolution_gpu_bfyx_os_iyx_osv16",482], + "7263339400190408379": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "10670103699537731664": ["convolution_gpu_bfyx_os_iyx_osv16",235], + "4241640917176830862": ["convolution_gpu_bfyx_gemm_like",2], + "8770858724416759637": ["convolution_gpu_bfyx_gemm_like",2], + "2622434279674583815": ["convolution_gpu_bfyx_gemm_like",1], + "5592428580503282095": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "4035015193331696438": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "5061795324735006354": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "8623022306922454565": ["convolution_gpu_bfyx_gemm_like",1], + "6542417269641204414": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "16021335552443492452": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "14947798627499698329": ["convolution_gpu_bfyx_gemm_like",2], + "2602811890459789252": ["convolution_gpu_bfyx_gemm_like",2], + "7963529808900784906": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "10468108569766167175": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "14553856088069405595": ["convolution_gpu_bfyx_os_iyx_osv16",360], + "3159681096461848644": ["convolution_gpu_bfyx_os_iyx_osv16",311], + "15479549936562568596": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "1467428583618467133": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "10306542963828398049": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "13553045975561262752": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "4481903208484313806": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "12744887771237881196": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "13485140643204970345": ["convolution_gpu_bfyx_gemm_like",2], + "12693511427898130707": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18213389163198755626": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "15129834325410878425": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2574815123023594315": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "759816003617478606": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "3051823462382231650": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "7107513718824525169": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6407471972820516685": ["fully_connected_gpu_fb_oi_ref",2], + "12545558125736154584": ["convolution_gpu_bfyx_os_iyx_osv16",571], + "16774728502960825097": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "8942942026369874093": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "4916569245937189632": ["convolution_gpu_bfyx_gemm_like",2], + "3006428377575478529": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "8833400244933346226": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "2764034841399585177": ["fully_connected_gpu_fb_oi_ref",1], + "4294879469633231552": ["convolution_gpu_bfyx_gemm_like",2], + "65349392124461285": ["convolution_gpu_bfyx_gemm_like",2], + "1426606766274640878": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "16236397968499692493": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "14313201046801286869": ["convolution_gpu_bfyx_gemm_like",2], + "6801897580177846120": ["convolution_gpu_bfyx_os_iyx_osv16",656], + 
"10556089809203693400": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "9497269191159495932": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "11253790393313445931": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8021962180961047152": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "10208132281050693649": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "15997754881872769378": ["convolution_gpu_bfyx_gemm_like",2], + "10344489318472060767": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "15938703221521364046": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "4147006350295905486": ["convolution_gpu_bfyx_os_iyx_osv16",729], + "3664562521273273709": ["convolution_gpu_bfyx_os_iyx_osv16",208 + ] + }, + "48": { + "883436333317162926": ["convolution_gpu_bfyx_1x1",0], + "4232250144427804891": ["fully_connected_gpu_bf_io_input_spatial",2], + "8787816339967963727": ["convolution_gpu_bfyx_os_iyx_osv16",997], + "150132162949295379": ["convolution_gpu_bfyx_1x1",2], + "6620782733027313312": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "6603778920476932267": ["convolution_gpu_bfyx_direct_10_12_16",1], + "190530884420224257": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "4466647043226271996": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6980201892073961793": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "15661322183507404821": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "12014527187730671229": ["convolution_gpu_bfyx_os_iyx_osv16",892], + "3202085450628781999": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "724953082687879224": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "2379484884827231127": ["fully_connected_gpu_bf_io_input_spatial",1], + "4013707396889204359": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "6678796313875454849": ["convolution_gpu_bfyx_gemm_like",2], + "10893432143734884603": ["convolution_gpu_bfyx_gemm_like",2], + "13753473508578037346": ["convolution_gpu_bfyx_os_iyx_osv16",527], + "12141300895511301068": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "12165079289914715018": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "18356980026934328781": ["convolution_gpu_bfyx_os_iyx_osv16",1039], + "5629373398445592781": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "878892264408839067": ["convolution_gpu_bfyx_os_iyx_osv16",644], + "9462315044265139531": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9213563311267466388": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2968094709908141988": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "10005177465075197768": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "11709992724966310174": ["convolution_gpu_bfyx_os_iyx_osv16",125], + "4238885454989272754": ["convolution_gpu_bfyx_os_iyx_osv16",340], + "17370158297470557151": ["convolution_gpu_bfyx_1x1",2], + "13324157125165576832": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "6324565723045697080": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "16758697697363920520": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "10890975553758439233": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "12581879452540858313": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "13878967140838761911": ["convolution_gpu_bfyx_1x1",2], + "12214162812589030126": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "10135458965276110244": ["convolution_gpu_bfyx_1x1",2], + "9714764457768279762": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3860603464276263676": ["convolution_gpu_bfyx_gemm_like",2], + "15101680837342453931": ["convolution_gpu_bfyx_os_iyx_osv16",19], + "9404677451270692749": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "9522661528867955338": 
["convolution_gpu_bfyx_os_iyx_osv16",705], + "1103204698908514224": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "18118237182023167949": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "15908673392788376468": ["convolution_gpu_bfyx_os_iyx_osv16",642], + "15643135666029727865": ["convolution_gpu_bfyx_gemm_like",2], + "5287076386757143976": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13455881643467418059": ["convolution_gpu_bfyx_gemm_like",2], + "4642234334824303290": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "7575634241190730697": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13464226348405628455": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "9316082753126682958": ["convolution_gpu_bfyx_gemm_like",2], + "8856888761246057127": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "14766477690417085350": ["convolution_gpu_bfyx_1x1",2], + "1711220333751274603": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "13379165253894817165": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "11649407835105973949": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "16773645387243701837": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "8859895010324601937": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "1632416005093914709": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "5440983284868981549": ["convolution_gpu_bfyx_gemm_like",0], + "9947449295659685973": ["convolution_gpu_bfyx_gemm_like",2], + "6090625728451718945": ["convolution_gpu_winograd_6x3_s1_fused",2], + "1212319037405620223": ["convolution_gpu_bfyx_gemm_like",2], + "8540111719936129376": ["convolution_gpu_bfyx_os_iyx_osv16",691], + "12293786134765875615": ["convolution_gpu_bfyx_os_iyx_osv16",871], + "3325727286860556323": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "11450378244355788918": ["convolution_gpu_bfyx_os_iyx_osv16",716], + "3177304125602972370": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "5295693108687178880": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17152614235879767116": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "15943141845766932879": ["convolution_gpu_bfyx_1x1",2], + "17225552472711821360": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "2732519635571994212": ["convolution_gpu_bfyx_gemm_like",2], + "7649413902932043811": ["convolution_gpu_bfyx_gemm_like",1], + "6109013751635776331": ["convolution_gpu_bfyx_gemm_like",2], + "16582132711225619740": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "3039528482572243879": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "1889171157980977747": ["convolution_gpu_bfyx_gemm_like",2], + "11893541520830049036": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "13558618754911056302": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "5349415632630235233": ["convolution_gpu_bfyx_1x1",2], + "11931568365395665142": ["convolution_gpu_bfyx_gemm_like",2], + "7375461241315602473": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "8567667881970262923": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "5779388310240896974": ["convolution_gpu_bfyx_os_iyx_osv16",456], + "7806129039150321333": ["convolution_gpu_bfyx_gemm_like",2], + "9222744127882324405": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "6003409324516527726": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "3673781117412048086": ["convolution_gpu_bfyx_os_iyx_osv16",1000], + "14916625550370402883": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "14031009077471784948": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "12935563359569230797": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "1626430741965136732": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "5331173521406046122": 
["convolution_gpu_bfyx_os_iyx_osv16",326], + "17948637243158994878": ["convolution_gpu_bfyx_gemm_like",2], + "7545013298074733778": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "15884763176333003771": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "17522452942286240233": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "8048617952947915835": ["convolution_gpu_bfyx_gemm_like",2], + "4892959859293355837": ["convolution_gpu_bfyx_gemm_like",1], + "1954052357826969119": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "12782932626966309185": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "16661843849495077745": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "9987415314864002460": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13762042713029963144": ["convolution_gpu_bfyx_os_iyx_osv16",616], + "5740738339752793113": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "14540578324750869319": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7274179284676568361": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "3499243120652875549": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "2114599010013594942": ["convolution_gpu_bfyx_gemm_like",2], + "15331103261044247142": ["convolution_gpu_bfyx_os_iyx_osv16",171], + "7084646429975006971": ["convolution_gpu_bfyx_1x1",2], + "13590444711975157776": ["convolution_gpu_bfyx_direct_10_12_16",1], + "18103534417093702556": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "15011504472108164173": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12776081190690731910": ["convolution_gpu_bfyx_os_iyx_osv16",173], + "6556424924189200804": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "9584652777232392944": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "10014448860206587805": ["convolution_gpu_bfyx_gemm_like",2], + "6418327009347170687": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "16541722316343690197": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "4625107584562815965": ["convolution_gpu_bfyx_os_iyx_osv16",224], + "7172604084103519563": ["convolution_gpu_bfyx_os_iyx_osv16",997], + "226601879759378771": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "5118467701668427545": ["convolution_gpu_bfyx_os_iyx_osv16",110], + "9514210061704584354": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "10914921540144371519": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13583166868754499339": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "13210604117940125947": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "5311718276151327830": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "12467673564660108244": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "11315238071192463859": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8220168481755031959": ["convolution_gpu_bfyx_os_iyx_osv16",554], + "18203935818408469865": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "11069983292783104310": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "16945184617367657570": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "17109520309574369561": ["convolution_gpu_bfyx_os_iyx_osv16",298], + "13200151444914751729": ["convolution_gpu_bfyx_os_iyx_osv16",518], + "1299545313185409227": ["convolution_gpu_bfyx_os_iyx_osv16",7], + "11756650366229979428": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "755414184406250882": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "9454954846682513038": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "15155676074658242659": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "7693459946348737411": ["convolution_gpu_bfyx_os_iyx_osv16",574], + "9131183544020825260": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "6571438978296387721": 
["convolution_gpu_bfyx_os_iyx_osv16",704], + "5585398540591396124": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "528295119724008711": ["convolution_gpu_bfyx_os_iyx_osv16",48], + "7334966010680206302": ["convolution_gpu_bfyx_gemm_like",2], + "12427258337646070422": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5012013738970489338": ["convolution_gpu_bfyx_1x1",2], + "11856266545854830143": ["convolution_gpu_bfyx_gemm_like",2], + "8943913562339525413": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "265124365266629363": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "17512961503976896701": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "13954821927253849036": ["convolution_gpu_bfyx_os_iyx_osv16",750], + "14532519639619315651": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "18393312550272875456": ["convolution_gpu_bfyx_1x1",2], + "13115589642140732066": ["convolution_gpu_bfyx_os_iyx_osv16",625], + "2964705957088952872": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "4216958486055161753": ["convolution_gpu_bfyx_gemm_like",2], + "4476928353532757380": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "8708643228914766202": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "5326247361632903583": ["convolution_gpu_bfyx_gemm_like",2], + "991586070509079617": ["convolution_gpu_bfyx_gemm_like",2], + "6181651715051152713": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "482564204402769504": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "1914964404168211864": ["convolution_gpu_bfyx_gemm_like",2], + "577844026691991089": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8263423704888556491": ["convolution_gpu_bfyx_os_iyx_osv16",54], + "4084516853815444743": ["convolution_gpu_bfyx_os_iyx_osv16",950], + "2103882464623009432": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9226912483632588371": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "8762901342272872498": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "9942099207256025216": ["convolution_gpu_bfyx_gemm_like",2], + "16781187505186394353": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10398572248321217585": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11932770338770247767": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "10084794570892043447": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "7177837234452118325": ["convolution_gpu_bfyx_os_iyx_osv16",391], + "13364676690016875118": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "3622409603053918029": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10601684126917601680": ["convolution_gpu_bfyx_gemm_like",2], + "11604794601689380990": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "17983556812075120553": ["convolution_gpu_bfyx_1x1",2], + "2940027113687311893": ["convolution_gpu_bfyx_gemm_like",1], + "14343008518525689150": ["convolution_gpu_bfyx_1x1",2], + "14046990030104971367": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "8303211644727914658": ["convolution_gpu_bfyx_1x1",2], + "13144385730409574259": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "10706267011822108376": ["convolution_gpu_bfyx_1x1",0], + "6339908713513858301": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "5649082203775427830": ["convolution_gpu_bfyx_gemm_like",2], + "15216108478837665623": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "4560479630843098090": ["convolution_gpu_bfyx_gemm_like",2], + "13484950419220835364": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "16731107540370927220": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17214254645087272557": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "7072606962946873975": 
["convolution_gpu_bfyx_os_iyx_osv16",952], + "8354579049246302728": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "6585223640997887253": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "11669828823444745889": ["convolution_gpu_bfyx_gemm_like",2], + "3138374672801504481": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "5159738930501638535": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "1103228955716492167": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2065752819810364738": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "10536316961655703500": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "14222482954865351228": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "12353956380178079089": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9748307611165615848": ["convolution_gpu_bfyx_gemm_like",2], + "1551596771935253711": ["convolution_gpu_bfyx_gemm_like",1], + "6398819277350155011": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "1154228007901031779": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "10323345824599612614": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "8463615810239412362": ["convolution_gpu_bfyx_1x1",2], + "7199295899520406795": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9191832520273617003": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "16352331970945217438": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "7759812946257541251": ["convolution_gpu_bfyx_os_iyx_osv16",109], + "2188101366183302888": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9601412379897937608": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "12022152681602871455": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "12946531140050029900": ["convolution_gpu_bfyx_os_iyx_osv16",1115], + "6438522646185979880": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "4584970211859494304": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "8881150100883636392": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "17154337492545826355": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "14363654136811880073": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "1143214652021653634": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "17123153447808465303": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "11956435900037329302": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "14251848023416168295": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "12495003066477974474": ["convolution_gpu_bfyx_os_iyx_osv16",109], + "8615481457481938667": ["convolution_gpu_bfyx_os_iyx_osv16",45], + "805221045541170643": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "9928406318940388716": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "15494543914974994991": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "14383657211047876136": ["convolution_gpu_bfyx_os_iyx_osv16",1000], + "14398854364550406668": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "6623182990939010641": ["convolution_gpu_bfyx_gemm_like",2], + "13093429681061786539": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "11324851661119942609": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "11622925573287101001": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "15094664469997373662": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "16811402686462277562": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "15555083739490354527": ["convolution_gpu_bfyx_gemm_like",2], + "10023279637210292010": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "4759671642533786591": ["convolution_gpu_bfyx_os_iyx_osv16",553], + "15781622938833984014": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "1930929857644673460": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "941829593638869991": 
["convolution_gpu_bfyx_os_iyx_osv16",87], + "7924408980408826942": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "4026686872534942904": ["convolution_gpu_bfyx_os_iyx_osv16",175], + "1704404203639481753": ["convolution_gpu_bfyx_gemm_like",2], + "1997392406402548974": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "7132328255408635227": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5495063314176654751": ["convolution_gpu_bfyx_gemm_like",2], + "5044721291675005144": ["convolution_gpu_bfyx_1x1",2], + "3106911159524421371": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "9421643783312790618": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9868561386826862471": ["convolution_gpu_winograd_6x3_s1_fused",2], + "11043866034742707103": ["convolution_gpu_bfyx_os_iyx_osv16",1071], + "9813748068195103720": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "4999505377862312410": ["fully_connected_gpu_bf_io_input_spatial",1], + "5572956736535433608": ["convolution_gpu_bfyx_1x1",1], + "5649150695527000655": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "18269685060032395235": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "14558572801374416278": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "14204609663091442879": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1361159591875955678": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "17891499682354369344": ["convolution_gpu_bfyx_gemm_like",2], + "2345023488044002149": ["convolution_gpu_bfyx_os_iyx_osv16",1045], + "6733731409232284409": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "7570346182940928159": ["convolution_gpu_bfyx_gemm_like",2], + "14872992823083730615": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "5277400567128489977": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "7450417963648518926": ["convolution_gpu_bfyx_gemm_like",2], + "9194788897910888066": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "16720108310653948550": ["convolution_gpu_winograd_6x3_s1_fused",2], + "14362876471450307424": ["convolution_gpu_bfyx_1x1",0], + "6303682540621797774": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "3102816736961785641": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "851057218719456209": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "17225578855755054959": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "4544242784357021697": ["convolution_gpu_bfyx_gemm_like",2], + "6580334406272192111": ["fully_connected_gpu_fb_io_ref",1], + "4079026972040047969": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "13814086981499638596": ["convolution_gpu_bfyx_os_iyx_osv16",304], + "939718260623752240": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "4154403364889130045": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "15967614281807823696": ["convolution_gpu_bfyx_os_iyx_osv16",1089], + "3024355261291518180": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "4806571630436601566": ["fully_connected_gpu_bf_io_input_spatial",4], + "15820359925623438341": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "4834446692898125871": ["convolution_gpu_bfyx_gemm_like",2], + "14466032674083938714": ["convolution_gpu_bfyx_direct_10_12_16",1], + "709835724029986012": ["convolution_gpu_bfyx_os_iyx_osv16",470], + "2638131332283395057": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7624259732952222597": ["convolution_gpu_bfyx_gemm_like",2], + "10576856554114055028": ["convolution_gpu_bfyx_gemm_like",2], + "13649894122307008732": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "12558716383635737426": ["convolution_gpu_bfyx_os_iyx_osv16",19], + "18377298651236993830": ["convolution_gpu_bfyx_os_iyx_osv16",182], + 
"5056859994174498686": ["convolution_gpu_bfyx_gemm_like",1], + "16084700435355748612": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "12914986936318857086": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "3062101811226530720": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "879896719155824868": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "13644681270630373984": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4282668574670785584": ["convolution_gpu_bfyx_gemm_like",2], + "8618835732380720921": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5269172622193124300": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "5415319660821122528": ["fully_connected_gpu_bf_io_input_spatial",2], + "6025872155179042054": ["convolution_gpu_bfyx_os_iyx_osv16",1019], + "15578456771467281881": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8787438180071123604": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9280431727790048190": ["convolution_gpu_bfyx_1x1",2], + "9452470718398027950": ["convolution_gpu_bfyx_os_iyx_osv16",650], + "17723621158215826108": ["convolution_gpu_bfyx_gemm_like",2], + "10700011669103135203": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "13973028408397200796": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "1152693503778768433": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "2438374917504708831": ["convolution_gpu_bfyx_gemm_like",2], + "5592428580503282095": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "4124478505694604763": ["convolution_gpu_bfyx_1x1",0], + "14878347463243157447": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "11284755586130392759": ["convolution_gpu_bfyx_os_iyx_osv16",516], + "9404953235624894187": ["convolution_gpu_bfyx_os_iyx_osv16",469], + "1867337342417952506": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "15713964605078748923": ["convolution_gpu_bfyx_gemm_like",2], + "4082229510324076196": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4408600136502382976": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "1788455099959676873": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "30229601562833524": ["convolution_gpu_bfyx_gemm_like",2], + "394778201589371681": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "6312971928547466668": ["convolution_gpu_bfyx_os_iyx_osv16",1032], + "13869716373706247686": ["convolution_gpu_bfyx_gemm_like",2], + "15334195300678132907": ["fully_connected_gpu_bf_io_gemm",2], + "1369161172432667462": ["convolution_gpu_bfyx_gemm_like",0], + "15696910741835640150": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "6860503758000008398": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "3782239800777370325": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "10642327923162019888": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "4129722446574108695": ["convolution_gpu_bfyx_1x1",2], + "17243648226968859637": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "11645116728396933125": ["convolution_gpu_bfyx_gemm_like",2], + "11913020016435860608": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "9090828337597312855": ["convolution_gpu_bfyx_gemm_like",2], + "8409488188696700816": ["convolution_gpu_bfyx_gemm_like",2], + "11893419236649064317": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "1003101267609305257": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "4850497746076450913": ["convolution_gpu_bfyx_gemm_like",0], + "3499645386058307669": ["convolution_gpu_bfyx_gemm_like",1], + "953306082374100275": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "16761856644242716357": ["convolution_gpu_bfyx_os_iyx_osv16",94], + "13025323039227543550": ["convolution_gpu_bfyx_os_iyx_osv16",951], + 
"5374969798377773063": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "18154019240019929225": ["convolution_gpu_bfyx_gemm_like",0], + "1353170363915443814": ["convolution_gpu_bfyx_direct_10_12_16",2], + "875142032423622622": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "7264274394359484318": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "10880081193716628051": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "14221578799010900252": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "5042176052323856983": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "7590767013583950613": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "12194037100109755112": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "10482582307328548806": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "17224104246148265328": ["convolution_gpu_bfyx_gemm_like",2], + "654122557966242717": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "3538679039078582272": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "9500850790449116723": ["convolution_gpu_bfyx_os_iyx_osv16",378], + "9785114056964539323": ["convolution_gpu_bfyx_os_iyx_osv16",1001], + "18384657372655350144": ["convolution_gpu_bfyx_gemm_like",2], + "13176385389367548697": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "17549411807772646930": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "11873734271080160669": ["convolution_gpu_bfyx_os_iyx_osv16",547], + "2339864165283480961": ["convolution_gpu_bfyx_1x1",2], + "6964383468476265892": ["convolution_gpu_bfyx_1x1",1], + "1471837664358450291": ["convolution_gpu_bfyx_gemm_like",2], + "16913004986170202203": ["convolution_gpu_bfyx_gemm_like",2], + "994842991399671507": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16710010075465723498": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "4428101657497677982": ["convolution_gpu_bfyx_os_iyx_osv16",125], + "3759057398165607194": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "2939605281692583169": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "12725647706191463348": ["convolution_gpu_bfyx_gemm_like",2], + "5805383505505929391": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "2423754482456771339": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7998930863626763670": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "1900375942069325499": ["convolution_gpu_bfyx_1x1",2], + "1791615587935799399": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "13538051178827008933": ["convolution_gpu_bfyx_os_iyx_osv16",458], + "17599383258252980421": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "15488340031228619748": ["convolution_gpu_bfyx_os_iyx_osv16",344], + "8922929126299811091": ["convolution_gpu_bfyx_1x1",2], + "1819720745131968914": ["convolution_gpu_bfyx_gemm_like",2], + "10573920781439771673": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "17377293745073971167": ["convolution_gpu_winograd_6x3_s1_fused",0], + "6491244517639245276": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7009873605945341897": ["convolution_gpu_bfyx_gemm_like",2], + "2150326211917340956": ["convolution_gpu_bfyx_gemm_like",2], + "9208964785762052001": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "9513032457323269513": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "17465517455679097501": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "15979956159651515122": ["convolution_gpu_bfyx_gemm_like",2], + "9477562342190423343": ["convolution_gpu_bfyx_os_iyx_osv16",1100], + "4628748977913534701": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "17306482303091342504": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "12929981792125924963": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], 
+ "15117880293418979489": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "9480653639044390919": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "5132761922124425835": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "15249442550355454201": ["convolution_gpu_bfyx_gemm_like",2], + "12512751736409465214": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1643241486250690844": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "11768117585574496387": ["convolution_gpu_bfyx_gemm_like",2], + "17281202179589913619": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "9192665896782282996": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "6928835003016610382": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "5656623709782744241": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "8866716292621164810": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "6821855018718422278": ["convolution_gpu_bfyx_os_iyx_osv16",508], + "15464327246951632247": ["convolution_gpu_bfyx_os_iyx_osv16",716], + "4897991181236908768": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "17729546848373991614": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "4429109491655891299": ["convolution_gpu_bfyx_gemm_like",1], + "15078262396281327048": ["convolution_gpu_bfyx_gemm_like",2], + "7962991673727743706": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "2242829490403202087": ["convolution_gpu_bfyx_direct_10_12_16",1], + "18419183012101393192": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "4299492266819967844": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "15838113905712517735": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1207026216972160297": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "1996860183441418841": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "12166852830214895457": ["convolution_gpu_bfyx_1x1",2], + "18431306649860116380": ["convolution_gpu_bfyx_gemm_like",1], + "14418429155823196539": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "14695781272831602408": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "7700321970687976931": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "14614844213016502202": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "15901724303713479611": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "10109431802089940590": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "13078401519973360182": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8002233052700666718": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "4533786844080178561": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "14403132596827435096": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "14184895905338394239": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "3600066510593746268": ["convolution_gpu_bfyx_os_iyx_osv16",643], + "6800893510381991731": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "3689722043202617487": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "14352303529756685990": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "3988024997010367546": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "11022847760121601465": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "6712698149192186833": ["convolution_gpu_bfyx_gemm_like",2], + "7561096442572829049": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "5319668297345215520": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "7565221050911842393": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "1435153323458789173": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "6084775920382972735": ["convolution_gpu_bfyx_os_iyx_osv16",1016], + "12802517759474139810": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "16617945088781950664": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + 
"13596876807637507229": ["convolution_gpu_bfyx_1x1",2], + "16327433707667075261": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8132521728369930959": ["convolution_gpu_bfyx_gemm_like",2], + "5688478347124565305": ["convolution_gpu_bfyx_os_iyx_osv16",1040], + "17106086048442658788": ["convolution_gpu_bfyx_gemm_like",2], + "15047676717402283805": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "11619548409913646265": ["convolution_gpu_bfyx_direct_10_12_16",2], + "291868903926685441": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "7353563160591978243": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "10118395047539851751": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "4444730303823507621": ["convolution_gpu_bfyx_gemm_like",0], + "314054598858070952": ["convolution_gpu_bfyx_gemm_like",1], + "2912858944747613525": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "18210370419559876426": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "7918742312252115870": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "8133587696326295326": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "10294610483561043024": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "15790005937034794347": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "10724501418439612080": ["convolution_gpu_bfyx_os_iyx_osv16",643], + "579781312141502576": ["convolution_gpu_bfyx_1x1",2], + "2704063557078535883": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "2984726467649419856": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "8616686489737649890": ["convolution_gpu_bfyx_os_iyx_osv16",923], + "2502125887857336825": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "1742897526168249500": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "15847413004526420496": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "13459514533473657102": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "6214194654733781771": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "6635217802203685464": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "3522383297921565178": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "15322019609805777935": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "5657471280535146301": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4994591211723226974": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "1218323229202187514": ["convolution_gpu_bfyx_gemm_like",2], + "15112599407339712681": ["convolution_gpu_bfyx_1x1",2], + "7954972694876158422": ["convolution_gpu_bfyx_1x1",2], + "11951606039079763598": ["convolution_gpu_bfyx_gemm_like",1], + "10408322429232132983": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "16135569134646688251": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "10000618285883395700": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "2609454334520044465": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "13608239208821071914": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "14057348639391787117": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "3872151366780051246": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1157947252370351851": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13104509059416300615": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "7840653268996892538": ["convolution_gpu_bfyx_gemm_like",0], + "14599780481362761532": ["convolution_gpu_bfyx_direct_10_12_16",2], + "708452703070938673": ["convolution_gpu_bfyx_os_iyx_osv16",713], + "15428591250165788477": ["convolution_gpu_bfyx_os_iyx_osv16",717], + "8929453032482114162": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "7941729567451949422": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9152433123828445089": 
["convolution_gpu_bfyx_os_iyx_osv16",751], + "6670327979947471550": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "216603198215625772": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "8792202318168046223": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "481328129206881674": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "4229105529069729944": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "10429613013253088132": ["convolution_gpu_bfyx_gemm_like",2], + "16362139250976572928": ["convolution_gpu_bfyx_os_iyx_osv16",196], + "4236174000795439083": ["convolution_gpu_bfyx_gemm_like",2], + "4285475880886685878": ["convolution_gpu_bfyx_gemm_like",2], + "13483175684542464385": ["convolution_gpu_bfyx_os_iyx_osv16",950], + "85050336704401597": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "16884396694505987920": ["convolution_gpu_bfyx_os_iyx_osv16",252], + "669771152920944125": ["convolution_gpu_bfyx_gemm_like",0], + "11604111639041106489": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "14123081378489325832": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "12809199739984715013": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "4911903898045460096": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "4531222427159927606": ["convolution_gpu_bfyx_gemm_like",2], + "1332624116953483870": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "5911282942658469852": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16020916772006653269": ["convolution_gpu_bfyx_1x1",2], + "6553736978928374036": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "4897448054295474302": ["convolution_gpu_bfyx_gemm_like",2], + "10071449674652717890": ["convolution_gpu_bfyx_gemm_like",2], + "15239764240622554314": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "15235409162483701027": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "4099859307693687554": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "16234606052818596502": ["convolution_gpu_bfyx_os_iyx_osv16",95], + "14962768577232034246": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "7903891232234389925": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "1436052878894538927": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "4670443882075998209": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "15800447082078291243": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "15595549493819416194": ["convolution_gpu_bfyx_os_iyx_osv16",484], + "12917241193304093727": ["convolution_gpu_bfyx_gemm_like",2], + "12382399034878624010": ["convolution_gpu_bfyx_gemm_like",2], + "14902389080201926109": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "9702618600245321109": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "15209909241815414156": ["convolution_gpu_bfyx_os_iyx_osv16",552], + "14152716242882609401": ["convolution_gpu_bfyx_gemm_like",2], + "7800015766976654402": ["convolution_gpu_bfyx_gemm_like",2], + "5183231560876991543": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "10130171279527667782": ["convolution_gpu_bfyx_gemm_like",0], + "15959543980008442942": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "15003778740401601065": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "17724604495865223459": ["convolution_gpu_bfyx_gemm_like",2], + "12675840135830047968": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "3036808833459559381": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "4914474312076193952": ["convolution_gpu_bfyx_gemm_like",0], + "10169992769527680821": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "11499219760597131534": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "1201692134690347847": ["convolution_gpu_bfyx_os_iyx_osv16",320], + 
"11459784003592366395": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "1594612401422787491": ["convolution_gpu_bfyx_gemm_like",2], + "12228610148087508521": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "11703557271443535142": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "2007192658799516915": ["fully_connected_gpu_bf_io_gemm",1], + "3017891343734146267": ["convolution_gpu_bfyx_os_iyx_osv16",855], + "49948277487706148": ["convolution_gpu_bfyx_1x1",1], + "15916505622570323098": ["convolution_gpu_bfyx_os_iyx_osv16",45], + "4163359403543480821": ["fully_connected_gpu_bf_io_input_spatial",0], + "17427036330773218054": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "16043683538361975370": ["convolution_gpu_bfyx_gemm_like",2], + "10591379189397010097": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "10433541468308381909": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "3041752019114501584": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "10328182165125764988": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "7143510787416483146": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "9723314434598141024": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "3928266232090746643": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "7139714914586273766": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "7817036102984218692": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "59739211822469868": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "5482851829165191681": ["convolution_gpu_bfyx_os_iyx_osv16",642], + "15899192375330393731": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "6345550009198921347": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "2439993891369206440": ["convolution_gpu_bfyx_1x1",2], + "10892456883214928095": ["convolution_gpu_bfyx_os_iyx_osv16",549], + "2920840796593281126": ["convolution_gpu_bfyx_os_iyx_osv16",1019], + "10728212277329722684": ["convolution_gpu_bfyx_gemm_like",2], + "4252157815622916471": ["convolution_gpu_bfyx_1x1",2], + "17824431042110985323": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "18253784177599134876": ["convolution_gpu_bfyx_os_iyx_osv16",186], + "1485662490111767875": ["fully_connected_gpu_fb_io_b8_f8_vload",1], + "9383182168277796969": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "9525535670799618110": ["convolution_gpu_bfyx_os_iyx_osv16",270], + "9205978149692979955": ["convolution_gpu_bfyx_gemm_like",2], + "220326805056361171": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "654821507679356726": ["convolution_gpu_bfyx_os_iyx_osv16",621], + "17791773192152464021": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16692569816843207989": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "11275109735493317886": ["convolution_gpu_bfyx_gemm_like",2], + "9270950131920019932": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "8464582977975377118": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12159582810513550491": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7824524940405130010": ["convolution_gpu_winograd_6x3_s1_fused",2], + "14994322266840011040": ["convolution_gpu_bfyx_gemm_like",2], + "2625969259447793593": ["convolution_gpu_bfyx_1x1",2], + "11455518069358829249": ["convolution_gpu_bfyx_os_iyx_osv16",419], + "14532844474906286088": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "3063055767192991776": ["convolution_gpu_bfyx_os_iyx_osv16",641], + "5390559917122707732": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "7398196853452900099": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "8207349115037232863": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "10290107543739998181": 
["fully_connected_gpu_bf_io_input_spatial",0], + "6254141935545262078": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3011188207492335920": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "9040145293899470160": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "15672624168541469192": ["convolution_gpu_bfyx_gemm_like",2], + "8650948093564284852": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "11530101016435264783": ["convolution_gpu_bfyx_os_iyx_osv16",700], + "6656593119788274992": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "14429081455612806819": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "17089801601582809764": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "3563872903821081702": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16839741351990811959": ["convolution_gpu_bfyx_gemm_like",2], + "12391792381149655331": ["convolution_gpu_bfyx_gemm_like",2], + "4856470441452830056": ["convolution_gpu_bfyx_gemm_like",2], + "5673972310424776040": ["convolution_gpu_bfyx_gemm_like",2], + "11901740241052104941": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "5179760459095053114": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "3218248162832023196": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "12946540633035976364": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "10916647716124396856": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "9406763539724266157": ["convolution_gpu_bfyx_1x1",2], + "2597453794298356435": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8321769923556905957": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "3816674884393241704": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "18436249934780056991": ["convolution_gpu_bfyx_os_iyx_osv16",1048], + "2909728331855309274": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "708747442142592697": ["convolution_gpu_bfyx_os_iyx_osv16",469], + "2903605246599054308": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "5927467766675317093": ["fully_connected_gpu_bf_io_input_spatial",2], + "11609821372586026178": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "3820661057776133570": ["convolution_gpu_bfyx_1x1",2], + "9631545863582097486": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "12069726772532946193": ["convolution_gpu_bfyx_os_iyx_osv16",132], + "9502195532658935521": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "10722677916294015259": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "6711878663358611849": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "5597908143491399643": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "490931535580183607": ["convolution_gpu_bfyx_os_iyx_osv16",486], + "6101196122606108273": ["convolution_gpu_bfyx_gemm_like",2], + "5245308722062496788": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "10292585962794261197": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "6949539207944972855": ["convolution_gpu_bfyx_gemm_like",2], + "16403423801823379909": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "13912843078550000960": ["convolution_gpu_bfyx_os_iyx_osv16",640], + "9819596940685093690": ["convolution_gpu_bfyx_os_iyx_osv16",483], + "4959403414256988744": ["convolution_gpu_bfyx_gemm_like",1], + "17381516856910544374": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "7565867291827884997": ["convolution_gpu_bfyx_gemm_like",2], + "2762489653422414995": ["convolution_gpu_bfyx_gemm_like",2], + "14947798627499698329": ["convolution_gpu_bfyx_gemm_like",2], + "17010172246526353957": ["convolution_gpu_bfyx_1x1",2], + "11929531534620071758": ["convolution_gpu_bfyx_os_iyx_osv16",233], + "2826762745628486040": ["convolution_gpu_bfyx_os_iyx_osv16",370], + 
"7748233564411787605": ["convolution_gpu_bfyx_gemm_like",2], + "3889519976910355277": ["fully_connected_gpu_bf_io_input_spatial",2], + "11797601971796699898": ["convolution_gpu_bfyx_gemm_like",2], + "11025471731438443683": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "331661172067077796": ["convolution_gpu_bfyx_1x1",2], + "13497225521878034159": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "18174857480705846286": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "1663285216972929652": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "5334190564423375247": ["convolution_gpu_bfyx_os_iyx_osv16",559], + "693883892843558363": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "6193161166790398003": ["convolution_gpu_bfyx_gemm_like",2], + "15109847707903824859": ["convolution_gpu_bfyx_1x1",2], + "7862815466573236157": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "11686670048744589243": ["convolution_gpu_bfyx_gemm_like",2], + "14686278683380845546": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "14835309921389262864": ["convolution_gpu_bfyx_1x1",1], + "3265415000818832667": ["convolution_gpu_bfyx_gemm_like",2], + "5303970743736042689": ["convolution_gpu_bfyx_gemm_like",0], + "10774528268153772208": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "10682300249493137042": ["convolution_gpu_bfyx_os_iyx_osv16",559], + "3498490999014554104": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "721174714308243785": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "7369903937189508744": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "6065819201836017182": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "14359530849521980269": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "14026537760442360645": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11169292427557543138": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "10572945270796129630": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "60509335250891515": ["convolution_gpu_bfyx_gemm_like",2], + "12515465135362865565": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "5352061583962489055": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "3154539627593235077": ["convolution_gpu_bfyx_os_iyx_osv16",646], + "15817443774186015593": ["convolution_gpu_bfyx_1x1",2], + "4455369117448405874": ["convolution_gpu_bfyx_1x1",1], + "2781309272856442321": ["convolution_gpu_bfyx_1x1",1], + "8857763129101380288": ["convolution_gpu_bfyx_os_iyx_osv16",267], + "5570311824197099845": ["convolution_gpu_winograd_6x3_s1_fused",2], + "77073286362822723": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "15759530339367380982": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "4615708568396290002": ["convolution_gpu_bfyx_1x1",2], + "17443356777503458523": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "17207560805775399864": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2608363732937932266": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "2934519615045138808": ["convolution_gpu_bfyx_os_iyx_osv16",1045], + "1697248235682953135": ["convolution_gpu_bfyx_gemm_like",1], + "16884228931101540030": ["convolution_gpu_bfyx_os_iyx_osv16",1018], + "13190888313721073437": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "10128143628088846123": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "14749947225382670869": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "11661208196482963286": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "14668725050395069435": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "7009735776703529573": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "14281201038135286621": ["convolution_gpu_bfyx_os_iyx_osv16",262], + 
"14458851250685872417": ["convolution_gpu_bfyx_gemm_like",0], + "192209423643075326": ["convolution_gpu_bfyx_gemm_like",2], + "16511749893955141055": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "18133334552107213128": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "3863816884636503247": ["convolution_gpu_bfyx_gemm_like",0], + "12510951219501865365": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "9562527071055150197": ["convolution_gpu_bfyx_1x1",1], + "10787747981914307179": ["convolution_gpu_bfyx_1x1",1], + "14744368497944610864": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "11207257238719531888": ["convolution_gpu_bfyx_gemm_like",2], + "17015791782274123780": ["convolution_gpu_bfyx_os_iyx_osv16",668], + "8306337702797456793": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "7271236108345900406": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14116800584981026541": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "1316444335300814745": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "3499109651698979012": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "4769003637955328938": ["convolution_gpu_bfyx_direct_10_12_16",1], + "18035673326929466074": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11565861421381730304": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "8690196189594920365": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9321208819255762521": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "17536308070854915513": ["convolution_gpu_bfyx_1x1",0], + "548663565933738403": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "7875724726741958520": ["fully_connected_gpu_fb_oi_ref",0], + "2008424849669196225": ["convolution_gpu_bfyx_1x1",1], + "11239541755868028928": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "3974589991022739479": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "10110395703775498948": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "3017411837779243878": ["convolution_gpu_bfyx_gemm_like",2], + "10055549084854766170": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "3017824560305532066": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "18012549942299450620": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "13468081302022888489": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "7975810844103449438": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "1107027047188366075": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "4355933224673863178": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "8780604510524622314": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "522181557896569275": ["convolution_gpu_bfyx_os_iyx_osv16",717], + "3087801652564627458": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "9840495023131952174": ["convolution_gpu_winograd_6x3_s1_fused",2], + "17759505449240263390": ["convolution_gpu_bfyx_os_iyx_osv16",691], + "8709632541892447149": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "861419637283812778": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "2862999234347597091": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "12937333118472722002": ["convolution_gpu_bfyx_gemm_like",2], + "17542414935564676110": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "11723735945517472199": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "17377315194963069204": ["fully_connected_gpu_fb_oi_ref",1], + "6195916781434462809": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17301887391757619741": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "3120553928584920777": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "13352000946213986936": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "2281119269283845320": ["convolution_gpu_bfyx_os_iyx_osv16",570], + 
"959260710517842876": ["convolution_gpu_bfyx_gemm_like",1], + "3643250372952944907": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "3916913157877412361": ["convolution_gpu_bfyx_os_iyx_osv16",250], + "9941035405796680081": ["convolution_gpu_bfyx_1x1",2], + "9737565171095493297": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15586047342916704364": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "1952863937205473292": ["convolution_gpu_bfyx_os_iyx_osv16",692], + "8439950151963452285": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "72444706264681262": ["convolution_gpu_bfyx_gemm_like",2], + "10923480230259977438": ["convolution_gpu_bfyx_1x1",1], + "16075006181495932250": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2802810524370514276": ["convolution_gpu_bfyx_gemm_like",0], + "4718716595177056289": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "713121569924250372": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "14795618530175274538": ["convolution_gpu_bfyx_os_iyx_osv16",486], + "13124342334495538095": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "10022487076451608714": ["convolution_gpu_bfyx_gemm_like",2], + "537074122417021898": ["convolution_gpu_bfyx_gemm_like",2], + "7393601059996816014": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "5963901433137582265": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "9594594523961285945": ["convolution_gpu_bfyx_os_iyx_osv16",924], + "11970881115757095265": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "8614534946699754256": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4346591404756288097": ["convolution_gpu_bfyx_gemm_like",2], + "3644282167178264526": ["convolution_gpu_bfyx_gemm_like",1], + "7372956570616880244": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "5994204139128667921": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "14088382963493477342": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "11806402239500046867": ["convolution_gpu_bfyx_gemm_like",0], + "10308113903347312964": ["convolution_gpu_bfyx_gemm_like",2], + "7678457226823073886": ["convolution_gpu_bfyx_os_iyx_osv16",926], + "12962552332511702682": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "2969389503332309296": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "10672380526821947133": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "5197105253412476591": ["convolution_gpu_bfyx_gemm_like",2], + "905780459938651623": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "8532217744217419503": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "419783127503173016": ["convolution_gpu_bfyx_os_iyx_osv16",675], + "5584432943673435454": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "9860570706348640782": ["convolution_gpu_bfyx_gemm_like",2], + "2857337999074313592": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "13540002981450186147": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "2369451367723962073": ["convolution_gpu_bfyx_1x1",1], + "12822126914959112382": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "17219920118109316867": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "761169277744593430": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "4772696293208603817": ["convolution_gpu_bfyx_gemm_like",1], + "16397733032387984819": ["convolution_gpu_bfyx_os_iyx_osv16",316], + "11597391933877736800": ["convolution_gpu_bfyx_gemm_like",2], + "10930640103080573253": ["convolution_gpu_bfyx_1x1",0], + "529543453251381109": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "4228437925117070319": ["convolution_gpu_bfyx_1x1",2], + "10797908931694274013": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "4226968857681929488": 
["convolution_gpu_bfyx_os_iyx_osv16",46], + "14898892437285105327": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "826850797666395121": ["convolution_gpu_bfyx_gemm_like",2], + "7602222004475424358": ["convolution_gpu_bfyx_gemm_like",1], + "10330180429524641331": ["convolution_gpu_bfyx_os_iyx_osv16",1019], + "4491380839102267034": ["convolution_gpu_bfyx_gemm_like",1], + "388828310152538138": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "7264756313770306662": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "17774424004510360936": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "7818381040882768404": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "10279778381617181802": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "17746215841755337461": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10991423760161409883": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "12450814729547235386": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "11087413527078604815": ["convolution_gpu_bfyx_gemm_like",2], + "11718418772370938734": ["convolution_gpu_bfyx_os_iyx_osv16",840], + "15133468875250992696": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "9275303306340702111": ["convolution_gpu_bfyx_gemm_like",2], + "2554991397391195611": ["convolution_gpu_bfyx_gemm_like",2], + "6307939332939714967": ["convolution_gpu_bfyx_1x1",1], + "4447065688824381344": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "16789245987103323406": ["convolution_gpu_bfyx_gemm_like",2], + "12003323477818208825": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "5010119207726811326": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "9741607635826869269": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5115661026367632863": ["convolution_gpu_bfyx_os_iyx_osv16",12], + "3350601287664242323": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5353552956675518468": ["convolution_gpu_bfyx_os_iyx_osv16",456], + "9731370183088819573": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "16384186388687043048": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "4104562704039821482": ["convolution_gpu_bfyx_1x1",1], + "11936530628363072904": ["convolution_gpu_bfyx_gemm_like",2], + "11070620435959083971": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "18270587701371596297": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "13898284586432291433": ["convolution_gpu_bfyx_gemm_like",2], + "14729854278671832528": ["convolution_gpu_bfyx_os_iyx_osv16",612], + "54975980454651672": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "8746621720912032145": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "9389555743403158574": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16292848987976256449": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "18034648276860485300": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "16504962609450876148": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "6471563320494376693": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "7697369026397443797": ["convolution_gpu_bfyx_os_iyx_osv16",950], + "9519623751582710696": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "5807196005360653656": ["convolution_gpu_bfyx_gemm_like",2], + "15488550074426713959": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "11031625790234068916": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "12936220888307335332": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "4073467095502162430": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "621915374938805401": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "4754967381316623440": ["convolution_gpu_bfyx_gemm_like",2], + "8913823292181409151": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + 
"12972798847556569913": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "17793292063552633023": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "6087091876057515304": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8272823732258536202": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16105073808368936420": ["convolution_gpu_bfyx_gemm_like",2], + "4161141078006269526": ["convolution_gpu_bfyx_direct_10_12_16",2], + "378801963103874857": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "8317673282128335201": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "14224121742920800990": ["convolution_gpu_bfyx_os_iyx_osv16",878], + "1561225943337590599": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "5659168916726488798": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "13597240991532942069": ["convolution_gpu_bfyx_os_iyx_osv16",1052], + "2124033349728954551": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "6673966852801136416": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "14484890926084856480": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "18008552719153887303": ["convolution_gpu_bfyx_os_iyx_osv16",7], + "970768445746568749": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "4180325737406616940": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "17174919737114915467": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5124080536266387783": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "3480732841490521799": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "7481256533438761028": ["convolution_gpu_bfyx_os_iyx_osv16",1069], + "3491333679577961640": ["convolution_gpu_bfyx_gemm_like",2], + "1841155673858789206": ["fully_connected_gpu_fb_oi_ref",1], + "13853630125050609175": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "8576733135863336233": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "4142978475842207311": ["convolution_gpu_bfyx_gemm_like",2], + "5145853681977610916": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "743941460026466526": ["convolution_gpu_bfyx_os_iyx_osv16",600], + "2668729552208169959": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1033385936344875354": ["convolution_gpu_bfyx_gemm_like",2], + "13139625572508441980": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "15289152041466330689": ["convolution_gpu_bfyx_os_iyx_osv16",267], + "11327228813412934262": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "13436376034548670107": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "178353385245384751": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "11987564534722442223": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "15231987838322151865": ["convolution_gpu_bfyx_1x1",2], + "852092858392507925": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "13268525255152984893": ["convolution_gpu_bfyx_os_iyx_osv16",923], + "17021925795809437171": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7104756264011682902": ["convolution_gpu_bfyx_gemm_like",1], + "13553263424160050064": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "14763982961176216679": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "5336120047683197088": ["convolution_gpu_bfyx_gemm_like",2], + "5303170164698694791": ["fully_connected_gpu_bf_io_gemm",1], + "4720851194954041037": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "4118073384938355655": ["convolution_gpu_bfyx_os_iyx_osv16",1050], + "10306542963828398049": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "10989937450490049763": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "7000524935770116969": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "3001615302961701154": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + 
"641417817126876622": ["convolution_gpu_bfyx_gemm_like",2], + "18199526506796726885": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11062100629646715785": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4623542918584461522": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "10747988576436391912": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "7351733901977025859": ["convolution_gpu_bfyx_os_iyx_osv16",393], + "6204183474669103812": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "1018687388655376483": ["convolution_gpu_bfyx_gemm_like",1], + "863057075064640334": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "7603319690872333930": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "10727592780669452048": ["convolution_gpu_bfyx_os_iyx_osv16",43], + "10414903047695486119": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "17713034180977313726": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "16667887002111125871": ["convolution_gpu_bfyx_gemm_like",2], + "12801481303602178879": ["convolution_gpu_bfyx_gemm_like",2], + "11083993858285515074": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "17798636687709019154": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "12864204111424196179": ["convolution_gpu_bfyx_1x1",1], + "11627532066884923848": ["convolution_gpu_bfyx_1x1",0], + "1520529227443340435": ["convolution_gpu_bfyx_os_iyx_osv16",1019], + "5211831143687501130": ["convolution_gpu_bfyx_os_iyx_osv16",677], + "475043738497218394": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "15924583510704449214": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "8146945902795164796": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "13851851281384416649": ["convolution_gpu_bfyx_1x1",1], + "14738573151275130683": ["convolution_gpu_bfyx_os_iyx_osv16",252], + "11806105193035393795": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "16011429608661242565": ["convolution_gpu_bfyx_os_iyx_osv16",969], + "5524218746051008792": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "13248567106128518549": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "13735180250757239202": ["convolution_gpu_bfyx_gemm_like",2], + "7900926714874404219": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "9421927854269492263": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "12031180482028822765": ["convolution_gpu_bfyx_gemm_like",0], + "5831419373611158773": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2038505773698938555": ["fully_connected_gpu_bf_io_gemm",2], + "5192552432194195116": ["convolution_gpu_bfyx_gemm_like",2], + "11726298758004767743": ["convolution_gpu_bfyx_os_iyx_osv16",114], + "12625112690264223217": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15857087373591747006": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "13890118723041457532": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "17310332946322628458": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "13374993751390784382": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "14447191095937730964": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "5115134711994944288": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "12375919467924385618": ["convolution_gpu_bfyx_os_iyx_osv16",486], + "14883438809987378616": ["convolution_gpu_bfyx_1x1",1], + "3509487327001107638": ["convolution_gpu_bfyx_gemm_like",2], + "12985650543127289023": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "15351724241036614758": ["convolution_gpu_bfyx_os_iyx_osv16",122], + "13954144830230671601": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "5770286476124511234": ["convolution_gpu_bfyx_gemm_like",0], + "12956726277674279950": 
["convolution_gpu_bfyx_os_iyx_osv16",771], + "15006321421735686121": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "18423051691107460439": ["convolution_gpu_bfyx_os_iyx_osv16",1089], + "8075180350084516696": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "9328223957245552723": ["convolution_gpu_bfyx_os_iyx_osv16",1100], + "14424566003632608852": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14301049621912707511": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "8251544171504007740": ["convolution_gpu_bfyx_gemm_like",2], + "8040001390872143271": ["convolution_gpu_bfyx_gemm_like",2], + "6798405629870473128": ["convolution_gpu_bfyx_1x1",2], + "2194607895573544953": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "9751582946441607796": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "16911464046178654033": ["convolution_gpu_bfyx_1x1",2], + "15882969506682501496": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "12228963567837353733": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "3226193790517362610": ["convolution_gpu_bfyx_1x1",2], + "10016815108730511683": ["convolution_gpu_bfyx_gemm_like",2], + "7056030150365552588": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "712165731154577189": ["convolution_gpu_bfyx_os_iyx_osv16",600], + "12348135936862667024": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "3388752887767453958": ["convolution_gpu_bfyx_gemm_like",2], + "5754844816339228920": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "4339711224604149541": ["convolution_gpu_bfyx_gemm_like",2], + "3291180926381314705": ["convolution_gpu_bfyx_os_iyx_osv16",995], + "13646974121952099172": ["convolution_gpu_bfyx_gemm_like",1], + "3441335188113424896": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10037086825900566930": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "10556089809203693400": ["convolution_gpu_bfyx_os_iyx_osv16",666], + "11120846960057008937": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "4716188972902735458": ["convolution_gpu_bfyx_gemm_like",2], + "13054405729329143152": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "13503688893307029975": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "13681462437496627948": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "9175450649281374948": ["convolution_gpu_bfyx_os_iyx_osv16",111], + "7689320135952025041": ["convolution_gpu_bfyx_gemm_like",0], + "10917498758625273194": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "7870154008378361670": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "3106710091841093202": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "18067291256808591467": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "10171373375072694210": ["convolution_gpu_bfyx_1x1",2], + "1801731858063091191": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "9243949750444156746": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7802311886554362782": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "11640225461345567929": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "14011124615649605281": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "8541982562061181756": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13815395589135469450": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "7351401242363888463": ["convolution_gpu_bfyx_gemm_like",2], + "17025268985366223779": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2012181953284568566": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "10544411879329675593": ["convolution_gpu_bfyx_os_iyx_osv16",764], + "3831261590121101287": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "5074273865983613482": ["convolution_gpu_bfyx_os_iyx_osv16",588], + 
"13338594271376045657": ["convolution_gpu_bfyx_gemm_like",0], + "18126685473408206840": ["convolution_gpu_bfyx_os_iyx_osv16",903], + "4885944395876887711": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "18424912460022156378": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "5933743119393822386": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "3806761527342944195": ["convolution_gpu_bfyx_gemm_like",2], + "17672785701483179117": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "18427056032084727710": ["convolution_gpu_bfyx_os_iyx_osv16",485], + "4894227264080887361": ["convolution_gpu_bfyx_os_iyx_osv16",753], + "4161001033681779582": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "1075027491444288875": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17053671692908867872": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "18128162750557822655": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "11254635684957519432": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "3007637520820789085": ["convolution_gpu_bfyx_os_iyx_osv16",491], + "8561261337239934159": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "768720470104458759": ["convolution_gpu_bfyx_os_iyx_osv16",1017], + "15961487889420208188": ["convolution_gpu_bfyx_gemm_like",0], + "4239415134522959352": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "5864250949922222051": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "14263790627243107300": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7958443549125799229": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "16789135236017252073": ["convolution_gpu_bfyx_gemm_like",1], + "10486348549691280032": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "13776178598632392721": ["convolution_gpu_bfyx_os_iyx_osv16",173], + "17309326904418811234": ["convolution_gpu_bfyx_os_iyx_osv16",551], + "15374625876485618845": ["convolution_gpu_bfyx_gemm_like",2], + "16863960779539003201": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "2490155559809645659": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "13320675959188615441": ["convolution_gpu_bfyx_gemm_like",2], + "12308359047798183133": ["convolution_gpu_bfyx_os_iyx_osv16",179], + "12393385058735194260": ["convolution_gpu_bfyx_gemm_like",2], + "1706927777850488363": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "739676584505475609": ["convolution_gpu_bfyx_gemm_like",2], + "2096779676054335057": ["convolution_gpu_bfyx_gemm_like",2], + "12954154886708228545": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "2722124265986526212": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "5389189982064081933": ["convolution_gpu_bfyx_os_iyx_osv16",549], + "9933958860597451711": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "7869916853707978306": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "16395067736440127496": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "10384537928514123040": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "9737833587413114584": ["convolution_gpu_bfyx_direct_10_12_16",0], + "17809920600993699808": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "16800575429414554907": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "1318571118468536310": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "487214150851213303": ["convolution_gpu_bfyx_gemm_like",2], + "8398910340371320955": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7304346312452588844": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "1680468564927032670": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3646228701104397128": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "6644418194983229139": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "17847109385592002207": 
["convolution_gpu_bfyx_os_iyx_osv16",327], + "12015336418727455195": ["convolution_gpu_bfyx_1x1",2], + "17922279129043570176": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "9468684953949274635": ["convolution_gpu_bfyx_gemm_like",1], + "14025496192869856801": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "13800760323805415740": ["convolution_gpu_bfyx_gemm_like",2], + "3883845471211207871": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "731825454731954517": ["convolution_gpu_bfyx_gemm_like",1], + "863952266514375915": ["convolution_gpu_bfyx_os_iyx_osv16",516], + "1540041682425757361": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "17770104464900126615": ["convolution_gpu_bfyx_1x1",0], + "15187035463799513424": ["convolution_gpu_bfyx_1x1",2], + "4091702228990140696": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7953255701516490034": ["convolution_gpu_bfyx_os_iyx_osv16",398], + "1044978617045366709": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "1104489643524273315": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "16587061389996963349": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "13793441296561946357": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "14335423820860953927": ["convolution_gpu_bfyx_os_iyx_osv16",881], + "17975017633455909321": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "17517495652165026573": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "12725675221990905186": ["convolution_gpu_bfyx_gemm_like",2], + "818998169319147148": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16475247464223458061": ["convolution_gpu_bfyx_gemm_like",2], + "8100595788531468781": ["convolution_gpu_bfyx_os_iyx_osv16",7], + "1885075753696445410": ["convolution_gpu_bfyx_os_iyx_osv16",861], + "16969463538496570528": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "16230621843665445228": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "16027853590391209100": ["convolution_gpu_bfyx_gemm_like",0], + "17039993918927377002": ["convolution_gpu_bfyx_os_iyx_osv16",805], + "11807282628372660280": ["convolution_gpu_bfyx_1x1",2], + "14217181622713951411": ["convolution_gpu_bfyx_gemm_like",2], + "3409043224171087168": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "16728762255357411770": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "6669808855737023569": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "14762599606783897222": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "15591167992985613695": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "9447458159095730492": ["convolution_gpu_bfyx_os_iyx_osv16",714], + "14100870590396726248": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "14554225625951128811": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "1628593159980574595": ["convolution_gpu_bfyx_os_iyx_osv16",175], + "10990741293315393791": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "3116068331849795558": ["convolution_gpu_bfyx_gemm_like",2], + "16293101831324587788": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1718634913016284523": ["convolution_gpu_bfyx_1x1",2], + "10548792624072794724": ["convolution_gpu_bfyx_os_iyx_osv16",1000], + "8241070786700614317": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1036010477232750453": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "14420809655798184553": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "11759426200341586247": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "16490405739040977260": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "11265472910579659280": ["convolution_gpu_bfyx_gemm_like",1], + "9541630719145326121": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "879939701282942121": 
["convolution_gpu_bfyx_os_iyx_osv16",45], + "10267260789603562117": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "5953754321266570854": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "13919204232414535363": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "17364712285968437405": ["convolution_gpu_bfyx_os_iyx_osv16",1017], + "6407471972820516685": ["fully_connected_gpu_fb_oi_ref",2], + "3240102173773280414": ["convolution_gpu_bfyx_1x1",1], + "11020315012951440351": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "3239033622277917802": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "11717348577195224554": ["convolution_gpu_bfyx_gemm_like",2], + "5008350851224686853": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "16910952799476896905": ["convolution_gpu_bfyx_gemm_like",2], + "10404725818204494388": ["convolution_gpu_bfyx_gemm_like",2], + "9869959062341950047": ["convolution_gpu_bfyx_1x1",2], + "3398322619007806698": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "9796621763733208035": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "4121109463284708890": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "12242618640422208652": ["convolution_gpu_bfyx_gemm_like",1], + "11110173861174257158": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17101789600628162503": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10387844339156517393": ["convolution_gpu_bfyx_1x1",2], + "4750513665628842598": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "9757389422721488173": ["convolution_gpu_bfyx_1x1",1], + "11823205954749139338": ["convolution_gpu_bfyx_gemm_like",2], + "13208778119673683349": ["convolution_gpu_bfyx_os_iyx_osv16",13], + "17252589865292797082": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "14491949194619001237": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "9062774198518904260": ["convolution_gpu_bfyx_gemm_like",2], + "13781423818051299677": ["convolution_gpu_bfyx_os_iyx_osv16",679], + "6942622405269419082": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "16025442470600124062": ["convolution_gpu_bfyx_gemm_like",1], + "6232363902828992968": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "8083672466967374860": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "5763440554939527411": ["convolution_gpu_bfyx_os_iyx_osv16",623], + "1963081583851864291": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6410682026872155392": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4220826666482500445": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "16146350476627599543": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "1249137685908951501": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "5622089373755094139": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7000486794832106857": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "3429844423226609965": ["convolution_gpu_bfyx_gemm_like",2], + "14546281065004619074": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "17906607354577138153": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "15948383678216076358": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "14462438074931673266": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "4461989328775275994": ["convolution_gpu_bfyx_gemm_like",2], + "5079055505117153635": ["convolution_gpu_bfyx_os_iyx_osv16",573], + "11158789938857558596": ["convolution_gpu_bfyx_1x1",2], + "11690334177981352452": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "6450532136308941035": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "16266491618150971928": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "5321698540631249776": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "2929715823970060874": 
["convolution_gpu_bfyx_os_iyx_osv16",1100], + "1270307036687208396": ["convolution_gpu_bfyx_gemm_like",1], + "1120455113299469776": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "6843617687528352801": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "11398019086259011063": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11327097771110264965": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "18259656768460999562": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "7025975403069487257": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "938848188161536107": ["convolution_gpu_bfyx_1x1",1], + "12545558125736154584": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "11086699387784339943": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "4505008254511324231": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "11461581290174106570": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "15223164574152266895": ["convolution_gpu_bfyx_1x1",2], + "1509728225855233852": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "10308431308942416781": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "7024495439434892956": ["convolution_gpu_bfyx_os_iyx_osv16",198], + "17649961873981897621": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "9481675228591993785": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "5221320470007950766": ["convolution_gpu_bfyx_os_iyx_osv16",997], + "7082007579524697455": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "7134654288295280046": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "6845814820599174031": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "12668149981216388765": ["convolution_gpu_bfyx_os_iyx_osv16",19], + "3909551222373722085": ["convolution_gpu_bfyx_os_iyx_osv16",835], + "2653651564133701304": ["convolution_gpu_bfyx_os_iyx_osv16",722], + "18218755616248669884": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "15178921033274918199": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "101401523793806394": ["convolution_gpu_bfyx_gemm_like",2], + "11595465382166985232": ["convolution_gpu_bfyx_os_iyx_osv16",194], + "2235210915304938149": ["convolution_gpu_bfyx_gemm_like",2], + "16474284418841532356": ["convolution_gpu_bfyx_gemm_like",2], + "13713501506522022845": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6788311046557489996": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "6362428985273506890": ["convolution_gpu_bfyx_1x1",2], + "1569043950563130463": ["convolution_gpu_bfyx_direct_10_12_16",0], + "13161997040644039778": ["convolution_gpu_bfyx_gemm_like",2], + "16833026567865627676": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "11451740938287179908": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "1497127399271219422": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17515064188391421150": ["convolution_gpu_bfyx_gemm_like",0], + "10935309102034762723": ["convolution_gpu_bfyx_1x1",1], + "8543619733732987550": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3122997634505472500": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "2632535010129224704": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "10264913782610095832": ["convolution_gpu_bfyx_os_iyx_osv16",514], + "10058165874008941852": ["convolution_gpu_bfyx_os_iyx_osv16",551], + "659846949368492111": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "2730604806511016352": ["convolution_gpu_bfyx_gemm_like",2], + "12541834857357563605": ["convolution_gpu_bfyx_os_iyx_osv16",542], + "5040095338370816349": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "6531171505861182429": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "3412573508101980656": ["convolution_gpu_bfyx_os_iyx_osv16",690], + 
"1192279884248226739": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "14025235562200209723": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "1208161922424418734": ["convolution_gpu_bfyx_gemm_like",2], + "733956743303342862": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "13369603621524676979": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4672441137336208890": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "7292351660229751817": ["convolution_gpu_bfyx_os_iyx_osv16",673], + "4137755981477177003": ["convolution_gpu_bfyx_os_iyx_osv16",1054], + "13991205023798493715": ["convolution_gpu_bfyx_os_iyx_osv16",625], + "12004552919019936392": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2727175120437582536": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8236018377815149638": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "13902214851539825156": ["convolution_gpu_bfyx_gemm_like",0], + "16781127329510211966": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "12065769091972094756": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "6233612563637601101": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "4499586349553581439": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "4991419288164762786": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "1507839533611760093": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "15688186132508213638": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "14213516751025324346": ["convolution_gpu_bfyx_gemm_like",2], + "17775705003104146872": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "13189392239349392492": ["convolution_gpu_bfyx_os_iyx_osv16",617], + "2543995971214089085": ["convolution_gpu_bfyx_os_iyx_osv16",623], + "2287356884312581209": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11771014003680394135": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "3385797925880519845": ["convolution_gpu_bfyx_1x1",2], + "17423645390621980919": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "13357365044448426880": ["convolution_gpu_bfyx_1x1",2], + "16896833230469488924": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "726985753660756762": ["convolution_gpu_bfyx_os_iyx_osv16",641], + "10682918518101379579": ["fully_connected_gpu_bf_io_input_spatial",1], + "14578291812739325465": ["convolution_gpu_bfyx_os_iyx_osv16",1019], + "3272017687600371031": ["convolution_gpu_bfyx_gemm_like",2], + "15609627722687211129": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "8203171222962341018": ["convolution_gpu_bfyx_gemm_like",2], + "755577773771316277": ["convolution_gpu_bfyx_1x1",2], + "4138968242532400395": ["convolution_gpu_bfyx_gemm_like",1], + "952318454591754214": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "4861982518177129729": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "6329618009202266591": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "8257103926661643451": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14813178380338948912": ["convolution_gpu_bfyx_os_iyx_osv16",149], + "16698547937652264447": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "6343888265369366589": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "11587239927319376658": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "16566128345135114558": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9714508918051740792": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17542176922797334839": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3341302541468955849": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9849272539053219052": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "6443517114667332732": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "4773077837537775324": 
["convolution_gpu_bfyx_os_iyx_osv16",750], + "5941852872160795604": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "4465781406991476376": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "8444259010311137762": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "12051595062513871723": ["convolution_gpu_bfyx_1x1",2], + "7171904645566467208": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "15315327794058441258": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "15636128989267984459": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "4435224497850514394": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "17009318615658405230": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "15773157615731010456": ["convolution_gpu_bfyx_os_iyx_osv16",609], + "577182964135927041": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "5041111302824362529": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "3662747857062156477": ["convolution_gpu_bfyx_gemm_like",2], + "15031155621982459860": ["convolution_gpu_bfyx_gemm_like",2], + "4265693151382066296": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "8482147530539941792": ["convolution_gpu_bfyx_os_iyx_osv16",270], + "11308583200952256245": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3362190082518348071": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "9440117898128288296": ["convolution_gpu_bfyx_gemm_like",2], + "12225380215512887632": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "7211355951470869591": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "11772741918108731396": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "3691705516240577130": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "805131056816361237": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "6777045876155144709": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "7549378486471456156": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "14487682847898298214": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "9530116228032101908": ["convolution_gpu_bfyx_1x1",2], + "1934379409955686502": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "11975047184326016230": ["convolution_gpu_bfyx_gemm_like",2], + "1700222876284611258": ["convolution_gpu_bfyx_os_iyx_osv16",858], + "14985755375924972050": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "5754396201681434378": ["convolution_gpu_bfyx_1x1",2], + "16426179645101678763": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "14283458015244508428": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "3216877571075556066": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "10292243973236220688": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "6288489890578212082": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4660288622381620227": ["convolution_gpu_bfyx_os_iyx_osv16",1069], + "1170380397764345558": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "1077773457856682663": ["convolution_gpu_bfyx_gemm_like",2], + "14389915292223442327": ["convolution_gpu_bfyx_os_iyx_osv16",84], + "17342198739672369885": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "14006248791647711759": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "8045367391487213749": ["convolution_gpu_bfyx_1x1",2], + "16767392067294252396": ["convolution_gpu_bfyx_gemm_like",2], + "6093575518270471235": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "5592526760253524303": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "3304589333915676807": ["convolution_gpu_bfyx_gemm_like",2], + "14387756025635589673": ["convolution_gpu_bfyx_1x1",0], + "16705621644424684055": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "12635265188475834607": ["convolution_gpu_bfyx_os_iyx_osv16",881], + 
"17636500109629107732": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "2937907409658060025": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "5115007207028125638": ["convolution_gpu_bfyx_os_iyx_osv16",643], + "13264617841270329349": ["convolution_gpu_bfyx_1x1",2], + "8809017515482311843": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "7279393739634103483": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "13738760763969959522": ["convolution_gpu_bfyx_gemm_like",2], + "16076153317792960383": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2242915551775617989": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "10647227605517025377": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "16037141448095945650": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "9759380701896779097": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7311120574972466702": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "6729785110495533200": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "2294318010381635693": ["convolution_gpu_bfyx_gemm_like",1], + "1172103288112689821": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "4282198629458668761": ["convolution_gpu_bfyx_gemm_like",2], + "4550028191070279999": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "10628725059172743408": ["convolution_gpu_bfyx_gemm_like",1], + "5637480705139132901": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "12514693341682532560": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "2893564501191050837": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "1251525426317284548": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "10309083227104422150": ["convolution_gpu_bfyx_os_iyx_osv16",136], + "10883341041912056319": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "13477548641580029772": ["convolution_gpu_bfyx_gemm_like",0], + "989564341557094953": ["convolution_gpu_bfyx_os_iyx_osv16",431], + "15901675909820977223": ["convolution_gpu_bfyx_os_iyx_osv16",43], + "1082586642383386489": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "12995903177757437362": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "5522698342845820411": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "18268811652302076976": ["convolution_gpu_bfyx_gemm_like",2], + "3334339484693730802": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "7201521533301617290": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12247991248100147706": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11799179287124317845": ["convolution_gpu_bfyx_gemm_like",0], + "11175955260573469979": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "8121179472578287280": ["convolution_gpu_bfyx_os_iyx_osv16",381], + "3501882025888946886": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "1497560475414454618": ["convolution_gpu_bfyx_gemm_like",0], + "1364546124782880196": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "9762182215179534181": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "2809950092498355574": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "8671491767142900139": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "3216793152416217495": ["convolution_gpu_bfyx_gemm_like",2], + "138379779469699309": ["convolution_gpu_bfyx_gemm_like",2], + "2571882179292959757": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "14050124896329573468": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7187734276051878356": ["convolution_gpu_bfyx_gemm_like",2], + "9999553425206328238": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "12134858519320245809": ["convolution_gpu_bfyx_1x1",0], + "14810839157236175179": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2371412124305478965": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], 
+ "13002363400738122017": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "15675968397825708285": ["convolution_gpu_bfyx_os_iyx_osv16",171], + "12181889163404078773": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "6584960721513702502": ["convolution_gpu_bfyx_gemm_like",2], + "13192808619929896995": ["convolution_gpu_bfyx_os_iyx_osv16",7], + "2406816735581074778": ["convolution_gpu_bfyx_os_iyx_osv16",1100], + "12415368596357091523": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "4165036357594592683": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7307271009495440764": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "13038533272699602337": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "3501667344669686338": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16053585286807864356": ["convolution_gpu_bfyx_gemm_like",1], + "5077214229434392730": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "15962137123591591534": ["convolution_gpu_bfyx_os_iyx_osv16",726], + "15914342421266687768": ["convolution_gpu_bfyx_gemm_like",2], + "15924916465272239832": ["convolution_gpu_bfyx_gemm_like",2], + "360872770877634346": ["convolution_gpu_bfyx_gemm_like",0], + "5083163738120585821": ["fully_connected_gpu_fb_oi_ref",2], + "10019470094545733255": ["convolution_gpu_bfyx_gemm_like",2], + "852015206582470545": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2727219457659794468": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "8059328623525062913": ["convolution_gpu_bfyx_gemm_like",0], + "7162575953766465459": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "69439315851965666": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "8916983923551808409": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11130439225010714550": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "8402692278765063674": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "3621424752591567930": ["convolution_gpu_bfyx_gemm_like",1], + "8906185843274300447": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "583303098958523195": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "7104309382120208659": ["convolution_gpu_bfyx_gemm_like",2], + "2129726780118554358": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "1171681987783013074": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "4818231379191523896": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "3392693938352572136": ["convolution_gpu_bfyx_gemm_like",2], + "5629670679897666607": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "12151068022697708126": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "12782191856884962803": ["convolution_gpu_bfyx_gemm_like",2], + "13160712904661288567": ["convolution_gpu_bfyx_1x1",2], + "6104380778870471127": ["convolution_gpu_bfyx_1x1",2], + "4400247897123856252": ["convolution_gpu_bfyx_os_iyx_osv16",600], + "11147573971701279689": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "3308770992373192529": ["convolution_gpu_bfyx_os_iyx_osv16",1116], + "15190508870639648203": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "2659031931257084418": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "13855438905855887272": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "7964396197946740183": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "14122213471825630433": ["convolution_gpu_bfyx_gemm_like",1], + "1573498199681662714": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "187352687850707150": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2095802691829304676": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "15106614232165315070": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "1875764913306932583": ["convolution_gpu_bfyx_os_iyx_osv16",637], + 
"4738743763536059708": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "15914058104244750036": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "840202264034382558": ["convolution_gpu_bfyx_os_iyx_osv16",398], + "8767817856303586064": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "11507538232733291666": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13404888565084206853": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "17419874083634480896": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15914107501176673997": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "2349007644347065353": ["convolution_gpu_bfyx_gemm_like",2], + "16117738994809548007": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "9040046051053703359": ["convolution_gpu_bfyx_gemm_like",2], + "8794896449397768269": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "15193403354218116460": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "13296242326766100583": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "11878734040194151073": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "17343050785312683560": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "18125732229366977468": ["convolution_gpu_winograd_6x3_s1_fused",2], + "17912189681971987483": ["convolution_gpu_bfyx_gemm_like",0], + "8036474422877454869": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14907097142953816744": ["convolution_gpu_bfyx_gemm_like",1], + "13565691057064774487": ["convolution_gpu_bfyx_os_iyx_osv16",110], + "6942016672941874829": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "7755177205197405275": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "12348602762263193288": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "1652781065871883392": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2362092095402043749": ["convolution_gpu_bfyx_gemm_like",1], + "13938466156916423478": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "14070988879848388270": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "13663893159182636270": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "13973179950424276578": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "8079376692609682448": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "12796777049340516563": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "8700574100180128776": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "5781098222688514465": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "6767159196241633301": ["convolution_gpu_bfyx_os_iyx_osv16",490], + "17839839336294937155": ["convolution_gpu_bfyx_gemm_like",2], + "12076058470574246054": ["convolution_gpu_bfyx_os_iyx_osv16",650], + "6318228858846223186": ["convolution_gpu_bfyx_1x1",1], + "13713406612642090169": ["convolution_gpu_bfyx_os_iyx_osv16",19], + "9707630588260222630": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "5898740235388207878": ["convolution_gpu_bfyx_1x1",2], + "11992353959766718397": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "6020017927557041768": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9809458159478958866": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "17285815901490707654": ["convolution_gpu_winograd_6x3_s1_fused",2], + "11052732052072367261": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "13282951481330978659": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "3159681096461848644": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "12813978452097969536": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "10879218241103462088": ["convolution_gpu_bfyx_gemm_like",2], + "15690161340392005765": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "3599823735065658574": ["convolution_gpu_bfyx_os_iyx_osv16",463], + 
"17084977396231597605": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1822096761703761792": ["convolution_gpu_bfyx_1x1",2], + "16027456210394993913": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "2108296560864415762": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18299254635579957284": ["convolution_gpu_bfyx_1x1",2], + "18245935804520236353": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "6351347283201596793": ["convolution_gpu_bfyx_os_iyx_osv16",479], + "3806131437010910920": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "18218631037214746168": ["convolution_gpu_bfyx_os_iyx_osv16",360], + "8460847842045253466": ["convolution_gpu_bfyx_os_iyx_osv16",13], + "2273992727647793692": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13708979487306970634": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "17006133396401462698": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "1287490919205560806": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "16894871557229780934": ["convolution_gpu_bfyx_os_iyx_osv16",183], + "5288793454052261767": ["convolution_gpu_bfyx_os_iyx_osv16",1020], + "6688522645556262131": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "7454366978268164047": ["convolution_gpu_bfyx_gemm_like",2], + "17422822627612865758": ["convolution_gpu_winograd_6x3_s1_fused",2], + "13224814158106791463": ["convolution_gpu_bfyx_gemm_like",1], + "4135003545872878882": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "3316798708399098230": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "9319254979377483709": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "875400109066360897": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "9475130054420979752": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "7706714181281908433": ["convolution_gpu_bfyx_gemm_like",2], + "13046322179198317310": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "14331658870024759698": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "4135068756462147853": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10791067159964399241": ["convolution_gpu_bfyx_os_iyx_osv16",687], + "17856816245251319111": ["convolution_gpu_bfyx_os_iyx_osv16",173], + "941626985322260281": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "14389719202147508599": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "3003526572122876385": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "2543041530639980505": ["convolution_gpu_bfyx_os_iyx_osv16",1045], + "12516911293946682547": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "17742192339816511494": ["convolution_gpu_bfyx_gemm_like",2], + "4792351255949877935": ["convolution_gpu_bfyx_gemm_like",2], + "10702465758376061967": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5346898505346646714": ["convolution_gpu_bfyx_os_iyx_osv16",861], + "12577421746159122264": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "15078168059698267650": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "4084026445911476156": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "2973436171295280783": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "8819268903800581706": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "2817919813339364130": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6791806088355877039": ["convolution_gpu_bfyx_gemm_like",2], + "11666250400445971335": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "17329287216741045059": ["convolution_gpu_bfyx_gemm_like",2], + "13184662326021747000": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "18426893729833771809": ["convolution_gpu_bfyx_1x1",2], + "12992194515157698316": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "11305232900158601613": 
["convolution_gpu_bfyx_1x1",2], + "15123868617509445149": ["convolution_gpu_winograd_6x3_s1_fused",2], + "700717277178942679": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "17790026124881397912": ["fully_connected_gpu_yxfb_ref",0], + "2251029128552117936": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17854208422879910606": ["convolution_gpu_bfyx_gemm_like",0], + "11955992313739654625": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "16173557782125372935": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "4644580321919256401": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "8296551195150971668": ["convolution_gpu_winograd_6x3_s1_fused",2], + "14808895254077106198": ["convolution_gpu_bfyx_gemm_like",2], + "17889864541794448203": ["convolution_gpu_bfyx_1x1",1], + "10811837819834149164": ["convolution_gpu_bfyx_gemm_like",1], + "1843555260471832708": ["convolution_gpu_bfyx_gemm_like",2], + "3976736548270395981": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4967444801764057340": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "7500192998744460131": ["fully_connected_gpu_bf_io_input_spatial",2], + "14733510474010040334": ["convolution_gpu_bfyx_gemm_like",2], + "14136097914489095982": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "937159502066696999": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "16469788155263456039": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "5926747396493954633": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "4353842547963164546": ["convolution_gpu_bfyx_1x1",2], + "6664432489777052771": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "11430400968543668873": ["convolution_gpu_bfyx_os_iyx_osv16",623], + "14421898375873029115": ["convolution_gpu_bfyx_1x1",2], + "13800387305792597325": ["convolution_gpu_bfyx_os_iyx_osv16",664], + "14412158605670555579": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "16820082917500285799": ["convolution_gpu_bfyx_gemm_like",2], + "2265784112305305260": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "6856130385095139346": ["convolution_gpu_bfyx_os_iyx_osv16",1018], + "4617347486560666277": ["convolution_gpu_bfyx_1x1",2], + "17823133607491820214": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "17778091287904736965": ["convolution_gpu_bfyx_gemm_like",2], + "17726079670612220433": ["convolution_gpu_bfyx_gemm_like",0], + "15619086801947147359": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "973966345068677905": ["convolution_gpu_bfyx_1x1",2], + "16816222375242496370": ["convolution_gpu_winograd_6x3_s1_fused",2], + "5835634465164771899": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "18172711677056449158": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "12255528292506999241": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "4190912926126844643": ["convolution_gpu_bfyx_1x1",1], + "9144487908815767824": ["convolution_gpu_bfyx_1x1",2], + "9604982746455852556": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "8747430148550634190": ["convolution_gpu_bfyx_gemm_like",0], + "142270860894725256": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "156456996459945842": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "13787436604877398090": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "5524215233998361104": ["convolution_gpu_winograd_6x3_s1_fused",2], + "1889911210088209867": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "2967481531952454828": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "13471752029049484143": ["convolution_gpu_bfyx_gemm_like",1], + "706370730287471796": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "18132952464279667664": ["convolution_gpu_bfyx_1x1",2], + 
"9641089659148164809": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13558687084677943158": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "16425665058951535484": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "17994361454416813294": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "3190494353583341446": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15528692642731712121": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "6254161707168091438": ["convolution_gpu_bfyx_os_iyx_osv16",235], + "15161053469199826008": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "2534408579674556441": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12978370505631031751": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "12259844988981080505": ["convolution_gpu_bfyx_gemm_like",2], + "1155389358857780776": ["convolution_gpu_bfyx_os_iyx_osv16",421], + "8921636651939679647": ["convolution_gpu_bfyx_1x1",1], + "12118387933632797428": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "17796310681498690253": ["convolution_gpu_winograd_6x3_s1_fused",2], + "14792528369891965810": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17011363406405852347": ["convolution_gpu_bfyx_gemm_like",2], + "10170577772376890221": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "10837496380266058422": ["convolution_gpu_bfyx_gemm_like",2], + "5682190700442712936": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "12223993560805441284": ["convolution_gpu_bfyx_gemm_like",2], + "16307464696265537356": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "15511138074959300404": ["convolution_gpu_bfyx_gemm_like",2], + "282274448389888221": ["convolution_gpu_bfyx_os_iyx_osv16",616], + "12480527132372884168": ["convolution_gpu_bfyx_1x1",1], + "17421991623849671076": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "9824678205469832038": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15352245788978088971": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "2780423409483867058": ["convolution_gpu_bfyx_1x1",2], + "8907982643256296667": ["convolution_gpu_bfyx_1x1",2], + "9226443907548972870": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "1334070221835422461": ["convolution_gpu_bfyx_gemm_like",2], + "11868551452004726281": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "2877521658768725103": ["convolution_gpu_bfyx_gemm_like",1], + "15991460001131903561": ["convolution_gpu_bfyx_gemm_like",2], + "13760645810144930270": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "12531580106484042446": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "10488269059469838160": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "16833854122884184025": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "17026284168840448378": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "7585785802379042424": ["convolution_gpu_bfyx_1x1",2], + "10555597973766215754": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3236003754884728510": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "1920070013712913772": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "4809191606466167229": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4387041763614917736": ["convolution_gpu_bfyx_gemm_like",1], + "3819990462129075757": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "6107031848283462574": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "13960388312976163971": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "407189201971322683": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "16609136488331186895": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "14444475853714164129": ["convolution_gpu_bfyx_os_iyx_osv16",739], + "6727930402459775131": 
["convolution_gpu_bfyx_gemm_like",2], + "8174040194088942964": ["convolution_gpu_bfyx_os_iyx_osv16",950], + "4488336106517889531": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "5214654427283761256": ["convolution_gpu_bfyx_gemm_like",2], + "14650567822254940018": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "15378025640603637387": ["convolution_gpu_bfyx_os_iyx_osv16",284], + "9311802150474489673": ["convolution_gpu_bfyx_os_iyx_osv16",172], + "6067904130482758510": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "6988674007771237080": ["convolution_gpu_bfyx_gemm_like",1], + "12351866693978844266": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "3835286851569826052": ["convolution_gpu_bfyx_gemm_like",2], + "14991602704357959545": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "13709111882513486557": ["convolution_gpu_bfyx_os_iyx_osv16",618], + "18137106379929135901": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "12278364834477923930": ["convolution_gpu_bfyx_gemm_like",2], + "4239133538073498792": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "14896875712028630045": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "3706994659266083979": ["convolution_gpu_bfyx_os_iyx_osv16",547], + "14923692894655929923": ["fully_connected_gpu_bf_io_gemm",2], + "13102754309439605192": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "11724225282274130518": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7005509036795164602": ["convolution_gpu_bfyx_1x1",2], + "12566041126392848976": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "14678312911245000804": ["convolution_gpu_bfyx_os_iyx_osv16",250], + "14525127290591744848": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "38736266675995457": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "11560634267092054110": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "8751016391945753900": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "10783630257421062891": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "7808544677773370430": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "12087141795291232248": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "9426665763007611385": ["convolution_gpu_bfyx_gemm_like",2], + "3355259926747524578": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "15217183882858251099": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5420766967862917815": ["convolution_gpu_bfyx_os_iyx_osv16",233], + "6708349666663292171": ["fully_connected_gpu_fb_oi_ref",1], + "9999955037598579164": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "3880189981766119529": ["convolution_gpu_bfyx_os_iyx_osv16",17], + "6310724136390087834": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "14174888981602932979": ["convolution_gpu_bfyx_os_iyx_osv16",85], + "1644335606100150388": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "10112032316939871435": ["convolution_gpu_bfyx_os_iyx_osv16",6], + "7546586420552408243": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "9220830217525628783": ["convolution_gpu_bfyx_gemm_like",2], + "2527276292172180386": ["convolution_gpu_bfyx_gemm_like",0], + "9277176009071334860": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "13961773444580398856": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "13006774775034887171": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "6483208845600234755": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "16958329690837977102": ["convolution_gpu_bfyx_gemm_like",2], + "6664482192233202590": ["convolution_gpu_bfyx_os_iyx_osv16",1019], + "10157866834809927320": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "397770940444464146": ["convolution_gpu_bfyx_os_iyx_osv16",952], + 
"2983038203471784211": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "12831298482349900359": ["convolution_gpu_bfyx_os_iyx_osv16",250], + "4635570915184713874": ["convolution_gpu_bfyx_gemm_like",2], + "5720964268093705079": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "5019077257951332016": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "1237262535285717993": ["convolution_gpu_bfyx_os_iyx_osv16",650], + "3682813162987778705": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "2204178900998688268": ["convolution_gpu_bfyx_gemm_like",2], + "509781001842353609": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "7223801044761006523": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "12129572274423886770": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "2173867324489962689": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12617625046664709483": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "13025361884606488732": ["convolution_gpu_bfyx_gemm_like",2], + "9390478179772073718": ["convolution_gpu_bfyx_gemm_like",1], + "16295660312557315941": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "1701609125136907870": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "12339692995143159283": ["convolution_gpu_bfyx_gemm_like",2], + "3128856679264648666": ["convolution_gpu_bfyx_gemm_like",2], + "7994179151788368291": ["convolution_gpu_bfyx_os_iyx_osv16",554], + "13851240591038949807": ["convolution_gpu_bfyx_gemm_like",2], + "17515847111676784130": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "4569338575782832784": ["convolution_gpu_bfyx_gemm_like",2], + "5581428998642936688": ["convolution_gpu_bfyx_1x1",1], + "7209217811135076623": ["convolution_gpu_bfyx_gemm_like",2], + "12949204491386872217": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "12947341728489226671": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11066913713501760080": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2526832080529662683": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "11936419502418995274": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "11850332373794932468": ["convolution_gpu_bfyx_os_iyx_osv16",970], + "2296581485980163665": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "3383222668132648804": ["convolution_gpu_bfyx_direct_10_12_16",0], + "1419073145594317633": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "10187930930336324253": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "5485749317130402302": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "3499106702307464480": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "10416622008071151225": ["convolution_gpu_bfyx_os_iyx_osv16",892], + "3854114166348568039": ["convolution_gpu_bfyx_os_iyx_osv16",554], + "12790570304622911607": ["convolution_gpu_bfyx_os_iyx_osv16",182], + "60267878504897170": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "11948858355027908365": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "5242271874488296527": ["convolution_gpu_bfyx_gemm_like",2], + "15287650965861631130": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "13809330759308309353": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "15381833359831622179": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "14990645740260870030": ["convolution_gpu_bfyx_os_iyx_osv16",549], + "760687670112194844": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "2524029454785583409": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "6577505360421510286": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "4436244774193918646": ["fully_connected_gpu_fb_oi_ref",1], + "9767294641786972359": ["convolution_gpu_bfyx_gemm_like",1], + "17310409067211414565": 
["convolution_gpu_bfyx_os_iyx_osv16",906], + "9250410390663336388": ["convolution_gpu_bfyx_gemm_like",0], + "3477539135137665170": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "18381791065890314250": ["convolution_gpu_bfyx_gemm_like",0], + "12700372241799686527": ["convolution_gpu_bfyx_gemm_like",0], + "13170441257780067955": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3755253206085028904": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "3192332625020432602": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "10295330953350618042": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "16172528828198474326": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9182897385081081193": ["convolution_gpu_winograd_6x3_s1_fused",2], + "7107677063657303327": ["convolution_gpu_bfyx_1x1",2], + "12081835728078383819": ["fully_connected_gpu_bf_io_input_spatial",2], + "2394023805427701338": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "474139120607442270": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "879005904827468163": ["convolution_gpu_bfyx_os_iyx_osv16",11], + "16152775342222431281": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "11727227430687227444": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "16431857516454692096": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "3041612155708729812": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "10947686124973711385": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "10173283505468233128": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "10128120599276549920": ["convolution_gpu_bfyx_1x1",2], + "1045854873741563331": ["convolution_gpu_bfyx_gemm_like",2], + "15739278428190392018": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "4957638663977636791": ["convolution_gpu_bfyx_gemm_like",2], + "11243840588602365090": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "10795104632256101599": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "3633858263279042265": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "11992625045241269569": ["convolution_gpu_bfyx_os_iyx_osv16",677], + "13317417676446624018": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "17700958439420868719": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13972357557211413688": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "4185398348055518182": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "12894625941923144893": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "13486084204140096478": ["convolution_gpu_bfyx_gemm_like",2], + "17833517350994024381": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "15059549186302099880": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "13384754476437374504": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "11330591026581463934": ["convolution_gpu_bfyx_gemm_like",2], + "17881905640473324965": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "10058614204420018541": ["convolution_gpu_bfyx_os_iyx_osv16",381], + "17318287523550546026": ["convolution_gpu_bfyx_gemm_like",2], + "13300022131572486202": ["convolution_gpu_bfyx_gemm_like",2], + "8285478622349266483": ["convolution_gpu_bfyx_os_iyx_osv16",881], + "8032685176029570383": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "6681818065741882453": ["convolution_gpu_bfyx_gemm_like",2], + "11334122788337402526": ["convolution_gpu_bfyx_1x1",1], + "18202222342562516071": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "11473442921040533207": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "9373353053843326128": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "7937870623766562191": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "15069906408448814772": ["convolution_gpu_bfyx_os_iyx_osv16",261], 
+ "3534971503826416049": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "5766507688771440170": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "1752185056297124917": ["convolution_gpu_bfyx_1x1",1], + "9101018613418825655": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "16986610822918634530": ["convolution_gpu_bfyx_1x1",2], + "11834683513280095384": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3963106895592011725": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14171139920084409181": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5157249499936659040": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "16567638487719493784": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "9323825370872655346": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "14823616678465136590": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16430562172386510259": ["convolution_gpu_bfyx_gemm_like",2], + "6820284286806022849": ["convolution_gpu_bfyx_gemm_like",2], + "6300691162962736560": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "16362857896338778056": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "2242602888499888844": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "7333511810266504718": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "3603706453982734995": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "12421204749289937399": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "12417253210787537988": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "17908444616754154471": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14805540705424073865": ["convolution_gpu_bfyx_gemm_like",2], + "9918371346247634545": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "822162932339827810": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "9942726414918759892": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17870874477143985774": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "12794369485239257709": ["convolution_gpu_bfyx_gemm_like",0], + "18017913952946745878": ["convolution_gpu_bfyx_gemm_like",2], + "7578177053220150569": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "15247381586316467097": ["convolution_gpu_bfyx_gemm_like",2], + "14578867494693499627": ["convolution_gpu_bfyx_gemm_like",2], + "6341197991729122563": ["convolution_gpu_bfyx_os_iyx_osv16",195], + "872401732136570312": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "14905520834426630145": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "4274425737610351312": ["convolution_gpu_bfyx_os_iyx_osv16",987], + "787363431787954804": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "9423854233835016530": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7575675354187625951": ["convolution_gpu_bfyx_gemm_like",2], + "14117801387057507639": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "4408772370026995920": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "5582450255753679095": ["convolution_gpu_bfyx_1x1",2], + "15589007878875898942": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "13839116996827687373": ["convolution_gpu_bfyx_gemm_like",0], + "16924006268301179157": ["convolution_gpu_bfyx_os_iyx_osv16",718], + "10509933181132310969": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4160656836528944651": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "7843498978148810586": ["convolution_gpu_bfyx_gemm_like",2], + "10968768803038046390": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "17658152048177750315": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "1473214668483422172": ["convolution_gpu_bfyx_gemm_like",2], + "435888248913413834": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6323083153920795679": 
["convolution_gpu_bfyx_os_iyx_osv16",909], + "5801429077171542466": ["convolution_gpu_bfyx_os_iyx_osv16",94], + "12609361477548272638": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9057036344533510776": ["convolution_gpu_bfyx_gemm_like",2], + "18310667924071639899": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "3234107167862677811": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "9589942627115344216": ["convolution_gpu_bfyx_os_iyx_osv16",848], + "9173631510896381179": ["convolution_gpu_bfyx_gemm_like",2], + "17832542092610191859": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "1857923215589370245": ["convolution_gpu_bfyx_os_iyx_osv16",250], + "17995371099806008878": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "2613462626256090659": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "3573490922300056520": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "3224352307778512793": ["convolution_gpu_bfyx_gemm_like",0], + "12972634653821069685": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "14985236276429954162": ["convolution_gpu_bfyx_gemm_like",1], + "18150429561058646714": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "2221145174704245189": ["convolution_gpu_bfyx_gemm_like",1], + "4640696923527766618": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "1999979442136861875": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "7322472892320910654": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "15026219694198820614": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "8655315308767111198": ["convolution_gpu_bfyx_1x1",2], + "12107262410635772120": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "14211903923555028634": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "17303408650780384587": ["convolution_gpu_bfyx_os_iyx_osv16",559], + "13546876216568825877": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "16706121580364790904": ["convolution_gpu_bfyx_gemm_like",2], + "8376077531098664520": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "3036512701943687724": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "8939683514448064461": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "18372284940315010254": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "11655994466278963438": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "3503893875515897267": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "10218763091060511457": ["convolution_gpu_bfyx_os_iyx_osv16",99], + "6391847213494189692": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11942736969933408358": ["convolution_gpu_bfyx_gemm_like",0], + "6928136130626403937": ["convolution_gpu_bfyx_gemm_like",2], + "12952980509662451384": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "12136029303893296753": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "632116056424249698": ["convolution_gpu_bfyx_direct_10_12_16",0], + "15163327502374403643": ["convolution_gpu_bfyx_os_iyx_osv16",547], + "4056979460327024961": ["convolution_gpu_bfyx_gemm_like",0], + "12788968383428254917": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17477062954520561609": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "2162882863309264684": ["convolution_gpu_bfyx_os_iyx_osv16",679], + "1450888744802985214": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "16487774205195979355": ["convolution_gpu_bfyx_os_iyx_osv16",600], + "2418288192668085805": ["convolution_gpu_bfyx_gemm_like",2], + "9798585825695496550": ["convolution_gpu_bfyx_gemm_like",2], + "11263540528012919947": ["convolution_gpu_bfyx_1x1",2], + "16531824466148265247": ["convolution_gpu_bfyx_os_iyx_osv16",509], + "7121708962074176240": ["convolution_gpu_bfyx_1x1",1], + "10565371760124443824": 
["convolution_gpu_bfyx_os_iyx_osv16",333], + "3372770576629463160": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "8728178019712933221": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "11795826875463204296": ["convolution_gpu_bfyx_1x1",2], + "16577611471466452776": ["convolution_gpu_bfyx_gemm_like",0], + "14435120971846098308": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "9441060601228656341": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "12164298124869114517": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "16947969669087411530": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "14811603003184578943": ["convolution_gpu_bfyx_gemm_like",2], + "14431607479949498164": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "12534001599784153836": ["convolution_gpu_bfyx_os_iyx_osv16",1042], + "7397341452130124383": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "5795073619189010837": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9372916528346260712": ["convolution_gpu_bfyx_gemm_like",0], + "7232326270078161768": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "17248756229500447131": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5409924335138540834": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "8843585527713905568": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "18386376129938707290": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "1591199515536783245": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "15284262113150488297": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "2623687018437195679": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "17222005830854879661": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "2041212737963974230": ["convolution_gpu_bfyx_gemm_like",2], + "2198278382394812839": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "16770615142634470903": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "16182470664818268848": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7852144838267007144": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "8025053805734757314": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "12355112948013108181": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17344974951998490453": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "4279062247055842367": ["convolution_gpu_bfyx_gemm_like",1], + "2728938624042183713": ["convolution_gpu_bfyx_gemm_like",2], + "17193614571243427089": ["convolution_gpu_bfyx_gemm_like",0], + "6882621854468565774": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "1779941298820543013": ["convolution_gpu_bfyx_os_iyx_osv16",554], + "14603590053512154268": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "12279771749366327372": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "3737552767159920174": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13489318651148001664": ["convolution_gpu_bfyx_gemm_like",2], + "2566302789609970663": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "18233660940545931789": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "16998508915819714690": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "5951936376654416075": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "1306339989221885682": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "17634966178519099371": ["convolution_gpu_bfyx_1x1",2], + "12181607120522804433": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "10729288973933590396": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "578703329577922869": ["convolution_gpu_bfyx_os_iyx_osv16",176], + "14230493618724018658": ["convolution_gpu_bfyx_gemm_like",2], + "11897113890115321056": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "10987953316324712538": ["convolution_gpu_bfyx_os_iyx_osv16",697], 
+ "7667898603371717971": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13094402291968806996": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "2322559721899919275": ["convolution_gpu_bfyx_os_iyx_osv16",542], + "17037416417174266088": ["convolution_gpu_bfyx_gemm_like",0], + "7394217382008802567": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "13503608041359512": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "15963038745470172423": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "15726902746983125797": ["convolution_gpu_bfyx_os_iyx_osv16",250], + "9381304526221508530": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "9438739171104456179": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "17087740929472936216": ["convolution_gpu_bfyx_os_iyx_osv16",847], + "6391201577234440562": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "11113256687741667688": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "5922142661777925178": ["convolution_gpu_bfyx_gemm_like",2], + "16936366288366370882": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "503369896500284129": ["convolution_gpu_bfyx_1x1",2], + "11528310408333718862": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "8701248964531180496": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "6574971185849732667": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "7974670633697926450": ["convolution_gpu_bfyx_1x1",2], + "2622434279674583815": ["convolution_gpu_bfyx_os_iyx_osv16",362], + "14131851237755716991": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "15839295895890205274": ["convolution_gpu_bfyx_os_iyx_osv16",641], + "12908594497114706897": ["convolution_gpu_bfyx_1x1",2], + "14821616804286068969": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "13702692566238948173": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "8866736221671835567": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "10765280349477640969": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "13330734840729670622": ["convolution_gpu_bfyx_gemm_like",2], + "5334291640387922287": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "16446533347502650316": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "7524311370696987092": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "4933831571091731212": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10302338806536775954": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "12494969618927201911": ["fully_connected_gpu_yxfb_ref",0], + "14764715930784496165": ["convolution_gpu_bfyx_gemm_like",2], + "11149782181562145291": ["convolution_gpu_bfyx_gemm_like",1], + "5479761740065152589": ["convolution_gpu_bfyx_os_iyx_osv16",1100], + "14515066741400300669": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "5815789824950542164": ["convolution_gpu_bfyx_os_iyx_osv16",1090], + "1186545671730357033": ["convolution_gpu_bfyx_os_iyx_osv16",654], + "3699344686791530101": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2452226948562393335": ["convolution_gpu_bfyx_os_iyx_osv16",771], + "3180320769716158201": ["convolution_gpu_bfyx_os_iyx_osv16",483], + "7605139219344415117": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "2659712601063515059": ["convolution_gpu_winograd_6x3_s1_fused",2], + "5627834277145735283": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "17216583849049249733": ["convolution_gpu_bfyx_os_iyx_osv16",252], + "5940337324384948573": ["convolution_gpu_bfyx_gemm_like",2], + "3141886504884887200": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "598390166442977699": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "12090536142661253835": ["fully_connected_gpu_bf_io_gemm",2], + "751912075185318190": 
["convolution_gpu_bfyx_os_iyx_osv16",952], + "3860667078458481972": ["convolution_gpu_bfyx_gemm_like",1], + "7624476043779763605": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "8300655194765375060": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "5912451559447635837": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "10085059621136526248": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "14098811155652990436": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "15035800097152337587": ["convolution_gpu_bfyx_gemm_like",2], + "15447513376965243034": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "4202645222013675478": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "3024402899381804809": ["convolution_gpu_bfyx_1x1",2], + "9399994156762372761": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "4983880246908724272": ["convolution_gpu_bfyx_os_iyx_osv16",1100], + "1908809004094565452": ["convolution_gpu_bfyx_os_iyx_osv16",543], + "18267428053198215471": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "12641170321047008726": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "12868739680413736657": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "17599396373608265826": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "2140514316203117958": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "2497756607567197523": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "10865695385270390803": ["convolution_gpu_bfyx_os_iyx_osv16",1089], + "6767245864232675168": ["convolution_gpu_bfyx_gemm_like",1], + "3635446784873718932": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "16108759090923335184": ["convolution_gpu_bfyx_gemm_like",2], + "18184621367843960190": ["convolution_gpu_bfyx_os_iyx_osv16",714], + "4191326605459754690": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "9989055862610193828": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "13754540732991287617": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "2986189945936592561": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "11092828091552833150": ["convolution_gpu_bfyx_os_iyx_osv16",120], + "6631816968511312100": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "16944335478353845609": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "11418379777288974452": ["convolution_gpu_bfyx_gemm_like",2], + "16341722570340169855": ["convolution_gpu_bfyx_1x1",0], + "16139615240471264488": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "2683507674615735878": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13526488884846845330": ["convolution_gpu_bfyx_gemm_like",2], + "3070859615622845671": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "671453551040072499": ["convolution_gpu_bfyx_os_iyx_osv16",644], + "16666792471632326054": ["convolution_gpu_bfyx_gemm_like",2], + "9643408025778914022": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "2057158988261512114": ["convolution_gpu_bfyx_1x1",2], + "3221221905804708596": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "8006738296385794413": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "18121198117765854866": ["convolution_gpu_bfyx_1x1",0], + "12866217660635921034": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2133849627845285277": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "1076005730007872492": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "13698389420396031586": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "856877003890134554": ["convolution_gpu_bfyx_gemm_like",0], + "1059505639883914386": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "18173314625562011976": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3374410641320310726": ["convolution_gpu_bfyx_os_iyx_osv16",159], + 
"13302687772426736346": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "6323026044750482867": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "54019631544204590": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7076937538747704750": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "3541538046227217664": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16071723603031305677": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "2128376438627103433": ["convolution_gpu_bfyx_gemm_like",2], + "4992668316921598993": ["convolution_gpu_bfyx_os_iyx_osv16",283], + "11754316727756881612": ["convolution_gpu_bfyx_os_iyx_osv16",851], + "10544034939133448916": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "10722782762733112118": ["convolution_gpu_bfyx_1x1",2], + "5714365398623475983": ["convolution_gpu_bfyx_1x1",2], + "15048584393463312977": ["convolution_gpu_bfyx_os_iyx_osv16",200], + "13699740641705514374": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "12245096462203481681": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "16081386644309102158": ["convolution_gpu_bfyx_gemm_like",2], + "13011676362747785816": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18136135457402651842": ["convolution_gpu_winograd_6x3_s1_fused",2], + "8130920994920685157": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "14077148976508649021": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "7407975398526425554": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "14103112843209793966": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "52089503050497755": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "17170858505976681742": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "15975964562807570772": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "10512507780534402341": ["convolution_gpu_bfyx_os_iyx_osv16",1100], + "4890043345392707202": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "10136369729388564720": ["convolution_gpu_bfyx_gemm_like",2], + "17928043901784474130": ["convolution_gpu_bfyx_os_iyx_osv16",123], + "13893789954946953427": ["convolution_gpu_bfyx_os_iyx_osv16",490], + "16723478941106779069": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "16129296588866116913": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "2842103889477438816": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6302958994152837045": ["convolution_gpu_bfyx_os_iyx_osv16",123], + "13283842370311517843": ["convolution_gpu_bfyx_os_iyx_osv16",985], + "4880150897829846031": ["convolution_gpu_bfyx_1x1",1], + "875296362957469305": ["convolution_gpu_bfyx_gemm_like",1], + "5156033406916344703": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11634932044447867039": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "15767973630744679517": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14826791706471872785": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17638692805430115529": ["convolution_gpu_bfyx_gemm_like",1], + "8507854696766492454": ["convolution_gpu_bfyx_os_iyx_osv16",172], + "11337525286386930242": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "13611054146745413536": ["convolution_gpu_bfyx_gemm_like",0], + "6848989271874647093": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "5381578460674280089": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "13835908664998757647": ["fully_connected_gpu_fb_oi_ref",2], + "15118142492742177336": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "14671212883301405408": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16463823433924519300": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "5381354625969068789": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7880845322716481548": 
["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14767888121198814523": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "7113777272518482528": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "7075659071934895087": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "14054116974002669018": ["convolution_gpu_bfyx_1x1",2], + "8951040603784899163": ["convolution_gpu_bfyx_os_iyx_osv16",1100], + "13357431438267043322": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "603883331897298932": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "8153567933591966877": ["convolution_gpu_bfyx_gemm_like",1], + "8329846097322076175": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "9216608098626790565": ["convolution_gpu_bfyx_os_iyx_osv16",395], + "3750338655074082587": ["fully_connected_gpu_yxfb_ref",1], + "16626226341188424071": ["convolution_gpu_bfyx_os_iyx_osv16",543], + "7349880498513046830": ["convolution_gpu_bfyx_1x1",1], + "8560635685184432720": ["convolution_gpu_bfyx_direct_10_12_16",2], + "425744529089575241": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "17264608538692763688": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17201365233492366678": ["convolution_gpu_bfyx_gemm_like",2], + "17228810554159747400": ["convolution_gpu_bfyx_gemm_like",2], + "13251091004269229867": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13820498543284008286": ["convolution_gpu_bfyx_gemm_like",2], + "6760797535531423152": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "13403161389559730": ["convolution_gpu_bfyx_gemm_like",2], + "8140094412609934765": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "16000753982895054944": ["convolution_gpu_bfyx_gemm_like",1], + "7164580481046523192": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "16108573960501496757": ["convolution_gpu_bfyx_gemm_like",2], + "15349944413643626251": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "16768797136991242472": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "13558656230312558247": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "11077876432364512822": ["fully_connected_gpu_bf_io_input_spatial",0], + "3244675355773468991": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "5495776091407365966": ["convolution_gpu_bfyx_gemm_like",2], + "4313392430539923574": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "10532183096485321729": ["convolution_gpu_bfyx_1x1",2], + "2582625260054352916": ["convolution_gpu_bfyx_gemm_like",2], + "3725013268198063198": ["convolution_gpu_bfyx_1x1",2], + "16294825599850364701": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "3438296636411972401": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7585184325339753737": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "16818206615424635387": ["convolution_gpu_bfyx_1x1",1], + "15779837958180258409": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "4347816192417741558": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "5539793555189956907": ["convolution_gpu_bfyx_os_iyx_osv16",861], + "10499265278415026816": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6423785822515265784": ["convolution_gpu_bfyx_gemm_like",2], + "11031569203645035546": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "5735608687257018419": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "2355214244972870639": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "5497751772699578150": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10753540518493641553": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "15485701086886851362": ["convolution_gpu_bfyx_os_iyx_osv16",171], + "13328449155966085543": ["convolution_gpu_bfyx_gemm_like",2], + "2438261005924916746": 
["convolution_gpu_bfyx_os_iyx_osv16",696], + "16765994345605657100": ["convolution_gpu_bfyx_1x1",2], + "296142385116663420": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "18122858611264877646": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5419775002149092646": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "16117448559783537844": ["convolution_gpu_bfyx_os_iyx_osv16",726], + "3105425187506203551": ["convolution_gpu_bfyx_1x1",2], + "73865742350616903": ["convolution_gpu_bfyx_gemm_like",1], + "3950738240651133849": ["convolution_gpu_bfyx_os_iyx_osv16",736], + "2040762223425679479": ["fully_connected_gpu_bf_io_input_spatial",0], + "7688176479120305539": ["convolution_gpu_bfyx_os_iyx_osv16",918], + "12667014405537239093": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "10783981060353445280": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "5240706676373148280": ["convolution_gpu_bfyx_gemm_like",0], + "1572991986657256775": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "15530407024531326375": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "11632275875447013409": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8275277322582733101": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "2465684728484709259": ["convolution_gpu_bfyx_1x1",1], + "13761566845514364807": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "5060012838564094182": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "2575631797904040925": ["convolution_gpu_bfyx_os_iyx_osv16",641], + "2294800960010879540": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "14312549767853703411": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "5097818987523855112": ["convolution_gpu_bfyx_gemm_like",2], + "13893808009363736870": ["convolution_gpu_bfyx_gemm_like",1], + "17238880534517721334": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "18302892230881285207": ["convolution_gpu_bfyx_gemm_like",0], + "14959566236432790882": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "18416908414174464784": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "3337625924046561031": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "1040650352205493707": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "6334639534663495263": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "13448845356783404653": ["convolution_gpu_bfyx_gemm_like",1], + "14289082888174784976": ["convolution_gpu_bfyx_gemm_like",0], + "15412447128995361859": ["convolution_gpu_bfyx_gemm_like",1], + "10437367877444543776": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "5120466856097219243": ["convolution_gpu_bfyx_gemm_like",2], + "7531346828150129063": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "14621327324047759584": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "13654816209891478730": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "17224181038411430675": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "7732899312577293959": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "11077503608116183709": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "8528750110601691390": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7712831597869354170": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13121297281694293907": ["convolution_gpu_bfyx_os_iyx_osv16",553], + "12672995204641007004": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11428599290755097395": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "9726913113016874092": ["convolution_gpu_bfyx_gemm_like",2], + "1452597292381229708": ["convolution_gpu_winograd_6x3_s1_fused",2], + "1089944493540593798": ["convolution_gpu_bfyx_os_iyx_osv16",1091], + "16264774056719724826": ["convolution_gpu_bfyx_os_iyx_osv16",503], + 
"10635659193402005820": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "6222595759158615206": ["convolution_gpu_bfyx_direct_10_12_16",0], + "13503555814874045782": ["convolution_gpu_bfyx_os_iyx_osv16",552], + "2618108630886857741": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5498839261395459224": ["convolution_gpu_bfyx_gemm_like",1], + "12831123539633580270": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "15417738436777481469": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "1345101751956733589": ["convolution_gpu_bfyx_gemm_like",2], + "15628121900226431719": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "17392594284473856393": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16616945998593626851": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "13621339501067135142": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "16561224775421968533": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "2305461098719675735": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "7875272450497189442": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "15890473622821659630": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "584086621952390547": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "517997325935712670": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "1882052795393187384": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "5589350202160007768": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "13754408679115174221": ["convolution_gpu_bfyx_gemm_like",2], + "2525260242689556544": ["convolution_gpu_bfyx_gemm_like",2], + "13447028922679236865": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "6131481289104111211": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "5965451243366505522": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10435566004514173951": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "14248239982355212178": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "3012566432840424198": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "8881135571874888085": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "11265079350845539239": ["convolution_gpu_bfyx_gemm_like",0], + "7368916076070115064": ["convolution_gpu_bfyx_os_iyx_osv16",995], + "6931953332823066530": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "4801117903303888658": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "5680236635030250712": ["convolution_gpu_bfyx_1x1",2], + "6830387121684699972": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "1541754036637209097": ["convolution_gpu_bfyx_gemm_like",2], + "14046114605615338907": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "4840004190985490064": ["convolution_gpu_bfyx_gemm_like",2], + "1470933384474984858": ["convolution_gpu_bfyx_1x1",1], + "2649192407401044065": ["convolution_gpu_bfyx_gemm_like",1], + "10405183426600618231": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "15856268902838573812": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "2114232149447438823": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "4750894407873652809": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "6403698142681887543": ["convolution_gpu_bfyx_gemm_like",0], + "14746359019867963124": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5352861363832390974": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "15770767768674603174": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "10447947790216991304": ["convolution_gpu_bfyx_gemm_like",2], + "9763310312421884308": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "9065894438656900887": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "8769060267707904998": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16290626406346691996": 
["convolution_gpu_bfyx_os_iyx_osv16",241], + "4865023158176874622": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15470013032930986062": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "16818714747882774917": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "2451712485584835395": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "3255465741612432300": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "11814740669468421049": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "12011982029561277581": ["convolution_gpu_bfyx_os_iyx_osv16",661], + "9429695343610239088": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "9780938731831129283": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "15497797842820949408": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15831600396403741571": ["convolution_gpu_bfyx_gemm_like",1], + "3737576893817599311": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "13131740479277027362": ["fully_connected_gpu_bf_io_gemm",2], + "6328802691680458752": ["convolution_gpu_bfyx_os_iyx_osv16",650], + "13951717514084457087": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "10316451248440741901": ["convolution_gpu_bfyx_gemm_like",2], + "18221867262301937903": ["convolution_gpu_bfyx_1x1",1], + "4610200388191607540": ["convolution_gpu_bfyx_os_iyx_osv16",609], + "8057302050645780813": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "5893940382830835820": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "949330876419581703": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10522649794540845800": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "2226745622763268469": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "4006884370026272807": ["convolution_gpu_bfyx_gemm_like",1], + "17046662043776372746": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "4747159205186229582": ["convolution_gpu_bfyx_os_iyx_osv16",851], + "276407276027553756": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "8594644182487917002": ["convolution_gpu_winograd_6x3_s1_fused",2], + "4958222070605478947": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "16683089431066989909": ["convolution_gpu_bfyx_gemm_like",2], + "9585113116232600562": ["convolution_gpu_bfyx_gemm_like",2], + "17442105631503326136": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "17025324057045572535": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4021558014531645922": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "6719302427415173754": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "16610284927818475574": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "14630499010941056793": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4793007249026943006": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "18077281411861416889": ["convolution_gpu_bfyx_os_iyx_osv16",667], + "5406129421969383274": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "17006095064160484022": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "10792503079194374004": ["convolution_gpu_bfyx_os_iyx_osv16",559], + "423221712829930726": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "16547425454653232058": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "11443268857010762276": ["convolution_gpu_bfyx_os_iyx_osv16",870], + "15727611564408173858": ["convolution_gpu_bfyx_gemm_like",2], + "3273748387141431306": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "10869005786136023160": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "7370273921473161914": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "5135539474649575477": ["convolution_gpu_bfyx_os_iyx_osv16",1068], + "12394049027081208902": ["convolution_gpu_bfyx_gemm_like",1], + "10178145641713631806": 
["convolution_gpu_bfyx_os_iyx_osv16",375], + "12741457056869452536": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "17596685300497748803": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "9606639214735570069": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "1920042803083729276": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "11215217005872946038": ["convolution_gpu_bfyx_os_iyx_osv16",41], + "593712935037568960": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "2597523728660247862": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "16238415425814188039": ["convolution_gpu_bfyx_os_iyx_osv16",486], + "14912119584313592912": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "5594180958505308003": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "8656468860180713379": ["convolution_gpu_bfyx_os_iyx_osv16",483], + "9131235538209388787": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "12207503176295152756": ["convolution_gpu_bfyx_1x1",2], + "7969848911698660033": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "17788367809717898285": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2683304757433993300": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "11104393974242049153": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "18417288692814472127": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "13676654389512816868": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8479958930889587809": ["fully_connected_gpu_yxfb_ref",0], + "7457899998356343871": ["convolution_gpu_bfyx_os_iyx_osv16",571], + "8712136292276123857": ["convolution_gpu_bfyx_os_iyx_osv16",600], + "3800011935243649447": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "10151922632636937118": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6204725118764552662": ["convolution_gpu_bfyx_gemm_like",1], + "16758962840329202004": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "6651389480007764007": ["convolution_gpu_bfyx_os_iyx_osv16",109], + "16710651492402564794": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "135072053401934228": ["convolution_gpu_bfyx_1x1",0], + "2669822154816760632": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "15154700439767512396": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "10280619408766255552": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "7026575758396092435": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "17947818179123182001": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "3603187029740446600": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "17037462814585846902": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "1138439260035360722": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "7843508201826629532": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "1095495157025479260": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "18446245971488003004": ["convolution_gpu_bfyx_os_iyx_osv16",572], + "541817615957967731": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "9660812093766156608": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "17050675313067213312": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "2172121470071868949": ["convolution_gpu_bfyx_gemm_like",2], + "11845189428639322474": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "4156384238797998294": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "12601126285773042005": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "10626341369865893888": ["convolution_gpu_bfyx_gemm_like",2], + "9545968464906009869": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "8965747921518186477": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "15860915170591763391": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "1351033666248868977": 
["convolution_gpu_bfyx_os_iyx_osv16",335], + "12793908914872030220": ["convolution_gpu_bfyx_gemm_like",2], + "3788462090984291082": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "17370051888730874220": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "11583985978586657985": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "13680926356824317761": ["convolution_gpu_bfyx_os_iyx_osv16",51], + "10308175009371219583": ["convolution_gpu_bfyx_os_iyx_osv16",620], + "15989894214714907271": ["convolution_gpu_bfyx_os_iyx_osv16",150], + "16431165572426232677": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "9259437778054905599": ["convolution_gpu_bfyx_os_iyx_osv16",609], + "2373860353284525265": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "6192955702438301372": ["convolution_gpu_bfyx_os_iyx_osv16",1023], + "8260130048649729185": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "8790625191540101806": ["convolution_gpu_bfyx_gemm_like",1], + "5219399418946822456": ["convolution_gpu_bfyx_gemm_like",2], + "951747146164097188": ["convolution_gpu_bfyx_1x1",2], + "15277856047844308598": ["convolution_gpu_bfyx_gemm_like",2], + "10771803503544737080": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "14502856487639608696": ["convolution_gpu_bfyx_gemm_like",2], + "13810995219720233595": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "12112853999307505628": ["convolution_gpu_bfyx_gemm_like",2], + "7243917162812988891": ["convolution_gpu_bfyx_gemm_like",2], + "12564687330941036772": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "12024143207855886580": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "18142462471803295391": ["convolution_gpu_bfyx_1x1",1], + "15799159401545270696": ["convolution_gpu_bfyx_direct_10_12_16",1], + "18186615266760475767": ["convolution_gpu_bfyx_os_iyx_osv16",650], + "13856271274572142709": ["convolution_gpu_bfyx_gemm_like",1], + "10979362792894404338": ["convolution_gpu_bfyx_gemm_like",2], + "7727001441358508665": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "6146876760962332928": ["convolution_gpu_bfyx_gemm_like",2], + "13418701036204748812": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "5843679089588930933": ["convolution_gpu_bfyx_os_iyx_osv16",900], + "2552187713769926425": ["convolution_gpu_bfyx_os_iyx_osv16",456], + "4056971751486746551": ["convolution_gpu_bfyx_os_iyx_osv16",700], + "8141428150264829362": ["convolution_gpu_bfyx_os_iyx_osv16",1001], + "7807983899017500046": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "13970935346154374605": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "1425953627379976115": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1383899865465106141": ["convolution_gpu_bfyx_gemm_like",1], + "18094205332383644037": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "5040730152867713388": ["convolution_gpu_bfyx_gemm_like",1], + "288853243482418538": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "5738835498104275267": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "9274179337770060652": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "14681717813022425567": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "1071007164550012186": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "8761283252495354972": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "383721620126444793": ["convolution_gpu_bfyx_gemm_like",2], + "9076758673133996959": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "3796274347773622633": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "2627779045483019709": ["convolution_gpu_bfyx_os_iyx_osv16",92], + "3727142736386026852": ["convolution_gpu_bfyx_os_iyx_osv16",1044], + "14999920879568237166": 
["convolution_gpu_bfyx_1x1",2], + "84595904778810418": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "11239754372812258455": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "11599932445375240727": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "16383540667048742064": ["convolution_gpu_bfyx_gemm_like",2], + "15329680728165965773": ["convolution_gpu_bfyx_os_iyx_osv16",1092], + "7548031489690889629": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "8168240543278779314": ["convolution_gpu_bfyx_1x1",1], + "10483664832302187567": ["convolution_gpu_bfyx_os_iyx_osv16",7], + "2572395498687401679": ["convolution_gpu_bfyx_os_iyx_osv16",715], + "12976499206227689731": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "11857037689248685487": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "10196332102593337214": ["convolution_gpu_bfyx_gemm_like",1], + "11919129623429545762": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "9111988592015450418": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "11820789223587555410": ["convolution_gpu_bfyx_1x1",2], + "7669403041163460089": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "9728611486592854529": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17764033613416389758": ["convolution_gpu_bfyx_gemm_like",2], + "6139574161497189424": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13926122593957480821": ["convolution_gpu_winograd_6x3_s1_fused",2], + "5568753513029409478": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "994182747184593564": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9048522050692986204": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "7852745450437172519": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "6071668124835539929": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6557428245898292304": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "16681690088928624738": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "14472187692485966933": ["convolution_gpu_bfyx_os_iyx_osv16",854], + "4126895998426674411": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "17281826959243966826": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "15548971488532746290": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12900949103593247293": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "338716975932676215": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "4523064418696274869": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "8913950860101596091": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "12557015880639217508": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "17208186152576814861": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "4708035980731751007": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "16911450336605071390": ["convolution_gpu_bfyx_1x1",2], + "8860815977851486767": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "16711955423531846725": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "16992405636352406660": ["convolution_gpu_bfyx_gemm_like",0], + "3438116423688595487": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "9437794960375526230": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "6997971129340865650": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "16312223896859176991": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "8108933468437926367": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9700808806849459216": ["convolution_gpu_bfyx_1x1",2], + "4804533178560338520": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "11744368351982723504": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15675903059949404837": ["convolution_gpu_bfyx_1x1",1], + "13477416097954638887": ["fully_connected_gpu_bf_io_gemm",2], + 
"3150231129728961455": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "7958459862276998225": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "2477849395789783501": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "1778345646142852816": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "13898821685774165645": ["convolution_gpu_bfyx_os_iyx_osv16",172], + "5834245904292669645": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "7469127846325904854": ["convolution_gpu_bfyx_os_iyx_osv16",600], + "8961138963663532667": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "12319073009094248232": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "16566214123371867456": ["convolution_gpu_bfyx_gemm_like",2], + "17104611871050967957": ["convolution_gpu_winograd_6x3_s1_fused",2], + "17830290099875088207": ["convolution_gpu_bfyx_gemm_like",2], + "14147460733160099960": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16218339663410630711": ["convolution_gpu_bfyx_gemm_like",2], + "2171768477223405739": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "9091110033424983286": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "7715649642603303319": ["convolution_gpu_bfyx_1x1",1], + "18271689282126907793": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "12590922530749026871": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "3779229442395464456": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12040626513219974957": ["convolution_gpu_bfyx_os_iyx_osv16",901], + "13830605041347009953": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "1622880009460832832": ["convolution_gpu_bfyx_os_iyx_osv16",301], + "14206076551739831333": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10861769381993948050": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "6870942166356599956": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "5876880412336151866": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "8931169575495985034": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "956022649859563080": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "4701832665603867798": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "6642767323474835034": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "4325081100430903742": ["convolution_gpu_bfyx_gemm_like",2], + "9423958333298993923": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "11657946392097042544": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "16849652692746541462": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "9043982883185435219": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "8270591002934311024": ["convolution_gpu_bfyx_1x1",2], + "10546430708947911124": ["convolution_gpu_bfyx_gemm_like",2], + "9488453013746383896": ["convolution_gpu_bfyx_gemm_like",2], + "18255227391100087860": ["convolution_gpu_bfyx_1x1",2], + "488298169768725160": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "10415046594066474634": ["convolution_gpu_bfyx_os_iyx_osv16",1100], + "10864011008000364415": ["convolution_gpu_bfyx_1x1",2], + "4196367396954155354": ["convolution_gpu_bfyx_gemm_like",0], + "3752171257634205726": ["convolution_gpu_bfyx_os_iyx_osv16",19], + "1518270620354036926": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "17704040183891532914": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "9407646138658641974": ["convolution_gpu_bfyx_gemm_like",2], + "16103943009195163681": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "8990561333549136048": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "17024388383581997032": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "11277866878590984477": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "18062849937960759210": ["convolution_gpu_bfyx_os_iyx_osv16",352], + 
"8860443174052454332": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6293403765897901528": ["convolution_gpu_bfyx_gemm_like",2], + "11690533591656807605": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "3432296808755992670": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "12523676912856063091": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "1451466106918423837": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "3217246278485567748": ["convolution_gpu_bfyx_gemm_like",2], + "17123463568694499533": ["convolution_gpu_bfyx_gemm_like",2], + "4304041922043496030": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "15739274921308457528": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "7460672405409009037": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "18357544235608006954": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "13328911884191551889": ["convolution_gpu_bfyx_1x1",2], + "13237050834496100264": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "11828175723996627443": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "142329025839464842": ["convolution_gpu_bfyx_1x1",2], + "17252449599613270108": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "9193880745263317167": ["convolution_gpu_bfyx_gemm_like",2], + "12892693137085610062": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15968821946892330559": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "16589607587365212240": ["convolution_gpu_bfyx_gemm_like",1], + "16293465561256937726": ["convolution_gpu_bfyx_os_iyx_osv16",664], + "8651641584737798174": ["convolution_gpu_bfyx_os_iyx_osv16",518], + "16347412180100581330": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "9692654253261175490": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "13163146272900339330": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "14068780861332616363": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "18432787283148809023": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "8519354640245415816": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13410850301164057911": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "11529876081402974396": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "11626398907755088688": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13434576226708227155": ["convolution_gpu_bfyx_os_iyx_osv16",7], + "18140951659547259039": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "12864558900883069118": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "2527189070714658176": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "794499287296495726": ["convolution_gpu_bfyx_1x1",2], + "15720507574336564201": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "16065744898134487748": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "14335074487552883436": ["convolution_gpu_bfyx_gemm_like",2], + "9955939178447682108": ["convolution_gpu_bfyx_1x1",0], + "5865480930796299143": ["convolution_gpu_bfyx_os_iyx_osv16",174], + "10690972785852373520": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "15548847099740441551": ["convolution_gpu_bfyx_1x1",2], + "16988275131627316108": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "3780320160034246719": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "3109104171383198425": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12626014184575881530": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5519535335798045279": ["convolution_gpu_bfyx_gemm_like",2], + "9028970753877215614": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "4571404165794634411": ["convolution_gpu_bfyx_1x1",2], + "7868973874302246233": ["convolution_gpu_bfyx_gemm_like",1], + "6048964584602891448": ["convolution_gpu_bfyx_os_iyx_osv16",321], + 
"12071914115316550349": ["convolution_gpu_bfyx_os_iyx_osv16",726], + "5592556538784745960": ["convolution_gpu_bfyx_gemm_like",1], + "10930115765550856328": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "12408889192918919210": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "14841539539334726292": ["convolution_gpu_bfyx_os_iyx_osv16",171], + "11060822686394981344": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "13786357802945430475": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "244921290040927639": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "12174571114411168588": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "1604661321386793876": ["convolution_gpu_winograd_6x3_s1_fused",2], + "15757308772667178999": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5840254078917931433": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "13852065717057446998": ["convolution_gpu_bfyx_gemm_like",2], + "1559798212423183813": ["convolution_gpu_bfyx_os_iyx_osv16",552], + "7757331094141318304": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "8124736388338424498": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "1152691534728260611": ["convolution_gpu_bfyx_1x1",1], + "11636129433022017868": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "6343396486660315308": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10912495395422146386": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "11324651029379152442": ["convolution_gpu_bfyx_1x1",2], + "16871004845988227014": ["convolution_gpu_bfyx_1x1",2], + "11241838709529552265": ["convolution_gpu_bfyx_gemm_like",2], + "4819131094439732065": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "3215659303601163167": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "10046663998164493552": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "87031578643428011": ["convolution_gpu_bfyx_1x1",2], + "6114241186364821679": ["convolution_gpu_bfyx_gemm_like",2], + "8127190765748950828": ["convolution_gpu_bfyx_os_iyx_osv16",1021], + "5658664813683907476": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "13327653786981478088": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "5740745357953479527": ["convolution_gpu_bfyx_os_iyx_osv16",750], + "11446745541571732900": ["convolution_gpu_winograd_6x3_s1_fused",2], + "7683334381958571864": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "7940369586324090841": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "10432365444137108781": ["convolution_gpu_bfyx_os_iyx_osv16",1070], + "3033264172690274208": ["convolution_gpu_bfyx_os_iyx_osv16",102], + "3715177305271762194": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "10670103699537731664": ["convolution_gpu_bfyx_os_iyx_osv16",869], + "182115051096556835": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "2344498602308448450": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "11706378390483804857": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "12068974703657294908": ["convolution_gpu_bfyx_1x1",2], + "14811022197918391667": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "14268594692585922659": ["convolution_gpu_bfyx_os_iyx_osv16",609], + "745009493367761775": ["convolution_gpu_bfyx_gemm_like",2], + "2458592904274981909": ["fully_connected_gpu_bf_io_input_spatial",2], + "530491406341772040": ["convolution_gpu_bfyx_gemm_like",2], + "15156525717629023944": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "1398177377739338750": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "18005721959893562716": ["convolution_gpu_bfyx_os_iyx_osv16",547], + "15426960908024585800": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "15824189967727245909": 
["convolution_gpu_bfyx_gemm_like",2], + "10967218651864700933": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "9580986168276580598": ["convolution_gpu_bfyx_gemm_like",2], + "9891428775774615719": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "1336940384521633733": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "15490478608105402679": ["convolution_gpu_bfyx_gemm_like",1], + "16467987800266816984": ["convolution_gpu_bfyx_os_iyx_osv16",968], + "10141927023849730720": ["convolution_gpu_bfyx_1x1",1], + "9454512817077883797": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "10436819182310112786": ["convolution_gpu_bfyx_os_iyx_osv16",19], + "15410074937424854348": ["convolution_gpu_bfyx_os_iyx_osv16",91], + "13472577372534605883": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12068797674575015662": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "778476198101178556": ["convolution_gpu_bfyx_gemm_like",1], + "2968439898708528834": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "12517838703662330663": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "8071957466247137919": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11267742746905371769": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "3463959257726925426": ["convolution_gpu_bfyx_os_iyx_osv16",995], + "153117141968471446": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4862529593282936100": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "11999246609107242706": ["convolution_gpu_bfyx_gemm_like",2], + "10650698451740924172": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "9148379585489720669": ["convolution_gpu_bfyx_os_iyx_osv16",477], + "4958835037528182801": ["convolution_gpu_bfyx_1x1",2], + "7780140599533242850": ["convolution_gpu_bfyx_direct_10_12_16",0], + "8458082326743351141": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5948701218437980356": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "6981537186704688907": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "10808909442136736629": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "7242013296950669829": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "9069334144391048686": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "1096671695414716274": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "438528596970898721": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "14974730512607138726": ["convolution_gpu_bfyx_os_iyx_osv16",233], + "14091610802555875119": ["convolution_gpu_bfyx_gemm_like",2], + "6522575549211855712": ["convolution_gpu_bfyx_gemm_like",2], + "4141005390823981166": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "6548949901446632697": ["convolution_gpu_bfyx_1x1",2], + "11465965972527519631": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3032921857841371728": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "3219408878901707426": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5308128387928804050": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "10412588668458621135": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "16915857558806082023": ["convolution_gpu_bfyx_os_iyx_osv16",612], + "11292995457386147494": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "15702382940521972117": ["convolution_gpu_bfyx_os_iyx_osv16",1001], + "1089679781525023551": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "3349519148124496343": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "13738442755456366277": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "14079654309452583394": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3985659568982275663": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "8501145642605270365": ["convolution_gpu_bfyx_gemm_like",2], + 
"12773693193167844110": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "5039037192630609823": ["convolution_gpu_bfyx_gemm_like",2], + "13804221028705631415": ["convolution_gpu_bfyx_gemm_like",0], + "2832268621630415376": ["convolution_gpu_bfyx_gemm_like",0], + "9525853014023664813": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "7179714714302073459": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "1081962464388501987": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7009459929666511861": ["convolution_gpu_bfyx_1x1",1], + "15796677813117622429": ["convolution_gpu_bfyx_gemm_like",2], + "7208008921815475393": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13654895364175354091": ["convolution_gpu_bfyx_1x1",2], + "6143200133853000387": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "8792010676469476740": ["convolution_gpu_bfyx_gemm_like",1], + "5393510569127725391": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "6217542346826403576": ["convolution_gpu_bfyx_1x1",2], + "12380856644683171627": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "14571022040013651253": ["convolution_gpu_bfyx_gemm_like",1], + "15778834188130183853": ["convolution_gpu_bfyx_os_iyx_osv16",172], + "1458615259705605525": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "7770000755097925765": ["convolution_gpu_bfyx_1x1",2], + "11626402549863483301": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "3895088069642140043": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7354234812009979811": ["convolution_gpu_bfyx_os_iyx_osv16",476], + "16865879032845300007": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "9056038338958199256": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "16574710115918192418": ["convolution_gpu_bfyx_os_iyx_osv16",643], + "4664983769199548480": ["convolution_gpu_bfyx_1x1",2], + "13059207969254830451": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "318377908569897093": ["convolution_gpu_bfyx_gemm_like",2], + "8619526128410675593": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "11311859068168414878": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "8264178890341675354": ["convolution_gpu_bfyx_os_iyx_osv16",625], + "18084635102736402756": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "17209528805596238905": ["convolution_gpu_bfyx_gemm_like",2], + "11583017348580874022": ["convolution_gpu_bfyx_os_iyx_osv16",149], + "9542325095876448686": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15529757761327002288": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "18232278892738147217": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "14885109535362957947": ["convolution_gpu_bfyx_os_iyx_osv16",592], + "2495655464941634884": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "3067806959725855130": ["convolution_gpu_bfyx_os_iyx_osv16",137], + "15997754881872769378": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "2052712465925238009": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "3926585856863002495": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "3106922888635965020": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6290317420155851465": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "10782611933832492335": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "1116274074896622552": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "6634330132674952638": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "142650579335909103": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "13809898858049445969": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13320828013530046693": ["convolution_gpu_bfyx_os_iyx_osv16",420], + "15293727142789007900": ["convolution_gpu_bfyx_os_iyx_osv16",325], + 
"1774158624592967937": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13723543003759101485": ["convolution_gpu_bfyx_gemm_like",2], + "4479117540570599742": ["convolution_gpu_bfyx_gemm_like",2], + "12604104383683210104": ["convolution_gpu_bfyx_gemm_like",1], + "17802514063213000148": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "4800587664660105589": ["fully_connected_gpu_bf_io_input_spatial",2], + "7966454753124154534": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "3199841714087553410": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "1474271081523145413": ["convolution_gpu_bfyx_gemm_like",2], + "16243196137456624852": ["convolution_gpu_bfyx_gemm_like",2], + "3746573775462003750": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "12730339458081890990": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "16677044352793659175": ["convolution_gpu_bfyx_gemm_like",2], + "14001406016806064079": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "3167336012388169649": ["convolution_gpu_bfyx_os_iyx_osv16",150], + "15486917753097743853": ["convolution_gpu_bfyx_1x1",0], + "4381329435655511217": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "6996376303337512293": ["convolution_gpu_bfyx_os_iyx_osv16",45], + "17564338309805484464": ["convolution_gpu_bfyx_os_iyx_osv16",149], + "10883992248631603006": ["convolution_gpu_bfyx_os_iyx_osv16",861], + "15602218079503030465": ["convolution_gpu_bfyx_gemm_like",2], + "16432425079146486467": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "5374664689223295796": ["convolution_gpu_bfyx_os_iyx_osv16",1124], + "7977195117668583981": ["convolution_gpu_bfyx_gemm_like",2], + "13951781924205611716": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "5955575949957198434": ["convolution_gpu_bfyx_gemm_like",0], + "1532263118203058517": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "5459463503840817402": ["convolution_gpu_bfyx_1x1",2], + "3568514382399560386": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "6340128090694375876": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "16361932270527364507": ["convolution_gpu_bfyx_os_iyx_osv16",723], + "16542318967217020315": ["convolution_gpu_bfyx_gemm_like",2], + "5648658688155716974": ["convolution_gpu_bfyx_1x1",2], + "11872464450773754851": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "14799579913711096584": ["convolution_gpu_bfyx_gemm_like",2], + "14381420852659789698": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13816104794723484993": ["convolution_gpu_winograd_6x3_s1_fused",2], + "1643122514049603104": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12058759356433220258": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "14910223536998380801": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "8984436655107983227": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "3792945601873900927": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "11872943152839631823": ["convolution_gpu_bfyx_os_iyx_osv16",264], + "13468713306678453952": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10894058425957901202": ["convolution_gpu_bfyx_1x1",1], + "3723613341885592267": ["convolution_gpu_bfyx_os_iyx_osv16",6], + "15160738482264643601": ["convolution_gpu_bfyx_os_iyx_osv16",716], + "8803037667261582905": ["convolution_gpu_bfyx_gemm_like",2], + "8295126647635181949": ["convolution_gpu_bfyx_gemm_like",2], + "16131448347558322280": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "15891662883560480723": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "13308187548669026714": ["convolution_gpu_bfyx_1x1",2], + "8127570953237266335": ["fully_connected_gpu_bf_io_input_spatial",0], + 
"4456004887590847716": ["convolution_gpu_bfyx_gemm_like",2], + "12397280593466519809": ["convolution_gpu_bfyx_os_iyx_osv16",715], + "16717713360264747483": ["convolution_gpu_bfyx_gemm_like",2], + "10766317990628501609": ["convolution_gpu_bfyx_os_iyx_osv16",153], + "11841034668170849494": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "14274685812676150168": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "789359733867650915": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "7430073011895298582": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "7881187047171099732": ["convolution_gpu_bfyx_gemm_like",2], + "2929190644951986399": ["convolution_gpu_bfyx_os_iyx_osv16",664], + "386749666417295495": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "2797723586312707948": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "4112696777811320312": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "14026570177552137240": ["convolution_gpu_bfyx_os_iyx_osv16",1055], + "16236397968499692493": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "12177387334053203378": ["convolution_gpu_bfyx_gemm_like",2], + "8176012042686275874": ["convolution_gpu_bfyx_os_iyx_osv16",116], + "18043340998699622388": ["convolution_gpu_bfyx_gemm_like",2], + "10309504812060596568": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "11356842300444410831": ["convolution_gpu_bfyx_os_iyx_osv16",41], + "10730222715353420212": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "16949056117405140365": ["convolution_gpu_bfyx_os_iyx_osv16",695], + "1028160614515220430": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "7638626850074132214": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "5184121466994451498": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "15363606233048272809": ["convolution_gpu_bfyx_1x1",2], + "7647236080048602591": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "10681768474583067517": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "6370189612027110022": ["convolution_gpu_bfyx_gemm_like",2], + "12055647521556218046": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "17006655627343469372": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "16348402367953880206": ["convolution_gpu_bfyx_os_iyx_osv16",622], + "10049571207493913006": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "6114147683777615071": ["convolution_gpu_bfyx_os_iyx_osv16",950], + "1375156980278317418": ["convolution_gpu_bfyx_gemm_like",2], + "11670430946096342056": ["convolution_gpu_bfyx_os_iyx_osv16",620], + "15399245700982979379": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "9562291747339451180": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "16559140502701231107": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "12198263593657033426": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "4617809377006148936": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "9999425239167488495": ["convolution_gpu_bfyx_gemm_like",2], + "13314092088416047551": ["fully_connected_gpu_yxfb_ref",0], + "13990028451169604107": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15814015810740458605": ["convolution_gpu_bfyx_1x1",2], + "17264010982688979937": ["convolution_gpu_bfyx_1x1",2], + "4149728557142033774": ["convolution_gpu_bfyx_os_iyx_osv16",926], + "6780215829176686721": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "116291934148608396": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "5538883245745495145": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15114370307779942381": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5055568897499186908": ["convolution_gpu_bfyx_gemm_like",0], + "628191607060767879": ["convolution_gpu_bfyx_os_iyx_osv16",204], 
+ "4338023436590582323": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "11716771904412649891": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "3621930417735246405": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "10106454449619141260": ["convolution_gpu_bfyx_1x1",2], + "5003718302026277632": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "6769243149577568817": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "5240181393417899912": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "4766071144928072260": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "14956246091163580499": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "6854611304056079417": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "5749536453225343663": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "1984152634309440563": ["convolution_gpu_bfyx_gemm_like",2], + "290134020607738418": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15065019229949449623": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4014667229872705228": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17791024851737594885": ["convolution_gpu_bfyx_1x1",2], + "5941298590926032148": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "14151747022287993729": ["convolution_gpu_bfyx_gemm_like",0], + "12977678792503377525": ["convolution_gpu_bfyx_gemm_like",0], + "14616969385577243225": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "8096131027165540886": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "348058686961206025": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "8203550467004532364": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "1832310305089212990": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "3286629188347536485": ["fully_connected_gpu_bf_io_input_spatial",1], + "6614833247756539341": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "428659495445490820": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "1653274345637156919": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "14150012830816329527": ["convolution_gpu_bfyx_gemm_like",2], + "16995873636564597028": ["convolution_gpu_bfyx_os_iyx_osv16",484], + "3511588484597779204": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "1122856374602590533": ["convolution_gpu_bfyx_1x1",1], + "16014822406751503249": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "3711525118850629466": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4282661608732125403": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "2930898141522848681": ["convolution_gpu_bfyx_1x1",0], + "8689206546467098603": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "16491532291908469567": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "4474697990228400564": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8483523994859880782": ["convolution_gpu_bfyx_os_iyx_osv16",1000], + "787203599734115483": ["convolution_gpu_bfyx_1x1",2], + "12992061224471212714": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3814584042139408454": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "1202292109713947702": ["convolution_gpu_bfyx_gemm_like",0], + "4773123925616969670": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15466940145773097237": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "15851356529373376076": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "16461809076899645037": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "17546566148752689536": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "3101087806792514129": ["convolution_gpu_bfyx_1x1",2], + "14757749560543979231": ["convolution_gpu_bfyx_os_iyx_osv16",1040], + "17444003685761357480": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6294240435687565243": 
["convolution_gpu_bfyx_os_iyx_osv16",910], + "11988285441493553006": ["convolution_gpu_bfyx_gemm_like",2], + "2915165824085219545": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "16409729623371222748": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "15493488989417521388": ["convolution_gpu_bfyx_os_iyx_osv16",1116], + "4378422094110940766": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12860222041026638681": ["convolution_gpu_bfyx_os_iyx_osv16",660], + "13607830451968188080": ["convolution_gpu_bfyx_os_iyx_osv16",173], + "14385185911482960528": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "12932635875905153141": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "1338705434700924127": ["convolution_gpu_bfyx_1x1",1], + "14045927407431718832": ["convolution_gpu_bfyx_os_iyx_osv16",679], + "4553409514380460123": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "5020788604681810984": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "17647962002015093887": ["convolution_gpu_bfyx_gemm_like",2], + "16436006771518788093": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "938222258370511187": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "6942049339361951275": ["fully_connected_gpu_bf_io_input_spatial",1], + "3797957937905580811": ["convolution_gpu_bfyx_os_iyx_osv16",492], + "7799984350284425885": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "1907439276166837309": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "12098146032672599222": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "6458124573210430792": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "12388375914105990324": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11132679855317294753": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4492332228252010118": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "6181308879301978465": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "6297802534570892679": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "4403753181729432604": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "5762290464889692462": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "1372939511728986224": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "17434429579652310107": ["convolution_gpu_bfyx_gemm_like",2], + "16067605128297748820": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "4085450203909854919": ["convolution_gpu_bfyx_os_iyx_osv16",985], + "12160764253455777655": ["convolution_gpu_bfyx_os_iyx_osv16",360], + "2921118493468368908": ["convolution_gpu_bfyx_gemm_like",0], + "2920322372993101148": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "9195732599757736182": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "14104238386345631681": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12755692101476964677": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "1410630713443793537": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "1838534101161814609": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "17556238490521153146": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "5159470523468873105": ["convolution_gpu_bfyx_os_iyx_osv16",171], + "5485971317082563152": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "1563987925712579649": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12461575861709234385": ["convolution_gpu_bfyx_os_iyx_osv16",714], + "7708321360699824256": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12655099960717366198": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "17882819773586674851": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "13993548620104010490": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4927360358387344983": ["convolution_gpu_bfyx_direct_10_12_16",1], + 
"10971070835319242371": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "8253823502854784432": ["convolution_gpu_bfyx_os_iyx_osv16",265], + "385046297070779752": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "15881381297320383917": ["convolution_gpu_winograd_6x3_s1_fused",2], + "7840966363183459431": ["convolution_gpu_bfyx_os_iyx_osv16",467], + "9305861997313663528": ["convolution_gpu_bfyx_gemm_like",1], + "6205240287062600210": ["convolution_gpu_bfyx_gemm_like",2], + "8500148569566077929": ["convolution_gpu_bfyx_os_iyx_osv16",995], + "5752292348709244393": ["convolution_gpu_bfyx_gemm_like",1], + "6902644989079870993": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "18136765667969393174": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7338229552985076723": ["convolution_gpu_bfyx_gemm_like",2], + "689445825453914111": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "2066731703492755469": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "9988801796928462423": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "17917978116807564183": ["convolution_gpu_bfyx_gemm_like",1], + "3285520504090196295": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "17525531790109748810": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "1359720957005310113": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "14691372262153587653": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "16364494883229084045": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "4980217316169616839": ["convolution_gpu_bfyx_1x1",1], + "6156831095718536092": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "11825293922127550847": ["convolution_gpu_bfyx_gemm_like",2], + "16744011463988595802": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8431759922045602848": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "11421180829679625737": ["convolution_gpu_bfyx_os_iyx_osv16",371], + "10709828018763273371": ["convolution_gpu_bfyx_os_iyx_osv16",110], + "8642107585829380438": ["convolution_gpu_bfyx_gemm_like",0], + "8108843303778211282": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "7650375560336513366": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "5419041493176804960": ["convolution_gpu_bfyx_os_iyx_osv16",246], + "12046017161414846599": ["convolution_gpu_bfyx_1x1",0], + "12054200116003751590": ["convolution_gpu_bfyx_os_iyx_osv16",862], + "16917253324065998643": ["convolution_gpu_bfyx_1x1",1], + "17001502418583498926": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "13933912937625580405": ["fully_connected_gpu_bf_io_input_spatial",2], + "3034482898462686729": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "10892706534058849825": ["convolution_gpu_bfyx_os_iyx_osv16",1033], + "5291011077679733990": ["convolution_gpu_bfyx_gemm_like",2], + "18235209540858013173": ["convolution_gpu_bfyx_1x1",0], + "10887835418423052188": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "12213354854947437262": ["convolution_gpu_bfyx_1x1",2], + "16881283637687482989": ["convolution_gpu_bfyx_os_iyx_osv16",771], + "13191096881934434519": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "10025839973092358719": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "15914512645931208899": ["convolution_gpu_bfyx_gemm_like",2], + "15308578014507211237": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "17179609670678746034": ["convolution_gpu_bfyx_gemm_like",2], + "4197617702037834389": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "17092525789052598917": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "11147816119060617810": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "11682323163346544125": ["convolution_gpu_bfyx_os_iyx_osv16",423], + 
"11198908896401597838": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "10090036431487700311": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "17651821953342321913": ["convolution_gpu_bfyx_1x1",1], + "3179874645565098825": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "16925721317097534009": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "8578747191812631883": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "18400379759523099542": ["convolution_gpu_bfyx_gemm_like",2], + "5758133252959371492": ["convolution_gpu_bfyx_os_iyx_osv16",338], + "12309955719964788034": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "12962558681443556219": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11455843788148231615": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "75742659105146536": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "9485825829394109934": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "14885031472057965707": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12874626654611400042": ["convolution_gpu_bfyx_os_iyx_osv16",622], + "12061567381160185735": ["convolution_gpu_bfyx_1x1",1], + "5093049998173715787": ["convolution_gpu_bfyx_gemm_like",2], + "11834361584875491425": ["convolution_gpu_bfyx_1x1",2], + "3159147743553063163": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "13425251102263428554": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "8971115542951085891": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "5782934278345953016": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "15829095120243431195": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "2581414750854621875": ["convolution_gpu_bfyx_gemm_like",2], + "5141753233513623264": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "2370837049876630969": ["convolution_gpu_bfyx_os_iyx_osv16",1019], + "9439431829175743345": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6578908625437515675": ["convolution_gpu_bfyx_os_iyx_osv16",23], + "8505040075968411726": ["convolution_gpu_bfyx_gemm_like",1], + "17585206779958265260": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "2866656294663853474": ["convolution_gpu_bfyx_1x1",2], + "11862259122805366807": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "10178951466584845110": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "12246408434917478929": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "13478984039708550410": ["convolution_gpu_bfyx_os_iyx_osv16",300], + "17640725195881101275": ["convolution_gpu_bfyx_gemm_like",2], + "8906588133431586825": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "8195881973746570408": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "786401653335542559": ["convolution_gpu_bfyx_os_iyx_osv16",968], + "1082574490068006980": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "12493863403516600413": ["convolution_gpu_bfyx_os_iyx_osv16",650], + "10480527638577674825": ["convolution_gpu_bfyx_1x1",2], + "2710485608298356329": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8788703258318141635": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "11192356850081328892": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "14115742296883450319": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5796500397424307442": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "3281207855459771997": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "14353390922580547467": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "13833960927635646899": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1390379098099686972": ["convolution_gpu_bfyx_1x1",2], + "18251360413872841969": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "6650607472019166205": ["convolution_gpu_bfyx_1x1",2], + 
"4217179485243909459": ["convolution_gpu_bfyx_gemm_like",1], + "6509758095668864050": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "7777333052643961206": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "16245760498096322525": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "13454265023861566476": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "1411786954276574458": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15201438563802430490": ["fully_connected_gpu_fb_oi_ref",1], + "15579919505002150556": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "11666226259183201584": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "4278280309700908015": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "3277243911383750280": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "8390889357546397717": ["convolution_gpu_bfyx_1x1",0], + "4795705973706796563": ["fully_connected_gpu_bf_io_input_spatial",2], + "7130694811424715594": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "13483088320871913126": ["convolution_gpu_bfyx_gemm_like",2], + "3774285301357006334": ["convolution_gpu_bfyx_gemm_like",2], + "10377729875228238588": ["convolution_gpu_bfyx_os_iyx_osv16",196], + "14352796912241296357": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "13512863534076172940": ["convolution_gpu_bfyx_gemm_like",2], + "3782315919331102574": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8159303545761286685": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "13105192484434299621": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "11359409533744011242": ["convolution_gpu_bfyx_gemm_like",2], + "9794456440994218671": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "15065925414996398951": ["convolution_gpu_bfyx_1x1",2], + "40704767167309552": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "5601435819039968726": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12134712464763856064": ["convolution_gpu_winograd_6x3_s1_fused",0], + "11007944497812650617": ["convolution_gpu_bfyx_gemm_like",2], + "2530317332900569142": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "11164519756679631743": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1972879521448306536": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "2770397466252831892": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "16683485007140805060": ["fully_connected_gpu_yxfb_ref",0], + "4848143712599565301": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "5091558853871982858": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "1545105800386716684": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "14258499419905714808": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5643908654122573882": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "13183380647506951324": ["convolution_gpu_bfyx_gemm_like",0], + "16044646335477470657": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12806934028210472719": ["convolution_gpu_bfyx_gemm_like",2], + "14930789530046665855": ["convolution_gpu_bfyx_gemm_like",2], + "5339985303398206057": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "17825280904760131680": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "6142707387281700290": ["convolution_gpu_bfyx_gemm_like",2], + "12707946849050970702": ["convolution_gpu_bfyx_gemm_like",2], + "15197248015210313435": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "16036386660666696362": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "16772854836230971016": ["convolution_gpu_bfyx_os_iyx_osv16",300], + "969746749329671447": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "8058419689646625853": ["convolution_gpu_bfyx_os_iyx_osv16",572], + "10702234389482091891": 
["convolution_gpu_bfyx_os_iyx_osv16",208], + "4849343880559509889": ["convolution_gpu_bfyx_1x1",2], + "10320711719466983961": ["convolution_gpu_bfyx_os_iyx_osv16",610], + "16522364268583242080": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "3336076058264596420": ["convolution_gpu_bfyx_os_iyx_osv16",644], + "12894240573737168362": ["convolution_gpu_bfyx_os_iyx_osv16",111], + "4717620775314557374": ["convolution_gpu_bfyx_gemm_like",2], + "16386955278777720573": ["convolution_gpu_bfyx_os_iyx_osv16",852], + "12278786796362166070": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "2814805887448339818": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "13119479079474639169": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "8739347545059610410": ["convolution_gpu_bfyx_gemm_like",0], + "4237276338897143680": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "12193395770362986433": ["convolution_gpu_bfyx_os_iyx_osv16",1065], + "8101977280003030465": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "2111669705686676421": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "3037042229494600258": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "15295951849706930711": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "7869779894480025247": ["convolution_gpu_bfyx_gemm_like",2], + "17829148383265978140": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "13883044928774243663": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "9366201112659847392": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "6008613375871089139": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "13312514874803986753": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "792684262493086891": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "2161052921317193579": ["convolution_gpu_bfyx_gemm_like",2], + "16567486018945740036": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "9671459469252116568": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "12501619443242354860": ["convolution_gpu_bfyx_gemm_like",2], + "3114869763557037270": ["fully_connected_gpu_fb_oi_ref",2], + "4867937397499803072": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "10784073615329190425": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9795194069954915563": ["convolution_gpu_bfyx_gemm_like",0], + "16683169947375504066": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "10424278617647597641": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "11939914680143672459": ["fully_connected_gpu_fb_oi_ref",2], + "11705756153433897198": ["convolution_gpu_bfyx_1x1",2], + "9101334153142718004": ["convolution_gpu_bfyx_gemm_like",1], + "5211191663202250117": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10554266898346470422": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "2715447739580688669": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "4678607855896512523": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9810904714798127155": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "17353894529222574441": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "2321148334382088982": ["convolution_gpu_bfyx_gemm_like",2], + "9529614587861271730": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "9119618606914671839": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "89439319782574517": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "15271783562528081169": ["convolution_gpu_bfyx_os_iyx_osv16",700], + "9285566577169147378": ["convolution_gpu_bfyx_os_iyx_osv16",554], + "181006047500375768": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "7532088618116521936": ["convolution_gpu_bfyx_os_iyx_osv16",1042], + "3212789693085089063": ["convolution_gpu_bfyx_os_iyx_osv16",341], + 
"16158139166784964096": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12987636957813312667": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "18068050257421269408": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "3833510944499257797": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "12024817951074673335": ["convolution_gpu_bfyx_1x1",2], + "5762878778443755104": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "6263019986730305851": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12871555773123368130": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "2721793280965260548": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "10751536136794650334": ["convolution_gpu_bfyx_gemm_like",2], + "14559308665571750465": ["convolution_gpu_bfyx_gemm_like",2], + "16511393582666965704": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "15669490019428002270": ["convolution_gpu_bfyx_os_iyx_osv16",679], + "5848293219267886434": ["convolution_gpu_bfyx_os_iyx_osv16",121], + "2878824076934639346": ["convolution_gpu_bfyx_os_iyx_osv16",152], + "570683988452622223": ["convolution_gpu_bfyx_os_iyx_osv16",6], + "2783577080556699089": ["convolution_gpu_bfyx_gemm_like",1], + "1500571771538985941": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "6484375582324852109": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "6149673627320838019": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "12218337369633748663": ["convolution_gpu_bfyx_os_iyx_osv16",150], + "13842309033760176194": ["convolution_gpu_bfyx_gemm_like",0], + "1902656726461670148": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "6290584630172122012": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "5047419871737940985": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "3391032227732782982": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "16749148369456398030": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "18313088176414428990": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8837721075413149240": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15082818876354718849": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "11490143853656040028": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "597650904461183283": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "10900880512948479338": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5558136691773431495": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "17977676737774695825": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "17446505012657609153": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "14711697456265712456": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "3653156933813711765": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "8526484907799590618": ["convolution_gpu_bfyx_os_iyx_osv16",871], + "17739868787095417856": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "381149736509958403": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "11318913630213187720": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "8268533335852735248": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "5504757952698692953": ["convolution_gpu_bfyx_os_iyx_osv16",288], + "1367483816197881270": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "14544219140091420262": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3141773224039276177": ["convolution_gpu_bfyx_1x1",0], + "17921973525603585874": ["convolution_gpu_bfyx_gemm_like",0], + "2324120381399737261": ["convolution_gpu_bfyx_os_iyx_osv16",150], + "9622546530872848323": ["convolution_gpu_bfyx_os_iyx_osv16",338], + "2245166025103475783": ["convolution_gpu_bfyx_os_iyx_osv16",250], + "797387385159110695": ["convolution_gpu_bfyx_gemm_like",1], + "15962533525948221648": 
["convolution_gpu_bfyx_os_iyx_osv16",532], + "17035903590837750750": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "13488495920546871271": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "10785966734346479177": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "12179581684777023804": ["convolution_gpu_bfyx_gemm_like",1], + "16611452077660879545": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "15471470494305051299": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "4264284648458489052": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "13550435052563656432": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "7877332346656934022": ["convolution_gpu_bfyx_os_iyx_osv16",678], + "15897477855246170861": ["convolution_gpu_bfyx_gemm_like",2], + "3217574161785059951": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "17094948685292534952": ["convolution_gpu_bfyx_os_iyx_osv16",182], + "2149582237161177965": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "13071545223094862275": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "14487842225000203929": ["convolution_gpu_bfyx_gemm_like",2], + "490233152678323691": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "631489011812924153": ["convolution_gpu_bfyx_1x1",2], + "12052207771201936228": ["convolution_gpu_bfyx_gemm_like",2], + "4366168099274266975": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "6213386558868267629": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "7272538316511343863": ["convolution_gpu_bfyx_gemm_like",2], + "15859493313686060349": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "18337160891834020517": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "6133592828563353516": ["convolution_gpu_bfyx_os_iyx_osv16",306], + "7465681710653503161": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "17108987360340581555": ["fully_connected_gpu_bf_io_input_spatial",5], + "9120377367517042357": ["convolution_gpu_bfyx_1x1",2], + "13734043898517059207": ["convolution_gpu_bfyx_gemm_like",2], + "11185156002426041243": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "9152451371616153112": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17490188677223978661": ["convolution_gpu_bfyx_gemm_like",1], + "4492673409319122180": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "12693511427898130707": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "9524303276541517389": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "3002986032379998259": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "10492056481694320580": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "3515437649977762166": ["convolution_gpu_bfyx_gemm_like",1], + "14114380593731243715": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "3830703844770425343": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "11163107409437069532": ["convolution_gpu_bfyx_os_iyx_osv16",620], + "677249604491773387": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16336482874764861478": ["convolution_gpu_bfyx_gemm_like",2], + "2283157145557154450": ["convolution_gpu_bfyx_1x1",2], + "12370729327673204804": ["convolution_gpu_bfyx_gemm_like",2], + "14206125678667603810": ["convolution_gpu_bfyx_1x1",2], + "6428098122005804378": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "3892679716763161057": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4916769804113823482": ["convolution_gpu_bfyx_1x1",1], + "8873614802459592665": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "6726099352298108756": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10292349730148518173": ["convolution_gpu_bfyx_os_iyx_osv16",695], + "1351633819648952297": ["convolution_gpu_bfyx_1x1",2], + "9939234037869927090": 
["convolution_gpu_bfyx_os_iyx_osv16",890], + "12990341489637414845": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "3563614453014995411": ["convolution_gpu_bfyx_os_iyx_osv16",878], + "13585163747565192884": ["convolution_gpu_bfyx_gemm_like",0], + "14015062122217462983": ["convolution_gpu_bfyx_os_iyx_osv16",700], + "17951403431757222177": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "11706446082856895571": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "8170998059688907013": ["convolution_gpu_bfyx_1x1",0], + "345043289576587800": ["convolution_gpu_bfyx_1x1",2], + "9378269524012289175": ["convolution_gpu_bfyx_gemm_like",2], + "9101903304994333336": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "15430549683839591544": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "4437258459981739942": ["convolution_gpu_bfyx_os_iyx_osv16",666], + "467975197394411990": ["convolution_gpu_bfyx_gemm_like",1], + "4673127824919879657": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4085907608404305515": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "13178480813522103091": ["fully_connected_gpu_bf_io_gemm",2], + "9367157746678824712": ["convolution_gpu_bfyx_os_iyx_osv16",121], + "15689502054035168040": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "9865252947376418804": ["convolution_gpu_bfyx_1x1",2], + "16094174852600023296": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "17829047941256922307": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "17705807503894740726": ["convolution_gpu_bfyx_gemm_like",2], + "12990527753120735255": ["convolution_gpu_bfyx_gemm_like",2], + "15451919862187018297": ["convolution_gpu_winograd_6x3_s1_fused",2], + "2440366541074371090": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "9133263538092913983": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2307310127637739872": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "6988492019664525206": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "5334566325056222430": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "1354647381212852890": ["convolution_gpu_bfyx_1x1",2], + "17711453305763476458": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "2702144517025248597": ["convolution_gpu_bfyx_gemm_like",2], + "5912303851874077576": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "3160543867929843861": ["convolution_gpu_bfyx_1x1",2], + "14406070210216948643": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "11756881293845417212": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "4815047491742617397": ["convolution_gpu_bfyx_os_iyx_osv16",552], + "7447163906170805189": ["convolution_gpu_bfyx_os_iyx_osv16",574], + "12229574562535756991": ["convolution_gpu_bfyx_gemm_like",2], + "12675313398314286884": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "16588325081458426169": ["convolution_gpu_bfyx_gemm_like",2], + "8155268141318893606": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7431849514656037251": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "18082422341304348326": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "12253049204822930675": ["convolution_gpu_bfyx_gemm_like",2], + "1617135706549276688": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "8640150341228170279": ["convolution_gpu_bfyx_os_iyx_osv16",1067], + "2800949804770763798": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14126906427006602775": ["convolution_gpu_bfyx_1x1",2], + "14670068483447729857": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12190841837604350271": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3286330985102373533": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "5275016494706355806": 
["convolution_gpu_bfyx_os_iyx_osv16",461], + "14759179293743468995": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "15489746763312425915": ["convolution_gpu_bfyx_gemm_like",2], + "142486914279119363": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "15823825508128158158": ["convolution_gpu_bfyx_gemm_like",2], + "12365282242489300092": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "15947699374684516369": ["convolution_gpu_bfyx_gemm_like",2], + "7744787957569714828": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "9979259596137305973": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "5751283221740229986": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "15078590909693331731": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16783619135298589974": ["convolution_gpu_bfyx_os_iyx_osv16",395], + "7969441643457570812": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "12985942652866621579": ["fully_connected_gpu_fb_io_ref",1], + "273242667845386507": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9340159617983543624": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "10861525139715322534": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "10899110544832584656": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "17382660912493284320": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1760391741350091665": ["convolution_gpu_bfyx_os_iyx_osv16",988], + "16402312692470500253": ["convolution_gpu_bfyx_gemm_like",1], + "5364060938737428149": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "2089730611490367290": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "534032316469702287": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "6713985030102340818": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12028665820838352309": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1616603916015535857": ["fully_connected_gpu_bf_io_input_spatial",0], + "15052577143485630617": ["convolution_gpu_bfyx_1x1",1], + "8104309105061227444": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "14397348576352573007": ["convolution_gpu_bfyx_gemm_like",1], + "3106591708459602370": ["convolution_gpu_bfyx_os_iyx_osv16",923], + "913496537924971856": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "9631481972809246378": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "16882092367103683293": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "14719421757340260468": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "10001963042016663554": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15746620724134970969": ["convolution_gpu_bfyx_1x1",2], + "7881579844586294503": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "12843671306854567956": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "3930314908786112883": ["convolution_gpu_bfyx_gemm_like",2], + "12812685418923919055": ["convolution_gpu_bfyx_os_iyx_osv16",387], + "5170245731599664670": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "905526102343710614": ["convolution_gpu_bfyx_os_iyx_osv16",108], + "7916244303189113815": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "18424400171776141118": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "9100044555742394133": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "12473600360154597915": ["convolution_gpu_bfyx_os_iyx_osv16",1030], + "18273537339378756543": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "1230262279011217327": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14082448162400225052": ["convolution_gpu_bfyx_1x1",1], + "15602863681196390535": ["convolution_gpu_bfyx_os_iyx_osv16",620], + "18275601715050791851": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "10237524128771958432": 
["convolution_gpu_bfyx_gemm_like",2], + "17147293671640396193": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "15110359240685619357": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "12696412964119109465": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "9213886570531053949": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "7913076120244203725": ["convolution_gpu_bfyx_gemm_like",2], + "5245526691775741296": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9761573038170759563": ["convolution_gpu_bfyx_os_iyx_osv16",301], + "13613399861925108148": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "18375125668176498051": ["convolution_gpu_bfyx_gemm_like",2], + "13932662890258900896": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "17877776363798202236": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12712071520541638451": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "16190949264253468961": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "16393176054374397767": ["convolution_gpu_bfyx_gemm_like",1], + "8434794604559592624": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "13439896617880328331": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "916389941321470163": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "13775529405693629438": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6219075471508685758": ["convolution_gpu_bfyx_gemm_like",0], + "3118602494449249177": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "742689192890486807": ["convolution_gpu_bfyx_gemm_like",2], + "5448537627319798272": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "9803492989444302959": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "4479979951990338510": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "7662200927459001757": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16033512206711124104": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7692849839965441330": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "2294026590516781945": ["convolution_gpu_bfyx_gemm_like",1], + "17635171685500922207": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "10978173291465325823": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "5600128039063009632": ["convolution_gpu_bfyx_direct_10_12_16",1], + "158222105675022402": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "706049518431331645": ["convolution_gpu_bfyx_gemm_like",2], + "13026555349791486777": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "13092232276822302626": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "12553441041059632729": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11728824117049687850": ["convolution_gpu_bfyx_gemm_like",1], + "2173720698351153121": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "8818070832398055086": ["convolution_gpu_bfyx_direct_10_12_16",0], + "9099720270958987421": ["convolution_gpu_bfyx_1x1",2], + "11910735867274493498": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "5385316497510064491": ["fully_connected_gpu_fb_oi_ref",2], + "9616636708366808604": ["convolution_gpu_bfyx_gemm_like",2], + "7474639594232203854": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "13602140021189675477": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "123251351612308092": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "2908249767551054613": ["convolution_gpu_bfyx_os_iyx_osv16",1017], + "14349625788399542568": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12309132521191764927": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "15192230303376521834": ["convolution_gpu_bfyx_os_iyx_osv16",863], + "649203303142950236": ["convolution_gpu_bfyx_os_iyx_osv16",6], + "15822546325822628634": 
["convolution_gpu_bfyx_os_iyx_osv16",331], + "4652136280940317116": ["convolution_gpu_bfyx_os_iyx_osv16",216], + "946479876892100082": ["convolution_gpu_bfyx_gemm_like",1], + "9954050478761346921": ["convolution_gpu_bfyx_os_iyx_osv16",600], + "4542143431130171516": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "9299299311101549958": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12727541507197887360": ["convolution_gpu_bfyx_os_iyx_osv16",1100], + "9839670675413379092": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "17489680436564779197": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5103094815475470596": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "7604075520418038662": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "16853250891250756537": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "272730229972987861": ["convolution_gpu_bfyx_os_iyx_osv16",1032], + "4803370483104261655": ["convolution_gpu_bfyx_gemm_like",0], + "3934290309368153435": ["fully_connected_gpu_bf_io_gemm",1], + "13485300684443803732": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "6203765709597125063": ["convolution_gpu_bfyx_gemm_like",2], + "6075691042233712335": ["convolution_gpu_bfyx_gemm_like",2], + "3266557807508325807": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1941341635794709702": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "3526580286148537369": ["convolution_gpu_bfyx_gemm_like",2], + "5550969016335082071": ["convolution_gpu_bfyx_gemm_like",2], + "1008476023750261156": ["convolution_gpu_bfyx_1x1",1], + "1056009037551688122": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14034525799882831106": ["convolution_gpu_bfyx_gemm_like",2], + "8707189142909022305": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "11883485911218628865": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "13247725847475539658": ["convolution_gpu_bfyx_1x1",1], + "12238674883388043717": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "6129602738379919488": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "738850098651678143": ["convolution_gpu_bfyx_os_iyx_osv16",508], + "5853697372844744672": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "9410978119783758141": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "17515573322312447679": ["convolution_gpu_bfyx_gemm_like",2], + "16370218798911151331": ["convolution_gpu_bfyx_os_iyx_osv16",662], + "11055049031355432623": ["convolution_gpu_bfyx_gemm_like",2], + "11107930597263802755": ["convolution_gpu_bfyx_gemm_like",2], + "15354185859262170540": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "7915318733663535312": ["convolution_gpu_bfyx_os_iyx_osv16",620], + "12680339228267704518": ["convolution_gpu_bfyx_os_iyx_osv16",124], + "6419580456182610836": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "15129834325410878425": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "15640202505592598653": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "15650839696475698676": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "6863331059471727622": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "9967101735808367971": ["convolution_gpu_bfyx_1x1",0], + "5057534502588100071": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "9695024256541464964": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "16247399911710810038": ["convolution_gpu_bfyx_gemm_like",1], + "15677717057398875599": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "18092842590142527927": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "8655883535274781128": ["convolution_gpu_bfyx_gemm_like",2], + "4424217045094988504": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "16120988958246503683": 
["convolution_gpu_bfyx_os_iyx_osv16",1016], + "4640028527711211109": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "5509395737020858006": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "2231648183489019418": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "9263063714383940562": ["convolution_gpu_bfyx_os_iyx_osv16",95], + "9920155432685318259": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "15411474884532403722": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "4750755523645265967": ["convolution_gpu_bfyx_gemm_like",2], + "8609939102588915855": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "16267682394077585279": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "16694984452720336415": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "6772239376357727149": ["convolution_gpu_bfyx_os_iyx_osv16",809], + "8106738346643994005": ["convolution_gpu_bfyx_gemm_like",0], + "14230385851791760020": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "7720939595094113814": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "1434535531617424039": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "12971822824884826169": ["convolution_gpu_bfyx_gemm_like",2], + "4046830923427667342": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "12026482841341343242": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "17015328096102652908": ["convolution_gpu_bfyx_gemm_like",1], + "15225354446874994535": ["convolution_gpu_bfyx_os_iyx_osv16",183], + "1587501521145162454": ["convolution_gpu_bfyx_gemm_like",2], + "14289048840489035546": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "15879172437519876393": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "4561874206785244358": ["convolution_gpu_bfyx_os_iyx_osv16",771], + "18037918102910297531": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "7995820969034996638": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "14043770215999952932": ["convolution_gpu_bfyx_gemm_like",2], + "586947787345351152": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12589440296742583335": ["convolution_gpu_bfyx_1x1",2], + "11469881811044037340": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "9416186718345824095": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "16286085532892593349": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18059267466971880386": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "9058996149754556268": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "16509472637458153234": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "8757900457181374694": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11637325834858582585": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "1701412735970485849": ["convolution_gpu_bfyx_os_iyx_osv16",950], + "16489624657475712467": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "4674416595144505741": ["convolution_gpu_bfyx_os_iyx_osv16",1016], + "929378940515745198": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "4703107905652287491": ["convolution_gpu_bfyx_gemm_like",2], + "16437124655147660375": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "994489782629179836": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "7472330881076141262": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "7015738038963065110": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "7314288062932060863": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "7878605163588288309": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "14133958262039763609": ["convolution_gpu_bfyx_os_iyx_osv16",44], + "1126499865206906037": ["convolution_gpu_bfyx_os_iyx_osv16",900], + "2321767794934000238": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "9428176632140441528": 
["convolution_gpu_bfyx_os_iyx_osv16",1075], + "17808913959977434594": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "18109284647478027063": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3571959174116404960": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "10679760989906275129": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "16442107352245114876": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "13569941893504840630": ["convolution_gpu_bfyx_os_iyx_osv16",360], + "15805087418686802636": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "4903592553439092472": ["convolution_gpu_bfyx_os_iyx_osv16",950], + "15450609897480659306": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "11115684531624462986": ["convolution_gpu_bfyx_os_iyx_osv16",771], + "15320845027635796583": ["convolution_gpu_bfyx_gemm_like",2], + "7148542290597073512": ["convolution_gpu_bfyx_gemm_like",0], + "17854578307286932628": ["convolution_gpu_bfyx_gemm_like",2], + "6648876837655776653": ["convolution_gpu_bfyx_1x1",0], + "4482135524904874942": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "16706244336960642883": ["convolution_gpu_bfyx_gemm_like",2], + "5609922876429907954": ["convolution_gpu_bfyx_gemm_like",2], + "2777318471329665162": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "2261453441277654139": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "12232696287029987946": ["convolution_gpu_bfyx_os_iyx_osv16",123], + "8421388456873652700": ["convolution_gpu_bfyx_gemm_like",2], + "403634422724914329": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "7432142107544210174": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "12522495848240087966": ["convolution_gpu_bfyx_gemm_like",2], + "16748662918272106932": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17316626950179740845": ["convolution_gpu_bfyx_os_iyx_osv16",674], + "15192024816519005250": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "5078905972285278557": ["convolution_gpu_bfyx_gemm_like",1], + "2995134938466176198": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "9926384320714453815": ["convolution_gpu_bfyx_1x1",1], + "12790788016297794214": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "8527193566719173253": ["convolution_gpu_bfyx_gemm_like",2], + "1760690277175249985": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "12352923639732112511": ["convolution_gpu_bfyx_os_iyx_osv16",100], + "15678385128478075284": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5235375820995365354": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "17050143605017295447": ["convolution_gpu_bfyx_os_iyx_osv16",150], + "9287404618748313247": ["convolution_gpu_bfyx_gemm_like",2], + "16921939234324970069": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "16441830491664937048": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "4871907623235871050": ["convolution_gpu_bfyx_os_iyx_osv16",19], + "11800783548769329949": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "2567046336192437734": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "8995598177504756805": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "2100891581797371600": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "846088275031979661": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9657324846330221372": ["convolution_gpu_bfyx_1x1",2], + "12757611260347801001": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "12277470820821378855": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "4342360467977736802": ["convolution_gpu_bfyx_gemm_like",2], + "1418595171949196661": ["convolution_gpu_bfyx_os_iyx_osv16",1090], + "17163158934005653629": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "14159596290442764023": 
["convolution_gpu_bfyx_gemm_like",1], + "13253775441326432265": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "9533360488591027707": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "380316849107383484": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "70580716590540876": ["convolution_gpu_bfyx_gemm_like",1], + "17907223570737272640": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "6911215749850066204": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "3870539490799697188": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "1597770067928214597": ["convolution_gpu_bfyx_1x1",1], + "14973431782875808802": ["convolution_gpu_bfyx_gemm_like",2], + "3134489458855347772": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5687802882700097624": ["convolution_gpu_bfyx_gemm_like",2], + "12676167240795292217": ["convolution_gpu_bfyx_gemm_like",2], + "10848277915422577656": ["convolution_gpu_bfyx_os_iyx_osv16",43], + "580936360000782237": ["fully_connected_gpu_bf_io_input_spatial",2], + "4062706195708729345": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "4734389463002799056": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "2128612971571865547": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "3220280315905987373": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "1173986078589662704": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "5582896843095691256": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "13474805373264874144": ["convolution_gpu_bfyx_1x1",2], + "12489973984967168447": ["convolution_gpu_bfyx_1x1",2], + "18180820925685532104": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "877436308867220589": ["convolution_gpu_bfyx_gemm_like",2], + "2816353973187452604": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "3221469860582147955": ["convolution_gpu_bfyx_gemm_like",2], + "14553856088069405595": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "4465701487417893814": ["convolution_gpu_bfyx_gemm_like",2], + "17961702508543961900": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "10179916356323479080": ["convolution_gpu_bfyx_gemm_like",2], + "59356084516953804": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "13472532612464340803": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10425889533411573166": ["convolution_gpu_bfyx_gemm_like",2], + "16888412539296862194": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "16779678846332091086": ["convolution_gpu_bfyx_os_iyx_osv16",234], + "2912098199463107173": ["convolution_gpu_bfyx_1x1",2], + "13865227850818392065": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "12478309735214802531": ["convolution_gpu_bfyx_os_iyx_osv16",923], + "3430266954211750407": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "11031358859656806724": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "14483314305369207554": ["convolution_gpu_bfyx_1x1",0], + "18135307303959376082": ["convolution_gpu_bfyx_gemm_like",2], + "15522785615618973614": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9835739612255048978": ["convolution_gpu_bfyx_os_iyx_osv16",501], + "1982176363226079588": ["convolution_gpu_bfyx_os_iyx_osv16",644], + "14316077757957132678": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "157805434489791310": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "10100237101982273901": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "7786866732196451977": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "7183578232279711009": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4133424990380177132": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "4084106758501882407": ["fully_connected_gpu_bf_io_input_spatial",2], + "7581174843529024536": 
["convolution_gpu_bfyx_os_iyx_osv16",251], + "3524531620118359828": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "775538461106687677": ["fully_connected_gpu_fb_oi_ref",1], + "15980348884716629349": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11462462742322068863": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "4865102850562917067": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "2856601829807186494": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13710319251108632115": ["convolution_gpu_bfyx_1x1",0], + "1051506168926530904": ["fully_connected_gpu_bf_io_input_spatial",1], + "17243576882981097341": ["convolution_gpu_bfyx_os_iyx_osv16",620], + "4168273493370024327": ["convolution_gpu_bfyx_1x1",2], + "13609660900720370993": ["convolution_gpu_bfyx_1x1",0], + "16985912104363932350": ["convolution_gpu_bfyx_os_iyx_osv16",511], + "11830297960718214360": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "11829442945690098558": ["convolution_gpu_bfyx_gemm_like",1], + "1403617451623027879": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "16986358655784856534": ["convolution_gpu_bfyx_gemm_like",0], + "2806529556090896246": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "9269175963143039426": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "10471519687597963116": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11919846322488132883": ["convolution_gpu_bfyx_1x1",2], + "12297371032753209816": ["convolution_gpu_bfyx_os_iyx_osv16",898], + "4915831715914920982": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "13204120207726209723": ["fully_connected_gpu_bf_io_gemm",1], + "11810221946429451169": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "4914435717288687793": ["convolution_gpu_bfyx_1x1",2], + "6062246008880097669": ["fully_connected_gpu_bf_io_input_spatial",2], + "6714886136800883594": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "5608133987357542077": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8323445733669842657": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4398371999113956082": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "13575423234109624706": ["fully_connected_gpu_yxfb_ref",0], + "659150305191479097": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "2598267743388306204": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11855070245618904113": ["convolution_gpu_bfyx_os_iyx_osv16",146], + "7181154048972884375": ["convolution_gpu_bfyx_gemm_like",2], + "9153779186876518773": ["convolution_gpu_bfyx_gemm_like",2], + "11528417522960871233": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2968031010495399536": ["convolution_gpu_bfyx_gemm_like",2], + "12421707187947291166": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "7060804814325505165": ["convolution_gpu_bfyx_gemm_like",2], + "1040030752340209480": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "804195263636995800": ["convolution_gpu_bfyx_gemm_like",2], + "5312140481706133684": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "15392077168521832549": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "13272818502368975319": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5429130923188159806": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8337820318779061494": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "6040286126398028933": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3873183249402084406": ["convolution_gpu_bfyx_gemm_like",1], + "472454322186482185": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "5172712078329324967": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2738256633362038820": ["convolution_gpu_bfyx_gemm_like",2], + "5088898934670078153": 
["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14906458674793172507": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4290840152278060614": ["convolution_gpu_bfyx_gemm_like",2], + "11198301748997371475": ["convolution_gpu_bfyx_gemm_like",1], + "13739257060165119132": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "8975333906619899020": ["convolution_gpu_bfyx_gemm_like",2], + "8258382025812748961": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "15188570678726970998": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "7119182041840303390": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "18277685132620834972": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "14680730265621679042": ["convolution_gpu_bfyx_os_iyx_osv16",754], + "14667209474639064623": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "15695415285791951018": ["convolution_gpu_bfyx_gemm_like",2], + "10294185397756053636": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "3332334993503432420": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "17025182465337728023": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "10645625090439446714": ["convolution_gpu_bfyx_gemm_like",2], + "7650862961269327235": ["convolution_gpu_bfyx_1x1",2], + "9232653317479846765": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "4423866541063606768": ["convolution_gpu_bfyx_os_iyx_osv16",951], + "10816637153861630723": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "8107447526839063293": ["convolution_gpu_bfyx_os_iyx_osv16",19], + "2651385050387738902": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "1596353239542510685": ["convolution_gpu_bfyx_gemm_like",2], + "1905758333157310570": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10256831975351722184": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "17900257435531434807": ["convolution_gpu_bfyx_gemm_like",1], + "6181272224000872375": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "3140230065585683313": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "11582534256623549131": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "8797843396807284399": ["convolution_gpu_bfyx_os_iyx_osv16",642], + "16955653765071712611": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16982829522704429982": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "7866128397931438774": ["convolution_gpu_bfyx_os_iyx_osv16",997], + "17490471699618303993": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "11988546375476924356": ["convolution_gpu_bfyx_os_iyx_osv16",49], + "592245952014430043": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "9882204352209412039": ["convolution_gpu_bfyx_gemm_like",1], + "10838138488789241338": ["convolution_gpu_bfyx_gemm_like",1], + "7235358742317442134": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "7606728651572102823": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "1890739204389692970": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "1245259979364728404": ["convolution_gpu_bfyx_1x1",2], + "10632020369698615114": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "1309867416606346543": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "8497468192424557348": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6817494598328071314": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "5873257164958285393": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "8490260671996115530": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "10995907213890714701": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "5644068493155655611": ["convolution_gpu_bfyx_gemm_like",2], + "11625231046723308981": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "13106818352216009354": ["convolution_gpu_bfyx_gemm_like",2], + 
"7330202944390548890": ["convolution_gpu_bfyx_gemm_like",2], + "8061914949376516780": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "1938086876393565238": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "2863465257341735941": ["convolution_gpu_bfyx_1x1",2], + "10133054058562198093": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10570285542015420072": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "10607904718265020949": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "4356817283284529593": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3759515057574218101": ["convolution_gpu_bfyx_gemm_like",0], + "17738299860390552088": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15993427814066246646": ["convolution_gpu_bfyx_gemm_like",2], + "3336303478756453360": ["convolution_gpu_bfyx_gemm_like",2], + "12988961529988078346": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "17915846724151945664": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "7603872175048237237": ["convolution_gpu_bfyx_1x1",1], + "2936333406928424760": ["convolution_gpu_bfyx_1x1",2], + "7056293586529818253": ["convolution_gpu_bfyx_gemm_like",0], + "14716719350966652036": ["convolution_gpu_bfyx_gemm_like",1], + "17347670200862870457": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "16934879647229234163": ["convolution_gpu_bfyx_gemm_like",2], + "9277610800970567810": ["convolution_gpu_bfyx_gemm_like",1], + "8451212914744825089": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "16206791915939407806": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "12085348936192462321": ["convolution_gpu_bfyx_gemm_like",1], + "4362304842016958728": ["convolution_gpu_bfyx_gemm_like",2], + "3399406641489305996": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "18041177945345031826": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "13642146548740074992": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "6355395905401306995": ["convolution_gpu_bfyx_gemm_like",2], + "5095827462645341808": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16579057939215877904": ["convolution_gpu_bfyx_os_iyx_osv16",317], + "597073780328219388": ["convolution_gpu_bfyx_gemm_like",2], + "16235115911229280717": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "10090923790949378407": ["convolution_gpu_bfyx_gemm_like",2], + "4450409744922989123": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "12643423612381102003": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "15126660425728872065": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "10486000767830001094": ["convolution_gpu_bfyx_1x1",2], + "2044363708106765326": ["convolution_gpu_bfyx_direct_10_12_16",1], + "543472136359161929": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "3448477246688526708": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "5600807544955072308": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "954796765467489259": ["convolution_gpu_bfyx_os_iyx_osv16",285], + "6904130543085920483": ["convolution_gpu_bfyx_os_iyx_osv16",80], + "2116913943188857359": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "16601702334097258697": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "10128390168715530898": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "4764776977138392550": ["convolution_gpu_bfyx_os_iyx_osv16",80], + "9354818521586974021": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17408275657360833363": ["convolution_gpu_bfyx_1x1",2], + "2421404763191415191": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5195511638783481084": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "3868149953087814447": ["convolution_gpu_bfyx_gemm_like",2], + "4889188980319017094": 
["convolution_gpu_bfyx_os_iyx_osv16",252], + "15728009639807698634": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "16729849855476690294": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "6171845068913882721": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "4274801141127703532": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "3056212889689424946": ["convolution_gpu_bfyx_1x1",2], + "5219048275475447369": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "13450061819089402572": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4481903208484313806": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "3687215302429221155": ["convolution_gpu_bfyx_os_iyx_osv16",469], + "9314293064351558241": ["convolution_gpu_bfyx_gemm_like",2], + "15649927926091502215": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "10293186062391000719": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "6982733543386888622": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "15809639778580769565": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "5941092474669713339": ["convolution_gpu_bfyx_os_iyx_osv16",110], + "17806712457019493207": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "1478419046264331178": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "15984885011101717258": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "5469227748156438008": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "16208488491972128275": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "287386909600391846": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "4161612746310931789": ["convolution_gpu_bfyx_gemm_like",2], + "18215430801133520364": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "15322609677356616580": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "8791285622784082122": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "14089893422771228191": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "16862145184923128012": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "13733327241591630239": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "13145474177271090694": ["convolution_gpu_bfyx_gemm_like",2], + "9642229389394495047": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "2891736961665476908": ["convolution_gpu_bfyx_os_iyx_osv16",53], + "4086556132337751931": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "1698321314111848001": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "9553032671453999824": ["convolution_gpu_bfyx_os_iyx_osv16",548], + "9019388470685749691": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "13853056718266488510": ["convolution_gpu_bfyx_os_iyx_osv16",509], + "7474592508575297101": ["convolution_gpu_bfyx_1x1",2], + "4098191685457418125": ["convolution_gpu_bfyx_os_iyx_osv16",485], + "14038261392627717712": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "13358283026528078900": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "4974320417566990034": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "13603318842632052764": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "11848462434662954749": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "2030309697153345387": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "15386715291503303766": ["convolution_gpu_bfyx_os_iyx_osv16",1100], + "9257078583742821465": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "7846384623429362522": ["convolution_gpu_bfyx_1x1",2], + "998876398773540321": ["convolution_gpu_bfyx_1x1",2], + "8069537351442302814": ["convolution_gpu_bfyx_os_iyx_osv16",650], + "4622514167765722873": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "4513063773753763458": ["convolution_gpu_bfyx_os_iyx_osv16",950], + "1939140810847988694": ["convolution_gpu_bfyx_gemm_like",1], + 
"5824801192141531089": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "12644942072153919043": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1237920404306733800": ["convolution_gpu_bfyx_gemm_like",2], + "1074748462756364699": ["fully_connected_gpu_fb_oi_ref",2], + "17525564757769958678": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "11559360678008060513": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "7106362077449435105": ["convolution_gpu_bfyx_gemm_like",0], + "15641537661939240413": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "7947870656736319919": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "18180655791734632264": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "13132804928635689780": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "9999425239167488495": ["convolution_gpu_bfyx_gemm_like",1], + "16511393582666965704": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "11856815095538913065": ["convolution_gpu_yxfb_yxio_b16",2], + "18209930746627816139": ["convolution_gpu_yxfb_yxio_b16",2], + "7780336054545552428": ["convolution_gpu_yxfb_yxio_b16",2], + "7201521533301617290": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12417253210787537988": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9947449295659685973": ["convolution_gpu_bfyx_gemm_like",1], + "3211956138512889433": ["convolution_gpu_yxfb_yxio_b16",1], + "6944031900067948180": ["convolution_gpu_yxfb_yxio_b16",0], + "5449117614287394433": ["convolution_gpu_yxfb_yxio_b16",2], + "87031578643428011": ["convolution_gpu_bfyx_1x1",2], + "4833749391314748606": ["convolution_gpu_yxfb_yxio_b16",2], + "8450272092307894299": ["convolution_gpu_yxfb_yxio_b16",2], + "15209909241815414156": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "13325762052023866627": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "3409043224171087168": ["convolution_gpu_bfyx_os_iyx_osv16",640], + "15773157615731010456": ["convolution_gpu_bfyx_gemm_like",2], + "6666210546769702280": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "11254744277059719812": ["convolution_gpu_yxfb_yxio_b16",1], + "10309586646776223605": ["convolution_gpu_yxfb_yxio_b16",2], + "9530116228032101908": ["convolution_gpu_bfyx_1x1",1], + "12151068022697708126": ["convolution_gpu_bfyx_gemm_like",2], + "6464050901421037006": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12550985938092975889": ["convolution_gpu_bfyx_1x1",2], + "3680396164645753224": ["convolution_gpu_yxfb_yxio_b16",0], + "6403698142681887543": ["convolution_gpu_bfyx_gemm_like",2], + "12351866693978844266": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "15705908639736679687": ["convolution_gpu_yxfb_yxio_b16",2], + "15281554100135159550": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16238415425814188039": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "7226002258982605405": ["convolution_gpu_yxfb_yxio_b16",2], + "12397280593466519809": ["convolution_gpu_bfyx_gemm_like",2], + "2527018855890902975": ["convolution_gpu_bfyx_gemm_like",2], + "8555049634736330391": ["convolution_gpu_yxfb_yxio_b16",2], + "3107655421406621915": ["convolution_gpu_yxfb_yxio_b16",1], + "14754849694687093032": ["convolution_gpu_yxfb_yxio_b16",2], + "15065019229949449623": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17036482252028102703": ["convolution_gpu_bfyx_os_iyx_osv16",53], + "14304497513584420080": ["convolution_gpu_yxfb_yxio_b16",2], + "11451740938287179908": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "7498614018449036163": ["convolution_gpu_bfyx_os_iyx_osv16",48], + "12221101678609734421": ["convolution_gpu_yxfb_yxio_b16",2], + "560996739186313493": 
["convolution_gpu_yxfb_yxio_b16",2], + "16542318967217020315": ["convolution_gpu_bfyx_gemm_like",2], + "17975017633455909321": ["convolution_gpu_bfyx_os_iyx_osv16",717], + "15161053469199826008": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "1223196405651730260": ["convolution_gpu_yxfb_yxio_b16",2], + "7889602687414497280": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13160712904661288567": ["convolution_gpu_bfyx_1x1",1], + "17178308105985812083": ["convolution_gpu_yxfb_yxio_b16",2], + "17742192339816511494": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "13082313288887957490": ["convolution_gpu_yxfb_yxio_b16",2], + "1972879521448306536": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "2967481531952454828": ["convolution_gpu_bfyx_os_iyx_osv16",863], + "11330591026581463934": ["convolution_gpu_bfyx_gemm_like",2], + "6318214731544748245": ["convolution_gpu_bfyx_gemm_like",2], + "5899560521070338192": ["convolution_gpu_yxfb_yxio_b16",1], + "1351033666248868977": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "17192352762166764393": ["convolution_gpu_yxfb_yxio_b16",2], + "2487679091192300910": ["convolution_gpu_yxfb_yxio_b16",2], + "14126906427006602775": ["convolution_gpu_bfyx_1x1",2], + "13787436604877398090": ["convolution_gpu_bfyx_os_iyx_osv16",1032], + "15504618703544589723": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7274179284676568361": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "12867038076564517306": ["convolution_gpu_yxfb_yxio_b16",2], + "11361202190524990711": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "6370189612027110022": ["convolution_gpu_bfyx_gemm_like",2], + "4903592553439092472": ["convolution_gpu_bfyx_os_iyx_osv16",147], + "15231987838322151865": ["convolution_gpu_bfyx_1x1",2], + "15897300973213364823": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "2412846055735335136": ["convolution_gpu_bfyx_os_iyx_osv16",806], + "8339704352841356825": ["convolution_gpu_yxfb_yxio_b16",1], + "14667793472412360981": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16312739695844838884": ["convolution_gpu_yxfb_yxio_b16",2], + "11861634536583463947": ["convolution_gpu_bfyx_os_iyx_osv16",51], + "13425251102263428554": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "216603198215625772": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12714814165247623529": ["convolution_gpu_yxfb_yxio_b16",2], + "9131183544020825260": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "17948637243158994878": ["convolution_gpu_bfyx_gemm_like",2], + "3463959257726925426": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "16883372966656079608": ["convolution_gpu_yxfb_yxio_b16",1], + "5274929595362413625": ["convolution_gpu_yxfb_yxio_b16",2], + "2242829490403202087": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6832967250168141428": ["convolution_gpu_yxfb_yxio_b16",1], + "13454265023861566476": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "4242173940230902960": ["convolution_gpu_yxfb_yxio_b16",2], + "3699344686791530101": ["convolution_gpu_bfyx_gemm_like",2], + "11342135956789192833": ["convolution_gpu_bfyx_os_iyx_osv16",1098], + "10850369799801518638": ["convolution_gpu_yxfb_yxio_b16",2], + "4283886984540574108": ["convolution_gpu_yxfb_yxio_b16",1], + "12308956927236847009": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "1933147648540963732": ["convolution_gpu_yxfb_yxio_b16",2], + "17015328096102652908": ["convolution_gpu_bfyx_gemm_like",1], + "10747988576436391912": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "6784853321527374515": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "9363988379673156863": 
["convolution_gpu_yxfb_yxio_b16",2], + "13170441257780067955": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "18122858611264877646": ["convolution_gpu_bfyx_gemm_like",2], + "6727930402459775131": ["convolution_gpu_bfyx_gemm_like",2], + "15726902746983125797": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "5328004363712610999": ["convolution_gpu_yxfb_yxio_b16",1], + "10429104188258277773": ["convolution_gpu_yxfb_yxio_b16",2], + "12467673564660108244": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "12725675221990905186": ["convolution_gpu_bfyx_gemm_like",2], + "5538883245745495145": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "13883044928774243663": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "13474805373264874144": ["convolution_gpu_bfyx_1x1",2], + "2048528188026477374": ["convolution_gpu_yxfb_yxio_b16",2], + "3797986765970777456": ["convolution_gpu_yxfb_yxio_b16",2], + "2148877522799179369": ["convolution_gpu_yxfb_yxio_b16",2], + "14085753024976995311": ["convolution_gpu_yxfb_yxio_b16",2], + "18431306649860116380": ["convolution_gpu_bfyx_gemm_like",2], + "15800554162607246964": ["convolution_gpu_bfyx_gemm_like",1], + "17258128299721452811": ["convolution_gpu_yxfb_yxio_b16",2], + "6214624887470295152": ["convolution_gpu_bfyx_1x1",1], + "13842149852156451845": ["convolution_gpu_yxfb_yxio_b16",2], + "7482459536338668149": ["convolution_gpu_yxfb_yxio_b16",2], + "1786821683911142459": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "12348602762263193288": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "5436553435132026991": ["convolution_gpu_yxfb_yxio_b16",2], + "14677968346503677769": ["convolution_gpu_yxfb_yxio_b16",2], + "8655315308767111198": ["convolution_gpu_bfyx_1x1",2], + "932195814187889636": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "13106818352216009354": ["convolution_gpu_bfyx_gemm_like",2], + "12933253554354951910": ["convolution_gpu_bfyx_gemm_like",2], + "2945245652128285151": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "7330202944390548890": ["convolution_gpu_bfyx_gemm_like",1], + "1697260854781788314": ["convolution_gpu_yxfb_yxio_b16",2], + "10816702874143297564": ["convolution_gpu_yxfb_yxio_b16",2], + "2884499360870038648": ["convolution_gpu_yxfb_yxio_b16",2], + "16588325081458426169": ["convolution_gpu_bfyx_gemm_like",2], + "6400660469217490279": ["convolution_gpu_yxfb_yxio_b16",2], + "16293465561256937726": ["convolution_gpu_bfyx_os_iyx_osv16",665], + "14165325329016075285": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "16912738776771289379": ["convolution_gpu_yxfb_yxio_b16",2], + "13702254392810961772": ["convolution_gpu_yxfb_yxio_b16",2], + "7333511810266504718": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "5276029719268937229": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "7822463130304602936": ["convolution_gpu_yxfb_yxio_b16",2], + "8065866013404161366": ["convolution_gpu_yxfb_yxio_b16",2], + "5509395737020858006": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "968092788032627444": ["convolution_gpu_yxfb_yxio_b16",2], + "4617809377006148936": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "17922279129043570176": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "2531597468539205600": ["convolution_gpu_yxfb_yxio_b16",2], + "3287181725010492879": ["convolution_gpu_yxfb_yxio_b16",2], + "8577875628223148806": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "7407975398526425554": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "6764038061921866053": ["convolution_gpu_yxfb_yxio_b16",2], + "3061372669831947873": ["convolution_gpu_yxfb_yxio_b16",2], + "15604634351310647589": 
["convolution_gpu_yxfb_yxio_b16",2], + "10803929517111130153": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "5040095338370816349": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "3176785355296130660": ["convolution_gpu_bfyx_gemm_like",2], + "12810833895438895155": ["convolution_gpu_yxfb_yxio_b16",2], + "4154403364889130045": ["convolution_gpu_bfyx_gemm_like",2], + "14034402827496819479": ["convolution_gpu_bfyx_gemm_like",2], + "16794854619854992714": ["convolution_gpu_yxfb_yxio_b16",1], + "84595904778810418": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "905526102343710614": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "7463517383354309469": ["convolution_gpu_bfyx_gemm_like",0], + "6870942166356599956": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "4135975804549022456": ["convolution_gpu_yxfb_yxio_b16",2], + "6791806088355877039": ["convolution_gpu_bfyx_gemm_like",1], + "17990326690659802090": ["convolution_gpu_yxfb_yxio_b16",2], + "10883992248631603006": ["convolution_gpu_bfyx_os_iyx_osv16",483], + "2328919599530851492": ["convolution_gpu_yxfb_yxio_b16",2], + "3602929955785812025": ["convolution_gpu_yxfb_yxio_b16",2], + "10100171358681249181": ["convolution_gpu_yxfb_yxio_b16",2], + "11184290482439221741": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "1040650352205493707": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "16081386644309102158": ["convolution_gpu_bfyx_gemm_like",2], + "9433162648796382333": ["convolution_gpu_yxfb_yxio_b16",2], + "9542325095876448686": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3621424752591567930": ["convolution_gpu_bfyx_gemm_like",2], + "3117175697326325371": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "17025182465337728023": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "1641111108888949123": ["convolution_gpu_yxfb_yxio_b16",2], + "15871357525719630224": ["convolution_gpu_bfyx_1x1",1], + "10598099730944525581": ["fully_connected_gpu_fb_io_b8_f8_vload",1], + "5099947445888268507": ["convolution_gpu_yxfb_yxio_b16",2], + "15958886009743157242": ["convolution_gpu_bfyx_gemm_like",2], + "5551484040302194648": ["convolution_gpu_yxfb_yxio_b16",2], + "2014114949154914483": ["convolution_gpu_yxfb_yxio_b16",2], + "10682300249493137042": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "2095245727814188300": ["convolution_gpu_bfyx_gemm_like",2], + "1692473411043262397": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "11563892089503603030": ["convolution_gpu_yxfb_yxio_b16",2], + "11164519756679631743": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17789969008677638142": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1556975727728498645": ["convolution_gpu_yxfb_yxio_b16",2], + "5593329151028712439": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "3116068331849795558": ["convolution_gpu_bfyx_gemm_like",2], + "1963081583851864291": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12308895602001600327": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "741727668385951462": ["convolution_gpu_yxfb_yxio_b16",2], + "5884951148427535208": ["convolution_gpu_yxfb_yxio_b16",2], + "8976966933427522253": ["convolution_gpu_bfyx_gemm_like",2], + "6318228858846223186": ["convolution_gpu_bfyx_1x1",1], + "15486917753097743853": ["convolution_gpu_bfyx_1x1",2], + "18094205332383644037": ["convolution_gpu_bfyx_os_iyx_osv16",179], + "7084646429975006971": ["convolution_gpu_bfyx_1x1",2], + "16694984452720336415": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12177387334053203378": ["convolution_gpu_bfyx_gemm_like",2], + "10141558851476164734": ["convolution_gpu_yxfb_yxio_b16",2], + 
"2727175120437582536": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8390889357546397717": ["convolution_gpu_bfyx_1x1",1], + "16915857558806082023": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "17181874388601550941": ["convolution_gpu_yxfb_yxio_b16",2], + "8093401822846123153": ["convolution_gpu_yxfb_yxio_b16",2], + "8494725779002762049": ["convolution_gpu_bfyx_gemm_like",2], + "4165926748138587705": ["convolution_gpu_yxfb_yxio_b16",2], + "13856271274572142709": ["convolution_gpu_bfyx_gemm_like",1], + "7900926714874404219": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "5911282942658469852": ["convolution_gpu_bfyx_direct_10_12_16",2], + "762634810164167963": ["convolution_gpu_yxfb_yxio_b16",0], + "1154763947184432124": ["convolution_gpu_yxfb_yxio_b16",2], + "8686733586982652897": ["convolution_gpu_yxfb_yxio_b16",2], + "9065137335863605013": ["convolution_gpu_yxfb_yxio_b16",2], + "7203620615363933078": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "12806934028210472719": ["convolution_gpu_bfyx_gemm_like",2], + "2459018025887933198": ["convolution_gpu_yxfb_yxio_b16",2], + "14128122558476128712": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "3141773224039276177": ["convolution_gpu_bfyx_1x1",2], + "18017913952946745878": ["convolution_gpu_bfyx_gemm_like",2], + "12864204111424196179": ["convolution_gpu_bfyx_1x1",2], + "4744578087509837185": ["convolution_gpu_yxfb_yxio_b16",0], + "8407012082034007985": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "5928392400230917930": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "6744692937598310090": ["convolution_gpu_yxfb_yxio_b16",2], + "16437093737761968743": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "959260710517842876": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "10865695385270390803": ["convolution_gpu_bfyx_os_iyx_osv16",642], + "14675165976583799157": ["convolution_gpu_yxfb_yxio_b16",2], + "8353259929933281349": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "3364467044587904559": ["convolution_gpu_yxfb_yxio_b16",2], + "3385797925880519845": ["convolution_gpu_bfyx_1x1",2], + "15924583510704449214": ["convolution_gpu_bfyx_gemm_like",1], + "3350601287664242323": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8168240543278779314": ["convolution_gpu_bfyx_1x1",1], + "4400247897123856252": ["convolution_gpu_bfyx_os_iyx_osv16",1040], + "11175353869874626110": ["convolution_gpu_yxfb_yxio_b16",2], + "9751582946441607796": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "760687670112194844": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "757225477250808939": ["convolution_gpu_yxfb_yxio_b16",2], + "16027456210394993913": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "16516262096533373158": ["convolution_gpu_yxfb_yxio_b16",2], + "16052741298509954954": ["convolution_gpu_yxfb_yxio_b16",2], + "9585113116232600562": ["convolution_gpu_bfyx_gemm_like",1], + "7008873036126556197": ["convolution_gpu_yxfb_yxio_b16",2], + "3286330985102373533": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4137738705782981426": ["convolution_gpu_bfyx_gemm_like",2], + "6484375582324852109": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "8954488655859677891": ["convolution_gpu_yxfb_yxio_b16",2], + "14058311587429063829": ["convolution_gpu_yxfb_yxio_b16",2], + "8952733400567254769": ["convolution_gpu_bfyx_gemm_like",2], + "3022939690177474442": ["convolution_gpu_yxfb_yxio_b16",1], + "7748514992101811029": ["convolution_gpu_yxfb_yxio_b16",1], + "4571404165794634411": ["convolution_gpu_bfyx_1x1",2], + "16084700435355748612": ["convolution_gpu_bfyx_os_iyx_osv16",532], + 
"16788162879714733906": ["convolution_gpu_yxfb_yxio_b16",2], + "16862145184923128012": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "7923576965630818418": ["convolution_gpu_yxfb_yxio_b16",2], + "7650375560336513366": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "15641322340289892344": ["convolution_gpu_yxfb_yxio_b16",1], + "14956246091163580499": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "4959403414256988744": ["convolution_gpu_bfyx_gemm_like",1], + "12802517759474139810": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "4863644213728386734": ["convolution_gpu_yxfb_yxio_b16",2], + "12995903177757437362": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "16016396784190934729": ["convolution_gpu_yxfb_yxio_b16",2], + "14120569486714455490": ["convolution_gpu_yxfb_yxio_b16",2], + "2180039710632160943": ["convolution_gpu_yxfb_yxio_b16",1], + "3396731547696204011": ["convolution_gpu_yxfb_yxio_b16",2], + "1427040855295681285": ["convolution_gpu_yxfb_yxio_b16",2], + "8059328623525062913": ["convolution_gpu_bfyx_gemm_like",2], + "10183537720515608": ["convolution_gpu_yxfb_yxio_b16",1], + "7425369489110576363": ["convolution_gpu_yxfb_yxio_b16",2], + "14830991971271385876": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "12024143207855886580": ["convolution_gpu_bfyx_os_iyx_osv16",1040], + "13957350536347764705": ["convolution_gpu_bfyx_gemm_like",2], + "11055049031355432623": ["convolution_gpu_bfyx_gemm_like",2], + "11768117585574496387": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "7972861956906521660": ["convolution_gpu_yxfb_yxio_b16",2], + "541817615957967731": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "7060804814325505165": ["convolution_gpu_bfyx_gemm_like",2], + "14366395926517590797": ["convolution_gpu_yxfb_yxio_b16",1], + "14447191095937730964": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "12818786388125465101": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9040145293899470160": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "16247799703932868151": ["convolution_gpu_yxfb_yxio_b16",2], + "17906607354577138153": ["convolution_gpu_bfyx_os_iyx_osv16",524], + "1336940384521633733": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "16582761411084080015": ["convolution_gpu_yxfb_yxio_b16",2], + "1157069349112113377": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16836088134347394854": ["convolution_gpu_yxfb_yxio_b16",2], + "14973431782875808802": ["convolution_gpu_bfyx_gemm_like",2], + "12421204749289937399": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9869959062341950047": ["convolution_gpu_bfyx_1x1",2], + "16800575429414554907": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "14699357144600604190": ["convolution_gpu_yxfb_yxio_b16",1], + "13767985623872409391": ["convolution_gpu_yxfb_yxio_b16",1], + "17386047378634216634": ["convolution_gpu_yxfb_yxio_b16",2], + "2245166025103475783": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "8193369947544085921": ["convolution_gpu_bfyx_gemm_like",2], + "14131851237755716991": ["convolution_gpu_bfyx_os_iyx_osv16",364], + "6284333183047854748": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3236003754884728510": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "10565371760124443824": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "4121535611334103359": ["convolution_gpu_yxfb_yxio_b16",2], + "4165019140664090799": ["convolution_gpu_yxfb_yxio_b16",2], + "15397084091361096354": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "1920042803083729276": ["convolution_gpu_bfyx_os_iyx_osv16",600], + "13387545865482261974": ["convolution_gpu_bfyx_os_iyx_osv16",805], + 
"12722153168975105360": ["convolution_gpu_yxfb_yxio_b16",2], + "150132162949295379": ["convolution_gpu_bfyx_1x1",2], + "15101834579076569231": ["convolution_gpu_yxfb_yxio_b16",2], + "17252449599613270108": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "12166852830214895457": ["convolution_gpu_bfyx_1x1",2], + "1944461047787586724": ["convolution_gpu_yxfb_yxio_b16",1], + "6875055157295709098": ["convolution_gpu_yxfb_yxio_b16",2], + "6887205509732544213": ["convolution_gpu_yxfb_yxio_b16",2], + "16738951239219589307": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "9378269524012289175": ["convolution_gpu_bfyx_gemm_like",2], + "16889886654893884746": ["convolution_gpu_bfyx_1x1",2], + "12700372241799686527": ["convolution_gpu_bfyx_gemm_like",1], + "12936220888307335332": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "3793265335909270748": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11626398907755088688": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12567935463143860469": ["convolution_gpu_yxfb_yxio_b16",2], + "5155616842071169667": ["convolution_gpu_yxfb_yxio_b16",2], + "17381516856910544374": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "6942016672941874829": ["convolution_gpu_bfyx_gemm_like",2], + "7719954202744123391": ["convolution_gpu_bfyx_gemm_like",2], + "3316798708399098230": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "2789901295967374316": ["convolution_gpu_yxfb_yxio_b16",2], + "188830358699960789": ["convolution_gpu_yxfb_yxio_b16",2], + "8690196189594920365": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "10187930930336324253": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "9193880745263317167": ["convolution_gpu_bfyx_gemm_like",2], + "15012885932988454455": ["convolution_gpu_yxfb_yxio_b16",2], + "16384186388687043048": ["convolution_gpu_bfyx_os_iyx_osv16",549], + "12179968379663737450": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17096735128393723245": ["convolution_gpu_yxfb_yxio_b16",1], + "10424643336435622408": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "18426893729833771809": ["convolution_gpu_bfyx_1x1",2], + "2783577080556699089": ["convolution_gpu_bfyx_gemm_like",1], + "503369896500284129": ["convolution_gpu_bfyx_1x1",2], + "7432142107544210174": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "11975047184326016230": ["convolution_gpu_bfyx_gemm_like",2], + "18142462471803295391": ["convolution_gpu_bfyx_1x1",1], + "9955939178447682108": ["convolution_gpu_bfyx_1x1",2], + "16577611471466452776": ["convolution_gpu_bfyx_gemm_like",2], + "18080788888293706149": ["convolution_gpu_yxfb_yxio_b16",2], + "4674504221851042542": ["convolution_gpu_yxfb_yxio_b16",2], + "9401409770128851474": ["convolution_gpu_bfyx_gemm_like",0], + "14123081378489325832": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "9414927552739380436": ["convolution_gpu_yxfb_yxio_b16",1], + "11083993858285515074": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "4089043893927493060": ["convolution_gpu_yxfb_yxio_b16",2], + "15822546325822628634": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "17434141039341226796": ["convolution_gpu_yxfb_yxio_b16",2], + "2625969259447793593": ["convolution_gpu_bfyx_1x1",2], + "6109013751635776331": ["convolution_gpu_bfyx_gemm_like",2], + "597650904461183283": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16214394186337220006": ["convolution_gpu_yxfb_yxio_b16",2], + "2495655464941634884": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "4513063773753763458": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "16339187733937346919": ["convolution_gpu_yxfb_yxio_b16",2], + 
"15814015810740458605": ["convolution_gpu_bfyx_1x1",2], + "2314579504260247470": ["convolution_gpu_yxfb_yxio_b16",2], + "8092673566670222445": ["convolution_gpu_yxfb_yxio_b16",2], + "13302687772426736346": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "11893541520830049036": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "14686272582436109012": ["convolution_gpu_yxfb_yxio_b16",2], + "6934915634718835911": ["convolution_gpu_yxfb_yxio_b16",2], + "5293502980575652171": ["convolution_gpu_yxfb_yxio_b16",2], + "9190054801124577726": ["convolution_gpu_yxfb_yxio_b16",2], + "14268594692585922659": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "18279927175542031567": ["convolution_gpu_yxfb_yxio_b16",2], + "4755225554035527185": ["convolution_gpu_yxfb_yxio_b16",2], + "17806712457019493207": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "9328585005923667676": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "9541996065561509160": ["convolution_gpu_yxfb_yxio_b16",2], + "3346891393420268502": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "16986358655784856534": ["convolution_gpu_bfyx_gemm_like",2], + "14216513246096503793": ["convolution_gpu_yxfb_yxio_b16",2], + "17446505012657609153": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "16434358667865869005": ["convolution_gpu_yxfb_yxio_b16",2], + "805131056816361237": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "6195916781434462809": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6548949901446632697": ["convolution_gpu_bfyx_1x1",2], + "7155796826953849982": ["convolution_gpu_yxfb_yxio_b16",2], + "11545529736818363243": ["convolution_gpu_yxfb_yxio_b16",2], + "10613621801998459768": ["convolution_gpu_yxfb_yxio_b16",2], + "42935035304560876": ["convolution_gpu_yxfb_yxio_b16",1], + "185782385623159958": ["convolution_gpu_bfyx_gemm_like",2], + "11418379777288974452": ["convolution_gpu_bfyx_gemm_like",2], + "75742659105146536": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "2542112741645712811": ["fully_connected_gpu_fb_io_b8_f8_vload",1], + "18082422341304348326": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "7322472892320910654": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "2714322766616035858": ["convolution_gpu_yxfb_yxio_b16",2], + "166437837813304707": ["convolution_gpu_yxfb_yxio_b16",2], + "5832851215142537445": ["convolution_gpu_yxfb_yxio_b16",2], + "14883438809987378616": ["convolution_gpu_bfyx_1x1",1], + "9120374653477510318": ["convolution_gpu_yxfb_yxio_b16",2], + "12510951219501865365": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "12536364199388193516": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "15457040168177954463": ["convolution_gpu_yxfb_yxio_b16",2], + "3019864917236424168": ["convolution_gpu_yxfb_yxio_b16",1], + "16974981142389546385": ["convolution_gpu_yxfb_yxio_b16",2], + "17167229341919111718": ["convolution_gpu_bfyx_gemm_like",2], + "14043064718932538557": ["convolution_gpu_yxfb_yxio_b16",2], + "9812438080378091263": ["convolution_gpu_yxfb_yxio_b16",2], + "14091610802555875119": ["convolution_gpu_bfyx_gemm_like",1], + "5995121118186531621": ["convolution_gpu_yxfb_yxio_b16",1], + "9177211394807412309": ["convolution_gpu_yxfb_yxio_b16",2], + "9144269202766996508": ["convolution_gpu_yxfb_yxio_b16",2], + "9803492989444302959": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18186612931984342471": ["convolution_gpu_yxfb_yxio_b16",2], + "11999246609107242706": ["convolution_gpu_bfyx_gemm_like",2], + "14811603003184578943": ["convolution_gpu_bfyx_gemm_like",2], + "16101625311127899143": ["convolution_gpu_bfyx_gemm_like",2], + 
"10514865654990433040": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "6931953332823066530": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "18120079746729314878": ["convolution_gpu_yxfb_yxio_b16",2], + "13006774775034887171": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "5774841809066688068": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "2797723586312707948": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "18435632962969462312": ["convolution_gpu_yxfb_yxio_b16",2], + "14799012895945855878": ["convolution_gpu_yxfb_yxio_b16",2], + "8095675456938934982": ["convolution_gpu_yxfb_yxio_b16",2], + "8561261337239934159": ["convolution_gpu_bfyx_direct_10_12_16",0], + "12887076860522920405": ["convolution_gpu_yxfb_yxio_b16",2], + "16548491024653039967": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "12275528180752359999": ["convolution_gpu_yxfb_yxio_b16",2], + "15199604820473713622": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "4890043345392707202": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "3894130445933963911": ["convolution_gpu_yxfb_yxio_b16",2], + "10041205516209288381": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "7447163906170805189": ["convolution_gpu_bfyx_os_iyx_osv16",573], + "14884315147107686805": ["convolution_gpu_bfyx_gemm_like",1], + "8490260671996115530": ["convolution_gpu_bfyx_gemm_like",1], + "7667898603371717971": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10775271979871646995": ["convolution_gpu_yxfb_yxio_b16",2], + "2172121470071868949": ["convolution_gpu_bfyx_gemm_like",2], + "6143200133853000387": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "13804221028705631415": ["convolution_gpu_bfyx_gemm_like",2], + "15082818876354718849": ["convolution_gpu_bfyx_os_iyx_osv16",650], + "768820004084041271": ["convolution_gpu_yxfb_yxio_b16",2], + "16014822406751503249": ["convolution_gpu_bfyx_os_iyx_osv16",691], + "4889188980319017094": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "8161520217142313996": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8542782888102516498": ["convolution_gpu_yxfb_yxio_b16",2], + "967141158966448909": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7171904645566467208": ["convolution_gpu_bfyx_gemm_like",2], + "1634884284544380004": ["convolution_gpu_yxfb_yxio_b16",1], + "9939234037869927090": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "10226095100825845185": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "16271675466919087248": ["convolution_gpu_yxfb_yxio_b16",2], + "7839141505912665157": ["fully_connected_gpu_fb_oi_ref",1], + "12581879452540858313": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "3036512701943687724": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "13683623172740048376": ["convolution_gpu_bfyx_gemm_like",2], + "13991572769793610416": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "9954050478761346921": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "4450409744922989123": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "938222258370511187": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "2058172559199858297": ["convolution_gpu_bfyx_os_iyx_osv16",6], + "13776178598632392721": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "13357365044448426880": ["convolution_gpu_bfyx_1x1",2], + "7737977992444172757": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "16911464046178654033": ["convolution_gpu_bfyx_1x1",2], + "4769003637955328938": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1117836569328440439": ["convolution_gpu_yxfb_yxio_b16",2], + "338716975932676215": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "9399511839804500548": 
["convolution_gpu_yxfb_yxio_b16",1], + "11002165738333323413": ["convolution_gpu_yxfb_yxio_b16",2], + "14230493618724018658": ["convolution_gpu_bfyx_gemm_like",2], + "7992077349568239994": ["convolution_gpu_yxfb_yxio_b16",2], + "14345755557418971954": ["convolution_gpu_yxfb_yxio_b16",2], + "7132328255408635227": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17830290099875088207": ["convolution_gpu_bfyx_gemm_like",2], + "7450417963648518926": ["convolution_gpu_bfyx_gemm_like",2], + "132437164570900392": ["convolution_gpu_yxfb_yxio_b16",2], + "8865700182878875593": ["convolution_gpu_yxfb_yxio_b16",2], + "466744273945239777": ["convolution_gpu_yxfb_yxio_b16",2], + "10785966734346479177": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "1117729599102132243": ["convolution_gpu_yxfb_yxio_b16",2], + "17025324057045572535": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1398177377739338750": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8348997431940166878": ["convolution_gpu_yxfb_yxio_b16",2], + "15035800097152337587": ["convolution_gpu_bfyx_gemm_like",2], + "13586735166545634506": ["convolution_gpu_yxfb_yxio_b16",2], + "1914964404168211864": ["convolution_gpu_bfyx_gemm_like",2], + "10211403590176354415": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "276313536076170391": ["convolution_gpu_bfyx_gemm_like",2], + "15879172437519876393": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17318287523550546026": ["convolution_gpu_bfyx_gemm_like",2], + "15596408854298291433": ["convolution_gpu_yxfb_yxio_b16",2], + "13042938686374926241": ["convolution_gpu_yxfb_yxio_b16",2], + "11828175723996627443": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "7375461241315602473": ["convolution_gpu_bfyx_gemm_like",2], + "14346466672686303107": ["convolution_gpu_yxfb_yxio_b16",2], + "17788367809717898285": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "9456645866001656225": ["convolution_gpu_yxfb_yxio_b16",2], + "17970424536559595893": ["convolution_gpu_yxfb_yxio_b16",2], + "14805540705424073865": ["convolution_gpu_bfyx_gemm_like",2], + "7614673554809134631": ["convolution_gpu_yxfb_yxio_b16",2], + "7104309382120208659": ["convolution_gpu_bfyx_gemm_like",2], + "16851716501872033211": ["fully_connected_gpu_fb_io_block_fp16",1], + "15594673952484539994": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "6141193842171342687": ["convolution_gpu_yxfb_yxio_b16",2], + "818998169319147148": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13893808009363736870": ["convolution_gpu_bfyx_gemm_like",2], + "9091110033424983286": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "7894230717547658326": ["convolution_gpu_yxfb_yxio_b16",1], + "15522099459864628246": ["convolution_gpu_bfyx_gemm_like",2], + "2844794465598309010": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "17583785768334531086": ["convolution_gpu_yxfb_yxio_b16",2], + "11685571068419983048": ["convolution_gpu_bfyx_1x1",2], + "10016815108730511683": ["convolution_gpu_bfyx_gemm_like",1], + "1905758333157310570": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "14025678657541870252": ["convolution_gpu_yxfb_yxio_b16",2], + "8354579049246302728": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "18091349188280218186": ["convolution_gpu_yxfb_yxio_b16",2], + "15974208269240775349": ["convolution_gpu_yxfb_yxio_b16",1], + "11152334947349565403": ["convolution_gpu_yxfb_yxio_b16",2], + "4367991456894497706": ["convolution_gpu_bfyx_os_iyx_osv16",893], + "13833960927635646899": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13700014916680753395": ["convolution_gpu_bfyx_gemm_like",2], + 
"15070618248849566698": ["convolution_gpu_yxfb_yxio_b16",2], + "1518270620354036926": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "3118602494449249177": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "778476198101178556": ["convolution_gpu_bfyx_gemm_like",1], + "17219920118109316867": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "3220280315905987373": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "378801963103874857": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "18427056032084727710": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "1135062632388082485": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "16870110185980402237": ["convolution_gpu_yxfb_yxio_b16",2], + "10704906466618081803": ["convolution_gpu_yxfb_yxio_b16",2], + "8857763129101380288": ["convolution_gpu_bfyx_gemm_like",2], + "1775515808301276388": ["convolution_gpu_yxfb_yxio_b16",2], + "14147460733160099960": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4824040283449153298": ["convolution_gpu_bfyx_os_iyx_osv16",726], + "5606914392662771013": ["convolution_gpu_yxfb_yxio_b16",2], + "17746215841755337461": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9232653317479846765": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "16617569629839911513": ["convolution_gpu_yxfb_yxio_b16",2], + "18059267466971880386": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "6942606834115081953": ["convolution_gpu_yxfb_yxio_b16",2], + "3737576893817599311": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "9616636708366808604": ["convolution_gpu_bfyx_gemm_like",2], + "7474639594232203854": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "9043982883185435219": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "4588117321438490483": ["convolution_gpu_yxfb_yxio_b16",2], + "4342446399224806160": ["convolution_gpu_yxfb_yxio_b16",2], + "2171768477223405739": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "11910735867274493498": ["convolution_gpu_bfyx_gemm_like",2], + "1281190653081960886": ["convolution_gpu_yxfb_yxio_b16",2], + "9099056013518879466": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "12686330321897091505": ["convolution_gpu_bfyx_gemm_like",2], + "8731079912830889828": ["convolution_gpu_yxfb_yxio_b16",2], + "15739756988784344130": ["convolution_gpu_yxfb_yxio_b16",2], + "4121109463284708890": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "6845814820599174031": ["convolution_gpu_bfyx_direct_10_12_16",0], + "2929715823970060874": ["convolution_gpu_bfyx_gemm_like",1], + "9463256538942644563": ["convolution_gpu_yxfb_yxio_b16",2], + "10425889533411573166": ["convolution_gpu_bfyx_gemm_like",2], + "9131235538209388787": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "3950738240651133849": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "13512059751838488458": ["convolution_gpu_yxfb_yxio_b16",2], + "10820312036555742020": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "6055793483770886264": ["convolution_gpu_yxfb_yxio_b16",2], + "1330337530094825121": ["convolution_gpu_yxfb_yxio_b16",2], + "6254141935545262078": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4914435717288687793": ["convolution_gpu_bfyx_1x1",1], + "234288286732396704": ["convolution_gpu_yxfb_yxio_b16",1], + "467070383257529689": ["convolution_gpu_yxfb_yxio_b16",2], + "6654167459904026563": ["convolution_gpu_yxfb_yxio_b16",2], + "10782611933832492335": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "10979362792894404338": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "11693134363909241514": ["convolution_gpu_yxfb_yxio_b16",2], + "1921500066107090648": ["convolution_gpu_yxfb_yxio_b16",2], + "14114380593731243715": 
["convolution_gpu_bfyx_os_iyx_osv16",167], + "3117673619907511009": ["convolution_gpu_bfyx_os_iyx_osv16",487], + "6428098122005804378": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "17081449111821382308": ["convolution_gpu_yxfb_yxio_b16",1], + "17536308070854915513": ["convolution_gpu_bfyx_1x1",2], + "13123561937554734618": ["convolution_gpu_yxfb_yxio_b16",2], + "1290180607037086383": ["convolution_gpu_yxfb_yxio_b16",2], + "13011676362747785816": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6398819277350155011": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "13022797264172398260": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7548031489690889629": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "3986429358782189117": ["convolution_gpu_yxfb_yxio_b16",1], + "17490188677223978661": ["convolution_gpu_bfyx_gemm_like",2], + "14244689429217411113": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4006884370026272807": ["convolution_gpu_bfyx_gemm_like",2], + "9101334153142718004": ["convolution_gpu_bfyx_gemm_like",2], + "15589007878875898942": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "6546440095044731932": ["convolution_gpu_yxfb_yxio_b16",2], + "4773123925616969670": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10747101719272611563": ["convolution_gpu_yxfb_yxio_b16",2], + "10019470094545733255": ["convolution_gpu_bfyx_gemm_like",2], + "16441830491664937048": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "13488495920546871271": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "2917999294360728537": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3948843501884284998": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13124659308711651699": ["convolution_gpu_bfyx_gemm_like",2], + "2119566651547512543": ["convolution_gpu_yxfb_yxio_b16",1], + "1318571118468536310": ["convolution_gpu_bfyx_gemm_like",2], + "9545968464906009869": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "4678607855896512523": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14458851250685872417": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "11455055202624479980": ["convolution_gpu_yxfb_yxio_b16",2], + "5285172225938230524": ["convolution_gpu_yxfb_yxio_b16",2], + "11669828823444745889": ["convolution_gpu_bfyx_gemm_like",2], + "12184235281888559274": ["convolution_gpu_yxfb_yxio_b16",2], + "5257134257307295031": ["convolution_gpu_yxfb_yxio_b16",1], + "17085927772068621152": ["convolution_gpu_yxfb_yxio_b16",2], + "1359720957005310113": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "12494969618927201911": ["fully_connected_gpu_yxfb_ref",2], + "16818206615424635387": ["convolution_gpu_bfyx_1x1",1], + "13853630125050609175": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "11679235499894668689": ["convolution_gpu_yxfb_yxio_b16",2], + "8655739705298627602": ["convolution_gpu_bfyx_gemm_like",0], + "10861769381993948050": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "5240706676373148280": ["convolution_gpu_bfyx_gemm_like",2], + "3325575565536567070": ["convolution_gpu_yxfb_yxio_b16",2], + "1074748462756364699": ["fully_connected_gpu_fb_oi_ref",1], + "6926590672771069689": ["convolution_gpu_yxfb_yxio_b16",2], + "9603926867418680768": ["convolution_gpu_yxfb_yxio_b16",2], + "9412392168031560549": ["convolution_gpu_yxfb_yxio_b16",2], + "16003914811215141863": ["convolution_gpu_yxfb_yxio_b16",2], + "8956566633622104099": ["convolution_gpu_yxfb_yxio_b16",2], + "9277176009071334860": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "7578177053220150569": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "1040030752340209480": 
["convolution_gpu_bfyx_os_iyx_osv16",365], + "574869992355132069": ["convolution_gpu_bfyx_gemm_like",2], + "18041177945345031826": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "12933785392937626017": ["convolution_gpu_yxfb_yxio_b16",2], + "16316483048621486077": ["convolution_gpu_bfyx_gemm_like",2], + "2609454334520044465": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "14248622935809594779": ["convolution_gpu_yxfb_yxio_b16",2], + "5042176052323856983": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14752182392048929103": ["convolution_gpu_yxfb_yxio_b16",2], + "6744044115114192916": ["convolution_gpu_yxfb_yxio_b16",2], + "16385915289511951113": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17442105631503326136": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "5017701748886087836": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "10878198256414940305": ["convolution_gpu_yxfb_yxio_b16",2], + "12287667143602938393": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "697333686114567307": ["convolution_gpu_bfyx_gemm_like",2], + "10113696658040720628": ["convolution_gpu_yxfb_yxio_b16",1], + "16013560489115457872": ["convolution_gpu_yxfb_yxio_b16",2], + "9741607635826869269": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12023260267201191955": ["convolution_gpu_yxfb_yxio_b16",1], + "4776446300552810228": ["convolution_gpu_bfyx_gemm_like",0], + "15329680728165965773": ["convolution_gpu_bfyx_gemm_like",2], + "11862259122805366807": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "4809191606466167229": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10148067979123062638": ["convolution_gpu_yxfb_yxio_b16",1], + "17711453305763476458": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "16072525303202287969": ["convolution_gpu_yxfb_yxio_b16",2], + "17037416417174266088": ["convolution_gpu_bfyx_gemm_like",1], + "6407471972820516685": ["fully_connected_gpu_fb_oi_ref",0], + "6822432085522584060": ["convolution_gpu_yxfb_yxio_b16",2], + "8791285622784082122": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "3372770576629463160": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "7715937239456300593": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "3573490922300056520": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2321773209766424929": ["convolution_gpu_yxfb_yxio_b16",2], + "12710794174926396540": ["convolution_gpu_yxfb_yxio_b16",2], + "3226193790517362610": ["convolution_gpu_bfyx_1x1",2], + "13815395589135469450": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "8466986812935642059": ["convolution_gpu_bfyx_os_iyx_osv16",278], + "3501667344669686338": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8159303545761286685": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "1786105567361070086": ["convolution_gpu_yxfb_yxio_b16",2], + "12877601016766418505": ["convolution_gpu_bfyx_gemm_like",2], + "12241130380766920378": ["convolution_gpu_yxfb_yxio_b16",1], + "7837876599690110056": ["convolution_gpu_bfyx_gemm_like",2], + "17536482873064844308": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "3024402899381804809": ["convolution_gpu_bfyx_1x1",2], + "632116056424249698": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12146979849998627283": ["convolution_gpu_bfyx_gemm_like",2], + "824911124897042617": ["convolution_gpu_yxfb_yxio_b16",2], + "8709632541892447149": ["convolution_gpu_bfyx_gemm_like",2], + "12134858519320245809": ["convolution_gpu_bfyx_1x1",2], + "2835909063063272102": ["convolution_gpu_bfyx_gemm_like",2], + "4664983769199548480": ["convolution_gpu_bfyx_1x1",1], + "5364060938737428149": 
["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "13611054146745413536": ["convolution_gpu_bfyx_gemm_like",2], + "12676167240795292217": ["convolution_gpu_bfyx_gemm_like",1], + "3101748967012684440": ["convolution_gpu_yxfb_yxio_b16",2], + "8837721075413149240": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12287827551127082597": ["convolution_gpu_yxfb_yxio_b16",2], + "14813178380338948912": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "875146113874776902": ["convolution_gpu_yxfb_yxio_b16",2], + "7005509036795164602": ["convolution_gpu_bfyx_1x1",2], + "1157388265135592238": ["convolution_gpu_yxfb_yxio_b16",2], + "11096750581455917678": ["convolution_gpu_yxfb_yxio_b16",2], + "10325138269934303618": ["convolution_gpu_yxfb_yxio_b16",2], + "14359026450472189405": ["convolution_gpu_yxfb_yxio_b16",2], + "487214150851213303": ["convolution_gpu_bfyx_gemm_like",1], + "14795618530175274538": ["convolution_gpu_bfyx_os_iyx_osv16",483], + "6825390996679224270": ["convolution_gpu_yxfb_yxio_b16",2], + "15690161340392005765": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9657324846330221372": ["convolution_gpu_bfyx_1x1",2], + "14963614790718019676": ["convolution_gpu_yxfb_yxio_b16",2], + "11775265110573621330": ["convolution_gpu_bfyx_os_iyx_osv16",301], + "7552544688541855979": ["convolution_gpu_bfyx_gemm_like",2], + "15078590909693331731": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12081698011407453832": ["convolution_gpu_yxfb_yxio_b16",2], + "5968129546023764583": ["convolution_gpu_yxfb_yxio_b16",2], + "3190494353583341446": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5567628205735744449": ["convolution_gpu_yxfb_yxio_b16",2], + "11705756153433897198": ["convolution_gpu_bfyx_1x1",2], + "10693837788817206459": ["convolution_gpu_yxfb_yxio_b16",2], + "17264671167892237524": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "3759515057574218101": ["convolution_gpu_bfyx_gemm_like",2], + "8645965165922150743": ["convolution_gpu_yxfb_yxio_b16",2], + "16616945998593626851": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6805188858008657978": ["convolution_gpu_bfyx_gemm_like",2], + "16901594465545439334": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "586947787345351152": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7541325258238317885": ["convolution_gpu_yxfb_yxio_b16",2], + "9794456440994218671": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "13893789954946953427": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "3965327578193694832": ["convolution_gpu_yxfb_yxio_b16",2], + "3742751561273931407": ["convolution_gpu_yxfb_yxio_b16",1], + "12469992822259989528": ["convolution_gpu_yxfb_yxio_b16",2], + "17292751972745231011": ["convolution_gpu_yxfb_yxio_b16",2], + "3105425187506203551": ["convolution_gpu_bfyx_1x1",2], + "3735605582512535278": ["convolution_gpu_yxfb_yxio_b16",2], + "15991460001131903561": ["convolution_gpu_bfyx_gemm_like",2], + "3392693938352572136": ["convolution_gpu_bfyx_gemm_like",2], + "16566128345135114558": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "2449586975250543578": ["convolution_gpu_yxfb_yxio_b16",2], + "13701870576531008278": ["convolution_gpu_yxfb_yxio_b16",2], + "6699877220571254719": ["convolution_gpu_yxfb_yxio_b16",2], + "11104393974242049153": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5079381702867378605": ["convolution_gpu_yxfb_yxio_b16",1], + "18275601715050791851": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "6509758095668864050": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "14910911338105922048": ["convolution_gpu_yxfb_yxio_b16",2], + 
"10528894716283673051": ["convolution_gpu_yxfb_yxio_b16",2], + "12793347723828876280": ["convolution_gpu_yxfb_yxio_b16",2], + "15489746763312425915": ["convolution_gpu_bfyx_gemm_like",2], + "10399620940700804517": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11694428890484758107": ["convolution_gpu_yxfb_yxio_b16",1], + "15666720796968090760": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7770000755097925765": ["convolution_gpu_bfyx_1x1",2], + "3419536918610303807": ["convolution_gpu_yxfb_yxio_b16",2], + "18446245971488003004": ["convolution_gpu_bfyx_os_iyx_osv16",574], + "13927671398099556854": ["convolution_gpu_yxfb_yxio_b16",2], + "8040001390872143271": ["convolution_gpu_bfyx_gemm_like",2], + "1880137091477870982": ["convolution_gpu_yxfb_yxio_b16",1], + "9076758673133996959": ["convolution_gpu_bfyx_gemm_like",2], + "8302886228681027388": ["convolution_gpu_yxfb_yxio_b16",2], + "7292351660229751817": ["convolution_gpu_bfyx_os_iyx_osv16",609], + "7474592508575297101": ["convolution_gpu_bfyx_1x1",2], + "17996535939348094624": ["convolution_gpu_yxfb_yxio_b16",2], + "4039483032571506874": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "2622434279674583815": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "15289152041466330689": ["convolution_gpu_bfyx_gemm_like",2], + "2191416057399400794": ["convolution_gpu_yxfb_yxio_b16",2], + "2608363732937932266": ["convolution_gpu_bfyx_gemm_like",2], + "396580837423299119": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "12024817951074673335": ["convolution_gpu_bfyx_1x1",1], + "10570285542015420072": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "14031009077471784948": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4116610956045302817": ["convolution_gpu_yxfb_yxio_b16",2], + "15476491807306982382": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "6102330514901613158": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9208964785762052001": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "16898785030254336705": ["convolution_gpu_yxfb_yxio_b16",2], + "16094455700371652312": ["convolution_gpu_yxfb_yxio_b16",2], + "1778345646142852816": ["convolution_gpu_bfyx_gemm_like",2], + "16936968151775497887": ["convolution_gpu_bfyx_gemm_like",2], + "11308583200952256245": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16188473537674428539": ["convolution_gpu_yxfb_yxio_b16",2], + "6756771670011959646": ["convolution_gpu_bfyx_gemm_like",2], + "2983038203471784211": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "4355933224673863178": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7051238664181857633": ["convolution_gpu_bfyx_os_iyx_osv16",646], + "3571030800252732358": ["convolution_gpu_yxfb_yxio_b16",2], + "11065709388908213457": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "17512961503976896701": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "856877003890134554": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "12531580106484042446": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "5924341622384096919": ["convolution_gpu_bfyx_gemm_like",2], + "13821224753538037982": ["convolution_gpu_bfyx_os_iyx_osv16",1116], + "5595779343671478945": ["convolution_gpu_yxfb_yxio_b16",2], + "3220756134650041028": ["convolution_gpu_yxfb_yxio_b16",2], + "11824946481875102910": ["convolution_gpu_yxfb_yxio_b16",2], + "14044732537191084187": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7708321360699824256": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9647916259092117712": ["convolution_gpu_bfyx_gemm_like",2], + "7008509833947166548": ["convolution_gpu_yxfb_yxio_b16",2], + 
"1099404514975797315": ["convolution_gpu_yxfb_yxio_b16",2], + "3114869763557037270": ["fully_connected_gpu_fb_oi_ref",1], + "18126685473408206840": ["convolution_gpu_bfyx_os_iyx_osv16",526], + "6427979320488981912": ["convolution_gpu_yxfb_yxio_b16",1], + "5312269140190538942": ["convolution_gpu_yxfb_yxio_b16",2], + "16813995580382709489": ["convolution_gpu_yxfb_yxio_b16",2], + "9606639214735570069": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "4090512597925170883": ["convolution_gpu_yxfb_yxio_b16",2], + "6664432489777052771": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "12068797674575015662": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "7585777271711713778": ["convolution_gpu_yxfb_yxio_b16",2], + "6721354194352192662": ["convolution_gpu_yxfb_yxio_b16",2], + "14771341796915983228": ["convolution_gpu_yxfb_yxio_b16",2], + "18416908414174464784": ["convolution_gpu_bfyx_gemm_like",2], + "15956352026642286295": ["convolution_gpu_yxfb_yxio_b16",2], + "9312974578711092131": ["convolution_gpu_yxfb_yxio_b16",2], + "17921973525603585874": ["convolution_gpu_bfyx_gemm_like",2], + "1208161922424418734": ["convolution_gpu_bfyx_gemm_like",2], + "12305397676800089268": ["convolution_gpu_yxfb_yxio_b16",2], + "7056293586529818253": ["convolution_gpu_bfyx_gemm_like",2], + "17082268616134506581": ["convolution_gpu_yxfb_yxio_b16",2], + "14421898375873029115": ["convolution_gpu_bfyx_1x1",2], + "17955326503130437346": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "13962325395021860937": ["convolution_gpu_yxfb_yxio_b16",2], + "16589848737162195829": ["convolution_gpu_yxfb_yxio_b16",2], + "11497761673211348612": ["convolution_gpu_yxfb_yxio_b16",2], + "17961702508543961900": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "5938850739683493929": ["convolution_gpu_yxfb_yxio_b16",0], + "14343008518525689150": ["convolution_gpu_bfyx_1x1",2], + "15188273255634848057": ["convolution_gpu_yxfb_yxio_b16",2], + "1413558157882728476": ["convolution_gpu_yxfb_yxio_b16",2], + "16770615142634470903": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "16397733032387984819": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "17889864541794448203": ["convolution_gpu_bfyx_1x1",1], + "8459380583159325597": ["convolution_gpu_yxfb_yxio_b16",1], + "991586070509079617": ["convolution_gpu_bfyx_gemm_like",0], + "4718716595177056289": ["convolution_gpu_bfyx_os_iyx_osv16",986], + "12107079280128343726": ["convolution_gpu_yxfb_yxio_b16",2], + "3217295012596892181": ["convolution_gpu_yxfb_yxio_b16",2], + "12926382190254407283": ["convolution_gpu_yxfb_yxio_b16",2], + "8543619733732987550": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1787152688807233651": ["convolution_gpu_yxfb_yxio_b16",2], + "4084026445911476156": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "18270587701371596297": ["convolution_gpu_bfyx_os_iyx_osv16",198], + "16677044352793659175": ["convolution_gpu_bfyx_gemm_like",1], + "4021045600853993587": ["convolution_gpu_yxfb_yxio_b16",2], + "9654944848074437064": ["convolution_gpu_bfyx_gemm_like",2], + "15635250842093678965": ["convolution_gpu_yxfb_yxio_b16",2], + "9418041909134721047": ["convolution_gpu_bfyx_gemm_like",2], + "2031558560788449957": ["convolution_gpu_yxfb_yxio_b16",1], + "14807466024030301968": ["convolution_gpu_yxfb_yxio_b16",2], + "14135593723444205032": ["convolution_gpu_bfyx_gemm_like",2], + "14646141746558153748": ["convolution_gpu_yxfb_yxio_b16",2], + "5583453364991774426": ["convolution_gpu_yxfb_yxio_b16",2], + "11436473937404565094": ["convolution_gpu_yxfb_yxio_b16",0], + "11719957578496407410": 
["convolution_gpu_bfyx_os_iyx_osv16",707], + "462240909302334133": ["convolution_gpu_yxfb_yxio_b16",2], + "5912303851874077576": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "1042605521041579458": ["convolution_gpu_yxfb_yxio_b16",2], + "10130171279527667782": ["convolution_gpu_bfyx_gemm_like",1], + "13193898459027972719": ["convolution_gpu_yxfb_yxio_b16",0], + "1192279884248226739": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "12780116250427776647": ["convolution_gpu_yxfb_yxio_b16",2], + "1436052878894538927": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "7949069388917479511": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18161971781834208343": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "16404362308829952450": ["convolution_gpu_yxfb_yxio_b16",2], + "7113777272518482528": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "18132952464279667664": ["convolution_gpu_bfyx_1x1",2], + "3301356450249305137": ["convolution_gpu_yxfb_yxio_b16",2], + "9589361786336650748": ["convolution_gpu_yxfb_yxio_b16",1], + "11807282628372660280": ["convolution_gpu_bfyx_1x1",2], + "16953093098789113080": ["convolution_gpu_yxfb_yxio_b16",2], + "10525462454857911293": ["convolution_gpu_yxfb_yxio_b16",2], + "16235115911229280717": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "11313025178951972247": ["convolution_gpu_bfyx_gemm_like",1], + "17444003685761357480": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "2399313178951511557": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "16644952765107909604": ["convolution_gpu_yxfb_yxio_b16",1], + "3724572174214794659": ["convolution_gpu_yxfb_yxio_b16",2], + "10893628699015898230": ["convolution_gpu_yxfb_yxio_b16",1], + "7954972694876158422": ["convolution_gpu_bfyx_1x1",2], + "8458082326743351141": ["convolution_gpu_bfyx_gemm_like",2], + "13468081302022888489": ["convolution_gpu_bfyx_gemm_like",2], + "14789782064157699768": ["convolution_gpu_yxfb_yxio_b16",2], + "5578991261564497604": ["convolution_gpu_yxfb_yxio_b16",2], + "13767500791267563349": ["convolution_gpu_yxfb_yxio_b16",2], + "5919454297699648428": ["convolution_gpu_yxfb_yxio_b16",1], + "15678385128478075284": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6602394091385112575": ["convolution_gpu_yxfb_yxio_b16",2], + "17777248703109395158": ["convolution_gpu_yxfb_yxio_b16",2], + "15031155621982459860": ["convolution_gpu_bfyx_gemm_like",2], + "3501882025888946886": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "904355798061005466": ["convolution_gpu_yxfb_yxio_b16",2], + "13144385730409574259": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "7095629088416100928": ["convolution_gpu_bfyx_gemm_like",2], + "7706714181281908433": ["convolution_gpu_bfyx_gemm_like",2], + "3859139031732555228": ["convolution_gpu_yxfb_yxio_b16",2], + "12194037100109755112": ["convolution_gpu_bfyx_gemm_like",2], + "4500107195684703428": ["convolution_gpu_yxfb_yxio_b16",2], + "11759322316883943989": ["convolution_gpu_yxfb_yxio_b16",2], + "4046830923427667342": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "2732519635571994212": ["convolution_gpu_bfyx_os_iyx_osv16",234], + "12831298482349900359": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "15227189929676013024": ["convolution_gpu_yxfb_yxio_b16",2], + "3177304125602972370": ["convolution_gpu_bfyx_direct_10_12_16",0], + "15656706773401161497": ["convolution_gpu_yxfb_yxio_b16",2], + "15412447128995361859": ["convolution_gpu_bfyx_gemm_like",1], + "1361159591875955678": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "4670487436469119872": ["convolution_gpu_bfyx_direct_10_12_16",2], + 
"8879618489623984140": ["convolution_gpu_yxfb_yxio_b16",2], + "11797601971796699898": ["convolution_gpu_bfyx_gemm_like",2], + "6312971928547466668": ["convolution_gpu_bfyx_os_iyx_osv16",1039], + "7649413902932043811": ["convolution_gpu_bfyx_gemm_like",2], + "3067001341355453846": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "18027243127893440568": ["convolution_gpu_yxfb_yxio_b16",2], + "5751283221740229986": ["convolution_gpu_bfyx_gemm_like",1], + "142270860894725256": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "9243949750444156746": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4429109491655891299": ["convolution_gpu_bfyx_gemm_like",1], + "7444165397413360181": ["convolution_gpu_yxfb_yxio_b16",2], + "6418500550523945192": ["convolution_gpu_yxfb_yxio_b16",2], + "17826868890632814593": ["convolution_gpu_yxfb_yxio_b16",2], + "10271261715175176019": ["convolution_gpu_yxfb_yxio_b16",2], + "5010119207726811326": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "9738776059655610885": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "1304921846760027440": ["convolution_gpu_yxfb_yxio_b16",1], + "3059575629482816852": ["convolution_gpu_bfyx_os_iyx_osv16",951], + "12198263593657033426": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "3156783219125679946": ["convolution_gpu_bfyx_1x1",2], + "16739031949237426992": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "5602328731722824868": ["convolution_gpu_yxfb_yxio_b16",1], + "11147573971701279689": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "548663565933738403": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "18129795023552968695": ["convolution_gpu_yxfb_yxio_b16",2], + "2116913943188857359": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "3244803973821375252": ["convolution_gpu_yxfb_yxio_b16",2], + "12808456612606675259": ["convolution_gpu_yxfb_yxio_b16",1], + "2762489653422414995": ["convolution_gpu_bfyx_gemm_like",2], + "3017411837779243878": ["convolution_gpu_bfyx_gemm_like",2], + "11908169713247209976": ["convolution_gpu_yxfb_yxio_b16",2], + "5046089607609787258": ["convolution_gpu_yxfb_yxio_b16",2], + "13960388312976163971": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10632933069865171963": ["convolution_gpu_yxfb_yxio_b16",2], + "4353842547963164546": ["convolution_gpu_bfyx_1x1",2], + "15398380328746287438": ["convolution_gpu_bfyx_gemm_like",2], + "8456185296386225533": ["convolution_gpu_yxfb_yxio_b16",1], + "13633048912926365931": ["convolution_gpu_yxfb_yxio_b16",2], + "345043289576587800": ["convolution_gpu_bfyx_1x1",2], + "17413191440314817117": ["convolution_gpu_yxfb_yxio_b16",2], + "8365255170846178102": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "10504318542015227515": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "12407890437443790515": ["convolution_gpu_bfyx_gemm_like",2], + "2649192407401044065": ["convolution_gpu_bfyx_gemm_like",2], + "17052161869014993719": ["convolution_gpu_yxfb_yxio_b16",2], + "17195293614280872622": ["convolution_gpu_yxfb_yxio_b16",2], + "3017891343734146267": ["convolution_gpu_bfyx_os_iyx_osv16",102], + "12577421746159122264": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "2283157145557154450": ["convolution_gpu_bfyx_1x1",2], + "2780423409483867058": ["convolution_gpu_bfyx_1x1",2], + "10745099399736462076": ["convolution_gpu_yxfb_yxio_b16",2], + "16117448559783537844": ["convolution_gpu_bfyx_os_iyx_osv16",713], + "3820661057776133570": ["convolution_gpu_bfyx_1x1",2], + "9079203986633151014": ["convolution_gpu_bfyx_1x1",1], + "17152614235879767116": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "16347412180100581330": 
["convolution_gpu_bfyx_os_iyx_osv16",978], + "17907223570737272640": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "5184121466994451498": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "16566214123371867456": ["convolution_gpu_bfyx_gemm_like",2], + "7243917162812988891": ["convolution_gpu_bfyx_gemm_like",2], + "4856470441452830056": ["convolution_gpu_bfyx_gemm_like",2], + "3242391637018676328": ["convolution_gpu_yxfb_yxio_b16",2], + "14689812157592240007": ["convolution_gpu_yxfb_yxio_b16",2], + "1152691534728260611": ["convolution_gpu_bfyx_1x1",2], + "13710319251108632115": ["convolution_gpu_bfyx_1x1",2], + "12989677691575632174": ["convolution_gpu_yxfb_yxio_b16",1], + "3444250649099578792": ["convolution_gpu_yxfb_yxio_b16",1], + "8174833187387604731": ["convolution_gpu_yxfb_yxio_b16",2], + "10787747981914307179": ["convolution_gpu_bfyx_1x1",2], + "12379166764490359144": ["convolution_gpu_yxfb_yxio_b16",2], + "631489011812924153": ["convolution_gpu_bfyx_1x1",2], + "16837963510205857013": ["convolution_gpu_yxfb_yxio_b16",2], + "13190888313721073437": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "9495192057713157041": ["convolution_gpu_yxfb_yxio_b16",2], + "5727758374304309350": ["convolution_gpu_yxfb_yxio_b16",2], + "6733731409232284409": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "17762455138615317884": ["convolution_gpu_yxfb_yxio_b16",2], + "10709828018763273371": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "5313382805395362669": ["convolution_gpu_yxfb_yxio_b16",2], + "9406763539724266157": ["convolution_gpu_bfyx_1x1",2], + "17088011073114549679": ["convolution_gpu_yxfb_yxio_b16",2], + "4135068756462147853": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3731224822876468602": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "4362304842016958728": ["convolution_gpu_bfyx_gemm_like",2], + "8250212706222997384": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "4625107584562815965": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "13912843078550000960": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "10536316961655703500": ["convolution_gpu_bfyx_os_iyx_osv16",199], + "17214254645087272557": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "9617316303048974588": ["convolution_gpu_yxfb_yxio_b16",2], + "13961773444580398856": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17399728556634171321": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7612288596055048389": ["convolution_gpu_yxfb_yxio_b16",1], + "8733371726903473932": ["convolution_gpu_yxfb_yxio_b16",2], + "4651261398203912503": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "9119268982510599778": ["convolution_gpu_yxfb_yxio_b16",2], + "6328802691680458752": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "13218298785325404589": ["convolution_gpu_yxfb_yxio_b16",1], + "5754396201681434378": ["convolution_gpu_bfyx_1x1",2], + "15989894214714907271": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "9922764846020092836": ["convolution_gpu_yxfb_yxio_b16",2], + "4972952621622984792": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "6584960721513702502": ["convolution_gpu_bfyx_gemm_like",1], + "18199824206329982249": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "5157949342388119167": ["convolution_gpu_bfyx_gemm_like",2], + "14151747022287993729": ["convolution_gpu_bfyx_gemm_like",2], + "10747688146893187959": ["convolution_gpu_bfyx_direct_10_12_16",2], + "75120034961995929": ["convolution_gpu_yxfb_yxio_b16",2], + "12058759356433220258": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "9237587440336828595": ["convolution_gpu_yxfb_yxio_b16",2], + 
"13326492157370934949": ["convolution_gpu_bfyx_gemm_like",2], + "1387945708447092123": ["convolution_gpu_bfyx_os_iyx_osv16",380], + "13962189339706230770": ["convolution_gpu_yxfb_yxio_b16",2], + "17848582668902427291": ["convolution_gpu_yxfb_yxio_b16",2], + "14281801257982447624": ["convolution_gpu_yxfb_yxio_b16",2], + "497488185553682238": ["convolution_gpu_bfyx_1x1",1], + "16535378085465418910": ["convolution_gpu_yxfb_yxio_b16",2], + "13448845356783404653": ["convolution_gpu_bfyx_gemm_like",1], + "8900977003907025003": ["convolution_gpu_yxfb_yxio_b16",2], + "11690533591656807605": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "17133376737554844449": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "15602218079503030465": ["convolution_gpu_bfyx_gemm_like",2], + "5330130011321223525": ["convolution_gpu_yxfb_yxio_b16",1], + "3121704239277217273": ["convolution_gpu_yxfb_yxio_b16",2], + "7233783054884565746": ["convolution_gpu_bfyx_gemm_like",2], + "3080612075440389053": ["convolution_gpu_yxfb_yxio_b16",2], + "15078262396281327048": ["convolution_gpu_bfyx_gemm_like",1], + "17651821953342321913": ["convolution_gpu_bfyx_1x1",2], + "10218763091060511457": ["convolution_gpu_bfyx_os_iyx_osv16",103], + "3568514382399560386": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "4897448054295474302": ["convolution_gpu_bfyx_gemm_like",2], + "16152775342222431281": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16986610822918634530": ["convolution_gpu_bfyx_1x1",2], + "11196245220967135443": ["convolution_gpu_yxfb_yxio_b16",2], + "6820224292713065232": ["convolution_gpu_yxfb_yxio_b16",2], + "10151922632636937118": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13009381943944182288": ["convolution_gpu_yxfb_yxio_b16",2], + "8210092359850191682": ["convolution_gpu_yxfb_yxio_b16",2], + "3391032227732782982": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15311930929656759371": ["convolution_gpu_yxfb_yxio_b16",2], + "5857101685300045443": ["convolution_gpu_yxfb_yxio_b16",1], + "334703311738467111": ["convolution_gpu_bfyx_gemm_like",1], + "3451309062150982886": ["convolution_gpu_yxfb_yxio_b16",2], + "14808079119439455357": ["convolution_gpu_yxfb_yxio_b16",2], + "6760797535531423152": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "6219075471508685758": ["convolution_gpu_bfyx_gemm_like",2], + "3928596145340765666": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "2164314506903530487": ["convolution_gpu_yxfb_yxio_b16",2], + "15757351352532908153": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "13249852145471010452": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7926989875988735079": ["convolution_gpu_yxfb_yxio_b16",2], + "10896935976330351144": ["convolution_gpu_yxfb_yxio_b16",1], + "16469493066700118274": ["convolution_gpu_yxfb_yxio_b16",2], + "16459072408799224894": ["convolution_gpu_yxfb_yxio_b16",2], + "18103534417093702556": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "13766070202060785219": ["convolution_gpu_yxfb_yxio_b16",2], + "5723759573058003971": ["convolution_gpu_yxfb_yxio_b16",2], + "2314805462821790774": ["convolution_gpu_yxfb_yxio_b16",1], + "9319064434175105168": ["convolution_gpu_yxfb_yxio_b16",2], + "2319519208813614116": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "5115298857582076692": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "2814805887448339818": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "4925720860007127584": ["convolution_gpu_yxfb_yxio_b16",2], + "8614375489387596119": ["convolution_gpu_yxfb_yxio_b16",2], + "14206125678667603810": ["convolution_gpu_bfyx_1x1",1], + "11931568365395665142": 
["convolution_gpu_bfyx_gemm_like",2], + "17053671692908867872": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "16455941573984854254": ["convolution_gpu_yxfb_yxio_b16",1], + "2126208024616319501": ["convolution_gpu_yxfb_yxio_b16",2], + "5795524493577277985": ["convolution_gpu_yxfb_yxio_b16",2], + "17491825380936802930": ["convolution_gpu_yxfb_yxio_b16",2], + "5319459637051859849": ["convolution_gpu_yxfb_yxio_b16",2], + "18333355024265557430": ["convolution_gpu_yxfb_yxio_b16",2], + "8794896449397768269": ["convolution_gpu_bfyx_gemm_like",2], + "16223356735957394429": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "5258372022038629529": ["convolution_gpu_yxfb_yxio_b16",1], + "6014752258124559691": ["convolution_gpu_yxfb_yxio_b16",2], + "10256831975351722184": ["convolution_gpu_bfyx_gemm_like",1], + "17987739992848266169": ["convolution_gpu_yxfb_yxio_b16",2], + "10982526068861394162": ["convolution_gpu_yxfb_yxio_b16",2], + "144634005596305959": ["fully_connected_gpu_fb_io_block_fp16",2], + "4160656836528944651": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "14116275901314596944": ["convolution_gpu_yxfb_yxio_b16",2], + "4240975186599864955": ["convolution_gpu_yxfb_yxio_b16",2], + "16267531927647687641": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "5643920882179676695": ["convolution_gpu_yxfb_yxio_b16",1], + "9170163372548895531": ["convolution_gpu_yxfb_yxio_b16",2], + "8921636651939679647": ["convolution_gpu_bfyx_1x1",1], + "7178866013527118649": ["convolution_gpu_yxfb_yxio_b16",2], + "13320675959188615441": ["convolution_gpu_bfyx_gemm_like",2], + "5003718302026277632": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "8325903548627432": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "18244966393978155130": ["convolution_gpu_yxfb_yxio_b16",2], + "8257103926661643451": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "4256155212405177844": ["convolution_gpu_yxfb_yxio_b16",1], + "5047972486012090625": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "10007925729029867733": ["convolution_gpu_yxfb_yxio_b16",2], + "4430932059574900921": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15449650271741732512": ["convolution_gpu_yxfb_yxio_b16",2], + "60267878504897170": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "5705056256080522960": ["convolution_gpu_yxfb_yxio_b16",2], + "12248852114219058572": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "11979032916453246611": ["convolution_gpu_yxfb_yxio_b16",1], + "15136770992109675092": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "16395067736440127496": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "1632416005093914709": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5577571901049952658": ["convolution_gpu_yxfb_yxio_b16",2], + "3160543867929843861": ["convolution_gpu_bfyx_1x1",2], + "4141005390823981166": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "10179916356323479080": ["convolution_gpu_bfyx_gemm_like",2], + "7397341452130124383": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "9736684300833719045": ["convolution_gpu_yxfb_yxio_b16",2], + "15914058104244750036": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "15727611564408173858": ["convolution_gpu_bfyx_gemm_like",1], + "15303251546207338960": ["convolution_gpu_yxfb_yxio_b16",0], + "362823013207940830": ["convolution_gpu_yxfb_yxio_b16",2], + "6307939332939714967": ["convolution_gpu_bfyx_1x1",2], + "11834361584875491425": ["convolution_gpu_bfyx_1x1",1], + "17525564757769958678": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "3646228701104397128": ["convolution_gpu_bfyx_os_iyx_osv16",173], + 
"13436376034548670107": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "16000428520749664687": ["convolution_gpu_yxfb_yxio_b16",2], + "5039037192630609823": ["convolution_gpu_bfyx_gemm_like",2], + "12309955719964788034": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "7211179360844946434": ["convolution_gpu_bfyx_os_iyx_osv16",198], + "6692085187697087807": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "4531222427159927606": ["convolution_gpu_bfyx_gemm_like",2], + "4523064418696274869": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "14817801788424046035": ["convolution_gpu_yxfb_yxio_b16",2], + "5958300749101873980": ["convolution_gpu_yxfb_yxio_b16",2], + "6623182990939010641": ["convolution_gpu_bfyx_gemm_like",1], + "8575833423399668525": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "1040411949730118556": ["convolution_gpu_yxfb_yxio_b16",2], + "15542520725696027828": ["convolution_gpu_yxfb_yxio_b16",1], + "15961487889420208188": ["convolution_gpu_bfyx_gemm_like",2], + "12879367655655932174": ["convolution_gpu_yxfb_yxio_b16",2], + "17364712285968437405": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "10626281431800814406": ["convolution_gpu_yxfb_yxio_b16",2], + "3406812365298442897": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "13668940862847596363": ["convolution_gpu_yxfb_yxio_b16",2], + "11317843493537672866": ["convolution_gpu_yxfb_yxio_b16",2], + "5735608687257018419": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "3635446784873718932": ["convolution_gpu_bfyx_gemm_like",2], + "7565867291827884997": ["convolution_gpu_bfyx_gemm_like",1], + "1108229954015380813": ["convolution_gpu_yxfb_yxio_b16",2], + "15449774545834423274": ["convolution_gpu_yxfb_yxio_b16",2], + "6808843088626121909": ["convolution_gpu_bfyx_gemm_like",2], + "3492178441007007033": ["convolution_gpu_yxfb_yxio_b16",2], + "4118073384938355655": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "16502045034098739466": ["convolution_gpu_bfyx_gemm_like",2], + "975943900172381326": ["convolution_gpu_yxfb_yxio_b16",2], + "5582450255753679095": ["convolution_gpu_bfyx_1x1",2], + "3221469860582147955": ["convolution_gpu_bfyx_gemm_like",2], + "16120988958246503683": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "14444475853714164129": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "17277917672233464304": ["convolution_gpu_yxfb_yxio_b16",2], + "11113125355390956764": ["convolution_gpu_yxfb_yxio_b16",0], + "16513038896689318072": ["convolution_gpu_yxfb_yxio_b16",1], + "7531346828150129063": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "10701208905236219083": ["convolution_gpu_yxfb_yxio_b16",2], + "14930789530046665855": ["convolution_gpu_bfyx_gemm_like",2], + "14065215389112262561": ["convolution_gpu_yxfb_yxio_b16",1], + "13051390418571971928": ["convolution_gpu_yxfb_yxio_b16",2], + "7025975403069487257": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13797759143769042759": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "13558656230312558247": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "8099100633390626027": ["convolution_gpu_yxfb_yxio_b16",2], + "7412772553395852003": ["convolution_gpu_yxfb_yxio_b16",2], + "16428789154716792138": ["convolution_gpu_yxfb_yxio_b16",2], + "2477849395789783501": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "11560634267092054110": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "17800115051456107658": ["convolution_gpu_yxfb_yxio_b16",2], + "4773482308451190487": ["convolution_gpu_yxfb_yxio_b16",2], + "4830454154838353056": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "9659814105483633858": 
["convolution_gpu_yxfb_yxio_b16",2], + "8933701347987963693": ["convolution_gpu_yxfb_yxio_b16",2], + "4216366893358625960": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "14791575777969587370": ["convolution_gpu_yxfb_yxio_b16",1], + "10808909442136736629": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "16989896550094613437": ["convolution_gpu_yxfb_yxio_b16",2], + "7535571298845832061": ["convolution_gpu_yxfb_yxio_b16",1], + "517997325935712670": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "4640696923527766618": ["convolution_gpu_bfyx_gemm_like",2], + "13585163747565192884": ["convolution_gpu_bfyx_gemm_like",2], + "9135116285263927211": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "12712071520541638451": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "15695415285791951018": ["convolution_gpu_bfyx_gemm_like",2], + "13058026769607428653": ["convolution_gpu_yxfb_yxio_b16",2], + "10431728173806991521": ["convolution_gpu_yxfb_yxio_b16",2], + "12990527753120735255": ["convolution_gpu_bfyx_gemm_like",2], + "2450251936650841836": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "16469788155263456039": ["convolution_gpu_bfyx_gemm_like",2], + "14931590390643373866": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12866217660635921034": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15747571668131081693": ["convolution_gpu_yxfb_yxio_b16",1], + "6391201577234440562": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "11645116728396933125": ["convolution_gpu_bfyx_gemm_like",2], + "6106367716877633757": ["convolution_gpu_yxfb_yxio_b16",2], + "5963901433137582265": ["convolution_gpu_bfyx_gemm_like",2], + "12259844988981080505": ["convolution_gpu_bfyx_gemm_like",2], + "18226737525116147628": ["convolution_gpu_yxfb_yxio_b16",2], + "4129722446574108695": ["convolution_gpu_bfyx_1x1",2], + "17966898762317477857": ["convolution_gpu_yxfb_yxio_b16",1], + "7603872175048237237": ["convolution_gpu_bfyx_1x1",2], + "7755177205197405275": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "12450814729547235386": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "9542795021683486547": ["convolution_gpu_yxfb_yxio_b16",2], + "12093737479877309006": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "8651641584737798174": ["convolution_gpu_bfyx_gemm_like",2], + "10194187012252949909": ["convolution_gpu_yxfb_yxio_b16",2], + "15181987458871339815": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "6458189051305803360": ["convolution_gpu_yxfb_yxio_b16",2], + "12616205756849913359": ["convolution_gpu_yxfb_yxio_b16",2], + "5941095082097535176": ["convolution_gpu_bfyx_gemm_like",1], + "12762301414049772746": ["convolution_gpu_yxfb_yxio_b16",2], + "8399668174006528237": ["convolution_gpu_bfyx_gemm_like",1], + "10404725818204494388": ["convolution_gpu_bfyx_gemm_like",2], + "4190912926126844643": ["convolution_gpu_bfyx_1x1",2], + "7481256533438761028": ["convolution_gpu_bfyx_gemm_like",2], + "14571022040013651253": ["convolution_gpu_bfyx_gemm_like",2], + "2964705957088952872": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "584086621952390547": ["convolution_gpu_bfyx_gemm_like",2], + "4439786737038041995": ["convolution_gpu_yxfb_yxio_b16",2], + "2034811390140488812": ["convolution_gpu_yxfb_yxio_b16",2], + "7767103488808670253": ["convolution_gpu_yxfb_yxio_b16",2], + "10396788403466463989": ["convolution_gpu_yxfb_yxio_b16",2], + "8036474422877454869": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10055549084854766170": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "1509728225855233852": ["convolution_gpu_bfyx_gemm_like",2], + "7986797517722531256": 
["convolution_gpu_bfyx_gemm_like",2], + "13569453018083742128": ["convolution_gpu_yxfb_yxio_b16",2], + "2583562092192709891": ["convolution_gpu_yxfb_yxio_b16",2], + "8063236641629084352": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "6222595759158615206": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2058364830449635556": ["convolution_gpu_yxfb_yxio_b16",2], + "2644054989263429508": ["convolution_gpu_yxfb_yxio_b16",2], + "537074122417021898": ["convolution_gpu_bfyx_os_iyx_osv16",100], + "17040537179740138304": ["convolution_gpu_yxfb_yxio_b16",2], + "7213383384662748578": ["convolution_gpu_yxfb_yxio_b16",2], + "10751536136794650334": ["convolution_gpu_bfyx_gemm_like",2], + "11052732052072367261": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "13648761167622654288": ["fully_connected_gpu_yxfb_ref",0], + "18253299978538051201": ["convolution_gpu_yxfb_yxio_b16",2], + "15526021915035861514": ["convolution_gpu_bfyx_gemm_like",1], + "14670339865153970893": ["convolution_gpu_yxfb_yxio_b16",2], + "9655242408142699694": ["convolution_gpu_yxfb_yxio_b16",1], + "18381791065890314250": ["convolution_gpu_bfyx_gemm_like",1], + "17193614571243427089": ["convolution_gpu_bfyx_gemm_like",2], + "8619526128410675593": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "4466647043226271996": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "12656228464579497510": ["convolution_gpu_yxfb_yxio_b16",2], + "9309173544512377803": ["convolution_gpu_yxfb_yxio_b16",2], + "13079058582191027406": ["convolution_gpu_yxfb_yxio_b16",2], + "951747146164097188": ["convolution_gpu_bfyx_1x1",2], + "7964396197946740183": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "712495040970043706": ["convolution_gpu_yxfb_yxio_b16",2], + "15178921033274918199": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8505040075968411726": ["convolution_gpu_bfyx_gemm_like",1], + "5550969016335082071": ["convolution_gpu_bfyx_gemm_like",1], + "9306120768594851497": ["convolution_gpu_yxfb_yxio_b16",2], + "7777279468029216688": ["convolution_gpu_yxfb_yxio_b16",2], + "18029396837690671545": ["convolution_gpu_yxfb_yxio_b16",2], + "14909506411483112959": ["convolution_gpu_yxfb_yxio_b16",1], + "1734769856106746136": ["convolution_gpu_yxfb_yxio_b16",2], + "9987415314864002460": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6772954924703365345": ["convolution_gpu_bfyx_gemm_like",2], + "5622089373755094139": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "9533360488591027707": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "13403161389559730": ["convolution_gpu_bfyx_gemm_like",2], + "9811086682271990794": ["convolution_gpu_yxfb_yxio_b16",2], + "9674248159643501374": ["convolution_gpu_yxfb_yxio_b16",2], + "7830644361525332797": ["convolution_gpu_yxfb_yxio_b16",2], + "2290965424106255219": ["convolution_gpu_yxfb_yxio_b16",2], + "11208625628954179200": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "15774073623451382326": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4750513665628842598": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "10099598062509781441": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "16666792471632326054": ["convolution_gpu_bfyx_gemm_like",2], + "8618627241234406784": ["convolution_gpu_yxfb_yxio_b16",2], + "939718260623752240": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "1784892318069674949": ["convolution_gpu_yxfb_yxio_b16",2], + "5568728266639058524": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "6297802534570892679": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "9119618606914671839": ["convolution_gpu_bfyx_os_iyx_osv16",375], + 
"9328223957245552723": ["convolution_gpu_bfyx_gemm_like",2], + "348058686961206025": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "8809438390805488749": ["convolution_gpu_yxfb_yxio_b16",2], + "15317946705199574301": ["convolution_gpu_yxfb_yxio_b16",0], + "16011429608661242565": ["convolution_gpu_bfyx_gemm_like",2], + "9429695343610239088": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "18093895673012393740": ["convolution_gpu_yxfb_yxio_b16",1], + "10232809153913700925": ["convolution_gpu_yxfb_yxio_b16",2], + "17763347648779573375": ["convolution_gpu_yxfb_yxio_b16",1], + "2895819653081408358": ["convolution_gpu_yxfb_yxio_b16",2], + "5658664813683907476": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "5600128039063009632": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6490907666077364481": ["convolution_gpu_yxfb_yxio_b16",2], + "17188004018198554470": ["convolution_gpu_yxfb_yxio_b16",2], + "17039993918927377002": ["convolution_gpu_bfyx_os_iyx_osv16",431], + "12946540633035976364": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "7604075520418038662": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "13149617013851130587": ["convolution_gpu_yxfb_yxio_b16",1], + "11130439225010714550": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16698547937652264447": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "10588059104387338398": ["convolution_gpu_bfyx_os_iyx_osv16",834], + "8117638644045799192": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "6818140422066151642": ["convolution_gpu_yxfb_yxio_b16",1], + "5156033406916344703": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15409184364121627414": ["convolution_gpu_yxfb_yxio_b16",2], + "3536359641225772698": ["convolution_gpu_yxfb_yxio_b16",2], + "12031180482028822765": ["convolution_gpu_bfyx_gemm_like",0], + "13590444711975157776": ["convolution_gpu_bfyx_direct_10_12_16",2], + "742689192890486807": ["convolution_gpu_bfyx_gemm_like",2], + "9165817820007469505": ["convolution_gpu_yxfb_yxio_b16",2], + "2530317332900569142": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "12946531140050029900": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "17109520309574369561": ["convolution_gpu_bfyx_gemm_like",2], + "3673781117412048086": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "16386955278777720573": ["convolution_gpu_bfyx_os_iyx_osv16",855], + "886880682650879171": ["convolution_gpu_bfyx_os_iyx_osv16",477], + "7843508201826629532": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "11855070245618904113": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "16409729623371222748": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "14263605862840500474": ["convolution_gpu_yxfb_yxio_b16",2], + "119047044057950958": ["convolution_gpu_bfyx_gemm_like",1], + "11153522012082333137": ["convolution_gpu_yxfb_yxio_b16",2], + "3074436655804078403": ["convolution_gpu_yxfb_yxio_b16",2], + "15271783562528081169": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "15187035463799513424": ["convolution_gpu_bfyx_1x1",2], + "7247475218645942682": ["convolution_gpu_yxfb_yxio_b16",2], + "2753393184265405425": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "6003409324516527726": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7953255701516490034": ["convolution_gpu_bfyx_os_iyx_osv16",43], + "1497560475414454618": ["convolution_gpu_bfyx_gemm_like",2], + "12788611449571149037": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "12829916847670789556": ["convolution_gpu_yxfb_yxio_b16",2], + "17179609670678746034": ["convolution_gpu_bfyx_gemm_like",2], + "17140702790441856730": ["convolution_gpu_bfyx_gemm_like",1], + 
"7792512829747836997": ["convolution_gpu_yxfb_yxio_b16",2], + "8935522915553126640": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "17635171685500922207": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "9332701118402940384": ["convolution_gpu_yxfb_yxio_b16",1], + "16573597215928075233": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "11795826875463204296": ["convolution_gpu_bfyx_1x1",2], + "15424646499666127616": ["convolution_gpu_yxfb_yxio_b16",0], + "16561618767117193109": ["convolution_gpu_bfyx_1x1",2], + "12374775091628199854": ["convolution_gpu_bfyx_1x1",2], + "14416897092729861207": ["convolution_gpu_yxfb_yxio_b16",2], + "16569637518948306471": ["convolution_gpu_bfyx_gemm_like",2], + "6669808855737023569": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "8792202318168046223": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "18417830391649460864": ["convolution_gpu_yxfb_yxio_b16",2], + "11565861421381730304": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7817691489550523328": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10555835101752189454": ["convolution_gpu_yxfb_yxio_b16",2], + "15858356755924943957": ["convolution_gpu_yxfb_yxio_b16",2], + "16402386400454963239": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "7398158542592530232": ["convolution_gpu_yxfb_yxio_b16",2], + "17092525789052598917": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "7916244303189113815": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "269167598200943915": ["convolution_gpu_yxfb_yxio_b16",2], + "6071668124835539929": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "2629918844315184499": ["convolution_gpu_yxfb_yxio_b16",1], + "15112599407339712681": ["convolution_gpu_bfyx_1x1",2], + "9205978149692979955": ["convolution_gpu_bfyx_gemm_like",2], + "17479773641824222843": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "10868287582480518153": ["convolution_gpu_bfyx_gemm_like",2], + "11738780323979052397": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14616801816838734032": ["convolution_gpu_yxfb_yxio_b16",2], + "9589718307719207394": ["convolution_gpu_yxfb_yxio_b16",2], + "18253784177599134876": ["convolution_gpu_bfyx_os_iyx_osv16",562], + "15249442550355454201": ["convolution_gpu_bfyx_gemm_like",2], + "5367618411887849711": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "3582256192870592087": ["convolution_gpu_bfyx_os_iyx_osv16",1029], + "16253244737884854313": ["convolution_gpu_yxfb_yxio_b16",2], + "13076935351221777993": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "7724125714360985807": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "2955459120402821540": ["convolution_gpu_yxfb_yxio_b16",2], + "4815047491742617397": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "6250785177115691293": ["convolution_gpu_yxfb_yxio_b16",2], + "1202292109713947702": ["convolution_gpu_bfyx_gemm_like",2], + "10930640103080573253": ["convolution_gpu_bfyx_1x1",2], + "14912119584313592912": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "12589440296742583335": ["convolution_gpu_bfyx_1x1",2], + "7106362077449435105": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "18164706399147697716": ["convolution_gpu_yxfb_yxio_b16",1], + "8792010676469476740": ["convolution_gpu_bfyx_gemm_like",2], + "17536591931934691648": ["convolution_gpu_yxfb_yxio_b16",2], + "15112393534380347357": ["convolution_gpu_yxfb_yxio_b16",2], + "14424566003632608852": ["convolution_gpu_bfyx_direct_10_12_16",2], + "252188028702250668": ["convolution_gpu_yxfb_yxio_b16",2], + "5519781859090160931": ["convolution_gpu_bfyx_os_iyx_osv16",760], + "2221145174704245189": 
["convolution_gpu_bfyx_gemm_like",2], + "17713666626443142908": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15617599138946168772": ["convolution_gpu_yxfb_yxio_b16",2], + "11319799002723299753": ["convolution_gpu_yxfb_yxio_b16",2], + "4154830034576950123": ["convolution_gpu_yxfb_yxio_b16",2], + "3782239800777370325": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "12344689711325644622": ["convolution_gpu_yxfb_yxio_b16",2], + "9757389422721488173": ["convolution_gpu_bfyx_1x1",1], + "17833304859352483840": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12625112690264223217": ["convolution_gpu_bfyx_gemm_like",2], + "6888842613779488104": ["convolution_gpu_bfyx_1x1",2], + "18080848057281093190": ["convolution_gpu_yxfb_yxio_b16",2], + "10785252006948647963": ["convolution_gpu_yxfb_yxio_b16",2], + "14827882251752394500": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "6713554643048248003": ["convolution_gpu_yxfb_yxio_b16",2], + "9277610800970567810": ["convolution_gpu_bfyx_gemm_like",2], + "15131258379753113816": ["convolution_gpu_yxfb_yxio_b16",2], + "11658751382892761740": ["convolution_gpu_yxfb_yxio_b16",2], + "1973051991518953158": ["convolution_gpu_yxfb_yxio_b16",2], + "14091543526898531200": ["convolution_gpu_yxfb_yxio_b16",2], + "14887465694301281952": ["convolution_gpu_yxfb_yxio_b16",2], + "9426665763007611385": ["convolution_gpu_bfyx_gemm_like",2], + "3533556385636018581": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "9101903304994333336": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "5291011077679733990": ["convolution_gpu_bfyx_gemm_like",2], + "8975333906619899020": ["convolution_gpu_bfyx_gemm_like",2], + "4282756088824939292": ["convolution_gpu_yxfb_yxio_b16",2], + "15967614281807823696": ["convolution_gpu_bfyx_gemm_like",2], + "5656320098721954644": ["convolution_gpu_yxfb_yxio_b16",2], + "9584652777232392944": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "2571289358202565251": ["convolution_gpu_yxfb_yxio_b16",2], + "3449007266907948591": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "3399406641489305996": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "8890400423799565844": ["convolution_gpu_yxfb_yxio_b16",2], + "13296242326766100583": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12822126914959112382": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "12745552951204330052": ["convolution_gpu_yxfb_yxio_b16",2], + "16851082749395991194": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "745009493367761775": ["convolution_gpu_bfyx_gemm_like",2], + "15924916465272239832": ["convolution_gpu_bfyx_os_iyx_osv16",925], + "4280250278457269231": ["convolution_gpu_yxfb_yxio_b16",2], + "13326339730522937517": ["convolution_gpu_yxfb_yxio_b16",2], + "10090036431487700311": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "14312549767853703411": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "13993548620104010490": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15799159401545270696": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17255805293355120219": ["convolution_gpu_yxfb_yxio_b16",2], + "14389719202147508599": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "9423958333298993923": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "12818012741490629493": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "18075395502550596586": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "11848462434662954749": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "11337525286386930242": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "3172518362830684966": ["convolution_gpu_yxfb_yxio_b16",1], + "14892045745899927762": 
["convolution_gpu_yxfb_yxio_b16",2], + "11263540528012919947": ["convolution_gpu_bfyx_1x1",2], + "1875764913306932583": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "18218631037214746168": ["convolution_gpu_bfyx_gemm_like",2], + "1082586642383386489": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "9982350570959875159": ["convolution_gpu_yxfb_yxio_b16",2], + "15528692642731712121": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "15973363403733281926": ["convolution_gpu_yxfb_yxio_b16",2], + "6820134899097582639": ["convolution_gpu_yxfb_yxio_b16",2], + "1129349074674368869": ["convolution_gpu_yxfb_yxio_b16",2], + "7235358742317442134": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16252420150239789472": ["convolution_gpu_yxfb_yxio_b16",2], + "1171681987783013074": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "17041468169694105561": ["convolution_gpu_yxfb_yxio_b16",2], + "18267428053198215471": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "4995051972576749717": ["convolution_gpu_yxfb_yxio_b16",2], + "2199167704280374654": ["convolution_gpu_yxfb_yxio_b16",2], + "15739278428190392018": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6131481289104111211": ["convolution_gpu_bfyx_gemm_like",2], + "10751633292301177132": ["convolution_gpu_yxfb_yxio_b16",2], + "3049097498155857895": ["convolution_gpu_yxfb_yxio_b16",2], + "11948858355027908365": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9116620473576064051": ["convolution_gpu_yxfb_yxio_b16",2], + "18313088176414428990": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "13902214851539825156": ["convolution_gpu_bfyx_gemm_like",2], + "2907572047024872990": ["convolution_gpu_yxfb_yxio_b16",1], + "5680236635030250712": ["convolution_gpu_bfyx_1x1",2], + "9169324504353459004": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17272600601478967434": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "13308187548669026714": ["convolution_gpu_bfyx_1x1",2], + "5926747396493954633": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "6843617687528352801": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "15949311219856917559": ["convolution_gpu_bfyx_os_iyx_osv16",279], + "848735117501914374": ["convolution_gpu_yxfb_yxio_b16",2], + "11226912053840621089": ["convolution_gpu_yxfb_yxio_b16",2], + "8155268141318893606": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8616584380583931648": ["convolution_gpu_yxfb_yxio_b16",1], + "2394023805427701338": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "10879171754021534649": ["convolution_gpu_yxfb_yxio_b16",2], + "4805194563120934409": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "1235864574444794315": ["convolution_gpu_yxfb_yxio_b16",2], + "7979265448683159733": ["convolution_gpu_yxfb_yxio_b16",2], + "16709930291825881111": ["convolution_gpu_yxfb_yxio_b16",1], + "10406201782146034797": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "4734389463002799056": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "10812324504777808014": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "14385148066232093878": ["convolution_gpu_yxfb_yxio_b16",2], + "8362179886017398479": ["convolution_gpu_bfyx_os_iyx_osv16",8], + "4615766471724791034": ["convolution_gpu_yxfb_yxio_b16",2], + "6511742759171254447": ["convolution_gpu_yxfb_yxio_b16",2], + "9523941899498458600": ["convolution_gpu_yxfb_yxio_b16",2], + "15295951849706930711": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "9026883911202247185": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2939605281692583169": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "4779919236230154165": ["convolution_gpu_bfyx_gemm_like",0], + 
"1054954263090546905": ["convolution_gpu_yxfb_yxio_b16",1], + "14097394936362526559": ["convolution_gpu_yxfb_yxio_b16",2], + "10558609844937234631": ["convolution_gpu_yxfb_yxio_b16",2], + "10318417166945621015": ["convolution_gpu_yxfb_yxio_b16",2], + "4988480452582288323": ["convolution_gpu_yxfb_yxio_b16",2], + "490931535580183607": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "1354199155380786906": ["convolution_gpu_yxfb_yxio_b16",2], + "13738760763969959522": ["convolution_gpu_bfyx_gemm_like",2], + "11806402239500046867": ["convolution_gpu_bfyx_gemm_like",2], + "10272016038525930672": ["convolution_gpu_bfyx_gemm_like",2], + "18337762134908554532": ["convolution_gpu_yxfb_yxio_b16",2], + "15052577143485630617": ["convolution_gpu_bfyx_1x1",2], + "16490405739040977260": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "10783630257421062891": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "11738360883999461965": ["convolution_gpu_yxfb_yxio_b16",1], + "14098811155652990436": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16911450336605071390": ["convolution_gpu_bfyx_1x1",2], + "13851240591038949807": ["convolution_gpu_bfyx_gemm_like",2], + "17705807503894740726": ["convolution_gpu_bfyx_gemm_like",2], + "7338229552985076723": ["convolution_gpu_bfyx_gemm_like",2], + "10997156099709436375": ["convolution_gpu_yxfb_yxio_b16",2], + "13820498543284008286": ["convolution_gpu_bfyx_gemm_like",2], + "2613462626256090659": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "10720769054729185991": ["convolution_gpu_yxfb_yxio_b16",2], + "14365232561737454031": ["convolution_gpu_bfyx_os_iyx_osv16",51], + "10952045211444638649": ["convolution_gpu_yxfb_yxio_b16",1], + "14885031472057965707": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "3648713169465596196": ["convolution_gpu_yxfb_yxio_b16",2], + "2469579114592379040": ["convolution_gpu_bfyx_gemm_like",2], + "13507437548205340054": ["convolution_gpu_yxfb_yxio_b16",2], + "11010673493295430801": ["convolution_gpu_yxfb_yxio_b16",2], + "3792276488551864121": ["convolution_gpu_yxfb_yxio_b16",2], + "1336739931702966228": ["convolution_gpu_yxfb_yxio_b16",1], + "2008424849669196225": ["convolution_gpu_bfyx_1x1",2], + "8860443174052454332": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6931062623510631425": ["convolution_gpu_yxfb_yxio_b16",2], + "13723434004563378589": ["convolution_gpu_yxfb_yxio_b16",2], + "12917241193304093727": ["convolution_gpu_bfyx_gemm_like",2], + "17618727959983224888": ["convolution_gpu_yxfb_yxio_b16",2], + "2060161076370553192": ["convolution_gpu_yxfb_yxio_b16",2], + "3266557807508325807": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10626341369865893888": ["convolution_gpu_bfyx_gemm_like",2], + "12714892326998505133": ["convolution_gpu_yxfb_yxio_b16",2], + "13040213971461407125": ["convolution_gpu_yxfb_yxio_b16",2], + "5852569526295779497": ["convolution_gpu_yxfb_yxio_b16",2], + "11724732387425614709": ["convolution_gpu_yxfb_yxio_b16",2], + "11892455357792445192": ["convolution_gpu_yxfb_yxio_b16",2], + "4773077837537775324": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "3904383357046705799": ["convolution_gpu_yxfb_yxio_b16",2], + "3286496836813087881": ["convolution_gpu_yxfb_yxio_b16",2], + "5648658688155716974": ["convolution_gpu_bfyx_1x1",2], + "15968821946892330559": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "14497254583210965214": ["convolution_gpu_yxfb_yxio_b16",2], + "72745257233374197": ["convolution_gpu_yxfb_yxio_b16",2], + "17443356777503458523": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "11289650463922092775": 
["convolution_gpu_bfyx_direct_10_12_16",1], + "2673903488704336606": ["convolution_gpu_bfyx_gemm_like",2], + "15829095120243431195": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "11606895513516475339": ["convolution_gpu_yxfb_yxio_b16",2], + "13124342334495538095": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "12546446257192651407": ["convolution_gpu_yxfb_yxio_b16",1], + "2856601829807186494": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "13412516623201653283": ["convolution_gpu_yxfb_yxio_b16",2], + "17868834743037242721": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "4805402210873641704": ["convolution_gpu_yxfb_yxio_b16",2], + "6816632607384969096": ["convolution_gpu_yxfb_yxio_b16",1], + "9899211365930959346": ["convolution_gpu_bfyx_os_iyx_osv16",648], + "10612739622648878242": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "8689206546467098603": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "17021925795809437171": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5853697372844744672": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "4780291919667721265": ["convolution_gpu_yxfb_yxio_b16",2], + "1251525426317284548": ["convolution_gpu_bfyx_os_iyx_osv16",756], + "10178171262128338408": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14502856487639608696": ["convolution_gpu_bfyx_gemm_like",2], + "4894469114343061704": ["convolution_gpu_yxfb_yxio_b16",0], + "6423785822515265784": ["convolution_gpu_bfyx_gemm_like",2], + "14417401878572618236": ["convolution_gpu_yxfb_yxio_b16",2], + "10914921540144371519": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10724501418439612080": ["convolution_gpu_bfyx_gemm_like",1], + "1075027491444288875": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7100056605355325582": ["convolution_gpu_yxfb_yxio_b16",2], + "6988492019664525206": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "15859493313686060349": ["convolution_gpu_bfyx_gemm_like",2], + "14540578324750869319": ["convolution_gpu_bfyx_gemm_like",2], + "7565006185780806333": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "15192022454507415969": ["convolution_gpu_yxfb_yxio_b16",1], + "474139120607442270": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "15325852281951905610": ["convolution_gpu_bfyx_os_iyx_osv16",801], + "9144487908815767824": ["convolution_gpu_bfyx_1x1",1], + "8519354640245415816": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12669783714916998842": ["convolution_gpu_yxfb_yxio_b16",2], + "13187657215288939912": ["convolution_gpu_yxfb_yxio_b16",2], + "6123707371654753818": ["convolution_gpu_yxfb_yxio_b16",2], + "7343590049199309046": ["convolution_gpu_yxfb_yxio_b16",2], + "5526223938481098693": ["convolution_gpu_yxfb_yxio_b16",2], + "8527193566719173253": ["convolution_gpu_bfyx_gemm_like",2], + "9521715904587435700": ["convolution_gpu_yxfb_yxio_b16",2], + "3058716597925544041": ["convolution_gpu_yxfb_yxio_b16",2], + "14808895254077106198": ["convolution_gpu_bfyx_gemm_like",2], + "2431241169199693527": ["convolution_gpu_yxfb_yxio_b16",1], + "1497127399271219422": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1425953627379976115": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15899192375330393731": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "6118737381591369532": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9184275066167601343": ["convolution_gpu_bfyx_os_iyx_osv16",152], + "10133054058562198093": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "18083803358410976976": ["convolution_gpu_yxfb_yxio_b16",2], + "101401523793806394": ["convolution_gpu_bfyx_gemm_like",2], + 
"1299760574827253811": ["convolution_gpu_yxfb_yxio_b16",2], + "4126895998426674411": ["convolution_gpu_bfyx_gemm_like",2], + "17917978116807564183": ["convolution_gpu_bfyx_gemm_like",2], + "8421388456873652700": ["convolution_gpu_bfyx_gemm_like",2], + "1584906448442153128": ["convolution_gpu_yxfb_yxio_b16",1], + "3096280563014331836": ["convolution_gpu_yxfb_yxio_b16",2], + "12334522314915706512": ["convolution_gpu_yxfb_yxio_b16",2], + "18067291256808591467": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "3337625924046561031": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "9832505855130134649": ["convolution_gpu_yxfb_yxio_b16",2], + "11334122788337402526": ["convolution_gpu_bfyx_1x1",2], + "4914474312076193952": ["convolution_gpu_bfyx_gemm_like",1], + "2934519615045138808": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "1920070013712913772": ["convolution_gpu_bfyx_os_iyx_osv16",574], + "3101087806792514129": ["convolution_gpu_bfyx_1x1",1], + "7015738038963065110": ["convolution_gpu_bfyx_os_iyx_osv16",700], + "6963293142152132518": ["convolution_gpu_bfyx_os_iyx_osv16",165], + "16432425079146486467": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "13809898858049445969": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16195893521207315456": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "6509271384550125629": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "8856888761246057127": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "12137340921829511472": ["convolution_gpu_yxfb_yxio_b16",2], + "1760391741350091665": ["convolution_gpu_bfyx_gemm_like",2], + "10811837819834149164": ["convolution_gpu_bfyx_gemm_like",1], + "11025471731438443683": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "13289438471364352634": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "4610200388191607540": ["convolution_gpu_bfyx_gemm_like",2], + "10882719585803523032": ["convolution_gpu_yxfb_yxio_b16",2], + "592245952014430043": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "1458615259705605525": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "13943983517468412332": ["convolution_gpu_yxfb_yxio_b16",1], + "5479761740065152589": ["convolution_gpu_bfyx_gemm_like",2], + "10133398220120888583": ["convolution_gpu_yxfb_yxio_b16",2], + "13156052826121673994": ["convolution_gpu_bfyx_gemm_like",2], + "10006197783106691106": ["convolution_gpu_bfyx_gemm_like",2], + "4602232889230956461": ["convolution_gpu_yxfb_yxio_b16",2], + "15669242195570440840": ["convolution_gpu_yxfb_yxio_b16",2], + "13365950526881732374": ["convolution_gpu_yxfb_yxio_b16",1], + "14469011068777098822": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "16341722570340169855": ["convolution_gpu_bfyx_1x1",2], + "2421404763191415191": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6101196122606108273": ["convolution_gpu_bfyx_gemm_like",2], + "6715523440337925186": ["convolution_gpu_yxfb_yxio_b16",2], + "3106911159524421371": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "10717031088082350652": ["convolution_gpu_yxfb_yxio_b16",2], + "1587501521145162454": ["convolution_gpu_bfyx_gemm_like",2], + "2016932800158392200": ["convolution_gpu_yxfb_yxio_b16",2], + "13467831091041327178": ["convolution_gpu_yxfb_yxio_b16",1], + "2135164671985938807": ["convolution_gpu_yxfb_yxio_b16",2], + "9711184878666366204": ["convolution_gpu_yxfb_yxio_b16",1], + "968105804060326332": ["convolution_gpu_yxfb_yxio_b16",2], + "579781312141502576": ["convolution_gpu_bfyx_1x1",1], + "17248329632819747646": ["convolution_gpu_yxfb_yxio_b16",1], + "11942019076226205097": ["convolution_gpu_yxfb_yxio_b16",2], + 
"6902485831441844789": ["convolution_gpu_yxfb_yxio_b16",1], + "4885504197789468842": ["convolution_gpu_yxfb_yxio_b16",1], + "9813748068195103720": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "13464697394408238115": ["convolution_gpu_yxfb_yxio_b16",2], + "10789133352712755945": ["convolution_gpu_yxfb_yxio_b16",2], + "12566041126392848976": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "7065244994574625911": ["convolution_gpu_yxfb_yxio_b16",2], + "10706267011822108376": ["convolution_gpu_bfyx_1x1",2], + "10071449674652717890": ["convolution_gpu_bfyx_gemm_like",2], + "15225354446874994535": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "16129682385980878760": ["convolution_gpu_yxfb_yxio_b16",2], + "17043601935017365442": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2905979727479716212": ["convolution_gpu_yxfb_yxio_b16",2], + "4391695940614024479": ["convolution_gpu_yxfb_yxio_b16",2], + "16393176054374397767": ["convolution_gpu_bfyx_gemm_like",1], + "13132804928635689780": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "17329287216741045059": ["convolution_gpu_bfyx_gemm_like",2], + "17370158297470557151": ["convolution_gpu_bfyx_1x1",2], + "708201295462256406": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "9827177798112814604": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "15979956159651515122": ["convolution_gpu_bfyx_gemm_like",2], + "5050273611519516510": ["convolution_gpu_bfyx_gemm_like",1], + "9468684953949274635": ["convolution_gpu_bfyx_gemm_like",1], + "17672785701483179117": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "4834446692898125871": ["convolution_gpu_bfyx_gemm_like",2], + "7628077869220463202": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "11417406326478154077": ["convolution_gpu_yxfb_yxio_b16",2], + "12985942652866621579": ["fully_connected_gpu_fb_io_ref",2], + "14387756025635589673": ["convolution_gpu_bfyx_1x1",2], + "5638640164891118162": ["convolution_gpu_yxfb_yxio_b16",2], + "4974320417566990034": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "603883331897298932": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "16633540487930201533": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "3872151366780051246": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17515847111676784130": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "6613116267521819997": ["convolution_gpu_yxfb_yxio_b16",2], + "3377052601059116318": ["convolution_gpu_yxfb_yxio_b16",0], + "13509275050322423832": ["convolution_gpu_yxfb_yxio_b16",2], + "17046662043776372746": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "13119040261291835298": ["convolution_gpu_bfyx_gemm_like",2], + "16245760498096322525": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "6799631962511042762": ["convolution_gpu_yxfb_yxio_b16",2], + "11499219760597131534": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "6959692641873234850": ["convolution_gpu_yxfb_yxio_b16",2], + "18152894191323920027": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "4356817283284529593": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6512987867462549101": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "16371608027363202992": ["convolution_gpu_yxfb_yxio_b16",2], + "6210866413385292851": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "465567788283624320": ["convolution_gpu_yxfb_yxio_b16",2], + "6756679359093569015": ["convolution_gpu_bfyx_os_iyx_osv16",905], + "5740745357953479527": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "17676344219475515993": ["convolution_gpu_yxfb_yxio_b16",2], + "12003323477818208825": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "5346898505346646714": 
["convolution_gpu_bfyx_os_iyx_osv16",483], + "3515437649977762166": ["convolution_gpu_bfyx_gemm_like",1], + "6945787904293959477": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "18243724217479803107": ["convolution_gpu_yxfb_yxio_b16",2], + "11988285441493553006": ["convolution_gpu_bfyx_gemm_like",2], + "85050336704401597": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16491532291908469567": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "2227700097134029783": ["convolution_gpu_yxfb_yxio_b16",1], + "13450061819089402572": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10309986238001994183": ["convolution_gpu_yxfb_yxio_b16",2], + "12531880391016521628": ["convolution_gpu_bfyx_gemm_like",2], + "15115780248032030963": ["convolution_gpu_yxfb_yxio_b16",2], + "4298242568890525997": ["convolution_gpu_yxfb_yxio_b16",2], + "15959543980008442942": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "16870036853278751563": ["convolution_gpu_yxfb_yxio_b16",2], + "13161997040644039778": ["convolution_gpu_bfyx_gemm_like",2], + "15833461718320604065": ["convolution_gpu_bfyx_os_iyx_osv16",889], + "7669403041163460089": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8170998059688907013": ["convolution_gpu_bfyx_1x1",2], + "15482685355538566951": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "30229601562833524": ["convolution_gpu_bfyx_gemm_like",2], + "2294318010381635693": ["convolution_gpu_bfyx_gemm_like",2], + "13486084204140096478": ["convolution_gpu_bfyx_gemm_like",2], + "6558436237075337721": ["convolution_gpu_yxfb_yxio_b16",2], + "2089730611490367290": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "1580344438642032807": ["convolution_gpu_bfyx_gemm_like",2], + "5578850952665051661": ["convolution_gpu_yxfb_yxio_b16",1], + "3217574161785059951": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "5840254078917931433": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "13493119419114659706": ["convolution_gpu_yxfb_yxio_b16",2], + "16425374300157280628": ["convolution_gpu_yxfb_yxio_b16",1], + "7469127846325904854": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "11698754846673268046": ["convolution_gpu_yxfb_yxio_b16",2], + "12121204870979363096": ["convolution_gpu_yxfb_yxio_b16",2], + "2438261005924916746": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "12054200116003751590": ["convolution_gpu_bfyx_os_iyx_osv16",483], + "8479958930889587809": ["fully_connected_gpu_yxfb_ref",1], + "7843498978148810586": ["convolution_gpu_bfyx_os_iyx_osv16",987], + "1448440012428740463": ["convolution_gpu_yxfb_yxio_b16",1], + "8976238022515713641": ["convolution_gpu_bfyx_gemm_like",2], + "8642107585829380438": ["convolution_gpu_bfyx_gemm_like",1], + "10681768474583067517": ["convolution_gpu_bfyx_gemm_like",1], + "7242013296950669829": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "359617184733439511": ["convolution_gpu_yxfb_yxio_b16",2], + "9366100787108468082": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5060012838564094182": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "2269140636553245446": ["convolution_gpu_yxfb_yxio_b16",2], + "1644335606100150388": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "9870432551513415176": ["convolution_gpu_yxfb_yxio_b16",2], + "1984152634309440563": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "17489680436564779197": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "14117801387057507639": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "361497145093734608": ["convolution_gpu_bfyx_gemm_like",2], + "13861223834466385546": ["convolution_gpu_bfyx_gemm_like",1], + "10014448860206587805": 
["convolution_gpu_bfyx_gemm_like",1], + "11690334177981352452": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "9941035405796680081": ["convolution_gpu_bfyx_1x1",1], + "12051398350382954787": ["convolution_gpu_yxfb_yxio_b16",0], + "14242202444788213591": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "8999570321113443117": ["convolution_gpu_yxfb_yxio_b16",2], + "838726445796308454": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "17983556812075120553": ["convolution_gpu_bfyx_1x1",2], + "14331658870024759698": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "9518071423184197213": ["convolution_gpu_bfyx_gemm_like",2], + "8004244584949995244": ["convolution_gpu_yxfb_yxio_b16",2], + "9891428775774615719": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "9702618600245321109": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "11031569203645035546": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17480277135590489472": ["convolution_gpu_yxfb_yxio_b16",2], + "167635075964111628": ["convolution_gpu_yxfb_yxio_b16",2], + "1463649546800120847": ["convolution_gpu_yxfb_yxio_b16",2], + "15817443774186015593": ["convolution_gpu_bfyx_1x1",2], + "225809055928705881": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13676654389512816868": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "7998455776901877973": ["convolution_gpu_yxfb_yxio_b16",2], + "4804533178560338520": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "11596971301790598405": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "3216793152416217495": ["convolution_gpu_bfyx_gemm_like",2], + "3806761527342944195": ["convolution_gpu_bfyx_gemm_like",2], + "5312413491828906254": ["convolution_gpu_yxfb_yxio_b16",2], + "12600707101000510621": ["convolution_gpu_yxfb_yxio_b16",2], + "7346046748383284270": ["convolution_gpu_yxfb_yxio_b16",2], + "7804715870037416579": ["convolution_gpu_bfyx_gemm_like",1], + "18433141005552346566": ["convolution_gpu_yxfb_yxio_b16",2], + "17893181511546734799": ["convolution_gpu_yxfb_yxio_b16",2], + "10771803503544737080": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "1843555260471832708": ["convolution_gpu_bfyx_gemm_like",1], + "7647236080048602591": ["convolution_gpu_bfyx_gemm_like",1], + "8398910340371320955": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6910589963488897537": ["convolution_gpu_yxfb_yxio_b16",2], + "11175936010605958812": ["convolution_gpu_yxfb_yxio_b16",1], + "568191462231494113": ["convolution_gpu_yxfb_yxio_b16",2], + "946479876892100082": ["convolution_gpu_bfyx_gemm_like",1], + "3861351835305151926": ["convolution_gpu_yxfb_yxio_b16",2], + "9641089659148164809": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4803370483104261655": ["convolution_gpu_bfyx_gemm_like",2], + "7779562434199107586": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "18385086614524985975": ["convolution_gpu_yxfb_yxio_b16",2], + "4731836216299455047": ["convolution_gpu_yxfb_yxio_b16",2], + "9222744127882324405": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "13878967140838761911": ["convolution_gpu_bfyx_1x1",1], + "7393601059996816014": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11888011890096886932": ["convolution_gpu_yxfb_yxio_b16",2], + "14985755375924972050": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "17961793197503317952": ["convolution_gpu_yxfb_yxio_b16",2], + "5720964268093705079": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3308770992373192529": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "877901260688090160": ["convolution_gpu_yxfb_yxio_b16",2], + "1310498917952637709": ["convolution_gpu_yxfb_yxio_b16",2], + 
"6871131333562410117": ["convolution_gpu_yxfb_yxio_b16",2], + "7799984350284425885": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "10170577772376890221": ["convolution_gpu_bfyx_os_iyx_osv16",664], + "9175450649281374948": ["convolution_gpu_bfyx_os_iyx_osv16",862], + "7139719632093090046": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "14553577436929219470": ["convolution_gpu_yxfb_yxio_b16",2], + "16814025114202322376": ["convolution_gpu_yxfb_yxio_b16",1], + "11880337915508207160": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "4056979460327024961": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "6467251764899975676": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "16995444341569389342": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "15235409162483701027": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "14206076551739831333": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7393551951402219833": ["convolution_gpu_yxfb_yxio_b16",2], + "17640725195881101275": ["convolution_gpu_bfyx_gemm_like",2], + "70580716590540876": ["convolution_gpu_bfyx_gemm_like",1], + "1596353239542510685": ["convolution_gpu_bfyx_gemm_like",2], + "2578325663193624576": ["convolution_gpu_yxfb_yxio_b16",2], + "13426254939418471242": ["convolution_gpu_yxfb_yxio_b16",2], + "15337841577110104431": ["convolution_gpu_yxfb_yxio_b16",1], + "14868677663932902695": ["convolution_gpu_bfyx_gemm_like",2], + "5083163738120585821": ["fully_connected_gpu_fb_oi_ref",2], + "14766694310604777253": ["convolution_gpu_yxfb_yxio_b16",1], + "15135644084742750702": ["convolution_gpu_bfyx_gemm_like",2], + "12787837386653002743": ["convolution_gpu_yxfb_yxio_b16",2], + "6167369758442930886": ["convolution_gpu_bfyx_gemm_like",2], + "1018687388655376483": ["convolution_gpu_bfyx_gemm_like",1], + "14335074487552883436": ["convolution_gpu_bfyx_gemm_like",2], + "12071914115316550349": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "5118467701668427545": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "2026622899016787854": ["convolution_gpu_yxfb_yxio_b16",1], + "13338594271376045657": ["convolution_gpu_bfyx_gemm_like",2], + "5495063314176654751": ["convolution_gpu_bfyx_gemm_like",2], + "16374675547140209181": ["convolution_gpu_yxfb_yxio_b16",2], + "888110783182849535": ["convolution_gpu_yxfb_yxio_b16",2], + "16683485007140805060": ["fully_connected_gpu_yxfb_ref",2], + "11820789223587555410": ["convolution_gpu_bfyx_1x1",2], + "8260024340787818709": ["convolution_gpu_yxfb_yxio_b16",2], + "10480527638577674825": ["convolution_gpu_bfyx_1x1",2], + "5352061583962489055": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9162469583721135043": ["convolution_gpu_yxfb_yxio_b16",2], + "4685236901551256966": ["convolution_gpu_yxfb_yxio_b16",1], + "14054116974002669018": ["convolution_gpu_bfyx_1x1",2], + "2296581485980163665": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "1635121016109328853": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "14025235562200209723": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "8873614802459592665": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "15670767419106537809": ["convolution_gpu_yxfb_yxio_b16",2], + "4848143712599565301": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3102538312627892960": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "13839116996827687373": ["convolution_gpu_bfyx_gemm_like",2], + "16487774205195979355": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "16768797136991242472": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "18408107772851888061": ["convolution_gpu_bfyx_gemm_like",2], + "11179211757115972103": 
["convolution_gpu_bfyx_direct_10_12_16",1], + "3980835859526174461": ["convolution_gpu_yxfb_yxio_b16",2], + "15525903155475629518": ["convolution_gpu_bfyx_gemm_like",2], + "14175962333785791005": ["convolution_gpu_yxfb_yxio_b16",2], + "17147293671640396193": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "290134020607738418": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13078401519973360182": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4672441137336208890": ["convolution_gpu_bfyx_gemm_like",2], + "6084775920382972735": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "15047676717402283805": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "9534041402131086717": ["convolution_gpu_bfyx_os_iyx_osv16",949], + "17638753020411096694": ["convolution_gpu_yxfb_yxio_b16",2], + "18432421400879260832": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "16304192736281226143": ["convolution_gpu_yxfb_yxio_b16",2], + "7305582749708309904": ["convolution_gpu_yxfb_yxio_b16",2], + "3622409603053918029": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13207134083675064956": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "8931169575495985034": ["convolution_gpu_bfyx_gemm_like",2], + "17790026124881397912": ["fully_connected_gpu_fb_io_ref",2], + "16247399911710810038": ["convolution_gpu_bfyx_gemm_like",1], + "16871004845988227014": ["convolution_gpu_bfyx_1x1",2], + "12850044341631872743": ["convolution_gpu_yxfb_yxio_b16",2], + "4492332228252010118": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "11086471945045031067": ["convolution_gpu_yxfb_yxio_b16",2], + "13477548641580029772": ["convolution_gpu_bfyx_gemm_like",1], + "15805087418686802636": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "5507373575763339429": ["convolution_gpu_yxfb_yxio_b16",2], + "13328911884191551889": ["convolution_gpu_bfyx_1x1",2], + "5104519293341299859": ["convolution_gpu_yxfb_yxio_b16",2], + "249639220178603842": ["convolution_gpu_bfyx_gemm_like",2], + "2715447739580688669": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "14942858162799632403": ["convolution_gpu_yxfb_yxio_b16",2], + "5576296603250158603": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "712165731154577189": ["convolution_gpu_bfyx_os_iyx_osv16",224], + "4533786844080178561": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "1045854873741563331": ["convolution_gpu_bfyx_gemm_like",2], + "2683304757433993300": ["convolution_gpu_bfyx_gemm_like",2], + "16128152634974034731": ["convolution_gpu_yxfb_yxio_b16",2], + "15677717057398875599": ["convolution_gpu_bfyx_gemm_like",1], + "851057218719456209": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "7454366978268164047": ["convolution_gpu_bfyx_gemm_like",2], + "743941460026466526": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "4133424990380177132": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "10544034939133448916": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "3349468433721705582": ["convolution_gpu_yxfb_yxio_b16",1], + "15863531785836309247": ["convolution_gpu_yxfb_yxio_b16",2], + "11421180829679625737": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "15367649112776077240": ["convolution_gpu_yxfb_yxio_b16",2], + "9492026326463873766": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "16925721317097534009": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "3319827933068341610": ["convolution_gpu_yxfb_yxio_b16",2], + "12268912077694742671": ["convolution_gpu_yxfb_yxio_b16",2], + "15839295895890205274": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "9073757008455674094": ["convolution_gpu_yxfb_yxio_b16",2], + "8780671766122887951": 
["convolution_gpu_bfyx_os_iyx_osv16",1125], + "15625374380046476173": ["convolution_gpu_yxfb_yxio_b16",2], + "7105219760750474587": ["convolution_gpu_yxfb_yxio_b16",2], + "7818381040882768404": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "16961326251624610778": ["convolution_gpu_yxfb_yxio_b16",2], + "10076885835791159907": ["convolution_gpu_yxfb_yxio_b16",2], + "13485300684443803732": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "7715649642603303319": ["convolution_gpu_bfyx_1x1",2], + "1353170363915443814": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13541382855330226000": ["convolution_gpu_yxfb_yxio_b16",2], + "11015074526119891710": ["convolution_gpu_yxfb_yxio_b16",2], + "10730222715353420212": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "1218323229202187514": ["convolution_gpu_bfyx_gemm_like",2], + "7897973318803646560": ["convolution_gpu_yxfb_yxio_b16",2], + "10879218241103462088": ["convolution_gpu_bfyx_gemm_like",2], + "4381329435655511217": ["convolution_gpu_bfyx_os_iyx_osv16",729], + "11871319147579477936": ["convolution_gpu_yxfb_yxio_b16",2], + "16601230690171340432": ["convolution_gpu_yxfb_yxio_b16",2], + "17207560805775399864": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9737565171095493297": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12052207771201936228": ["convolution_gpu_bfyx_gemm_like",2], + "17466963970980708210": ["convolution_gpu_yxfb_yxio_b16",2], + "6128157319666849074": ["convolution_gpu_yxfb_yxio_b16",2], + "12514693341682532560": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "1197281505560782577": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "4628748977913534701": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "13264617841270329349": ["convolution_gpu_bfyx_1x1",2], + "6550549654706796887": ["convolution_gpu_yxfb_yxio_b16",0], + "13120262386070281193": ["convolution_gpu_yxfb_yxio_b16",1], + "13368203360773949292": ["convolution_gpu_yxfb_yxio_b16",2], + "8203171222962341018": ["convolution_gpu_bfyx_gemm_like",2], + "3615052707933370958": ["convolution_gpu_yxfb_yxio_b16",1], + "775538461106687677": ["fully_connected_gpu_fb_oi_ref",1], + "2554991397391195611": ["convolution_gpu_bfyx_os_iyx_osv16",184], + "12727541507197887360": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "7206226541369793931": ["convolution_gpu_yxfb_yxio_b16",2], + "17515064188391421150": ["convolution_gpu_bfyx_gemm_like",2], + "14122213471825630433": ["convolution_gpu_bfyx_gemm_like",2], + "2949545414911764346": ["convolution_gpu_yxfb_yxio_b16",2], + "1173136780324694038": ["convolution_gpu_yxfb_yxio_b16",2], + "12512751736409465214": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9079676771143357396": ["convolution_gpu_yxfb_yxio_b16",1], + "15474155528481683394": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "5339985303398206057": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "9649445293567537596": ["convolution_gpu_yxfb_yxio_b16",2], + "14248239982355212178": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "4865102850562917067": ["convolution_gpu_bfyx_os_iyx_osv16",855], + "3603187029740446600": ["convolution_gpu_bfyx_gemm_like",2], + "1418595171949196661": ["convolution_gpu_bfyx_gemm_like",2], + "10037086825900566930": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "6578517057140155080": ["convolution_gpu_yxfb_yxio_b16",2], + "16674633029045714564": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "15228390729175722409": ["convolution_gpu_yxfb_yxio_b16",2], + "1245259979364728404": ["convolution_gpu_bfyx_1x1",2], + "101387140804297623": ["convolution_gpu_yxfb_yxio_b16",2], + 
"5714365398623475983": ["convolution_gpu_bfyx_1x1",2], + "13234055353608734080": ["convolution_gpu_yxfb_yxio_b16",1], + "136349424199140459": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "9182260316973872633": ["convolution_gpu_yxfb_yxio_b16",2], + "11305232900158601613": ["convolution_gpu_bfyx_1x1",2], + "17651477639302255490": ["convolution_gpu_yxfb_yxio_b16",2], + "8768300687476117215": ["convolution_gpu_bfyx_os_iyx_osv16",266], + "6651389480007764007": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "7585184325339753737": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "7700321970687976931": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "2321767794934000238": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "14461365896122393071": ["convolution_gpu_yxfb_yxio_b16",2], + "8922929126299811091": ["convolution_gpu_bfyx_1x1",2], + "4772696293208603817": ["convolution_gpu_bfyx_gemm_like",1], + "13596876807637507229": ["convolution_gpu_bfyx_1x1",2], + "3039528482572243879": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "6713985030102340818": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2208765794404376467": ["convolution_gpu_yxfb_yxio_b16",2], + "1345101751956733589": ["convolution_gpu_bfyx_gemm_like",2], + "3070859615622845671": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "12185561188335760786": ["convolution_gpu_yxfb_yxio_b16",2], + "14113320831418478396": ["convolution_gpu_yxfb_yxio_b16",2], + "840202264034382558": ["convolution_gpu_bfyx_os_iyx_osv16",771], + "4098191685457418125": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "12637509262827320678": ["convolution_gpu_yxfb_yxio_b16",1], + "12207503176295152756": ["convolution_gpu_bfyx_1x1",2], + "5897564616927353003": ["convolution_gpu_bfyx_os_iyx_osv16",717], + "15006204461468698734": ["convolution_gpu_yxfb_yxio_b16",2], + "9671459469252116568": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "13398986810666238552": ["convolution_gpu_yxfb_yxio_b16",2], + "11731277083374465361": ["convolution_gpu_yxfb_yxio_b16",2], + "6020017927557041768": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11239541755868028928": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "10880830033700542216": ["convolution_gpu_yxfb_yxio_b16",1], + "6934241437968723825": ["convolution_gpu_yxfb_yxio_b16",1], + "4740585760177040164": ["convolution_gpu_yxfb_yxio_b16",1], + "9423854233835016530": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "7009459929666511861": ["convolution_gpu_bfyx_1x1",1], + "5602377914578322577": ["convolution_gpu_yxfb_yxio_b16",2], + "5056859994174498686": ["convolution_gpu_bfyx_gemm_like",1], + "2561508262445368003": ["convolution_gpu_yxfb_yxio_b16",2], + "17935612508319394087": ["convolution_gpu_yxfb_yxio_b16",2], + "6126073246053235472": ["convolution_gpu_yxfb_yxio_b16",2], + "1885075753696445410": ["convolution_gpu_bfyx_direct_10_12_16",0], + "786401653335542559": ["convolution_gpu_bfyx_gemm_like",2], + "123026136670202868": ["convolution_gpu_yxfb_yxio_b16",2], + "4999171487916568471": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "1718634913016284523": ["convolution_gpu_bfyx_1x1",2], + "14204609663091442879": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10702465758376061967": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12181310683533105454": ["fully_connected_gpu_fb_oi_ref",1], + "15765592038173567297": ["convolution_gpu_yxfb_yxio_b16",2], + "9748307611165615848": ["convolution_gpu_bfyx_gemm_like",2], + "15497263259976427714": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "6817494598328071314": 
["convolution_gpu_bfyx_os_iyx_osv16",1080], + "12234313962656804631": ["convolution_gpu_bfyx_gemm_like",2], + "6964383468476265892": ["convolution_gpu_bfyx_1x1",1], + "155962454315573087": ["convolution_gpu_yxfb_yxio_b16",2], + "6193161166790398003": ["convolution_gpu_bfyx_gemm_like",2], + "15681189418847392587": ["convolution_gpu_bfyx_os_iyx_osv16",857], + "9989055862610193828": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "3779229442395464456": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15678768217453692725": ["convolution_gpu_yxfb_yxio_b16",1], + "6950586691727980329": ["convolution_gpu_yxfb_yxio_b16",1], + "1952863937205473292": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "3365786526859737112": ["convolution_gpu_yxfb_yxio_b16",1], + "6022695488769618639": ["convolution_gpu_yxfb_yxio_b16",2], + "11612044653200304877": ["convolution_gpu_yxfb_yxio_b16",2], + "12960590161485806657": ["convolution_gpu_bfyx_gemm_like",2], + "8747430148550634190": ["convolution_gpu_bfyx_gemm_like",2], + "15223779293313750042": ["convolution_gpu_yxfb_yxio_b16",2], + "14749758365915995876": ["convolution_gpu_yxfb_yxio_b16",2], + "11973034261101454380": ["convolution_gpu_yxfb_yxio_b16",2], + "8205640825965213946": ["convolution_gpu_yxfb_yxio_b16",1], + "10774872391768741315": ["convolution_gpu_yxfb_yxio_b16",2], + "11564071490267241224": ["convolution_gpu_yxfb_yxio_b16",2], + "12604104383683210104": ["convolution_gpu_bfyx_os_iyx_osv16",216], + "9700808806849459216": ["convolution_gpu_bfyx_1x1",2], + "3658599312236344017": ["convolution_gpu_yxfb_yxio_b16",2], + "11198301748997371475": ["convolution_gpu_bfyx_gemm_like",1], + "8974851555526896131": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "9500850790449116723": ["convolution_gpu_bfyx_os_iyx_osv16",1036], + "16781127329510211966": ["convolution_gpu_bfyx_gemm_like",2], + "12352923639732112511": ["convolution_gpu_bfyx_os_iyx_osv16",455], + "9354818521586974021": ["convolution_gpu_bfyx_gemm_like",2], + "3976736548270395981": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5576305720733717044": ["convolution_gpu_yxfb_yxio_b16",1], + "9580986168276580598": ["convolution_gpu_bfyx_gemm_like",1], + "8584375748627260395": ["convolution_gpu_yxfb_yxio_b16",2], + "4635570915184713874": ["convolution_gpu_bfyx_gemm_like",2], + "11627532066884923848": ["convolution_gpu_bfyx_1x1",2], + "18040183500393090505": ["convolution_gpu_yxfb_yxio_b16",1], + "4291531885506213180": ["convolution_gpu_yxfb_yxio_b16",2], + "875400109066360897": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "12476381811279163147": ["convolution_gpu_yxfb_yxio_b16",2], + "12972798847556569913": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "2936333406928424760": ["convolution_gpu_bfyx_1x1",2], + "6081038474197004540": ["convolution_gpu_yxfb_yxio_b16",1], + "577842450575835175": ["convolution_gpu_yxfb_yxio_b16",2], + "401304652492444430": ["convolution_gpu_bfyx_gemm_like",2], + "13009612703754510124": ["convolution_gpu_yxfb_yxio_b16",2], + "5020763861388859254": ["convolution_gpu_bfyx_gemm_like",2], + "10280619408766255552": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "16681690088928624738": ["convolution_gpu_bfyx_gemm_like",2], + "2173163618947713953": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "12477315042623518609": ["convolution_gpu_yxfb_yxio_b16",2], + "3067930325929862490": ["convolution_gpu_yxfb_yxio_b16",2], + "17759505449240263390": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "13646974121952099172": ["convolution_gpu_bfyx_gemm_like",2], + "7208008921815475393": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "5012013738970489338": ["convolution_gpu_bfyx_1x1",1], + "8735534480653818425": ["convolution_gpu_yxfb_yxio_b16",2], + "4627958043707973483": ["convolution_gpu_yxfb_yxio_b16",1], + "10486348549691280032": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "6788311046557489996": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "16610284927818475574": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "13618411266808159341": ["convolution_gpu_yxfb_yxio_b16",1], + "14412158605670555579": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "2188101366183302888": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12923298574715329852": ["convolution_gpu_yxfb_yxio_b16",2], + "1531349457115735845": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "12181889163404078773": ["convolution_gpu_bfyx_os_iyx_osv16",1042], + "8576733135863336233": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "1367483816197881270": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5751553671208192963": ["convolution_gpu_yxfb_yxio_b16",2], + "13781423818051299677": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "2882493407831196579": ["fully_connected_gpu_fb_io_block_fp16",2], + "7232326270078161768": ["convolution_gpu_bfyx_gemm_like",2], + "17778091287904736965": ["convolution_gpu_bfyx_gemm_like",2], + "4113061482402915179": ["convolution_gpu_yxfb_yxio_b16",2], + "8732952254407298868": ["convolution_gpu_bfyx_gemm_like",0], + "5564881878876582769": ["convolution_gpu_yxfb_yxio_b16",2], + "3217674729821898463": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "14757749560543979231": ["convolution_gpu_bfyx_os_iyx_osv16",273], + "15720012960520885263": ["convolution_gpu_yxfb_yxio_b16",1], + "3286250915720444467": ["convolution_gpu_yxfb_yxio_b16",2], + "1126499865206906037": ["convolution_gpu_bfyx_os_iyx_osv16",524], + "10292585962794261197": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "5124080536266387783": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4633923265089466898": ["convolution_gpu_bfyx_os_iyx_osv16",137], + "9090828337597312855": ["convolution_gpu_bfyx_gemm_like",2], + "11270855425262923989": ["convolution_gpu_yxfb_yxio_b16",2], + "8761283252495354972": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "18187345248160481425": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "8270591002934311024": ["convolution_gpu_bfyx_1x1",2], + "9152433123828445089": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4880150897829846031": ["convolution_gpu_bfyx_1x1",1], + "13531892014108749846": ["convolution_gpu_yxfb_yxio_b16",2], + "3622778166646258015": ["convolution_gpu_yxfb_yxio_b16",1], + "12745631396795162505": ["convolution_gpu_yxfb_yxio_b16",2], + "6948455759869670955": ["convolution_gpu_yxfb_yxio_b16",2], + "12721294268595880422": ["convolution_gpu_yxfb_yxio_b16",2], + "17439102502195540957": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "13814086981499638596": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "1198893312653197535": ["convolution_gpu_yxfb_yxio_b16",2], + "18008552719153887303": ["convolution_gpu_bfyx_os_iyx_osv16",7], + "7121708962074176240": ["convolution_gpu_bfyx_1x1",2], + "597073780328219388": ["convolution_gpu_bfyx_gemm_like",2], + "636447309806530300": ["convolution_gpu_yxfb_yxio_b16",2], + "3202085450628781999": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "16768497046700403748": ["convolution_gpu_yxfb_yxio_b16",2], + "15167962750603978874": ["convolution_gpu_yxfb_yxio_b16",2], + "6267138247577676996": ["convolution_gpu_yxfb_yxio_b16",2], + "7895030495055232460": ["convolution_gpu_bfyx_os_iyx_osv16",351], 
+ "17342198739672369885": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2576773809294607971": ["convolution_gpu_yxfb_yxio_b16",2], + "4124478505694604763": ["convolution_gpu_bfyx_1x1",2], + "3962138884698789654": ["convolution_gpu_yxfb_yxio_b16",2], + "10547134120307382906": ["convolution_gpu_yxfb_yxio_b16",2], + "7223801044761006523": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "377219085802486361": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9435086287598656868": ["convolution_gpu_yxfb_yxio_b16",2], + "16000753982895054944": ["convolution_gpu_bfyx_gemm_like",1], + "10706180189726741161": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16711142379173254655": ["convolution_gpu_yxfb_yxio_b16",2], + "3171354702636014224": ["convolution_gpu_yxfb_yxio_b16",2], + "11007175027950132719": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "11194372303922533529": ["convolution_gpu_yxfb_yxio_b16",2], + "13705072264927031658": ["convolution_gpu_yxfb_yxio_b16",2], + "1170380397764345558": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "9576962489937466093": ["convolution_gpu_yxfb_yxio_b16",2], + "4445913285957791409": ["convolution_gpu_yxfb_yxio_b16",1], + "12026482841341343242": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "17734480671864478402": ["convolution_gpu_yxfb_yxio_b16",2], + "5112480593385320005": ["convolution_gpu_yxfb_yxio_b16",2], + "6603778920476932267": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6253009218981124949": ["convolution_gpu_yxfb_yxio_b16",2], + "16531824466148265247": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "4850497746076450913": ["convolution_gpu_bfyx_gemm_like",2], + "14971270053929063630": ["convolution_gpu_yxfb_yxio_b16",2], + "6863331059471727622": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "9305861997313663528": ["convolution_gpu_bfyx_gemm_like",2], + "14616969385577243225": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "522313477023837056": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "12256193738921380409": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "9096495972770198040": ["convolution_gpu_yxfb_yxio_b16",2], + "3056212889689424946": ["convolution_gpu_bfyx_1x1",2], + "426827405952656362": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "693883892843558363": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "5440983284868981549": ["convolution_gpu_bfyx_gemm_like",2], + "12871555773123368130": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11088324811742486481": ["convolution_gpu_bfyx_gemm_like",2], + "4398371999113956082": ["convolution_gpu_bfyx_gemm_like",2], + "14774814395786139876": ["convolution_gpu_yxfb_yxio_b16",2], + "12947341728489226671": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16242136888057221574": ["convolution_gpu_yxfb_yxio_b16",2], + "13777174566683935109": ["convolution_gpu_yxfb_yxio_b16",2], + "5337351591182109481": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "6249875772709398338": ["convolution_gpu_yxfb_yxio_b16",2], + "913861052717410566": ["convolution_gpu_yxfb_yxio_b16",2], + "1114679698826953542": ["convolution_gpu_yxfb_yxio_b16",1], + "14916625550370402883": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "5516518048239364231": ["convolution_gpu_bfyx_os_iyx_osv16",479], + "2581414750854621875": ["convolution_gpu_bfyx_os_iyx_osv16",559], + "12327057172281102984": ["convolution_gpu_yxfb_yxio_b16",2], + "576164857039495839": ["convolution_gpu_yxfb_yxio_b16",2], + "18184621367843960190": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "6341363789473021047": ["convolution_gpu_yxfb_yxio_b16",2], + "1325669650629605592": 
["convolution_gpu_bfyx_os_iyx_osv16",725], + "5106072383853469966": ["convolution_gpu_yxfb_yxio_b16",1], + "7800262579057534804": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "10935309102034762723": ["convolution_gpu_bfyx_1x1",1], + "9453100135791813000": ["convolution_gpu_yxfb_yxio_b16",2], + "18299254635579957284": ["convolution_gpu_bfyx_1x1",2], + "3856976081672275637": ["convolution_gpu_bfyx_os_iyx_osv16",279], + "6288489890578212082": ["convolution_gpu_bfyx_gemm_like",2], + "12065769091972094756": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "4937688558707451907": ["convolution_gpu_yxfb_yxio_b16",2], + "6317575981520135028": ["convolution_gpu_bfyx_gemm_like",1], + "15531306520021286502": ["convolution_gpu_bfyx_gemm_like",2], + "6580334406272192111": ["fully_connected_gpu_fb_io_ref",0], + "13809330759308309353": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "13919204232414535363": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "7683334381958571864": ["convolution_gpu_bfyx_gemm_like",2], + "3017824560305532066": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "4980217316169616839": ["convolution_gpu_bfyx_1x1",2], + "226601879759378771": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "2282123636764935353": ["convolution_gpu_yxfb_yxio_b16",2], + "13978649386370395620": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "4279062247055842367": ["convolution_gpu_bfyx_gemm_like",1], + "14553856088069405595": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "5942742563827424666": ["convolution_gpu_yxfb_yxio_b16",2], + "671453551040072499": ["convolution_gpu_bfyx_gemm_like",2], + "2737064424879246276": ["convolution_gpu_bfyx_gemm_like",2], + "18087356517015630281": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "3336076058264596420": ["convolution_gpu_bfyx_gemm_like",2], + "17791024851737594885": ["convolution_gpu_bfyx_1x1",2], + "17252589865292797082": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "11727227430687227444": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12370729327673204804": ["convolution_gpu_bfyx_gemm_like",2], + "13251091004269229867": ["convolution_gpu_bfyx_gemm_like",2], + "2817919813339364130": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17209528805596238905": ["convolution_gpu_bfyx_gemm_like",2], + "3538679039078582272": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "5242271874488296527": ["convolution_gpu_bfyx_gemm_like",2], + "12768933181342249823": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "2173720698351153121": ["convolution_gpu_bfyx_gemm_like",2], + "1120455113299469776": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "11109044986816563101": ["convolution_gpu_yxfb_yxio_b16",2], + "17682152011630274259": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "15322609677356616580": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "11756650366229979428": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "18214716801063702171": ["convolution_gpu_yxfb_yxio_b16",2], + "10722782762733112118": ["convolution_gpu_bfyx_1x1",2], + "15636128989267984459": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13409744191227471760": ["convolution_gpu_bfyx_gemm_like",1], + "8236018377815149638": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "57372993988016244": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "13912728810446567016": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "14324166291904435508": ["convolution_gpu_yxfb_yxio_b16",2], + "15193403354218116460": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15418732002117930760": ["convolution_gpu_yxfb_yxio_b16",2], + "8803037667261582905": 
["convolution_gpu_bfyx_gemm_like",1], + "9017605508157213607": ["convolution_gpu_yxfb_yxio_b16",2], + "17617204422090117691": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2506424495656099512": ["convolution_gpu_yxfb_yxio_b16",2], + "16717713360264747483": ["convolution_gpu_bfyx_gemm_like",2], + "13121297281694293907": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "3423717644513543253": ["convolution_gpu_yxfb_yxio_b16",2], + "4325081100430903742": ["convolution_gpu_bfyx_gemm_like",1], + "3652414035262499383": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "10532183096485321729": ["convolution_gpu_bfyx_1x1",2], + "17811558714592064184": ["convolution_gpu_yxfb_yxio_b16",2], + "7977195117668583981": ["convolution_gpu_bfyx_gemm_like",2], + "8671491767142900139": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "3816674884393241704": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "13364676690016875118": ["convolution_gpu_bfyx_os_iyx_osv16",926], + "12680688623162482255": ["convolution_gpu_bfyx_1x1",2], + "7824075236081312706": ["convolution_gpu_yxfb_yxio_b16",2], + "15159534367247036982": ["convolution_gpu_yxfb_yxio_b16",2], + "14074996784220709246": ["convolution_gpu_yxfb_yxio_b16",2], + "8021915447462898777": ["convolution_gpu_bfyx_gemm_like",0], + "1973819632224480598": ["convolution_gpu_yxfb_yxio_b16",1], + "12411075288896909468": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4916769804113823482": ["convolution_gpu_bfyx_1x1",2], + "2908156087871187676": ["convolution_gpu_yxfb_yxio_b16",2], + "1563987925712579649": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "8303211644727914658": ["convolution_gpu_bfyx_1x1",2], + "8614534946699754256": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7800015766976654402": ["convolution_gpu_bfyx_gemm_like",2], + "13851851281384416649": ["convolution_gpu_bfyx_1x1",1], + "6217542346826403576": ["convolution_gpu_bfyx_1x1",2], + "11557032521956761994": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "13199672084171648305": ["convolution_gpu_yxfb_yxio_b16",2], + "15530407024531326375": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "3225866261943242708": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10765280349477640969": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "17876939980356283351": ["convolution_gpu_yxfb_yxio_b16",2], + "2877521658768725103": ["convolution_gpu_bfyx_gemm_like",0], + "6280726148869856021": ["convolution_gpu_yxfb_yxio_b16",2], + "10471519687597963116": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8431759922045602848": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8470959792634864749": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "13501352378461071771": ["convolution_gpu_yxfb_yxio_b16",2], + "11845189428639322474": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "11825293922127550847": ["convolution_gpu_bfyx_gemm_like",2], + "14908477489231326997": ["convolution_gpu_yxfb_yxio_b16",2], + "7995820969034996638": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "12767065362702304803": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "13512863534076172940": ["convolution_gpu_bfyx_gemm_like",2], + "8039645104667120991": ["convolution_gpu_yxfb_yxio_b16",2], + "18186615266760475767": ["convolution_gpu_bfyx_os_iyx_osv16",192], + "10316451248440741901": ["convolution_gpu_bfyx_gemm_like",1], + "563440246018637010": ["convolution_gpu_yxfb_yxio_b16",2], + "17361714725103230834": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "16182470664818268848": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5668538167635622474": 
["convolution_gpu_bfyx_os_iyx_osv16",1066], + "7532088618116521936": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "142650579335909103": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "9692654253261175490": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "13130001092233798285": ["convolution_gpu_yxfb_yxio_b16",2], + "59356084516953804": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "8185193068790365354": ["convolution_gpu_bfyx_gemm_like",2], + "3860667078458481972": ["convolution_gpu_bfyx_gemm_like",1], + "17225552472711821360": ["convolution_gpu_bfyx_os_iyx_osv16",946], + "4759671642533786591": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "14132543442791497311": ["convolution_gpu_yxfb_yxio_b16",2], + "15003778740401601065": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5649150695527000655": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16341131728764501904": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "8183383667948205424": ["convolution_gpu_yxfb_yxio_b16",2], + "4701235352806075765": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "5295693108687178880": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8567667881970262923": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "2777318471329665162": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "13527018660229167386": ["convolution_gpu_yxfb_yxio_b16",1], + "3534971503826416049": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "7056030150365552588": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "8943913562339525413": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10128390168715530898": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "4274425737610351312": ["convolution_gpu_bfyx_gemm_like",2], + "14999920879568237166": ["convolution_gpu_bfyx_1x1",2], + "4378422094110940766": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5284456216115118110": ["convolution_gpu_yxfb_yxio_b16",2], + "3833510944499257797": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "2439993891369206440": ["convolution_gpu_bfyx_1x1",2], + "7902473777019759045": ["convolution_gpu_bfyx_gemm_like",2], + "10322427853063201289": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "15796677813117622429": ["convolution_gpu_bfyx_gemm_like",2], + "15497797842820949408": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7762916621662364082": ["convolution_gpu_yxfb_yxio_b16",2], + "4072951883124129646": ["convolution_gpu_yxfb_yxio_b16",1], + "1008476023750261156": ["convolution_gpu_bfyx_1x1",2], + "12384317536636082264": ["convolution_gpu_bfyx_direct_10_12_16",0], + "15578456771467281881": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12741762570001404232": ["convolution_gpu_yxfb_yxio_b16",1], + "2581014920570427861": ["convolution_gpu_yxfb_yxio_b16",2], + "5854093367753757010": ["convolution_gpu_yxfb_yxio_b16",2], + "15104727000375811836": ["convolution_gpu_yxfb_yxio_b16",2], + "13966416504547680082": ["convolution_gpu_yxfb_yxio_b16",2], + "16620268338434572068": ["convolution_gpu_yxfb_yxio_b16",1], + "875296362957469305": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "708452703070938673": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "7405315582091905378": ["convolution_gpu_yxfb_yxio_b16",1], + "12961109385388101976": ["convolution_gpu_yxfb_yxio_b16",0], + "998876398773540321": ["convolution_gpu_bfyx_1x1",1], + "10463632805036507382": ["convolution_gpu_yxfb_yxio_b16",2], + "1485662490111767875": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "5552699731399195573": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "13038533272699602337": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "15586047342916704364": 
["convolution_gpu_bfyx_os_iyx_osv16",530], + "12013818650853034767": ["convolution_gpu_yxfb_yxio_b16",2], + "13314092088416047551": ["fully_connected_gpu_yxfb_ref",1], + "11079061135559995449": ["convolution_gpu_yxfb_yxio_b16",1], + "7364084475361144967": ["convolution_gpu_yxfb_yxio_b16",1], + "6708349666663292171": ["fully_connected_gpu_fb_oi_ref",2], + "7441188930428385142": ["convolution_gpu_yxfb_yxio_b16",1], + "3830842631023415233": ["convolution_gpu_yxfb_yxio_b16",2], + "11073090858361674041": ["convolution_gpu_yxfb_yxio_b16",2], + "8611873585228858719": ["convolution_gpu_yxfb_yxio_b16",2], + "12908594497114706897": ["convolution_gpu_bfyx_1x1",2], + "3047407458812880288": ["convolution_gpu_yxfb_yxio_b16",2], + "1123577455191848310": ["convolution_gpu_bfyx_gemm_like",2], + "13478984039708550410": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "17737878867906137388": ["convolution_gpu_yxfb_yxio_b16",2], + "16667887002111125871": ["convolution_gpu_bfyx_gemm_like",2], + "16788715253205076219": ["fully_connected_gpu_fb_oi_ref",1], + "17559750858236255044": ["convolution_gpu_yxfb_yxio_b16",2], + "731825454731954517": ["convolution_gpu_bfyx_gemm_like",2], + "10002044609138970243": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7998930863626763670": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "11012427206693842637": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "3429844423226609965": ["convolution_gpu_bfyx_gemm_like",2], + "5977875644245993099": ["convolution_gpu_yxfb_yxio_b16",1], + "411016281538345537": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10766317990628501609": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "9737833587413114584": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16161974964662774501": ["convolution_gpu_yxfb_yxio_b16",2], + "7571716782558859443": ["convolution_gpu_yxfb_yxio_b16",2], + "13291402786934990349": ["convolution_gpu_yxfb_yxio_b16",2], + "8177017967170389275": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15811723176266128065": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "17490471699618303993": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "18033349045324117723": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "11500205299047837289": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "4947788161154370784": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "13585916416233680276": ["convolution_gpu_yxfb_yxio_b16",2], + "15069906408448814772": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "8611710048909301596": ["convolution_gpu_yxfb_yxio_b16",2], + "3366647240745174769": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "5421397731090158382": ["convolution_gpu_yxfb_yxio_b16",1], + "577844026691991089": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6962062962411903140": ["convolution_gpu_yxfb_yxio_b16",2], + "1148949417144436507": ["convolution_gpu_yxfb_yxio_b16",2], + "12165079289914715018": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "1208665743495618456": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5788018146987909930": ["convolution_gpu_yxfb_yxio_b16",2], + "1354647381212852890": ["convolution_gpu_bfyx_1x1",2], + "3914143598803149415": ["convolution_gpu_yxfb_yxio_b16",2], + "3216877571075556066": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "13734043898517059207": ["convolution_gpu_bfyx_gemm_like",1], + "12004552919019936392": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "11921652085115182024": ["convolution_gpu_yxfb_yxio_b16",2], + "586134723922638373": ["convolution_gpu_bfyx_gemm_like",2], + "10128120599276549920": ["convolution_gpu_bfyx_1x1",1], + 
"9955816463820554626": ["convolution_gpu_yxfb_yxio_b16",2], + "10560559646371329711": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18279416225045612845": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2819475920524949313": ["convolution_gpu_yxfb_yxio_b16",2], + "7465681710653503161": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "7861119251077361882": ["convolution_gpu_yxfb_yxio_b16",2], + "7369109502608631066": ["convolution_gpu_yxfb_yxio_b16",2], + "1216021647922150199": ["convolution_gpu_yxfb_yxio_b16",2], + "4238885454989272754": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "2816339200381598722": ["convolution_gpu_yxfb_yxio_b16",2], + "2524029454785583409": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "1141277975467180549": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "7107677063657303327": ["convolution_gpu_bfyx_1x1",2], + "8079914471491171372": ["convolution_gpu_yxfb_yxio_b16",1], + "621915374938805401": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "14263790627243107300": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5834245904292669645": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "17770104464900126615": ["convolution_gpu_bfyx_1x1",2], + "6859143702528475520": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "4098581145478965082": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "11241838709529552265": ["convolution_gpu_bfyx_os_iyx_osv16",858], + "6800893510381991731": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "8951040603784899163": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "331661172067077796": ["convolution_gpu_bfyx_1x1",2], + "1044978617045366709": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "9728611486592854529": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "9835739612255048978": ["convolution_gpu_bfyx_os_iyx_osv16",878], + "3932617680771387232": ["convolution_gpu_yxfb_yxio_b16",2], + "5498839261395459224": ["convolution_gpu_bfyx_gemm_like",1], + "15308578014507211237": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "6310724136390087834": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "7880845322716481548": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "18161786710055240343": ["convolution_gpu_bfyx_os_iyx_osv16",951], + "12388894315292201102": ["convolution_gpu_yxfb_yxio_b16",2], + "16772854836230971016": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "6723804327185132790": ["convolution_gpu_bfyx_gemm_like",2], + "10294610483561043024": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10486000767830001094": ["convolution_gpu_bfyx_1x1",2], + "15576534481170615301": ["convolution_gpu_yxfb_yxio_b16",2], + "7223737889890738294": ["convolution_gpu_yxfb_yxio_b16",2], + "16341700680310033430": ["fully_connected_gpu_fb_io_block_fp16",2], + "10996429218747311159": ["convolution_gpu_yxfb_yxio_b16",1], + "8497468192424557348": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1551596771935253711": ["convolution_gpu_bfyx_gemm_like",1], + "5895417825685090256": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "7317391511452227268": ["convolution_gpu_bfyx_gemm_like",2], + "2147962310424425158": ["convolution_gpu_yxfb_yxio_b16",2], + "3244675355773468991": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "6712698149192186833": ["convolution_gpu_bfyx_gemm_like",2], + "9940761514291929473": ["convolution_gpu_yxfb_yxio_b16",2], + "5288793454052261767": ["convolution_gpu_bfyx_gemm_like",2], + "2968439898708528834": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "3806806400778685133": ["convolution_gpu_yxfb_yxio_b16",2], + "16161112020028389294": ["convolution_gpu_yxfb_yxio_b16",2], + 
"2041212737963974230": ["convolution_gpu_bfyx_gemm_like",2], + "7590734607006912544": ["convolution_gpu_yxfb_yxio_b16",2], + "9502195532658935521": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17270057383792994793": ["convolution_gpu_yxfb_yxio_b16",2], + "13830605041347009953": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "3239033622277917802": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "16327433707667075261": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "863057075064640334": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "14273849038400888518": ["convolution_gpu_yxfb_yxio_b16",2], + "360064276184684693": ["convolution_gpu_yxfb_yxio_b16",1], + "4597873630741623918": ["convolution_gpu_yxfb_yxio_b16",1], + "69832608384091511": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "7260204889552803221": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "14034525799882831106": ["convolution_gpu_bfyx_gemm_like",2], + "4169042131399110713": ["convolution_gpu_yxfb_yxio_b16",2], + "1089944493540593798": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "6556424924189200804": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "8931469268093714938": ["convolution_gpu_yxfb_yxio_b16",1], + "4208702365182336507": ["convolution_gpu_yxfb_yxio_b16",2], + "13914239937595549448": ["convolution_gpu_yxfb_yxio_b16",2], + "15488532485794545310": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "3231651468686543808": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "241656278218999298": ["convolution_gpu_yxfb_yxio_b16",2], + "10956668791040094584": ["convolution_gpu_yxfb_yxio_b16",2], + "844576097677576405": ["convolution_gpu_yxfb_yxio_b16",2], + "3631332752661975859": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "15421280195211166867": ["convolution_gpu_yxfb_yxio_b16",2], + "14823789570149356458": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "7104266560248570112": ["convolution_gpu_yxfb_yxio_b16",2], + "768720470104458759": ["convolution_gpu_bfyx_os_iyx_osv16",265], + "8075261051536686307": ["convolution_gpu_bfyx_os_iyx_osv16",665], + "4142555169083069413": ["convolution_gpu_bfyx_gemm_like",2], + "12501619443242354860": ["convolution_gpu_bfyx_gemm_like",2], + "9541630719145326121": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "11987564534722442223": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "16789245987103323406": ["convolution_gpu_bfyx_gemm_like",2], + "13200834963067135502": ["fully_connected_gpu_fb_oi_ref",1], + "826850797666395121": ["convolution_gpu_bfyx_gemm_like",2], + "14280056365441354869": ["convolution_gpu_yxfb_yxio_b16",2], + "3766048787611884529": ["convolution_gpu_yxfb_yxio_b16",2], + "12878346173547852969": ["convolution_gpu_yxfb_yxio_b16",2], + "8723078862651154959": ["convolution_gpu_yxfb_yxio_b16",2], + "135072053401934228": ["convolution_gpu_bfyx_1x1",2], + "8115522418294960470": ["convolution_gpu_yxfb_yxio_b16",2], + "15998609626878578708": ["convolution_gpu_yxfb_yxio_b16",2], + "2149299205144202701": ["convolution_gpu_yxfb_yxio_b16",2], + "5940007433515335594": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1539677456611270609": ["convolution_gpu_yxfb_yxio_b16",2], + "4683320313995550908": ["convolution_gpu_yxfb_yxio_b16",2], + "15060535689318007173": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "18239740525818575112": ["convolution_gpu_yxfb_yxio_b16",2], + "9814647153117279415": ["convolution_gpu_yxfb_yxio_b16",2], + "13483088320871913126": ["convolution_gpu_bfyx_gemm_like",1], + "6362428985273506890": ["convolution_gpu_bfyx_1x1",2], + "3868149953087814447": ["convolution_gpu_bfyx_gemm_like",1], 
+ "15932838442166411183": ["convolution_gpu_yxfb_yxio_b16",2], + "4165036357594592683": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11971853138084108953": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "11120846960057008937": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "721174714308243785": ["convolution_gpu_bfyx_gemm_like",2], + "6290584630172122012": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "13636407347458845915": ["convolution_gpu_yxfb_yxio_b16",2], + "15534876725099279666": ["convolution_gpu_yxfb_yxio_b16",2], + "9967611023372430532": ["convolution_gpu_bfyx_gemm_like",2], + "14152716242882609401": ["convolution_gpu_bfyx_gemm_like",1], + "4201057957682777280": ["convolution_gpu_yxfb_yxio_b16",1], + "17201365233492366678": ["convolution_gpu_bfyx_gemm_like",2], + "12311849904266608701": ["convolution_gpu_yxfb_yxio_b16",2], + "5738835498104275267": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "10815244730103375973": ["convolution_gpu_yxfb_yxio_b16",1], + "5214654427283761256": ["convolution_gpu_bfyx_gemm_like",2], + "12526988667216482085": ["convolution_gpu_yxfb_yxio_b16",2], + "13077917010686381919": ["convolution_gpu_yxfb_yxio_b16",2], + "16828961272295386615": ["convolution_gpu_bfyx_os_iyx_osv16",539], + "9695024256541464964": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "12046017161414846599": ["convolution_gpu_bfyx_1x1",2], + "17344974951998490453": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8622014461615231500": ["convolution_gpu_yxfb_yxio_b16",2], + "15438470456977849772": ["convolution_gpu_yxfb_yxio_b16",2], + "1868805550246252143": ["convolution_gpu_yxfb_yxio_b16",2], + "7846384623429362522": ["convolution_gpu_bfyx_1x1",1], + "6388117241933586388": ["convolution_gpu_bfyx_gemm_like",2], + "15188570678726970998": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12002302929446578025": ["convolution_gpu_yxfb_yxio_b16",2], + "18135307303959376082": ["convolution_gpu_bfyx_gemm_like",2], + "7264274394359484318": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "14808759315730413993": ["convolution_gpu_yxfb_yxio_b16",2], + "3211829722778368758": ["convolution_gpu_yxfb_yxio_b16",2], + "5097818987523855112": ["convolution_gpu_bfyx_gemm_like",2], + "16950925976172895196": ["convolution_gpu_yxfb_yxio_b16",2], + "15728009639807698634": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "3370082268529091875": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "7493567975736494003": ["convolution_gpu_bfyx_os_iyx_osv16",1019], + "15109847707903824859": ["convolution_gpu_bfyx_1x1",2], + "4683575221310726091": ["convolution_gpu_yxfb_yxio_b16",2], + "187352687850707150": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5401380444992462053": ["convolution_gpu_yxfb_yxio_b16",1], + "11052275099129482401": ["convolution_gpu_yxfb_yxio_b16",2], + "18417288692814472127": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "3854114166348568039": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "3819990462129075757": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "18249888571553409563": ["convolution_gpu_yxfb_yxio_b16",2], + "15612334131144235342": ["convolution_gpu_yxfb_yxio_b16",2], + "3001162215282339268": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "1082574490068006980": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "16895523130717954500": ["convolution_gpu_yxfb_yxio_b16",2], + "14236681916032484600": ["convolution_gpu_yxfb_yxio_b16",2], + "11823205954749139338": ["convolution_gpu_bfyx_gemm_like",2], + "2339864165283480961": ["convolution_gpu_bfyx_1x1",2], + "13352000946213986936": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + 
"2247717767819293683": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "4818598834950786080": ["convolution_gpu_yxfb_yxio_b16",2], + "4444730303823507621": ["convolution_gpu_bfyx_gemm_like",2], + "1419073145594317633": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "16335738565228204503": ["convolution_gpu_yxfb_yxio_b16",2], + "12193395770362986433": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "17224104246148265328": ["convolution_gpu_bfyx_gemm_like",2], + "13182623473102074079": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "11157773554806649837": ["convolution_gpu_yxfb_yxio_b16",2], + "2722124265986526212": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "8439950151963452285": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "2800949804770763798": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12267555886404772991": ["convolution_gpu_yxfb_yxio_b16",2], + "2847490224869294354": ["convolution_gpu_bfyx_gemm_like",0], + "9452470718398027950": ["convolution_gpu_bfyx_os_iyx_osv16",640], + "2215570184121152738": ["convolution_gpu_bfyx_gemm_like",2], + "5584145249514762750": ["convolution_gpu_yxfb_yxio_b16",2], + "5374969798377773063": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "12790570304622911607": ["convolution_gpu_bfyx_os_iyx_osv16",565], + "10837496380266058422": ["convolution_gpu_bfyx_gemm_like",2], + "13526488884846845330": ["convolution_gpu_bfyx_gemm_like",2], + "7913076120244203725": ["convolution_gpu_bfyx_gemm_like",2], + "11007944497812650617": ["convolution_gpu_bfyx_gemm_like",2], + "16773645387243701837": ["convolution_gpu_bfyx_gemm_like",2], + "4049224463072418218": ["convolution_gpu_yxfb_yxio_b16",1], + "3774285301357006334": ["convolution_gpu_bfyx_gemm_like",1], + "17462996923473002801": ["convolution_gpu_yxfb_yxio_b16",2], + "7119182041840303390": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5965451243366505522": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2226745622763268469": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "11614353411428360211": ["convolution_gpu_yxfb_yxio_b16",2], + "7565348337952384040": ["convolution_gpu_yxfb_yxio_b16",2], + "3782308167335660154": ["convolution_gpu_yxfb_yxio_b16",2], + "18037918102910297531": ["convolution_gpu_bfyx_os_iyx_osv16",732], + "8779960552750034544": ["convolution_gpu_yxfb_yxio_b16",2], + "3934090072734175564": ["convolution_gpu_yxfb_yxio_b16",2], + "880603384896315783": ["convolution_gpu_yxfb_yxio_b16",2], + "1658174263018326745": ["convolution_gpu_yxfb_yxio_b16",2], + "2917735110073643952": ["convolution_gpu_bfyx_gemm_like",2], + "9280279544075738476": ["convolution_gpu_yxfb_yxio_b16",1], + "12131461096501477069": ["convolution_gpu_yxfb_yxio_b16",2], + "14585000863294748739": ["convolution_gpu_bfyx_gemm_like",2], + "9928406318940388716": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "8540111719936129376": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "14712972289919865502": ["convolution_gpu_bfyx_gemm_like",1], + "12675840135830047968": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11761545976388416063": ["convolution_gpu_yxfb_yxio_b16",2], + "6769524481210107636": ["convolution_gpu_yxfb_yxio_b16",2], + "17107083637007906184": ["convolution_gpu_bfyx_gemm_like",2], + "8451212914744825089": ["convolution_gpu_bfyx_gemm_like",2], + "8469874583725132145": ["fully_connected_gpu_fb_oi_ref",1], + "4423866541063606768": ["convolution_gpu_bfyx_os_iyx_osv16",949], + "5922142661777925178": ["convolution_gpu_bfyx_gemm_like",1], + "11298854310398101852": ["convolution_gpu_yxfb_yxio_b16",2], + "12118387933632797428": 
["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4933831571091731212": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10718764522366711114": ["convolution_gpu_yxfb_yxio_b16",2], + "10912495395422146386": ["convolution_gpu_bfyx_gemm_like",2], + "4104562704039821482": ["convolution_gpu_bfyx_1x1",2], + "4138968242532400395": ["convolution_gpu_bfyx_gemm_like",1], + "15886016297043613632": ["convolution_gpu_yxfb_yxio_b16",1], + "15980348884716629349": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12253049204822930675": ["convolution_gpu_bfyx_gemm_like",1], + "12389854459474697184": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "16071723603031305677": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "15693204620575485046": ["convolution_gpu_yxfb_yxio_b16",2], + "16944335478353845609": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "7329924387620542330": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "14528180674573671874": ["convolution_gpu_yxfb_yxio_b16",2], + "5931972000452008090": ["convolution_gpu_yxfb_yxio_b16",2], + "1704404203639481753": ["convolution_gpu_bfyx_gemm_like",2], + "11291868421122092629": ["convolution_gpu_yxfb_yxio_b16",2], + "2923543983518895756": ["convolution_gpu_yxfb_yxio_b16",1], + "8506271633579173639": ["convolution_gpu_yxfb_yxio_b16",2], + "2759142157812694203": ["convolution_gpu_yxfb_yxio_b16",2], + "294153950488131608": ["convolution_gpu_yxfb_yxio_b16",2], + "1569043950563130463": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4216958486055161753": ["convolution_gpu_bfyx_os_iyx_osv16",105], + "6388086351909447495": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "11936530628363072904": ["convolution_gpu_bfyx_gemm_like",1], + "15267084369543546013": ["convolution_gpu_yxfb_yxio_b16",2], + "8260073247636023575": ["convolution_gpu_yxfb_yxio_b16",2], + "9407646138658641974": ["convolution_gpu_bfyx_gemm_like",1], + "7840653268996892538": ["convolution_gpu_bfyx_gemm_like",2], + "16436006771518788093": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "15770767768674603174": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12713821004129672990": ["convolution_gpu_yxfb_yxio_b16",2], + "14316077757957132678": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "7780140599533242850": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12816344078518706065": ["convolution_gpu_yxfb_yxio_b16",2], + "3499406509137418124": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "5291817530552764387": ["convolution_gpu_yxfb_yxio_b16",2], + "12977678792503377525": ["convolution_gpu_bfyx_gemm_like",1], + "9827201026276954165": ["convolution_gpu_yxfb_yxio_b16",2], + "14398854364550406668": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15971340431600153619": ["convolution_gpu_bfyx_os_iyx_osv16",726], + "9162862507585693061": ["convolution_gpu_yxfb_yxio_b16",2], + "14963449045970262346": ["convolution_gpu_yxfb_yxio_b16",0], + "16949056117405140365": ["convolution_gpu_bfyx_gemm_like",2], + "8251544171504007740": ["convolution_gpu_bfyx_gemm_like",2], + "17764795635957985989": ["convolution_gpu_yxfb_yxio_b16",2], + "1752185056297124917": ["convolution_gpu_bfyx_1x1",2], + "8913950860101596091": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "5672464491301994292": ["convolution_gpu_bfyx_gemm_like",2], + "10991423760161409883": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "1898243736289257252": ["convolution_gpu_yxfb_yxio_b16",2], + "4617347486560666277": ["convolution_gpu_bfyx_1x1",1], + "7273427309587902237": ["convolution_gpu_bfyx_gemm_like",2], + "2866656294663853474": ["convolution_gpu_bfyx_1x1",2], + 
"10100237101982273901": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "7561096442572829049": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12903015669020591018": ["convolution_gpu_yxfb_yxio_b16",2], + "8941904405273405481": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "16290551573997593168": ["convolution_gpu_bfyx_gemm_like",2], + "14944590179685661287": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "787203599734115483": ["convolution_gpu_bfyx_1x1",1], + "17089801601582809764": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "8323669961818535927": ["convolution_gpu_yxfb_yxio_b16",2], + "12623375499927200341": ["convolution_gpu_bfyx_gemm_like",2], + "10141927023849730720": ["convolution_gpu_bfyx_1x1",1], + "10883341041912056319": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "6577505360421510286": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "1390379098099686972": ["convolution_gpu_bfyx_1x1",2], + "9643408025778914022": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "4982549855424649217": ["convolution_gpu_yxfb_yxio_b16",2], + "15295172519920136220": ["convolution_gpu_yxfb_yxio_b16",2], + "15750539817895707253": ["convolution_gpu_yxfb_yxio_b16",2], + "12585864429067596351": ["convolution_gpu_yxfb_yxio_b16",1], + "16307464696265537356": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "16218339663410630711": ["convolution_gpu_bfyx_gemm_like",2], + "273242667845386507": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "1617135706549276688": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "16725049805030712400": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "3738514326459749974": ["convolution_gpu_yxfb_yxio_b16",1], + "16767392067294252396": ["convolution_gpu_bfyx_gemm_like",2], + "17726079670612220433": ["convolution_gpu_bfyx_gemm_like",2], + "11800783548769329949": ["convolution_gpu_bfyx_gemm_like",2], + "13598062803968442253": ["convolution_gpu_yxfb_yxio_b16",2], + "14283458015244508428": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17498483343394902796": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "12027202455592387086": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "5834825835421819800": ["convolution_gpu_yxfb_yxio_b16",2], + "816527348871309530": ["convolution_gpu_yxfb_yxio_b16",2], + "8321204816277460837": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "10626018319543075871": ["convolution_gpu_yxfb_yxio_b16",2], + "7689320135952025041": ["convolution_gpu_bfyx_gemm_like",2], + "11891319657803057127": ["convolution_gpu_yxfb_yxio_b16",2], + "3308955824300750921": ["convolution_gpu_yxfb_yxio_b16",2], + "1095959046309466012": ["convolution_gpu_yxfb_yxio_b16",2], + "14184895905338394239": ["convolution_gpu_bfyx_gemm_like",2], + "13800760323805415740": ["convolution_gpu_bfyx_gemm_like",1], + "5145853681977610916": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "11706446082856895571": ["convolution_gpu_bfyx_gemm_like",2], + "15488340031228619748": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "8746621720912032145": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "16563030700888982979": ["convolution_gpu_yxfb_yxio_b16",2], + "16706244336960642883": ["convolution_gpu_bfyx_gemm_like",1], + "430132942408244070": ["convolution_gpu_bfyx_gemm_like",2], + "11299021927882809469": ["convolution_gpu_yxfb_yxio_b16",2], + "9562527071055150197": ["convolution_gpu_bfyx_1x1",2], + "1250095876638711647": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "14079654309452583394": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7742126547476513275": ["convolution_gpu_yxfb_yxio_b16",2], + "3343020946662226400": 
["convolution_gpu_bfyx_os_iyx_osv16",258], + "15746620724134970969": ["convolution_gpu_bfyx_1x1",1], + "2670216237572554944": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "15363606233048272809": ["convolution_gpu_bfyx_1x1",2], + "937159502066696999": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "13735180250757239202": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "17764033613416389758": ["convolution_gpu_bfyx_gemm_like",2], + "15779837958180258409": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "7395419333138772074": ["convolution_gpu_yxfb_yxio_b16",1], + "17658152048177750315": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "13158449455164143947": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "2782970766870172398": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15449715596597016714": ["convolution_gpu_bfyx_gemm_like",2], + "9809458159478958866": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "11224051407822914513": ["convolution_gpu_yxfb_yxio_b16",2], + "1788455099959676873": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "8045367391487213749": ["convolution_gpu_bfyx_1x1",2], + "11265472910579659280": ["convolution_gpu_bfyx_gemm_like",1], + "10308113903347312964": ["convolution_gpu_bfyx_gemm_like",2], + "4897690791599638716": ["convolution_gpu_yxfb_yxio_b16",2], + "4776685525963461501": ["convolution_gpu_yxfb_yxio_b16",2], + "938848188161536107": ["convolution_gpu_bfyx_1x1",2], + "16742058312847401360": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "17266121859044814533": ["convolution_gpu_yxfb_yxio_b16",2], + "4652136280940317116": ["convolution_gpu_bfyx_os_iyx_osv16",1116], + "14764715930784496165": ["convolution_gpu_bfyx_gemm_like",2], + "17580363505072477558": ["convolution_gpu_yxfb_yxio_b16",2], + "14578867494693499627": ["convolution_gpu_bfyx_gemm_like",2], + "9372916528346260712": ["convolution_gpu_bfyx_gemm_like",2], + "15148442194461613102": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "5941298590926032148": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7126667413990834481": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "223412492545617963": ["convolution_gpu_yxfb_yxio_b16",2], + "13621771094745539509": ["convolution_gpu_yxfb_yxio_b16",2], + "9871407256481442790": ["convolution_gpu_yxfb_yxio_b16",2], + "12391792381149655331": ["convolution_gpu_bfyx_gemm_like",2], + "2912098199463107173": ["convolution_gpu_bfyx_1x1",2], + "7815650257256675477": ["convolution_gpu_bfyx_os_iyx_osv16",1025], + "4461989328775275994": ["convolution_gpu_bfyx_gemm_like",2], + "13123709697607309884": ["convolution_gpu_yxfb_yxio_b16",1], + "14331554754171207866": ["convolution_gpu_bfyx_gemm_like",1], + "12015336418727455195": ["convolution_gpu_bfyx_1x1",2], + "13569941893504840630": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "15914107501176673997": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "16597170760061556882": ["convolution_gpu_yxfb_yxio_b16",2], + "14050124896329573468": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9485825829394109934": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15048584393463312977": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "15106614232165315070": ["convolution_gpu_bfyx_gemm_like",2], + "12913866095318048752": ["convolution_gpu_bfyx_gemm_like",2], + "73865742350616903": ["convolution_gpu_bfyx_gemm_like",1], + "8943651590146149679": ["convolution_gpu_yxfb_yxio_b16",2], + "6469277112054008613": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "18322435770607273817": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "6862489207967519978": ["convolution_gpu_bfyx_gemm_like",2], + 
"11051684565403294370": ["convolution_gpu_yxfb_yxio_b16",2], + "14262482011051329729": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1498389965422474930": ["convolution_gpu_yxfb_yxio_b16",2], + "14766477690417085350": ["convolution_gpu_bfyx_1x1",2], + "14819324687394700033": ["convolution_gpu_bfyx_1x1",2], + "4574541202890196191": ["convolution_gpu_yxfb_yxio_b16",2], + "5884802375772043861": ["convolution_gpu_yxfb_yxio_b16",1], + "9272405129875537865": ["convolution_gpu_yxfb_yxio_b16",2], + "14445031303145992349": ["convolution_gpu_bfyx_os_iyx_osv16",677], + "15310474203328198827": ["convolution_gpu_yxfb_yxio_b16",2], + "16992405636352406660": ["convolution_gpu_bfyx_gemm_like",1], + "17854208422879910606": ["convolution_gpu_bfyx_gemm_like",2], + "17969195175890497912": ["convolution_gpu_yxfb_yxio_b16",2], + "9162359935098885411": ["convolution_gpu_yxfb_yxio_b16",2], + "9299299311101549958": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "3364141707903132298": ["convolution_gpu_yxfb_yxio_b16",2], + "3647203315640064927": ["convolution_gpu_yxfb_yxio_b16",2], + "17342758321852264926": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "4438526427135833402": ["convolution_gpu_yxfb_yxio_b16",2], + "4717620775314557374": ["convolution_gpu_bfyx_gemm_like",1], + "2921118493468368908": ["convolution_gpu_bfyx_gemm_like",1], + "852015206582470545": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "130427456111826171": ["convolution_gpu_yxfb_yxio_b16",2], + "14045927407431718832": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "10572945270796129630": ["fully_connected_gpu_fb_io_ref",1], + "4936961129835214448": ["convolution_gpu_bfyx_gemm_like",2], + "14171139920084409181": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11148428797294511280": ["convolution_gpu_yxfb_yxio_b16",2], + "13408839571805750778": ["convolution_gpu_yxfb_yxio_b16",2], + "14558572801374416278": ["convolution_gpu_bfyx_gemm_like",1], + "10005348255972308430": ["convolution_gpu_yxfb_yxio_b16",2], + "7585785802379042424": ["convolution_gpu_bfyx_1x1",2], + "8876704486585503280": ["convolution_gpu_yxfb_yxio_b16",2], + "17636500109629107732": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "3240102173773280414": ["convolution_gpu_bfyx_1x1",2], + "10174346112533671798": ["convolution_gpu_yxfb_yxio_b16",2], + "1527126728636583082": ["convolution_gpu_yxfb_yxio_b16",0], + "18121198117765854866": ["convolution_gpu_bfyx_1x1",2], + "17216583849049249733": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "12164298124869114517": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "9538863363710651909": ["convolution_gpu_yxfb_yxio_b16",2], + "3011188207492335920": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5224252360611200472": ["convolution_gpu_bfyx_gemm_like",2], + "14680730265621679042": ["convolution_gpu_bfyx_os_iyx_osv16",380], + "9642965664913867675": ["convolution_gpu_yxfb_yxio_b16",2], + "7397376454528841634": ["convolution_gpu_yxfb_yxio_b16",2], + "14742998604680438008": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "16661843849495077745": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "6921081008428242060": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "12353956380178079089": ["convolution_gpu_bfyx_gemm_like",2], + "10939522663236304689": ["convolution_gpu_yxfb_yxio_b16",2], + "13155570698198686211": ["convolution_gpu_yxfb_yxio_b16",2], + "9524303276541517389": ["convolution_gpu_bfyx_gemm_like",2], + "11939914680143672459": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "9590161922224578217": ["convolution_gpu_yxfb_yxio_b16",1], + "2797436491596125131": 
["convolution_gpu_yxfb_yxio_b16",2], + "16888412539296862194": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "7600034850149968684": ["convolution_gpu_yxfb_yxio_b16",0], + "15548847099740441551": ["convolution_gpu_bfyx_1x1",2], + "6839795451275143093": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "7869779894480025247": ["convolution_gpu_bfyx_gemm_like",2], + "11277866878590984477": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "15133468875250992696": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "9918371346247634545": ["convolution_gpu_bfyx_os_iyx_osv16",184], + "9390478179772073718": ["convolution_gpu_bfyx_gemm_like",1], + "14258941821319200170": ["convolution_gpu_yxfb_yxio_b16",2], + "2447893458816856522": ["convolution_gpu_bfyx_gemm_like",2], + "15078168059698267650": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2367452220382767844": ["convolution_gpu_yxfb_yxio_b16",2], + "3987482581128838173": ["convolution_gpu_yxfb_yxio_b16",2], + "7369903937189508744": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "1997392406402548974": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "10762489947656697207": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5924698731432597368": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "14487842225000203929": ["convolution_gpu_bfyx_gemm_like",2], + "5422432655714154738": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "3285968426413869315": ["convolution_gpu_yxfb_yxio_b16",1], + "11759426200341586247": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "13320473279945887641": ["convolution_gpu_yxfb_yxio_b16",2], + "10923480230259977438": ["convolution_gpu_bfyx_1x1",1], + "5008350851224686853": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "3814584042139408454": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "13668072006310741601": ["convolution_gpu_yxfb_yxio_b16",2], + "994252691216116396": ["convolution_gpu_yxfb_yxio_b16",1], + "149810021216592597": ["convolution_gpu_yxfb_yxio_b16",2], + "16475247464223458061": ["convolution_gpu_bfyx_gemm_like",2], + "4633763257197651352": ["convolution_gpu_yxfb_yxio_b16",2], + "16209868158768307271": ["convolution_gpu_bfyx_os_iyx_osv16",919], + "10572380563704942622": ["convolution_gpu_yxfb_yxio_b16",2], + "11411413051626428349": ["convolution_gpu_yxfb_yxio_b16",2], + "8058419689646625853": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "7590390572139249734": ["convolution_gpu_yxfb_yxio_b16",2], + "12641170321047008726": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "7372956570616880244": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "7969441643457570812": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "11110173861174257158": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "720558977788683564": ["convolution_gpu_yxfb_yxio_b16",2], + "10544411879329675593": ["convolution_gpu_bfyx_os_iyx_osv16",387], + "3438296636411972401": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7814543122045448412": ["convolution_gpu_bfyx_gemm_like",2], + "17538518333907257868": ["convolution_gpu_bfyx_gemm_like",2], + "2283020548041814543": ["convolution_gpu_yxfb_yxio_b16",2], + "7974670633697926450": ["convolution_gpu_bfyx_1x1",1], + "14651159827389223108": ["convolution_gpu_bfyx_gemm_like",2], + "17224655686568797096": ["convolution_gpu_yxfb_yxio_b16",1], + "1703738105910059846": ["convolution_gpu_yxfb_yxio_b16",2], + "14215394208930955062": ["convolution_gpu_yxfb_yxio_b16",0], + "15914342421266687768": ["convolution_gpu_bfyx_gemm_like",2], + "7678226048807568024": ["convolution_gpu_yxfb_yxio_b16",2], + "13025361884606488732": ["convolution_gpu_bfyx_gemm_like",2], + 
"1308980444055174254": ["convolution_gpu_bfyx_gemm_like",2], + "4727628999533330347": ["convolution_gpu_yxfb_yxio_b16",2], + "17037462814585846902": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "14149210193687890597": ["convolution_gpu_bfyx_os_iyx_osv16",889], + "13094402291968806996": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "14769111376729628572": ["convolution_gpu_yxfb_yxio_b16",2], + "501138469231848694": ["convolution_gpu_yxfb_yxio_b16",2], + "15645112311663561994": ["convolution_gpu_yxfb_yxio_b16",2], + "16511749893955141055": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "5041922366297242362": ["convolution_gpu_yxfb_yxio_b16",2], + "16271970578584267980": ["convolution_gpu_bfyx_os_iyx_osv16",195], + "8494385862885499798": ["convolution_gpu_yxfb_yxio_b16",1], + "7400937639903461446": ["convolution_gpu_yxfb_yxio_b16",2], + "18384215264061386089": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "2369451367723962073": ["convolution_gpu_bfyx_1x1",2], + "15269988216002549857": ["convolution_gpu_yxfb_yxio_b16",2], + "8846314870152404018": ["convolution_gpu_bfyx_gemm_like",2], + "18373951194274306895": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16683169947375504066": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "3101885395179993708": ["convolution_gpu_yxfb_yxio_b16",2], + "14315760630997175346": ["convolution_gpu_yxfb_yxio_b16",2], + "12331134162344797761": ["convolution_gpu_yxfb_yxio_b16",2], + "6254161707168091438": ["convolution_gpu_bfyx_gemm_like",2], + "16958329690837977102": ["convolution_gpu_bfyx_gemm_like",2], + "7349880498513046830": ["convolution_gpu_bfyx_1x1",2], + "1867337342417952506": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4228437925117070319": ["convolution_gpu_bfyx_1x1",2], + "897253033961107413": ["convolution_gpu_yxfb_yxio_b16",2], + "15206249797344242666": ["convolution_gpu_yxfb_yxio_b16",2], + "16210934187492210542": ["convolution_gpu_yxfb_yxio_b16",2], + "16567486018945740036": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "18436249934780056991": ["convolution_gpu_bfyx_os_iyx_osv16",296], + "4104679489383377966": ["convolution_gpu_yxfb_yxio_b16",2], + "3713558537660711857": ["convolution_gpu_yxfb_yxio_b16",2], + "10961696014697611547": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "13104509059416300615": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "2685061316482503878": ["convolution_gpu_bfyx_gemm_like",2], + "12487879163561616870": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6340128090694375876": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "10996596479775375564": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "13972357557211413688": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "15050884844653850678": ["convolution_gpu_yxfb_yxio_b16",2], + "12681408370704556588": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14082448162400225052": ["convolution_gpu_bfyx_1x1",1], + "13636859714649629789": ["convolution_gpu_yxfb_yxio_b16",1], + "18071280811713424504": ["convolution_gpu_yxfb_yxio_b16",2], + "3212789693085089063": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "698274493570551388": ["convolution_gpu_yxfb_yxio_b16",2], + "10034575179959785704": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "8913823292181409151": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "5349415632630235233": ["convolution_gpu_bfyx_1x1",2], + "4290840152278060614": ["convolution_gpu_bfyx_gemm_like",2], + "12319073009094248232": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "17616719165728687438": ["convolution_gpu_yxfb_yxio_b16",2], + "15308196586729169691": 
["convolution_gpu_yxfb_yxio_b16",2], + "2114599010013594942": ["convolution_gpu_bfyx_gemm_like",2], + "10782169939706303899": ["convolution_gpu_yxfb_yxio_b16",2], + "17806747473167329833": ["convolution_gpu_yxfb_yxio_b16",2], + "6438721407426283362": ["convolution_gpu_yxfb_yxio_b16",1], + "2878824076934639346": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "9714508918051740792": ["convolution_gpu_bfyx_direct_10_12_16",1], + "740260423018155343": ["convolution_gpu_bfyx_os_iyx_osv16",1025], + "3662747857062156477": ["convolution_gpu_bfyx_gemm_like",2], + "13637537549252005181": ["convolution_gpu_yxfb_yxio_b16",2], + "5385316497510064491": ["fully_connected_gpu_fb_oi_ref",1], + "12896226291465522304": ["convolution_gpu_yxfb_yxio_b16",2], + "5393510569127725391": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "9906138392975645747": ["convolution_gpu_yxfb_yxio_b16",2], + "11730276873446857018": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12794369485239257709": ["convolution_gpu_bfyx_gemm_like",2], + "18154019240019929225": ["convolution_gpu_bfyx_gemm_like",1], + "11175955260573469979": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "4156384238797998294": ["convolution_gpu_bfyx_os_iyx_osv16",275], + "15640466585550013905": ["convolution_gpu_bfyx_gemm_like",2], + "3873183249402084406": ["convolution_gpu_bfyx_gemm_like",1], + "14193777296032212476": ["convolution_gpu_yxfb_yxio_b16",2], + "6776601719651959634": ["convolution_gpu_yxfb_yxio_b16",2], + "5159738930501638535": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2863465257341735941": ["convolution_gpu_bfyx_1x1",1], + "11634932044447867039": ["convolution_gpu_bfyx_gemm_like",2], + "8096131027165540886": ["convolution_gpu_bfyx_gemm_like",2], + "15192230303376521834": ["convolution_gpu_bfyx_os_iyx_osv16",863], + "900243696733233996": ["convolution_gpu_yxfb_yxio_b16",2], + "7927587739463421727": ["convolution_gpu_yxfb_yxio_b16",2], + "7307271009495440764": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "15065925414996398951": ["convolution_gpu_bfyx_1x1",2], + "10645625090439446714": ["convolution_gpu_bfyx_gemm_like",2], + "4252157815622916471": ["convolution_gpu_bfyx_1x1",2], + "3135889221160961020": ["convolution_gpu_yxfb_yxio_b16",2], + "5886032409392368342": ["convolution_gpu_yxfb_yxio_b16",2], + "2740885908397449753": ["convolution_gpu_yxfb_yxio_b16",2], + "15201438563802430490": ["fully_connected_gpu_fb_oi_ref",1], + "15609860394182767048": ["convolution_gpu_yxfb_yxio_b16",2], + "11311839946200066200": ["convolution_gpu_yxfb_yxio_b16",2], + "16910952799476896905": ["convolution_gpu_bfyx_gemm_like",2], + "1597770067928214597": ["convolution_gpu_bfyx_1x1",1], + "1802510952374368682": ["convolution_gpu_yxfb_yxio_b16",2], + "10893432143734884603": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "4465781406991476376": ["convolution_gpu_bfyx_os_iyx_osv16",287], + "4792351255949877935": ["convolution_gpu_bfyx_gemm_like",2], + "1662588605309237309": ["convolution_gpu_yxfb_yxio_b16",2], + "12076058470574246054": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "14020956765444878761": ["convolution_gpu_bfyx_gemm_like",2], + "17585206779958265260": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "9482749589540764069": ["convolution_gpu_yxfb_yxio_b16",2], + "12053562297742437099": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6883767567034259453": ["convolution_gpu_yxfb_yxio_b16",2], + "17725637691681205907": ["convolution_gpu_bfyx_gemm_like",2], + "14446688005815492020": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14026537760442360645": 
["convolution_gpu_bfyx_os_iyx_osv16",1119], + "12564687330941036772": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "5581428998642936688": ["convolution_gpu_bfyx_1x1",2], + "12055647521556218046": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "4752129805031267391": ["convolution_gpu_yxfb_yxio_b16",2], + "14689423748560749566": ["fully_connected_gpu_fb_oi_ref",1], + "15216108478837665623": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "6323083153920795679": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "4615708568396290002": ["convolution_gpu_bfyx_1x1",2], + "3935750066315595083": ["convolution_gpu_yxfb_yxio_b16",1], + "18215430801133520364": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13503555814874045782": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "5600807544955072308": ["convolution_gpu_bfyx_gemm_like",2], + "8652128863605749877": ["convolution_gpu_yxfb_yxio_b16",2], + "4521622755195947253": ["convolution_gpu_yxfb_yxio_b16",2], + "181006047500375768": ["convolution_gpu_bfyx_os_iyx_osv16",281], + "11732321796147239597": ["convolution_gpu_yxfb_yxio_b16",2], + "9285202897230250613": ["convolution_gpu_yxfb_yxio_b16",2], + "10070051133200561606": ["convolution_gpu_yxfb_yxio_b16",1], + "13520876347177213888": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "5671289201458690944": ["convolution_gpu_bfyx_os_iyx_osv16",665], + "486816652607164926": ["convolution_gpu_yxfb_yxio_b16",2], + "16033512206711124104": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "3112648799276134590": ["convolution_gpu_yxfb_yxio_b16",2], + "17477062954520561609": ["convolution_gpu_bfyx_gemm_like",2], + "208915399644127739": ["convolution_gpu_bfyx_gemm_like",2], + "5596408142536691534": ["convolution_gpu_yxfb_yxio_b16",2], + "11744368351982723504": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "5632958791318880428": ["convolution_gpu_yxfb_yxio_b16",2], + "6159729136505378486": ["convolution_gpu_yxfb_yxio_b16",2], + "11158789938857558596": ["convolution_gpu_bfyx_1x1",2], + "9263784636194609884": ["convolution_gpu_yxfb_yxio_b16",2], + "11942424927004660476": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "5558136691773431495": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "6423354409210936959": ["convolution_gpu_yxfb_yxio_b16",1], + "11020315012951440351": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "6525496212688896740": ["convolution_gpu_yxfb_yxio_b16",2], + "2891736961665476908": ["convolution_gpu_bfyx_os_iyx_osv16",805], + "12229574562535756991": ["convolution_gpu_bfyx_gemm_like",2], + "10488269059469838160": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "9890700023578477203": ["convolution_gpu_bfyx_gemm_like",2], + "3541538046227217664": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15217573782563469232": ["convolution_gpu_yxfb_yxio_b16",2], + "11587239927319376658": ["convolution_gpu_bfyx_gemm_like",2], + "528295119724008711": ["convolution_gpu_bfyx_os_iyx_osv16",52], + "15985980444340490463": ["convolution_gpu_yxfb_yxio_b16",2], + "1044889231088602677": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "14264584839702225855": ["convolution_gpu_yxfb_yxio_b16",2], + "1186545671730357033": ["convolution_gpu_bfyx_os_iyx_osv16",1024], + "12393385058735194260": ["convolution_gpu_bfyx_gemm_like",2], + "6469003096932778978": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "11787674847611032323": ["convolution_gpu_yxfb_yxio_b16",2], + "15646081020506130125": ["convolution_gpu_yxfb_yxio_b16",2], + "8463615810239412362": ["convolution_gpu_bfyx_1x1",2], + "9735280865199145311": ["convolution_gpu_yxfb_yxio_b16",2], + 
"3265415000818832667": ["convolution_gpu_bfyx_gemm_like",2], + "5374664689223295796": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "7859659993155959174": ["convolution_gpu_yxfb_yxio_b16",2], + "4436244774193918646": ["fully_connected_gpu_fb_oi_ref",1], + "2204178900998688268": ["convolution_gpu_bfyx_gemm_like",2], + "10718639465064821919": ["convolution_gpu_yxfb_yxio_b16",2], + "7602222004475424358": ["convolution_gpu_bfyx_gemm_like",1], + "1161304401293419103": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12686015414958770329": ["convolution_gpu_bfyx_gemm_like",2], + "17051718450741106678": ["convolution_gpu_yxfb_yxio_b16",2], + "7624259732952222597": ["convolution_gpu_bfyx_gemm_like",2], + "2162882863309264684": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "14349625788399542568": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17945600479510493949": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3383222668132648804": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7210854698870587826": ["convolution_gpu_yxfb_yxio_b16",2], + "1298596164164324360": ["convolution_gpu_yxfb_yxio_b16",2], + "7162155897369277782": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5317076157086789437": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "9900658671239107502": ["convolution_gpu_bfyx_1x1",2], + "12757611260347801001": ["convolution_gpu_bfyx_os_iyx_osv16",1071], + "11799179287124317845": ["convolution_gpu_bfyx_gemm_like",1], + "14339479547451422762": ["convolution_gpu_yxfb_yxio_b16",2], + "13576010631084066792": ["convolution_gpu_yxfb_yxio_b16",1], + "1299545313185409227": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "13970935346154374605": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "4353583636655606632": ["convolution_gpu_yxfb_yxio_b16",1], + "16020916772006653269": ["convolution_gpu_bfyx_1x1",1], + "5596441339918073261": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "733956743303342862": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6880746917399866285": ["convolution_gpu_bfyx_gemm_like",2], + "6992073477131490452": ["convolution_gpu_bfyx_gemm_like",2], + "15865753975271064117": ["convolution_gpu_yxfb_yxio_b16",2], + "15275978123703636572": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "4313392430539923574": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "16748662918272106932": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1841155673858789206": ["fully_connected_gpu_fb_oi_ref",2], + "3870539490799697188": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "8174040194088942964": ["convolution_gpu_bfyx_os_iyx_osv16",945], + "4839205075057964902": ["convolution_gpu_yxfb_yxio_b16",2], + "738850098651678143": ["convolution_gpu_bfyx_os_iyx_osv16",542], + "11657946392097042544": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "952318454591754214": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "14501815053459103515": ["convolution_gpu_yxfb_yxio_b16",2], + "3684792790546138809": ["convolution_gpu_yxfb_yxio_b16",2], + "12032580551021546487": ["convolution_gpu_yxfb_yxio_b16",2], + "6709083009339039603": ["convolution_gpu_yxfb_yxio_b16",2], + "16828388628569377322": ["convolution_gpu_yxfb_yxio_b16",2], + "10816637153861630723": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "11637325834858582585": ["convolution_gpu_bfyx_gemm_like",2], + "9795194069954915563": ["convolution_gpu_bfyx_gemm_like",2], + "1507504848332592003": ["convolution_gpu_yxfb_yxio_b16",1], + "16426655160932259558": ["convolution_gpu_yxfb_yxio_b16",2], + "4644580321919256401": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + 
"8527069404111265568": ["convolution_gpu_bfyx_os_iyx_osv16",434], + "7280502812960451465": ["convolution_gpu_yxfb_yxio_b16",2], + "16532743776403877084": ["convolution_gpu_yxfb_yxio_b16",1], + "5657471280535146301": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2085467192625870436": ["convolution_gpu_bfyx_gemm_like",2], + "7168028033666253263": ["convolution_gpu_bfyx_gemm_like",2], + "12303905514885913537": ["convolution_gpu_yxfb_yxio_b16",1], + "15612797125081819500": ["convolution_gpu_yxfb_yxio_b16",2], + "9452094307760005150": ["convolution_gpu_bfyx_gemm_like",2], + "13862199647000195451": ["convolution_gpu_yxfb_yxio_b16",2], + "706049518431331645": ["convolution_gpu_bfyx_gemm_like",2], + "8372855367097191197": ["convolution_gpu_yxfb_yxio_b16",2], + "14544219140091420262": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17228615388053183744": ["convolution_gpu_yxfb_yxio_b16",2], + "16606674008248299103": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "8726274320876550785": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12892265081710606252": ["convolution_gpu_yxfb_yxio_b16",1], + "7826406759309418010": ["convolution_gpu_yxfb_yxio_b16",2], + "8078028207842958010": ["convolution_gpu_yxfb_yxio_b16",2], + "16723478941106779069": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "6631103268546309714": ["convolution_gpu_yxfb_yxio_b16",2], + "231083216612056805": ["convolution_gpu_yxfb_yxio_b16",2], + "9951951467222189282": ["convolution_gpu_yxfb_yxio_b16",2], + "12755692101476964677": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "2295659951331099829": ["convolution_gpu_yxfb_yxio_b16",2], + "17019474731460049248": ["convolution_gpu_yxfb_yxio_b16",2], + "16579057939215877904": ["convolution_gpu_bfyx_gemm_like",2], + "17408275657360833363": ["convolution_gpu_bfyx_1x1",2], + "11279789373735965856": ["convolution_gpu_yxfb_yxio_b16",2], + "16765994345605657100": ["convolution_gpu_bfyx_1x1",1], + "10308431308942416781": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "3499106702307464480": ["convolution_gpu_bfyx_gemm_like",2], + "8541982562061181756": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13553263424160050064": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "6808980404170272597": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "13234872695521811652": ["convolution_gpu_yxfb_yxio_b16",1], + "921209976738626097": ["convolution_gpu_yxfb_yxio_b16",2], + "17321934232458063571": ["convolution_gpu_yxfb_yxio_b16",2], + "2042946928570163140": ["convolution_gpu_yxfb_yxio_b16",2], + "17542035367134614728": ["convolution_gpu_yxfb_yxio_b16",1], + "5649082203775427830": ["convolution_gpu_bfyx_gemm_like",2], + "13634686998599681086": ["convolution_gpu_yxfb_yxio_b16",2], + "13223232888554043645": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3531786338249174486": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "6156831095718536092": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "2242602888499888844": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "14363025045807200040": ["convolution_gpu_bfyx_os_iyx_osv16",541], + "16567638487719493784": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "1902656726461670148": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9291397338108903174": ["convolution_gpu_yxfb_yxio_b16",2], + "10635659193402005820": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "17264010982688979937": ["convolution_gpu_bfyx_1x1",2], + "16924006268301179157": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "4819131094439732065": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "7748233564411787605": 
["convolution_gpu_bfyx_gemm_like",2], + "4435224497850514394": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "9601412379897937608": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "11972290239275366299": ["convolution_gpu_yxfb_yxio_b16",2], + "2553539191926275121": ["convolution_gpu_yxfb_yxio_b16",2], + "8866716292621164810": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12932635875905153141": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "2133236128630074068": ["convolution_gpu_yxfb_yxio_b16",2], + "3571330754519284334": ["convolution_gpu_yxfb_yxio_b16",2], + "7693459946348737411": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "3574585436812909168": ["convolution_gpu_yxfb_yxio_b16",1], + "4561778392194061215": ["convolution_gpu_yxfb_yxio_b16",1], + "3701838669605585798": ["convolution_gpu_yxfb_yxio_b16",2], + "14466032674083938714": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6192955702438301372": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "15783429395177379897": ["convolution_gpu_yxfb_yxio_b16",2], + "12160764253455777655": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "6685985905221810743": ["convolution_gpu_yxfb_yxio_b16",1], + "2102169562353089558": ["convolution_gpu_yxfb_yxio_b16",2], + "8220168481755031959": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "13810735868750326592": ["convolution_gpu_bfyx_os_iyx_osv16",806], + "4883588237027084166": ["convolution_gpu_yxfb_yxio_b16",2], + "8219179055259247644": ["convolution_gpu_yxfb_yxio_b16",2], + "15548854462657362014": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "17769159396346490074": ["convolution_gpu_yxfb_yxio_b16",1], + "7263796835299019284": ["convolution_gpu_bfyx_gemm_like",2], + "3477539135137665170": ["convolution_gpu_bfyx_gemm_like",2], + "5303970743736042689": ["convolution_gpu_bfyx_gemm_like",2], + "10049294964307823692": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14159596290442764023": ["convolution_gpu_bfyx_gemm_like",1], + "5429130923188159806": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "5682190700442712936": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "13092232276822302626": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "18357544235608006954": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "11315238071192463859": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "14116800584981026541": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "12900949103593247293": ["convolution_gpu_bfyx_direct_10_12_16",0], + "13760645810144930270": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "15597317305719116351": ["convolution_gpu_yxfb_yxio_b16",2], + "7082007579524697455": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1591199515536783245": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "9778670810863940690": ["convolution_gpu_yxfb_yxio_b16",2], + "16105073808368936420": ["convolution_gpu_bfyx_gemm_like",2], + "13183380647506951324": ["convolution_gpu_bfyx_gemm_like",2], + "13754540732991287617": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "13101474064130881526": ["convolution_gpu_yxfb_yxio_b16",2], + "13680926356824317761": ["convolution_gpu_bfyx_os_iyx_osv16",54], + "18029395208219861440": ["convolution_gpu_yxfb_yxio_b16",2], + "13483175684542464385": ["convolution_gpu_bfyx_os_iyx_osv16",148], + "3441335188113424896": ["convolution_gpu_bfyx_gemm_like",2], + "11267495078361954131": ["convolution_gpu_yxfb_yxio_b16",2], + "1520529227443340435": ["convolution_gpu_bfyx_gemm_like",2], + "8300290944865904942": ["convolution_gpu_yxfb_yxio_b16",1], + "12561177248542630652": ["convolution_gpu_yxfb_yxio_b16",2], + "11300415556407923335": 
["convolution_gpu_yxfb_yxio_b16",2], + "3503893875515897267": ["convolution_gpu_bfyx_gemm_like",2], + "7241156141838776126": ["convolution_gpu_bfyx_gemm_like",1], + "15138641310139776109": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "15483343060578660278": ["convolution_gpu_yxfb_yxio_b16",2], + "5211831143687501130": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "6051877311645456194": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "5115007207028125638": ["convolution_gpu_bfyx_gemm_like",2], + "13139625572508441980": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "17965267346493659374": ["convolution_gpu_yxfb_yxio_b16",2], + "9999553425206328238": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "530973311459168543": ["convolution_gpu_yxfb_yxio_b16",2], + "15988378956341507229": ["convolution_gpu_yxfb_yxio_b16",2], + "6648876837655776653": ["convolution_gpu_bfyx_1x1",2], + "9056038338958199256": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "10645057595080511813": ["convolution_gpu_yxfb_yxio_b16",2], + "3835387982926010630": ["convolution_gpu_yxfb_yxio_b16",2], + "4802014352392262053": ["convolution_gpu_yxfb_yxio_b16",2], + "10577357333308653027": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "7875724726741958520": ["fully_connected_gpu_fb_oi_ref",2], + "1375084615110147615": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1237262535285717993": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "14532519639619315651": ["convolution_gpu_bfyx_gemm_like",2], + "7027962921778599989": ["convolution_gpu_yxfb_yxio_b16",1], + "18235209540858013173": ["convolution_gpu_bfyx_1x1",2], + "6970636030494405299": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "12375919467924385618": ["convolution_gpu_bfyx_os_iyx_osv16",483], + "10531218595816974659": ["convolution_gpu_bfyx_gemm_like",2], + "15757308772667178999": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "1450861513159359637": ["convolution_gpu_yxfb_yxio_b16",2], + "5062815196458225737": ["convolution_gpu_bfyx_os_iyx_osv16",487], + "5464801565268066541": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7209217811135076623": ["convolution_gpu_bfyx_gemm_like",2], + "11666226259183201584": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "6142707387281700290": ["convolution_gpu_bfyx_gemm_like",2], + "17397600088595751782": ["convolution_gpu_yxfb_yxio_b16",2], + "5525691792821548743": ["convolution_gpu_yxfb_yxio_b16",2], + "11910900938442124765": ["convolution_gpu_bfyx_gemm_like",2], + "1626430741965136732": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17823133607491820214": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "13575423234109624706": ["fully_connected_gpu_yxfb_ref",2], + "480310470450900836": ["convolution_gpu_bfyx_gemm_like",2], + "4656068024153891922": ["convolution_gpu_yxfb_yxio_b16",1], + "14616413139039308367": ["fully_connected_gpu_fb_oi_ref",2], + "3571959174116404960": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "5898740235388207878": ["convolution_gpu_bfyx_1x1",2], + "8767817856303586064": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "14971506154649368216": ["convolution_gpu_yxfb_yxio_b16",1], + "12339692995143159283": ["convolution_gpu_bfyx_gemm_like",1], + "11666250400445971335": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "16425665058951535484": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "12522495848240087966": ["convolution_gpu_bfyx_gemm_like",1], + "3509487327001107638": ["convolution_gpu_bfyx_gemm_like",2], + "10055247339012492459": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "2356785927637873692": ["convolution_gpu_bfyx_gemm_like",2], + 
"959666756751640874": ["convolution_gpu_yxfb_yxio_b16",2], + "11002875874008272679": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "16522364268583242080": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "10512507780534402341": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "14898892437285105327": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "4056971751486746551": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "1582751548472076534": ["convolution_gpu_yxfb_yxio_b16",2], + "5032841266226405428": ["convolution_gpu_yxfb_yxio_b16",2], + "14553813154800569861": ["convolution_gpu_yxfb_yxio_b16",2], + "12825407709419526493": ["convolution_gpu_yxfb_yxio_b16",2], + "9525535670799618110": ["convolution_gpu_bfyx_gemm_like",2], + "14289082888174784976": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "7692849839965441330": ["convolution_gpu_bfyx_os_iyx_osv16",252], + "5541365322085427177": ["convolution_gpu_yxfb_yxio_b16",2], + "17546090415334871175": ["convolution_gpu_yxfb_yxio_b16",2], + "10617442099961865960": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8509024280905303927": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17649961873981897621": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5924271203978892761": ["convolution_gpu_yxfb_yxio_b16",2], + "10135458965276110244": ["convolution_gpu_bfyx_1x1",2], + "9440117898128288296": ["convolution_gpu_bfyx_gemm_like",2], + "4137755981477177003": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "9340159617983543624": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "10009559358571629502": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13571587312517912280": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "2705031521944165712": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "10128143628088846123": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "12210280332071091209": ["fully_connected_gpu_fb_oi_ref",1], + "2761862049452027986": ["convolution_gpu_yxfb_yxio_b16",2], + "3374410641320310726": ["convolution_gpu_bfyx_os_iyx_osv16",904], + "11626402549863483301": ["convolution_gpu_bfyx_os_iyx_osv16",648], + "9250030880535336888": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16266491618150971928": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "10323345824599612614": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "8108933468437926367": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "10117092543913369513": ["convolution_gpu_yxfb_yxio_b16",2], + "708747442142592697": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "6334639534663495263": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "16075006181495932250": ["convolution_gpu_bfyx_gemm_like",2], + "16996895381161031110": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "5308128387928804050": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "12868739680413736657": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "1230262279011217327": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10015368609444108372": ["convolution_gpu_yxfb_yxio_b16",2], + "17958575161092859465": ["convolution_gpu_yxfb_yxio_b16",1], + "12619739385084492771": ["convolution_gpu_yxfb_yxio_b16",2], + "7349168847581850619": ["convolution_gpu_yxfb_yxio_b16",2], + "14801210545983960599": ["convolution_gpu_yxfb_yxio_b16",2], + "488798544312719183": ["convolution_gpu_yxfb_yxio_b16",2], + "415826393421796195": ["convolution_gpu_yxfb_yxio_b16",2], + "403634422724914329": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "15117880293418979489": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "13754408679115174221": ["convolution_gpu_bfyx_gemm_like",2], + "7878605163588288309": 
["convolution_gpu_bfyx_os_iyx_osv16",456], + "2321148334382088982": ["convolution_gpu_bfyx_gemm_like",2], + "3563872903821081702": ["convolution_gpu_bfyx_direct_10_12_16",2], + "143667964449473415": ["convolution_gpu_yxfb_yxio_b16",0], + "7469107606686458209": ["convolution_gpu_yxfb_yxio_b16",2], + "1822096761703761792": ["convolution_gpu_bfyx_1x1",2], + "14943031375539993004": ["convolution_gpu_yxfb_yxio_b16",2], + "14307705501349750896": ["convolution_gpu_yxfb_yxio_b16",2], + "12107262410635772120": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8094920912208664820": ["convolution_gpu_yxfb_yxio_b16",2], + "13111122805945249561": ["convolution_gpu_yxfb_yxio_b16",2], + "2052010432187897741": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "9153779186876518773": ["convolution_gpu_bfyx_gemm_like",2], + "7314288062932060863": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "11822555173696078282": ["convolution_gpu_bfyx_gemm_like",0], + "11612209645710419427": ["convolution_gpu_yxfb_yxio_b16",2], + "9062781751511609244": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "12937333118472722002": ["convolution_gpu_bfyx_gemm_like",2], + "13282951481330978659": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "14671212883301405408": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5250257911846706612": ["convolution_gpu_yxfb_yxio_b16",2], + "3806131437010910920": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8881906040469243354": ["convolution_gpu_yxfb_yxio_b16",2], + "5941852872160795604": ["convolution_gpu_bfyx_gemm_like",2], + "13598984763955239116": ["convolution_gpu_bfyx_gemm_like",0], + "9120377367517042357": ["convolution_gpu_bfyx_1x1",2], + "10576856554114055028": ["convolution_gpu_bfyx_gemm_like",2], + "7105279481103494151": ["fully_connected_gpu_fb_oi_ref",1], + "15643135666029727865": ["convolution_gpu_bfyx_gemm_like",2], + "2803569867265035123": ["convolution_gpu_bfyx_os_iyx_osv16",1029], + "7720153213673170931": ["convolution_gpu_yxfb_yxio_b16",2], + "4010419602093863685": ["convolution_gpu_yxfb_yxio_b16",2], + "7274647463152753603": ["convolution_gpu_yxfb_yxio_b16",2], + "17030051116023319382": ["convolution_gpu_yxfb_yxio_b16",1], + "794499287296495726": ["convolution_gpu_bfyx_1x1",2], + "4802009650745059499": ["convolution_gpu_yxfb_yxio_b16",2], + "9999955037598579164": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "138379779469699309": ["convolution_gpu_bfyx_gemm_like",2], + "14968401410355925289": ["convolution_gpu_yxfb_yxio_b16",2], + "11132679855317294753": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4622514167765722873": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "15600841108426475615": ["convolution_gpu_yxfb_yxio_b16",2], + "6104380778870471127": ["convolution_gpu_bfyx_1x1",2], + "3114210363452108737": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "1895560603400089814": ["convolution_gpu_yxfb_yxio_b16",1], + "11516184047320372729": ["convolution_gpu_yxfb_yxio_b16",2], + "16044646335477470657": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15848096609835347542": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "8898095926967052382": ["convolution_gpu_yxfb_yxio_b16",2], + "5758133252959371492": ["convolution_gpu_bfyx_gemm_like",1], + "17822988909419777692": ["convolution_gpu_yxfb_yxio_b16",1], + "15006321421735686121": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "9434143681116089888": ["convolution_gpu_bfyx_gemm_like",2], + "17712558058168648648": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "994842991399671507": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18393312550272875456": 
["convolution_gpu_bfyx_1x1",2], + "8163000689380461611": ["convolution_gpu_yxfb_yxio_b16",1], + "14612206111651511130": ["convolution_gpu_yxfb_yxio_b16",2], + "10065714384927707796": ["convolution_gpu_yxfb_yxio_b16",2], + "9319254979377483709": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2915777749501772828": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9588943054777767098": ["convolution_gpu_yxfb_yxio_b16",2], + "2567046336192437734": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "13851025202247070979": ["convolution_gpu_yxfb_yxio_b16",1], + "13380637319403400851": ["convolution_gpu_yxfb_yxio_b16",2], + "3321251856445833973": ["convolution_gpu_yxfb_yxio_b16",1], + "2668729552208169959": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13325287783358291692": ["convolution_gpu_yxfb_yxio_b16",2], + "7863319552895863063": ["convolution_gpu_yxfb_yxio_b16",2], + "1771347579022727189": ["convolution_gpu_yxfb_yxio_b16",2], + "16781187505186394353": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "383721620126444793": ["convolution_gpu_bfyx_gemm_like",2], + "981803877097233095": ["convolution_gpu_yxfb_yxio_b16",2], + "7603319690872333930": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "3784684114139223050": ["convolution_gpu_yxfb_yxio_b16",2], + "13731797251725972855": ["convolution_gpu_yxfb_yxio_b16",2], + "17228810554159747400": ["convolution_gpu_bfyx_gemm_like",2], + "7940369586324090841": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "518733575377143679": ["convolution_gpu_yxfb_yxio_b16",2], + "10324485383646920518": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "14066675688397331406": ["convolution_gpu_yxfb_yxio_b16",2], + "6730447536124542965": ["convolution_gpu_yxfb_yxio_b16",1], + "8537824547722216155": ["convolution_gpu_yxfb_yxio_b16",1], + "6344600111737335616": ["convolution_gpu_yxfb_yxio_b16",2], + "15493488989417521388": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "2603233376890892194": ["convolution_gpu_yxfb_yxio_b16",2], + "12600479027568241746": ["convolution_gpu_yxfb_yxio_b16",2], + "1379758215293949563": ["convolution_gpu_yxfb_yxio_b16",2], + "17893696934478535385": ["convolution_gpu_yxfb_yxio_b16",2], + "11498084465186986412": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "13565691057064774487": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "2373860353284525265": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "11411580529501121244": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "9569522500959727054": ["convolution_gpu_yxfb_yxio_b16",2], + "6371463287631658789": ["convolution_gpu_bfyx_gemm_like",2], + "10330180429524641331": ["convolution_gpu_bfyx_gemm_like",2], + "2984726467649419856": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "6450532136308941035": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "4740864135937875560": ["convolution_gpu_yxfb_yxio_b16",1], + "13330734840729670622": ["convolution_gpu_bfyx_gemm_like",2], + "10106454449619141260": ["convolution_gpu_bfyx_1x1",2], + "1594829714229111215": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "727216855315869048": ["convolution_gpu_yxfb_yxio_b16",2], + "5044721291675005144": ["convolution_gpu_bfyx_1x1",2], + "8712136292276123857": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "9909564412554801760": ["convolution_gpu_yxfb_yxio_b16",2], + "9250410390663336388": ["convolution_gpu_bfyx_gemm_like",1], + "8609939102588915855": ["convolution_gpu_bfyx_gemm_like",2], + "17310409067211414565": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "3219408878901707426": ["convolution_gpu_bfyx_direct_10_12_16",2], + 
"10022487076451608714": ["convolution_gpu_bfyx_gemm_like",2], + "1338705434700924127": ["convolution_gpu_bfyx_1x1",1], + "2737352811173555281": ["convolution_gpu_yxfb_yxio_b16",2], + "13471752029049484143": ["convolution_gpu_bfyx_gemm_like",2], + "14311888412221174224": ["convolution_gpu_yxfb_yxio_b16",2], + "16015963261509760799": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "11376953876369788199": ["convolution_gpu_yxfb_yxio_b16",1], + "3463206409786541741": ["convolution_gpu_yxfb_yxio_b16",2], + "15217077412685024074": ["convolution_gpu_yxfb_yxio_b16",2], + "6792281830591233968": ["convolution_gpu_yxfb_yxio_b16",2], + "9504349455215835807": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "10546430708947911124": ["convolution_gpu_bfyx_gemm_like",0], + "13602140021189675477": ["convolution_gpu_bfyx_gemm_like",2], + "13369751385866224286": ["convolution_gpu_yxfb_yxio_b16",2], + "6367371992814643260": ["convolution_gpu_yxfb_yxio_b16",2], + "10894058425957901202": ["convolution_gpu_bfyx_1x1",2], + "16158139166784964096": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "11356842300444410831": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "6260684231055362504": ["convolution_gpu_yxfb_yxio_b16",2], + "1088710562928089772": ["convolution_gpu_yxfb_yxio_b16",2], + "14799579913711096584": ["convolution_gpu_bfyx_gemm_like",1], + "14383657211047876136": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "3163833930628348446": ["convolution_gpu_yxfb_yxio_b16",2], + "4282668574670785584": ["convolution_gpu_bfyx_gemm_like",2], + "8640150341228170279": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "9767294641786972359": ["convolution_gpu_bfyx_gemm_like",2], + "14896875712028630045": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "9999543693712389402": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4099828484175044842": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "2172999245833525797": ["convolution_gpu_yxfb_yxio_b16",2], + "13558687084677943158": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "16362857896338778056": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "2161052921317193579": ["convolution_gpu_bfyx_gemm_like",2], + "12259611546528256409": ["convolution_gpu_yxfb_yxio_b16",1], + "9439431829175743345": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11857037689248685487": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2129726780118554358": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "10722677916294015259": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "4447065688824381344": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "13456967132681889167": ["convolution_gpu_yxfb_yxio_b16",2], + "104765009188090817": ["convolution_gpu_yxfb_yxio_b16",2], + "1643122514049603104": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "2781309272856442321": ["convolution_gpu_bfyx_1x1",1], + "12978370505631031751": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "15529757761327002288": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "7870154008378361670": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8409488188696700816": ["convolution_gpu_bfyx_gemm_like",1], + "3499645386058307669": ["convolution_gpu_bfyx_gemm_like",1], + "16729849855476690294": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "10783046011829953095": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2609346307827449622": ["convolution_gpu_yxfb_yxio_b16",2], + "314054598858070952": ["convolution_gpu_bfyx_gemm_like",2], + "12776081190690731910": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "6711878663358611849": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "9314293064351558241": 
["convolution_gpu_bfyx_gemm_like",2], + "10090923790949378407": ["convolution_gpu_bfyx_gemm_like",2], + "11155444222714959508": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "5008541841892687897": ["convolution_gpu_yxfb_yxio_b16",2], + "1270307036687208396": ["convolution_gpu_bfyx_gemm_like",1], + "5534071639452404412": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4278280309700908015": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "11872943152839631823": ["convolution_gpu_bfyx_gemm_like",2], + "8717456809499914445": ["convolution_gpu_yxfb_yxio_b16",2], + "10254566865260697753": ["convolution_gpu_yxfb_yxio_b16",2], + "13809046727894108358": ["convolution_gpu_yxfb_yxio_b16",2], + "3534874664568214253": ["convolution_gpu_bfyx_1x1",2], + "13717351126657739994": ["convolution_gpu_yxfb_yxio_b16",1], + "6911215749850066204": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "10432365444137108781": ["convolution_gpu_bfyx_gemm_like",2], + "10009796094612770326": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "11971736882960844905": ["convolution_gpu_yxfb_yxio_b16",2], + "3567607339495161307": ["convolution_gpu_yxfb_yxio_b16",2], + "14916236722843741326": ["convolution_gpu_yxfb_yxio_b16",2], + "16955829428734830876": ["convolution_gpu_yxfb_yxio_b16",1], + "9696168324381001582": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8859895010324601937": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "11537166370263116277": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "9589942627115344216": ["convolution_gpu_bfyx_os_iyx_osv16",102], + "4261192887643002603": ["convolution_gpu_bfyx_gemm_like",2], + "14041970415787494000": ["convolution_gpu_yxfb_yxio_b16",2], + "12643643553436503069": ["convolution_gpu_yxfb_yxio_b16",2], + "7440953406601377619": ["convolution_gpu_yxfb_yxio_b16",2], + "7134419022268272901": ["convolution_gpu_yxfb_yxio_b16",2], + "4967444801764057340": ["convolution_gpu_bfyx_os_iyx_osv16",361], + "14362876471450307424": ["convolution_gpu_bfyx_1x1",2], + "16292848987976256449": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "11280672272221124024": ["convolution_gpu_yxfb_yxio_b16",2], + "3442845193734599342": ["convolution_gpu_yxfb_yxio_b16",2], + "7958443549125799229": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "9105127035114339269": ["convolution_gpu_yxfb_yxio_b16",1], + "15800447082078291243": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "5149303626508247520": ["convolution_gpu_yxfb_yxio_b16",2], + "18302892230881285207": ["convolution_gpu_bfyx_gemm_like",1], + "13077012961563218195": ["convolution_gpu_yxfb_yxio_b16",2], + "5642822685234782052": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "3748621266324665764": ["convolution_gpu_yxfb_yxio_b16",2], + "15915715422308762909": ["convolution_gpu_bfyx_os_iyx_osv16",274], + "17908444616754154471": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2909347733581487795": ["convolution_gpu_yxfb_yxio_b16",1], + "13075579052866074866": ["convolution_gpu_bfyx_gemm_like",2], + "5209144536543011657": ["convolution_gpu_yxfb_yxio_b16",1], + "8740268039366363321": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13938466156916423478": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "17430994325635361377": ["convolution_gpu_yxfb_yxio_b16",2], + "9293682866734263821": ["convolution_gpu_yxfb_yxio_b16",2], + "5459463503840817402": ["convolution_gpu_bfyx_1x1",2], + "15675903059949404837": ["convolution_gpu_bfyx_1x1",2], + "3805667660217578518": ["convolution_gpu_yxfb_yxio_b16",2], + "10171373375072694210": ["convolution_gpu_bfyx_1x1",2], + "8809017515482311843": 
["convolution_gpu_bfyx_os_iyx_osv16",86], + "14517191894006411358": ["convolution_gpu_yxfb_yxio_b16",2], + "1241355545294259810": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "378292944207609677": ["convolution_gpu_yxfb_yxio_b16",2], + "248133885018839814": ["convolution_gpu_yxfb_yxio_b16",2], + "14697908554930995949": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "2362092095402043749": ["convolution_gpu_bfyx_gemm_like",2], + "5355283113999405036": ["convolution_gpu_yxfb_yxio_b16",1], + "8553491894663686698": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "8113660920207936963": ["convolution_gpu_yxfb_yxio_b16",2], + "3034482898462686729": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "14668725050395069435": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "6214677989814002369": ["convolution_gpu_yxfb_yxio_b16",2], + "16626502801066228405": ["convolution_gpu_yxfb_yxio_b16",1], + "15838113905712517735": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "3750338655074082587": ["fully_connected_gpu_yxfb_ref",0], + "12867590715338247144": ["convolution_gpu_yxfb_yxio_b16",1], + "302694026179841870": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "11955992313739654625": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "10864011008000364415": ["convolution_gpu_bfyx_1x1",2], + "16527840366172690992": ["convolution_gpu_yxfb_yxio_b16",2], + "18101509783610609787": ["convolution_gpu_yxfb_yxio_b16",2], + "4079026972040047969": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "4754967381316623440": ["convolution_gpu_bfyx_gemm_like",2], + "16966477504105790279": ["convolution_gpu_yxfb_yxio_b16",2], + "7183578232279711009": ["convolution_gpu_bfyx_gemm_like",2], + "4708035980731751007": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14115742296883450319": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16135569134646688251": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "12028665820838352309": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2343310394723780653": ["convolution_gpu_yxfb_yxio_b16",2], + "7451154080124553318": ["convolution_gpu_yxfb_yxio_b16",2], + "1900375942069325499": ["convolution_gpu_bfyx_1x1",2], + "5788323787676797805": ["convolution_gpu_bfyx_os_iyx_osv16",430], + "8002233052700666718": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "9726913113016874092": ["convolution_gpu_bfyx_gemm_like",2], + "230697511447695268": ["convolution_gpu_yxfb_yxio_b16",2], + "9775648000771985077": ["convolution_gpu_yxfb_yxio_b16",2], + "10278515360013727367": ["convolution_gpu_yxfb_yxio_b16",2], + "10524079700393212963": ["convolution_gpu_yxfb_yxio_b16",2], + "12228610148087508521": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "14918482938530107806": ["convolution_gpu_bfyx_gemm_like",2], + "10262850086265676378": ["convolution_gpu_yxfb_yxio_b16",2], + "7289535479247584635": ["convolution_gpu_bfyx_1x1",2], + "17377204616846724192": ["convolution_gpu_bfyx_gemm_like",2], + "402932154499003993": ["convolution_gpu_yxfb_yxio_b16",2], + "5179013491581036103": ["convolution_gpu_yxfb_yxio_b16",2], + "16293101831324587788": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "13663893159182636270": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "12362290144183018227": ["convolution_gpu_yxfb_yxio_b16",1], + "15155676074658242659": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "6114241186364821679": ["convolution_gpu_bfyx_os_iyx_osv16",856], + "13954821927253849036": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "1287490919205560806": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "17567012866823126402": 
["convolution_gpu_yxfb_yxio_b16",2], + "11724225282274130518": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "3889456478817717702": ["convolution_gpu_yxfb_yxio_b16",2], + "11087413527078604815": ["convolution_gpu_bfyx_gemm_like",2], + "3398322619007806698": ["convolution_gpu_bfyx_direct_10_12_16",1], + "637115537820955017": ["convolution_gpu_yxfb_yxio_b16",2], + "9763754389347695094": ["convolution_gpu_yxfb_yxio_b16",2], + "9383182168277796969": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "11782188262748842182": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "18432787283148809023": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "17951403431757222177": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "16587061389996963349": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "2273992727647793692": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4387041763614917736": ["convolution_gpu_bfyx_gemm_like",1], + "12455871938978342189": ["convolution_gpu_yxfb_yxio_b16",2], + "15059549186302099880": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "7624476043779763605": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "18218755616248669884": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "3957253946857103590": ["convolution_gpu_yxfb_yxio_b16",2], + "13369603621524676979": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "17817043205731836063": ["convolution_gpu_yxfb_yxio_b16",2], + "512446355173752600": ["convolution_gpu_yxfb_yxio_b16",2], + "3735753364888836383": ["convolution_gpu_yxfb_yxio_b16",1], + "6650607472019166205": ["convolution_gpu_bfyx_1x1",2], + "10995886682834858002": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "4030004320208162301": ["convolution_gpu_yxfb_yxio_b16",2], + "11262989876326061679": ["convolution_gpu_yxfb_yxio_b16",0], + "748236447365453504": ["convolution_gpu_yxfb_yxio_b16",2], + "7861234698413147249": ["convolution_gpu_yxfb_yxio_b16",2], + "970768445746568749": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "5269172622193124300": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "4562591438007476419": ["convolution_gpu_bfyx_gemm_like",2], + "15293835051273372438": ["convolution_gpu_yxfb_yxio_b16",2], + "8015885733173521367": ["convolution_gpu_yxfb_yxio_b16",2], + "4285475880886685878": ["convolution_gpu_bfyx_gemm_like",2], + "17077815973022307612": ["convolution_gpu_yxfb_yxio_b16",2], + "9111988592015450418": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "13835908664998757647": ["fully_connected_gpu_fb_oi_ref",1], + "10447947790216991304": ["convolution_gpu_bfyx_gemm_like",2], + "5721096633060535553": ["convolution_gpu_yxfb_yxio_b16",2], + "12458921031453334451": ["convolution_gpu_yxfb_yxio_b16",2], + "15816807118780455948": ["convolution_gpu_yxfb_yxio_b16",2], + "1237920404306733800": ["convolution_gpu_bfyx_gemm_like",1], + "9834941975457910988": ["convolution_gpu_yxfb_yxio_b16",2], + "6208201398783088425": ["convolution_gpu_bfyx_gemm_like",2], + "8265982881100325775": ["convolution_gpu_yxfb_yxio_b16",2], + "8638074773026771425": ["convolution_gpu_yxfb_yxio_b16",2], + "6846760451124717672": ["convolution_gpu_yxfb_yxio_b16",1], + "11604111639041106489": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "5465400164581117113": ["convolution_gpu_bfyx_gemm_like",2], + "17723621158215826108": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "5359510718430377298": ["convolution_gpu_yxfb_yxio_b16",2], + "4403753181729432604": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18140951659547259039": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "7600296832974673294": ["convolution_gpu_yxfb_yxio_b16",2], + 
"3911736807429733938": ["convolution_gpu_yxfb_yxio_b16",2], + "1701412735970485849": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "8527055001340219573": ["convolution_gpu_yxfb_yxio_b16",2], + "2562815925396318565": ["convolution_gpu_yxfb_yxio_b16",2], + "12068974703657294908": ["convolution_gpu_bfyx_1x1",2], + "13323186744342557015": ["convolution_gpu_yxfb_yxio_b16",1], + "2111669705686676421": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "9659837320293869285": ["convolution_gpu_yxfb_yxio_b16",1], + "14188157670969097508": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2457671437276780303": ["convolution_gpu_yxfb_yxio_b16",2], + "11583985978586657985": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "6018481198468872040": ["convolution_gpu_yxfb_yxio_b16",2], + "835053793432636355": ["convolution_gpu_yxfb_yxio_b16",2], + "14337168375989245254": ["convolution_gpu_yxfb_yxio_b16",2], + "9127827617126714860": ["fully_connected_gpu_fb_oi_ref",2], + "17170858505976681742": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8131879590716437354": ["convolution_gpu_yxfb_yxio_b16",2], + "182115051096556835": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "1982176363226079588": ["convolution_gpu_bfyx_gemm_like",2], + "13145474177271090694": ["convolution_gpu_bfyx_os_iyx_osv16",950], + "16065744898134487748": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "17174919737114915467": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "5420215220876162902": ["convolution_gpu_yxfb_yxio_b16",2], + "2129742884686884642": ["convolution_gpu_yxfb_yxio_b16",2], + "4241055784642339756": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "7933217973342728190": ["convolution_gpu_yxfb_yxio_b16",2], + "5950285227163574810": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "7868973874302246233": ["convolution_gpu_bfyx_gemm_like",1], + "10232429887105708502": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6439316331231400868": ["convolution_gpu_yxfb_yxio_b16",0], + "16882092367103683293": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16988275131627316108": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "6928136130626403937": ["convolution_gpu_bfyx_gemm_like",2], + "9731370183088819573": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "4416793079965040181": ["convolution_gpu_yxfb_yxio_b16",1], + "9526266653688168429": ["convolution_gpu_yxfb_yxio_b16",2], + "15879385408480411034": ["convolution_gpu_yxfb_yxio_b16",2], + "16913004986170202203": ["convolution_gpu_bfyx_gemm_like",2], + "7181154048972884375": ["convolution_gpu_bfyx_gemm_like",2], + "1122856374602590533": ["convolution_gpu_bfyx_1x1",1], + "677249604491773387": ["convolution_gpu_bfyx_gemm_like",2], + "990199360818917334": ["convolution_gpu_yxfb_yxio_b16",2], + "4455369117448405874": ["convolution_gpu_bfyx_1x1",2], + "3369689552455141157": ["convolution_gpu_yxfb_yxio_b16",2], + "5802466130040230797": ["convolution_gpu_yxfb_yxio_b16",2], + "973966345068677905": ["convolution_gpu_bfyx_1x1",2], + "2728938624042183713": ["convolution_gpu_bfyx_gemm_like",2], + "14526262781657292025": ["convolution_gpu_yxfb_yxio_b16",2], + "12984970933638742657": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "9629460794894999510": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "601430670855155006": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "12439827609628473238": ["convolution_gpu_yxfb_yxio_b16",2], + "17634966178519099371": ["convolution_gpu_bfyx_1x1",2], + "5770286476124511234": ["convolution_gpu_bfyx_gemm_like",1], + "17158401628206867933": ["convolution_gpu_bfyx_os_iyx_osv16",704], + 
"2114232149447438823": ["convolution_gpu_bfyx_gemm_like",2], + "7650862961269327235": ["convolution_gpu_bfyx_1x1",2], + "3940619509778739158": ["convolution_gpu_yxfb_yxio_b16",1], + "7843180034077880658": ["convolution_gpu_yxfb_yxio_b16",1], + "5195511638783481084": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "17718424965214606218": ["convolution_gpu_yxfb_yxio_b16",2], + "7287107719392705356": ["convolution_gpu_bfyx_os_iyx_osv16",4], + "14835309921389262864": ["convolution_gpu_bfyx_1x1",2], + "14199158130218117084": ["convolution_gpu_bfyx_gemm_like",2], + "394778201589371681": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "14807357397951247957": ["convolution_gpu_yxfb_yxio_b16",1], + "4171374172427814762": ["convolution_gpu_yxfb_yxio_b16",2], + "17281826959243966826": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8262469434265124590": ["convolution_gpu_yxfb_yxio_b16",1], + "16541970206584576833": ["convolution_gpu_bfyx_gemm_like",2], + "12004628115138530335": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "11988546375476924356": ["convolution_gpu_bfyx_os_iyx_osv16",431], + "15901724303713479611": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3863816884636503247": ["convolution_gpu_bfyx_gemm_like",2], + "3725013268198063198": ["convolution_gpu_bfyx_1x1",2], + "11942736969933408358": ["convolution_gpu_bfyx_gemm_like",2], + "14897384423894125457": ["convolution_gpu_yxfb_yxio_b16",2], + "12011606174372081253": ["convolution_gpu_yxfb_yxio_b16",2], + "1697248235682953135": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "8907982643256296667": ["convolution_gpu_bfyx_1x1",1], + "8010456208258134834": ["convolution_gpu_yxfb_yxio_b16",2], + "6538526180355194359": ["convolution_gpu_yxfb_yxio_b16",2], + "18359731130169236059": ["convolution_gpu_yxfb_yxio_b16",2], + "6097086855988597139": ["convolution_gpu_bfyx_1x1",2], + "9059418187274548462": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "9942099207256025216": ["convolution_gpu_bfyx_gemm_like",2], + "1680468564927032670": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4261215727469154244": ["convolution_gpu_yxfb_yxio_b16",2], + "4346591404756288097": ["convolution_gpu_bfyx_gemm_like",2], + "14001048251986195179": ["convolution_gpu_bfyx_gemm_like",2], + "6726099352298108756": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6997971129340865650": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "10225565543636007389": ["convolution_gpu_yxfb_yxio_b16",2], + "13094289895577333088": ["convolution_gpu_yxfb_yxio_b16",2], + "15599983560500910839": ["convolution_gpu_yxfb_yxio_b16",2], + "18131954418490925431": ["convolution_gpu_bfyx_os_iyx_osv16",889], + "1919535500129437217": ["convolution_gpu_yxfb_yxio_b16",2], + "5539793555189956907": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "8532217744217419503": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "7581174843529024536": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "6624079551747071383": ["convolution_gpu_yxfb_yxio_b16",1], + "16364494883229084045": ["convolution_gpu_bfyx_gemm_like",2], + "4723919313760470311": ["convolution_gpu_yxfb_yxio_b16",1], + "11324651029379152442": ["convolution_gpu_bfyx_1x1",2], + "3358616456137155015": ["convolution_gpu_yxfb_yxio_b16",2], + "264466528528245004": ["convolution_gpu_yxfb_yxio_b16",1], + "9062774198518904260": ["convolution_gpu_bfyx_gemm_like",2], + "11872464450773754851": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "4732226322522411018": ["fully_connected_gpu_fb_io_b8_f8_vload",0], + "13247725847475539658": ["convolution_gpu_bfyx_1x1",2], + "4168273493370024327": 
["convolution_gpu_bfyx_1x1",1], + "860852602930021016": ["convolution_gpu_yxfb_yxio_b16",2], + "15190508870639648203": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15581997249051127645": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "6136232084354304563": ["convolution_gpu_yxfb_yxio_b16",2], + "2744566213784972700": ["convolution_gpu_yxfb_yxio_b16",2], + "2412069259085234287": ["convolution_gpu_yxfb_yxio_b16",1], + "2294800960010879540": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "11841034668170849494": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "11682041005124075890": ["convolution_gpu_yxfb_yxio_b16",2], + "3711525118850629466": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14281154151197472605": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "9099720270958987421": ["convolution_gpu_bfyx_1x1",2], + "5568753513029409478": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "9433875341212148858": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "5735703235236456131": ["convolution_gpu_bfyx_os_iyx_osv16",264], + "9815961128076948768": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "9480653639044390919": ["convolution_gpu_bfyx_os_iyx_osv16",344], + "4465701487417893814": ["convolution_gpu_bfyx_gemm_like",2], + "4329042569031331949": ["convolution_gpu_yxfb_yxio_b16",2], + "543472136359161929": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "3780320160034246719": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "1646638859396929303": ["convolution_gpu_yxfb_yxio_b16",2], + "3224352307778512793": ["convolution_gpu_bfyx_gemm_like",1], + "789202969657820559": ["convolution_gpu_yxfb_yxio_b16",2], + "12988253829685880778": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "8978764053524288494": ["convolution_gpu_bfyx_gemm_like",0], + "6935581283700404601": ["convolution_gpu_yxfb_yxio_b16",2], + "3988024997010367546": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "9436893310034662243": ["convolution_gpu_bfyx_gemm_like",2], + "16934879647229234163": ["convolution_gpu_bfyx_gemm_like",2], + "2527276292172180386": ["convolution_gpu_bfyx_gemm_like",2], + "2826762745628486040": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "12782932626966309185": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "12864558900883069118": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "1786732163438555728": ["convolution_gpu_yxfb_yxio_b16",0], + "2149582237161177965": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "12279771749366327372": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "2173867324489962689": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13530377297525480029": ["convolution_gpu_yxfb_yxio_b16",1], + "12495003066477974474": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "9787359208094141129": ["fully_connected_gpu_fb_oi_ref",1], + "6709883527730513363": ["convolution_gpu_yxfb_yxio_b16",2], + "7172357320005702833": ["convolution_gpu_yxfb_yxio_b16",2], + "16139615240471264488": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "6331794802915121861": ["convolution_gpu_yxfb_yxio_b16",2], + "726985753660756762": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "8784358107340738205": ["convolution_gpu_yxfb_yxio_b16",2], + "15428591250165788477": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "2501411300945696806": ["convolution_gpu_yxfb_yxio_b16",2], + "7378840969627751667": ["convolution_gpu_yxfb_yxio_b16",2], + "14568618538516685994": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "6126579157025017808": ["convolution_gpu_yxfb_yxio_b16",2], + "8125500765566111746": ["convolution_gpu_yxfb_yxio_b16",2], + "9663847096617096629": 
["convolution_gpu_yxfb_yxio_b16",2], + "2832268621630415376": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "3924212595662208655": ["convolution_gpu_yxfb_yxio_b16",2], + "8585205898894363799": ["convolution_gpu_yxfb_yxio_b16",2], + "7139714914586273766": ["convolution_gpu_bfyx_os_iyx_osv16",517], + "10184417796355593956": ["convolution_gpu_yxfb_yxio_b16",2], + "7605652809856543211": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "8267783192628619295": ["convolution_gpu_yxfb_yxio_b16",2], + "10358170616931426647": ["convolution_gpu_yxfb_yxio_b16",1], + "14403132596827435096": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "14770895149190975433": ["convolution_gpu_yxfb_yxio_b16",1], + "2931988747601319855": ["convolution_gpu_bfyx_1x1",2], + "17891499682354369344": ["convolution_gpu_bfyx_gemm_like",2], + "4306052436602921234": ["convolution_gpu_yxfb_yxio_b16",2], + "14483314305369207554": ["convolution_gpu_bfyx_1x1",2], + "1930929857644673460": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12992061224471212714": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11359409533744011242": ["convolution_gpu_bfyx_gemm_like",2], + "17054207561525574617": ["convolution_gpu_yxfb_yxio_b16",2], + "9488453013746383896": ["convolution_gpu_bfyx_gemm_like",2], + "6479042072492268780": ["convolution_gpu_yxfb_yxio_b16",2], + "10981374120597916521": ["convolution_gpu_yxfb_yxio_b16",1], + "18424611729838147994": ["convolution_gpu_yxfb_yxio_b16",2], + "7099035779223341587": ["convolution_gpu_yxfb_yxio_b16",2], + "9207799012657103903": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "10811224523636009881": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15866935886105967122": ["convolution_gpu_yxfb_yxio_b16",2], + "8700574100180128776": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "16270745071180354612": ["convolution_gpu_bfyx_gemm_like",2], + "10760094119259477688": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "2920322372993101148": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "9169935203300589222": ["convolution_gpu_yxfb_yxio_b16",1], + "15222260213708019662": ["convolution_gpu_yxfb_yxio_b16",2], + "15154700439767512396": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "15710826363434377015": ["convolution_gpu_yxfb_yxio_b16",2], + "7332664632757815486": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "2706024586717944825": ["convolution_gpu_yxfb_yxio_b16",2], + "6363788325163726004": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14847662630748580880": ["convolution_gpu_yxfb_yxio_b16",2], + "7075659071934895087": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9694701402170070080": ["convolution_gpu_yxfb_yxio_b16",2], + "7540655869186258692": ["convolution_gpu_yxfb_yxio_b16",2], + "15223164574152266895": ["convolution_gpu_bfyx_1x1",2], + "1309867416606346543": ["convolution_gpu_bfyx_os_iyx_osv16",195], + "16632786413927045192": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12672995204641007004": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "12213908871711628660": ["convolution_gpu_yxfb_yxio_b16",2], + "3815222814331650224": ["convolution_gpu_yxfb_yxio_b16",2], + "10833423331830484028": ["convolution_gpu_yxfb_yxio_b16",2], + "1470933384474984858": ["convolution_gpu_bfyx_1x1",2], + "151851883170419907": ["convolution_gpu_yxfb_yxio_b16",2], + "1798440805196304745": ["convolution_gpu_yxfb_yxio_b16",2], + "15352064186447212862": ["convolution_gpu_yxfb_yxio_b16",2], + "4861982518177129729": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "9195732599757736182": ["convolution_gpu_bfyx_os_iyx_osv16",137], + "2722062599746670336": 
["convolution_gpu_yxfb_yxio_b16",2], + "11070968498963106073": ["fully_connected_gpu_fb_io_block_fp16",2], + "856949500975232838": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "12489973984967168447": ["convolution_gpu_bfyx_1x1",2], + "18180655791734632264": ["convolution_gpu_bfyx_gemm_like",2], + "13621339501067135142": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "17705992851440826353": ["convolution_gpu_yxfb_yxio_b16",2], + "2575631797904040925": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "17724604495865223459": ["convolution_gpu_bfyx_gemm_like",2], + "1711220333751274603": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "4544242784357021697": ["convolution_gpu_bfyx_gemm_like",2], + "12771841901357553928": ["convolution_gpu_yxfb_yxio_b16",2], + "17084977396231597605": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5334291640387922287": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "7992444232916226938": ["convolution_gpu_yxfb_yxio_b16",1], + "13434576226708227155": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "10437367877444543776": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "4849343880559509889": ["convolution_gpu_bfyx_1x1",2], + "11086464266772450142": ["convolution_gpu_yxfb_yxio_b16",2], + "15464554714318666871": ["convolution_gpu_yxfb_yxio_b16",2], + "11799180632798787251": ["convolution_gpu_yxfb_yxio_b16",2], + "17704040183891532914": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "11597391933877736800": ["convolution_gpu_bfyx_gemm_like",2], + "4021558014531645922": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "9078447949109922472": ["convolution_gpu_yxfb_yxio_b16",2], + "12831123539633580270": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "272730229972987861": ["convolution_gpu_bfyx_os_iyx_osv16",1035], + "108442764389420633": ["convolution_gpu_yxfb_yxio_b16",2], + "3788462090984291082": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "15551338663759394064": ["convolution_gpu_yxfb_yxio_b16",1], + "10538010212480716275": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "14173531787508017136": ["convolution_gpu_yxfb_yxio_b16",2], + "12213354854947437262": ["convolution_gpu_bfyx_1x1",2], + "12867177334690636800": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "16585502133291740543": ["convolution_gpu_yxfb_yxio_b16",2], + "11070446574652704629": ["convolution_gpu_yxfb_yxio_b16",2], + "9477562342190423343": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "4023281997496669037": ["convolution_gpu_yxfb_yxio_b16",2], + "9967101735808367971": ["convolution_gpu_bfyx_1x1",2], + "2057158988261512114": ["convolution_gpu_bfyx_1x1",2], + "14263055580023018733": ["convolution_gpu_yxfb_yxio_b16",2], + "688897645422834994": ["convolution_gpu_yxfb_yxio_b16",2], + "6232452664016831516": ["convolution_gpu_yxfb_yxio_b16",2], + "15715029280006557222": ["convolution_gpu_yxfb_yxio_b16",1], + "2840794055129352139": ["convolution_gpu_yxfb_yxio_b16",2], + "14905520834426630145": ["convolution_gpu_bfyx_gemm_like",2], + "16096353398003405565": ["convolution_gpu_yxfb_yxio_b16",2], + "15997145184054496085": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11585430081839020501": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15820005010263193043": ["convolution_gpu_yxfb_yxio_b16",2], + "3141886504884887200": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "5572956736535433608": ["convolution_gpu_bfyx_1x1",2], + "6294240435687565243": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "3427691447288240419": ["convolution_gpu_yxfb_yxio_b16",1], + "14731054961557547253": ["convolution_gpu_yxfb_yxio_b16",2], + 
"6673966852801136416": ["convolution_gpu_bfyx_os_iyx_osv16",224], + "1212319037405620223": ["convolution_gpu_bfyx_gemm_like",2], + "6703148006012061136": ["convolution_gpu_yxfb_yxio_b16",2], + "382811963722907674": ["convolution_gpu_bfyx_gemm_like",2], + "12617736879671137111": ["convolution_gpu_yxfb_yxio_b16",2], + "10419440621736450993": ["convolution_gpu_yxfb_yxio_b16",2], + "17211590259060346125": ["convolution_gpu_yxfb_yxio_b16",1], + "13328583512713703122": ["convolution_gpu_yxfb_yxio_b16",2], + "12741457056869452536": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "415232223198122046": ["convolution_gpu_yxfb_yxio_b16",2], + "14985236276429954162": ["convolution_gpu_bfyx_gemm_like",2], + "15487730714504758208": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "4264284648458489052": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "4299773714254046691": ["convolution_gpu_yxfb_yxio_b16",2], + "5321698540631249776": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "7615563770941714046": ["convolution_gpu_yxfb_yxio_b16",2], + "6664482192233202590": ["convolution_gpu_bfyx_gemm_like",2], + "3155353791103196186": ["convolution_gpu_yxfb_yxio_b16",2], + "3894121333485095575": ["convolution_gpu_yxfb_yxio_b16",2], + "16172528828198474326": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "9275371801303143499": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "3928356751040028375": ["convolution_gpu_bfyx_gemm_like",2], + "11640225461345567929": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "5955575949957198434": ["convolution_gpu_bfyx_gemm_like",1], + "11636129433022017868": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "669771152920944125": ["convolution_gpu_bfyx_gemm_like",2], + "60509335250891515": ["convolution_gpu_bfyx_gemm_like",2], + "14510495923021693109": ["convolution_gpu_yxfb_yxio_b16",2], + "5762878778443755104": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "4491380839102267034": ["convolution_gpu_bfyx_gemm_like",1], + "16437124655147660375": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "17487594336237597163": ["convolution_gpu_yxfb_yxio_b16",2], + "3002986032379998259": ["convolution_gpu_bfyx_os_iyx_osv16",527], + "8638227907054657946": ["convolution_gpu_yxfb_yxio_b16",2], + "16574710115918192418": ["convolution_gpu_bfyx_gemm_like",2], + "467975197394411990": ["convolution_gpu_bfyx_gemm_like",1], + "8130920994920685157": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "17647962002015093887": ["convolution_gpu_bfyx_gemm_like",2], + "6203765709597125063": ["convolution_gpu_bfyx_gemm_like",1], + "4714289593698160876": ["convolution_gpu_yxfb_yxio_b16",2], + "13609660900720370993": ["convolution_gpu_bfyx_1x1",1], + "10415046594066474634": ["convolution_gpu_bfyx_gemm_like",2], + "12624762527234542946": ["convolution_gpu_yxfb_yxio_b16",2], + "9441060601228656341": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "17377315194963069204": ["fully_connected_gpu_fb_oi_ref",1], + "15809639778580769565": ["convolution_gpu_bfyx_gemm_like",2], + "17587625589456309495": ["convolution_gpu_yxfb_yxio_b16",2], + "16728762255357411770": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "8069537351442302814": ["convolution_gpu_bfyx_os_iyx_osv16",271], + "2147896649835170790": ["convolution_gpu_yxfb_yxio_b16",2], + "1142968634734769401": ["convolution_gpu_yxfb_yxio_b16",2], + "5245526691775741296": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6300691162962736560": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "16653412888821076903": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "5074273865983613482": 
["convolution_gpu_bfyx_direct_10_12_16",1], + "9375272277044782377": ["convolution_gpu_bfyx_gemm_like",0], + "10465119306486335226": ["convolution_gpu_yxfb_yxio_b16",2], + "17436550598696178210": ["convolution_gpu_yxfb_yxio_b16",2], + "1854612313463195535": ["convolution_gpu_yxfb_yxio_b16",0], + "14758040027936817208": ["convolution_gpu_yxfb_yxio_b16",2], + "9280431727790048190": ["convolution_gpu_bfyx_1x1",2], + "2807516818436584831": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "1638858323987412931": ["convolution_gpu_yxfb_yxio_b16",2], + "18035673326929466074": ["convolution_gpu_bfyx_direct_10_12_16",2], + "417352773179383568": ["convolution_gpu_yxfb_yxio_b16",2], + "4339711224604149541": ["convolution_gpu_bfyx_gemm_like",2], + "5266313052389515491": ["convolution_gpu_yxfb_yxio_b16",2], + "7946262362930618714": ["convolution_gpu_yxfb_yxio_b16",2], + "14835641172229643545": ["convolution_gpu_bfyx_gemm_like",2], + "5150256051921098637": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "14001406016806064079": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10899110544832584656": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6981294059746462667": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "9316082753126682958": ["convolution_gpu_bfyx_gemm_like",2], + "9996142812492415452": ["convolution_gpu_yxfb_yxio_b16",1], + "13835859040765465258": ["convolution_gpu_bfyx_gemm_like",1], + "13472577372534605883": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5762631094740444698": ["convolution_gpu_yxfb_yxio_b16",1], + "11450378244355788918": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "11031358859656806724": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "2920017342405650206": ["convolution_gpu_yxfb_yxio_b16",2], + "11624226818593966530": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6461637373691101671": ["convolution_gpu_bfyx_direct_10_12_16",2], + "215512025430490450": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "17264608538692763688": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3522383297921565178": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "6070612528095353265": ["convolution_gpu_yxfb_yxio_b16",2], + "5592526760253524303": ["convolution_gpu_bfyx_os_iyx_osv16",801], + "8106738346643994005": ["convolution_gpu_bfyx_gemm_like",2], + "10656486867659934705": ["convolution_gpu_bfyx_os_iyx_osv16",477], + "3341302541468955849": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16111630594575598044": ["convolution_gpu_yxfb_yxio_b16",2], + "17439276474731842060": ["convolution_gpu_yxfb_yxio_b16",2], + "10069896554844445748": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "3837190939606792435": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "739676584505475609": ["convolution_gpu_bfyx_gemm_like",2], + "4306881509708040723": ["convolution_gpu_yxfb_yxio_b16",2], + "178353385245384751": ["convolution_gpu_bfyx_os_iyx_osv16",969], + "1466455001976212160": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "15226556774612169126": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "18255227391100087860": ["convolution_gpu_bfyx_1x1",2], + "16120120950870908964": ["convolution_gpu_yxfb_yxio_b16",2], + "2219693989290882970": ["convolution_gpu_yxfb_yxio_b16",2], + "2770397466252831892": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "4091702228990140696": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10509933181132310969": ["convolution_gpu_bfyx_gemm_like",2], + "166091609652531090": ["convolution_gpu_yxfb_yxio_b16",2], + "47872288115972996": ["convolution_gpu_yxfb_yxio_b16",2], + "2128612971571865547": 
["convolution_gpu_bfyx_os_iyx_osv16",366], + "8317673282128335201": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "12473600360154597915": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "18094592431313771787": ["convolution_gpu_yxfb_yxio_b16",2], + "8787438180071123604": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2930898141522848681": ["convolution_gpu_bfyx_1x1",2], + "6445721440921372329": ["convolution_gpu_yxfb_yxio_b16",2], + "17012832508134584917": ["convolution_gpu_yxfb_yxio_b16",2], + "12264240305528403865": ["convolution_gpu_yxfb_yxio_b16",2], + "4433497906256257606": ["convolution_gpu_yxfb_yxio_b16",2], + "8615481457481938667": ["convolution_gpu_bfyx_os_iyx_osv16",419], + "16267682394077585279": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "7752913515036871482": ["convolution_gpu_bfyx_gemm_like",1], + "11069983292783104310": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "4958835037528182801": ["convolution_gpu_bfyx_1x1",1], + "13390197134230598693": ["convolution_gpu_yxfb_yxio_b16",2], + "10217182484138821482": ["convolution_gpu_yxfb_yxio_b16",2], + "16780457022162749898": ["convolution_gpu_bfyx_gemm_like",0], + "5445584581720919223": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "17238880534517721334": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "12480527132372884168": ["convolution_gpu_bfyx_1x1",1], + "3106922888635965020": ["convolution_gpu_bfyx_gemm_like",2], + "10672380526821947133": ["convolution_gpu_bfyx_os_iyx_osv16",339], + "16027853590391209100": ["convolution_gpu_bfyx_gemm_like",2], + "6260115080574637314": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "5497751772699578150": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12294364015803004575": ["fully_connected_gpu_fb_io_block_fp16",2], + "5940337324384948573": ["convolution_gpu_bfyx_gemm_like",2], + "438528596970898721": ["convolution_gpu_bfyx_gemm_like",1], + "3976197003067656339": ["convolution_gpu_yxfb_yxio_b16",2], + "2263637493894079492": ["convolution_gpu_yxfb_yxio_b16",2], + "15813044197987178947": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "17433340097721474017": ["convolution_gpu_yxfb_yxio_b16",2], + "9515771738501683": ["convolution_gpu_yxfb_yxio_b16",2], + "12590922530749026871": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "6182829358839578529": ["convolution_gpu_bfyx_gemm_like",2], + "13803790014241837327": ["convolution_gpu_yxfb_yxio_b16",1], + "13367787254519749641": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "10178145641713631806": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "1454014148777456006": ["convolution_gpu_yxfb_yxio_b16",2], + "49948277487706148": ["convolution_gpu_bfyx_1x1",2], + "16921939234324970069": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "867868384380428650": ["convolution_gpu_yxfb_yxio_b16",2], + "1068155851494601726": ["convolution_gpu_yxfb_yxio_b16",2], + "1157947252370351851": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2621495864635590903": ["convolution_gpu_yxfb_yxio_b16",2], + "2929190644951986399": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "6115915509370042166": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "1338581414403268264": ["convolution_gpu_yxfb_yxio_b16",2], + "12850195004093999773": ["convolution_gpu_yxfb_yxio_b16",2], + "9532499374173117612": ["fully_connected_gpu_fb_oi_ref",1], + "12061567381160185735": ["convolution_gpu_bfyx_1x1",1], + "8444259010311137762": ["convolution_gpu_bfyx_os_iyx_osv16",668], + "9178915201681884122": ["convolution_gpu_yxfb_yxio_b16",2], + "17045386022302353268": ["convolution_gpu_yxfb_yxio_b16",2], + "6107700818115209289": 
["convolution_gpu_yxfb_yxio_b16",2], + "15141893564826036993": ["convolution_gpu_yxfb_yxio_b16",2], + "8984436655107983227": ["convolution_gpu_bfyx_gemm_like",2], + "9920155432685318259": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "14614844213016502202": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "17766628441954343001": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "7823257556787476006": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2973337989445169388": ["convolution_gpu_yxfb_yxio_b16",1], + "3010520839193613803": ["convolution_gpu_yxfb_yxio_b16",2], + "8000679297338683619": ["convolution_gpu_yxfb_yxio_b16",2], + "6557428245898292304": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2598267743388306204": ["convolution_gpu_bfyx_gemm_like",2], + "15820359925623438341": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "14840851809642905875": ["convolution_gpu_yxfb_yxio_b16",2], + "447943521999310356": ["convolution_gpu_yxfb_yxio_b16",2], + "16361932270527364507": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "4422642146063042868": ["convolution_gpu_yxfb_yxio_b16",2], + "17549411807772646930": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "9974905660671605427": ["convolution_gpu_yxfb_yxio_b16",2], + "17010172246526353957": ["convolution_gpu_bfyx_1x1",2], + "18148431787172327554": ["convolution_gpu_yxfb_yxio_b16",2], + "7338932272767555117": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5941092474669713339": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11604794601689380990": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "598214270378842167": ["convolution_gpu_bfyx_os_iyx_osv16",363], + "6767159196241633301": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "824380206255396866": ["convolution_gpu_yxfb_yxio_b16",2], + "10753540518493641553": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8613740762403897614": ["convolution_gpu_yxfb_yxio_b16",2], + "142329025839464842": ["convolution_gpu_bfyx_1x1",2], + "6286349307417232815": ["convolution_gpu_yxfb_yxio_b16",2], + "883436333317162926": ["convolution_gpu_bfyx_1x1",2], + "12801481303602178879": ["convolution_gpu_bfyx_gemm_like",2], + "5109636469531439569": ["convolution_gpu_yxfb_yxio_b16",2], + "4750755523645265967": ["convolution_gpu_bfyx_gemm_like",2], + "12430677767405883160": ["convolution_gpu_bfyx_os_iyx_osv16",572], + "1173986078589662704": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "11563334365673075610": ["convolution_gpu_yxfb_yxio_b16",2], + "9161616741940575576": ["convolution_gpu_yxfb_yxio_b16",2], + "10477588607457125173": ["convolution_gpu_bfyx_gemm_like",2], + "4723643671527109645": ["convolution_gpu_yxfb_yxio_b16",2], + "10387844339156517393": ["convolution_gpu_bfyx_1x1",2], + "16549854027697846882": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "16839741351990811959": ["convolution_gpu_bfyx_gemm_like",2], + "16243196137456624852": ["convolution_gpu_bfyx_gemm_like",2], + "13020331397245585657": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "14318347197994059448": ["convolution_gpu_yxfb_yxio_b16",2], + "4251673416603443503": ["convolution_gpu_yxfb_yxio_b16",2], + "6677367803113594603": ["convolution_gpu_yxfb_yxio_b16",2], + "15298221796479574600": ["convolution_gpu_yxfb_yxio_b16",1], + "5546447512898130524": ["convolution_gpu_yxfb_yxio_b16",2], + "13854845390344305906": ["convolution_gpu_yxfb_yxio_b16",2], + "16184142990117192433": ["convolution_gpu_yxfb_yxio_b16",2], + "12703696322769371912": ["convolution_gpu_bfyx_gemm_like",2], + "2920840796593281126": ["convolution_gpu_bfyx_gemm_like",2], + "2064464435352777854": 
["convolution_gpu_bfyx_gemm_like",1], + "2066731703492755469": ["convolution_gpu_bfyx_os_iyx_osv16",692], + "15943141845766932879": ["convolution_gpu_bfyx_1x1",2], + "18221867262301937903": ["convolution_gpu_bfyx_1x1",1], + "16956263773967652552": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "4927360358387344983": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15438530452161762045": ["convolution_gpu_yxfb_yxio_b16",1], + "12051595062513871723": ["convolution_gpu_bfyx_1x1",2], + "14288463473159113326": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15555083739490354527": ["convolution_gpu_bfyx_gemm_like",2], + "5479590921345335946": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "14421061973479991516": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "18076121920579110076": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "16998662249038174039": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "1240102354814495870": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "707979507145930311": ["convolution_gpu_bfyx_gemm_like",1], + "14795626641169374231": ["convolution_gpu_bfyx_os_iyx_osv16",509], + "8512711227383782401": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "4738743763536059708": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6578804773136886939": ["convolution_gpu_bfyx_gemm_like",2], + "18180491232489548313": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "3572202652824023801": ["convolution_gpu_bfyx_os_iyx_osv16",1031], + "17546566148752689536": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "3816979903860227798": ["convolution_gpu_bfyx_gemm_like",2], + "4790960977352818689": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "4868400250190558111": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "6631816968511312100": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "3509502334639215181": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "3697631094971930011": ["convolution_gpu_bfyx_gemm_like",2], + "1467428583618467133": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "9335016444137172241": ["convolution_gpu_bfyx_gemm_like",2], + "12255528292506999241": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "8127853538569353431": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "1484007449719260391": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "9056812077282494074": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "7127306913758514626": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "14630499010941056793": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "10209532888121442060": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "17354626928258309128": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1569111625440278287": ["convolution_gpu_bfyx_gemm_like",2], + "213518984547400496": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "384240534894352154": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "4732699611696731044": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15739274921308457528": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "7059729537732609153": ["convolution_gpu_bfyx_os_iyx_osv16",858], + "15743461017318513847": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "2778141440914991349": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4588420324030315321": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "885661562948597780": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15687441275464931484": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15094664469997373662": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "5358925179582853152": ["convolution_gpu_bfyx_os_iyx_osv16",186], + "3610579553304450107": ["convolution_gpu_bfyx_os_iyx_osv16",462], + 
"3047710665820732705": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8363432163596927598": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "11758765408733113291": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "5050495757462452653": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8399477322910720113": ["convolution_gpu_bfyx_gemm_like",2], + "8921169563466511475": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "3727142736386026852": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "12571532345206950176": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "9552615241912277692": ["convolution_gpu_bfyx_gemm_like",2], + "16628180201355989101": ["convolution_gpu_bfyx_os_iyx_osv16",884], + "17808913959977434594": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "7000486794832106857": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "286393043958202995": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "6258191734224827354": ["convolution_gpu_bfyx_os_iyx_osv16",653], + "18043745678739016406": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "17946191056428828467": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "6263019986730305851": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9546990560009724329": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "16462602383546733062": ["convolution_gpu_bfyx_os_iyx_osv16",1035], + "1350953652678789564": ["convolution_gpu_bfyx_os_iyx_osv16",271], + "330278641539729021": ["convolution_gpu_bfyx_gemm_like",2], + "5912451559447635837": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "1774158624592967937": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "16985912104363932350": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "2908856453997530641": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "17059095074211347838": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "14668529234172928874": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "16863960779539003201": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "14346703182362139650": ["convolution_gpu_bfyx_gemm_like",2], + "5522698342845820411": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "10114123606924808948": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "14515066741400300669": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "5280450544965361875": ["convolution_gpu_bfyx_gemm_like",1], + "15025260753866131193": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "604467633591545941": ["convolution_gpu_bfyx_gemm_like",2], + "1500571771538985941": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "7256947320128669983": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "16256970928603738516": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "16426179645101678763": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "10269005969451576527": ["convolution_gpu_bfyx_os_iyx_osv16",246], + "6745633232989303110": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "12364947728685604753": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "12173409033330010794": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "419783127503173016": ["convolution_gpu_bfyx_os_iyx_osv16",564], + "11128727891847758901": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "1093840152689636371": ["convolution_gpu_bfyx_gemm_like",1], + "9714770878761308566": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15083602050538795803": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7527121935101118719": ["convolution_gpu_bfyx_gemm_like",2], + "5116562847410288642": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "5385395378424322451": ["convolution_gpu_bfyx_gemm_like",2], + "11602830611894444581": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5589350202160007768": 
["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "16761856644242716357": ["convolution_gpu_bfyx_os_iyx_osv16",469], + "2096167792705935744": ["convolution_gpu_bfyx_gemm_like",2], + "3433877094202077256": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "12610004507393467447": ["convolution_gpu_bfyx_gemm_like",2], + "15939740070666326125": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "8422541638844255768": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13082713280504953535": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "8961544327690568390": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11883632480024839484": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5769404877199637961": ["convolution_gpu_bfyx_gemm_like",2], + "3296059171653513862": ["convolution_gpu_bfyx_gemm_like",2], + "9968496035529786888": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "3664842151999943": ["convolution_gpu_bfyx_gemm_like",1], + "11539652577193034099": ["convolution_gpu_bfyx_os_iyx_osv16",300], + "2524233418633897945": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "3743573500773847162": ["convolution_gpu_bfyx_os_iyx_osv16",506], + "12541834857357563605": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "3813463368918975003": ["convolution_gpu_bfyx_gemm_like",2], + "7530197659550301431": ["convolution_gpu_bfyx_gemm_like",2], + "9700098364581157575": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4269447138276727632": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "1061595672605627170": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "7569785094993085356": ["convolution_gpu_bfyx_gemm_like",2], + "11504777464995699839": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "8224143262995973449": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1501328995320618233": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "12069726772532946193": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "16197538586133639338": ["convolution_gpu_bfyx_gemm_like",1], + "237384442106085756": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "15972830392998437739": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "15421166985948480394": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "6794427012971589670": ["convolution_gpu_bfyx_gemm_like",2], + "2420425134749678611": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "8050798452111667069": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "5824801192141531089": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "3668065353749623655": ["convolution_gpu_bfyx_os_iyx_osv16",1022], + "4251588408225461731": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "11113256687741667688": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "16582132711225619740": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "16957170318200599740": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12644942072153919043": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "17715478364817621621": ["convolution_gpu_bfyx_gemm_like",2], + "2854124603710900850": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "9380980604821454646": ["convolution_gpu_bfyx_gemm_like",1], + "1879844536951785808": ["convolution_gpu_bfyx_gemm_like",2], + "1086052166358768751": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "861813331533609605": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "17434429579652310107": ["convolution_gpu_bfyx_gemm_like",2], + "14011124615649605281": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "16440449399643706863": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "11733721371402545268": ["fully_connected_gpu_fb_io_ref",2], + "15816540550252147706": ["convolution_gpu_bfyx_direct_10_12_16",2], + 
"8285478622349266483": ["convolution_gpu_bfyx_os_iyx_osv16",914], + "13979227237506927267": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "10492401059875127091": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "12992194515157698316": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "15891662883560480723": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "4600698444492242585": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "7157064096682175957": ["convolution_gpu_bfyx_os_iyx_osv16",179], + "14221578799010900252": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "9213886570531053949": ["convolution_gpu_bfyx_os_iyx_osv16",429], + "1285313118947640320": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15430549683839591544": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "15858485865603722138": ["convolution_gpu_bfyx_gemm_like",2], + "2116524516810466877": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "12182468247297592907": ["convolution_gpu_bfyx_gemm_like",1], + "6949539207944972855": ["convolution_gpu_bfyx_gemm_like",2], + "5582107298039488951": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "11773726534842908728": ["convolution_gpu_bfyx_os_iyx_osv16",187], + "14385185911482960528": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "8844619836383523698": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "14548629377527143409": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13366059704398720237": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "18349087959351486710": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "15868648764972133201": ["fully_connected_gpu_fb_oi_ref",1], + "15322019609805777935": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "4451257789691974239": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1480287432874335824": ["convolution_gpu_bfyx_os_iyx_osv16",1055], + "13657522194775317201": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "8032685176029570383": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "4334698056820320220": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "15378707205730840765": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5977248663249062384": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "3170785962566427770": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2710485608298356329": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2585176064846114298": ["convolution_gpu_bfyx_gemm_like",2], + "18337975902615310907": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "6768322540857745605": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "13657774210341324470": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "3072535365860940873": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14230197617570499447": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "10049329759351957685": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "10305912614137623024": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3896848534552901221": ["convolution_gpu_bfyx_gemm_like",2], + "7405835196787288054": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "7020655100877544328": ["convolution_gpu_bfyx_gemm_like",1], + "13174363822969694054": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "13232269620066140073": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "13608239208821071914": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "17026284168840448378": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7866128397931438774": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "8264178890341675354": ["convolution_gpu_bfyx_os_iyx_osv16",1033], + "10317038568333963064": ["convolution_gpu_bfyx_os_iyx_osv16",694], + "2180753144963020203": ["convolution_gpu_bfyx_os_iyx_osv16",143], 
+ "15271492161940795681": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "13272818502368975319": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12281346074445607180": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "570683988452622223": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "8451179695288093195": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "2085738943081638802": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15563546888345388359": ["convolution_gpu_bfyx_gemm_like",2], + "8525389694584008001": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "2481005139798378616": ["convolution_gpu_bfyx_os_iyx_osv16",1062], + "574359978358296617": ["convolution_gpu_bfyx_gemm_like",2], + "15764181772410734606": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9217386935739152562": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "12161602271403760008": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "9758907700230386910": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8707189142909022305": ["convolution_gpu_bfyx_gemm_like",2], + "1375259485223819020": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "9053383117071470496": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "6261121070004228939": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1112828128944231163": ["convolution_gpu_bfyx_gemm_like",1], + "5843679089588930933": ["convolution_gpu_bfyx_gemm_like",2], + "11083777913844441475": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "1923745286075356181": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "3827177373408316820": ["convolution_gpu_bfyx_gemm_like",1], + "5488168361113140102": ["convolution_gpu_bfyx_gemm_like",1], + "7982628452987720190": ["convolution_gpu_bfyx_gemm_like",2], + "8140242320379485952": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "15615172858007002100": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "1653274345637156919": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "15210302033167762581": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "17392347485675658099": ["convolution_gpu_bfyx_gemm_like",2], + "6574971185849732667": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "4202645222013675478": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "1231806423322813287": ["convolution_gpu_bfyx_gemm_like",2], + "166267183356660549": ["convolution_gpu_bfyx_gemm_like",1], + "8281212003098870446": ["convolution_gpu_bfyx_gemm_like",0], + "14650273075211365393": ["convolution_gpu_bfyx_gemm_like",1], + "6928835003016610382": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "12012860334670244716": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "15646774522467486699": ["convolution_gpu_bfyx_os_iyx_osv16",299], + "15126660425728872065": ["convolution_gpu_bfyx_os_iyx_osv16",200], + "18265901700619296616": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1653438360841004980": ["fully_connected_gpu_fb_oi_ref",2], + "6103824715103416420": ["convolution_gpu_bfyx_gemm_like",2], + "15409755591665753258": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "16946947983339327902": ["convolution_gpu_bfyx_gemm_like",2], + "6431838057506760173": ["convolution_gpu_bfyx_os_iyx_osv16",417], + "14705457019471647279": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "6801897580177846120": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "8656468860180713379": ["convolution_gpu_bfyx_os_iyx_osv16",472], + "16801553481899627402": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "5339358831190803597": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11732742421854164761": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "6854611304056079417": 
["convolution_gpu_bfyx_os_iyx_osv16",247], + "14568560907026487922": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "4184442166820068862": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "17967188184891337660": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16108759090923335184": ["convolution_gpu_bfyx_gemm_like",1], + "5109770354438894645": ["convolution_gpu_bfyx_gemm_like",2], + "4691552892932405676": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7331552952865138030": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "3332334993503432420": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "1422402723172447295": ["convolution_gpu_bfyx_gemm_like",1], + "14292252222828824305": ["convolution_gpu_bfyx_gemm_like",2], + "18180820925685532104": ["convolution_gpu_bfyx_os_iyx_osv16",563], + "16695020005258780885": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "6129884455218252024": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "13772598362521854438": ["convolution_gpu_bfyx_os_iyx_osv16",720], + "9940908487812223059": ["convolution_gpu_bfyx_gemm_like",2], + "4753055238892504599": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "15803050672115583478": ["convolution_gpu_bfyx_gemm_like",1], + "3154903035376733831": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9191832520273617003": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "15778834188130183853": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "7557446085365037177": ["convolution_gpu_bfyx_os_iyx_osv16",686], + "6213353364768643062": ["convolution_gpu_bfyx_gemm_like",2], + "4035015193331696438": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "4368522743441422202": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "10308175009371219583": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "15974241934088373021": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "17254775053427612466": ["fully_connected_gpu_fb_oi_ref",1], + "447683677378974131": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "14244966672894707129": ["convolution_gpu_bfyx_gemm_like",2], + "7946776740333736799": ["convolution_gpu_bfyx_gemm_like",2], + "15496355513574200965": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "9239048433297419320": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12971833748980664090": ["convolution_gpu_bfyx_os_iyx_osv16",620], + "1810943242998123550": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "9767355861002822967": ["convolution_gpu_bfyx_gemm_like",2], + "14211903923555028634": ["convolution_gpu_bfyx_os_iyx_osv16",679], + "13713406612642090169": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "14923692894655929923": ["fully_connected_gpu_bf_io_gemm",0], + "11805311302922325617": ["convolution_gpu_bfyx_gemm_like",2], + "9788704336046308724": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "15383553612351941890": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16590030963319267708": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "10797908931694274013": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "14150012830816329527": ["convolution_gpu_bfyx_gemm_like",2], + "6739799137687789012": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "5389189982064081933": ["convolution_gpu_bfyx_os_iyx_osv16",846], + "17442035600389810700": ["convolution_gpu_bfyx_gemm_like",2], + "10890975553758439233": ["convolution_gpu_bfyx_gemm_like",1], + "15713964605078748923": ["convolution_gpu_bfyx_gemm_like",2], + "12409554044517232554": ["convolution_gpu_bfyx_os_iyx_osv16",139], + "9796347091019799053": ["convolution_gpu_bfyx_os_iyx_osv16",99], + "17508987219281192918": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "8670512344429807851": 
["convolution_gpu_bfyx_os_iyx_osv16",895], + "13951781924205611716": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "17596685300497748803": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "13206826317378863148": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "13727585908419292912": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6443517114667332732": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "6996679663761370444": ["convolution_gpu_bfyx_gemm_like",1], + "13915749401892931804": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "16596028606733932975": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "4198666727524342442": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "16125365972873290572": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "9040046051053703359": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "11918018989601427118": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6714886136800883594": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "7312862821818362095": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "8357109553923988018": ["convolution_gpu_bfyx_gemm_like",2], + "8730097760819044515": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "6218328594667952152": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "3939977982577786175": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "14991602704357959545": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "13814086981499638596": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "16146350476627599543": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "18112958483003382733": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "8507854696766492454": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "5556023021504556658": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "17740553615487239243": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8684867236134349888": ["convolution_gpu_bfyx_os_iyx_osv16",193], + "13607830451968188080": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "18235067315439611192": ["convolution_gpu_bfyx_os_iyx_osv16",381], + "13603318842632052764": ["convolution_gpu_bfyx_os_iyx_osv16",380], + "17243576882981097341": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "4965629769516591986": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "13537323999534292650": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "6233612563637601101": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "10173283505468233128": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "3214253333840552610": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "6478054912653910426": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "2458592904274981909": ["fully_connected_gpu_bf_io_input_spatial",2], + "9154705094446538279": ["fully_connected_gpu_fb_oi_ref",0], + "15916505622570323098": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "6008613375871089139": ["convolution_gpu_bfyx_os_iyx_osv16",755], + "6513705142577622089": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "1766961036311612128": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2862999234347597091": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "977617597166653416": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "17182558720652199559": ["fully_connected_gpu_fb_io_ref",1], + "17854138024884397413": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "8426489532875918560": ["convolution_gpu_bfyx_gemm_like",1], + "17869697579874327192": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10928995765778560784": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "770376597027620107": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "17683350638672326642": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "17790954200356837750": 
["convolution_gpu_bfyx_os_iyx_osv16",1083], + "10482582307328548806": ["convolution_gpu_bfyx_os_iyx_osv16",3], + "14349335089732252796": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11033507346101404633": ["fully_connected_gpu_fb_oi_ref",2], + "13775529405693629438": ["convolution_gpu_bfyx_os_iyx_osv16",874], + "9459869325970475576": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "2542506456395240890": ["convolution_gpu_bfyx_gemm_like",1], + "4499586349553581439": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "16067605128297748820": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16510194749934323304": ["convolution_gpu_bfyx_os_iyx_osv16",1113], + "12952160708294444403": ["convolution_gpu_bfyx_gemm_like",2], + "11541706477255587105": ["convolution_gpu_bfyx_os_iyx_osv16",344], + "1059505639883914386": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "1771663698943903325": ["convolution_gpu_bfyx_os_iyx_osv16",175], + "17771487895874668302": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "8762901342272872498": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "412314676462573090": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7367814057959247537": ["convolution_gpu_bfyx_gemm_like",2], + "8728178019712933221": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "1192709652314183388": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "12427490329663434604": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "13170031087212196468": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "8140094412609934765": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "12381377111003298809": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1760690277175249985": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "16710651492402564794": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "13447028922679236865": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "18026468427978643933": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "17285699593273891901": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "1471837664358450291": ["convolution_gpu_bfyx_gemm_like",2], + "9963817056423168830": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "13388424034634316547": ["convolution_gpu_bfyx_os_iyx_osv16",715], + "2780358937598873103": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "587350550384936211": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "12534001599784153836": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "12626014184575881530": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "4082046235109198108": ["convolution_gpu_bfyx_gemm_like",1], + "2317476796706098254": ["convolution_gpu_bfyx_gemm_like",2], + "1561225943337590599": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "14296771090926462138": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "10853161782230763798": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8390953788659916133": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2310549887200001260": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4854802313728023001": ["convolution_gpu_bfyx_os_iyx_osv16",621], + "11264412030568042996": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "18277685132620834972": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "5906083739416582743": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8707484843981694525": ["convolution_gpu_bfyx_os_iyx_osv16",1021], + "2947753291378607664": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "17585852525746136080": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "2303141161423252932": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "14039055710777697188": ["convolution_gpu_bfyx_gemm_like",2], + "3919577663893354177": 
["convolution_gpu_bfyx_gemm_like",1], + "16578265652036967656": ["convolution_gpu_bfyx_gemm_like",2], + "7958459862276998225": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "5643908654122573882": ["convolution_gpu_bfyx_os_iyx_osv16",562], + "6678796313875454849": ["convolution_gpu_bfyx_gemm_like",2], + "9259437778054905599": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "7272538316511343863": ["convolution_gpu_bfyx_gemm_like",2], + "12245096462203481681": ["convolution_gpu_bfyx_os_iyx_osv16",511], + "1465692634334679413": ["convolution_gpu_bfyx_gemm_like",2], + "13439272015824246074": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "15781220232431782560": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2590380836212070761": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "10437861085319472289": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "707449835235490641": ["convolution_gpu_bfyx_gemm_like",1], + "11473442921040533207": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "14667209474639064623": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "15984373369388044924": ["convolution_gpu_bfyx_gemm_like",2], + "1486768204660092247": ["convolution_gpu_bfyx_gemm_like",1], + "8360628955300060520": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "12808154347573074859": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "1131384986902172221": ["convolution_gpu_bfyx_os_iyx_osv16",679], + "5831419373611158773": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11051434650031832658": ["convolution_gpu_bfyx_gemm_like",1], + "3623695848220673001": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "2172636954267255416": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "9631481972809246378": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "14522844693999581518": ["convolution_gpu_bfyx_os_iyx_osv16",750], + "12136458184046915563": ["convolution_gpu_bfyx_gemm_like",0], + "13115589642140732066": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "2654793073145467058": ["convolution_gpu_bfyx_gemm_like",2], + "1967810052096853804": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14068780861332616363": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "6796998865297819946": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "3314459110790355757": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "13193571607788569533": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "15197248015210313435": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15911434513425038508": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4534480875955599254": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11253790393313445931": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "7472330881076141262": ["convolution_gpu_bfyx_gemm_like",1], + "2044363708106765326": ["convolution_gpu_bfyx_direct_10_12_16",0], + "11338906515425639970": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "789359733867650915": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "16173557782125372935": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "14133509766683767462": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15411474884532403722": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "1116274074896622552": ["convolution_gpu_bfyx_os_iyx_osv16",874], + "7995002764260542332": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15380105196319354141": ["convolution_gpu_bfyx_os_iyx_osv16",481], + "17732250360268013336": ["convolution_gpu_bfyx_os_iyx_osv16",563], + "1622731194539871461": ["convolution_gpu_bfyx_gemm_like",2], + "6623182990939010641": ["convolution_gpu_bfyx_gemm_like",1], + "7918742312252115870": 
["convolution_gpu_bfyx_os_iyx_osv16",638], + "12990341489637414845": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "15115440616185035720": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4927139127938739019": ["convolution_gpu_bfyx_gemm_like",2], + "13131740479277027362": ["fully_connected_gpu_bf_io_gemm",1], + "1081962464388501987": ["convolution_gpu_bfyx_os_iyx_osv16",873], + "15882969506682501496": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "1362239912535573615": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "2230884858122788172": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "12771805545455650546": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "2007192658799516915": ["fully_connected_gpu_bf_io_gemm",1], + "6489645404977288242": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "4229105529069729944": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "8241070786700614317": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10743138314323119696": ["convolution_gpu_bfyx_gemm_like",2], + "18136135457402651842": ["convolution_gpu_winograd_6x3_s1_fused",2], + "768765852586619095": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16396393355098283060": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "12392988351482826871": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15399245700982979379": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "10485534959656860449": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "13083981648347252910": ["convolution_gpu_bfyx_os_iyx_osv16",511], + "2248628426797793532": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "2498920887656279332": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12864338805958186191": ["convolution_gpu_bfyx_gemm_like",2], + "5124645583449732785": ["convolution_gpu_bfyx_gemm_like",2], + "15024023281204917061": ["convolution_gpu_bfyx_gemm_like",2], + "11331539079347079374": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "11857822504978122919": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "10309504812060596568": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "11665313746896806563": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "14911763273270477925": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2096021095904820251": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "12010294231983179604": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "7877256119877423528": ["convolution_gpu_bfyx_os_iyx_osv16",489], + "18243018097656671503": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "8061914949376516780": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11314436000791223218": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "9516102312850256675": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "14188045559946481097": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "2418288192668085805": ["convolution_gpu_bfyx_gemm_like",2], + "15669490019428002270": ["convolution_gpu_bfyx_os_iyx_osv16",986], + "3215659303601163167": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "14261214737408786954": ["convolution_gpu_bfyx_os_iyx_osv16",621], + "7336911146060959485": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "5796500397424307442": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "1364546124782880196": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "11062100629646715785": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "11234976958917093838": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "7058458405375602606": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "13654408396081513312": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "1593086572473375988": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "13387766889016280910": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "5966963943739041502": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "13267743753217317315": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "16431857516454692096": ["convolution_gpu_bfyx_os_iyx_osv16",185], + "15914512645931208899": ["convolution_gpu_bfyx_gemm_like",2], + "16103653667647559851": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "17025997656996518171": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10935410906182995784": ["convolution_gpu_bfyx_gemm_like",1], + "15749335301736571135": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6362453779168658462": ["convolution_gpu_bfyx_os_iyx_osv16",273], + "481328129206881674": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "14541063954080306476": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "13210604117940125947": ["convolution_gpu_bfyx_os_iyx_osv16",120], + "11058082057683584650": ["convolution_gpu_bfyx_gemm_like",2], + "6750269489578112382": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "17774979615691038302": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "156456996459945842": ["convolution_gpu_bfyx_os_iyx_osv16",801], + "3219239043521617253": ["convolution_gpu_bfyx_gemm_like",2], + "10973647655853229395": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "1521992965089360209": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "4145496852718466030": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2317409971670298599": ["convolution_gpu_bfyx_os_iyx_osv16",501], + "10966081583785531511": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "13745327504866194229": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "390943380079040179": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "2999825793036702585": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "4692951005189464579": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "14152716242882609401": ["convolution_gpu_bfyx_gemm_like",1], + "9905716283229191208": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "4860019935631927113": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "1835975757316320402": ["convolution_gpu_bfyx_gemm_like",2], + "18265020664540913473": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1444256562477852389": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "8510044123592842725": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "10689303050557631712": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "390219891876240081": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "7072606962946873975": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "10838972820886273680": ["convolution_gpu_bfyx_gemm_like",2], + "15682441855379046778": ["convolution_gpu_bfyx_os_iyx_osv16",130], + "16833854122884184025": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "12014527187730671229": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "9525853014023664813": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "14719871224178118299": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "14880517974968280393": ["convolution_gpu_bfyx_gemm_like",2], + "10990741293315393791": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "9696588462876533517": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "11964639701912187118": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "10429613013253088132": ["convolution_gpu_bfyx_gemm_like",2], + "3831261590121101287": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "12190841837604350271": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4254313567858225805": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "1190134214210434381": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "2894138412746654795": 
["convolution_gpu_bfyx_os_iyx_osv16",382], + "11378458002317912396": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "18337160891834020517": ["convolution_gpu_bfyx_os_iyx_osv16",151], + "16446533347502650316": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "12584692605608021657": ["fully_connected_gpu_fb_oi_ref",1], + "907233163535348999": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "11510063368067539341": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "3164513064874019611": ["convolution_gpu_bfyx_gemm_like",2], + "5298952273692538291": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "8382509515623938786": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "14013561425708390846": ["convolution_gpu_bfyx_gemm_like",2], + "7801270668419570665": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "11188849626443657384": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "13296566345005640760": ["convolution_gpu_bfyx_os_iyx_osv16",715], + "4165920860392215245": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7905503566052181015": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "15872143905824807656": ["convolution_gpu_bfyx_os_iyx_osv16",84], + "10983344268706058114": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "5553176511624221429": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "16033144151193421543": ["convolution_gpu_bfyx_gemm_like",2], + "2571882179292959757": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "13810716860158972470": ["convolution_gpu_bfyx_os_iyx_osv16",276], + "17222005830854879661": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "16264774056719724826": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "1919460437053604108": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "12767115494378788592": ["convolution_gpu_bfyx_os_iyx_osv16",756], + "13642146548740074992": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "9861846661532177405": ["convolution_gpu_bfyx_gemm_like",2], + "7419990519344756626": ["convolution_gpu_bfyx_os_iyx_osv16",1070], + "13660573428614001128": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2305706332728008948": ["convolution_gpu_bfyx_gemm_like",2], + "2704063557078535883": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "1104489643524273315": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "6620782733027313312": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "4607013085883384144": ["convolution_gpu_bfyx_gemm_like",2], + "2721793280965260548": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "9486447779233331380": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "12096396455109952715": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "15509845164085518352": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6525052296614701517": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "13709111882513486557": ["convolution_gpu_bfyx_os_iyx_osv16",617], + "12277470820821378855": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "3475757648408068589": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "14599150265057284139": ["convolution_gpu_bfyx_os_iyx_osv16",880], + "7678168522030142454": ["convolution_gpu_bfyx_gemm_like",2], + "8799427328659766574": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15384168056682476462": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "1801066876009461857": ["convolution_gpu_bfyx_gemm_like",1], + "13787155972060672772": ["convolution_gpu_bfyx_gemm_like",1], + "4974435385259831818": ["convolution_gpu_bfyx_gemm_like",2], + "16108573960501496757": ["convolution_gpu_bfyx_gemm_like",2], + "15184258464890250739": ["convolution_gpu_bfyx_gemm_like",2], + "7550660458541314838": ["convolution_gpu_bfyx_gemm_like",2], + 
"11367813096511965002": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "9928406318940388716": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "11393439616752806572": ["convolution_gpu_bfyx_gemm_like",2], + "3752171257634205726": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "13733327241591630239": ["convolution_gpu_bfyx_os_iyx_osv16",752], + "838825600917352376": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9383222411929463824": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "3192518239721798250": ["convolution_gpu_bfyx_gemm_like",2], + "12478914547444399288": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12609361477548272638": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "10036998353100219512": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "17525531790109748810": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "5334566325056222430": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "17248756229500447131": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "13212959214376905822": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "1724898827344855006": ["convolution_gpu_bfyx_gemm_like",1], + "10890538764006500546": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "12978004383198641522": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "13991205023798493715": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "11175955260573469979": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "18166732758694978380": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7727871584058599163": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6171845068913882721": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5331173521406046122": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "14113510820933411052": ["convolution_gpu_bfyx_os_iyx_osv16",1055], + "1107027047188366075": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "6897348673467297407": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "15191864907092681849": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6090625728451718945": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16569200335969311660": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "11642941943446484202": ["convolution_gpu_bfyx_os_iyx_osv16",516], + "12825029449351875037": ["convolution_gpu_bfyx_gemm_like",1], + "12818953631784587919": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "9654726486719966937": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10158890414412187141": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "18140951659547259039": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8367989677286805427": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "15953607231296296913": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9255337426504113924": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "6762862978340755053": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8374345306483326015": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "10386584706491193379": ["convolution_gpu_bfyx_gemm_like",2], + "18067353229273804720": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "3588791913550955553": ["fully_connected_gpu_fb_oi_ref",1], + "5047419871737940985": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "6078344073564209080": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "1172103288112689821": ["convolution_gpu_bfyx_os_iyx_osv16",941], + "15492793021506324472": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "4967444801764057340": ["convolution_gpu_bfyx_os_iyx_osv16",361], + "9604982746455852556": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "801486567558674495": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "15652392678782222737": ["convolution_gpu_bfyx_os_iyx_osv16",895], + 
"5570311824197099845": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3701795558556637835": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8369833730195120673": ["convolution_gpu_bfyx_gemm_like",2], + "7103345484511147373": ["convolution_gpu_bfyx_gemm_like",2], + "4412343276595791077": ["convolution_gpu_bfyx_gemm_like",2], + "1596472719837608525": ["convolution_gpu_bfyx_gemm_like",2], + "475043738497218394": ["convolution_gpu_bfyx_os_iyx_osv16",518], + "15636407980943172317": ["convolution_gpu_bfyx_gemm_like",2], + "2816982827037092536": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "3469963495451100978": ["convolution_gpu_bfyx_os_iyx_osv16",726], + "9386678255270055573": ["convolution_gpu_bfyx_direct_10_12_16",2], + "172584114180442549": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "3828569468687251275": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1403617451623027879": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "18273537339378756543": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "5513667102916409932": ["convolution_gpu_bfyx_gemm_like",2], + "8104309105061227444": ["convolution_gpu_bfyx_os_iyx_osv16",514], + "15112118829970177073": ["convolution_gpu_bfyx_os_iyx_osv16",371], + "5097818987523855112": ["convolution_gpu_bfyx_gemm_like",2], + "8566695253227825439": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "654821507679356726": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "13002723770137829128": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8511244943596227719": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1745930004673880589": ["convolution_gpu_bfyx_gemm_like",1], + "3603706453982734995": ["convolution_gpu_bfyx_os_iyx_osv16",551], + "12707748441880165396": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3277243911383750280": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "8402692278765063674": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "3835286851569826052": ["convolution_gpu_bfyx_gemm_like",2], + "6848989271874647093": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "15952399564161253450": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12063837066704136739": ["convolution_gpu_bfyx_gemm_like",1], + "12782191856884962803": ["convolution_gpu_bfyx_gemm_like",2], + "1330842758352650583": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "4007319206075386920": ["convolution_gpu_bfyx_gemm_like",2], + "1592619919721912789": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "984472462878596435": ["convolution_gpu_bfyx_os_iyx_osv16",48], + "813347941036099284": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "7398196853452900099": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "15091825614924466766": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "2379484884827231127": ["fully_connected_gpu_bf_io_input_spatial",0], + "3436433254188539886": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "17997314629342774968": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "7334966010680206302": ["convolution_gpu_bfyx_gemm_like",2], + "3524702814173574637": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "340606466693982406": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "544003022213487787": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "11632275875447013409": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6948696390129114563": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "4563529605364580848": ["convolution_gpu_bfyx_os_iyx_osv16",131], + "2124776616364429517": ["convolution_gpu_bfyx_gemm_like",1], + "2946926779445063554": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "11240189248024145687": 
["convolution_gpu_bfyx_os_iyx_osv16",1083], + "4494583230309471319": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "12218337369633748663": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "8104609318998060422": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "12707946849050970702": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "16587078304821304948": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "237302155033013557": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "13810995219720233595": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "15299926486228458704": ["convolution_gpu_bfyx_os_iyx_osv16",514], + "10548792624072794724": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "11031625790234068916": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "10879183694331631189": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15675968397825708285": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "3745433390861789238": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "6275903692904946376": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "1540041682425757361": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "572265264921910408": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "4867937397499803072": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "276407276027553756": ["convolution_gpu_bfyx_os_iyx_osv16",176], + "3747518910079195578": ["convolution_gpu_bfyx_os_iyx_osv16",103], + "15198419554644505600": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "17073183514200378702": ["convolution_gpu_bfyx_os_iyx_osv16",667], + "8611417708673038653": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "8375778282166369933": ["convolution_gpu_bfyx_gemm_like",2], + "6831045740006076251": ["convolution_gpu_bfyx_os_iyx_osv16",1096], + "4274801141127703532": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "6577754887650563753": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "11775667915453535428": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "8898910394425958745": ["convolution_gpu_bfyx_gemm_like",2], + "15781622938833984014": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "8035084960535483680": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "3498490999014554104": ["convolution_gpu_bfyx_os_iyx_osv16",880], + "6065819201836017182": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "17870874477143985774": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "7873648177300629037": ["convolution_gpu_bfyx_gemm_like",2], + "18134140047840716203": ["convolution_gpu_bfyx_os_iyx_osv16",192], + "12046638414686283134": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10008202802779981732": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "954347958041231578": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "5871082277006078841": ["convolution_gpu_bfyx_os_iyx_osv16",893], + "6137405768481559638": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "1411786954276574458": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "9105388853296359769": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6580334406272192111": ["fully_connected_gpu_fb_io_ref",2], + "11795686089670429481": ["convolution_gpu_bfyx_gemm_like",2], + "9437794960375526230": ["convolution_gpu_bfyx_os_iyx_osv16",1065], + "13436376034548670107": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "4224423702382859092": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "2270733937722366926": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "5646139101524964833": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "3239100076064406977": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "3730238135300250205": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "5219399418946822456": 
["convolution_gpu_bfyx_gemm_like",2], + "7227174766917523481": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "17772882818194611202": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "18375125668176498051": ["convolution_gpu_bfyx_gemm_like",2], + "16091165907421819456": ["convolution_gpu_bfyx_gemm_like",2], + "7726714223809300966": ["convolution_gpu_bfyx_gemm_like",1], + "13926730608213207277": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11533151357949131860": ["convolution_gpu_bfyx_gemm_like",2], + "14805212478405698245": ["convolution_gpu_bfyx_gemm_like",1], + "2738256633362038820": ["convolution_gpu_bfyx_gemm_like",2], + "9468314291932574827": ["convolution_gpu_bfyx_os_iyx_osv16",732], + "8324250071425605671": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6579950270997373448": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "4381329435655511217": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "1426606766274640878": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "5953754321266570854": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "14827538610133799379": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4920194716156732643": ["convolution_gpu_bfyx_gemm_like",2], + "9740466267717175474": ["convolution_gpu_bfyx_gemm_like",2], + "6755802278188792577": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "4417341352109525283": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8442368383427915597": ["convolution_gpu_bfyx_gemm_like",1], + "11311859068168414878": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "4437258459981739942": ["convolution_gpu_bfyx_os_iyx_osv16",1042], + "3693042354944382600": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12390011660072693092": ["convolution_gpu_bfyx_gemm_like",1], + "12112853999307505628": ["convolution_gpu_bfyx_gemm_like",2], + "12425310792514818973": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "14792528369891965810": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10058165874008941852": ["convolution_gpu_bfyx_os_iyx_osv16",176], + "17331582127656317117": ["convolution_gpu_bfyx_gemm_like",1], + "13492216433886201174": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2338535084014610258": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16611452077660879545": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "724953082687879224": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "954796765467489259": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "4396653960950462197": ["convolution_gpu_bfyx_gemm_like",1], + "5825664545247017348": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "3087801652564627458": ["convolution_gpu_bfyx_os_iyx_osv16",804], + "4624363818743696582": ["convolution_gpu_bfyx_os_iyx_osv16",941], + "152263592822875549": ["convolution_gpu_bfyx_gemm_like",2], + "17015791782274123780": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "16831114690704826637": ["convolution_gpu_bfyx_direct_10_12_16",0], + "17350963651826443169": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7277156316894715321": ["convolution_gpu_bfyx_os_iyx_osv16",123], + "9714764457768279762": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "17559685912375493682": ["convolution_gpu_bfyx_os_iyx_osv16",92], + "9083686317073801642": ["convolution_gpu_bfyx_gemm_like",1], + "311101627084421734": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "3362190082518348071": ["convolution_gpu_bfyx_gemm_like",2], + "15349944413643626251": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "10405183426600618231": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "15609627722687211129": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "14962768577232034246": 
["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "11077876432364512822": ["fully_connected_gpu_bf_io_input_spatial",1], + "10322586483496198615": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "7154364270315480182": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "9947693652506812817": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "13593258537178247801": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "16758962840329202004": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "1077224320045437593": ["convolution_gpu_bfyx_os_iyx_osv16",832], + "9481675228591993785": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "2999633429402781278": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "2184670359551186734": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "17798626036576472760": ["convolution_gpu_bfyx_os_iyx_osv16",545], + "14705509109623500235": ["convolution_gpu_bfyx_os_iyx_osv16",276], + "11318913630213187720": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "8079376692609682448": ["convolution_gpu_bfyx_gemm_like",0], + "4585891362157592384": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "5748047690737461635": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "173772845058977237": ["convolution_gpu_bfyx_os_iyx_osv16",512], + "6899658518070473523": ["convolution_gpu_bfyx_gemm_like",2], + "9455406830371528486": ["convolution_gpu_bfyx_gemm_like",1], + "3027775502561362722": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "3909551222373722085": ["convolution_gpu_bfyx_os_iyx_osv16",44], + "1006828591724642933": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "6767159196241633301": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "17281202179589913619": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "12136625628940225638": ["convolution_gpu_bfyx_gemm_like",2], + "14253275166085865948": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "9875997976286355123": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "14017025411515888007": ["convolution_gpu_bfyx_os_iyx_osv16",674], + "12022152681602871455": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14077148976508649021": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "13140254055376365092": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "17224181038411430675": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "4476218615403440835": ["convolution_gpu_bfyx_gemm_like",2], + "11465965972527519631": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "8045697952241865861": ["convolution_gpu_bfyx_gemm_like",2], + "8109572327736409899": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "10085059621136526248": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "12325592439309417414": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "18280672126778847258": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "5406129421969383274": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "5469227748156438008": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "14725765847498813247": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "2014911634432127630": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9835338452418388180": ["convolution_gpu_bfyx_gemm_like",2], + "16912035321030511639": ["convolution_gpu_bfyx_gemm_like",1], + "5701438170070600512": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "1499841226042523429": ["convolution_gpu_bfyx_os_iyx_osv16",1065], + "9823752892549805496": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "7915318733663535312": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "9101571410887509600": ["convolution_gpu_bfyx_gemm_like",0], + "3499243120652875549": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8083672466967374860": 
["convolution_gpu_bfyx_os_iyx_osv16",576], + "14974730512607138726": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "16820082917500285799": ["convolution_gpu_bfyx_gemm_like",2], + "9410125656044318792": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "1818433662409886324": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15381833359831622179": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "2827850900421982274": ["convolution_gpu_bfyx_gemm_like",1], + "11507538232733291666": ["convolution_gpu_bfyx_direct_10_12_16",1], + "536646811796032046": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "18167100055915766856": ["convolution_gpu_bfyx_gemm_like",1], + "14184440545916228597": ["convolution_gpu_bfyx_gemm_like",2], + "9068406831482072377": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "16190949264253468961": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "475665035119038846": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "4172485608495372888": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "13696782397412896129": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6056291179600370019": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "14492935486352505845": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4316519748653705692": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "12246408434917478929": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "16453041919970581620": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "14696479950182046016": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "1925626127045202964": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "16614170159588864300": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "7185832253431234935": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14004715832115880216": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "7157531901512507924": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "14681705641267917886": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "5192552432194195116": ["convolution_gpu_bfyx_gemm_like",2], + "4872433441839808585": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "8006738296385794413": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "6067904130482758510": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "13503688893307029975": ["convolution_gpu_bfyx_direct_10_12_16",0], + "6914775146138105785": ["convolution_gpu_bfyx_gemm_like",2], + "12085348936192462321": ["convolution_gpu_bfyx_gemm_like",2], + "6942622405269419082": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "2451712485584835395": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "9305957796037500628": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "4040607776348275579": ["convolution_gpu_bfyx_gemm_like",2], + "7757331094141318304": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "7088331918128954410": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "9377779605078400305": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "4476928353532757380": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "1332624116953483870": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "6740545361286720494": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "17306482303091342504": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "15490478608105402679": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "9548658329589481069": ["convolution_gpu_bfyx_gemm_like",2], + "13459514533473657102": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "5280182001774668876": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1142725391726703078": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "7876355212013100281": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "14435120971846098308": 
["convolution_gpu_bfyx_os_iyx_osv16",562], + "3563614453014995411": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "13119479079474639169": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "11077503608116183709": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "2363414141971004557": ["convolution_gpu_bfyx_gemm_like",2], + "9019451572520595738": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "2111049986724040641": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "6610054713068442549": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "13163026305514410688": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "6232363902828992968": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "6711878663358611849": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "6419580456182610836": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "7338578624767544128": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "5805383505505929391": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "13491221531603384511": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "12038525298168664305": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1015184966858657992": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "11022847760121601465": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "3281207855459771997": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "7807983899017500046": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "4010329161090285019": ["convolution_gpu_bfyx_os_iyx_osv16",834], + "9133263538092913983": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6722358544720547260": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "5553779954745929430": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "13869716373706247686": ["convolution_gpu_bfyx_gemm_like",2], + "169973842603492802": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "14203061085285979556": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "5361028467247182860": ["convolution_gpu_bfyx_gemm_like",1], + "11630475290242283451": ["convolution_gpu_bfyx_gemm_like",2], + "16768470780681544910": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14553856088069405595": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "7480968533463196410": ["convolution_gpu_bfyx_gemm_like",2], + "13818587810073749596": ["convolution_gpu_bfyx_gemm_like",1], + "12700051513124813499": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "18386376129938707290": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "861419637283812778": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "11333068902248367382": ["convolution_gpu_bfyx_gemm_like",2], + "13219865669259079983": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "9700592037514669700": ["convolution_gpu_bfyx_gemm_like",2], + "10105539975183207700": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "4239415134522959352": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "9553032671453999824": ["convolution_gpu_bfyx_os_iyx_osv16",95], + "12170874893413205000": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "2096779676054335057": ["convolution_gpu_bfyx_gemm_like",2], + "8325686349100774855": ["convolution_gpu_bfyx_gemm_like",2], + "8413117662038329068": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "2904162348196990593": ["convolution_gpu_bfyx_gemm_like",1], + "17542176922797334839": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "3527012447011885981": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "5230406405159608187": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "8779947213821605681": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "12985650543127289023": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "7505966294864890221": 
["convolution_gpu_bfyx_os_iyx_osv16",1104], + "1213958002895787672": ["convolution_gpu_bfyx_gemm_like",2], + "380316849107383484": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8575296926578119953": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "17641033958594901664": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17087740929472936216": ["convolution_gpu_bfyx_os_iyx_osv16",94], + "9105431502075531641": ["convolution_gpu_bfyx_gemm_like",2], + "17163158934005653629": ["convolution_gpu_bfyx_os_iyx_osv16",428], + "10794662801660960189": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "14579042972443651846": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "13403617010417893318": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "18242682488017822077": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "6149261133858739754": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "4185398348055518182": ["convolution_gpu_bfyx_os_iyx_osv16",420], + "13088023076667575514": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "14910223536998380801": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "14277843123789500234": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "1370827524176794227": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "12293705794290797805": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "3034947396960425753": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "11680829908738480957": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "316225690176910392": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "787363431787954804": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "17236135174912837061": ["convolution_gpu_bfyx_gemm_like",2], + "6851536988434597530": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "6612643056203714506": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "3446991010350155849": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15071888879264671307": ["convolution_gpu_bfyx_os_iyx_osv16",104], + "1228256819256996416": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "17118569850095586049": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "16201999154635899927": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "6235096928786525260": ["convolution_gpu_bfyx_os_iyx_osv16",337], + "11493371521058673700": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "7179714714302073459": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "3106710091841093202": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "699127221549844251": ["convolution_gpu_bfyx_gemm_like",2], + "7998930863626763670": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "11129224786768161139": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "4631772220201098020": ["convolution_gpu_bfyx_gemm_like",2], + "7536287105029319189": ["convolution_gpu_bfyx_os_iyx_osv16",1054], + "10412748832841674068": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7385295618478993079": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "5934841294975212773": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "14815498807515058447": ["convolution_gpu_bfyx_os_iyx_osv16",278], + "13773898185415904435": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "16997897512818072938": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "6769243149577568817": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "2995134938466176198": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6214194654733781771": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "13358283026528078900": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "11956435900037329302": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "12201437677145858979": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "16067821671414842756": 
["convolution_gpu_bfyx_gemm_like",1], + "11191071895289217783": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6542417269641204414": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "9226443907548972870": ["convolution_gpu_bfyx_gemm_like",1], + "6948606378949354116": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "4652308622880770983": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3285688984628545255": ["fully_connected_gpu_fb_io_ref",1], + "17396226612787250663": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4695182996147218495": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "17235360775064303316": ["convolution_gpu_bfyx_gemm_like",2], + "14906458674793172507": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "6402941068107243403": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "12166710900466116000": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17358006976602795707": ["convolution_gpu_bfyx_gemm_like",2], + "14359530849521980269": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "5680888227752935228": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "13288543822410746011": ["convolution_gpu_bfyx_gemm_like",1], + "1603703756241612948": ["convolution_gpu_bfyx_gemm_like",2], + "18133334552107213128": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "18084635102736402756": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "2820364088001594654": ["convolution_gpu_bfyx_os_iyx_osv16",573], + "14513925709624513868": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "13244693761392741931": ["fully_connected_gpu_fb_oi_ref",0], + "916389941321470163": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "12211848608269437730": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2806529556090896246": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "1706927777850488363": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "1898912620350738645": ["convolution_gpu_bfyx_gemm_like",2], + "5849577829817109757": ["convolution_gpu_bfyx_os_iyx_osv16",271], + "12811104880512633036": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "10736915975072972467": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15047163348308549816": ["convolution_gpu_bfyx_gemm_like",1], + "6673690359191617215": ["fully_connected_gpu_fb_oi_ref",1], + "14274685812676150168": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "14335074487552883436": ["convolution_gpu_bfyx_gemm_like",2], + "13019190248083899887": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "123251351612308092": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "9318652504803279936": ["convolution_gpu_bfyx_gemm_like",2], + "2345023488044002149": ["convolution_gpu_bfyx_os_iyx_osv16",668], + "9692949270906064580": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6489074577147494118": ["convolution_gpu_bfyx_gemm_like",1], + "8271034912009744989": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "6882621854468565774": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "2248754661513284642": ["convolution_gpu_bfyx_gemm_like",2], + "6865406633958213363": ["convolution_gpu_bfyx_gemm_like",2], + "14600118619533737293": ["fully_connected_gpu_fb_oi_ref",0], + "863952266514375915": ["convolution_gpu_bfyx_os_iyx_osv16",517], + "13014443130752087867": ["convolution_gpu_bfyx_os_iyx_osv16",458], + "3730207439375250056": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "17113350507039887381": ["convolution_gpu_bfyx_gemm_like",1], + "6604223938357238686": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2817383483458239293": ["convolution_gpu_bfyx_os_iyx_osv16",677], + "17692144048680858991": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15178327647765537565": 
["convolution_gpu_bfyx_os_iyx_osv16",666], + "7544565739420583104": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8529571293598502239": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "3106591708459602370": ["convolution_gpu_bfyx_os_iyx_osv16",95], + "16328232350072955252": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "13583166868754499339": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "14746900092090885770": ["convolution_gpu_bfyx_gemm_like",2], + "1200162031019105686": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "4510003738155830628": ["convolution_gpu_bfyx_gemm_like",1], + "8057302050645780813": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "9389555743403158574": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6784146431605417954": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "2502125887857336825": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "6114147683777615071": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "14084855778741260863": ["convolution_gpu_bfyx_gemm_like",2], + "9883719542550391149": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "6999530153839596796": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "13412296930014397060": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2062195022363480864": ["convolution_gpu_bfyx_gemm_like",1], + "10806992251978564302": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "9352385417006844121": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "4890932609897686394": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "877436308867220589": ["convolution_gpu_bfyx_gemm_like",2], + "9796621763733208035": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "484412270668341493": ["convolution_gpu_bfyx_gemm_like",1], + "15662207751131195569": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "3336303478756453360": ["convolution_gpu_bfyx_gemm_like",1], + "15183511809138557392": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "5733530388090903847": ["convolution_gpu_bfyx_gemm_like",2], + "9574931298183748343": ["convolution_gpu_bfyx_gemm_like",2], + "10995907213890714701": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "8751367574402839332": ["convolution_gpu_bfyx_os_iyx_osv16",677], + "18259787991864449280": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "1801731858063091191": ["convolution_gpu_bfyx_os_iyx_osv16",995], + "6373173636869473046": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6012477132351580695": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "16367495521884864886": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "13095408117538194584": ["convolution_gpu_bfyx_os_iyx_osv16",108], + "3020115657931277672": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "4941660917457387098": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "2903605246599054308": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "2238901105639912692": ["convolution_gpu_bfyx_os_iyx_osv16",477], + "1671347101986657824": ["convolution_gpu_bfyx_gemm_like",2], + "12274268980330855890": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "17079309368548171402": ["convolution_gpu_bfyx_gemm_like",1], + "12793908914872030220": ["convolution_gpu_bfyx_gemm_like",2], + "4684985181211883028": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "9882204352209412039": ["convolution_gpu_bfyx_gemm_like",1], + "14600700464602327710": ["convolution_gpu_bfyx_gemm_like",2], + "1682486914760867977": ["convolution_gpu_bfyx_gemm_like",2], + "5013936351898884291": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "8292979162428130363": ["convolution_gpu_bfyx_gemm_like",2], + "2564518461717467683": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "13613948678997524330": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "12956726277674279950": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "919788620883613958": ["convolution_gpu_bfyx_os_iyx_osv16",464], + "18060514966005474708": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "13044020050176766314": ["convolution_gpu_bfyx_gemm_like",1], + "10720782649044333851": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "77073286362822723": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "1966540437574889257": ["convolution_gpu_bfyx_gemm_like",1], + "3715177305271762194": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "16076153317792960383": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "12960666483922103702": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2552187713769926425": ["convolution_gpu_bfyx_os_iyx_osv16",835], + "2264520082689779253": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "6220616397859143111": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10857084376518292379": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15487686565734149288": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "6647969101146756031": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12301464827222654105": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "5807196005360653656": ["convolution_gpu_bfyx_gemm_like",2], + "8560635685184432720": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "9694891301950867606": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "13345599888287912619": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2511072616914149110": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "15890749658785957481": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "14386256118128644729": ["convolution_gpu_bfyx_gemm_like",2], + "7806837641999814363": ["convolution_gpu_bfyx_gemm_like",2], + "12962558681443556219": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5164372816534616260": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "3706994659266083979": ["convolution_gpu_bfyx_os_iyx_osv16",559], + "12672995204641007004": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "751912075185318190": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "14895352662503433583": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "3889688816787688160": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "16499919609457089685": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "11825209936640729550": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "4366168099274266975": ["convolution_gpu_bfyx_os_iyx_osv16",172], + "11962541545116807979": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "17939745299931100048": ["convolution_gpu_bfyx_os_iyx_osv16",318], + "2543995971214089085": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "11583791752668920812": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "14116682822622440033": ["convolution_gpu_bfyx_gemm_like",1], + "15178012823756517910": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "14276876004054588508": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "15470013032930986062": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "7627882727285402176": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "14912119584313592912": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "1504867045084152953": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8488789346759658706": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2446257282140830646": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "17310332946322628458": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "14905705901815863508": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1553825475921110392": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "7335403151694644211": 
["convolution_gpu_bfyx_gemm_like",1], + "2310159350914289605": ["convolution_gpu_bfyx_gemm_like",2], + "3782315919331102574": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14128599551956588603": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "16614678178197571772": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "16805562203348924108": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9354818521586974021": ["convolution_gpu_bfyx_gemm_like",2], + "3759057398165607194": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13739257060165119132": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "2423754482456771339": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "13565027847255501776": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16609136488331186895": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16364899406120840449": ["convolution_gpu_bfyx_os_iyx_osv16",398], + "17128760774072077101": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9358401110755269308": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8703758535351908295": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "10136297272678091418": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "13065517911798224579": ["convolution_gpu_bfyx_os_iyx_osv16",377], + "7722090560547236852": ["convolution_gpu_bfyx_gemm_like",1], + "2370837049876630969": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "9454512817077883797": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "1962479636209947761": ["convolution_gpu_bfyx_os_iyx_osv16",1052], + "16392283136103456949": ["convolution_gpu_bfyx_os_iyx_osv16",692], + "4438055737691342460": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2520734476651273971": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8569122574675372789": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "4505008254511324231": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "8159489372517869446": ["convolution_gpu_bfyx_os_iyx_osv16",892], + "11599990834682830362": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "17825953644228876369": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8885012252853227025": ["convolution_gpu_bfyx_gemm_like",1], + "8484526109354576450": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "17096175733187202673": ["convolution_gpu_bfyx_gemm_like",2], + "9596656797750683465": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "12883021432082543848": ["convolution_gpu_bfyx_gemm_like",1], + "16731107540370927220": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "7504074736798125353": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "17184638213817814424": ["convolution_gpu_bfyx_os_iyx_osv16",1113], + "18268811652302076976": ["convolution_gpu_bfyx_gemm_like",1], + "13681462437496627948": ["convolution_gpu_bfyx_direct_10_12_16",0], + "11091771531609585709": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2599817012641445801": ["convolution_gpu_bfyx_os_iyx_osv16",1052], + "15921072201288695017": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "11258182961445417799": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "6214312494103149808": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "1673458534805854479": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "10944997349682267106": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "14103112843209793966": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "2887152687927903549": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "2638131332283395057": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13017541921351620667": ["convolution_gpu_bfyx_gemm_like",2], + "17626938391567407401": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "517802466588815950": 
["convolution_gpu_bfyx_gemm_like",2], + "2079476232214121671": ["convolution_gpu_bfyx_gemm_like",1], + "2225233951957105071": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "5644068493155655611": ["convolution_gpu_bfyx_gemm_like",2], + "5352861363832390974": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "16035563519857925932": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "14767888121198814523": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "8525704362451630717": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "12022980249970038824": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "1478419046264331178": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "5656623709782744241": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "17230103497915224469": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "17666004363345457085": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "6224167817672480442": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12144421857685107073": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "6581494673640781863": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5461649843950745696": ["convolution_gpu_bfyx_gemm_like",2], + "3718980061704064547": ["convolution_gpu_bfyx_gemm_like",2], + "712420402191459810": ["convolution_gpu_bfyx_os_iyx_osv16",720], + "2968094709908141988": ["convolution_gpu_bfyx_os_iyx_osv16",3], + "11757919563609176713": ["convolution_gpu_bfyx_os_iyx_osv16",652], + "7808544677773370430": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "15997231252708686870": ["convolution_gpu_bfyx_gemm_like",2], + "12924910330295852704": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "7499082230554771515": ["convolution_gpu_bfyx_os_iyx_osv16",43], + "4702017956226464806": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "10532500300200244159": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "11298638173197050575": ["convolution_gpu_bfyx_os_iyx_osv16",942], + "5675497261720118479": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17658152048177750315": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "8787816339967963727": ["convolution_gpu_bfyx_os_iyx_osv16",240], + "13845827017732177448": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "14854353557342075292": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "8948718883406304307": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "10647227605517025377": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "7510055418609679364": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "2821441037530057414": ["convolution_gpu_bfyx_os_iyx_osv16",381], + "13524128602135083081": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "9707630588260222630": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "9181466280310872332": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "11148502358361704423": ["convolution_gpu_bfyx_gemm_like",1], + "7959969582538910953": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "6613282637922219205": ["convolution_gpu_bfyx_gemm_like",2], + "17739868787095417856": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "5609922876429907954": ["convolution_gpu_bfyx_gemm_like",2], + "12028030221272546172": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2371412124305478965": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12954154886708228545": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "4995510103045767117": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "14707884854112495064": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7323343770209750835": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "3292879092145281224": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "15592248516895826924": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "9400558994532871122": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "15875968032394961531": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "17006133396401462698": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "4073467095502162430": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "16044646335477470657": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "14376192291828307385": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "6323026044750482867": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "5094419710576598497": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "14599780481362761532": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13085261987388297912": ["convolution_gpu_bfyx_gemm_like",1], + "7463657272687673896": ["convolution_gpu_bfyx_os_iyx_osv16",99], + "3789890554711038921": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3643250372952944907": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "12070592804878487941": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "1208483520611545642": ["convolution_gpu_bfyx_gemm_like",2], + "879005904827468163": ["convolution_gpu_bfyx_os_iyx_osv16",763], + "3217246278485567748": ["convolution_gpu_bfyx_gemm_like",2], + "2652267888871336297": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "5507708258753405429": ["convolution_gpu_bfyx_os_iyx_osv16",1043], + "9475812329914836280": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "10025893052937028511": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "15221712686851573528": ["convolution_gpu_bfyx_gemm_like",2], + "10892456883214928095": ["convolution_gpu_bfyx_os_iyx_osv16",93], + "4338023436590582323": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "3703292222363446463": ["convolution_gpu_bfyx_os_iyx_osv16",762], + "9608148784787572220": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "4036143655651874318": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "7371339724529362579": ["convolution_gpu_bfyx_gemm_like",2], + "16847817828600381030": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "15334769670416409064": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "3910549475873353422": ["convolution_gpu_bfyx_os_iyx_osv16",380], + "291868903926685441": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "13762814538289753428": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "12725647706191463348": ["convolution_gpu_bfyx_gemm_like",2], + "16070611944881238498": ["convolution_gpu_bfyx_os_iyx_osv16",884], + "9910414853336797922": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15180747404865201068": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "11060822686394981344": ["convolution_gpu_bfyx_gemm_like",1], + "18146184020578260553": ["convolution_gpu_bfyx_os_iyx_osv16",302], + "5319668297345215520": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "13328449155966085543": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4129586781834275070": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "16783619135298589974": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "9649533822873928984": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "9593975471009029134": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "10572208209982879914": ["convolution_gpu_bfyx_gemm_like",0], + "4999505377862312410": ["fully_connected_gpu_bf_io_gemm",2], + "16124702296533772526": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "6341197991729122563": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "14558850297291634005": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "1254745727978231148": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13283018618260255620": 
["convolution_gpu_bfyx_os_iyx_osv16",839], + "7720939595094113814": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "16992620579546408448": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "2030309697153345387": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "13352151930345854198": ["convolution_gpu_bfyx_os_iyx_osv16",275], + "2690771087990667627": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "9257078583742821465": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "1208243889917809864": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "7494124707566708728": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "13564654155363057485": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "15160322051545035612": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "8451212914744825089": ["convolution_gpu_bfyx_gemm_like",2], + "8929453032482114162": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "1036010477232750453": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "7473012539094225392": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "16896863928108200897": ["convolution_gpu_bfyx_gemm_like",2], + "8176012042686275874": ["convolution_gpu_bfyx_os_iyx_osv16",863], + "11648841195768568983": ["convolution_gpu_bfyx_gemm_like",0], + "13831458435772917577": ["convolution_gpu_bfyx_gemm_like",2], + "9514210061704584354": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15378025640603637387": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "16852207712205172744": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15132518566122695317": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "1168311873250200110": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "3541828356667081528": ["convolution_gpu_bfyx_gemm_like",1], + "9524663472084054050": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16403423801823379909": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "7431237779891953779": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "3430266954211750407": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "9197931868200777891": ["convolution_gpu_bfyx_os_iyx_osv16",997], + "2344498602308448450": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "9451273689649467046": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "6878922067845522655": ["convolution_gpu_bfyx_os_iyx_osv16",616], + "17242820574559628535": ["convolution_gpu_bfyx_gemm_like",1], + "15452996816194024433": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "70244312667395170": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "11919129623429545762": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "17795358440179122086": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "8263822658108674162": ["convolution_gpu_bfyx_direct_10_12_16",0], + "12860222041026638681": ["convolution_gpu_bfyx_gemm_like",2], + "2152903140704848574": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "6735135795253013220": ["convolution_gpu_bfyx_gemm_like",2], + "18092842590142527927": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "5215755301612973095": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "4122312805832663323": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11619548409913646265": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "2108296560864415762": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "13026555349791486777": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "912423125050985716": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "17281198415161259885": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "2110090486638190463": ["convolution_gpu_bfyx_os_iyx_osv16",139], + "3240428557350945267": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "13491655481292956895": 
["convolution_gpu_bfyx_gemm_like",1], + "2343921093633784755": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "3148053731303748054": ["convolution_gpu_bfyx_gemm_like",2], + "16404059675217592817": ["fully_connected_gpu_fb_oi_ref",1], + "12160764253455777655": ["convolution_gpu_bfyx_os_iyx_osv16",1114], + "9034951536385533818": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "17377315194963069204": ["fully_connected_gpu_fb_oi_ref",1], + "12756432707088842236": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "523055954326631884": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17850932752450917677": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "14973411884734235059": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "16229324496308453344": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "10736892779278378335": ["convolution_gpu_bfyx_os_iyx_osv16",1096], + "11261619081095309088": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "13368477378531148593": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "5401523175111660554": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9802832901508552733": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11361013180071053597": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "269334626439013799": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "17970855913877771858": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "18332090297993015499": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "3665837617379468265": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "8316848551837633169": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "17807033661138518449": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "6571473790090353005": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "499739705596245675": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "9761573038170759563": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "11830297960718214360": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "54975980454651672": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "5603409300903611279": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14332388011233886083": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "6673753637296082820": ["convolution_gpu_bfyx_gemm_like",2], + "8528886126454874796": ["convolution_gpu_bfyx_gemm_like",1], + "10946069941293798874": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "10054253863699485503": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "10690972785852373520": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "9416285845239621878": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "9042812985530274425": ["convolution_gpu_bfyx_gemm_like",2], + "12671153706040443724": ["convolution_gpu_bfyx_os_iyx_osv16",53], + "12705054744767500423": ["fully_connected_gpu_fb_io_ref",1], + "8503207028307570404": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "5049534591553232781": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6456426339461437148": ["convolution_gpu_bfyx_gemm_like",1], + "1289009275012699560": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "3965871278597751318": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13744951984978188201": ["fully_connected_gpu_fb_io_ref",1], + "13728180355108851541": ["convolution_gpu_bfyx_gemm_like",2], + "4524347845016978037": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "4871907623235871050": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "13059207969254830451": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "5011273172385428756": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "12283317230112506089": ["convolution_gpu_bfyx_gemm_like",2], + "4481903208484313806": ["convolution_gpu_bfyx_os_iyx_osv16",793], + 
"10175721494218314250": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10432687907685994204": ["convolution_gpu_bfyx_gemm_like",1], + "13614921331048223116": ["convolution_gpu_bfyx_gemm_like",2], + "15447513376965243034": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "3041612155708729812": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "9765339420071627045": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "10660230104888153758": ["convolution_gpu_bfyx_gemm_like",2], + "12386930130408773521": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6706491729783125139": ["convolution_gpu_bfyx_gemm_like",1], + "12675858428585873471": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6293403765897901528": ["convolution_gpu_bfyx_gemm_like",2], + "9888097487468905169": ["convolution_gpu_bfyx_gemm_like",2], + "9416186718345824095": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "4149728557142033774": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "466868648178437688": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "13464226348405628455": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "7282751412088726760": ["convolution_gpu_bfyx_os_iyx_osv16",1124], + "14270450799210365812": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7518734167761579102": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9854440591497995284": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "13405310261845268772": ["convolution_gpu_bfyx_gemm_like",2], + "7715520469947900684": ["convolution_gpu_bfyx_os_iyx_osv16",571], + "16408015571155576773": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4783126652984096700": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "13388004363210658650": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5170245731599664670": ["convolution_gpu_bfyx_os_iyx_osv16",252], + "14256842018830898376": ["convolution_gpu_bfyx_os_iyx_osv16",41], + "16114623916610925741": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "10397253349562394184": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "8007667797556094444": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "7368916076070115064": ["convolution_gpu_bfyx_os_iyx_osv16",240], + "15129834325410878425": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "18417880214901227799": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "4722824701199486161": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "1033385936344875354": ["convolution_gpu_bfyx_gemm_like",2], + "17011927973643184196": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15212317205888563836": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "13802834658447955377": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "6527268791835193134": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "10918743320372308981": ["convolution_gpu_bfyx_gemm_like",2], + "2737840613867456953": ["convolution_gpu_bfyx_gemm_like",2], + "269829518575229806": ["convolution_gpu_bfyx_gemm_like",2], + "2944333966072327932": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "10670103699537731664": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "12744887771237881196": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "1242366856673194709": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "17753585752923130911": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "7282595712912388754": ["convolution_gpu_bfyx_os_iyx_osv16",189], + "6985970932645412773": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "13192808619929896995": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "7930154826818165796": ["convolution_gpu_bfyx_gemm_like",2], + "17994361454416813294": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2150326211917340956": 
["convolution_gpu_bfyx_gemm_like",2], + "6953478877896677022": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "7134654288295280046": ["convolution_gpu_bfyx_os_iyx_osv16",49], + "10607904718265020949": ["convolution_gpu_bfyx_gemm_like",2], + "14719421757340260468": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "8008513163448840421": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13221156296791499146": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "15391215077224693736": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "704262295684441748": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "11455732989503244360": ["convolution_gpu_bfyx_os_iyx_osv16",216], + "18424400171776141118": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "4286652913945761799": ["convolution_gpu_bfyx_os_iyx_osv16",718], + "5379608399492828685": ["convolution_gpu_bfyx_gemm_like",1], + "4614700272179482173": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "15352245788978088971": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "7441139786825555264": ["convolution_gpu_bfyx_os_iyx_osv16",338], + "397770940444464146": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "202304354656398848": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "11962382064404466630": ["convolution_gpu_bfyx_gemm_like",1], + "5301440603380967612": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "12018398218876712811": ["convolution_gpu_bfyx_os_iyx_osv16",674], + "10898684230183205955": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "2752322006160986801": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15660316437768312006": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "15668791697154389130": ["convolution_gpu_bfyx_gemm_like",1], + "1139581213977408268": ["fully_connected_gpu_fb_io_ref",2], + "6649759230117795192": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "5244441996055494170": ["convolution_gpu_bfyx_os_iyx_osv16",1113], + "9263063714383940562": ["convolution_gpu_bfyx_os_iyx_osv16",62], + "11070696274716018686": ["convolution_gpu_bfyx_os_iyx_osv16",570], + "18068050257421269408": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "11207257238719531888": ["convolution_gpu_bfyx_gemm_like",2], + "8146906136296114696": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "435261825003875448": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "8922463054055280800": ["convolution_gpu_bfyx_gemm_like",1], + "13674246753382740056": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14189775376370027482": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "8254412626112343365": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13596494923128445274": ["convolution_gpu_bfyx_gemm_like",2], + "7085416207166146240": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "10320711719466983961": ["convolution_gpu_bfyx_gemm_like",2], + "18310667924071639899": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "13624106485902414324": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "18377298651236993830": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "15566108481408840783": ["convolution_gpu_bfyx_gemm_like",2], + "15225331270926229394": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "13659291428095454839": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16932090423428476170": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "1882912836250239503": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "10414903047695486119": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "3442073007560756473": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "3609233164979051271": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "13108356579957761944": ["convolution_gpu_bfyx_os_iyx_osv16",328], 
+ "14823616678465136590": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16027853591907232537": ["convolution_gpu_bfyx_gemm_like",1], + "14446344744130895614": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "17924819398394001587": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "9622546530872848323": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "1608378717397996752": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "393884269158067083": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "14903430454784452446": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "9311802150474489673": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "15293727142789007900": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "11079710960007068860": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "11815825155082424936": ["convolution_gpu_bfyx_os_iyx_osv16",717], + "2367791050032803116": ["convolution_gpu_bfyx_os_iyx_osv16",371], + "11868789283464117390": ["convolution_gpu_bfyx_gemm_like",2], + "11207578758583923357": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "17368161816774674256": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "2490155559809645659": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "4551182180668229945": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "9001645663675631429": ["fully_connected_gpu_yxfb_ref",2], + "18191573176587760698": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "6027350558532160900": ["convolution_gpu_bfyx_gemm_like",2], + "11229587372764249222": ["convolution_gpu_bfyx_gemm_like",2], + "15838058479520696173": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "7318929661124340248": ["convolution_gpu_bfyx_gemm_like",0], + "3177915003579216846": ["convolution_gpu_bfyx_os_iyx_osv16",675], + "7052552351421332490": ["convolution_gpu_bfyx_gemm_like",2], + "13176385389367548697": ["convolution_gpu_bfyx_gemm_like",1], + "5589785455223385189": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5163965164859517893": ["convolution_gpu_bfyx_gemm_like",2], + "2268291720177538378": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "13205973783895006074": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "12892693137085610062": ["convolution_gpu_bfyx_os_iyx_osv16",362], + "11553355518677163509": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "14108113294744119367": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "4161141078006269526": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12929981792125924963": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "4673618329986777239": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "14287890401250603057": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "3448477246688526708": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "18172711677056449158": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "18202222342562516071": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "7678457226823073886": ["convolution_gpu_bfyx_os_iyx_osv16",91], + "2532962442388536022": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "14433662482531248989": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "2335428826699999827": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "82249723699159955": ["convolution_gpu_bfyx_os_iyx_osv16",625], + "16547425454653232058": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "15851356529373376076": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "16613907066461513431": ["convolution_gpu_bfyx_gemm_like",0], + "11725629762660987217": ["convolution_gpu_bfyx_gemm_like",1], + "706370730287471796": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "8171897258557801015": ["convolution_gpu_bfyx_gemm_like",1], + "15959241441689395955": 
["convolution_gpu_bfyx_os_iyx_osv16",680], + "14585370009659482450": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "2261453441277654139": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "15838114628203742383": ["convolution_gpu_bfyx_gemm_like",2], + "2399812257701033542": ["convolution_gpu_bfyx_gemm_like",2], + "7962383460496540840": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8127570953237266335": ["fully_connected_gpu_bf_io_input_spatial",0], + "3828988304073539836": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "11307531462784240962": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "14838067105091112485": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "8207349115037232863": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "4790599496008369129": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "10358359789382196576": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "16073578125651112218": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "17405865057155583042": ["convolution_gpu_bfyx_gemm_like",1], + "8312903198090907576": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "10173382130572498594": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "331390460560782085": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12353956380178079089": ["convolution_gpu_bfyx_gemm_like",2], + "14907097142953816744": ["convolution_gpu_bfyx_gemm_like",1], + "1811357700607919311": ["convolution_gpu_bfyx_direct_10_12_16",0], + "11986642867827682648": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "14686278683380845546": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "17006095064160484022": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "11114015660322254541": ["convolution_gpu_bfyx_gemm_like",1], + "6420851258772300332": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "14793709237400480942": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "15823825508128158158": ["convolution_gpu_bfyx_gemm_like",2], + "12569856169024791306": ["convolution_gpu_bfyx_gemm_like",2], + "2001464747481073870": ["convolution_gpu_bfyx_gemm_like",1], + "8863398172720091880": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "15148625184033310404": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "10624246057883518638": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "6730474465453860479": ["convolution_gpu_bfyx_os_iyx_osv16",1039], + "10073439287681954518": ["convolution_gpu_bfyx_gemm_like",2], + "16461809076899645037": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "15947699374684516369": ["convolution_gpu_bfyx_gemm_like",2], + "5448537627319798272": ["convolution_gpu_bfyx_os_iyx_osv16",523], + "15465799788109255561": ["convolution_gpu_bfyx_gemm_like",2], + "11757953304204716753": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "1306339989221885682": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "3198726093355425150": ["convolution_gpu_bfyx_gemm_like",2], + "962311766200741205": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16728826595086368897": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "1147744092130296563": ["convolution_gpu_bfyx_gemm_like",1], + "7146559117784312265": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "388828310152538138": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "13073788277284969422": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "2305461098719675735": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "17521647426452186921": ["convolution_gpu_bfyx_os_iyx_osv16",276], + "5433618404351968121": ["convolution_gpu_bfyx_gemm_like",2], + "17794162443307839614": ["convolution_gpu_bfyx_gemm_like",1], + "16440598510199834213": ["convolution_gpu_bfyx_os_iyx_osv16",121], + 
"18009765676050504407": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "3509811595028801757": ["convolution_gpu_bfyx_os_iyx_osv16",131], + "11855070245618904113": ["convolution_gpu_bfyx_os_iyx_osv16",183], + "1109243878358317937": ["convolution_gpu_bfyx_os_iyx_osv16",1062], + "7254869458810021127": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "12615462894236933223": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "11926378988530133568": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "2930545263523345204": ["convolution_gpu_bfyx_os_iyx_osv16",542], + "7630776235327261710": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "7824524940405130010": ["convolution_gpu_winograd_6x3_s1_fused",2], + "13787118639037730152": ["convolution_gpu_bfyx_os_iyx_osv16",298], + "404419072921281472": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "4135003545872878882": ["convolution_gpu_bfyx_os_iyx_osv16",197], + "11723735945517472199": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "17749857812061795980": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "2287356884312581209": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "4101449235783342476": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "14385181780082014495": ["convolution_gpu_bfyx_gemm_like",2], + "6013434489252641471": ["convolution_gpu_bfyx_direct_10_12_16",0], + "8175595372513695437": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "15092483859565823523": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3503236715353689942": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "10831460252334010668": ["convolution_gpu_bfyx_gemm_like",2], + "14681717813022425567": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "14898892437285105327": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "6157727013102138824": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "9823997593704517392": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "3223726179820717808": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "10033076377998157101": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "2571778193407799664": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12668149981216388765": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "13769943652297353544": ["convolution_gpu_bfyx_os_iyx_osv16",717], + "16031140952379208074": ["convolution_gpu_bfyx_gemm_like",2], + "6128534975733321186": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10273183900108661041": ["convolution_gpu_bfyx_gemm_like",2], + "8316011587868622301": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "905780459938651623": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "13793441296561946357": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "3218248162832023196": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "11907507085694711513": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "2004120786408087671": ["convolution_gpu_bfyx_gemm_like",2], + "17515573322312447679": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "5162737590442940024": ["convolution_gpu_bfyx_gemm_like",1], + "10906417366145323499": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "11992158790035075804": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "7606097739225472283": ["convolution_gpu_bfyx_gemm_like",2], + "4553409514380460123": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "7753336153932360422": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "6549150139619174585": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "11327097771110264965": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "505102470055903237": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "18233660940545931789": ["convolution_gpu_bfyx_os_iyx_osv16",205], + 
"157852787707383962": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "8909239203149651260": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14537109978413728476": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "16290626406346691996": ["convolution_gpu_bfyx_os_iyx_osv16",767], + "17420288204511371476": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12570087709404311189": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "6210483922262161762": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "994182747184593564": ["convolution_gpu_winograd_6x3_s1_fused",2], + "11883941040326858829": ["convolution_gpu_bfyx_os_iyx_osv16",554], + "2324120381399737261": ["convolution_gpu_bfyx_os_iyx_osv16",111], + "3199841714087553410": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "8132521728369930959": ["convolution_gpu_bfyx_gemm_like",2], + "17303981366934280174": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "2597523728660247862": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "4127717437639868970": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "17981604038340576961": ["convolution_gpu_bfyx_gemm_like",1], + "4301372734564127254": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4285475880886685878": ["convolution_gpu_bfyx_gemm_like",2], + "2086001721804797157": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "12179581684777023804": ["convolution_gpu_bfyx_gemm_like",2], + "16184979150665364486": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "1934379409955686502": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "11655994466278963438": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "1945630503883822822": ["convolution_gpu_bfyx_gemm_like",1], + "15232673324549539143": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "12952980509662451384": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "6860503758000008398": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "6661117204204077150": ["convolution_gpu_bfyx_gemm_like",2], + "10384416235770656262": ["convolution_gpu_bfyx_gemm_like",1], + "13716836930727272782": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "3819763245853861272": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "3007637520820789085": ["convolution_gpu_bfyx_os_iyx_osv16",111], + "7345632855842905966": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2571186327837339204": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "9194788897910888066": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "9996196793804333253": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "11246470701714560770": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "8212533074856783509": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "33889407315234685": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "17242442529374722270": ["fully_connected_gpu_fb_oi_ref",1], + "7496699438957793920": ["convolution_gpu_bfyx_gemm_like",2], + "8375465895534833097": ["convolution_gpu_bfyx_os_iyx_osv16",718], + "6476949395889340429": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "18187262802267413585": ["fully_connected_gpu_fb_io_ref",1], + "9454146598828084176": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "4241640917176830862": ["convolution_gpu_bfyx_gemm_like",2], + "10446500827044060319": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "7908036427091174081": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "12813978452097969536": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "2012181953284568566": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "6948147789605707774": ["fully_connected_gpu_fb_io_ref",2], + "18159049252673770569": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "10904228118889057467": 
["convolution_gpu_bfyx_gemm_like",2], + "14266210014132784194": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "5587539329568150667": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "10098661517988566506": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "5519244962044894877": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "14217181622713951411": ["convolution_gpu_bfyx_gemm_like",2], + "11777373751892075391": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "17575293085957492821": ["convolution_gpu_bfyx_gemm_like",2], + "7145194061073256844": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "7243161613448507792": ["convolution_gpu_bfyx_gemm_like",1], + "11559360678008060513": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "2056597791109604534": ["convolution_gpu_bfyx_gemm_like",2], + "2873387231297790075": ["convolution_gpu_bfyx_os_iyx_osv16",361], + "4243114942173293897": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "18232408112396439386": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "14335423820860953927": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "10947686124973711385": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "1187224156936080964": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "5759260743809103651": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "14759179293743468995": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "16622402936526588344": ["convolution_gpu_bfyx_os_iyx_osv16",726], + "16541722316343690197": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "9061025737181218101": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "17104611871050967957": ["convolution_gpu_winograd_6x3_s1_fused",2], + "670951751279091662": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "13133323947490009546": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "10424278617647597641": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "6551173574001309451": ["convolution_gpu_bfyx_gemm_like",1], + "397445657349822499": ["convolution_gpu_bfyx_gemm_like",2], + "18251360413872841969": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "17016846635668370921": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "10898210758890334465": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "11684927349056930189": ["convolution_gpu_bfyx_os_iyx_osv16",344], + "332090597573908506": ["convolution_gpu_bfyx_gemm_like",1], + "4682428771166816734": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "18006581941186887676": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "5245308722062496788": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "15661322183507404821": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "12773693193167844110": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "14634044133573461949": ["convolution_gpu_bfyx_gemm_like",2], + "7714783879762659458": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "9806689250758752070": ["convolution_gpu_bfyx_gemm_like",0], + "18005721959893562716": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "3166885953206195915": ["convolution_gpu_bfyx_gemm_like",2], + "4574242607119408140": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "3930314908786112883": ["convolution_gpu_bfyx_gemm_like",2], + "531020979837645217": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "11868551452004726281": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9666426531743983113": ["convolution_gpu_bfyx_os_iyx_osv16",1068], + "12557015880639217508": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "262113403359175565": ["convolution_gpu_bfyx_os_iyx_osv16",419], + "4634475069086874260": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "9397711809671506538": ["convolution_gpu_bfyx_direct_10_12_16",2], + 
"12008952324872799824": ["convolution_gpu_bfyx_gemm_like",2], + "1907439276166837309": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "13314092088416047551": ["fully_connected_gpu_yxfb_ref",1], + "12081835728078383819": ["fully_connected_gpu_bf_io_input_spatial",2], + "13071064509662090710": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "15928746165235747659": ["convolution_gpu_bfyx_gemm_like",2], + "14546281065004619074": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "8725673763972618034": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "7819934200255007163": ["fully_connected_gpu_fb_oi_ref",2], + "13051342120933385671": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "6181308879301978465": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "17664704673433112966": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "5353170440534073482": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "12214162812589030126": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "6093575518270471235": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "8240616667079698459": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "4600261954762222519": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "7070374681687005676": ["convolution_gpu_bfyx_gemm_like",1], + "16968664807495872526": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "18404344881797725263": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "5267143428977695208": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "10700011669103135203": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "7811861756798601201": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "428659495445490820": ["convolution_gpu_bfyx_os_iyx_osv16",925], + "1056494963618130644": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "962676948282027870": ["fully_connected_gpu_fb_io_ref",2], + "5597908143491399643": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "2788116002380533417": ["convolution_gpu_bfyx_gemm_like",2], + "10378966564497668941": ["convolution_gpu_bfyx_os_iyx_osv16",283], + "7086574330273897976": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "2732519635571994212": ["convolution_gpu_bfyx_os_iyx_osv16",987], + "16244270858428653037": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "11970466555294072275": ["convolution_gpu_bfyx_gemm_like",2], + "4586633477264151844": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "14733510474010040334": ["convolution_gpu_bfyx_gemm_like",2], + "2659712601063515059": ["convolution_gpu_winograd_6x3_s1_fused",2], + "10293186062391000719": ["convolution_gpu_bfyx_os_iyx_osv16",755], + "6547565989244888354": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "218477594596081189": ["convolution_gpu_bfyx_os_iyx_osv16",969], + "5834006438103071406": ["convolution_gpu_bfyx_gemm_like",2], + "15649927926091502215": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "12461575861709234385": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "1592994755823247500": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "2526832080529662683": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "1922168904767469999": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "2809950092498355574": ["convolution_gpu_bfyx_os_iyx_osv16",1055], + "15718011075217705480": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "13485140643204970345": ["convolution_gpu_bfyx_gemm_like",1], + "2664944425727769475": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "580936360000782237": ["fully_connected_gpu_bf_io_input_spatial",1], + "12314918602191412697": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "14990645740260870030": ["convolution_gpu_bfyx_os_iyx_osv16",846], + "11341771589317480665": 
["convolution_gpu_bfyx_os_iyx_osv16",1065], + "6133854782246597175": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "7394848434332739139": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "9937387440035377216": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "11804035561861841621": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "18245935804520236353": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9352866803638271156": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "13324157125165576832": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "1894591633696862066": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "17580933462801685507": ["convolution_gpu_bfyx_gemm_like",1], + "5408469943982199754": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11913020016435860608": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "15260448822338206631": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "12492763342322011136": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "16758697697363920520": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "14975859027256879948": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "615833743936753727": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5419775002149092646": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5982637097503543357": ["convolution_gpu_bfyx_gemm_like",2], + "9025790715924779508": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "17078700948595127028": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5801429077171542466": ["convolution_gpu_bfyx_os_iyx_osv16",94], + "10662239532841666965": ["convolution_gpu_bfyx_gemm_like",2], + "11049130623091275457": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "7921388663815287395": ["convolution_gpu_bfyx_gemm_like",2], + "3811462129131022619": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3555204322491340337": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "509781001842353609": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "13047793996728441528": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "4047806462440750215": ["convolution_gpu_bfyx_gemm_like",2], + "7524311370696987092": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "12518571127411736885": ["convolution_gpu_bfyx_gemm_like",2], + "17050675313067213312": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "13200151444914751729": ["convolution_gpu_bfyx_os_iyx_osv16",508], + "2802810524370514276": ["convolution_gpu_bfyx_gemm_like",1], + "12248119734016401633": ["fully_connected_gpu_fb_io_ref",1], + "7671016314869993705": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "8054562515577756499": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "10732225577823701543": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "2836903620603494117": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "1650080413259413393": ["convolution_gpu_bfyx_gemm_like",2], + "7864880361674128748": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "15123868617509445149": ["convolution_gpu_winograd_6x3_s1_fused",2], + "2903075619523363020": ["convolution_gpu_bfyx_os_iyx_osv16",835], + "14211549589070739656": ["convolution_gpu_bfyx_direct_10_12_16",0], + "8749468546606972791": ["convolution_gpu_bfyx_gemm_like",2], + "4840004190985490064": ["convolution_gpu_bfyx_gemm_like",2], + "148355059345569721": ["convolution_gpu_bfyx_os_iyx_osv16",691], + "4304943753428518690": ["convolution_gpu_bfyx_gemm_like",1], + "17318287523550546026": ["convolution_gpu_bfyx_gemm_like",2], + "15364374265752682266": ["convolution_gpu_bfyx_os_iyx_osv16",894], + "5136111979773513341": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "15667487381692577290": ["convolution_gpu_bfyx_os_iyx_osv16",878], + 
"482564204402769504": ["convolution_gpu_bfyx_gemm_like",1], + "5983808817108775912": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "14849708746319190277": ["convolution_gpu_bfyx_gemm_like",2], + "4646795194660982475": ["convolution_gpu_bfyx_gemm_like",2], + "94012300876418257": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "15786313441300512560": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "1895945774251432343": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6512006285490280576": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14026570177552137240": ["convolution_gpu_bfyx_gemm_like",2], + "15890473622821659630": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "3565702695809105495": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "12610854610554906160": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "3895088069642140043": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2100387626452428743": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "1362540464632328798": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13012283016751495099": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "436514945529747349": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "4191326605459754690": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6719956770229212208": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "12692563384795319282": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "5933743119393822386": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "12523676912856063091": ["convolution_gpu_bfyx_os_iyx_osv16",554], + "14744368497944610864": ["convolution_gpu_bfyx_direct_10_12_16",2], + "868177350337221377": ["convolution_gpu_bfyx_direct_10_12_16",2], + "832976844701988460": ["convolution_gpu_bfyx_gemm_like",1], + "14034487492239603874": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12669547093826826335": ["convolution_gpu_bfyx_os_iyx_osv16",1025], + "5947492124433175601": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "13276867073526485069": ["convolution_gpu_bfyx_gemm_like",2], + "528618206870447012": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "488298169768725160": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "7426788519998680898": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8961138963663532667": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "9803306661531470015": ["fully_connected_gpu_fb_io_ref",2], + "6476480727582657308": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "16774728502960825097": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "17419874083634480896": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "6517802281521111563": ["convolution_gpu_bfyx_gemm_like",1], + "10652512666086843369": ["convolution_gpu_bfyx_gemm_like",2], + "1452841775482537260": ["convolution_gpu_bfyx_gemm_like",2], + "6204725118764552662": ["convolution_gpu_bfyx_gemm_like",1], + "16285256723517297210": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14852990574796128305": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "8550783999616052522": ["convolution_gpu_bfyx_gemm_like",2], + "5420766967862917815": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "5733701901687257088": ["convolution_gpu_bfyx_gemm_like",2], + "15860915170591763391": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "6089202061701179659": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "16443833779968719790": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "12812685418923919055": ["convolution_gpu_bfyx_os_iyx_osv16",764], + "13224814158106791463": ["convolution_gpu_bfyx_gemm_like",2], + "15888454525088587794": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "8116504545035982006": 
["convolution_gpu_bfyx_os_iyx_osv16",880], + "5275016494706355806": ["convolution_gpu_bfyx_os_iyx_osv16",832], + "18203935818408469865": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "10408322429232132983": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "15585700465988560560": ["convolution_gpu_bfyx_os_iyx_osv16",1096], + "9127066823698894015": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "5961488595080209440": ["convolution_gpu_bfyx_gemm_like",2], + "4665029580355133140": ["convolution_gpu_bfyx_gemm_like",2], + "1939140810847988694": ["convolution_gpu_bfyx_gemm_like",1], + "5845969526791988973": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6635217802203685464": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "12307446289692143781": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "5251771557248725731": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "13758938418512211194": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "12700008320838073774": ["convolution_gpu_bfyx_gemm_like",2], + "14164778301660100413": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "12711558966638028352": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "69439315851965666": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "12522364636280164681": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "18369668865072009928": ["convolution_gpu_bfyx_gemm_like",2], + "727203296169504486": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "4703107905652287491": ["convolution_gpu_bfyx_gemm_like",2], + "12129572274423886770": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "5214678408335388758": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17835134875461003221": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "8465142022921853516": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15192024816519005250": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "11599932445375240727": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "4601800315090684242": ["convolution_gpu_bfyx_gemm_like",2], + "18382226420077875582": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14459249705747952583": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12411228585189337571": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "8124736388338424498": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "8995892222116060827": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "598390166442977699": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "15320845027635796583": ["convolution_gpu_bfyx_gemm_like",2], + "12310462218432530363": ["convolution_gpu_bfyx_gemm_like",0], + "9776332064497085361": ["convolution_gpu_bfyx_gemm_like",2], + "9993925424761661218": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "17824431042110985323": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "17001492460236540325": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9454457647272059910": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "4578587579993676820": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "17599383258252980421": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "16113302464937833403": ["convolution_gpu_bfyx_os_iyx_osv16",655], + "17825280904760131680": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "1999892441424036372": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "13074593348097634731": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "17392732266843821039": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "2966185891283165994": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "14566257978356851712": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15783329079045263237": ["convolution_gpu_bfyx_gemm_like",1], + "9547451431091729288": ["convolution_gpu_bfyx_os_iyx_osv16",958], 
+ "15149336254307320187": ["convolution_gpu_bfyx_gemm_like",2], + "10971070835319242371": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "3961000444895975975": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9513545197321447870": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "6980201892073961793": ["convolution_gpu_bfyx_os_iyx_osv16",852], + "13031027103925431505": ["convolution_gpu_bfyx_gemm_like",2], + "16583563382485459718": ["convolution_gpu_bfyx_gemm_like",1], + "4858337483345561292": ["convolution_gpu_bfyx_gemm_like",2], + "6536333665377249409": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8374409021681741916": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "2307629242354292362": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "7670176887560273910": ["convolution_gpu_bfyx_1x1",2], + "1847170421455825520": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "17407904982433770732": ["convolution_gpu_bfyx_gemm_like",1], + "2460415719642436412": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "7877332346656934022": ["convolution_gpu_bfyx_os_iyx_osv16",678], + "11437885274663749440": ["convolution_gpu_bfyx_os_iyx_osv16",806], + "5032195346490064156": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "18077281411861416889": ["convolution_gpu_bfyx_os_iyx_osv16",1044], + "7527175223662342321": ["convolution_gpu_bfyx_gemm_like",1], + "68637843533109734": ["convolution_gpu_bfyx_gemm_like",1], + "8501760360687221821": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "8906588133431586825": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "5890599002797783437": ["convolution_gpu_bfyx_os_iyx_osv16",1089], + "12232696287029987946": ["convolution_gpu_bfyx_os_iyx_osv16",459], + "16981010901052181199": ["convolution_gpu_bfyx_os_iyx_osv16",832], + "3499109651698979012": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "13636129806349817264": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "14900099988131599740": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "17867620992288101450": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "621272125402238670": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "12408889192918919210": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "13497225521878034159": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13155901262605819372": ["convolution_gpu_bfyx_os_iyx_osv16",292], + "5040944983588288886": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "10897622326486559468": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "9065894438656900887": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "15356995665520295246": ["convolution_gpu_bfyx_gemm_like",0], + "17907732260451873185": ["convolution_gpu_bfyx_gemm_like",2], + "13762042713029963144": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "15365628642332393565": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15777551868644801538": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "4304041922043496030": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "385046297070779752": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "17680403286850504499": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "4833761011498696645": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "17601171646153308079": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "8204962103567653154": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "13974740392602492680": ["convolution_gpu_bfyx_gemm_like",2], + "2712946943923358377": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "5367634698951188749": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "15361186788588226064": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "95993272253183796": 
["convolution_gpu_bfyx_os_iyx_osv16",751], + "2173649669339714890": ["convolution_gpu_bfyx_os_iyx_osv16",653], + "14355612297330229277": ["convolution_gpu_bfyx_gemm_like",2], + "10888435127006141874": ["convolution_gpu_bfyx_os_iyx_osv16",645], + "17754836801944078461": ["convolution_gpu_bfyx_gemm_like",2], + "5608447459568229694": ["convolution_gpu_bfyx_direct_10_12_16",2], + "659846949368492111": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "2850118175701764737": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "17093159649157277089": ["convolution_gpu_bfyx_gemm_like",2], + "277410555520090949": ["convolution_gpu_bfyx_gemm_like",0], + "7975810844103449438": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "18218631037214746168": ["convolution_gpu_bfyx_gemm_like",2], + "10612049417873776481": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "16884396694505987920": ["convolution_gpu_bfyx_os_iyx_osv16",139], + "13191096881934434519": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9987939079053625302": ["convolution_gpu_bfyx_gemm_like",2], + "18341524156838963264": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "17784882947271841103": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "9751582946441607796": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "7941729567451949422": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "5994204139128667921": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "13367043015761260275": ["convolution_gpu_bfyx_gemm_like",0], + "16037141448095945650": ["convolution_gpu_bfyx_os_iyx_osv16",417], + "1545105800386716684": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "12983461576274227638": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "9747165558500755104": ["convolution_gpu_bfyx_gemm_like",0], + "12793814016409887162": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "15653223776766070604": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "9194441947620820715": ["convolution_gpu_bfyx_os_iyx_osv16",616], + "12421707187947291166": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "15471470494305051299": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "10702234389482091891": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "15329084374930297871": ["convolution_gpu_bfyx_gemm_like",2], + "18253784177599134876": ["convolution_gpu_bfyx_os_iyx_osv16",938], + "7509199936979430017": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4553508439536472227": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "6638696743420807294": ["convolution_gpu_bfyx_gemm_like",2], + "2702144517025248597": ["convolution_gpu_bfyx_gemm_like",2], + "1720057192283799086": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "4542143431130171516": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "6205240287062600210": ["convolution_gpu_bfyx_gemm_like",2], + "13809218391763818477": ["convolution_gpu_bfyx_gemm_like",2], + "2129726780118554358": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "9261867808456596636": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "16568662638983972991": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "10323345824599612614": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12024318713420323349": ["convolution_gpu_bfyx_gemm_like",2], + "7831542641855749925": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13356152596085257346": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "10109431802089940590": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "10398572248321217585": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12584870629297848143": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "2198100074518629980": 
["convolution_gpu_bfyx_os_iyx_osv16",1101], + "1552088062654417187": ["convolution_gpu_bfyx_os_iyx_osv16",85], + "5485749317130402302": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "4407683781177409314": ["convolution_gpu_bfyx_gemm_like",2], + "16747069131271457481": ["convolution_gpu_bfyx_os_iyx_osv16",854], + "534032316469702287": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "2213068950786625268": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "17400844732252600825": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "7400370437512056636": ["convolution_gpu_bfyx_gemm_like",2], + "1436830013293669148": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8243230863677884952": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "4750897775273897282": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "14639233649574991406": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "13940433448128376511": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "10127598593949337541": ["convolution_gpu_bfyx_os_iyx_osv16",1056], + "9660812093766156608": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "4795705973706796563": ["fully_connected_gpu_bf_io_input_spatial",1], + "13764532551476584909": ["convolution_gpu_bfyx_gemm_like",2], + "14908665013877276517": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "10795104632256101599": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "941829593638869991": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "6555440973226014216": ["convolution_gpu_bfyx_gemm_like",2], + "8616686489737649890": ["convolution_gpu_bfyx_os_iyx_osv16",93], + "10377729875228238588": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "5495063314176654751": ["convolution_gpu_bfyx_gemm_like",2], + "15595549493819416194": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "2032438743863827309": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "17303584953298149285": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "8036592210244553232": ["convolution_gpu_bfyx_os_iyx_osv16",1089], + "15550722997950669458": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8007491455800395118": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "7354234812009979811": ["convolution_gpu_bfyx_os_iyx_osv16",90], + "16789135236017252073": ["convolution_gpu_bfyx_gemm_like",2], + "16706244336960642883": ["convolution_gpu_bfyx_gemm_like",2], + "14384062335728088286": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "16202841384048331166": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "9427999492792081454": ["convolution_gpu_bfyx_os_iyx_osv16",128], + "8469338060514215816": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "17832542092610191859": ["convolution_gpu_bfyx_os_iyx_osv16",240], + "13291816522762326802": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8104522072297740079": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10127626701775288565": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "641417817126876622": ["convolution_gpu_bfyx_gemm_like",2], + "13071545223094862275": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "14799589725341253463": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "17977676737774695825": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "12906669887096343446": ["convolution_gpu_bfyx_gemm_like",2], + "17966517080605659454": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "13663893159182636270": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "11086699387784339943": ["convolution_gpu_bfyx_os_iyx_osv16",495], + "2618108630886857741": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "17796867588410764794": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "18395970344992997862": 
["convolution_gpu_bfyx_os_iyx_osv16",751], + "9367157746678824712": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "7575634241190730697": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11446181888102710561": ["convolution_gpu_bfyx_os_iyx_osv16",198], + "6085098225080533278": ["convolution_gpu_bfyx_gemm_like",2], + "4200340674281276565": ["convolution_gpu_bfyx_os_iyx_osv16",224], + "8335501317577461610": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "3991584206721185508": ["fully_connected_gpu_yxfb_ref",2], + "11292995457386147494": ["convolution_gpu_bfyx_os_iyx_osv16",417], + "4131527916449986086": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "7505608160068471520": ["fully_connected_gpu_fb_io_ref",2], + "6148794431848761670": ["convolution_gpu_bfyx_os_iyx_osv16",1068], + "11571049833132558023": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "52089503050497755": ["convolution_gpu_bfyx_os_iyx_osv16",899], + "12617625046664709483": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "17130630712943165823": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "3653156933813711765": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "11455518069358829249": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "15240660399630429406": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "15531908897773912572": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10771178773821148370": ["convolution_gpu_bfyx_gemm_like",2], + "12279591818557049086": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "5290935680520661218": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "16691293834516280510": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "18157442326218165947": ["convolution_gpu_bfyx_gemm_like",2], + "15379873910046172004": ["convolution_gpu_bfyx_gemm_like",1], + "11345101652477732928": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "16431165572426232677": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "5595802790436774398": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "17267132595546153629": ["convolution_gpu_bfyx_gemm_like",2], + "15887484617041779814": ["convolution_gpu_bfyx_gemm_like",2], + "12052225815821079044": ["fully_connected_gpu_fb_io_ref",1], + "14112695611389738149": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "913496537924971856": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "12831670701606794888": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17778706153204631930": ["convolution_gpu_bfyx_gemm_like",1], + "116291934148608396": ["convolution_gpu_bfyx_os_iyx_osv16",235], + "9447458159095730492": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "1334121138243951086": ["convolution_gpu_bfyx_gemm_like",1], + "13939763360217628282": ["convolution_gpu_bfyx_gemm_like",2], + "16303870101043861053": ["convolution_gpu_bfyx_gemm_like",2], + "16237775310369180101": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "11421235118459218209": ["convolution_gpu_bfyx_gemm_like",1], + "5033753554611312392": ["convolution_gpu_bfyx_os_iyx_osv16",186], + "11269720109905550213": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "7777333052643961206": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "517601465150912854": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "5233164031954315264": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "7303492518741737111": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "4134729533276761488": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "5397783260083330774": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "5222741986856655072": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "4186140878816408491": ["convolution_gpu_bfyx_os_iyx_osv16",125], + 
"9573589861499897842": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "10987953316324712538": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "5766507688771440170": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "1594612401422787491": ["convolution_gpu_bfyx_gemm_like",2], + "8623022306922454565": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "3237680963342495368": ["convolution_gpu_bfyx_gemm_like",1], + "2446435710311724460": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "6670327979947471550": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "15561518067918160695": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "14711697456265712456": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "1852269248476496933": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16001665772103476029": ["convolution_gpu_bfyx_gemm_like",0], + "8757900457181374694": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "6902644989079870993": ["convolution_gpu_bfyx_gemm_like",1], + "17758354062670710364": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "17464785726466943638": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "10754321688472707825": ["convolution_gpu_bfyx_gemm_like",2], + "13993045680928507594": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "12415368596357091523": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "14749947225382670869": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "5335250793358473555": ["convolution_gpu_bfyx_gemm_like",1], + "3037042229494600258": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "16021335552443492452": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "1469048759583678106": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8549811622247170014": ["fully_connected_gpu_fb_io_ref",2], + "9816834679089152140": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "425744529089575241": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "2054100643811117871": ["convolution_gpu_bfyx_gemm_like",2], + "12700957546822808929": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "18020588962875998441": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "6343396486660315308": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "4272417312859966238": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6531171505861182429": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "3714179297375678368": ["convolution_gpu_bfyx_os_iyx_osv16",319], + "89439319782574517": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "498221230041656321": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "6407471972820516685": ["fully_connected_gpu_fb_oi_ref",0], + "17869928048344193660": ["fully_connected_gpu_yxfb_ref",2], + "6439778526899109398": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2881475011209167644": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "16934386540875904239": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "8129414331584785189": ["convolution_gpu_bfyx_gemm_like",1], + "6996376303337512293": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "3244402155461139559": ["convolution_gpu_bfyx_gemm_like",1], + "17602686382249457351": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "15374625876485618845": ["convolution_gpu_bfyx_gemm_like",2], + "13083412418930786217": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15262493122847269333": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "3291900073868076610": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "15993651594402422200": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "4265991006340418914": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "6080989915764831447": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2649948006897488504": 
["convolution_gpu_bfyx_os_iyx_osv16",226], + "9640773327221702885": ["convolution_gpu_bfyx_os_iyx_osv16",834], + "3557182643072772598": ["convolution_gpu_bfyx_gemm_like",2], + "6962268765187856246": ["convolution_gpu_bfyx_gemm_like",2], + "18402875771862490280": ["convolution_gpu_bfyx_os_iyx_osv16",678], + "6057433908801727873": ["convolution_gpu_bfyx_gemm_like",2], + "11828522357351010810": ["convolution_gpu_bfyx_os_iyx_osv16",45], + "15245792492785141641": ["convolution_gpu_bfyx_gemm_like",2], + "2668985670745598382": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "14046114605615338907": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "16642535448111764945": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "17366007551797367227": ["convolution_gpu_bfyx_gemm_like",2], + "2470579932413307757": ["convolution_gpu_bfyx_gemm_like",1], + "13480393611172760874": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "13414375996946350733": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "10118395047539851751": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "17399103575103078835": ["convolution_gpu_bfyx_os_iyx_osv16",1089], + "6642767323474835034": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "11718418772370938734": ["convolution_gpu_bfyx_os_iyx_osv16",843], + "11461581290174106570": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "11210371874006224582": ["convolution_gpu_bfyx_os_iyx_osv16",299], + "10093371683053539916": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "15392077168521832549": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "15213473731205734586": ["convolution_gpu_bfyx_os_iyx_osv16",892], + "929378940515745198": ["convolution_gpu_bfyx_os_iyx_osv16",41], + "16306284020664131647": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "9140953654075340568": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "659150305191479097": ["convolution_gpu_bfyx_os_iyx_osv16",902], + "10186942318345695432": ["convolution_gpu_bfyx_os_iyx_osv16",648], + "6062246008880097669": ["fully_connected_gpu_bf_io_input_spatial",0], + "11430400968543668873": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "15456771485750114116": ["convolution_gpu_bfyx_gemm_like",2], + "5011190083565902614": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "4479117540570599742": ["convolution_gpu_bfyx_gemm_like",2], + "3768977479127609228": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "9105949910901552052": ["convolution_gpu_bfyx_gemm_like",1], + "16195252193236429176": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "13898284586432291433": ["convolution_gpu_bfyx_gemm_like",1], + "10726830507311062380": ["fully_connected_gpu_fb_io_ref",1], + "6724516766412732606": ["convolution_gpu_bfyx_direct_10_12_16",0], + "16958661630307271135": ["convolution_gpu_bfyx_gemm_like",1], + "1187622888238643867": ["convolution_gpu_bfyx_gemm_like",2], + "17796784393519192261": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "14749290801006453098": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "12963601040302529291": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "3644282167178264526": ["convolution_gpu_bfyx_gemm_like",2], + "11443268857010762276": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "1781619247831135285": ["convolution_gpu_bfyx_os_iyx_osv16",305], + "4424258528650299664": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "1996860183441418841": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "3291180926381314705": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "2662628817605495834": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "8641167903508739082": ["convolution_gpu_bfyx_os_iyx_osv16",618], + 
"15247278167909654073": ["convolution_gpu_bfyx_os_iyx_osv16",509], + "568023964685613279": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "17212292336626940406": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "3202034075645193740": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "16355518852513270001": ["convolution_gpu_bfyx_gemm_like",2], + "9172445047535982729": ["convolution_gpu_bfyx_gemm_like",2], + "17257466221539644081": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16511261203374835334": ["convolution_gpu_bfyx_gemm_like",2], + "13379165253894817165": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "17845905249343189063": ["convolution_gpu_bfyx_gemm_like",2], + "1676419079398771261": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "3755253206085028904": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "11696708134796103802": ["convolution_gpu_bfyx_gemm_like",1], + "9756049510998074315": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "13182965457868586949": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "1474719104479956715": ["convolution_gpu_bfyx_gemm_like",2], + "9464448984918455020": ["fully_connected_gpu_fb_io_ref",0], + "10344489318472060767": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "8107597524360102037": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "16349083818768061549": ["convolution_gpu_bfyx_gemm_like",2], + "3861084063403560668": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "5782934278345953016": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "13787436604877398090": ["convolution_gpu_bfyx_os_iyx_osv16",1036], + "6534932244936310237": ["convolution_gpu_bfyx_gemm_like",2], + "5254115874873721374": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "10169992769527680821": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "8320522112821700316": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "9399994156762372761": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "14980327142253281498": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "10995849055789490935": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "2430404993947067949": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "1100681675092122613": ["convolution_gpu_bfyx_os_iyx_osv16",456], + "17829148383265978140": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "956022649859563080": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "13610246822402943068": ["convolution_gpu_bfyx_gemm_like",2], + "9559533345689069514": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "3429844423226609965": ["convolution_gpu_bfyx_gemm_like",2], + "7601006550805536675": ["convolution_gpu_bfyx_os_iyx_osv16",299], + "1889171157980977747": ["convolution_gpu_bfyx_gemm_like",2], + "6493509887452943215": ["convolution_gpu_bfyx_gemm_like",1], + "8075180350084516696": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5088898934670078153": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "61390148213644186": ["convolution_gpu_bfyx_gemm_like",1], + "1183774022668948480": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "2294026590516781945": ["convolution_gpu_bfyx_os_iyx_osv16",943], + "7969848911698660033": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "12494969618927201911": ["fully_connected_gpu_fb_oi_ref",1], + "2740834366358352617": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "12156683064218448087": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15581678976147496970": ["convolution_gpu_bfyx_gemm_like",0], + "4332002982390788477": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "7844764086278702374": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "7650874310714729923": 
["convolution_gpu_bfyx_os_iyx_osv16",792], + "8484380699802533068": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "10900962238463588974": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "7394217382008802567": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "13443130482173929700": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "4307817040832953223": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "15975964562807570772": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2933183897022161826": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "11341287517759485930": ["convolution_gpu_bfyx_gemm_like",2], + "11164600098693999456": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15718782218800307385": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "16504962609450876148": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "17788367809717898285": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "9269175963143039426": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "4237276338897143680": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "11767263058642131204": ["convolution_gpu_bfyx_gemm_like",1], + "10295330953350618042": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "17251021943762069083": ["convolution_gpu_bfyx_gemm_like",1], + "1249137685908951501": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "14406070210216948643": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "17729546848373991614": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "10205929431600082124": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1824009696938637196": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "2691406689892290663": ["convolution_gpu_bfyx_gemm_like",1], + "9144136375141111897": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "14702670413549232065": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11033758130987285174": ["convolution_gpu_bfyx_gemm_like",2], + "12696412964119109465": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "609926704263171728": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "1312322903335525510": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9241243727411869340": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "7576873892262851401": ["convolution_gpu_bfyx_gemm_like",1], + "14936045362442728963": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "16628679902327485435": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "13112861120841066430": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "11810221946429451169": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",0], + "9974986004361966590": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13775683667344570223": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "15696910741835640150": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "15170578644807800052": ["convolution_gpu_bfyx_gemm_like",2], + "868827643007921561": ["convolution_gpu_bfyx_gemm_like",2], + "12361848206190267821": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1564774057733793087": ["convolution_gpu_bfyx_os_iyx_osv16",97], + "10354305663463607086": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "9172699707430374863": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "16362139250976572928": ["convolution_gpu_bfyx_os_iyx_osv16",554], + "16322719022997791344": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "3221221905804708596": ["convolution_gpu_bfyx_gemm_like",1], + "16853250891250756537": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "8146559042269976123": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "18009083375897554008": ["convolution_gpu_bfyx_os_iyx_osv16",279], + "16482301217529090205": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "9246213432501129631": 
["convolution_gpu_bfyx_os_iyx_osv16",580], + "8733109144496806085": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "190530884420224257": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "3021451990778420603": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "844278648549884313": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "10286228358844791913": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "13201854669827561901": ["convolution_gpu_bfyx_gemm_like",2], + "12184558469694708819": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "2497756607567197523": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "3803179179802002296": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "13248218293365141596": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "41250455178236256": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "2730604806511016352": ["convolution_gpu_bfyx_gemm_like",2], + "7044087204529042819": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "14001920054473316909": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "10093554313775878065": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "8108939799996498955": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "12503605837910457108": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "1452597292381229708": ["convolution_gpu_winograd_6x3_s1_fused",2], + "32035190068479388": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15971924211584724882": ["convolution_gpu_bfyx_os_iyx_osv16",381], + "16763335832616216769": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "7196214243890296121": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "7102173884859438914": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "16896434896068867157": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "8860815977851486767": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "17608288706234084973": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "15642549417953837059": ["convolution_gpu_bfyx_gemm_like",2], + "8484176982872847423": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6643161848623134458": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "2794704364476462562": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "875142032423622622": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "16995873636564597028": ["convolution_gpu_bfyx_os_iyx_osv16",853], + "8108843303778211282": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "4593862318851730430": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14463983770858421738": ["convolution_gpu_bfyx_gemm_like",2], + "8291770994531919371": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "6538694526777067399": ["convolution_gpu_bfyx_gemm_like",1], + "14484890926084856480": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "12894625941923144893": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "15963358868537664345": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "796900095669815456": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "949330876419581703": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "505027953105355818": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5573639264204952559": ["convolution_gpu_bfyx_os_iyx_osv16",501], + "1106762955109168526": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "16632447105476661928": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "3170274732463232729": ["convolution_gpu_bfyx_gemm_like",1], + "88592091379585141": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "11976258954756052550": ["convolution_gpu_bfyx_os_iyx_osv16",894], + "12159582810513550491": ["convolution_gpu_bfyx_direct_10_12_16",0], + "14514450640485628836": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "6471563320494376693": 
["convolution_gpu_bfyx_os_iyx_osv16",702], + "10134708781744282286": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "3006428377575478529": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "6737332058785771073": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "9868561386826862471": ["convolution_gpu_winograd_6x3_s1_fused",2], + "4660214425505918397": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "6877976003072165363": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17516369849823844076": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "6789547098653828902": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3003526572122876385": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7595481705069674721": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "9805748332775912215": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "16580523689587532278": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "11407554707582995190": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "8358425189419823078": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "17784357412228522825": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "12916369918132790013": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "18356980026934328781": ["convolution_gpu_bfyx_os_iyx_osv16",1037], + "2452226948562393335": ["convolution_gpu_bfyx_os_iyx_osv16",767], + "12819626280531787705": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "10231289519907741812": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4157063588837576075": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "8751967016877067287": ["convolution_gpu_bfyx_os_iyx_osv16",894], + "10289725524396556967": ["convolution_gpu_bfyx_gemm_like",2], + "17443356777503458523": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "13948512795148364852": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "5951936376654416075": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "3436770797199367854": ["convolution_gpu_bfyx_gemm_like",1], + "8479958930889587809": ["fully_connected_gpu_yxfb_ref",0], + "16169024543367503806": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "9323825370872655346": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "13140527131098422428": ["convolution_gpu_bfyx_gemm_like",2], + "5167141379778311462": ["convolution_gpu_bfyx_gemm_like",2], + "13761566845514364807": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "2597453794298356435": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "8028456017016080468": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "259085394007031207": ["convolution_gpu_bfyx_gemm_like",1], + "13959998803881264899": ["convolution_gpu_bfyx_gemm_like",2], + "3686062608868674589": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11462462742322068863": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "9988801796928462423": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "14727155647330710270": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2415478259408761142": ["convolution_gpu_bfyx_os_iyx_osv16",302], + "14602509614865844486": ["convolution_gpu_bfyx_os_iyx_osv16",665], + "9289375071420565548": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "7440546908141206022": ["convolution_gpu_bfyx_gemm_like",2], + "15485011864326008444": ["fully_connected_gpu_fb_io_ref",0], + "8470783908138180217": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "17845195044080380488": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "15459849799278480779": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "17721709435558297965": ["convolution_gpu_bfyx_gemm_like",1], + "14132860735060026066": ["convolution_gpu_bfyx_gemm_like",2], + "15522785615618973614": 
["convolution_gpu_bfyx_os_iyx_osv16",952], + "6983544541444063131": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "13340998273773542342": ["convolution_gpu_bfyx_gemm_like",2], + "3134642518413656360": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2440366541074371090": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "12341291953192305346": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "4986977887030495943": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "16852690434396099861": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "3526198034974948081": ["convolution_gpu_bfyx_os_iyx_osv16",276], + "16053585286807864356": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "4282661608732125403": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "1882052795393187384": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "3273748387141431306": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "7617773507561261623": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "4623542918584461522": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "13459568779083836506": ["convolution_gpu_bfyx_gemm_like",2], + "13785621878621289403": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1980887257657896260": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "12809199739984715013": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "4886289616235149731": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1898776014554946000": ["convolution_gpu_bfyx_gemm_like",2], + "4770478662275293849": ["convolution_gpu_bfyx_gemm_like",2], + "15117830538655814853": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "17178808153714023980": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "1629280013296592298": ["convolution_gpu_bfyx_gemm_like",2], + "15110359240685619357": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "16370218798911151331": ["convolution_gpu_bfyx_os_iyx_osv16",287], + "13663612869789682704": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "14423094456821270228": ["convolution_gpu_bfyx_gemm_like",2], + "6820284286806022849": ["convolution_gpu_bfyx_gemm_like",2], + "16779678846332091086": ["convolution_gpu_bfyx_os_iyx_osv16",523], + "15989730594386153813": ["convolution_gpu_bfyx_gemm_like",1], + "6095158932103797740": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "11215217005872946038": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "7399775379344444344": ["convolution_gpu_bfyx_os_iyx_osv16",315], + "13381833588713493653": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "3380653500106294036": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "18125732229366977468": ["convolution_gpu_winograd_6x3_s1_fused",2], + "13613399861925108148": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "9981938305144461962": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "4519609440668743423": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "15097371415144491976": ["convolution_gpu_bfyx_os_iyx_osv16",640], + "12338760476079493547": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "13933912937625580405": ["fully_connected_gpu_bf_io_input_spatial",0], + "17126714253919198029": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "2341006744107937832": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "9819596940685093690": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "8881135571874888085": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "4750755523645265967": ["convolution_gpu_bfyx_gemm_like",2], + "3063055767192991776": ["convolution_gpu_bfyx_os_iyx_osv16",1017], + "18178391985193947355": ["convolution_gpu_bfyx_gemm_like",2], + "4161612746310931789": ["convolution_gpu_bfyx_gemm_like",2], + "16509472637458153234": 
["convolution_gpu_bfyx_os_iyx_osv16",203], + "11641605357868918146": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "562221645849170027": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "11561790484526369917": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "3658149289395969504": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "8818070832398055086": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5509631031571317557": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "5357531127711906072": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "8994777547915132466": ["convolution_gpu_bfyx_os_iyx_osv16",834], + "2687781952021151359": ["convolution_gpu_bfyx_gemm_like",1], + "18083041911869525296": ["convolution_gpu_bfyx_gemm_like",2], + "9876098429582714576": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12466721526829931923": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "10848407542826653699": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "16808618754363181939": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10436819182310112786": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "7657964685067862984": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13141069720428059461": ["convolution_gpu_bfyx_gemm_like",2], + "15831600396403741571": ["convolution_gpu_bfyx_gemm_like",1], + "1138439260035360722": ["convolution_gpu_bfyx_os_iyx_osv16",547], + "18184154104081850641": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "2338707843044884352": ["convolution_gpu_bfyx_gemm_like",1], + "13850920989756588064": ["convolution_gpu_bfyx_gemm_like",2], + "17176310030469904708": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "9146427497025645310": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "11291881629276762730": ["convolution_gpu_bfyx_gemm_like",1], + "9850711648349010674": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7172604084103519563": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "700717277178942679": ["convolution_gpu_bfyx_gemm_like",1], + "6827316954140278736": ["convolution_gpu_bfyx_os_iyx_osv16",125], + "13054405729329143152": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "8509941319309380587": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "16488426854651696706": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "10432925516327889351": ["convolution_gpu_bfyx_gemm_like",1], + "10600040563032392126": ["convolution_gpu_bfyx_os_iyx_osv16",835], + "11511221956203704038": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "11469881811044037340": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "13839590781642269381": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "7508931961595339477": ["convolution_gpu_bfyx_gemm_like",1], + "10500029207807372735": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "14330281759626724494": ["convolution_gpu_bfyx_gemm_like",2], + "7419216766190700536": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "5585398540591396124": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "17089801601582809764": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "3404911902272307873": ["convolution_gpu_bfyx_gemm_like",2], + "17489420766684604600": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "18196676408993954972": ["convolution_gpu_bfyx_os_iyx_osv16",695], + "10186866999254188246": ["convolution_gpu_bfyx_gemm_like",1], + "4817953977830392054": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "2930702812469156271": ["fully_connected_gpu_fb_io_ref",1], + "16549498607618849252": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "11855777686733253894": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "4936968239673204144": 
["convolution_gpu_bfyx_os_iyx_osv16",334], + "11988463489006787939": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "10178951466584845110": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "13326233188936584240": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "2194607895573544953": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "11290558687608213321": ["convolution_gpu_bfyx_gemm_like",2], + "12366546292695084543": ["convolution_gpu_bfyx_os_iyx_osv16",456], + "11267742746905371769": ["convolution_gpu_bfyx_os_iyx_osv16",1045], + "16582080251500644069": ["convolution_gpu_bfyx_gemm_like",2], + "18113235498360281695": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16851949759898002809": ["convolution_gpu_bfyx_os_iyx_osv16",648], + "14233388108948021331": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "12434799432980627966": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "16192971634546462244": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "4652136280940317116": ["convolution_gpu_bfyx_os_iyx_osv16",740], + "7744644472305197412": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "16733587306017341904": ["convolution_gpu_bfyx_gemm_like",2], + "10089588313551601914": ["convolution_gpu_bfyx_gemm_like",2], + "14397348576352573007": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "11823106525249133834": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "13122637768866153753": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "10110359677546019738": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "4342360467977736802": ["convolution_gpu_bfyx_gemm_like",2], + "2937907409658060025": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "12312291300513951124": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "5989664002046950385": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "2346855978590136528": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "1372939511728986224": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "6491244517639245276": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "17025268985366223779": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7076937538747704750": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "3971456598769336038": ["convolution_gpu_bfyx_gemm_like",2], + "5329218407413679209": ["convolution_gpu_bfyx_gemm_like",2], + "18171940644650760608": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "850343942782057099": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "8121179472578287280": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "11215862132334892351": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "13453226687921450129": ["convolution_gpu_bfyx_gemm_like",2], + "1056009037551688122": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "2816353973187452604": ["convolution_gpu_bfyx_gemm_like",2], + "18273922178875123753": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "6904130543085920483": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "2028273519579688266": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "16266491618150971928": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "6578908625437515675": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "6233455595448276342": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "13184662326021747000": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11185041745377164894": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "4887402175773881313": ["convolution_gpu_bfyx_gemm_like",1], + "3192332625020432602": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "18259656768460999562": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "3086110559166474482": ["convolution_gpu_bfyx_gemm_like",2], + "3234567405788241673": 
["convolution_gpu_bfyx_os_iyx_osv16",293], + "814227839929688672": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "7565221050911842393": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9942726414918759892": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7771969115805231266": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "17622515300258231642": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "11806105193035393795": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "17715553891959228879": ["convolution_gpu_bfyx_os_iyx_osv16",477], + "11829442945690098558": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "2103882464623009432": ["convolution_gpu_winograd_6x3_s1_fused",2], + "4488336106517889531": ["convolution_gpu_bfyx_os_iyx_osv16",80], + "13320828013530046693": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "9141802671320572984": ["convolution_gpu_bfyx_gemm_like",2], + "16170237673140354764": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "9933958860597451711": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "8616175124735896626": ["convolution_gpu_bfyx_gemm_like",2], + "8482147530539941792": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "17269318621094624075": ["convolution_gpu_bfyx_gemm_like",2], + "1529658068204046700": ["convolution_gpu_bfyx_gemm_like",2], + "6370189612027110022": ["convolution_gpu_bfyx_gemm_like",2], + "15317510501392280831": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "5688478347124565305": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "447152944190888653": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7817036102984218692": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "7500192998744460131": ["fully_connected_gpu_bf_io_input_spatial",2], + "14606504543906913119": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "3930526618478171342": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14429081455612806819": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "10455850115486014344": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "6458124573210430792": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "6210051945051792519": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "14025496192869856801": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "15451193085395494344": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "5163641718529821203": ["convolution_gpu_bfyx_gemm_like",1], + "7104756264011682902": ["convolution_gpu_bfyx_gemm_like",1], + "11374410888638324212": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "16661248688859994717": ["convolution_gpu_bfyx_gemm_like",2], + "3518981281605476136": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "628191607060767879": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "10413043556440687328": ["convolution_gpu_bfyx_gemm_like",2], + "911927861489659568": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6561864486643226753": ["fully_connected_gpu_fb_io_ref",1], + "17494823614269622175": ["convolution_gpu_bfyx_os_iyx_osv16",1031], + "8071652278387309042": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "4805958162773855302": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "16666383605403885590": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "1410512481031922864": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "7033442247935655919": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5120466856097219243": ["convolution_gpu_bfyx_gemm_like",1], + "12141880589558027223": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "2328698995040390396": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15410074937424854348": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "4195847890935259046": 
["convolution_gpu_bfyx_os_iyx_osv16",333], + "3923715765392385764": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2348721939771018658": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "8500612796090968552": ["convolution_gpu_bfyx_gemm_like",1], + "13695012630130671371": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "5475537064464968733": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "6914536960012332706": ["convolution_gpu_bfyx_gemm_like",0], + "3242468066266096173": ["fully_connected_gpu_fb_oi_ref",2], + "8817624284607822971": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "8453402620168400406": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "12087141795291232248": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10416622008071151225": ["convolution_gpu_bfyx_os_iyx_osv16",546], + "5934211962000091180": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "10178462061836778766": ["convolution_gpu_bfyx_os_iyx_osv16",1096], + "9810703513111623136": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "8870736106637803783": ["convolution_gpu_bfyx_os_iyx_osv16",43], + "6040286126398028933": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9628702542543622433": ["convolution_gpu_bfyx_os_iyx_osv16",567], + "14845194064376163156": ["convolution_gpu_bfyx_gemm_like",1], + "8296551195150971668": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3436576388124386308": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "16711955423531846725": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "17152100243867367458": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "2281119269283845320": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "4614042998549572181": ["convolution_gpu_bfyx_gemm_like",2], + "7807168142899312025": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "12150109996250730485": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "13553045975561262752": ["convolution_gpu_bfyx_gemm_like",2], + "435888248913413834": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "16683909937519981313": ["convolution_gpu_bfyx_os_iyx_osv16",459], + "14174888981602932979": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "16352331970945217438": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "7441199361135503715": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6988674007771237080": ["convolution_gpu_bfyx_gemm_like",2], + "1091511312740979158": ["convolution_gpu_bfyx_gemm_like",2], + "9134203155715293387": ["convolution_gpu_bfyx_gemm_like",2], + "17089332981370803321": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "16434635675895599016": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "5186963188234940985": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2683507674615735878": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "16951050796024922417": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "2842103889477438816": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13395074742046717601": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "14284223645235602230": ["fully_connected_gpu_fb_io_ref",2], + "10861525139715322534": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "13248567106128518549": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "11739629316219263056": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14797994820826922836": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "1743572310914695413": ["convolution_gpu_bfyx_gemm_like",2], + "14363654136811880073": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9579316322704307175": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10131754493574658838": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "13273455049742872922": ["convolution_gpu_bfyx_os_iyx_osv16",208], 
+ "15085980226773631346": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "15325810055037682679": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17542414935564676110": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "13447226378200557777": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "3075961585045028347": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "11851216776536423298": ["convolution_gpu_bfyx_gemm_like",2], + "12251901229904154127": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "12716923819769400487": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "3438852523146175580": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3638987901025418036": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "11856266545854830143": ["convolution_gpu_bfyx_gemm_like",2], + "10445587307296180364": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6692408578556372014": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16053383948025511837": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "7703363154993904399": ["convolution_gpu_bfyx_gemm_like",2], + "5632101951796129342": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "3666268650646000870": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "10551742525038893508": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "2065752819810364738": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "153117141968471446": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "6313048719388952335": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "5981885264666023260": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11462394098346770463": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "1698847067049584068": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "4046513842327685203": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "16181974394948732584": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "2431427502927207912": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "15119063070382146368": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "5023609284081684300": ["convolution_gpu_bfyx_gemm_like",2], + "5797545757863100286": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "3853598651573655548": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "16036386660666696362": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "17332395907621747512": ["convolution_gpu_bfyx_os_iyx_osv16",658], + "5524218746051008792": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "6981537186704688907": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "7162701010394257343": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "13383524675055536682": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4099859307693687554": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "7105622384646913935": ["convolution_gpu_bfyx_gemm_like",2], + "1908733355560815063": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "12278842522836720245": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "7210729932836957540": ["convolution_gpu_bfyx_gemm_like",1], + "2239948568632407776": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8337820318779061494": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "5312140481706133684": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "582386337144876096": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "4569416043426963318": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "17921616427936768657": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "2354885756165078342": ["convolution_gpu_bfyx_os_iyx_osv16",834], + "11915835787294686201": ["fully_connected_gpu_fb_io_ref",2], + "11588201241814594642": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "17171513366028235799": ["convolution_gpu_bfyx_gemm_like",2], + "1313038182637545943": 
["convolution_gpu_bfyx_gemm_like",2], + "14066660382918185188": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "17810119189318801197": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "884923290083082187": ["convolution_gpu_bfyx_gemm_like",1], + "2786925522916317149": ["convolution_gpu_bfyx_os_iyx_osv16",417], + "10701231567226563098": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "11260588538207111217": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "6048964584602891448": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "3256940792095638732": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5041111302824362529": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "10156210866362845661": ["convolution_gpu_bfyx_os_iyx_osv16",300], + "16482763280295827563": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "13661225837036677371": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7351733901977025859": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "11569367085498045793": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "12324580272733221544": ["convolution_gpu_bfyx_gemm_like",2], + "10885831773581103653": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "14897935118679731283": ["convolution_gpu_bfyx_gemm_like",2], + "6413565827738894970": ["convolution_gpu_bfyx_gemm_like",2], + "17221173795372066030": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "18116824232149703772": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "10472893418729915556": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13090596133852586482": ["fully_connected_gpu_fb_io_ref",2], + "10274587614581350261": ["convolution_gpu_bfyx_gemm_like",2], + "10831204282620894983": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "10394041365384258612": ["convolution_gpu_bfyx_gemm_like",1], + "16843976559933040107": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "1051506168926530904": ["fully_connected_gpu_bf_io_input_spatial",0], + "11728824117049687850": ["convolution_gpu_bfyx_gemm_like",1], + "346832567535597247": ["convolution_gpu_bfyx_os_iyx_osv16",515], + "17934338042329576850": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14046990030104971367": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "15715522462313302642": ["convolution_gpu_bfyx_os_iyx_osv16",344], + "6569793510829850291": ["convolution_gpu_bfyx_gemm_like",2], + "5115134711994944288": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "15711618559677233865": ["convolution_gpu_bfyx_gemm_like",2], + "15136557970717196814": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6603817696964851209": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "9104236539185546468": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "7247414730479113619": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "1314612539156304342": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "5368419079251107469": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "5622089373755094139": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "16723949803487501587": ["convolution_gpu_bfyx_gemm_like",1], + "15640202505592598653": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "17258278942367320412": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "11872894645888259277": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6889498170947481097": ["convolution_gpu_bfyx_os_iyx_osv16",517], + "9667762333290150436": ["convolution_gpu_bfyx_gemm_like",2], + "12797434473085560369": ["convolution_gpu_bfyx_gemm_like",1], + "10025839973092358719": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "18086782289842715645": ["convolution_gpu_bfyx_gemm_like",2], + "10880656082867082647": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + 
"14108091242461324109": ["convolution_gpu_bfyx_os_iyx_osv16",1055], + "12478041902013146137": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "5375957124102705020": ["convolution_gpu_bfyx_gemm_like",2], + "5122639094068865656": ["convolution_gpu_bfyx_gemm_like",2], + "3741411131962514208": ["convolution_gpu_bfyx_gemm_like",0], + "5504757952698692953": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "8376077531098664520": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "12515465135362865565": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "17221958812979739319": ["convolution_gpu_bfyx_gemm_like",2], + "10492056481694320580": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "11075875009517060583": ["convolution_gpu_bfyx_gemm_like",1], + "13973179950424276578": ["convolution_gpu_bfyx_os_iyx_osv16",48], + "4209610989252810404": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "10328182165125764988": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "1604661321386793876": ["convolution_gpu_winograd_6x3_s1_fused",1], + "7883469783245625654": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "8460847842045253466": ["convolution_gpu_bfyx_os_iyx_osv16",388], + "12814676907278614920": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "15702382940521972117": ["convolution_gpu_bfyx_os_iyx_osv16",1001], + "15402502830461368746": ["convolution_gpu_bfyx_gemm_like",2], + "10023279637210292010": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "1104098779103065492": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "6423120553520000795": ["convolution_gpu_bfyx_os_iyx_osv16",475], + "15759530339367380982": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "10392297152843428925": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "374553246608550876": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "642256034968512602": ["convolution_gpu_bfyx_os_iyx_osv16",687], + "1701609125136907870": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "761169277744593430": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "13753473508578037346": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "7059809764116926828": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "15291457825664605611": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "1817929353109443200": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10182490653383265979": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "2660620513253264815": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "13116746433291181712": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "8017024160145338317": ["convolution_gpu_bfyx_os_iyx_osv16",1045], + "2407509127927738079": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "12345000525470836335": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "7107313154723472157": ["convolution_gpu_bfyx_gemm_like",1], + "17116130466596594359": ["convolution_gpu_bfyx_os_iyx_osv16",270], + "6096189754478965440": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "9100044555742394133": ["convolution_gpu_bfyx_os_iyx_osv16",549], + "389822325870173489": ["convolution_gpu_bfyx_gemm_like",2], + "12608653044712562811": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "17827762625385383658": ["convolution_gpu_bfyx_gemm_like",1], + "1103228955716492167": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6744583842563891546": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "4830121683809417143": ["convolution_gpu_bfyx_os_iyx_osv16",939], + "14400339764883906933": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "593712935037568960": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "6100453836448514115": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "460780635491857522": 
["convolution_gpu_bfyx_os_iyx_osv16",1104], + "8054185159612481260": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "10468108569766167175": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "10290107543739998181": ["fully_connected_gpu_bf_io_input_spatial",2], + "12881836161162762524": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "12098146032672599222": ["convolution_gpu_bfyx_os_iyx_osv16",198], + "10533367671706069274": ["convolution_gpu_bfyx_gemm_like",2], + "2616828683870391718": ["convolution_gpu_bfyx_gemm_like",2], + "18215260982292770252": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "17915846724151945664": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "15308667224953963012": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "7678730081652720605": ["convolution_gpu_bfyx_os_iyx_osv16",121], + "7536267099632318821": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "3649980610274946512": ["fully_connected_gpu_fb_io_ref",0], + "14642845734482478360": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "17550795608527501180": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "3883845471211207871": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "8090497202997192142": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15300588247579013966": ["convolution_gpu_bfyx_os_iyx_osv16",948], + "12940491379482292807": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "761183183078910587": ["convolution_gpu_bfyx_os_iyx_osv16",1016], + "1451466106918423837": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "3180320769716158201": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "1154228007901031779": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "10783981060353445280": ["convolution_gpu_bfyx_os_iyx_osv16",52], + "9853089109234784643": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "9151597254187513724": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "14472187692485966933": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "592364460086746355": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14959566236432790882": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "14560435854055940143": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "2534408579674556441": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1697248235682953135": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "970596838400633278": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "16474284418841532356": ["convolution_gpu_bfyx_gemm_like",2], + "10131771849139346986": ["fully_connected_gpu_fb_io_ref",1], + "18432787283148809023": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "12412224630798427948": ["convolution_gpu_bfyx_os_iyx_osv16",620], + "9378419102254633989": ["convolution_gpu_bfyx_os_iyx_osv16",835], + "17543094050285028967": ["convolution_gpu_bfyx_os_iyx_osv16",348], + "15095146351334328804": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5211191663202250117": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "6763373100985812924": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "9126242742012768166": ["convolution_gpu_bfyx_gemm_like",2], + "9501165931845934084": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8200094670006738584": ["convolution_gpu_bfyx_os_iyx_osv16",695], + "13091799752362714688": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "3001615302961701154": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "10136369729388564720": ["convolution_gpu_bfyx_gemm_like",2], + "7654445730724243959": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4142978475842207311": ["convolution_gpu_bfyx_gemm_like",2], + "15936513690378208182": ["convolution_gpu_bfyx_gemm_like",2], + "2510919738337557939": 
["convolution_gpu_bfyx_os_iyx_osv16",751], + "14157776769026046014": ["fully_connected_gpu_fb_io_ref",1], + "2888587871912905870": ["convolution_gpu_bfyx_os_iyx_osv16",45], + "15107740124884150777": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13852065717057446998": ["convolution_gpu_bfyx_gemm_like",2], + "15101680837342453931": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "8848042913869254179": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "5419041493176804960": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "7162575953766465459": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "2538377242539785672": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "11047625525388102466": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "194324011642969540": ["convolution_gpu_bfyx_gemm_like",1], + "15160738482264643601": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "16159309494101203811": ["convolution_gpu_bfyx_gemm_like",2], + "2299440282267661763": ["convolution_gpu_bfyx_gemm_like",2], + "2451603338483395600": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "17044070592136685322": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10994887986667360638": ["convolution_gpu_bfyx_os_iyx_osv16",93], + "7450915928720828406": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "2004691166378443418": ["convolution_gpu_bfyx_gemm_like",2], + "2595273700611743351": ["convolution_gpu_bfyx_gemm_like",2], + "12175796957622122377": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "11190259822407791373": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "5116633474932727191": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13821388909343378606": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "8997120235555587461": ["convolution_gpu_bfyx_gemm_like",2], + "2355214244972870639": ["convolution_gpu_bfyx_os_iyx_osv16",514], + "5673972310424776040": ["convolution_gpu_bfyx_gemm_like",2], + "9182897385081081193": ["convolution_gpu_winograd_6x3_s1_fused",1], + "6681818065741882453": ["convolution_gpu_bfyx_gemm_like",2], + "10267260789603562117": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "16797936364395702812": ["convolution_gpu_bfyx_gemm_like",2], + "11324851661119942609": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "5957444113623953990": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "14566544143931267758": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "7391591731082133842": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15592321818359223008": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "7881579844586294503": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "15997754881872769378": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "15688260390755491480": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "444533022549215983": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "5629373398445592781": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "12930435393720466720": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5091558853871982858": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "9300767936311837876": ["convolution_gpu_bfyx_gemm_like",2], + "11756881293845417212": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "6323504675912413145": ["convolution_gpu_bfyx_gemm_like",2], + "6364288463529107554": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "3089303702413279458": ["convolution_gpu_bfyx_gemm_like",1], + "13418701036204748812": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "8873424072104563382": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "5085190482265319015": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "10384537928514123040": ["convolution_gpu_bfyx_os_iyx_osv16",742], + 
"18424912460022156378": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "15650839696475698676": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "8075453526439606224": ["convolution_gpu_bfyx_gemm_like",2], + "9988347141056982336": ["convolution_gpu_bfyx_gemm_like",2], + "18146068930296529306": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3809343305878998617": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "7304346312452588844": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "1096671695414716274": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "15670841106242481912": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "5516343490635816913": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5552958912776013600": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "6717268005860715462": ["convolution_gpu_bfyx_gemm_like",1], + "15154934905173371714": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "6919081291036849635": ["convolution_gpu_bfyx_gemm_like",0], + "13599555566632152241": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5584432943673435454": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "5077214229434392730": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7431069335622070596": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14088382963493477342": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "2105482100745329286": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "4108579755980014185": ["convolution_gpu_bfyx_direct_10_12_16",0], + "7009735776703529573": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "12360796145248339074": ["convolution_gpu_bfyx_os_iyx_osv16",718], + "11318404975804457466": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15596913527233792996": ["convolution_gpu_bfyx_gemm_like",2], + "4474697990228400564": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15163327502374403643": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "11674630830833831209": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "12089505956882731481": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "16811402686462277562": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5970516037710024187": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "6377828127090689238": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "4213330047036138895": ["convolution_gpu_bfyx_gemm_like",2], + "15908673392788376468": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "11931909191490706784": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "12706645084970410965": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12631385844456089132": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10205576142280465189": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "4347816192417741558": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "14776308019009874809": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "4557272439632791722": ["convolution_gpu_bfyx_gemm_like",2], + "8939683514448064461": ["convolution_gpu_bfyx_os_iyx_osv16",148], + "8307147375351882939": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "10997029728191881587": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15426960908024585800": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "14043770215999952932": ["convolution_gpu_bfyx_gemm_like",2], + "17765244777397448823": ["convolution_gpu_bfyx_gemm_like",2], + "13906695412889750672": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "12397493112115605421": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "2043990557089419633": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "11047327014045909812": ["convolution_gpu_bfyx_gemm_like",2], + "360872770877634346": ["convolution_gpu_bfyx_gemm_like",2], + 
"16168891366331544806": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11825205449232126827": ["convolution_gpu_bfyx_gemm_like",2], + "6680219899975628258": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "11996551650886043090": ["convolution_gpu_bfyx_os_iyx_osv16",271], + "12691733869577147545": ["convolution_gpu_bfyx_gemm_like",2], + "761984225415608773": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "14545322358931928911": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "3286476039871096924": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "3167336012388169649": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "12878858391355259417": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2460361970017706505": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "11623764266322172086": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "9852052796465340830": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "5559417017584278927": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "10157866834809927320": ["convolution_gpu_bfyx_os_iyx_osv16",1042], + "3138374672801504481": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "15470979879166640563": ["convolution_gpu_bfyx_os_iyx_osv16",1022], + "5240181393417899912": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "8500148569566077929": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "4030835922805418609": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "12380856644683171627": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13121196588092064246": ["convolution_gpu_bfyx_gemm_like",2], + "16816222375242496370": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16626226341188424071": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "13132804928635689780": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "1338534626640014074": ["convolution_gpu_bfyx_gemm_like",2], + "16112835627818488034": ["convolution_gpu_bfyx_gemm_like",2], + "12013883366396753346": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "9777638299795801012": ["convolution_gpu_bfyx_gemm_like",2], + "1652781065871883392": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5481293245081340756": ["convolution_gpu_bfyx_gemm_like",1], + "2888315406857606108": ["convolution_gpu_bfyx_gemm_like",2], + "13951717514084457087": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "2415883693527779570": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "7953340333870774815": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10971971008143485353": ["convolution_gpu_bfyx_os_iyx_osv16",1052], + "5842284971563375197": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "18076018773227225156": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "832830374368320801": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "2724007091383127418": ["convolution_gpu_bfyx_os_iyx_osv16",1025], + "5115661026367632863": ["convolution_gpu_bfyx_os_iyx_osv16",765], + "9632178829095307219": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "1429370139030130929": ["convolution_gpu_bfyx_gemm_like",1], + "12478496773222604204": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "6634330132674952638": ["convolution_gpu_bfyx_os_iyx_osv16",179], + "8467771025017377254": ["convolution_gpu_bfyx_gemm_like",2], + "685140170576742460": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "14704939880642470064": ["convolution_gpu_bfyx_gemm_like",2], + "17264554677210911187": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8549465639583777774": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "2915165824085219545": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "1155389358857780776": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "6181272224000872375": 
["convolution_gpu_bfyx_os_iyx_osv16",699], + "1350402181555441235": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "9552312946391901745": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17995371099806008878": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13300022131572486202": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "5294364781478821403": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "16985565646738638215": ["convolution_gpu_bfyx_gemm_like",2], + "14545094765855515974": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "157805434489791310": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "16896833230469488924": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "11860902750907076009": ["convolution_gpu_bfyx_gemm_like",1], + "3790881125495367946": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "2072246877651869428": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "16125965158927145599": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "6748628505489041229": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "3119235799568225015": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "5094600092408024387": ["convolution_gpu_bfyx_os_iyx_osv16",939], + "14057348639391787117": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "13973028408397200796": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "3504421925108785018": ["convolution_gpu_bfyx_gemm_like",1], + "11284755586130392759": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "14810839157236175179": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "10295400862890021635": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "3830787224073518842": ["convolution_gpu_bfyx_os_iyx_osv16",509], + "6586833064055001967": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "5191016422297403500": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15160192060731796225": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "10858234923346500323": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8913451832923806760": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7264756313770306662": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "11754316727756881612": ["convolution_gpu_bfyx_os_iyx_osv16",475], + "381149736509958403": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "17723621158215826108": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "11878200328276635385": ["convolution_gpu_bfyx_gemm_like",2], + "8253823502854784432": ["convolution_gpu_bfyx_gemm_like",2], + "12270548292992377827": ["convolution_gpu_bfyx_gemm_like",2], + "17881905640473324965": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "5018845267269043034": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "2183193161596798350": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "5763440554939527411": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "2737738314051715813": ["convolution_gpu_bfyx_gemm_like",2], + "15434536162164591656": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "14743760934522111296": ["convolution_gpu_bfyx_gemm_like",1], + "578940134826172063": ["convolution_gpu_bfyx_gemm_like",2], + "12297371032753209816": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "10842828403850880541": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "2198278382394812839": ["convolution_gpu_bfyx_os_iyx_osv16",1043], + "10754450245035836188": ["convolution_gpu_bfyx_gemm_like",2], + "6585223640997887253": ["convolution_gpu_bfyx_gemm_like",2], + "1226681724476075216": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15012744672096562609": ["convolution_gpu_bfyx_gemm_like",1], + "12024416333474523686": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "14366861063858001106": 
["convolution_gpu_bfyx_gemm_like",2], + "14872992823083730615": ["convolution_gpu_bfyx_gemm_like",1], + "4104803308438043557": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8557939065994799094": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "3725060015826635697": ["convolution_gpu_bfyx_os_iyx_osv16",737], + "941626985322260281": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "10046663998164493552": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "6982733543386888622": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "8676627474831455650": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "13312514874803986753": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "15342520770460205985": ["convolution_gpu_bfyx_gemm_like",2], + "12061391584831995030": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "18420783889227814721": ["convolution_gpu_bfyx_gemm_like",1], + "14104238386345631681": ["convolution_gpu_winograd_6x3_s1_fused",1], + "8751016391945753900": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "5175845410753897614": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "4318632837402329958": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "9048522050692986204": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12608839247035566137": ["convolution_gpu_bfyx_gemm_like",2], + "1081969835308672753": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "9340606088243696490": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "8143125165478395106": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4265693151382066296": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "10727592780669452048": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "948917645960296825": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "15733030371524967129": ["convolution_gpu_bfyx_direct_10_12_16",1], + "18325123280144403295": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "7430073011895298582": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "4642234334824303290": ["convolution_gpu_bfyx_os_iyx_osv16",172], + "3006979228759768702": ["convolution_gpu_bfyx_gemm_like",2], + "9899897639161550704": ["convolution_gpu_bfyx_os_iyx_osv16",834], + "15516674573659704770": ["convolution_gpu_bfyx_os_iyx_osv16",995], + "17675227620234837075": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8774613863662947205": ["convolution_gpu_bfyx_os_iyx_osv16",113], + "411914986559525749": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "13994738382469480124": ["convolution_gpu_bfyx_os_iyx_osv16",720], + "6261584163347634965": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "8648848365873958010": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "16442107352245114876": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "17358462939783262207": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10117784802089387496": ["convolution_gpu_bfyx_gemm_like",2], + "1021364163511049664": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15576932271488848457": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "16234606052818596502": ["convolution_gpu_bfyx_os_iyx_osv16",468], + "13025323039227543550": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "8901432555239515645": ["convolution_gpu_bfyx_os_iyx_osv16",1063], + "8127190765748950828": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "17489255290900178723": ["convolution_gpu_bfyx_gemm_like",2], + "6819846227498139601": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "12967849866710811070": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "16484600784717969318": ["convolution_gpu_bfyx_gemm_like",1], + "7904735292914337507": ["convolution_gpu_bfyx_gemm_like",2], + "1346716334208025932": 
["convolution_gpu_bfyx_os_iyx_osv16",458], + "16419903786705052849": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16954232936536653281": ["convolution_gpu_bfyx_os_iyx_osv16",85], + "6140789642561898454": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "17285815901490707654": ["convolution_gpu_winograd_6x3_s1_fused",2], + "10396343030099602596": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "6522575549211855712": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "16945184617367657570": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "11306782565667740785": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "16881283637687482989": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "8114928396876060694": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "17598441149165536737": ["convolution_gpu_bfyx_gemm_like",2], + "13189392239349392492": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "3524531620118359828": ["convolution_gpu_bfyx_os_iyx_osv16",194], + "11413890625163220846": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "8873614802459592665": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "3811325657214369711": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "11798081355131440794": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "14763015336626099830": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2808205041095636198": ["convolution_gpu_bfyx_gemm_like",2], + "11006325877486632502": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "1938086876393565238": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "11135894989941122115": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16818714747882774917": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "1197101651805223230": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "5754301693527535975": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15884763176333003771": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "9940300152880498818": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15809072026388479729": ["convolution_gpu_bfyx_os_iyx_osv16",1055], + "2525260242689556544": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "1462775202780029067": ["convolution_gpu_bfyx_gemm_like",2], + "4408772370026995920": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "4800587664660105589": ["fully_connected_gpu_bf_io_input_spatial",0], + "15096978026328154490": ["convolution_gpu_bfyx_gemm_like",2], + "14945451027055549800": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2411809718611709031": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "6364765994481977132": ["convolution_gpu_bfyx_gemm_like",2], + "7606716827635769887": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "759816003617478606": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "8100051552977329013": ["convolution_gpu_bfyx_gemm_like",2], + "16706121580364790904": ["convolution_gpu_bfyx_gemm_like",2], + "16061176355133391199": ["convolution_gpu_bfyx_os_iyx_osv16",509], + "11670430946096342056": ["convolution_gpu_bfyx_os_iyx_osv16",995], + "16801078648431425148": ["convolution_gpu_bfyx_gemm_like",2], + "16497757978901707098": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "7230623964042057933": ["convolution_gpu_bfyx_gemm_like",2], + "15461879919099373703": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15331103261044247142": ["convolution_gpu_bfyx_os_iyx_osv16",845], + "13671635457689276237": ["convolution_gpu_bfyx_direct_10_12_16",0], + "5157249499936659040": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "8149815705026829258": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5115051214738974496": ["convolution_gpu_bfyx_gemm_like",2], + "846088275031979661": 
["convolution_gpu_winograd_6x3_s1_fused",2], + "17208186152576814861": ["convolution_gpu_bfyx_gemm_like",1], + "13502487084912428404": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "75742659105146536": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "12972406304361050136": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "2623687018437195679": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "2451627421465368826": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "17793292063552633023": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "1145700078649932035": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "11932768899981458741": ["convolution_gpu_bfyx_gemm_like",2], + "17188170051014066220": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "13073917160317338455": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "18156747282906367814": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "18355551625040856531": ["convolution_gpu_bfyx_gemm_like",1], + "9657585348407617520": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "10841786394951910408": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "9462315044265139531": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8897786294680986991": ["convolution_gpu_bfyx_os_iyx_osv16",720], + "9067207838429479363": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "706049518431331645": ["convolution_gpu_bfyx_gemm_like",2], + "11772741918108731396": ["convolution_gpu_bfyx_os_iyx_osv16",620], + "17430593168191424639": ["convolution_gpu_bfyx_gemm_like",2], + "11446745541571732900": ["convolution_gpu_winograd_6x3_s1_fused",2], + "11523864029587161089": ["convolution_gpu_bfyx_gemm_like",0], + "17847109385592002207": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "6181651715051152713": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "2940027113687311893": ["convolution_gpu_bfyx_gemm_like",2], + "4017163133829149027": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "3320392060021963536": ["convolution_gpu_bfyx_os_iyx_osv16",84], + "368147139706197757": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "5381578460674280089": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "4915831715914920982": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "5523778675167321193": ["fully_connected_gpu_fb_io_ref",0], + "2597435203284675496": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2081318772333460627": ["convolution_gpu_bfyx_direct_10_12_16",0], + "3234107167862677811": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "10499265278415026816": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17316626950179740845": ["convolution_gpu_bfyx_os_iyx_osv16",564], + "2669822154816760632": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "9104710269725948935": ["convolution_gpu_bfyx_os_iyx_osv16",562], + "10447427622114317323": ["convolution_gpu_bfyx_os_iyx_osv16",939], + "10263861857115868555": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "14561847633011875566": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "5169676188205309169": ["convolution_gpu_bfyx_gemm_like",2], + "13649894122307008732": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "11192356850081328892": ["convolution_gpu_bfyx_direct_10_12_16",0], + "8739347545059610410": ["convolution_gpu_bfyx_gemm_like",2], + "3170336071769787200": ["convolution_gpu_bfyx_gemm_like",1], + "1938627662342504660": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "13505239531682993049": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "11327678075247102542": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "2585767464396438954": ["convolution_gpu_bfyx_gemm_like",2], + "3377472614945731801": ["convolution_gpu_bfyx_gemm_like",2], 
+ "7838176322738051195": ["convolution_gpu_bfyx_os_iyx_osv16",856], + "7520300815632157008": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "5124291229936820926": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "13384754476437374504": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "17182839667242694171": ["convolution_gpu_bfyx_os_iyx_osv16",264], + "2098357709530580176": ["convolution_gpu_bfyx_gemm_like",2], + "10856527039674342926": ["convolution_gpu_bfyx_os_iyx_osv16",1100], + "16430562172386510259": ["convolution_gpu_bfyx_gemm_like",2], + "5658491804782285708": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2510093757258898215": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "13817553830305981296": ["convolution_gpu_bfyx_gemm_like",1], + "17353894529222574441": ["convolution_gpu_bfyx_os_iyx_osv16",552], + "12730339458081890990": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "17923035110851963413": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "5627834277145735283": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "8468092944055919238": ["convolution_gpu_bfyx_gemm_like",2], + "2893564501191050837": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "218070270815606832": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "3124997104810767514": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "16565784556269819846": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "16429816273405099453": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "4456004887590847716": ["convolution_gpu_bfyx_gemm_like",1], + "11006013403687198405": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "14431607479949498164": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "7802311886554362782": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "697609699740088622": ["convolution_gpu_bfyx_os_iyx_osv16",299], + "15641049130597645936": ["convolution_gpu_bfyx_gemm_like",2], + "17287487062245049466": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "7575675354187625951": ["convolution_gpu_bfyx_gemm_like",2], + "12675313398314286884": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "18348301285923584995": ["convolution_gpu_bfyx_gemm_like",2], + "11098189888598804624": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "13015379405020620466": ["convolution_gpu_bfyx_gemm_like",2], + "17287404861045114619": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "13045206675957093567": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "2479282650381163888": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16053441017037949431": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "10451904743064959757": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "5902427784683046762": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "1006721963560645335": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17243953172314194409": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "17223169013008075474": ["convolution_gpu_bfyx_gemm_like",2], + "17854578307286932628": ["convolution_gpu_bfyx_gemm_like",2], + "7024495439434892956": ["convolution_gpu_bfyx_os_iyx_osv16",1043], + "14008438372661779490": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "7958595516465029682": ["convolution_gpu_bfyx_gemm_like",2], + "426267761240826769": ["convolution_gpu_bfyx_gemm_like",1], + "241860795253927746": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "5381354625969068789": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7937517564893685647": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "8166976803757624321": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "17163595630291422874": ["convolution_gpu_bfyx_gemm_like",2], + "3502889736327580141": 
["convolution_gpu_bfyx_os_iyx_osv16",208], + "17338623890209792485": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "3362829461757548683": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "16865271154583564899": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "17185089684685480638": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "2702566744272427570": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "7712831597869354170": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "12219239604684537521": ["convolution_gpu_bfyx_gemm_like",1], + "9318550032135064372": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "5303170164698694791": ["fully_connected_gpu_bf_io_gemm",2], + "805221045541170643": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "9216608098626790565": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "6494837659483504443": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "6621483425195088869": ["convolution_gpu_bfyx_os_iyx_osv16",852], + "5458310740719324710": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "12840204133991239572": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "4282198629458668761": ["convolution_gpu_bfyx_gemm_like",2], + "15247381586316467097": ["convolution_gpu_bfyx_gemm_like",2], + "15715775011639091549": ["convolution_gpu_bfyx_os_iyx_osv16",720], + "5065071428884648135": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "7457899998356343871": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "11992625045241269569": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "4296524295134959042": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "18384657372655350144": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "2912984501615111849": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "2103507679502667581": ["convolution_gpu_bfyx_os_iyx_osv16",752], + "1532263118203058517": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "4597954342704466825": ["convolution_gpu_bfyx_gemm_like",1], + "5567670507334783760": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3561366509539440079": ["convolution_gpu_bfyx_gemm_like",1], + "1364905900191854779": ["convolution_gpu_bfyx_gemm_like",0], + "1339402691552717009": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "15602863681196390535": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "15678329601718218341": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "9005351264094503686": ["convolution_gpu_bfyx_gemm_like",2], + "3518605747492037670": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8295126647635181949": ["convolution_gpu_bfyx_gemm_like",2], + "16359282790151128772": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "360764089318153518": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "15834666915651997510": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "11851526665791263153": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "13007534905441600782": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "16323870023648254366": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "17190698921280188790": ["convolution_gpu_bfyx_gemm_like",2], + "9753702905908744910": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "56327004269432885": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "15936869458531244961": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "13702914647519703599": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11459784003592366395": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "2572395498687401679": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "296142385116663420": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7546586420552408243": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "14089893422771228191": 
["convolution_gpu_bfyx_os_iyx_osv16",799], + "9004823715680825977": ["convolution_gpu_bfyx_gemm_like",2], + "17001502418583498926": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "6505706083205285176": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "11528310408333718862": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "8971115542951085891": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "14074914477149374595": ["convolution_gpu_bfyx_os_iyx_osv16",652], + "12238674883388043717": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "8131682691875884781": ["convolution_gpu_bfyx_gemm_like",2], + "6302958994152837045": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "11070620435959083971": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "3177362994630209421": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "9780938731831129283": ["convolution_gpu_bfyx_gemm_like",2], + "3159681096461848644": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "17795554443343871443": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "7549378486471456156": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "545425355231744794": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "3885931890288969926": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "1054159213127890689": ["convolution_gpu_bfyx_gemm_like",2], + "12664952811642406457": ["convolution_gpu_bfyx_os_iyx_osv16",569], + "2080397907007737054": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "18037918102910297531": ["convolution_gpu_bfyx_os_iyx_osv16",356], + "16494403731659808258": ["convolution_gpu_bfyx_os_iyx_osv16",540], + "14716719350966652036": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "14258499419905714808": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "1119928633562250911": ["convolution_gpu_bfyx_os_iyx_osv16",947], + "7713736987017889212": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "14939750655636313880": ["convolution_gpu_bfyx_gemm_like",2], + "1646362346584649954": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "13192885349640152576": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "16025442470600124062": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "14133958262039763609": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "17142080999569154649": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "13394233139064923018": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "9410978119783758141": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "2047041720569246861": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "5454796925594082324": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "2172121470071868949": ["convolution_gpu_bfyx_gemm_like",0], + "17700958439420868719": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "12972634653821069685": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "1682776041247037802": ["convolution_gpu_bfyx_gemm_like",0], + "10624567684389583173": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "17959539037614502049": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1954052357826969119": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "16956980254113285457": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "922541506531537121": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "6447172410311223671": ["convolution_gpu_bfyx_gemm_like",1], + "15052127817178941719": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "11062005455602919062": ["convolution_gpu_bfyx_gemm_like",1], + "6351924049625723579": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12925156865008155065": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "10556089809203693400": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "5291944277945000781": 
["convolution_gpu_bfyx_os_iyx_osv16",322], + "17123463568694499533": ["convolution_gpu_bfyx_gemm_like",2], + "11243840588602365090": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "13990028451169604107": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "423221712829930726": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "311255514995417672": ["convolution_gpu_bfyx_gemm_like",2], + "11868419561534906809": ["convolution_gpu_bfyx_os_iyx_osv16",363], + "3664562521273273709": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "11299275869800089824": ["convolution_gpu_bfyx_gemm_like",1], + "5221320470007950766": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11107930597263802755": ["convolution_gpu_bfyx_gemm_like",2], + "3889519976910355277": ["fully_connected_gpu_bf_io_input_spatial",2], + "2438374917504708831": ["convolution_gpu_bfyx_gemm_like",2], + "5284132464580556804": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "17309224746854446222": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "8154794217037682993": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18133614045401867449": ["convolution_gpu_bfyx_gemm_like",2], + "1572991986657256775": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "5183231560876991543": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "3897967722980386263": ["convolution_gpu_bfyx_gemm_like",2], + "15088940149962496972": ["convolution_gpu_bfyx_gemm_like",1], + "7083152697366621236": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "17877776363798202236": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "9298483238271063853": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "10625675062556386448": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "6121673167888047110": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12541764833974378504": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "12923653434892323603": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "2567809041240246707": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "2251029128552117936": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4640028527711211109": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "17942120824047252501": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "17912189681971987483": ["convolution_gpu_bfyx_gemm_like",2], + "7573459699367415551": ["convolution_gpu_bfyx_os_iyx_osv16",515], + "4082229510324076196": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5608133987357542077": ["convolution_gpu_bfyx_os_iyx_osv16",539], + "8489998884193999354": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10356951625481502476": ["convolution_gpu_bfyx_gemm_like",2], + "14044495589185586465": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "3796274347773622633": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "41672385434660942": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "8262441556572334783": ["convolution_gpu_bfyx_os_iyx_osv16",692], + "16748743818537812349": ["convolution_gpu_bfyx_gemm_like",2], + "1999979442136861875": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "8234878941966364642": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "7396823789595001064": ["convolution_gpu_bfyx_gemm_like",2], + "649203303142950236": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "953306082374100275": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "2850279308978256234": ["convolution_gpu_bfyx_gemm_like",2], + "10068502639160680134": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "4766071144928072260": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "4455497237293642238": ["convolution_gpu_bfyx_gemm_like",2], + "3621449131285713809": ["convolution_gpu_bfyx_gemm_like",2], + 
"18044455700176500102": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3623866842874047894": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "17332230377845694888": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "3200047546714112402": ["convolution_gpu_bfyx_os_iyx_osv16",894], + "3325727286860556323": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "8682149821028981871": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "5927467766675317093": ["fully_connected_gpu_bf_io_input_spatial",2], + "17154337492545826355": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "12011982029561277581": ["convolution_gpu_bfyx_os_iyx_osv16",661], + "11147816119060617810": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "6290180140047520382": ["convolution_gpu_bfyx_gemm_like",1], + "5135353986081664933": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "12138556002719602750": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "15603643151057665338": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "5033665285977853779": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "17433037267999205350": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "7881187047171099732": ["convolution_gpu_bfyx_gemm_like",2], + "9069334144391048686": ["convolution_gpu_bfyx_os_iyx_osv16",543], + "13199524367893035805": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "17279975778400757791": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "13925839061045347955": ["convolution_gpu_bfyx_gemm_like",1], + "12790788016297794214": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "3392693938352572136": ["convolution_gpu_bfyx_gemm_like",2], + "3761770343527826418": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "9092949297095391463": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "10545749454895857995": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "4590784654677429162": ["convolution_gpu_bfyx_gemm_like",2], + "7981376447277193852": ["convolution_gpu_bfyx_os_iyx_osv16",843], + "17829047941256922307": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "10306542963828398049": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "13550435052563656432": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "6964180083696019970": ["convolution_gpu_bfyx_gemm_like",1], + "6496839689453807726": ["convolution_gpu_bfyx_gemm_like",2], + "203639177311791127": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "1005880016096298476": ["convolution_gpu_bfyx_os_iyx_osv16",618], + "10679760989906275129": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "6719302427415173754": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "9750510172185801133": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6905249031401202060": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17575578027095664417": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3036808833459559381": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "3125577147662589592": ["convolution_gpu_bfyx_gemm_like",1], + "10708706979952421150": ["convolution_gpu_bfyx_direct_10_12_16",2], + "30229601562833524": ["convolution_gpu_bfyx_gemm_like",2], + "5756918912614763074": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "5973242004448142604": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "9863856393759813897": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "11052732052072367261": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "9609257787066002999": ["convolution_gpu_bfyx_gemm_like",2], + "8454760437961964894": ["convolution_gpu_bfyx_gemm_like",2], + "1117787205894124896": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "14471867575610362464": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "3816774953143987171": 
["convolution_gpu_bfyx_os_iyx_osv16",367], + "8576229375621297412": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "9759380701896779097": ["convolution_gpu_bfyx_gemm_like",2], + "17774902969414949042": ["convolution_gpu_bfyx_gemm_like",2], + "3882955134902442387": ["convolution_gpu_bfyx_os_iyx_osv16",720], + "16683485007140805060": ["fully_connected_gpu_fb_io_ref",1], + "16767564582561837873": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2104529100867065546": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "10158184435144178161": ["convolution_gpu_bfyx_os_iyx_osv16",337], + "11892088065638996743": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "9743806043658380623": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "17228877915053571642": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "15284262113150488297": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "3272776991539782834": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "14234117003504517946": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "2968031010495399536": ["convolution_gpu_bfyx_gemm_like",2], + "10555597973766215754": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2220961811760955456": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "8850600236849718709": ["convolution_gpu_bfyx_os_iyx_osv16",1024], + "12643423612381102003": ["convolution_gpu_bfyx_os_iyx_osv16",831], + "8133587696326295326": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "5519835581976587401": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "17040970955448750876": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "4239133538073498792": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "5409924335138540834": ["convolution_gpu_bfyx_os_iyx_osv16",526], + "14132290154676895976": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "12582624102297726596": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "9454954846682513038": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "4865023158176874622": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4950144098898276785": ["convolution_gpu_bfyx_gemm_like",2], + "12427258337646070422": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "16131448347558322280": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "7228139313323996640": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "11254635684957519432": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "9569446666675696513": ["convolution_gpu_bfyx_gemm_like",1], + "7813041847979170166": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17628984504073918701": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "2713481951804190325": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "15489882561480858974": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "3286629188347536485": ["fully_connected_gpu_bf_io_input_spatial",0], + "3939805316470672966": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "3839690227347352846": ["convolution_gpu_bfyx_gemm_like",2], + "17864395500488861670": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5036963191507722541": ["convolution_gpu_bfyx_os_iyx_osv16",675], + "261021128656714770": ["convolution_gpu_bfyx_os_iyx_osv16",675], + "12482312825666761192": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "18219755699990183812": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "9070474871526366492": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "2841943277631596989": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "13268525255152984893": ["convolution_gpu_bfyx_os_iyx_osv16",847], + "5570191330195573102": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "12823842409678756966": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10887835418423052188": 
["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15351724241036614758": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "6262190151863459214": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "69884424286147709": ["convolution_gpu_bfyx_gemm_like",2], + "2521821959816944292": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "17471843449888763571": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2777614869053822003": ["convolution_gpu_bfyx_os_iyx_osv16",377], + "13126786259906598018": ["convolution_gpu_bfyx_os_iyx_osv16",1026], + "13948873105076070952": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "4220826666482500445": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "7025975403069487257": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "8422808932256100230": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "6621371075123542816": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "7862815466573236157": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "14136097914489095982": ["convolution_gpu_bfyx_os_iyx_osv16",516], + "3067806959725855130": ["convolution_gpu_bfyx_os_iyx_osv16",512], + "15360511165237335684": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "17399542571019639128": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "10117376369841171716": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9642229389394495047": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "846177346130290194": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "3621424752591567930": ["convolution_gpu_bfyx_gemm_like",2], + "15336590103518398224": ["convolution_gpu_bfyx_gemm_like",2], + "17243648226968859637": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2930848604606590505": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "3621070130367713395": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "15411603884973340468": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "15016406041863758148": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "13804435767468730732": ["convolution_gpu_bfyx_gemm_like",2], + "2128376438627103433": ["convolution_gpu_bfyx_gemm_like",2], + "10463896120685306944": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "13786357802945430475": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "15808629700189777056": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "17713034180977313726": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "16894871557229780934": ["convolution_gpu_bfyx_os_iyx_osv16",547], + "9404677451270692749": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "7942294816235384071": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "1865187811299838654": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "17049054004246292085": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2133849627845285277": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "4147006350295905486": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "9655550151067451233": ["convolution_gpu_bfyx_gemm_like",2], + "9833242806281729759": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "11239754372812258455": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "9421927854269492263": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "4013707396889204359": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "9019388470685749691": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "12545558125736154584": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "9038991914155436715": ["convolution_gpu_bfyx_gemm_like",1], + "10730856574108806045": ["convolution_gpu_bfyx_os_iyx_osv16",854], + "5461980510262646821": ["convolution_gpu_bfyx_gemm_like",2], + "4679163800360809315": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "12714194906146827658": 
["convolution_gpu_bfyx_gemm_like",1], + "3859314295530377028": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "7263339400190408379": ["convolution_gpu_bfyx_gemm_like",2], + "15532419087060587119": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "5536424274663702901": ["convolution_gpu_bfyx_gemm_like",2], + "4086556132337751931": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "2746052215199129520": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "17716065235878633691": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "8594644182487917002": ["convolution_gpu_winograd_6x3_s1_fused",2], + "5685381761573686628": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8258382025812748961": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "14707855908416908375": ["convolution_gpu_bfyx_os_iyx_osv16",717], + "14650567822254940018": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "15118142492742177336": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "15294692035670155801": ["convolution_gpu_bfyx_os_iyx_osv16",1068], + "498239903908845198": ["convolution_gpu_bfyx_gemm_like",2], + "4085450203909854919": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "15479549936562568596": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "7363788553442810299": ["convolution_gpu_bfyx_gemm_like",2], + "5353552956675518468": ["convolution_gpu_bfyx_os_iyx_osv16",458], + "15775917744517770768": ["convolution_gpu_bfyx_gemm_like",2], + "9899242398980336120": ["convolution_gpu_bfyx_gemm_like",1], + "12791525533856308302": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "9256308629247511374": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "11433534680781300610": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "142486914279119363": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "10965563190266380694": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "17252689774572814142": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "6158514925486943212": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "14786904599410885158": ["convolution_gpu_bfyx_os_iyx_osv16",465], + "10728212277329722684": ["convolution_gpu_bfyx_gemm_like",2], + "11151426820269138585": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "1076005730007872492": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "2052712465925238009": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "1103204698908514224": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "5893940382830835820": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "10000618285883395700": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "1276881030620698911": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "17523210737277743952": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "4883106423598271822": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "8800251965243080024": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "5601435819039968726": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3382494956350224120": ["convolution_gpu_bfyx_gemm_like",1], + "6830387121684699972": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "15322989486222859378": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "15881381297320383917": ["convolution_gpu_winograd_6x3_s1_fused",1], + "801864263975761712": ["convolution_gpu_bfyx_os_iyx_osv16",291], + "9457038545823436137": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9798585825695496550": ["convolution_gpu_bfyx_gemm_like",2], + "12654574135415748217": ["convolution_gpu_bfyx_os_iyx_osv16",318], + "8131617570786904723": ["convolution_gpu_bfyx_gemm_like",2], + "1663732107639157701": ["convolution_gpu_bfyx_gemm_like",2], + "6695336381467406810": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + 
"11984095218733350838": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14953809073272885651": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "4911398420005278258": ["convolution_gpu_bfyx_gemm_like",1], + "4940950742383121943": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "17614929666625976544": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "3737552767159920174": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "10683839359385393536": ["convolution_gpu_bfyx_gemm_like",1], + "9207334433308148635": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13954144830230671601": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "18153597620760635012": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "13373912451448693522": ["convolution_gpu_bfyx_gemm_like",1], + "7369471926167902143": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "10076578838853982233": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "2935787827649981367": ["convolution_gpu_bfyx_gemm_like",1], + "9198752981132674942": ["convolution_gpu_bfyx_gemm_like",1], + "17693518538833606792": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "572155668587252712": ["convolution_gpu_bfyx_os_iyx_osv16",1054], + "530825424084837479": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "1655427025346068673": ["convolution_gpu_bfyx_gemm_like",1], + "10084794570892043447": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "4495774394017823312": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "13359643347682243944": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "11568162864377479487": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "8155752116518841384": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "181006047500375768": ["convolution_gpu_bfyx_os_iyx_osv16",280], + "9173631510896381179": ["convolution_gpu_bfyx_gemm_like",2], + "7982784766505903515": ["convolution_gpu_bfyx_os_iyx_osv16",718], + "5141753233513623264": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "5079055505117153635": ["convolution_gpu_bfyx_os_iyx_osv16",668], + "4185477435943946730": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "8354812222032899427": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "16131386739027190836": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "6277198010392189880": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "11845013061234102293": ["convolution_gpu_bfyx_gemm_like",2], + "11287863182337672053": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "12090536142661253835": ["fully_connected_gpu_bf_io_gemm",1], + "13472532612464340803": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4716188972902735458": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "3704618172730076978": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "7768680313873061531": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "12788968383428254917": ["convolution_gpu_bfyx_direct_10_12_16",0], + "13327653786981478088": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "15265621959560796543": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "14230385851791760020": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "9130971535185609293": ["convolution_gpu_bfyx_gemm_like",2], + "14930745998253392722": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2124033349728954551": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "8963262014498730146": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "3304589333915676807": ["convolution_gpu_bfyx_gemm_like",1], + "4947961640303581107": ["convolution_gpu_bfyx_gemm_like",2], + "1791615587935799399": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "16336482874764861478": ["convolution_gpu_bfyx_gemm_like",2], + "11088128828863596806": 
["convolution_gpu_bfyx_gemm_like",2], + "1450888744802985214": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "4983880246908724272": ["convolution_gpu_bfyx_os_iyx_osv16",1023], + "11649407835105973949": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "4294879469633231552": ["convolution_gpu_bfyx_gemm_like",2], + "9101018613418825655": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11163107409437069532": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "9608917563823863132": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "12889351859522118935": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6777045876155144709": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "1233021176530240722": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "4999210721703970274": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "14086074948200412805": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "1207026216972160297": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "8515479970005301094": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "7071991799972799089": ["convolution_gpu_bfyx_gemm_like",2], + "54019631544204590": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12590495767805868405": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "15316782593191029443": ["convolution_gpu_bfyx_gemm_like",2], + "17556238490521153146": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "244921290040927639": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "17466025028296506313": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "17259951372033727587": ["convolution_gpu_bfyx_gemm_like",2], + "15385506288692289568": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17087143277789116317": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1095495157025479260": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "14811022197918391667": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "1423297940282476513": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "16996022503617157059": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "10509933181132310969": ["convolution_gpu_bfyx_gemm_like",1], + "6225447513745282621": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "11195875185591819437": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "13404888565084206853": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "7863886351122918972": ["convolution_gpu_bfyx_os_iyx_osv16",194], + "17006655627343469372": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "8485845304380573432": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "10628725059172743408": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "17302671258991071440": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "2479856511929768548": ["convolution_gpu_bfyx_gemm_like",1], + "702096475436365058": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4327450388326573746": ["convolution_gpu_bfyx_gemm_like",1], + "939718260623752240": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "11806402239500046867": ["convolution_gpu_bfyx_gemm_like",2], + "11529876081402974396": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "7848121247546147821": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "1003101267609305257": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "9810904714798127155": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "10462144647439624978": ["convolution_gpu_bfyx_gemm_like",2], + "16170708786673864371": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "5229688072405810569": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "8269543491844451750": ["convolution_gpu_bfyx_os_iyx_osv16",183], + "11612998433409522582": ["convolution_gpu_bfyx_gemm_like",2], + "17303408650780384587": ["convolution_gpu_bfyx_os_iyx_osv16",549], + 
"11704369548723383645": ["convolution_gpu_bfyx_gemm_like",2], + "16122033101591094139": ["fully_connected_gpu_fb_oi_ref",1], + "2094213523530180653": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "5011769546010018777": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "18117954008112578376": ["convolution_gpu_bfyx_gemm_like",2], + "14554225625951128811": ["convolution_gpu_bfyx_os_iyx_osv16",417], + "1540459344569916165": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "5055133356846736609": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "10608496431404827757": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "2986189945936592561": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "8797661560676476245": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "582360460084115077": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "8529170838214082841": ["convolution_gpu_bfyx_gemm_like",2], + "8378690770140438511": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "3860603464276263676": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "13616241450266119966": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "16802487456370986847": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "14826791706471872785": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15315327794058441258": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "4424217045094988504": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "16063854283763838910": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15924144379094505874": ["fully_connected_gpu_fb_io_ref",1], + "868488930567226694": ["convolution_gpu_bfyx_gemm_like",2], + "10348660503952680688": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "10208132281050693649": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "14394427817253242611": ["convolution_gpu_bfyx_gemm_like",2], + "6343888265369366589": ["convolution_gpu_bfyx_os_iyx_osv16",572], + "17101789600628162503": ["convolution_gpu_bfyx_direct_10_12_16",0], + "17406383217119217230": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "14070988879848388270": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "6495132856471482043": ["convolution_gpu_bfyx_os_iyx_osv16",865], + "3106922888635965020": ["convolution_gpu_bfyx_gemm_like",2], + "14094981198645015124": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8203550467004532364": ["convolution_gpu_bfyx_os_iyx_osv16",1040], + "11782525502250249483": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5230871884758163940": ["convolution_gpu_bfyx_os_iyx_osv16",739], + "6898793319624390153": ["convolution_gpu_bfyx_gemm_like",2], + "13600579723542095577": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "9207413252274439059": ["convolution_gpu_bfyx_os_iyx_osv16",687], + "8300655194765375060": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "15151957983054148973": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "14885109535362957947": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "14366252780310630703": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "10428477376571919905": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "18250076003231973692": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6778781361481531516": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "4196367396954155354": ["convolution_gpu_bfyx_gemm_like",2], + "4406157095142118884": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "15381551674482810230": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "18308661808437079996": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4465781406991476376": ["convolution_gpu_bfyx_os_iyx_osv16",280], + "5277400567128489977": ["convolution_gpu_bfyx_os_iyx_osv16",206], + 
"8732106543033226791": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "10568883265991969648": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1795659014508380077": ["convolution_gpu_bfyx_gemm_like",1], + "14141983383097250411": ["convolution_gpu_bfyx_gemm_like",1], + "6651097363666320726": ["convolution_gpu_bfyx_os_iyx_osv16",1025], + "10902108166827340970": ["convolution_gpu_bfyx_gemm_like",2], + "17599396373608265826": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "9357359875134299131": ["convolution_gpu_bfyx_gemm_like",2], + "14579050468883613611": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "1876286132660871464": ["convolution_gpu_bfyx_gemm_like",2], + "2740287492529009109": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "15285236716284874711": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "1062508357634542606": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "18373068999874730591": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "10237524128771958432": ["convolution_gpu_bfyx_gemm_like",2], + "9831195630506601660": ["convolution_gpu_bfyx_gemm_like",2], + "11825293922127550847": ["convolution_gpu_bfyx_gemm_like",2], + "7606728651572102823": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "7019316994558628633": ["convolution_gpu_bfyx_gemm_like",2], + "13729951531199985382": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "9643671820560131959": ["convolution_gpu_bfyx_os_iyx_osv16",137], + "15841489476316341204": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "15024130918582332928": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14301661367597749567": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "6707221689266688389": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "1303304215797905198": ["convolution_gpu_bfyx_gemm_like",2], + "10917498758625273194": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "7658318862249823838": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "4347494599650425733": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "13139625572508441980": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "8939520209266902800": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17886436103211436626": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "12757564215386697460": ["convolution_gpu_bfyx_os_iyx_osv16",84], + "14959281374959998609": ["convolution_gpu_bfyx_gemm_like",2], + "18204971481718743856": ["convolution_gpu_bfyx_gemm_like",2], + "7174804306958128658": ["convolution_gpu_bfyx_gemm_like",2], + "4550028191070279999": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "6821855018718422278": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "1605295763358374504": ["convolution_gpu_bfyx_gemm_like",2], + "12493863403516600413": ["convolution_gpu_bfyx_gemm_like",1], + "8749399240948437294": ["convolution_gpu_bfyx_gemm_like",2], + "7937870623766562191": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "472454322186482185": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "16494581774051338901": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "4054010905884346287": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "10967218651864700933": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "17713011656078651": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15683344003370367509": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "17604747523124060652": ["convolution_gpu_bfyx_gemm_like",2], + "7688176479120305539": ["convolution_gpu_bfyx_os_iyx_osv16",918], + "12319165874575782715": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "3935883681780676157": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "17828453493113919756": ["convolution_gpu_bfyx_os_iyx_osv16",632], + 
"9639014900668946045": ["convolution_gpu_bfyx_gemm_like",2], + "15280273795883244074": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7761195307416102494": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "5095827462645341808": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "17496371501557652357": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "12085208566397959149": ["convolution_gpu_bfyx_gemm_like",2], + "5996261744926399743": ["convolution_gpu_bfyx_gemm_like",2], + "6954257882806659594": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "16937207522545573792": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13708979487306970634": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "10292243973236220688": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "2566302789609970663": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "6324565723045697080": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "17421991623849671076": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "10600884986702650404": ["convolution_gpu_bfyx_gemm_like",2], + "12228963567837353733": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "4797026040899499511": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "1127598752149871162": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "5939121107940759940": ["convolution_gpu_bfyx_os_iyx_osv16",378], + "17123153447808465303": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "585914943085061885": ["convolution_gpu_bfyx_gemm_like",1], + "11185156002426041243": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "11579025491409526679": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "14512407261081843554": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "3963106895592011725": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "2346992541638145615": ["convolution_gpu_bfyx_gemm_like",2], + "12655099960717366198": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "10682918518101379579": ["fully_connected_gpu_bf_io_input_spatial",2], + "17225578855755054959": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "4859271780094116779": ["convolution_gpu_bfyx_gemm_like",2], + "13027039165868458729": ["convolution_gpu_bfyx_gemm_like",2], + "1643241486250690844": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "5749536453225343663": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "1760830986937165861": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "15551453802011405101": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "2467535554409643460": ["convolution_gpu_bfyx_gemm_like",1], + "15124985846197662243": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "10058614204420018541": ["convolution_gpu_bfyx_os_iyx_osv16",4], + "3615203440895591147": ["convolution_gpu_bfyx_gemm_like",1], + "8230144305844912369": ["convolution_gpu_bfyx_os_iyx_osv16",554], + "10791067159964399241": ["convolution_gpu_bfyx_os_iyx_osv16",310], + "7826714904736870517": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17342868362584820356": ["convolution_gpu_bfyx_gemm_like",2], + "3830703844770425343": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "13462726136352103466": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10433456687054381828": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "13816104794723484993": ["convolution_gpu_winograd_6x3_s1_fused",2], + "2577413012740709678": ["convolution_gpu_bfyx_gemm_like",2], + "17839839336294937155": ["convolution_gpu_bfyx_gemm_like",2], + "7351401242363888463": ["convolution_gpu_bfyx_gemm_like",2], + "804195263636995800": ["convolution_gpu_bfyx_gemm_like",2], + "15434706304418357961": ["convolution_gpu_bfyx_gemm_like",2], + "12636120902231094700": 
["convolution_gpu_bfyx_os_iyx_osv16",1083], + "3792945601873900927": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "6717243674054760598": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "16684378382033936005": ["convolution_gpu_bfyx_gemm_like",2], + "16781127329510211966": ["convolution_gpu_bfyx_gemm_like",2], + "3480732841490521799": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "8431845338648284548": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "1410630713443793537": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "1760779615705074283": ["convolution_gpu_bfyx_os_iyx_osv16",190], + "13020929028222837402": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "16228026045292341333": ["convolution_gpu_bfyx_gemm_like",2], + "4660288622381620227": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "18445243511250094011": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "4428101657497677982": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "4860779741225078946": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "12965552570525926289": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16467987800266816984": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "14683086376707577764": ["convolution_gpu_bfyx_gemm_like",1], + "9057036344533510776": ["convolution_gpu_bfyx_gemm_like",2], + "1146282291269334070": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "2425177545256374371": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16103943009195163681": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "17809920600993699808": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "13002363400738122017": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "5093049998173715787": ["convolution_gpu_bfyx_gemm_like",2], + "6995472847770703647": ["convolution_gpu_bfyx_gemm_like",2], + "9366201112659847392": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "16489624657475712467": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "1819720745131968914": ["convolution_gpu_bfyx_gemm_like",2], + "12667014405537239093": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "10270203686708782941": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "3150231129728961455": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "11198908896401597838": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "14365699621119565405": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "11430797372848621790": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "841243068178925457": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "3855151839445505918": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "1179906398014559042": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "6578239603654034233": ["convolution_gpu_bfyx_os_iyx_osv16",874], + "11322451605795727486": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "7410628771323937530": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "7490524380333929773": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "12319073009094248232": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "11936419502418995274": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "5695368162557483073": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "12136803297132972709": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "6526586547926160627": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "11910060331768652144": ["convolution_gpu_bfyx_gemm_like",2], + "6603489144277795818": ["convolution_gpu_bfyx_os_iyx_osv16",893], + "2095802691829304676": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6553736978928374036": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "2235210915304938149": ["convolution_gpu_bfyx_gemm_like",2], + "18137106379929135901": 
["convolution_gpu_bfyx_os_iyx_osv16",728], + "7356440848422235031": ["convolution_gpu_bfyx_gemm_like",1], + "17846557385112426504": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "12713087335581316946": ["convolution_gpu_bfyx_os_iyx_osv16",1065], + "3831257753143317802": ["convolution_gpu_bfyx_gemm_like",2], + "17372520271370779917": ["convolution_gpu_bfyx_os_iyx_osv16",80], + "8860685325047463026": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "13731964100893109797": ["convolution_gpu_bfyx_gemm_like",1], + "2916077416184925232": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "13926122593957480821": ["convolution_gpu_winograd_6x3_s1_fused",2], + "7157499157310356912": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8509748651922589684": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "10756831914332769026": ["convolution_gpu_bfyx_os_iyx_osv16",739], + "18400379759523099542": ["convolution_gpu_bfyx_gemm_like",1], + "5369464352361405510": ["convolution_gpu_bfyx_gemm_like",2], + "9522661528867955338": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "1691554843141984381": ["convolution_gpu_bfyx_os_iyx_osv16",41], + "13797057152042581440": ["convolution_gpu_bfyx_gemm_like",1], + "14352303529756685990": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "10205696100164492716": ["convolution_gpu_bfyx_gemm_like",2], + "12247991248100147706": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "2055914145961691571": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8104331313502492541": ["convolution_gpu_bfyx_gemm_like",1], + "12516911293946682547": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "15282806587681892519": ["convolution_gpu_bfyx_gemm_like",1], + "18210370419559876426": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "15552287544878243347": ["convolution_gpu_bfyx_gemm_like",1], + "14156845527754813253": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "6740385846687754849": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "12823080103951853168": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "17851024468934906318": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "5078905972285278557": ["convolution_gpu_bfyx_gemm_like",2], + "10190532901392055501": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "4113935675071480884": ["convolution_gpu_bfyx_gemm_like",2], + "14757855448502485216": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "15857087373591747006": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5352896995050401444": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "8701639906504450534": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "17526891234501366023": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "14269161473352876138": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "989564341557094953": ["convolution_gpu_bfyx_os_iyx_osv16",807], + "9519623751582710696": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "17631458041591681785": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "14213516751025324346": ["convolution_gpu_bfyx_gemm_like",2], + "2632535010129224704": ["convolution_gpu_bfyx_os_iyx_osv16",508], + "5754844816339228920": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "9324602658580246084": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "10660722770448981436": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "9743549865786050651": ["convolution_gpu_bfyx_gemm_like",2], + "4356806313729405658": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "5906712613621491207": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "7311120574972466702": ["convolution_gpu_bfyx_os_iyx_osv16",41], + "14100870590396726248": ["convolution_gpu_bfyx_os_iyx_osv16",792], + 
"8071957466247137919": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11622925573287101001": ["convolution_gpu_bfyx_direct_10_12_16",0], + "9522947878591994913": ["convolution_gpu_bfyx_gemm_like",2], + "12949204491386872217": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "13468713306678453952": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "9527075413813342687": ["convolution_gpu_bfyx_gemm_like",2], + "11369389082421346630": ["convolution_gpu_bfyx_os_iyx_osv16",381], + "7786866732196451977": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "17833517350994024381": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "4479979951990338510": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "2039909180006215069": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "14174805457643822445": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "533820672115442982": ["convolution_gpu_bfyx_gemm_like",2], + "459936950868112292": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "6747799061507191246": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "9468542963649996822": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "6108475838757986889": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17769703068450272262": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "17128723415461475388": ["convolution_gpu_bfyx_gemm_like",2], + "1713947356482032411": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "5887877259873928726": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "4607428643002808173": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "16149924641081427062": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "2388209402010617408": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "18043340998699622388": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "13699740641705514374": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "15511138074959300404": ["convolution_gpu_bfyx_gemm_like",2], + "10483664832302187567": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "4747159205186229582": ["convolution_gpu_bfyx_os_iyx_osv16",479], + "13348329768178411596": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "9594594523961285945": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "1299452063079314341": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9497934813418221769": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "1395293354112586043": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "11706378390483804857": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7730305811644972643": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "17514082938765137629": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "18259001228411909210": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "6587817876244206939": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "7089077910858800239": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "7289940394271052757": ["convolution_gpu_bfyx_gemm_like",1], + "13702692566238948173": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "9391425117463100557": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "17775705003104146872": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "15695275881213623746": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "7843498978148810586": ["convolution_gpu_bfyx_os_iyx_osv16",235], + "4897991181236908768": ["convolution_gpu_bfyx_gemm_like",1], + "12582321591799165205": ["convolution_gpu_bfyx_os_iyx_osv16",421], + "1629816265162728770": ["convolution_gpu_bfyx_gemm_like",1], + "14740550583313186369": ["convolution_gpu_bfyx_gemm_like",1], + "17517495652165026573": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "4584970211859494304": ["convolution_gpu_bfyx_direct_10_12_16",0], + 
"14808831640065476291": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "4369346833875105372": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12836639380579091509": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "1650519167046658780": ["convolution_gpu_bfyx_os_iyx_osv16",430], + "1114661658519542600": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "18132981365225439999": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "13855438905855887272": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "2467766894778630615": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "12680339228267704518": ["convolution_gpu_bfyx_os_iyx_osv16",876], + "3107611675766875160": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "4202116155711873525": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17370051888730874220": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "8509882139595784161": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "3995098494991567714": ["convolution_gpu_bfyx_gemm_like",2], + "3032921857841371728": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "363330365598760149": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "10395191003166536655": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "11696231285411686761": ["convolution_gpu_bfyx_gemm_like",2], + "14289048840489035546": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "8655525088525612583": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "11640865562390693266": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "5020605371834958647": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "178353385245384751": ["convolution_gpu_bfyx_gemm_like",2], + "6296371382672640627": ["convolution_gpu_bfyx_gemm_like",1], + "13337315872184544686": ["convolution_gpu_bfyx_os_iyx_osv16",640], + "2376239021851907962": ["convolution_gpu_bfyx_gemm_like",1], + "1208534686657112759": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "310584224049735004": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "16327433707667075261": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "3435773540391994106": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "13676670925355487305": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "7545013298074733778": ["convolution_gpu_bfyx_os_iyx_osv16",549], + "17343050785312683560": ["convolution_gpu_bfyx_os_iyx_osv16",186], + "14176233347574275776": ["convolution_gpu_bfyx_gemm_like",1], + "14670068483447729857": ["convolution_gpu_winograd_6x3_s1_fused",1], + "5691889055008878111": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6306539529168638031": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12253987037990618484": ["convolution_gpu_bfyx_gemm_like",1], + "2040762223425679479": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "9165275903833498932": ["convolution_gpu_bfyx_gemm_like",2], + "15156836293519486753": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "7974614031099580856": ["convolution_gpu_bfyx_gemm_like",2], + "11928926429060828408": ["convolution_gpu_bfyx_os_iyx_osv16",132], + "7009873605945341897": ["convolution_gpu_bfyx_gemm_like",2], + "8464582977975377118": ["convolution_gpu_winograd_6x3_s1_fused",2], + "14755869345266103764": ["fully_connected_gpu_fb_oi_ref",1], + "9557728221162137067": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "14417033368952865805": ["convolution_gpu_bfyx_gemm_like",1], + "16026019808764920641": ["convolution_gpu_bfyx_gemm_like",2], + "16897485136352617189": ["convolution_gpu_bfyx_gemm_like",2], + "2688060699200137048": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4834591210311380436": ["convolution_gpu_bfyx_os_iyx_osv16",716], + "13237050834496100264": 
["convolution_gpu_bfyx_os_iyx_osv16",527], + "13500369101462555447": ["convolution_gpu_bfyx_gemm_like",2], + "13558618754911056302": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "5334190564423375247": ["convolution_gpu_bfyx_os_iyx_osv16",926], + "15679696422603106163": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3522455279376021211": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "2246205611561147645": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "5301394322453453489": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "11398019086259011063": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "17429692714456679999": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17427036330773218054": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "5040730152867713388": ["convolution_gpu_bfyx_gemm_like",2], + "9177395776408296291": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "14904665242518014005": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "1565612286723277822": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "5718472464360340274": ["convolution_gpu_bfyx_gemm_like",2], + "10897008852059401902": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "12935563359569230797": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "12676139447729343679": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "7142195383189497127": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "2789137853864057385": ["convolution_gpu_bfyx_gemm_like",2], + "17900257435531434807": ["convolution_gpu_bfyx_gemm_like",2], + "1375156980278317418": ["convolution_gpu_bfyx_gemm_like",2], + "5797243082477551421": ["convolution_gpu_bfyx_os_iyx_osv16",469], + "12988961529988078346": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "3574679673239756551": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "11726298758004767743": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "14006248791647711759": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "8025053805734757314": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "17991368786018745231": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "9513032457323269513": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "6203602270552179462": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "16559140502701231107": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "16172528828198474326": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "13675314612031135613": ["convolution_gpu_bfyx_gemm_like",1], + "8962502004422485576": ["convolution_gpu_bfyx_gemm_like",2], + "16955653765071712611": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "3217555855036660482": ["fully_connected_gpu_fb_io_ref",2], + "8775336277634573074": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "6876300000441081789": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "5762290464889692462": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "2070429718533716882": ["convolution_gpu_bfyx_gemm_like",2], + "13941251104772804303": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "5415319660821122528": ["fully_connected_gpu_bf_io_input_spatial",1], + "14083279273292567319": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "8336494030011542852": ["convolution_gpu_bfyx_gemm_like",1], + "6204183474669103812": ["convolution_gpu_bfyx_os_iyx_osv16",889], + "14010642743400284761": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9270950131920019932": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "878892264408839067": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "498420237272375425": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13765632280570725774": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "9475130054420979752": ["convolution_gpu_bfyx_os_iyx_osv16",557], + 
"14046217730873620907": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "17086887873464601732": ["convolution_gpu_bfyx_gemm_like",1], + "8734483136584351066": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "3018306533413795559": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3355259926747524578": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "755414184406250882": ["convolution_gpu_bfyx_os_iyx_osv16",469], + "17818587793483875865": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13064477237937322246": ["convolution_gpu_bfyx_gemm_like",1], + "18193831330827252971": ["convolution_gpu_bfyx_gemm_like",2], + "12044635257539223503": ["convolution_gpu_bfyx_gemm_like",2], + "4725009116734166168": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "4958222070605478947": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "18232459663207612727": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "11327867170377736609": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "9840495023131952174": ["convolution_gpu_winograd_6x3_s1_fused",1], + "1197184887743937394": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9833540739021310892": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16304963156448605623": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16491532291908469567": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "11213667690594303395": ["fully_connected_gpu_fb_io_ref",1], + "9368244029111057323": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "1168589063110524328": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "6026065914078520895": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "12083217714727863832": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10264913782610095832": ["convolution_gpu_bfyx_os_iyx_osv16",888], + "5246955189449281709": ["convolution_gpu_bfyx_gemm_like",2], + "1724222702460860833": ["convolution_gpu_bfyx_gemm_like",2], + "6973621625148257910": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "18010600104565458874": ["convolution_gpu_bfyx_gemm_like",2], + "11981887712163064333": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "7152107839144357830": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9404953235624894187": ["convolution_gpu_bfyx_os_iyx_osv16",95], + "5955810688179557560": ["convolution_gpu_bfyx_gemm_like",2], + "15720507574336564201": ["convolution_gpu_bfyx_os_iyx_osv16",618], + "14038261392627717712": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "710166379854475667": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "13898821685774165645": ["convolution_gpu_bfyx_os_iyx_osv16",847], + "1579733029852052699": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16833026567865627676": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "5582896843095691256": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "7460672405409009037": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "8680545947510235993": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "7380979920013545867": ["convolution_gpu_bfyx_gemm_like",2], + "13890118723041457532": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "6351347283201596793": ["convolution_gpu_bfyx_os_iyx_osv16",57], + "5592556538784745960": ["convolution_gpu_bfyx_gemm_like",2], + "5488296540132936296": ["convolution_gpu_bfyx_gemm_like",1], + "304721598975479337": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "4299492266819967844": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "6133592828563353516": ["convolution_gpu_bfyx_gemm_like",1], + "8158983334404475382": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "7353255713834431471": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "11280403113463077620": ["convolution_gpu_bfyx_gemm_like",2], 
+ "12794030011655906930": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "17361319565503258506": ["convolution_gpu_bfyx_os_iyx_osv16",356], + "3856394004079548211": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "12163456975896925619": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "5592428580503282095": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "12311901617815857033": ["convolution_gpu_bfyx_gemm_like",1], + "10527256963399838405": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "8334832698020211623": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "17965825642065048619": ["fully_connected_gpu_fb_oi_ref",2], + "8235002440285527553": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "14251848023416168295": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "4846216894450341698": ["convolution_gpu_bfyx_gemm_like",2], + "7878217536124016199": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "13283842370311517843": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "6537576410448334203": ["convolution_gpu_bfyx_os_iyx_osv16",277], + "7289633911925073088": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "13946367911927964830": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12175297963550750804": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "3432296808755992670": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "4085907608404305515": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "13836867092941506302": ["convolution_gpu_bfyx_os_iyx_osv16",315], + "9758759365463492505": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "12305383126483033452": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8913950860101596091": ["convolution_gpu_bfyx_os_iyx_osv16",729], + "9497269191159495932": ["convolution_gpu_bfyx_os_iyx_osv16",123], + "8329846097322076175": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "16011429608661242565": ["convolution_gpu_bfyx_gemm_like",2], + "11979910991788695837": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "8701248964531180496": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "12792454713887439830": ["convolution_gpu_bfyx_os_iyx_osv16",893], + "15241191584896579183": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "6942049339361951275": ["fully_connected_gpu_bf_io_input_spatial",0], + "15534517308430424624": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11878217002671373638": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "8045393243176844621": ["convolution_gpu_bfyx_gemm_like",2], + "4245229655273611845": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "12315068368597230211": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "12169896916690963726": ["convolution_gpu_bfyx_gemm_like",2], + "6674643031068271417": ["convolution_gpu_bfyx_gemm_like",2], + "10838721873837128971": ["convolution_gpu_bfyx_os_iyx_osv16",676], + "5172712078329324967": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6796758191974756201": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2215194389847256545": ["convolution_gpu_bfyx_direct_10_12_16",2], + "496948821475405395": ["convolution_gpu_bfyx_gemm_like",2], + "18286006396667126860": ["convolution_gpu_bfyx_gemm_like",1], + "10713207196920878995": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1890739204389692970": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "17446388159565719362": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "6493920223660825755": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "10011668671963948912": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5172823024549700279": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "5635449856699664273": ["convolution_gpu_bfyx_os_iyx_osv16",978], + 
"12451592945087000191": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "3363675939515208883": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "2257384183256237750": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "14463173937397982331": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18423051691107460439": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "8402396502992483524": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "17888721282811720634": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "2406816735581074778": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "2410828969408182980": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "16928564394848059094": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14742909697076926475": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "10650698451740924172": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "14807299286266923693": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "4408600136502382976": ["convolution_gpu_bfyx_os_iyx_osv16",417], + "7316825051569394089": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "16935619230235600309": ["convolution_gpu_bfyx_gemm_like",2], + "4885944395876887711": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "7439340221097179208": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10614918790075146626": ["convolution_gpu_bfyx_os_iyx_osv16",1071], + "1908809004094565452": ["convolution_gpu_bfyx_os_iyx_osv16",918], + "2527189070714658176": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "3069396488274616770": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "2322559721899919275": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "15929361440504489924": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "10968768803038046390": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "10591159235183381823": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "7558864177789582540": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "4232250144427804891": ["fully_connected_gpu_bf_io_gemm",1], + "16236397968499692493": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "6895664772793074050": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "14206328165498357760": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "13766538247146238357": ["convolution_gpu_bfyx_os_iyx_osv16",691], + "4945845875046545967": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14309292105974991733": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "15214779483545052950": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "792684262493086891": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "5120274680151325194": ["convolution_gpu_bfyx_gemm_like",2], + "14848732804958314374": ["fully_connected_gpu_yxfb_ref",0], + "1034911525083515252": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "13941188114382863776": ["fully_connected_gpu_fb_oi_ref",2], + "18271689282126907793": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "1373904073013943690": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "3746573775462003750": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "13282612510005390816": ["convolution_gpu_bfyx_os_iyx_osv16",1096], + "10073779356457603252": ["convolution_gpu_bfyx_gemm_like",2], + "7404732699742965436": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "10306169610486701545": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "11007100272494557520": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "3752278444736105763": ["convolution_gpu_bfyx_gemm_like",1], + "11404331488962230130": ["convolution_gpu_bfyx_gemm_like",1], + "4991419288164762786": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "15394217414267195999": ["convolution_gpu_bfyx_os_iyx_osv16",2], + 
"13721983823460534294": ["convolution_gpu_bfyx_gemm_like",2], + "937200116534179904": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "5341876404211768451": ["convolution_gpu_bfyx_gemm_like",1], + "9953329530402569669": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "5872553335123308034": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3434842614653335826": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "6232596685071671579": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "7173828525834910425": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "9275303306340702111": ["convolution_gpu_bfyx_gemm_like",2], + "3409255127071376537": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "1149548328523286475": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "5912303851874077576": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "15019050434475217267": ["convolution_gpu_bfyx_gemm_like",2], + "11093147488085506266": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "3604379857905625467": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "3447774474841314860": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "16705941191876956548": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "6491772898618671653": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "9421643783312790618": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3318430113631867573": ["convolution_gpu_bfyx_os_iyx_osv16",1052], + "3416636940668221406": ["convolution_gpu_bfyx_os_iyx_osv16",378], + "6753857156025715321": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "755157892988514864": ["convolution_gpu_bfyx_os_iyx_osv16",136], + "16159852373972174245": ["convolution_gpu_bfyx_gemm_like",2], + "10168317560306247723": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "4370027682980493159": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "13694766887442024878": ["fully_connected_gpu_fb_io_ref",1], + "6556795059657533200": ["convolution_gpu_bfyx_gemm_like",2], + "15387047026300787039": ["convolution_gpu_bfyx_gemm_like",2], + "875552069535001284": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "364197229238830807": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "6293500642319778096": ["convolution_gpu_bfyx_gemm_like",1], + "10784073615329190425": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "2477866283402053371": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "5448665190811365701": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "16689318540732157754": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "598745924736700294": ["convolution_gpu_bfyx_gemm_like",2], + "11814740669468421049": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "8054599744123820194": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "1663285216972929652": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "10159790066948852390": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "265124365266629363": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "2805931700404492624": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "3109104171383198425": ["convolution_gpu_winograd_6x3_s1_fused",2], + "4718705504966715203": ["convolution_gpu_bfyx_gemm_like",2], + "9444953530704856016": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3036512701943687724": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "6656593119788274992": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "1677118421195120152": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "13253775441326432265": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "14462438074931673266": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "713121569924250372": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "16150934538381572916": 
["convolution_gpu_bfyx_gemm_like",2], + "11004350075893421731": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "6849874726361751307": ["convolution_gpu_bfyx_gemm_like",2], + "16312223896859176991": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "433161293684647032": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "8788703258318141635": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "6639715607290389968": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "12962552332511702682": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "10892706534058849825": ["convolution_gpu_bfyx_os_iyx_osv16",284], + "18174857480705846286": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "15737508945513376813": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5743482411668939203": ["convolution_gpu_bfyx_gemm_like",2], + "7148542290597073512": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "7281661441196896385": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2542984219353153495": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "6322831233548420761": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "15308578014507211237": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "15733883474006568340": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15918017311798856029": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5385316497510064491": ["fully_connected_gpu_fb_oi_ref",1], + "11522488904021243956": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "11834683513280095384": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3420064118559852968": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "3797957937905580811": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "2431923918345445420": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "11717348577195224554": ["convolution_gpu_bfyx_gemm_like",2], + "7860086755625626604": ["convolution_gpu_bfyx_gemm_like",2], + "10982693252072682414": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "973402921452083017": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "7218689869635572700": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "9116206094279111365": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "12329909110827539139": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16385712633367611786": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "4063525218682664832": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "12987636957813312667": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10881884300766361791": ["convolution_gpu_bfyx_gemm_like",2], + "3704271978133986620": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8275277322582733101": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "7717602860943327535": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "16766706479910720794": ["convolution_gpu_bfyx_gemm_like",2], + "10629681722649771498": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "1659851931406041285": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17902799955139047426": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15737542477498282367": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "8550133332738529361": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "11528417522960871233": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "6626716013917662606": ["convolution_gpu_bfyx_gemm_like",2], + "5920614348521143999": ["convolution_gpu_bfyx_os_iyx_osv16",129], + "3617433210865054182": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "2772704069752888874": ["convolution_gpu_bfyx_gemm_like",2], + "9968686603153440164": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "14151249542292579535": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "17947613081555491099": 
["fully_connected_gpu_fb_oi_ref",2], + "4244790495090049295": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "3285520504090196295": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "4554343896877444783": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "13599438824699346708": ["convolution_gpu_bfyx_os_iyx_osv16",240], + "937050062571228573": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10250778203413648582": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "10153070641942936648": ["convolution_gpu_bfyx_gemm_like",1], + "16463823433924519300": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "2838789360952219092": ["convolution_gpu_bfyx_gemm_like",2], + "8272823732258536202": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "9884646296875511696": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "4445912157712391517": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "4674416595144505741": ["convolution_gpu_bfyx_gemm_like",2], + "8153567933591966877": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "7315740838189400004": ["convolution_gpu_bfyx_gemm_like",2], + "5060817429317741254": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "14724862072414829490": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "981276017776678882": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "10643373404881648498": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "3355824730785179775": ["convolution_gpu_bfyx_os_iyx_osv16",899], + "1018319414633271980": ["convolution_gpu_bfyx_os_iyx_osv16",1025], + "2764034841399585177": ["fully_connected_gpu_fb_oi_ref",2], + "14947798627499698329": ["convolution_gpu_bfyx_gemm_like",2], + "5495776091407365966": ["convolution_gpu_bfyx_gemm_like",2], + "8474585711383508493": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "16687701987371294908": ["convolution_gpu_bfyx_gemm_like",2], + "15594091060902767607": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "10880081193716628051": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "17309326904418811234": ["convolution_gpu_bfyx_os_iyx_osv16",552], + "1787598049938821496": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "2072252610120557179": ["convolution_gpu_bfyx_gemm_like",2], + "6053594232298534345": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "10995424394152951534": ["convolution_gpu_bfyx_gemm_like",2], + "17947818179123182001": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "15741360654354155504": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "11709992724966310174": ["convolution_gpu_bfyx_os_iyx_osv16",124], + "12878631058803628679": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9531730330306606343": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "1640358227345963848": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "8737417433314100353": ["convolution_gpu_bfyx_gemm_like",2], + "14445520478857662586": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "6040623414692799116": ["convolution_gpu_bfyx_os_iyx_osv16",732], + "10381752670329683275": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "14066219153422011272": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "14738573151275130683": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "3255465741612432300": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "12874626654611400042": ["convolution_gpu_bfyx_os_iyx_osv16",853], + "3599823735065658574": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "12365282242489300092": ["convolution_gpu_bfyx_os_iyx_osv16",379], + "3113016029551460773": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "1089679781525023551": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "4091785563304559606": ["convolution_gpu_bfyx_direct_10_12_16",2], + 
"13104509059416300615": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "9945721344229922405": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16865879032845300007": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "12176879951537921518": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "14173867073407110501": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "277151219694781348": ["convolution_gpu_bfyx_os_iyx_osv16",655], + "14629433964319883917": ["convolution_gpu_bfyx_os_iyx_osv16",84], + "14669219788000023965": ["fully_connected_gpu_fb_oi_ref",0], + "889943986793446284": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "15325302411038679750": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "10177466042250039828": ["convolution_gpu_bfyx_gemm_like",2], + "16140133852987111783": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "15693851280141842140": ["convolution_gpu_bfyx_gemm_like",2], + "7562624810837784407": ["convolution_gpu_bfyx_gemm_like",2], + "14535007186125575064": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "14864150409380754546": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5831305777612569716": ["convolution_gpu_bfyx_gemm_like",2], + "6660221471357497741": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "10168217053882274702": ["convolution_gpu_bfyx_gemm_like",2], + "13874754478479442212": ["convolution_gpu_bfyx_gemm_like",2], + "11951606039079763598": ["convolution_gpu_bfyx_gemm_like",2], + "5326891298755303584": ["convolution_gpu_bfyx_gemm_like",2], + "5550000568272972532": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2387628682187438903": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "72444706264681262": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "14257548530334193336": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13711710595263882397": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14436334357815544497": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "11231597775940542830": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "14746359019867963124": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4536811685836767511": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8161047856682416508": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "15257886319670476581": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "1028160614515220430": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "12879205642236526041": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "7215460815798365056": ["convolution_gpu_bfyx_gemm_like",2], + "3800011935243649447": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "2881769839926594784": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "11529521968552409482": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6641684310751726510": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17122338330334998991": ["convolution_gpu_bfyx_gemm_like",1], + "5185895996350118172": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "714397516895317906": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "13146231972557134419": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "7005371843527735283": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "490233152678323691": ["convolution_gpu_bfyx_os_iyx_osv16",182], + "4890442595203749341": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "9216695884134021401": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "17382660912493284320": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "15847413004526420496": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "14652719560551657529": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "4690935789908896751": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14352796912241296357": 
["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",0], + "407189201971322683": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "17610648476343170476": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "6210074450403696110": ["convolution_gpu_bfyx_gemm_like",2], + "5197105253412476591": ["convolution_gpu_bfyx_gemm_like",2], + "13893789954946953427": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "12309132521191764927": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "2287331417346465035": ["convolution_gpu_bfyx_gemm_like",2], + "9235762655002034553": ["convolution_gpu_bfyx_gemm_like",2], + "14996839491874598555": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16507285966998102421": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "557778263661655803": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "7344363094493575878": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17947097500350250352": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "8855801044538137828": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "18214405165366931407": ["convolution_gpu_bfyx_gemm_like",2], + "11095908837221722097": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "14902389080201926109": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "12526627889432649075": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "10340099951904598712": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "8107447526839063293": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "6489448536745533209": ["convolution_gpu_bfyx_os_iyx_osv16",713], + "12063854963434677046": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "10931533380146553429": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17021953651379372973": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16907043223873231356": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "4894227264080887361": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "17633445715900116866": ["convolution_gpu_bfyx_gemm_like",2], + "13980058444317683376": ["convolution_gpu_bfyx_os_iyx_osv16",679], + "8039045580314824307": ["convolution_gpu_bfyx_gemm_like",1], + "13286723666743148654": ["convolution_gpu_bfyx_os_iyx_osv16",880], + "3633858263279042265": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "13277308739029064167": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "14203217958874365062": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "15278336216464964580": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "14621327324047759584": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "5724069285122500749": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "12460004417430913427": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1755021778097194246": ["convolution_gpu_bfyx_gemm_like",1], + "1062464852330435815": ["convolution_gpu_bfyx_gemm_like",2], + "2267942216745157485": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "4766447533088048613": ["convolution_gpu_bfyx_gemm_like",2], + "17738299860390552088": ["convolution_gpu_bfyx_direct_10_12_16",0], + "12581879452540858313": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "13597240991532942069": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "2705394837952559308": ["convolution_gpu_bfyx_gemm_like",2], + "8501145642605270365": ["convolution_gpu_bfyx_gemm_like",2], + "12174571114411168588": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5734909305243135224": ["convolution_gpu_bfyx_gemm_like",0], + "3134489458855347772": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "10155417869639270818": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "16815373779430857324": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "5439738552514649732": ["convolution_gpu_bfyx_gemm_like",2], 
+ "8708643228914766202": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "17928043901784474130": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "18216392915308276053": ["convolution_gpu_bfyx_gemm_like",2], + "5835634465164771899": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "8101977280003030465": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "12418390364502912036": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "5821887901198535792": ["convolution_gpu_bfyx_gemm_like",2], + "7605139219344415117": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "6370629727707634189": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "16294825599850364701": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "10869005786136023160": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "10613156984920928792": ["convolution_gpu_bfyx_gemm_like",1], + "15901675909820977223": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "14585144905582599299": ["convolution_gpu_bfyx_os_iyx_osv16",894], + "17108987360340581555": ["fully_connected_gpu_bf_io_input_spatial",2], + "7071864660784255328": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "15310138877321331399": ["convolution_gpu_bfyx_gemm_like",2], + "8526484907799590618": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "4788094685976850847": ["convolution_gpu_bfyx_gemm_like",1], + "5699637716202391188": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "451787079167744428": ["convolution_gpu_bfyx_os_iyx_osv16",41], + "5865480930796299143": ["convolution_gpu_bfyx_os_iyx_osv16",176], + "6696330836969622824": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "5326247361632903583": ["convolution_gpu_bfyx_gemm_like",2], + "1617907811128880383": ["convolution_gpu_bfyx_gemm_like",2], + "11173744709088359283": ["fully_connected_gpu_fb_oi_ref",2], + "15173187675372221634": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "17868294056467093895": ["convolution_gpu_bfyx_gemm_like",2], + "13932662890258900896": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "10050254009828302053": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "17390307025967314108": ["convolution_gpu_bfyx_os_iyx_osv16",718], + "7457951266863598199": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "14595102366207856448": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "8906185843274300447": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "13654816209891478730": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "16043683538361975370": ["convolution_gpu_bfyx_gemm_like",2], + "17094948685292534952": ["convolution_gpu_bfyx_os_iyx_osv16",174], + "9391102514951576629": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11275109735493317886": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "158222105675022402": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "2155348872565175553": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "6381439938385141423": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2265784112305305260": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "7666505529539001492": ["convolution_gpu_bfyx_gemm_like",2], + "17300963371220857043": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9150686862263626364": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "6066347819693426556": ["convolution_gpu_bfyx_direct_10_12_16",2], + "581553908799266285": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "13105192484434299621": ["convolution_gpu_bfyx_gemm_like",2], + "2543041530639980505": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "7084794834886364709": ["convolution_gpu_bfyx_gemm_like",2], + "8977099691399563065": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "15747538142554815480": 
["convolution_gpu_bfyx_os_iyx_osv16",712], + "14156264942337528284": ["convolution_gpu_bfyx_gemm_like",2], + "893885204484374577": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "1436052878894538927": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "11873734271080160669": ["convolution_gpu_bfyx_os_iyx_osv16",92], + "7671440804202996063": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "11882388384272635526": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9080269503597463911": ["convolution_gpu_bfyx_gemm_like",2], + "11985789598994479652": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "861944552852043171": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "7570346182940928159": ["convolution_gpu_bfyx_gemm_like",2], + "1616603916015535857": ["fully_connected_gpu_bf_io_input_spatial",0], + "15076307524263378967": ["convolution_gpu_bfyx_gemm_like",2], + "8671491767142900139": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "11646035413147246650": ["convolution_gpu_bfyx_gemm_like",1], + "8436644625511258721": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "17499047811775012205": ["convolution_gpu_bfyx_gemm_like",1], + "15948383678216076358": ["convolution_gpu_bfyx_os_iyx_osv16",617], + "40684756725622867": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15404352708246779967": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17703907155485973486": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "18269382610859905921": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6614833247756539341": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "5482851829165191681": ["convolution_gpu_bfyx_os_iyx_osv16",645], + "683530182479794259": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "10506079835013332412": ["convolution_gpu_bfyx_gemm_like",2], + "10433541468308381909": ["convolution_gpu_bfyx_gemm_like",1], + "3652749152621176846": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10747768416582634270": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "14433939319502072879": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12854110364457722483": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "13163146272900339330": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "10002942280571012447": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "14611470203914805229": ["convolution_gpu_bfyx_os_iyx_osv16",888], + "3317498303952226642": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "7957927312958744432": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "14784115394395151055": ["convolution_gpu_bfyx_gemm_like",2], + "7370273921473161914": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "6990161783770805523": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12361909180687647792": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "9219978118417391687": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15184480575877095737": ["convolution_gpu_bfyx_gemm_like",1], + "18400137500031567479": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7852144838267007144": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "10712251675747436685": ["convolution_gpu_bfyx_os_iyx_osv16",190], + "1404523328737649536": ["convolution_gpu_bfyx_gemm_like",1], + "10340626080611300806": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "10632020369698615114": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "11913865086932469909": ["convolution_gpu_bfyx_gemm_like",2], + "15011504472108164173": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6955820760012983739": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "5901470393936541758": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "11561352430430157770": 
["convolution_gpu_bfyx_os_iyx_osv16",519], + "12134712464763856064": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3682813162987778705": ["convolution_gpu_bfyx_os_iyx_osv16",1044], + "12659539044474018256": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "17479614483340719566": ["convolution_gpu_bfyx_gemm_like",2], + "15630712601053635938": ["convolution_gpu_bfyx_os_iyx_osv16",1090], + "15314178289202641916": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15385836287435319028": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "13931470674812510958": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "15982499072593548907": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "3805991105758534542": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4810979456269693700": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "14387663434151374245": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "8093154215631195896": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "879461985074219072": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "16468779692009938330": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "16507216630035678597": ["convolution_gpu_bfyx_gemm_like",1], + "8525631489886320841": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "9631545863582097486": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "16495435651959280198": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "9192665896782282996": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "14017106221778585861": ["convolution_gpu_bfyx_os_iyx_osv16",686], + "3140230065585683313": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "16620032793356620588": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "6087091876057515304": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15668060723417155782": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "15905812449037427213": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15372944709956866587": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "393130776826919699": ["convolution_gpu_bfyx_gemm_like",2], + "10710426249911063154": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "6213386558868267629": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "17790622334577372736": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "12138341287265949399": ["convolution_gpu_bfyx_gemm_like",1], + "9110265526128628472": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "14322754320861242412": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "11388177266504804841": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "14243609293683870669": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "5385637020152792781": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "10848277915422577656": ["convolution_gpu_bfyx_os_iyx_osv16",421], + "17651949893303962955": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "557926911473978758": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "9133224739401155411": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "6946815194102787268": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "3095800485689583188": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "779633618375662086": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15277856047844308598": ["convolution_gpu_bfyx_gemm_like",2], + "9373353053843326128": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "15619086801947147359": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "8965747921518186477": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "3094541981461578435": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "15444345793124210505": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "2822531372171708171": ["convolution_gpu_bfyx_gemm_like",1], + "15984885011101717258": 
["convolution_gpu_bfyx_os_iyx_osv16",735], + "15767973630744679517": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3787897045202294227": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "9285566577169147378": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "8954139494467782298": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4184940877670248246": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "16683089431066989909": ["convolution_gpu_bfyx_gemm_like",2], + "3013359852055354405": ["convolution_gpu_bfyx_os_iyx_osv16",1049], + "15927212142469570269": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "10744779302034526105": ["convolution_gpu_bfyx_gemm_like",1], + "10422138282116598013": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6046380638013542109": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18169371857833455144": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15140592697506341614": ["convolution_gpu_bfyx_gemm_like",2], + "15033864286535250007": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "6925829066248055368": ["convolution_gpu_bfyx_gemm_like",2], + "9849272539053219052": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "3892679716763161057": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "3167115892101501516": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "11379252854859166206": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "17829983167337875463": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "10409424254454997557": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8435953773852854494": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "10772763339005937717": ["convolution_gpu_bfyx_gemm_like",2], + "3926585856863002495": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "18269685060032395235": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "872401732136570312": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "13771196685227797262": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "7431849514656037251": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "6754359635395225555": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "10774528268153772208": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "890679620691833367": ["convolution_gpu_bfyx_gemm_like",2], + "871656942964602772": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12976499206227689731": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "7458923250983373160": ["convolution_gpu_bfyx_os_iyx_osv16",995], + "18305785425659656349": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "10869059995205753062": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "7962991673727743706": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "9626028243479089234": ["convolution_gpu_bfyx_gemm_like",2], + "16947969669087411530": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "6391847213494189692": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13816380312874384117": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "7963120178142346699": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "5061053593616346116": ["convolution_gpu_bfyx_gemm_like",2], + "801943727169437597": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "654122557966242717": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "14503814672536990561": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "12693511427898130707": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "1891216794223363114": ["convolution_gpu_bfyx_gemm_like",1], + "2857337999074313592": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "3201851883430682391": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "5876880412336151866": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "914589847837601900": 
["convolution_gpu_bfyx_os_iyx_osv16",581], + "1305434952341925041": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "11213283109763090897": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "3290503865540626256": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "12293786134765875615": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "6375149408738336520": ["convolution_gpu_bfyx_gemm_like",2], + "8094836777153039013": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "774981050284188673": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "15529767675448574617": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "15464327246951632247": ["convolution_gpu_bfyx_gemm_like",1], + "3179874645565098825": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "5776920093461427179": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "8790992468693685188": ["fully_connected_gpu_fb_io_ref",2], + "17608082492919905570": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "5150467145740542480": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10252930102508743294": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9660587580162063066": ["convolution_gpu_bfyx_gemm_like",2], + "11850332373794932468": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "10133406610245448421": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "17195491464960153261": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "1557549837620967530": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15197400201857680173": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "11703557271443535142": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "17376180096577763039": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "11353671464383068485": ["convolution_gpu_bfyx_os_iyx_osv16",1068], + "4197617702037834389": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "14322392426975869640": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "6227066883925046010": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "17423645390621980919": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "4957638663977636791": ["convolution_gpu_bfyx_gemm_like",2], + "318377908569897093": ["convolution_gpu_bfyx_gemm_like",2], + "10578656188786691161": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "11800958516083095340": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "8990561333549136048": ["convolution_gpu_bfyx_os_iyx_osv16",1112], + "4072967257556128157": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "4292467512797995948": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7287802938269404923": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "9180575279116075400": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6404731509766519779": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8195881973746570408": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "11582534256623549131": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10792503079194374004": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "8021962180961047152": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "1316444335300814745": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "18136765667969393174": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "9895036366054127607": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "11002656253983635383": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "8481272193490654884": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "3033264172690274208": ["convolution_gpu_bfyx_os_iyx_osv16",853], + "4016652650196255483": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "16159055229009077435": ["convolution_gpu_bfyx_gemm_like",2], + "4573547058027867538": ["convolution_gpu_bfyx_os_iyx_osv16",1016], + "9763310312421884308": 
["convolution_gpu_bfyx_os_iyx_osv16",742], + "16165264024659208580": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "3539764293444807886": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "10849780273184392468": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "13835908664998757647": ["fully_connected_gpu_fb_oi_ref",2], + "3511588484597779204": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "12558716383635737426": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "7023033151960653752": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "7636001038842031672": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "13093429681061786539": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "2858694223939965231": ["convolution_gpu_bfyx_os_iyx_osv16",694], + "4680261350523889008": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "14951164724050668856": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "15594387862678649962": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "2912858944747613525": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "11273554217552152172": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "7662200927459001757": ["convolution_gpu_winograd_6x3_s1_fused",2], + "6438522646185979880": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "80038800201815976": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "1917986916390093536": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2054895351334936744": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "4151997155802743451": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "18213389163198755626": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "12363462562375148101": ["convolution_gpu_bfyx_gemm_like",1], + "11312797737791604596": ["convolution_gpu_bfyx_gemm_like",2], + "15392592805235453180": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "5424159498790442193": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "16601702334097258697": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "7390751298966198773": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "6695224851008237679": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3865480446980740412": ["convolution_gpu_bfyx_gemm_like",2], + "16710010075465723498": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "8141428150264829362": ["convolution_gpu_bfyx_os_iyx_osv16",1033], + "1615155632991337496": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14326748416648598247": ["convolution_gpu_bfyx_os_iyx_osv16",84], + "2518919454830671073": ["convolution_gpu_bfyx_gemm_like",2], + "17750329428766282997": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "414342067295883061": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "9358320688298379206": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "18139055731468596187": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "6129602738379919488": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "5922243230245842969": ["convolution_gpu_bfyx_gemm_like",2], + "11428599290755097395": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "13387804712929042302": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "11927673108508931485": ["convolution_gpu_bfyx_os_iyx_osv16",458], + "13429534778879474114": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "11066538564303243604": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "4440261013093281358": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "8881150100883636392": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "17325129240374428839": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18074320074700491416": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "12352083215873760290": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "12388375914105990324": 
["convolution_gpu_bfyx_os_iyx_osv16",576], + "5601320732740276692": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "10462203417605590793": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "10573920781439771673": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "15451919862187018297": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12561852932488001568": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "17337689605705740533": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "684240994243755872": ["convolution_gpu_bfyx_gemm_like",2], + "10973267399508186283": ["convolution_gpu_bfyx_os_iyx_osv16",191], + "8703051983346886620": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "3807725810350819929": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6303682540621797774": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "14998412675237613013": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17800494747865760215": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "1241188741090538769": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3689722043202617487": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "2605525859754242318": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "15743075522781198932": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "7903891232234389925": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "1818234431954731769": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "1555841293175143289": ["convolution_gpu_bfyx_gemm_like",2], + "2140514316203117958": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "1691020960118022320": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "15260010680436431377": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "11066913713501760080": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6879801583428507100": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "7945923871349397386": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "737706555781027628": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "3826083535442459719": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "4460838234035901102": ["convolution_gpu_bfyx_gemm_like",2], + "17393241435373906917": ["convolution_gpu_bfyx_os_iyx_osv16",319], + "791937929163665770": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13855910108498240870": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "10591379189397010097": ["convolution_gpu_bfyx_os_iyx_osv16",989], + "13540002981450186147": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "4987922194420804256": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "10665697051755790682": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "18150429561058646714": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "10090923790949378407": ["convolution_gpu_bfyx_gemm_like",2], + "4004333174619528327": ["convolution_gpu_bfyx_gemm_like",1], + "11215297942420903101": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "12260041857695743504": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "15220874718853723626": ["convolution_gpu_bfyx_gemm_like",2], + "17993337310288098038": ["convolution_gpu_bfyx_gemm_like",2], + "12971822824884826169": ["convolution_gpu_bfyx_gemm_like",2], + "6683090495189325653": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "8065408380801722040": ["convolution_gpu_bfyx_os_iyx_osv16",858], + "17370560568464798319": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "1541754036637209097": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "13381441263790184121": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "1742897526168249500": ["convolution_gpu_bfyx_gemm_like",1], + "17508515605648584094": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3142706898070129318": 
["convolution_gpu_bfyx_gemm_like",2], + "7833495651619250213": ["convolution_gpu_bfyx_gemm_like",2], + "13972357557211413688": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "9549667332801021099": ["convolution_gpu_bfyx_gemm_like",2], + "11878734040194151073": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11740474593275702888": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "7143510787416483146": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "5648099611567577611": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "8162762980597497749": ["convolution_gpu_bfyx_gemm_like",2], + "8323445733669842657": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2727219457659794468": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "15825993019555657125": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "13186342942242476803": ["convolution_gpu_bfyx_os_iyx_osv16",1067], + "13267438341255312172": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "16566714514564722975": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "14841539539334726292": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "12278364834477923930": ["convolution_gpu_bfyx_gemm_like",2], + "16348402367953880206": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "16857192626139882429": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "5132761922124425835": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "13353123037511986804": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "265378250397648692": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "18260147016899103633": ["convolution_gpu_bfyx_gemm_like",1], + "8374232727884943288": ["convolution_gpu_bfyx_gemm_like",1], + "2253443114793765536": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16132186023443894579": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "16461300997058854554": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "14122647818827599984": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "16091195788712971747": ["convolution_gpu_bfyx_os_iyx_osv16",476], + "14869125900405603130": ["convolution_gpu_bfyx_os_iyx_osv16",85], + "14532844474906286088": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "4152919461079296700": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "14353390922580547467": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "16062811901668074268": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "17761681290527373180": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "12266072789949082198": ["convolution_gpu_bfyx_gemm_like",2], + "3349519148124496343": ["fully_connected_gpu_bf_io_gemm",2], + "13410178186827874638": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "9226912483632588371": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "13426413463253581310": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "10010921697596131761": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "16042236932298055236": ["convolution_gpu_bfyx_gemm_like",0], + "8713639086785023623": ["convolution_gpu_bfyx_os_iyx_osv16",944], + "3855859061709004677": ["convolution_gpu_bfyx_os_iyx_osv16",969], + "17873182129275583020": ["convolution_gpu_bfyx_gemm_like",2], + "5073980187181521102": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "14214141488645257351": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5390559917122707732": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "8700953648388124963": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "263575476655527355": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "3438116423688595487": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "16273414163942580140": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "8260130048649729185": 
["convolution_gpu_bfyx_os_iyx_osv16",795], + "17034122796081495259": ["convolution_gpu_bfyx_gemm_like",2], + "2307310127637739872": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "10835684445936063871": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "4409539711630405776": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "5627351109775149477": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "16692569816843207989": ["convolution_gpu_bfyx_os_iyx_osv16",646], + "14204028212129440429": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "2235888904701517631": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "13947140171097868740": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "10412588668458621135": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "7177837234452118325": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "2305345466244887603": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "4693778191222244259": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "16126210124715599267": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5440622601084846974": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "6104567430127604601": ["convolution_gpu_bfyx_os_iyx_osv16",665], + "12576360049619146496": ["convolution_gpu_bfyx_gemm_like",2], + "7533669599936874355": ["convolution_gpu_bfyx_os_iyx_osv16",7], + "15217183882858251099": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "3062101811226530720": ["convolution_gpu_bfyx_os_iyx_osv16",673], + "14408266407898585602": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "26434141991791193": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4186957909762095019": ["convolution_gpu_bfyx_os_iyx_osv16",1052], + "17075150439662364176": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "4163359403543480821": ["fully_connected_gpu_bf_io_input_spatial",0], + "12015922610963701033": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "3122997634505472500": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "6075691042233712335": ["convolution_gpu_bfyx_gemm_like",1], + "16763947298003094797": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "7476503420928065329": ["convolution_gpu_bfyx_os_iyx_osv16",995], + "2839767407547705101": ["convolution_gpu_bfyx_gemm_like",2], + "15466940145773097237": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "11208787273440167590": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "14944798586094927774": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "10670829898588047148": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "13842309033760176194": ["convolution_gpu_bfyx_gemm_like",2], + "2588106330058954614": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "4011704860949525864": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "8916983923551808409": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "16207793515276299964": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "192209423643075326": ["convolution_gpu_bfyx_gemm_like",1], + "3495464175121035222": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "13644681270630373984": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "9762182215179534181": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "13869279315296163696": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "14146157492452859667": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "4428125859693766145": ["convolution_gpu_bfyx_gemm_like",2], + "18052322665755789573": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "1383899865465106141": ["convolution_gpu_bfyx_gemm_like",1], + "8420176522157084802": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8619380242063264016": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "2315979511894958580": 
["convolution_gpu_bfyx_gemm_like",2], + "8394085742794617896": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "3880189981766119529": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "14862938122758223157": ["convolution_gpu_bfyx_os_iyx_osv16",110], + "5084402281339667158": ["convolution_gpu_bfyx_gemm_like",1], + "3800864312883193560": ["convolution_gpu_bfyx_os_iyx_osv16",318], + "3643056883397245235": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "5812274221348979687": ["convolution_gpu_bfyx_os_iyx_osv16",1019], + "18109284647478027063": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13178480813522103091": ["fully_connected_gpu_bf_io_gemm",2], + "13485431068391184236": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1096929244128185929": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "10545983240319359348": ["convolution_gpu_bfyx_direct_10_12_16",2], + "555153826947872383": ["convolution_gpu_bfyx_gemm_like",2], + "18194662560696168435": ["convolution_gpu_bfyx_gemm_like",1], + "4892959859293355837": ["convolution_gpu_bfyx_gemm_like",1], + "1361159591875955678": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17699579394941627848": ["convolution_gpu_bfyx_gemm_like",2], + "18106333667377667797": ["convolution_gpu_bfyx_gemm_like",2], + "2424832456352484524": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14559552090809408184": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "14350963106032411355": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "12348135936862667024": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "17347387929692736001": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "7917673216808705075": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "6329618009202266591": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "9381304526221508530": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "7111620180131341264": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "12711366212612147422": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "14605107834931199380": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "17381682740282686038": ["convolution_gpu_bfyx_gemm_like",1], + "553884705007944190": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "4084106758501882407": ["fully_connected_gpu_bf_io_input_spatial",2], + "13748207123919546925": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7822148442995976259": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "7379959915507694400": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "2615550169523847175": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "13400559817638330692": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "17061233750738578337": ["convolution_gpu_bfyx_os_iyx_osv16",852], + "4238163995861108694": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "1961296939362567851": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "11431776034512615562": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11490143853656040028": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "11080118408282076423": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "2038505773698938555": ["fully_connected_gpu_bf_io_gemm",1], + "2797723586312707948": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "5514520264534847093": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "6478247863479663432": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "3621930417735246405": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "14361697687217060995": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6857064389795419021": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "8332688858465419317": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "13094313253457422444": 
["convolution_gpu_bfyx_os_iyx_osv16",713], + "13723543003759101485": ["convolution_gpu_bfyx_gemm_like",2], + "2053428297205345660": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "16674897846232931666": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "13646026173083209094": ["convolution_gpu_bfyx_gemm_like",1], + "10253092389452603623": ["convolution_gpu_bfyx_gemm_like",2], + "8012414839721814470": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "11102920976866402928": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16117738994809548007": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14609655423082082099": ["convolution_gpu_bfyx_gemm_like",2], + "5519535335798045279": ["convolution_gpu_bfyx_gemm_like",1], + "3927333491885837374": ["fully_connected_gpu_fb_oi_ref",2], + "18136968124686255108": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "656536921219262336": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "17140704838989242732": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "13891498649894490342": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "6625355663340809894": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8382355932367801226": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "7486133596762640215": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "15790005937034794347": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "5159470523468873105": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "3457676694935264283": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "4242438539626727158": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "1188428190761098784": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "9996590003462421281": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "16614092873294424156": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8964252048679144533": ["convolution_gpu_bfyx_gemm_like",2], + "17821196374523699955": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14788817017267716113": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "7966454753124154534": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "17377293745073971167": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9824678205469832038": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5864250949922222051": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "7578465277886568471": ["convolution_gpu_bfyx_gemm_like",2], + "7877872008801536537": ["convolution_gpu_bfyx_gemm_like",2], + "12174729877807876787": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "12651215303242591871": ["convolution_gpu_bfyx_gemm_like",2], + "13499476832444042458": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "7596423139159263456": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "10462797712860969072": ["convolution_gpu_bfyx_gemm_like",2], + "12526417587678222534": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6223991300587768990": ["convolution_gpu_bfyx_direct_10_12_16",2], + "709835724029986012": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "5287076386757143976": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "15199659885055090985": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "13510598063226540077": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "11232261979256657934": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "3491333679577961640": ["convolution_gpu_bfyx_gemm_like",2], + "8394337033015371278": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11864780937861562358": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "11883485911218628865": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "11583017348580874022": ["convolution_gpu_bfyx_os_iyx_osv16",111], + "2602209853120236226": ["convolution_gpu_bfyx_os_iyx_osv16",895], 
+ "12136029303893296753": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "8104509697376352086": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "11863623794400366834": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "16071723603031305677": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "5490683510357615963": ["convolution_gpu_bfyx_os_iyx_osv16",346], + "2349007644347065353": ["convolution_gpu_bfyx_gemm_like",2], + "11769511287553067221": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "16286085532892593349": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "7853648744637103420": ["convolution_gpu_bfyx_os_iyx_osv16",509], + "12882754981683858333": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17387764798693150143": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "7474639594232203854": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "18026754720065676632": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "10942743767167283370": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4161001033681779582": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "708347829794105085": ["convolution_gpu_bfyx_gemm_like",1], + "18372277746801271292": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "3046878786712386934": ["convolution_gpu_bfyx_gemm_like",2], + "15450609897480659306": ["convolution_gpu_bfyx_os_iyx_osv16",929], + "18012549942299450620": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "17119834538806653818": ["convolution_gpu_bfyx_gemm_like",2], + "12635265188475834607": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "15989164585998175871": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "17274625805315816028": ["convolution_gpu_bfyx_gemm_like",1], + "4764776977138392550": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "6366477005383470532": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "4678945085654662665": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "3266638956600784732": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15962137123591591534": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "1708527842474979709": ["convolution_gpu_bfyx_gemm_like",2], + "15038779174806415801": ["convolution_gpu_bfyx_gemm_like",2], + "11901740241052104941": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "18034648276860485300": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "12141300895511301068": ["convolution_gpu_bfyx_os_iyx_osv16",892], + "8415763978601237333": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "9321208819255762521": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "12944449254981328284": ["convolution_gpu_bfyx_os_iyx_osv16",510], + "17422822627612865758": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16351593165006175213": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "16496066467505445971": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "17480519865636248903": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "2830019939638455400": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3547275591884493445": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "18128162750557822655": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "11439519952236570490": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "15334195300678132907": ["fully_connected_gpu_bf_io_gemm",1], + "3497946462254198388": ["convolution_gpu_bfyx_os_iyx_osv16",319], + "13041981853634484809": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "16206791915939407806": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "13343968006718934574": ["convolution_gpu_bfyx_gemm_like",2], + "13489318651148001664": ["convolution_gpu_bfyx_gemm_like",1], + "17856816245251319111": ["convolution_gpu_bfyx_os_iyx_osv16",845], + "139367204458861048": 
["convolution_gpu_bfyx_os_iyx_osv16",519], + "4720851194954041037": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "2174528711050181972": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13546876216568825877": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "5566145479615299930": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "10134863884423338495": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "5873257164958285393": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "797387385159110695": ["convolution_gpu_bfyx_gemm_like",1], + "2297846338452062425": ["convolution_gpu_bfyx_gemm_like",2], + "14559308665571750465": ["convolution_gpu_bfyx_gemm_like",2], + "994489782629179836": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "17838473675663772639": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "772794189370544860": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "13800387305792597325": ["convolution_gpu_bfyx_os_iyx_osv16",1040], + "204378699575356398": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11275526584835606578": ["convolution_gpu_bfyx_gemm_like",1], + "14168946412009689868": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "18259018980049662870": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8403919905230540356": ["fully_connected_gpu_fb_io_ref",2], + "17509205154057032109": ["convolution_gpu_bfyx_os_iyx_osv16",471], + "9213563311267466388": ["convolution_gpu_bfyx_direct_10_12_16",0], + "7584912988728072414": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "4560479630843098090": ["convolution_gpu_bfyx_gemm_like",1], + "3069726952591207961": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15890492401334524258": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "16916632481840858091": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "879939701282942121": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "9031338938030715616": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "2684971093531227585": ["convolution_gpu_bfyx_gemm_like",2], + "9970142663470031403": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "15689502054035168040": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "12932174902085755507": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10681304359334525584": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "15507430010796753396": ["convolution_gpu_bfyx_os_iyx_osv16",476], + "3723082283919334922": ["convolution_gpu_bfyx_gemm_like",2], + "17286180622990393912": ["convolution_gpu_bfyx_gemm_like",2], + "16881320590336043120": ["convolution_gpu_bfyx_os_iyx_osv16",199], + "11178675492112714513": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "2102507337684140674": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "5524215233998361104": ["convolution_gpu_winograd_6x3_s1_fused",2], + "7606277451240586967": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13372079273473545269": ["convolution_gpu_bfyx_gemm_like",2], + "12077176094606956613": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16832083703120717402": ["convolution_gpu_bfyx_gemm_like",2], + "15856268902838573812": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "5930451476167223501": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "14524011013133838054": ["convolution_gpu_bfyx_os_iyx_osv16",729], + "6324194607665787911": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "18057258413318190788": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "6858245954375015939": ["convolution_gpu_bfyx_gemm_like",2], + "2973436171295280783": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "5740738339752793113": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "7092429446071184360": ["convolution_gpu_bfyx_os_iyx_osv16",581], + 
"14840301687056551916": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "6307840223437204536": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "2758256770667070477": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "17621284804179990612": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15752695063119223631": ["convolution_gpu_bfyx_os_iyx_osv16",430], + "18232278892738147217": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "9322808125154719434": ["convolution_gpu_bfyx_gemm_like",1], + "5019077257951332016": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "10534355502345993326": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "786418751322581924": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15078379507314446744": ["convolution_gpu_bfyx_gemm_like",2], + "11673506380927771816": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "4563407231964979217": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "522181557896569275": ["convolution_gpu_bfyx_gemm_like",0], + "8954957191824520301": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "3055842046969432235": ["convolution_gpu_bfyx_os_iyx_osv16",1065], + "765085235448596225": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "578703329577922869": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "176148486634277377": ["convolution_gpu_bfyx_gemm_like",2], + "1743672154424707483": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3723613341885592267": ["convolution_gpu_bfyx_os_iyx_osv16",6], + "9519113693008246391": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "3892873577927627992": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "10565789595834959047": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "17791773192152464021": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10743628077362128751": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "10031973538398542700": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "9236621881488650027": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9524303276541517389": ["convolution_gpu_bfyx_gemm_like",2], + "12850610175882424919": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "16822728519529055454": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "10729288973933590396": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "8886676435675463412": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3041752019114501584": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "11726125778063855770": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "7002547494442875680": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "15751445344585167275": ["convolution_gpu_bfyx_os_iyx_osv16",1056], + "1187817806204244044": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "17796310681498690253": ["convolution_gpu_winograd_6x3_s1_fused",2], + "14994322266840011040": ["convolution_gpu_bfyx_gemm_like",2], + "253337639942573142": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "14487682847898298214": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",0], + "12355112948013108181": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "8655883535274781128": ["convolution_gpu_bfyx_gemm_like",1], + "10254790628108678637": ["convolution_gpu_bfyx_gemm_like",1], + "9513218905938141296": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "170594581804738255": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12164298124869114517": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "18415227597391874233": ["convolution_gpu_bfyx_os_iyx_osv16",458], + "17707294419513060769": ["convolution_gpu_bfyx_gemm_like",2], + "15861253904810475842": ["convolution_gpu_bfyx_gemm_like",2], + "6638761803107874904": ["convolution_gpu_bfyx_direct_10_12_16",1], + 
"11033824757086203326": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "16767657090925788431": ["convolution_gpu_bfyx_gemm_like",2], + "7174790971918109163": ["convolution_gpu_bfyx_os_iyx_osv16",1016], + "18096803908321982720": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "8938942439963723596": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "6447357750120537934": ["convolution_gpu_bfyx_gemm_like",2], + "4355933224673863178": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "13476976389397273052": ["convolution_gpu_bfyx_gemm_like",2], + "16522364268583242080": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "2969389503332309296": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "16703049240941366828": ["convolution_gpu_bfyx_gemm_like",2], + "14121939808880396150": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "1832310305089212990": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "11044223289209000460": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "13387602037439694372": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9860570706348640782": ["convolution_gpu_bfyx_gemm_like",2], + "104165137500939902": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "7552144047474664265": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "15598527290222497283": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "13881505737488515065": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "1152693503778768433": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "5235375820995365354": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "1014934490175718598": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "15891746043846062984": ["convolution_gpu_bfyx_os_iyx_osv16",1051], + "11782514629636023633": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "1896394898744191046": ["convolution_gpu_bfyx_gemm_like",1], + "9055254157155243850": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "475079717987185580": ["convolution_gpu_bfyx_os_iyx_osv16",198], + "4492673409319122180": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "577182964135927041": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "4344644499804057502": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "15467064540951151390": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16120159001372711511": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "4026686872534942904": ["convolution_gpu_bfyx_os_iyx_osv16",174], + "15643053402284856082": ["convolution_gpu_bfyx_gemm_like",2], + "12181607120522804433": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "17517541283617012275": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1434535531617424039": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "8321148793275220552": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11078289776590382448": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "14578291812739325465": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "9423239651872522813": ["convolution_gpu_bfyx_gemm_like",2], + "1957975992563882145": ["convolution_gpu_bfyx_os_iyx_osv16",1025], + "530491406341772040": ["convolution_gpu_bfyx_gemm_like",2], + "10104091044601583658": ["convolution_gpu_bfyx_gemm_like",2], + "2686152083115758704": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "6672808203620992802": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "10302338806536775954": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "17172842643607718498": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "17392594284473856393": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6290317420155851465": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "15911508155433936727": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "529543453251381109": 
["convolution_gpu_bfyx_os_iyx_osv16",879], + "5020788604681810984": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9305758766575321575": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "14555366228958374512": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "7199295899520406795": ["convolution_gpu_bfyx_gemm_like",2], + "12796777049340516563": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "7020743056013297476": ["convolution_gpu_bfyx_gemm_like",2], + "14071393823183565145": ["convolution_gpu_bfyx_gemm_like",2], + "13602299412525111348": ["convolution_gpu_bfyx_os_iyx_osv16",805], + "12394049027081208902": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "2627779045483019709": ["convolution_gpu_bfyx_os_iyx_osv16",812], + "1622880009460832832": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "10087048842366891699": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "142345353315012903": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "17802514063213000148": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "4132087699110753428": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8306337702797456793": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "591445875836641836": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "4960466075321426984": ["convolution_gpu_bfyx_os_iyx_osv16",559], + "8995598177504756805": ["convolution_gpu_bfyx_os_iyx_osv16",85], + "15976399554094563736": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "11386443944172875185": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5485050451156514865": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "2659031931257084418": ["convolution_gpu_bfyx_os_iyx_osv16",540], + "16208488491972128275": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "17615365894230830516": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "14118838785256822389": ["convolution_gpu_bfyx_gemm_like",2], + "8866164762286856139": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "97332433783610027": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "17080372737840346243": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16720108310653948550": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12650986929262866534": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "4477135619420651110": ["convolution_gpu_bfyx_gemm_like",2], + "9040986180016264906": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "1413598669014941757": ["convolution_gpu_bfyx_gemm_like",2], + "7431469348791099474": ["convolution_gpu_bfyx_gemm_like",2], + "16383540667048742064": ["convolution_gpu_bfyx_gemm_like",2], + "13470016086265528105": ["convolution_gpu_bfyx_gemm_like",1], + "5854267518455107328": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15603710070700542017": ["convolution_gpu_bfyx_gemm_like",2], + "5219818570070061892": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1601512693620510391": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "13297691763391637265": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "13455881643467418059": ["convolution_gpu_bfyx_gemm_like",1], + "5706423911886410117": ["convolution_gpu_bfyx_gemm_like",2], + "13503608041359512": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "12951069548510783681": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15591167992985613695": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "5637480705139132901": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "4107088111454348836": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "6124219814856247918": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "10062957707721107508": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "16179959997108523051": ["convolution_gpu_bfyx_gemm_like",2], + 
"9647713236241614167": ["convolution_gpu_bfyx_gemm_like",2], + "10884966210360699082": ["convolution_gpu_bfyx_gemm_like",1], + "2728956755635458379": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8578774826625315147": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "5414285637221358737": ["convolution_gpu_bfyx_gemm_like",1], + "14172081523880352608": ["convolution_gpu_bfyx_os_iyx_osv16",572], + "15786328370300803713": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "5795073619189010837": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12647099325257717945": ["convolution_gpu_bfyx_gemm_like",2], + "13292923826380958700": ["convolution_gpu_bfyx_gemm_like",2], + "18439017855540532958": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "15963038745470172423": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "3683201905077543598": ["convolution_gpu_bfyx_os_iyx_osv16",44], + "5179760459095053114": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11855137287698046529": ["convolution_gpu_bfyx_gemm_like",2], + "15479071839425218367": ["convolution_gpu_bfyx_gemm_like",2], + "4701832665603867798": ["convolution_gpu_bfyx_os_iyx_osv16",618], + "9145357433824567384": ["convolution_gpu_bfyx_os_iyx_osv16",297], + "8797843396807284399": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "11544455862638831851": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3296080624478711270": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "15929970324703663357": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "3388752887767453958": ["convolution_gpu_bfyx_gemm_like",2], + "16181623411787179429": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6345550009198921347": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "7780366826820540504": ["convolution_gpu_bfyx_gemm_like",2], + "4538102435488584866": ["convolution_gpu_bfyx_gemm_like",1], + "7129623351507828661": ["convolution_gpu_bfyx_os_iyx_osv16",723], + "16629493658542781988": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "14177187878748170225": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "4049276089777687996": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "143255828863957128": ["convolution_gpu_bfyx_gemm_like",2], + "12843671306854567956": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "2231648183489019418": ["convolution_gpu_bfyx_os_iyx_osv16",428], + "8882042369902399339": ["convolution_gpu_bfyx_gemm_like",1], + "676641023579624117": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "17009318615658405230": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "13194245601015251743": ["fully_connected_gpu_fb_io_ref",1], + "1641881628032037384": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "9529614587861271730": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "17116941326889312928": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14336344152455180534": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "9559550404190168365": ["convolution_gpu_bfyx_gemm_like",2], + "8985531644129639832": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "875296362957469305": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "10071611039987219440": ["convolution_gpu_bfyx_gemm_like",2], + "17585210048585855482": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "13558603350852076889": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "13839075443229327158": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "7570078010521452080": ["convolution_gpu_bfyx_gemm_like",1], + "7054270030260701612": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "17564338309805484464": ["convolution_gpu_bfyx_os_iyx_osv16",482], + "12847879935060092791": ["convolution_gpu_bfyx_os_iyx_osv16",352], 
+ "16483792160297698151": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "5343186686923330871": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "2438221595194783178": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "294103776081392899": ["convolution_gpu_bfyx_gemm_like",2], + "689445825453914111": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "4729855738455185191": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "6780215829176686721": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "6355395905401306995": ["convolution_gpu_bfyx_gemm_like",2], + "6139574161497189424": ["convolution_gpu_bfyx_direct_10_12_16",0], + "14420809655798184553": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "1630585964216121575": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "7806129039150321333": ["convolution_gpu_bfyx_gemm_like",2], + "9058996149754556268": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "7630342538679060038": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "9028970753877215614": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "18383733736250135501": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "15995056067568652754": ["convolution_gpu_bfyx_gemm_like",1], + "15129201859573664210": ["convolution_gpu_bfyx_gemm_like",2], + "14695781272831602408": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "12914986936318857086": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "3012268657922581268": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "1160579996766519752": ["convolution_gpu_bfyx_gemm_like",1], + "14381420852659789698": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16522546805419218429": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "12992163255353386581": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "13317417676446624018": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "11071972036962275632": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "1269703478898366518": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "6767245864232675168": ["convolution_gpu_bfyx_gemm_like",1], + "12517838703662330663": ["convolution_gpu_bfyx_os_iyx_osv16",756], + "11530101016435264783": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "16561224775421968533": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "1473214668483422172": ["convolution_gpu_bfyx_gemm_like",1], + "9052153145556623933": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "14037325204801680738": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "2482449683288477640": ["convolution_gpu_bfyx_gemm_like",2], + "6515141738021465336": ["convolution_gpu_bfyx_gemm_like",2], + "16245760498096322525": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "12709406234969954619": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "7963529808900784906": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "9890252170749328138": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5053369963163583573": ["convolution_gpu_bfyx_os_iyx_osv16",856], + "14247451223653900488": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "12698546873263218041": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "10294185397756053636": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "3408249386342406615": ["convolution_gpu_bfyx_gemm_like",1], + "9454028594043242985": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "13401926003864565026": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "8058623285594809047": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "13624969243174329965": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3810356382905059819": ["convolution_gpu_bfyx_gemm_like",1], + "1836277956961261472": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "9164584153555521506": 
["convolution_gpu_bfyx_gemm_like",2], + "10265955847846166394": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "7291920886894073603": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "3191047205441946466": ["convolution_gpu_bfyx_gemm_like",0], + "15862793522143880668": ["convolution_gpu_bfyx_os_iyx_osv16",878], + "11932770338770247767": ["convolution_gpu_bfyx_os_iyx_osv16",804], + "8434794604559592624": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "11595387512434355394": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "17035903590837750750": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "3510837206834640871": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "6729785110495533200": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "14291113322487568376": ["convolution_gpu_bfyx_gemm_like",2], + "13611054146745413536": ["convolution_gpu_bfyx_gemm_like",2], + "11892210755884128272": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "844742962836593299": ["convolution_gpu_bfyx_os_iyx_osv16",675], + "11929531534620071758": ["convolution_gpu_bfyx_os_iyx_osv16",612], + "11191005013126286532": ["convolution_gpu_bfyx_os_iyx_osv16",552], + "13727643349589056375": ["convolution_gpu_bfyx_os_iyx_osv16",439], + "11273168411455998347": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "3509027370372599394": ["fully_connected_gpu_fb_io_ref",2], + "14185215566042478462": ["convolution_gpu_bfyx_os_iyx_osv16",264], + "12927339938362960563": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "4801117903303888658": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "1265277707626014051": ["convolution_gpu_bfyx_os_iyx_osv16",714], + "9428176632140441528": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "4491694127072416122": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "5340016094501559693": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "6150043972317126583": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "1071007164550012186": ["convolution_gpu_bfyx_os_iyx_osv16",21], + "281287280558289393": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "4264078972561407296": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "997155336931700015": ["convolution_gpu_bfyx_gemm_like",2], + "7552049239568474944": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "3280795516668356985": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8780604510524622314": ["convolution_gpu_bfyx_os_iyx_osv16",893], + "7187734276051878356": ["convolution_gpu_bfyx_gemm_like",2], + "5296506025538423220": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "6688522645556262131": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "13987250743654950733": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "15381014522874131924": ["convolution_gpu_bfyx_os_iyx_osv16",665], + "11026432639515866259": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "3625906783784771100": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "9339038855869763548": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "14907038741687299621": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4206637285289830669": ["convolution_gpu_bfyx_gemm_like",1], + "9266375177690276615": ["convolution_gpu_bfyx_gemm_like",2], + "17543625777838573622": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "5515216528474382598": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15641537661939240413": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "18076129452098771655": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17106086048442658788": ["convolution_gpu_bfyx_gemm_like",2], + "5750277248295796439": ["convolution_gpu_bfyx_os_iyx_osv16",108], + "12815588500303820284": ["convolution_gpu_bfyx_gemm_like",1], + 
"10809330882739297269": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11359020774437470164": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "4476037346005841003": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13198480749588992978": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15452906059667613512": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4178614913813882037": ["convolution_gpu_bfyx_gemm_like",2], + "1435153323458789173": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "13312401790608349463": ["convolution_gpu_bfyx_gemm_like",1], + "11919579121199894437": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "7351443601143314161": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "14418429155823196539": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "17301887391757619741": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "2801141274570069180": ["convolution_gpu_bfyx_os_iyx_osv16",509], + "9883682535839267422": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "1686420552593340731": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "8898449752724034655": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "830147122986411443": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8837079302496539409": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "40704767167309552": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "2995957440356398418": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4316278502963439894": ["convolution_gpu_bfyx_gemm_like",2], + "6149673627320838019": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "14645023135017806432": ["convolution_gpu_bfyx_gemm_like",2], + "13054706902087663592": ["convolution_gpu_bfyx_gemm_like",2], + "17372326727957287976": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "10554266898346470422": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "1779941298820543013": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "10168272404395268951": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "9556219639756304369": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "906587812125311288": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "15406324750533549980": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "6410682026872155392": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "6750003965952674453": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "9438739171104456179": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "4949865765880884373": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "11622271315873664622": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "6278892144796112655": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1090447867763814054": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "11845504142528424662": ["convolution_gpu_bfyx_gemm_like",2], + "11661208196482963286": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "16995919898822376726": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "11354523117287453982": ["convolution_gpu_bfyx_gemm_like",2], + "3239779684432082106": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "15783558375979538895": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13853056718266488510": ["convolution_gpu_bfyx_os_iyx_osv16",883], + "16605697831520435304": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "625469553102754234": ["convolution_gpu_bfyx_gemm_like",2], + "20037669704517227": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "2328951328483718941": ["convolution_gpu_bfyx_gemm_like",2], + "10279778381617181802": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "9513403717116039597": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "12685978195521469707": ["convolution_gpu_bfyx_os_iyx_osv16",189], + 
"12752101288912456176": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "1294871956977733262": ["convolution_gpu_bfyx_gemm_like",2], + "15692223101958737604": ["convolution_gpu_bfyx_gemm_like",1], + "11453044274130869816": ["convolution_gpu_bfyx_gemm_like",2], + "12379734005351960619": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "5786828339670204894": ["convolution_gpu_bfyx_os_iyx_osv16",276], + "4010650902230520983": ["convolution_gpu_bfyx_gemm_like",0], + "13583272198088247606": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "15134268179029323647": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "7395593936948809439": ["convolution_gpu_bfyx_os_iyx_osv16",692], + "3349108500387301004": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12407002532205454767": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "13439896617880328331": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "7004953121070642766": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "6644418194983229139": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "1027438463802481676": ["convolution_gpu_bfyx_gemm_like",2], + "10642327923162019888": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "2242915551775617989": ["convolution_gpu_bfyx_os_iyx_osv16",291], + "5061795324735006354": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "8866736221671835567": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "1421879144542252228": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "16978447917682236120": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6771637612965430926": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "4586246090279043149": ["convolution_gpu_bfyx_gemm_like",2], + "17357800564047774826": ["convolution_gpu_bfyx_gemm_like",2], + "2008999755215725290": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "10916647716124396856": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "981733129438741439": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "7211355951470869591": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "5338109154207406041": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "5031342439443897167": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "16469788155263456039": ["convolution_gpu_bfyx_gemm_like",2], + "14249486431781112226": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "5424164608102708333": ["convolution_gpu_bfyx_gemm_like",2], + "11802527991096689252": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "981197653890885407": ["convolution_gpu_bfyx_gemm_like",1], + "8612114608666892632": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "1019936903773818652": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "13077961697656030315": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "8317140711232187781": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "8169762955969255618": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "123283730755186382": ["convolution_gpu_bfyx_gemm_like",1], + "5083776511235413204": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "5510336500642744696": ["convolution_gpu_bfyx_gemm_like",2], + "9625931001541723278": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "13538051178827008933": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "3985659568982275663": ["convolution_gpu_bfyx_os_iyx_osv16",1124], + "7744787957569714828": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "3378088934862423864": ["convolution_gpu_bfyx_gemm_like",1], + "7978370756654787278": ["convolution_gpu_bfyx_gemm_like",1], + "5779388310240896974": ["convolution_gpu_bfyx_os_iyx_osv16",80], + "17340789730321673934": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "7843833033404155302": 
["convolution_gpu_bfyx_os_iyx_osv16",374], + "5670530004773188380": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "3159147743553063163": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "3128856679264648666": ["convolution_gpu_bfyx_gemm_like",1], + "17193614571243427089": ["convolution_gpu_bfyx_gemm_like",2], + "15031089621161080026": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "15156015174611610705": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "8055193939726603877": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "10598995451755327159": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "13336847303794450665": ["convolution_gpu_bfyx_gemm_like",2], + "4992668316921598993": ["convolution_gpu_bfyx_gemm_like",1], + "9220830217525628783": ["convolution_gpu_bfyx_gemm_like",2], + "8578747191812631883": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "14915908231779912828": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "4226968857681929488": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "9676055912997166605": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4369680877112803848": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "4745007371868123765": ["convolution_gpu_bfyx_gemm_like",2], + "288825580282908143": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "16932172538978111342": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "13850807749756445264": ["convolution_gpu_bfyx_os_iyx_osv16",477], + "778175413671462719": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9785114056964539323": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "10704037259494193565": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "11734299455885510243": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "15395497315929884637": ["convolution_gpu_bfyx_os_iyx_osv16",1114], + "17769940507971546305": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "7246177123265734169": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "14848351491062336554": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "14443599718173185176": ["convolution_gpu_bfyx_gemm_like",2], + "4217179485243909459": ["convolution_gpu_bfyx_gemm_like",1], + "13625877249040282040": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "4236174000795439083": ["convolution_gpu_bfyx_gemm_like",2], + "282274448389888221": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "3526580286148537369": ["convolution_gpu_bfyx_gemm_like",2], + "14257161696605459633": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "12529210672030682764": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "6768451741770053089": ["convolution_gpu_bfyx_gemm_like",2], + "15943174060386142134": ["convolution_gpu_bfyx_os_iyx_osv16",186], + "16415344078703911571": ["convolution_gpu_bfyx_gemm_like",2], + "15822975685755664152": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "6577240413312348523": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "11668043528929060706": ["convolution_gpu_bfyx_gemm_like",1], + "15379595951542162189": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2056766012044921101": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "2384942244346844027": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "6400671582981760192": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9746964858035717775": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "7164580481046523192": ["convolution_gpu_bfyx_os_iyx_osv16",914], + "2100891581797371600": ["convolution_gpu_bfyx_os_iyx_osv16",274], + "9583760104223104233": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13956744866244022582": ["convolution_gpu_bfyx_gemm_like",2], + "14403780921831769097": ["convolution_gpu_bfyx_os_iyx_osv16",895], 
+ "12956535344568057480": ["convolution_gpu_bfyx_os_iyx_osv16",84], + "1753515740487760297": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "10160082844961863335": ["convolution_gpu_bfyx_os_iyx_osv16",199], + "11875516764635427358": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "12242618640422208652": ["convolution_gpu_bfyx_gemm_like",0], + "12761366575293006784": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15051114821536746998": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "6706802683366112205": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "3661305534604931936": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3598116387801985039": ["convolution_gpu_bfyx_os_iyx_osv16",676], + "12478421208861550581": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "4750894407873652809": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "13066055561434178894": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "18160969423211875528": ["convolution_gpu_bfyx_os_iyx_osv16",835], + "104321144590863458": ["convolution_gpu_bfyx_gemm_like",2], + "9008848676120441863": ["convolution_gpu_bfyx_gemm_like",2], + "4695273549696315193": ["convolution_gpu_bfyx_gemm_like",2], + "14281201038135286621": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "4563773888811395621": ["convolution_gpu_bfyx_gemm_like",2], + "5351705572686943348": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "2647922515901529845": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "296202142406900242": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "16094174852600023296": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "13342769641176584743": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "10468562355439385073": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "4503960445974334415": ["convolution_gpu_bfyx_os_iyx_osv16",805], + "3120553928584920777": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "9492331996847106233": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "7107513718824525169": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "11376522803174788945": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1772363899841601255": ["convolution_gpu_bfyx_os_iyx_osv16",938], + "16715151641337602113": ["convolution_gpu_bfyx_gemm_like",1], + "7997955859883990923": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "6474882514032493642": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "13348855287761849180": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15922076723067110929": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3980754726678047241": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "9794061741834174000": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "7869916853707978306": ["convolution_gpu_bfyx_os_iyx_osv16",459], + "7410220112400588068": ["convolution_gpu_bfyx_gemm_like",2], + "12323840136934980793": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "13110173649734084688": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13411431109933021193": ["convolution_gpu_bfyx_gemm_like",2], + "9152451371616153112": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3590316457726550768": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "12942085219027232135": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "4818231379191523896": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "8981229334098733320": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "583303098958523195": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "14682537852514419239": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1884327428051733366": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10049571207493913006": ["convolution_gpu_bfyx_os_iyx_osv16",968], + 
"2301409406426420354": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "11091004452522208782": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "7353563160591978243": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "12386437738920143482": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "1660279112011537957": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "6483208845600234755": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "15239764240622554314": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "14729854278671832528": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "16122815225820081176": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "14131851237755716991": ["convolution_gpu_bfyx_os_iyx_osv16",364], + "10599639229366933472": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "11674725184029885494": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "12225119940380026093": ["convolution_gpu_bfyx_os_iyx_osv16",1034], + "10908411570889102154": ["convolution_gpu_bfyx_gemm_like",1], + "15227034948424983496": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "17659601542171299562": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "12895496994338720556": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2506154888542197909": ["convolution_gpu_bfyx_os_iyx_osv16",860], + "15824189967727245909": ["convolution_gpu_bfyx_gemm_like",2], + "12040626513219974957": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "11534123522633460320": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11922163303962372849": ["convolution_gpu_bfyx_gemm_like",1], + "11357813056434049302": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "2950917846016525392": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "15156525717629023944": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "6172851296465788161": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "6432519735121751346": ["convolution_gpu_bfyx_gemm_like",1], + "14685573786743639408": ["convolution_gpu_bfyx_gemm_like",1], + "3928266232090746643": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "11141999085710526242": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "8951503172834790833": ["convolution_gpu_bfyx_gemm_like",2], + "13498795599230228492": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "16815680874311765189": ["convolution_gpu_bfyx_gemm_like",2], + "13886526360627032217": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "10476627457539425144": ["convolution_gpu_bfyx_gemm_like",2], + "10110395703775498948": ["convolution_gpu_bfyx_os_iyx_osv16",376], + "15897477855246170861": ["convolution_gpu_bfyx_gemm_like",2], + "17065380294456704620": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "13441117085490814804": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "272730229972987861": ["convolution_gpu_bfyx_os_iyx_osv16",1034], + "13698389420396031586": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "14601912265050074833": ["convolution_gpu_bfyx_gemm_like",2], + "5816730482014477109": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "14821616804286068969": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10016243001407196485": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1502236537645808646": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "852092858392507925": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "14682894856346977838": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "15354185859262170540": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "60749853744407778": ["convolution_gpu_bfyx_gemm_like",2], + "5032866547826271476": ["convolution_gpu_bfyx_os_iyx_osv16",252], + "12630173933512965589": ["convolution_gpu_bfyx_gemm_like",2], + "3297036980627776719": 
["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17160915544701715607": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "13285123703712436126": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "682912708716537431": ["convolution_gpu_bfyx_gemm_like",2], + "14454927839795553295": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "16039372573821594566": ["convolution_gpu_bfyx_gemm_like",2], + "9929060811766882316": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "11455843788148231615": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "7043547563530810431": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "11546295514640813785": ["convolution_gpu_bfyx_gemm_like",2], + "7693556065684619275": ["convolution_gpu_bfyx_os_iyx_osv16",568], + "16129296588866116913": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "8618835732380720921": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "11906319144823550582": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "16589607587365212240": ["convolution_gpu_bfyx_gemm_like",2], + "815847426244665239": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9148379585489720669": ["convolution_gpu_bfyx_os_iyx_osv16",845], + "4212194737559719449": ["convolution_gpu_bfyx_gemm_like",0], + "2352142833866194508": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "7924408980408826942": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "568114041320772862": ["convolution_gpu_bfyx_gemm_like",2], + "10616832946298118456": ["convolution_gpu_bfyx_gemm_like",2], + "14581447673401303181": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "26773921190137993": ["convolution_gpu_bfyx_gemm_like",2], + "969746749329671447": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "3762117189312286955": ["convolution_gpu_bfyx_gemm_like",2], + "17453621319901961773": ["convolution_gpu_bfyx_os_iyx_osv16",139], + "4565037760028957581": ["convolution_gpu_bfyx_os_iyx_osv16",852], + "15578217564714846277": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "8697631439739291302": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "7313000297447719088": ["convolution_gpu_bfyx_gemm_like",2], + "13993319023992950944": ["convolution_gpu_bfyx_gemm_like",2], + "11796671083187280457": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "15637565679147396649": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "4806571630436601566": ["fully_connected_gpu_bf_io_input_spatial",4], + "14385995236701277049": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "6031307393395339699": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "7000524935770116969": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "15432337846778101995": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "2722601800398376127": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "15616954046484566002": ["convolution_gpu_bfyx_gemm_like",2], + "15830721134654889992": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "7974918595373182037": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "8178825467227185946": ["convolution_gpu_bfyx_gemm_like",2], + "386749666417295495": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "13102754309439605192": ["convolution_gpu_bfyx_gemm_like",2], + "18372284940315010254": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "13691555384698806010": ["convolution_gpu_bfyx_gemm_like",1], + "15863633107759120207": ["convolution_gpu_bfyx_gemm_like",1], + "12511186263003392018": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "9954050478761346921": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "11686670048744589243": ["convolution_gpu_bfyx_gemm_like",2], + "15168098632351740923": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + 
"16650590194585316886": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "2743892624333411461": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "17177353407003831190": ["convolution_gpu_bfyx_gemm_like",2], + "3292554262586950764": ["convolution_gpu_bfyx_gemm_like",2], + "5635504912415420460": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "11716771904412649891": ["convolution_gpu_bfyx_os_iyx_osv16",52], + "3002986032379998259": ["convolution_gpu_bfyx_os_iyx_osv16",151], + "10309083227104422150": ["convolution_gpu_bfyx_os_iyx_osv16",616], + "11067412830219638639": ["convolution_gpu_bfyx_os_iyx_osv16",381], + "14865708345458193472": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "15464714725848277081": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10716913534741102635": ["convolution_gpu_bfyx_os_iyx_osv16",483], + "3596159214965874273": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "11210961619302975072": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8319405652132127420": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9980945809859857871": ["convolution_gpu_bfyx_gemm_like",2], + "13858485871773319706": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13319880343534837963": ["convolution_gpu_bfyx_gemm_like",1], + "6983900601570231321": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "11897886369869427808": ["convolution_gpu_bfyx_gemm_like",2], + "8048617952947915835": ["convolution_gpu_bfyx_gemm_like",2], + "16540183777173974162": ["convolution_gpu_bfyx_gemm_like",1], + "7852745450437172519": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4862529593282936100": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "17052596472114345717": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "7732899312577293959": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "12458305535453345462": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13255006150107668739": ["convolution_gpu_bfyx_gemm_like",2], + "17097621900023182992": ["convolution_gpu_bfyx_gemm_like",2], + "14523905821262502926": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "5687802882700097624": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "11115684531624462986": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "16162899163122139501": ["fully_connected_gpu_fb_io_ref",1], + "15891505875671050928": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10271474583233390474": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "4640696923527766618": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "13473730516782884152": ["convolution_gpu_bfyx_gemm_like",2], + "9245770108138984525": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4959718589070770515": ["convolution_gpu_bfyx_os_iyx_osv16",344], + "3934290309368153435": ["fully_connected_gpu_bf_io_gemm",1], + "13234170505677988638": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "45545661884854912": ["convolution_gpu_bfyx_os_iyx_osv16",1051], + "5311718276151327830": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "12896159402462325805": ["convolution_gpu_bfyx_os_iyx_osv16",888], + "14647949921048404551": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "9979259596137305973": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "5327803911898085293": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "10196332102593337214": ["convolution_gpu_bfyx_gemm_like",1], + "4793007249026943006": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11012846743944132853": ["convolution_gpu_bfyx_gemm_like",2], + "4713580645061462578": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "11576182324195008022": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9673176853197584682": 
["convolution_gpu_bfyx_gemm_like",1], + "3935404533406270186": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13358754652597677285": ["convolution_gpu_bfyx_os_iyx_osv16",674], + "5246229312484886433": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "15939309688773899430": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3805854200552708060": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "5219048275475447369": ["convolution_gpu_bfyx_gemm_like",2], + "2832331506191733785": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "710656784939783221": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "8306931146242110738": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "118898027441804310": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "12894240573737168362": ["convolution_gpu_bfyx_os_iyx_osv16",941], + "7941359635463232326": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "18418073826375395057": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3935174650108042053": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "10989937450490049763": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "288853243482418538": ["convolution_gpu_bfyx_os_iyx_osv16",874], + "10930115765550856328": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "7994179151788368291": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "5342657840254586591": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16744011463988595802": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6025872155179042054": ["convolution_gpu_bfyx_gemm_like",2], + "9562291747339451180": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "8739570656208259296": ["convolution_gpu_bfyx_os_iyx_osv16",737], + "7086554406050778468": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "16342158355942808662": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15887938842582811165": ["convolution_gpu_bfyx_os_iyx_osv16",338], + "4211445170027080823": ["convolution_gpu_bfyx_os_iyx_osv16",717], + "3272017687600371031": ["convolution_gpu_bfyx_gemm_like",2], + "18118237182023167949": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "15487538714246568015": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17641726060706984007": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "8449108317864057899": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "16536775289334717044": ["convolution_gpu_bfyx_os_iyx_osv16",431], + "13150876648527896999": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "12223993560805441284": ["convolution_gpu_bfyx_gemm_like",2], + "8779987507326777359": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8215519118071138614": ["convolution_gpu_bfyx_gemm_like",2], + "9069245927173134634": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1201692134690347847": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "3120885087070223590": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "6728889146307098720": ["convolution_gpu_bfyx_gemm_like",1], + "14004618842373739106": ["convolution_gpu_bfyx_gemm_like",2], + "16741985699154392565": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "8176520928011006903": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14213127286928643795": ["convolution_gpu_bfyx_gemm_like",2], + "1336477297334930004": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12565318283493666631": ["convolution_gpu_bfyx_os_iyx_osv16",1043], + "11901687795497708884": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "11858246418724176452": ["convolution_gpu_bfyx_gemm_like",1], + "17355826643208208691": ["convolution_gpu_bfyx_gemm_like",2], + "1573498199681662714": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "1204089510255285420": 
["convolution_gpu_bfyx_gemm_like",2], + "7727001441358508665": ["convolution_gpu_bfyx_os_iyx_osv16",22], + "12621528958448913800": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "1941341635794709702": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "2768512766772748723": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15579919505002150556": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "16352438188558979362": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "3594327736281012643": ["convolution_gpu_bfyx_os_iyx_osv16",299], + "8281411537393664160": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8843585527713905568": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "11152834864013527469": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "5384134329664434112": ["convolution_gpu_bfyx_os_iyx_osv16",1063], + "16749148369456398030": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "10522649794540845800": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "17197868427757781334": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "14463841899941062548": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "8734220847509054149": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "15597522934012485452": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8770858724416759637": ["convolution_gpu_bfyx_gemm_like",2], + "3651651926851660222": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "1460916897832302487": ["convolution_gpu_bfyx_gemm_like",2], + "2251572761614039612": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "17503210896556316294": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "12308359047798183133": ["convolution_gpu_bfyx_os_iyx_osv16",548], + "14547907449418439737": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "15618891972122000521": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "11771014003680394135": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "10380031655567712558": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "9516288831713776693": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "13550337096609413041": ["convolution_gpu_bfyx_gemm_like",2], + "17459500507201824299": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "18379763351534914922": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "1509728225855233852": ["convolution_gpu_bfyx_gemm_like",2], + "15993427814066246646": ["convolution_gpu_bfyx_gemm_like",1], + "7781809277449433812": ["convolution_gpu_bfyx_gemm_like",2], + "9003196270667188479": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3034466284781235431": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "11198378813600875939": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "5509852360472061267": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "11682323163346544125": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "1239861345413267621": ["convolution_gpu_bfyx_gemm_like",2], + "1720791539242542292": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "2419819939573989749": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "4062706195708729345": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "9390919808369333231": ["convolution_gpu_bfyx_gemm_like",2], + "11882021989615795558": ["convolution_gpu_bfyx_os_iyx_osv16",381], + "4003468969524607815": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "12169148580322697755": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "3750338655074082587": ["fully_connected_gpu_yxfb_ref",2], + "14524678598440880756": ["convolution_gpu_bfyx_os_iyx_osv16",832], + "5336120047683197088": ["convolution_gpu_bfyx_gemm_like",2], + "14592395793778583608": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "1781189282179491198": 
["convolution_gpu_bfyx_os_iyx_osv16",895], + "16587387608532583713": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "11205571992835612111": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "14674266217397415571": ["convolution_gpu_bfyx_gemm_like",2], + "8642397690605957294": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "172303227623890951": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "17855733925989425515": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "13982221711075598070": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "13337122303005980542": ["convolution_gpu_bfyx_os_iyx_osv16",344], + "5134857932624749530": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8740196547852036537": ["convolution_gpu_bfyx_gemm_like",2], + "9781830607177020570": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "11297512843662536362": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "16071030448801649281": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "4160656836528944651": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "13713501506522022845": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4034250407843183678": ["convolution_gpu_bfyx_gemm_like",1], + "3661361503342294227": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "7247891577022043949": ["convolution_gpu_bfyx_gemm_like",2], + "15628121900226431719": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11992353959766718397": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "8734419426540206087": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "2559310381697374321": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "6659313690133629176": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "7617123358753247310": ["fully_connected_gpu_fb_io_ref",2], + "10784905418636316601": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "7999747927804607567": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "14670952132900619664": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "4276712095427918904": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "3806791682244402910": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "11879484013890539145": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "9287404618748313247": ["convolution_gpu_bfyx_gemm_like",1], + "11149782181562145291": ["convolution_gpu_bfyx_gemm_like",2], + "13952295742818866246": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "7638626850074132214": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "4544242784357021697": ["convolution_gpu_bfyx_gemm_like",2], + "2094546483928406874": ["convolution_gpu_bfyx_gemm_like",1], + "3831201505512446456": ["convolution_gpu_bfyx_gemm_like",0], + "14097319816812992451": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "8268533335852735248": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "7279393739634103483": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "38736266675995457": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "13661880440426932218": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "7590767013583950613": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1617362484243823916": ["convolution_gpu_bfyx_os_iyx_osv16",1028], + "5659168916726488798": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "1559798212423183813": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "14491949194619001237": ["convolution_gpu_bfyx_os_iyx_osv16",813], + "8819268903800581706": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "16320454719906370247": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "10972033292930619311": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "16230621843665445228": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "8892991171111842341": ["convolution_gpu_bfyx_gemm_like",2], 
+ "323234725943768094": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "4670443882075998209": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "15287650965861631130": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "15818237122613168508": ["convolution_gpu_bfyx_gemm_like",0], + "6542486391263861823": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "15938703221521364046": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "8333743604646422982": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "5334291640387922287": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "4800208854712166990": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "16590893345666612869": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "12601126285773042005": ["convolution_gpu_bfyx_os_iyx_osv16",1055], + "3012566432840424198": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "14733291836016183044": ["convolution_gpu_bfyx_gemm_like",2], + "15494543914974994991": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "1081287304647703427": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "11609821372586026178": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "1698321314111848001": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "10961049607808752432": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "6161072079255825074": ["convolution_gpu_bfyx_gemm_like",2], + "10392013312924273545": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "10400727836871462348": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "11494395549955384747": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "3329610414149222728": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8986253016099337778": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "12606196670791209919": ["convolution_gpu_bfyx_gemm_like",2], + "13484950419220835364": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "8863731258634577277": ["convolution_gpu_bfyx_gemm_like",2], + "2586132860307138964": ["convolution_gpu_bfyx_gemm_like",2], + "2844746478867668588": ["convolution_gpu_bfyx_gemm_like",2], + "12553441041059632729": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5485971317082563152": ["convolution_gpu_bfyx_os_iyx_osv16",252], + "1400089266180918877": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "17522452942286240233": ["convolution_gpu_bfyx_gemm_like",2], + "7947870656736319919": ["convolution_gpu_bfyx_os_iyx_osv16",49], + "15417738436777481469": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "4917595053453614536": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "14603590053512154268": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "5688623850477433571": ["convolution_gpu_bfyx_gemm_like",2], + "17790026124881397912": ["fully_connected_gpu_fb_io_ref",1], + "4112696777811320312": ["convolution_gpu_bfyx_os_iyx_osv16",995], + "1474271081523145413": ["convolution_gpu_bfyx_gemm_like",2], + "10773411423039491193": ["convolution_gpu_bfyx_gemm_like",2], + "11809236497308682596": ["convolution_gpu_bfyx_gemm_like",1], + "2146633923143071497": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "2968144776497288135": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "3311449696894745049": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "17472252137354770318": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "7271236108345900406": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17024388383581997032": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "3691705516240577130": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7026575758396092435": ["convolution_gpu_bfyx_os_iyx_osv16",172], + "15898888434295644774": ["convolution_gpu_bfyx_gemm_like",1], + "13046322179198317310": 
["convolution_gpu_bfyx_os_iyx_osv16",883], + "11897113890115321056": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "15661055655577513377": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "12965800692507042874": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "287386909600391846": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "14800933038795670868": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "10721811813682112908": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "14762859593402798050": ["convolution_gpu_bfyx_gemm_like",2], + "4561874206785244358": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "15972805725107234322": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "11140864132614066113": ["convolution_gpu_bfyx_gemm_like",2], + "822162932339827810": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "2909728331855309274": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "16261543808418336089": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "15228614030349540878": ["convolution_gpu_bfyx_gemm_like",1], + "6335628260431943016": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "6545814945227676265": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "3007505068107685147": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "13722424507812159961": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "6418327009347170687": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "15659671804906879034": ["convolution_gpu_bfyx_gemm_like",2], + "15893297349596399716": ["convolution_gpu_bfyx_gemm_like",1], + "6612243861034102250": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "3913951712614107871": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2546472090573813082": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "1013207188944763398": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2679903779216253668": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "11409066626289209846": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "7386836350136973872": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "16211466749116679534": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "1403373982815401451": ["convolution_gpu_bfyx_gemm_like",1], + "7126601602274920416": ["convolution_gpu_bfyx_gemm_like",2], + "8790625191540101806": ["convolution_gpu_bfyx_gemm_like",1], + "11914756126771310827": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "17224820843490443805": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "13683563727561197895": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "14159293183840880884": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "14763982961176216679": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11885660439698926227": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "13448159575961515854": ["convolution_gpu_bfyx_gemm_like",0], + "13779700363254765602": ["convolution_gpu_bfyx_gemm_like",2], + "18125075313255528454": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "17252449599613270108": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "2260718905219541967": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "5688161172644782612": ["convolution_gpu_bfyx_gemm_like",1], + "12896164738668798380": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "5635500901926740475": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "17691748026963003695": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "3513523165606656242": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "15754688305730191542": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "37061093840513038": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6830643729780599672": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "13865227850818392065": 
["convolution_gpu_bfyx_os_iyx_osv16",40], + "8779164026828163571": ["convolution_gpu_bfyx_gemm_like",1], + "352808518345312040": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "4014667229872705228": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16835545111241063900": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "9343876424591024597": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "11092828091552833150": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "16705621644424684055": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "3064765745900772872": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "2008064690158516711": ["convolution_gpu_bfyx_gemm_like",2], + "11447737411040418462": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "16485921493309285440": ["convolution_gpu_bfyx_gemm_like",2], + "17465517455679097501": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "7589346100701197023": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "16615858951735101760": ["fully_connected_gpu_fb_io_ref",1], + "13551767519605460627": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "3830091089824446164": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "5758223108250439377": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "4399656162365214694": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "15571801737237063594": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "6236857636305802170": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8769060267707904998": ["convolution_gpu_winograd_6x3_s1_fused",2], + "1334070221835422461": ["convolution_gpu_bfyx_gemm_like",2], + "8631194673451861459": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "3392632422002516166": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "9402935157379983392": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10001963042016663554": ["convolution_gpu_bfyx_direct_10_12_16",0], + "13680502636898130714": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "5503904988517480229": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "9561367273233389233": ["convolution_gpu_bfyx_gemm_like",2], + "17495070522944546801": ["convolution_gpu_bfyx_os_iyx_osv16",679], + "5176939691838030517": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "4942131377140353094": ["convolution_gpu_bfyx_gemm_like",0], + "14946519992043402896": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "5398895598407183682": ["convolution_gpu_bfyx_gemm_like",2], + "12478309735214802531": ["convolution_gpu_bfyx_os_iyx_osv16",467], + "13753670205703732353": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "2148648022160178995": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "6772239376357727149": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "596528462327775677": ["convolution_gpu_bfyx_os_iyx_osv16",687], + "7512702933193596918": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "9644723852089512961": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "264371219192743152": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "8663545677000846511": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "7200893702912130808": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "5718747983756317198": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2850803473613487020": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2335783507270234825": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "3088402690095697589": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "10112032316939871435": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "1211404528755199615": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11521288355888665606": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "4673127824919879657": 
["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15816980369722540994": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "5781431860747226742": ["convolution_gpu_bfyx_gemm_like",2], + "15365776263895633531": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "3389739049224815652": ["convolution_gpu_bfyx_gemm_like",2], + "7877637636782924097": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "18398231411109020099": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "17520777331163825810": ["convolution_gpu_bfyx_gemm_like",2], + "16462862831307415504": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "5348059680010171141": ["convolution_gpu_bfyx_gemm_like",1], + "7289907211627391947": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "5378151578014945610": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "5629582391075745771": ["convolution_gpu_bfyx_os_iyx_osv16",652], + "11607736973932389832": ["convolution_gpu_bfyx_gemm_like",0], + "2598910952085172410": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "1628593159980574595": ["convolution_gpu_bfyx_os_iyx_osv16",622], + "17342603054992556378": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "3332444589775844154": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "4136736579788862192": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "13161798453564436688": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "18429276095695345973": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "2653651564133701304": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "15962533525948221648": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "9539616823548370185": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "9914440875772341708": ["convolution_gpu_bfyx_gemm_like",1], + "14484004336536993120": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "10978173291465325823": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "7065121716452374910": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "9839670675413379092": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8854234880878427078": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "1194267934213722567": ["convolution_gpu_bfyx_os_iyx_osv16",892], + "2387389473399444503": ["convolution_gpu_bfyx_os_iyx_osv16",678], + "10775785602937893911": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7875724726741958520": ["fully_connected_gpu_fb_oi_ref",2], + "8124166677361481618": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "5057534502588100071": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "9267417754412894234": ["convolution_gpu_bfyx_os_iyx_osv16",361], + "14389915292223442327": ["convolution_gpu_bfyx_os_iyx_osv16",459], + "5896089609470353090": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "2908249767551054613": ["convolution_gpu_bfyx_os_iyx_osv16",641], + "5119087113905313336": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4104062066031480003": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3600066510593746268": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "16998508915819714690": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "10857567623940140266": ["fully_connected_gpu_fb_io_ref",1], + "18199526506796726885": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "5149553691611520515": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "11311890411536750673": ["convolution_gpu_bfyx_gemm_like",2], + "3974589991022739479": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "859377216693940737": ["convolution_gpu_bfyx_gemm_like",2], + "2915952195141872726": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "9274179337770060652": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "14142812374094816721": 
["convolution_gpu_bfyx_os_iyx_osv16",719], + "8100595788531468781": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "12994819742376207273": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "12057000101434512661": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "11047759270093007856": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "15688186132508213638": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "12715500118796263683": ["convolution_gpu_bfyx_gemm_like",2], + "2830742500858558621": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "12445292008737311977": ["convolution_gpu_bfyx_gemm_like",2], + "15158997684077722015": ["convolution_gpu_bfyx_os_iyx_osv16",49], + "13004055504657277105": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "8347537383976709519": ["convolution_gpu_bfyx_os_iyx_osv16",805], + "13398875754083902831": ["fully_connected_gpu_yxfb_ref",2], + "16450345154125804290": ["convolution_gpu_bfyx_os_iyx_osv16",183], + "10900880512948479338": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "6418748992581951435": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "5848293219267886434": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "17882819773586674851": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "12642701787250074691": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "4642402648038764246": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "17026348860895225619": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "4554398307153171456": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4445257000541366640": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "4682062886371423209": ["convolution_gpu_bfyx_gemm_like",2], + "8337457116169698090": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "14969813450703071948": ["convolution_gpu_bfyx_gemm_like",1], + "14167086447992316314": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1961348920992050029": ["convolution_gpu_bfyx_os_iyx_osv16",484], + "8650948093564284852": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "2171768477223405739": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "13204120207726209723": ["fully_connected_gpu_bf_io_gemm",2], + "14218701503304823803": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "16402312692470500253": ["convolution_gpu_bfyx_gemm_like",2], + "541744773413565297": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "18173314625562011976": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14224121742920800990": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "13647773816638053437": ["convolution_gpu_bfyx_gemm_like",2], + "1143214652021653634": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "3300655231758263066": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "13985989113434682460": ["convolution_gpu_bfyx_gemm_like",1], + "16576300898841314587": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "4082218299236753259": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "3138712043201001156": ["convolution_gpu_bfyx_gemm_like",2], + "9493034132406318197": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "2984236836610169934": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "18419183012101393192": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "16969463538496570528": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "9351428703239678614": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "17546650302679801134": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "5089359404080552270": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "11970881115757095265": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "5890683283363730941": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "6678101356115372537": 
["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1838534101161814609": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "17646394278957547470": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "2651385050387738902": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "7875272450497189442": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "4792657031481471098": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "13423515205322319913": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "1431307776181554710": ["convolution_gpu_bfyx_gemm_like",2], + "8146945902795164796": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "9771430089730856496": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "17308907916370632622": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "13374993751390784382": ["convolution_gpu_bfyx_os_iyx_osv16",1070], + "13435416060730279243": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "10842505566649585090": ["convolution_gpu_bfyx_gemm_like",1], + "6326191473779365124": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2705534741438659581": ["convolution_gpu_bfyx_os_iyx_osv16",475], + "17050143605017295447": ["convolution_gpu_bfyx_gemm_like",2], + "11307721164906705899": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "11352094952907979172": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "14512311371993445906": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "13076343553185159307": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2832311883163804015": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "3182329375739242693": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "1077773457856682663": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "17294244481988344762": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "5401946420641519048": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "10526411638069090068": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "8181704316455400709": ["convolution_gpu_bfyx_gemm_like",2], + "16462033126494826292": ["convolution_gpu_bfyx_gemm_like",2], + "12547252593506448096": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "5321807316257768": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "1071663904249509302": ["convolution_gpu_bfyx_gemm_like",2], + "1878953827218615252": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "8321769923556905957": ["convolution_gpu_bfyx_gemm_like",1], + "7053070767227498983": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "12318427976031000768": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "3060709449176556770": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "16936366288366370882": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15485701086886851362": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "14741012384358891350": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "3626743386403140330": ["convolution_gpu_bfyx_gemm_like",1], + "16134637021630473012": ["convolution_gpu_bfyx_gemm_like",1], + "15026219694198820614": ["convolution_gpu_bfyx_os_iyx_osv16",835], + "15671873744670386067": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2870715678422088243": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "5103094815475470596": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "3430998232987873998": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "1127844465496534455": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15958017891397409552": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "13477416097954638887": ["fully_connected_gpu_bf_io_gemm",1], + "2010255131587843361": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11679869968143173159": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "1154469970162137785": 
["convolution_gpu_bfyx_os_iyx_osv16",214], + "14762599606783897222": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "4994591211723226974": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "8260689555974656662": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "11206468937763516689": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "1265107284215037966": ["convolution_gpu_bfyx_gemm_like",2], + "6616869272699525153": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "6953499208425592115": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "879896719155824868": ["convolution_gpu_bfyx_gemm_like",2], + "10111038481447198008": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6519443541076418301": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "11636129433022017868": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "7253709516917901897": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "10236258478395201152": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "15513894336778253285": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "8942221095468681112": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "13410850301164057911": ["convolution_gpu_bfyx_os_iyx_osv16",252], + "6571438978296387721": ["convolution_gpu_bfyx_gemm_like",2], + "2020044486043617858": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "14233219774448115529": ["convolution_gpu_bfyx_gemm_like",2], + "9770300588867836071": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "191374388179598660": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "4184357870886924038": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "6235132681081375078": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "13297875917250935192": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "14577775579978745344": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "9724624621108712962": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "17638692805430115529": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "9729771183572950642": ["convolution_gpu_bfyx_gemm_like",1], + "11327228813412934262": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "9212091835906796243": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8528750110601691390": ["convolution_gpu_bfyx_direct_10_12_16",0], + "4737347018334654530": ["convolution_gpu_bfyx_os_iyx_osv16",94], + "17829854042305231384": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17798636687709019154": ["convolution_gpu_bfyx_os_iyx_osv16",44], + "8571662320744858201": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "5828768432282043413": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "3685556976073096544": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "8047078039937885319": ["convolution_gpu_bfyx_gemm_like",2], + "14691372262153587653": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "18062849937960759210": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4366043672240989175": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "6148022455516485135": ["convolution_gpu_bfyx_gemm_like",2], + "2932914865200583326": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13225520357177380691": ["convolution_gpu_bfyx_gemm_like",2], + "8709632541892447149": ["convolution_gpu_bfyx_gemm_like",2], + "5261762234237034874": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3102816736961785641": ["convolution_gpu_bfyx_os_iyx_osv16",874], + "5409329687010951601": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "10885752780697269323": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "4577872082734403187": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "9614300332487270888": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "6997121306455110286": 
["convolution_gpu_bfyx_os_iyx_osv16",580], + "1071090704302849258": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "937763627727362899": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "6146876760962332928": ["convolution_gpu_bfyx_gemm_like",2], + "9723314434598141024": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "5185125307593023170": ["convolution_gpu_bfyx_os_iyx_osv16",356], + "11933283931932057859": ["convolution_gpu_bfyx_gemm_like",1], + "18120169120088482114": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "17774424004510360936": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "16541535256432192398": ["convolution_gpu_bfyx_gemm_like",2], + "4646176801168621136": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7130694811424715594": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "3119045125726216156": ["convolution_gpu_bfyx_gemm_like",1], + "141166664952282933": ["convolution_gpu_bfyx_gemm_like",2], + "220326805056361171": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8228641750970480948": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "835367600773871252": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "15114370307779942381": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17962578815194404362": ["convolution_gpu_bfyx_gemm_like",2], + "4831224999851230245": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "6812025576584060234": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "9601849246293120347": ["convolution_gpu_bfyx_gemm_like",2], + "15156805695359911457": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "16295660312557315941": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "4515798403196565084": ["convolution_gpu_bfyx_gemm_like",2], + "8122815203088327658": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "5962764672151728219": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "2622434279674583815": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "2582625260054352916": ["convolution_gpu_bfyx_gemm_like",1], + "8809794528993445200": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13660015013041074867": ["convolution_gpu_bfyx_gemm_like",2], + "5948701218437980356": ["convolution_gpu_bfyx_gemm_like",2], + "15548971488532746290": ["convolution_gpu_bfyx_direct_10_12_16",0], + "10800323158234163234": ["fully_connected_gpu_fb_oi_ref",2], + "6876164425008541018": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "14652791434312888296": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "5594180958505308003": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "12942776337163777730": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "16884228931101540030": ["convolution_gpu_bfyx_gemm_like",2], + "5629670679897666607": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "14878347463243157447": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "15488550074426713959": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "18331981707436752260": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "59739211822469868": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "15765198153800696060": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "11169292427557543138": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "9628735886189157469": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "4569338575782832784": ["convolution_gpu_bfyx_gemm_like",2], + "12854272540346358832": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "11831092915967558428": ["convolution_gpu_bfyx_os_iyx_osv16",647 + ] + }, + "72": { + "9226443907548972870": ["convolution_gpu_bfyx_gemm_like",1], + "15675968397825708285": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "10136369729388564720": ["convolution_gpu_bfyx_gemm_like",2], + 
"116291934148608396": ["convolution_gpu_bfyx_os_iyx_osv16",235], + "15331103261044247142": ["convolution_gpu_bfyx_os_iyx_osv16",845], + "9366201112659847392": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "1051506168926530904": ["fully_connected_gpu_bf_io_input_spatial",0], + "16108573960501496757": ["convolution_gpu_bfyx_gemm_like",2], + "17382660912493284320": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "15948383678216076358": ["convolution_gpu_bfyx_os_iyx_osv16",617], + "14974730512607138726": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "9868561386826862471": ["convolution_gpu_winograd_6x3_s1_fused",2], + "2452226948562393335": ["convolution_gpu_bfyx_os_iyx_osv16",767], + "12693511427898130707": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "2669822154816760632": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "1604661321386793876": ["convolution_gpu_winograd_6x3_s1_fused",1], + "13046322179198317310": ["convolution_gpu_bfyx_os_iyx_osv16",883], + "751912075185318190": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "17546566148752689536": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "2044363708106765326": ["convolution_gpu_bfyx_direct_10_12_16",0], + "8079376692609682448": ["convolution_gpu_bfyx_gemm_like",0], + "17839839336294937155": ["convolution_gpu_bfyx_gemm_like",2], + "7720939595094113814": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "13663893159182636270": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "1207026216972160297": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "14729854278671832528": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "16731107540370927220": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "14810839157236175179": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "15860915170591763391": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "3159681096461848644": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "17243648226968859637": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2968031010495399536": ["convolution_gpu_bfyx_gemm_like",2], + "5469227748156438008": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "10377729875228238588": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "9660812093766156608": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "5608133987357542077": ["convolution_gpu_bfyx_os_iyx_osv16",539], + "17006133396401462698": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "16692569816843207989": ["convolution_gpu_bfyx_os_iyx_osv16",646], + "7969848911698660033": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "6580334406272192111": ["fully_connected_gpu_fb_io_ref",2], + "17377293745073971167": ["convolution_gpu_winograd_6x3_s1_fused",2], + "4232250144427804891": ["fully_connected_gpu_bf_io_gemm",1], + "7072606962946873975": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "7368916076070115064": ["convolution_gpu_bfyx_os_iyx_osv16",240], + "16710010075465723498": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11239754372812258455": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "7638626850074132214": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "949330876419581703": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "14335423820860953927": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "4197617702037834389": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "12087141795291232248": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "1103228955716492167": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8762901342272872498": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "3511588484597779204": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "10679760989906275129": ["convolution_gpu_bfyx_os_iyx_osv16",794], + 
"2423754482456771339": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "16312223896859176991": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "10628725059172743408": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "11706378390483804857": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2108296560864415762": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "8616686489737649890": ["convolution_gpu_bfyx_os_iyx_osv16",93], + "9323825370872655346": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "956022649859563080": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "14553856088069405595": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "9780938731831129283": ["convolution_gpu_bfyx_gemm_like",2], + "1701609125136907870": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "6075691042233712335": ["convolution_gpu_bfyx_gemm_like",1], + "10555597973766215754": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7875724726741958520": ["fully_connected_gpu_fb_oi_ref",2], + "7590767013583950613": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17163158934005653629": ["convolution_gpu_bfyx_os_iyx_osv16",428], + "6642767323474835034": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "13869716373706247686": ["convolution_gpu_bfyx_gemm_like",2], + "3138374672801504481": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "2371412124305478965": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "2198278382394812839": ["convolution_gpu_bfyx_os_iyx_osv16",1043], + "2968094709908141988": ["convolution_gpu_bfyx_os_iyx_osv16",3], + "5103094815475470596": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "14100870590396726248": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "17854578307286932628": ["convolution_gpu_bfyx_gemm_like",2], + "18174857480705846286": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "1938086876393565238": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "16758697697363920520": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "4229105529069729944": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "5522698342845820411": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "10173283505468233128": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "11825293922127550847": ["convolution_gpu_bfyx_gemm_like",2], + "15450609897480659306": ["convolution_gpu_bfyx_os_iyx_osv16",929], + "14711697456265712456": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "11077503608116183709": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "13436376034548670107": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "13102754309439605192": ["convolution_gpu_bfyx_gemm_like",2], + "2543995971214089085": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "3355259926747524578": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "14385185911482960528": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "969746749329671447": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "11682323163346544125": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "5740738339752793113": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "1003101267609305257": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "11267742746905371769": ["convolution_gpu_bfyx_os_iyx_osv16",1045], + "17006095064160484022": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "1779941298820543013": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "14335074487552883436": ["convolution_gpu_bfyx_gemm_like",2], + "10729288973933590396": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9065894438656900887": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "5782934278345953016": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "15118142492742177336": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "10001963042016663554": 
["convolution_gpu_bfyx_direct_10_12_16",0], + "77073286362822723": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "10509933181132310969": ["convolution_gpu_bfyx_gemm_like",1], + "3134489458855347772": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "10522649794540845800": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "2194607895573544953": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "13699740641705514374": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "11583017348580874022": ["convolution_gpu_bfyx_os_iyx_osv16",111], + "2344498602308448450": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "8751016391945753900": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "17377315194963069204": ["fully_connected_gpu_fb_oi_ref",1], + "6574971185849732667": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "13613399861925108148": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "4652136280940317116": ["convolution_gpu_bfyx_os_iyx_osv16",740], + "6290317420155851465": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "8127570953237266335": ["fully_connected_gpu_bf_io_input_spatial",0], + "11465965972527519631": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "3526580286148537369": ["convolution_gpu_bfyx_gemm_like",2], + "905780459938651623": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "388828310152538138": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "13054405729329143152": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "13404888565084206853": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "14990645740260870030": ["convolution_gpu_bfyx_os_iyx_osv16",846], + "11070620435959083971": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "16446533347502650316": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "13178480813522103091": ["fully_connected_gpu_bf_io_gemm",2], + "17285815901490707654": ["convolution_gpu_winograd_6x3_s1_fused",2], + "6714886136800883594": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "10433541468308381909": ["convolution_gpu_bfyx_gemm_like",1], + "380316849107383484": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5766507688771440170": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "16504962609450876148": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "8482147530539941792": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "10642327923162019888": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "8133587696326295326": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "5088898934670078153": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7727001441358508665": ["convolution_gpu_bfyx_os_iyx_osv16",22], + "7918742312252115870": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "17392594284473856393": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5221320470007950766": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11806105193035393795": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "10046663998164493552": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10548792624072794724": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "7460672405409009037": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "8965747921518186477": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "2440366541074371090": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "178353385245384751": ["convolution_gpu_bfyx_gemm_like",2], + "3759057398165607194": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "15739274921308457528": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "11919129623429545762": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "4163359403543480821": ["fully_connected_gpu_bf_io_input_spatial",0], + "17421991623849671076": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + 
"11086699387784339943": ["convolution_gpu_bfyx_os_iyx_osv16",495], + "10797908931694274013": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "5592556538784745960": ["convolution_gpu_bfyx_gemm_like",2], + "8578747191812631883": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "3797957937905580811": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "2566302789609970663": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "9263063714383940562": ["convolution_gpu_bfyx_os_iyx_osv16",62], + "17638692805430115529": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "8195881973746570408": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "12523676912856063091": ["convolution_gpu_bfyx_os_iyx_osv16",554], + "13503688893307029975": ["convolution_gpu_bfyx_direct_10_12_16",0], + "14230385851791760020": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "10178951466584845110": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "14420809655798184553": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "6780215829176686721": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "17025268985366223779": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5749536453225343663": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "5331173521406046122": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "7678457226823073886": ["convolution_gpu_bfyx_os_iyx_osv16",91], + "6302958994152837045": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "14670068483447729857": ["convolution_gpu_winograd_6x3_s1_fused",1], + "13558618754911056302": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "17824431042110985323": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "6040286126398028933": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12707946849050970702": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "11929531534620071758": ["convolution_gpu_bfyx_os_iyx_osv16",612], + "14406070210216948643": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "15374625876485618845": ["convolution_gpu_bfyx_gemm_like",2], + "1941341635794709702": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "3192332625020432602": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "13418701036204748812": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "4226968857681929488": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "16025442470600124062": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "16236397968499692493": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11810221946429451169": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",0], + "12365282242489300092": ["convolution_gpu_bfyx_os_iyx_osv16",379], + "13926122593957480821": ["convolution_gpu_winograd_6x3_s1_fused",2], + "2370837049876630969": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "3102816736961785641": ["convolution_gpu_bfyx_os_iyx_osv16",874], + "17790026124881397912": ["fully_connected_gpu_fb_io_ref",1], + "9192665896782282996": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "17947818179123182001": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "16491532291908469567": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "14733510474010040334": ["convolution_gpu_bfyx_gemm_like",2], + "8769060267707904998": ["convolution_gpu_winograd_6x3_s1_fused",2], + "6670327979947471550": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "939718260623752240": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "9133263538092913983": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8104309105061227444": ["convolution_gpu_bfyx_os_iyx_osv16",514], + "1103204698908514224": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "3985659568982275663": ["convolution_gpu_bfyx_os_iyx_osv16",1124], + "5132761922124425835": 
["convolution_gpu_bfyx_os_iyx_osv16",629], + "10157866834809927320": ["convolution_gpu_bfyx_os_iyx_osv16",1042], + "12992194515157698316": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "10049571207493913006": ["convolution_gpu_bfyx_os_iyx_osv16",968], + "5141753233513623264": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "11655994466278963438": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "4865023158176874622": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "10112032316939871435": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "3221221905804708596": ["convolution_gpu_bfyx_gemm_like",1], + "3109104171383198425": ["convolution_gpu_winograd_6x3_s1_fused",2], + "2305461098719675735": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "5876880412336151866": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "4640696923527766618": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "12014527187730671229": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "7134654288295280046": ["convolution_gpu_bfyx_os_iyx_osv16",49], + "16489624657475712467": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "10967218651864700933": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "11883485911218628865": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "13733327241591630239": ["convolution_gpu_bfyx_os_iyx_osv16",752], + "407189201971322683": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "6777045876155144709": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "5627834277145735283": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "10416622008071151225": ["convolution_gpu_bfyx_os_iyx_osv16",546], + "5219399418946822456": ["convolution_gpu_bfyx_gemm_like",2], + "12985650543127289023": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "4161001033681779582": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "9152451371616153112": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13973028408397200796": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "12987636957813312667": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5865480930796299143": ["convolution_gpu_bfyx_os_iyx_osv16",176], + "2702144517025248597": ["convolution_gpu_bfyx_gemm_like",2], + "11443268857010762276": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "4640028527711211109": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "13775529405693629438": ["convolution_gpu_bfyx_os_iyx_osv16",874], + "8971115542951085891": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "14559308665571750465": ["convolution_gpu_bfyx_gemm_like",2], + "5172712078329324967": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13358283026528078900": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "14038261392627717712": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "580936360000782237": ["fully_connected_gpu_bf_io_input_spatial",1], + "2732519635571994212": ["convolution_gpu_bfyx_os_iyx_osv16",987], + "13026555349791486777": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "1332624116953483870": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "954796765467489259": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "14152716242882609401": ["convolution_gpu_bfyx_gemm_like",1], + "156456996459945842": ["convolution_gpu_bfyx_os_iyx_osv16",801], + "16383540667048742064": ["convolution_gpu_bfyx_gemm_like",2], + "1435153323458789173": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "13163146272900339330": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "2065752819810364738": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "17564338309805484464": ["convolution_gpu_bfyx_os_iyx_osv16",482], + "15471470494305051299": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "4217179485243909459": 
["convolution_gpu_bfyx_gemm_like",1], + "875296362957469305": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "2816353973187452604": ["convolution_gpu_bfyx_gemm_like",2], + "706049518431331645": ["convolution_gpu_bfyx_gemm_like",2], + "17035903590837750750": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "6370189612027110022": ["convolution_gpu_bfyx_gemm_like",2], + "10436819182310112786": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "1663285216972929652": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "2150326211917340956": ["convolution_gpu_bfyx_gemm_like",2], + "6133592828563353516": ["convolution_gpu_bfyx_gemm_like",1], + "5448537627319798272": ["convolution_gpu_bfyx_os_iyx_osv16",523], + "16582132711225619740": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "530491406341772040": ["convolution_gpu_bfyx_gemm_like",2], + "7179714714302073459": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "2534408579674556441": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16761856644242716357": ["convolution_gpu_bfyx_os_iyx_osv16",469], + "11632275875447013409": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "11856266545854830143": ["convolution_gpu_bfyx_gemm_like",2], + "11703557271443535142": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "14088382963493477342": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "4085907608404305515": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "16811402686462277562": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8141428150264829362": ["convolution_gpu_bfyx_os_iyx_osv16",1033], + "878892264408839067": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "14691372262153587653": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "1403617451623027879": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "700717277178942679": ["convolution_gpu_bfyx_gemm_like",1], + "7009873605945341897": ["convolution_gpu_bfyx_gemm_like",2], + "481328129206881674": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "5415319660821122528": ["fully_connected_gpu_bf_io_input_spatial",1], + "16609136488331186895": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "2103882464623009432": ["convolution_gpu_winograd_6x3_s1_fused",2], + "7998930863626763670": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "1104489643524273315": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "4481903208484313806": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "7744787957569714828": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "6491244517639245276": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "1509728225855233852": ["convolution_gpu_bfyx_gemm_like",2], + "941626985322260281": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "2527189070714658176": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "7430073011895298582": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "7370273921473161914": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "2172121470071868949": ["convolution_gpu_bfyx_gemm_like",0], + "2571882179292959757": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "9562291747339451180": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "3603706453982734995": ["convolution_gpu_bfyx_os_iyx_osv16",551], + "13248567106128518549": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "534032316469702287": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "9882204352209412039": ["convolution_gpu_bfyx_gemm_like",1], + "265124365266629363": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "15823825508128158158": ["convolution_gpu_bfyx_gemm_like",2], + "11149782181562145291": ["convolution_gpu_bfyx_gemm_like",2], + "13951717514084457087": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "10968768803038046390": 
["convolution_gpu_bfyx_os_iyx_osv16",1079], + "3277243911383750280": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "9942726414918759892": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4992668316921598993": ["convolution_gpu_bfyx_gemm_like",1], + "14906458674793172507": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7394217382008802567": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9933958860597451711": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "4112696777811320312": ["convolution_gpu_bfyx_os_iyx_osv16",995], + "18118237182023167949": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12696412964119109465": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "13611054146745413536": ["convolution_gpu_bfyx_gemm_like",2], + "5390559917122707732": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "14174888981602932979": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "13455881643467418059": ["convolution_gpu_bfyx_gemm_like",1], + "14491949194619001237": ["convolution_gpu_bfyx_os_iyx_osv16",813], + "4408772370026995920": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "15851356529373376076": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "8061914949376516780": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4703107905652287491": ["convolution_gpu_bfyx_gemm_like",2], + "12962552332511702682": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "5170245731599664670": ["convolution_gpu_bfyx_os_iyx_osv16",252], + "2133849627845285277": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "12962558681443556219": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "4099859307693687554": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "15466940145773097237": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "14923692894655929923": ["fully_connected_gpu_bf_io_gemm",0], + "15595549493819416194": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "1076005730007872492": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "16706244336960642883": ["convolution_gpu_bfyx_gemm_like",2], + "13320828013530046693": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "4806571630436601566": ["fully_connected_gpu_bf_io_input_spatial",4], + "11913020016435860608": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "14821616804286068969": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12011982029561277581": ["convolution_gpu_bfyx_os_iyx_osv16",661], + "14749947225382670869": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "7962991673727743706": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "5763440554939527411": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "15962533525948221648": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "6854611304056079417": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "12218337369633748663": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "18150429561058646714": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "17434429579652310107": ["convolution_gpu_bfyx_gemm_like",2], + "2124033349728954551": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "6204725118764552662": ["convolution_gpu_bfyx_gemm_like",1], + "11507538232733291666": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5197105253412476591": ["convolution_gpu_bfyx_gemm_like",2], + "1434535531617424039": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "17108987360340581555": ["fully_connected_gpu_bf_io_input_spatial",2], + "5019077257951332016": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "8075180350084516696": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1095495157025479260": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "12843671306854567956": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "2379484884827231127": 
["fully_connected_gpu_bf_io_input_spatial",0], + "4160656836528944651": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "4542143431130171516": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "7025975403069487257": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "181006047500375768": ["convolution_gpu_bfyx_os_iyx_osv16",280], + "3600066510593746268": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "4338023436590582323": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "7199295899520406795": ["convolution_gpu_bfyx_gemm_like",2], + "3285520504090196295": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "12626014184575881530": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "7924408980408826942": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "15277856047844308598": ["convolution_gpu_bfyx_gemm_like",2], + "18077281411861416889": ["convolution_gpu_bfyx_os_iyx_osv16",1044], + "13025323039227543550": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "10947686124973711385": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "13268525255152984893": ["convolution_gpu_bfyx_os_iyx_osv16",847], + "8207349115037232863": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9421927854269492263": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "7351401242363888463": ["convolution_gpu_bfyx_gemm_like",2], + "2627779045483019709": ["convolution_gpu_bfyx_os_iyx_osv16",812], + "11185156002426041243": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "5927467766675317093": ["fully_connected_gpu_bf_io_input_spatial",2], + "17224181038411430675": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "17798636687709019154": ["convolution_gpu_bfyx_os_iyx_osv16",44], + "15901675909820977223": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "13104509059416300615": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "13800387305792597325": ["convolution_gpu_bfyx_os_iyx_osv16",1040], + "879939701282942121": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "9416186718345824095": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "953306082374100275": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "9524303276541517389": ["convolution_gpu_bfyx_gemm_like",2], + "1791615587935799399": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "13283842370311517843": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "6821855018718422278": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "17222005830854879661": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "5637480705139132901": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "15447513376965243034": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "6585223640997887253": ["convolution_gpu_bfyx_gemm_like",2], + "3429844423226609965": ["convolution_gpu_bfyx_gemm_like",2], + "2128376438627103433": ["convolution_gpu_bfyx_gemm_like",2], + "3273748387141431306": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "7712831597869354170": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "16461809076899645037": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "1594612401422787491": ["convolution_gpu_bfyx_gemm_like",2], + "10900880512948479338": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15378025640603637387": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "4801117903303888658": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "16884228931101540030": ["convolution_gpu_bfyx_gemm_like",2], + "16117738994809548007": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "17252449599613270108": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "17995371099806008878": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11830297960718214360": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "7304346312452588844": 
["convolution_gpu_bfyx_os_iyx_osv16",231], + "14026570177552137240": ["convolution_gpu_bfyx_gemm_like",2], + "16894871557229780934": ["convolution_gpu_bfyx_os_iyx_osv16",547], + "11169292427557543138": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "7947870656736319919": ["convolution_gpu_bfyx_os_iyx_osv16",49], + "4670443882075998209": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "9514210061704584354": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10573920781439771673": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "9979259596137305973": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "1452597292381229708": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3895088069642140043": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8132521728369930959": ["convolution_gpu_bfyx_gemm_like",2], + "1081962464388501987": ["convolution_gpu_bfyx_os_iyx_osv16",873], + "11530101016435264783": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "13374993751390784382": ["convolution_gpu_bfyx_os_iyx_osv16",1070], + "916389941321470163": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "15847413004526420496": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15640202505592598653": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11716771904412649891": ["convolution_gpu_bfyx_os_iyx_osv16",52], + "8434794604559592624": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "12515465135362865565": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "16705621644424684055": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "7994179151788368291": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "13439896617880328331": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "12247991248100147706": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "6233612563637601101": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "17006655627343469372": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "3304589333915676807": ["convolution_gpu_bfyx_gemm_like",1], + "4800587664660105589": ["fully_connected_gpu_bf_io_input_spatial",0], + "13002363400738122017": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8323445733669842657": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11311859068168414878": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "13312514874803986753": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "4282198629458668761": ["convolution_gpu_bfyx_gemm_like",2], + "14487682847898298214": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",0], + "8306337702797456793": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "16744011463988595802": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8260130048649729185": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "14910223536998380801": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "18419183012101393192": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "8916983923551808409": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "6678796313875454849": ["convolution_gpu_bfyx_gemm_like",2], + "3003526572122876385": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "761169277744593430": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "4239133538073498792": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "12581879452540858313": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "9148379585489720669": ["convolution_gpu_bfyx_os_iyx_osv16",845], + "1155389358857780776": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "10290107543739998181": ["fully_connected_gpu_bf_io_input_spatial",2], + "17713034180977313726": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "9762182215179534181": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "2797723586312707948": ["convolution_gpu_bfyx_os_iyx_osv16",329], + 
"7877332346656934022": ["convolution_gpu_bfyx_os_iyx_osv16",678], + "14462438074931673266": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "6329618009202266591": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "2242915551775617989": ["convolution_gpu_bfyx_os_iyx_osv16",291], + "15354185859262170540": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "12976499206227689731": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "17556238490521153146": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "14289048840489035546": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "15689502054035168040": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "12421707187947291166": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "13739257060165119132": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "8464582977975377118": ["convolution_gpu_winograd_6x3_s1_fused",2], + "54975980454651672": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "5041111302824362529": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "1089679781525023551": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "10405183426600618231": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "3012566432840424198": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "8083672466967374860": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "8780604510524622314": ["convolution_gpu_bfyx_os_iyx_osv16",893], + "13723543003759101485": ["convolution_gpu_bfyx_gemm_like",2], + "54019631544204590": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11175955260573469979": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "4299492266819967844": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "5095827462645341808": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "16336482874764861478": ["convolution_gpu_bfyx_gemm_like",2], + "5047419871737940985": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "13059207969254830451": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "8650948093564284852": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "2543041530639980505": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "7806129039150321333": ["convolution_gpu_bfyx_gemm_like",2], + "17739868787095417856": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "16264774056719724826": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "6767159196241633301": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "12085348936192462321": ["convolution_gpu_bfyx_gemm_like",2], + "13898821685774165645": ["convolution_gpu_bfyx_os_iyx_osv16",847], + "12972634653821069685": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "14224121742920800990": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "12190841837604350271": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4237276338897143680": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15293727142789007900": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "12309132521191764927": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8285478622349266483": ["convolution_gpu_bfyx_os_iyx_osv16",914], + "4236174000795439083": ["convolution_gpu_bfyx_gemm_like",2], + "5582896843095691256": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "18068050257421269408": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "15881381297320383917": ["convolution_gpu_winograd_6x3_s1_fused",1], + "12164298124869114517": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "3432296808755992670": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "4867937397499803072": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "8560635685184432720": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "18034648276860485300": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "17009318615658405230": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + 
"9028970753877215614": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8300655194765375060": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "12643423612381102003": ["convolution_gpu_bfyx_os_iyx_osv16",831], + "18136765667969393174": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "2251029128552117936": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "10670103699537731664": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "13698389420396031586": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "17154337492545826355": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "5933743119393822386": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "8939683514448064461": ["convolution_gpu_bfyx_os_iyx_osv16",148], + "9354818521586974021": ["convolution_gpu_bfyx_gemm_like",2], + "3782315919331102574": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "13852065717057446998": ["convolution_gpu_bfyx_gemm_like",2], + "12672995204641007004": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "11066913713501760080": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13210604117940125947": ["convolution_gpu_bfyx_os_iyx_osv16",120], + "4285475880886685878": ["convolution_gpu_bfyx_gemm_like",2], + "706370730287471796": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "13468713306678453952": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "381149736509958403": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "11728824117049687850": ["convolution_gpu_bfyx_gemm_like",1], + "1996860183441418841": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "12319073009094248232": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "5495063314176654751": ["convolution_gpu_bfyx_gemm_like",2], + "17243576882981097341": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "4381329435655511217": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "6214194654733781771": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "12894240573737168362": ["convolution_gpu_bfyx_os_iyx_osv16",941], + "17024388383581997032": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "6114147683777615071": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "755414184406250882": ["convolution_gpu_bfyx_os_iyx_osv16",469], + "17522452942286240233": ["convolution_gpu_bfyx_gemm_like",2], + "15720507574336564201": ["convolution_gpu_bfyx_os_iyx_osv16",618], + "16234606052818596502": ["convolution_gpu_bfyx_os_iyx_osv16",468], + "5912303851874077576": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "6635217802203685464": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "879005904827468163": ["convolution_gpu_bfyx_os_iyx_osv16",763], + "9819596940685093690": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "12534001599784153836": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "3499243120652875549": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6008613375871089139": ["convolution_gpu_bfyx_os_iyx_osv16",755], + "12246408434917478929": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "13477416097954638887": ["fully_connected_gpu_bf_io_gemm",1], + "75742659105146536": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "16190949264253468961": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "4994591211723226974": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "12242618640422208652": ["convolution_gpu_bfyx_gemm_like",0], + "8818070832398055086": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11461581290174106570": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "1697248235682953135": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "7937870623766562191": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "2704063557078535883": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "13300022131572486202": 
["convolution_gpu_bfyx_os_iyx_osv16",327], + "16108759090923335184": ["convolution_gpu_bfyx_gemm_like",1], + "14352303529756685990": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "69439315851965666": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "17802514063213000148": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "875142032423622622": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "797387385159110695": ["convolution_gpu_bfyx_gemm_like",1], + "14353390922580547467": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "11726298758004767743": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "15101680837342453931": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "5795073619189010837": ["convolution_gpu_winograd_6x3_s1_fused",2], + "89439319782574517": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "15650839696475698676": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "6048964584602891448": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "18233660940545931789": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "15239764240622554314": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "6681818065741882453": ["convolution_gpu_bfyx_gemm_like",2], + "3140230065585683313": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "18271689282126907793": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "11092828091552833150": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "10295330953350618042": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "10880081193716628051": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "8101977280003030465": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "10650698451740924172": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "9274179337770060652": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "8006738296385794413": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "9275303306340702111": ["convolution_gpu_bfyx_gemm_like",2], + "8025053805734757314": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "1934379409955686502": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "12380856644683171627": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "2809950092498355574": ["convolution_gpu_bfyx_os_iyx_osv16",1055], + "15857087373591747006": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "9481675228591993785": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "5311718276151327830": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "13644681270630373984": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "18137106379929135901": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "13105192484434299621": ["convolution_gpu_bfyx_gemm_like",2], + "15094664469997373662": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "6025872155179042054": ["convolution_gpu_bfyx_gemm_like",2], + "11062100629646715785": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14363654136811880073": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "16103943009195163681": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "7824524940405130010": ["convolution_gpu_winograd_6x3_s1_fused",2], + "14136097914489095982": ["convolution_gpu_bfyx_os_iyx_osv16",516], + "8929453032482114162": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "4347816192417741558": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "16362139250976572928": ["convolution_gpu_bfyx_os_iyx_osv16",554], + "12160764253455777655": ["convolution_gpu_bfyx_os_iyx_osv16",1114], + "713121569924250372": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "490233152678323691": ["convolution_gpu_bfyx_os_iyx_osv16",182], + "852092858392507925": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "3974589991022739479": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "7172604084103519563": 
["convolution_gpu_bfyx_os_iyx_osv16",994], + "5040730152867713388": ["convolution_gpu_bfyx_gemm_like",2], + "8656468860180713379": ["convolution_gpu_bfyx_os_iyx_osv16",472], + "6458124573210430792": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "8258382025812748961": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "11709992724966310174": ["convolution_gpu_bfyx_os_iyx_osv16",124], + "16129296588866116913": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "9269175963143039426": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "14759179293743468995": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "10890975553758439233": ["convolution_gpu_bfyx_gemm_like",1], + "15110359240685619357": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "9321208819255762521": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "10930115765550856328": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "18268811652302076976": ["convolution_gpu_bfyx_gemm_like",1], + "4750894407873652809": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "14046114605615338907": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "6996376303337512293": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "13642146548740074992": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "18337160891834020517": ["convolution_gpu_bfyx_os_iyx_osv16",151], + "14744368497944610864": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6213386558868267629": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "16172528828198474326": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "15688186132508213638": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "296142385116663420": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "3883845471211207871": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "17087740929472936216": ["convolution_gpu_bfyx_os_iyx_osv16",94], + "16290626406346691996": ["convolution_gpu_bfyx_os_iyx_osv16",767], + "12232696287029987946": ["convolution_gpu_bfyx_os_iyx_osv16",459], + "13708979487306970634": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "641417817126876622": ["convolution_gpu_bfyx_gemm_like",2], + "12388375914105990324": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "13464226348405628455": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "1154228007901031779": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "10682918518101379579": ["fully_connected_gpu_bf_io_input_spatial",2], + "8376077531098664520": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "7431849514656037251": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "11052732052072367261": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "12874626654611400042": ["convolution_gpu_bfyx_os_iyx_osv16",853], + "11723735945517472199": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "11455518069358829249": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "15192024816519005250": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "15322019609805777935": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "11956435900037329302": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "13842309033760176194": ["convolution_gpu_bfyx_gemm_like",2], + "5643908654122573882": ["convolution_gpu_bfyx_os_iyx_osv16",562], + "5091558853871982858": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11970881115757095265": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "6471563320494376693": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "8479958930889587809": ["fully_connected_gpu_yxfb_ref",0], + "16348402367953880206": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "4456004887590847716": ["convolution_gpu_bfyx_gemm_like",1], + "10408322429232132983": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "3332334993503432420": 
["convolution_gpu_bfyx_os_iyx_osv16",1080], + "583303098958523195": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "13954144830230671601": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "11031625790234068916": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "14554225625951128811": ["convolution_gpu_bfyx_os_iyx_osv16",417], + "4720851194954041037": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "6323026044750482867": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "1540041682425757361": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "11107930597263802755": ["convolution_gpu_bfyx_gemm_like",2], + "11292995457386147494": ["convolution_gpu_bfyx_os_iyx_osv16",417], + "16896833230469488924": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "11649407835105973949": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "17026284168840448378": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "4196367396954155354": ["convolution_gpu_bfyx_gemm_like",2], + "1653274345637156919": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "17104611871050967957": ["convolution_gpu_winograd_6x3_s1_fused",2], + "4584970211859494304": ["convolution_gpu_bfyx_direct_10_12_16",0], + "18084635102736402756": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "12179581684777023804": ["convolution_gpu_bfyx_gemm_like",2], + "192209423643075326": ["convolution_gpu_bfyx_gemm_like",1], + "18432787283148809023": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "17281202179589913619": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "13139625572508441980": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "17877776363798202236": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "17515573322312447679": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "17001502418583498926": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "4674416595144505741": ["convolution_gpu_bfyx_gemm_like",2], + "5994204139128667921": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "13119479079474639169": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "8264178890341675354": ["convolution_gpu_bfyx_os_iyx_osv16",1033], + "787363431787954804": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "12935563359569230797": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "10987953316324712538": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "15609627722687211129": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "10328182165125764988": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "10554266898346470422": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "570683988452622223": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "14418429155823196539": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "6205240287062600210": ["convolution_gpu_bfyx_gemm_like",2], + "12494969618927201911": ["fully_connected_gpu_fb_oi_ref",1], + "6904130543085920483": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "7353563160591978243": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "1361159591875955678": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17123153447808465303": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "4550028191070279999": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "17427036330773218054": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "16945184617367657570": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "16781127329510211966": ["convolution_gpu_bfyx_gemm_like",2], + "3727142736386026852": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "2129726780118554358": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "5057534502588100071": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "6553736978928374036": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "8153567933591966877": 
["convolution_gpu_bfyx_os_iyx_osv16",731], + "3909551222373722085": ["convolution_gpu_bfyx_os_iyx_osv16",44], + "10109431802089940590": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "4428101657497677982": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "14043770215999952932": ["convolution_gpu_bfyx_gemm_like",2], + "10848277915422577656": ["convolution_gpu_bfyx_os_iyx_osv16",421], + "6578908625437515675": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "5485971317082563152": ["convolution_gpu_bfyx_os_iyx_osv16",252], + "12517838703662330663": ["convolution_gpu_bfyx_os_iyx_osv16",756], + "15160738482264643601": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "6204183474669103812": ["convolution_gpu_bfyx_os_iyx_osv16",889], + "14104238386345631681": ["convolution_gpu_winograd_6x3_s1_fused",1], + "6343888265369366589": ["convolution_gpu_bfyx_os_iyx_osv16",572], + "15897477855246170861": ["convolution_gpu_bfyx_gemm_like",2], + "14251848023416168295": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "1760690277175249985": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "482564204402769504": ["convolution_gpu_bfyx_gemm_like",1], + "7757331094141318304": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "11529876081402974396": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "4862529593282936100": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "17208186152576814861": ["convolution_gpu_bfyx_gemm_like",1], + "17729546848373991614": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "18232278892738147217": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "18136135457402651842": ["convolution_gpu_winograd_6x3_s1_fused",2], + "4967444801764057340": ["convolution_gpu_bfyx_os_iyx_osv16",361], + "15696910741835640150": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "3835286851569826052": ["convolution_gpu_bfyx_gemm_like",2], + "13484950419220835364": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "12297371032753209816": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "2100891581797371600": ["convolution_gpu_bfyx_os_iyx_osv16",274], + "4738743763536059708": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "4716188972902735458": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "2802810524370514276": ["convolution_gpu_bfyx_gemm_like",1], + "18251360413872841969": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "509781001842353609": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "1306339989221885682": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "14811022197918391667": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "16863960779539003201": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "5482851829165191681": ["convolution_gpu_bfyx_os_iyx_osv16",645], + "11686670048744589243": ["convolution_gpu_bfyx_gemm_like",2], + "1545105800386716684": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "7903891232234389925": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "15790005937034794347": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "18259656768460999562": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "11754316727756881612": ["convolution_gpu_bfyx_os_iyx_osv16",475], + "3524531620118359828": ["convolution_gpu_bfyx_os_iyx_osv16",194], + "14599780481362761532": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3281207855459771997": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "7606728651572102823": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "929378940515745198": ["convolution_gpu_bfyx_os_iyx_osv16",41], + "4479979951990338510": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "2622434279674583815": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "13176385389367548697": 
["convolution_gpu_bfyx_gemm_like",1], + "13447028922679236865": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "291868903926685441": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "7807983899017500046": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "5495776091407365966": ["convolution_gpu_bfyx_gemm_like",2], + "1500571771538985941": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "10989937450490049763": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "5326247361632903583": ["convolution_gpu_bfyx_gemm_like",2], + "11661208196482963286": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "435888248913413834": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "8268533335852735248": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "2096779676054335057": ["convolution_gpu_bfyx_gemm_like",2], + "15011504472108164173": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8881135571874888085": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "1706927777850488363": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "4026686872534942904": ["convolution_gpu_bfyx_os_iyx_osv16",174], + "16071723603031305677": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "3737552767159920174": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "16683089431066989909": ["convolution_gpu_bfyx_gemm_like",2], + "7177837234452118325": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "5622089373755094139": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "15781622938833984014": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "12644942072153919043": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "9421643783312790618": ["convolution_gpu_winograd_6x3_s1_fused",2], + "724953082687879224": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "2653651564133701304": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "9216608098626790565": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "10000618285883395700": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "4999505377862312410": ["fully_connected_gpu_bf_io_gemm",2], + "14274685812676150168": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "7457899998356343871": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "529543453251381109": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "3007637520820789085": ["convolution_gpu_bfyx_os_iyx_osv16",111], + "5192552432194195116": ["convolution_gpu_bfyx_gemm_like",2], + "17599383258252980421": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "13603318842632052764": ["convolution_gpu_bfyx_os_iyx_osv16",380], + "10398572248321217585": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "2730604806511016352": ["convolution_gpu_bfyx_gemm_like",2], + "12655099960717366198": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "15488550074426713959": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "14719421757340260468": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "12668149981216388765": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "6623182990939010641": ["convolution_gpu_bfyx_gemm_like",1], + "15129834325410878425": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "12040626513219974957": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "18203935818408469865": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "11430400968543668873": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "16720108310653948550": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9751582946441607796": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "6391847213494189692": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9824678205469832038": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "17419874083634480896": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "8655883535274781128": 
["convolution_gpu_bfyx_gemm_like",1], + "9553032671453999824": ["convolution_gpu_bfyx_os_iyx_osv16",95], + "16146350476627599543": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5381578460674280089": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "14681717813022425567": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "10892456883214928095": ["convolution_gpu_bfyx_os_iyx_osv16",93], + "8707189142909022305": ["convolution_gpu_bfyx_gemm_like",2], + "5275016494706355806": ["convolution_gpu_bfyx_os_iyx_osv16",832], + "4062706195708729345": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "15856268902838573812": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "3122997634505472500": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "425744529089575241": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "14515066741400300669": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "6062246008880097669": ["fully_connected_gpu_bf_io_input_spatial",0], + "17517495652165026573": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "3255465741612432300": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "9454512817077883797": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "15778834188130183853": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "3930314908786112883": ["convolution_gpu_bfyx_gemm_like",2], + "1249137685908951501": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "13410850301164057911": ["convolution_gpu_bfyx_os_iyx_osv16",252], + "3691705516240577130": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "13702692566238948173": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "3150231129728961455": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "475043738497218394": ["convolution_gpu_bfyx_os_iyx_osv16",518], + "10791067159964399241": ["convolution_gpu_bfyx_os_iyx_osv16",310], + "16044646335477470657": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "10294185397756053636": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "9173631510896381179": ["convolution_gpu_bfyx_gemm_like",2], + "16370218798911151331": ["convolution_gpu_bfyx_os_iyx_osv16",287], + "10978173291465325823": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "2973436171295280783": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "9226912483632588371": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "11528417522960871233": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "16561224775421968533": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "6407471972820516685": ["fully_connected_gpu_fb_oi_ref",0], + "5093049998173715787": ["convolution_gpu_bfyx_gemm_like",2], + "9069334144391048686": ["convolution_gpu_bfyx_os_iyx_osv16",543], + "14429081455612806819": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "5115661026367632863": ["convolution_gpu_bfyx_os_iyx_osv16",765], + "9475130054420979752": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "3392693938352572136": ["convolution_gpu_bfyx_gemm_like",2], + "10783981060353445280": ["convolution_gpu_bfyx_os_iyx_osv16",52], + "6988674007771237080": ["convolution_gpu_bfyx_gemm_like",2], + "11446745541571732900": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12278364834477923930": ["convolution_gpu_bfyx_gemm_like",2], + "8787816339967963727": ["convolution_gpu_bfyx_os_iyx_osv16",240], + "17856816245251319111": ["convolution_gpu_bfyx_os_iyx_osv16",845], + "1622880009460832832": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "7143510787416483146": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "9723314434598141024": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "17306482303091342504": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "5183231560876991543": 
["convolution_gpu_bfyx_os_iyx_osv16",993], + "2806529556090896246": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "3001615302961701154": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "1143214652021653634": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "13317417676446624018": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "15399245700982979379": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "12081835728078383819": ["fully_connected_gpu_bf_io_input_spatial",2], + "10414903047695486119": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "17370051888730874220": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "5277400567128489977": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "5179760459095053114": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "8843585527713905568": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "12557015880639217508": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "1890739204389692970": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "16955653765071712611": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "3499109651698979012": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "17248756229500447131": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "17318287523550546026": ["convolution_gpu_bfyx_gemm_like",2], + "11490143853656040028": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "16294825599850364701": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "18172711677056449158": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "3796274347773622633": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "1436052878894538927": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "4915831715914920982": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "12516911293946682547": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "4991419288164762786": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "6688522645556262131": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "7271236108345900406": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17832542092610191859": ["convolution_gpu_bfyx_os_iyx_osv16",240], + "14281201038135286621": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "9525853014023664813": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "8788703258318141635": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "12553441041059632729": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "18218631037214746168": ["convolution_gpu_bfyx_gemm_like",2], + "11455843788148231615": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "14258499419905714808": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8757900457181374694": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "8507854696766492454": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "10424278617647597641": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "16509472637458153234": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "2912858944747613525": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "14133958262039763609": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "15713964605078748923": ["convolution_gpu_bfyx_gemm_like",2], + "12255528292506999241": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "6232363902828992968": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "6341197991729122563": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "2052712465925238009": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "18356980026934328781": ["convolution_gpu_bfyx_os_iyx_osv16",1037], + "5779388310240896974": ["convolution_gpu_bfyx_os_iyx_osv16",80], + "13204120207726209723": ["fully_connected_gpu_bf_io_gemm",2], + "12136029303893296753": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "8107447526839063293": ["convolution_gpu_bfyx_os_iyx_osv16",792], + 
"4766071144928072260": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "14046990030104971367": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "12971822824884826169": ["convolution_gpu_bfyx_gemm_like",2], + "16589607587365212240": ["convolution_gpu_bfyx_gemm_like",2], + "10084794570892043447": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "1364546124782880196": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "2040762223425679479": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "7817036102984218692": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "1999979442136861875": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "13681462437496627948": ["convolution_gpu_bfyx_direct_10_12_16",0], + "2231648183489019418": ["convolution_gpu_bfyx_os_iyx_osv16",428], + "17343050785312683560": ["convolution_gpu_bfyx_os_iyx_osv16",186], + "16131448347558322280": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "14716719350966652036": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "11327097771110264965": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "14068780861332616363": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "12788968383428254917": ["convolution_gpu_bfyx_direct_10_12_16",0], + "16522364268583242080": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "14603590053512154268": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "1316444335300814745": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9058996149754556268": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "2322559721899919275": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "4544242784357021697": ["convolution_gpu_bfyx_gemm_like",2], + "5659168916726488798": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "16431857516454692096": ["convolution_gpu_bfyx_os_iyx_osv16",185], + "3689722043202617487": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "13898284586432291433": ["convolution_gpu_bfyx_gemm_like",1], + "8127190765748950828": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "1372939511728986224": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "13990028451169604107": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16036386660666696362": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "5336120047683197088": ["convolution_gpu_bfyx_gemm_like",2], + "15661322183507404821": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "2502125887857336825": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "3041752019114501584": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "6065819201836017182": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "16710651492402564794": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "12245096462203481681": ["convolution_gpu_bfyx_os_iyx_osv16",511], + "5485749317130402302": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "11192356850081328892": ["convolution_gpu_bfyx_direct_10_12_16",0], + "13489318651148001664": ["convolution_gpu_bfyx_gemm_like",1], + "15349944413643626251": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "577182964135927041": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "11772741918108731396": ["convolution_gpu_bfyx_os_iyx_osv16",620], + "17542176922797334839": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "10302338806536775954": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "14389915292223442327": ["convolution_gpu_bfyx_os_iyx_osv16",459], + "11992625045241269569": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "6090625728451718945": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16245760498096322525": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "9631545863582097486": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "9381304526221508530": ["convolution_gpu_bfyx_os_iyx_osv16",891], + 
"318377908569897093": ["convolution_gpu_bfyx_gemm_like",2], + "14103112843209793966": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "689445825453914111": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "18375125668176498051": ["convolution_gpu_bfyx_gemm_like",2], + "15392077168521832549": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "1172103288112689821": ["convolution_gpu_bfyx_os_iyx_osv16",941], + "11022847760121601465": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "4660288622381620227": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "2140514316203117958": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "17775705003104146872": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "13327653786981478088": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13253775441326432265": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "15767973630744679517": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10384537928514123040": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "8790625191540101806": ["convolution_gpu_bfyx_gemm_like",1], + "2235210915304938149": ["convolution_gpu_bfyx_gemm_like",2], + "6571438978296387721": ["convolution_gpu_bfyx_gemm_like",2], + "13855438905855887272": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "15123868617509445149": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12181607120522804433": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "10267260789603562117": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "17847109385592002207": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "16998508915819714690": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "7026575758396092435": ["convolution_gpu_bfyx_os_iyx_osv16",172], + "2294026590516781945": ["convolution_gpu_bfyx_os_iyx_osv16",943], + "1954052357826969119": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "2940027113687311893": ["convolution_gpu_bfyx_gemm_like",2], + "12141300895511301068": ["convolution_gpu_bfyx_os_iyx_osv16",892], + "14546281065004619074": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "10429613013253088132": ["convolution_gpu_bfyx_gemm_like",2], + "15975964562807570772": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17791773192152464021": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3438116423688595487": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "5629670679897666607": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "7605139219344415117": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "9454954846682513038": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "11198908896401597838": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "5609922876429907954": ["convolution_gpu_bfyx_gemm_like",2], + "16884396694505987920": ["convolution_gpu_bfyx_os_iyx_osv16",139], + "13497225521878034159": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7130694811424715594": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "941829593638869991": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "15315327794058441258": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "276407276027553756": ["convolution_gpu_bfyx_os_iyx_osv16",176], + "2526832080529662683": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "9399994156762372761": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "14912119584313592912": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "8295126647635181949": ["convolution_gpu_bfyx_gemm_like",2], + "15669490019428002270": ["convolution_gpu_bfyx_os_iyx_osv16",986], + "14907097142953816744": ["convolution_gpu_bfyx_gemm_like",1], + "6410682026872155392": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "16403423801823379909": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "8618835732380720921": 
["convolution_gpu_bfyx_os_iyx_osv16",333], + "6345550009198921347": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "8176012042686275874": ["convolution_gpu_bfyx_os_iyx_osv16",863], + "3349519148124496343": ["fully_connected_gpu_bf_io_gemm",2], + "5848293219267886434": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "10309504812060596568": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "5334190564423375247": ["convolution_gpu_bfyx_os_iyx_osv16",926], + "2638131332283395057": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15381833359831622179": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "6443517114667332732": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "4086556132337751931": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "6181651715051152713": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11060822686394981344": ["convolution_gpu_bfyx_gemm_like",1], + "13713501506522022845": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5157249499936659040": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "17825280904760131680": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "4750755523645265967": ["convolution_gpu_bfyx_gemm_like",2], + "17796310681498690253": ["convolution_gpu_winograd_6x3_s1_fused",2], + "6139574161497189424": ["convolution_gpu_bfyx_direct_10_12_16",0], + "12541834857357563605": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "9259437778054905599": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "5409924335138540834": ["convolution_gpu_bfyx_os_iyx_osv16",526], + "18133334552107213128": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "1478419046264331178": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "16881283637687482989": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "6418327009347170687": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "4355933224673863178": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "12990341489637414845": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "10492056481694320580": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "9447458159095730492": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "1471837664358450291": ["convolution_gpu_bfyx_gemm_like",2], + "6860503758000008398": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "3831261590121101287": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "9101018613418825655": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "659150305191479097": ["convolution_gpu_bfyx_os_iyx_osv16",902], + "3002986032379998259": ["convolution_gpu_bfyx_os_iyx_osv16",151], + "10917498758625273194": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "3388752887767453958": ["convolution_gpu_bfyx_gemm_like",2], + "13890118723041457532": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "2406816735581074778": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "153117141968471446": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "3036808833459559381": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "8321769923556905957": ["convolution_gpu_bfyx_gemm_like",1], + "10279778381617181802": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "14959566236432790882": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "14738573151275130683": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "4957638663977636791": ["convolution_gpu_bfyx_gemm_like",2], + "14991602704357959545": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "6181272224000872375": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "804195263636995800": ["convolution_gpu_bfyx_gemm_like",2], + "10292243973236220688": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "4958222070605478947": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "15197248015210313435": 
["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4014667229872705228": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2552187713769926425": ["convolution_gpu_bfyx_os_iyx_osv16",835], + "16711955423531846725": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "423221712829930726": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "5020788604681810984": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "17793292063552633023": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "6149673627320838019": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "17833517350994024381": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "7279393739634103483": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "879896719155824868": ["convolution_gpu_bfyx_gemm_like",2], + "872401732136570312": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "9057036344533510776": ["convolution_gpu_bfyx_gemm_like",2], + "10412588668458621135": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "16706121580364790904": ["convolution_gpu_bfyx_gemm_like",2], + "16789135236017252073": ["convolution_gpu_bfyx_gemm_like",2], + "9389555743403158574": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16601702334097258697": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "6129602738379919488": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "14025496192869856801": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "14841539539334726292": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "4437258459981739942": ["convolution_gpu_bfyx_os_iyx_osv16",1042], + "18210370419559876426": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "17309326904418811234": ["convolution_gpu_bfyx_os_iyx_osv16",552], + "4818231379191523896": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "8203550467004532364": ["convolution_gpu_bfyx_os_iyx_osv16",1040], + "2287356884312581209": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "14763982961176216679": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "6981537186704688907": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "2597453794298356435": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "2842103889477438816": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6820284286806022849": ["convolution_gpu_bfyx_gemm_like",2], + "13184662326021747000": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "15891662883560480723": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "10795104632256101599": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "4161141078006269526": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11619548409913646265": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "2012181953284568566": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "11756881293845417212": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "8337820318779061494": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "7852745450437172519": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15410074937424854348": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "5115134711994944288": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "288853243482418538": ["convolution_gpu_bfyx_os_iyx_osv16",874], + "877436308867220589": ["convolution_gpu_bfyx_gemm_like",2], + "13540002981450186147": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "11932770338770247767": ["convolution_gpu_bfyx_os_iyx_osv16",804], + "12635265188475834607": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "13538051178827008933": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "7351733901977025859": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "4488336106517889531": ["convolution_gpu_bfyx_os_iyx_osv16",80], + "1889171157980977747": ["convolution_gpu_bfyx_gemm_like",2], + "6634330132674952638": 
["convolution_gpu_bfyx_os_iyx_osv16",179], + "9367157746678824712": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "3180320769716158201": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "4701832665603867798": ["convolution_gpu_bfyx_os_iyx_osv16",618], + "9040046051053703359": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "5594180958505308003": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "14057348639391787117": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "7869916853707978306": ["convolution_gpu_bfyx_os_iyx_osv16",459], + "1028160614515220430": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "989564341557094953": ["convolution_gpu_bfyx_os_iyx_osv16",807], + "18173314625562011976": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3199841714087553410": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "17977676737774695825": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "7866128397931438774": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "9194788897910888066": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "11528310408333718862": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "6087091876057515304": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7941729567451949422": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "14695781272831602408": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "4185398348055518182": ["convolution_gpu_bfyx_os_iyx_osv16",420], + "15320845027635796583": ["convolution_gpu_bfyx_gemm_like",2], + "12790788016297794214": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "1059505639883914386": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "9285566577169147378": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "6324565723045697080": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "4894227264080887361": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "15451919862187018297": ["convolution_gpu_winograd_6x3_s1_fused",2], + "59739211822469868": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "15287650965861631130": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "1907439276166837309": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "3800011935243649447": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "7000524935770116969": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "15984885011101717258": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "18202222342562516071": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "8451212914744825089": ["convolution_gpu_bfyx_gemm_like",2], + "3752171257634205726": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "10090923790949378407": ["convolution_gpu_bfyx_gemm_like",2], + "18062849937960759210": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "17525531790109748810": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "6928835003016610382": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "5312140481706133684": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "9796621763733208035": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "16936366288366370882": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16559140502701231107": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "17738299860390552088": ["convolution_gpu_bfyx_direct_10_12_16",0], + "17422822627612865758": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9529614587861271730": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "8402692278765063674": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "12773693193167844110": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "9642229389394495047": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "10690972785852373520": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "1071007164550012186": ["convolution_gpu_bfyx_os_iyx_osv16",21], + 
"8961138963663532667": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "6531171505861182429": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "5801429077171542466": ["convolution_gpu_bfyx_os_iyx_osv16",94], + "15916505622570323098": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "15426960908024585800": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "16995873636564597028": ["convolution_gpu_bfyx_os_iyx_osv16",853], + "4265693151382066296": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "5824801192141531089": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "12293786134765875615": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "8913950860101596091": ["convolution_gpu_bfyx_os_iyx_osv16",729], + "4561874206785244358": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "7802311886554362782": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "14435120971846098308": ["convolution_gpu_bfyx_os_iyx_osv16",562], + "4220826666482500445": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "1573498199681662714": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "1033385936344875354": ["convolution_gpu_bfyx_gemm_like",2], + "3706994659266083979": ["convolution_gpu_bfyx_os_iyx_osv16",559], + "2451712485584835395": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "18125732229366977468": ["convolution_gpu_winograd_6x3_s1_fused",2], + "8032685176029570383": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "1819720745131968914": ["convolution_gpu_bfyx_gemm_like",2], + "6656593119788274992": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "9622546530872848323": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "16683485007140805060": ["fully_connected_gpu_fb_io_ref",1], + "11868551452004726281": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "10023279637210292010": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "10784073615329190425": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "1451466106918423837": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "9840495023131952174": ["convolution_gpu_winograd_6x3_s1_fused",1], + "5629373398445592781": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "805221045541170643": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "17015791782274123780": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "7575634241190730697": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "18377298651236993830": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "8866736221671835567": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "13786357802945430475": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "861419637283812778": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "8100595788531468781": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "17423645390621980919": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "14131851237755716991": ["convolution_gpu_bfyx_os_iyx_osv16",364], + "12558716383635737426": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "12929981792125924963": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "16094174852600023296": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "15882969506682501496": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "14826791706471872785": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15997754881872769378": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "4885944395876887711": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "11878734040194151073": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17050143605017295447": ["convolution_gpu_bfyx_gemm_like",2], + "5953754321266570854": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "5843679089588930933": ["convolution_gpu_bfyx_gemm_like",2], + "2909728331855309274": ["convolution_gpu_bfyx_os_iyx_osv16",952], + 
"15993427814066246646": ["convolution_gpu_bfyx_gemm_like",1], + "7545013298074733778": ["convolution_gpu_bfyx_os_iyx_osv16",549], + "9270950131920019932": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "3036512701943687724": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "18277685132620834972": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "6351347283201596793": ["convolution_gpu_bfyx_os_iyx_osv16",57], + "11473442921040533207": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "18245935804520236353": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "3860603464276263676": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "12214162812589030126": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "15947699374684516369": ["convolution_gpu_bfyx_gemm_like",2], + "11992353959766718397": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "3644282167178264526": ["convolution_gpu_bfyx_gemm_like",2], + "14578291812739325465": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "10632020369698615114": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "5570311824197099845": ["convolution_gpu_winograd_6x3_s1_fused",2], + "6644418194983229139": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "4747159205186229582": ["convolution_gpu_bfyx_os_iyx_osv16",479], + "3215659303601163167": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "11324851661119942609": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "1138439260035360722": ["convolution_gpu_bfyx_os_iyx_osv16",547], + "8460847842045253466": ["convolution_gpu_bfyx_os_iyx_osv16",388], + "11428599290755097395": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "15914512645931208899": ["convolution_gpu_bfyx_gemm_like",2], + "16779678846332091086": ["convolution_gpu_bfyx_os_iyx_osv16",523], + "13192808619929896995": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "16474284418841532356": ["convolution_gpu_bfyx_gemm_like",2], + "9513032457323269513": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "18043340998699622388": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "10792503079194374004": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "3106710091841093202": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "18424400171776141118": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "397770940444464146": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11582534256623549131": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "16286085532892593349": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "4474697990228400564": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9761573038170759563": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "10237524128771958432": ["convolution_gpu_bfyx_gemm_like",2], + "8275277322582733101": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "10025839973092358719": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "2497756607567197523": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "5524218746051008792": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "7688176479120305539": ["convolution_gpu_bfyx_os_iyx_osv16",918], + "4424217045094988504": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "10308175009371219583": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "5893940382830835820": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "3033264172690274208": ["convolution_gpu_bfyx_os_iyx_osv16",853], + "5656623709782744241": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "12949204491386872217": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "12956726277674279950": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "8057302050645780813": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "2738256633362038820": ["convolution_gpu_bfyx_gemm_like",2], + 
"3067806959725855130": ["convolution_gpu_bfyx_os_iyx_osv16",512], + "4764776977138392550": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "11771014003680394135": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "11115684531624462986": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "10306542963828398049": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "220326805056361171": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "17829047941256922307": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "3889519976910355277": ["fully_connected_gpu_bf_io_input_spatial",2], + "7009735776703529573": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "2355214244972870639": ["convolution_gpu_bfyx_os_iyx_osv16",514], + "5805383505505929391": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "13384754476437374504": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "792684262493086891": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "11243840588602365090": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "4795705973706796563": ["fully_connected_gpu_bf_io_input_spatial",1], + "9860570706348640782": ["convolution_gpu_bfyx_gemm_like",2], + "1410630713443793537": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "6769243149577568817": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "14962768577232034246": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "1107027047188366075": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "11609821372586026178": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "13972357557211413688": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "2908249767551054613": ["convolution_gpu_bfyx_os_iyx_osv16",641], + "17101789600628162503": ["convolution_gpu_bfyx_direct_10_12_16",0], + "5159470523468873105": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "9462315044265139531": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "7915318733663535312": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "287386909600391846": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "3750338655074082587": ["fully_connected_gpu_yxfb_ref",2], + "5245308722062496788": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "5235375820995365354": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "2095802691829304676": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8124736388338424498": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "15884763176333003771": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "10892706534058849825": ["convolution_gpu_bfyx_os_iyx_osv16",284], + "10309083227104422150": ["convolution_gpu_bfyx_os_iyx_osv16",616], + "863952266514375915": ["convolution_gpu_bfyx_os_iyx_osv16",517], + "14746359019867963124": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8739347545059610410": ["convolution_gpu_bfyx_gemm_like",2], + "1698321314111848001": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "16230621843665445228": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "9404677451270692749": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "12493863403516600413": ["convolution_gpu_bfyx_gemm_like",1], + "13093429681061786539": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "3325727286860556323": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "7474639594232203854": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "15485701086886851362": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "7148542290597073512": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "14762599606783897222": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "2995134938466176198": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11951606039079763598": ["convolution_gpu_bfyx_gemm_like",2], + "18180820925685532104": ["convolution_gpu_bfyx_os_iyx_osv16",563], + 
"1096671695414716274": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "4304041922043496030": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6902644989079870993": ["convolution_gpu_bfyx_gemm_like",1], + "5303170164698694791": ["fully_connected_gpu_bf_io_gemm",2], + "16430562172386510259": ["convolution_gpu_bfyx_gemm_like",2], + "578703329577922869": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "3643250372952944907": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "6355395905401306995": ["convolution_gpu_bfyx_gemm_like",2], + "9714764457768279762": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "13649894122307008732": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "6438522646185979880": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "13709111882513486557": ["convolution_gpu_bfyx_os_iyx_osv16",617], + "14006248791647711759": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "15352245788978088971": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "3480732841490521799": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "13654816209891478730": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "13761566845514364807": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "4013707396889204359": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "13816104794723484993": ["convolution_gpu_winograd_6x3_s1_fused",2], + "1541754036637209097": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "593712935037568960": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "3106591708459602370": ["convolution_gpu_bfyx_os_iyx_osv16",95], + "4871907623235871050": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "9373353053843326128": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "6146876760962332928": ["convolution_gpu_bfyx_gemm_like",2], + "8701248964531180496": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "3362190082518348071": ["convolution_gpu_bfyx_gemm_like",2], + "3063055767192991776": ["convolution_gpu_bfyx_os_iyx_osv16",1017], + "5504757952698692953": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "18140951659547259039": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16947969669087411530": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "994489782629179836": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "1532263118203058517": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "5334291640387922287": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "8329846097322076175": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "13810995219720233595": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "7662200927459001757": ["convolution_gpu_winograd_6x3_s1_fused",2], + "18109284647478027063": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "17094948685292534952": ["convolution_gpu_bfyx_os_iyx_osv16",174], + "6882621854468565774": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "13865227850818392065": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "5585398540591396124": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "15411474884532403722": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "14011124615649605281": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "17882819773586674851": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "13951781924205611716": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "7881187047171099732": ["convolution_gpu_bfyx_gemm_like",2], + "11215217005872946038": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "6942622405269419082": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "5406129421969383274": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "16037141448095945650": ["convolution_gpu_bfyx_os_iyx_osv16",417], + "2438374917504708831": ["convolution_gpu_bfyx_gemm_like",2], + "272730229972987861": 
["convolution_gpu_bfyx_os_iyx_osv16",1034], + "6942049339361951275": ["fully_connected_gpu_bf_io_input_spatial",0], + "16547425454653232058": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "8709632541892447149": ["convolution_gpu_bfyx_gemm_like",2], + "3217246278485567748": ["convolution_gpu_bfyx_gemm_like",2], + "2986189945936592561": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "5389189982064081933": ["convolution_gpu_bfyx_os_iyx_osv16",846], + "14359530849521980269": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "8108843303778211282": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "16467987800266816984": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "11163107409437069532": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "10482582307328548806": ["convolution_gpu_bfyx_os_iyx_osv16",3], + "16076153317792960383": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "11559360678008060513": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "4142978475842207311": ["convolution_gpu_bfyx_gemm_like",2], + "15334195300678132907": ["fully_connected_gpu_bf_io_gemm",1], + "9928406318940388716": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "2915165824085219545": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "12609361477548272638": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "6772239376357727149": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "16833854122884184025": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "2721793280965260548": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "11469881811044037340": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "5644068493155655611": ["convolution_gpu_bfyx_gemm_like",2], + "8708643228914766202": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "2171768477223405739": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "14221578799010900252": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "5334566325056222430": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "428659495445490820": ["convolution_gpu_bfyx_os_iyx_osv16",925], + "360872770877634346": ["convolution_gpu_bfyx_gemm_like",2], + "5078905972285278557": ["convolution_gpu_bfyx_gemm_like",2], + "6181308879301978465": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "10990741293315393791": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "17900257435531434807": ["convolution_gpu_bfyx_gemm_like",2], + "16816222375242496370": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3272017687600371031": ["convolution_gpu_bfyx_gemm_like",2], + "628191607060767879": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "3880189981766119529": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "10727592780669452048": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "1801731858063091191": ["convolution_gpu_bfyx_os_iyx_osv16",995], + "8526484907799590618": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "15824189967727245909": ["convolution_gpu_bfyx_gemm_like",2], + "11814740669468421049": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "9257078583742821465": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "1411786954276574458": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "11284755586130392759": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "14878347463243157447": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "5835634465164771899": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "6522575549211855712": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "17303408650780384587": ["convolution_gpu_bfyx_os_iyx_osv16",549], + "5912451559447635837": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "2030309697153345387": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "40704767167309552": ["convolution_gpu_bfyx_os_iyx_osv16",83], + 
"18423051691107460439": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "17106086048442658788": ["convolution_gpu_bfyx_gemm_like",2], + "15417738436777481469": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "11275109735493317886": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "12952980509662451384": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "18199526506796726885": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "9604982746455852556": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "12812685418923919055": ["convolution_gpu_bfyx_os_iyx_osv16",764], + "8881150100883636392": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "38736266675995457": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "12545558125736154584": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "1572991986657256775": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "14630499010941056793": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "5353552956675518468": ["convolution_gpu_bfyx_os_iyx_osv16",458], + "10916647716124396856": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "10995907213890714701": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "3120553928584920777": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "2349007644347065353": ["convolution_gpu_bfyx_gemm_like",2], + "15628121900226431719": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "2281119269283845320": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "5796500397424307442": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "11829442945690098558": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "8990561333549136048": ["convolution_gpu_bfyx_os_iyx_osv16",1112], + "14902389080201926109": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "10058614204420018541": ["convolution_gpu_bfyx_os_iyx_osv16",4], + "16295660312557315941": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "7546586420552408243": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "1474271081523145413": ["convolution_gpu_bfyx_gemm_like",2], + "7966454753124154534": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "12238674883388043717": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "654821507679356726": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "7549378486471456156": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "9522661528867955338": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "3041612155708729812": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4084106758501882407": ["fully_connected_gpu_bf_io_input_spatial",2], + "13607830451968188080": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "16820082917500285799": ["convolution_gpu_bfyx_gemm_like",2], + "244921290040927639": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "8671491767142900139": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "2261453441277654139": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "2418288192668085805": ["convolution_gpu_bfyx_gemm_like",2], + "3491333679577961640": ["convolution_gpu_bfyx_gemm_like",2], + "12730339458081890990": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7975810844103449438": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "15026219694198820614": ["convolution_gpu_bfyx_os_iyx_osv16",835], + "2345023488044002149": ["convolution_gpu_bfyx_os_iyx_osv16",668], + "12892693137085610062": ["convolution_gpu_bfyx_os_iyx_osv16",362], + "8860815977851486767": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "5219048275475447369": ["convolution_gpu_bfyx_gemm_like",2], + "9182897385081081193": ["convolution_gpu_winograd_6x3_s1_fused",1], + "5319668297345215520": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "17443356777503458523": ["convolution_gpu_bfyx_os_iyx_osv16",358], + 
"11599932445375240727": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "12228963567837353733": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "17599396373608265826": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "12408889192918919210": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "16818714747882774917": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "17994361454416813294": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11873734271080160669": ["convolution_gpu_bfyx_os_iyx_osv16",92], + "14150012830816329527": ["convolution_gpu_bfyx_gemm_like",2], + "13115589642140732066": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "7843498978148810586": ["convolution_gpu_bfyx_os_iyx_osv16",235], + "14472187692485966933": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "15511138074959300404": ["convolution_gpu_bfyx_gemm_like",2], + "18310667924071639899": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "7187734276051878356": ["convolution_gpu_bfyx_gemm_like",2], + "3106922888635965020": ["convolution_gpu_bfyx_gemm_like",2], + "9785114056964539323": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "15579919505002150556": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "11207257238719531888": ["convolution_gpu_bfyx_gemm_like",2], + "1116274074896622552": ["convolution_gpu_bfyx_os_iyx_osv16",874], + "3128856679264648666": ["convolution_gpu_bfyx_gemm_like",1], + "12813978452097969536": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "16865879032845300007": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "15890473622821659630": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "14217181622713951411": ["convolution_gpu_bfyx_gemm_like",2], + "12809199739984715013": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "6711878663358611849": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "13224814158106791463": ["convolution_gpu_bfyx_gemm_like",2], + "16783619135298589974": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "17808913959977434594": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "15602863681196390535": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "6719302427415173754": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "13191096881934434519": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "419783127503173016": ["convolution_gpu_bfyx_os_iyx_osv16",564], + "10556089809203693400": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "2710485608298356329": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6419580456182610836": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "142486914279119363": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "8071957466247137919": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "7334966010680206302": ["convolution_gpu_bfyx_gemm_like",2], + "16626226341188424071": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "11936419502418995274": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "994182747184593564": ["convolution_gpu_winograd_6x3_s1_fused",2], + "18012549942299450620": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "14089893422771228191": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "1450888744802985214": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "3563614453014995411": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "12355112948013108181": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "17700958439420868719": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "12782191856884962803": ["convolution_gpu_bfyx_gemm_like",2], + "9759380701896779097": ["convolution_gpu_bfyx_gemm_like",2], + "3218248162832023196": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "1882052795393187384": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "13503608041359512": 
["convolution_gpu_bfyx_os_iyx_osv16",794], + "17596685300497748803": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "2937907409658060025": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "13472532612464340803": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "15464327246951632247": ["convolution_gpu_bfyx_gemm_like",1], + "18269685060032395235": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "8140094412609934765": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "12129572274423886770": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "5864250949922222051": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "9437794960375526230": ["convolution_gpu_bfyx_os_iyx_osv16",1065], + "9798585825695496550": ["convolution_gpu_bfyx_gemm_like",2], + "2572395498687401679": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "12394049027081208902": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "6767245864232675168": ["convolution_gpu_bfyx_gemm_like",1], + "11850332373794932468": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "3723613341885592267": ["convolution_gpu_bfyx_os_iyx_osv16",6], + "14077148976508649021": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "13189392239349392492": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "5381354625969068789": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3087801652564627458": ["convolution_gpu_bfyx_os_iyx_osv16",804], + "5831419373611158773": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "9213563311267466388": ["convolution_gpu_bfyx_direct_10_12_16",0], + "16352331970945217438": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "4191326605459754690": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "7524311370696987092": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "14352796912241296357": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",0], + "14686278683380845546": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "17310332946322628458": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "10887835418423052188": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15114370307779942381": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5287076386757143976": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "2525260242689556544": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "8121179472578287280": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "3498490999014554104": ["convolution_gpu_bfyx_os_iyx_osv16",880], + "6848989271874647093": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "5873257164958285393": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "13753473508578037346": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "15641537661939240413": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15284262113150488297": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "11459784003592366395": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5240181393417899912": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "7104756264011682902": ["convolution_gpu_bfyx_gemm_like",1], + "5524215233998361104": ["convolution_gpu_winograd_6x3_s1_fused",2], + "13583166868754499339": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "12725647706191463348": ["convolution_gpu_bfyx_gemm_like",2], + "18424912460022156378": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "2651385050387738902": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "4366168099274266975": ["convolution_gpu_bfyx_os_iyx_osv16",172], + "16463823433924519300": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "7786866732196451977": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "1774158624592967937": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "15649927926091502215": 
["convolution_gpu_bfyx_os_iyx_osv16",426], + "15962137123591591534": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "4623542918584461522": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "10861525139715322534": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "7862815466573236157": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "15217183882858251099": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "16541722316343690197": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "4492673409319122180": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "13991205023798493715": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "4897991181236908768": ["convolution_gpu_bfyx_gemm_like",1], + "10264913782610095832": ["convolution_gpu_bfyx_os_iyx_osv16",888], + "10499265278415026816": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8728178019712933221": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "3928266232090746643": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "17225578855755054959": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "13932662890258900896": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "7024495439434892956": ["convolution_gpu_bfyx_os_iyx_osv16",1043], + "1908809004094565452": ["convolution_gpu_bfyx_os_iyx_osv16",918], + "2857337999074313592": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "5419041493176804960": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "8594644182487917002": ["convolution_gpu_winograd_6x3_s1_fused",2], + "8797843396807284399": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "9220830217525628783": ["convolution_gpu_bfyx_gemm_like",2], + "3653156933813711765": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "11718418772370938734": ["convolution_gpu_bfyx_os_iyx_osv16",843], + "16431165572426232677": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "5352861363832390974": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "3682813162987778705": ["convolution_gpu_bfyx_os_iyx_osv16",1044], + "3234107167862677811": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "1152693503778768433": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6303682540621797774": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "17050675313067213312": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "13787436604877398090": ["convolution_gpu_bfyx_os_iyx_osv16",1036], + "13793441296561946357": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "16402312692470500253": ["convolution_gpu_bfyx_gemm_like",2], + "7852144838267007144": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "9287404618748313247": ["convolution_gpu_bfyx_gemm_like",1], + "4673127824919879657": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9213886570531053949": ["convolution_gpu_bfyx_os_iyx_osv16",429], + "9631481972809246378": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "8241070786700614317": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15494543914974994991": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "14211903923555028634": ["convolution_gpu_bfyx_os_iyx_osv16",679], + "654122557966242717": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "17829148383265978140": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "9428176632140441528": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "15470013032930986062": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "16426179645101678763": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "14484890926084856480": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "14431607479949498164": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "7565221050911842393": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7570346182940928159": ["convolution_gpu_bfyx_gemm_like",2], + 
"12134712464763856064": ["convolution_gpu_winograd_6x3_s1_fused",2], + "709835724029986012": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "5948701218437980356": ["convolution_gpu_bfyx_gemm_like",2], + "16206791915939407806": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "6631816968511312100": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "7000486794832106857": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "6093575518270471235": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "472454322186482185": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "12894625941923144893": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "12954154886708228545": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "2969389503332309296": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "15831600396403741571": ["convolution_gpu_bfyx_gemm_like",1], + "12680339228267704518": ["convolution_gpu_bfyx_os_iyx_osv16",876], + "13324157125165576832": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "5420766967862917815": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "5211191663202250117": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3963106895592011725": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "3792945601873900927": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "17301887391757619741": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "17465517455679097501": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "11717348577195224554": ["convolution_gpu_bfyx_gemm_like",2], + "385046297070779752": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "7881579844586294503": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "17123463568694499533": ["convolution_gpu_bfyx_gemm_like",2], + "10728212277329722684": ["convolution_gpu_bfyx_gemm_like",2], + "12478309735214802531": ["convolution_gpu_bfyx_os_iyx_osv16",467], + "16327433707667075261": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "16173557782125372935": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "17881905640473324965": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "10591379189397010097": ["convolution_gpu_bfyx_os_iyx_osv16",989], + "3062101811226530720": ["convolution_gpu_bfyx_os_iyx_osv16",673], + "12415368596357091523": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "8819268903800581706": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "9404953235624894187": ["convolution_gpu_bfyx_os_iyx_osv16",95], + "7354234812009979811": ["convolution_gpu_bfyx_os_iyx_osv16",90], + "9438739171104456179": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "10483664832302187567": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "4161612746310931789": ["convolution_gpu_bfyx_gemm_like",2], + "72444706264681262": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "12069726772532946193": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "5688478347124565305": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "3934290309368153435": ["fully_connected_gpu_bf_io_gemm",1], + "2307310127637739872": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "7958459862276998225": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "4465781406991476376": ["convolution_gpu_bfyx_os_iyx_osv16",280], + "1036010477232750453": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "6614833247756539341": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "8253823502854784432": ["convolution_gpu_bfyx_gemm_like",2], + "3746573775462003750": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "7472330881076141262": ["convolution_gpu_bfyx_gemm_like",1], + "4239415134522959352": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "4840004190985490064": ["convolution_gpu_bfyx_gemm_like",2], + "12675313398314286884": 
["convolution_gpu_bfyx_os_iyx_osv16",141], + "5601435819039968726": ["convolution_gpu_winograd_6x3_s1_fused",2], + "11327228813412934262": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "13713406612642090169": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "3448477246688526708": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "9311802150474489673": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "13314092088416047551": ["fully_connected_gpu_yxfb_ref",1], + "11113256687741667688": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "157805434489791310": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "14994322266840011040": ["convolution_gpu_bfyx_gemm_like",2], + "12353956380178079089": ["convolution_gpu_bfyx_gemm_like",2], + "13200151444914751729": ["convolution_gpu_bfyx_os_iyx_osv16",508], + "17928043901784474130": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "14397348576352573007": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "17915846724151945664": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "18128162750557822655": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "190530884420224257": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "14823616678465136590": ["convolution_gpu_winograd_6x3_s1_fused",2], + "846088275031979661": ["convolution_gpu_winograd_6x3_s1_fused",2], + "15759530339367380982": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "2490155559809645659": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "2862999234347597091": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "10774528268153772208": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "14872992823083730615": ["convolution_gpu_bfyx_gemm_like",1], + "14621327324047759584": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "13973179950424276578": ["convolution_gpu_bfyx_os_iyx_osv16",48], + "16758962840329202004": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "12112853999307505628": ["convolution_gpu_bfyx_gemm_like",2], + "4408600136502382976": ["convolution_gpu_bfyx_os_iyx_osv16",417], + "6171845068913882721": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5592428580503282095": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "13459514533473657102": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "522181557896569275": ["convolution_gpu_bfyx_gemm_like",0], + "6729785110495533200": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "158222105675022402": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "9410978119783758141": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "52089503050497755": ["convolution_gpu_bfyx_os_iyx_osv16",899], + "11622925573287101001": ["convolution_gpu_bfyx_direct_10_12_16",0], + "4073467095502162430": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "11077876432364512822": ["fully_connected_gpu_bf_io_input_spatial",1], + "3892679716763161057": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "2893564501191050837": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "1742897526168249500": ["convolution_gpu_bfyx_gemm_like",1], + "16043683538361975370": ["convolution_gpu_bfyx_gemm_like",2], + "18400379759523099542": ["convolution_gpu_bfyx_gemm_like",1], + "9839670675413379092": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "2623687018437195679": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "10169992769527680821": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "10647227605517025377": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "12796777049340516563": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "3336303478756453360": ["convolution_gpu_bfyx_gemm_like",1], + "6067904130482758510": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5077214229434392730": 
["convolution_gpu_bfyx_os_iyx_osv16",576], + "12617625046664709483": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "6620782733027313312": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "18384657372655350144": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "282274448389888221": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "649203303142950236": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "10196332102593337214": ["convolution_gpu_bfyx_gemm_like",1], + "15490478608105402679": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "13608239208821071914": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "12988961529988078346": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "123251351612308092": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "3286629188347536485": ["fully_connected_gpu_bf_io_input_spatial",0], + "12427258337646070422": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "6980201892073961793": ["convolution_gpu_bfyx_os_iyx_osv16",852], + "1077773457856682663": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "15126660425728872065": ["convolution_gpu_bfyx_os_iyx_osv16",200], + "4505008254511324231": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "3633858263279042265": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "8906588133431586825": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "14947798627499698329": ["convolution_gpu_bfyx_gemm_like",2], + "1652781065871883392": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15430549683839591544": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "17912189681971987483": ["convolution_gpu_bfyx_gemm_like",2], + "17788367809717898285": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "10971070835319242371": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "4476928353532757380": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "5079055505117153635": ["convolution_gpu_bfyx_os_iyx_osv16",668], + "11901740241052104941": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "4082229510324076196": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8873614802459592665": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "16208488491972128275": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "18386376129938707290": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "4085450203909854919": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "13853056718266488510": ["convolution_gpu_bfyx_os_iyx_osv16",883], + "14792528369891965810": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9048522050692986204": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "2632535010129224704": ["convolution_gpu_bfyx_os_iyx_osv16",508], + "4342360467977736802": ["convolution_gpu_bfyx_gemm_like",2], + "13379165253894817165": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "10118395047539851751": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "659846949368492111": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "13893789954946953427": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "11254635684957519432": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "8296551195150971668": ["convolution_gpu_winograd_6x3_s1_fused",2], + "6830387121684699972": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "13132804928635689780": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "13237050834496100264": ["convolution_gpu_bfyx_os_iyx_osv16",527], + "12461575861709234385": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "5097818987523855112": ["convolution_gpu_bfyx_gemm_like",2], + "7162575953766465459": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "12667014405537239093": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "9849272539053219052": ["convolution_gpu_bfyx_os_iyx_osv16",242], + 
"9191832520273617003": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "1838534101161814609": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "8146945902795164796": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "16749148369456398030": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "12022152681602871455": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4569338575782832784": ["convolution_gpu_bfyx_gemm_like",2], + "9519623751582710696": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "5584432943673435454": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "15247381586316467097": ["convolution_gpu_bfyx_gemm_like",2], + "6483208845600234755": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7264756313770306662": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "3755253206085028904": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "12860222041026638681": ["convolution_gpu_bfyx_gemm_like",2], + "5762290464889692462": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11806402239500046867": ["convolution_gpu_bfyx_gemm_like",2], + "5807196005360653656": ["convolution_gpu_bfyx_gemm_like",2], + "8995598177504756805": ["convolution_gpu_bfyx_os_iyx_osv16",85], + "2265784112305305260": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "789359733867650915": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "5519535335798045279": ["convolution_gpu_bfyx_gemm_like",1], + "12601126285773042005": ["convolution_gpu_bfyx_os_iyx_osv16",1055], + "15963038745470172423": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "13550435052563656432": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "7211355951470869591": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "17723621158215826108": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "17089801601582809764": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "11670430946096342056": ["convolution_gpu_bfyx_os_iyx_osv16",995], + "11636129433022017868": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "4479117540570599742": ["convolution_gpu_bfyx_gemm_like",2], + "10085059621136526248": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "3037042229494600258": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "3430266954211750407": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "15591167992985613695": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "17542414935564676110": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "2007192658799516915": ["fully_connected_gpu_bf_io_gemm",1], + "598390166442977699": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "7500192998744460131": ["fully_connected_gpu_bf_io_input_spatial",2], + "16266491618150971928": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "10293186062391000719": ["convolution_gpu_bfyx_os_iyx_osv16",755], + "9707630588260222630": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "12098146032672599222": ["convolution_gpu_bfyx_os_iyx_osv16",198], + "1939140810847988694": ["convolution_gpu_bfyx_gemm_like",1], + "1473214668483422172": ["convolution_gpu_bfyx_gemm_like",1], + "17809920600993699808": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "4642234334824303290": ["convolution_gpu_bfyx_os_iyx_osv16",172], + "4202645222013675478": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "15548971488532746290": ["convolution_gpu_bfyx_direct_10_12_16",0], + "7398196853452900099": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "13835908664998757647": ["fully_connected_gpu_fb_oi_ref",2], + "1334070221835422461": ["convolution_gpu_bfyx_gemm_like",2], + "11318913630213187720": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "10110395703775498948": ["convolution_gpu_bfyx_os_iyx_osv16",376], + "2903605246599054308": 
["convolution_gpu_bfyx_os_iyx_osv16",245], + "17316626950179740845": ["convolution_gpu_bfyx_os_iyx_osv16",564], + "16011429608661242565": ["convolution_gpu_bfyx_gemm_like",2], + "14532844474906286088": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "9988801796928462423": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "18092842590142527927": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "13814086981499638596": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "8048617952947915835": ["convolution_gpu_bfyx_gemm_like",2], + "14650567822254940018": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "14381420852659789698": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3830703844770425343": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "14213516751025324346": ["convolution_gpu_bfyx_gemm_like",2], + "4149728557142033774": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "4553409514380460123": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "5951936376654416075": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "4793007249026943006": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16969463538496570528": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "18273537339378756543": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "10320711719466983961": ["convolution_gpu_bfyx_gemm_like",2], + "14767888121198814523": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "11147816119060617810": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "16853250891250756537": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "13272818502368975319": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "9594594523961285945": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "12174571114411168588": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "8500148569566077929": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "2038505773698938555": ["fully_connected_gpu_bf_io_gemm",1], + "8501145642605270365": ["convolution_gpu_bfyx_gemm_like",2], + "12159582810513550491": ["convolution_gpu_bfyx_direct_10_12_16",0], + "15156525717629023944": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "2324120381399737261": ["convolution_gpu_bfyx_os_iyx_osv16",111], + "15908673392788376468": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "1643241486250690844": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "11834683513280095384": ["convolution_gpu_winograd_6x3_s1_fused",2], + "2727219457659794468": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "9019388470685749691": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "3621930417735246405": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "4282661608732125403": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "5754844816339228920": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "5385316497510064491": ["fully_connected_gpu_fb_oi_ref",1], + "7272538316511343863": ["convolution_gpu_bfyx_gemm_like",2], + "3926585856863002495": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "6293403765897901528": ["convolution_gpu_bfyx_gemm_like",2], + "10702234389482091891": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "1056009037551688122": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "12914986936318857086": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "4892959859293355837": ["convolution_gpu_bfyx_gemm_like",1], + "4274801141127703532": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "822162932339827810": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "1559798212423183813": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "7777333052643961206": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "15308578014507211237": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "6263019986730305851": 
["convolution_gpu_bfyx_direct_10_12_16",1], + "1832310305089212990": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "18005721959893562716": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "7164580481046523192": ["convolution_gpu_bfyx_os_iyx_osv16",914], + "3167336012388169649": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "7808544677773370430": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "1628593159980574595": ["convolution_gpu_bfyx_os_iyx_osv16",622], + "3032921857841371728": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1383899865465106141": ["convolution_gpu_bfyx_gemm_like",1], + "13071545223094862275": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "5673972310424776040": ["convolution_gpu_bfyx_gemm_like",2], + "14898892437285105327": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "13328449155966085543": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16985912104363932350": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "6343396486660315308": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "10700011669103135203": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "8906185843274300447": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "10607904718265020949": ["convolution_gpu_bfyx_gemm_like",2], + "18037918102910297531": ["convolution_gpu_bfyx_os_iyx_osv16",356], + "9763310312421884308": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "3599823735065658574": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "1616603916015535857": ["fully_connected_gpu_bf_io_input_spatial",0], + "2618108630886857741": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "2683507674615735878": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "2659031931257084418": ["convolution_gpu_bfyx_os_iyx_osv16",540], + "14667209474639064623": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "7311120574972466702": ["convolution_gpu_bfyx_os_iyx_osv16",41], + "17193614571243427089": ["convolution_gpu_bfyx_gemm_like",2], + "11462462742322068863": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "11398019086259011063": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "3621424752591567930": ["convolution_gpu_bfyx_gemm_like",2], + "11855070245618904113": ["convolution_gpu_bfyx_os_iyx_osv16",183], + "7732899312577293959": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "12348135936862667024": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "16833026567865627676": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "12223993560805441284": ["convolution_gpu_bfyx_gemm_like",2], + "13933912937625580405": ["fully_connected_gpu_bf_io_input_spatial",0], + "3179874645565098825": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "5597908143491399643": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "7875272450497189442": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "16469788155263456039": ["convolution_gpu_bfyx_gemm_like",2], + "4983880246908724272": ["convolution_gpu_bfyx_os_iyx_osv16",1023], + "17870874477143985774": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "16611452077660879545": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "16067605128297748820": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15619086801947147359": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "12793908914872030220": ["convolution_gpu_bfyx_gemm_like",2], + "2582625260054352916": ["convolution_gpu_bfyx_gemm_like",1], + "30229601562833524": ["convolution_gpu_bfyx_gemm_like",2], + "9954050478761346921": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "18253784177599134876": ["convolution_gpu_bfyx_os_iyx_osv16",938], + "386749666417295495": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "913496537924971856": 
["convolution_gpu_bfyx_os_iyx_osv16",962], + "5687802882700097624": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "9100044555742394133": ["convolution_gpu_bfyx_os_iyx_osv16",549], + "10058165874008941852": ["convolution_gpu_bfyx_os_iyx_osv16",176], + "15163327502374403643": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "15351724241036614758": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "17658152048177750315": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "3159147743553063163": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "12277470820821378855": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "4560479630843098090": ["convolution_gpu_bfyx_gemm_like",1], + "3715177305271762194": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "7076937538747704750": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "13131740479277027362": ["fully_connected_gpu_bf_io_gemm",1], + "13546876216568825877": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "17353894529222574441": ["convolution_gpu_bfyx_os_iyx_osv16",552], + "15702382940521972117": ["convolution_gpu_bfyx_os_iyx_osv16",1001], + "11897113890115321056": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "4499586349553581439": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "10323345824599612614": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "8528750110601691390": ["convolution_gpu_bfyx_direct_10_12_16",0], + "1561225943337590599": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "1201692134690347847": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "7575675354187625951": ["convolution_gpu_bfyx_gemm_like",2], + "17774424004510360936": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "13762042713029963144": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "4135003545872878882": ["convolution_gpu_bfyx_os_iyx_osv16",197], + "2458592904274981909": ["fully_connected_gpu_bf_io_input_spatial",2], + "10869005786136023160": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "3291180926381314705": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "5120466856097219243": ["convolution_gpu_bfyx_gemm_like",1], + "12090536142661253835": ["fully_connected_gpu_bf_io_gemm",1], + "2597523728660247862": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "14885109535362957947": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "9810904714798127155": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "5589350202160007768": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "13597240991532942069": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "14070988879848388270": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "488298169768725160": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "5419775002149092646": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16442107352245114876": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "2659712601063515059": ["convolution_gpu_winograd_6x3_s1_fused",2], + "6982733543386888622": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "16053585286807864356": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "1375156980278317418": ["convolution_gpu_bfyx_gemm_like",2], + "6949539207944972855": ["convolution_gpu_bfyx_gemm_like",2], + "8272823732258536202": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "18372284940315010254": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "15522785615618973614": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "12308359047798183133": ["convolution_gpu_bfyx_os_iyx_osv16",548], + "10811837819834149164": ["convolution_gpu_bfyx_gemm_like",1], + "875400109066360897": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "11130439225010714550": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5762878778443755104": 
["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "9999955037598579164": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "592245952014430043": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "13477548641580029772": ["convolution_gpu_bfyx_gemm_like",1], + "1212319037405620223": ["convolution_gpu_bfyx_gemm_like",2], + "3308770992373192529": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "70580716590540876": ["convolution_gpu_bfyx_gemm_like",1], + "17490471699618303993": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "15187035463799513424": ["convolution_gpu_bfyx_1x1",2], + "16683485007140805060": ["fully_connected_gpu_yxfb_ref",2], + "5385316497510064491": ["fully_connected_gpu_fb_oi_ref",1], + "17089801601582809764": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "7650862961269327235": ["convolution_gpu_bfyx_1x1",2], + "3101087806792514129": ["convolution_gpu_bfyx_1x1",1], + "5291011077679733990": ["convolution_gpu_bfyx_gemm_like",2], + "4803370483104261655": ["convolution_gpu_bfyx_gemm_like",2], + "1551596771935253711": ["convolution_gpu_bfyx_gemm_like",1], + "10783630257421062891": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "4362304842016958728": ["convolution_gpu_bfyx_gemm_like",2], + "5714365398623475983": ["convolution_gpu_bfyx_1x1",2], + "16397733032387984819": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "7669403041163460089": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12397280593466519809": ["convolution_gpu_bfyx_gemm_like",2], + "2296581485980163665": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7447163906170805189": ["convolution_gpu_bfyx_os_iyx_osv16",573], + "16243196137456624852": ["convolution_gpu_bfyx_gemm_like",2], + "6203765709597125063": ["convolution_gpu_bfyx_gemm_like",1], + "3463959257726925426": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "7307271009495440764": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "1270307036687208396": ["convolution_gpu_bfyx_gemm_like",1], + "7349880498513046830": ["convolution_gpu_bfyx_1x1",2], + "3673781117412048086": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "9439431829175743345": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7025975403069487257": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "4165036357594592683": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10753540518493641553": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11120846960057008937": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "5242271874488296527": ["convolution_gpu_bfyx_gemm_like",2], + "16307464696265537356": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "3759515057574218101": ["convolution_gpu_bfyx_gemm_like",2], + "4435224497850514394": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "9671459469252116568": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "9193880745263317167": ["convolution_gpu_bfyx_gemm_like",2], + "15726902746983125797": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "9232653317479846765": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "1520529227443340435": ["convolution_gpu_bfyx_gemm_like",2], + "16491532291908469567": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "16882092367103683293": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7602222004475424358": ["convolution_gpu_bfyx_gemm_like",1], + "12213354854947437262": ["convolution_gpu_bfyx_1x1",2], + "9175450649281374948": ["convolution_gpu_bfyx_os_iyx_osv16",862], + "4447065688824381344": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "13558656230312558247": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "7900926714874404219": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + 
"7692849839965441330": ["convolution_gpu_bfyx_os_iyx_osv16",252], + "5288793454052261767": ["convolution_gpu_bfyx_gemm_like",2], + "3820661057776133570": ["convolution_gpu_bfyx_1x1",2], + "8059328623525062913": ["convolution_gpu_bfyx_gemm_like",2], + "17025324057045572535": ["convolution_gpu_bfyx_direct_10_12_16",2], + "693883892843558363": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "8045367391487213749": ["convolution_gpu_bfyx_1x1",2], + "5941092474669713339": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14204609663091442879": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11627532066884923848": ["convolution_gpu_bfyx_1x1",2], + "959260710517842876": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "10128143628088846123": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "1287490919205560806": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "17037416417174266088": ["convolution_gpu_bfyx_gemm_like",1], + "1875764913306932583": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "543472136359161929": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "5649082203775427830": ["convolution_gpu_bfyx_gemm_like",2], + "13011676362747785816": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13483088320871913126": ["convolution_gpu_bfyx_gemm_like",1], + "2939605281692583169": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "11356842300444410831": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "2929715823970060874": ["convolution_gpu_bfyx_gemm_like",1], + "3070859615622845671": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "14268594692585922659": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "12995903177757437362": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "10471519687597963116": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6788311046557489996": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "15809639778580769565": ["convolution_gpu_bfyx_gemm_like",2], + "1237920404306733800": ["convolution_gpu_bfyx_gemm_like",1], + "13663893159182636270": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "6767159196241633301": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "10864011008000364415": ["convolution_gpu_bfyx_1x1",2], + "13919204232414535363": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "9383182168277796969": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "7201521533301617290": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15548847099740441551": ["convolution_gpu_bfyx_1x1",2], + "17549411807772646930": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "2057158988261512114": ["convolution_gpu_bfyx_1x1",2], + "17536308070854915513": ["convolution_gpu_bfyx_1x1",2], + "1353170363915443814": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2114232149447438823": ["convolution_gpu_bfyx_gemm_like",2], + "10706267011822108376": ["convolution_gpu_bfyx_1x1",2], + "5754396201681434378": ["convolution_gpu_bfyx_1x1",2], + "13104509059416300615": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "15839295895890205274": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "13078401519973360182": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1718634913016284523": ["convolution_gpu_bfyx_1x1",2], + "6217542346826403576": ["convolution_gpu_bfyx_1x1",2], + "3814584042139408454": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "4880150897829846031": ["convolution_gpu_bfyx_1x1",1], + "11324651029379152442": ["convolution_gpu_bfyx_1x1",2], + "15155676074658242659": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "5083163738120585821": ["fully_connected_gpu_fb_oi_ref",2], + "18446245971488003004": ["convolution_gpu_bfyx_os_iyx_osv16",574], + "2878824076934639346": 
["convolution_gpu_bfyx_os_iyx_osv16",575], + "12279771749366327372": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "16728762255357411770": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "7223801044761006523": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "3563872903821081702": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17672785701483179117": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "12160764253455777655": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "12065769091972094756": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "7330202944390548890": ["convolution_gpu_bfyx_gemm_like",1], + "16566128345135114558": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "13878967140838761911": ["convolution_gpu_bfyx_1x1",1], + "226601879759378771": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "6712698149192186833": ["convolution_gpu_bfyx_gemm_like",2], + "3226193790517362610": ["convolution_gpu_bfyx_1x1",2], + "7106362077449435105": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "4927360358387344983": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16781187505186394353": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "603883331897298932": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "14045927407431718832": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "8859895010324601937": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "11263540528012919947": ["convolution_gpu_bfyx_1x1",2], + "16432425079146486467": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12581879452540858313": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "14985755375924972050": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "2149582237161177965": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "16949056117405140365": ["convolution_gpu_bfyx_gemm_like",2], + "14757749560543979231": ["convolution_gpu_bfyx_os_iyx_osv16",273], + "4046830923427667342": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "8036474422877454869": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3788462090984291082": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "403634422724914329": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "14614844213016502202": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "2877521658768725103": ["convolution_gpu_bfyx_gemm_like",0], + "348058686961206025": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "3216877571075556066": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "13804221028705631415": ["convolution_gpu_bfyx_gemm_like",2], + "3011188207492335920": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5044721291675005144": ["convolution_gpu_bfyx_1x1",2], + "12051595062513871723": ["convolution_gpu_bfyx_1x1",2], + "12193395770362986433": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "11690533591656807605": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "10747988576436391912": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "768720470104458759": ["convolution_gpu_bfyx_os_iyx_osv16",265], + "13282951481330978659": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "3337625924046561031": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "11872943152839631823": ["convolution_gpu_bfyx_gemm_like",2], + "12352923639732112511": ["convolution_gpu_bfyx_os_iyx_osv16",455], + "2608363732937932266": ["convolution_gpu_bfyx_gemm_like",2], + "60509335250891515": ["convolution_gpu_bfyx_gemm_like",2], + "12151068022697708126": ["convolution_gpu_bfyx_gemm_like",2], + "18275601715050791851": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "9099720270958987421": ["convolution_gpu_bfyx_1x1",2], + "9741607635826869269": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5539793555189956907": 
["convolution_gpu_bfyx_os_iyx_osv16",107], + "2173720698351153121": ["convolution_gpu_bfyx_gemm_like",2], + "17823133607491820214": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "15117880293418979489": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "9390478179772073718": ["convolution_gpu_bfyx_gemm_like",1], + "16267682394077585279": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "4664983769199548480": ["convolution_gpu_bfyx_1x1",1], + "16992405636352406660": ["convolution_gpu_bfyx_gemm_like",1], + "15959543980008442942": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "12076058470574246054": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "17039993918927377002": ["convolution_gpu_bfyx_os_iyx_osv16",431], + "12015336418727455195": ["convolution_gpu_bfyx_1x1",2], + "16172528828198474326": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "2984726467649419856": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "7469127846325904854": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "15065925414996398951": ["convolution_gpu_bfyx_1x1",2], + "1778345646142852816": ["convolution_gpu_bfyx_gemm_like",2], + "15178921033274918199": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14813178380338948912": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "10879218241103462088": ["convolution_gpu_bfyx_gemm_like",2], + "2920322372993101148": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6651389480007764007": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "1643122514049603104": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "1843555260471832708": ["convolution_gpu_bfyx_gemm_like",1], + "14680730265621679042": ["convolution_gpu_bfyx_os_iyx_osv16",380], + "13436376034548670107": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "8866716292621164810": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "5509395737020858006": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "8615481457481938667": ["convolution_gpu_bfyx_os_iyx_osv16",419], + "17228810554159747400": ["convolution_gpu_bfyx_gemm_like",2], + "13403161389559730": ["convolution_gpu_bfyx_gemm_like",2], + "4264284648458489052": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "10861769381993948050": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "9869959062341950047": ["convolution_gpu_bfyx_1x1",2], + "2781309272856442321": ["convolution_gpu_bfyx_1x1",1], + "16694984452720336415": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2273992727647793692": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8421388456873652700": ["convolution_gpu_bfyx_gemm_like",2], + "10837496380266058422": ["convolution_gpu_bfyx_gemm_like",2], + "6557428245898292304": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "6800893510381991731": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "8809017515482311843": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "6297802534570892679": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "5751283221740229986": ["convolution_gpu_bfyx_gemm_like",1], + "16436006771518788093": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "14034525799882831106": ["convolution_gpu_bfyx_gemm_like",2], + "1318571118468536310": ["convolution_gpu_bfyx_gemm_like",2], + "3868149953087814447": ["convolution_gpu_bfyx_gemm_like",1], + "13883044928774243663": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "4461989328775275994": ["convolution_gpu_bfyx_gemm_like",2], + "378801963103874857": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "9813748068195103720": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "17585206779958265260": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "9702618600245321109": 
["convolution_gpu_bfyx_os_iyx_osv16",531], + "7878605163588288309": ["convolution_gpu_bfyx_os_iyx_osv16",456], + "9941035405796680081": ["convolution_gpu_bfyx_1x1",1], + "13833960927635646899": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15586047342916704364": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "2934519615045138808": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "11265472910579659280": ["convolution_gpu_bfyx_gemm_like",1], + "15289152041466330689": ["convolution_gpu_bfyx_gemm_like",2], + "10432365444137108781": ["convolution_gpu_bfyx_gemm_like",2], + "13454265023861566476": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "11975047184326016230": ["convolution_gpu_bfyx_gemm_like",2], + "11820789223587555410": ["convolution_gpu_bfyx_1x1",2], + "10488269059469838160": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "12339692995143159283": ["convolution_gpu_bfyx_gemm_like",1], + "9153779186876518773": ["convolution_gpu_bfyx_gemm_like",2], + "14766477690417085350": ["convolution_gpu_bfyx_1x1",2], + "9119618606914671839": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "5074273865983613482": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6711878663358611849": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "5834245904292669645": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "11706446082856895571": ["convolution_gpu_bfyx_gemm_like",2], + "4708035980731751007": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13569941893504840630": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "6548949901446632697": ["convolution_gpu_bfyx_1x1",2], + "13170441257780067955": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "840202264034382558": ["convolution_gpu_bfyx_os_iyx_osv16",771], + "8913823292181409151": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "150132162949295379": ["convolution_gpu_bfyx_1x1",2], + "14131851237755716991": ["convolution_gpu_bfyx_os_iyx_osv16",364], + "18186615266760475767": ["convolution_gpu_bfyx_os_iyx_osv16",192], + "9942099207256025216": ["convolution_gpu_bfyx_gemm_like",2], + "16437124655147660375": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "6791806088355877039": ["convolution_gpu_bfyx_gemm_like",1], + "4533786844080178561": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "9243949750444156746": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8747430148550634190": ["convolution_gpu_bfyx_gemm_like",2], + "5600128039063009632": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8984436655107983227": ["convolution_gpu_bfyx_gemm_like",2], + "9601412379897937608": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "15078262396281327048": ["convolution_gpu_bfyx_gemm_like",1], + "13139625572508441980": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "14387756025635589673": ["convolution_gpu_bfyx_1x1",2], + "10709828018763273371": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "13106818352216009354": ["convolution_gpu_bfyx_gemm_like",2], + "10170577772376890221": ["convolution_gpu_bfyx_os_iyx_osv16",664], + "6254141935545262078": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8174040194088942964": ["convolution_gpu_bfyx_os_iyx_osv16",945], + "16789245987103323406": ["convolution_gpu_bfyx_gemm_like",2], + "1867337342417952506": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13912843078550000960": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "8458082326743351141": ["convolution_gpu_bfyx_gemm_like",2], + "16044646335477470657": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "13893808009363736870": ["convolution_gpu_bfyx_gemm_like",2], + "17635171685500922207": ["convolution_gpu_bfyx_os_iyx_osv16",910], + 
"14206125678667603810": ["convolution_gpu_bfyx_1x1",1], + "14147460733160099960": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2983038203471784211": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "1045854873741563331": ["convolution_gpu_bfyx_gemm_like",2], + "12972798847556569913": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "6928136130626403937": ["convolution_gpu_bfyx_gemm_like",2], + "17889864541794448203": ["convolution_gpu_bfyx_1x1",1], + "3819990462129075757": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "17512961503976896701": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "4773077837537775324": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "17364712285968437405": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "12990527753120735255": ["convolution_gpu_bfyx_gemm_like",2], + "9616636708366808604": ["convolution_gpu_bfyx_gemm_like",2], + "6997971129340865650": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "16934879647229234163": ["convolution_gpu_bfyx_gemm_like",2], + "3782239800777370325": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "2814805887448339818": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "4325081100430903742": ["convolution_gpu_bfyx_gemm_like",1], + "10486000767830001094": ["convolution_gpu_bfyx_1x1",2], + "745009493367761775": ["convolution_gpu_bfyx_gemm_like",2], + "3863816884636503247": ["convolution_gpu_bfyx_gemm_like",2], + "3024402899381804809": ["convolution_gpu_bfyx_1x1",2], + "18416908414174464784": ["convolution_gpu_bfyx_gemm_like",2], + "17252589865292797082": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "9751582946441607796": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "11705756153433897198": ["convolution_gpu_bfyx_1x1",2], + "6673966852801136416": ["convolution_gpu_bfyx_os_iyx_osv16",224], + "7964396197946740183": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "14764715930784496165": ["convolution_gpu_bfyx_gemm_like",2], + "11451740938287179908": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "10808909442136736629": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "2362092095402043749": ["convolution_gpu_bfyx_gemm_like",2], + "10308431308942416781": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "12831298482349900359": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "2856601829807186494": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "13680926356824317761": ["convolution_gpu_bfyx_os_iyx_osv16",54], + "16770615142634470903": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "1338705434700924127": ["convolution_gpu_bfyx_1x1",1], + "1436052878894538927": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "17515064188391421150": ["convolution_gpu_bfyx_gemm_like",2], + "6328802691680458752": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "937159502066696999": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "16081386644309102158": ["convolution_gpu_bfyx_gemm_like",2], + "17025182465337728023": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "17238880534517721334": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "12166852830214895457": ["convolution_gpu_bfyx_1x1",2], + "11334122788337402526": ["convolution_gpu_bfyx_1x1",2], + "3216793152416217495": ["convolution_gpu_bfyx_gemm_like",2], + "2161052921317193579": ["convolution_gpu_bfyx_gemm_like",2], + "10576856554114055028": ["convolution_gpu_bfyx_gemm_like",2], + "1208161922424418734": ["convolution_gpu_bfyx_gemm_like",2], + "6727930402459775131": ["convolution_gpu_bfyx_gemm_like",2], + "1418595171949196661": ["convolution_gpu_bfyx_gemm_like",2], + "14343008518525689150": ["convolution_gpu_bfyx_1x1",2], + "12467673564660108244": 
["convolution_gpu_bfyx_os_iyx_osv16",1010], + "15805087418686802636": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "5735608687257018419": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "7274179284676568361": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "7604075520418038662": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8837721075413149240": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3538679039078582272": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "15209909241815414156": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "12864204111424196179": ["convolution_gpu_bfyx_1x1",2], + "11626398907755088688": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9920155432685318259": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "9429695343610239088": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "187352687850707150": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9441060601228656341": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "15838113905712517735": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "5124080536266387783": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8651641584737798174": ["convolution_gpu_bfyx_gemm_like",2], + "2649192407401044065": ["convolution_gpu_bfyx_gemm_like",2], + "15069906408448814772": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "9091110033424983286": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "17742192339816511494": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "1569043950563130463": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17983556812075120553": ["convolution_gpu_bfyx_1x1",2], + "12501619443242354860": ["convolution_gpu_bfyx_gemm_like",2], + "8642107585829380438": ["convolution_gpu_bfyx_gemm_like",1], + "15924916465272239832": ["convolution_gpu_bfyx_os_iyx_osv16",925], + "6193161166790398003": ["convolution_gpu_bfyx_gemm_like",2], + "3034482898462686729": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "2111669705686676421": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "743941460026466526": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "17179609670678746034": ["convolution_gpu_bfyx_gemm_like",2], + "3217574161785059951": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "10128390168715530898": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "16469788155263456039": ["convolution_gpu_bfyx_gemm_like",2], + "16386955278777720573": ["convolution_gpu_bfyx_os_iyx_osv16",855], + "7407975398526425554": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "1644335606100150388": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "9731370183088819573": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "12712071520541638451": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "11087413527078604815": ["convolution_gpu_bfyx_gemm_like",2], + "6845814820599174031": ["convolution_gpu_bfyx_direct_10_12_16",0], + "14289082888174784976": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "7450417963648518926": ["convolution_gpu_bfyx_gemm_like",2], + "3017891343734146267": ["convolution_gpu_bfyx_os_iyx_osv16",102], + "4133424990380177132": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "2321148334382088982": ["convolution_gpu_bfyx_gemm_like",2], + "9757389422721488173": ["convolution_gpu_bfyx_1x1",1], + "8856888761246057127": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "13738760763969959522": ["convolution_gpu_bfyx_gemm_like",2], + "8390889357546397717": ["convolution_gpu_bfyx_1x1",1], + "6310724136390087834": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "11308583200952256245": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6580334406272192111": ["fully_connected_gpu_fb_io_ref",0], + "10415046594066474634": 
["convolution_gpu_bfyx_gemm_like",2], + "5680236635030250712": ["convolution_gpu_bfyx_1x1",2], + "3374410641320310726": ["convolution_gpu_bfyx_os_iyx_osv16",904], + "7119182041840303390": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14835309921389262864": ["convolution_gpu_bfyx_1x1",2], + "272730229972987861": ["convolution_gpu_bfyx_os_iyx_osv16",1035], + "6219075471508685758": ["convolution_gpu_bfyx_gemm_like",2], + "3806131437010910920": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "18357544235608006954": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "14389719202147508599": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "1509728225855233852": ["convolution_gpu_bfyx_gemm_like",2], + "10308113903347312964": ["convolution_gpu_bfyx_gemm_like",2], + "3499645386058307669": ["convolution_gpu_bfyx_gemm_like",1], + "10294610483561043024": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "15048584393463312977": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "17381516856910544374": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "18180655791734632264": ["convolution_gpu_bfyx_gemm_like",2], + "5097818987523855112": ["convolution_gpu_bfyx_gemm_like",2], + "8943913562339525413": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4216958486055161753": ["convolution_gpu_bfyx_os_iyx_osv16",105], + "5240706676373148280": ["convolution_gpu_bfyx_gemm_like",2], + "5245526691775741296": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12790570304622911607": ["convolution_gpu_bfyx_os_iyx_osv16",565], + "4974320417566990034": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8951040603784899163": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "16292848987976256449": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "12494969618927201911": ["fully_connected_gpu_yxfb_ref",2], + "18017913952946745878": ["convolution_gpu_bfyx_gemm_like",2], + "11359409533744011242": ["convolution_gpu_bfyx_gemm_like",2], + "8505040075968411726": ["convolution_gpu_bfyx_gemm_like",1], + "7432142107544210174": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "14316077757957132678": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "9542325095876448686": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4804533178560338520": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "13006774775034887171": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "10771803503544737080": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "17922279129043570176": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "5334291640387922287": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "13314092088416047551": ["fully_connected_gpu_yxfb_ref",1], + "17704040183891532914": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "16065744898134487748": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "10544034939133448916": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "760687670112194844": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "4279062247055842367": ["convolution_gpu_bfyx_gemm_like",1], + "6423785822515265784": ["convolution_gpu_bfyx_gemm_like",2], + "3017411837779243878": ["convolution_gpu_bfyx_gemm_like",2], + "3873183249402084406": ["convolution_gpu_bfyx_gemm_like",1], + "2780423409483867058": ["convolution_gpu_bfyx_1x1",2], + "3266557807508325807": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "3699344686791530101": ["convolution_gpu_bfyx_gemm_like",2], + "3221469860582147955": ["convolution_gpu_bfyx_gemm_like",2], + "7056293586529818253": ["convolution_gpu_bfyx_gemm_like",2], + "12024817951074673335": ["convolution_gpu_bfyx_1x1",1], + "14883438809987378616": ["convolution_gpu_bfyx_1x1",1], + "12947341728489226671": 
["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "731825454731954517": ["convolution_gpu_bfyx_gemm_like",2], + "8463615810239412362": ["convolution_gpu_bfyx_1x1",2], + "2715447739580688669": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "7183578232279711009": ["convolution_gpu_bfyx_gemm_like",2], + "2204178900998688268": ["convolution_gpu_bfyx_gemm_like",2], + "18082422341304348326": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "712165731154577189": ["convolution_gpu_bfyx_os_iyx_osv16",224], + "15201438563802430490": ["fully_connected_gpu_fb_oi_ref",1], + "18215430801133520364": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13448845356783404653": ["convolution_gpu_bfyx_gemm_like",1], + "9468684953949274635": ["convolution_gpu_bfyx_gemm_like",1], + "7009459929666511861": ["convolution_gpu_bfyx_1x1",1], + "142270860894725256": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "11795826875463204296": ["convolution_gpu_bfyx_1x1",2], + "8689206546467098603": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "10151922632636937118": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15412447128995361859": ["convolution_gpu_bfyx_gemm_like",1], + "8532217744217419503": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "1171681987783013074": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2242602888499888844": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "12004552919019936392": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12480527132372884168": ["convolution_gpu_bfyx_1x1",1], + "7840653268996892538": ["convolution_gpu_bfyx_gemm_like",2], + "7104309382120208659": ["convolution_gpu_bfyx_gemm_like",2], + "12194037100109755112": ["convolution_gpu_bfyx_gemm_like",2], + "4400247897123856252": ["convolution_gpu_bfyx_os_iyx_osv16",1040], + "3002986032379998259": ["convolution_gpu_bfyx_os_iyx_osv16",527], + "5115007207028125638": ["convolution_gpu_bfyx_gemm_like",2], + "11806402239500046867": ["convolution_gpu_bfyx_gemm_like",2], + "12932635875905153141": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "6323083153920795679": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "5346898505346646714": ["convolution_gpu_bfyx_os_iyx_osv16",483], + "13830605041347009953": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "12177387334053203378": ["convolution_gpu_bfyx_gemm_like",2], + "12946531140050029900": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "4834446692898125871": ["convolution_gpu_bfyx_gemm_like",2], + "3236003754884728510": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "30229601562833524": ["convolution_gpu_bfyx_gemm_like",2], + "5572956736535433608": ["convolution_gpu_bfyx_1x1",2], + "10635659193402005820": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "3385797925880519845": ["convolution_gpu_bfyx_1x1",2], + "1596353239542510685": ["convolution_gpu_bfyx_gemm_like",2], + "18431306649860116380": ["convolution_gpu_bfyx_gemm_like",2], + "621915374938805401": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "16683169947375504066": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "9152433123828445089": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "18313088176414428990": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "14152716242882609401": ["convolution_gpu_bfyx_gemm_like",1], + "10532183096485321729": ["convolution_gpu_bfyx_1x1",2], + "16487774205195979355": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "1237262535285717993": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "8270591002934311024": ["convolution_gpu_bfyx_1x1",2], + "13853630125050609175": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "12937333118472722002": 
["convolution_gpu_bfyx_gemm_like",2], + "12054200116003751590": ["convolution_gpu_bfyx_os_iyx_osv16",483], + "8767817856303586064": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "5159738930501638535": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13308187548669026714": ["convolution_gpu_bfyx_1x1",2], + "15690161340392005765": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10724501418439612080": ["convolution_gpu_bfyx_gemm_like",1], + "3976736548270395981": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6142707387281700290": ["convolution_gpu_bfyx_gemm_like",2], + "15773157615731010456": ["convolution_gpu_bfyx_gemm_like",2], + "1711220333751274603": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "3541538046227217664": ["convolution_gpu_bfyx_direct_10_12_16",2], + "541817615957967731": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "11855070245618904113": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "4513063773753763458": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "18037918102910297531": ["convolution_gpu_bfyx_os_iyx_osv16",732], + "4889188980319017094": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "10681768474583067517": ["convolution_gpu_bfyx_gemm_like",1], + "11315238071192463859": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "10979362792894404338": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "9040145293899470160": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "11999246609107242706": ["convolution_gpu_bfyx_gemm_like",2], + "9328223957245552723": ["convolution_gpu_bfyx_gemm_like",2], + "5214654427283761256": ["convolution_gpu_bfyx_gemm_like",2], + "7958443549125799229": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "3854114166348568039": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "331661172067077796": ["convolution_gpu_bfyx_1x1",2], + "13161997040644039778": ["convolution_gpu_bfyx_gemm_like",2], + "11872464450773754851": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "3515437649977762166": ["convolution_gpu_bfyx_gemm_like",1], + "12495003066477974474": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "5429130923188159806": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "13809898858049445969": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "6192955702438301372": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "7667898603371717971": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1697248235682953135": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "16511393582666965704": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "12134858519320245809": ["convolution_gpu_bfyx_1x1",2], + "11147573971701279689": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "11666226259183201584": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "5538883245745495145": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "490931535580183607": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "6020017927557041768": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11690334177981352452": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "10894058425957901202": ["convolution_gpu_bfyx_1x1",2], + "1982176363226079588": ["convolution_gpu_bfyx_gemm_like",2], + "9533360488591027707": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "11862259122805366807": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "6156831095718536092": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "6290584630172122012": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "6109013751635776331": ["convolution_gpu_bfyx_gemm_like",2], + "9076758673133996959": ["convolution_gpu_bfyx_gemm_like",2], + "4644580321919256401": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "2421404763191415191": 
["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "135072053401934228": ["convolution_gpu_bfyx_1x1",2], + "9967101735808367971": ["convolution_gpu_bfyx_1x1",2], + "16577611471466452776": ["convolution_gpu_bfyx_gemm_like",2], + "17791024851737594885": ["convolution_gpu_bfyx_1x1",2], + "4850497746076450913": ["convolution_gpu_bfyx_gemm_like",2], + "584086621952390547": ["convolution_gpu_bfyx_gemm_like",2], + "8803037667261582905": ["convolution_gpu_bfyx_gemm_like",1], + "17377315194963069204": ["fully_connected_gpu_fb_oi_ref",1], + "16915857558806082023": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "15796677813117622429": ["convolution_gpu_bfyx_gemm_like",2], + "16139615240471264488": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "14571022040013651253": ["convolution_gpu_bfyx_gemm_like",2], + "15031155621982459860": ["convolution_gpu_bfyx_gemm_like",2], + "4381329435655511217": ["convolution_gpu_bfyx_os_iyx_osv16",729], + "17649961873981897621": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "18121198117765854866": ["convolution_gpu_bfyx_1x1",2], + "4865102850562917067": ["convolution_gpu_bfyx_os_iyx_osv16",855], + "11636129433022017868": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "16566214123371867456": ["convolution_gpu_bfyx_gemm_like",2], + "14230493618724018658": ["convolution_gpu_bfyx_gemm_like",2], + "4523064418696274869": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "16105073808368936420": ["convolution_gpu_bfyx_gemm_like",2], + "14114380593731243715": ["convolution_gpu_bfyx_os_iyx_osv16",167], + "13160712904661288567": ["convolution_gpu_bfyx_1x1",1], + "669771152920944125": ["convolution_gpu_bfyx_gemm_like",2], + "631489011812924153": ["convolution_gpu_bfyx_1x1",2], + "9319254979377483709": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "13787436604877398090": ["convolution_gpu_bfyx_os_iyx_osv16",1032], + "5008350851224686853": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "16409729623371222748": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "4615708568396290002": ["convolution_gpu_bfyx_1x1",2], + "9208964785762052001": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "671453551040072499": ["convolution_gpu_bfyx_gemm_like",2], + "7998930863626763670": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "7375461241315602473": ["convolution_gpu_bfyx_gemm_like",2], + "13471752029049484143": ["convolution_gpu_bfyx_gemm_like",2], + "16152775342222431281": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5955575949957198434": ["convolution_gpu_bfyx_gemm_like",1], + "11931568365395665142": ["convolution_gpu_bfyx_gemm_like",2], + "8236018377815149638": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "3774285301357006334": ["convolution_gpu_bfyx_gemm_like",1], + "4897448054295474302": ["convolution_gpu_bfyx_gemm_like",2], + "10722677916294015259": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "13938466156916423478": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "15052577143485630617": ["convolution_gpu_bfyx_1x1",2], + "8712136292276123857": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "16818206615424635387": ["convolution_gpu_bfyx_1x1",1], + "7875724726741958520": ["fully_connected_gpu_fb_oi_ref",2], + "1186545671730357033": ["convolution_gpu_bfyx_os_iyx_osv16",1024], + "11132679855317294753": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4135068756462147853": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17806712457019493207": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "16293465561256937726": ["convolution_gpu_bfyx_os_iyx_osv16",665], + "17658152048177750315": 
["convolution_gpu_bfyx_os_iyx_osv16",208], + "16768797136991242472": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "15989894214714907271": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "7264274394359484318": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "10218763091060511457": ["convolution_gpu_bfyx_os_iyx_osv16",103], + "4084026445911476156": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "5042176052323856983": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4465781406991476376": ["convolution_gpu_bfyx_os_iyx_osv16",287], + "13754540732991287617": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "4339711224604149541": ["convolution_gpu_bfyx_gemm_like",2], + "17209528805596238905": ["convolution_gpu_bfyx_gemm_like",2], + "805131056816361237": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "5911282942658469852": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9795194069954915563": ["convolution_gpu_bfyx_gemm_like",2], + "4398371999113956082": ["convolution_gpu_bfyx_gemm_like",2], + "852015206582470545": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "13485300684443803732": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "12727541507197887360": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "16362857896338778056": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "17770104464900126615": ["convolution_gpu_bfyx_1x1",2], + "826850797666395121": ["convolution_gpu_bfyx_gemm_like",2], + "9372916528346260712": ["convolution_gpu_bfyx_gemm_like",2], + "15322609677356616580": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "15914342421266687768": ["convolution_gpu_bfyx_gemm_like",2], + "18218755616248669884": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "16542318967217020315": ["convolution_gpu_bfyx_gemm_like",2], + "9280431727790048190": ["convolution_gpu_bfyx_1x1",2], + "87031578643428011": ["convolution_gpu_bfyx_1x1",2], + "16567486018945740036": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15739278428190392018": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15991460001131903561": ["convolution_gpu_bfyx_gemm_like",2], + "16075006181495932250": ["convolution_gpu_bfyx_gemm_like",2], + "786401653335542559": ["convolution_gpu_bfyx_gemm_like",2], + "4856470441452830056": ["convolution_gpu_bfyx_gemm_like",2], + "9545968464906009869": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "12566041126392848976": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "12118387933632797428": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "18103534417093702556": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "142650579335909103": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "13357365044448426880": ["convolution_gpu_bfyx_1x1",2], + "15677717057398875599": ["convolution_gpu_bfyx_gemm_like",1], + "14116800584981026541": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "15488340031228619748": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "11724225282274130518": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "6391201577234440562": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "7770000755097925765": ["convolution_gpu_bfyx_1x1",2], + "8096131027165540886": ["convolution_gpu_bfyx_gemm_like",2], + "17908444616754154471": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "17252449599613270108": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "12259844988981080505": ["convolution_gpu_bfyx_gemm_like",2], + "4759671642533786591": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "1997392406402548974": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "16661843849495077745": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "18253784177599134876": 
["convolution_gpu_bfyx_os_iyx_osv16",562], + "1902656726461670148": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "12676167240795292217": ["convolution_gpu_bfyx_gemm_like",1], + "13320675959188615441": ["convolution_gpu_bfyx_gemm_like",2], + "12782932626966309185": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "739676584505475609": ["convolution_gpu_bfyx_gemm_like",2], + "2800949804770763798": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6964383468476265892": ["convolution_gpu_bfyx_1x1",1], + "13820498543284008286": ["convolution_gpu_bfyx_gemm_like",2], + "7869779894480025247": ["convolution_gpu_bfyx_gemm_like",2], + "8155268141318893606": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9585113116232600562": ["convolution_gpu_bfyx_gemm_like",1], + "7843508201826629532": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "7082007579524697455": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12864558900883069118": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "875296362957469305": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "16800575429414554907": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "9641089659148164809": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3372770576629463160": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "1591199515536783245": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "9407646138658641974": ["convolution_gpu_bfyx_gemm_like",1], + "9440117898128288296": ["convolution_gpu_bfyx_gemm_like",2], + "16327433707667075261": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "17370158297470557151": ["convolution_gpu_bfyx_1x1",2], + "4138968242532400395": ["convolution_gpu_bfyx_gemm_like",1], + "16748662918272106932": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5374664689223295796": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "3190494353583341446": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7585184325339753737": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "3499406509137418124": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "17207560805775399864": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18270587701371596297": ["convolution_gpu_bfyx_os_iyx_osv16",198], + "2294800960010879540": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "8792202318168046223": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "14466032674083938714": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6003409324516527726": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12393385058735194260": ["convolution_gpu_bfyx_gemm_like",2], + "7940369586324090841": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "818998169319147148": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11807282628372660280": ["convolution_gpu_bfyx_1x1",2], + "4160656836528944651": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "13851851281384416649": ["convolution_gpu_bfyx_1x1",1], + "16677044352793659175": ["convolution_gpu_bfyx_gemm_like",1], + "4455369117448405874": ["convolution_gpu_bfyx_1x1",2], + "10179916356323479080": ["convolution_gpu_bfyx_gemm_like",2], + "15133468875250992696": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "12831123539633580270": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "18094205332383644037": ["convolution_gpu_bfyx_os_iyx_osv16",179], + "14616969385577243225": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "15295951849706930711": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "16364494883229084045": ["convolution_gpu_bfyx_gemm_like",2], + "15006321421735686121": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "7715649642603303319": ["convolution_gpu_bfyx_1x1",2], + "8791285622784082122": 
["convolution_gpu_bfyx_os_iyx_osv16",932], + "8860443174052454332": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16921939234324970069": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "11987564534722442223": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "59356084516953804": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "17264608538692763688": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "467975197394411990": ["convolution_gpu_bfyx_gemm_like",1], + "11031358859656806724": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "6556424924189200804": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "4967444801764057340": ["convolution_gpu_bfyx_os_iyx_osv16",361], + "7689320135952025041": ["convolution_gpu_bfyx_gemm_like",2], + "14123081378489325832": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "2936333406928424760": ["convolution_gpu_bfyx_1x1",2], + "8541982562061181756": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10387844339156517393": ["convolution_gpu_bfyx_1x1",2], + "11823205954749139338": ["convolution_gpu_bfyx_gemm_like",2], + "1345101751956733589": ["convolution_gpu_bfyx_gemm_like",2], + "16245760498096322525": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "15727611564408173858": ["convolution_gpu_bfyx_gemm_like",1], + "15249442550355454201": ["convolution_gpu_bfyx_gemm_like",2], + "16986610822918634530": ["convolution_gpu_bfyx_1x1",2], + "12028665820838352309": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13856271274572142709": ["convolution_gpu_bfyx_gemm_like",1], + "7649413902932043811": ["convolution_gpu_bfyx_gemm_like",2], + "2339864165283480961": ["convolution_gpu_bfyx_1x1",2], + "2162882863309264684": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "9589942627115344216": ["convolution_gpu_bfyx_os_iyx_osv16",102], + "10323345824599612614": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "17174919737114915467": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "17891499682354369344": ["convolution_gpu_bfyx_gemm_like",2], + "7474592508575297101": ["convolution_gpu_bfyx_1x1",2], + "3316798708399098230": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "17092525789052598917": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "5295693108687178880": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8619526128410675593": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "2226745622763268469": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "1885075753696445410": ["convolution_gpu_bfyx_direct_10_12_16",0], + "5592526760253524303": ["convolution_gpu_bfyx_os_iyx_osv16",801], + "16117448559783537844": ["convolution_gpu_bfyx_os_iyx_osv16",713], + "6664432489777052771": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "5657471280535146301": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17281826959243966826": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8576733135863336233": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "7121708962074176240": ["convolution_gpu_bfyx_1x1",2], + "11800783548769329949": ["convolution_gpu_bfyx_gemm_like",2], + "13296242326766100583": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "10509933181132310969": ["convolution_gpu_bfyx_gemm_like",2], + "14398854364550406668": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "708452703070938673": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "2369451367723962073": ["convolution_gpu_bfyx_1x1",2], + "11164519756679631743": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "13434576226708227155": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "345043289576587800": ["convolution_gpu_bfyx_1x1",2], + "13264617841270329349": ["convolution_gpu_bfyx_1x1",2], + 
"16911450336605071390": ["convolution_gpu_bfyx_1x1",2], + "12309955719964788034": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "3240102173773280414": ["convolution_gpu_bfyx_1x1",2], + "11421180829679625737": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "13145474177271090694": ["convolution_gpu_bfyx_os_iyx_osv16",950], + "2524029454785583409": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "9748307611165615848": ["convolution_gpu_bfyx_gemm_like",2], + "11988546375476924356": ["convolution_gpu_bfyx_os_iyx_osv16",431], + "11841034668170849494": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12058759356433220258": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "17778091287904736965": ["convolution_gpu_bfyx_gemm_like",2], + "9131235538209388787": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "13851240591038949807": ["convolution_gpu_bfyx_gemm_like",2], + "5374969798377773063": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "17147293671640396193": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "12725675221990905186": ["convolution_gpu_bfyx_gemm_like",2], + "16723478941106779069": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "11640225461345567929": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "16610284927818475574": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "3872151366780051246": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14001406016806064079": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4750513665628842598": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "10512507780534402341": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "1251525426317284548": ["convolution_gpu_bfyx_os_iyx_osv16",756], + "13472577372534605883": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10536316961655703500": ["convolution_gpu_bfyx_os_iyx_osv16",199], + "8130920994920685157": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "4718716595177056289": ["convolution_gpu_bfyx_os_iyx_osv16",986], + "1952863937205473292": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "10546430708947911124": ["convolution_gpu_bfyx_gemm_like",0], + "6931953332823066530": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "8975333906619899020": ["convolution_gpu_bfyx_gemm_like",2], + "2114599010013594942": ["convolution_gpu_bfyx_gemm_like",2], + "15530407024531326375": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "9525535670799618110": ["convolution_gpu_bfyx_gemm_like",2], + "16588325081458426169": ["convolution_gpu_bfyx_gemm_like",2], + "16027853590391209100": ["convolution_gpu_bfyx_gemm_like",2], + "17640725195881101275": ["convolution_gpu_bfyx_gemm_like",2], + "10480527638577674825": ["convolution_gpu_bfyx_1x1",2], + "3441335188113424896": ["convolution_gpu_bfyx_gemm_like",2], + "8451212914744825089": ["convolution_gpu_bfyx_gemm_like",2], + "13776178598632392721": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "14885031472057965707": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "49948277487706148": ["convolution_gpu_bfyx_1x1",2], + "13839116996827687373": ["convolution_gpu_bfyx_gemm_like",2], + "9692654253261175490": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "16235115911229280717": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "13483175684542464385": ["convolution_gpu_bfyx_os_iyx_osv16",148], + "10437367877444543776": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "6584960721513702502": ["convolution_gpu_bfyx_gemm_like",1], + "4098191685457418125": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "4672441137336208890": ["convolution_gpu_bfyx_gemm_like",2], + "9562527071055150197": ["convolution_gpu_bfyx_1x1",2], + "5060012838564094182": 
["convolution_gpu_bfyx_os_iyx_osv16",154], + "1089944493540593798": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "11007944497812650617": ["convolution_gpu_bfyx_gemm_like",2], + "11330591026581463934": ["convolution_gpu_bfyx_gemm_like",2], + "12229574562535756991": ["convolution_gpu_bfyx_gemm_like",2], + "3383222668132648804": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3509487327001107638": ["convolution_gpu_bfyx_gemm_like",2], + "4274425737610351312": ["convolution_gpu_bfyx_gemm_like",2], + "17636500109629107732": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "4819131094439732065": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "548663565933738403": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "503369896500284129": ["convolution_gpu_bfyx_1x1",2], + "10447947790216991304": ["convolution_gpu_bfyx_gemm_like",2], + "4282668574670785584": ["convolution_gpu_bfyx_gemm_like",2], + "12450814729547235386": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "6312971928547466668": ["convolution_gpu_bfyx_os_iyx_osv16",1039], + "15493488989417521388": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "13474805373264874144": ["convolution_gpu_bfyx_1x1",2], + "2245166025103475783": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "5349415632630235233": ["convolution_gpu_bfyx_1x1",2], + "9131183544020825260": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "15363606233048272809": ["convolution_gpu_bfyx_1x1",2], + "10280619408766255552": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "9530116228032101908": ["convolution_gpu_bfyx_1x1",1], + "314054598858070952": ["convolution_gpu_bfyx_gemm_like",2], + "15047676717402283805": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "16862145184923128012": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "2554991397391195611": ["convolution_gpu_bfyx_os_iyx_osv16",184], + "3244675355773468991": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "6288489890578212082": ["convolution_gpu_bfyx_gemm_like",2], + "10785966734346479177": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "16020916772006653269": ["convolution_gpu_bfyx_1x1",1], + "5720964268093705079": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15779837958180258409": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "16084700435355748612": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "17477062954520561609": ["convolution_gpu_bfyx_gemm_like",2], + "994842991399671507": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13338594271376045657": ["convolution_gpu_bfyx_gemm_like",2], + "18426893729833771809": ["convolution_gpu_bfyx_1x1",2], + "10672380526821947133": ["convolution_gpu_bfyx_os_iyx_osv16",339], + "12046017161414846599": ["convolution_gpu_bfyx_1x1",2], + "8709632541892447149": ["convolution_gpu_bfyx_gemm_like",2], + "60267878504897170": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "14098811155652990436": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3646228701104397128": ["convolution_gpu_bfyx_os_iyx_osv16",173], + "9657324846330221372": ["convolution_gpu_bfyx_1x1",2], + "16384186388687043048": ["convolution_gpu_bfyx_os_iyx_osv16",549], + "16924006268301179157": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "16717713360264747483": ["convolution_gpu_bfyx_gemm_like",2], + "2832268621630415376": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "15822546325822628634": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "5459463503840817402": ["convolution_gpu_bfyx_1x1",2], + "15814015810740458605": ["convolution_gpu_bfyx_1x1",2], + "537074122417021898": ["convolution_gpu_bfyx_os_iyx_osv16",100], + "11239541755868028928": 
["convolution_gpu_bfyx_os_iyx_osv16",934], + "7578177053220150569": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "16888412539296862194": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "13902214851539825156": ["convolution_gpu_bfyx_gemm_like",2], + "1245259979364728404": ["convolution_gpu_bfyx_1x1",2], + "2625969259447793593": ["convolution_gpu_bfyx_1x1",2], + "6101196122606108273": ["convolution_gpu_bfyx_gemm_like",2], + "15188570678726970998": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11645116728396933125": ["convolution_gpu_bfyx_gemm_like",2], + "7585785802379042424": ["convolution_gpu_bfyx_1x1",2], + "6407471972820516685": ["fully_connected_gpu_fb_oi_ref",0], + "3534971503826416049": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "16347412180100581330": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "9939234037869927090": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "17788367809717898285": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "14403132596827435096": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "3141773224039276177": ["convolution_gpu_bfyx_1x1",2], + "3141886504884887200": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "8527193566719173253": ["convolution_gpu_bfyx_gemm_like",2], + "4238885454989272754": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "1230262279011217327": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1361159591875955678": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "13094402291968806996": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "6911215749850066204": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "13132804928635689780": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "4491380839102267034": ["convolution_gpu_bfyx_gemm_like",1], + "5497751772699578150": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5012013738970489338": ["convolution_gpu_bfyx_1x1",1], + "4652136280940317116": ["convolution_gpu_bfyx_os_iyx_osv16",1116], + "3750338655074082587": ["fully_connected_gpu_yxfb_ref",0], + "12107262410635772120": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "9584652777232392944": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "16765994345605657100": ["convolution_gpu_bfyx_1x1",1], + "13575423234109624706": ["fully_connected_gpu_yxfb_ref",2], + "3114869763557037270": ["fully_connected_gpu_fb_oi_ref",1], + "10292585962794261197": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "3202085450628781999": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "10914921540144371519": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4959403414256988744": ["convolution_gpu_bfyx_gemm_like",1], + "863057075064640334": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "8671491767142900139": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "10766317990628501609": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "15678385128478075284": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14808895254077106198": ["convolution_gpu_bfyx_gemm_like",2], + "5440983284868981549": ["convolution_gpu_bfyx_gemm_like",2], + "2921118493468368908": ["convolution_gpu_bfyx_gemm_like",1], + "991586070509079617": ["convolution_gpu_bfyx_gemm_like",0], + "8058419689646625853": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "3392693938352572136": ["convolution_gpu_bfyx_gemm_like",2], + "10330180429524641331": ["convolution_gpu_bfyx_gemm_like",2], + "4622514167765722873": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "7465681710653503161": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "9728611486592854529": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4436244774193918646": ["fully_connected_gpu_fb_oi_ref",1], + "12531580106484042446": 
["convolution_gpu_bfyx_os_iyx_osv16",533], + "787203599734115483": ["convolution_gpu_bfyx_1x1",1], + "8519354640245415816": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14553856088069405595": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "16773645387243701837": ["convolution_gpu_bfyx_gemm_like",2], + "2438261005924916746": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "856877003890134554": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "8069537351442302814": ["convolution_gpu_bfyx_os_iyx_osv16",271], + "15899192375330393731": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "11418379777288974452": ["convolution_gpu_bfyx_gemm_like",2], + "73865742350616903": ["convolution_gpu_bfyx_gemm_like",1], + "138379779469699309": ["convolution_gpu_bfyx_gemm_like",2], + "10106454449619141260": ["convolution_gpu_bfyx_1x1",2], + "15820359925623438341": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1173986078589662704": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "17830290099875088207": ["convolution_gpu_bfyx_gemm_like",2], + "10565371760124443824": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "6713985030102340818": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "16925721317097534009": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "9477562342190423343": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "13183380647506951324": ["convolution_gpu_bfyx_gemm_like",2], + "13488495920546871271": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "16767392067294252396": ["convolution_gpu_bfyx_gemm_like",2], + "17651821953342321913": ["convolution_gpu_bfyx_1x1",2], + "851057218719456209": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "3603187029740446600": ["convolution_gpu_bfyx_gemm_like",2], + "18299254635579957284": ["convolution_gpu_bfyx_1x1",2], + "13364676690016875118": ["convolution_gpu_bfyx_os_iyx_osv16",926], + "7531346828150129063": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "15223164574152266895": ["convolution_gpu_bfyx_1x1",2], + "12977678792503377525": ["convolution_gpu_bfyx_gemm_like",1], + "3501667344669686338": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5600807544955072308": ["convolution_gpu_bfyx_gemm_like",2], + "7481256533438761028": ["convolution_gpu_bfyx_gemm_like",2], + "4118073384938355655": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "7974670633697926450": ["convolution_gpu_bfyx_1x1",1], + "8913950860101596091": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "12806934028210472719": ["convolution_gpu_bfyx_gemm_like",2], + "12589440296742583335": ["convolution_gpu_bfyx_1x1",2], + "2797723586312707948": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "12489973984967168447": ["convolution_gpu_bfyx_1x1",2], + "4640696923527766618": ["convolution_gpu_bfyx_gemm_like",2], + "9767294641786972359": ["convolution_gpu_bfyx_gemm_like",2], + "8792010676469476740": ["convolution_gpu_bfyx_gemm_like",2], + "14335074487552883436": ["convolution_gpu_bfyx_gemm_like",2], + "6509758095668864050": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "2008424849669196225": ["convolution_gpu_bfyx_1x1",2], + "15497797842820949408": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2116913943188857359": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "9144487908815767824": ["convolution_gpu_bfyx_1x1",1], + "15800447082078291243": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "17746215841755337461": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3635446784873718932": ["convolution_gpu_bfyx_gemm_like",2], + "12391792381149655331": ["convolution_gpu_bfyx_gemm_like",2], + "1419073145594317633": ["convolution_gpu_bfyx_os_iyx_osv16",586], + 
"8746621720912032145": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "13190888313721073437": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "3568514382399560386": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "973966345068677905": ["convolution_gpu_bfyx_1x1",2], + "14126906427006602775": ["convolution_gpu_bfyx_1x1",2], + "6664482192233202590": ["convolution_gpu_bfyx_gemm_like",2], + "11744368351982723504": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "1485662490111767875": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "13328911884191551889": ["convolution_gpu_bfyx_1x1",2], + "15112599407339712681": ["convolution_gpu_bfyx_1x1",2], + "3522383297921565178": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "10935309102034762723": ["convolution_gpu_bfyx_1x1",1], + "17515847111676784130": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "13565691057064774487": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "10014448860206587805": ["convolution_gpu_bfyx_gemm_like",1], + "8640150341228170279": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "1170380397764345558": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10682300249493137042": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "16033512206711124104": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "12908594497114706897": ["convolution_gpu_bfyx_1x1",2], + "18267428053198215471": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "6318228858846223186": ["convolution_gpu_bfyx_1x1",1], + "14487842225000203929": ["convolution_gpu_bfyx_gemm_like",2], + "6760797535531423152": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "12985942652866621579": ["fully_connected_gpu_fb_io_ref",2], + "4006884370026272807": ["convolution_gpu_bfyx_gemm_like",2], + "1040030752340209480": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "10486348549691280032": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "11305232900158601613": ["convolution_gpu_bfyx_1x1",2], + "13596876807637507229": ["convolution_gpu_bfyx_1x1",2], + "8354579049246302728": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "1122856374602590533": ["convolution_gpu_bfyx_1x1",1], + "15308578014507211237": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "13734043898517059207": ["convolution_gpu_bfyx_gemm_like",1], + "12641170321047008726": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "5056859994174498686": ["convolution_gpu_bfyx_gemm_like",1], + "9794456440994218671": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "4610200388191607540": ["convolution_gpu_bfyx_gemm_like",2], + "2609454334520044465": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "16839741351990811959": ["convolution_gpu_bfyx_gemm_like",2], + "4353842547963164546": ["convolution_gpu_bfyx_1x1",2], + "3409043224171087168": ["convolution_gpu_bfyx_os_iyx_osv16",640], + "4121109463284708890": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "15817443774186015593": ["convolution_gpu_bfyx_1x1",2], + "5269172622193124300": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "12026482841341343242": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "14362876471450307424": ["convolution_gpu_bfyx_1x1",2], + "14312549767853703411": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "7868973874302246233": ["convolution_gpu_bfyx_gemm_like",1], + "2727175120437582536": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9314293064351558241": ["convolution_gpu_bfyx_gemm_like",2], + "10930640103080573253": ["convolution_gpu_bfyx_1x1",2], + "17948637243158994878": ["convolution_gpu_bfyx_gemm_like",2], + "10133054058562198093": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8921636651939679647": 
["convolution_gpu_bfyx_1x1",1], + "9090828337597312855": ["convolution_gpu_bfyx_gemm_like",2], + "10923480230259977438": ["convolution_gpu_bfyx_1x1",1], + "3105425187506203551": ["convolution_gpu_bfyx_1x1",2], + "16158139166784964096": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "6942016672941874829": ["convolution_gpu_bfyx_gemm_like",2], + "7843498978148810586": ["convolution_gpu_bfyx_os_iyx_osv16",987], + "13781423818051299677": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "5658664813683907476": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "5682190700442712936": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "17344974951998490453": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7870154008378361670": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15216108478837665623": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "2439993891369206440": ["convolution_gpu_bfyx_1x1",2], + "1299545313185409227": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "4617347486560666277": ["convolution_gpu_bfyx_1x1",1], + "4635570915184713874": ["convolution_gpu_bfyx_gemm_like",2], + "17724604495865223459": ["convolution_gpu_bfyx_gemm_like",2], + "5321698540631249776": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "11198301748997371475": ["convolution_gpu_bfyx_gemm_like",1], + "5498839261395459224": ["convolution_gpu_bfyx_gemm_like",1], + "4129722446574108695": ["convolution_gpu_bfyx_1x1",2], + "16361932270527364507": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "11857037689248685487": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "8907982643256296667": ["convolution_gpu_bfyx_1x1",1], + "10100237101982273901": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "3438296636411972401": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2394023805427701338": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "13352000946213986936": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "9643408025778914022": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "4628748977913534701": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "8203171222962341018": ["convolution_gpu_bfyx_gemm_like",2], + "18067291256808591467": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5195511638783481084": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "17170858505976681742": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "4104562704039821482": ["convolution_gpu_bfyx_1x1",2], + "3816674884393241704": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "14930789530046665855": ["convolution_gpu_bfyx_gemm_like",2], + "17152614235879767116": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "1354647381212852890": ["convolution_gpu_bfyx_1x1",2], + "10171373375072694210": ["convolution_gpu_bfyx_1x1",2], + "12055647521556218046": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "178353385245384751": ["convolution_gpu_bfyx_os_iyx_osv16",969], + "7372956570616880244": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "8439950151963452285": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "794499287296495726": ["convolution_gpu_bfyx_1x1",2], + "12946540633035976364": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "778476198101178556": ["convolution_gpu_bfyx_gemm_like",1], + "6870942166356599956": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "13302687772426736346": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "17790026124881397912": ["fully_connected_gpu_fb_io_ref",2], + "7171904645566467208": ["convolution_gpu_bfyx_gemm_like",2], + "2668729552208169959": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16574710115918192418": ["convolution_gpu_bfyx_gemm_like",2], + "9999553425206328238": 
["convolution_gpu_bfyx_os_iyx_osv16",334], + "9947449295659685973": ["convolution_gpu_bfyx_gemm_like",1], + "3621424752591567930": ["convolution_gpu_bfyx_gemm_like",2], + "5648658688155716974": ["convolution_gpu_bfyx_1x1",2], + "8922929126299811091": ["convolution_gpu_bfyx_1x1",2], + "11604794601689380990": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "3573490922300056520": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "3017824560305532066": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "17705807503894740726": ["convolution_gpu_bfyx_gemm_like",2], + "7397341452130124383": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "16681690088928624738": ["convolution_gpu_bfyx_gemm_like",2], + "13478984039708550410": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "7548031489690889629": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "7916244303189113815": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "13450061819089402572": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10883341041912056319": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "12351866693978844266": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "15225354446874994535": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "5840254078917931433": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "9695024256541464964": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "9987415314864002460": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7846384623429362522": ["convolution_gpu_bfyx_1x1",1], + "17329287216741045059": ["convolution_gpu_bfyx_gemm_like",2], + "7706714181281908433": ["convolution_gpu_bfyx_gemm_like",2], + "7235358742317442134": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "2817919813339364130": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12822126914959112382": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "4571404165794634411": ["convolution_gpu_bfyx_1x1",2], + "9111988592015450418": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "14082448162400225052": ["convolution_gpu_bfyx_1x1",1], + "13621339501067135142": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2866656294663853474": ["convolution_gpu_bfyx_1x1",2], + "1701412735970485849": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "7953255701516490034": ["convolution_gpu_bfyx_os_iyx_osv16",43], + "5393510569127725391": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "8168240543278779314": ["convolution_gpu_bfyx_1x1",1], + "2777318471329665162": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "9989055862610193828": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "1367483816197881270": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "7708321360699824256": ["convolution_gpu_bfyx_direct_10_12_16",2], + "632116056424249698": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15943141845766932879": ["convolution_gpu_bfyx_1x1",2], + "10626341369865893888": ["convolution_gpu_bfyx_gemm_like",2], + "18059267466971880386": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "142329025839464842": ["convolution_gpu_bfyx_1x1",2], + "11828175723996627443": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "7005509036795164602": ["convolution_gpu_bfyx_1x1",2], + "17723621158215826108": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "12675840135830047968": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4914435717288687793": ["convolution_gpu_bfyx_1x1",1], + "9541630719145326121": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "2613462626256090659": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "17525564757769958678": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "18432787283148809023": ["convolution_gpu_bfyx_os_iyx_osv16",742], + 
"5853697372844744672": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "6104380778870471127": ["convolution_gpu_bfyx_1x1",2], + "16706244336960642883": ["convolution_gpu_bfyx_gemm_like",1], + "474139120607442270": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12936220888307335332": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "8431759922045602848": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6398819277350155011": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "2173867324489962689": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13425251102263428554": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "15190508870639648203": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7209217811135076623": ["convolution_gpu_bfyx_gemm_like",2], + "11020315012951440351": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "8561261337239934159": ["convolution_gpu_bfyx_direct_10_12_16",0], + "9606639214735570069": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "3399406641489305996": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "12003323477818208825": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "16729849855476690294": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "14283458015244508428": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8490260671996115530": ["convolution_gpu_bfyx_gemm_like",1], + "7314288062932060863": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "16011429608661242565": ["convolution_gpu_bfyx_gemm_like",2], + "2041212737963974230": ["convolution_gpu_bfyx_gemm_like",2], + "7693459946348737411": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "14578867494693499627": ["convolution_gpu_bfyx_gemm_like",2], + "14905520834426630145": ["convolution_gpu_bfyx_gemm_like",2], + "9502195532658935521": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15901724303713479611": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "6988492019664525206": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "10883992248631603006": ["convolution_gpu_bfyx_os_iyx_osv16",483], + "3571959174116404960": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "17216583849049249733": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "9340159617983543624": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2920840796593281126": ["convolution_gpu_bfyx_gemm_like",2], + "101401523793806394": ["convolution_gpu_bfyx_gemm_like",2], + "1930929857644673460": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13893789954946953427": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "3398322619007806698": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10090036431487700311": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "12253049204822930675": ["convolution_gpu_bfyx_gemm_like",1], + "13602140021189675477": ["convolution_gpu_bfyx_gemm_like",2], + "10187930930336324253": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "14184895905338394239": ["convolution_gpu_bfyx_gemm_like",2], + "2622434279674583815": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "13993548620104010490": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14159596290442764023": ["convolution_gpu_bfyx_gemm_like",1], + "6114241186364821679": ["convolution_gpu_bfyx_os_iyx_osv16",856], + "13754408679115174221": ["convolution_gpu_bfyx_gemm_like",2], + "10404725818204494388": ["convolution_gpu_bfyx_gemm_like",2], + "7650375560336513366": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "14171139920084409181": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4156384238797998294": ["convolution_gpu_bfyx_os_iyx_osv16",275], + "4355933224673863178": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7015738038963065110": 
["convolution_gpu_bfyx_os_iyx_osv16",700], + "11565861421381730304": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "12417253210787537988": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "579781312141502576": ["convolution_gpu_bfyx_1x1",1], + "12198263593657033426": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "17854208422879910606": ["convolution_gpu_bfyx_gemm_like",2], + "3950738240651133849": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "5582450255753679095": ["convolution_gpu_bfyx_1x1",2], + "18126685473408206840": ["convolution_gpu_bfyx_os_iyx_osv16",526], + "13025361884606488732": ["convolution_gpu_bfyx_gemm_like",2], + "2172121470071868949": ["convolution_gpu_bfyx_gemm_like",2], + "9305861997313663528": ["convolution_gpu_bfyx_gemm_like",2], + "5003718302026277632": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "8398910340371320955": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14026537760442360645": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "10090923790949378407": ["convolution_gpu_bfyx_gemm_like",2], + "17408275657360833363": ["convolution_gpu_bfyx_1x1",2], + "13676654389512816868": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "10022487076451608714": ["convolution_gpu_bfyx_gemm_like",2], + "16531824466148265247": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "6726099352298108756": ["convolution_gpu_bfyx_direct_10_12_16",2], + "905526102343710614": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "17917978116807564183": ["convolution_gpu_bfyx_gemm_like",2], + "5926747396493954633": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "1914964404168211864": ["convolution_gpu_bfyx_gemm_like",2], + "10130171279527667782": ["convolution_gpu_bfyx_gemm_like",1], + "10037086825900566930": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "11727227430687227444": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10055549084854766170": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "12871555773123368130": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6603778920476932267": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13800760323805415740": ["convolution_gpu_bfyx_gemm_like",1], + "15035800097152337587": ["convolution_gpu_bfyx_gemm_like",2], + "3224352307778512793": ["convolution_gpu_bfyx_gemm_like",1], + "8106738346643994005": ["convolution_gpu_bfyx_gemm_like",2], + "15078590909693331731": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10316451248440741901": ["convolution_gpu_bfyx_gemm_like",1], + "1075027491444288875": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14447191095937730964": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "1920042803083729276": ["convolution_gpu_bfyx_os_iyx_osv16",600], + "2527276292172180386": ["convolution_gpu_bfyx_gemm_like",2], + "12510951219501865365": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "14115742296883450319": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5568753513029409478": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "14916625550370402883": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "2929190644951986399": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "11955992313739654625": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "7139714914586273766": ["convolution_gpu_bfyx_os_iyx_osv16",517], + "4849343880559509889": ["convolution_gpu_bfyx_1x1",2], + "17443356777503458523": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "16667887002111125871": ["convolution_gpu_bfyx_gemm_like",2], + "9406763539724266157": ["convolution_gpu_bfyx_1x1",2], + "4750755523645265967": ["convolution_gpu_bfyx_gemm_like",2], + "2732519635571994212": ["convolution_gpu_bfyx_os_iyx_osv16",234], 
+ "4809191606466167229": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10544411879329675593": ["convolution_gpu_bfyx_os_iyx_osv16",387], + "3780320160034246719": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "9299299311101549958": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "8444259010311137762": ["convolution_gpu_bfyx_os_iyx_osv16",668], + "15859493313686060349": ["convolution_gpu_bfyx_gemm_like",2], + "17921973525603585874": ["convolution_gpu_bfyx_gemm_like",2], + "18381791065890314250": ["convolution_gpu_bfyx_gemm_like",1], + "18122858611264877646": ["convolution_gpu_bfyx_gemm_like",2], + "3160543867929843861": ["convolution_gpu_bfyx_1x1",2], + "1218323229202187514": ["convolution_gpu_bfyx_gemm_like",2], + "1841155673858789206": ["fully_connected_gpu_fb_oi_ref",2], + "15675903059949404837": ["convolution_gpu_bfyx_1x1",2], + "16587061389996963349": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "17046662043776372746": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "9955939178447682108": ["convolution_gpu_bfyx_1x1",2], + "6648876837655776653": ["convolution_gpu_bfyx_1x1",2], + "7393601059996816014": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15979956159651515122": ["convolution_gpu_bfyx_gemm_like",2], + "3212789693085089063": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "7242013296950669829": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "2722124265986526212": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "8220168481755031959": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "14896875712028630045": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "5184121466994451498": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "12375919467924385618": ["convolution_gpu_bfyx_os_iyx_osv16",483], + "15329680728165965773": ["convolution_gpu_bfyx_gemm_like",2], + "9423958333298993923": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "5040095338370816349": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "4792351255949877935": ["convolution_gpu_bfyx_gemm_like",2], + "5581428998642936688": ["convolution_gpu_bfyx_1x1",2], + "13558687084677943158": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "4916769804113823482": ["convolution_gpu_bfyx_1x1",2], + "1390379098099686972": ["convolution_gpu_bfyx_1x1",2], + "1126499865206906037": ["convolution_gpu_bfyx_os_iyx_osv16",524], + "9580986168276580598": ["convolution_gpu_bfyx_gemm_like",1], + "1336940384521633733": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "4914474312076193952": ["convolution_gpu_bfyx_gemm_like",1], + "290134020607738418": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7181154048972884375": ["convolution_gpu_bfyx_gemm_like",2], + "9426665763007611385": ["convolution_gpu_bfyx_gemm_like",2], + "18142462471803295391": ["convolution_gpu_bfyx_1x1",1], + "16988275131627316108": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "7532088618116521936": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "6843617687528352801": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "2171768477223405739": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "14331658870024759698": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "17193614571243427089": ["convolution_gpu_bfyx_gemm_like",2], + "3177304125602972370": ["convolution_gpu_bfyx_direct_10_12_16",0], + "16071723603031305677": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "16000753982895054944": ["convolution_gpu_bfyx_gemm_like",1], + "16490405739040977260": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "12228610148087508521": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "216603198215625772": ["convolution_gpu_bfyx_direct_10_12_16",2], + 
"13247725847475539658": ["convolution_gpu_bfyx_1x1",2], + "4154403364889130045": ["convolution_gpu_bfyx_gemm_like",2], + "3039528482572243879": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "15728009639807698634": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "1900375942069325499": ["convolution_gpu_bfyx_1x1",2], + "12068974703657294908": ["convolution_gpu_bfyx_1x1",2], + "3219408878901707426": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1788455099959676873": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "3499106702307464480": ["convolution_gpu_bfyx_gemm_like",2], + "17444003685761357480": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "10019470094545733255": ["convolution_gpu_bfyx_gemm_like",2], + "15529757761327002288": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "13486084204140096478": ["convolution_gpu_bfyx_gemm_like",2], + "9195732599757736182": ["convolution_gpu_bfyx_os_iyx_osv16",137], + "3056212889689424946": ["convolution_gpu_bfyx_1x1",2], + "15968821946892330559": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "1704404203639481753": ["convolution_gpu_bfyx_gemm_like",2], + "12164298124869114517": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "6817494598328071314": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "706049518431331645": ["convolution_gpu_bfyx_gemm_like",2], + "15065019229949449623": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11626402549863483301": ["convolution_gpu_bfyx_os_iyx_osv16",648], + "17342198739672369885": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "18041177945345031826": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "10570285542015420072": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "11604111639041106489": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "7565867291827884997": ["convolution_gpu_bfyx_gemm_like",1], + "10893432143734884603": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "5118467701668427545": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "16218339663410630711": ["convolution_gpu_bfyx_gemm_like",2], + "7624476043779763605": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "13330734840729670622": ["convolution_gpu_bfyx_gemm_like",2], + "18235209540858013173": ["convolution_gpu_bfyx_1x1",2], + "6450532136308941035": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "2495655464941634884": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "1044978617045366709": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "18132952464279667664": ["convolution_gpu_bfyx_1x1",2], + "9737833587413114584": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4903592553439092472": ["convolution_gpu_bfyx_os_iyx_osv16",147], + "4815047491742617397": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "3622409603053918029": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15636128989267984459": ["convolution_gpu_bfyx_direct_10_12_16",1], + "273242667845386507": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "12577421746159122264": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "16475247464223458061": ["convolution_gpu_bfyx_gemm_like",2], + "12473600360154597915": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "8543619733732987550": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1074748462756364699": ["fully_connected_gpu_fb_oi_ref",1], + "15924583510704449214": ["convolution_gpu_bfyx_gemm_like",1], + "11597391933877736800": ["convolution_gpu_bfyx_gemm_like",2], + "6071668124835539929": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "1587501521145162454": ["convolution_gpu_bfyx_gemm_like",2], + "17759505449240263390": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "383721620126444793": 
["convolution_gpu_bfyx_gemm_like",2], + "7322472892320910654": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "182115051096556835": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "1920070013712913772": ["convolution_gpu_bfyx_os_iyx_osv16",574], + "4980217316169616839": ["convolution_gpu_bfyx_1x1",2], + "4126895998426674411": ["convolution_gpu_bfyx_gemm_like",2], + "2477849395789783501": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "2967481531952454828": ["convolution_gpu_bfyx_os_iyx_osv16",863], + "12514693341682532560": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "5303970743736042689": ["convolution_gpu_bfyx_gemm_like",2], + "3116068331849795558": ["convolution_gpu_bfyx_gemm_like",2], + "5898740235388207878": ["convolution_gpu_bfyx_1x1",2], + "10816637153861630723": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "2221145174704245189": ["convolution_gpu_bfyx_gemm_like",2], + "2530317332900569142": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "6084775920382972735": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "7880845322716481548": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "11942736969933408358": ["convolution_gpu_bfyx_gemm_like",2], + "2770397466252831892": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "677249604491773387": ["convolution_gpu_bfyx_gemm_like",2], + "17201365233492366678": ["convolution_gpu_bfyx_gemm_like",2], + "9316082753126682958": ["convolution_gpu_bfyx_gemm_like",2], + "11948858355027908365": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9205978149692979955": ["convolution_gpu_bfyx_gemm_like",2], + "2826762745628486040": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "7603872175048237237": ["convolution_gpu_bfyx_1x1",2], + "12024143207855886580": ["convolution_gpu_bfyx_os_iyx_osv16",1040], + "8170998059688907013": ["convolution_gpu_bfyx_1x1",2], + "14263790627243107300": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14424566003632608852": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10865695385270390803": ["convolution_gpu_bfyx_os_iyx_osv16",642], + "14799579913711096584": ["convolution_gpu_bfyx_gemm_like",1], + "14999920879568237166": ["convolution_gpu_bfyx_1x1",2], + "4678607855896512523": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15193403354218116460": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17951403431757222177": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "1458615259705605525": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "10912495395422146386": ["convolution_gpu_bfyx_gemm_like",2], + "13144385730409574259": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "9277176009071334860": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "8303211644727914658": ["convolution_gpu_bfyx_1x1",2], + "13124342334495538095": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "8108933468437926367": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "517997325935712670": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "8251544171504007740": ["convolution_gpu_bfyx_gemm_like",2], + "10645625090439446714": ["convolution_gpu_bfyx_gemm_like",2], + "5339985303398206057": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "4290840152278060614": ["convolution_gpu_bfyx_gemm_like",2], + "1351033666248868977": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "528295119724008711": ["convolution_gpu_bfyx_os_iyx_osv16",52], + "10899110544832584656": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11666250400445971335": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "7474639594232203854": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "14091610802555875119": ["convolution_gpu_bfyx_gemm_like",1], + 
"577844026691991089": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1563987925712579649": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "7581174843529024536": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "1309867416606346543": ["convolution_gpu_bfyx_os_iyx_osv16",195], + "8540111719936129376": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "17489680436564779197": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5963901433137582265": ["convolution_gpu_bfyx_gemm_like",2], + "12421204749289937399": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6143200133853000387": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "7060804814325505165": ["convolution_gpu_bfyx_gemm_like",2], + "1497560475414454618": ["convolution_gpu_bfyx_gemm_like",2], + "18008552719153887303": ["convolution_gpu_bfyx_os_iyx_osv16",7], + "938848188161536107": ["convolution_gpu_bfyx_1x1",2], + "10071449674652717890": ["convolution_gpu_bfyx_gemm_like",2], + "15486917753097743853": ["convolution_gpu_bfyx_1x1",2], + "1157947252370351851": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "1597770067928214597": ["convolution_gpu_bfyx_1x1",1], + "13611054146745413536": ["convolution_gpu_bfyx_gemm_like",2], + "11910735867274493498": ["convolution_gpu_bfyx_gemm_like",2], + "7084646429975006971": ["convolution_gpu_bfyx_1x1",2], + "1905758333157310570": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "4617809377006148936": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "4079026972040047969": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "11893541520830049036": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "7799984350284425885": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "9500850790449116723": ["convolution_gpu_bfyx_os_iyx_osv16",1036], + "7954972694876158422": ["convolution_gpu_bfyx_1x1",2], + "3860667078458481972": ["convolution_gpu_bfyx_gemm_like",1], + "1018687388655376483": ["convolution_gpu_bfyx_gemm_like",1], + "12319073009094248232": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "9803492989444302959": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17264010982688979937": ["convolution_gpu_bfyx_1x1",2], + "17037462814585846902": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "742689192890486807": ["convolution_gpu_bfyx_gemm_like",2], + "15106614232165315070": ["convolution_gpu_bfyx_gemm_like",2], + "15271783562528081169": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "16247399911710810038": ["convolution_gpu_bfyx_gemm_like",1], + "12604104383683210104": ["convolution_gpu_bfyx_os_iyx_osv16",216], + "16341722570340169855": ["convolution_gpu_bfyx_1x1",2], + "15154700439767512396": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "15770767768674603174": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "15589007878875898942": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "18302892230881285207": ["convolution_gpu_bfyx_gemm_like",1], + "9120377367517042357": ["convolution_gpu_bfyx_1x1",2], + "1752185056297124917": ["convolution_gpu_bfyx_1x1",2], + "3988024997010367546": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "16027456210394993913": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "15695415285791951018": ["convolution_gpu_bfyx_gemm_like",2], + "17764033613416389758": ["convolution_gpu_bfyx_gemm_like",2], + "12052207771201936228": ["convolution_gpu_bfyx_gemm_like",2], + "2188101366183302888": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9423854233835016530": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "7369903937189508744": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "7232326270078161768": ["convolution_gpu_bfyx_gemm_like",2], + "4378422094110940766": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "9222744127882324405": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "4278280309700908015": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "1617135706549276688": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "15914058104244750036": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "7700321970687976931": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "17010172246526353957": ["convolution_gpu_bfyx_1x1",2], + "9043982883185435219": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "3391032227732782982": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6131481289104111211": ["convolution_gpu_bfyx_gemm_like",2], + "3350601287664242323": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "14540578324750869319": ["convolution_gpu_bfyx_gemm_like",2], + "12522495848240087966": ["convolution_gpu_bfyx_gemm_like",1], + "4444730303823507621": ["convolution_gpu_bfyx_gemm_like",2], + "5550969016335082071": ["convolution_gpu_bfyx_gemm_like",1], + "4124478505694604763": ["convolution_gpu_bfyx_1x1",2], + "13121297281694293907": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "12564687330941036772": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "4141005390823981166": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "10702465758376061967": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "6370189612027110022": ["convolution_gpu_bfyx_gemm_like",2], + "7969441643457570812": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "1008476023750261156": ["convolution_gpu_bfyx_1x1",2], + "16616945998593626851": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "2283157145557154450": ["convolution_gpu_bfyx_1x1",2], + "13972357557211413688": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "13954821927253849036": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "5649150695527000655": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16182470664818268848": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15914107501176673997": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "3725013268198063198": ["convolution_gpu_bfyx_1x1",2], + "18393312550272875456": ["convolution_gpu_bfyx_1x1",2], + "8497468192424557348": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2581414750854621875": ["convolution_gpu_bfyx_os_iyx_osv16",559], + "16944335478353845609": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "14421898375873029115": ["convolution_gpu_bfyx_1x1",2], + "3106922888635965020": ["convolution_gpu_bfyx_gemm_like",2], + "3711525118850629466": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2728938624042183713": ["convolution_gpu_bfyx_gemm_like",2], + "17219920118109316867": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "14544219140091420262": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10782611933832492335": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "14956246091163580499": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "16014822406751503249": ["convolution_gpu_bfyx_os_iyx_osv16",691], + "11988285441493553006": ["convolution_gpu_bfyx_gemm_like",2], + "17214254645087272557": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "3429844423226609965": ["convolution_gpu_bfyx_gemm_like",2], + "13710319251108632115": ["convolution_gpu_bfyx_1x1",2], + "9480653639044390919": ["convolution_gpu_bfyx_os_iyx_osv16",344], + "10128120599276549920": ["convolution_gpu_bfyx_1x1",1], + "17084977396231597605": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5622089373755094139": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "9737565171095493297": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2891736961665476908": ["convolution_gpu_bfyx_os_iyx_osv16",805], 
+ "2783577080556699089": ["convolution_gpu_bfyx_gemm_like",1], + "14117801387057507639": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "1632416005093914709": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13092232276822302626": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "8002233052700666718": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "4531222427159927606": ["convolution_gpu_bfyx_gemm_like",2], + "8317673282128335201": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "4387041763614917736": ["convolution_gpu_bfyx_gemm_like",1], + "18427056032084727710": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "12165079289914715018": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "15231987838322151865": ["convolution_gpu_bfyx_1x1",2], + "4056979460327024961": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "4861982518177129729": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "7561096442572829049": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12672995204641007004": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "11175955260573469979": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "12512751736409465214": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9835739612255048978": ["convolution_gpu_bfyx_os_iyx_osv16",878], + "17021925795809437171": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10730222715353420212": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "12978370505631031751": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "13590444711975157776": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9809458159478958866": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "16393176054374397767": ["convolution_gpu_bfyx_gemm_like",1], + "4772696293208603817": ["convolution_gpu_bfyx_gemm_like",1], + "12741457056869452536": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14383657211047876136": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "1152691534728260611": ["convolution_gpu_bfyx_1x1",2], + "8257103926661643451": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "733956743303342862": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4492332228252010118": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "13526488884846845330": ["convolution_gpu_bfyx_gemm_like",2], + "6863331059471727622": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "597650904461183283": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5495063314176654751": ["convolution_gpu_bfyx_gemm_like",2], + "1359720957005310113": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "13512863534076172940": ["convolution_gpu_bfyx_gemm_like",2], + "7292351660229751817": ["convolution_gpu_bfyx_os_iyx_osv16",609], + "11052732052072367261": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "5308128387928804050": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "5364060938737428149": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "5922142661777925178": ["convolution_gpu_bfyx_gemm_like",1], + "16986358655784856534": ["convolution_gpu_bfyx_gemm_like",2], + "10141927023849730720": ["convolution_gpu_bfyx_1x1",1], + "2930898141522848681": ["convolution_gpu_bfyx_1x1",2], + "12755692101476964677": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "8931169575495985034": ["convolution_gpu_bfyx_gemm_like",2], + "16913004986170202203": ["convolution_gpu_bfyx_gemm_like",2], + "4625107584562815965": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "8873614802459592665": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "17634966178519099371": ["convolution_gpu_bfyx_1x1",2], + "2968439898708528834": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "17711453305763476458": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + 
"2863465257341735941": ["convolution_gpu_bfyx_1x1",1], + "17053671692908867872": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "13760645810144930270": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "14248239982355212178": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "10135458965276110244": ["convolution_gpu_bfyx_1x1",2], + "4056971751486746551": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "5941852872160795604": ["convolution_gpu_bfyx_gemm_like",2], + "13961773444580398856": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16666792471632326054": ["convolution_gpu_bfyx_gemm_like",2], + "16135569134646688251": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "7748233564411787605": ["convolution_gpu_bfyx_gemm_like",2], + "8159303545761286685": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "6340128090694375876": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "7243917162812988891": ["convolution_gpu_bfyx_gemm_like",2], + "2912098199463107173": ["convolution_gpu_bfyx_1x1",2], + "4773123925616969670": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11848462434662954749": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "9056038338958199256": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "9101903304994333336": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "11083993858285515074": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "9891428775774615719": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "7913076120244203725": ["convolution_gpu_bfyx_gemm_like",2], + "13553263424160050064": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "12757611260347801001": ["convolution_gpu_bfyx_os_iyx_osv16",1071], + "18436249934780056991": ["convolution_gpu_bfyx_os_iyx_osv16",296], + "2089730611490367290": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "721174714308243785": ["convolution_gpu_bfyx_gemm_like",2], + "13809330759308309353": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "5738835498104275267": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "15799159401545270696": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14558572801374416278": ["convolution_gpu_bfyx_gemm_like",1], + "6222595759158615206": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13369603621524676979": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "6307939332939714967": ["convolution_gpu_bfyx_1x1",2], + "4717620775314557374": ["convolution_gpu_bfyx_gemm_like",1], + "4754967381316623440": ["convolution_gpu_bfyx_gemm_like",2], + "5039037192630609823": ["convolution_gpu_bfyx_gemm_like",2], + "6428098122005804378": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "11069983292783104310": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "12917241193304093727": ["convolution_gpu_bfyx_gemm_like",2], + "2762489653422414995": ["convolution_gpu_bfyx_gemm_like",2], + "5010119207726811326": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "11031569203645035546": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "12992061224471212714": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14898892437285105327": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "2373860353284525265": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "883436333317162926": ["convolution_gpu_bfyx_1x1",2], + "15967614281807823696": ["convolution_gpu_bfyx_gemm_like",2], + "17975017633455909321": ["convolution_gpu_bfyx_os_iyx_osv16",717], + "438528596970898721": ["convolution_gpu_bfyx_gemm_like",1], + "9354818521586974021": ["convolution_gpu_bfyx_gemm_like",2], + "6708349666663292171": ["fully_connected_gpu_fb_oi_ref",2], + "2575631797904040925": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "5912303851874077576": 
["convolution_gpu_bfyx_os_iyx_osv16",1125], + "1984152634309440563": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "7454366978268164047": ["convolution_gpu_bfyx_gemm_like",2], + "16425665058951535484": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "7780140599533242850": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3501882025888946886": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "3265415000818832667": ["convolution_gpu_bfyx_gemm_like",2], + "75742659105146536": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "14050124896329573468": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8655315308767111198": ["convolution_gpu_bfyx_1x1",2], + "7603319690872333930": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "8761283252495354972": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "5965451243366505522": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5758133252959371492": ["convolution_gpu_bfyx_gemm_like",1], + "6623182990939010641": ["convolution_gpu_bfyx_gemm_like",1], + "2567046336192437734": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "4466647043226271996": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7818381040882768404": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "14025235562200209723": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "1398177377739338750": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14671212883301405408": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14668725050395069435": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "8614534946699754256": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "8700574100180128776": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "14483314305369207554": ["convolution_gpu_bfyx_1x1",2], + "16522364268583242080": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "15578456771467281881": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10178145641713631806": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "9488453013746383896": ["convolution_gpu_bfyx_gemm_like",2], + "394778201589371681": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "10256831975351722184": ["convolution_gpu_bfyx_gemm_like",1], + "14054116974002669018": ["convolution_gpu_bfyx_1x1",2], + "14458851250685872417": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "6577505360421510286": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "14151747022287993729": ["convolution_gpu_bfyx_gemm_like",2], + "6300691162962736560": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "3779229442395464456": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10765280349477640969": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "6254161707168091438": ["convolution_gpu_bfyx_gemm_like",2], + "1760391741350091665": ["convolution_gpu_bfyx_gemm_like",2], + "12353956380178079089": ["convolution_gpu_bfyx_gemm_like",2], + "11104393974242049153": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13038533272699602337": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "4091702228990140696": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4252157815622916471": ["convolution_gpu_bfyx_1x1",2], + "9918371346247634545": ["convolution_gpu_bfyx_os_iyx_osv16",184], + "5941298590926032148": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5156033406916344703": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12068797674575015662": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "13815395589135469450": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "16781127329510211966": ["convolution_gpu_bfyx_gemm_like",2], + "4848143712599565301": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11845189428639322474": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "4021558014531645922": 
["convolution_gpu_bfyx_os_iyx_osv16",728], + "12900949103593247293": ["convolution_gpu_bfyx_direct_10_12_16",0], + "7624259732952222597": ["convolution_gpu_bfyx_gemm_like",2], + "18221867262301937903": ["convolution_gpu_bfyx_1x1",1], + "16120988958246503683": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "3239033622277917802": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "4356817283284529593": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16871004845988227014": ["convolution_gpu_bfyx_1x1",2], + "2129726780118554358": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "2066731703492755469": ["convolution_gpu_bfyx_os_iyx_osv16",692], + "15082818876354718849": ["convolution_gpu_bfyx_os_iyx_osv16",650], + "3503893875515897267": ["convolution_gpu_bfyx_gemm_like",2], + "775538461106687677": ["fully_connected_gpu_fb_oi_ref",1], + "4429109491655891299": ["convolution_gpu_bfyx_gemm_like",1], + "14412158605670555579": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "7338229552985076723": ["convolution_gpu_bfyx_gemm_like",2], + "4285475880886685878": ["convolution_gpu_bfyx_gemm_like",2], + "1963081583851864291": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9062774198518904260": ["convolution_gpu_bfyx_gemm_like",2], + "17015328096102652908": ["convolution_gpu_bfyx_gemm_like",1], + "11834361584875491425": ["convolution_gpu_bfyx_1x1",1], + "17647962002015093887": ["convolution_gpu_bfyx_gemm_like",2], + "708747442142592697": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "2321767794934000238": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "16911464046178654033": ["convolution_gpu_bfyx_1x1",2], + "18184621367843960190": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "4890043345392707202": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "17726079670612220433": ["convolution_gpu_bfyx_gemm_like",2], + "4769003637955328938": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13251091004269229867": ["convolution_gpu_bfyx_gemm_like",2], + "16772854836230971016": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "11759426200341586247": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "11756650366229979428": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8409488188696700816": ["convolution_gpu_bfyx_gemm_like",1], + "726985753660756762": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "6334639534663495263": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "13585163747565192884": ["convolution_gpu_bfyx_gemm_like",2], + "951747146164097188": ["convolution_gpu_bfyx_1x1",2], + "17224104246148265328": ["convolution_gpu_bfyx_gemm_like",2], + "15746620724134970969": ["convolution_gpu_bfyx_1x1",1], + "15489746763312425915": ["convolution_gpu_bfyx_gemm_like",2], + "14795618530175274538": ["convolution_gpu_bfyx_os_iyx_osv16",483], + "11637325834858582585": ["convolution_gpu_bfyx_gemm_like",2], + "2964705957088952872": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "1626430741965136732": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11277866878590984477": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "946479876892100082": ["convolution_gpu_bfyx_gemm_like",1], + "13970935346154374605": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "10016815108730511683": ["convolution_gpu_bfyx_gemm_like",1], + "7107677063657303327": ["convolution_gpu_bfyx_1x1",2], + "4734389463002799056": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "16567638487719493784": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "16395067736440127496": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "14031009077471784948": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6650607472019166205": 
["convolution_gpu_bfyx_1x1",2], + "12348602762263193288": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "17906607354577138153": ["convolution_gpu_bfyx_os_iyx_osv16",524], + "338716975932676215": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "9378269524012289175": ["convolution_gpu_bfyx_gemm_like",2], + "7075659071934895087": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5145853681977610916": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "18135307303959376082": ["convolution_gpu_bfyx_gemm_like",2], + "738850098651678143": ["convolution_gpu_bfyx_os_iyx_osv16",542], + "9954050478761346921": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "6484375582324852109": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "7683334381958571864": ["convolution_gpu_bfyx_gemm_like",2], + "6294240435687565243": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "11657946392097042544": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "6195916781434462809": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7208008921815475393": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7647236080048602591": ["convolution_gpu_bfyx_gemm_like",1], + "12625112690264223217": ["convolution_gpu_bfyx_gemm_like",2], + "6669808855737023569": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "12794369485239257709": ["convolution_gpu_bfyx_gemm_like",2], + "12590922530749026871": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "11337525286386930242": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "15528692642731712121": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "6733731409232284409": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "15980348884716629349": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11499219760597131534": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "12868739680413736657": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "9928406318940388716": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "15555083739490354527": ["convolution_gpu_bfyx_gemm_like",2], + "13960388312976163971": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5352061583962489055": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11110173861174257158": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15059549186302099880": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "2598267743388306204": ["convolution_gpu_bfyx_gemm_like",2], + "8479958930889587809": ["fully_connected_gpu_yxfb_ref",1], + "13835908664998757647": ["fully_connected_gpu_fb_oi_ref",1], + "11583985978586657985": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "12801481303602178879": ["convolution_gpu_bfyx_gemm_like",2], + "10722782762733112118": ["convolution_gpu_bfyx_1x1",2], + "3336076058264596420": ["convolution_gpu_bfyx_gemm_like",2], + "12866217660635921034": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1192279884248226739": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "8857763129101380288": ["convolution_gpu_bfyx_gemm_like",2], + "5479761740065152589": ["convolution_gpu_bfyx_gemm_like",2], + "7056030150365552588": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "597073780328219388": ["convolution_gpu_bfyx_gemm_like",2], + "17490188677223978661": ["convolution_gpu_bfyx_gemm_like",2], + "15003778740401601065": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9101334153142718004": ["convolution_gpu_bfyx_gemm_like",2], + "11799179287124317845": ["convolution_gpu_bfyx_gemm_like",1], + "10787747981914307179": ["convolution_gpu_bfyx_1x1",2], + "12071914115316550349": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "4423866541063606768": ["convolution_gpu_bfyx_os_iyx_osv16",949], + "5770286476124511234": 
["convolution_gpu_bfyx_gemm_like",1], + "4190912926126844643": ["convolution_gpu_bfyx_1x1",2], + "7995820969034996638": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "3833510944499257797": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "16910952799476896905": ["convolution_gpu_bfyx_gemm_like",2], + "4958835037528182801": ["convolution_gpu_bfyx_1x1",1], + "14502856487639608696": ["convolution_gpu_bfyx_gemm_like",2], + "15161053469199826008": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14973431782875808802": ["convolution_gpu_bfyx_gemm_like",2], + "9485825829394109934": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15235409162483701027": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "3286330985102373533": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "6403698142681887543": ["convolution_gpu_bfyx_gemm_like",2], + "18417288692814472127": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "9250410390663336388": ["convolution_gpu_bfyx_gemm_like",1], + "1518270620354036926": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "16511749893955141055": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "11158789938857558596": ["convolution_gpu_bfyx_1x1",2], + "1972879521448306536": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "2683304757433993300": ["convolution_gpu_bfyx_gemm_like",2], + "1680468564927032670": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11025471731438443683": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "16266491618150971928": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "14444475853714164129": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1497127399271219422": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14206076551739831333": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11634932044447867039": ["convolution_gpu_bfyx_gemm_like",2], + "3870539490799697188": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "8794896449397768269": ["convolution_gpu_bfyx_gemm_like",2], + "16698547937652264447": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "12181889163404078773": ["convolution_gpu_bfyx_os_iyx_osv16",1042], + "9714508918051740792": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9452470718398027950": ["convolution_gpu_bfyx_os_iyx_osv16",640], + "10751536136794650334": ["convolution_gpu_bfyx_gemm_like",2], + "11560634267092054110": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "17446505012657609153": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "1470933384474984858": ["convolution_gpu_bfyx_1x1",2], + "11669828823444745889": ["convolution_gpu_bfyx_gemm_like",2], + "18035673326929466074": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14985236276429954162": ["convolution_gpu_bfyx_gemm_like",2], + "3662747857062156477": ["convolution_gpu_bfyx_gemm_like",2], + "11797601971796699898": ["convolution_gpu_bfyx_gemm_like",2], + "4544242784357021697": ["convolution_gpu_bfyx_gemm_like",2], + "12061567381160185735": ["convolution_gpu_bfyx_1x1",1], + "9277610800970567810": ["convolution_gpu_bfyx_gemm_like",2], + "13468081302022888489": ["convolution_gpu_bfyx_gemm_like",2], + "17442105631503326136": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "14811603003184578943": ["convolution_gpu_bfyx_gemm_like",2], + "1082586642383386489": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "12207503176295152756": ["convolution_gpu_bfyx_1x1",2], + "14079654309452583394": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2294318010381635693": ["convolution_gpu_bfyx_gemm_like",2], + "8609939102588915855": ["convolution_gpu_bfyx_gemm_like",2], + "16958329690837977102": ["convolution_gpu_bfyx_gemm_like",2], + 
"17907223570737272640": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "10425889533411573166": ["convolution_gpu_bfyx_gemm_like",2], + "4228437925117070319": ["convolution_gpu_bfyx_1x1",2], + "18140951659547259039": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "5211831143687501130": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "8567667881970262923": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "3118602494449249177": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "1082574490068006980": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "13646974121952099172": ["convolution_gpu_bfyx_gemm_like",2], + "8787438180071123604": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4933831571091731212": ["convolution_gpu_bfyx_direct_10_12_16",2], + "84595904778810418": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "4137755981477177003": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "2242829490403202087": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5558136691773431495": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "15109847707903824859": ["convolution_gpu_bfyx_1x1",2], + "16579057939215877904": ["convolution_gpu_bfyx_gemm_like",2], + "17109520309574369561": ["convolution_gpu_bfyx_gemm_like",2], + "10991423760161409883": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "14349625788399542568": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1822096761703761792": ["convolution_gpu_bfyx_1x1",2], + "7333511810266504718": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "4168273493370024327": ["convolution_gpu_bfyx_1x1",1], + "4346591404756288097": ["convolution_gpu_bfyx_gemm_like",2], + "11936530628363072904": ["convolution_gpu_bfyx_gemm_like",1], + "11241838709529552265": ["convolution_gpu_bfyx_os_iyx_osv16",858], + "181006047500375768": ["convolution_gpu_bfyx_os_iyx_osv16",281], + "3220280315905987373": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "11587239927319376658": ["convolution_gpu_bfyx_gemm_like",2], + "7755177205197405275": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "85050336704401597": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4450409744922989123": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "18154019240019929225": ["convolution_gpu_bfyx_gemm_like",1], + "2128612971571865547": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "18218631037214746168": ["convolution_gpu_bfyx_gemm_like",2], + "17961702508543961900": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "1202292109713947702": ["convolution_gpu_bfyx_gemm_like",2], + "487214150851213303": ["convolution_gpu_bfyx_gemm_like",1], + "586947787345351152": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17225552472711821360": ["convolution_gpu_bfyx_os_iyx_osv16",946], + "7800015766976654402": ["convolution_gpu_bfyx_gemm_like",2], + "9726913113016874092": ["convolution_gpu_bfyx_gemm_like",2], + "7132328255408635227": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15961487889420208188": ["convolution_gpu_bfyx_gemm_like",2], + "12700372241799686527": ["convolution_gpu_bfyx_gemm_like",1], + "6362428985273506890": ["convolution_gpu_bfyx_1x1",2], + "5940337324384948573": ["convolution_gpu_bfyx_gemm_like",2], + "16293101831324587788": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "9524303276541517389": ["convolution_gpu_bfyx_gemm_like",2], + "970768445746568749": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "13814086981499638596": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "15757308772667178999": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "1120455113299469776": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "4403753181729432604": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "7113777272518482528": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "15078168059698267650": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3106911159524421371": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "3477539135137665170": ["convolution_gpu_bfyx_gemm_like",2], + "1425953627379976115": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4313392430539923574": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "8040001390872143271": ["convolution_gpu_bfyx_gemm_like",2], + "7977195117668583981": ["convolution_gpu_bfyx_gemm_like",2], + "15428591250165788477": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "11055049031355432623": ["convolution_gpu_bfyx_gemm_like",2], + "14912119584313592912": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "3341302541468955849": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17310409067211414565": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "11768117585574496387": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "14805540705424073865": ["convolution_gpu_bfyx_gemm_like",2], + "11939914680143672459": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "15879172437519876393": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9700808806849459216": ["convolution_gpu_bfyx_1x1",2], + "3737576893817599311": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "12802517759474139810": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "3806761527342944195": ["convolution_gpu_bfyx_gemm_like",2], + "939718260623752240": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "952318454591754214": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "16441830491664937048": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "15643135666029727865": ["convolution_gpu_bfyx_gemm_like",2], + "13503555814874045782": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "11450378244355788918": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "5740745357953479527": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "12370729327673204804": ["convolution_gpu_bfyx_gemm_like",2], + "15192230303376521834": ["convolution_gpu_bfyx_os_iyx_osv16",863], + "998876398773540321": ["convolution_gpu_bfyx_1x1",1], + "14532519639619315651": ["convolution_gpu_bfyx_gemm_like",2], + "938222258370511187": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "12776081190690731910": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "13609660900720370993": ["convolution_gpu_bfyx_1x1",1], + "12031180482028822765": ["convolution_gpu_bfyx_gemm_like",0], + "13735180250757239202": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "16238415425814188039": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "10572945270796129630": ["fully_connected_gpu_fb_io_ref",1], + "15602218079503030465": ["convolution_gpu_bfyx_gemm_like",2], + "9999425239167488495": ["convolution_gpu_bfyx_gemm_like",1], + "1040650352205493707": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "17318287523550546026": ["convolution_gpu_bfyx_gemm_like",2], + "15829095120243431195": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "8690196189594920365": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "18255227391100087860": ["convolution_gpu_bfyx_1x1",2], + "4465701487417893814": ["convolution_gpu_bfyx_gemm_like",2], + "11825293922127550847": ["convolution_gpu_bfyx_gemm_like",2], + "3036512701943687724": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "14122213471825630433": ["convolution_gpu_bfyx_gemm_like",2], + "9999425239167488495": ["convolution_gpu_bfyx_gemm_like",1], + "16511393582666965704": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "11856815095538913065": 
["convolution_gpu_yxfb_yxio_b16",2], + "18209930746627816139": ["convolution_gpu_yxfb_yxio_b16",2], + "7780336054545552428": ["convolution_gpu_yxfb_yxio_b16",2], + "7201521533301617290": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12417253210787537988": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9947449295659685973": ["convolution_gpu_bfyx_gemm_like",1], + "3211956138512889433": ["convolution_gpu_yxfb_yxio_b16",1], + "6944031900067948180": ["convolution_gpu_yxfb_yxio_b16",0], + "5449117614287394433": ["convolution_gpu_yxfb_yxio_b16",2], + "87031578643428011": ["convolution_gpu_bfyx_1x1",2], + "4833749391314748606": ["convolution_gpu_yxfb_yxio_b16",2], + "8450272092307894299": ["convolution_gpu_yxfb_yxio_b16",2], + "15209909241815414156": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "13325762052023866627": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "3409043224171087168": ["convolution_gpu_bfyx_os_iyx_osv16",640], + "15773157615731010456": ["convolution_gpu_bfyx_gemm_like",2], + "6666210546769702280": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "11254744277059719812": ["convolution_gpu_yxfb_yxio_b16",1], + "10309586646776223605": ["convolution_gpu_yxfb_yxio_b16",2], + "9530116228032101908": ["convolution_gpu_bfyx_1x1",1], + "12151068022697708126": ["convolution_gpu_bfyx_gemm_like",2], + "6464050901421037006": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12550985938092975889": ["convolution_gpu_bfyx_1x1",2], + "3680396164645753224": ["convolution_gpu_yxfb_yxio_b16",0], + "6403698142681887543": ["convolution_gpu_bfyx_gemm_like",2], + "12351866693978844266": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "15705908639736679687": ["convolution_gpu_yxfb_yxio_b16",2], + "15281554100135159550": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16238415425814188039": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "7226002258982605405": ["convolution_gpu_yxfb_yxio_b16",2], + "12397280593466519809": ["convolution_gpu_bfyx_gemm_like",2], + "2527018855890902975": ["convolution_gpu_bfyx_gemm_like",2], + "8555049634736330391": ["convolution_gpu_yxfb_yxio_b16",2], + "3107655421406621915": ["convolution_gpu_yxfb_yxio_b16",1], + "14754849694687093032": ["convolution_gpu_yxfb_yxio_b16",2], + "15065019229949449623": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17036482252028102703": ["convolution_gpu_bfyx_os_iyx_osv16",53], + "14304497513584420080": ["convolution_gpu_yxfb_yxio_b16",2], + "11451740938287179908": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "7498614018449036163": ["convolution_gpu_bfyx_os_iyx_osv16",48], + "12221101678609734421": ["convolution_gpu_yxfb_yxio_b16",2], + "560996739186313493": ["convolution_gpu_yxfb_yxio_b16",2], + "16542318967217020315": ["convolution_gpu_bfyx_gemm_like",2], + "17975017633455909321": ["convolution_gpu_bfyx_os_iyx_osv16",717], + "15161053469199826008": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "1223196405651730260": ["convolution_gpu_yxfb_yxio_b16",2], + "7889602687414497280": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13160712904661288567": ["convolution_gpu_bfyx_1x1",1], + "17178308105985812083": ["convolution_gpu_yxfb_yxio_b16",2], + "17742192339816511494": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "13082313288887957490": ["convolution_gpu_yxfb_yxio_b16",2], + "1972879521448306536": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "2967481531952454828": ["convolution_gpu_bfyx_os_iyx_osv16",863], + "11330591026581463934": ["convolution_gpu_bfyx_gemm_like",2], + "6318214731544748245": ["convolution_gpu_bfyx_gemm_like",2], + 
"5899560521070338192": ["convolution_gpu_yxfb_yxio_b16",1], + "1351033666248868977": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "17192352762166764393": ["convolution_gpu_yxfb_yxio_b16",2], + "2487679091192300910": ["convolution_gpu_yxfb_yxio_b16",2], + "14126906427006602775": ["convolution_gpu_bfyx_1x1",2], + "13787436604877398090": ["convolution_gpu_bfyx_os_iyx_osv16",1032], + "15504618703544589723": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7274179284676568361": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "12867038076564517306": ["convolution_gpu_yxfb_yxio_b16",2], + "11361202190524990711": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "6370189612027110022": ["convolution_gpu_bfyx_gemm_like",2], + "4903592553439092472": ["convolution_gpu_bfyx_os_iyx_osv16",147], + "15231987838322151865": ["convolution_gpu_bfyx_1x1",2], + "15897300973213364823": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "2412846055735335136": ["convolution_gpu_bfyx_os_iyx_osv16",806], + "8339704352841356825": ["convolution_gpu_yxfb_yxio_b16",1], + "14667793472412360981": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16312739695844838884": ["convolution_gpu_yxfb_yxio_b16",2], + "11861634536583463947": ["convolution_gpu_bfyx_os_iyx_osv16",51], + "13425251102263428554": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "216603198215625772": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12714814165247623529": ["convolution_gpu_yxfb_yxio_b16",2], + "9131183544020825260": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "17948637243158994878": ["convolution_gpu_bfyx_gemm_like",2], + "3463959257726925426": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "16883372966656079608": ["convolution_gpu_yxfb_yxio_b16",1], + "5274929595362413625": ["convolution_gpu_yxfb_yxio_b16",2], + "2242829490403202087": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6832967250168141428": ["convolution_gpu_yxfb_yxio_b16",1], + "13454265023861566476": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "4242173940230902960": ["convolution_gpu_yxfb_yxio_b16",2], + "3699344686791530101": ["convolution_gpu_bfyx_gemm_like",2], + "11342135956789192833": ["convolution_gpu_bfyx_os_iyx_osv16",1098], + "10850369799801518638": ["convolution_gpu_yxfb_yxio_b16",2], + "4283886984540574108": ["convolution_gpu_yxfb_yxio_b16",1], + "12308956927236847009": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "1933147648540963732": ["convolution_gpu_yxfb_yxio_b16",2], + "17015328096102652908": ["convolution_gpu_bfyx_gemm_like",1], + "10747988576436391912": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "6784853321527374515": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "9363988379673156863": ["convolution_gpu_yxfb_yxio_b16",2], + "13170441257780067955": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "18122858611264877646": ["convolution_gpu_bfyx_gemm_like",2], + "6727930402459775131": ["convolution_gpu_bfyx_gemm_like",2], + "15726902746983125797": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "5328004363712610999": ["convolution_gpu_yxfb_yxio_b16",1], + "10429104188258277773": ["convolution_gpu_yxfb_yxio_b16",2], + "12467673564660108244": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "12725675221990905186": ["convolution_gpu_bfyx_gemm_like",2], + "5538883245745495145": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "13883044928774243663": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "13474805373264874144": ["convolution_gpu_bfyx_1x1",2], + "2048528188026477374": ["convolution_gpu_yxfb_yxio_b16",2], + "3797986765970777456": ["convolution_gpu_yxfb_yxio_b16",2], + 
"2148877522799179369": ["convolution_gpu_yxfb_yxio_b16",2], + "14085753024976995311": ["convolution_gpu_yxfb_yxio_b16",2], + "18431306649860116380": ["convolution_gpu_bfyx_gemm_like",2], + "15800554162607246964": ["convolution_gpu_bfyx_gemm_like",1], + "17258128299721452811": ["convolution_gpu_yxfb_yxio_b16",2], + "6214624887470295152": ["convolution_gpu_bfyx_1x1",1], + "13842149852156451845": ["convolution_gpu_yxfb_yxio_b16",2], + "7482459536338668149": ["convolution_gpu_yxfb_yxio_b16",2], + "1786821683911142459": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "12348602762263193288": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "5436553435132026991": ["convolution_gpu_yxfb_yxio_b16",2], + "14677968346503677769": ["convolution_gpu_yxfb_yxio_b16",2], + "8655315308767111198": ["convolution_gpu_bfyx_1x1",2], + "932195814187889636": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "13106818352216009354": ["convolution_gpu_bfyx_gemm_like",2], + "12933253554354951910": ["convolution_gpu_bfyx_gemm_like",2], + "2945245652128285151": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "7330202944390548890": ["convolution_gpu_bfyx_gemm_like",1], + "1697260854781788314": ["convolution_gpu_yxfb_yxio_b16",2], + "10816702874143297564": ["convolution_gpu_yxfb_yxio_b16",2], + "2884499360870038648": ["convolution_gpu_yxfb_yxio_b16",2], + "16588325081458426169": ["convolution_gpu_bfyx_gemm_like",2], + "6400660469217490279": ["convolution_gpu_yxfb_yxio_b16",2], + "16293465561256937726": ["convolution_gpu_bfyx_os_iyx_osv16",665], + "14165325329016075285": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "16912738776771289379": ["convolution_gpu_yxfb_yxio_b16",2], + "13702254392810961772": ["convolution_gpu_yxfb_yxio_b16",2], + "7333511810266504718": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "5276029719268937229": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "7822463130304602936": ["convolution_gpu_yxfb_yxio_b16",2], + "8065866013404161366": ["convolution_gpu_yxfb_yxio_b16",2], + "5509395737020858006": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "968092788032627444": ["convolution_gpu_yxfb_yxio_b16",2], + "4617809377006148936": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "17922279129043570176": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "2531597468539205600": ["convolution_gpu_yxfb_yxio_b16",2], + "3287181725010492879": ["convolution_gpu_yxfb_yxio_b16",2], + "8577875628223148806": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "7407975398526425554": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "6764038061921866053": ["convolution_gpu_yxfb_yxio_b16",2], + "3061372669831947873": ["convolution_gpu_yxfb_yxio_b16",2], + "15604634351310647589": ["convolution_gpu_yxfb_yxio_b16",2], + "10803929517111130153": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "5040095338370816349": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "3176785355296130660": ["convolution_gpu_bfyx_gemm_like",2], + "12810833895438895155": ["convolution_gpu_yxfb_yxio_b16",2], + "4154403364889130045": ["convolution_gpu_bfyx_gemm_like",2], + "14034402827496819479": ["convolution_gpu_bfyx_gemm_like",2], + "16794854619854992714": ["convolution_gpu_yxfb_yxio_b16",1], + "84595904778810418": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "905526102343710614": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "7463517383354309469": ["convolution_gpu_bfyx_gemm_like",0], + "6870942166356599956": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "4135975804549022456": ["convolution_gpu_yxfb_yxio_b16",2], + "6791806088355877039": ["convolution_gpu_bfyx_gemm_like",1], + 
"17990326690659802090": ["convolution_gpu_yxfb_yxio_b16",2], + "10883992248631603006": ["convolution_gpu_bfyx_os_iyx_osv16",483], + "2328919599530851492": ["convolution_gpu_yxfb_yxio_b16",2], + "3602929955785812025": ["convolution_gpu_yxfb_yxio_b16",2], + "10100171358681249181": ["convolution_gpu_yxfb_yxio_b16",2], + "11184290482439221741": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "1040650352205493707": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "16081386644309102158": ["convolution_gpu_bfyx_gemm_like",2], + "9433162648796382333": ["convolution_gpu_yxfb_yxio_b16",2], + "9542325095876448686": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3621424752591567930": ["convolution_gpu_bfyx_gemm_like",2], + "3117175697326325371": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "17025182465337728023": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "1641111108888949123": ["convolution_gpu_yxfb_yxio_b16",2], + "15871357525719630224": ["convolution_gpu_bfyx_1x1",1], + "10598099730944525581": ["fully_connected_gpu_fb_io_b8_f8_vload",1], + "5099947445888268507": ["convolution_gpu_yxfb_yxio_b16",2], + "15958886009743157242": ["convolution_gpu_bfyx_gemm_like",2], + "5551484040302194648": ["convolution_gpu_yxfb_yxio_b16",2], + "2014114949154914483": ["convolution_gpu_yxfb_yxio_b16",2], + "10682300249493137042": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "2095245727814188300": ["convolution_gpu_bfyx_gemm_like",2], + "1692473411043262397": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "11563892089503603030": ["convolution_gpu_yxfb_yxio_b16",2], + "11164519756679631743": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17789969008677638142": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1556975727728498645": ["convolution_gpu_yxfb_yxio_b16",2], + "5593329151028712439": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "3116068331849795558": ["convolution_gpu_bfyx_gemm_like",2], + "1963081583851864291": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12308895602001600327": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "741727668385951462": ["convolution_gpu_yxfb_yxio_b16",2], + "5884951148427535208": ["convolution_gpu_yxfb_yxio_b16",2], + "8976966933427522253": ["convolution_gpu_bfyx_gemm_like",2], + "6318228858846223186": ["convolution_gpu_bfyx_1x1",1], + "15486917753097743853": ["convolution_gpu_bfyx_1x1",2], + "18094205332383644037": ["convolution_gpu_bfyx_os_iyx_osv16",179], + "7084646429975006971": ["convolution_gpu_bfyx_1x1",2], + "16694984452720336415": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12177387334053203378": ["convolution_gpu_bfyx_gemm_like",2], + "10141558851476164734": ["convolution_gpu_yxfb_yxio_b16",2], + "2727175120437582536": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8390889357546397717": ["convolution_gpu_bfyx_1x1",1], + "16915857558806082023": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "17181874388601550941": ["convolution_gpu_yxfb_yxio_b16",2], + "8093401822846123153": ["convolution_gpu_yxfb_yxio_b16",2], + "8494725779002762049": ["convolution_gpu_bfyx_gemm_like",2], + "4165926748138587705": ["convolution_gpu_yxfb_yxio_b16",2], + "13856271274572142709": ["convolution_gpu_bfyx_gemm_like",1], + "7900926714874404219": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "5911282942658469852": ["convolution_gpu_bfyx_direct_10_12_16",2], + "762634810164167963": ["convolution_gpu_yxfb_yxio_b16",0], + "1154763947184432124": ["convolution_gpu_yxfb_yxio_b16",2], + "8686733586982652897": ["convolution_gpu_yxfb_yxio_b16",2], + "9065137335863605013": 
["convolution_gpu_yxfb_yxio_b16",2], + "7203620615363933078": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "12806934028210472719": ["convolution_gpu_bfyx_gemm_like",2], + "2459018025887933198": ["convolution_gpu_yxfb_yxio_b16",2], + "14128122558476128712": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "3141773224039276177": ["convolution_gpu_bfyx_1x1",2], + "18017913952946745878": ["convolution_gpu_bfyx_gemm_like",2], + "12864204111424196179": ["convolution_gpu_bfyx_1x1",2], + "4744578087509837185": ["convolution_gpu_yxfb_yxio_b16",0], + "8407012082034007985": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "5928392400230917930": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "6744692937598310090": ["convolution_gpu_yxfb_yxio_b16",2], + "16437093737761968743": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "959260710517842876": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "10865695385270390803": ["convolution_gpu_bfyx_os_iyx_osv16",642], + "14675165976583799157": ["convolution_gpu_yxfb_yxio_b16",2], + "8353259929933281349": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "3364467044587904559": ["convolution_gpu_yxfb_yxio_b16",2], + "3385797925880519845": ["convolution_gpu_bfyx_1x1",2], + "15924583510704449214": ["convolution_gpu_bfyx_gemm_like",1], + "3350601287664242323": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8168240543278779314": ["convolution_gpu_bfyx_1x1",1], + "4400247897123856252": ["convolution_gpu_bfyx_os_iyx_osv16",1040], + "11175353869874626110": ["convolution_gpu_yxfb_yxio_b16",2], + "9751582946441607796": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "760687670112194844": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "757225477250808939": ["convolution_gpu_yxfb_yxio_b16",2], + "16027456210394993913": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "16516262096533373158": ["convolution_gpu_yxfb_yxio_b16",2], + "16052741298509954954": ["convolution_gpu_yxfb_yxio_b16",2], + "9585113116232600562": ["convolution_gpu_bfyx_gemm_like",1], + "7008873036126556197": ["convolution_gpu_yxfb_yxio_b16",2], + "3286330985102373533": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4137738705782981426": ["convolution_gpu_bfyx_gemm_like",2], + "6484375582324852109": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "8954488655859677891": ["convolution_gpu_yxfb_yxio_b16",2], + "14058311587429063829": ["convolution_gpu_yxfb_yxio_b16",2], + "8952733400567254769": ["convolution_gpu_bfyx_gemm_like",2], + "3022939690177474442": ["convolution_gpu_yxfb_yxio_b16",1], + "7748514992101811029": ["convolution_gpu_yxfb_yxio_b16",1], + "4571404165794634411": ["convolution_gpu_bfyx_1x1",2], + "16084700435355748612": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "16788162879714733906": ["convolution_gpu_yxfb_yxio_b16",2], + "16862145184923128012": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "7923576965630818418": ["convolution_gpu_yxfb_yxio_b16",2], + "7650375560336513366": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "15641322340289892344": ["convolution_gpu_yxfb_yxio_b16",1], + "14956246091163580499": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "4959403414256988744": ["convolution_gpu_bfyx_gemm_like",1], + "12802517759474139810": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "4863644213728386734": ["convolution_gpu_yxfb_yxio_b16",2], + "12995903177757437362": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "16016396784190934729": ["convolution_gpu_yxfb_yxio_b16",2], + "14120569486714455490": ["convolution_gpu_yxfb_yxio_b16",2], + "2180039710632160943": ["convolution_gpu_yxfb_yxio_b16",1], + "3396731547696204011": 
["convolution_gpu_yxfb_yxio_b16",2], + "1427040855295681285": ["convolution_gpu_yxfb_yxio_b16",2], + "8059328623525062913": ["convolution_gpu_bfyx_gemm_like",2], + "10183537720515608": ["convolution_gpu_yxfb_yxio_b16",1], + "7425369489110576363": ["convolution_gpu_yxfb_yxio_b16",2], + "14830991971271385876": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "12024143207855886580": ["convolution_gpu_bfyx_os_iyx_osv16",1040], + "13957350536347764705": ["convolution_gpu_bfyx_gemm_like",2], + "11055049031355432623": ["convolution_gpu_bfyx_gemm_like",2], + "11768117585574496387": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "7972861956906521660": ["convolution_gpu_yxfb_yxio_b16",2], + "541817615957967731": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "7060804814325505165": ["convolution_gpu_bfyx_gemm_like",2], + "14366395926517590797": ["convolution_gpu_yxfb_yxio_b16",1], + "14447191095937730964": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "12818786388125465101": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9040145293899470160": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "16247799703932868151": ["convolution_gpu_yxfb_yxio_b16",2], + "17906607354577138153": ["convolution_gpu_bfyx_os_iyx_osv16",524], + "1336940384521633733": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "16582761411084080015": ["convolution_gpu_yxfb_yxio_b16",2], + "1157069349112113377": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16836088134347394854": ["convolution_gpu_yxfb_yxio_b16",2], + "14973431782875808802": ["convolution_gpu_bfyx_gemm_like",2], + "12421204749289937399": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9869959062341950047": ["convolution_gpu_bfyx_1x1",2], + "16800575429414554907": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "14699357144600604190": ["convolution_gpu_yxfb_yxio_b16",1], + "13767985623872409391": ["convolution_gpu_yxfb_yxio_b16",1], + "17386047378634216634": ["convolution_gpu_yxfb_yxio_b16",2], + "2245166025103475783": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "8193369947544085921": ["convolution_gpu_bfyx_gemm_like",2], + "14131851237755716991": ["convolution_gpu_bfyx_os_iyx_osv16",364], + "6284333183047854748": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3236003754884728510": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "10565371760124443824": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "4121535611334103359": ["convolution_gpu_yxfb_yxio_b16",2], + "4165019140664090799": ["convolution_gpu_yxfb_yxio_b16",2], + "15397084091361096354": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "1920042803083729276": ["convolution_gpu_bfyx_os_iyx_osv16",600], + "13387545865482261974": ["convolution_gpu_bfyx_os_iyx_osv16",805], + "12722153168975105360": ["convolution_gpu_yxfb_yxio_b16",2], + "150132162949295379": ["convolution_gpu_bfyx_1x1",2], + "15101834579076569231": ["convolution_gpu_yxfb_yxio_b16",2], + "17252449599613270108": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "12166852830214895457": ["convolution_gpu_bfyx_1x1",2], + "1944461047787586724": ["convolution_gpu_yxfb_yxio_b16",1], + "6875055157295709098": ["convolution_gpu_yxfb_yxio_b16",2], + "6887205509732544213": ["convolution_gpu_yxfb_yxio_b16",2], + "16738951239219589307": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "9378269524012289175": ["convolution_gpu_bfyx_gemm_like",2], + "16889886654893884746": ["convolution_gpu_bfyx_1x1",2], + "12700372241799686527": ["convolution_gpu_bfyx_gemm_like",1], + "12936220888307335332": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "3793265335909270748": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "11626398907755088688": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12567935463143860469": ["convolution_gpu_yxfb_yxio_b16",2], + "5155616842071169667": ["convolution_gpu_yxfb_yxio_b16",2], + "17381516856910544374": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "6942016672941874829": ["convolution_gpu_bfyx_gemm_like",2], + "7719954202744123391": ["convolution_gpu_bfyx_gemm_like",2], + "3316798708399098230": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "2789901295967374316": ["convolution_gpu_yxfb_yxio_b16",2], + "188830358699960789": ["convolution_gpu_yxfb_yxio_b16",2], + "8690196189594920365": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "10187930930336324253": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "9193880745263317167": ["convolution_gpu_bfyx_gemm_like",2], + "15012885932988454455": ["convolution_gpu_yxfb_yxio_b16",2], + "16384186388687043048": ["convolution_gpu_bfyx_os_iyx_osv16",549], + "12179968379663737450": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17096735128393723245": ["convolution_gpu_yxfb_yxio_b16",1], + "10424643336435622408": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "18426893729833771809": ["convolution_gpu_bfyx_1x1",2], + "2783577080556699089": ["convolution_gpu_bfyx_gemm_like",1], + "503369896500284129": ["convolution_gpu_bfyx_1x1",2], + "7432142107544210174": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "11975047184326016230": ["convolution_gpu_bfyx_gemm_like",2], + "18142462471803295391": ["convolution_gpu_bfyx_1x1",1], + "9955939178447682108": ["convolution_gpu_bfyx_1x1",2], + "16577611471466452776": ["convolution_gpu_bfyx_gemm_like",2], + "18080788888293706149": ["convolution_gpu_yxfb_yxio_b16",2], + "4674504221851042542": ["convolution_gpu_yxfb_yxio_b16",2], + "9401409770128851474": ["convolution_gpu_bfyx_gemm_like",0], + "14123081378489325832": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "9414927552739380436": ["convolution_gpu_yxfb_yxio_b16",1], + "11083993858285515074": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "4089043893927493060": ["convolution_gpu_yxfb_yxio_b16",2], + "15822546325822628634": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "17434141039341226796": ["convolution_gpu_yxfb_yxio_b16",2], + "2625969259447793593": ["convolution_gpu_bfyx_1x1",2], + "6109013751635776331": ["convolution_gpu_bfyx_gemm_like",2], + "597650904461183283": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16214394186337220006": ["convolution_gpu_yxfb_yxio_b16",2], + "2495655464941634884": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "4513063773753763458": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "16339187733937346919": ["convolution_gpu_yxfb_yxio_b16",2], + "15814015810740458605": ["convolution_gpu_bfyx_1x1",2], + "2314579504260247470": ["convolution_gpu_yxfb_yxio_b16",2], + "8092673566670222445": ["convolution_gpu_yxfb_yxio_b16",2], + "13302687772426736346": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "11893541520830049036": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "14686272582436109012": ["convolution_gpu_yxfb_yxio_b16",2], + "6934915634718835911": ["convolution_gpu_yxfb_yxio_b16",2], + "5293502980575652171": ["convolution_gpu_yxfb_yxio_b16",2], + "9190054801124577726": ["convolution_gpu_yxfb_yxio_b16",2], + "14268594692585922659": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "18279927175542031567": ["convolution_gpu_yxfb_yxio_b16",2], + "4755225554035527185": ["convolution_gpu_yxfb_yxio_b16",2], + "17806712457019493207": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + 
"9328585005923667676": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "9541996065561509160": ["convolution_gpu_yxfb_yxio_b16",2], + "3346891393420268502": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "16986358655784856534": ["convolution_gpu_bfyx_gemm_like",2], + "14216513246096503793": ["convolution_gpu_yxfb_yxio_b16",2], + "17446505012657609153": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "16434358667865869005": ["convolution_gpu_yxfb_yxio_b16",2], + "805131056816361237": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "6195916781434462809": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6548949901446632697": ["convolution_gpu_bfyx_1x1",2], + "7155796826953849982": ["convolution_gpu_yxfb_yxio_b16",2], + "11545529736818363243": ["convolution_gpu_yxfb_yxio_b16",2], + "10613621801998459768": ["convolution_gpu_yxfb_yxio_b16",2], + "42935035304560876": ["convolution_gpu_yxfb_yxio_b16",1], + "185782385623159958": ["convolution_gpu_bfyx_gemm_like",2], + "11418379777288974452": ["convolution_gpu_bfyx_gemm_like",2], + "75742659105146536": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "2542112741645712811": ["fully_connected_gpu_fb_io_b8_f8_vload",1], + "18082422341304348326": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "7322472892320910654": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "2714322766616035858": ["convolution_gpu_yxfb_yxio_b16",2], + "166437837813304707": ["convolution_gpu_yxfb_yxio_b16",2], + "5832851215142537445": ["convolution_gpu_yxfb_yxio_b16",2], + "14883438809987378616": ["convolution_gpu_bfyx_1x1",1], + "9120374653477510318": ["convolution_gpu_yxfb_yxio_b16",2], + "12510951219501865365": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "12536364199388193516": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "15457040168177954463": ["convolution_gpu_yxfb_yxio_b16",2], + "3019864917236424168": ["convolution_gpu_yxfb_yxio_b16",1], + "16974981142389546385": ["convolution_gpu_yxfb_yxio_b16",2], + "17167229341919111718": ["convolution_gpu_bfyx_gemm_like",2], + "14043064718932538557": ["convolution_gpu_yxfb_yxio_b16",2], + "9812438080378091263": ["convolution_gpu_yxfb_yxio_b16",2], + "14091610802555875119": ["convolution_gpu_bfyx_gemm_like",1], + "5995121118186531621": ["convolution_gpu_yxfb_yxio_b16",1], + "9177211394807412309": ["convolution_gpu_yxfb_yxio_b16",2], + "9144269202766996508": ["convolution_gpu_yxfb_yxio_b16",2], + "9803492989444302959": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18186612931984342471": ["convolution_gpu_yxfb_yxio_b16",2], + "11999246609107242706": ["convolution_gpu_bfyx_gemm_like",2], + "14811603003184578943": ["convolution_gpu_bfyx_gemm_like",2], + "16101625311127899143": ["convolution_gpu_bfyx_gemm_like",2], + "10514865654990433040": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "6931953332823066530": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "18120079746729314878": ["convolution_gpu_yxfb_yxio_b16",2], + "13006774775034887171": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "5774841809066688068": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "2797723586312707948": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "18435632962969462312": ["convolution_gpu_yxfb_yxio_b16",2], + "14799012895945855878": ["convolution_gpu_yxfb_yxio_b16",2], + "8095675456938934982": ["convolution_gpu_yxfb_yxio_b16",2], + "8561261337239934159": ["convolution_gpu_bfyx_direct_10_12_16",0], + "12887076860522920405": ["convolution_gpu_yxfb_yxio_b16",2], + "16548491024653039967": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "12275528180752359999": 
["convolution_gpu_yxfb_yxio_b16",2], + "15199604820473713622": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "4890043345392707202": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "3894130445933963911": ["convolution_gpu_yxfb_yxio_b16",2], + "10041205516209288381": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "7447163906170805189": ["convolution_gpu_bfyx_os_iyx_osv16",573], + "14884315147107686805": ["convolution_gpu_bfyx_gemm_like",1], + "8490260671996115530": ["convolution_gpu_bfyx_gemm_like",1], + "7667898603371717971": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10775271979871646995": ["convolution_gpu_yxfb_yxio_b16",2], + "2172121470071868949": ["convolution_gpu_bfyx_gemm_like",2], + "6143200133853000387": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "13804221028705631415": ["convolution_gpu_bfyx_gemm_like",2], + "15082818876354718849": ["convolution_gpu_bfyx_os_iyx_osv16",650], + "768820004084041271": ["convolution_gpu_yxfb_yxio_b16",2], + "16014822406751503249": ["convolution_gpu_bfyx_os_iyx_osv16",691], + "4889188980319017094": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "8161520217142313996": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8542782888102516498": ["convolution_gpu_yxfb_yxio_b16",2], + "967141158966448909": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7171904645566467208": ["convolution_gpu_bfyx_gemm_like",2], + "1634884284544380004": ["convolution_gpu_yxfb_yxio_b16",1], + "9939234037869927090": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "10226095100825845185": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "16271675466919087248": ["convolution_gpu_yxfb_yxio_b16",2], + "7839141505912665157": ["fully_connected_gpu_fb_oi_ref",1], + "12581879452540858313": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "3036512701943687724": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "13683623172740048376": ["convolution_gpu_bfyx_gemm_like",2], + "13991572769793610416": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "9954050478761346921": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "4450409744922989123": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "938222258370511187": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "2058172559199858297": ["convolution_gpu_bfyx_os_iyx_osv16",6], + "13776178598632392721": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "13357365044448426880": ["convolution_gpu_bfyx_1x1",2], + "7737977992444172757": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "16911464046178654033": ["convolution_gpu_bfyx_1x1",2], + "4769003637955328938": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1117836569328440439": ["convolution_gpu_yxfb_yxio_b16",2], + "338716975932676215": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "9399511839804500548": ["convolution_gpu_yxfb_yxio_b16",1], + "11002165738333323413": ["convolution_gpu_yxfb_yxio_b16",2], + "14230493618724018658": ["convolution_gpu_bfyx_gemm_like",2], + "7992077349568239994": ["convolution_gpu_yxfb_yxio_b16",2], + "14345755557418971954": ["convolution_gpu_yxfb_yxio_b16",2], + "7132328255408635227": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17830290099875088207": ["convolution_gpu_bfyx_gemm_like",2], + "7450417963648518926": ["convolution_gpu_bfyx_gemm_like",2], + "132437164570900392": ["convolution_gpu_yxfb_yxio_b16",2], + "8865700182878875593": ["convolution_gpu_yxfb_yxio_b16",2], + "466744273945239777": ["convolution_gpu_yxfb_yxio_b16",2], + "10785966734346479177": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "1117729599102132243": ["convolution_gpu_yxfb_yxio_b16",2], + "17025324057045572535": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "1398177377739338750": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8348997431940166878": ["convolution_gpu_yxfb_yxio_b16",2], + "15035800097152337587": ["convolution_gpu_bfyx_gemm_like",2], + "13586735166545634506": ["convolution_gpu_yxfb_yxio_b16",2], + "1914964404168211864": ["convolution_gpu_bfyx_gemm_like",2], + "10211403590176354415": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "276313536076170391": ["convolution_gpu_bfyx_gemm_like",2], + "15879172437519876393": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17318287523550546026": ["convolution_gpu_bfyx_gemm_like",2], + "15596408854298291433": ["convolution_gpu_yxfb_yxio_b16",2], + "13042938686374926241": ["convolution_gpu_yxfb_yxio_b16",2], + "11828175723996627443": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "7375461241315602473": ["convolution_gpu_bfyx_gemm_like",2], + "14346466672686303107": ["convolution_gpu_yxfb_yxio_b16",2], + "17788367809717898285": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "9456645866001656225": ["convolution_gpu_yxfb_yxio_b16",2], + "17970424536559595893": ["convolution_gpu_yxfb_yxio_b16",2], + "14805540705424073865": ["convolution_gpu_bfyx_gemm_like",2], + "7614673554809134631": ["convolution_gpu_yxfb_yxio_b16",2], + "7104309382120208659": ["convolution_gpu_bfyx_gemm_like",2], + "16851716501872033211": ["fully_connected_gpu_fb_io_block_fp16",1], + "15594673952484539994": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "6141193842171342687": ["convolution_gpu_yxfb_yxio_b16",2], + "818998169319147148": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13893808009363736870": ["convolution_gpu_bfyx_gemm_like",2], + "9091110033424983286": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "7894230717547658326": ["convolution_gpu_yxfb_yxio_b16",1], + "15522099459864628246": ["convolution_gpu_bfyx_gemm_like",2], + "2844794465598309010": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "17583785768334531086": ["convolution_gpu_yxfb_yxio_b16",2], + "11685571068419983048": ["convolution_gpu_bfyx_1x1",2], + "10016815108730511683": ["convolution_gpu_bfyx_gemm_like",1], + "1905758333157310570": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "14025678657541870252": ["convolution_gpu_yxfb_yxio_b16",2], + "8354579049246302728": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "18091349188280218186": ["convolution_gpu_yxfb_yxio_b16",2], + "15974208269240775349": ["convolution_gpu_yxfb_yxio_b16",1], + "11152334947349565403": ["convolution_gpu_yxfb_yxio_b16",2], + "4367991456894497706": ["convolution_gpu_bfyx_os_iyx_osv16",893], + "13833960927635646899": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13700014916680753395": ["convolution_gpu_bfyx_gemm_like",2], + "15070618248849566698": ["convolution_gpu_yxfb_yxio_b16",2], + "1518270620354036926": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "3118602494449249177": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "778476198101178556": ["convolution_gpu_bfyx_gemm_like",1], + "17219920118109316867": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "3220280315905987373": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "378801963103874857": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "18427056032084727710": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "1135062632388082485": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "16870110185980402237": ["convolution_gpu_yxfb_yxio_b16",2], + "10704906466618081803": ["convolution_gpu_yxfb_yxio_b16",2], + "8857763129101380288": ["convolution_gpu_bfyx_gemm_like",2], + "1775515808301276388": 
["convolution_gpu_yxfb_yxio_b16",2], + "14147460733160099960": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4824040283449153298": ["convolution_gpu_bfyx_os_iyx_osv16",726], + "5606914392662771013": ["convolution_gpu_yxfb_yxio_b16",2], + "17746215841755337461": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9232653317479846765": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "16617569629839911513": ["convolution_gpu_yxfb_yxio_b16",2], + "18059267466971880386": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "6942606834115081953": ["convolution_gpu_yxfb_yxio_b16",2], + "3737576893817599311": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "9616636708366808604": ["convolution_gpu_bfyx_gemm_like",2], + "7474639594232203854": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "9043982883185435219": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "4588117321438490483": ["convolution_gpu_yxfb_yxio_b16",2], + "4342446399224806160": ["convolution_gpu_yxfb_yxio_b16",2], + "2171768477223405739": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "11910735867274493498": ["convolution_gpu_bfyx_gemm_like",2], + "1281190653081960886": ["convolution_gpu_yxfb_yxio_b16",2], + "9099056013518879466": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "12686330321897091505": ["convolution_gpu_bfyx_gemm_like",2], + "8731079912830889828": ["convolution_gpu_yxfb_yxio_b16",2], + "15739756988784344130": ["convolution_gpu_yxfb_yxio_b16",2], + "4121109463284708890": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "6845814820599174031": ["convolution_gpu_bfyx_direct_10_12_16",0], + "2929715823970060874": ["convolution_gpu_bfyx_gemm_like",1], + "9463256538942644563": ["convolution_gpu_yxfb_yxio_b16",2], + "10425889533411573166": ["convolution_gpu_bfyx_gemm_like",2], + "9131235538209388787": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "3950738240651133849": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "13512059751838488458": ["convolution_gpu_yxfb_yxio_b16",2], + "10820312036555742020": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "6055793483770886264": ["convolution_gpu_yxfb_yxio_b16",2], + "1330337530094825121": ["convolution_gpu_yxfb_yxio_b16",2], + "6254141935545262078": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4914435717288687793": ["convolution_gpu_bfyx_1x1",1], + "234288286732396704": ["convolution_gpu_yxfb_yxio_b16",1], + "467070383257529689": ["convolution_gpu_yxfb_yxio_b16",2], + "6654167459904026563": ["convolution_gpu_yxfb_yxio_b16",2], + "10782611933832492335": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "10979362792894404338": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "11693134363909241514": ["convolution_gpu_yxfb_yxio_b16",2], + "1921500066107090648": ["convolution_gpu_yxfb_yxio_b16",2], + "14114380593731243715": ["convolution_gpu_bfyx_os_iyx_osv16",167], + "3117673619907511009": ["convolution_gpu_bfyx_os_iyx_osv16",487], + "6428098122005804378": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "17081449111821382308": ["convolution_gpu_yxfb_yxio_b16",1], + "17536308070854915513": ["convolution_gpu_bfyx_1x1",2], + "13123561937554734618": ["convolution_gpu_yxfb_yxio_b16",2], + "1290180607037086383": ["convolution_gpu_yxfb_yxio_b16",2], + "13011676362747785816": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6398819277350155011": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "13022797264172398260": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7548031489690889629": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "3986429358782189117": ["convolution_gpu_yxfb_yxio_b16",1], + "17490188677223978661": 
["convolution_gpu_bfyx_gemm_like",2], + "14244689429217411113": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4006884370026272807": ["convolution_gpu_bfyx_gemm_like",2], + "9101334153142718004": ["convolution_gpu_bfyx_gemm_like",2], + "15589007878875898942": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "6546440095044731932": ["convolution_gpu_yxfb_yxio_b16",2], + "4773123925616969670": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10747101719272611563": ["convolution_gpu_yxfb_yxio_b16",2], + "10019470094545733255": ["convolution_gpu_bfyx_gemm_like",2], + "16441830491664937048": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "13488495920546871271": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "2917999294360728537": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3948843501884284998": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13124659308711651699": ["convolution_gpu_bfyx_gemm_like",2], + "2119566651547512543": ["convolution_gpu_yxfb_yxio_b16",1], + "1318571118468536310": ["convolution_gpu_bfyx_gemm_like",2], + "9545968464906009869": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "4678607855896512523": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14458851250685872417": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "11455055202624479980": ["convolution_gpu_yxfb_yxio_b16",2], + "5285172225938230524": ["convolution_gpu_yxfb_yxio_b16",2], + "11669828823444745889": ["convolution_gpu_bfyx_gemm_like",2], + "12184235281888559274": ["convolution_gpu_yxfb_yxio_b16",2], + "5257134257307295031": ["convolution_gpu_yxfb_yxio_b16",1], + "17085927772068621152": ["convolution_gpu_yxfb_yxio_b16",2], + "1359720957005310113": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "12494969618927201911": ["fully_connected_gpu_yxfb_ref",2], + "16818206615424635387": ["convolution_gpu_bfyx_1x1",1], + "13853630125050609175": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "11679235499894668689": ["convolution_gpu_yxfb_yxio_b16",2], + "8655739705298627602": ["convolution_gpu_bfyx_gemm_like",0], + "10861769381993948050": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "5240706676373148280": ["convolution_gpu_bfyx_gemm_like",2], + "3325575565536567070": ["convolution_gpu_yxfb_yxio_b16",2], + "1074748462756364699": ["fully_connected_gpu_fb_oi_ref",1], + "6926590672771069689": ["convolution_gpu_yxfb_yxio_b16",2], + "9603926867418680768": ["convolution_gpu_yxfb_yxio_b16",2], + "9412392168031560549": ["convolution_gpu_yxfb_yxio_b16",2], + "16003914811215141863": ["convolution_gpu_yxfb_yxio_b16",2], + "8956566633622104099": ["convolution_gpu_yxfb_yxio_b16",2], + "9277176009071334860": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "7578177053220150569": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "1040030752340209480": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "574869992355132069": ["convolution_gpu_bfyx_gemm_like",2], + "18041177945345031826": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "12933785392937626017": ["convolution_gpu_yxfb_yxio_b16",2], + "16316483048621486077": ["convolution_gpu_bfyx_gemm_like",2], + "2609454334520044465": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "14248622935809594779": ["convolution_gpu_yxfb_yxio_b16",2], + "5042176052323856983": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14752182392048929103": ["convolution_gpu_yxfb_yxio_b16",2], + "6744044115114192916": ["convolution_gpu_yxfb_yxio_b16",2], + "16385915289511951113": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17442105631503326136": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "5017701748886087836": 
["convolution_gpu_bfyx_os_iyx_osv16",1126], + "10878198256414940305": ["convolution_gpu_yxfb_yxio_b16",2], + "12287667143602938393": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "697333686114567307": ["convolution_gpu_bfyx_gemm_like",2], + "10113696658040720628": ["convolution_gpu_yxfb_yxio_b16",1], + "16013560489115457872": ["convolution_gpu_yxfb_yxio_b16",2], + "9741607635826869269": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12023260267201191955": ["convolution_gpu_yxfb_yxio_b16",1], + "4776446300552810228": ["convolution_gpu_bfyx_gemm_like",0], + "15329680728165965773": ["convolution_gpu_bfyx_gemm_like",2], + "11862259122805366807": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "4809191606466167229": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10148067979123062638": ["convolution_gpu_yxfb_yxio_b16",1], + "17711453305763476458": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "16072525303202287969": ["convolution_gpu_yxfb_yxio_b16",2], + "17037416417174266088": ["convolution_gpu_bfyx_gemm_like",1], + "6407471972820516685": ["fully_connected_gpu_fb_oi_ref",0], + "6822432085522584060": ["convolution_gpu_yxfb_yxio_b16",2], + "8791285622784082122": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "3372770576629463160": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "7715937239456300593": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "3573490922300056520": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2321773209766424929": ["convolution_gpu_yxfb_yxio_b16",2], + "12710794174926396540": ["convolution_gpu_yxfb_yxio_b16",2], + "3226193790517362610": ["convolution_gpu_bfyx_1x1",2], + "13815395589135469450": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "8466986812935642059": ["convolution_gpu_bfyx_os_iyx_osv16",278], + "3501667344669686338": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8159303545761286685": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "1786105567361070086": ["convolution_gpu_yxfb_yxio_b16",2], + "12877601016766418505": ["convolution_gpu_bfyx_gemm_like",2], + "12241130380766920378": ["convolution_gpu_yxfb_yxio_b16",1], + "7837876599690110056": ["convolution_gpu_bfyx_gemm_like",2], + "17536482873064844308": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "3024402899381804809": ["convolution_gpu_bfyx_1x1",2], + "632116056424249698": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12146979849998627283": ["convolution_gpu_bfyx_gemm_like",2], + "824911124897042617": ["convolution_gpu_yxfb_yxio_b16",2], + "8709632541892447149": ["convolution_gpu_bfyx_gemm_like",2], + "12134858519320245809": ["convolution_gpu_bfyx_1x1",2], + "2835909063063272102": ["convolution_gpu_bfyx_gemm_like",2], + "4664983769199548480": ["convolution_gpu_bfyx_1x1",1], + "5364060938737428149": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "13611054146745413536": ["convolution_gpu_bfyx_gemm_like",2], + "12676167240795292217": ["convolution_gpu_bfyx_gemm_like",1], + "3101748967012684440": ["convolution_gpu_yxfb_yxio_b16",2], + "8837721075413149240": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12287827551127082597": ["convolution_gpu_yxfb_yxio_b16",2], + "14813178380338948912": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "875146113874776902": ["convolution_gpu_yxfb_yxio_b16",2], + "7005509036795164602": ["convolution_gpu_bfyx_1x1",2], + "1157388265135592238": ["convolution_gpu_yxfb_yxio_b16",2], + "11096750581455917678": ["convolution_gpu_yxfb_yxio_b16",2], + "10325138269934303618": ["convolution_gpu_yxfb_yxio_b16",2], + "14359026450472189405": ["convolution_gpu_yxfb_yxio_b16",2], + "487214150851213303": 
["convolution_gpu_bfyx_gemm_like",1], + "14795618530175274538": ["convolution_gpu_bfyx_os_iyx_osv16",483], + "6825390996679224270": ["convolution_gpu_yxfb_yxio_b16",2], + "15690161340392005765": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9657324846330221372": ["convolution_gpu_bfyx_1x1",2], + "14963614790718019676": ["convolution_gpu_yxfb_yxio_b16",2], + "11775265110573621330": ["convolution_gpu_bfyx_os_iyx_osv16",301], + "7552544688541855979": ["convolution_gpu_bfyx_gemm_like",2], + "15078590909693331731": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12081698011407453832": ["convolution_gpu_yxfb_yxio_b16",2], + "5968129546023764583": ["convolution_gpu_yxfb_yxio_b16",2], + "3190494353583341446": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5567628205735744449": ["convolution_gpu_yxfb_yxio_b16",2], + "11705756153433897198": ["convolution_gpu_bfyx_1x1",2], + "10693837788817206459": ["convolution_gpu_yxfb_yxio_b16",2], + "17264671167892237524": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "3759515057574218101": ["convolution_gpu_bfyx_gemm_like",2], + "8645965165922150743": ["convolution_gpu_yxfb_yxio_b16",2], + "16616945998593626851": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6805188858008657978": ["convolution_gpu_bfyx_gemm_like",2], + "16901594465545439334": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "586947787345351152": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7541325258238317885": ["convolution_gpu_yxfb_yxio_b16",2], + "9794456440994218671": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "13893789954946953427": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "3965327578193694832": ["convolution_gpu_yxfb_yxio_b16",2], + "3742751561273931407": ["convolution_gpu_yxfb_yxio_b16",1], + "12469992822259989528": ["convolution_gpu_yxfb_yxio_b16",2], + "17292751972745231011": ["convolution_gpu_yxfb_yxio_b16",2], + "3105425187506203551": ["convolution_gpu_bfyx_1x1",2], + "3735605582512535278": ["convolution_gpu_yxfb_yxio_b16",2], + "15991460001131903561": ["convolution_gpu_bfyx_gemm_like",2], + "3392693938352572136": ["convolution_gpu_bfyx_gemm_like",2], + "16566128345135114558": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "2449586975250543578": ["convolution_gpu_yxfb_yxio_b16",2], + "13701870576531008278": ["convolution_gpu_yxfb_yxio_b16",2], + "6699877220571254719": ["convolution_gpu_yxfb_yxio_b16",2], + "11104393974242049153": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5079381702867378605": ["convolution_gpu_yxfb_yxio_b16",1], + "18275601715050791851": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "6509758095668864050": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "14910911338105922048": ["convolution_gpu_yxfb_yxio_b16",2], + "10528894716283673051": ["convolution_gpu_yxfb_yxio_b16",2], + "12793347723828876280": ["convolution_gpu_yxfb_yxio_b16",2], + "15489746763312425915": ["convolution_gpu_bfyx_gemm_like",2], + "10399620940700804517": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11694428890484758107": ["convolution_gpu_yxfb_yxio_b16",1], + "15666720796968090760": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7770000755097925765": ["convolution_gpu_bfyx_1x1",2], + "3419536918610303807": ["convolution_gpu_yxfb_yxio_b16",2], + "18446245971488003004": ["convolution_gpu_bfyx_os_iyx_osv16",574], + "13927671398099556854": ["convolution_gpu_yxfb_yxio_b16",2], + "8040001390872143271": ["convolution_gpu_bfyx_gemm_like",2], + "1880137091477870982": ["convolution_gpu_yxfb_yxio_b16",1], + "9076758673133996959": ["convolution_gpu_bfyx_gemm_like",2], + 
"8302886228681027388": ["convolution_gpu_yxfb_yxio_b16",2], + "7292351660229751817": ["convolution_gpu_bfyx_os_iyx_osv16",609], + "7474592508575297101": ["convolution_gpu_bfyx_1x1",2], + "17996535939348094624": ["convolution_gpu_yxfb_yxio_b16",2], + "4039483032571506874": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "2622434279674583815": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "15289152041466330689": ["convolution_gpu_bfyx_gemm_like",2], + "2191416057399400794": ["convolution_gpu_yxfb_yxio_b16",2], + "2608363732937932266": ["convolution_gpu_bfyx_gemm_like",2], + "396580837423299119": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "12024817951074673335": ["convolution_gpu_bfyx_1x1",1], + "10570285542015420072": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "14031009077471784948": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4116610956045302817": ["convolution_gpu_yxfb_yxio_b16",2], + "15476491807306982382": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "6102330514901613158": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9208964785762052001": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "16898785030254336705": ["convolution_gpu_yxfb_yxio_b16",2], + "16094455700371652312": ["convolution_gpu_yxfb_yxio_b16",2], + "1778345646142852816": ["convolution_gpu_bfyx_gemm_like",2], + "16936968151775497887": ["convolution_gpu_bfyx_gemm_like",2], + "11308583200952256245": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16188473537674428539": ["convolution_gpu_yxfb_yxio_b16",2], + "6756771670011959646": ["convolution_gpu_bfyx_gemm_like",2], + "2983038203471784211": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "4355933224673863178": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7051238664181857633": ["convolution_gpu_bfyx_os_iyx_osv16",646], + "3571030800252732358": ["convolution_gpu_yxfb_yxio_b16",2], + "11065709388908213457": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "17512961503976896701": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "856877003890134554": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "12531580106484042446": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "5924341622384096919": ["convolution_gpu_bfyx_gemm_like",2], + "13821224753538037982": ["convolution_gpu_bfyx_os_iyx_osv16",1116], + "5595779343671478945": ["convolution_gpu_yxfb_yxio_b16",2], + "3220756134650041028": ["convolution_gpu_yxfb_yxio_b16",2], + "11824946481875102910": ["convolution_gpu_yxfb_yxio_b16",2], + "14044732537191084187": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7708321360699824256": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9647916259092117712": ["convolution_gpu_bfyx_gemm_like",2], + "7008509833947166548": ["convolution_gpu_yxfb_yxio_b16",2], + "1099404514975797315": ["convolution_gpu_yxfb_yxio_b16",2], + "3114869763557037270": ["fully_connected_gpu_fb_oi_ref",1], + "18126685473408206840": ["convolution_gpu_bfyx_os_iyx_osv16",526], + "6427979320488981912": ["convolution_gpu_yxfb_yxio_b16",1], + "5312269140190538942": ["convolution_gpu_yxfb_yxio_b16",2], + "16813995580382709489": ["convolution_gpu_yxfb_yxio_b16",2], + "9606639214735570069": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "4090512597925170883": ["convolution_gpu_yxfb_yxio_b16",2], + "6664432489777052771": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "12068797674575015662": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "7585777271711713778": ["convolution_gpu_yxfb_yxio_b16",2], + "6721354194352192662": ["convolution_gpu_yxfb_yxio_b16",2], + "14771341796915983228": ["convolution_gpu_yxfb_yxio_b16",2], + "18416908414174464784": 
["convolution_gpu_bfyx_gemm_like",2], + "15956352026642286295": ["convolution_gpu_yxfb_yxio_b16",2], + "9312974578711092131": ["convolution_gpu_yxfb_yxio_b16",2], + "17921973525603585874": ["convolution_gpu_bfyx_gemm_like",2], + "1208161922424418734": ["convolution_gpu_bfyx_gemm_like",2], + "12305397676800089268": ["convolution_gpu_yxfb_yxio_b16",2], + "7056293586529818253": ["convolution_gpu_bfyx_gemm_like",2], + "17082268616134506581": ["convolution_gpu_yxfb_yxio_b16",2], + "14421898375873029115": ["convolution_gpu_bfyx_1x1",2], + "17955326503130437346": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "13962325395021860937": ["convolution_gpu_yxfb_yxio_b16",2], + "16589848737162195829": ["convolution_gpu_yxfb_yxio_b16",2], + "11497761673211348612": ["convolution_gpu_yxfb_yxio_b16",2], + "17961702508543961900": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "5938850739683493929": ["convolution_gpu_yxfb_yxio_b16",0], + "14343008518525689150": ["convolution_gpu_bfyx_1x1",2], + "15188273255634848057": ["convolution_gpu_yxfb_yxio_b16",2], + "1413558157882728476": ["convolution_gpu_yxfb_yxio_b16",2], + "16770615142634470903": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "16397733032387984819": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "17889864541794448203": ["convolution_gpu_bfyx_1x1",1], + "8459380583159325597": ["convolution_gpu_yxfb_yxio_b16",1], + "991586070509079617": ["convolution_gpu_bfyx_gemm_like",0], + "4718716595177056289": ["convolution_gpu_bfyx_os_iyx_osv16",986], + "12107079280128343726": ["convolution_gpu_yxfb_yxio_b16",2], + "3217295012596892181": ["convolution_gpu_yxfb_yxio_b16",2], + "12926382190254407283": ["convolution_gpu_yxfb_yxio_b16",2], + "8543619733732987550": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1787152688807233651": ["convolution_gpu_yxfb_yxio_b16",2], + "4084026445911476156": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "18270587701371596297": ["convolution_gpu_bfyx_os_iyx_osv16",198], + "16677044352793659175": ["convolution_gpu_bfyx_gemm_like",1], + "4021045600853993587": ["convolution_gpu_yxfb_yxio_b16",2], + "9654944848074437064": ["convolution_gpu_bfyx_gemm_like",2], + "15635250842093678965": ["convolution_gpu_yxfb_yxio_b16",2], + "9418041909134721047": ["convolution_gpu_bfyx_gemm_like",2], + "2031558560788449957": ["convolution_gpu_yxfb_yxio_b16",1], + "14807466024030301968": ["convolution_gpu_yxfb_yxio_b16",2], + "14135593723444205032": ["convolution_gpu_bfyx_gemm_like",2], + "14646141746558153748": ["convolution_gpu_yxfb_yxio_b16",2], + "5583453364991774426": ["convolution_gpu_yxfb_yxio_b16",2], + "11436473937404565094": ["convolution_gpu_yxfb_yxio_b16",0], + "11719957578496407410": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "462240909302334133": ["convolution_gpu_yxfb_yxio_b16",2], + "5912303851874077576": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "1042605521041579458": ["convolution_gpu_yxfb_yxio_b16",2], + "10130171279527667782": ["convolution_gpu_bfyx_gemm_like",1], + "13193898459027972719": ["convolution_gpu_yxfb_yxio_b16",0], + "1192279884248226739": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "12780116250427776647": ["convolution_gpu_yxfb_yxio_b16",2], + "1436052878894538927": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "7949069388917479511": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18161971781834208343": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "16404362308829952450": ["convolution_gpu_yxfb_yxio_b16",2], + "7113777272518482528": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "18132952464279667664": 
["convolution_gpu_bfyx_1x1",2], + "3301356450249305137": ["convolution_gpu_yxfb_yxio_b16",2], + "9589361786336650748": ["convolution_gpu_yxfb_yxio_b16",1], + "11807282628372660280": ["convolution_gpu_bfyx_1x1",2], + "16953093098789113080": ["convolution_gpu_yxfb_yxio_b16",2], + "10525462454857911293": ["convolution_gpu_yxfb_yxio_b16",2], + "16235115911229280717": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "11313025178951972247": ["convolution_gpu_bfyx_gemm_like",1], + "17444003685761357480": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "2399313178951511557": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "16644952765107909604": ["convolution_gpu_yxfb_yxio_b16",1], + "3724572174214794659": ["convolution_gpu_yxfb_yxio_b16",2], + "10893628699015898230": ["convolution_gpu_yxfb_yxio_b16",1], + "7954972694876158422": ["convolution_gpu_bfyx_1x1",2], + "8458082326743351141": ["convolution_gpu_bfyx_gemm_like",2], + "13468081302022888489": ["convolution_gpu_bfyx_gemm_like",2], + "14789782064157699768": ["convolution_gpu_yxfb_yxio_b16",2], + "5578991261564497604": ["convolution_gpu_yxfb_yxio_b16",2], + "13767500791267563349": ["convolution_gpu_yxfb_yxio_b16",2], + "5919454297699648428": ["convolution_gpu_yxfb_yxio_b16",1], + "15678385128478075284": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6602394091385112575": ["convolution_gpu_yxfb_yxio_b16",2], + "17777248703109395158": ["convolution_gpu_yxfb_yxio_b16",2], + "15031155621982459860": ["convolution_gpu_bfyx_gemm_like",2], + "3501882025888946886": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "904355798061005466": ["convolution_gpu_yxfb_yxio_b16",2], + "13144385730409574259": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "7095629088416100928": ["convolution_gpu_bfyx_gemm_like",2], + "7706714181281908433": ["convolution_gpu_bfyx_gemm_like",2], + "3859139031732555228": ["convolution_gpu_yxfb_yxio_b16",2], + "12194037100109755112": ["convolution_gpu_bfyx_gemm_like",2], + "4500107195684703428": ["convolution_gpu_yxfb_yxio_b16",2], + "11759322316883943989": ["convolution_gpu_yxfb_yxio_b16",2], + "4046830923427667342": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "2732519635571994212": ["convolution_gpu_bfyx_os_iyx_osv16",234], + "12831298482349900359": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "15227189929676013024": ["convolution_gpu_yxfb_yxio_b16",2], + "3177304125602972370": ["convolution_gpu_bfyx_direct_10_12_16",0], + "15656706773401161497": ["convolution_gpu_yxfb_yxio_b16",2], + "15412447128995361859": ["convolution_gpu_bfyx_gemm_like",1], + "1361159591875955678": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "4670487436469119872": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8879618489623984140": ["convolution_gpu_yxfb_yxio_b16",2], + "11797601971796699898": ["convolution_gpu_bfyx_gemm_like",2], + "6312971928547466668": ["convolution_gpu_bfyx_os_iyx_osv16",1039], + "7649413902932043811": ["convolution_gpu_bfyx_gemm_like",2], + "3067001341355453846": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "18027243127893440568": ["convolution_gpu_yxfb_yxio_b16",2], + "5751283221740229986": ["convolution_gpu_bfyx_gemm_like",1], + "142270860894725256": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "9243949750444156746": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4429109491655891299": ["convolution_gpu_bfyx_gemm_like",1], + "7444165397413360181": ["convolution_gpu_yxfb_yxio_b16",2], + "6418500550523945192": ["convolution_gpu_yxfb_yxio_b16",2], + "17826868890632814593": ["convolution_gpu_yxfb_yxio_b16",2], + "10271261715175176019": 
["convolution_gpu_yxfb_yxio_b16",2], + "5010119207726811326": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "9738776059655610885": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "1304921846760027440": ["convolution_gpu_yxfb_yxio_b16",1], + "3059575629482816852": ["convolution_gpu_bfyx_os_iyx_osv16",951], + "12198263593657033426": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "3156783219125679946": ["convolution_gpu_bfyx_1x1",2], + "16739031949237426992": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "5602328731722824868": ["convolution_gpu_yxfb_yxio_b16",1], + "11147573971701279689": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "548663565933738403": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "18129795023552968695": ["convolution_gpu_yxfb_yxio_b16",2], + "2116913943188857359": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "3244803973821375252": ["convolution_gpu_yxfb_yxio_b16",2], + "12808456612606675259": ["convolution_gpu_yxfb_yxio_b16",1], + "2762489653422414995": ["convolution_gpu_bfyx_gemm_like",2], + "3017411837779243878": ["convolution_gpu_bfyx_gemm_like",2], + "11908169713247209976": ["convolution_gpu_yxfb_yxio_b16",2], + "5046089607609787258": ["convolution_gpu_yxfb_yxio_b16",2], + "13960388312976163971": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10632933069865171963": ["convolution_gpu_yxfb_yxio_b16",2], + "4353842547963164546": ["convolution_gpu_bfyx_1x1",2], + "15398380328746287438": ["convolution_gpu_bfyx_gemm_like",2], + "8456185296386225533": ["convolution_gpu_yxfb_yxio_b16",1], + "13633048912926365931": ["convolution_gpu_yxfb_yxio_b16",2], + "345043289576587800": ["convolution_gpu_bfyx_1x1",2], + "17413191440314817117": ["convolution_gpu_yxfb_yxio_b16",2], + "8365255170846178102": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "10504318542015227515": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "12407890437443790515": ["convolution_gpu_bfyx_gemm_like",2], + "2649192407401044065": ["convolution_gpu_bfyx_gemm_like",2], + "17052161869014993719": ["convolution_gpu_yxfb_yxio_b16",2], + "17195293614280872622": ["convolution_gpu_yxfb_yxio_b16",2], + "3017891343734146267": ["convolution_gpu_bfyx_os_iyx_osv16",102], + "12577421746159122264": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "2283157145557154450": ["convolution_gpu_bfyx_1x1",2], + "2780423409483867058": ["convolution_gpu_bfyx_1x1",2], + "10745099399736462076": ["convolution_gpu_yxfb_yxio_b16",2], + "16117448559783537844": ["convolution_gpu_bfyx_os_iyx_osv16",713], + "3820661057776133570": ["convolution_gpu_bfyx_1x1",2], + "9079203986633151014": ["convolution_gpu_bfyx_1x1",1], + "17152614235879767116": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "16347412180100581330": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "17907223570737272640": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "5184121466994451498": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "16566214123371867456": ["convolution_gpu_bfyx_gemm_like",2], + "7243917162812988891": ["convolution_gpu_bfyx_gemm_like",2], + "4856470441452830056": ["convolution_gpu_bfyx_gemm_like",2], + "3242391637018676328": ["convolution_gpu_yxfb_yxio_b16",2], + "14689812157592240007": ["convolution_gpu_yxfb_yxio_b16",2], + "1152691534728260611": ["convolution_gpu_bfyx_1x1",2], + "13710319251108632115": ["convolution_gpu_bfyx_1x1",2], + "12989677691575632174": ["convolution_gpu_yxfb_yxio_b16",1], + "3444250649099578792": ["convolution_gpu_yxfb_yxio_b16",1], + "8174833187387604731": ["convolution_gpu_yxfb_yxio_b16",2], + "10787747981914307179": ["convolution_gpu_bfyx_1x1",2], + 
"12379166764490359144": ["convolution_gpu_yxfb_yxio_b16",2], + "631489011812924153": ["convolution_gpu_bfyx_1x1",2], + "16837963510205857013": ["convolution_gpu_yxfb_yxio_b16",2], + "13190888313721073437": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "9495192057713157041": ["convolution_gpu_yxfb_yxio_b16",2], + "5727758374304309350": ["convolution_gpu_yxfb_yxio_b16",2], + "6733731409232284409": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "17762455138615317884": ["convolution_gpu_yxfb_yxio_b16",2], + "10709828018763273371": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "5313382805395362669": ["convolution_gpu_yxfb_yxio_b16",2], + "9406763539724266157": ["convolution_gpu_bfyx_1x1",2], + "17088011073114549679": ["convolution_gpu_yxfb_yxio_b16",2], + "4135068756462147853": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3731224822876468602": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "4362304842016958728": ["convolution_gpu_bfyx_gemm_like",2], + "8250212706222997384": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "4625107584562815965": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "13912843078550000960": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "10536316961655703500": ["convolution_gpu_bfyx_os_iyx_osv16",199], + "17214254645087272557": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "9617316303048974588": ["convolution_gpu_yxfb_yxio_b16",2], + "13961773444580398856": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17399728556634171321": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7612288596055048389": ["convolution_gpu_yxfb_yxio_b16",1], + "8733371726903473932": ["convolution_gpu_yxfb_yxio_b16",2], + "4651261398203912503": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "9119268982510599778": ["convolution_gpu_yxfb_yxio_b16",2], + "6328802691680458752": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "13218298785325404589": ["convolution_gpu_yxfb_yxio_b16",1], + "5754396201681434378": ["convolution_gpu_bfyx_1x1",2], + "15989894214714907271": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "9922764846020092836": ["convolution_gpu_yxfb_yxio_b16",2], + "4972952621622984792": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "6584960721513702502": ["convolution_gpu_bfyx_gemm_like",1], + "18199824206329982249": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "5157949342388119167": ["convolution_gpu_bfyx_gemm_like",2], + "14151747022287993729": ["convolution_gpu_bfyx_gemm_like",2], + "10747688146893187959": ["convolution_gpu_bfyx_direct_10_12_16",2], + "75120034961995929": ["convolution_gpu_yxfb_yxio_b16",2], + "12058759356433220258": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "9237587440336828595": ["convolution_gpu_yxfb_yxio_b16",2], + "13326492157370934949": ["convolution_gpu_bfyx_gemm_like",2], + "1387945708447092123": ["convolution_gpu_bfyx_os_iyx_osv16",380], + "13962189339706230770": ["convolution_gpu_yxfb_yxio_b16",2], + "17848582668902427291": ["convolution_gpu_yxfb_yxio_b16",2], + "14281801257982447624": ["convolution_gpu_yxfb_yxio_b16",2], + "497488185553682238": ["convolution_gpu_bfyx_1x1",1], + "16535378085465418910": ["convolution_gpu_yxfb_yxio_b16",2], + "13448845356783404653": ["convolution_gpu_bfyx_gemm_like",1], + "8900977003907025003": ["convolution_gpu_yxfb_yxio_b16",2], + "11690533591656807605": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "17133376737554844449": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "15602218079503030465": ["convolution_gpu_bfyx_gemm_like",2], + "5330130011321223525": ["convolution_gpu_yxfb_yxio_b16",1], + "3121704239277217273": 
["convolution_gpu_yxfb_yxio_b16",2], + "7233783054884565746": ["convolution_gpu_bfyx_gemm_like",2], + "3080612075440389053": ["convolution_gpu_yxfb_yxio_b16",2], + "15078262396281327048": ["convolution_gpu_bfyx_gemm_like",1], + "17651821953342321913": ["convolution_gpu_bfyx_1x1",2], + "10218763091060511457": ["convolution_gpu_bfyx_os_iyx_osv16",103], + "3568514382399560386": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "4897448054295474302": ["convolution_gpu_bfyx_gemm_like",2], + "16152775342222431281": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16986610822918634530": ["convolution_gpu_bfyx_1x1",2], + "11196245220967135443": ["convolution_gpu_yxfb_yxio_b16",2], + "6820224292713065232": ["convolution_gpu_yxfb_yxio_b16",2], + "10151922632636937118": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13009381943944182288": ["convolution_gpu_yxfb_yxio_b16",2], + "8210092359850191682": ["convolution_gpu_yxfb_yxio_b16",2], + "3391032227732782982": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15311930929656759371": ["convolution_gpu_yxfb_yxio_b16",2], + "5857101685300045443": ["convolution_gpu_yxfb_yxio_b16",1], + "334703311738467111": ["convolution_gpu_bfyx_gemm_like",1], + "3451309062150982886": ["convolution_gpu_yxfb_yxio_b16",2], + "14808079119439455357": ["convolution_gpu_yxfb_yxio_b16",2], + "6760797535531423152": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "6219075471508685758": ["convolution_gpu_bfyx_gemm_like",2], + "3928596145340765666": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "2164314506903530487": ["convolution_gpu_yxfb_yxio_b16",2], + "15757351352532908153": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "13249852145471010452": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7926989875988735079": ["convolution_gpu_yxfb_yxio_b16",2], + "10896935976330351144": ["convolution_gpu_yxfb_yxio_b16",1], + "16469493066700118274": ["convolution_gpu_yxfb_yxio_b16",2], + "16459072408799224894": ["convolution_gpu_yxfb_yxio_b16",2], + "18103534417093702556": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "13766070202060785219": ["convolution_gpu_yxfb_yxio_b16",2], + "5723759573058003971": ["convolution_gpu_yxfb_yxio_b16",2], + "2314805462821790774": ["convolution_gpu_yxfb_yxio_b16",1], + "9319064434175105168": ["convolution_gpu_yxfb_yxio_b16",2], + "2319519208813614116": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "5115298857582076692": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "2814805887448339818": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "4925720860007127584": ["convolution_gpu_yxfb_yxio_b16",2], + "8614375489387596119": ["convolution_gpu_yxfb_yxio_b16",2], + "14206125678667603810": ["convolution_gpu_bfyx_1x1",1], + "11931568365395665142": ["convolution_gpu_bfyx_gemm_like",2], + "17053671692908867872": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "16455941573984854254": ["convolution_gpu_yxfb_yxio_b16",1], + "2126208024616319501": ["convolution_gpu_yxfb_yxio_b16",2], + "5795524493577277985": ["convolution_gpu_yxfb_yxio_b16",2], + "17491825380936802930": ["convolution_gpu_yxfb_yxio_b16",2], + "5319459637051859849": ["convolution_gpu_yxfb_yxio_b16",2], + "18333355024265557430": ["convolution_gpu_yxfb_yxio_b16",2], + "8794896449397768269": ["convolution_gpu_bfyx_gemm_like",2], + "16223356735957394429": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "5258372022038629529": ["convolution_gpu_yxfb_yxio_b16",1], + "6014752258124559691": ["convolution_gpu_yxfb_yxio_b16",2], + "10256831975351722184": ["convolution_gpu_bfyx_gemm_like",1], + "17987739992848266169": 
["convolution_gpu_yxfb_yxio_b16",2], + "10982526068861394162": ["convolution_gpu_yxfb_yxio_b16",2], + "144634005596305959": ["fully_connected_gpu_fb_io_block_fp16",2], + "4160656836528944651": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "14116275901314596944": ["convolution_gpu_yxfb_yxio_b16",2], + "4240975186599864955": ["convolution_gpu_yxfb_yxio_b16",2], + "16267531927647687641": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "5643920882179676695": ["convolution_gpu_yxfb_yxio_b16",1], + "9170163372548895531": ["convolution_gpu_yxfb_yxio_b16",2], + "8921636651939679647": ["convolution_gpu_bfyx_1x1",1], + "7178866013527118649": ["convolution_gpu_yxfb_yxio_b16",2], + "13320675959188615441": ["convolution_gpu_bfyx_gemm_like",2], + "5003718302026277632": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "8325903548627432": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "18244966393978155130": ["convolution_gpu_yxfb_yxio_b16",2], + "8257103926661643451": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "4256155212405177844": ["convolution_gpu_yxfb_yxio_b16",1], + "5047972486012090625": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "10007925729029867733": ["convolution_gpu_yxfb_yxio_b16",2], + "4430932059574900921": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15449650271741732512": ["convolution_gpu_yxfb_yxio_b16",2], + "60267878504897170": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "5705056256080522960": ["convolution_gpu_yxfb_yxio_b16",2], + "12248852114219058572": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "11979032916453246611": ["convolution_gpu_yxfb_yxio_b16",1], + "15136770992109675092": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "16395067736440127496": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "1632416005093914709": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5577571901049952658": ["convolution_gpu_yxfb_yxio_b16",2], + "3160543867929843861": ["convolution_gpu_bfyx_1x1",2], + "4141005390823981166": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "10179916356323479080": ["convolution_gpu_bfyx_gemm_like",2], + "7397341452130124383": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "9736684300833719045": ["convolution_gpu_yxfb_yxio_b16",2], + "15914058104244750036": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "15727611564408173858": ["convolution_gpu_bfyx_gemm_like",1], + "15303251546207338960": ["convolution_gpu_yxfb_yxio_b16",0], + "362823013207940830": ["convolution_gpu_yxfb_yxio_b16",2], + "6307939332939714967": ["convolution_gpu_bfyx_1x1",2], + "11834361584875491425": ["convolution_gpu_bfyx_1x1",1], + "17525564757769958678": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "3646228701104397128": ["convolution_gpu_bfyx_os_iyx_osv16",173], + "13436376034548670107": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "16000428520749664687": ["convolution_gpu_yxfb_yxio_b16",2], + "5039037192630609823": ["convolution_gpu_bfyx_gemm_like",2], + "12309955719964788034": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "7211179360844946434": ["convolution_gpu_bfyx_os_iyx_osv16",198], + "6692085187697087807": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "4531222427159927606": ["convolution_gpu_bfyx_gemm_like",2], + "4523064418696274869": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "14817801788424046035": ["convolution_gpu_yxfb_yxio_b16",2], + "5958300749101873980": ["convolution_gpu_yxfb_yxio_b16",2], + "6623182990939010641": ["convolution_gpu_bfyx_gemm_like",1], + "8575833423399668525": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "1040411949730118556": ["convolution_gpu_yxfb_yxio_b16",2], + 
"15542520725696027828": ["convolution_gpu_yxfb_yxio_b16",1], + "15961487889420208188": ["convolution_gpu_bfyx_gemm_like",2], + "12879367655655932174": ["convolution_gpu_yxfb_yxio_b16",2], + "17364712285968437405": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "10626281431800814406": ["convolution_gpu_yxfb_yxio_b16",2], + "3406812365298442897": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "13668940862847596363": ["convolution_gpu_yxfb_yxio_b16",2], + "11317843493537672866": ["convolution_gpu_yxfb_yxio_b16",2], + "5735608687257018419": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "3635446784873718932": ["convolution_gpu_bfyx_gemm_like",2], + "7565867291827884997": ["convolution_gpu_bfyx_gemm_like",1], + "1108229954015380813": ["convolution_gpu_yxfb_yxio_b16",2], + "15449774545834423274": ["convolution_gpu_yxfb_yxio_b16",2], + "6808843088626121909": ["convolution_gpu_bfyx_gemm_like",2], + "3492178441007007033": ["convolution_gpu_yxfb_yxio_b16",2], + "4118073384938355655": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "16502045034098739466": ["convolution_gpu_bfyx_gemm_like",2], + "975943900172381326": ["convolution_gpu_yxfb_yxio_b16",2], + "5582450255753679095": ["convolution_gpu_bfyx_1x1",2], + "3221469860582147955": ["convolution_gpu_bfyx_gemm_like",2], + "16120988958246503683": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "14444475853714164129": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "17277917672233464304": ["convolution_gpu_yxfb_yxio_b16",2], + "11113125355390956764": ["convolution_gpu_yxfb_yxio_b16",0], + "16513038896689318072": ["convolution_gpu_yxfb_yxio_b16",1], + "7531346828150129063": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "10701208905236219083": ["convolution_gpu_yxfb_yxio_b16",2], + "14930789530046665855": ["convolution_gpu_bfyx_gemm_like",2], + "14065215389112262561": ["convolution_gpu_yxfb_yxio_b16",1], + "13051390418571971928": ["convolution_gpu_yxfb_yxio_b16",2], + "7025975403069487257": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13797759143769042759": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "13558656230312558247": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "8099100633390626027": ["convolution_gpu_yxfb_yxio_b16",2], + "7412772553395852003": ["convolution_gpu_yxfb_yxio_b16",2], + "16428789154716792138": ["convolution_gpu_yxfb_yxio_b16",2], + "2477849395789783501": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "11560634267092054110": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "17800115051456107658": ["convolution_gpu_yxfb_yxio_b16",2], + "4773482308451190487": ["convolution_gpu_yxfb_yxio_b16",2], + "4830454154838353056": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "9659814105483633858": ["convolution_gpu_yxfb_yxio_b16",2], + "8933701347987963693": ["convolution_gpu_yxfb_yxio_b16",2], + "4216366893358625960": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "14791575777969587370": ["convolution_gpu_yxfb_yxio_b16",1], + "10808909442136736629": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "16989896550094613437": ["convolution_gpu_yxfb_yxio_b16",2], + "7535571298845832061": ["convolution_gpu_yxfb_yxio_b16",1], + "517997325935712670": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "4640696923527766618": ["convolution_gpu_bfyx_gemm_like",2], + "13585163747565192884": ["convolution_gpu_bfyx_gemm_like",2], + "9135116285263927211": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "12712071520541638451": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "15695415285791951018": ["convolution_gpu_bfyx_gemm_like",2], + "13058026769607428653": 
["convolution_gpu_yxfb_yxio_b16",2], + "10431728173806991521": ["convolution_gpu_yxfb_yxio_b16",2], + "12990527753120735255": ["convolution_gpu_bfyx_gemm_like",2], + "2450251936650841836": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "16469788155263456039": ["convolution_gpu_bfyx_gemm_like",2], + "14931590390643373866": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12866217660635921034": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15747571668131081693": ["convolution_gpu_yxfb_yxio_b16",1], + "6391201577234440562": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "11645116728396933125": ["convolution_gpu_bfyx_gemm_like",2], + "6106367716877633757": ["convolution_gpu_yxfb_yxio_b16",2], + "5963901433137582265": ["convolution_gpu_bfyx_gemm_like",2], + "12259844988981080505": ["convolution_gpu_bfyx_gemm_like",2], + "18226737525116147628": ["convolution_gpu_yxfb_yxio_b16",2], + "4129722446574108695": ["convolution_gpu_bfyx_1x1",2], + "17966898762317477857": ["convolution_gpu_yxfb_yxio_b16",1], + "7603872175048237237": ["convolution_gpu_bfyx_1x1",2], + "7755177205197405275": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "12450814729547235386": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "9542795021683486547": ["convolution_gpu_yxfb_yxio_b16",2], + "12093737479877309006": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "8651641584737798174": ["convolution_gpu_bfyx_gemm_like",2], + "10194187012252949909": ["convolution_gpu_yxfb_yxio_b16",2], + "15181987458871339815": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "6458189051305803360": ["convolution_gpu_yxfb_yxio_b16",2], + "12616205756849913359": ["convolution_gpu_yxfb_yxio_b16",2], + "5941095082097535176": ["convolution_gpu_bfyx_gemm_like",1], + "12762301414049772746": ["convolution_gpu_yxfb_yxio_b16",2], + "8399668174006528237": ["convolution_gpu_bfyx_gemm_like",1], + "10404725818204494388": ["convolution_gpu_bfyx_gemm_like",2], + "4190912926126844643": ["convolution_gpu_bfyx_1x1",2], + "7481256533438761028": ["convolution_gpu_bfyx_gemm_like",2], + "14571022040013651253": ["convolution_gpu_bfyx_gemm_like",2], + "2964705957088952872": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "584086621952390547": ["convolution_gpu_bfyx_gemm_like",2], + "4439786737038041995": ["convolution_gpu_yxfb_yxio_b16",2], + "2034811390140488812": ["convolution_gpu_yxfb_yxio_b16",2], + "7767103488808670253": ["convolution_gpu_yxfb_yxio_b16",2], + "10396788403466463989": ["convolution_gpu_yxfb_yxio_b16",2], + "8036474422877454869": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10055549084854766170": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "1509728225855233852": ["convolution_gpu_bfyx_gemm_like",2], + "7986797517722531256": ["convolution_gpu_bfyx_gemm_like",2], + "13569453018083742128": ["convolution_gpu_yxfb_yxio_b16",2], + "2583562092192709891": ["convolution_gpu_yxfb_yxio_b16",2], + "8063236641629084352": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "6222595759158615206": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2058364830449635556": ["convolution_gpu_yxfb_yxio_b16",2], + "2644054989263429508": ["convolution_gpu_yxfb_yxio_b16",2], + "537074122417021898": ["convolution_gpu_bfyx_os_iyx_osv16",100], + "17040537179740138304": ["convolution_gpu_yxfb_yxio_b16",2], + "7213383384662748578": ["convolution_gpu_yxfb_yxio_b16",2], + "10751536136794650334": ["convolution_gpu_bfyx_gemm_like",2], + "11052732052072367261": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "13648761167622654288": ["fully_connected_gpu_yxfb_ref",0], + "18253299978538051201": 
["convolution_gpu_yxfb_yxio_b16",2], + "15526021915035861514": ["convolution_gpu_bfyx_gemm_like",1], + "14670339865153970893": ["convolution_gpu_yxfb_yxio_b16",2], + "9655242408142699694": ["convolution_gpu_yxfb_yxio_b16",1], + "18381791065890314250": ["convolution_gpu_bfyx_gemm_like",1], + "17193614571243427089": ["convolution_gpu_bfyx_gemm_like",2], + "8619526128410675593": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "4466647043226271996": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "12656228464579497510": ["convolution_gpu_yxfb_yxio_b16",2], + "9309173544512377803": ["convolution_gpu_yxfb_yxio_b16",2], + "13079058582191027406": ["convolution_gpu_yxfb_yxio_b16",2], + "951747146164097188": ["convolution_gpu_bfyx_1x1",2], + "7964396197946740183": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "712495040970043706": ["convolution_gpu_yxfb_yxio_b16",2], + "15178921033274918199": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8505040075968411726": ["convolution_gpu_bfyx_gemm_like",1], + "5550969016335082071": ["convolution_gpu_bfyx_gemm_like",1], + "9306120768594851497": ["convolution_gpu_yxfb_yxio_b16",2], + "7777279468029216688": ["convolution_gpu_yxfb_yxio_b16",2], + "18029396837690671545": ["convolution_gpu_yxfb_yxio_b16",2], + "14909506411483112959": ["convolution_gpu_yxfb_yxio_b16",1], + "1734769856106746136": ["convolution_gpu_yxfb_yxio_b16",2], + "9987415314864002460": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6772954924703365345": ["convolution_gpu_bfyx_gemm_like",2], + "5622089373755094139": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "9533360488591027707": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "13403161389559730": ["convolution_gpu_bfyx_gemm_like",2], + "9811086682271990794": ["convolution_gpu_yxfb_yxio_b16",2], + "9674248159643501374": ["convolution_gpu_yxfb_yxio_b16",2], + "7830644361525332797": ["convolution_gpu_yxfb_yxio_b16",2], + "2290965424106255219": ["convolution_gpu_yxfb_yxio_b16",2], + "11208625628954179200": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "15774073623451382326": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4750513665628842598": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "10099598062509781441": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "16666792471632326054": ["convolution_gpu_bfyx_gemm_like",2], + "8618627241234406784": ["convolution_gpu_yxfb_yxio_b16",2], + "939718260623752240": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "1784892318069674949": ["convolution_gpu_yxfb_yxio_b16",2], + "5568728266639058524": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "6297802534570892679": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "9119618606914671839": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9328223957245552723": ["convolution_gpu_bfyx_gemm_like",2], + "348058686961206025": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "8809438390805488749": ["convolution_gpu_yxfb_yxio_b16",2], + "15317946705199574301": ["convolution_gpu_yxfb_yxio_b16",0], + "16011429608661242565": ["convolution_gpu_bfyx_gemm_like",2], + "9429695343610239088": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "18093895673012393740": ["convolution_gpu_yxfb_yxio_b16",1], + "10232809153913700925": ["convolution_gpu_yxfb_yxio_b16",2], + "17763347648779573375": ["convolution_gpu_yxfb_yxio_b16",1], + "2895819653081408358": ["convolution_gpu_yxfb_yxio_b16",2], + "5658664813683907476": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "5600128039063009632": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6490907666077364481": ["convolution_gpu_yxfb_yxio_b16",2], + 
"17188004018198554470": ["convolution_gpu_yxfb_yxio_b16",2], + "17039993918927377002": ["convolution_gpu_bfyx_os_iyx_osv16",431], + "12946540633035976364": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "7604075520418038662": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "13149617013851130587": ["convolution_gpu_yxfb_yxio_b16",1], + "11130439225010714550": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16698547937652264447": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "10588059104387338398": ["convolution_gpu_bfyx_os_iyx_osv16",834], + "8117638644045799192": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "6818140422066151642": ["convolution_gpu_yxfb_yxio_b16",1], + "5156033406916344703": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15409184364121627414": ["convolution_gpu_yxfb_yxio_b16",2], + "3536359641225772698": ["convolution_gpu_yxfb_yxio_b16",2], + "12031180482028822765": ["convolution_gpu_bfyx_gemm_like",0], + "13590444711975157776": ["convolution_gpu_bfyx_direct_10_12_16",2], + "742689192890486807": ["convolution_gpu_bfyx_gemm_like",2], + "9165817820007469505": ["convolution_gpu_yxfb_yxio_b16",2], + "2530317332900569142": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "12946531140050029900": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "17109520309574369561": ["convolution_gpu_bfyx_gemm_like",2], + "3673781117412048086": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "16386955278777720573": ["convolution_gpu_bfyx_os_iyx_osv16",855], + "886880682650879171": ["convolution_gpu_bfyx_os_iyx_osv16",477], + "7843508201826629532": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "11855070245618904113": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "16409729623371222748": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "14263605862840500474": ["convolution_gpu_yxfb_yxio_b16",2], + "119047044057950958": ["convolution_gpu_bfyx_gemm_like",1], + "11153522012082333137": ["convolution_gpu_yxfb_yxio_b16",2], + "3074436655804078403": ["convolution_gpu_yxfb_yxio_b16",2], + "15271783562528081169": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "15187035463799513424": ["convolution_gpu_bfyx_1x1",2], + "7247475218645942682": ["convolution_gpu_yxfb_yxio_b16",2], + "2753393184265405425": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "6003409324516527726": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7953255701516490034": ["convolution_gpu_bfyx_os_iyx_osv16",43], + "1497560475414454618": ["convolution_gpu_bfyx_gemm_like",2], + "12788611449571149037": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "12829916847670789556": ["convolution_gpu_yxfb_yxio_b16",2], + "17179609670678746034": ["convolution_gpu_bfyx_gemm_like",2], + "17140702790441856730": ["convolution_gpu_bfyx_gemm_like",1], + "7792512829747836997": ["convolution_gpu_yxfb_yxio_b16",2], + "8935522915553126640": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "17635171685500922207": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "9332701118402940384": ["convolution_gpu_yxfb_yxio_b16",1], + "16573597215928075233": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "11795826875463204296": ["convolution_gpu_bfyx_1x1",2], + "15424646499666127616": ["convolution_gpu_yxfb_yxio_b16",0], + "16561618767117193109": ["convolution_gpu_bfyx_1x1",2], + "12374775091628199854": ["convolution_gpu_bfyx_1x1",2], + "14416897092729861207": ["convolution_gpu_yxfb_yxio_b16",2], + "16569637518948306471": ["convolution_gpu_bfyx_gemm_like",2], + "6669808855737023569": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "8792202318168046223": ["convolution_gpu_bfyx_os_iyx_osv16",225], + 
"18417830391649460864": ["convolution_gpu_yxfb_yxio_b16",2], + "11565861421381730304": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7817691489550523328": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10555835101752189454": ["convolution_gpu_yxfb_yxio_b16",2], + "15858356755924943957": ["convolution_gpu_yxfb_yxio_b16",2], + "16402386400454963239": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "7398158542592530232": ["convolution_gpu_yxfb_yxio_b16",2], + "17092525789052598917": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "7916244303189113815": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "269167598200943915": ["convolution_gpu_yxfb_yxio_b16",2], + "6071668124835539929": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "2629918844315184499": ["convolution_gpu_yxfb_yxio_b16",1], + "15112599407339712681": ["convolution_gpu_bfyx_1x1",2], + "9205978149692979955": ["convolution_gpu_bfyx_gemm_like",2], + "17479773641824222843": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "10868287582480518153": ["convolution_gpu_bfyx_gemm_like",2], + "11738780323979052397": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14616801816838734032": ["convolution_gpu_yxfb_yxio_b16",2], + "9589718307719207394": ["convolution_gpu_yxfb_yxio_b16",2], + "18253784177599134876": ["convolution_gpu_bfyx_os_iyx_osv16",562], + "15249442550355454201": ["convolution_gpu_bfyx_gemm_like",2], + "5367618411887849711": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "3582256192870592087": ["convolution_gpu_bfyx_os_iyx_osv16",1029], + "16253244737884854313": ["convolution_gpu_yxfb_yxio_b16",2], + "13076935351221777993": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "7724125714360985807": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "2955459120402821540": ["convolution_gpu_yxfb_yxio_b16",2], + "4815047491742617397": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "6250785177115691293": ["convolution_gpu_yxfb_yxio_b16",2], + "1202292109713947702": ["convolution_gpu_bfyx_gemm_like",2], + "10930640103080573253": ["convolution_gpu_bfyx_1x1",2], + "14912119584313592912": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "12589440296742583335": ["convolution_gpu_bfyx_1x1",2], + "7106362077449435105": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "18164706399147697716": ["convolution_gpu_yxfb_yxio_b16",1], + "8792010676469476740": ["convolution_gpu_bfyx_gemm_like",2], + "17536591931934691648": ["convolution_gpu_yxfb_yxio_b16",2], + "15112393534380347357": ["convolution_gpu_yxfb_yxio_b16",2], + "14424566003632608852": ["convolution_gpu_bfyx_direct_10_12_16",2], + "252188028702250668": ["convolution_gpu_yxfb_yxio_b16",2], + "5519781859090160931": ["convolution_gpu_bfyx_os_iyx_osv16",760], + "2221145174704245189": ["convolution_gpu_bfyx_gemm_like",2], + "17713666626443142908": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15617599138946168772": ["convolution_gpu_yxfb_yxio_b16",2], + "11319799002723299753": ["convolution_gpu_yxfb_yxio_b16",2], + "4154830034576950123": ["convolution_gpu_yxfb_yxio_b16",2], + "3782239800777370325": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "12344689711325644622": ["convolution_gpu_yxfb_yxio_b16",2], + "9757389422721488173": ["convolution_gpu_bfyx_1x1",1], + "17833304859352483840": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12625112690264223217": ["convolution_gpu_bfyx_gemm_like",2], + "6888842613779488104": ["convolution_gpu_bfyx_1x1",2], + "18080848057281093190": ["convolution_gpu_yxfb_yxio_b16",2], + "10785252006948647963": ["convolution_gpu_yxfb_yxio_b16",2], + "14827882251752394500": 
["convolution_gpu_bfyx_os_iyx_osv16",725], + "6713554643048248003": ["convolution_gpu_yxfb_yxio_b16",2], + "9277610800970567810": ["convolution_gpu_bfyx_gemm_like",2], + "15131258379753113816": ["convolution_gpu_yxfb_yxio_b16",2], + "11658751382892761740": ["convolution_gpu_yxfb_yxio_b16",2], + "1973051991518953158": ["convolution_gpu_yxfb_yxio_b16",2], + "14091543526898531200": ["convolution_gpu_yxfb_yxio_b16",2], + "14887465694301281952": ["convolution_gpu_yxfb_yxio_b16",2], + "9426665763007611385": ["convolution_gpu_bfyx_gemm_like",2], + "3533556385636018581": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "9101903304994333336": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "5291011077679733990": ["convolution_gpu_bfyx_gemm_like",2], + "8975333906619899020": ["convolution_gpu_bfyx_gemm_like",2], + "4282756088824939292": ["convolution_gpu_yxfb_yxio_b16",2], + "15967614281807823696": ["convolution_gpu_bfyx_gemm_like",2], + "5656320098721954644": ["convolution_gpu_yxfb_yxio_b16",2], + "9584652777232392944": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "2571289358202565251": ["convolution_gpu_yxfb_yxio_b16",2], + "3449007266907948591": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "3399406641489305996": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "8890400423799565844": ["convolution_gpu_yxfb_yxio_b16",2], + "13296242326766100583": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12822126914959112382": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "12745552951204330052": ["convolution_gpu_yxfb_yxio_b16",2], + "16851082749395991194": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "745009493367761775": ["convolution_gpu_bfyx_gemm_like",2], + "15924916465272239832": ["convolution_gpu_bfyx_os_iyx_osv16",925], + "4280250278457269231": ["convolution_gpu_yxfb_yxio_b16",2], + "13326339730522937517": ["convolution_gpu_yxfb_yxio_b16",2], + "10090036431487700311": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "14312549767853703411": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "13993548620104010490": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15799159401545270696": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17255805293355120219": ["convolution_gpu_yxfb_yxio_b16",2], + "14389719202147508599": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "9423958333298993923": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "12818012741490629493": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "18075395502550596586": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "11848462434662954749": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "11337525286386930242": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "3172518362830684966": ["convolution_gpu_yxfb_yxio_b16",1], + "14892045745899927762": ["convolution_gpu_yxfb_yxio_b16",2], + "11263540528012919947": ["convolution_gpu_bfyx_1x1",2], + "1875764913306932583": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "18218631037214746168": ["convolution_gpu_bfyx_gemm_like",2], + "1082586642383386489": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "9982350570959875159": ["convolution_gpu_yxfb_yxio_b16",2], + "15528692642731712121": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "15973363403733281926": ["convolution_gpu_yxfb_yxio_b16",2], + "6820134899097582639": ["convolution_gpu_yxfb_yxio_b16",2], + "1129349074674368869": ["convolution_gpu_yxfb_yxio_b16",2], + "7235358742317442134": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16252420150239789472": ["convolution_gpu_yxfb_yxio_b16",2], + "1171681987783013074": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "17041468169694105561": 
["convolution_gpu_yxfb_yxio_b16",2], + "18267428053198215471": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "4995051972576749717": ["convolution_gpu_yxfb_yxio_b16",2], + "2199167704280374654": ["convolution_gpu_yxfb_yxio_b16",2], + "15739278428190392018": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6131481289104111211": ["convolution_gpu_bfyx_gemm_like",2], + "10751633292301177132": ["convolution_gpu_yxfb_yxio_b16",2], + "3049097498155857895": ["convolution_gpu_yxfb_yxio_b16",2], + "11948858355027908365": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9116620473576064051": ["convolution_gpu_yxfb_yxio_b16",2], + "18313088176414428990": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "13902214851539825156": ["convolution_gpu_bfyx_gemm_like",2], + "2907572047024872990": ["convolution_gpu_yxfb_yxio_b16",1], + "5680236635030250712": ["convolution_gpu_bfyx_1x1",2], + "9169324504353459004": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17272600601478967434": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "13308187548669026714": ["convolution_gpu_bfyx_1x1",2], + "5926747396493954633": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "6843617687528352801": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "15949311219856917559": ["convolution_gpu_bfyx_os_iyx_osv16",279], + "848735117501914374": ["convolution_gpu_yxfb_yxio_b16",2], + "11226912053840621089": ["convolution_gpu_yxfb_yxio_b16",2], + "8155268141318893606": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8616584380583931648": ["convolution_gpu_yxfb_yxio_b16",1], + "2394023805427701338": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "10879171754021534649": ["convolution_gpu_yxfb_yxio_b16",2], + "4805194563120934409": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "1235864574444794315": ["convolution_gpu_yxfb_yxio_b16",2], + "7979265448683159733": ["convolution_gpu_yxfb_yxio_b16",2], + "16709930291825881111": ["convolution_gpu_yxfb_yxio_b16",1], + "10406201782146034797": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "4734389463002799056": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "10812324504777808014": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "14385148066232093878": ["convolution_gpu_yxfb_yxio_b16",2], + "8362179886017398479": ["convolution_gpu_bfyx_os_iyx_osv16",8], + "4615766471724791034": ["convolution_gpu_yxfb_yxio_b16",2], + "6511742759171254447": ["convolution_gpu_yxfb_yxio_b16",2], + "9523941899498458600": ["convolution_gpu_yxfb_yxio_b16",2], + "15295951849706930711": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "9026883911202247185": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2939605281692583169": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "4779919236230154165": ["convolution_gpu_bfyx_gemm_like",0], + "1054954263090546905": ["convolution_gpu_yxfb_yxio_b16",1], + "14097394936362526559": ["convolution_gpu_yxfb_yxio_b16",2], + "10558609844937234631": ["convolution_gpu_yxfb_yxio_b16",2], + "10318417166945621015": ["convolution_gpu_yxfb_yxio_b16",2], + "4988480452582288323": ["convolution_gpu_yxfb_yxio_b16",2], + "490931535580183607": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "1354199155380786906": ["convolution_gpu_yxfb_yxio_b16",2], + "13738760763969959522": ["convolution_gpu_bfyx_gemm_like",2], + "11806402239500046867": ["convolution_gpu_bfyx_gemm_like",2], + "10272016038525930672": ["convolution_gpu_bfyx_gemm_like",2], + "18337762134908554532": ["convolution_gpu_yxfb_yxio_b16",2], + "15052577143485630617": ["convolution_gpu_bfyx_1x1",2], + "16490405739040977260": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + 
"10783630257421062891": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "11738360883999461965": ["convolution_gpu_yxfb_yxio_b16",1], + "14098811155652990436": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16911450336605071390": ["convolution_gpu_bfyx_1x1",2], + "13851240591038949807": ["convolution_gpu_bfyx_gemm_like",2], + "17705807503894740726": ["convolution_gpu_bfyx_gemm_like",2], + "7338229552985076723": ["convolution_gpu_bfyx_gemm_like",2], + "10997156099709436375": ["convolution_gpu_yxfb_yxio_b16",2], + "13820498543284008286": ["convolution_gpu_bfyx_gemm_like",2], + "2613462626256090659": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "10720769054729185991": ["convolution_gpu_yxfb_yxio_b16",2], + "14365232561737454031": ["convolution_gpu_bfyx_os_iyx_osv16",51], + "10952045211444638649": ["convolution_gpu_yxfb_yxio_b16",1], + "14885031472057965707": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "3648713169465596196": ["convolution_gpu_yxfb_yxio_b16",2], + "2469579114592379040": ["convolution_gpu_bfyx_gemm_like",2], + "13507437548205340054": ["convolution_gpu_yxfb_yxio_b16",2], + "11010673493295430801": ["convolution_gpu_yxfb_yxio_b16",2], + "3792276488551864121": ["convolution_gpu_yxfb_yxio_b16",2], + "1336739931702966228": ["convolution_gpu_yxfb_yxio_b16",1], + "2008424849669196225": ["convolution_gpu_bfyx_1x1",2], + "8860443174052454332": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6931062623510631425": ["convolution_gpu_yxfb_yxio_b16",2], + "13723434004563378589": ["convolution_gpu_yxfb_yxio_b16",2], + "12917241193304093727": ["convolution_gpu_bfyx_gemm_like",2], + "17618727959983224888": ["convolution_gpu_yxfb_yxio_b16",2], + "2060161076370553192": ["convolution_gpu_yxfb_yxio_b16",2], + "3266557807508325807": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10626341369865893888": ["convolution_gpu_bfyx_gemm_like",2], + "12714892326998505133": ["convolution_gpu_yxfb_yxio_b16",2], + "13040213971461407125": ["convolution_gpu_yxfb_yxio_b16",2], + "5852569526295779497": ["convolution_gpu_yxfb_yxio_b16",2], + "11724732387425614709": ["convolution_gpu_yxfb_yxio_b16",2], + "11892455357792445192": ["convolution_gpu_yxfb_yxio_b16",2], + "4773077837537775324": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "3904383357046705799": ["convolution_gpu_yxfb_yxio_b16",2], + "3286496836813087881": ["convolution_gpu_yxfb_yxio_b16",2], + "5648658688155716974": ["convolution_gpu_bfyx_1x1",2], + "15968821946892330559": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "14497254583210965214": ["convolution_gpu_yxfb_yxio_b16",2], + "72745257233374197": ["convolution_gpu_yxfb_yxio_b16",2], + "17443356777503458523": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "11289650463922092775": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2673903488704336606": ["convolution_gpu_bfyx_gemm_like",2], + "15829095120243431195": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "11606895513516475339": ["convolution_gpu_yxfb_yxio_b16",2], + "13124342334495538095": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "12546446257192651407": ["convolution_gpu_yxfb_yxio_b16",1], + "2856601829807186494": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "13412516623201653283": ["convolution_gpu_yxfb_yxio_b16",2], + "17868834743037242721": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "4805402210873641704": ["convolution_gpu_yxfb_yxio_b16",2], + "6816632607384969096": ["convolution_gpu_yxfb_yxio_b16",1], + "9899211365930959346": ["convolution_gpu_bfyx_os_iyx_osv16",648], + "10612739622648878242": ["convolution_gpu_bfyx_os_iyx_osv16",2], + 
"8689206546467098603": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "17021925795809437171": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5853697372844744672": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "4780291919667721265": ["convolution_gpu_yxfb_yxio_b16",2], + "1251525426317284548": ["convolution_gpu_bfyx_os_iyx_osv16",756], + "10178171262128338408": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14502856487639608696": ["convolution_gpu_bfyx_gemm_like",2], + "4894469114343061704": ["convolution_gpu_yxfb_yxio_b16",0], + "6423785822515265784": ["convolution_gpu_bfyx_gemm_like",2], + "14417401878572618236": ["convolution_gpu_yxfb_yxio_b16",2], + "10914921540144371519": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10724501418439612080": ["convolution_gpu_bfyx_gemm_like",1], + "1075027491444288875": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7100056605355325582": ["convolution_gpu_yxfb_yxio_b16",2], + "6988492019664525206": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "15859493313686060349": ["convolution_gpu_bfyx_gemm_like",2], + "14540578324750869319": ["convolution_gpu_bfyx_gemm_like",2], + "7565006185780806333": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "15192022454507415969": ["convolution_gpu_yxfb_yxio_b16",1], + "474139120607442270": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "15325852281951905610": ["convolution_gpu_bfyx_os_iyx_osv16",801], + "9144487908815767824": ["convolution_gpu_bfyx_1x1",1], + "8519354640245415816": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12669783714916998842": ["convolution_gpu_yxfb_yxio_b16",2], + "13187657215288939912": ["convolution_gpu_yxfb_yxio_b16",2], + "6123707371654753818": ["convolution_gpu_yxfb_yxio_b16",2], + "7343590049199309046": ["convolution_gpu_yxfb_yxio_b16",2], + "5526223938481098693": ["convolution_gpu_yxfb_yxio_b16",2], + "8527193566719173253": ["convolution_gpu_bfyx_gemm_like",2], + "9521715904587435700": ["convolution_gpu_yxfb_yxio_b16",2], + "3058716597925544041": ["convolution_gpu_yxfb_yxio_b16",2], + "14808895254077106198": ["convolution_gpu_bfyx_gemm_like",2], + "2431241169199693527": ["convolution_gpu_yxfb_yxio_b16",1], + "1497127399271219422": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1425953627379976115": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15899192375330393731": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "6118737381591369532": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9184275066167601343": ["convolution_gpu_bfyx_os_iyx_osv16",152], + "10133054058562198093": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "18083803358410976976": ["convolution_gpu_yxfb_yxio_b16",2], + "101401523793806394": ["convolution_gpu_bfyx_gemm_like",2], + "1299760574827253811": ["convolution_gpu_yxfb_yxio_b16",2], + "4126895998426674411": ["convolution_gpu_bfyx_gemm_like",2], + "17917978116807564183": ["convolution_gpu_bfyx_gemm_like",2], + "8421388456873652700": ["convolution_gpu_bfyx_gemm_like",2], + "1584906448442153128": ["convolution_gpu_yxfb_yxio_b16",1], + "3096280563014331836": ["convolution_gpu_yxfb_yxio_b16",2], + "12334522314915706512": ["convolution_gpu_yxfb_yxio_b16",2], + "18067291256808591467": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "3337625924046561031": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "9832505855130134649": ["convolution_gpu_yxfb_yxio_b16",2], + "11334122788337402526": ["convolution_gpu_bfyx_1x1",2], + "4914474312076193952": ["convolution_gpu_bfyx_gemm_like",1], + "2934519615045138808": ["convolution_gpu_bfyx_os_iyx_osv16",532], + 
"1920070013712913772": ["convolution_gpu_bfyx_os_iyx_osv16",574], + "3101087806792514129": ["convolution_gpu_bfyx_1x1",1], + "7015738038963065110": ["convolution_gpu_bfyx_os_iyx_osv16",700], + "6963293142152132518": ["convolution_gpu_bfyx_os_iyx_osv16",165], + "16432425079146486467": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "13809898858049445969": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16195893521207315456": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "6509271384550125629": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "8856888761246057127": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "12137340921829511472": ["convolution_gpu_yxfb_yxio_b16",2], + "1760391741350091665": ["convolution_gpu_bfyx_gemm_like",2], + "10811837819834149164": ["convolution_gpu_bfyx_gemm_like",1], + "11025471731438443683": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "13289438471364352634": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "4610200388191607540": ["convolution_gpu_bfyx_gemm_like",2], + "10882719585803523032": ["convolution_gpu_yxfb_yxio_b16",2], + "592245952014430043": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "1458615259705605525": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "13943983517468412332": ["convolution_gpu_yxfb_yxio_b16",1], + "5479761740065152589": ["convolution_gpu_bfyx_gemm_like",2], + "10133398220120888583": ["convolution_gpu_yxfb_yxio_b16",2], + "13156052826121673994": ["convolution_gpu_bfyx_gemm_like",2], + "10006197783106691106": ["convolution_gpu_bfyx_gemm_like",2], + "4602232889230956461": ["convolution_gpu_yxfb_yxio_b16",2], + "15669242195570440840": ["convolution_gpu_yxfb_yxio_b16",2], + "13365950526881732374": ["convolution_gpu_yxfb_yxio_b16",1], + "14469011068777098822": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "16341722570340169855": ["convolution_gpu_bfyx_1x1",2], + "2421404763191415191": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6101196122606108273": ["convolution_gpu_bfyx_gemm_like",2], + "6715523440337925186": ["convolution_gpu_yxfb_yxio_b16",2], + "3106911159524421371": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "10717031088082350652": ["convolution_gpu_yxfb_yxio_b16",2], + "1587501521145162454": ["convolution_gpu_bfyx_gemm_like",2], + "2016932800158392200": ["convolution_gpu_yxfb_yxio_b16",2], + "13467831091041327178": ["convolution_gpu_yxfb_yxio_b16",1], + "2135164671985938807": ["convolution_gpu_yxfb_yxio_b16",2], + "9711184878666366204": ["convolution_gpu_yxfb_yxio_b16",1], + "968105804060326332": ["convolution_gpu_yxfb_yxio_b16",2], + "579781312141502576": ["convolution_gpu_bfyx_1x1",1], + "17248329632819747646": ["convolution_gpu_yxfb_yxio_b16",1], + "11942019076226205097": ["convolution_gpu_yxfb_yxio_b16",2], + "6902485831441844789": ["convolution_gpu_yxfb_yxio_b16",1], + "4885504197789468842": ["convolution_gpu_yxfb_yxio_b16",1], + "9813748068195103720": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "13464697394408238115": ["convolution_gpu_yxfb_yxio_b16",2], + "10789133352712755945": ["convolution_gpu_yxfb_yxio_b16",2], + "12566041126392848976": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "7065244994574625911": ["convolution_gpu_yxfb_yxio_b16",2], + "10706267011822108376": ["convolution_gpu_bfyx_1x1",2], + "10071449674652717890": ["convolution_gpu_bfyx_gemm_like",2], + "15225354446874994535": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "16129682385980878760": ["convolution_gpu_yxfb_yxio_b16",2], + "17043601935017365442": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2905979727479716212": ["convolution_gpu_yxfb_yxio_b16",2], + 
"4391695940614024479": ["convolution_gpu_yxfb_yxio_b16",2], + "16393176054374397767": ["convolution_gpu_bfyx_gemm_like",1], + "13132804928635689780": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "17329287216741045059": ["convolution_gpu_bfyx_gemm_like",2], + "17370158297470557151": ["convolution_gpu_bfyx_1x1",2], + "708201295462256406": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "9827177798112814604": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "15979956159651515122": ["convolution_gpu_bfyx_gemm_like",2], + "5050273611519516510": ["convolution_gpu_bfyx_gemm_like",1], + "9468684953949274635": ["convolution_gpu_bfyx_gemm_like",1], + "17672785701483179117": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "4834446692898125871": ["convolution_gpu_bfyx_gemm_like",2], + "7628077869220463202": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "11417406326478154077": ["convolution_gpu_yxfb_yxio_b16",2], + "12985942652866621579": ["fully_connected_gpu_fb_io_ref",2], + "14387756025635589673": ["convolution_gpu_bfyx_1x1",2], + "5638640164891118162": ["convolution_gpu_yxfb_yxio_b16",2], + "4974320417566990034": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "603883331897298932": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "16633540487930201533": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "3872151366780051246": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17515847111676784130": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "6613116267521819997": ["convolution_gpu_yxfb_yxio_b16",2], + "3377052601059116318": ["convolution_gpu_yxfb_yxio_b16",0], + "13509275050322423832": ["convolution_gpu_yxfb_yxio_b16",2], + "17046662043776372746": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "13119040261291835298": ["convolution_gpu_bfyx_gemm_like",2], + "16245760498096322525": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "6799631962511042762": ["convolution_gpu_yxfb_yxio_b16",2], + "11499219760597131534": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "6959692641873234850": ["convolution_gpu_yxfb_yxio_b16",2], + "18152894191323920027": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "4356817283284529593": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6512987867462549101": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "16371608027363202992": ["convolution_gpu_yxfb_yxio_b16",2], + "6210866413385292851": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "465567788283624320": ["convolution_gpu_yxfb_yxio_b16",2], + "6756679359093569015": ["convolution_gpu_bfyx_os_iyx_osv16",905], + "5740745357953479527": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "17676344219475515993": ["convolution_gpu_yxfb_yxio_b16",2], + "12003323477818208825": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "5346898505346646714": ["convolution_gpu_bfyx_os_iyx_osv16",483], + "3515437649977762166": ["convolution_gpu_bfyx_gemm_like",1], + "6945787904293959477": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "18243724217479803107": ["convolution_gpu_yxfb_yxio_b16",2], + "11988285441493553006": ["convolution_gpu_bfyx_gemm_like",2], + "85050336704401597": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16491532291908469567": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "2227700097134029783": ["convolution_gpu_yxfb_yxio_b16",1], + "13450061819089402572": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10309986238001994183": ["convolution_gpu_yxfb_yxio_b16",2], + "12531880391016521628": ["convolution_gpu_bfyx_gemm_like",2], + "15115780248032030963": ["convolution_gpu_yxfb_yxio_b16",2], + "4298242568890525997": ["convolution_gpu_yxfb_yxio_b16",2], + 
"15959543980008442942": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "16870036853278751563": ["convolution_gpu_yxfb_yxio_b16",2], + "13161997040644039778": ["convolution_gpu_bfyx_gemm_like",2], + "15833461718320604065": ["convolution_gpu_bfyx_os_iyx_osv16",889], + "7669403041163460089": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8170998059688907013": ["convolution_gpu_bfyx_1x1",2], + "15482685355538566951": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "30229601562833524": ["convolution_gpu_bfyx_gemm_like",2], + "2294318010381635693": ["convolution_gpu_bfyx_gemm_like",2], + "13486084204140096478": ["convolution_gpu_bfyx_gemm_like",2], + "6558436237075337721": ["convolution_gpu_yxfb_yxio_b16",2], + "2089730611490367290": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "1580344438642032807": ["convolution_gpu_bfyx_gemm_like",2], + "5578850952665051661": ["convolution_gpu_yxfb_yxio_b16",1], + "3217574161785059951": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "5840254078917931433": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "13493119419114659706": ["convolution_gpu_yxfb_yxio_b16",2], + "16425374300157280628": ["convolution_gpu_yxfb_yxio_b16",1], + "7469127846325904854": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "11698754846673268046": ["convolution_gpu_yxfb_yxio_b16",2], + "12121204870979363096": ["convolution_gpu_yxfb_yxio_b16",2], + "2438261005924916746": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "12054200116003751590": ["convolution_gpu_bfyx_os_iyx_osv16",483], + "8479958930889587809": ["fully_connected_gpu_yxfb_ref",1], + "7843498978148810586": ["convolution_gpu_bfyx_os_iyx_osv16",987], + "1448440012428740463": ["convolution_gpu_yxfb_yxio_b16",1], + "8976238022515713641": ["convolution_gpu_bfyx_gemm_like",2], + "8642107585829380438": ["convolution_gpu_bfyx_gemm_like",1], + "10681768474583067517": ["convolution_gpu_bfyx_gemm_like",1], + "7242013296950669829": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "359617184733439511": ["convolution_gpu_yxfb_yxio_b16",2], + "9366100787108468082": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5060012838564094182": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "2269140636553245446": ["convolution_gpu_yxfb_yxio_b16",2], + "1644335606100150388": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "9870432551513415176": ["convolution_gpu_yxfb_yxio_b16",2], + "1984152634309440563": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "17489680436564779197": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "14117801387057507639": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "361497145093734608": ["convolution_gpu_bfyx_gemm_like",2], + "13861223834466385546": ["convolution_gpu_bfyx_gemm_like",1], + "10014448860206587805": ["convolution_gpu_bfyx_gemm_like",1], + "11690334177981352452": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "9941035405796680081": ["convolution_gpu_bfyx_1x1",1], + "12051398350382954787": ["convolution_gpu_yxfb_yxio_b16",0], + "14242202444788213591": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "8999570321113443117": ["convolution_gpu_yxfb_yxio_b16",2], + "838726445796308454": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "17983556812075120553": ["convolution_gpu_bfyx_1x1",2], + "14331658870024759698": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "9518071423184197213": ["convolution_gpu_bfyx_gemm_like",2], + "8004244584949995244": ["convolution_gpu_yxfb_yxio_b16",2], + "9891428775774615719": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "9702618600245321109": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "11031569203645035546": 
["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17480277135590489472": ["convolution_gpu_yxfb_yxio_b16",2], + "167635075964111628": ["convolution_gpu_yxfb_yxio_b16",2], + "1463649546800120847": ["convolution_gpu_yxfb_yxio_b16",2], + "15817443774186015593": ["convolution_gpu_bfyx_1x1",2], + "225809055928705881": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13676654389512816868": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "7998455776901877973": ["convolution_gpu_yxfb_yxio_b16",2], + "4804533178560338520": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "11596971301790598405": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "3216793152416217495": ["convolution_gpu_bfyx_gemm_like",2], + "3806761527342944195": ["convolution_gpu_bfyx_gemm_like",2], + "5312413491828906254": ["convolution_gpu_yxfb_yxio_b16",2], + "12600707101000510621": ["convolution_gpu_yxfb_yxio_b16",2], + "7346046748383284270": ["convolution_gpu_yxfb_yxio_b16",2], + "7804715870037416579": ["convolution_gpu_bfyx_gemm_like",1], + "18433141005552346566": ["convolution_gpu_yxfb_yxio_b16",2], + "17893181511546734799": ["convolution_gpu_yxfb_yxio_b16",2], + "10771803503544737080": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "1843555260471832708": ["convolution_gpu_bfyx_gemm_like",1], + "7647236080048602591": ["convolution_gpu_bfyx_gemm_like",1], + "8398910340371320955": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6910589963488897537": ["convolution_gpu_yxfb_yxio_b16",2], + "11175936010605958812": ["convolution_gpu_yxfb_yxio_b16",1], + "568191462231494113": ["convolution_gpu_yxfb_yxio_b16",2], + "946479876892100082": ["convolution_gpu_bfyx_gemm_like",1], + "3861351835305151926": ["convolution_gpu_yxfb_yxio_b16",2], + "9641089659148164809": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4803370483104261655": ["convolution_gpu_bfyx_gemm_like",2], + "7779562434199107586": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "18385086614524985975": ["convolution_gpu_yxfb_yxio_b16",2], + "4731836216299455047": ["convolution_gpu_yxfb_yxio_b16",2], + "9222744127882324405": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "13878967140838761911": ["convolution_gpu_bfyx_1x1",1], + "7393601059996816014": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11888011890096886932": ["convolution_gpu_yxfb_yxio_b16",2], + "14985755375924972050": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "17961793197503317952": ["convolution_gpu_yxfb_yxio_b16",2], + "5720964268093705079": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3308770992373192529": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "877901260688090160": ["convolution_gpu_yxfb_yxio_b16",2], + "1310498917952637709": ["convolution_gpu_yxfb_yxio_b16",2], + "6871131333562410117": ["convolution_gpu_yxfb_yxio_b16",2], + "7799984350284425885": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "10170577772376890221": ["convolution_gpu_bfyx_os_iyx_osv16",664], + "9175450649281374948": ["convolution_gpu_bfyx_os_iyx_osv16",862], + "7139719632093090046": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "14553577436929219470": ["convolution_gpu_yxfb_yxio_b16",2], + "16814025114202322376": ["convolution_gpu_yxfb_yxio_b16",1], + "11880337915508207160": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "4056979460327024961": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "6467251764899975676": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "16995444341569389342": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "15235409162483701027": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "14206076551739831333": 
["convolution_gpu_bfyx_direct_10_12_16",1], + "7393551951402219833": ["convolution_gpu_yxfb_yxio_b16",2], + "17640725195881101275": ["convolution_gpu_bfyx_gemm_like",2], + "70580716590540876": ["convolution_gpu_bfyx_gemm_like",1], + "1596353239542510685": ["convolution_gpu_bfyx_gemm_like",2], + "2578325663193624576": ["convolution_gpu_yxfb_yxio_b16",2], + "13426254939418471242": ["convolution_gpu_yxfb_yxio_b16",2], + "15337841577110104431": ["convolution_gpu_yxfb_yxio_b16",1], + "14868677663932902695": ["convolution_gpu_bfyx_gemm_like",2], + "5083163738120585821": ["fully_connected_gpu_fb_oi_ref",2], + "14766694310604777253": ["convolution_gpu_yxfb_yxio_b16",1], + "15135644084742750702": ["convolution_gpu_bfyx_gemm_like",2], + "12787837386653002743": ["convolution_gpu_yxfb_yxio_b16",2], + "6167369758442930886": ["convolution_gpu_bfyx_gemm_like",2], + "1018687388655376483": ["convolution_gpu_bfyx_gemm_like",1], + "14335074487552883436": ["convolution_gpu_bfyx_gemm_like",2], + "12071914115316550349": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "5118467701668427545": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "2026622899016787854": ["convolution_gpu_yxfb_yxio_b16",1], + "13338594271376045657": ["convolution_gpu_bfyx_gemm_like",2], + "5495063314176654751": ["convolution_gpu_bfyx_gemm_like",2], + "16374675547140209181": ["convolution_gpu_yxfb_yxio_b16",2], + "888110783182849535": ["convolution_gpu_yxfb_yxio_b16",2], + "16683485007140805060": ["fully_connected_gpu_yxfb_ref",2], + "11820789223587555410": ["convolution_gpu_bfyx_1x1",2], + "8260024340787818709": ["convolution_gpu_yxfb_yxio_b16",2], + "10480527638577674825": ["convolution_gpu_bfyx_1x1",2], + "5352061583962489055": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9162469583721135043": ["convolution_gpu_yxfb_yxio_b16",2], + "4685236901551256966": ["convolution_gpu_yxfb_yxio_b16",1], + "14054116974002669018": ["convolution_gpu_bfyx_1x1",2], + "2296581485980163665": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "1635121016109328853": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "14025235562200209723": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "8873614802459592665": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "15670767419106537809": ["convolution_gpu_yxfb_yxio_b16",2], + "4848143712599565301": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3102538312627892960": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "13839116996827687373": ["convolution_gpu_bfyx_gemm_like",2], + "16487774205195979355": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "16768797136991242472": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "18408107772851888061": ["convolution_gpu_bfyx_gemm_like",2], + "11179211757115972103": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3980835859526174461": ["convolution_gpu_yxfb_yxio_b16",2], + "15525903155475629518": ["convolution_gpu_bfyx_gemm_like",2], + "14175962333785791005": ["convolution_gpu_yxfb_yxio_b16",2], + "17147293671640396193": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "290134020607738418": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13078401519973360182": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4672441137336208890": ["convolution_gpu_bfyx_gemm_like",2], + "6084775920382972735": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "15047676717402283805": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "9534041402131086717": ["convolution_gpu_bfyx_os_iyx_osv16",949], + "17638753020411096694": ["convolution_gpu_yxfb_yxio_b16",2], + "18432421400879260832": 
["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "16304192736281226143": ["convolution_gpu_yxfb_yxio_b16",2], + "7305582749708309904": ["convolution_gpu_yxfb_yxio_b16",2], + "3622409603053918029": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13207134083675064956": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "8931169575495985034": ["convolution_gpu_bfyx_gemm_like",2], + "17790026124881397912": ["fully_connected_gpu_fb_io_ref",2], + "16247399911710810038": ["convolution_gpu_bfyx_gemm_like",1], + "16871004845988227014": ["convolution_gpu_bfyx_1x1",2], + "12850044341631872743": ["convolution_gpu_yxfb_yxio_b16",2], + "4492332228252010118": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "11086471945045031067": ["convolution_gpu_yxfb_yxio_b16",2], + "13477548641580029772": ["convolution_gpu_bfyx_gemm_like",1], + "15805087418686802636": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "5507373575763339429": ["convolution_gpu_yxfb_yxio_b16",2], + "13328911884191551889": ["convolution_gpu_bfyx_1x1",2], + "5104519293341299859": ["convolution_gpu_yxfb_yxio_b16",2], + "249639220178603842": ["convolution_gpu_bfyx_gemm_like",2], + "2715447739580688669": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "14942858162799632403": ["convolution_gpu_yxfb_yxio_b16",2], + "5576296603250158603": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "712165731154577189": ["convolution_gpu_bfyx_os_iyx_osv16",224], + "4533786844080178561": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "1045854873741563331": ["convolution_gpu_bfyx_gemm_like",2], + "2683304757433993300": ["convolution_gpu_bfyx_gemm_like",2], + "16128152634974034731": ["convolution_gpu_yxfb_yxio_b16",2], + "15677717057398875599": ["convolution_gpu_bfyx_gemm_like",1], + "851057218719456209": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "7454366978268164047": ["convolution_gpu_bfyx_gemm_like",2], + "743941460026466526": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "4133424990380177132": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "10544034939133448916": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "3349468433721705582": ["convolution_gpu_yxfb_yxio_b16",1], + "15863531785836309247": ["convolution_gpu_yxfb_yxio_b16",2], + "11421180829679625737": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "15367649112776077240": ["convolution_gpu_yxfb_yxio_b16",2], + "9492026326463873766": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "16925721317097534009": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "3319827933068341610": ["convolution_gpu_yxfb_yxio_b16",2], + "12268912077694742671": ["convolution_gpu_yxfb_yxio_b16",2], + "15839295895890205274": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "9073757008455674094": ["convolution_gpu_yxfb_yxio_b16",2], + "8780671766122887951": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "15625374380046476173": ["convolution_gpu_yxfb_yxio_b16",2], + "7105219760750474587": ["convolution_gpu_yxfb_yxio_b16",2], + "7818381040882768404": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "16961326251624610778": ["convolution_gpu_yxfb_yxio_b16",2], + "10076885835791159907": ["convolution_gpu_yxfb_yxio_b16",2], + "13485300684443803732": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "7715649642603303319": ["convolution_gpu_bfyx_1x1",2], + "1353170363915443814": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13541382855330226000": ["convolution_gpu_yxfb_yxio_b16",2], + "11015074526119891710": ["convolution_gpu_yxfb_yxio_b16",2], + "10730222715353420212": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "1218323229202187514": ["convolution_gpu_bfyx_gemm_like",2], + 
"7897973318803646560": ["convolution_gpu_yxfb_yxio_b16",2], + "10879218241103462088": ["convolution_gpu_bfyx_gemm_like",2], + "4381329435655511217": ["convolution_gpu_bfyx_os_iyx_osv16",729], + "11871319147579477936": ["convolution_gpu_yxfb_yxio_b16",2], + "16601230690171340432": ["convolution_gpu_yxfb_yxio_b16",2], + "17207560805775399864": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9737565171095493297": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12052207771201936228": ["convolution_gpu_bfyx_gemm_like",2], + "17466963970980708210": ["convolution_gpu_yxfb_yxio_b16",2], + "6128157319666849074": ["convolution_gpu_yxfb_yxio_b16",2], + "12514693341682532560": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "1197281505560782577": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "4628748977913534701": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "13264617841270329349": ["convolution_gpu_bfyx_1x1",2], + "6550549654706796887": ["convolution_gpu_yxfb_yxio_b16",0], + "13120262386070281193": ["convolution_gpu_yxfb_yxio_b16",1], + "13368203360773949292": ["convolution_gpu_yxfb_yxio_b16",2], + "8203171222962341018": ["convolution_gpu_bfyx_gemm_like",2], + "3615052707933370958": ["convolution_gpu_yxfb_yxio_b16",1], + "775538461106687677": ["fully_connected_gpu_fb_oi_ref",1], + "2554991397391195611": ["convolution_gpu_bfyx_os_iyx_osv16",184], + "12727541507197887360": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "7206226541369793931": ["convolution_gpu_yxfb_yxio_b16",2], + "17515064188391421150": ["convolution_gpu_bfyx_gemm_like",2], + "14122213471825630433": ["convolution_gpu_bfyx_gemm_like",2], + "2949545414911764346": ["convolution_gpu_yxfb_yxio_b16",2], + "1173136780324694038": ["convolution_gpu_yxfb_yxio_b16",2], + "12512751736409465214": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9079676771143357396": ["convolution_gpu_yxfb_yxio_b16",1], + "15474155528481683394": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "5339985303398206057": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "9649445293567537596": ["convolution_gpu_yxfb_yxio_b16",2], + "14248239982355212178": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "4865102850562917067": ["convolution_gpu_bfyx_os_iyx_osv16",855], + "3603187029740446600": ["convolution_gpu_bfyx_gemm_like",2], + "1418595171949196661": ["convolution_gpu_bfyx_gemm_like",2], + "10037086825900566930": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "6578517057140155080": ["convolution_gpu_yxfb_yxio_b16",2], + "16674633029045714564": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "15228390729175722409": ["convolution_gpu_yxfb_yxio_b16",2], + "1245259979364728404": ["convolution_gpu_bfyx_1x1",2], + "101387140804297623": ["convolution_gpu_yxfb_yxio_b16",2], + "5714365398623475983": ["convolution_gpu_bfyx_1x1",2], + "13234055353608734080": ["convolution_gpu_yxfb_yxio_b16",1], + "136349424199140459": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "9182260316973872633": ["convolution_gpu_yxfb_yxio_b16",2], + "11305232900158601613": ["convolution_gpu_bfyx_1x1",2], + "17651477639302255490": ["convolution_gpu_yxfb_yxio_b16",2], + "8768300687476117215": ["convolution_gpu_bfyx_os_iyx_osv16",266], + "6651389480007764007": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "7585184325339753737": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "7700321970687976931": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "2321767794934000238": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "14461365896122393071": ["convolution_gpu_yxfb_yxio_b16",2], + "8922929126299811091": 
["convolution_gpu_bfyx_1x1",2], + "4772696293208603817": ["convolution_gpu_bfyx_gemm_like",1], + "13596876807637507229": ["convolution_gpu_bfyx_1x1",2], + "3039528482572243879": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "6713985030102340818": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2208765794404376467": ["convolution_gpu_yxfb_yxio_b16",2], + "1345101751956733589": ["convolution_gpu_bfyx_gemm_like",2], + "3070859615622845671": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "12185561188335760786": ["convolution_gpu_yxfb_yxio_b16",2], + "14113320831418478396": ["convolution_gpu_yxfb_yxio_b16",2], + "840202264034382558": ["convolution_gpu_bfyx_os_iyx_osv16",771], + "4098191685457418125": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "12637509262827320678": ["convolution_gpu_yxfb_yxio_b16",1], + "12207503176295152756": ["convolution_gpu_bfyx_1x1",2], + "5897564616927353003": ["convolution_gpu_bfyx_os_iyx_osv16",717], + "15006204461468698734": ["convolution_gpu_yxfb_yxio_b16",2], + "9671459469252116568": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "13398986810666238552": ["convolution_gpu_yxfb_yxio_b16",2], + "11731277083374465361": ["convolution_gpu_yxfb_yxio_b16",2], + "6020017927557041768": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11239541755868028928": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "10880830033700542216": ["convolution_gpu_yxfb_yxio_b16",1], + "6934241437968723825": ["convolution_gpu_yxfb_yxio_b16",1], + "4740585760177040164": ["convolution_gpu_yxfb_yxio_b16",1], + "9423854233835016530": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "7009459929666511861": ["convolution_gpu_bfyx_1x1",1], + "5602377914578322577": ["convolution_gpu_yxfb_yxio_b16",2], + "5056859994174498686": ["convolution_gpu_bfyx_gemm_like",1], + "2561508262445368003": ["convolution_gpu_yxfb_yxio_b16",2], + "17935612508319394087": ["convolution_gpu_yxfb_yxio_b16",2], + "6126073246053235472": ["convolution_gpu_yxfb_yxio_b16",2], + "1885075753696445410": ["convolution_gpu_bfyx_direct_10_12_16",0], + "786401653335542559": ["convolution_gpu_bfyx_gemm_like",2], + "123026136670202868": ["convolution_gpu_yxfb_yxio_b16",2], + "4999171487916568471": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "1718634913016284523": ["convolution_gpu_bfyx_1x1",2], + "14204609663091442879": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10702465758376061967": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12181310683533105454": ["fully_connected_gpu_fb_oi_ref",1], + "15765592038173567297": ["convolution_gpu_yxfb_yxio_b16",2], + "9748307611165615848": ["convolution_gpu_bfyx_gemm_like",2], + "15497263259976427714": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "6817494598328071314": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "12234313962656804631": ["convolution_gpu_bfyx_gemm_like",2], + "6964383468476265892": ["convolution_gpu_bfyx_1x1",1], + "155962454315573087": ["convolution_gpu_yxfb_yxio_b16",2], + "6193161166790398003": ["convolution_gpu_bfyx_gemm_like",2], + "15681189418847392587": ["convolution_gpu_bfyx_os_iyx_osv16",857], + "9989055862610193828": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "3779229442395464456": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15678768217453692725": ["convolution_gpu_yxfb_yxio_b16",1], + "6950586691727980329": ["convolution_gpu_yxfb_yxio_b16",1], + "1952863937205473292": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "3365786526859737112": ["convolution_gpu_yxfb_yxio_b16",1], + "6022695488769618639": ["convolution_gpu_yxfb_yxio_b16",2], + "11612044653200304877": 
["convolution_gpu_yxfb_yxio_b16",2], + "12960590161485806657": ["convolution_gpu_bfyx_gemm_like",2], + "8747430148550634190": ["convolution_gpu_bfyx_gemm_like",2], + "15223779293313750042": ["convolution_gpu_yxfb_yxio_b16",2], + "14749758365915995876": ["convolution_gpu_yxfb_yxio_b16",2], + "11973034261101454380": ["convolution_gpu_yxfb_yxio_b16",2], + "8205640825965213946": ["convolution_gpu_yxfb_yxio_b16",1], + "10774872391768741315": ["convolution_gpu_yxfb_yxio_b16",2], + "11564071490267241224": ["convolution_gpu_yxfb_yxio_b16",2], + "12604104383683210104": ["convolution_gpu_bfyx_os_iyx_osv16",216], + "9700808806849459216": ["convolution_gpu_bfyx_1x1",2], + "3658599312236344017": ["convolution_gpu_yxfb_yxio_b16",2], + "11198301748997371475": ["convolution_gpu_bfyx_gemm_like",1], + "8974851555526896131": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "9500850790449116723": ["convolution_gpu_bfyx_os_iyx_osv16",1036], + "16781127329510211966": ["convolution_gpu_bfyx_gemm_like",2], + "12352923639732112511": ["convolution_gpu_bfyx_os_iyx_osv16",455], + "9354818521586974021": ["convolution_gpu_bfyx_gemm_like",2], + "3976736548270395981": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5576305720733717044": ["convolution_gpu_yxfb_yxio_b16",1], + "9580986168276580598": ["convolution_gpu_bfyx_gemm_like",1], + "8584375748627260395": ["convolution_gpu_yxfb_yxio_b16",2], + "4635570915184713874": ["convolution_gpu_bfyx_gemm_like",2], + "11627532066884923848": ["convolution_gpu_bfyx_1x1",2], + "18040183500393090505": ["convolution_gpu_yxfb_yxio_b16",1], + "4291531885506213180": ["convolution_gpu_yxfb_yxio_b16",2], + "875400109066360897": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "12476381811279163147": ["convolution_gpu_yxfb_yxio_b16",2], + "12972798847556569913": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "2936333406928424760": ["convolution_gpu_bfyx_1x1",2], + "6081038474197004540": ["convolution_gpu_yxfb_yxio_b16",1], + "577842450575835175": ["convolution_gpu_yxfb_yxio_b16",2], + "401304652492444430": ["convolution_gpu_bfyx_gemm_like",2], + "13009612703754510124": ["convolution_gpu_yxfb_yxio_b16",2], + "5020763861388859254": ["convolution_gpu_bfyx_gemm_like",2], + "10280619408766255552": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "16681690088928624738": ["convolution_gpu_bfyx_gemm_like",2], + "2173163618947713953": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "12477315042623518609": ["convolution_gpu_yxfb_yxio_b16",2], + "3067930325929862490": ["convolution_gpu_yxfb_yxio_b16",2], + "17759505449240263390": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "13646974121952099172": ["convolution_gpu_bfyx_gemm_like",2], + "7208008921815475393": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5012013738970489338": ["convolution_gpu_bfyx_1x1",1], + "8735534480653818425": ["convolution_gpu_yxfb_yxio_b16",2], + "4627958043707973483": ["convolution_gpu_yxfb_yxio_b16",1], + "10486348549691280032": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "6788311046557489996": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "16610284927818475574": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "13618411266808159341": ["convolution_gpu_yxfb_yxio_b16",1], + "14412158605670555579": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "2188101366183302888": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12923298574715329852": ["convolution_gpu_yxfb_yxio_b16",2], + "1531349457115735845": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "12181889163404078773": ["convolution_gpu_bfyx_os_iyx_osv16",1042], + "8576733135863336233": 
["convolution_gpu_bfyx_os_iyx_osv16",741], + "1367483816197881270": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5751553671208192963": ["convolution_gpu_yxfb_yxio_b16",2], + "13781423818051299677": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "2882493407831196579": ["fully_connected_gpu_fb_io_block_fp16",2], + "7232326270078161768": ["convolution_gpu_bfyx_gemm_like",2], + "17778091287904736965": ["convolution_gpu_bfyx_gemm_like",2], + "4113061482402915179": ["convolution_gpu_yxfb_yxio_b16",2], + "8732952254407298868": ["convolution_gpu_bfyx_gemm_like",0], + "5564881878876582769": ["convolution_gpu_yxfb_yxio_b16",2], + "3217674729821898463": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "14757749560543979231": ["convolution_gpu_bfyx_os_iyx_osv16",273], + "15720012960520885263": ["convolution_gpu_yxfb_yxio_b16",1], + "3286250915720444467": ["convolution_gpu_yxfb_yxio_b16",2], + "1126499865206906037": ["convolution_gpu_bfyx_os_iyx_osv16",524], + "10292585962794261197": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "5124080536266387783": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4633923265089466898": ["convolution_gpu_bfyx_os_iyx_osv16",137], + "9090828337597312855": ["convolution_gpu_bfyx_gemm_like",2], + "11270855425262923989": ["convolution_gpu_yxfb_yxio_b16",2], + "8761283252495354972": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "18187345248160481425": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "8270591002934311024": ["convolution_gpu_bfyx_1x1",2], + "9152433123828445089": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4880150897829846031": ["convolution_gpu_bfyx_1x1",1], + "13531892014108749846": ["convolution_gpu_yxfb_yxio_b16",2], + "3622778166646258015": ["convolution_gpu_yxfb_yxio_b16",1], + "12745631396795162505": ["convolution_gpu_yxfb_yxio_b16",2], + "6948455759869670955": ["convolution_gpu_yxfb_yxio_b16",2], + "12721294268595880422": ["convolution_gpu_yxfb_yxio_b16",2], + "17439102502195540957": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "13814086981499638596": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "1198893312653197535": ["convolution_gpu_yxfb_yxio_b16",2], + "18008552719153887303": ["convolution_gpu_bfyx_os_iyx_osv16",7], + "7121708962074176240": ["convolution_gpu_bfyx_1x1",2], + "597073780328219388": ["convolution_gpu_bfyx_gemm_like",2], + "636447309806530300": ["convolution_gpu_yxfb_yxio_b16",2], + "3202085450628781999": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "16768497046700403748": ["convolution_gpu_yxfb_yxio_b16",2], + "15167962750603978874": ["convolution_gpu_yxfb_yxio_b16",2], + "6267138247577676996": ["convolution_gpu_yxfb_yxio_b16",2], + "7895030495055232460": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "17342198739672369885": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2576773809294607971": ["convolution_gpu_yxfb_yxio_b16",2], + "4124478505694604763": ["convolution_gpu_bfyx_1x1",2], + "3962138884698789654": ["convolution_gpu_yxfb_yxio_b16",2], + "10547134120307382906": ["convolution_gpu_yxfb_yxio_b16",2], + "7223801044761006523": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "377219085802486361": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9435086287598656868": ["convolution_gpu_yxfb_yxio_b16",2], + "16000753982895054944": ["convolution_gpu_bfyx_gemm_like",1], + "10706180189726741161": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16711142379173254655": ["convolution_gpu_yxfb_yxio_b16",2], + "3171354702636014224": ["convolution_gpu_yxfb_yxio_b16",2], + "11007175027950132719": ["convolution_gpu_bfyx_os_iyx_osv16",2], + 
"11194372303922533529": ["convolution_gpu_yxfb_yxio_b16",2], + "13705072264927031658": ["convolution_gpu_yxfb_yxio_b16",2], + "1170380397764345558": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "9576962489937466093": ["convolution_gpu_yxfb_yxio_b16",2], + "4445913285957791409": ["convolution_gpu_yxfb_yxio_b16",1], + "12026482841341343242": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "17734480671864478402": ["convolution_gpu_yxfb_yxio_b16",2], + "5112480593385320005": ["convolution_gpu_yxfb_yxio_b16",2], + "6603778920476932267": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6253009218981124949": ["convolution_gpu_yxfb_yxio_b16",2], + "16531824466148265247": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "4850497746076450913": ["convolution_gpu_bfyx_gemm_like",2], + "14971270053929063630": ["convolution_gpu_yxfb_yxio_b16",2], + "6863331059471727622": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "9305861997313663528": ["convolution_gpu_bfyx_gemm_like",2], + "14616969385577243225": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "522313477023837056": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "12256193738921380409": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "9096495972770198040": ["convolution_gpu_yxfb_yxio_b16",2], + "3056212889689424946": ["convolution_gpu_bfyx_1x1",2], + "426827405952656362": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "693883892843558363": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "5440983284868981549": ["convolution_gpu_bfyx_gemm_like",2], + "12871555773123368130": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11088324811742486481": ["convolution_gpu_bfyx_gemm_like",2], + "4398371999113956082": ["convolution_gpu_bfyx_gemm_like",2], + "14774814395786139876": ["convolution_gpu_yxfb_yxio_b16",2], + "12947341728489226671": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16242136888057221574": ["convolution_gpu_yxfb_yxio_b16",2], + "13777174566683935109": ["convolution_gpu_yxfb_yxio_b16",2], + "5337351591182109481": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "6249875772709398338": ["convolution_gpu_yxfb_yxio_b16",2], + "913861052717410566": ["convolution_gpu_yxfb_yxio_b16",2], + "1114679698826953542": ["convolution_gpu_yxfb_yxio_b16",1], + "14916625550370402883": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "5516518048239364231": ["convolution_gpu_bfyx_os_iyx_osv16",479], + "2581414750854621875": ["convolution_gpu_bfyx_os_iyx_osv16",559], + "12327057172281102984": ["convolution_gpu_yxfb_yxio_b16",2], + "576164857039495839": ["convolution_gpu_yxfb_yxio_b16",2], + "18184621367843960190": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "6341363789473021047": ["convolution_gpu_yxfb_yxio_b16",2], + "1325669650629605592": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "5106072383853469966": ["convolution_gpu_yxfb_yxio_b16",1], + "7800262579057534804": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "10935309102034762723": ["convolution_gpu_bfyx_1x1",1], + "9453100135791813000": ["convolution_gpu_yxfb_yxio_b16",2], + "18299254635579957284": ["convolution_gpu_bfyx_1x1",2], + "3856976081672275637": ["convolution_gpu_bfyx_os_iyx_osv16",279], + "6288489890578212082": ["convolution_gpu_bfyx_gemm_like",2], + "12065769091972094756": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "4937688558707451907": ["convolution_gpu_yxfb_yxio_b16",2], + "6317575981520135028": ["convolution_gpu_bfyx_gemm_like",1], + "15531306520021286502": ["convolution_gpu_bfyx_gemm_like",2], + "6580334406272192111": ["fully_connected_gpu_fb_io_ref",0], + "13809330759308309353": 
["convolution_gpu_bfyx_os_iyx_osv16",980], + "13919204232414535363": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "7683334381958571864": ["convolution_gpu_bfyx_gemm_like",2], + "3017824560305532066": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "4980217316169616839": ["convolution_gpu_bfyx_1x1",2], + "226601879759378771": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "2282123636764935353": ["convolution_gpu_yxfb_yxio_b16",2], + "13978649386370395620": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "4279062247055842367": ["convolution_gpu_bfyx_gemm_like",1], + "14553856088069405595": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "5942742563827424666": ["convolution_gpu_yxfb_yxio_b16",2], + "671453551040072499": ["convolution_gpu_bfyx_gemm_like",2], + "2737064424879246276": ["convolution_gpu_bfyx_gemm_like",2], + "18087356517015630281": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "3336076058264596420": ["convolution_gpu_bfyx_gemm_like",2], + "17791024851737594885": ["convolution_gpu_bfyx_1x1",2], + "17252589865292797082": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "11727227430687227444": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12370729327673204804": ["convolution_gpu_bfyx_gemm_like",2], + "13251091004269229867": ["convolution_gpu_bfyx_gemm_like",2], + "2817919813339364130": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17209528805596238905": ["convolution_gpu_bfyx_gemm_like",2], + "3538679039078582272": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "5242271874488296527": ["convolution_gpu_bfyx_gemm_like",2], + "12768933181342249823": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "2173720698351153121": ["convolution_gpu_bfyx_gemm_like",2], + "1120455113299469776": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "11109044986816563101": ["convolution_gpu_yxfb_yxio_b16",2], + "17682152011630274259": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "15322609677356616580": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "11756650366229979428": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "18214716801063702171": ["convolution_gpu_yxfb_yxio_b16",2], + "10722782762733112118": ["convolution_gpu_bfyx_1x1",2], + "15636128989267984459": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13409744191227471760": ["convolution_gpu_bfyx_gemm_like",1], + "8236018377815149638": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "57372993988016244": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "13912728810446567016": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "14324166291904435508": ["convolution_gpu_yxfb_yxio_b16",2], + "15193403354218116460": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15418732002117930760": ["convolution_gpu_yxfb_yxio_b16",2], + "8803037667261582905": ["convolution_gpu_bfyx_gemm_like",1], + "9017605508157213607": ["convolution_gpu_yxfb_yxio_b16",2], + "17617204422090117691": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2506424495656099512": ["convolution_gpu_yxfb_yxio_b16",2], + "16717713360264747483": ["convolution_gpu_bfyx_gemm_like",2], + "13121297281694293907": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "3423717644513543253": ["convolution_gpu_yxfb_yxio_b16",2], + "4325081100430903742": ["convolution_gpu_bfyx_gemm_like",1], + "3652414035262499383": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "10532183096485321729": ["convolution_gpu_bfyx_1x1",2], + "17811558714592064184": ["convolution_gpu_yxfb_yxio_b16",2], + "7977195117668583981": ["convolution_gpu_bfyx_gemm_like",2], + "8671491767142900139": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "3816674884393241704": 
["convolution_gpu_bfyx_os_iyx_osv16",1009], + "13364676690016875118": ["convolution_gpu_bfyx_os_iyx_osv16",926], + "12680688623162482255": ["convolution_gpu_bfyx_1x1",2], + "7824075236081312706": ["convolution_gpu_yxfb_yxio_b16",2], + "15159534367247036982": ["convolution_gpu_yxfb_yxio_b16",2], + "14074996784220709246": ["convolution_gpu_yxfb_yxio_b16",2], + "8021915447462898777": ["convolution_gpu_bfyx_gemm_like",0], + "1973819632224480598": ["convolution_gpu_yxfb_yxio_b16",1], + "12411075288896909468": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4916769804113823482": ["convolution_gpu_bfyx_1x1",2], + "2908156087871187676": ["convolution_gpu_yxfb_yxio_b16",2], + "1563987925712579649": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "8303211644727914658": ["convolution_gpu_bfyx_1x1",2], + "8614534946699754256": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7800015766976654402": ["convolution_gpu_bfyx_gemm_like",2], + "13851851281384416649": ["convolution_gpu_bfyx_1x1",1], + "6217542346826403576": ["convolution_gpu_bfyx_1x1",2], + "11557032521956761994": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "13199672084171648305": ["convolution_gpu_yxfb_yxio_b16",2], + "15530407024531326375": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "3225866261943242708": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10765280349477640969": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "17876939980356283351": ["convolution_gpu_yxfb_yxio_b16",2], + "2877521658768725103": ["convolution_gpu_bfyx_gemm_like",0], + "6280726148869856021": ["convolution_gpu_yxfb_yxio_b16",2], + "10471519687597963116": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8431759922045602848": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8470959792634864749": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "13501352378461071771": ["convolution_gpu_yxfb_yxio_b16",2], + "11845189428639322474": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "11825293922127550847": ["convolution_gpu_bfyx_gemm_like",2], + "14908477489231326997": ["convolution_gpu_yxfb_yxio_b16",2], + "7995820969034996638": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "12767065362702304803": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "13512863534076172940": ["convolution_gpu_bfyx_gemm_like",2], + "8039645104667120991": ["convolution_gpu_yxfb_yxio_b16",2], + "18186615266760475767": ["convolution_gpu_bfyx_os_iyx_osv16",192], + "10316451248440741901": ["convolution_gpu_bfyx_gemm_like",1], + "563440246018637010": ["convolution_gpu_yxfb_yxio_b16",2], + "17361714725103230834": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "16182470664818268848": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5668538167635622474": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "7532088618116521936": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "142650579335909103": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "9692654253261175490": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "13130001092233798285": ["convolution_gpu_yxfb_yxio_b16",2], + "59356084516953804": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "8185193068790365354": ["convolution_gpu_bfyx_gemm_like",2], + "3860667078458481972": ["convolution_gpu_bfyx_gemm_like",1], + "17225552472711821360": ["convolution_gpu_bfyx_os_iyx_osv16",946], + "4759671642533786591": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "14132543442791497311": ["convolution_gpu_yxfb_yxio_b16",2], + "15003778740401601065": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5649150695527000655": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16341131728764501904": 
["convolution_gpu_bfyx_os_iyx_osv16",349], + "8183383667948205424": ["convolution_gpu_yxfb_yxio_b16",2], + "4701235352806075765": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "5295693108687178880": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8567667881970262923": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "2777318471329665162": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "13527018660229167386": ["convolution_gpu_yxfb_yxio_b16",1], + "3534971503826416049": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "7056030150365552588": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "8943913562339525413": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10128390168715530898": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "4274425737610351312": ["convolution_gpu_bfyx_gemm_like",2], + "14999920879568237166": ["convolution_gpu_bfyx_1x1",2], + "4378422094110940766": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5284456216115118110": ["convolution_gpu_yxfb_yxio_b16",2], + "3833510944499257797": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "2439993891369206440": ["convolution_gpu_bfyx_1x1",2], + "7902473777019759045": ["convolution_gpu_bfyx_gemm_like",2], + "10322427853063201289": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "15796677813117622429": ["convolution_gpu_bfyx_gemm_like",2], + "15497797842820949408": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7762916621662364082": ["convolution_gpu_yxfb_yxio_b16",2], + "4072951883124129646": ["convolution_gpu_yxfb_yxio_b16",1], + "1008476023750261156": ["convolution_gpu_bfyx_1x1",2], + "12384317536636082264": ["convolution_gpu_bfyx_direct_10_12_16",0], + "15578456771467281881": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12741762570001404232": ["convolution_gpu_yxfb_yxio_b16",1], + "2581014920570427861": ["convolution_gpu_yxfb_yxio_b16",2], + "5854093367753757010": ["convolution_gpu_yxfb_yxio_b16",2], + "15104727000375811836": ["convolution_gpu_yxfb_yxio_b16",2], + "13966416504547680082": ["convolution_gpu_yxfb_yxio_b16",2], + "16620268338434572068": ["convolution_gpu_yxfb_yxio_b16",1], + "875296362957469305": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "708452703070938673": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "7405315582091905378": ["convolution_gpu_yxfb_yxio_b16",1], + "12961109385388101976": ["convolution_gpu_yxfb_yxio_b16",0], + "998876398773540321": ["convolution_gpu_bfyx_1x1",1], + "10463632805036507382": ["convolution_gpu_yxfb_yxio_b16",2], + "1485662490111767875": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "5552699731399195573": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "13038533272699602337": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "15586047342916704364": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "12013818650853034767": ["convolution_gpu_yxfb_yxio_b16",2], + "13314092088416047551": ["fully_connected_gpu_yxfb_ref",1], + "11079061135559995449": ["convolution_gpu_yxfb_yxio_b16",1], + "7364084475361144967": ["convolution_gpu_yxfb_yxio_b16",1], + "6708349666663292171": ["fully_connected_gpu_fb_oi_ref",2], + "7441188930428385142": ["convolution_gpu_yxfb_yxio_b16",1], + "3830842631023415233": ["convolution_gpu_yxfb_yxio_b16",2], + "11073090858361674041": ["convolution_gpu_yxfb_yxio_b16",2], + "8611873585228858719": ["convolution_gpu_yxfb_yxio_b16",2], + "12908594497114706897": ["convolution_gpu_bfyx_1x1",2], + "3047407458812880288": ["convolution_gpu_yxfb_yxio_b16",2], + "1123577455191848310": ["convolution_gpu_bfyx_gemm_like",2], + "13478984039708550410": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "17737878867906137388": 
["convolution_gpu_yxfb_yxio_b16",2], + "16667887002111125871": ["convolution_gpu_bfyx_gemm_like",2], + "16788715253205076219": ["fully_connected_gpu_fb_oi_ref",1], + "17559750858236255044": ["convolution_gpu_yxfb_yxio_b16",2], + "731825454731954517": ["convolution_gpu_bfyx_gemm_like",2], + "10002044609138970243": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7998930863626763670": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "11012427206693842637": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "3429844423226609965": ["convolution_gpu_bfyx_gemm_like",2], + "5977875644245993099": ["convolution_gpu_yxfb_yxio_b16",1], + "411016281538345537": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10766317990628501609": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "9737833587413114584": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16161974964662774501": ["convolution_gpu_yxfb_yxio_b16",2], + "7571716782558859443": ["convolution_gpu_yxfb_yxio_b16",2], + "13291402786934990349": ["convolution_gpu_yxfb_yxio_b16",2], + "8177017967170389275": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15811723176266128065": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "17490471699618303993": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "18033349045324117723": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "11500205299047837289": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "4947788161154370784": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "13585916416233680276": ["convolution_gpu_yxfb_yxio_b16",2], + "15069906408448814772": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "8611710048909301596": ["convolution_gpu_yxfb_yxio_b16",2], + "3366647240745174769": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "5421397731090158382": ["convolution_gpu_yxfb_yxio_b16",1], + "577844026691991089": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6962062962411903140": ["convolution_gpu_yxfb_yxio_b16",2], + "1148949417144436507": ["convolution_gpu_yxfb_yxio_b16",2], + "12165079289914715018": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "1208665743495618456": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5788018146987909930": ["convolution_gpu_yxfb_yxio_b16",2], + "1354647381212852890": ["convolution_gpu_bfyx_1x1",2], + "3914143598803149415": ["convolution_gpu_yxfb_yxio_b16",2], + "3216877571075556066": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "13734043898517059207": ["convolution_gpu_bfyx_gemm_like",1], + "12004552919019936392": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "11921652085115182024": ["convolution_gpu_yxfb_yxio_b16",2], + "586134723922638373": ["convolution_gpu_bfyx_gemm_like",2], + "10128120599276549920": ["convolution_gpu_bfyx_1x1",1], + "9955816463820554626": ["convolution_gpu_yxfb_yxio_b16",2], + "10560559646371329711": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18279416225045612845": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2819475920524949313": ["convolution_gpu_yxfb_yxio_b16",2], + "7465681710653503161": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "7861119251077361882": ["convolution_gpu_yxfb_yxio_b16",2], + "7369109502608631066": ["convolution_gpu_yxfb_yxio_b16",2], + "1216021647922150199": ["convolution_gpu_yxfb_yxio_b16",2], + "4238885454989272754": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "2816339200381598722": ["convolution_gpu_yxfb_yxio_b16",2], + "2524029454785583409": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "1141277975467180549": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "7107677063657303327": ["convolution_gpu_bfyx_1x1",2], + "8079914471491171372": 
["convolution_gpu_yxfb_yxio_b16",1], + "621915374938805401": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "14263790627243107300": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5834245904292669645": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "17770104464900126615": ["convolution_gpu_bfyx_1x1",2], + "6859143702528475520": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "4098581145478965082": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "11241838709529552265": ["convolution_gpu_bfyx_os_iyx_osv16",858], + "6800893510381991731": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "8951040603784899163": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "331661172067077796": ["convolution_gpu_bfyx_1x1",2], + "1044978617045366709": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "9728611486592854529": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "9835739612255048978": ["convolution_gpu_bfyx_os_iyx_osv16",878], + "3932617680771387232": ["convolution_gpu_yxfb_yxio_b16",2], + "5498839261395459224": ["convolution_gpu_bfyx_gemm_like",1], + "15308578014507211237": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "6310724136390087834": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "7880845322716481548": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "18161786710055240343": ["convolution_gpu_bfyx_os_iyx_osv16",951], + "12388894315292201102": ["convolution_gpu_yxfb_yxio_b16",2], + "16772854836230971016": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "6723804327185132790": ["convolution_gpu_bfyx_gemm_like",2], + "10294610483561043024": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10486000767830001094": ["convolution_gpu_bfyx_1x1",2], + "15576534481170615301": ["convolution_gpu_yxfb_yxio_b16",2], + "7223737889890738294": ["convolution_gpu_yxfb_yxio_b16",2], + "16341700680310033430": ["fully_connected_gpu_fb_io_block_fp16",2], + "10996429218747311159": ["convolution_gpu_yxfb_yxio_b16",1], + "8497468192424557348": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1551596771935253711": ["convolution_gpu_bfyx_gemm_like",1], + "5895417825685090256": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "7317391511452227268": ["convolution_gpu_bfyx_gemm_like",2], + "2147962310424425158": ["convolution_gpu_yxfb_yxio_b16",2], + "3244675355773468991": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "6712698149192186833": ["convolution_gpu_bfyx_gemm_like",2], + "9940761514291929473": ["convolution_gpu_yxfb_yxio_b16",2], + "5288793454052261767": ["convolution_gpu_bfyx_gemm_like",2], + "2968439898708528834": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "3806806400778685133": ["convolution_gpu_yxfb_yxio_b16",2], + "16161112020028389294": ["convolution_gpu_yxfb_yxio_b16",2], + "2041212737963974230": ["convolution_gpu_bfyx_gemm_like",2], + "7590734607006912544": ["convolution_gpu_yxfb_yxio_b16",2], + "9502195532658935521": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17270057383792994793": ["convolution_gpu_yxfb_yxio_b16",2], + "13830605041347009953": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "3239033622277917802": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "16327433707667075261": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "863057075064640334": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "14273849038400888518": ["convolution_gpu_yxfb_yxio_b16",2], + "360064276184684693": ["convolution_gpu_yxfb_yxio_b16",1], + "4597873630741623918": ["convolution_gpu_yxfb_yxio_b16",1], + "69832608384091511": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "7260204889552803221": ["convolution_gpu_bfyx_os_iyx_osv16",349], + 
"14034525799882831106": ["convolution_gpu_bfyx_gemm_like",2], + "4169042131399110713": ["convolution_gpu_yxfb_yxio_b16",2], + "1089944493540593798": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "6556424924189200804": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "8931469268093714938": ["convolution_gpu_yxfb_yxio_b16",1], + "4208702365182336507": ["convolution_gpu_yxfb_yxio_b16",2], + "13914239937595549448": ["convolution_gpu_yxfb_yxio_b16",2], + "15488532485794545310": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "3231651468686543808": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "241656278218999298": ["convolution_gpu_yxfb_yxio_b16",2], + "10956668791040094584": ["convolution_gpu_yxfb_yxio_b16",2], + "844576097677576405": ["convolution_gpu_yxfb_yxio_b16",2], + "3631332752661975859": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "15421280195211166867": ["convolution_gpu_yxfb_yxio_b16",2], + "14823789570149356458": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "7104266560248570112": ["convolution_gpu_yxfb_yxio_b16",2], + "768720470104458759": ["convolution_gpu_bfyx_os_iyx_osv16",265], + "8075261051536686307": ["convolution_gpu_bfyx_os_iyx_osv16",665], + "4142555169083069413": ["convolution_gpu_bfyx_gemm_like",2], + "12501619443242354860": ["convolution_gpu_bfyx_gemm_like",2], + "9541630719145326121": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "11987564534722442223": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "16789245987103323406": ["convolution_gpu_bfyx_gemm_like",2], + "13200834963067135502": ["fully_connected_gpu_fb_oi_ref",1], + "826850797666395121": ["convolution_gpu_bfyx_gemm_like",2], + "14280056365441354869": ["convolution_gpu_yxfb_yxio_b16",2], + "3766048787611884529": ["convolution_gpu_yxfb_yxio_b16",2], + "12878346173547852969": ["convolution_gpu_yxfb_yxio_b16",2], + "8723078862651154959": ["convolution_gpu_yxfb_yxio_b16",2], + "135072053401934228": ["convolution_gpu_bfyx_1x1",2], + "8115522418294960470": ["convolution_gpu_yxfb_yxio_b16",2], + "15998609626878578708": ["convolution_gpu_yxfb_yxio_b16",2], + "2149299205144202701": ["convolution_gpu_yxfb_yxio_b16",2], + "5940007433515335594": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1539677456611270609": ["convolution_gpu_yxfb_yxio_b16",2], + "4683320313995550908": ["convolution_gpu_yxfb_yxio_b16",2], + "15060535689318007173": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "18239740525818575112": ["convolution_gpu_yxfb_yxio_b16",2], + "9814647153117279415": ["convolution_gpu_yxfb_yxio_b16",2], + "13483088320871913126": ["convolution_gpu_bfyx_gemm_like",1], + "6362428985273506890": ["convolution_gpu_bfyx_1x1",2], + "3868149953087814447": ["convolution_gpu_bfyx_gemm_like",1], + "15932838442166411183": ["convolution_gpu_yxfb_yxio_b16",2], + "4165036357594592683": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11971853138084108953": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "11120846960057008937": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "721174714308243785": ["convolution_gpu_bfyx_gemm_like",2], + "6290584630172122012": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "13636407347458845915": ["convolution_gpu_yxfb_yxio_b16",2], + "15534876725099279666": ["convolution_gpu_yxfb_yxio_b16",2], + "9967611023372430532": ["convolution_gpu_bfyx_gemm_like",2], + "14152716242882609401": ["convolution_gpu_bfyx_gemm_like",1], + "4201057957682777280": ["convolution_gpu_yxfb_yxio_b16",1], + "17201365233492366678": ["convolution_gpu_bfyx_gemm_like",2], + "12311849904266608701": ["convolution_gpu_yxfb_yxio_b16",2], + 
"5738835498104275267": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "10815244730103375973": ["convolution_gpu_yxfb_yxio_b16",1], + "5214654427283761256": ["convolution_gpu_bfyx_gemm_like",2], + "12526988667216482085": ["convolution_gpu_yxfb_yxio_b16",2], + "13077917010686381919": ["convolution_gpu_yxfb_yxio_b16",2], + "16828961272295386615": ["convolution_gpu_bfyx_os_iyx_osv16",539], + "9695024256541464964": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "12046017161414846599": ["convolution_gpu_bfyx_1x1",2], + "17344974951998490453": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8622014461615231500": ["convolution_gpu_yxfb_yxio_b16",2], + "15438470456977849772": ["convolution_gpu_yxfb_yxio_b16",2], + "1868805550246252143": ["convolution_gpu_yxfb_yxio_b16",2], + "7846384623429362522": ["convolution_gpu_bfyx_1x1",1], + "6388117241933586388": ["convolution_gpu_bfyx_gemm_like",2], + "15188570678726970998": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12002302929446578025": ["convolution_gpu_yxfb_yxio_b16",2], + "18135307303959376082": ["convolution_gpu_bfyx_gemm_like",2], + "7264274394359484318": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "14808759315730413993": ["convolution_gpu_yxfb_yxio_b16",2], + "3211829722778368758": ["convolution_gpu_yxfb_yxio_b16",2], + "5097818987523855112": ["convolution_gpu_bfyx_gemm_like",2], + "16950925976172895196": ["convolution_gpu_yxfb_yxio_b16",2], + "15728009639807698634": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "3370082268529091875": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "7493567975736494003": ["convolution_gpu_bfyx_os_iyx_osv16",1019], + "15109847707903824859": ["convolution_gpu_bfyx_1x1",2], + "4683575221310726091": ["convolution_gpu_yxfb_yxio_b16",2], + "187352687850707150": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5401380444992462053": ["convolution_gpu_yxfb_yxio_b16",1], + "11052275099129482401": ["convolution_gpu_yxfb_yxio_b16",2], + "18417288692814472127": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "3854114166348568039": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "3819990462129075757": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "18249888571553409563": ["convolution_gpu_yxfb_yxio_b16",2], + "15612334131144235342": ["convolution_gpu_yxfb_yxio_b16",2], + "3001162215282339268": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "1082574490068006980": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "16895523130717954500": ["convolution_gpu_yxfb_yxio_b16",2], + "14236681916032484600": ["convolution_gpu_yxfb_yxio_b16",2], + "11823205954749139338": ["convolution_gpu_bfyx_gemm_like",2], + "2339864165283480961": ["convolution_gpu_bfyx_1x1",2], + "13352000946213986936": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "2247717767819293683": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "4818598834950786080": ["convolution_gpu_yxfb_yxio_b16",2], + "4444730303823507621": ["convolution_gpu_bfyx_gemm_like",2], + "1419073145594317633": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "16335738565228204503": ["convolution_gpu_yxfb_yxio_b16",2], + "12193395770362986433": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "17224104246148265328": ["convolution_gpu_bfyx_gemm_like",2], + "13182623473102074079": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "11157773554806649837": ["convolution_gpu_yxfb_yxio_b16",2], + "2722124265986526212": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "8439950151963452285": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "2800949804770763798": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12267555886404772991": 
["convolution_gpu_yxfb_yxio_b16",2], + "2847490224869294354": ["convolution_gpu_bfyx_gemm_like",0], + "9452470718398027950": ["convolution_gpu_bfyx_os_iyx_osv16",640], + "2215570184121152738": ["convolution_gpu_bfyx_gemm_like",2], + "5584145249514762750": ["convolution_gpu_yxfb_yxio_b16",2], + "5374969798377773063": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "12790570304622911607": ["convolution_gpu_bfyx_os_iyx_osv16",565], + "10837496380266058422": ["convolution_gpu_bfyx_gemm_like",2], + "13526488884846845330": ["convolution_gpu_bfyx_gemm_like",2], + "7913076120244203725": ["convolution_gpu_bfyx_gemm_like",2], + "11007944497812650617": ["convolution_gpu_bfyx_gemm_like",2], + "16773645387243701837": ["convolution_gpu_bfyx_gemm_like",2], + "4049224463072418218": ["convolution_gpu_yxfb_yxio_b16",1], + "3774285301357006334": ["convolution_gpu_bfyx_gemm_like",1], + "17462996923473002801": ["convolution_gpu_yxfb_yxio_b16",2], + "7119182041840303390": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5965451243366505522": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2226745622763268469": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "11614353411428360211": ["convolution_gpu_yxfb_yxio_b16",2], + "7565348337952384040": ["convolution_gpu_yxfb_yxio_b16",2], + "3782308167335660154": ["convolution_gpu_yxfb_yxio_b16",2], + "18037918102910297531": ["convolution_gpu_bfyx_os_iyx_osv16",732], + "8779960552750034544": ["convolution_gpu_yxfb_yxio_b16",2], + "3934090072734175564": ["convolution_gpu_yxfb_yxio_b16",2], + "880603384896315783": ["convolution_gpu_yxfb_yxio_b16",2], + "1658174263018326745": ["convolution_gpu_yxfb_yxio_b16",2], + "2917735110073643952": ["convolution_gpu_bfyx_gemm_like",2], + "9280279544075738476": ["convolution_gpu_yxfb_yxio_b16",1], + "12131461096501477069": ["convolution_gpu_yxfb_yxio_b16",2], + "14585000863294748739": ["convolution_gpu_bfyx_gemm_like",2], + "9928406318940388716": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "8540111719936129376": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "14712972289919865502": ["convolution_gpu_bfyx_gemm_like",1], + "12675840135830047968": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11761545976388416063": ["convolution_gpu_yxfb_yxio_b16",2], + "6769524481210107636": ["convolution_gpu_yxfb_yxio_b16",2], + "17107083637007906184": ["convolution_gpu_bfyx_gemm_like",2], + "8451212914744825089": ["convolution_gpu_bfyx_gemm_like",2], + "8469874583725132145": ["fully_connected_gpu_fb_oi_ref",1], + "4423866541063606768": ["convolution_gpu_bfyx_os_iyx_osv16",949], + "5922142661777925178": ["convolution_gpu_bfyx_gemm_like",1], + "11298854310398101852": ["convolution_gpu_yxfb_yxio_b16",2], + "12118387933632797428": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4933831571091731212": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10718764522366711114": ["convolution_gpu_yxfb_yxio_b16",2], + "10912495395422146386": ["convolution_gpu_bfyx_gemm_like",2], + "4104562704039821482": ["convolution_gpu_bfyx_1x1",2], + "4138968242532400395": ["convolution_gpu_bfyx_gemm_like",1], + "15886016297043613632": ["convolution_gpu_yxfb_yxio_b16",1], + "15980348884716629349": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12253049204822930675": ["convolution_gpu_bfyx_gemm_like",1], + "12389854459474697184": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "16071723603031305677": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "15693204620575485046": ["convolution_gpu_yxfb_yxio_b16",2], + "16944335478353845609": ["convolution_gpu_bfyx_os_iyx_osv16",154], + 
"7329924387620542330": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "14528180674573671874": ["convolution_gpu_yxfb_yxio_b16",2], + "5931972000452008090": ["convolution_gpu_yxfb_yxio_b16",2], + "1704404203639481753": ["convolution_gpu_bfyx_gemm_like",2], + "11291868421122092629": ["convolution_gpu_yxfb_yxio_b16",2], + "2923543983518895756": ["convolution_gpu_yxfb_yxio_b16",1], + "8506271633579173639": ["convolution_gpu_yxfb_yxio_b16",2], + "2759142157812694203": ["convolution_gpu_yxfb_yxio_b16",2], + "294153950488131608": ["convolution_gpu_yxfb_yxio_b16",2], + "1569043950563130463": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4216958486055161753": ["convolution_gpu_bfyx_os_iyx_osv16",105], + "6388086351909447495": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "11936530628363072904": ["convolution_gpu_bfyx_gemm_like",1], + "15267084369543546013": ["convolution_gpu_yxfb_yxio_b16",2], + "8260073247636023575": ["convolution_gpu_yxfb_yxio_b16",2], + "9407646138658641974": ["convolution_gpu_bfyx_gemm_like",1], + "7840653268996892538": ["convolution_gpu_bfyx_gemm_like",2], + "16436006771518788093": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "15770767768674603174": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12713821004129672990": ["convolution_gpu_yxfb_yxio_b16",2], + "14316077757957132678": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "7780140599533242850": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12816344078518706065": ["convolution_gpu_yxfb_yxio_b16",2], + "3499406509137418124": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "5291817530552764387": ["convolution_gpu_yxfb_yxio_b16",2], + "12977678792503377525": ["convolution_gpu_bfyx_gemm_like",1], + "9827201026276954165": ["convolution_gpu_yxfb_yxio_b16",2], + "14398854364550406668": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15971340431600153619": ["convolution_gpu_bfyx_os_iyx_osv16",726], + "9162862507585693061": ["convolution_gpu_yxfb_yxio_b16",2], + "14963449045970262346": ["convolution_gpu_yxfb_yxio_b16",0], + "16949056117405140365": ["convolution_gpu_bfyx_gemm_like",2], + "8251544171504007740": ["convolution_gpu_bfyx_gemm_like",2], + "17764795635957985989": ["convolution_gpu_yxfb_yxio_b16",2], + "1752185056297124917": ["convolution_gpu_bfyx_1x1",2], + "8913950860101596091": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "5672464491301994292": ["convolution_gpu_bfyx_gemm_like",2], + "10991423760161409883": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "1898243736289257252": ["convolution_gpu_yxfb_yxio_b16",2], + "4617347486560666277": ["convolution_gpu_bfyx_1x1",1], + "7273427309587902237": ["convolution_gpu_bfyx_gemm_like",2], + "2866656294663853474": ["convolution_gpu_bfyx_1x1",2], + "10100237101982273901": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "7561096442572829049": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12903015669020591018": ["convolution_gpu_yxfb_yxio_b16",2], + "8941904405273405481": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "16290551573997593168": ["convolution_gpu_bfyx_gemm_like",2], + "14944590179685661287": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "787203599734115483": ["convolution_gpu_bfyx_1x1",1], + "17089801601582809764": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "8323669961818535927": ["convolution_gpu_yxfb_yxio_b16",2], + "12623375499927200341": ["convolution_gpu_bfyx_gemm_like",2], + "10141927023849730720": ["convolution_gpu_bfyx_1x1",1], + "10883341041912056319": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "6577505360421510286": 
["convolution_gpu_bfyx_os_iyx_osv16",155], + "1390379098099686972": ["convolution_gpu_bfyx_1x1",2], + "9643408025778914022": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "4982549855424649217": ["convolution_gpu_yxfb_yxio_b16",2], + "15295172519920136220": ["convolution_gpu_yxfb_yxio_b16",2], + "15750539817895707253": ["convolution_gpu_yxfb_yxio_b16",2], + "12585864429067596351": ["convolution_gpu_yxfb_yxio_b16",1], + "16307464696265537356": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "16218339663410630711": ["convolution_gpu_bfyx_gemm_like",2], + "273242667845386507": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "1617135706549276688": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "16725049805030712400": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "3738514326459749974": ["convolution_gpu_yxfb_yxio_b16",1], + "16767392067294252396": ["convolution_gpu_bfyx_gemm_like",2], + "17726079670612220433": ["convolution_gpu_bfyx_gemm_like",2], + "11800783548769329949": ["convolution_gpu_bfyx_gemm_like",2], + "13598062803968442253": ["convolution_gpu_yxfb_yxio_b16",2], + "14283458015244508428": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17498483343394902796": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "12027202455592387086": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "5834825835421819800": ["convolution_gpu_yxfb_yxio_b16",2], + "816527348871309530": ["convolution_gpu_yxfb_yxio_b16",2], + "8321204816277460837": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "10626018319543075871": ["convolution_gpu_yxfb_yxio_b16",2], + "7689320135952025041": ["convolution_gpu_bfyx_gemm_like",2], + "11891319657803057127": ["convolution_gpu_yxfb_yxio_b16",2], + "3308955824300750921": ["convolution_gpu_yxfb_yxio_b16",2], + "1095959046309466012": ["convolution_gpu_yxfb_yxio_b16",2], + "14184895905338394239": ["convolution_gpu_bfyx_gemm_like",2], + "13800760323805415740": ["convolution_gpu_bfyx_gemm_like",1], + "5145853681977610916": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "11706446082856895571": ["convolution_gpu_bfyx_gemm_like",2], + "15488340031228619748": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "8746621720912032145": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "16563030700888982979": ["convolution_gpu_yxfb_yxio_b16",2], + "16706244336960642883": ["convolution_gpu_bfyx_gemm_like",1], + "430132942408244070": ["convolution_gpu_bfyx_gemm_like",2], + "11299021927882809469": ["convolution_gpu_yxfb_yxio_b16",2], + "9562527071055150197": ["convolution_gpu_bfyx_1x1",2], + "1250095876638711647": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "14079654309452583394": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7742126547476513275": ["convolution_gpu_yxfb_yxio_b16",2], + "3343020946662226400": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "15746620724134970969": ["convolution_gpu_bfyx_1x1",1], + "2670216237572554944": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "15363606233048272809": ["convolution_gpu_bfyx_1x1",2], + "937159502066696999": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "13735180250757239202": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "17764033613416389758": ["convolution_gpu_bfyx_gemm_like",2], + "15779837958180258409": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "7395419333138772074": ["convolution_gpu_yxfb_yxio_b16",1], + "17658152048177750315": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "13158449455164143947": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "2782970766870172398": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15449715596597016714": ["convolution_gpu_bfyx_gemm_like",2], + 
"9809458159478958866": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "11224051407822914513": ["convolution_gpu_yxfb_yxio_b16",2], + "1788455099959676873": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "8045367391487213749": ["convolution_gpu_bfyx_1x1",2], + "11265472910579659280": ["convolution_gpu_bfyx_gemm_like",1], + "10308113903347312964": ["convolution_gpu_bfyx_gemm_like",2], + "4897690791599638716": ["convolution_gpu_yxfb_yxio_b16",2], + "4776685525963461501": ["convolution_gpu_yxfb_yxio_b16",2], + "938848188161536107": ["convolution_gpu_bfyx_1x1",2], + "16742058312847401360": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "17266121859044814533": ["convolution_gpu_yxfb_yxio_b16",2], + "4652136280940317116": ["convolution_gpu_bfyx_os_iyx_osv16",1116], + "14764715930784496165": ["convolution_gpu_bfyx_gemm_like",2], + "17580363505072477558": ["convolution_gpu_yxfb_yxio_b16",2], + "14578867494693499627": ["convolution_gpu_bfyx_gemm_like",2], + "9372916528346260712": ["convolution_gpu_bfyx_gemm_like",2], + "15148442194461613102": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "5941298590926032148": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7126667413990834481": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "223412492545617963": ["convolution_gpu_yxfb_yxio_b16",2], + "13621771094745539509": ["convolution_gpu_yxfb_yxio_b16",2], + "9871407256481442790": ["convolution_gpu_yxfb_yxio_b16",2], + "12391792381149655331": ["convolution_gpu_bfyx_gemm_like",2], + "2912098199463107173": ["convolution_gpu_bfyx_1x1",2], + "7815650257256675477": ["convolution_gpu_bfyx_os_iyx_osv16",1025], + "4461989328775275994": ["convolution_gpu_bfyx_gemm_like",2], + "13123709697607309884": ["convolution_gpu_yxfb_yxio_b16",1], + "14331554754171207866": ["convolution_gpu_bfyx_gemm_like",1], + "12015336418727455195": ["convolution_gpu_bfyx_1x1",2], + "13569941893504840630": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "15914107501176673997": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "16597170760061556882": ["convolution_gpu_yxfb_yxio_b16",2], + "14050124896329573468": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9485825829394109934": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15048584393463312977": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "15106614232165315070": ["convolution_gpu_bfyx_gemm_like",2], + "12913866095318048752": ["convolution_gpu_bfyx_gemm_like",2], + "73865742350616903": ["convolution_gpu_bfyx_gemm_like",1], + "8943651590146149679": ["convolution_gpu_yxfb_yxio_b16",2], + "6469277112054008613": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "18322435770607273817": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "6862489207967519978": ["convolution_gpu_bfyx_gemm_like",2], + "11051684565403294370": ["convolution_gpu_yxfb_yxio_b16",2], + "14262482011051329729": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1498389965422474930": ["convolution_gpu_yxfb_yxio_b16",2], + "14766477690417085350": ["convolution_gpu_bfyx_1x1",2], + "14819324687394700033": ["convolution_gpu_bfyx_1x1",2], + "4574541202890196191": ["convolution_gpu_yxfb_yxio_b16",2], + "5884802375772043861": ["convolution_gpu_yxfb_yxio_b16",1], + "9272405129875537865": ["convolution_gpu_yxfb_yxio_b16",2], + "14445031303145992349": ["convolution_gpu_bfyx_os_iyx_osv16",677], + "15310474203328198827": ["convolution_gpu_yxfb_yxio_b16",2], + "16992405636352406660": ["convolution_gpu_bfyx_gemm_like",1], + "17854208422879910606": ["convolution_gpu_bfyx_gemm_like",2], + "17969195175890497912": ["convolution_gpu_yxfb_yxio_b16",2], + 
"9162359935098885411": ["convolution_gpu_yxfb_yxio_b16",2], + "9299299311101549958": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "3364141707903132298": ["convolution_gpu_yxfb_yxio_b16",2], + "3647203315640064927": ["convolution_gpu_yxfb_yxio_b16",2], + "17342758321852264926": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "4438526427135833402": ["convolution_gpu_yxfb_yxio_b16",2], + "4717620775314557374": ["convolution_gpu_bfyx_gemm_like",1], + "2921118493468368908": ["convolution_gpu_bfyx_gemm_like",1], + "852015206582470545": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "130427456111826171": ["convolution_gpu_yxfb_yxio_b16",2], + "14045927407431718832": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "10572945270796129630": ["fully_connected_gpu_fb_io_ref",1], + "4936961129835214448": ["convolution_gpu_bfyx_gemm_like",2], + "14171139920084409181": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11148428797294511280": ["convolution_gpu_yxfb_yxio_b16",2], + "13408839571805750778": ["convolution_gpu_yxfb_yxio_b16",2], + "14558572801374416278": ["convolution_gpu_bfyx_gemm_like",1], + "10005348255972308430": ["convolution_gpu_yxfb_yxio_b16",2], + "7585785802379042424": ["convolution_gpu_bfyx_1x1",2], + "8876704486585503280": ["convolution_gpu_yxfb_yxio_b16",2], + "17636500109629107732": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "3240102173773280414": ["convolution_gpu_bfyx_1x1",2], + "10174346112533671798": ["convolution_gpu_yxfb_yxio_b16",2], + "1527126728636583082": ["convolution_gpu_yxfb_yxio_b16",0], + "18121198117765854866": ["convolution_gpu_bfyx_1x1",2], + "17216583849049249733": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "12164298124869114517": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "9538863363710651909": ["convolution_gpu_yxfb_yxio_b16",2], + "3011188207492335920": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5224252360611200472": ["convolution_gpu_bfyx_gemm_like",2], + "14680730265621679042": ["convolution_gpu_bfyx_os_iyx_osv16",380], + "9642965664913867675": ["convolution_gpu_yxfb_yxio_b16",2], + "7397376454528841634": ["convolution_gpu_yxfb_yxio_b16",2], + "14742998604680438008": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "16661843849495077745": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "6921081008428242060": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "12353956380178079089": ["convolution_gpu_bfyx_gemm_like",2], + "10939522663236304689": ["convolution_gpu_yxfb_yxio_b16",2], + "13155570698198686211": ["convolution_gpu_yxfb_yxio_b16",2], + "9524303276541517389": ["convolution_gpu_bfyx_gemm_like",2], + "11939914680143672459": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "9590161922224578217": ["convolution_gpu_yxfb_yxio_b16",1], + "2797436491596125131": ["convolution_gpu_yxfb_yxio_b16",2], + "16888412539296862194": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "7600034850149968684": ["convolution_gpu_yxfb_yxio_b16",0], + "15548847099740441551": ["convolution_gpu_bfyx_1x1",2], + "6839795451275143093": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "7869779894480025247": ["convolution_gpu_bfyx_gemm_like",2], + "11277866878590984477": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "15133468875250992696": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "9918371346247634545": ["convolution_gpu_bfyx_os_iyx_osv16",184], + "9390478179772073718": ["convolution_gpu_bfyx_gemm_like",1], + "14258941821319200170": ["convolution_gpu_yxfb_yxio_b16",2], + "2447893458816856522": ["convolution_gpu_bfyx_gemm_like",2], + "15078168059698267650": ["convolution_gpu_bfyx_direct_10_12_16",1], + 
"2367452220382767844": ["convolution_gpu_yxfb_yxio_b16",2], + "3987482581128838173": ["convolution_gpu_yxfb_yxio_b16",2], + "7369903937189508744": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "1997392406402548974": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "10762489947656697207": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5924698731432597368": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "14487842225000203929": ["convolution_gpu_bfyx_gemm_like",2], + "5422432655714154738": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "3285968426413869315": ["convolution_gpu_yxfb_yxio_b16",1], + "11759426200341586247": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "13320473279945887641": ["convolution_gpu_yxfb_yxio_b16",2], + "10923480230259977438": ["convolution_gpu_bfyx_1x1",1], + "5008350851224686853": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "3814584042139408454": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "13668072006310741601": ["convolution_gpu_yxfb_yxio_b16",2], + "994252691216116396": ["convolution_gpu_yxfb_yxio_b16",1], + "149810021216592597": ["convolution_gpu_yxfb_yxio_b16",2], + "16475247464223458061": ["convolution_gpu_bfyx_gemm_like",2], + "4633763257197651352": ["convolution_gpu_yxfb_yxio_b16",2], + "16209868158768307271": ["convolution_gpu_bfyx_os_iyx_osv16",919], + "10572380563704942622": ["convolution_gpu_yxfb_yxio_b16",2], + "11411413051626428349": ["convolution_gpu_yxfb_yxio_b16",2], + "8058419689646625853": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "7590390572139249734": ["convolution_gpu_yxfb_yxio_b16",2], + "12641170321047008726": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "7372956570616880244": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "7969441643457570812": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "11110173861174257158": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "720558977788683564": ["convolution_gpu_yxfb_yxio_b16",2], + "10544411879329675593": ["convolution_gpu_bfyx_os_iyx_osv16",387], + "3438296636411972401": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7814543122045448412": ["convolution_gpu_bfyx_gemm_like",2], + "17538518333907257868": ["convolution_gpu_bfyx_gemm_like",2], + "2283020548041814543": ["convolution_gpu_yxfb_yxio_b16",2], + "7974670633697926450": ["convolution_gpu_bfyx_1x1",1], + "14651159827389223108": ["convolution_gpu_bfyx_gemm_like",2], + "17224655686568797096": ["convolution_gpu_yxfb_yxio_b16",1], + "1703738105910059846": ["convolution_gpu_yxfb_yxio_b16",2], + "14215394208930955062": ["convolution_gpu_yxfb_yxio_b16",0], + "15914342421266687768": ["convolution_gpu_bfyx_gemm_like",2], + "7678226048807568024": ["convolution_gpu_yxfb_yxio_b16",2], + "13025361884606488732": ["convolution_gpu_bfyx_gemm_like",2], + "1308980444055174254": ["convolution_gpu_bfyx_gemm_like",2], + "4727628999533330347": ["convolution_gpu_yxfb_yxio_b16",2], + "17037462814585846902": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "14149210193687890597": ["convolution_gpu_bfyx_os_iyx_osv16",889], + "13094402291968806996": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "14769111376729628572": ["convolution_gpu_yxfb_yxio_b16",2], + "501138469231848694": ["convolution_gpu_yxfb_yxio_b16",2], + "15645112311663561994": ["convolution_gpu_yxfb_yxio_b16",2], + "16511749893955141055": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "5041922366297242362": ["convolution_gpu_yxfb_yxio_b16",2], + "16271970578584267980": ["convolution_gpu_bfyx_os_iyx_osv16",195], + "8494385862885499798": ["convolution_gpu_yxfb_yxio_b16",1], + "7400937639903461446": 
["convolution_gpu_yxfb_yxio_b16",2], + "18384215264061386089": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "2369451367723962073": ["convolution_gpu_bfyx_1x1",2], + "15269988216002549857": ["convolution_gpu_yxfb_yxio_b16",2], + "8846314870152404018": ["convolution_gpu_bfyx_gemm_like",2], + "18373951194274306895": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16683169947375504066": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "3101885395179993708": ["convolution_gpu_yxfb_yxio_b16",2], + "14315760630997175346": ["convolution_gpu_yxfb_yxio_b16",2], + "12331134162344797761": ["convolution_gpu_yxfb_yxio_b16",2], + "6254161707168091438": ["convolution_gpu_bfyx_gemm_like",2], + "16958329690837977102": ["convolution_gpu_bfyx_gemm_like",2], + "7349880498513046830": ["convolution_gpu_bfyx_1x1",2], + "1867337342417952506": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4228437925117070319": ["convolution_gpu_bfyx_1x1",2], + "897253033961107413": ["convolution_gpu_yxfb_yxio_b16",2], + "15206249797344242666": ["convolution_gpu_yxfb_yxio_b16",2], + "16210934187492210542": ["convolution_gpu_yxfb_yxio_b16",2], + "16567486018945740036": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "18436249934780056991": ["convolution_gpu_bfyx_os_iyx_osv16",296], + "4104679489383377966": ["convolution_gpu_yxfb_yxio_b16",2], + "3713558537660711857": ["convolution_gpu_yxfb_yxio_b16",2], + "10961696014697611547": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "13104509059416300615": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "2685061316482503878": ["convolution_gpu_bfyx_gemm_like",2], + "12487879163561616870": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6340128090694375876": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "10996596479775375564": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "13972357557211413688": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "15050884844653850678": ["convolution_gpu_yxfb_yxio_b16",2], + "12681408370704556588": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14082448162400225052": ["convolution_gpu_bfyx_1x1",1], + "13636859714649629789": ["convolution_gpu_yxfb_yxio_b16",1], + "18071280811713424504": ["convolution_gpu_yxfb_yxio_b16",2], + "3212789693085089063": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "698274493570551388": ["convolution_gpu_yxfb_yxio_b16",2], + "10034575179959785704": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "8913823292181409151": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "5349415632630235233": ["convolution_gpu_bfyx_1x1",2], + "4290840152278060614": ["convolution_gpu_bfyx_gemm_like",2], + "12319073009094248232": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "17616719165728687438": ["convolution_gpu_yxfb_yxio_b16",2], + "15308196586729169691": ["convolution_gpu_yxfb_yxio_b16",2], + "2114599010013594942": ["convolution_gpu_bfyx_gemm_like",2], + "10782169939706303899": ["convolution_gpu_yxfb_yxio_b16",2], + "17806747473167329833": ["convolution_gpu_yxfb_yxio_b16",2], + "6438721407426283362": ["convolution_gpu_yxfb_yxio_b16",1], + "2878824076934639346": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "9714508918051740792": ["convolution_gpu_bfyx_direct_10_12_16",1], + "740260423018155343": ["convolution_gpu_bfyx_os_iyx_osv16",1025], + "3662747857062156477": ["convolution_gpu_bfyx_gemm_like",2], + "13637537549252005181": ["convolution_gpu_yxfb_yxio_b16",2], + "5385316497510064491": ["fully_connected_gpu_fb_oi_ref",1], + "12896226291465522304": ["convolution_gpu_yxfb_yxio_b16",2], + "5393510569127725391": ["convolution_gpu_bfyx_os_iyx_osv16",530], + 
"9906138392975645747": ["convolution_gpu_yxfb_yxio_b16",2], + "11730276873446857018": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12794369485239257709": ["convolution_gpu_bfyx_gemm_like",2], + "18154019240019929225": ["convolution_gpu_bfyx_gemm_like",1], + "11175955260573469979": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "4156384238797998294": ["convolution_gpu_bfyx_os_iyx_osv16",275], + "15640466585550013905": ["convolution_gpu_bfyx_gemm_like",2], + "3873183249402084406": ["convolution_gpu_bfyx_gemm_like",1], + "14193777296032212476": ["convolution_gpu_yxfb_yxio_b16",2], + "6776601719651959634": ["convolution_gpu_yxfb_yxio_b16",2], + "5159738930501638535": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2863465257341735941": ["convolution_gpu_bfyx_1x1",1], + "11634932044447867039": ["convolution_gpu_bfyx_gemm_like",2], + "8096131027165540886": ["convolution_gpu_bfyx_gemm_like",2], + "15192230303376521834": ["convolution_gpu_bfyx_os_iyx_osv16",863], + "900243696733233996": ["convolution_gpu_yxfb_yxio_b16",2], + "7927587739463421727": ["convolution_gpu_yxfb_yxio_b16",2], + "7307271009495440764": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "15065925414996398951": ["convolution_gpu_bfyx_1x1",2], + "10645625090439446714": ["convolution_gpu_bfyx_gemm_like",2], + "4252157815622916471": ["convolution_gpu_bfyx_1x1",2], + "3135889221160961020": ["convolution_gpu_yxfb_yxio_b16",2], + "5886032409392368342": ["convolution_gpu_yxfb_yxio_b16",2], + "2740885908397449753": ["convolution_gpu_yxfb_yxio_b16",2], + "15201438563802430490": ["fully_connected_gpu_fb_oi_ref",1], + "15609860394182767048": ["convolution_gpu_yxfb_yxio_b16",2], + "11311839946200066200": ["convolution_gpu_yxfb_yxio_b16",2], + "16910952799476896905": ["convolution_gpu_bfyx_gemm_like",2], + "1597770067928214597": ["convolution_gpu_bfyx_1x1",1], + "1802510952374368682": ["convolution_gpu_yxfb_yxio_b16",2], + "10893432143734884603": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "4465781406991476376": ["convolution_gpu_bfyx_os_iyx_osv16",287], + "4792351255949877935": ["convolution_gpu_bfyx_gemm_like",2], + "1662588605309237309": ["convolution_gpu_yxfb_yxio_b16",2], + "12076058470574246054": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "14020956765444878761": ["convolution_gpu_bfyx_gemm_like",2], + "17585206779958265260": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "9482749589540764069": ["convolution_gpu_yxfb_yxio_b16",2], + "12053562297742437099": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6883767567034259453": ["convolution_gpu_yxfb_yxio_b16",2], + "17725637691681205907": ["convolution_gpu_bfyx_gemm_like",2], + "14446688005815492020": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14026537760442360645": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "12564687330941036772": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "5581428998642936688": ["convolution_gpu_bfyx_1x1",2], + "12055647521556218046": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "4752129805031267391": ["convolution_gpu_yxfb_yxio_b16",2], + "14689423748560749566": ["fully_connected_gpu_fb_oi_ref",1], + "15216108478837665623": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "6323083153920795679": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "4615708568396290002": ["convolution_gpu_bfyx_1x1",2], + "3935750066315595083": ["convolution_gpu_yxfb_yxio_b16",1], + "18215430801133520364": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13503555814874045782": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "5600807544955072308": ["convolution_gpu_bfyx_gemm_like",2], + 
"8652128863605749877": ["convolution_gpu_yxfb_yxio_b16",2], + "4521622755195947253": ["convolution_gpu_yxfb_yxio_b16",2], + "181006047500375768": ["convolution_gpu_bfyx_os_iyx_osv16",281], + "11732321796147239597": ["convolution_gpu_yxfb_yxio_b16",2], + "9285202897230250613": ["convolution_gpu_yxfb_yxio_b16",2], + "10070051133200561606": ["convolution_gpu_yxfb_yxio_b16",1], + "13520876347177213888": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "5671289201458690944": ["convolution_gpu_bfyx_os_iyx_osv16",665], + "486816652607164926": ["convolution_gpu_yxfb_yxio_b16",2], + "16033512206711124104": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "3112648799276134590": ["convolution_gpu_yxfb_yxio_b16",2], + "17477062954520561609": ["convolution_gpu_bfyx_gemm_like",2], + "208915399644127739": ["convolution_gpu_bfyx_gemm_like",2], + "5596408142536691534": ["convolution_gpu_yxfb_yxio_b16",2], + "11744368351982723504": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "5632958791318880428": ["convolution_gpu_yxfb_yxio_b16",2], + "6159729136505378486": ["convolution_gpu_yxfb_yxio_b16",2], + "11158789938857558596": ["convolution_gpu_bfyx_1x1",2], + "9263784636194609884": ["convolution_gpu_yxfb_yxio_b16",2], + "11942424927004660476": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "5558136691773431495": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "6423354409210936959": ["convolution_gpu_yxfb_yxio_b16",1], + "11020315012951440351": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "6525496212688896740": ["convolution_gpu_yxfb_yxio_b16",2], + "2891736961665476908": ["convolution_gpu_bfyx_os_iyx_osv16",805], + "12229574562535756991": ["convolution_gpu_bfyx_gemm_like",2], + "10488269059469838160": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "9890700023578477203": ["convolution_gpu_bfyx_gemm_like",2], + "3541538046227217664": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15217573782563469232": ["convolution_gpu_yxfb_yxio_b16",2], + "11587239927319376658": ["convolution_gpu_bfyx_gemm_like",2], + "528295119724008711": ["convolution_gpu_bfyx_os_iyx_osv16",52], + "15985980444340490463": ["convolution_gpu_yxfb_yxio_b16",2], + "1044889231088602677": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "14264584839702225855": ["convolution_gpu_yxfb_yxio_b16",2], + "1186545671730357033": ["convolution_gpu_bfyx_os_iyx_osv16",1024], + "12393385058735194260": ["convolution_gpu_bfyx_gemm_like",2], + "6469003096932778978": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "11787674847611032323": ["convolution_gpu_yxfb_yxio_b16",2], + "15646081020506130125": ["convolution_gpu_yxfb_yxio_b16",2], + "8463615810239412362": ["convolution_gpu_bfyx_1x1",2], + "9735280865199145311": ["convolution_gpu_yxfb_yxio_b16",2], + "3265415000818832667": ["convolution_gpu_bfyx_gemm_like",2], + "5374664689223295796": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "7859659993155959174": ["convolution_gpu_yxfb_yxio_b16",2], + "4436244774193918646": ["fully_connected_gpu_fb_oi_ref",1], + "2204178900998688268": ["convolution_gpu_bfyx_gemm_like",2], + "10718639465064821919": ["convolution_gpu_yxfb_yxio_b16",2], + "7602222004475424358": ["convolution_gpu_bfyx_gemm_like",1], + "1161304401293419103": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12686015414958770329": ["convolution_gpu_bfyx_gemm_like",2], + "17051718450741106678": ["convolution_gpu_yxfb_yxio_b16",2], + "7624259732952222597": ["convolution_gpu_bfyx_gemm_like",2], + "2162882863309264684": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "14349625788399542568": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "17945600479510493949": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3383222668132648804": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7210854698870587826": ["convolution_gpu_yxfb_yxio_b16",2], + "1298596164164324360": ["convolution_gpu_yxfb_yxio_b16",2], + "7162155897369277782": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5317076157086789437": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "9900658671239107502": ["convolution_gpu_bfyx_1x1",2], + "12757611260347801001": ["convolution_gpu_bfyx_os_iyx_osv16",1071], + "11799179287124317845": ["convolution_gpu_bfyx_gemm_like",1], + "14339479547451422762": ["convolution_gpu_yxfb_yxio_b16",2], + "13576010631084066792": ["convolution_gpu_yxfb_yxio_b16",1], + "1299545313185409227": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "13970935346154374605": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "4353583636655606632": ["convolution_gpu_yxfb_yxio_b16",1], + "16020916772006653269": ["convolution_gpu_bfyx_1x1",1], + "5596441339918073261": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "733956743303342862": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6880746917399866285": ["convolution_gpu_bfyx_gemm_like",2], + "6992073477131490452": ["convolution_gpu_bfyx_gemm_like",2], + "15865753975271064117": ["convolution_gpu_yxfb_yxio_b16",2], + "15275978123703636572": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "4313392430539923574": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "16748662918272106932": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1841155673858789206": ["fully_connected_gpu_fb_oi_ref",2], + "3870539490799697188": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "8174040194088942964": ["convolution_gpu_bfyx_os_iyx_osv16",945], + "4839205075057964902": ["convolution_gpu_yxfb_yxio_b16",2], + "738850098651678143": ["convolution_gpu_bfyx_os_iyx_osv16",542], + "11657946392097042544": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "952318454591754214": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "14501815053459103515": ["convolution_gpu_yxfb_yxio_b16",2], + "3684792790546138809": ["convolution_gpu_yxfb_yxio_b16",2], + "12032580551021546487": ["convolution_gpu_yxfb_yxio_b16",2], + "6709083009339039603": ["convolution_gpu_yxfb_yxio_b16",2], + "16828388628569377322": ["convolution_gpu_yxfb_yxio_b16",2], + "10816637153861630723": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "11637325834858582585": ["convolution_gpu_bfyx_gemm_like",2], + "9795194069954915563": ["convolution_gpu_bfyx_gemm_like",2], + "1507504848332592003": ["convolution_gpu_yxfb_yxio_b16",1], + "16426655160932259558": ["convolution_gpu_yxfb_yxio_b16",2], + "4644580321919256401": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "8527069404111265568": ["convolution_gpu_bfyx_os_iyx_osv16",434], + "7280502812960451465": ["convolution_gpu_yxfb_yxio_b16",2], + "16532743776403877084": ["convolution_gpu_yxfb_yxio_b16",1], + "5657471280535146301": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2085467192625870436": ["convolution_gpu_bfyx_gemm_like",2], + "7168028033666253263": ["convolution_gpu_bfyx_gemm_like",2], + "12303905514885913537": ["convolution_gpu_yxfb_yxio_b16",1], + "15612797125081819500": ["convolution_gpu_yxfb_yxio_b16",2], + "9452094307760005150": ["convolution_gpu_bfyx_gemm_like",2], + "13862199647000195451": ["convolution_gpu_yxfb_yxio_b16",2], + "706049518431331645": ["convolution_gpu_bfyx_gemm_like",2], + "8372855367097191197": ["convolution_gpu_yxfb_yxio_b16",2], + "14544219140091420262": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "17228615388053183744": ["convolution_gpu_yxfb_yxio_b16",2], + "16606674008248299103": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "8726274320876550785": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12892265081710606252": ["convolution_gpu_yxfb_yxio_b16",1], + "7826406759309418010": ["convolution_gpu_yxfb_yxio_b16",2], + "8078028207842958010": ["convolution_gpu_yxfb_yxio_b16",2], + "16723478941106779069": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "6631103268546309714": ["convolution_gpu_yxfb_yxio_b16",2], + "231083216612056805": ["convolution_gpu_yxfb_yxio_b16",2], + "9951951467222189282": ["convolution_gpu_yxfb_yxio_b16",2], + "12755692101476964677": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "2295659951331099829": ["convolution_gpu_yxfb_yxio_b16",2], + "17019474731460049248": ["convolution_gpu_yxfb_yxio_b16",2], + "16579057939215877904": ["convolution_gpu_bfyx_gemm_like",2], + "17408275657360833363": ["convolution_gpu_bfyx_1x1",2], + "11279789373735965856": ["convolution_gpu_yxfb_yxio_b16",2], + "16765994345605657100": ["convolution_gpu_bfyx_1x1",1], + "10308431308942416781": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "3499106702307464480": ["convolution_gpu_bfyx_gemm_like",2], + "8541982562061181756": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13553263424160050064": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "6808980404170272597": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "13234872695521811652": ["convolution_gpu_yxfb_yxio_b16",1], + "921209976738626097": ["convolution_gpu_yxfb_yxio_b16",2], + "17321934232458063571": ["convolution_gpu_yxfb_yxio_b16",2], + "2042946928570163140": ["convolution_gpu_yxfb_yxio_b16",2], + "17542035367134614728": ["convolution_gpu_yxfb_yxio_b16",1], + "5649082203775427830": ["convolution_gpu_bfyx_gemm_like",2], + "13634686998599681086": ["convolution_gpu_yxfb_yxio_b16",2], + "13223232888554043645": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3531786338249174486": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "6156831095718536092": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "2242602888499888844": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "14363025045807200040": ["convolution_gpu_bfyx_os_iyx_osv16",541], + "16567638487719493784": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "1902656726461670148": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9291397338108903174": ["convolution_gpu_yxfb_yxio_b16",2], + "10635659193402005820": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "17264010982688979937": ["convolution_gpu_bfyx_1x1",2], + "16924006268301179157": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "4819131094439732065": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "7748233564411787605": ["convolution_gpu_bfyx_gemm_like",2], + "4435224497850514394": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "9601412379897937608": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "11972290239275366299": ["convolution_gpu_yxfb_yxio_b16",2], + "2553539191926275121": ["convolution_gpu_yxfb_yxio_b16",2], + "8866716292621164810": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12932635875905153141": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "2133236128630074068": ["convolution_gpu_yxfb_yxio_b16",2], + "3571330754519284334": ["convolution_gpu_yxfb_yxio_b16",2], + "7693459946348737411": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "3574585436812909168": ["convolution_gpu_yxfb_yxio_b16",1], + "4561778392194061215": ["convolution_gpu_yxfb_yxio_b16",1], + "3701838669605585798": 
["convolution_gpu_yxfb_yxio_b16",2], + "14466032674083938714": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6192955702438301372": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "15783429395177379897": ["convolution_gpu_yxfb_yxio_b16",2], + "12160764253455777655": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "6685985905221810743": ["convolution_gpu_yxfb_yxio_b16",1], + "2102169562353089558": ["convolution_gpu_yxfb_yxio_b16",2], + "8220168481755031959": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "13810735868750326592": ["convolution_gpu_bfyx_os_iyx_osv16",806], + "4883588237027084166": ["convolution_gpu_yxfb_yxio_b16",2], + "8219179055259247644": ["convolution_gpu_yxfb_yxio_b16",2], + "15548854462657362014": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "17769159396346490074": ["convolution_gpu_yxfb_yxio_b16",1], + "7263796835299019284": ["convolution_gpu_bfyx_gemm_like",2], + "3477539135137665170": ["convolution_gpu_bfyx_gemm_like",2], + "5303970743736042689": ["convolution_gpu_bfyx_gemm_like",2], + "10049294964307823692": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14159596290442764023": ["convolution_gpu_bfyx_gemm_like",1], + "5429130923188159806": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "5682190700442712936": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "13092232276822302626": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "18357544235608006954": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "11315238071192463859": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "14116800584981026541": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "12900949103593247293": ["convolution_gpu_bfyx_direct_10_12_16",0], + "13760645810144930270": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "15597317305719116351": ["convolution_gpu_yxfb_yxio_b16",2], + "7082007579524697455": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1591199515536783245": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "9778670810863940690": ["convolution_gpu_yxfb_yxio_b16",2], + "16105073808368936420": ["convolution_gpu_bfyx_gemm_like",2], + "13183380647506951324": ["convolution_gpu_bfyx_gemm_like",2], + "13754540732991287617": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "13101474064130881526": ["convolution_gpu_yxfb_yxio_b16",2], + "13680926356824317761": ["convolution_gpu_bfyx_os_iyx_osv16",54], + "18029395208219861440": ["convolution_gpu_yxfb_yxio_b16",2], + "13483175684542464385": ["convolution_gpu_bfyx_os_iyx_osv16",148], + "3441335188113424896": ["convolution_gpu_bfyx_gemm_like",2], + "11267495078361954131": ["convolution_gpu_yxfb_yxio_b16",2], + "1520529227443340435": ["convolution_gpu_bfyx_gemm_like",2], + "8300290944865904942": ["convolution_gpu_yxfb_yxio_b16",1], + "12561177248542630652": ["convolution_gpu_yxfb_yxio_b16",2], + "11300415556407923335": ["convolution_gpu_yxfb_yxio_b16",2], + "3503893875515897267": ["convolution_gpu_bfyx_gemm_like",2], + "7241156141838776126": ["convolution_gpu_bfyx_gemm_like",1], + "15138641310139776109": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "15483343060578660278": ["convolution_gpu_yxfb_yxio_b16",2], + "5211831143687501130": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "6051877311645456194": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "5115007207028125638": ["convolution_gpu_bfyx_gemm_like",2], + "13139625572508441980": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "17965267346493659374": ["convolution_gpu_yxfb_yxio_b16",2], + "9999553425206328238": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "530973311459168543": ["convolution_gpu_yxfb_yxio_b16",2], + 
"15988378956341507229": ["convolution_gpu_yxfb_yxio_b16",2], + "6648876837655776653": ["convolution_gpu_bfyx_1x1",2], + "9056038338958199256": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "10645057595080511813": ["convolution_gpu_yxfb_yxio_b16",2], + "3835387982926010630": ["convolution_gpu_yxfb_yxio_b16",2], + "4802014352392262053": ["convolution_gpu_yxfb_yxio_b16",2], + "10577357333308653027": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "7875724726741958520": ["fully_connected_gpu_fb_oi_ref",2], + "1375084615110147615": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1237262535285717993": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "14532519639619315651": ["convolution_gpu_bfyx_gemm_like",2], + "7027962921778599989": ["convolution_gpu_yxfb_yxio_b16",1], + "18235209540858013173": ["convolution_gpu_bfyx_1x1",2], + "6970636030494405299": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "12375919467924385618": ["convolution_gpu_bfyx_os_iyx_osv16",483], + "10531218595816974659": ["convolution_gpu_bfyx_gemm_like",2], + "15757308772667178999": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "1450861513159359637": ["convolution_gpu_yxfb_yxio_b16",2], + "5062815196458225737": ["convolution_gpu_bfyx_os_iyx_osv16",487], + "5464801565268066541": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7209217811135076623": ["convolution_gpu_bfyx_gemm_like",2], + "11666226259183201584": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "6142707387281700290": ["convolution_gpu_bfyx_gemm_like",2], + "17397600088595751782": ["convolution_gpu_yxfb_yxio_b16",2], + "5525691792821548743": ["convolution_gpu_yxfb_yxio_b16",2], + "11910900938442124765": ["convolution_gpu_bfyx_gemm_like",2], + "1626430741965136732": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17823133607491820214": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "13575423234109624706": ["fully_connected_gpu_yxfb_ref",2], + "480310470450900836": ["convolution_gpu_bfyx_gemm_like",2], + "4656068024153891922": ["convolution_gpu_yxfb_yxio_b16",1], + "14616413139039308367": ["fully_connected_gpu_fb_oi_ref",2], + "3571959174116404960": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "5898740235388207878": ["convolution_gpu_bfyx_1x1",2], + "8767817856303586064": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "14971506154649368216": ["convolution_gpu_yxfb_yxio_b16",1], + "12339692995143159283": ["convolution_gpu_bfyx_gemm_like",1], + "11666250400445971335": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "16425665058951535484": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "12522495848240087966": ["convolution_gpu_bfyx_gemm_like",1], + "3509487327001107638": ["convolution_gpu_bfyx_gemm_like",2], + "10055247339012492459": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "2356785927637873692": ["convolution_gpu_bfyx_gemm_like",2], + "959666756751640874": ["convolution_gpu_yxfb_yxio_b16",2], + "11002875874008272679": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "16522364268583242080": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "10512507780534402341": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "14898892437285105327": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "4056971751486746551": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "1582751548472076534": ["convolution_gpu_yxfb_yxio_b16",2], + "5032841266226405428": ["convolution_gpu_yxfb_yxio_b16",2], + "14553813154800569861": ["convolution_gpu_yxfb_yxio_b16",2], + "12825407709419526493": ["convolution_gpu_yxfb_yxio_b16",2], + "9525535670799618110": ["convolution_gpu_bfyx_gemm_like",2], + "14289082888174784976": 
["convolution_gpu_bfyx_os_iyx_osv16",557], + "7692849839965441330": ["convolution_gpu_bfyx_os_iyx_osv16",252], + "5541365322085427177": ["convolution_gpu_yxfb_yxio_b16",2], + "17546090415334871175": ["convolution_gpu_yxfb_yxio_b16",2], + "10617442099961865960": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8509024280905303927": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17649961873981897621": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5924271203978892761": ["convolution_gpu_yxfb_yxio_b16",2], + "10135458965276110244": ["convolution_gpu_bfyx_1x1",2], + "9440117898128288296": ["convolution_gpu_bfyx_gemm_like",2], + "4137755981477177003": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "9340159617983543624": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "10009559358571629502": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13571587312517912280": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "2705031521944165712": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "10128143628088846123": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "12210280332071091209": ["fully_connected_gpu_fb_oi_ref",1], + "2761862049452027986": ["convolution_gpu_yxfb_yxio_b16",2], + "3374410641320310726": ["convolution_gpu_bfyx_os_iyx_osv16",904], + "11626402549863483301": ["convolution_gpu_bfyx_os_iyx_osv16",648], + "9250030880535336888": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16266491618150971928": ["convolution_gpu_bfyx_os_iyx_osv16",596], + "10323345824599612614": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "8108933468437926367": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "10117092543913369513": ["convolution_gpu_yxfb_yxio_b16",2], + "708747442142592697": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "6334639534663495263": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "16075006181495932250": ["convolution_gpu_bfyx_gemm_like",2], + "16996895381161031110": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "5308128387928804050": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "12868739680413736657": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "1230262279011217327": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10015368609444108372": ["convolution_gpu_yxfb_yxio_b16",2], + "17958575161092859465": ["convolution_gpu_yxfb_yxio_b16",1], + "12619739385084492771": ["convolution_gpu_yxfb_yxio_b16",2], + "7349168847581850619": ["convolution_gpu_yxfb_yxio_b16",2], + "14801210545983960599": ["convolution_gpu_yxfb_yxio_b16",2], + "488798544312719183": ["convolution_gpu_yxfb_yxio_b16",2], + "415826393421796195": ["convolution_gpu_yxfb_yxio_b16",2], + "403634422724914329": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "15117880293418979489": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "13754408679115174221": ["convolution_gpu_bfyx_gemm_like",2], + "7878605163588288309": ["convolution_gpu_bfyx_os_iyx_osv16",456], + "2321148334382088982": ["convolution_gpu_bfyx_gemm_like",2], + "3563872903821081702": ["convolution_gpu_bfyx_direct_10_12_16",2], + "143667964449473415": ["convolution_gpu_yxfb_yxio_b16",0], + "7469107606686458209": ["convolution_gpu_yxfb_yxio_b16",2], + "1822096761703761792": ["convolution_gpu_bfyx_1x1",2], + "14943031375539993004": ["convolution_gpu_yxfb_yxio_b16",2], + "14307705501349750896": ["convolution_gpu_yxfb_yxio_b16",2], + "12107262410635772120": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8094920912208664820": ["convolution_gpu_yxfb_yxio_b16",2], + "13111122805945249561": ["convolution_gpu_yxfb_yxio_b16",2], + "2052010432187897741": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "9153779186876518773": 
["convolution_gpu_bfyx_gemm_like",2], + "7314288062932060863": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "11822555173696078282": ["convolution_gpu_bfyx_gemm_like",0], + "11612209645710419427": ["convolution_gpu_yxfb_yxio_b16",2], + "9062781751511609244": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "12937333118472722002": ["convolution_gpu_bfyx_gemm_like",2], + "13282951481330978659": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "14671212883301405408": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5250257911846706612": ["convolution_gpu_yxfb_yxio_b16",2], + "3806131437010910920": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8881906040469243354": ["convolution_gpu_yxfb_yxio_b16",2], + "5941852872160795604": ["convolution_gpu_bfyx_gemm_like",2], + "13598984763955239116": ["convolution_gpu_bfyx_gemm_like",0], + "9120377367517042357": ["convolution_gpu_bfyx_1x1",2], + "10576856554114055028": ["convolution_gpu_bfyx_gemm_like",2], + "7105279481103494151": ["fully_connected_gpu_fb_oi_ref",1], + "15643135666029727865": ["convolution_gpu_bfyx_gemm_like",2], + "2803569867265035123": ["convolution_gpu_bfyx_os_iyx_osv16",1029], + "7720153213673170931": ["convolution_gpu_yxfb_yxio_b16",2], + "4010419602093863685": ["convolution_gpu_yxfb_yxio_b16",2], + "7274647463152753603": ["convolution_gpu_yxfb_yxio_b16",2], + "17030051116023319382": ["convolution_gpu_yxfb_yxio_b16",1], + "794499287296495726": ["convolution_gpu_bfyx_1x1",2], + "4802009650745059499": ["convolution_gpu_yxfb_yxio_b16",2], + "9999955037598579164": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "138379779469699309": ["convolution_gpu_bfyx_gemm_like",2], + "14968401410355925289": ["convolution_gpu_yxfb_yxio_b16",2], + "11132679855317294753": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4622514167765722873": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "15600841108426475615": ["convolution_gpu_yxfb_yxio_b16",2], + "6104380778870471127": ["convolution_gpu_bfyx_1x1",2], + "3114210363452108737": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "1895560603400089814": ["convolution_gpu_yxfb_yxio_b16",1], + "11516184047320372729": ["convolution_gpu_yxfb_yxio_b16",2], + "16044646335477470657": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15848096609835347542": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "8898095926967052382": ["convolution_gpu_yxfb_yxio_b16",2], + "5758133252959371492": ["convolution_gpu_bfyx_gemm_like",1], + "17822988909419777692": ["convolution_gpu_yxfb_yxio_b16",1], + "15006321421735686121": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "9434143681116089888": ["convolution_gpu_bfyx_gemm_like",2], + "17712558058168648648": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "994842991399671507": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18393312550272875456": ["convolution_gpu_bfyx_1x1",2], + "8163000689380461611": ["convolution_gpu_yxfb_yxio_b16",1], + "14612206111651511130": ["convolution_gpu_yxfb_yxio_b16",2], + "10065714384927707796": ["convolution_gpu_yxfb_yxio_b16",2], + "9319254979377483709": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2915777749501772828": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9588943054777767098": ["convolution_gpu_yxfb_yxio_b16",2], + "2567046336192437734": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "13851025202247070979": ["convolution_gpu_yxfb_yxio_b16",1], + "13380637319403400851": ["convolution_gpu_yxfb_yxio_b16",2], + "3321251856445833973": ["convolution_gpu_yxfb_yxio_b16",1], + "2668729552208169959": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13325287783358291692": 
["convolution_gpu_yxfb_yxio_b16",2], + "7863319552895863063": ["convolution_gpu_yxfb_yxio_b16",2], + "1771347579022727189": ["convolution_gpu_yxfb_yxio_b16",2], + "16781187505186394353": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "383721620126444793": ["convolution_gpu_bfyx_gemm_like",2], + "981803877097233095": ["convolution_gpu_yxfb_yxio_b16",2], + "7603319690872333930": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "3784684114139223050": ["convolution_gpu_yxfb_yxio_b16",2], + "13731797251725972855": ["convolution_gpu_yxfb_yxio_b16",2], + "17228810554159747400": ["convolution_gpu_bfyx_gemm_like",2], + "7940369586324090841": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "518733575377143679": ["convolution_gpu_yxfb_yxio_b16",2], + "10324485383646920518": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "14066675688397331406": ["convolution_gpu_yxfb_yxio_b16",2], + "6730447536124542965": ["convolution_gpu_yxfb_yxio_b16",1], + "8537824547722216155": ["convolution_gpu_yxfb_yxio_b16",1], + "6344600111737335616": ["convolution_gpu_yxfb_yxio_b16",2], + "15493488989417521388": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "2603233376890892194": ["convolution_gpu_yxfb_yxio_b16",2], + "12600479027568241746": ["convolution_gpu_yxfb_yxio_b16",2], + "1379758215293949563": ["convolution_gpu_yxfb_yxio_b16",2], + "17893696934478535385": ["convolution_gpu_yxfb_yxio_b16",2], + "11498084465186986412": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "13565691057064774487": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "2373860353284525265": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "11411580529501121244": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "9569522500959727054": ["convolution_gpu_yxfb_yxio_b16",2], + "6371463287631658789": ["convolution_gpu_bfyx_gemm_like",2], + "10330180429524641331": ["convolution_gpu_bfyx_gemm_like",2], + "2984726467649419856": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "6450532136308941035": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "4740864135937875560": ["convolution_gpu_yxfb_yxio_b16",1], + "13330734840729670622": ["convolution_gpu_bfyx_gemm_like",2], + "10106454449619141260": ["convolution_gpu_bfyx_1x1",2], + "1594829714229111215": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "727216855315869048": ["convolution_gpu_yxfb_yxio_b16",2], + "5044721291675005144": ["convolution_gpu_bfyx_1x1",2], + "8712136292276123857": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "9909564412554801760": ["convolution_gpu_yxfb_yxio_b16",2], + "9250410390663336388": ["convolution_gpu_bfyx_gemm_like",1], + "8609939102588915855": ["convolution_gpu_bfyx_gemm_like",2], + "17310409067211414565": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "3219408878901707426": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10022487076451608714": ["convolution_gpu_bfyx_gemm_like",2], + "1338705434700924127": ["convolution_gpu_bfyx_1x1",1], + "2737352811173555281": ["convolution_gpu_yxfb_yxio_b16",2], + "13471752029049484143": ["convolution_gpu_bfyx_gemm_like",2], + "14311888412221174224": ["convolution_gpu_yxfb_yxio_b16",2], + "16015963261509760799": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "11376953876369788199": ["convolution_gpu_yxfb_yxio_b16",1], + "3463206409786541741": ["convolution_gpu_yxfb_yxio_b16",2], + "15217077412685024074": ["convolution_gpu_yxfb_yxio_b16",2], + "6792281830591233968": ["convolution_gpu_yxfb_yxio_b16",2], + "9504349455215835807": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "10546430708947911124": ["convolution_gpu_bfyx_gemm_like",0], + "13602140021189675477": 
["convolution_gpu_bfyx_gemm_like",2], + "13369751385866224286": ["convolution_gpu_yxfb_yxio_b16",2], + "6367371992814643260": ["convolution_gpu_yxfb_yxio_b16",2], + "10894058425957901202": ["convolution_gpu_bfyx_1x1",2], + "16158139166784964096": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "11356842300444410831": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "6260684231055362504": ["convolution_gpu_yxfb_yxio_b16",2], + "1088710562928089772": ["convolution_gpu_yxfb_yxio_b16",2], + "14799579913711096584": ["convolution_gpu_bfyx_gemm_like",1], + "14383657211047876136": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "3163833930628348446": ["convolution_gpu_yxfb_yxio_b16",2], + "4282668574670785584": ["convolution_gpu_bfyx_gemm_like",2], + "8640150341228170279": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "9767294641786972359": ["convolution_gpu_bfyx_gemm_like",2], + "14896875712028630045": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "9999543693712389402": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4099828484175044842": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "2172999245833525797": ["convolution_gpu_yxfb_yxio_b16",2], + "13558687084677943158": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "16362857896338778056": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "2161052921317193579": ["convolution_gpu_bfyx_gemm_like",2], + "12259611546528256409": ["convolution_gpu_yxfb_yxio_b16",1], + "9439431829175743345": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11857037689248685487": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2129726780118554358": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "10722677916294015259": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "4447065688824381344": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "13456967132681889167": ["convolution_gpu_yxfb_yxio_b16",2], + "104765009188090817": ["convolution_gpu_yxfb_yxio_b16",2], + "1643122514049603104": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "2781309272856442321": ["convolution_gpu_bfyx_1x1",1], + "12978370505631031751": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "15529757761327002288": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "7870154008378361670": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8409488188696700816": ["convolution_gpu_bfyx_gemm_like",1], + "3499645386058307669": ["convolution_gpu_bfyx_gemm_like",1], + "16729849855476690294": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "10783046011829953095": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2609346307827449622": ["convolution_gpu_yxfb_yxio_b16",2], + "314054598858070952": ["convolution_gpu_bfyx_gemm_like",2], + "12776081190690731910": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "6711878663358611849": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "9314293064351558241": ["convolution_gpu_bfyx_gemm_like",2], + "10090923790949378407": ["convolution_gpu_bfyx_gemm_like",2], + "11155444222714959508": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "5008541841892687897": ["convolution_gpu_yxfb_yxio_b16",2], + "1270307036687208396": ["convolution_gpu_bfyx_gemm_like",1], + "5534071639452404412": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4278280309700908015": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "11872943152839631823": ["convolution_gpu_bfyx_gemm_like",2], + "8717456809499914445": ["convolution_gpu_yxfb_yxio_b16",2], + "10254566865260697753": ["convolution_gpu_yxfb_yxio_b16",2], + "13809046727894108358": ["convolution_gpu_yxfb_yxio_b16",2], + "3534874664568214253": ["convolution_gpu_bfyx_1x1",2], + "13717351126657739994": 
["convolution_gpu_yxfb_yxio_b16",1], + "6911215749850066204": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "10432365444137108781": ["convolution_gpu_bfyx_gemm_like",2], + "10009796094612770326": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "11971736882960844905": ["convolution_gpu_yxfb_yxio_b16",2], + "3567607339495161307": ["convolution_gpu_yxfb_yxio_b16",2], + "14916236722843741326": ["convolution_gpu_yxfb_yxio_b16",2], + "16955829428734830876": ["convolution_gpu_yxfb_yxio_b16",1], + "9696168324381001582": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8859895010324601937": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "11537166370263116277": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "9589942627115344216": ["convolution_gpu_bfyx_os_iyx_osv16",102], + "4261192887643002603": ["convolution_gpu_bfyx_gemm_like",2], + "14041970415787494000": ["convolution_gpu_yxfb_yxio_b16",2], + "12643643553436503069": ["convolution_gpu_yxfb_yxio_b16",2], + "7440953406601377619": ["convolution_gpu_yxfb_yxio_b16",2], + "7134419022268272901": ["convolution_gpu_yxfb_yxio_b16",2], + "4967444801764057340": ["convolution_gpu_bfyx_os_iyx_osv16",361], + "14362876471450307424": ["convolution_gpu_bfyx_1x1",2], + "16292848987976256449": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "11280672272221124024": ["convolution_gpu_yxfb_yxio_b16",2], + "3442845193734599342": ["convolution_gpu_yxfb_yxio_b16",2], + "7958443549125799229": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "9105127035114339269": ["convolution_gpu_yxfb_yxio_b16",1], + "15800447082078291243": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "5149303626508247520": ["convolution_gpu_yxfb_yxio_b16",2], + "18302892230881285207": ["convolution_gpu_bfyx_gemm_like",1], + "13077012961563218195": ["convolution_gpu_yxfb_yxio_b16",2], + "5642822685234782052": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "3748621266324665764": ["convolution_gpu_yxfb_yxio_b16",2], + "15915715422308762909": ["convolution_gpu_bfyx_os_iyx_osv16",274], + "17908444616754154471": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2909347733581487795": ["convolution_gpu_yxfb_yxio_b16",1], + "13075579052866074866": ["convolution_gpu_bfyx_gemm_like",2], + "5209144536543011657": ["convolution_gpu_yxfb_yxio_b16",1], + "8740268039366363321": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13938466156916423478": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "17430994325635361377": ["convolution_gpu_yxfb_yxio_b16",2], + "9293682866734263821": ["convolution_gpu_yxfb_yxio_b16",2], + "5459463503840817402": ["convolution_gpu_bfyx_1x1",2], + "15675903059949404837": ["convolution_gpu_bfyx_1x1",2], + "3805667660217578518": ["convolution_gpu_yxfb_yxio_b16",2], + "10171373375072694210": ["convolution_gpu_bfyx_1x1",2], + "8809017515482311843": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "14517191894006411358": ["convolution_gpu_yxfb_yxio_b16",2], + "1241355545294259810": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "378292944207609677": ["convolution_gpu_yxfb_yxio_b16",2], + "248133885018839814": ["convolution_gpu_yxfb_yxio_b16",2], + "14697908554930995949": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "2362092095402043749": ["convolution_gpu_bfyx_gemm_like",2], + "5355283113999405036": ["convolution_gpu_yxfb_yxio_b16",1], + "8553491894663686698": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "8113660920207936963": ["convolution_gpu_yxfb_yxio_b16",2], + "3034482898462686729": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "14668725050395069435": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + 
"6214677989814002369": ["convolution_gpu_yxfb_yxio_b16",2], + "16626502801066228405": ["convolution_gpu_yxfb_yxio_b16",1], + "15838113905712517735": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "3750338655074082587": ["fully_connected_gpu_yxfb_ref",0], + "12867590715338247144": ["convolution_gpu_yxfb_yxio_b16",1], + "302694026179841870": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "11955992313739654625": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "10864011008000364415": ["convolution_gpu_bfyx_1x1",2], + "16527840366172690992": ["convolution_gpu_yxfb_yxio_b16",2], + "18101509783610609787": ["convolution_gpu_yxfb_yxio_b16",2], + "4079026972040047969": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "4754967381316623440": ["convolution_gpu_bfyx_gemm_like",2], + "16966477504105790279": ["convolution_gpu_yxfb_yxio_b16",2], + "7183578232279711009": ["convolution_gpu_bfyx_gemm_like",2], + "4708035980731751007": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14115742296883450319": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16135569134646688251": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "12028665820838352309": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2343310394723780653": ["convolution_gpu_yxfb_yxio_b16",2], + "7451154080124553318": ["convolution_gpu_yxfb_yxio_b16",2], + "1900375942069325499": ["convolution_gpu_bfyx_1x1",2], + "5788323787676797805": ["convolution_gpu_bfyx_os_iyx_osv16",430], + "8002233052700666718": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "9726913113016874092": ["convolution_gpu_bfyx_gemm_like",2], + "230697511447695268": ["convolution_gpu_yxfb_yxio_b16",2], + "9775648000771985077": ["convolution_gpu_yxfb_yxio_b16",2], + "10278515360013727367": ["convolution_gpu_yxfb_yxio_b16",2], + "10524079700393212963": ["convolution_gpu_yxfb_yxio_b16",2], + "12228610148087508521": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "14918482938530107806": ["convolution_gpu_bfyx_gemm_like",2], + "10262850086265676378": ["convolution_gpu_yxfb_yxio_b16",2], + "7289535479247584635": ["convolution_gpu_bfyx_1x1",2], + "17377204616846724192": ["convolution_gpu_bfyx_gemm_like",2], + "402932154499003993": ["convolution_gpu_yxfb_yxio_b16",2], + "5179013491581036103": ["convolution_gpu_yxfb_yxio_b16",2], + "16293101831324587788": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "13663893159182636270": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "12362290144183018227": ["convolution_gpu_yxfb_yxio_b16",1], + "15155676074658242659": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "6114241186364821679": ["convolution_gpu_bfyx_os_iyx_osv16",856], + "13954821927253849036": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "1287490919205560806": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "17567012866823126402": ["convolution_gpu_yxfb_yxio_b16",2], + "11724225282274130518": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "3889456478817717702": ["convolution_gpu_yxfb_yxio_b16",2], + "11087413527078604815": ["convolution_gpu_bfyx_gemm_like",2], + "3398322619007806698": ["convolution_gpu_bfyx_direct_10_12_16",1], + "637115537820955017": ["convolution_gpu_yxfb_yxio_b16",2], + "9763754389347695094": ["convolution_gpu_yxfb_yxio_b16",2], + "9383182168277796969": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "11782188262748842182": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "18432787283148809023": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "17951403431757222177": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "16587061389996963349": ["convolution_gpu_bfyx_os_iyx_osv16",535], + 
"2273992727647793692": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4387041763614917736": ["convolution_gpu_bfyx_gemm_like",1], + "12455871938978342189": ["convolution_gpu_yxfb_yxio_b16",2], + "15059549186302099880": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "7624476043779763605": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "18218755616248669884": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "3957253946857103590": ["convolution_gpu_yxfb_yxio_b16",2], + "13369603621524676979": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "17817043205731836063": ["convolution_gpu_yxfb_yxio_b16",2], + "512446355173752600": ["convolution_gpu_yxfb_yxio_b16",2], + "3735753364888836383": ["convolution_gpu_yxfb_yxio_b16",1], + "6650607472019166205": ["convolution_gpu_bfyx_1x1",2], + "10995886682834858002": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "4030004320208162301": ["convolution_gpu_yxfb_yxio_b16",2], + "11262989876326061679": ["convolution_gpu_yxfb_yxio_b16",0], + "748236447365453504": ["convolution_gpu_yxfb_yxio_b16",2], + "7861234698413147249": ["convolution_gpu_yxfb_yxio_b16",2], + "970768445746568749": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "5269172622193124300": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "4562591438007476419": ["convolution_gpu_bfyx_gemm_like",2], + "15293835051273372438": ["convolution_gpu_yxfb_yxio_b16",2], + "8015885733173521367": ["convolution_gpu_yxfb_yxio_b16",2], + "4285475880886685878": ["convolution_gpu_bfyx_gemm_like",2], + "17077815973022307612": ["convolution_gpu_yxfb_yxio_b16",2], + "9111988592015450418": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "13835908664998757647": ["fully_connected_gpu_fb_oi_ref",1], + "10447947790216991304": ["convolution_gpu_bfyx_gemm_like",2], + "5721096633060535553": ["convolution_gpu_yxfb_yxio_b16",2], + "12458921031453334451": ["convolution_gpu_yxfb_yxio_b16",2], + "15816807118780455948": ["convolution_gpu_yxfb_yxio_b16",2], + "1237920404306733800": ["convolution_gpu_bfyx_gemm_like",1], + "9834941975457910988": ["convolution_gpu_yxfb_yxio_b16",2], + "6208201398783088425": ["convolution_gpu_bfyx_gemm_like",2], + "8265982881100325775": ["convolution_gpu_yxfb_yxio_b16",2], + "8638074773026771425": ["convolution_gpu_yxfb_yxio_b16",2], + "6846760451124717672": ["convolution_gpu_yxfb_yxio_b16",1], + "11604111639041106489": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "5465400164581117113": ["convolution_gpu_bfyx_gemm_like",2], + "17723621158215826108": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "5359510718430377298": ["convolution_gpu_yxfb_yxio_b16",2], + "4403753181729432604": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18140951659547259039": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "7600296832974673294": ["convolution_gpu_yxfb_yxio_b16",2], + "3911736807429733938": ["convolution_gpu_yxfb_yxio_b16",2], + "1701412735970485849": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "8527055001340219573": ["convolution_gpu_yxfb_yxio_b16",2], + "2562815925396318565": ["convolution_gpu_yxfb_yxio_b16",2], + "12068974703657294908": ["convolution_gpu_bfyx_1x1",2], + "13323186744342557015": ["convolution_gpu_yxfb_yxio_b16",1], + "2111669705686676421": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "9659837320293869285": ["convolution_gpu_yxfb_yxio_b16",1], + "14188157670969097508": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2457671437276780303": ["convolution_gpu_yxfb_yxio_b16",2], + "11583985978586657985": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "6018481198468872040": ["convolution_gpu_yxfb_yxio_b16",2], + 
"835053793432636355": ["convolution_gpu_yxfb_yxio_b16",2], + "14337168375989245254": ["convolution_gpu_yxfb_yxio_b16",2], + "9127827617126714860": ["fully_connected_gpu_fb_oi_ref",2], + "17170858505976681742": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8131879590716437354": ["convolution_gpu_yxfb_yxio_b16",2], + "182115051096556835": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "1982176363226079588": ["convolution_gpu_bfyx_gemm_like",2], + "13145474177271090694": ["convolution_gpu_bfyx_os_iyx_osv16",950], + "16065744898134487748": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "17174919737114915467": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "5420215220876162902": ["convolution_gpu_yxfb_yxio_b16",2], + "2129742884686884642": ["convolution_gpu_yxfb_yxio_b16",2], + "4241055784642339756": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "7933217973342728190": ["convolution_gpu_yxfb_yxio_b16",2], + "5950285227163574810": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "7868973874302246233": ["convolution_gpu_bfyx_gemm_like",1], + "10232429887105708502": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6439316331231400868": ["convolution_gpu_yxfb_yxio_b16",0], + "16882092367103683293": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16988275131627316108": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "6928136130626403937": ["convolution_gpu_bfyx_gemm_like",2], + "9731370183088819573": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "4416793079965040181": ["convolution_gpu_yxfb_yxio_b16",1], + "9526266653688168429": ["convolution_gpu_yxfb_yxio_b16",2], + "15879385408480411034": ["convolution_gpu_yxfb_yxio_b16",2], + "16913004986170202203": ["convolution_gpu_bfyx_gemm_like",2], + "7181154048972884375": ["convolution_gpu_bfyx_gemm_like",2], + "1122856374602590533": ["convolution_gpu_bfyx_1x1",1], + "677249604491773387": ["convolution_gpu_bfyx_gemm_like",2], + "990199360818917334": ["convolution_gpu_yxfb_yxio_b16",2], + "4455369117448405874": ["convolution_gpu_bfyx_1x1",2], + "3369689552455141157": ["convolution_gpu_yxfb_yxio_b16",2], + "5802466130040230797": ["convolution_gpu_yxfb_yxio_b16",2], + "973966345068677905": ["convolution_gpu_bfyx_1x1",2], + "2728938624042183713": ["convolution_gpu_bfyx_gemm_like",2], + "14526262781657292025": ["convolution_gpu_yxfb_yxio_b16",2], + "12984970933638742657": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "9629460794894999510": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "601430670855155006": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "12439827609628473238": ["convolution_gpu_yxfb_yxio_b16",2], + "17634966178519099371": ["convolution_gpu_bfyx_1x1",2], + "5770286476124511234": ["convolution_gpu_bfyx_gemm_like",1], + "17158401628206867933": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "2114232149447438823": ["convolution_gpu_bfyx_gemm_like",2], + "7650862961269327235": ["convolution_gpu_bfyx_1x1",2], + "3940619509778739158": ["convolution_gpu_yxfb_yxio_b16",1], + "7843180034077880658": ["convolution_gpu_yxfb_yxio_b16",1], + "5195511638783481084": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "17718424965214606218": ["convolution_gpu_yxfb_yxio_b16",2], + "7287107719392705356": ["convolution_gpu_bfyx_os_iyx_osv16",4], + "14835309921389262864": ["convolution_gpu_bfyx_1x1",2], + "14199158130218117084": ["convolution_gpu_bfyx_gemm_like",2], + "394778201589371681": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "14807357397951247957": ["convolution_gpu_yxfb_yxio_b16",1], + "4171374172427814762": ["convolution_gpu_yxfb_yxio_b16",2], + "17281826959243966826": 
["convolution_gpu_bfyx_os_iyx_osv16",375], + "8262469434265124590": ["convolution_gpu_yxfb_yxio_b16",1], + "16541970206584576833": ["convolution_gpu_bfyx_gemm_like",2], + "12004628115138530335": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "11988546375476924356": ["convolution_gpu_bfyx_os_iyx_osv16",431], + "15901724303713479611": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3863816884636503247": ["convolution_gpu_bfyx_gemm_like",2], + "3725013268198063198": ["convolution_gpu_bfyx_1x1",2], + "11942736969933408358": ["convolution_gpu_bfyx_gemm_like",2], + "14897384423894125457": ["convolution_gpu_yxfb_yxio_b16",2], + "12011606174372081253": ["convolution_gpu_yxfb_yxio_b16",2], + "1697248235682953135": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "8907982643256296667": ["convolution_gpu_bfyx_1x1",1], + "8010456208258134834": ["convolution_gpu_yxfb_yxio_b16",2], + "6538526180355194359": ["convolution_gpu_yxfb_yxio_b16",2], + "18359731130169236059": ["convolution_gpu_yxfb_yxio_b16",2], + "6097086855988597139": ["convolution_gpu_bfyx_1x1",2], + "9059418187274548462": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "9942099207256025216": ["convolution_gpu_bfyx_gemm_like",2], + "1680468564927032670": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4261215727469154244": ["convolution_gpu_yxfb_yxio_b16",2], + "4346591404756288097": ["convolution_gpu_bfyx_gemm_like",2], + "14001048251986195179": ["convolution_gpu_bfyx_gemm_like",2], + "6726099352298108756": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6997971129340865650": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "10225565543636007389": ["convolution_gpu_yxfb_yxio_b16",2], + "13094289895577333088": ["convolution_gpu_yxfb_yxio_b16",2], + "15599983560500910839": ["convolution_gpu_yxfb_yxio_b16",2], + "18131954418490925431": ["convolution_gpu_bfyx_os_iyx_osv16",889], + "1919535500129437217": ["convolution_gpu_yxfb_yxio_b16",2], + "5539793555189956907": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "8532217744217419503": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "7581174843529024536": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "6624079551747071383": ["convolution_gpu_yxfb_yxio_b16",1], + "16364494883229084045": ["convolution_gpu_bfyx_gemm_like",2], + "4723919313760470311": ["convolution_gpu_yxfb_yxio_b16",1], + "11324651029379152442": ["convolution_gpu_bfyx_1x1",2], + "3358616456137155015": ["convolution_gpu_yxfb_yxio_b16",2], + "264466528528245004": ["convolution_gpu_yxfb_yxio_b16",1], + "9062774198518904260": ["convolution_gpu_bfyx_gemm_like",2], + "11872464450773754851": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "4732226322522411018": ["fully_connected_gpu_fb_io_b8_f8_vload",0], + "13247725847475539658": ["convolution_gpu_bfyx_1x1",2], + "4168273493370024327": ["convolution_gpu_bfyx_1x1",1], + "860852602930021016": ["convolution_gpu_yxfb_yxio_b16",2], + "15190508870639648203": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15581997249051127645": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "6136232084354304563": ["convolution_gpu_yxfb_yxio_b16",2], + "2744566213784972700": ["convolution_gpu_yxfb_yxio_b16",2], + "2412069259085234287": ["convolution_gpu_yxfb_yxio_b16",1], + "2294800960010879540": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "11841034668170849494": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "11682041005124075890": ["convolution_gpu_yxfb_yxio_b16",2], + "3711525118850629466": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14281154151197472605": ["convolution_gpu_bfyx_os_iyx_osv16",709], + 
"9099720270958987421": ["convolution_gpu_bfyx_1x1",2], + "5568753513029409478": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "9433875341212148858": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "5735703235236456131": ["convolution_gpu_bfyx_os_iyx_osv16",264], + "9815961128076948768": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "9480653639044390919": ["convolution_gpu_bfyx_os_iyx_osv16",344], + "4465701487417893814": ["convolution_gpu_bfyx_gemm_like",2], + "4329042569031331949": ["convolution_gpu_yxfb_yxio_b16",2], + "543472136359161929": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "3780320160034246719": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "1646638859396929303": ["convolution_gpu_yxfb_yxio_b16",2], + "3224352307778512793": ["convolution_gpu_bfyx_gemm_like",1], + "789202969657820559": ["convolution_gpu_yxfb_yxio_b16",2], + "12988253829685880778": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "8978764053524288494": ["convolution_gpu_bfyx_gemm_like",0], + "6935581283700404601": ["convolution_gpu_yxfb_yxio_b16",2], + "3988024997010367546": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "9436893310034662243": ["convolution_gpu_bfyx_gemm_like",2], + "16934879647229234163": ["convolution_gpu_bfyx_gemm_like",2], + "2527276292172180386": ["convolution_gpu_bfyx_gemm_like",2], + "2826762745628486040": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "12782932626966309185": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "12864558900883069118": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "1786732163438555728": ["convolution_gpu_yxfb_yxio_b16",0], + "2149582237161177965": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "12279771749366327372": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "2173867324489962689": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13530377297525480029": ["convolution_gpu_yxfb_yxio_b16",1], + "12495003066477974474": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "9787359208094141129": ["fully_connected_gpu_fb_oi_ref",1], + "6709883527730513363": ["convolution_gpu_yxfb_yxio_b16",2], + "7172357320005702833": ["convolution_gpu_yxfb_yxio_b16",2], + "16139615240471264488": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "6331794802915121861": ["convolution_gpu_yxfb_yxio_b16",2], + "726985753660756762": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "8784358107340738205": ["convolution_gpu_yxfb_yxio_b16",2], + "15428591250165788477": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "2501411300945696806": ["convolution_gpu_yxfb_yxio_b16",2], + "7378840969627751667": ["convolution_gpu_yxfb_yxio_b16",2], + "14568618538516685994": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "6126579157025017808": ["convolution_gpu_yxfb_yxio_b16",2], + "8125500765566111746": ["convolution_gpu_yxfb_yxio_b16",2], + "9663847096617096629": ["convolution_gpu_yxfb_yxio_b16",2], + "2832268621630415376": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "3924212595662208655": ["convolution_gpu_yxfb_yxio_b16",2], + "8585205898894363799": ["convolution_gpu_yxfb_yxio_b16",2], + "7139714914586273766": ["convolution_gpu_bfyx_os_iyx_osv16",517], + "10184417796355593956": ["convolution_gpu_yxfb_yxio_b16",2], + "7605652809856543211": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "8267783192628619295": ["convolution_gpu_yxfb_yxio_b16",2], + "10358170616931426647": ["convolution_gpu_yxfb_yxio_b16",1], + "14403132596827435096": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "14770895149190975433": ["convolution_gpu_yxfb_yxio_b16",1], + "2931988747601319855": ["convolution_gpu_bfyx_1x1",2], + "17891499682354369344": 
["convolution_gpu_bfyx_gemm_like",2], + "4306052436602921234": ["convolution_gpu_yxfb_yxio_b16",2], + "14483314305369207554": ["convolution_gpu_bfyx_1x1",2], + "1930929857644673460": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12992061224471212714": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11359409533744011242": ["convolution_gpu_bfyx_gemm_like",2], + "17054207561525574617": ["convolution_gpu_yxfb_yxio_b16",2], + "9488453013746383896": ["convolution_gpu_bfyx_gemm_like",2], + "6479042072492268780": ["convolution_gpu_yxfb_yxio_b16",2], + "10981374120597916521": ["convolution_gpu_yxfb_yxio_b16",1], + "18424611729838147994": ["convolution_gpu_yxfb_yxio_b16",2], + "7099035779223341587": ["convolution_gpu_yxfb_yxio_b16",2], + "9207799012657103903": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "10811224523636009881": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15866935886105967122": ["convolution_gpu_yxfb_yxio_b16",2], + "8700574100180128776": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "16270745071180354612": ["convolution_gpu_bfyx_gemm_like",2], + "10760094119259477688": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "2920322372993101148": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "9169935203300589222": ["convolution_gpu_yxfb_yxio_b16",1], + "15222260213708019662": ["convolution_gpu_yxfb_yxio_b16",2], + "15154700439767512396": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "15710826363434377015": ["convolution_gpu_yxfb_yxio_b16",2], + "7332664632757815486": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "2706024586717944825": ["convolution_gpu_yxfb_yxio_b16",2], + "6363788325163726004": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14847662630748580880": ["convolution_gpu_yxfb_yxio_b16",2], + "7075659071934895087": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9694701402170070080": ["convolution_gpu_yxfb_yxio_b16",2], + "7540655869186258692": ["convolution_gpu_yxfb_yxio_b16",2], + "15223164574152266895": ["convolution_gpu_bfyx_1x1",2], + "1309867416606346543": ["convolution_gpu_bfyx_os_iyx_osv16",195], + "16632786413927045192": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12672995204641007004": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "12213908871711628660": ["convolution_gpu_yxfb_yxio_b16",2], + "3815222814331650224": ["convolution_gpu_yxfb_yxio_b16",2], + "10833423331830484028": ["convolution_gpu_yxfb_yxio_b16",2], + "1470933384474984858": ["convolution_gpu_bfyx_1x1",2], + "151851883170419907": ["convolution_gpu_yxfb_yxio_b16",2], + "1798440805196304745": ["convolution_gpu_yxfb_yxio_b16",2], + "15352064186447212862": ["convolution_gpu_yxfb_yxio_b16",2], + "4861982518177129729": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "9195732599757736182": ["convolution_gpu_bfyx_os_iyx_osv16",137], + "2722062599746670336": ["convolution_gpu_yxfb_yxio_b16",2], + "11070968498963106073": ["fully_connected_gpu_fb_io_block_fp16",2], + "856949500975232838": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "12489973984967168447": ["convolution_gpu_bfyx_1x1",2], + "18180655791734632264": ["convolution_gpu_bfyx_gemm_like",2], + "13621339501067135142": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "17705992851440826353": ["convolution_gpu_yxfb_yxio_b16",2], + "2575631797904040925": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "17724604495865223459": ["convolution_gpu_bfyx_gemm_like",2], + "1711220333751274603": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "4544242784357021697": ["convolution_gpu_bfyx_gemm_like",2], + "12771841901357553928": ["convolution_gpu_yxfb_yxio_b16",2], + 
"17084977396231597605": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5334291640387922287": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "7992444232916226938": ["convolution_gpu_yxfb_yxio_b16",1], + "13434576226708227155": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "10437367877444543776": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "4849343880559509889": ["convolution_gpu_bfyx_1x1",2], + "11086464266772450142": ["convolution_gpu_yxfb_yxio_b16",2], + "15464554714318666871": ["convolution_gpu_yxfb_yxio_b16",2], + "11799180632798787251": ["convolution_gpu_yxfb_yxio_b16",2], + "17704040183891532914": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "11597391933877736800": ["convolution_gpu_bfyx_gemm_like",2], + "4021558014531645922": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "9078447949109922472": ["convolution_gpu_yxfb_yxio_b16",2], + "12831123539633580270": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "272730229972987861": ["convolution_gpu_bfyx_os_iyx_osv16",1035], + "108442764389420633": ["convolution_gpu_yxfb_yxio_b16",2], + "3788462090984291082": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "15551338663759394064": ["convolution_gpu_yxfb_yxio_b16",1], + "10538010212480716275": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "14173531787508017136": ["convolution_gpu_yxfb_yxio_b16",2], + "12213354854947437262": ["convolution_gpu_bfyx_1x1",2], + "12867177334690636800": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "16585502133291740543": ["convolution_gpu_yxfb_yxio_b16",2], + "11070446574652704629": ["convolution_gpu_yxfb_yxio_b16",2], + "9477562342190423343": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "4023281997496669037": ["convolution_gpu_yxfb_yxio_b16",2], + "9967101735808367971": ["convolution_gpu_bfyx_1x1",2], + "2057158988261512114": ["convolution_gpu_bfyx_1x1",2], + "14263055580023018733": ["convolution_gpu_yxfb_yxio_b16",2], + "688897645422834994": ["convolution_gpu_yxfb_yxio_b16",2], + "6232452664016831516": ["convolution_gpu_yxfb_yxio_b16",2], + "15715029280006557222": ["convolution_gpu_yxfb_yxio_b16",1], + "2840794055129352139": ["convolution_gpu_yxfb_yxio_b16",2], + "14905520834426630145": ["convolution_gpu_bfyx_gemm_like",2], + "16096353398003405565": ["convolution_gpu_yxfb_yxio_b16",2], + "15997145184054496085": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11585430081839020501": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15820005010263193043": ["convolution_gpu_yxfb_yxio_b16",2], + "3141886504884887200": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "5572956736535433608": ["convolution_gpu_bfyx_1x1",2], + "6294240435687565243": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "3427691447288240419": ["convolution_gpu_yxfb_yxio_b16",1], + "14731054961557547253": ["convolution_gpu_yxfb_yxio_b16",2], + "6673966852801136416": ["convolution_gpu_bfyx_os_iyx_osv16",224], + "1212319037405620223": ["convolution_gpu_bfyx_gemm_like",2], + "6703148006012061136": ["convolution_gpu_yxfb_yxio_b16",2], + "382811963722907674": ["convolution_gpu_bfyx_gemm_like",2], + "12617736879671137111": ["convolution_gpu_yxfb_yxio_b16",2], + "10419440621736450993": ["convolution_gpu_yxfb_yxio_b16",2], + "17211590259060346125": ["convolution_gpu_yxfb_yxio_b16",1], + "13328583512713703122": ["convolution_gpu_yxfb_yxio_b16",2], + "12741457056869452536": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "415232223198122046": ["convolution_gpu_yxfb_yxio_b16",2], + "14985236276429954162": ["convolution_gpu_bfyx_gemm_like",2], + "15487730714504758208": ["convolution_gpu_bfyx_os_iyx_osv16",257], + 
"4264284648458489052": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "4299773714254046691": ["convolution_gpu_yxfb_yxio_b16",2], + "5321698540631249776": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "7615563770941714046": ["convolution_gpu_yxfb_yxio_b16",2], + "6664482192233202590": ["convolution_gpu_bfyx_gemm_like",2], + "3155353791103196186": ["convolution_gpu_yxfb_yxio_b16",2], + "3894121333485095575": ["convolution_gpu_yxfb_yxio_b16",2], + "16172528828198474326": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "9275371801303143499": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "3928356751040028375": ["convolution_gpu_bfyx_gemm_like",2], + "11640225461345567929": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "5955575949957198434": ["convolution_gpu_bfyx_gemm_like",1], + "11636129433022017868": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "669771152920944125": ["convolution_gpu_bfyx_gemm_like",2], + "60509335250891515": ["convolution_gpu_bfyx_gemm_like",2], + "14510495923021693109": ["convolution_gpu_yxfb_yxio_b16",2], + "5762878778443755104": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "4491380839102267034": ["convolution_gpu_bfyx_gemm_like",1], + "16437124655147660375": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "17487594336237597163": ["convolution_gpu_yxfb_yxio_b16",2], + "3002986032379998259": ["convolution_gpu_bfyx_os_iyx_osv16",527], + "8638227907054657946": ["convolution_gpu_yxfb_yxio_b16",2], + "16574710115918192418": ["convolution_gpu_bfyx_gemm_like",2], + "467975197394411990": ["convolution_gpu_bfyx_gemm_like",1], + "8130920994920685157": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "17647962002015093887": ["convolution_gpu_bfyx_gemm_like",2], + "6203765709597125063": ["convolution_gpu_bfyx_gemm_like",1], + "4714289593698160876": ["convolution_gpu_yxfb_yxio_b16",2], + "13609660900720370993": ["convolution_gpu_bfyx_1x1",1], + "10415046594066474634": ["convolution_gpu_bfyx_gemm_like",2], + "12624762527234542946": ["convolution_gpu_yxfb_yxio_b16",2], + "9441060601228656341": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "17377315194963069204": ["fully_connected_gpu_fb_oi_ref",1], + "15809639778580769565": ["convolution_gpu_bfyx_gemm_like",2], + "17587625589456309495": ["convolution_gpu_yxfb_yxio_b16",2], + "16728762255357411770": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "8069537351442302814": ["convolution_gpu_bfyx_os_iyx_osv16",271], + "2147896649835170790": ["convolution_gpu_yxfb_yxio_b16",2], + "1142968634734769401": ["convolution_gpu_yxfb_yxio_b16",2], + "5245526691775741296": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6300691162962736560": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "16653412888821076903": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "5074273865983613482": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9375272277044782377": ["convolution_gpu_bfyx_gemm_like",0], + "10465119306486335226": ["convolution_gpu_yxfb_yxio_b16",2], + "17436550598696178210": ["convolution_gpu_yxfb_yxio_b16",2], + "1854612313463195535": ["convolution_gpu_yxfb_yxio_b16",0], + "14758040027936817208": ["convolution_gpu_yxfb_yxio_b16",2], + "9280431727790048190": ["convolution_gpu_bfyx_1x1",2], + "2807516818436584831": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "1638858323987412931": ["convolution_gpu_yxfb_yxio_b16",2], + "18035673326929466074": ["convolution_gpu_bfyx_direct_10_12_16",2], + "417352773179383568": ["convolution_gpu_yxfb_yxio_b16",2], + "4339711224604149541": ["convolution_gpu_bfyx_gemm_like",2], + "5266313052389515491": ["convolution_gpu_yxfb_yxio_b16",2], + 
"7946262362930618714": ["convolution_gpu_yxfb_yxio_b16",2], + "14835641172229643545": ["convolution_gpu_bfyx_gemm_like",2], + "5150256051921098637": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "14001406016806064079": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10899110544832584656": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6981294059746462667": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "9316082753126682958": ["convolution_gpu_bfyx_gemm_like",2], + "9996142812492415452": ["convolution_gpu_yxfb_yxio_b16",1], + "13835859040765465258": ["convolution_gpu_bfyx_gemm_like",1], + "13472577372534605883": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5762631094740444698": ["convolution_gpu_yxfb_yxio_b16",1], + "11450378244355788918": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "11031358859656806724": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "2920017342405650206": ["convolution_gpu_yxfb_yxio_b16",2], + "11624226818593966530": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6461637373691101671": ["convolution_gpu_bfyx_direct_10_12_16",2], + "215512025430490450": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "17264608538692763688": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3522383297921565178": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "6070612528095353265": ["convolution_gpu_yxfb_yxio_b16",2], + "5592526760253524303": ["convolution_gpu_bfyx_os_iyx_osv16",801], + "8106738346643994005": ["convolution_gpu_bfyx_gemm_like",2], + "10656486867659934705": ["convolution_gpu_bfyx_os_iyx_osv16",477], + "3341302541468955849": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16111630594575598044": ["convolution_gpu_yxfb_yxio_b16",2], + "17439276474731842060": ["convolution_gpu_yxfb_yxio_b16",2], + "10069896554844445748": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "3837190939606792435": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "739676584505475609": ["convolution_gpu_bfyx_gemm_like",2], + "4306881509708040723": ["convolution_gpu_yxfb_yxio_b16",2], + "178353385245384751": ["convolution_gpu_bfyx_os_iyx_osv16",969], + "1466455001976212160": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "15226556774612169126": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "18255227391100087860": ["convolution_gpu_bfyx_1x1",2], + "16120120950870908964": ["convolution_gpu_yxfb_yxio_b16",2], + "2219693989290882970": ["convolution_gpu_yxfb_yxio_b16",2], + "2770397466252831892": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "4091702228990140696": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10509933181132310969": ["convolution_gpu_bfyx_gemm_like",2], + "166091609652531090": ["convolution_gpu_yxfb_yxio_b16",2], + "47872288115972996": ["convolution_gpu_yxfb_yxio_b16",2], + "2128612971571865547": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "8317673282128335201": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "12473600360154597915": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "18094592431313771787": ["convolution_gpu_yxfb_yxio_b16",2], + "8787438180071123604": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2930898141522848681": ["convolution_gpu_bfyx_1x1",2], + "6445721440921372329": ["convolution_gpu_yxfb_yxio_b16",2], + "17012832508134584917": ["convolution_gpu_yxfb_yxio_b16",2], + "12264240305528403865": ["convolution_gpu_yxfb_yxio_b16",2], + "4433497906256257606": ["convolution_gpu_yxfb_yxio_b16",2], + "8615481457481938667": ["convolution_gpu_bfyx_os_iyx_osv16",419], + "16267682394077585279": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "7752913515036871482": 
["convolution_gpu_bfyx_gemm_like",1], + "11069983292783104310": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "4958835037528182801": ["convolution_gpu_bfyx_1x1",1], + "13390197134230598693": ["convolution_gpu_yxfb_yxio_b16",2], + "10217182484138821482": ["convolution_gpu_yxfb_yxio_b16",2], + "16780457022162749898": ["convolution_gpu_bfyx_gemm_like",0], + "5445584581720919223": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "17238880534517721334": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "12480527132372884168": ["convolution_gpu_bfyx_1x1",1], + "3106922888635965020": ["convolution_gpu_bfyx_gemm_like",2], + "10672380526821947133": ["convolution_gpu_bfyx_os_iyx_osv16",339], + "16027853590391209100": ["convolution_gpu_bfyx_gemm_like",2], + "6260115080574637314": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "5497751772699578150": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12294364015803004575": ["fully_connected_gpu_fb_io_block_fp16",2], + "5940337324384948573": ["convolution_gpu_bfyx_gemm_like",2], + "438528596970898721": ["convolution_gpu_bfyx_gemm_like",1], + "3976197003067656339": ["convolution_gpu_yxfb_yxio_b16",2], + "2263637493894079492": ["convolution_gpu_yxfb_yxio_b16",2], + "15813044197987178947": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "17433340097721474017": ["convolution_gpu_yxfb_yxio_b16",2], + "9515771738501683": ["convolution_gpu_yxfb_yxio_b16",2], + "12590922530749026871": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "6182829358839578529": ["convolution_gpu_bfyx_gemm_like",2], + "13803790014241837327": ["convolution_gpu_yxfb_yxio_b16",1], + "13367787254519749641": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "10178145641713631806": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "1454014148777456006": ["convolution_gpu_yxfb_yxio_b16",2], + "49948277487706148": ["convolution_gpu_bfyx_1x1",2], + "16921939234324970069": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "867868384380428650": ["convolution_gpu_yxfb_yxio_b16",2], + "1068155851494601726": ["convolution_gpu_yxfb_yxio_b16",2], + "1157947252370351851": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2621495864635590903": ["convolution_gpu_yxfb_yxio_b16",2], + "2929190644951986399": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "6115915509370042166": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "1338581414403268264": ["convolution_gpu_yxfb_yxio_b16",2], + "12850195004093999773": ["convolution_gpu_yxfb_yxio_b16",2], + "9532499374173117612": ["fully_connected_gpu_fb_oi_ref",1], + "12061567381160185735": ["convolution_gpu_bfyx_1x1",1], + "8444259010311137762": ["convolution_gpu_bfyx_os_iyx_osv16",668], + "9178915201681884122": ["convolution_gpu_yxfb_yxio_b16",2], + "17045386022302353268": ["convolution_gpu_yxfb_yxio_b16",2], + "6107700818115209289": ["convolution_gpu_yxfb_yxio_b16",2], + "15141893564826036993": ["convolution_gpu_yxfb_yxio_b16",2], + "8984436655107983227": ["convolution_gpu_bfyx_gemm_like",2], + "9920155432685318259": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "14614844213016502202": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "17766628441954343001": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "7823257556787476006": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2973337989445169388": ["convolution_gpu_yxfb_yxio_b16",1], + "3010520839193613803": ["convolution_gpu_yxfb_yxio_b16",2], + "8000679297338683619": ["convolution_gpu_yxfb_yxio_b16",2], + "6557428245898292304": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2598267743388306204": ["convolution_gpu_bfyx_gemm_like",2], + "15820359925623438341": 
["convolution_gpu_bfyx_os_iyx_osv16",375], + "14840851809642905875": ["convolution_gpu_yxfb_yxio_b16",2], + "447943521999310356": ["convolution_gpu_yxfb_yxio_b16",2], + "16361932270527364507": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "4422642146063042868": ["convolution_gpu_yxfb_yxio_b16",2], + "17549411807772646930": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "9974905660671605427": ["convolution_gpu_yxfb_yxio_b16",2], + "17010172246526353957": ["convolution_gpu_bfyx_1x1",2], + "18148431787172327554": ["convolution_gpu_yxfb_yxio_b16",2], + "7338932272767555117": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5941092474669713339": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11604794601689380990": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "598214270378842167": ["convolution_gpu_bfyx_os_iyx_osv16",363], + "6767159196241633301": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "824380206255396866": ["convolution_gpu_yxfb_yxio_b16",2], + "10753540518493641553": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8613740762403897614": ["convolution_gpu_yxfb_yxio_b16",2], + "142329025839464842": ["convolution_gpu_bfyx_1x1",2], + "6286349307417232815": ["convolution_gpu_yxfb_yxio_b16",2], + "883436333317162926": ["convolution_gpu_bfyx_1x1",2], + "12801481303602178879": ["convolution_gpu_bfyx_gemm_like",2], + "5109636469531439569": ["convolution_gpu_yxfb_yxio_b16",2], + "4750755523645265967": ["convolution_gpu_bfyx_gemm_like",2], + "12430677767405883160": ["convolution_gpu_bfyx_os_iyx_osv16",572], + "1173986078589662704": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "11563334365673075610": ["convolution_gpu_yxfb_yxio_b16",2], + "9161616741940575576": ["convolution_gpu_yxfb_yxio_b16",2], + "10477588607457125173": ["convolution_gpu_bfyx_gemm_like",2], + "4723643671527109645": ["convolution_gpu_yxfb_yxio_b16",2], + "10387844339156517393": ["convolution_gpu_bfyx_1x1",2], + "16549854027697846882": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "16839741351990811959": ["convolution_gpu_bfyx_gemm_like",2], + "16243196137456624852": ["convolution_gpu_bfyx_gemm_like",2], + "13020331397245585657": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "14318347197994059448": ["convolution_gpu_yxfb_yxio_b16",2], + "4251673416603443503": ["convolution_gpu_yxfb_yxio_b16",2], + "6677367803113594603": ["convolution_gpu_yxfb_yxio_b16",2], + "15298221796479574600": ["convolution_gpu_yxfb_yxio_b16",1], + "5546447512898130524": ["convolution_gpu_yxfb_yxio_b16",2], + "13854845390344305906": ["convolution_gpu_yxfb_yxio_b16",2], + "16184142990117192433": ["convolution_gpu_yxfb_yxio_b16",2], + "12703696322769371912": ["convolution_gpu_bfyx_gemm_like",2], + "2920840796593281126": ["convolution_gpu_bfyx_gemm_like",2], + "2064464435352777854": ["convolution_gpu_bfyx_gemm_like",1], + "2066731703492755469": ["convolution_gpu_bfyx_os_iyx_osv16",692], + "15943141845766932879": ["convolution_gpu_bfyx_1x1",2], + "18221867262301937903": ["convolution_gpu_bfyx_1x1",1], + "16956263773967652552": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "4927360358387344983": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15438530452161762045": ["convolution_gpu_yxfb_yxio_b16",1], + "12051595062513871723": ["convolution_gpu_bfyx_1x1",2], + "14288463473159113326": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15555083739490354527": ["convolution_gpu_bfyx_gemm_like",2], + "5479590921345335946": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "14421061973479991516": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "18076121920579110076": 
["convolution_gpu_bfyx_os_iyx_osv16",519], + "16998662249038174039": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "1240102354814495870": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "707979507145930311": ["convolution_gpu_bfyx_gemm_like",1], + "14795626641169374231": ["convolution_gpu_bfyx_os_iyx_osv16",509], + "8512711227383782401": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "4738743763536059708": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6578804773136886939": ["convolution_gpu_bfyx_gemm_like",2], + "18180491232489548313": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "3572202652824023801": ["convolution_gpu_bfyx_os_iyx_osv16",1031], + "17546566148752689536": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "3816979903860227798": ["convolution_gpu_bfyx_gemm_like",2], + "4790960977352818689": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "4868400250190558111": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "6631816968511312100": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "3509502334639215181": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "3697631094971930011": ["convolution_gpu_bfyx_gemm_like",2], + "1467428583618467133": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "9335016444137172241": ["convolution_gpu_bfyx_gemm_like",2], + "12255528292506999241": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "8127853538569353431": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "1484007449719260391": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "9056812077282494074": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "7127306913758514626": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "14630499010941056793": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "10209532888121442060": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "17354626928258309128": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1569111625440278287": ["convolution_gpu_bfyx_gemm_like",2], + "213518984547400496": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "384240534894352154": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "4732699611696731044": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15739274921308457528": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "7059729537732609153": ["convolution_gpu_bfyx_os_iyx_osv16",858], + "15743461017318513847": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "2778141440914991349": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4588420324030315321": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "885661562948597780": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15687441275464931484": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15094664469997373662": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "5358925179582853152": ["convolution_gpu_bfyx_os_iyx_osv16",186], + "3610579553304450107": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "3047710665820732705": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8363432163596927598": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "11758765408733113291": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "5050495757462452653": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8399477322910720113": ["convolution_gpu_bfyx_gemm_like",2], + "8921169563466511475": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "3727142736386026852": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "12571532345206950176": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "9552615241912277692": ["convolution_gpu_bfyx_gemm_like",2], + "16628180201355989101": ["convolution_gpu_bfyx_os_iyx_osv16",884], + "17808913959977434594": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "7000486794832106857": 
["convolution_gpu_bfyx_os_iyx_osv16",696], + "286393043958202995": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "6258191734224827354": ["convolution_gpu_bfyx_os_iyx_osv16",653], + "18043745678739016406": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "17946191056428828467": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "6263019986730305851": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9546990560009724329": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "16462602383546733062": ["convolution_gpu_bfyx_os_iyx_osv16",1035], + "1350953652678789564": ["convolution_gpu_bfyx_os_iyx_osv16",271], + "330278641539729021": ["convolution_gpu_bfyx_gemm_like",2], + "5912451559447635837": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "1774158624592967937": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "16985912104363932350": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "2908856453997530641": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "17059095074211347838": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "14668529234172928874": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "16863960779539003201": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "14346703182362139650": ["convolution_gpu_bfyx_gemm_like",2], + "5522698342845820411": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "10114123606924808948": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "14515066741400300669": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "5280450544965361875": ["convolution_gpu_bfyx_gemm_like",1], + "15025260753866131193": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "604467633591545941": ["convolution_gpu_bfyx_gemm_like",2], + "1500571771538985941": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "7256947320128669983": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "16256970928603738516": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "16426179645101678763": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "10269005969451576527": ["convolution_gpu_bfyx_os_iyx_osv16",246], + "6745633232989303110": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "12364947728685604753": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "12173409033330010794": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "419783127503173016": ["convolution_gpu_bfyx_os_iyx_osv16",564], + "11128727891847758901": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "1093840152689636371": ["convolution_gpu_bfyx_gemm_like",1], + "9714770878761308566": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15083602050538795803": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7527121935101118719": ["convolution_gpu_bfyx_gemm_like",2], + "5116562847410288642": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "5385395378424322451": ["convolution_gpu_bfyx_gemm_like",2], + "11602830611894444581": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5589350202160007768": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "16761856644242716357": ["convolution_gpu_bfyx_os_iyx_osv16",469], + "2096167792705935744": ["convolution_gpu_bfyx_gemm_like",2], + "3433877094202077256": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "12610004507393467447": ["convolution_gpu_bfyx_gemm_like",2], + "15939740070666326125": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "8422541638844255768": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13082713280504953535": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "8961544327690568390": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11883632480024839484": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5769404877199637961": ["convolution_gpu_bfyx_gemm_like",2], + "3296059171653513862": 
["convolution_gpu_bfyx_gemm_like",2], + "9968496035529786888": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "3664842151999943": ["convolution_gpu_bfyx_gemm_like",1], + "11539652577193034099": ["convolution_gpu_bfyx_os_iyx_osv16",300], + "2524233418633897945": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "3743573500773847162": ["convolution_gpu_bfyx_os_iyx_osv16",506], + "12541834857357563605": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "3813463368918975003": ["convolution_gpu_bfyx_gemm_like",2], + "7530197659550301431": ["convolution_gpu_bfyx_gemm_like",2], + "9700098364581157575": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4269447138276727632": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "1061595672605627170": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "7569785094993085356": ["convolution_gpu_bfyx_gemm_like",2], + "11504777464995699839": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "8224143262995973449": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1501328995320618233": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "12069726772532946193": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "16197538586133639338": ["convolution_gpu_bfyx_gemm_like",1], + "237384442106085756": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "15972830392998437739": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "15421166985948480394": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "6794427012971589670": ["convolution_gpu_bfyx_gemm_like",2], + "2420425134749678611": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "8050798452111667069": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "5824801192141531089": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "3668065353749623655": ["convolution_gpu_bfyx_os_iyx_osv16",1022], + "4251588408225461731": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "11113256687741667688": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "16582132711225619740": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "16957170318200599740": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12644942072153919043": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "17715478364817621621": ["convolution_gpu_bfyx_gemm_like",2], + "2854124603710900850": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "9380980604821454646": ["convolution_gpu_bfyx_gemm_like",1], + "1879844536951785808": ["convolution_gpu_bfyx_gemm_like",2], + "1086052166358768751": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "861813331533609605": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "17434429579652310107": ["convolution_gpu_bfyx_gemm_like",2], + "14011124615649605281": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "16440449399643706863": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "11733721371402545268": ["fully_connected_gpu_fb_io_ref",2], + "15816540550252147706": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8285478622349266483": ["convolution_gpu_bfyx_os_iyx_osv16",914], + "13979227237506927267": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "10492401059875127091": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "12992194515157698316": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "15891662883560480723": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "4600698444492242585": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "7157064096682175957": ["convolution_gpu_bfyx_os_iyx_osv16",179], + "14221578799010900252": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "9213886570531053949": ["convolution_gpu_bfyx_os_iyx_osv16",429], + "1285313118947640320": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15430549683839591544": ["convolution_gpu_bfyx_os_iyx_osv16",701], + 
"15858485865603722138": ["convolution_gpu_bfyx_gemm_like",2], + "2116524516810466877": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "12182468247297592907": ["convolution_gpu_bfyx_gemm_like",1], + "6949539207944972855": ["convolution_gpu_bfyx_gemm_like",2], + "5582107298039488951": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "11773726534842908728": ["convolution_gpu_bfyx_os_iyx_osv16",187], + "14385185911482960528": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "8844619836383523698": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "14548629377527143409": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13366059704398720237": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "18349087959351486710": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "15868648764972133201": ["fully_connected_gpu_fb_oi_ref",1], + "15322019609805777935": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "4451257789691974239": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1480287432874335824": ["convolution_gpu_bfyx_os_iyx_osv16",1055], + "13657522194775317201": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "8032685176029570383": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "4334698056820320220": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "15378707205730840765": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5977248663249062384": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "3170785962566427770": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2710485608298356329": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2585176064846114298": ["convolution_gpu_bfyx_gemm_like",2], + "18337975902615310907": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "6768322540857745605": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "13657774210341324470": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "3072535365860940873": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14230197617570499447": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "10049329759351957685": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "10305912614137623024": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3896848534552901221": ["convolution_gpu_bfyx_gemm_like",2], + "7405835196787288054": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "7020655100877544328": ["convolution_gpu_bfyx_gemm_like",1], + "13174363822969694054": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "13232269620066140073": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "13608239208821071914": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "17026284168840448378": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7866128397931438774": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "8264178890341675354": ["convolution_gpu_bfyx_os_iyx_osv16",1033], + "10317038568333963064": ["convolution_gpu_bfyx_os_iyx_osv16",694], + "2180753144963020203": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15271492161940795681": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "13272818502368975319": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12281346074445607180": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "570683988452622223": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "8451179695288093195": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "2085738943081638802": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15563546888345388359": ["convolution_gpu_bfyx_gemm_like",2], + "8525389694584008001": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "2481005139798378616": ["convolution_gpu_bfyx_os_iyx_osv16",1062], + "574359978358296617": ["convolution_gpu_bfyx_gemm_like",2], + "15764181772410734606": ["convolution_gpu_bfyx_direct_10_12_16",1], + 
"9217386935739152562": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "12161602271403760008": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "9758907700230386910": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8707189142909022305": ["convolution_gpu_bfyx_gemm_like",2], + "1375259485223819020": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "9053383117071470496": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "6261121070004228939": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1112828128944231163": ["convolution_gpu_bfyx_gemm_like",1], + "5843679089588930933": ["convolution_gpu_bfyx_gemm_like",2], + "11083777913844441475": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "1923745286075356181": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "3827177373408316820": ["convolution_gpu_bfyx_gemm_like",1], + "5488168361113140102": ["convolution_gpu_bfyx_gemm_like",1], + "7982628452987720190": ["convolution_gpu_bfyx_gemm_like",2], + "8140242320379485952": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "15615172858007002100": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "1653274345637156919": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "15210302033167762581": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "17392347485675658099": ["convolution_gpu_bfyx_gemm_like",2], + "6574971185849732667": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "4202645222013675478": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "1231806423322813287": ["convolution_gpu_bfyx_gemm_like",2], + "166267183356660549": ["convolution_gpu_bfyx_gemm_like",1], + "8281212003098870446": ["convolution_gpu_bfyx_gemm_like",0], + "14650273075211365393": ["convolution_gpu_bfyx_gemm_like",1], + "6928835003016610382": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "12012860334670244716": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "15646774522467486699": ["convolution_gpu_bfyx_os_iyx_osv16",299], + "15126660425728872065": ["convolution_gpu_bfyx_os_iyx_osv16",200], + "18265901700619296616": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1653438360841004980": ["fully_connected_gpu_fb_oi_ref",2], + "6103824715103416420": ["convolution_gpu_bfyx_gemm_like",2], + "15409755591665753258": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "16946947983339327902": ["convolution_gpu_bfyx_gemm_like",2], + "6431838057506760173": ["convolution_gpu_bfyx_os_iyx_osv16",417], + "14705457019471647279": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "6801897580177846120": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "8656468860180713379": ["convolution_gpu_bfyx_os_iyx_osv16",472], + "16801553481899627402": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "5339358831190803597": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11732742421854164761": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "6854611304056079417": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "14568560907026487922": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "4184442166820068862": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "17967188184891337660": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16108759090923335184": ["convolution_gpu_bfyx_gemm_like",1], + "5109770354438894645": ["convolution_gpu_bfyx_gemm_like",2], + "4691552892932405676": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7331552952865138030": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "3332334993503432420": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "1422402723172447295": ["convolution_gpu_bfyx_gemm_like",1], + "14292252222828824305": ["convolution_gpu_bfyx_gemm_like",2], + "18180820925685532104": ["convolution_gpu_bfyx_os_iyx_osv16",563], + 
"16695020005258780885": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "6129884455218252024": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "13772598362521854438": ["convolution_gpu_bfyx_os_iyx_osv16",720], + "9940908487812223059": ["convolution_gpu_bfyx_gemm_like",2], + "4753055238892504599": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "15803050672115583478": ["convolution_gpu_bfyx_gemm_like",1], + "3154903035376733831": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9191832520273617003": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "15778834188130183853": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "7557446085365037177": ["convolution_gpu_bfyx_os_iyx_osv16",686], + "6213353364768643062": ["convolution_gpu_bfyx_gemm_like",2], + "4035015193331696438": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "4368522743441422202": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "10308175009371219583": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "15974241934088373021": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "17254775053427612466": ["fully_connected_gpu_fb_oi_ref",1], + "447683677378974131": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "14244966672894707129": ["convolution_gpu_bfyx_gemm_like",2], + "7946776740333736799": ["convolution_gpu_bfyx_gemm_like",2], + "15496355513574200965": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "9239048433297419320": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12971833748980664090": ["convolution_gpu_bfyx_os_iyx_osv16",620], + "1810943242998123550": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "9767355861002822967": ["convolution_gpu_bfyx_gemm_like",2], + "14211903923555028634": ["convolution_gpu_bfyx_os_iyx_osv16",679], + "13713406612642090169": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "14923692894655929923": ["fully_connected_gpu_bf_io_gemm",0], + "11805311302922325617": ["convolution_gpu_bfyx_gemm_like",2], + "9788704336046308724": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "15383553612351941890": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16590030963319267708": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "10797908931694274013": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "14150012830816329527": ["convolution_gpu_bfyx_gemm_like",2], + "6739799137687789012": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "5389189982064081933": ["convolution_gpu_bfyx_os_iyx_osv16",846], + "17442035600389810700": ["convolution_gpu_bfyx_gemm_like",2], + "10890975553758439233": ["convolution_gpu_bfyx_gemm_like",1], + "15713964605078748923": ["convolution_gpu_bfyx_gemm_like",2], + "12409554044517232554": ["convolution_gpu_bfyx_os_iyx_osv16",139], + "9796347091019799053": ["convolution_gpu_bfyx_os_iyx_osv16",99], + "17508987219281192918": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "8670512344429807851": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "13951781924205611716": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "17596685300497748803": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "13206826317378863148": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "13727585908419292912": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6443517114667332732": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "6996679663761370444": ["convolution_gpu_bfyx_gemm_like",1], + "13915749401892931804": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "16596028606733932975": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "4198666727524342442": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "16125365972873290572": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "9040046051053703359": 
["convolution_gpu_bfyx_os_iyx_osv16",330], + "11918018989601427118": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6714886136800883594": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "7312862821818362095": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "8357109553923988018": ["convolution_gpu_bfyx_gemm_like",2], + "8730097760819044515": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "6218328594667952152": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "3939977982577786175": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "14991602704357959545": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "13814086981499638596": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "16146350476627599543": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "18112958483003382733": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "8507854696766492454": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "5556023021504556658": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "17740553615487239243": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8684867236134349888": ["convolution_gpu_bfyx_os_iyx_osv16",193], + "13607830451968188080": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "18235067315439611192": ["convolution_gpu_bfyx_os_iyx_osv16",381], + "13603318842632052764": ["convolution_gpu_bfyx_os_iyx_osv16",380], + "17243576882981097341": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "4965629769516591986": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "13537323999534292650": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "6233612563637601101": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "10173283505468233128": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "3214253333840552610": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "6478054912653910426": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "2458592904274981909": ["fully_connected_gpu_bf_io_input_spatial",2], + "9154705094446538279": ["fully_connected_gpu_fb_oi_ref",0], + "15916505622570323098": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "6008613375871089139": ["convolution_gpu_bfyx_os_iyx_osv16",755], + "6513705142577622089": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "1766961036311612128": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2862999234347597091": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "977617597166653416": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "17182558720652199559": ["fully_connected_gpu_fb_io_ref",1], + "17854138024884397413": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "8426489532875918560": ["convolution_gpu_bfyx_gemm_like",1], + "17869697579874327192": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10928995765778560784": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "770376597027620107": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "17683350638672326642": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "17790954200356837750": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "10482582307328548806": ["convolution_gpu_bfyx_os_iyx_osv16",3], + "14349335089732252796": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11033507346101404633": ["fully_connected_gpu_fb_oi_ref",2], + "13775529405693629438": ["convolution_gpu_bfyx_os_iyx_osv16",874], + "9459869325970475576": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "2542506456395240890": ["convolution_gpu_bfyx_gemm_like",1], + "4499586349553581439": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "16067605128297748820": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16510194749934323304": ["convolution_gpu_bfyx_os_iyx_osv16",1113], + "12952160708294444403": ["convolution_gpu_bfyx_gemm_like",2], + "11541706477255587105": 
["convolution_gpu_bfyx_os_iyx_osv16",344], + "1059505639883914386": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "1771663698943903325": ["convolution_gpu_bfyx_os_iyx_osv16",175], + "17771487895874668302": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "8762901342272872498": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "412314676462573090": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7367814057959247537": ["convolution_gpu_bfyx_gemm_like",2], + "8728178019712933221": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "1192709652314183388": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "12427490329663434604": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "13170031087212196468": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "8140094412609934765": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "12381377111003298809": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1760690277175249985": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "16710651492402564794": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "13447028922679236865": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "18026468427978643933": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "17285699593273891901": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "1471837664358450291": ["convolution_gpu_bfyx_gemm_like",2], + "9963817056423168830": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "13388424034634316547": ["convolution_gpu_bfyx_os_iyx_osv16",715], + "2780358937598873103": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "587350550384936211": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "12534001599784153836": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "12626014184575881530": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "4082046235109198108": ["convolution_gpu_bfyx_gemm_like",1], + "2317476796706098254": ["convolution_gpu_bfyx_gemm_like",2], + "1561225943337590599": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "14296771090926462138": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "10853161782230763798": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8390953788659916133": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2310549887200001260": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4854802313728023001": ["convolution_gpu_bfyx_os_iyx_osv16",621], + "11264412030568042996": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "18277685132620834972": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "5906083739416582743": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8707484843981694525": ["convolution_gpu_bfyx_os_iyx_osv16",1021], + "2947753291378607664": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "17585852525746136080": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "2303141161423252932": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "14039055710777697188": ["convolution_gpu_bfyx_gemm_like",2], + "3919577663893354177": ["convolution_gpu_bfyx_gemm_like",1], + "16578265652036967656": ["convolution_gpu_bfyx_gemm_like",2], + "7958459862276998225": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "5643908654122573882": ["convolution_gpu_bfyx_os_iyx_osv16",562], + "6678796313875454849": ["convolution_gpu_bfyx_gemm_like",2], + "9259437778054905599": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "7272538316511343863": ["convolution_gpu_bfyx_gemm_like",2], + "12245096462203481681": ["convolution_gpu_bfyx_os_iyx_osv16",511], + "1465692634334679413": ["convolution_gpu_bfyx_gemm_like",2], + "13439272015824246074": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "15781220232431782560": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2590380836212070761": 
["convolution_gpu_bfyx_os_iyx_osv16",349], + "10437861085319472289": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "707449835235490641": ["convolution_gpu_bfyx_gemm_like",1], + "11473442921040533207": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "14667209474639064623": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "15984373369388044924": ["convolution_gpu_bfyx_gemm_like",2], + "1486768204660092247": ["convolution_gpu_bfyx_gemm_like",1], + "8360628955300060520": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "12808154347573074859": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "1131384986902172221": ["convolution_gpu_bfyx_os_iyx_osv16",679], + "5831419373611158773": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11051434650031832658": ["convolution_gpu_bfyx_gemm_like",1], + "3623695848220673001": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "2172636954267255416": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "9631481972809246378": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "14522844693999581518": ["convolution_gpu_bfyx_os_iyx_osv16",750], + "12136458184046915563": ["convolution_gpu_bfyx_gemm_like",0], + "13115589642140732066": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "2654793073145467058": ["convolution_gpu_bfyx_gemm_like",2], + "1967810052096853804": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14068780861332616363": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "6796998865297819946": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "3314459110790355757": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "13193571607788569533": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "15197248015210313435": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15911434513425038508": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4534480875955599254": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11253790393313445931": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "7472330881076141262": ["convolution_gpu_bfyx_gemm_like",1], + "2044363708106765326": ["convolution_gpu_bfyx_direct_10_12_16",0], + "11338906515425639970": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "789359733867650915": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "16173557782125372935": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "14133509766683767462": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15411474884532403722": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "1116274074896622552": ["convolution_gpu_bfyx_os_iyx_osv16",874], + "7995002764260542332": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15380105196319354141": ["convolution_gpu_bfyx_os_iyx_osv16",481], + "17732250360268013336": ["convolution_gpu_bfyx_os_iyx_osv16",563], + "1622731194539871461": ["convolution_gpu_bfyx_gemm_like",2], + "6623182990939010641": ["convolution_gpu_bfyx_gemm_like",1], + "7918742312252115870": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "12990341489637414845": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "15115440616185035720": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4927139127938739019": ["convolution_gpu_bfyx_gemm_like",2], + "13131740479277027362": ["fully_connected_gpu_bf_io_gemm",1], + "1081962464388501987": ["convolution_gpu_bfyx_os_iyx_osv16",873], + "15882969506682501496": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "1362239912535573615": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "2230884858122788172": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "12771805545455650546": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "2007192658799516915": ["fully_connected_gpu_bf_io_gemm",1], + "6489645404977288242": 
["convolution_gpu_bfyx_os_iyx_osv16",133], + "4229105529069729944": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "8241070786700614317": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10743138314323119696": ["convolution_gpu_bfyx_gemm_like",2], + "18136135457402651842": ["convolution_gpu_winograd_6x3_s1_fused",2], + "768765852586619095": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16396393355098283060": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "12392988351482826871": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15399245700982979379": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "10485534959656860449": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "13083981648347252910": ["convolution_gpu_bfyx_os_iyx_osv16",511], + "2248628426797793532": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "2498920887656279332": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12864338805958186191": ["convolution_gpu_bfyx_gemm_like",2], + "5124645583449732785": ["convolution_gpu_bfyx_gemm_like",2], + "15024023281204917061": ["convolution_gpu_bfyx_gemm_like",2], + "11331539079347079374": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "11857822504978122919": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "10309504812060596568": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "11665313746896806563": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "14911763273270477925": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2096021095904820251": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "12010294231983179604": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "7877256119877423528": ["convolution_gpu_bfyx_os_iyx_osv16",489], + "18243018097656671503": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "8061914949376516780": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11314436000791223218": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "9516102312850256675": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "14188045559946481097": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "2418288192668085805": ["convolution_gpu_bfyx_gemm_like",2], + "15669490019428002270": ["convolution_gpu_bfyx_os_iyx_osv16",986], + "3215659303601163167": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "14261214737408786954": ["convolution_gpu_bfyx_os_iyx_osv16",621], + "7336911146060959485": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "5796500397424307442": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "1364546124782880196": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "11062100629646715785": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "11234976958917093838": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "7058458405375602606": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "13654408396081513312": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "1593086572473375988": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "13387766889016280910": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5966963943739041502": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "13267743753217317315": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "16431857516454692096": ["convolution_gpu_bfyx_os_iyx_osv16",185], + "15914512645931208899": ["convolution_gpu_bfyx_gemm_like",2], + "16103653667647559851": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "17025997656996518171": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10935410906182995784": ["convolution_gpu_bfyx_gemm_like",1], + "15749335301736571135": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6362453779168658462": ["convolution_gpu_bfyx_os_iyx_osv16",273], + "481328129206881674": ["convolution_gpu_bfyx_os_iyx_osv16",325], + 
"14541063954080306476": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "13210604117940125947": ["convolution_gpu_bfyx_os_iyx_osv16",120], + "11058082057683584650": ["convolution_gpu_bfyx_gemm_like",2], + "6750269489578112382": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "17774979615691038302": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "156456996459945842": ["convolution_gpu_bfyx_os_iyx_osv16",801], + "3219239043521617253": ["convolution_gpu_bfyx_gemm_like",2], + "10973647655853229395": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "1521992965089360209": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "4145496852718466030": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2317409971670298599": ["convolution_gpu_bfyx_os_iyx_osv16",501], + "10966081583785531511": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "13745327504866194229": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "390943380079040179": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "2999825793036702585": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "4692951005189464579": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "14152716242882609401": ["convolution_gpu_bfyx_gemm_like",1], + "9905716283229191208": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "4860019935631927113": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "1835975757316320402": ["convolution_gpu_bfyx_gemm_like",2], + "18265020664540913473": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1444256562477852389": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "8510044123592842725": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "10689303050557631712": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "390219891876240081": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "7072606962946873975": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "10838972820886273680": ["convolution_gpu_bfyx_gemm_like",2], + "15682441855379046778": ["convolution_gpu_bfyx_os_iyx_osv16",130], + "16833854122884184025": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "12014527187730671229": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "9525853014023664813": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "14719871224178118299": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "14880517974968280393": ["convolution_gpu_bfyx_gemm_like",2], + "10990741293315393791": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "9696588462876533517": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "11964639701912187118": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "10429613013253088132": ["convolution_gpu_bfyx_gemm_like",2], + "3831261590121101287": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "12190841837604350271": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4254313567858225805": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "1190134214210434381": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "2894138412746654795": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "11378458002317912396": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "18337160891834020517": ["convolution_gpu_bfyx_os_iyx_osv16",151], + "16446533347502650316": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "12584692605608021657": ["fully_connected_gpu_fb_oi_ref",1], + "907233163535348999": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "11510063368067539341": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "3164513064874019611": ["convolution_gpu_bfyx_gemm_like",2], + "5298952273692538291": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "8382509515623938786": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "14013561425708390846": ["convolution_gpu_bfyx_gemm_like",2], + "7801270668419570665": 
["convolution_gpu_bfyx_os_iyx_osv16",366], + "11188849626443657384": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "13296566345005640760": ["convolution_gpu_bfyx_os_iyx_osv16",715], + "4165920860392215245": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7905503566052181015": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "15872143905824807656": ["convolution_gpu_bfyx_os_iyx_osv16",84], + "10983344268706058114": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "5553176511624221429": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "16033144151193421543": ["convolution_gpu_bfyx_gemm_like",2], + "2571882179292959757": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "13810716860158972470": ["convolution_gpu_bfyx_os_iyx_osv16",276], + "17222005830854879661": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "16264774056719724826": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "1919460437053604108": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "12767115494378788592": ["convolution_gpu_bfyx_os_iyx_osv16",756], + "13642146548740074992": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "9861846661532177405": ["convolution_gpu_bfyx_gemm_like",2], + "7419990519344756626": ["convolution_gpu_bfyx_os_iyx_osv16",1070], + "13660573428614001128": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2305706332728008948": ["convolution_gpu_bfyx_gemm_like",2], + "2704063557078535883": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "1104489643524273315": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "6620782733027313312": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "4607013085883384144": ["convolution_gpu_bfyx_gemm_like",2], + "2721793280965260548": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "9486447779233331380": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "12096396455109952715": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "15509845164085518352": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6525052296614701517": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "13709111882513486557": ["convolution_gpu_bfyx_os_iyx_osv16",617], + "12277470820821378855": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "3475757648408068589": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "14599150265057284139": ["convolution_gpu_bfyx_os_iyx_osv16",880], + "7678168522030142454": ["convolution_gpu_bfyx_gemm_like",2], + "8799427328659766574": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15384168056682476462": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "1801066876009461857": ["convolution_gpu_bfyx_gemm_like",1], + "13787155972060672772": ["convolution_gpu_bfyx_gemm_like",1], + "4974435385259831818": ["convolution_gpu_bfyx_gemm_like",2], + "16108573960501496757": ["convolution_gpu_bfyx_gemm_like",2], + "15184258464890250739": ["convolution_gpu_bfyx_gemm_like",2], + "7550660458541314838": ["convolution_gpu_bfyx_gemm_like",2], + "11367813096511965002": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "9928406318940388716": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "11393439616752806572": ["convolution_gpu_bfyx_gemm_like",2], + "3752171257634205726": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "13733327241591630239": ["convolution_gpu_bfyx_os_iyx_osv16",752], + "838825600917352376": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9383222411929463824": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "3192518239721798250": ["convolution_gpu_bfyx_gemm_like",2], + "12478914547444399288": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12609361477548272638": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "10036998353100219512": ["convolution_gpu_bfyx_os_iyx_osv16",127], + 
"17525531790109748810": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "5334566325056222430": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "17248756229500447131": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "13212959214376905822": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "1724898827344855006": ["convolution_gpu_bfyx_gemm_like",1], + "10890538764006500546": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "12978004383198641522": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "13991205023798493715": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "11175955260573469979": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "18166732758694978380": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7727871584058599163": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6171845068913882721": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5331173521406046122": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "14113510820933411052": ["convolution_gpu_bfyx_os_iyx_osv16",1055], + "1107027047188366075": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "6897348673467297407": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "15191864907092681849": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6090625728451718945": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16569200335969311660": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "11642941943446484202": ["convolution_gpu_bfyx_os_iyx_osv16",516], + "12825029449351875037": ["convolution_gpu_bfyx_gemm_like",1], + "12818953631784587919": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "9654726486719966937": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10158890414412187141": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "18140951659547259039": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8367989677286805427": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "15953607231296296913": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9255337426504113924": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "6762862978340755053": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8374345306483326015": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "10386584706491193379": ["convolution_gpu_bfyx_gemm_like",2], + "18067353229273804720": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "3588791913550955553": ["fully_connected_gpu_fb_oi_ref",1], + "5047419871737940985": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "6078344073564209080": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "1172103288112689821": ["convolution_gpu_bfyx_os_iyx_osv16",941], + "15492793021506324472": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "4967444801764057340": ["convolution_gpu_bfyx_os_iyx_osv16",361], + "9604982746455852556": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "801486567558674495": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "15652392678782222737": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "5570311824197099845": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3701795558556637835": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8369833730195120673": ["convolution_gpu_bfyx_gemm_like",2], + "7103345484511147373": ["convolution_gpu_bfyx_gemm_like",2], + "4412343276595791077": ["convolution_gpu_bfyx_gemm_like",2], + "1596472719837608525": ["convolution_gpu_bfyx_gemm_like",2], + "475043738497218394": ["convolution_gpu_bfyx_os_iyx_osv16",518], + "15636407980943172317": ["convolution_gpu_bfyx_gemm_like",2], + "2816982827037092536": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "3469963495451100978": ["convolution_gpu_bfyx_os_iyx_osv16",726], + "9386678255270055573": ["convolution_gpu_bfyx_direct_10_12_16",2], + "172584114180442549": 
["convolution_gpu_bfyx_os_iyx_osv16",961], + "3828569468687251275": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1403617451623027879": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "18273537339378756543": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "5513667102916409932": ["convolution_gpu_bfyx_gemm_like",2], + "8104309105061227444": ["convolution_gpu_bfyx_os_iyx_osv16",514], + "15112118829970177073": ["convolution_gpu_bfyx_os_iyx_osv16",371], + "5097818987523855112": ["convolution_gpu_bfyx_gemm_like",2], + "8566695253227825439": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "654821507679356726": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "13002723770137829128": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8511244943596227719": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1745930004673880589": ["convolution_gpu_bfyx_gemm_like",1], + "3603706453982734995": ["convolution_gpu_bfyx_os_iyx_osv16",551], + "12707748441880165396": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3277243911383750280": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "8402692278765063674": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "3835286851569826052": ["convolution_gpu_bfyx_gemm_like",2], + "6848989271874647093": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "15952399564161253450": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12063837066704136739": ["convolution_gpu_bfyx_gemm_like",1], + "12782191856884962803": ["convolution_gpu_bfyx_gemm_like",2], + "1330842758352650583": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "4007319206075386920": ["convolution_gpu_bfyx_gemm_like",2], + "1592619919721912789": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "984472462878596435": ["convolution_gpu_bfyx_os_iyx_osv16",48], + "813347941036099284": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "7398196853452900099": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "15091825614924466766": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "2379484884827231127": ["fully_connected_gpu_bf_io_input_spatial",0], + "3436433254188539886": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "17997314629342774968": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "7334966010680206302": ["convolution_gpu_bfyx_gemm_like",2], + "3524702814173574637": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "340606466693982406": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "544003022213487787": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "11632275875447013409": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6948696390129114563": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "4563529605364580848": ["convolution_gpu_bfyx_os_iyx_osv16",131], + "2124776616364429517": ["convolution_gpu_bfyx_gemm_like",1], + "2946926779445063554": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "11240189248024145687": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "4494583230309471319": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "12218337369633748663": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "8104609318998060422": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "12707946849050970702": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "16587078304821304948": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "237302155033013557": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "13810995219720233595": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "15299926486228458704": ["convolution_gpu_bfyx_os_iyx_osv16",514], + "10548792624072794724": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "11031625790234068916": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "10879183694331631189": 
["convolution_gpu_bfyx_os_iyx_osv16",375], + "15675968397825708285": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "3745433390861789238": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "6275903692904946376": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "1540041682425757361": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "572265264921910408": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "4867937397499803072": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "276407276027553756": ["convolution_gpu_bfyx_os_iyx_osv16",176], + "3747518910079195578": ["convolution_gpu_bfyx_os_iyx_osv16",103], + "15198419554644505600": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "17073183514200378702": ["convolution_gpu_bfyx_os_iyx_osv16",667], + "8611417708673038653": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "8375778282166369933": ["convolution_gpu_bfyx_gemm_like",2], + "6831045740006076251": ["convolution_gpu_bfyx_os_iyx_osv16",1096], + "4274801141127703532": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "6577754887650563753": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "11775667915453535428": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "8898910394425958745": ["convolution_gpu_bfyx_gemm_like",2], + "15781622938833984014": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "8035084960535483680": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "3498490999014554104": ["convolution_gpu_bfyx_os_iyx_osv16",880], + "6065819201836017182": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "17870874477143985774": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "7873648177300629037": ["convolution_gpu_bfyx_gemm_like",2], + "18134140047840716203": ["convolution_gpu_bfyx_os_iyx_osv16",192], + "12046638414686283134": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10008202802779981732": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "954347958041231578": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "5871082277006078841": ["convolution_gpu_bfyx_os_iyx_osv16",893], + "6137405768481559638": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "1411786954276574458": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "9105388853296359769": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6580334406272192111": ["fully_connected_gpu_fb_io_ref",2], + "11795686089670429481": ["convolution_gpu_bfyx_gemm_like",2], + "9437794960375526230": ["convolution_gpu_bfyx_os_iyx_osv16",1065], + "13436376034548670107": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "4224423702382859092": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "2270733937722366926": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "5646139101524964833": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "3239100076064406977": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "3730238135300250205": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "5219399418946822456": ["convolution_gpu_bfyx_gemm_like",2], + "7227174766917523481": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "17772882818194611202": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "18375125668176498051": ["convolution_gpu_bfyx_gemm_like",2], + "16091165907421819456": ["convolution_gpu_bfyx_gemm_like",2], + "7726714223809300966": ["convolution_gpu_bfyx_gemm_like",1], + "13926730608213207277": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11533151357949131860": ["convolution_gpu_bfyx_gemm_like",2], + "14805212478405698245": ["convolution_gpu_bfyx_gemm_like",1], + "2738256633362038820": ["convolution_gpu_bfyx_gemm_like",2], + "9468314291932574827": ["convolution_gpu_bfyx_os_iyx_osv16",732], + "8324250071425605671": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + 
"6579950270997373448": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "4381329435655511217": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "1426606766274640878": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "5953754321266570854": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "14827538610133799379": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4920194716156732643": ["convolution_gpu_bfyx_gemm_like",2], + "9740466267717175474": ["convolution_gpu_bfyx_gemm_like",2], + "6755802278188792577": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "4417341352109525283": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8442368383427915597": ["convolution_gpu_bfyx_gemm_like",1], + "11311859068168414878": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "4437258459981739942": ["convolution_gpu_bfyx_os_iyx_osv16",1042], + "3693042354944382600": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12390011660072693092": ["convolution_gpu_bfyx_gemm_like",1], + "12112853999307505628": ["convolution_gpu_bfyx_gemm_like",2], + "12425310792514818973": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "14792528369891965810": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10058165874008941852": ["convolution_gpu_bfyx_os_iyx_osv16",176], + "17331582127656317117": ["convolution_gpu_bfyx_gemm_like",1], + "13492216433886201174": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2338535084014610258": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16611452077660879545": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "724953082687879224": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "954796765467489259": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "4396653960950462197": ["convolution_gpu_bfyx_gemm_like",1], + "5825664545247017348": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "3087801652564627458": ["convolution_gpu_bfyx_os_iyx_osv16",804], + "4624363818743696582": ["convolution_gpu_bfyx_os_iyx_osv16",941], + "152263592822875549": ["convolution_gpu_bfyx_gemm_like",2], + "17015791782274123780": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "16831114690704826637": ["convolution_gpu_bfyx_direct_10_12_16",0], + "17350963651826443169": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7277156316894715321": ["convolution_gpu_bfyx_os_iyx_osv16",123], + "9714764457768279762": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "17559685912375493682": ["convolution_gpu_bfyx_os_iyx_osv16",92], + "9083686317073801642": ["convolution_gpu_bfyx_gemm_like",1], + "311101627084421734": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "3362190082518348071": ["convolution_gpu_bfyx_gemm_like",2], + "15349944413643626251": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "10405183426600618231": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "15609627722687211129": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "14962768577232034246": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "11077876432364512822": ["fully_connected_gpu_bf_io_input_spatial",1], + "10322586483496198615": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "7154364270315480182": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "9947693652506812817": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "13593258537178247801": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "16758962840329202004": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "1077224320045437593": ["convolution_gpu_bfyx_os_iyx_osv16",832], + "9481675228591993785": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "2999633429402781278": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "2184670359551186734": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "17798626036576472760": 
["convolution_gpu_bfyx_os_iyx_osv16",545], + "14705509109623500235": ["convolution_gpu_bfyx_os_iyx_osv16",276], + "11318913630213187720": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "8079376692609682448": ["convolution_gpu_bfyx_gemm_like",0], + "4585891362157592384": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "5748047690737461635": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "173772845058977237": ["convolution_gpu_bfyx_os_iyx_osv16",512], + "6899658518070473523": ["convolution_gpu_bfyx_gemm_like",2], + "9455406830371528486": ["convolution_gpu_bfyx_gemm_like",1], + "3027775502561362722": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "3909551222373722085": ["convolution_gpu_bfyx_os_iyx_osv16",44], + "1006828591724642933": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "6767159196241633301": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "17281202179589913619": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "12136625628940225638": ["convolution_gpu_bfyx_gemm_like",2], + "14253275166085865948": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "9875997976286355123": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "14017025411515888007": ["convolution_gpu_bfyx_os_iyx_osv16",674], + "12022152681602871455": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14077148976508649021": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "13140254055376365092": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "17224181038411430675": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "4476218615403440835": ["convolution_gpu_bfyx_gemm_like",2], + "11465965972527519631": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "8045697952241865861": ["convolution_gpu_bfyx_gemm_like",2], + "8109572327736409899": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "10085059621136526248": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "12325592439309417414": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "18280672126778847258": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "5406129421969383274": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "5469227748156438008": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "14725765847498813247": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "2014911634432127630": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9835338452418388180": ["convolution_gpu_bfyx_gemm_like",2], + "16912035321030511639": ["convolution_gpu_bfyx_gemm_like",1], + "5701438170070600512": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "1499841226042523429": ["convolution_gpu_bfyx_os_iyx_osv16",1065], + "9823752892549805496": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "7915318733663535312": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "9101571410887509600": ["convolution_gpu_bfyx_gemm_like",0], + "3499243120652875549": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8083672466967374860": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "14974730512607138726": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "16820082917500285799": ["convolution_gpu_bfyx_gemm_like",2], + "9410125656044318792": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "1818433662409886324": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15381833359831622179": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "2827850900421982274": ["convolution_gpu_bfyx_gemm_like",1], + "11507538232733291666": ["convolution_gpu_bfyx_direct_10_12_16",1], + "536646811796032046": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "18167100055915766856": ["convolution_gpu_bfyx_gemm_like",1], + "14184440545916228597": ["convolution_gpu_bfyx_gemm_like",2], + "9068406831482072377": ["convolution_gpu_bfyx_os_iyx_osv16",978], + 
"16190949264253468961": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "475665035119038846": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "4172485608495372888": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "13696782397412896129": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6056291179600370019": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "14492935486352505845": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4316519748653705692": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "12246408434917478929": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "16453041919970581620": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "14696479950182046016": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "1925626127045202964": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "16614170159588864300": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "7185832253431234935": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14004715832115880216": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "7157531901512507924": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "14681705641267917886": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "5192552432194195116": ["convolution_gpu_bfyx_gemm_like",2], + "4872433441839808585": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "8006738296385794413": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "6067904130482758510": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "13503688893307029975": ["convolution_gpu_bfyx_direct_10_12_16",0], + "6914775146138105785": ["convolution_gpu_bfyx_gemm_like",2], + "12085348936192462321": ["convolution_gpu_bfyx_gemm_like",2], + "6942622405269419082": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "2451712485584835395": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "9305957796037500628": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "4040607776348275579": ["convolution_gpu_bfyx_gemm_like",2], + "7757331094141318304": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "7088331918128954410": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "9377779605078400305": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "4476928353532757380": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "1332624116953483870": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "6740545361286720494": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "17306482303091342504": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "15490478608105402679": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "9548658329589481069": ["convolution_gpu_bfyx_gemm_like",2], + "13459514533473657102": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "5280182001774668876": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1142725391726703078": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "7876355212013100281": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "14435120971846098308": ["convolution_gpu_bfyx_os_iyx_osv16",562], + "3563614453014995411": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "13119479079474639169": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "11077503608116183709": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "2363414141971004557": ["convolution_gpu_bfyx_gemm_like",2], + "9019451572520595738": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "2111049986724040641": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "6610054713068442549": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "13163026305514410688": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "6232363902828992968": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "6711878663358611849": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "6419580456182610836": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "7338578624767544128": 
["convolution_gpu_bfyx_os_iyx_osv16",741], + "5805383505505929391": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "13491221531603384511": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "12038525298168664305": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1015184966858657992": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "11022847760121601465": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "3281207855459771997": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "7807983899017500046": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "4010329161090285019": ["convolution_gpu_bfyx_os_iyx_osv16",834], + "9133263538092913983": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6722358544720547260": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "5553779954745929430": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "13869716373706247686": ["convolution_gpu_bfyx_gemm_like",2], + "169973842603492802": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "14203061085285979556": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "5361028467247182860": ["convolution_gpu_bfyx_gemm_like",1], + "11630475290242283451": ["convolution_gpu_bfyx_gemm_like",2], + "16768470780681544910": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14553856088069405595": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "7480968533463196410": ["convolution_gpu_bfyx_gemm_like",2], + "13818587810073749596": ["convolution_gpu_bfyx_gemm_like",1], + "12700051513124813499": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "18386376129938707290": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "861419637283812778": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "11333068902248367382": ["convolution_gpu_bfyx_gemm_like",2], + "13219865669259079983": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "9700592037514669700": ["convolution_gpu_bfyx_gemm_like",2], + "10105539975183207700": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "4239415134522959352": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "9553032671453999824": ["convolution_gpu_bfyx_os_iyx_osv16",95], + "12170874893413205000": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "2096779676054335057": ["convolution_gpu_bfyx_gemm_like",2], + "8325686349100774855": ["convolution_gpu_bfyx_gemm_like",2], + "8413117662038329068": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "2904162348196990593": ["convolution_gpu_bfyx_gemm_like",1], + "17542176922797334839": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "3527012447011885981": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "5230406405159608187": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "8779947213821605681": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "12985650543127289023": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "7505966294864890221": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "1213958002895787672": ["convolution_gpu_bfyx_gemm_like",2], + "380316849107383484": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8575296926578119953": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "17641033958594901664": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17087740929472936216": ["convolution_gpu_bfyx_os_iyx_osv16",94], + "9105431502075531641": ["convolution_gpu_bfyx_gemm_like",2], + "17163158934005653629": ["convolution_gpu_bfyx_os_iyx_osv16",428], + "10794662801660960189": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "14579042972443651846": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "13403617010417893318": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "18242682488017822077": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "6149261133858739754": 
["convolution_gpu_bfyx_os_iyx_osv16",635], + "4185398348055518182": ["convolution_gpu_bfyx_os_iyx_osv16",420], + "13088023076667575514": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "14910223536998380801": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "14277843123789500234": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "1370827524176794227": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "12293705794290797805": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "3034947396960425753": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "11680829908738480957": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "316225690176910392": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "787363431787954804": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "17236135174912837061": ["convolution_gpu_bfyx_gemm_like",2], + "6851536988434597530": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "6612643056203714506": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "3446991010350155849": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15071888879264671307": ["convolution_gpu_bfyx_os_iyx_osv16",104], + "1228256819256996416": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "17118569850095586049": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "16201999154635899927": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "6235096928786525260": ["convolution_gpu_bfyx_os_iyx_osv16",337], + "11493371521058673700": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "7179714714302073459": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "3106710091841093202": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "699127221549844251": ["convolution_gpu_bfyx_gemm_like",2], + "7998930863626763670": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "11129224786768161139": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "4631772220201098020": ["convolution_gpu_bfyx_gemm_like",2], + "7536287105029319189": ["convolution_gpu_bfyx_os_iyx_osv16",1054], + "10412748832841674068": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7385295618478993079": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "5934841294975212773": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "14815498807515058447": ["convolution_gpu_bfyx_os_iyx_osv16",278], + "13773898185415904435": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "16997897512818072938": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "6769243149577568817": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "2995134938466176198": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6214194654733781771": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "13358283026528078900": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "11956435900037329302": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "12201437677145858979": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "16067821671414842756": ["convolution_gpu_bfyx_gemm_like",1], + "11191071895289217783": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6542417269641204414": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "9226443907548972870": ["convolution_gpu_bfyx_gemm_like",1], + "6948606378949354116": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "4652308622880770983": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3285688984628545255": ["fully_connected_gpu_fb_io_ref",1], + "17396226612787250663": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4695182996147218495": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "17235360775064303316": ["convolution_gpu_bfyx_gemm_like",2], + "14906458674793172507": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "6402941068107243403": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "12166710900466116000": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "17358006976602795707": ["convolution_gpu_bfyx_gemm_like",2], + "14359530849521980269": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "5680888227752935228": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "13288543822410746011": ["convolution_gpu_bfyx_gemm_like",1], + "1603703756241612948": ["convolution_gpu_bfyx_gemm_like",2], + "18133334552107213128": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "18084635102736402756": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "2820364088001594654": ["convolution_gpu_bfyx_os_iyx_osv16",573], + "14513925709624513868": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "13244693761392741931": ["fully_connected_gpu_fb_oi_ref",0], + "916389941321470163": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "12211848608269437730": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2806529556090896246": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "1706927777850488363": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "1898912620350738645": ["convolution_gpu_bfyx_gemm_like",2], + "5849577829817109757": ["convolution_gpu_bfyx_os_iyx_osv16",271], + "12811104880512633036": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "10736915975072972467": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15047163348308549816": ["convolution_gpu_bfyx_gemm_like",1], + "6673690359191617215": ["fully_connected_gpu_fb_oi_ref",1], + "14274685812676150168": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "14335074487552883436": ["convolution_gpu_bfyx_gemm_like",2], + "13019190248083899887": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "123251351612308092": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "9318652504803279936": ["convolution_gpu_bfyx_gemm_like",2], + "2345023488044002149": ["convolution_gpu_bfyx_os_iyx_osv16",668], + "9692949270906064580": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6489074577147494118": ["convolution_gpu_bfyx_gemm_like",1], + "8271034912009744989": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "6882621854468565774": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "2248754661513284642": ["convolution_gpu_bfyx_gemm_like",2], + "6865406633958213363": ["convolution_gpu_bfyx_gemm_like",2], + "14600118619533737293": ["fully_connected_gpu_fb_oi_ref",0], + "863952266514375915": ["convolution_gpu_bfyx_os_iyx_osv16",517], + "13014443130752087867": ["convolution_gpu_bfyx_os_iyx_osv16",458], + "3730207439375250056": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "17113350507039887381": ["convolution_gpu_bfyx_gemm_like",1], + "6604223938357238686": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2817383483458239293": ["convolution_gpu_bfyx_os_iyx_osv16",677], + "17692144048680858991": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15178327647765537565": ["convolution_gpu_bfyx_os_iyx_osv16",666], + "7544565739420583104": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8529571293598502239": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "3106591708459602370": ["convolution_gpu_bfyx_os_iyx_osv16",95], + "16328232350072955252": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "13583166868754499339": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "14746900092090885770": ["convolution_gpu_bfyx_gemm_like",2], + "1200162031019105686": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "4510003738155830628": ["convolution_gpu_bfyx_gemm_like",1], + "8057302050645780813": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "9389555743403158574": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6784146431605417954": ["convolution_gpu_bfyx_os_iyx_osv16",211], + 
"2502125887857336825": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "6114147683777615071": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "14084855778741260863": ["convolution_gpu_bfyx_gemm_like",2], + "9883719542550391149": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "6999530153839596796": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "13412296930014397060": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2062195022363480864": ["convolution_gpu_bfyx_gemm_like",1], + "10806992251978564302": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "9352385417006844121": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "4890932609897686394": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "877436308867220589": ["convolution_gpu_bfyx_gemm_like",2], + "9796621763733208035": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "484412270668341493": ["convolution_gpu_bfyx_gemm_like",1], + "15662207751131195569": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "3336303478756453360": ["convolution_gpu_bfyx_gemm_like",1], + "15183511809138557392": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "5733530388090903847": ["convolution_gpu_bfyx_gemm_like",2], + "9574931298183748343": ["convolution_gpu_bfyx_gemm_like",2], + "10995907213890714701": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "8751367574402839332": ["convolution_gpu_bfyx_os_iyx_osv16",677], + "18259787991864449280": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "1801731858063091191": ["convolution_gpu_bfyx_os_iyx_osv16",995], + "6373173636869473046": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6012477132351580695": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "16367495521884864886": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "13095408117538194584": ["convolution_gpu_bfyx_os_iyx_osv16",108], + "3020115657931277672": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "4941660917457387098": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "2903605246599054308": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "2238901105639912692": ["convolution_gpu_bfyx_os_iyx_osv16",477], + "1671347101986657824": ["convolution_gpu_bfyx_gemm_like",2], + "12274268980330855890": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "17079309368548171402": ["convolution_gpu_bfyx_gemm_like",1], + "12793908914872030220": ["convolution_gpu_bfyx_gemm_like",2], + "4684985181211883028": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "9882204352209412039": ["convolution_gpu_bfyx_gemm_like",1], + "14600700464602327710": ["convolution_gpu_bfyx_gemm_like",2], + "1682486914760867977": ["convolution_gpu_bfyx_gemm_like",2], + "5013936351898884291": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "8292979162428130363": ["convolution_gpu_bfyx_gemm_like",2], + "2564518461717467683": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "13613948678997524330": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12956726277674279950": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "919788620883613958": ["convolution_gpu_bfyx_os_iyx_osv16",464], + "18060514966005474708": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "13044020050176766314": ["convolution_gpu_bfyx_gemm_like",1], + "10720782649044333851": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "77073286362822723": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "1966540437574889257": ["convolution_gpu_bfyx_gemm_like",1], + "3715177305271762194": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "16076153317792960383": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "12960666483922103702": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2552187713769926425": ["convolution_gpu_bfyx_os_iyx_osv16",835], + 
"2264520082689779253": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "6220616397859143111": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10857084376518292379": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15487686565734149288": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "6647969101146756031": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12301464827222654105": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "5807196005360653656": ["convolution_gpu_bfyx_gemm_like",2], + "8560635685184432720": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "9694891301950867606": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "13345599888287912619": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2511072616914149110": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "15890749658785957481": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "14386256118128644729": ["convolution_gpu_bfyx_gemm_like",2], + "7806837641999814363": ["convolution_gpu_bfyx_gemm_like",2], + "12962558681443556219": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5164372816534616260": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "3706994659266083979": ["convolution_gpu_bfyx_os_iyx_osv16",559], + "12672995204641007004": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "751912075185318190": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "14895352662503433583": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "3889688816787688160": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "16499919609457089685": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "11825209936640729550": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "4366168099274266975": ["convolution_gpu_bfyx_os_iyx_osv16",172], + "11962541545116807979": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "17939745299931100048": ["convolution_gpu_bfyx_os_iyx_osv16",318], + "2543995971214089085": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "11583791752668920812": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "14116682822622440033": ["convolution_gpu_bfyx_gemm_like",1], + "15178012823756517910": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "14276876004054588508": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "15470013032930986062": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "7627882727285402176": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "14912119584313592912": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "1504867045084152953": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8488789346759658706": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2446257282140830646": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "17310332946322628458": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "14905705901815863508": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1553825475921110392": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "7335403151694644211": ["convolution_gpu_bfyx_gemm_like",1], + "2310159350914289605": ["convolution_gpu_bfyx_gemm_like",2], + "3782315919331102574": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14128599551956588603": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "16614678178197571772": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "16805562203348924108": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9354818521586974021": ["convolution_gpu_bfyx_gemm_like",2], + "3759057398165607194": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13739257060165119132": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "2423754482456771339": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "13565027847255501776": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16609136488331186895": ["convolution_gpu_bfyx_os_iyx_osv16",728], + 
"16364899406120840449": ["convolution_gpu_bfyx_os_iyx_osv16",398], + "17128760774072077101": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9358401110755269308": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8703758535351908295": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "10136297272678091418": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "13065517911798224579": ["convolution_gpu_bfyx_os_iyx_osv16",377], + "7722090560547236852": ["convolution_gpu_bfyx_gemm_like",1], + "2370837049876630969": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "9454512817077883797": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "1962479636209947761": ["convolution_gpu_bfyx_os_iyx_osv16",1052], + "16392283136103456949": ["convolution_gpu_bfyx_os_iyx_osv16",692], + "4438055737691342460": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2520734476651273971": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8569122574675372789": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "4505008254511324231": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "8159489372517869446": ["convolution_gpu_bfyx_os_iyx_osv16",892], + "11599990834682830362": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "17825953644228876369": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8885012252853227025": ["convolution_gpu_bfyx_gemm_like",1], + "8484526109354576450": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "17096175733187202673": ["convolution_gpu_bfyx_gemm_like",2], + "9596656797750683465": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "12883021432082543848": ["convolution_gpu_bfyx_gemm_like",1], + "16731107540370927220": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "7504074736798125353": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "17184638213817814424": ["convolution_gpu_bfyx_os_iyx_osv16",1113], + "18268811652302076976": ["convolution_gpu_bfyx_gemm_like",1], + "13681462437496627948": ["convolution_gpu_bfyx_direct_10_12_16",0], + "11091771531609585709": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2599817012641445801": ["convolution_gpu_bfyx_os_iyx_osv16",1052], + "15921072201288695017": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "11258182961445417799": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "6214312494103149808": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "1673458534805854479": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "10944997349682267106": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "14103112843209793966": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "2887152687927903549": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "2638131332283395057": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13017541921351620667": ["convolution_gpu_bfyx_gemm_like",2], + "17626938391567407401": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "517802466588815950": ["convolution_gpu_bfyx_gemm_like",2], + "2079476232214121671": ["convolution_gpu_bfyx_gemm_like",1], + "2225233951957105071": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "5644068493155655611": ["convolution_gpu_bfyx_gemm_like",2], + "5352861363832390974": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "16035563519857925932": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "14767888121198814523": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "8525704362451630717": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "12022980249970038824": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "1478419046264331178": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "5656623709782744241": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "17230103497915224469": ["convolution_gpu_bfyx_os_iyx_osv16",741], + 
"17666004363345457085": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "6224167817672480442": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12144421857685107073": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "6581494673640781863": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5461649843950745696": ["convolution_gpu_bfyx_gemm_like",2], + "3718980061704064547": ["convolution_gpu_bfyx_gemm_like",2], + "712420402191459810": ["convolution_gpu_bfyx_os_iyx_osv16",720], + "2968094709908141988": ["convolution_gpu_bfyx_os_iyx_osv16",3], + "11757919563609176713": ["convolution_gpu_bfyx_os_iyx_osv16",652], + "7808544677773370430": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "15997231252708686870": ["convolution_gpu_bfyx_gemm_like",2], + "12924910330295852704": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "7499082230554771515": ["convolution_gpu_bfyx_os_iyx_osv16",43], + "4702017956226464806": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "10532500300200244159": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "11298638173197050575": ["convolution_gpu_bfyx_os_iyx_osv16",942], + "5675497261720118479": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17658152048177750315": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "8787816339967963727": ["convolution_gpu_bfyx_os_iyx_osv16",240], + "13845827017732177448": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "14854353557342075292": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "8948718883406304307": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "10647227605517025377": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "7510055418609679364": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "2821441037530057414": ["convolution_gpu_bfyx_os_iyx_osv16",381], + "13524128602135083081": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "9707630588260222630": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "9181466280310872332": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "11148502358361704423": ["convolution_gpu_bfyx_gemm_like",1], + "7959969582538910953": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "6613282637922219205": ["convolution_gpu_bfyx_gemm_like",2], + "17739868787095417856": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "5609922876429907954": ["convolution_gpu_bfyx_gemm_like",2], + "12028030221272546172": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2371412124305478965": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12954154886708228545": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "4995510103045767117": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "14707884854112495064": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7323343770209750835": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "3292879092145281224": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "15592248516895826924": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9400558994532871122": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "15875968032394961531": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "17006133396401462698": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "4073467095502162430": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "16044646335477470657": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "14376192291828307385": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "6323026044750482867": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "5094419710576598497": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "14599780481362761532": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13085261987388297912": ["convolution_gpu_bfyx_gemm_like",1], + "7463657272687673896": ["convolution_gpu_bfyx_os_iyx_osv16",99], + 
"3789890554711038921": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3643250372952944907": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "12070592804878487941": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "1208483520611545642": ["convolution_gpu_bfyx_gemm_like",2], + "879005904827468163": ["convolution_gpu_bfyx_os_iyx_osv16",763], + "3217246278485567748": ["convolution_gpu_bfyx_gemm_like",2], + "2652267888871336297": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "5507708258753405429": ["convolution_gpu_bfyx_os_iyx_osv16",1043], + "9475812329914836280": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "10025893052937028511": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "15221712686851573528": ["convolution_gpu_bfyx_gemm_like",2], + "10892456883214928095": ["convolution_gpu_bfyx_os_iyx_osv16",93], + "4338023436590582323": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "3703292222363446463": ["convolution_gpu_bfyx_os_iyx_osv16",762], + "9608148784787572220": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "4036143655651874318": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "7371339724529362579": ["convolution_gpu_bfyx_gemm_like",2], + "16847817828600381030": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "15334769670416409064": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "3910549475873353422": ["convolution_gpu_bfyx_os_iyx_osv16",380], + "291868903926685441": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "13762814538289753428": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "12725647706191463348": ["convolution_gpu_bfyx_gemm_like",2], + "16070611944881238498": ["convolution_gpu_bfyx_os_iyx_osv16",884], + "9910414853336797922": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15180747404865201068": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "11060822686394981344": ["convolution_gpu_bfyx_gemm_like",1], + "18146184020578260553": ["convolution_gpu_bfyx_os_iyx_osv16",302], + "5319668297345215520": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "13328449155966085543": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4129586781834275070": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "16783619135298589974": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "9649533822873928984": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "9593975471009029134": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "10572208209982879914": ["convolution_gpu_bfyx_gemm_like",0], + "4999505377862312410": ["fully_connected_gpu_bf_io_gemm",2], + "16124702296533772526": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "6341197991729122563": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "14558850297291634005": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "1254745727978231148": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13283018618260255620": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "7720939595094113814": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "16992620579546408448": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "2030309697153345387": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "13352151930345854198": ["convolution_gpu_bfyx_os_iyx_osv16",275], + "2690771087990667627": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "9257078583742821465": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "1208243889917809864": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "7494124707566708728": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "13564654155363057485": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "15160322051545035612": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "8451212914744825089": ["convolution_gpu_bfyx_gemm_like",2], + "8929453032482114162": 
["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "1036010477232750453": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "7473012539094225392": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "16896863928108200897": ["convolution_gpu_bfyx_gemm_like",2], + "8176012042686275874": ["convolution_gpu_bfyx_os_iyx_osv16",863], + "11648841195768568983": ["convolution_gpu_bfyx_gemm_like",0], + "13831458435772917577": ["convolution_gpu_bfyx_gemm_like",2], + "9514210061704584354": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15378025640603637387": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "16852207712205172744": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15132518566122695317": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "1168311873250200110": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "3541828356667081528": ["convolution_gpu_bfyx_gemm_like",1], + "9524663472084054050": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16403423801823379909": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "7431237779891953779": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "3430266954211750407": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "9197931868200777891": ["convolution_gpu_bfyx_os_iyx_osv16",997], + "2344498602308448450": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "9451273689649467046": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "6878922067845522655": ["convolution_gpu_bfyx_os_iyx_osv16",616], + "17242820574559628535": ["convolution_gpu_bfyx_gemm_like",1], + "15452996816194024433": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "70244312667395170": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "11919129623429545762": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "17795358440179122086": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "8263822658108674162": ["convolution_gpu_bfyx_direct_10_12_16",0], + "12860222041026638681": ["convolution_gpu_bfyx_gemm_like",2], + "2152903140704848574": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "6735135795253013220": ["convolution_gpu_bfyx_gemm_like",2], + "18092842590142527927": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "5215755301612973095": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "4122312805832663323": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11619548409913646265": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "2108296560864415762": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "13026555349791486777": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "912423125050985716": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "17281198415161259885": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "2110090486638190463": ["convolution_gpu_bfyx_os_iyx_osv16",139], + "3240428557350945267": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "13491655481292956895": ["convolution_gpu_bfyx_gemm_like",1], + "2343921093633784755": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "3148053731303748054": ["convolution_gpu_bfyx_gemm_like",2], + "16404059675217592817": ["fully_connected_gpu_fb_oi_ref",1], + "12160764253455777655": ["convolution_gpu_bfyx_os_iyx_osv16",1114], + "9034951536385533818": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "17377315194963069204": ["fully_connected_gpu_fb_oi_ref",1], + "12756432707088842236": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "523055954326631884": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17850932752450917677": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "14973411884734235059": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "16229324496308453344": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "10736892779278378335": 
["convolution_gpu_bfyx_os_iyx_osv16",1096], + "11261619081095309088": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "13368477378531148593": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "5401523175111660554": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9802832901508552733": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11361013180071053597": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "269334626439013799": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "17970855913877771858": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "18332090297993015499": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "3665837617379468265": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "8316848551837633169": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "17807033661138518449": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "6571473790090353005": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "499739705596245675": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "9761573038170759563": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "11830297960718214360": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "54975980454651672": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "5603409300903611279": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14332388011233886083": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "6673753637296082820": ["convolution_gpu_bfyx_gemm_like",2], + "8528886126454874796": ["convolution_gpu_bfyx_gemm_like",1], + "10946069941293798874": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "10054253863699485503": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "10690972785852373520": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "9416285845239621878": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "9042812985530274425": ["convolution_gpu_bfyx_gemm_like",2], + "12671153706040443724": ["convolution_gpu_bfyx_os_iyx_osv16",53], + "12705054744767500423": ["fully_connected_gpu_fb_io_ref",1], + "8503207028307570404": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "5049534591553232781": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6456426339461437148": ["convolution_gpu_bfyx_gemm_like",1], + "1289009275012699560": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "3965871278597751318": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13744951984978188201": ["fully_connected_gpu_fb_io_ref",1], + "13728180355108851541": ["convolution_gpu_bfyx_gemm_like",2], + "4524347845016978037": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "4871907623235871050": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "13059207969254830451": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "5011273172385428756": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "12283317230112506089": ["convolution_gpu_bfyx_gemm_like",2], + "4481903208484313806": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "10175721494218314250": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10432687907685994204": ["convolution_gpu_bfyx_gemm_like",1], + "13614921331048223116": ["convolution_gpu_bfyx_gemm_like",2], + "15447513376965243034": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "3041612155708729812": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "9765339420071627045": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "10660230104888153758": ["convolution_gpu_bfyx_gemm_like",2], + "12386930130408773521": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6706491729783125139": ["convolution_gpu_bfyx_gemm_like",1], + "12675858428585873471": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6293403765897901528": ["convolution_gpu_bfyx_gemm_like",2], + "9888097487468905169": ["convolution_gpu_bfyx_gemm_like",2], + 
"9416186718345824095": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "4149728557142033774": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "466868648178437688": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "13464226348405628455": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "7282751412088726760": ["convolution_gpu_bfyx_os_iyx_osv16",1124], + "14270450799210365812": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7518734167761579102": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9854440591497995284": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "13405310261845268772": ["convolution_gpu_bfyx_gemm_like",2], + "7715520469947900684": ["convolution_gpu_bfyx_os_iyx_osv16",571], + "16408015571155576773": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4783126652984096700": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "13388004363210658650": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5170245731599664670": ["convolution_gpu_bfyx_os_iyx_osv16",252], + "14256842018830898376": ["convolution_gpu_bfyx_os_iyx_osv16",41], + "16114623916610925741": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "10397253349562394184": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "8007667797556094444": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "7368916076070115064": ["convolution_gpu_bfyx_os_iyx_osv16",240], + "15129834325410878425": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "18417880214901227799": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "4722824701199486161": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "1033385936344875354": ["convolution_gpu_bfyx_gemm_like",2], + "17011927973643184196": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15212317205888563836": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "13802834658447955377": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "6527268791835193134": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "10918743320372308981": ["convolution_gpu_bfyx_gemm_like",2], + "2737840613867456953": ["convolution_gpu_bfyx_gemm_like",2], + "269829518575229806": ["convolution_gpu_bfyx_gemm_like",2], + "2944333966072327932": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "10670103699537731664": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "12744887771237881196": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "1242366856673194709": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "17753585752923130911": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "7282595712912388754": ["convolution_gpu_bfyx_os_iyx_osv16",189], + "6985970932645412773": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "13192808619929896995": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "7930154826818165796": ["convolution_gpu_bfyx_gemm_like",2], + "17994361454416813294": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2150326211917340956": ["convolution_gpu_bfyx_gemm_like",2], + "6953478877896677022": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "7134654288295280046": ["convolution_gpu_bfyx_os_iyx_osv16",49], + "10607904718265020949": ["convolution_gpu_bfyx_gemm_like",2], + "14719421757340260468": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "8008513163448840421": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13221156296791499146": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "15391215077224693736": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "704262295684441748": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "11455732989503244360": ["convolution_gpu_bfyx_os_iyx_osv16",216], + "18424400171776141118": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "4286652913945761799": ["convolution_gpu_bfyx_os_iyx_osv16",718], + "5379608399492828685": 
["convolution_gpu_bfyx_gemm_like",1], + "4614700272179482173": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "15352245788978088971": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "7441139786825555264": ["convolution_gpu_bfyx_os_iyx_osv16",338], + "397770940444464146": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "202304354656398848": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "11962382064404466630": ["convolution_gpu_bfyx_gemm_like",1], + "5301440603380967612": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "12018398218876712811": ["convolution_gpu_bfyx_os_iyx_osv16",674], + "10898684230183205955": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "2752322006160986801": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15660316437768312006": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "15668791697154389130": ["convolution_gpu_bfyx_gemm_like",1], + "1139581213977408268": ["fully_connected_gpu_fb_io_ref",2], + "6649759230117795192": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "5244441996055494170": ["convolution_gpu_bfyx_os_iyx_osv16",1113], + "9263063714383940562": ["convolution_gpu_bfyx_os_iyx_osv16",62], + "11070696274716018686": ["convolution_gpu_bfyx_os_iyx_osv16",570], + "18068050257421269408": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "11207257238719531888": ["convolution_gpu_bfyx_gemm_like",2], + "8146906136296114696": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "435261825003875448": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "8922463054055280800": ["convolution_gpu_bfyx_gemm_like",1], + "13674246753382740056": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14189775376370027482": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "8254412626112343365": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13596494923128445274": ["convolution_gpu_bfyx_gemm_like",2], + "7085416207166146240": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "10320711719466983961": ["convolution_gpu_bfyx_gemm_like",2], + "18310667924071639899": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "13624106485902414324": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "18377298651236993830": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "15566108481408840783": ["convolution_gpu_bfyx_gemm_like",2], + "15225331270926229394": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "13659291428095454839": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16932090423428476170": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "1882912836250239503": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "10414903047695486119": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "3442073007560756473": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "3609233164979051271": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "13108356579957761944": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "14823616678465136590": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16027853591907232537": ["convolution_gpu_bfyx_gemm_like",1], + "14446344744130895614": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "17924819398394001587": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "9622546530872848323": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "1608378717397996752": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "393884269158067083": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "14903430454784452446": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "9311802150474489673": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "15293727142789007900": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "11079710960007068860": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "11815825155082424936": 
["convolution_gpu_bfyx_os_iyx_osv16",717], + "2367791050032803116": ["convolution_gpu_bfyx_os_iyx_osv16",371], + "11868789283464117390": ["convolution_gpu_bfyx_gemm_like",2], + "11207578758583923357": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "17368161816774674256": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "2490155559809645659": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "4551182180668229945": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "9001645663675631429": ["fully_connected_gpu_yxfb_ref",2], + "18191573176587760698": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "6027350558532160900": ["convolution_gpu_bfyx_gemm_like",2], + "11229587372764249222": ["convolution_gpu_bfyx_gemm_like",2], + "15838058479520696173": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "7318929661124340248": ["convolution_gpu_bfyx_gemm_like",0], + "3177915003579216846": ["convolution_gpu_bfyx_os_iyx_osv16",675], + "7052552351421332490": ["convolution_gpu_bfyx_gemm_like",2], + "13176385389367548697": ["convolution_gpu_bfyx_gemm_like",1], + "5589785455223385189": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5163965164859517893": ["convolution_gpu_bfyx_gemm_like",2], + "2268291720177538378": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "13205973783895006074": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "12892693137085610062": ["convolution_gpu_bfyx_os_iyx_osv16",362], + "11553355518677163509": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "14108113294744119367": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "4161141078006269526": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12929981792125924963": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "4673618329986777239": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "14287890401250603057": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "3448477246688526708": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "18172711677056449158": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "18202222342562516071": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "7678457226823073886": ["convolution_gpu_bfyx_os_iyx_osv16",91], + "2532962442388536022": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "14433662482531248989": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "2335428826699999827": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "82249723699159955": ["convolution_gpu_bfyx_os_iyx_osv16",625], + "16547425454653232058": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "15851356529373376076": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "16613907066461513431": ["convolution_gpu_bfyx_gemm_like",0], + "11725629762660987217": ["convolution_gpu_bfyx_gemm_like",1], + "706370730287471796": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "8171897258557801015": ["convolution_gpu_bfyx_gemm_like",1], + "15959241441689395955": ["convolution_gpu_bfyx_os_iyx_osv16",680], + "14585370009659482450": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "2261453441277654139": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "15838114628203742383": ["convolution_gpu_bfyx_gemm_like",2], + "2399812257701033542": ["convolution_gpu_bfyx_gemm_like",2], + "7962383460496540840": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8127570953237266335": ["fully_connected_gpu_bf_io_input_spatial",0], + "3828988304073539836": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "11307531462784240962": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "14838067105091112485": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "8207349115037232863": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "4790599496008369129": ["convolution_gpu_bfyx_os_iyx_osv16",591], + 
"10358359789382196576": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "16073578125651112218": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "17405865057155583042": ["convolution_gpu_bfyx_gemm_like",1], + "8312903198090907576": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "10173382130572498594": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "331390460560782085": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12353956380178079089": ["convolution_gpu_bfyx_gemm_like",2], + "14907097142953816744": ["convolution_gpu_bfyx_gemm_like",1], + "1811357700607919311": ["convolution_gpu_bfyx_direct_10_12_16",0], + "11986642867827682648": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "14686278683380845546": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "17006095064160484022": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "11114015660322254541": ["convolution_gpu_bfyx_gemm_like",1], + "6420851258772300332": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "14793709237400480942": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "15823825508128158158": ["convolution_gpu_bfyx_gemm_like",2], + "12569856169024791306": ["convolution_gpu_bfyx_gemm_like",2], + "2001464747481073870": ["convolution_gpu_bfyx_gemm_like",1], + "8863398172720091880": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "15148625184033310404": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "10624246057883518638": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "6730474465453860479": ["convolution_gpu_bfyx_os_iyx_osv16",1039], + "10073439287681954518": ["convolution_gpu_bfyx_gemm_like",2], + "16461809076899645037": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "15947699374684516369": ["convolution_gpu_bfyx_gemm_like",2], + "5448537627319798272": ["convolution_gpu_bfyx_os_iyx_osv16",523], + "15465799788109255561": ["convolution_gpu_bfyx_gemm_like",2], + "11757953304204716753": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "1306339989221885682": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "3198726093355425150": ["convolution_gpu_bfyx_gemm_like",2], + "962311766200741205": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16728826595086368897": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "1147744092130296563": ["convolution_gpu_bfyx_gemm_like",1], + "7146559117784312265": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "388828310152538138": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "13073788277284969422": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "2305461098719675735": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "17521647426452186921": ["convolution_gpu_bfyx_os_iyx_osv16",276], + "5433618404351968121": ["convolution_gpu_bfyx_gemm_like",2], + "17794162443307839614": ["convolution_gpu_bfyx_gemm_like",1], + "16440598510199834213": ["convolution_gpu_bfyx_os_iyx_osv16",121], + "18009765676050504407": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "3509811595028801757": ["convolution_gpu_bfyx_os_iyx_osv16",131], + "11855070245618904113": ["convolution_gpu_bfyx_os_iyx_osv16",183], + "1109243878358317937": ["convolution_gpu_bfyx_os_iyx_osv16",1062], + "7254869458810021127": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "12615462894236933223": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "11926378988530133568": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "2930545263523345204": ["convolution_gpu_bfyx_os_iyx_osv16",542], + "7630776235327261710": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "7824524940405130010": ["convolution_gpu_winograd_6x3_s1_fused",2], + "13787118639037730152": ["convolution_gpu_bfyx_os_iyx_osv16",298], + "404419072921281472": 
["convolution_gpu_bfyx_os_iyx_osv16",86], + "4135003545872878882": ["convolution_gpu_bfyx_os_iyx_osv16",197], + "11723735945517472199": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "17749857812061795980": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "2287356884312581209": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "4101449235783342476": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "14385181780082014495": ["convolution_gpu_bfyx_gemm_like",2], + "6013434489252641471": ["convolution_gpu_bfyx_direct_10_12_16",0], + "8175595372513695437": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "15092483859565823523": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3503236715353689942": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "10831460252334010668": ["convolution_gpu_bfyx_gemm_like",2], + "14681717813022425567": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "14898892437285105327": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "6157727013102138824": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "9823997593704517392": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "3223726179820717808": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "10033076377998157101": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "2571778193407799664": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12668149981216388765": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "13769943652297353544": ["convolution_gpu_bfyx_os_iyx_osv16",717], + "16031140952379208074": ["convolution_gpu_bfyx_gemm_like",2], + "6128534975733321186": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10273183900108661041": ["convolution_gpu_bfyx_gemm_like",2], + "8316011587868622301": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "905780459938651623": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "13793441296561946357": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "3218248162832023196": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "11907507085694711513": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "2004120786408087671": ["convolution_gpu_bfyx_gemm_like",2], + "17515573322312447679": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "5162737590442940024": ["convolution_gpu_bfyx_gemm_like",1], + "10906417366145323499": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "11992158790035075804": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "7606097739225472283": ["convolution_gpu_bfyx_gemm_like",2], + "4553409514380460123": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "7753336153932360422": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "6549150139619174585": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "11327097771110264965": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "505102470055903237": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "18233660940545931789": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "157852787707383962": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "8909239203149651260": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14537109978413728476": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "16290626406346691996": ["convolution_gpu_bfyx_os_iyx_osv16",767], + "17420288204511371476": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12570087709404311189": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "6210483922262161762": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "994182747184593564": ["convolution_gpu_winograd_6x3_s1_fused",2], + "11883941040326858829": ["convolution_gpu_bfyx_os_iyx_osv16",554], + "2324120381399737261": ["convolution_gpu_bfyx_os_iyx_osv16",111], + "3199841714087553410": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "8132521728369930959": 
["convolution_gpu_bfyx_gemm_like",2], + "17303981366934280174": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "2597523728660247862": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "4127717437639868970": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "17981604038340576961": ["convolution_gpu_bfyx_gemm_like",1], + "4301372734564127254": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4285475880886685878": ["convolution_gpu_bfyx_gemm_like",2], + "2086001721804797157": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "12179581684777023804": ["convolution_gpu_bfyx_gemm_like",2], + "16184979150665364486": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "1934379409955686502": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "11655994466278963438": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "1945630503883822822": ["convolution_gpu_bfyx_gemm_like",1], + "15232673324549539143": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "12952980509662451384": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "6860503758000008398": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "6661117204204077150": ["convolution_gpu_bfyx_gemm_like",2], + "10384416235770656262": ["convolution_gpu_bfyx_gemm_like",1], + "13716836930727272782": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "3819763245853861272": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "3007637520820789085": ["convolution_gpu_bfyx_os_iyx_osv16",111], + "7345632855842905966": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2571186327837339204": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "9194788897910888066": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "9996196793804333253": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "11246470701714560770": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "8212533074856783509": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "33889407315234685": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "17242442529374722270": ["fully_connected_gpu_fb_oi_ref",1], + "7496699438957793920": ["convolution_gpu_bfyx_gemm_like",2], + "8375465895534833097": ["convolution_gpu_bfyx_os_iyx_osv16",718], + "6476949395889340429": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "18187262802267413585": ["fully_connected_gpu_fb_io_ref",1], + "9454146598828084176": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "4241640917176830862": ["convolution_gpu_bfyx_gemm_like",2], + "10446500827044060319": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "7908036427091174081": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "12813978452097969536": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "2012181953284568566": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "6948147789605707774": ["fully_connected_gpu_fb_io_ref",2], + "18159049252673770569": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "10904228118889057467": ["convolution_gpu_bfyx_gemm_like",2], + "14266210014132784194": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "5587539329568150667": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "10098661517988566506": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "5519244962044894877": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "14217181622713951411": ["convolution_gpu_bfyx_gemm_like",2], + "11777373751892075391": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "17575293085957492821": ["convolution_gpu_bfyx_gemm_like",2], + "7145194061073256844": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "7243161613448507792": ["convolution_gpu_bfyx_gemm_like",1], + "11559360678008060513": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "2056597791109604534": ["convolution_gpu_bfyx_gemm_like",2], + "2873387231297790075": 
["convolution_gpu_bfyx_os_iyx_osv16",361], + "4243114942173293897": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "18232408112396439386": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "14335423820860953927": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "10947686124973711385": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "1187224156936080964": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "5759260743809103651": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "14759179293743468995": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "16622402936526588344": ["convolution_gpu_bfyx_os_iyx_osv16",726], + "16541722316343690197": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "9061025737181218101": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "17104611871050967957": ["convolution_gpu_winograd_6x3_s1_fused",2], + "670951751279091662": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "13133323947490009546": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "10424278617647597641": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "6551173574001309451": ["convolution_gpu_bfyx_gemm_like",1], + "397445657349822499": ["convolution_gpu_bfyx_gemm_like",2], + "18251360413872841969": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "17016846635668370921": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "10898210758890334465": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "11684927349056930189": ["convolution_gpu_bfyx_os_iyx_osv16",344], + "332090597573908506": ["convolution_gpu_bfyx_gemm_like",1], + "4682428771166816734": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "18006581941186887676": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "5245308722062496788": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "15661322183507404821": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "12773693193167844110": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "14634044133573461949": ["convolution_gpu_bfyx_gemm_like",2], + "7714783879762659458": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "9806689250758752070": ["convolution_gpu_bfyx_gemm_like",0], + "18005721959893562716": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "3166885953206195915": ["convolution_gpu_bfyx_gemm_like",2], + "4574242607119408140": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "3930314908786112883": ["convolution_gpu_bfyx_gemm_like",2], + "531020979837645217": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "11868551452004726281": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9666426531743983113": ["convolution_gpu_bfyx_os_iyx_osv16",1068], + "12557015880639217508": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "262113403359175565": ["convolution_gpu_bfyx_os_iyx_osv16",419], + "4634475069086874260": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "9397711809671506538": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12008952324872799824": ["convolution_gpu_bfyx_gemm_like",2], + "1907439276166837309": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "13314092088416047551": ["fully_connected_gpu_yxfb_ref",1], + "12081835728078383819": ["fully_connected_gpu_bf_io_input_spatial",2], + "13071064509662090710": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "15928746165235747659": ["convolution_gpu_bfyx_gemm_like",2], + "14546281065004619074": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "8725673763972618034": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "7819934200255007163": ["fully_connected_gpu_fb_oi_ref",2], + "13051342120933385671": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "6181308879301978465": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "17664704673433112966": ["convolution_gpu_bfyx_os_iyx_osv16",86], 
+ "5353170440534073482": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "12214162812589030126": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "6093575518270471235": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "8240616667079698459": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "4600261954762222519": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "7070374681687005676": ["convolution_gpu_bfyx_gemm_like",1], + "16968664807495872526": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "18404344881797725263": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "5267143428977695208": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "10700011669103135203": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "7811861756798601201": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "428659495445490820": ["convolution_gpu_bfyx_os_iyx_osv16",925], + "1056494963618130644": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "962676948282027870": ["fully_connected_gpu_fb_io_ref",2], + "5597908143491399643": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "2788116002380533417": ["convolution_gpu_bfyx_gemm_like",2], + "10378966564497668941": ["convolution_gpu_bfyx_os_iyx_osv16",283], + "7086574330273897976": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "2732519635571994212": ["convolution_gpu_bfyx_os_iyx_osv16",987], + "16244270858428653037": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "11970466555294072275": ["convolution_gpu_bfyx_gemm_like",2], + "4586633477264151844": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "14733510474010040334": ["convolution_gpu_bfyx_gemm_like",2], + "2659712601063515059": ["convolution_gpu_winograd_6x3_s1_fused",2], + "10293186062391000719": ["convolution_gpu_bfyx_os_iyx_osv16",755], + "6547565989244888354": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "218477594596081189": ["convolution_gpu_bfyx_os_iyx_osv16",969], + "5834006438103071406": ["convolution_gpu_bfyx_gemm_like",2], + "15649927926091502215": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "12461575861709234385": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "1592994755823247500": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "2526832080529662683": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "1922168904767469999": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "2809950092498355574": ["convolution_gpu_bfyx_os_iyx_osv16",1055], + "15718011075217705480": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "13485140643204970345": ["convolution_gpu_bfyx_gemm_like",1], + "2664944425727769475": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "580936360000782237": ["fully_connected_gpu_bf_io_input_spatial",1], + "12314918602191412697": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "14990645740260870030": ["convolution_gpu_bfyx_os_iyx_osv16",846], + "11341771589317480665": ["convolution_gpu_bfyx_os_iyx_osv16",1065], + "6133854782246597175": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "7394848434332739139": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "9937387440035377216": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "11804035561861841621": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "18245935804520236353": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9352866803638271156": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "13324157125165576832": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "1894591633696862066": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "17580933462801685507": ["convolution_gpu_bfyx_gemm_like",1], + "5408469943982199754": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11913020016435860608": ["convolution_gpu_bfyx_os_iyx_osv16",793], + 
"15260448822338206631": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "12492763342322011136": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "16758697697363920520": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "14975859027256879948": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "615833743936753727": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5419775002149092646": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5982637097503543357": ["convolution_gpu_bfyx_gemm_like",2], + "9025790715924779508": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "17078700948595127028": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5801429077171542466": ["convolution_gpu_bfyx_os_iyx_osv16",94], + "10662239532841666965": ["convolution_gpu_bfyx_gemm_like",2], + "11049130623091275457": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "7921388663815287395": ["convolution_gpu_bfyx_gemm_like",2], + "3811462129131022619": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3555204322491340337": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "509781001842353609": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "13047793996728441528": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "4047806462440750215": ["convolution_gpu_bfyx_gemm_like",2], + "7524311370696987092": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "12518571127411736885": ["convolution_gpu_bfyx_gemm_like",2], + "17050675313067213312": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "13200151444914751729": ["convolution_gpu_bfyx_os_iyx_osv16",508], + "2802810524370514276": ["convolution_gpu_bfyx_gemm_like",1], + "12248119734016401633": ["fully_connected_gpu_fb_io_ref",1], + "7671016314869993705": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "8054562515577756499": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "10732225577823701543": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "2836903620603494117": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "1650080413259413393": ["convolution_gpu_bfyx_gemm_like",2], + "7864880361674128748": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "15123868617509445149": ["convolution_gpu_winograd_6x3_s1_fused",2], + "2903075619523363020": ["convolution_gpu_bfyx_os_iyx_osv16",835], + "14211549589070739656": ["convolution_gpu_bfyx_direct_10_12_16",0], + "8749468546606972791": ["convolution_gpu_bfyx_gemm_like",2], + "4840004190985490064": ["convolution_gpu_bfyx_gemm_like",2], + "148355059345569721": ["convolution_gpu_bfyx_os_iyx_osv16",691], + "4304943753428518690": ["convolution_gpu_bfyx_gemm_like",1], + "17318287523550546026": ["convolution_gpu_bfyx_gemm_like",2], + "15364374265752682266": ["convolution_gpu_bfyx_os_iyx_osv16",894], + "5136111979773513341": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "15667487381692577290": ["convolution_gpu_bfyx_os_iyx_osv16",878], + "482564204402769504": ["convolution_gpu_bfyx_gemm_like",1], + "5983808817108775912": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "14849708746319190277": ["convolution_gpu_bfyx_gemm_like",2], + "4646795194660982475": ["convolution_gpu_bfyx_gemm_like",2], + "94012300876418257": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "15786313441300512560": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "1895945774251432343": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6512006285490280576": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14026570177552137240": ["convolution_gpu_bfyx_gemm_like",2], + "15890473622821659630": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "3565702695809105495": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "12610854610554906160": 
["convolution_gpu_bfyx_os_iyx_osv16",1101], + "3895088069642140043": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2100387626452428743": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "1362540464632328798": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13012283016751495099": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "436514945529747349": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "4191326605459754690": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6719956770229212208": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "12692563384795319282": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "5933743119393822386": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "12523676912856063091": ["convolution_gpu_bfyx_os_iyx_osv16",554], + "14744368497944610864": ["convolution_gpu_bfyx_direct_10_12_16",2], + "868177350337221377": ["convolution_gpu_bfyx_direct_10_12_16",2], + "832976844701988460": ["convolution_gpu_bfyx_gemm_like",1], + "14034487492239603874": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12669547093826826335": ["convolution_gpu_bfyx_os_iyx_osv16",1025], + "5947492124433175601": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "13276867073526485069": ["convolution_gpu_bfyx_gemm_like",2], + "528618206870447012": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "488298169768725160": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "7426788519998680898": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8961138963663532667": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "9803306661531470015": ["fully_connected_gpu_fb_io_ref",2], + "6476480727582657308": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "16774728502960825097": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "17419874083634480896": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "6517802281521111563": ["convolution_gpu_bfyx_gemm_like",1], + "10652512666086843369": ["convolution_gpu_bfyx_gemm_like",2], + "1452841775482537260": ["convolution_gpu_bfyx_gemm_like",2], + "6204725118764552662": ["convolution_gpu_bfyx_gemm_like",1], + "16285256723517297210": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14852990574796128305": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "8550783999616052522": ["convolution_gpu_bfyx_gemm_like",2], + "5420766967862917815": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "5733701901687257088": ["convolution_gpu_bfyx_gemm_like",2], + "15860915170591763391": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "6089202061701179659": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "16443833779968719790": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "12812685418923919055": ["convolution_gpu_bfyx_os_iyx_osv16",764], + "13224814158106791463": ["convolution_gpu_bfyx_gemm_like",2], + "15888454525088587794": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "8116504545035982006": ["convolution_gpu_bfyx_os_iyx_osv16",880], + "5275016494706355806": ["convolution_gpu_bfyx_os_iyx_osv16",832], + "18203935818408469865": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "10408322429232132983": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "15585700465988560560": ["convolution_gpu_bfyx_os_iyx_osv16",1096], + "9127066823698894015": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "5961488595080209440": ["convolution_gpu_bfyx_gemm_like",2], + "4665029580355133140": ["convolution_gpu_bfyx_gemm_like",2], + "1939140810847988694": ["convolution_gpu_bfyx_gemm_like",1], + "5845969526791988973": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6635217802203685464": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "12307446289692143781": ["convolution_gpu_bfyx_os_iyx_osv16",931], + 
"5251771557248725731": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "13758938418512211194": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "12700008320838073774": ["convolution_gpu_bfyx_gemm_like",2], + "14164778301660100413": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "12711558966638028352": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "69439315851965666": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "12522364636280164681": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "18369668865072009928": ["convolution_gpu_bfyx_gemm_like",2], + "727203296169504486": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "4703107905652287491": ["convolution_gpu_bfyx_gemm_like",2], + "12129572274423886770": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "5214678408335388758": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17835134875461003221": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "8465142022921853516": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15192024816519005250": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "11599932445375240727": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "4601800315090684242": ["convolution_gpu_bfyx_gemm_like",2], + "18382226420077875582": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14459249705747952583": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12411228585189337571": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "8124736388338424498": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "8995892222116060827": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "598390166442977699": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "15320845027635796583": ["convolution_gpu_bfyx_gemm_like",2], + "12310462218432530363": ["convolution_gpu_bfyx_gemm_like",0], + "9776332064497085361": ["convolution_gpu_bfyx_gemm_like",2], + "9993925424761661218": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "17824431042110985323": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "17001492460236540325": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9454457647272059910": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "4578587579993676820": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "17599383258252980421": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "16113302464937833403": ["convolution_gpu_bfyx_os_iyx_osv16",655], + "17825280904760131680": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "1999892441424036372": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "13074593348097634731": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "17392732266843821039": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "2966185891283165994": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "14566257978356851712": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15783329079045263237": ["convolution_gpu_bfyx_gemm_like",1], + "9547451431091729288": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "15149336254307320187": ["convolution_gpu_bfyx_gemm_like",2], + "10971070835319242371": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "3961000444895975975": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9513545197321447870": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "6980201892073961793": ["convolution_gpu_bfyx_os_iyx_osv16",852], + "13031027103925431505": ["convolution_gpu_bfyx_gemm_like",2], + "16583563382485459718": ["convolution_gpu_bfyx_gemm_like",1], + "4858337483345561292": ["convolution_gpu_bfyx_gemm_like",2], + "6536333665377249409": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8374409021681741916": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "2307629242354292362": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "7670176887560273910": 
["convolution_gpu_bfyx_1x1",2], + "1847170421455825520": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "17407904982433770732": ["convolution_gpu_bfyx_gemm_like",1], + "2460415719642436412": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "7877332346656934022": ["convolution_gpu_bfyx_os_iyx_osv16",678], + "11437885274663749440": ["convolution_gpu_bfyx_os_iyx_osv16",806], + "5032195346490064156": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "18077281411861416889": ["convolution_gpu_bfyx_os_iyx_osv16",1044], + "7527175223662342321": ["convolution_gpu_bfyx_gemm_like",1], + "68637843533109734": ["convolution_gpu_bfyx_gemm_like",1], + "8501760360687221821": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "8906588133431586825": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "5890599002797783437": ["convolution_gpu_bfyx_os_iyx_osv16",1089], + "12232696287029987946": ["convolution_gpu_bfyx_os_iyx_osv16",459], + "16981010901052181199": ["convolution_gpu_bfyx_os_iyx_osv16",832], + "3499109651698979012": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "13636129806349817264": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "14900099988131599740": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "17867620992288101450": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "621272125402238670": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "12408889192918919210": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "13497225521878034159": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13155901262605819372": ["convolution_gpu_bfyx_os_iyx_osv16",292], + "5040944983588288886": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "10897622326486559468": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "9065894438656900887": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "15356995665520295246": ["convolution_gpu_bfyx_gemm_like",0], + "17907732260451873185": ["convolution_gpu_bfyx_gemm_like",2], + "13762042713029963144": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "15365628642332393565": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15777551868644801538": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "4304041922043496030": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "385046297070779752": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "17680403286850504499": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "4833761011498696645": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "17601171646153308079": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "8204962103567653154": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "13974740392602492680": ["convolution_gpu_bfyx_gemm_like",2], + "2712946943923358377": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "5367634698951188749": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "15361186788588226064": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "95993272253183796": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2173649669339714890": ["convolution_gpu_bfyx_os_iyx_osv16",653], + "14355612297330229277": ["convolution_gpu_bfyx_gemm_like",2], + "10888435127006141874": ["convolution_gpu_bfyx_os_iyx_osv16",645], + "17754836801944078461": ["convolution_gpu_bfyx_gemm_like",2], + "5608447459568229694": ["convolution_gpu_bfyx_direct_10_12_16",2], + "659846949368492111": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "2850118175701764737": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "17093159649157277089": ["convolution_gpu_bfyx_gemm_like",2], + "277410555520090949": ["convolution_gpu_bfyx_gemm_like",0], + "7975810844103449438": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "18218631037214746168": 
["convolution_gpu_bfyx_gemm_like",2], + "10612049417873776481": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "16884396694505987920": ["convolution_gpu_bfyx_os_iyx_osv16",139], + "13191096881934434519": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9987939079053625302": ["convolution_gpu_bfyx_gemm_like",2], + "18341524156838963264": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "17784882947271841103": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "9751582946441607796": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "7941729567451949422": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "5994204139128667921": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "13367043015761260275": ["convolution_gpu_bfyx_gemm_like",0], + "16037141448095945650": ["convolution_gpu_bfyx_os_iyx_osv16",417], + "1545105800386716684": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "12983461576274227638": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "9747165558500755104": ["convolution_gpu_bfyx_gemm_like",0], + "12793814016409887162": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "15653223776766070604": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "9194441947620820715": ["convolution_gpu_bfyx_os_iyx_osv16",616], + "12421707187947291166": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "15471470494305051299": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "10702234389482091891": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "15329084374930297871": ["convolution_gpu_bfyx_gemm_like",2], + "18253784177599134876": ["convolution_gpu_bfyx_os_iyx_osv16",938], + "7509199936979430017": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4553508439536472227": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "6638696743420807294": ["convolution_gpu_bfyx_gemm_like",2], + "2702144517025248597": ["convolution_gpu_bfyx_gemm_like",2], + "1720057192283799086": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "4542143431130171516": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "6205240287062600210": ["convolution_gpu_bfyx_gemm_like",2], + "13809218391763818477": ["convolution_gpu_bfyx_gemm_like",2], + "2129726780118554358": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "9261867808456596636": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "16568662638983972991": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "10323345824599612614": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12024318713420323349": ["convolution_gpu_bfyx_gemm_like",2], + "7831542641855749925": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13356152596085257346": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "10109431802089940590": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "10398572248321217585": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12584870629297848143": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "2198100074518629980": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "1552088062654417187": ["convolution_gpu_bfyx_os_iyx_osv16",85], + "5485749317130402302": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "4407683781177409314": ["convolution_gpu_bfyx_gemm_like",2], + "16747069131271457481": ["convolution_gpu_bfyx_os_iyx_osv16",854], + "534032316469702287": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "2213068950786625268": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "17400844732252600825": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "7400370437512056636": ["convolution_gpu_bfyx_gemm_like",2], + "1436830013293669148": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8243230863677884952": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "4750897775273897282": 
["convolution_gpu_bfyx_os_iyx_osv16",519], + "14639233649574991406": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "13940433448128376511": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "10127598593949337541": ["convolution_gpu_bfyx_os_iyx_osv16",1056], + "9660812093766156608": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "4795705973706796563": ["fully_connected_gpu_bf_io_input_spatial",1], + "13764532551476584909": ["convolution_gpu_bfyx_gemm_like",2], + "14908665013877276517": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "10795104632256101599": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "941829593638869991": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "6555440973226014216": ["convolution_gpu_bfyx_gemm_like",2], + "8616686489737649890": ["convolution_gpu_bfyx_os_iyx_osv16",93], + "10377729875228238588": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "5495063314176654751": ["convolution_gpu_bfyx_gemm_like",2], + "15595549493819416194": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "2032438743863827309": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "17303584953298149285": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "8036592210244553232": ["convolution_gpu_bfyx_os_iyx_osv16",1089], + "15550722997950669458": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8007491455800395118": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "7354234812009979811": ["convolution_gpu_bfyx_os_iyx_osv16",90], + "16789135236017252073": ["convolution_gpu_bfyx_gemm_like",2], + "16706244336960642883": ["convolution_gpu_bfyx_gemm_like",2], + "14384062335728088286": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "16202841384048331166": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "9427999492792081454": ["convolution_gpu_bfyx_os_iyx_osv16",128], + "8469338060514215816": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "17832542092610191859": ["convolution_gpu_bfyx_os_iyx_osv16",240], + "13291816522762326802": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8104522072297740079": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10127626701775288565": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "641417817126876622": ["convolution_gpu_bfyx_gemm_like",2], + "13071545223094862275": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "14799589725341253463": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "17977676737774695825": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "12906669887096343446": ["convolution_gpu_bfyx_gemm_like",2], + "17966517080605659454": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "13663893159182636270": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "11086699387784339943": ["convolution_gpu_bfyx_os_iyx_osv16",495], + "2618108630886857741": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "17796867588410764794": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "18395970344992997862": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "9367157746678824712": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "7575634241190730697": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11446181888102710561": ["convolution_gpu_bfyx_os_iyx_osv16",198], + "6085098225080533278": ["convolution_gpu_bfyx_gemm_like",2], + "4200340674281276565": ["convolution_gpu_bfyx_os_iyx_osv16",224], + "8335501317577461610": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "3991584206721185508": ["fully_connected_gpu_yxfb_ref",2], + "11292995457386147494": ["convolution_gpu_bfyx_os_iyx_osv16",417], + "4131527916449986086": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "7505608160068471520": ["fully_connected_gpu_fb_io_ref",2], + "6148794431848761670": 
["convolution_gpu_bfyx_os_iyx_osv16",1068], + "11571049833132558023": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "52089503050497755": ["convolution_gpu_bfyx_os_iyx_osv16",899], + "12617625046664709483": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "17130630712943165823": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "3653156933813711765": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "11455518069358829249": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "15240660399630429406": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "15531908897773912572": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10771178773821148370": ["convolution_gpu_bfyx_gemm_like",2], + "12279591818557049086": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "5290935680520661218": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "16691293834516280510": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "18157442326218165947": ["convolution_gpu_bfyx_gemm_like",2], + "15379873910046172004": ["convolution_gpu_bfyx_gemm_like",1], + "11345101652477732928": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "16431165572426232677": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "5595802790436774398": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "17267132595546153629": ["convolution_gpu_bfyx_gemm_like",2], + "15887484617041779814": ["convolution_gpu_bfyx_gemm_like",2], + "12052225815821079044": ["fully_connected_gpu_fb_io_ref",1], + "14112695611389738149": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "913496537924971856": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "12831670701606794888": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17778706153204631930": ["convolution_gpu_bfyx_gemm_like",1], + "116291934148608396": ["convolution_gpu_bfyx_os_iyx_osv16",235], + "9447458159095730492": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "1334121138243951086": ["convolution_gpu_bfyx_gemm_like",1], + "13939763360217628282": ["convolution_gpu_bfyx_gemm_like",2], + "16303870101043861053": ["convolution_gpu_bfyx_gemm_like",2], + "16237775310369180101": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "11421235118459218209": ["convolution_gpu_bfyx_gemm_like",1], + "5033753554611312392": ["convolution_gpu_bfyx_os_iyx_osv16",186], + "11269720109905550213": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "7777333052643961206": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "517601465150912854": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "5233164031954315264": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "7303492518741737111": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "4134729533276761488": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "5397783260083330774": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "5222741986856655072": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "4186140878816408491": ["convolution_gpu_bfyx_os_iyx_osv16",125], + "9573589861499897842": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "10987953316324712538": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "5766507688771440170": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "1594612401422787491": ["convolution_gpu_bfyx_gemm_like",2], + "8623022306922454565": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "3237680963342495368": ["convolution_gpu_bfyx_gemm_like",1], + "2446435710311724460": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "6670327979947471550": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "15561518067918160695": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "14711697456265712456": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "1852269248476496933": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + 
"16001665772103476029": ["convolution_gpu_bfyx_gemm_like",0], + "8757900457181374694": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "6902644989079870993": ["convolution_gpu_bfyx_gemm_like",1], + "17758354062670710364": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "17464785726466943638": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "10754321688472707825": ["convolution_gpu_bfyx_gemm_like",2], + "13993045680928507594": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "12415368596357091523": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "14749947225382670869": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "5335250793358473555": ["convolution_gpu_bfyx_gemm_like",1], + "3037042229494600258": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "16021335552443492452": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "1469048759583678106": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8549811622247170014": ["fully_connected_gpu_fb_io_ref",2], + "9816834679089152140": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "425744529089575241": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "2054100643811117871": ["convolution_gpu_bfyx_gemm_like",2], + "12700957546822808929": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "18020588962875998441": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "6343396486660315308": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "4272417312859966238": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6531171505861182429": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "3714179297375678368": ["convolution_gpu_bfyx_os_iyx_osv16",319], + "89439319782574517": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "498221230041656321": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "6407471972820516685": ["fully_connected_gpu_fb_oi_ref",0], + "17869928048344193660": ["fully_connected_gpu_yxfb_ref",2], + "6439778526899109398": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2881475011209167644": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "16934386540875904239": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "8129414331584785189": ["convolution_gpu_bfyx_gemm_like",1], + "6996376303337512293": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "3244402155461139559": ["convolution_gpu_bfyx_gemm_like",1], + "17602686382249457351": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "15374625876485618845": ["convolution_gpu_bfyx_gemm_like",2], + "13083412418930786217": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15262493122847269333": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "3291900073868076610": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "15993651594402422200": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "4265991006340418914": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "6080989915764831447": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2649948006897488504": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "9640773327221702885": ["convolution_gpu_bfyx_os_iyx_osv16",834], + "3557182643072772598": ["convolution_gpu_bfyx_gemm_like",2], + "6962268765187856246": ["convolution_gpu_bfyx_gemm_like",2], + "18402875771862490280": ["convolution_gpu_bfyx_os_iyx_osv16",678], + "6057433908801727873": ["convolution_gpu_bfyx_gemm_like",2], + "11828522357351010810": ["convolution_gpu_bfyx_os_iyx_osv16",45], + "15245792492785141641": ["convolution_gpu_bfyx_gemm_like",2], + "2668985670745598382": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "14046114605615338907": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "16642535448111764945": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "17366007551797367227": 
["convolution_gpu_bfyx_gemm_like",2], + "2470579932413307757": ["convolution_gpu_bfyx_gemm_like",1], + "13480393611172760874": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "13414375996946350733": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "10118395047539851751": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "17399103575103078835": ["convolution_gpu_bfyx_os_iyx_osv16",1089], + "6642767323474835034": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "11718418772370938734": ["convolution_gpu_bfyx_os_iyx_osv16",843], + "11461581290174106570": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "11210371874006224582": ["convolution_gpu_bfyx_os_iyx_osv16",299], + "10093371683053539916": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "15392077168521832549": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "15213473731205734586": ["convolution_gpu_bfyx_os_iyx_osv16",892], + "929378940515745198": ["convolution_gpu_bfyx_os_iyx_osv16",41], + "16306284020664131647": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "9140953654075340568": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "659150305191479097": ["convolution_gpu_bfyx_os_iyx_osv16",902], + "10186942318345695432": ["convolution_gpu_bfyx_os_iyx_osv16",648], + "6062246008880097669": ["fully_connected_gpu_bf_io_input_spatial",0], + "11430400968543668873": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "15456771485750114116": ["convolution_gpu_bfyx_gemm_like",2], + "5011190083565902614": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "4479117540570599742": ["convolution_gpu_bfyx_gemm_like",2], + "3768977479127609228": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "9105949910901552052": ["convolution_gpu_bfyx_gemm_like",1], + "16195252193236429176": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "13898284586432291433": ["convolution_gpu_bfyx_gemm_like",1], + "10726830507311062380": ["fully_connected_gpu_fb_io_ref",1], + "6724516766412732606": ["convolution_gpu_bfyx_direct_10_12_16",0], + "16958661630307271135": ["convolution_gpu_bfyx_gemm_like",1], + "1187622888238643867": ["convolution_gpu_bfyx_gemm_like",2], + "17796784393519192261": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "14749290801006453098": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "12963601040302529291": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "3644282167178264526": ["convolution_gpu_bfyx_gemm_like",2], + "11443268857010762276": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "1781619247831135285": ["convolution_gpu_bfyx_os_iyx_osv16",305], + "4424258528650299664": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "1996860183441418841": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "3291180926381314705": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "2662628817605495834": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "8641167903508739082": ["convolution_gpu_bfyx_os_iyx_osv16",618], + "15247278167909654073": ["convolution_gpu_bfyx_os_iyx_osv16",509], + "568023964685613279": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "17212292336626940406": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "3202034075645193740": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "16355518852513270001": ["convolution_gpu_bfyx_gemm_like",2], + "9172445047535982729": ["convolution_gpu_bfyx_gemm_like",2], + "17257466221539644081": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16511261203374835334": ["convolution_gpu_bfyx_gemm_like",2], + "13379165253894817165": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "17845905249343189063": ["convolution_gpu_bfyx_gemm_like",2], + "1676419079398771261": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + 
"3755253206085028904": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "11696708134796103802": ["convolution_gpu_bfyx_gemm_like",1], + "9756049510998074315": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "13182965457868586949": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "1474719104479956715": ["convolution_gpu_bfyx_gemm_like",2], + "9464448984918455020": ["fully_connected_gpu_fb_io_ref",0], + "10344489318472060767": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "8107597524360102037": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "16349083818768061549": ["convolution_gpu_bfyx_gemm_like",2], + "3861084063403560668": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "5782934278345953016": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "13787436604877398090": ["convolution_gpu_bfyx_os_iyx_osv16",1036], + "6534932244936310237": ["convolution_gpu_bfyx_gemm_like",2], + "5254115874873721374": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "10169992769527680821": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "8320522112821700316": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "9399994156762372761": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "14980327142253281498": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "10995849055789490935": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "2430404993947067949": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "1100681675092122613": ["convolution_gpu_bfyx_os_iyx_osv16",456], + "17829148383265978140": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "956022649859563080": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "13610246822402943068": ["convolution_gpu_bfyx_gemm_like",2], + "9559533345689069514": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "3429844423226609965": ["convolution_gpu_bfyx_gemm_like",2], + "7601006550805536675": ["convolution_gpu_bfyx_os_iyx_osv16",299], + "1889171157980977747": ["convolution_gpu_bfyx_gemm_like",2], + "6493509887452943215": ["convolution_gpu_bfyx_gemm_like",1], + "8075180350084516696": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5088898934670078153": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "61390148213644186": ["convolution_gpu_bfyx_gemm_like",1], + "1183774022668948480": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "2294026590516781945": ["convolution_gpu_bfyx_os_iyx_osv16",943], + "7969848911698660033": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "12494969618927201911": ["fully_connected_gpu_fb_oi_ref",1], + "2740834366358352617": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "12156683064218448087": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15581678976147496970": ["convolution_gpu_bfyx_gemm_like",0], + "4332002982390788477": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "7844764086278702374": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "7650874310714729923": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "8484380699802533068": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "10900962238463588974": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "7394217382008802567": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "13443130482173929700": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "4307817040832953223": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "15975964562807570772": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2933183897022161826": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "11341287517759485930": ["convolution_gpu_bfyx_gemm_like",2], + "11164600098693999456": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15718782218800307385": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "16504962609450876148": 
["convolution_gpu_bfyx_os_iyx_osv16",743], + "17788367809717898285": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "9269175963143039426": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "4237276338897143680": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "11767263058642131204": ["convolution_gpu_bfyx_gemm_like",1], + "10295330953350618042": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "17251021943762069083": ["convolution_gpu_bfyx_gemm_like",1], + "1249137685908951501": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "14406070210216948643": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "17729546848373991614": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "10205929431600082124": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1824009696938637196": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "2691406689892290663": ["convolution_gpu_bfyx_gemm_like",1], + "9144136375141111897": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "14702670413549232065": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11033758130987285174": ["convolution_gpu_bfyx_gemm_like",2], + "12696412964119109465": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "609926704263171728": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "1312322903335525510": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9241243727411869340": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "7576873892262851401": ["convolution_gpu_bfyx_gemm_like",1], + "14936045362442728963": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "16628679902327485435": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "13112861120841066430": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "11810221946429451169": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",0], + "9974986004361966590": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13775683667344570223": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "15696910741835640150": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "15170578644807800052": ["convolution_gpu_bfyx_gemm_like",2], + "868827643007921561": ["convolution_gpu_bfyx_gemm_like",2], + "12361848206190267821": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1564774057733793087": ["convolution_gpu_bfyx_os_iyx_osv16",97], + "10354305663463607086": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "9172699707430374863": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "16362139250976572928": ["convolution_gpu_bfyx_os_iyx_osv16",554], + "16322719022997791344": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "3221221905804708596": ["convolution_gpu_bfyx_gemm_like",1], + "16853250891250756537": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "8146559042269976123": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "18009083375897554008": ["convolution_gpu_bfyx_os_iyx_osv16",279], + "16482301217529090205": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "9246213432501129631": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "8733109144496806085": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "190530884420224257": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "3021451990778420603": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "844278648549884313": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "10286228358844791913": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "13201854669827561901": ["convolution_gpu_bfyx_gemm_like",2], + "12184558469694708819": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "2497756607567197523": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "3803179179802002296": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "13248218293365141596": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "41250455178236256": 
["convolution_gpu_bfyx_os_iyx_osv16",711], + "2730604806511016352": ["convolution_gpu_bfyx_gemm_like",2], + "7044087204529042819": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "14001920054473316909": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "10093554313775878065": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "8108939799996498955": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "12503605837910457108": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "1452597292381229708": ["convolution_gpu_winograd_6x3_s1_fused",2], + "32035190068479388": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15971924211584724882": ["convolution_gpu_bfyx_os_iyx_osv16",381], + "16763335832616216769": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "7196214243890296121": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "7102173884859438914": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "16896434896068867157": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "8860815977851486767": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "17608288706234084973": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "15642549417953837059": ["convolution_gpu_bfyx_gemm_like",2], + "8484176982872847423": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6643161848623134458": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "2794704364476462562": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "875142032423622622": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "16995873636564597028": ["convolution_gpu_bfyx_os_iyx_osv16",853], + "8108843303778211282": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "4593862318851730430": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14463983770858421738": ["convolution_gpu_bfyx_gemm_like",2], + "8291770994531919371": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "6538694526777067399": ["convolution_gpu_bfyx_gemm_like",1], + "14484890926084856480": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "12894625941923144893": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "15963358868537664345": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "796900095669815456": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "949330876419581703": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "505027953105355818": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5573639264204952559": ["convolution_gpu_bfyx_os_iyx_osv16",501], + "1106762955109168526": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "16632447105476661928": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "3170274732463232729": ["convolution_gpu_bfyx_gemm_like",1], + "88592091379585141": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "11976258954756052550": ["convolution_gpu_bfyx_os_iyx_osv16",894], + "12159582810513550491": ["convolution_gpu_bfyx_direct_10_12_16",0], + "14514450640485628836": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "6471563320494376693": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "10134708781744282286": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "3006428377575478529": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "6737332058785771073": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "9868561386826862471": ["convolution_gpu_winograd_6x3_s1_fused",2], + "4660214425505918397": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "6877976003072165363": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17516369849823844076": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "6789547098653828902": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3003526572122876385": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7595481705069674721": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "9805748332775912215": 
["convolution_gpu_bfyx_os_iyx_osv16",349], + "16580523689587532278": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "11407554707582995190": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "8358425189419823078": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "17784357412228522825": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "12916369918132790013": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "18356980026934328781": ["convolution_gpu_bfyx_os_iyx_osv16",1037], + "2452226948562393335": ["convolution_gpu_bfyx_os_iyx_osv16",767], + "12819626280531787705": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "10231289519907741812": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4157063588837576075": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "8751967016877067287": ["convolution_gpu_bfyx_os_iyx_osv16",894], + "10289725524396556967": ["convolution_gpu_bfyx_gemm_like",2], + "17443356777503458523": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "13948512795148364852": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "5951936376654416075": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "3436770797199367854": ["convolution_gpu_bfyx_gemm_like",1], + "8479958930889587809": ["fully_connected_gpu_yxfb_ref",0], + "16169024543367503806": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "9323825370872655346": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "13140527131098422428": ["convolution_gpu_bfyx_gemm_like",2], + "5167141379778311462": ["convolution_gpu_bfyx_gemm_like",2], + "13761566845514364807": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "2597453794298356435": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "8028456017016080468": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "259085394007031207": ["convolution_gpu_bfyx_gemm_like",1], + "13959998803881264899": ["convolution_gpu_bfyx_gemm_like",2], + "3686062608868674589": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11462462742322068863": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "9988801796928462423": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "14727155647330710270": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2415478259408761142": ["convolution_gpu_bfyx_os_iyx_osv16",302], + "14602509614865844486": ["convolution_gpu_bfyx_os_iyx_osv16",665], + "9289375071420565548": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "7440546908141206022": ["convolution_gpu_bfyx_gemm_like",2], + "15485011864326008444": ["fully_connected_gpu_fb_io_ref",0], + "8470783908138180217": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "17845195044080380488": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "15459849799278480779": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "17721709435558297965": ["convolution_gpu_bfyx_gemm_like",1], + "14132860735060026066": ["convolution_gpu_bfyx_gemm_like",2], + "15522785615618973614": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6983544541444063131": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "13340998273773542342": ["convolution_gpu_bfyx_gemm_like",2], + "3134642518413656360": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2440366541074371090": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "12341291953192305346": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "4986977887030495943": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "16852690434396099861": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "3526198034974948081": ["convolution_gpu_bfyx_os_iyx_osv16",276], + "16053585286807864356": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "4282661608732125403": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "1882052795393187384": ["convolution_gpu_bfyx_os_iyx_osv16",670], 
+ "3273748387141431306": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "7617773507561261623": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "4623542918584461522": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "13459568779083836506": ["convolution_gpu_bfyx_gemm_like",2], + "13785621878621289403": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1980887257657896260": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "12809199739984715013": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "4886289616235149731": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1898776014554946000": ["convolution_gpu_bfyx_gemm_like",2], + "4770478662275293849": ["convolution_gpu_bfyx_gemm_like",2], + "15117830538655814853": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "17178808153714023980": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "1629280013296592298": ["convolution_gpu_bfyx_gemm_like",2], + "15110359240685619357": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "16370218798911151331": ["convolution_gpu_bfyx_os_iyx_osv16",287], + "13663612869789682704": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "14423094456821270228": ["convolution_gpu_bfyx_gemm_like",2], + "6820284286806022849": ["convolution_gpu_bfyx_gemm_like",2], + "16779678846332091086": ["convolution_gpu_bfyx_os_iyx_osv16",523], + "15989730594386153813": ["convolution_gpu_bfyx_gemm_like",1], + "6095158932103797740": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "11215217005872946038": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "7399775379344444344": ["convolution_gpu_bfyx_os_iyx_osv16",315], + "13381833588713493653": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "3380653500106294036": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "18125732229366977468": ["convolution_gpu_winograd_6x3_s1_fused",2], + "13613399861925108148": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "9981938305144461962": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "4519609440668743423": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "15097371415144491976": ["convolution_gpu_bfyx_os_iyx_osv16",640], + "12338760476079493547": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "13933912937625580405": ["fully_connected_gpu_bf_io_input_spatial",0], + "17126714253919198029": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "2341006744107937832": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "9819596940685093690": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "8881135571874888085": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "4750755523645265967": ["convolution_gpu_bfyx_gemm_like",2], + "3063055767192991776": ["convolution_gpu_bfyx_os_iyx_osv16",1017], + "18178391985193947355": ["convolution_gpu_bfyx_gemm_like",2], + "4161612746310931789": ["convolution_gpu_bfyx_gemm_like",2], + "16509472637458153234": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "11641605357868918146": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "562221645849170027": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "11561790484526369917": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "3658149289395969504": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "8818070832398055086": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5509631031571317557": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "5357531127711906072": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "8994777547915132466": ["convolution_gpu_bfyx_os_iyx_osv16",834], + "2687781952021151359": ["convolution_gpu_bfyx_gemm_like",1], + "18083041911869525296": ["convolution_gpu_bfyx_gemm_like",2], + "9876098429582714576": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12466721526829931923": 
["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "10848407542826653699": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "16808618754363181939": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10436819182310112786": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "7657964685067862984": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13141069720428059461": ["convolution_gpu_bfyx_gemm_like",2], + "15831600396403741571": ["convolution_gpu_bfyx_gemm_like",1], + "1138439260035360722": ["convolution_gpu_bfyx_os_iyx_osv16",547], + "18184154104081850641": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "2338707843044884352": ["convolution_gpu_bfyx_gemm_like",1], + "13850920989756588064": ["convolution_gpu_bfyx_gemm_like",2], + "17176310030469904708": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "9146427497025645310": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "11291881629276762730": ["convolution_gpu_bfyx_gemm_like",1], + "9850711648349010674": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7172604084103519563": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "700717277178942679": ["convolution_gpu_bfyx_gemm_like",1], + "6827316954140278736": ["convolution_gpu_bfyx_os_iyx_osv16",125], + "13054405729329143152": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "8509941319309380587": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "16488426854651696706": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "10432925516327889351": ["convolution_gpu_bfyx_gemm_like",1], + "10600040563032392126": ["convolution_gpu_bfyx_os_iyx_osv16",835], + "11511221956203704038": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "11469881811044037340": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "13839590781642269381": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "7508931961595339477": ["convolution_gpu_bfyx_gemm_like",1], + "10500029207807372735": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "14330281759626724494": ["convolution_gpu_bfyx_gemm_like",2], + "7419216766190700536": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "5585398540591396124": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "17089801601582809764": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "3404911902272307873": ["convolution_gpu_bfyx_gemm_like",2], + "17489420766684604600": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "18196676408993954972": ["convolution_gpu_bfyx_os_iyx_osv16",695], + "10186866999254188246": ["convolution_gpu_bfyx_gemm_like",1], + "4817953977830392054": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "2930702812469156271": ["fully_connected_gpu_fb_io_ref",1], + "16549498607618849252": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "11855777686733253894": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "4936968239673204144": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "11988463489006787939": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "10178951466584845110": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "13326233188936584240": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "2194607895573544953": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "11290558687608213321": ["convolution_gpu_bfyx_gemm_like",2], + "12366546292695084543": ["convolution_gpu_bfyx_os_iyx_osv16",456], + "11267742746905371769": ["convolution_gpu_bfyx_os_iyx_osv16",1045], + "16582080251500644069": ["convolution_gpu_bfyx_gemm_like",2], + "18113235498360281695": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16851949759898002809": ["convolution_gpu_bfyx_os_iyx_osv16",648], + "14233388108948021331": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "12434799432980627966": 
["convolution_gpu_bfyx_os_iyx_osv16",978], + "16192971634546462244": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "4652136280940317116": ["convolution_gpu_bfyx_os_iyx_osv16",740], + "7744644472305197412": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "16733587306017341904": ["convolution_gpu_bfyx_gemm_like",2], + "10089588313551601914": ["convolution_gpu_bfyx_gemm_like",2], + "14397348576352573007": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "11823106525249133834": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "13122637768866153753": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "10110359677546019738": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "4342360467977736802": ["convolution_gpu_bfyx_gemm_like",2], + "2937907409658060025": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "12312291300513951124": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "5989664002046950385": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "2346855978590136528": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "1372939511728986224": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "6491244517639245276": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "17025268985366223779": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7076937538747704750": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "3971456598769336038": ["convolution_gpu_bfyx_gemm_like",2], + "5329218407413679209": ["convolution_gpu_bfyx_gemm_like",2], + "18171940644650760608": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "850343942782057099": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "8121179472578287280": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "11215862132334892351": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "13453226687921450129": ["convolution_gpu_bfyx_gemm_like",2], + "1056009037551688122": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "2816353973187452604": ["convolution_gpu_bfyx_gemm_like",2], + "18273922178875123753": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "6904130543085920483": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "2028273519579688266": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "16266491618150971928": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "6578908625437515675": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "6233455595448276342": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "13184662326021747000": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11185041745377164894": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "4887402175773881313": ["convolution_gpu_bfyx_gemm_like",1], + "3192332625020432602": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "18259656768460999562": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "3086110559166474482": ["convolution_gpu_bfyx_gemm_like",2], + "3234567405788241673": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "814227839929688672": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "7565221050911842393": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9942726414918759892": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7771969115805231266": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "17622515300258231642": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "11806105193035393795": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "17715553891959228879": ["convolution_gpu_bfyx_os_iyx_osv16",477], + "11829442945690098558": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "2103882464623009432": ["convolution_gpu_winograd_6x3_s1_fused",2], + "4488336106517889531": ["convolution_gpu_bfyx_os_iyx_osv16",80], + "13320828013530046693": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "9141802671320572984": 
["convolution_gpu_bfyx_gemm_like",2], + "16170237673140354764": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "9933958860597451711": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "8616175124735896626": ["convolution_gpu_bfyx_gemm_like",2], + "8482147530539941792": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "17269318621094624075": ["convolution_gpu_bfyx_gemm_like",2], + "1529658068204046700": ["convolution_gpu_bfyx_gemm_like",2], + "6370189612027110022": ["convolution_gpu_bfyx_gemm_like",2], + "15317510501392280831": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "5688478347124565305": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "447152944190888653": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7817036102984218692": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "7500192998744460131": ["fully_connected_gpu_bf_io_input_spatial",2], + "14606504543906913119": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "3930526618478171342": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14429081455612806819": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "10455850115486014344": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "6458124573210430792": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "6210051945051792519": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "14025496192869856801": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "15451193085395494344": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "5163641718529821203": ["convolution_gpu_bfyx_gemm_like",1], + "7104756264011682902": ["convolution_gpu_bfyx_gemm_like",1], + "11374410888638324212": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "16661248688859994717": ["convolution_gpu_bfyx_gemm_like",2], + "3518981281605476136": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "628191607060767879": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "10413043556440687328": ["convolution_gpu_bfyx_gemm_like",2], + "911927861489659568": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6561864486643226753": ["fully_connected_gpu_fb_io_ref",1], + "17494823614269622175": ["convolution_gpu_bfyx_os_iyx_osv16",1031], + "8071652278387309042": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "4805958162773855302": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "16666383605403885590": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "1410512481031922864": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "7033442247935655919": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5120466856097219243": ["convolution_gpu_bfyx_gemm_like",1], + "12141880589558027223": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "2328698995040390396": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15410074937424854348": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "4195847890935259046": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "3923715765392385764": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2348721939771018658": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "8500612796090968552": ["convolution_gpu_bfyx_gemm_like",1], + "13695012630130671371": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "5475537064464968733": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "6914536960012332706": ["convolution_gpu_bfyx_gemm_like",0], + "3242468066266096173": ["fully_connected_gpu_fb_oi_ref",2], + "8817624284607822971": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "8453402620168400406": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "12087141795291232248": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10416622008071151225": ["convolution_gpu_bfyx_os_iyx_osv16",546], + "5934211962000091180": ["convolution_gpu_bfyx_os_iyx_osv16",519], + 
"10178462061836778766": ["convolution_gpu_bfyx_os_iyx_osv16",1096], + "9810703513111623136": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "8870736106637803783": ["convolution_gpu_bfyx_os_iyx_osv16",43], + "6040286126398028933": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9628702542543622433": ["convolution_gpu_bfyx_os_iyx_osv16",567], + "14845194064376163156": ["convolution_gpu_bfyx_gemm_like",1], + "8296551195150971668": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3436576388124386308": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "16711955423531846725": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "17152100243867367458": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "2281119269283845320": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "4614042998549572181": ["convolution_gpu_bfyx_gemm_like",2], + "7807168142899312025": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "12150109996250730485": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "13553045975561262752": ["convolution_gpu_bfyx_gemm_like",2], + "435888248913413834": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "16683909937519981313": ["convolution_gpu_bfyx_os_iyx_osv16",459], + "14174888981602932979": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "16352331970945217438": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "7441199361135503715": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6988674007771237080": ["convolution_gpu_bfyx_gemm_like",2], + "1091511312740979158": ["convolution_gpu_bfyx_gemm_like",2], + "9134203155715293387": ["convolution_gpu_bfyx_gemm_like",2], + "17089332981370803321": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "16434635675895599016": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "5186963188234940985": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2683507674615735878": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "16951050796024922417": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "2842103889477438816": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13395074742046717601": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "14284223645235602230": ["fully_connected_gpu_fb_io_ref",2], + "10861525139715322534": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "13248567106128518549": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "11739629316219263056": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14797994820826922836": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "1743572310914695413": ["convolution_gpu_bfyx_gemm_like",2], + "14363654136811880073": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9579316322704307175": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10131754493574658838": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "13273455049742872922": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "15085980226773631346": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "15325810055037682679": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17542414935564676110": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "13447226378200557777": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "3075961585045028347": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "11851216776536423298": ["convolution_gpu_bfyx_gemm_like",2], + "12251901229904154127": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "12716923819769400487": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "3438852523146175580": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3638987901025418036": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "11856266545854830143": ["convolution_gpu_bfyx_gemm_like",2], + "10445587307296180364": ["convolution_gpu_bfyx_direct_10_12_16",2], + 
"6692408578556372014": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16053383948025511837": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "7703363154993904399": ["convolution_gpu_bfyx_gemm_like",2], + "5632101951796129342": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "3666268650646000870": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "10551742525038893508": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "2065752819810364738": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "153117141968471446": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "6313048719388952335": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "5981885264666023260": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11462394098346770463": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "1698847067049584068": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "4046513842327685203": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "16181974394948732584": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "2431427502927207912": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "15119063070382146368": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "5023609284081684300": ["convolution_gpu_bfyx_gemm_like",2], + "5797545757863100286": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "3853598651573655548": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "16036386660666696362": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "17332395907621747512": ["convolution_gpu_bfyx_os_iyx_osv16",658], + "5524218746051008792": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "6981537186704688907": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "7162701010394257343": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "13383524675055536682": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4099859307693687554": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "7105622384646913935": ["convolution_gpu_bfyx_gemm_like",2], + "1908733355560815063": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "12278842522836720245": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "7210729932836957540": ["convolution_gpu_bfyx_gemm_like",1], + "2239948568632407776": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8337820318779061494": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "5312140481706133684": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "582386337144876096": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "4569416043426963318": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "17921616427936768657": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "2354885756165078342": ["convolution_gpu_bfyx_os_iyx_osv16",834], + "11915835787294686201": ["fully_connected_gpu_fb_io_ref",2], + "11588201241814594642": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "17171513366028235799": ["convolution_gpu_bfyx_gemm_like",2], + "1313038182637545943": ["convolution_gpu_bfyx_gemm_like",2], + "14066660382918185188": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "17810119189318801197": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "884923290083082187": ["convolution_gpu_bfyx_gemm_like",1], + "2786925522916317149": ["convolution_gpu_bfyx_os_iyx_osv16",417], + "10701231567226563098": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "11260588538207111217": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "6048964584602891448": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "3256940792095638732": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5041111302824362529": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "10156210866362845661": ["convolution_gpu_bfyx_os_iyx_osv16",300], + "16482763280295827563": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "13661225837036677371": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "7351733901977025859": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "11569367085498045793": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "12324580272733221544": ["convolution_gpu_bfyx_gemm_like",2], + "10885831773581103653": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "14897935118679731283": ["convolution_gpu_bfyx_gemm_like",2], + "6413565827738894970": ["convolution_gpu_bfyx_gemm_like",2], + "17221173795372066030": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "18116824232149703772": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "10472893418729915556": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13090596133852586482": ["fully_connected_gpu_fb_io_ref",2], + "10274587614581350261": ["convolution_gpu_bfyx_gemm_like",2], + "10831204282620894983": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "10394041365384258612": ["convolution_gpu_bfyx_gemm_like",1], + "16843976559933040107": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "1051506168926530904": ["fully_connected_gpu_bf_io_input_spatial",0], + "11728824117049687850": ["convolution_gpu_bfyx_gemm_like",1], + "346832567535597247": ["convolution_gpu_bfyx_os_iyx_osv16",515], + "17934338042329576850": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14046990030104971367": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "15715522462313302642": ["convolution_gpu_bfyx_os_iyx_osv16",344], + "6569793510829850291": ["convolution_gpu_bfyx_gemm_like",2], + "5115134711994944288": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "15711618559677233865": ["convolution_gpu_bfyx_gemm_like",2], + "15136557970717196814": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6603817696964851209": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "9104236539185546468": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "7247414730479113619": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "1314612539156304342": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "5368419079251107469": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "5622089373755094139": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "16723949803487501587": ["convolution_gpu_bfyx_gemm_like",1], + "15640202505592598653": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "17258278942367320412": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "11872894645888259277": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6889498170947481097": ["convolution_gpu_bfyx_os_iyx_osv16",517], + "9667762333290150436": ["convolution_gpu_bfyx_gemm_like",2], + "12797434473085560369": ["convolution_gpu_bfyx_gemm_like",1], + "10025839973092358719": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "18086782289842715645": ["convolution_gpu_bfyx_gemm_like",2], + "10880656082867082647": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "14108091242461324109": ["convolution_gpu_bfyx_os_iyx_osv16",1055], + "12478041902013146137": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "5375957124102705020": ["convolution_gpu_bfyx_gemm_like",2], + "5122639094068865656": ["convolution_gpu_bfyx_gemm_like",2], + "3741411131962514208": ["convolution_gpu_bfyx_gemm_like",0], + "5504757952698692953": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "8376077531098664520": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "12515465135362865565": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "17221958812979739319": ["convolution_gpu_bfyx_gemm_like",2], + "10492056481694320580": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "11075875009517060583": ["convolution_gpu_bfyx_gemm_like",1], + "13973179950424276578": ["convolution_gpu_bfyx_os_iyx_osv16",48], + "4209610989252810404": 
["convolution_gpu_bfyx_os_iyx_osv16",586], + "10328182165125764988": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "1604661321386793876": ["convolution_gpu_winograd_6x3_s1_fused",1], + "7883469783245625654": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "8460847842045253466": ["convolution_gpu_bfyx_os_iyx_osv16",388], + "12814676907278614920": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "15702382940521972117": ["convolution_gpu_bfyx_os_iyx_osv16",1001], + "15402502830461368746": ["convolution_gpu_bfyx_gemm_like",2], + "10023279637210292010": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "1104098779103065492": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "6423120553520000795": ["convolution_gpu_bfyx_os_iyx_osv16",475], + "15759530339367380982": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "10392297152843428925": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "374553246608550876": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "642256034968512602": ["convolution_gpu_bfyx_os_iyx_osv16",687], + "1701609125136907870": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "761169277744593430": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "13753473508578037346": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "7059809764116926828": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "15291457825664605611": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "1817929353109443200": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10182490653383265979": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "2660620513253264815": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "13116746433291181712": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "8017024160145338317": ["convolution_gpu_bfyx_os_iyx_osv16",1045], + "2407509127927738079": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "12345000525470836335": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "7107313154723472157": ["convolution_gpu_bfyx_gemm_like",1], + "17116130466596594359": ["convolution_gpu_bfyx_os_iyx_osv16",270], + "6096189754478965440": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "9100044555742394133": ["convolution_gpu_bfyx_os_iyx_osv16",549], + "389822325870173489": ["convolution_gpu_bfyx_gemm_like",2], + "12608653044712562811": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "17827762625385383658": ["convolution_gpu_bfyx_gemm_like",1], + "1103228955716492167": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6744583842563891546": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "4830121683809417143": ["convolution_gpu_bfyx_os_iyx_osv16",939], + "14400339764883906933": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "593712935037568960": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "6100453836448514115": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "460780635491857522": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "8054185159612481260": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "10468108569766167175": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "10290107543739998181": ["fully_connected_gpu_bf_io_input_spatial",2], + "12881836161162762524": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "12098146032672599222": ["convolution_gpu_bfyx_os_iyx_osv16",198], + "10533367671706069274": ["convolution_gpu_bfyx_gemm_like",2], + "2616828683870391718": ["convolution_gpu_bfyx_gemm_like",2], + "18215260982292770252": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "17915846724151945664": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "15308667224953963012": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "7678730081652720605": ["convolution_gpu_bfyx_os_iyx_osv16",121], + "7536267099632318821": 
["convolution_gpu_bfyx_os_iyx_osv16",961], + "3649980610274946512": ["fully_connected_gpu_fb_io_ref",0], + "14642845734482478360": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "17550795608527501180": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "3883845471211207871": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "8090497202997192142": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15300588247579013966": ["convolution_gpu_bfyx_os_iyx_osv16",948], + "12940491379482292807": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "761183183078910587": ["convolution_gpu_bfyx_os_iyx_osv16",1016], + "1451466106918423837": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "3180320769716158201": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "1154228007901031779": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "10783981060353445280": ["convolution_gpu_bfyx_os_iyx_osv16",52], + "9853089109234784643": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "9151597254187513724": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "14472187692485966933": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "592364460086746355": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14959566236432790882": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "14560435854055940143": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "2534408579674556441": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1697248235682953135": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "970596838400633278": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "16474284418841532356": ["convolution_gpu_bfyx_gemm_like",2], + "10131771849139346986": ["fully_connected_gpu_fb_io_ref",1], + "18432787283148809023": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "12412224630798427948": ["convolution_gpu_bfyx_os_iyx_osv16",620], + "9378419102254633989": ["convolution_gpu_bfyx_os_iyx_osv16",835], + "17543094050285028967": ["convolution_gpu_bfyx_os_iyx_osv16",348], + "15095146351334328804": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5211191663202250117": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "6763373100985812924": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "9126242742012768166": ["convolution_gpu_bfyx_gemm_like",2], + "9501165931845934084": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8200094670006738584": ["convolution_gpu_bfyx_os_iyx_osv16",695], + "13091799752362714688": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "3001615302961701154": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "10136369729388564720": ["convolution_gpu_bfyx_gemm_like",2], + "7654445730724243959": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4142978475842207311": ["convolution_gpu_bfyx_gemm_like",2], + "15936513690378208182": ["convolution_gpu_bfyx_gemm_like",2], + "2510919738337557939": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "14157776769026046014": ["fully_connected_gpu_fb_io_ref",1], + "2888587871912905870": ["convolution_gpu_bfyx_os_iyx_osv16",45], + "15107740124884150777": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13852065717057446998": ["convolution_gpu_bfyx_gemm_like",2], + "15101680837342453931": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "8848042913869254179": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "5419041493176804960": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "7162575953766465459": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "2538377242539785672": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "11047625525388102466": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "194324011642969540": ["convolution_gpu_bfyx_gemm_like",1], + "15160738482264643601": 
["convolution_gpu_bfyx_os_iyx_osv16",704], + "16159309494101203811": ["convolution_gpu_bfyx_gemm_like",2], + "2299440282267661763": ["convolution_gpu_bfyx_gemm_like",2], + "2451603338483395600": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "17044070592136685322": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10994887986667360638": ["convolution_gpu_bfyx_os_iyx_osv16",93], + "7450915928720828406": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "2004691166378443418": ["convolution_gpu_bfyx_gemm_like",2], + "2595273700611743351": ["convolution_gpu_bfyx_gemm_like",2], + "12175796957622122377": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "11190259822407791373": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "5116633474932727191": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13821388909343378606": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "8997120235555587461": ["convolution_gpu_bfyx_gemm_like",2], + "2355214244972870639": ["convolution_gpu_bfyx_os_iyx_osv16",514], + "5673972310424776040": ["convolution_gpu_bfyx_gemm_like",2], + "9182897385081081193": ["convolution_gpu_winograd_6x3_s1_fused",1], + "6681818065741882453": ["convolution_gpu_bfyx_gemm_like",2], + "10267260789603562117": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "16797936364395702812": ["convolution_gpu_bfyx_gemm_like",2], + "11324851661119942609": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "5957444113623953990": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "14566544143931267758": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "7391591731082133842": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15592321818359223008": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "7881579844586294503": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "15997754881872769378": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "15688260390755491480": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "444533022549215983": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "5629373398445592781": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "12930435393720466720": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5091558853871982858": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "9300767936311837876": ["convolution_gpu_bfyx_gemm_like",2], + "11756881293845417212": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "6323504675912413145": ["convolution_gpu_bfyx_gemm_like",2], + "6364288463529107554": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "3089303702413279458": ["convolution_gpu_bfyx_gemm_like",1], + "13418701036204748812": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "8873424072104563382": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "5085190482265319015": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "10384537928514123040": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "18424912460022156378": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "15650839696475698676": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "8075453526439606224": ["convolution_gpu_bfyx_gemm_like",2], + "9988347141056982336": ["convolution_gpu_bfyx_gemm_like",2], + "18146068930296529306": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3809343305878998617": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "7304346312452588844": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "1096671695414716274": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "15670841106242481912": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "5516343490635816913": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5552958912776013600": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "6717268005860715462": ["convolution_gpu_bfyx_gemm_like",1], + 
"15154934905173371714": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "6919081291036849635": ["convolution_gpu_bfyx_gemm_like",0], + "13599555566632152241": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5584432943673435454": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "5077214229434392730": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7431069335622070596": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14088382963493477342": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "2105482100745329286": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "4108579755980014185": ["convolution_gpu_bfyx_direct_10_12_16",0], + "7009735776703529573": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "12360796145248339074": ["convolution_gpu_bfyx_os_iyx_osv16",718], + "11318404975804457466": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15596913527233792996": ["convolution_gpu_bfyx_gemm_like",2], + "4474697990228400564": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15163327502374403643": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "11674630830833831209": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "12089505956882731481": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "16811402686462277562": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5970516037710024187": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "6377828127090689238": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "4213330047036138895": ["convolution_gpu_bfyx_gemm_like",2], + "15908673392788376468": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "11931909191490706784": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "12706645084970410965": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12631385844456089132": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10205576142280465189": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "4347816192417741558": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "14776308019009874809": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "4557272439632791722": ["convolution_gpu_bfyx_gemm_like",2], + "8939683514448064461": ["convolution_gpu_bfyx_os_iyx_osv16",148], + "8307147375351882939": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "10997029728191881587": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15426960908024585800": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "14043770215999952932": ["convolution_gpu_bfyx_gemm_like",2], + "17765244777397448823": ["convolution_gpu_bfyx_gemm_like",2], + "13906695412889750672": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "12397493112115605421": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "2043990557089419633": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "11047327014045909812": ["convolution_gpu_bfyx_gemm_like",2], + "360872770877634346": ["convolution_gpu_bfyx_gemm_like",2], + "16168891366331544806": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11825205449232126827": ["convolution_gpu_bfyx_gemm_like",2], + "6680219899975628258": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "11996551650886043090": ["convolution_gpu_bfyx_os_iyx_osv16",271], + "12691733869577147545": ["convolution_gpu_bfyx_gemm_like",2], + "761984225415608773": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "14545322358931928911": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "3286476039871096924": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "3167336012388169649": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "12878858391355259417": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2460361970017706505": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "11623764266322172086": ["convolution_gpu_bfyx_os_iyx_osv16",582], + 
"9852052796465340830": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "5559417017584278927": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "10157866834809927320": ["convolution_gpu_bfyx_os_iyx_osv16",1042], + "3138374672801504481": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "15470979879166640563": ["convolution_gpu_bfyx_os_iyx_osv16",1022], + "5240181393417899912": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "8500148569566077929": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "4030835922805418609": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "12380856644683171627": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13121196588092064246": ["convolution_gpu_bfyx_gemm_like",2], + "16816222375242496370": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16626226341188424071": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "13132804928635689780": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "1338534626640014074": ["convolution_gpu_bfyx_gemm_like",2], + "16112835627818488034": ["convolution_gpu_bfyx_gemm_like",2], + "12013883366396753346": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "9777638299795801012": ["convolution_gpu_bfyx_gemm_like",2], + "1652781065871883392": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "5481293245081340756": ["convolution_gpu_bfyx_gemm_like",1], + "2888315406857606108": ["convolution_gpu_bfyx_gemm_like",2], + "13951717514084457087": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "2415883693527779570": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "7953340333870774815": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10971971008143485353": ["convolution_gpu_bfyx_os_iyx_osv16",1052], + "5842284971563375197": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "18076018773227225156": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "832830374368320801": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "2724007091383127418": ["convolution_gpu_bfyx_os_iyx_osv16",1025], + "5115661026367632863": ["convolution_gpu_bfyx_os_iyx_osv16",765], + "9632178829095307219": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "1429370139030130929": ["convolution_gpu_bfyx_gemm_like",1], + "12478496773222604204": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "6634330132674952638": ["convolution_gpu_bfyx_os_iyx_osv16",179], + "8467771025017377254": ["convolution_gpu_bfyx_gemm_like",2], + "685140170576742460": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "14704939880642470064": ["convolution_gpu_bfyx_gemm_like",2], + "17264554677210911187": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8549465639583777774": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "2915165824085219545": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "1155389358857780776": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "6181272224000872375": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "1350402181555441235": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "9552312946391901745": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17995371099806008878": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13300022131572486202": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "5294364781478821403": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "16985565646738638215": ["convolution_gpu_bfyx_gemm_like",2], + "14545094765855515974": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "157805434489791310": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "16896833230469488924": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "11860902750907076009": ["convolution_gpu_bfyx_gemm_like",1], + "3790881125495367946": ["convolution_gpu_bfyx_os_iyx_osv16",895], + 
"2072246877651869428": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "16125965158927145599": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "6748628505489041229": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "3119235799568225015": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "5094600092408024387": ["convolution_gpu_bfyx_os_iyx_osv16",939], + "14057348639391787117": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "13973028408397200796": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "3504421925108785018": ["convolution_gpu_bfyx_gemm_like",1], + "11284755586130392759": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "14810839157236175179": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "10295400862890021635": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "3830787224073518842": ["convolution_gpu_bfyx_os_iyx_osv16",509], + "6586833064055001967": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "5191016422297403500": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15160192060731796225": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "10858234923346500323": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8913451832923806760": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7264756313770306662": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "11754316727756881612": ["convolution_gpu_bfyx_os_iyx_osv16",475], + "381149736509958403": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "17723621158215826108": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "11878200328276635385": ["convolution_gpu_bfyx_gemm_like",2], + "8253823502854784432": ["convolution_gpu_bfyx_gemm_like",2], + "12270548292992377827": ["convolution_gpu_bfyx_gemm_like",2], + "17881905640473324965": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "5018845267269043034": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "2183193161596798350": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "5763440554939527411": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "2737738314051715813": ["convolution_gpu_bfyx_gemm_like",2], + "15434536162164591656": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "14743760934522111296": ["convolution_gpu_bfyx_gemm_like",1], + "578940134826172063": ["convolution_gpu_bfyx_gemm_like",2], + "12297371032753209816": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "10842828403850880541": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "2198278382394812839": ["convolution_gpu_bfyx_os_iyx_osv16",1043], + "10754450245035836188": ["convolution_gpu_bfyx_gemm_like",2], + "6585223640997887253": ["convolution_gpu_bfyx_gemm_like",2], + "1226681724476075216": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15012744672096562609": ["convolution_gpu_bfyx_gemm_like",1], + "12024416333474523686": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "14366861063858001106": ["convolution_gpu_bfyx_gemm_like",2], + "14872992823083730615": ["convolution_gpu_bfyx_gemm_like",1], + "4104803308438043557": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8557939065994799094": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "3725060015826635697": ["convolution_gpu_bfyx_os_iyx_osv16",737], + "941626985322260281": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "10046663998164493552": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "6982733543386888622": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "8676627474831455650": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "13312514874803986753": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "15342520770460205985": ["convolution_gpu_bfyx_gemm_like",2], + "12061391584831995030": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "18420783889227814721": 
["convolution_gpu_bfyx_gemm_like",1], + "14104238386345631681": ["convolution_gpu_winograd_6x3_s1_fused",1], + "8751016391945753900": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "5175845410753897614": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "4318632837402329958": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "9048522050692986204": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12608839247035566137": ["convolution_gpu_bfyx_gemm_like",2], + "1081969835308672753": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "9340606088243696490": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "8143125165478395106": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4265693151382066296": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "10727592780669452048": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "948917645960296825": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "15733030371524967129": ["convolution_gpu_bfyx_direct_10_12_16",1], + "18325123280144403295": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "7430073011895298582": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "4642234334824303290": ["convolution_gpu_bfyx_os_iyx_osv16",172], + "3006979228759768702": ["convolution_gpu_bfyx_gemm_like",2], + "9899897639161550704": ["convolution_gpu_bfyx_os_iyx_osv16",834], + "15516674573659704770": ["convolution_gpu_bfyx_os_iyx_osv16",995], + "17675227620234837075": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8774613863662947205": ["convolution_gpu_bfyx_os_iyx_osv16",113], + "411914986559525749": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "13994738382469480124": ["convolution_gpu_bfyx_os_iyx_osv16",720], + "6261584163347634965": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "8648848365873958010": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "16442107352245114876": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "17358462939783262207": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10117784802089387496": ["convolution_gpu_bfyx_gemm_like",2], + "1021364163511049664": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15576932271488848457": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "16234606052818596502": ["convolution_gpu_bfyx_os_iyx_osv16",468], + "13025323039227543550": ["convolution_gpu_bfyx_os_iyx_osv16",575], + "8901432555239515645": ["convolution_gpu_bfyx_os_iyx_osv16",1063], + "8127190765748950828": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "17489255290900178723": ["convolution_gpu_bfyx_gemm_like",2], + "6819846227498139601": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "12967849866710811070": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "16484600784717969318": ["convolution_gpu_bfyx_gemm_like",1], + "7904735292914337507": ["convolution_gpu_bfyx_gemm_like",2], + "1346716334208025932": ["convolution_gpu_bfyx_os_iyx_osv16",458], + "16419903786705052849": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16954232936536653281": ["convolution_gpu_bfyx_os_iyx_osv16",85], + "6140789642561898454": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "17285815901490707654": ["convolution_gpu_winograd_6x3_s1_fused",2], + "10396343030099602596": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "6522575549211855712": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "16945184617367657570": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "11306782565667740785": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "16881283637687482989": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "8114928396876060694": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "17598441149165536737": ["convolution_gpu_bfyx_gemm_like",2], + "13189392239349392492": 
["convolution_gpu_bfyx_os_iyx_osv16",993], + "3524531620118359828": ["convolution_gpu_bfyx_os_iyx_osv16",194], + "11413890625163220846": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "8873614802459592665": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "3811325657214369711": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "11798081355131440794": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "14763015336626099830": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2808205041095636198": ["convolution_gpu_bfyx_gemm_like",2], + "11006325877486632502": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "1938086876393565238": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "11135894989941122115": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16818714747882774917": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "1197101651805223230": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "5754301693527535975": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15884763176333003771": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "9940300152880498818": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15809072026388479729": ["convolution_gpu_bfyx_os_iyx_osv16",1055], + "2525260242689556544": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "1462775202780029067": ["convolution_gpu_bfyx_gemm_like",2], + "4408772370026995920": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "4800587664660105589": ["fully_connected_gpu_bf_io_input_spatial",0], + "15096978026328154490": ["convolution_gpu_bfyx_gemm_like",2], + "14945451027055549800": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2411809718611709031": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "6364765994481977132": ["convolution_gpu_bfyx_gemm_like",2], + "7606716827635769887": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "759816003617478606": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "8100051552977329013": ["convolution_gpu_bfyx_gemm_like",2], + "16706121580364790904": ["convolution_gpu_bfyx_gemm_like",2], + "16061176355133391199": ["convolution_gpu_bfyx_os_iyx_osv16",509], + "11670430946096342056": ["convolution_gpu_bfyx_os_iyx_osv16",995], + "16801078648431425148": ["convolution_gpu_bfyx_gemm_like",2], + "16497757978901707098": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "7230623964042057933": ["convolution_gpu_bfyx_gemm_like",2], + "15461879919099373703": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15331103261044247142": ["convolution_gpu_bfyx_os_iyx_osv16",845], + "13671635457689276237": ["convolution_gpu_bfyx_direct_10_12_16",0], + "5157249499936659040": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "8149815705026829258": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5115051214738974496": ["convolution_gpu_bfyx_gemm_like",2], + "846088275031979661": ["convolution_gpu_winograd_6x3_s1_fused",2], + "17208186152576814861": ["convolution_gpu_bfyx_gemm_like",1], + "13502487084912428404": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "75742659105146536": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "12972406304361050136": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "2623687018437195679": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "2451627421465368826": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "17793292063552633023": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "1145700078649932035": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "11932768899981458741": ["convolution_gpu_bfyx_gemm_like",2], + "17188170051014066220": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "13073917160317338455": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "18156747282906367814": 
["convolution_gpu_bfyx_os_iyx_osv16",331], + "18355551625040856531": ["convolution_gpu_bfyx_gemm_like",1], + "9657585348407617520": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "10841786394951910408": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "9462315044265139531": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8897786294680986991": ["convolution_gpu_bfyx_os_iyx_osv16",720], + "9067207838429479363": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "706049518431331645": ["convolution_gpu_bfyx_gemm_like",2], + "11772741918108731396": ["convolution_gpu_bfyx_os_iyx_osv16",620], + "17430593168191424639": ["convolution_gpu_bfyx_gemm_like",2], + "11446745541571732900": ["convolution_gpu_winograd_6x3_s1_fused",2], + "11523864029587161089": ["convolution_gpu_bfyx_gemm_like",0], + "17847109385592002207": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "6181651715051152713": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "2940027113687311893": ["convolution_gpu_bfyx_gemm_like",2], + "4017163133829149027": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "3320392060021963536": ["convolution_gpu_bfyx_os_iyx_osv16",84], + "368147139706197757": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "5381578460674280089": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "4915831715914920982": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "5523778675167321193": ["fully_connected_gpu_fb_io_ref",0], + "2597435203284675496": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2081318772333460627": ["convolution_gpu_bfyx_direct_10_12_16",0], + "3234107167862677811": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "10499265278415026816": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17316626950179740845": ["convolution_gpu_bfyx_os_iyx_osv16",564], + "2669822154816760632": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "9104710269725948935": ["convolution_gpu_bfyx_os_iyx_osv16",562], + "10447427622114317323": ["convolution_gpu_bfyx_os_iyx_osv16",939], + "10263861857115868555": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "14561847633011875566": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "5169676188205309169": ["convolution_gpu_bfyx_gemm_like",2], + "13649894122307008732": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "11192356850081328892": ["convolution_gpu_bfyx_direct_10_12_16",0], + "8739347545059610410": ["convolution_gpu_bfyx_gemm_like",2], + "3170336071769787200": ["convolution_gpu_bfyx_gemm_like",1], + "1938627662342504660": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "13505239531682993049": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "11327678075247102542": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "2585767464396438954": ["convolution_gpu_bfyx_gemm_like",2], + "3377472614945731801": ["convolution_gpu_bfyx_gemm_like",2], + "7838176322738051195": ["convolution_gpu_bfyx_os_iyx_osv16",856], + "7520300815632157008": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "5124291229936820926": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "13384754476437374504": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "17182839667242694171": ["convolution_gpu_bfyx_os_iyx_osv16",264], + "2098357709530580176": ["convolution_gpu_bfyx_gemm_like",2], + "10856527039674342926": ["convolution_gpu_bfyx_os_iyx_osv16",1100], + "16430562172386510259": ["convolution_gpu_bfyx_gemm_like",2], + "5658491804782285708": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2510093757258898215": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "13817553830305981296": ["convolution_gpu_bfyx_gemm_like",1], + "17353894529222574441": 
["convolution_gpu_bfyx_os_iyx_osv16",552], + "12730339458081890990": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "17923035110851963413": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "5627834277145735283": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "8468092944055919238": ["convolution_gpu_bfyx_gemm_like",2], + "2893564501191050837": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "218070270815606832": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "3124997104810767514": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "16565784556269819846": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "16429816273405099453": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "4456004887590847716": ["convolution_gpu_bfyx_gemm_like",1], + "11006013403687198405": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "14431607479949498164": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "7802311886554362782": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "697609699740088622": ["convolution_gpu_bfyx_os_iyx_osv16",299], + "15641049130597645936": ["convolution_gpu_bfyx_gemm_like",2], + "17287487062245049466": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "7575675354187625951": ["convolution_gpu_bfyx_gemm_like",2], + "12675313398314286884": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "18348301285923584995": ["convolution_gpu_bfyx_gemm_like",2], + "11098189888598804624": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "13015379405020620466": ["convolution_gpu_bfyx_gemm_like",2], + "17287404861045114619": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "13045206675957093567": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "2479282650381163888": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16053441017037949431": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "10451904743064959757": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "5902427784683046762": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "1006721963560645335": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17243953172314194409": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "17223169013008075474": ["convolution_gpu_bfyx_gemm_like",2], + "17854578307286932628": ["convolution_gpu_bfyx_gemm_like",2], + "7024495439434892956": ["convolution_gpu_bfyx_os_iyx_osv16",1043], + "14008438372661779490": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "7958595516465029682": ["convolution_gpu_bfyx_gemm_like",2], + "426267761240826769": ["convolution_gpu_bfyx_gemm_like",1], + "241860795253927746": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "5381354625969068789": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7937517564893685647": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "8166976803757624321": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "17163595630291422874": ["convolution_gpu_bfyx_gemm_like",2], + "3502889736327580141": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "17338623890209792485": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "3362829461757548683": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "16865271154583564899": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "17185089684685480638": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "2702566744272427570": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "7712831597869354170": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "12219239604684537521": ["convolution_gpu_bfyx_gemm_like",1], + "9318550032135064372": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "5303170164698694791": ["fully_connected_gpu_bf_io_gemm",2], + "805221045541170643": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "9216608098626790565": 
["convolution_gpu_bfyx_os_iyx_osv16",40], + "6494837659483504443": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "6621483425195088869": ["convolution_gpu_bfyx_os_iyx_osv16",852], + "5458310740719324710": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "12840204133991239572": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "4282198629458668761": ["convolution_gpu_bfyx_gemm_like",2], + "15247381586316467097": ["convolution_gpu_bfyx_gemm_like",2], + "15715775011639091549": ["convolution_gpu_bfyx_os_iyx_osv16",720], + "5065071428884648135": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "7457899998356343871": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "11992625045241269569": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "4296524295134959042": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "18384657372655350144": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "2912984501615111849": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "2103507679502667581": ["convolution_gpu_bfyx_os_iyx_osv16",752], + "1532263118203058517": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "4597954342704466825": ["convolution_gpu_bfyx_gemm_like",1], + "5567670507334783760": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3561366509539440079": ["convolution_gpu_bfyx_gemm_like",1], + "1364905900191854779": ["convolution_gpu_bfyx_gemm_like",0], + "1339402691552717009": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "15602863681196390535": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "15678329601718218341": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "9005351264094503686": ["convolution_gpu_bfyx_gemm_like",2], + "3518605747492037670": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8295126647635181949": ["convolution_gpu_bfyx_gemm_like",2], + "16359282790151128772": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "360764089318153518": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "15834666915651997510": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "11851526665791263153": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "13007534905441600782": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "16323870023648254366": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "17190698921280188790": ["convolution_gpu_bfyx_gemm_like",2], + "9753702905908744910": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "56327004269432885": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "15936869458531244961": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "13702914647519703599": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11459784003592366395": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "2572395498687401679": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "296142385116663420": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7546586420552408243": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "14089893422771228191": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "9004823715680825977": ["convolution_gpu_bfyx_gemm_like",2], + "17001502418583498926": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "6505706083205285176": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "11528310408333718862": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "8971115542951085891": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "14074914477149374595": ["convolution_gpu_bfyx_os_iyx_osv16",652], + "12238674883388043717": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "8131682691875884781": ["convolution_gpu_bfyx_gemm_like",2], + "6302958994152837045": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "11070620435959083971": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "3177362994630209421": 
["convolution_gpu_bfyx_os_iyx_osv16",333], + "9780938731831129283": ["convolution_gpu_bfyx_gemm_like",2], + "3159681096461848644": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "17795554443343871443": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "7549378486471456156": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "545425355231744794": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "3885931890288969926": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "1054159213127890689": ["convolution_gpu_bfyx_gemm_like",2], + "12664952811642406457": ["convolution_gpu_bfyx_os_iyx_osv16",569], + "2080397907007737054": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "18037918102910297531": ["convolution_gpu_bfyx_os_iyx_osv16",356], + "16494403731659808258": ["convolution_gpu_bfyx_os_iyx_osv16",540], + "14716719350966652036": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "14258499419905714808": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "1119928633562250911": ["convolution_gpu_bfyx_os_iyx_osv16",947], + "7713736987017889212": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "14939750655636313880": ["convolution_gpu_bfyx_gemm_like",2], + "1646362346584649954": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "13192885349640152576": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "16025442470600124062": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "14133958262039763609": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "17142080999569154649": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "13394233139064923018": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "9410978119783758141": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "2047041720569246861": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "5454796925594082324": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "2172121470071868949": ["convolution_gpu_bfyx_gemm_like",0], + "17700958439420868719": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "12972634653821069685": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "1682776041247037802": ["convolution_gpu_bfyx_gemm_like",0], + "10624567684389583173": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "17959539037614502049": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1954052357826969119": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "16956980254113285457": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "922541506531537121": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "6447172410311223671": ["convolution_gpu_bfyx_gemm_like",1], + "15052127817178941719": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "11062005455602919062": ["convolution_gpu_bfyx_gemm_like",1], + "6351924049625723579": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "12925156865008155065": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "10556089809203693400": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "5291944277945000781": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "17123463568694499533": ["convolution_gpu_bfyx_gemm_like",2], + "11243840588602365090": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "13990028451169604107": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "423221712829930726": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "311255514995417672": ["convolution_gpu_bfyx_gemm_like",2], + "11868419561534906809": ["convolution_gpu_bfyx_os_iyx_osv16",363], + "3664562521273273709": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "11299275869800089824": ["convolution_gpu_bfyx_gemm_like",1], + "5221320470007950766": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11107930597263802755": ["convolution_gpu_bfyx_gemm_like",2], + "3889519976910355277": 
["fully_connected_gpu_bf_io_input_spatial",2], + "2438374917504708831": ["convolution_gpu_bfyx_gemm_like",2], + "5284132464580556804": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "17309224746854446222": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "8154794217037682993": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18133614045401867449": ["convolution_gpu_bfyx_gemm_like",2], + "1572991986657256775": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "5183231560876991543": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "3897967722980386263": ["convolution_gpu_bfyx_gemm_like",2], + "15088940149962496972": ["convolution_gpu_bfyx_gemm_like",1], + "7083152697366621236": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "17877776363798202236": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "9298483238271063853": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "10625675062556386448": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "6121673167888047110": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12541764833974378504": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "12923653434892323603": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "2567809041240246707": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "2251029128552117936": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4640028527711211109": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "17942120824047252501": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "17912189681971987483": ["convolution_gpu_bfyx_gemm_like",2], + "7573459699367415551": ["convolution_gpu_bfyx_os_iyx_osv16",515], + "4082229510324076196": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5608133987357542077": ["convolution_gpu_bfyx_os_iyx_osv16",539], + "8489998884193999354": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10356951625481502476": ["convolution_gpu_bfyx_gemm_like",2], + "14044495589185586465": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "3796274347773622633": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "41672385434660942": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "8262441556572334783": ["convolution_gpu_bfyx_os_iyx_osv16",692], + "16748743818537812349": ["convolution_gpu_bfyx_gemm_like",2], + "1999979442136861875": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "8234878941966364642": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "7396823789595001064": ["convolution_gpu_bfyx_gemm_like",2], + "649203303142950236": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "953306082374100275": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "2850279308978256234": ["convolution_gpu_bfyx_gemm_like",2], + "10068502639160680134": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "4766071144928072260": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "4455497237293642238": ["convolution_gpu_bfyx_gemm_like",2], + "3621449131285713809": ["convolution_gpu_bfyx_gemm_like",2], + "18044455700176500102": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3623866842874047894": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "17332230377845694888": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "3200047546714112402": ["convolution_gpu_bfyx_os_iyx_osv16",894], + "3325727286860556323": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "8682149821028981871": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "5927467766675317093": ["fully_connected_gpu_bf_io_input_spatial",2], + "17154337492545826355": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "12011982029561277581": ["convolution_gpu_bfyx_os_iyx_osv16",661], + "11147816119060617810": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "6290180140047520382": 
["convolution_gpu_bfyx_gemm_like",1], + "5135353986081664933": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "12138556002719602750": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "15603643151057665338": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "5033665285977853779": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "17433037267999205350": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "7881187047171099732": ["convolution_gpu_bfyx_gemm_like",2], + "9069334144391048686": ["convolution_gpu_bfyx_os_iyx_osv16",543], + "13199524367893035805": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "17279975778400757791": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "13925839061045347955": ["convolution_gpu_bfyx_gemm_like",1], + "12790788016297794214": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "3392693938352572136": ["convolution_gpu_bfyx_gemm_like",2], + "3761770343527826418": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "9092949297095391463": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "10545749454895857995": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "4590784654677429162": ["convolution_gpu_bfyx_gemm_like",2], + "7981376447277193852": ["convolution_gpu_bfyx_os_iyx_osv16",843], + "17829047941256922307": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "10306542963828398049": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "13550435052563656432": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "6964180083696019970": ["convolution_gpu_bfyx_gemm_like",1], + "6496839689453807726": ["convolution_gpu_bfyx_gemm_like",2], + "203639177311791127": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "1005880016096298476": ["convolution_gpu_bfyx_os_iyx_osv16",618], + "10679760989906275129": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "6719302427415173754": ["convolution_gpu_bfyx_os_iyx_osv16",42], + "9750510172185801133": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6905249031401202060": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17575578027095664417": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3036808833459559381": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "3125577147662589592": ["convolution_gpu_bfyx_gemm_like",1], + "10708706979952421150": ["convolution_gpu_bfyx_direct_10_12_16",2], + "30229601562833524": ["convolution_gpu_bfyx_gemm_like",2], + "5756918912614763074": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "5973242004448142604": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "9863856393759813897": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "11052732052072367261": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "9609257787066002999": ["convolution_gpu_bfyx_gemm_like",2], + "8454760437961964894": ["convolution_gpu_bfyx_gemm_like",2], + "1117787205894124896": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "14471867575610362464": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "3816774953143987171": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "8576229375621297412": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "9759380701896779097": ["convolution_gpu_bfyx_gemm_like",2], + "17774902969414949042": ["convolution_gpu_bfyx_gemm_like",2], + "3882955134902442387": ["convolution_gpu_bfyx_os_iyx_osv16",720], + "16683485007140805060": ["fully_connected_gpu_fb_io_ref",1], + "16767564582561837873": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2104529100867065546": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "10158184435144178161": ["convolution_gpu_bfyx_os_iyx_osv16",337], + "11892088065638996743": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "9743806043658380623": ["convolution_gpu_bfyx_os_iyx_osv16",955], + 
"17228877915053571642": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "15284262113150488297": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "3272776991539782834": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "14234117003504517946": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "2968031010495399536": ["convolution_gpu_bfyx_gemm_like",2], + "10555597973766215754": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2220961811760955456": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "8850600236849718709": ["convolution_gpu_bfyx_os_iyx_osv16",1024], + "12643423612381102003": ["convolution_gpu_bfyx_os_iyx_osv16",831], + "8133587696326295326": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "5519835581976587401": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "17040970955448750876": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "4239133538073498792": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "5409924335138540834": ["convolution_gpu_bfyx_os_iyx_osv16",526], + "14132290154676895976": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "12582624102297726596": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "9454954846682513038": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "4865023158176874622": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4950144098898276785": ["convolution_gpu_bfyx_gemm_like",2], + "12427258337646070422": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "16131448347558322280": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "7228139313323996640": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "11254635684957519432": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "9569446666675696513": ["convolution_gpu_bfyx_gemm_like",1], + "7813041847979170166": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17628984504073918701": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "2713481951804190325": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "15489882561480858974": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "3286629188347536485": ["fully_connected_gpu_bf_io_input_spatial",0], + "3939805316470672966": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "3839690227347352846": ["convolution_gpu_bfyx_gemm_like",2], + "17864395500488861670": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5036963191507722541": ["convolution_gpu_bfyx_os_iyx_osv16",675], + "261021128656714770": ["convolution_gpu_bfyx_os_iyx_osv16",675], + "12482312825666761192": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "18219755699990183812": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "9070474871526366492": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "2841943277631596989": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "13268525255152984893": ["convolution_gpu_bfyx_os_iyx_osv16",847], + "5570191330195573102": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "12823842409678756966": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10887835418423052188": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15351724241036614758": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "6262190151863459214": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "69884424286147709": ["convolution_gpu_bfyx_gemm_like",2], + "2521821959816944292": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "17471843449888763571": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2777614869053822003": ["convolution_gpu_bfyx_os_iyx_osv16",377], + "13126786259906598018": ["convolution_gpu_bfyx_os_iyx_osv16",1026], + "13948873105076070952": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "4220826666482500445": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "7025975403069487257": ["convolution_gpu_bfyx_os_iyx_osv16",202], 
+ "8422808932256100230": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "6621371075123542816": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "7862815466573236157": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "14136097914489095982": ["convolution_gpu_bfyx_os_iyx_osv16",516], + "3067806959725855130": ["convolution_gpu_bfyx_os_iyx_osv16",512], + "15360511165237335684": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "17399542571019639128": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "10117376369841171716": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9642229389394495047": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "846177346130290194": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "3621424752591567930": ["convolution_gpu_bfyx_gemm_like",2], + "15336590103518398224": ["convolution_gpu_bfyx_gemm_like",2], + "17243648226968859637": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2930848604606590505": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "3621070130367713395": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "15411603884973340468": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "15016406041863758148": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "13804435767468730732": ["convolution_gpu_bfyx_gemm_like",2], + "2128376438627103433": ["convolution_gpu_bfyx_gemm_like",2], + "10463896120685306944": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "13786357802945430475": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "15808629700189777056": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "17713034180977313726": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "16894871557229780934": ["convolution_gpu_bfyx_os_iyx_osv16",547], + "9404677451270692749": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "7942294816235384071": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "1865187811299838654": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "17049054004246292085": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2133849627845285277": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "4147006350295905486": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "9655550151067451233": ["convolution_gpu_bfyx_gemm_like",2], + "9833242806281729759": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "11239754372812258455": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "9421927854269492263": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "4013707396889204359": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "9019388470685749691": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "12545558125736154584": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "9038991914155436715": ["convolution_gpu_bfyx_gemm_like",1], + "10730856574108806045": ["convolution_gpu_bfyx_os_iyx_osv16",854], + "5461980510262646821": ["convolution_gpu_bfyx_gemm_like",2], + "4679163800360809315": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "12714194906146827658": ["convolution_gpu_bfyx_gemm_like",1], + "3859314295530377028": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "7263339400190408379": ["convolution_gpu_bfyx_gemm_like",2], + "15532419087060587119": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "5536424274663702901": ["convolution_gpu_bfyx_gemm_like",2], + "4086556132337751931": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "2746052215199129520": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "17716065235878633691": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "8594644182487917002": ["convolution_gpu_winograd_6x3_s1_fused",2], + "5685381761573686628": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8258382025812748961": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "14707855908416908375": 
["convolution_gpu_bfyx_os_iyx_osv16",717], + "14650567822254940018": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "15118142492742177336": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "15294692035670155801": ["convolution_gpu_bfyx_os_iyx_osv16",1068], + "498239903908845198": ["convolution_gpu_bfyx_gemm_like",2], + "4085450203909854919": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "15479549936562568596": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "7363788553442810299": ["convolution_gpu_bfyx_gemm_like",2], + "5353552956675518468": ["convolution_gpu_bfyx_os_iyx_osv16",458], + "15775917744517770768": ["convolution_gpu_bfyx_gemm_like",2], + "9899242398980336120": ["convolution_gpu_bfyx_gemm_like",1], + "12791525533856308302": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "9256308629247511374": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "11433534680781300610": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "142486914279119363": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "10965563190266380694": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "17252689774572814142": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "6158514925486943212": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "14786904599410885158": ["convolution_gpu_bfyx_os_iyx_osv16",465], + "10728212277329722684": ["convolution_gpu_bfyx_gemm_like",2], + "11151426820269138585": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "1076005730007872492": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "2052712465925238009": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "1103204698908514224": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "5893940382830835820": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "10000618285883395700": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "1276881030620698911": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "17523210737277743952": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "4883106423598271822": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "8800251965243080024": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "5601435819039968726": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3382494956350224120": ["convolution_gpu_bfyx_gemm_like",1], + "6830387121684699972": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "15322989486222859378": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "15881381297320383917": ["convolution_gpu_winograd_6x3_s1_fused",1], + "801864263975761712": ["convolution_gpu_bfyx_os_iyx_osv16",291], + "9457038545823436137": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9798585825695496550": ["convolution_gpu_bfyx_gemm_like",2], + "12654574135415748217": ["convolution_gpu_bfyx_os_iyx_osv16",318], + "8131617570786904723": ["convolution_gpu_bfyx_gemm_like",2], + "1663732107639157701": ["convolution_gpu_bfyx_gemm_like",2], + "6695336381467406810": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "11984095218733350838": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14953809073272885651": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "4911398420005278258": ["convolution_gpu_bfyx_gemm_like",1], + "4940950742383121943": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "17614929666625976544": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "3737552767159920174": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "10683839359385393536": ["convolution_gpu_bfyx_gemm_like",1], + "9207334433308148635": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13954144830230671601": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "18153597620760635012": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "13373912451448693522": ["convolution_gpu_bfyx_gemm_like",1], + 
"7369471926167902143": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "10076578838853982233": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "2935787827649981367": ["convolution_gpu_bfyx_gemm_like",1], + "9198752981132674942": ["convolution_gpu_bfyx_gemm_like",1], + "17693518538833606792": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "572155668587252712": ["convolution_gpu_bfyx_os_iyx_osv16",1054], + "530825424084837479": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "1655427025346068673": ["convolution_gpu_bfyx_gemm_like",1], + "10084794570892043447": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "4495774394017823312": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "13359643347682243944": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "11568162864377479487": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "8155752116518841384": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "181006047500375768": ["convolution_gpu_bfyx_os_iyx_osv16",280], + "9173631510896381179": ["convolution_gpu_bfyx_gemm_like",2], + "7982784766505903515": ["convolution_gpu_bfyx_os_iyx_osv16",718], + "5141753233513623264": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "5079055505117153635": ["convolution_gpu_bfyx_os_iyx_osv16",668], + "4185477435943946730": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "8354812222032899427": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "16131386739027190836": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "6277198010392189880": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "11845013061234102293": ["convolution_gpu_bfyx_gemm_like",2], + "11287863182337672053": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "12090536142661253835": ["fully_connected_gpu_bf_io_gemm",1], + "13472532612464340803": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "4716188972902735458": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "3704618172730076978": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "7768680313873061531": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "12788968383428254917": ["convolution_gpu_bfyx_direct_10_12_16",0], + "13327653786981478088": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "15265621959560796543": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "14230385851791760020": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "9130971535185609293": ["convolution_gpu_bfyx_gemm_like",2], + "14930745998253392722": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2124033349728954551": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "8963262014498730146": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "3304589333915676807": ["convolution_gpu_bfyx_gemm_like",1], + "4947961640303581107": ["convolution_gpu_bfyx_gemm_like",2], + "1791615587935799399": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "16336482874764861478": ["convolution_gpu_bfyx_gemm_like",2], + "11088128828863596806": ["convolution_gpu_bfyx_gemm_like",2], + "1450888744802985214": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "4983880246908724272": ["convolution_gpu_bfyx_os_iyx_osv16",1023], + "11649407835105973949": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "4294879469633231552": ["convolution_gpu_bfyx_gemm_like",2], + "9101018613418825655": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11163107409437069532": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "9608917563823863132": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "12889351859522118935": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6777045876155144709": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "1233021176530240722": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "4999210721703970274": 
["convolution_gpu_bfyx_os_iyx_osv16",895], + "14086074948200412805": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "1207026216972160297": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "8515479970005301094": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "7071991799972799089": ["convolution_gpu_bfyx_gemm_like",2], + "54019631544204590": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12590495767805868405": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "15316782593191029443": ["convolution_gpu_bfyx_gemm_like",2], + "17556238490521153146": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "244921290040927639": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "17466025028296506313": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "17259951372033727587": ["convolution_gpu_bfyx_gemm_like",2], + "15385506288692289568": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17087143277789116317": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1095495157025479260": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "14811022197918391667": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "1423297940282476513": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "16996022503617157059": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "10509933181132310969": ["convolution_gpu_bfyx_gemm_like",1], + "6225447513745282621": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "11195875185591819437": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "13404888565084206853": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "7863886351122918972": ["convolution_gpu_bfyx_os_iyx_osv16",194], + "17006655627343469372": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "8485845304380573432": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "10628725059172743408": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "17302671258991071440": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "2479856511929768548": ["convolution_gpu_bfyx_gemm_like",1], + "702096475436365058": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4327450388326573746": ["convolution_gpu_bfyx_gemm_like",1], + "939718260623752240": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "11806402239500046867": ["convolution_gpu_bfyx_gemm_like",2], + "11529876081402974396": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "7848121247546147821": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "1003101267609305257": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "9810904714798127155": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "10462144647439624978": ["convolution_gpu_bfyx_gemm_like",2], + "16170708786673864371": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "5229688072405810569": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "8269543491844451750": ["convolution_gpu_bfyx_os_iyx_osv16",183], + "11612998433409522582": ["convolution_gpu_bfyx_gemm_like",2], + "17303408650780384587": ["convolution_gpu_bfyx_os_iyx_osv16",549], + "11704369548723383645": ["convolution_gpu_bfyx_gemm_like",2], + "16122033101591094139": ["fully_connected_gpu_fb_oi_ref",1], + "2094213523530180653": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "5011769546010018777": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "18117954008112578376": ["convolution_gpu_bfyx_gemm_like",2], + "14554225625951128811": ["convolution_gpu_bfyx_os_iyx_osv16",417], + "1540459344569916165": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "5055133356846736609": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "10608496431404827757": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "2986189945936592561": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "8797661560676476245": ["convolution_gpu_bfyx_os_iyx_osv16",519], + 
"582360460084115077": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "8529170838214082841": ["convolution_gpu_bfyx_gemm_like",2], + "8378690770140438511": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "3860603464276263676": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "13616241450266119966": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "16802487456370986847": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "14826791706471872785": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15315327794058441258": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "4424217045094988504": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "16063854283763838910": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15924144379094505874": ["fully_connected_gpu_fb_io_ref",1], + "868488930567226694": ["convolution_gpu_bfyx_gemm_like",2], + "10348660503952680688": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "10208132281050693649": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "14394427817253242611": ["convolution_gpu_bfyx_gemm_like",2], + "6343888265369366589": ["convolution_gpu_bfyx_os_iyx_osv16",572], + "17101789600628162503": ["convolution_gpu_bfyx_direct_10_12_16",0], + "17406383217119217230": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "14070988879848388270": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "6495132856471482043": ["convolution_gpu_bfyx_os_iyx_osv16",865], + "3106922888635965020": ["convolution_gpu_bfyx_gemm_like",2], + "14094981198645015124": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8203550467004532364": ["convolution_gpu_bfyx_os_iyx_osv16",1040], + "11782525502250249483": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5230871884758163940": ["convolution_gpu_bfyx_os_iyx_osv16",739], + "6898793319624390153": ["convolution_gpu_bfyx_gemm_like",2], + "13600579723542095577": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "9207413252274439059": ["convolution_gpu_bfyx_os_iyx_osv16",687], + "8300655194765375060": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "15151957983054148973": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "14885109535362957947": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "14366252780310630703": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "10428477376571919905": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "18250076003231973692": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6778781361481531516": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "4196367396954155354": ["convolution_gpu_bfyx_gemm_like",2], + "4406157095142118884": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "15381551674482810230": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "18308661808437079996": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4465781406991476376": ["convolution_gpu_bfyx_os_iyx_osv16",280], + "5277400567128489977": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "8732106543033226791": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "10568883265991969648": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1795659014508380077": ["convolution_gpu_bfyx_gemm_like",1], + "14141983383097250411": ["convolution_gpu_bfyx_gemm_like",1], + "6651097363666320726": ["convolution_gpu_bfyx_os_iyx_osv16",1025], + "10902108166827340970": ["convolution_gpu_bfyx_gemm_like",2], + "17599396373608265826": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "9357359875134299131": ["convolution_gpu_bfyx_gemm_like",2], + "14579050468883613611": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "1876286132660871464": ["convolution_gpu_bfyx_gemm_like",2], + "2740287492529009109": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "15285236716284874711": 
["convolution_gpu_bfyx_os_iyx_osv16",961], + "1062508357634542606": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "18373068999874730591": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "10237524128771958432": ["convolution_gpu_bfyx_gemm_like",2], + "9831195630506601660": ["convolution_gpu_bfyx_gemm_like",2], + "11825293922127550847": ["convolution_gpu_bfyx_gemm_like",2], + "7606728651572102823": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "7019316994558628633": ["convolution_gpu_bfyx_gemm_like",2], + "13729951531199985382": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "9643671820560131959": ["convolution_gpu_bfyx_os_iyx_osv16",137], + "15841489476316341204": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "15024130918582332928": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14301661367597749567": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "6707221689266688389": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "1303304215797905198": ["convolution_gpu_bfyx_gemm_like",2], + "10917498758625273194": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "7658318862249823838": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "4347494599650425733": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "13139625572508441980": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "8939520209266902800": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17886436103211436626": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "12757564215386697460": ["convolution_gpu_bfyx_os_iyx_osv16",84], + "14959281374959998609": ["convolution_gpu_bfyx_gemm_like",2], + "18204971481718743856": ["convolution_gpu_bfyx_gemm_like",2], + "7174804306958128658": ["convolution_gpu_bfyx_gemm_like",2], + "4550028191070279999": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "6821855018718422278": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "1605295763358374504": ["convolution_gpu_bfyx_gemm_like",2], + "12493863403516600413": ["convolution_gpu_bfyx_gemm_like",1], + "8749399240948437294": ["convolution_gpu_bfyx_gemm_like",2], + "7937870623766562191": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "472454322186482185": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "16494581774051338901": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "4054010905884346287": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "10967218651864700933": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "17713011656078651": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15683344003370367509": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "17604747523124060652": ["convolution_gpu_bfyx_gemm_like",2], + "7688176479120305539": ["convolution_gpu_bfyx_os_iyx_osv16",918], + "12319165874575782715": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "3935883681780676157": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "17828453493113919756": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "9639014900668946045": ["convolution_gpu_bfyx_gemm_like",2], + "15280273795883244074": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7761195307416102494": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "5095827462645341808": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "17496371501557652357": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "12085208566397959149": ["convolution_gpu_bfyx_gemm_like",2], + "5996261744926399743": ["convolution_gpu_bfyx_gemm_like",2], + "6954257882806659594": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "16937207522545573792": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13708979487306970634": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "10292243973236220688": ["convolution_gpu_bfyx_os_iyx_osv16",321], + 
"2566302789609970663": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "6324565723045697080": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "17421991623849671076": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "10600884986702650404": ["convolution_gpu_bfyx_gemm_like",2], + "12228963567837353733": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "4797026040899499511": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "1127598752149871162": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "5939121107940759940": ["convolution_gpu_bfyx_os_iyx_osv16",378], + "17123153447808465303": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "585914943085061885": ["convolution_gpu_bfyx_gemm_like",1], + "11185156002426041243": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "11579025491409526679": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "14512407261081843554": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "3963106895592011725": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "2346992541638145615": ["convolution_gpu_bfyx_gemm_like",2], + "12655099960717366198": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "10682918518101379579": ["fully_connected_gpu_bf_io_input_spatial",2], + "17225578855755054959": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "4859271780094116779": ["convolution_gpu_bfyx_gemm_like",2], + "13027039165868458729": ["convolution_gpu_bfyx_gemm_like",2], + "1643241486250690844": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "5749536453225343663": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "1760830986937165861": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "15551453802011405101": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "2467535554409643460": ["convolution_gpu_bfyx_gemm_like",1], + "15124985846197662243": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "10058614204420018541": ["convolution_gpu_bfyx_os_iyx_osv16",4], + "3615203440895591147": ["convolution_gpu_bfyx_gemm_like",1], + "8230144305844912369": ["convolution_gpu_bfyx_os_iyx_osv16",554], + "10791067159964399241": ["convolution_gpu_bfyx_os_iyx_osv16",310], + "7826714904736870517": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17342868362584820356": ["convolution_gpu_bfyx_gemm_like",2], + "3830703844770425343": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "13462726136352103466": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10433456687054381828": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "13816104794723484993": ["convolution_gpu_winograd_6x3_s1_fused",2], + "2577413012740709678": ["convolution_gpu_bfyx_gemm_like",2], + "17839839336294937155": ["convolution_gpu_bfyx_gemm_like",2], + "7351401242363888463": ["convolution_gpu_bfyx_gemm_like",2], + "804195263636995800": ["convolution_gpu_bfyx_gemm_like",2], + "15434706304418357961": ["convolution_gpu_bfyx_gemm_like",2], + "12636120902231094700": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "3792945601873900927": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "6717243674054760598": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "16684378382033936005": ["convolution_gpu_bfyx_gemm_like",2], + "16781127329510211966": ["convolution_gpu_bfyx_gemm_like",2], + "3480732841490521799": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "8431845338648284548": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "1410630713443793537": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "1760779615705074283": ["convolution_gpu_bfyx_os_iyx_osv16",190], + "13020929028222837402": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "16228026045292341333": ["convolution_gpu_bfyx_gemm_like",2], + "4660288622381620227": 
["convolution_gpu_bfyx_os_iyx_osv16",263], + "18445243511250094011": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "4428101657497677982": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "4860779741225078946": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "12965552570525926289": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16467987800266816984": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "14683086376707577764": ["convolution_gpu_bfyx_gemm_like",1], + "9057036344533510776": ["convolution_gpu_bfyx_gemm_like",2], + "1146282291269334070": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "2425177545256374371": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16103943009195163681": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "17809920600993699808": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "13002363400738122017": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "5093049998173715787": ["convolution_gpu_bfyx_gemm_like",2], + "6995472847770703647": ["convolution_gpu_bfyx_gemm_like",2], + "9366201112659847392": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "16489624657475712467": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "1819720745131968914": ["convolution_gpu_bfyx_gemm_like",2], + "12667014405537239093": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "10270203686708782941": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "3150231129728961455": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "11198908896401597838": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "14365699621119565405": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "11430797372848621790": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "841243068178925457": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "3855151839445505918": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "1179906398014559042": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "6578239603654034233": ["convolution_gpu_bfyx_os_iyx_osv16",874], + "11322451605795727486": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "7410628771323937530": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "7490524380333929773": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "12319073009094248232": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "11936419502418995274": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "5695368162557483073": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "12136803297132972709": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "6526586547926160627": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "11910060331768652144": ["convolution_gpu_bfyx_gemm_like",2], + "6603489144277795818": ["convolution_gpu_bfyx_os_iyx_osv16",893], + "2095802691829304676": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6553736978928374036": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "2235210915304938149": ["convolution_gpu_bfyx_gemm_like",2], + "18137106379929135901": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7356440848422235031": ["convolution_gpu_bfyx_gemm_like",1], + "17846557385112426504": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "12713087335581316946": ["convolution_gpu_bfyx_os_iyx_osv16",1065], + "3831257753143317802": ["convolution_gpu_bfyx_gemm_like",2], + "17372520271370779917": ["convolution_gpu_bfyx_os_iyx_osv16",80], + "8860685325047463026": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "13731964100893109797": ["convolution_gpu_bfyx_gemm_like",1], + "2916077416184925232": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "13926122593957480821": ["convolution_gpu_winograd_6x3_s1_fused",2], + "7157499157310356912": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8509748651922589684": 
["convolution_gpu_bfyx_os_iyx_osv16",1086], + "10756831914332769026": ["convolution_gpu_bfyx_os_iyx_osv16",739], + "18400379759523099542": ["convolution_gpu_bfyx_gemm_like",1], + "5369464352361405510": ["convolution_gpu_bfyx_gemm_like",2], + "9522661528867955338": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "1691554843141984381": ["convolution_gpu_bfyx_os_iyx_osv16",41], + "13797057152042581440": ["convolution_gpu_bfyx_gemm_like",1], + "14352303529756685990": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "10205696100164492716": ["convolution_gpu_bfyx_gemm_like",2], + "12247991248100147706": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "2055914145961691571": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8104331313502492541": ["convolution_gpu_bfyx_gemm_like",1], + "12516911293946682547": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "15282806587681892519": ["convolution_gpu_bfyx_gemm_like",1], + "18210370419559876426": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "15552287544878243347": ["convolution_gpu_bfyx_gemm_like",1], + "14156845527754813253": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "6740385846687754849": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "12823080103951853168": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "17851024468934906318": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "5078905972285278557": ["convolution_gpu_bfyx_gemm_like",2], + "10190532901392055501": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "4113935675071480884": ["convolution_gpu_bfyx_gemm_like",2], + "14757855448502485216": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "15857087373591747006": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5352896995050401444": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "8701639906504450534": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "17526891234501366023": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "14269161473352876138": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "989564341557094953": ["convolution_gpu_bfyx_os_iyx_osv16",807], + "9519623751582710696": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "17631458041591681785": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "14213516751025324346": ["convolution_gpu_bfyx_gemm_like",2], + "2632535010129224704": ["convolution_gpu_bfyx_os_iyx_osv16",508], + "5754844816339228920": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "9324602658580246084": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "10660722770448981436": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "9743549865786050651": ["convolution_gpu_bfyx_gemm_like",2], + "4356806313729405658": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "5906712613621491207": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "7311120574972466702": ["convolution_gpu_bfyx_os_iyx_osv16",41], + "14100870590396726248": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "8071957466247137919": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11622925573287101001": ["convolution_gpu_bfyx_direct_10_12_16",0], + "9522947878591994913": ["convolution_gpu_bfyx_gemm_like",2], + "12949204491386872217": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "13468713306678453952": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "9527075413813342687": ["convolution_gpu_bfyx_gemm_like",2], + "11369389082421346630": ["convolution_gpu_bfyx_os_iyx_osv16",381], + "7786866732196451977": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "17833517350994024381": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "4479979951990338510": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "2039909180006215069": ["convolution_gpu_bfyx_os_iyx_osv16",706], + 
"14174805457643822445": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "533820672115442982": ["convolution_gpu_bfyx_gemm_like",2], + "459936950868112292": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "6747799061507191246": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "9468542963649996822": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "6108475838757986889": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17769703068450272262": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "17128723415461475388": ["convolution_gpu_bfyx_gemm_like",2], + "1713947356482032411": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "5887877259873928726": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "4607428643002808173": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "16149924641081427062": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "2388209402010617408": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "18043340998699622388": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "13699740641705514374": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "15511138074959300404": ["convolution_gpu_bfyx_gemm_like",2], + "10483664832302187567": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "4747159205186229582": ["convolution_gpu_bfyx_os_iyx_osv16",479], + "13348329768178411596": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "9594594523961285945": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "1299452063079314341": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9497934813418221769": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "1395293354112586043": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "11706378390483804857": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7730305811644972643": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "17514082938765137629": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "18259001228411909210": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "6587817876244206939": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "7089077910858800239": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "7289940394271052757": ["convolution_gpu_bfyx_gemm_like",1], + "13702692566238948173": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "9391425117463100557": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "17775705003104146872": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "15695275881213623746": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "7843498978148810586": ["convolution_gpu_bfyx_os_iyx_osv16",235], + "4897991181236908768": ["convolution_gpu_bfyx_gemm_like",1], + "12582321591799165205": ["convolution_gpu_bfyx_os_iyx_osv16",421], + "1629816265162728770": ["convolution_gpu_bfyx_gemm_like",1], + "14740550583313186369": ["convolution_gpu_bfyx_gemm_like",1], + "17517495652165026573": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "4584970211859494304": ["convolution_gpu_bfyx_direct_10_12_16",0], + "14808831640065476291": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "4369346833875105372": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12836639380579091509": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "1650519167046658780": ["convolution_gpu_bfyx_os_iyx_osv16",430], + "1114661658519542600": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "18132981365225439999": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "13855438905855887272": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "2467766894778630615": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "12680339228267704518": ["convolution_gpu_bfyx_os_iyx_osv16",876], + "3107611675766875160": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "4202116155711873525": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + 
"17370051888730874220": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "8509882139595784161": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "3995098494991567714": ["convolution_gpu_bfyx_gemm_like",2], + "3032921857841371728": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "363330365598760149": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "10395191003166536655": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "11696231285411686761": ["convolution_gpu_bfyx_gemm_like",2], + "14289048840489035546": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "8655525088525612583": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "11640865562390693266": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "5020605371834958647": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "178353385245384751": ["convolution_gpu_bfyx_gemm_like",2], + "6296371382672640627": ["convolution_gpu_bfyx_gemm_like",1], + "13337315872184544686": ["convolution_gpu_bfyx_os_iyx_osv16",640], + "2376239021851907962": ["convolution_gpu_bfyx_gemm_like",1], + "1208534686657112759": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "310584224049735004": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "16327433707667075261": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "3435773540391994106": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "13676670925355487305": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "7545013298074733778": ["convolution_gpu_bfyx_os_iyx_osv16",549], + "17343050785312683560": ["convolution_gpu_bfyx_os_iyx_osv16",186], + "14176233347574275776": ["convolution_gpu_bfyx_gemm_like",1], + "14670068483447729857": ["convolution_gpu_winograd_6x3_s1_fused",1], + "5691889055008878111": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "6306539529168638031": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12253987037990618484": ["convolution_gpu_bfyx_gemm_like",1], + "2040762223425679479": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "9165275903833498932": ["convolution_gpu_bfyx_gemm_like",2], + "15156836293519486753": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "7974614031099580856": ["convolution_gpu_bfyx_gemm_like",2], + "11928926429060828408": ["convolution_gpu_bfyx_os_iyx_osv16",132], + "7009873605945341897": ["convolution_gpu_bfyx_gemm_like",2], + "8464582977975377118": ["convolution_gpu_winograd_6x3_s1_fused",2], + "14755869345266103764": ["fully_connected_gpu_fb_oi_ref",1], + "9557728221162137067": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "14417033368952865805": ["convolution_gpu_bfyx_gemm_like",1], + "16026019808764920641": ["convolution_gpu_bfyx_gemm_like",2], + "16897485136352617189": ["convolution_gpu_bfyx_gemm_like",2], + "2688060699200137048": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4834591210311380436": ["convolution_gpu_bfyx_os_iyx_osv16",716], + "13237050834496100264": ["convolution_gpu_bfyx_os_iyx_osv16",527], + "13500369101462555447": ["convolution_gpu_bfyx_gemm_like",2], + "13558618754911056302": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "5334190564423375247": ["convolution_gpu_bfyx_os_iyx_osv16",926], + "15679696422603106163": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3522455279376021211": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "2246205611561147645": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "5301394322453453489": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "11398019086259011063": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "17429692714456679999": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17427036330773218054": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "5040730152867713388": 
["convolution_gpu_bfyx_gemm_like",2], + "9177395776408296291": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "14904665242518014005": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "1565612286723277822": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "5718472464360340274": ["convolution_gpu_bfyx_gemm_like",2], + "10897008852059401902": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "12935563359569230797": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "12676139447729343679": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "7142195383189497127": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "2789137853864057385": ["convolution_gpu_bfyx_gemm_like",2], + "17900257435531434807": ["convolution_gpu_bfyx_gemm_like",2], + "1375156980278317418": ["convolution_gpu_bfyx_gemm_like",2], + "5797243082477551421": ["convolution_gpu_bfyx_os_iyx_osv16",469], + "12988961529988078346": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "3574679673239756551": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "11726298758004767743": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "14006248791647711759": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "8025053805734757314": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "17991368786018745231": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "9513032457323269513": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "6203602270552179462": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "16559140502701231107": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "16172528828198474326": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "13675314612031135613": ["convolution_gpu_bfyx_gemm_like",1], + "8962502004422485576": ["convolution_gpu_bfyx_gemm_like",2], + "16955653765071712611": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "3217555855036660482": ["fully_connected_gpu_fb_io_ref",2], + "8775336277634573074": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "6876300000441081789": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "5762290464889692462": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "2070429718533716882": ["convolution_gpu_bfyx_gemm_like",2], + "13941251104772804303": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "5415319660821122528": ["fully_connected_gpu_bf_io_input_spatial",1], + "14083279273292567319": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "8336494030011542852": ["convolution_gpu_bfyx_gemm_like",1], + "6204183474669103812": ["convolution_gpu_bfyx_os_iyx_osv16",889], + "14010642743400284761": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9270950131920019932": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "878892264408839067": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "498420237272375425": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13765632280570725774": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "9475130054420979752": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "14046217730873620907": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "17086887873464601732": ["convolution_gpu_bfyx_gemm_like",1], + "8734483136584351066": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "3018306533413795559": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3355259926747524578": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "755414184406250882": ["convolution_gpu_bfyx_os_iyx_osv16",469], + "17818587793483875865": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13064477237937322246": ["convolution_gpu_bfyx_gemm_like",1], + "18193831330827252971": ["convolution_gpu_bfyx_gemm_like",2], + "12044635257539223503": ["convolution_gpu_bfyx_gemm_like",2], + "4725009116734166168": ["convolution_gpu_bfyx_os_iyx_osv16",143], + 
"4958222070605478947": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "18232459663207612727": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "11327867170377736609": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "9840495023131952174": ["convolution_gpu_winograd_6x3_s1_fused",1], + "1197184887743937394": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9833540739021310892": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16304963156448605623": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16491532291908469567": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "11213667690594303395": ["fully_connected_gpu_fb_io_ref",1], + "9368244029111057323": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "1168589063110524328": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "6026065914078520895": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "12083217714727863832": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10264913782610095832": ["convolution_gpu_bfyx_os_iyx_osv16",888], + "5246955189449281709": ["convolution_gpu_bfyx_gemm_like",2], + "1724222702460860833": ["convolution_gpu_bfyx_gemm_like",2], + "6973621625148257910": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "18010600104565458874": ["convolution_gpu_bfyx_gemm_like",2], + "11981887712163064333": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "7152107839144357830": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9404953235624894187": ["convolution_gpu_bfyx_os_iyx_osv16",95], + "5955810688179557560": ["convolution_gpu_bfyx_gemm_like",2], + "15720507574336564201": ["convolution_gpu_bfyx_os_iyx_osv16",618], + "14038261392627717712": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "710166379854475667": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "13898821685774165645": ["convolution_gpu_bfyx_os_iyx_osv16",847], + "1579733029852052699": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16833026567865627676": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "5582896843095691256": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "7460672405409009037": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "8680545947510235993": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "7380979920013545867": ["convolution_gpu_bfyx_gemm_like",2], + "13890118723041457532": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "6351347283201596793": ["convolution_gpu_bfyx_os_iyx_osv16",57], + "5592556538784745960": ["convolution_gpu_bfyx_gemm_like",2], + "5488296540132936296": ["convolution_gpu_bfyx_gemm_like",1], + "304721598975479337": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "4299492266819967844": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "6133592828563353516": ["convolution_gpu_bfyx_gemm_like",1], + "8158983334404475382": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "7353255713834431471": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "11280403113463077620": ["convolution_gpu_bfyx_gemm_like",2], + "12794030011655906930": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "17361319565503258506": ["convolution_gpu_bfyx_os_iyx_osv16",356], + "3856394004079548211": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "12163456975896925619": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "5592428580503282095": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "12311901617815857033": ["convolution_gpu_bfyx_gemm_like",1], + "10527256963399838405": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "8334832698020211623": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "17965825642065048619": ["fully_connected_gpu_fb_oi_ref",2], + "8235002440285527553": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "14251848023416168295": 
["convolution_gpu_bfyx_os_iyx_osv16",796], + "4846216894450341698": ["convolution_gpu_bfyx_gemm_like",2], + "7878217536124016199": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "13283842370311517843": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "6537576410448334203": ["convolution_gpu_bfyx_os_iyx_osv16",277], + "7289633911925073088": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "13946367911927964830": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12175297963550750804": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "3432296808755992670": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "4085907608404305515": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "13836867092941506302": ["convolution_gpu_bfyx_os_iyx_osv16",315], + "9758759365463492505": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "12305383126483033452": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8913950860101596091": ["convolution_gpu_bfyx_os_iyx_osv16",729], + "9497269191159495932": ["convolution_gpu_bfyx_os_iyx_osv16",123], + "8329846097322076175": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "16011429608661242565": ["convolution_gpu_bfyx_gemm_like",2], + "11979910991788695837": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "8701248964531180496": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "12792454713887439830": ["convolution_gpu_bfyx_os_iyx_osv16",893], + "15241191584896579183": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "6942049339361951275": ["fully_connected_gpu_bf_io_input_spatial",0], + "15534517308430424624": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11878217002671373638": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "8045393243176844621": ["convolution_gpu_bfyx_gemm_like",2], + "4245229655273611845": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "12315068368597230211": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "12169896916690963726": ["convolution_gpu_bfyx_gemm_like",2], + "6674643031068271417": ["convolution_gpu_bfyx_gemm_like",2], + "10838721873837128971": ["convolution_gpu_bfyx_os_iyx_osv16",676], + "5172712078329324967": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6796758191974756201": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2215194389847256545": ["convolution_gpu_bfyx_direct_10_12_16",2], + "496948821475405395": ["convolution_gpu_bfyx_gemm_like",2], + "18286006396667126860": ["convolution_gpu_bfyx_gemm_like",1], + "10713207196920878995": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1890739204389692970": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "17446388159565719362": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "6493920223660825755": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "10011668671963948912": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5172823024549700279": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "5635449856699664273": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "12451592945087000191": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "3363675939515208883": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "2257384183256237750": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "14463173937397982331": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18423051691107460439": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "8402396502992483524": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "17888721282811720634": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "2406816735581074778": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "2410828969408182980": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "16928564394848059094": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14742909697076926475": 
["convolution_gpu_bfyx_os_iyx_osv16",83], + "10650698451740924172": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "14807299286266923693": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "4408600136502382976": ["convolution_gpu_bfyx_os_iyx_osv16",417], + "7316825051569394089": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "16935619230235600309": ["convolution_gpu_bfyx_gemm_like",2], + "4885944395876887711": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "7439340221097179208": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10614918790075146626": ["convolution_gpu_bfyx_os_iyx_osv16",1071], + "1908809004094565452": ["convolution_gpu_bfyx_os_iyx_osv16",918], + "2527189070714658176": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "3069396488274616770": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "2322559721899919275": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "15929361440504489924": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "10968768803038046390": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "10591159235183381823": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "7558864177789582540": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "4232250144427804891": ["fully_connected_gpu_bf_io_gemm",1], + "16236397968499692493": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "6895664772793074050": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "14206328165498357760": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "13766538247146238357": ["convolution_gpu_bfyx_os_iyx_osv16",691], + "4945845875046545967": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14309292105974991733": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "15214779483545052950": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "792684262493086891": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "5120274680151325194": ["convolution_gpu_bfyx_gemm_like",2], + "14848732804958314374": ["fully_connected_gpu_yxfb_ref",0], + "1034911525083515252": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "13941188114382863776": ["fully_connected_gpu_fb_oi_ref",2], + "18271689282126907793": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "1373904073013943690": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "3746573775462003750": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "13282612510005390816": ["convolution_gpu_bfyx_os_iyx_osv16",1096], + "10073779356457603252": ["convolution_gpu_bfyx_gemm_like",2], + "7404732699742965436": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "10306169610486701545": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "11007100272494557520": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "3752278444736105763": ["convolution_gpu_bfyx_gemm_like",1], + "11404331488962230130": ["convolution_gpu_bfyx_gemm_like",1], + "4991419288164762786": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "15394217414267195999": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "13721983823460534294": ["convolution_gpu_bfyx_gemm_like",2], + "937200116534179904": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "5341876404211768451": ["convolution_gpu_bfyx_gemm_like",1], + "9953329530402569669": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "5872553335123308034": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3434842614653335826": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "6232596685071671579": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "7173828525834910425": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "9275303306340702111": ["convolution_gpu_bfyx_gemm_like",2], + "3409255127071376537": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "1149548328523286475": 
["convolution_gpu_bfyx_os_iyx_osv16",227], + "5912303851874077576": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "15019050434475217267": ["convolution_gpu_bfyx_gemm_like",2], + "11093147488085506266": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "3604379857905625467": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "3447774474841314860": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "16705941191876956548": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "6491772898618671653": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "9421643783312790618": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3318430113631867573": ["convolution_gpu_bfyx_os_iyx_osv16",1052], + "3416636940668221406": ["convolution_gpu_bfyx_os_iyx_osv16",378], + "6753857156025715321": ["convolution_gpu_bfyx_os_iyx_osv16",223], + "755157892988514864": ["convolution_gpu_bfyx_os_iyx_osv16",136], + "16159852373972174245": ["convolution_gpu_bfyx_gemm_like",2], + "10168317560306247723": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "4370027682980493159": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "13694766887442024878": ["fully_connected_gpu_fb_io_ref",1], + "6556795059657533200": ["convolution_gpu_bfyx_gemm_like",2], + "15387047026300787039": ["convolution_gpu_bfyx_gemm_like",2], + "875552069535001284": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "364197229238830807": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "6293500642319778096": ["convolution_gpu_bfyx_gemm_like",1], + "10784073615329190425": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "2477866283402053371": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "5448665190811365701": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "16689318540732157754": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "598745924736700294": ["convolution_gpu_bfyx_gemm_like",2], + "11814740669468421049": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "8054599744123820194": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "1663285216972929652": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "10159790066948852390": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "265124365266629363": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "2805931700404492624": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "3109104171383198425": ["convolution_gpu_winograd_6x3_s1_fused",2], + "4718705504966715203": ["convolution_gpu_bfyx_gemm_like",2], + "9444953530704856016": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3036512701943687724": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "6656593119788274992": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "1677118421195120152": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "13253775441326432265": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "14462438074931673266": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "713121569924250372": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "16150934538381572916": ["convolution_gpu_bfyx_gemm_like",2], + "11004350075893421731": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "6849874726361751307": ["convolution_gpu_bfyx_gemm_like",2], + "16312223896859176991": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "433161293684647032": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "8788703258318141635": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "6639715607290389968": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "12962552332511702682": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "10892706534058849825": ["convolution_gpu_bfyx_os_iyx_osv16",284], + "18174857480705846286": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "15737508945513376813": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "5743482411668939203": ["convolution_gpu_bfyx_gemm_like",2], + "7148542290597073512": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "7281661441196896385": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2542984219353153495": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "6322831233548420761": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "15308578014507211237": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "15733883474006568340": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15918017311798856029": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "5385316497510064491": ["fully_connected_gpu_fb_oi_ref",1], + "11522488904021243956": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "11834683513280095384": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3420064118559852968": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "3797957937905580811": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "2431923918345445420": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "11717348577195224554": ["convolution_gpu_bfyx_gemm_like",2], + "7860086755625626604": ["convolution_gpu_bfyx_gemm_like",2], + "10982693252072682414": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "973402921452083017": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "7218689869635572700": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "9116206094279111365": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "12329909110827539139": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16385712633367611786": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "4063525218682664832": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "12987636957813312667": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10881884300766361791": ["convolution_gpu_bfyx_gemm_like",2], + "3704271978133986620": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8275277322582733101": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "7717602860943327535": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "16766706479910720794": ["convolution_gpu_bfyx_gemm_like",2], + "10629681722649771498": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "1659851931406041285": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17902799955139047426": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15737542477498282367": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "8550133332738529361": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "11528417522960871233": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "6626716013917662606": ["convolution_gpu_bfyx_gemm_like",2], + "5920614348521143999": ["convolution_gpu_bfyx_os_iyx_osv16",129], + "3617433210865054182": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "2772704069752888874": ["convolution_gpu_bfyx_gemm_like",2], + "9968686603153440164": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "14151249542292579535": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "17947613081555491099": ["fully_connected_gpu_fb_oi_ref",2], + "4244790495090049295": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "3285520504090196295": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "4554343896877444783": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "13599438824699346708": ["convolution_gpu_bfyx_os_iyx_osv16",240], + "937050062571228573": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10250778203413648582": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "10153070641942936648": ["convolution_gpu_bfyx_gemm_like",1], + "16463823433924519300": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "2838789360952219092": ["convolution_gpu_bfyx_gemm_like",2], + "8272823732258536202": 
["convolution_gpu_bfyx_os_iyx_osv16",503], + "9884646296875511696": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "4445912157712391517": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "4674416595144505741": ["convolution_gpu_bfyx_gemm_like",2], + "8153567933591966877": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "7315740838189400004": ["convolution_gpu_bfyx_gemm_like",2], + "5060817429317741254": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "14724862072414829490": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "981276017776678882": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "10643373404881648498": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "3355824730785179775": ["convolution_gpu_bfyx_os_iyx_osv16",899], + "1018319414633271980": ["convolution_gpu_bfyx_os_iyx_osv16",1025], + "2764034841399585177": ["fully_connected_gpu_fb_oi_ref",2], + "14947798627499698329": ["convolution_gpu_bfyx_gemm_like",2], + "5495776091407365966": ["convolution_gpu_bfyx_gemm_like",2], + "8474585711383508493": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "16687701987371294908": ["convolution_gpu_bfyx_gemm_like",2], + "15594091060902767607": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "10880081193716628051": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "17309326904418811234": ["convolution_gpu_bfyx_os_iyx_osv16",552], + "1787598049938821496": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "2072252610120557179": ["convolution_gpu_bfyx_gemm_like",2], + "6053594232298534345": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "10995424394152951534": ["convolution_gpu_bfyx_gemm_like",2], + "17947818179123182001": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "15741360654354155504": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "11709992724966310174": ["convolution_gpu_bfyx_os_iyx_osv16",124], + "12878631058803628679": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9531730330306606343": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "1640358227345963848": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "8737417433314100353": ["convolution_gpu_bfyx_gemm_like",2], + "14445520478857662586": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "6040623414692799116": ["convolution_gpu_bfyx_os_iyx_osv16",732], + "10381752670329683275": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "14066219153422011272": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "14738573151275130683": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "3255465741612432300": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "12874626654611400042": ["convolution_gpu_bfyx_os_iyx_osv16",853], + "3599823735065658574": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "12365282242489300092": ["convolution_gpu_bfyx_os_iyx_osv16",379], + "3113016029551460773": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "1089679781525023551": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "4091785563304559606": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13104509059416300615": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "9945721344229922405": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16865879032845300007": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "12176879951537921518": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "14173867073407110501": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "277151219694781348": ["convolution_gpu_bfyx_os_iyx_osv16",655], + "14629433964319883917": ["convolution_gpu_bfyx_os_iyx_osv16",84], + "14669219788000023965": ["fully_connected_gpu_fb_oi_ref",0], + "889943986793446284": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "15325302411038679750": 
["convolution_gpu_bfyx_os_iyx_osv16",704], + "10177466042250039828": ["convolution_gpu_bfyx_gemm_like",2], + "16140133852987111783": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "15693851280141842140": ["convolution_gpu_bfyx_gemm_like",2], + "7562624810837784407": ["convolution_gpu_bfyx_gemm_like",2], + "14535007186125575064": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "14864150409380754546": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5831305777612569716": ["convolution_gpu_bfyx_gemm_like",2], + "6660221471357497741": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "10168217053882274702": ["convolution_gpu_bfyx_gemm_like",2], + "13874754478479442212": ["convolution_gpu_bfyx_gemm_like",2], + "11951606039079763598": ["convolution_gpu_bfyx_gemm_like",2], + "5326891298755303584": ["convolution_gpu_bfyx_gemm_like",2], + "5550000568272972532": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2387628682187438903": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "72444706264681262": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "14257548530334193336": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13711710595263882397": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14436334357815544497": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "11231597775940542830": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "14746359019867963124": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4536811685836767511": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8161047856682416508": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "15257886319670476581": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "1028160614515220430": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "12879205642236526041": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "7215460815798365056": ["convolution_gpu_bfyx_gemm_like",2], + "3800011935243649447": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "2881769839926594784": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "11529521968552409482": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6641684310751726510": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17122338330334998991": ["convolution_gpu_bfyx_gemm_like",1], + "5185895996350118172": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "714397516895317906": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "13146231972557134419": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "7005371843527735283": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "490233152678323691": ["convolution_gpu_bfyx_os_iyx_osv16",182], + "4890442595203749341": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "9216695884134021401": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "17382660912493284320": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "15847413004526420496": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "14652719560551657529": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "4690935789908896751": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14352796912241296357": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",0], + "407189201971322683": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "17610648476343170476": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "6210074450403696110": ["convolution_gpu_bfyx_gemm_like",2], + "5197105253412476591": ["convolution_gpu_bfyx_gemm_like",2], + "13893789954946953427": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "12309132521191764927": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "2287331417346465035": ["convolution_gpu_bfyx_gemm_like",2], + "9235762655002034553": ["convolution_gpu_bfyx_gemm_like",2], + "14996839491874598555": ["convolution_gpu_bfyx_os_iyx_osv16",895], + 
"16507285966998102421": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "557778263661655803": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "7344363094493575878": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17947097500350250352": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "8855801044538137828": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "18214405165366931407": ["convolution_gpu_bfyx_gemm_like",2], + "11095908837221722097": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "14902389080201926109": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "12526627889432649075": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "10340099951904598712": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "8107447526839063293": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "6489448536745533209": ["convolution_gpu_bfyx_os_iyx_osv16",713], + "12063854963434677046": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "10931533380146553429": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17021953651379372973": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16907043223873231356": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "4894227264080887361": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "17633445715900116866": ["convolution_gpu_bfyx_gemm_like",2], + "13980058444317683376": ["convolution_gpu_bfyx_os_iyx_osv16",679], + "8039045580314824307": ["convolution_gpu_bfyx_gemm_like",1], + "13286723666743148654": ["convolution_gpu_bfyx_os_iyx_osv16",880], + "3633858263279042265": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "13277308739029064167": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "14203217958874365062": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "15278336216464964580": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "14621327324047759584": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "5724069285122500749": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "12460004417430913427": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1755021778097194246": ["convolution_gpu_bfyx_gemm_like",1], + "1062464852330435815": ["convolution_gpu_bfyx_gemm_like",2], + "2267942216745157485": ["convolution_gpu_bfyx_os_iyx_osv16",886], + "4766447533088048613": ["convolution_gpu_bfyx_gemm_like",2], + "17738299860390552088": ["convolution_gpu_bfyx_direct_10_12_16",0], + "12581879452540858313": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "13597240991532942069": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "2705394837952559308": ["convolution_gpu_bfyx_gemm_like",2], + "8501145642605270365": ["convolution_gpu_bfyx_gemm_like",2], + "12174571114411168588": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5734909305243135224": ["convolution_gpu_bfyx_gemm_like",0], + "3134489458855347772": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "10155417869639270818": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "16815373779430857324": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "5439738552514649732": ["convolution_gpu_bfyx_gemm_like",2], + "8708643228914766202": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "17928043901784474130": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "18216392915308276053": ["convolution_gpu_bfyx_gemm_like",2], + "5835634465164771899": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "8101977280003030465": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "12418390364502912036": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "5821887901198535792": ["convolution_gpu_bfyx_gemm_like",2], + "7605139219344415117": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "6370629727707634189": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "16294825599850364701": 
["convolution_gpu_bfyx_os_iyx_osv16",1073], + "10869005786136023160": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "10613156984920928792": ["convolution_gpu_bfyx_gemm_like",1], + "15901675909820977223": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "14585144905582599299": ["convolution_gpu_bfyx_os_iyx_osv16",894], + "17108987360340581555": ["fully_connected_gpu_bf_io_input_spatial",2], + "7071864660784255328": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "15310138877321331399": ["convolution_gpu_bfyx_gemm_like",2], + "8526484907799590618": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "4788094685976850847": ["convolution_gpu_bfyx_gemm_like",1], + "5699637716202391188": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "451787079167744428": ["convolution_gpu_bfyx_os_iyx_osv16",41], + "5865480930796299143": ["convolution_gpu_bfyx_os_iyx_osv16",176], + "6696330836969622824": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "5326247361632903583": ["convolution_gpu_bfyx_gemm_like",2], + "1617907811128880383": ["convolution_gpu_bfyx_gemm_like",2], + "11173744709088359283": ["fully_connected_gpu_fb_oi_ref",2], + "15173187675372221634": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "17868294056467093895": ["convolution_gpu_bfyx_gemm_like",2], + "13932662890258900896": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "10050254009828302053": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "17390307025967314108": ["convolution_gpu_bfyx_os_iyx_osv16",718], + "7457951266863598199": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "14595102366207856448": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "8906185843274300447": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "13654816209891478730": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "16043683538361975370": ["convolution_gpu_bfyx_gemm_like",2], + "17094948685292534952": ["convolution_gpu_bfyx_os_iyx_osv16",174], + "9391102514951576629": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11275109735493317886": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "158222105675022402": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "2155348872565175553": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "6381439938385141423": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2265784112305305260": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "7666505529539001492": ["convolution_gpu_bfyx_gemm_like",2], + "17300963371220857043": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9150686862263626364": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "6066347819693426556": ["convolution_gpu_bfyx_direct_10_12_16",2], + "581553908799266285": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "13105192484434299621": ["convolution_gpu_bfyx_gemm_like",2], + "2543041530639980505": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "7084794834886364709": ["convolution_gpu_bfyx_gemm_like",2], + "8977099691399563065": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "15747538142554815480": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "14156264942337528284": ["convolution_gpu_bfyx_gemm_like",2], + "893885204484374577": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "1436052878894538927": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "11873734271080160669": ["convolution_gpu_bfyx_os_iyx_osv16",92], + "7671440804202996063": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "11882388384272635526": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9080269503597463911": ["convolution_gpu_bfyx_gemm_like",2], + "11985789598994479652": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "861944552852043171": ["convolution_gpu_bfyx_os_iyx_osv16",86], + 
"7570346182940928159": ["convolution_gpu_bfyx_gemm_like",2], + "1616603916015535857": ["fully_connected_gpu_bf_io_input_spatial",0], + "15076307524263378967": ["convolution_gpu_bfyx_gemm_like",2], + "8671491767142900139": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "11646035413147246650": ["convolution_gpu_bfyx_gemm_like",1], + "8436644625511258721": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "17499047811775012205": ["convolution_gpu_bfyx_gemm_like",1], + "15948383678216076358": ["convolution_gpu_bfyx_os_iyx_osv16",617], + "40684756725622867": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15404352708246779967": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17703907155485973486": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "18269382610859905921": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6614833247756539341": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "5482851829165191681": ["convolution_gpu_bfyx_os_iyx_osv16",645], + "683530182479794259": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "10506079835013332412": ["convolution_gpu_bfyx_gemm_like",2], + "10433541468308381909": ["convolution_gpu_bfyx_gemm_like",1], + "3652749152621176846": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10747768416582634270": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "14433939319502072879": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12854110364457722483": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "13163146272900339330": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "10002942280571012447": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "14611470203914805229": ["convolution_gpu_bfyx_os_iyx_osv16",888], + "3317498303952226642": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "7957927312958744432": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "14784115394395151055": ["convolution_gpu_bfyx_gemm_like",2], + "7370273921473161914": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "6990161783770805523": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12361909180687647792": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "9219978118417391687": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15184480575877095737": ["convolution_gpu_bfyx_gemm_like",1], + "18400137500031567479": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7852144838267007144": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "10712251675747436685": ["convolution_gpu_bfyx_os_iyx_osv16",190], + "1404523328737649536": ["convolution_gpu_bfyx_gemm_like",1], + "10340626080611300806": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "10632020369698615114": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "11913865086932469909": ["convolution_gpu_bfyx_gemm_like",2], + "15011504472108164173": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6955820760012983739": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "5901470393936541758": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "11561352430430157770": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "12134712464763856064": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3682813162987778705": ["convolution_gpu_bfyx_os_iyx_osv16",1044], + "12659539044474018256": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "17479614483340719566": ["convolution_gpu_bfyx_gemm_like",2], + "15630712601053635938": ["convolution_gpu_bfyx_os_iyx_osv16",1090], + "15314178289202641916": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15385836287435319028": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "13931470674812510958": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "15982499072593548907": ["convolution_gpu_bfyx_os_iyx_osv16",178], + 
"3805991105758534542": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4810979456269693700": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "14387663434151374245": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "8093154215631195896": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "879461985074219072": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "16468779692009938330": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "16507216630035678597": ["convolution_gpu_bfyx_gemm_like",1], + "8525631489886320841": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "9631545863582097486": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "16495435651959280198": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "9192665896782282996": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "14017106221778585861": ["convolution_gpu_bfyx_os_iyx_osv16",686], + "3140230065585683313": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "16620032793356620588": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "6087091876057515304": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15668060723417155782": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "15905812449037427213": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15372944709956866587": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "393130776826919699": ["convolution_gpu_bfyx_gemm_like",2], + "10710426249911063154": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "6213386558868267629": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "17790622334577372736": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "12138341287265949399": ["convolution_gpu_bfyx_gemm_like",1], + "9110265526128628472": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "14322754320861242412": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "11388177266504804841": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "14243609293683870669": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "5385637020152792781": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "10848277915422577656": ["convolution_gpu_bfyx_os_iyx_osv16",421], + "17651949893303962955": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "557926911473978758": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "9133224739401155411": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "6946815194102787268": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "3095800485689583188": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "779633618375662086": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15277856047844308598": ["convolution_gpu_bfyx_gemm_like",2], + "9373353053843326128": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "15619086801947147359": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "8965747921518186477": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "3094541981461578435": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "15444345793124210505": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "2822531372171708171": ["convolution_gpu_bfyx_gemm_like",1], + "15984885011101717258": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "15767973630744679517": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3787897045202294227": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "9285566577169147378": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "8954139494467782298": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4184940877670248246": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "16683089431066989909": ["convolution_gpu_bfyx_gemm_like",2], + "3013359852055354405": ["convolution_gpu_bfyx_os_iyx_osv16",1049], + "15927212142469570269": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "10744779302034526105": ["convolution_gpu_bfyx_gemm_like",1], + 
"10422138282116598013": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6046380638013542109": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18169371857833455144": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15140592697506341614": ["convolution_gpu_bfyx_gemm_like",2], + "15033864286535250007": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "6925829066248055368": ["convolution_gpu_bfyx_gemm_like",2], + "9849272539053219052": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "3892679716763161057": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "3167115892101501516": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "11379252854859166206": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "17829983167337875463": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "10409424254454997557": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8435953773852854494": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "10772763339005937717": ["convolution_gpu_bfyx_gemm_like",2], + "3926585856863002495": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "18269685060032395235": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "872401732136570312": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "13771196685227797262": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "7431849514656037251": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "6754359635395225555": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "10774528268153772208": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "890679620691833367": ["convolution_gpu_bfyx_gemm_like",2], + "871656942964602772": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12976499206227689731": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "7458923250983373160": ["convolution_gpu_bfyx_os_iyx_osv16",995], + "18305785425659656349": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "10869059995205753062": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "7962991673727743706": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "9626028243479089234": ["convolution_gpu_bfyx_gemm_like",2], + "16947969669087411530": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "6391847213494189692": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13816380312874384117": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "7963120178142346699": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "5061053593616346116": ["convolution_gpu_bfyx_gemm_like",2], + "801943727169437597": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "654122557966242717": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "14503814672536990561": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "12693511427898130707": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "1891216794223363114": ["convolution_gpu_bfyx_gemm_like",1], + "2857337999074313592": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "3201851883430682391": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "5876880412336151866": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "914589847837601900": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "1305434952341925041": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "11213283109763090897": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "3290503865540626256": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "12293786134765875615": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "6375149408738336520": ["convolution_gpu_bfyx_gemm_like",2], + "8094836777153039013": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "774981050284188673": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "15529767675448574617": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "15464327246951632247": ["convolution_gpu_bfyx_gemm_like",1], + "3179874645565098825": 
["convolution_gpu_bfyx_os_iyx_osv16",499], + "5776920093461427179": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "8790992468693685188": ["fully_connected_gpu_fb_io_ref",2], + "17608082492919905570": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "5150467145740542480": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "10252930102508743294": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "9660587580162063066": ["convolution_gpu_bfyx_gemm_like",2], + "11850332373794932468": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "10133406610245448421": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "17195491464960153261": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "1557549837620967530": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15197400201857680173": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "11703557271443535142": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "17376180096577763039": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "11353671464383068485": ["convolution_gpu_bfyx_os_iyx_osv16",1068], + "4197617702037834389": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "14322392426975869640": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "6227066883925046010": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "17423645390621980919": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "4957638663977636791": ["convolution_gpu_bfyx_gemm_like",2], + "318377908569897093": ["convolution_gpu_bfyx_gemm_like",2], + "10578656188786691161": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "11800958516083095340": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "8990561333549136048": ["convolution_gpu_bfyx_os_iyx_osv16",1112], + "4072967257556128157": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "4292467512797995948": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7287802938269404923": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "9180575279116075400": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6404731509766519779": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8195881973746570408": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "11582534256623549131": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10792503079194374004": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "8021962180961047152": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "1316444335300814745": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "18136765667969393174": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "9895036366054127607": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "11002656253983635383": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "8481272193490654884": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "3033264172690274208": ["convolution_gpu_bfyx_os_iyx_osv16",853], + "4016652650196255483": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "16159055229009077435": ["convolution_gpu_bfyx_gemm_like",2], + "4573547058027867538": ["convolution_gpu_bfyx_os_iyx_osv16",1016], + "9763310312421884308": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "16165264024659208580": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "3539764293444807886": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "10849780273184392468": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "13835908664998757647": ["fully_connected_gpu_fb_oi_ref",2], + "3511588484597779204": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "12558716383635737426": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "7023033151960653752": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "7636001038842031672": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "13093429681061786539": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "2858694223939965231": 
["convolution_gpu_bfyx_os_iyx_osv16",694], + "4680261350523889008": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "14951164724050668856": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "15594387862678649962": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "2912858944747613525": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "11273554217552152172": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "7662200927459001757": ["convolution_gpu_winograd_6x3_s1_fused",2], + "6438522646185979880": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "80038800201815976": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "1917986916390093536": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2054895351334936744": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "4151997155802743451": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "18213389163198755626": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "12363462562375148101": ["convolution_gpu_bfyx_gemm_like",1], + "11312797737791604596": ["convolution_gpu_bfyx_gemm_like",2], + "15392592805235453180": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "5424159498790442193": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "16601702334097258697": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "7390751298966198773": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "6695224851008237679": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3865480446980740412": ["convolution_gpu_bfyx_gemm_like",2], + "16710010075465723498": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "8141428150264829362": ["convolution_gpu_bfyx_os_iyx_osv16",1033], + "1615155632991337496": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "14326748416648598247": ["convolution_gpu_bfyx_os_iyx_osv16",84], + "2518919454830671073": ["convolution_gpu_bfyx_gemm_like",2], + "17750329428766282997": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "414342067295883061": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "9358320688298379206": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "18139055731468596187": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "6129602738379919488": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "5922243230245842969": ["convolution_gpu_bfyx_gemm_like",2], + "11428599290755097395": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "13387804712929042302": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "11927673108508931485": ["convolution_gpu_bfyx_os_iyx_osv16",458], + "13429534778879474114": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "11066538564303243604": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "4440261013093281358": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "8881150100883636392": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "17325129240374428839": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18074320074700491416": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "12352083215873760290": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "12388375914105990324": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "5601320732740276692": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "10462203417605590793": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "10573920781439771673": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "15451919862187018297": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12561852932488001568": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "17337689605705740533": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "684240994243755872": ["convolution_gpu_bfyx_gemm_like",2], + "10973267399508186283": ["convolution_gpu_bfyx_os_iyx_osv16",191], + "8703051983346886620": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "3807725810350819929": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "6303682540621797774": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "14998412675237613013": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17800494747865760215": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "1241188741090538769": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3689722043202617487": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "2605525859754242318": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "15743075522781198932": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "7903891232234389925": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "1818234431954731769": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "1555841293175143289": ["convolution_gpu_bfyx_gemm_like",2], + "2140514316203117958": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "1691020960118022320": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "15260010680436431377": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "11066913713501760080": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6879801583428507100": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "7945923871349397386": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "737706555781027628": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "3826083535442459719": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "4460838234035901102": ["convolution_gpu_bfyx_gemm_like",2], + "17393241435373906917": ["convolution_gpu_bfyx_os_iyx_osv16",319], + "791937929163665770": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13855910108498240870": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "10591379189397010097": ["convolution_gpu_bfyx_os_iyx_osv16",989], + "13540002981450186147": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "4987922194420804256": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "10665697051755790682": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "18150429561058646714": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "10090923790949378407": ["convolution_gpu_bfyx_gemm_like",2], + "4004333174619528327": ["convolution_gpu_bfyx_gemm_like",1], + "11215297942420903101": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "12260041857695743504": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "15220874718853723626": ["convolution_gpu_bfyx_gemm_like",2], + "17993337310288098038": ["convolution_gpu_bfyx_gemm_like",2], + "12971822824884826169": ["convolution_gpu_bfyx_gemm_like",2], + "6683090495189325653": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "8065408380801722040": ["convolution_gpu_bfyx_os_iyx_osv16",858], + "17370560568464798319": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "1541754036637209097": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "13381441263790184121": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "1742897526168249500": ["convolution_gpu_bfyx_gemm_like",1], + "17508515605648584094": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3142706898070129318": ["convolution_gpu_bfyx_gemm_like",2], + "7833495651619250213": ["convolution_gpu_bfyx_gemm_like",2], + "13972357557211413688": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "9549667332801021099": ["convolution_gpu_bfyx_gemm_like",2], + "11878734040194151073": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11740474593275702888": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "7143510787416483146": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "5648099611567577611": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "8162762980597497749": ["convolution_gpu_bfyx_gemm_like",2], + "8323445733669842657": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2727219457659794468": 
["convolution_gpu_bfyx_os_iyx_osv16",879], + "15825993019555657125": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "13186342942242476803": ["convolution_gpu_bfyx_os_iyx_osv16",1067], + "13267438341255312172": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "16566714514564722975": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "14841539539334726292": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "12278364834477923930": ["convolution_gpu_bfyx_gemm_like",2], + "16348402367953880206": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "16857192626139882429": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "5132761922124425835": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "13353123037511986804": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "265378250397648692": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "18260147016899103633": ["convolution_gpu_bfyx_gemm_like",1], + "8374232727884943288": ["convolution_gpu_bfyx_gemm_like",1], + "2253443114793765536": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16132186023443894579": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "16461300997058854554": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "14122647818827599984": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "16091195788712971747": ["convolution_gpu_bfyx_os_iyx_osv16",476], + "14869125900405603130": ["convolution_gpu_bfyx_os_iyx_osv16",85], + "14532844474906286088": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "4152919461079296700": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "14353390922580547467": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "16062811901668074268": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "17761681290527373180": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "12266072789949082198": ["convolution_gpu_bfyx_gemm_like",2], + "3349519148124496343": ["fully_connected_gpu_bf_io_gemm",2], + "13410178186827874638": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "9226912483632588371": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "13426413463253581310": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "10010921697596131761": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "16042236932298055236": ["convolution_gpu_bfyx_gemm_like",0], + "8713639086785023623": ["convolution_gpu_bfyx_os_iyx_osv16",944], + "3855859061709004677": ["convolution_gpu_bfyx_os_iyx_osv16",969], + "17873182129275583020": ["convolution_gpu_bfyx_gemm_like",2], + "5073980187181521102": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "14214141488645257351": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5390559917122707732": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "8700953648388124963": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "263575476655527355": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "3438116423688595487": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "16273414163942580140": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "8260130048649729185": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "17034122796081495259": ["convolution_gpu_bfyx_gemm_like",2], + "2307310127637739872": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "10835684445936063871": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "4409539711630405776": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "5627351109775149477": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "16692569816843207989": ["convolution_gpu_bfyx_os_iyx_osv16",646], + "14204028212129440429": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "2235888904701517631": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "13947140171097868740": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "10412588668458621135": 
["convolution_gpu_bfyx_os_iyx_osv16",496], + "7177837234452118325": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "2305345466244887603": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "4693778191222244259": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "16126210124715599267": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5440622601084846974": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "6104567430127604601": ["convolution_gpu_bfyx_os_iyx_osv16",665], + "12576360049619146496": ["convolution_gpu_bfyx_gemm_like",2], + "7533669599936874355": ["convolution_gpu_bfyx_os_iyx_osv16",7], + "15217183882858251099": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "3062101811226530720": ["convolution_gpu_bfyx_os_iyx_osv16",673], + "14408266407898585602": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "26434141991791193": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4186957909762095019": ["convolution_gpu_bfyx_os_iyx_osv16",1052], + "17075150439662364176": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "4163359403543480821": ["fully_connected_gpu_bf_io_input_spatial",0], + "12015922610963701033": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "3122997634505472500": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "6075691042233712335": ["convolution_gpu_bfyx_gemm_like",1], + "16763947298003094797": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "7476503420928065329": ["convolution_gpu_bfyx_os_iyx_osv16",995], + "2839767407547705101": ["convolution_gpu_bfyx_gemm_like",2], + "15466940145773097237": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "11208787273440167590": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "14944798586094927774": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "10670829898588047148": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "13842309033760176194": ["convolution_gpu_bfyx_gemm_like",2], + "2588106330058954614": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "4011704860949525864": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "8916983923551808409": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "16207793515276299964": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "192209423643075326": ["convolution_gpu_bfyx_gemm_like",1], + "3495464175121035222": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "13644681270630373984": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "9762182215179534181": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "13869279315296163696": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "14146157492452859667": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "4428125859693766145": ["convolution_gpu_bfyx_gemm_like",2], + "18052322665755789573": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "1383899865465106141": ["convolution_gpu_bfyx_gemm_like",1], + "8420176522157084802": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8619380242063264016": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "2315979511894958580": ["convolution_gpu_bfyx_gemm_like",2], + "8394085742794617896": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "3880189981766119529": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "14862938122758223157": ["convolution_gpu_bfyx_os_iyx_osv16",110], + "5084402281339667158": ["convolution_gpu_bfyx_gemm_like",1], + "3800864312883193560": ["convolution_gpu_bfyx_os_iyx_osv16",318], + "3643056883397245235": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "5812274221348979687": ["convolution_gpu_bfyx_os_iyx_osv16",1019], + "18109284647478027063": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13178480813522103091": ["fully_connected_gpu_bf_io_gemm",2], + "13485431068391184236": 
["convolution_gpu_bfyx_direct_10_12_16",1], + "1096929244128185929": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "10545983240319359348": ["convolution_gpu_bfyx_direct_10_12_16",2], + "555153826947872383": ["convolution_gpu_bfyx_gemm_like",2], + "18194662560696168435": ["convolution_gpu_bfyx_gemm_like",1], + "4892959859293355837": ["convolution_gpu_bfyx_gemm_like",1], + "1361159591875955678": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17699579394941627848": ["convolution_gpu_bfyx_gemm_like",2], + "18106333667377667797": ["convolution_gpu_bfyx_gemm_like",2], + "2424832456352484524": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14559552090809408184": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "14350963106032411355": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "12348135936862667024": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "17347387929692736001": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "7917673216808705075": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "6329618009202266591": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "9381304526221508530": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "7111620180131341264": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "12711366212612147422": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "14605107834931199380": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "17381682740282686038": ["convolution_gpu_bfyx_gemm_like",1], + "553884705007944190": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "4084106758501882407": ["fully_connected_gpu_bf_io_input_spatial",2], + "13748207123919546925": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7822148442995976259": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "7379959915507694400": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "2615550169523847175": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "13400559817638330692": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "17061233750738578337": ["convolution_gpu_bfyx_os_iyx_osv16",852], + "4238163995861108694": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "1961296939362567851": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "11431776034512615562": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11490143853656040028": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "11080118408282076423": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "2038505773698938555": ["fully_connected_gpu_bf_io_gemm",1], + "2797723586312707948": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "5514520264534847093": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "6478247863479663432": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "3621930417735246405": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "14361697687217060995": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6857064389795419021": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "8332688858465419317": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "13094313253457422444": ["convolution_gpu_bfyx_os_iyx_osv16",713], + "13723543003759101485": ["convolution_gpu_bfyx_gemm_like",2], + "2053428297205345660": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "16674897846232931666": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "13646026173083209094": ["convolution_gpu_bfyx_gemm_like",1], + "10253092389452603623": ["convolution_gpu_bfyx_gemm_like",2], + "8012414839721814470": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "11102920976866402928": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16117738994809548007": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14609655423082082099": ["convolution_gpu_bfyx_gemm_like",2], + "5519535335798045279": 
["convolution_gpu_bfyx_gemm_like",1], + "3927333491885837374": ["fully_connected_gpu_fb_oi_ref",2], + "18136968124686255108": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "656536921219262336": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "17140704838989242732": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "13891498649894490342": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "6625355663340809894": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8382355932367801226": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "7486133596762640215": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "15790005937034794347": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "5159470523468873105": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "3457676694935264283": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "4242438539626727158": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "1188428190761098784": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "9996590003462421281": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "16614092873294424156": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8964252048679144533": ["convolution_gpu_bfyx_gemm_like",2], + "17821196374523699955": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14788817017267716113": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "7966454753124154534": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "17377293745073971167": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9824678205469832038": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5864250949922222051": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "7578465277886568471": ["convolution_gpu_bfyx_gemm_like",2], + "7877872008801536537": ["convolution_gpu_bfyx_gemm_like",2], + "12174729877807876787": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "12651215303242591871": ["convolution_gpu_bfyx_gemm_like",2], + "13499476832444042458": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "7596423139159263456": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "10462797712860969072": ["convolution_gpu_bfyx_gemm_like",2], + "12526417587678222534": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6223991300587768990": ["convolution_gpu_bfyx_direct_10_12_16",2], + "709835724029986012": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "5287076386757143976": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "15199659885055090985": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "13510598063226540077": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "11232261979256657934": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "3491333679577961640": ["convolution_gpu_bfyx_gemm_like",2], + "8394337033015371278": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11864780937861562358": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "11883485911218628865": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "11583017348580874022": ["convolution_gpu_bfyx_os_iyx_osv16",111], + "2602209853120236226": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "12136029303893296753": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "8104509697376352086": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "11863623794400366834": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "16071723603031305677": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "5490683510357615963": ["convolution_gpu_bfyx_os_iyx_osv16",346], + "2349007644347065353": ["convolution_gpu_bfyx_gemm_like",2], + "11769511287553067221": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "16286085532892593349": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "7853648744637103420": ["convolution_gpu_bfyx_os_iyx_osv16",509], + "12882754981683858333": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "17387764798693150143": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "7474639594232203854": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "18026754720065676632": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "10942743767167283370": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4161001033681779582": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "708347829794105085": ["convolution_gpu_bfyx_gemm_like",1], + "18372277746801271292": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "3046878786712386934": ["convolution_gpu_bfyx_gemm_like",2], + "15450609897480659306": ["convolution_gpu_bfyx_os_iyx_osv16",929], + "18012549942299450620": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "17119834538806653818": ["convolution_gpu_bfyx_gemm_like",2], + "12635265188475834607": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "15989164585998175871": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "17274625805315816028": ["convolution_gpu_bfyx_gemm_like",1], + "4764776977138392550": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "6366477005383470532": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "4678945085654662665": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "3266638956600784732": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15962137123591591534": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "1708527842474979709": ["convolution_gpu_bfyx_gemm_like",2], + "15038779174806415801": ["convolution_gpu_bfyx_gemm_like",2], + "11901740241052104941": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "18034648276860485300": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "12141300895511301068": ["convolution_gpu_bfyx_os_iyx_osv16",892], + "8415763978601237333": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "9321208819255762521": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "12944449254981328284": ["convolution_gpu_bfyx_os_iyx_osv16",510], + "17422822627612865758": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16351593165006175213": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "16496066467505445971": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "17480519865636248903": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "2830019939638455400": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3547275591884493445": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "18128162750557822655": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "11439519952236570490": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "15334195300678132907": ["fully_connected_gpu_bf_io_gemm",1], + "3497946462254198388": ["convolution_gpu_bfyx_os_iyx_osv16",319], + "13041981853634484809": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "16206791915939407806": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "13343968006718934574": ["convolution_gpu_bfyx_gemm_like",2], + "13489318651148001664": ["convolution_gpu_bfyx_gemm_like",1], + "17856816245251319111": ["convolution_gpu_bfyx_os_iyx_osv16",845], + "139367204458861048": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "4720851194954041037": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "2174528711050181972": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13546876216568825877": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "5566145479615299930": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "10134863884423338495": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "5873257164958285393": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "797387385159110695": ["convolution_gpu_bfyx_gemm_like",1], + "2297846338452062425": ["convolution_gpu_bfyx_gemm_like",2], + "14559308665571750465": 
["convolution_gpu_bfyx_gemm_like",2], + "994489782629179836": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "17838473675663772639": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "772794189370544860": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "13800387305792597325": ["convolution_gpu_bfyx_os_iyx_osv16",1040], + "204378699575356398": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11275526584835606578": ["convolution_gpu_bfyx_gemm_like",1], + "14168946412009689868": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "18259018980049662870": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8403919905230540356": ["fully_connected_gpu_fb_io_ref",2], + "17509205154057032109": ["convolution_gpu_bfyx_os_iyx_osv16",471], + "9213563311267466388": ["convolution_gpu_bfyx_direct_10_12_16",0], + "7584912988728072414": ["convolution_gpu_bfyx_os_iyx_osv16",336], + "4560479630843098090": ["convolution_gpu_bfyx_gemm_like",1], + "3069726952591207961": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15890492401334524258": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "16916632481840858091": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "879939701282942121": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "9031338938030715616": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "2684971093531227585": ["convolution_gpu_bfyx_gemm_like",2], + "9970142663470031403": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "15689502054035168040": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "12932174902085755507": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10681304359334525584": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "15507430010796753396": ["convolution_gpu_bfyx_os_iyx_osv16",476], + "3723082283919334922": ["convolution_gpu_bfyx_gemm_like",2], + "17286180622990393912": ["convolution_gpu_bfyx_gemm_like",2], + "16881320590336043120": ["convolution_gpu_bfyx_os_iyx_osv16",199], + "11178675492112714513": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "2102507337684140674": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "5524215233998361104": ["convolution_gpu_winograd_6x3_s1_fused",2], + "7606277451240586967": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13372079273473545269": ["convolution_gpu_bfyx_gemm_like",2], + "12077176094606956613": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16832083703120717402": ["convolution_gpu_bfyx_gemm_like",2], + "15856268902838573812": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "5930451476167223501": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "14524011013133838054": ["convolution_gpu_bfyx_os_iyx_osv16",729], + "6324194607665787911": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "18057258413318190788": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "6858245954375015939": ["convolution_gpu_bfyx_gemm_like",2], + "2973436171295280783": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "5740738339752793113": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "7092429446071184360": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "14840301687056551916": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "6307840223437204536": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "2758256770667070477": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "17621284804179990612": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15752695063119223631": ["convolution_gpu_bfyx_os_iyx_osv16",430], + "18232278892738147217": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "9322808125154719434": ["convolution_gpu_bfyx_gemm_like",1], + "5019077257951332016": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "10534355502345993326": ["convolution_gpu_bfyx_os_iyx_osv16",333], 
+ "786418751322581924": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15078379507314446744": ["convolution_gpu_bfyx_gemm_like",2], + "11673506380927771816": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "4563407231964979217": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "522181557896569275": ["convolution_gpu_bfyx_gemm_like",0], + "8954957191824520301": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "3055842046969432235": ["convolution_gpu_bfyx_os_iyx_osv16",1065], + "765085235448596225": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "578703329577922869": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "176148486634277377": ["convolution_gpu_bfyx_gemm_like",2], + "1743672154424707483": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3723613341885592267": ["convolution_gpu_bfyx_os_iyx_osv16",6], + "9519113693008246391": ["convolution_gpu_bfyx_os_iyx_osv16",1102], + "3892873577927627992": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "10565789595834959047": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "17791773192152464021": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10743628077362128751": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "10031973538398542700": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "9236621881488650027": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9524303276541517389": ["convolution_gpu_bfyx_gemm_like",2], + "12850610175882424919": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "16822728519529055454": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "10729288973933590396": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "8886676435675463412": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3041752019114501584": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "11726125778063855770": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "7002547494442875680": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "15751445344585167275": ["convolution_gpu_bfyx_os_iyx_osv16",1056], + "1187817806204244044": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "17796310681498690253": ["convolution_gpu_winograd_6x3_s1_fused",2], + "14994322266840011040": ["convolution_gpu_bfyx_gemm_like",2], + "253337639942573142": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "14487682847898298214": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",0], + "12355112948013108181": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "8655883535274781128": ["convolution_gpu_bfyx_gemm_like",1], + "10254790628108678637": ["convolution_gpu_bfyx_gemm_like",1], + "9513218905938141296": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "170594581804738255": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12164298124869114517": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "18415227597391874233": ["convolution_gpu_bfyx_os_iyx_osv16",458], + "17707294419513060769": ["convolution_gpu_bfyx_gemm_like",2], + "15861253904810475842": ["convolution_gpu_bfyx_gemm_like",2], + "6638761803107874904": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11033824757086203326": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "16767657090925788431": ["convolution_gpu_bfyx_gemm_like",2], + "7174790971918109163": ["convolution_gpu_bfyx_os_iyx_osv16",1016], + "18096803908321982720": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "8938942439963723596": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "6447357750120537934": ["convolution_gpu_bfyx_gemm_like",2], + "4355933224673863178": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "13476976389397273052": ["convolution_gpu_bfyx_gemm_like",2], + "16522364268583242080": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "2969389503332309296": 
["convolution_gpu_bfyx_os_iyx_osv16",952], + "16703049240941366828": ["convolution_gpu_bfyx_gemm_like",2], + "14121939808880396150": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "1832310305089212990": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "11044223289209000460": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "13387602037439694372": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9860570706348640782": ["convolution_gpu_bfyx_gemm_like",2], + "104165137500939902": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "7552144047474664265": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "15598527290222497283": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "13881505737488515065": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "1152693503778768433": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "5235375820995365354": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "1014934490175718598": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "15891746043846062984": ["convolution_gpu_bfyx_os_iyx_osv16",1051], + "11782514629636023633": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "1896394898744191046": ["convolution_gpu_bfyx_gemm_like",1], + "9055254157155243850": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "475079717987185580": ["convolution_gpu_bfyx_os_iyx_osv16",198], + "4492673409319122180": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "577182964135927041": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "4344644499804057502": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "15467064540951151390": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16120159001372711511": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "4026686872534942904": ["convolution_gpu_bfyx_os_iyx_osv16",174], + "15643053402284856082": ["convolution_gpu_bfyx_gemm_like",2], + "12181607120522804433": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "17517541283617012275": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "1434535531617424039": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "8321148793275220552": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11078289776590382448": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "14578291812739325465": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "9423239651872522813": ["convolution_gpu_bfyx_gemm_like",2], + "1957975992563882145": ["convolution_gpu_bfyx_os_iyx_osv16",1025], + "530491406341772040": ["convolution_gpu_bfyx_gemm_like",2], + "10104091044601583658": ["convolution_gpu_bfyx_gemm_like",2], + "2686152083115758704": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "6672808203620992802": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "10302338806536775954": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "17172842643607718498": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "17392594284473856393": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6290317420155851465": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "15911508155433936727": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "529543453251381109": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "5020788604681810984": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9305758766575321575": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "14555366228958374512": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "7199295899520406795": ["convolution_gpu_bfyx_gemm_like",2], + "12796777049340516563": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "7020743056013297476": ["convolution_gpu_bfyx_gemm_like",2], + "14071393823183565145": ["convolution_gpu_bfyx_gemm_like",2], + "13602299412525111348": ["convolution_gpu_bfyx_os_iyx_osv16",805], + "12394049027081208902": 
["convolution_gpu_bfyx_os_iyx_osv16",357], + "2627779045483019709": ["convolution_gpu_bfyx_os_iyx_osv16",812], + "1622880009460832832": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "10087048842366891699": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "142345353315012903": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "17802514063213000148": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "4132087699110753428": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8306337702797456793": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "591445875836641836": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "4960466075321426984": ["convolution_gpu_bfyx_os_iyx_osv16",559], + "8995598177504756805": ["convolution_gpu_bfyx_os_iyx_osv16",85], + "15976399554094563736": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "11386443944172875185": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5485050451156514865": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "2659031931257084418": ["convolution_gpu_bfyx_os_iyx_osv16",540], + "16208488491972128275": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "17615365894230830516": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "14118838785256822389": ["convolution_gpu_bfyx_gemm_like",2], + "8866164762286856139": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "97332433783610027": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "17080372737840346243": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "16720108310653948550": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12650986929262866534": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "4477135619420651110": ["convolution_gpu_bfyx_gemm_like",2], + "9040986180016264906": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "1413598669014941757": ["convolution_gpu_bfyx_gemm_like",2], + "7431469348791099474": ["convolution_gpu_bfyx_gemm_like",2], + "16383540667048742064": ["convolution_gpu_bfyx_gemm_like",2], + "13470016086265528105": ["convolution_gpu_bfyx_gemm_like",1], + "5854267518455107328": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15603710070700542017": ["convolution_gpu_bfyx_gemm_like",2], + "5219818570070061892": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1601512693620510391": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "13297691763391637265": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "13455881643467418059": ["convolution_gpu_bfyx_gemm_like",1], + "5706423911886410117": ["convolution_gpu_bfyx_gemm_like",2], + "13503608041359512": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "12951069548510783681": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15591167992985613695": ["convolution_gpu_bfyx_os_iyx_osv16",503], + "5637480705139132901": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "4107088111454348836": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "6124219814856247918": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "10062957707721107508": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "16179959997108523051": ["convolution_gpu_bfyx_gemm_like",2], + "9647713236241614167": ["convolution_gpu_bfyx_gemm_like",2], + "10884966210360699082": ["convolution_gpu_bfyx_gemm_like",1], + "2728956755635458379": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8578774826625315147": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "5414285637221358737": ["convolution_gpu_bfyx_gemm_like",1], + "14172081523880352608": ["convolution_gpu_bfyx_os_iyx_osv16",572], + "15786328370300803713": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "5795073619189010837": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12647099325257717945": ["convolution_gpu_bfyx_gemm_like",2], + 
"13292923826380958700": ["convolution_gpu_bfyx_gemm_like",2], + "18439017855540532958": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "15963038745470172423": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "3683201905077543598": ["convolution_gpu_bfyx_os_iyx_osv16",44], + "5179760459095053114": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11855137287698046529": ["convolution_gpu_bfyx_gemm_like",2], + "15479071839425218367": ["convolution_gpu_bfyx_gemm_like",2], + "4701832665603867798": ["convolution_gpu_bfyx_os_iyx_osv16",618], + "9145357433824567384": ["convolution_gpu_bfyx_os_iyx_osv16",297], + "8797843396807284399": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "11544455862638831851": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3296080624478711270": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "15929970324703663357": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "3388752887767453958": ["convolution_gpu_bfyx_gemm_like",2], + "16181623411787179429": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6345550009198921347": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "7780366826820540504": ["convolution_gpu_bfyx_gemm_like",2], + "4538102435488584866": ["convolution_gpu_bfyx_gemm_like",1], + "7129623351507828661": ["convolution_gpu_bfyx_os_iyx_osv16",723], + "16629493658542781988": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "14177187878748170225": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "4049276089777687996": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "143255828863957128": ["convolution_gpu_bfyx_gemm_like",2], + "12843671306854567956": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "2231648183489019418": ["convolution_gpu_bfyx_os_iyx_osv16",428], + "8882042369902399339": ["convolution_gpu_bfyx_gemm_like",1], + "676641023579624117": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "17009318615658405230": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "13194245601015251743": ["fully_connected_gpu_fb_io_ref",1], + "1641881628032037384": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "9529614587861271730": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "17116941326889312928": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14336344152455180534": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "9559550404190168365": ["convolution_gpu_bfyx_gemm_like",2], + "8985531644129639832": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "875296362957469305": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "10071611039987219440": ["convolution_gpu_bfyx_gemm_like",2], + "17585210048585855482": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "13558603350852076889": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "13839075443229327158": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "7570078010521452080": ["convolution_gpu_bfyx_gemm_like",1], + "7054270030260701612": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "17564338309805484464": ["convolution_gpu_bfyx_os_iyx_osv16",482], + "12847879935060092791": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "16483792160297698151": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "5343186686923330871": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "2438221595194783178": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "294103776081392899": ["convolution_gpu_bfyx_gemm_like",2], + "689445825453914111": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "4729855738455185191": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "6780215829176686721": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "6355395905401306995": ["convolution_gpu_bfyx_gemm_like",2], + "6139574161497189424": 
["convolution_gpu_bfyx_direct_10_12_16",0], + "14420809655798184553": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "1630585964216121575": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "7806129039150321333": ["convolution_gpu_bfyx_gemm_like",2], + "9058996149754556268": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "7630342538679060038": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "9028970753877215614": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "18383733736250135501": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "15995056067568652754": ["convolution_gpu_bfyx_gemm_like",1], + "15129201859573664210": ["convolution_gpu_bfyx_gemm_like",2], + "14695781272831602408": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "12914986936318857086": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "3012268657922581268": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "1160579996766519752": ["convolution_gpu_bfyx_gemm_like",1], + "14381420852659789698": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16522546805419218429": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "12992163255353386581": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "13317417676446624018": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "11071972036962275632": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "1269703478898366518": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "6767245864232675168": ["convolution_gpu_bfyx_gemm_like",1], + "12517838703662330663": ["convolution_gpu_bfyx_os_iyx_osv16",756], + "11530101016435264783": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "16561224775421968533": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "1473214668483422172": ["convolution_gpu_bfyx_gemm_like",1], + "9052153145556623933": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "14037325204801680738": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "2482449683288477640": ["convolution_gpu_bfyx_gemm_like",2], + "6515141738021465336": ["convolution_gpu_bfyx_gemm_like",2], + "16245760498096322525": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "12709406234969954619": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "7963529808900784906": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "9890252170749328138": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5053369963163583573": ["convolution_gpu_bfyx_os_iyx_osv16",856], + "14247451223653900488": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "12698546873263218041": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "10294185397756053636": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "3408249386342406615": ["convolution_gpu_bfyx_gemm_like",1], + "9454028594043242985": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "13401926003864565026": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "8058623285594809047": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "13624969243174329965": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3810356382905059819": ["convolution_gpu_bfyx_gemm_like",1], + "1836277956961261472": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "9164584153555521506": ["convolution_gpu_bfyx_gemm_like",2], + "10265955847846166394": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "7291920886894073603": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "3191047205441946466": ["convolution_gpu_bfyx_gemm_like",0], + "15862793522143880668": ["convolution_gpu_bfyx_os_iyx_osv16",878], + "11932770338770247767": ["convolution_gpu_bfyx_os_iyx_osv16",804], + "8434794604559592624": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "11595387512434355394": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "17035903590837750750": 
["convolution_gpu_bfyx_os_iyx_osv16",498], + "3510837206834640871": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "6729785110495533200": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "14291113322487568376": ["convolution_gpu_bfyx_gemm_like",2], + "13611054146745413536": ["convolution_gpu_bfyx_gemm_like",2], + "11892210755884128272": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "844742962836593299": ["convolution_gpu_bfyx_os_iyx_osv16",675], + "11929531534620071758": ["convolution_gpu_bfyx_os_iyx_osv16",612], + "11191005013126286532": ["convolution_gpu_bfyx_os_iyx_osv16",552], + "13727643349589056375": ["convolution_gpu_bfyx_os_iyx_osv16",439], + "11273168411455998347": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "3509027370372599394": ["fully_connected_gpu_fb_io_ref",2], + "14185215566042478462": ["convolution_gpu_bfyx_os_iyx_osv16",264], + "12927339938362960563": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "4801117903303888658": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "1265277707626014051": ["convolution_gpu_bfyx_os_iyx_osv16",714], + "9428176632140441528": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "4491694127072416122": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "5340016094501559693": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "6150043972317126583": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "1071007164550012186": ["convolution_gpu_bfyx_os_iyx_osv16",21], + "281287280558289393": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "4264078972561407296": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "997155336931700015": ["convolution_gpu_bfyx_gemm_like",2], + "7552049239568474944": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "3280795516668356985": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "8780604510524622314": ["convolution_gpu_bfyx_os_iyx_osv16",893], + "7187734276051878356": ["convolution_gpu_bfyx_gemm_like",2], + "5296506025538423220": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "6688522645556262131": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "13987250743654950733": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "15381014522874131924": ["convolution_gpu_bfyx_os_iyx_osv16",665], + "11026432639515866259": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "3625906783784771100": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "9339038855869763548": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "14907038741687299621": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4206637285289830669": ["convolution_gpu_bfyx_gemm_like",1], + "9266375177690276615": ["convolution_gpu_bfyx_gemm_like",2], + "17543625777838573622": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "5515216528474382598": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15641537661939240413": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "18076129452098771655": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17106086048442658788": ["convolution_gpu_bfyx_gemm_like",2], + "5750277248295796439": ["convolution_gpu_bfyx_os_iyx_osv16",108], + "12815588500303820284": ["convolution_gpu_bfyx_gemm_like",1], + "10809330882739297269": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11359020774437470164": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "4476037346005841003": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13198480749588992978": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15452906059667613512": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4178614913813882037": ["convolution_gpu_bfyx_gemm_like",2], + "1435153323458789173": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "13312401790608349463": ["convolution_gpu_bfyx_gemm_like",1], + 
"11919579121199894437": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "7351443601143314161": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "14418429155823196539": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "17301887391757619741": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "2801141274570069180": ["convolution_gpu_bfyx_os_iyx_osv16",509], + "9883682535839267422": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "1686420552593340731": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "8898449752724034655": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "830147122986411443": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8837079302496539409": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "40704767167309552": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "2995957440356398418": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4316278502963439894": ["convolution_gpu_bfyx_gemm_like",2], + "6149673627320838019": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "14645023135017806432": ["convolution_gpu_bfyx_gemm_like",2], + "13054706902087663592": ["convolution_gpu_bfyx_gemm_like",2], + "17372326727957287976": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "10554266898346470422": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "1779941298820543013": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "10168272404395268951": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "9556219639756304369": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "906587812125311288": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "15406324750533549980": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "6410682026872155392": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "6750003965952674453": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "9438739171104456179": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "4949865765880884373": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "11622271315873664622": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "6278892144796112655": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1090447867763814054": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "11845504142528424662": ["convolution_gpu_bfyx_gemm_like",2], + "11661208196482963286": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "16995919898822376726": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "11354523117287453982": ["convolution_gpu_bfyx_gemm_like",2], + "3239779684432082106": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "15783558375979538895": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13853056718266488510": ["convolution_gpu_bfyx_os_iyx_osv16",883], + "16605697831520435304": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "625469553102754234": ["convolution_gpu_bfyx_gemm_like",2], + "20037669704517227": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "2328951328483718941": ["convolution_gpu_bfyx_gemm_like",2], + "10279778381617181802": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "9513403717116039597": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "12685978195521469707": ["convolution_gpu_bfyx_os_iyx_osv16",189], + "12752101288912456176": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "1294871956977733262": ["convolution_gpu_bfyx_gemm_like",2], + "15692223101958737604": ["convolution_gpu_bfyx_gemm_like",1], + "11453044274130869816": ["convolution_gpu_bfyx_gemm_like",2], + "12379734005351960619": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "5786828339670204894": ["convolution_gpu_bfyx_os_iyx_osv16",276], + "4010650902230520983": ["convolution_gpu_bfyx_gemm_like",0], + "13583272198088247606": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "15134268179029323647": 
["convolution_gpu_bfyx_os_iyx_osv16",1101], + "7395593936948809439": ["convolution_gpu_bfyx_os_iyx_osv16",692], + "3349108500387301004": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12407002532205454767": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "13439896617880328331": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "7004953121070642766": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "6644418194983229139": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "1027438463802481676": ["convolution_gpu_bfyx_gemm_like",2], + "10642327923162019888": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "2242915551775617989": ["convolution_gpu_bfyx_os_iyx_osv16",291], + "5061795324735006354": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "8866736221671835567": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "1421879144542252228": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "16978447917682236120": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6771637612965430926": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "4586246090279043149": ["convolution_gpu_bfyx_gemm_like",2], + "17357800564047774826": ["convolution_gpu_bfyx_gemm_like",2], + "2008999755215725290": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "10916647716124396856": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "981733129438741439": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "7211355951470869591": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "5338109154207406041": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "5031342439443897167": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "16469788155263456039": ["convolution_gpu_bfyx_gemm_like",2], + "14249486431781112226": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "5424164608102708333": ["convolution_gpu_bfyx_gemm_like",2], + "11802527991096689252": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "981197653890885407": ["convolution_gpu_bfyx_gemm_like",1], + "8612114608666892632": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "1019936903773818652": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "13077961697656030315": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "8317140711232187781": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "8169762955969255618": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "123283730755186382": ["convolution_gpu_bfyx_gemm_like",1], + "5083776511235413204": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "5510336500642744696": ["convolution_gpu_bfyx_gemm_like",2], + "9625931001541723278": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "13538051178827008933": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "3985659568982275663": ["convolution_gpu_bfyx_os_iyx_osv16",1124], + "7744787957569714828": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "3378088934862423864": ["convolution_gpu_bfyx_gemm_like",1], + "7978370756654787278": ["convolution_gpu_bfyx_gemm_like",1], + "5779388310240896974": ["convolution_gpu_bfyx_os_iyx_osv16",80], + "17340789730321673934": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "7843833033404155302": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "5670530004773188380": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "3159147743553063163": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "3128856679264648666": ["convolution_gpu_bfyx_gemm_like",1], + "17193614571243427089": ["convolution_gpu_bfyx_gemm_like",2], + "15031089621161080026": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "15156015174611610705": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "8055193939726603877": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "10598995451755327159": ["convolution_gpu_bfyx_os_iyx_osv16",2], + 
"13336847303794450665": ["convolution_gpu_bfyx_gemm_like",2], + "4992668316921598993": ["convolution_gpu_bfyx_gemm_like",1], + "9220830217525628783": ["convolution_gpu_bfyx_gemm_like",2], + "8578747191812631883": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "14915908231779912828": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "4226968857681929488": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "9676055912997166605": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4369680877112803848": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "4745007371868123765": ["convolution_gpu_bfyx_gemm_like",2], + "288825580282908143": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "16932172538978111342": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "13850807749756445264": ["convolution_gpu_bfyx_os_iyx_osv16",477], + "778175413671462719": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "9785114056964539323": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "10704037259494193565": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "11734299455885510243": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "15395497315929884637": ["convolution_gpu_bfyx_os_iyx_osv16",1114], + "17769940507971546305": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "7246177123265734169": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "14848351491062336554": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "14443599718173185176": ["convolution_gpu_bfyx_gemm_like",2], + "4217179485243909459": ["convolution_gpu_bfyx_gemm_like",1], + "13625877249040282040": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "4236174000795439083": ["convolution_gpu_bfyx_gemm_like",2], + "282274448389888221": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "3526580286148537369": ["convolution_gpu_bfyx_gemm_like",2], + "14257161696605459633": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "12529210672030682764": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "6768451741770053089": ["convolution_gpu_bfyx_gemm_like",2], + "15943174060386142134": ["convolution_gpu_bfyx_os_iyx_osv16",186], + "16415344078703911571": ["convolution_gpu_bfyx_gemm_like",2], + "15822975685755664152": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "6577240413312348523": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "11668043528929060706": ["convolution_gpu_bfyx_gemm_like",1], + "15379595951542162189": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2056766012044921101": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "2384942244346844027": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "6400671582981760192": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9746964858035717775": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "7164580481046523192": ["convolution_gpu_bfyx_os_iyx_osv16",914], + "2100891581797371600": ["convolution_gpu_bfyx_os_iyx_osv16",274], + "9583760104223104233": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13956744866244022582": ["convolution_gpu_bfyx_gemm_like",2], + "14403780921831769097": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "12956535344568057480": ["convolution_gpu_bfyx_os_iyx_osv16",84], + "1753515740487760297": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "10160082844961863335": ["convolution_gpu_bfyx_os_iyx_osv16",199], + "11875516764635427358": ["convolution_gpu_bfyx_os_iyx_osv16",133], + "12242618640422208652": ["convolution_gpu_bfyx_gemm_like",0], + "12761366575293006784": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "15051114821536746998": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "6706802683366112205": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "3661305534604931936": 
["convolution_gpu_bfyx_os_iyx_osv16",895], + "3598116387801985039": ["convolution_gpu_bfyx_os_iyx_osv16",676], + "12478421208861550581": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "4750894407873652809": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "13066055561434178894": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "18160969423211875528": ["convolution_gpu_bfyx_os_iyx_osv16",835], + "104321144590863458": ["convolution_gpu_bfyx_gemm_like",2], + "9008848676120441863": ["convolution_gpu_bfyx_gemm_like",2], + "4695273549696315193": ["convolution_gpu_bfyx_gemm_like",2], + "14281201038135286621": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "4563773888811395621": ["convolution_gpu_bfyx_gemm_like",2], + "5351705572686943348": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "2647922515901529845": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "296202142406900242": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "16094174852600023296": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "13342769641176584743": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "10468562355439385073": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "4503960445974334415": ["convolution_gpu_bfyx_os_iyx_osv16",805], + "3120553928584920777": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "9492331996847106233": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "7107513718824525169": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "11376522803174788945": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1772363899841601255": ["convolution_gpu_bfyx_os_iyx_osv16",938], + "16715151641337602113": ["convolution_gpu_bfyx_gemm_like",1], + "7997955859883990923": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "6474882514032493642": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "13348855287761849180": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15922076723067110929": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "3980754726678047241": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "9794061741834174000": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "7869916853707978306": ["convolution_gpu_bfyx_os_iyx_osv16",459], + "7410220112400588068": ["convolution_gpu_bfyx_gemm_like",2], + "12323840136934980793": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "13110173649734084688": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "13411431109933021193": ["convolution_gpu_bfyx_gemm_like",2], + "9152451371616153112": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3590316457726550768": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "12942085219027232135": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "4818231379191523896": ["convolution_gpu_bfyx_os_iyx_osv16",290], + "8981229334098733320": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "583303098958523195": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "14682537852514419239": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1884327428051733366": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10049571207493913006": ["convolution_gpu_bfyx_os_iyx_osv16",968], + "2301409406426420354": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "11091004452522208782": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "7353563160591978243": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "12386437738920143482": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "1660279112011537957": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "6483208845600234755": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "15239764240622554314": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "14729854278671832528": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "16122815225820081176": 
["convolution_gpu_bfyx_os_iyx_osv16",1085], + "14131851237755716991": ["convolution_gpu_bfyx_os_iyx_osv16",364], + "10599639229366933472": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "11674725184029885494": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "12225119940380026093": ["convolution_gpu_bfyx_os_iyx_osv16",1034], + "10908411570889102154": ["convolution_gpu_bfyx_gemm_like",1], + "15227034948424983496": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "17659601542171299562": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "12895496994338720556": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2506154888542197909": ["convolution_gpu_bfyx_os_iyx_osv16",860], + "15824189967727245909": ["convolution_gpu_bfyx_gemm_like",2], + "12040626513219974957": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "11534123522633460320": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11922163303962372849": ["convolution_gpu_bfyx_gemm_like",1], + "11357813056434049302": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "2950917846016525392": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "15156525717629023944": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "6172851296465788161": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "6432519735121751346": ["convolution_gpu_bfyx_gemm_like",1], + "14685573786743639408": ["convolution_gpu_bfyx_gemm_like",1], + "3928266232090746643": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "11141999085710526242": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "8951503172834790833": ["convolution_gpu_bfyx_gemm_like",2], + "13498795599230228492": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "16815680874311765189": ["convolution_gpu_bfyx_gemm_like",2], + "13886526360627032217": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "10476627457539425144": ["convolution_gpu_bfyx_gemm_like",2], + "10110395703775498948": ["convolution_gpu_bfyx_os_iyx_osv16",376], + "15897477855246170861": ["convolution_gpu_bfyx_gemm_like",2], + "17065380294456704620": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "13441117085490814804": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "272730229972987861": ["convolution_gpu_bfyx_os_iyx_osv16",1034], + "13698389420396031586": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "14601912265050074833": ["convolution_gpu_bfyx_gemm_like",2], + "5816730482014477109": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "14821616804286068969": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "10016243001407196485": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1502236537645808646": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "852092858392507925": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "14682894856346977838": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "15354185859262170540": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "60749853744407778": ["convolution_gpu_bfyx_gemm_like",2], + "5032866547826271476": ["convolution_gpu_bfyx_os_iyx_osv16",252], + "12630173933512965589": ["convolution_gpu_bfyx_gemm_like",2], + "3297036980627776719": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17160915544701715607": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "13285123703712436126": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "682912708716537431": ["convolution_gpu_bfyx_gemm_like",2], + "14454927839795553295": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "16039372573821594566": ["convolution_gpu_bfyx_gemm_like",2], + "9929060811766882316": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "11455843788148231615": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "7043547563530810431": 
["convolution_gpu_bfyx_os_iyx_osv16",461], + "11546295514640813785": ["convolution_gpu_bfyx_gemm_like",2], + "7693556065684619275": ["convolution_gpu_bfyx_os_iyx_osv16",568], + "16129296588866116913": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "8618835732380720921": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "11906319144823550582": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "16589607587365212240": ["convolution_gpu_bfyx_gemm_like",2], + "815847426244665239": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9148379585489720669": ["convolution_gpu_bfyx_os_iyx_osv16",845], + "4212194737559719449": ["convolution_gpu_bfyx_gemm_like",0], + "2352142833866194508": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "7924408980408826942": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "568114041320772862": ["convolution_gpu_bfyx_gemm_like",2], + "10616832946298118456": ["convolution_gpu_bfyx_gemm_like",2], + "14581447673401303181": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "26773921190137993": ["convolution_gpu_bfyx_gemm_like",2], + "969746749329671447": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "3762117189312286955": ["convolution_gpu_bfyx_gemm_like",2], + "17453621319901961773": ["convolution_gpu_bfyx_os_iyx_osv16",139], + "4565037760028957581": ["convolution_gpu_bfyx_os_iyx_osv16",852], + "15578217564714846277": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "8697631439739291302": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "7313000297447719088": ["convolution_gpu_bfyx_gemm_like",2], + "13993319023992950944": ["convolution_gpu_bfyx_gemm_like",2], + "11796671083187280457": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "15637565679147396649": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "4806571630436601566": ["fully_connected_gpu_bf_io_input_spatial",4], + "14385995236701277049": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "6031307393395339699": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "7000524935770116969": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "15432337846778101995": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "2722601800398376127": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "15616954046484566002": ["convolution_gpu_bfyx_gemm_like",2], + "15830721134654889992": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "7974918595373182037": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "8178825467227185946": ["convolution_gpu_bfyx_gemm_like",2], + "386749666417295495": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "13102754309439605192": ["convolution_gpu_bfyx_gemm_like",2], + "18372284940315010254": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "13691555384698806010": ["convolution_gpu_bfyx_gemm_like",1], + "15863633107759120207": ["convolution_gpu_bfyx_gemm_like",1], + "12511186263003392018": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "9954050478761346921": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "11686670048744589243": ["convolution_gpu_bfyx_gemm_like",2], + "15168098632351740923": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "16650590194585316886": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "2743892624333411461": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "17177353407003831190": ["convolution_gpu_bfyx_gemm_like",2], + "3292554262586950764": ["convolution_gpu_bfyx_gemm_like",2], + "5635504912415420460": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "11716771904412649891": ["convolution_gpu_bfyx_os_iyx_osv16",52], + "3002986032379998259": ["convolution_gpu_bfyx_os_iyx_osv16",151], + "10309083227104422150": ["convolution_gpu_bfyx_os_iyx_osv16",616], + "11067412830219638639": 
["convolution_gpu_bfyx_os_iyx_osv16",381], + "14865708345458193472": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "15464714725848277081": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10716913534741102635": ["convolution_gpu_bfyx_os_iyx_osv16",483], + "3596159214965874273": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "11210961619302975072": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8319405652132127420": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9980945809859857871": ["convolution_gpu_bfyx_gemm_like",2], + "13858485871773319706": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "13319880343534837963": ["convolution_gpu_bfyx_gemm_like",1], + "6983900601570231321": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "11897886369869427808": ["convolution_gpu_bfyx_gemm_like",2], + "8048617952947915835": ["convolution_gpu_bfyx_gemm_like",2], + "16540183777173974162": ["convolution_gpu_bfyx_gemm_like",1], + "7852745450437172519": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4862529593282936100": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "17052596472114345717": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "7732899312577293959": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "12458305535453345462": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13255006150107668739": ["convolution_gpu_bfyx_gemm_like",2], + "17097621900023182992": ["convolution_gpu_bfyx_gemm_like",2], + "14523905821262502926": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "5687802882700097624": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "11115684531624462986": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "16162899163122139501": ["fully_connected_gpu_fb_io_ref",1], + "15891505875671050928": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10271474583233390474": ["convolution_gpu_bfyx_os_iyx_osv16",155], + "4640696923527766618": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "13473730516782884152": ["convolution_gpu_bfyx_gemm_like",2], + "9245770108138984525": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4959718589070770515": ["convolution_gpu_bfyx_os_iyx_osv16",344], + "3934290309368153435": ["fully_connected_gpu_bf_io_gemm",1], + "13234170505677988638": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "45545661884854912": ["convolution_gpu_bfyx_os_iyx_osv16",1051], + "5311718276151327830": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "12896159402462325805": ["convolution_gpu_bfyx_os_iyx_osv16",888], + "14647949921048404551": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "9979259596137305973": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "5327803911898085293": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "10196332102593337214": ["convolution_gpu_bfyx_gemm_like",1], + "4793007249026943006": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11012846743944132853": ["convolution_gpu_bfyx_gemm_like",2], + "4713580645061462578": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "11576182324195008022": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9673176853197584682": ["convolution_gpu_bfyx_gemm_like",1], + "3935404533406270186": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13358754652597677285": ["convolution_gpu_bfyx_os_iyx_osv16",674], + "5246229312484886433": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "15939309688773899430": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3805854200552708060": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "5219048275475447369": ["convolution_gpu_bfyx_gemm_like",2], + "2832331506191733785": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "710656784939783221": 
["convolution_gpu_bfyx_os_iyx_osv16",519], + "8306931146242110738": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "118898027441804310": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "12894240573737168362": ["convolution_gpu_bfyx_os_iyx_osv16",941], + "7941359635463232326": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "18418073826375395057": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3935174650108042053": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "10989937450490049763": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "288853243482418538": ["convolution_gpu_bfyx_os_iyx_osv16",874], + "10930115765550856328": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "7994179151788368291": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "5342657840254586591": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16744011463988595802": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6025872155179042054": ["convolution_gpu_bfyx_gemm_like",2], + "9562291747339451180": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "8739570656208259296": ["convolution_gpu_bfyx_os_iyx_osv16",737], + "7086554406050778468": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "16342158355942808662": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15887938842582811165": ["convolution_gpu_bfyx_os_iyx_osv16",338], + "4211445170027080823": ["convolution_gpu_bfyx_os_iyx_osv16",717], + "3272017687600371031": ["convolution_gpu_bfyx_gemm_like",2], + "18118237182023167949": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "15487538714246568015": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "17641726060706984007": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "8449108317864057899": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "16536775289334717044": ["convolution_gpu_bfyx_os_iyx_osv16",431], + "13150876648527896999": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "12223993560805441284": ["convolution_gpu_bfyx_gemm_like",2], + "8779987507326777359": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8215519118071138614": ["convolution_gpu_bfyx_gemm_like",2], + "9069245927173134634": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1201692134690347847": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "3120885087070223590": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "6728889146307098720": ["convolution_gpu_bfyx_gemm_like",1], + "14004618842373739106": ["convolution_gpu_bfyx_gemm_like",2], + "16741985699154392565": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "8176520928011006903": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14213127286928643795": ["convolution_gpu_bfyx_gemm_like",2], + "1336477297334930004": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12565318283493666631": ["convolution_gpu_bfyx_os_iyx_osv16",1043], + "11901687795497708884": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "11858246418724176452": ["convolution_gpu_bfyx_gemm_like",1], + "17355826643208208691": ["convolution_gpu_bfyx_gemm_like",2], + "1573498199681662714": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "1204089510255285420": ["convolution_gpu_bfyx_gemm_like",2], + "7727001441358508665": ["convolution_gpu_bfyx_os_iyx_osv16",22], + "12621528958448913800": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "1941341635794709702": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "2768512766772748723": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "15579919505002150556": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "16352438188558979362": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "3594327736281012643": ["convolution_gpu_bfyx_os_iyx_osv16",299], + "8281411537393664160": 
["convolution_gpu_bfyx_direct_10_12_16",1], + "8843585527713905568": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "11152834864013527469": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "5384134329664434112": ["convolution_gpu_bfyx_os_iyx_osv16",1063], + "16749148369456398030": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "10522649794540845800": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "17197868427757781334": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "14463841899941062548": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "8734220847509054149": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "15597522934012485452": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "8770858724416759637": ["convolution_gpu_bfyx_gemm_like",2], + "3651651926851660222": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "1460916897832302487": ["convolution_gpu_bfyx_gemm_like",2], + "2251572761614039612": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "17503210896556316294": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "12308359047798183133": ["convolution_gpu_bfyx_os_iyx_osv16",548], + "14547907449418439737": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "15618891972122000521": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "11771014003680394135": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "10380031655567712558": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "9516288831713776693": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "13550337096609413041": ["convolution_gpu_bfyx_gemm_like",2], + "17459500507201824299": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "18379763351534914922": ["convolution_gpu_bfyx_os_iyx_osv16",140], + "1509728225855233852": ["convolution_gpu_bfyx_gemm_like",2], + "15993427814066246646": ["convolution_gpu_bfyx_gemm_like",1], + "7781809277449433812": ["convolution_gpu_bfyx_gemm_like",2], + "9003196270667188479": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3034466284781235431": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "11198378813600875939": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "5509852360472061267": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "11682323163346544125": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "1239861345413267621": ["convolution_gpu_bfyx_gemm_like",2], + "1720791539242542292": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "2419819939573989749": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "4062706195708729345": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "9390919808369333231": ["convolution_gpu_bfyx_gemm_like",2], + "11882021989615795558": ["convolution_gpu_bfyx_os_iyx_osv16",381], + "4003468969524607815": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "12169148580322697755": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "3750338655074082587": ["fully_connected_gpu_yxfb_ref",2], + "14524678598440880756": ["convolution_gpu_bfyx_os_iyx_osv16",832], + "5336120047683197088": ["convolution_gpu_bfyx_gemm_like",2], + "14592395793778583608": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "1781189282179491198": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "16587387608532583713": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "11205571992835612111": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "14674266217397415571": ["convolution_gpu_bfyx_gemm_like",2], + "8642397690605957294": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "172303227623890951": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "17855733925989425515": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "13982221711075598070": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "13337122303005980542": 
["convolution_gpu_bfyx_os_iyx_osv16",344], + "5134857932624749530": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8740196547852036537": ["convolution_gpu_bfyx_gemm_like",2], + "9781830607177020570": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "11297512843662536362": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "16071030448801649281": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "4160656836528944651": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "13713501506522022845": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4034250407843183678": ["convolution_gpu_bfyx_gemm_like",1], + "3661361503342294227": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "7247891577022043949": ["convolution_gpu_bfyx_gemm_like",2], + "15628121900226431719": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11992353959766718397": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "8734419426540206087": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "2559310381697374321": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "6659313690133629176": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "7617123358753247310": ["fully_connected_gpu_fb_io_ref",2], + "10784905418636316601": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "7999747927804607567": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "14670952132900619664": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "4276712095427918904": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "3806791682244402910": ["convolution_gpu_bfyx_os_iyx_osv16",1088], + "11879484013890539145": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "9287404618748313247": ["convolution_gpu_bfyx_gemm_like",1], + "11149782181562145291": ["convolution_gpu_bfyx_gemm_like",2], + "13952295742818866246": ["convolution_gpu_bfyx_os_iyx_osv16",885], + "7638626850074132214": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "4544242784357021697": ["convolution_gpu_bfyx_gemm_like",2], + "2094546483928406874": ["convolution_gpu_bfyx_gemm_like",1], + "3831201505512446456": ["convolution_gpu_bfyx_gemm_like",0], + "14097319816812992451": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "8268533335852735248": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "7279393739634103483": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "38736266675995457": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "13661880440426932218": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "7590767013583950613": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1617362484243823916": ["convolution_gpu_bfyx_os_iyx_osv16",1028], + "5659168916726488798": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "1559798212423183813": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "14491949194619001237": ["convolution_gpu_bfyx_os_iyx_osv16",813], + "8819268903800581706": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "16320454719906370247": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "10972033292930619311": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "16230621843665445228": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "8892991171111842341": ["convolution_gpu_bfyx_gemm_like",2], + "323234725943768094": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "4670443882075998209": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "15287650965861631130": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "15818237122613168508": ["convolution_gpu_bfyx_gemm_like",0], + "6542486391263861823": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "15938703221521364046": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "8333743604646422982": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "5334291640387922287": ["convolution_gpu_bfyx_os_iyx_osv16",358], 
+ "4800208854712166990": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "16590893345666612869": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "12601126285773042005": ["convolution_gpu_bfyx_os_iyx_osv16",1055], + "3012566432840424198": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "14733291836016183044": ["convolution_gpu_bfyx_gemm_like",2], + "15494543914974994991": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "1081287304647703427": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "11609821372586026178": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "1698321314111848001": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "10961049607808752432": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "6161072079255825074": ["convolution_gpu_bfyx_gemm_like",2], + "10392013312924273545": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "10400727836871462348": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "11494395549955384747": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "3329610414149222728": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8986253016099337778": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "12606196670791209919": ["convolution_gpu_bfyx_gemm_like",2], + "13484950419220835364": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "8863731258634577277": ["convolution_gpu_bfyx_gemm_like",2], + "2586132860307138964": ["convolution_gpu_bfyx_gemm_like",2], + "2844746478867668588": ["convolution_gpu_bfyx_gemm_like",2], + "12553441041059632729": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5485971317082563152": ["convolution_gpu_bfyx_os_iyx_osv16",252], + "1400089266180918877": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "17522452942286240233": ["convolution_gpu_bfyx_gemm_like",2], + "7947870656736319919": ["convolution_gpu_bfyx_os_iyx_osv16",49], + "15417738436777481469": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "4917595053453614536": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "14603590053512154268": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "5688623850477433571": ["convolution_gpu_bfyx_gemm_like",2], + "17790026124881397912": ["fully_connected_gpu_fb_io_ref",1], + "4112696777811320312": ["convolution_gpu_bfyx_os_iyx_osv16",995], + "1474271081523145413": ["convolution_gpu_bfyx_gemm_like",2], + "10773411423039491193": ["convolution_gpu_bfyx_gemm_like",2], + "11809236497308682596": ["convolution_gpu_bfyx_gemm_like",1], + "2146633923143071497": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "2968144776497288135": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "3311449696894745049": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "17472252137354770318": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "7271236108345900406": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "17024388383581997032": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "3691705516240577130": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7026575758396092435": ["convolution_gpu_bfyx_os_iyx_osv16",172], + "15898888434295644774": ["convolution_gpu_bfyx_gemm_like",1], + "13046322179198317310": ["convolution_gpu_bfyx_os_iyx_osv16",883], + "11897113890115321056": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "15661055655577513377": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "12965800692507042874": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "287386909600391846": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "14800933038795670868": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "10721811813682112908": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "14762859593402798050": ["convolution_gpu_bfyx_gemm_like",2], + "4561874206785244358": 
["convolution_gpu_bfyx_os_iyx_osv16",40], + "15972805725107234322": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "11140864132614066113": ["convolution_gpu_bfyx_gemm_like",2], + "822162932339827810": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "2909728331855309274": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "16261543808418336089": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "15228614030349540878": ["convolution_gpu_bfyx_gemm_like",1], + "6335628260431943016": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "6545814945227676265": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "3007505068107685147": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "13722424507812159961": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "6418327009347170687": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "15659671804906879034": ["convolution_gpu_bfyx_gemm_like",2], + "15893297349596399716": ["convolution_gpu_bfyx_gemm_like",1], + "6612243861034102250": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "3913951712614107871": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2546472090573813082": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "1013207188944763398": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2679903779216253668": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "11409066626289209846": ["convolution_gpu_bfyx_os_iyx_osv16",351], + "7386836350136973872": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "16211466749116679534": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "1403373982815401451": ["convolution_gpu_bfyx_gemm_like",1], + "7126601602274920416": ["convolution_gpu_bfyx_gemm_like",2], + "8790625191540101806": ["convolution_gpu_bfyx_gemm_like",1], + "11914756126771310827": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "17224820843490443805": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "13683563727561197895": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "14159293183840880884": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "14763982961176216679": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11885660439698926227": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "13448159575961515854": ["convolution_gpu_bfyx_gemm_like",0], + "13779700363254765602": ["convolution_gpu_bfyx_gemm_like",2], + "18125075313255528454": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "17252449599613270108": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "2260718905219541967": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "5688161172644782612": ["convolution_gpu_bfyx_gemm_like",1], + "12896164738668798380": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "5635500901926740475": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "17691748026963003695": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "3513523165606656242": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "15754688305730191542": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "37061093840513038": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6830643729780599672": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "13865227850818392065": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "8779164026828163571": ["convolution_gpu_bfyx_gemm_like",1], + "352808518345312040": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "4014667229872705228": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16835545111241063900": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "9343876424591024597": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "11092828091552833150": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "16705621644424684055": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "3064765745900772872": 
["convolution_gpu_bfyx_os_iyx_osv16",895], + "2008064690158516711": ["convolution_gpu_bfyx_gemm_like",2], + "11447737411040418462": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "16485921493309285440": ["convolution_gpu_bfyx_gemm_like",2], + "17465517455679097501": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "7589346100701197023": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "16615858951735101760": ["fully_connected_gpu_fb_io_ref",1], + "13551767519605460627": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "3830091089824446164": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "5758223108250439377": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "4399656162365214694": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "15571801737237063594": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "6236857636305802170": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "8769060267707904998": ["convolution_gpu_winograd_6x3_s1_fused",2], + "1334070221835422461": ["convolution_gpu_bfyx_gemm_like",2], + "8631194673451861459": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "3392632422002516166": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "9402935157379983392": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10001963042016663554": ["convolution_gpu_bfyx_direct_10_12_16",0], + "13680502636898130714": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "5503904988517480229": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "9561367273233389233": ["convolution_gpu_bfyx_gemm_like",2], + "17495070522944546801": ["convolution_gpu_bfyx_os_iyx_osv16",679], + "5176939691838030517": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "4942131377140353094": ["convolution_gpu_bfyx_gemm_like",0], + "14946519992043402896": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "5398895598407183682": ["convolution_gpu_bfyx_gemm_like",2], + "12478309735214802531": ["convolution_gpu_bfyx_os_iyx_osv16",467], + "13753670205703732353": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "2148648022160178995": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "6772239376357727149": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "596528462327775677": ["convolution_gpu_bfyx_os_iyx_osv16",687], + "7512702933193596918": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "9644723852089512961": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "264371219192743152": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "8663545677000846511": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "7200893702912130808": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "5718747983756317198": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2850803473613487020": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "2335783507270234825": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "3088402690095697589": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "10112032316939871435": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "1211404528755199615": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11521288355888665606": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "4673127824919879657": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15816980369722540994": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "5781431860747226742": ["convolution_gpu_bfyx_gemm_like",2], + "15365776263895633531": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "3389739049224815652": ["convolution_gpu_bfyx_gemm_like",2], + "7877637636782924097": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "18398231411109020099": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "17520777331163825810": ["convolution_gpu_bfyx_gemm_like",2], + "16462862831307415504": 
["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "5348059680010171141": ["convolution_gpu_bfyx_gemm_like",1], + "7289907211627391947": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "5378151578014945610": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "5629582391075745771": ["convolution_gpu_bfyx_os_iyx_osv16",652], + "11607736973932389832": ["convolution_gpu_bfyx_gemm_like",0], + "2598910952085172410": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "1628593159980574595": ["convolution_gpu_bfyx_os_iyx_osv16",622], + "17342603054992556378": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "3332444589775844154": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "4136736579788862192": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "13161798453564436688": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "18429276095695345973": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "2653651564133701304": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "15962533525948221648": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "9539616823548370185": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "9914440875772341708": ["convolution_gpu_bfyx_gemm_like",1], + "14484004336536993120": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "10978173291465325823": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "7065121716452374910": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "9839670675413379092": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8854234880878427078": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "1194267934213722567": ["convolution_gpu_bfyx_os_iyx_osv16",892], + "2387389473399444503": ["convolution_gpu_bfyx_os_iyx_osv16",678], + "10775785602937893911": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7875724726741958520": ["fully_connected_gpu_fb_oi_ref",2], + "8124166677361481618": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "5057534502588100071": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "9267417754412894234": ["convolution_gpu_bfyx_os_iyx_osv16",361], + "14389915292223442327": ["convolution_gpu_bfyx_os_iyx_osv16",459], + "5896089609470353090": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "2908249767551054613": ["convolution_gpu_bfyx_os_iyx_osv16",641], + "5119087113905313336": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4104062066031480003": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "3600066510593746268": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "16998508915819714690": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "10857567623940140266": ["fully_connected_gpu_fb_io_ref",1], + "18199526506796726885": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "5149553691611520515": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "11311890411536750673": ["convolution_gpu_bfyx_gemm_like",2], + "3974589991022739479": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "859377216693940737": ["convolution_gpu_bfyx_gemm_like",2], + "2915952195141872726": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "9274179337770060652": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "14142812374094816721": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "8100595788531468781": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "12994819742376207273": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "12057000101434512661": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "11047759270093007856": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "15688186132508213638": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "12715500118796263683": ["convolution_gpu_bfyx_gemm_like",2], + "2830742500858558621": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "12445292008737311977": 
["convolution_gpu_bfyx_gemm_like",2], + "15158997684077722015": ["convolution_gpu_bfyx_os_iyx_osv16",49], + "13004055504657277105": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "8347537383976709519": ["convolution_gpu_bfyx_os_iyx_osv16",805], + "13398875754083902831": ["fully_connected_gpu_yxfb_ref",2], + "16450345154125804290": ["convolution_gpu_bfyx_os_iyx_osv16",183], + "10900880512948479338": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "6418748992581951435": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "5848293219267886434": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "17882819773586674851": ["convolution_gpu_bfyx_os_iyx_osv16",135], + "12642701787250074691": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "4642402648038764246": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "17026348860895225619": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "4554398307153171456": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "4445257000541366640": ["convolution_gpu_bfyx_os_iyx_osv16",416], + "4682062886371423209": ["convolution_gpu_bfyx_gemm_like",2], + "8337457116169698090": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "14969813450703071948": ["convolution_gpu_bfyx_gemm_like",1], + "14167086447992316314": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1961348920992050029": ["convolution_gpu_bfyx_os_iyx_osv16",484], + "8650948093564284852": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "2171768477223405739": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "13204120207726209723": ["fully_connected_gpu_bf_io_gemm",2], + "14218701503304823803": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "16402312692470500253": ["convolution_gpu_bfyx_gemm_like",2], + "541744773413565297": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "18173314625562011976": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14224121742920800990": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "13647773816638053437": ["convolution_gpu_bfyx_gemm_like",2], + "1143214652021653634": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "3300655231758263066": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "13985989113434682460": ["convolution_gpu_bfyx_gemm_like",1], + "16576300898841314587": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "4082218299236753259": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "3138712043201001156": ["convolution_gpu_bfyx_gemm_like",2], + "9493034132406318197": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "2984236836610169934": ["convolution_gpu_bfyx_os_iyx_osv16",142], + "18419183012101393192": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "16969463538496570528": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "9351428703239678614": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "17546650302679801134": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "5089359404080552270": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "11970881115757095265": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "5890683283363730941": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "6678101356115372537": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1838534101161814609": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "17646394278957547470": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "2651385050387738902": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "7875272450497189442": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "4792657031481471098": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "13423515205322319913": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "1431307776181554710": ["convolution_gpu_bfyx_gemm_like",2], + "8146945902795164796": 
["convolution_gpu_bfyx_os_iyx_osv16",698], + "9771430089730856496": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "17308907916370632622": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "13374993751390784382": ["convolution_gpu_bfyx_os_iyx_osv16",1070], + "13435416060730279243": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "10842505566649585090": ["convolution_gpu_bfyx_gemm_like",1], + "6326191473779365124": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2705534741438659581": ["convolution_gpu_bfyx_os_iyx_osv16",475], + "17050143605017295447": ["convolution_gpu_bfyx_gemm_like",2], + "11307721164906705899": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "11352094952907979172": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "14512311371993445906": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "13076343553185159307": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2832311883163804015": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "3182329375739242693": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "1077773457856682663": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "17294244481988344762": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "5401946420641519048": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "10526411638069090068": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "8181704316455400709": ["convolution_gpu_bfyx_gemm_like",2], + "16462033126494826292": ["convolution_gpu_bfyx_gemm_like",2], + "12547252593506448096": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "5321807316257768": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "1071663904249509302": ["convolution_gpu_bfyx_gemm_like",2], + "1878953827218615252": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "8321769923556905957": ["convolution_gpu_bfyx_gemm_like",1], + "7053070767227498983": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "12318427976031000768": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "3060709449176556770": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "16936366288366370882": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15485701086886851362": ["convolution_gpu_bfyx_os_iyx_osv16",931], + "14741012384358891350": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "3626743386403140330": ["convolution_gpu_bfyx_gemm_like",1], + "16134637021630473012": ["convolution_gpu_bfyx_gemm_like",1], + "15026219694198820614": ["convolution_gpu_bfyx_os_iyx_osv16",835], + "15671873744670386067": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "2870715678422088243": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "5103094815475470596": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "3430998232987873998": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "1127844465496534455": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15958017891397409552": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "13477416097954638887": ["fully_connected_gpu_bf_io_gemm",1], + "2010255131587843361": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "11679869968143173159": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "1154469970162137785": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "14762599606783897222": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "4994591211723226974": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "8260689555974656662": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "11206468937763516689": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "1265107284215037966": ["convolution_gpu_bfyx_gemm_like",2], + "6616869272699525153": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "6953499208425592115": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "879896719155824868": ["convolution_gpu_bfyx_gemm_like",2], + 
"10111038481447198008": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "6519443541076418301": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "11636129433022017868": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "7253709516917901897": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "10236258478395201152": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "15513894336778253285": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "8942221095468681112": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "13410850301164057911": ["convolution_gpu_bfyx_os_iyx_osv16",252], + "6571438978296387721": ["convolution_gpu_bfyx_gemm_like",2], + "2020044486043617858": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "14233219774448115529": ["convolution_gpu_bfyx_gemm_like",2], + "9770300588867836071": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "191374388179598660": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "4184357870886924038": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "6235132681081375078": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "13297875917250935192": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "14577775579978745344": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "9724624621108712962": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "17638692805430115529": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "9729771183572950642": ["convolution_gpu_bfyx_gemm_like",1], + "11327228813412934262": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "9212091835906796243": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "8528750110601691390": ["convolution_gpu_bfyx_direct_10_12_16",0], + "4737347018334654530": ["convolution_gpu_bfyx_os_iyx_osv16",94], + "17829854042305231384": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17798636687709019154": ["convolution_gpu_bfyx_os_iyx_osv16",44], + "8571662320744858201": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "5828768432282043413": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "3685556976073096544": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "8047078039937885319": ["convolution_gpu_bfyx_gemm_like",2], + "14691372262153587653": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "18062849937960759210": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "4366043672240989175": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "6148022455516485135": ["convolution_gpu_bfyx_gemm_like",2], + "2932914865200583326": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13225520357177380691": ["convolution_gpu_bfyx_gemm_like",2], + "8709632541892447149": ["convolution_gpu_bfyx_gemm_like",2], + "5261762234237034874": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3102816736961785641": ["convolution_gpu_bfyx_os_iyx_osv16",874], + "5409329687010951601": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "10885752780697269323": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "4577872082734403187": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "9614300332487270888": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "6997121306455110286": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "1071090704302849258": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "937763627727362899": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "6146876760962332928": ["convolution_gpu_bfyx_gemm_like",2], + "9723314434598141024": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "5185125307593023170": ["convolution_gpu_bfyx_os_iyx_osv16",356], + "11933283931932057859": ["convolution_gpu_bfyx_gemm_like",1], + "18120169120088482114": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "17774424004510360936": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "16541535256432192398": 
["convolution_gpu_bfyx_gemm_like",2], + "4646176801168621136": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7130694811424715594": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "3119045125726216156": ["convolution_gpu_bfyx_gemm_like",1], + "141166664952282933": ["convolution_gpu_bfyx_gemm_like",2], + "220326805056361171": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "8228641750970480948": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "835367600773871252": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "15114370307779942381": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17962578815194404362": ["convolution_gpu_bfyx_gemm_like",2], + "4831224999851230245": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "6812025576584060234": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "9601849246293120347": ["convolution_gpu_bfyx_gemm_like",2], + "15156805695359911457": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "16295660312557315941": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "4515798403196565084": ["convolution_gpu_bfyx_gemm_like",2], + "8122815203088327658": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "5962764672151728219": ["convolution_gpu_bfyx_os_iyx_osv16",1108], + "2622434279674583815": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "2582625260054352916": ["convolution_gpu_bfyx_gemm_like",1], + "8809794528993445200": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13660015013041074867": ["convolution_gpu_bfyx_gemm_like",2], + "5948701218437980356": ["convolution_gpu_bfyx_gemm_like",2], + "15548971488532746290": ["convolution_gpu_bfyx_direct_10_12_16",0], + "10800323158234163234": ["fully_connected_gpu_fb_oi_ref",2], + "6876164425008541018": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "14652791434312888296": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "5594180958505308003": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "12942776337163777730": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "16884228931101540030": ["convolution_gpu_bfyx_gemm_like",2], + "5629670679897666607": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "14878347463243157447": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "15488550074426713959": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "18331981707436752260": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "59739211822469868": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "15765198153800696060": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "11169292427557543138": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "9628735886189157469": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "4569338575782832784": ["convolution_gpu_bfyx_gemm_like",2], + "12854272540346358832": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "11831092915967558428": ["convolution_gpu_bfyx_os_iyx_osv16",647 + ] + }, + "64": { + "12297371032753209816": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "9058996149754556268": ["convolution_gpu_bfyx_os_iyx_osv16",1041], + "290134020607738418": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16535378085465418910": ["convolution_gpu_yxfb_yxio_b16",0], + "1644335606100150388": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "7581174843529024536": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "3391032227732782982": ["convolution_gpu_bfyx_gemm_like",1], + "13358283026528078900": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "1832310305089212990": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "18180820925685532104": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "11132679855317294753": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10650698451740924172": ["convolution_gpu_bfyx_os_iyx_osv16",241], 
+ "9622546530872848323": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "13553263424160050064": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "11771014003680394135": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "11055049031355432623": ["convolution_gpu_bfyx_gemm_like",0], + "1143214652021653634": ["convolution_gpu_bfyx_os_iyx_osv16",108], + "11583985978586657985": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "13970935346154374605": ["convolution_gpu_bfyx_gemm_like",2], + "1334070221835422461": ["convolution_gpu_bfyx_gemm_like",2], + "13603318842632052764": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "4750513665628842598": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "5801429077171542466": ["convolution_gpu_bfyx_os_iyx_osv16",97], + "13132804928635689780": ["convolution_gpu_bfyx_os_iyx_osv16",740], + "1299545313185409227": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "16934879647229234163": ["convolution_gpu_bfyx_gemm_like",0], + "6584960721513702502": ["convolution_gpu_bfyx_gemm_like",0], + "2096167792705935744": ["convolution_gpu_bfyx_gemm_like",2], + "10471519687597963116": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17917978116807564183": ["convolution_gpu_bfyx_gemm_like",1], + "7862815466573236157": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "628191607060767879": ["convolution_gpu_bfyx_os_iyx_osv16",841], + "9594594523961285945": ["convolution_gpu_bfyx_os_iyx_osv16",938], + "15078168059698267650": ["convolution_gpu_bfyx_direct_10_12_16",0], + "18041177945345031826": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "15451919862187018297": ["convolution_gpu_winograd_6x3_s1_fused",2], + "7430073011895298582": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "15522785615618973614": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "2423754482456771339": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "13883044928774243663": ["convolution_gpu_bfyx_os_iyx_osv16",1099], + "16235115911229280717": ["convolution_gpu_bfyx_gemm_like",2], + "9410978119783758141": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "6391201577234440562": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "15831600396403741571": ["convolution_gpu_bfyx_gemm_like",1], + "15781622938833984014": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "17882819773586674851": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "276407276027553756": ["convolution_gpu_bfyx_os_iyx_osv16",567], + "3883845471211207871": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "8075180350084516696": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16611452077660879545": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "244921290040927639": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "2613462626256090659": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "9524303276541517389": ["convolution_gpu_bfyx_gemm_like",2], + "1541754036637209097": ["convolution_gpu_bfyx_gemm_like",2], + "16292848987976256449": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "15399245700982979379": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "2669822154816760632": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11609821372586026178": ["convolution_gpu_bfyx_gemm_like",1], + "9056038338958199256": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "12028665820838352309": ["convolution_gpu_bfyx_gemm_like",2], + "12040626513219974957": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "4628748977913534701": ["convolution_gpu_bfyx_os_iyx_osv16",350], + "16442107352245114876": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "12675313398314286884": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "1452597292381229708": 
["convolution_gpu_winograd_6x3_s1_fused",2], + "13439896617880328331": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "17525531790109748810": ["convolution_gpu_bfyx_os_iyx_osv16",424], + "5115661026367632863": ["convolution_gpu_bfyx_os_iyx_osv16",6], + "10784073615329190425": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "12052207771201936228": ["convolution_gpu_bfyx_gemm_like",1], + "7590767013583950613": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "6870942166356599956": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "89439319782574517": ["convolution_gpu_bfyx_os_iyx_osv16",297], + "9390478179772073718": ["convolution_gpu_bfyx_gemm_like",1], + "16728762255357411770": ["convolution_gpu_bfyx_os_iyx_osv16",655], + "14554225625951128811": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "9263063714383940562": ["convolution_gpu_bfyx_os_iyx_osv16",845], + "5519535335798045279": ["convolution_gpu_bfyx_gemm_like",1], + "4135068756462147853": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9660812093766156608": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "15767973630744679517": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9216608098626790565": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "13713501506522022845": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8323445733669842657": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15649927926091502215": ["convolution_gpu_bfyx_os_iyx_osv16",878], + "6821855018718422278": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "522181557896569275": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "9562291747339451180": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "7432142107544210174": ["convolution_gpu_bfyx_gemm_like",2], + "4505008254511324231": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "9407646138658641974": ["convolution_gpu_bfyx_gemm_like",1], + "13680926356824317761": ["convolution_gpu_bfyx_os_iyx_osv16",49], + "15295951849706930711": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "6181308879301978465": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "467975197394411990": ["convolution_gpu_bfyx_gemm_like",1], + "4542143431130171516": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "13251091004269229867": ["convolution_gpu_bfyx_gemm_like",2], + "3568514382399560386": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "15528692642731712121": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "12246408434917478929": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "15428591250165788477": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "5843679089588930933": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "15914512645931208899": ["convolution_gpu_bfyx_gemm_like",2], + "8451212914744825089": ["convolution_gpu_bfyx_gemm_like",2], + "1561225943337590599": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "6557428245898292304": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "11988546375476924356": ["convolution_gpu_bfyx_os_iyx_osv16",94], + "16710010075465723498": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "16208488491972128275": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "2826762745628486040": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "142486914279119363": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "11645116728396933125": ["convolution_gpu_bfyx_gemm_like",2], + "954796765467489259": ["convolution_gpu_bfyx_os_iyx_osv16",1044], + "17739868787095417856": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "16206791915939407806": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "706370730287471796": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "17791773192152464021": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "13317417676446624018": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "9028970753877215614": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "1171681987783013074": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "8207349115037232863": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "13189392239349392492": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "14037325204801680738": ["convolution_gpu_bfyx_os_iyx_osv16",224], + "7817036102984218692": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "17443356777503458523": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "5774841809066688068": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "11756881293845417212": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "17422822627612865758": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16944335478353845609": ["convolution_gpu_bfyx_os_iyx_osv16",233], + "4640696923527766618": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "15529757761327002288": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "16921939234324970069": ["convolution_gpu_bfyx_os_iyx_osv16",678], + "1104489643524273315": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "6669808855737023569": ["convolution_gpu_bfyx_gemm_like",1], + "14309292105974991733": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "16985912104363932350": ["convolution_gpu_bfyx_os_iyx_osv16",92], + "6101196122606108273": ["convolution_gpu_bfyx_gemm_like",2], + "5585398540591396124": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "2149582237161177965": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "3140230065585683313": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "14959566236432790882": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "16781187505186394353": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13464226348405628455": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "158222105675022402": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "11686670048744589243": ["convolution_gpu_bfyx_gemm_like",2], + "3159147743553063163": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "13786357802945430475": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "13761566845514364807": ["convolution_gpu_bfyx_os_iyx_osv16",297], + "4013707396889204359": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "15417738436777481469": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "5440983284868981549": ["convolution_gpu_bfyx_gemm_like",2], + "5287076386757143976": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "13268525255152984893": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "14251848023416168295": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "13104509059416300615": ["convolution_gpu_bfyx_os_iyx_osv16",267], + "7235358742317442134": ["convolution_gpu_bfyx_gemm_like",1], + "8268533335852735248": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "4481903208484313806": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "10670103699537731664": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "8790625191540101806": ["convolution_gpu_bfyx_gemm_like",1], + "9979259596137305973": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "12365282242489300092": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "4750894407873652809": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "15859493313686060349": ["convolution_gpu_bfyx_gemm_like",1], + "15988378956341507229": ["convolution_gpu_yxfb_yxio_b16",0], + "5156033406916344703": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12730339458081890990": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "13102754309439605192": ["convolution_gpu_bfyx_gemm_like",1], + "3563872903821081702": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "8792202318168046223": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "7500192998744460131": ["fully_connected_gpu_bf_io_input_spatial",1], + "17832542092610191859": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "14171139920084409181": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "16884396694505987920": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "3122997634505472500": ["convolution_gpu_bfyx_os_iyx_osv16",679], + "10990741293315393791": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "2770397466252831892": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "8609939102588915855": ["convolution_gpu_bfyx_gemm_like",2], + "9761573038170759563": ["convolution_gpu_bfyx_os_iyx_osv16",494], + "13558656230312558247": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "15490478608105402679": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "12309132521191764927": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "4492673409319122180": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "16362139250976572928": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "11087413527078604815": ["convolution_gpu_bfyx_gemm_like",2], + "5184121466994451498": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "10292243973236220688": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "3509027370372599394": ["fully_connected_gpu_fb_io_ref",2], + "14008438372661779490": ["convolution_gpu_bfyx_gemm_like",2], + "12394049027081208902": ["convolution_gpu_bfyx_os_iyx_osv16",740], + "12068797674575015662": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "17092525789052598917": ["convolution_gpu_bfyx_os_iyx_osv16",4], + "15315327794058441258": ["convolution_gpu_bfyx_os_iyx_osv16",1090], + "10509933181132310969": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "8036474422877454869": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "11800783548769329949": ["convolution_gpu_bfyx_gemm_like",2], + "16770615142634470903": ["convolution_gpu_bfyx_os_iyx_osv16",712], + "15331103261044247142": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "7139714914586273766": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "778476198101178556": ["convolution_gpu_bfyx_gemm_like",1], + "12790570304622911607": ["convolution_gpu_bfyx_os_iyx_osv16",1023], + "17046662043776372746": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "8464582977975377118": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16336482874764861478": ["convolution_gpu_bfyx_gemm_like",2], + "2296581485980163665": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "12232696287029987946": ["convolution_gpu_bfyx_os_iyx_osv16",496], + "15824189967727245909": ["convolution_gpu_bfyx_gemm_like",2], + "10765280349477640969": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "6205240287062600210": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "12024143207855886580": ["convolution_gpu_bfyx_gemm_like",2], + "18133334552107213128": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "755414184406250882": ["convolution_gpu_bfyx_os_iyx_osv16",843], + "2287356884312581209": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "14133958262039763609": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "6603778920476932267": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5538883245745495145": ["convolution_gpu_bfyx_os_iyx_osv16",362], + "8843585527713905568": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "16773645387243701837": ["convolution_gpu_bfyx_gemm_like",2], + "14150012830816329527": ["convolution_gpu_bfyx_gemm_like",2], + "9373353053843326128": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "4764776977138392550": 
["convolution_gpu_bfyx_os_iyx_osv16",836], + "16117448559783537844": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "6664432489777052771": ["convolution_gpu_bfyx_gemm_like",2], + "805131056816361237": ["convolution_gpu_bfyx_os_iyx_osv16",681], + "16958329690837977102": ["convolution_gpu_bfyx_gemm_like",2], + "13184662326021747000": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7744787957569714828": ["convolution_gpu_bfyx_gemm_like",1], + "5864250949922222051": ["convolution_gpu_bfyx_os_iyx_osv16",1042], + "13161997040644039778": ["convolution_gpu_bfyx_gemm_like",2], + "4897991181236908768": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "1587501521145162454": ["convolution_gpu_bfyx_gemm_like",2], + "18202222342562516071": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "8690196189594920365": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "17434429579652310107": ["convolution_gpu_bfyx_gemm_like",1], + "5495776091407365966": ["convolution_gpu_bfyx_gemm_like",2], + "9988801796928462423": ["convolution_gpu_bfyx_os_iyx_osv16",903], + "1082574490068006980": ["convolution_gpu_bfyx_gemm_like",2], + "9130971535185609293": ["convolution_gpu_bfyx_gemm_like",2], + "3662747857062156477": ["convolution_gpu_bfyx_gemm_like",1], + "3873183249402084406": ["convolution_gpu_bfyx_gemm_like",1], + "15155676074658242659": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "17053671692908867872": ["convolution_gpu_bfyx_os_iyx_osv16",723], + "16173557782125372935": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "16910952799476896905": ["convolution_gpu_bfyx_gemm_like",1], + "116291934148608396": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "17522452942286240233": ["convolution_gpu_bfyx_gemm_like",2], + "2727175120437582536": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4660288622381620227": ["convolution_gpu_bfyx_os_iyx_osv16",298], + "4279062247055842367": ["convolution_gpu_bfyx_gemm_like",1], + "3398322619007806698": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8740196547852036537": ["convolution_gpu_bfyx_gemm_like",2], + "1450888744802985214": ["convolution_gpu_bfyx_os_iyx_osv16",49], + "10607904718265020949": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "10308175009371219583": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "6577505360421510286": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "3833510944499257797": ["convolution_gpu_bfyx_os_iyx_osv16",1030], + "6303682540621797774": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "13410850301164057911": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "16509472637458153234": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "15082818876354718849": ["convolution_gpu_bfyx_gemm_like",1], + "9287404618748313247": ["convolution_gpu_bfyx_os_iyx_osv16",1062], + "9428176632140441528": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "16067605128297748820": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "861419637283812778": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "3782315919331102574": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8507854696766492454": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2458592904274981909": ["fully_connected_gpu_bf_io_input_spatial",2], + "16182470664818268848": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5219048275475447369": ["convolution_gpu_bfyx_os_iyx_osv16",673], + "4239133538073498792": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "12935563359569230797": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "3337625924046561031": ["convolution_gpu_bfyx_gemm_like",1], + "4615708568396290002": ["convolution_gpu_bfyx_1x1",2], + "4299492266819967844": 
["convolution_gpu_bfyx_os_iyx_osv16",137], + "4400247897123856252": ["convolution_gpu_bfyx_gemm_like",2], + "5078905972285278557": ["convolution_gpu_bfyx_gemm_like",2], + "17599383258252980421": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "4133424990380177132": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "15470013032930986062": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "6204183474669103812": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "6733731409232284409": ["convolution_gpu_bfyx_gemm_like",1], + "8578747191812631883": ["convolution_gpu_bfyx_os_iyx_osv16",340], + "15901724303713479611": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "9502195532658935521": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10570285542015420072": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "2273992727647793692": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3221221905804708596": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "693883892843558363": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "10433541468308381909": ["convolution_gpu_bfyx_os_iyx_osv16",363], + "4026686872534942904": ["convolution_gpu_bfyx_os_iyx_osv16",547], + "2702144517025248597": ["convolution_gpu_bfyx_gemm_like",1], + "14424566003632608852": ["convolution_gpu_bfyx_gemm_like",2], + "14131851237755716991": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "11872943152839631823": ["convolution_gpu_bfyx_gemm_like",2], + "5349415632630235233": ["convolution_gpu_bfyx_1x1",2], + "6458124573210430792": ["convolution_gpu_bfyx_gemm_like",2], + "16230621843665445228": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "17015791782274123780": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "4617809377006148936": ["convolution_gpu_bfyx_gemm_like",2], + "16511749893955141055": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "2451712485584835395": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "6214194654733781771": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "5172712078329324967": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "15688186132508213638": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "7164580481046523192": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "12228610148087508521": ["convolution_gpu_bfyx_gemm_like",2], + "16706244336960642883": ["convolution_gpu_bfyx_gemm_like",1], + "16667887002111125871": ["convolution_gpu_bfyx_gemm_like",2], + "8881150100883636392": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "12644942072153919043": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "11640865562390693266": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "6143200133853000387": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "5682190700442712936": ["convolution_gpu_bfyx_os_iyx_osv16",1124], + "14821616804286068969": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "13602140021189675477": ["convolution_gpu_bfyx_gemm_like",2], + "16094174852600023296": ["convolution_gpu_bfyx_os_iyx_osv16",700], + "6578908625437515675": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "14896875712028630045": ["convolution_gpu_bfyx_gemm_like",2], + "6075691042233712335": ["convolution_gpu_bfyx_gemm_like",2], + "2746052215199129520": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "15129834325410878425": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "10118395047539851751": ["convolution_gpu_bfyx_gemm_like",1], + "11284755586130392759": ["convolution_gpu_bfyx_os_iyx_osv16",243], + "4465781406991476376": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "6195916781434462809": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10899110544832584656": ["convolution_gpu_bfyx_os_iyx_osv16",709], + 
"17774424004510360936": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "6293403765897901528": ["convolution_gpu_bfyx_gemm_like",2], + "9069334144391048686": ["convolution_gpu_bfyx_os_iyx_osv16",997], + "6729785110495533200": ["convolution_gpu_bfyx_os_iyx_osv16",715], + "3600066510593746268": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "7148542290597073512": ["convolution_gpu_bfyx_os_iyx_osv16",739], + "9274179337770060652": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "386749666417295495": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "1473214668483422172": ["convolution_gpu_bfyx_gemm_like",1], + "8127190765748950828": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "3179874645565098825": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "10178145641713631806": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "3180320769716158201": ["convolution_gpu_bfyx_os_iyx_osv16",484], + "10109431802089940590": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "8133587696326295326": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "14258499419905714808": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "12516911293946682547": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "13448845356783404653": ["convolution_gpu_bfyx_gemm_like",1], + "16172528828198474326": ["convolution_gpu_bfyx_os_iyx_osv16",608], + "8241070786700614317": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11450378244355788918": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "16129296588866116913": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "10628725059172743408": ["convolution_gpu_bfyx_gemm_like",2], + "5584432943673435454": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "3304589333915676807": ["convolution_gpu_bfyx_gemm_like",2], + "9737565171095493297": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14349625788399542568": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2816353973187452604": ["convolution_gpu_bfyx_gemm_like",2], + "3332334993503432420": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "6767245864232675168": ["convolution_gpu_bfyx_gemm_like",2], + "738850098651678143": ["convolution_gpu_bfyx_os_iyx_osv16",504], + "2371412124305478965": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7076937538747704750": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "12408889192918919210": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "5482851829165191681": ["convolution_gpu_bfyx_os_iyx_osv16",1019], + "16347412180100581330": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "9182897385081081193": ["convolution_gpu_winograd_6x3_s1_fused",1], + "6711878663358611849": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "12558716383635737426": ["convolution_gpu_bfyx_os_iyx_osv16",397], + "1914964404168211864": ["convolution_gpu_bfyx_gemm_like",1], + "9942099207256025216": ["convolution_gpu_bfyx_gemm_like",2], + "14355612297330229277": ["convolution_gpu_bfyx_gemm_like",0], + "8707189142909022305": ["convolution_gpu_bfyx_os_iyx_osv16",298], + "5805383505505929391": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "5419775002149092646": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "11330591026581463934": ["convolution_gpu_bfyx_gemm_like",2], + "5876880412336151866": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "7918742312252115870": ["convolution_gpu_bfyx_os_iyx_osv16",299], + "5643908654122573882": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "15856268902838573812": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "13810995219720233595": ["convolution_gpu_bfyx_gemm_like",2], + "659150305191479097": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "13248567106128518549": 
["convolution_gpu_bfyx_os_iyx_osv16",1090], + "1760690277175249985": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "6571438978296387721": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "13902214851539825156": ["convolution_gpu_bfyx_gemm_like",2], + "8866736221671835567": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "8296551195150971668": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9305861997313663528": ["convolution_gpu_bfyx_gemm_like",2], + "16431857516454692096": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "7199295899520406795": ["convolution_gpu_bfyx_gemm_like",2], + "4082229510324076196": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9354818521586974021": ["convolution_gpu_bfyx_gemm_like",0], + "3003526572122876385": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "12972798847556569913": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "8376077531098664520": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "7351733901977025859": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "15450609897480659306": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "6483208845600234755": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "7330202944390548890": ["convolution_gpu_bfyx_gemm_like",1], + "4927360358387344983": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3816674884393241704": ["convolution_gpu_bfyx_os_iyx_osv16",655], + "2817919813339364130": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12085348936192462321": ["convolution_gpu_bfyx_gemm_like",2], + "14204609663091442879": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "10887835418423052188": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "1028160614515220430": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "4673127824919879657": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17490188677223978661": ["convolution_gpu_bfyx_gemm_like",2], + "13436376034548670107": ["convolution_gpu_bfyx_os_iyx_osv16",592], + "13374993751390784382": ["convolution_gpu_bfyx_os_iyx_osv16",681], + "17366007551797367227": ["convolution_gpu_bfyx_gemm_like",2], + "12727541507197887360": ["convolution_gpu_bfyx_os_iyx_osv16",640], + "5088898934670078153": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3711525118850629466": ["convolution_gpu_bfyx_gemm_like",1], + "15011504472108164173": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17001502418583498926": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "15578456771467281881": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16117738994809548007": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "4738743763536059708": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "14872992823083730615": ["convolution_gpu_bfyx_gemm_like",1], + "4046830923427667342": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "15757308772667178999": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "4958222070605478947": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "4196367396954155354": ["convolution_gpu_bfyx_gemm_like",2], + "8971115542951085891": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "1532263118203058517": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "12193395770362986433": ["convolution_gpu_bfyx_os_iyx_osv16",684], + "10718764522366711114": ["convolution_gpu_yxfb_yxio_b16",1], + "16789135236017252073": ["convolution_gpu_bfyx_gemm_like",2], + "3217246278485567748": ["convolution_gpu_bfyx_gemm_like",2], + "17726079670612220433": ["convolution_gpu_bfyx_gemm_like",2], + "12259844988981080505": ["convolution_gpu_bfyx_gemm_like",2], + "8995598177504756805": ["convolution_gpu_bfyx_os_iyx_osv16",832], + "2026622899016787854": 
["convolution_gpu_yxfb_yxio_b16",0], + "1436052878894538927": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "9299299311101549958": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "14263790627243107300": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7866128397931438774": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "17163158934005653629": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "3349519148124496343": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "12118387933632797428": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16577611471466452776": ["convolution_gpu_bfyx_gemm_like",1], + "6133592828563353516": ["convolution_gpu_bfyx_os_iyx_osv16",1055], + "15374625876485618845": ["convolution_gpu_bfyx_gemm_like",1], + "1425953627379976115": ["convolution_gpu_bfyx_direct_10_12_16",2], + "534032316469702287": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "16888412539296862194": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "3355259926747524578": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "10626341369865893888": ["convolution_gpu_bfyx_gemm_like",2], + "10292585962794261197": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12988961529988078346": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "15156525717629023944": ["convolution_gpu_bfyx_gemm_like",2], + "3224352307778512793": ["convolution_gpu_bfyx_gemm_like",1], + "3501882025888946886": ["convolution_gpu_bfyx_os_iyx_osv16",653], + "8146945902795164796": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "3880189981766119529": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "3116068331849795558": ["convolution_gpu_bfyx_gemm_like",2], + "5582896843095691256": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "7056293586529818253": ["convolution_gpu_bfyx_gemm_like",1], + "15962533525948221648": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "16146350476627599543": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "85050336704401597": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "15857087373591747006": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "4091702228990140696": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17392594284473856393": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1152693503778768433": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "157805434489791310": ["convolution_gpu_bfyx_os_iyx_osv16",297], + "13192808619929896995": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "6450532136308941035": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "7693459946348737411": ["convolution_gpu_bfyx_os_iyx_osv16",653], + "2324120381399737261": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "11992353959766718397": ["convolution_gpu_bfyx_os_iyx_osv16",942], + "7531346828150129063": ["convolution_gpu_bfyx_os_iyx_osv16",1025], + "182115051096556835": ["convolution_gpu_bfyx_os_iyx_osv16",278], + "17802514063213000148": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "13583166868754499339": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "6670327979947471550": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "3831261590121101287": ["convolution_gpu_bfyx_os_iyx_osv16",663], + "11528310408333718862": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "6522575549211855712": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "8006738296385794413": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "12179581684777023804": ["convolution_gpu_bfyx_gemm_like",2], + "8040001390872143271": ["convolution_gpu_bfyx_gemm_like",2], + "1640358227345963848": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "14159596290442764023": ["convolution_gpu_bfyx_gemm_like",1], + "16103943009195163681": 
["convolution_gpu_bfyx_os_iyx_osv16",699], + "16813995580382709489": ["convolution_gpu_yxfb_yxio_b16",1], + "8130920994920685157": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "3286330985102373533": ["convolution_gpu_bfyx_os_iyx_osv16",726], + "8260130048649729185": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "7311120574972466702": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "1591199515536783245": ["convolution_gpu_bfyx_os_iyx_osv16",233], + "6642767323474835034": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "1838534101161814609": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "4865023158176874622": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "2133849627845285277": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "2575631797904040925": ["convolution_gpu_bfyx_gemm_like",2], + "689445825453914111": ["convolution_gpu_bfyx_gemm_like",1], + "3438296636411972401": ["convolution_gpu_bfyx_gemm_like",2], + "4959403414256988744": ["convolution_gpu_bfyx_gemm_like",1], + "5933743119393822386": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "8906185843274300447": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "9580986168276580598": ["convolution_gpu_bfyx_gemm_like",1], + "7881579844586294503": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "1202292109713947702": ["convolution_gpu_bfyx_gemm_like",2], + "10522649794540845800": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "1540041682425757361": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "13926122593957480821": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3501667344669686338": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "14885031472057965707": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "3017411837779243878": ["convolution_gpu_bfyx_gemm_like",2], + "6820284286806022849": ["convolution_gpu_bfyx_gemm_like",2], + "54975980454651672": ["convolution_gpu_bfyx_os_iyx_osv16",338], + "4894227264080887361": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "11275109735493317886": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "9421927854269492263": ["convolution_gpu_bfyx_os_iyx_osv16",1115], + "272730229972987861": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "348058686961206025": ["convolution_gpu_bfyx_os_iyx_osv16",985], + "11716771904412649891": ["convolution_gpu_bfyx_os_iyx_osv16",878], + "10128143628088846123": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "797387385159110695": ["convolution_gpu_bfyx_gemm_like",1], + "7689320135952025041": ["convolution_gpu_bfyx_gemm_like",2], + "12523676912856063091": ["convolution_gpu_bfyx_os_iyx_osv16",565], + "3128856679264648666": ["convolution_gpu_bfyx_gemm_like",1], + "12992061224471212714": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12315068368597230211": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "3190494353583341446": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5469227748156438008": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "16588325081458426169": ["convolution_gpu_bfyx_gemm_like",2], + "4220826666482500445": ["convolution_gpu_bfyx_os_iyx_osv16",1029], + "11095908837221722097": ["convolution_gpu_bfyx_gemm_like",2], + "17423645390621980919": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "7242013296950669829": ["convolution_gpu_bfyx_os_iyx_osv16",234], + "9462315044265139531": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "16293101831324587788": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17994361454416813294": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "6407471972820516685": ["fully_connected_gpu_fb_oi_ref",2], + "11070620435959083971": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "4957638663977636791": 
["convolution_gpu_bfyx_gemm_like",2], + "2530317332900569142": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "1604661321386793876": ["convolution_gpu_winograd_6x3_s1_fused",1], + "18259656768460999562": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "15796677813117622429": ["convolution_gpu_bfyx_gemm_like",0], + "7924408980408826942": ["convolution_gpu_bfyx_os_iyx_osv16",618], + "8560635685184432720": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "2116913943188857359": ["convolution_gpu_bfyx_gemm_like",2], + "8316848551837633169": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "18273537339378756543": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "14947798627499698329": ["convolution_gpu_bfyx_gemm_like",1], + "3780320160034246719": ["convolution_gpu_bfyx_os_iyx_osv16",1028], + "2929715823970060874": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "5159470523468873105": ["convolution_gpu_bfyx_os_iyx_osv16",179], + "8176012042686275874": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "1306339989221885682": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "6678796313875454849": ["convolution_gpu_bfyx_gemm_like",2], + "11421180829679625737": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "8651641584737798174": ["convolution_gpu_bfyx_os_iyx_osv16",678], + "863057075064640334": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "8671491767142900139": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "12348602762263193288": ["convolution_gpu_bfyx_os_iyx_osv16",648], + "14088382963493477342": ["convolution_gpu_bfyx_gemm_like",2], + "7177837234452118325": ["convolution_gpu_bfyx_os_iyx_osv16",12], + "10308113903347312964": ["convolution_gpu_bfyx_gemm_like",2], + "4773077837537775324": ["convolution_gpu_bfyx_gemm_like",2], + "381149736509958403": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "8984436655107983227": ["convolution_gpu_bfyx_gemm_like",1], + "14811022197918391667": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "11825293922127550847": ["convolution_gpu_bfyx_gemm_like",2], + "16567638487719493784": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "16290626406346691996": ["convolution_gpu_bfyx_os_iyx_osv16",6], + "583303098958523195": ["convolution_gpu_bfyx_os_iyx_osv16",1017], + "15891662883560480723": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "8155268141318893606": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13972357557211413688": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "6644418194983229139": ["convolution_gpu_bfyx_gemm_like",1], + "16395067736440127496": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2124033349728954551": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "17243576882981097341": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "14912119584313592912": ["convolution_gpu_bfyx_gemm_like",1], + "15914342421266687768": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "10218763091060511457": ["convolution_gpu_bfyx_os_iyx_osv16",99], + "13059207969254830451": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "2571882179292959757": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "3120553928584920777": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "14230493618724018658": ["convolution_gpu_bfyx_gemm_like",2], + "3870539490799697188": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "3218248162832023196": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "929378940515745198": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "9643408025778914022": ["convolution_gpu_bfyx_os_iyx_osv16",236], + "9585113116232600562": ["convolution_gpu_bfyx_gemm_like",1], + "6726099352298108756": ["convolution_gpu_bfyx_gemm_like",1], + "3134489458855347772": 
["convolution_gpu_bfyx_os_iyx_osv16",581], + "4848143712599565301": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7671016314869993705": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "6580334406272192111": ["fully_connected_gpu_fb_io_ref",1], + "3448477246688526708": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "17824431042110985323": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "18150429561058646714": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "16768797136991242472": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "11841034668170849494": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "15914058104244750036": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "17174919737114915467": ["convolution_gpu_bfyx_os_iyx_osv16",713], + "9999425239167488495": ["convolution_gpu_bfyx_gemm_like",0], + "15471470494305051299": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "10151922632636937118": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "7143510787416483146": ["convolution_gpu_bfyx_os_iyx_osv16",740], + "338716975932676215": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "11951606039079763598": ["convolution_gpu_bfyx_gemm_like",2], + "9447458159095730492": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "13804221028705631415": ["convolution_gpu_bfyx_gemm_like",2], + "4622514167765722873": ["convolution_gpu_bfyx_os_iyx_osv16",378], + "1497127399271219422": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "11243840588602365090": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "3033264172690274208": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "14123081378489325832": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "14116800584981026541": ["convolution_gpu_bfyx_os_iyx_osv16",612], + "17154337492545826355": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "1103228955716492167": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13753473508578037346": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "14406070210216948643": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "13094402291968806996": ["convolution_gpu_bfyx_os_iyx_osv16",235], + "17039993918927377002": ["convolution_gpu_bfyx_os_iyx_osv16",438], + "632116056424249698": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12655099960717366198": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "13115589642140732066": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "14994322266840011040": ["convolution_gpu_bfyx_gemm_like",1], + "9040046051053703359": ["convolution_gpu_bfyx_os_iyx_osv16",363], + "12071914115316550349": ["convolution_gpu_bfyx_os_iyx_osv16",346], + "10170577772376890221": ["convolution_gpu_bfyx_gemm_like",1], + "16863960779539003201": ["convolution_gpu_bfyx_os_iyx_osv16",644], + "17207560805775399864": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4156384238797998294": ["convolution_gpu_bfyx_os_iyx_osv16",273], + "2362092095402043749": ["convolution_gpu_bfyx_gemm_like",2], + "1332624116953483870": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "16076153317792960383": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "77073286362822723": ["convolution_gpu_bfyx_os_iyx_osv16",646], + "10302338806536775954": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "9192665896782282996": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "17421991623849671076": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11862259122805366807": ["fully_connected_gpu_fb_io_b8_f8_vload",1], + "17358006976602795707": ["convolution_gpu_bfyx_gemm_like",2], + "15239764240622554314": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "2877521658768725103": ["convolution_gpu_bfyx_gemm_like",2], + "12843671306854567956": 
["convolution_gpu_bfyx_os_iyx_osv16",952], + "981733129438741439": ["convolution_gpu_bfyx_os_iyx_osv16",339], + "4014667229872705228": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3286629188347536485": ["fully_connected_gpu_bf_io_input_spatial",1], + "16711955423531846725": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "9454146598828084176": ["convolution_gpu_bfyx_os_iyx_osv16",750], + "16294825599850364701": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "8916983923551808409": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "8762901342272872498": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "9601412379897937608": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "16158139166784964096": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "4833749391314748606": ["convolution_gpu_yxfb_yxio_b16",1], + "2128376438627103433": ["convolution_gpu_bfyx_gemm_like",1], + "11031569203645035546": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "16758962840329202004": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "8818070832398055086": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14898892437285105327": ["convolution_gpu_bfyx_os_iyx_osv16",740], + "16589607587365212240": ["convolution_gpu_bfyx_gemm_like",1], + "12962552332511702682": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "16036386660666696362": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "3934290309368153435": ["fully_connected_gpu_bf_io_gemm",0], + "5242271874488296527": ["convolution_gpu_bfyx_gemm_like",1], + "13124342334495538095": ["convolution_gpu_bfyx_gemm_like",1], + "1375156980278317418": ["convolution_gpu_bfyx_gemm_like",2], + "12415368596357091523": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "5077214229434392730": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "17546566148752689536": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "11446745541571732900": ["convolution_gpu_winograd_6x3_s1_fused",2], + "7104756264011682902": ["convolution_gpu_bfyx_gemm_like",1], + "17303408650780384587": ["convolution_gpu_bfyx_os_iyx_osv16",554], + "5912451559447635837": ["convolution_gpu_bfyx_os_iyx_osv16",1033], + "12894240573737168362": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "949330876419581703": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "8567667881970262923": ["convolution_gpu_bfyx_os_iyx_osv16",279], + "7104309382120208659": ["convolution_gpu_bfyx_gemm_like",1], + "12952980509662451384": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "11670430946096342056": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "3963106895592011725": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "11428599290755097395": ["convolution_gpu_bfyx_os_iyx_osv16",715], + "582360460084115077": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "13814086981499638596": ["convolution_gpu_bfyx_os_iyx_osv16",570], + "11207257238719531888": ["convolution_gpu_bfyx_gemm_like",1], + "16361932270527364507": ["convolution_gpu_bfyx_os_iyx_osv16",651], + "15786328370300803713": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "8505040075968411726": ["convolution_gpu_bfyx_gemm_like",1], + "6419580456182610836": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "5115134711994944288": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "7727001441358508665": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "2171768477223405739": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "12014527187730671229": ["convolution_gpu_bfyx_os_iyx_osv16",547], + "804195263636995800": ["convolution_gpu_bfyx_gemm_like",2], + "2502125887857336825": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "8300655194765375060": ["convolution_gpu_bfyx_os_iyx_osv16",799], + 
"8431759922045602848": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "9481675228591993785": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "13927671398099556854": ["convolution_gpu_yxfb_yxio_b16",2], + "8614534946699754256": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "3037042229494600258": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "17123463568694499533": ["convolution_gpu_bfyx_gemm_like",2], + "6328802691680458752": ["convolution_gpu_bfyx_os_iyx_osv16",643], + "11910735867274493498": ["convolution_gpu_bfyx_gemm_like",2], + "5853697372844744672": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "5091558853871982858": ["convolution_gpu_bfyx_gemm_like",2], + "4531222427159927606": ["convolution_gpu_bfyx_gemm_like",2], + "8409488188696700816": ["convolution_gpu_bfyx_gemm_like",1], + "3063055767192991776": ["convolution_gpu_bfyx_gemm_like",2], + "6708349666663292171": ["fully_connected_gpu_fb_oi_ref",1], + "17037462814585846902": ["convolution_gpu_bfyx_os_iyx_osv16",609], + "16393176054374397767": ["convolution_gpu_bfyx_gemm_like",1], + "8257103926661643451": ["convolution_gpu_bfyx_os_iyx_osv16",275], + "4640028527711211109": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "14352796912241296357": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "6355395905401306995": ["convolution_gpu_bfyx_gemm_like",2], + "1155389358857780776": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "9040145293899470160": ["convolution_gpu_bfyx_os_iyx_osv16",309], + "287386909600391846": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "14070988879848388270": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "15901675909820977223": ["convolution_gpu_bfyx_os_iyx_osv16",754], + "3070859615622845671": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "12223993560805441284": ["convolution_gpu_bfyx_gemm_like",2], + "4408772370026995920": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "5433618404351968121": ["convolution_gpu_bfyx_gemm_like",1], + "4652136280940317116": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "13540002981450186147": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "6727930402459775131": ["convolution_gpu_bfyx_gemm_like",2], + "2552187713769926425": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "11507538232733291666": ["convolution_gpu_bfyx_os_iyx_osv16",861], + "14006248791647711759": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "10405183426600618231": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "18277685132620834972": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "7605139219344415117": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "10169992769527680821": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "6329618009202266591": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "10747988576436391912": ["convolution_gpu_bfyx_gemm_like",1], + "2964705957088952872": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "13816104794723484993": ["convolution_gpu_winograd_6x3_s1_fused",1], + "17006133396401462698": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "265124365266629363": ["convolution_gpu_bfyx_os_iyx_osv16",803], + "14811603003184578943": ["convolution_gpu_bfyx_gemm_like",2], + "11913020016435860608": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "14973431782875808802": ["convolution_gpu_bfyx_gemm_like",2], + "10797908931694274013": ["convolution_gpu_bfyx_os_iyx_osv16",279], + "14956246091163580499": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "4625107584562815965": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "16706121580364790904": ["convolution_gpu_bfyx_gemm_like",2], + "10323345824599612614": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + 
"10554266898346470422": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "423221712829930726": ["convolution_gpu_bfyx_os_iyx_osv16",44], + "6620782733027313312": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "15579919505002150556": ["convolution_gpu_bfyx_gemm_like",1], + "11709992724966310174": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "2242915551775617989": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "1361159591875955678": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "2534408579674556441": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "12932635875905153141": ["convolution_gpu_bfyx_gemm_like",2], + "11175955260573469979": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "1907439276166837309": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "10267260789603562117": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "1107027047188366075": ["convolution_gpu_bfyx_os_iyx_osv16",1], + "3603706453982734995": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12308359047798183133": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "18337160891834020517": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "7548031489690889629": ["convolution_gpu_bfyx_os_iyx_osv16",652], + "12393385058735194260": ["convolution_gpu_bfyx_gemm_like",1], + "10591379189397010097": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "17564338309805484464": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "17854578307286932628": ["convolution_gpu_bfyx_gemm_like",2], + "8528750110601691390": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6714886136800883594": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "15636128989267984459": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7903891232234389925": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "15035800097152337587": ["convolution_gpu_bfyx_gemm_like",1], + "2452226948562393335": ["convolution_gpu_bfyx_os_iyx_osv16",396], + "13898284586432291433": ["convolution_gpu_bfyx_gemm_like",1], + "10679760989906275129": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "15968821946892330559": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "2638131332283395057": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9076758673133996959": ["convolution_gpu_bfyx_gemm_like",2], + "14878347463243157447": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "17419874083634480896": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "9111988592015450418": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "743941460026466526": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "18062849937960759210": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "14211903923555028634": ["convolution_gpu_bfyx_os_iyx_osv16",568], + "9091110033424983286": ["convolution_gpu_bfyx_os_iyx_osv16",363], + "3830703844770425343": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "1628593159980574595": ["convolution_gpu_bfyx_os_iyx_osv16",178], + "14910223536998380801": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "6410682026872155392": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "16559140502701231107": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17825280904760131680": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "3522383297921565178": ["convolution_gpu_bfyx_os_iyx_osv16",1025], + "1074748462756364699": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "13809898858049445969": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "3032921857841371728": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2344498602308448450": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "3240102173773280414": ["convolution_gpu_bfyx_1x1",2], + "2406816735581074778": ["convolution_gpu_bfyx_os_iyx_osv16",725], + 
"6087091876057515304": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8965747921518186477": ["convolution_gpu_bfyx_os_iyx_osv16",381], + "14352303529756685990": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "1287490919205560806": ["convolution_gpu_bfyx_os_iyx_osv16",197], + "16744011463988595802": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12277470820821378855": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "1997392406402548974": ["convolution_gpu_bfyx_os_iyx_osv16",226], + "11560634267092054110": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "9383182168277796969": ["convolution_gpu_bfyx_gemm_like",2], + "1398177377739338750": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "273242667845386507": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "12348135936862667024": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "12228963567837353733": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "14906458674793172507": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "15201438563802430490": ["fully_connected_gpu_fb_oi_ref",1], + "15882969506682501496": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "16014822406751503249": ["convolution_gpu_bfyx_os_iyx_osv16",308], + "13314092088416047551": ["fully_connected_gpu_fb_oi_ref",2], + "15823825508128158158": ["convolution_gpu_bfyx_gemm_like",2], + "17147293671640396193": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "726985753660756762": ["convolution_gpu_bfyx_gemm_like",2], + "2788116002380533417": ["convolution_gpu_bfyx_gemm_like",2], + "13320675959188615441": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "4701832665603867798": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "6996376303337512293": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "10982526068861394162": ["convolution_gpu_yxfb_yxio_b16",0], + "15770767768674603174": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "7304346312452588844": ["convolution_gpu_bfyx_os_iyx_osv16",739], + "4670443882075998209": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "16998508915819714690": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "16467987800266816984": ["convolution_gpu_bfyx_os_iyx_osv16",476], + "4073467095502162430": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "1559798212423183813": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "4163359403543480821": ["fully_connected_gpu_bf_io_input_spatial",1], + "14213516751025324346": ["convolution_gpu_bfyx_gemm_like",2], + "768720470104458759": ["convolution_gpu_bfyx_os_iyx_osv16",1025], + "7575634241190730697": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "11565861421381730304": ["convolution_gpu_bfyx_gemm_like",2], + "13676654389512816868": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "10085059621136526248": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "13170441257780067955": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "5735608687257018419": ["convolution_gpu_bfyx_os_iyx_osv16",306], + "13954821927253849036": ["convolution_gpu_bfyx_gemm_like",2], + "15600841108426475615": ["convolution_gpu_yxfb_yxio_b16",0], + "3480732841490521799": ["convolution_gpu_bfyx_os_iyx_osv16",878], + "14738573151275130683": ["convolution_gpu_bfyx_os_iyx_osv16",739], + "6800893510381991731": ["convolution_gpu_bfyx_os_iyx_osv16",277], + "1774158624592967937": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11031358859656806724": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "10916647716124396856": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "13204120207726209723": ["fully_connected_gpu_bf_io_gemm",2], + "13558687084677943158": ["convolution_gpu_bfyx_os_iyx_osv16",988], + "9101334153142718004": 
["convolution_gpu_bfyx_gemm_like",2], + "7755177205197405275": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "10187930930336324253": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "14418429155823196539": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "13831458435772917577": ["convolution_gpu_bfyx_gemm_like",2], + "18386376129938707290": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "12247991248100147706": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "6767159196241633301": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "1237920404306733800": ["convolution_gpu_bfyx_gemm_like",1], + "18008552719153887303": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "10196332102593337214": ["convolution_gpu_bfyx_gemm_like",1], + "17285815901490707654": ["convolution_gpu_winograd_6x3_s1_fused",2], + "10883341041912056319": ["convolution_gpu_bfyx_os_iyx_osv16",233], + "17179609670678746034": ["convolution_gpu_bfyx_gemm_like",2], + "15126660425728872065": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "5658664813683907476": ["convolution_gpu_bfyx_os_iyx_osv16",1052], + "17089801601582809764": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "282274448389888221": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "9438739171104456179": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "14630499010941056793": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "12949204491386872217": ["convolution_gpu_bfyx_os_iyx_osv16",424], + "5524218746051008792": ["convolution_gpu_bfyx_os_iyx_osv16",49], + "18140951659547259039": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "10917498758625273194": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "17764033613416389758": ["convolution_gpu_bfyx_gemm_like",2], + "4474697990228400564": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8788703258318141635": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "5740738339752793113": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "12987636957813312667": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7818381040882768404": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "5657471280535146301": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18377298651236993830": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "12725675221990905186": ["convolution_gpu_bfyx_gemm_like",2], + "2103882464623009432": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9714764457768279762": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "12388375914105990324": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "3234107167862677811": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "2490155559809645659": ["convolution_gpu_bfyx_os_iyx_osv16",701], + "8739347545059610410": ["convolution_gpu_bfyx_gemm_like",2], + "5948701218437980356": ["convolution_gpu_bfyx_gemm_like",1], + "6928835003016610382": ["convolution_gpu_bfyx_gemm_like",2], + "3325727286860556323": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "17908444616754154471": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "11315238071192463859": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "570683988452622223": ["convolution_gpu_bfyx_os_iyx_osv16",6], + "16441830491664937048": ["convolution_gpu_bfyx_gemm_like",2], + "17847109385592002207": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "7209217811135076623": ["convolution_gpu_bfyx_gemm_like",1], + "16490405739040977260": ["convolution_gpu_bfyx_os_iyx_osv16",279], + "4217179485243909459": ["convolution_gpu_bfyx_gemm_like",1], + "14885109535362957947": ["convolution_gpu_bfyx_os_iyx_osv16",476], + "11756650366229979428": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "4403753181729432604": 
["convolution_gpu_bfyx_os_iyx_osv16",716], + "8990561333549136048": ["convolution_gpu_bfyx_os_iyx_osv16",224], + "12098146032672599222": ["convolution_gpu_bfyx_os_iyx_osv16",944], + "13338594271376045657": ["convolution_gpu_bfyx_gemm_like",2], + "1270307036687208396": ["convolution_gpu_bfyx_gemm_like",1], + "9737833587413114584": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13642146548740074992": ["convolution_gpu_bfyx_os_iyx_osv16",1116], + "12534001599784153836": ["convolution_gpu_bfyx_os_iyx_osv16",1041], + "12391792381149655331": ["convolution_gpu_bfyx_gemm_like",0], + "17636500109629107732": ["convolution_gpu_bfyx_os_iyx_osv16",1099], + "2738256633362038820": ["convolution_gpu_bfyx_gemm_like",2], + "8618835732380720921": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "15287650965861631130": ["convolution_gpu_bfyx_os_iyx_osv16",640], + "10090036431487700311": ["convolution_gpu_bfyx_gemm_like",2], + "16327433707667075261": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "3106911159524421371": ["convolution_gpu_bfyx_os_iyx_osv16",528], + "3244675355773468991": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "296142385116663420": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "2038505773698938555": ["fully_connected_gpu_bf_io_gemm",1], + "5857101685300045443": ["convolution_gpu_yxfb_yxio_b16",2], + "9429695343610239088": ["convolution_gpu_bfyx_os_iyx_osv16",198], + "9152433123828445089": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "11007944497812650617": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "5940337324384948573": ["convolution_gpu_bfyx_gemm_like",2], + "16108759090923335184": ["convolution_gpu_bfyx_gemm_like",2], + "17808913959977434594": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "15675968397825708285": ["convolution_gpu_bfyx_os_iyx_osv16",251], + "17746215841755337461": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13489318651148001664": ["convolution_gpu_bfyx_gemm_like",1], + "18381791065890314250": ["convolution_gpu_bfyx_gemm_like",0], + "15426960908024585800": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "822162932339827810": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "6351347283201596793": ["convolution_gpu_bfyx_os_iyx_osv16",97], + "18251360413872841969": ["convolution_gpu_bfyx_os_iyx_osv16",1091], + "9780938731831129283": ["convolution_gpu_bfyx_os_iyx_osv16",297], + "13851240591038949807": ["convolution_gpu_bfyx_gemm_like",2], + "14472187692485966933": ["convolution_gpu_bfyx_os_iyx_osv16",860], + "10136369729388564720": ["convolution_gpu_bfyx_gemm_like",2], + "59356084516953804": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "8002233052700666718": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "5522698342845820411": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "13839116996827687373": ["convolution_gpu_bfyx_gemm_like",1], + "937159502066696999": ["convolution_gpu_bfyx_gemm_like",1], + "14484890926084856480": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "10536316961655703500": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "5381354625969068789": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "2618108630886857741": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8108933468437926367": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "15351724241036614758": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "12160764253455777655": ["convolution_gpu_bfyx_gemm_like",2], + "9226912483632588371": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "10309504812060596568": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "3892679716763161057": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + 
"3538679039078582272": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "852015206582470545": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "1908809004094565452": ["convolution_gpu_bfyx_os_iyx_osv16",134], + "7333511810266504718": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "12985650543127289023": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "5057534502588100071": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "1698321314111848001": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "3432296808755992670": ["convolution_gpu_bfyx_gemm_like",2], + "15759530339367380982": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "8317673282128335201": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "13546876216568825877": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "1617135706549276688": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "1207026216972160297": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "5295693108687178880": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "18034648276860485300": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "11828175723996627443": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "11459784003592366395": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3498490999014554104": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "2891736961665476908": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "5321698540631249776": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "6345550009198921347": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "2939605281692583169": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "9323825370872655346": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "15485701086886851362": ["convolution_gpu_bfyx_os_iyx_osv16",187], + "10879218241103462088": ["convolution_gpu_bfyx_gemm_like",0], + "3062101811226530720": ["convolution_gpu_bfyx_os_iyx_osv16",148], + "84595904778810418": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "18035673326929466074": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5419041493176804960": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "15661322183507404821": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "9775648000771985077": ["convolution_gpu_yxfb_yxio_b16",1], + "38736266675995457": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "187352687850707150": ["convolution_gpu_bfyx_gemm_like",2], + "16487774205195979355": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "2052712465925238009": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "12129572274423886770": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "3012566432840424198": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "16071723603031305677": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "5269172622193124300": ["convolution_gpu_bfyx_os_iyx_osv16",612], + "5214654427283761256": ["convolution_gpu_bfyx_gemm_like",2], + "14681717813022425567": ["convolution_gpu_bfyx_gemm_like",1], + "994182747184593564": ["convolution_gpu_winograd_6x3_s1_fused",1], + "4992668316921598993": ["convolution_gpu_bfyx_os_iyx_osv16",678], + "16245760498096322525": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "16547425454653232058": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "9148379585489720669": ["convolution_gpu_bfyx_os_iyx_osv16",479], + "18424912460022156378": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "1170380397764345558": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "12757611260347801001": ["convolution_gpu_bfyx_os_iyx_osv16",307], + "17796310681498690253": ["convolution_gpu_winograd_6x3_s1_fused",1], + "16945184617367657570": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "12892693137085610062": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "1930929857644673460": 
["convolution_gpu_bfyx_os_iyx_osv16",369], + "6294240435687565243": ["convolution_gpu_bfyx_os_iyx_osv16",989], + "3341302541468955849": ["convolution_gpu_bfyx_gemm_like",1], + "3159681096461848644": ["convolution_gpu_bfyx_os_iyx_osv16",1092], + "7208008921815475393": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "11164519756679631743": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "15123868617509445149": ["convolution_gpu_winograd_6x3_s1_fused",1], + "12956726277674279950": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "7570346182940928159": ["convolution_gpu_bfyx_gemm_like",2], + "14011124615649605281": ["convolution_gpu_bfyx_os_iyx_osv16",51], + "15101680837342453931": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "875296362957469305": ["convolution_gpu_bfyx_gemm_like",1], + "3041752019114501584": ["convolution_gpu_bfyx_os_iyx_osv16",504], + "1383899865465106141": ["convolution_gpu_bfyx_gemm_like",1], + "12245096462203481681": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "11850332373794932468": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "1905758333157310570": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "4840004190985490064": ["convolution_gpu_bfyx_gemm_like",2], + "8048617952947915835": ["convolution_gpu_bfyx_gemm_like",2], + "879005904827468163": ["convolution_gpu_bfyx_os_iyx_osv16",6], + "13709111882513486557": ["convolution_gpu_bfyx_os_iyx_osv16",620], + "12174571114411168588": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "3737552767159920174": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "8857763129101380288": ["convolution_gpu_bfyx_gemm_like",2], + "6343396486660315308": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3752171257634205726": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "13046322179198317310": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "17830290099875088207": ["convolution_gpu_bfyx_gemm_like",1], + "2832268621630415376": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "3974589991022739479": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "5656623709782744241": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "281287280558289393": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "8708643228914766202": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "5770286476124511234": ["convolution_gpu_bfyx_gemm_like",1], + "17790026124881397912": ["fully_connected_gpu_fb_io_ref",1], + "11587239927319376658": ["convolution_gpu_bfyx_gemm_like",2], + "9328223957245552723": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "2108296560864415762": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "5754844816339228920": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "8803037667261582905": ["convolution_gpu_bfyx_gemm_like",1], + "5197105253412476591": ["convolution_gpu_bfyx_gemm_like",2], + "4085450203909854919": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "13990028451169604107": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15820359925623438341": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "13835908664998757647": ["fully_connected_gpu_fb_oi_ref",0], + "13951717514084457087": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "11455843788148231615": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "2543995971214089085": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "11723735945517472199": ["convolution_gpu_bfyx_os_iyx_osv16",673], + "18233660940545931789": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "11198908896401597838": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "654122557966242717": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "13455881643467418059": ["convolution_gpu_bfyx_gemm_like",1], + 
"1551596771935253711": ["convolution_gpu_bfyx_gemm_like",1], + "4325081100430903742": ["convolution_gpu_bfyx_gemm_like",1], + "5211831143687501130": ["convolution_gpu_bfyx_gemm_like",1], + "16661843849495077745": ["convolution_gpu_bfyx_os_iyx_osv16",988], + "14435120971846098308": ["convolution_gpu_bfyx_os_iyx_osv16",1049], + "5766507688771440170": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "5389189982064081933": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "13890118723041457532": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "123251351612308092": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "13237050834496100264": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "1071007164550012186": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "1841155673858789206": ["fully_connected_gpu_fb_oi_ref",2], + "14403132596827435096": ["convolution_gpu_bfyx_os_iyx_osv16",235], + "6290317420155851465": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "17833517350994024381": ["convolution_gpu_bfyx_os_iyx_osv16",371], + "1018687388655376483": ["convolution_gpu_bfyx_gemm_like",1], + "13898821685774165645": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "2305706332728008948": ["convolution_gpu_bfyx_gemm_like",2], + "18267428053198215471": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "2609454334520044465": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "2662628817605495834": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "3633858263279042265": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "2128612971571865547": ["convolution_gpu_bfyx_gemm_like",2], + "11636129433022017868": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "706049518431331645": ["convolution_gpu_bfyx_gemm_like",2], + "17585206779958265260": ["convolution_gpu_bfyx_os_iyx_osv16",655], + "7712831597869354170": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "7650375560336513366": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "9319254979377483709": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "18180655791734632264": ["convolution_gpu_bfyx_gemm_like",2], + "5568753513029409478": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "1697248235682953135": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "17343050785312683560": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "7447163906170805189": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "10414903047695486119": ["convolution_gpu_bfyx_os_iyx_osv16",192], + "482564204402769504": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "7869916853707978306": ["convolution_gpu_bfyx_os_iyx_osv16",834], + "12914986936318857086": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "14616969385577243225": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "1410630713443793537": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "15678385128478075284": ["convolution_gpu_bfyx_gemm_like",2], + "13699740641705514374": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "7802311886554362782": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "13471752029049484143": ["convolution_gpu_bfyx_gemm_like",1], + "5245526691775741296": ["convolution_gpu_bfyx_direct_10_12_16",2], + "577844026691991089": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15689502054035168040": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "54019631544204590": ["convolution_gpu_bfyx_direct_10_12_16",0], + "16925721317097534009": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "18136765667969393174": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "9475130054420979752": ["convolution_gpu_bfyx_os_iyx_osv16",192], + "18092842590142527927": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "12380856644683171627": 
["convolution_gpu_bfyx_os_iyx_osv16",576], + "17881905640473324965": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6340128090694375876": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "8709632541892447149": ["convolution_gpu_bfyx_gemm_like",2], + "3118602494449249177": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "12004552919019936392": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "14217181622713951411": ["convolution_gpu_bfyx_os_iyx_osv16",674], + "2777318471329665162": ["convolution_gpu_bfyx_gemm_like",2], + "2806529556090896246": ["convolution_gpu_bfyx_os_iyx_osv16",246], + "7708321360699824256": ["convolution_gpu_bfyx_gemm_like",1], + "4216958486055161753": ["convolution_gpu_bfyx_gemm_like",2], + "6713985030102340818": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "4429109491655891299": ["convolution_gpu_bfyx_gemm_like",1], + "3217574161785059951": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "3281207855459771997": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "12773693193167844110": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "17238880534517721334": ["convolution_gpu_bfyx_os_iyx_osv16",653], + "2968439898708528834": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "12866217660635921034": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17224181038411430675": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "5352061583962489055": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14749947225382670869": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "11666250400445971335": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "13139625572508441980": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "5047419871737940985": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "9096495972770198040": ["convolution_gpu_yxfb_yxio_b16",2], + "17912189681971987483": ["convolution_gpu_bfyx_os_iyx_osv16",274], + "11848462434662954749": ["convolution_gpu_bfyx_os_iyx_osv16",894], + "10743138314323119696": ["convolution_gpu_bfyx_gemm_like",2], + "1680468564927032670": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11657946392097042544": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "3755253206085028904": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "6982733543386888622": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "10071449674652717890": ["convolution_gpu_bfyx_gemm_like",1], + "15669490019428002270": ["convolution_gpu_bfyx_os_iyx_osv16",716], + "4137755981477177003": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "16511393582666965704": ["convolution_gpu_bfyx_os_iyx_osv16",654], + "10412588668458621135": ["convolution_gpu_bfyx_os_iyx_osv16",752], + "7223801044761006523": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "11632275875447013409": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12545558125736154584": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "1040030752340209480": ["convolution_gpu_bfyx_os_iyx_osv16",339], + "6370189612027110022": ["convolution_gpu_bfyx_gemm_like",2], + "7852144838267007144": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "7206226541369793931": ["convolution_gpu_yxfb_yxio_b16",1], + "13282951481330978659": ["convolution_gpu_bfyx_gemm_like",2], + "475043738497218394": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "360872770877634346": ["convolution_gpu_bfyx_gemm_like",1], + "5479761740065152589": ["convolution_gpu_bfyx_gemm_like",2], + "11988285441493553006": ["convolution_gpu_bfyx_gemm_like",2], + "4346591404756288097": ["convolution_gpu_bfyx_gemm_like",0], + "2937907409658060025": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "2995134938466176198": ["convolution_gpu_bfyx_os_iyx_osv16",242], + 
"12473600360154597915": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "4387041763614917736": ["convolution_gpu_bfyx_gemm_like",1], + "7916244303189113815": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "7171904645566467208": ["convolution_gpu_bfyx_gemm_like",2], + "6902644989079870993": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "16190949264253468961": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "12165079289914715018": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "4342360467977736802": ["convolution_gpu_bfyx_gemm_like",2], + "16683089431066989909": ["convolution_gpu_bfyx_gemm_like",2], + "7354234812009979811": ["convolution_gpu_bfyx_os_iyx_osv16",853], + "4862529593282936100": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "15322609677356616580": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "8712136292276123857": ["convolution_gpu_bfyx_gemm_like",2], + "8819268903800581706": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "10100237101982273901": ["convolution_gpu_bfyx_os_iyx_osv16",238], + "14077148976508649021": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "11830297960718214360": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "6719302427415173754": ["convolution_gpu_bfyx_os_iyx_osv16",80], + "4142978475842207311": ["convolution_gpu_bfyx_gemm_like",2], + "6213353364768643062": ["convolution_gpu_bfyx_gemm_like",2], + "13842309033760176194": ["convolution_gpu_bfyx_gemm_like",2], + "9259437778054905599": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "5245308722062496788": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "72444706264681262": ["convolution_gpu_bfyx_os_iyx_osv16",944], + "10424278617647597641": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "17556238490521153146": ["convolution_gpu_bfyx_os_iyx_osv16",424], + "7271236108345900406": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14826791706471872785": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15963038745470172423": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "16108573960501496757": ["convolution_gpu_bfyx_gemm_like",2], + "14079654309452583394": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7869779894480025247": ["convolution_gpu_bfyx_gemm_like",0], + "14397348576352573007": ["convolution_gpu_bfyx_gemm_like",1], + "4264284648458489052": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "7279393739634103483": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "9440117898128288296": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "8906588133431586825": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "18017913952946745878": ["convolution_gpu_bfyx_gemm_like",2], + "3806131437010910920": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "9454954846682513038": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "11682323163346544125": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "7565867291827884997": ["convolution_gpu_bfyx_gemm_like",1], + "4491380839102267034": ["convolution_gpu_bfyx_gemm_like",1], + "7585184325339753737": ["convolution_gpu_bfyx_os_iyx_osv16",238], + "4313392430539923574": ["convolution_gpu_bfyx_os_iyx_osv16",235], + "6090625728451718945": ["convolution_gpu_winograd_6x3_s1_fused",2], + "6219075471508685758": ["convolution_gpu_bfyx_gemm_like",2], + "16969463538496570528": ["convolution_gpu_bfyx_os_iyx_osv16",1091], + "5795073619189010837": ["convolution_gpu_winograd_6x3_s1_fused",2], + "4161612746310931789": ["convolution_gpu_bfyx_gemm_like",1], + "4444730303823507621": ["convolution_gpu_bfyx_gemm_like",0], + "760687670112194844": ["convolution_gpu_bfyx_os_iyx_osv16",726], + "3515437649977762166": ["convolution_gpu_bfyx_gemm_like",0], + 
"3036512701943687724": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "14671212883301405408": ["convolution_gpu_bfyx_gemm_like",1], + "17113350507039887381": ["convolution_gpu_bfyx_gemm_like",2], + "13054405729329143152": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "10173283505468233128": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "9421643783312790618": ["convolution_gpu_winograd_6x3_s1_fused",2], + "11873734271080160669": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "14359530849521980269": ["convolution_gpu_bfyx_os_iyx_osv16",801], + "7183578232279711009": ["convolution_gpu_bfyx_gemm_like",2], + "8101977280003030465": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "12022152681602871455": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "11530101016435264783": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "9525853014023664813": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "1318571118468536310": ["convolution_gpu_bfyx_gemm_like",2], + "2114599010013594942": ["convolution_gpu_bfyx_gemm_like",2], + "288853243482418538": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "12860222041026638681": ["convolution_gpu_bfyx_gemm_like",2], + "15589007878875898942": ["convolution_gpu_bfyx_os_iyx_osv16",277], + "7082007579524697455": ["convolution_gpu_bfyx_gemm_like",2], + "8656468860180713379": ["convolution_gpu_bfyx_os_iyx_osv16",108], + "17599396373608265826": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "1704404203639481753": ["convolution_gpu_bfyx_gemm_like",2], + "8792010676469476740": ["convolution_gpu_bfyx_gemm_like",1], + "9101018613418825655": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "17225578855755054959": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "11115684531624462986": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "2440366541074371090": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "17252589865292797082": ["convolution_gpu_bfyx_os_iyx_osv16",612], + "11265472910579659280": ["convolution_gpu_bfyx_gemm_like",1], + "10978173291465325823": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "5319668297345215520": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "16800575429414554907": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "13954144830230671601": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "1212319037405620223": ["convolution_gpu_bfyx_gemm_like",2], + "2251029128552117936": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "9048522050692986204": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "8881135571874888085": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "12242618640422208652": ["convolution_gpu_bfyx_os_iyx_osv16",302], + "14695781272831602408": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "10914921540144371519": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14346703182362139650": ["convolution_gpu_bfyx_gemm_like",2], + "4550028191070279999": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "11077876432364512822": ["fully_connected_gpu_bf_io_input_spatial",4], + "16896833230469488924": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "10157866834809927320": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "9529614587861271730": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "10573920781439771673": ["convolution_gpu_bfyx_os_iyx_osv16",192], + "16247399911710810038": ["convolution_gpu_bfyx_gemm_like",1], + "3106710091841093202": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "12871555773123368130": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3723613341885592267": ["convolution_gpu_bfyx_os_iyx_osv16",378], + "14487842225000203929": ["convolution_gpu_bfyx_gemm_like",2], + "4165036357594592683": 
["convolution_gpu_bfyx_gemm_like",2], + "16820082917500285799": ["convolution_gpu_bfyx_gemm_like",2], + "18137106379929135901": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "6491244517639245276": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "7370273921473161914": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "12990527753120735255": ["convolution_gpu_bfyx_gemm_like",2], + "2226745622763268469": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "13991205023798493715": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "1934379409955686502": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "1596353239542510685": ["convolution_gpu_bfyx_gemm_like",1], + "14985236276429954162": ["convolution_gpu_bfyx_gemm_like",0], + "5649082203775427830": ["convolution_gpu_bfyx_gemm_like",2], + "7000524935770116969": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "2969389503332309296": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "10880081193716628051": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "15133468875250992696": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "5688478347124565305": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "13472532612464340803": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "17219920118109316867": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "14335423820860953927": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8439950151963452285": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "15908673392788376468": ["convolution_gpu_bfyx_os_iyx_osv16",299], + "9213563311267466388": ["convolution_gpu_bfyx_os_iyx_osv16",191], + "7524311370696987092": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "3511588484597779204": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "1116274074896622552": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2893564501191050837": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "12813978452097969536": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "2722124265986526212": ["convolution_gpu_bfyx_gemm_like",2], + "15486917753097743853": ["convolution_gpu_bfyx_1x1",2], + "15118142492742177336": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "597073780328219388": ["convolution_gpu_bfyx_gemm_like",2], + "1500571771538985941": ["convolution_gpu_bfyx_os_iyx_osv16",51], + "4437258459981739942": ["convolution_gpu_bfyx_os_iyx_osv16",192], + "872401732136570312": ["convolution_gpu_bfyx_gemm_like",2], + "5965451243366505522": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1120455113299469776": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "597650904461183283": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "4282661608732125403": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "15412447128995361859": ["convolution_gpu_bfyx_gemm_like",1], + "18431306649860116380": ["convolution_gpu_bfyx_gemm_like",2], + "11398019086259011063": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "8057302050645780813": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "11356842300444410831": ["convolution_gpu_bfyx_os_iyx_osv16",754], + "16683169947375504066": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "6623182990939010641": ["convolution_gpu_bfyx_gemm_like",1], + "9340159617983543624": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "8757900457181374694": ["convolution_gpu_bfyx_gemm_like",1], + "10398572248321217585": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8769060267707904998": ["convolution_gpu_winograd_6x3_s1_fused",2], + "15489746763312425915": ["convolution_gpu_bfyx_gemm_like",0], + "3541538046227217664": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2047041720569246861": ["convolution_gpu_bfyx_os_iyx_osv16",982], + 
"6531171505861182429": ["convolution_gpu_bfyx_os_iyx_osv16",758], + "3219408878901707426": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7375461241315602473": ["convolution_gpu_bfyx_gemm_like",2], + "190530884420224257": ["convolution_gpu_bfyx_os_iyx_osv16",892], + "16383540667048742064": ["convolution_gpu_bfyx_gemm_like",1], + "15851356529373376076": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "3265415000818832667": ["convolution_gpu_bfyx_gemm_like",1], + "10293186062391000719": ["convolution_gpu_bfyx_os_iyx_osv16",6], + "3429844423226609965": ["convolution_gpu_bfyx_gemm_like",2], + "9542325095876448686": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3699344686791530101": ["convolution_gpu_bfyx_gemm_like",2], + "2710485608298356329": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11031625790234068916": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "16587061389996963349": ["convolution_gpu_bfyx_os_iyx_osv16",988], + "1434535531617424039": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "8913950860101596091": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "8500148569566077929": ["convolution_gpu_bfyx_os_iyx_osv16",1035], + "2100891581797371600": ["convolution_gpu_bfyx_os_iyx_osv16",1071], + "178353385245384751": ["convolution_gpu_bfyx_gemm_like",2], + "15284262113150488297": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "10930115765550856328": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "30229601562833524": ["convolution_gpu_bfyx_gemm_like",2], + "3644282167178264526": ["convolution_gpu_bfyx_gemm_like",2], + "2651385050387738902": ["convolution_gpu_bfyx_direct_10_12_16",0], + "9277176009071334860": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "8751016391945753900": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "18173314625562011976": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9631545863582097486": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "1843555260471832708": ["convolution_gpu_bfyx_gemm_like",1], + "17344974951998490453": ["convolution_gpu_bfyx_direct_10_12_16",1], + "787363431787954804": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "5498839261395459224": ["convolution_gpu_bfyx_gemm_like",2], + "4476928353532757380": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "2622434279674583815": ["convolution_gpu_bfyx_gemm_like",1], + "1309867416606346543": ["convolution_gpu_bfyx_os_iyx_osv16",1029], + "15997754881872769378": ["convolution_gpu_bfyx_os_iyx_osv16",192], + "7132328255408635227": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "7162575953766465459": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "481328129206881674": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "7274179284676568361": ["convolution_gpu_bfyx_os_iyx_osv16",236], + "12676167240795292217": ["convolution_gpu_bfyx_gemm_like",0], + "11619548409913646265": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "11814740669468421049": ["convolution_gpu_bfyx_os_iyx_osv16",52], + "8951040603784899163": ["convolution_gpu_bfyx_gemm_like",2], + "1033385936344875354": ["convolution_gpu_bfyx_gemm_like",2], + "9751582946441607796": ["convolution_gpu_bfyx_os_iyx_osv16",99], + "8701248964531180496": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "17512961503976896701": ["convolution_gpu_bfyx_os_iyx_osv16",1059], + "9762182215179534181": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "15277856047844308598": ["convolution_gpu_bfyx_gemm_like",2], + "15349944413643626251": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "16243196137456624852": ["convolution_gpu_bfyx_gemm_like",2], + "4084106758501882407": 
["fully_connected_gpu_bf_io_input_spatial",1], + "8121179472578287280": ["convolution_gpu_bfyx_os_iyx_osv16",381], + "4304041922043496030": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "5311718276151327830": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "12141300895511301068": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "3383222668132648804": ["convolution_gpu_bfyx_direct_10_12_16",0], + "12493863403516600413": ["convolution_gpu_bfyx_os_iyx_osv16",1016], + "14458851250685872417": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "8444259010311137762": ["convolution_gpu_bfyx_gemm_like",2], + "10647227605517025377": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "12990341489637414845": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "2198278382394812839": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "5687802882700097624": ["convolution_gpu_bfyx_os_iyx_osv16",297], + "318377908569897093": ["convolution_gpu_bfyx_gemm_like",1], + "17948637243158994878": ["convolution_gpu_bfyx_gemm_like",2], + "18269685060032395235": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "2150326211917340956": ["convolution_gpu_bfyx_gemm_like",1], + "10635659193402005820": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "15031155621982459860": ["convolution_gpu_bfyx_gemm_like",2], + "13869716373706247686": ["convolution_gpu_bfyx_gemm_like",0], + "13119479079474639169": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "4533786844080178561": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "548663565933738403": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "9404953235624894187": ["convolution_gpu_bfyx_os_iyx_osv16",94], + "14043770215999952932": ["convolution_gpu_bfyx_gemm_like",2], + "2968094709908141988": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "1819720745131968914": ["convolution_gpu_bfyx_gemm_like",2], + "4084026445911476156": ["convolution_gpu_bfyx_os_iyx_osv16",613], + "3524531620118359828": ["convolution_gpu_bfyx_os_iyx_osv16",1052], + "7397341452130124383": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "6418327009347170687": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "4867937397499803072": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "4523064418696274869": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "10049571207493913006": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "3039528482572243879": ["convolution_gpu_bfyx_os_iyx_osv16",238], + "16677044352793659175": ["convolution_gpu_bfyx_gemm_like",1], + "13708979487306970634": ["convolution_gpu_bfyx_os_iyx_osv16",1092], + "13477416097954638887": ["fully_connected_gpu_bf_io_gemm",1], + "11604111639041106489": ["convolution_gpu_bfyx_os_iyx_osv16",724], + "16924006268301179157": ["convolution_gpu_bfyx_os_iyx_osv16",715], + "11942736969933408358": ["convolution_gpu_bfyx_gemm_like",2], + "14398854364550406668": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "425744529089575241": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "586947787345351152": ["convolution_gpu_bfyx_gemm_like",1], + "5608133987357542077": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "9477562342190423343": ["convolution_gpu_bfyx_gemm_like",2], + "13379165253894817165": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "3499243120652875549": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "14050124896329573468": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12971822824884826169": ["convolution_gpu_bfyx_gemm_like",2], + "6391847213494189692": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14792528369891965810": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3563614453014995411": ["convolution_gpu_bfyx_os_iyx_osv16",918], + 
"12510951219501865365": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "17823133607491820214": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "11149782181562145291": ["convolution_gpu_bfyx_gemm_like",2], + "8616686489737649890": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "4428101657497677982": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "5352861363832390974": ["convolution_gpu_bfyx_os_iyx_osv16",618], + "5074273865983613482": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1458615259705605525": ["convolution_gpu_bfyx_os_iyx_osv16",674], + "11215297942420903101": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "17353894529222574441": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "5601435819039968726": ["convolution_gpu_winograd_6x3_s1_fused",2], + "2527189070714658176": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "3438116423688595487": ["convolution_gpu_bfyx_os_iyx_osv16",625], + "8929453032482114162": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "16504962609450876148": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "17517495652165026573": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "12417253210787537988": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "9488453013746383896": ["convolution_gpu_bfyx_gemm_like",2], + "9208964785762052001": ["convolution_gpu_bfyx_os_iyx_osv16",652], + "181006047500375768": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "7211355951470869591": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "5629670679897666607": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "8059328623525062913": ["convolution_gpu_bfyx_gemm_like",2], + "14515066741400300669": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "12031180482028822765": ["convolution_gpu_bfyx_gemm_like",2], + "2856601829807186494": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "4465701487417893814": ["convolution_gpu_bfyx_gemm_like",2], + "15879172437519876393": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8295126647635181949": ["convolution_gpu_bfyx_gemm_like",2], + "16491532291908469567": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "2524029454785583409": ["convolution_gpu_bfyx_os_iyx_osv16",586], + "3759057398165607194": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "913496537924971856": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "593712935037568960": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "859377216693940737": ["convolution_gpu_bfyx_gemm_like",1], + "13312514874803986753": ["convolution_gpu_bfyx_os_iyx_osv16",310], + "9389555743403158574": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "712165731154577189": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "3509487327001107638": ["convolution_gpu_bfyx_gemm_like",2], + "16475247464223458061": ["convolution_gpu_bfyx_gemm_like",2], + "15188570678726970998": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "530491406341772040": ["convolution_gpu_bfyx_gemm_like",2], + "14025235562200209723": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "15728009639807698634": ["convolution_gpu_bfyx_os_iyx_osv16",614], + "4398371999113956082": ["convolution_gpu_bfyx_gemm_like",2], + "677249604491773387": ["convolution_gpu_bfyx_gemm_like",2], + "7469127846325904854": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "5497751772699578150": ["convolution_gpu_bfyx_gemm_like",1], + "17370051888730874220": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "4237276338897143680": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "1056009037551688122": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "18172711677056449158": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "14686278683380845546": 
["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "15947699374684516369": ["convolution_gpu_bfyx_gemm_like",2], + "5926747396493954633": ["convolution_gpu_bfyx_gemm_like",2], + "13787436604877398090": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "14670068483447729857": ["convolution_gpu_winograd_6x3_s1_fused",1], + "8183383667948205424": ["convolution_gpu_yxfb_yxio_b16",0], + "528295119724008711": ["convolution_gpu_bfyx_os_iyx_osv16",847], + "6830387121684699972": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "12972634653821069685": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "5749536453225343663": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "7953255701516490034": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "13200151444914751729": ["convolution_gpu_bfyx_os_iyx_osv16",174], + "7481256533438761028": ["convolution_gpu_bfyx_gemm_like",2], + "8576733135863336233": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "10128390168715530898": ["convolution_gpu_bfyx_gemm_like",2], + "15430549683839591544": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "8787438180071123604": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6614833247756539341": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "12461575861709234385": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "6171845068913882721": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "16710651492402564794": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "11727227430687227444": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "14491949194619001237": ["convolution_gpu_bfyx_os_iyx_osv16",95], + "10429613013253088132": ["convolution_gpu_bfyx_gemm_like",2], + "14389915292223442327": ["convolution_gpu_bfyx_os_iyx_osv16",875], + "1230262279011217327": ["convolution_gpu_bfyx_gemm_like",1], + "17024388383581997032": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "13702692566238948173": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "11528417522960871233": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1411786954276574458": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "9545968464906009869": ["convolution_gpu_bfyx_gemm_like",1], + "12790788016297794214": ["convolution_gpu_bfyx_os_iyx_osv16",420], + "17729546848373991614": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "16582132711225619740": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "12194037100109755112": ["convolution_gpu_bfyx_gemm_like",2], + "472454322186482185": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "11451740938287179908": ["convolution_gpu_bfyx_os_iyx_osv16",234], + "16037141448095945650": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "1608378717397996752": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "16758697697363920520": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "11640225461345567929": ["convolution_gpu_bfyx_os_iyx_osv16",653], + "11724225282274130518": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "3727142736386026852": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "11901740241052104941": ["convolution_gpu_bfyx_os_iyx_osv16",1031], + "16397733032387984819": ["convolution_gpu_bfyx_gemm_like",2], + "10384537928514123040": ["convolution_gpu_bfyx_gemm_like",2], + "13503608041359512": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "16527840366172690992": ["convolution_gpu_yxfb_yxio_b16",0], + "6129884455218252024": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "14151747022287993729": ["convolution_gpu_bfyx_gemm_like",2], + "17809920600993699808": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "18310667924071639899": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "17947818179123182001": 
["convolution_gpu_bfyx_os_iyx_osv16",749], + "11110173861174257158": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13713406612642090169": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "9019388470685749691": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "5219399418946822456": ["convolution_gpu_bfyx_gemm_like",2], + "13369603621524676979": ["convolution_gpu_bfyx_os_iyx_osv16",338], + "709835724029986012": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "1801731858063091191": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "14420809655798184553": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "9999955037598579164": ["convolution_gpu_bfyx_os_iyx_osv16",1059], + "6574971185849732667": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "14114380593731243715": ["convolution_gpu_bfyx_os_iyx_osv16",3], + "9702618600245321109": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "13739257060165119132": ["convolution_gpu_bfyx_os_iyx_osv16",929], + "14810839157236175179": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "14068780861332616363": ["convolution_gpu_bfyx_os_iyx_osv16",340], + "7692849839965441330": ["convolution_gpu_bfyx_gemm_like",2], + "17104611871050967957": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16833026567865627676": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "10309083227104422150": ["convolution_gpu_bfyx_os_iyx_osv16",888], + "12293786134765875615": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "5951936376654416075": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "3430266954211750407": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "5911282942658469852": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11975047184326016230": ["convolution_gpu_bfyx_gemm_like",2], + "2632535010129224704": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "16748662918272106932": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7024495439434892956": ["convolution_gpu_bfyx_os_iyx_osv16",292], + "15197248015210313435": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17015328096102652908": ["convolution_gpu_bfyx_gemm_like",1], + "7106362077449435105": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "14711697456265712456": ["convolution_gpu_bfyx_os_iyx_osv16",364], + "5060012838564094182": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "4806571630436601566": ["fully_connected_gpu_bf_io_input_spatial",4], + "8321769923556905957": ["convolution_gpu_bfyx_os_iyx_osv16",1115], + "2172121470071868949": ["convolution_gpu_bfyx_gemm_like",2], + "6509758095668864050": ["convolution_gpu_bfyx_gemm_like",2], + "10058165874008941852": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "9584652777232392944": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "1139581213977408268": ["fully_connected_gpu_fb_io_ref",1], + "17854208422879910606": ["convolution_gpu_bfyx_gemm_like",2], + "6192955702438301372": ["convolution_gpu_bfyx_os_iyx_osv16",264], + "9277610800970567810": ["convolution_gpu_bfyx_gemm_like",2], + "3141886504884887200": ["convolution_gpu_bfyx_gemm_like",1], + "4773123925616969670": ["convolution_gpu_bfyx_gemm_like",1], + "6310724136390087834": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "14916625550370402883": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "13973028408397200796": ["convolution_gpu_bfyx_os_iyx_osv16",795], + "14122213471825630433": ["convolution_gpu_bfyx_gemm_like",2], + "9522661528867955338": ["convolution_gpu_bfyx_gemm_like",1], + "6071668124835539929": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15839295895890205274": ["convolution_gpu_bfyx_gemm_like",2], + "18302892230881285207": 
["convolution_gpu_bfyx_gemm_like",1], + "5120466856097219243": ["convolution_gpu_bfyx_gemm_like",2], + "17281826959243966826": ["convolution_gpu_bfyx_os_iyx_osv16",1124], + "969746749329671447": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "12822126914959112382": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "3106922888635965020": ["convolution_gpu_bfyx_gemm_like",2], + "11455518069358829249": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "7172604084103519563": ["convolution_gpu_bfyx_os_iyx_osv16",889], + "12278364834477923930": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "15739278428190392018": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7843508201826629532": ["convolution_gpu_bfyx_os_iyx_osv16",266], + "15308578014507211237": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "4197617702037834389": ["convolution_gpu_bfyx_gemm_like",1], + "8767817856303586064": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "380316849107383484": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16579057939215877904": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "5390559917122707732": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "6863331059471727622": ["convolution_gpu_bfyx_gemm_like",2], + "8132521728369930959": ["convolution_gpu_bfyx_gemm_like",2], + "9649445293567537596": ["convolution_gpu_yxfb_yxio_b16",1], + "16601702334097258697": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "4021558014531645922": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "6222595759158615206": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7878605163588288309": ["convolution_gpu_bfyx_os_iyx_osv16",469], + "6263019986730305851": ["convolution_gpu_bfyx_os_iyx_osv16",857], + "11292995457386147494": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "13775529405693629438": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "9485825829394109934": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "17921973525603585874": ["convolution_gpu_bfyx_gemm_like",0], + "11324851661119942609": ["convolution_gpu_bfyx_os_iyx_osv16",137], + "509781001842353609": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "13105192484434299621": ["convolution_gpu_bfyx_gemm_like",1], + "73865742350616903": ["convolution_gpu_bfyx_gemm_like",1], + "16446533347502650316": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "2261453441277654139": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "378801963103874857": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "18059267466971880386": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "5485971317082563152": ["convolution_gpu_bfyx_os_iyx_osv16",645], + "3926585856863002495": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "17915846724151945664": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "7264756313770306662": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "16430562172386510259": ["convolution_gpu_bfyx_gemm_like",2], + "16783619135298589974": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "1497560475414454618": ["convolution_gpu_bfyx_gemm_like",0], + "12198263593657033426": ["convolution_gpu_bfyx_os_iyx_osv16",1030], + "16364494883229084045": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "9840495023131952174": ["convolution_gpu_winograd_6x3_s1_fused",2], + "14248239982355212178": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "16818714747882774917": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "8285478622349266483": ["convolution_gpu_bfyx_os_iyx_osv16",508], + "10437367877444543776": ["convolution_gpu_bfyx_gemm_like",0], + "1563987925712579649": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "14767888121198814523": ["convolution_gpu_bfyx_os_iyx_osv16",126], + 
"3792945601873900927": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "1201692134690347847": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "15914107501176673997": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "7546586420552408243": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "8203171222962341018": ["convolution_gpu_bfyx_gemm_like",2], + "17170858505976681742": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "3034482898462686729": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "14784115394395151055": ["convolution_gpu_bfyx_gemm_like",2], + "13404888565084206853": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "9065894438656900887": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "8140094412609934765": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "12214162812589030126": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "14001406016806064079": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14757749560543979231": ["convolution_gpu_bfyx_gemm_like",2], + "6772239376357727149": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "6398819277350155011": ["convolution_gpu_bfyx_os_iyx_osv16",1028], + "16426179645101678763": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "5308128387928804050": ["convolution_gpu_bfyx_gemm_like",2], + "3409043224171087168": ["convolution_gpu_bfyx_os_iyx_osv16",272], + "2625969259447793593": ["convolution_gpu_bfyx_1x1",2], + "8780604510524622314": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "12946531140050029900": ["convolution_gpu_bfyx_gemm_like",2], + "10058614204420018541": ["convolution_gpu_bfyx_os_iyx_osv16",379], + "9553032671453999824": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "2721793280965260548": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "17829047941256922307": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "6062246008880097669": ["fully_connected_gpu_bf_io_input_spatial",1], + "13933912937625580405": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "11703557271443535142": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "5093049998173715787": ["convolution_gpu_bfyx_gemm_like",2], + "7777333052643961206": ["convolution_gpu_bfyx_os_iyx_osv16",645], + "13781423818051299677": ["convolution_gpu_bfyx_gemm_like",2], + "14431607479949498164": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "14174888981602932979": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "16474284418841532356": ["convolution_gpu_bfyx_gemm_like",2], + "7407975398526425554": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "12864558900883069118": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1643241486250690844": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "5738835498104275267": ["convolution_gpu_bfyx_os_iyx_osv16",277], + "5331173521406046122": ["convolution_gpu_bfyx_os_iyx_osv16",740], + "5221320470007950766": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "10791067159964399241": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "3109104171383198425": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12696412964119109465": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "8100595788531468781": ["convolution_gpu_bfyx_os_iyx_osv16",754], + "7875724726741958520": ["fully_connected_gpu_fb_oi_ref",2], + "14763982961176216679": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "4674416595144505741": ["convolution_gpu_bfyx_gemm_like",1], + "9455406830371528486": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "6204725118764552662": ["convolution_gpu_bfyx_gemm_like",1], + "13733327241591630239": ["convolution_gpu_bfyx_os_iyx_osv16",1], + "11092828091552833150": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "2986189945936592561": 
["convolution_gpu_bfyx_os_iyx_osv16",646], + "3988024997010367546": ["convolution_gpu_bfyx_os_iyx_osv16",684], + "3336303478756453360": ["convolution_gpu_bfyx_gemm_like",2], + "3750338655074082587": ["fully_connected_gpu_fb_io_ref",0], + "18232278892738147217": ["convolution_gpu_bfyx_os_iyx_osv16",673], + "17839839336294937155": ["convolution_gpu_bfyx_gemm_like",1], + "1999979442136861875": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "10482582307328548806": ["convolution_gpu_bfyx_os_iyx_osv16",755], + "4229105529069729944": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "6556424924189200804": ["convolution_gpu_bfyx_os_iyx_osv16",236], + "3872151366780051246": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5504757952698692953": ["convolution_gpu_bfyx_os_iyx_osv16",1043], + "9222744127882324405": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "4780291919667721265": ["convolution_gpu_yxfb_yxio_b16",1], + "10792503079194374004": ["convolution_gpu_bfyx_os_iyx_osv16",192], + "10890975553758439233": ["convolution_gpu_bfyx_gemm_like",2], + "17477062954520561609": ["convolution_gpu_bfyx_gemm_like",2], + "9513032457323269513": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "15530407024531326375": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "3860603464276263676": ["convolution_gpu_bfyx_gemm_like",1], + "17465517455679097501": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "7179714714302073459": ["convolution_gpu_bfyx_os_iyx_osv16",48], + "5334190564423375247": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "3759515057574218101": ["convolution_gpu_bfyx_gemm_like",1], + "9731370183088819573": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "14621327324047759584": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "12450814729547235386": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "11430400968543668873": ["convolution_gpu_bfyx_os_iyx_osv16",892], + "16541722316343690197": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "14535007186125575064": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "17658152048177750315": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "12190841837604350271": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6848989271874647093": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "3985659568982275663": ["convolution_gpu_bfyx_os_iyx_osv16",1116], + "2649948006897488504": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "17995371099806008878": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "6904130543085920483": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "10328182165125764988": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "7264274394359484318": ["convolution_gpu_bfyx_os_iyx_osv16",363], + "7786866732196451977": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "2044363708106765326": ["convolution_gpu_bfyx_direct_10_12_16",1], + "18186615266760475767": ["convolution_gpu_bfyx_os_iyx_osv16",273], + "16522364268583242080": ["convolution_gpu_bfyx_gemm_like",2], + "13468713306678453952": ["convolution_gpu_bfyx_gemm_like",1], + "14991602704357959545": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "877436308867220589": ["convolution_gpu_bfyx_gemm_like",2], + "7232326270078161768": ["convolution_gpu_bfyx_gemm_like",2], + "6254141935545262078": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4282198629458668761": ["convolution_gpu_bfyx_gemm_like",1], + "15247381586316467097": ["convolution_gpu_bfyx_gemm_like",2], + "13855438905855887272": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "16084700435355748612": ["convolution_gpu_bfyx_os_iyx_osv16",609], + "9947449295659685973": ["convolution_gpu_bfyx_gemm_like",0], + 
"17515573322312447679": ["convolution_gpu_bfyx_os_iyx_osv16",297], + "11113256687741667688": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "8728178019712933221": ["convolution_gpu_bfyx_os_iyx_osv16",1044], + "17310332946322628458": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "13093429681061786539": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "6931953332823066530": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "14744368497944610864": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "18203935818408469865": ["convolution_gpu_bfyx_os_iyx_osv16",298], + "7807983899017500046": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "12977678792503377525": ["convolution_gpu_bfyx_gemm_like",1], + "11829442945690098558": ["convolution_gpu_bfyx_gemm_like",2], + "14230385851791760020": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "13856271274572142709": ["convolution_gpu_bfyx_gemm_like",1], + "6471563320494376693": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "4160656836528944651": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "3774285301357006334": ["convolution_gpu_bfyx_gemm_like",1], + "13283842370311517843": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "13183380647506951324": ["convolution_gpu_bfyx_gemm_like",2], + "17377293745073971167": ["convolution_gpu_winograd_6x3_s1_fused",0], + "70580716590540876": ["convolution_gpu_bfyx_gemm_like",1], + "13644681270630373984": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "3499109651698979012": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "4408600136502382976": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "12207503176295152756": ["convolution_gpu_bfyx_1x1",2], + "17094948685292534952": ["convolution_gpu_bfyx_os_iyx_osv16",923], + "1938086876393565238": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "17106086048442658788": ["convolution_gpu_bfyx_gemm_like",2], + "11308583200952256245": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "18432787283148809023": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "2235210915304938149": ["convolution_gpu_bfyx_gemm_like",2], + "17788367809717898285": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "541817615957967731": ["convolution_gpu_bfyx_os_iyx_osv16",1065], + "6980201892073961793": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "10783981060353445280": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "8354579049246302728": ["convolution_gpu_bfyx_os_iyx_osv16",612], + "8943913562339525413": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7372956570616880244": ["convolution_gpu_bfyx_os_iyx_osv16",234], + "17829148383265978140": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "4991419288164762786": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "17026284168840448378": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "7353563160591978243": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "9795194069954915563": ["convolution_gpu_bfyx_gemm_like",2], + "17178308105985812083": ["convolution_gpu_yxfb_yxio_b16",1], + "9813748068195103720": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "1778345646142852816": ["convolution_gpu_bfyx_gemm_like",1], + "2683507674615735878": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "1076005730007872492": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "879939701282942121": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "10000618285883395700": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "17596685300497748803": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17798636687709019154": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "14729854278671832528": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "4588420324030315321": 
["convolution_gpu_bfyx_os_iyx_osv16",598], + "15943141845766932879": ["convolution_gpu_bfyx_1x1",2], + "17248756229500447131": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "7683334381958571864": ["convolution_gpu_bfyx_gemm_like",2], + "4265693151382066296": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "15163327502374403643": ["convolution_gpu_bfyx_os_iyx_osv16",186], + "4561874206785244358": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "7975810844103449438": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "9839670675413379092": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "3167336012388169649": ["convolution_gpu_bfyx_os_iyx_osv16",574], + "3503893875515897267": ["convolution_gpu_bfyx_gemm_like",0], + "3761770343527826418": ["convolution_gpu_bfyx_os_iyx_osv16",715], + "18118237182023167949": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "419783127503173016": ["convolution_gpu_bfyx_os_iyx_osv16",897], + "10771803503544737080": ["convolution_gpu_bfyx_os_iyx_osv16",233], + "563440246018637010": ["convolution_gpu_yxfb_yxio_b16",0], + "3150231129728961455": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "16853250891250756537": ["convolution_gpu_bfyx_os_iyx_osv16",739], + "12238674883388043717": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "15381833359831622179": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "5056859994174498686": ["convolution_gpu_bfyx_gemm_like",2], + "7757331094141318304": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "14104238386345631681": ["convolution_gpu_winograd_6x3_s1_fused",1], + "7351401242363888463": ["convolution_gpu_bfyx_gemm_like",2], + "11107930597263802755": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "16986358655784856534": ["convolution_gpu_bfyx_gemm_like",2], + "13320828013530046693": ["convolution_gpu_bfyx_os_iyx_osv16",755], + "4672441137336208890": ["convolution_gpu_bfyx_gemm_like",2], + "5627834277145735283": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "4290840152278060614": ["convolution_gpu_bfyx_gemm_like",2], + "11583017348580874022": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "4226968857681929488": ["convolution_gpu_bfyx_os_iyx_osv16",44], + "14680730265621679042": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "17635171685500922207": ["convolution_gpu_bfyx_os_iyx_osv16",237], + "8860815977851486767": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "14716719350966652036": ["convolution_gpu_bfyx_gemm_like",1], + "5079055505117153635": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "10055549084854766170": ["convolution_gpu_bfyx_os_iyx_osv16",985], + "16692569816843207989": ["convolution_gpu_bfyx_os_iyx_osv16",1022], + "12107262410635772120": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "5192552432194195116": ["convolution_gpu_bfyx_gemm_like",2], + "2438374917504708831": ["convolution_gpu_bfyx_gemm_like",2], + "8615481457481938667": ["convolution_gpu_bfyx_os_iyx_osv16",797], + "14462438074931673266": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "10112032316939871435": ["convolution_gpu_bfyx_os_iyx_osv16",6], + "17377315194963069204": ["fully_connected_gpu_fb_oi_ref",1], + "17525564757769958678": ["convolution_gpu_bfyx_gemm_like",1], + "4202645222013675478": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "10729288973933590396": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "15779837958180258409": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "16348402367953880206": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "1051506168926530904": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "6438522646185979880": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + 
"2668729552208169959": ["convolution_gpu_bfyx_gemm_like",2], + "16362857896338778056": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "11857037689248685487": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "970768445746568749": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "4347816192417741558": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "16236397968499692493": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "5807196005360653656": ["convolution_gpu_bfyx_gemm_like",2], + "5629373398445592781": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "16312223896859176991": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "11418379777288974452": ["convolution_gpu_bfyx_gemm_like",2], + "577182964135927041": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "15799159401545270696": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16781127329510211966": ["convolution_gpu_bfyx_gemm_like",1], + "2940027113687311893": ["convolution_gpu_bfyx_gemm_like",1], + "5688623850477433571": ["convolution_gpu_bfyx_gemm_like",1], + "13468081302022888489": ["convolution_gpu_bfyx_gemm_like",2], + "2188101366183302888": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12517838703662330663": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "9541630719145326121": ["convolution_gpu_bfyx_gemm_like",1], + "1336940384521633733": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "6232363902828992968": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "3255465741612432300": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "14389719202147508599": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "16574710115918192418": ["convolution_gpu_bfyx_gemm_like",2], + "10377729875228238588": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "9729771183572950642": ["convolution_gpu_bfyx_gemm_like",2], + "7398196853452900099": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "2379484884827231127": ["fully_connected_gpu_bf_io_input_spatial",1], + "11254635684957519432": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "6067904130482758510": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "4716188972902735458": ["convolution_gpu_bfyx_os_iyx_osv16",297], + "8032685176029570383": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "1173986078589662704": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "14385185911482960528": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "17977676737774695825": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "7852745450437172519": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "2566302789609970663": ["convolution_gpu_bfyx_os_iyx_osv16",878], + "14719421757340260468": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "7431849514656037251": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "5336120047683197088": ["convolution_gpu_bfyx_gemm_like",2], + "598390166442977699": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "6712698149192186833": ["convolution_gpu_bfyx_gemm_like",2], + "9920155432685318259": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "7977195117668583981": ["convolution_gpu_bfyx_gemm_like",2], + "4366168099274266975": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "4338023436590582323": ["convolution_gpu_bfyx_os_iyx_osv16",801], + "10425889533411573166": ["convolution_gpu_bfyx_gemm_like",2], + "7969441643457570812": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "13328449155966085543": ["convolution_gpu_bfyx_gemm_like",2], + "7969848911698660033": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "5303970743736042689": ["convolution_gpu_bfyx_gemm_like",2], + "16811402686462277562": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "3291180926381314705": 
["convolution_gpu_bfyx_os_iyx_osv16",658], + "13597240991532942069": ["convolution_gpu_bfyx_os_iyx_osv16",113], + "11936530628363072904": ["convolution_gpu_bfyx_gemm_like",0], + "10968768803038046390": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "7575675354187625951": ["convolution_gpu_bfyx_gemm_like",2], + "18210370419559876426": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "16461809076899645037": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "1779941298820543013": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "11706378390483804857": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "8561261337239934159": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15695415285791951018": ["convolution_gpu_bfyx_gemm_like",2], + "3599823735065658574": ["convolution_gpu_bfyx_os_iyx_osv16",428], + "4062706195708729345": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "5103094815475470596": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "989564341557094953": ["convolution_gpu_bfyx_os_iyx_osv16",55], + "4766071144928072260": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "6769243149577568817": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "5409924335138540834": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "1172103288112689821": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "69439315851965666": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "16402312692470500253": ["convolution_gpu_bfyx_gemm_like",2], + "13224814158106791463": ["convolution_gpu_bfyx_gemm_like",2], + "11834683513280095384": ["convolution_gpu_winograd_6x3_s1_fused",1], + "12081835728078383819": ["fully_connected_gpu_bf_io_input_spatial",0], + "16566128345135114558": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "11327097771110264965": ["convolution_gpu_bfyx_os_iyx_osv16",944], + "7806129039150321333": ["convolution_gpu_bfyx_gemm_like",2], + "4772696293208603817": ["convolution_gpu_bfyx_gemm_like",1], + "4703107905652287491": ["convolution_gpu_bfyx_gemm_like",2], + "14805540705424073865": ["convolution_gpu_bfyx_gemm_like",2], + "3673781117412048086": ["convolution_gpu_bfyx_os_iyx_osv16",272], + "6780215829176686721": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "11025471731438443683": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15720507574336564201": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "2012181953284568566": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "9153779186876518773": ["convolution_gpu_bfyx_gemm_like",1], + "10555597973766215754": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "12309955719964788034": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "5597908143491399643": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "5594180958505308003": ["convolution_gpu_bfyx_os_iyx_osv16",645], + "17193614571243427089": ["convolution_gpu_bfyx_gemm_like",2], + "8931169575495985034": ["convolution_gpu_bfyx_gemm_like",2], + "17025324057045572535": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13330734840729670622": ["convolution_gpu_bfyx_gemm_like",1], + "724953082687879224": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "8272823732258536202": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "8650948093564284852": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "9671459469252116568": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "16436006771518788093": ["fully_connected_gpu_bs_f_bsv16_af8_vload",0], + "5095827462645341808": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "11893541520830049036": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "1103204698908514224": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "6040286126398028933": ["convolution_gpu_winograd_6x3_s1_fused",2], + 
"7072606962946873975": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "543472136359161929": ["convolution_gpu_bfyx_gemm_like",2], + "17640725195881101275": ["convolution_gpu_bfyx_gemm_like",2], + "7669403041163460089": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17723621158215826108": ["convolution_gpu_bfyx_gemm_like",2], + "12693511427898130707": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "12929981792125924963": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "14046990030104971367": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "18271689282126907793": ["convolution_gpu_bfyx_os_iyx_osv16",1091], + "5019077257951332016": ["convolution_gpu_bfyx_gemm_like",2], + "3277243911383750280": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "10432365444137108781": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "3868149953087814447": ["convolution_gpu_bfyx_gemm_like",1], + "435888248913413834": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "14759179293743468995": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "3930314908786112883": ["convolution_gpu_bfyx_gemm_like",1], + "15178921033274918199": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16053585286807864356": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "13497225521878034159": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11855070245618904113": ["convolution_gpu_bfyx_os_iyx_osv16",951], + "1095495157025479260": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "14136097914489095982": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "11754316727756881612": ["convolution_gpu_bfyx_os_iyx_osv16",108], + "11626402549863483301": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "17252449599613270108": ["convolution_gpu_bfyx_os_iyx_osv16",731], + "17515064188391421150": ["convolution_gpu_bfyx_gemm_like",1], + "5312140481706133684": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "12541834857357563605": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "5159738930501638535": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "713121569924250372": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "16816222375242496370": ["convolution_gpu_winograd_6x3_s1_fused",2], + "15609627722687211129": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "2912858944747613525": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "8809017515482311843": ["convolution_gpu_bfyx_os_iyx_osv16",754], + "1403617451623027879": ["convolution_gpu_bfyx_os_iyx_osv16",1043], + "15511138074959300404": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "7881187047171099732": ["convolution_gpu_bfyx_gemm_like",1], + "9726913113016874092": ["convolution_gpu_bfyx_gemm_like",2], + "11661208196482963286": ["convolution_gpu_bfyx_os_iyx_osv16",48], + "4769003637955328938": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14046114605615338907": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "8860443174052454332": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7025975403069487257": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "2265784112305305260": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "3477539135137665170": ["convolution_gpu_bfyx_gemm_like",2], + "14466032674083938714": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16370218798911151331": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "1509728225855233852": ["convolution_gpu_bfyx_gemm_like",1], + "16295660312557315941": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "4914474312076193952": ["convolution_gpu_bfyx_gemm_like",1], + "1996860183441418841": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "15690161340392005765": ["convolution_gpu_bfyx_os_iyx_osv16",746], + 
"4499586349553581439": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "3814584042139408454": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "11690334177981352452": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "15984885011101717258": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "3806761527342944195": ["convolution_gpu_bfyx_gemm_like",2], + "12134712464763856064": ["convolution_gpu_winograd_6x3_s1_fused",0], + "4362304842016958728": ["convolution_gpu_bfyx_os_iyx_osv16",1061], + "2231648183489019418": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "487214150851213303": ["convolution_gpu_bfyx_gemm_like",2], + "15438530452161762045": ["convolution_gpu_yxfb_yxio_b16",1], + "13477548641580029772": ["convolution_gpu_bfyx_gemm_like",1], + "11022847760121601465": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "14221578799010900252": ["convolution_gpu_bfyx_gemm_like",2], + "15235409162483701027": ["convolution_gpu_bfyx_gemm_like",1], + "16489624657475712467": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "3643250372952944907": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "7638626850074132214": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "13131740479277027362": ["fully_connected_gpu_bf_io_gemm",2], + "11462462742322068863": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "17087740929472936216": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "13754540732991287617": ["convolution_gpu_bfyx_gemm_like",2], + "4553409514380460123": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "18067291256808591467": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "4850497746076450913": ["convolution_gpu_bfyx_gemm_like",2], + "11311859068168414878": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "8108843303778211282": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "10672380526821947133": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "14447191095937730964": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "10306542963828398049": ["convolution_gpu_bfyx_os_iyx_osv16",740], + "15217573782563469232": ["convolution_gpu_yxfb_yxio_b16",0], + "7940369586324090841": ["convolution_gpu_bfyx_gemm_like",2], + "17035903590837750750": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "18375125668176498051": ["convolution_gpu_bfyx_gemm_like",2], + "852092858392507925": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "11806402239500046867": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "700717277178942679": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "4238885454989272754": ["convolution_gpu_bfyx_gemm_like",1], + "13459514533473657102": ["convolution_gpu_bfyx_os_iyx_osv16",687], + "856877003890134554": ["convolution_gpu_bfyx_os_iyx_osv16",343], + "4236174000795439083": ["convolution_gpu_bfyx_gemm_like",2], + "13300022131572486202": ["convolution_gpu_bfyx_gemm_like",2], + "18218755616248669884": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "14540578324750869319": ["convolution_gpu_bfyx_gemm_like",0], + "5157249499936659040": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "2096779676054335057": ["convolution_gpu_bfyx_gemm_like",2], + "11626398907755088688": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10987953316324712538": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "17201365233492366678": ["convolution_gpu_bfyx_gemm_like",2], + "15115780248032030963": ["convolution_gpu_yxfb_yxio_b16",0], + "3041612155708729812": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "733956743303342862": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16043683538361975370": ["convolution_gpu_bfyx_gemm_like",2], + "1077773457856682663": ["convolution_gpu_bfyx_os_iyx_osv16",178], + 
"13190888313721073437": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15026219694198820614": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "12581879452540858313": ["convolution_gpu_bfyx_os_iyx_osv16",284], + "428659495445490820": ["convolution_gpu_bfyx_os_iyx_osv16",929], + "5600128039063009632": ["convolution_gpu_bfyx_direct_10_12_16",2], + "792684262493086891": ["convolution_gpu_bfyx_os_iyx_osv16",1042], + "15497797842820949408": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "2649192407401044065": ["convolution_gpu_bfyx_gemm_like",2], + "11582534256623549131": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "9519623751582710696": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "1186545671730357033": ["convolution_gpu_bfyx_gemm_like",2], + "6324565723045697080": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "4818231379191523896": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "10989937450490049763": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "5240181393417899912": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "1920042803083729276": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "1742897526168249500": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "14045927407431718832": ["convolution_gpu_bfyx_gemm_like",2], + "11130439225010714550": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "1701609125136907870": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "11970881115757095265": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "17009318615658405230": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "14089893422771228191": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "17209528805596238905": ["convolution_gpu_bfyx_gemm_like",2], + "11856266545854830143": ["convolution_gpu_bfyx_gemm_like",2], + "10308431308942416781": ["convolution_gpu_bfyx_gemm_like",2], + "5649150695527000655": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "18037918102910297531": ["convolution_gpu_bfyx_os_iyx_osv16",356], + "12962558681443556219": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "17713034180977313726": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "3392693938352572136": ["convolution_gpu_bfyx_gemm_like",2], + "9366201112659847392": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "580936360000782237": ["fully_connected_gpu_bf_io_input_spatial",1], + "4161001033681779582": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "17649961873981897621": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "10900880512948479338": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "12667014405537239093": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "2526832080529662683": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "12793908914872030220": ["convolution_gpu_bfyx_gemm_like",2], + "10795104632256101599": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "9416186718345824095": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "2194607895573544953": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "875400109066360897": ["convolution_gpu_bfyx_gemm_like",2], + "15980348884716629349": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12003323477818208825": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "18109284647478027063": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "12868739680413736657": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "8402692278765063674": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "10316451248440741901": ["convolution_gpu_bfyx_gemm_like",1], + "13025323039227543550": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "13210604117940125947": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "1249137685908951501": ["convolution_gpu_bfyx_os_iyx_osv16",696], + 
"2173720698351153121": ["convolution_gpu_bfyx_gemm_like",2], + "12647099325257717945": ["convolution_gpu_bfyx_gemm_like",1], + "11086699387784339943": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "8083672466967374860": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "13296242326766100583": ["convolution_gpu_bfyx_os_iyx_osv16",136], + "2608363732937932266": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "5646139101524964833": ["convolution_gpu_bfyx_gemm_like",1], + "15602863681196390535": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "12831123539633580270": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "16609136488331186895": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "4994591211723226974": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "8329846097322076175": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "17793292063552633023": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "2704063557078535883": ["convolution_gpu_bfyx_os_iyx_osv16",1049], + "3653156933813711765": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "7678457226823073886": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "4544242784357021697": ["convolution_gpu_bfyx_gemm_like",1], + "142650579335909103": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "751912075185318190": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "14026537760442360645": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "13002363400738122017": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "10022487076451608714": ["convolution_gpu_bfyx_gemm_like",2], + "11062100629646715785": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5824801192141531089": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "15860915170591763391": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "941626985322260281": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "3308770992373192529": ["convolution_gpu_bfyx_gemm_like",2], + "12136029303893296753": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "15705908639736679687": ["convolution_gpu_yxfb_yxio_b16",0], + "11897113890115321056": ["convolution_gpu_bfyx_os_iyx_osv16",418], + "15713964605078748923": ["convolution_gpu_bfyx_gemm_like",1], + "4708035980731751007": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11443268857010762276": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "13649894122307008732": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "10811837819834149164": ["convolution_gpu_bfyx_gemm_like",1], + "6845814820599174031": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15641537661939240413": ["convolution_gpu_bfyx_gemm_like",2], + "9928406318940388716": ["convolution_gpu_bfyx_gemm_like",1], + "6065819201836017182": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "17050675313067213312": ["convolution_gpu_bfyx_os_iyx_osv16",1055], + "8746621720912032145": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "14098811155652990436": ["convolution_gpu_bfyx_os_iyx_osv16",371], + "7649413902932043811": ["convolution_gpu_bfyx_gemm_like",2], + "13384754476437374504": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "14331658870024759698": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "11529876081402974396": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "14115742296883450319": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17738299860390552088": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "11318913630213187720": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "8501145642605270365": ["convolution_gpu_bfyx_gemm_like",2], + "5644068493155655611": ["convolution_gpu_bfyx_gemm_like",2], + "7880845322716481548": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "578703329577922869": 
["convolution_gpu_bfyx_os_iyx_osv16",184], + "5659168916726488798": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "6942016672941874829": ["convolution_gpu_bfyx_gemm_like",2], + "9367157746678824712": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "4056979460327024961": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "3215659303601163167": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "11845189428639322474": ["convolution_gpu_bfyx_os_iyx_osv16",652], + "9437794960375526230": ["convolution_gpu_bfyx_os_iyx_osv16",319], + "388828310152538138": ["convolution_gpu_bfyx_os_iyx_osv16",1091], + "4635570915184713874": ["convolution_gpu_bfyx_gemm_like",2], + "654821507679356726": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "12355112948013108181": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9257078583742821465": ["convolution_gpu_bfyx_os_iyx_osv16",646], + "16463823433924519300": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "10025839973092358719": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "6981537186704688907": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "5179760459095053114": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "17310409067211414565": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "641417817126876622": ["convolution_gpu_bfyx_gemm_like",2], + "9404677451270692749": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "7941729567451949422": ["convolution_gpu_bfyx_gemm_like",2], + "14289082888174784976": ["convolution_gpu_bfyx_gemm_like",1], + "10727592780669452048": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "4644580321919256401": ["convolution_gpu_bfyx_os_iyx_osv16",675], + "14281201038135286621": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "7624476043779763605": ["convolution_gpu_bfyx_os_iyx_osv16",192], + "17216583849049249733": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "17856816245251319111": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "7134654288295280046": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "5385316497510064491": ["fully_connected_gpu_fb_oi_ref",2], + "9525535670799618110": ["convolution_gpu_bfyx_gemm_like",2], + "1663285216972929652": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "17084977396231597605": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4623542918584461522": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "8251544171504007740": ["convolution_gpu_bfyx_gemm_like",2], + "13488495920546871271": ["convolution_gpu_bfyx_os_iyx_osv16",853], + "7870154008378361670": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2525260242689556544": ["convolution_gpu_bfyx_gemm_like",2], + "10565371760124443824": ["convolution_gpu_bfyx_os_iyx_osv16",347], + "4274801141127703532": ["convolution_gpu_bfyx_os_iyx_osv16",993], + "7056030150365552588": ["convolution_gpu_bfyx_gemm_like",2], + "3272017687600371031": ["convolution_gpu_bfyx_gemm_like",2], + "1138439260035360722": ["convolution_gpu_bfyx_direct_10_12_16",1], + "946479876892100082": ["convolution_gpu_bfyx_gemm_like",1], + "5720964268093705079": ["convolution_gpu_bfyx_direct_10_12_16",1], + "953306082374100275": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "3350601287664242323": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "14668725050395069435": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "13833960927635646899": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6860503758000008398": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "5010119207726811326": ["convolution_gpu_bfyx_os_iyx_osv16",1052], + "879896719155824868": ["convolution_gpu_bfyx_gemm_like",2], + "10492056481694320580": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + 
"11706446082856895571": ["convolution_gpu_bfyx_os_iyx_osv16",264], + "13485300684443803732": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "7958459862276998225": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "14100870590396726248": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "17329287216741045059": ["convolution_gpu_bfyx_gemm_like",2], + "5558136691773431495": ["convolution_gpu_bfyx_os_iyx_osv16",989], + "13538051178827008933": ["convolution_gpu_bfyx_os_iyx_osv16",160], + "12809199739984715013": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "6854611304056079417": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "3746573775462003750": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "17705807503894740726": ["convolution_gpu_bfyx_gemm_like",2], + "5524215233998361104": ["convolution_gpu_winograd_6x3_s1_fused",2], + "15662207751131195569": ["convolution_gpu_bfyx_gemm_like",2], + "12626014184575881530": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "18012549942299450620": ["convolution_gpu_bfyx_gemm_like",1], + "10436819182310112786": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "4456004887590847716": ["convolution_gpu_bfyx_gemm_like",2], + "3863816884636503247": ["convolution_gpu_bfyx_gemm_like",2], + "3212789693085089063": ["convolution_gpu_bfyx_gemm_like",2], + "2497756607567197523": ["convolution_gpu_bfyx_os_iyx_osv16",246], + "18268811652302076976": ["convolution_gpu_bfyx_gemm_like",1], + "17870874477143985774": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "11337525286386930242": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "5170245731599664670": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "4435224497850514394": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "14733510474010040334": ["convolution_gpu_bfyx_gemm_like",2], + "6585223640997887253": ["convolution_gpu_bfyx_gemm_like",2], + "4569338575782832784": ["convolution_gpu_bfyx_gemm_like",2], + "13324157125165576832": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "14974730512607138726": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "6635217802203685464": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "6181651715051152713": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "9723314434598141024": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "10023279637210292010": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "9133263538092913983": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "10037086825900566930": ["convolution_gpu_bfyx_os_iyx_osv16",518], + "1622880009460832832": ["convolution_gpu_bfyx_os_iyx_osv16",494], + "9243949750444156746": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16705621644424684055": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "2066731703492755469": ["convolution_gpu_bfyx_os_iyx_osv16",304], + "15464327246951632247": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "6181272224000872375": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "9942726414918759892": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13865227850818392065": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "13681462437496627948": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "4010419602093863685": ["convolution_gpu_yxfb_yxio_b16",0], + "7732899312577293959": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "2623687018437195679": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "16403423801823379909": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "15488550074426713959": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "15217183882858251099": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "10084794570892043447": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "16234606052818596502": 
["convolution_gpu_bfyx_os_iyx_osv16",997], + "5637480705139132901": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "9131235538209388787": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "14152716242882609401": ["convolution_gpu_bfyx_gemm_like",1], + "16884228931101540030": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "12976499206227689731": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "8543619733732987550": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12090536142661253835": ["fully_connected_gpu_bf_io_gemm",1], + "9468684953949274635": ["convolution_gpu_bfyx_gemm_like",1], + "9798585825695496550": ["convolution_gpu_bfyx_gemm_like",2], + "2572395498687401679": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "291868903926685441": ["convolution_gpu_bfyx_gemm_like",2], + "17759505449240263390": ["convolution_gpu_bfyx_os_iyx_osv16",310], + "3007637520820789085": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "9480653639044390919": ["convolution_gpu_bfyx_os_iyx_osv16",1061], + "7472330881076141262": ["convolution_gpu_bfyx_gemm_like",1], + "8854234880878427078": ["convolution_gpu_bfyx_gemm_like",2], + "15702382940521972117": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "14363654136811880073": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "8975333906619899020": ["convolution_gpu_bfyx_gemm_like",2], + "6139574161497189424": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9423958333298993923": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "2653651564133701304": ["convolution_gpu_bfyx_gemm_like",2], + "1251525426317284548": ["convolution_gpu_bfyx_os_iyx_osv16",755], + "6020017927557041768": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16995873636564597028": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "6988492019664525206": ["convolution_gpu_bfyx_os_iyx_osv16",716], + "9803492989444302959": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "17041468169694105561": ["convolution_gpu_yxfb_yxio_b16",0], + "8866716292621164810": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "2322559721899919275": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "6942049339361951275": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "10782611933832492335": ["convolution_gpu_bfyx_gemm_like",2], + "12164298124869114517": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "3285520504090196295": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "13447028922679236865": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "15924583510704449214": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "863952266514375915": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "12467673564660108244": ["convolution_gpu_bfyx_os_iyx_osv16",306], + "14923692894655929923": ["fully_connected_gpu_bf_io_gemm",1], + "15916505622570323098": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "17222005830854879661": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "2659712601063515059": ["convolution_gpu_winograd_6x3_s1_fused",1], + "12065769091972094756": ["convolution_gpu_bfyx_os_iyx_osv16",987], + "6093575518270471235": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "15106614232165315070": ["convolution_gpu_bfyx_gemm_like",1], + "17025268985366223779": ["convolution_gpu_bfyx_os_iyx_osv16",857], + "9849272539053219052": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "16039372573821594566": ["convolution_gpu_bfyx_gemm_like",2], + "11728824117049687850": ["convolution_gpu_bfyx_gemm_like",1], + "2355214244972870639": ["convolution_gpu_bfyx_os_iyx_osv16",944], + "16431165572426232677": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "12625112690264223217": ["convolution_gpu_bfyx_os_iyx_osv16",264], + 
"7937870623766562191": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "3499645386058307669": ["convolution_gpu_bfyx_gemm_like",1], + "8104309105061227444": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "3372770576629463160": ["convolution_gpu_bfyx_gemm_like",1], + "11992625045241269569": ["convolution_gpu_bfyx_os_iyx_osv16",1063], + "2884499360870038648": ["convolution_gpu_yxfb_yxio_b16",1], + "11267742746905371769": ["convolution_gpu_bfyx_os_iyx_osv16",665], + "6341197991729122563": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "16616945998593626851": ["convolution_gpu_bfyx_os_iyx_osv16",750], + "18077281411861416889": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "2305461098719675735": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "5055133356846736609": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "7474639594232203854": ["convolution_gpu_bfyx_os_iyx_osv16",97], + "490233152678323691": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "7334966010680206302": ["convolution_gpu_bfyx_gemm_like",2], + "7966454753124154534": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "12112853999307505628": ["convolution_gpu_bfyx_gemm_like",2], + "8141428150264829362": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "17490471699618303993": ["convolution_gpu_bfyx_os_iyx_osv16",234], + "6962268765187856246": ["convolution_gpu_bfyx_gemm_like",2], + "18103534417093702556": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "16711142379173254655": ["convolution_gpu_yxfb_yxio_b16",0], + "14444475853714164129": ["convolution_gpu_bfyx_gemm_like",2], + "13526488884846845330": ["convolution_gpu_bfyx_gemm_like",0], + "14057348639391787117": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "9604982746455852556": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "6656593119788274992": ["convolution_gpu_bfyx_gemm_like",1], + "12553441041059632729": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1173136780324694038": ["convolution_gpu_yxfb_yxio_b16",0], + "11083993858285515074": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "6634330132674952638": ["convolution_gpu_bfyx_os_iyx_osv16",240], + "8619526128410675593": ["convolution_gpu_bfyx_gemm_like",2], + "3621424752591567930": ["convolution_gpu_bfyx_gemm_like",2], + "4135003545872878882": ["convolution_gpu_bfyx_os_iyx_osv16",568], + "11052732052072367261": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "5334566325056222430": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "1218323229202187514": ["convolution_gpu_bfyx_gemm_like",2], + "5873257164958285393": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "9785114056964539323": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "17006095064160484022": ["convolution_gpu_bfyx_os_iyx_osv16",425], + "4967444801764057340": ["convolution_gpu_bfyx_os_iyx_osv16",1114], + "15586047342916704364": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "17108987360340581555": ["fully_connected_gpu_bf_io_input_spatial",2], + "9062774198518904260": ["convolution_gpu_bfyx_gemm_like",2], + "17364712285968437405": ["convolution_gpu_bfyx_os_iyx_osv16",640], + "10556089809203693400": ["convolution_gpu_bfyx_os_iyx_osv16",1092], + "17775705003104146872": ["convolution_gpu_bfyx_os_iyx_osv16",137], + "15696910741835640150": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "10046663998164493552": ["convolution_gpu_bfyx_os_iyx_osv16",357], + "8195881973746570408": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "5922142661777925178": ["convolution_gpu_bfyx_gemm_like",1], + "9275303306340702111": ["convolution_gpu_bfyx_gemm_like",1], + "10483664832302187567": ["convolution_gpu_bfyx_os_iyx_osv16",383], 
+ "9741607635826869269": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13893808009363736870": ["convolution_gpu_bfyx_gemm_like",2], + "13951781924205611716": ["convolution_gpu_bfyx_os_iyx_osv16",646], + "17951403431757222177": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "8337820318779061494": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "1632416005093914709": ["convolution_gpu_bfyx_direct_10_12_16",1], + "18122858611264877646": ["convolution_gpu_bfyx_gemm_like",1], + "9454512817077883797": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "10774528268153772208": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "15739274921308457528": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "11147816119060617810": ["convolution_gpu_bfyx_os_iyx_osv16",122], + "10512507780534402341": ["convolution_gpu_bfyx_os_iyx_osv16",640], + "12978370505631031751": ["convolution_gpu_bfyx_gemm_like",2], + "5211191663202250117": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "15993427814066246646": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "5334291640387922287": ["convolution_gpu_bfyx_os_iyx_osv16",1113], + "16266491618150971928": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "15192024816519005250": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "1882052795393187384": ["convolution_gpu_bfyx_os_iyx_osv16",665], + "12802517759474139810": ["convolution_gpu_bfyx_gemm_like",2], + "10808909442136736629": ["convolution_gpu_bfyx_gemm_like",2], + "18356980026934328781": ["convolution_gpu_bfyx_os_iyx_osv16",1043], + "2065752819810364738": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "7706714181281908433": ["convolution_gpu_bfyx_gemm_like",0], + "7840653268996892538": ["convolution_gpu_bfyx_gemm_like",0], + "10700011669103135203": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "6428098122005804378": ["convolution_gpu_bfyx_os_iyx_osv16",651], + "15897477855246170861": ["convolution_gpu_bfyx_os_iyx_osv16",297], + "13206826317378863148": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "10415046594066474634": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "2683304757433993300": ["convolution_gpu_bfyx_gemm_like",2], + "721174714308243785": ["convolution_gpu_bfyx_gemm_like",2], + "2204178900998688268": ["convolution_gpu_bfyx_gemm_like",2], + "9740466267717175474": ["convolution_gpu_bfyx_gemm_like",1], + "8526484907799590618": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "407189201971322683": ["convolution_gpu_bfyx_os_iyx_osv16",792], + "383721620126444793": ["convolution_gpu_bfyx_gemm_like",1], + "9882204352209412039": ["convolution_gpu_bfyx_gemm_like",1], + "3855859061709004677": ["convolution_gpu_bfyx_gemm_like",2], + "10548792624072794724": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "5020788604681810984": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "12026482841341343242": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "1963081583851864291": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2370837049876630969": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "16437124655147660375": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "12397280593466519809": ["convolution_gpu_bfyx_gemm_like",2], + "9311802150474489673": ["convolution_gpu_bfyx_os_iyx_osv16",183], + "12058759356433220258": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "1192279884248226739": ["convolution_gpu_bfyx_os_iyx_osv16",1114], + "5303170164698694791": ["fully_connected_gpu_bf_io_gemm",2], + "3266557807508325807": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "5183231560876991543": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "12707946849050970702": 
["convolution_gpu_bfyx_os_iyx_osv16",899], + "13938466156916423478": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "17928043901784474130": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "761169277744593430": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "13993548620104010490": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "7561096442572829049": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12641170321047008726": ["convolution_gpu_bfyx_gemm_like",2], + "8640150341228170279": ["convolution_gpu_bfyx_os_iyx_osv16",305], + "1478419046264331178": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "1036010477232750453": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "12514693341682532560": ["convolution_gpu_bfyx_os_iyx_osv16",951], + "10294185397756053636": ["convolution_gpu_bfyx_gemm_like",2], + "4161141078006269526": ["convolution_gpu_bfyx_gemm_like",2], + "14147460733160099960": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12512751736409465214": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1545105800386716684": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "7026575758396092435": ["convolution_gpu_bfyx_os_iyx_osv16",174], + "5763440554939527411": ["convolution_gpu_bfyx_os_iyx_osv16",137], + "9692654253261175490": ["convolution_gpu_bfyx_gemm_like",2], + "5393510569127725391": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "10178951466584845110": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "13973179950424276578": ["convolution_gpu_bfyx_os_iyx_osv16",878], + "15354185859262170540": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "15595549493819416194": ["convolution_gpu_bfyx_os_iyx_osv16",105], + "2040762223425679479": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "8253823502854784432": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "216603198215625772": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "1875764913306932583": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "17900257435531434807": ["convolution_gpu_bfyx_gemm_like",2], + "8747430148550634190": ["convolution_gpu_bfyx_gemm_like",0], + "18416908414174464784": ["convolution_gpu_bfyx_gemm_like",1], + "10971070835319242371": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "8655883535274781128": ["convolution_gpu_bfyx_gemm_like",1], + "11634932044447867039": ["convolution_gpu_bfyx_gemm_like",2], + "17037416417174266088": ["convolution_gpu_bfyx_gemm_like",1], + "12421204749289937399": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15392077168521832549": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "13558618754911056302": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "4678607855896512523": ["convolution_gpu_bfyx_gemm_like",2], + "8490260671996115530": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "5406129421969383274": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "15640202505592598653": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "15727611564408173858": ["convolution_gpu_bfyx_gemm_like",1], + "59739211822469868": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "10416622008071151225": ["convolution_gpu_bfyx_os_iyx_osv16",547], + "6114147683777615071": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "16913004986170202203": ["convolution_gpu_bfyx_gemm_like",2], + "9891428775774615719": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "12900949103593247293": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9381304526221508530": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "10753540518493641553": ["convolution_gpu_bfyx_direct_10_12_16",0], + "13608239208821071914": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "826850797666395121": ["convolution_gpu_bfyx_gemm_like",1], + 
"18043340998699622388": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "6553736978928374036": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "17542176922797334839": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "14117801387057507639": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "9152451371616153112": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11461581290174106570": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "2730604806511016352": ["convolution_gpu_bfyx_os_iyx_osv16",906], + "14823616678465136590": ["convolution_gpu_winograd_6x3_s1_fused",2], + "6014752258124559691": ["convolution_gpu_yxfb_yxio_b16",1], + "15805087418686802636": ["convolution_gpu_bfyx_gemm_like",1], + "15110359240685619357": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "938222258370511187": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "17907223570737272640": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "15643135666029727865": ["convolution_gpu_bfyx_gemm_like",1], + "745009493367761775": ["convolution_gpu_bfyx_gemm_like",2], + "7201521533301617290": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13760645810144930270": ["convolution_gpu_bfyx_os_iyx_osv16",602], + "5353552956675518468": ["convolution_gpu_bfyx_os_iyx_osv16",835], + "13853056718266488510": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "3106591708459602370": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "13762042713029963144": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5941298590926032148": ["convolution_gpu_bfyx_direct_10_12_16",0], + "17806712457019493207": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "6300691162962736560": ["convolution_gpu_bfyx_os_iyx_osv16",1030], + "4885944395876887711": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "941829593638869991": ["convolution_gpu_bfyx_os_iyx_osv16",804], + "12954154886708228545": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "14667209474639064623": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "3236003754884728510": ["convolution_gpu_bfyx_gemm_like",2], + "7688176479120305539": ["convolution_gpu_bfyx_os_iyx_osv16",840], + "13738760763969959522": ["convolution_gpu_bfyx_gemm_like",2], + "15094664469997373662": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "15890473622821659630": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "9868561386826862471": ["convolution_gpu_winograd_6x3_s1_fused",2], + "5141753233513623264": ["convolution_gpu_bfyx_os_iyx_osv16",793], + "8025053805734757314": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "9220830217525628783": ["convolution_gpu_bfyx_gemm_like",2], + "11878734040194151073": ["convolution_gpu_bfyx_direct_10_12_16",0], + "5589350202160007768": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11185156002426041243": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "7460672405409009037": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "16432425079146486467": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "3800011935243649447": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "6302958994152837045": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "15790005937034794347": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "10892706534058849825": ["convolution_gpu_bfyx_os_iyx_osv16",1045], + "14553856088069405595": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "15948383678216076358": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "12601126285773042005": ["convolution_gpu_bfyx_os_iyx_osv16",686], + "10320711719466983961": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "13590444711975157776": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10408322429232132983": 
["convolution_gpu_bfyx_os_iyx_osv16",358], + "8939683514448064461": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "2418288192668085805": ["convolution_gpu_bfyx_gemm_like",2], + "6057433908801727873": ["convolution_gpu_bfyx_gemm_like",2], + "15160738482264643601": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "1154228007901031779": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "13144385730409574259": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "4800587664660105589": ["fully_connected_gpu_bf_io_input_spatial",0], + "11726298758004767743": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "6129602738379919488": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "8737417433314100353": ["convolution_gpu_bfyx_gemm_like",2], + "18136135457402651842": ["convolution_gpu_winograd_6x3_s1_fused",1], + "9226443907548972870": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "2842103889477438816": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "15602218079503030465": ["convolution_gpu_bfyx_gemm_like",2], + "4479979951990338510": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "11799179287124317845": ["convolution_gpu_bfyx_gemm_like",1], + "2004120786408087671": ["convolution_gpu_bfyx_gemm_like",1], + "1059505639883914386": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11649407835105973949": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "1902656726461670148": ["convolution_gpu_bfyx_gemm_like",2], + "18419183012101393192": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "13011676362747785816": ["convolution_gpu_bfyx_gemm_like",2], + "4892959859293355837": ["convolution_gpu_bfyx_gemm_like",1], + "1889171157980977747": ["convolution_gpu_bfyx_gemm_like",2], + "385046297070779752": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "16286085532892593349": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "1096671695414716274": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "12478309735214802531": ["convolution_gpu_bfyx_os_iyx_osv16",1078], + "10264913782610095832": ["convolution_gpu_bfyx_os_iyx_osv16",893], + "1573498199681662714": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "15411474884532403722": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "7780140599533242850": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14603590053512154268": ["convolution_gpu_bfyx_os_iyx_osv16",299], + "805221045541170643": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "15190508870639648203": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3388752887767453958": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "12609361477548272638": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "8837721075413149240": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14907097142953816744": ["convolution_gpu_bfyx_gemm_like",2], + "13178480813522103091": ["fully_connected_gpu_bf_io_gemm",1], + "13646974121952099172": ["convolution_gpu_bfyx_gemm_like",1], + "9270950131920019932": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "5132761922124425835": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "9824678205469832038": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "4560479630843098090": ["convolution_gpu_bfyx_gemm_like",2], + "5003718302026277632": ["convolution_gpu_bfyx_os_iyx_osv16",653], + "18218631037214746168": ["convolution_gpu_bfyx_gemm_like",2], + "8856888761246057127": ["convolution_gpu_bfyx_gemm_like",1], + "12812685418923919055": ["convolution_gpu_bfyx_os_iyx_osv16",758], + "15293727142789007900": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "14274685812676150168": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "16561224775421968533": ["convolution_gpu_bfyx_os_iyx_osv16",2], + 
"474139120607442270": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4861982518177129729": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "3860667078458481972": ["convolution_gpu_bfyx_gemm_like",1], + "17243648226968859637": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15778834188130183853": ["convolution_gpu_bfyx_os_iyx_osv16",554], + "12087141795291232248": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "15494543914974994991": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "2495655464941634884": ["convolution_gpu_bfyx_os_iyx_osv16",238], + "3036808833459559381": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "8460847842045253466": ["convolution_gpu_bfyx_os_iyx_osv16",11], + "10110395703775498948": ["convolution_gpu_bfyx_os_iyx_osv16",752], + "4285475880886685878": ["convolution_gpu_bfyx_gemm_like",2], + "12741457056869452536": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "10861525139715322534": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "13575423234109624706": ["fully_connected_gpu_yxfb_ref",1], + "17163595630291422874": ["convolution_gpu_bfyx_gemm_like",2], + "17306482303091342504": ["convolution_gpu_bfyx_os_iyx_osv16",1091], + "18417288692814472127": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "9482749589540764069": ["convolution_gpu_yxfb_yxio_b16",1], + "3441335188113424896": ["convolution_gpu_bfyx_gemm_like",1], + "12421707187947291166": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "8153567933591966877": ["convolution_gpu_bfyx_gemm_like",2], + "10130171279527667782": ["convolution_gpu_bfyx_gemm_like",1], + "4479117540570599742": ["convolution_gpu_bfyx_gemm_like",2], + "5039037192630609823": ["convolution_gpu_bfyx_gemm_like",2], + "3909551222373722085": ["convolution_gpu_bfyx_os_iyx_osv16",497], + "14746359019867963124": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15619086801947147359": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "3273748387141431306": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "14025496192869856801": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "15320845027635796583": ["convolution_gpu_bfyx_gemm_like",2], + "7662200927459001757": ["convolution_gpu_winograd_6x3_s1_fused",2], + "6109013751635776331": ["convolution_gpu_bfyx_gemm_like",2], + "14650567822254940018": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "18357544235608006954": ["convolution_gpu_bfyx_gemm_like",1], + "13503688893307029975": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "8061914949376516780": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4232250144427804891": ["fully_connected_gpu_bf_io_input_spatial",1], + "13486084204140096478": ["convolution_gpu_bfyx_gemm_like",2], + "16779678846332091086": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "16729849855476690294": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "8761283252495354972": ["convolution_gpu_bfyx_gemm_like",1], + "3239033622277917802": ["convolution_gpu_bfyx_os_iyx_osv16",363], + "15216108478837665623": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "16131448347558322280": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "939718260623752240": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "9441060601228656341": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "4999505377862312410": ["fully_connected_gpu_bf_io_gemm",1], + "1353170363915443814": ["convolution_gpu_bfyx_gemm_like",1], + "2968031010495399536": ["convolution_gpu_bfyx_gemm_like",2], + "15847413004526420496": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "10576856554114055028": ["convolution_gpu_bfyx_gemm_like",2], + "3819990462129075757": 
["convolution_gpu_bfyx_os_iyx_osv16",368], + "1854612313463195535": ["convolution_gpu_yxfb_yxio_b16",1], + "1316444335300814745": ["convolution_gpu_bfyx_os_iyx_osv16",619], + "12796777049340516563": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "14883438809987378616": ["convolution_gpu_bfyx_1x1",2], + "10001963042016663554": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12531580106484042446": ["convolution_gpu_bfyx_os_iyx_osv16",985], + "17993337310288098038": ["convolution_gpu_bfyx_gemm_like",2], + "16949056117405140365": ["convolution_gpu_bfyx_gemm_like",2], + "4801117903303888658": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "2321767794934000238": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "17208186152576814861": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "8479958930889587809": ["fully_connected_gpu_fb_io_ref",2], + "15334195300678132907": ["fully_connected_gpu_bf_io_gemm",1], + "7877332346656934022": ["convolution_gpu_bfyx_os_iyx_osv16",1092], + "2140514316203117958": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "4424217045094988504": ["convolution_gpu_bfyx_os_iyx_osv16",645], + "3788462090984291082": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "4085907608404305515": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "8071957466247137919": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "6791806088355877039": ["convolution_gpu_bfyx_gemm_like",0], + "14841539539334726292": ["convolution_gpu_bfyx_os_iyx_osv16",938], + "10783630257421062891": ["convolution_gpu_bfyx_os_iyx_osv16",1092], + "10133054058562198093": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "994842991399671507": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17021925795809437171": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14962768577232034246": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "12229574562535756991": ["convolution_gpu_bfyx_gemm_like",2], + "1760830986937165861": ["convolution_gpu_bfyx_os_iyx_osv16",363], + "13815395589135469450": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "15154700439767512396": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "5083163738120585821": ["fully_connected_gpu_fb_io_ref",2], + "8203550467004532364": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "3220280315905987373": ["convolution_gpu_bfyx_gemm_like",2], + "18253784177599134876": ["convolution_gpu_bfyx_os_iyx_osv16",560], + "17700958439420868719": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "9119618606914671839": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "10290107543739998181": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "11595387512434355394": ["convolution_gpu_bfyx_gemm_like",2], + "1616603916015535857": ["fully_connected_gpu_bf_io_input_spatial",2], + "4933831571091731212": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3810356382905059819": ["convolution_gpu_bfyx_gemm_like",1], + "4642234334824303290": ["convolution_gpu_bfyx_os_iyx_osv16",548], + "14578291812739325465": ["convolution_gpu_bfyx_os_iyx_osv16",1013], + "13663893159182636270": ["convolution_gpu_bfyx_os_iyx_osv16",375], + "8258382025812748961": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "16683485007140805060": ["fully_connected_gpu_fb_io_ref",2], + "7009873605945341897": ["convolution_gpu_bfyx_gemm_like",2], + "10967218651864700933": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "13793441296561946357": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "3622409603053918029": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18400379759523099542": ["convolution_gpu_bfyx_gemm_like",1], + "2920322372993101148": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], 
+ "956022649859563080": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "14206076551739831333": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4871907623235871050": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "10848277915422577656": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "9372916528346260712": ["convolution_gpu_bfyx_gemm_like",2], + "6323026044750482867": ["convolution_gpu_bfyx_os_iyx_osv16",1092], + "2627779045483019709": ["convolution_gpu_bfyx_os_iyx_osv16",847], + "12494969618927201911": ["fully_connected_gpu_fb_oi_ref",1], + "4714289593698160876": ["convolution_gpu_yxfb_yxio_b16",2], + "1089944493540593798": ["convolution_gpu_bfyx_gemm_like",1], + "6681818065741882453": ["convolution_gpu_bfyx_gemm_like",2], + "16000753982895054944": ["convolution_gpu_bfyx_gemm_like",1], + "5941092474669713339": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5835634465164771899": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "12961109385388101976": ["convolution_gpu_yxfb_yxio_b16",1], + "60749853744407778": ["convolution_gpu_bfyx_gemm_like",2], + "12700372241799686527": ["convolution_gpu_bfyx_gemm_like",1], + "12076058470574246054": ["convolution_gpu_bfyx_os_iyx_osv16",273], + "17549411807772646930": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "8264178890341675354": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "16469788155263456039": ["convolution_gpu_bfyx_gemm_like",2], + "11169292427557543138": ["convolution_gpu_bfyx_os_iyx_osv16",281], + "2783577080556699089": ["convolution_gpu_bfyx_gemm_like",1], + "14487682847898298214": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "4191326605459754690": ["convolution_gpu_bfyx_os_iyx_osv16",617], + "13621339501067135142": ["convolution_gpu_bfyx_gemm_like",2], + "13253775441326432265": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "5779388310240896974": ["convolution_gpu_bfyx_os_iyx_osv16",835], + "12522495848240087966": ["convolution_gpu_bfyx_gemm_like",1], + "878892264408839067": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "17442035600389810700": ["convolution_gpu_bfyx_gemm_like",2], + "8859895010324601937": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "4121109463284708890": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "5912303851874077576": ["convolution_gpu_bfyx_gemm_like",2], + "7994179151788368291": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "10294610483561043024": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "10544034939133448916": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "4795705973706796563": ["fully_connected_gpu_bf_io_input_spatial",1], + "10280619408766255552": ["convolution_gpu_bfyx_gemm_like",2], + "18313088176414428990": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "17382660912493284320": ["convolution_gpu_bfyx_os_iyx_osv16",616], + "13352000946213986936": ["convolution_gpu_bfyx_os_iyx_osv16",303], + "8398910340371320955": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "18068050257421269408": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "5145853681977610916": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "13853630125050609175": ["convolution_gpu_bfyx_os_iyx_osv16",691], + "8124736388338424498": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "9514210061704584354": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10702234389482091891": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "9269175963143039426": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "12788968383428254917": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "5848293219267886434": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "2597453794298356435": 
["convolution_gpu_bfyx_os_iyx_osv16",202], + "9100044555742394133": ["convolution_gpu_bfyx_os_iyx_osv16",926], + "52089503050497755": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "14930789530046665855": ["convolution_gpu_bfyx_gemm_like",0], + "2915165824085219545": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "4086556132337751931": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "3621930417735246405": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "3177304125602972370": ["convolution_gpu_bfyx_direct_10_12_16",0], + "220326805056361171": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "10016815108730511683": ["convolution_gpu_bfyx_gemm_like",1], + "2909728331855309274": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "529543453251381109": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "11163107409437069532": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "5592556538784745960": ["convolution_gpu_bfyx_gemm_like",2], + "9707630588260222630": ["convolution_gpu_bfyx_gemm_like",2], + "17050143605017295447": ["convolution_gpu_bfyx_gemm_like",2], + "15065019229949449623": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8594644182487917002": ["convolution_gpu_winograd_6x3_s1_fused",2], + "6288489890578212082": ["convolution_gpu_bfyx_gemm_like",1], + "15447513376965243034": ["convolution_gpu_bfyx_os_iyx_osv16",47], + "9987415314864002460": ["convolution_gpu_bfyx_os_iyx_osv16",714], + "3782239800777370325": ["convolution_gpu_bfyx_gemm_like",1], + "5834245904292669645": ["convolution_gpu_bfyx_os_iyx_osv16",543], + "15047676717402283805": ["convolution_gpu_bfyx_os_iyx_osv16",347], + "11919129623429545762": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "13850920989756588064": ["convolution_gpu_bfyx_gemm_like",0], + "11772741918108731396": ["convolution_gpu_bfyx_os_iyx_osv16",137], + "17316626950179740845": ["convolution_gpu_bfyx_os_iyx_osv16",145], + "5291011077679733990": ["convolution_gpu_bfyx_gemm_like",2], + "5041111302824362529": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "9043982883185435219": ["convolution_gpu_bfyx_os_iyx_osv16",613], + "12668149981216388765": ["convolution_gpu_bfyx_os_iyx_osv16",397], + "9695024256541464964": ["convolution_gpu_bfyx_gemm_like",1], + "10730222715353420212": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "2857337999074313592": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "60267878504897170": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "8642107585829380438": ["convolution_gpu_bfyx_gemm_like",1], + "3895088069642140043": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8689206546467098603": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "10295330953350618042": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "5782934278345953016": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "2345023488044002149": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "14091610802555875119": ["convolution_gpu_bfyx_gemm_like",2], + "2908249767551054613": ["convolution_gpu_bfyx_gemm_like",2], + "3491333679577961640": ["convolution_gpu_bfyx_os_iyx_osv16",968], + "17101789600628162503": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1569043950563130463": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16352331970945217438": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "15881381297320383917": ["convolution_gpu_winograd_6x3_s1_fused",2], + "16264774056719724826": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "14902389080201926109": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "17704040183891532914": ["convolution_gpu_bfyx_os_iyx_osv16",152], + "7915318733663535312": ["convolution_gpu_bfyx_os_iyx_osv16",590], + 
"649203303142950236": ["convolution_gpu_bfyx_os_iyx_osv16",756], + "2294800960010879540": ["convolution_gpu_bfyx_gemm_like",2], + "12181889163404078773": ["convolution_gpu_bfyx_gemm_like",2], + "13026555349791486777": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "789359733867650915": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "7606728651572102823": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "10728212277329722684": ["convolution_gpu_bfyx_gemm_like",2], + "6149673627320838019": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "15979956159651515122": ["convolution_gpu_bfyx_gemm_like",2], + "15591167992985613695": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "13550435052563656432": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "17281202179589913619": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "7578177053220150569": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "11465965972527519631": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13484950419220835364": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "12992194515157698316": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "10892456883214928095": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "2732519635571994212": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "13932662890258900896": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "3138374672801504481": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "3399406641489305996": ["convolution_gpu_bfyx_os_iyx_osv16",612], + "1954052357826969119": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "10690972785852373520": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "1082586642383386489": ["convolution_gpu_bfyx_gemm_like",1], + "16761856644242716357": ["convolution_gpu_bfyx_os_iyx_osv16",846], + "1419073145594317633": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "11239754372812258455": ["convolution_gpu_bfyx_os_iyx_osv16",46], + "6631816968511312100": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "17446505012657609153": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "16027853590391209100": ["convolution_gpu_bfyx_gemm_like",0], + "14429081455612806819": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "16749148369456398030": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13092232276822302626": ["convolution_gpu_bfyx_os_iyx_osv16",685], + "4865102850562917067": ["convolution_gpu_bfyx_os_iyx_osv16",99], + "15078590909693331731": ["convolution_gpu_bfyx_gemm_like",2], + "5277400567128489977": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "13698389420396031586": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "7457899998356343871": ["convolution_gpu_bfyx_os_iyx_osv16",1044], + "4450409744922989123": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "1081962464388501987": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "11693134363909241514": ["convolution_gpu_yxfb_yxio_b16",2], + "10724501418439612080": ["convolution_gpu_bfyx_gemm_like",2], + "3499106702307464480": ["convolution_gpu_bfyx_gemm_like",2], + "3362190082518348071": ["convolution_gpu_bfyx_gemm_like",2], + "16894871557229780934": ["convolution_gpu_bfyx_os_iyx_osv16",141], + "13483088320871913126": ["convolution_gpu_bfyx_gemm_like",1], + "9191832520273617003": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8482147530539941792": ["convolution_gpu_bfyx_gemm_like",2], + "7565221050911842393": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "3199841714087553410": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "8159303545761286685": ["convolution_gpu_bfyx_os_iyx_osv16",346], + "16542318967217020315": ["convolution_gpu_bfyx_gemm_like",2], + "14026570177552137240": ["convolution_gpu_bfyx_gemm_like",2], + 
"15466940145773097237": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "2221145174704245189": ["convolution_gpu_bfyx_gemm_like",1], + "9439431829175743345": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4803370483104261655": ["convolution_gpu_bfyx_gemm_like",1], + "9321208819255762521": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "5622089373755094139": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "4185398348055518182": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "16882092367103683293": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3002986032379998259": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "153117141968471446": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "13176385389367548697": ["convolution_gpu_bfyx_gemm_like",1], + "6882621854468565774": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "10642327923162019888": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "17442105631503326136": ["convolution_gpu_bfyx_gemm_like",2], + "9533360488591027707": ["fully_connected_gpu_fb_io_b8_f8_vload",2], + "1867337342417952506": ["convolution_gpu_bfyx_gemm_like",2], + "6777045876155144709": ["convolution_gpu_bfyx_os_iyx_osv16",803], + "15961487889420208188": ["convolution_gpu_bfyx_os_iyx_osv16",683], + "3691705516240577130": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "5240706676373148280": ["convolution_gpu_bfyx_gemm_like",2], + "11868551452004726281": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "12894625941923144893": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "18154019240019929225": ["convolution_gpu_bfyx_gemm_like",1], + "13912843078550000960": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "9759380701896779097": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "15548971488532746290": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13852065717057446998": ["convolution_gpu_bfyx_gemm_like",2], + "14546281065004619074": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "8873614802459592665": ["convolution_gpu_bfyx_gemm_like",2], + "14289048840489035546": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "12159582810513550491": ["convolution_gpu_bfyx_gemm_like",1], + "6988674007771237080": ["convolution_gpu_bfyx_gemm_like",2], + "16307464696265537356": ["convolution_gpu_bfyx_gemm_like",2], + "17961702508543961900": ["convolution_gpu_bfyx_os_iyx_osv16",655], + "9631481972809246378": ["convolution_gpu_bfyx_os_iyx_osv16",211], + "5109636469531439569": ["convolution_gpu_yxfb_yxio_b16",2], + "18215430801133520364": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "14532844474906286088": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "2294318010381635693": ["convolution_gpu_bfyx_gemm_like",2], + "10682918518101379579": ["fully_connected_gpu_bf_io_input_spatial",2], + "15884763176333003771": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "5673972310424776040": ["convolution_gpu_bfyx_gemm_like",1], + "14031009077471784948": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "11717348577195224554": ["convolution_gpu_bfyx_gemm_like",2], + "3889519976910355277": ["fully_connected_gpu_bf_io_input_spatial",2], + "10014448860206587805": ["convolution_gpu_bfyx_gemm_like",1], + "11845013061234102293": ["convolution_gpu_bfyx_gemm_like",2], + "16955653765071712611": ["convolution_gpu_bfyx_os_iyx_osv16",268], + "16862145184923128012": ["convolution_gpu_bfyx_os_iyx_osv16",615], + "4355933224673863178": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "9819596940685093690": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "3087801652564627458": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "13820498543284008286": ["convolution_gpu_bfyx_gemm_like",1], + 
"5429130923188159806": ["convolution_gpu_bfyx_os_iyx_osv16",476], + "3976736548270395981": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "75742659105146536": ["convolution_gpu_bfyx_os_iyx_osv16",1036], + "13418701036204748812": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "4584970211859494304": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4974320417566990034": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "13071545223094862275": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "15809639778580769565": ["convolution_gpu_bfyx_gemm_like",2], + "7130694811424715594": ["convolution_gpu_bfyx_os_iyx_osv16",362], + "2394023805427701338": ["convolution_gpu_bfyx_os_iyx_osv16",991], + "16992405636352406660": ["convolution_gpu_bfyx_gemm_like",0], + "16731107540370927220": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3796274347773622633": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "10995907213890714701": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "15899192375330393731": ["convolution_gpu_bfyx_os_iyx_osv16",94], + "2294026590516781945": ["convolution_gpu_bfyx_gemm_like",1], + "3526580286148537369": ["convolution_gpu_bfyx_gemm_like",2], + "9194788897910888066": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "603883331897298932": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "2095802691829304676": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "16025442470600124062": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "3706994659266083979": ["convolution_gpu_bfyx_os_iyx_osv16",554], + "17711453305763476458": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "18128162750557822655": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "6084775920382972735": ["convolution_gpu_bfyx_os_iyx_osv16",1051], + "9794456440994218671": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "12801481303602178879": ["convolution_gpu_bfyx_gemm_like",2], + "7808544677773370430": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "9452470718398027950": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "12794369485239257709": ["convolution_gpu_bfyx_gemm_like",2], + "16044646335477470657": ["convolution_gpu_bfyx_gemm_like",2], + "11327228813412934262": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "9933958860597451711": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "15493488989417521388": ["convolution_gpu_bfyx_os_iyx_osv16",1112], + "13038533272699602337": ["convolution_gpu_bfyx_gemm_like",1], + "13327653786981478088": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "7394217382008802567": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "17309326904418811234": ["convolution_gpu_bfyx_os_iyx_osv16",182], + "5740745357953479527": ["convolution_gpu_bfyx_gemm_like",2], + "17542414935564676110": ["convolution_gpu_bfyx_os_iyx_osv16",428], + "5448537627319798272": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "2727219457659794468": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "14353390922580547467": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "11956435900037329302": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "5235375820995365354": ["convolution_gpu_bfyx_os_iyx_osv16",889], + "1351033666248868977": ["convolution_gpu_bfyx_os_iyx_osv16",723], + "4112696777811320312": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "7075659071934895087": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15975964562807570772": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "4488336106517889531": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "4381329435655511217": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "7799984350284425885": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "14544219140091420262": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "846088275031979661": ["convolution_gpu_winograd_6x3_s1_fused",2], + "15488340031228619748": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "10279778381617181802": ["convolution_gpu_bfyx_os_iyx_osv16",994], + "8127570953237266335": ["fully_connected_gpu_bf_io_input_spatial",2], + "5485749317130402302": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "2007192658799516915": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "11473442921040533207": ["convolution_gpu_bfyx_os_iyx_osv16",1092], + "16075006181495932250": ["convolution_gpu_bfyx_gemm_like",1], + "5831419373611158773": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "17845905249343189063": ["convolution_gpu_bfyx_gemm_like",2], + "6203765709597125063": ["convolution_gpu_bfyx_gemm_like",1], + "15271783562528081169": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "5592428580503282095": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "12055647521556218046": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "1040650352205493707": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "1941341635794709702": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "11198301748997371475": ["convolution_gpu_bfyx_gemm_like",1], + "11490143853656040028": ["convolution_gpu_bfyx_gemm_like",2], + "6156831095718536092": ["convolution_gpu_bfyx_os_iyx_osv16",233], + "16720108310653948550": ["convolution_gpu_winograd_6x3_s1_fused",2], + "3102816736961785641": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "10499265278415026816": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9954050478761346921": ["convolution_gpu_bfyx_os_iyx_osv16",970], + "13893789954946953427": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "6942622405269419082": ["convolution_gpu_bfyx_os_iyx_osv16",421], + "14559308665571750465": ["convolution_gpu_bfyx_gemm_like",2], + "2903605246599054308": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "13723543003759101485": ["convolution_gpu_bfyx_gemm_like",1], + "11499219760597131534": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "13472577372534605883": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8961138963663532667": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "17123153447808465303": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "7000486794832106857": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "7369903937189508744": ["convolution_gpu_bfyx_os_iyx_osv16",276], + "14224121742920800990": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "1089679781525023551": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "9173631510896381179": ["convolution_gpu_bfyx_gemm_like",2], + "3216793152416217495": ["convolution_gpu_bfyx_gemm_like",2], + "6673966852801136416": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "2162882863309264684": ["convolution_gpu_bfyx_gemm_like",2], + "1972879521448306536": ["convolution_gpu_bfyx_gemm_like",2], + "10869005786136023160": ["convolution_gpu_bfyx_os_iyx_osv16",422], + "6334639534663495263": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "6213386558868267629": ["convolution_gpu_bfyx_os_iyx_osv16",640], + "18372284940315010254": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "8306337702797456793": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "669771152920944125": ["convolution_gpu_bfyx_gemm_like",0], + "8541982562061181756": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5550969016335082071": ["convolution_gpu_bfyx_gemm_like",0], + "1594612401422787491": ["convolution_gpu_bfyx_gemm_like",2], + "2973436171295280783": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "905780459938651623": ["convolution_gpu_bfyx_os_iyx_osv16",733], + 
"15078262396281327048": ["convolution_gpu_bfyx_gemm_like",0], + "16698547937652264447": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "12874626654611400042": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "14691372262153587653": ["convolution_gpu_bfyx_os_iyx_osv16",668], + "6297802534570892679": ["convolution_gpu_bfyx_os_iyx_osv16",568], + "5326247361632903583": ["convolution_gpu_bfyx_gemm_like",2], + "1474271081523145413": ["convolution_gpu_bfyx_gemm_like",2], + "18286006396667126860": ["convolution_gpu_bfyx_gemm_like",1], + "4079026972040047969": ["convolution_gpu_bfyx_gemm_like",1], + "13800387305792597325": ["convolution_gpu_bfyx_os_iyx_osv16",291], + "2030309697153345387": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "6025872155179042054": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "18245935804520236353": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "226601879759378771": ["convolution_gpu_bfyx_os_iyx_osv16",613], + "11718418772370938734": ["convolution_gpu_bfyx_os_iyx_osv16",80], + "3682813162987778705": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "15410074937424854348": ["convolution_gpu_bfyx_os_iyx_osv16",95], + "2111669705686676421": ["convolution_gpu_bfyx_os_iyx_osv16",609], + "1952863937205473292": ["convolution_gpu_bfyx_os_iyx_osv16",653], + "7119182041840303390": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "11215217005872946038": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "10912495395422146386": ["convolution_gpu_bfyx_gemm_like",2], + "13163146272900339330": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "916389941321470163": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "5381578460674280089": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "17318287523550546026": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "3001615302961701154": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "192209423643075326": ["convolution_gpu_bfyx_gemm_like",1], + "11033507346101404633": ["fully_connected_gpu_fb_oi_ref",0], + "4099859307693687554": ["convolution_gpu_bfyx_os_iyx_osv16",423], + "6142707387281700290": ["convolution_gpu_bfyx_gemm_like",2], + "18423051691107460439": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "15114370307779942381": ["convolution_gpu_bfyx_os_iyx_osv16",104], + "875142032423622622": ["convolution_gpu_bfyx_os_iyx_osv16",499], + "4717620775314557374": ["convolution_gpu_bfyx_gemm_like",1], + "10632020369698615114": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "5953754321266570854": ["convolution_gpu_bfyx_os_iyx_osv16",1095], + "7800015766976654402": ["convolution_gpu_bfyx_gemm_like",2], + "12218337369633748663": ["convolution_gpu_bfyx_os_iyx_osv16",944], + "9999553425206328238": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "9641089659148164809": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "2349007644347065353": ["convolution_gpu_bfyx_gemm_like",2], + "13654816209891478730": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "12319073009094248232": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "5195511638783481084": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "138379779469699309": ["convolution_gpu_bfyx_gemm_like",1], + "17638692805430115529": ["convolution_gpu_bfyx_os_iyx_osv16",1092], + "1652781065871883392": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "994489782629179836": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "14283458015244508428": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "4282668574670785584": ["convolution_gpu_bfyx_gemm_like",1], + "1706927777850488363": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "11559360678008060513": 
["convolution_gpu_bfyx_os_iyx_osv16",798], + "18082422341304348326": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "314054598858070952": ["convolution_gpu_bfyx_gemm_like",2], + "15628121900226431719": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "14038261392627717712": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "14103112843209793966": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "11192356850081328892": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12672995204641007004": ["convolution_gpu_bfyx_os_iyx_osv16",608], + "7113777272518482528": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "6233612563637601101": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "11948858355027908365": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11334122788337402526": ["convolution_gpu_bfyx_1x1",2], + "397770940444464146": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "17427036330773218054": ["convolution_gpu_bfyx_os_iyx_osv16",677], + "2844746478867668588": ["convolution_gpu_bfyx_gemm_like",2], + "7009735776703529573": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "7843498978148810586": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "13613399861925108148": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "3409255127071376537": ["convolution_gpu_bfyx_gemm_like",2], + "2242829490403202087": ["convolution_gpu_bfyx_gemm_like",1], + "11929531534620071758": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "2129726780118554358": ["convolution_gpu_bfyx_os_iyx_osv16",740], + "6443517114667332732": ["convolution_gpu_bfyx_os_iyx_osv16",548], + "12782191856884962803": ["convolution_gpu_bfyx_gemm_like",2], + "10019470094545733255": ["convolution_gpu_bfyx_gemm_like",0], + "11060822686394981344": ["convolution_gpu_bfyx_gemm_like",1], + "488298169768725160": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "7368916076070115064": ["convolution_gpu_bfyx_os_iyx_osv16",889], + "4006884370026272807": ["convolution_gpu_bfyx_gemm_like",2], + "16425665058951535484": ["convolution_gpu_bfyx_os_iyx_osv16",609], + "7824524940405130010": ["convolution_gpu_winograd_6x3_s1_fused",2], + "2800949804770763798": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7667898603371717971": ["convolution_gpu_bfyx_os_iyx_osv16",714], + "2797723586312707948": ["convolution_gpu_bfyx_gemm_like",2], + "7602222004475424358": ["convolution_gpu_bfyx_gemm_like",1], + "9090828337597312855": ["convolution_gpu_bfyx_gemm_like",2], + "12353956380178079089": ["convolution_gpu_bfyx_gemm_like",2], + "12557015880639217508": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "12936220888307335332": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "11622925573287101001": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2850279308978256234": ["convolution_gpu_bfyx_gemm_like",2], + "1451466106918423837": ["convolution_gpu_bfyx_os_iyx_osv16",803], + "5893940382830835820": ["convolution_gpu_bfyx_os_iyx_osv16",358], + "6688522645556262131": ["convolution_gpu_bfyx_os_iyx_osv16",246], + "13611054146745413536": ["convolution_gpu_bfyx_gemm_like",2], + "15838113905712517735": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "17489680436564779197": ["convolution_gpu_bfyx_os_iyx_osv16",750], + "7998930863626763670": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "1626430741965136732": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "5796500397424307442": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "18174857480705846286": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "659846949368492111": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "5415319660821122528": ["fully_connected_gpu_bf_io_input_spatial",2], + 
"1653274345637156919": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "18424400171776141118": ["convolution_gpu_bfyx_gemm_like",1], + "11806105193035393795": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "12181607120522804433": ["convolution_gpu_bfyx_os_iyx_osv16",340], + "7726714223809300966": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "10546430708947911124": ["convolution_gpu_bfyx_gemm_like",1], + "12995903177757437362": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "101401523793806394": ["convolution_gpu_bfyx_gemm_like",2], + "12680339228267704518": ["convolution_gpu_bfyx_os_iyx_osv16",121], + "12069726772532946193": ["convolution_gpu_bfyx_os_iyx_osv16",509], + "16723478941106779069": ["convolution_gpu_bfyx_os_iyx_osv16",654], + "4856470441452830056": ["convolution_gpu_bfyx_gemm_like",2], + "13960388312976163971": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2802810524370514276": ["convolution_gpu_bfyx_gemm_like",2], + "1471837664358450291": ["convolution_gpu_bfyx_gemm_like",2], + "13434576226708227155": ["convolution_gpu_bfyx_os_iyx_osv16",7], + "10991423760161409883": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "15378025640603637387": ["convolution_gpu_bfyx_gemm_like",2], + "13809330759308309353": ["convolution_gpu_bfyx_gemm_like",1], + "13607830451968188080": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "8797843396807284399": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "4466647043226271996": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "2582625260054352916": ["convolution_gpu_bfyx_gemm_like",2], + "15650839696475698676": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "5994204139128667921": ["convolution_gpu_bfyx_os_iyx_osv16",49], + "3715177305271762194": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "16011429608661242565": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "4747159205186229582": ["convolution_gpu_bfyx_os_iyx_osv16",484], + "1711220333751274603": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "4378422094110940766": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16865879032845300007": ["convolution_gpu_bfyx_os_iyx_osv16",421], + "4239415134522959352": ["convolution_gpu_bfyx_gemm_like",2], + "11155444222714959508": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "14990645740260870030": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "6003409324516527726": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "3779229442395464456": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13734043898517059207": ["convolution_gpu_bfyx_gemm_like",0], + "1791615587935799399": ["convolution_gpu_bfyx_os_iyx_osv16",427], + "7962991673727743706": ["convolution_gpu_bfyx_os_iyx_osv16",2], + "5963901433137582265": ["convolution_gpu_bfyx_gemm_like",2], + "9213886570531053949": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "15322019609805777935": ["convolution_gpu_bfyx_os_iyx_osv16",944], + "2041212737963974230": ["convolution_gpu_bfyx_gemm_like",2], + "156456996459945842": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "13919204232414535363": ["convolution_gpu_bfyx_os_iyx_osv16",372], + "11744368351982723504": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "2543041530639980505": ["convolution_gpu_bfyx_os_iyx_osv16",292], + "16833854122884184025": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "16936366288366370882": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "9131183544020825260": ["convolution_gpu_bfyx_os_iyx_osv16",1090], + "16881283637687482989": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "12617625046664709483": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "6949539207944972855": 
["convolution_gpu_bfyx_gemm_like",2], + "9285566577169147378": ["convolution_gpu_bfyx_os_iyx_osv16",943], + "9714508918051740792": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3797957937905580811": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "9399994156762372761": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "9378269524012289175": ["convolution_gpu_bfyx_gemm_like",2], + "1939140810847988694": ["convolution_gpu_bfyx_gemm_like",1], + "5495063314176654751": ["convolution_gpu_bfyx_gemm_like",2], + "1157947252370351851": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "1885075753696445410": ["convolution_gpu_bfyx_direct_10_12_16",1], + "60509335250891515": ["convolution_gpu_bfyx_gemm_like",1], + "11883485911218628865": ["convolution_gpu_bfyx_os_iyx_osv16",340], + "14381420852659789698": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "6146876760962332928": ["convolution_gpu_bfyx_gemm_like",2], + "9300767936311837876": ["convolution_gpu_bfyx_gemm_like",0], + "6343888265369366589": ["convolution_gpu_bfyx_os_iyx_osv16",944], + "7307271009495440764": ["convolution_gpu_bfyx_os_iyx_osv16",339], + "9810904714798127155": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "5420766967862917815": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "8107447526839063293": ["convolution_gpu_bfyx_os_iyx_osv16",396], + "5275016494706355806": ["convolution_gpu_bfyx_os_iyx_osv16",540], + "4720851194954041037": ["convolution_gpu_bfyx_os_iyx_osv16",340], + "4983880246908724272": ["convolution_gpu_bfyx_os_iyx_osv16",1016], + "16120988958246503683": ["convolution_gpu_bfyx_os_iyx_osv16",640], + "4792351255949877935": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "2307310127637739872": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "2114232149447438823": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "10789133352712755945": ["convolution_gpu_yxfb_yxio_b16",1], + "11077503608116183709": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "1237262535285717993": ["convolution_gpu_bfyx_os_iyx_osv16",272], + "3928266232090746643": ["convolution_gpu_bfyx_os_iyx_osv16",794], + "4149728557142033774": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "8090497202997192142": ["convolution_gpu_bfyx_os_iyx_osv16",338], + "3737576893817599311": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "15962137123591591534": ["convolution_gpu_bfyx_os_iyx_osv16",716], + "15352245788978088971": ["convolution_gpu_bfyx_os_iyx_osv16",1094], + "4793007249026943006": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6008613375871089139": ["convolution_gpu_bfyx_os_iyx_osv16",5], + "8236018377815149638": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "1364546124782880196": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "1435153323458789173": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "18199526506796726885": ["convolution_gpu_bfyx_os_iyx_osv16",241], + "1890739204389692970": ["convolution_gpu_bfyx_os_iyx_osv16",4], + "18084635102736402756": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "7272538316511343863": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "12643423612381102003": ["convolution_gpu_bfyx_os_iyx_osv16",889], + "8434794604559592624": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "4138968242532400395": ["convolution_gpu_bfyx_gemm_like",1], + "11599932445375240727": ["convolution_gpu_bfyx_os_iyx_osv16",568], + "15193403354218116460": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "13569941893504840630": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "8275277322582733101": ["convolution_gpu_bfyx_os_iyx_osv16",424], + "5374664689223295796": 
["convolution_gpu_bfyx_os_iyx_osv16",1121], + "6048964584602891448": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "17264608538692763688": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2715447739580688669": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "3950738240651133849": ["convolution_gpu_bfyx_os_iyx_osv16",1112], + "18005721959893562716": ["convolution_gpu_bfyx_os_iyx_osv16",938], + "8700574100180128776": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "12427258337646070422": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "8787816339967963727": ["convolution_gpu_bfyx_os_iyx_osv16",242], + "11655994466278963438": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "17724604495865223459": ["convolution_gpu_bfyx_gemm_like",2], + "17922279129043570176": ["convolution_gpu_bfyx_os_iyx_osv16",985], + "6323083153920795679": ["convolution_gpu_bfyx_os_iyx_osv16",239], + "4750755523645265967": ["convolution_gpu_bfyx_gemm_like",2], + "14335074487552883436": ["convolution_gpu_bfyx_gemm_like",2], + "5097818987523855112": ["convolution_gpu_bfyx_gemm_like",2], + "6928136130626403937": ["convolution_gpu_bfyx_gemm_like",2], + "4356806313729405658": ["convolution_gpu_bfyx_gemm_like",2], + "11469881811044037340": ["convolution_gpu_bfyx_os_iyx_osv16",798], + "818998169319147148": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5570311824197099845": ["convolution_gpu_winograd_6x3_s1_fused",0], + "17877776363798202236": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10947686124973711385": ["convolution_gpu_bfyx_os_iyx_osv16",430], + "1372939511728986224": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "7868973874302246233": ["convolution_gpu_bfyx_gemm_like",1], + "12515465135362865565": ["convolution_gpu_bfyx_os_iyx_osv16",208], + "4492332228252010118": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "1075027491444288875": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "3192332625020432602": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "3689722043202617487": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "12255528292506999241": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "2260718905219541967": ["convolution_gpu_bfyx_gemm_like",1], + "12011982029561277581": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "8519354640245415816": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14985755375924972050": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "5762290464889692462": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "40704767167309552": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "15069906408448814772": ["convolution_gpu_bfyx_os_iyx_osv16",277], + "5040730152867713388": ["convolution_gpu_bfyx_gemm_like",1], + "15800447082078291243": ["convolution_gpu_bfyx_os_iyx_osv16",722], + "13025361884606488732": ["convolution_gpu_bfyx_gemm_like",2], + "4356817283284529593": ["convolution_gpu_bfyx_gemm_like",2], + "14762599606783897222": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "9763310312421884308": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "9057036344533510776": ["convolution_gpu_bfyx_gemm_like",2], + "7603319690872333930": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "10756831914332769026": ["convolution_gpu_bfyx_gemm_like",1], + "7720939595094113814": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "4200340674281276565": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "3995098494991567714": ["convolution_gpu_bfyx_gemm_like",2], + "2983038203471784211": ["convolution_gpu_bfyx_gemm_like",2], + "13191096881934434519": ["convolution_gpu_bfyx_gemm_like",2], + "1572991986657256775": ["convolution_gpu_bfyx_os_iyx_osv16",326], + 
"13800760323805415740": ["convolution_gpu_bfyx_gemm_like",0], + "5865480930796299143": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "5927467766675317093": ["fully_connected_gpu_bf_io_input_spatial",1], + "14571022040013651253": ["convolution_gpu_bfyx_gemm_like",1], + "16947969669087411530": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "7947870656736319919": ["convolution_gpu_bfyx_os_iyx_osv16",878], + "1003101267609305257": ["convolution_gpu_bfyx_gemm_like",2], + "9453100135791813000": ["convolution_gpu_yxfb_yxio_b16",2], + "7549378486471456156": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "10237524128771958432": ["convolution_gpu_bfyx_gemm_like",2], + "7187734276051878356": ["convolution_gpu_bfyx_gemm_like",2], + "9796621763733208035": ["convolution_gpu_bfyx_gemm_like",2], + "9642229389394495047": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "7700321970687976931": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "3573490922300056520": ["convolution_gpu_bfyx_os_iyx_osv16",1063], + "16065744898134487748": ["convolution_gpu_bfyx_os_iyx_osv16",1058], + "3067806959725855130": ["convolution_gpu_bfyx_os_iyx_osv16",895], + "17975017633455909321": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "5509395737020858006": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "11666226259183201584": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "16626226341188424071": ["convolution_gpu_bfyx_os_iyx_osv16",240], + "11932770338770247767": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "3534971503826416049": ["convolution_gpu_bfyx_gemm_like",1], + "7875272450497189442": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "2659031931257084418": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "15829095120243431195": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "2921118493468368908": ["convolution_gpu_bfyx_gemm_like",1], + "10090923790949378407": ["convolution_gpu_bfyx_gemm_like",2], + "8913823292181409151": ["fully_connected_gpu_fb_io_b8_f8_vload",1], + "13537323999534292650": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "7545013298074733778": ["convolution_gpu_bfyx_os_iyx_osv16",176], + "5040095338370816349": ["convolution_gpu_bfyx_gemm_like",2], + "1367483816197881270": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "17025182465337728023": ["convolution_gpu_bfyx_os_iyx_osv16",338], + "17006655627343469372": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "4917595053453614536": ["convolution_gpu_bfyx_gemm_like",0], + "12253049204822930675": ["convolution_gpu_bfyx_gemm_like",1], + "10404725818204494388": ["convolution_gpu_bfyx_gemm_like",2], + "5609922876429907954": ["convolution_gpu_bfyx_gemm_like",2], + "9250410390663336388": ["convolution_gpu_bfyx_gemm_like",1], + "17152614235879767116": ["convolution_gpu_bfyx_os_iyx_osv16",610], + "12755692101476964677": ["convolution_gpu_bfyx_gemm_like",2], + "2809950092498355574": ["convolution_gpu_bfyx_os_iyx_osv16",946], + "6214677989814002369": ["convolution_gpu_yxfb_yxio_b16",1], + "4915831715914920982": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "6403698142681887543": ["convolution_gpu_bfyx_gemm_like",1], + "14599780481362761532": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "12577421746159122264": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "18125732229366977468": ["convolution_gpu_winograd_6x3_s1_fused",0], + "2862999234347597091": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "11066913713501760080": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "11810221946429451169": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "13272818502368975319": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], 
+ "3835286851569826052": ["convolution_gpu_bfyx_gemm_like",2], + "5955575949957198434": ["convolution_gpu_bfyx_gemm_like",1], + "3011188207492335920": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "2173867324489962689": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17301887391757619741": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "9860570706348640782": ["convolution_gpu_bfyx_gemm_like",2], + "2281119269283845320": ["convolution_gpu_bfyx_os_iyx_osv16",220], + "18384657372655350144": ["convolution_gpu_bfyx_os_iyx_osv16",944], + "8079376692609682448": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "2597523728660247862": ["convolution_gpu_bfyx_os_iyx_osv16",86], + "12635265188475834607": ["convolution_gpu_bfyx_os_iyx_osv16",130], + "12725647706191463348": ["convolution_gpu_bfyx_gemm_like",2], + "9606639214735570069": ["convolution_gpu_bfyx_gemm_like",2], + "9767294641786972359": ["convolution_gpu_bfyx_gemm_like",2], + "7314288062932060863": ["convolution_gpu_bfyx_os_iyx_osv16",990], + "7356440848422235031": ["convolution_gpu_bfyx_gemm_like",1], + "17085927772068621152": ["convolution_gpu_yxfb_yxio_b16",1], + "11936419502418995274": ["convolution_gpu_bfyx_os_iyx_osv16",94], + "9763754389347695094": ["convolution_gpu_yxfb_yxio_b16",1], + "4291531885506213180": ["convolution_gpu_yxfb_yxio_b16",1], + "6280726148869856021": ["convolution_gpu_yxfb_yxio_b16",2], + "8585205898894363799": ["convolution_gpu_yxfb_yxio_b16",2], + "6490907666077364481": ["convolution_gpu_yxfb_yxio_b16",0], + "8645965165922150743": ["convolution_gpu_yxfb_yxio_b16",2], + "13426254939418471242": ["convolution_gpu_yxfb_yxio_b16",2], + "13077917010686381919": ["convolution_gpu_yxfb_yxio_b16",1], + "5958300749101873980": ["convolution_gpu_yxfb_yxio_b16",2], + "16184142990117192433": ["convolution_gpu_yxfb_yxio_b16",0], + "18148431787172327554": ["convolution_gpu_yxfb_yxio_b16",1], + "6709883527730513363": ["convolution_gpu_yxfb_yxio_b16",2], + "16768497046700403748": ["convolution_gpu_yxfb_yxio_b16",1], + "12867038076564517306": ["convolution_gpu_yxfb_yxio_b16",0], + "6902485831441844789": ["convolution_gpu_yxfb_yxio_b16",1], + "13705072264927031658": ["convolution_gpu_yxfb_yxio_b16",2], + "3286496836813087881": ["convolution_gpu_yxfb_yxio_b16",2], + "11888011890096886932": ["convolution_gpu_yxfb_yxio_b16",2], + "417352773179383568": ["convolution_gpu_yxfb_yxio_b16",2], + "7178866013527118649": ["convolution_gpu_yxfb_yxio_b16",1], + "13821224753538037982": ["convolution_gpu_bfyx_os_iyx_osv16",1091], + "17811558714592064184": ["convolution_gpu_yxfb_yxio_b16",2], + "3571330754519284334": ["convolution_gpu_yxfb_yxio_b16",2], + "13408839571805750778": ["convolution_gpu_yxfb_yxio_b16",1], + "10015368609444108372": ["convolution_gpu_yxfb_yxio_b16",2], + "6822432085522584060": ["convolution_gpu_yxfb_yxio_b16",0], + "5802466130040230797": ["convolution_gpu_yxfb_yxio_b16",2], + "7742126547476513275": ["convolution_gpu_yxfb_yxio_b16",2], + "2761862049452027986": ["convolution_gpu_yxfb_yxio_b16",1], + "2629918844315184499": ["convolution_gpu_yxfb_yxio_b16",1], + "14501815053459103515": ["convolution_gpu_yxfb_yxio_b16",1], + "13493119419114659706": ["convolution_gpu_yxfb_yxio_b16",2], + "12051398350382954787": ["convolution_gpu_yxfb_yxio_b16",2], + "7792512829747836997": ["convolution_gpu_yxfb_yxio_b16",2], + "17990326690659802090": ["convolution_gpu_yxfb_yxio_b16",2], + "13218298785325404589": ["convolution_gpu_yxfb_yxio_b16",1], + "8099100633390626027": ["convolution_gpu_yxfb_yxio_b16",2], + "6331794802915121861": 
["convolution_gpu_yxfb_yxio_b16",1], + "3242391637018676328": ["convolution_gpu_yxfb_yxio_b16",2], + "7946262362930618714": ["convolution_gpu_yxfb_yxio_b16",0], + "15932838442166411183": ["convolution_gpu_yxfb_yxio_b16",2], + "2269140636553245446": ["convolution_gpu_yxfb_yxio_b16",2], + "17096735128393723245": ["convolution_gpu_yxfb_yxio_b16",2], + "9736684300833719045": ["convolution_gpu_yxfb_yxio_b16",2], + "6846760451124717672": ["convolution_gpu_yxfb_yxio_b16",1], + "7065244994574625911": ["convolution_gpu_yxfb_yxio_b16",2], + "11157773554806649837": ["convolution_gpu_yxfb_yxio_b16",1], + "6934241437968723825": ["convolution_gpu_yxfb_yxio_b16",1], + "7105219760750474587": ["convolution_gpu_yxfb_yxio_b16",2], + "12771841901357553928": ["convolution_gpu_yxfb_yxio_b16",2], + "5977875644245993099": ["convolution_gpu_yxfb_yxio_b16",1], + "8652128863605749877": ["convolution_gpu_yxfb_yxio_b16",2], + "3766048787611884529": ["convolution_gpu_yxfb_yxio_b16",1], + "10961696014697611547": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "16428789154716792138": ["convolution_gpu_yxfb_yxio_b16",0], + "13809046727894108358": ["convolution_gpu_yxfb_yxio_b16",2], + "7398158542592530232": ["convolution_gpu_yxfb_yxio_b16",2], + "7998455776901877973": ["convolution_gpu_yxfb_yxio_b16",2], + "15181987458871339815": ["convolution_gpu_bfyx_os_iyx_osv16",1058], + "15311930929656759371": ["convolution_gpu_yxfb_yxio_b16",2], + "11761545976388416063": ["convolution_gpu_yxfb_yxio_b16",2], + "6546440095044731932": ["convolution_gpu_yxfb_yxio_b16",2], + "14497254583210965214": ["convolution_gpu_yxfb_yxio_b16",2], + "6126579157025017808": ["convolution_gpu_yxfb_yxio_b16",1], + "2135164671985938807": ["convolution_gpu_yxfb_yxio_b16",0], + "888110783182849535": ["convolution_gpu_yxfb_yxio_b16",0], + "12305397676800089268": ["convolution_gpu_yxfb_yxio_b16",2], + "14116275901314596944": ["convolution_gpu_yxfb_yxio_b16",2], + "4104679489383377966": ["convolution_gpu_yxfb_yxio_b16",2], + "3911736807429733938": ["convolution_gpu_yxfb_yxio_b16",2], + "5293502980575652171": ["convolution_gpu_yxfb_yxio_b16",0], + "16072525303202287969": ["convolution_gpu_yxfb_yxio_b16",0], + "17397600088595751782": ["convolution_gpu_yxfb_yxio_b16",2], + "15879385408480411034": ["convolution_gpu_yxfb_yxio_b16",2], + "12081698011407453832": ["convolution_gpu_yxfb_yxio_b16",1], + "10717031088082350652": ["convolution_gpu_yxfb_yxio_b16",2], + "7349168847581850619": ["convolution_gpu_yxfb_yxio_b16",1], + "1235864574444794315": ["convolution_gpu_yxfb_yxio_b16",1], + "636447309806530300": ["convolution_gpu_yxfb_yxio_b16",2], + "8260024340787818709": ["convolution_gpu_yxfb_yxio_b16",0], + "11942019076226205097": ["convolution_gpu_yxfb_yxio_b16",1], + "848735117501914374": ["convolution_gpu_yxfb_yxio_b16",1], + "16516262096533373158": ["convolution_gpu_yxfb_yxio_b16",2], + "904355798061005466": ["convolution_gpu_yxfb_yxio_b16",1], + "7585777271711713778": ["convolution_gpu_yxfb_yxio_b16",0], + "3101748967012684440": ["convolution_gpu_yxfb_yxio_b16",2], + "15693204620575485046": ["convolution_gpu_yxfb_yxio_b16",0], + "3805667660217578518": ["convolution_gpu_yxfb_yxio_b16",2], + "6875055157295709098": ["convolution_gpu_yxfb_yxio_b16",1], + "8210092359850191682": ["convolution_gpu_yxfb_yxio_b16",0], + "6070612528095353265": ["convolution_gpu_yxfb_yxio_b16",1], + "4773482308451190487": ["convolution_gpu_yxfb_yxio_b16",0], + "8723078862651154959": ["convolution_gpu_yxfb_yxio_b16",1], + "3244803973821375252": ["convolution_gpu_yxfb_yxio_b16",1], + 
"4683320313995550908": ["convolution_gpu_yxfb_yxio_b16",1], + "5931972000452008090": ["convolution_gpu_yxfb_yxio_b16",1], + "1216021647922150199": ["convolution_gpu_yxfb_yxio_b16",2], + "17970424536559595893": ["convolution_gpu_yxfb_yxio_b16",2], + "231083216612056805": ["convolution_gpu_yxfb_yxio_b16",1], + "3112648799276134590": ["convolution_gpu_yxfb_yxio_b16",1], + "12327057172281102984": ["convolution_gpu_yxfb_yxio_b16",0], + "7369109502608631066": ["convolution_gpu_yxfb_yxio_b16",1], + "7134419022268272901": ["convolution_gpu_yxfb_yxio_b16",0], + "2263637493894079492": ["convolution_gpu_yxfb_yxio_b16",1], + "5312269140190538942": ["convolution_gpu_yxfb_yxio_b16",2], + "9312974578711092131": ["convolution_gpu_yxfb_yxio_b16",0], + "18101509783610609787": ["convolution_gpu_yxfb_yxio_b16",2], + "18359731130169236059": ["convolution_gpu_yxfb_yxio_b16",0], + "13009612703754510124": ["convolution_gpu_yxfb_yxio_b16",1], + "10465119306486335226": ["convolution_gpu_yxfb_yxio_b16",0], + "6962030848164918578": ["convolution_gpu_bfyx_gemm_like",2], + "866962088075892990": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "9234877552798111728": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "1564644716020135424": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15873670348742608564": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "259019999386390213": ["convolution_gpu_bfyx_gemm_like",2], + "3710413162291194839": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "10059412755080252504": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "7775757657060166345": ["convolution_gpu_bfyx_os_iyx_osv16",1100], + "7667210091570135646": ["convolution_gpu_bfyx_os_iyx_osv16",650], + "12826353318487441420": ["convolution_gpu_bfyx_gemm_like",2], + "9413263409511666221": ["convolution_gpu_bfyx_os_iyx_osv16",651], + "6932559254646823380": ["convolution_gpu_bfyx_os_iyx_osv16",650], + "7724893184016174483": ["convolution_gpu_bfyx_os_iyx_osv16",650], + "14682047605098567432": ["convolution_gpu_bfyx_gemm_like",2], + "14456272420357730548": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "1739161573487933165": ["convolution_gpu_bfyx_gemm_like",1], + "10034746179209540014": ["convolution_gpu_bfyx_os_iyx_osv16",600], + "10432925516327889351": ["convolution_gpu_bfyx_os_iyx_osv16",751], + "8977099691399563065": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "10462797712860969072": ["convolution_gpu_bfyx_os_iyx_osv16",264], + "7020743056013297476": ["convolution_gpu_bfyx_gemm_like",2], + "17939745299931100048": ["convolution_gpu_bfyx_os_iyx_osv16",308], + "4054010905884346287": ["convolution_gpu_bfyx_gemm_like",2], + "384240534894352154": ["convolution_gpu_bfyx_os_iyx_osv16",716], + "16622402936526588344": ["convolution_gpu_bfyx_os_iyx_osv16",1092], + "11795686089670429481": ["convolution_gpu_bfyx_os_iyx_osv16",936], + "1213958002895787672": ["convolution_gpu_bfyx_os_iyx_osv16",1090], + "12715500118796263683": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "10178462061836778766": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "17924819398394001587": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "6831045740006076251": ["convolution_gpu_bfyx_os_iyx_osv16",715], + "17236135174912837061": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "14084855778741260863": ["convolution_gpu_bfyx_os_iyx_osv16",192], + "4959718589070770515": ["convolution_gpu_bfyx_os_iyx_osv16",719], + "13337122303005980542": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "13947140171097868740": ["convolution_gpu_bfyx_os_iyx_osv16",987], + "1168311873250200110": 
["convolution_gpu_bfyx_os_iyx_osv16",905], + "5023609284081684300": ["convolution_gpu_bfyx_gemm_like",2], + "10159790066948852390": ["convolution_gpu_bfyx_os_iyx_osv16",563], + "17381682740282686038": ["convolution_gpu_bfyx_os_iyx_osv16",649], + "2772704069752888874": ["convolution_gpu_bfyx_os_iyx_osv16",277], + "12318427976031000768": ["convolution_gpu_bfyx_gemm_like",1], + "15891746043846062984": ["convolution_gpu_bfyx_gemm_like",2], + "45545661884854912": ["convolution_gpu_bfyx_os_iyx_osv16",1122], + "11102920976866402928": ["convolution_gpu_bfyx_os_iyx_osv16",951], + "15737508945513376813": ["convolution_gpu_bfyx_os_iyx_osv16",1031], + "17869697579874327192": ["convolution_gpu_bfyx_os_iyx_osv16",950], + "14674266217397415571": ["convolution_gpu_bfyx_gemm_like",2], + "7813041847979170166": ["convolution_gpu_bfyx_gemm_like",2], + "1962479636209947761": ["convolution_gpu_bfyx_os_iyx_osv16",674], + "11539652577193034099": ["convolution_gpu_bfyx_gemm_like",1], + "13140527131098422428": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "498420237272375425": ["convolution_gpu_bfyx_gemm_like",2], + "779633618375662086": ["convolution_gpu_bfyx_os_iyx_osv16",563], + "7344363094493575878": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "4186957909762095019": ["convolution_gpu_bfyx_gemm_like",2], + "2705394837952559308": ["convolution_gpu_bfyx_os_iyx_osv16",1030], + "17902799955139047426": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "7831542641855749925": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "14010642743400284761": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "11868789283464117390": ["convolution_gpu_bfyx_os_iyx_osv16",1055], + "3598116387801985039": ["convolution_gpu_bfyx_os_iyx_osv16",370], + "5461980510262646821": ["convolution_gpu_bfyx_gemm_like",2], + "10809330882739297269": ["convolution_gpu_bfyx_os_iyx_osv16",640], + "15052127817178941719": ["convolution_gpu_bfyx_os_iyx_osv16",297], + "8616175124735896626": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "12175297963550750804": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "2343921093633784755": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "17358462939783262207": ["convolution_gpu_bfyx_os_iyx_osv16",516], + "17406383217119217230": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "15365628642332393565": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "6638761803107874904": ["convolution_gpu_bfyx_os_iyx_osv16",513], + "1630585964216121575": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "16705941191876956548": ["convolution_gpu_bfyx_os_iyx_osv16",512], + "13395074742046717601": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "12659539044474018256": ["convolution_gpu_bfyx_os_iyx_osv16",143], + "1557549837620967530": ["convolution_gpu_bfyx_os_iyx_osv16",477], + "14322754320861242412": ["convolution_gpu_bfyx_os_iyx_osv16",99], + "11369389082421346630": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "4986977887030495943": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "11962541545116807979": ["convolution_gpu_bfyx_os_iyx_osv16",543], + "8730097760819044515": ["convolution_gpu_bfyx_os_iyx_osv16",378], + "11882021989615795558": ["convolution_gpu_bfyx_os_iyx_osv16",378], + "16780457022162749898": ["convolution_gpu_bfyx_gemm_like",2], + "17140702790441856730": ["convolution_gpu_bfyx_gemm_like",2], + "2578325663193624576": ["convolution_gpu_yxfb_yxio_b16",2], + "8784358107340738205": ["convolution_gpu_yxfb_yxio_b16",2], + "2955459120402821540": ["convolution_gpu_yxfb_yxio_b16",2], + "2840794055129352139": ["convolution_gpu_yxfb_yxio_b16",2], + "7104266560248570112": 
["convolution_gpu_yxfb_yxio_b16",0], + "11113125355390956764": ["convolution_gpu_yxfb_yxio_b16",1], + "9127827617126714860": ["fully_connected_gpu_yxfb_ref",0], + "15148442194461613102": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "13520876347177213888": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "241656278218999298": ["convolution_gpu_yxfb_yxio_b16",0], + "2164314506903530487": ["convolution_gpu_yxfb_yxio_b16",2], + "15985980444340490463": ["convolution_gpu_yxfb_yxio_b16",0], + "5284456216115118110": ["convolution_gpu_yxfb_yxio_b16",1], + "466744273945239777": ["convolution_gpu_yxfb_yxio_b16",2], + "7099035779223341587": ["convolution_gpu_yxfb_yxio_b16",2], + "3096280563014331836": ["convolution_gpu_yxfb_yxio_b16",0], + "768820004084041271": ["convolution_gpu_yxfb_yxio_b16",2], + "15409184364121627414": ["convolution_gpu_yxfb_yxio_b16",0], + "4597873630741623918": ["convolution_gpu_yxfb_yxio_b16",1], + "11226912053840621089": ["convolution_gpu_yxfb_yxio_b16",0], + "18209930746627816139": ["convolution_gpu_yxfb_yxio_b16",2], + "15757351352532908153": ["convolution_gpu_bfyx_os_iyx_osv16",1051], + "5041922366297242362": ["convolution_gpu_yxfb_yxio_b16",2], + "1068155851494601726": ["convolution_gpu_yxfb_yxio_b16",2], + "9309173544512377803": ["convolution_gpu_yxfb_yxio_b16",1], + "12721294268595880422": ["convolution_gpu_yxfb_yxio_b16",1], + "14248622935809594779": ["convolution_gpu_yxfb_yxio_b16",0], + "3742751561273931407": ["convolution_gpu_yxfb_yxio_b16",1], + "10745099399736462076": ["convolution_gpu_yxfb_yxio_b16",2], + "7412772553395852003": ["convolution_gpu_yxfb_yxio_b16",2], + "1290180607037086383": ["convolution_gpu_yxfb_yxio_b16",2], + "2172999245833525797": ["convolution_gpu_yxfb_yxio_b16",2], + "16601230690171340432": ["convolution_gpu_yxfb_yxio_b16",1], + "15457040168177954463": ["convolution_gpu_yxfb_yxio_b16",2], + "1129349074674368869": ["convolution_gpu_yxfb_yxio_b16",2], + "15669242195570440840": ["convolution_gpu_yxfb_yxio_b16",1], + "560996739186313493": ["convolution_gpu_yxfb_yxio_b16",1], + "10572380563704942622": ["convolution_gpu_yxfb_yxio_b16",0], + "2501411300945696806": ["convolution_gpu_yxfb_yxio_b16",2], + "4216366893358625960": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "8846314870152404018": ["convolution_gpu_bfyx_gemm_like",2], + "15997145184054496085": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5941095082097535176": ["convolution_gpu_bfyx_gemm_like",1], + "15281554100135159550": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "430132942408244070": ["convolution_gpu_bfyx_gemm_like",2], + "225809055928705881": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "11088324811742486481": ["convolution_gpu_bfyx_gemm_like",2], + "522313477023837056": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16316483048621486077": ["convolution_gpu_bfyx_gemm_like",2], + "14262482011051329729": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4824040283449153298": ["convolution_gpu_bfyx_gemm_like",0], + "3948843501884284998": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10006197783106691106": ["convolution_gpu_bfyx_gemm_like",0], + "2917999294360728537": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7719954202744123391": ["convolution_gpu_bfyx_gemm_like",2], + "10399620940700804517": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5735703235236456131": ["convolution_gpu_bfyx_os_iyx_osv16",300], + "8768300687476117215": ["convolution_gpu_bfyx_os_iyx_osv16",345], + "7815650257256675477": 
["convolution_gpu_bfyx_os_iyx_osv16",313], + "13325762052023866627": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2803569867265035123": ["convolution_gpu_bfyx_os_iyx_osv16",655], + "3856976081672275637": ["convolution_gpu_bfyx_gemm_like",0], + "8365255170846178102": ["convolution_gpu_bfyx_os_iyx_osv16",362], + "8075261051536686307": ["convolution_gpu_bfyx_os_iyx_osv16",651], + "9184275066167601343": ["convolution_gpu_bfyx_os_iyx_osv16",529], + "12248852114219058572": ["convolution_gpu_bfyx_gemm_like",1], + "17439102502195540957": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "3059575629482816852": ["convolution_gpu_bfyx_os_iyx_osv16",574], + "5516518048239364231": ["convolution_gpu_bfyx_os_iyx_osv16",98], + "15833461718320604065": ["convolution_gpu_bfyx_os_iyx_osv16",99], + "16828961272295386615": ["convolution_gpu_bfyx_os_iyx_osv16",476], + "886880682650879171": ["convolution_gpu_bfyx_os_iyx_osv16",890], + "11861634536583463947": ["convolution_gpu_bfyx_os_iyx_osv16",52], + "15325852281951905610": ["convolution_gpu_bfyx_os_iyx_osv16",94], + "14365232561737454031": ["convolution_gpu_bfyx_os_iyx_osv16",801], + "7498614018449036163": ["convolution_gpu_bfyx_os_iyx_osv16",95], + "15813044197987178947": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "7287107719392705356": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "2058172559199858297": ["convolution_gpu_bfyx_os_iyx_osv16",383 + ] + }, + "18": { + "14650567822254940018": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15726902746983125797": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "5211831143687501130": ["convolution_gpu_bfyx_os_iyx_osv16",316], + "1398177377739338750": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7693459946348737411": ["convolution_gpu_bfyx_os_iyx_osv16",693], + "12494969618927201911": ["fully_connected_gpu_yxfb_ref",0], + "6324565723045697080": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "2057158988261512114": ["convolution_gpu_bfyx_1x1",2], + "5758133252959371492": ["convolution_gpu_bfyx_gemm_like",2], + "1934379409955686502": ["convolution_gpu_bfyx_direct_10_12_16",0], + "3438116423688595487": ["convolution_gpu_bfyx_os_iyx_osv16",1026], + "15331103261044247142": ["convolution_gpu_bfyx_os_iyx_osv16",943], + "18008552719153887303": ["convolution_gpu_bfyx_os_iyx_osv16",381], + "16234606052818596502": ["convolution_gpu_bfyx_os_iyx_osv16",99], + "12932635875905153141": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "8922929126299811091": ["convolution_gpu_bfyx_1x1",0], + "12245096462203481681": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "12085348936192462321": ["convolution_gpu_bfyx_gemm_like",2], + "14878347463243157447": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "7474592508575297101": ["convolution_gpu_bfyx_1x1",1], + "7807983899017500046": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "1081962464388501987": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13681462437496627948": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "17310332946322628458": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9243949750444156746": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17001502418583498926": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "7953255701516490034": ["convolution_gpu_bfyx_os_iyx_osv16",61], + "1230262279011217327": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4282661608732125403": ["convolution_gpu_bfyx_os_iyx_osv16",732], + "9389555743403158574": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7139714914586273766": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "7370273921473161914": 
["convolution_gpu_bfyx_os_iyx_osv16",591], + "12590922530749026871": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "4185398348055518182": ["convolution_gpu_bfyx_os_iyx_osv16",458], + "5303970743736042689": ["convolution_gpu_bfyx_gemm_like",2], + "11507538232733291666": ["convolution_gpu_bfyx_os_iyx_osv16",224], + "10046663998164493552": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "16992405636352406660": ["convolution_gpu_bfyx_gemm_like",1], + "13434576226708227155": ["convolution_gpu_bfyx_os_iyx_osv16",801], + "14136097914489095982": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "13105192484434299621": ["convolution_gpu_bfyx_gemm_like",2], + "9423958333298993923": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "14412158605670555579": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "8576733135863336233": ["convolution_gpu_bfyx_os_iyx_osv16",1068], + "3511588484597779204": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "14353390922580547467": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "7994179151788368291": ["convolution_gpu_bfyx_os_iyx_osv16",559], + "8971115542951085891": ["convolution_gpu_bfyx_os_iyx_osv16",625], + "17179609670678746034": ["convolution_gpu_bfyx_gemm_like",2], + "13268525255152984893": ["convolution_gpu_bfyx_os_iyx_osv16",939], + "1383899865465106141": ["convolution_gpu_bfyx_gemm_like",2], + "11717348577195224554": ["convolution_gpu_bfyx_gemm_like",2], + "135072053401934228": ["convolution_gpu_bfyx_1x1",2], + "10555597973766215754": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "14799579913711096584": ["convolution_gpu_bfyx_gemm_like",2], + "12813978452097969536": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "11669828823444745889": ["convolution_gpu_bfyx_gemm_like",2], + "3128856679264648666": ["convolution_gpu_bfyx_gemm_like",2], + "15696910741835640150": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "11806402239500046867": ["convolution_gpu_bfyx_os_iyx_osv16",373], + "10320711719466983961": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "4958835037528182801": ["convolution_gpu_bfyx_1x1",2], + "3727142736386026852": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "9213563311267466388": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2618108630886857741": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9367157746678824712": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14424566003632608852": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18268811652302076976": ["convolution_gpu_bfyx_gemm_like",2], + "9100044555742394133": ["convolution_gpu_bfyx_os_iyx_osv16",171], + "15790005937034794347": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "13898284586432291433": ["convolution_gpu_bfyx_gemm_like",2], + "3106911159524421371": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "792684262493086891": ["convolution_gpu_bfyx_gemm_like",0], + "14885031472057965707": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8398910340371320955": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "10264913782610095832": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "13898821685774165645": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "14211903923555028634": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "17912189681971987483": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "15284262113150488297": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "12068974703657294908": ["convolution_gpu_bfyx_1x1",0], + "16988275131627316108": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "17983556812075120553": ["convolution_gpu_bfyx_1x1",1], + "2581414750854621875": ["convolution_gpu_bfyx_os_iyx_osv16",183], + 
"2964705957088952872": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "6114147683777615071": ["convolution_gpu_bfyx_os_iyx_osv16",988], + "12894625941923144893": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "12129572274423886770": ["convolution_gpu_bfyx_os_iyx_osv16",627], + "6341197991729122563": ["convolution_gpu_bfyx_os_iyx_osv16",175], + "3285520504090196295": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "3796274347773622633": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "12214162812589030126": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "7311120574972466702": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "16811402686462277562": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "15493488989417521388": ["convolution_gpu_bfyx_os_iyx_osv16",1112], + "18133334552107213128": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",0], + "17342198739672369885": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15351724241036614758": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "6631816968511312100": ["convolution_gpu_bfyx_direct_10_12_16",0], + "13603318842632052764": ["convolution_gpu_bfyx_os_iyx_osv16",381], + "2727219457659794468": ["convolution_gpu_bfyx_direct_10_12_16",0], + "8101977280003030465": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12782191856884962803": ["convolution_gpu_bfyx_gemm_like",2], + "2704063557078535883": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "1559798212423183813": ["convolution_gpu_bfyx_os_iyx_osv16",274], + "11814740669468421049": ["convolution_gpu_bfyx_os_iyx_osv16",121], + "18270587701371596297": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "4759671642533786591": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "18232278892738147217": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "7727001441358508665": ["convolution_gpu_bfyx_os_iyx_osv16",476], + "3106591708459602370": ["convolution_gpu_bfyx_os_iyx_osv16",942], + "2161052921317193579": ["convolution_gpu_bfyx_gemm_like",2], + "14883438809987378616": ["convolution_gpu_bfyx_1x1",2], + "7590767013583950613": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "16370218798911151331": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "10014448860206587805": ["convolution_gpu_bfyx_gemm_like",2], + "5687802882700097624": ["convolution_gpu_bfyx_os_iyx_osv16",1054], + "1143214652021653634": ["convolution_gpu_bfyx_os_iyx_osv16",224], + "641417817126876622": ["convolution_gpu_bfyx_gemm_like",2], + "13558618754911056302": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "3909551222373722085": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "5592556538784745960": ["convolution_gpu_bfyx_gemm_like",2], + "6623182990939010641": ["convolution_gpu_bfyx_gemm_like",1], + "11175955260573469979": ["convolution_gpu_bfyx_os_iyx_osv16",104], + "15410074937424854348": ["convolution_gpu_bfyx_direct_10_12_16",2], + "89439319782574517": ["convolution_gpu_bfyx_os_iyx_osv16",653], + "11398019086259011063": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1116274074896622552": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11107930597263802755": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "2100891581797371600": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "3522383297921565178": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "18233660940545931789": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "15426960908024585800": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1287490919205560806": ["convolution_gpu_bfyx_os_iyx_osv16",152], + "9285566577169147378": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10572945270796129630": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + 
"288853243482418538": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2421404763191415191": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2103882464623009432": ["convolution_gpu_winograd_6x3_s1_fused",0], + "5448537627319798272": ["convolution_gpu_bfyx_gemm_like",0], + "11657946392097042544": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "2920322372993101148": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16076153317792960383": ["convolution_gpu_bfyx_direct_10_12_16",1], + "761169277744593430": ["convolution_gpu_bfyx_os_iyx_osv16",105], + "385046297070779752": ["convolution_gpu_bfyx_os_iyx_osv16",533], + "7407975398526425554": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "534032316469702287": ["convolution_gpu_bfyx_os_iyx_osv16",740], + "6681818065741882453": ["convolution_gpu_bfyx_gemm_like",0], + "13468713306678453952": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2806529556090896246": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "15720507574336564201": ["convolution_gpu_bfyx_gemm_like",2], + "9533360488591027707": ["fully_connected_gpu_bs_f_bsv16_af8_vload",0], + "14082448162400225052": ["convolution_gpu_bfyx_1x1",2], + "17726079670612220433": ["convolution_gpu_bfyx_gemm_like",2], + "11334122788337402526": ["convolution_gpu_bfyx_1x1",2], + "13558687084677943158": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "6297802534570892679": ["convolution_gpu_bfyx_os_iyx_osv16",158], + "7187734276051878356": ["convolution_gpu_bfyx_gemm_like",2], + "7868973874302246233": ["convolution_gpu_bfyx_gemm_like",2], + "2814805887448339818": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "3002986032379998259": ["convolution_gpu_bfyx_os_iyx_osv16",909], + "5597908143491399643": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "17209528805596238905": ["convolution_gpu_bfyx_gemm_like",2], + "2968094709908141988": ["convolution_gpu_bfyx_os_iyx_osv16",396], + "12796777049340516563": ["convolution_gpu_bfyx_direct_10_12_16",0], + "1126499865206906037": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "5538883245745495145": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "14070988879848388270": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "12421707187947291166": ["convolution_gpu_bfyx_gemm_like",1], + "5385316497510064491": ["fully_connected_gpu_fb_oi_ref",2], + "8300655194765375060": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "11130439225010714550": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "13503688893307029975": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16044646335477470657": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9406763539724266157": ["convolution_gpu_bfyx_1x1",0], + "3438296636411972401": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10987953316324712538": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "13104509059416300615": ["convolution_gpu_bfyx_os_iyx_osv16",296], + "1208161922424418734": ["convolution_gpu_bfyx_gemm_like",2], + "12793908914872030220": ["convolution_gpu_bfyx_gemm_like",2], + "2566302789609970663": ["convolution_gpu_bfyx_os_iyx_osv16",815], + "10765280349477640969": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "17596685300497748803": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "16103943009195163681": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "15249442550355454201": ["convolution_gpu_bfyx_gemm_like",2], + "7964396197946740183": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "17823133607491820214": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "15522785615618973614": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "8761283252495354972": 
["convolution_gpu_bfyx_os_iyx_osv16",634], + "11897113890115321056": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "12949204491386872217": ["convolution_gpu_bfyx_os_iyx_osv16",459], + "8133587696326295326": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1036010477232750453": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "724953082687879224": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "182115051096556835": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "11723735945517472199": ["convolution_gpu_bfyx_os_iyx_osv16",668], + "265124365266629363": ["convolution_gpu_bfyx_os_iyx_osv16",246], + "18269685060032395235": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "5010119207726811326": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "3976736548270395981": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "1628593159980574595": ["convolution_gpu_bfyx_gemm_like",1], + "187352687850707150": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13450061819089402572": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "953306082374100275": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "7393601059996816014": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12136029303893296753": ["convolution_gpu_bfyx_os_iyx_osv16",501], + "13410850301164057911": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "12977678792503377525": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "10280619408766255552": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "6928835003016610382": ["convolution_gpu_bfyx_os_iyx_osv16",1000], + "10425889533411573166": ["convolution_gpu_bfyx_gemm_like",2], + "16995873636564597028": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "13364676690016875118": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "7009735776703529573": ["convolution_gpu_bfyx_os_iyx_osv16",968], + "7354234812009979811": ["convolution_gpu_bfyx_os_iyx_osv16",859], + "14757749560543979231": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "3336076058264596420": ["convolution_gpu_bfyx_gemm_like",2], + "7209217811135076623": ["convolution_gpu_bfyx_gemm_like",2], + "12071914115316550349": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "7903891232234389925": ["convolution_gpu_bfyx_direct_10_12_16",0], + "7650375560336513366": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "13649894122307008732": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "10607904718265020949": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "10404725818204494388": ["convolution_gpu_bfyx_gemm_like",1], + "16327433707667075261": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10128120599276549920": ["convolution_gpu_bfyx_1x1",2], + "863952266514375915": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "10544411879329675593": ["convolution_gpu_bfyx_os_iyx_osv16",382], + "8203171222962341018": ["convolution_gpu_bfyx_gemm_like",2], + "17796310681498690253": ["convolution_gpu_winograd_6x3_s1_fused",1], + "12908594497114706897": ["convolution_gpu_bfyx_1x1",1], + "13781423818051299677": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "4717620775314557374": ["convolution_gpu_bfyx_gemm_like",2], + "4764776977138392550": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "12801481303602178879": ["convolution_gpu_bfyx_gemm_like",2], + "9216608098626790565": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "1028160614515220430": ["convolution_gpu_bfyx_os_iyx_osv16",216], + "9589942627115344216": ["convolution_gpu_bfyx_os_iyx_osv16",851], + "17790026124881397912": ["fully_connected_gpu_yxfb_ref",2], + "10279778381617181802": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13853056718266488510": 
["convolution_gpu_bfyx_os_iyx_osv16",1091], + "4299492266819967844": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "2490155559809645659": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "15190508870639648203": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3883845471211207871": ["convolution_gpu_bfyx_os_iyx_osv16",646], + "7995820969034996638": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "15271783562528081169": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "742689192890486807": ["convolution_gpu_bfyx_gemm_like",2], + "13853630125050609175": ["convolution_gpu_bfyx_os_iyx_osv16",692], + "14711697456265712456": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "2221145174704245189": ["convolution_gpu_bfyx_gemm_like",2], + "13038533272699602337": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "6345550009198921347": ["convolution_gpu_bfyx_os_iyx_osv16",998], + "3715177305271762194": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "9270950131920019932": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "14150012830816329527": ["convolution_gpu_bfyx_gemm_like",2], + "15817443774186015593": ["convolution_gpu_bfyx_1x1",0], + "15126660425728872065": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "14484890926084856480": ["convolution_gpu_bfyx_os_iyx_osv16",182], + "15488550074426713959": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "592245952014430043": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "2116913943188857359": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "6860503758000008398": ["convolution_gpu_bfyx_os_iyx_osv16",832], + "16027853590391209100": ["convolution_gpu_bfyx_gemm_like",2], + "14088382963493477342": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "9019388470685749691": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "5078905972285278557": ["convolution_gpu_bfyx_gemm_like",2], + "1500571771538985941": ["convolution_gpu_bfyx_os_iyx_osv16",123], + "11158789938857558596": ["convolution_gpu_bfyx_1x1",0], + "14387756025635589673": ["convolution_gpu_bfyx_1x1",0], + "4085907608404305515": ["convolution_gpu_bfyx_gemm_like",0], + "3782239800777370325": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "9481675228591993785": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "16511393582666965704": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "11970881115757095265": ["convolution_gpu_bfyx_os_iyx_osv16",693], + "11810221946429451169": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "14691372262153587653": ["convolution_gpu_bfyx_os_iyx_osv16",1059], + "1306339989221885682": ["convolution_gpu_bfyx_direct_10_12_16",0], + "994489782629179836": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "318377908569897093": ["convolution_gpu_bfyx_gemm_like",2], + "3281207855459771997": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "3819990462129075757": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "12512751736409465214": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2030309697153345387": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "12534001599784153836": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "12181607120522804433": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "7806129039150321333": ["convolution_gpu_bfyx_gemm_like",2], + "13477416097954638887": ["fully_connected_gpu_bf_io_gemm",1], + "13489318651148001664": ["convolution_gpu_bfyx_gemm_like",2], + "17306482303091342504": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "7744787957569714828": ["convolution_gpu_bfyx_os_iyx_osv16",1096], + "16667887002111125871": ["convolution_gpu_bfyx_gemm_like",2], + "3499243120652875549": ["convolution_gpu_bfyx_os_iyx_osv16",576], + 
"3062101811226530720": ["convolution_gpu_bfyx_os_iyx_osv16",1040], + "10672380526821947133": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "18037918102910297531": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "10816637153861630723": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "6614833247756539341": ["convolution_gpu_bfyx_os_iyx_osv16",807], + "4347816192417741558": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "5042176052323856983": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "18400379759523099542": ["convolution_gpu_bfyx_gemm_like",2], + "2095802691829304676": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "2173720698351153121": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "7130694811424715594": ["convolution_gpu_bfyx_os_iyx_osv16",717], + "17638692805430115529": ["convolution_gpu_bfyx_gemm_like",2], + "4056971751486746551": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "7549378486471456156": ["convolution_gpu_bfyx_os_iyx_osv16",200], + "13025361884606488732": ["convolution_gpu_bfyx_gemm_like",1], + "1299545313185409227": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "6942049339361951275": ["fully_connected_gpu_bf_io_input_spatial",2], + "3001615302961701154": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4994591211723226974": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "3621424752591567930": ["convolution_gpu_bfyx_gemm_like",2], + "17381516856910544374": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "4670443882075998209": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "6634330132674952638": ["convolution_gpu_bfyx_os_iyx_osv16",182], + "9819596940685093690": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "7164580481046523192": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "840202264034382558": ["convolution_gpu_bfyx_os_iyx_osv16",843], + "10784073615329190425": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "15881381297320383917": ["convolution_gpu_winograd_6x3_s1_fused",0], + "4623542918584461522": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9868561386826862471": ["convolution_gpu_winograd_6x3_s1_fused",0], + "1450888744802985214": ["convolution_gpu_bfyx_os_iyx_osv16",85], + "12051595062513871723": ["convolution_gpu_bfyx_1x1",0], + "4216958486055161753": ["convolution_gpu_bfyx_gemm_like",1], + "17881905640473324965": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7056030150365552588": ["convolution_gpu_bfyx_os_iyx_osv16",356], + "9860570706348640782": ["convolution_gpu_bfyx_gemm_like",0], + "8159303545761286685": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "17542176922797334839": ["convolution_gpu_bfyx_os_iyx_osv16",362], + "5197105253412476591": ["convolution_gpu_bfyx_gemm_like",2], + "17729546848373991614": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "875296362957469305": ["convolution_gpu_bfyx_gemm_like",2], + "11754316727756881612": ["convolution_gpu_bfyx_gemm_like",1], + "2287356884312581209": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "13754408679115174221": ["convolution_gpu_bfyx_gemm_like",2], + "10892456883214928095": ["convolution_gpu_bfyx_os_iyx_osv16",565], + "12601126285773042005": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "17791024851737594885": ["convolution_gpu_bfyx_1x1",0], + "2524029454785583409": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "5482851829165191681": ["convolution_gpu_bfyx_os_iyx_osv16",644], + "8236018377815149638": ["convolution_gpu_bfyx_os_iyx_osv16",247], + "2044363708106765326": ["convolution_gpu_bfyx_direct_10_12_16",2], + "488298169768725160": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "2984726467649419856": 
["convolution_gpu_bfyx_os_iyx_osv16",559], + "17163158934005653629": ["convolution_gpu_bfyx_os_iyx_osv16",505], + "7172604084103519563": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "15112599407339712681": ["convolution_gpu_bfyx_1x1",1], + "17928043901784474130": ["convolution_gpu_bfyx_direct_10_12_16",0], + "2915165824085219545": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "15447513376965243034": ["convolution_gpu_bfyx_os_iyx_osv16",57], + "875400109066360897": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "1418595171949196661": ["convolution_gpu_bfyx_os_iyx_osv16",340], + "16611452077660879545": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "16431165572426232677": ["convolution_gpu_bfyx_os_iyx_osv16",434], + "12015336418727455195": ["convolution_gpu_bfyx_1x1",0], + "13613399861925108148": ["convolution_gpu_bfyx_os_iyx_osv16",501], + "13059207969254830451": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "6644418194983229139": ["convolution_gpu_bfyx_gemm_like",2], + "4795705973706796563": ["fully_connected_gpu_bf_io_input_spatial",2], + "2625969259447793593": ["convolution_gpu_bfyx_1x1",2], + "10016815108730511683": ["convolution_gpu_bfyx_gemm_like",2], + "10090923790949378407": ["convolution_gpu_bfyx_gemm_like",2], + "2527189070714658176": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "13611054146745413536": ["convolution_gpu_bfyx_gemm_like",2], + "2777318471329665162": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "1354647381212852890": ["convolution_gpu_bfyx_1x1",1], + "11192356850081328892": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "3633858263279042265": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "15805087418686802636": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "9205978149692979955": ["convolution_gpu_bfyx_gemm_like",2], + "30229601562833524": ["convolution_gpu_bfyx_gemm_like",2], + "17839839336294937155": ["convolution_gpu_bfyx_gemm_like",1], + "4867937397499803072": ["convolution_gpu_bfyx_gemm_like",2], + "6075691042233712335": ["convolution_gpu_bfyx_gemm_like",2], + "11031358859656806724": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "16884228931101540030": ["convolution_gpu_bfyx_os_iyx_osv16",338], + "2373860353284525265": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "12802517759474139810": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "1518270620354036926": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "10323345824599612614": ["convolution_gpu_bfyx_gemm_like",1], + "12278364834477923930": ["convolution_gpu_bfyx_gemm_like",2], + "5039037192630609823": ["convolution_gpu_bfyx_gemm_like",2], + "3835286851569826052": ["convolution_gpu_bfyx_gemm_like",2], + "7875724726741958520": ["fully_connected_gpu_fb_oi_ref",1], + "3441335188113424896": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "6067904130482758510": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "528295119724008711": ["convolution_gpu_bfyx_os_iyx_osv16",508], + "17208186152576814861": ["convolution_gpu_bfyx_os_iyx_osv16",700], + "9328223957245552723": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "577182964135927041": ["convolution_gpu_bfyx_os_iyx_osv16",35], + "8040001390872143271": ["convolution_gpu_bfyx_gemm_like",2], + "3160543867929843861": ["convolution_gpu_bfyx_1x1",0], + "586947787345351152": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1122856374602590533": ["convolution_gpu_bfyx_1x1",0], + "15287650965861631130": ["convolution_gpu_bfyx_gemm_like",2], + "15123868617509445149": ["convolution_gpu_winograd_6x3_s1_fused",0], + "16542318967217020315": ["convolution_gpu_bfyx_gemm_like",2], + "7808544677773370430": 
["convolution_gpu_bfyx_direct_10_12_16",0], + "7969441643457570812": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "8907982643256296667": ["convolution_gpu_bfyx_1x1",1], + "5479761740065152589": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "3114869763557037270": ["fully_connected_gpu_bfyx_ref",2], + "2294026590516781945": ["convolution_gpu_bfyx_gemm_like",2], + "1249137685908951501": ["convolution_gpu_bfyx_gemm_like",1], + "2940027113687311893": ["convolution_gpu_bfyx_gemm_like",2], + "15943141845766932879": ["convolution_gpu_bfyx_1x1",1], + "17775705003104146872": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "403634422724914329": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "18154019240019929225": ["convolution_gpu_bfyx_gemm_like",2], + "9319254979377483709": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "4424217045094988504": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "6800893510381991731": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "18436249934780056991": ["convolution_gpu_bfyx_gemm_like",2], + "8931169575495985034": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "10916647716124396856": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "8296551195150971668": ["convolution_gpu_winograd_6x3_s1_fused",0], + "3974589991022739479": ["convolution_gpu_bfyx_os_iyx_osv16",363], + "9120377367517042357": ["convolution_gpu_bfyx_1x1",0], + "6817494598328071314": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "14289048840489035546": ["convolution_gpu_bfyx_direct_10_12_16",0], + "8951040603784899163": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "14046114605615338907": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "11626402549863483301": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "8057302050645780813": ["convolution_gpu_bfyx_direct_10_12_16",0], + "6711878663358611849": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "7076937538747704750": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "17891499682354369344": ["convolution_gpu_bfyx_gemm_like",2], + "537074122417021898": ["convolution_gpu_bfyx_gemm_like",2], + "13993548620104010490": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "16781127329510211966": ["convolution_gpu_bfyx_gemm_like",2], + "15239764240622554314": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",0], + "15746620724134970969": ["convolution_gpu_bfyx_1x1",0], + "15163327502374403643": ["convolution_gpu_bfyx_os_iyx_osv16",179], + "15728009639807698634": ["convolution_gpu_bfyx_os_iyx_osv16",594], + "4436244774193918646": ["fully_connected_gpu_fb_oi_ref",0], + "2683507674615735878": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "10795104632256101599": ["convolution_gpu_bfyx_os_iyx_osv16",214], + "2909728331855309274": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "13002363400738122017": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "11007944497812650617": ["convolution_gpu_bfyx_gemm_like",2], + "15695415285791951018": ["convolution_gpu_bfyx_gemm_like",0], + "14011124615649605281": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "16245760498096322525": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "4124478505694604763": ["convolution_gpu_bfyx_1x1",1], + "17219920118109316867": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "17353894529222574441": ["convolution_gpu_bfyx_os_iyx_osv16",551], + "15349944413643626251": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "5608133987357542077": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "17739868787095417856": ["convolution_gpu_bfyx_os_iyx_osv16",431], + "9226912483632588371": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9057036344533510776": 
["convolution_gpu_bfyx_gemm_like",2], + "797387385159110695": ["convolution_gpu_bfyx_gemm_like",2], + "13951781924205611716": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "2781309272856442321": ["convolution_gpu_bfyx_1x1",2], + "3863816884636503247": ["convolution_gpu_bfyx_gemm_like",2], + "4400247897123856252": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "2235210915304938149": ["convolution_gpu_bfyx_gemm_like",2], + "15890473622821659630": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "2638131332283395057": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6678796313875454849": ["convolution_gpu_bfyx_gemm_like",2], + "4542143431130171516": ["convolution_gpu_bfyx_os_iyx_osv16",714], + "17995371099806008878": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "4750755523645265967": ["convolution_gpu_bfyx_gemm_like",2], + "6854611304056079417": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "11709992724966310174": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "11988546375476924356": ["convolution_gpu_bfyx_os_iyx_osv16",53], + "14671212883301405408": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14444475853714164129": ["convolution_gpu_bfyx_os_iyx_osv16",364], + "17830290099875088207": ["convolution_gpu_bfyx_gemm_like",2], + "11771014003680394135": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "11988285441493553006": ["convolution_gpu_bfyx_gemm_like",0], + "10408322429232132983": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "16577611471466452776": ["convolution_gpu_bfyx_gemm_like",1], + "726985753660756762": ["convolution_gpu_bfyx_os_iyx_osv16",625], + "16172528828198474326": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8127190765748950828": ["convolution_gpu_bfyx_os_iyx_osv16",644], + "15470013032930986062": ["convolution_gpu_bfyx_os_iyx_osv16",482], + "11878734040194151073": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7817036102984218692": ["convolution_gpu_bfyx_os_iyx_osv16",362], + "14046990030104971367": ["convolution_gpu_bfyx_os_iyx_osv16",733], + "18059267466971880386": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "12864558900883069118": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13793441296561946357": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "12391792381149655331": ["convolution_gpu_bfyx_gemm_like",2], + "5211191663202250117": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "2893564501191050837": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "7706714181281908433": ["convolution_gpu_bfyx_gemm_like",2], + "9824678205469832038": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "7603319690872333930": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "11315238071192463859": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8619526128410675593": ["convolution_gpu_bfyx_os_iyx_osv16",1112], + "4897991181236908768": ["convolution_gpu_bfyx_gemm_like",2], + "11530101016435264783": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "9794456440994218671": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "2653651564133701304": ["convolution_gpu_bfyx_os_iyx_osv16",1097], + "4190912926126844643": ["convolution_gpu_bfyx_1x1",1], + "17006133396401462698": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "12516911293946682547": ["convolution_gpu_bfyx_os_iyx_osv16",620], + "18245935804520236353": ["convolution_gpu_bfyx_direct_10_12_16",0], + "14406070210216948643": ["convolution_gpu_bfyx_os_iyx_osv16",634], + "4885944395876887711": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "7692849839965441330": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "18202222342562516071": ["convolution_gpu_bfyx_os_iyx_osv16",890], + 
"8616686489737649890": ["convolution_gpu_bfyx_os_iyx_osv16",190], + "3106922888635965020": ["convolution_gpu_bfyx_direct_10_12_16",0], + "8995598177504756805": ["convolution_gpu_bfyx_os_iyx_osv16",855], + "3063055767192991776": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "4220826666482500445": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "17108987360340581555": ["fully_connected_gpu_bf_io_input_spatial",0], + "9541630719145326121": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "12843671306854567956": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "509781001842353609": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "14823616678465136590": ["convolution_gpu_winograd_6x3_s1_fused",1], + "12644942072153919043": ["convolution_gpu_bfyx_direct_10_12_16",0], + "8739347545059610410": ["convolution_gpu_bfyx_gemm_like",2], + "2371412124305478965": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12790570304622911607": ["convolution_gpu_bfyx_os_iyx_osv16",1028], + "7351733901977025859": ["convolution_gpu_bfyx_os_iyx_osv16",53], + "18128162750557822655": ["convolution_gpu_bfyx_os_iyx_osv16",110], + "16710010075465723498": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "6650607472019166205": ["convolution_gpu_bfyx_1x1",1], + "8916983923551808409": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "15188570678726970998": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "16871004845988227014": ["convolution_gpu_bfyx_1x1",1], + "8857763129101380288": ["convolution_gpu_bfyx_os_iyx_osv16",1001], + "7430073011895298582": ["convolution_gpu_bfyx_direct_10_12_16",0], + "7181154048972884375": ["convolution_gpu_bfyx_gemm_like",2], + "18357544235608006954": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "16084700435355748612": ["convolution_gpu_bfyx_os_iyx_osv16",970], + "8818070832398055086": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15831600396403741571": ["convolution_gpu_bfyx_gemm_like",2], + "8075180350084516696": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14898892437285105327": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "10022487076451608714": ["convolution_gpu_bfyx_os_iyx_osv16",695], + "17921973525603585874": ["convolution_gpu_bfyx_gemm_like",1], + "11311859068168414878": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "10509933181132310969": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11682323163346544125": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "15773157615731010456": ["convolution_gpu_bfyx_os_iyx_osv16",529], + "18126685473408206840": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "16720108310653948550": ["convolution_gpu_winograd_6x3_s1_fused",2], + "15223164574152266895": ["convolution_gpu_bfyx_1x1",2], + "8881135571874888085": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "10308113903347312964": ["convolution_gpu_bfyx_gemm_like",2], + "5091558853871982858": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "5738835498104275267": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "10930640103080573253": ["convolution_gpu_bfyx_1x1",1], + "5609922876429907954": ["convolution_gpu_bfyx_gemm_like",0], + "15800447082078291243": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "14184895905338394239": ["convolution_gpu_bfyx_os_iyx_osv16",960], + "13842309033760176194": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2133849627845285277": ["convolution_gpu_bfyx_os_iyx_osv16",802], + "4801117903303888658": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "10890975553758439233": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "3646228701104397128": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "13145474177271090694": 
["convolution_gpu_bfyx_gemm_like",2], + "2995134938466176198": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12134858519320245809": ["convolution_gpu_bfyx_1x1",2], + "18235209540858013173": ["convolution_gpu_bfyx_1x1",0], + "11083993858285515074": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "13710319251108632115": ["convolution_gpu_bfyx_1x1",1], + "15494543914974994991": ["convolution_gpu_bfyx_gemm_like",2], + "4479979951990338510": ["convolution_gpu_bfyx_direct_10_12_16",0], + "1920042803083729276": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "16589607587365212240": ["convolution_gpu_bfyx_gemm_like",2], + "7678457226823073886": ["convolution_gpu_bfyx_os_iyx_osv16",187], + "3934290309368153435": ["fully_connected_gpu_bf_io_gemm",0], + "10151922632636937118": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12566041126392848976": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "14956246091163580499": ["convolution_gpu_bfyx_os_iyx_osv16",480], + "9028970753877215614": ["convolution_gpu_bfyx_os_iyx_osv16",861], + "7005509036795164602": ["convolution_gpu_bfyx_1x1",0], + "9918371346247634545": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "3814584042139408454": ["convolution_gpu_bfyx_os_iyx_osv16",1065], + "12177387334053203378": ["convolution_gpu_bfyx_gemm_like",2], + "5485749317130402302": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "13471752029049484143": ["convolution_gpu_bfyx_gemm_like",2], + "7368916076070115064": ["convolution_gpu_bfyx_os_iyx_osv16",1002], + "1044978617045366709": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "14621327324047759584": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "8961138963663532667": ["convolution_gpu_bfyx_gemm_like",1], + "9316082753126682958": ["convolution_gpu_bfyx_gemm_like",1], + "9058996149754556268": ["convolution_gpu_bfyx_os_iyx_osv16",673], + "7880845322716481548": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "14990645740260870030": ["convolution_gpu_bfyx_os_iyx_osv16",938], + "6219075471508685758": ["convolution_gpu_bfyx_gemm_like",0], + "16208488491972128275": ["convolution_gpu_bfyx_os_iyx_osv16",814], + "6329618009202266591": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10486348549691280032": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "11872943152839631823": ["convolution_gpu_bfyx_os_iyx_osv16",655], + "8560635685184432720": ["convolution_gpu_bfyx_direct_10_12_16",0], + "3873183249402084406": ["convolution_gpu_bfyx_gemm_like",2], + "11418379777288974452": ["convolution_gpu_bfyx_gemm_like",0], + "5469227748156438008": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "394778201589371681": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "13699740641705514374": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "16955653765071712611": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1875764913306932583": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "18423051691107460439": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "7082007579524697455": ["convolution_gpu_bfyx_os_iyx_osv16",1114], + "3806761527342944195": ["convolution_gpu_bfyx_gemm_like",0], + "16243196137456624852": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "7683334381958571864": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "2406816735581074778": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "11856266545854830143": ["convolution_gpu_bfyx_gemm_like",2], + "16881283637687482989": ["convolution_gpu_bfyx_os_iyx_osv16",458], + "8762901342272872498": ["convolution_gpu_bfyx_os_iyx_osv16",62], + "13602140021189675477": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "282274448389888221": 
["convolution_gpu_bfyx_os_iyx_osv16",659], + "15641537661939240413": ["convolution_gpu_bfyx_os_iyx_osv16",1116], + "14896875712028630045": ["convolution_gpu_bfyx_os_iyx_osv16",1057], + "12427258337646070422": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16131448347558322280": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "677249604491773387": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16710651492402564794": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9525853014023664813": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "11169292427557543138": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "7338229552985076723": ["convolution_gpu_bfyx_gemm_like",2], + "18137106379929135901": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "13121297281694293907": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "10499265278415026816": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11062100629646715785": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "579781312141502576": ["convolution_gpu_bfyx_1x1",2], + "7962991673727743706": ["convolution_gpu_bfyx_os_iyx_osv16",766], + "15908673392788376468": ["convolution_gpu_bfyx_os_iyx_osv16",1004], + "9462315044265139531": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "397770940444464146": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "15814015810740458605": ["convolution_gpu_bfyx_1x1",0], + "10290107543739998181": ["fully_connected_gpu_bf_io_input_spatial",0], + "4142978475842207311": ["convolution_gpu_bfyx_gemm_like",2], + "12190841837604350271": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "1778345646142852816": ["convolution_gpu_bfyx_gemm_like",2], + "1697248235682953135": ["convolution_gpu_bfyx_gemm_like",2], + "3041752019114501584": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "13698389420396031586": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "12517838703662330663": ["convolution_gpu_bfyx_os_iyx_osv16",15], + "5319668297345215520": ["convolution_gpu_bfyx_gemm_like",1], + "6213386558868267629": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "9513032457323269513": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "15851356529373376076": ["convolution_gpu_bfyx_os_iyx_osv16",403], + "1212319037405620223": ["convolution_gpu_bfyx_gemm_like",2], + "1972879521448306536": ["convolution_gpu_bfyx_os_iyx_osv16",1115], + "12626014184575881530": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "3138374672801504481": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "14680730265621679042": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "10118395047539851751": ["convolution_gpu_bfyx_gemm_like",2], + "13954821927253849036": ["convolution_gpu_bfyx_os_iyx_osv16",692], + "7852144838267007144": ["convolution_gpu_bfyx_os_iyx_osv16",833], + "14335423820860953927": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11690334177981352452": ["convolution_gpu_bfyx_os_iyx_osv16",13], + "12213354854947437262": ["convolution_gpu_bfyx_1x1",1], + "11829442945690098558": ["convolution_gpu_bfyx_gemm_like",0], + "8609939102588915855": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "9076758673133996959": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "16036386660666696362": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "13609660900720370993": ["convolution_gpu_bfyx_1x1",2], + "1318571118468536310": ["convolution_gpu_bfyx_os_iyx_osv16",365], + "7264274394359484318": ["convolution_gpu_bfyx_os_iyx_osv16",361], + "9213886570531053949": ["convolution_gpu_bfyx_os_iyx_osv16",801], + "18186615266760475767": ["convolution_gpu_bfyx_os_iyx_osv16",652], + "9152451371616153112": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], 
+ "6843617687528352801": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "11113256687741667688": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13776178598632392721": ["convolution_gpu_bfyx_os_iyx_osv16",179], + "3374410641320310726": ["convolution_gpu_bfyx_os_iyx_osv16",911], + "8594644182487917002": ["convolution_gpu_winograd_6x3_s1_fused",0], + "7009873605945341897": ["convolution_gpu_bfyx_gemm_like",2], + "10001963042016663554": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "8792010676469476740": ["convolution_gpu_bfyx_gemm_like",2], + "13282951481330978659": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "15471470494305051299": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17427036330773218054": ["convolution_gpu_bfyx_os_iyx_osv16",291], + "3534971503826416049": ["convolution_gpu_bfyx_os_iyx_osv16",736], + "11066913713501760080": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "2379484884827231127": ["fully_connected_gpu_bf_io_input_spatial",0], + "17536308070854915513": ["convolution_gpu_bfyx_1x1",1], + "3644282167178264526": ["convolution_gpu_bfyx_gemm_like",2], + "14204609663091442879": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "10782611933832492335": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "1059505639883914386": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "13804221028705631415": ["convolution_gpu_bfyx_gemm_like",2], + "10724501418439612080": ["convolution_gpu_bfyx_os_iyx_osv16",1001], + "9373353053843326128": ["convolution_gpu_bfyx_direct_10_12_16",0], + "17123463568694499533": ["convolution_gpu_bfyx_gemm_like",2], + "7852745450437172519": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "17408275657360833363": ["convolution_gpu_bfyx_1x1",0], + "8689206546467098603": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "3800011935243649447": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "7481256533438761028": ["convolution_gpu_bfyx_os_iyx_osv16",1067], + "16146350476627599543": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "1841155673858789206": ["fully_connected_gpu_fb_oi_ref",2], + "16067605128297748820": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "18084635102736402756": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "15003778740401601065": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2283157145557154450": ["convolution_gpu_bfyx_1x1",0], + "3463959257726925426": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "2908249767551054613": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "8767817856303586064": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "11020315012951440351": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "7877332346656934022": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "12141300895511301068": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "14826791706471872785": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2242915551775617989": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "9798585825695496550": ["convolution_gpu_bfyx_gemm_like",2], + "6101196122606108273": ["convolution_gpu_bfyx_gemm_like",2], + "12971822824884826169": ["convolution_gpu_bfyx_gemm_like",2], + "15006321421735686121": ["convolution_gpu_bfyx_gemm_like",0], + "13488495920546871271": ["convolution_gpu_bfyx_os_iyx_osv16",854], + "13163146272900339330": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "16711955423531846725": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "14729854278671832528": ["convolution_gpu_bfyx_os_iyx_osv16",236], + "2683304757433993300": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "1474271081523145413": ["convolution_gpu_bfyx_gemm_like",2], + "15192024816519005250": 
["convolution_gpu_bfyx_os_iyx_osv16",248], + "13025323039227543550": ["convolution_gpu_bfyx_os_iyx_osv16",988], + "8258382025812748961": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11686670048744589243": ["convolution_gpu_bfyx_gemm_like",2], + "314054598858070952": ["convolution_gpu_bfyx_gemm_like",2], + "15809639778580769565": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "4202645222013675478": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "8913823292181409151": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "15839295895890205274": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "11565861421381730304": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "805221045541170643": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10650698451740924172": ["convolution_gpu_bfyx_direct_10_12_16",1], + "490233152678323691": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "14089893422771228191": ["convolution_gpu_bfyx_os_iyx_osv16",52], + "12680339228267704518": ["convolution_gpu_bfyx_os_iyx_osv16",120], + "14230493618724018658": ["convolution_gpu_bfyx_gemm_like",2], + "3860667078458481972": ["convolution_gpu_bfyx_gemm_like",2], + "804195263636995800": ["convolution_gpu_bfyx_gemm_like",2], + "10682300249493137042": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "18012549942299450620": ["convolution_gpu_bfyx_gemm_like",2], + "15217183882858251099": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "5275016494706355806": ["convolution_gpu_bfyx_gemm_like",0], + "2567046336192437734": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "5927467766675317093": ["fully_connected_gpu_bf_io_input_spatial",2], + "13852065717057446998": ["convolution_gpu_bfyx_gemm_like",2], + "15378025640603637387": ["convolution_gpu_bfyx_gemm_like",2], + "11529876081402974396": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "8268533335852735248": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "116291934148608396": ["convolution_gpu_bfyx_os_iyx_osv16",340], + "16949056117405140365": ["convolution_gpu_bfyx_os_iyx_osv16",1071], + "8251544171504007740": ["convolution_gpu_bfyx_gemm_like",2], + "6419580456182610836": ["convolution_gpu_bfyx_os_iyx_osv16",465], + "5834245904292669645": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "8132521728369930959": ["convolution_gpu_bfyx_gemm_like",2], + "8124736388338424498": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "2780423409483867058": ["convolution_gpu_bfyx_1x1",0], + "12098146032672599222": ["convolution_gpu_bfyx_os_iyx_osv16",667], + "4747159205186229582": ["convolution_gpu_bfyx_gemm_like",1], + "15967614281807823696": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "13486084204140096478": ["convolution_gpu_bfyx_gemm_like",2], + "14716719350966652036": ["convolution_gpu_bfyx_gemm_like",2], + "6821855018718422278": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "17599383258252980421": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "16683485007140805060": ["fully_connected_gpu_fb_io_ref",1], + "3286629188347536485": ["fully_connected_gpu_bf_io_input_spatial",2], + "18275601715050791851": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "13272818502368975319": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7800015766976654402": ["convolution_gpu_bfyx_gemm_like",2], + "13835908664998757647": ["fully_connected_gpu_fb_oi_ref",1], + "12394049027081208902": ["convolution_gpu_bfyx_gemm_like",1], + "5245308722062496788": ["convolution_gpu_bfyx_os_iyx_osv16",316], + "5754844816339228920": ["convolution_gpu_bfyx_os_iyx_osv16",1045], + "10797908931694274013": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "13054405729329143152": 
["convolution_gpu_bfyx_os_iyx_osv16",459], + "1563987925712579649": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "8260130048649729185": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "5303170164698694791": ["fully_connected_gpu_bf_io_gemm",2], + "12541834857357563605": ["convolution_gpu_bfyx_os_iyx_osv16",884], + "7134654288295280046": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "14429081455612806819": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "16749148369456398030": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "7279393739634103483": ["convolution_gpu_bfyx_direct_10_12_16",0], + "5622089373755094139": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14151747022287993729": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14025496192869856801": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "10436819182310112786": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "2294800960010879540": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "11855070245618904113": ["convolution_gpu_bfyx_os_iyx_osv16",943], + "11243840588602365090": ["convolution_gpu_bfyx_os_iyx_osv16",757], + "1939140810847988694": ["convolution_gpu_bfyx_gemm_like",2], + "12028665820838352309": ["convolution_gpu_bfyx_direct_10_12_16",2], + "419783127503173016": ["convolution_gpu_bfyx_os_iyx_osv16",185], + "15947699374684516369": ["convolution_gpu_bfyx_gemm_like",2], + "11936419502418995274": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "2934519615045138808": ["convolution_gpu_bfyx_os_iyx_osv16",948], + "6293403765897901528": ["convolution_gpu_bfyx_gemm_like",2], + "5390559917122707732": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "9195732599757736182": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "11873734271080160669": ["convolution_gpu_bfyx_os_iyx_osv16",190], + "3499106702307464480": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "8881150100883636392": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "4156384238797998294": ["convolution_gpu_bfyx_os_iyx_osv16",652], + "6343888265369366589": ["convolution_gpu_bfyx_os_iyx_osv16",293], + "2369451367723962073": ["convolution_gpu_bfyx_1x1",2], + "5594180958505308003": ["convolution_gpu_bfyx_os_iyx_osv16",179], + "10170577772376890221": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "543472136359161929": ["convolution_gpu_bfyx_os_iyx_osv16",1115], + "1791615587935799399": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "10483664832302187567": ["convolution_gpu_bfyx_os_iyx_osv16",774], + "7715649642603303319": ["convolution_gpu_bfyx_1x1",2], + "10626341369865893888": ["convolution_gpu_bfyx_gemm_like",2], + "17556238490521153146": ["convolution_gpu_bfyx_os_iyx_osv16",468], + "2912098199463107173": ["convolution_gpu_bfyx_1x1",1], + "12242618640422208652": ["convolution_gpu_bfyx_os_iyx_osv16",1065], + "8317673282128335201": ["convolution_gpu_bfyx_os_iyx_osv16",361], + "10989937450490049763": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "4927360358387344983": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13816104794723484993": ["convolution_gpu_winograd_6x3_s1_fused",1], + "1104489643524273315": ["convolution_gpu_bfyx_os_iyx_osv16",130], + "14805540705424073865": ["convolution_gpu_bfyx_os_iyx_osv16",614], + "18142462471803295391": ["convolution_gpu_bfyx_1x1",2], + "11327097771110264965": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "142329025839464842": ["convolution_gpu_bfyx_1x1",1], + "9257078583742821465": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "4865023158176874622": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "3212789693085089063": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "4716188972902735458": 
["convolution_gpu_bfyx_os_iyx_osv16",678], + "17490471699618303993": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "3239033622277917802": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "15094664469997373662": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "14959566236432790882": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "1458615259705605525": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "7937870623766562191": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "6727930402459775131": ["convolution_gpu_bfyx_gemm_like",0], + "2349007644347065353": ["convolution_gpu_bfyx_gemm_like",2], + "7565221050911842393": ["convolution_gpu_bfyx_os_iyx_osv16",110], + "17053671692908867872": ["convolution_gpu_bfyx_os_iyx_osv16",341], + "16362857896338778056": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "16694984452720336415": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "9381304526221508530": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "14554225625951128811": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "12860222041026638681": ["convolution_gpu_bfyx_gemm_like",2], + "709835724029986012": ["convolution_gpu_bfyx_os_iyx_osv16",565], + "7604075520418038662": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13890118723041457532": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "4703107905652287491": ["convolution_gpu_bfyx_gemm_like",2], + "2543995971214089085": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "12514693341682532560": ["convolution_gpu_bfyx_os_iyx_osv16",152], + "15386715291503303766": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "4750894407873652809": ["convolution_gpu_bfyx_os_iyx_osv16",346], + "11198908896401597838": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15451919862187018297": ["convolution_gpu_winograd_6x3_s1_fused",1], + "13436376034548670107": ["convolution_gpu_bfyx_os_iyx_osv16",360], + "5374969798377773063": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "11330591026581463934": ["convolution_gpu_bfyx_gemm_like",2], + "1596353239542510685": ["convolution_gpu_bfyx_os_iyx_osv16",360], + "3167336012388169649": ["convolution_gpu_bfyx_os_iyx_osv16",531], + "2802810524370514276": ["convolution_gpu_bfyx_gemm_like",0], + "12118387933632797428": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4133424990380177132": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "12174571114411168588": ["convolution_gpu_bfyx_direct_10_12_16",0], + "10179916356323479080": ["convolution_gpu_bfyx_gemm_like",2], + "5926747396493954633": ["convolution_gpu_bfyx_os_iyx_osv16",229], + "4628748977913534701": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "2362092095402043749": ["convolution_gpu_bfyx_gemm_like",2], + "18375125668176498051": ["convolution_gpu_bfyx_gemm_like",2], + "11560634267092054110": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "13809330759308309353": ["convolution_gpu_bfyx_os_iyx_osv16",979], + "9891428775774615719": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "7578177053220150569": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "2089730611490367290": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1077773457856682663": ["convolution_gpu_bfyx_gemm_like",2], + "17343050785312683560": ["convolution_gpu_bfyx_os_iyx_osv16",625], + "15065925414996398951": ["convolution_gpu_bfyx_1x1",0], + "597073780328219388": ["convolution_gpu_bfyx_gemm_like",2], + "5873257164958285393": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "3950738240651133849": ["convolution_gpu_bfyx_os_iyx_osv16",736], + "14026570177552137240": ["convolution_gpu_bfyx_gemm_like",2], + "10565371760124443824": 
["convolution_gpu_bfyx_os_iyx_osv16",364], + "4278280309700908015": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "1561225943337590599": ["convolution_gpu_bfyx_os_iyx_osv16",518], + "17248756229500447131": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "381149736509958403": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "4474697990228400564": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14068780861332616363": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "745009493367761775": ["convolution_gpu_bfyx_gemm_like",2], + "1316444335300814745": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "13893808009363736870": ["convolution_gpu_bfyx_gemm_like",2], + "2668729552208169959": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8071957466247137919": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "16386955278777720573": ["convolution_gpu_bfyx_os_iyx_osv16",848], + "9723314434598141024": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "9314293064351558241": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "9321208819255762521": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "3880189981766119529": ["convolution_gpu_bfyx_os_iyx_osv16",427], + "12052207771201936228": ["convolution_gpu_bfyx_gemm_like",2], + "18218755616248669884": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "10900880512948479338": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "11031625790234068916": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "9731370183088819573": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "1616603916015535857": ["fully_connected_gpu_bf_io_input_spatial",1], + "4674416595144505741": ["convolution_gpu_bfyx_gemm_like",2], + "9144487908815767824": ["convolution_gpu_bfyx_1x1",0], + "8176012042686275874": ["convolution_gpu_bfyx_gemm_like",2], + "6548949901446632697": ["convolution_gpu_bfyx_1x1",2], + "2296581485980163665": ["convolution_gpu_bfyx_direct_10_12_16",2], + "872401732136570312": ["convolution_gpu_bfyx_os_iyx_osv16",342], + "14670068483447729857": ["convolution_gpu_winograd_6x3_s1_fused",0], + "14764715930784496165": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "17517495652165026573": ["convolution_gpu_bfyx_direct_10_12_16",0], + "15884763176333003771": ["convolution_gpu_bfyx_os_iyx_osv16",364], + "13973179950424276578": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "15428591250165788477": ["convolution_gpu_bfyx_os_iyx_osv16",340], + "3988024997010367546": ["convolution_gpu_bfyx_os_iyx_osv16",693], + "18184621367843960190": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "9488453013746383896": ["convolution_gpu_bfyx_gemm_like",1], + "4236174000795439083": ["convolution_gpu_bfyx_gemm_like",2], + "15914512645931208899": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "16158139166784964096": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "6767159196241633301": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "9421643783312790618": ["convolution_gpu_winograd_6x3_s1_fused",2], + "5056859994174498686": ["convolution_gpu_bfyx_gemm_like",2], + "8790625191540101806": ["convolution_gpu_bfyx_gemm_like",0], + "6642767323474835034": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "3491333679577961640": ["convolution_gpu_bfyx_gemm_like",0], + "15984885011101717258": ["convolution_gpu_bfyx_os_iyx_osv16",1115], + "3336303478756453360": ["convolution_gpu_bfyx_gemm_like",1], + "1096671695414716274": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "13775529405693629438": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6391847213494189692": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3325727286860556323": ["convolution_gpu_bfyx_gemm_like",1], + 
"17084977396231597605": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3928266232090746643": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "5364060938737428149": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "4465701487417893814": ["convolution_gpu_bfyx_gemm_like",2], + "17770104464900126615": ["convolution_gpu_bfyx_1x1",2], + "18118237182023167949": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "530491406341772040": ["convolution_gpu_bfyx_gemm_like",2], + "13330734840729670622": ["convolution_gpu_bfyx_gemm_like",0], + "5782934278345953016": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "3255465741612432300": ["convolution_gpu_bfyx_os_iyx_osv16",814], + "2710485608298356329": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17808913959977434594": ["convolution_gpu_bfyx_gemm_like",0], + "3833510944499257797": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "10991423760161409883": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "11327228813412934262": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "15975964562807570772": ["convolution_gpu_bfyx_os_iyx_osv16",740], + "1604661321386793876": ["convolution_gpu_winograd_6x3_s1_fused",1], + "16934879647229234163": ["convolution_gpu_bfyx_gemm_like",2], + "7969848911698660033": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "6988492019664525206": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "16173557782125372935": ["convolution_gpu_bfyx_direct_10_12_16",0], + "879939701282942121": ["convolution_gpu_bfyx_os_iyx_osv16",918], + "10892706534058849825": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "12917241193304093727": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "16120988958246503683": ["convolution_gpu_bfyx_os_iyx_osv16",1017], + "11726298758004767743": ["convolution_gpu_bfyx_gemm_like",2], + "7958443549125799229": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "9660812093766156608": ["convolution_gpu_bfyx_direct_10_12_16",0], + "1907439276166837309": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9796621763733208035": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "6232363902828992968": ["convolution_gpu_bfyx_os_iyx_osv16",735], + "5124080536266387783": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "16998508915819714690": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "13663893159182636270": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "2582625260054352916": ["convolution_gpu_bfyx_gemm_like",2], + "5334566325056222430": ["convolution_gpu_bfyx_gemm_like",1], + "13338594271376045657": ["convolution_gpu_bfyx_gemm_like",2], + "17465517455679097501": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "15059549186302099880": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "3860603464276263676": ["convolution_gpu_bfyx_gemm_like",2], + "2856601829807186494": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "1095495157025479260": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "2597453794298356435": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "18299254635579957284": ["convolution_gpu_bfyx_1x1",1], + "6848989271874647093": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "826850797666395121": ["convolution_gpu_bfyx_gemm_like",2], + "1403617451623027879": ["convolution_gpu_bfyx_os_iyx_osv16",297], + "2321767794934000238": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "4126895998426674411": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "15781622938833984014": ["convolution_gpu_bfyx_direct_10_12_16",0], + "7351401242363888463": ["convolution_gpu_bfyx_gemm_like",2], + "8337820318779061494": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "360872770877634346": 
["convolution_gpu_bfyx_gemm_like",2], + "5656623709782744241": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "4673127824919879657": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "12956726277674279950": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "7026575758396092435": ["convolution_gpu_bfyx_os_iyx_osv16",549], + "10792503079194374004": ["convolution_gpu_bfyx_os_iyx_osv16",174], + "13497225521878034159": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11215217005872946038": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "12293786134765875615": ["convolution_gpu_bfyx_gemm_like",2], + "4720851194954041037": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "472454322186482185": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "10722677916294015259": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "7824524940405130010": ["convolution_gpu_winograd_6x3_s1_fused",2], + "12232696287029987946": ["convolution_gpu_bfyx_os_iyx_osv16",914], + "5406129421969383274": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "6574971185849732667": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "12727541507197887360": ["convolution_gpu_bfyx_os_iyx_osv16",1023], + "4957638663977636791": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "846088275031979661": ["convolution_gpu_winograd_6x3_s1_fused",1], + "9153779186876518773": ["convolution_gpu_bfyx_gemm_like",2], + "11706378390483804857": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "5714365398623475983": ["convolution_gpu_bfyx_1x1",0], + "5195511638783481084": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "5941092474669713339": ["convolution_gpu_bfyx_gemm_like",1], + "13526488884846845330": ["convolution_gpu_bfyx_gemm_like",2], + "2659031931257084418": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "6708349666663292171": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "14767888121198814523": ["convolution_gpu_bfyx_os_iyx_osv16",128], + "8272823732258536202": ["convolution_gpu_bfyx_direct_10_12_16",0], + "14331658870024759698": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "10432365444137108781": ["convolution_gpu_bfyx_os_iyx_osv16",1063], + "7546586420552408243": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "2877521658768725103": ["convolution_gpu_bfyx_gemm_like",2], + "13830605041347009953": ["convolution_gpu_bfyx_gemm_like",2], + "17434429579652310107": ["convolution_gpu_bfyx_gemm_like",2], + "6980201892073961793": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "8794896449397768269": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "951747146164097188": ["convolution_gpu_bfyx_1x1",1], + "2597523728660247862": ["convolution_gpu_bfyx_os_iyx_osv16",618], + "8376077531098664520": ["convolution_gpu_bfyx_gemm_like",0], + "11609821372586026178": ["convolution_gpu_bfyx_gemm_like",2], + "17316626950179740845": ["convolution_gpu_bfyx_os_iyx_osv16",561], + "8975333906619899020": ["convolution_gpu_bfyx_gemm_like",2], + "13189392239349392492": ["convolution_gpu_bfyx_os_iyx_osv16",662], + "6769243149577568817": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "7056293586529818253": ["convolution_gpu_bfyx_gemm_like",0], + "12707946849050970702": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "11308583200952256245": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9594594523961285945": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "15192230303376521834": ["convolution_gpu_bfyx_gemm_like",2], + "16717713360264747483": ["convolution_gpu_bfyx_gemm_like",2], + "475043738497218394": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "11110173861174257158": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + 
"6780215829176686721": ["convolution_gpu_bfyx_os_iyx_osv16",363], + "13869716373706247686": ["convolution_gpu_bfyx_gemm_like",2], + "9323825370872655346": ["convolution_gpu_bfyx_os_iyx_osv16",1090], + "10774528268153772208": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "1353170363915443814": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1419073145594317633": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "9545968464906009869": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1902656726461670148": ["convolution_gpu_bfyx_os_iyx_osv16",1116], + "17006095064160484022": ["convolution_gpu_bfyx_os_iyx_osv16",810], + "14472187692485966933": ["convolution_gpu_bfyx_gemm_like",0], + "2575631797904040925": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "8780604510524622314": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "16925721317097534009": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5751283221740229986": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "13550435052563656432": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "18218631037214746168": ["convolution_gpu_bfyx_gemm_like",2], + "1045854873741563331": ["convolution_gpu_bfyx_gemm_like",2], + "11115684531624462986": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "4617809377006148936": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "6997971129340865650": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "2930898141522848681": ["convolution_gpu_bfyx_1x1",2], + "6303682540621797774": ["convolution_gpu_bfyx_os_iyx_osv16",717], + "16117738994809548007": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "4408772370026995920": ["convolution_gpu_bfyx_os_iyx_osv16",578], + "7060804814325505165": ["convolution_gpu_bfyx_gemm_like",2], + "1774158624592967937": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "123251351612308092": ["convolution_gpu_bfyx_os_iyx_osv16",836], + "1718634913016284523": ["convolution_gpu_bfyx_1x1",2], + "3101087806792514129": ["convolution_gpu_bfyx_1x1",2], + "14230385851791760020": ["convolution_gpu_bfyx_os_iyx_osv16",507], + "11430400968543668873": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "7843508201826629532": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "1984152634309440563": ["convolution_gpu_bfyx_gemm_like",2], + "3017891343734146267": ["convolution_gpu_bfyx_os_iyx_osv16",102], + "2038505773698938555": ["fully_connected_gpu_bs_f_bsv16_b1",2], + "13483175684542464385": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "17713034180977313726": ["convolution_gpu_bfyx_direct_10_12_16",0], + "7561096442572829049": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "1801731858063091191": ["convolution_gpu_bfyx_gemm_like",1], + "16772854836230971016": ["convolution_gpu_bfyx_os_iyx_osv16",302], + "18122858611264877646": ["convolution_gpu_bfyx_gemm_like",2], + "8458082326743351141": ["convolution_gpu_bfyx_gemm_like",2], + "11634932044447867039": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "8153567933591966877": ["convolution_gpu_bfyx_gemm_like",2], + "2715447739580688669": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "17377315194963069204": ["fully_connected_gpu_fb_oi_ref",2], + "7924408980408826942": ["convolution_gpu_bfyx_os_iyx_osv16",576], + "15767973630744679517": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "959260710517842876": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "6192955702438301372": ["convolution_gpu_bfyx_os_iyx_osv16",647], + "17243648226968859637": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "9192665896782282996": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "14916625550370402883": ["convolution_gpu_bfyx_os_iyx_osv16",711], + 
"12604104383683210104": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "3217246278485567748": ["convolution_gpu_bfyx_gemm_like",2], + "13011676362747785816": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17793292063552633023": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "12014527187730671229": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "1999979442136861875": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "14910223536998380801": ["convolution_gpu_bfyx_direct_10_12_16",0], + "15914107501176673997": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "5824801192141531089": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "3277243911383750280": ["convolution_gpu_bfyx_gemm_like",1], + "16761856644242716357": ["convolution_gpu_bfyx_os_iyx_osv16",482], + "10681768474583067517": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "9631545863582097486": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "1040030752340209480": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "7688176479120305539": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3499109651698979012": ["convolution_gpu_bfyx_os_iyx_osv16",1107], + "10556089809203693400": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "4880150897829846031": ["convolution_gpu_bfyx_1x1",2], + "9840495023131952174": ["convolution_gpu_winograd_6x3_s1_fused",0], + "954796765467489259": ["convolution_gpu_bfyx_os_iyx_osv16",301], + "13810995219720233595": ["convolution_gpu_bfyx_os_iyx_osv16",739], + "6578908625437515675": ["convolution_gpu_bfyx_direct_10_12_16",0], + "1375156980278317418": ["convolution_gpu_bfyx_gemm_like",2], + "4750513665628842598": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "11254635684957519432": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "16692569816843207989": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "16566214123371867456": ["convolution_gpu_bfyx_gemm_like",2], + "9287404618748313247": ["convolution_gpu_bfyx_os_iyx_osv16",315], + "16402312692470500253": ["convolution_gpu_bfyx_gemm_like",2], + "10880081193716628051": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "9942726414918759892": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12351866693978844266": ["convolution_gpu_bfyx_os_iyx_osv16",179], + "5192552432194195116": ["convolution_gpu_bfyx_gemm_like",2], + "17089801601582809764": ["convolution_gpu_bfyx_gemm_like",2], + "693883892843558363": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "1594612401422787491": ["convolution_gpu_bfyx_gemm_like",2], + "16403423801823379909": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "989564341557094953": ["convolution_gpu_bfyx_os_iyx_osv16",69], + "2458592904274981909": ["fully_connected_gpu_bf_io_input_spatial",1], + "15678385128478075284": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6483208845600234755": ["convolution_gpu_bfyx_os_iyx_osv16",200], + "16487774205195979355": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "12181889163404078773": ["convolution_gpu_bfyx_os_iyx_osv16",693], + "10865695385270390803": ["convolution_gpu_bfyx_os_iyx_osv16",644], + "8797843396807284399": ["convolution_gpu_bfyx_os_iyx_osv16",716], + "12558716383635737426": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "16384186388687043048": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "2968031010495399536": ["convolution_gpu_bfyx_gemm_like",2], + "17829148383265978140": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "12223993560805441284": ["convolution_gpu_bfyx_gemm_like",2], + "8421388456873652700": ["convolution_gpu_bfyx_os_iyx_osv16",267], + "7143510787416483146": ["convolution_gpu_bfyx_os_iyx_osv16",271], + "13144385730409574259": 
["convolution_gpu_bfyx_os_iyx_osv16",708], + "6003409324516527726": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1071007164550012186": ["convolution_gpu_bfyx_os_iyx_osv16",476], + "794499287296495726": ["convolution_gpu_bfyx_1x1",0], + "1152691534728260611": ["convolution_gpu_bfyx_1x1",0], + "7777333052643961206": ["convolution_gpu_bfyx_os_iyx_osv16",932], + "16511749893955141055": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "12370729327673204804": ["convolution_gpu_bfyx_os_iyx_osv16",111], + "5381578460674280089": ["convolution_gpu_bfyx_direct_10_12_16",0], + "1336940384521633733": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "17423645390621980919": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "14283458015244508428": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5326247361632903583": ["convolution_gpu_bfyx_gemm_like",2], + "8708643228914766202": ["convolution_gpu_bfyx_gemm_like",1], + "11446745541571732900": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9835739612255048978": ["convolution_gpu_bfyx_os_iyx_osv16",377], + "3308770992373192529": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "11992625045241269569": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "7469127846325904854": ["convolution_gpu_bfyx_os_iyx_osv16",230], + "12617625046664709483": ["convolution_gpu_bfyx_os_iyx_osv16",1021], + "2659712601063515059": ["convolution_gpu_winograd_6x3_s1_fused",1], + "17906607354577138153": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "2632535010129224704": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15924583510704449214": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "14421898375873029115": ["convolution_gpu_bfyx_1x1",0], + "10037086825900566930": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "8130920994920685157": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "15101680837342453931": ["convolution_gpu_bfyx_os_iyx_osv16",879], + "14811022197918391667": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11759426200341586247": ["convolution_gpu_bfyx_os_iyx_osv16",121], + "13247725847475539658": ["convolution_gpu_bfyx_1x1",2], + "4285475880886685878": ["convolution_gpu_bfyx_gemm_like",2], + "14487842225000203929": ["convolution_gpu_bfyx_os_iyx_osv16",1113], + "18135307303959376082": ["convolution_gpu_bfyx_gemm_like",0], + "9437794960375526230": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "13224814158106791463": ["convolution_gpu_bfyx_gemm_like",2], + "9232653317479846765": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "5311718276151327830": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9439431829175743345": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16135569134646688251": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "4793007249026943006": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "15160738482264643601": ["convolution_gpu_bfyx_os_iyx_osv16",340], + "15201438563802430490": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "8045367391487213749": ["convolution_gpu_bfyx_1x1",1], + "1089679781525023551": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10415046594066474634": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "12159582810513550491": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18253784177599134876": ["convolution_gpu_bfyx_os_iyx_osv16",563], + "11939914680143672459": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "11239541755868028928": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "5592526760253524303": ["convolution_gpu_bfyx_os_iyx_osv16",426], + "17193614571243427089": ["convolution_gpu_bfyx_gemm_like",2], + "16336482874764861478": 
["convolution_gpu_bfyx_gemm_like",2], + "3388752887767453958": ["convolution_gpu_bfyx_gemm_like",0], + "16081386644309102158": ["convolution_gpu_bfyx_gemm_like",2], + "12675313398314286884": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "12339692995143159283": ["convolution_gpu_bfyx_gemm_like",2], + "3180320769716158201": ["convolution_gpu_bfyx_os_iyx_osv16",977], + "18416908414174464784": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "13932662890258900896": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "8170998059688907013": ["convolution_gpu_bfyx_1x1",2], + "6911215749850066204": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "4362304842016958728": ["convolution_gpu_bfyx_os_iyx_osv16",1067], + "18271689282126907793": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "14821616804286068969": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "7264756313770306662": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "17917978116807564183": ["convolution_gpu_bfyx_gemm_like",2], + "16395067736440127496": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "9780938731831129283": ["convolution_gpu_bfyx_gemm_like",2], + "2008424849669196225": ["convolution_gpu_bfyx_1x1",1], + "12794369485239257709": ["convolution_gpu_bfyx_gemm_like",0], + "2007192658799516915": ["fully_connected_gpu_bf_io_input_spatial",1], + "9131183544020825260": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "13374993751390784382": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "17224181038411430675": ["convolution_gpu_bfyx_os_iyx_osv16",202], + "7398196853452900099": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "10309504812060596568": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "1572991986657256775": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "1345101751956733589": ["convolution_gpu_bfyx_os_iyx_osv16",691], + "14668725050395069435": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "11823205954749139338": ["convolution_gpu_bfyx_gemm_like",2], + "6964383468476265892": ["convolution_gpu_bfyx_1x1",2], + "3199841714087553410": ["convolution_gpu_bfyx_os_iyx_osv16",692], + "631489011812924153": ["convolution_gpu_bfyx_1x1",0], + "669771152920944125": ["convolution_gpu_bfyx_gemm_like",2], + "14152716242882609401": ["convolution_gpu_bfyx_gemm_like",0], + "14902389080201926109": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14810839157236175179": ["convolution_gpu_bfyx_direct_10_12_16",0], + "3362190082518348071": ["convolution_gpu_bfyx_gemm_like",2], + "13723543003759101485": ["convolution_gpu_bfyx_gemm_like",2], + "13642146548740074992": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "11022847760121601465": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "8787816339967963727": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "2530317332900569142": ["convolution_gpu_bfyx_os_iyx_osv16",807], + "9277610800970567810": ["convolution_gpu_bfyx_gemm_like",2], + "7915318733663535312": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "10879218241103462088": ["convolution_gpu_bfyx_gemm_like",2], + "4660288622381620227": ["convolution_gpu_bfyx_os_iyx_osv16",1027], + "1103204698908514224": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17900257435531434807": ["convolution_gpu_bfyx_gemm_like",2], + "8207349115037232863": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "12790788016297794214": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "12672995204641007004": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15486917753097743853": ["convolution_gpu_bfyx_1x1",2], + "8642107585829380438": ["convolution_gpu_bfyx_gemm_like",0], + "16117448559783537844": 
["convolution_gpu_bfyx_os_iyx_osv16",343], + "2307310127637739872": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "4513063773753763458": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "4850497746076450913": ["convolution_gpu_bfyx_gemm_like",0], + "3226193790517362610": ["convolution_gpu_bfyx_1x1",0], + "8650948093564284852": ["convolution_gpu_bfyx_os_iyx_osv16",269], + "13735180250757239202": ["convolution_gpu_bfyx_os_iyx_osv16",559], + "5807196005360653656": ["convolution_gpu_bfyx_gemm_like",2], + "16043683538361975370": ["convolution_gpu_bfyx_gemm_like",2], + "14603590053512154268": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "14045927407431718832": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "11241838709529552265": ["convolution_gpu_bfyx_gemm_like",2], + "6129602738379919488": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8096131027165540886": ["convolution_gpu_bfyx_gemm_like",2], + "4353842547963164546": ["convolution_gpu_bfyx_1x1",2], + "16731107540370927220": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "9748307611165615848": ["convolution_gpu_bfyx_gemm_like",2], + "142270860894725256": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "5179760459095053114": ["convolution_gpu_bfyx_os_iyx_osv16",207], + "1089944493540593798": ["convolution_gpu_bfyx_direct_10_12_16",2], + "190530884420224257": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "1997392406402548974": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "2321148334382088982": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "17854578307286932628": ["convolution_gpu_bfyx_gemm_like",2], + "12478309735214802531": ["convolution_gpu_bfyx_os_iyx_osv16",191], + "17039993918927377002": ["convolution_gpu_bfyx_os_iyx_osv16",806], + "17329287216741045059": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "1107027047188366075": ["convolution_gpu_bfyx_os_iyx_osv16",918], + "13990028451169604107": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2738256633362038820": ["convolution_gpu_bfyx_gemm_like",2], + "14006248791647711759": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "10647227605517025377": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "822162932339827810": ["convolution_gpu_bfyx_os_iyx_osv16",539], + "4492332228252010118": ["convolution_gpu_bfyx_os_iyx_osv16",980], + "861419637283812778": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "17651821953342321913": ["convolution_gpu_bfyx_1x1",2], + "6020017927557041768": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2198278382394812839": ["convolution_gpu_bfyx_os_iyx_osv16",1007], + "3573490922300056520": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "6065819201836017182": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16053585286807864356": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "8285478622349266483": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "11910735867274493498": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "16436006771518788093": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "16293465561256937726": ["convolution_gpu_bfyx_gemm_like",1], + "10437367877444543776": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "4163359403543480821": ["fully_connected_gpu_bf_io_input_spatial",0], + "11305232900158601613": ["convolution_gpu_bfyx_1x1",1], + "276407276027553756": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13761566845514364807": ["convolution_gpu_bfyx_os_iyx_osv16",1043], + "14546281065004619074": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "4135003545872878882": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "9525535670799618110": ["convolution_gpu_bfyx_os_iyx_osv16",646], + 
"5682190700442712936": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "10971070835319242371": ["convolution_gpu_bfyx_os_iyx_osv16",841], + "11055049031355432623": ["convolution_gpu_bfyx_gemm_like",2], + "11318913630213187720": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "10642327923162019888": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2651385050387738902": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6712698149192186833": ["convolution_gpu_bfyx_gemm_like",2], + "1832310305089212990": ["convolution_gpu_bfyx_os_iyx_osv16",740], + "15035800097152337587": ["convolution_gpu_bfyx_gemm_like",2], + "7712831597869354170": ["convolution_gpu_bfyx_direct_10_12_16",0], + "12806934028210472719": ["convolution_gpu_bfyx_os_iyx_osv16",567], + "1074748462756364699": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "6714886136800883594": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "16397733032387984819": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "17252449599613270108": ["convolution_gpu_bfyx_os_iyx_osv16",741], + "17515064188391421150": ["convolution_gpu_bfyx_gemm_like",2], + "2762489653422414995": ["convolution_gpu_bfyx_gemm_like",2], + "5242271874488296527": ["convolution_gpu_bfyx_gemm_like",1], + "3699344686791530101": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2418288192668085805": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10632020369698615114": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "4958222070605478947": ["convolution_gpu_bfyx_os_iyx_osv16",970], + "10136369729388564720": ["convolution_gpu_bfyx_os_iyx_osv16",1114], + "13713501506522022845": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4856470441452830056": ["convolution_gpu_bfyx_gemm_like",2], + "13786357802945430475": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "10084794570892043447": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "11637325834858582585": ["convolution_gpu_bfyx_gemm_like",2], + "5141753233513623264": ["convolution_gpu_bfyx_os_iyx_osv16",816], + "9416186718345824095": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "14614844213016502202": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "13597240991532942069": ["convolution_gpu_bfyx_os_iyx_osv16",1057], + "14558572801374416278": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "2171768477223405739": ["convolution_gpu_bfyx_os_iyx_osv16",933], + "13754540732991287617": ["convolution_gpu_bfyx_os_iyx_osv16",362], + "17833517350994024381": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "8990561333549136048": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "708452703070938673": ["convolution_gpu_bfyx_os_iyx_osv16",713], + "8497468192424557348": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "2572395498687401679": ["convolution_gpu_bfyx_os_iyx_osv16",1092], + "621915374938805401": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "6146876760962332928": ["convolution_gpu_bfyx_gemm_like",2], + "60509335250891515": ["convolution_gpu_bfyx_gemm_like",2], + "4304041922043496030": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "6767245864232675168": ["convolution_gpu_bfyx_gemm_like",2], + "755414184406250882": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8323445733669842657": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10049571207493913006": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "14792528369891965810": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16574710115918192418": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "5349415632630235233": ["convolution_gpu_bfyx_1x1",0], + "4429109491655891299": ["convolution_gpu_bfyx_gemm_like",2], + "2878824076934639346": 
["convolution_gpu_bfyx_os_iyx_osv16",154], + "18393312550272875456": ["convolution_gpu_bfyx_1x1",2], + "10978173291465325823": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "11473442921040533207": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "10171373375072694210": ["convolution_gpu_bfyx_1x1",1], + "2114599010013594942": ["convolution_gpu_bfyx_gemm_like",2], + "6688522645556262131": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "17870874477143985774": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3737552767159920174": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "10751536136794650334": ["convolution_gpu_bfyx_gemm_like",2], + "13800387305792597325": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "16431857516454692096": ["convolution_gpu_bfyx_os_iyx_osv16",185], + "16862145184923128012": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "15082818876354718849": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "17788367809717898285": ["convolution_gpu_bfyx_os_iyx_osv16",366], + "18068050257421269408": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "15914058104244750036": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "11795826875463204296": ["convolution_gpu_bfyx_1x1",0], + "9524303276541517389": ["convolution_gpu_bfyx_gemm_like",2], + "15322019609805777935": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "15649927926091502215": ["convolution_gpu_bfyx_os_iyx_osv16",62], + "15247381586316467097": ["convolution_gpu_bfyx_os_iyx_osv16",1114], + "70580716590540876": ["convolution_gpu_bfyx_gemm_like",2], + "598390166442977699": ["convolution_gpu_bfyx_os_iyx_osv16",540], + "4217179485243909459": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6288489890578212082": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12473600360154597915": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "7242013296950669829": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "10532183096485321729": ["convolution_gpu_bfyx_1x1",1], + "12247991248100147706": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "1452597292381229708": ["convolution_gpu_winograd_6x3_s1_fused",2], + "15555083739490354527": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "9584652777232392944": ["convolution_gpu_bfyx_os_iyx_osv16",655], + "1867337342417952506": ["convolution_gpu_bfyx_os_iyx_osv16",361], + "6942016672941874829": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "9194788897910888066": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "8264178890341675354": ["convolution_gpu_bfyx_os_iyx_osv16",1000], + "244921290040927639": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "5019077257951332016": ["convolution_gpu_bfyx_gemm_like",2], + "16969463538496570528": ["convolution_gpu_bfyx_gemm_like",2], + "1173986078589662704": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "16698547937652264447": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "6362428985273506890": ["convolution_gpu_bfyx_1x1",0], + "17377293745073971167": ["convolution_gpu_winograd_6x3_s1_fused",1], + "15490478608105402679": ["convolution_gpu_bfyx_gemm_like",2], + "11198301748997371475": ["convolution_gpu_bfyx_gemm_like",2], + "3316798708399098230": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "11267742746905371769": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "220326805056361171": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "14420809655798184553": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "3782315919331102574": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "10783981060353445280": ["convolution_gpu_bfyx_os_iyx_osv16",62], + "706049518431331645": ["convolution_gpu_bfyx_gemm_like",2], + "17774424004510360936": 
["convolution_gpu_bfyx_os_iyx_osv16",839], + "18125732229366977468": ["convolution_gpu_winograd_6x3_s1_fused",0], + "14686278683380845546": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "2929190644951986399": ["convolution_gpu_bfyx_gemm_like",0], + "11640225461345567929": ["convolution_gpu_bfyx_os_iyx_osv16",259], + "1954052357826969119": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "8700574100180128776": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "15989894214714907271": ["convolution_gpu_bfyx_gemm_like",2], + "14872992823083730615": ["convolution_gpu_bfyx_gemm_like",2], + "2114232149447438823": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "17636500109629107732": ["convolution_gpu_bfyx_os_iyx_osv16",346], + "8769060267707904998": ["convolution_gpu_winograd_6x3_s1_fused",2], + "10546430708947911124": ["convolution_gpu_bfyx_gemm_like",2], + "12501619443242354860": ["convolution_gpu_bfyx_gemm_like",2], + "11087413527078604815": ["convolution_gpu_bfyx_gemm_like",2], + "3706994659266083979": ["convolution_gpu_bfyx_os_iyx_osv16",182], + "8295126647635181949": ["convolution_gpu_bfyx_gemm_like",2], + "17243576882981097341": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "8701248964531180496": ["convolution_gpu_bfyx_gemm_like",0], + "16723478941106779069": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "17546566148752689536": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "13357365044448426880": ["convolution_gpu_bfyx_1x1",0], + "7148542290597073512": ["convolution_gpu_bfyx_gemm_like",2], + "49948277487706148": ["convolution_gpu_bfyx_1x1",1], + "10576856554114055028": ["convolution_gpu_bfyx_gemm_like",2], + "10309083227104422150": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "3750338655074082587": ["fully_connected_gpu_yxfb_ref",1], + "5951936376654416075": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "4141005390823981166": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10591379189397010097": ["convolution_gpu_bfyx_os_iyx_osv16",988], + "16945184617367657570": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "3723613341885592267": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "17092525789052598917": ["convolution_gpu_bfyx_os_iyx_osv16",122], + "9616636708366808604": ["convolution_gpu_bfyx_gemm_like",1], + "4980217316169616839": ["convolution_gpu_bfyx_1x1",2], + "10019470094545733255": ["convolution_gpu_bfyx_gemm_like",2], + "10196332102593337214": ["convolution_gpu_bfyx_gemm_like",2], + "3499645386058307669": ["convolution_gpu_bfyx_gemm_like",2], + "4792351255949877935": ["convolution_gpu_bfyx_gemm_like",2], + "5585398540591396124": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "18062849937960759210": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "13787436604877398090": ["convolution_gpu_bfyx_os_iyx_osv16",599], + "12914986936318857086": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "7179714714302073459": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "14616969385577243225": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "2108296560864415762": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9426665763007611385": ["convolution_gpu_bfyx_gemm_like",2], + "11645116728396933125": ["convolution_gpu_bfyx_gemm_like",0], + "14681717813022425567": ["convolution_gpu_bfyx_gemm_like",2], + "5308128387928804050": ["convolution_gpu_bfyx_os_iyx_osv16",736], + "6719302427415173754": ["convolution_gpu_bfyx_os_iyx_osv16",838], + "15430549683839591544": ["convolution_gpu_bfyx_os_iyx_osv16",700], + "5214654427283761256": ["convolution_gpu_bfyx_gemm_like",2], + "9065894438656900887": ["convolution_gpu_bfyx_gemm_like",1], + 
"3236003754884728510": ["convolution_gpu_bfyx_os_iyx_osv16",681], + "11597391933877736800": ["convolution_gpu_bfyx_gemm_like",2], + "12151068022697708126": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "16958329690837977102": ["convolution_gpu_bfyx_gemm_like",2], + "3217574161785059951": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "2438374917504708831": ["convolution_gpu_bfyx_gemm_like",2], + "12255528292506999241": ["convolution_gpu_bfyx_gemm_like",1], + "10947686124973711385": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "11147816119060617810": ["convolution_gpu_bfyx_os_iyx_osv16",616], + "18005721959893562716": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "8002233052700666718": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "9428176632140441528": ["convolution_gpu_bfyx_gemm_like",1], + "6131481289104111211": ["convolution_gpu_bfyx_gemm_like",2], + "2728938624042183713": ["convolution_gpu_bfyx_gemm_like",2], + "14213516751025324346": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10729288973933590396": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "14274685812676150168": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "3737576893817599311": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "16441830491664937048": ["convolution_gpu_bfyx_os_iyx_osv16",605], + "8127570953237266335": ["fully_connected_gpu_bf_io_input_spatial",3], + "17011363406405852347": ["convolution_gpu_bfyx_gemm_like",2], + "17009318615658405230": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "1885075753696445410": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11604111639041106489": ["convolution_gpu_bfyx_os_iyx_osv16",270], + "11619548409913646265": ["convolution_gpu_bfyx_direct_10_12_16",1], + "18356980026934328781": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "10416622008071151225": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "3524531620118359828": ["convolution_gpu_bfyx_os_iyx_osv16",902], + "9404953235624894187": ["convolution_gpu_bfyx_os_iyx_osv16",109], + "7000486794832106857": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "12655099960717366198": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3024402899381804809": ["convolution_gpu_bfyx_1x1",2], + "16266491618150971928": ["convolution_gpu_bfyx_os_iyx_osv16",360], + "13296242326766100583": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "1334070221835422461": ["convolution_gpu_bfyx_gemm_like",1], + "15961487889420208188": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "1760391741350091665": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "1704404203639481753": ["convolution_gpu_bfyx_gemm_like",2], + "15193403354218116460": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14906458674793172507": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "1351033666248868977": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "5498839261395459224": ["convolution_gpu_bfyx_gemm_like",1], + "1207026216972160297": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "14719421757340260468": ["convolution_gpu_bfyx_os_iyx_osv16",200], + "10679760989906275129": ["convolution_gpu_bfyx_os_iyx_osv16",433], + "1478419046264331178": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "2344498602308448450": ["convolution_gpu_bfyx_os_iyx_osv16",502], + "9763310312421884308": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "6093575518270471235": ["convolution_gpu_bfyx_os_iyx_osv16",625], + "6673966852801136416": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "7998930863626763670": ["convolution_gpu_bfyx_gemm_like",1], + "13608239208821071914": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "8866716292621164810": 
["convolution_gpu_bfyx_os_iyx_osv16",354], + "5240181393417899912": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "10112032316939871435": ["convolution_gpu_bfyx_os_iyx_osv16",44], + "4818231379191523896": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "3240102173773280414": ["convolution_gpu_bfyx_1x1",0], + "2128612971571865547": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "713121569924250372": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "13809898858049445969": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "17746215841755337461": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13358283026528078900": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "8275277322582733101": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "13569941893504840630": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "5963901433137582265": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6171845068913882721": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "15924916465272239832": ["convolution_gpu_bfyx_gemm_like",2], + "16683169947375504066": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "9173631510896381179": ["convolution_gpu_bfyx_gemm_like",2], + "6458124573210430792": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "16947969669087411530": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "17802514063213000148": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "12589440296742583335": ["convolution_gpu_bfyx_1x1",1], + "18432787283148809023": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "15464327246951632247": ["convolution_gpu_bfyx_os_iyx_osv16",340], + "16014822406751503249": ["convolution_gpu_bfyx_os_iyx_osv16",315], + "10783630257421062891": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "16037141448095945650": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15602863681196390535": ["convolution_gpu_bfyx_os_iyx_osv16",996], + "14973431782875808802": ["convolution_gpu_bfyx_gemm_like",2], + "3032921857841371728": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "11077876432364512822": ["fully_connected_gpu_bf_io_input_spatial",3], + "5240706676373148280": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8444259010311137762": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "5848293219267886434": ["convolution_gpu_bfyx_os_iyx_osv16",954], + "16773645387243701837": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "1779941298820543013": ["convolution_gpu_bfyx_os_iyx_osv16",563], + "16247399911710810038": ["convolution_gpu_bfyx_gemm_like",2], + "12134712464763856064": ["convolution_gpu_winograd_6x3_s1_fused",0], + "1497127399271219422": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "12397280593466519809": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "6104380778870471127": ["convolution_gpu_bfyx_1x1",0], + "3286330985102373533": ["convolution_gpu_bfyx_os_iyx_osv16",739], + "12218337369633748663": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "938222258370511187": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "10308431308942416781": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "6585223640997887253": ["convolution_gpu_bfyx_gemm_like",2], + "3033264172690274208": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "14147460733160099960": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13762042713029963144": ["convolution_gpu_bfyx_direct_10_12_16",0], + "16295660312557315941": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "17742192339816511494": ["convolution_gpu_bfyx_os_iyx_osv16",534], + "17087740929472936216": ["convolution_gpu_bfyx_os_iyx_osv16",482], + "16601702334097258697": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "17791773192152464021": 
["convolution_gpu_bfyx_direct_10_12_16",1], + "16816222375242496370": ["convolution_gpu_winograd_6x3_s1_fused",2], + "14733510474010040334": ["convolution_gpu_bfyx_gemm_like",2], + "12564687330941036772": ["convolution_gpu_bfyx_os_iyx_osv16",643], + "15882969506682501496": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "10471519687597963116": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5485971317082563152": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "4238885454989272754": ["convolution_gpu_bfyx_gemm_like",2], + "14397348576352573007": ["convolution_gpu_bfyx_gemm_like",2], + "4229105529069729944": ["convolution_gpu_bfyx_direct_10_12_16",0], + "16833026567865627676": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "5295693108687178880": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "12962558681443556219": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "9182897385081081193": ["convolution_gpu_winograd_6x3_s1_fused",0], + "1545105800386716684": ["convolution_gpu_bfyx_os_iyx_osv16",205], + "3603187029740446600": ["convolution_gpu_bfyx_gemm_like",2], + "8578747191812631883": ["convolution_gpu_bfyx_os_iyx_osv16",1009], + "5120466856097219243": ["convolution_gpu_bfyx_gemm_like",2], + "15548971488532746290": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10791067159964399241": ["convolution_gpu_bfyx_os_iyx_osv16",1045], + "5097818987523855112": ["convolution_gpu_bfyx_gemm_like",2], + "7958459862276998225": ["convolution_gpu_bfyx_direct_10_12_16",0], + "16884396694505987920": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "15991460001131903561": ["convolution_gpu_bfyx_gemm_like",1], + "13468081302022888489": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "16896833230469488924": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "6777045876155144709": ["convolution_gpu_bfyx_os_iyx_osv16",875], + "3234107167862677811": ["convolution_gpu_bfyx_os_iyx_osv16",587], + "17700958439420868719": ["convolution_gpu_bfyx_direct_10_12_16",0], + "12992061224471212714": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14389915292223442327": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "17832542092610191859": ["convolution_gpu_bfyx_os_iyx_osv16",528], + "69439315851965666": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "2627779045483019709": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16729849855476690294": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "13733327241591630239": ["convolution_gpu_bfyx_os_iyx_osv16",11], + "14985236276429954162": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14050124896329573468": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6090625728451718945": ["convolution_gpu_winograd_6x3_s1_fused",1], + "14057348639391787117": ["convolution_gpu_bfyx_os_iyx_osv16",327], + "2609454334520044465": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "12160764253455777655": ["convolution_gpu_bfyx_os_iyx_osv16",984], + "14458851250685872417": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "6729785110495533200": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "787203599734115483": ["convolution_gpu_bfyx_1x1",0], + "4772696293208603817": ["convolution_gpu_bfyx_gemm_like",2], + "10861525139715322534": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "5093049998173715787": ["convolution_gpu_bfyx_gemm_like",2], + "3122997634505472500": ["convolution_gpu_bfyx_os_iyx_osv16",340], + "423221712829930726": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "17915846724151945664": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "13954144830230671601": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13237050834496100264": 
["convolution_gpu_bfyx_os_iyx_osv16",338], + "16610284927818475574": ["convolution_gpu_bfyx_gemm_like",1], + "18199526506796726885": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11582534256623549131": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "17370051888730874220": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "13621339501067135142": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "13973028408397200796": ["convolution_gpu_bfyx_os_iyx_osv16",431], + "14251848023416168295": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "16446533347502650316": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "153117141968471446": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "689445825453914111": ["convolution_gpu_bfyx_gemm_like",2], + "3830703844770425343": ["convolution_gpu_bfyx_direct_10_12_16",0], + "3571959174116404960": ["convolution_gpu_bfyx_os_iyx_osv16",737], + "11120846960057008937": ["convolution_gpu_bfyx_os_iyx_osv16",516], + "12693511427898130707": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9101018613418825655": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "1451466106918423837": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "1103228955716492167": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12352923639732112511": ["convolution_gpu_bfyx_os_iyx_osv16",75], + "17037416417174266088": ["convolution_gpu_bfyx_os_iyx_osv16",225], + "5629373398445592781": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "8929453032482114162": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "13485300684443803732": ["convolution_gpu_bfyx_os_iyx_osv16",344], + "12976499206227689731": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "14515066741400300669": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "16531824466148265247": ["convolution_gpu_bfyx_os_iyx_osv16",535], + "14746359019867963124": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3012566432840424198": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "17147293671640396193": ["fully_connected_gpu_bs_f_bsv16_af8_vload",0], + "9480653639044390919": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12741457056869452536": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "2969389503332309296": ["convolution_gpu_bfyx_direct_10_12_16",1], + "6509758095668864050": ["convolution_gpu_bfyx_os_iyx_osv16",743], + "6407471972820516685": ["fully_connected_gpu_fb_oi_ref",1], + "18043340998699622388": ["convolution_gpu_bfyx_os_iyx_osv16",138], + "16108759090923335184": ["convolution_gpu_bfyx_gemm_like",2], + "4355933224673863178": ["convolution_gpu_bfyx_gemm_like",0], + "13092232276822302626": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "12011982029561277581": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "5103094815475470596": ["convolution_gpu_bfyx_direct_10_12_16",0], + "16307464696265537356": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "4967444801764057340": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "16783619135298589974": ["convolution_gpu_bfyx_os_iyx_osv16",806], + "6040286126398028933": ["convolution_gpu_winograd_6x3_s1_fused",1], + "7375461241315602473": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "3892679716763161057": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1411786954276574458": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "9263063714383940562": ["convolution_gpu_bfyx_os_iyx_osv16",815], + "13102754309439605192": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "5524218746051008792": ["convolution_gpu_bfyx_os_iyx_osv16",844], + "6928136130626403937": ["convolution_gpu_bfyx_gemm_like",2], + "9761573038170759563": ["convolution_gpu_bfyx_os_iyx_osv16",316], + 
"6334639534663495263": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "7966454753124154534": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "10306542963828398049": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "3963106895592011725": ["convolution_gpu_bfyx_direct_10_12_16",0], + "12107262410635772120": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "929378940515745198": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "5627834277145735283": ["convolution_gpu_bfyx_os_iyx_osv16",83], + "11956435900037329302": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "5389189982064081933": ["convolution_gpu_bfyx_os_iyx_osv16",563], + "9767294641786972359": ["convolution_gpu_bfyx_gemm_like",2], + "15511138074959300404": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "6181651715051152713": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "14763982961176216679": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "14907097142953816744": ["convolution_gpu_bfyx_gemm_like",2], + "13300022131572486202": ["convolution_gpu_bfyx_gemm_like",2], + "6418327009347170687": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "1192279884248226739": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "156456996459945842": ["convolution_gpu_bfyx_os_iyx_osv16",814], + "4165036357594592683": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15897477855246170861": ["convolution_gpu_bfyx_gemm_like",2], + "16789245987103323406": ["convolution_gpu_bfyx_gemm_like",2], + "3448477246688526708": ["convolution_gpu_bfyx_gemm_like",0], + "17977676737774695825": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "5352861363832390974": ["convolution_gpu_bfyx_direct_10_12_16",0], + "875142032423622622": ["convolution_gpu_bfyx_os_iyx_osv16",498], + "16924006268301179157": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "9741607635826869269": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10071449674652717890": ["convolution_gpu_bfyx_os_iyx_osv16",361], + "9299299311101549958": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "17301887391757619741": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "11147573971701279689": ["convolution_gpu_bfyx_os_iyx_osv16",268], + "4809191606466167229": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "3291180926381314705": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "14316077757957132678": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "6109013751635776331": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "7940369586324090841": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "11069983292783104310": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "4481903208484313806": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "4014667229872705228": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "18136135457402651842": ["convolution_gpu_winograd_6x3_s1_fused",0], + "4197617702037834389": ["convolution_gpu_bfyx_os_iyx_osv16",296], + "17222005830854879661": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "4701832665603867798": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "1138439260035360722": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6931953332823066530": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "13472532612464340803": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "2324120381399737261": ["convolution_gpu_bfyx_os_iyx_osv16",267], + "5235375820995365354": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "10995907213890714701": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "9502195532658935521": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "580936360000782237": ["fully_connected_gpu_bf_io_input_spatial",2], + "3106710091841093202": ["convolution_gpu_bfyx_os_iyx_osv16",991], 
+ "12090536142661253835": ["fully_connected_gpu_bf_io_gemm",1], + "14352796912241296357": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "16765994345605657100": ["convolution_gpu_bfyx_1x1",0], + "9475130054420979752": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "8860443174052454332": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "14487682847898298214": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "8712136292276123857": ["convolution_gpu_bfyx_os_iyx_osv16",603], + "4366168099274266975": ["convolution_gpu_bfyx_gemm_like",1], + "654122557966242717": ["convolution_gpu_bfyx_gemm_like",1], + "9967101735808367971": ["convolution_gpu_bfyx_1x1",0], + "18132952464279667664": ["convolution_gpu_bfyx_1x1",2], + "14744368497944610864": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8507854696766492454": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16911450336605071390": ["convolution_gpu_bfyx_1x1",2], + "438528596970898721": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "3036512701943687724": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "13654816209891478730": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "5353552956675518468": ["convolution_gpu_bfyx_os_iyx_osv16",457], + "425744529089575241": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "9274179337770060652": ["convolution_gpu_bfyx_gemm_like",1], + "6193161166790398003": ["convolution_gpu_bfyx_gemm_like",1], + "16312223896859176991": ["convolution_gpu_bfyx_direct_10_12_16",0], + "16286085532892593349": ["convolution_gpu_bfyx_direct_10_12_16",0], + "11718418772370938734": ["convolution_gpu_bfyx_gemm_like",2], + "14363654136811880073": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "7786866732196451977": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "11455518069358829249": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "14171139920084409181": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "5504757952698692953": ["convolution_gpu_bfyx_os_iyx_osv16",1040], + "15548847099740441551": ["convolution_gpu_bfyx_1x1",1], + "4889188980319017094": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "12929981792125924963": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "11857037689248685487": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "15591167992985613695": ["convolution_gpu_bfyx_direct_10_12_16",0], + "13314092088416047551": ["fully_connected_gpu_fb_io_ref",1], + "16800575429414554907": ["convolution_gpu_bfyx_os_iyx_osv16",448], + "7799984350284425885": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "5600807544955072308": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "14221578799010900252": ["convolution_gpu_bfyx_os_iyx_osv16",361], + "13484950419220835364": ["convolution_gpu_bfyx_os_iyx_osv16",714], + "6557428245898292304": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "5157249499936659040": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "5088898934670078153": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1152693503778768433": ["convolution_gpu_bfyx_os_iyx_osv16",963], + "4104562704039821482": ["convolution_gpu_bfyx_1x1",1], + "3159681096461848644": ["convolution_gpu_bfyx_os_iyx_osv16",295], + "9585113116232600562": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "1890739204389692970": ["convolution_gpu_bfyx_os_iyx_osv16",754], + "6856130385095139346": ["convolution_gpu_bfyx_os_iyx_osv16",1124], + "17104611871050967957": ["convolution_gpu_winograd_6x3_s1_fused",1], + "5644068493155655611": ["convolution_gpu_bfyx_gemm_like",2], + "8709632541892447149": ["convolution_gpu_bfyx_gemm_like",2], + "2172121470071868949": ["convolution_gpu_bfyx_gemm_like",2], + 
"16235115911229280717": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "16509472637458153234": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "16820082917500285799": ["convolution_gpu_bfyx_gemm_like",2], + "15329680728165965773": ["convolution_gpu_bfyx_os_iyx_osv16",1092], + "17882819773586674851": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "9987415314864002460": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13972357557211413688": ["convolution_gpu_bfyx_gemm_like",2], + "5940337324384948573": ["convolution_gpu_bfyx_gemm_like",2], + "2552187713769926425": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "17525531790109748810": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "8757900457181374694": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11932770338770247767": ["convolution_gpu_bfyx_os_iyx_osv16",438], + "7162575953766465459": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "5219399418946822456": ["convolution_gpu_bfyx_gemm_like",2], + "16561224775421968533": ["convolution_gpu_bfyx_os_iyx_osv16",759], + "11636129433022017868": ["convolution_gpu_bfyx_os_iyx_osv16",600], + "14559308665571750465": ["convolution_gpu_bfyx_gemm_like",2], + "4499586349553581439": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "17364712285968437405": ["convolution_gpu_bfyx_os_iyx_osv16",1018], + "18103534417093702556": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "14133958262039763609": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "5083163738120585821": ["fully_connected_gpu_fb_oi_ref",0], + "5649082203775427830": ["convolution_gpu_bfyx_gemm_like",2], + "5291011077679733990": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "383721620126444793": ["convolution_gpu_bfyx_gemm_like",2], + "11756881293845417212": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "17994361454416813294": ["convolution_gpu_bfyx_os_iyx_osv16",361], + "17549411807772646930": ["convolution_gpu_bfyx_os_iyx_osv16",691], + "2649192407401044065": ["convolution_gpu_bfyx_gemm_like",0], + "1941341635794709702": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "16108573960501496757": ["convolution_gpu_bfyx_gemm_like",2], + "2721793280965260548": ["convolution_gpu_bfyx_os_iyx_osv16",325], + "12164298124869114517": ["convolution_gpu_bfyx_os_iyx_osv16",216], + "4584970211859494304": ["convolution_gpu_bfyx_direct_10_12_16",2], + "14281201038135286621": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "4264284648458489052": ["convolution_gpu_bfyx_os_iyx_osv16",221], + "6988674007771237080": ["convolution_gpu_bfyx_gemm_like",1], + "11800783548769329949": ["convolution_gpu_bfyx_os_iyx_osv16",693], + "8656468860180713379": ["convolution_gpu_bfyx_gemm_like",1], + "2066731703492755469": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "11772741918108731396": ["convolution_gpu_bfyx_os_iyx_osv16",905], + "10990741293315393791": ["convolution_gpu_bfyx_gemm_like",1], + "15187035463799513424": ["convolution_gpu_bfyx_1x1",1], + "7575634241190730697": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "3067806959725855130": ["convolution_gpu_bfyx_os_iyx_osv16",152], + "12946540633035976364": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "6553736978928374036": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "4862529593282936100": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "9175450649281374948": ["convolution_gpu_bfyx_os_iyx_osv16",860], + "2842103889477438816": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "2173867324489962689": ["convolution_gpu_bfyx_direct_10_12_16",2], + "9988801796928462423": ["convolution_gpu_bfyx_os_iyx_osv16",245], + "3870539490799697188": 
["convolution_gpu_bfyx_os_iyx_osv16",222], + "17854208422879910606": ["convolution_gpu_bfyx_gemm_like",2], + "5077214229434392730": ["convolution_gpu_bfyx_os_iyx_osv16",956], + "8303211644727914658": ["convolution_gpu_bfyx_1x1",1], + "6318228858846223186": ["convolution_gpu_bfyx_1x1",2], + "8614534946699754256": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "15485701086886851362": ["convolution_gpu_bfyx_os_iyx_osv16",182], + "17021925795809437171": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5047419871737940985": ["convolution_gpu_bfyx_direct_10_12_16",0], + "4161612746310931789": ["convolution_gpu_bfyx_gemm_like",2], + "1587501521145162454": ["convolution_gpu_bfyx_gemm_like",2], + "8913950860101596091": ["convolution_gpu_bfyx_os_iyx_osv16",739], + "8856888761246057127": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "481328129206881674": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "10486000767830001094": ["convolution_gpu_bfyx_1x1",1], + "4118073384938355655": ["convolution_gpu_bfyx_os_iyx_osv16",678], + "6981537186704688907": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9728611486592854529": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "789359733867650915": ["convolution_gpu_bfyx_gemm_like",1], + "17264608538692763688": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1653274345637156919": ["convolution_gpu_bfyx_direct_10_12_16",0], + "5801429077171542466": ["convolution_gpu_bfyx_os_iyx_osv16",485], + "9378269524012289175": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "1075027491444288875": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2912858944747613525": ["convolution_gpu_bfyx_os_iyx_osv16",244], + "10930115765550856328": ["convolution_gpu_bfyx_gemm_like",2], + "13119479079474639169": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "632116056424249698": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15109847707903824859": ["convolution_gpu_bfyx_1x1",0], + "5415319660821122528": ["fully_connected_gpu_bf_io_input_spatial",2], + "10702234389482091891": ["convolution_gpu_bfyx_gemm_like",2], + "12415368596357091523": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "17392594284473856393": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "949330876419581703": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "7314288062932060863": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "18377298651236993830": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "4640028527711211109": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "12725675221990905186": ["convolution_gpu_bfyx_gemm_like",2], + "6670327979947471550": ["convolution_gpu_bfyx_os_iyx_osv16",163], + "2702144517025248597": ["convolution_gpu_bfyx_gemm_like",2], + "5581428998642936688": ["convolution_gpu_bfyx_1x1",2], + "6902644989079870993": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "13540002981450186147": ["convolution_gpu_bfyx_os_iyx_osv16",964], + "13093429681061786539": ["convolution_gpu_bfyx_os_iyx_osv16",1117], + "7208008921815475393": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15997754881872769378": ["convolution_gpu_bfyx_os_iyx_osv16",182], + "15417738436777481469": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "2722124265986526212": ["convolution_gpu_bfyx_os_iyx_osv16",1116], + "11992353959766718397": ["convolution_gpu_bfyx_os_iyx_osv16",551], + "3759057398165607194": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "15689502054035168040": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "11913020016435860608": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "4916769804113823482": ["convolution_gpu_bfyx_1x1",0], + 
"15154700439767512396": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "17975017633455909321": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "18121198117765854866": ["convolution_gpu_bfyx_1x1",2], + "9714764457768279762": ["convolution_gpu_bfyx_os_iyx_osv16",746], + "11025471731438443683": ["convolution_gpu_bfyx_os_iyx_osv16",536], + "11052732052072367261": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "13026555349791486777": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "17281202179589913619": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "7104756264011682902": ["convolution_gpu_bfyx_gemm_like",2], + "12700372241799686527": ["convolution_gpu_bfyx_gemm_like",2], + "16190949264253468961": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "16579057939215877904": ["convolution_gpu_bfyx_os_iyx_osv16",311], + "17764033613416389758": ["convolution_gpu_bfyx_gemm_like",2], + "13046322179198317310": ["convolution_gpu_bfyx_os_iyx_osv16",338], + "15354185859262170540": ["convolution_gpu_bfyx_gemm_like",1], + "12676167240795292217": ["convolution_gpu_bfyx_gemm_like",0], + "16383540667048742064": ["convolution_gpu_bfyx_gemm_like",2], + "287386909600391846": ["convolution_gpu_bfyx_direct_10_12_16",0], + "956022649859563080": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "6062246008880097669": ["fully_connected_gpu_bf_io_input_spatial",1], + "2534408579674556441": ["convolution_gpu_bfyx_os_iyx_osv16",966], + "2730604806511016352": ["convolution_gpu_bfyx_os_iyx_osv16",154], + "17806712457019493207": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "15914342421266687768": ["convolution_gpu_bfyx_gemm_like",2], + "13455881643467418059": ["convolution_gpu_bfyx_gemm_like",2], + "11104393974242049153": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "1573498199681662714": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "17421991623849671076": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4232250144427804891": ["fully_connected_gpu_bf_io_input_spatial",0], + "4492673409319122180": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "7460672405409009037": ["convolution_gpu_bfyx_os_iyx_osv16",652], + "16541722316343690197": ["convolution_gpu_bfyx_os_iyx_osv16",517], + "13404888565084206853": ["convolution_gpu_bfyx_os_iyx_osv16",703], + "8788703258318141635": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "8751016391945753900": ["convolution_gpu_bfyx_direct_10_12_16",0], + "3159147743553063163": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2497756607567197523": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "8069537351442302814": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1819720745131968914": ["convolution_gpu_bfyx_gemm_like",2], + "14389719202147508599": ["convolution_gpu_bfyx_os_iyx_osv16",715], + "428659495445490820": ["convolution_gpu_bfyx_os_iyx_osv16",175], + "4161001033681779582": ["convolution_gpu_bfyx_os_iyx_osv16",716], + "1120455113299469776": ["fully_connected_gpu_bs_f_bsv16_af8_vload",0], + "15688186132508213638": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "13078401519973360182": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13161997040644039778": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15399245700982979379": ["convolution_gpu_bfyx_os_iyx_osv16",597], + "16105073808368936420": ["convolution_gpu_bfyx_gemm_like",2], + "12026482841341343242": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "10173283505468233128": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "12380856644683171627": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "16986610822918634530": ["convolution_gpu_bfyx_1x1",1], + 
"1003101267609305257": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "9305861997313663528": ["convolution_gpu_bfyx_gemm_like",2], + "3134489458855347772": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "14217181622713951411": ["convolution_gpu_bfyx_gemm_like",2], + "6620782733027313312": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "593712935037568960": ["convolution_gpu_bfyx_os_iyx_osv16",1068], + "14912119584313592912": ["convolution_gpu_bfyx_gemm_like",2], + "17026284168840448378": ["convolution_gpu_bfyx_direct_10_12_16",1], + "1541754036637209097": ["convolution_gpu_bfyx_os_iyx_osv16",739], + "1202292109713947702": ["convolution_gpu_bfyx_gemm_like",2], + "12259844988981080505": ["convolution_gpu_bfyx_gemm_like",1], + "11528310408333718862": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "1471837664358450291": ["convolution_gpu_bfyx_gemm_like",2], + "178353385245384751": ["convolution_gpu_bfyx_gemm_like",2], + "14667209474639064623": ["convolution_gpu_bfyx_direct_10_12_16",0], + "14091610802555875119": ["convolution_gpu_bfyx_gemm_like",2], + "8651641584737798174": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "17024388383581997032": ["convolution_gpu_bfyx_gemm_like",2], + "6340128090694375876": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "4274801141127703532": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3154539627593235077": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "6571438978296387721": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "2305461098719675735": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17705807503894740726": ["convolution_gpu_bfyx_gemm_like",0], + "1920070013712913772": ["convolution_gpu_bfyx_os_iyx_osv16",157], + "17824431042110985323": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "4290840152278060614": ["convolution_gpu_bfyx_gemm_like",2], + "13248567106128518549": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "11622925573287101001": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6788311046557489996": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "17599396373608265826": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "16469788155263456039": ["convolution_gpu_bfyx_gemm_like",2], + "7585785802379042424": ["convolution_gpu_bfyx_1x1",1], + "4678607855896512523": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13312514874803986753": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "14381420852659789698": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "1889171157980977747": ["convolution_gpu_bfyx_gemm_like",2], + "7881579844586294503": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "15069906408448814772": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "6949539207944972855": ["convolution_gpu_bfyx_gemm_like",2], + "10522649794540845800": ["convolution_gpu_bfyx_os_iyx_osv16",364], + "15117880293418979489": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "16706244336960642883": ["convolution_gpu_bfyx_gemm_like",1], + "5912303851874077576": ["convolution_gpu_bfyx_os_iyx_osv16",737], + "10055549084854766170": ["convolution_gpu_bfyx_os_iyx_osv16",973], + "5648658688155716974": ["convolution_gpu_bfyx_1x1",1], + "12523676912856063091": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17446505012657609153": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "739676584505475609": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "13478984039708550410": ["convolution_gpu_bfyx_os_iyx_osv16",1054], + "13320675959188615441": ["convolution_gpu_bfyx_gemm_like",1], + "5336120047683197088": ["convolution_gpu_bfyx_gemm_like",2], + "1082574490068006980": ["convolution_gpu_bfyx_os_iyx_osv16",367], + 
"4553409514380460123": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "13124342334495538095": ["convolution_gpu_bfyx_os_iyx_osv16",639], + "216603198215625772": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "13902214851539825156": ["convolution_gpu_bfyx_gemm_like",2], + "15675968397825708285": ["convolution_gpu_bfyx_os_iyx_osv16",928], + "40704767167309552": ["convolution_gpu_bfyx_os_iyx_osv16",206], + "969746749329671447": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "17015791782274123780": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "11443268857010762276": ["convolution_gpu_bfyx_gemm_like",2], + "17370158297470557151": ["convolution_gpu_bfyx_1x1",2], + "7084646429975006971": ["convolution_gpu_bfyx_1x1",0], + "5673972310424776040": ["convolution_gpu_bfyx_gemm_like",2], + "9148379585489720669": ["convolution_gpu_bfyx_os_iyx_osv16",862], + "15031155621982459860": ["convolution_gpu_bfyx_gemm_like",2], + "3509487327001107638": ["convolution_gpu_bfyx_gemm_like",2], + "12081835728078383819": ["fully_connected_gpu_bf_io_input_spatial",1], + "12947341728489226671": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "4408600136502382976": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "570683988452622223": ["convolution_gpu_bfyx_os_iyx_osv16",804], + "6142707387281700290": ["convolution_gpu_bfyx_gemm_like",1], + "15209909241815414156": ["convolution_gpu_bfyx_os_iyx_osv16",183], + "529543453251381109": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5132761922124425835": ["convolution_gpu_bfyx_os_iyx_osv16",1022], + "11292995457386147494": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "10157866834809927320": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "9383182168277796969": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "5277400567128489977": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "12962552332511702682": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "18221867262301937903": ["convolution_gpu_bfyx_1x1",2], + "16294825599850364701": ["convolution_gpu_bfyx_os_iyx_osv16",1005], + "14043770215999952932": ["convolution_gpu_bfyx_gemm_like",2], + "17419874083634480896": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "15315327794058441258": ["convolution_gpu_bfyx_gemm_like",2], + "16567638487719493784": ["convolution_gpu_bfyx_os_iyx_osv16",721], + "16094174852600023296": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "7454366978268164047": ["convolution_gpu_bfyx_gemm_like",2], + "11834683513280095384": ["convolution_gpu_winograd_6x3_s1_fused",1], + "10302338806536775954": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "15962137123591591534": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "6149673627320838019": ["fully_connected_gpu_bf_io_input_spatial",0], + "1470933384474984858": ["convolution_gpu_bfyx_1x1",0], + "3662747857062156477": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "4815047491742617397": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "10887835418423052188": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7878605163588288309": ["convolution_gpu_bfyx_direct_10_12_16",0], + "18136765667969393174": ["convolution_gpu_bfyx_direct_10_12_16",1], + "937159502066696999": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "12022152681602871455": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "16985912104363932350": ["convolution_gpu_bfyx_os_iyx_osv16",505], + "3087801652564627458": ["convolution_gpu_bfyx_os_iyx_osv16",814], + "15643135666029727865": ["convolution_gpu_bfyx_gemm_like",2], + "9530116228032101908": ["convolution_gpu_bfyx_1x1",0], + "15235409162483701027": ["convolution_gpu_bfyx_os_iyx_osv16",983], + 
"4476928353532757380": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "970768445746568749": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "16833854122884184025": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "856877003890134554": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "75742659105146536": ["convolution_gpu_bfyx_gemm_like",1], + "6008613375871089139": ["convolution_gpu_bfyx_os_iyx_osv16",796], + "54019631544204590": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2832268621630415376": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "4615708568396290002": ["convolution_gpu_bfyx_1x1",2], + "7843498978148810586": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "15295951849706930711": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "8463615810239412362": ["convolution_gpu_bfyx_1x1",2], + "435888248913413834": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "1082586642383386489": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "6726099352298108756": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13676654389512816868": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "2162882863309264684": ["convolution_gpu_bfyx_os_iyx_osv16",313], + "6450532136308941035": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "5680236635030250712": ["convolution_gpu_bfyx_1x1",0], + "15411474884532403722": ["convolution_gpu_bfyx_os_iyx_osv16",404], + "13176385389367548697": ["convolution_gpu_bfyx_gemm_like",2], + "11661208196482963286": ["convolution_gpu_bfyx_os_iyx_osv16",85], + "3752171257634205726": ["convolution_gpu_bfyx_os_iyx_osv16",54], + "1435153323458789173": ["convolution_gpu_bfyx_gemm_like",2], + "14841539539334726292": ["convolution_gpu_bfyx_os_iyx_osv16",182], + "17015328096102652908": ["convolution_gpu_bfyx_gemm_like",2], + "8061914949376516780": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "17442105631503326136": ["convolution_gpu_bfyx_os_iyx_osv16",628], + "14599780481362761532": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14947798627499698329": ["convolution_gpu_bfyx_gemm_like",2], + "2526832080529662683": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "787363431787954804": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "13855438905855887272": ["convolution_gpu_bfyx_os_iyx_osv16",517], + "6204725118764552662": ["convolution_gpu_bfyx_gemm_like",0], + "15650839696475698676": ["convolution_gpu_bfyx_os_iyx_osv16",82], + "12277470820821378855": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "8528750110601691390": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "14462438074931673266": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "7862815466573236157": ["convolution_gpu_bfyx_os_iyx_osv16",875], + "10702465758376061967": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5572956736535433608": ["convolution_gpu_bfyx_1x1",1], + "18017913952946745878": ["convolution_gpu_bfyx_gemm_like",2], + "4840004190985490064": ["convolution_gpu_bfyx_gemm_like",2], + "9641089659148164809": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4160656836528944651": ["convolution_gpu_bfyx_os_iyx_osv16",750], + "1982176363226079588": ["convolution_gpu_bfyx_os_iyx_osv16",612], + "1938086876393565238": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "16609136488331186895": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "17035903590837750750": ["convolution_gpu_bfyx_direct_10_12_16",0], + "4228437925117070319": ["convolution_gpu_bfyx_1x1",0], + "6820284286806022849": ["convolution_gpu_bfyx_gemm_like",2], + "14923692894655929923": ["fully_connected_gpu_bf_io_gemm",1], + "14749947225382670869": ["convolution_gpu_bfyx_os_iyx_osv16",576], + 
"2816353973187452604": ["convolution_gpu_bfyx_gemm_like",2], + "8655315308767111198": ["convolution_gpu_bfyx_1x1",1], + "3503893875515897267": ["convolution_gpu_bfyx_os_iyx_osv16",1099], + "18426893729833771809": ["convolution_gpu_bfyx_1x1",0], + "7974670633697926450": ["convolution_gpu_bfyx_1x1",0], + "10178951466584845110": ["convolution_gpu_bfyx_os_iyx_osv16",1114], + "13883044928774243663": ["convolution_gpu_bfyx_os_iyx_osv16",714], + "10085059621136526248": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4086556132337751931": ["convolution_gpu_bfyx_gemm_like",1], + "11627532066884923848": ["convolution_gpu_bfyx_1x1",0], + "15963038745470172423": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "1076005730007872492": ["convolution_gpu_bfyx_os_iyx_osv16",81], + "7431849514656037251": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "12348135936862667024": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "1622880009460832832": ["convolution_gpu_bfyx_os_iyx_osv16",680], + "4569338575782832784": ["convolution_gpu_bfyx_gemm_like",1], + "4226968857681929488": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "7605139219344415117": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16230621843665445228": ["convolution_gpu_bfyx_gemm_like",2], + "5770286476124511234": ["convolution_gpu_bfyx_gemm_like",2], + "14930789530046665855": ["convolution_gpu_bfyx_os_iyx_osv16",1071], + "5953754321266570854": ["convolution_gpu_bfyx_gemm_like",2], + "3854114166348568039": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "16011429608661242565": ["convolution_gpu_bfyx_os_iyx_osv16",981], + "13327653786981478088": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "17948637243158994878": ["convolution_gpu_bfyx_gemm_like",0], + "13680926356824317761": ["convolution_gpu_bfyx_os_iyx_osv16",429], + "18174857480705846286": ["convolution_gpu_bfyx_os_iyx_osv16",579], + "9942099207256025216": ["convolution_gpu_bfyx_gemm_like",2], + "2598267743388306204": ["convolution_gpu_bfyx_gemm_like",2], + "9275303306340702111": ["convolution_gpu_bfyx_gemm_like",2], + "13702692566238948173": ["convolution_gpu_bfyx_gemm_like",1], + "14532519639619315651": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "10295330953350618042": ["convolution_gpu_bfyx_os_iyx_osv16",0], + "5079055505117153635": ["convolution_gpu_bfyx_os_iyx_osv16",1044], + "2438261005924916746": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "14962768577232034246": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "9631481972809246378": ["convolution_gpu_bfyx_gemm_like",0], + "15048584393463312977": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "12725647706191463348": ["convolution_gpu_bfyx_gemm_like",2], + "12228610148087508521": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "10861769381993948050": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "3337625924046561031": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "7113777272518482528": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "6882621854468565774": ["convolution_gpu_bfyx_os_iyx_osv16",952], + "18034648276860485300": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "15602218079503030465": ["convolution_gpu_bfyx_gemm_like",2], + "16865879032845300007": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "4239133538073498792": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12972798847556569913": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "8203550467004532364": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "17422822627612865758": ["convolution_gpu_winograd_6x3_s1_fused",0], + "9999553425206328238": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "4773123925616969670": 
["convolution_gpu_bfyx_direct_10_12_16",2], + "9191832520273617003": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7638626850074132214": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "7667898603371717971": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5312140481706133684": ["convolution_gpu_bfyx_os_iyx_osv16",806], + "7532088618116521936": ["convolution_gpu_bfyx_gemm_like",1], + "8728178019712933221": ["convolution_gpu_bfyx_os_iyx_osv16",1054], + "11919129623429545762": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15110359240685619357": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "13191096881934434519": ["convolution_gpu_bfyx_os_iyx_osv16",740], + "10267260789603562117": ["convolution_gpu_bfyx_os_iyx_osv16",1000], + "15225354446874994535": ["convolution_gpu_bfyx_os_iyx_osv16",559], + "5352061583962489055": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "77073286362822723": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "522181557896569275": ["convolution_gpu_bfyx_os_iyx_osv16",340], + "10308175009371219583": ["convolution_gpu_bfyx_os_iyx_osv16",254], + "8943913562339525413": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "11461581290174106570": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "15320845027635796583": ["convolution_gpu_bfyx_gemm_like",2], + "11655994466278963438": ["convolution_gpu_bfyx_direct_10_12_16",1], + "503369896500284129": ["convolution_gpu_bfyx_1x1",1], + "15308578014507211237": ["convolution_gpu_bfyx_os_iyx_osv16",222], + "3216877571075556066": ["convolution_gpu_bfyx_os_iyx_osv16",557], + "991586070509079617": ["convolution_gpu_bfyx_gemm_like",2], + "2265784112305305260": ["convolution_gpu_bfyx_direct_10_12_16",1], + "8329846097322076175": ["convolution_gpu_bfyx_os_iyx_osv16",204], + "2129726780118554358": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11587239927319376658": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "15824189967727245909": ["convolution_gpu_bfyx_gemm_like",2], + "4191326605459754690": ["convolution_gpu_bfyx_direct_10_12_16",1], + "12319073009094248232": ["convolution_gpu_bfyx_os_iyx_osv16",236], + "13369603621524676979": ["convolution_gpu_bfyx_direct_10_12_16",2], + "18419183012101393192": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "3215659303601163167": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "659846949368492111": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14759179293743468995": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "16582132711225619740": ["convolution_gpu_bfyx_os_iyx_osv16",200], + "4915831715914920982": ["convolution_gpu_bfyx_gemm_like",0], + "17856816245251319111": ["convolution_gpu_bfyx_os_iyx_osv16",938], + "10727592780669452048": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "15899192375330393731": ["convolution_gpu_bfyx_os_iyx_osv16",53], + "2012181953284568566": ["convolution_gpu_bfyx_os_iyx_osv16",749], + "3341302541468955849": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11465965972527519631": ["convolution_gpu_bfyx_direct_10_12_16",0], + "10706267011822108376": ["convolution_gpu_bfyx_1x1",0], + "2622434279674583815": ["convolution_gpu_bfyx_gemm_like",1], + "8464582977975377118": ["convolution_gpu_winograd_6x3_s1_fused",0], + "3621930417735246405": ["convolution_gpu_bfyx_os_iyx_osv16",260], + "4561874206785244358": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "7802311886554362782": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "6438522646185979880": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "10109431802089940590": ["convolution_gpu_bfyx_gemm_like",2], + "941829593638869991": 
["convolution_gpu_bfyx_os_iyx_osv16",580], + "15129834325410878425": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2451712485584835395": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5779388310240896974": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15106614232165315070": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "11690533591656807605": ["convolution_gpu_bfyx_gemm_like",2], + "3041612155708729812": ["convolution_gpu_bfyx_os_iyx_osv16",1120], + "8059328623525062913": ["convolution_gpu_bfyx_gemm_like",2], + "7171904645566467208": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "15901675909820977223": ["convolution_gpu_bfyx_os_iyx_osv16",80], + "16489624657475712467": ["convolution_gpu_bfyx_os_iyx_osv16",800], + "14359530849521980269": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "4999505377862312410": ["fully_connected_gpu_bf_io_input_spatial",2], + "10414903047695486119": ["convolution_gpu_bfyx_os_iyx_osv16",675], + "5495776091407365966": ["convolution_gpu_bfyx_gemm_like",2], + "7372956570616880244": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "9700808806849459216": ["convolution_gpu_bfyx_1x1",2], + "3792945601873900927": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "8140094412609934765": ["convolution_gpu_bfyx_os_iyx_osv16",1090], + "11929531534620071758": ["convolution_gpu_bfyx_os_iyx_osv16",988], + "11955992313739654625": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "6370189612027110022": ["convolution_gpu_bfyx_gemm_like",2], + "18092842590142527927": ["convolution_gpu_bfyx_os_iyx_osv16",355], + "2903605246599054308": ["convolution_gpu_bfyx_os_iyx_osv16",250], + "12040626513219974957": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "14312549767853703411": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "13324157125165576832": ["convolution_gpu_bfyx_os_iyx_osv16",676], + "291868903926685441": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "952318454591754214": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "12757611260347801001": ["convolution_gpu_bfyx_os_iyx_osv16",312], + "8615481457481938667": ["convolution_gpu_bfyx_os_iyx_osv16",50], + "6307939332939714967": ["convolution_gpu_bfyx_1x1",1], + "12229574562535756991": ["convolution_gpu_bfyx_gemm_like",2], + "4282198629458668761": ["convolution_gpu_bfyx_gemm_like",1], + "17050675313067213312": ["convolution_gpu_bfyx_os_iyx_osv16",1044], + "10628725059172743408": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "9514210061704584354": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "7223801044761006523": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "938848188161536107": ["convolution_gpu_bfyx_1x1",1], + "10218763091060511457": ["convolution_gpu_bfyx_os_iyx_osv16",101], + "12355112948013108181": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12353956380178079089": ["convolution_gpu_bfyx_direct_10_12_16",1], + "13951717514084457087": ["convolution_gpu_bfyx_os_iyx_osv16",709], + "4444730303823507621": ["convolution_gpu_bfyx_gemm_like",2], + "7866128397931438774": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "6423785822515265784": ["convolution_gpu_bfyx_gemm_like",2], + "5495063314176654751": ["convolution_gpu_bfyx_gemm_like",0], + "1172103288112689821": ["convolution_gpu_bfyx_os_iyx_osv16",563], + "14431607479949498164": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "7732899312577293959": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "16504962609450876148": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "11868551452004726281": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "12987636957813312667": 
["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15114370307779942381": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "10869005786136023160": ["convolution_gpu_bfyx_direct_10_12_16",0], + "11706446082856895571": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "4062706195708729345": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "7243917162812988891": ["convolution_gpu_bfyx_gemm_like",2], + "4652136280940317116": ["convolution_gpu_bfyx_gemm_like",0], + "13503608041359512": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "5601435819039968726": ["convolution_gpu_winograd_6x3_s1_fused",1], + "10292243973236220688": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "8025053805734757314": ["convolution_gpu_bfyx_direct_10_12_16",0], + "2891736961665476908": ["convolution_gpu_bfyx_os_iyx_osv16",427], + "15161053469199826008": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9372916528346260712": ["convolution_gpu_bfyx_gemm_like",2], + "4892959859293355837": ["convolution_gpu_bfyx_gemm_like",0], + "9485825829394109934": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "10128390168715530898": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "9642229389394495047": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "6343396486660315308": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3477539135137665170": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "17658152048177750315": ["convolution_gpu_bfyx_os_iyx_osv16",1119], + "15993427814066246646": ["convolution_gpu_bfyx_os_iyx_osv16",315], + "13820498543284008286": ["convolution_gpu_bfyx_os_iyx_osv16",727], + "487214150851213303": ["convolution_gpu_bfyx_gemm_like",2], + "15078590909693331731": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4447065688824381344": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "13644681270630373984": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "4381329435655511217": ["convolution_gpu_bfyx_gemm_like",0], + "1364546124782880196": ["convolution_gpu_bfyx_os_iyx_osv16",714], + "15450609897480659306": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "10237524128771958432": ["convolution_gpu_bfyx_gemm_like",2], + "8108843303778211282": ["convolution_gpu_bfyx_os_iyx_osv16",1006], + "7662200927459001757": ["convolution_gpu_winograd_6x3_s1_fused",2], + "6139574161497189424": ["convolution_gpu_bfyx_direct_10_12_16",2], + "1643241486250690844": ["convolution_gpu_bfyx_os_iyx_osv16",338], + "5749536453225343663": ["convolution_gpu_bfyx_os_iyx_osv16",714], + "4806571630436601566": ["fully_connected_gpu_bf_io_input_spatial",2], + "8241070786700614317": ["convolution_gpu_bfyx_os_iyx_osv16",1077], + "5287076386757143976": ["convolution_gpu_bfyx_direct_10_12_16",1], + "10968768803038046390": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5796500397424307442": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "10893432143734884603": ["convolution_gpu_bfyx_gemm_like",1], + "3265415000818832667": ["convolution_gpu_bfyx_direct_10_12_16",2], + "16768797136991242472": ["convolution_gpu_bfyx_os_iyx_osv16",972], + "8873614802459592665": ["convolution_gpu_bfyx_os_iyx_osv16",363], + "8270591002934311024": ["convolution_gpu_bfyx_1x1",0], + "17951403431757222177": ["fully_connected_gpu_bs_f_bsv16_af8_vload",2], + "13851240591038949807": ["convolution_gpu_bfyx_gemm_like",1], + "4137755981477177003": ["convolution_gpu_bfyx_os_iyx_osv16",692], + "5589350202160007768": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "6302958994152837045": ["convolution_gpu_bfyx_os_iyx_osv16",126], + "6084775920382972735": ["convolution_gpu_bfyx_os_iyx_osv16",1018], + "6087091876057515304": 
["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "13646974121952099172": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "13926122593957480821": ["convolution_gpu_winograd_6x3_s1_fused",1], + "905780459938651623": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "5381354625969068789": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9399994156762372761": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "3070859615622845671": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "7545013298074733778": ["convolution_gpu_bfyx_gemm_like",0], + "4282668574670785584": ["convolution_gpu_bfyx_gemm_like",2], + "9529614587861271730": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7977195117668583981": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "13439896617880328331": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "743941460026466526": ["convolution_gpu_bfyx_os_iyx_osv16",601], + "12954154886708228545": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "15293727142789007900": ["convolution_gpu_bfyx_os_iyx_osv16",333], + "8434794604559592624": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "8107447526839063293": ["convolution_gpu_bfyx_os_iyx_osv16",124], + "13403161389559730": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "16767392067294252396": ["convolution_gpu_bfyx_gemm_like",2], + "775538461106687677": ["fully_connected_gpu_fb_oi_ref",0], + "18210370419559876426": ["convolution_gpu_bfyx_os_iyx_osv16",589], + "10894058425957901202": ["convolution_gpu_bfyx_1x1",2], + "15579919505002150556": ["convolution_gpu_bfyx_gemm_like",2], + "14999920879568237166": ["convolution_gpu_bfyx_1x1",0], + "8747430148550634190": ["convolution_gpu_bfyx_gemm_like",2], + "14131851237755716991": ["convolution_gpu_bfyx_gemm_like",0], + "12198263593657033426": ["convolution_gpu_bfyx_os_iyx_osv16",637], + "12985650543127289023": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "2929715823970060874": ["convolution_gpu_bfyx_os_iyx_osv16",1103], + "12894240573737168362": ["convolution_gpu_bfyx_gemm_like",2], + "973966345068677905": ["convolution_gpu_bfyx_1x1",2], + "7465681710653503161": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "4664983769199548480": ["convolution_gpu_bfyx_1x1",1], + "9939234037869927090": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "7606728651572102823": ["convolution_gpu_bfyx_os_iyx_osv16",215], + "4084026445911476156": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "5170245731599664670": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "12308359047798183133": ["convolution_gpu_bfyx_gemm_like",0], + "9069334144391048686": ["convolution_gpu_bfyx_os_iyx_osv16",856], + "13418701036204748812": ["convolution_gpu_bfyx_direct_10_12_16",0], + "13320828013530046693": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "14483314305369207554": ["convolution_gpu_bfyx_1x1",1], + "14435120971846098308": ["convolution_gpu_bfyx_os_iyx_osv16",567], + "10492056481694320580": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "3007637520820789085": ["convolution_gpu_bfyx_gemm_like",2], + "3889519976910355277": ["fully_connected_gpu_bf_io_input_spatial",1], + "15829095120243431195": ["convolution_gpu_bfyx_os_iyx_osv16",1072], + "4456004887590847716": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "8079376692609682448": ["convolution_gpu_bfyx_os_iyx_osv16",716], + "9043982883185435219": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "3003526572122876385": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "12024143207855886580": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "17174919737114915467": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10811837819834149164": 
["convolution_gpu_bfyx_gemm_like",2], + "11583985978586657985": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "1900375942069325499": ["convolution_gpu_bfyx_1x1",2], + "7474639594232203854": ["convolution_gpu_bfyx_gemm_like",0], + "15489746763312425915": ["convolution_gpu_bfyx_gemm_like",2], + "15628121900226431719": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "5912451559447635837": ["convolution_gpu_bfyx_os_iyx_osv16",519], + "5221320470007950766": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "6263019986730305851": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "7881187047171099732": ["convolution_gpu_bfyx_gemm_like",0], + "2452226948562393335": ["convolution_gpu_bfyx_os_iyx_osv16",427], + "1838534101161814609": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5843679089588930933": ["convolution_gpu_bfyx_os_iyx_osv16",156], + "10412588668458621135": ["convolution_gpu_bfyx_os_iyx_osv16",13], + "2669822154816760632": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "517997325935712670": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "11828175723996627443": ["convolution_gpu_bfyx_os_iyx_osv16",975], + "4550028191070279999": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5762290464889692462": ["convolution_gpu_bfyx_os_iyx_osv16",974], + "5393510569127725391": ["convolution_gpu_bfyx_os_iyx_osv16",970], + "17525564757769958678": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "15488340031228619748": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "2040762223425679479": ["fully_connected_gpu_bf_io_input_spatial",2], + "1760690277175249985": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "11092828091552833150": ["convolution_gpu_bfyx_os_iyx_osv16",874], + "10110395703775498948": ["convolution_gpu_bfyx_os_iyx_osv16",765], + "3216793152416217495": ["convolution_gpu_bfyx_gemm_like",2], + "9454512817077883797": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "13251091004269229867": ["convolution_gpu_bfyx_direct_10_12_16",2], + "13938466156916423478": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "16347412180100581330": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5643908654122573882": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10090036431487700311": ["convolution_gpu_bfyx_os_iyx_osv16",1065], + "738850098651678143": ["convolution_gpu_bfyx_os_iyx_osv16",889], + "2554991397391195611": ["convolution_gpu_bfyx_gemm_like",2], + "2204178900998688268": ["convolution_gpu_bfyx_gemm_like",2], + "8671491767142900139": ["convolution_gpu_bfyx_gemm_like",0], + "11850332373794932468": ["convolution_gpu_bfyx_os_iyx_osv16",978], + "3831261590121101287": ["convolution_gpu_bfyx_os_iyx_osv16",1003], + "4099859307693687554": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "1040650352205493707": ["convolution_gpu_bfyx_os_iyx_osv16",219], + "11583017348580874022": ["convolution_gpu_bfyx_gemm_like",2], + "17907223570737272640": ["convolution_gpu_bfyx_os_iyx_osv16",384], + "5898740235388207878": ["convolution_gpu_bfyx_1x1",2], + "17634966178519099371": ["convolution_gpu_bfyx_1x1",1], + "1701609125136907870": ["convolution_gpu_bfyx_direct_10_12_16",0], + "12194037100109755112": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "10424278617647597641": ["convolution_gpu_bfyx_gemm_like",2], + "6996376303337512293": ["convolution_gpu_bfyx_os_iyx_osv16",123], + "4640696923527766618": ["convolution_gpu_bfyx_os_iyx_osv16",228], + "11185156002426041243": ["convolution_gpu_bfyx_os_iyx_osv16",713], + "12892693137085610062": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "7024495439434892956": ["convolution_gpu_bfyx_os_iyx_osv16",1040], + 
"18180655791734632264": ["convolution_gpu_bfyx_gemm_like",2], + "4644580321919256401": ["convolution_gpu_bfyx_os_iyx_osv16",691], + "8100595788531468781": ["convolution_gpu_bfyx_os_iyx_osv16",383], + "181006047500375768": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "14571022040013651253": ["convolution_gpu_bfyx_gemm_like",2], + "5115134711994944288": ["convolution_gpu_bfyx_gemm_like",1], + "16474284418841532356": ["convolution_gpu_bfyx_gemm_like",2], + "14762599606783897222": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "17106086048442658788": ["convolution_gpu_bfyx_gemm_like",2], + "17216583849049249733": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "13132804928635689780": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "7575675354187625951": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "3105425187506203551": ["convolution_gpu_bfyx_1x1",0], + "1698321314111848001": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "6664482192233202590": ["convolution_gpu_bfyx_gemm_like",2], + "12545558125736154584": ["convolution_gpu_bfyx_os_iyx_osv16",556], + "16559140502701231107": ["convolution_gpu_bfyx_direct_10_12_16",0], + "15823825508128158158": ["convolution_gpu_bfyx_gemm_like",2], + "8526484907799590618": ["convolution_gpu_bfyx_gemm_like",2], + "17825280904760131680": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14054116974002669018": ["convolution_gpu_bfyx_1x1",2], + "15197248015210313435": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16075006181495932250": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6669808855737023569": ["convolution_gpu_bfyx_os_iyx_osv16",1114], + "17025268985366223779": ["convolution_gpu_bfyx_os_iyx_osv16",224], + "649203303142950236": ["convolution_gpu_bfyx_os_iyx_osv16",461], + "13800760323805415740": ["convolution_gpu_bfyx_gemm_like",2], + "6195916781434462809": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11359409533744011242": ["convolution_gpu_bfyx_gemm_like",2], + "18109284647478027063": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5582896843095691256": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3332334993503432420": ["convolution_gpu_bfyx_os_iyx_osv16",716], + "5994204139128667921": ["convolution_gpu_bfyx_os_iyx_osv16",958], + "9759380701896779097": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "6233612563637601101": ["convolution_gpu_bfyx_os_iyx_osv16",324], + "12675840135830047968": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4992668316921598993": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "11086699387784339943": ["convolution_gpu_bfyx_gemm_like",2], + "3349519148124496343": ["fully_connected_gpu_bf_io_input_spatial",1], + "2128376438627103433": ["convolution_gpu_bfyx_gemm_like",2], + "52089503050497755": ["convolution_gpu_bfyx_os_iyx_osv16",638], + "16936366288366370882": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "7132328255408635227": ["convolution_gpu_bfyx_direct_10_12_16",2], + "2862999234347597091": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "14835309921389262864": ["convolution_gpu_bfyx_1x1",0], + "11499219760597131534": ["convolution_gpu_bfyx_os_iyx_osv16",323], + "3432296808755992670": ["convolution_gpu_bfyx_gemm_like",1], + "12822126914959112382": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "913496537924971856": ["convolution_gpu_bfyx_os_iyx_osv16",611], + "8402692278765063674": ["convolution_gpu_bfyx_os_iyx_osv16",887], + "3221469860582147955": ["convolution_gpu_bfyx_gemm_like",2], + "12773693193167844110": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "16491532291908469567": ["convolution_gpu_bfyx_os_iyx_osv16",328], + 
"13328449155966085543": ["convolution_gpu_bfyx_os_iyx_osv16",359], + "17515573322312447679": ["convolution_gpu_bfyx_os_iyx_osv16",1053], + "6355395905401306995": ["convolution_gpu_bfyx_gemm_like",2], + "628191607060767879": ["convolution_gpu_bfyx_os_iyx_osv16",95], + "10554266898346470422": ["convolution_gpu_bfyx_direct_10_12_16",0], + "14335074487552883436": ["convolution_gpu_bfyx_gemm_like",1], + "10294185397756053636": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "13814086981499638596": ["convolution_gpu_bfyx_os_iyx_osv16",1098], + "15979956159651515122": ["convolution_gpu_bfyx_gemm_like",2], + "3109104171383198425": ["convolution_gpu_winograd_6x3_s1_fused",1], + "541817615957967731": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "2809950092498355574": ["convolution_gpu_bfyx_os_iyx_osv16",299], + "2294318010381635693": ["convolution_gpu_bfyx_gemm_like",2], + "15847413004526420496": ["convolution_gpu_bfyx_os_iyx_osv16",310], + "12992194515157698316": ["convolution_gpu_bfyx_os_iyx_osv16",717], + "16364494883229084045": ["convolution_gpu_bfyx_os_iyx_osv16",311], + "5584432943673435454": ["convolution_gpu_bfyx_os_iyx_osv16",1067], + "11862259122805366807": ["fully_connected_gpu_bs_f_bsv16_af8_vload",0], + "3643250372952944907": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "16293101831324587788": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1051506168926530904": ["fully_connected_gpu_bf_io_input_spatial",0], + "6656593119788274992": ["convolution_gpu_bfyx_os_iyx_osv16",1090], + "12946531140050029900": ["convolution_gpu_bfyx_os_iyx_osv16",337], + "18150429561058646714": ["convolution_gpu_bfyx_gemm_like",0], + "15677717057398875599": ["convolution_gpu_bfyx_gemm_like",2], + "18259656768460999562": ["convolution_gpu_bfyx_os_iyx_osv16",339], + "14103112843209793966": ["convolution_gpu_bfyx_os_iyx_osv16",54], + "11149782181562145291": ["convolution_gpu_bfyx_gemm_like",2], + "11999246609107242706": ["convolution_gpu_bfyx_gemm_like",2], + "14540578324750869319": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3930314908786112883": ["convolution_gpu_bfyx_gemm_like",2], + "2571882179292959757": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5172712078329324967": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "17829047941256922307": ["convolution_gpu_bfyx_os_iyx_osv16",873], + "12515465135362865565": ["convolution_gpu_bfyx_os_iyx_osv16",583], + "8195881973746570408": ["convolution_gpu_bfyx_direct_10_12_16",0], + "9133263538092913983": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8906588133431586825": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4672441137336208890": ["convolution_gpu_bfyx_gemm_like",2], + "18140951659547259039": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "3220280315905987373": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "10328182165125764988": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "17738299860390552088": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5582450255753679095": ["convolution_gpu_bfyx_1x1",1], + "3192332625020432602": ["convolution_gpu_bfyx_os_iyx_osv16",625], + "9062774198518904260": ["convolution_gpu_bfyx_gemm_like",2], + "5497751772699578150": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15277856047844308598": ["convolution_gpu_bfyx_gemm_like",2], + "16913004986170202203": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "3392693938352572136": ["convolution_gpu_bfyx_gemm_like",2], + "3120553928584920777": ["convolution_gpu_bfyx_os_iyx_osv16",356], + "15052577143485630617": ["convolution_gpu_bfyx_1x1",0], + "14808895254077106198": 
["convolution_gpu_bfyx_gemm_like",2], + "17285815901490707654": ["convolution_gpu_winograd_6x3_s1_fused",2], + "9553032671453999824": ["convolution_gpu_bfyx_os_iyx_osv16",186], + "4800587664660105589": ["fully_connected_gpu_bf_io_input_spatial",2], + "8048617952947915835": ["convolution_gpu_bfyx_gemm_like",2], + "4560479630843098090": ["convolution_gpu_bfyx_gemm_like",2], + "8527193566719173253": ["convolution_gpu_bfyx_gemm_like",0], + "548663565933738403": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "13071545223094862275": ["convolution_gpu_bfyx_os_iyx_osv16",590], + "1569043950563130463": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6214194654733781771": ["convolution_gpu_bfyx_direct_10_12_16",2], + "59739211822469868": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",1], + "11901740241052104941": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "380316849107383484": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "15374625876485618845": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "11825293922127550847": ["convolution_gpu_bfyx_gemm_like",2], + "2797723586312707948": ["convolution_gpu_bfyx_os_iyx_osv16",745], + "3759515057574218101": ["convolution_gpu_bfyx_gemm_like",2], + "17889864541794448203": ["convolution_gpu_bfyx_1x1",2], + "11428599290755097395": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "3635446784873718932": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "14038261392627717712": ["convolution_gpu_bfyx_os_iyx_osv16",581], + "11559360678008060513": ["convolution_gpu_bfyx_os_iyx_osv16",80], + "2525260242689556544": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "8479958930889587809": ["fully_connected_gpu_yxfb_ref",1], + "3116068331849795558": ["convolution_gpu_bfyx_gemm_like",2], + "852092858392507925": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "16463823433924519300": ["convolution_gpu_bfyx_os_iyx_osv16",1116], + "7353563160591978243": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "8451212914744825089": ["convolution_gpu_bfyx_os_iyx_osv16",738], + "15857087373591747006": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "712165731154577189": ["convolution_gpu_bfyx_os_iyx_osv16",606], + "1652781065871883392": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "17006655627343469372": ["convolution_gpu_bfyx_direct_10_12_16",1], + "3036808833459559381": ["convolution_gpu_bfyx_direct_10_12_16",0], + "10670103699537731664": ["convolution_gpu_bfyx_gemm_like",2], + "11164519756679631743": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "14491949194619001237": ["convolution_gpu_bfyx_os_iyx_osv16",209], + "14174888981602932979": ["convolution_gpu_bfyx_os_iyx_osv16",460], + "578703329577922869": ["convolution_gpu_bfyx_os_iyx_osv16",1028], + "10000618285883395700": ["convolution_gpu_bfyx_os_iyx_osv16",1079], + "16706121580364790904": ["convolution_gpu_bfyx_gemm_like",2], + "15381833359831622179": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "12493863403516600413": ["convolution_gpu_bfyx_os_iyx_osv16",263], + "12935563359569230797": ["convolution_gpu_bfyx_os_iyx_osv16",588], + "7000524935770116969": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "1532263118203058517": ["convolution_gpu_bfyx_os_iyx_osv16",267], + "18077281411861416889": ["convolution_gpu_bfyx_os_iyx_osv16",1046], + "1485662490111767875": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "12990527753120735255": ["convolution_gpu_bfyx_gemm_like",2], + "8306337702797456793": ["convolution_gpu_bfyx_gemm_like",2], + "11872464450773754851": ["fully_connected_gpu_bs_f_bsv8_af8_vload",2], + "7025975403069487257": 
["convolution_gpu_bfyx_os_iyx_osv16",590], + "877436308867220589": ["convolution_gpu_bfyx_gemm_like",2], + "3797957937905580811": ["convolution_gpu_bfyx_gemm_like",2], + "10384537928514123040": ["convolution_gpu_bfyx_os_iyx_osv16",1113], + "9447458159095730492": ["convolution_gpu_bfyx_gemm_like",2], + "16475247464223458061": ["convolution_gpu_bfyx_gemm_like",0], + "18173314625562011976": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11670430946096342056": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "5040730152867713388": ["convolution_gpu_bfyx_gemm_like",2], + "584086621952390547": ["convolution_gpu_bfyx_gemm_like",2], + "16025442470600124062": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "5941852872160795604": ["convolution_gpu_bfyx_gemm_like",0], + "4356817283284529593": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12667014405537239093": ["convolution_gpu_bfyx_os_iyx_osv16",1093], + "3689722043202617487": ["convolution_gpu_bfyx_os_iyx_osv16",330], + "1473214668483422172": ["convolution_gpu_bfyx_gemm_like",2], + "9803492989444302959": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "15669490019428002270": ["convolution_gpu_bfyx_os_iyx_osv16",1092], + "7271236108345900406": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "12609361477548272638": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "10025839973092358719": ["convolution_gpu_bfyx_os_iyx_osv16",326], + "7211355951470869591": ["convolution_gpu_bfyx_direct_10_12_16",0], + "13308187548669026714": ["convolution_gpu_bfyx_1x1",1], + "14695781272831602408": ["convolution_gpu_bfyx_os_iyx_osv16",632], + "17490188677223978661": ["convolution_gpu_bfyx_gemm_like",2], + "12553441041059632729": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "8500148569566077929": ["convolution_gpu_bfyx_os_iyx_osv16",616], + "998876398773540321": ["convolution_gpu_bfyx_1x1",2], + "4428101657497677982": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11931568365395665142": ["convolution_gpu_bfyx_gemm_like",2], + "9280431727790048190": ["convolution_gpu_bfyx_1x1",0], + "7875272450497189442": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "10899110544832584656": ["convolution_gpu_bfyx_os_iyx_osv16",1082], + "5419775002149092646": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "14385185911482960528": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "157805434489791310": ["convolution_gpu_bfyx_os_iyx_osv16",1030], + "9707630588260222630": ["convolution_gpu_bfyx_os_iyx_osv16",321], + "17542414935564676110": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "11649407835105973949": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "16218339663410630711": ["convolution_gpu_bfyx_gemm_like",2], + "12641170321047008726": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "5020788604681810984": ["convolution_gpu_bfyx_direct_10_12_16",0], + "13970935346154374605": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "4544242784357021697": ["convolution_gpu_bfyx_gemm_like",2], + "4112696777811320312": ["convolution_gpu_bfyx_os_iyx_osv16",591], + "2355214244972870639": ["convolution_gpu_bfyx_os_iyx_osv16",530], + "2345023488044002149": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "3179874645565098825": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7869916853707978306": ["convolution_gpu_bfyx_direct_10_12_16",0], + "3480732841490521799": ["convolution_gpu_bfyx_os_iyx_osv16",128], + "18424400171776141118": ["convolution_gpu_bfyx_gemm_like",2], + "13454265023861566476": ["convolution_gpu_bfyx_gemm_like",0], + "9695024256541464964": ["convolution_gpu_bfyx_os_iyx_osv16",261], + 
"12788968383428254917": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3526580286148537369": ["convolution_gpu_bfyx_gemm_like",2], + "6071668124835539929": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "4754967381316623440": ["convolution_gpu_bfyx_gemm_like",2], + "6025872155179042054": ["convolution_gpu_bfyx_os_iyx_osv16",340], + "4342360467977736802": ["convolution_gpu_bfyx_gemm_like",2], + "16758962840329202004": ["convolution_gpu_bfyx_direct_10_12_16",0], + "18251360413872841969": ["convolution_gpu_bfyx_os_iyx_osv16",669], + "1361159591875955678": ["convolution_gpu_bfyx_os_iyx_osv16",737], + "2613462626256090659": ["fully_connected_gpu_bs_f_bsv16_af8_vload",0], + "11031569203645035546": ["convolution_gpu_bfyx_os_iyx_osv16",729], + "9429695343610239088": ["convolution_gpu_bfyx_os_iyx_osv16",905], + "14289082888174784976": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "5933743119393822386": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "10730222715353420212": ["convolution_gpu_bfyx_os_iyx_osv16",706], + "11951606039079763598": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "13893789954946953427": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "1663285216972929652": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "17477062954520561609": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "8253823502854784432": ["convolution_gpu_bfyx_os_iyx_osv16",340], + "6323026044750482867": ["convolution_gpu_bfyx_os_iyx_osv16",1040], + "3141886504884887200": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "8490260671996115530": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "16033512206711124104": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "13961773444580398856": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "10387844339156517393": ["convolution_gpu_bfyx_1x1",2], + "7603872175048237237": ["convolution_gpu_bfyx_1x1",2], + "11724225282274130518": ["convolution_gpu_bfyx_os_iyx_osv16",714], + "16352331970945217438": ["convolution_gpu_bfyx_os_iyx_osv16",736], + "5864250949922222051": ["convolution_gpu_bfyx_os_iyx_osv16",319], + "12988961529988078346": ["convolution_gpu_bfyx_os_iyx_osv16",595], + "18386376129938707290": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16071723603031305677": ["convolution_gpu_bfyx_gemm_like",2], + "4437258459981739942": ["convolution_gpu_bfyx_os_iyx_osv16",1047], + "13933912937625580405": ["fully_connected_gpu_bf_io_input_spatial",2], + "5853697372844744672": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "5570311824197099845": ["convolution_gpu_winograd_6x3_s1_fused",1], + "7689320135952025041": ["convolution_gpu_bfyx_gemm_like",0], + "6491244517639245276": ["convolution_gpu_bfyx_os_iyx_osv16",747], + "17564338309805484464": ["convolution_gpu_bfyx_gemm_like",2], + "4766071144928072260": ["convolution_gpu_bfyx_os_iyx_osv16",182], + "9091110033424983286": ["convolution_gpu_bfyx_os_iyx_osv16",322], + "9354818521586974021": ["convolution_gpu_bfyx_gemm_like",2], + "12985942652866621579": ["fully_connected_gpu_bs_f_bsv8_af8_vload",1], + "10967218651864700933": ["convolution_gpu_bfyx_os_iyx_osv16",364], + "7334966010680206302": ["convolution_gpu_bfyx_gemm_like",2], + "17947818179123182001": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "13190888313721073437": ["convolution_gpu_bfyx_os_iyx_osv16",1101], + "15078262396281327048": ["convolution_gpu_bfyx_gemm_like",1], + "7947870656736319919": ["convolution_gpu_bfyx_os_iyx_osv16",702], + "9090828337597312855": ["convolution_gpu_bfyx_gemm_like",2], + "3150231129728961455": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "1008476023750261156": 
["convolution_gpu_bfyx_1x1",1], + "11528417522960871233": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7121708962074176240": ["convolution_gpu_bfyx_1x1",0], + "17101789600628162503": ["convolution_gpu_bfyx_direct_10_12_16",2], + "5893940382830835820": ["convolution_gpu_bfyx_os_iyx_osv16",729], + "583303098958523195": ["convolution_gpu_bfyx_os_iyx_osv16",212], + "14813178380338948912": ["convolution_gpu_bfyx_os_iyx_osv16",907], + "7349880498513046830": ["convolution_gpu_bfyx_1x1",2], + "13115589642140732066": ["convolution_gpu_bfyx_os_iyx_osv16",240], + "11744368351982723504": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "7394217382008802567": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "9454954846682513038": ["convolution_gpu_bfyx_direct_10_12_16",0], + "16683089431066989909": ["convolution_gpu_bfyx_gemm_like",2], + "6290317420155851465": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "11948858355027908365": ["convolution_gpu_bfyx_direct_10_12_16",2], + "6114241186364821679": ["convolution_gpu_bfyx_gemm_like",2], + "2937907409658060025": ["convolution_gpu_bfyx_os_iyx_osv16",201], + "17025182465337728023": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "4084106758501882407": ["fully_connected_gpu_bf_io_input_spatial",2], + "3985659568982275663": ["convolution_gpu_bfyx_os_iyx_osv16",1123], + "8655883535274781128": ["convolution_gpu_bfyx_gemm_like",2], + "12461575861709234385": ["convolution_gpu_bfyx_gemm_like",2], + "11768117585574496387": ["convolution_gpu_bfyx_os_iyx_osv16",301], + "9438739171104456179": ["convolution_gpu_bfyx_os_iyx_osv16",955], + "1742897526168249500": ["convolution_gpu_bfyx_gemm_like",2], + "8792202318168046223": ["convolution_gpu_bfyx_direct_10_12_16",2], + "11490143853656040028": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "4491380839102267034": ["convolution_gpu_bfyx_gemm_like",2], + "2967481531952454828": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "10256831975351722184": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "3788462090984291082": ["convolution_gpu_bfyx_os_iyx_osv16",1084], + "8106738346643994005": ["convolution_gpu_bfyx_gemm_like",1], + "9500850790449116723": ["convolution_gpu_bfyx_os_iyx_osv16",758], + "10979362792894404338": ["convolution_gpu_bfyx_gemm_like",0], + "3141773224039276177": ["convolution_gpu_bfyx_1x1",1], + "16588325081458426169": ["convolution_gpu_bfyx_gemm_like",0], + "17647962002015093887": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "12782932626966309185": ["convolution_gpu_bfyx_os_iyx_osv16",213], + "1706927777850488363": ["convolution_gpu_bfyx_os_iyx_osv16",1127], + "1372939511728986224": ["convolution_gpu_bfyx_direct_10_12_16",0], + "17877776363798202236": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "9056038338958199256": ["convolution_gpu_bfyx_os_iyx_osv16",707], + "7708321360699824256": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15713964605078748923": ["convolution_gpu_bfyx_gemm_like",2], + "18172711677056449158": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "1218323229202187514": ["convolution_gpu_bfyx_gemm_like",2], + "3102816736961785641": ["convolution_gpu_bfyx_os_iyx_osv16",992], + "16863960779539003201": ["convolution_gpu_bfyx_os_iyx_osv16",555], + "4073467095502162430": ["convolution_gpu_bfyx_os_iyx_osv16",255], + "5688478347124565305": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "8921636651939679647": ["convolution_gpu_bfyx_1x1",2], + "388828310152538138": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "10292349730148518173": ["convolution_gpu_bfyx_gemm_like",2], + "13708979487306970634": 
["convolution_gpu_bfyx_os_iyx_osv16",257], + "4196367396954155354": ["convolution_gpu_bfyx_direct_10_12_16",2], + "7649413902932043811": ["convolution_gpu_bfyx_gemm_like",2], + "3926585856863002495": ["convolution_gpu_bfyx_os_iyx_osv16",227], + "8501145642605270365": ["convolution_gpu_bfyx_gemm_like",2], + "13170441257780067955": ["convolution_gpu_bfyx_os_iyx_osv16",1104], + "2983038203471784211": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "12480527132372884168": ["convolution_gpu_bfyx_1x1",2], + "17640725195881101275": ["convolution_gpu_bfyx_gemm_like",2], + "11716771904412649891": ["convolution_gpu_bfyx_os_iyx_osv16",438], + "1436052878894538927": ["convolution_gpu_bfyx_os_iyx_osv16",604], + "7918742312252115870": ["convolution_gpu_bfyx_os_iyx_osv16",279], + "1626430741965136732": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "16267682394077585279": ["convolution_gpu_bfyx_os_iyx_osv16",538], + "13709111882513486557": ["convolution_gpu_bfyx_os_iyx_osv16",999], + "11469881811044037340": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "9099720270958987421": ["convolution_gpu_bfyx_1x1",1], + "4983880246908724272": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "9657324846330221372": ["convolution_gpu_bfyx_1x1",2], + "8860815977851486767": ["convolution_gpu_bfyx_os_iyx_osv16",40], + "16426179645101678763": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "16206791915939407806": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "1033385936344875354": ["convolution_gpu_bfyx_gemm_like",2], + "16744011463988595802": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "11060822686394981344": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "4897448054295474302": ["convolution_gpu_bfyx_gemm_like",1], + "12031180482028822765": ["convolution_gpu_bfyx_gemm_like",1], + "16789135236017252073": ["convolution_gpu_bfyx_gemm_like",2], + "6942622405269419082": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "6181308879301978465": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "15860915170591763391": ["convolution_gpu_bfyx_os_iyx_osv16",1087], + "7447163906170805189": ["convolution_gpu_bfyx_os_iyx_osv16",1065], + "6312971928547466668": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "5524215233998361104": ["convolution_gpu_winograd_6x3_s1_fused",0], + "8032685176029570383": ["convolution_gpu_bfyx_direct_10_12_16",0], + "9947449295659685973": ["convolution_gpu_bfyx_gemm_like",0], + "12937333118472722002": ["convolution_gpu_bfyx_gemm_like",2], + "10023279637210292010": ["convolution_gpu_bfyx_os_iyx_osv16",1115], + "13183380647506951324": ["convolution_gpu_bfyx_gemm_like",1], + "18203935818408469865": ["convolution_gpu_bfyx_os_iyx_osv16",653], + "2623687018437195679": ["convolution_gpu_bfyx_direct_10_12_16",0], + "17723621158215826108": ["convolution_gpu_bfyx_os_iyx_osv16",1113], + "3273748387141431306": ["convolution_gpu_bfyx_os_iyx_osv16",93], + "18094205332383644037": ["convolution_gpu_bfyx_os_iyx_osv16",181], + "16547425454653232058": ["convolution_gpu_bfyx_direct_10_12_16",1], + "2857337999074313592": ["convolution_gpu_bfyx_os_iyx_osv16",1083], + "654821507679356726": ["convolution_gpu_bfyx_os_iyx_osv16",957], + "15026219694198820614": ["convolution_gpu_bfyx_direct_10_12_16",1], + "4533786844080178561": ["convolution_gpu_bfyx_os_iyx_osv16",1106], + "5440983284868981549": ["convolution_gpu_bfyx_gemm_like",2], + "8532217744217419503": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "6522575549211855712": ["convolution_gpu_bfyx_os_iyx_osv16",691], + "15675903059949404837": ["convolution_gpu_bfyx_1x1",0], + "8567667881970262923": 
["convolution_gpu_bfyx_os_iyx_osv16",634], + "7322472892320910654": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "13139625572508441980": ["convolution_gpu_bfyx_os_iyx_osv16",334], + "8965747921518186477": ["convolution_gpu_bfyx_os_iyx_osv16",419], + "13210604117940125947": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "4505008254511324231": ["convolution_gpu_bfyx_os_iyx_osv16",744], + "10635659193402005820": ["convolution_gpu_bfyx_os_iyx_osv16",737], + "10917498758625273194": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12581879452540858313": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "4082229510324076196": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3037042229494600258": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "8036474422877454869": ["convolution_gpu_bfyx_direct_10_12_16",2], + "15011504472108164173": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "603883331897298932": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "3017411837779243878": ["convolution_gpu_bfyx_gemm_like",2], + "10398572248321217585": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "10747988576436391912": ["convolution_gpu_bfyx_os_iyx_osv16",1011], + "8791285622784082122": ["convolution_gpu_bfyx_os_iyx_osv16",558], + "2041212737963974230": ["convolution_gpu_bfyx_gemm_like",2], + "916389941321470163": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "6580334406272192111": ["fully_connected_gpu_fb_io_ref",1], + "14343008518525689150": ["convolution_gpu_bfyx_1x1",0], + "7472330881076141262": ["convolution_gpu_bfyx_gemm_like",2], + "13575423234109624706": ["fully_connected_gpu_bs_f_bsv16_af8_vload",1], + "9954050478761346921": ["convolution_gpu_bfyx_os_iyx_osv16",610], + "7072606962946873975": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "12408889192918919210": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "4622514167765722873": ["convolution_gpu_bfyx_os_iyx_osv16",15], + "13184662326021747000": ["convolution_gpu_bfyx_os_iyx_osv16",577], + "5835634465164771899": ["convolution_gpu_bfyx_os_iyx_osv16",257], + "15778834188130183853": ["convolution_gpu_bfyx_os_iyx_osv16",935], + "994182747184593564": ["convolution_gpu_winograd_6x3_s1_fused",0], + "11207257238719531888": ["convolution_gpu_bfyx_gemm_like",2], + "9692654253261175490": ["convolution_gpu_bfyx_os_iyx_osv16",335], + "5629670679897666607": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "5762878778443755104": ["fully_connected_gpu_bs_f_bsv8_af8_vload",0], + "2150326211917340956": ["convolution_gpu_bfyx_gemm_like",2], + "939718260623752240": ["convolution_gpu_bfyx_gemm_like",1], + "11459784003592366395": ["convolution_gpu_bfyx_direct_10_12_16",1], + "7581174843529024536": ["convolution_gpu_bfyx_os_iyx_osv16",635], + "15879172437519876393": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1201692134690347847": ["convolution_gpu_bfyx_os_iyx_osv16",1075], + "15916505622570323098": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "10135458965276110244": ["convolution_gpu_bfyx_1x1",0], + "11820789223587555410": ["convolution_gpu_bfyx_1x1",0], + "15466940145773097237": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "9839670675413379092": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15392077168521832549": ["convolution_gpu_bfyx_os_iyx_osv16",710], + "296142385116663420": ["convolution_gpu_bfyx_os_iyx_osv16",374], + "14258499419905714808": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "15859493313686060349": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "17444003685761357480": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "5840254078917931433": 
["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "4991419288164762786": ["convolution_gpu_bfyx_os_iyx_osv16",584], + "7104309382120208659": ["convolution_gpu_bfyx_gemm_like",2], + "2477849395789783501": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "5331173521406046122": ["convolution_gpu_bfyx_os_iyx_osv16",646], + "14553856088069405595": ["convolution_gpu_bfyx_os_iyx_osv16",1121], + "13379165253894817165": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "4871907623235871050": ["convolution_gpu_bfyx_os_iyx_osv16",837], + "9440117898128288296": ["convolution_gpu_bfyx_os_iyx_osv16",1010], + "11632275875447013409": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "4865102850562917067": ["convolution_gpu_bfyx_os_iyx_osv16",854], + "386749666417295495": ["convolution_gpu_bfyx_direct_10_12_16",1], + "11834361584875491425": ["convolution_gpu_bfyx_1x1",0], + "1497560475414454618": ["convolution_gpu_bfyx_gemm_like",2], + "3746573775462003750": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16264774056719724826": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15799159401545270696": ["convolution_gpu_bfyx_direct_10_12_16",2], + "3599823735065658574": ["convolution_gpu_bfyx_os_iyx_osv16",959], + "10536316961655703500": ["convolution_gpu_bfyx_os_iyx_osv16",691], + "13447028922679236865": ["convolution_gpu_bfyx_direct_10_12_16",0], + "6133592828563353516": ["convolution_gpu_bfyx_os_iyx_osv16",310], + "9452470718398027950": ["convolution_gpu_bfyx_os_iyx_osv16",1031], + "13512863534076172940": ["convolution_gpu_bfyx_gemm_like",2], + "12978370505631031751": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "1434535531617424039": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "16986358655784856534": ["convolution_gpu_bfyx_gemm_like",2], + "15661322183507404821": ["convolution_gpu_bfyx_direct_10_12_16",0], + "7941729567451949422": ["convolution_gpu_bfyx_os_iyx_osv16",315], + "10488269059469838160": ["convolution_gpu_bfyx_os_iyx_osv16",908], + "12696412964119109465": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "12776081190690731910": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "14994322266840011040": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "11275109735493317886": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "11806105193035393795": ["convolution_gpu_bfyx_gemm_like",2], + "13131740479277027362": ["fully_connected_gpu_bs_f_bsv16_b1",1], + "1996860183441418841": ["convolution_gpu_bfyx_direct_10_12_16",1], + "706370730287471796": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "4006884370026272807": ["convolution_gpu_bfyx_gemm_like",2], + "5519535335798045279": ["convolution_gpu_bfyx_gemm_like",2], + "8141428150264829362": ["convolution_gpu_bfyx_os_iyx_osv16",248], + "4803370483104261655": ["convolution_gpu_bfyx_gemm_like",2], + "9269175963143039426": ["convolution_gpu_bfyx_os_iyx_osv16",742], + "15178921033274918199": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "192209423643075326": ["convolution_gpu_bfyx_gemm_like",2], + "7570346182940928159": ["convolution_gpu_bfyx_gemm_like",0], + "1251525426317284548": ["convolution_gpu_bfyx_os_iyx_osv16",23], + "1509728225855233852": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "12643423612381102003": ["convolution_gpu_bfyx_os_iyx_osv16",830], + "8540111719936129376": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "5429130923188159806": ["convolution_gpu_bfyx_os_iyx_osv16",477], + "9849272539053219052": ["convolution_gpu_bfyx_os_iyx_osv16",965], + "5522698342845820411": ["convolution_gpu_bfyx_os_iyx_osv16",580], + "3691705516240577130": 
["convolution_gpu_bfyx_direct_10_12_16",1], + "17010172246526353957": ["convolution_gpu_bfyx_1x1",2], + "9928406318940388716": ["convolution_gpu_bfyx_os_iyx_osv16",1111], + "9999955037598579164": ["convolution_gpu_bfyx_os_iyx_osv16",314], + "6403698142681887543": ["convolution_gpu_bfyx_gemm_like",1], + "5795073619189010837": ["convolution_gpu_winograd_6x3_s1_fused",2], + "15118142492742177336": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "3895088069642140043": ["convolution_gpu_bfyx_os_iyx_osv16",976], + "14447191095937730964": ["convolution_gpu_bfyx_os_iyx_osv16",329], + "2124033349728954551": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "16522364268583242080": ["convolution_gpu_bfyx_gemm_like",2], + "4239415134522959352": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "5658664813683907476": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "13738760763969959522": ["convolution_gpu_bfyx_gemm_like",0], + "272730229972987861": ["convolution_gpu_bfyx_os_iyx_osv16",734], + "4237276338897143680": ["convolution_gpu_bfyx_os_iyx_osv16",353], + "17961702508543961900": ["convolution_gpu_bfyx_os_iyx_osv16",1067], + "16152775342222431281": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "16425665058951535484": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "10787747981914307179": ["convolution_gpu_bfyx_1x1",2], + "11703557271443535142": ["convolution_gpu_bfyx_os_iyx_osv16",356], + "8640150341228170279": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "2096779676054335057": ["convolution_gpu_bfyx_gemm_like",2], + "11975047184326016230": ["convolution_gpu_bfyx_gemm_like",1], + "8482147530539941792": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "4734389463002799056": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "4129722446574108695": ["convolution_gpu_bfyx_1x1",0], + "9795194069954915563": ["convolution_gpu_bfyx_gemm_like",2], + "16681690088928624738": ["convolution_gpu_bfyx_os_iyx_osv16",691], + "16567486018945740036": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "6664432489777052771": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "8006738296385794413": ["convolution_gpu_bfyx_os_iyx_osv16",1076], + "12809199739984715013": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "9366201112659847392": ["convolution_gpu_bfyx_os_iyx_osv16",364], + "577844026691991089": ["convolution_gpu_bfyx_direct_10_12_16",2], + "8866736221671835567": ["convolution_gpu_bfyx_os_iyx_osv16",53], + "13839116996827687373": ["convolution_gpu_bfyx_gemm_like",0], + "5095827462645341808": ["convolution_gpu_bfyx_direct_10_12_16",0], + "54975980454651672": ["convolution_gpu_bfyx_os_iyx_osv16",1081], + "18277685132620834972": ["convolution_gpu_bfyx_os_iyx_osv16",159], + "4914435717288687793": ["convolution_gpu_bfyx_1x1",2], + "6584960721513702502": ["convolution_gpu_bfyx_gemm_like",2], + "12393385058735194260": ["convolution_gpu_bfyx_os_iyx_osv16",983], + "4465781406991476376": ["convolution_gpu_bfyx_os_iyx_osv16",598], + "4135068756462147853": ["convolution_gpu_bfyx_direct_10_12_16",2], + "10808909442136736629": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "6904130543085920483": ["convolution_gpu_bfyx_os_iyx_osv16",953], + "11450378244355788918": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "8618835732380720921": ["convolution_gpu_bfyx_direct_10_12_16",0], + "2251029128552117936": ["convolution_gpu_bfyx_os_iyx_osv16",369], + "17724604495865223459": ["convolution_gpu_bfyx_gemm_like",2], + "2226745622763268469": ["convolution_gpu_bfyx_os_iyx_osv16",331], + "7307271009495440764": ["convolution_gpu_bfyx_os_iyx_osv16",705], + 
"138379779469699309": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "10912495395422146386": ["convolution_gpu_bfyx_os_iyx_osv16",261], + "11239754372812258455": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "9519623751582710696": ["convolution_gpu_bfyx_os_iyx_osv16",439], + "5041111302824362529": ["convolution_gpu_bfyx_os_iyx_osv16",1105], + "12489973984967168447": ["convolution_gpu_bfyx_1x1",2], + "16442107352245114876": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "17318287523550546026": ["convolution_gpu_bfyx_direct_10_12_16",0], + "7199295899520406795": ["convolution_gpu_bfyx_direct_10_12_16",1], + "16461809076899645037": ["convolution_gpu_bfyx_os_iyx_osv16",9], + "3682813162987778705": ["convolution_gpu_bfyx_os_iyx_osv16",630], + "5876880412336151866": ["convolution_gpu_bfyx_os_iyx_osv16",218], + "16430562172386510259": ["convolution_gpu_bfyx_gemm_like",2], + "5831419373611158773": ["convolution_gpu_bfyx_os_iyx_osv16",739], + "2261453441277654139": ["convolution_gpu_bfyx_os_iyx_osv16",54], + "4398371999113956082": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "7106362077449435105": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "4635570915184713874": ["convolution_gpu_bfyx_gemm_like",1], + "6048964584602891448": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "1551596771935253711": ["convolution_gpu_bfyx_gemm_like",2], + "9522661528867955338": ["convolution_gpu_bfyx_gemm_like",2], + "5740738339752793113": ["convolution_gpu_bfyx_direct_10_12_16",1], + "9220830217525628783": ["convolution_gpu_bfyx_gemm_like",2], + "3372770576629463160": ["convolution_gpu_bfyx_os_iyx_osv16",231], + "15702382940521972117": ["convolution_gpu_bfyx_os_iyx_osv16",1008], + "2052712465925238009": ["convolution_gpu_bfyx_os_iyx_osv16",813], + "12864204111424196179": ["convolution_gpu_bfyx_1x1",2], + "5637480705139132901": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "4079026972040047969": ["convolution_gpu_bfyx_os_iyx_osv16",262], + "3603706453982734995": ["convolution_gpu_bfyx_os_iyx_osv16",573], + "1643122514049603104": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "4138968242532400395": ["convolution_gpu_bfyx_gemm_like",2], + "16666792471632326054": ["convolution_gpu_bfyx_gemm_like",2], + "8220168481755031959": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "3430266954211750407": ["convolution_gpu_bfyx_os_iyx_osv16",352], + "13760645810144930270": ["convolution_gpu_bfyx_direct_10_12_16",2], + "12207503176295152756": ["convolution_gpu_bfyx_1x1",0], + "1788455099959676873": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "2140514316203117958": ["convolution_gpu_bfyx_direct_10_12_16",1], + "38736266675995457": ["convolution_gpu_bfyx_os_iyx_osv16",607], + "5339985303398206057": ["convolution_gpu_bfyx_os_iyx_osv16",180], + "4161141078006269526": ["convolution_gpu_bfyx_direct_10_12_16",2], + "17443356777503458523": ["convolution_gpu_bfyx_gemm_like",1], + "6410682026872155392": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "13713406612642090169": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "16409729623371222748": ["convolution_gpu_bfyx_os_iyx_osv16",982], + "1155389358857780776": ["convolution_gpu_bfyx_os_iyx_osv16",463], + "2231648183489019418": ["convolution_gpu_bfyx_os_iyx_osv16",814], + "4056979460327024961": ["convolution_gpu_bfyx_os_iyx_osv16",708], + "13459514533473657102": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "6181272224000872375": ["convolution_gpu_bfyx_gemm_like",2], + "12625112690264223217": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "17109520309574369561": ["convolution_gpu_bfyx_gemm_like",2], + 
"12952980509662451384": ["convolution_gpu_bfyx_os_iyx_osv16",462], + "5805383505505929391": ["convolution_gpu_bfyx_os_iyx_osv16",624], + "9606639214735570069": ["convolution_gpu_bfyx_os_iyx_osv16",249], + "10169992769527680821": ["convolution_gpu_bfyx_os_iyx_osv16",700], + "4338023436590582323": ["convolution_gpu_bfyx_os_iyx_osv16",127], + "16705621644424684055": ["convolution_gpu_bfyx_os_iyx_osv16",356], + "15334195300678132907": ["fully_connected_gpu_bf_io_gemm",0], + "4625107584562815965": ["convolution_gpu_bfyx_os_iyx_osv16",585], + "11848462434662954749": ["convolution_gpu_bfyx_os_iyx_osv16",1015], + "13352000946213986936": ["convolution_gpu_bfyx_os_iyx_osv16",1067], + "17759505449240263390": ["convolution_gpu_bfyx_os_iyx_osv16",690], + "4026686872534942904": ["convolution_gpu_bfyx_os_iyx_osv16",179], + "5183231560876991543": ["convolution_gpu_bfyx_os_iyx_osv16",320], + "3391032227732782982": ["convolution_gpu_bfyx_os_iyx_osv16",1113], + "7177837234452118325": ["convolution_gpu_bfyx_os_iyx_osv16",123], + "14578867494693499627": ["convolution_gpu_bfyx_os_iyx_osv16",182], + "9410978119783758141": ["convolution_gpu_bfyx_os_iyx_osv16",625], + "3304589333915676807": ["convolution_gpu_bfyx_gemm_like",2], + "11728824117049687850": ["convolution_gpu_bfyx_gemm_like",2], + "708747442142592697": ["convolution_gpu_bfyx_gemm_like",1], + "17798636687709019154": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "15959543980008442942": ["convolution_gpu_bfyx_os_iyx_osv16",1085], + "16781187505186394353": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "7183578232279711009": ["convolution_gpu_bfyx_os_iyx_osv16",1064], + "12635265188475834607": ["convolution_gpu_bfyx_os_iyx_osv16",532], + "2800949804770763798": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "1882052795393187384": ["convolution_gpu_bfyx_os_iyx_osv16",671], + "9726913113016874092": ["convolution_gpu_bfyx_gemm_like",2], + "9048522050692986204": ["convolution_gpu_bfyx_os_iyx_osv16",371], + "15133468875250992696": ["convolution_gpu_bfyx_os_iyx_osv16",349], + "10429613013253088132": ["convolution_gpu_bfyx_gemm_like",1], + "8083672466967374860": ["convolution_gpu_bfyx_os_iyx_osv16",891], + "9882204352209412039": ["convolution_gpu_bfyx_os_iyx_osv16",728], + "14104238386345631681": ["convolution_gpu_winograd_6x3_s1_fused",0], + "15047676717402283805": ["convolution_gpu_bfyx_os_iyx_osv16",367], + "11324851661119942609": ["convolution_gpu_bfyx_os_iyx_osv16",253], + "15796677813117622429": ["convolution_gpu_bfyx_gemm_like",2], + "3429844423226609965": ["convolution_gpu_bfyx_gemm_like",2], + "8843585527713905568": ["convolution_gpu_bfyx_gemm_like",2], + "10293186062391000719": ["convolution_gpu_bfyx_os_iyx_osv16",7], + "2986189945936592561": ["convolution_gpu_bfyx_os_iyx_osv16",559], + "16921939234324970069": ["convolution_gpu_bfyx_os_iyx_osv16",1067], + "7900926714874404219": ["convolution_gpu_bfyx_depthwise_weights_lwg",1], + "10512507780534402341": ["convolution_gpu_bfyx_os_iyx_osv16",1022], + "3755253206085028904": ["convolution_gpu_bfyx_direct_10_12_16",0], + "7524311370696987092": ["convolution_gpu_bfyx_1x1_hgemm_buf_16x1",2], + "14991602704357959545": ["convolution_gpu_bfyx_os_iyx_osv16",699], + "10728212277329722684": ["convolution_gpu_bfyx_gemm_like",2], + "16758697697363920520": ["convolution_gpu_bfyx_os_iyx_osv16",930], + "7272538316511343863": ["convolution_gpu_bfyx_gemm_like",0], + "4403753181729432604": ["convolution_gpu_bfyx_os_iyx_osv16",1086], + "12365282242489300092": ["convolution_gpu_bfyx_os_iyx_osv16",879], + 
"6205240287062600210": ["convolution_gpu_bfyx_gemm_like",2], + "9869959062341950047": ["convolution_gpu_bfyx_1x1",1], + "11265472910579659280": ["convolution_gpu_bfyx_gemm_like",2], + "1540041682425757361": ["convolution_gpu_bfyx_gemm_like",1], + "12068797674575015662": ["convolution_gpu_bfyx_os_iyx_osv16",636], + "14418429155823196539": ["convolution_gpu_bfyx_gemm_like",2], + "15529757761327002288": ["convolution_gpu_bfyx_os_iyx_osv16",332], + "11727227430687227444": ["convolution_gpu_bfyx_os_iyx_osv16",354], + "12309132521191764927": ["convolution_gpu_bfyx_os_iyx_osv16",730], + "1056009037551688122": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3711525118850629466": ["convolution_gpu_bfyx_direct_10_12_16",1], + "15352245788978088971": ["convolution_gpu_bfyx_os_iyx_osv16",1014], + "8819268903800581706": ["convolution_gpu_bfyx_direct_10_12_16",1], + "17201365233492366678": ["convolution_gpu_bfyx_os_iyx_osv16",1012], + "7457899998356343871": ["convolution_gpu_bfyx_os_iyx_osv16",670], + "731825454731954517": ["convolution_gpu_bfyx_gemm_like",2], + "12972634653821069685": ["convolution_gpu_bfyx_direct_10_12_16",0], + "751912075185318190": ["convolution_gpu_bfyx_os_iyx_osv16",210], + "883436333317162926": ["convolution_gpu_bfyx_1x1",2], + "12024817951074673335": ["convolution_gpu_bfyx_1x1",0], + "9101903304994333336": ["convolution_gpu_bfyx_os_iyx_osv16",697], + "10573920781439771673": ["convolution_gpu_bfyx_os_iyx_osv16",300], + "7840653268996892538": ["convolution_gpu_bfyx_gemm_like",2], + "4738743763536059708": ["convolution_gpu_bfyx_direct_10_12_16",0], + "14077148976508649021": ["convolution_gpu_bfyx_direct_10_12_16",0], + "13590444711975157776": ["convolution_gpu_bfyx_direct_10_12_16",2], + "4479117540570599742": ["convolution_gpu_bfyx_gemm_like",2], + "10923480230259977438": ["convolution_gpu_bfyx_1x1",1], + "1914964404168211864": ["convolution_gpu_bfyx_gemm_like",2], + "9040046051053703359": ["convolution_gpu_bfyx_gemm_like",2], + "3218248162832023196": ["convolution_gpu_bfyx_os_iyx_osv16",970], + "11830297960718214360": ["convolution_gpu_bfyx_os_iyx_osv16",258], + "4325081100430903742": ["convolution_gpu_bfyx_gemm_like",2], + "3202085450628781999": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "16027456210394993913": ["convolution_gpu_bfyx_os_iyx_osv16",1080], + "142486914279119363": ["convolution_gpu_bfyx_os_iyx_osv16",633], + "10128143628088846123": ["convolution_gpu_bfyx_os_iyx_osv16",971], + "2732519635571994212": ["convolution_gpu_bfyx_os_iyx_osv16",967], + "8984436655107983227": ["convolution_gpu_bfyx_os_iyx_osv16",688], + "10722782762733112118": ["convolution_gpu_bfyx_1x1",1], + "12730339458081890990": ["convolution_gpu_bfyx_os_iyx_osv16",368], + "4531222427159927606": ["convolution_gpu_bfyx_os_iyx_osv16",725], + "12668149981216388765": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "12087141795291232248": ["convolution_gpu_bfyx_os_iyx_osv16",1118], + "5659168916726488798": ["convolution_gpu_bfyx_os_iyx_osv16",1074], + "18431306649860116380": ["convolution_gpu_bfyx_gemm_like",2], + "10682918518101379579": ["fully_connected_gpu_bf_io_input_spatial",1], + "1822096761703761792": ["convolution_gpu_bfyx_1x1",1], + "474139120607442270": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "5334190564423375247": ["convolution_gpu_bfyx_direct_10_12_16",2], + "760687670112194844": ["convolution_gpu_bfyx_os_iyx_osv16",1073], + "13178480813522103091": ["fully_connected_gpu_bf_io_gemm",0], + "17382660912493284320": ["convolution_gpu_bfyx_direct_10_12_16",1], + "5558136691773431495": 
["convolution_gpu_bfyx_os_iyx_osv16",598], + "6863331059471727622": ["convolution_gpu_bfyx_os_iyx_osv16",692], + "941626985322260281": ["convolution_gpu_bfyx_gemm_like",1], + "101401523793806394": ["convolution_gpu_bfyx_gemm_like",2], + "12557015880639217508": ["convolution_gpu_bfyx_os_iyx_osv16",1091], + "5763440554939527411": ["convolution_gpu_bfyx_os_iyx_osv16",626], + "6635217802203685464": ["convolution_gpu_bfyx_os_iyx_osv16",961], + "14352303529756685990": ["convolution_gpu_bfyx_gemm_like",1], + "16882092367103683293": ["convolution_gpu_bfyx_depthwise_weights_lwg",2], + "3272017687600371031": ["convolution_gpu_bfyx_gemm_like",2], + "18424912460022156378": ["convolution_gpu_bfyx_os_iyx_osv16",962], + "16770615142634470903": ["convolution_gpu_bfyx_os_iyx_osv16",696], + "1752185056297124917": ["convolution_gpu_bfyx_1x1",2], + "13317417676446624018": ["convolution_gpu_bfyx_os_iyx_osv16",748], + "11883485911218628865": ["convolution_gpu_bfyx_os_iyx_osv16",302], + "7700321970687976931": ["convolution_gpu_bfyx_os_iyx_osv16",711], + "9423854233835016530": ["convolution_gpu_bfyx_depthwise_weights_lwg",0], + "721174714308243785": ["convolution_gpu_bfyx_os_iyx_osv16",631], + "10548792624072794724": ["convolution_gpu_bfyx_os_iyx_osv16",629], + "2608363732937932266": ["convolution_gpu_bfyx_gemm_like",2], + "946479876892100082": ["convolution_gpu_bfyx_gemm_like",2], + "17809920600993699808": ["convolution_gpu_bfyx_os_iyx_osv16",87], + "5334291640387922287": ["convolution_gpu_bfyx_os_iyx_osv16",1126], + "4894227264080887361": ["convolution_gpu_bfyx_os_iyx_osv16",799], + "12238674883388043717": ["convolution_gpu_bfyx_os_iyx_osv16",582], + "6733731409232284409": ["convolution_gpu_bfyx_os_iyx_osv16",1109], + "14502856487639608696": ["convolution_gpu_bfyx_os_iyx_osv16",934], + "13538051178827008933": ["convolution_gpu_bfyx_os_iyx_osv16",107], + "15759530339367380982": ["convolution_gpu_bfyx_gemm_like",2], + "4903592553439092472": ["convolution_gpu_bfyx_os_iyx_osv16",910], + "11163107409437069532": ["convolution_gpu_bfyx_os_iyx_osv16",203], + "12179581684777023804": ["convolution_gpu_bfyx_gemm_like",2], + "16129296588866116913": ["convolution_gpu_bfyx_os_iyx_osv16",1125], + "16065744898134487748": ["convolution_gpu_bfyx_os_iyx_osv16",689], + "2543041530639980505": ["convolution_gpu_bfyx_os_iyx_osv16",294], + "9751582946441607796": ["convolution_gpu_bfyx_gemm_like",1], + "9785114056964539323": ["convolution_gpu_bfyx_os_iyx_osv16",256], + "7500192998744460131": ["fully_connected_gpu_bf_io_input_spatial",1], + "9622546530872848323": ["convolution_gpu_bfyx_os_iyx_osv16",328], + "17154337492545826355": ["convolution_gpu_bfyx_gemm_like",1], + "9404677451270692749": ["convolution_gpu_bfyx_direct_10_12_16",0], + "12388375914105990324": ["convolution_gpu_bfyx_direct_10_12_16",1], + "14811603003184578943": ["convolution_gpu_bfyx_gemm_like",2], + "17522452942286240233": ["convolution_gpu_bfyx_os_iyx_osv16",704], + "18082422341304348326": ["convolution_gpu_bfyx_os_iyx_osv16",698], + "17224104246148265328": ["convolution_gpu_bfyx_os_iyx_osv16",705], + "9477562342190423343": ["convolution_gpu_bfyx_os_iyx_osv16",1100], + "5003718302026277632": ["convolution_gpu_bfyx_os_iyx_osv16",1066], + "13204120207726209723": ["fully_connected_gpu_bf_io_gemm",0], + "12112853999307505628": ["convolution_gpu_bfyx_gemm_like",2], + "2440366541074371090": ["convolution_gpu_bfyx_os_iyx_osv16",839], + "9101334153142718004": ["convolution_gpu_bfyx_gemm_like",2], + "15619086801947147359": ["convolution_gpu_bfyx_os_iyx_osv16",764], + 
"18180820925685532104": ["convolution_gpu_bfyx_gemm_like",2], + "14122213471825630433": ["convolution_gpu_bfyx_gemm_like",2], + "13253775441326432265": ["convolution_gpu_bfyx_os_iyx_osv16",1110], + "4013707396889204359": ["convolution_gpu_bfyx_os_iyx_osv16",231 + ] + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_APL.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_APL.cpp deleted file mode 100644 index 2069919..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_APL.cpp +++ /dev/null @@ -1,2572 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "auto_tuner.h" -#include "auto_tuner_offline.h" -namespace kernel_selector -{ - //APL 10W - void tuning_cache_5A84(tuning_data& td) - { - td.td.insert({ - { "4583484812233029888", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7560832358324865221", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7382044526960590018", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12372261924257291610", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "1547771611689525848", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3134973665622945888", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18260030211719729324", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "7416143717989012766", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12028963907131702705", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2464531851392092325", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "8181308759455478086", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "546062289721803579", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4889405384318695802", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12841232643395100314", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14108361259911144680", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2726453304845436156", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "2607416795507802412", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "2175404966338020579", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "14666883719480623074", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "3752993663604843837", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 56) }, - { "5274735654559844733", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "8174421295799601683", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "1967655354607438665", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "15762542971370422224", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "8183203099539372914", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "4075343423548891274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 132) }, - { "13264497096898621015", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "679058537775669048", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "3375634256357960999", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "2844616672368585285", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 336) }, - { "14235558866846276172", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "18066867692765966577", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "9861424412782371874", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "607078314875528651", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "6234885984223387670", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 132) }, - { "7223570329858821704", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 336) }, - { "17234843749633035510", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "11516168882438876247", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "11312664612825940140", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "14846039494240217143", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "3390376200501119384", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 115) }, - { "1113077760071340574", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 336) }, - { "4614875083188849196", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "10859023312681572942", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "5588692131556725717", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 287) }, - { "7653946972043115920", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "9773458066743315157", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10491513939202460216", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "8140122945471321201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "15079423575410353790", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "12844146569641472927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 287) }, - { "13443914015380511668", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "13404457916017756196", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 69) }, - { "6402415801415013013", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "2040762223425679479", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "8595156989254845134", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "14493123117003003092", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "7391309333582046386", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9935182178960843140", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15422142509105297183", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14849987788569183527", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14923132847727661051", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13271555597925466454", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6865593216823998846", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12385437755245281331", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4530047829451377456", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "127643210248119703", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2599051617462913767", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3024020696533545102", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16205377892664082400", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "15135655146332608939", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "10848724554175904486", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "15558120704022404428", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "14120354125904513152", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "8475075092501403968", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "15892943371777404347", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "15078418657871922661", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "3502053626453342387", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1230316443026403527", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "12846418701225147646", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "17386994561779281406", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "7367684259946371231", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "5451072983028714092", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "11758623888547009364", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15958650715061024845", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13899144453581769028", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "10884229860266073967", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "6629431845229592220", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "13199442294147992119", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "5032929712205664246", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "14871333176552512036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "17890435688048047959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "11353661571093800805", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "3101908018947919238", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 336) }, - { "7495240482209084478", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "6964506613327100469", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "3670645005971806718", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "69949758775887534", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "13654393413005772278", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 287) }, - { "11101512074369779300", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "2273811004985590823", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7457154125218067377", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "7709677514862642399", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "8010619564572573208", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16479793487852125428", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3147355028342035061", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12672939642957531547", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12627961914394914920", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16210688853876861607", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "1899485873740458557", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 302) }, - { "669151029135558505", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "8912067280071688393", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "7714589858275971005", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9794413496918699979", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18350040136091421971", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "16931304566154830346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 290) }, - { "12816950084297042217", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 302) }, - { "2359632276970855181", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "16592641501972654496", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "7754054384598160936", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18195884921517044108", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "16256130331524359070", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 301) }, - { "13497279823712860029", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 302) }, - { "6095972148204769193", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "4149964766407000732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "10262104071809780712", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "10707129891337660055", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "3585075254981736756", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3181067565488724209", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 194) }, - { "8636008354706344794", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "4966150965920189853", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "5569253153294942795", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "11521347729886549503", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "12399471154320580621", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "14851218369956754103", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "14859848826604327499", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "14783159891899899660", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "2369671961317151564", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "1691004331056506231", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "4465288557833228023", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13538111995551348621", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "1886751914747841929", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "8445964247944285746", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "803205084059316676", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "12654698468722759675", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 
294) }, - { "13484605287576302088", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "2469399061693302590", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "14782181149367028912", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "1448238652280623323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "2076478920663115306", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "7369834759425644726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "11702633755046828968", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "2944972038827287015", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "8261441437673092886", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "16694312773479519523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 217) }, - { "5486494868955566721", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "8246009573416434030", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "9457894602447879547", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) }, - { "4598302923247277427", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 217) }, - { "3116224788980631217", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17103527368951412486", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "10150428063205056209", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "16984028253790680977", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "17857105233471273424", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 217) }, - { "2715609009808401074", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "232807837985324954", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13933912937625580405", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "11254635684957519432", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "6729077823331194042", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "15961933828477762733", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "827225131390571924", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10429613013253088132", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14629385997654952321", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "7546586420552408243", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "11897687507601277182", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "13975409361394567866", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6458124573210430792", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "2385616965635993249", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14462438074931673266", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "9338654554616107568", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "15476402794704488137", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 217) }, - { "13654816209891478730", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "1680424228660495363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "12929981792125924963", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "4698507050987130777", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "1094144958579794349", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12358908585763044267", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "9793373151408615612", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "90849151510482266", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17277787450259342076", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "2367877811435050998", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "433942345363552443", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14575816691130255191", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3675622521877371819", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "7601637686045360430", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9818496628902493298", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "377651990943545344", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "16357661916741979192", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5417669424921804056", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "854020380490533945", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "11984602132438314210", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "13100228219613095795", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9808704199834907703", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13071373212254908241", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "18392748682101174561", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11928475964162658765", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11937547211842355800", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "4637568849323640167", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16812695025037565299", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10487883723723512839", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15315014737515653325", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "1579905786032546689", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "4243996335899627971", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "2545885699369058867", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "16926950874716567095", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "6355395905401306995", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2096779676054335057", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4217179485243909459", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17101789600628162503", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "6139574161497189424", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "16559140502701231107", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "11459784003592366395", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "7869916853707978306", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 94) }, - //{ "3889519976910355277", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - //{ "12081835728078383819", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - //{ 
"14923692894655929923", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "1841901358010744236", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10888203577545955226", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "9799890897264103013", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "1827296932806936575", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "13010820430079828498", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "6245781545617904772", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "8714031312599034571", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12922099252166105096", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3042887030242700493", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1419879016567682338", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "12870587285162108523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "6103433181190121715", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "3469599265931338557", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14812617666668076833", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14854734265631496499", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "7637441820772916248", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "14695781272831602408", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 217) }, - { "15696910741835640150", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "17790593820165047954", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1433224983833208570", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16185194021453870096", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 194) }, - { "10310918050196558188", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "14885938077915823034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "14442357887993453368", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "4719130523147011420", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "8870222084473246330", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "9350596936816632825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "6183248276225219542", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "3757195189216622027", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "8318857994507665384", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "12864512857659000129", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "6615646900347529347", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "5208923086986567490", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "9390793435913144215", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11997629302296435180", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4858270366437120918", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 265) }, - { "15975176007724247667", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "16837473534895641370", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "16214153687871223428", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "8858112708913743577", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "8844677471730173649", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "9043742986995534354", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 250) }, - { "15621341038256548867", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "18268980125375728709", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "9757167087033785227", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "6513982093384445397", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 250) }, - { "9273893819042428704", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "8270840662337272430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 141) }, - { "15021512490648380369", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "6578908625437515675", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13762042713029963144", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "668798769117277023", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 172) }, - { "10917498758625273194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "14335423820860953927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "13978750151855895830", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "8965747921518186477", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 62) }, - { "4428101657497677982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "8151272056391095510", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 62) }, - { "17656341100957270390", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 44) }, - { "10295330953350618042", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) }, - { "15901675909820977223", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "6026876733674266377", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 6) }, - { "381149736509958403", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 4) }, - { "7962991673727743706", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 6) }, - { "10064251191248475177", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16663239694378513014", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4690831975451405214", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "16132498413588349821", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "10609644803793651808", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "16520784657717262379", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "2271187702055786721", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "1872921634399989626", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "12535576637355537200", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "7771729980527620398", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "14849108908297747749", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "14043770215999952932", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15277856047844308598", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8048617952947915835", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11446745541571732900", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17422822627612865758", 
std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "13954144830230671601", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 217) }, - { "11198908896401597838", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) }, - { "5582896843095691256", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) }, - { "8133587696326295326", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) }, - { "2007192658799516915", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "9492402787848610840", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "10515519878978734341", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "1375156980278317418", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13455881643467418059", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12788968383428254917", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 216) }, - { "12304975739476881266", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "3390014193205017427", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "13587202155230938291", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "14462744723628661203", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "16109721499545711936", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 106) }, - { "10090923790949378407", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16672038432561840773", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2973773544904290726", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17193614571243427089", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2862029728492027826", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11066930104187448422", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2755147389712995637", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10662798624911535617", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "2172121470071868949", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3579916582911190192", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "5495063314176654751", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3771003491521695667", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "15514370342945522276", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "17285639145064557279", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "12642574441854544900", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5471430682416582179", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "8561154029325525444", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8939900194037985459", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "2129726780118554358", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "15463465056816958579", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9268536904925062469", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "45977313646881991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "7092246390386193774", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14801984300948838261", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "2797723586312707948", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - 
{ "8451212914744825089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "5131348852069018593", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "13619081494170885939", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "8818679285688095197", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "4608292692528881356", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "9729987752669765456", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1782966703272153440", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15641674846325113216", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 217) }, - { "18140951659547259039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "3416294810798281053", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "12066560812164094695", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "9332596500956923556", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "15067550526427941795", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15428062440621131394", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 216) }, - { "8873614802459592665", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "3367130693014583254", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "14135594471530769414", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "14971707650115908544", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3036512701943687724", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "5334291640387922287", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "16245760498096322525", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 302) }, - { "9928406318940388716", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "3369894612786523432", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "590505356692040012", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "13240472672791632740", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "7128145024365641089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "11497327844388026594", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "7301757962797024939", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "18121689595247452649", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "875296362957469305", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "14912119584313592912", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "12494969618927201911", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "4640611487944119712", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "1692411934657235774", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "1673006919995519093", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "10601835610089648700", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "13262672660175739705", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "7639015398436550592", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13867172651521406104", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "14587150810299279663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "17271409929705935575", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16744813357455687598", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17215047912921813592", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "4093195092417588676", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17895953872149392740", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "5918874715861160937", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "14498368518428801865", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "13857947326347149693", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "4965619590663772423", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17153828952517174005", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 290) }, - { "9864812885638557249", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "8410695282651246173", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "3011957000022205132", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 290) }, - { "18202466898415347292", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "11433166800587133728", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "4499160027703324879", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "10225878843410985743", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "10961131057009777878", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "17123897723015586893", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3938875063592179645", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "7589320923145169660", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 186) }, - { "13907115679251591389", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5262155845067632954", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "12323619994816664201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "13711624246076632711", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13879644216615040961", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "13418213186769741623", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 190) }, - { "9850414237385072276", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "13762987373425432087", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "15387492794262813616", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "9455446170928387706", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "7799083605029182328", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "6416346888102436677", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) }, - { "13401162817870652306", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "4574862993950020539", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "3956303186129893250", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "14928794187754412027", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) }, - { "9892597035419316966", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "10290107543739998181", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "5275016494706355806", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "10947686124973711385", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 30) }, - { "3997597867012981671", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "4003433148846544263", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "4361250474585164062", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "10523363119855336043", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "1718324808394833635", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8430284238380067998", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 290) }, - { "3950448771871155887", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "9033877528655370244", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "8878071105867359307", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 297) }, - { "7511984934520363336", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "860443413504997114", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4957638663977636791", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14793503588688729262", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "9303039486341715392", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "8923406201866512905", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 290) }, - { "14629889085799380442", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "4811310048537439646", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "8622985922687454592", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "8611046137980763541", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1879796404388368873", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "6928835003016610382", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "8603207107304593583", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "6477198553362516437", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "6377441002585730862", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "15761554874575656075", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "118354408955419547", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "13601202334102031245", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "11716196499333250570", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "16984923535088627888", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "364996668506826202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "6412527114952548517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "4892959859293355837", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16127331840410137228", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "13335944978055152562", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10715829903767495958", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6065404265303390338", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "3897655522585667381", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "1860663592951633878", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "5698748062275134041", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "15945452307780131237", 
std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "6764685582382238740", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "4737109912659941670", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "9371952894576491521", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "12696412964119109465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "17825874529822806486", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8220763890959777277", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17731591992960147987", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8950668477702067729", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14991602704357959545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "16383540667048742064", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16820082917500285799", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6820284286806022849", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17285815901490707654", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "994182747184593564", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "6642767323474835034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3215659303601163167", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "54975980454651672", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 337) }, - { "11529876081402974396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "3644282167178264526", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "360872770877634346", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16720108310653948550", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14353390922580547467", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "9868561386826862471", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17465517455679097501", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "5570311824197099845", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "7524311370696987092", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "14070988879848388270", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "8296551195150971668", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14352796912241296357", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "9840495023131952174", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "4720851194954041037", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 337) }, - { "17009318615658405230", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - }); - td.td.insert({ - { "9421643783312790618", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "10693348571961406417", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2940027113687311893", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6090625728451718945", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "5643908654122573882", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8852322966320229583", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11185156002426041243", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { 
"14670068483447729857", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "4623542918584461522", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 150) }, - { "3126708271410621754", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 217) }, - { "1434535531617424039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 217) }, - { "17025268985366223779", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "15398976608777968810", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "6149673627320838019", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "7413341807736193935", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6071597471486669736", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1127095963814993729", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8611856835854445891", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11115935318793891293", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "5393081375805921525", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "17589256877540537468", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4135814997524960840", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15180348902159643465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "2818524781020760666", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "4942080349816430490", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9263314249867362", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14377032179148581309", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "15245529372955421912", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "13595283050046771323", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15667549927492357263", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13827442968070281886", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "12076060884099762835", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "16532386511585070092", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4910582540370962997", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12335148041391647118", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 150) }, - { "10689880083512104726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "8870164706606458004", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9269498023794081940", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6779832349039897240", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "13942354789498444722", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 109) }, - { "14294764660016835141", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12323510278692809329", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "5728070995112243570", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5381496395266530071", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9712640406795417230", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15036737419347383878", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 185) }, - { "11552594222313787816", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 151) }, - { "9399255910184037480", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 
80) }, - { "10594581016504135920", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15640487942881889055", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "14165417928501578590", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "12251989236991754721", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 59) }, - { "6675363512560434713", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "9831713940431605743", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "6531349504807709133", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "2726501303929773572", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "10439704858943788014", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "18137994263450376706", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "5711991739289045727", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "15255831401757117660", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "3906658058160172747", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "15823433297099049221", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 343) }, - { "7829483638597533960", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "14092273913846393837", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "3746578485711843646", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "12228183555926126959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "8776893332387904786", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "16672299044236704672", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 343) }, - { "13309889945947393850", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "15966815420067673043", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "7415938485228396256", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9655590024687998403", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "14798289196964890724", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 153) }, - { "9794684437872784678", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "16729204245488754836", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "15185983488152870534", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "13821372148587948765", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "4727004015814244856", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "1738348894912205653", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "559491455289877068", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 108) }, - { "17312172687490475177", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "3470176432841342662", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "8950283515337670839", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "3995072673238444396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "1238913228370790536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "928677976151553489", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "4059887681292863495", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "10493952422143348278", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { 
"5610465912655751128", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 187) }, - { "759163065093339795", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 108) }, - { "11300938516591867859", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 188) }, - { "12843263740221725967", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "888316366026890514", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8088645310090149658", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15891058658954073255", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 189) }, - { "3456538031339928220", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 187) }, - { "14187063304165334647", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11593893535334124231", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9218293603091125898", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 187) }, - { "3614865264081581688", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "8860682105104682547", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "3775781894241463386", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "8857354069987696352", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 354) }, - { "5611508857136313396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "10872828113308792940", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "3726594456692340607", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "7541331569935741737", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "5639394073086652531", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "11158391063762007051", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "6319861294308997034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "6893801771793379570", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "9015970699147699643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "9252735579930779632", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "16237353798629485972", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "10916127635689513485", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "14631094106016920364", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "5050075828787158563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "2277573429750402800", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "7462044209068160751", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "8879836520351993142", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "560198731460537880", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "6414187394150266523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "15317838148382459105", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "11219109605495282242", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "10404790565578782014", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "4657890394631454901", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 111) }, - { "4256171754976506222", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "12658039760507507230", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9883901352719605734", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "5115148310176289236", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "4892280615322354003", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "15491567059821267605", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 110) }, - { "4716932801711295063", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "15559962129967760292", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "7866546777503165080", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12994023006726461909", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1869893771689012539", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 250) }, - { "5635187738652974532", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12849693339574251399", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "13233683642200681957", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "4310121962651039089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "7620758476872568593", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "10344702612951473525", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 187) }, - { "1668590302432600271", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "8750610033922701675", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "6913992575736424382", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 302) }, - { "17945230226911262869", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 297) }, - { "7356559449640788577", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 289) }, - { "18349175655630268884", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "16817085704588915904", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 185) }, - { "9503107262691437536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "15595806193584438610", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "14283867094396458105", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 297) }, - { "14215445060938730397", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "12720976113342879024", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "14766625154638709852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "11757187678986741715", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "13038212285326297688", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "13919423909034348565", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "12925256096286953030", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "6275163484075546689", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "239651884801599911", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "9500211224156027451", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "10902538092301362853", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "8454943813981348115", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "11722951613064434115", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "13547342611064538960", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "15171119202712914112", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5451487099025245427", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1814940262511664251", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "16341609351317463829", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "14343280871046671393", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "15586404971308258630", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "15891211707425019144", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "15351688973597240327", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "1844016761754156672", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "17925606428283439978", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "15050158761219834868", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "17448180555072943363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "11422222075976800614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "14284377769814732906", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 109) }, - { "667777413731244716", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "11624071786842686451", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "3874974512053082278", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "5471037497181745651", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) }, - { "6371386660654628561", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "5331835606773958814", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "16163821504542698475", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "5697543838890997891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 151) }, - { "6217438921274668801", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "2633095809604510774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "11218297661079136641", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "3374196543196230185", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "17797320202829145544", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 217) }, - { "17198778757516749818", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "6440981718484677922", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "1616603916015535857", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "7643715911083095268", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "137903092932521503", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "13203019690952060789", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "3918152537861570517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "8782903242853500098", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18312668164562040079", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "15160703466234996170", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 109) }, - { "2751241748685218213", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "12622728760401804660", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 113) }, - { "1290624457831957354", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "10924946887162830574", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 133) }, - { "10789202693606479024", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "3718558874911694616", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "12835389389575311182", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "17406431092101974143", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "1400409391266374603", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "12154660333025778322", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "4800587664660105589", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "16474284418841532356", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18117355153710110681", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8449591498895477846", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2962899568083589487", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3491333679577961640", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13945298510228460890", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4282198629458668761", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "779525528509830615", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15002237905129290671", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16833026567865627676", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "16991060247581867302", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "6902644989079870993", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "15088285782819494786", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3379661203936923589", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) }, - { "10049571207493913006", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "15691689005236690951", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "11275109735493317886", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "12220860296984467101", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 220) }, - { "44210723233569665", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 107) }, - { "14014987361364503383", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "18189351665719757712", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16159032667792855758", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3374037004378790060", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 302) }, - { "6765409971512438438", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10098892297878373639", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 151) }, - { "981877665302032867", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7472350511000146655", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "15513971895394346930", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "6169721205327431190", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4519054607159036572", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "12750124851833311828", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "4333851142313192116", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6041620003527819661", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "15091361629922645798", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { 
"9348121965341418899", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "393951904144235223", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3220084080191614421", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "11610588256244825741", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2802357220980817497", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8972812517118478580", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "4207115359813621211", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 75) }, - { "16582237002610438015", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "772342953072606219", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8546247990965609013", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "7971830510840138313", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3570484486449791727", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "11461079340079820563", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12663860560275361463", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "8132521728369930959", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16108573960501496757", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11086699387784339943", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4013707396889204359", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "11850332373794932468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "14763982961176216679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 110) }, - { "8207349115037232863", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "3273748387141431306", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - //{ "580936360000782237", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - //{ "10682918518101379579", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - //{ "13178480813522103091", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "14472322679644532468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 141) }, - { "8378137527264154204", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10180255575636684134", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "18242121098885244699", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6178572652675599622", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6558074021146321216", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "11038938372264857379", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 119) }, - { "17137800360536507200", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 148) }, - { "14016185289182597841", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "8970519484272874266", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3289746379259038515", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18389174979070260315", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17666483005735191253", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "10845781902676865789", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "13646634862315619979", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 227) }, - { "5072154928583891344", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "7757331094141318304", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "16779678846332091086", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "12223166874490429642", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "13316017702896072758", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "10390896207372295988", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "8386498395042623384", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "17923632501885139982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "13398326377839777956", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "13520557646924372128", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "5996787039089786722", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 172) }, - { "2626376166907387273", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "6147643392694904814", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "4311921348668650791", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "8046109476498335792", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "14190077682825257613", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "5032302126047788183", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15256375572125522238", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "5083173538217738703", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "17269467004855120308", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 177) }, - { "2901056469731554922", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 177) }, - { "9747637051217505111", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7175860674618956918", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "9530922411870814200", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "11015319643831560673", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "5182740559503076121", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "7567277014404457462", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "15973842639221447367", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "15951492056203075273", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "5283253936050062275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "17650690912303447913", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 143) }, - { "9614936270604202220", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "13998661469619523378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 153) }, - { "10584034255622783869", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "2973436171295280783", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 54) }, - { "1908809004094565452", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "1623383628456201603", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 166) }, - { "5766507688771440170", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 54) }, - { "16626226341188424071", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "14619055893081624406", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 121) }, - { "17222005830854879661", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "14224121742920800990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "5859124386313585730", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "9596156698919548146", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "2729099061601852493", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8233922303282945338", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7402006230339617617", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "8420763628389536977", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "8325767678959979628", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "7673672840505587739", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "14166169053627992481", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "120923426036313670", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 110) }, - { "7348084298010357768", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 110) }, - { "14653065651448352526", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 107) }, - { "11008522061447263744", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "305505245310584136", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "6472139251351862598", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "10586018593856542117", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "16706121580364790904", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5495776091407365966", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16430562172386510259", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5673972310424776040", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8797843396807284399", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 107) }, - { "1698321314111848001", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "5762290464889692462", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "3218248162832023196", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 107) }, - { "12988961529988078346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "4232250144427804891", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "11683680166617045816", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "6252429564537528709", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "11717348577195224554", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9275303306340702111", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12245096462203481681", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 140) }, - { "18439435691655740074", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "15272426400992401555", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "9136831791301215059", std::make_tuple("fully_connected_gpu_bfyx_ref", -1) }, - { "18238669114790278675", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "15376246520426368532", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 146) }, - { "16469788155263456039", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9360494451263553093", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7897877428349481398", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9354818521586974021", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5853553261686771766", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6351572488552853754", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11372638316835753193", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6170074103544756465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 150) }, - { "4544242784357021697", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14225108809796795520", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "178353385245384751", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2002574142025049539", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "11630971824787392820", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "11542493210215136239", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9595803435783166868", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17610828776103321939", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "18312069177632970412", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 116) }, - { "7577483892218843723", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "4355933224673863178", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15124932296735391043", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "16888042302987189589", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "1584639932403433303", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "15516194807992507442", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 172) }, - { "6614374536332038989", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "5097818987523855112", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6623182990939010641", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "17001023283013862129", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "7935150275452094595", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "2326323992207208685", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5072735784865711772", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4683841893192741312", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) }, - { "10341773151035665956", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14109534738984061372", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "16706244336960642883", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "1967030672241059921", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1482100699000420627", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "13632911653636980024", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "6198830126915940359", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "12125006289181390694", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 223) }, - { "14152716242882609401", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16732621354152092286", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17921489101554455214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "7384108582424003436", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13139625572508441980", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "16491532291908469567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "17788367809717898285", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "1509728225855233852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "3816705689596666600", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14157505468412850916", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 109) }, - { "17366807170224886960", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "14548509699664316785", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "10404702662303016402", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "16436357970364549479", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "4858167644379876157", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "11718418772370938734", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 160) }, - { "989564341557094953", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "8133676065307881979", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "14555883089089918919", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "14026570177552137240", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11686670048744589243", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6678796313875454849", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "641417817126876622", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9622546530872848323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9194788897910888066", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "522181557896569275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3332334993503432420", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16131448347558322280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "13852065717057446998", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4342360467977736802", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16336482874764861478", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "6075691042233712335", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7570346182940928159", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 110) }, - { "12971822824884826169", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3033264172690274208", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17301887391757619741", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "15790005937034794347", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "15464327246951632247", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5659168916726488798", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "8079376692609682448", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15160738482264643601", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13953639482255428227", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12293786134765875615", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "18214412375127043522", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10670103699537731664", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { 
"10147266284710177932", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11443268857010762276", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "4659943649635556150", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11726298758004767743", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13896680298436380632", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3797957937905580811", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16294962940703055933", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8526484907799590618", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13723543003759101485", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9873647901670251106", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13268525255152984893", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 189) }, - { "13008742408950833847", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8616686489737649890", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 110) }, - { "11423865221956815041", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14990645740260870030", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 189) }, - { "15204453579641378742", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10892456883214928095", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 191) }, - { "17856816245251319111", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 189) }, - { "1799430190598598671", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3106591708459602370", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 191) }, - { "4111904926378218826", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7678457226823073886", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 189) }, - { "8786249783185140623", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5389189982064081933", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 191) }, - { "15175088047384943892", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15331103261044247142", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 111) }, - { "3627273785739110683", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12478309735214802531", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 191) }, - { "7212944937255713716", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11873734271080160669", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 189) }, - { "18421820525219154881", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9553032671453999824", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 189) }, - { "3860603464276263676", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14115313335378184289", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "9519623751582710696", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "10288726118862235940", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "2231648183489019418", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "6316097202867006365", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "16208488491972128275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "17978026144659698965", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "2566302789609970663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { 
"2714742023091949586", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "3087801652564627458", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "8602155166799218249", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "14230385851791760020", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "10828719108804915700", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "13973179950424276578", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "3668927000317872012", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "7947870656736319919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - }); - td.td.insert({ - { "14767888121198814523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "10401632438377178271", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "8762901342272872498", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "10783981060353445280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "12608289345175485333", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "7875272450497189442", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "1599725688135122629", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "11932770338770247767", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "11184047387366978375", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "11716771904412649891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "12576157843776905380", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "8402692278765063674", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "6781076363516398481", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "3255465741612432300", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "9767950219863105043", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "7134654288295280046", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "5821853991835395449", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "3480732841490521799", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "6973260260946088987", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "15649927926091502215", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "7910468668367486698", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "156456996459945842", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "1994927850993519406", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "16431165572426232677", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "2864254144951744544", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "5390559917122707732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "8378839908604146288", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "17163158934005653629", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "1185280691070355160", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "1999979442136861875", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "2527189070714658176", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "16032797290430373799", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - 
{ "16783619135298589974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "12785335515281046438", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "9216608098626790565", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 85) }, - { "18257496796879980386", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "2452226948562393335", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "11072545690050335239", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "12668149981216388765", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "714898562476771473", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "11115684531624462986", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "8710684853144029787", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "3752171257634205726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "5243587439683016777", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "16881283637687482989", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "6042976104660344109", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "7351733901977025859", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "9341400376014914418", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "13713406612642090169", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "683350872280694452", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "10436819182310112786", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "10269788826827249402", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "12558716383635737426", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "7181186153851700294", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "8107447526839063293", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "1016414921656805365", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "4871907623235871050", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "15539976365475470623", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "3880189981766119529", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "10154958553575016770", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "4561874206785244358", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "12358640399843058144", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "12956726277674279950", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "7177837234452118325", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "3160080179644173650", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9832551412183684637", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6347790007333387897", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13219313818719819982", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17780553554354185249", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13315473376247698298", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11815135771923538945", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12465309202808173810", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "7171436879576678563", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9407046952012845638", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "8805267762044816983", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "14381377343079009210", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "13248818835662551847", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "9300668734746602663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "7706778813807762766", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "16991433003318725315", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "4584399194832832140", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "8558026087297588736", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10198351802037434471", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13257958112171706655", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 301) }, - { "14722464361594874490", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 302) }, - { "1544616395544118800", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "5955569479109539856", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "17738708576252096108", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 151) }, - { "10276056345160651377", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "13515249925520423329", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 297) }, - { "10055593174764596789", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 297) }, - { "8707130584661395715", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 102) }, - { "11161176476048297041", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "6959258479021077609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 301) }, - { "6365109451272429541", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "8191978674781978488", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 191) }, - { "11604224659996035116", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 171) }, - { "1759873215866222608", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "4559874433048442047", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "6937259685509040959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "8242732346001884230", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "16156727721974657541", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 187) }, - { "13786314015179226945", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "4202371435873473624", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 186) }, - { "10933135228023712253", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "14467312749536832362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "10557843071473489529", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "14967016402348718219", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "7594056145185406157", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "12051754199123379659", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "2634827464202220192", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 109) }, - { "10211888372266149335", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "4548339182509526896", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "828946941343000506", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "13008375263617223352", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "5638301531544801477", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 217) }, - { "10213021343800816450", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "12000084249129063723", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "18040104088851490930", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "16394608147869554267", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "15229178454191871174", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "5927467766675317093", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "4877661058006573128", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7515937801840512449", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4747017546101861376", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8833751655076849826", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "16256124470203598218", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2706523860113152678", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14946999257618007034", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13699343107940933196", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2887515984302814699", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4906737644615337997", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4725303208352054390", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "14955652052550053223", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "862470330257326268", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10381668587006680936", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2593337359555305520", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10774393239130591748", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "16247780189312707876", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "4487284881658782961", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "2811240876735166934", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "2447678508469638445", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 154) }, - { "7454164784767168407", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "18275848121133385773", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "13759457214873634937", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "4855959048455906948", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "7160112985819045832", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "6880424067049089394", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "748023061136366353", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "15793120434966402276", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "1932618420321708351", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "3336444565837087463", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "15067224168014815918", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { 
"4431271266410883917", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "14115818307364071162", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "15250928896997938213", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "13013685738525906988", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17607598031220186942", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "10278583197921433748", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "5300123851331202735", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "10751381988703627540", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "9905160045246767203", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "2927340528757005274", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16243861301305882872", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "15004681374954252324", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4496537089364942280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "13357951046545317387", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "12878719705192625362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "4785466104509327241", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2416244034719176938", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "17785504548342377669", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12811319921474895164", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10760000973615798613", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6300105753728135778", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "5791707725846814784", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13246629627758485603", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "9400558994532871122", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "17865276008842107020", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9981156409872807880", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4626770940790542333", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1094262369519841857", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5523297987528243797", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1789389636704094004", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13544237579827433636", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17696244668222870549", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8926171136732424790", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16179159307898475953", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "2692291137583386471", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "1095433004701276122", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16277739324697771064", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "15945243427420522827", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "12296021067910843036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "10673589588224406026", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "3585431879296991112", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "3119002388778552316", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "5322582996019286781", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 337) }, - { "13225749488949717853", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13207215182979880133", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 336) }, - { "17730578026124357983", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "9725306578495355500", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 336) }, - { "5277508201756602822", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "12806959657459851511", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "15232478805009654818", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "17712227426604098630", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "2530975976273876727", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "6232318392696042532", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "13657818175298160631", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "4088603773237062922", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "11177710514557128293", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "17515272254985846970", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "312130674630486188", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "18247095696433793115", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "6341728273786101457", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "10401462893795799864", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "3032101782888447048", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15078331029547630371", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "18043541805861795852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "7608435380564752000", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7129337563584588644", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "12204270722180734542", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "13588405581356678469", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "4986281570682617547", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "2214420531345686129", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "15030725973433075086", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "15384520760315696372", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1915712383376159541", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6176816506826300479", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15331830720555178784", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17558578036713688769", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "2388815483287403961", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "15948716167523201661", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "6787190800192250525", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "3378135802544446861", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13053802967262518173", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5503306970973862635", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "9417884304413500664", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7866083951140251349", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "10084810175406860705", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "10342347371769114236", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "4063042455950354352", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10055531955039754920", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "13173341667656398216", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9356247214800869277", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "8630592326601832361", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "5041676938441886628", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "15379755045295790608", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "13410979599123644577", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "13504573816477550406", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4459291258089899503", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13992993617743773278", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "13395962624719382401", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "1535675815795592775", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "9849036672784280133", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10780684483689207763", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "4060515618437959603", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "9203467651096078409", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "9698108593334526558", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3252398754887381352", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "3120759967333088019", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "2024996599975373573", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14876099702827489987", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "8653894569484019347", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "4004518396368398824", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12801342874692090364", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "9387557098916352467", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "11689587446775003898", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13973363990921590224", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "6278030053136901802", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "12122586525659611649", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17711197779492504718", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "12489342380264260364", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12148845150031891038", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "5080727465135503101", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16818862727193981112", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { 
"1827977959922344361", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "6291003899324240633", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3002862967523058894", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7222921168135747513", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) }, - { "8696847224485998117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "7453625482178960081", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "1472822945750487574", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2032419134020329477", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) }, - { "805104869568121149", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17215312565214990348", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 217) }, - { "1737128374457513820", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16263489451695566992", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1608378717397996752", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "14346703182362139650", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9744493065276230785", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8331721527098298378", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12097373631649932423", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17442035600389810700", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15953351443307161934", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8740196547852036537", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13809436837912218131", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "9722172495422643735", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "2662628817605495834", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "6163765140843670080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "15662207751131195569", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "16494358566119044242", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "12641727819019838301", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "4917595053453614536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "14577496472237742721", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "4356806313729405658", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "14282717676967464809", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7275701540104992761", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6459003512612780875", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "12791541622557283904", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11882713776717158678", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10982479758700194728", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "8714769962126708854", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "7639744043430667021", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6804493132858449665", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5204696395552974337", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "8893913418784905112", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { 
"13496918758899426996", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4707842387180272918", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "425930963222944558", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3844246198992827038", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14280128364139551919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "6774493262072228712", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "5670860641930464485", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10055923266096584825", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13508499324621059445", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "17431631935986646683", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5568431877348597159", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "356320499267651746", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "10632294140185068783", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12339584174527699309", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8556999353039153661", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 337) }, - { "15381427144405510339", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8855986581847188591", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 337) }, - { "13704396706685353016", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "17128550517647168353", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "8625183189646433895", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "7921388663815287395", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4213330047036138895", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17034122796081495259", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13076343553185159307", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5854267518455107328", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13675314612031135613", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12825029449351875037", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9397711809671506538", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12965800692507042874", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "2647922515901529845", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "10961049607808752432", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "13988022841867948024", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4612862531793961340", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10950469938532358632", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2228733394430438519", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "12015814430456201522", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "12344008430499496640", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "9863615330219779441", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3560058786734628608", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16047381404034145819", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9714811479610938662", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { 
"7306541374689856571", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "5689486642279577539", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "15545653867155770893", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14910368344505819159", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3220771309796407003", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "12786796142417489350", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "13947140171097868740", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 217) }, - { "1168311873250200110", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3495786143085325748", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2164537487697642190", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10623345643437043886", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15240415102190323330", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "10321975076426598984", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "6467563111927343808", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "4280198021826662216", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "9464830880142854424", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12113781253211924677", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5410693492803892704", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "4844529595057806427", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "10848097581672953022", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7947428837044782745", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "11705938507822117867", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "8334753494554256932", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "4995468555341975721", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "6282308289220311358", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "18275232300842488846", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "15754022314306112499", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "15193841338943103284", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "956475051281637098", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "1117811515417136925", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "760383787039304033", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "5351526116347538406", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "15923292837937693143", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "2954421933443715181", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "14945079011377285773", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "9573520179708447727", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11432977101529429562", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "8918387046558682780", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "16699295198130950587", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "17358462939783262207", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 139) }, - { "17406383217119217230", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "14003645277231336821", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 153) }, - { "6638761803107874904", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 140) }, - { "1630585964216121575", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "10745248353587672572", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "13395074742046717601", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "12659539044474018256", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "6598024975967050290", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2006890470582854116", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 164) }, - { "11369389082421346630", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 85) }, - { "4986977887030495943", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "9681320098885387731", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 6) }, - { "8730097760819044515", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 4) }, - { "11882021989615795558", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "8202324251716703125", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2932157519158822224", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8431962471592709199", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16116546888494787089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "2954606701225038770", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "6757752550680050481", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "5893257440341358427", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "1327911294059513894", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "7771820069600757360", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "4618159169098049590", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "12268432630136256720", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "2373658589834410892", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "8440300225468667909", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "14495382595913294626", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "4974435385259831818", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4455497237293642238", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "682912708716537431", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2585176064846114298", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16033144151193421543", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "12141880589558027223", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "10098661517988566506", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "16192971634546462244", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "14793709237400480942", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "1646362346584649954", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "4874397454627474644", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "6171331678772388712", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "7496699438957793920", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16767657090925788431", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1006721963560645335", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "14753245713079865819", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "1779870708816318465", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "17157919258161230886", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "12398103047184982980", std::make_tuple("fully_connected_gpu_yxfb_ref", -1) }, - { "2961249862769657168", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12131460825751874564", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "12365814254940023343", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "17218545462549916519", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11877919824125633092", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10679711602282897680", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "17801375178828079914", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15446821602347034830", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 197) }, - { "16041087076800110589", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "14102351022029437177", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "5786978465690715325", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "10140124683113804219", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14022671143475909407", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7468500876165989695", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "4628560194573173205", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "3963065974337687046", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "13439359175348786664", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5342116782332968020", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "9198777289928370963", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "18276472227494448327", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "15774430281717785574", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "3887883367078892827", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1004081473410027655", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "5460182945235134126", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "7932494263344450271", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5596359111431962318", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "1116176429672030385", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "14642276070370158123", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1709508499926680213", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15466995361950304551", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "12936512845587590244", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4464844599426088921", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12966090642798680442", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "128970554088066862", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7183620142123364052", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "3793885399790365373", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 216) }, - { "4932548298968525464", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) }, - { "8248099164876900927", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "249355510483373796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "2837134119351786115", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "141687758281942172", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15718782218800307385", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "3191047205441946466", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1564774057733793087", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11134833419828370568", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "1556966764088589197", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "6087676883600048234", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "15052286556809931759", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "3377724880784871475", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3452246087500006120", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6840268976700446867", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "4278180549747978226", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "17856997406888930289", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "16556093306187145310", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2581594444558181374", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3377472614945731801", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10622082408513122112", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - }); - td.td.insert({ - { "11452807035432891156", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "13529174180301001127", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "17184405948599119534", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "5921658305530976502", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "6802655190570100236", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "17877430344093804543", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "153771221207255459", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "15596913527233792996", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "1016967125909374575", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "6829653688530177613", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "6094638411430816112", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "7559615879839693931", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11270266455366424659", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7100226796198950149", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "210793817522061488", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "11152357292626304216", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5771335481927877060", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14619753612256300695", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "2839767407547705101", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4981552552200657366", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "9626028243479089234", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3164513064874019611", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2363414141971004557", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8962502004422485576", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3154903035376733831", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16134637021630473012", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 336) }, - { "5553176511624221429", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 337) }, - { "4890932609897686394", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "15334769670416409064", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "5513667102916409932", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5351705572686943348", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8200094670006738584", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8100051552977329013", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9004823715680825977", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "16179959997108523051", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15148625184033310404", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6577754887650563753", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "13182965457868586949", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "13839590781642269381", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "12711366212612147422", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "7963529808900784906", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "12184558469694708819", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 337) }, - { "3285180770267559354", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6613282637922219205", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17093159649157277089", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14660081992091188026", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16228026045292341333", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "269829518575229806", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13023942860659386957", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13291308922240014334", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "1187622888238643867", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "16229324496308453344", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "14019704891647234793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "6141637854990273316", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 217) }, - { "13524128602135083081", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "531020979837645217", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "8416686771626338600", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "2916077416184925232", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { 
"16862531110856250955", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11352536854890889084", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "1683347645109643149", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4374049085310743239", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16159971034327080937", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "15779210035964863067", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15153285262450947102", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7049603973253724866", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9389671301472986523", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13891598020647124806", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "9315279998737090956", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18261342465838720356", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3632541114724731809", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "17088320301520334100", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "4352363968456148009", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "1827842275223841485", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7548767746018027960", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "17750850961096057029", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "7606282654661282476", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6201358671959761215", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4829111442270007186", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7267651931396380072", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "1279682391530947146", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2655979063469551930", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14425547983540742516", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "981419593633555198", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12324657364444167791", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3246153532847702583", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4202705710324555180", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "12272318018055307535", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "396815044270978782", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15633173680908856082", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16635731992372618666", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10418466892824851134", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "3244777852750357718", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "2443758478383854939", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "13503934436248311972", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "2594310972560076285", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 153) }, - { "2424349375092546581", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "7104985983444651979", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "13518747015059826801", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { 
"11675809062974151496", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "4725349695436675084", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "17351243519367619322", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17026338651868178077", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "8730407034445893642", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "144434691308306757", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "4114184149613179671", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "2558882920723584206", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "16481414687792927331", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "17756651805686889890", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "2228533392085335649", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "9038567144062573854", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1345293381483212104", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "729683192738752814", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "458997435535883643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "16955907389221472146", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "17927673764274384911", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "6418222853479731432", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7539191242110313918", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "18014188548165359278", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "16640379332042800496", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14856197725306980283", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9279474331309267880", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "5717588912072437191", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1143426643765799488", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "1049385516019456025", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "10766144770072425534", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6442062011017461761", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "6063490496423709036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "3892512749863226006", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "4970240836537468609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "14668725050395069435", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "7939047354407928586", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "365747554145156596", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18207060402110970301", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11049175652352131465", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2982080608393779951", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17216477578093693014", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14116923400742300182", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7029133126202354787", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17420660823086709040", std::make_tuple("convolution_gpu_bfyx_gemm_like", 
-1) }, - { "17300489799784213303", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "15549100047322521213", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "8342403220432961494", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4600322689355365368", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "6432444239720173669", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "5944283189654634640", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "8682613468075783516", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "5788340143385910170", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "14166708932229380784", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "9262263820759430835", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "10661619519548036109", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11254313793397682889", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "8941570659228294791", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "3711589321155572550", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "13440603011986281192", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "10072782544067079397", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "855625721312733540", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "8643403818712296708", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "3482316012102041163", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "15966346359387758212", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "6179768494274723997", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "110891946535801188", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "13300595681637438535", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "16686223109098592740", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "4196950243745604808", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "15357494333788579519", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "5791271012599760917", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "3502203881558439278", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "6615043890071705766", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16602880550249876273", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 220) }, - { "8163937071550477896", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4788158788847752998", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11048286378242522780", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15669268280202512868", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "11708180973354877349", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "7429872600277069485", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "18404744652577257121", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "15145594907273468650", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "17189550036105947900", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7167054889777381093", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3278181836788028231", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6040360226338233118", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6877955452402826287", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "10864271596740164097", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "14322983802576638073", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "9469688466553577331", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "13052522487775745493", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10990480508394584613", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11406807220585770939", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "17014949219411078284", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2128641903680430067", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "8751004549226570175", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "12508733516106581272", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "1535119834165965208", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6537771397615897748", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "16911666678187393426", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18163247824658143109", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "3169531413538986325", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "1861963470217658786", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "17175653712131007582", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "12148428445687813823", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "15239273648189016892", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17290692657168386471", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14119365735362663804", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "4225327120021140533", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "673126354575235249", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "17628454700752918711", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3529846607992358207", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "342387360760418341", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "16353520814579109491", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "3134099148543397372", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3042628567386436226", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "868736197323541759", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "3241775197578183463", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7851643406001230159", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "8502552745012743053", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "10353443026537243362", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3104552371734307984", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "12807894319350246437", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "11258614397356100246", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12946314097679886518", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { 
"12909725304008017600", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "6153017925473103663", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12188122150443559128", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "11983651079897753600", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "12988924268115973386", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "4891686540869580517", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15196732464112076502", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "459391085160518545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "7760457628691335753", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4865678723441158246", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15589245661365969249", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) }, - { "9661616000023492219", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "14777607874956018667", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14113322810933328214", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16281761113420371943", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) }, - { "16988191641007425377", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 217) }, - { "15844881725957151580", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8059328623525062913", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3662747857062156477", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15121448034928438384", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14122213471825630433", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14985236276429954162", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14321283775111180227", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "98795127409553442", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "14805540705424073865", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "3788462090984291082", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "11823068760218786389", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "5963105523596432544", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "10308431308942416781", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "8712136292276123857", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "11314582467969020320", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "17147293671640396193", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "4465701487417893814", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6144958783262207773", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10467232566885547072", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17262854991782705821", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4635570915184713874", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8706634286501695698", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3863816884636503247", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9252629750817485029", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "13168267319035362901", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { 
"16567638487719493784", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "13449466515297095146", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "10808909442136736629", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "15172865163331822352", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "16260483557979578317", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "7469127846325904854", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "8783239368699382065", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "3477539135137665170", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "605638562926557381", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2613462626256090659", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "12626994817506009929", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12417557233566012737", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "14056483847542666300", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "446997309263592434", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16589191615146805668", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "17226649394712507758", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "13566885629976429699", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9931266845625995359", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2522707948254032777", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "6486250531858548438", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8174273876544952794", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15049304780567617964", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "1321553039928725678", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5105893636044171966", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1661430504764145711", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "10041204026657386200", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "15969909663367854367", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10956917223944472347", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6060390128414591327", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "10987291891349907631", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16452573613171944531", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6370356607952251648", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "11547588640573840103", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6882259829255167273", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4184283661465100793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "1799277562177870093", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "16276490504942526329", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 337) }, - { "13939380644892198347", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "14257398784378656791", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "14444423571297570985", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "9708741882115135691", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9374845449632011709", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11907741510409644649", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11833466191385766041", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3276797683943990958", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8494679093555050767", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16807117250109985357", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13642010365337780940", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5622078553841657218", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "3973953743850093759", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "3210709940026980348", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "15122428380000835284", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4492743859922847514", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3555469834146426564", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13140141354298916151", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "7110352624440078898", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "9527046928040225586", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "7797523746053138659", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1478169078874265704", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1264966373832011567", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13715010490012086430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "1470778934882087497", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "12725817227797568697", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "10024777334075819235", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "364471436103661689", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4052362583575987109", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "10657660173790920140", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "6557428245898292304", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9440117898128288296", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15929262283669093154", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6352520536724420824", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1921667815983542102", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6088184848087986042", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "16602667769746047266", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "15953651221917495492", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "34011924689025090", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "674384870483198184", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3555798556624172621", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13793032417416585006", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9019684110208109757", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "647849627466319112", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11242435114747058327", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "17302407573266205607", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "13606281481050014632", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "2466805217694531959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "5511298016141559884", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "5483150635926637198", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "6265211373810873425", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "7643647841451578008", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "13254760530618979318", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "16709502837180561673", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "15693956942112465267", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "17891347169069018262", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "10521453583707218193", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "14303192614979408043", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "16609351383660437793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "11118586558529856637", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10939847328508611170", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "5114254088513267110", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "10163486148946687267", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "13296242326766100583", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "12068797674575015662", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "10978693262040522687", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 154) }, - { "10037086825900566930", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "17216583849049249733", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "341552075482632478", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "738850098651678143", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "7139714914586273766", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 153) }, - { "3302557590307975559", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7648248878470053116", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4917917708431763965", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12978593897559876761", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18064160378597803888", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "7689593699365225521", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "15819149710195058441", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "18274109287723887410", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "10269238332775024706", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "9167138376243583750", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5713105609160120586", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "150812658537571916", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "4485289322925780000", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "17268201530818712998", 
std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "8747430148550634190", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16986358655784856534", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6109013751635776331", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9585113116232600562", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3503893875515897267", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "13144385730409574259", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "743941460026466526", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "4492332228252010118", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "1920042803083729276", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "16436006771518788093", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "17567504672169904482", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "1989849521691057108", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "8203171222962341018", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9795194069954915563", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13369603621524676979", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "2930658435447859986", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "10721885719016335538", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "17663718302088575615", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "10749263296616139689", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "18356235677223229518", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2657828809338947050", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "15743750994087974449", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "754596461956525575", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17690103717758388022", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1581136092002053880", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "184306359395609972", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4891076250667414900", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 193) }, - { "10946917656449245131", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "13963558035989415263", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "13239946614209250451", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "1076938718721677141", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5851532147278358697", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5746129902873132635", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "11592511763160794565", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "8244393417024602494", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "10340341966852782124", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10014822679257636832", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "3975219156915176189", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "13536863026622428609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "11408010379683511978", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 
251) }, - { "15458285682224384803", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3407965587245145003", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "10514330767826407566", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "4251496064392381805", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2384682907808363130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "15705923658253281113", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "6610298174133949061", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6801247431347692935", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7702208423015808353", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "625378771032655972", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "6542436061498779527", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "10220143644047641696", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "5009829190055738132", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9863034269936216346", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14973431782875808802", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11948858355027908365", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "473983206819135409", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "6586872365879203192", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "18412999191021390737", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5274456170971167904", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "9275398105290923887", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11340683391412454009", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8100282867486124965", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "6361758198448370863", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16431503579923509596", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10280282710562383672", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9138345765585313427", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11117326838088757686", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "18222598708685323020", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5198859831430501652", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16644329894881952739", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "9367630847798077790", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "4906856539144714227", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "14958085423402252319", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9835535945548454398", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "187589970359123667", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "678657374277098506", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "8434335101659807351", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "15928128327390664485", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16504425380504793738", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6480587375918509253", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 
334) }, - { "9751235588096143414", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "16866525370343398909", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "10160678465371702528", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17188750289444625186", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14811603003184578943", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4363379197393466424", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "16403435599807360704", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "9367985410929563457", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "10716232679616746794", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "622299920975636640", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "10798283054583509534", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "14179140464588572277", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "351304363117543419", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "3499106702307464480", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "259619428712608645", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "3296098567244638489", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "13593304587712966846", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "7572277082530361815", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 216) }, - { "6379337678256717737", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 216) }, - { "4513178474272034213", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "3390430905253038550", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "925607706467451476", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5627536079808515754", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16464493408368412759", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "13839116996827687373", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "307874768879227632", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "2321767794934000238", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "10308113903347312964", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6712698149192186833", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14930789530046665855", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2204178900998688268", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17174919737114915467", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15154700439767512396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "14916625550370402883", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "7650375560336513366", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - }); - td.td.insert({ - { "9999553425206328238", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "17515064188391421150", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10437367877444543776", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4362304842016958728", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "383721620126444793", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "138379779469699309", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "3759515057574218101", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2856601829807186494", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3286330985102373533", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "8159303545761286685", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "4056979460327024961", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "17823133607491820214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "13678741578702922441", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "17310844417517474522", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "7287895452784411060", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "6513788469599330141", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "1432487477100132607", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "2463151488506537801", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4054850047596998735", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "9747825473942435842", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "250084243188516935", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "672634960435241508", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "748301576795035305", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "14255457787105784042", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "2750476114907782459", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10028244201873254140", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "6469067021323571170", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10601714587235375373", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "2483181247706575298", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "1732853511466309905", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "12113297049460198476", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "11557224109907477240", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "18084899872055349937", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "2890305478244125142", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "16659638340060273536", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "7297768924198851782", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "13104971224879807298", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "5507252417827285564", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "8511924860787648884", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "8339235544283885013", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "5654030701873405891", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "1436723751951975466", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "8325439593817651819", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "17618112803233960227", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "12327651080801123538", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "13617891575616631067", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "6020885536659393981", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "1940159900852645250", 
std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "753809225159529269", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "8790166817024820739", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10677449690354999149", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10593983805743674128", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "8217088979257009010", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "5687085271369421207", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "15279061373346657582", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "7096501191029978469", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10094312347267495565", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "3510084874150710192", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "9391986481292718799", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "5259220060268012597", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "8117066211911522905", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10716559814452841971", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "3949211089098986928", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "14752151264004665491", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "9195500778955925293", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "13023666909692825369", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10961696014697611547", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "408602315578383859", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "582954161360487990", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "2482190331248449465", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4075769657981876449", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "14367142998060454343", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "8114910678593187231", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4495451816890445327", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "17102726573636919392", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10845009858831745215", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "3112081942557253948", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "1635689655354995548", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10250301712194120144", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4855747489298888657", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "14238766089951260596", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "7017830157652362654", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "5619751660204221930", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "18093663410921658106", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "7869191330107002954", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "7245974724868795129", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "11019243479903456358", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "12568255992252373147", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "12413024322120393790", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "17328716013187434957", 
std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "14899206494260920951", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "9983462569671477588", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "2383983224188083583", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "1759538680129620900", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "17194386925266836084", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "1518413386955573037", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4104380387301024172", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "6500468942462159659", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "14136370464716049139", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4056919990977544228", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "17351367314312762125", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "17586380391909451000", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "3561558658922596877", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "3296755748686779746", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "534789472217562338", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10470060457279511896", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4917360877294344854", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "3020953254086476464", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "12307245536623707478", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "17039711449439313953", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "2734182509541824864", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "14521225825422360447", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "5857101685300045443", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "1547471890307888038", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "11159429929932958728", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "1382911856313970571", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "1854612313463195535", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "13051406650237455505", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "15438530452161762045", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "7446661399223808792", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "17861183465344343443", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "2026622899016787854", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "16127482065413259805", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "12961109385388101976", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "16855828799826043472", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "15658859674277700656", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4833749391314748606", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "3326691585067800328", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10718764522366711114", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "13643973579671217152", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "11155444222714959508", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "11544626480076777556", 
std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "14277552178674323256", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "7792811600696842064", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10622803531832712558", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "3079343528005019570", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "3521119014097924580", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "13643421651252474051", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "6947390018658290847", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "11697545935437523887", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "2179704411405073702", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "6886280732774854778", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "15841879134365332862", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "1489646217778958363", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "8996027646503556955", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "17569170625753249614", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "16686854568163084344", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "11187304651899164445", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "14540721800838487177", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "1979841019103384445", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "13241679793873365192", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "7013169017932712804", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "17389114672554594444", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "4157112143322859333", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "15217255896294251282", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "7606241825090144098", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "8254388198068394779", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "755942233998922490", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "14018816117251124336", std::make_tuple("fully_connected_gpu_fb_io_block_fp16", -1) }, - { "12054714986067446052", std::make_tuple("fully_connected_gpu_fb_io_block_fp16", -1) }, - { "1138657035758391650", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "3767246406609050779", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "8536612779196342267", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "18269766292810651342", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "9646020463213439644", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "5654817010240784792", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "17277846909615605376", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "102220157823566379", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "18213629255325554583", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "8809496195168645264", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "8506262325379391391", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "555647031314007743", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "11133391567691287018", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4531238775069637542", 
std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "1050921927000835075", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "3929145534169458063", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "17125607183887169558", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "3771153805567862915", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "5658567026478236676", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "14520461267731870642", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "15949156027942399242", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "14569379143051211142", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "5083162050523454050", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "13951906075577108679", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "9004122893718097099", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4336765005970913285", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "1037896951032802088", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "12090010131585526347", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "9035445496715584647", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "5455756262684457251", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "7013197348316253486", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "143894893069959052", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "13984124581247009793", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "17964690428632248307", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "5850736343172747247", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "12379881923680871705", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "11864459706509310150", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "15222102499748205072", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "1198491147477454704", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "11400303472547811086", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "11660798111579160734", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "12081136231782604198", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "14508437224082799436", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "2396983035676921683", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "11489881652545443112", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "5651551840851524311", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "11753049051286720239", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "479427514681077218", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10277290426401380976", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4919635200134986619", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "8180846581099717076", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "6411489040870738143", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4195122768220068448", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "52150349468142798", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4439371893496638788", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4039813343849078927", 
std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "16533127286587475454", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10264270523529136771", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "9915620237695279980", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "7090467930115498252", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "15407802086492754450", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4007960934134542892", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "2002110062193477745", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10154803388813032920", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "1939527596007045209", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "18436843102627176620", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "277852397173940175", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "6822978927370753017", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10859939917723763131", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "9248235209454206632", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "2665169698359670120", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "18266967379169677646", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "5047972486012090625", std::make_tuple("fully_connected_gpu_fb_io_block_fp16", -1) }, - { "8183383667948205424", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "14478151143114959230", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "11396985422513105543", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "12114476173765693172", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "17041468169694105561", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "16498300259966485293", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "1173136780324694038", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "15786764202107923723", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "8913526950888110377", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "15988378956341507229", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10993107955805947401", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "6214677989814002369", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10884202393733523875", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4424960026145600447", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "9763754389347695094", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "105055722864217258", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "9775648000771985077", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "15967893151722576439", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "5774841809066688068", std::make_tuple("fully_connected_gpu_fb_io_block_fp16", -1) }, - { "13402919586406297042", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "11335142595937152387", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "2689568881580764024", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "6571325912136856822", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "18122652705874970766", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "2000008755333069005", 
std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10361998183258703575", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "12348644068948200883", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "736422312606696687", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "16240864447025932692", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "8589562027950762944", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "9162564861963233717", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "3167738956362101592", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "7260746128189749064", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "15308960063718398523", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10129304668926912275", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4853130422682926168", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "14453982453535955244", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "16608982023596566351", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "2470663389603706356", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4240407752719875080", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4846563120992975368", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "3706088306568590662", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "14866563628584464675", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "14721943524627076027", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "9323941828298277387", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "14109366965145192619", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4923997413838231159", std::make_tuple("fully_connected_gpu_fb_io_block_fp16", -1) }, - { "181017193671999192", std::make_tuple("fully_connected_gpu_fb_io_block_fp16", -1) }, - { "10757412618207229106", std::make_tuple("fully_connected_gpu_fb_io_block_fp16", -1) }, - { "6395263375773555188", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5979046470758784946", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3927359449523162508", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18232387132890063687", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "6709212639543074230", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "10086813986911195558", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "3109992766790372487", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "794530296606789816", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1249133049911188319", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2006024870459798086", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "11914297820344167381", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "13079795735173763117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "6241224766048532539", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16524474021378494125", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4407550747921719377", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "7259905085241841240", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "6666210546769702280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { 
"7814543122045448412", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13554702187867408038", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2547880010597993852", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9061076702890952738", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15460429275475874158", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "7724185199575851246", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "8533091468352267196", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "15025120359649460106", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "2613575328969629284", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7463954007838579697", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9151324495773628566", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "651020886445062493", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "8237821273547216740", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2875927974837744359", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "3674322065648064195", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "137871170540938640", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "3066826388383295007", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "17483221428915982776", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "3403906310423395442", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "3888283018836731569", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "13928684419408478520", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "11520548550630007970", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "4922714504620931501", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "15683804450763499599", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "12686604223669447758", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "10746289671948325353", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "2487976264999747775", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "6163010595188500945", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "10404333823880552577", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "16662409111036688294", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "5400706842524705774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "17423097433955762667", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "18131954418490925431", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "16549854027697846882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "10340073416712988987", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 159) }, - { "4633923265089466898", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "6808980404170272597", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "10592783998150232858", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "1594829714229111215", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "17361714725103230834", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 152) }, - { "15732140959902969012", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5796974850751105634", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "10588059104387338398", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "11738780323979052397", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "16342972196376030503", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "10406201782146034797", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 4) }, - { "17342758321852264926", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "15951978466742016539", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14100026884590707572", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8368507377481570353", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16780457022162749898", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17140702790441856730", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2578325663193624576", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "8784358107340738205", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "2955459120402821540", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "2840794055129352139", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "7104266560248570112", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "11113125355390956764", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "9127827617126714860", std::make_tuple("fully_connected_gpu_fb_io_b8_f8_vload", -1) }, - { "2268275392299271167", std::make_tuple("fully_connected_gpu_fb_io_block_fp16", -1) }, - { "10615831454139478379", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "8205640825965213946", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "14337168375989245254", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "11664399629496237233", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "15750539817895707253", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "921209976738626097", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "8590416145336196354", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "10463632805036507382", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "13637537549252005181", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "7581949584623524395", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "9814647153117279415", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "3444250649099578792", std::make_tuple("convolution_gpu_yxfb_yxio_b16", -1) }, - { "4039483032571506874", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "14309249337788077160", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6254493271976962295", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12387660887222981357", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "7723131901316908741", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "13963554827358438190", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "5001552360784483833", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14201142257504107783", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "1066668660701816536", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "4664196755018349672", 
std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "1103228955716492167", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 217) }, - { "8618835732380720921", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "8391292909068775212", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "9488974186647231896", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "9069334144391048686", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 99) }, - { "12493863403516600413", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "13375084585444085517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "18040173797801558071", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "15329174116169594863", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "14910223536998380801", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "18277685132620834972", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "13090887980792573261", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 172) }, - { "3067806959725855130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 153) }, - { "17791773192152464021", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 143) }, - { "13603318842632052764", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "12072881177966014126", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14413047954443174304", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "9118663018352672834", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "3558391988878894288", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "10047727261970275928", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "11527382293059267033", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "8445575388700666150", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "4560479630843098090", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12542825714985999760", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4599539412023802059", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "4570119951370893062", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 197) }, - { "16897917745917378359", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 197) }, - { "6947523163603267191", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "2322126126611987721", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "6518845972912144959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "9741774854327055438", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "7079854103926842364", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "5035895518536085765", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "4307157272240924516", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 151) }, - { "142486914279119363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "1532263118203058517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "13529694429433303321", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "2820916926593580316", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "11140657515428786448", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9028970753877215614", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "14038308632095412386", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15928183143089896780", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 270) }, - { "14071202918199194502", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 270) }, - { "12383676694875725364", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - }); - td.td.insert({ - // style sample - { "108008098283423515", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "4060303280041214217", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6537702661107952067", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "14207620784078112608", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "15507553344802827573", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 348) }, - { "11202969539314825145", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "17875115440447156357", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "5043345769472136917", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "16920049042115931889", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 110) }, - { "18396735425525918800", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "2188753401875518777", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "861151538204493788", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) }, - { "6577112081591081699", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "6662263400328602558", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "12062286938998602641", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3532486493536194182", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "14486900605080248966", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "3986970741207127928", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "12055000818441091810", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "6473775431261965926", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "13358640031183139493", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "6917849789850282518", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "339005357927126796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "3341093105217903149", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "4002803423257090980", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "491985190756430188", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "7294200033269380787", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "4133961720345934549", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 6) }, - { "15578894483600472601", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "5153485325286526589", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "11666701706717643100", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4425021395842654484", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 340) }, - { "16007037430422291336", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "3766679421476531097", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "12016934279017055632", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 340) }, - { 
"6099288410648891214", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14944495584618629508", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9930151769697976322", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "2751149427305557313", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8308207826619932628", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3823293373281864380", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "10416260780913304720", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "2847588473935575710", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "6114169197348303753", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12362870423757408187", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "15330333360513835100", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "4337663535143862248", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12324726684926692530", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11152914598877675570", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17706702842712421674", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "8596083086448639289", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "3826763780015674157", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "13022765751713107854", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12054929554615967645", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 126) }, - { "8483866344820602196", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "17257458463329928325", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "17396276238049115844", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12139918033335162307", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "17613450189830338239", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 20) }, - { "466805001581651681", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "12375983338952375600", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 126) }, - { "2835926422026106846", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7562282591986948344", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10222410309423438801", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "9667626193041507177", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "5918842657011667447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "10197866743342998409", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "10481938393331020691", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18128936267842454401", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 340) }, - { "8968418225456926192", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "13004007524122679918", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "3587239831348133052", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "13594576107143259571", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "3622666399417827014", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 6) }, - { "12576876344393380361", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13160857254841009807", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "5445489344860863060", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1213577713645615257", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "17692282381799629643", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6196533506278647179", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8978870911977287031", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "6336679824344178824", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "6223842516539111057", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "1046547531196124397", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "58154090876617650", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12346479378618214663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 340) }, - { "2502439462576713842", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "4182038693129989035", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12115518620344827362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "604454303639822310", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "5453339018427413517", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9083797214718240599", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 167) }, - { "5886784323972875305", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "1077955953397294307", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 126) }, - { "1705252754140106824", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "14801234233433168563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "6099288410648891214", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9667626193041507177", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "8308207826619932628", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14486900605080248966", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "12362870423757408187", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "15507553344802827573", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 348) }, - { "18128936267842454401", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 340) }, - { "7491177930963608610", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "74789225791237471", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "13384934269447336301", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "13121630338540122290", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "15469602039104029406", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7160031288662381100", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7329115981778571341", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - }); - } -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_GT3_B1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_GT3_B1.cpp deleted file mode 100644 index 980aac5..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_GT3_B1.cpp +++ /dev/null @@ -1,1937 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this 
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "auto_tuner.h"
-#include "auto_tuner_offline.h"
-namespace kernel_selector
-{
- // KBL GT3e
- void tuning_cache_5927_B1(tuning_data& td)
- {
- td.td.insert({
- { "1375156980278317418", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "13455881643467418059", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) },
- { "12788968383428254917", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "13131740479277027362", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) },
- { "3390014193205017427", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) },
- { "1270467775674221667", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) },
- { "14462744723628661203", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) },
- { "8203171222962341018", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "9795194069954915563", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "13369603621524676979", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "13575423234109624706", std::make_tuple("fully_connected_gpu_yxfb_ref", -1) },
- { "10721885719016335538", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) },
- { "14567947256029724271", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) },
- { "10749263296616139689", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) },
- { "11717348577195224554", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "9275303306340702111", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "12245096462203481681", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 138) },
- { "4999505377862312410", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) },
- { "15272426400992401555", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) },
- { "9325097933807426691", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) },
- { "18238669114790278675", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) },
- { "6664482192233202590", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 267) },
- { "7454366978268164047", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) },
- { "16135569134646688251", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) },
- { "10572945270796129630", std::make_tuple("fully_connected_gpu_yxfb_ref", -1) },
- { "17495198214524203238", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) },
- { "5221108094913859739", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) },
- { "1092633914190498221", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) },
- { "2738256633362038820", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "16689586259416414782", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "1525652349412826502", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "17683302016987200208", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) },
- { "5615525527388396983", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) },
- { "3992735701291817771", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13208739898218342989", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "9536348721941264933", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "12803521018213865796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "8854783036772473804", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 149) }, - { "6766480740724769248", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "768423629375648579", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "4044100281521441011", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "873240542570331563", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "12875236165672036211", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "12008819728839685704", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "2486645741683554648", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "368578589584714524", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 159) }, - { "301201776306602054", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "13152181652632422771", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "10311747599696543062", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "11258322449556590366", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14095734330183410835", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "14910223536998380801", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "3352689317181436056", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 126) }, - { "15832740972576959202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 285) }, - { "14732184525012592889", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 131) }, - { "8421045774757048067", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 3) }, - { "941232110069825628", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "8975333906619899020", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14800592533315327674", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "11816277809167487786", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "957781751038897330", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10498289589469975939", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12970943403831707924", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "1300292367195167745", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "3399837016486623477", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16740871614208968868", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "71587235425438167", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12717047049023783979", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "10478482486372389470", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "6056581247196718403", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3780320160034246719", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 154) }, - { "2819320453491169732", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16976464773806576190", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 250) }, - { "13321672741246923341", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "15140532227060261467", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 275) }, - { "9400755775406101904", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 250) }, - { "10292585962794261197", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "13048561902713182858", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "3658425022428447440", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 272) }, - { "16947830954662293793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "8397584983137442239", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "1071169341660439058", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 132) }, - { "5326247361632903583", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6214194654733781771", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 340) }, - { "10025839973092358719", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16711955423531846725", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2915165824085219545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17108987360340581555", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "11972097635078477347", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "16926950874716567095", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "1212319037405620223", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12397280593466519809", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "2609454334520044465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "1336940384521633733", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "15271783562528081169", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9533360488591027707", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "6930697835136176263", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "14444423571297570985", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "12643423612381102003", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 79) }, - { "18423051691107460439", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 337) }, - { "15381833359831622179", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12040626513219974957", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10647227605517025377", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8127570953237266335", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12876112384009608387", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12663860560275361463", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12352923639732112511", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 76) }, - { "708452703070938673", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3217246278485567748", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15713964605078748923", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12293786134765875615", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) }, - { "16043683538361975370", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10670103699537731664", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 220) }, - { 
"17854578307286932628", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11443268857010762276", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) }, - { "4479117540570599742", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11726298758004767743", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 223) }, - { "2968031010495399536", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3797957937905580811", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) }, - { "1474271081523145413", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8526484907799590618", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) }, - { "13723543003759101485", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11728824117049687850", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "13268525255152984893", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "14397348576352573007", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "8616686489737649890", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "13176385389367548697", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "14990645740260870030", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "7472330881076141262", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "10892456883214928095", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "9522661528867955338", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "17856816245251319111", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 171) }, - { "14872992823083730615", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "3106591708459602370", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 171) }, - { "11609821372586026178", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "7678457226823073886", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "10118395047539851751", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "5389189982064081933", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "1742897526168249500", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "15331103261044247142", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "6644418194983229139", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "12478309735214802531", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 172) }, - { "18012549942299450620", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "11873734271080160669", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 172) }, - { "10424278617647597641", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "9553032671453999824", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "3860603464276263676", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1207026216972160297", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "9519623751582710696", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "10328182165125764988", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "2231648183489019418", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "17599383258252980421", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "16208488491972128275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "13379165253894817165", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "2566302789609970663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "1478419046264331178", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "3087801652564627458", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "16103943009195163681", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "14230385851791760020", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "15293727142789007900", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "13973179950424276578", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "713121569924250372", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "7947870656736319919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "1663285216972929652", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "14767888121198814523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "2124033349728954551", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "8762901342272872498", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "17006133396401462698", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "10783981060353445280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "15110359240685619357", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "7875272450497189442", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "3281207855459771997", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "11932770338770247767", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "15860915170591763391", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "11716771904412649891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "1095495157025479260", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "8402692278765063674", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "509781001842353609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "3255465741612432300", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "13439896617880328331", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "7134654288295280046", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "6769243149577568817", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "3480732841490521799", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "18269685060032395235", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "15649927926091502215", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "69439315851965666", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "156456996459945842", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "3012566432840424198", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "16431165572426232677", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "6324565723045697080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "5390559917122707732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "5469227748156438008", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "17163158934005653629", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "2307310127637739872", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "1999979442136861875", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "2527189070714658176", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "8329846097322076175", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16783619135298589974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "12214162812589030126", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "9216608098626790565", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "5179760459095053114", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "2452226948562393335", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "4499586349553581439", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "12668149981216388765", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "2287356884312581209", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "11115684531624462986", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "6483208845600234755", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "3752171257634205726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "1774158624592967937", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "16881283637687482989", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "14749947225382670869", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "7351733901977025859", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "435888248913413834", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "13713406612642090169", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "16582132711225619740", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "10436819182310112786", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "14546281065004619074", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "12558716383635737426", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "12609361477548272638", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "8107447526839063293", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "10995907213890714701", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "4871907623235871050", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "7394217382008802567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "3880189981766119529", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "3759057398165607194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "4561874206785244358", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "488298169768725160", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "12956726277674279950", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "7177837234452118325", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 8) }, - { "9057036344533510776", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5093049998173715787", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13761566845514364807", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "1594612401422787491", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14603590053512154268", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "10136369729388564720", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17050675313067213312", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "14221578799010900252", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 309) }, - { "11723735945517472199", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "13810995219720233595", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "2704063557078535883", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "10384537928514123040", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 310) }, - { "17427036330773218054", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "9796621763733208035", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "14046114605615338907", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "5763440554939527411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "12892693137085610062", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "17775705003104146872", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "14878347463243157447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "7368916076070115064", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "3499109651698979012", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "190530884420224257", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) }, - { "4202645222013675478", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "11324851661119942609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "6232363902828992968", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "4299492266819967844", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) }, - { "9481675228591993785", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "11772741918108731396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "18419183012101393192", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "17832542092610191859", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "11771014003680394135", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "9192665896782282996", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "9763310312421884308", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "11430400968543668873", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "3430266954211750407", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "7172604084103519563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "10306542963828398049", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "5235375820995365354", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "5091558853871982858", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "12914986936318857086", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "2265784112305305260", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 122) }, - { "9019388470685749691", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "12427258337646070422", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "15884763176333003771", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "7211355951470869591", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "15399245700982979379", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "12644942072153919043", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "5876880412336151866", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "13775529405693629438", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "9048522050692986204", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "10642327923162019888", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "6410682026872155392", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "9454954846682513038", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "16463823433924519300", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "7279393739634103483", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "13358283026528078900", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "8032685176029570383", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "949330876419581703", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "17713034180977313726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "472454322186482185", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "2727219457659794468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "7852745450437172519", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "6065819201836017182", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "15984885011101717258", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "14811022197918391667", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "16146350476627599543", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "16173557782125372935", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "296142385116663420", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "12655099960717366198", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "7937870623766562191", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "9367157746678824712", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "18062849937960759210", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "11919129623429545762", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "10522649794540845800", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "1104489643524273315", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "5419775002149092646", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "9226912483632588371", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "4958222070605478947", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "4479979951990338510", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 123) }, - { "12022152681602871455", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "5740738339752793113", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "12087141795291232248", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "17825280904760131680", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "3974589991022739479", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "1838534101161814609", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "10046663998164493552", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "2305461098719675735", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "16504962609450876148", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "6345550009198921347", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "11239754372812258455", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "4347816192417741558", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "17809920600993699808", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "16710010075465723498", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "17729546848373991614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "16998508915819714690", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "12952980509662451384", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "2683507674615735878", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "13059207969254830451", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "16295660312557315941", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "14089893422771228191", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "18034648276860485300", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "17739868787095417856", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "10880081193716628051", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "15916505622570323098", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "9101018613418825655", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "15650839696475698676", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "15628121900226431719", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "14554225625951128811", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "3134489458855347772", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "5627834277145735283", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "10729288973933590396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "10869005786136023160", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "5597908143491399643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "577182964135927041", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "16947969669087411530", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "861419637283812778", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "3643250372952944907", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "17977676737774695825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "10309504812060596568", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "8866736221671835567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "2133849627845285277", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "12793908914872030220", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15947699374684516369", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4660288622381620227", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 314) }, - { "15914512645931208899", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7460672405409009037", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 314) }, - { "1541754036637209097", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "89439319782574517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "14088382963493477342", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "18203935818408469865", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 314) }, - { "13191096881934434519", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "7918742312252115870", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 314) }, - { "15641537661939240413", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "157805434489791310", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "7941729567451949422", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "10628725059172743408", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4492673409319122180", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "15857087373591747006", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "13793441296561946357", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "5172712078329324967", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "8780604510524622314", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "1760690277175249985", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "13649894122307008732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17546566148752689536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "12675313398314286884", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "14621327324047759584", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "14136097914489095982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7638626850074132214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "9399994156762372761", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "18068050257421269408", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "11830297960718214360", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "14959566236432790882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "16884396694505987920", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "17947818179123182001", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "9381304526221508530", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "13932662890258900896", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "8268533335852735248", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17419874083634480896", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "12773693193167844110", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "5157249499936659040", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "4282661608732125403", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "3159147743553063163", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "1706927777850488363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "9839670675413379092", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "6780215829176686721", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "12972634653821069685", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "16129296588866116913", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "18202222342562516071", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "15426960908024585800", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "17026284168840448378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "18118237182023167949", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "11113256687741667688", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "10555597973766215754", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "17517495652165026573", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "1832310305089212990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "13855438905855887272", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "15349944413643626251", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "4738743763536059708", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "16611452077660879545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "8101977280003030465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "2012181953284568566", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "2969389503332309296", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "14515066741400300669", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "9373353053843326128", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "10023279637210292010", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "1103204698908514224", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "18092842590142527927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "12174571114411168588", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "14431607479949498164", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "10279778381617181802", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "4237276338897143680", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "8083672466967374860", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "16705621644424684055", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "5352861363832390974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "16945184617367657570", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "2995134938466176198", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "11706378390483804857", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "7958459862276998225", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "11703557271443535142", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "5020788604681810984", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "15217183882858251099", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "10650698451740924172", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "706370730287471796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "18199526506796726885", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "9269175963143039426", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "3691705516240577130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "13472532612464340803", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "12388375914105990324", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "11582534256623549131", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "1653274345637156919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "5893940382830835820", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "17700958439420868719", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "12730339458081890990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "6631816968511312100", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "7000524935770116969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "386749666417295495", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "7162575953766465459", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "11398019086259011063", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "3041612155708729812", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "4274801141127703532", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "4865023158176874622", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "18424912460022156378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "10408322429232132983", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "5277400567128489977", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "6848989271874647093", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "10085059621136526248", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "12962552332511702682", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "751912075185318190", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "4505008254511324231", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4191326605459754690", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "9824678205469832038", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "18245935804520236353", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "12309132521191764927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "12843671306854567956", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "8275277322582733101", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "13698389420396031586", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "12949204491386872217", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "7370273921473161914", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "941829593638869991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "16206791915939407806", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "1500571771538985941", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 83) }, - { "2095802691829304676", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "17542414935564676110", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "12380856644683171627", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "1451466106918423837", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "8071957466247137919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "11661208196482963286", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "6635217802203685464", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "265124365266629363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "9513032457323269513", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "11814740669468421049", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "5221320470007950766", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "14359530849521980269", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "6181651715051152713", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) }, - { "1450888744802985214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "2842103889477438816", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "14006248791647711759", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "7072606962946873975", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "3599823735065658574", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "11311859068168414878", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 240) }, - { "17525531790109748810", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "16749148369456398030", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "17556238490521153146", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "6067904130482758510", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) }, - { "1791615587935799399", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "12985650543127289023", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "6714886136800883594", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "220326805056361171", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "6777045876155144709", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "9454512817077883797", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "14011124615649605281", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "994489782629179836", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) }, - { "4338023436590582323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "1152693503778768433", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 240) }, - { "5994204139128667921", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "17243576882981097341", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "5524218746051008792", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "2669822154816760632", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "7179714714302073459", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 83) }, - { "13002363400738122017", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "17006095064160484022", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "13733327241591630239", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "2623687018437195679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 124) }, - { "14077148976508649021", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 123) }, - { "8272823732258536202", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "2451712485584835395", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "8057302050645780813", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "7430073011895298582", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "5095827462645341808", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "15129834325410878425", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 124) }, - { "9660812093766156608", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "15781622938833984014", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 123) }, - { "1089679781525023551", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "6129602738379919488", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "5287076386757143976", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 122) }, - { "16076153317792960383", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "2108296560864415762", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "17006655627343469372", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "9404677451270692749", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 122) }, - { "1372939511728986224", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "5311718276151327830", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "529543453251381109", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 122) }, - { "15591167992985613695", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 122) }, - { "15026219694198820614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "8258382025812748961", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 123) }, - { "14810839157236175179", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "16117738994809548007", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "659846949368492111", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "5211191663202250117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "13418701036204748812", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "9714764457768279762", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "17310332946322628458", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "15975964562807570772", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "13447028922679236865", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "8337820318779061494", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "18136765667969393174", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "14821616804286068969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "18386376129938707290", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "16609136488331186895", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "1996860183441418841", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "6491244517639245276", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "16312223896859176991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "17833517350994024381", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "4226968857681929488", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "5141753233513623264", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "6860503758000008398", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "16489624657475712467", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "7862815466573236157", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "10679760989906275129", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "852092858392507925", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "6996376303337512293", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "10978173291465325823", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "6670327979947471550", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "11318913630213187720", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "123251351612308092", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "10784073615329190425", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "2261453441277654139", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "2937907409658060025", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "7852144838267007144", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "4408772370026995920", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "15411474884532403722", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "9462315044265139531", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "6419580456182610836", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "12277470820821378855", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "16865879032845300007", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "2862999234347597091", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "15447513376965243034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "14420809655798184553", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "12954154886708228545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "7575634241190730697", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "2344498602308448450", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "4304041922043496030", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "10971070835319242371", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "4862529593282936100", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "5312140481706133684", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "15522785615618973614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "17798636687709019154", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "1938086876393565238", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "11897113890115321056", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "14363654136811880073", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "3928266232090746643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "15882969506682501496", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "16426179645101678763", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "18174857480705846286", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "598390166442977699", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "5522698342845820411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "11559360678008060513", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "13184662326021747000", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "16037141448095945650", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "15094664469997373662", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "822162932339827810", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "2597453794298356435", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "15851356529373376076", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "7966454753124154534", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "7311120574972466702", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "16461809076899645037", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 2) }, - { "11655994466278963438", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "6981537186704688907", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "7903891232234389925", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "4229105529069729944", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "12796777049340516563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "14289048840489035546", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "4239133538073498792", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "5103094815475470596", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "8560635685184432720", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "16264774056719724826", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "2571882179292959757", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "16758962840329202004", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "4550028191070279999", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "15661322183507404821", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "14650567822254940018", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "3755253206085028904", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "8751016391945753900", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "288853243482418538", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "5047419871737940985", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "8819268903800581706", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "3746573775462003750", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "16286085532892593349", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "16547425454653232058", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 124) }, - { "8195881973746570408", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "7712831597869354170", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "17035903590837750750", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "1907439276166837309", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "3036808833459559381", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "17928043901784474130", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 122) }, - { "14667209474639064623", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "1701609125136907870", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "2140514316203117958", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "9366201112659847392", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "7808544677773370430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "2251029128552117936", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "9529614587861271730", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "16811402686462277562", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "10554266898346470422", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "7817036102984218692", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "6329618009202266591", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "16936366288366370882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "8025053805734757314", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "534032316469702287", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "3963106895592011725", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "17994361454416813294", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "14902389080201926109", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "3796274347773622633", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "1306339989221885682", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "10900880512948479338", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "287386909600391846", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 124) }, - { "17542176922797334839", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "1081962464388501987", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "5831419373611158773", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "3179874645565098825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "14906458674793172507", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "1934379409955686502", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "10178951466584845110", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "12693511427898130707", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "18137106379929135901", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "11619548409913646265", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "13317417676446624018", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "16710651492402564794", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "10967218651864700933", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "5381578460674280089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "13026555349791486777", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "11913020016435860608", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "8260130048649729185", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "14133958262039763609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "5585398540591396124", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "16442107352245114876", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "423221712829930726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "13550435052563656432", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "2440366541074371090", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "8300655194765375060", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "13163146272900339330", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "5406129421969383274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "15118142492742177336", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "10727592780669452048", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "1076005730007872492", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "13699740641705514374", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "13054405729329143152", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "13503608041359512", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "14385185911482960528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "11215217005872946038", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "4099859307693687554", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "4408600136502382976", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "3037042229494600258", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "1155389358857780776", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "11461581290174106570", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "16896833230469488924", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "11469881811044037340", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "3003526572122876385", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "14251848023416168295", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "17248756229500447131", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "929378940515745198", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "12962558681443556219", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "4481903208484313806", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "13558618754911056302", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "11455518069358829249", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "15890473622821659630", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "6942622405269419082", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "13890118723041457532", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "11292995457386147494", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "5077214229434392730", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "17774424004510360936", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "10412588668458621135", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 2) }, - { "7334966010680206302", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4161141078006269526", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 278) }, - { "6522575549211855712", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5629373398445592781", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 159) }, - { "13374993751390784382", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 317) }, - { "12976499206227689731", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "9882204352209412039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "5041111302824362529", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "13869716373706247686", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6438522646185979880", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "2406816735581074778", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "8881150100883636392", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "593712935037568960", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "11970881115757095265", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 318) }, - { "5584432943673435454", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "4560479630843098090", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 344) }, - { "15374625876485618845", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 318) }, - { "13102754309439605192", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "17912189681971987483", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "8153567933591966877", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "1604661321386793876", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "8990561333549136048", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "12278364834477923930", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 300) }, - { "3122997634505472500", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "15669490019428002270", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 223) }, - { "116291934148608396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "14729854278671832528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 223) }, - { "10591379189397010097", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 237) }, - { "11929531534620071758", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 300) }, - { "1819720745131968914", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "10607904718265020949", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "913496537924971856", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "916389941321470163", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "1411786954276574458", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2730604806511016352", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 139) }, - { "5843679089588930933", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 140) }, - { "7304346312452588844", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "2423754482456771339", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3653156933813711765", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "10728212277329722684", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "877436308867220589", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18375125668176498051", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6767245864232675168", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9287404618748313247", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8728178019712933221", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "18251360413872841969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "18271689282126907793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "954796765467489259", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "13597240991532942069", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 301) }, - { "5079055505117153635", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 194) }, - { "4135003545872878882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 317) }, - { "11883485911218628865", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "2242915551775617989", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 290) }, - { "10556089809203693400", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 292) }, - { "3727142736386026852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 290) }, - { "1622880009460832832", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 301) }, - { "4437258459981739942", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "14691372262153587653", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "12181607120522804433", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "3159681096461848644", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "6729785110495533200", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 288) }, - { "15322019609805777935", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "7024495439434892956", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 197) }, - { "10416622008071151225", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 138) }, - { "5796500397424307442", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "15702382940521972117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "6093575518270471235", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "5805383505505929391", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "1801731858063091191", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "1559798212423183813", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) }, - { "5594180958505308003", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "4766071144928072260", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "8650948093564284852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "3883845471211207871", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "4366168099274266975", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "578703329577922869", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) }, - { "16863960779539003201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "15450609897480659306", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "8203550467004532364", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "7431849514656037251", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "14484890926084856480", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "7777333052643961206", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "4424217045094988504", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "7994179151788368291", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "15192024816519005250", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "4747159205186229582", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 97) }, - { "5485971317082563152", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "18128162750557822655", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "12421707187947291166", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "792684262493086891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "941626985322260281", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "11868551452004726281", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "14352303529756685990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "10702234389482091891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "3895088069642140043", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "5334566325056222430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "8306337702797456793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "15720507574336564201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "3277243911383750280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "18150429561058646714", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "11169292427557543138", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "13933912937625580405", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "8295126647635181949", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14213516751025324346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "16509472637458153234", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "16589607587365212240", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "6988674007771237080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "3448477246688526708", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "8507854696766492454", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 172) }, - { "8906588133431586825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "654122557966242717", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "10196332102593337214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "15831600396403741571", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "17808913959977434594", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "15548971488532746290", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "13468713306678453952", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "13613399861925108148", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "17802514063213000148", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "13093429681061786539", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "12247991248100147706", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "14491949194619001237", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "7590767013583950613", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "13210604117940125947", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "4670443882075998209", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "2857337999074313592", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "16036386660666696362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "755414184406250882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 129) }, - { "12190841837604350271", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "10292243973236220688", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "17793292063552633023", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "7605139219344415117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "787363431787954804", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "7000486794832106857", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "13608239208821071914", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "17281202179589913619", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "16985912104363932350", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 131) }, - { "14744368497944610864", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "3737552767159920174", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "3792945601873900927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "1364546124782880196", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "3689722043202617487", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "2632535010129224704", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "10968768803038046390", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "5353552956675518468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "7866128397931438774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "18233660940545931789", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "11670430946096342056", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "2627779045483019709", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11066913713501760080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2552187713769926425", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "654821507679356726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "7606728651572102823", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "7549378486471456156", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "15410074937424854348", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15114370307779942381", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2040762223425679479", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12112853999307505628", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4161612746310931789", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3388752887767453958", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "14046990030104971367", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "16230621843665445228", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "9274179337770060652", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "5115134711994944288", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "13898821685774165645", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "3007637520820789085", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "16294825599850364701", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "14681717813022425567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "4915831715914920982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "12894240573737168362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "5448537627319798272", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "14389915292223442327", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "14274685812676150168", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "7732899312577293959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "11956435900037329302", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "9263063714383940562", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "5824801192141531089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 131) }, - { "5608133987357542077", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "15392077168521832549", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "16446533347502650316", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "14762599606783897222", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "709835724029986012", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "1572991986657256775", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "7398196853452900099", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "8140094412609934765", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "2659031931257084418", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "4640028527711211109", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "18172711677056449158", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "5183231560876991543", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "6821855018718422278", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 132) }, - { "13237050834496100264", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "7164580481046523192", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "2490155559809645659", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "15430549683839591544", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "4553409514380460123", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "3041752019114501584", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "4161001033681779582", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "4764776977138392550", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 81) }, - { "6882621854468565774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "8881135571874888085", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "14038261392627717712", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "628191607060767879", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3511588484597779204", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6904130543085920483", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "7924408980408826942", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "9416186718345824095", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "14719421757340260468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "11936419502418995274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16601702334097258697", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4800587664660105589", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "5336120047683197088", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15897477855246170861", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9780938731831129283", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "1473214668483422172", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17515573322312447679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "18356980026934328781", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 290) }, - { "18077281411861416889", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "2543041530639980505", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "16370218798911151331", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 287) }, - { "17316626950179740845", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 139) }, - { "10414903047695486119", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 154) }, - { "2809950092498355574", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) }, - { "12011982029561277581", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 287) }, - { "11267742746905371769", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "12534001599784153836", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "1882052795393187384", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "419783127503173016", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 139) }, - { "14211903923555028634", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) }, - { "10892706534058849825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 287) }, - { "2345023488044002149", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "5754844816339228920", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 292) }, - { "17015791782274123780", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 288) }, - { "3706994659266083979", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 172) }, - { "13324157125165576832", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "12014527187730671229", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 143) }, - { "5170245731599664670", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "6854611304056079417", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "1954052357826969119", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "17824431042110985323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "3603706453982734995", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "11992353959766718397", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "15163327502374403643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "16758697697363920520", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 172) }, - { "10930115765550856328", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "14418429155823196539", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "1628593159980574595", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) },
- { "15675968397825708285", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) },
- { "9594594523961285945", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) },
- { "6634330132674952638", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) },
- { "8434794604559592624", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "3150231129728961455", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "12545558125736154584", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) },
- { "15485701086886851362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) },
- { "18005721959893562716", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) },
- { "490233152678323691", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) },
- { "4073467095502162430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) },
- { "5801429077171542466", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 93) },
- { "14841539539334726292", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) },
- { "9404953235624894187", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 92) },
- { "17995371099806008878", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) },
- { "8961138963663532667", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) },
- { "425744529089575241", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) },
- { "1316444335300814745", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) },
- { "761169277744593430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) },
- { "3325727286860556323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) },
- { "2526832080529662683", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) },
- { "15470013032930986062", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) },
- { "12255528292506999241", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) },
- { "13119479079474639169", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) },
- { "12813978452097969536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) },
- { "4991419288164762786", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) },
- { "18210370419559876426", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) },
- { "1616603916015535857", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) },
- { "14962768577232034246", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) },
- { "1452597292381229708", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "7104756264011682902", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) },
- { "7744787957569714828", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) },
- { "13503688893307029975", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) },
- { "9133263538092913983", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "1383899865465106141", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) },
- { "11829442945690098558", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) },
- { "12394049027081208902", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) },
- { "12159582810513550491", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) },
- { "17738299860390552088", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "797387385159110695", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) },
- { "8757900457181374694", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) },
- { "6048964584602891448", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) },
- { "17882819773586674851", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) },
- { "17829148383265978140", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) },
- { "14711697456265712456", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) },
- { "724953082687879224", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) },
- { "805221045541170643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) },
- { "8241070786700614317", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 131) },
- { "9191832520273617003", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) },
- { "12408889192918919210", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) },
- { "4885944395876887711", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) },
- { "2651385050387738902", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) },
- { "6303682540621797774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) },
- { "905780459938651623", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) },
- { "4476928353532757380", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) },
- { "13681462437496627948", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) },
- { "17243648226968859637", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) },
- { "11192356850081328892", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) },
- { "9323825370872655346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) },
- { "10000618285883395700", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) },
- { "6418327009347170687", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) },
- { "8528750110601691390", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) },
- { "8061914949376516780", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) },
- { "12992194515157698316", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) },
- { "17870874477143985774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) },
- { "16234606052818596502", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 94) },
- { "9148379585489720669", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 98) },
- { "9270950131920019932", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) },
- { "17001502418583498926", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) },
- { "11163107409437069532", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) },
- { "11465965972527519631", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "2534408579674556441", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "18109284647478027063", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "9849272539053219052", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) },
- { "17382660912493284320", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "7877332346656934022", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 297) },
- { "6323026044750482867", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) },
- { "9761573038170759563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 301) },
- { "12098146032672599222", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) },
- { "1403617451623027879", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) },
- { "9058996149754556268", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) },
- { "5864250949922222051", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "15847413004526420496", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 143) },
- { "3199841714087553410", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 159) },
- { "4957638663977636791", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "9437794960375526230", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 319) },
- { "9475130054420979752", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) },
- { "13312514874803986753", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 317) },
- { "15997754881872769378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) },
- { "1941341635794709702", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "10157866834809927320", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) },
- { "12308359047798183133", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) },
- { "2986189945936592561", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) },
- { "6928835003016610382", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) },
- { "10084794570892043447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) },
- { "15417738436777481469", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) },
- { "18377298651236993830", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "7354234812009979811", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 102) },
- { "8656468860180713379", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "14472187692485966933", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 101) },
- { "397770940444464146", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) },
- { "14258499419905714808", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) },
- { "17599396373608265826", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) },
- { "12935563359569230797", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) },
- { "4892959859293355837", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) },
- { "2802810524370514276", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) },
- { "10290107543739998181", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) },
- { "14907097142953816744", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "2525260242689556544", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "13328449155966085543", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "11856266545854830143", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "15993427814066246646", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) },
- { "2100891581797371600", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) },
- { "12242618640422208652", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) },
- { "6133592828563353516", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 306) },
- { "18232278892738147217", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) },
- { "11992625045241269569", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 298) },
- { "12601126285773042005", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 298) },
- { "7457899998356343871", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 198) },
- { "6343888265369366589", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) },
- { "10791067159964399241", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 309) },
- { "11327097771110264965", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) },
- { "5245308722062496788", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 312) },
- { "10792503079194374004", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) },
- { "4818231379191523896", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) },
- { "2198278382394812839", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) },
- { "3800011935243649447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) },
- { "9631545863582097486", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "1779941298820543013", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) },
- { "3621930417735246405", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) },
- { "14435120971846098308", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) },
- { "2893564501191050837", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) },
- { "8108843303778211282", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) },
- { "3682813162987778705", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) },
- { "15494543914974994991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) },
- { "7565221050911842393", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 90) },
- { "5629670679897666607", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) },
- { "11754316727756881612", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 98) },
- { "10990741293315393791", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) },
- { "17024388383581997032", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) },
- { "10302338806536775954", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) },
- { "7915318733663535312", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) },
- { "13702692566238948173", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) },
- { "2909728331855309274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) },
- { "13071545223094862275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) },
- { "9631481972809246378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) },
- { "13540002981450186147", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "7076937538747704750", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) },
- { "18043340998699622388", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "7148542290597073512", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) },
- { "9040046051053703359", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "1077773457856682663", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "4716188972902735458", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "17343050785312683560", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) },
- { "5687802882700097624", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "3524531620118359828", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 138) },
- { "5688478347124565305", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 291) },
- { "5504757952698692953", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 288) },
- { "13800387305792597325", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 288) },
- { "6574971185849732667", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) },
- { "10573920781439771673", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 154) },
- { "4992668316921598993", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 292) },
- { "15778834188130183853", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) },
- { "3062101811226530720", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) },
- { "428659495445490820", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) },
- { "956022649859563080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) },
- { "13410850301164057911", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) },
- { "17423645390621980919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) },
- { "7802311886554362782", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) },
- { "1172103288112689821", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) },
- { "17353894529222574441", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) },
- { "16431857516454692096", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) },
- { "9100044555742394133", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) },
- { "13115589642140732066", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) },
- { "16190949264253468961", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) },
- { "7026575758396092435", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) },
- { "16761856644242716357", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 90) },
- { "6341197991729122563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 195) },
- { "17087740929472936216", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) },
- { "10795104632256101599", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) },
- { "13327653786981478088", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) },
- { "1096671695414716274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) },
- { "10774528268153772208", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) },
- { "9525853014023664813", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) },
- { "10632020369698615114", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) },
- { "3234107167862677811", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) },
- { "8708643228914766202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) },
- { "12415368596357091523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) },
- { "1028160614515220430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) },
- { "5927467766675317093", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) },
- { "5275016494706355806", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) },
- { "10947686124973711385", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 54) },
- { "3349519148124496343", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) },
- { "4003433148846544263", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) },
- { "11718418772370938734", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 89) },
- { "989564341557094953", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) },
- { "6942049339361951275", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) },
- { "14555883089089918919", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) },
- { "15320845027635796583", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "4014667229872705228", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 68) },
- { "2438374917504708831", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "3272017687600371031", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "16067605128297748820", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) },
- { "14150012830816329527", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "804195263636995800", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "11528417522960871233", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) },
- { "15378025640603637387", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "12860222041026638681", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 285) },
- { "12725647706191463348", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "12553441041059632729", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 68) },
- { "12782191856884962803", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "15824189967727245909", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "11149782181562145291", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) },
- { "2653651564133701304", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) },
- { "3526580286148537369", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "3985659568982275663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) },
- { "13642146548740074992", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) },
- { "2349007644347065353", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "6146876760962332928", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "17434429579652310107", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "9447458159095730492", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) },
- { "8655883535274781128", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "7272538316511343863", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "17564338309805484464", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) },
- { "7881187047171099732", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "15579919505002150556", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) },
- { "11583017348580874022", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) },
- { "17915846724151945664", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) },
- { "5319668297345215520", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 129) },
- { "17208186152576814861", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) },
- { "3633858263279042265", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) },
- { "13853056718266488510", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) },
- { "14759179293743468995", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) },
- { "16995873636564597028", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "9438739171104456179", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) },
- { "14429081455612806819", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) },
- { "9819596940685093690", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "12085348936192462321", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "11951606039079763598", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) },
- { "8769060267707904998", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "17104611871050967957", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "2103882464623009432", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "2659712601063515059", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "9759380701896779097", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) },
- { "13842309033760176194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) },
- { "2418288192668085805", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "14994322266840011040", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "16402312692470500253", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "16955653765071712611", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 344) },
- { "8739347545059610410", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "13459514533473657102", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) },
- { "7824524940405130010", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "17796310681498690253", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "14823616678465136590", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "13816104794723484993", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "846088275031979661", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "18125732229366977468", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "8464582977975377118", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "6290317420155851465", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) },
- { "12696412964119109465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) },
- { "4994591211723226974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) },
- { "1036010477232750453", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) },
- { "13786357802945430475", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) },
- { "1003101267609305257", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "14991602704357959545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) },
- { "6181308879301978465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) },
- { "15488550074426713959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) },
- { "4062706195708729345", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) },
- { "8594644182487917002", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "15881381297320383917", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "6040286126398028933", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "13926122593957480821", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "6213386558868267629", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "4456004887590847716", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "9642229389394495047", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) },
- { "18259656768460999562", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) },
- { "4983880246908724272", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 344) },
- { "7881579844586294503", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) },
- { "5331173521406046122", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) },
- { "3285520504090196295", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) },
- { "7143510787416483146", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) },
- { "13104509059416300615", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) },
- { "10090923790949378407", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "3429844423226609965", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "706049518431331645", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "17193614571243427089", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "3621424752591567930", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "11066930104187448422", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "209732971447020989", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "16044646335477470657", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) },
- { "2172121470071868949", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "3392693938352572136", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "5495063314176654751", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "14553856088069405595", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) },
- { "4967444801764057340", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) },
- { "12160764253455777655", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 237) },
- { "17723621158215826108", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "2171768477223405739", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "12672995204641007004", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "5622089373755094139", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "2129726780118554358", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) },
- { "4160656836528944651", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) },
- { "11052732052072367261", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) },
- { "18432787283148809023", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) },
- { "16172528828198474326", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "16327433707667075261", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "2797723586312707948", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) },
- { "8451212914744825089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) },
- { "7025975403069487257", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) },
- { "8913950860101596091", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) },
- { "15308578014507211237", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) },
- { "13132804928635689780", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "4465781406991476376", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) },
- { "16266491618150971928", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) },
- { "181006047500375768", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) },
- { "18140951659547259039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) },
- { "272730229972987861", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) },
- { "14898892437285105327", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) },
- { "17252449599613270108", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) },
- { "13436376034548670107", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 371) },
- { "13787436604877398090", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) },
- { "8873614802459592665", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) },
- { "13663893159182636270", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 319) },
- { "1361159591875955678", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 312) },
- { "5912303851874077576", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "16245760498096322525", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) },
- { "9928406318940388716", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) },
- { "3036512701943687724", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) },
- { "5334291640387922287", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) },
- { "3002986032379998259", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) },
- { "16469788155263456039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "8709632541892447149", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 284) },
- { "9524303276541517389", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "9354818521586974021", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) },
- { "16781127329510211966", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) },
- { "6351572488552853754", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "907036267078333137", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "11855070245618904113", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) },
- { "4544242784357021697", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "18218631037214746168", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) },
- { "178353385245384751", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) },
- { "17658152048177750315", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) },
- { "11636129433022017868", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 216) },
- { "2622434279674583815", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) },
- { "14335074487552883436", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "11175955260573469979", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 159) },
- { "2732519635571994212", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "13893789954946953427", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 119) },
- { "4355933224673863178", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "18037918102910297531", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) },
- { "16071723603031305677", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) },
- { "1697248235682953135", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) },
- { "7843498978148810586", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "6767159196241633301", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 116) },
- { "5097818987523855112", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "6623182990939010641", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "6711878663358611849", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) },
- { "8671491767142900139", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) },
- { "12164298124869114517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) },
- { "17089801601582809764", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "75742659105146536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) },
- { "4652136280940317116", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 220) },
- { "9751582946441607796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) },
- { "16706244336960642883", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "12581879452540858313", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) },
- { "17443356777503458523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) },
- { "939718260623752240", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "14131851237755716991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) },
- { "7474639594232203854", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) },
- { "14152716242882609401", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "7998930863626763670", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) },
- { "10323345824599612614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) },
- { "30229601562833524", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "17788367809717898285", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) },
- { "1509728225855233852", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "13139625572508441980", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) },
- { "16491532291908469567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) },
- { "6355395905401306995", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "2096779676054335057", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "4217179485243909459", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) },
- { "17101789600628162503", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "6139574161497189424", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "16559140502701231107", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) },
- { "11459784003592366395", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) },
- { "7869916853707978306", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 85) },
- { "3889519976910355277", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) },
- { "12081835728078383819", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) },
- { "14923692894655929923", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) },
- { "8132521728369930959", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "16108573960501496757", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "11086699387784339943", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) },
- { "4013707396889204359", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 107) },
- { "11850332373794932468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 107) },
- { "14763982961176216679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) },
- { "8207349115037232863", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) },
- { "3273748387141431306", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) },
- { "580936360000782237", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) },
- { "10682918518101379579", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) },
- { "13178480813522103091", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) },
- { "11254635684957519432", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) },
- { "16816222375242496370", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "12809199739984715013", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) },
- { "5040730152867713388", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) },
- { "10429613013253088132", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "15451919862187018297", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "7546586420552408243", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) },
- { "14487682847898298214", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) },
- { "3106710091841093202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) },
- { "6458124573210430792", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) },
- { "9182897385081081193", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "14462438074931673266", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) },
- { "18133334552107213128", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) },
- { "38736266675995457", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) },
- { "13654816209891478730", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) },
- { "6263019986730305851", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) },
- { "12929981792125924963", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) },
- { "3138374672801504481", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) },
- { "17009318615658405230", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) },
- { "9421643783312790618", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "2294026590516781945", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) },
- { "2940027113687311893", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "6090625728451718945", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "5643908654122573882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) },
- { "9065894438656900887", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) },
- { "11185156002426041243", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) },
- { "14670068483447729857", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "4623542918584461522", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) },
- { "1143214652021653634", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "1434535531617424039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) },
- { "17025268985366223779", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) },
- { "11507538232733291666", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) },
- { "6149673627320838019", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) },
- { "16474284418841532356", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "12461575861709234385", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) },
- { "192209423643075326", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "15490478608105402679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 216) },
- { "3491333679577961640", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "8176012042686275874", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) },
- { "4282198629458668761", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "689445825453914111", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) },
- { "969746749329671447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) },
- { "16833026567865627676", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 223) },
- { "13046322179198317310", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) },
- { "6902644989079870993", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) },
- { "10987953316324712538", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) },
- { "12515465135362865565", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) },
- { "10049571207493913006", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) },
- { "3926585856863002495", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "11275109735493317886", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "12238674883388043717", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) },
- { "16108759090923335184", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "11756881293845417212", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) },
- { "17839839336294937155", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "4703107905652287491", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "18180820925685532104", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) },
- { "3835286851569826052", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "7807983899017500046", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) },
- { "10294185397756053636", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) },
- { "5519535335798045279", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) },
- { "8701248964531180496", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) },
- { "291868903926685441", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) },
- { "15239764240622554314", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) },
- { "15963038745470172423", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) },
- { "11428599290755097395", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) },
- { "3180320769716158201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "583303098958523195", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) },
- { "318377908569897093", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "7353563160591978243", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) },
- { "2582625260054352916", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "5609922876429907954", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "12557015880639217508", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) },
- { "11528310408333718862", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) },
- { "1471837664358450291", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "7351401242363888463", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "953306082374100275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) },
- { "15759530339367380982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) },
- { "13300022131572486202", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "15689502054035168040", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) },
- { "16969463538496570528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) },
- { "10237524128771958432", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "7969848911698660033", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) },
- { "7130694811424715594", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) },
- { "8578747191812631883", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) },
- { "5197105253412476591", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "3120553928584920777", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) },
- { "4750894407873652809", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "12667014405537239093", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) },
- { "13644681270630373984", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) },
- { "7187734276051878356", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "13253775441326432265", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) },
- { "14733510474010040334", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) },
- { "3336303478756453360", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "16352331970945217438", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) },
- { "13484950419220835364", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) },
- { "4674416595144505741", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) },
- { "14559308665571750465", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "4542143431130171516", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) },
- { "13189392239349392492", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) },
- { "7009735776703529573", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) },
- { "4220826666482500445", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 346) },
- { "14792528369891965810", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "15287650965861631130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) },
- { "10308175009371219583", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) },
- { "2903605246599054308", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) },
- { "9213563311267466388", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "5019077257951332016", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "2497756607567197523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) },
- { "9285566577169147378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "3432296808755992670", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) },
- { "7688176479120305539", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 166) },
- { "8818070832398055086", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "8787816339967963727", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) },
- { "863952266514375915", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 141) },
- { "5835634465164771899", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) },
- { "15101680837342453931", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 16) },
- { "1116274074896622552", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) },
- { "12790788016297794214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) },
- { "13538051178827008933", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) },
- { "16403423801823379909", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) },
- { "3723613341885592267", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) },
- { "3830703844770425343", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) },
- { "40704767167309552", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) },
- { "13973028408397200796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) },
- { "16561224775421968533", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 43) },
- { "11243840588602365090", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 3) },
- { "14103112843209793966", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) },
- { "10483664832302187567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) },
- { "8100595788531468781", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "6620782733027313312", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 2) },
- { "6949539207944972855", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "11207257238719531888", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) },
- { "13898284586432291433", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) },
- { "5120466856097219243", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "4197617702037834389", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 291) },
- { "1249137685908951501", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) },
- { "14716719350966652036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "4840004190985490064", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "1540041682425757361", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) },
- { "3715177305271762194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) },
- { "10001963042016663554", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) },
- { "481328129206881674", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) },
- { "13404888565084206853", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) },
- { "12348135936862667024", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) },
- { "15471470494305051299", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) },
- { "6181272224000872375", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) },
- { "4701832665603867798", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) },
- { "2030309697153345387", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) },
- { "5644068493155655611", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "4867937397499803072", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) },
- { "2702144517025248597", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "3304589333915676807", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "12894625941923144893", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) },
- { "11649407835105973949", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) },
- { "4897991181236908768", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 191) },
- { "12179581684777023804", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "2806529556090896246", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) },
- { "11327228813412934262", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) },
- { "5485749317130402302", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) },
- { "3499243120652875549", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) },
- { "10916647716124396856", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) },
- { "5749536453225343663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) },
- { "789359733867650915", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) },
- { "12626014184575881530", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) },
- { "1201692134690347847", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) },
- { "5219399418946822456", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 302) },
- { "14217181622713951411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 300) },
- { "13025323039227543550", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) },
- { "6114147683777615071", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) },
- { "2355214244972870639", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 152) },
- { "3167336012388169649", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) },
- { "12218337369633748663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) },
- { "7264756313770306662", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) },
- { "10492056481694320580", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) },
- { "14281201038135286621", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) },
- { "8127190765748950828", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) },
- { "142486914279119363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) },
- { "1532263118203058517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) },
- { "5482851829165191681", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 270) },
- { "10548792624072794724", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "4239415134522959352", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 250) },
- { "9028970753877215614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 131) },
- { "2324120381399737261", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 154) },
- { "10267260789603562117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "9988801796928462423", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 240) },
- { "12516911293946682547", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) },
- { "9213886570531053949", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 53) },
- { "385046297070779752", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) },
- { "12541834857357563605", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 125) },
- { "11709992724966310174", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 121) },
- { "17222005830854879661", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) },
- { "475043738497218394", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 139) },
- { "1071007164550012186", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) },
- { "6719302427415173754", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) },
- { "10482582307328548806", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 3) },
- { "6351347283201596793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 96) },
- { "6531171505861182429", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 4) },
- { "879005904827468163", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "16290626406346691996", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 282) },
- { "4569338575782832784", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "7575675354187625951", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) },
- { "5795073619189010837", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "15123868617509445149", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "5601435819039968726", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "14104238386345631681", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "17377293745073971167", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "12134712464763856064", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "5524215233998361104", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "1103228955716492167", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) },
- { "8618835732380720921", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) },
- { "15908673392788376468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 264) },
- { "8482147530539941792", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 269) },
- { "9069334144391048686", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) },
- { "12493863403516600413", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 273) },
- { "16692569816843207989", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) },
- { "3438116423688595487", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) },
- { "15602863681196390535", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) },
- { "18277685132620834972", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 152) },
- { "16541722316343690197", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) },
- { "875142032423622622", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 123) },
- { "8965747921518186477", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 3) },
- { "3067806959725855130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 137) },
- { "5779388310240896974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "11092828091552833150", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 124) },
- { "10295330953350618042", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) },
- { "17791773192152464021", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 96) },
- { "4894227264080887361", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) },
- { "381149736509958403", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) },
- { "13603318842632052764", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) },
- { "8929453032482114162", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) },
- { "7662200927459001757", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "11473442921040533207", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) },
- { "388828310152538138", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) },
- { "1643241486250690844", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) },
- { "11806105193035393795", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) },
- { "8843585527713905568", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "13248567106128518549", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) },
- { "13708979487306970634", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) },
- { "14406070210216948643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) },
- { "15352245788978088971", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) },
- { "1435153323458789173", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) },
- { "17638692805430115529", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 346) },
- { "14068780861332616363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) },
- { "6656593119788274992", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) },
- { "14695781272831602408", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) },
- { "15696910741835640150", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 348) },
- { "15315327794058441258", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) },
- { "7545013298074733778", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) },
- { "4026686872534942904", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 140) },
- { "6553736978928374036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) },
- { "12129572274423886770", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) },
- { "9723314434598141024", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) },
- { "11031625790234068916", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) },
- { "1138439260035360722", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "8323445733669842657", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "54019631544204590", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "8971115542951085891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) },
- { "4584970211859494304", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "9321208819255762521", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) },
- { "12617625046664709483", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) },
- { "8264178890341675354", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "5334190564423375247", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "14746359019867963124", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "2044363708106765326", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "5132761922124425835", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "8141428150264829362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "276407276027553756", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "11878734040194151073", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "11622925573287101001", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "3192332625020432602", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) },
- { "9785114056964539323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) },
- { "9410978119783758141", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) },
- { "12523676912856063091", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "5912451559447635837", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 133) },
- { "10264913782610095832", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "10309083227104422150", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 136) },
- { "8500148569566077929", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) },
- { "6578908625437515675", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 23) },
- { "13762042713029963144", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "1561225943337590599", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) },
- { "10917498758625273194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) },
- { "14335423820860953927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "4428101657497677982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 125) },
- { "15901675909820977223", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) },
- { "7962991673727743706", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 2) },
- { "12141300895511301068", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 139) },
- { "17106086048442658788", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "12707946849050970702", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "17154337492545826355", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) },
- { "10109431802089940590", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "9428176632140441528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) },
- { "52089503050497755", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) },
- { "12297371032753209816", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "659150305191479097", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) },
- { "2065752819810364738", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) },
- { "13583166868754499339", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) },
- { "13991205023798493715", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) },
- { "8939683514448064461", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) },
- { "18337160891834020517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) },
- { "1154228007901031779", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "15156525717629023944", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "7757331094141318304", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) },
- { "16779678846332091086", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) },
- { "5409924335138540834", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) },
- { "4149728557142033774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) },
- { "6443517114667332732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) },
- { "5419041493176804960", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) },
- { "15948383678216076358", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 240) },
- { "9604982746455852556", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) },
- { "15739274921308457528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "4642234334824303290", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) },
- { "13200151444914751729", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) },
- { "16894871557229780934", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) },
- { "9933958860597451711", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "17094948685292534952", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) },
- { "9762182215179534181", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "18273537339378756543", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) },
- { "7720939595094113814", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) },
- { "5865480930796299143", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) },
- { "10058165874008941852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) },
- { "17309326904418811234", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) },
- { "5592428580503282095", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "16348402367953880206", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 240) },
- { "13607830451968188080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 171) },
- { "9311802150474489673", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) },
- { "5159470523468873105", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 171) },
- { "7975810844103449438", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "11455843788148231615", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "1410630713443793537", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "17303408650780384587", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) },
- { "12069726772532946193", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) },
- { "6204183474669103812", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 136) },
- { "12874626654611400042", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 240) },
- { "13546876216568825877", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) },
- { "2973436171295280783", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) },
- { "1908809004094565452", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) },
- { "2322559721899919275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 125) },
- { "5766507688771440170", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 53) },
- { "16626226341188424071", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 126) },
- { "14224121742920800990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 126) },
- { "407189201971322683", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) },
- { "8460847842045253466", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) },
- { "879896719155824868", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 290) },
- { "5219048275475447369", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "8707189142909022305", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) },
- { "5948701218437980356", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "17050143605017295447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) },
- { "8906185843274300447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) },
- { "8321769923556905957", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) },
- { "10433541468308381909", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) },
- { "10405183426600618231", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) },
- { "14885109535362957947", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 216) },
- { "72444706264681262", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 220) },
- { "16818714747882774917", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 223) },
- { "16236397968499692493", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) },
- { "700717277178942679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) },
- { "482564204402769504", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) },
- { "3221221905804708596", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) },
- { "16467987800266816984", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 216) },
- { "11599932445375240727", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) },
- { "5057534502588100071", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) },
- { "15640202505592598653", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) },
- { "3355259926747524578", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) },
- { "9226443907548972870", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) },
- { "8104309105061227444", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) },
- { "18384657372655350144", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) },
- { "13739257060165119132", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) },
- { "9810904714798127155", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) },
- { "15609627722687211129", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) },
- { "14738573151275130683", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "9421927854269492263", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "15962533525948221648", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 154) },
- { "15856268902838573812", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 195) },
- { "4085450203909854919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) },
- { "2370837049876630969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 267) },
- { "13464226348405628455", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) },
- { "12228963567837353733", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) },
- { "10377729875228238588", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 197) },
- { "16362139250976572928", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 195) },
- { "5420766967862917815", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 267) },
- { "14578291812739325465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 268) },
- { "18310667924071639899", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) },
- { "16853250891250756537", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) },
- { "12990341489637414845", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) },
- { "14630499010941056793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) },
- { "878892264408839067", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 268) },
- { "9259437778054905599", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 267) },
- { "14974730512607138726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) },
- { "3600066510593746268", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 268) },
- { "3140230065585683313", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) },
- { "15891662883560480723", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) },
- { "11284755586130392759", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 132) },
- { "2281119269283845320", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 196) },
- { "12246408434917478929", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) },
- { "13283842370311517843", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) },
- { "13753473508578037346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) },
- { "17123153447808465303", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) },
- { "10700011669103135203", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) },
- { "9979259596137305973", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) },
- { "17225578855755054959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) },
- { "6471563320494376693", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) },
- { "8146945902795164796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) },
- { "18372284940315010254", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) },
- { "2194607895573544953", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) },
- { "1332624116953483870", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) },
- { "158222105675022402", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) },
- { "6830387121684699972", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) },
- { "11077503608116183709", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) },
- { "17847109385592002207", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) },
- { "13384754476437374504", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) },
- { "11462462742322068863", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) },
- { "4265693151382066296", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) },
- { "11070620435959083971", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) },
- { "6982733543386888622", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) },
- { "3563614453014995411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 126) },
- { "3498490999014554104", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) },
- { "15595549493819416194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 111) },
- { "14532844474906286088", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) },
- { "9562291747339451180", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) },
- { "6772239376357727149", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "10690972785852373520", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 166) },
- { "4488336106517889531", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 81) },
- { "10058614204420018541", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 3) },
- { "13865227850818392065", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) },
- { "14100870590396726248", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) },
- { "10848277915422577656", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 43) },
- { "8121179472578287280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) },
- { "2502125887857336825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) },
- { "13192808619929896995", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 4) },
- { "5115661026367632863", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) },
- { "12812685418923919055", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) },
- { "6293403765897901528", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "17596685300497748803", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) },
- { "2150326211917340956", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "530491406341772040", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "15197248015210313435", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) },
- { "2816353973187452604", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) },
- { "16383540667048742064", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) },
- { "16820082917500285799", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "6820284286806022849", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) },
- { "17285815901490707654", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "994182747184593564", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) },
- { "6642767323474835034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16",
333) }, - { "3215659303601163167", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "54975980454651672", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "11529876081402974396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "14026570177552137240", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "11686670048744589243", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "6678796313875454849", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "641417817126876622", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "9622546530872848323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "9194788897910888066", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "522181557896569275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "3332334993503432420", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16131448347558322280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "6585223640997887253", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "6205240287062600210", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 238) }, - { "17522452942286240233", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6571438978296387721", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 238) }, - { "15511138074959300404", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 236) }, - { "11107930597263802755", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 270) }, - { "10320711719466983961", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 238) }, - { "16884228931101540030", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "8253823502854784432", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "6025872155179042054", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "10173283505468233128", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "16094174852600023296", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "11077876432364512822", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "10586018593856542117", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "3109104171383198425", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "18136135457402651842", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "11834683513280095384", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "4806571630436601566", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "14849108908297747749", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "3644282167178264526", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "360872770877634346", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16720108310653948550", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14353390922580547467", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "9868561386826862471", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17465517455679097501", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "5570311824197099845", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "7524311370696987092", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "14070988879848388270", 
std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "8296551195150971668", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14352796912241296357", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "9840495023131952174", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "4720851194954041037", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 347) }, - { "13852065717057446998", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4342360467977736802", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "16336482874764861478", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6075691042233712335", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7570346182940928159", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12971822824884826169", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3033264172690274208", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "17301887391757619741", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "15790005937034794347", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "15464327246951632247", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "5659168916726488798", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "8079376692609682448", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15160738482264643601", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17900257435531434807", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16789135236017252073", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13224814158106791463", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5078905972285278557", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4196367396954155354", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7009873605945341897", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7199295899520406795", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16833854122884184025", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 336) }, - { "14599780481362761532", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2572395498687401679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 336) }, - { "11810221946429451169", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "18084635102736402756", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "59739211822469868", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "5240181393417899912", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "15962137123591591534", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "10989937450490049763", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "9798585825695496550", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9220830217525628783", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "2235210915304938149", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 187) }, - { "3930314908786112883", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 284) }, - { "1334070221835422461", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6681818065741882453", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "6980201892073961793", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "11530101016435264783", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "4801117903303888658", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "5782934278345953016", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "13951717514084457087", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "2721793280965260548", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "8124736388338424498", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "12223993560805441284", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9860570706348640782", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14043770215999952932", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15277856047844308598", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8048617952947915835", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11446745541571732900", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17422822627612865758", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "13954144830230671601", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "11198908896401597838", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "5582896843095691256", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "8133587696326295326", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "2007192658799516915", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "9492402787848610840", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "10515519878978734341", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "16706121580364790904", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 297) }, - { "5495776091407365966", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "16430562172386510259", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5673972310424776040", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8797843396807284399", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "1698321314111848001", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "5762290464889692462", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) }, - { "3218248162832023196", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "12988961529988078346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 220) }, - { "4232250144427804891", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "11683680166617045816", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "6252429564537528709", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - - }); - } -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_ICL_B1_B16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_ICL_B1_B16.cpp deleted file mode 100644 index fbf1719..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_ICL_B1_B16.cpp +++ /dev/null @@ -1,1823 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "auto_tuner.h" -#include "auto_tuner_offline.h" -namespace kernel_selector -{ - // KBL GT3e - void tuning_cache_8A52_B1_B16(tuning_data& td) - { - td.td.insert({ - { "1375156980278317418", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13455881643467418059", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12788968383428254917", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13131740479277027362", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "3390014193205017427", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "1270467775674221667", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "14462744723628661203", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "11717348577195224554", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9275303306340702111", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "12245096462203481681", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "4999505377862312410", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "15272426400992401555", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "9325097933807426691", std::make_tuple("fully_connected_gpu_bs_f_bsv16_b1", -1) }, - { "18238669114790278675", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "2738256633362038820", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16689586259416414782", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "1525652349412826502", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17683302016987200208", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 312) }, - { "5615525527388396983", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 343) }, - { "3992735701291817771", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13208739898218342989", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "9536348721941264933", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "12803521018213865796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 277) }, - { "8854783036772473804", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 266) }, - { "6766480740724769248", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "768423629375648579", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "4044100281521441011", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "873240542570331563", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "12875236165672036211", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "12008819728839685704", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "2486645741683554648", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "368578589584714524", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 159) }, - { "301201776306602054", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "13152181652632422771", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) }, - { "10311747599696543062", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "11258322449556590366", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14095734330183410835", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "14910223536998380801", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "3352689317181436056", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "15832740972576959202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 287) }, - { "14732184525012592889", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 132) }, - { "8421045774757048067", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "941232110069825628", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "5326247361632903583", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6214194654733781771", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "10025839973092358719", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "16711955423531846725", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2915165824085219545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 143) }, - { "17108987360340581555", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "11972097635078477347", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "16926950874716567095", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12643423612381102003", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 108) }, - { "18423051691107460439", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "15381833359831622179", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12040626513219974957", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10647227605517025377", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8127570953237266335", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12876112384009608387", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12663860560275361463", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "3217246278485567748", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "15713964605078748923", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12293786134765875615", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "16043683538361975370", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10670103699537731664", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "17854578307286932628", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11443268857010762276", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "4479117540570599742", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11726298758004767743", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "2968031010495399536", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3797957937905580811", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "1474271081523145413", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8526484907799590618", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "13723543003759101485", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11728824117049687850", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13268525255152984893", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "14397348576352573007", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8616686489737649890", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "13176385389367548697", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14990645740260870030", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "7472330881076141262", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "10892456883214928095", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "9522661528867955338", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "17856816245251319111", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "14872992823083730615", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "3106591708459602370", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "11609821372586026178", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "7678457226823073886", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "10118395047539851751", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "5389189982064081933", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "1742897526168249500", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "15331103261044247142", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "6644418194983229139", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "12478309735214802531", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "18012549942299450620", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "11873734271080160669", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "10424278617647597641", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "9553032671453999824", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "3860603464276263676", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1207026216972160297", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "9519623751582710696", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 120) }, - { "10328182165125764988", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "2231648183489019418", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 120) }, - { "17599383258252980421", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "16208488491972128275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 122) }, - { "13379165253894817165", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "2566302789609970663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 122) }, - { "1478419046264331178", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "3087801652564627458", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 124) }, - { "16103943009195163681", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "14230385851791760020", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 120) }, - { "15293727142789007900", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "13973179950424276578", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 121) }, - { "713121569924250372", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "7947870656736319919", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 121) }, - { "1663285216972929652", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "14767888121198814523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 122) }, - { "2124033349728954551", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "8762901342272872498", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 121) }, - { "17006133396401462698", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "10783981060353445280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 121) }, - { "15110359240685619357", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "7875272450497189442", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 121) }, - { "3281207855459771997", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "11932770338770247767", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 124) }, - { "15860915170591763391", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "11716771904412649891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 121) }, - { "1095495157025479260", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "8402692278765063674", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 122) }, - { "509781001842353609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "3255465741612432300", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 124) }, - { "13439896617880328331", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "7134654288295280046", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 121) }, - { "6769243149577568817", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "3480732841490521799", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 121) }, - { "18269685060032395235", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "15649927926091502215", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 120) }, - { "69439315851965666", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "156456996459945842", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 122) }, - { "3012566432840424198", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "16431165572426232677", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 124) }, - { "6324565723045697080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "5390559917122707732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 121) }, - { "5469227748156438008", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "17163158934005653629", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 122) }, - { "2307310127637739872", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "1999979442136861875", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 121) }, - { "2527189070714658176", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "8329846097322076175", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16783619135298589974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "12214162812589030126", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "9216608098626790565", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 20) }, - { "5179760459095053114", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "2452226948562393335", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 21) }, - { "4499586349553581439", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "12668149981216388765", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "2287356884312581209", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "11115684531624462986", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 21) }, - { "6483208845600234755", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "3752171257634205726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 21) }, - { "1774158624592967937", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "16881283637687482989", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 21) }, - { "14749947225382670869", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "7351733901977025859", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 21) }, - { "435888248913413834", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "13713406612642090169", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 20) }, - { "16582132711225619740", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "10436819182310112786", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 20) }, - { "14546281065004619074", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "12558716383635737426", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 21) }, - { "12609361477548272638", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "8107447526839063293", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "10995907213890714701", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "4871907623235871050", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 21) }, - { "7394217382008802567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "3880189981766119529", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 20) }, - { "3759057398165607194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "4561874206785244358", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "488298169768725160", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "12956726277674279950", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 21) }, - { "7177837234452118325", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 2) }, - { "9057036344533510776", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5093049998173715787", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13761566845514364807", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 302) }, - { "1594612401422787491", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14603590053512154268", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 297) }, - { "10136369729388564720", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17050675313067213312", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 298) }, - { "14221578799010900252", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11723735945517472199", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 299) }, - { "13810995219720233595", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2704063557078535883", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 301) }, - { "10384537928514123040", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17427036330773218054", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 299) }, - { "9796621763733208035", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - 
{ "14046114605615338907", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "5763440554939527411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 137) }, - { "12892693137085610062", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "17775705003104146872", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 137) }, - { "14878347463243157447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "7368916076070115064", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 137) }, - { "3499109651698979012", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "190530884420224257", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 137) }, - { "4202645222013675478", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "11324851661119942609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 136) }, - { "6232363902828992968", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "4299492266819967844", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 137) }, - { "9481675228591993785", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "11772741918108731396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 141) }, - { "18419183012101393192", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "17832542092610191859", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 139) }, - { "11771014003680394135", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "9192665896782282996", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 140) }, - { "9763310312421884308", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "11430400968543668873", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 137) }, - { "3430266954211750407", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "7172604084103519563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 137) }, - { "10306542963828398049", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "5235375820995365354", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 137) }, - { "5091558853871982858", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12914986936318857086", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "2265784112305305260", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "9019388470685749691", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "12427258337646070422", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "15884763176333003771", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "7211355951470869591", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "15399245700982979379", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "12644942072153919043", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "5876880412336151866", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "13775529405693629438", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "9048522050692986204", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "10642327923162019888", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "6410682026872155392", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "9454954846682513038", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "16463823433924519300", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { 
"7279393739634103483", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "13358283026528078900", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "8032685176029570383", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "949330876419581703", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "17713034180977313726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "472454322186482185", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "2727219457659794468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "7852745450437172519", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "6065819201836017182", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "15984885011101717258", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "14811022197918391667", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16146350476627599543", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "16173557782125372935", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "296142385116663420", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "12655099960717366198", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "7937870623766562191", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "9367157746678824712", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "18062849937960759210", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "11919129623429545762", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "10522649794540845800", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "1104489643524273315", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "5419775002149092646", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "9226912483632588371", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "4958222070605478947", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "4479979951990338510", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "12022152681602871455", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "5740738339752793113", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "12087141795291232248", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "17825280904760131680", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "3974589991022739479", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "1838534101161814609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "10046663998164493552", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "2305461098719675735", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16504962609450876148", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "6345550009198921347", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "11239754372812258455", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "4347816192417741558", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "17809920600993699808", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "16710010075465723498", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "17729546848373991614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { 
"16998508915819714690", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "12952980509662451384", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "2683507674615735878", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "13059207969254830451", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "16295660312557315941", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "14089893422771228191", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "18034648276860485300", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "17739868787095417856", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "10880081193716628051", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "15916505622570323098", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "9101018613418825655", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "15650839696475698676", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "15628121900226431719", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "14554225625951128811", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "3134489458855347772", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "5627834277145735283", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "10729288973933590396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "10869005786136023160", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "5597908143491399643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "577182964135927041", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "16947969669087411530", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "861419637283812778", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "3643250372952944907", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "17977676737774695825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "10309504812060596568", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "8866736221671835567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "2133849627845285277", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "12793908914872030220", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15947699374684516369", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4660288622381620227", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 302) }, - { "15914512645931208899", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7460672405409009037", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 301) }, - { "1541754036637209097", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "89439319782574517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 302) }, - { "14088382963493477342", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18203935818408469865", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "13191096881934434519", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7918742312252115870", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 299) }, - { "15641537661939240413", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "157805434489791310", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 297) }, - { "7941729567451949422", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10628725059172743408", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4492673409319122180", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "15857087373591747006", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13793441296561946357", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "5172712078329324967", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "8780604510524622314", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "1760690277175249985", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "13649894122307008732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "17546566148752689536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "12675313398314286884", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "14621327324047759584", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "14136097914489095982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "7638626850074132214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "9399994156762372761", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "18068050257421269408", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "11830297960718214360", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "14959566236432790882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "16884396694505987920", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "17947818179123182001", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "9381304526221508530", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "13932662890258900896", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "8268533335852735248", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "17419874083634480896", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "12773693193167844110", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "5157249499936659040", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "4282661608732125403", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "3159147743553063163", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "1706927777850488363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "9839670675413379092", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "6780215829176686721", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "12972634653821069685", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "16129296588866116913", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "18202222342562516071", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "15426960908024585800", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "17026284168840448378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "18118237182023167949", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "11113256687741667688", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "10555597973766215754", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "17517495652165026573", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "1832310305089212990", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "13855438905855887272", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "15349944413643626251", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "4738743763536059708", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "16611452077660879545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "8101977280003030465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "2012181953284568566", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "2969389503332309296", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "14515066741400300669", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "9373353053843326128", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "10023279637210292010", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "1103204698908514224", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "18092842590142527927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "12174571114411168588", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "14431607479949498164", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "10279778381617181802", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "4237276338897143680", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "8083672466967374860", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "16705621644424684055", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "5352861363832390974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "16945184617367657570", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "2995134938466176198", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "11706378390483804857", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "7958459862276998225", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "11703557271443535142", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "5020788604681810984", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "15217183882858251099", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "10650698451740924172", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "706370730287471796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "18199526506796726885", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "9269175963143039426", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "3691705516240577130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "13472532612464340803", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "12388375914105990324", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "11582534256623549131", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "1653274345637156919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "5893940382830835820", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "17700958439420868719", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "12730339458081890990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "6631816968511312100", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "7000524935770116969", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "386749666417295495", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "7162575953766465459", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "11398019086259011063", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "3041612155708729812", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "4274801141127703532", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "4865023158176874622", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "18424912460022156378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "10408322429232132983", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "5277400567128489977", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "6848989271874647093", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "10085059621136526248", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "12962552332511702682", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "751912075185318190", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "4505008254511324231", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "4191326605459754690", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "9824678205469832038", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "18245935804520236353", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "12309132521191764927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "12843671306854567956", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "8275277322582733101", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 52) }, - { "13698389420396031586", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "12949204491386872217", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 49) }, - { "7370273921473161914", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "941829593638869991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 48) }, - { "16206791915939407806", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "1500571771538985941", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 48) }, - { "2095802691829304676", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "17542414935564676110", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 52) }, - { "12380856644683171627", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "1451466106918423837", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 52) }, - { "8071957466247137919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "11661208196482963286", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 49) }, - { "6635217802203685464", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "265124365266629363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 51) }, - { "9513032457323269513", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "11814740669468421049", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 48) }, - { "5221320470007950766", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "14359530849521980269", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 51) }, - { "6181651715051152713", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "1450888744802985214", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 49) }, - { "2842103889477438816", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "14006248791647711759", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 48) }, - { "7072606962946873975", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "3599823735065658574", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 48) }, - { "11311859068168414878", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "17525531790109748810", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 51) }, - { "16749148369456398030", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "17556238490521153146", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 51) }, - { "6067904130482758510", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "1791615587935799399", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 49) }, - { "12985650543127289023", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "6714886136800883594", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 49) }, - { "220326805056361171", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "6777045876155144709", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 49) }, - { "9454512817077883797", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "14011124615649605281", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 48) }, - { "994489782629179836", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "4338023436590582323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 49) }, - { "1152693503778768433", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "5994204139128667921", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 51) }, - { "17243576882981097341", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "5524218746051008792", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 48) }, - { "2669822154816760632", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "7179714714302073459", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 51) }, - { "13002363400738122017", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "17006095064160484022", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 49) }, - { "13733327241591630239", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "2623687018437195679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "14077148976508649021", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "8272823732258536202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "2451712485584835395", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "8057302050645780813", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "7430073011895298582", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "5095827462645341808", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "15129834325410878425", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "9660812093766156608", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "15781622938833984014", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "1089679781525023551", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "6129602738379919488", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "5287076386757143976", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "16076153317792960383", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "2108296560864415762", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "17006655627343469372", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "9404677451270692749", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "1372939511728986224", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "5311718276151327830", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "529543453251381109", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "15591167992985613695", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "15026219694198820614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "8258382025812748961", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "14810839157236175179", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "16117738994809548007", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "659846949368492111", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "5211191663202250117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "13418701036204748812", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "9714764457768279762", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "17310332946322628458", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "15975964562807570772", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "13447028922679236865", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "8337820318779061494", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "18136765667969393174", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "14821616804286068969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "18386376129938707290", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "16609136488331186895", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "1996860183441418841", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "6491244517639245276", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "16312223896859176991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "17833517350994024381", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 372) }, - { "4226968857681929488", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "5141753233513623264", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "6860503758000008398", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "16489624657475712467", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "7862815466573236157", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "10679760989906275129", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "852092858392507925", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "6996376303337512293", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "10978173291465325823", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "6670327979947471550", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "11318913630213187720", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "123251351612308092", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "10784073615329190425", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "2261453441277654139", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "2937907409658060025", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "7852144838267007144", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "4408772370026995920", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "15411474884532403722", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "9462315044265139531", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "6419580456182610836", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "12277470820821378855", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16865879032845300007", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "2862999234347597091", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "15447513376965243034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "14420809655798184553", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "12954154886708228545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "7575634241190730697", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "2344498602308448450", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "4304041922043496030", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "10971070835319242371", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "4862529593282936100", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "5312140481706133684", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "15522785615618973614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "17798636687709019154", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "1938086876393565238", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "11897113890115321056", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "14363654136811880073", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "3928266232090746643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "15882969506682501496", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "16426179645101678763", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "18174857480705846286", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "598390166442977699", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "5522698342845820411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "11559360678008060513", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "13184662326021747000", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16037141448095945650", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "15094664469997373662", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "822162932339827810", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "2597453794298356435", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "15851356529373376076", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "7966454753124154534", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "7311120574972466702", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "16461809076899645037", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "11655994466278963438", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "6981537186704688907", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "7903891232234389925", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "4229105529069729944", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "12796777049340516563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "14289048840489035546", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "4239133538073498792", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "5103094815475470596", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "8560635685184432720", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "16264774056719724826", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "2571882179292959757", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16758962840329202004", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "4550028191070279999", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "15661322183507404821", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "14650567822254940018", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "3755253206085028904", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "8751016391945753900", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "288853243482418538", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "5047419871737940985", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "8819268903800581706", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "3746573775462003750", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "16286085532892593349", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16547425454653232058", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "8195881973746570408", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "7712831597869354170", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "17035903590837750750", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "1907439276166837309", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "3036808833459559381", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "17928043901784474130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "14667209474639064623", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "1701609125136907870", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "2140514316203117958", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "9366201112659847392", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "7808544677773370430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "2251029128552117936", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "9529614587861271730", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16811402686462277562", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "10554266898346470422", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "7817036102984218692", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "6329618009202266591", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16936366288366370882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "8025053805734757314", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "534032316469702287", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "3963106895592011725", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "17994361454416813294", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "14902389080201926109", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "3796274347773622633", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "1306339989221885682", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "10900880512948479338", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "287386909600391846", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "17542176922797334839", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "1081962464388501987", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "5831419373611158773", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "3179874645565098825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "14906458674793172507", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "7334966010680206302", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4161141078006269526", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 272) }, - { "6522575549211855712", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 314) }, - { "5629373398445592781", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 314) }, - { "13374993751390784382", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 305) }, - { "12976499206227689731", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "9882204352209412039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "5041111302824362529", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "13869716373706247686", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6438522646185979880", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "2406816735581074778", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "8881150100883636392", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "593712935037568960", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "11970881115757095265", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "5584432943673435454", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "4560479630843098090", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "15374625876485618845", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "12278364834477923930", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3122997634505472500", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 300) }, - { "15669490019428002270", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 302) }, - { "13102754309439605192", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "17912189681971987483", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "8153567933591966877", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "1604661321386793876", 
std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "8990561333549136048", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 250) }, - { "10728212277329722684", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "877436308867220589", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18375125668176498051", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6767245864232675168", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9287404618748313247", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 310) }, - { "8728178019712933221", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 292) }, - { "18251360413872841969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 340) }, - { "18271689282126907793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 340) }, - { "954796765467489259", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 292) }, - { "13597240991532942069", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 193) }, - { "5079055505117153635", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 307) }, - { "4135003545872878882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "11883485911218628865", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "2242915551775617989", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 340) }, - { "10556089809203693400", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "3727142736386026852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 340) }, - { "1622880009460832832", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 193) }, - { "4437258459981739942", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "14691372262153587653", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 292) }, - { "12181607120522804433", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 340) }, - { "3159681096461848644", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "6729785110495533200", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 340) }, - { "15322019609805777935", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "7024495439434892956", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 305) }, - { "10416622008071151225", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "5796500397424307442", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "15702382940521972117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "6093575518270471235", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "5805383505505929391", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "1801731858063091191", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "1559798212423183813", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 191) }, - { "5594180958505308003", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "4766071144928072260", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "8650948093564284852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "3883845471211207871", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "4366168099274266975", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "578703329577922869", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 184) }, - { "16863960779539003201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 268) }, - { "15450609897480659306", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "8203550467004532364", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "7431849514656037251", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "14484890926084856480", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "7777333052643961206", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "4424217045094988504", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 265) }, - { "7994179151788368291", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "15192024816519005250", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "4747159205186229582", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 108) }, - { "5485971317082563152", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 266) }, - { "18128162750557822655", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 90) }, - { "12421707187947291166", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "792684262493086891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) }, - { "941626985322260281", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "11868551452004726281", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "14352303529756685990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "10702234389482091891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "3895088069642140043", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 105) }, - { "5334566325056222430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "8306337702797456793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "15720507574336564201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "3277243911383750280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "18150429561058646714", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "11169292427557543138", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "13933912937625580405", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "8295126647635181949", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14213516751025324346", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16509472637458153234", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "16589607587365212240", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "6988674007771237080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "3448477246688526708", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "8507854696766492454", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 171) }, - { "8906588133431586825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "654122557966242717", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "10196332102593337214", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15831600396403741571", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "17808913959977434594", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "15548971488532746290", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 250) }, - { "13468713306678453952", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "13613399861925108148", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 122) }, - { "17802514063213000148", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "13093429681061786539", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "12247991248100147706", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "14491949194619001237", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "7590767013583950613", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "13210604117940125947", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "4670443882075998209", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "2857337999074313592", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "16036386660666696362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "755414184406250882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "12190841837604350271", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "10292243973236220688", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "17793292063552633023", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "7605139219344415117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "787363431787954804", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "7000486794832106857", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "13608239208821071914", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "17281202179589913619", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "16985912104363932350", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 133) }, - { "14744368497944610864", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "3737552767159920174", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "3792945601873900927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "1364546124782880196", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "3689722043202617487", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "2632535010129224704", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "10968768803038046390", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "5353552956675518468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "7866128397931438774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "18233660940545931789", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "11670430946096342056", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "2627779045483019709", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 92) }, - { "11066913713501760080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "2552187713769926425", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 81) }, - { "654821507679356726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "7606728651572102823", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "7549378486471456156", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "15410074937424854348", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 93) }, - { "15114370307779942381", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "2040762223425679479", std::make_tuple("fully_connected_gpu_bs_f_bsv16_b1", -1) }, - { "12112853999307505628", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 151) }, - { "4161612746310931789", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3388752887767453958", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 227) }, - { "14046990030104971367", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "16230621843665445228", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "9274179337770060652", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "5115134711994944288", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "13898821685774165645", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "3007637520820789085", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "16294825599850364701", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "14681717813022425567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "4915831715914920982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "12894240573737168362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "5448537627319798272", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "14389915292223442327", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 124) }, - { "14274685812676150168", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "7732899312577293959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "11956435900037329302", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "9263063714383940562", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 93) }, - { "5824801192141531089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "5608133987357542077", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "15392077168521832549", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "16446533347502650316", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "14762599606783897222", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "709835724029986012", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "1572991986657256775", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "7398196853452900099", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "8140094412609934765", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "2659031931257084418", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "4640028527711211109", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "18172711677056449158", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "5183231560876991543", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "6821855018718422278", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "13237050834496100264", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "7164580481046523192", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "2490155559809645659", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "15430549683839591544", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "4553409514380460123", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "3041752019114501584", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "4161001033681779582", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "4764776977138392550", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "6882621854468565774", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "8881135571874888085", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "14038261392627717712", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "628191607060767879", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3511588484597779204", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6904130543085920483", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "7924408980408826942", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "9416186718345824095", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "14719421757340260468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "11936419502418995274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16601702334097258697", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "4800587664660105589", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "5336120047683197088", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15897477855246170861", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 302) }, - { "9780938731831129283", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "1473214668483422172", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17515573322312447679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "18356980026934328781", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "18077281411861416889", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "2543041530639980505", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 291) }, - { "16370218798911151331", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 290) }, - { "17316626950179740845", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 144) }, - { "10414903047695486119", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) }, - { "2809950092498355574", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "12011982029561277581", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "11267742746905371769", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 291) }, - { "12534001599784153836", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 292) }, - { "1882052795393187384", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 290) }, - { "419783127503173016", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 144) }, - { "14211903923555028634", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "10892706534058849825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "2345023488044002149", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "5754844816339228920", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "17015791782274123780", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "3706994659266083979", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 177) }, - { "13324157125165576832", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) }, - { "12014527187730671229", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "5170245731599664670", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 250) }, - { "6854611304056079417", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 250) }, - { "1954052357826969119", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "17824431042110985323", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "3603706453982734995", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 140) }, - { "11992353959766718397", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 186) }, - { "15163327502374403643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 188) }, - { "16758697697363920520", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "10930115765550856328", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "14418429155823196539", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "1628593159980574595", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "15675968397825708285", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "9594594523961285945", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 188) }, - { "6634330132674952638", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "8434794604559592624", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "3150231129728961455", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "12545558125736154584", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "15485701086886851362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 190) }, - { "18005721959893562716", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 190) }, - { "490233152678323691", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "4073467095502162430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "5801429077171542466", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 99) }, - { "14841539539334726292", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 188) }, - { "9404953235624894187", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 89) }, - { "17995371099806008878", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "8961138963663532667", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "425744529089575241", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "1316444335300814745", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) }, - { "761169277744593430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "3325727286860556323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "2526832080529662683", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "15470013032930986062", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "12255528292506999241", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "13119479079474639169", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "12813978452097969536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "4991419288164762786", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "18210370419559876426", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "1616603916015535857", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "14962768577232034246", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "1452597292381229708", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "7104756264011682902", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7744787957569714828", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13503688893307029975", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "9133263538092913983", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "1383899865465106141", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "11829442945690098558", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "12394049027081208902", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "12159582810513550491", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "17738299860390552088", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "797387385159110695", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8757900457181374694", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "6048964584602891448", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "17882819773586674851", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "17829148383265978140", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "14711697456265712456", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "724953082687879224", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "805221045541170643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 143) }, - { "8241070786700614317", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "9191832520273617003", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "12408889192918919210", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "4885944395876887711", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "2651385050387738902", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 143) }, - { "6303682540621797774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "905780459938651623", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "4476928353532757380", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "13681462437496627948", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "17243648226968859637", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "11192356850081328892", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "9323825370872655346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "10000618285883395700", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "6418327009347170687", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "8528750110601691390", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "8061914949376516780", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "12992194515157698316", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "17870874477143985774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "16234606052818596502", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "9148379585489720669", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 102) }, - { "9270950131920019932", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "17001502418583498926", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "11163107409437069532", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "11465965972527519631", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2534408579674556441", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "18109284647478027063", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "9849272539053219052", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "17382660912493284320", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7877332346656934022", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 311) }, - { "6323026044750482867", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 340) }, - { "9761573038170759563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 119) }, - { "12098146032672599222", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 195) }, - { "1403617451623027879", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 290) }, - { "9058996149754556268", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 291) }, - { "5864250949922222051", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 291) }, - { "15847413004526420496", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "3199841714087553410", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 291) }, - { "4957638663977636791", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9437794960375526230", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "9475130054420979752", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "13312514874803986753", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 305) }, - { "15997754881872769378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "1941341635794709702", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "10157866834809927320", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "12308359047798183133", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "2986189945936592561", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 269) }, - { "6928835003016610382", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10084794570892043447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "15417738436777481469", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "18377298651236993830", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "7354234812009979811", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "8656468860180713379", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 108) }, - { "14472187692485966933", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 108) }, - { "397770940444464146", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "14258499419905714808", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "17599396373608265826", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "12935563359569230797", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "4892959859293355837", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2802810524370514276", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "10290107543739998181", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "14907097142953816744", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2525260242689556544", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13328449155966085543", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11856266545854830143", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15993427814066246646", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 301) }, - { "2100891581797371600", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "12242618640422208652", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 299) }, - { "6133592828563353516", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 302) }, - { "18232278892738147217", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "11992625045241269569", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "12601126285773042005", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "7457899998356343871", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 311) }, - { "6343888265369366589", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "10791067159964399241", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 312) }, - { "11327097771110264965", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "5245308722062496788", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 306) }, - { "10792503079194374004", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "4818231379191523896", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "2198278382394812839", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "3800011935243649447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "9631545863582097486", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "1779941298820543013", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "3621930417735246405", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "14435120971846098308", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "2893564501191050837", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "8108843303778211282", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "3682813162987778705", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "15494543914974994991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "7565221050911842393", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 93) }, - { "5629670679897666607", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "11754316727756881612", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 108) }, - { "10990741293315393791", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "17024388383581997032", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "10302338806536775954", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "7915318733663535312", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "13702692566238948173", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "2909728331855309274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "13071545223094862275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "9631481972809246378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "13540002981450186147", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 104) }, - { "7076937538747704750", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "18043340998699622388", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "7148542290597073512", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "9040046051053703359", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "1077773457856682663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "4716188972902735458", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "17343050785312683560", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "5687802882700097624", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 290) }, - { "3524531620118359828", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 177) }, - { "5688478347124565305", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "5504757952698692953", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "13800387305792597325", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 292) }, - { "6574971185849732667", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "10573920781439771673", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 159) }, - { "4992668316921598993", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 301) }, - { "15778834188130183853", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) }, - { "3062101811226530720", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 151) }, - { "428659495445490820", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "956022649859563080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "13410850301164057911", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "17423645390621980919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "7802311886554362782", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "1172103288112689821", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "17353894529222574441", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "16431857516454692096", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "9100044555742394133", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "13115589642140732066", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "16190949264253468961", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "7026575758396092435", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "16761856644242716357", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 88) }, - { "6341197991729122563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "17087740929472936216", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 101) }, - { "10795104632256101599", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "13327653786981478088", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "1096671695414716274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "10774528268153772208", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "9525853014023664813", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "10632020369698615114", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "3234107167862677811", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "8708643228914766202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "12415368596357091523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "1028160614515220430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "5927467766675317093", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "5275016494706355806", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "10947686124973711385", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 54) }, - { "3349519148124496343", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "4003433148846544263", 
std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "11718418772370938734", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "989564341557094953", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "6942049339361951275", std::make_tuple("fully_connected_gpu_bs_f_bsv16_b1", -1) }, - { "14555883089089918919", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "15320845027635796583", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4014667229872705228", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 68) }, - { "2438374917504708831", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3272017687600371031", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16067605128297748820", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "14150012830816329527", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "804195263636995800", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11528417522960871233", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "15378025640603637387", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12860222041026638681", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12725647706191463348", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12553441041059632729", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "12782191856884962803", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15824189967727245909", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11149782181562145291", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2653651564133701304", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3526580286148537369", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3985659568982275663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "13642146548740074992", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "2349007644347065353", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 119) }, - { "6146876760962332928", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17434429579652310107", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9447458159095730492", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "8655883535274781128", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7272538316511343863", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17564338309805484464", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "7881187047171099732", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15579919505002150556", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "11583017348580874022", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "17915846724151945664", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "5319668297345215520", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "17208186152576814861", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "3633858263279042265", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "13853056718266488510", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "14759179293743468995", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "16995873636564597028", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 105) }, - { "9438739171104456179", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "14429081455612806819", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "9819596940685093690", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "12085348936192462321", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11951606039079763598", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 309) }, - { "8769060267707904998", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17104611871050967957", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "2103882464623009432", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "2659712601063515059", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "9759380701896779097", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "13842309033760176194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 347) }, - { "2418288192668085805", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14994322266840011040", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16402312692470500253", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16955653765071712611", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 344) }, - { "8739347545059610410", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13459514533473657102", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 305) }, - { "7824524940405130010", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17796310681498690253", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14823616678465136590", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "13816104794723484993", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "846088275031979661", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "18125732229366977468", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "8464582977975377118", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "6290317420155851465", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "12696412964119109465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "4994591211723226974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "1036010477232750453", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "13786357802945430475", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "1003101267609305257", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14991602704357959545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 340) }, - { "6181308879301978465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15488550074426713959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "4062706195708729345", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "8594644182487917002", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "15881381297320383917", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "6040286126398028933", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "13926122593957480821", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "6213386558868267629", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4456004887590847716", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { 
"9642229389394495047", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "18259656768460999562", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "4983880246908724272", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 264) }, - { "7881579844586294503", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5331173521406046122", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "3285520504090196295", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7143510787416483146", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "13104509059416300615", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 307) }, - { "10090923790949378407", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3429844423226609965", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "706049518431331645", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17193614571243427089", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3621424752591567930", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11066930104187448422", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "209732971447020989", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16044646335477470657", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 299) }, - { "2172121470071868949", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3392693938352572136", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5495063314176654751", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14553856088069405595", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "4967444801764057340", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "12160764253455777655", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17723621158215826108", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2171768477223405739", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12672995204641007004", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "5622089373755094139", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2129726780118554358", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "4160656836528944651", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "11052732052072367261", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "18432787283148809023", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "16172528828198474326", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 232) }, - { "16327433707667075261", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2797723586312707948", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8451212914744825089", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7025975403069487257", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "8913950860101596091", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "15308578014507211237", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 354) }, - { "13132804928635689780", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "4465781406991476376", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "16266491618150971928", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 217) }, - { "181006047500375768", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) 
}, - { "18140951659547259039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 371) }, - { "272730229972987861", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "14898892437285105327", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "17252449599613270108", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "13436376034548670107", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "13787436604877398090", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "8873614802459592665", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13663893159182636270", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "1361159591875955678", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "5912303851874077576", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16245760498096322525", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 371) }, - { "9928406318940388716", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3036512701943687724", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "5334291640387922287", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "3002986032379998259", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "16469788155263456039", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8709632541892447149", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9524303276541517389", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 191) }, - { "9354818521586974021", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16781127329510211966", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6351572488552853754", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "907036267078333137", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11855070245618904113", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 194) }, - { "4544242784357021697", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18218631037214746168", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "178353385245384751", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17658152048177750315", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 217) }, - { "11636129433022017868", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "2622434279674583815", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "14335074487552883436", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11175955260573469979", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 220) }, - { "2732519635571994212", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 239) }, - { "13893789954946953427", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) }, - { "4355933224673863178", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "18037918102910297531", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 354) }, - { "16071723603031305677", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "1697248235682953135", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "7843498978148810586", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 239) }, - { "6767159196241633301", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5097818987523855112", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "6623182990939010641", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6711878663358611849", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "8671491767142900139", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "12164298124869114517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "17089801601582809764", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "75742659105146536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "4652136280940317116", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "9751582946441607796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 99) }, - { "16706244336960642883", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "12581879452540858313", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 284) }, - { "17443356777503458523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "939718260623752240", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "14131851237755716991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "7474639594232203854", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 102) }, - { "14152716242882609401", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7998930863626763670", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "10323345824599612614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "30229601562833524", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17788367809717898285", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "1509728225855233852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "13139625572508441980", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "16491532291908469567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 354) }, - { "6355395905401306995", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2096779676054335057", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4217179485243909459", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 232) }, - { "17101789600628162503", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "6139574161497189424", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "16559140502701231107", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "11459784003592366395", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "7869916853707978306", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "3889519976910355277", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12081835728078383819", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "14923692894655929923", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "8132521728369930959", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16108573960501496757", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11086699387784339943", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "4013707396889204359", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 111) }, - { "11850332373794932468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 111) }, - { "14763982961176216679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "8207349115037232863", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "3273748387141431306", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "580936360000782237", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "10682918518101379579", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "13178480813522103091", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "11254635684957519432", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "16816222375242496370", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "12809199739984715013", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "5040730152867713388", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10429613013253088132", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15451919862187018297", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "7546586420552408243", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "14487682847898298214", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "3106710091841093202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "6458124573210430792", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9182897385081081193", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14462438074931673266", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "18133334552107213128", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "38736266675995457", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "13654816209891478730", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "6263019986730305851", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "12929981792125924963", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "3138374672801504481", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "17009318615658405230", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "9421643783312790618", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "2294026590516781945", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "2940027113687311893", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6090625728451718945", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "5643908654122573882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "9065894438656900887", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "11185156002426041243", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "14670068483447729857", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "4623542918584461522", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "1143214652021653634", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1434535531617424039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "17025268985366223779", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 104) }, - { "11507538232733291666", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 104) }, - { "6149673627320838019", std::make_tuple("fully_connected_gpu_bs_f_bsv16_b1", -1) }, - { "16474284418841532356", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12461575861709234385", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "192209423643075326", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { 
"15490478608105402679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 223) }, - { "3491333679577961640", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8176012042686275874", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "4282198629458668761", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "689445825453914111", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "969746749329671447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "16833026567865627676", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "13046322179198317310", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "6902644989079870993", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "10987953316324712538", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "12515465135362865565", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "10049571207493913006", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "3926585856863002495", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "11275109735493317886", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "12238674883388043717", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "16108759090923335184", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11756881293845417212", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "17839839336294937155", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4703107905652287491", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18180820925685532104", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "3835286851569826052", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7807983899017500046", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "10294185397756053636", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5519535335798045279", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8701248964531180496", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "291868903926685441", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15239764240622554314", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "15963038745470172423", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "11428599290755097395", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "3180320769716158201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 108) }, - { "583303098958523195", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 269) }, - { "318377908569897093", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7353563160591978243", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "2582625260054352916", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5609922876429907954", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12557015880639217508", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "11528310408333718862", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 340) }, - { "1471837664358450291", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7351401242363888463", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "953306082374100275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "15759530339367380982", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "13300022131572486202", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15689502054035168040", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "16969463538496570528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "10237524128771958432", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7969848911698660033", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "7130694811424715594", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "8578747191812631883", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "5197105253412476591", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3120553928584920777", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "4750894407873652809", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12667014405537239093", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "13644681270630373984", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 372) }, - { "7187734276051878356", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13253775441326432265", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "14733510474010040334", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3336303478756453360", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16352331970945217438", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "13484950419220835364", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "4674416595144505741", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14559308665571750465", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4542143431130171516", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "13189392239349392492", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "7009735776703529573", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "4220826666482500445", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 273) }, - { "14792528369891965810", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15287650965861631130", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10308175009371219583", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "2903605246599054308", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 240) }, - { "9213563311267466388", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 188) }, - { "5019077257951332016", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2497756607567197523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "9285566577169147378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 190) }, - { "3432296808755992670", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7688176479120305539", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 88) }, - { "8818070832398055086", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8787816339967963727", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "863952266514375915", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 143) }, - { "5835634465164771899", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "15101680837342453931", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 83) }, - { "1116274074896622552", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 132) }, - { 
"12790788016297794214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 43) }, - { "13538051178827008933", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 85) }, - { "16403423801823379909", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 85) }, - { "3723613341885592267", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "3830703844770425343", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "40704767167309552", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 81) }, - { "13973028408397200796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "16561224775421968533", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "11243840588602365090", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "14103112843209793966", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "10483664832302187567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "8100595788531468781", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "6620782733027313312", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "6949539207944972855", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11207257238719531888", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "13898284586432291433", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5120466856097219243", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4197617702037834389", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "1249137685908951501", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "14716719350966652036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "4840004190985490064", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1540041682425757361", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "3715177305271762194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "10001963042016663554", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "481328129206881674", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "13404888565084206853", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "12348135936862667024", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "15471470494305051299", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "6181272224000872375", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "4701832665603867798", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "2030309697153345387", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "5644068493155655611", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4867937397499803072", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "2702144517025248597", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3304589333915676807", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12894625941923144893", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11649407835105973949", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "4897991181236908768", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "12179581684777023804", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2806529556090896246", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "11327228813412934262", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "5485749317130402302", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "3499243120652875549", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "10916647716124396856", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "5749536453225343663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "789359733867650915", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "12626014184575881530", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "1201692134690347847", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "5219399418946822456", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14217181622713951411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "13025323039227543550", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "6114147683777615071", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "2355214244972870639", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 154) }, - { "3167336012388169649", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "12218337369633748663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "7264756313770306662", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "10492056481694320580", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "14281201038135286621", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "8127190765748950828", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 270) }, - { "142486914279119363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "1532263118203058517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "5482851829165191681", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "10548792624072794724", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "4239415134522959352", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9028970753877215614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "2324120381399737261", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "10267260789603562117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "9988801796928462423", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) }, - { "12516911293946682547", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "9213886570531053949", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "385046297070779752", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "12541834857357563605", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "11709992724966310174", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 122) }, - { "17222005830854879661", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 15) }, - { "475043738497218394", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 138) }, - { "1071007164550012186", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6719302427415173754", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "10482582307328548806", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "6351347283201596793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 89) }, - { "6531171505861182429", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 4) }, - { "879005904827468163", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "16290626406346691996", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "4569338575782832784", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7575675354187625951", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 307) }, - { "5795073619189010837", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "15123868617509445149", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "5601435819039968726", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14104238386345631681", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17377293745073971167", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "12134712464763856064", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "5524215233998361104", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "1103228955716492167", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "8618835732380720921", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "15908673392788376468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "8482147530539941792", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 297) }, - { "9069334144391048686", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "12493863403516600413", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "16692569816843207989", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "3438116423688595487", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "15602863681196390535", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "18277685132620834972", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "16541722316343690197", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 123) }, - { "875142032423622622", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 121) }, - { "8965747921518186477", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "3067806959725855130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 143) }, - { "5779388310240896974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "11092828091552833150", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "10295330953350618042", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 2) }, - { "17791773192152464021", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 143) }, - { "4894227264080887361", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) }, - { "381149736509958403", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "13603318842632052764", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 2) }, - { "8929453032482114162", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "7662200927459001757", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "11473442921040533207", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 340) }, - { "388828310152538138", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 340) }, - { "1643241486250690844", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "11806105193035393795", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8843585527713905568", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13248567106128518549", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13708979487306970634", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "14406070210216948643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 340) }, - { "15352245788978088971", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "1435153323458789173", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17638692805430115529", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "14068780861332616363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 340) }, - { "6656593119788274992", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14695781272831602408", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "15696910741835640150", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 275) }, - { "15315327794058441258", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7545013298074733778", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) }, - { "4026686872534942904", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "6553736978928374036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "12129572274423886770", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "9723314434598141024", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "11031625790234068916", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "1138439260035360722", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8323445733669842657", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 143) }, - { "54019631544204590", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8971115542951085891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "4584970211859494304", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9321208819255762521", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "12617625046664709483", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "8264178890341675354", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "5334190564423375247", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "14746359019867963124", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2044363708106765326", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5132761922124425835", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "8141428150264829362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "276407276027553756", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11878734040194151073", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11622925573287101001", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3192332625020432602", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "9785114056964539323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "9410978119783758141", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "12523676912856063091", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5912451559447635837", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 281) }, - { "10264913782610095832", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "10309083227104422150", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 136) }, - { "8500148569566077929", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 283) }, - { "6578908625437515675", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "13762042713029963144", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1561225943337590599", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 121) }, - { "10917498758625273194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "14335423820860953927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4428101657497677982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "15901675909820977223", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 2) }, - { "7962991673727743706", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "12141300895511301068", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 154) }, - { "17106086048442658788", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12707946849050970702", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 150) }, - { "17154337492545826355", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "10109431802089940590", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "9428176632140441528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "52089503050497755", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12297371032753209816", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "659150305191479097", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2065752819810364738", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "13583166868754499339", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "13991205023798493715", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "8939683514448064461", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "18337160891834020517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1154228007901031779", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "15156525717629023944", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7757331094141318304", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) }, - { "16779678846332091086", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "5409924335138540834", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4149728557142033774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 177) }, - { "6443517114667332732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "5419041493176804960", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "15948383678216076358", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) }, - { "9604982746455852556", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "15739274921308457528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 250) }, - { "4642234334824303290", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "13200151444914751729", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "16894871557229780934", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "9933958860597451711", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "17094948685292534952", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "9762182215179534181", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 250) }, - { "18273537339378756543", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "7720939595094113814", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "5865480930796299143", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "10058165874008941852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "17309326904418811234", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "5592428580503282095", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "16348402367953880206", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "13607830451968188080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "9311802150474489673", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "5159470523468873105", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "7975810844103449438", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "11455843788148231615", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 250) }, - { "1410630713443793537", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "17303408650780384587", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "12069726772532946193", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 132) }, - { "6204183474669103812", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "12874626654611400042", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 131) }, - { "13546876216568825877", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "2973436171295280783", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "1908809004094565452", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2322559721899919275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "5766507688771440170", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "16626226341188424071", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "14224121742920800990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 125) }, - { "407189201971322683", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 15) }, - { "8460847842045253466", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "879896719155824868", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5219048275475447369", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "8707189142909022305", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "5948701218437980356", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17050143605017295447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "8906185843274300447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "8321769923556905957", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "10433541468308381909", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "10405183426600618231", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "14885109535362957947", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 101) }, - { "72444706264681262", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "16818714747882774917", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "16236397968499692493", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "700717277178942679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "482564204402769504", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "3221221905804708596", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "16467987800266816984", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 234) }, - { "11599932445375240727", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "5057534502588100071", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "15640202505592598653", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 343) }, - { "3355259926747524578", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 343) }, - { "9226443907548972870", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "8104309105061227444", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "18384657372655350144", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "13739257060165119132", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "9810904714798127155", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "15609627722687211129", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "14738573151275130683", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "9421927854269492263", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "15962533525948221648", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "15856268902838573812", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "4085450203909854919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "2370837049876630969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "13464226348405628455", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "12228963567837353733", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "10377729875228238588", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "16362139250976572928", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "5420766967862917815", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "14578291812739325465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "18310667924071639899", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "16853250891250756537", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "12990341489637414845", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "14630499010941056793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "878892264408839067", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "9259437778054905599", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "14974730512607138726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "3600066510593746268", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "3140230065585683313", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "15891662883560480723", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "11284755586130392759", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "2281119269283845320", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 223) }, - { "12246408434917478929", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "13283842370311517843", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "13753473508578037346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "17123153447808465303", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "10700011669103135203", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "9979259596137305973", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "17225578855755054959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "6471563320494376693", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "8146945902795164796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "18372284940315010254", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "2194607895573544953", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "1332624116953483870", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "158222105675022402", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "6830387121684699972", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "11077503608116183709", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "17847109385592002207", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "13384754476437374504", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "11462462742322068863", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "4265693151382066296", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "11070620435959083971", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "6982733543386888622", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "3563614453014995411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 166) }, - { "3498490999014554104", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "15595549493819416194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 105) }, - { "14532844474906286088", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "9562291747339451180", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 23) }, - { "6772239376357727149", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10690972785852373520", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "4488336106517889531", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 85) }, - { "10058614204420018541", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "13865227850818392065", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "14100870590396726248", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "10848277915422577656", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 43) }, - { "8121179472578287280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "2502125887857336825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "13192808619929896995", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 4) }, - { "5115661026367632863", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "12812685418923919055", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6293403765897901528", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17596685300497748803", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "2150326211917340956", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "530491406341772040", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15197248015210313435", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "2816353973187452604", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16383540667048742064", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, 
- { "16820082917500285799", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6820284286806022849", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17285815901490707654", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "994182747184593564", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "6642767323474835034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "3215659303601163167", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "54975980454651672", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "11529876081402974396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "14026570177552137240", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11686670048744589243", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6678796313875454849", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "641417817126876622", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9622546530872848323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9194788897910888066", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "522181557896569275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "3332334993503432420", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "16131448347558322280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "6585223640997887253", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6205240287062600210", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 302) }, - { "17522452942286240233", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) }, - { "6571438978296387721", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "15511138074959300404", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11107930597263802755", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "10320711719466983961", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "16884228931101540030", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "8253823502854784432", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "6025872155179042054", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 250) }, - { "10173283505468233128", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16094174852600023296", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11077876432364512822", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "10586018593856542117", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "3109104171383198425", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "18136135457402651842", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "11834683513280095384", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "4806571630436601566", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "14849108908297747749", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "3644282167178264526", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "360872770877634346", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16720108310653948550", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14353390922580547467", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { 
"9868561386826862471", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17465517455679097501", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "5570311824197099845", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "7524311370696987092", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "14070988879848388270", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "8296551195150971668", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14352796912241296357", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "9840495023131952174", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "4720851194954041037", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "13852065717057446998", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4342360467977736802", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "16336482874764861478", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 159) }, - { "6075691042233712335", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7570346182940928159", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) }, - { "12971822824884826169", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3033264172690274208", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "17301887391757619741", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "15790005937034794347", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "15464327246951632247", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "5659168916726488798", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "8079376692609682448", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "15160738482264643601", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "17900257435531434807", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16789135236017252073", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13224814158106791463", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5078905972285278557", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4196367396954155354", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7009873605945341897", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7199295899520406795", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "16833854122884184025", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "14599780481362761532", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "2572395498687401679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "11810221946429451169", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "18084635102736402756", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "59739211822469868", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "5240181393417899912", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "15962137123591591534", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "10989937450490049763", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "9798585825695496550", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9220830217525628783", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { 
"2235210915304938149", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "3930314908786112883", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1334070221835422461", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "6681818065741882453", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 150) }, - { "6980201892073961793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "11530101016435264783", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "4801117903303888658", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "5782934278345953016", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "13951717514084457087", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "2721793280965260548", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "8124736388338424498", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "12223993560805441284", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9860570706348640782", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14043770215999952932", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15277856047844308598", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8048617952947915835", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11446745541571732900", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17422822627612865758", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "13954144830230671601", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "11198908896401597838", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "5582896843095691256", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "8133587696326295326", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "2007192658799516915", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "9492402787848610840", std::make_tuple("fully_connected_gpu_bs_f_bsv16_b1", -1) }, - { "10515519878978734341", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "16706121580364790904", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5495776091407365966", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16430562172386510259", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) }, - { "5673972310424776040", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "8797843396807284399", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "1698321314111848001", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "5762290464889692462", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "3218248162832023196", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "12988961529988078346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "4232250144427804891", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "11683680166617045816", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "6252429564537528709", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - - }); - } -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT2_B1_B16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT2_B1_B16.cpp deleted file mode 100644 index db3f827..0000000 --- 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT2_B1_B16.cpp +++ /dev/null @@ -1,3478 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "auto_tuner.h" -#include "auto_tuner_offline.h" -namespace kernel_selector -{ - //SKL GT2 - void tuning_cache_1912_B1_B16(tuning_data& td) - { - td.td.insert({ - { "1375156980278317418", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13455881643467418059", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "12788968383428254917", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "13131740479277027362", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "3390014193205017427", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "1270467775674221667", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "14462744723628661203", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "8203171222962341018", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9795194069954915563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "13369603621524676979", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13575423234109624706", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "10721885719016335538", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "14567947256029724271", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "10749263296616139689", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "11717348577195224554", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9275303306340702111", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12245096462203481681", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "4999505377862312410", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "15272426400992401555", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "9325097933807426691", std::make_tuple("fully_connected_gpu_bs_f_bsv16_b1", -1) }, - { "18238669114790278675", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "6664482192233202590", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7454366978268164047", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "16135569134646688251", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "10572945270796129630", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "17495198214524203238", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "5221108094913859739", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "1092633914190498221", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "2738256633362038820", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16689586259416414782", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { 
"1525652349412826502", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17683302016987200208", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "5615525527388396983", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3992735701291817771", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13208739898218342989", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "9536348721941264933", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "12803521018213865796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8854783036772473804", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 264) }, - { "6766480740724769248", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "768423629375648579", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "4044100281521441011", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "873240542570331563", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "12875236165672036211", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "12008819728839685704", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "2486645741683554648", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "368578589584714524", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "301201776306602054", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 99) }, - { "13152181652632422771", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "10311747599696543062", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "11258322449556590366", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14095734330183410835", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "14910223536998380801", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 85) }, - { "3352689317181436056", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "15832740972576959202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "14732184525012592889", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 136) }, - { "8421045774757048067", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "941232110069825628", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "8975333906619899020", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14800592533315327674", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 319) }, - { "11816277809167487786", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 319) }, - { "957781751038897330", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10498289589469975939", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12970943403831707924", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1300292367195167745", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3399837016486623477", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16740871614208968868", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "71587235425438167", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "12717047049023783979", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10478482486372389470", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "6056581247196718403", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3780320160034246719", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 273) }, - { "2819320453491169732", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16976464773806576190", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "13321672741246923341", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "15140532227060261467", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 250) }, - { "9400755775406101904", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "10292585962794261197", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "13048561902713182858", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 274) }, - { "3658425022428447440", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "16947830954662293793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) }, - { "8397584983137442239", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "1071169341660439058", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 138) }, - { "5326247361632903583", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6214194654733781771", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "10025839973092358719", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "16711955423531846725", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2915165824085219545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17108987360340581555", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "11972097635078477347", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "16926950874716567095", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "1212319037405620223", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12397280593466519809", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "2609454334520044465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "1336940384521633733", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "15271783562528081169", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9533360488591027707", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "6930697835136176263", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "14444423571297570985", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "12643423612381102003", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 79) }, - { "18423051691107460439", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 99) }, - { "15381833359831622179", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12040626513219974957", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10647227605517025377", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8127570953237266335", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12876112384009608387", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12663860560275361463", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12352923639732112511", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 79) }, - { "708452703070938673", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "394778201589371681", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2477849395789783501", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { 
"11637325834858582585", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1485662490111767875", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "4300306345092124175", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "3402183863499902145", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "3217246278485567748", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15713964605078748923", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12293786134765875615", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 237) }, - { "16043683538361975370", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10670103699537731664", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 237) }, - { "17854578307286932628", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11443268857010762276", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 237) }, - { "4479117540570599742", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11726298758004767743", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 237) }, - { "2968031010495399536", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3797957937905580811", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 237) }, - { "1474271081523145413", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8526484907799590618", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 237) }, - { "13723543003759101485", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11728824117049687850", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13268525255152984893", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "14397348576352573007", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8616686489737649890", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "13176385389367548697", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14990645740260870030", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "7472330881076141262", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10892456883214928095", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "9522661528867955338", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17856816245251319111", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "14872992823083730615", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3106591708459602370", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "11609821372586026178", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7678457226823073886", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "10118395047539851751", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5389189982064081933", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "1742897526168249500", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15331103261044247142", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "6644418194983229139", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12478309735214802531", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "18012549942299450620", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11873734271080160669", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "10424278617647597641", std::make_tuple("convolution_gpu_bfyx_gemm_like", 
-1) }, - { "9553032671453999824", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "3860603464276263676", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1207026216972160297", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "9519623751582710696", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 131) }, - { "10328182165125764988", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "2231648183489019418", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 131) }, - { "17599383258252980421", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "16208488491972128275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 131) }, - { "13379165253894817165", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "2566302789609970663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 131) }, - { "1478419046264331178", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "3087801652564627458", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "16103943009195163681", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "14230385851791760020", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "15293727142789007900", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "13973179950424276578", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "713121569924250372", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "7947870656736319919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 131) }, - { "1663285216972929652", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "14767888121198814523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "2124033349728954551", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "8762901342272872498", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 131) }, - { "17006133396401462698", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "10783981060353445280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 132) }, - { "15110359240685619357", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "7875272450497189442", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "3281207855459771997", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "11932770338770247767", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "15860915170591763391", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "11716771904412649891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "1095495157025479260", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "8402692278765063674", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 133) }, - { "509781001842353609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "3255465741612432300", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "13439896617880328331", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "7134654288295280046", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 132) }, - { "6769243149577568817", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "3480732841490521799", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 131) }, - { "18269685060032395235", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "15649927926091502215", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 133) }, - { 
"69439315851965666", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "156456996459945842", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "3012566432840424198", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "16431165572426232677", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "6324565723045697080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "5390559917122707732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 131) }, - { "5469227748156438008", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "17163158934005653629", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "2307310127637739872", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "1999979442136861875", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "2527189070714658176", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "8329846097322076175", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16783619135298589974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 54) }, - { "12214162812589030126", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "9216608098626790565", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 55) }, - { "5179760459095053114", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "2452226948562393335", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 54) }, - { "4499586349553581439", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "12668149981216388765", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 54) }, - { "2287356884312581209", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "11115684531624462986", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 54) }, - { "6483208845600234755", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "3752171257634205726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 54) }, - { "1774158624592967937", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16881283637687482989", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 54) }, - { "14749947225382670869", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "7351733901977025859", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 54) }, - { "435888248913413834", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "13713406612642090169", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 54) }, - { "16582132711225619740", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "10436819182310112786", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 55) }, - { "14546281065004619074", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "12558716383635737426", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 54) }, - { "12609361477548272638", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "8107447526839063293", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 54) }, - { "10995907213890714701", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "4871907623235871050", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 54) }, - { "7394217382008802567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "3880189981766119529", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 54) }, - { "3759057398165607194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "4561874206785244358", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 55) }, - { "488298169768725160", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "12956726277674279950", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 55) }, - { "7177837234452118325", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 8) }, - { "15031155621982459860", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15223164574152266895", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4834446692898125871", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "14766477690417085350", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4461989328775275994", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "10141927023849730720", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10837496380266058422", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "5012013738970489338", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16839741351990811959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "7846384623429362522", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9193880745263317167", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "2863465257341735941", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10447947790216991304", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "12024817951074673335", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13474805373264874144", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "671453551040072499", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "87031578643428011", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "14034525799882831106", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10864011008000364415", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "5115007207028125638", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2866656294663853474", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "7913076120244203725", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15187035463799513424", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17778091287904736965", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9562527071055150197", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10645625090439446714", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9955939178447682108", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "7450417963648518926", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6648876837655776653", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "1520529227443340435", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4455369117448405874", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "2920840796593281126", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16341722570340169855", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "15289152041466330689", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14362876471450307424", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10330180429524641331", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12046017161414846599", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17228810554159747400", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14835309921389262864", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "11263540528012919947", 
std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16139615240471264488", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "3820661057776133570", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17515847111676784130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "4252157815622916471", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4819131094439732065", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "17264010982688979937", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "11277866878590984477", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "11324651029379152442", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13425251102263428554", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "4571404165794634411", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12279771749366327372", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "5754396201681434378", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9809458159478958866", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "5459463503840817402", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "6484375582324852109", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "7005509036795164602", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10785966734346479177", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "15363606233048272809", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4890043345392707202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "345043289576587800", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4804533178560338520", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "13328911884191551889", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13302687772426736346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "15231987838322151865", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17214254645087272557", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "4849343880559509889", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "851057218719456209", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "331661172067077796", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "3017824560305532066", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "13596876807637507229", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "2242602888499888844", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "13264617841270329349", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "11604794601689380990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "7770000755097925765", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "5008350851224686853", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "12166852830214895457", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17672785701483179117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "2439993891369206440", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "15822546325822628634", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "3056212889689424946", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12712071520541638451", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "6217542346826403576", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, 
- { "6290584630172122012", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "1245259979364728404", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13006774775034887171", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "3725013268198063198", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "1359720957005310113", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "1354647381212852890", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10480527638577674825", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10883992248631603006", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "18255227391100087860", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13565691057064774487", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7954972694876158422", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "5118467701668427545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2339864165283480961", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "490931535580183607", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "150132162949295379", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "14795618530175274538", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "14126906427006602775", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "905526102343710614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3385797925880519845", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16238415425814188039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7107677063657303327", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4098191685457418125", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2936333406928424760", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "5539793555189956907", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10106454449619141260", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "5346898505346646714", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11807282628372660280", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12375919467924385618", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11705756153433897198", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "6651389480007764007", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16911464046178654033", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12495003066477974474", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7650862961269327235", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10709828018763273371", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5044721291675005144", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "18427056032084727710", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1390379098099686972", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12054200116003751590", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9500850790449116723", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 8) }, - { "9057036344533510776", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5093049998173715787", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13761566845514364807", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "1594612401422787491", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14603590053512154268", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "10136369729388564720", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17050675313067213312", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "14221578799010900252", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11723735945517472199", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "13810995219720233595", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2704063557078535883", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "10384537928514123040", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17427036330773218054", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "9796621763733208035", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "14046114605615338907", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "5763440554939527411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12892693137085610062", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "17775705003104146872", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "14878347463243157447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "7368916076070115064", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "3499109651698979012", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "190530884420224257", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4202645222013675478", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "11324851661119942609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "6232363902828992968", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "4299492266819967844", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "9481675228591993785", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "11772741918108731396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "18419183012101393192", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "17832542092610191859", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11771014003680394135", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "9192665896782282996", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9763310312421884308", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 278) }, - { "11430400968543668873", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3430266954211750407", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 277) }, - { "7172604084103519563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10306542963828398049", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "5235375820995365354", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "5091558853871982858", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "12914986936318857086", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2265784112305305260", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "9019388470685749691", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12427258337646070422", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "15884763176333003771", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "7211355951470869591", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "15399245700982979379", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 354) }, - { "12644942072153919043", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "5876880412336151866", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "13775529405693629438", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "9048522050692986204", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10642327923162019888", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "6410682026872155392", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 354) }, - { "9454954846682513038", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16463823433924519300", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "7279393739634103483", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "13358283026528078900", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "8032685176029570383", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "949330876419581703", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "17713034180977313726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "472454322186482185", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "2727219457659794468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "7852745450437172519", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "6065819201836017182", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "15984885011101717258", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "14811022197918391667", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "16146350476627599543", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "16173557782125372935", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "296142385116663420", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "12655099960717366198", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "7937870623766562191", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "9367157746678824712", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "18062849937960759210", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "11919129623429545762", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "10522649794540845800", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "1104489643524273315", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "5419775002149092646", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "9226912483632588371", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "4958222070605478947", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "4479979951990338510", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "12022152681602871455", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "5740738339752793113", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "12087141795291232248", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "17825280904760131680", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "3974589991022739479", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "1838534101161814609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "10046663998164493552", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "2305461098719675735", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16504962609450876148", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "6345550009198921347", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "11239754372812258455", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "4347816192417741558", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "17809920600993699808", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "16710010075465723498", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "17729546848373991614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "16998508915819714690", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "12952980509662451384", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "2683507674615735878", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "13059207969254830451", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "16295660312557315941", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "14089893422771228191", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "18034648276860485300", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "17739868787095417856", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "10880081193716628051", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "15916505622570323098", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "9101018613418825655", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "15650839696475698676", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "15628121900226431719", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "14554225625951128811", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "3134489458855347772", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "5627834277145735283", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "10729288973933590396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "10869005786136023160", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "5597908143491399643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "577182964135927041", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "16947969669087411530", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "861419637283812778", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "3643250372952944907", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "17977676737774695825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "10309504812060596568", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "8866736221671835567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "2133849627845285277", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 41) }, - { "13902214851539825156", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "669771152920944125", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16921939234324970069", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 319) }, - { "7649413902932043811", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5658664813683907476", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 318) }, - { "10071449674652717890", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13352000946213986936", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "5291011077679733990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 372) }, - { "1458615259705605525", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "543472136359161929", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "4644580321919256401", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 318) }, - { "12946531140050029900", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "5010119207726811326", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 319) }, - { "3308770992373192529", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "16913004986170202203", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4079026972040047969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "2683304757433993300", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "3141886504884887200", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "14444475853714164129", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "10747988576436391912", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "2722124265986526212", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "8856888761246057127", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "1902656726461670148", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "3337625924046561031", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "10280619408766255552", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "9695024256541464964", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "6733731409232284409", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "15805087418686802636", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "7056030150365552588", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "13038533272699602337", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "3737576893817599311", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "8761283252495354972", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "17549411807772646930", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "13124342334495538095", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "8576733135863336233", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "1082586642383386489", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "3217574161785059951", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "18357544235608006954", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "13954821927253849036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "16158139166784964096", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13558687084677943158", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "13809898858049445969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16862145184923128012", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "693883892843558363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5393510569127725391", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "4533786844080178561", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10128143628088846123", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "5295693108687178880", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "16425665058951535484", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "1398177377739338750", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "7407975398526425554", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "8614534946699754256", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "7372956570616880244", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "13676654389512816868", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "9043982883185435219", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "1626430741965136732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "15295951849706930711", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "1075027491444288875", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "16084700435355748612", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "16698547937652264447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "16729849855476690294", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "14171139920084409181", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "4264284648458489052", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "8866716292621164810", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "11828175723996627443", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "11164519756679631743", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "5558136691773431495", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "11031569203645035546", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "4084026445911476156", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "3819990462129075757", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "10055549084854766170", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "11657946392097042544", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "16768797136991242472", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "12107262410635772120", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "938222258370511187", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "11727227430687227444", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "1040650352205493707", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "1563987925712579649", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "3870539490799697188", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "13170441257780067955", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "17490471699618303993", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "13993548620104010490", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "15728009639807698634", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "10991423760161409883", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "7242013296950669829", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "11744368351982723504", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "7314288062932060863", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "9299299311101549958", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "4138968242532400395", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4135068756462147853", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "16247399911710810038", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6020017927557041768", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "11265472910579659280", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12512751736409465214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "17015328096102652908", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14147460733160099960", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "10811837819834149164", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2173867324489962689", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "11198301748997371475", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9741607635826869269", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "3860667078458481972", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13590444711975157776", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "1551596771935253711", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "632116056424249698", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "3499645386058307669", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10471519687597963116", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "4429109491655891299", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9439431829175743345", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "70580716590540876", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "577844026691991089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "3873183249402084406", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15799159401545270696", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "18154019240019929225", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1569043950563130463", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "4491380839102267034", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9243949750444156746", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "4772696293208603817", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4927360358387344983", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "5770286476124511234", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "17084977396231597605", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "16800575429414554907", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 2) }, - { "12793908914872030220", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15947699374684516369", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4660288622381620227", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "15914512645931208899", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7460672405409009037", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "1541754036637209097", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "89439319782574517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "14088382963493477342", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "18203935818408469865", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "13191096881934434519", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "7918742312252115870", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "15641537661939240413", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "157805434489791310", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "7941729567451949422", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "10628725059172743408", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4492673409319122180", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "15857087373591747006", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13793441296561946357", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "5172712078329324967", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "8780604510524622314", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "1760690277175249985", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "13649894122307008732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "17546566148752689536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "12675313398314286884", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "14621327324047759584", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "14136097914489095982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "7638626850074132214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "9399994156762372761", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "18068050257421269408", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "11830297960718214360", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "14959566236432790882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "16884396694505987920", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "17947818179123182001", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "9381304526221508530", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "13932662890258900896", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "8268533335852735248", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "17419874083634480896", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "12773693193167844110", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "5157249499936659040", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "4282661608732125403", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "3159147743553063163", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "1706927777850488363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "9839670675413379092", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "6780215829176686721", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "12972634653821069685", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "16129296588866116913", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "18202222342562516071", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "15426960908024585800", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "17026284168840448378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "18118237182023167949", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "11113256687741667688", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "10555597973766215754", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "17517495652165026573", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "1832310305089212990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "13855438905855887272", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "15349944413643626251", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "4738743763536059708", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "16611452077660879545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "8101977280003030465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "2012181953284568566", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "2969389503332309296", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "14515066741400300669", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "9373353053843326128", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "10023279637210292010", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "1103204698908514224", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "18092842590142527927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "12174571114411168588", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "14431607479949498164", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "10279778381617181802", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "4237276338897143680", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "8083672466967374860", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "16705621644424684055", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "5352861363832390974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "16945184617367657570", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "2995134938466176198", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "11706378390483804857", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "7958459862276998225", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "11703557271443535142", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "5020788604681810984", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "15217183882858251099", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "10650698451740924172", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "706370730287471796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "18199526506796726885", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "9269175963143039426", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "3691705516240577130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "13472532612464340803", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "12388375914105990324", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "11582534256623549131", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "1653274345637156919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "5893940382830835820", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "17700958439420868719", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "12730339458081890990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "6631816968511312100", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "7000524935770116969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "386749666417295495", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "7162575953766465459", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "11398019086259011063", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "3041612155708729812", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "4274801141127703532", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "4865023158176874622", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "18424912460022156378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "10408322429232132983", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "5277400567128489977", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "6848989271874647093", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "10085059621136526248", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "12962552332511702682", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "751912075185318190", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "4505008254511324231", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "4191326605459754690", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "9824678205469832038", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "18245935804520236353", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "12309132521191764927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "12843671306854567956", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "8275277322582733101", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 81) }, - { "13698389420396031586", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "12949204491386872217", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "7370273921473161914", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "941829593638869991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "16206791915939407806", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "1500571771538985941", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "2095802691829304676", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "17542414935564676110", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "12380856644683171627", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "1451466106918423837", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 81) }, - { "8071957466247137919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 240) }, - { "11661208196482963286", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "6635217802203685464", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "265124365266629363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 83) }, - { "9513032457323269513", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "11814740669468421049", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 81) }, - { "5221320470007950766", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "14359530849521980269", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "6181651715051152713", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "1450888744802985214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "2842103889477438816", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "14006248791647711759", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "7072606962946873975", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "3599823735065658574", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "11311859068168414878", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "17525531790109748810", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 83) }, - { "16749148369456398030", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "17556238490521153146", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 83) }, - { "6067904130482758510", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "1791615587935799399", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "12985650543127289023", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "6714886136800883594", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "220326805056361171", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "6777045876155144709", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "9454512817077883797", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "14011124615649605281", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "994489782629179836", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "4338023436590582323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "1152693503778768433", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "5994204139128667921", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 85) }, - { "17243576882981097341", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "5524218746051008792", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "2669822154816760632", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "7179714714302073459", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 83) }, - { "13002363400738122017", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "17006095064160484022", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 85) }, - { "13733327241591630239", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 2) }, - { "11942736969933408358", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7869779894480025247", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5735608687257018419", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "4346591404756288097", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "805131056816361237", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 317) }, - { "16910952799476896905", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "17512961503976896701", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 318) }, - { "4773077837537775324", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "12193395770362986433", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "5740745357953479527", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "9040145293899470160", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "12755692101476964677", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "12467673564660108244", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 318) }, - { "7432142107544210174", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "7232326270078161768", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "17238880534517721334", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "7235358742317442134", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "7548031489690889629", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "5040095338370816349", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "3816674884393241704", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "13919204232414535363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "15589007878875898942", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "17711453305763476458", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "3501882025888946886", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "1171681987783013074", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "17585206779958265260", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "17046662043776372746", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "9208964785762052001", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "4435224497850514394", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "16728762255357411770", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "2968439898708528834", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "11845189428639322474", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "16616945998593626851", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "16490405739040977260", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "4974320417566990034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "6428098122005804378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "17281826959243966826", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "7369903937189508744", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "9111988592015450418", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "9119618606914671839", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1711220333751274603", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) }, - { "597650904461183283", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16888412539296862194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "3350601287664242323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9702618600245321109", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "17649961873981897621", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "3244675355773468991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 220) }, - { "9340159617983543624", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "10570285542015420072", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 223) }, - { "15968821946892330559", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "5509395737020858006", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "3806131437010910920", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "4523064418696274869", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) }, - { "12004552919019936392", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "18313088176414428990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) }, - { "5649150695527000655", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "14985755375924972050", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) }, - { "9441060601228656341", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "11421180829679625737", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) }, - { "15770767768674603174", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "12055647521556218046", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) }, - { "17908444616754154471", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "5568753513029409478", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) }, - { "12417253210787537988", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "4046830923427667342", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) }, - { "8108933468437926367", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "84595904778810418", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) }, - { "11756650366229979428", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "1617135706549276688", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) }, - { "3011188207492335920", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "12450814729547235386", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) }, - { "1157947252370351851", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "5374664689223295796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) }, - { "18215430801133520364", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "12936220888307335332", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "8746621720912032145", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "12003323477818208825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) }, - { "17170858505976681742", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "16566128345135114558", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) }, - { "15690161340392005765", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "60267878504897170", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) }, - { "3501667344669686338", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "8690196189594920365", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) }, - { "1930929857644673460", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "9671459469252116568", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 220) }, - { "3266557807508325807", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "18041177945345031826", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "18267428053198215471", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "18417288692814472127", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) }, - { "14031009077471784948", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "11666250400445971335", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) }, - { "1367483816197881270", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "14248239982355212178", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 220) }, - { "15820359925623438341", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "15216108478837665623", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "17489680436564779197", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "14117801387057507639", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 220) }, - { "12831123539633580270", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "11337525286386930242", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) }, - { "8431759922045602848", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "9601412379897937608", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) }, - { "9152433123828445089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "3118602494449249177", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 220) }, - { "5159738930501638535", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "5060012838564094182", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) }, - { "1905758333157310570", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "6870942166356599956", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 223) }, - { "18067291256808591467", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "2826762745628486040", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) }, - { "11841034668170849494", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "3034482898462686729", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) }, - { "15838113905712517735", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "9407646138658641974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "15636128989267984459", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "8409488188696700816", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "5720964268093705079", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 227) }, - { "5922142661777925178", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "12900949103593247293", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "13483088320871913126", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "13960388312976163971", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "1843555260471832708", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "15739278428190392018", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "3868149953087814447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "6845814820599174031", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "6203765709597125063", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "12871555773123368130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 227) }, - { "1237920404306733800", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "7669403041163460089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "6791806088355877039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "8561261337239934159", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "9580986168276580598", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "4708035980731751007", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "13734043898517059207", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "3177304125602972370", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "15727611564408173858", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "1632416005093914709", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "12253049204822930675", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "15078168059698267650", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "12522495848240087966", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "5074273865983613482", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "11936530628363072904", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "7870154008378361670", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "3774285301357006334", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "4848143712599565301", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "10316451248440741901", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "733956743303342862", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "16677044352793659175", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "7075659071934895087", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "8803037667261582905", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "12421204749289937399", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "7330202944390548890", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "10753540518493641553", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "9999425239167488495", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "14001406016806064079", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "7565867291827884997", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "5941298590926032148", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 227) }, - { "10130171279527667782", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "17344974951998490453", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "5550969016335082071", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "3398322619007806698", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "11356842300444410831", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 171) }, - { "2623687018437195679", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "14077148976508649021", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "8272823732258536202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "2451712485584835395", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "8057302050645780813", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "7430073011895298582", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "5095827462645341808", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "15129834325410878425", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "9660812093766156608", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "15781622938833984014", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "1089679781525023551", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "6129602738379919488", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "5287076386757143976", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16076153317792960383", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "2108296560864415762", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "17006655627343469372", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "9404677451270692749", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "1372939511728986224", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "5311718276151327830", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "529543453251381109", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "15591167992985613695", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "15026219694198820614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "8258382025812748961", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "14810839157236175179", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "16117738994809548007", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "659846949368492111", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "5211191663202250117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "13418701036204748812", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "9714764457768279762", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "17310332946322628458", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "15975964562807570772", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "13447028922679236865", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "8337820318779061494", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "18136765667969393174", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "14821616804286068969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "18386376129938707290", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "16609136488331186895", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "1996860183441418841", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "6491244517639245276", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "16312223896859176991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "17833517350994024381", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "4226968857681929488", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "5141753233513623264", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "6860503758000008398", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "16489624657475712467", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "7862815466573236157", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "10679760989906275129", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "852092858392507925", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "6996376303337512293", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "10978173291465325823", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "6670327979947471550", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "11318913630213187720", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "123251351612308092", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "10784073615329190425", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "2261453441277654139", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "2937907409658060025", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "7852144838267007144", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "4408772370026995920", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "15411474884532403722", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "9462315044265139531", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "6419580456182610836", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "12277470820821378855", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "16865879032845300007", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "2862999234347597091", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "15447513376965243034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "14420809655798184553", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "12954154886708228545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "7575634241190730697", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "2344498602308448450", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "4304041922043496030", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "10971070835319242371", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "4862529593282936100", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "5312140481706133684", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "15522785615618973614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "17798636687709019154", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "1938086876393565238", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "11897113890115321056", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "14363654136811880073", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "3928266232090746643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "15882969506682501496", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "16426179645101678763", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "18174857480705846286", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "598390166442977699", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "5522698342845820411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "11559360678008060513", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "13184662326021747000", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "16037141448095945650", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "15094664469997373662", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "822162932339827810", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "2597453794298356435", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "15851356529373376076", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "7966454753124154534", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "7311120574972466702", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "16461809076899645037", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 2) }, - { "1591199515536783245", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "338716975932676215", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "12165079289914715018", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "348058686961206025", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "17635171685500922207", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "9643408025778914022", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "5145853681977610916", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "15155676074658242659", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "5269172622193124300", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "17037462814585846902", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "10100237101982273901", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "15322609677356616580", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "3399406641489305996", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "10187930930336324253", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "17252589865292797082", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "17922279129043570176", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "6323083153920795679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "9277176009071334860", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "4313392430539923574", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "10883341041912056319", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "17310409067211414565", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "863057075064640334", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "9131235538209388787", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "12868739680413736657", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "15901724303713479611", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "16944335478353845609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "14025235562200209723", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "6556424924189200804", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "14398854364550406668", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "6577505360421510286", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "14098811155652990436", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "15530407024531326375", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "4466647043226271996", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "4121109463284708890", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "7916244303189113815", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "12309955719964788034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "10133054058562198093", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "6294240435687565243", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "10178145641713631806", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "7585184325339753737", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "9222744127882324405", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "9542325095876448686", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "8155268141318893606", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "8541982562061181756", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "13472577372534605883", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "15980348884716629349", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "9737565171095493297", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "3622409603053918029", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "5657471280535146301", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "17025324057045572535", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "818998169319147148", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "1680468564927032670", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "14466032674083938714", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "73865742350616903", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "13833960927635646899", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "2783577080556699089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "3563872903821081702", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "4387041763614917736", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "9714508918051740792", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "15412447128995361859", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5965451243366505522", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "13856271274572142709", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "5156033406916344703", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "1018687388655376483", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "3779229442395464456", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "13448845356783404653", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "15578456771467281881", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "18302892230881285207", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "9737833587413114584", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "467975197394411990", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "994842991399671507", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "778476198101178556", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "4769003637955328938", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "4914474312076193952", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "4091702228990140696", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "7602222004475424358", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "14544219140091420262", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "4279062247055842367", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "6603778920476932267", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "4959403414256988744", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "1425953627379976115", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "13477548641580029772", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "1963081583851864291", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "16393176054374397767", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "11132679855317294753", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "16000753982895054944", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "2727175120437582536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "2921118493468368908", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "11626398907755088688", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "3224352307778512793", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7780140599533242850", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "1270307036687208396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "5911282942658469852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "8809017515482311843", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 280) }, - { "11655994466278963438", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "6981537186704688907", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "7903891232234389925", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "4229105529069729944", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "12796777049340516563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "14289048840489035546", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "4239133538073498792", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "5103094815475470596", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "8560635685184432720", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16264774056719724826", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "2571882179292959757", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "16758962840329202004", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "4550028191070279999", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "15661322183507404821", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "14650567822254940018", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "3755253206085028904", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "8751016391945753900", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "288853243482418538", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "5047419871737940985", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "8819268903800581706", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "3746573775462003750", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "16286085532892593349", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "16547425454653232058", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "8195881973746570408", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "7712831597869354170", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "17035903590837750750", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "1907439276166837309", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "3036808833459559381", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "17928043901784474130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "14667209474639064623", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "1701609125136907870", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "2140514316203117958", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "9366201112659847392", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "7808544677773370430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "2251029128552117936", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "9529614587861271730", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "16811402686462277562", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "10554266898346470422", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "7817036102984218692", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "6329618009202266591", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "16936366288366370882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "8025053805734757314", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "534032316469702287", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "3963106895592011725", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "17994361454416813294", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "14902389080201926109", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "3796274347773622633", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "1306339989221885682", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "10900880512948479338", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "287386909600391846", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "17542176922797334839", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "1081962464388501987", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "5831419373611158773", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "3179874645565098825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "14906458674793172507", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "1934379409955686502", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "10178951466584845110", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "12693511427898130707", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "18137106379929135901", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "11619548409913646265", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "13317417676446624018", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "16710651492402564794", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "10967218651864700933", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "5381578460674280089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "13026555349791486777", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "11913020016435860608", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "8260130048649729185", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "14133958262039763609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "5585398540591396124", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "16442107352245114876", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "423221712829930726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "13550435052563656432", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "2440366541074371090", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "8300655194765375060", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "13163146272900339330", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "5406129421969383274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "15118142492742177336", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "10727592780669452048", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "1076005730007872492", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "13699740641705514374", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "13054405729329143152", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "13503608041359512", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "14385185911482960528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "11215217005872946038", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "4099859307693687554", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "4408600136502382976", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "3037042229494600258", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "1155389358857780776", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "11461581290174106570", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "16896833230469488924", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "11469881811044037340", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "3003526572122876385", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "14251848023416168295", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "17248756229500447131", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "929378940515745198", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "12962558681443556219", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "4481903208484313806", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "13558618754911056302", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "11455518069358829249", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "15890473622821659630", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "6942622405269419082", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "13890118723041457532", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "11292995457386147494", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "5077214229434392730", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "17774424004510360936", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "10412588668458621135", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 2) }, - { "10771803503544737080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "142650579335909103", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "14116800584981026541", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "12995903177757437362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "6143200133853000387", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "11893541520830049036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "6310724136390087834", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "6391201577234440562", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "12058759356433220258", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "17152614235879767116", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "2111669705686676421", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "7333511810266504718", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "7397341452130124383", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "2939605281692583169", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "1644335606100150388", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "2394023805427701338", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "12531580106484042446", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "15586047342916704364", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "15779837958180258409", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "14123081378489325832", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "7818381040882768404", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "12510951219501865365", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "6156831095718536092", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "3568514382399560386", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "12065769091972094756", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "5321698540631249776", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "378801963103874857", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "2149582237161177965", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "2770397466252831892", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "3039528482572243879", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "12577421746159122264", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "13553263424160050064", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "4021558014531645922", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "59356084516953804", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "1170380397764345558", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "13094402291968806996", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "6713985030102340818", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "8354579049246302728", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "13815395589135469450", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "13558656230312558247", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "11666226259183201584", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "11451740938287179908", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "273242667845386507", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "16587061389996963349", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "7119182041840303390", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "16292848987976256449", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "16437124655147660375", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "2495655464941634884", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "10294610483561043024", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "14403132596827435096", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "85050336704401597", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "4450409744922989123", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "15528692642731712121", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "16661843849495077745", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "852015206582470545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "9813748068195103720", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "10544034939133448916", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "226601879759378771", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "16432425079146486467", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "7274179284676568361", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "5184121466994451498", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "3538679039078582272", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "9920155432685318259", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "8859895010324601937", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "14026537760442360645", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "14349625788399542568", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "15065019229949449623", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "14115742296883450319", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "16748662918272106932", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "2273992727647793692", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "3190494353583341446", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "8837721075413149240", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "2817919813339364130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "14263790627243107300", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "12866217660635921034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "290134020607738418", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "17207560805775399864", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "5245526691775741296", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "4933831571091731212", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "3872151366780051246", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "3541538046227217664", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "16182470664818268848", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "8519354640245415816", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "6222595759158615206", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "7201521533301617290", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "15497797842820949408", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "3219408878901707426", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "2188101366183302888", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "14079654309452583394", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "9250410390663336388", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "8787438180071123604", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "11799179287124317845", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "14206076551739831333", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "9468684953949274635", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "8543619733732987550", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "14159596290442764023", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "4378422094110940766", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "8505040075968411726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "10914921540144371519", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "3515437649977762166", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "18035673326929466074", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "9390478179772073718", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "6254141935545262078", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "5955575949957198434", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "5600128039063009632", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "14114380593731243715", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 168) }, - { "10728212277329722684", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "877436308867220589", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18375125668176498051", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "6767245864232675168", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9287404618748313247", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "8728178019712933221", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 297) }, - { "18251360413872841969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 309) }, - { "18271689282126907793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 309) }, - { "954796765467489259", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "13597240991532942069", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 299) }, - { "5079055505117153635", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 317) }, - { "4135003545872878882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 319) }, - { "11883485911218628865", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 297) }, - { "2242915551775617989", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 310) }, - { "10556089809203693400", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 309) }, - { "3727142736386026852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 311) }, - { "1622880009460832832", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 299) }, - { "4437258459981739942", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "14691372262153587653", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 297) }, - { "12181607120522804433", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 305) }, - { "3159681096461848644", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 305) }, - { "6729785110495533200", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 305) }, - { "15322019609805777935", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "7024495439434892956", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "10416622008071151225", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "5796500397424307442", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "15702382940521972117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "6093575518270471235", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "5805383505505929391", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "1801731858063091191", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "1559798212423183813", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 177) }, - { "5594180958505308003", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "4766071144928072260", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "8650948093564284852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "3883845471211207871", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "4366168099274266975", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "578703329577922869", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 190) }, - { "16863960779539003201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "15450609897480659306", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "8203550467004532364", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "7431849514656037251", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "14484890926084856480", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 272) }, - { "7777333052643961206", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 274) }, - { "4424217045094988504", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "7994179151788368291", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "15192024816519005250", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "4747159205186229582", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 111) }, - { "5485971317082563152", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "18128162750557822655", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 101) }, - { "12421707187947291166", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "792684262493086891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "941626985322260281", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "11868551452004726281", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "14352303529756685990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "10702234389482091891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "3895088069642140043", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "5334566325056222430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "8306337702797456793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "15720507574336564201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "3277243911383750280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "18150429561058646714", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "11169292427557543138", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "13933912937625580405", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "8792010676469476740", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13190888313721073437", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "9477562342190423343", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "1202292109713947702", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8640150341228170279", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 319) }, - { "12757611260347801001", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 298) }, - { "7183578232279711009", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 298) }, - { "8984436655107983227", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 298) }, - { "16397733032387984819", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 297) }, - { "16364494883229084045", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "11800783548769329949", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "16065744898134487748", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 319) }, - { "15800447082078291243", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "10090036431487700311", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "14045927407431718832", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "2162882863309264684", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "16579057939215877904", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "3988024997010367546", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 317) }, - { "2066731703492755469", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 309) }, - { "13781423818051299677", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "5211831143687501130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 314) }, - { "6863331059471727622", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 314) }, - { "6403698142681887543", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7481256533438761028", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 317) }, - { "14091610802555875119", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12024143207855886580", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "10170577772376890221", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 250) }, - { "721174714308243785", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "15809639778580769565", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "16667887002111125871", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12790570304622911607", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 177) }, - { "8567667881970262923", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "10576856554114055028", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2777318471329665162", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "937159502066696999", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "11087413527078604815", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18186615266760475767", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 275) }, - { "3833510944499257797", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "1218323229202187514", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7683334381958571864", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "16773645387243701837", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "16958329690837977102", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9452470718398027950", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 177) }, - { "16511393582666965704", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "3216793152416217495", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18416908414174464784", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "5498839261395459224", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12198263593657033426", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "10014448860206587805", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13330734840729670622", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12676167240795292217", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4850497746076450913", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10016815108730511683", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "17948637243158994878", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12259844988981080505", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "15078590909693331731", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11988285441493553006", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13851240591038949807", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { 
"16588325081458426169", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "8642107585829380438", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "6219075471508685758", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "10546430708947911124", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "2613462626256090659", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "8295126647635181949", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14213516751025324346", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "16509472637458153234", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "16589607587365212240", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "6988674007771237080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "3448477246688526708", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "8507854696766492454", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "8906588133431586825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "654122557966242717", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "10196332102593337214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "15831600396403741571", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "17808913959977434594", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "15548971488532746290", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 348) }, - { "13468713306678453952", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "13613399861925108148", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 126) }, - { "17802514063213000148", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "13093429681061786539", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "12247991248100147706", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "14491949194619001237", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 128) }, - { "7590767013583950613", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "13210604117940125947", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "4670443882075998209", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "2857337999074313592", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "16036386660666696362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "755414184406250882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "12190841837604350271", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "10292243973236220688", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "17793292063552633023", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "7605139219344415117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "787363431787954804", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "7000486794832106857", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "13608239208821071914", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "17281202179589913619", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "16985912104363932350", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "14744368497944610864", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { 
"3737552767159920174", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "3792945601873900927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "1364546124782880196", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "3689722043202617487", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "2632535010129224704", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "10968768803038046390", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "5353552956675518468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "7866128397931438774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 240) }, - { "18233660940545931789", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "11670430946096342056", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "2627779045483019709", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11066913713501760080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2552187713769926425", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 81) }, - { "654821507679356726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "7606728651572102823", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "7549378486471456156", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "15410074937424854348", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15114370307779942381", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "2040762223425679479", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "4803370483104261655", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10415046594066474634", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "3441335188113424896", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "9277610800970567810", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17179609670678746034", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8251544171504007740", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1353170363915443814", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14540578324750869319", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "13471752029049484143", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9062774198518904260", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17917978116807564183", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3017411837779243878", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12992061224471212714", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 348) }, - { "13161997040644039778", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "11724225282274130518", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "12822126914959112382", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "9423958333298993923", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "7307271009495440764", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "17746215841755337461", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "3976736548270395981", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "1192279884248226739", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "5538883245745495145", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "1173986078589662704", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "11031358859656806724", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "4238885454989272754", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 342) }, - { "8943913562339525413", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "6931953332823066530", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "7799984350284425885", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "14204609663091442879", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "9091110033424983286", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "15829095120243431195", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "3239033622277917802", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "7578177053220150569", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "1089944493540593798", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 342) }, - { "15529757761327002288", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "18082422341304348326", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 342) }, - { "17219920118109316867", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "12026482841341343242", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "3070859615622845671", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "1778345646142852816", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "15188570678726970998", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "4750513665628842598", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "3372770576629463160", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "2983038203471784211", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "6673966852801136416", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 227) }, - { "8792202318168046223", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16441830491664937048", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "1419073145594317633", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "17525564757769958678", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "13468081302022888489", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "15914058104244750036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 227) }, - { "13760645810144930270", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5963901433137582265", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "14668725050395069435", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "12112853999307505628", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4161612746310931789", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3388752887767453958", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "14046990030104971367", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "16230621843665445228", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9274179337770060652", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "5115134711994944288", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "13898821685774165645", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "3007637520820789085", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "16294825599850364701", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "14681717813022425567", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4915831715914920982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "12894240573737168362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "5448537627319798272", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "14389915292223442327", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 164) }, - { "14274685812676150168", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "7732899312577293959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "11956435900037329302", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "9263063714383940562", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "5824801192141531089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "5608133987357542077", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "15392077168521832549", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "16446533347502650316", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "14762599606783897222", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "709835724029986012", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "1572991986657256775", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "7398196853452900099", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "8140094412609934765", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "2659031931257084418", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "4640028527711211109", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "18172711677056449158", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "5183231560876991543", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "6821855018718422278", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "13237050834496100264", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "7164580481046523192", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "2490155559809645659", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "15430549683839591544", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "4553409514380460123", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "3041752019114501584", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "4161001033681779582", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "4764776977138392550", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "6882621854468565774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "8881135571874888085", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "14038261392627717712", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "628191607060767879", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3511588484597779204", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6904130543085920483", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "7924408980408826942", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "9416186718345824095", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "14719421757340260468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "11936419502418995274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16601702334097258697", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4800587664660105589", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12501619443242354860", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7104309382120208659", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2321148334382088982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "4914435717288687793", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4104562704039821482", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13308187548669026714", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "3603187029740446600", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7338229552985076723", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2161052921317193579", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6104380778870471127", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13710319251108632115", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "8096131027165540886", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11823205954749139338", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13403161389559730", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "998876398773540321", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9280431727790048190", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "1152691534728260611", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9101903304994333336", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "142270860894725256", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "621915374938805401", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "15746620724134970969", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "503369896500284129", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "7585785802379042424", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10486348549691280032", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "5758133252959371492", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "15117880293418979489", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "9120377367517042357", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4278280309700908015", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "9144487908815767824", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17408275657360833363", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "11820789223587555410", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9232653317479846765", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "18184621367843960190", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "15059549186302099880", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "16765994345605657100", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9869959062341950047", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "14343008518525689150", std::make_tuple("convolution_gpu_bfyx_1x1", -1) 
}, - { "3202085450628781999", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "17224104246148265328", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "7322472892320910654", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "12480527132372884168", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "1008476023750261156", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12589440296742583335", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12604104383683210104", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12782932626966309185", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12946540633035976364", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "18221867262301937903", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10171373375072694210", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17791024851737594885", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "959260710517842876", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16988275131627316108", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15048584393463312977", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17381516856910544374", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "5336120047683197088", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15897477855246170861", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9780938731831129283", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1473214668483422172", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17515573322312447679", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18356980026934328781", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 288) }, - { "18077281411861416889", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2543041530639980505", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16370218798911151331", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 288) }, - { "17316626950179740845", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 194) }, - { "10414903047695486119", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "2809950092498355574", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "12011982029561277581", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "11267742746905371769", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12534001599784153836", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1882052795393187384", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "419783127503173016", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 194) }, - { "14211903923555028634", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "10892706534058849825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 288) }, - { "2345023488044002149", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5754844816339228920", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 297) }, - { "17015791782274123780", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 296) }, - { "3706994659266083979", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 172) }, - { "13324157125165576832", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "12014527187730671229", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 172) }, - { "5170245731599664670", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "6854611304056079417", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 250) }, - { "1954052357826969119", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "17824431042110985323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "3603706453982734995", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "11992353959766718397", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "15163327502374403643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 193) }, - { "16758697697363920520", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 171) }, - { "10930115765550856328", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "14418429155823196539", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "1628593159980574595", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 163) }, - { "15675968397825708285", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 193) }, - { "9594594523961285945", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 193) }, - { "6634330132674952638", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "8434794604559592624", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "3150231129728961455", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "12545558125736154584", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "15485701086886851362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 193) }, - { "18005721959893562716", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "490233152678323691", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "4073467095502162430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "5801429077171542466", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 94) }, - { "14841539539334726292", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "9404953235624894187", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 91) }, - { "17995371099806008878", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "8961138963663532667", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "425744529089575241", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "1316444335300814745", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "761169277744593430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "3325727286860556323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "2526832080529662683", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 107) }, - { "15470013032930986062", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "12255528292506999241", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "13119479079474639169", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "12813978452097969536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "4991419288164762786", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "18210370419559876426", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "1616603916015535857", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "9748307611165615848", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11147573971701279689", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "10865695385270390803", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 270) }, - { "11999246609107242706", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4118073384938355655", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "12134858519320245809", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "2930898141522848681", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4190912926126844643", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "2929190644951986399", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 291) }, - { "1126499865206906037", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "13483175684542464385", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "1920070013712913772", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "10787747981914307179", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "7715649642603303319", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "5581428998642936688", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "7532088618116521936", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18126685473408206840", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "2878824076934639346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "6548949901446632697", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13609660900720370993", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "883436333317162926", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16293465561256937726", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4759671642533786591", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4903592553439092472", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "2581414750854621875", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11627532066884923848", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17983556812075120553", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9099720270958987421", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "8106738346643994005", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2554991397391195611", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13121297281694293907", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "8220168481755031959", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14502856487639608696", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16871004845988227014", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12015336418727455195", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "1984152634309440563", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14312549767853703411", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "403634422724914329", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "10751536136794650334", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10135458965276110244", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "2008424849669196225", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13735180250757239202", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12351866693978844266", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "6788311046557489996", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "14578867494693499627", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) 
}, - { "11158789938857558596", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9616636708366808604", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11069983292783104310", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "708747442142592697", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2780423409483867058", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "3160543867929843861", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "11305232900158601613", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12339692995143159283", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9316082753126682958", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15991460001131903561", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17647962002015093887", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4897448054295474302", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14184895905338394239", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15112599407339712681", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10486000767830001094", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "14999920879568237166", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "14799579913711096584", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6450532136308941035", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "14962768577232034246", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "1452597292381229708", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "7104756264011682902", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "7744787957569714828", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "13503688893307029975", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "9133263538092913983", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "1383899865465106141", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "11829442945690098558", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "12394049027081208902", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "12159582810513550491", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "17738299860390552088", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "797387385159110695", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "8757900457181374694", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "6048964584602891448", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "17882819773586674851", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "17829148383265978140", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "14711697456265712456", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "724953082687879224", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "805221045541170643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "8241070786700614317", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "9191832520273617003", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "12408889192918919210", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "4885944395876887711", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { 
"2651385050387738902", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "6303682540621797774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "905780459938651623", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "4476928353532757380", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "13681462437496627948", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "17243648226968859637", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11192356850081328892", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "9323825370872655346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "10000618285883395700", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "6418327009347170687", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "8528750110601691390", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "8061914949376516780", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "12992194515157698316", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "17870874477143985774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "16234606052818596502", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "9148379585489720669", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 108) }, - { "9270950131920019932", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "17001502418583498926", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "11163107409437069532", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "11465965972527519631", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2534408579674556441", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "18109284647478027063", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9849272539053219052", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "17382660912493284320", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17764033613416389758", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18431306649860116380", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "3699344686791530101", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "14151747022287993729", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "826850797666395121", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13486084204140096478", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2114599010013594942", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13251091004269229867", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "5240706676373148280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "17490188677223978661", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17854208422879910606", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "8767817856303586064", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "10672380526821947133", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "10730222715353420212", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "16683169947375504066", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "2964705957088952872", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "14885031472057965707", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11308583200952256245", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "7208008921815475393", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "7113777272518482528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "6334639534663495263", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "10151922632636937118", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "11560634267092054110", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "15914107501176673997", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "18218755616248669884", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "9987415314864002460", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "7667898603371717971", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "4403753181729432604", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1040030752340209480", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "760687670112194844", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "9803492989444302959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "216603198215625772", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "10899110544832584656", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "14447191095937730964", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "11130439225010714550", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "4325081100430903742", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4216958486055161753", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4400247897123856252", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 227) }, - { "2294800960010879540", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "5195511638783481084", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 227) }, - { "9545968464906009869", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "12932635875905153141", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "16925721317097534009", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "4398371999113956082", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "16347412180100581330", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "7877332346656934022", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 317) }, - { "6323026044750482867", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 309) }, - { "9761573038170759563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 299) }, - { "12098146032672599222", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 319) }, - { "1403617451623027879", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "9058996149754556268", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "5864250949922222051", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "15847413004526420496", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "3199841714087553410", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "4957638663977636791", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9437794960375526230", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "9475130054420979752", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 171) }, - { "13312514874803986753", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 317) }, - { "15997754881872769378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "1941341635794709702", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "10157866834809927320", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "12308359047798183133", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "2986189945936592561", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "6928835003016610382", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "10084794570892043447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 279) }, - { "15417738436777481469", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 277) }, - { "18377298651236993830", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7354234812009979811", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 108) }, - { "8656468860180713379", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 110) }, - { "14472187692485966933", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 99) }, - { "397770940444464146", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "14258499419905714808", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "17599396373608265826", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "12935563359569230797", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "4892959859293355837", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2802810524370514276", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "10290107543739998181", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "11587239927319376658", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 296) }, - { "9076758673133996959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 298) }, - { "10432365444137108781", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "13092232276822302626", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "14896875712028630045", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 298) }, - { "3236003754884728510", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 298) }, - { "12181889163404078773", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "4856470441452830056", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10022487076451608714", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "14811603003184578943", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11565861421381730304", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 312) }, - { "16577611471466452776", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14616969385577243225", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "17921973525603585874", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4617809377006148936", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "12641170321047008726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "5940337324384948573", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5738835498104275267", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "3499106702307464480", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "6942016672941874829", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "2173720698351153121", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "17201365233492366678", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2877521658768725103", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7689320135952025041", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12031180482028822765", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4717620775314557374", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "13800760323805415740", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "946479876892100082", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5039037192630609823", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13839116996827687373", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17037416417174266088", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "2321767794934000238", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "14907097142953816744", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2525260242689556544", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "13328449155966085543", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11856266545854830143", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15993427814066246646", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2100891581797371600", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 274) }, - { "12242618640422208652", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6133592828563353516", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 304) }, - { "18232278892738147217", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 297) }, - { "11992625045241269569", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 314) }, - { "12601126285773042005", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "7457899998356343871", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "6343888265369366589", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 317) }, - { "10791067159964399241", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 305) }, - { "11327097771110264965", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "5245308722062496788", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 318) }, - { "10792503079194374004", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "4818231379191523896", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "2198278382394812839", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "3800011935243649447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "9631545863582097486", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "1779941298820543013", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "3621930417735246405", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 276) }, - { "14435120971846098308", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 276) }, - { "2893564501191050837", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8108843303778211282", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 278) }, - { "3682813162987778705", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "15494543914974994991", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { 
"7565221050911842393", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 99) }, - { "5629670679897666607", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 272) }, - { "11754316727756881612", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "10990741293315393791", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "17024388383581997032", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "10302338806536775954", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "7915318733663535312", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "13702692566238948173", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "2909728331855309274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "13071545223094862275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "9631481972809246378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "13540002981450186147", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "7076937538747704750", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "2041212737963974230", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5308128387928804050", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 372) }, - { "8619526128410675593", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "4792351255949877935", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17759505449240263390", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "9584652777232392944", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 275) }, - { "9999955037598579164", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "15961487889420208188", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "541817615957967731", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "13853630125050609175", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 314) }, - { "4137755981477177003", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "16949056117405140365", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 319) }, - { "16014822406751503249", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 318) }, - { "7700321970687976931", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 317) }, - { "7056293586529818253", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3814584042139408454", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 318) }, - { "16992405636352406660", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17442105631503326136", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "9606639214735570069", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "7940369586324090841", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "8444259010311137762", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "15489746763312425915", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6800893510381991731", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "4156384238797998294", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 177) }, - { "11645116728396933125", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10912495395422146386", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "875400109066360897", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "16475247464223458061", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12700372241799686527", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11640225461345567929", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "13183380647506951324", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5242271874488296527", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "9488453013746383896", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9726913113016874092", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "15979956159651515122", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "9947449295659685973", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 220) }, - { "14230493618724018658", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "1704404203639481753", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10404725818204494388", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9767294641786972359", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "4282668574670785584", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18043340998699622388", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7148542290597073512", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9040046051053703359", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1077773457856682663", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4716188972902735458", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17343050785312683560", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5687802882700097624", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3524531620118359828", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 138) }, - { "5688478347124565305", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 296) }, - { "5504757952698692953", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 288) }, - { "13800387305792597325", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 288) }, - { "6574971185849732667", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "10573920781439771673", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "4992668316921598993", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15778834188130183853", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 171) }, - { "3062101811226530720", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "428659495445490820", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 165) }, - { "956022649859563080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "13410850301164057911", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "17423645390621980919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) }, - { "7802311886554362782", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "1172103288112689821", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 141) }, - { "17353894529222574441", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "16431857516454692096", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 193) }, - { "9100044555742394133", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 166) }, - { "13115589642140732066", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16190949264253468961", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { 
"7026575758396092435", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 163) }, - { "16761856644242716357", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 93) }, - { "6341197991729122563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "17087740929472936216", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 92) }, - { "10795104632256101599", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "13327653786981478088", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "1096671695414716274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "10774528268153772208", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "9525853014023664813", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "10632020369698615114", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "3234107167862677811", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "8708643228914766202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "12415368596357091523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 111) }, - { "1028160614515220430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "5927467766675317093", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "17742192339816511494", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11931568365395665142", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "731825454731954517", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15989894214714907271", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13478984039708550410", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "15773157615731010456", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16772854836230971016", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "2934519615045138808", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "4880150897829846031", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17889864541794448203", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "11768117585574496387", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17906607354577138153", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "18270587701371596297", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "18142462471803295391", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4815047491742617397", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4513063773753763458", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "2984726467649419856", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11795826875463204296", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "15675903059949404837", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "15817443774186015593", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "14558572801374416278", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15555083739490354527", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3854114166348568039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "3216877571075556066", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "739676584505475609", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8303211644727914658", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12908594497114706897", 
std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9918371346247634545", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10893432143734884603", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5339985303398206057", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "5941852872160795604", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 106) }, - { "17634966178519099371", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "18299254635579957284", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13357365044448426880", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "18135307303959376082", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14764715930784496165", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10979362792894404338", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15006321421735686121", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12370729327673204804", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10722677916294015259", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13454265023861566476", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7995820969034996638", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "5275016494706355806", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "10947686124973711385", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 59) }, - { "3349519148124496343", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "4003433148846544263", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "14973431782875808802", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11948858355027908365", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "17951403431757222177", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "6586872365879203192", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "11718418772370938734", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "989564341557094953", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "6942049339361951275", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "14555883089089918919", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "14808895254077106198", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13830605041347009953", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "11955992313739654625", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "16921026268702574340", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "15320845027635796583", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4014667229872705228", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 68) }, - { "2438374917504708831", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12391792381149655331", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12864558900883069118", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "7209217811135076623", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3272017687600371031", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16067605128297748820", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "14150012830816329527", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16218339663410630711", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2089730611490367290", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "8907982643256296667", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "804195263636995800", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11528417522960871233", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "15378025640603637387", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12860222041026638681", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11597391933877736800", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5042176052323856983", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "17010172246526353957", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "938848188161536107", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12725647706191463348", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12553441041059632729", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 68) }, - { "12782191856884962803", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15824189967727245909", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16027853590391209100", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5352061583962489055", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "2294318010381635693", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11055049031355432623", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11149782181562145291", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "2653651564133701304", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 344) }, - { "3526580286148537369", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "3985659568982275663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 371) }, - { "13642146548740074992", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 372) }, - { "17011363406405852347", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "15386715291503303766", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "10292349730148518173", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 346) }, - { "3154539627593235077", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 371) }, - { "6856130385095139346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 371) }, - { "2349007644347065353", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6146876760962332928", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17434429579652310107", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9447458159095730492", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8655883535274781128", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7272538316511343863", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17564338309805484464", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 148) }, - { "7881187047171099732", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15579919505002150556", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11583017348580874022", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "17915846724151945664", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "5319668297345215520", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "17208186152576814861", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "3633858263279042265", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "13853056718266488510", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "14759179293743468995", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16995873636564597028", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 108) }, - { "9438739171104456179", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "14429081455612806819", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "9819596940685093690", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9426665763007611385", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "794499287296495726", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4980217316169616839", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16105073808368936420", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "9530116228032101908", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "8527193566719173253", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16566214123371867456", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1470933384474984858", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10706267011822108376", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16081386644309102158", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3571959174116404960", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "12566041126392848976", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "7603872175048237237", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "18235209540858013173", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "14316077757957132678", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "10816637153861630723", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "9175450649281374948", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "17370158297470557151", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12051595062513871723", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "2967481531952454828", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12085348936192462321", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11951606039079763598", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "8769060267707904998", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17104611871050967957", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "2103882464623009432", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "2659712601063515059", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "9759380701896779097", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "13842309033760176194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "2418288192668085805", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "14994322266840011040", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "16402312692470500253", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 319) }, - { "16955653765071712611", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 371) }, - { "17830290099875088207", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "603883331897298932", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "9731370183088819573", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "2296581485980163665", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "15133468875250992696", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "12972798847556569913", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "17446505012657609153", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "7223801044761006523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "16511749893955141055", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "9485825829394109934", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "8130920994920685157", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "3573490922300056520", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "5479761740065152589", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "9480653639044390919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 346) }, - { "8739347545059610410", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13459514533473657102", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "7824524940405130010", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17796310681498690253", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14823616678465136590", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "13816104794723484993", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "846088275031979661", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "18125732229366977468", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "8464582977975377118", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "6290317420155851465", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "12696412964119109465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "4994591211723226974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 347) }, - { "1036010477232750453", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 348) }, - { "13786357802945430475", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 348) }, - { "1003101267609305257", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14991602704357959545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 344) }, - { "7840653268996892538", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15488340031228619748", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "5003718302026277632", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 318) }, - { "7693459946348737411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 317) }, - { "10536316961655703500", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "10765280349477640969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 318) }, - { "7447163906170805189", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "9319254979377483709", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "7843508201826629532", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "16395067736440127496", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "13820498543284008286", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12071914115316550349", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "12727541507197887360", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "17364712285968437405", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "16120988958246503683", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "7375461241315602473", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13282951481330978659", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "6181308879301978465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "15488550074426713959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 348) }, - { "4062706195708729345", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "11604111639041106489", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "10512507780534402341", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "2128612971571865547", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "8594644182487917002", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "15881381297320383917", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "6040286126398028933", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "13926122593957480821", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "6213386558868267629", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "4456004887590847716", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9642229389394495047", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "18259656768460999562", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "4983880246908724272", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 344) }, - { "7881579844586294503", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "5331173521406046122", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 265) }, - { "3285520504090196295", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "7143510787416483146", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 265) }, - { "18103534417093702556", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "9328223957245552723", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "11706446082856895571", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "12625112690264223217", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "2114232149447438823", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "13883044928774243663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "17636500109629107732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "6192955702438301372", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "13970935346154374605", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "9692654253261175490", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "2116913943188857359", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "12802517759474139810", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "13611054146745413536", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13814086981499638596", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "3106922888635965020", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "10509933181132310969", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "17318287523550546026", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "11806402239500046867", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "12353956380178079089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "875296362957469305", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "14912119584313592912", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "12494969618927201911", std::make_tuple("fully_connected_gpu_yxfb_ref", -1) }, - { "6344802942015047824", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "1692411934657235774", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "615341695338735013", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "10601835610089648700", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "13262672660175739705", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "16522364268583242080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) }, - { "18253784177599134876", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 186) }, - { "12319073009094248232", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "9954050478761346921", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 220) }, - { "4640696923527766618", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "1436052878894538927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "16011429608661242565", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 220) }, - { "4381329435655511217", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "13972357557211413688", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "6580334406272192111", std::make_tuple("fully_connected_gpu_yxfb_ref", -1) }, - { "10437599469161149176", std::make_tuple("fully_connected_gpu_yxfb_ref", -1) }, - { "4490223883171428014", std::make_tuple("fully_connected_gpu_yxfb_ref", -1) }, - { "2529786184394804665", std::make_tuple("fully_connected_gpu_yxfb_ref", -1) }, - { "6995235840871804844", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "1671208365782918441", std::make_tuple("fully_connected_gpu_yxfb_ref", -1) }, - { "13104509059416300615", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 266) }, - { "10090923790949378407", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3429844423226609965", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "706049518431331645", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "17193614571243427089", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3621424752591567930", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11066930104187448422", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "209732971447020989", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16044646335477470657", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "2172121470071868949", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3392693938352572136", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5495063314176654751", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14553856088069405595", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 237) }, - { "4967444801764057340", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 237) }, - { "12160764253455777655", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "17723621158215826108", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "2171768477223405739", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "12672995204641007004", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5622089373755094139", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "2129726780118554358", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "4160656836528944651", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "11052732052072367261", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "18432787283148809023", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "16172528828198474326", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16327433707667075261", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "2797723586312707948", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "8451212914744825089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "7025975403069487257", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "8913950860101596091", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 361) }, - { "15308578014507211237", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "13132804928635689780", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "4465781406991476376", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16266491618150971928", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "181006047500375768", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) }, - { "18140951659547259039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "272730229972987861", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "14898892437285105327", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "17252449599613270108", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "13436376034548670107", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "13787436604877398090", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "8873614802459592665", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "13663893159182636270", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "1361159591875955678", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "5912303851874077576", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16245760498096322525", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "9928406318940388716", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "3036512701943687724", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "5334291640387922287", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "3002986032379998259", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "16469788155263456039", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8709632541892447149", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9524303276541517389", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9354818521586974021", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16781127329510211966", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6351572488552853754", std::make_tuple("convolution_gpu_bfyx_gemm_like", 
-1) }, - { "907036267078333137", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11855070245618904113", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "4544242784357021697", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18218631037214746168", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "178353385245384751", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17658152048177750315", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "11636129433022017868", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 217) }, - { "2622434279674583815", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14335074487552883436", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11175955260573469979", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 132) }, - { "2732519635571994212", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13893789954946953427", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 238) }, - { "4355933224673863178", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18037918102910297531", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "16071723603031305677", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1697248235682953135", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7843498978148810586", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6767159196241633301", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 238) }, - { "5097818987523855112", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6623182990939010641", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6711878663358611849", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "8671491767142900139", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 223) }, - { "12164298124869114517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "17089801601582809764", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "75742659105146536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "4652136280940317116", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "9751582946441607796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "16706244336960642883", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12581879452540858313", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17443356777503458523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 223) }, - { "939718260623752240", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 354) }, - { "14131851237755716991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "7474639594232203854", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "14152716242882609401", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7998930863626763670", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10323345824599612614", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "30229601562833524", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17788367809717898285", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "1509728225855233852", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13139625572508441980", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "16491532291908469567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { 
"6355395905401306995", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2096779676054335057", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4217179485243909459", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "17101789600628162503", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "6139574161497189424", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "16559140502701231107", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "11459784003592366395", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "7869916853707978306", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 56) }, - //{ "3889519976910355277", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - //{ "12081835728078383819", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - //{ "14923692894655929923", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "12794369485239257709", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13338594271376045657", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "677249604491773387", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "2668729552208169959", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "13011676362747785816", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "4678607855896512523", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4356817283284529593", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1885075753696445410", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17806712457019493207", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "11862259122805366807", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "15201438563802430490", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "8132521728369930959", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16108573960501496757", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11086699387784339943", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4013707396889204359", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "11850332373794932468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "14763982961176216679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 110) }, - { "8207349115037232863", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 110) }, - { "3273748387141431306", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "580936360000782237", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "10682918518101379579", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "13178480813522103091", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "17109520309574369561", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13754408679115174221", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "16717713360264747483", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "1045854873741563331", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16767392067294252396", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6114241186364821679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "11241838709529552265", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "15192230303376521834", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 90) }, - { "5374969798377773063", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "592245952014430043", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "3114869763557037270", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "11254635684957519432", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "16816222375242496370", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "12809199739984715013", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "5040730152867713388", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "10429613013253088132", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15451919862187018297", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "7546586420552408243", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "14487682847898298214", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "3106710091841093202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "6458124573210430792", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 343) }, - { "9182897385081081193", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14462438074931673266", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "18133334552107213128", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "38736266675995457", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 209) }, - { "13654816209891478730", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "6263019986730305851", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 218) }, - { "12929981792125924963", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "3138374672801504481", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "4465701487417893814", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12977678792503377525", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "10879218241103462088", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2221145174704245189", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4635570915184713874", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16075006181495932250", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "3863816884636503247", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5440983284868981549", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "15428591250165788477", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "16567638487719493784", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 347) }, - { "18059267466971880386", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "10808909442136736629", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "5682190700442712936", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "712165731154577189", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "7469127846325904854", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "5926747396493954633", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "3477539135137665170", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "16235115911229280717", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { 
"17009318615658405230", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "9421643783312790618", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "2294026590516781945", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 347) }, - { "2940027113687311893", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6090625728451718945", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "5643908654122573882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "9065894438656900887", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "11185156002426041243", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "14670068483447729857", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "4623542918584461522", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "1143214652021653634", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "1434535531617424039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "17025268985366223779", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 223) }, - { "11507538232733291666", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 220) }, - { "6149673627320838019", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "16243196137456624852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "8059328623525062913", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3662747857062156477", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "314054598858070952", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 347) }, - { "14122213471825630433", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14985236276429954162", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "3265415000818832667", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "856877003890134554", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "14805540705424073865", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3788462090984291082", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "2715447739580688669", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "7171904645566467208", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10308431308942416781", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "8712136292276123857", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "8700574100180128776", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "17147293671640396193", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "16474284418841532356", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12461575861709234385", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "192209423643075326", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15490478608105402679", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3491333679577961640", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8176012042686275874", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "4282198629458668761", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "689445825453914111", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "969746749329671447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "16833026567865627676", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "13046322179198317310", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "6902644989079870993", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "10987953316324712538", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "12515465135362865565", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "10049571207493913006", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "3926585856863002495", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11275109735493317886", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "12238674883388043717", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "101401523793806394", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11007944497812650617", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "3240102173773280414", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "14883438809987378616", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13320675959188615441", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11975047184326016230", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2608363732937932266", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15943141845766932879", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "15486917753097743853", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "8317673282128335201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "10635659193402005820", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "11450378244355788918", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "2625969259447793593", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12207503176295152756", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4625107584562815965", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "1997392406402548974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "2524029454785583409", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4615708568396290002", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "5349415632630235233", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16108759090923335184", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11756881293845417212", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "17839839336294937155", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4703107905652287491", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18180820925685532104", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 348) }, - { "3835286851569826052", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7807983899017500046", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "10294185397756053636", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 337) }, - { "5519535335798045279", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8701248964531180496", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "291868903926685441", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "15239764240622554314", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "15963038745470172423", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "11428599290755097395", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "3180320769716158201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 221) }, - { "583303098958523195", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "3509487327001107638", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2649192407401044065", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7706714181281908433", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "15914342421266687768", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "1497560475414454618", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13485300684443803732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "14571022040013651253", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "2832268621630415376", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "9383182168277796969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "16487774205195979355", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "2226745622763268469", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "13809330759308309353", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "11634932044447867039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "318377908569897093", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7353563160591978243", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "2582625260054352916", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5609922876429907954", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12557015880639217508", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "11528310408333718862", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "1471837664358450291", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7351401242363888463", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "953306082374100275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "15759530339367380982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 346) }, - { "13300022131572486202", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15689502054035168040", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "16969463538496570528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "10237524128771958432", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7969848911698660033", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "7130694811424715594", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "8578747191812631883", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5197105253412476591", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3120553928584920777", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "4750894407873652809", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "12667014405537239093", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "13644681270630373984", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "15602218079503030465", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3950738240651133849", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "9101334153142718004", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 319) }, - { 
"15695415285791951018", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15493488989417521388", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 372) }, - { "3391032227732782982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "8951040603784899163", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "13804221028705631415", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1351033666248868977", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "11330591026581463934", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "6142707387281700290", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16117448559783537844", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "4531222427159927606", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 348) }, - { "3116068331849795558", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14389719202147508599", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "17053671692908867872", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "17025182465337728023", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "15035800097152337587", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16770615142634470903", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9378269524012289175", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6727930402459775131", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16362857896338778056", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "7187734276051878356", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13253775441326432265", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 371) }, - { "14733510474010040334", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "3336303478756453360", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16352331970945217438", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 372) }, - { "13484950419220835364", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "4674416595144505741", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14559308665571750465", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4542143431130171516", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "13189392239349392492", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "7009735776703529573", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "4220826666482500445", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "14792528369891965810", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "15287650965861631130", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10308175009371219583", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "2903605246599054308", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "9213563311267466388", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5019077257951332016", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2497756607567197523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "9285566577169147378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3432296808755992670", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7688176479120305539", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "8818070832398055086", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8787816339967963727", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "863952266514375915", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "5835634465164771899", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "15101680837342453931", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "1116274074896622552", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 240) }, - { "12790788016297794214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 81) }, - { "13538051178827008933", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 131) }, - { "16403423801823379909", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 94) }, - { "3723613341885592267", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 6) }, - { "3830703844770425343", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "40704767167309552", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "13973028408397200796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 55) }, - { "16561224775421968533", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 44) }, - { "11243840588602365090", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 3) }, - { "14103112843209793966", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "10483664832302187567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) }, - { "8100595788531468781", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6620782733027313312", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) }, - { "13526488884846845330", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3534971503826416049", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 374) }, - { "10425889533411573166", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 319) }, - { "5214654427283761256", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13569941893504840630", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "1318571118468536310", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "17724604495865223459", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12229574562535756991", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7264274394359484318", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "15069906408448814772", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "11857037689248685487", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "7977195117668583981", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15678385128478075284", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13025361884606488732", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16723478941106779069", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "726985753660756762", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "586947787345351152", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11418379777288974452", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2575631797904040925", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "6288489890578212082", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5649082203775427830", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { 
"8036474422877454869", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 187) }, - { "3711525118850629466", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1875764913306932583", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "548663565933738403", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "17329287216741045059", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11848462434662954749", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "7581174843529024536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11334122788337402526", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "7868973874302246233", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17209528805596238905", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7878605163588288309", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 90) }, - { "5941092474669713339", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 109) }, - { "13738760763969959522", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 107) }, - { "11988546375476924356", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13680926356824317761", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 52) }, - { "2530317332900569142", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 54) }, - { "2891736961665476908", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 54) }, - { "18008552719153887303", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) }, - { "1299545313185409227", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 3) }, - { "17907223570737272640", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 6) }, - { "6949539207944972855", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11207257238719531888", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "13898284586432291433", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5120466856097219243", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4197617702037834389", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "1249137685908951501", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "14716719350966652036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 346) }, - { "4840004190985490064", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1540041682425757361", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "3715177305271762194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "10001963042016663554", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "481328129206881674", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "13404888565084206853", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "12348135936862667024", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "15471470494305051299", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "6181272224000872375", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4701832665603867798", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "2030309697153345387", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "15643135666029727865", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18180655791734632264", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "12990527753120735255", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) 
}, - { "5303970743736042689", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1596353239542510685", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 372) }, - { "8040001390872143271", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12052207771201936228", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "9942099207256025216", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "60509335250891515", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11499219760597131534", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "6726099352298108756", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "597073780328219388", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10783630257421062891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "6988492019664525206", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "7132328255408635227", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "4006884370026272807", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13938466156916423478", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "8689206546467098603", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "5644068493155655611", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4867937397499803072", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "2702144517025248597", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3304589333915676807", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12894625941923144893", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "11649407835105973949", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "4897991181236908768", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "12179581684777023804", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2806529556090896246", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "11327228813412934262", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "5485749317130402302", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3499243120652875549", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "10916647716124396856", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "5749536453225343663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "789359733867650915", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12626014184575881530", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "1201692134690347847", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "15249442550355454201", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2598267743388306204", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "7181154048972884375", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "10930640103080573253", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "8458082326743351141", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "584086621952390547", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4754967381316623440", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4353842547963164546", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "6131481289104111211", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { 
"517997325935712670", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "5600807544955072308", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "973966345068677905", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "8532217744217419503", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "14614844213016502202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "4126895998426674411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "9700808806849459216", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "2438261005924916746", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "4056971751486746551", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "8929453032482114162", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "7662200927459001757", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "11473442921040533207", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 273) }, - { "388828310152538138", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 270) }, - { "1643241486250690844", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "11806105193035393795", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8843585527713905568", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "13248567106128518549", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "13708979487306970634", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 277) }, - { "14406070210216948643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 277) }, - { "15352245788978088971", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 278) }, - { "1435153323458789173", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17638692805430115529", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14068780861332616363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "6656593119788274992", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "14695781272831602408", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "15696910741835640150", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "15315327794058441258", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7545013298074733778", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "4026686872534942904", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "6553736978928374036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "12129572274423886770", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "9723314434598141024", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "11031625790234068916", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "1138439260035360722", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8323445733669842657", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "54019631544204590", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8971115542951085891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "4584970211859494304", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9321208819255762521", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "12617625046664709483", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "8264178890341675354", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "5334190564423375247", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "14746359019867963124", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "2044363708106765326", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "5132761922124425835", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "8141428150264829362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "276407276027553756", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "11878734040194151073", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "11622925573287101001", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "3192332625020432602", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "9785114056964539323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "9410978119783758141", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "12523676912856063091", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5912451559447635837", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 141) }, - { "10264913782610095832", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "10309083227104422150", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 141) }, - { "8500148569566077929", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "6578908625437515675", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "13762042713029963144", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1561225943337590599", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 129) }, - { "10917498758625273194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 81) }, - { "14335423820860953927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "875142032423622622", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 165) }, - { "8965747921518186477", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "4428101657497677982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 126) }, - { "5779388310240896974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 57) }, - { "11092828091552833150", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 123) }, - { "10295330953350618042", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) }, - { "15901675909820977223", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4894227264080887361", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "381149736509958403", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 3) }, - { "7962991673727743706", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) }, - { "12725675221990905186", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17961702508543961900", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 318) }, - { "7082007579524697455", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "1867337342417952506", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "8931169575495985034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "16542318967217020315", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10626341369865893888", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9090828337597312855", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13621339501067135142", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "13754540732991287617", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "6669808855737023569", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "17640725195881101275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6928136130626403937", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15047676717402283805", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "1082574490068006980", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "6557428245898292304", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "9440117898128288296", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4672441137336208890", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "14289082888174784976", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "5056859994174498686", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16574710115918192418", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "15839295895890205274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "16307464696265537356", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "11910735867274493498", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 250) }, - { "14671212883301405408", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12028665820838352309", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "4773123925616969670", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13602140021189675477", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "7708321360699824256", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8609939102588915855", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "10782611933832492335", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "8857763129101380288", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "1230262279011217327", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "14424566003632608852", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "5497751772699578150", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "9541630719145326121", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 250) }, - { "10724501418439612080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "187352687850707150", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3438296636411972401", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "4165036357594592683", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15106614232165315070", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "17477062954520561609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "6664432489777052771", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "3341302541468955849", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11626402549863483301", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 138) }, - { "3522383297921565178", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "8651641584737798174", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "12473600360154597915", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "13296242326766100583", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "12068797674575015662", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6297802534570892679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "10037086825900566930", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "17216583849049249733", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "1287490919205560806", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 153) }, - { "738850098651678143", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 132) }, - { "7139714914586273766", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) }, - { "14050124896329573468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 96) }, - { "5429130923188159806", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "7953255701516490034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 22) }, - { "6195916781434462809", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "11025471731438443683", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "4622514167765722873", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 3) }, - { "14680730265621679042", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 6) }, - { "12141300895511301068", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) }, - { "17106086048442658788", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12707946849050970702", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "17154337492545826355", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "10109431802089940590", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "9428176632140441528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "52089503050497755", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12297371032753209816", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 270) }, - { "659150305191479097", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "2065752819810364738", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "13583166868754499339", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "13991205023798493715", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "8939683514448064461", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18337160891834020517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "1154228007901031779", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "15156525717629023944", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7757331094141318304", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "16779678846332091086", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "5409924335138540834", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4149728557142033774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 196) }, - { "6443517114667332732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 172) }, - { "5419041493176804960", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "15948383678216076358", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "9604982746455852556", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "15739274921308457528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "4642234334824303290", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) }, - { "13200151444914751729", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 172) }, - { "16894871557229780934", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) }, - { "9933958860597451711", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "17094948685292534952", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) }, - { "9762182215179534181", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "18273537339378756543", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "7720939595094113814", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "5865480930796299143", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 197) }, - { "10058165874008941852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 199) }, - { "17309326904418811234", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 197) }, - { "5592428580503282095", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "16348402367953880206", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "13607830451968188080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "9311802150474489673", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "5159470523468873105", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 172) }, - { "7975810844103449438", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "11455843788148231615", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "1410630713443793537", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17303408650780384587", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) }, - { "12069726772532946193", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "6204183474669103812", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "12874626654611400042", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 138) }, - { "13546876216568825877", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "2973436171295280783", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 125) }, - { "1908809004094565452", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2322559721899919275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 131) }, - { "5766507688771440170", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 125) }, - { "16626226341188424071", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11709992724966310174", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 161) }, - { "17222005830854879661", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "14224121742920800990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 125) }, - { "1071007164550012186", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 165) }, - { "6719302427415173754", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "10482582307328548806", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 3) }, - { "407189201971322683", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 23) }, - { "6531171505861182429", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 4) }, - { "879005904827468163", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 3) }, - { "8460847842045253466", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) }, - { "10488269059469838160", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) }, - { "11359409533744011242", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14813178380338948912", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "6307939332939714967", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10894058425957901202", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16610284927818475574", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3221469860582147955", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6423785822515265784", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "742689192890486807", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7349880498513046830", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "2369451367723962073", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "11690533591656807605", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9205978149692979955", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2728938624042183713", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2781309272856442321", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "579781312141502576", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12564687330941036772", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "8421388456873652700", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12177387334053203378", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11239541755868028928", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 171) }, - { "12776081190690731910", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "5648658688155716974", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12213354854947437262", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "5680236635030250712", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "5751283221740229986", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3646228701104397128", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) }, - { "13776178598632392721", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "13364676690016875118", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) }, - { "3141773224039276177", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16384186388687043048", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) }, - { "14421898375873029115", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "8922929126299811091", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10256831975351722184", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12590922530749026871", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "15209909241815414156", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "8791285622784082122", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "7474592508575297101", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12068974703657294908", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10682300249493137042", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) }, - { "1788455099959676873", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) }, - { "15225354446874994535", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) }, - { "3226193790517362610", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "15814015810740458605", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4129722446574108695", std::make_tuple("convolution_gpu_bfyx_1x1", -1) 
}, - { "18094205332383644037", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) }, - { "11120846960057008937", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "9195732599757736182", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 138) }, - { "9939234037869927090", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "5898740235388207878", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16694984452720336415", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "4889188980319017094", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "14412158605670555579", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 125) }, - { "3463959257726925426", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "15726902746983125797", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8463615810239412362", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16531824466148265247", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "3374410641320310726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) }, - { "9589942627115344216", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 102) }, - { "12864204111424196179", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "840202264034382558", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "16386955278777720573", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 99) }, - { "16267682394077585279", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "10544411879329675593", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 3) }, - { "9835739612255048978", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "6293403765897901528", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17596685300497748803", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "2150326211917340956", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1587501521145162454", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7561096442572829049", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "15078262396281327048", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16383540667048742064", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16820082917500285799", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6820284286806022849", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17285815901490707654", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "994182747184593564", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "6642767323474835034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "3215659303601163167", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "54975980454651672", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11529876081402974396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "10308113903347312964", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6712698149192186833", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 348) }, - { "14930789530046665855", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "2204178900998688268", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "17174919737114915467", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15154700439767512396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - 
{ "14916625550370402883", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "7650375560336513366", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "9999553425206328238", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "14026570177552137240", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11686670048744589243", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6678796313875454849", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "641417817126876622", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9622546530872848323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "9194788897910888066", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "522181557896569275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3332334993503432420", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16131448347558322280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "15924916465272239832", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11669828823444745889", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "7243917162812988891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "17891499682354369344", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14532519639619315651", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3635446784873718932", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "18275601715050791851", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6997971129340865650", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10722782762733112118", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "6585223640997887253", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6205240287062600210", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 238) }, - { "17522452942286240233", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "6571438978296387721", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "15511138074959300404", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "11107930597263802755", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "10320711719466983961", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "16884228931101540030", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "8253823502854784432", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "6025872155179042054", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "10173283505468233128", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 342) }, - { "16094174852600023296", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "11077876432364512822", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "10586018593856542117", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "18436249934780056991", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10179916356323479080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "1760391741350091665", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "3109104171383198425", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "18136135457402651842", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "11834683513280095384", 
std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "4806571630436601566", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "14849108908297747749", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "8490260671996115530", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "2929715823970060874", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "15924583510704449214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "14331658870024759698", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "6340128090694375876", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "1120455113299469776", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "17268201530818712998", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "3644282167178264526", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "360872770877634346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "16720108310653948550", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14353390922580547467", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "9868561386826862471", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17465517455679097501", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "5570311824197099845", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "7524311370696987092", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "14070988879848388270", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "8296551195150971668", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14352796912241296357", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "9840495023131952174", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "4720851194954041037", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17515064188391421150", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10437367877444543776", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "4362304842016958728", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 317) }, - { "383721620126444793", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "138379779469699309", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 348) }, - { "3759515057574218101", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2856601829807186494", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3286330985102373533", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "8159303545761286685", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "4056979460327024961", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "17823133607491820214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7969441643457570812", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "970768445746568749", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "13852065717057446998", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4342360467977736802", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "16336482874764861478", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 238) }, - { "6075691042233712335", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { 
"7570346182940928159", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12971822824884826169", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3033264172690274208", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17301887391757619741", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "15790005937034794347", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "15464327246951632247", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5659168916726488798", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "8079376692609682448", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15160738482264643601", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "537074122417021898", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3336076058264596420", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 238) }, - { "1982176363226079588", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 238) }, - { "15052577143485630617", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9314293064351558241", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4958835037528182801", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "6817494598328071314", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "14387756025635589673", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17536308070854915513", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16027456210394993913", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8655315308767111198", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4447065688824381344", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6843617687528352801", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1418595171949196661", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "17900257435531434807", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16789135236017252073", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13224814158106791463", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5078905972285278557", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4196367396954155354", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 314) }, - { "7009873605945341897", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7199295899520406795", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "16833854122884184025", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "14599780481362761532", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2572395498687401679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "11810221946429451169", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "18084635102736402756", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "59739211822469868", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "5240181393417899912", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "15962137123591591534", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "10989937450490049763", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "9798585825695496550", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2362092095402043749", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { 
"4444730303823507621", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "487214150851213303", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "745009493367761775", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 348) }, - { "3806761527342944195", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 348) }, - { "14458851250685872417", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 339) }, - { "7106362077449435105", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "5853697372844744672", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7603319690872333930", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4628748977913534701", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "10565371760124443824", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "1972879521448306536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "13893808009363736870", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6584960721513702502", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9220830217525628783", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2235210915304938149", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3930314908786112883", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1334070221835422461", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6681818065741882453", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6980201892073961793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "11530101016435264783", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "4801117903303888658", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "5782934278345953016", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "13951717514084457087", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "2721793280965260548", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "8124736388338424498", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "12223993560805441284", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9860570706348640782", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "991586070509079617", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7060804814325505165", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "787203599734115483", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "6193161166790398003", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12806934028210472719", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 108) }, - { "14043770215999952932", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15277856047844308598", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "8048617952947915835", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11446745541571732900", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17422822627612865758", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "13954144830230671601", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "11198908896401597838", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "5582896843095691256", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "8133587696326295326", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - 
{ "2007192658799516915", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "9492402787848610840", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "10515519878978734341", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "8747430148550634190", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16986358655784856534", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 346) }, - { "6109013751635776331", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "9585113116232600562", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 350) }, - { "3503893875515897267", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "13144385730409574259", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "743941460026466526", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "4492332228252010118", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "1920042803083729276", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "16436006771518788093", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "17567504672169904482", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "1989849521691057108", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "16706121580364790904", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5495776091407365966", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16430562172386510259", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5673972310424776040", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8797843396807284399", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "1698321314111848001", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "5762290464889692462", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3218248162832023196", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12988961529988078346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4232250144427804891", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "11683680166617045816", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "6252429564537528709", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "13145474177271090694", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1208161922424418734", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 236) }, - { "2762489653422414995", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "12937333118472722002", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "12917241193304093727", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "11020315012951440351", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "1518270620354036926", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2567046336192437734", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16409729623371222748", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1044978617045366709", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "8473037597903277214", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "14398366949002972908", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "7334966010680206302", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - 
{ "4161141078006269526", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 278) }, - { "6522575549211855712", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5629373398445592781", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13374993751390784382", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 159) }, - { "12976499206227689731", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "9882204352209412039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "5041111302824362529", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "13869716373706247686", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6438522646185979880", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "2406816735581074778", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "8881150100883636392", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "593712935037568960", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 317) }, - { "11970881115757095265", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 319) }, - { "5584432943673435454", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 316) }, - { "4560479630843098090", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15374625876485618845", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 348) }, - { "13102754309439605192", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 351) }, - { "17912189681971987483", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 318) }, - { "8153567933591966877", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 297) }, - { "1604661321386793876", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "8990561333549136048", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "12278364834477923930", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3122997634505472500", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "15669490019428002270", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 342) }, - { "116291934148608396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 342) }, - { "14729854278671832528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 239) }, - { "10591379189397010097", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 237) }, - { "11929531534620071758", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 239) }, - { "1819720745131968914", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10607904718265020949", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "913496537924971856", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "916389941321470163", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "1411786954276574458", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "2730604806511016352", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 107) }, - { "5843679089588930933", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 109) }, - { "7304346312452588844", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "2423754482456771339", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "3653156933813711765", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "5219399418946822456", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14217181622713951411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "13025323039227543550", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 199) }, - { "6114147683777615071", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 199) }, - { "2355214244972870639", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) }, - { "3167336012388169649", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "12218337369633748663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "7264756313770306662", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10492056481694320580", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "14281201038135286621", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8127190765748950828", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "142486914279119363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "1532263118203058517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "5482851829165191681", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 270) }, - { "10548792624072794724", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 116) }, - { "4239415134522959352", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9028970753877215614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "2324120381399737261", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "10267260789603562117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "9988801796928462423", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "12516911293946682547", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "9213886570531053949", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 125) }, - { "385046297070779752", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 154) }, - { "12541834857357563605", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 132) }, - { "475043738497218394", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "6351347283201596793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 96) }, - { "16290626406346691996", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "4569338575782832784", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7575675354187625951", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "5795073619189010837", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "15123868617509445149", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "5601435819039968726", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14104238386345631681", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17377293745073971167", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "12134712464763856064", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "5524215233998361104", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "1103228955716492167", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "8618835732380720921", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15908673392788376468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "8482147530539941792", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "9069334144391048686", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 101) }, - { "12493863403516600413", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "16692569816843207989", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 277) }, - { "3438116423688595487", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "15602863681196390535", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 240) }, - { "18277685132620834972", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 276) }, - { "16541722316343690197", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 128) }, - { "3067806959725855130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 159) }, - { "17791773192152464021", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 102) }, - { "13603318842632052764", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 14) }, - { "13131740479277027362", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "15334195300678132907", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "2038505773698938555", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12090536142661253835", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "4999505377862312410", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "3934290309368153435", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "5951936376654416075", std::make_tuple("fully_connected_gpu_bs_f_bsv16_b1", -1) }, - { "13204120207726209723", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "17108987360340581555", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "4795705973706796563", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "4084106758501882407", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "8127570953237266335", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "7500192998744460131", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "2379484884827231127", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "13933912937625580405", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "2040762223425679479", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "4800587664660105589", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "1616603916015535857", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "10290107543739998181", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "5927467766675317093", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "3349519148124496343", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "13477416097954638887", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "6942049339361951275", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "5303170164698694791", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "12494969618927201911", std::make_tuple("fully_connected_gpu_yxfb_ref", -1) }, - { "7875724726741958520", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "13835908664998757647", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "6407471972820516685", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "5385316497510064491", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "17377315194963069204", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "6580334406272192111", std::make_tuple("fully_connected_gpu_yxfb_ref", -1) }, - { "17790026124881397912", std::make_tuple("fully_connected_gpu_yxfb_ref", -1) }, - { "13314092088416047551", 
std::make_tuple("fully_connected_gpu_yxfb_ref", -1) }, - { "8479958930889587809", std::make_tuple("fully_connected_gpu_yxfb_ref", -1) }, - { "3750338655074082587", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "16683485007140805060", std::make_tuple("fully_connected_gpu_yxfb_ref", -1) }, - { "3889519976910355277", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12081835728078383819", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "14923692894655929923", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "580936360000782237", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "10682918518101379579", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "13178480813522103091", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "6149673627320838019", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "11077876432364512822", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "6062246008880097669", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "4806571630436601566", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "2458592904274981909", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "2007192658799516915", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "1051506168926530904", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "4163359403543480821", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "4232250144427804891", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "5415319660821122528", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "3286629188347536485", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "13575423234109624706", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "1841155673858789206", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "6708349666663292171", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "5083163738120585821", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "10572945270796129630", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "4436244774193918646", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "12985942652866621579", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "775538461106687677", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "9533360488591027707", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "8913823292181409151", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "11583985978586657985", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "1485662490111767875", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "11872464450773754851", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "5364060938737428149", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "2613462626256090659", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "14668725050395069435", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "17381516856910544374", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "6450532136308941035", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "2321767794934000238", 
std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "7995820969034996638", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "17951403431757222177", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "1074748462756364699", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "11955992313739654625", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "11939914680143672459", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "17806712457019493207", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "11862259122805366807", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "15201438563802430490", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "5374969798377773063", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "592245952014430043", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "3114869763557037270", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "17147293671640396193", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "6911215749850066204", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "2814805887448339818", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "1120455113299469776", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "8002233052700666718", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "16436006771518788093", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "11083993858285515074", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "4133424990380177132", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "1044978617045366709", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "952318454591754214", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "5762878778443755104", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - }); - } -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT2_B8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT2_B8.cpp deleted file mode 100644 index f15d59e..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT2_B8.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "auto_tuner.h" -#include "auto_tuner_offline.h" -namespace kernel_selector -{ - //SKL GT2 - void tuning_cache_1912_B8(tuning_data& td) - { - td.td.insert({ - - { "9832505855130134649", std::make_tuple("convolution_gpu_yxfb_yxio_b16", 0) }, - }); - } -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT4e.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT4e.cpp deleted file mode 100644 index bb2c47b..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT4e.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "auto_tuner.h" -#include "auto_tuner_offline.h" -namespace kernel_selector -{ - // SKL GT4e - void tuning_cache_193B(tuning_data& td) - { - tuning_cache_193B_B1_B16(td); - tuning_cache_193B_B8(td); - tuning_cache_193B_B32_B64(td); - } -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT4e_B1_B16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT4e_B1_B16.cpp deleted file mode 100644 index a020fdf..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT4e_B1_B16.cpp +++ /dev/null @@ -1,3710 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
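All of the deleted cache files above share one shape: an auto-generated, per-device offline tuning table mapping a hashed kernel-parameter signature to the kernel implementation the auto-tuner selected, plus an integer tuning-config index (-1 where no index applies), with a thin per-device dispatcher such as tuning_cache_193B calling the batch-size-specific fillers. Below is a minimal, self-contained sketch of that structure; the tuning_data stand-in and the main() driver are assumptions for illustration only, not the real types from auto_tuner.h.

#include <iostream>
#include <string>
#include <tuple>
#include <unordered_map>

namespace sketch
{
    // Stand-in for kernel_selector::tuning_data: the nested "td" member
    // mirrors the "td.td.insert({ ... })" calls in the deleted files.
    struct tuning_data
    {
        std::unordered_map<std::string, std::tuple<std::string, int>> td;
    };

    // Batch 1/16 entries (two rows copied verbatim from the diff above).
    void tuning_cache_193B_B1_B16(tuning_data& td)
    {
        td.td.insert({
            { "11207257238719531888", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) },
            { "6949539207944972855",  std::make_tuple("convolution_gpu_bfyx_gemm_like",   -1) },
        });
    }

    // Per-device dispatcher, as in the deleted cache_SKL_GT4e.cpp;
    // the B8 and B32_B64 fillers are omitted in this sketch.
    void tuning_cache_193B(tuning_data& td)
    {
        tuning_cache_193B_B1_B16(td);
    }
}

int main()
{
    sketch::tuning_data td;
    sketch::tuning_cache_193B(td);

    // A lookup in the spirit of the offline auto-tuner: hash the kernel
    // parameters, then reuse the cached implementation and tuning index
    // instead of re-tuning at runtime.
    auto it = td.td.find("6949539207944972855");
    if (it != td.td.end())
        std::cout << std::get<0>(it->second) << ", index " << std::get<1>(it->second) << '\n';
    return 0;
}

Presumably the point of baking these tables into the binary is that a known device/batch combination can skip online tuning entirely and jump straight to the previously best-performing kernel.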
-*/ - -#include "auto_tuner.h" -#include "auto_tuner_offline.h" -namespace kernel_selector -{ - // SKL GT4e - void tuning_cache_193B_B1_B16(tuning_data& td) - { - td.td.insert({ - { "11207257238719531888", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "12348135936862667024", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "1540041682425757361", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "6949539207944972855", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4197617702037834389", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "13898284586432291433", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3715177305271762194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "481328129206881674", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "10001963042016663554", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "15471470494305051299", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "14716719350966652036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "13404888565084206853", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "1249137685908951501", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "4840004190985490064", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5120466856097219243", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4701832665603867798", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "6181272224000872375", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "2030309697153345387", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "10728212277329722684", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "877436308867220589", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18375125668176498051",
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14907097142953816744", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2525260242689556544", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "13328449155966085543", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "11856266545854830143", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15993427814066246646", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2100891581797371600", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 275) }, - { "12242618640422208652", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6133592828563353516", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 304) }, - { "18232278892738147217", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "11992625045241269569", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 298) }, - { "12601126285773042005", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 297) }, - { "5079055505117153635", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 195) }, - { "7457899998356343871", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 291) }, - { "6343888265369366589", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 296) }, - { "10791067159964399241", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 309) }, - { "11327097771110264965", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "5245308722062496788", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 314) }, - { "10792503079194374004", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "4818231379191523896", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 288) }, - { "2198278382394812839", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "3800011935243649447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 298) }, - { "9631545863582097486", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "7777333052643961206", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "1779941298820543013", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "3621930417735246405", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "14435120971846098308", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 188) }, - { "2893564501191050837", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "8108843303778211282", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "3682813162987778705", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "15494543914974994991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "7565221050911842393", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 93) }, - { "5629670679897666607", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "11754316727756881612", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 96) }, - { "10990741293315393791", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "17024388383581997032", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "10302338806536775954", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "7915318733663535312", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "13702692566238948173", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "2909728331855309274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "13071545223094862275", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "9631481972809246378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "13540002981450186147", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "7076937538747704750", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "10290107543739998181", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "6767245864232675168", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9287404618748313247", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8728178019712933221", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "18251360413872841969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "18271689282126907793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "954796765467489259", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "13597240991532942069", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "4135003545872878882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 296) }, - { "11883485911218628865", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "2242915551775617989", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 291) }, - { "10556089809203693400", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 290) }, - { "3727142736386026852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "1622880009460832832", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "4437258459981739942", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 296) }, - { "14691372262153587653", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "12181607120522804433", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "3159681096461848644", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "6729785110495533200", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "15322019609805777935", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "7024495439434892956", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "10416622008071151225", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "5796500397424307442", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "15702382940521972117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "6093575518270471235", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "5805383505505929391", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "1801731858063091191", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "1559798212423183813", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) }, - { "5594180958505308003", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "4766071144928072260", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 177) }, - { "8650948093564284852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "3883845471211207871", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "4366168099274266975", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 171) }, - { "578703329577922869", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "16863960779539003201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "15450609897480659306", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "8203550467004532364", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "7431849514656037251", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "14484890926084856480", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "4424217045094988504", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "7994179151788368291", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "15192024816519005250", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "4747159205186229582", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "5485971317082563152", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "18128162750557822655", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 91) }, - { "12421707187947291166", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "792684262493086891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "941626985322260281", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "11868551452004726281", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "14352303529756685990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "10702234389482091891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "3895088069642140043", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "5334566325056222430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "8306337702797456793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "15720507574336564201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 240) }, - { "3277243911383750280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "18150429561058646714", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 240) }, - { "11169292427557543138", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "13933912937625580405", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "1375156980278317418", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13455881643467418059", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12788968383428254917", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "13131740479277027362", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "3390014193205017427", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "1270467775674221667", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "14462744723628661203", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "8203171222962341018", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9795194069954915563", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13369603621524676979", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "13575423234109624706", std::make_tuple("fully_connected_gpu_yxfb_ref", -1) }, - { "10721885719016335538", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "14567947256029724271", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "10749263296616139689", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "11717348577195224554", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9275303306340702111", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12245096462203481681", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "4999505377862312410", std::make_tuple("fully_connected_gpu_bf_io_gemm", 
-1) }, - { "15272426400992401555", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "9325097933807426691", std::make_tuple("fully_connected_gpu_bs_f_bsv16_b1", -1) }, - { "18238669114790278675", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "6664482192233202590", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7454366978268164047", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "16135569134646688251", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "10572945270796129630", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "17495198214524203238", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "5221108094913859739", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "1092633914190498221", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "2738256633362038820", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16689586259416414782", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "1525652349412826502", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17683302016987200208", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "5615525527388396983", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "3992735701291817771", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13208739898218342989", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "9536348721941264933", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "12803521018213865796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8854783036772473804", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 159) }, - { "6766480740724769248", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "768423629375648579", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "4044100281521441011", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 143) }, - { "873240542570331563", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "12875236165672036211", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "12008819728839685704", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 159) }, - { "2486645741683554648", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "368578589584714524", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 133) }, - { "301201776306602054", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "13152181652632422771", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "10311747599696543062", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "11258322449556590366", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14095734330183410835", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "14910223536998380801", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "3352689317181436056", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "15832740972576959202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "14732184525012592889", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "8421045774757048067", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 4) }, - { "941232110069825628", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) }, - { "8975333906619899020", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14800592533315327674", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11816277809167487786", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "957781751038897330", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10498289589469975939", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12970943403831707924", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1300292367195167745", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3399837016486623477", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16740871614208968868", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "71587235425438167", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "12717047049023783979", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 347) }, - { "10478482486372389470", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6056581247196718403", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3780320160034246719", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 154) }, - { "2819320453491169732", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16976464773806576190", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "13321672741246923341", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 264) }, - { "15140532227060261467", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "9400755775406101904", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "10292585962794261197", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "13048561902713182858", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "3658425022428447440", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "16947830954662293793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "8397584983137442239", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 124) }, - { "1071169341660439058", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "5326247361632903583", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6214194654733781771", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "10025839973092358719", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "16711955423531846725", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "2915165824085219545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "17108987360340581555", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "11972097635078477347", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "16926950874716567095", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "1212319037405620223", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12397280593466519809", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "2609454334520044465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1336940384521633733", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15271783562528081169", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "9533360488591027707", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "6930697835136176263", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "14444423571297570985", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "12643423612381102003", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 79) }, - { "18423051691107460439", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 99) }, - { "15381833359831622179", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12040626513219974957", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10647227605517025377", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "8127570953237266335", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12876112384009608387", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12663860560275361463", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12352923639732112511", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 79) }, - { "708452703070938673", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "394778201589371681", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2477849395789783501", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11637325834858582585", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1485662490111767875", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "4300306345092124175", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "3402183863499902145", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "3217246278485567748", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 111) }, - { "15713964605078748923", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12293786134765875615", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "16043683538361975370", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10670103699537731664", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "17854578307286932628", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11443268857010762276", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "4479117540570599742", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11726298758004767743", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "2968031010495399536", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3797957937905580811", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "1474271081523145413", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8526484907799590618", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "13723543003759101485", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11728824117049687850", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "13268525255152984893", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "14397348576352573007", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "8616686489737649890", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "13176385389367548697", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "14990645740260870030", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "7472330881076141262", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "10892456883214928095", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "9522661528867955338", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "17856816245251319111", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "14872992823083730615", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "3106591708459602370", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "11609821372586026178", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "7678457226823073886", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "10118395047539851751", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "5389189982064081933", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "1742897526168249500", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "15331103261044247142", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "6644418194983229139", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "12478309735214802531", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "18012549942299450620", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "11873734271080160669", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "10424278617647597641", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "9553032671453999824", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "3860603464276263676", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "1207026216972160297", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "9519623751582710696", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "10328182165125764988", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "2231648183489019418", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 85) }, - { "17599383258252980421", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "16208488491972128275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "13379165253894817165", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "2566302789609970663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "1478419046264331178", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "3087801652564627458", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "16103943009195163681", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "14230385851791760020", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "15293727142789007900", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "13973179950424276578", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "713121569924250372", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "7947870656736319919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "1663285216972929652", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "14767888121198814523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "2124033349728954551", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "8762901342272872498", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "17006133396401462698", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "10783981060353445280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "15110359240685619357", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "7875272450497189442", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "3281207855459771997", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "11932770338770247767", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "15860915170591763391", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "11716771904412649891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "1095495157025479260", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "8402692278765063674", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "509781001842353609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "3255465741612432300", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "13439896617880328331", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "7134654288295280046", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "6769243149577568817", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "3480732841490521799", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "18269685060032395235", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "15649927926091502215", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "69439315851965666", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "156456996459945842", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "3012566432840424198", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "16431165572426232677", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "6324565723045697080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "5390559917122707732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "5469227748156438008", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "17163158934005653629", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "2307310127637739872", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "1999979442136861875", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "2527189070714658176", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "8329846097322076175", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "16783619135298589974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "12214162812589030126", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 81) }, - { "9216608098626790565", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "5179760459095053114", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "2452226948562393335", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "4499586349553581439", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "12668149981216388765", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "2287356884312581209", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "11115684531624462986", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "6483208845600234755", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "3752171257634205726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "1774158624592967937", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "16881283637687482989", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "14749947225382670869", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "7351733901977025859", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "435888248913413834", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 
80) }, - { "13713406612642090169", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "16582132711225619740", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "10436819182310112786", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "14546281065004619074", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "12558716383635737426", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "12609361477548272638", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "8107447526839063293", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "10995907213890714701", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "4871907623235871050", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "7394217382008802567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "3880189981766119529", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "3759057398165607194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "4561874206785244358", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "488298169768725160", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "12956726277674279950", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 42) }, - { "7177837234452118325", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 2) }, - { "15031155621982459860", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15223164574152266895", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4834446692898125871", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "14766477690417085350", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4461989328775275994", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "10141927023849730720", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10837496380266058422", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "5012013738970489338", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16839741351990811959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "7846384623429362522", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9193880745263317167", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "2863465257341735941", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10447947790216991304", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "12024817951074673335", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13474805373264874144", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "671453551040072499", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "87031578643428011", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "14034525799882831106", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "10864011008000364415", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "5115007207028125638", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "2866656294663853474", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "7913076120244203725", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "15187035463799513424", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17778091287904736965", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "9562527071055150197", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10645625090439446714", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "9955939178447682108", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "7450417963648518926", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "6648876837655776653", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "1520529227443340435", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "4455369117448405874", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "2920840796593281126", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "16341722570340169855", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "15289152041466330689", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "14362876471450307424", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10330180429524641331", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "12046017161414846599", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17228810554159747400", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "14835309921389262864", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "11263540528012919947", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16139615240471264488", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "3820661057776133570", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17515847111676784130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "4252157815622916471", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4819131094439732065", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "17264010982688979937", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "11277866878590984477", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "11324651029379152442", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13425251102263428554", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "4571404165794634411", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12279771749366327372", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "5754396201681434378", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9809458159478958866", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "5459463503840817402", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "6484375582324852109", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "7005509036795164602", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10785966734346479177", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "15363606233048272809", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4890043345392707202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "345043289576587800", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4804533178560338520", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "13328911884191551889", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13302687772426736346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "15231987838322151865", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17214254645087272557", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "4849343880559509889", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "851057218719456209", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "331661172067077796", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, 
- { "3017824560305532066", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "13596876807637507229", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "2242602888499888844", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "13264617841270329349", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "11604794601689380990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "7770000755097925765", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "5008350851224686853", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "12166852830214895457", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17672785701483179117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "2439993891369206440", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "15822546325822628634", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "3056212889689424946", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12712071520541638451", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "6217542346826403576", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "6290584630172122012", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "1245259979364728404", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13006774775034887171", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "3725013268198063198", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "1359720957005310113", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "1354647381212852890", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10480527638577674825", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10883992248631603006", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "18255227391100087860", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13565691057064774487", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7954972694876158422", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "5118467701668427545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2339864165283480961", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "490931535580183607", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "150132162949295379", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "14795618530175274538", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "14126906427006602775", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "905526102343710614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3385797925880519845", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16238415425814188039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7107677063657303327", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4098191685457418125", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2936333406928424760", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "5539793555189956907", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10106454449619141260", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "5346898505346646714", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11807282628372660280", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12375919467924385618", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11705756153433897198", 
std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "6651389480007764007", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16911464046178654033", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12495003066477974474", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7650862961269327235", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10709828018763273371", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5044721291675005144", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "18427056032084727710", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1390379098099686972", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12054200116003751590", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9500850790449116723", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 41) }, - { "9057036344533510776", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5093049998173715787", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13761566845514364807", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "1594612401422787491", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14603590053512154268", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "10136369729388564720", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17050675313067213312", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "14221578799010900252", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11723735945517472199", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "13810995219720233595", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2704063557078535883", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "10384537928514123040", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "17427036330773218054", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "9796621763733208035", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "14046114605615338907", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5763440554939527411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "12892693137085610062", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "17775705003104146872", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "14878347463243157447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "7368916076070115064", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "3499109651698979012", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "190530884420224257", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "4202645222013675478", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11324851661119942609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "6232363902828992968", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4299492266819967844", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) }, - { "9481675228591993785", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11772741918108731396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "18419183012101393192", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17832542092610191859", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "11771014003680394135", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "9192665896782282996", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "9763310312421884308", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 355) }, - { "11430400968543668873", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "3430266954211750407", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7172604084103519563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "10306542963828398049", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "5235375820995365354", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "5091558853871982858", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "12914986936318857086", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "2265784112305305260", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "9019388470685749691", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "12427258337646070422", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "15884763176333003771", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "7211355951470869591", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "15399245700982979379", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "12644942072153919043", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "5876880412336151866", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "13775529405693629438", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "9048522050692986204", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "10642327923162019888", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "6410682026872155392", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "9454954846682513038", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "16463823433924519300", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "7279393739634103483", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "13358283026528078900", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "8032685176029570383", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "949330876419581703", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "17713034180977313726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "472454322186482185", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "2727219457659794468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "7852745450437172519", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "6065819201836017182", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "15984885011101717258", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "14811022197918391667", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "16146350476627599543", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "16173557782125372935", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "296142385116663420", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "12655099960717366198", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "7937870623766562191", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "9367157746678824712", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "18062849937960759210", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "11919129623429545762", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "10522649794540845800", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "1104489643524273315", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "5419775002149092646", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "9226912483632588371", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "4958222070605478947", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "4479979951990338510", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "12022152681602871455", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "5740738339752793113", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "12087141795291232248", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "17825280904760131680", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "3974589991022739479", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "1838534101161814609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "10046663998164493552", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "2305461098719675735", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "16504962609450876148", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "6345550009198921347", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "11239754372812258455", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "4347816192417741558", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 83) }, - { "17809920600993699808", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "16710010075465723498", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 83) }, - { "17729546848373991614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "16998508915819714690", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 81) }, - { "12952980509662451384", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "2683507674615735878", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 81) }, - { "13059207969254830451", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "16295660312557315941", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "14089893422771228191", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "18034648276860485300", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "17739868787095417856", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "10880081193716628051", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "15916505622570323098", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "9101018613418825655", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "15650839696475698676", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "15628121900226431719", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "14554225625951128811", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "3134489458855347772", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 85) }, - { "5627834277145735283", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "10729288973933590396", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 83) }, - { "10869005786136023160", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "5597908143491399643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "577182964135927041", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "16947969669087411530", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "861419637283812778", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "3643250372952944907", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "17977676737774695825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "10309504812060596568", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "8866736221671835567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "2133849627845285277", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "13902214851539825156", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "669771152920944125", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16921939234324970069", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "7649413902932043811", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5658664813683907476", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "10071449674652717890", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13352000946213986936", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "5291011077679733990", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1458615259705605525", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "543472136359161929", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4644580321919256401", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "12946531140050029900", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5010119207726811326", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "3308770992373192529", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "16913004986170202203", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4079026972040047969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "2683304757433993300", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3141886504884887200", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "14444475853714164129", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10747988576436391912", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "2722124265986526212", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "8856888761246057127", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "1902656726461670148", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "3337625924046561031", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "10280619408766255552", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "9695024256541464964", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "6733731409232284409", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "15805087418686802636", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "7056030150365552588", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "13038533272699602337", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { 
"3737576893817599311", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "8761283252495354972", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "17549411807772646930", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "13124342334495538095", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "8576733135863336233", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "1082586642383386489", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "3217574161785059951", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "18357544235608006954", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "13954821927253849036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "16158139166784964096", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13558687084677943158", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "13809898858049445969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16862145184923128012", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "693883892843558363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5393510569127725391", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "4533786844080178561", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10128143628088846123", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "5295693108687178880", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16425665058951535484", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "1398177377739338750", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7407975398526425554", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "8614534946699754256", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7372956570616880244", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "13676654389512816868", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9043982883185435219", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "1626430741965136732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15295951849706930711", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "1075027491444288875", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16084700435355748612", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "16698547937652264447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16729849855476690294", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "14171139920084409181", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4264284648458489052", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "8866716292621164810", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11828175723996627443", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) }, - { "11164519756679631743", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5558136691773431495", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "11031569203645035546", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4084026445911476156", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "3819990462129075757", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10055549084854766170", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "11657946392097042544", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16768797136991242472", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "12107262410635772120", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "938222258370511187", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "11727227430687227444", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1040650352205493707", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "1563987925712579649", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3870539490799697188", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "13170441257780067955", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17490471699618303993", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "13993548620104010490", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15728009639807698634", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "10991423760161409883", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7242013296950669829", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "11744368351982723504", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7314288062932060863", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "9299299311101549958", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4138968242532400395", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "4135068756462147853", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16247399911710810038", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "6020017927557041768", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "11265472910579659280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "12512751736409465214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "17015328096102652908", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "14147460733160099960", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "10811837819834149164", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "2173867324489962689", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "11198301748997371475", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "9741607635826869269", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "3860667078458481972", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "13590444711975157776", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "1551596771935253711", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "632116056424249698", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "3499645386058307669", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "10471519687597963116", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 102) }, - { "4429109491655891299", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "9439431829175743345", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "70580716590540876", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "577844026691991089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "3873183249402084406", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "15799159401545270696", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "18154019240019929225", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "1569043950563130463", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "4491380839102267034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "9243949750444156746", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "4772696293208603817", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "4927360358387344983", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "5770286476124511234", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "17084977396231597605", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16800575429414554907", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 10) }, - { "12793908914872030220", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15947699374684516369", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4660288622381620227", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "15914512645931208899", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7460672405409009037", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "1541754036637209097", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "89439319782574517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "14088382963493477342", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "18203935818408469865", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "13191096881934434519", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "7918742312252115870", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "15641537661939240413", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "157805434489791310", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "7941729567451949422", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "10628725059172743408", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "4492673409319122180", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "15857087373591747006", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "13793441296561946357", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "5172712078329324967", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "8780604510524622314", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "1760690277175249985", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "13649894122307008732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "17546566148752689536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "12675313398314286884", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "14621327324047759584", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "14136097914489095982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "7638626850074132214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "9399994156762372761", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "18068050257421269408", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "11830297960718214360", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "14959566236432790882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "16884396694505987920", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "17947818179123182001", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "9381304526221508530", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "13932662890258900896", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "8268533335852735248", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "17419874083634480896", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "12773693193167844110", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "5157249499936659040", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "4282661608732125403", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "3159147743553063163", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 124) }, - { "1706927777850488363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "9839670675413379092", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "6780215829176686721", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "12972634653821069685", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "16129296588866116913", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "18202222342562516071", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "15426960908024585800", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "17026284168840448378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "18118237182023167949", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "11113256687741667688", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "10555597973766215754", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "17517495652165026573", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "1832310305089212990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "13855438905855887272", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "15349944413643626251", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "4738743763536059708", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "16611452077660879545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "8101977280003030465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "2012181953284568566", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "2969389503332309296", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "14515066741400300669", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "9373353053843326128", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "10023279637210292010", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "1103204698908514224", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "18092842590142527927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "12174571114411168588", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "14431607479949498164", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "10279778381617181802", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "4237276338897143680", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "8083672466967374860", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "16705621644424684055", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "5352861363832390974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "16945184617367657570", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "2995134938466176198", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "11706378390483804857", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "7958459862276998225", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "11703557271443535142", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "5020788604681810984", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 124) }, - { "15217183882858251099", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "10650698451740924172", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "706370730287471796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "18199526506796726885", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "9269175963143039426", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "3691705516240577130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "13472532612464340803", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "12388375914105990324", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "11582534256623549131", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "1653274345637156919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "5893940382830835820", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "17700958439420868719", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "12730339458081890990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "6631816968511312100", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "7000524935770116969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "386749666417295495", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "7162575953766465459", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "11398019086259011063", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "3041612155708729812", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "4274801141127703532", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "4865023158176874622", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "18424912460022156378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "10408322429232132983", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "5277400567128489977", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "6848989271874647093", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "10085059621136526248", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "12962552332511702682", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "751912075185318190", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "4505008254511324231", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "4191326605459754690", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "9824678205469832038", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "18245935804520236353", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "12309132521191764927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "12843671306854567956", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "8275277322582733101", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "13698389420396031586", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "12949204491386872217", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "7370273921473161914", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "941829593638869991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "16206791915939407806", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "1500571771538985941", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "2095802691829304676", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "17542414935564676110", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "12380856644683171627", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "1451466106918423837", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "8071957466247137919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "11661208196482963286", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "6635217802203685464", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "265124365266629363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "9513032457323269513", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "11814740669468421049", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "5221320470007950766", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "14359530849521980269", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "6181651715051152713", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "1450888744802985214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "2842103889477438816", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "14006248791647711759", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "7072606962946873975", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "3599823735065658574", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "11311859068168414878", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "17525531790109748810", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "16749148369456398030", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "17556238490521153146", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "6067904130482758510", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "1791615587935799399", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "12985650543127289023", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "6714886136800883594", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "220326805056361171", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "6777045876155144709", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "9454512817077883797", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "14011124615649605281", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "994489782629179836", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "4338023436590582323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "1152693503778768433", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "5994204139128667921", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "17243576882981097341", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "5524218746051008792", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "2669822154816760632", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "7179714714302073459", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "13002363400738122017", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "17006095064160484022", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "13733327241591630239", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "11942736969933408358", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7869779894480025247", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5735608687257018419", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "4346591404756288097", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "805131056816361237", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "16910952799476896905", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17512961503976896701", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "4773077837537775324", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12193395770362986433", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "5740745357953479527", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9040145293899470160", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "12755692101476964677", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "12467673564660108244", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "7432142107544210174", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "7232326270078161768", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17238880534517721334", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "7235358742317442134", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7548031489690889629", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "5040095338370816349", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "3816674884393241704", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "13919204232414535363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "15589007878875898942", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "17711453305763476458", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "3501882025888946886", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "1171681987783013074", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "17585206779958265260", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "17046662043776372746", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "9208964785762052001", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "4435224497850514394", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, 
- { "16728762255357411770", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "2968439898708528834", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "11845189428639322474", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "16616945998593626851", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "16490405739040977260", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "4974320417566990034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "6428098122005804378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "17281826959243966826", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "7369903937189508744", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "9111988592015450418", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "9119618606914671839", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "1711220333751274603", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "597650904461183283", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "16888412539296862194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "3350601287664242323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "9702618600245321109", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "17649961873981897621", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "3244675355773468991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "9340159617983543624", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "10570285542015420072", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "15968821946892330559", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "5509395737020858006", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "3806131437010910920", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "4523064418696274869", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "12004552919019936392", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "18313088176414428990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "5649150695527000655", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "14985755375924972050", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "9441060601228656341", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "11421180829679625737", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "15770767768674603174", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "12055647521556218046", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "17908444616754154471", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "5568753513029409478", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "12417253210787537988", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "4046830923427667342", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "8108933468437926367", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "84595904778810418", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "11756650366229979428", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "1617135706549276688", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { 
"3011188207492335920", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "12450814729547235386", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "1157947252370351851", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "5374664689223295796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "18215430801133520364", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "12936220888307335332", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "8746621720912032145", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "12003323477818208825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "17170858505976681742", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "16566128345135114558", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "15690161340392005765", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "60267878504897170", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) }, - { "3501667344669686338", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "8690196189594920365", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "1930929857644673460", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "9671459469252116568", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "3266557807508325807", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "18041177945345031826", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "18267428053198215471", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "18417288692814472127", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "14031009077471784948", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "11666250400445971335", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "1367483816197881270", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "14248239982355212178", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "15820359925623438341", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "15216108478837665623", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "17489680436564779197", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "14117801387057507639", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "12831123539633580270", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "11337525286386930242", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "8431759922045602848", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "9601412379897937608", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "9152433123828445089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "3118602494449249177", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "5159738930501638535", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "5060012838564094182", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "1905758333157310570", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "6870942166356599956", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "18067291256808591467", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "2826762745628486040", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { 
"11841034668170849494", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "3034482898462686729", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "15838113905712517735", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "9407646138658641974", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15636128989267984459", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "8409488188696700816", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5720964268093705079", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "5922142661777925178", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12900949103593247293", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "13483088320871913126", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13960388312976163971", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "1843555260471832708", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15739278428190392018", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "3868149953087814447", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6845814820599174031", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "6203765709597125063", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12871555773123368130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "1237920404306733800", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7669403041163460089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "6791806088355877039", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8561261337239934159", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "9580986168276580598", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 268) }, - { "4708035980731751007", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "13734043898517059207", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 267) }, - { "3177304125602972370", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "15727611564408173858", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "1632416005093914709", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "12253049204822930675", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "15078168059698267650", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "12522495848240087966", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5074273865983613482", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "11936530628363072904", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7870154008378361670", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "3774285301357006334", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "4848143712599565301", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "10316451248440741901", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "733956743303342862", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "16677044352793659175", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7075659071934895087", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "8803037667261582905", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 267) }, - { "12421204749289937399", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "7330202944390548890", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 268) }, - { "10753540518493641553", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "9999425239167488495", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 267) }, - { "14001406016806064079", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "7565867291827884997", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 267) }, - { "5941298590926032148", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "10130171279527667782", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 267) }, - { "17344974951998490453", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "5550969016335082071", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 268) }, - { "3398322619007806698", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "11356842300444410831", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 50) }, - { "2623687018437195679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "14077148976508649021", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "8272823732258536202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "2451712485584835395", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "8057302050645780813", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "7430073011895298582", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "5095827462645341808", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "15129834325410878425", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "9660812093766156608", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "15781622938833984014", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "1089679781525023551", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "6129602738379919488", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "5287076386757143976", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "16076153317792960383", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "2108296560864415762", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "17006655627343469372", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "9404677451270692749", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "1372939511728986224", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "5311718276151327830", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "529543453251381109", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "15591167992985613695", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "15026219694198820614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "8258382025812748961", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "14810839157236175179", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "16117738994809548007", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "659846949368492111", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "5211191663202250117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "13418701036204748812", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "9714764457768279762", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "17310332946322628458", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "15975964562807570772", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "13447028922679236865", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "8337820318779061494", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "18136765667969393174", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "14821616804286068969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "18386376129938707290", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "16609136488331186895", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "1996860183441418841", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "6491244517639245276", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "16312223896859176991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "17833517350994024381", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "4226968857681929488", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "5141753233513623264", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "6860503758000008398", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "16489624657475712467", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "7862815466573236157", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "10679760989906275129", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "852092858392507925", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "6996376303337512293", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "10978173291465325823", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "6670327979947471550", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "11318913630213187720", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "123251351612308092", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "10784073615329190425", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "2261453441277654139", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "2937907409658060025", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "7852144838267007144", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "4408772370026995920", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "15411474884532403722", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "9462315044265139531", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "6419580456182610836", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "12277470820821378855", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "16865879032845300007", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "2862999234347597091", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "15447513376965243034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "14420809655798184553", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 81) }, - { "12954154886708228545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "7575634241190730697", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "2344498602308448450", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "4304041922043496030", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 81) }, - { "10971070835319242371", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "4862529593282936100", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "5312140481706133684", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "15522785615618973614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "17798636687709019154", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "1938086876393565238", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 81) }, - { "11897113890115321056", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "14363654136811880073", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "3928266232090746643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "15882969506682501496", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 83) }, - { "16426179645101678763", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "18174857480705846286", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 81) }, - { "598390166442977699", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "5522698342845820411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "11559360678008060513", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "13184662326021747000", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "16037141448095945650", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "15094664469997373662", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "822162932339827810", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "2597453794298356435", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "15851356529373376076", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "7966454753124154534", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "7311120574972466702", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "16461809076899645037", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "1591199515536783245", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "338716975932676215", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "12165079289914715018", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "348058686961206025", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "17635171685500922207", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "9643408025778914022", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) }, - { "5145853681977610916", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "15155676074658242659", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "5269172622193124300", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "17037462814585846902", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "10100237101982273901", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "15322609677356616580", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "3399406641489305996", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "10187930930336324253", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "17252589865292797082", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "17922279129043570176", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "6323083153920795679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "9277176009071334860", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "4313392430539923574", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "10883341041912056319", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "17310409067211414565", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "863057075064640334", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "9131235538209388787", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "12868739680413736657", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "15901724303713479611", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16944335478353845609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "14025235562200209723", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6556424924189200804", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "14398854364550406668", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6577505360421510286", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "14098811155652990436", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15530407024531326375", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "4466647043226271996", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4121109463284708890", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "7916244303189113815", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12309955719964788034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "10133054058562198093", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6294240435687565243", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "10178145641713631806", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7585184325339753737", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "9222744127882324405", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 369) }, - { "9542325095876448686", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "8155268141318893606", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "8541982562061181756", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "13472577372534605883", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "15980348884716629349", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "9737565171095493297", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "3622409603053918029", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "5657471280535146301", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "17025324057045572535", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "818998169319147148", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "1680468564927032670", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "14466032674083938714", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "73865742350616903", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "13833960927635646899", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "2783577080556699089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "3563872903821081702", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "4387041763614917736", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "9714508918051740792", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "15412447128995361859", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "5965451243366505522", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "13856271274572142709", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "5156033406916344703", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "1018687388655376483", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 267) }, - { "3779229442395464456", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 102) }, - { "13448845356783404653", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "15578456771467281881", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "18302892230881285207", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "9737833587413114584", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "467975197394411990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "994842991399671507", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "778476198101178556", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 267) }, - { "4769003637955328938", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "4914474312076193952", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "4091702228990140696", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "7602222004475424358", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 267) }, - { "14544219140091420262", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "4279062247055842367", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 268) }, - { "6603778920476932267", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "4959403414256988744", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "1425953627379976115", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "13477548641580029772", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 267) }, - { "1963081583851864291", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16393176054374397767", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "11132679855317294753", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16000753982895054944", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "2727175120437582536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "2921118493468368908", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 267) }, - { "11626398907755088688", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "3224352307778512793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "7780140599533242850", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "1270307036687208396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "5911282942658469852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "8809017515482311843", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 10) }, - { "11655994466278963438", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "6981537186704688907", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "7903891232234389925", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "4229105529069729944", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "12796777049340516563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "14289048840489035546", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "4239133538073498792", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "5103094815475470596", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "8560635685184432720", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "16264774056719724826", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "2571882179292959757", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "16758962840329202004", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "4550028191070279999", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "15661322183507404821", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "14650567822254940018", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "3755253206085028904", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "8751016391945753900", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "288853243482418538", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "5047419871737940985", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "8819268903800581706", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "3746573775462003750", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "16286085532892593349", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "16547425454653232058", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "8195881973746570408", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "7712831597869354170", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "17035903590837750750", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "1907439276166837309", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "3036808833459559381", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "17928043901784474130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "14667209474639064623", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "1701609125136907870", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "2140514316203117958", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "9366201112659847392", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "7808544677773370430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "2251029128552117936", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "9529614587861271730", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "16811402686462277562", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "10554266898346470422", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "7817036102984218692", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "6329618009202266591", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "16936366288366370882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) 
}, - { "8025053805734757314", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "534032316469702287", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "3963106895592011725", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "17994361454416813294", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "14902389080201926109", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "3796274347773622633", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "1306339989221885682", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "10900880512948479338", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "287386909600391846", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "17542176922797334839", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "1081962464388501987", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "5831419373611158773", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "3179874645565098825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "14906458674793172507", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "1934379409955686502", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "10178951466584845110", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "12693511427898130707", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "18137106379929135901", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "11619548409913646265", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "13317417676446624018", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "16710651492402564794", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "10967218651864700933", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "5381578460674280089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "13026555349791486777", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "11913020016435860608", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "8260130048649729185", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "14133958262039763609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "5585398540591396124", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "16442107352245114876", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "423221712829930726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "13550435052563656432", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "2440366541074371090", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "8300655194765375060", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "13163146272900339330", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "5406129421969383274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "15118142492742177336", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "10727592780669452048", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "1076005730007872492", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "13699740641705514374", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "13054405729329143152", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "13503608041359512", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "14385185911482960528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "11215217005872946038", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "4099859307693687554", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "4408600136502382976", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "3037042229494600258", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "1155389358857780776", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "11461581290174106570", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "16896833230469488924", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "11469881811044037340", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "3003526572122876385", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 85) }, - { "14251848023416168295", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "17248756229500447131", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "929378940515745198", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "12962558681443556219", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "4481903208484313806", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "13558618754911056302", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "11455518069358829249", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "15890473622821659630", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "6942622405269419082", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "13890118723041457532", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "11292995457386147494", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "5077214229434392730", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "17774424004510360936", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "10412588668458621135", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "10771803503544737080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "142650579335909103", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "14116800584981026541", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "12995903177757437362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "6143200133853000387", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "11893541520830049036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) }, - { "6310724136390087834", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "6391201577234440562", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "12058759356433220258", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "17152614235879767116", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "2111669705686676421", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "7333511810266504718", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "7397341452130124383", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "2939605281692583169", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "1644335606100150388", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "2394023805427701338", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "12531580106484042446", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "15586047342916704364", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "15779837958180258409", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "14123081378489325832", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "7818381040882768404", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "12510951219501865365", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "6156831095718536092", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "3568514382399560386", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "12065769091972094756", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "5321698540631249776", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "378801963103874857", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "2149582237161177965", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "2770397466252831892", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 159) }, - { "3039528482572243879", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "12577421746159122264", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "13553263424160050064", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "4021558014531645922", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "59356084516953804", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "1170380397764345558", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13094402291968806996", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "6713985030102340818", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8354579049246302728", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "13815395589135469450", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13558656230312558247", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "11666226259183201584", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11451740938287179908", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "273242667845386507", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16587061389996963349", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "7119182041840303390", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16292848987976256449", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "16437124655147660375", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2495655464941634884", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "10294610483561043024", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "14403132596827435096", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "85050336704401597", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4450409744922989123", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "15528692642731712121", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16661843849495077745", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "852015206582470545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9813748068195103720", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "10544034939133448916", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "226601879759378771", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "16432425079146486467", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7274179284676568361", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "5184121466994451498", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3538679039078582272", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "9920155432685318259", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8859895010324601937", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "14026537760442360645", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "14349625788399542568", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "15065019229949449623", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "14115742296883450319", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16748662918272106932", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "2273992727647793692", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "3190494353583341446", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "8837721075413149240", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "2817919813339364130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "14263790627243107300", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "12866217660635921034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "290134020607738418", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "17207560805775399864", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "5245526691775741296", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "4933831571091731212", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "3872151366780051246", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "3541538046227217664", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "16182470664818268848", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "8519354640245415816", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "6222595759158615206", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "7201521533301617290", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "15497797842820949408", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "3219408878901707426", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "2188101366183302888", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "14079654309452583394", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "9250410390663336388", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 267) }, - { "8787438180071123604", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "11799179287124317845", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 268) }, - { "14206076551739831333", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "9468684953949274635", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "8543619733732987550", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "14159596290442764023", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "4378422094110940766", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "8505040075968411726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "10914921540144371519", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "3515437649977762166", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "18035673326929466074", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "9390478179772073718", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "6254141935545262078", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "5955575949957198434", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "5600128039063009632", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "14114380593731243715", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 10) }, - { "10728212277329722684", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "877436308867220589", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18375125668176498051", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6767245864232675168", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9287404618748313247", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8728178019712933221", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "18251360413872841969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "18271689282126907793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 284) }, - { "954796765467489259", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 284) }, - { "13597240991532942069", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "5079055505117153635", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "4135003545872878882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "11883485911218628865", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 284) }, - { "2242915551775617989", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "10556089809203693400", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 284) }, - { "3727142736386026852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "1622880009460832832", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "4437258459981739942", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "14691372262153587653", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "12181607120522804433", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "3159681096461848644", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 287) }, - { "6729785110495533200", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 286) }, - { "15322019609805777935", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "7024495439434892956", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "10416622008071151225", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "5796500397424307442", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "15702382940521972117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 284) }, - { "6093575518270471235", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "5805383505505929391", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "1801731858063091191", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "1559798212423183813", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "5594180958505308003", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "4766071144928072260", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "8650948093564284852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "3883845471211207871", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "4366168099274266975", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "578703329577922869", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "16863960779539003201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "15450609897480659306", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "8203550467004532364", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "7431849514656037251", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "14484890926084856480", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "7777333052643961206", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "4424217045094988504", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "7994179151788368291", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "15192024816519005250", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "4747159205186229582", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 93) }, - { "5485971317082563152", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "18128162750557822655", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "12421707187947291166", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "792684262493086891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "941626985322260281", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "11868551452004726281", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "14352303529756685990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "10702234389482091891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "3895088069642140043", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "5334566325056222430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "8306337702797456793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "15720507574336564201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "3277243911383750280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "18150429561058646714", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "11169292427557543138", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "13933912937625580405", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "8792010676469476740", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13190888313721073437", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "9477562342190423343", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "1202292109713947702", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8640150341228170279", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "12757611260347801001", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "7183578232279711009", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8984436655107983227", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16397733032387984819", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16364494883229084045", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 314) }, - { "11800783548769329949", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "16065744898134487748", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "15800447082078291243", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "10090036431487700311", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14045927407431718832", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2162882863309264684", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16579057939215877904", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 314) }, - { "3988024997010367546", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "2066731703492755469", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "13781423818051299677", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5211831143687501130", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6863331059471727622", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6403698142681887543", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7481256533438761028", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "14091610802555875119", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12024143207855886580", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10170577772376890221", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "721174714308243785", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15809639778580769565", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16667887002111125871", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12790570304622911607", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 185) }, - { "8567667881970262923", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "10576856554114055028", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2777318471329665162", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "937159502066696999", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "11087413527078604815", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18186615266760475767", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "3833510944499257797", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "1218323229202187514", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7683334381958571864", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16773645387243701837", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16958329690837977102", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9452470718398027950", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 185) }, - { "16511393582666965704", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "3216793152416217495", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18416908414174464784", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5498839261395459224", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12198263593657033426", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "10014448860206587805", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13330734840729670622", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12676167240795292217", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4850497746076450913", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10016815108730511683", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17948637243158994878", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12259844988981080505", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "15078590909693331731", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11988285441493553006", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13851240591038949807", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "16588325081458426169", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8642107585829380438", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6219075471508685758", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10546430708947911124", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 267) }, - { "2613462626256090659", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "8295126647635181949", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14213516751025324346", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16509472637458153234", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 166) }, - { "16589607587365212240", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6988674007771237080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "3448477246688526708", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "8507854696766492454", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "8906588133431586825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "654122557966242717", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "10196332102593337214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15831600396403741571", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "17808913959977434594", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "15548971488532746290", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 188) }, - { "13468713306678453952", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "13613399861925108148", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 44) }, - { "17802514063213000148", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "13093429681061786539", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "12247991248100147706", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "14491949194619001237", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 165) }, - { "7590767013583950613", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "13210604117940125947", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 120) }, - { "4670443882075998209", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "2857337999074313592", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "16036386660666696362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "755414184406250882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 63) }, 
- { "12190841837604350271", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "10292243973236220688", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "17793292063552633023", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "7605139219344415117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "787363431787954804", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 120) }, - { "7000486794832106857", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "13608239208821071914", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "17281202179589913619", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "16985912104363932350", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 62) }, - { "14744368497944610864", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "3737552767159920174", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 120) }, - { "3792945601873900927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "1364546124782880196", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "3689722043202617487", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "2632535010129224704", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "10968768803038046390", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "5353552956675518468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "7866128397931438774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 85) }, - { "18233660940545931789", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "11670430946096342056", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "2627779045483019709", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "11066913713501760080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2552187713769926425", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 41) }, - { "654821507679356726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "7606728651572102823", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "7549378486471456156", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "15410074937424854348", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "15114370307779942381", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2040762223425679479", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "4803370483104261655", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10415046594066474634", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "3441335188113424896", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "9277610800970567810", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17179609670678746034", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8251544171504007740", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1353170363915443814", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14540578324750869319", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13471752029049484143", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9062774198518904260", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17917978116807564183", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3017411837779243878", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12992061224471212714", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13161997040644039778", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "11724225282274130518", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "12822126914959112382", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "9423958333298993923", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "7307271009495440764", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "17746215841755337461", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "3976736548270395981", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "1192279884248226739", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "5538883245745495145", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "1173986078589662704", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "11031358859656806724", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "4238885454989272754", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "8943913562339525413", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "6931953332823066530", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "7799984350284425885", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "14204609663091442879", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "9091110033424983286", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "15829095120243431195", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "3239033622277917802", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "7578177053220150569", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "1089944493540593798", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "15529757761327002288", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "18082422341304348326", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "17219920118109316867", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "12026482841341343242", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "3070859615622845671", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "1778345646142852816", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "15188570678726970998", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "4750513665628842598", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "3372770576629463160", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 268) }, - { "2983038203471784211", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 284) }, - { "6673966852801136416", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 268) }, - { "8792202318168046223", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 220) }, - { "16441830491664937048", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 220) }, - { "1419073145594317633", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "17525564757769958678", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 267) }, - { "13468081302022888489", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15914058104244750036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 270) }, - { "13760645810144930270", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "5963901433137582265", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "14668725050395069435", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "12112853999307505628", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 110) }, - { "4161612746310931789", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3388752887767453958", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14046990030104971367", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 162) }, - { "16230621843665445228", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "9274179337770060652", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 280) }, - { "5115134711994944288", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "13898821685774165645", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "3007637520820789085", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 109) }, - { "16294825599850364701", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "14681717813022425567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "4915831715914920982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 240) }, - { "12894240573737168362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "5448537627319798272", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 148) }, - { "14389915292223442327", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "14274685812676150168", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "7732899312577293959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "11956435900037329302", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "9263063714383940562", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "5824801192141531089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "5608133987357542077", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 124) }, - { "15392077168521832549", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "16446533347502650316", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "14762599606783897222", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "709835724029986012", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "1572991986657256775", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "7398196853452900099", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "8140094412609934765", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "2659031931257084418", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 124) }, - { "4640028527711211109", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "18172711677056449158", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "5183231560876991543", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "6821855018718422278", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "13237050834496100264", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "7164580481046523192", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 123) }, - { "2490155559809645659", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "15430549683839591544", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "4553409514380460123", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "3041752019114501584", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "4161001033681779582", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "4764776977138392550", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "6882621854468565774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "8881135571874888085", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "14038261392627717712", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "628191607060767879", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 85) }, - { "3511588484597779204", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "6904130543085920483", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 41) }, - { "7924408980408826942", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "9416186718345824095", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "14719421757340260468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 81) }, - { "11936419502418995274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 85) }, - { "16601702334097258697", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "4800587664660105589", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12501619443242354860", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7104309382120208659", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2321148334382088982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "4914435717288687793", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4104562704039821482", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13308187548669026714", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "3603187029740446600", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7338229552985076723", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2161052921317193579", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6104380778870471127", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13710319251108632115", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "8096131027165540886", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11823205954749139338", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13403161389559730", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "998876398773540321", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9280431727790048190", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "1152691534728260611", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9101903304994333336", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "142270860894725256", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "621915374938805401", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "15746620724134970969", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "503369896500284129", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "7585785802379042424", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10486348549691280032", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "5758133252959371492", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "15117880293418979489", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { 
"9120377367517042357", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4278280309700908015", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "9144487908815767824", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17408275657360833363", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "11820789223587555410", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9232653317479846765", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "18184621367843960190", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "15059549186302099880", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "16765994345605657100", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9869959062341950047", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "14343008518525689150", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "3202085450628781999", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "17224104246148265328", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7322472892320910654", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "12480527132372884168", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "1008476023750261156", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12589440296742583335", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12604104383683210104", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "12782932626966309185", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12946540633035976364", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "18221867262301937903", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10171373375072694210", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17791024851737594885", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "959260710517842876", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "16988275131627316108", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15048584393463312977", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17381516856910544374", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "5336120047683197088", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15897477855246170861", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9780938731831129283", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1473214668483422172", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17515573322312447679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "18356980026934328781", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 284) }, - { "18077281411861416889", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 284) }, - { "2543041530639980505", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 281) }, - { "16370218798911151331", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 280) }, - { "17316626950179740845", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "10414903047695486119", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 143) }, - { "2809950092498355574", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 138) }, - { "12011982029561277581", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 284) }, - { "11267742746905371769", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 284) }, - { "12534001599784153836", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 281) }, - { "1882052795393187384", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 281) }, - { "419783127503173016", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "14211903923555028634", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 138) }, - { "10892706534058849825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 286) }, - { "2345023488044002149", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 282) }, - { "5754844816339228920", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 280) }, - { "17015791782274123780", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 280) }, - { "3706994659266083979", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "13324157125165576832", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 138) }, - { "12014527187730671229", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "5170245731599664670", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "6854611304056079417", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "1954052357826969119", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "17824431042110985323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "3603706453982734995", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "11992353959766718397", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "15163327502374403643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "16758697697363920520", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "10930115765550856328", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "14418429155823196539", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) }, - { "1628593159980574595", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 172) }, - { "15675968397825708285", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "9594594523961285945", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "6634330132674952638", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "8434794604559592624", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "3150231129728961455", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "12545558125736154584", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "15485701086886851362", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "18005721959893562716", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "490233152678323691", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "4073467095502162430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "5801429077171542466", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "14841539539334726292", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "9404953235624894187", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "17995371099806008878", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "8961138963663532667", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "425744529089575241", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "1316444335300814745", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "761169277744593430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "3325727286860556323", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "2526832080529662683", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "15470013032930986062", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "12255528292506999241", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "13119479079474639169", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "12813978452097969536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "4991419288164762786", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "18210370419559876426", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "1616603916015535857", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "9748307611165615848", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11147573971701279689", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "10865695385270390803", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 270) }, - { "11999246609107242706", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4118073384938355655", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "12134858519320245809", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "2930898141522848681", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4190912926126844643", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "2929190644951986399", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1126499865206906037", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 114) }, - { "13483175684542464385", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 199) }, - { "1920070013712913772", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 199) }, - { "10787747981914307179", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "7715649642603303319", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "5581428998642936688", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "7532088618116521936", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18126685473408206840", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 114) }, - { "2878824076934639346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 199) }, - { "6548949901446632697", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13609660900720370993", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "883436333317162926", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16293465561256937726", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4759671642533786591", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "4903592553439092472", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 199) }, - { "2581414750854621875", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 171) }, - { "11627532066884923848", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17983556812075120553", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9099720270958987421", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "8106738346643994005", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2554991397391195611", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "13121297281694293907", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "8220168481755031959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "14502856487639608696", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { 
"16871004845988227014", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12015336418727455195", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "1984152634309440563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "14312549767853703411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "403634422724914329", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "10751536136794650334", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "10135458965276110244", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "2008424849669196225", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13735180250757239202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "12351866693978844266", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "6788311046557489996", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "14578867494693499627", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "11158789938857558596", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9616636708366808604", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 107) }, - { "11069983292783104310", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "708747442142592697", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "2780423409483867058", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "3160543867929843861", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "11305232900158601613", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12339692995143159283", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "9316082753126682958", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15991460001131903561", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17647962002015093887", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "4897448054295474302", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14184895905338394239", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15112599407339712681", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10486000767830001094", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "14999920879568237166", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "14799579913711096584", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "6450532136308941035", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "14962768577232034246", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "1452597292381229708", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "7104756264011682902", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "7744787957569714828", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "13503688893307029975", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "9133263538092913983", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "1383899865465106141", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "11829442945690098558", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "12394049027081208902", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "12159582810513550491", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "17738299860390552088", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "797387385159110695", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "8757900457181374694", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "6048964584602891448", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "17882819773586674851", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "17829148383265978140", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "14711697456265712456", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "724953082687879224", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "805221045541170643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "8241070786700614317", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "9191832520273617003", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "12408889192918919210", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "4885944395876887711", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "2651385050387738902", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "6303682540621797774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "905780459938651623", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "4476928353532757380", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "13681462437496627948", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "17243648226968859637", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "11192356850081328892", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "9323825370872655346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "10000618285883395700", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "6418327009347170687", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "8528750110601691390", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "8061914949376516780", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "12992194515157698316", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "17870874477143985774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "16234606052818596502", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "9148379585489720669", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "9270950131920019932", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "17001502418583498926", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "11163107409437069532", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 80) }, - { "11465965972527519631", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2534408579674556441", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "18109284647478027063", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9849272539053219052", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "17382660912493284320", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17764033613416389758", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18431306649860116380", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3699344686791530101", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14151747022287993729", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "826850797666395121", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13486084204140096478", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2114599010013594942", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13251091004269229867", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5240706676373148280", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17490188677223978661", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17854208422879910606", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8767817856303586064", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "10672380526821947133", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 342) }, - { "10730222715353420212", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "16683169947375504066", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "2964705957088952872", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "14885031472057965707", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "11308583200952256245", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 336) }, - { "7208008921815475393", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7113777272518482528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "6334639534663495263", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "10151922632636937118", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "11560634267092054110", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "15914107501176673997", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "18218755616248669884", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "9987415314864002460", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "7667898603371717971", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "4403753181729432604", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "1040030752340209480", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "760687670112194844", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "9803492989444302959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "216603198215625772", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "10899110544832584656", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "14447191095937730964", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "11130439225010714550", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "4325081100430903742", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 227) }, - { "4216958486055161753", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4400247897123856252", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "2294800960010879540", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "5195511638783481084", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 267) }, - { "9545968464906009869", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "12932635875905153141", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 220) }, - { "16925721317097534009", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "4398371999113956082", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16347412180100581330", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "7877332346656934022", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 289) }, - { "6323026044750482867", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "9761573038170759563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "12098146032672599222", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 199) }, - { "1403617451623027879", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 282) }, - { "9058996149754556268", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 282) }, - { "5864250949922222051", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 287) }, - { "15847413004526420496", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 286) }, - { "3199841714087553410", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "4957638663977636791", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9437794960375526230", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "9475130054420979752", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "13312514874803986753", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 304) }, - { "15997754881872769378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "1941341635794709702", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "10157866834809927320", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "12308359047798183133", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "2986189945936592561", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "6928835003016610382", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "10084794570892043447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "15417738436777481469", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "18377298651236993830", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7354234812009979811", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "8656468860180713379", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 93) }, - { "14472187692485966933", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "397770940444464146", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "14258499419905714808", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "17599396373608265826", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "12935563359569230797", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "4892959859293355837", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "2802810524370514276", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "10290107543739998181", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "11587239927319376658", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9076758673133996959", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10432365444137108781", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 314) }, - { "13092232276822302626", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "14896875712028630045", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "3236003754884728510", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 286) }, - { "12181889163404078773", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "4856470441452830056", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10022487076451608714", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14811603003184578943", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11565861421381730304", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16577611471466452776", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14616969385577243225", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "17921973525603585874", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4617809377006148936", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "12641170321047008726", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5940337324384948573", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5738835498104275267", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "3499106702307464480", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6942016672941874829", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2173720698351153121", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17201365233492366678", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "2877521658768725103", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7689320135952025041", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12031180482028822765", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4717620775314557374", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 268) }, - { "13800760323805415740", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 267) }, - { "946479876892100082", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "5039037192630609823", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "13839116996827687373", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17037416417174266088", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2321767794934000238", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "14907097142953816744", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2525260242689556544", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "13328449155966085543", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "11856266545854830143", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15993427814066246646", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2100891581797371600", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 370) }, - { "12242618640422208652", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6133592828563353516", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 199) }, - { "18232278892738147217", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 283) }, - { "11992625045241269569", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 289) }, - { "12601126285773042005", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 289) }, - { "7457899998356343871", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "6343888265369366589", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 199) }, - { "10791067159964399241", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11327097771110264965", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "5245308722062496788", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 296) }, - { "10792503079194374004", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "4818231379191523896", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "2198278382394812839", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "3800011935243649447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "9631545863582097486", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 284) }, - { "1779941298820543013", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "3621930417735246405", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "14435120971846098308", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "2893564501191050837", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "8108843303778211282", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "3682813162987778705", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "15494543914974994991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "7565221050911842393", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "5629670679897666607", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "11754316727756881612", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "10990741293315393791", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "17024388383581997032", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "10302338806536775954", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "7915318733663535312", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "13702692566238948173", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "2909728331855309274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "13071545223094862275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "9631481972809246378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13540002981450186147", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "7076937538747704750", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "2041212737963974230", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5308128387928804050", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "8619526128410675593", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "4792351255949877935", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17759505449240263390", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "9584652777232392944", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "9999955037598579164", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "15961487889420208188", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 197) }, - { "541817615957967731", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "13853630125050609175", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "4137755981477177003", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "16949056117405140365", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "16014822406751503249", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "7700321970687976931", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "7056293586529818253", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "3814584042139408454", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 313) }, - { "16992405636352406660", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17442105631503326136", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9606639214735570069", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7940369586324090841", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 270) }, - { "8444259010311137762", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 269) }, - { "15489746763312425915", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6800893510381991731", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "4156384238797998294", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 185) }, - { "11645116728396933125", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10912495395422146386", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "875400109066360897", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16475247464223458061", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12700372241799686527", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11640225461345567929", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "13183380647506951324", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5242271874488296527", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9488453013746383896", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9726913113016874092", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "15979956159651515122", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "9947449295659685973", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "14230493618724018658", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 227) }, - { "1704404203639481753", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10404725818204494388", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9767294641786972359", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 227) }, - { "4282668574670785584", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18043340998699622388", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "7148542290597073512", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9040046051053703359", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1077773457856682663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 143) }, - { "4716188972902735458", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "17343050785312683560", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "5687802882700097624", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "3524531620118359828", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "5688478347124565305", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 284) }, - { "5504757952698692953", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "13800387305792597325", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "6574971185849732667", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 299) }, - { "10573920781439771673", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 143) }, - { "4992668316921598993", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 301) }, - { "15778834188130183853", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { 
"3062101811226530720", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "428659495445490820", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "956022649859563080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "13410850301164057911", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) }, - { "17423645390621980919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 288) }, - { "7802311886554362782", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "1172103288112689821", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "17353894529222574441", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "16431857516454692096", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "9100044555742394133", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 170) }, - { "13115589642140732066", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "16190949264253468961", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "7026575758396092435", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "16761856644242716357", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 167) }, - { "6341197991729122563", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "17087740929472936216", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "10795104632256101599", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "13327653786981478088", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "1096671695414716274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "10774528268153772208", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "9525853014023664813", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "10632020369698615114", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "3234107167862677811", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "8708643228914766202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "12415368596357091523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "1028160614515220430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "5927467766675317093", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "17742192339816511494", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 159) }, - { "11931568365395665142", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "731825454731954517", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15989894214714907271", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 159) }, - { "13478984039708550410", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "15773157615731010456", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16772854836230971016", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "2934519615045138808", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 159) }, - { "4880150897829846031", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17889864541794448203", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "11768117585574496387", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17906607354577138153", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 198) }, - { "18270587701371596297", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 199) }, - { "18142462471803295391", 
std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4815047491742617397", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "4513063773753763458", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "2984726467649419856", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "11795826875463204296", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "15675903059949404837", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "15817443774186015593", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "14558572801374416278", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15555083739490354527", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "3854114166348568039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "3216877571075556066", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "739676584505475609", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "8303211644727914658", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12908594497114706897", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9918371346247634545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "10893432143734884603", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "5339985303398206057", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "5941852872160795604", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 108) }, - { "17634966178519099371", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "18299254635579957284", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13357365044448426880", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "18135307303959376082", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14764715930784496165", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 227) }, - { "10979362792894404338", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 230) }, - { "15006321421735686121", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "12370729327673204804", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "10722677916294015259", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "13454265023861566476", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "7995820969034996638", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "5275016494706355806", std::make_tuple("convolution_gpu_bfyx_direct_10_12_16", -1) }, - { "10947686124973711385", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "3349519148124496343", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "4003433148846544263", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "14973431782875808802", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11948858355027908365", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "17951403431757222177", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "6586872365879203192", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "11718418772370938734", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "989564341557094953", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 87) }, - { "6942049339361951275", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "14555883089089918919", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "14808895254077106198", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13830605041347009953", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "11955992313739654625", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "16921026268702574340", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "15320845027635796583", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4014667229872705228", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "2438374917504708831", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12391792381149655331", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12864558900883069118", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "7209217811135076623", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3272017687600371031", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16067605128297748820", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "14150012830816329527", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16218339663410630711", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2089730611490367290", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "8907982643256296667", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "804195263636995800", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11528417522960871233", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "15378025640603637387", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12860222041026638681", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11597391933877736800", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5042176052323856983", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "17010172246526353957", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "938848188161536107", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12725647706191463348", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12553441041059632729", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "12782191856884962803", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15824189967727245909", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16027853590391209100", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5352061583962489055", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "2294318010381635693", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11055049031355432623", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2349007644347065353", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 111) }, - { "6146876760962332928", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17434429579652310107", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9447458159095730492", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "8655883535274781128", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7272538316511343863", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 225) }, - { "17564338309805484464", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 107) }, - { "7881187047171099732", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15579919505002150556", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "11583017348580874022", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 107) }, - { 
"17915846724151945664", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "5319668297345215520", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "17208186152576814861", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "3633858263279042265", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "13853056718266488510", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "14759179293743468995", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "16995873636564597028", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "9438739171104456179", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 224) }, - { "14429081455612806819", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "9819596940685093690", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "9426665763007611385", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) }, - { "794499287296495726", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4980217316169616839", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16105073808368936420", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "9530116228032101908", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "8527193566719173253", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16566214123371867456", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 191) }, - { "1470933384474984858", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10706267011822108376", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16081386644309102158", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3571959174116404960", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "12566041126392848976", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "7603872175048237237", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "18235209540858013173", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "14316077757957132678", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "10816637153861630723", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "9175450649281374948", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 106) }, - { "17370158297470557151", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12051595062513871723", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "2967481531952454828", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 108) }, - { "12085348936192462321", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11951606039079763598", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "8769060267707904998", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17104611871050967957", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "2103882464623009432", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "2659712601063515059", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "9759380701896779097", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 346) }, - { "13842309033760176194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "2418288192668085805", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14994322266840011040", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16402312692470500253", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16955653765071712611", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "17830290099875088207", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "603883331897298932", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "9731370183088819573", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "2296581485980163665", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "15133468875250992696", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "12972798847556569913", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "17446505012657609153", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "7223801044761006523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "16511749893955141055", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "9485825829394109934", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "8130920994920685157", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "3573490922300056520", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "5479761740065152589", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9480653639044390919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "8739347545059610410", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13459514533473657102", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "7824524940405130010", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17796310681498690253", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14823616678465136590", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "13816104794723484993", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "846088275031979661", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "18125732229366977468", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "8464582977975377118", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "6290317420155851465", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "12696412964119109465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "4994591211723226974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "1036010477232750453", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "13786357802945430475", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "1003101267609305257", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "14991602704357959545", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "7840653268996892538", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15488340031228619748", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "5003718302026277632", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "7693459946348737411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "10536316961655703500", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 196) }, - { "10765280349477640969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "7447163906170805189", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 195) }, - { "9319254979377483709", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7843508201826629532", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16395067736440127496", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13820498543284008286", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12071914115316550349", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "12727541507197887360", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17364712285968437405", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16120988958246503683", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7375461241315602473", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13282951481330978659", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "6181308879301978465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "15488550074426713959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "4062706195708729345", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "11604111639041106489", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 269) }, - { "10512507780534402341", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2128612971571865547", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "8594644182487917002", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "15881381297320383917", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "6040286126398028933", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "13926122593957480821", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "6213386558868267629", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4456004887590847716", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9642229389394495047", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "18259656768460999562", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "4983880246908724272", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 273) }, - { "7881579844586294503", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "5331173521406046122", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "3285520504090196295", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "7143510787416483146", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "18103534417093702556", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "9328223957245552723", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11706446082856895571", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12625112690264223217", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2114232149447438823", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13883044928774243663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "17636500109629107732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "6192955702438301372", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "13970935346154374605", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "9692654253261175490", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "2116913943188857359", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "12802517759474139810", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "13611054146745413536", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13814086981499638596", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "3106922888635965020", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) }, - { "10509933181132310969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) }, - { "17318287523550546026", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "11806402239500046867", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "12353956380178079089", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "875296362957469305", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "14912119584313592912", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12494969618927201911", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "6344802942015047824", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "1692411934657235774", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "615341695338735013", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "10601835610089648700", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "13262672660175739705", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "16522364268583242080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 154) }, - { "18253784177599134876", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 189) }, - { "12319073009094248232", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 223) }, - { "9954050478761346921", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 223) }, - { "4640696923527766618", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "1436052878894538927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "16011429608661242565", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "4381329435655511217", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "13972357557211413688", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "13104509059416300615", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "10090923790949378407", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3429844423226609965", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "706049518431331645", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17193614571243427089", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3621424752591567930", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11066930104187448422", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "209732971447020989", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16044646335477470657", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 346) }, - { "2172121470071868949", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3392693938352572136", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5495063314176654751", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14553856088069405595", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "4967444801764057340", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "12160764253455777655", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "17723621158215826108", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "2171768477223405739", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12672995204641007004", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 232) }, - { "5622089373755094139", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2129726780118554358", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "4160656836528944651", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11052732052072367261", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "18432787283148809023", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "16172528828198474326", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 232) }, - { "16327433707667075261", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2797723586312707948", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 354) }, - { "8451212914744825089", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7025975403069487257", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 201) }, - { "8913950860101596091", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "15308578014507211237", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "13132804928635689780", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "4465781406991476376", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 287) }, - { "16266491618150971928", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) }, - { "181006047500375768", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 286) }, - { "18140951659547259039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "272730229972987861", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "14898892437285105327", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 354) }, - { "17252449599613270108", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 282) }, - { "13436376034548670107", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "13787436604877398090", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 286) }, - { "8873614802459592665", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "13663893159182636270", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "1361159591875955678", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "5912303851874077576", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "16245760498096322525", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "9928406318940388716", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 354) }, - { "3036512701943687724", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "5334291640387922287", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "3002986032379998259", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 149) }, - { "16469788155263456039", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8709632541892447149", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9524303276541517389", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9354818521586974021", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16781127329510211966", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6351572488552853754", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "907036267078333137", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11855070245618904113", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "4544242784357021697", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18218631037214746168", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - 
{ "178353385245384751", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17658152048177750315", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "11636129433022017868", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "2622434279674583815", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14335074487552883436", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11175955260573469979", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "2732519635571994212", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 238) }, - { "13893789954946953427", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "4355933224673863178", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18037918102910297531", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 284) }, - { "16071723603031305677", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "1697248235682953135", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 362) }, - { "7843498978148810586", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 237) }, - { "6767159196241633301", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "5097818987523855112", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6623182990939010641", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6711878663358611849", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "8671491767142900139", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "12164298124869114517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "17089801601582809764", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "75742659105146536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "4652136280940317116", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "9751582946441607796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "16706244336960642883", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "12581879452540858313", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "17443356777503458523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 282) }, - { "939718260623752240", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 280) }, - { "14131851237755716991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "7474639594232203854", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "14152716242882609401", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "7998930863626763670", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 336) }, - { "10323345824599612614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 336) }, - { "30229601562833524", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17788367809717898285", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "1509728225855233852", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13139625572508441980", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "16491532291908469567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "6355395905401306995", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2096779676054335057", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4217179485243909459", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17101789600628162503", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "6139574161497189424", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "16559140502701231107", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "11459784003592366395", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "7869916853707978306", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - //{ "3889519976910355277", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - //{ "12081835728078383819", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - //{ "14923692894655929923", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "12794369485239257709", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13338594271376045657", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "677249604491773387", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2668729552208169959", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13011676362747785816", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4678607855896512523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "4356817283284529593", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1885075753696445410", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 110) }, - { "17806712457019493207", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "11862259122805366807", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "15201438563802430490", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "8132521728369930959", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16108573960501496757", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11086699387784339943", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 119) }, - { "4013707396889204359", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "11850332373794932468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 215) }, - { "14763982961176216679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 93) }, - { "8207349115037232863", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 93) }, - { "3273748387141431306", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "580936360000782237", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "10682918518101379579", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "13178480813522103091", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "17109520309574369561", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13754408679115174221", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "16717713360264747483", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1045854873741563331", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16767392067294252396", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6114241186364821679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "11241838709529552265", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "15192230303376521834", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "5374969798377773063", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "592245952014430043", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "3114869763557037270", 
std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "11254635684957519432", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "16816222375242496370", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "12809199739984715013", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "5040730152867713388", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "10429613013253088132", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15451919862187018297", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "7546586420552408243", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "14487682847898298214", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "3106710091841093202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "6458124573210430792", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "9182897385081081193", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14462438074931673266", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "18133334552107213128", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "38736266675995457", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "13654816209891478730", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) }, - { "6263019986730305851", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 110) }, - { "12929981792125924963", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "3138374672801504481", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "4465701487417893814", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12977678792503377525", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "10879218241103462088", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2221145174704245189", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4635570915184713874", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16075006181495932250", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3863816884636503247", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5440983284868981549", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15428591250165788477", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "16567638487719493784", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "18059267466971880386", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 328) }, - { "10808909442136736629", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "5682190700442712936", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "712165731154577189", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "7469127846325904854", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "5926747396493954633", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "3477539135137665170", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16235115911229280717", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 269) }, - { "17009318615658405230", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "9421643783312790618", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "2294026590516781945", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 190) }, - { "2940027113687311893", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6090625728451718945", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "5643908654122573882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 190) }, - { "9065894438656900887", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "11185156002426041243", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "14670068483447729857", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "4623542918584461522", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "1143214652021653634", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "1434535531617424039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "17025268985366223779", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11507538232733291666", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6149673627320838019", std::make_tuple("fully_connected_gpu_bs_f_bsv16_b1", -1) }, - { "16243196137456624852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "8059328623525062913", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3662747857062156477", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "314054598858070952", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14122213471825630433", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14985236276429954162", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3265415000818832667", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "856877003890134554", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "14805540705424073865", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3788462090984291082", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "2715447739580688669", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "7171904645566467208", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "10308431308942416781", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8712136292276123857", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "8700574100180128776", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "17147293671640396193", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "16474284418841532356", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12461575861709234385", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "192209423643075326", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "15490478608105402679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "3491333679577961640", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8176012042686275874", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 109) }, - { "4282198629458668761", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "689445825453914111", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "969746749329671447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "16833026567865627676", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "13046322179198317310", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "6902644989079870993", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "10987953316324712538", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) 
}, - { "12515465135362865565", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "10049571207493913006", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "3926585856863002495", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "11275109735493317886", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "12238674883388043717", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "101401523793806394", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11007944497812650617", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "3240102173773280414", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "14883438809987378616", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "13320675959188615441", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11975047184326016230", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2608363732937932266", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15943141845766932879", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "15486917753097743853", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "8317673282128335201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "10635659193402005820", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "11450378244355788918", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "2625969259447793593", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12207503176295152756", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4625107584562815965", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "1997392406402548974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "2524029454785583409", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "4615708568396290002", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "5349415632630235233", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16108759090923335184", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11756881293845417212", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "17839839336294937155", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4703107905652287491", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18180820925685532104", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 191) }, - { "3835286851569826052", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7807983899017500046", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "10294185397756053636", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "5519535335798045279", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "8701248964531180496", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "291868903926685441", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "15239764240622554314", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "15963038745470172423", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "11428599290755097395", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "3180320769716158201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "583303098958523195", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 269) }, - { "3509487327001107638", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2649192407401044065", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7706714181281908433", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15914342421266687768", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1497560475414454618", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13485300684443803732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "14571022040013651253", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2832268621630415376", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "9383182168277796969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "16487774205195979355", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "2226745622763268469", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "13809330759308309353", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 228) }, - { "11634932044447867039", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "318377908569897093", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7353563160591978243", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "2582625260054352916", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5609922876429907954", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12557015880639217508", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "11528310408333718862", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "1471837664358450291", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7351401242363888463", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "953306082374100275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "15759530339367380982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "13300022131572486202", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15689502054035168040", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "16969463538496570528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "10237524128771958432", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7969848911698660033", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "7130694811424715594", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "8578747191812631883", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "5197105253412476591", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3120553928584920777", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "4750894407873652809", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "12667014405537239093", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "13644681270630373984", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 364) }, - { "15602218079503030465", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3950738240651133849", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "9101334153142718004", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15695415285791951018", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15493488989417521388", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "3391032227732782982", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8951040603784899163", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "13804221028705631415", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1351033666248868977", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "11330591026581463934", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6142707387281700290", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16117448559783537844", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "4531222427159927606", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3116068331849795558", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14389719202147508599", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "17053671692908867872", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "17025182465337728023", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "15035800097152337587", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16770615142634470903", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "9378269524012289175", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6727930402459775131", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16362857896338778056", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 368) }, - { "7187734276051878356", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13253775441326432265", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "14733510474010040334", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3336303478756453360", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16352331970945217438", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "13484950419220835364", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "4674416595144505741", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14559308665571750465", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4542143431130171516", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "13189392239349392492", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "7009735776703529573", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "4220826666482500445", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "14792528369891965810", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "15287650965861631130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "10308175009371219583", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 166) }, - { "2903605246599054308", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "9213563311267466388", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "5019077257951332016", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "2497756607567197523", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "9285566577169147378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3432296808755992670", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "7688176479120305539", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 163) }, - { "8818070832398055086", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "8787816339967963727", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 120) }, - { "863952266514375915", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "5835634465164771899", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { 
"15101680837342453931", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 46) }, - { "1116274074896622552", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "12790788016297794214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "13538051178827008933", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 86) }, - { "16403423801823379909", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 85) }, - { "3723613341885592267", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) }, - { "3830703844770425343", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "40704767167309552", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "13973028408397200796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) }, - { "16561224775421968533", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "11243840588602365090", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) }, - { "14103112843209793966", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) }, - { "10483664832302187567", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) }, - { "8100595788531468781", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) }, - { "6620782733027313312", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) }, - { "13526488884846845330", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3534971503826416049", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 354) }, - { "10425889533411573166", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5214654427283761256", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13569941893504840630", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "1318571118468536310", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17724604495865223459", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12229574562535756991", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7264274394359484318", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15069906408448814772", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 270) }, - { "11857037689248685487", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "7977195117668583981", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15678385128478075284", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13025361884606488732", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16723478941106779069", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 269) }, - { "726985753660756762", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "586947787345351152", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11418379777288974452", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2575631797904040925", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "6288489890578212082", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5649082203775427830", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8036474422877454869", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 187) }, - { "3711525118850629466", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1875764913306932583", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "548663565933738403", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "17329287216741045059", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { 
"11848462434662954749", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "7581174843529024536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "11334122788337402526", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "7868973874302246233", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 109) }, - { "17209528805596238905", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7878605163588288309", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "5941092474669713339", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 109) }, - { "13738760763969959522", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 90) }, - { "11988546375476924356", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 52) }, - { "13680926356824317761", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 41) }, - { "2530317332900569142", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "2891736961665476908", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 53) }, - { "18008552719153887303", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) }, - { "1299545313185409227", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) }, - { "17907223570737272640", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) }, - { "6949539207944972855", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11207257238719531888", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "13898284586432291433", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5120466856097219243", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4197617702037834389", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "1249137685908951501", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "14716719350966652036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "4840004190985490064", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1540041682425757361", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "3715177305271762194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 120) }, - { "10001963042016663554", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "481328129206881674", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "13404888565084206853", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 120) }, - { "12348135936862667024", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "15471470494305051299", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "6181272224000872375", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "4701832665603867798", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "2030309697153345387", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "15643135666029727865", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18180655791734632264", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12990527753120735255", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5303970743736042689", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1596353239542510685", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 359) }, - { "8040001390872143271", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12052207771201936228", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9942099207256025216", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - 
{ "60509335250891515", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11499219760597131534", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "6726099352298108756", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "597073780328219388", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10783630257421062891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "6988492019664525206", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "7132328255408635227", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "4006884370026272807", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13938466156916423478", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 363) }, - { "8689206546467098603", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "5644068493155655611", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4867937397499803072", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 284) }, - { "2702144517025248597", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3304589333915676807", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12894625941923144893", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 284) }, - { "11649407835105973949", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "4897991181236908768", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "12179581684777023804", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2806529556090896246", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 124) }, - { "11327228813412934262", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "5485749317130402302", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "3499243120652875549", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 124) }, - { "10916647716124396856", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "5749536453225343663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "789359733867650915", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "12626014184575881530", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "1201692134690347847", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "15249442550355454201", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2598267743388306204", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "7181154048972884375", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10930640103080573253", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "8458082326743351141", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "584086621952390547", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4754967381316623440", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4353842547963164546", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "6131481289104111211", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "517997325935712670", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 332) }, - { "5600807544955072308", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "973966345068677905", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "8532217744217419503", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "14614844213016502202", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { 
"4126895998426674411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "9700808806849459216", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "2438261005924916746", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 366) }, - { "4056971751486746551", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "8929453032482114162", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "7662200927459001757", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "11473442921040533207", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "388828310152538138", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 356) }, - { "1643241486250690844", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "11806105193035393795", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8843585527713905568", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "13248567106128518549", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13708979487306970634", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "14406070210216948643", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "15352245788978088971", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "1435153323458789173", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17638692805430115529", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "14068780861332616363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 324) }, - { "6656593119788274992", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "14695781272831602408", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "15696910741835640150", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15315327794058441258", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7545013298074733778", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "4026686872534942904", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "6553736978928374036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "12129572274423886770", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "9723314434598141024", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "11031625790234068916", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 240) }, - { "1138439260035360722", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "8323445733669842657", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "54019631544204590", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "8971115542951085891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "4584970211859494304", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "9321208819255762521", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 240) }, - { "12617625046664709483", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "8264178890341675354", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "5334190564423375247", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "14746359019867963124", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "2044363708106765326", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "5132761922124425835", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) }, - { "8141428150264829362", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "276407276027553756", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11878734040194151073", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11622925573287101001", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3192332625020432602", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "9785114056964539323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "9410978119783758141", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "12523676912856063091", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5912451559447635837", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "10264913782610095832", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 143) }, - { "10309083227104422150", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "8500148569566077929", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 123) }, - { "6578908625437515675", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "13762042713029963144", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "1561225943337590599", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "10917498758625273194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "14335423820860953927", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "875142032423622622", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 120) }, - { "8965747921518186477", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 3) }, - { "4428101657497677982", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "5779388310240896974", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11092828091552833150", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 43) }, - { "10295330953350618042", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) }, - { "15901675909820977223", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) }, - { "4894227264080887361", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "381149736509958403", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) }, - { "7962991673727743706", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) }, - { "12725675221990905186", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17961702508543961900", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "7082007579524697455", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1867337342417952506", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8931169575495985034", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16542318967217020315", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10626341369865893888", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9090828337597312855", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13621339501067135142", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "13754540732991287617", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "6669808855737023569", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "17640725195881101275", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6928136130626403937", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15047676717402283805", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { 
"1082574490068006980", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "6557428245898292304", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "9440117898128288296", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4672441137336208890", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14289082888174784976", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 198) }, - { "5056859994174498686", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 188) }, - { "16574710115918192418", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15839295895890205274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "16307464696265537356", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11910735867274493498", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14671212883301405408", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "12028665820838352309", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4773123925616969670", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "13602140021189675477", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7708321360699824256", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8609939102588915855", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10782611933832492335", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "8857763129101380288", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1230262279011217327", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "14424566003632608852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5497751772699578150", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9541630719145326121", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "10724501418439612080", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "187352687850707150", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3438296636411972401", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4165036357594592683", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15106614232165315070", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17477062954520561609", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6664432489777052771", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "3341302541468955849", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "11626402549863483301", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "3522383297921565178", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 259) }, - { "8651641584737798174", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 262) }, - { "12473600360154597915", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "13296242326766100583", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 143) }, - { "12068797674575015662", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "6297802534570892679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 159) }, - { "10037086825900566930", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 143) }, - { "17216583849049249733", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 260) }, - { "1287490919205560806", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 138) }, - { "738850098651678143", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "7139714914586273766", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 141) }, - { "14050124896329573468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "5429130923188159806", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 98) }, - { "7953255701516490034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "6195916781434462809", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "11025471731438443683", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) }, - { "4622514167765722873", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) }, - { "14680730265621679042", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) }, - { "12141300895511301068", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 140) }, - { "17106086048442658788", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12707946849050970702", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 143) }, - { "17154337492545826355", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "10109431802089940590", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "9428176632140441528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 244) }, - { "52089503050497755", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "12297371032753209816", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 214) }, - { "659150305191479097", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "2065752819810364738", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "13583166868754499339", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "13991205023798493715", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "8939683514448064461", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "18337160891834020517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "1154228007901031779", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 353) }, - { "15156525717629023944", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "7757331094141318304", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "16779678846332091086", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5409924335138540834", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "4149728557142033774", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "6443517114667332732", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "5419041493176804960", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "15948383678216076358", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "9604982746455852556", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "15739274921308457528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "4642234334824303290", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "13200151444914751729", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "16894871557229780934", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "9933958860597451711", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "17094948685292534952", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "9762182215179534181", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "18273537339378756543", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "7720939595094113814", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "5865480930796299143", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "10058165874008941852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "17309326904418811234", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "5592428580503282095", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "16348402367953880206", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 242) }, - { "13607830451968188080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "9311802150474489673", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "5159470523468873105", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "7975810844103449438", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "11455843788148231615", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "1410630713443793537", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "17303408650780384587", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "12069726772532946193", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "6204183474669103812", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "12874626654611400042", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 135) }, - { "13546876216568825877", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 122) }, - { "2973436171295280783", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 41) }, - { "1908809004094565452", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 62) }, - { "2322559721899919275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 167) }, - { "5766507688771440170", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 22) }, - { "16626226341188424071", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 62) }, - { "11709992724966310174", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 124) }, - { "17222005830854879661", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "14224121742920800990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 84) }, - { "1071007164550012186", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6719302427415173754", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 44) }, - { "10482582307328548806", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) }, - { "407189201971322683", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) }, - { "6531171505861182429", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 4) }, - { "879005904827468163", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 0) }, - { "8460847842045253466", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) }, - { "10488269059469838160", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 154) }, - { "11359409533744011242", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14813178380338948912", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "6307939332939714967", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10894058425957901202", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16610284927818475574", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3221469860582147955", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6423785822515265784", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { 
"742689192890486807", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7349880498513046830", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "2369451367723962073", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "11690533591656807605", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9205978149692979955", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2728938624042183713", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2781309272856442321", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "579781312141502576", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12564687330941036772", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8421388456873652700", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12177387334053203378", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11239541755868028928", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "12776081190690731910", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 170) }, - { "5648658688155716974", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12213354854947437262", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "5680236635030250712", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "5751283221740229986", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3646228701104397128", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 171) }, - { "13776178598632392721", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 172) }, - { "13364676690016875118", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 174) }, - { "3141773224039276177", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16384186388687043048", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 172) }, - { "14421898375873029115", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "8922929126299811091", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10256831975351722184", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12590922530749026871", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15209909241815414156", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8791285622784082122", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "7474592508575297101", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "12068974703657294908", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "10682300249493137042", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "1788455099959676873", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "15225354446874994535", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "3226193790517362610", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "15814015810740458605", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4129722446574108695", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "18094205332383644037", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "11120846960057008937", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "9195732599757736182", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 140) }, - { "9939234037869927090", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5898740235388207878", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16694984452720336415", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "4889188980319017094", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "14412158605670555579", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 159) }, - { "3463959257726925426", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "15726902746983125797", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "8463615810239412362", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16531824466148265247", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 126) }, - { "3374410641320310726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "9589942627115344216", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 101) }, - { "12864204111424196179", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "840202264034382558", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 41) }, - { "16386955278777720573", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 95) }, - { "16267682394077585279", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10544411879329675593", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 7) }, - { "9835739612255048978", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 6) }, - { "6293403765897901528", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17596685300497748803", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "2150326211917340956", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1587501521145162454", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7561096442572829049", std::make_tuple("convolution_gpu_bfyx_3x3_dw_opt", 69) }, - { "15078262396281327048", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16383540667048742064", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16820082917500285799", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6820284286806022849", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17285815901490707654", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "994182747184593564", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "6642767323474835034", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "3215659303601163167", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "54975980454651672", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "11529876081402974396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "10308113903347312964", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6712698149192186833", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14930789530046665855", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "2204178900998688268", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17174919737114915467", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15154700439767512396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "14916625550370402883", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "7650375560336513366", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "9999553425206328238", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "14026570177552137240", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11686670048744589243", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6678796313875454849", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "641417817126876622", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9622546530872848323", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9194788897910888066", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "522181557896569275", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "3332334993503432420", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "16131448347558322280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "15924916465272239832", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11669828823444745889", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "7243917162812988891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "17891499682354369344", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "14532519639619315651", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3635446784873718932", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "18275601715050791851", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6997971129340865650", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "10722782762733112118", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "6585223640997887253", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6205240287062600210", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 238) }, - { "17522452942286240233", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6571438978296387721", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "15511138074959300404", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11107930597263802755", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10320711719466983961", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16884228931101540030", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "8253823502854784432", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "6025872155179042054", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "10173283505468233128", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "16094174852600023296", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "11077876432364512822", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "10586018593856542117", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "18436249934780056991", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10179916356323479080", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "1760391741350091665", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "1418595171949196661", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "15967614281807823696", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "15329680728165965773", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "8794896449397768269", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "12151068022697708126", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15959543980008442942", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "10861769381993948050", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "3316798708399098230", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "4734389463002799056", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 
-1) }, - { "6911215749850066204", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "18267175011323462494", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "3109104171383198425", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "18136135457402651842", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "11834683513280095384", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "4806571630436601566", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "14849108908297747749", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "8490260671996115530", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 347) }, - { "2929715823970060874", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 346) }, - { "15924583510704449214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 345) }, - { "14331658870024759698", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "6340128090694375876", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "1120455113299469776", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "17268201530818712998", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "3644282167178264526", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "360872770877634346", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16720108310653948550", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14353390922580547467", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "9868561386826862471", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17465517455679097501", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "5570311824197099845", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "7524311370696987092", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "14070988879848388270", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "8296551195150971668", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14352796912241296357", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "9840495023131952174", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "4720851194954041037", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "17515064188391421150", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10437367877444543776", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "4362304842016958728", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "383721620126444793", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "138379779469699309", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3759515057574218101", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2856601829807186494", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "3286330985102373533", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "8159303545761286685", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "4056979460327024961", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "17823133607491820214", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "7969441643457570812", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "970768445746568749", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "13852065717057446998", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4342360467977736802", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16336482874764861478", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6075691042233712335", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7570346182940928159", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "12971822824884826169", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3033264172690274208", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "17301887391757619741", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 330) }, - { "15790005937034794347", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 241) }, - { "15464327246951632247", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5659168916726488798", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "8079376692609682448", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "15160738482264643601", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "537074122417021898", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "3336076058264596420", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 238) }, - { "1982176363226079588", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "15052577143485630617", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "9314293064351558241", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "4958835037528182801", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "6817494598328071314", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "14387756025635589673", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "17536308070854915513", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16027456210394993913", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "8655315308767111198", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "4447065688824381344", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "6843617687528352801", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "17900257435531434807", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16789135236017252073", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "13224814158106791463", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5078905972285278557", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4196367396954155354", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7009873605945341897", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7199295899520406795", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16833854122884184025", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 347) }, - { "14599780481362761532", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 348) }, - { "2572395498687401679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "11810221946429451169", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "18084635102736402756", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "59739211822469868", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "5240181393417899912", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "15962137123591591534", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "10989937450490049763", std::make_tuple("convolution_gpu_bfyx_1x1_hgemm_buf_16x1", -1) }, - { "9798585825695496550", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2362092095402043749", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4444730303823507621", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "487214150851213303", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "745009493367761775", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3806761527342944195", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14458851250685872417", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "7106362077449435105", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "5853697372844744672", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "7603319690872333930", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "4628748977913534701", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 338) }, - { "10565371760124443824", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "1972879521448306536", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "13893808009363736870", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6584960721513702502", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9220830217525628783", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2235210915304938149", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 190) }, - { "3930314908786112883", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "1334070221835422461", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 157) }, - { "6681818065741882453", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 189) }, - { "6980201892073961793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "11530101016435264783", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "4801117903303888658", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "5782934278345953016", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "13951717514084457087", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 329) }, - { "2721793280965260548", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "8124736388338424498", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 331) }, - { "12223993560805441284", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9860570706348640782", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "991586070509079617", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 192) }, - { "7060804814325505165", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "787203599734115483", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "6193161166790398003", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "12806934028210472719", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 189) }, - { "7465681710653503161", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "7958443549125799229", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "15548847099740441551", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16986610822918634530", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "438528596970898721", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15109847707903824859", 
std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "7121708962074176240", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "16789245987103323406", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6318228858846223186", std::make_tuple("convolution_gpu_bfyx_1x1", -1) }, - { "14043770215999952932", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15277856047844308598", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8048617952947915835", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "11446745541571732900", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17422822627612865758", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "13954144830230671601", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "11198908896401597838", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 109) }, - { "5582896843095691256", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 109) }, - { "8133587696326295326", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 109) }, - { "2007192658799516915", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "9492402787848610840", std::make_tuple("fully_connected_gpu_bs_f_bsv16_b1", -1) }, - { "10515519878978734341", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "8747430148550634190", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16986358655784856534", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6109013751635776331", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "9585113116232600562", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "3503893875515897267", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "13144385730409574259", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "743941460026466526", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "4492332228252010118", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "1920042803083729276", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 229) }, - { "16436006771518788093", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "17567504672169904482", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "1989849521691057108", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "16706121580364790904", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5495776091407365966", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "16430562172386510259", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5673972310424776040", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8797843396807284399", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "1698321314111848001", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "5762290464889692462", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "3218248162832023196", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "12988961529988078346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "4232250144427804891", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "11683680166617045816", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "6252429564537528709", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "13145474177271090694", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { 
"1208161922424418734", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "2762489653422414995", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "12937333118472722002", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "12917241193304093727", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 110) }, - { "11020315012951440351", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "1518270620354036926", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "2567046336192437734", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 210) }, - { "16409729623371222748", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 212) }, - { "1044978617045366709", std::make_tuple("fully_connected_gpu_fb_io_b8_f8_vload", -1) }, - { "8473037597903277214", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "14398366949002972908", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "7334966010680206302", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "4161141078006269526", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 226) }, - { "6522575549211855712", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 154) }, - { "5629373398445592781", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "13374993751390784382", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 315) }, - { "12976499206227689731", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "9882204352209412039", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 365) }, - { "5041111302824362529", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "13869716373706247686", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6438522646185979880", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "2406816735581074778", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "8881150100883636392", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "593712935037568960", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 317) }, - { "11970881115757095265", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 319) }, - { "5584432943673435454", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 317) }, - { "4560479630843098090", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15374625876485618845", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 348) }, - { "13102754309439605192", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "17912189681971987483", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "8153567933591966877", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 352) }, - { "1604661321386793876", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "8990561333549136048", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 360) }, - { "12278364834477923930", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3122997634505472500", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "15669490019428002270", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 270) }, - { "116291934148608396", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 270) }, - { "14729854278671832528", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "10591379189397010097", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 237) }, - { "11929531534620071758", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 239) }, - { 
"1819720745131968914", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "10607904718265020949", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "913496537924971856", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 187) }, - { "916389941321470163", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "1411786954276574458", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 357) }, - { "2730604806511016352", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 107) }, - { "5843679089588930933", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 110) }, - { "7304346312452588844", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 231) }, - { "2423754482456771339", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "3653156933813711765", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "11149782181562145291", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2653651564133701304", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "3526580286148537369", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3985659568982275663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 371) }, - { "13642146548740074992", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 371) }, - { "5219399418946822456", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14217181622713951411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 303) }, - { "13025323039227543550", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "6114147683777615071", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "2355214244972870639", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 140) }, - { "3167336012388169649", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 142) }, - { "12218337369633748663", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 141) }, - { "7264756313770306662", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "10492056481694320580", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "14281201038135286621", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "8127190765748950828", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 271) }, - { "142486914279119363", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "1532263118203058517", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "5482851829165191681", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 270) }, - { "10548792624072794724", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "4239415134522959352", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "9028970753877215614", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 122) }, - { "2324120381399737261", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "10267260789603562117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "9988801796928462423", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "12516911293946682547", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 246) }, - { "9213886570531053949", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 55) }, - { "385046297070779752", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "12541834857357563605", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 124) }, - { "475043738497218394", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 136) }, - { "6351347283201596793", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 93) }, - { "16290626406346691996", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 23) }, - { "4569338575782832784", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "7575675354187625951", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 349) }, - { "5795073619189010837", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "15123868617509445149", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "5601435819039968726", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "14104238386345631681", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "17377293745073971167", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "12134712464763856064", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "5524215233998361104", std::make_tuple("convolution_gpu_winograd_6x3_s1_fused", -1) }, - { "1103228955716492167", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 247) }, - { "8618835732380720921", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", -1) }, - { "15908673392788376468", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 261) }, - { "8482147530539941792", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 299) }, - { "9069334144391048686", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 126) }, - { "12493863403516600413", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "16692569816843207989", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 265) }, - { "3438116423688595487", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 254) }, - { "15602863681196390535", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 245) }, - { "18277685132620834972", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "16541722316343690197", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 127) }, - { "3067806959725855130", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 136) }, - { "17791773192152464021", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 57) }, - { "13603318842632052764", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 1) }, - { "879896719155824868", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5219048275475447369", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8707189142909022305", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) }, - { "5948701218437980356", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "17050143605017295447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 341) }, - { "8906185843274300447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "8321769923556905957", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "10433541468308381909", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "10405183426600618231", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 354) }, - { "14885109535362957947", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "72444706264681262", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "16818714747882774917", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 222) }, - { "16236397968499692493", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "700717277178942679", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "482564204402769504", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "3221221905804708596", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "16467987800266816984", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 213) }, - { "11599932445375240727", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 223) }, - { "5057534502588100071", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "15640202505592598653", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "3355259926747524578", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "9226443907548972870", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "8104309105061227444", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 138) }, - { "18384657372655350144", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 219) }, - { "13739257060165119132", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) }, - { "9810904714798127155", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "15609627722687211129", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 358) }, - { "14738573151275130683", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "9421927854269492263", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "15962533525948221648", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 138) }, - { "15856268902838573812", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 154) }, - { "4085450203909854919", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 194) }, - { "2370837049876630969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "13464226348405628455", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 252) }, - { "12228963567837353733", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "10377729875228238588", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "16362139250976572928", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 154) }, - { "5420766967862917815", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 194) }, - { "14578291812739325465", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 155) }, - { "18310667924071639899", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "16853250891250756537", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "12990341489637414845", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "14630499010941056793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "878892264408839067", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 156) }, - { "9259437778054905599", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 195) }, - { "14974730512607138726", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 193) }, - { "3600066510593746268", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 159) }, - { "3140230065585683313", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "15891662883560480723", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 367) }, - { "11284755586130392759", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "2281119269283845320", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 158) }, - { "12246408434917478929", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "13283842370311517843", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 193) }, - { "13753473508578037346", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 134) }, - { "17123153447808465303", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "10700011669103135203", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 321) }, - { "9979259596137305973", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "17225578855755054959", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "6471563320494376693", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "8146945902795164796", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "18372284940315010254", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "2194607895573544953", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "1332624116953483870", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "158222105675022402", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "6830387121684699972", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "11077503608116183709", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "17847109385592002207", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "13384754476437374504", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "11462462742322068863", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 322) }, - { "4265693151382066296", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "11070620435959083971", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "6982733543386888622", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "3563614453014995411", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 126) }, - { "3498490999014554104", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 327) }, - { "15595549493819416194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 99) }, - { "14532844474906286088", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "9562291747339451180", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "6772239376357727149", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 126) }, - { "10690972785852373520", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 49) }, - { "4488336106517889531", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 82) }, - { "10058614204420018541", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) }, - { "13865227850818392065", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "14100870590396726248", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 40) }, - { "10848277915422577656", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 47) }, - { "8121179472578287280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) }, - { "2502125887857336825", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) }, - { "13192808619929896995", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 4) }, - { "5115661026367632863", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) }, - { "12812685418923919055", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 5) }, - { "13131740479277027362", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "15334195300678132907", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "2038505773698938555", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12090536142661253835", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "4999505377862312410", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "3934290309368153435", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "5951936376654416075", 
std::make_tuple("fully_connected_gpu_bs_f_bsv16_b1", -1) }, - { "13204120207726209723", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "17108987360340581555", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "4795705973706796563", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "4084106758501882407", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "8127570953237266335", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "7500192998744460131", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "2379484884827231127", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "13933912937625580405", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "2040762223425679479", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "4800587664660105589", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "1616603916015535857", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "10290107543739998181", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "5927467766675317093", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "3349519148124496343", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "13477416097954638887", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "6942049339361951275", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "5303170164698694791", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "12494969618927201911", std::make_tuple("fully_connected_gpu_yxfb_ref", -1) }, - { "7875724726741958520", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "13835908664998757647", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "6407471972820516685", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "5385316497510064491", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "17377315194963069204", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "3889519976910355277", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "12081835728078383819", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "14923692894655929923", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "580936360000782237", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "10682918518101379579", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "13178480813522103091", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "6149673627320838019", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "11077876432364512822", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "6062246008880097669", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "4806571630436601566", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "2458592904274981909", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "2007192658799516915", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "1051506168926530904", std::make_tuple("fully_connected_gpu_bs_f_bsv16_b1", -1) }, - { "4163359403543480821", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "4232250144427804891", std::make_tuple("fully_connected_gpu_bf_io_gemm", -1) }, - { "5415319660821122528", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", 
-1) }, - { "3286629188347536485", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "13575423234109624706", std::make_tuple("fully_connected_gpu_yxfb_ref", -1) }, - { "1841155673858789206", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "6708349666663292171", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "5083163738120585821", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "10572945270796129630", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "4436244774193918646", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "12985942652866621579", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "775538461106687677", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "9533360488591027707", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "8913823292181409151", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "11583985978586657985", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "1485662490111767875", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "11872464450773754851", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "5364060938737428149", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "2613462626256090659", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "14668725050395069435", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "17381516856910544374", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "6450532136308941035", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "2321767794934000238", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "7995820969034996638", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "17951403431757222177", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "1074748462756364699", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "11955992313739654625", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "11939914680143672459", std::make_tuple("fully_connected_gpu_fb_io_ref", -1) }, - { "17806712457019493207", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "11862259122805366807", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "15201438563802430490", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "5374969798377773063", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "592245952014430043", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "3114869763557037270", std::make_tuple("fully_connected_gpu_fb_oi_ref", -1) }, - { "17147293671640396193", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "6911215749850066204", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "2814805887448339818", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "1120455113299469776", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "8002233052700666718", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "16436006771518788093", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "11083993858285515074", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { "4133424990380177132", std::make_tuple("fully_connected_gpu_bs_f_bsv16_af8_vload", -1) }, - { 
"1044978617045366709", std::make_tuple("fully_connected_gpu_fb_io_b8_f8_vload", -1) }, - { "952318454591754214", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - { "5762878778443755104", std::make_tuple("fully_connected_gpu_bs_f_bsv8_af8_vload", -1) }, - }); - } -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT4e_B32_B64.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT4e_B32_B64.cpp deleted file mode 100644 index c41fd5c..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT4e_B32_B64.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "auto_tuner.h" -#include "auto_tuner_offline.h" -namespace kernel_selector -{ - // SKL GT4e - void tuning_cache_193B_B32_B64(tuning_data& td) - { - td.td.insert({ - - { "10794662801660960189", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 749) }, - }); - } -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT4e_B8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT4e_B8.cpp deleted file mode 100644 index d82ede4..0000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_SKL_GT4e_B8.cpp +++ /dev/null @@ -1,169 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "auto_tuner.h" -#include "auto_tuner_offline.h" -namespace kernel_selector -{ - // SKL GT4e - void tuning_cache_193B_B8(tuning_data& td) - { - td.td.insert({ - { "11207257238719531888", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - /* { "12348135936862667024", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "1540041682425757361", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "6949539207944972855", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4197617702037834389", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "13898284586432291433", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3715177305271762194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "481328129206881674", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "10001963042016663554", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "15471470494305051299", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "14716719350966652036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "13404888565084206853", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "1249137685908951501", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "4840004190985490064", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5120466856097219243", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4701832665603867798", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "6181272224000872375", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "2030309697153345387", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "11207257238719531888", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "12348135936862667024", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 202) }, - { "1540041682425757361", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "6949539207944972855", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4197617702037834389", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 326) }, - { "13898284586432291433", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "3715177305271762194", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "481328129206881674", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "10001963042016663554", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "15471470494305051299", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 323) }, - { "14716719350966652036", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "13404888565084206853", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "1249137685908951501", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 320) }, - { "4840004190985490064", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "5120466856097219243", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "4701832665603867798", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 243) }, - { "6181272224000872375", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "2030309697153345387", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 335) }, - { "10728212277329722684", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "877436308867220589", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "18375125668176498051", 
std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "14907097142953816744", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2525260242689556544", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 375) }, - { "13328449155966085543", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 373) }, - { "11856266545854830143", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "15993427814066246646", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "2100891581797371600", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 275) }, - { "12242618640422208652", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "6133592828563353516", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 304) }, - { "18232278892738147217", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "11992625045241269569", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 298) }, - { "12601126285773042005", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 297) }, - { "5079055505117153635", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 195) }, - { "7457899998356343871", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 291) }, - { "6343888265369366589", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 296) }, - { "10791067159964399241", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 309) }, - { "11327097771110264965", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "5245308722062496788", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 314) }, - { "10792503079194374004", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "4818231379191523896", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 288) }, - { "2198278382394812839", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "3800011935243649447", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 298) }, - { "9631545863582097486", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 255) }, - { "7777333052643961206", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 251) }, - { "1779941298820543013", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "3621930417735246405", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 263) }, - { "14435120971846098308", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 188) }, - { "2893564501191050837", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 181) }, - { "8108843303778211282", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 256) }, - { "3682813162987778705", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "15494543914974994991", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "7565221050911842393", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 93) }, - { "5629670679897666607", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "11754316727756881612", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 96) }, - { "10990741293315393791", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "17024388383581997032", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "10302338806536775954", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 204) }, - { "7915318733663535312", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "13702692566238948173", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "2909728331855309274", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "13071545223094862275", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "9631481972809246378", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 211) }, - { "13540002981450186147", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "7076937538747704750", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "10290107543739998181", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) }, - { "6767245864232675168", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "9287404618748313247", std::make_tuple("convolution_gpu_bfyx_gemm_like", -1) }, - { "8728178019712933221", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "18251360413872841969", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "18271689282126907793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 294) }, - { "954796765467489259", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 325) }, - { "13597240991532942069", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "4135003545872878882", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 296) }, - { "11883485911218628865", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 293) }, - { "2242915551775617989", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 291) }, - { "10556089809203693400", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 290) }, - { "3727142736386026852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 333) }, - { "1622880009460832832", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 182) }, - { "4437258459981739942", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 296) }, - { "14691372262153587653", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "12181607120522804433", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 334) }, - { "3159681096461848644", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "6729785110495533200", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "15322019609805777935", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 183) }, - { "7024495439434892956", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 295) }, - { "10416622008071151225", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 173) }, - { "5796500397424307442", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "15702382940521972117", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "6093575518270471235", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "5805383505505929391", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "1801731858063091191", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 175) }, - { "1559798212423183813", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 176) }, - { "5594180958505308003", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "4766071144928072260", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 177) }, - { "8650948093564284852", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "3883845471211207871", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 248) }, - { "4366168099274266975", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 171) }, - { "578703329577922869", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 253) }, - { "16863960779539003201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "15450609897480659306", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 178) }, - { "8203550467004532364", 
std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "7431849514656037251", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 257) }, - { "14484890926084856480", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 179) }, - { "4424217045094988504", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "7994179151788368291", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 180) }, - { "15192024816519005250", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 258) }, - { "4747159205186229582", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 103) }, - { "5485971317082563152", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 249) }, - { "18128162750557822655", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 91) }, - { "12421707187947291166", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "792684262493086891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "941626985322260281", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 207) }, - { "11868551452004726281", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "14352303529756685990", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 206) }, - { "10702234389482091891", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "3895088069642140043", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 200) }, - { "5334566325056222430", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 205) }, - { "8306337702797456793", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "15720507574336564201", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 240) }, - { "3277243911383750280", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 208) }, - { "18150429561058646714", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 240) }, - { "11169292427557543138", std::make_tuple("convolution_gpu_bfyx_os_iyx_osv16", 203) }, - { "13933912937625580405", std::make_tuple("fully_connected_gpu_bf_io_input_spatial", -1) },*/ - - }); - } -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/arg_max_min_axis.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/arg_max_min_axis.cl index 9a7691b..8e85b9c 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/arg_max_min_axis.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/arg_max_min_axis.cl @@ -12,19 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
- #include "include/common.cl" #include "include/data_types.cl" #define GLOBAL_SIZE 128 #define LOCAL_SIZE GLOBAL_SIZE -typedef struct /* Index and Value type that holds index and value used in this kernel */ -{ - uint index; - UNIT_TYPE value; -} iav_type; - #ifdef BATCH_AXIS #define GAP_SIZE (INPUT0_FEATURE_NUM * INPUT0_SIZE_X * INPUT0_SIZE_Y) #define VALUES_NUM INPUT0_BATCH_NUM @@ -73,6 +66,7 @@ typedef struct /* Index and Value type that holds index and value used in this k __attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1))) KERNEL(arg_max_gpu_axis)(const __global UNIT_TYPE* input, __global float* output) { +#include "include/arg_max_min_common.cl" uint results[TOP_K]; __local iav_type scratch[LOCAL_SIZE]; const uint first_dim_id = (uint)get_global_id(1); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/arg_max_min_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/arg_max_min_gpu_ref.cl index 3ad4ac6..7db799b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/arg_max_min_gpu_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/arg_max_min_gpu_ref.cl @@ -12,19 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. - #include "include/common.cl" #include "include/data_types.cl" #define GLOBAL_SIZE 128 #define LOCAL_SIZE GLOBAL_SIZE -typedef struct /* Index and Value type that holds index and value used in this kernel */ -{ - uint index; - UNIT_TYPE value; -} iav_type; - #ifdef MAX_OUT #define COMPARE_SIGN < #define UNIT_FILL_VAL UNIT_VAL_MIN @@ -36,6 +29,7 @@ typedef struct /* Index and Value type that holds index and value used in this k __attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1))) KERNEL(arg_max_gpu_top_k)(const __global UNIT_TYPE* input, __global float* output) { +#include "include/arg_max_min_common.cl" uint results[TOP_K]; __local iav_type scratch[LOCAL_SIZE]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_gpu_ref.cl index 7fe1a8a..aaf60c3 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_gpu_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_gpu_ref.cl @@ -20,9 +20,17 @@ __attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1))) KERNEL(batch_norm_gpu)( const __global UNIT_TYPE* input, -#ifdef FORWARD - __global UNIT_TYPE* inv_var, -#endif + #ifdef MEAN_VAR_OUT + __global UNIT_TYPE* mean_out, + __global UNIT_TYPE* variance_out, + #endif + #ifdef SCALE_SHIFT + __global UNIT_TYPE* scale, + __global UNIT_TYPE* shift, + #endif + #ifdef FORWARD + __global UNIT_TYPE* inv_var, + #endif __global UNIT_TYPE* output) { __local ACCUMULATOR_TYPE sum[LOCAL_SIZE]; @@ -56,7 +64,9 @@ KERNEL(batch_norm_gpu)( } UNIT_TYPE mean = sum[0] / (OUTPUT_BATCH_NUM * OUTPUT_SIZE_X * OUTPUT_SIZE_Y); - +#ifdef MEAN_VAR_OUT + mean_out[f] = mean; +#endif sum[local_idx] = 0; input_idx = GET_DATA_INDEX(INPUT0, local_idx, f, 0, 0); @@ -83,7 +93,9 @@ KERNEL(batch_norm_gpu)( } float variance = sum[0] / (OUTPUT_BATCH_NUM * OUTPUT_SIZE_X * OUTPUT_SIZE_Y); - +#ifdef MEAN_VAR_OUT + variance_out[f] = variance; +#endif float inv_variance = (float)(1.0 / sqrt(variance + EPSILON)); #ifdef FORWARD if (local_idx == 0) @@ -95,9 +107,15 @@ KERNEL(batch_norm_gpu)( { for (uint x = 0; x < OUTPUT_SIZE_X; x++) { - output[out_idx] = inv_variance * 
(input[out_idx] - mean); + #ifdef SCALE_SHIFT + output[out_idx] = (inv_variance * (input[out_idx] - mean)) * scale[f] + shift[f]; + #else + output[out_idx] = inv_variance * (input[out_idx] - mean); + #endif out_idx += OUTPUT_X_PITCH; } out_idx += OUTPUT_Y_PITCH - OUTPUT_SIZE_X * OUTPUT_X_PITCH; } -} \ No newline at end of file +} + +#undef LOCAL_SIZE \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/broadcast_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/broadcast_gpu_ref.cl index 286608f..ecda287 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/broadcast_gpu_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/broadcast_gpu_ref.cl @@ -16,16 +16,21 @@ KERNEL(broadcast_gpu_ref)( - const __global UNIT_TYPE* input, - __global UNIT_TYPE* output) + const __global INPUT0_TYPE* input, + __global INPUT0_TYPE* output) { // [CONSTEXPR] // Input sizes: - const uint in_sx = INPUT0_SIZE_X; - const uint in_sy = INPUT0_SIZE_Y; - const uint in_sf = INPUT0_FEATURE_NUM; - const uint in_sb = INPUT0_BATCH_NUM; + uint4 input_indices; + input_indices[0] = INPUT0_BATCH_NUM; + input_indices[1] = INPUT0_FEATURE_NUM; + input_indices[2] = INPUT0_SIZE_Y; + input_indices[3] = INPUT0_SIZE_X; + const uint in_sx = input_indices[BROADCAST_ORDER[3]]; + const uint in_sy = input_indices[BROADCAST_ORDER[2]]; + const uint in_sf = input_indices[BROADCAST_ORDER[1]]; + const uint in_sb = input_indices[BROADCAST_ORDER[0]]; const uint out_x = (uint) get_global_id(0); const uint out_y = (uint) get_global_id(1); @@ -40,9 +45,8 @@ KERNEL(broadcast_gpu_ref)( const uint in_f = out_f % in_sf; const uint in_b = out_b % in_sb; - const uint in_pos = GET_DATA_INDEX(INPUT0, in_b, in_f, in_y, in_x); + const uint in_pos = INPUT0_OFFSET + in_x + in_sx * (in_y + in_sy * (in_f + in_sf * in_b)); const uint out_pos = GET_DATA_INDEX(OUTPUT, out_b, out_f, out_y, out_x); - output[out_pos] = input[in_pos]; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/contract_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/contract_ref.cl new file mode 100644 index 0000000..b157875 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/contract_ref.cl @@ -0,0 +1,64 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
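[Editorial note] The new contract_ref.cl whose body follows implements a generic reduction ("contraction") over any subset of the b/f/y/x dimensions: each REDUCE_* flag turns its dimension into a loop, every other dimension is read from get_global_id via the DIM_* macros, and values are folded with REDUCE_OPERATION starting from REDUCE_SEED. A host-side C++ model of one possible instantiation, assuming REDUCE_Y and REDUCE_X with a max reduction (the flag and operation choices are illustrative, not taken from the patch):

#include <algorithm>
#include <limits>
#include <vector>

// Reduce a B x F x Y x X tensor over Y and X, keeping one output per (b, f).
void contract_max_yx(const std::vector<float>& in, std::vector<float>& out,
                     int B, int F, int Y, int X) {
    for (int b = 0; b < B; ++b)
        for (int f = 0; f < F; ++f) {
            float acc = std::numeric_limits<float>::lowest();   // plays REDUCE_SEED
            for (int y = 0; y < Y; ++y)                         // REDUCE_Y loop
                for (int x = 0; x < X; ++x)                     // REDUCE_X loop
                    acc = std::max(acc, in[((size_t(b) * F + f) * Y + y) * X + x]);
            out[size_t(b) * F + f] = acc;                       // REDUCE_OPERATION result
        }
}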
+ +#include "include/include_all.cl" + + +KERNEL(contract_ref)( + const __global INPUT0_TYPE* input, + __global INPUT0_TYPE* output) +{ + INPUT0_TYPE out_val = REDUCE_SEED; + +#if REDUCE_B + for (uint in_b = 0; in_b < INPUT0_BATCH_NUM; ++in_b) { +#else + const uint in_b = (uint) get_global_id(DIM_B); +#endif + +#if REDUCE_F + for (uint in_f = 0; in_f < INPUT0_FEATURE_NUM; ++in_f) { +#else + const uint in_f = (uint) get_global_id(DIM_F); +#endif + +#if REDUCE_Y + for (uint in_y = 0; in_y < INPUT0_SIZE_Y; ++in_y) { +#else + const uint in_y = (uint) get_global_id(DIM_Y); +#endif + +#if REDUCE_X + for (uint in_x = 0; in_x < INPUT0_SIZE_X; ++in_x) { +#else + const uint in_x = (uint) get_global_id(DIM_X); +#endif + + out_val = REDUCE_OPERATION(out_val, input[GET_DATA_INDEX(INPUT0, in_b, in_f, in_y, in_x)]); + +#if REDUCE_X + } +#endif +#if REDUCE_Y + } +#endif +#if REDUCE_F + } +#endif +#if REDUCE_B + } +#endif + + output[GET_DATA_INDEX(OUTPUT, 0, get_global_id(0), get_global_id(1), get_global_id(2))] = out_val; +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_1x1.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_1x1.cl index bfba2d9..cf25001 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_1x1.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_1x1.cl @@ -13,6 +13,7 @@ // limitations under the License. #include "include/include_all.cl" +#include "include/sub_group.cl" #if FP16_UNIT_USED #define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset))) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_1x1_opt.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_1x1_opt.cl new file mode 100644 index 0000000..8773666 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_1x1_opt.cl @@ -0,0 +1,238 @@ +// Copyright (c) 2016-2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "include/include_all.cl" + +#define SIMD_SIZE 8 +__attribute__((intel_reqd_sub_group_size(SIMD_SIZE))) +KERNEL(convolution)( + __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + __global FILTER_TYPE* weights, +#if BIAS_TERM + __global BIAS_TYPE* biases, +#endif + uint split_idx) +{ + const uint group_x = get_group_id(0) * OUT_BLOCK_WIDTH; + const uint group_y = get_group_id(1) * OUT_BLOCK_HEIGHT; + const uint f = (get_group_id(2) * SIMD_SIZE * OUT_BLOCK_DEPTH) % OUTPUT_FEATURE_NUM; + const uint b = (get_group_id(2) * SIMD_SIZE * OUT_BLOCK_DEPTH) / OUTPUT_FEATURE_NUM;; + + const uint ifm_part = get_sub_group_id(); + uint ifm_offset = ifm_part* OUT_BLOCK_DEPTH/2; + + UNIT_TYPE in[OUT_BLOCK_HEIGHT]; + UNIT_TYPE dotProd0[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2]; + UNIT_TYPE dotProd1[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2]; + + for(uint i = 0; i < OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2; i++) + { + dotProd0[i] = 0; + dotProd1[i] = 0; + } + +#if OUT_BLOCK_DEPTH == 8 + const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(64 * FILTER_IFM_NUM/2); +#elif OUT_BLOCK_DEPTH == 4 + const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(32 * FILTER_IFM_NUM/2); +#elif OUT_BLOCK_DEPTH == 2 + const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(16 * FILTER_IFM_NUM/2); +#else + const uint filter_offset = f*FILTER_OFM_PITCH + ifm_part*(FILTER_IFM_NUM/2) * FILTER_IFM_PITCH; +#endif + const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + group_x * INPUT0_X_PITCH + group_y * INPUT0_Y_PITCH + ifm_part*(FILTER_IFM_NUM/2) * INPUT0_FEATURE_PITCH; + + //-------------------------------------------------------------------- + // main computation phase + //-------------------------------------------------------------------- + + for (uint k = 0; k < FILTER_IFM_NUM/2; ++k) + { + for(uint i = 0; i < OUT_BLOCK_HEIGHT; i++) + { + const uint in_offset = input_offset + get_sub_group_local_id() + i * INPUT0_Y_PITCH + k * INPUT0_FEATURE_PITCH; + in[i] = input[in_offset]; + } + +#if OUT_BLOCK_DEPTH == 8 + float8 w = as_float8(intel_sub_group_block_read8((__global uint*)weights + filter_offset + k * 64)); +#elif OUT_BLOCK_DEPTH == 4 + float4 w = as_float4(intel_sub_group_block_read4((__global uint*)weights + filter_offset + k * 32)); +#elif OUT_BLOCK_DEPTH == 2 + float2 w = as_float2(intel_sub_group_block_read2((__global uint*)weights + filter_offset + k * 16)); +#endif + + for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++) + { + for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++) + { + float _in = intel_sub_group_shuffle(in[br], bc); + for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++) + { + dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _in * w[bd]; + dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _in * w[bd + OUT_BLOCK_DEPTH/2]; + } + } + } + } + + __local float slm_vals[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * SIMD_SIZE]; + __local float* slm_p = &slm_vals[0]; + //-------------------------------------------------------------------- + // second sub_group in workgroup task + //-------------------------------------------------------------------- + + if(ifm_part == 1) + { + for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++) + { + for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++) + { + for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++) + { + slm_vals[bc + OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * bd))] = dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]; + dotProd0[bc + 
OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]; + } + } + } + + } + + //-------------------------------------------------------------------- + // first sub_group in workgroup task + //-------------------------------------------------------------------- + + if(ifm_part == 0) + { + for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++) + { + for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++) + { + uint width_offset = 0; + #if (OUT_BLOCK_WIDTH) >= 4 + const uint slm_off = OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * (bd + OUT_BLOCK_DEPTH/2) )); + float4 tmp = (float4)(dotProd1[width_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd1[width_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd1[width_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd1[width_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]); + vstore4(tmp, 0, slm_p + slm_off); + width_offset += 4; + #endif + for(uint bc = width_offset; bc < OUT_BLOCK_WIDTH; bc++) + { + slm_vals[bc + OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * (bd+OUT_BLOCK_DEPTH/2) ))] = dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]; + } + } + } + + } + + //-------------------------------------------------------------------- + // add bias phase + //-------------------------------------------------------------------- + + #if BIAS_TERM + for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++) + { + float _bias = biases[f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id()]; + for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++) + { + for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++) + { + dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _bias; + } + } + } + #endif + + barrier(CLK_LOCAL_MEM_FENCE); // we want to add barrier after biases addition so that the long slm write part latency is shadowed by it + + //-------------------------------------------------------------------- + // sum sub-group results + activation phase + //-------------------------------------------------------------------- + + for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++) + { + for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++) + { + uint width_offset = 0; + #if (OUT_BLOCK_WIDTH) >= 4 + const uint slm_off = OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * (bd + ifm_offset) )); + float4 tmp = vload4(0, slm_p + slm_off); + dotProd0[0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += tmp[0]; + dotProd0[1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += tmp[1]; + dotProd0[2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += tmp[2]; + dotProd0[3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += tmp[3]; + + dotProd0[0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);; + dotProd0[1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);; + dotProd0[2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);; + dotProd0[3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);; + + width_offset += 4; + #endif + + for(uint bc = width_offset; bc < OUT_BLOCK_WIDTH; bc++) + { + dotProd0[bc + OUT_BLOCK_WIDTH * (br + 
OUT_BLOCK_HEIGHT * bd)] += slm_vals[bc + OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * (bd + ifm_offset) ))]; + dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);; + } + } + } + + //-------------------------------------------------------------------- + // output phase + //-------------------------------------------------------------------- + + for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++) + { + for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++) + { + uint dst_index = GET_DATA_INDEX(OUTPUT, b, f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id(), group_y + br, group_x); + uint out_vstore_offset = 0; + #if (OUT_BLOCK_WIDTH >= 8) + float8 tmp = (float8)(dotProd0[out_vstore_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 4 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 5 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 6 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 7 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]); + vstore8(tmp, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH); + out_vstore_offset += 8; + #endif + #if (OUT_BLOCK_WIDTH % 8) > 3 + float4 tmp = (float4)(dotProd0[out_vstore_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]); + vstore4(tmp, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH); + out_vstore_offset += 4; + #endif + #if (OUT_BLOCK_WIDTH % 4) > 1 + float2 tmp2 = (float2)(dotProd0[out_vstore_offset + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset+1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]); + vstore2(tmp2, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH); + out_vstore_offset += 2; + #endif + //dst_index += 4 * OUTPUT_X_PITCH; + for(uint bc = out_vstore_offset; bc < OUT_BLOCK_WIDTH; bc++) + { + output[dst_index + bc * OUTPUT_X_PITCH] = dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]; + } + } + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_depthwise_weights_lwg.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_depthwise_weights_lwg.cl index f21b03d..9cec96f 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_depthwise_weights_lwg.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_depthwise_weights_lwg.cl @@ -103,4 +103,4 @@ KERNEL(convolution_depthwise_weights_lwg)( const uint dst_index = GET_DATA_INDEX(OUTPUT, b, f, y, x) + out_split_offset; output[dst_index] = ACTIVATION(dotProd, NL_M, NL_N); -} +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_gemm_like_fp16.cl 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_gemm_like_fp16.cl index c28f328..eb8af3d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_gemm_like_fp16.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_gemm_like_fp16.cl @@ -167,7 +167,7 @@ KERNEL(convolution_f16)( #if (PADDING_SIZE_X == 1) && (INPPUT_PADDING_Y == 1) && (FILTER_SIZE_X == 3) && (FILTER_SIZE_Y == 3) if ((y_offset + patch_row < 0) || ((y_offset + patch_row) >= INPUT_SIZE_Y)) { - blockA00 = half_zeros; + blockA00 = { 0 }; } else { @@ -178,7 +178,7 @@ KERNEL(convolution_f16)( #else if ((y_offset + patch_row < 0) || ((y_offset + patch_row) >= INPUT_SIZE_Y)) { - blockA00 = half_zeros; + blockA00 = { 0 }; } else { @@ -193,7 +193,7 @@ KERNEL(convolution_f16)( #pragma error if ((y_offset + patch_row < 0) || ((y_offset + patch_row) >= INPUT_SIZE_Y)) { - blockA00 = half_zeros; + blockA00 = { 0 }; } else { diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_gemm_like_fp32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_gemm_like_fp32.cl index 0366f8f..0066e6e 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_gemm_like_fp32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_gemm_like_fp32.cl @@ -15,6 +15,7 @@ */ #include "include/include_all.cl" +#include "include/sub_group.cl" #define TILE_M 2 #define TILE_K FILTER_SIZE_X diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16.cl index e70ca2e..07fd633 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16.cl @@ -95,7 +95,7 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)( uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM; in_addr = batch_idx * INPUT0_BATCH_PITCH; - in_addr += in_split_offset + INPUT0_OFFSET_WITH_PADDING + or * STRIDE_SIZE_Y * INPUT0_Y_PITCH + oc * STRIDE_SIZE_X + lid; + in_addr += in_split_offset + INPUT0_OFFSET_WITH_PADDING + (or * STRIDE_SIZE_Y * INPUT0_Y_PITCH) + (oc * STRIDE_SIZE_X + lid) * INPUT0_X_PITCH; for(int kd = 0; kd < FILTER_IFM_NUM; kd++) // _ID = 3, RGB { @@ -107,7 +107,7 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)( // Horizontal position in input block after read. const uint in_block_next_x_pos = in_block_pos % IN_BLOCK_WIDTH + SUB_GROUP_SIZE; - in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + in_block_pos % IN_BLOCK_WIDTH]; + in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + (in_block_pos % IN_BLOCK_WIDTH) * INPUT0_X_PITCH]; // If we have row break, move to the next row. if (in_block_next_x_pos == IN_BLOCK_WIDTH) @@ -120,7 +120,7 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)( const uint in_block_next_x_pos = in_block_pos % IN_BLOCK_WIDTH + SUB_GROUP_SIZE; if (in_block_next_x_pos <= IN_BLOCK_WIDTH) { // - in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + in_block_pos % IN_BLOCK_WIDTH]; + in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + (in_block_pos % IN_BLOCK_WIDTH) * INPUT0_X_PITCH]; // If we have row break, move to the next row. 
if (in_block_next_x_pos == IN_BLOCK_WIDTH) @@ -132,11 +132,11 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)( const uint sg_br_pos = IN_BLOCK_WIDTH - in_block_pos % IN_BLOCK_WIDTH; if (lid < sg_br_pos) - in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + in_block_pos % IN_BLOCK_WIDTH]; + in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + (in_block_pos % IN_BLOCK_WIDTH) * INPUT0_X_PITCH]; // We have row break inside sub-group. Need to move to next line. tmp_in_addr += INPUT0_Y_PITCH; if (lid >= sg_br_pos) - in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr - sg_br_pos]; + in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr - (sg_br_pos * INPUT0_X_PITCH)]; // If we have another row break, move to the next row. if (in_block_next_x_pos == 2 * IN_BLOCK_WIDTH) @@ -211,17 +211,51 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)( } } + +//-------------------------------------------------------------------- +// output phase +//-------------------------------------------------------------------- + #ifdef LEFTOVERS if (feature_idx < OUTPUT_FEATURE_NUM) #endif for(uint r = 0; r < OUTPUT_BLOCK_HEIGHT; r++) { if(!(or + r >= OUTPUT_SIZE_Y)) { +#if (OUTPUT_SIZE_X % OUTPUT_BLOCK_WIDTH) == 0 // in this case we don't need to check if we're outside of X boundaries + uint out_vstore_offset = 0; + #if (OUT_BLOCK_WIDTH % 8) > 3 + MAKE_VECTOR_TYPE(UNIT_TYPE, 4) tmp = MAKE_VECTOR_TYPE(UNIT_TYPE, 4)( + out[out_vstore_offset + 0 + r * OUTPUT_BLOCK_WIDTH], + out[out_vstore_offset + 1 + r * OUTPUT_BLOCK_WIDTH], + out[out_vstore_offset + 2 + r * OUTPUT_BLOCK_WIDTH], + out[out_vstore_offset + 3 + r * OUTPUT_BLOCK_WIDTH] + ); + + vstore4(tmp, 0, output + out_addr + r * OUTPUT_Y_PITCH + out_vstore_offset * OUTPUT_X_PITCH); + out_vstore_offset += 4; + #endif + + #if (OUT_BLOCK_WIDTH % 4) > 1 + MAKE_VECTOR_TYPE(UNIT_TYPE, 2) tmp2 = MAKE_VECTOR_TYPE(UNIT_TYPE, 2)( + out[out_vstore_offset + 0 + r * OUTPUT_BLOCK_WIDTH], + out[out_vstore_offset + 1 + r * OUTPUT_BLOCK_WIDTH] + ); + + vstore2(tmp2, 0, output + out_addr + r * OUTPUT_Y_PITCH + out_vstore_offset * OUTPUT_X_PITCH); + out_vstore_offset += 2; + #endif + for(uint c = out_vstore_offset; c < OUTPUT_BLOCK_WIDTH; c++) { + // this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer. + output[out_addr + r * OUTPUT_Y_PITCH + c] = out[r * OUTPUT_BLOCK_WIDTH + c]; + } +#else for(uint c = 0; c < OUTPUT_BLOCK_WIDTH; c++) { // this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer. if(!(oc + c >= OUTPUT_SIZE_X)) output[out_addr + r * OUTPUT_Y_PITCH + c] = out[r * OUTPUT_BLOCK_WIDTH + c]; } +#endif } } } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16_2_sg.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16_2_sg.cl new file mode 100644 index 0000000..a7566fd --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16_2_sg.cl @@ -0,0 +1,254 @@ +// Copyright (c) 2016-2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/common.cl"
+#include "include/data_types.cl"
+
+#define SIMD_SIZE SUB_GROUP_SIZE
+// ---------------------------------------------------------------------------------------------------------------------
+// Just-in-time macro definitions:
+// ---------------------------------------------------------------------------------------------------------------------
+
+// Required JIT constants:
+//  - INPUT           - [tensor] Input dimensions (batch, spatial and feature).
+//  - OUTPUT          - [tensor] Output dimensions (batch, spatial and feature).
+//  - STRIDE          - [tensor] Stride (only spatial). Factors that describe the step size in the X or Y dimension of
+//                      the input position at which the convolution filter is applied when the next output value
+//                      (step 1 in the X or Y dimension of the output) is computed.
+//  - INPUT0_OFFSET   - [tensor] Initial offset of the input position at which the convolution filter is applied,
+//                      and of the output position.
+//  - FP16_SUPPORTED  - [0/1] Value indicating whether the device supports the FP16 OpenCL extension (cl_khr_fp16).
+//  - FP16_UNIT_USED  - [0/1] Value indicating that the current kernel should use FP16.
+//  - UNIT_TYPE       - Type of unit of input/output/weight/bias.
+//  - UNIT_VAL_ZERO   - Literal of current UNIT_TYPE that represents 0.
+//  - RELU            - [0/1] Indicates that the ReLU activation function should be used on output.
+//  - NEGATIVE_SLOPE  - [float] Factor for negative output values (required when ReLU is specified).
+//
+//  - SUB_GROUP_SIZE  - [int] Size of used subgroup (SIMD).
+//  - LEFTOVERS       - [int] Optional parameter, required only when the number of ofm is not divisible by
+//                      SUB_GROUP_SIZE; see the comment for FEATURES_THREADS_PER_BATCH for more information.
+
+/*
+gpu::make_jit_constant("OUTPUT_LIMIT", output_size),
+gpu::make_jit_constant("FILTER", filter_mem.argument().size),
+gpu::make_jit_constant("FILTER_ARRAY_NUM", split),
+gpu::make_jit_constant("OUTPUT_BLOCK_WIDTH", _kernel_data.block_width));
+gpu::make_jit_constant("OUTPUT_BLOCK_HEIGHT", _kernel_data.block_height));
+gpu::make_jit_constant("IN_BLOCK_ARRAY_SIZE", _kernel_data.input_block_array_size));
+gpu::make_jit_constant("IN_BLOCK_WIDTH", _kernel_data.input_block_width));
+gpu::make_jit_constant("PREFETCH", _kernel_data.prefetch));
+if (_kernel_data.leftovers)
+    gpu::make_jit_constant("LEFTOVERS", _kernel_data.leftovers));
+*/
+
+// FEATURES_THREADS_PER_BATCH defines how many threads in the z-dimension process a single batch.
+// Ideally, a z-dimension value of n should indicate processing of the n-th output feature. However, since
+// threads are stacked in groups of SUB_GROUP_SIZE, when the number of ofm is not divisible by SUB_GROUP_SIZE,
+// dummy threads are added in the z-dimension, LEFTOVERS in count. We need to take them into consideration
+// while calculating the batch's id (see lines 86-87). Values calculated by dummy threads are discarded at line 210.
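+// A worked example of the mapping above (the numbers are illustrative, not
+// defaults of this kernel): with FILTER_OFM_NUM = 70, SUB_GROUP_SIZE = 16 and
+// LEFTOVERS = 10, FEATURES_THREADS_PER_BATCH evaluates to 80. A thread with
+// fm = 150 then resolves to batch_idx = 150 / 80 = 1 and
+// feature_idx = 150 % 80 = 70; since feature_idx >= FILTER_OFM_NUM, it is one
+// of the dummy threads whose result is discarded on write-out.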
+#ifdef LEFTOVERS
+#define FEATURES_THREADS_PER_BATCH (FILTER_OFM_NUM + LEFTOVERS)
+#else
+#define FEATURES_THREADS_PER_BATCH (FILTER_OFM_NUM)
+#endif
+
+__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
+__attribute__((reqd_work_group_size(1, 1, 2*SUB_GROUP_SIZE)))
+KERNEL(convolution_gpu_bfyx_os_iyx_osv16_2_sg)(
+    const __global UNIT_TYPE* input,
+    __global UNIT_TYPE* output,
+    const __global UNIT_TYPE* weights,
+#if BIAS_TERM
+    const __global UNIT_TYPE* bias,
+#endif
+    uint split_idx) // TODO: removing this parameter causes a performance degradation... :)
+{
+    const uint oc = (uint)get_global_id(0) * OUTPUT_BLOCK_WIDTH;  // oc = Output Column
+    const uint or = (uint)get_global_id(1) * OUTPUT_BLOCK_HEIGHT; // or = Output Row
+    const uint fm = get_group_id(2) * SUB_GROUP_SIZE + get_sub_group_local_id(); //get_global_id(2);  // fm = Feature Map = od = Output Depth
+    const uint lid = get_sub_group_local_id();
+
+    const uint ifm_part = get_sub_group_id();
+    __local float slm_vals[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT * SIMD_SIZE];
+
+    uint batch_idx = fm / FEATURES_THREADS_PER_BATCH;
+    uint feature_idx = fm % FEATURES_THREADS_PER_BATCH;
+    uint fmg = feature_idx / SUB_GROUP_SIZE;
+
+    UNIT_TYPE in[IN_BLOCK_ARRAY_SIZE];
+    UNIT_TYPE out[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT];
+    UNIT_TYPE w[PREFETCH];
+    uint in_addr;
+    uint weight_addr = fmg * FILTER_IFM_NUM * FILTER_SIZE_X * FILTER_SIZE_Y * SUB_GROUP_SIZE + lid;
+    weight_addr += ifm_part * SUB_GROUP_SIZE * FILTER_IFM_NUM/2 * FILTER_SIZE_X * FILTER_SIZE_Y;
+
+    for(int i = 0; i < (OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT); i++) {
+        out[i] = UNIT_VAL_ZERO;
+    }
+
+    uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;
+    in_addr = batch_idx * INPUT0_BATCH_PITCH;
+    in_addr += in_split_offset + INPUT0_OFFSET_WITH_PADDING + or * STRIDE_SIZE_Y * INPUT0_Y_PITCH + oc * STRIDE_SIZE_X + lid;
+    in_addr += ifm_part * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM/2;
+
+    for(int kd = 0; kd < FILTER_IFM_NUM/2; kd++) // _ID = 3, RGB
+    {
+        uint tmp_in_addr = in_addr;
+
+#if IN_BLOCK_WIDTH % SUB_GROUP_SIZE == 0
+        __attribute__((opencl_unroll_hint(IN_BLOCK_ARRAY_SIZE)))
+        for(uint in_block_pos = 0; in_block_pos < IN_BLOCK_ARRAY_SIZE * SUB_GROUP_SIZE; in_block_pos += SUB_GROUP_SIZE) {
+            // Horizontal position in input block after read.
+            const uint in_block_next_x_pos = in_block_pos % IN_BLOCK_WIDTH + SUB_GROUP_SIZE;
+
+            in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + in_block_pos % IN_BLOCK_WIDTH];
+
+            // If we have row break, move to the next row.
+            if (in_block_next_x_pos == IN_BLOCK_WIDTH)
+                tmp_in_addr += INPUT0_Y_PITCH;
+        }
+#elif (2 * IN_BLOCK_WIDTH) % SUB_GROUP_SIZE == 0
+        __attribute__((opencl_unroll_hint(IN_BLOCK_ARRAY_SIZE)))
+        for(uint in_block_pos = 0; in_block_pos < IN_BLOCK_ARRAY_SIZE * SUB_GROUP_SIZE; in_block_pos += SUB_GROUP_SIZE) {
+            // Horizontal position in input block after read.
+            const uint in_block_next_x_pos = in_block_pos % IN_BLOCK_WIDTH + SUB_GROUP_SIZE;
+
+            if (in_block_next_x_pos <= IN_BLOCK_WIDTH) { //
+                in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + in_block_pos % IN_BLOCK_WIDTH];
+
+                // If we have row break, move to the next row.
+                if (in_block_next_x_pos == IN_BLOCK_WIDTH)
+                    tmp_in_addr += INPUT0_Y_PITCH;
+            }
+            else {
+                // TODO: Generalize this step to relax IN_BLOCK_WIDTH restrictions.
+                // Position in sub-group at which the new row needs to be read.
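+                // Worked example of the row-break handling below (sizes are
+                // illustrative, not this kernel's defaults): with
+                // IN_BLOCK_WIDTH = 24 and SUB_GROUP_SIZE = 16, the read at
+                // in_block_pos = 16 spans a row boundary: sg_br_pos = 24 - 16 = 8,
+                // so lanes 0..7 fetch the last 8 pixels of the current row while
+                // lanes 8..15 fetch the first 8 pixels of the next row
+                // (tmp_in_addr is advanced by INPUT0_Y_PITCH and stepped back by
+                // sg_br_pos).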
+                const uint sg_br_pos = IN_BLOCK_WIDTH - in_block_pos % IN_BLOCK_WIDTH;
+
+                if (lid < sg_br_pos)
+                    in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + in_block_pos % IN_BLOCK_WIDTH];
+                // We have row break inside sub-group. Need to move to next line.
+                tmp_in_addr += INPUT0_Y_PITCH;
+                if (lid >= sg_br_pos)
+                    in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr - sg_br_pos];
+
+                // If we have another row break, move to the next row.
+                if (in_block_next_x_pos == 2 * IN_BLOCK_WIDTH)
+                    tmp_in_addr += INPUT0_Y_PITCH;
+            }
+        }
+#else
+    #error IN_BLOCK_WIDTH must be multiple of SUB_GROUP_SIZE or half of SUB_GROUP_SIZE. Other scenarios are not currently implemented.
+#endif
+
+        //move to next filter
+        in_addr += INPUT0_FEATURE_PITCH;
+
+        for(int pf=0; pf= OUTPUT_SIZE_Y))
+        {
+            for(uint c = 0; c < OUTPUT_BLOCK_WIDTH; c++) {
+                // this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer.
+                if(!(oc + c >= OUTPUT_SIZE_X))
+                    output[out_addr + r * OUTPUT_Y_PITCH + c] = out[r * OUTPUT_BLOCK_WIDTH + c];
+            }
+        }
+    }
+
+}
+
+}
+
+#undef FEATURES_THREADS_PER_BATCH
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_ref.cl
index a36c020..0e8a264 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_ref.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_ref.cl
@@ -52,9 +52,6 @@ KERNEL(convolution)(
 #else
     const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;
 #endif
-    const uint filter_offset = f*FILTER_OFM_PITCH;
-    const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset;
-
     for (uint k = 0; k < FILTER_IFM_NUM; ++k)
     {
         for (uint j = 0; j < FILTER_SIZE_Y ; ++j)
@@ -71,8 +68,18 @@
 
                 if(!zero_x)
                 {
-                    uint input_idx = input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH + k*INPUT0_FEATURE_PITCH;
-                    uint filter_idx = filter_offset + k*FILTER_IFM_PITCH + j*FILTER_Y_PITCH + i*FILTER_X_PITCH;
+                    uint input_idx =
+                        GET_DATA_INDEX(
+                            INPUT0, b, k, input_offset_y, input_offset_x)
+                        + in_split_offset;
+                    uint filter_idx = GET_FILTER_INDEX(FILTER, f, k, j, i);
+#if GROUPED && !DEPTHWISE_SEPARABLE_OPT
+                    filter_idx += split_idx * FILTER_LENGTH;
+#endif
+#ifdef LOCAL_CONVOLUTION
+                    filter_idx += FILTER_SIZE_X * FILTER_SIZE_Y
+                        * (x + OUTPUT_SIZE_X * y);
+#endif
 #if QUANTIZATION_TERM
                     dotProd += (int)input[input_idx] * (int)weights[filter_idx];
 #else
@@ -85,10 +92,15 @@
         }
 
 #if BIAS_TERM
+#if GROUPED && !DEPTHWISE_SEPARABLE_OPT
+    const uint bias_offset = split_idx * BIAS_LENGTH;
+#else
+    const uint bias_offset = 0;
+#endif
 #if BIAS_PER_OUTPUT
-    const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x);
+    const uint bias_index = bias_offset + GET_DATA_INDEX(BIAS, b, f, y, x);
 #elif BIAS_PER_OFM
-    const uint bias_index = f;
+    const uint bias_index = bias_offset + f;
 #endif
 #if QUANTIZATION_TERM
 #if CALIBRATION_TERM
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byx8_f4__fs_bs_yx_bsv4_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byx8_f4__fs_bs_yx_bsv4_fsv32.cl
new file mode 100644
index 0000000..a495e1d
--- /dev/null
+++ 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byx8_f4__fs_bs_yx_bsv4_fsv32.cl @@ -0,0 +1,170 @@ +// Copyright (c) 2016-2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include/common.cl" + +#include "include/data_types.cl" +#include "include/fetch.cl" +#include "include/mmad.cl" + +#define FILTER_IFM_SLICES ((FILTER_IFM_NUM + 3) /4) +#define FILTER_SIZE_X_SLICES ((FILTER_SIZE_X + 7) / 8) + +#define OUT_BLOCK_HEIGHT 4 +#define WEIGHTS_PER_WORKITEM 4 // currently needs to be set to 4, check output stage and float4 on quantizations etc. + +#define SCALE 0.11f + +#ifdef LIGHTWEIGHT_QUANTIZATION + +#define QUANTIZATION \ + out[w] = convert_uchar_sat((float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * SCALE + bias_f[w]); + +#elif NO_QUANTIZATION + +#define QUANTIZATION \ + out[w] = convert_uchar_sat(dotProd[w*OUT_BLOCK_HEIGHT + h][i]); + +#else + +#define QUANTIZATION \ + out[w] = as_uchar( ACTIVATION( convert_char( round( ( (float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * quant_f[w] * I_QF + bias_f[w]) * calib_f[w])), NL_M, NL_N)); + +#endif + +__attribute__((intel_reqd_sub_group_size(8))) +KERNEL(convolution_gpu_byx8_f4_fs_bs_yx_bsv4_fsv32)( + __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + __global FILTER_TYPE* weights, + __global BIAS_TYPE* biases, + __global float* quantizations, +#if CALIBRATION_TERM + __global float* calibrations, +#endif + uint split_idx) +{ + const uint x = get_group_id(1) * 8; + const uint y = get_group_id(2) * OUT_BLOCK_HEIGHT; + + const uint f = (get_group_id(0) * 8 * WEIGHTS_PER_WORKITEM ) % OUTPUT_FEATURE_NUM; + const uint b = (get_group_id(0) * 8 * WEIGHTS_PER_WORKITEM) / OUTPUT_FEATURE_NUM; + + int8 dotProd[OUT_BLOCK_HEIGHT * WEIGHTS_PER_WORKITEM] = { 0 }; + + const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; + const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; + + const uint filter_offset = f*FILTER_OFM_PITCH; + const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET; + + for (uint k = 0; k < FILTER_IFM_SLICES; ++k) + { + __attribute__((opencl_unroll_hint(FILTER_SIZE_Y))) + for (uint j = 0; j < FILTER_SIZE_Y ; ++j) + { + const int input_offset_y = input_y + j * DILATION_SIZE_Y; + + __attribute__((opencl_unroll_hint(FILTER_SIZE_X_SLICES))) + for(uint i = 0; i < FILTER_SIZE_X_SLICES; i++) + { + int8 act_reg[OUT_BLOCK_HEIGHT]; // activations for MMAD + + // preload spatial data + __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) + for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++) + { + uint input_idx = GET_DATA_BYX8_F4_INDEX(INPUT0, b, k * 4, input_offset_y + h * STRIDE_SIZE_Y, input_x + i * 8); + int2 _input_data_01 = as_int2(intel_sub_group_block_read2((__global uint*)(input + input_idx))); + int _input_data_2 = as_int(intel_sub_group_block_read((__global uint*)(input + input_idx + 8 * 8))); + + act_reg[h][0] = _input_data_01[0]; + act_reg[h][1] = intel_sub_group_shuffle_down(_input_data_01[0], _input_data_01[1], STRIDE_SIZE_X * 1); + act_reg[h][2] = 
intel_sub_group_shuffle_down(_input_data_01[0], _input_data_01[1], STRIDE_SIZE_X * 2); + act_reg[h][3] = intel_sub_group_shuffle_down(_input_data_01[0], _input_data_01[1], STRIDE_SIZE_X * 3); + act_reg[h][4] = _input_data_01[1]; + act_reg[h][5] = intel_sub_group_shuffle_down(_input_data_01[1], _input_data_2, STRIDE_SIZE_X * 1); + act_reg[h][6] = intel_sub_group_shuffle_down(_input_data_01[1], _input_data_2, STRIDE_SIZE_X * 2); + act_reg[h][7] = intel_sub_group_shuffle_down(_input_data_01[1], _input_data_2, STRIDE_SIZE_X * 3); + } + + __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) + for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) // iterate over output feature channels for weights + { + uint filter_idx = GET_FILTER_OS_IS_Y_X8_OSV8_ISV4(FILTER, f + w * 8, k * 4, j, i * 8); + int8 _w = as_int8(intel_sub_group_block_read8((__global uint*)(weights + filter_idx))); + + __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) + for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++) + { + // MMAD on 8x WEIGHTS_PER_WORKITEM input channels elements for 8x outputs in WI + dotProd[w*OUT_BLOCK_HEIGHT + h] = MMAD_8x8(act_reg[h], _w, dotProd[w*OUT_BLOCK_HEIGHT + h]); + } + } + } + } + } + +float4 quant_f = as_float4(intel_sub_group_block_read4((__global uint*) (quantizations + f) )); +float4 bias_f = as_float4(intel_sub_group_block_read4((__global uint*) (biases + f) )); +#if CALIBRATION_TERM +float4 calib_f = as_float4(intel_sub_group_block_read4((__global uint*) (calibrations + f) )); +#endif + +__attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) +for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++) +{ + const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f + get_sub_group_local_id(), y + h, x); + + __attribute__((opencl_unroll_hint(8))) + for(uint i = 0; i < 8; i++) + { + + #if WEIGHTS_PER_WORKITEM == 4 + + uchar4 out; + __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) + for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) + { + QUANTIZATION; + } + intel_sub_group_block_write_uc4((__global uchar*)(output + dst_index + 32 * 4 * i), out); + + #else + + __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) + for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) + { + #if CALIBRATION_TERM + dotProd[w*OUT_BLOCK_HEIGHT + h][i] = (UNIT_TYPE)round(((float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * quant_f[w] * I_QF + bias_f[w]) * calib_f[w]); + #else // CALIBRATION_TERM + dotProd[w*OUT_BLOCK_HEIGHT + h][i] = (UNIT_TYPE)round(((float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * quant_f[w] * I_QF + bias_f[w]) * O_QF); + #endif // CALIBRATION_TERM + output[dst_index + 32 * 4 * i + 8 * w] = ACTIVATION(convert_char(dotProd[w*OUT_BLOCK_HEIGHT + h][i]), NL_M, NL_N); + } + + #endif + } +} + +} + +#undef OUT_BLOCK_HEIGHT +#undef WEIGHTS_PER_WORKITEM + +#undef FILTER_SIZE_X_SLICES +#undef FILTER_IFM_SLICES + +#undef SCALE +#undef QUANTIZATION \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_fs_bs_yx_bsv4_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_fs_bs_yx_bsv4_fsv32.cl new file mode 100644 index 0000000..a240d4b --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_fs_bs_yx_bsv4_fsv32.cl @@ -0,0 +1,105 @@ +// Copyright (c) 2016-2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include/include_all.cl" + +#define OBS 8 +__attribute__((intel_reqd_sub_group_size(8))) +KERNEL(convolution)( + __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + __global FILTER_TYPE* weights, +#if BIAS_TERM + __global BIAS_TYPE* biases, +#endif +#if QUANTIZATION_TERM + __global float* quantizations, +#endif +#if CALIBRATION_TERM + __global float* calibrations, +#endif + uint split_idx) +{ + const uint f_pack = (get_group_id(0) * 32) % OUTPUT_FEATURE_NUM; + const uint b = (get_group_id(0) * 32) / OUTPUT_FEATURE_NUM; + + const uint x = get_group_id(1) * OBS; + const uint y = get_group_id(2); + + int4 dotProd[OBS] = { 0 }; + + const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; + const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; + + const uint filter_offset = f_pack*FILTER_OFM_PITCH; + const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET; + + for (uint j = 0; j < FILTER_SIZE_Y ; ++j) + { + const int input_offset_y = input_y + j; + for (uint i = 0; i < FILTER_SIZE_X ; ++i) + { + const int input_offset_x = input_x + i + STRIDE_SIZE_X * get_sub_group_local_id(); + uint input_idx = input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH; + uint filter_idx = filter_offset + j*FILTER_Y_PITCH + i*FILTER_X_PITCH; + + char input_data[3]; + char2 _i = vload2(0, input + input_idx); + input_data[0] = _i.s0; + input_data[1] = _i.s1; + input_data[2] = input[input_idx + 2]; + + for (uint k = 0; k < FILTER_IFM_NUM; ++k) + { + char4 w_data = as_char4(intel_sub_group_block_read((const __global uint*)(weights + filter_idx))); + for(uint r = 0; r < OBS; r++) + { + char in = intel_sub_group_shuffle(input_data[k], r); + for(uint c = 0; c < 4; c++) + { + dotProd[r][c] += (int)in * (int)w_data[c]; + } + } + filter_idx += FILTER_IFM_PITCH; + } + } + } + + +const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f_pack, y, x + get_sub_group_local_id()); +const uint _f_idx = f_pack + get_sub_group_local_id() * 4; +float4 quants = vload4(0, quantizations + _f_idx ); +float4 calibs = vload4(0, calibrations + _f_idx ); +float4 bias = vload4(0, biases + _f_idx ); +for(uint r = 0; r < OBS; r++) +{ + char4 char_output; + for(uint c = 0; c < 4; c++) + { + const uint f_idx = f_pack + get_sub_group_local_id() * 4 + c; + #if BIAS_TERM + const uint bias_index = f_idx; + #if CALIBRATION_TERM + dotProd[r][c] = (UNIT_TYPE)round(((float)dotProd[r][c] * quants[c] * I_QF + bias[c]) * calibs[c]); + #else // CALIBRATION_TERM + dotProd[r][c] = (UNIT_TYPE)round(((float)dotProd[r][c] * quants[c] * I_QF + bias[c]) * O_QF); + #endif // CALIBRATION_TERM + #endif + char_output[c] = ACTIVATION(convert_char(dotProd[r][c]), NL_M, NL_N); + } + const uint out_idx = intel_sub_group_shuffle(dst_index, r); + intel_sub_group_block_write( (__global uint*)(output + out_idx) , as_uint(char_output)); +} + +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_imad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_imad.cl new file mode 100644 index 0000000..0fa75dd --- 
/dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_imad.cl @@ -0,0 +1,202 @@ +// Copyright (c) 2018-2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "include/common.cl" +#include "include/fetch.cl" +#include "include/data_types.cl" +#include "include/imad.cl" + +#ifndef NON_BLOCK_LOAD +// block loads for inputs and weights should be fastest, but compiler seems +// to do better with a mix, regular loads for inputs and block loads for weights. +#define BLOCK_LOAD_WEIGHTS +#endif +// Input reading operation is always blocked. +#define BLOCK_LOAD_INPUTS + +// for now kernel stride is square +#define K_WSTRIDE K_STRIDE +#define K_HSTRIDE K_STRIDE + +// need KERNEL width for first output + STRIDE more for each additional. +#define IN_BLOCK_WIDTH (K_WIDTH + K_WSTRIDE * (OUT_BLOCK_WIDTH - 1)) +#define IN_BLOCK_HEIGHT (K_HEIGHT + K_HSTRIDE * (OUT_BLOCK_HEIGHT - 1)) + +// for imad we are packing 4 8bit activations per 32 bit SIMD lane +// if we later add 4bit, then PACK would be 8. +#define PACK 4 + +__attribute__((intel_reqd_sub_group_size(SIMD_SIZE))) +KERNEL (convolution_gpu_imad)( + __global uint *inputs, + __global OUTPUT_TYPE *outputs, + __global int *weights +#if BIAS_TERM + ,__global BIAS_TYPE *biases +#endif +#if QUANTIZATION_TERM + ,__global float *quantizations +#endif +#if CALIBRATION_TERM + ,__global float *calibrations +#endif +) +{ + const uint oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column + const uint or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row + const uint fm = get_global_id(2); // fm = Feature Map = od = Output Depth, SIMD is across this dimension, WG is 1x1x16 + const uint fmg = get_group_id(2); + const uint lid = get_local_id(2); + const uint batch = fm / _OD; + + uint in[IN_BLOCK_HEIGHT]; + int out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0 }; // this is the 32 bit signed accumulator that must be converted to 8 bits before final write. + + #define NUM_FILTERS (K_HEIGHT * K_WIDTH) + int w[NUM_FILTERS]; + + int in_addr; + +#ifdef BLOCK_LOAD_WEIGHTS + int weight_addr = (fmg % (_OD / SIMD_SIZE)) * ((_ID * K_HEIGHT * K_WIDTH * SIMD_SIZE) / PACK); +#else + int weight_addr = (fmg % (_OD / SIMD_SIZE)) * ((_ID * K_HEIGHT * K_WIDTH * SIMD_SIZE) / PACK) + lid; +#endif + + uint input_size = (_ID * (_IH + IHPAD) * (_IW + IWPAD)) / PACK; // dividing by PACK to get right number of 32bit entities. + + __attribute__((opencl_unroll_hint(1))) + for(int kd = 0; kd < (_ID / PACK); kd++) // For imad we do 4X less input feature map iterations since we are packing 4 of them in each uchar4. For now assume _ID is multiple of packing factor. 
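+    // Illustrative note (an assumption for clarity, not part of the original
+    // source): this loop makes _ID / PACK iterations because each __global
+    // uint lane already carries PACK = 4 packed int8 activations, e.g.
+    //     uint px = inputs[idx];     // one lane: 4 input feature map values
+    //     uchar4 a = as_uchar4(px);  // a.s0..a.s3 = ifm 0..3
+    // so a single IMAD in the inner loop consumes all four at once:
+    //     acc = IMAD(acc, as_uchar4(px), as_char4(w[wi])); // acc += dot(u8x4, s8x4)
+    // (px, a, acc and idx are hypothetical names used only in this comment.)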
+    {
+
+#ifdef BLOCK_LOAD_INPUTS
+        in_addr = kd * (_IH + IHPAD) * (_IW + IWPAD) + (or * K_STRIDE) * (_IW + IWPAD) + (oc * K_STRIDE);
+#else
+        in_addr = kd * (_IH + IHPAD) * (_IW + IWPAD) + (or * K_STRIDE) * (_IW + IWPAD) + (oc * K_STRIDE) + lid;
+#endif
+        in_addr += batch * input_size; // adjust for batching
+
+        for(uint reg = 0; reg < IN_BLOCK_HEIGHT; reg++) {
+#ifdef BLOCK_LOAD_INPUTS
+            in[reg] = intel_sub_group_block_read((const __global uint*) &inputs[in_addr]);
+#else
+            in[reg] = inputs[in_addr]; // read SIMD_SIZE elements wide
+#endif
+            in_addr += (_IW + IWPAD); // move to next row down
+        }
+
+#ifdef BLOCK_LOAD_WEIGHTS
+        *((int8*)&w[0]) = as_int8(intel_sub_group_block_read8((const __global uint*) &weights[weight_addr]));
+        w[8] = as_int(intel_sub_group_block_read((const __global uint*) &weights[weight_addr + (SIMD_SIZE<<3)]));
+        weight_addr += SIMD_SIZE*NUM_FILTERS;
+#else
+        for(int pf=0; pf < NUM_FILTERS; pf++) {
+            w[pf] = weights[weight_addr];
+            weight_addr += SIMD_SIZE;
+        }
+#endif
+
+        int wi = 0;
+        int kr = 0; // kr = Kernel Row
+        LOOP(K_HEIGHT, kr,
+        {
+            int kc = 0; // kc = Kernel Column
+            LOOP(K_WIDTH, kc,
+            {
+                for (int br = 0; br < OUT_BLOCK_HEIGHT; br++) {
+                    for (int bc = 0; bc < OUT_BLOCK_WIDTH; bc++) {
+                        uint input = sub_group_broadcast(in[br * K_HSTRIDE + kr], bc * K_WSTRIDE + kc);
+
+                        out[br * OUT_BLOCK_WIDTH + bc] =
+#ifdef CONVO_UNSIGNED
+                            IMAD(out[br * OUT_BLOCK_WIDTH + bc], as_uchar4(input), as_char4(w[wi]));
+#else
+                            IMAD(out[br * OUT_BLOCK_WIDTH + bc], as_char4(input), as_char4(w[wi]));
+#endif
+                    }
+                }
+                wi++;
+            });
+        });
+    } //for kd
+
+    // Feature maps are an array of slices, each H,W position within the slice contains
+    // four 8bit feature maps, packed like RGBA components into a 32 bit pixel.
+    int row_size_bytes = (_OW + OWPAD) * PACK;
+
+    // Slice_pack is a pack of 4 feature map tiles that are [OH][OW][4]
+    // that are stored within the full [N][C/4][H][W][4] output.
+    int slice_pack_size_bytes = row_size_bytes * (_OH + OHPAD);
+
+    // Dividing the feature map index by 4 gives us the slice_pack_index in each lane
+    // (each lane within block of 4 will have same index).
+    int slice_pack_index = fm / PACK;
+
+    // Each group of 4 simd lanes points to start of its slice pack.
+    int slice_pack_start_addr_bytes = slice_pack_index * slice_pack_size_bytes;
+
+    // Make each lane within the group of 4 (PACK) simd lanes point to an individual byte
+    // within the uchar4 at start of slice pack.
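+    // Worked example of the layout math above (illustrative sizes, not
+    // defaults of this kernel): with _OW = _OH = 56 and no padding,
+    // row_size_bytes = 56 * 4 = 224 and slice_pack_size_bytes = 224 * 56 = 12544.
+    // A work-item with fm = 9 writes into slice pack 9 / 4 = 2, and lid % PACK
+    // selects which of the four packed int8 feature bytes inside each uchar4
+    // it owns.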
+ int slice_pack_addr_bytes = slice_pack_start_addr_bytes + (lid % PACK); + + // Adjust to particular tile that we are working on + slice_pack_addr_bytes += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * row_size_bytes + + (oc + OUTPUT_PAD_BEFORE_SIZE_X) * PACK; + + for (int r = 0; r < OUT_BLOCK_HEIGHT; r++) { + for (int c = 0; c < OUT_BLOCK_WIDTH; c++) { + uint out_idx = slice_pack_addr_bytes + r * row_size_bytes + (c*PACK); +#if QUANTIZATION_TERM + int dotProd = out[r * OUT_BLOCK_WIDTH + c]; +#else + UNIT_TYPE dotProd = out[r * OUT_BLOCK_WIDTH + c]; +#endif + +#if BIAS_TERM + const uint f = fm % _OD; + #if BIAS_PER_OUTPUT + #error convolution_gpu_imad.cl: BIAS_PER_OUTPUT - not supported + #elif BIAS_PER_OFM + const uint bias_index = f; + #endif + + #if QUANTIZATION_TERM + #if CALIBRATION_TERM + + dotProd = (UNIT_TYPE)round( ((float)dotProd * quantizations[f] * I_QF + biases[bias_index]) + * calibrations[f] ); + #else + dotProd = (UNIT_TYPE)round( ((float)dotProd * quantizations[f] * I_QF + biases[bias_index]) + * O_QF ); + #endif // CALIBRATION_TERM + #else + dotProd += (UNIT_TYPE)biases[bias_index]; + #endif // QUANTIZATION_TERM +#endif // BIAS_TERM + +#if QUANTIZATION_TERM + UNIT_TYPE dotProd_A = ACTIVATION(convert_char(dotProd), NL_M, NL_N); +#else + UNIT_TYPE dotProd_A = ACTIVATION(dotProd, NL_M, NL_N); +#endif + +#ifdef CONVO_UNSIGNED + outputs[out_idx] = (uchar)( max((int)dotProd_A , 0) & 0xFF ); +#else + outputs[out_idx] = (uchar)dotProd_A & 0xFF; +#endif + } // for (int c = 0; c < OUT_BLOCK_WIDTH; c++) + } // for (int r = 0; r < OUT_BLOCK_HEIGHT; r++) +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_128x128wg_slm_int8.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_128x128wg_slm_int8.cl new file mode 100644 index 0000000..381f198 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_128x128wg_slm_int8.cl @@ -0,0 +1,396 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
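+// Note on the requantization arithmetic used by the QUANTIZATION macro below
+// (the numbers are illustrative, not taken from the source): an int32
+// accumulator value of 1234 with quant_f = 0.05f, I_QF = 1.0f, bias_f = 2.0f
+// and calib_f = 0.5f maps to round((1234 * 0.05 * 1.0 + 2.0) * 0.5) = 32
+// before the char conversion and the ACTIVATION clamp are applied.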
+ +#include "include/mmad.cl" + +#define SCALE 0.11f + +#ifdef LIGHTWEIGHT_QUANTIZATION + +#define QUANTIZATION(idx) \ + {\ + for(uint z = 0; z < 4; z++)\ + {\ + regC_uchar16[z * 4 + 0] = convert_uchar_sat( (regC[0 * 4 + i][idx + z / 4]) * SCALE + bias_f.s0);\ + regC_uchar16[z * 4 + 1] = convert_uchar_sat( (regC[1 * 4 + i][idx + z / 4]) * SCALE + bias_f.s1);\ + regC_uchar16[z * 4 + 2] = convert_uchar_sat( (regC[2 * 4 + i][idx + z / 4]) * SCALE + bias_f.s2);\ + regC_uchar16[z * 4 + 3] = convert_uchar_sat( (regC[3 * 4 + i][idx + z / 4]) * SCALE + bias_f.s3);\ + }\ + } + +#elif NO_QUANTIZATION + +#define QUANTIZATION(idx) \ + regC_uchar16.s0 = convert_uchar_sat(regC[0 * 4 + i][idx]);\ + regC_uchar16.s1 = convert_uchar_sat(regC[1 * 4 + i][idx]);\ + regC_uchar16.s2 = convert_uchar_sat(regC[2 * 4 + i][idx]);\ + regC_uchar16.s3 = convert_uchar_sat(regC[3 * 4 + i][idx]);\ + \ + regC_uchar16.s4 = convert_uchar_sat(regC[0 * 4 + i][idx+1]);\ + regC_uchar16.s5 = convert_uchar_sat(regC[1 * 4 + i][idx+1]);\ + regC_uchar16.s6 = convert_uchar_sat(regC[2 * 4 + i][idx+1]);\ + regC_uchar16.s7 = convert_uchar_sat(regC[3 * 4 + i][idx+1]);\ + \ + regC_uchar16.s8 = convert_uchar_sat(regC[0 * 4 + i][idx+2]);\ + regC_uchar16.s9 = convert_uchar_sat(regC[1 * 4 + i][idx+2]);\ + regC_uchar16.sa = convert_uchar_sat(regC[2 * 4 + i][idx+2]);\ + regC_uchar16.sb = convert_uchar_sat(regC[3 * 4 + i][idx+2]);\ + \ + regC_uchar16.sc = convert_uchar_sat(regC[0 * 4 + i][idx+3]);\ + regC_uchar16.sd = convert_uchar_sat(regC[1 * 4 + i][idx+3]);\ + regC_uchar16.se = convert_uchar_sat(regC[2 * 4 + i][idx+3]);\ + regC_uchar16.sf = convert_uchar_sat(regC[3 * 4 + i][idx+3]); + +#else + +#define QUANTIZATION(idx) \ + regC_uchar16.s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\ + regC_uchar16.s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\ + regC_uchar16.s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\ + regC_uchar16.s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));\ + \ + regC_uchar16.s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+1]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\ + regC_uchar16.s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+1]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\ + regC_uchar16.s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+1]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\ + regC_uchar16.s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+1]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));\ + \ + regC_uchar16.s8 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+2]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\ + regC_uchar16.s9 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+2]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\ + regC_uchar16.sa = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+2]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\ + regC_uchar16.sb = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+2]) * quant_f.s3 * I_QF + 
bias_f.s3) * calib_f.s3)), NL_M, NL_N));\ + \ + regC_uchar16.sc = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+3]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\ + regC_uchar16.sd = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+3]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\ + regC_uchar16.se = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+3]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\ + regC_uchar16.sf = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+3]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); + +#endif + + +inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset) +{ +#if OUT_WITH_PADDING == 1 + uint tmp_idx = cOffset; + uint f_val_idx = tmp_idx % 32; + tmp_idx /= 32; + uint b_val_idx = tmp_idx % 4; + tmp_idx /= 4; + uint x_idx = tmp_idx % OUTPUT_SIZE_X; + tmp_idx /= OUTPUT_SIZE_X; + uint y_idx = tmp_idx % OUTPUT_SIZE_Y; + tmp_idx /= OUTPUT_SIZE_Y; + uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); + tmp_idx /= (OUTPUT_BATCH_NUM / 4); + uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); + + uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH; + padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH; + padded_offset += y_idx * OUT_Y_PITCH; + padded_offset += x_idx * OUT_X_PITCH; + padded_offset += b_val_idx * 32; + padded_offset += f_val_idx; + padded_offset += OUT_OFFSET; + + return padded_offset; +#else + return cOffset; +#endif +} + +inline void FUNC(mmad_32x32_int8)( __local uint* l_tileA, const uint l_offsetTileA, + __local int8* l_tileB, const uint l_offsetTileB_col0, + const uint l_offsetTileB_col1, const uint l_offsetTileB_col2, + const uint l_offsetTileB_col3, int8* rowA, int8* colB, + int8* regC) +{ + // Read tile A from SLM to regA + uint l_offsetTileATemp = l_offsetTileA; + __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp])); + l_offsetTileATemp += 8 * SG_SIZE; + } + // Read tile B from SLM to regB and compute mmad + colB[0] = l_tileB[l_offsetTileB_col0]; + colB[1] = l_tileB[l_offsetTileB_col1]; + __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + // Compute partial C + regC[0*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[0], regC[0*(SIMD_LANE_M / 8) + j]); + } + colB[0] = l_tileB[l_offsetTileB_col2]; + __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + // Compute partial C + regC[1*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[1], regC[1*(SIMD_LANE_M / 8) + j] ); + } + colB[1] = l_tileB[l_offsetTileB_col3]; + __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + // Compute partial C + regC[2*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2*(SIMD_LANE_M / 8) + j]); + } + __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + // Compute partial C + regC[3*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3*(SIMD_LANE_M / 8) + j]); + } +} + +/* + * \brief GEMM kernel to compute MxN matrix using SLM + * \param g_inA - Input matrix + * \param g_inB - Input matrix + * \param g_outC - Output matrix + */ + +__attribute__((intel_reqd_sub_group_size(SG_SIZE))) +KERNEL(Kernel_GEMM_MMAD8_32x32SG_128x128WG_SLM_INT8) + ( + __global char* const g_inA, + __global int* 
g_outC, + __global char* const g_inB, + #if BIAS_TERM + __global BIAS_TYPE* biases, + #endif + __global float* quantizations, + #if CALIBRATION_TERM + __global float* calibrations, + #endif + uint split_idx + + ) +{ + + __global int4* const g_matrixA = (__global int4*)g_inA; + __global int4* const g_matrixB = (__global int4*)g_inB; + __global int8* g_matrixC = (__global int8*)g_outC; + + // Each work-group works to compute 128x128 tile. + // Each work-group contains 16 sub-groups. + // Each sub-group within the work-group works to compute a 32x32 tile. + // 1) All work-items in WG fill SLM with tileA (128x32) and tileB (32x128). + // 2) Each sub-group works to compute 32x32 tileC (stored in regC). + // Note that each work-item in the sub-group computes a 32x4 chunk of tileC. + // 3) Repeat until tileC is fully computed (while moving tileA and tileB "windows") + __local int8 l_workGroupTileA[2 * (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)]; // [2*128*32/8] = 1024 + __local int8 l_workGroupTileB[2 * (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)]; // [2*128*32/8] = 1024 + + __local uint* l_workGroupTileA_uint = (__local uint*)l_workGroupTileA; + __local int4* l_workGroupTileA_int4 = (__local int4*)l_workGroupTileA; + __local int4* l_workGroupTileB_int4 = (__local int4*)l_workGroupTileB; + + const uint l_groupSize = get_local_size(DIM_X) * get_local_size(DIM_Y); + + const uint l_pingPongOffsetA_uint = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(uint); + const uint l_pingPongOffsetB_int8 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8); + const uint l_pingPongOffsetA_int4 = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int4); + const uint l_pingPongOffsetB_int4 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int4); + + // Thread IDs + const uint g_tidY = get_global_id(DIM_Y); // 0,...,all_wi_inY + const uint g_tidX = get_global_id(DIM_X); // 0,...,all_wi_inX + const uint l_tidX = get_local_id(DIM_X); // 0,...,31 in WG + const uint l_tidY = get_local_id(DIM_Y); // 0,1,2,3 in WG + const uint l_tid = l_tidY * get_local_size(DIM_X) + l_tidX; // 0,1,2,...127 + + // SubGroup IDs + const uint sg_tid = get_sub_group_local_id(); // 0,1,...,8 + const uint sg_global_idX = (uint)(g_tidX / SG_SIZE); //{0}/8 + const uint sg_global_idY = g_tidY; //{0} + + const uint sg_local_idX = (uint)(l_tidX / SG_SIZE); // {0,...,31}/8={0,0,0,0,0...,1,1,1,...,3,3,3} + const uint sg_local_idY = l_tidY; // 0,1,2,3 + const uint sg_local_id = sg_local_idY * get_local_size(DIM_X) / SG_SIZE + sg_local_idX; // get_local_size(DIM_X) / SG_SIZE = 32/8 = 4 + + const uint sub_group_id = get_sub_group_id(); + + + // Registers + int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // Each work-item responsible for 32x4 ints elts // (32/8)*4 + int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item will hold 1/8 of matrixA + int8 colB[2]; // each lane will store 32x4 piece of matrixB + + // SLM indices + const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * sg_local_idY; + const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8); + const uint numElements32x8TileB = numElements32x32TileB / 4; + const uint l_offsetTileB = numElements32x32TileB * sg_local_idX; + const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid; + const uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid; + const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid; + const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid; + + // Global indices + uint 
g_idxA[2]; + uint g_idxB[2]; +#ifdef TILED_GLOBAL_LAYOUT // 32-row major (matrixA) and 32-col major (matrixB) + g_idxA[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * get_group_id(DIM_Y) + l_tid; + g_idxB[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * get_group_id(DIM_X) + l_tid; + g_idxA[1] = g_idxA[0] + l_groupSize; + g_idxB[1] = g_idxB[0] + l_groupSize; +#else // Row (matrixA) and Col (matrixB) major layout + g_idxA[0] = WG_TILE_M * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_Y) + + (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); + g_idxB[0] = WG_TILE_N * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_X) + + (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); + g_idxA[1] = g_idxA[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); + g_idxB[1] = g_idxB[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); +#endif + + // Initial SLM setup + { + l_workGroupTileA_int4[l_tid] = g_matrixA[g_idxA[0]]; + l_workGroupTileB_int4[l_tid] = g_matrixB[g_idxB[0]]; + l_workGroupTileA_int4[l_tid + l_groupSize] = g_matrixA[g_idxA[1]]; + l_workGroupTileB_int4[l_tid + l_groupSize] = g_matrixB[g_idxB[1]]; + +#ifdef TILED_GLOBAL_LAYOUT + g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); + g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); + g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); + g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); +#else + g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); + g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); + g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); + g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); +#endif + + barrier(CLK_LOCAL_MEM_FENCE); + } + + int4 hdcReadValueA[2]; + int4 hdcReadValueB[2]; + + __attribute__((opencl_unroll_hint(1))) + for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++) + { + /* + * SLM setup - HDC read only + */ + // Overlap HDC reads with mmad compute + hdcReadValueA[0] = g_matrixA[g_idxA[0]]; + hdcReadValueB[0] = g_matrixB[g_idxB[0]]; + hdcReadValueA[1] = g_matrixA[g_idxA[1]]; + hdcReadValueB[1] = g_matrixB[g_idxB[1]]; + +#ifdef TILED_GLOBAL_LAYOUT + g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); + g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); + g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); + g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); +#else + g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); + g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); + g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); + g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); +#endif + + /* + * mmad compute + */ + FUNC_CALL(mmad_32x32_int8)(&l_workGroupTileA_uint[(k % 2) * l_pingPongOffsetA_uint], + l_offsetTileA, &l_workGroupTileB[(k % 2) * l_pingPongOffsetB_int8], + l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, + l_offsetTileB_col3, rowA, colB, regC); + + /* + * SLM setup - SLM write only + */ + l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid] = hdcReadValueA[0]; + l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid] = hdcReadValueB[0]; + l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid + l_groupSize] = hdcReadValueA[1]; + l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid + l_groupSize] = hdcReadValueB[1]; + + barrier(CLK_LOCAL_MEM_FENCE); + } // main outer loop + + /* + * Last mmad compute iteration (avoids branching in main loop) + */ + + FUNC_CALL(mmad_32x32_int8)( + &l_workGroupTileA_uint[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetA_uint], + l_offsetTileA, + &l_workGroupTileB[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * 
l_pingPongOffsetB_int8], + l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, l_offsetTileB_col3, rowA, colB, + regC); + +#ifdef OUTPUT_TILED_GLOBAL_LAYOUT + // Write out in swizzled manner after quantizing + __global uchar* g_outC_uchar = (__global uchar*)g_outC; + uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) + + sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar)); + + uchar16 regC_uchar16; + uint offset_uc16 = 0; + + const uint workgroup_id_x = get_group_id(0); + uint feature_off = 32*(sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N*workgroup_id_x; //=32*{0,1,2,3} + WG_TILE_N * workgroup_id_x + uint feature = get_sub_group_local_id()*4 + feature_off; + + float4 quant_f = vload4(0, quantizations + feature); + float4 bias_f = vload4(0, biases + feature); + float4 calib_f = vload4(0, calibrations + feature); + +#if MMAD_SUPPORTED == 1 + __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) +#endif + for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++) + { + uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); + { + // B0..3, F0..31 + QUANTIZATION(0); + } + + intel_sub_group_block_write4((__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16)); + cOffset += sizeof(uchar16) * SG_SIZE; + + // now we need to calculate again for other x + padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); + { + // B0..3, F0..31 + QUANTIZATION(4); + } + + intel_sub_group_block_write4( (__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16) ); + cOffset += sizeof(uchar16) * SG_SIZE; + } +#else + // Write final accumulated values + uint cOffset = sg_global_idX * ((MATRIX_M / 8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) + + sg_tid * (MATRIX_M / 8); + __attribute__((opencl_unroll_hint(SIMD_LANE_N))) + for (uint i = 0; i < (SIMD_LANE_N); ++i) + { + __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8))) + for (uint j = 0; j < (SIMD_LANE_M / 8); ++j) + { + g_matrixC[cOffset + j] = regC[i*(SIMD_LANE_M / 8) + j]; + } + cOffset += SG_SIZE * (MATRIX_M / 8); + } +#endif + +} + +#undef QUANTIZATION +#undef SCALE \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_224x128wg_slm_int8.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_224x128wg_slm_int8.cl new file mode 100644 index 0000000..94a38d7 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_224x128wg_slm_int8.cl @@ -0,0 +1,389 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
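+// Note on the SLM scheme used below (a sketch inferred from the code, not a
+// statement from the original authors): tileA and tileB are double buffered,
+// so iteration k of the main loop computes from the (k % 2) half of SLM while
+// the next tiles are prefetched from global memory and written into the
+// ((k + 1) % 2) half via the l_pingPongOffset* strides, with one
+// barrier(CLK_LOCAL_MEM_FENCE) per iteration:
+//     for (k) { prefetch -> regs; mmad(slm half k%2); slm half (k+1)%2 = regs; barrier; }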
+ +#include "include/mmad.cl" + +#define SCALE 0.11f + +#ifdef LIGHTWEIGHT_QUANTIZATION + +#define QUANTIZATION(idx) \ + {\ + for(uint z = 0; z < 4; z++)\ + {\ + regC_uchar16[z * 4 + 0] = convert_uchar_sat( (regC[0 * 4 + i][idx + z / 4]) * SCALE + bias_f.s0);\ + regC_uchar16[z * 4 + 1] = convert_uchar_sat( (regC[1 * 4 + i][idx + z / 4]) * SCALE + bias_f.s1);\ + regC_uchar16[z * 4 + 2] = convert_uchar_sat( (regC[2 * 4 + i][idx + z / 4]) * SCALE + bias_f.s2);\ + regC_uchar16[z * 4 + 3] = convert_uchar_sat( (regC[3 * 4 + i][idx + z / 4]) * SCALE + bias_f.s3);\ + }\ + } + +#elif NO_QUANTIZATION + +#define QUANTIZATION(idx) \ + regC_uchar16.s0 = convert_uchar_sat(regC[0 * 4 + i][idx]);\ + regC_uchar16.s1 = convert_uchar_sat(regC[1 * 4 + i][idx]);\ + regC_uchar16.s2 = convert_uchar_sat(regC[2 * 4 + i][idx]);\ + regC_uchar16.s3 = convert_uchar_sat(regC[3 * 4 + i][idx]);\ + \ + regC_uchar16.s4 = convert_uchar_sat(regC[0 * 4 + i][idx+1]);\ + regC_uchar16.s5 = convert_uchar_sat(regC[1 * 4 + i][idx+1]);\ + regC_uchar16.s6 = convert_uchar_sat(regC[2 * 4 + i][idx+1]);\ + regC_uchar16.s7 = convert_uchar_sat(regC[3 * 4 + i][idx+1]);\ + \ + regC_uchar16.s8 = convert_uchar_sat(regC[0 * 4 + i][idx+2]);\ + regC_uchar16.s9 = convert_uchar_sat(regC[1 * 4 + i][idx+2]);\ + regC_uchar16.sa = convert_uchar_sat(regC[2 * 4 + i][idx+2]);\ + regC_uchar16.sb = convert_uchar_sat(regC[3 * 4 + i][idx+2]);\ + \ + regC_uchar16.sc = convert_uchar_sat(regC[0 * 4 + i][idx+3]);\ + regC_uchar16.sd = convert_uchar_sat(regC[1 * 4 + i][idx+3]);\ + regC_uchar16.se = convert_uchar_sat(regC[2 * 4 + i][idx+3]);\ + regC_uchar16.sf = convert_uchar_sat(regC[3 * 4 + i][idx+3]); + +#else + +#define QUANTIZATION(idx) \ + regC_uchar16.s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\ + regC_uchar16.s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\ + regC_uchar16.s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\ + regC_uchar16.s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));\ + \ + regC_uchar16.s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+1]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\ + regC_uchar16.s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+1]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\ + regC_uchar16.s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+1]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\ + regC_uchar16.s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+1]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));\ + \ + regC_uchar16.s8 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+2]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\ + regC_uchar16.s9 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+2]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\ + regC_uchar16.sa = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+2]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\ + regC_uchar16.sb = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+2]) * quant_f.s3 * I_QF + 
bias_f.s3) * calib_f.s3)), NL_M, NL_N));\ + \ + regC_uchar16.sc = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+3]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\ + regC_uchar16.sd = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+3]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\ + regC_uchar16.se = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+3]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\ + regC_uchar16.sf = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+3]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); + +#endif + +inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset) +{ +#if OUT_WITH_PADDING == 1 + uint tmp_idx = cOffset; + uint f_val_idx = tmp_idx % 32; + tmp_idx /= 32; + uint b_val_idx = tmp_idx % 4; + tmp_idx /= 4; + uint x_idx = tmp_idx % OUTPUT_SIZE_X; + tmp_idx /= OUTPUT_SIZE_X; + uint y_idx = tmp_idx % OUTPUT_SIZE_Y; + tmp_idx /= OUTPUT_SIZE_Y; + uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); + tmp_idx /= (OUTPUT_BATCH_NUM / 4); + uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); + + uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH; + padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH; + padded_offset += y_idx * OUT_Y_PITCH; + padded_offset += x_idx * OUT_X_PITCH; + padded_offset += b_val_idx * 32; + padded_offset += f_val_idx; + padded_offset += OUT_OFFSET; + + return padded_offset; +#else + return cOffset; +#endif +} + +inline void FUNC(mmad_32x32_int8)( __local uint* l_tileA, const uint l_offsetTileA, + __local int8* l_tileB, const uint l_offsetTileB_col0, + const uint l_offsetTileB_col1, const uint l_offsetTileB_col2, + const uint l_offsetTileB_col3, int8* rowA, int8* colB, + int8* regC) +{ + // Read tile A from SLM to regA + uint l_offsetTileATemp = l_offsetTileA; + __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp])); + l_offsetTileATemp += 8 * SG_SIZE; + } + // Read tile B from SLM to regB and compute mmad + colB[0] = l_tileB[l_offsetTileB_col0]; + colB[1] = l_tileB[l_offsetTileB_col1]; + __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + // Compute partial C + regC[0*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[0], regC[0*(SIMD_LANE_M / 8) + j]); + } + colB[0] = l_tileB[l_offsetTileB_col2]; + __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + // Compute partial C + regC[1*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[1], regC[1*(SIMD_LANE_M / 8) + j] ); + } + colB[1] = l_tileB[l_offsetTileB_col3]; + __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + // Compute partial C + regC[2*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2*(SIMD_LANE_M / 8) + j]); + } + __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + // Compute partial C + regC[3*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3*(SIMD_LANE_M / 8) + j]); + } +} + +/* + * \brief GEMM kernel to compute MxN matrix using SLM + * \param g_inA - Input matrix + * \param g_inB - Input matrix + * \param g_outC - Output matrix + */ + +__attribute__((intel_reqd_sub_group_size(SG_SIZE))) +KERNEL(Kernel_GEMM_MMAD8_32x32SG_224x128WG_SLM_INT8) + (__global char* const g_inA, + __global int* 
g_outC, + __global char* const g_inB, + #if BIAS_TERM + __global BIAS_TYPE* biases, + #endif + __global float* quantizations, + #if CALIBRATION_TERM + __global float* calibrations, + #endif + uint split_idx + + ) +{ + + __global int4* const g_matrixA = (__global int4*)g_inA; + __global int4* const g_matrixB = (__global int4*)g_inB; + __global int8* g_matrixC = (__global int8*)g_outC; + + // Each work-group works to compute 128x128 tile. + // Each work-group contains 16 sub-groups. + // Each sub-group within the work-group works to compute a 32x32 tile. + // 1) All work-items in WG fill SLM with tileA (128x32) and tileB (32x128). + // 2) Each sub-group works to compute 32x32 tileC (stored in regC). + // Note that each work-item in the sub-group computes a 32x4 chunk of tileC. + // 3) Repeat until tileC is fully computed (while moving tileA and tileB "windows") + __local int8 l_workGroupTileA[2 * (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)]; + __local int8 l_workGroupTileB[2 * (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)]; + + __local uint* l_workGroupTileA_uint = (__local uint*)l_workGroupTileA; + __local int4* l_workGroupTileA_int4 = (__local int4*)l_workGroupTileA; + __local int4* l_workGroupTileB_int4 = (__local int4*)l_workGroupTileB; + + const uint l_groupSize = get_local_size(DIM_X) * get_local_size(DIM_Y); + + const uint l_pingPongOffsetA_uint = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(uint); + const uint l_pingPongOffsetB_int8 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8); + const uint l_pingPongOffsetA_int4 = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int4); + const uint l_pingPongOffsetB_int4 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int4); + + // Thread IDs + const uint g_tidY = get_global_id(DIM_Y); + const uint g_tidX = get_global_id(DIM_X); + const uint l_tidX = get_local_id(DIM_X); + const uint l_tidY = get_local_id(DIM_Y); + const uint l_tid = l_tidY * get_local_size(DIM_X) + l_tidX; + + // SubGroup IDs + const uint sg_tid = get_sub_group_local_id(); + const uint sg_global_idX = (uint)(g_tidX / SG_SIZE); + const uint sg_global_idY = g_tidY; + const uint sg_local_idX = (uint)(l_tidX / SG_SIZE); + const uint sg_local_idY = l_tidY; + const uint sg_local_id = sg_local_idY * get_local_size(DIM_X) / SG_SIZE + sg_local_idX; + + const uint sub_group_id = get_sub_group_id(); + + // Registers + int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // Each work-item responsible for 32x4 ints elts + int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item will hold 1/8 of matrixA + int8 colB[2]; // each lane will store 32x4 piece of matrixB + + // SLM indices + const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * sg_local_idY; + const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8); + const uint numElements32x8TileB = numElements32x32TileB / 4; + const uint l_offsetTileB = numElements32x32TileB * sg_local_idX; + const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid; + const uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid; + const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid; + const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid; + + // Global indices + uint g_idxA[2]; + uint g_idxB[2]; +#ifdef TILED_GLOBAL_LAYOUT // 32-row major (matrixA) and 32-col major (matrixB) + g_idxA[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * get_group_id(DIM_Y) + l_tid; + g_idxB[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * get_group_id(DIM_X) + 
l_tid; + g_idxA[1] = g_idxA[0] + l_groupSize; + g_idxB[1] = g_idxB[0] + l_groupSize; +#else // Row (matrixA) and Col (matrixB) major layout + g_idxA[0] = WG_TILE_M * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_Y) + + (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); + g_idxB[0] = WG_TILE_N * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_X) + + (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); + g_idxA[1] = g_idxA[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); + g_idxB[1] = g_idxB[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); +#endif + // Initial SLM setup + { + l_workGroupTileA_int4[l_tid] = g_matrixA[g_idxA[0]]; + l_workGroupTileB_int4[l_tid] = g_matrixB[g_idxB[0]]; + + l_workGroupTileA_int4[l_tid + l_groupSize] = g_matrixA[g_idxA[1]]; + if (l_tid < 32) + { + // Not all work-items will be needed to fetch the remaining matrix B + l_workGroupTileB_int4[l_tid + l_groupSize] = g_matrixB[g_idxB[1]]; + } +#ifdef TILED_GLOBAL_LAYOUT + g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); + g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); + g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); + g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); +#else + g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); + g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); + g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); + g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); +#endif + + barrier(CLK_LOCAL_MEM_FENCE); + } + int4 hdcReadValueA[2]; + int4 hdcReadValueB[2]; + + __attribute__((opencl_unroll_hint(1))) + for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++) + { + hdcReadValueA[0] = g_matrixA[g_idxA[0]]; + hdcReadValueB[0] = g_matrixB[g_idxB[0]]; + hdcReadValueA[1] = g_matrixA[g_idxA[1]]; + if (l_tid < 32) + { + // Not all work-items will be needed to fetch the remaining matrix B + hdcReadValueB[1] = g_matrixB[g_idxB[1]]; + } +#ifdef TILED_GLOBAL_LAYOUT + g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); + g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); + g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); + g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); +#else + g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); + g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); + g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); + g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); +#endif + + + //MMAD compute + FUNC_CALL(mmad_32x32_int8)(&l_workGroupTileA_uint[(k % 2) * l_pingPongOffsetA_uint], + l_offsetTileA, &l_workGroupTileB[(k % 2) * l_pingPongOffsetB_int8], + l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, + l_offsetTileB_col3, rowA, colB, regC); + + //SLM setup - SLM write only + l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid] = hdcReadValueA[0]; + l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid] = hdcReadValueB[0]; + l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid + l_groupSize] = hdcReadValueA[1]; + if (l_tid < 32) + { + // Not all work-items will be needed to fetch the remaining matrix B + l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid + l_groupSize] = hdcReadValueB[1]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } // main outer loop + + //Last MMAD compute iteration (avoids branching in main loop) + FUNC_CALL(mmad_32x32_int8)( + &l_workGroupTileA_uint[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetA_uint], + l_offsetTileA, + &l_workGroupTileB[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetB_int8], + l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, l_offsetTileB_col3, rowA, 
colB, + regC); + + +#ifdef OUTPUT_TILED_GLOBAL_LAYOUT + + // Write out in swizzled manner after quantizing + __global uchar* g_outC_uchar = (__global uchar*)g_outC; + uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) + + sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar)); + + uchar16 regC_uchar16; + uint offset_uc16 = 0; + + const uint workgroup_id_x = get_group_id(0); + uint feature_off = 32*(sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N*workgroup_id_x; //=32*{0,1,2,3} + WG_TILE_N * workgroup_id_x + uint feature = get_sub_group_local_id()*4 + feature_off; + + float4 quant_f = vload4(0, quantizations + feature); + float4 bias_f = vload4(0, biases + feature); + float4 calib_f = vload4(0, calibrations + feature); + +#if MMAD_SUPPORTED == 1 + __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) +#endif + for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++) + { + uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); + { + // B0..3, F0..31 + QUANTIZATION(0); + } + + intel_sub_group_block_write4((__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16)); + cOffset += sizeof(uchar16) * SG_SIZE; + + // now we need to calculate again for other x + padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); + { + // B0..3, F0..31 + QUANTIZATION(4); + } + + intel_sub_group_block_write4( (__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16) ); + cOffset += sizeof(uchar16) * SG_SIZE; + } + +#else + // Write final accumulated values + uint cOffset = sg_global_idX * ((MATRIX_M / 8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) + + sg_tid * (MATRIX_M / 8); + __attribute__((opencl_unroll_hint(SIMD_LANE_N))) + for (uint i = 0; i < (SIMD_LANE_N); ++i) + { + __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8))) + for (uint j = 0; j < (SIMD_LANE_M / 8); ++j) + { + g_matrixC[cOffset + j] = regC[i*(SIMD_LANE_M / 8) + j]; + } + cOffset += SG_SIZE * (MATRIX_M / 8); + } +#endif +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_slm_int8.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_slm_int8.cl new file mode 100644 index 0000000..0a6d731 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_slm_int8.cl @@ -0,0 +1,430 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
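+ +// Indexing sketch: calculate_output_offset_to_account_padding() below decodes the linear +// cOffset as nested [f_slice][b_slice][y][x][b_val<4][f_val<32] coordinates and +// re-linearizes them with the padded OUT_*_PITCH strides plus OUT_OFFSET, so the swizzled +// tile can land in a padded output buffer.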
+ +#include "include/mmad.cl" + +inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset) +{ + uint tmp_idx = cOffset; + uint f_val_idx = tmp_idx % 32; + tmp_idx /= 32; + uint b_val_idx = tmp_idx % 4; + tmp_idx /= 4; + uint x_idx = tmp_idx % OUTPUT_SIZE_X; + tmp_idx /= OUTPUT_SIZE_X; + uint y_idx = tmp_idx % OUTPUT_SIZE_Y; + tmp_idx /= OUTPUT_SIZE_Y; + uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); + tmp_idx /= (OUTPUT_BATCH_NUM / 4); + uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); + + uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH; + padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH; + padded_offset += y_idx * OUT_Y_PITCH; + padded_offset += x_idx * OUT_X_PITCH; + padded_offset += b_val_idx * 32; + padded_offset += f_val_idx; + padded_offset += OUT_OFFSET; + + return padded_offset; +} + +inline void FUNC(mmad_32x32_int8)( __local uint* l_tileA, const uint l_offsetTileA, + __local int8* l_tileB, const uint l_offsetTileB_col0, + const uint l_offsetTileB_col1, const uint l_offsetTileB_col2, + const uint l_offsetTileB_col3, int8* rowA, int8* colB, + int8* regC) +{ + // Read tile A from SLM to regA + uint l_offsetTileATemp = l_offsetTileA; + __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp])); + l_offsetTileATemp += 8 * SG_SIZE; + } + // Read tile B from SLM to regB and compute mmad + colB[0] = l_tileB[l_offsetTileB_col0]; + colB[1] = l_tileB[l_offsetTileB_col1]; + __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + // Compute partial C + regC[0*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[0], regC[0*(SIMD_LANE_M / 8) + j]); + } + colB[0] = l_tileB[l_offsetTileB_col2]; + __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + // Compute partial C + regC[1*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[1], regC[1*(SIMD_LANE_M / 8) + j] ); + } + colB[1] = l_tileB[l_offsetTileB_col3]; + __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + // Compute partial C + regC[2*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2*(SIMD_LANE_M / 8) + j]); + } + __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + // Compute partial C + regC[3*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3*(SIMD_LANE_M / 8) + j]); + } +} + +/* + * \brief GEMM kernel to compute MxN matrix using SLM + * \param g_inA - Input matrix + * \param g_inB - Input matrix + * \param g_outC - Output matrix + */ + +__attribute__((intel_reqd_sub_group_size(SG_SIZE))) +KERNEL(Kernel_GEMM_MMAD8_32x32SG_128x128WG_SLM_INT8) + ( + __global char* const g_inA, + __global int* g_outC, + __global char* const g_inB, + #if BIAS_TERM + __global BIAS_TYPE* biases, + #endif + __global float* quantizations, + #if CALIBRATION_TERM + __global float* calibrations, + #endif + uint split_idx + + ) +{ + + __global int4* const g_matrixA = (__global int4*)g_inA; + __global int4* const g_matrixB = (__global int4*)g_inB; + __global int8* g_matrixC = (__global int8*)g_outC; + + // 1) All work-items in work-group fill SLM with tileA and tileB. + // 2) Each sub-group works to compute a 32x32 tileC (stored in regC). + // Note that each work-item in the sub-group computes a 32x4 chunk of tileC. 
+ // 3) Repeat until tileC is fully computed (while moving tileA and tileB "windows") + __local int8 l_workGroupTileA_0[(WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)]; + __local int8 l_workGroupTileB_0[(WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)]; + __local uint* l_workGroupTileA_uint_0 = (__local uint*)l_workGroupTileA_0; + + __local int8 l_workGroupTileA_1[(WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)]; + __local int8 l_workGroupTileB_1[(WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)]; + __local uint* l_workGroupTileA_uint_1 = (__local uint*)l_workGroupTileA_1; + + __local int8* l_workGroupTileA_live = l_workGroupTileA_0; + __local int8* l_workGroupTileB_live = l_workGroupTileB_0; + __local uint* l_workGroupTileA_live_uint = l_workGroupTileA_uint_0; + + __local int4* l_workGroupTileA_0_int4 = (__local int4*)l_workGroupTileA_0; + __local int4* l_workGroupTileB_0_int4 = (__local int4*)l_workGroupTileB_0; + __local int4* l_workGroupTileA_1_int4 = (__local int4*)l_workGroupTileA_1; + __local int4* l_workGroupTileB_1_int4 = (__local int4*)l_workGroupTileB_1; + + const uint l_groupSize = get_local_size(DIM_X) * get_local_size(DIM_Y); + + // Thread IDs + const uint g_tidY = get_global_id(DIM_Y); + const uint g_tidX = get_global_id(DIM_X); + const uint l_tidX = get_local_id(DIM_X); + const uint l_tidY = get_local_id(DIM_Y); + const uint l_tid = l_tidY * get_local_size(DIM_X) + l_tidX; + + // SubGroup IDs + const uint sg_tid = get_sub_group_local_id(); + const uint sg_global_idX = (uint)(g_tidX / SG_SIZE); + const uint sg_global_idY = g_tidY; + const uint sg_local_idX = (uint)(l_tidX / SG_SIZE); + const uint sg_local_idY = l_tidY; + const uint sg_local_id = sg_local_idY * get_local_size(DIM_X) / SG_SIZE + sg_local_idX; + + const uint sub_group_id = get_sub_group_id(); + + // Registers + int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // Each work-item responsible for 32x4 ints elts + int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item will hold 1/8 of matrixA + int8 colB[2]; // each lane will store 32x4 piece of matrixB + + // SLM indices + const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * sg_local_idY; + const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8); + const uint numElements32x8TileB = numElements32x32TileB / 4; + const uint l_offsetTileB = numElements32x32TileB * sg_local_idX; + const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid; + const uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid; + const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid; + const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid; + + // Global indices +#ifdef TILED_GLOBAL_LAYOUT // 32-row major (matrixA) and 32-col major (matrixB) + uint g_idxA = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * get_group_id(DIM_Y) + l_tid; + uint g_idxB = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * get_group_id(DIM_X) + l_tid; +#else // Row (matrixA) and Col (matrixB) major layout + uint g_idxA = WG_TILE_M * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_Y) + + (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); + uint g_idxB = WG_TILE_N * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_X) + + (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); +#endif + + // Initial SLM setup + { + uint g_idxATemp = g_idxA; + for (uint i = l_tid; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE) + { + l_workGroupTileA_0_int4[i] = g_matrixA[g_idxATemp]; +#ifdef 
TILED_GLOBAL_LAYOUT + g_idxATemp += WG_SIZE; +#else + g_idxATemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4)); +#endif + } + + uint g_idxBTemp = g_idxB; + for (uint i = l_tid; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE) + { + l_workGroupTileB_0_int4[i] = g_matrixB[g_idxBTemp]; +#ifdef TILED_GLOBAL_LAYOUT + g_idxBTemp += WG_SIZE; +#else + g_idxBTemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4)); +#endif + } + +#ifdef TILED_GLOBAL_LAYOUT + g_idxA += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); + g_idxB += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); +#else + g_idxA += MATRIX_SMALL_K / sizeof(int4); + g_idxB += MATRIX_SMALL_K / sizeof(int4); +#endif + + barrier(CLK_LOCAL_MEM_FENCE); + } + + int4 hdcReadValueA[(WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE < 1 + ? 1 + : (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE]; + int4 hdcReadValueB[(WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE < 1 + ? 1 + : (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE]; + + __attribute__((opencl_unroll_hint(1))) + for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++) + { + /* + * SLM setup - HDC read only + */ + +#if ((MATRIX_K / MATRIX_SMALL_K) > 1) + uint g_idxATemp = g_idxA; + for (uint i = l_tid, j = 0; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE, ++j) + { + hdcReadValueA[j] = g_matrixA[g_idxATemp]; +#ifdef TILED_GLOBAL_LAYOUT + g_idxATemp += WG_SIZE; +#else + g_idxATemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4)); +#endif + } + + uint g_idxBTemp = g_idxB; + for (uint i = l_tid, j = 0; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE, ++j) + { + hdcReadValueB[j] = g_matrixB[g_idxBTemp]; +#ifdef TILED_GLOBAL_LAYOUT + g_idxBTemp += WG_SIZE; +#else + g_idxBTemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4)); +#endif + } + +#ifdef TILED_GLOBAL_LAYOUT + g_idxA += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); + g_idxB += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); +#else + g_idxA += MATRIX_SMALL_K / sizeof(int4); + g_idxB += MATRIX_SMALL_K / sizeof(int4); +#endif +#endif + + /* + * MMAD compute + */ + + FUNC_CALL(mmad_32x32_int8)(l_workGroupTileA_live_uint, l_offsetTileA, l_workGroupTileB_live, + l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, + l_offsetTileB_col3, rowA, colB, regC); + + /* + * SLM setup - SLM write only + */ + +#if ((MATRIX_K / MATRIX_SMALL_K) > 1) + if (k % 2 == 0) + { + for (uint i = l_tid, j = 0; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); + i += WG_SIZE, ++j) + { + l_workGroupTileA_1_int4[i] = hdcReadValueA[j]; + } + + for (uint i = l_tid, j = 0; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); + i += WG_SIZE, ++j) + { + l_workGroupTileB_1_int4[i] = hdcReadValueB[j]; + } + + l_workGroupTileA_live = l_workGroupTileA_1; + l_workGroupTileB_live = l_workGroupTileB_1; + l_workGroupTileA_live_uint = l_workGroupTileA_uint_1; + } + else + { + for (uint i = l_tid, j = 0; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); + i += WG_SIZE, ++j) + { + l_workGroupTileA_0_int4[i] = hdcReadValueA[j]; + } + + for (uint i = l_tid, j = 0; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); + i += WG_SIZE, ++j) + { + l_workGroupTileB_0_int4[i] = hdcReadValueB[j]; + } + + l_workGroupTileA_live = l_workGroupTileA_0; + l_workGroupTileB_live = l_workGroupTileB_0; + l_workGroupTileA_live_uint = l_workGroupTileA_uint_0; + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + } + + /* + * Last MMAD compute iteration (avoids branching in main loop) + */ + FUNC_CALL(mmad_32x32_int8)(l_workGroupTileA_live_uint, l_offsetTileA, l_workGroupTileB_live, + 
l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, + l_offsetTileB_col3, rowA, colB, regC); + +#ifdef OUTPUT_TILED_GLOBAL_LAYOUT + // Write out in swizzled manner after quantizing + __global uchar* g_outC_uchar = (__global uchar*)g_outC; + uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) + + sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar)); + + uchar8 regC_uchar8[SIMD_LANE_M * SIMD_LANE_N / (sizeof(uchar8) / sizeof(uchar))]; + uint offset_uc8 = 0; + + const uint workgroup_id_x = get_group_id(0); + uint feature_off = 32*(sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N*workgroup_id_x; //=32*{0,1,2,3} + WG_TILE_N * workgroup_id_x + uint feature = get_sub_group_local_id() + feature_off; + + float4 quant_f = as_float4(intel_sub_group_block_read4((__global uint*) (quantizations + feature) )); + float4 bias_f = as_float4(intel_sub_group_block_read4((__global uint*) (biases + feature) )); + float4 calib_f = as_float4(intel_sub_group_block_read4((__global uint*) (calibrations + feature) )); + +#if MMAD_SUPPORTED == 1 + __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) +#endif + for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++) + { + // begin of account for output PADDING + uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); + // end of account for padding + + // B0 F0..31 + regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s0) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s0) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s0) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s0) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); + // B1 F0..31 + regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s1) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s1) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s1) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s1) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); + + FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]); + cOffset += sizeof(uchar8) * SG_SIZE; + padded_offset += sizeof(uchar8) * SG_SIZE; + offset_uc8++; + + // B2 F0..31 + regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s2) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s2) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s2) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s2) * quant_f.s3 * 
I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); + // B3 F0..31 + regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s3) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s3) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s3) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s3) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); + + FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]); + cOffset += sizeof(uchar8) * SG_SIZE; + offset_uc8++; + + // now we need to calculate again for other x + padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); + // + + regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s4) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s4) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s4) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s4) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); + + regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s5) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s5) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s5) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s5) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); + + FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]); + cOffset += sizeof(uchar8) * SG_SIZE; + padded_offset += sizeof(uchar8) * SG_SIZE; + offset_uc8++; + + regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s6) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s6) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s6) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s6) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); + + regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s7) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s7) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s6 = 
as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s7) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); + regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s7) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); + + FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]); + cOffset += sizeof(uchar8) * SG_SIZE; + offset_uc8++; + } +#else + // Write final accumulated values + uint cOffset = sg_global_idX * ((MATRIX_M / 8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) + + sg_tid * (MATRIX_M / 8); + __attribute__((opencl_unroll_hint(SIMD_LANE_N))) + for (uint i = 0; i < (SIMD_LANE_N); ++i) + { + __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8))) + for (uint j = 0; j < (SIMD_LANE_M / 8); ++j) + { + g_matrixC[cOffset + j] = regC[i*(SIMD_LANE_M / 8) + j]; + } + cOffset += SG_SIZE * (MATRIX_M / 8); + } +#endif + +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block.cl new file mode 100644 index 0000000..0e65059 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block.cl @@ -0,0 +1,194 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
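+ +// NEEDED_INPUT_X below is the input width one output block consumes; for example, with +// OUT_BLOCK_WIDTH = 4, STRIDE_SIZE_X = 1 and FILTER_SIZE_X = 3 it evaluates to +// (4 - 1) * 1 + (3 - 1) + 1 = 6 input positions preloaded per filter row.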
+ +#include "include/fetch.cl" +#include "include/mmad.cl" + +#define SCALE 0.11f + +#ifdef LIGHTWEIGHT_QUANTIZATION + +#define QUANTIZATION \ + uchar4 out;\ + out[0] = convert_uchar_sat((float)dotProd[o + OUT_BLOCK_WIDTH * 0][b] * SCALE + bias_f.s0);\ + out[1] = convert_uchar_sat((float)dotProd[o + OUT_BLOCK_WIDTH * 1][b] * SCALE + bias_f.s1);\ + out[2] = convert_uchar_sat((float)dotProd[o + OUT_BLOCK_WIDTH * 2][b] * SCALE + bias_f.s2);\ + out[3] = convert_uchar_sat((float)dotProd[o + OUT_BLOCK_WIDTH * 3][b] * SCALE + bias_f.s3); + +#elif NO_QUANTIZATION + +#define QUANTIZATION \ + uchar4 out;\ + out[0] = convert_uchar_sat(dotProd[o + OUT_BLOCK_WIDTH * 0][b]);\ + out[1] = convert_uchar_sat(dotProd[o + OUT_BLOCK_WIDTH * 1][b]);\ + out[2] = convert_uchar_sat(dotProd[o + OUT_BLOCK_WIDTH * 2][b]);\ + out[3] = convert_uchar_sat(dotProd[o + OUT_BLOCK_WIDTH * 3][b]); + +#else + +#define QUANTIZATION \ + char4 out;\ + out[0] = ACTIVATION(convert_char(round(((float)dotProd[o + OUT_BLOCK_WIDTH * 0][b] * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N);\ + out[1] = ACTIVATION(convert_char(round(((float)dotProd[o + OUT_BLOCK_WIDTH * 1][b] * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N);\ + out[2] = ACTIVATION(convert_char(round(((float)dotProd[o + OUT_BLOCK_WIDTH * 2][b] * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N);\ + out[3] = ACTIVATION(convert_char(round(((float)dotProd[o + OUT_BLOCK_WIDTH * 3][b] * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N); + +#endif + +#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32) +#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8) +#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32) +#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8) +// input data is in blocks 4batch x 32 features + +#define NEEDED_INPUT_X ((OUT_BLOCK_WIDTH-1) * (STRIDE_SIZE_X) + (FILTER_SIZE_X - 1) + 1) + +__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) +KERNEL(convolution_mmad_batched_block)( + __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + __global FILTER_TYPE* weights, + __global BIAS_TYPE* biases, + const __global float* quantizations, +#if CALIBRATION_TERM + const __global float* calibrations, +#endif + uint split_idx) +{ + const uint x = get_global_id(0) * OUT_BLOCK_WIDTH; + const uint y = get_global_id(1) * OUT_BLOCK_HEIGHT; + +#if WEIGHTS_PER_WORKITEM == 4 + const uint f = (get_group_id(2) * 32 + get_sub_group_local_id() * 4) % FILTER_OFM_ALIGNED; +#else + const uint f = ((get_group_id(2) * WEIGHTS_PER_WORKITEM * 8) + get_sub_group_local_id() ) % FILTER_OFM_ALIGNED; +#endif + const uint b_block = (get_group_id(2) * 8 * WEIGHTS_PER_WORKITEM) / FILTER_OFM_ALIGNED; + + // all accumulators + int4 dotProd[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * WEIGHTS_PER_WORKITEM] = { 0 }; + + const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; + const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; + + const uint filter_offset = ((get_group_id(2) * WEIGHTS_PER_WORKITEM) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH; + const uint input_offset = IN_OFFSET + IN_B_BLOCK_PITCH * b_block; + + uint filter_idx = filter_offset; + __attribute__((opencl_unroll_hint(1))) + for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k) + { + __attribute__((opencl_unroll_hint(FILTER_SIZE_Y))) + for (uint j = 0; j < FILTER_SIZE_Y; ++j) + { + + ////// preloading input data ////// + int4 preloaded_input[NEEDED_INPUT_X]; + for(int p = 0; p < NEEDED_INPUT_X; p++) + { + const int input_offset_y = input_y + j; + const int input_offset_x = input_x + p; 
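+ // blocked-layout indexing: y steps by IN_Y_PITCH, x by IN_X_PITCH, and the k-th + // 32-feature slice by IN_F_BLOCK_PITCH (see input_idx below)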
+ + uint input_idx = input_offset + input_offset_y * IN_Y_PITCH + input_offset_x * IN_X_PITCH + k * IN_F_BLOCK_PITCH; + preloaded_input[p] = as_int4(intel_sub_group_block_read4((const __global uint*)(input + input_idx))); + } + + __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) + for(uint wi = 0; wi < WEIGHTS_PER_WORKITEM; wi++) + { + ////// preloading weights data ////// + int8 preloaded_weights[FILTER_SIZE_X]; + uint tmp_filter_idx = filter_idx; + __attribute__((opencl_unroll_hint(FILTER_SIZE_X))) + for(uint w = 0; w < FILTER_SIZE_X; w++) + { + preloaded_weights[w] = as_int8(intel_sub_group_block_read8((const __global uint*) (weights + tmp_filter_idx + (wi * FILTER_OFM_BLOCK_PITCH)))); + tmp_filter_idx += FILTER_X_PITCH; + } + ////// computing ////// + __attribute__((opencl_unroll_hint(FILTER_SIZE_X))) + for (uint i = 0; i < FILTER_SIZE_X; ++i) + { + __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) + for(uint ox = 0; ox < OUT_BLOCK_WIDTH; ox++) + { + const uint out_idx = ox + wi * OUT_BLOCK_WIDTH; + const uint in_idx = ox * STRIDE_SIZE_X + i; + dotProd[out_idx] = MMAD_4x8(preloaded_input[in_idx], preloaded_weights[i], dotProd[out_idx]); + } + } + } + filter_idx += FILTER_X_PITCH * FILTER_SIZE_X; + } + } + +////// QUANTIZE & OUTPUT ////// + +#if WEIGHTS_PER_WORKITEM == 4 + +float4 quant_f = vload4(0, quantizations + f); +float4 bias_f = vload4(0, biases + f); +float4 calib_f = vload4(0, calibrations + f); + +__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) +for(uint o = 0; o < OUT_BLOCK_WIDTH; o++) +{ + const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f, y, x + o); + uint4 to_output; + __attribute__((opencl_unroll_hint(4))) + for(uint b = 0; b < 4; b++) + { + QUANTIZATION; + to_output[b] = as_uint(out); + } + intel_sub_group_block_write4((__global uint*)(output + dst_index), to_output); +} +#else +__attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) +for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) +{ + float quant_f = quantizations[f + w * 8]; + float bias_f = biases[f + w * 8]; +#if CALIBRATION_TERM + float calib_f = calibrations[f + w * 8]; +#endif + __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) + for(uint o = 0; o < OUT_BLOCK_WIDTH; o++) + { + const uint out_idx = o + OUT_BLOCK_WIDTH * w; + __attribute__((opencl_unroll_hint(4))) + for(uint b = 0; b < 4; b++) + { + #if CALIBRATION_TERM + dotProd[out_idx][b] = (UNIT_TYPE)round(((float)dotProd[out_idx][b] * quant_f * I_QF + bias_f) * calib_f); + #else // CALIBRATION_TERM + dotProd[out_idx][b] = (UNIT_TYPE)round(((float)dotProd[out_idx][b] * quant_f * I_QF + bias_f) * O_QF); + #endif // CALIBRATION_TERM + + const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4 + b, f + w * 8, y, x + o); + output[dst_index] = ACTIVATION(convert_char(dotProd[out_idx][b]), NL_M, NL_N); + } + } +} +#endif + +} + +#undef FILTER_IFM_MMAD_NUM +#undef FILTER_OFM_MMAD_NUM +#undef FILTER_IFM_ALIGNED +#undef FILTER_OFM_ALIGNED + +#undef SCALE +#undef QUANTIZATION \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block_1x1.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block_1x1.cl new file mode 100644 index 0000000..bc58c70 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block_1x1.cl @@ -0,0 +1,241 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, 
Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include/fetch.cl" +#include "include/mmad.cl" + +#define SCALE 0.11f + +#ifdef LIGHTWEIGHT_QUANTIZATION + +#define QUANTIZATION \ + uchar4 out;\ + out[0] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b] * SCALE + bias_f.s0);\ + out[1] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 1][b] * SCALE + bias_f.s1);\ + out[2] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 2][b] * SCALE + bias_f.s2);\ + out[3] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 3][b] * SCALE + bias_f.s3); + +#elif NO_QUANTIZATION + +#define QUANTIZATION \ + uchar4 out;\ + out[0] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b]);\ + out[1] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 1][b]);\ + out[2] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 2][b]);\ + out[3] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 3][b]); + +#else + +#define QUANTIZATION \ + char4 out;\ + out[0] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b] * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0 ) ), NL_M, NL_N);\ + out[1] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 1][b] * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1 ) ), NL_M, NL_N);\ + out[2] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 2][b] * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2 ) ), NL_M, NL_N);\ + out[3] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 3][b] * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3 ) ), NL_M, NL_N); + +#endif + +#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32) +#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8) +#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32) +#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8) +// input data is in blocks 4batch x 32 features + +#define NEEDED_INPUT_X ((OUT_BLOCK_WIDTH-1) * (STRIDE_SIZE_X) + (FILTER_SIZE_X - 1) + 1) +#define NEEDED_INPUT_Y ((OUT_BLOCK_HEIGHT-1) * (STRIDE_SIZE_Y) + (FILTER_SIZE_Y - 1) + 1) + +__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) +KERNEL(convolution_mmad_batched_block_1x1)( + __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + __global FILTER_TYPE* weights, + __global BIAS_TYPE* biases, + const __global float* quantizations, +#if CALIBRATION_TERM + const __global float* calibrations, +#endif + uint split_idx) +{ + const uint x = get_global_id(0) * OUT_BLOCK_WIDTH; + const uint y = get_global_id(1) * OUT_BLOCK_HEIGHT; + +#if WEIGHTS_PER_WORKITEM == 4 + const uint f = (get_group_id(2) * 32 + get_sub_group_local_id() * 4) % FILTER_OFM_ALIGNED; +#else + const uint f = ((get_group_id(2) * WEIGHTS_PER_WORKITEM * 8) + get_sub_group_local_id() ) % FILTER_OFM_ALIGNED; +#endif + const uint 
b_block = (get_group_id(2) * 8 * WEIGHTS_PER_WORKITEM) / FILTER_OFM_ALIGNED; + + int4 dotProd[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * WEIGHTS_PER_WORKITEM] = { 0 }; + + const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; + const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; + + const uint filter_offset = ((get_group_id(2) * WEIGHTS_PER_WORKITEM) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH; + const uint input_offset = IN_OFFSET + IN_B_BLOCK_PITCH * b_block; + + uint filter_idx = filter_offset; + for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k) + { + ////// preloading input data ////// + int4 preloaded_input[NEEDED_INPUT_X * NEEDED_INPUT_Y]; + for(int h = 0; h < NEEDED_INPUT_Y; h++) + { + for(int p = 0; p < NEEDED_INPUT_X; p++) + { + const int input_offset_y = input_y + h; + const int input_offset_x = input_x + p; + + uint input_idx = input_offset + input_offset_y * IN_Y_PITCH + input_offset_x * IN_X_PITCH + k * IN_F_BLOCK_PITCH; + preloaded_input[p + h * NEEDED_INPUT_X] = as_int4(intel_sub_group_block_read4((const __global uint*)(input + input_idx))); + } + } + + __attribute__((opencl_unroll_hint(FILTER_SIZE_Y))) + for (uint j = 0; j < FILTER_SIZE_Y; ++j) + { + __attribute__((opencl_unroll_hint(FILTER_SIZE_X))) + for (uint i = 0; i < FILTER_SIZE_X; ++i) + { + ////// preloading weights data ////// + int8 preloaded_weights[WEIGHTS_PER_WORKITEM]; + __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) + for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) + { + preloaded_weights[w] = as_int8(intel_sub_group_block_read8((const __global uint*) (weights + (filter_idx + w * FILTER_OFM_BLOCK_PITCH) ) )); + } + + ////// computing ////// + __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) + for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) + { + __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) + for(uint oy = 0; oy < OUT_BLOCK_HEIGHT; oy++) + { + __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) + for(uint ox = 0; ox < OUT_BLOCK_WIDTH; ox++) + { + const uint out_idx = ox + OUT_BLOCK_WIDTH * (oy + w * OUT_BLOCK_HEIGHT); + const uint preloaded_idx =ox * STRIDE_SIZE_X + i + NEEDED_INPUT_X * (oy * STRIDE_SIZE_Y + j); + dotProd[out_idx] = MMAD_4x8(preloaded_input[preloaded_idx], preloaded_weights[w], dotProd[out_idx]); + } + } + } + filter_idx += FILTER_X_PITCH; + } + } + } + + +#if WEIGHTS_PER_WORKITEM == 4 + +float4 quant_f = vload4(0, quantizations + f); +float4 bias_f = vload4(0, biases + f); +float4 calib_f = vload4(0, calibrations + f); +__attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) +for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++) +{ + __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) + for(uint o = 0; o < OUT_BLOCK_WIDTH; o++) + { + const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f, y + h, x + o); + + uint4 to_output; + __attribute__((opencl_unroll_hint(4))) + for(uint b = 0; b < 4; b++) + { + const uint out_idx = o + OUT_BLOCK_WIDTH * h; + + QUANTIZATION; + to_output[b] = as_uint(out); + } + intel_sub_group_block_write4((__global uint*)(output + dst_index), to_output); + } +} + +#else // WEIGHTS_PER_WORKITEM ==4 + +////// QUANTIZE & OUTPUT ////// +__attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) +for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) +{ + float quant_f = quantizations[f + w * 8]; + float bias_f = biases[f + w * 8]; +#if CALIBRATION_TERM + float calib_f = calibrations[f + w * 8]; +#endif + __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) + for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++) + { + 
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) + for(uint o = 0; o < OUT_BLOCK_WIDTH; o++) + { + const uint out_idx = o + OUT_BLOCK_WIDTH * (h + w * OUT_BLOCK_HEIGHT); + for(uint b = 0; b < 4; b++) + { + #if CALIBRATION_TERM + dotProd[out_idx][b] = (UNIT_TYPE)round(((float)dotProd[out_idx][b] * quant_f * I_QF + bias_f) * calib_f); + #else // CALIBRATION_TERM + dotProd[out_idx][b] = (UNIT_TYPE)round(((float)dotProd[out_idx][b] * quant_f * I_QF + bias_f) * O_QF); + #endif // CALIBRATION_TERM + } + } + } +} + +////// OUTPUT STAGE ////// +__attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) +for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++) +{ + __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) + for(uint o = 0; o < OUT_BLOCK_WIDTH; o++) + { + const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f, y + h, x + o); + + __attribute__((opencl_unroll_hint(4))) + for(uint b = 0; b < 4; b++) + { + #if WEIGHTS_PER_WORKITEM == 2 + char2 out; + const uint out_idx = o + OUT_BLOCK_WIDTH * h; + out[0] = ACTIVATION(convert_char(dotProd[out_idx][b]), NL_M, NL_N); + out[1] = ACTIVATION(convert_char(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT][b]), NL_M, NL_N); + + intel_sub_group_block_write_uc2((__global uchar*)(output + dst_index + b * 32), as_uchar2(out)); + #else + __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) + for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) + { + const uint out_idx = o + OUT_BLOCK_WIDTH * (h + w * OUT_BLOCK_HEIGHT); + const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f + w * 8, y + h, x + o); + char char_val = ACTIVATION(convert_char(dotProd[out_idx][b]), NL_M, NL_N); + output[dst_index + b * 32] = char_val; + } + #endif + } + } +} + +#endif // WEIGHTS_PER_WORKITEM ==4 + +} + +#undef FILTER_IFM_MMAD_NUM +#undef FILTER_OFM_MMAD_NUM +#undef FILTER_IFM_ALIGNED +#undef FILTER_OFM_ALIGNED + + +#undef SCALE +#undef QUANTIZATION \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_2x14_rep4.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_2x14_rep4.cl new file mode 100644 index 0000000..f9e04cf --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_2x14_rep4.cl @@ -0,0 +1,948 @@ +// Copyright (c) 2016-2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
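+ +// Argument-order note for the mapping below: clDNN's MMAD_4x8(A, B, C) takes the +// accumulator last, so _MMAD_4x8(C, A, B) only reorders arguments to let call sites be +// written accumulator-first, as in out[0] = _MMAD_4x8(out[0], act_reg[0], weights_reg0);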
+ +#include "include/data_types.cl" +#include "include/mmad.cl" + +#define SCALE 0.11f + +#ifdef LIGHTWEIGHT_QUANTIZATION + +#define QUANTIZATION \ + slm_write0.s0 = convert_uchar_sat((float)outvec.s0 * SCALE + bias_f);\ + slm_write0.s1 = convert_uchar_sat((float)outvec.s1 * SCALE + bias_f);\ + slm_write0.s2 = convert_uchar_sat((float)outvec.s2 * SCALE + bias_f);\ + slm_write0.s3 = convert_uchar_sat((float)outvec.s3 * SCALE + bias_f); + +#elif NO_QUANTIZATION + +#define QUANTIZATION(idx) \ + slm_write0.s0 = convert_uchar_sat(outvec.s0);\ + slm_write0.s1 = convert_uchar_sat(outvec.s1);\ + slm_write0.s2 = convert_uchar_sat(outvec.s2);\ + slm_write0.s3 = convert_uchar_sat(outvec.s3); + +#else + +#define QUANTIZATION \ + slm_write0.s0 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec.s0) * quant_f * I_QF + bias_f) * calib_f)), NL_M, NL_N));\ + slm_write0.s1 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec.s1) * quant_f * I_QF + bias_f) * calib_f)), NL_M, NL_N));\ + slm_write0.s2 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec.s2) * quant_f * I_QF + bias_f) * calib_f)), NL_M, NL_N));\ + slm_write0.s3 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec.s3) * quant_f * I_QF + bias_f) * calib_f)), NL_M, NL_N)); + +#endif + +// mapping to clDNN +#define _MMAD_4x8(C, A, B) MMAD_4x8(A, B, C) +#define _OD OUTPUT_FEATURE_NUM +#define _OW OUTPUT_SIZE_X +#define _OH OUTPUT_SIZE_Y +#define OWPAD (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X) +#define OHPAD (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y) +#define _IH INPUT0_SIZE_Y +#define _IW INPUT0_SIZE_X +#define _ID INPUT0_FEATURE_NUM +#define K_HEIGHT FILTER_SIZE_Y +#define K_WIDTH FILTER_SIZE_X +#define BATCH_SIZE OUTPUT_BATCH_NUM + +#define IHPAD (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) +#define IWPAD (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) +#define K_STRIDE STRIDE_SIZE_X +// end of mapping + +// for now kernel stride is square +#define K_WSTRIDE K_STRIDE +#define K_HSTRIDE K_STRIDE + +#define PACK 32 +#define BATCH_PACK 4 + +__attribute__((intel_reqd_sub_group_size(8))) +KERNEL(convolution_mmad_slm_2x14_rep4)( +__global int8 *inputs, +__global uchar* outputs, +__global int8* weights, +#if BIAS_TERM + __global BIAS_TYPE* biases, +#endif +#if QUANTIZATION_TERM + const __global float* quantizations, +#endif +#if CALIBRATION_TERM + const __global float* calibrations, +#endif + uint split_idx +) +{ + const uint TILE_H = OUT_BLOCK_HEIGHT*LOCAL_SIZE_Z; + const uint TILE_W = OUT_BLOCK_WIDTH*LOCAL_SIZE_Y; + + ushort fmg = get_group_id(0); // Output Depth + ushort group_y = get_group_id(1); // Output Width + ushort group_z = get_group_id(2); // Output Height + + /* 32,1,4 WG , SIMD8 - 16 HW threads in a WG + threads 0-3 (group1) : (lid_x:0-15,lid_y:0,lid_z:0) + threads 4-7 (group2) : (lid_x:0-15,lid_y:0,lid_z:1) + threads 8-11 (group3) : (lid_x:0-15,lid_y:0,lid_z:2) + threads 12-15 (group4) : (lid_x:0-15,lid_y:0,lid_z:3) + + Verify sub_group_layout through below printfs + + if(group_z == 0 && group_y == 0 && fmg == 0 && get_sub_group_id() == 31) { + printf("\n sub_group_local_id: %d, lid_x: %d, lid_y: %d, lid_z: %d ", get_sub_group_local_id(), get_local_id(0) ,get_local_id(1),get_local_id(2)); + printf("\n #WorkgroupsX: %d, #WorkgroupsY: %d, #WorkgroupsZ: %d",get_num_groups(0),get_num_groups(1),get_num_groups(2)); + } + + If sub_group_layout is different then derive lid_x, lid_z + + lid_z: thread_id/4 + */ + + /* Thread, local IDs */ + ushort thread_id = get_sub_group_id(); + ushort 
threadid_group_4 = thread_id % 4; + ushort threadid_mod_2 = thread_id%2; + ushort threadid_mod_8 = thread_id % 8; + + ushort lid_x = get_local_id(0); + ushort lid_z = get_local_id(2); + + uchar lane_id = get_sub_group_local_id(); + + /* 32-bit signed accumulator for 4 mini-batches , for a thread OUT_BLOCK_WIDTH*HEIGHT*4 registers are used + Will be converted to 8-bits before final write */ + + int4 out[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ] = { 0 } ; + + /* Account for batching */ + + ushort batch = ( fmg*LOCAL_SIZE_X ) /_OD; + + // Size calculated for int8 elements , One Batch processing is [H][W][4N][32C] + uint input_size = (_IH + IHPAD) * (_IW + IWPAD) * BATCH_PACK ; + + uint in_addr_offset = batch*input_size; + + /* Goto activation tile for work group, offset is w.r.t int8 array */ + + uint groupy_tile = TILE_W*group_y; + uint groupz_tile = TILE_H*group_z; + + in_addr_offset += (groupz_tile * K_STRIDE) * (_IW + IWPAD) * BATCH_PACK + (groupy_tile * K_STRIDE) * BATCH_PACK; + + /* SLM space for Activation, Weights + ( 32,1,4 ) Workgroup - 4 tiles along Y direction and 32 different output channels + Activation - 10Wx16Wx4Nx32C Weights -9RSx32Kx32C */ + + __local int8 act_slm [ 10*16*4 ]; + __local int8 weight_slm [ 9*32 ]; + + /* 10Hx16Wx4Nx32C activation tile written into SLM. Distribute among 16 threads in Workgroup + threads 0-1 write 16x4x32 of H=0, W=0...15 ( 8x4x32 per thread ) + threads 2-3 write 16x4x32 of H=1, W=0...15 ( 8x4x32 per thread ) + threads 4-5 write 16x4x32 of H=2, W=0...15 ( 8x4x32 per thread ) + threads 6-7 write 16x4x32 of H=3, W=0...15 ( 8x4x32 per thread ) + threads 8-9 write 16x4x32 of H=4, W=0...15 ( 8x4x32 per thread ) + threads 10-11 write 16x4x32 of H=5, W=0...15 ( 8x4x32 per thread ) + threads 12 write 16x4x32 of H=6, W=0...15 ( 16x4x32 per thread ) + thread 13 writes 16x4x32 of H=7 + thread 14 writes 16x4x32 of H=8 + thread 15 writes 16x4x32 of H=9 + + Interleaved write to avoid SLM BC + + threads0,1 write 16x4x32 together + thread0 writes first 4x32 block, thread1 writes next 4x32 block etc. 
+ */ + + + /* Goto activation tile for thread in group */ + + uint row_offset = thread_id / 2; + + if ( thread_id >= 12 ) { + row_offset = 6 + thread_id - 12 - threadid_mod_2; + } + + // In addr offset for the particular thread + in_addr_offset += row_offset * K_STRIDE * (_IW + IWPAD ) * BATCH_PACK ; + + /* Activation SLM indices */ + uint act_slm_write = row_offset * ( TILE_W + 2) * BATCH_PACK; + uint act_slm_read = OUT_BLOCK_HEIGHT * lid_z * ( TILE_W + 2) * BATCH_PACK ; + + /* Weights + Weight Global Tensor Order: [K/8][C/32][R][S][8C][8K][4C] + */ + + /* 9RSx32Kx32C Weight Block in SLM + thread0 handles ( reads from global ) w(0,0),w(0,1),w(0,2) of K=0 ( k=0..7) + thread1 handles w(0,0),w(0,1),w(0,2) of K=1 ( k=8..15) + thread2 handles w(1,0),w(1,1) of K=0 ( k=0..7) + thread3 handles w(1,0),w(1,1) of K=1 ( k=8..15) + thread4 handles w(1,2),w(2,0) of K=0 ( k=0..7) + thread5 handles w(1,2),w(2,0) of K=1 ( k=8..15) + thread6 handles w(2,1),w(2,2) of K=0 ( k=0..7) + thread7 handles w(2,1),w(2,2) of K=1 ( k=8..15) + + Similarly threads8-15 handles for K=2,3 + + Weight Layout in SLM + + w(R=0,S=0,k=0..7,C=0..15),w(R=0,S=0,k=8..15,C=0..15) + w(R=0,S=0,k=0..7,C=16..31),w(R=0,S=0,k=8..15,C=16..31) + + Above interleaving present to avoid SLM Bank conflicts when fused threads read from SLM + Thread0 will read k=0..7, thread1 will read k=8..15 + + First all output channels are present in SLM, then next weight pixel is present in SLM */ + + #define NUM_FILTERS (K_HEIGHT * K_WIDTH) + + uint output_depth = fmg % ( _OD / LOCAL_SIZE_X ); + + uint weight_size_CRS = ( _ID / PACK ) * NUM_FILTERS * 8; //8 output channels packed inside + + // Global weight addr for workgroup + uint weight_global_addr_offset = output_depth * 4 * weight_size_CRS ; //32 output channels per workgroup + + // Global weight address for thread + uint weight_global_channel_offset = threadid_mod_2 * weight_size_CRS ; + + uint slm_channel_offset = 0; + + if ( thread_id >= 8 ) { + weight_global_channel_offset += 2*weight_size_CRS; + slm_channel_offset = 1; + } + + uint weight_global_pixel_offset = 0; + uint slm_pixel_offset = 0; + + if ( threadid_mod_8 >=2 ) + { + weight_global_pixel_offset = 3*8 + ( ( (threadid_mod_8/2) - 1 )*2*8 ); + slm_pixel_offset = 3*LOCAL_SIZE_X + ( ( (threadid_mod_8/2) - 1 )*2*LOCAL_SIZE_X ); + } + + weight_global_addr_offset += weight_global_channel_offset + weight_global_pixel_offset; + + /* Weight slm write index */ + + uint slm_write_weight = threadid_mod_2*4 + slm_pixel_offset + slm_channel_offset * 16; + + /* Weight slm read index */ + + uint wt_slm_rd_offset = threadid_group_4*8; + + if ( threadid_mod_2 ) + { + wt_slm_rd_offset = wt_slm_rd_offset - 8 + 4; + } + + int kd; + + __attribute__((opencl_unroll_hint(1))) + for(kd = 0; kd < ( _ID / PACK ) ; kd++) + { + + { + /* Load Activation from global to SLM */ + + int in_addr = kd * (_IH + IHPAD) * (_IW + IWPAD) * BATCH_SIZE + in_addr_offset; + + __global uint *activation_tile = (__global uint*)&inputs[ in_addr ]; + + __local uint *act_slm_ptr = (__local uint *) &act_slm [ act_slm_write ]; + + /* The odd thread in fused pair will start from next 4x8 block */ + + activation_tile += threadid_mod_2*4*8; + act_slm_ptr += threadid_mod_2*4*8; + + int4 act_col_0 = as_int4( intel_sub_group_block_read4(activation_tile) ); + int4 act_col_1 = as_int4( intel_sub_group_block_read4(activation_tile + 8*8) ); + int4 act_col_2 = as_int4( intel_sub_group_block_read4(activation_tile + 2*8*8) ); + int4 act_col_3 = as_int4( intel_sub_group_block_read4(activation_tile + 3*8*8) ); 
+ int4 act_col_4 = as_int4( intel_sub_group_block_read4(activation_tile + 4*8*8) ); + int4 act_col_5 = as_int4( intel_sub_group_block_read4(activation_tile + 5*8*8) ); + int4 act_col_6 = as_int4( intel_sub_group_block_read4(activation_tile + 6*8*8) ); + int4 act_col_7 = as_int4( intel_sub_group_block_read4(activation_tile + 7*8*8) ); + + SLM_BLOCK_WRITE_4 ( act_slm_ptr , as_uint4 ( act_col_0 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 8*8 ) , as_uint4 ( act_col_1 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 2*8*8 ) , as_uint4 ( act_col_2 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 3*8*8 ) , as_uint4 ( act_col_3 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 4*8*8 ) , as_uint4 ( act_col_4 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 5*8*8 ) , as_uint4 ( act_col_5 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 6*8*8 ) , as_uint4 ( act_col_6 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 7*8*8 ) , as_uint4 ( act_col_7 ) ); + + if ( thread_id >=12 ) + { + activation_tile = activation_tile + 1 * (_IW + IWPAD ) * BATCH_PACK * 8; + act_slm_ptr += 8*8*8; + + int4 act_col_9 = as_int4( intel_sub_group_block_read4(activation_tile) ); + int4 act_col_10 = as_int4( intel_sub_group_block_read4(activation_tile + 8*8) ); + int4 act_col_11 = as_int4( intel_sub_group_block_read4(activation_tile + 2*8*8) ); + int4 act_col_12 = as_int4( intel_sub_group_block_read4(activation_tile + 3*8*8) ); + int4 act_col_13 = as_int4( intel_sub_group_block_read4(activation_tile + 4*8*8) ); + int4 act_col_14 = as_int4( intel_sub_group_block_read4(activation_tile + 5*8*8) ); + int4 act_col_15 = as_int4( intel_sub_group_block_read4(activation_tile + 6*8*8) ); + int4 act_col_16 = as_int4( intel_sub_group_block_read4(activation_tile + 7*8*8) ); + + SLM_BLOCK_WRITE_4 ( act_slm_ptr , as_uint4 ( act_col_9 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 8*8 ) , as_uint4 ( act_col_10 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 2*8*8 ) , as_uint4 ( act_col_11 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 3*8*8 ) , as_uint4 ( act_col_12 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 4*8*8 ) , as_uint4 ( act_col_13 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 5*8*8 ) , as_uint4 ( act_col_14 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 6*8*8 ) , as_uint4 ( act_col_15 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 7*8*8 ) , as_uint4 ( act_col_16 ) ); + } + + /* load weights from global to weight_slm */ + + int weight_addr = kd * NUM_FILTERS * 8 + weight_global_addr_offset; + + __global uint *weight_tile = (__global uint*)&weights [ weight_addr ]; + __local uint *wt_slm_ptr = (__local uint *) &weight_slm [ slm_write_weight ]; + + int4 w0 = as_int4 ( intel_sub_group_block_read4( weight_tile ) ); + int4 w1 = as_int4 ( intel_sub_group_block_read4( weight_tile + 4*8 ) ); + int4 w2 = as_int4 ( intel_sub_group_block_read4( weight_tile + 8*8 ) ); + int4 w3 = as_int4 ( intel_sub_group_block_read4( weight_tile + 12*8 ) ); + + SLM_BLOCK_WRITE_4 ( wt_slm_ptr , as_uint4 ( w0 ) ); + SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 8*8 ) , as_uint4 ( w1 ) ); + SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 32*8 ) , as_uint4 ( w2 ) ); + SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 32*8 + 8*8 ) , as_uint4 ( w3 ) ); + + if( threadid_mod_8 < 2 ) + { + weight_tile += 16*8; + wt_slm_ptr += 2*32*8; + + int4 w4 = as_int4 ( intel_sub_group_block_read4( weight_tile ) ); + int4 w5 = as_int4 ( intel_sub_group_block_read4( weight_tile + 4*8 ) ); + + SLM_BLOCK_WRITE_4 ( wt_slm_ptr , as_uint4 ( w4 ) ); + SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 8*8 ) , as_uint4 ( w5 ) ); + } + } + + // Synchronize SLM writes across workgroup + 
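+ /* All producer writes above - the activation rows staged by each thread pair ( plus the extra row from threads >= 12 ) and the 3x3 weight block - must be resident in SLM before any thread starts the MMAD phases below, hence the workgroup-wide local fence. */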
barrier(CLK_LOCAL_MEM_FENCE); + + uint wt_slm_rd = wt_slm_rd_offset; + + __local uint *slm_ptr0 = (__local uint *) &act_slm[ act_slm_read ]; + __local uint *slm_ptr1 = (__local uint *) &weight_slm[ wt_slm_rd ]; + + int8 weights_reg0, weights_reg1,weights_reg2; + + /********************************************************************************************************** + First phase - load first row of weights and for the first activation row - 1Hx8Wx4N inputs at a time + - Weights - 24 registers, Activations - 32 registers: Total 56 registers used for input data + ***********************************************************************************************************/ + { + int4 act_reg[ 8 ]; + + /* Load weights from SLM into registers */ + { + weights_reg0.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); + weights_reg0.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); + slm_ptr1 += LOCAL_SIZE_X*8; + + weights_reg1.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); + weights_reg1.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); + slm_ptr1 += LOCAL_SIZE_X*8; + + weights_reg2.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); + weights_reg2.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); + slm_ptr1 += LOCAL_SIZE_X*8; + } + + /* load first 1Hx8Wx4N inputs - Activation Broadcast will occur since it is same for fused threads */ + + __attribute__((opencl_unroll_hint(8))) + for (int ic = 0; ic < 8; ic++) + { + /* Load activations from SLM into registers */ + + uint slm_offset = ic * BATCH_PACK * 8 ; + + act_reg [ ic ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; + } + + /* Convolve */ + + /* order the mmad instructions to minimize dependency on src0,dst - also try to maximise reuse of weights-reg*/ + + out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg[0], weights_reg0 ); + out[ 1 ] = _MMAD_4x8 ( out[ 1 ], act_reg[1], weights_reg0 ); + out[ 2 ] = _MMAD_4x8 ( out[ 2 ], act_reg[2], weights_reg0 ); + out[ 3 ] = _MMAD_4x8 ( out[ 3 ], act_reg[3], weights_reg0 ); + out[ 4 ] = _MMAD_4x8 ( out[ 4 ], act_reg[4], weights_reg0 ); + out[ 5 ] = _MMAD_4x8 ( out[ 5 ], act_reg[5], weights_reg0 ); + out[ 6 ] = _MMAD_4x8 ( out[ 6 ], act_reg[6], weights_reg0 ); + out[ 7 ] = _MMAD_4x8 ( out[ 7 ], act_reg[7], weights_reg0 ); + + out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg[1], weights_reg1 ); + out[ 1 ] = _MMAD_4x8 ( out[ 1 ], act_reg[2], weights_reg1 ); + out[ 2 ] = _MMAD_4x8 ( out[ 2 ], act_reg[3], weights_reg1 ); + out[ 3 ] = _MMAD_4x8 ( out[ 3 ], act_reg[4], weights_reg1 ); + out[ 4 ] = _MMAD_4x8 ( out[ 4 ], act_reg[5], weights_reg1 ); + out[ 5 ] = _MMAD_4x8 ( out[ 5 ], act_reg[6], weights_reg1 ); + out[ 6 ] = _MMAD_4x8 ( out[ 6 ], act_reg[7], weights_reg1 ); + + out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg[2], weights_reg2 ); + out[ 1 ] = _MMAD_4x8 ( out[ 1 ], act_reg[3], weights_reg2 ); + out[ 2 ] = _MMAD_4x8 ( out[ 2 ], act_reg[4], weights_reg2 ); + out[ 3 ] = _MMAD_4x8 ( out[ 3 ], act_reg[5], weights_reg2 ); + out[ 4 ] = _MMAD_4x8 ( out[ 4 ], act_reg[6], weights_reg2 ); + out[ 5 ] = _MMAD_4x8 ( out[ 5 ], act_reg[7], weights_reg2 ); + + /* load next 1Hx8Wx4N inputs */ + + __attribute__((opencl_unroll_hint(8))) + for (int ic = 8; ic < 16; ic++) + { + uint slm_offset = ic * BATCH_PACK * 8; + + act_reg [ ic - 8 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset) ) ; + } + + /* Convolve */ + + out[ 6 ] = _MMAD_4x8 ( out[ 6 ], act_reg[0], weights_reg2 ); + out[ 7 ] = _MMAD_4x8 ( out[ 7 ], act_reg[1], weights_reg2 ); + out[ 8 ] = _MMAD_4x8 ( out[ 8 ], act_reg[2], weights_reg2 ); + out[ 9 ] = 
_MMAD_4x8 ( out[ 9 ], act_reg[3], weights_reg2 ); + out[ 10 ] = _MMAD_4x8 ( out[ 10 ], act_reg[4], weights_reg2 ); + out[ 11 ] = _MMAD_4x8 ( out[ 11 ], act_reg[5], weights_reg2 ); + out[ 12 ] = _MMAD_4x8 ( out[ 12 ], act_reg[6], weights_reg2 ); + out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg[7], weights_reg2 ); + + out[ 7 ] = _MMAD_4x8 ( out[ 7 ], act_reg[0], weights_reg1 ); + out[ 8 ] = _MMAD_4x8 ( out[ 8 ], act_reg[1], weights_reg1 ); + out[ 9 ] = _MMAD_4x8 ( out[ 9 ], act_reg[2], weights_reg1 ); + out[ 10 ] = _MMAD_4x8 ( out[ 10 ], act_reg[3], weights_reg1 ); + out[ 11 ] = _MMAD_4x8 ( out[ 11 ], act_reg[4], weights_reg1 ); + out[ 12 ] = _MMAD_4x8 ( out[ 12 ], act_reg[5], weights_reg1 ); + out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg[6], weights_reg1 ); + + out[ 8 ] = _MMAD_4x8 ( out[ 8 ], act_reg[0], weights_reg0 ); + out[ 9 ] = _MMAD_4x8 ( out [ 9 ], act_reg[1], weights_reg0 ); + out[ 10 ] = _MMAD_4x8 ( out[ 10 ], act_reg[2], weights_reg0 ); + out[ 11 ] = _MMAD_4x8 ( out[ 11 ], act_reg[3], weights_reg0 ); + out[ 12 ] = _MMAD_4x8 ( out[ 12 ], act_reg[4], weights_reg0 ); + out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg[5], weights_reg0 ); + } + + /* Second , Third phase */ + { + int8 weights_reg3, weights_reg4,weights_reg5; + int4 act_reg_2[ 6 ]; + + /***************************************************************************************************************************************** + Second phase - load second row of weights, now both rows are in registers, for the second activation row - 1Hx6Wx4N inputs at a time + - Weights - 48 registers, Activations - 24 registers: Total 72 registers used for input data + ******************************************************************************************************************************************/ + + /* Load weights of row = 1 from SLM into registers */ + { + + weights_reg3.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); + weights_reg3.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); + slm_ptr1 += LOCAL_SIZE_X*8; + + weights_reg4.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); + weights_reg4.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); + slm_ptr1 += LOCAL_SIZE_X*8; + + weights_reg5.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); + weights_reg5.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); + slm_ptr1 += LOCAL_SIZE_X*8; + } + + /* load input row =1,col=0:1 1Hx2Wx8N */ + + uint slm_row_offset_2 = 1*(TILE_W + 2)*BATCH_PACK*8; + + act_reg_2 [ 0 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_2) ) ; + act_reg_2 [ 1 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_2 + BATCH_PACK*8) ) ; + + out[ 14 ] = _MMAD_4x8 ( out[ 14 ], act_reg_2[0] , weights_reg0 ); + out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg_2[0] , weights_reg3 ); + out[ 1 ] = _MMAD_4x8 ( out[ 1 ], act_reg_2[1] , weights_reg3 ); + out[ 15 ] = _MMAD_4x8 ( out[ 15 ], act_reg_2[1] , weights_reg0 ); + + out[ 14 ] = _MMAD_4x8 ( out[ 14 ], act_reg_2[1], weights_reg1 ); + out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg_2[1], weights_reg4 ); + + /* load input row =1,col=2:7,8:13,1Hx6Wx4N */ + + uint col = 2; + + __attribute__((opencl_unroll_hint(2))) + do { + + uint slm_offset = 1*(TILE_W + 2)*BATCH_PACK*8 + col*BATCH_PACK*8; + + act_reg_2 [ 0 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; + act_reg_2 [ 1 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + BATCH_PACK*8)) ; + act_reg_2 [ 2 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 2*BATCH_PACK*8)) ; + act_reg_2 [ 3 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 
3*BATCH_PACK*8) ) ; + act_reg_2 [ 4 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 4*BATCH_PACK*8) ) ; + act_reg_2 [ 5 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 5*BATCH_PACK*8) ) ; + + uint first_row_offset = col - 2; + uint second_row_offset = 14 + col - 2; + + out [ first_row_offset ] = _MMAD_4x8 ( out[ first_row_offset ] , act_reg_2[0] , weights_reg5 ); + out [ first_row_offset + 1 ] = _MMAD_4x8 ( out[ first_row_offset + 1] , act_reg_2[0], weights_reg4 ); + out [ first_row_offset + 2 ] = _MMAD_4x8 ( out[ first_row_offset + 2] , act_reg_2[0], weights_reg3 ); + out [ first_row_offset + 3 ] = _MMAD_4x8 ( out[ first_row_offset + 3 ], act_reg_2[1], weights_reg3 ); + + out [ second_row_offset ] = _MMAD_4x8 ( out[ second_row_offset ] , act_reg_2[0] , weights_reg2 ); + out [ second_row_offset + 1 ] = _MMAD_4x8 ( out[ second_row_offset + 1] , act_reg_2[0], weights_reg1 ); + out [ second_row_offset + 2 ] = _MMAD_4x8 ( out[ second_row_offset + 2] , act_reg_2[0], weights_reg0 ); + out [ second_row_offset + 3 ] = _MMAD_4x8 ( out[ second_row_offset + 3], act_reg_2[1], weights_reg0 ); + + out [ first_row_offset + 1 ] = _MMAD_4x8 ( out[ first_row_offset + 1 ], act_reg_2[1], weights_reg5 ); + out [ first_row_offset + 2 ] = _MMAD_4x8 ( out[ first_row_offset + 2 ], act_reg_2[1], weights_reg4 ); + out [ first_row_offset + 3 ] = _MMAD_4x8 ( out[ first_row_offset + 3 ], act_reg_2[2], weights_reg4 ); + out [ first_row_offset + 4 ] = _MMAD_4x8 ( out[ first_row_offset + 4 ], act_reg_2[2], weights_reg3 ); + + out [ second_row_offset + 1 ] = _MMAD_4x8 ( out[ second_row_offset + 1 ], act_reg_2[1], weights_reg2 ); + out [ second_row_offset + 2 ] = _MMAD_4x8 ( out[ second_row_offset + 2 ], act_reg_2[1], weights_reg1 ); + out [ second_row_offset + 3 ] = _MMAD_4x8 ( out[ second_row_offset + 3 ], act_reg_2[2], weights_reg1 ); + out [ second_row_offset + 4 ] = _MMAD_4x8 ( out[ second_row_offset + 4 ], act_reg_2[2], weights_reg0 ); + + out [ first_row_offset + 2 ] = _MMAD_4x8 ( out[ first_row_offset + 2], act_reg_2[2], weights_reg5 ); + out [ first_row_offset + 3 ] = _MMAD_4x8 ( out[ first_row_offset + 3], act_reg_2[3], weights_reg5 ); + out [ first_row_offset + 4 ] = _MMAD_4x8 ( out[ first_row_offset + 4], act_reg_2[3], weights_reg4 ); + out [ first_row_offset + 5 ] = _MMAD_4x8 ( out[ first_row_offset + 5], act_reg_2[3], weights_reg3 ); + + out [ second_row_offset + 2 ] = _MMAD_4x8 ( out[ second_row_offset + 2], act_reg_2[2], weights_reg2 ); + out [ second_row_offset + 3 ] = _MMAD_4x8 ( out[ second_row_offset + 3], act_reg_2[3], weights_reg2 ); + out [ second_row_offset + 4 ] = _MMAD_4x8 ( out[ second_row_offset + 4], act_reg_2[3], weights_reg1 ); + out [ second_row_offset + 5 ] = _MMAD_4x8 ( out[ second_row_offset + 5], act_reg_2[3], weights_reg0 ); + + out [ first_row_offset + 6 ] = _MMAD_4x8 ( out[ first_row_offset + 6], act_reg_2[4], weights_reg3 ); + out [ first_row_offset + 7 ] = _MMAD_4x8 ( out[ first_row_offset + 7], act_reg_2[5], weights_reg3 ); + out [ first_row_offset + 5 ] = _MMAD_4x8 ( out[ first_row_offset + 5], act_reg_2[4], weights_reg4 ); + out [ first_row_offset + 6 ] = _MMAD_4x8 ( out[ first_row_offset + 6], act_reg_2[5], weights_reg4 ); + out [ first_row_offset + 4 ] = _MMAD_4x8 ( out[ first_row_offset + 4], act_reg_2[4], weights_reg5 ); + out [ first_row_offset + 5 ] = _MMAD_4x8 ( out[ first_row_offset + 5], act_reg_2[5], weights_reg5 ); + + out [ second_row_offset + 6 ] = _MMAD_4x8 ( out[ second_row_offset + 6], act_reg_2[4], weights_reg0 ); + out [ second_row_offset + 7 ] = 
_MMAD_4x8 ( out[ second_row_offset + 7], act_reg_2[5], weights_reg0 ); + out [ second_row_offset + 5 ] = _MMAD_4x8 ( out[ second_row_offset + 5], act_reg_2[4], weights_reg1 ); + out [ second_row_offset + 6 ] = _MMAD_4x8 ( out[ second_row_offset + 6], act_reg_2[5], weights_reg1 ); + out [ second_row_offset + 4 ] = _MMAD_4x8 ( out[ second_row_offset + 4], act_reg_2[4], weights_reg2 ); + out [ second_row_offset + 5 ] = _MMAD_4x8 ( out[ second_row_offset + 5], act_reg_2[5], weights_reg2 ); + + col +=6; + + } while ( col < 14 ); + + /* load input row =1,col=14:15 1Hx2Wx4N */ + + uint slm_row_offset_3 = 1 * (TILE_W + 2) * BATCH_PACK * 8 + 14 * BATCH_PACK * 8; + + act_reg_2 [ 0 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_3)) ; + act_reg_2 [ 1 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_3 + BATCH_PACK*8)) ; + + out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg_2[0], weights_reg4 ); + out[ 27 ] = _MMAD_4x8 ( out[ 27 ], act_reg_2[0], weights_reg1 ); + out[ 26 ] = _MMAD_4x8 ( out[ 26 ], act_reg_2[0], weights_reg2 ); + + out[ 12 ] = _MMAD_4x8 ( out[ 12 ], act_reg_2[0], weights_reg5 ); + out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg_2[1], weights_reg5 ); + + out[ 27 ] = _MMAD_4x8 ( out[ 27 ], act_reg_2[1], weights_reg2 ); + + /**************************************************************************************************************************************** + Third phase - load third row of weights, this replaces first weight row, for the third activation row read 1Hx6Wx4N inputs at a time + - Weights - 48 registers, Activations - 24 registers: Total 72 registers used for input data + *****************************************************************************************************************************************/ + + /* Load weights of row = 2 from SLM into registers - replaces row = 0 weights */ + { + weights_reg0.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); + weights_reg0.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); + slm_ptr1 += LOCAL_SIZE_X*8; + + weights_reg1.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); + weights_reg1.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); + slm_ptr1 += LOCAL_SIZE_X*8; + + weights_reg2.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); + weights_reg2.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); + slm_ptr1 += LOCAL_SIZE_X*8; + } + + uint slm_row_offset_4 = 2*(TILE_W + 2)*BATCH_PACK*8; + + act_reg_2 [ 0 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_4)) ; + act_reg_2 [ 1 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_4 + BATCH_PACK*8)) ; + + out[ 14 ] = _MMAD_4x8 ( out[ 14 ], act_reg_2[0] , weights_reg3 ); + out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg_2[0] , weights_reg0 ); + out[ 1 ] = _MMAD_4x8 ( out[ 1 ], act_reg_2[1] , weights_reg0 ); + out[ 15 ] = _MMAD_4x8 ( out[ 15 ], act_reg_2[1] , weights_reg3 ); + + out[ 14 ] = _MMAD_4x8 ( out[ 14 ], act_reg_2[1], weights_reg4 ); + out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg_2[1], weights_reg1 ); + + /* load input row =2,col=2:7,8:13,1Hx6Wx4N */ + + uint col_2 = 2; + + __attribute__((opencl_unroll_hint(2))) + do { + + uint slm_offset = 2*(TILE_W + 2)*BATCH_PACK*8 + col_2*BATCH_PACK*8; + + act_reg_2 [ 0 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; + act_reg_2 [ 1 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + BATCH_PACK*8)) ; + act_reg_2 [ 2 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 2*BATCH_PACK*8)) ; + act_reg_2 [ 3 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 3*BATCH_PACK*8) ) ; + act_reg_2 [ 4 ] = 
as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 4*BATCH_PACK*8) ) ; + act_reg_2 [ 5 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 5*BATCH_PACK*8) ) ; + + uint first_row_offset = col_2 - 2; + uint second_row_offset = 14 + col_2 - 2; + + out [ first_row_offset + 1 ] = _MMAD_4x8 ( out[ first_row_offset + 1] , act_reg_2[0], weights_reg1 ); + out [ first_row_offset + 2 ] = _MMAD_4x8 ( out[ first_row_offset + 2] , act_reg_2[0], weights_reg0 ); + out [ first_row_offset + 3 ] = _MMAD_4x8 ( out[ first_row_offset + 3 ], act_reg_2[1], weights_reg0 ); + out [ first_row_offset ] = _MMAD_4x8 ( out[ first_row_offset ] , act_reg_2[0] , weights_reg2 ); + + out [ second_row_offset + 1 ] = _MMAD_4x8 ( out[ second_row_offset + 1] , act_reg_2[0], weights_reg4 ); + out [ second_row_offset + 2 ] = _MMAD_4x8 ( out[ second_row_offset + 2] , act_reg_2[0], weights_reg3 ); + out [ second_row_offset + 3 ] = _MMAD_4x8 ( out[ second_row_offset + 3], act_reg_2[1], weights_reg3 ); + out [ second_row_offset ] = _MMAD_4x8 ( out[ second_row_offset ] , act_reg_2[0] , weights_reg5 ); + + out [ first_row_offset + 1 ] = _MMAD_4x8 ( out[ first_row_offset + 1 ], act_reg_2[1], weights_reg2 ); + out [ first_row_offset + 2 ] = _MMAD_4x8 ( out[ first_row_offset + 2 ], act_reg_2[1], weights_reg1 ); + out [ first_row_offset + 3 ] = _MMAD_4x8 ( out[ first_row_offset + 3 ], act_reg_2[2], weights_reg1 ); + out [ first_row_offset + 4 ] = _MMAD_4x8 ( out[ first_row_offset + 4 ], act_reg_2[2], weights_reg0 ); + + out [ second_row_offset + 1 ] = _MMAD_4x8 ( out[ second_row_offset + 1 ], act_reg_2[1], weights_reg5 ); + out [ second_row_offset + 2 ] = _MMAD_4x8 ( out[ second_row_offset + 2 ], act_reg_2[1], weights_reg4 ); + out [ second_row_offset + 3 ] = _MMAD_4x8 ( out[ second_row_offset + 3 ], act_reg_2[2], weights_reg4 ); + out [ second_row_offset + 4 ] = _MMAD_4x8 ( out[ second_row_offset + 4 ], act_reg_2[2], weights_reg3 ); + + out [ first_row_offset + 5 ] = _MMAD_4x8 ( out[ first_row_offset + 5], act_reg_2[3], weights_reg0 ); + out [ first_row_offset + 2 ] = _MMAD_4x8 ( out[ first_row_offset + 2], act_reg_2[2], weights_reg2 ); + out [ first_row_offset + 3 ] = _MMAD_4x8 ( out[ first_row_offset + 3], act_reg_2[3], weights_reg2 ); + out [ first_row_offset + 4 ] = _MMAD_4x8 ( out[ first_row_offset + 4], act_reg_2[3], weights_reg1 ); + + out [ second_row_offset + 5 ] = _MMAD_4x8 ( out[ second_row_offset + 5], act_reg_2[3], weights_reg3 ); + out [ second_row_offset + 2 ] = _MMAD_4x8 ( out[ second_row_offset + 2], act_reg_2[2], weights_reg5 ); + out [ second_row_offset + 3 ] = _MMAD_4x8 ( out[ second_row_offset + 3], act_reg_2[3], weights_reg5 ); + out [ second_row_offset + 4 ] = _MMAD_4x8 ( out[ second_row_offset + 4], act_reg_2[3], weights_reg4 ); + + out [ first_row_offset + 6 ] = _MMAD_4x8 ( out[ first_row_offset + 6], act_reg_2[4], weights_reg0 ); + out [ first_row_offset + 7 ] = _MMAD_4x8 ( out[ first_row_offset + 7], act_reg_2[5], weights_reg0 ); + out [ first_row_offset + 5 ] = _MMAD_4x8 ( out[ first_row_offset + 5], act_reg_2[4], weights_reg1 ); + out [ first_row_offset + 6 ] = _MMAD_4x8 ( out[ first_row_offset + 6], act_reg_2[5], weights_reg1 ); + out [ first_row_offset + 4 ] = _MMAD_4x8 ( out[ first_row_offset + 4], act_reg_2[4], weights_reg2 ); + out [ first_row_offset + 5 ] = _MMAD_4x8 ( out[ first_row_offset + 5], act_reg_2[5], weights_reg2 ); + + out [ second_row_offset + 6 ] = _MMAD_4x8 ( out[ second_row_offset + 6], act_reg_2[4], weights_reg3 ); + out [ second_row_offset + 7 ] = _MMAD_4x8 ( out[ second_row_offset + 
7], act_reg_2[5], weights_reg3 ); + out [ second_row_offset + 5 ] = _MMAD_4x8 ( out[ second_row_offset + 5], act_reg_2[4], weights_reg4 ); + out [ second_row_offset + 6 ] = _MMAD_4x8 ( out[ second_row_offset + 6], act_reg_2[5], weights_reg4 ); + out [ second_row_offset + 4 ] = _MMAD_4x8 ( out[ second_row_offset + 4], act_reg_2[4], weights_reg5 ); + out [ second_row_offset + 5 ] = _MMAD_4x8 ( out[ second_row_offset + 5], act_reg_2[5], weights_reg5 ); + + col_2 +=6; + + } while ( col_2 < 14 ); + + /* load input row =2,col=14:15 1Hx2Wx4N */ + + uint slm_row_offset_5 = 2 * (TILE_W + 2) * BATCH_PACK * 8 + 14 * BATCH_PACK * 8; + + act_reg_2 [ 0 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_5)) ; + act_reg_2 [ 1 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_5 + BATCH_PACK*8)) ; + + out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg_2[0], weights_reg1 ); + out[ 27 ] = _MMAD_4x8 ( out[ 27 ], act_reg_2[0], weights_reg4 ); + out[ 26 ] = _MMAD_4x8 ( out[ 26 ], act_reg_2[0], weights_reg5 ); + + out[ 12 ] = _MMAD_4x8 ( out[ 12 ], act_reg_2[0], weights_reg2 ); + out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg_2[1], weights_reg2 ); + + out[ 27 ] = _MMAD_4x8 ( out[ 27 ], act_reg_2[1], weights_reg5 ); + } + + /************************************************************************************************* + Fourth phase - discard middle weight row, for fourth activation row load 1Hx8Wx4N at a time + - Weights - 24 registers, Activations - 32 registers: Total 56 registers used for input data + **************************************************************************************************/ + { + int4 act_reg[ 8 ]; + + /* load first 1Hx8Wx4N inputs */ + + uint slm_row_offset_6 = 3 * (TILE_W + 2) * BATCH_PACK * 8 ; + + __attribute__((opencl_unroll_hint(8))) + for (int ic = 0; ic < 8; ic++) + { + /* Load activations from SLM into registers */ + uint slm_offset = ic * BATCH_PACK * 8 + slm_row_offset_6; + act_reg [ ic ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; + } + + /* Convolve */ + + uint phase_offset = 14; + + out[ phase_offset + 0 ] = _MMAD_4x8 ( out[ phase_offset +0 ], act_reg[0], weights_reg0 ); + out[ phase_offset + 1 ] = _MMAD_4x8 ( out[ phase_offset +1 ], act_reg[1], weights_reg0 ); + out[ phase_offset +2 ] = _MMAD_4x8 ( out[ phase_offset +2 ], act_reg[2], weights_reg0 ); + out[ phase_offset +3 ] = _MMAD_4x8 ( out[ phase_offset +3 ], act_reg[3], weights_reg0 ); + out[ phase_offset +4 ] = _MMAD_4x8 ( out[ phase_offset +4 ], act_reg[4], weights_reg0 ); + out[ phase_offset +5 ] = _MMAD_4x8 ( out[ phase_offset +5 ], act_reg[5], weights_reg0 ); + out[ phase_offset +6 ] = _MMAD_4x8 ( out[ phase_offset +6 ], act_reg[6], weights_reg0 ); + out[ phase_offset +7 ] = _MMAD_4x8 ( out[ phase_offset +7 ], act_reg[7], weights_reg0 ); + + out[ phase_offset +0 ] = _MMAD_4x8 ( out[ phase_offset +0 ], act_reg[1], weights_reg1 ); + out[ phase_offset +1 ] = _MMAD_4x8 ( out[ phase_offset +1 ], act_reg[2], weights_reg1 ); + out[ phase_offset +2 ] = _MMAD_4x8 ( out[ phase_offset +2 ], act_reg[3], weights_reg1 ); + out[ phase_offset +3 ] = _MMAD_4x8 ( out[ phase_offset +3 ], act_reg[4], weights_reg1 ); + out[ phase_offset +4 ] = _MMAD_4x8 ( out[ phase_offset +4 ], act_reg[5], weights_reg1 ); + out[ phase_offset +5 ] = _MMAD_4x8 ( out[ phase_offset +5 ], act_reg[6], weights_reg1 ); + out[ phase_offset +6 ] = _MMAD_4x8 ( out[ phase_offset +6 ], act_reg[7], weights_reg1 ); + + out[ phase_offset +0 ] = _MMAD_4x8 ( out[ phase_offset +0 ], act_reg[2], weights_reg2 ); + out[ phase_offset +1 ] = _MMAD_4x8 ( 
out[ phase_offset +1 ], act_reg[3], weights_reg2 ); + out[ phase_offset +2 ] = _MMAD_4x8 ( out[ phase_offset +2 ], act_reg[4], weights_reg2 ); + out[ phase_offset +3 ] = _MMAD_4x8 ( out[ phase_offset +3 ], act_reg[5], weights_reg2 ); + out[ phase_offset +4 ] = _MMAD_4x8 ( out[ phase_offset +4 ], act_reg[6], weights_reg2 ); + out[ phase_offset +5 ] = _MMAD_4x8 ( out[ phase_offset +5 ], act_reg[7], weights_reg2 ); + + /* load next 1Hx8Wx4N inputs */ + + __attribute__((opencl_unroll_hint(8))) + for (int ic = 8; ic < 16; ic++) + { + uint slm_offset = ic * BATCH_PACK * 8 + slm_row_offset_6; + act_reg [ ic - 8 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; + } + + /* Convolve */ + + out[ phase_offset +6 ] = _MMAD_4x8 ( out[ phase_offset +6 ], act_reg[0], weights_reg2 ); + out[ phase_offset +7 ] = _MMAD_4x8 ( out[ phase_offset +7 ], act_reg[1], weights_reg2 ); + out[ phase_offset + 8 ] = _MMAD_4x8 ( out[ phase_offset +8 ], act_reg[2], weights_reg2 ); + out[ phase_offset +9 ] = _MMAD_4x8 ( out[phase_offset + 9 ], act_reg[3], weights_reg2 ); + out[ phase_offset +10 ] = _MMAD_4x8 ( out[ phase_offset +10 ], act_reg[4], weights_reg2 ); + out[ phase_offset +11 ] = _MMAD_4x8 ( out[phase_offset + 11 ], act_reg[5], weights_reg2 ); + out[ phase_offset +12 ] = _MMAD_4x8 ( out[ phase_offset +12 ], act_reg[6], weights_reg2 ); + out[ phase_offset +13 ] = _MMAD_4x8 ( out[ phase_offset +13 ], act_reg[7], weights_reg2 ); + + out[ phase_offset +7 ] = _MMAD_4x8 ( out[ phase_offset +7 ], act_reg[0], weights_reg1 ); + out[ phase_offset +8 ] = _MMAD_4x8 ( out[phase_offset + 8 ], act_reg[1], weights_reg1 ); + out[ phase_offset +9 ] = _MMAD_4x8 ( out[ phase_offset +9 ], act_reg[2], weights_reg1 ); + out[ phase_offset +10 ] = _MMAD_4x8 ( out[ phase_offset +10 ], act_reg[3], weights_reg1 ); + out[ phase_offset +11 ] = _MMAD_4x8 ( out[ phase_offset +11 ], act_reg[4], weights_reg1 ); + out[ phase_offset +12 ] = _MMAD_4x8 ( out[ phase_offset +12 ], act_reg[5], weights_reg1 ); + out[ phase_offset +13 ] = _MMAD_4x8 ( out[phase_offset + 13 ], act_reg[6], weights_reg1 ); + + out[ phase_offset +8 ] = _MMAD_4x8 ( out[phase_offset + 8 ], act_reg[0], weights_reg0 ); + out[ phase_offset +9 ] = _MMAD_4x8 ( out[ phase_offset +9 ], act_reg[1], weights_reg0 ); + out[ phase_offset +10 ] = _MMAD_4x8 ( out[ phase_offset +10 ], act_reg[2], weights_reg0 ); + out[ phase_offset +11 ] = _MMAD_4x8 ( out[phase_offset + 11 ], act_reg[3], weights_reg0 ); + out[ phase_offset +12 ] = _MMAD_4x8 ( out[ phase_offset +12 ], act_reg[4], weights_reg0 ); + out[ phase_offset +13 ] = _MMAD_4x8 ( out[phase_offset + 13 ], act_reg[5], weights_reg0 ); + } + + // To make sure all threads in WG have finished compute before next depth tile of activation and weights are loaded into SLM + barrier(CLK_LOCAL_MEM_FENCE); + + } //for kd + + /**************************************************************************************************************** + *******************************Output Write Stage**************************************************************** + ****************************************************************************************************************/ + + /* + Outputs will be passed through activation function and quantized to 8 bits before writing + Output will be in same format as input [K/32][N/4][P][Q][4N][32K] + Writes are staged in SLM so that 32-bit writes can be done to Global memory + */ + + /******************* Write output to SLM *************************************/ + + /* Quantize and pack 4x1 byte - from consecutive
n-coordinates + Write uint32 from each lane to SLM , the entire thread will write 8-consecutive K-coordinates + Four threads will write 4x8xuint32 for 32 output channels and 4 batches + This will be repeated for entire WG-tile + + Assume one SLM row as 32 uints ( 32 channels , four batches for each channel - 4NK ) + */ + + uint out_slm_write = lid_z * TILE_W * OUT_BLOCK_HEIGHT * 32 + threadid_group_4 * 8 + lane_id; + + __local uchar4* out_slm = (__local uchar4*) &act_slm; + __local uchar4* out_slm_2 = (__local uchar4*) &out_slm[ out_slm_write ]; + + /* Scale the accumulator down and do the ReLU before converting to 8 bits */ + + /* Real code might do this, but need to get scale right or the convert to uchar saturates and then doesn't match CPU + float scale = (float)SCALE_FACTOR; + + uchar outchar = (uchar)max(((float)outint) * scale, 0.0f); */ + + const uint _feature = ((fmg * 32) % _OD) + get_local_id(0); + float quant_f = as_float(intel_sub_group_block_read((__global uint*) (quantizations + _feature) )); + float bias_f = as_float(intel_sub_group_block_read((__global uint*) (biases + _feature) )); + float calib_f = as_float(intel_sub_group_block_read((__global uint*) (calibrations + _feature) )); + + __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) + for (int r = 0; r < OUT_BLOCK_HEIGHT; r++) + { + __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) + for (int c = 0; c < OUT_BLOCK_WIDTH; c++) + { + int4 outvec = out[ r * OUT_BLOCK_WIDTH + c]; + + uchar4 slm_write0; + + int slm_addr = c * 32 + r * TILE_W * 32; + + /* Activation & quantization: scale by quant_f * I_QF, add bias_f, multiply by calib_f, apply ACTIVATION, then keep the low 8 bits */ + + slm_write0.s0 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec.s0) * quant_f * I_QF + bias_f) * calib_f)), NL_M, NL_N)); + slm_write0.s1 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec.s1) * quant_f * I_QF + bias_f) * calib_f)), NL_M, NL_N)); + slm_write0.s2 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec.s2) * quant_f * I_QF + bias_f) * calib_f)), NL_M, NL_N)); + slm_write0.s3 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec.s3) * quant_f * I_QF + bias_f) * calib_f)), NL_M, NL_N)); + + out_slm_2[ slm_addr ] = slm_write0; + + } // out_block_width-for loop + + } // out_block_height-for loop + + // Wait till all threads in WG finish placing the output + barrier(CLK_LOCAL_MEM_FENCE); + + /******************* Read from SLM & Write to Global *************************************/ + + /* Each lane will read uint4 from SLM - 4K x 4N values. Swizzle them into 4N x 4K order + + SLM Read Distribution - 8Px14Qx4Nx32K output tile + + Threads 0-1 handles row0, col 0-13, + Threads 2-3 handles row1, col 0-13, + ..
+ Threads 14-15 handles row7, col 0-13 */ + + uint row_id = thread_id / 2; + uint col_id = ( thread_id % 2 )*7; + + uint out_slm_read = col_id * 32 + row_id * TILE_W * 32 + lane_id * 4; + + __local uint4 *out_slm3 = (__local uint4*) &out_slm[ out_slm_read ]; + + /* feature maps are an array of slicePacks, each H,W position within the slice pack contains 32 8bit feature maps(channels) of 8 different batches */ + uint row_size_bytes = (_OW + OWPAD) * PACK * BATCH_PACK; + + /* slice_pack is a pack of 32 feature map tiles that are [OH][OW][4][32] that are stored within the full [K/32][N/4][OH][OW][4][32] output */ + uint slice_pack_size_bytes = row_size_bytes * (_OH + OHPAD); + + /* Each fmg writes [OH][OW][4][32]*/ + + uint output_depth_index = output_depth; + + uint batch_index = batch; + + uint slice_pack_addr_bytes = output_depth_index * slice_pack_size_bytes * ( BATCH_SIZE / BATCH_PACK ) + batch_index * slice_pack_size_bytes + (groupz_tile + row_id ) * row_size_bytes + (groupy_tile + col_id ) * PACK * BATCH_PACK; + + __global uint* output_write = (__global uint *) &outputs [ slice_pack_addr_bytes ]; + + /* Each lane writes 4K values of 4 batches and 8 different columns */ + + /* 4K values of K=0..31 */ + + const char mask_constant = 0xFF; + + __attribute__((opencl_unroll_hint(7))) + for ( int c=0; c<7; c++ ) + { + /* Get 4K4N values in uint4 - each uint containing 4N values of a K + swizzle the data and pack into another uint4 containing 4N4K values - each uint containing 4K values of a N. + Use block_writes for writing uint4 */ + + uint4 out_k4n4 = out_slm3 [ c*8 ]; + + //Pack 4K values of first n + uchar4 out_n0k4; + + out_n0k4.s0 = out_k4n4.s0 & mask_constant; + out_n0k4.s1 = out_k4n4.s1 & mask_constant; + out_n0k4.s2 = out_k4n4.s2 & mask_constant; + out_n0k4.s3 = out_k4n4.s3 & mask_constant; + + /* Assigning to uchar hence need to get the required bits to lower 8-bits*/ + + //Pack 4K values of second n + uchar4 out_n1k4; + + out_n1k4.s0 = (out_k4n4.s0 >> 8) & mask_constant; + out_n1k4.s1 = (out_k4n4.s1 >> 8) & mask_constant; + out_n1k4.s2 = (out_k4n4.s2 >> 8) & mask_constant; + out_n1k4.s3 = (out_k4n4.s3 >> 8) & mask_constant; + + //Pack 4K values of third n + uchar4 out_n2k4; + + out_n2k4.s0 = (out_k4n4.s0 >> 16) & mask_constant; + out_n2k4.s1 = (out_k4n4.s1 >> 16) & mask_constant; + out_n2k4.s2 = (out_k4n4.s2 >> 16) & mask_constant; + out_n2k4.s3 = (out_k4n4.s3 >> 16) & mask_constant; + + //Pack 4K values of fourth n + uchar4 out_n3k4; + + out_n3k4.s0 = (out_k4n4.s0 >> 24) & mask_constant; + out_n3k4.s1 = (out_k4n4.s1 >> 24) & mask_constant; + out_n3k4.s2 = (out_k4n4.s2 >> 24) & mask_constant; + out_n3k4.s3 = (out_k4n4.s3 >> 24) & mask_constant; + + uint4 out_n4k4; + + out_n4k4.s0 = as_uint ( out_n0k4 ); + out_n4k4.s1 = as_uint ( out_n1k4 ); + out_n4k4.s2 = as_uint ( out_n2k4 ); + out_n4k4.s3 = as_uint ( out_n3k4 ); + + intel_sub_group_block_write4 ( output_write , out_n4k4 ); + + output_write += 4*8; + } +} //end of kernel + +#undef SCAL +#undef QUANTIZATION \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_7x7_rep4.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_7x7_rep4.cl new file mode 100644 index 0000000..7030a2e --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_7x7_rep4.cl @@ -0,0 +1,1044 @@ +// Copyright (c) 2016-2017 Intel Corporation +// +// Licensed under the Apache License, Version 
2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include/mmad.cl" + +#define SCALE 0.11f + +#ifdef LIGHTWEIGHT_QUANTIZATION + +#define QUANTIZATION \ + out_write_N2K4[0].s0 = convert_uchar_sat((float)outvec0.s0 * SCALE + bias_f.s0); /*K= lane_id,N=0*/ \ + out_write_N2K4[0].s1 = convert_uchar_sat((float)outvec1.s0 * SCALE + bias_f.s1); /*K= lane_id + 8,N=0*/\ + out_write_N2K4[0].s2 = convert_uchar_sat((float)outvec2.s0 * SCALE + bias_f.s2); /*K= lane_id + 16,N=0*/\ + out_write_N2K4[0].s3 = convert_uchar_sat((float)outvec3.s0 * SCALE + bias_f.s3); /*K= lane_id + 24,N=0*/\ + \ + out_write_N2K4[0].s4 = convert_uchar_sat((float)outvec0.s1 * SCALE + bias_f.s0); /*K= lane_id,N=1*/\ + out_write_N2K4[0].s5 = convert_uchar_sat((float)outvec1.s1 * SCALE + bias_f.s1); /*K= lane_id + 8,N=1*/\ + out_write_N2K4[0].s6 = convert_uchar_sat((float)outvec2.s1 * SCALE + bias_f.s2); /*K= lane_id + 16,N=1*/\ + out_write_N2K4[0].s7 = convert_uchar_sat((float)outvec3.s1 * SCALE + bias_f.s3); /*K= lane_id + 24,N=1*/\ + \ + out_write_N2K4[1].s0 = convert_uchar_sat((float)outvec0.s2 * SCALE + bias_f.s0); /*K= lane_id,N=2*/\ + out_write_N2K4[1].s1 = convert_uchar_sat((float)outvec1.s2 * SCALE + bias_f.s1); /*K= lane_id + 8,N=2*/\ + out_write_N2K4[1].s2 = convert_uchar_sat((float)outvec2.s2 * SCALE + bias_f.s2); /*K= lane_id + 16,N=2*/\ + out_write_N2K4[1].s3 = convert_uchar_sat((float)outvec3.s2 * SCALE + bias_f.s3); /*K= lane_id + 24,N=2*/\ + \ + out_write_N2K4[1].s4 = convert_uchar_sat((float)outvec0.s3 * SCALE + bias_f.s0); /*K= lane_id,N=3*/\ + out_write_N2K4[1].s5 = convert_uchar_sat((float)outvec1.s3 * SCALE + bias_f.s1); /*K= lane_id + 8,N=3*/\ + out_write_N2K4[1].s6 = convert_uchar_sat((float)outvec2.s3 * SCALE + bias_f.s2); /*K= lane_id + 16,N=3*/\ + out_write_N2K4[1].s7 = convert_uchar_sat((float)outvec3.s3 * SCALE + bias_f.s3); /*K= lane_id + 24,N=3*/ + +#elif NO_QUANTIZATION + +#define QUANTIZATION \ + out_write_N2K4[0].s0 = convert_uchar_sat(outvec0.s0); /*K= lane_id,N=0*/ \ + out_write_N2K4[0].s1 = convert_uchar_sat(outvec1.s0); /*K= lane_id + 8,N=0*/\ + out_write_N2K4[0].s2 = convert_uchar_sat(outvec2.s0); /*K= lane_id + 16,N=0*/\ + out_write_N2K4[0].s3 = convert_uchar_sat(outvec3.s0); /*K= lane_id + 24,N=0*/\ + \ + out_write_N2K4[0].s4 = convert_uchar_sat(outvec0.s1); /*K= lane_id,N=1*/\ + out_write_N2K4[0].s5 = convert_uchar_sat(outvec1.s1); /*K= lane_id + 8,N=1*/\ + out_write_N2K4[0].s6 = convert_uchar_sat(outvec2.s1); /*K= lane_id + 16,N=1*/\ + out_write_N2K4[0].s7 = convert_uchar_sat(outvec3.s1); /*K= lane_id + 24,N=1*/\ + \ + out_write_N2K4[1].s0 = convert_uchar_sat(outvec0.s2); /*K= lane_id,N=2*/\ + out_write_N2K4[1].s1 = convert_uchar_sat(outvec1.s2); /*K= lane_id + 8,N=2*/\ + out_write_N2K4[1].s2 = convert_uchar_sat(outvec2.s2); /*K= lane_id + 16,N=2*/\ + out_write_N2K4[1].s3 = convert_uchar_sat(outvec3.s2); /*K= lane_id + 24,N=2*/\ + \ + out_write_N2K4[1].s4 = convert_uchar_sat(outvec0.s3); /*K= lane_id,N=3*/\ + out_write_N2K4[1].s5 = convert_uchar_sat(outvec1.s3); /*K= lane_id + 8,N=3*/\ + out_write_N2K4[1].s6 = 
convert_uchar_sat(outvec2.s3); /*K= lane_id + 16,N=3*/\ + out_write_N2K4[1].s7 = convert_uchar_sat(outvec3.s3); /*K= lane_id + 24,N=3*/ + +#else + +#define QUANTIZATION \ + out_write_N2K4[0].s0 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s0) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); /*K= lane_id,N=0*/ \ + out_write_N2K4[0].s1 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s0) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); /*K= lane_id + 8,N=0*/\ + out_write_N2K4[0].s2 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s0) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); /*K= lane_id + 16,N=0*/\ + out_write_N2K4[0].s3 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s0) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); /*K= lane_id + 24,N=0*/\ + \ + out_write_N2K4[0].s4 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s1) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); /*K= lane_id,N=1*/\ + out_write_N2K4[0].s5 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s1) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); /*K= lane_id + 8,N=1*/\ + out_write_N2K4[0].s6 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s1) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); /*K= lane_id + 16,N=1*/\ + out_write_N2K4[0].s7 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s1) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); /*K= lane_id + 24,N=1*/\ + \ + out_write_N2K4[1].s0 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s2) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); /*K= lane_id,N=2*/\ + out_write_N2K4[1].s1 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s2) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); /*K= lane_id + 8,N=2*/\ + out_write_N2K4[1].s2 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s2) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); /*K= lane_id + 16,N=2*/\ + out_write_N2K4[1].s3 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s2) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); /*K= lane_id + 24,N=2*/\ + \ + out_write_N2K4[1].s4 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s3) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N)); /*K= lane_id,N=3*/\ + out_write_N2K4[1].s5 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s3) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N)); /*K= lane_id + 8,N=3*/\ + out_write_N2K4[1].s6 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s3) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N)); /*K= lane_id + 16,N=3*/\ + out_write_N2K4[1].s7 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s3) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N)); /*K= lane_id + 24,N=3*/ + +#endif + +// mapping to clDNN +#define _MMAD_4x8(C, A, B) MMAD_4x8(A, B, C) +#define _OD OUTPUT_FEATURE_NUM +#define _OW OUTPUT_SIZE_X +#define _OH OUTPUT_SIZE_Y +#define OWPAD (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X) +#define OHPAD (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y) +#define _IH INPUT0_SIZE_Y +#define _IW INPUT0_SIZE_X +#define _ID INPUT0_FEATURE_NUM +#define K_HEIGHT FILTER_SIZE_Y +#define K_WIDTH FILTER_SIZE_X +#define BATCH_SIZE OUTPUT_BATCH_NUM + +#define IHPAD (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) +#define IWPAD (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) 
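+// (Worked example, assuming symmetric padding of one pixel per side: IHPAD = IWPAD = 2, so the kernel's input row pitch below is (_IW + 2) * BATCH_PACK int8 elements and a full input plane is (_IH + 2) * (_IW + 2) * BATCH_PACK.)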
+#define K_STRIDE STRIDE_SIZE_X +// end of mapping + +// for now kernel stride is square +#define K_WSTRIDE K_STRIDE +#define K_HSTRIDE K_STRIDE + +#define PACK 32 +#define BATCH_PACK 4 + +__attribute__((intel_reqd_sub_group_size(8))) +KERNEL(convolution_mmad_slm_2x14_rep4)( +__global int8 *inputs, +__global uchar* outputs, +__global int8* weights, +#if BIAS_TERM + __global BIAS_TYPE* biases, +#endif +#if QUANTIZATION_TERM + const __global float* quantizations, +#endif +#if CALIBRATION_TERM + const __global float* calibrations, +#endif + uint split_idx +) +{ + const uint TILE_H = OUT_BLOCK_HEIGHT*LOCAL_SIZE_Z; + const uint TILE_W = OUT_BLOCK_WIDTH*LOCAL_SIZE_Y; + + ushort fmg = get_group_id(0); // Output Depth + ushort group_y = get_group_id(1); // Output Width + ushort group_z = get_group_id(2); // Output Height + + /* 16,1,8 WG , SIMD8 - 16 HW threads in a WG + threads 0-1 : ( lid_x:0-15,lid_y:0,lid_z:0) + threads 2-3 : ( lid_x:0-15,lid_y:0,lid_z:1) + .. + threads 12-13: ( lid_x:0-15, lid_y:0,lid_z:6) + threads 14-15: ( lid_x:0-15, lid_y:0,lid_z:7) + */ + + /* Thread, local IDs */ + ushort thread_id = get_sub_group_id(); + ushort threadid_mod_2 = thread_id % 2; + ushort threadid_mod_8 = thread_id % 8; + + ushort lid_x = get_local_id(0); + ushort lid_z = get_local_id(2); + + uchar lane_id = get_sub_group_local_id(); + + /* 32-bit signed accumulator , 112 output registers for 1Px7Qx4Nx32K output tile size + Will be converted to 8-bits before final write */ + + int4 out_07 [ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ] = {0}; // For output channels 0-7 + int4 out_815[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ] = {0}; // For output channels 8-15 + int4 out_1623[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ] = {0}; // For output channels 16-23 + int4 out_2431[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ] = {0}; // For output channels 24-31 + + /* Account for batching */ + + ushort batch = ( fmg*LOCAL_SIZE_X*4 ) /_OD; // Each thread processing 32 output_channels and each fmg processing 64 output channels , LOCAL_SIZE_X is only 16 + + // Size calculated for int8 elements + uint input_size = (_IH + IHPAD) * (_IW + IWPAD) * BATCH_PACK ; + + uint in_addr_offset = batch*input_size; + + /* Goto activation tile for work group, offset is w.r.t int8 array */ + + uint groupy_tile = TILE_W*group_y; + uint groupz_tile = TILE_H*group_z; + + in_addr_offset += (groupz_tile * K_STRIDE) * (_IW + IWPAD) * BATCH_PACK + (groupy_tile * K_STRIDE) * BATCH_PACK; + + /* SLM space for Activation, Weights + ( 16,1,8 ) Workgroup - 7 tiles along Y direction and 64 different output channels + 2 threads used to load global memory + Activation - 9Hx9Wx4Nx32C Weights -3Rx3Sx64Kx32C */ + + __local int8 act_slm [ 9*9*4 ]; + __local int8 weight_slm [ 9*64 ]; + + /* 9Hx9Wx4Nx32C activation tile written into SLM. 
Distributed among 16 threads in the Workgroup + threads 0-1 write 9x4x32 of H=0, W=0...8 + threads 2-3 write 9x4x32 of H=1, W=0...8 + threads 4-5 write 9x4x32 of H=2, W=0...8 + threads 6-7 write 9x4x32 of H=3, W=0...8 + threads 8-9 write 9x4x32 of H=4, W=0...8 + threads 10-11 write 9x4x32 of H=5,W=0...8 + threads 12-13 write 9x4x32 of H=6,W=0...8 + threads 14-15 write 9x4x32 of H=7,W=0...8 + threads 14-15 write 9x4x32 of H=8,W=0...8 */ + + /* Goto activation tile for thread in group */ + + uint row_offset = thread_id / 2; + + if ( thread_id >= 14 ) + { + row_offset = 7; + } + + // In addr offset for the particular thread + in_addr_offset += row_offset * K_STRIDE * (_IW + IWPAD ) * BATCH_PACK ; + + /* Activation SLM indices */ + uint act_slm_write = row_offset * ( TILE_W + 2) * BATCH_PACK; + uint act_slm_read = OUT_BLOCK_HEIGHT * lid_z * ( TILE_W + 2) * BATCH_PACK ; + + /* 9RSx64Kx32C Weight Block in SLM + thread0 handles ( reads from global ) w(0,0),w(0,1),w(0,2) of K=0,1 ( k=0..15 ) + thread1 handles w(0,0),w(0,1),w(0,2) of K=2,3 ( k=16..31) + thread2 handles w(1,0),w(1,1) of K=0,1 ( k=0..15) + thread3 handles w(1,0),w(1,1) of K=2,3 ( k=16..31) + thread4 handles w(1,2),w(2,0) of K=0,1 ( k=0..15) + thread5 handles w(1,2),w(2,0) of K=2,3 ( k=16..31) + thread6 handles w(2,1),w(2,2) of K=0,1 ( k=0..15) + thread7 handles w(2,1),w(2,2) of K=2,3 ( k=16..31) + + Similarly threads8-15 handles for K=4,5,6,7 + + Weight Layout in SLM + + w(R=0,S=0,k=0..7,C=0..15),w(R=0,S=0,k=32..39,C=0..15) + w(R=0,S=0,k=0..7,C=16..31),w(R=0,S=0,k=32..39,C=16..31) + + Above interleaving present to avoid SLM Bank conflicts when fused threads read from SLM + Thread0 will read k=0..31, thread1 will read k=32..63 + + First all output channels are present in SLM, then next weight pixel is present in SLM */ + + #define NUM_FILTERS (K_HEIGHT * K_WIDTH) + + uint output_depth = fmg % ( _OD / ( LOCAL_SIZE_X * 4 ) ); //LOCAL_SIZE_X=16, 64 output channels used + + uint weight_size_CRS = ( _ID / PACK ) * NUM_FILTERS * 8; //8 output channels packed inside + + // Global weight addr for workgroup + uint weight_global_addr_offset = output_depth * 8 * weight_size_CRS ; //64 output channels per workgroup + + /* Global weight address for thread */ + + // Goto appropriate output channel in weights + uint weight_global_channel_offset = threadid_mod_2 * 2 * weight_size_CRS ; + + uint slm_channel_offset = threadid_mod_2; + uint bc_fused_thread_offset = 0; + + if ( thread_id >= 8 ) + { + bc_fused_thread_offset = 1; + + weight_global_channel_offset = 4 * weight_size_CRS + slm_channel_offset * weight_size_CRS * 2 ; + } + + // Goto appropriate pixel in weights + + uint weight_global_pixel_offset = 0; + uint slm_pixel_offset = 0; + + if ( threadid_mod_8 >=2 ) + { + /* First three pixels handled by threads 0-1, then 2 pixels handled by two threads */ + + weight_global_pixel_offset = 3*8 + ( ( (threadid_mod_8/2) - 1 )*2*8 ); + slm_pixel_offset = 3*64 + ( ( (threadid_mod_8/2) - 1 )*2*64 ); + } + + weight_global_addr_offset += weight_global_channel_offset + weight_global_pixel_offset; + + /* Weight slm write index */ + + uint slm_write_weight = slm_pixel_offset + slm_channel_offset * 32 + bc_fused_thread_offset * 4; + + /* Weight slm read index */ + + /* Thread 0 reads output channels 0-15, thread 1 handles output channels 16-31, data present in interleaved + manner in SLM + Data layout in SLM + + w(0,0) C=0..7, K = 0..7 | w(0,0) C=0..7, K = 32..39 + w(0,0) C=8..15,K=0..7 | w(0,0) C=8..15,K = 32..39 + w(0,0) C=0..7, K=8..15 | w(0,0) C=0..7, K = 40..47 + w(0,0)
C=8..15,K=8..15 | w(0,0) C=8..15,K= 40..47 + + */ + uint wt_slm_rd_offset = threadid_mod_2*4; + + int kd; + + __attribute__((opencl_unroll_hint(1))) + for(kd = 0; kd < ( _ID / PACK ) ; kd++) + { + { + /* Load Activation from global to SLM */ + + int in_addr = kd * (_IH + IHPAD) * (_IW + IWPAD) * BATCH_SIZE + in_addr_offset; + + __global uint *activation_tile = (__global uint*)&inputs[ in_addr ]; + + __local uint *act_slm_ptr = (__local uint *) &act_slm [ act_slm_write ]; + + /* The odd thread in fused pair will start from next 4x8 block */ + + activation_tile += threadid_mod_2*4*8; + act_slm_ptr += threadid_mod_2*4*8; + + int4 act_col_0 = as_int4( intel_sub_group_block_read4(activation_tile) );//col 0 + int4 act_col_1 = as_int4( intel_sub_group_block_read4(activation_tile + 8*8) );//col 2 + int4 act_col_2 = as_int4( intel_sub_group_block_read4(activation_tile + 2*8*8) );//col 4 + int4 act_col_3 = as_int4( intel_sub_group_block_read4(activation_tile + 3*8*8) );//col 6 + + SLM_BLOCK_WRITE_4 ( act_slm_ptr , as_uint4 ( act_col_0 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 8*8 ) , as_uint4 ( act_col_1 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 2*8*8 ) , as_uint4 ( act_col_2 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 3*8*8 ) , as_uint4 ( act_col_3 ) ); + + if ( threadid_mod_2 == 0 ) + { + int4 act_col_4 = as_int4( intel_sub_group_block_read4(activation_tile + 4*8*8) ); + + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 4*8*8 ) , as_uint4 ( act_col_4 ) ); + } + + if ( thread_id >=14) + { + activation_tile = activation_tile + 1 * (_IW + IWPAD ) * BATCH_PACK * 8; + act_slm_ptr = act_slm_ptr + (TILE_W + 2) * BATCH_PACK *8; + + int4 act_col_9 = as_int4( intel_sub_group_block_read4(activation_tile) ); + int4 act_col_10 = as_int4( intel_sub_group_block_read4(activation_tile + 8*8) ); + int4 act_col_11 = as_int4( intel_sub_group_block_read4(activation_tile + 2*8*8) ); + int4 act_col_12 = as_int4( intel_sub_group_block_read4(activation_tile + 3*8*8) ); + + SLM_BLOCK_WRITE_4 ( act_slm_ptr , as_uint4 ( act_col_9 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 8*8 ) , as_uint4 ( act_col_10 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 2*8*8 ) , as_uint4 ( act_col_11 ) ); + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 3*8*8 ) , as_uint4 ( act_col_12 ) ); + + if ( threadid_mod_2 == 0 ) + { + int4 act_col_13 = as_int4( intel_sub_group_block_read4(activation_tile + 4*8*8) ); + + SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 4*8*8 ) , as_uint4 ( act_col_13 ) ); + } + } + + /* load weights from global to weight_slm */ + + int weight_addr = kd * NUM_FILTERS * 8 + weight_global_addr_offset; + + __global uint *weight_tile = (__global uint*)&weights [ weight_addr ]; + __local uint *wt_slm_ptr = (__local uint *)&weight_slm [ slm_write_weight ]; + + __global uint *weight_tile_2 = weight_tile; + __local uint *wt_slm_ptr_2 = wt_slm_ptr; + + int4 w0 = as_int4 ( intel_sub_group_block_read4( weight_tile ) ); // Pixel1 K=0..7 C=0..15 + int4 w1 = as_int4 ( intel_sub_group_block_read4( weight_tile + 4*8 ) ); // Pixel1 K=0..7 C=16..31 + int4 w2 = as_int4 ( intel_sub_group_block_read4( weight_tile + 8*8 ) ); // Pixel2 K=0..7 C=0..15 + int4 w3 = as_int4 ( intel_sub_group_block_read4( weight_tile + 12*8 ) );// Pixel2 K=0..7 C=16..31 + + // Goto next output channel + weight_tile += weight_size_CRS*8; + + int4 w4 = as_int4 ( intel_sub_group_block_read4( weight_tile ) ); // Pixel1 K=8..15 C=0..15 + int4 w5 = as_int4 ( intel_sub_group_block_read4( weight_tile + 4*8 ) ); // Pixel1 K=8..15 C=16..31 + int4 w6 = as_int4 ( intel_sub_group_block_read4( weight_tile + 8*8 ) ); // 
Pixel2 K=8..15 C=0..15 + int4 w7 = as_int4 ( intel_sub_group_block_read4( weight_tile + 12*8 ) );// Pixel2 K=8..15 C=16..31 + + SLM_BLOCK_WRITE_4 ( wt_slm_ptr, as_uint4 ( w0 ) ); + SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 8*8 ) , as_uint4 ( w1 ) ); + SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 ), as_uint4 ( w2 ) ); + SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 + 8*8 ), as_uint4 ( w3 ) ); + + wt_slm_ptr += 16*8; + + SLM_BLOCK_WRITE_4 ( wt_slm_ptr , as_uint4 ( w4 ) ); + SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 8*8 ) , as_uint4 ( w5 ) ); + SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 ) , as_uint4 ( w6 ) ); + SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 + 8*8 ) , as_uint4 ( w7 ) ); + + if( threadid_mod_8 < 2 ) + { + // Goto next pixel + weight_tile_2 += 16*8; + wt_slm_ptr_2 += 2*64*8; + + int4 w0 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 ) ); // Pixel1 K=0..7 C=0..15 + int4 w1 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 + 4*8 ) ); // Pixel1 K=0..7 C=16..31 + + // Goto next output channel + weight_tile_2 += weight_size_CRS*8; + + int4 w4 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 ) ); // Pixel1 K=8..15 C=0..15 + int4 w5 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 + 4*8 ) ); // Pixel1 K=8..15 C=16..31 + + SLM_BLOCK_WRITE_4 ( wt_slm_ptr_2, as_uint4 ( w0 ) ); + SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr_2 + 8*8 ) , as_uint4 ( w1 ) ); + + wt_slm_ptr_2 += 16*8; + + SLM_BLOCK_WRITE_4 ( wt_slm_ptr_2 , as_uint4 ( w4 ) ); + SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr_2 + 8*8 ) , as_uint4 ( w5 ) ); + } + } + + // Synchronize SLM writes across workgroup + barrier(CLK_LOCAL_MEM_FENCE); + + if ( lid_z <= 6 ) + { + uint wt_slm_rd = wt_slm_rd_offset; + + __local uint *slm_ptr0 = (__local uint *) &act_slm[ act_slm_read ]; + __local uint *slm_ptr1 = (__local uint *) &weight_slm[ wt_slm_rd ]; + + /* balancing load of weights, activations */ + int8 weights_reg[3]; //24 registers + int4 act_reg[18]; //72 registers + uint slm_read_pixel_offset = 64*8; + + /********************************************************************************************************** + First phase - multiply first row of weights and 1st row of activations + ***********************************************************************************************************/ + + /* Load weights from SLM into registers - row0, output channels 0..7 */ + + { + __local uint *slm_ptrw0 = slm_ptr1; + + weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); + weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); + slm_ptrw0 += slm_read_pixel_offset; + + weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); + weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); + slm_ptrw0 += slm_read_pixel_offset; + + weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); + weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); + } + + /* load 1Hx9Wx4N inputs, Activation row0 */ + + __attribute__((opencl_unroll_hint(9))) + for (int ic = 0; ic < 9; ic++) + { + /* Load activations from SLM into registers */ + + uint slm_offset = ic * BATCH_PACK * 8 ; + + act_reg [ ic ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; + } + + /* Convolve */ + + /* order the mmad instructions to minimize dependency on src0,dst - also try to maximise reuse of weights-reg*/ + + /* Output channels 0-7 */ + + out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[0], weights_reg[0] ); + out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[1], weights_reg[0] ); + out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[2], 
weights_reg[0] ); + out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[3], weights_reg[0] ); + out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[4], weights_reg[0] ); + out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[5], weights_reg[0] ); + out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[6], weights_reg[0] ); + + out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[1], weights_reg[1] ); + out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[2], weights_reg[1] ); + out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[3], weights_reg[1] ); + out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[4], weights_reg[1] ); + out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[5], weights_reg[1] ); + out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[6], weights_reg[1] ); + out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[7], weights_reg[1] ); + + out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[2], weights_reg[2] ); + out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[3], weights_reg[2] ); + out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[4], weights_reg[2] ); + out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[5], weights_reg[2] ); + out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[6], weights_reg[2] ); + out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[7], weights_reg[2] ); + out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[8], weights_reg[2] ); + + /* Load weights from SLM into registers - row0, output channels 8..15 */ + + { + __local uint *slm_ptrw0 = slm_ptr1 + 2*8*8; + + weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); + weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); + slm_ptrw0 += slm_read_pixel_offset; + + weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); + weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); + slm_ptrw0 += slm_read_pixel_offset; + + weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); + weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); + } + + out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[0], weights_reg[0] ); + out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[1], weights_reg[0] ); + out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[2], weights_reg[0] ); + out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[3], weights_reg[0] ); + out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[4], weights_reg[0] ); + out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[5], weights_reg[0] ); + out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[6], weights_reg[0] ); + + out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[1], weights_reg[1] ); + out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[2], weights_reg[1] ); + out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[3], weights_reg[1] ); + out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[4], weights_reg[1] ); + out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[5], weights_reg[1] ); + out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[6], weights_reg[1] ); + out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[7], weights_reg[1] ); + + out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[2], weights_reg[2] ); + out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[3], weights_reg[2] ); + out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[4], weights_reg[2] ); + out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[5], weights_reg[2] ); + out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[6], weights_reg[2] ); + out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[7], weights_reg[2] ); + out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[8], weights_reg[2] ); + + /* Load weights from SLM into registers - row0, output channels 
16..23 */ + { + __local uint *slm_ptrw0 = slm_ptr1 + 4*8*8; + + weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); + weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); + slm_ptrw0 += slm_read_pixel_offset; + + weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); + weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); + slm_ptrw0 += slm_read_pixel_offset; + + weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); + weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); + } + + out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[0], weights_reg[0] ); + out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[1], weights_reg[0] ); + out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[2], weights_reg[0] ); + out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[3], weights_reg[0] ); + out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[4], weights_reg[0] ); + out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[5], weights_reg[0] ); + out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[6], weights_reg[0] ); + + /* load 1Hx9Wx4N inputs, Activation row1 */ + + uint slm_row_offset_2 = 1*(TILE_W + 2)*BATCH_PACK*8; + + __attribute__((opencl_unroll_hint(9))) + for (int ic = 0; ic < 9; ic++) + { + /* Load activations from SLM into registers */ + + uint slm_offset = slm_row_offset_2 + ic * BATCH_PACK * 8 ; + + act_reg [ ic + 9 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; + } + + out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[1], weights_reg[1] ); + out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[2], weights_reg[1] ); + out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[3], weights_reg[1] ); + out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[4], weights_reg[1] ); + out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[5], weights_reg[1] ); + out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[6], weights_reg[1] ); + out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[7], weights_reg[1] ); + + out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[2], weights_reg[2] ); + out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[3], weights_reg[2] ); + out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[4], weights_reg[2] ); + out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[5], weights_reg[2] ); + out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[6], weights_reg[2] ); + out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[7], weights_reg[2] ); + out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[8], weights_reg[2] ); + + /* Load weights from SLM into registers - row0, output channels 24..31 */ + { + __local uint *slm_ptrw0 = slm_ptr1 + 6*8*8; + + weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); + weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); + slm_ptrw0 += slm_read_pixel_offset; + + weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); + weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); + slm_ptrw0 += slm_read_pixel_offset; + + weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); + weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); + } + + out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[0], weights_reg[0] ); + out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[1], weights_reg[0] ); + out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[2], weights_reg[0] ); + out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[3], weights_reg[0] ); + out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[4], weights_reg[0] ); 
+ out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[5], weights_reg[0] ); + out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[6], weights_reg[0] ); + + out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[1], weights_reg[1] ); + out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[2], weights_reg[1] ); + out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[3], weights_reg[1] ); + out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[4], weights_reg[1] ); + out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[5], weights_reg[1] ); + out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[6], weights_reg[1] ); + out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[7], weights_reg[1] ); + + out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[2], weights_reg[2] ); + out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[3], weights_reg[2] ); + out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[4], weights_reg[2] ); + out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[5], weights_reg[2] ); + out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[6], weights_reg[2] ); + out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[7], weights_reg[2] ); + out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[8], weights_reg[2] ); + + /********************************************************************************************************** + Second phase - multiply second row of weights and second row of activations + ***********************************************************************************************************/ + + /* Load weights from SLM into registers - row1, output channels 0..7 */ + { + __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset; + + weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); + weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); + slm_ptrw1 += slm_read_pixel_offset; + + weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); + weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); + slm_ptrw1 += slm_read_pixel_offset; + + weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); + weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); + } + + out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[9], weights_reg[0] ); + out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[10], weights_reg[0] ); + out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[11], weights_reg[0] ); + out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[12], weights_reg[0] ); + out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[13], weights_reg[0] ); + out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[14], weights_reg[0] ); + out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[15], weights_reg[0] ); + + out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[10], weights_reg[1] ); + out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[11], weights_reg[1] ); + out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[12], weights_reg[1] ); + out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[13], weights_reg[1] ); + out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[14], weights_reg[1] ); + out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[15], weights_reg[1] ); + out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[16], weights_reg[1] ); + + out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[11], weights_reg[2] ); + out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[12], weights_reg[2] ); + out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[13], weights_reg[2] ); + out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[14], weights_reg[2] ); + out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[15], weights_reg[2] ); + out_07[ 5 ] = 
_MMAD_4x8 ( out_07[ 5 ], act_reg[16], weights_reg[2] ); + out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[17], weights_reg[2] ); + + /* Load weights from SLM into registers - row1, output channels 8..15 */ + { + __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset + 2*8*8; + + weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); + weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); + slm_ptrw1 += slm_read_pixel_offset; + + weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); + weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); + slm_ptrw1 += slm_read_pixel_offset; + + weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); + weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); + } + + out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[9], weights_reg[0] ); + out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[10], weights_reg[0] ); + out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[11], weights_reg[0] ); + out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[12], weights_reg[0] ); + out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[13], weights_reg[0] ); + out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[14], weights_reg[0] ); + out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[15], weights_reg[0] ); + + out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[10], weights_reg[1] ); + out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[11], weights_reg[1] ); + out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[12], weights_reg[1] ); + out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[13], weights_reg[1] ); + out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[14], weights_reg[1] ); + out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[15], weights_reg[1] ); + out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[16], weights_reg[1] ); + + out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[11], weights_reg[2] ); + out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[12], weights_reg[2] ); + out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[13], weights_reg[2] ); + out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[14], weights_reg[2] ); + out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[15], weights_reg[2] ); + out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[16], weights_reg[2] ); + out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[17], weights_reg[2] ); + + /* Load weights from SLM into registers - row1, output channels 16..23 */ + { + __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset + 4*8*8; + + weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); + weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); + slm_ptrw1 += slm_read_pixel_offset; + + weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); + weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); + slm_ptrw1 += slm_read_pixel_offset; + + weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); + weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); + } + + out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[9], weights_reg[0] ); + out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[10], weights_reg[0] ); + out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[11], weights_reg[0] ); + out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[12], weights_reg[0] ); + out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[13], weights_reg[0] ); + out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[14], weights_reg[0] ); + out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[15], weights_reg[0] 
); + + /* load 1Hx9Wx4N inputs, Activation row2 */ + + uint slm_row_offset_3 = 2*(TILE_W + 2)*BATCH_PACK*8; + + __attribute__((opencl_unroll_hint(9))) + for (int ic = 0; ic < 9; ic++) + { + /* Load activations from SLM into registers */ + + uint slm_offset = slm_row_offset_3 + ic * BATCH_PACK * 8 ; + + act_reg [ ic ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; + } + + out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[10], weights_reg[1] ); + out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[11], weights_reg[1] ); + out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[12], weights_reg[1] ); + out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[13], weights_reg[1] ); + out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[14], weights_reg[1] ); + out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[15], weights_reg[1] ); + out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[16], weights_reg[1] ); + + out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[11], weights_reg[2] ); + out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[12], weights_reg[2] ); + out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[13], weights_reg[2] ); + out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[14], weights_reg[2] ); + out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[15], weights_reg[2] ); + out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[16], weights_reg[2] ); + out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[17], weights_reg[2] ); + + /* Load weights from SLM into registers - row1, output channels 24..31 */ + { + __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset + 6*8*8; + + weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); + weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); + slm_ptrw1 += slm_read_pixel_offset; + + weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); + weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); + slm_ptrw1 += slm_read_pixel_offset; + + weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); + weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); + } + + out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[9], weights_reg[0] ); + out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[10], weights_reg[0] ); + out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[11], weights_reg[0] ); + out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[12], weights_reg[0] ); + out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[13], weights_reg[0] ); + out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[14], weights_reg[0] ); + out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[15], weights_reg[0] ); + + out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[10], weights_reg[1] ); + out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[11], weights_reg[1] ); + out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[12], weights_reg[1] ); + out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[13], weights_reg[1] ); + out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[14], weights_reg[1] ); + out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[15], weights_reg[1] ); + out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[16], weights_reg[1] ); + + out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[11], weights_reg[2] ); + out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[12], weights_reg[2] ); + out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[13], weights_reg[2] ); + out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[14], weights_reg[2] ); + out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[15], 
weights_reg[2] ); + out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[16], weights_reg[2] ); + out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[17], weights_reg[2] ); + + /********************************************************************************************************** + Third phase - multiply third row of weights and third row of activations + ***********************************************************************************************************/ + + /* Load weights from SLM into registers - row2, output channels 0..7 */ + { + __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset; + + weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); + weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); + slm_ptrw2 += slm_read_pixel_offset; + + weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); + weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); + slm_ptrw2 += slm_read_pixel_offset; + + weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); + weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); + } + + out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[0], weights_reg[0] ); + out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[1], weights_reg[0] ); + out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[2], weights_reg[0] ); + out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[3], weights_reg[0] ); + out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[4], weights_reg[0] ); + out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[5], weights_reg[0] ); + out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[6], weights_reg[0] ); + + out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[1], weights_reg[1] ); + out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[2], weights_reg[1] ); + out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[3], weights_reg[1] ); + out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[4], weights_reg[1] ); + out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[5], weights_reg[1] ); + out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[6], weights_reg[1] ); + out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[7], weights_reg[1] ); + + out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[2], weights_reg[2] ); + out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[3], weights_reg[2] ); + out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[4], weights_reg[2] ); + out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[5], weights_reg[2] ); + out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[6], weights_reg[2] ); + out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[7], weights_reg[2] ); + out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[8], weights_reg[2] ); + + /* Load weights from SLM into registers - row2, output channels 8..15 */ + { + __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset + 2*8*8; + + weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); + weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); + slm_ptrw2 += slm_read_pixel_offset; + + weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); + weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); + slm_ptrw2 += slm_read_pixel_offset; + + weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); + weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); + } + + out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[0], weights_reg[0] ); + out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[1], weights_reg[0] ); + out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[2], weights_reg[0] ); + out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 
], act_reg[3], weights_reg[0] ); + out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[4], weights_reg[0] ); + out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[5], weights_reg[0] ); + out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[6], weights_reg[0] ); + + out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[1], weights_reg[1] ); + out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[2], weights_reg[1] ); + out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[3], weights_reg[1] ); + out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[4], weights_reg[1] ); + out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[5], weights_reg[1] ); + out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[6], weights_reg[1] ); + out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[7], weights_reg[1] ); + + out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[2], weights_reg[2] ); + out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[3], weights_reg[2] ); + out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[4], weights_reg[2] ); + out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[5], weights_reg[2] ); + out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[6], weights_reg[2] ); + out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[7], weights_reg[2] ); + out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[8], weights_reg[2] ); + + /* Load weights from SLM into registers - row2, output channels 16..23 */ + { + __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset + 4*8*8; + + weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); + weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); + slm_ptrw2 += slm_read_pixel_offset; + + weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); + weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); + slm_ptrw2 += slm_read_pixel_offset; + + weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); + weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); + } + + out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[0], weights_reg[0] ); + out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[1], weights_reg[0] ); + out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[2], weights_reg[0] ); + out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[3], weights_reg[0] ); + out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[4], weights_reg[0] ); + out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[5], weights_reg[0] ); + out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[6], weights_reg[0] ); + + out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[1], weights_reg[1] ); + out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[2], weights_reg[1] ); + out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[3], weights_reg[1] ); + out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[4], weights_reg[1] ); + out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[5], weights_reg[1] ); + out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[6], weights_reg[1] ); + out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[7], weights_reg[1] ); + + out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[2], weights_reg[2] ); + out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[3], weights_reg[2] ); + out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[4], weights_reg[2] ); + out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[5], weights_reg[2] ); + out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[6], weights_reg[2] ); + out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[7], weights_reg[2] ); + out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[8], weights_reg[2] ); + + /* Load weights from 
SLM into registers - row2, output channels 24..31 */ + { + __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset + 6*8*8; + + weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); + weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); + slm_ptrw2 += slm_read_pixel_offset; + + weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); + weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); + slm_ptrw2 += slm_read_pixel_offset; + + weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); + weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); + } + + out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[0], weights_reg[0] ); + out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[1], weights_reg[0] ); + out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[2], weights_reg[0] ); + out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[3], weights_reg[0] ); + out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[4], weights_reg[0] ); + out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[5], weights_reg[0] ); + out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[6], weights_reg[0] ); + + out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[1], weights_reg[1] ); + out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[2], weights_reg[1] ); + out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[3], weights_reg[1] ); + out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[4], weights_reg[1] ); + out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[5], weights_reg[1] ); + out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[6], weights_reg[1] ); + out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[7], weights_reg[1] ); + + out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[2], weights_reg[2] ); + out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[3], weights_reg[2] ); + out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[4], weights_reg[2] ); + out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[5], weights_reg[2] ); + out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[6], weights_reg[2] ); + out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[7], weights_reg[2] ); + out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[8], weights_reg[2] ); + } + + // Make sure all threads in the WG have finished computing before the next depth tile of activations and weights is loaded into SLM + barrier(CLK_LOCAL_MEM_FENCE); + } //for kd + + /**************************************************************************************************************** + *******************************Output Write Stage**************************************************************** + ****************************************************************************************************************/ + /* + Outputs are passed through the activation function and quantized to 8 bits before writing. + Output is in the same format as the input: [K/32][N/4][P][Q][4N][32K] */ + + /******************* Write output to global memory *************************************/ + + /* Quantize and pack 4x1 byte - from consecutive n-coordinates. + Each thread produces [1P][7Q][4N][32K]. + Write a uint32 from each lane; the entire thread writes 32 consecutive K-coordinates. + + Assume one output row is 32 uints ( 32 channels, four batches for each channel - 4NK ). + The first 32 channels are stored as a full 7x7x4x32 block, then the next 32 channels follow. + */ + + if( lid_z <= 6 ) + { + /* Feature maps are an array of slice packs; each H,W position within a slice pack contains 32 8-bit feature maps (channels) of 4 different batches */ + uint 
row_size_bytes = (_OW + OWPAD) * PACK * BATCH_PACK; + + /* slice_pack is a pack of 32 feature map tiles that are [OH][OW][4][32] that are stored within the full [K/32][N/4][OH][OW][4][32] output */ + uint slice_pack_size_bytes = row_size_bytes * (_OH + OHPAD); + + /* Each output_depth WG writes 64 output channels */ + + uint output_depth_index = output_depth*2 + threadid_mod_2; + uint batch_index = batch; + + /* Each WG produces entire 7x7 output, hence no group_y, group_z tiling */ + + uint output_offset_x = groupy_tile * OUT_X_PITCH; + uint output_offset_y = groupz_tile * OUT_Y_PITCH; + uint slice_pack_addr_bytes = output_depth_index * slice_pack_size_bytes * ( BATCH_SIZE / BATCH_PACK ) + batch_index * slice_pack_size_bytes + lid_z * row_size_bytes; + + __global uchar* output_write_ptr = (__global uchar *) &outputs [ slice_pack_addr_bytes + output_offset_x + output_offset_y ]; + + const uint feature = output_depth_index * 32 + get_sub_group_local_id(); + + const float4 quant_f = as_float4(intel_sub_group_block_read4((__global uint*) (quantizations + feature) )); + const float4 bias_f = as_float4(intel_sub_group_block_read4((__global uint*) (biases + feature) )); + const float4 calib_f = as_float4(intel_sub_group_block_read4((__global uint*) (calibrations + feature) )); + + __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) + for (int col = 0; col < OUT_BLOCK_WIDTH; col++) + { + + int4 outvec0 = out_07[col]; + int4 outvec1 = out_815[col]; + int4 outvec2 = out_1623[col]; + int4 outvec3 = out_2431[col]; + + /* Non-Linear Activation & Quantization code */ + + uchar8 out_write_N2K4[2]; + + QUANTIZATION; + + intel_sub_group_block_write_uc8 ( output_write_ptr , out_write_N2K4[0] ); + output_write_ptr += 64; + intel_sub_group_block_write_uc8 ( output_write_ptr , out_write_N2K4[1] ); + output_write_ptr += 64; + + } // out_block_width-for loop + }//lid_z loop +} //end of kernel + +#undef SCAL +#undef QUANTIZATION diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_winograd_2x3_s1_fused.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_winograd_2x3_s1_fused.cl index 1623a95..603e148 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_winograd_2x3_s1_fused.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_winograd_2x3_s1_fused.cl @@ -18,7 +18,7 @@ // Output matrix dimensions: M x N // -------------------------------------------------------------------------------------------------------------------------------- -#include "include/data_types.cl" +#include "include/common.cl" #define DOT4i0( _result, _A, _B, i) \ diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_ref.cl index 03affe9..33f88c0 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_ref.cl @@ -39,6 +39,11 @@ KERNEL(convolution_gpu_yxfb_ref)( const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM; #endif const uint input_offset = INPUT0_OFFSET + batch_offset*INPUT0_BATCH_PITCH + in_split_offset; +#if GROUPED && !DEPTHWISE_SEPARABLE_OPT + const uint filter_offset = split_idx * FILTER_LENGTH; +#else + const uint filter_offset = 0; +#endif for (uint i = 0; i < 
FILTER_SIZE_Y; i++) { @@ -55,7 +60,7 @@ KERNEL(convolution_gpu_yxfb_ref)( if(!zero) { uint input_idx = input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH; - uint filter_idx = ofm_offset*FILTER_OFM_PITCH + i*FILTER_Y_PITCH + j*FILTER_X_PITCH; + uint filter_idx = filter_offset + ofm_offset*FILTER_OFM_PITCH + i*FILTER_Y_PITCH + j*FILTER_X_PITCH; for (uint h = 0; h < FILTER_IFM_NUM; h++) { @@ -68,7 +73,12 @@ KERNEL(convolution_gpu_yxfb_ref)( } } #if BIAS_TERM - result += bias[ofm_offset]; +#if GROUPED && !DEPTHWISE_SEPARABLE_OPT + const uint bias_offset = split_idx * BIAS_LENGTH; +#else + const uint bias_offset = 0; +#endif + result += bias[ofm_offset + bias_offset]; #endif const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * FILTER_OFM_NUM; const uint dst_index = batch_offset*OUTPUT_BATCH_PITCH + ofm_offset*OUTPUT_FEATURE_PITCH + out_y*OUTPUT_Y_PITCH + out_x*OUTPUT_X_PITCH + OUTPUT_OFFSET + out_split_offset; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b16_fp16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b16_fp16.cl index edf68f8..2b1fb4c 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b16_fp16.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b16_fp16.cl @@ -14,6 +14,7 @@ #include "include/include_all.cl" +#include "include/sub_group.cl" __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1))) @@ -31,15 +32,15 @@ KERNEL(convolution_gpu_yxfb_yxio_b16)( // get_global_size(1) -> Output size in X-dimension. // get_global_size(2) -> Output size in Y-dimension. // get_global_id(0) -> Id of work item computing single spatial point of output indicated by get_global_id(1), get_global_id(2). - // get_global_id(1) -> Current x-position in output. - // get_global_id(2) -> Current y-position in output. + // get_group_id(1) -> Current x-position in output. + // get_group_id(2) -> Current y-position in output. // // WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS -> Number of work items needed to compute entire one batch for at least one feature and one spatial point. // (this number in current implementation computes also OFM_PER_WORK_ITEM output features at the same time). // FILTER_ARRAY_NUM -> Number of filters groups (split size). 
- const uint out_x = get_global_id(1); - const uint out_y = get_global_id(2); + const uint out_x = get_group_id(1); + const uint out_y = get_group_id(2); const uint output_f_size = OUTPUT_PAD_BEFORE_FEATURE_NUM + OUTPUT_FEATURE_NUM + OUTPUT_PAD_AFTER_FEATURE_NUM; const uint output_x_size = OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X; @@ -140,6 +141,15 @@ KERNEL(convolution_gpu_yxfb_yxio_b16)( } #if defined(USE_BLOCK_READ_2) || defined(USE_BLOCK_READ_1) + #if BATCHES_PER_WORK_ITEM == 4 + uint _out_id = OUTPUT_VIEW_OFFSET + out_id; + for(uint i = 0; i < 16; i++) + { + *(__global uint*)(output + _out_id) = as_uint((half2)(_data[0][i], _data[1][i])); + *(__global uint*)(output + _out_id + 32) = as_uint((half2)(_data[2][i], _data[3][i])); + _out_id += OUTPUT_FEATURE_PITCH; + } + #else for(uint s = 0; s < BATCHES_PER_WORK_ITEM / 2; s++) { uint _out_id = OUTPUT_VIEW_OFFSET + out_id + chunk_size * s * LOCAL_WORK_GROUP_SIZE; @@ -160,6 +170,7 @@ KERNEL(convolution_gpu_yxfb_yxio_b16)( *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].se, _data[chunk_size * s + 1].se)); _out_id += OUTPUT_FEATURE_PITCH; *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].sf, _data[chunk_size * s + 1].sf)); _out_id += OUTPUT_FEATURE_PITCH; } + #endif #else for(uint s = 0; s < BATCHES_PER_WORK_ITEM; s++) { diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b16_fp32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b16_fp32.cl index dd869b5..004f8e0 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b16_fp32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b16_fp32.cl @@ -14,6 +14,7 @@ #include "include/include_all.cl" +#include "include/sub_group.cl" KERNEL(convolution_gpu_yxfb_yxio_b16)( const __global float* input, diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b1_block_fp32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b1_block_fp32.cl index 181b619..a56896d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b1_block_fp32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b1_block_fp32.cl @@ -13,6 +13,7 @@ // limitations under the License. #include "include/include_all.cl" +#include "include/sub_group.cl" __attribute__((reqd_work_group_size(LOCAL_WORK_GROUP_SIZE, 1, 1))) KERNEL(convolution_gpu_yxfb_yxio_b1_block)( diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b1_block_multiple_x_fp32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b1_block_multiple_x_fp32.cl index 0f2722f..85aa75d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b1_block_multiple_x_fp32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b1_block_multiple_x_fp32.cl @@ -13,6 +13,7 @@ // limitations under the License. 
#include "include/include_all.cl" +#include "include/sub_group.cl" __attribute__((reqd_work_group_size(LOCAL_WORK_GROUP_SIZE, 1, 1))) KERNEL(convolution_gpu_yxfb_yxio_b1_block_multiple_x)( diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b8_fp32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b8_fp32.cl index 519c822..21fc110 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b8_fp32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b8_fp32.cl @@ -14,6 +14,7 @@ #include "include/include_all.cl" +#include "include/sub_group.cl" __attribute__((reqd_work_group_size(LOCAL_WORK_GROUP_SIZE, 1, 1))) KERNEL(convolution_gpu_yxfb_yxio_b8)( diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_ref.cl index cba96cb..95641e2 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_ref.cl @@ -93,13 +93,16 @@ KERNEL(convolution_grad_weights_gpu_ref)( #endif } -#if MOMENTUM - float update_gradient_w = lr * (grad_w + DECAY_RATE * filter[weights_idx]) + prev_grad_w[weights_idx] * MOMENTUM_FACTOR; - filter[weights_idx] -= update_gradient_w; - prev_grad_w[weights_idx] = update_gradient_w; +#if OUTPUT_GRAD_W + output[weights_idx] = grad_w; #else - filter[weights_idx] -= lr * grad_w + DECAY_RATE * lr * filter[weights_idx]; -#endif + #if MOMENTUM + float update_gradient_w = lr * (grad_w + DECAY_RATE * filter[weights_idx]) + prev_grad_w[weights_idx] * MOMENTUM_FACTOR; + filter[weights_idx] -= update_gradient_w; + prev_grad_w[weights_idx] = update_gradient_w; + #else + filter[weights_idx] -= lr * grad_w + DECAY_RATE * lr * filter[weights_idx]; + #endif #if BIAS_TERM if(ifm == 0 && id_x == 0 && id_y == 0) @@ -114,4 +117,6 @@ KERNEL(convolution_grad_weights_gpu_ref)( } #endif +#endif + } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_yxfb.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_yxfb.cl index 1d6ffea..fba71db 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_yxfb.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_yxfb.cl @@ -90,13 +90,16 @@ KERNEL(convolution_grad_weights_gpu_ref)( if (local_id == 0) { -#if MOMENTUM - UNIT_TYPE update_gradient_w = lr * (grad_w + DECAY_RATE * filter[weights_idx]) + prev_grad_w[weights_idx] * MOMENTUM_FACTOR; - filter[weights_idx] -= update_gradient_w; - prev_grad_w[weights_idx] = update_gradient_w; +#if OUTPUT_GRAD_W + output[weights_idx] = grad_w; #else - filter[weights_idx] -= lr * (grad_w + DECAY_RATE * filter[weights_idx]); -#endif + #if MOMENTUM + UNIT_TYPE update_gradient_w = lr * (grad_w + DECAY_RATE * filter[weights_idx]) + prev_grad_w[weights_idx] * MOMENTUM_FACTOR; + filter[weights_idx] -= update_gradient_w; + prev_grad_w[weights_idx] = update_gradient_w; + #else + filter[weights_idx] -= lr * (grad_w + DECAY_RATE * filter[weights_idx]); + #endif #if BIAS_TERM if(ifm == 0 && id_x == 0 && id_y == 0) @@ -110,5 +113,6 @@ KERNEL(convolution_grad_weights_gpu_ref)( 
#endif } #endif +#endif } } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_bfyx_opt.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_bfyx_opt.cl index 2b2e0c9..a1dcd67 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_bfyx_opt.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_bfyx_opt.cl @@ -63,6 +63,11 @@ KERNEL(deconvolution_gpu_bfyx_opt)( const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM; #endif const uint input_offset = INPUT0_OFFSET + batch_offset*INPUT0_BATCH_PITCH + in_split_offset; +#if GROUPED && !DEPTHWISE_SEPARABLE_OPT + const uint filter_offset = split_idx * FILTER_LENGTH; +#else + const uint filter_offset = 0; +#endif for (uint i = start_y; i < FILTER_SIZE_Y; i+=STRIDE_SIZE_Y) { @@ -83,7 +88,7 @@ KERNEL(deconvolution_gpu_bfyx_opt)( uint input_idx = input_offset + (uint)fixed_input_offset_x*INPUT0_X_PITCH + (uint)fixed_input_offset_y*INPUT0_Y_PITCH; #if GRADIENT - uint filter_idx = ofm_offset*FILTER_IFM_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH; + uint filter_idx = filter_offset + ofm_offset*FILTER_IFM_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH; for (uint h = 0; h < FILTER_OFM_NUM; h++) { result = fma(input[input_idx], filter[filter_idx], result); @@ -91,7 +96,7 @@ KERNEL(deconvolution_gpu_bfyx_opt)( input_idx += INPUT0_FEATURE_PITCH; } #else - uint filter_idx = ofm_offset*FILTER_OFM_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH; + uint filter_idx = filter_offset + ofm_offset*FILTER_OFM_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH; for (uint h = 0; h < FILTER_IFM_NUM; h++) { result = fma(input[input_idx], filter[filter_idx], result); @@ -104,7 +109,12 @@ KERNEL(deconvolution_gpu_bfyx_opt)( } } #if BIAS_TERM - result += bias[ofm_offset]; +#if GROUPED && !DEPTHWISE_SEPARABLE_OPT + const uint bias_offset = split_idx * BIAS_LENGTH; +#else + const uint bias_offset = 0; +#endif + result += bias[ofm_offset + bias_offset]; #endif const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * FILTER_OFM_NUM; const uint dst_index = OUTPUT_OFFSET + out_split_offset + batch_offset*OUTPUT_BATCH_PITCH + ofm_offset*OUTPUT_FEATURE_PITCH + id_y*OUTPUT_Y_PITCH + id_x*OUTPUT_X_PITCH; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_ref.cl index d2a369b..4e8fa0d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_ref.cl @@ -55,6 +55,11 @@ KERNEL(deconvolution_gpu_yxfb_ref)( const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM; #endif const uint input_offset = INPUT0_OFFSET + batch_offset*INPUT0_BATCH_PITCH + in_split_offset; +#if GROUPED && !DEPTHWISE_SEPARABLE_OPT + const uint filter_offset = split_idx * FILTER_LENGTH; +#else + const uint filter_offset = 0; +#endif for (uint i = 0; i < FILTER_SIZE_Y; i++) { @@ -74,7 +79,7 @@ KERNEL(deconvolution_gpu_yxfb_ref)( uint fixed_input_offset_y = (uint)input_offset_y / STRIDE_SIZE_Y; uint input_idx = input_offset + (uint)fixed_input_offset_x*INPUT0_X_PITCH + 
(uint)fixed_input_offset_y*INPUT0_Y_PITCH; #if GRADIENT - uint filter_idx = ofm_offset*FILTER_IFM_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH; + uint filter_idx = filter_offset + ofm_offset*FILTER_IFM_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH; for (uint h = 0; h < FILTER_OFM_NUM; h++) { result = fma(input[input_idx], filter[filter_idx], result); @@ -82,7 +87,7 @@ KERNEL(deconvolution_gpu_yxfb_ref)( input_idx += INPUT0_FEATURE_PITCH; } #else - uint filter_idx = ofm_offset*FILTER_OFM_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH; + uint filter_idx = filter_offset + ofm_offset*FILTER_OFM_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH; for (uint h = 0; h < FILTER_IFM_NUM; h++) { result = fma(input[input_idx], filter[filter_idx], result); @@ -95,7 +100,12 @@ KERNEL(deconvolution_gpu_yxfb_ref)( } } #if BIAS_TERM - result += bias[ofm_offset]; +#if GROUPED && !DEPTHWISE_SEPARABLE_OPT + const uint bias_offset = split_idx * BIAS_LENGTH; +#else + const uint bias_offset = 0; +#endif + result += bias[ofm_offset + bias_offset]; #endif const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * FILTER_OFM_NUM; const uint dst_index = OUTPUT_OFFSET + out_split_offset + batch_offset*OUTPUT_BATCH_PITCH + ofm_offset*OUTPUT_FEATURE_PITCH + out_y*OUTPUT_Y_PITCH + out_x*OUTPUT_X_PITCH; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/depth_to_space_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/depth_to_space_ref.cl new file mode 100644 index 0000000..2c96cc4 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/depth_to_space_ref.cl @@ -0,0 +1,36 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
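The new depth_to_space_ref.cl kernel that follows remaps packed channel blocks onto spatial positions. As a plain-C reference for that index arithmetic (a minimal sketch assuming contiguous NCHW pitches; the kernel itself takes pitches and offsets from the jitted macros):

```c
#include <stddef.h>

/* Scalar reference for depth_to_space: rearranges a [N, C*B*B, H, W] input
 * into a [N, C, H*B, W*B] output, where B is the block size. */
static void depth_to_space(const float *in, float *out,
                           size_t n, size_t c_out, size_t h_out, size_t w_out,
                           size_t block)
{
    size_t c_in = c_out * block * block;
    size_t h_in = h_out / block, w_in = w_out / block;
    for (size_t b = 0; b < n; b++)
        for (size_t f = 0; f < c_out; f++)
            for (size_t y = 0; y < h_out; y++)
                for (size_t x = 0; x < w_out; x++) {
                    size_t iy = y / block, oy = y % block;
                    size_t ix = x / block, ox = x % block;
                    /* Same formula as the kernel's offset_feature: the
                     * (oy, ox) sub-block position selects a plane of the
                     * packed input channels. */
                    size_t f_in = f + (oy * block + ox) * c_out;
                    size_t src = ((b * c_in + f_in) * h_in + iy) * w_in + ix;
                    size_t dst = ((b * c_out + f) * h_out + y) * w_out + x;
                    out[dst] = in[src];
                }
}
```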
+ + +#include "include/include_all.cl" + +KERNEL(depth_to_space_ref)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output) +{ + const uint batch = get_global_id(0); + const uint feature = get_global_id(1); + const uint y = get_global_id(2) / OUTPUT_SIZE_X; + const uint x = get_global_id(2) % OUTPUT_SIZE_X; + + const uint input_y = y / BLOCK_SIZE; + const uint offset_y = y % BLOCK_SIZE; + + const uint input_x = x / BLOCK_SIZE; + const uint offset_x = (x % BLOCK_SIZE); + const uint offset_feature = (offset_y * BLOCK_SIZE + offset_x) * OUTPUT_FEATURE_NUM; + + const uint output_index = OUTPUT_OFFSET + (batch * OUTPUT_BATCH_PITCH) + (feature * OUTPUT_FEATURE_PITCH) + (y * OUTPUT_Y_PITCH) + x; + const uint input_feature = feature + offset_feature; + const uint input_index = INPUT0_OFFSET + (batch * INPUT0_BATCH_PITCH) + (input_feature * INPUT0_FEATURE_PITCH) + (input_y * INPUT0_Y_PITCH) + input_x; + output[output_index] = input[input_index]; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/detection_output.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/detection_output.cl new file mode 100644 index 0000000..94c14e4 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/detection_output.cl @@ -0,0 +1,217 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
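The detection_output kernel below sorts candidates per class and then runs a greedy non-maximum-suppression "keep" pass. A minimal scalar sketch of that pass (boxes assumed pre-sorted by descending score; get_decoded_bbox and get_score replaced with plain arrays, so this is an illustration, not the kernel's exact code):

```c
/* Boxes are (xmin, ymin, xmax, ymax), as produced by get_decoded_bbox. */
typedef struct { float x0, y0, x1, y1; } box_t;

/* Intersection-over-union, mirroring the kernel's overlap computation. */
static float iou(box_t a, box_t b)
{
    int intersecting = a.x0 < b.x1 && b.x0 < a.x1 && a.y0 < b.y1 && b.y0 < a.y1;
    if (!intersecting)
        return 0.0f;
    float iw = (a.x1 < b.x1 ? a.x1 : b.x1) - (a.x0 > b.x0 ? a.x0 : b.x0);
    float ih = (a.y1 < b.y1 ? a.y1 : b.y1) - (a.y0 > b.y0 ? a.y0 : b.y0);
    float inter = iw * ih;
    float area_a = (a.x1 - a.x0) * (a.y1 - a.y0);
    float area_b = (b.x1 - b.x0) * (b.y1 - b.y0);
    return inter / (area_a + area_b - inter);
}

/* Greedy keep pass over score-sorted boxes; compacts kept boxes to the
 * front and returns their count. eta < 1 progressively tightens the
 * threshold once it exceeds 0.5, as in the kernel. */
static int nms_keep(box_t *boxes, int n, float nms_threshold, float eta)
{
    float thresh = nms_threshold;
    int kept = 0;
    for (int i = 0; i < n; i++) {
        int keep = 1;
        for (int j = 0; j < kept && keep; j++)
            keep = iou(boxes[i], boxes[j]) <= thresh;
        if (keep) {
            boxes[kept++] = boxes[i];
            if (eta < 1.0f && thresh > 0.5f)
                thresh *= eta;
        }
    }
    return kept;
}
```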
+ + +#include "include/include_all.cl" +#include "include/detection_output_common.cl" + +KERNEL (detection_output)(__global UNIT_TYPE* input_location, __global UNIT_TYPE* output, __global UNIT_TYPE* input_confidence, __global UNIT_TYPE* input_prior_box) +{ + const uint idx = get_global_id(0); // bbox idx + const uint local_id = get_local_id(0) * NUM_OF_ITEMS; // All bboxes from one image in work group + const uint idx_image = idx / NUM_OF_ITERATIONS; // index of the current image + + __local uint indexes[NUM_OF_PRIORS]; + __local uint scores_size[NUM_CLASSES * NUM_OF_IMAGES]; + __local bool stillSorting; + + uint indexes_class_0[NUM_OF_PRIORS]; + + int last_bbox_in_class = NUM_OF_ITEMS; + bool is_last_bbox_in_class = false; + for (uint it = 0; it < NUM_OF_ITEMS; it++) + { + if (((local_id + it + 1) % NUM_OF_PRIORS) == 0 ) + { + last_bbox_in_class = it; + is_last_bbox_in_class = true; + break; + } + } + + for (uint idx_class = 0; idx_class < NUM_CLASSES; idx_class++) + { + if (idx_class == BACKGROUND_LABEL_ID) + { + continue; + } + + for (uint it = 0; it < NUM_OF_ITEMS; it++) + { + indexes[local_id + it] = local_id + it; + } + + stillSorting = true; + barrier(CLK_LOCAL_MEM_FENCE); + + bool is_last_bbox_in_image = (is_last_bbox_in_class) && (idx_class == (NUM_CLASSES - 1)); + + while(stillSorting) + { + barrier(CLK_LOCAL_MEM_FENCE); + stillSorting = false; + + for (uint i = 0; i < 2; i++) + { + for (uint it = 0; it < NUM_OF_ITEMS; it++) + { + uint item_id = local_id + it; + + uint idx1 = indexes[item_id]; + uint idx2 = indexes[item_id+1]; + bool perform = false; + if ((((i % 2) && (item_id % 2)) || + ((!(i % 2)) && (!(item_id % 2)))) && + (it < last_bbox_in_class)) + { + perform = true; + } + + if (perform && + (FUNC_CALL(get_score)(input_confidence, idx1, idx_class, idx_image) < + FUNC_CALL(get_score)(input_confidence, idx2, idx_class, idx_image))) + { + indexes[item_id] = idx2; + indexes[item_id+1] = idx1; + stillSorting = true; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } + } + + // Done only once per class per image + if (is_last_bbox_in_class) + { + UNIT_TYPE adaptive_threshold = NMS_THRESHOLD; + uint post_nms_count = 0; + const uint shared_class = (SHARE_LOCATION)? 0 : idx_class; + scores_size[idx_class] = 0; + + // Do the "keep" algorithm only for classes with confidence greater than CONFIDENCE_THRESHOLD. + // Check the largest (after sorting) element in the class first. + if (FUNC_CALL(get_score)(input_confidence, indexes[0], idx_class, idx_image) != 0.0f) + { + for (uint i = 0; i < SCORES_COUNT; i++) + { + const uint bb_idx = indexes[i]; + bool keep = true; + for (uint j = 0; j < post_nms_count; j++) + { + if (!keep) + { + break; + } + + UNIT_TYPE overlap = 0.0; + const uint bb_idx2 = indexes[j]; + + UNIT_TYPE decoded_bbox1[4]; + FUNC_CALL(get_decoded_bbox)(decoded_bbox1, input_location, input_prior_box, bb_idx, shared_class, idx_image); + UNIT_TYPE decoded_bbox2[4]; + FUNC_CALL(get_decoded_bbox)(decoded_bbox2, input_location, input_prior_box, bb_idx2, shared_class, idx_image); + bool intersecting = + (decoded_bbox1[0] < decoded_bbox2[2]) & + (decoded_bbox2[0] < decoded_bbox1[2]) & + (decoded_bbox1[1] < decoded_bbox2[3]) & + (decoded_bbox2[1] < decoded_bbox1[3]); + + if (intersecting) + { + const UNIT_TYPE intersect_width = min(decoded_bbox1[2], decoded_bbox2[2]) - max(decoded_bbox1[0], decoded_bbox2[0]); + const UNIT_TYPE intersect_height = min(decoded_bbox1[3], decoded_bbox2[3]) - max(decoded_bbox1[1], decoded_bbox2[1]); + const UNIT_TYPE intersect_size = intersect_width * intersect_height; + const UNIT_TYPE bbox1_area = (decoded_bbox1[2] - decoded_bbox1[0]) * (decoded_bbox1[3] - decoded_bbox1[1]); + const UNIT_TYPE bbox2_area = (decoded_bbox2[2] - decoded_bbox2[0]) * (decoded_bbox2[3] - decoded_bbox2[1]); + overlap = intersect_size / (bbox1_area + bbox2_area - intersect_size); + } + keep = (overlap <= adaptive_threshold); + } + if (keep) + { + indexes[post_nms_count] = indexes[i]; + ++post_nms_count; + } + if ((keep) && (ETA < 1) && (adaptive_threshold > 0.5)) + { + adaptive_threshold *= ETA; + } + } + } + // Write number of scores to local memory, for proper output order in separate work groups + scores_size[idx_class] = post_nms_count; + } + + stillSorting = true; + // Wait for the score counts from all classes in the image + barrier(CLK_LOCAL_MEM_FENCE); + + uint output_offset = (idx_image * NUM_CLASSES_OUT + idx_class - HIDDEN_CLASS) * SCORES_COUNT; + + for (uint it = 0; it < NUM_OF_ITEMS; it++) + { + const uint local_id_out = local_id + it; + + if (local_id_out < scores_size[idx_class]) + { + const uint score_idx = indexes[local_id_out]; + uint bb_idx = indexes[local_id_out]; + const uint shared_class = (SHARE_LOCATION)? 
0 : idx_class; + UNIT_TYPE decoded_bbox[4]; + FUNC_CALL(get_decoded_bbox)(decoded_bbox, input_location, input_prior_box, bb_idx, shared_class, idx_image); + + const uint out_idx = (local_id_out + output_offset) * OUTPUT_ROW_SIZE + OUTPUT_OFFSET; + output[out_idx] = TO_UNIT_TYPE(idx_image); + output[out_idx + 1] = TO_UNIT_TYPE(idx_class); + output[out_idx + 2] = FUNC_CALL(get_score)(input_confidence, score_idx, idx_class, idx_image); + output[out_idx + 3] = decoded_bbox[0]; + output[out_idx + 4] = decoded_bbox[1]; + output[out_idx + 5] = decoded_bbox[2]; + output[out_idx + 6] = decoded_bbox[3]; + } + } + + // If work item is processing last bbox in image (we already know the number of all detections), + // use it to fill rest of keep_top_k items if number of detections is smaller + if (is_last_bbox_in_class) + { + uint out_idx = output_offset + scores_size[idx_class]; + + uint current_top_k = output_offset + SCORES_COUNT; + for (uint i = out_idx; i < current_top_k; i++) + { + out_idx = i * OUTPUT_ROW_SIZE + OUTPUT_OFFSET; + output[out_idx] = -1.0; + output[out_idx + 1] = 0.0; + output[out_idx + 2] = 0.0; + output[out_idx + 3] = 0.0; + output[out_idx + 4] = 0.0; + output[out_idx + 5] = 0.0; + output[out_idx + 6] = 0.0; + } + } + + // Write number of scores kept in first step of detection output + if (is_last_bbox_in_image) + { + uint scores_sum = 0; + for (uint i = 0; i < NUM_CLASSES; i++) + { + scores_sum += scores_size[i]; + } + output[idx_image] = scores_sum; + + } + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/detection_output_sort.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/detection_output_sort.cl new file mode 100644 index 0000000..1a74d96 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/detection_output_sort.cl @@ -0,0 +1,217 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
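Both detection-output kernels order candidates with a barrier-synchronized odd-even transposition sort (the stillSorting loops). A serial C model of that sort, with the parallel compare-exchanges flattened into two passes per iteration; the kernels swap entries of a shared index array rather than the scores themselves:

```c
/* Sort idx[0..n) so that scores[idx[i]] is descending, using odd-even
 * transposition: alternate compare-exchange passes over even and odd
 * pairs until a full iteration performs no swap. */
static void odd_even_sort_desc(const float *scores, unsigned *idx, int n)
{
    int still_sorting = 1;
    while (still_sorting) {
        still_sorting = 0;
        for (int phase = 0; phase < 2; phase++) {          /* even pairs, then odd */
            for (int i = phase; i + 1 < n; i += 2) {
                if (scores[idx[i]] < scores[idx[i + 1]]) { /* keep larger first */
                    unsigned t = idx[i];
                    idx[i] = idx[i + 1];
                    idx[i + 1] = t;
                    still_sorting = 1;
                }
            }
        }
    }
}
```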
+ + +#include "include/include_all.cl" +#include "include/detection_output_common.cl" + +UNIT_TYPE FUNC(get_score_sort)(__global UNIT_TYPE* input_bboxes, const uint idx_bbox, const uint idx_image) +{ + if (idx_bbox == KEEP_BBOXES_NUM) + { + // Idx set to dummy value, return -1 to exclude this element from sorting + return -1; + } + else + { + return input_bboxes[(idx_bbox + idx_image * NUM_OF_IMAGE_BBOXES) * OUTPUT_ROW_SIZE + INPUT_OFFSET + SCORE_OFFSET]; + } +} + +KERNEL (detection_output_sort)(__global UNIT_TYPE* input_bboxes, __global UNIT_TYPE* output) +{ + __local uint indexes[NUM_CLASSES_IN]; + __local bool stillSorting; + __local uint output_count; + __local uint num_out_per_class[NUM_CLASSES_IN]; + + output_count = 0; + num_out_per_class[get_local_id(0)] = 0; + + const uint image_id = get_global_id(0) / NUM_CLASSES_IN; + const uint local_id = get_local_id(0) * NUM_OF_ITEMS_SORT; // All bboxes from one image in work group + + uint image_offset_input = image_id * NUM_OF_IMAGE_BBOXES; + + uint count_sum = 0; + for (uint i = 0; i < image_id; i++) + { + count_sum += (input_bboxes[i] < KEEP_TOP_K)? input_bboxes[i] : KEEP_TOP_K; + } + + uint image_offset_output = count_sum * OUTPUT_ROW_SIZE; + + // If there are fewer elements than needed, write the input straight to the output + if (input_bboxes[image_id] <= KEEP_TOP_K) + { + if (local_id == 0) + { + for (uint class = 0; class < NUM_CLASSES_IN; class++) + { + if (class == BACKGROUND_LABEL_ID && !HIDDEN_CLASS) + { + continue; + } + for (uint i = 0; i < NUM_OF_CLASS_BBOXES; i++) + { + uint input_idx = (i + image_offset_input + class * NUM_OF_CLASS_BBOXES) * OUTPUT_ROW_SIZE + INPUT_OFFSET; + if (input_bboxes[input_idx] != -1) + { + uint out_idx = output_count * OUTPUT_ROW_SIZE + image_offset_output; + + for (uint idx = 0; idx < OUTPUT_ROW_SIZE; idx++) + { + output[out_idx + idx] = input_bboxes[input_idx + idx]; + } + + output_count++; + } + else + { + break; + } + } + } + } + } + else + { + uint sorted_output[KEEP_TOP_K * NUM_CLASSES_IN]; + + for (uint it = 0; it < NUM_OF_ITEMS_SORT; it++) + { + indexes[local_id + it] = (local_id + it) * NUM_OF_CLASS_BBOXES; + } + + while (output_count < KEEP_BBOXES_NUM) + { + stillSorting = true; + + while(stillSorting) + { + barrier(CLK_LOCAL_MEM_FENCE); + stillSorting = false; + for (uint it = 0; it < NUM_OF_ITEMS_SORT; it++) + { + uint item_id = local_id + it; + for (uint i = 0; i < 2; i++) + { + + uint idx1 = indexes[item_id]; + uint idx2 = indexes[item_id+1]; + bool perform = false; + if ((((i % 2) && (item_id % 2)) || + ((!(i % 2)) && (!(item_id % 2)))) && + (item_id != (NUM_CLASSES_IN - 1))) + { + perform = true; + } + + if (perform && + (FUNC_CALL(get_score_sort)(input_bboxes, idx1, image_id) < + FUNC_CALL(get_score_sort)(input_bboxes, idx2, image_id))) + { + indexes[item_id] = idx2; + indexes[item_id+1] = idx1; + stillSorting = true; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } + } + + if (local_id == 0) + { + UNIT_TYPE top_score = FUNC_CALL(get_score_sort)(input_bboxes, indexes[0], image_id); + + if (top_score != 0) + { + for (uint it = 0; (it < NUM_CLASSES_IN) && (output_count < KEEP_BBOXES_NUM); it++) + { + if (FUNC_CALL(get_score_sort)(input_bboxes, indexes[it], image_id) == top_score) + { + // Write to the output, update the counter, and check whether keep_top_k is satisfied. + uint input_idx = (indexes[it] + image_offset_input) * OUTPUT_ROW_SIZE + INPUT_OFFSET; + uint class_idx = input_bboxes[input_idx + 1] - HIDDEN_CLASS; + + sorted_output[class_idx * KEEP_TOP_K + num_out_per_class[class_idx]] = input_idx; + num_out_per_class[class_idx]++; + + indexes[it]++; + output_count++; + + // If all class elements are written to output, set dummy value to exclude class from sorting. + if ((indexes[it] % NUM_OF_CLASS_BBOXES) == 0) + { + indexes[it] = KEEP_BBOXES_NUM; + } + } + } + } + else + { + // There are no more significant results to sort. + output_count = KEEP_BBOXES_NUM; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (local_id == 0) + { + output_count = 0; + for (uint i = 0; i < NUM_CLASSES_IN; i++) + { + for (uint j = 0; j < num_out_per_class[i]; j++) + { + + uint out_idx = output_count * OUTPUT_ROW_SIZE + image_offset_output; + for (uint idx = 0; idx < OUTPUT_ROW_SIZE; idx++) + { + output[out_idx + idx] = input_bboxes[sorted_output[i * KEEP_TOP_K + j] + idx]; + } + output_count++; + } + } + uint image_count_sum = (input_bboxes[image_id] < KEEP_TOP_K)? input_bboxes[image_id] : KEEP_TOP_K; + for (output_count; output_count < image_count_sum; output_count++) + { + uint out_idx = output_count * OUTPUT_ROW_SIZE + image_offset_output; + output[out_idx] = -1.0; + output[out_idx + 1] = 0.0; + output[out_idx + 2] = 0.0; + output[out_idx + 3] = 0.0; + output[out_idx + 4] = 0.0; + output[out_idx + 5] = 0.0; + output[out_idx + 6] = 0.0; + } + } + } + + if (local_id == 0 && + image_id == (NUM_IMAGES - 1)) + { + for (output_count += count_sum; output_count < (KEEP_TOP_K * NUM_IMAGES); output_count++ ) + { + uint out_idx = output_count * OUTPUT_ROW_SIZE; + output[out_idx] = -1.0; + output[out_idx + 1] = 0.0; + output[out_idx + 2] = 0.0; + output[out_idx + 3] = 0.0; + output[out_idx + 4] = 0.0; + output[out_idx + 5] = 0.0; + output[out_idx + 6] = 0.0; + } + } + +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_b_fs_yx_fsv4.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_b_fs_yx_fsv4.cl new file mode 100644 index 0000000..2d598ea --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_b_fs_yx_fsv4.cl @@ -0,0 +1,100 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "include/include_all.cl" + +#define PACK 4 + +#define SGR_MAX_SIZE (get_max_sub_group_size()) +#define SGR_LOCAL_ID (get_sub_group_local_id()) + +#define GET_INDEX(_x) \ + ( ((_x / SGR_MAX_SIZE) * SGR_MAX_SIZE /* Normed to max_subgroup_size */) \ + * (4 * sizeof(int) /* 4xINT32 per sub_group reading */) \ + ) + +inline int16 FUNC(get_int16)(const __global UNIT_TYPE* src, uint idx) +{ + int4 int_data = as_int4(intel_sub_group_block_read4((const __global uint*)(src + idx))); + int16 to_return; + for(uint i = 0; i < 4; i++) + { + for(uint j = 0; j < 4; j++) + { + to_return[i * 4 + j] = as_char4(int_data[i])[j]; + } + } + return to_return; +} +#define GET_INPUT(A, B) FUNC_CALL(get_int16)(A, GET_INDEX(x)) + + +__attribute__((intel_reqd_sub_group_size(8))) +KERNEL(eltwise_b_fs_yx_fsv4)( + INPUTS_DECLS + __global UNIT_TYPE* output +#if CALIBRATION_TERM + , const __global float* calibrations +#endif + ) +{ + // This kernel works with linearized data w/o strides and padding + // so only one dimension 'X' is required + const uint x = get_global_id(0); + const uint idx = GET_INDEX(x); + + int16 res; + + DO_ELTWISE; + + for(uint i = 0; i < 4; i++) + { + const uint out_idx = idx + (sizeof(int) * (SGR_LOCAL_ID + (i * SGR_MAX_SIZE))); + char4 char_res; + + for(uint j = 0; j < 4; j++) + { + int res_tmp = res[i * 4 + j]; + #if QUANTIZATION_TERM + #if CALIBRATION_TERM + // Batch: + const uint b = out_idx / OUTPUT_BATCH_PITCH; + // Feature: + // Because of specific data layout Feature must be normed to PACK size + uint d3 = ((out_idx - b * OUTPUT_BATCH_PITCH) / (OUTPUT_FEATURE_PITCH * PACK)) * PACK; + res_tmp = (int)round(((float)res_tmp) * calibrations[d3+j]); + #else // CALIBRATION_TERM + res_tmp = (int)round(((float)res_tmp) * O_QF); + #endif // CALIBRATION_TERM + #endif // QUANTIZATION_TERM + + #ifdef ELTW_UNSIGNED + char_res[j] = ACTIVATION(convert_uchar(res_tmp), NL_M, NL_N); + #else + char_res[j] = ACTIVATION(convert_char(res_tmp), NL_M, NL_N); + #endif + } + // put 4 chars into output + // char_result[i] = as_int(char_res); + *((__global int*)(output + out_idx)) = as_int(char_res); + } +} + +#undef PACK +#undef SGR_MAX_SIZE +#undef SGR_LOCAL_ID +#undef GET_INDEX +#undef GET_INPUT diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_fs_bs_yx_bsv4_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_fs_bs_yx_bsv4_fsv32.cl index fe5e4a8..388d50d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_fs_bs_yx_bsv4_fsv32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_fs_bs_yx_bsv4_fsv32.cl @@ -16,8 +16,13 @@ #include "include/include_all.cl" +#ifdef INPUT_STRIDED +#define GET_INDEX(src) \ + GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(src, d4, d3, d2 * CAT(src, _STRIDE_Y), d1 * CAT(src, _STRIDE_X)) +#else #define GET_INDEX(src) \ GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(src, d4, d3, d2, d1) +#endif int16 FUNC(get_int16)(const __global UNIT_TYPE* src, uint idx) { diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/embed_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/embed_ref.cl index c51a61e..f1a5a4e 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/embed_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/embed_ref.cl @@ -12,16 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. 
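+
+// Embedding lookup: each output element selects a row of the weight matrix by the
+// (integer) value read from input0; the bias add is compiled in only when
+// BIAS_TERM is defined.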
+#include "include/include_all.cl" -#include "include/common.cl" -#include "include/data_types.cl" - -KERNEL(embed_ref)(const __global UNIT_TYPE* input0, __global UNIT_TYPE* output, __global UNIT_TYPE* weights, __global UNIT_TYPE* biases) +KERNEL(embed_ref)(const __global UNIT_TYPE* input0, + __global UNIT_TYPE* output, + const __global UNIT_TYPE* weights +#if BIAS_TERM + ,const __global UNIT_TYPE* biases +#endif +) { const uint x = (uint)get_global_id(0); const uint y = (uint)get_global_id(1); const uint b = (uint)get_global_id(2); + uint output_idx = (b*INPUT0_ELEMENTS_COUNT*NUM_OUTPUT_SIZE)+(uint)(x*NUM_OUTPUT_SIZE+y); - output[output_idx] = weights[(uint)(input0[(b*INPUT0_ELEMENTS_COUNT)+x]*NUM_OUTPUT_SIZE+y)] + biases[y]; + output[output_idx] = weights[(uint)(input0[(b*INPUT0_ELEMENTS_COUNT)+x]*NUM_OUTPUT_SIZE+y)]; +#if BIAS_TERM + output[output_idx] += biases[y]; +#endif } - \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_bs_f_bsv16_af8_vload.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_bs_f_bsv16_af8_vload.cl index e11fb14..9a3bac2 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_bs_f_bsv16_af8_vload.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_bs_f_bsv16_af8_vload.cl @@ -14,54 +14,30 @@ #include "include/include_all.cl" - -#if FP16_UNIT_USED - // Block read - currently block is 4 bytes aligned. - #define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset))) - - #define MULTIPLY_BLOCKS_16x8(_result, _blockA, _blockB) \ - { \ - const half16 acol0 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s0 ); \ - const half16 acol1 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s1 ); \ - const half16 acol2 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s2 ); \ - const half16 acol3 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s3 ); \ - const half16 acol4 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s4 ); \ - const half16 acol5 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s5 ); \ - const half16 acol6 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s6 ); \ - const half16 acol7 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s7 ); \ - _result = fma( _blockB.s0, acol0, _result ); \ - _result = fma( _blockB.s1, acol1, _result ); \ - _result = fma( _blockB.s2, acol2, _result ); \ - _result = fma( _blockB.s3, acol3, _result ); \ - _result = fma( _blockB.s4, acol4, _result ); \ - _result = fma( _blockB.s5, acol5, _result ); \ - _result = fma( _blockB.s6, acol6, _result ); \ - _result = fma( _blockB.s7, acol7, _result ); \ - } -#else - // Block read - currently block is 4 bytes aligned. 
- #define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_float8(intel_sub_group_block_read8((const __global uint*)(ptr) + (byte_offset))) - - #define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB) \ - { \ - const float8 acol0 = TRANSPOSE_BLOCK_8( _blockA.s0 ); \ - const float8 acol1 = TRANSPOSE_BLOCK_8( _blockA.s1 ); \ - const float8 acol2 = TRANSPOSE_BLOCK_8( _blockA.s2 ); \ - const float8 acol3 = TRANSPOSE_BLOCK_8( _blockA.s3 ); \ - const float8 acol4 = TRANSPOSE_BLOCK_8( _blockA.s4 ); \ - const float8 acol5 = TRANSPOSE_BLOCK_8( _blockA.s5 ); \ - const float8 acol6 = TRANSPOSE_BLOCK_8( _blockA.s6 ); \ - const float8 acol7 = TRANSPOSE_BLOCK_8( _blockA.s7 ); \ - _result = mad( _blockB.s0, acol0, _result ); \ - _result = mad( _blockB.s1, acol1, _result ); \ - _result = mad( _blockB.s2, acol2, _result ); \ - _result = mad( _blockB.s3, acol3, _result ); \ - _result = mad( _blockB.s4, acol4, _result ); \ - _result = mad( _blockB.s5, acol5, _result ); \ - _result = mad( _blockB.s6, acol6, _result ); \ - _result = mad( _blockB.s7, acol7, _result ); \ - } -#endif +#include "include/sub_group.cl" + +// Block read - currently block is 4 bytes aligned. +#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset))) + +#define MULTIPLY_BLOCKS_16x8(_result, _blockA, _blockB) \ +{ \ + const half16 acol0 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s0 ); \ + const half16 acol1 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s1 ); \ + const half16 acol2 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s2 ); \ + const half16 acol3 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s3 ); \ + const half16 acol4 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s4 ); \ + const half16 acol5 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s5 ); \ + const half16 acol6 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s6 ); \ + const half16 acol7 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s7 ); \ + _result = fma( _blockB.s0, acol0, _result ); \ + _result = fma( _blockB.s1, acol1, _result ); \ + _result = fma( _blockB.s2, acol2, _result ); \ + _result = fma( _blockB.s3, acol3, _result ); \ + _result = fma( _blockB.s4, acol4, _result ); \ + _result = fma( _blockB.s5, acol5, _result ); \ + _result = fma( _blockB.s6, acol6, _result ); \ + _result = fma( _blockB.s7, acol7, _result ); \ +} #define SUB_GROUP_SIZE 16 @@ -115,7 +91,4 @@ KERNEL (fully_connected_gpu_xb_bs_xs_xsv8_bsv16_vload)( #undef SUB_GROUP_SIZE #undef ALIGNED_BLOCK_READ8 -#undef MAKE_VECTOR_TYPE -#undef CONCAT_TOKEN -#undef CONCAT_TOKEN_HANDLER1 -#undef MULTIPLY_BLOCKS_16x16 +#undef MULTIPLY_BLOCKS_16x8 diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_bs_f_bsv8_af8_vload.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_bs_f_bsv8_af8_vload.cl index 9183519..109829f 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_bs_f_bsv8_af8_vload.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_bs_f_bsv8_af8_vload.cl @@ -14,6 +14,7 @@ #include "include/include_all.cl" +#include "include/sub_group.cl" #if FP16_UNIT_USED // Block read - currently block is 4 bytes aligned. 
@@ -224,7 +225,4 @@ KERNEL (fully_connected_gpu_xb_bs_xs_xsv8_bsv8_vload)( #undef SUB_GROUP_SIZE #undef ALIGNED_BLOCK_READ8 -#undef MAKE_VECTOR_TYPE -#undef CONCAT_TOKEN -#undef CONCAT_TOKEN_HANDLER1 #undef MULTIPLY_BLOCKS_8x8 diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_fb_io_b8_f8.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_fb_io_b8_f8.cl index 556adec..bf21205 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_fb_io_b8_f8.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_fb_io_b8_f8.cl @@ -14,6 +14,7 @@ #include "include/include_all.cl" +#include "include/sub_group.cl" __attribute__((reqd_work_group_size(8, 1, 1))) KERNEL (fully_connected_gpu_xb_xb_b8_x8)( diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_fb_io_b8_f8_vload.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_fb_io_b8_f8_vload.cl index ed86d49..4d596f7 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_fb_io_b8_f8_vload.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_fb_io_b8_f8_vload.cl @@ -14,6 +14,7 @@ #include "include/include_all.cl" +#include "include/sub_group.cl" #if FP16_UNIT_USED #define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB) \ diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_imad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_imad.cl new file mode 100644 index 0000000..af8a8fb --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_imad.cl @@ -0,0 +1,95 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
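+
+// Int8 fully-connected layer using IMAD. Each sub-group covers a SIMD_SIZE-aligned
+// block of output features; a sub-group block read pulls 4 int8 activations per
+// lane, which are broadcast across the sub-group and multiply-accumulated against
+// block-read weights four int8 pairs at a time via IMAD.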
+ + +#include "include/common.cl" + +#include "include/data_types.cl" +#include "include/fetch.cl" +#include "include/imad.cl" + +#define SIMD_SIZE 16 +#define BYTES_PER_READ (sizeof(int)) +#define BYTES_PER_READ8 (8 * BYTES_PER_READ) + +__attribute__((intel_reqd_sub_group_size(SIMD_SIZE))) +KERNEL(fully_connected_gpu_IMAD)( + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + const __global FILTER_TYPE* weights +#if BIAS_TERM + , const __global BIAS_TYPE* biases +#endif +#if QUANTIZATION_TERM + ,const __global float* quantizations +#endif +#if CALIBRATION_TERM + ,const __global float* calibrations +#endif + ) +{ + // This kernel works with linearized data w/o strides and padding + // so only one dimension 'F' is required + const uint f = get_global_id(0); + const uint b = get_global_id(1); + + if (f >= OUTPUT_FEATURE_NUM) { + return; + } + + int dotProd = 0; + + uint idx_w = ((f / SIMD_SIZE) * SIMD_SIZE) * INPUT0_FEATURE_NUM; + const __global INPUT0_TYPE* current_input = &input[GET_DATA_INDEX(INPUT0, b, 0, 0, 0)]; + + for (uint idx_i = 0; idx_i < INPUT0_FEATURE_NUM; idx_i += BYTES_PER_READ8) { + int input_data = as_int(intel_sub_group_block_read((const __global uint*)(current_input + idx_i))); + int8 activations; //activations of all lanes + activations.s0 = sub_group_broadcast(input_data, 0); + activations.s1 = sub_group_broadcast(input_data, 1); + activations.s2 = sub_group_broadcast(input_data, 2); + activations.s3 = sub_group_broadcast(input_data, 3); + activations.s4 = sub_group_broadcast(input_data, 4); + activations.s5 = sub_group_broadcast(input_data, 5); + activations.s6 = sub_group_broadcast(input_data, 6); + activations.s7 = sub_group_broadcast(input_data, 7); + + int8 weights_data = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + idx_w))); + idx_w += SIMD_SIZE * BYTES_PER_READ8; + + for (int i = 0; i < 8; i++) { + dotProd = IMAD(dotProd, as_char4(activations[i]), as_char4(weights_data[i])); + } + } + +#if BIAS_TERM +#if BIAS_PER_OUTPUT + const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x); +#elif BIAS_PER_OFM + const uint bias_index = f; +#endif +#if CALIBRATION_TERM + dotProd = (UNIT_TYPE)round(((float)dotProd * quantizations[f] * I_QF + biases[bias_index]) * calibrations[f]); +#else // CALIBRATION_TERM + dotProd = (UNIT_TYPE)round(((float)dotProd * quantizations[f] * I_QF + biases[bias_index]) * O_QF); +#endif // CALIBRATION_TERM +#endif // BIAS_TERM + + const uint out_index = GET_DATA_INDEX(OUTPUT, b, f, 0, 0); + output[out_index] = ACTIVATION(convert_char(dotProd), NL_M, NL_N); +} + +#undef SIMD_SIZE +#undef BYTES_PER_READ +#undef BYTES_PER_READ8 diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_yxfb_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_yxfb_ref.cl index e8ea675..5c63b79 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_yxfb_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_yxfb_ref.cl @@ -14,6 +14,7 @@ #include "include/include_all.cl" +#include "include/reshape_dims.cl" // Required JIT constants: // - FP16_SUPPORTED - [0/1] Value indicating whether device supports FP16 OpenCL extension (cl_khr_fp16). 
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_bn_scale_kernel_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_bn_scale_kernel_ref.cl new file mode 100644 index 0000000..82e0921 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_bn_scale_kernel_ref.cl @@ -0,0 +1,197 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include/include_all.cl" + +#define LOCAL_SIZE INPUT0_BATCH_NUM + +__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1))) +KERNEL(convolution)( + __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + __global FILTER_TYPE* weights, +#if BIAS_TERM + __global BIAS_TYPE* biases, +#endif + uint split_idx, + __global INPUT0_TYPE* scale_in +#if SCALE_BIAS_TERM + , __global INPUT0_TYPE* scale_bias +#endif +#if FUSED_TRAINING + , __global INPUT0_TYPE* inv_var, + __global INPUT0_TYPE* conv_output, + __global INPUT0_TYPE* bn_output +#endif + ) +{ + const uint f = get_global_id(1); + const uint b = get_global_id(0); + + UNIT_TYPE conv_out = UNIT_VAL_ZERO; + + const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM; + + const uint filter_offset = f*FILTER_OFM_PITCH; + const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset; + + for (uint y = 0; y < OUTPUT_SIZE_Y; ++y) + { + const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; + for (uint x = 0; x < OUTPUT_SIZE_X; ++x) + { + const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; + for (uint k = 0; k < FILTER_IFM_NUM; ++k) + { + for (uint j = 0; j < FILTER_SIZE_Y ; ++j) + { + const int input_offset_y = input_y + j * DILATION_SIZE_Y; + const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0; + + if(!zero_y) + { + for (uint i = 0; i < FILTER_SIZE_X ; ++i) + { + const int input_offset_x = input_x + i * DILATION_SIZE_X; + const bool zero_x = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0; + + if(!zero_x) + { + uint input_idx = input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH + k*INPUT0_FEATURE_PITCH; + uint filter_idx = filter_offset + k*FILTER_IFM_PITCH + j*FILTER_Y_PITCH + i*FILTER_X_PITCH; + conv_out += input[input_idx] * weights[filter_idx]; + } + } + } + } + } +#if BIAS_TERM + conv_out += (UNIT_TYPE)biases[f]; +#endif + + const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM; + const uint dst_index = GET_DATA_INDEX(OUTPUT, b, f, y, x) + out_split_offset; +#ifdef FUSED_TRAINING + conv_output[dst_index] = conv_out; +#else + output[dst_index] = conv_out; +#endif + } + } + + + // BATCH NORM PART + barrier(CLK_LOCAL_MEM_FENCE); + + __local ACCUMULATOR_TYPE sum[LOCAL_SIZE]; + + const uint local_idx = b; + + sum[local_idx] = 0; + + uint input_idx = GET_DATA_INDEX(OUTPUT, local_idx, f, 0, 0); + for (uint y = 0; y < OUTPUT_SIZE_Y; y++) + { + for (uint x = 0; x < OUTPUT_SIZE_X; x++) + { +#ifdef FUSED_TRAINING + 
UNIT_TYPE in = conv_output[input_idx]; +#else + UNIT_TYPE in = output[input_idx]; +#endif + sum[local_idx] += in; + input_idx += OUTPUT_X_PITCH; + } + input_idx += OUTPUT_Y_PITCH - OUTPUT_SIZE_X * OUTPUT_X_PITCH; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2) + { + if (local_idx < offset) + { + sum[local_idx] += sum[local_idx + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + UNIT_TYPE mean = sum[0] / (OUTPUT_BATCH_NUM * OUTPUT_SIZE_X * OUTPUT_SIZE_Y); + + sum[local_idx] = 0; + + input_idx = GET_DATA_INDEX(OUTPUT, local_idx, f, 0, 0); + for (uint y = 0; y < OUTPUT_SIZE_Y; y++) + { + for (uint x = 0; x < OUTPUT_SIZE_X; x++) + { +#ifdef FUSED_TRAINING + UNIT_TYPE in = conv_output[input_idx] - mean; +#else + UNIT_TYPE in = output[input_idx] - mean; +#endif + sum[local_idx] += in * in; + input_idx += OUTPUT_X_PITCH; + } + input_idx += OUTPUT_Y_PITCH - OUTPUT_SIZE_X * OUTPUT_X_PITCH; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2) + { + if (local_idx < offset) + { + sum[local_idx] += sum[local_idx + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + float variance = sum[0] / (OUTPUT_BATCH_NUM * OUTPUT_SIZE_X * OUTPUT_SIZE_Y); + + float inv_variance = (float)(1.0 / sqrt(variance + EPSILON)); + +#ifdef FUSED_TRAINING + if (local_idx == 0) + inv_var[f] = inv_variance; +#endif + + uint out_idx = GET_DATA_INDEX(OUTPUT, local_idx, f, 0, 0); + for (uint y = 0; y < OUTPUT_SIZE_Y; y++) + { + for (uint x = 0; x < OUTPUT_SIZE_X; x++) + { +#ifdef FUSED_TRAINING + UNIT_TYPE out_val = inv_variance * (conv_output[out_idx] - mean); + bn_output[out_idx] = out_val; +#ifdef SCALE_BIAS_TERM + output[out_idx] = ACTIVATION(out_val * scale_in[f] + scale_bias[f], NL_M, NL_N); +#else + output[out_idx] = ACTIVATION(out_val * scale_in[f], NL_M, NL_N); +#endif +#else +#ifdef SCALE_BIAS_TERM + output[out_idx] = ACTIVATION(inv_variance * (output[out_idx] - mean) * scale_in[f] + scale_bias[f], NL_M, NL_N); +#else + output[out_idx] = ACTIVATION(inv_variance * (output[out_idx] - mean) * scale_in[f], NL_M, NL_N); +#endif +#endif + out_idx += OUTPUT_X_PITCH; + } + out_idx += OUTPUT_Y_PITCH - OUTPUT_SIZE_X * OUTPUT_X_PITCH; + } + +} + +#undef LOCAL_SIZE \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_bfyx_1x1_opt_fp32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_bfyx_1x1_opt_fp32.cl new file mode 100644 index 0000000..b22e2d9 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_bfyx_1x1_opt_fp32.cl @@ -0,0 +1,254 @@ +// Copyright (c) 2016-2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
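+
+// Fused 1x1 convolution + eltwise for bfyx fp32. The two sub-groups of a work
+// group each process half of the input features; partial sums are exchanged
+// through SLM, biased and summed, and the second input is added (with its own
+// activation) before the vectorized vstore8/4/2 output phase.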
+ +#include "include/include_all.cl" + +#define SIMD_SIZE 8 +__attribute__((intel_reqd_sub_group_size(SIMD_SIZE))) +KERNEL(fused_conv_eltwise_gpu_bfyx_1x1_opt)( + __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + __global FILTER_TYPE* weights, +#if BIAS_TERM + __global BIAS_TYPE* biases, +#endif + uint split_idx, + const __global float* src3) +{ + const uint group_x = get_group_id(0) * OUT_BLOCK_WIDTH; + const uint group_y = get_group_id(1) * OUT_BLOCK_HEIGHT; + const uint f = (get_group_id(2) * SIMD_SIZE * OUT_BLOCK_DEPTH) % OUTPUT_FEATURE_NUM; + const uint b = (get_group_id(2) * SIMD_SIZE * OUT_BLOCK_DEPTH) / OUTPUT_FEATURE_NUM;; + + const uint ifm_part = get_sub_group_id(); + uint ifm_offset = ifm_part* OUT_BLOCK_DEPTH/2; + + UNIT_TYPE in[OUT_BLOCK_HEIGHT]; + UNIT_TYPE dotProd0[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2]; + UNIT_TYPE dotProd1[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2]; + + for(uint i = 0; i < OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2; i++) + { + dotProd0[i] = 0; + dotProd1[i] = 0; + } + +#if OUT_BLOCK_DEPTH == 8 + const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(64 * FILTER_IFM_NUM/2); +#elif OUT_BLOCK_DEPTH == 4 + const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(32 * FILTER_IFM_NUM/2); +#elif OUT_BLOCK_DEPTH == 2 + const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(16 * FILTER_IFM_NUM/2); +#else + const uint filter_offset = f*FILTER_OFM_PITCH + ifm_part*(FILTER_IFM_NUM/2) * FILTER_IFM_PITCH; +#endif + const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + group_x * INPUT0_X_PITCH + group_y * INPUT0_Y_PITCH + ifm_part*(FILTER_IFM_NUM/2) * INPUT0_FEATURE_PITCH; + + //-------------------------------------------------------------------- + // main computation phase + //-------------------------------------------------------------------- + + for (uint k = 0; k < FILTER_IFM_NUM/2; ++k) + { + for(uint i = 0; i < OUT_BLOCK_HEIGHT; i++) + { + const uint in_offset = input_offset + get_sub_group_local_id() + i * INPUT0_Y_PITCH + k * INPUT0_FEATURE_PITCH; + in[i] = input[in_offset]; + } + +#if OUT_BLOCK_DEPTH == 8 + float8 w = as_float8(intel_sub_group_block_read8((__global uint*)weights + filter_offset + k * 64)); +#elif OUT_BLOCK_DEPTH == 4 + float4 w = as_float4(intel_sub_group_block_read4((__global uint*)weights + filter_offset + k * 32)); +#elif OUT_BLOCK_DEPTH == 2 + float2 w = as_float2(intel_sub_group_block_read2((__global uint*)weights + filter_offset + k * 16)); +#endif + + for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++) + { + for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++) + { + float _in = intel_sub_group_shuffle(in[br], bc); + for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++) + { + dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _in * w[bd]; + dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _in * w[bd + OUT_BLOCK_DEPTH/2]; + } + } + } + } + + __local float slm_vals[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * SIMD_SIZE]; + + //-------------------------------------------------------------------- + // second sub_group in workgroup task + //-------------------------------------------------------------------- + + if(ifm_part == 1) + { + for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++) + { + for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++) + { + for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++) + { + slm_vals[SIMD_SIZE * (bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)) + get_sub_group_local_id()] = dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * 
bd)]; + dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]; + } + } + } + + } + + //-------------------------------------------------------------------- + // first sub_group in workgroup task + //-------------------------------------------------------------------- + + if(ifm_part == 0) + { + for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++) + { + for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++) + { + for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++) + { + slm_vals[SIMD_SIZE * (bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * (bd+OUT_BLOCK_DEPTH/2) )) + get_sub_group_local_id()] = dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]; + } + } + } + + } + + //-------------------------------------------------------------------- + // add bias phase + //-------------------------------------------------------------------- + + #if BIAS_TERM + for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++) + { + float _bias = biases[f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id()]; + for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++) + { + for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++) + { + dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _bias; + } + } + } + #endif + + barrier(CLK_LOCAL_MEM_FENCE); // we want to add barrier after biases addition so that the long slm write part latency is shadowed by it + + //-------------------------------------------------------------------- + // sum sub-group results + activation phase + //-------------------------------------------------------------------- + + for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++) + { + for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++) + { + for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++) + { + dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += slm_vals[SIMD_SIZE * (bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * (bd + ifm_offset) )) + get_sub_group_local_id()]; + dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);; + } + } + } + + //-------------------------------------------------------------------- + // eltwise with eltwise activation phase + //-------------------------------------------------------------------- + #if IN_OUT_OPT != 1 + for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++) + { + for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++) + { + for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++) + { + uint src3_offset = GET_DATA_INDEX(INPUT1, b, f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id(), (group_y + br) * ELTW_STRIDE_Y, (group_x + bc) * ELTW_STRIDE_X); + dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += src3[src3_offset]; + dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION_ELTW(dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M_ELTW, NL_N_ELTW); + } + } + } + #endif + + //-------------------------------------------------------------------- + // output phase + //-------------------------------------------------------------------- + + for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++) + { + for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++) + { + uint dst_index = GET_DATA_INDEX(OUTPUT, b, f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id(), group_y + br, group_x); + uint out_vstore_offset = 0; + #if (OUT_BLOCK_WIDTH >= 8) + { + float8 tmp = (float8)(dotProd0[out_vstore_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 1 + OUT_BLOCK_WIDTH * (br + 
OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 4 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 5 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 6 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 7 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]); +#if IN_OUT_OPT == 1 + float8 tmp2 = vload8(0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH); + tmp += tmp2; + tmp = ACTIVATION_ELTW(tmp, NL_M_ELTW, NL_N_ELTW); +#endif + vstore8(tmp, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH); + out_vstore_offset += 8; + } + #endif + #if (OUT_BLOCK_WIDTH % 8) > 3 + { + float4 tmp = (float4)(dotProd0[out_vstore_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]); +#if IN_OUT_OPT == 1 + float4 tmp2 = vload4(0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH); + tmp += tmp2; + tmp = ACTIVATION_ELTW(tmp, NL_M_ELTW, NL_N_ELTW); +#endif + vstore4(tmp, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH); + out_vstore_offset += 4; + } + #endif + #if (OUT_BLOCK_WIDTH % 4) > 1 + { + float2 tmp = (float2)(dotProd0[out_vstore_offset + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], + dotProd0[out_vstore_offset+1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]); +#if IN_OUT_OPT == 1 + float2 tmp2 = vload2(0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH); + tmp += tmp2; + tmp = ACTIVATION_ELTW(tmp, NL_M_ELTW, NL_N_ELTW); +#endif + vstore2(tmp, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH); + out_vstore_offset += 2; + } + #endif + for(uint bc = out_vstore_offset; bc < OUT_BLOCK_WIDTH; bc++) + { +#if IN_OUT_OPT == 1 + dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += output[dst_index + bc * OUTPUT_X_PITCH]; + dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION_ELTW(dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M_ELTW, NL_N_ELTW); +#endif + output[dst_index + bc * OUTPUT_X_PITCH] = dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]; + } + } + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_bfyx_os_iyx_osv16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_bfyx_os_iyx_osv16.cl new file mode 100644 index 0000000..bd439e0 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_bfyx_os_iyx_osv16.cl @@ -0,0 +1,252 @@ +// Copyright (c) 2016-2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/common.cl"
+#include "include/data_types.cl"
+
+
+// ---------------------------------------------------------------------------------------------------------------------
+// Just-in-time macro definitions:
+// ---------------------------------------------------------------------------------------------------------------------
+
+// Required JIT constants:
+//  - INPUT          - [tensor] Input dimensions (batch, spatial and feature).
+//  - OUTPUT         - [tensor] Output dimensions (batch, spatial and feature).
+//  - STRIDE         - [tensor] Stride (only spatial). Factors that describe the step size in the X or Y dimension of
+//                     the input position at which the convolution filter is applied when the next output value
+//                     (step 1 in the X or Y dimension of the output) is computed.
+//  - INPUT0_OFFSET  - [tensor] Offset of the first element: the initial input position at which the convolution
+//                     filter is applied, and the corresponding output position.
+//  - FP16_SUPPORTED - [0/1] Value indicating whether device supports FP16 OpenCL extension (cl_khr_fp16).
+//  - FP16_UNIT_USED - [0/1] Value indicating that current kernel should use FP16.
+//  - UNIT_TYPE      - Type of unit of input/output/weight/bias.
+//  - UNIT_VAL_ZERO  - Literal of current UNIT_TYPE that represents 0.
+//  - RELU           - [0/1] Indicates that ReLU activation function should be used on output.
+//  - NEGATIVE_SLOPE - [float] Factor for negative output values (required when ReLU is specified).
+//
+//  - SUB_GROUP_SIZE - [int] Size of used subgroup (SIMD).
+//  - LEFTOVERS      - [int] Optional parameter, required only when the number of ofm is not divisible by SUB_GROUP_SIZE;
+//                     see the comment for FEATURES_THREADS_PER_BATCH for more information.
+
+/*
+gpu::make_jit_constant("OUTPUT_LIMIT", output_size),
+gpu::make_jit_constant("FILTER", filter_mem.argument().size),
+gpu::make_jit_constant("FILTER_ARRAY_NUM", split),
+gpu::make_jit_constant("OUTPUT_BLOCK_WIDTH", _kernel_data.block_width));
+gpu::make_jit_constant("OUTPUT_BLOCK_HEIGHT", _kernel_data.block_height));
+gpu::make_jit_constant("IN_BLOCK_ARRAY_SIZE", _kernel_data.input_block_array_size));
+gpu::make_jit_constant("IN_BLOCK_WIDTH", _kernel_data.input_block_width));
+gpu::make_jit_constant("PREFETCH", _kernel_data.prefetch));
+if (_kernel_data.leftovers)
+    gpu::make_jit_constant("LEFTOVERS", _kernel_data.leftovers));
+*/
+
+// FEATURES_THREADS_PER_BATCH defines how many threads in the z-dimension are processing a single batch.
+// Ideally, a z-dimension value of n should indicate processing of the n-th output feature. However, since
+// threads are stacked in groups of SUB_GROUP_SIZE, when the number of ofm is not divisible by SUB_GROUP_SIZE
+// there are dummy threads added in the z-dimension in the count of LEFTOVERS. We need to take them into consideration
+// while calculating the batch's id (see lines 86-87). Values calculated by dummy threads are discarded at line 210.
+#ifdef LEFTOVERS
+#define FEATURES_THREADS_PER_BATCH (FILTER_OFM_NUM + LEFTOVERS)
+#else
+#define FEATURES_THREADS_PER_BATCH (FILTER_OFM_NUM)
+#endif
+
+__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
+__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
+KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
+    const __global UNIT_TYPE* input,
+    __global UNIT_TYPE* output,
+    const __global UNIT_TYPE* weights,
+#if BIAS_TERM
+    const __global UNIT_TYPE* bias,
+#endif
+    uint split_idx,
+    const __global UNIT_TYPE* eltw_input) // TODO: removing this parameter causes a performance degradation...
:) +{ + const uint oc = (uint)get_global_id(0) * OUTPUT_BLOCK_WIDTH; // oc = Output Column + const uint or = (uint)get_global_id(1) * OUTPUT_BLOCK_HEIGHT; // or = Output Row + const uint fm = get_global_id(2); // fm = Feature Map = od = Output Depth + const uint lid = get_sub_group_local_id(); + + uint batch_idx = fm / FEATURES_THREADS_PER_BATCH; + uint feature_idx = fm % FEATURES_THREADS_PER_BATCH; + uint fmg = feature_idx / SUB_GROUP_SIZE; + + UNIT_TYPE in[IN_BLOCK_ARRAY_SIZE]; + UNIT_TYPE out[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT]; + UNIT_TYPE w[PREFETCH]; + uint in_addr; + uint weight_addr = fmg * FILTER_IFM_NUM * FILTER_SIZE_X * FILTER_SIZE_Y * SUB_GROUP_SIZE + lid; + + for(int i = 0; i < (OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT); i++) { + out[i] = UNIT_VAL_ZERO; + } + + uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM; + in_addr = batch_idx * INPUT0_BATCH_PITCH; + in_addr += in_split_offset + INPUT0_OFFSET_WITH_PADDING + or * STRIDE_SIZE_Y * INPUT0_Y_PITCH + oc * STRIDE_SIZE_X + lid; + + for(int kd = 0; kd < FILTER_IFM_NUM; kd++) // _ID = 3, RGB + { + uint tmp_in_addr = in_addr; + +#if IN_BLOCK_WIDTH % SUB_GROUP_SIZE == 0 + __attribute__((opencl_unroll_hint(IN_BLOCK_ARRAY_SIZE))) + for(uint in_block_pos = 0; in_block_pos < IN_BLOCK_ARRAY_SIZE * SUB_GROUP_SIZE; in_block_pos += SUB_GROUP_SIZE) { + // Horizontal position in input block after read. + const uint in_block_next_x_pos = in_block_pos % IN_BLOCK_WIDTH + SUB_GROUP_SIZE; + + in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + in_block_pos % IN_BLOCK_WIDTH]; + + // If we have row break, move to the next row. + if (in_block_next_x_pos == IN_BLOCK_WIDTH) + tmp_in_addr += INPUT0_Y_PITCH; + } +#elif (2 * IN_BLOCK_WIDTH) % SUB_GROUP_SIZE == 0 + __attribute__((opencl_unroll_hint(IN_BLOCK_ARRAY_SIZE))) + for(uint in_block_pos = 0; in_block_pos < IN_BLOCK_ARRAY_SIZE * SUB_GROUP_SIZE; in_block_pos += SUB_GROUP_SIZE) { + // Horizontal position in input block after read. + const uint in_block_next_x_pos = in_block_pos % IN_BLOCK_WIDTH + SUB_GROUP_SIZE; + + if (in_block_next_x_pos <= IN_BLOCK_WIDTH) { // + in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + in_block_pos % IN_BLOCK_WIDTH]; + + // If we have row break, move to the next row. + if (in_block_next_x_pos == IN_BLOCK_WIDTH) + tmp_in_addr += INPUT0_Y_PITCH; + } + else { + // TODO: Generalize this step to relax IN_BLOCK_WIDTH restrictions. + // Position in sub-group on which new row need to be read. + const uint sg_br_pos = IN_BLOCK_WIDTH - in_block_pos % IN_BLOCK_WIDTH; + + if (lid < sg_br_pos) + in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + in_block_pos % IN_BLOCK_WIDTH]; + // We have row break inside sub-group. Need to move to next line. + tmp_in_addr += INPUT0_Y_PITCH; + if (lid >= sg_br_pos) + in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr - sg_br_pos]; + + // If we have another row break, move to the next row. + if (in_block_next_x_pos == 2 * IN_BLOCK_WIDTH) + tmp_in_addr += INPUT0_Y_PITCH; + } + } +#else + #error IN_BLOCK_WIDTH must be multiple of SUB_GROUP_SIZE or half of SUB_GROUP_SIZE. Other scenarios are not currently implemented. +#endif + + //move to next filter + in_addr += INPUT0_FEATURE_PITCH; + + for(int pf=0; pf= OUTPUT_SIZE_Y)) + { + for(uint c = 0; c < OUTPUT_BLOCK_WIDTH; c++) { + // this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer. 
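+                // Columns past OUTPUT_SIZE_X belong to the partial block at the right
+                // edge and are skipped, so the write stays inside the output buffer.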
+ if(!(oc + c >= OUTPUT_SIZE_X)) + { +#if IN_OUT_OPT == 1 + out[r * OUTPUT_BLOCK_WIDTH + c] += output[out_addr + r * OUTPUT_Y_PITCH + c]; + out[r * OUTPUT_BLOCK_WIDTH + c] = ACTIVATION_ELTW(out[r * OUTPUT_BLOCK_WIDTH + c], NL_M_ELTW, NL_N_ELTW); +#endif + output[out_addr + r * OUTPUT_Y_PITCH + c] = out[r * OUTPUT_BLOCK_WIDTH + c]; + } + } + } + } +} + +#undef FEATURES_THREADS_PER_BATCH diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_gemm_fp32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_gemm_fp32.cl new file mode 100644 index 0000000..022431d --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_gemm_fp32.cl @@ -0,0 +1,602 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "include/include_all.cl" +#include "include/sub_group.cl" +#include "include/fetch.cl" + +#define TILE_M 2 +#define TILE_K FILTER_SIZE_X +#define TILE_N 32 + +inline uint FUNC(calculate_eltw_input_offset_based_on_output_offset)(uint out_offset, uint strideX, uint strideY) +{ +// bfyx + uint tmp_idx = out_offset; + uint x_idx = tmp_idx % OUTPUT_SIZE_X; + x_idx *= strideX; + tmp_idx /= OUTPUT_SIZE_X; + uint y_idx = tmp_idx % OUTPUT_SIZE_Y; + y_idx *= strideY; + tmp_idx /= OUTPUT_SIZE_Y; + uint f_idx = tmp_idx % OUTPUT_FEATURE_NUM; + tmp_idx /= OUTPUT_FEATURE_NUM; + uint b_idx = tmp_idx % OUTPUT_BATCH_NUM; + + return GET_DATA_INDEX(INPUT1, b_idx, f_idx, y_idx, x_idx); +} + +__attribute__((intel_reqd_sub_group_size(8))) +KERNEL(fused_conv_eltwise_gemm_fp32)( + const __global float *src0, + __global float *dst, + const __global float *src1, +#if BIAS_TERM + const __global float *bias, +#endif + uint split_idx, + const __global float* src3) +{ +#include "include/vec_typedefs.cl" + + const unsigned group_x = get_group_id(0); + const unsigned group_y = get_group_id(1); + const unsigned global_x = get_global_id(0); + const unsigned global_y = get_global_id(1); + const unsigned global_z = get_global_id(2); + + unsigned interleaved_y; + unsigned kernel_y; + unsigned kernel_idx; + + // Result ctile (*dst) is M rows x N columns + // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile. + float8 blockC00 = 0.f; + float8 blockC10 = 0.f; + float8 blockC20 = 0.f; + float8 blockC30 = 0.f; + float8 blockC01 = 0.f; + float8 blockC11 = 0.f; + float8 blockC21 = 0.f; + float8 blockC31 = 0.f; + + const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * INPUT0_FEATURE_NUM; + // Src0 (patch input) is directly used as atile. + // Each work item points to the start of a different patch. + // atile is M rows x K columns. 
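+    // Each of the TILE_M rows maps a linear output index back to its input patch:
+    // division by OUTPUT_SIZE_X yields the y component (scaled by the vertical
+    // stride) and the remainder yields the x component (scaled by the horizontal stride).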
+ const uint src0_read_offset0_const = INPUT0_OFFSET_WITH_PADDING + in_split_offset + + INPUT0_BATCH_PITCH * global_z // batch offset + + ( ( ( global_y * TILE_M + 0 ) / OUTPUT_SIZE_X ) * STRIDE_SIZE_Y * INPUT0_Y_PITCH ) // y offset + + ( ( ( global_y * TILE_M + 0 ) % OUTPUT_SIZE_X ) * STRIDE_SIZE_X ); // x offset + const uint src0_read_offset1_const = INPUT0_OFFSET_WITH_PADDING + in_split_offset + + INPUT0_BATCH_PITCH * global_z // batch offset + + ( ( ( global_y * TILE_M + 1 ) / OUTPUT_SIZE_X ) * STRIDE_SIZE_Y * INPUT0_Y_PITCH ) // y offset + + ( ( ( global_y * TILE_M + 1 ) % OUTPUT_SIZE_X ) * STRIDE_SIZE_X ); // x offset + + // Src1 (filter) is directly used as btile. + // It starts at the top of src1 and walks down. + // btile is K rows x N columns. + uint src0_read_offset0 = src0_read_offset0_const; + uint src0_read_offset1 = src0_read_offset1_const; + uint src1_read_offset = ( global_x * TILE_N * 2); + +#define DOT_PRODUCT_8( _result, _rowA, colB ) \ + { \ + _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \ + _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \ + _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \ + _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \ + _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \ + _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \ + _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \ + _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \ + } + + // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1. + // Inner loop loads and FMADs one row (FILTER_SIZE_X) of each input patch + // and FILTER_SIZE_X/2 rows of interleaved filter. + unsigned patch_depth = 0; + do + { + unsigned patch_row = 0; + do + { + // Load atile and btile. + // Kernel data is partially interleaved. Every 2 rows are interleaved at float8 granularity. + // The exception is that if FILTER_SIZE_X is odd the last row is not interleaved. The non + // interleaved row is padded with zero to ensure same size as interleaved rows. This + // interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the + // kernel data would be arranged before/after interleaving for FILTER_SIZE_X=3. + // (0, 0) (8, 0) (16, 0) (24, 0) ... (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) .. + // (0, 1) (8, 1) (16, 1) (24, 1) ... => (0, 2) (8, 2) (16, 2) (24, 2) ... + // (0, 2) (8, 2) (16, 2) (24, 2) ... ... + // ... 
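+            // FILTER_SIZE_X_DIV2 interleaved row pairs are block-read as float8 below;
+            // for an odd FILTER_SIZE_X, the final non-interleaved row is read as float4.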
+ const bool kernel_width_is_odd = FILTER_SIZE_X % 2 == 1; + + float blockA00[FILTER_SIZE_X]; + float blockA01[FILTER_SIZE_X]; + + // in case the data is not aligned to sizeof(T)*FILTER_SIZE_X we need to use vload or set the data in a loop + { + unsigned i = 0; + LOOP(FILTER_SIZE_X, i, + { +#if LEFTOVERS == 1 + if(src0_read_offset0_const + (FILTER_SIZE_Y - 1) * INPUT0_Y_PITCH + (INPUT0_FEATURE_NUM - 1) * (INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH )) >= INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH) + { + if(src0_read_offset0 + i < INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH) + blockA00[i] = src0[src0_read_offset0 + i]; + } + else +#endif + blockA00[i] = src0[src0_read_offset0 + i]; + +#if LEFTOVERS == 1 + if(src0_read_offset1_const + (FILTER_SIZE_Y - 1) * INPUT0_Y_PITCH + (INPUT0_FEATURE_NUM - 1) * (INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH )) >= INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH) + { + if(src0_read_offset1 + i < INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH) + blockA01[i] = src0[src0_read_offset1 + i]; + } + else +#endif + blockA01[i] = src0[src0_read_offset1 + i]; + } ) + } + + float* pblockA00 = (float*)(&blockA00); + float* pblockA01 = (float*)(&blockA01); + + src0_read_offset0 += INPUT0_Y_PITCH; + src0_read_offset1 += INPUT0_Y_PITCH; + + + float blockB00[FILTER_SIZE_X*4]; + float8* p8BlockB00 = (float8*)blockB00; + float4* p4BlockB00 = (float4*)blockB00; + float* pBlockB00 = (float* )blockB00; + + interleaved_y = 0; + LOOP(FILTER_SIZE_X_DIV2, interleaved_y, + { + p8BlockB00[interleaved_y] = as_float8( intel_sub_group_block_read8( (const __global uint*)src1 + src1_read_offset ) ); + src1_read_offset += ALIGNED_OFM * 2; + } ) + if ( kernel_width_is_odd ) + { + p4BlockB00[FILTER_SIZE_X - 1] = as_float4( intel_sub_group_block_read4( (const __global uint*)src1 + src1_read_offset ) ); + src1_read_offset += ALIGNED_OFM * 2; + } + + // Perform MADs + kernel_idx = 0; + interleaved_y = 0; + LOOP(FILTER_SIZE_X_DIV2, interleaved_y, + { + kernel_y = interleaved_y * 2; + DOT_PRODUCT_8( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); + DOT_PRODUCT_8( blockC01, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; + DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); + DOT_PRODUCT_8( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; + DOT_PRODUCT_8( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); + DOT_PRODUCT_8( blockC11, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; + DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); + DOT_PRODUCT_8( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; + DOT_PRODUCT_8( blockC20, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); + DOT_PRODUCT_8( blockC21, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; + DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); + DOT_PRODUCT_8( blockC21, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; + DOT_PRODUCT_8( blockC30, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); + DOT_PRODUCT_8( blockC31, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; + DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); + DOT_PRODUCT_8( blockC31, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; + } ) + if ( kernel_width_is_odd ) + { + kernel_y = interleaved_y * 2; + DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); + DOT_PRODUCT_8( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); 
kernel_idx++; + DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); + DOT_PRODUCT_8( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; + DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] ); + DOT_PRODUCT_8( blockC21, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; + DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] ); + DOT_PRODUCT_8( blockC31, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; + } + } + + //while( ++patch_row < 1 ); //debug + while( ++patch_row < FILTER_SIZE_Y ); + + src0_read_offset0 += INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH ); // reset to start of next slice of patch + src0_read_offset1 += INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH ); // reset to start of next slice of patch + } + //while ( ++patch_depth < 1 ); //debug + while ( ++patch_depth < INPUT0_FEATURE_NUM ); + + const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM; + // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: + // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. + __global float *out0 = dst + OUTPUT_OFFSET + out_split_offset + + global_z * OUTPUT_BATCH_PITCH // batch offset + + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH // channel offset + + ( ( global_y * TILE_M ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH // y offset + + ( ( global_y * TILE_M ) % OUTPUT_SIZE_X ); // x offset + __global float *out1 = dst + OUTPUT_OFFSET + out_split_offset + + global_z * OUTPUT_BATCH_PITCH // batch offset + + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH // channel offset + + ( ( global_y * TILE_M + 1 ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH // y offset + + ( ( global_y * TILE_M + 1 ) % OUTPUT_SIZE_X ); // x offset + + #if BIAS_TERM + __global float8* biasPtr = (__global float8*) (bias + group_x * TILE_N); + #endif + + uint out0_offset = OUTPUT_OFFSET + out_split_offset + + global_z * OUTPUT_BATCH_PITCH // batch offset + + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH // channel offset + + ( ( global_y * TILE_M ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH // y offset + + ( ( global_y * TILE_M ) % OUTPUT_SIZE_X ); // x offset + + uint out1_offset = OUTPUT_OFFSET + out_split_offset + + global_z * OUTPUT_BATCH_PITCH // batch offset + + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH // channel offset + + ( ( global_y * TILE_M + 1 ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH // y offset + + ( ( global_y * TILE_M + 1 ) % OUTPUT_SIZE_X ); + + //-----------------------------------------------------------------------------------------------// + // OUTPUT PHASE + //-----------------------------------------------------------------------------------------------// + if( global_y * TILE_M < OUTPUT_SIZE_X * OUTPUT_SIZE_Y ) + { + if ( ( OUTPUT_FEATURE_NUM % TILE_N ) == 0 ) + { + #if BIAS_TERM + blockC00 += *biasPtr; + blockC10 += *(biasPtr + 1); + blockC20 += *(biasPtr + 2); + blockC30 += *(biasPtr + 3); + #endif + + blockC00 = ACTIVATION(blockC00, NL_M, NL_N); + blockC10 = ACTIVATION(blockC10, NL_M, NL_N); + blockC20 = ACTIVATION(blockC20, NL_M, NL_N); + blockC30 = ACTIVATION(blockC30, NL_M, NL_N); + + // eltwise + uint src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out0_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y); + for(uint i = 0; i < 8; i++) + { + blockC00[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH]; + blockC10[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH]; + blockC20[i] += src3[src3_offset + 
(i + 16)* INPUT1_FEATURE_PITCH]; + blockC30[i] += src3[src3_offset + (i + 24)* INPUT1_FEATURE_PITCH]; + } + + blockC00 = ACTIVATION_ELTW(blockC00, NL_M_ELTW, NL_N_ELTW); + blockC10 = ACTIVATION_ELTW(blockC10, NL_M_ELTW, NL_N_ELTW); + blockC20 = ACTIVATION_ELTW(blockC20, NL_M_ELTW, NL_N_ELTW); + blockC30 = ACTIVATION_ELTW(blockC30, NL_M_ELTW, NL_N_ELTW); + // end eltwise + + for( unsigned i = 0; i < 8; i++ ) + { + out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i]; + out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i]; + out0[(16+i) * OUTPUT_FEATURE_PITCH] = blockC20[i]; + out0[(24+i) * OUTPUT_FEATURE_PITCH] = blockC30[i]; + } + } + else + { + if ( ( global_x + 1 ) < get_global_size(0) ) + { + #if BIAS_TERM + blockC00 += *biasPtr; + blockC10 += *(biasPtr + 1); + blockC20 += *(biasPtr + 2); + blockC30 += *(biasPtr + 3); + #endif + + blockC00 = ACTIVATION(blockC00, NL_M, NL_N); + blockC10 = ACTIVATION(blockC10, NL_M, NL_N); + blockC20 = ACTIVATION(blockC20, NL_M, NL_N); + blockC30 = ACTIVATION(blockC30, NL_M, NL_N); + + // eltwise + uint src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out0_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y); + for(uint i = 0; i < 8; i++) + { + blockC00[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH]; + blockC10[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH]; + blockC20[i] += src3[src3_offset + (i + 16)* INPUT1_FEATURE_PITCH]; + blockC30[i] += src3[src3_offset + (i + 24)* INPUT1_FEATURE_PITCH]; + } + + blockC00 = ACTIVATION_ELTW(blockC00, NL_M_ELTW, NL_N_ELTW); + blockC10 = ACTIVATION_ELTW(blockC10, NL_M_ELTW, NL_N_ELTW); + blockC20 = ACTIVATION_ELTW(blockC20, NL_M_ELTW, NL_N_ELTW); + blockC30 = ACTIVATION_ELTW(blockC30, NL_M_ELTW, NL_N_ELTW); + // end eltwise + + for ( unsigned i = 0; i < 8; i++ ) + { + out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i]; + out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i]; + out0[(16+i) * OUTPUT_FEATURE_PITCH] = blockC20[i]; + out0[(24+i) * OUTPUT_FEATURE_PITCH] = blockC30[i]; + } + } + else + { + if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 24 ) + { + #if BIAS_TERM + blockC00 += *biasPtr; + blockC10 += *(biasPtr + 1); + blockC20 += *(biasPtr + 2); + if (( OUTPUT_FEATURE_NUM % TILE_N) > 24 ) blockC30 += *(biasPtr + 3); + #endif + + blockC00 = ACTIVATION(blockC00, NL_M, NL_N); + blockC10 = ACTIVATION(blockC10, NL_M, NL_N); + blockC20 = ACTIVATION(blockC20, NL_M, NL_N); + + // remaining output channels + for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) + { + blockC30[i] = ACTIVATION(blockC30[i], NL_M, NL_N); + } + + // eltwise + uint src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out0_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y); + for(uint i = 0; i < 8; i++) + { + blockC00[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH]; + blockC10[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH]; + blockC20[i] += src3[src3_offset + (i + 16)* INPUT1_FEATURE_PITCH]; + } + + // remaining output channels + for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) + { + blockC30[i] += src3[src3_offset + (i + 24 )* INPUT1_FEATURE_PITCH]; + blockC30[i] = ACTIVATION_ELTW(blockC30[i], NL_M_ELTW, NL_N_ELTW); + } + + blockC00 = ACTIVATION_ELTW(blockC00, NL_M_ELTW, NL_N_ELTW); + blockC10 = ACTIVATION_ELTW(blockC10, NL_M_ELTW, NL_N_ELTW); + blockC20 = ACTIVATION_ELTW(blockC20, NL_M_ELTW, NL_N_ELTW); + // end eltwise + + for (unsigned i = 0; i < 8; i++) + { + out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i]; + out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i]; + out0[(16+i) * 
OUTPUT_FEATURE_PITCH] = blockC20[i]; + } + + // remaining output channels + for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) + { + out0[(24+i) * OUTPUT_FEATURE_PITCH] = blockC30[i]; + } + } + else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 16 ) + { + #if BIAS_TERM + blockC00 += *biasPtr; + blockC10 += *(biasPtr + 1); + if (( OUTPUT_FEATURE_NUM % TILE_N) > 16 ) + blockC20 += *(biasPtr + 2); + #endif + + blockC00 = ACTIVATION(blockC00, NL_M, NL_N); + blockC10 = ACTIVATION(blockC10, NL_M, NL_N); + + for (unsigned i = 0; i < 8; i++) + { + out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i]; + out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i]; + } + + for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) + { + out0[(16+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC20[i], NL_M, NL_N); + + } + } + else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 8 ) + { + #if BIAS_TERM + blockC00 += *biasPtr; + if (( OUTPUT_FEATURE_NUM % TILE_N) > 8 ) + blockC10 += *(biasPtr + 1); + #endif + + blockC00 = ACTIVATION(blockC00, NL_M, NL_N); + + for (unsigned i = 0; i < 8; i++) + { + out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i]; + } + + for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) + { + out0[(8+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC10[i], NL_M, NL_N); + } + } + else + { + #if BIAS_TERM + blockC00 += *biasPtr; + #endif + for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) + { + out0[( 0+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC00[i], NL_M, NL_N); + } + } + } + } + } + + if ((global_y * TILE_M + 1) < OUTPUT_SIZE_X * OUTPUT_SIZE_Y ) + { + if ( ( OUTPUT_FEATURE_NUM % TILE_N ) == 0 ) + { + #if BIAS_TERM + blockC01 += *biasPtr; + blockC11 += *(biasPtr + 1); + blockC21 += *(biasPtr + 2); + blockC31 += *(biasPtr + 3); + #endif + + blockC01 = ACTIVATION(blockC01, NL_M, NL_N); + blockC11 = ACTIVATION(blockC11, NL_M, NL_N); + blockC21 = ACTIVATION(blockC21, NL_M, NL_N); + blockC31 = ACTIVATION(blockC31, NL_M, NL_N); + + // eltwise + uint src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out1_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y); + for(uint i = 0; i < 8; i++) + { + blockC01[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH]; + blockC11[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH]; + blockC21[i] += src3[src3_offset + (i + 16)* INPUT1_FEATURE_PITCH]; + blockC31[i] += src3[src3_offset + (i + 24)* INPUT1_FEATURE_PITCH]; + } + + blockC01 = ACTIVATION_ELTW(blockC01, NL_M_ELTW, NL_N_ELTW); + blockC11 = ACTIVATION_ELTW(blockC11, NL_M_ELTW, NL_N_ELTW); + blockC21 = ACTIVATION_ELTW(blockC21, NL_M_ELTW, NL_N_ELTW); + blockC31 = ACTIVATION_ELTW(blockC31, NL_M_ELTW, NL_N_ELTW); + // end eltwise + + for( unsigned i = 0; i < 8; i++ ) + { + out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i]; + out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i]; + out1[(16+i) * OUTPUT_FEATURE_PITCH] = blockC21[i]; + out1[(24+i) * OUTPUT_FEATURE_PITCH] = blockC31[i]; + } + } + else + { + if ( ( global_x + 1 ) < get_global_size(0) ) + { + #if BIAS_TERM + blockC01 += *biasPtr; + blockC11 += *(biasPtr + 1); + blockC21 += *(biasPtr + 2); + blockC31 += *(biasPtr + 3); + #endif + + blockC01 = ACTIVATION(blockC01, NL_M, NL_N); + blockC11 = ACTIVATION(blockC11, NL_M, NL_N); + blockC21 = ACTIVATION(blockC21, NL_M, NL_N); + blockC31 = ACTIVATION(blockC31, NL_M, NL_N); + + for ( unsigned i = 0; i < 8; i++ ) + { + out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i]; + out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i]; + out1[(16+i) * OUTPUT_FEATURE_PITCH] = blockC21[i]; + out1[(24+i) * OUTPUT_FEATURE_PITCH] = 
blockC31[i]; + } + } + else + { + if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 24 ) + { + #if BIAS_TERM + blockC01 += *biasPtr; + blockC11 += *(biasPtr + 1); + blockC21 += *(biasPtr + 2); + if ( ( OUTPUT_FEATURE_NUM % TILE_N ) > 24 ) blockC31 += *(biasPtr + 3); + #endif + + blockC01 = ACTIVATION(blockC01, NL_M, NL_N); + blockC11 = ACTIVATION(blockC11, NL_M, NL_N); + blockC21 = ACTIVATION(blockC21, NL_M, NL_N); + + for (unsigned i = 0; i < 8; i++) + { + out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i]; + out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i]; + out1[(16+i) * OUTPUT_FEATURE_PITCH] = blockC21[i]; + } + + // Remaining channels + for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) + { + out1[(24+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC31[i], NL_M, NL_N); + } + } + else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 16 ) + { + #if BIAS_TERM + blockC01 += *biasPtr; + blockC11 += *(biasPtr + 1); + if ( ( OUTPUT_FEATURE_NUM % TILE_N ) > 16 ) blockC21 += *(biasPtr + 2); + #endif + + blockC01 = ACTIVATION(blockC01, NL_M, NL_N); + blockC11 = ACTIVATION(blockC11, NL_M, NL_N); + + for (unsigned i = 0; i < 8; i++) + { + out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i]; + out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i]; + } + + for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) + { + out1[(16+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC21[i], NL_M, NL_N); + } + } + else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 8 ) + { + #if BIAS_TERM + blockC01 += *biasPtr; + if ( ( OUTPUT_FEATURE_NUM % TILE_N ) > 8 ) blockC11 += *(biasPtr + 1); + #endif + + blockC01 = ACTIVATION(blockC01, NL_M, NL_N); + + for (unsigned i = 0; i < 8; i++) + { + out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i]; + } + + for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) + { + out1[(8+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC11[i], NL_M, NL_N); + } + } + else + { + #if BIAS_TERM + blockC01 += *biasPtr; + #endif + + for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) + { + out1[( 0+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION(blockC01[i], NL_M, NL_N); + } + } + } + } + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_128x128wg_slm_int8.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_128x128wg_slm_int8.cl new file mode 100644 index 0000000..68f3bdf --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_128x128wg_slm_int8.cl @@ -0,0 +1,509 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
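+// For reference: a scalar sketch of what the full-quantization path of the
+// QUANTIZATION(idx) macro below computes per output value. The names 'acc',
+// 'eltw_in', 'quant', 'bias', 'calib' and 'eltw_calib' are illustrative
+// stand-ins for one lane of the vectors used in the macro:
+//
+//   char  q = ACTIVATION(convert_char(round((acc * quant * I_QF + bias) * calib)), NL_M, NL_N);
+//   int   s = (int)q + (int)as_char(eltw_in);   // fused eltwise add on the quantized value
+//   uchar r = as_uchar(ACTIVATION_ELTW(convert_char((int)round((float)s * eltw_calib)), NL_M_ELTW, NL_N_ELTW));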
+ +#include "include/mmad.cl" + +#define SUM_SCALE 0.11f +#define SCALE 0.11f + +#ifdef LIGHTWEIGHT_QUANTIZATION + +#define QUANTIZATION(idx) \ + {\ + float4 tmp;\ + for(uint z = 0; z < 4; z++)\ + {\ + tmp.s0 = (float)eltw_input_vals[z * 4 + 0] * SUM_SCALE + bias_f.s0;\ + tmp.s1 = (float)eltw_input_vals[z * 4 + 1] * SUM_SCALE + bias_f.s1;\ + tmp.s2 = (float)eltw_input_vals[z * 4 + 2] * SUM_SCALE + bias_f.s2;\ + tmp.s3 = (float)eltw_input_vals[z * 4 + 3] * SUM_SCALE + bias_f.s3;\ + \ + regC_uchar16[z * 4 + 0] = convert_uchar_sat( (regC[0 * 4 + i][idx + z / 4]) * SCALE + tmp.s0);\ + regC_uchar16[z * 4 + 1] = convert_uchar_sat( (regC[1 * 4 + i][idx + z / 4]) * SCALE + tmp.s1);\ + regC_uchar16[z * 4 + 2] = convert_uchar_sat( (regC[2 * 4 + i][idx + z / 4]) * SCALE + tmp.s2);\ + regC_uchar16[z * 4 + 3] = convert_uchar_sat( (regC[3 * 4 + i][idx + z / 4]) * SCALE + tmp.s3);\ + }\ + } + +#elif NO_QUANTIZATION + +#define QUANTIZATION(idx) \ + regC_uchar16.s0 = regC[0 * 4 + i][idx];\ + regC_uchar16.s1 = regC[1 * 4 + i][idx];\ + regC_uchar16.s2 = regC[2 * 4 + i][idx];\ + regC_uchar16.s3 = regC[3 * 4 + i][idx];\ + \ + regC_uchar16.s4 = regC[0 * 4 + i][idx+1];\ + regC_uchar16.s5 = regC[1 * 4 + i][idx+1];\ + regC_uchar16.s6 = regC[2 * 4 + i][idx+1];\ + regC_uchar16.s7 = regC[3 * 4 + i][idx+1];\ + \ + regC_uchar16.s8 = regC[0 * 4 + i][idx+2];\ + regC_uchar16.s9 = regC[1 * 4 + i][idx+2];\ + regC_uchar16.sa = regC[2 * 4 + i][idx+2];\ + regC_uchar16.sb = regC[3 * 4 + i][idx+2];\ + \ + regC_uchar16.sc = regC[0 * 4 + i][idx+3];\ + regC_uchar16.sd = regC[1 * 4 + i][idx+3];\ + regC_uchar16.se = regC[2 * 4 + i][idx+3];\ + regC_uchar16.sf = regC[3 * 4 + i][idx+3];\ + {\ + int16 sum;\ + for(uint s = 0; s <16; s++)\ + {\ + sum[s] = (int)as_char(regC_uchar16[s]) + (int)as_char(eltw_input_vals[s]);\ + }\ + regC_uchar16.s0 = convert_uchar_sat( sum.s0 );\ + regC_uchar16.s1 = convert_uchar_sat( sum.s1 );\ + regC_uchar16.s2 = convert_uchar_sat( sum.s2 );\ + regC_uchar16.s3 = convert_uchar_sat( sum.s3 );\ + \ + regC_uchar16.s4 = convert_uchar_sat( sum.s4 );\ + regC_uchar16.s5 = convert_uchar_sat( sum.s5 );\ + regC_uchar16.s6 = convert_uchar_sat( sum.s6 );\ + regC_uchar16.s7 = convert_uchar_sat( sum.s7 );\ + \ + regC_uchar16.s8 = convert_uchar_sat( sum.s8 );\ + regC_uchar16.s9 = convert_uchar_sat( sum.s9 );\ + regC_uchar16.sa = convert_uchar_sat( sum.sa );\ + regC_uchar16.sb = convert_uchar_sat( sum.sb );\ + \ + regC_uchar16.sc = convert_uchar_sat( sum.sc );\ + regC_uchar16.sd = convert_uchar_sat( sum.sd );\ + regC_uchar16.se = convert_uchar_sat( sum.se );\ + regC_uchar16.sf = convert_uchar_sat( sum.sf );\ + } + +#else + +#define QUANTIZATION(idx) \ + regC_uchar16.s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\ + regC_uchar16.s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\ + regC_uchar16.s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\ + regC_uchar16.s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));\ + \ + regC_uchar16.s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+1]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\ + regC_uchar16.s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+1]) * 
quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\ + regC_uchar16.s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+1]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\ + regC_uchar16.s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+1]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));\ + \ + regC_uchar16.s8 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+2]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\ + regC_uchar16.s9 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+2]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\ + regC_uchar16.sa = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+2]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\ + regC_uchar16.sb = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+2]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));\ + \ + regC_uchar16.sc = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+3]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\ + regC_uchar16.sd = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+3]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\ + regC_uchar16.se = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+3]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\ + regC_uchar16.sf = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+3]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));\ + {\ + int16 sum;\ + for(uint s = 0; s <16; s++)\ + {\ + sum[s] = (int)as_char(regC_uchar16[s]) + (int)as_char(eltw_input_vals[s]);\ + }\ + regC_uchar16.s0 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s0) * eltw_calib_f.s0)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.s1 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s1) * eltw_calib_f.s1)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.s2 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s2) * eltw_calib_f.s2)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.s3 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s3) * eltw_calib_f.s3)), NL_M_ELTW, NL_N_ELTW));\ + \ + regC_uchar16.s4 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s4) * eltw_calib_f.s0)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.s5 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s5) * eltw_calib_f.s1)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.s6 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s6) * eltw_calib_f.s2)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.s7 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s7) * eltw_calib_f.s3)), NL_M_ELTW, NL_N_ELTW));\ + \ + regC_uchar16.s8 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s8) * eltw_calib_f.s0)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.s9 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s9) * eltw_calib_f.s1)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.sa = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.sa) * eltw_calib_f.s2)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.sb = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.sb) * eltw_calib_f.s3)), NL_M_ELTW, NL_N_ELTW));\ + \ + regC_uchar16.sc = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.sc) * eltw_calib_f.s0)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.sd = 
as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.sd) * eltw_calib_f.s1)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.se = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.se) * eltw_calib_f.s2)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.sf = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.sf) * eltw_calib_f.s3)), NL_M_ELTW, NL_N_ELTW));\ + } +#endif + + +inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset) +{ +#if OUT_WITH_PADDING == 1 + uint tmp_idx = cOffset; + uint f_val_idx = tmp_idx % 32; + tmp_idx /= 32; + uint b_val_idx = tmp_idx % 4; + tmp_idx /= 4; + uint x_idx = tmp_idx % OUTPUT_SIZE_X; + tmp_idx /= OUTPUT_SIZE_X; + uint y_idx = tmp_idx % OUTPUT_SIZE_Y; + tmp_idx /= OUTPUT_SIZE_Y; + uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); + tmp_idx /= (OUTPUT_BATCH_NUM / 4); + uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); + + uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH; + padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH; + padded_offset += y_idx * OUT_Y_PITCH; + padded_offset += x_idx * OUT_X_PITCH; + padded_offset += b_val_idx * 32; + padded_offset += f_val_idx; + padded_offset += OUT_OFFSET; + + return padded_offset; +#else + return cOffset; +#endif +} + +#if IN_OUT_OPT != 1 +inline uint FUNC(calculate_eltw_input_offset_based_on_output_offset_account_padding)(uint cOffset, uint strideX, uint strideY) +{ +#if ELTW_WITH_PADDING == 1 || ELTW_STRIDE_X != 1 || ELTW_STRIDE_Y != 1 + uint tmp_idx = cOffset; + uint f_val_idx = tmp_idx % 32; + tmp_idx /= 32; + uint b_val_idx = tmp_idx % 4; + tmp_idx /= 4; + uint x_idx = tmp_idx % OUTPUT_SIZE_X; + x_idx *= strideX; + tmp_idx /= OUTPUT_SIZE_X; + uint y_idx = tmp_idx % OUTPUT_SIZE_Y; + y_idx *= strideY; + tmp_idx /= OUTPUT_SIZE_Y; + uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); + tmp_idx /= (OUTPUT_BATCH_NUM / 4); + uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); + + uint padded_offset = f_slice_idx * IN2_F_BLOCK_PITCH; + padded_offset += b_slice_idx * IN2_B_BLOCK_PITCH; + padded_offset += y_idx * IN2_Y_PITCH; + padded_offset += x_idx * IN2_X_PITCH; + padded_offset += b_val_idx * 32; + padded_offset += f_val_idx; + padded_offset += IN2_OFFSET; + + return padded_offset; +#else + return cOffset; +#endif +} +#endif + +inline void FUNC(mmad_32x32_int8)( __local uint* l_tileA, const uint l_offsetTileA, + __local int8* l_tileB, const uint l_offsetTileB_col0, + const uint l_offsetTileB_col1, const uint l_offsetTileB_col2, + const uint l_offsetTileB_col3, int8* rowA, int8* colB, + int8* regC) +{ + // Read tile A from SLM to regA + uint l_offsetTileATemp = l_offsetTileA; + __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp])); + l_offsetTileATemp += 8 * SG_SIZE; + } + // Read tile B from SLM to regB and compute mmad + colB[0] = l_tileB[l_offsetTileB_col0]; + colB[1] = l_tileB[l_offsetTileB_col1]; + __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + // Compute partial C + regC[0*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[0], regC[0*(SIMD_LANE_M / 8) + j]); + } + colB[0] = l_tileB[l_offsetTileB_col2]; + __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + // Compute partial C + regC[1*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[1], regC[1*(SIMD_LANE_M / 8) + j] ); + } + colB[1] = l_tileB[l_offsetTileB_col3]; + 
__attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + // Compute partial C + regC[2*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2*(SIMD_LANE_M / 8) + j]); + } + __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) + for (uint j = 0; j < (SG_TILE_M / 8); ++j) + { + // Compute partial C + regC[3*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3*(SIMD_LANE_M / 8) + j]); + } +} + +/* + * \brief GEMM kernel to compute MxN matrix using SLM + * \param g_inA - Input matrix + * \param g_inB - Input matrix + * \param g_outC - Output matrix + */ + +__attribute__((intel_reqd_sub_group_size(SG_SIZE))) +KERNEL(Kernel_GEMM_MMAD8_32x32SG_128x128WG_SLM_INT8_fused_eltwise) + ( + __global char* const g_inA, + __global int* g_outC, + __global char* const g_inB, + #if BIAS_TERM + __global BIAS_TYPE* biases, + #endif + __global float* quantizations, + #if CALIBRATION_TERM + __global float* calibrations, + #endif + uint split_idx, + __global char* const input2, + __global float* eltw_calibrations + ) +{ + + __global int4* const g_matrixA = (__global int4*)g_inA; + __global int4* const g_matrixB = (__global int4*)g_inB; + __global int8* g_matrixC = (__global int8*)g_outC; + + // Each work-group works to compute 128x128 tile. + // Each work-group contains 16 sub-groups. + // Each sub-group within the work-group works to compute a 32x32 tile. + // 1) All work-items in WG fill SLM with tileA (128x32) and tileB (32x128). + // 2) Each sub-group works to compute 32x32 tileC (stored in regC). + // Note that each work-item in the sub-group computes a 32x4 chunk of tileC. + // 3) Repeat until tileC is fully computed (while moving tileA and tileB "windows") + __local int8 l_workGroupTileA[2 * (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)]; // [2*128*32/8] = 1024 + __local int8 l_workGroupTileB[2 * (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)]; // [2*128*32/8] = 1024 + + __local uint* l_workGroupTileA_uint = (__local uint*)l_workGroupTileA; + __local int4* l_workGroupTileA_int4 = (__local int4*)l_workGroupTileA; + __local int4* l_workGroupTileB_int4 = (__local int4*)l_workGroupTileB; + + const uint l_groupSize = get_local_size(DIM_X) * get_local_size(DIM_Y); + + const uint l_pingPongOffsetA_uint = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(uint); + const uint l_pingPongOffsetB_int8 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8); + const uint l_pingPongOffsetA_int4 = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int4); + const uint l_pingPongOffsetB_int4 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int4); + + // Thread IDs + const uint g_tidY = get_global_id(DIM_Y); // 0,...,all_wi_inY + const uint g_tidX = get_global_id(DIM_X); // 0,...,all_wi_inX + const uint l_tidX = get_local_id(DIM_X); // 0,...,31 in WG + const uint l_tidY = get_local_id(DIM_Y); // 0,1,2,3 in WG + const uint l_tid = l_tidY * get_local_size(DIM_X) + l_tidX; // 0,1,2,...127 + + // SubGroup IDs + const uint sg_tid = get_sub_group_local_id(); // 0,1,...,8 + const uint sg_global_idX = (uint)(g_tidX / SG_SIZE); //{0}/8 + const uint sg_global_idY = g_tidY; //{0} + + const uint sg_local_idX = (uint)(l_tidX / SG_SIZE); // {0,...,31}/8={0,0,0,0,0...,1,1,1,...,3,3,3} + const uint sg_local_idY = l_tidY; // 0,1,2,3 + const uint sg_local_id = sg_local_idY * get_local_size(DIM_X) / SG_SIZE + sg_local_idX; // get_local_size(DIM_X) / SG_SIZE = 32/8 = 4 + + const uint sub_group_id = get_sub_group_id(); + + + // Registers + int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // Each work-item responsible for 32x4 ints elts // 
(32/8)*4 + int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item will hold 1/8 of matrixA + int8 colB[2]; // each lane will store 32x4 piece of matrixB + + // SLM indices + const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * sg_local_idY; + const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8); + const uint numElements32x8TileB = numElements32x32TileB / 4; + const uint l_offsetTileB = numElements32x32TileB * sg_local_idX; + const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid; + const uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid; + const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid; + const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid; + + // Global indices + uint g_idxA[2]; + uint g_idxB[2]; +#ifdef TILED_GLOBAL_LAYOUT // 32-row major (matrixA) and 32-col major (matrixB) + g_idxA[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * get_group_id(DIM_Y) + l_tid; + g_idxB[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * get_group_id(DIM_X) + l_tid; + g_idxA[1] = g_idxA[0] + l_groupSize; + g_idxB[1] = g_idxB[0] + l_groupSize; +#else // Row (matrixA) and Col (matrixB) major layout + g_idxA[0] = WG_TILE_M * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_Y) + + (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); + g_idxB[0] = WG_TILE_N * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_X) + + (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); + g_idxA[1] = g_idxA[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); + g_idxB[1] = g_idxB[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); +#endif + + // Initial SLM setup + { + l_workGroupTileA_int4[l_tid] = g_matrixA[g_idxA[0]]; + l_workGroupTileB_int4[l_tid] = g_matrixB[g_idxB[0]]; + l_workGroupTileA_int4[l_tid + l_groupSize] = g_matrixA[g_idxA[1]]; + l_workGroupTileB_int4[l_tid + l_groupSize] = g_matrixB[g_idxB[1]]; + +#ifdef TILED_GLOBAL_LAYOUT + g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); + g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); + g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); + g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); +#else + g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); + g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); + g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); + g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); +#endif + + barrier(CLK_LOCAL_MEM_FENCE); + } + + int4 hdcReadValueA[2]; + int4 hdcReadValueB[2]; + + __attribute__((opencl_unroll_hint(1))) + for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++) + { + /* + * SLM setup - HDC read only + */ + // Overlap HDC reads with mmad compute + hdcReadValueA[0] = g_matrixA[g_idxA[0]]; + hdcReadValueB[0] = g_matrixB[g_idxB[0]]; + hdcReadValueA[1] = g_matrixA[g_idxA[1]]; + hdcReadValueB[1] = g_matrixB[g_idxB[1]]; + +#ifdef TILED_GLOBAL_LAYOUT + g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); + g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); + g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); + g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); +#else + g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); + g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); + g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); + g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); +#endif + + /* + * mmad compute + */ + FUNC_CALL(mmad_32x32_int8)(&l_workGroupTileA_uint[(k % 2) * l_pingPongOffsetA_uint], + l_offsetTileA, &l_workGroupTileB[(k % 2) * l_pingPongOffsetB_int8], + l_offsetTileB_col0, 
l_offsetTileB_col1, l_offsetTileB_col2, + l_offsetTileB_col3, rowA, colB, regC); + + /* + * SLM setup - SLM write only + */ + l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid] = hdcReadValueA[0]; + l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid] = hdcReadValueB[0]; + l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid + l_groupSize] = hdcReadValueA[1]; + l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid + l_groupSize] = hdcReadValueB[1]; + + barrier(CLK_LOCAL_MEM_FENCE); + } // main outer loop + + /* + * Last mmad compute iteration (avoids branching in main loop) + */ + + FUNC_CALL(mmad_32x32_int8)( + &l_workGroupTileA_uint[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetA_uint], + l_offsetTileA, + &l_workGroupTileB[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetB_int8], + l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, l_offsetTileB_col3, rowA, colB, + regC); + +#ifdef OUTPUT_TILED_GLOBAL_LAYOUT + // Write out in swizzled manner after quantizing + __global uchar* g_outC_uchar = (__global uchar*)g_outC; + uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) + + sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar)); + + uchar16 regC_uchar16; + uint offset_uc16 = 0; + + const uint workgroup_id_x = get_group_id(0); + uint feature_off = 32*(sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N*workgroup_id_x; //=32*{0,1,2,3} + WG_TILE_N * workgroup_id_x + uint feature = get_sub_group_local_id()*4 + feature_off; + + float4 quant_f = vload4(0, quantizations + feature); + float4 bias_f = vload4(0, biases + feature); + float4 calib_f = vload4(0, calibrations + feature); + + // eltwise calibs + float4 eltw_calib_f = vload4(0, eltw_calibrations + feature); + + uchar16 eltw[(2*SG_TILE_M) / (sizeof(int8) / sizeof(int))]; + uint tmpcOff = cOffset; + __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) + for (uint i = 0; i < (2*SG_TILE_M) / (sizeof(int8) / sizeof(int)); i++) + { + uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(tmpcOff); +#if IN_OUT_OPT == 1 + eltw[i] = as_uchar16(intel_sub_group_block_read4((__global uint*)(g_outC_uchar + padded_offset))); +#else + const uint eltw_second_input_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset_account_padding)(tmpcOff, ELTW_STRIDE_X, ELTW_STRIDE_Y); + eltw[i] = as_uchar16(intel_sub_group_block_read4((__global uint*)(input2 + eltw_second_input_offset))); +#endif + tmpcOff += sizeof(uchar16) * SG_SIZE; + } + +#if MMAD_SUPPORTED == 1 + __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) +#endif + for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++) + { + uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); + { + uchar16 eltw_input_vals = eltw[i * 2]; + // B0..3, F0..31 + QUANTIZATION(0); + } + + intel_sub_group_block_write4((__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16)); + cOffset += sizeof(uchar16) * SG_SIZE; + + // now we need to calculate again for other x + padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); + { + uchar16 eltw_input_vals = eltw[i * 2 + 1]; + // B0..3, F0..31 + QUANTIZATION(4); + } + + intel_sub_group_block_write4( (__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16) ); + cOffset += sizeof(uchar16) * SG_SIZE; + } +#else + // Write final accumulated values + uint cOffset = sg_global_idX * ((MATRIX_M / 
8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) + + sg_tid * (MATRIX_M / 8); + __attribute__((opencl_unroll_hint(SIMD_LANE_N))) + for (uint i = 0; i < (SIMD_LANE_N); ++i) + { + __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8))) + for (uint j = 0; j < (SIMD_LANE_M / 8); ++j) + { + g_matrixC[cOffset + j] = regC[i*(SIMD_LANE_M / 8) + j]; + } + cOffset += SG_SIZE * (MATRIX_M / 8); + } +#endif +} + +#undef SUM_SCALE +#undef SCALE +#undef QUANTIZATION \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_224x128wg_slm_int8.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_224x128wg_slm_int8.cl new file mode 100644 index 0000000..45148c1 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_224x128wg_slm_int8.cl @@ -0,0 +1,505 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include/mmad.cl" + +#define SUM_SCALE 0.11f +#define SCALE 0.11f + +#ifdef LIGHTWEIGHT_QUANTIZATION + +#define QUANTIZATION(idx) \ + {\ + float4 tmp;\ + for(uint z = 0; z < 4; z++)\ + {\ + tmp.s0 = (float)eltw_input_vals[z * 4 + 0] * SUM_SCALE + bias_f.s0;\ + tmp.s1 = (float)eltw_input_vals[z * 4 + 1] * SUM_SCALE + bias_f.s1;\ + tmp.s2 = (float)eltw_input_vals[z * 4 + 2] * SUM_SCALE + bias_f.s2;\ + tmp.s3 = (float)eltw_input_vals[z * 4 + 3] * SUM_SCALE + bias_f.s3;\ + \ + regC_uchar16[z * 4 + 0] = convert_uchar_sat( (regC[0 * 4 + i][idx + z / 4]) * SCALE + tmp.s0);\ + regC_uchar16[z * 4 + 1] = convert_uchar_sat( (regC[1 * 4 + i][idx + z / 4]) * SCALE + tmp.s1);\ + regC_uchar16[z * 4 + 2] = convert_uchar_sat( (regC[2 * 4 + i][idx + z / 4]) * SCALE + tmp.s2);\ + regC_uchar16[z * 4 + 3] = convert_uchar_sat( (regC[3 * 4 + i][idx + z / 4]) * SCALE + tmp.s3);\ + }\ + } + +#elif NO_QUANTIZATION + +#define QUANTIZATION(idx) \ + regC_uchar16.s0 = regC[0 * 4 + i][idx];\ + regC_uchar16.s1 = regC[1 * 4 + i][idx];\ + regC_uchar16.s2 = regC[2 * 4 + i][idx];\ + regC_uchar16.s3 = regC[3 * 4 + i][idx];\ + \ + regC_uchar16.s4 = regC[0 * 4 + i][idx+1];\ + regC_uchar16.s5 = regC[1 * 4 + i][idx+1];\ + regC_uchar16.s6 = regC[2 * 4 + i][idx+1];\ + regC_uchar16.s7 = regC[3 * 4 + i][idx+1];\ + \ + regC_uchar16.s8 = regC[0 * 4 + i][idx+2];\ + regC_uchar16.s9 = regC[1 * 4 + i][idx+2];\ + regC_uchar16.sa = regC[2 * 4 + i][idx+2];\ + regC_uchar16.sb = regC[3 * 4 + i][idx+2];\ + \ + regC_uchar16.sc = regC[0 * 4 + i][idx+3];\ + regC_uchar16.sd = regC[1 * 4 + i][idx+3];\ + regC_uchar16.se = regC[2 * 4 + i][idx+3];\ + regC_uchar16.sf = regC[3 * 4 + i][idx+3];\ + {\ + int16 sum;\ + for(uint s = 0; s <16; s++)\ + {\ + sum[s] = (int)as_char(regC_uchar16[s]) + (int)as_char(eltw_input_vals[s]);\ + }\ + regC_uchar16.s0 = convert_uchar_sat( sum.s0 );\ + regC_uchar16.s1 = convert_uchar_sat( sum.s1 );\ + regC_uchar16.s2 = convert_uchar_sat( sum.s2 );\ + regC_uchar16.s3 = convert_uchar_sat( 
sum.s3 );\ + \ + regC_uchar16.s4 = convert_uchar_sat( sum.s4 );\ + regC_uchar16.s5 = convert_uchar_sat( sum.s5 );\ + regC_uchar16.s6 = convert_uchar_sat( sum.s6 );\ + regC_uchar16.s7 = convert_uchar_sat( sum.s7 );\ + \ + regC_uchar16.s8 = convert_uchar_sat( sum.s8 );\ + regC_uchar16.s9 = convert_uchar_sat( sum.s9 );\ + regC_uchar16.sa = convert_uchar_sat( sum.sa );\ + regC_uchar16.sb = convert_uchar_sat( sum.sb );\ + \ + regC_uchar16.sc = convert_uchar_sat( sum.sc );\ + regC_uchar16.sd = convert_uchar_sat( sum.sd );\ + regC_uchar16.se = convert_uchar_sat( sum.se );\ + regC_uchar16.sf = convert_uchar_sat( sum.sf );\ + } + +#else + +#define QUANTIZATION(idx) \ + regC_uchar16.s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\ + regC_uchar16.s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\ + regC_uchar16.s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\ + regC_uchar16.s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));\ + \ + regC_uchar16.s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+1]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\ + regC_uchar16.s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+1]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\ + regC_uchar16.s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+1]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\ + regC_uchar16.s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+1]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));\ + \ + regC_uchar16.s8 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+2]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\ + regC_uchar16.s9 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+2]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\ + regC_uchar16.sa = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+2]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\ + regC_uchar16.sb = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+2]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));\ + \ + regC_uchar16.sc = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+3]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\ + regC_uchar16.sd = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+3]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\ + regC_uchar16.se = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+3]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\ + regC_uchar16.sf = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+3]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));\ + {\ + int16 sum;\ + for(uint s = 0; s <16; s++)\ + {\ + sum[s] = (int)as_char(regC_uchar16[s]) + (int)as_char(eltw_input_vals[s]);\ + }\ + regC_uchar16.s0 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s0) * eltw_calib_f.s0)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.s1 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( 
(float)(sum.s1) * eltw_calib_f.s1)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.s2 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s2) * eltw_calib_f.s2)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.s3 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s3) * eltw_calib_f.s3)), NL_M_ELTW, NL_N_ELTW));\ + \ + regC_uchar16.s4 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s4) * eltw_calib_f.s0)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.s5 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s5) * eltw_calib_f.s1)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.s6 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s6) * eltw_calib_f.s2)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.s7 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s7) * eltw_calib_f.s3)), NL_M_ELTW, NL_N_ELTW));\ + \ + regC_uchar16.s8 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s8) * eltw_calib_f.s0)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.s9 = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.s9) * eltw_calib_f.s1)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.sa = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.sa) * eltw_calib_f.s2)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.sb = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.sb) * eltw_calib_f.s3)), NL_M_ELTW, NL_N_ELTW));\ + \ + regC_uchar16.sc = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.sc) * eltw_calib_f.s0)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.sd = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.sd) * eltw_calib_f.s1)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.se = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.se) * eltw_calib_f.s2)), NL_M_ELTW, NL_N_ELTW));\ + regC_uchar16.sf = as_uchar(ACTIVATION_ELTW( convert_char((int)round( (float)(sum.sf) * eltw_calib_f.s3)), NL_M_ELTW, NL_N_ELTW));\ + } +#endif + +inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset) +{ +#if OUT_WITH_PADDING == 1 + uint tmp_idx = cOffset; + uint f_val_idx = tmp_idx % 32; + tmp_idx /= 32; + uint b_val_idx = tmp_idx % 4; + tmp_idx /= 4; + uint x_idx = tmp_idx % OUTPUT_SIZE_X; + tmp_idx /= OUTPUT_SIZE_X; + uint y_idx = tmp_idx % OUTPUT_SIZE_Y; + tmp_idx /= OUTPUT_SIZE_Y; + uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); + tmp_idx /= (OUTPUT_BATCH_NUM / 4); + uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); + + uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH; + padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH; + padded_offset += y_idx * OUT_Y_PITCH; + padded_offset += x_idx * OUT_X_PITCH; + padded_offset += b_val_idx * 32; + padded_offset += f_val_idx; + padded_offset += OUT_OFFSET; + + return padded_offset; +#else + return cOffset; +#endif +} + +#if IN_OUT_OPT != 1 +inline uint FUNC(calculate_eltw_input_offset_based_on_output_offset_account_padding)(uint cOffset, uint strideX, uint strideY) +{ +#if ELTW_WITH_PADDING == 1 || ELTW_STRIDE_X != 1 || ELTW_STRIDE_Y != 1 + uint tmp_idx = cOffset; + uint f_val_idx = tmp_idx % 32; + tmp_idx /= 32; + uint b_val_idx = tmp_idx % 4; + tmp_idx /= 4; + uint x_idx = tmp_idx % OUTPUT_SIZE_X; + x_idx *= strideX; + tmp_idx /= OUTPUT_SIZE_X; + uint y_idx = tmp_idx % OUTPUT_SIZE_Y; + y_idx *= strideY; + tmp_idx /= OUTPUT_SIZE_Y; + uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); + tmp_idx /= (OUTPUT_BATCH_NUM / 4); + uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); + + uint padded_offset = f_slice_idx * IN2_F_BLOCK_PITCH; + 
padded_offset += b_slice_idx * IN2_B_BLOCK_PITCH;
+    padded_offset += y_idx * IN2_Y_PITCH;
+    padded_offset += x_idx * IN2_X_PITCH;
+    padded_offset += b_val_idx * 32;
+    padded_offset += f_val_idx;
+    padded_offset += IN2_OFFSET;
+
+    return padded_offset;
+#else
+    return cOffset;
+#endif
+}
+#endif
+
+inline void FUNC(mmad_32x32_int8)( __local uint* l_tileA, const uint l_offsetTileA,
+                                   __local int8* l_tileB, const uint l_offsetTileB_col0,
+                                   const uint l_offsetTileB_col1, const uint l_offsetTileB_col2,
+                                   const uint l_offsetTileB_col3, int8* rowA, int8* colB,
+                                   int8* regC)
+{
+    // Read tile A from SLM to regA
+    uint l_offsetTileATemp = l_offsetTileA;
+    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
+    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
+    {
+        rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp]));
+        l_offsetTileATemp += 8 * SG_SIZE;
+    }
+    // Read tile B from SLM to regB and compute mmad
+    colB[0] = l_tileB[l_offsetTileB_col0];
+    colB[1] = l_tileB[l_offsetTileB_col1];
+    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
+    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
+    {
+        // Compute partial C
+        regC[0*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[0], regC[0*(SIMD_LANE_M / 8) + j]);
+    }
+    colB[0] = l_tileB[l_offsetTileB_col2];
+    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
+    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
+    {
+        // Compute partial C
+        regC[1*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[1], regC[1*(SIMD_LANE_M / 8) + j] );
+    }
+    colB[1] = l_tileB[l_offsetTileB_col3];
+    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
+    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
+    {
+        // Compute partial C
+        regC[2*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2*(SIMD_LANE_M / 8) + j]);
+    }
+    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
+    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
+    {
+        // Compute partial C
+        regC[3*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3*(SIMD_LANE_M / 8) + j]);
+    }
+}
+
+/*
+ * \brief GEMM kernel to compute an MxN matrix using SLM
+ * \param g_inA  - Input matrix
+ * \param g_inB  - Input matrix
+ * \param g_outC - Output matrix
+ */
+
+__attribute__((intel_reqd_sub_group_size(SG_SIZE)))
+KERNEL(Kernel_GEMM_MMAD8_32x32SG_224x128WG_SLM_INT8_fused_eltwise)
+    (__global char* const g_inA,
+     __global int* g_outC,
+     __global char* const g_inB,
+     #if BIAS_TERM
+     __global BIAS_TYPE* biases,
+     #endif
+     __global float* quantizations,
+     #if CALIBRATION_TERM
+     __global float* calibrations,
+     #endif
+     uint split_idx,
+     __global char* const input2,
+     __global float* eltw_calibrations
+    )
+{
+
+    __global int4* const g_matrixA = (__global int4*)g_inA;
+    __global int4* const g_matrixB = (__global int4*)g_inB;
+    __global int8* g_matrixC = (__global int8*)g_outC;
+
+    // Each work-group works to compute a 224x128 tile.
+    // Each work-group contains 28 sub-groups.
+    // Each sub-group within the work-group works to compute a 32x32 tile.
+    // 1) All work-items in WG fill SLM with tileA (224x32) and tileB (32x128).
+    // 2) Each sub-group works to compute a 32x32 tileC (stored in regC).
+    // Note that each work-item in the sub-group computes a 32x4 chunk of tileC.
+ // 3) Repeat until tileC is fully computed (while moving tileA and tileB "windows") + __local int8 l_workGroupTileA[2 * (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)]; + __local int8 l_workGroupTileB[2 * (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)]; + + __local uint* l_workGroupTileA_uint = (__local uint*)l_workGroupTileA; + __local int4* l_workGroupTileA_int4 = (__local int4*)l_workGroupTileA; + __local int4* l_workGroupTileB_int4 = (__local int4*)l_workGroupTileB; + + const uint l_groupSize = get_local_size(DIM_X) * get_local_size(DIM_Y); + + const uint l_pingPongOffsetA_uint = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(uint); + const uint l_pingPongOffsetB_int8 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8); + const uint l_pingPongOffsetA_int4 = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int4); + const uint l_pingPongOffsetB_int4 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int4); + + // Thread IDs + const uint g_tidY = get_global_id(DIM_Y); + const uint g_tidX = get_global_id(DIM_X); + const uint l_tidX = get_local_id(DIM_X); + const uint l_tidY = get_local_id(DIM_Y); + const uint l_tid = l_tidY * get_local_size(DIM_X) + l_tidX; + + // SubGroup IDs + const uint sg_tid = get_sub_group_local_id(); + const uint sg_global_idX = (uint)(g_tidX / SG_SIZE); + const uint sg_global_idY = g_tidY; + const uint sg_local_idX = (uint)(l_tidX / SG_SIZE); + const uint sg_local_idY = l_tidY; + const uint sg_local_id = sg_local_idY * get_local_size(DIM_X) / SG_SIZE + sg_local_idX; + + const uint sub_group_id = get_sub_group_id(); + + // Registers + int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // Each work-item responsible for 32x4 ints elts + int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item will hold 1/8 of matrixA + int8 colB[2]; // each lane will store 32x4 piece of matrixB + + // SLM indices + const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * sg_local_idY; + const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8); + const uint numElements32x8TileB = numElements32x32TileB / 4; + const uint l_offsetTileB = numElements32x32TileB * sg_local_idX; + const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid; + const uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid; + const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid; + const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid; + + // Global indices + uint g_idxA[2]; + uint g_idxB[2]; +#ifdef TILED_GLOBAL_LAYOUT // 32-row major (matrixA) and 32-col major (matrixB) + g_idxA[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * get_group_id(DIM_Y) + l_tid; + g_idxB[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * get_group_id(DIM_X) + l_tid; + g_idxA[1] = g_idxA[0] + l_groupSize; + g_idxB[1] = g_idxB[0] + l_groupSize; +#else // Row (matrixA) and Col (matrixB) major layout + g_idxA[0] = WG_TILE_M * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_Y) + + (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); + g_idxB[0] = WG_TILE_N * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_X) + + (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); + g_idxA[1] = g_idxA[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); + g_idxB[1] = g_idxB[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); +#endif + // Initial SLM setup + { + l_workGroupTileA_int4[l_tid] = g_matrixA[g_idxA[0]]; + l_workGroupTileB_int4[l_tid] = g_matrixB[g_idxB[0]]; + + l_workGroupTileA_int4[l_tid + l_groupSize] = g_matrixA[g_idxA[1]]; + if (l_tid < 32) + { + 
// Not all work-items will be needed to fetch the remaining matrix B + l_workGroupTileB_int4[l_tid + l_groupSize] = g_matrixB[g_idxB[1]]; + } +#ifdef TILED_GLOBAL_LAYOUT + g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); + g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); + g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); + g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); +#else + g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); + g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); + g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); + g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); +#endif + + barrier(CLK_LOCAL_MEM_FENCE); + } + int4 hdcReadValueA[2]; + int4 hdcReadValueB[2]; + + __attribute__((opencl_unroll_hint(1))) + for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++) + { + hdcReadValueA[0] = g_matrixA[g_idxA[0]]; + hdcReadValueB[0] = g_matrixB[g_idxB[0]]; + hdcReadValueA[1] = g_matrixA[g_idxA[1]]; + if (l_tid < 32) + { + // Not all work-items will be needed to fetch the remaining matrix B + hdcReadValueB[1] = g_matrixB[g_idxB[1]]; + } +#ifdef TILED_GLOBAL_LAYOUT + g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); + g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); + g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); + g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); +#else + g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); + g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); + g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); + g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); +#endif + + + //MMAD compute + FUNC_CALL(mmad_32x32_int8)(&l_workGroupTileA_uint[(k % 2) * l_pingPongOffsetA_uint], + l_offsetTileA, &l_workGroupTileB[(k % 2) * l_pingPongOffsetB_int8], + l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, + l_offsetTileB_col3, rowA, colB, regC); + + //SLM setup - SLM write only + l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid] = hdcReadValueA[0]; + l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid] = hdcReadValueB[0]; + l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid + l_groupSize] = hdcReadValueA[1]; + if (l_tid < 32) + { + // Not all work-items will be needed to fetch the remaining matrix B + l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid + l_groupSize] = hdcReadValueB[1]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } // main outer loop + + //Last MMAD compute iteration (avoids branching in main loop) + FUNC_CALL(mmad_32x32_int8)( + &l_workGroupTileA_uint[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetA_uint], + l_offsetTileA, + &l_workGroupTileB[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetB_int8], + l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, l_offsetTileB_col3, rowA, colB, + regC); + + +#ifdef OUTPUT_TILED_GLOBAL_LAYOUT + + // Write out in swizzled manner after quantizing + __global uchar* g_outC_uchar = (__global uchar*)g_outC; + uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) + + sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar)); + + uchar16 regC_uchar16; + uint offset_uc16 = 0; + + const uint workgroup_id_x = get_group_id(0); + uint feature_off = 32*(sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N*workgroup_id_x; //=32*{0,1,2,3} + WG_TILE_N * workgroup_id_x + uint feature = get_sub_group_local_id()*4 + feature_off; + + float4 quant_f = vload4(0, quantizations + feature); + float4 bias_f = vload4(0, biases + feature); + float4 calib_f = vload4(0, calibrations + feature); + + // eltwise calibs + float4 eltw_calib_f = 
vload4(0, eltw_calibrations + feature); + + uchar16 eltw[(2*SG_TILE_M) / (sizeof(int8) / sizeof(int))]; + uint tmpcOff = cOffset; + __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) + for (uint i = 0; i < (2*SG_TILE_M) / (sizeof(int8) / sizeof(int)); i++) + { + uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(tmpcOff); +#if IN_OUT_OPT == 1 + eltw[i] = as_uchar16(intel_sub_group_block_read4((__global uint*)(g_outC_uchar + padded_offset))); +#else + const uint eltw_second_input_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset_account_padding)(tmpcOff, ELTW_STRIDE_X, ELTW_STRIDE_Y); + eltw[i] = as_uchar16(intel_sub_group_block_read4((__global uint*)(input2 + eltw_second_input_offset))); +#endif + tmpcOff += sizeof(uchar16) * SG_SIZE; + } + +#if MMAD_SUPPORTED == 1 + __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) +#endif + for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++) + { + uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); + { + uchar16 eltw_input_vals = eltw[i * 2]; + // B0..3, F0..31 + QUANTIZATION(0); + } + + intel_sub_group_block_write4((__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16)); + cOffset += sizeof(uchar16) * SG_SIZE; + + // now we need to calculate again for other x + padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); + { + uchar16 eltw_input_vals = eltw[i * 2 + 1]; + // B0..3, F0..31 + QUANTIZATION(4); + } + + intel_sub_group_block_write4( (__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16) ); + cOffset += sizeof(uchar16) * SG_SIZE; + } +#else + // Write final accumulated values + uint cOffset = sg_global_idX * ((MATRIX_M / 8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) + + sg_tid * (MATRIX_M / 8); + __attribute__((opencl_unroll_hint(SIMD_LANE_N))) + for (uint i = 0; i < (SIMD_LANE_N); ++i) + { + __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8))) + for (uint j = 0; j < (SIMD_LANE_M / 8); ++j) + { + g_matrixC[cOffset + j] = regC[i*(SIMD_LANE_M / 8) + j]; + } + cOffset += SG_SIZE * (MATRIX_M / 8); + } +#endif +} + +#undef SUM_SCALE +#undef SCALE +#undef QUANTIZATION \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_yxfb_yxio_b16_fp16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_yxfb_yxio_b16_fp16.cl new file mode 100644 index 0000000..241200f --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_yxfb_yxio_b16_fp16.cl @@ -0,0 +1,256 @@ +// Copyright (c) 2016-2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
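+// For reference: per output element, the fused operation implemented by this
+// kernel reduces to the following sketch (illustrative pseudo-code only; the
+// real kernel vectorizes over 16 output features and BATCHES_PER_WORK_ITEM
+// batches, and 'second_input' stands for input2, or for output itself when
+// IN_OUT_OPT == 1):
+//
+//   half acc = 0;
+//   for each (ifm, ky, kx): acc = fma(input[...], filter[...], acc);
+//   acc = ACTIVATION(acc + bias[...], NL_M, NL_N);        // convolution part
+//   acc += second_input[...];                             // fused eltwise add
+//   output[...] = ACTIVATION_ELTW(acc, NL_M_ELTW, NL_N_ELTW);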
+ + +#include "include/include_all.cl" +#include "include/sub_group.cl" + +__attribute__((intel_reqd_sub_group_size(16))) +__attribute__((reqd_work_group_size(16, 1, 1))) +KERNEL(fused_conv_eltwise_gpu_yxfb_yxio_b16)( + const __global UNIT_TYPE* input, + __global UNIT_TYPE* output, + const __global UNIT_TYPE* filter, +#if BIAS_TERM + const __global UNIT_TYPE* bias, +#endif + uint split_idx, + const __global UNIT_TYPE* input2) +{ + // get_global_size(0) -> Number of work items needed to compute all features and all batches for single output spatial position + // (single (x, y) point in output). + // get_global_size(1) -> Output size in X-dimension. + // get_global_size(2) -> Output size in Y-dimension. + // get_global_id(0) -> Id of work item computing single spatial point of output indicated by get_global_id(1), get_global_id(2). + // get_group_id(1) -> Current x-position in output. + // get_group_id(2) -> Current y-position in output. + // + // WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS -> Number of work items needed to compute entire one batch for at least one feature and one spatial point. + // (this number in current implementation computes also OFM_PER_WORK_ITEM output features at the same time). + // FILTER_ARRAY_NUM -> Number of filters groups (split size). + + const uint out_x = get_group_id(1); + const uint out_y = get_group_id(2); + + const uint output_f_size = OUTPUT_PAD_BEFORE_FEATURE_NUM + OUTPUT_FEATURE_NUM + OUTPUT_PAD_AFTER_FEATURE_NUM; + const uint output_x_size = OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X; + const uint linear_id_xy = OUTPUT_PAD_BEFORE_SIZE_X + out_x + output_x_size * (out_y + OUTPUT_PAD_BEFORE_SIZE_Y); + uint global_id = (((uint)get_global_id(0) / WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS) + (linear_id_xy * FILTER_ARRAY_NUM + split_idx) * (output_f_size / OFM_PER_WORK_ITEM)) * WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS; + + const uint sub_group_id = get_local_id(0); + +#if defined(USE_BLOCK_READ_2) || defined(USE_BLOCK_READ_1) + const uint chunk_size = sizeof(uint)/sizeof(UNIT_TYPE); +#else + const uint chunk_size = 1; +#endif + + const uint out_batch_id = chunk_size * sub_group_id + LOCAL_WORK_GROUP_SIZE * BATCHES_PER_WORK_ITEM * ((uint)get_group_id(0) % LOCAL_WORK_GROUPS_PER_SINGLE_BATCHES_ELEMENTS); + + const uint out_id = (global_id / WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS) * OFM_PER_WORK_ITEM * OUTPUT_FEATURE_PITCH + OUTPUT_PAD_BEFORE_FEATURE_NUM * OUTPUT_FEATURE_PITCH + OUTPUT_PAD_BEFORE_BATCH_NUM + out_batch_id; + + const uint ofm_offset = ((global_id * OFM_PER_WORK_ITEM) / WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS) % output_f_size; + +#if IN_OUT_OPT != 1 // calculating eltwise offset + const uint eltw_x = out_x * ELTW_STRIDE_X; + const uint eltw_y = out_y * ELTW_STRIDE_Y; + + const uint eltw_f_size = INPUT1_PAD_BEFORE_FEATURE_NUM + INPUT1_FEATURE_NUM + INPUT1_PAD_AFTER_FEATURE_NUM; + const uint eltw_x_size = INPUT1_PAD_BEFORE_SIZE_X + INPUT1_SIZE_X + INPUT1_PAD_AFTER_SIZE_X; + + const uint eltw_linear_id_xy = INPUT1_PAD_BEFORE_SIZE_X + eltw_x + eltw_x_size * (eltw_y + INPUT1_PAD_BEFORE_SIZE_Y); + + uint eltw_global_id = (((uint)get_global_id(0) / WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS) + (eltw_linear_id_xy * FILTER_ARRAY_NUM + split_idx) * (eltw_f_size / OFM_PER_WORK_ITEM)) * WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS; + const uint eltw_id = (eltw_global_id / WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS) * OFM_PER_WORK_ITEM * INPUT1_FEATURE_PITCH + INPUT1_PAD_BEFORE_FEATURE_NUM * INPUT1_FEATURE_PITCH + INPUT1_PAD_BEFORE_BATCH_NUM + out_batch_id; 
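+    // Note: eltw_id mirrors the out_id computation above, but maps the output
+    // (x, y) position onto the second eltwise input: the coordinates are
+    // scaled by ELTW_STRIDE_X/ELTW_STRIDE_Y, and all sizes, pads and pitches
+    // are taken from the INPUT1_* layout instead of the OUTPUT_* one.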
+#endif + + // Each component of vector element contains computation for separate output feature. + half16 _data[BATCHES_PER_WORK_ITEM]; + for(uint i = 0; i < BATCHES_PER_WORK_ITEM; i++) + { + _data[i] = UNIT_VAL_ZERO; + } + + const int x = (int)out_x * STRIDE_SIZE_X - PADDING_SIZE_X; + const int y = (int)out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y; + + for (uint i = 0; i < FILTER_SIZE_Y; i++) + { + const int input_offset_y = y + i * DILATION_SIZE_Y; + const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0; + + if(!zero_y) + { + for (uint j = 0; j < FILTER_SIZE_X; j++) + { + const int input_offset_x = x + j * DILATION_SIZE_X; + const bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0; + + if(!zero) + { + uint input_idx = input_offset_x*INPUT0_X_PITCH + input_offset_y*INPUT0_Y_PITCH; + input_idx += INPUT0_OFFSET + split_idx * FILTER_IFM_NUM * INPUT0_FEATURE_PITCH; + input_idx += out_batch_id; + + //sub_group_id used as offset to make each workitem load different filter, and then shuffle it + // 2 * sub_group_id is used because we group 2 halfs as one uint element. + uint filter_idx = ofm_offset + 2*sub_group_id + i*FILTER_Y_PITCH + j*FILTER_X_PITCH; + + for (uint h = 0; h < FILTER_IFM_NUM; h++) + { +#if defined(USE_BLOCK_READ_2) + half4 _input = as_half4(intel_sub_group_block_read2((const __global uint*)(input + input_idx))); + uint filter_val_pair = *(const __global uint*)(filter + filter_idx); + half16 filter_transp = TRANSPOSE_BLOCK_16_FP16(filter_val_pair); + _data[0] = fma(_input.s0, filter_transp, _data[0]); + _data[1] = fma(_input.s1, filter_transp, _data[1]); + _data[2] = fma(_input.s2, filter_transp, _data[2]); + _data[3] = fma(_input.s3, filter_transp, _data[3]); + input_idx += INPUT0_FEATURE_PITCH; +#elif defined(USE_BLOCK_READ_1) + half2 _input = as_half2(intel_sub_group_block_read((const __global uint*)(input + input_idx))); + uint filter_val_pair = *(const __global uint*)(filter + filter_idx); + half16 filter_transp = TRANSPOSE_BLOCK_16_FP16(filter_val_pair); + _data[0] = fma(_input.s0, filter_transp, _data[0]); + _data[1] = fma(_input.s1, filter_transp, _data[1]); + input_idx += INPUT0_FEATURE_PITCH; +#else + uint filter_val_pair = *(const __global uint*)(filter + filter_idx); + half16 filter_transp = TRANSPOSE_BLOCK_16_FP16(filter_val_pair); + for(uint s = 0; s < BATCHES_PER_WORK_ITEM; s++) + { + _data[s] = fma(input[input_idx], filter_transp, _data[s]); + input_idx += LOCAL_WORK_GROUP_SIZE; + } + input_idx += INPUT0_FEATURE_PITCH - BATCHES_PER_WORK_ITEM * LOCAL_WORK_GROUP_SIZE; +#endif + filter_idx += FILTER_IFM_PITCH; + } + } + } + } + } + +#if BIAS_TERM + uint bias_val_pair = *(const __global uint*)(bias + (ofm_offset + 2 * sub_group_id)); + for(uint s = 0; s < BATCHES_PER_WORK_ITEM; s++) + { + ADD_BIAS_16_FP16(_data[s], bias_val_pair); + } +#endif + for(uint s = 0; s < BATCHES_PER_WORK_ITEM; s++) + { + _data[s] = ACTIVATION(_data[s], NL_M, NL_N); + } + +#if defined(USE_BLOCK_READ_2) || defined(USE_BLOCK_READ_1) + #if BATCHES_PER_WORK_ITEM == 4 + uint _out_id = OUTPUT_VIEW_OFFSET + out_id; + for(uint i = 0; i < 16; i++) + { +#if IN_OUT_OPT == 1 + half2 eltw_second_input_data0 = as_half2(*(__global uint*)(output + _out_id )); + half2 eltw_second_input_data1 = as_half2(*(__global uint*)(output + _out_id + 32)); +#else + uint _eltw_id = INPUT1_VIEW_OFFSET + eltw_id; + half2 eltw_second_input_data0 = as_half2(*(__global uint*)(input2 + _eltw_id + i * INPUT1_FEATURE_PITCH)); + half2 eltw_second_input_data1 = as_half2(*(__global uint*)(input2 + 
_eltw_id + i * INPUT1_FEATURE_PITCH + 32)); +#endif + _data[0][i] += eltw_second_input_data0.s0; + _data[1][i] += eltw_second_input_data0.s1; + _data[2][i] += eltw_second_input_data1.s0; + _data[3][i] += eltw_second_input_data1.s1; + + _data[0][i] = ACTIVATION_ELTW(_data[0][i], NL_M_ELTW, NL_N_ELTW); + _data[1][i] = ACTIVATION_ELTW(_data[1][i], NL_M_ELTW, NL_N_ELTW); + _data[2][i] = ACTIVATION_ELTW(_data[2][i], NL_M_ELTW, NL_N_ELTW); + _data[3][i] = ACTIVATION_ELTW(_data[3][i], NL_M_ELTW, NL_N_ELTW); + + *(__global uint*)(output + _out_id) = as_uint((half2)(_data[0][i], _data[1][i])); + *(__global uint*)(output + _out_id + 32) = as_uint((half2)(_data[2][i], _data[3][i])); + _out_id += OUTPUT_FEATURE_PITCH; + } + #else + for(uint s = 0; s < BATCHES_PER_WORK_ITEM / 2; s++) + { + uint _out_id = OUTPUT_VIEW_OFFSET + out_id + chunk_size * s * LOCAL_WORK_GROUP_SIZE; + + for(uint i = 0; i < 16; i++) + { +#if IN_OUT_OPT == 1 + half2 eltw_second_input_data = as_half2(*(__global uint*)(output + _out_id + i * OUTPUT_FEATURE_PITCH)); +#else + uint _eltw_id = INPUT1_VIEW_OFFSET + eltw_id + chunk_size * s * LOCAL_WORK_GROUP_SIZE; + half2 eltw_second_input_data = as_half2(*(__global uint*)(input2 + _eltw_id + i * INPUT1_FEATURE_PITCH)); +#endif + _data[chunk_size * s][i] += eltw_second_input_data.s0; + _data[chunk_size * s + 1][i] += eltw_second_input_data.s1; + _data[chunk_size * s][i] = ACTIVATION_ELTW(_data[chunk_size * s][i], NL_M_ELTW, NL_N_ELTW); + _data[chunk_size * s + 1][i] = ACTIVATION_ELTW(_data[chunk_size * s + 1][i], NL_M_ELTW, NL_N_ELTW); + } + + *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s0, _data[chunk_size * s + 1].s0)); _out_id += OUTPUT_FEATURE_PITCH; + *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s1, _data[chunk_size * s + 1].s1)); _out_id += OUTPUT_FEATURE_PITCH; + *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s2, _data[chunk_size * s + 1].s2)); _out_id += OUTPUT_FEATURE_PITCH; + *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s3, _data[chunk_size * s + 1].s3)); _out_id += OUTPUT_FEATURE_PITCH; + *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s4, _data[chunk_size * s + 1].s4)); _out_id += OUTPUT_FEATURE_PITCH; + *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s5, _data[chunk_size * s + 1].s5)); _out_id += OUTPUT_FEATURE_PITCH; + *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s6, _data[chunk_size * s + 1].s6)); _out_id += OUTPUT_FEATURE_PITCH; + *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s7, _data[chunk_size * s + 1].s7)); _out_id += OUTPUT_FEATURE_PITCH; + *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s8, _data[chunk_size * s + 1].s8)); _out_id += OUTPUT_FEATURE_PITCH; + *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s9, _data[chunk_size * s + 1].s9)); _out_id += OUTPUT_FEATURE_PITCH; + *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].sa, _data[chunk_size * s + 1].sa)); _out_id += OUTPUT_FEATURE_PITCH; + *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].sb, _data[chunk_size * s + 1].sb)); _out_id += OUTPUT_FEATURE_PITCH; + *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].sc, _data[chunk_size * s + 1].sc)); _out_id += OUTPUT_FEATURE_PITCH; + *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size 
* s].sd, _data[chunk_size * s + 1].sd)); _out_id += OUTPUT_FEATURE_PITCH; + *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].se, _data[chunk_size * s + 1].se)); _out_id += OUTPUT_FEATURE_PITCH; + *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].sf, _data[chunk_size * s + 1].sf)); _out_id += OUTPUT_FEATURE_PITCH; + } + #endif +#else + for(uint s = 0; s < BATCHES_PER_WORK_ITEM; s++) + { + uint _out_id = OUTPUT_VIEW_OFFSET + out_id + s * LOCAL_WORK_GROUP_SIZE; + + for(uint i = 0; i < 16; i++) + { +#if IN_OUT_OPT == 1 + half eltw_second_input_data = output[_out_id + i * OUTPUT_FEATURE_PITCH]; +#else + uint _eltw_id = INPUT1_VIEW_OFFSET + eltw_id + s * LOCAL_WORK_GROUP_SIZE; + half eltw_second_input_data = output[_eltw_id + i * INPUT1_FEATURE_PITCH]; +#endif + _data[s][i] += eltw_second_input_data; + _data[s][i] = ACTIVATION_ELTW(_data[s][i], NL_M_ELTW, NL_N_ELTW); + } + + output[_out_id] = _data[s].s0; _out_id += OUTPUT_FEATURE_PITCH; + output[_out_id] = _data[s].s1; _out_id += OUTPUT_FEATURE_PITCH; + output[_out_id] = _data[s].s2; _out_id += OUTPUT_FEATURE_PITCH; + output[_out_id] = _data[s].s3; _out_id += OUTPUT_FEATURE_PITCH; + output[_out_id] = _data[s].s4; _out_id += OUTPUT_FEATURE_PITCH; + output[_out_id] = _data[s].s5; _out_id += OUTPUT_FEATURE_PITCH; + output[_out_id] = _data[s].s6; _out_id += OUTPUT_FEATURE_PITCH; + output[_out_id] = _data[s].s7; _out_id += OUTPUT_FEATURE_PITCH; + output[_out_id] = _data[s].s8; _out_id += OUTPUT_FEATURE_PITCH; + output[_out_id] = _data[s].s9; _out_id += OUTPUT_FEATURE_PITCH; + output[_out_id] = _data[s].sa; _out_id += OUTPUT_FEATURE_PITCH; + output[_out_id] = _data[s].sb; _out_id += OUTPUT_FEATURE_PITCH; + output[_out_id] = _data[s].sc; _out_id += OUTPUT_FEATURE_PITCH; + output[_out_id] = _data[s].sd; _out_id += OUTPUT_FEATURE_PITCH; + output[_out_id] = _data[s].se; _out_id += OUTPUT_FEATURE_PITCH; + output[_out_id] = _data[s].sf; _out_id += OUTPUT_FEATURE_PITCH; + } +#endif +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gather_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gather_ref.cl new file mode 100644 index 0000000..ee2adda --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gather_ref.cl @@ -0,0 +1,33 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
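For orientation before the kernel body: gather_ref copies SLICE_SIZE-element slices out of `dictionary` at the positions named by `indices`, one slice per work item. A minimal scalar C sketch of the same semantics, with the kernel's JIT defines (COMPUTATIONAL_OPERATIONS_NUMBER, INPUT1_LENGTH, PART_SIZE, SLICE_SIZE) passed as ordinary parameters; names are illustrative only:

/* Scalar reference for the gather semantics of the kernel below. */
void gather_ref_scalar(const float* dictionary, const float* indices, float* output,
                       unsigned num_ops, unsigned input1_length,
                       unsigned part_size, unsigned slice_size)
{
    for (unsigned wi = 0; wi < num_ops; ++wi)
    {
        unsigned part  = wi / input1_length;         /* which dictionary slab   */
        unsigned index = wi - part * input1_length;  /* position in 'indices'   */
        unsigned out   = wi * slice_size;
        for (unsigned k = 0; k < slice_size; ++k)
            output[out + k] = dictionary[part * part_size
                                         + (unsigned)indices[index] * slice_size + k];
    }
}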
+ + +#include "include/include_all.cl" + +KERNEL(gather_ref)(const __global UNIT_TYPE* dictionary, const __global float* indices, __global UNIT_TYPE* output) +{ + const uint workItemId = get_global_id(0); + + if (workItemId >= COMPUTATIONAL_OPERATIONS_NUMBER) + return; + + uint partNumber = workItemId / INPUT1_LENGTH; + uint outputIndex = workItemId * SLICE_SIZE; + uint index = workItemId - (partNumber * INPUT1_LENGTH); + + for (int k = 0; k < SLICE_SIZE; ++k) + { + output[outputIndex++] = dictionary[(partNumber * PART_SIZE) + ((uint) indices[index] * SLICE_SIZE) + k]; + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gemm_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gemm_ref.cl index 26656ab..a8a29b1 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gemm_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gemm_ref.cl @@ -50,7 +50,7 @@ for (uint i = 0; i < Y1; ++i) value = fma(input0[in0_idx], input1[in1_idx], value); } #if TRANSPOSE_INPUT1 && TRANSPOSE_INPUT2 - uint out_idx = y * X1 + x + b * X1 * Y2; + uint out_idx = x * Y2 + y + b * X1 * Y2; #elif TRANSPOSE_INPUT1 uint out_idx = x * X2 + y + b * X1 * Y1; #elif TRANSPOSE_INPUT2 diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/generic_eltwise_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/generic_eltwise_ref.cl index 4bc9338..14db17d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/generic_eltwise_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/generic_eltwise_ref.cl @@ -16,13 +16,24 @@ #include "include/include_all.cl" -#if ELTWISE_LAYOUT_BASED || QUANTIZATION_TERM +#ifdef INPUT_STRIDED + +#define GET_INDEX(prefix, num) \ + CAT(CAT(prefix, num), _OFFSET) + \ + ((d1 * CAT(CAT(prefix, num), _STRIDE_X)) % CAT(CAT(prefix, num), _SIZE_X))*CAT(CAT(prefix, num), _X_PITCH) +\ + ((d2 * CAT(CAT(prefix, num), _STRIDE_Y)) % CAT(CAT(prefix, num), _SIZE_Y))*CAT(CAT(prefix, num), _Y_PITCH) +\ + (d3 % CAT(CAT(prefix, num), _FEATURE_NUM))*CAT(CAT(prefix, num), _FEATURE_PITCH) + \ + (d4 % CAT(CAT(prefix, num), _BATCH_NUM ))*CAT(CAT(prefix, num), _BATCH_PITCH) + +#else + +#if ELTWISE_LAYOUT_BASED || QUANTIZATION_TERM || ELTWISE_BROADCAST #define GET_INDEX(prefix, num) \ CAT(CAT(prefix, num), _OFFSET) + \ - (d1 % CAT(CAT(prefix, num), _SIZE_X ))*CAT(CAT(prefix, num), _X_PITCH) + \ - (d2 % CAT(CAT(prefix, num), _SIZE_Y ))*CAT(CAT(prefix, num), _Y_PITCH) + \ - (d3 % CAT(CAT(prefix, num), _FEATURE_NUM))*CAT(CAT(prefix, num), _FEATURE_PITCH) + \ + (d1 % CAT(CAT(prefix, num), _SIZE_X ))*CAT(CAT(prefix, num), _X_PITCH) + \ + (d2 % CAT(CAT(prefix, num), _SIZE_Y ))*CAT(CAT(prefix, num), _Y_PITCH) + \ + (d3 % CAT(CAT(prefix, num), _FEATURE_NUM))*CAT(CAT(prefix, num), _FEATURE_PITCH) + \ (d4 % CAT(CAT(prefix, num), _BATCH_NUM ))*CAT(CAT(prefix, num), _BATCH_PITCH) #elif ELTWISE_NO_PITCH_SAME_DIMS @@ -40,6 +51,9 @@ #endif +#endif + + KERNEL(eltwise)( INPUTS_DECLS __global UNIT_TYPE* output @@ -48,9 +62,9 @@ KERNEL(eltwise)( #endif ) { -#if ELTWISE_LAYOUT_BASED || QUANTIZATION_TERM - const uint d1 = get_global_id(GWS_YX) % INPUT0_SIZE_X; // X - const uint d2 = get_global_id(GWS_YX) / INPUT0_SIZE_X; // Y +#if ELTWISE_LAYOUT_BASED || QUANTIZATION_TERM || ELTWISE_BROADCAST + const uint d1 = get_global_id(GWS_YX) % OUTPUT_SIZE_X; // X + const uint d2 = get_global_id(GWS_YX) / OUTPUT_SIZE_X; // Y const uint d3 = 
get_global_id(GWS_FEATURE); // Feature const uint d4 = get_global_id(GWS_BATCH); // Batch @@ -67,7 +81,7 @@ KERNEL(eltwise)( const uint d2 = get_global_id(1); const uint d3 = get_global_id(2) % OUTPUT_SIZES[2]; const uint d4 = get_global_id(2) / OUTPUT_SIZES[2]; - + uint output_offset = OUTPUT_OFFSET + d1*OUTPUT_PITCHES[0] + d2*OUTPUT_PITCHES[1] + @@ -80,7 +94,7 @@ KERNEL(eltwise)( #else UNIT_TYPE res; #endif - + DO_ELTWISE; #if QUANTIZATION_TERM diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_GT3.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/arg_max_min_common.cl similarity index 73% rename from inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_GT3.cpp rename to inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/arg_max_min_common.cl index 4bfd3d1..52531ee 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_GT3.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/arg_max_min_common.cl @@ -14,13 +14,13 @@ // limitations under the License. */ -#include "auto_tuner.h" -#include "auto_tuner_offline.h" -namespace kernel_selector -{ - // KBL GT3e - void tuning_cache_5927(tuning_data& td) +/* Index and Value type that holds index and value used in this kernel */ + +#ifndef IAV_STRUCT_DEFINED + typedef struct { - tuning_cache_5927_B1(td); - } -} \ No newline at end of file + uint index; + UNIT_TYPE value; + } iav_type; + #define IAV_STRUCT_DEFINED +#endif \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/common.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/common.cl index d5ca258..24040f2 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/common.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/common.cl @@ -14,6 +14,10 @@ // limitations under the License. */ +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif + #define __CAT(x, y) x##y #define CAT(x, y) __CAT(x, y) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/data_types.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/data_types.cl index 9949216..8d35591 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/data_types.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/data_types.cl @@ -14,10 +14,6 @@ // limitations under the License. */ -#if defined(cl_khr_fp16) -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -#endif - // TODO: currently we calculate on float32 because it's lot of "add" operation and it stuck on the value "8192.0f" #if !defined(ACCUMULATOR_TYPE) #define ACCUMULATOR_TYPE float diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/detection_output_common.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/detection_output_common.cl new file mode 100644 index 0000000..68016af --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/detection_output_common.cl @@ -0,0 +1,180 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#define PRIOR_BOX_SIZE 4 // Each prior-box consists of [xmin, ymin, xmax, ymax]. +#define OUTPUT_ROW_SIZE 7 // Each detection consists of [image_id, label, confidence, xmin, ymin, xmax, ymax]. + +#define CODE_TYPE_CORNER 0 +#define CODE_TYPE_CENTER_SIZE 1 +#define CODE_TYPE_CORNER_SIZE 2 + +#define HIDDEN_CLASS ((BACKGROUND_LABEL_ID == 0 && SHARE_LOCATION)? 1 : 0) +#define NUM_OF_IMAGES INPUT0_BATCH_NUM +#define NUM_LOC_CLASSES ((SHARE_LOCATION)? 1 : NUM_CLASSES) +#define NUM_CLASSES_OUT ((HIDDEN_CLASS == 1)? NUM_CLASSES - 1 : NUM_CLASSES) +#define NUM_OF_PRIORS (INPUT0_LENGTH / (NUM_OF_IMAGES * NUM_LOC_CLASSES * PRIOR_BOX_SIZE)) +#define NUM_OF_ITEMS ((NUM_OF_PRIORS / 256) + 1) +#define NUM_OF_ITERATIONS ((NUM_OF_PRIORS % NUM_OF_ITEMS == 0)? (NUM_OF_PRIORS / NUM_OF_ITEMS) : ((NUM_OF_PRIORS / NUM_OF_ITEMS) + 1)) + +#define X_SIZE INPUT0_Y_PITCH +#define Y_SIZE (INPUT0_FEATURE_PITCH/INPUT0_Y_PITCH) +#define LOCATION_PADDING (INPUT0_PAD_BEFORE_SIZE_Y * X_SIZE + INPUT0_PAD_BEFORE_SIZE_X) +#define LOC_XY_SIZE_PRODUCT (X_SIZE * Y_SIZE) +#define CONF_PADDING (CONF_PADDING_Y * CONF_SIZE_X + CONF_PADDING_X) +#define CONF_XY_SIZE_PRODUCT (CONF_SIZE_X * CONF_SIZE_Y) + +#define NUM_OF_PRIOR_COMPONENTS (NUM_OF_PRIORS * PRIOR_BOX_SIZE) +#define NUM_OF_IMAGE_CONF (INPUT0_LENGTH/NUM_OF_IMAGES/PRIOR_BOX_SIZE) + +#define SCORES_COUNT (((TOP_K != -1) && (TOP_K < NUM_OF_PRIORS))? TOP_K : NUM_OF_PRIORS) + +#define OUTPUT_OFFSET (((NUM_OF_IMAGES + 15) / 16) * 16) +#define SCORE_OFFSET 2 + +#define INPUT_OFFSET (((NUM_IMAGES + 15) / 16) * 16) +#define INPUT_BBOXES_COUNT ((INPUT0_LENGTH - INPUT_OFFSET) / OUTPUT_ROW_SIZE) +#define NUM_CLASSES_IN NUM_CLASSES_OUT +#define BBOXES_NUM_BASED_TOP_K (TOP_K * NUM_CLASSES_IN * NUM_IMAGES) +#define INPUT_BBOXES_LENGTH (((TOP_K != -1) && (BBOXES_NUM_BASED_TOP_K < INPUT_BBOXES_COUNT))? BBOXES_NUM_BASED_TOP_K : INPUT_BBOXES_COUNT) +#define NUM_OF_CLASS_BBOXES (INPUT_BBOXES_LENGTH / (NUM_IMAGES * NUM_CLASSES_IN)) +#define NUM_OF_IMAGE_BBOXES (INPUT_BBOXES_LENGTH / NUM_IMAGES) +#define NUM_OF_ITEMS_SORT ((NUM_CLASSES_IN / 256) + 1) + + +// Number of bboxes to keep in output +#define KEEP_BBOXES_NUM ((KEEP_TOP_K < NUM_OF_IMAGE_BBOXES)? 
KEEP_TOP_K : NUM_OF_IMAGE_BBOXES) + +void FUNC(get_decoded_bbox)(UNIT_TYPE* decoded_bbox, __global UNIT_TYPE* input_location, __global UNIT_TYPE* input_prior_box, const uint idx_prior, const uint idx_class, const uint idx_image) +{ + const uint prior_offset = idx_prior * PRIOR_INFO_SIZE + PRIOR_COORD_OFFSET; + uint location_offset = + (NUM_LOC_CLASSES * (idx_prior * PRIOR_BOX_SIZE) + idx_image * INPUT0_FEATURE_NUM + idx_class * PRIOR_BOX_SIZE) * + LOC_XY_SIZE_PRODUCT + + LOCATION_PADDING; + + UNIT_TYPE prior_bboxes[4] = { + input_prior_box[prior_offset], + input_prior_box[prior_offset + 1], + input_prior_box[prior_offset + 2], + input_prior_box[prior_offset + 3]}; + + if (!PRIOR_IS_NORMALIZED) + { + prior_bboxes[0] /= IMAGE_WIDTH; + prior_bboxes[1] /= IMAGE_HEIGH; + prior_bboxes[2] /= IMAGE_WIDTH; + prior_bboxes[3] /= IMAGE_HEIGH; + } + + if (CODE_TYPE == CODE_TYPE_CORNER) + { + if (VARIANCE_ENCODED_IN_TARGET) + { + // variance is encoded in target, we simply need to add the offset predictions. + for(uint i = 0; i < PRIOR_BOX_SIZE; i++) + { + decoded_bbox[i] = + prior_bboxes[i] + + input_location[location_offset]; + + location_offset += LOC_XY_SIZE_PRODUCT; + } + } + else + { + // variance is encoded in bbox, we need to scale the offset accordingly. + for(uint i = 0; i < PRIOR_BOX_SIZE; i++) + { + decoded_bbox[i] = + mad(input_prior_box[NUM_OF_PRIOR_COMPONENTS + i], // prior variances are placed after prior bboxes + input_location[location_offset], + prior_bboxes[i]); + + location_offset += LOC_XY_SIZE_PRODUCT; + } + } + } + else if (CODE_TYPE == CODE_TYPE_CENTER_SIZE) + { + const UNIT_TYPE prior_width = prior_bboxes[2] - prior_bboxes[0]; + const UNIT_TYPE prior_height = prior_bboxes[3] - prior_bboxes[1]; + const UNIT_TYPE prior_center_x = (prior_bboxes[0] + prior_bboxes[2]) / 2; + const UNIT_TYPE prior_center_y = (prior_bboxes[1] + prior_bboxes[3]) / 2; + const UNIT_TYPE bbox_xmin = input_location[location_offset]; + const UNIT_TYPE bbox_ymin = input_location[location_offset + LOC_XY_SIZE_PRODUCT]; + const UNIT_TYPE bbox_xmax = input_location[location_offset + 2 * LOC_XY_SIZE_PRODUCT]; + const UNIT_TYPE bbox_ymax = input_location[location_offset + 3 * LOC_XY_SIZE_PRODUCT]; + UNIT_TYPE decode_bbox_center_x, decode_bbox_center_y; + UNIT_TYPE decode_bbox_width, decode_bbox_height; + + if (VARIANCE_ENCODED_IN_TARGET) + { + // variance is encoded in target, we simply need to restore the offset predictions. + decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x; + decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y; + decode_bbox_width = (exp(bbox_xmax) * prior_width) / 2; + decode_bbox_height = (exp(bbox_ymax) * prior_height) / 2; + } + else + { + // variance is encoded in bbox, we need to scale the offset accordingly.
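+            // Spelled out, this branch applies the standard SSD center-size
+            // decode with per-coordinate variances read from the prior data
+            // (the variances are stored right after the prior boxes):
+            //   cx  = var[0] * t_xmin * w_prior + cx_prior
+            //   cy  = var[1] * t_ymin * h_prior + cy_prior
+            //   w/2 = exp(var[2] * t_xmax) * w_prior / 2
+            //   h/2 = exp(var[3] * t_ymax) * h_prior / 2
+            // where (t_xmin, t_ymin, t_xmax, t_ymax) are the location
+            // predictions loaded above.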
+ decode_bbox_center_x = input_prior_box[NUM_OF_PRIOR_COMPONENTS] * bbox_xmin * prior_width + prior_center_x; + decode_bbox_center_y = input_prior_box[NUM_OF_PRIOR_COMPONENTS + 1] * bbox_ymin * prior_height + prior_center_y; + decode_bbox_width = (exp(input_prior_box[NUM_OF_PRIOR_COMPONENTS + 2] * bbox_xmax) * prior_width) / 2; + decode_bbox_height = (exp(input_prior_box[NUM_OF_PRIOR_COMPONENTS + 3] * bbox_ymax) * prior_height) / 2; + } + + decoded_bbox[0] = decode_bbox_center_x - decode_bbox_width; + decoded_bbox[1] = decode_bbox_center_y - decode_bbox_height; + decoded_bbox[2] = decode_bbox_center_x + decode_bbox_width; + decoded_bbox[3] = decode_bbox_center_y + decode_bbox_height; + } + else + { + const UNIT_TYPE prior_width = prior_bboxes[2] - prior_bboxes[0]; + const UNIT_TYPE prior_height = prior_bboxes[3] - prior_bboxes[1]; + const UNIT_TYPE bbox_xmin = input_location[location_offset]; + const UNIT_TYPE bbox_ymin = input_location[location_offset + LOC_XY_SIZE_PRODUCT]; + const UNIT_TYPE bbox_xmax = input_location[location_offset + 2 * LOC_XY_SIZE_PRODUCT]; + const UNIT_TYPE bbox_ymax = input_location[location_offset + 3 * LOC_XY_SIZE_PRODUCT]; + + if (VARIANCE_ENCODED_IN_TARGET) + { + // variance is encoded in target, we simply need to add the offset predictions. + decoded_bbox[0] = prior_bboxes[0] + bbox_xmin * prior_width; + decoded_bbox[1] = prior_bboxes[1] + bbox_ymin * prior_height; + decoded_bbox[2] = prior_bboxes[2] + bbox_xmax * prior_width; + decoded_bbox[3] = prior_bboxes[3] + bbox_ymax * prior_height; + } + else + { + // variance is encoded in bbox, we need to scale the offset accordingly. + decoded_bbox[0] = prior_bboxes[0] + input_prior_box[NUM_OF_PRIOR_COMPONENTS] * bbox_xmin * prior_width; + decoded_bbox[1] = prior_bboxes[1] + input_prior_box[NUM_OF_PRIOR_COMPONENTS + 1] * bbox_ymin * prior_height; + decoded_bbox[2] = prior_bboxes[2] + input_prior_box[NUM_OF_PRIOR_COMPONENTS + 2] * bbox_xmax * prior_width; + decoded_bbox[3] = prior_bboxes[3] + input_prior_box[NUM_OF_PRIOR_COMPONENTS + 3] * bbox_ymax * prior_height; + } + } +} + +UNIT_TYPE FUNC(get_score)(__global UNIT_TYPE* input_confidence, const uint idx_prior, const uint idx_class, const uint idx_image) +{ + const uint confidence_offset = // offset in kernel input 'input_confidence' + (idx_prior * NUM_CLASSES + idx_image * NUM_OF_PRIORS * NUM_CLASSES + idx_class) * + CONF_XY_SIZE_PRODUCT + + CONF_PADDING; + + return (input_confidence[confidence_offset] > CONFIDENCE_THRESHOLD)? 
input_confidence[confidence_offset] : 0; +} + diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl index 582e9f5..837f4fc 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl @@ -72,6 +72,27 @@ inline uint FUNC(get_byxf_af32_index)(uint b, uint f, uint y, uint x, uint y_pit CAT(prefix, _FEATURE_NUM), \ CAT(prefix, _OFFSET)) +inline uint FUNC(get_byx8_f4_index)(uint b, uint f, uint y, uint x, + uint x_pitch, uint y_pitch, uint b_pitch, uint f_size, uint x_size, uint offset) +{ + const uint f_aligned_to_4 = ((f_size + 3) / 4) * 4; + const uint x_aligned_to_8 = ((x_size + 7) / 8) * 8; + const uint b_offset = b * b_pitch; + const uint xy_offset = x * x_pitch + y * y_pitch; + const uint f_offset = f; + const size_t idx = offset + xy_offset + b_offset + f_offset; + return idx; +} + +#define GET_DATA_BYX8_F4_INDEX(prefix, b, f, y, x)\ + FUNC_CALL(get_byx8_f4_index)( \ + b, f, y, x, CAT(prefix, _X_PITCH), \ + CAT(prefix, _Y_PITCH), \ + CAT(prefix, _BATCH_PITCH), \ + CAT(prefix, _FEATURE_NUM), \ + CAT(prefix, _SIZE_X), \ + CAT(prefix, _OFFSET)) + #define GET_DATA_BF8_XY16_INDEX(prefix, b, f, y, x) \ FUNC_CALL(get_bf8_xy16_index)( \ b, f, y, x, CAT(prefix, _SIZE_X ), \ @@ -249,7 +270,35 @@ inline uint FUNC(get_os_is_yx_isa8_osv8_isv4_index)(uint o, uint i, uint y, uint CAT(prefix, _OFM_NUM), \ CAT(prefix, _OFFSET)) +inline uint FUNC(get_os_is_yx_isa8_osv8_isv4_swizzled_by_4_index)(uint o, uint i, uint y, uint x, uint size_x, uint size_y, uint size_ifm, uint size_ofm, uint offset) +{ + const uint o_swizzled = (o % 4) * 8 + ((o % 32) / 4) + (o / 32) * 32; + + const uint f_32_aligned = ((size_ifm + 31)/32) * 32; + const uint isv2_idx = i % 4; + const uint osv_idx = o_swizzled % 8; + const uint isv1_idx = (i / 4) % 8; + const uint is_idx = i / 32; + const uint os_idx = o_swizzled / 8; + size_t idx = offset + isv2_idx + 4 * (osv_idx + 8 * isv1_idx); + idx += x * 4 * 8 * 8; + idx += y * size_x * 4 * 8 * 8; + idx += is_idx * size_y * size_x * 4 * 8 * 8; + idx += os_idx * (f_32_aligned/32) * size_y * size_x * 4 * 8 * 8; + + return idx; +} + +#define GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_SWIZZLED_BY_4_INDEX(prefix, o, i, y, x) \ + FUNC_CALL(get_os_is_yx_isa8_osv8_isv4_swizzled_by_4_index)( \ + o, i, y, x, CAT(prefix, _SIZE_X ), \ + CAT(prefix, _SIZE_Y), \ + CAT(prefix, _IFM_NUM), \ + CAT(prefix, _OFM_NUM), \ + CAT(prefix, _OFFSET)) + + inline uint FUNC(get_is_o_yx_isv32_index)(uint o, uint i, uint y, uint x, uint i_size, uint o_size, uint x_size, uint y_size) { const uint i_aligned_to_32 = ((i_size + 31) / 32) * 32; @@ -266,6 +315,106 @@ inline uint FUNC(get_is_o_yx_isv32_index)(uint o, uint i, uint y, uint x, uint i CAT(prefix, _SIZE_X),\ CAT(prefix, _SIZE_Y)) +inline uint FUNC(get_is_o32_yx_isv32_swizzled_by_4_index)(uint o, uint i, uint y, uint x, uint i_size, uint o_size, uint x_size, uint y_size) +{ + const uint o_aligned_to_32 = ((o_size + 31) / 32) * 32; + const uint o_swizzled = (o % 4) * 8 + ((o % 32) / 4) + (o / 32) * 32; + const uint i_aligned_to_32 = ((i_size + 31) / 32) * 32; + const uint i_val = i % 32; + const uint i_slice = i / 32; + const size_t idx = i_val + 32* (x + x_size * (y + y_size * (o_swizzled + o_aligned_to_32 * i_slice) ) ); + return idx; +} + +#define GET_FILTER_IS_O32_YX_ISV32_SWIZZLED_BY_4(prefix, o, i, y, x)\ + 
FUNC_CALL(get_is_o32_yx_isv32_swizzled_by_4_index)(\ + o, i, y, x, CAT(prefix, _IFM_NUM),\ + CAT(prefix, _OFM_NUM),\ + CAT(prefix, _SIZE_X),\ + CAT(prefix, _SIZE_Y)) + +inline uint FUNC(get_os_is_y_x8_osv8_isv4_index)(uint o, uint i, uint y, uint x, uint i_size, uint o_size, uint x_size, uint y_size) +{ + const uint i_aligned_to_4 = ((i_size + 3) / 4) * 4; + const uint o_aligned_to_8 = ((o_size + 7) / 8) * 8; + const uint x_aligned_to_8 = ((x_size + 7) / 8) * 8; + const uint i_val = i % 4; + const uint i_slice = i / 4; + const uint o_val = o % 8; + const uint o_slice = o / 8; + const size_t idx = i_val + 4 * (o_val + 8 * ( x + x_aligned_to_8 * (y + y_size * (i_slice + (i_aligned_to_4/4) * (o_slice))))); + return idx; +} + +#define GET_FILTER_OS_IS_Y_X8_OSV8_ISV4(prefix, o, i, y, x)\ + FUNC_CALL(get_os_is_y_x8_osv8_isv4_index)(\ + o, i, y, x, CAT(prefix, _IFM_NUM),\ + CAT(prefix, _OFM_NUM),\ + CAT(prefix, _SIZE_X),\ + CAT(prefix, _SIZE_Y)) + +#define GET_DATA_B_FS_YX_FSV4_INDEX(prefix, o, i, y, x)\ + FUNC_CALL(get_b_fs_yx_fsv4)(\ + o, i, y, x,\ + CAT(prefix, _FEATURE_NUM),\ + CAT(prefix, _PAD_BEFORE_SIZE_Y), CAT(prefix, _SIZE_Y), CAT(prefix, _PAD_AFTER_SIZE_Y),\ + CAT(prefix, _PAD_BEFORE_SIZE_X), CAT(prefix, _SIZE_X), CAT(prefix, _PAD_AFTER_SIZE_X)) + +inline uint FUNC(get_b_fs_yx_fsv4)(uint o, uint i, uint y, uint x, + uint feature_num, + uint pad_before_size_y, uint size_y, uint pad_after_size_y, + uint pad_before_size_x, uint size_x, uint pad_after_size_x) +{ + const uint tile = 4; + uint id_tile = i / tile; + uint id = i - id_tile * tile; + + const uint feature_num_aligned4 = ((feature_num + 3) / 4) * 4; + + uint idx = o * (feature_num_aligned4 / tile) * + (pad_before_size_y + size_y + pad_after_size_y) * + (pad_before_size_x + size_x + pad_after_size_x) * tile + + id_tile * (pad_before_size_y + size_y + pad_after_size_y) * + (pad_before_size_x + size_x + pad_after_size_x) * tile + + pad_before_size_y * (pad_before_size_x + size_x + pad_after_size_x) * tile + + y * (pad_before_size_x + size_x + pad_after_size_x) * tile + + pad_before_size_x * tile + + x * tile + + id; + + return idx; +} + +#define GET_FILTER_OS_IS_YX_OSV16_ISV4_INDEX(prefix, o, i, y, x)\ + FUNC_CALL(get_os_is_yx_osv16_isv4)(\ + o, i, y, x,\ + CAT(prefix, _IFM_PITCH),\ + CAT(prefix, _OFM_PITCH),\ + CAT(prefix, _SIZE_X)) + +inline uint FUNC(get_os_is_yx_osv16_isv4)(uint o, uint i, uint y, uint x, + uint i_size, + uint o_size, + uint x_size) +{ + const uint otd = 16; + uint out_depth_tile = o / otd; + uint od = o - out_depth_tile * otd; + + const uint tile = 4; + uint id_tile = i / tile; + uint id = i - id_tile * tile; + + uint idx = out_depth_tile * (o_size / tile) * otd * tile + + id_tile * i_size * otd * tile + + y * x_size * otd * tile + + x * otd * tile + + od * tile + + id; + + return idx; +} + #define DECLARE_SAMPLER const sampler_t imageSampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST #if FP16_UNIT_USED diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/imad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/imad.cl new file mode 100644 index 0000000..d05e20e --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/imad.cl @@ -0,0 +1,34 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +inline int FUNC(imad_SW)(int acc, uchar4 input, char4 weight) __attribute__((overloadable)) { + acc += input[0] * weight[0]; + acc += input[1] * weight[1]; + acc += input[2] * weight[2]; + acc += input[3] * weight[3]; + return acc; +} + +inline int FUNC(imad_SW)(int acc, char4 input, char4 weight) __attribute__((overloadable)) { + acc += input[0] * weight[0]; + acc += input[1] * weight[1]; + acc += input[2] * weight[2]; + acc += input[3] * weight[3]; + return acc; +} + + +#define IMAD(_O, _I, _W) FUNC_CALL(imad_SW)(_O, _I, _W) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/include_all.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/include_all.cl index 6b030bc..cc1c7ea 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/include_all.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/include_all.cl @@ -16,6 +16,4 @@ #include "common.cl" #include "data_types.cl" -#include "sub_group.cl" -#include "reshape_dims.cl" #include "fetch.cl" \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/mmad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/mmad.cl index 1200075..4fc07ad 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/mmad.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/mmad.cl @@ -14,6 +14,47 @@ // limitations under the License. 
*/ +void FUNC(intel_sub_group_block_write_4)( __local uint* p, uint4 data ) +{ + p[ get_sub_group_local_id() ] = data.s0; + p += 8; + p[ get_sub_group_local_id() ] = data.s1; + p += 8; + p[ get_sub_group_local_id() ] = data.s2; + p += 8; + p[ get_sub_group_local_id() ] = data.s3; +} + +uint4 FUNC(intel_sub_group_block_read_uint4)(const __local uint* p) +{ + uint4 ret; + uint idx = get_sub_group_local_id(); + + ret.s0 = p[idx]; idx += get_max_sub_group_size(); + ret.s1 = p[idx]; idx += get_max_sub_group_size(); + ret.s2 = p[idx]; idx += get_max_sub_group_size(); + ret.s3 = p[idx]; idx += get_max_sub_group_size(); + + return ret; +} + +uint8 FUNC(intel_sub_group_block_read_uint8)(const __local uint* p) +{ + uint8 ret; + uint idx = get_sub_group_local_id(); + + ret.s0 = p[idx]; idx += get_max_sub_group_size(); + ret.s1 = p[idx]; idx += get_max_sub_group_size(); + ret.s2 = p[idx]; idx += get_max_sub_group_size(); + ret.s3 = p[idx]; idx += get_max_sub_group_size(); + ret.s4 = p[idx]; idx += get_max_sub_group_size(); + ret.s5 = p[idx]; idx += get_max_sub_group_size(); + ret.s6 = p[idx]; idx += get_max_sub_group_size(); + ret.s7 = p[idx]; idx += get_max_sub_group_size(); + + return ret; +} + inline int FUNC(mmad_4)(char4 input, char4 weight, int acc) { acc += (input[0] * weight[0]); @@ -75,7 +116,54 @@ inline int8 FUNC(mmad8x8)(int8 A_vectors, int8 B_vectors, int8 acc) return ret; } +// TODO: remove it when cl_intel_subgroups_char extension will work +inline void FUNC(sub_group_block_write_uchar8)(__global uchar* outPtr, uchar8 v) +{ +#ifdef cl_intel_subgroups_char + intel_sub_group_block_write_uc8(outPtr, v); +#else + uint idx = get_sub_group_local_id(); + + outPtr[idx] = v.s0; idx += get_max_sub_group_size(); + outPtr[idx] = v.s1; idx += get_max_sub_group_size(); + outPtr[idx] = v.s2; idx += get_max_sub_group_size(); + outPtr[idx] = v.s3; idx += get_max_sub_group_size(); + outPtr[idx] = v.s4; idx += get_max_sub_group_size(); + outPtr[idx] = v.s5; idx += get_max_sub_group_size(); + outPtr[idx] = v.s6; idx += get_max_sub_group_size(); + outPtr[idx] = v.s7; idx += get_max_sub_group_size(); +#endif +} + +inline uchar8 FUNC(sub_group_block_read_uchar8)(const __global uchar* ptr) +{ +#ifdef cl_intel_subgroups_char + return intel_sub_group_block_read_uc8(ptr); +#else + uint idx = get_sub_group_local_id(); + + uchar8 ret; + + ret.s0 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s1 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s2 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s3 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s4 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s5 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s6 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s7 = ptr[idx]; idx += get_max_sub_group_size(); + + return ret; + +#endif +} + +// + #define MMAD_8(A, B, C) FUNC_CALL(mmad8)(A, B, C) #define MMAD_4x8(A, B, C) FUNC_CALL(mmad4x8)(A, B, C) #define MMAD_8x8(A, B, C) FUNC_CALL(mmad8x8)(A, B, C) +#define SLM_BLOCK_WRITE_4(A, B) (FUNC_CALL(intel_sub_group_block_write_4)(A, B)) +#define SLM_BLOCK_READ_4(A) (FUNC_CALL(intel_sub_group_block_read_uint4)(A)) +#define SLM_BLOCK_READ_8(A) (FUNC_CALL(intel_sub_group_block_read_uint8)(A)) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/vec_typedefs.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/vec_typedefs.cl index e0fdb49..8b50ecb 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/vec_typedefs.cl +++ 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/vec_typedefs.cl @@ -52,37 +52,3 @@ typedef struct float14 { float s0; float s1; float s2; float s3; float s4; float typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15; typedef struct float0 { float s0; } float0; //never used but makes compiler happy. - -#if (KERNEL_WIDTH == 1) -__constant half1 half_zeros= (half1){0}; -#elif (KERNEL_WIDTH == 2) - __constant half2 half_zeros = (half2)(0); -#elif (KERNEL_WIDTH == 3) - __constant half3 half_zeros = (half3)(0); -#elif (KERNEL_WIDTH == 4) - __constant half4 half_zeros = (half4)(0); -#elif (KERNEL_WIDTH == 5) - __constant half5 half_zeros = (half5){0, 0, 0, 0, 0}; -#elif (KERNEL_WIDTH == 6) - __constant half6 half_zeros = (half6){0, 0, 0, 0, 0, 0}; -#elif (KERNEL_WIDTH == 7) - __constant half7 half_zeros = (half7){0, 0, 0, 0, 0, 0, 0}; -#elif (KERNEL_WIDTH == 8) - __constant half8 half_zeros = (half8)(0); -#elif (KERNEL_WIDTH == 9) - __constant half9 half_zeros = (half9){0, 0, 0, 0, 0, 0, 0, 0, 0}; -#elif (KERNEL_WIDTH == 10) - __constant half10 half_zeros = (half10){0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; -#elif (KERNEL_WIDTH == 11) - __constant half11 half_zeros = (half11){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; -#elif (KERNEL_WIDTH == 12) - __constant half12 half_zeros = (half12){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; -#elif (KERNEL_WIDTH == 13) - __constant half13 half_zeros = (half13){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; -#elif (KERNEL_WIDTH == 14) - __constant half14 half_zeros = (half14){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; -#elif (KERNEL_WIDTH == 15) - __constant half15 half_zeros = (half15){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; -#elif (KERNEL_WIDTH == 16) - __constant half16 half_zeros = (half16)(0); -#endif diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/index_select_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/index_select_gpu_ref.cl index 9862c1a..33d3403 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/index_select_gpu_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/index_select_gpu_ref.cl @@ -17,7 +17,9 @@ KERNEL(index_select_gpu_ref)( const __global UNIT_TYPE* input, +#ifndef REVERSE const __global int* indices, +#endif __global UNIT_TYPE* output) { // [CONSTEXPR]: @@ -29,32 +31,73 @@ KERNEL(index_select_gpu_ref)( const uint out_b = (uint) get_global_id(0); const uint indices_idx = (uint) get_global_id(1); const uint feature_idx = (uint) get_global_id(2); - const uint indices_value = indices[indices_idx]; + #if AXES_NUMBER == 1 + #ifdef REVERSE + const uint indices_value = REVERSE_AXIS_SIZE - 1 - indices_idx; + #else + const uint indices_value = indices[indices_idx]; + #endif + #elif AXES_NUMBER > 1 + #ifdef REVERSE + uint indices_value[4] = { + #ifdef REVERSE_INDEX_SELECT_AXIS_BATCH_SIZE + REVERSE_INDEX_SELECT_AXIS_BATCH_SIZE - 1 - out_b, + #else + out_b, + #endif + #ifdef REVERSE_INDEX_SELECT_AXIS_FEATURE_SIZE + REVERSE_INDEX_SELECT_AXIS_FEATURE_SIZE - 1 - feature_idx, + #else + feature_idx, + #endif + #ifdef REVERSE_INDEX_SELECT_AXIS_Y_SIZE + REVERSE_INDEX_SELECT_AXIS_Y_SIZE - 1 - indices_idx, + #else + indices_idx, + #endif + 0 + }; + #endif + #endif + // [LOGIC]: -#ifdef INDEX_SELECT_AXIS_BATCH - for(uint x = 0; x < input_sx; x++) - { - for(uint y = 0; y < input_sy; y++) - { - 
output[GET_DATA_INDEX(OUTPUT, indices_idx, feature_idx, y, x)] = input[GET_DATA_INDEX(INPUT0, indices_value, feature_idx, y, x)]; + #if AXES_NUMBER > 1 + for(uint x = 0; x < input_sx; x++) + { + #ifdef REVERSE_INDEX_SELECT_AXIS_X_SIZE + indices_value[3] = REVERSE_INDEX_SELECT_AXIS_X_SIZE - 1 - x; + #else + indices_value[3] = x; + #endif + output[GET_DATA_INDEX(OUTPUT, out_b, feature_idx, indices_idx, x)] = input[GET_DATA_INDEX(INPUT0, indices_value[0], indices_value[1], indices_value[2], indices_value[3])]; } - } -#elif defined INDEX_SELECT_AXIS_FEATURE - for(uint x = 0; x < input_sx; x++) - { - output[GET_DATA_INDEX(OUTPUT, out_b, indices_idx, feature_idx, x)] = input[GET_DATA_INDEX(INPUT0, out_b, indices_value, feature_idx, x)]; - } -#elif defined INDEX_SELECT_AXIS_X - for(uint i = 0; i < input_sx; i++) - { - output[GET_DATA_INDEX(OUTPUT, out_b, feature_idx, i, indices_idx)] = input[GET_DATA_INDEX(INPUT0, out_b, feature_idx, i, indices_value)]; - } -#elif defined INDEX_SELECT_AXIS_Y + + #else + #ifdef INDEX_SELECT_AXIS_BATCH + for(uint x = 0; x < input_sx; x++) + { + for(uint y = 0; y < input_sy; y++) + { + output[GET_DATA_INDEX(OUTPUT, indices_idx, feature_idx, y, x)] = input[GET_DATA_INDEX(INPUT0, indices_value, feature_idx, y, x)]; + } + } + #elif defined INDEX_SELECT_AXIS_FEATURE + for(uint x = 0; x < input_sx; x++) + { + output[GET_DATA_INDEX(OUTPUT, out_b, indices_idx, feature_idx, x)] = input[GET_DATA_INDEX(INPUT0, out_b, indices_value, feature_idx, x)]; + } + #elif defined INDEX_SELECT_AXIS_X + for(uint i = 0; i < input_sy; i++) + { + output[GET_DATA_INDEX(OUTPUT, out_b, feature_idx, i, indices_idx)] = input[GET_DATA_INDEX(INPUT0, out_b, feature_idx, i, indices_value)]; + } + #elif defined INDEX_SELECT_AXIS_Y - for(uint i = 0; i < input_sx; i++) - { - output[GET_DATA_INDEX(OUTPUT, out_b, feature_idx, indices_idx, i)] = input[GET_DATA_INDEX(INPUT0, out_b, feature_idx, indices_value, i)]; - } -#endif + for(uint i = 0; i < input_sx; i++) + { + output[GET_DATA_INDEX(OUTPUT, out_b, feature_idx, indices_idx, i)] = input[GET_DATA_INDEX(INPUT0, out_b, feature_idx, indices_value, i)]; + } + #endif + #endif } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lstm_elt_gpu_bfyx_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lstm_elt_gpu_bfyx_ref.cl index f4d8f72..682b83a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lstm_elt_gpu_bfyx_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lstm_elt_gpu_bfyx_ref.cl @@ -18,9 +18,9 @@ #define ACTIVATION_LOGISTIC(input) (UNIT_VAL_ONE/(UNIT_VAL_ONE + exp(-input))) #define ACTIVATION_HYPERBOLIC_TAN(input) (tanh(input)) -// tempGEMM = [ batch, direction, 1, 4 * hidden_size ] -// cell = [ batch, direction, 1, hidden_size ] optional -// output = [ batch, direction, 2, hidden_size ] output +// tempGEMM = [ batch, 1, direction, 4 * hidden_size ] +// cell = [ batch, 1, direction, hidden_size ] optional +// output = [ batch, 1, direction, hidden_size ] output KERNEL(lstm_elt)( const __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output @@ -47,9 +47,9 @@ KERNEL(lstm_elt)( #endif #if CELL_TERM - val += cell[GET_DATA_INDEX(CELL, b, 0, 0, x)] * ACTIVATION_LOGISTIC(CLIP(ft)); + val += cell[GET_DATA_INDEX(CELL, b, 0, CELL_DIRECTION, x)] * ACTIVATION_LOGISTIC(CLIP(ft)); #endif - output[GET_DATA_INDEX(OUTPUT, b, 0, 0, x)] = ACTIVATION_HYPERBOLIC_TAN(val) * ACTIVATION_LOGISTIC(ot); // hidden + 
output[GET_DATA_INDEX(OUTPUT, b, 0, 0, x)] = (OUTPUT_TYPE)(ACTIVATION_HYPERBOLIC_TAN(val) * ACTIVATION_LOGISTIC(ot)); // hidden output[GET_DATA_INDEX(OUTPUT, b, 1, 0, x)] = (OUTPUT_TYPE)val; // cell } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lstm_gemm_gpu_bfyx_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lstm_gemm_gpu_bfyx_ref.cl index 3980075..90370bd 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lstm_gemm_gpu_bfyx_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lstm_gemm_gpu_bfyx_ref.cl @@ -43,14 +43,14 @@ KERNEL(lstm_gemm)( ACCUMULATOR_TYPE dotProd = 0; for(uint x = 0; x < INPUT0_SIZE_X; ++x ) { - const uint input_idx = GET_DATA_INDEX(INPUT0, b, 0, 0, x); + const uint input_idx = GET_DATA_INDEX(INPUT0, b, 0, INPUT_DIRECTION, x); const uint weights_idx = GET_DATA_INDEX(WEIGHTS, 0, DIRECTION, y, x); dotProd += (ACCUMULATOR_TYPE)(input[input_idx] * weights[weights_idx]); } #if HIDDEN_TERM for(uint x = 0; x < HIDDEN_SIZE_X; ++x ) { - const uint hidden_idx = GET_DATA_INDEX(HIDDEN, b, 0, 0, x); + const uint hidden_idx = GET_DATA_INDEX(HIDDEN, b, 0, HIDDEN_DIRECTION, x); const uint recurrent_idx = GET_DATA_INDEX(RECURRENT, 0, DIRECTION, y, x); dotProd += (ACCUMULATOR_TYPE)(hidden[hidden_idx] * recurrent[recurrent_idx]); } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lstm_gemv_gpu_subgroup1x64_bfyx_ff_SIMD16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lstm_gemv_gpu_subgroup1x64_bfyx_ff_SIMD16.cl new file mode 100644 index 0000000..82c3e7f --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lstm_gemv_gpu_subgroup1x64_bfyx_ff_SIMD16.cl @@ -0,0 +1,128 @@ +// Copyright (c) 2016-2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "include/include_all.cl" + +#ifndef DIRECTION +#define DIRECTION 0 +#endif + +#ifndef SIMD +#define SIMD 16 +#endif + +// Sums value of result across all subgroups. +#define SUM_ACROSS_SUB_GROUP(val) \ + \ +{ \ + val += intel_sub_group_shuffle(val, x+1); \ + val += intel_sub_group_shuffle(val, x+2); \ + val += intel_sub_group_shuffle(val, x+4); \ + val += (SIMD > 8) ? intel_sub_group_shuffle(val, x+8) : 0; \ + val += (SIMD > 16) ? 
intel_sub_group_shuffle(val, x+16) : 0; \ +} + +// input = [ batch, sequence, 1, input_size ] +// weights = [ 1, direction, 4 * hidden_size, input_size ] +// recurrent = [ 1, direction, 4 * hidden_size, hidden_size ] +// biases = [ 1, 1, direction, 4 * hidden_size ] optional +// hidden = [ batch, direction, 1, hidden_size ] optional +// tempGEMM = [ batch, direction, 1, 4 * hidden_size ] output + +__attribute__((reqd_work_group_size(SIMD, 1, 1))) +KERNEL(lstm_gemm)( + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + const __global WEIGHTS_TYPE* weights +#if HIDDEN_TERM + , const __global OUTPUT_TYPE* hidden, + const __global RECURRENT_TYPE* recurrent +#endif +#if BIAS_TERM + , const __global BIAS_TYPE* biases +#endif + ) +{ + const uint x = get_local_id(0); + const uint y = get_global_id(1); + const int local_sz = get_local_size(0); + const int weight_num_rows = get_global_size(1); + + uint K; + int start_offset; + int end_offset; + int matrix_offset; + int vector_offset; + float4 sum; + float result; + + K = INPUT0_SIZE_X; // Width of weight matrix + start_offset = GET_DATA_INDEX(WEIGHTS, 0, DIRECTION, y, 0); // set as the starting offset of the weight matrix + end_offset = start_offset + K; + matrix_offset = start_offset + (x * 4); // Weight offset for the work item to work on + vector_offset = GET_DATA_INDEX(INPUT0, 0, 0, INPUT_DIRECTION, (x*4)); // Input offset for the work item to work on + sum = (float4)(0.f); + result = 0; + for(; matrix_offset < end_offset; matrix_offset += (local_sz * 4), vector_offset += (local_sz * 4)) + { + float4 mask = (float4) (1 , (matrix_offset + 1) < end_offset , (matrix_offset + 2) < end_offset , (matrix_offset + 3) < end_offset); + float4 m = (float4) (weights[matrix_offset], weights[matrix_offset + 1], weights[matrix_offset + 2], weights[matrix_offset + 3]); + m = m * mask; + + const float4 v = (float4) (input[vector_offset], input[vector_offset + 1], input[vector_offset + 2], input[vector_offset + 3]); + + sum = mad(m, v, sum); + } + + result = sum.x + sum.y + sum.z + sum.w; + +#if HIDDEN_TERM + K = HIDDEN_SIZE_X; // width of recurrent matrix + start_offset = GET_DATA_INDEX(RECURRENT, 0, DIRECTION, y, 0); // set as the starting offset of the recurrent matrix + end_offset = start_offset + K; + matrix_offset = start_offset + (x * 4); // recurrent offset for the work item to work on + vector_offset = GET_DATA_INDEX(HIDDEN, 0, 0, HIDDEN_DIRECTION, (x*4)); // hidden vector offset for the work item to work on + sum = (float4)(0.f); + for(; matrix_offset < end_offset; matrix_offset += (local_sz * 4), vector_offset += (local_sz * 4)) + { + float4 mask = (float4) (1 , (matrix_offset + 1) < end_offset , (matrix_offset + 2) < end_offset , (matrix_offset + 3) < end_offset); + float4 m = (float4) (recurrent[matrix_offset], recurrent[matrix_offset + 1], recurrent[matrix_offset + 2], recurrent[matrix_offset + 3]); + m = m * mask; + + const float4 v = (float4) (hidden[vector_offset], hidden[vector_offset + 1], hidden[vector_offset + 2], hidden[vector_offset + 3]); + + sum = mad(m, v, sum); + } + + result += sum.x + sum.y + sum.z + sum.w; +#endif + + // Add together partial sums contained in each work item's "result" variable + SUM_ACROSS_SUB_GROUP(result); + + if(x == 0) + { + output[y] = (OUTPUT_TYPE)result; + +#if BIAS_TERM + const uint bias_idx = GET_DATA_INDEX(BIAS, 0, 0, DIRECTION, y); + float bias = (ACCUMULATOR_TYPE)biases[bias_idx]; + output[y] += (OUTPUT_TYPE)bias; +#endif + } +} + +#undef SUM_ACROSS_SUB_GROUP +#undef SIMD \ No newline at 
end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lstm_gemv_gpu_subgroup1x64_bfyx_hh_SIMD16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lstm_gemv_gpu_subgroup1x64_bfyx_hh_SIMD16.cl new file mode 100644 index 0000000..0be579b --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lstm_gemv_gpu_subgroup1x64_bfyx_hh_SIMD16.cl @@ -0,0 +1,131 @@ +// Copyright (c) 2016-2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "include/include_all.cl" + +#ifndef DIRECTION +#define DIRECTION 0 +#endif + +#ifndef SIMD +#define SIMD 16 +#endif + +// Sums value of result across all subgroups. +#define SUM_ACROSS_SUB_GROUP(val) \ + \ +{ \ + val += intel_sub_group_shuffle(val, x+1); \ + val += intel_sub_group_shuffle(val, x+2); \ + val += intel_sub_group_shuffle(val, x+4); \ + val += intel_sub_group_shuffle(val, x+8); \ +} + +// input = [ batch, sequence, 1, input_size ] +// weights = [ 1, direction, 4 * hidden_size, input_size ] +// recurrent = [ 1, direction, 4 * hidden_size, hidden_size ] +// biases = [ 1, 1, direction, 4 * hidden_size ] optional +// hidden = [ batch, direction, 1, hidden_size ] optional +// tempGEMM = [ batch, direction, 1, 4 * hidden_size ] output + +__attribute__((reqd_work_group_size(SIMD, 1, 1))) +KERNEL(lstm_gemm)( + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + const __global WEIGHTS_TYPE* weights +#if HIDDEN_TERM + , const __global OUTPUT_TYPE* hidden, + const __global RECURRENT_TYPE* recurrent +#endif +#if BIAS_TERM + , const __global BIAS_TYPE* biases +#endif + ) +{ + const uint x = get_local_id(0); + const uint y = get_global_id(1); + const int local_sz = get_local_size(0); + + uint K; + int start_offset; + int end_offset; + int matrix_offset; + int vector_offset; + float4 sum; + float result; + + K = INPUT0_SIZE_X; // Width of weight matrix + start_offset = GET_DATA_INDEX(WEIGHTS, 0, DIRECTION, y, 0); // set as the starting offset of the weight matrix + end_offset = start_offset + K; + matrix_offset = start_offset + (x * 4); // Weight offset for the work item to work on + vector_offset = GET_DATA_INDEX(INPUT0, 0, 0, INPUT_DIRECTION, (x*4)); // Input offset for the work item to work on + sum = (float4)(0.f); + result = 0; + for(; matrix_offset < end_offset; matrix_offset += (local_sz * 4), vector_offset += (local_sz * 4)) + { + half4 mask = (half4) (1 , (matrix_offset + 1) < end_offset , (matrix_offset + 2) < end_offset , (matrix_offset + 3) < end_offset); + half4 m = (half4) (weights[matrix_offset], weights[matrix_offset + 1], weights[matrix_offset + 2], weights[matrix_offset + 3]); + m = m * mask; + + const half4 v = (half4)(input[vector_offset], input[vector_offset + 1], input[vector_offset + 2], input[vector_offset + 3]); + + sum = mad(convert_float4(m), convert_float4(v), sum); + } + + result = sum.x + sum.y + sum.z + sum.w; + +#if HIDDEN_TERM + K = HIDDEN_SIZE_X; // width of recurrent matrix + start_offset = 
GET_DATA_INDEX(RECURRENT, 0, DIRECTION, y, 0); // set as the starting offset of the recurrent matrix + end_offset = start_offset + K; + matrix_offset = start_offset + (x * 4); // recurrent offset for the work item to work on + vector_offset = GET_DATA_INDEX(HIDDEN, 0, 0, HIDDEN_DIRECTION, (x*4)); // hidden vector offset for the work item to work on + sum = (float4)(0.f); + for(; matrix_offset < end_offset; matrix_offset += (local_sz * 4), vector_offset += (local_sz * 4)) + { + half4 mask = (half4) (1 , (matrix_offset + 1) < end_offset , (matrix_offset + 2) < end_offset , (matrix_offset + 3) < end_offset); + half4 m = (half4) (recurrent[matrix_offset], recurrent[matrix_offset + 1], recurrent[matrix_offset + 2], recurrent[matrix_offset + 3]); + m = m * mask; + + const half4 v = (half4) (hidden[vector_offset], hidden[vector_offset + 1], hidden[vector_offset + 2], hidden[vector_offset + 3]); + + sum = mad(convert_float4(m), convert_float4(v), sum); + } + + result += sum.x + sum.y + sum.z + sum.w; +#endif + + // Add together partial sums contained in each work item's "result" variable + SUM_ACROSS_SUB_GROUP(result); + + if(x == 0) + { + output[y] = 0;// (half)result; + +#if BIAS_TERM + const uint bias_idx = GET_DATA_INDEX(BIAS, 0, 0, DIRECTION, y); + half bias = biases[bias_idx]; + result += (float)bias; +#endif + + output[y] = (half)result; + //output[y] = convert_half_rte(result); + + + } +} + +#undef SUM_ACROSS_SUB_GROUP +#undef SIMD \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/one_hot_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/one_hot_ref.cl new file mode 100644 index 0000000..b3f02ae --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/one_hot_ref.cl @@ -0,0 +1,39 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
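Ahead of the kernel body: one_hot_ref zero-fills the new ONE_HOT_AXIS dimension and then writes a single 1 at the coordinate given by the input value, silently skipping values outside [0, ONE_HOT_LIMIT). A minimal scalar sketch of that contract, with the 4-D coordinate bookkeeping collapsed to a flat layout; names are illustrative:

/* One-hot expansion of n class ids into n x limit outputs. */
void one_hot_scalar(const int* ids, float* output, unsigned n, unsigned limit)
{
    for (unsigned i = 0; i < n; ++i)
    {
        for (unsigned c = 0; c < limit; ++c)
            output[i * limit + c] = 0.0f;                 /* fill the new axis with 0 */
        if (ids[i] >= 0 && (unsigned)ids[i] < limit)      /* ignore bad input values  */
            output[i * limit + (unsigned)ids[i]] = 1.0f;  /* put in the 1             */
    }
}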
+ +#include "include/include_all.cl" + +#define GET_COORDS_INDEX(prefix, coords) GET_DATA_INDEX(prefix, coords[0], coords[1], coords[2], coords[3]) + +KERNEL(one_hot_ref)( + const __global INPUT0_TYPE* input, + __global INPUT0_TYPE* output) +{ + uint in_coords[4] = { 0, get_global_id(0), get_global_id(1), get_global_id(2) }; + uint out_coords[4] = { 0, get_global_id(0), get_global_id(1), get_global_id(2) }; + for (uint i = 0; i < ONE_HOT_AXIS; ++i) + out_coords[i] = out_coords[i + 1]; + + // Fill the output with 0 + for (out_coords[ONE_HOT_AXIS] = 0; out_coords[ONE_HOT_AXIS] < ONE_HOT_LIMIT; ++out_coords[ONE_HOT_AXIS]) + output[GET_COORDS_INDEX(OUTPUT, out_coords)] = 0; + + // Put in the 1; ignore bad input values + INPUT0_TYPE val = input[GET_COORDS_INDEX(INPUT0, in_coords)]; + if (val >= 0 && val < ONE_HOT_LIMIT) + { + out_coords[ONE_HOT_AXIS] = val; + output[GET_COORDS_INDEX(OUTPUT, out_coords)] = 1; + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/permute_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/permute_ref.cl index a980555..a85c82f 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/permute_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/permute_ref.cl @@ -12,34 +12,29 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "include/common.cl" -#include "include/data_types.cl" +#include "include/include_all.cl" KERNEL (permute_ref)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output) { uint4 input_indices, output_indices; - input_indices[0] = get_global_id(0); - input_indices[1] = get_global_id(1); - input_indices[2] = get_global_id(2) % INPUT0_SIZES[2]; - input_indices[3] = get_global_id(2) / INPUT0_SIZES[2]; + //gws(y, x, b*f) + //input_indices[b, f, x, y] + input_indices[3] = get_global_id(0); + input_indices[2] = get_global_id(1); + input_indices[1] = get_global_id(2) % INPUT0_FEATURE_NUM; + input_indices[0] = get_global_id(2) / INPUT0_FEATURE_NUM; + //PERMUTE_ORDER[b, f, x, y] + //output_indices[b, f, x, y] output_indices[0] = input_indices[PERMUTE_ORDER[0]]; output_indices[1] = input_indices[PERMUTE_ORDER[1]]; output_indices[2] = input_indices[PERMUTE_ORDER[2]]; output_indices[3] = input_indices[PERMUTE_ORDER[3]]; - uint input_offset = INPUT0_OFFSET + - input_indices[0]*INPUT0_PITCHES[0] + - input_indices[1]*INPUT0_PITCHES[1] + - input_indices[2]*INPUT0_PITCHES[2] + - input_indices[3]*INPUT0_PITCHES[3]; - uint output_offset = OUTPUT_OFFSET + - output_indices[0]*OUTPUT_PITCHES[0] + - output_indices[1]*OUTPUT_PITCHES[1] + - output_indices[2]*OUTPUT_PITCHES[2] + - output_indices[3]*OUTPUT_PITCHES[3]; + uint input_offset = GET_DATA_INDEX(INPUT0, input_indices[0], input_indices[1], input_indices[3], input_indices[2]); + uint output_offset = GET_DATA_INDEX(OUTPUT, output_indices[0], output_indices[1], output_indices[3], output_indices[2]); output[output_offset] = ACTIVATION(input[input_offset], NL_M, NL_N); } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl new file mode 100644 index 0000000..a31592d --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl @@ -0,0 +1,143 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the 
"License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "include/include_all.cl" + +#if MAX_POOLING + #define INIT_VAL CHAR_MIN +#elif AVG_POOLING + #define INIT_VAL 0 +#else +#error +#endif + + +inline int FUNC(apply_pooling)(int tmp, int in) +{ +#if MAX_POOLING + return max(tmp, in); +#elif AVG_POOLING + return tmp + in; +#endif +} + +KERNEL(pooling_gpu_b_fs_yx_fsv4)( + const __global UNIT_TYPE* input, + __global UNIT_TYPE* output) +{ + const uint x = (uint)get_global_id(0); + const uint y = (uint)get_global_id(1); + const uint bf = (uint)get_global_id(2); + const uint f = (bf * 4) % INPUT0_FEATURE_NUM; + const uint b = (bf * 4) / INPUT0_FEATURE_NUM; + + const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X; + const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y; + + int result[4] = { INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL }; + +#ifdef CHECK_BOUNDRY + if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X || + offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y) + { + return; + } + +#ifdef DYNAMIC_KERNEL_DIVIDER + uint num_elementes = 0; +#endif + + const uint batch_and_feature_offset = GET_DATA_B_FS_YX_FSV4_INDEX(INPUT0, b, f, 0, 0); + for(uint j = 0; j < POOL_SIZE_Y; j++) + { + int input_offset_y = offset_y + j; + bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0; + if(!zero_y) + { + for(uint i = 0; i < POOL_SIZE_X; i++) + { + int input_offset_x = offset_x + i; + bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0; + if(!zero) + { + const uint input_idx = batch_and_feature_offset + input_offset_y*IN_Y_PITCH + input_offset_x*IN_X_PITCH; + + int int_data = *((const __global int*)(input + input_idx)); + char4 ch4_data = as_char4(int_data); + result[0] = FUNC_CALL(apply_pooling)(result[0], (int)ch4_data[0]); + result[1] = FUNC_CALL(apply_pooling)(result[1], (int)ch4_data[1]); + result[2] = FUNC_CALL(apply_pooling)(result[2], (int)ch4_data[2]); + result[3] = FUNC_CALL(apply_pooling)(result[3], (int)ch4_data[3]); + +#ifdef DYNAMIC_KERNEL_DIVIDER + num_elementes++; +#endif + } + } + } + } +#ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER + const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y); + const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X); + const uint num_elementes = (hend - offset_y) * (wend - offset_x); +#endif +#else // !CHECK_BOUNDRY + uint input_idx = GET_DATA_B_FS_YX_FSV4_INDEX(INPUT0, b, f, offset_y, offset_x); + + for(uint j = 0; j < POOL_SIZE_Y; j++) + { + for(uint i = 0; i < POOL_SIZE_X; i++) + { + int int_data = *((const __global int*)(input + input_idx)); + char4 ch4_data = as_char4(int_data); + result[0] = FUNC_CALL(apply_pooling)(result[0], (int)ch4_data[0]); + result[1] = FUNC_CALL(apply_pooling)(result[1], (int)ch4_data[1]); + result[2] = FUNC_CALL(apply_pooling)(result[2], (int)ch4_data[2]); + result[3] = FUNC_CALL(apply_pooling)(result[3], (int)ch4_data[3]); + + input_idx += IN_X_PITCH; + } + input_idx += (IN_Y_PITCH - POOL_SIZE_X*IN_X_PITCH); + } + +#if defined(DYNAMIC_KERNEL_DIVIDER) || 
defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) + const uint num_elementes = POOL_SIZE_X*POOL_SIZE_Y; +#endif +#endif + +#if defined AVG_POOLING + #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) + for(uint i = 0; i < 4; i++) + { + result[i] = convert_int(round((float)result[i] / max(num_elementes, (uint)1))); + } + #else + for(uint i = 0; i < 4; i++) + { + result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X))); + } + #endif +#endif + + char4 char_res; + for(uint op = 0; op < 4; op++) + { + char_res[op] = ACTIVATION(convert_char(result[op]), NL_M ,NL_N); + } + const uint output_pos = GET_DATA_B_FS_YX_FSV4_INDEX(OUTPUT, b, f, y, x); + *((__global int*)(output + output_pos)) = as_int(char_res); +} + +#undef INIT_VAL diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl index 130cd8c..c23652a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl @@ -43,8 +43,8 @@ KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32)( const uint bf = (uint)get_global_id(2); // we process 4 features per workitem that's why we need to divide it const uint aligned32_features = ((INPUT0_FEATURE_NUM + 31) / 32) * 32; - const uint f = 4 * (bf % (aligned32_features / 4)); - const uint b_block = bf / (aligned32_features / 4); + const uint f = (get_global_id(2) * 4) % aligned32_features; + const uint b = 4 * ((get_global_id(2) * 4) / aligned32_features); if (x >= OUTPUT_SIZE_X) { @@ -54,11 +54,7 @@ KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32)( const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X; const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y; - int4 result[4]; - for(uint b = 0; b < 4; b++) - { - result[b] = INIT_VAL; - } + int4 result[4] = { INIT_VAL }; #ifdef CHECK_BOUNDRY if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X || @@ -71,7 +67,7 @@ KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32)( uint num_elementes = 0; #endif - const uint batch_and_feature_offset = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b_block * 4, f, 0, 0); + const uint batch_and_feature_offset = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b, f, 0, 0); for(uint j = 0; j < POOL_SIZE_Y; j++) { int input_offset_y = offset_y + j; @@ -110,7 +106,7 @@ KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32)( const uint num_elementes = (hend - offset_y) * (wend - offset_x); #endif #else - uint input_idx = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b_block * 4, f, offset_y, offset_x); + uint input_idx = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b, f, offset_y, offset_x); for(uint j = 0; j < POOL_SIZE_Y; j++) { @@ -156,14 +152,18 @@ KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32)( #endif #endif -for(uint b = 0; b < 4; b++) -{ - for(uint op = 0; op < 4; op++) + int4 char_result; + for(uint b = 0; b < 4; b++) { - const uint output_pos = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4 + b, f+op, y, x); - output[output_pos] = ACTIVATION(convert_char(result[b][op]), NL_M ,NL_N); + char4 char_res = as_char4(char_result[b]); + for(uint op = 0; op < 4; op++) + { + char_res[op] = ACTIVATION(convert_char(result[b][op]), NL_M ,NL_N); + } + char_result[b] = as_int(char_res); } -} + const uint output_pos = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x); + intel_sub_group_block_write4((__global 
uint*)(output + output_pos), as_uint4(char_result)); } #undef INIT_VAL \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pyramid_roi_align_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pyramid_roi_align_gpu_ref.cl new file mode 100644 index 0000000..f1b7664 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pyramid_roi_align_gpu_ref.cl @@ -0,0 +1,159 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include/include_all.cl" + +#define META_OFFSET_X 4 +#define META_OFFSET_Y 5 + +#define SIZE_TAB_PARAMETERS 4 + +struct Parameters +{ + int h_source, w_source, f_Size, x_Size, y_Size, offset; +}; + +__constant struct Parameters parameters [SIZE_TAB_PARAMETERS] = + { + { INPUT2_SIZE_Y, INPUT2_SIZE_X, INPUT2_FEATURE_PITCH, INPUT2_X_PITCH, INPUT2_Y_PITCH, INPUT2_OFFSET }, + { INPUT3_SIZE_Y, INPUT3_SIZE_X, INPUT3_FEATURE_PITCH, INPUT3_X_PITCH, INPUT3_Y_PITCH, INPUT3_OFFSET }, + { INPUT4_SIZE_Y, INPUT4_SIZE_X, INPUT4_FEATURE_PITCH, INPUT4_X_PITCH, INPUT4_Y_PITCH, INPUT4_OFFSET }, + { INPUT5_SIZE_Y, INPUT5_SIZE_X, INPUT5_FEATURE_PITCH, INPUT5_X_PITCH, INPUT5_Y_PITCH, INPUT5_OFFSET } + }; + + +KERNEL(pyramidROIAlign_gpu_ref)( + const __global INPUT0_TYPE *boxes, + const __global INPUT1_TYPE *image_meta, + const __global INPUT2_TYPE *P2, + const __global INPUT3_TYPE *P3, + const __global INPUT4_TYPE *P4, + const __global INPUT5_TYPE *P5, + const __global INPUT6_TYPE *dim, + __global OUTPUT_TYPE *output) +{ + // [CONSTEXPR]: + const uint kerNum = (uint) get_global_id(0); + + const __global float *feature_map_Ptr[SIZE_TAB_PARAMETERS]; + int f_Size; + + INPUT1_TYPE img_dim_X = image_meta[GET_DATA_INDEX(INPUT1, 0, 0, 0, META_OFFSET_X)]; + INPUT1_TYPE img_dim_Y = image_meta[GET_DATA_INDEX(INPUT1, 0, 0, 0, META_OFFSET_Y)]; + + INPUT1_TYPE image_area = img_dim_X * img_dim_Y; + INPUT1_TYPE scale = sqrt(image_area) / 224.0; + + INPUT0_TYPE hU = boxes[GET_DATA_INDEX(INPUT0, 0, 0, kerNum, 2)]; + INPUT0_TYPE hL = boxes[GET_DATA_INDEX(INPUT0, 0, 0, kerNum, 0)]; + INPUT0_TYPE h = hU - hL; + INPUT0_TYPE wU = boxes[GET_DATA_INDEX(INPUT0, 0, 0, kerNum, 3)]; + INPUT0_TYPE wL = boxes[GET_DATA_INDEX(INPUT0, 0, 0, kerNum, 1)]; + INPUT0_TYPE w = wU - wL; + + int roi_level = (int)round(log2(sqrt(h*w) * scale)); + + // 0 <= roi_level <= 3 + roi_level = min(3, max(0, 2 + roi_level)); + + feature_map_Ptr[0] = P2; + feature_map_Ptr[1] = P3; + feature_map_Ptr[2] = P4; + feature_map_Ptr[3] = P5; + + f_Size = parameters[roi_level].f_Size; + + //calculate coefficients for transformation + INPUT0_TYPE y1 = hL * (parameters[roi_level].h_source - 1); + INPUT0_TYPE x1 = wL * (parameters[roi_level].w_source - 1); + INPUT0_TYPE y2 = hU * (parameters[roi_level].h_source - 1); + INPUT0_TYPE x2 = wU * (parameters[roi_level].w_source - 1); + INPUT0_TYPE deltaX = (x2 - x1) / (OUTPUT_SIZE_X - 1); + INPUT0_TYPE deltaY = (y2 - y1) / (OUTPUT_SIZE_Y - 1); + 
INPUT0_TYPE y = y1; + + //transformation + for (int i = 0; i < OUTPUT_SIZE_Y; ++i) //loop over the 'y' coordinate + { + int ya = (int)floor(y); + int yb = (int)ceil(y); + + if (ya < 0) ya = 0; + if (yb >= parameters[roi_level].h_source) yb = parameters[roi_level].h_source - 1; + if (yb - ya == 0) + { + if (yb + 2 < parameters[roi_level].h_source) ++yb; + else --ya; + } + + INPUT0_TYPE x = x1; + + for (int j = 0; j < OUTPUT_SIZE_X; ++j) //loop over the 'x' coordinate + { + int xa = (int)floor(x); + int xb = (int)ceil(x); + if (xa < 0) xa = 0; + if (xb >= parameters[roi_level].w_source) xb = parameters[roi_level].w_source - 1; + if (xb - xa == 0) + { + if (xb + 2 < parameters[roi_level].w_source) ++xb; + else --xa; + } + + /* BILINEAR TRANSFORMATION + (xa,yb,f3)*---------------------------------*(xb,yb,f2) + | | + | *(x,y) | + | | + (xa,ya,f0)*---------------------------------*(xb,ya,f1) + */ + //coefficients for bilinear transformation + INPUT0_TYPE a = yb - y; + INPUT0_TYPE b = y - ya; + INPUT0_TYPE c = xb - x; + INPUT0_TYPE d = x - xa; + + /*#define GET_DATA_INDEX(prefix, b, f, y, x) \ + CAT(prefix, _OFFSET) + \ + (x)*CAT(prefix, _X_PITCH) + \ + (y)*CAT(prefix, _Y_PITCH) + \ + (f)*CAT(prefix, _FEATURE_PITCH) + \ + (b)*CAT(prefix, _BATCH_PITCH) + + For P2, P3, P4, P5 batch size is always 0 */ + + size_t f0Ind = parameters[roi_level].offset + parameters[roi_level].y_Size * ya + parameters[roi_level].x_Size * xa; + size_t f1Ind = parameters[roi_level].offset + parameters[roi_level].y_Size * ya + parameters[roi_level].x_Size * xb; + size_t f2Ind = parameters[roi_level].offset + parameters[roi_level].y_Size * yb + parameters[roi_level].x_Size * xb; + size_t f3Ind = parameters[roi_level].offset + parameters[roi_level].y_Size * yb + parameters[roi_level].x_Size * xa; + size_t ind_out = OUTPUT_OFFSET + i * OUTPUT_Y_PITCH + j * OUTPUT_X_PITCH + kerNum * OUTPUT_BATCH_PITCH; + + for (int k = 0; k < OUTPUT_FEATURE_NUM; ++k) //transformation for every feature + { + INPUT0_TYPE f0 = feature_map_Ptr[roi_level][k * f_Size + f0Ind]; + INPUT0_TYPE f1 = feature_map_Ptr[roi_level][k * f_Size + f1Ind]; + INPUT0_TYPE f2 = feature_map_Ptr[roi_level][k * f_Size + f2Ind]; + INPUT0_TYPE f3 = feature_map_Ptr[roi_level][k * f_Size + f3Ind]; + + INPUT0_TYPE f03 = f3 * b + f0 * a; + INPUT0_TYPE f12 = f2 * b + f1 * a; + INPUT0_TYPE f = f03 * c + f12 * d; + + output[k * OUTPUT_FEATURE_PITCH + ind_out] = f; + } + x += deltaX; + } + y += deltaY; + } +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data.cl index 591a07c..04a7955 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data.cl @@ -30,8 +30,12 @@ inline uint FUNC(get_input_index)(uint b, uint f, uint y, uint x) return GET_DATA_BF8_XY16_INDEX(INPUT0, b, f, y, x); #elif defined INPUT0_LAYOUT_BYXF_AF32 return GET_DATA_BYXF_AF32_INDEX(INPUT0, b, f, y, x); +#elif defined INPUT0_LAYOUT_BYX8_F4 + return GET_DATA_BYX8_F4_INDEX(INPUT0, b, f, y, x); #elif defined INPUT0_LAYOUT_FS_BS_YX_BSV4_FSV32 return GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b, f, y, x); +#elif defined INPUT0_LAYOUT_B_FS_YX_FSV4 + return GET_DATA_B_FS_YX_FSV4_INDEX(INPUT0, b, f, y, x); #else #error reorder_data.cl: input format - not supported #endif @@ -50,8 +54,12 @@ inline uint FUNC(get_output_index)(uint b, uint f, uint y, uint x) return 
GET_DATA_BF8_XY16_INDEX(OUTPUT, b, f, y, x); #elif defined OUTPUT_LAYOUT_BYXF_AF32 return GET_DATA_BYXF_AF32_INDEX(OUTPUT, b, f, y, x); +#elif defined OUTPUT_LAYOUT_BYX8_F4 + return GET_DATA_BYX8_F4_INDEX(OUTPUT, b, f, y, x); #elif defined OUTPUT_LAYOUT_FS_BS_YX_BSV4_FSV32 return GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x); +#elif defined OUTPUT_LAYOUT_B_FS_YX_FSV4 + return GET_DATA_B_FS_YX_FSV4_INDEX(OUTPUT, b, f, y, x); #else #error reorder_data.cl: output format - not supported #endif diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_byxf_f32_to_byx8_f4_i8.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_byxf_f32_to_byx8_f4_i8.cl new file mode 100644 index 0000000..0efd2cc --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_byxf_f32_to_byx8_f4_i8.cl @@ -0,0 +1,136 @@ +// Copyright (c) 2016-2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "include/reshape_dims.cl" +#include "include/fetch.cl" + +#include "include/data_types.cl" + +///////////////////////// Input Index ///////////////////////// +inline uint FUNC(get_input_index)(uint b, uint f, uint y, uint x) +{ +#if INPUT0_SIMPLE + return GET_DATA_INDEX(INPUT0, b, f, y, x); +#elif defined INPUT0_LAYOUT_BS_F_BSV8__AF8 || \ + defined INPUT0_LAYOUT_BS_F_BSV16__AF8 + return GET_DATA_BS_FYX_BSV8_INDEX(INPUT0, b, f, y, x, SUB_GROUP_SIZE); +#elif defined INPUT0_LAYOUT_BF8_XY16 + return GET_DATA_BF8_XY16_INDEX(INPUT0, b, f, y, x); +#elif defined INPUT0_LAYOUT_BYXF_AF32 + return GET_DATA_BYXF_AF32_INDEX(INPUT0, b, f, y, x); +#elif defined INPUT0_LAYOUT_BYX8_F4 + return GET_DATA_BYX8_F4_INDEX(INPUT0, b, f, y, x); +#elif defined INPUT0_LAYOUT_FS_BS_YX_BSV4_FSV32 + return GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b, f, y, x); +#elif defined INPUT0_LAYOUT_B_FS_YX_FSV4 + return GET_DATA_B_FS_YX_FSV4_INDEX(INPUT0, b, f, y, x); +#else +#error reorder_data.cl: input format - not supported +#endif +} + +///////////////////////// Output Index ///////////////////////// + +inline uint FUNC(get_output_index)(uint b, uint f, uint y, uint x) +{ +#if OUTPUT_SIMPLE + return GET_DATA_INDEX(OUTPUT, b, f, y, x); +#elif defined OUTPUT_LAYOUT_BS_F_BSV8__AF8 || \ + defined OUTPUT_LAYOUT_BS_F_BSV16__AF8 + return GET_DATA_BS_FYX_BSV8_INDEX(OUTPUT, b, f, y, x, SUB_GROUP_SIZE); +#elif defined OUTPUT_LAYOUT_BF8_XY16 + return GET_DATA_BF8_XY16_INDEX(OUTPUT, b, f, y, x); +#elif defined OUTPUT_LAYOUT_BYXF_AF32 + return GET_DATA_BYXF_AF32_INDEX(OUTPUT, b, f, y, x); +#elif defined OUTPUT_LAYOUT_BYX8_F4 + return GET_DATA_BYX8_F4_INDEX(OUTPUT, b, f, y, x); +#elif defined OUTPUT_LAYOUT_FS_BS_YX_BSV4_FSV32 + return GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x); +#elif defined OUTPUT_LAYOUT_B_FS_YX_FSV4 + return GET_DATA_B_FS_YX_FSV4_INDEX(OUTPUT, b, f, y, x); +#else +#error reorder_data.cl: output format - not supported +#endif +} + +__attribute__((intel_reqd_sub_group_size(16))) +KERNEL 
(reorder_data_byxf_f32_to_byx8_f4_i8)( + const __global INPUT_REORDER_TYPE* input, + __global OUTPUT_REORDER_TYPE* output +#ifdef MEAN_SUBTRACT_IN_BUFFER + , __global MEAN_SUBTRACT_TYPE* mean_subtract +#endif + ) +{ + const uint x = get_global_id(0); + const uint y = get_group_id(1); + const uint b = get_group_id(2); + + const uint input_idx = FUNC_CALL(get_input_index)(b, 0, y, x); + const uint output_idx = FUNC_CALL(get_output_index)(b, 0, y, x); + +#if defined MEAN_SUBTRACT_INSIDE_PARAMS + float4 res; + res.s0 = TO_MEAN_TYPE(input[input_idx]); + res.s0 = MEAN_OP(res.s0, VALUE_TO_SUBTRACT[0 % VALUE_TO_SUBTRACT_SIZE]); + res.s1 = TO_MEAN_TYPE(input[input_idx+1]); + res.s1 = MEAN_OP(res.s1, VALUE_TO_SUBTRACT[1 % VALUE_TO_SUBTRACT_SIZE]); + res.s2 = TO_MEAN_TYPE(input[input_idx+2]); + res.s2 = MEAN_OP(res.s2, VALUE_TO_SUBTRACT[2 % VALUE_TO_SUBTRACT_SIZE]); + res.s3 = 0; +#elif defined MEAN_SUBTRACT_IN_BUFFER +#if defined MEAN_PER_FEATURE + MAKE_VECTOR_TYPE(MEAN_SUBTRACT_TYPE, 4) res; + res.s0 = TO_MEAN_TYPE(input[input_idx]); + res.s0 = MEAN_OP(res.s0, mean_subtract[0]); + res.s1 = TO_MEAN_TYPE(input[input_idx+1]); + res.s1 = MEAN_OP(res.s1, mean_subtract[1]); + res.s2 = TO_MEAN_TYPE(input[input_idx+2]); + res.s2 = MEAN_OP(res.s2, mean_subtract[2]); + res.s3 = 0; +#else + MAKE_VECTOR_TYPE(MEAN_SUBTRACT_TYPE, 4) res; + res.s0 = TO_MEAN_TYPE(input[input_idx]); + res.s1 = TO_MEAN_TYPE(input[input_idx+1]); + res.s2 = TO_MEAN_TYPE(input[input_idx+2]); + res.s3 = 0; + + uint4 msv; + msv = FUNC_CALL(reshape_dims)(b,0,y,x, INPUT0_SIZE_Y, INPUT0_SIZE_X, MEAN_SUBTRACT_SIZE_Y, MEAN_SUBTRACT_SIZE_X, INPUT0_DIMS, MEAN_SUBTRACT_DIMS); + res.s0 = MEAN_OP(res.s0, mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv[0], msv[1], msv[2], msv[3])]); + + msv = FUNC_CALL(reshape_dims)(b,1,y,x, INPUT0_SIZE_Y, INPUT0_SIZE_X, MEAN_SUBTRACT_SIZE_Y, MEAN_SUBTRACT_SIZE_X, INPUT0_DIMS, MEAN_SUBTRACT_DIMS); + res.s1 = MEAN_OP(res.s1, mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv[0], msv[1], msv[2], msv[3])]); + + msv = FUNC_CALL(reshape_dims)(b,2,y,x, INPUT0_SIZE_Y, INPUT0_SIZE_X, MEAN_SUBTRACT_SIZE_Y, MEAN_SUBTRACT_SIZE_X, INPUT0_DIMS, MEAN_SUBTRACT_DIMS); + res.s2 = MEAN_OP(res.s2, mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv[0], msv[1], msv[2], msv[3])]); +#endif +#else + MAKE_VECTOR_TYPE(CALC_TYPE, 4) res; + res.s0 = TO_CALC_TYPE(input[input_idx]); + res.s1 = TO_CALC_TYPE(input[input_idx+1]); + res.s2 = TO_CALC_TYPE(input[input_idx+2]); + res.s3 = 0; +#endif + + char4 out_vals; + out_vals.s0 = ACTIVATION(TO_OUTPUT_REORDER_TYPE(res.s0), NL_M ,NL_N); + out_vals.s1 = ACTIVATION(TO_OUTPUT_REORDER_TYPE(res.s1), NL_M ,NL_N); + out_vals.s2 = ACTIVATION(TO_OUTPUT_REORDER_TYPE(res.s2), NL_M ,NL_N); + out_vals.s3 = 0; + + __global uint* dst = (__global uint*)output; + dst[output_idx/4] = as_uint(out_vals); +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_weights.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_weights.cl index 33a662a..7caa43d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_weights.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_weights.cl @@ -13,7 +13,9 @@ // limitations under the License. 
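The reorder kernel just above finishes with a packed store: three mean-subtracted, activated int8 channel values plus a zero pad lane are written as a single 32-bit word (dst[output_idx/4] = as_uint(out_vals)). A minimal host-side C++ sketch of that packing step, for illustration only — pack_byx8_f4_word is a hypothetical helper, not part of this patch:

#include <cstdint>
#include <cstring>

// Pack three int8 channel values plus a zero pad lane into one 32-bit
// word, mirroring the kernel's as_uint(out_vals) bit reinterpretation.
static uint32_t pack_byx8_f4_word(int8_t c0, int8_t c1, int8_t c2)
{
    const int8_t lanes[4] = { c0, c1, c2, 0 };  // fourth feature lane is zero padding
    uint32_t word;
    std::memcpy(&word, lanes, sizeof(word));    // bit-exact reinterpret, like as_uint()
    return word;
}

Writing one uint per pixel instead of four separate char stores is what makes the packed four-features-per-slot output layout cheap to produce.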
-#include "include/include_all.cl" +#include "include/fetch.cl" +#include "include/reshape_dims.cl" +#include "include/data_types.cl" ///////////////////////// Input Index ///////////////////////// @@ -26,6 +28,10 @@ inline uint FUNC(get_input_index)(uint o, uint i, uint y, uint x) defined INPUT0_LAYOUT_OS_I_OSV8__AI8 || \ defined INPUT0_LAYOUT_OS_I_OSV16__AI8 return GET_FILTER_OS_IYX_OSV8_INDEX(INPUT0, o, i, y, x, SUB_GROUP_SIZE); +#elif defined INPUT0_LAYOUT_IYX_OSV32 + return GET_FILTER_OS_IYX_OSV8_INDEX(INPUT0, o, i, y, x, 32); +#elif defined INPUT0_LAYOUT_IYX_OSV64 + return GET_FILTER_OS_IYX_OSV8_INDEX(INPUT0, o, i, y, x, 64); #elif defined INPUT0_LAYOUT_OS_IYX_OSV16_ROTATE_180 return GET_FILTER_OS_IYX_OSV8_ROTATE_180_INDEX(INPUT0, o, i, y, x, SUB_GROUP_SIZE); #elif defined INPUT0_LAYOUT_I_YXS_OS_YXSV2_OSV16 @@ -38,6 +44,10 @@ inline uint FUNC(get_input_index)(uint o, uint i, uint y, uint x) return GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4(INPUT0, o, i, y, x); #elif defined INPUT0_LAYOUT_IS_O_YX_ISV32 return GET_FILTER_IS_O_YX_ISV32(INPUT0, o, i, y, x); +#elif defined INPUT0_LAYOUT_IS_O32_YX_ISV32_SWIZZLED_BY_4 + return GET_FILTER_IS_O32_YX_ISV32_SWIZZLED_BY_4(INPUT0, o, i, y, x); +#elif defined INPUT0_LAYOUT_OS_IS_Y_X8_OSV8_ISV4 + return GET_FILTER_OS_IS_Y_X8_OSV8_ISV4(INPUT0, o, i, y, x); #else #error reorder_weights.cl: input format - not supported #endif @@ -54,6 +64,10 @@ inline uint FUNC(get_output_index)(uint o, uint i, uint y, uint x) defined OUTPUT_LAYOUT_OS_I_OSV8__AI8 || \ defined OUTPUT_LAYOUT_OS_I_OSV16__AI8 return GET_FILTER_OS_IYX_OSV8_INDEX(OUTPUT, o, i, y, x, SUB_GROUP_SIZE); +#elif defined OUTPUT_LAYOUT_OS_IYX_OSV32 + return GET_FILTER_OS_IYX_OSV8_INDEX(OUTPUT, o, i, y, x, 32); +#elif defined OUTPUT_LAYOUT_OS_IYX_OSV64 + return GET_FILTER_OS_IYX_OSV8_INDEX(OUTPUT, o, i, y, x, 64); #elif defined OUTPUT_LAYOUT_OS_IYX_OSV16_ROTATE_180 return GET_FILTER_OS_IYX_OSV8_ROTATE_180_INDEX(OUTPUT, o, i, y, x, SUB_GROUP_SIZE); #elif defined OUTPUT_LAYOUT_I_YXS_OS_YXSV2_OSV16 @@ -66,6 +80,14 @@ inline uint FUNC(get_output_index)(uint o, uint i, uint y, uint x) return GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4(OUTPUT, o, i, y, x); #elif defined OUTPUT_LAYOUT_IS_O_YX_ISV32 return GET_FILTER_IS_O_YX_ISV32(OUTPUT, o, i, y, x); +#elif defined OUTPUT_LAYOUT_IS_O32_YX_ISV32_SWIZZLED_BY_4 + return GET_FILTER_IS_O32_YX_ISV32_SWIZZLED_BY_4(OUTPUT, o, i, y, x); +#elif defined OUTPUT_LAYOUT_OS_IS_Y_X8_OSV8_ISV4 + return GET_FILTER_OS_IS_Y_X8_OSV8_ISV4(OUTPUT, o, i, y, x); +#elif defined OUTPUT_LAYOUT_OS_IS_YX_OSV16_ISV4 + return GET_FILTER_OS_IS_YX_OSV16_ISV4_INDEX(OUTPUT, o, i, y, x); +#elif defined OUTPUT_LAYOUT_OS_IS_YX_ISA8_OSV8_ISV4_SWIZZLED_BY_4 + return GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_SWIZZLED_BY_4_INDEX(OUTPUT, o, i, y, x); #else #error reorder_weights.cl: output format - not supported #endif diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reverse_sequence_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reverse_sequence_ref.cl new file mode 100644 index 0000000..96235d6 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reverse_sequence_ref.cl @@ -0,0 +1,43 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "include/include_all.cl" + +KERNEL(reverse_sequence_ref)(const __global UNIT_TYPE* input, const __global float* seq_lengths, __global UNIT_TYPE* output) +{ + const uint batch = get_global_id(0); + const uint feature = get_global_id(1); + const uint y = get_global_id(2) / INPUT0_SIZE_X; + const uint x = get_global_id(2) % INPUT0_SIZE_X; + uint dimensions[] = { batch, feature, y, x }; + + const uint input_index = INPUT0_OFFSET + + batch * INPUT0_BATCH_PITCH + + feature * INPUT0_FEATURE_PITCH + + y * INPUT0_Y_PITCH + + x * INPUT0_X_PITCH; + + const uint length = seq_lengths[dimensions[BATCH_AXIS]]; + if (dimensions[SEQ_AXIS] < length) + dimensions[SEQ_AXIS] = length - dimensions[SEQ_AXIS] - 1; + + const uint output_index = OUTPUT_OFFSET + + dimensions[0] * OUTPUT_BATCH_PITCH + + dimensions[1] * OUTPUT_FEATURE_PITCH + + dimensions[2] * OUTPUT_Y_PITCH + + dimensions[3] * OUTPUT_X_PITCH; + + output[output_index] = input[input_index]; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/roi_pooling_ps_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/roi_pooling_ps_ref.cl new file mode 100644 index 0000000..194b06d --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/roi_pooling_ps_ref.cl @@ -0,0 +1,141 @@ +// Copyright (c) 2016-2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include/common.cl" +#include "include/data_types.cl" + +// Each RoI is described by 5 elements [batch_id xmin ymin xmax ymax] +#define ROI_NUM_ELEMENTS 5 + +#define COORD_T float +#define ACCUM_T float + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) +#define CLAMP(v,l,u) MAX((l),MIN((v),(u))) + +KERNEL(roi_pooling_ps_gpu)(const __global INPUT0_TYPE * src_data, + __global OUTPUT_TYPE * dst_data, + const __global INPUT1_TYPE * src_rois) +{ + const size_t i = get_global_id(0); + + const uint x = i % OUTPUT_SIZE_X; + const uint y = i / OUTPUT_SIZE_X % OUTPUT_SIZE_Y; + const uint c = i / OUTPUT_SIZE_X / OUTPUT_SIZE_Y % OUTPUT_FEATURE_NUM; + const uint r = i / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_FEATURE_NUM % OUTPUT_ROI_NUM; + + const __global INPUT1_TYPE* roi_ptr = &src_rois[INPUT1_BATCH_PITCH * r]; + const int src_batch_idx = (int)(roi_ptr[0]); + +#if BILINEAR_POOLING + + COORD_T roi_start_w = roi_ptr[1] * SPATIAL_SCALE; + COORD_T roi_start_h = roi_ptr[2] * SPATIAL_SCALE; + COORD_T roi_end_w = roi_ptr[3] * SPATIAL_SCALE; + COORD_T roi_end_h = roi_ptr[4] * SPATIAL_SCALE; + + COORD_T roi_height = (roi_end_h - roi_start_h); + COORD_T roi_width = (roi_end_w - roi_start_w); + + ACCUM_T res = 0.0f; + + for (int bin_y = 0; bin_y < SPATIAL_BINS_Y; bin_y++) + { + for (int bin_x = 0; bin_x < SPATIAL_BINS_X; bin_x++) + { + COORD_T box_xmin = roi_start_w + (bin_x + 0) * (roi_width / SPATIAL_BINS_X); + COORD_T box_xmax = roi_start_w + (bin_x + 1) * (roi_width / SPATIAL_BINS_X); + COORD_T box_ymin = roi_start_h + (bin_y + 0) * (roi_height / SPATIAL_BINS_Y); + COORD_T box_ymax = roi_start_h + (bin_y + 1) * (roi_height / SPATIAL_BINS_Y); + + const uint gc = c + (bin_y*SPATIAL_BINS_X + bin_x)*OUTPUT_FEATURE_NUM; + const __global INPUT0_TYPE* data = src_data + INPUT0_OFFSET + src_batch_idx*INPUT0_BATCH_PITCH + INPUT0_FEATURE_PITCH*gc; + COORD_T height_scale = POOLED_HEIGHT > 1 ? (box_ymax - box_ymin) * (INPUT0_SIZE_Y - 1) / (POOLED_HEIGHT - 1) + : 0.0f; + COORD_T width_scale = POOLED_WIDTH > 1 ? (box_xmax - box_xmin) * (INPUT0_SIZE_X - 1) / (POOLED_WIDTH - 1) + : 0.0f; + + float in_y = POOLED_HEIGHT > 1 ? (y * height_scale + box_ymin * (INPUT0_SIZE_Y - 1)) + : 0.5f * (box_ymin + box_ymax) * (INPUT0_SIZE_Y - 1); + float in_x = POOLED_WIDTH > 1 ? 
(x * width_scale + box_xmin * (INPUT0_SIZE_X - 1)) + : 0.5f * (box_xmin + box_xmax) * (INPUT0_SIZE_X - 1); + + if (!(in_y < 0 || in_y > (COORD_T)(INPUT0_SIZE_Y - 1) || + in_x < 0 || in_x > (COORD_T)(INPUT0_SIZE_X - 1) || src_batch_idx == -1)) + { + int top_y_index = (int)(floor(in_y)); + int bottom_y_index = (int)(min(ceil(in_y), (COORD_T)INPUT0_SIZE_Y - 1)); + int left_x_index = (int)(floor(in_x)); + int right_x_index = (int)(min(ceil(in_x), (COORD_T)INPUT0_SIZE_X - 1)); + + ACCUM_T top_left = (ACCUM_T)data[top_y_index*INPUT0_Y_PITCH + left_x_index*INPUT0_X_PITCH]; + ACCUM_T top_right = (ACCUM_T)data[top_y_index*INPUT0_Y_PITCH + right_x_index*INPUT0_X_PITCH]; + ACCUM_T bottom_left = (ACCUM_T)data[bottom_y_index*INPUT0_Y_PITCH + left_x_index*INPUT0_X_PITCH]; + ACCUM_T bottom_right = (ACCUM_T)data[bottom_y_index*INPUT0_Y_PITCH + right_x_index*INPUT0_X_PITCH]; + + ACCUM_T top = top_left + (top_right - top_left) * (in_x - left_x_index); + ACCUM_T bottom = bottom_left + (bottom_right - bottom_left) * (in_x - left_x_index); + + res += top + (bottom - top) * (in_y - top_y_index); + } + } + } + + res /= (SPATIAL_BINS_Y*SPATIAL_BINS_X); +#elif AVG_POOLING + const uint work_c = x + POOLED_WIDTH * (y + POOLED_HEIGHT * c); + const __global INPUT0_TYPE* data = src_data + INPUT0_OFFSET + src_batch_idx*INPUT0_BATCH_PITCH + INPUT0_FEATURE_PITCH*work_c; + + const COORD_T roi_x = (COORD_T)(round(roi_ptr[1]) + 0.f) * SPATIAL_SCALE; + const COORD_T roi_y = (COORD_T)(round(roi_ptr[2]) + 0.f) * SPATIAL_SCALE; + const COORD_T roi_x1 = (COORD_T)(round(roi_ptr[3]) + 1.f) * SPATIAL_SCALE; + const COORD_T roi_y1 = (COORD_T)(round(roi_ptr[4]) + 1.f) * SPATIAL_SCALE; + + // The final coordinate is within the ROI and malformed dimensions are treated as 1 + const COORD_T roi_w = max(roi_x1 - roi_x, .1f); + const COORD_T roi_h = max(roi_y1 - roi_y, .1f); + + const COORD_T dx_begin = (x + 0) * (COORD_T)(roi_w / POOLED_WIDTH); + const COORD_T dy_begin = (y + 0) * (COORD_T)(roi_h / POOLED_HEIGHT); + const COORD_T dx_after = (x + 1) * (COORD_T)(roi_w / POOLED_WIDTH); + const COORD_T dy_after = (y + 1) * (COORD_T)(roi_h / POOLED_HEIGHT); + + // clamp in case roi_x or roi_y were unreasonable + const int x_begin = CLAMP(floor(roi_x + dx_begin), 0, INPUT0_SIZE_X); + const int y_begin = CLAMP(floor(roi_y + dy_begin), 0, INPUT0_SIZE_Y); + const int x_after = CLAMP(ceil(roi_x + dx_after), 0, INPUT0_SIZE_X); + const int y_after = CLAMP(ceil(roi_y + dy_after), 0, INPUT0_SIZE_Y); + + ACCUM_T res = 0.0f; + + for (int yy = y_begin; yy < y_after; ++yy) + { + for (int xx = x_begin; xx < x_after; ++xx) + { + INPUT0_TYPE val = data[xx*INPUT0_X_PITCH + yy*INPUT0_Y_PITCH]; + res += (ACCUM_T)val; + } + } + + const COORD_T area = (y_after - y_begin) * (x_after - x_begin); + if (area) + res /= area; + +#else +#error "Unsupported pooling mode" +#endif + const uint output_offset = OUTPUT_OFFSET + x*OUTPUT_X_PITCH + y*OUTPUT_Y_PITCH + c*OUTPUT_FEATURE_PITCH + r*OUTPUT_ROI_PITCH; + dst_data[output_offset] = ACTIVATION((OUTPUT_TYPE)(res), NL_M, NL_N); +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/roi_pooling_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/roi_pooling_ref.cl index 0c006bc..2006d57 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/roi_pooling_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/roi_pooling_ref.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2016-2018 Intel Corporation +// Copyright (c) 2016-2019 Intel Corporation 
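Both ROI-pooling kernels in this patch reduce each sampling point with the same corner blend: the two top corners and the two bottom corners are first interpolated along x, and those results are then interpolated along y. A scalar C++ sketch of that blend, assuming a dense row-major plane with unit pitches and an in-range (y, x) — bilinear_at is a hypothetical helper, not part of this patch:

#include <algorithm>
#include <cmath>

// Sample an H x W row-major plane at fractional (y, x), clamping the
// upper corner indices to the plane edge as the kernels do.
static float bilinear_at(const float* plane, int H, int W, float y, float x)
{
    const int y0 = static_cast<int>(std::floor(y));
    const int y1 = std::min(static_cast<int>(std::ceil(y)), H - 1);
    const int x0 = static_cast<int>(std::floor(x));
    const int x1 = std::min(static_cast<int>(std::ceil(x)), W - 1);

    const float tl = plane[y0 * W + x0], tr = plane[y0 * W + x1];
    const float bl = plane[y1 * W + x0], br = plane[y1 * W + x1];

    const float top    = tl + (tr - tl) * (x - x0);  // blend along x (top edge)
    const float bottom = bl + (br - bl) * (x - x0);  // blend along x (bottom edge)
    return top + (bottom - top) * (y - y0);          // blend along y
}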
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -32,11 +32,7 @@ #define DST_H POOLED_HEIGHT #define PITCH_ROI_R INPUT1_BATCH_PITCH -#if GROUP_SIZE == 0 #define DST_C INPUT0_FEATURE_NUM -#else -#define DST_C (GROUP_SIZE ? (INPUT0_FEATURE_NUM / GROUP_SIZE / GROUP_SIZE) : INPUT0_FEATURE_NUM) -#endif // Note: In the non-ROI_OLD case we keep the coordinates in float instead // of using UNIT_TYPE, since with FP16 we might actually lose some @@ -52,12 +48,6 @@ #error - unknown ROI_POOLING kernel type #endif -/**************************************************************************** - * * - * RoI Pooling * - * * - ***************************************************************************/ - KERNEL(roi_pooling_gpu) ( const __global INPUT0_TYPE * src_data, @@ -76,7 +66,9 @@ KERNEL(roi_pooling_gpu) // with SPATIAL_SCALE: It makes sense since the resolution of // the pooled data is limited by its dimensions. (Is this clear?) - const __global INPUT1_TYPE * roi_ptr = &src_rois[PITCH_ROI_R * r]; + const __global INPUT1_TYPE* roi_ptr = &src_rois[PITCH_ROI_R * r]; + + const int src_batch_idx = (int)(roi_ptr[0]); #if BILINEAR_POOLING const uint output_offset = OUTPUT_OFFSET + x*OUTPUT_X_PITCH + y*OUTPUT_Y_PITCH + c*OUTPUT_FEATURE_PITCH + r*OUTPUT_ROI_PITCH; @@ -86,13 +78,13 @@ KERNEL(roi_pooling_gpu) COORD_T roi_end_w = roi_ptr[3]; COORD_T roi_end_h = roi_ptr[4]; - COORD_T height_scale = (roi_end_h - roi_start_h) * (SRC_H - 1) / (COORD_T)(POOLED_HEIGHT - 1); - COORD_T width_scale = (roi_end_w - roi_start_w) * (SRC_W - 1) / (COORD_T)(POOLED_WIDTH - 1); + COORD_T height_scale = (roi_end_h - roi_start_h) * (SRC_H - 1.0f) / (COORD_T)(POOLED_HEIGHT - 1.0f); + COORD_T width_scale = (roi_end_w - roi_start_w) * (SRC_W - 1.0f) / (COORD_T)(POOLED_WIDTH - 1.0f); - COORD_T in_y = y*height_scale + roi_start_h*(COORD_T)(SRC_H - 1); - COORD_T in_x = x*width_scale + roi_start_w*(COORD_T)(SRC_W - 1); + COORD_T in_y = y*height_scale + roi_start_h*(COORD_T)(SRC_H - 1.0f); + COORD_T in_x = x*width_scale + roi_start_w*(COORD_T)(SRC_W - 1.0f); - if (in_y < 0 || in_y > (COORD_T)(SRC_H - 1) || in_x < 0 || in_x > (COORD_T)(SRC_W - 1) || roi_ptr[0] == -1) { + if (in_y < 0 || in_y > (COORD_T)(SRC_H - 1) || in_x < 0 || in_x > (COORD_T)(SRC_W - 1) || src_batch_idx == -1) { dst_data[output_offset] = ACTIVATION((OUTPUT_TYPE)0, NL_M, NL_N); return; } @@ -102,7 +94,7 @@ KERNEL(roi_pooling_gpu) int left_x_index = (int)(floor(in_x)); int right_x_index = (int)(min(ceil(in_x), (COORD_T)SRC_W - 1)); - const __global INPUT0_TYPE* data = src_data + INPUT0_OFFSET + INPUT0_FEATURE_PITCH*c; + const __global INPUT0_TYPE* data = src_data + INPUT0_OFFSET + src_batch_idx*INPUT0_BATCH_PITCH + INPUT0_FEATURE_PITCH*c; ACCUM_T top_left = (ACCUM_T)data[top_y_index*INPUT0_Y_PITCH + left_x_index*INPUT0_X_PITCH]; ACCUM_T top_right = (ACCUM_T)data[top_y_index*INPUT0_Y_PITCH + right_x_index*INPUT0_X_PITCH]; @@ -117,7 +109,6 @@ KERNEL(roi_pooling_gpu) dst_data[output_offset] = ACTIVATION((OUTPUT_TYPE)res, NL_M, NL_N); #else -#if USE_OLD_SCALE_AND_ROUNDING const int roi_x = round(roi_ptr[1] * SPATIAL_SCALE); const int roi_y = round(roi_ptr[2] * SPATIAL_SCALE); const int roi_x1 = round(roi_ptr[3] * SPATIAL_SCALE); @@ -126,16 +117,6 @@ KERNEL(roi_pooling_gpu) // The final coordinate is within the ROI and malformed dimensions are treated as 1 const uint roi_w = max(roi_x1 - roi_x, 0) + 1; const uint roi_h = max(roi_y1 - roi_y, 0) + 1; -#else - const COORD_T roi_x = 
(COORD_T)(round(roi_ptr[1]) + 0.f) * SPATIAL_SCALE; - const COORD_T roi_y = (COORD_T)(round(roi_ptr[2]) + 0.f) * SPATIAL_SCALE; - const COORD_T roi_x1 = (COORD_T)(round(roi_ptr[3]) + 1.f) * SPATIAL_SCALE; - const COORD_T roi_y1 = (COORD_T)(round(roi_ptr[4]) + 1.f) * SPATIAL_SCALE; - - // The final coordinate is within the ROI and malformed dimensions are treated as 1 - const COORD_T roi_w = max(roi_x1 - roi_x, .1f); - const COORD_T roi_h = max(roi_y1 - roi_y, .1f); -#endif // Note that when the "after" is rounded rounded up else we get the last cell, // instead of the cell beyond (For "symmetry"). @@ -145,7 +126,6 @@ KERNEL(roi_pooling_gpu) // [0, 1, 3, 4] # as expected // >>> [((x + 1) * 6) // 4 for x in [0, 1, 2, 3]] # "after" values // [1, 3, 4 ,6] # [2, 3, 5, 6] expected! -#if USE_OLD_SCALE_AND_ROUNDING const int dx_begin = ((x + 0) * roi_w) / DST_W; const int dy_begin = ((y + 0) * roi_h) / DST_H; const int dx_after = ((x + 1) * roi_w + (DST_W - 1)) / DST_W; @@ -156,38 +136,8 @@ KERNEL(roi_pooling_gpu) const int y_begin = clamp(roi_y + dy_begin, 0, SRC_H); const int x_after = clamp(roi_x + dx_after, 0, SRC_W); const int y_after = clamp(roi_y + dy_after, 0, SRC_H); -#else - const COORD_T dx_begin = (x + 0) * (COORD_T)(roi_w / DST_W); - const COORD_T dy_begin = (y + 0) * (COORD_T)(roi_h / DST_H); - const COORD_T dx_after = (x + 1) * (COORD_T)(roi_w / DST_W); - const COORD_T dy_after = (y + 1) * (COORD_T)(roi_h / DST_H); - - // clamp in case roi_x or roi_y were unreasonable - const int x_begin = CLAMP(floor(roi_x + dx_begin), 0, SRC_W); - const int y_begin = CLAMP(floor(roi_y + dy_begin), 0, SRC_H); - const int x_after = CLAMP(ceil(roi_x + dx_after), 0, SRC_W); - const int y_after = CLAMP(ceil(roi_y + dy_after), 0, SRC_H); -#endif - -#if GROUP_SIZE == 0 - const uint work_c = c; -#else - -#if 0 - const COORD_T group_bin_w = (COORD_T)roi_w / DST_W; - const COORD_T group_bin_h = (COORD_T)roi_h / DST_H; - - const uint group_x = CLAMP(x * group_bin_w, 0, GROUP_SIZE - 1); - const uint group_y = CLAMP(y * group_bin_h, 0, GROUP_SIZE - 1); -#else - const uint group_x = x; - const uint group_y = y; -#endif - - const uint work_c = group_x + GROUP_SIZE * (group_y + GROUP_SIZE * c); -#endif - const __global INPUT0_TYPE* data = src_data + INPUT0_OFFSET + INPUT0_FEATURE_PITCH*work_c; + const __global INPUT0_TYPE* data = src_data + INPUT0_OFFSET + src_batch_idx*INPUT0_BATCH_PITCH + INPUT0_FEATURE_PITCH*c; #if MAX_POOLING ACCUM_T res = x_begin < x_after && y_begin < y_after ? -FLT_MAX : 0; @@ -208,7 +158,6 @@ KERNEL(roi_pooling_gpu) #if (!MAX_POOLING) { - //TODO(ruv): again, differs from the standard fixed size area (?) const COORD_T area = (y_after - y_begin) * (x_after - x_begin); if (area) res /= area; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/shuffle_channels_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/shuffle_channels_ref.cl new file mode 100644 index 0000000..77ba698 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/shuffle_channels_ref.cl @@ -0,0 +1,43 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "include/include_all.cl" + +KERNEL(shuffle_channels_ref)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output) +{ + const uint batch = get_global_id(0); + const uint feature = get_global_id(1); + const uint y = get_global_id(2) / OUTPUT_SIZE_X; + const uint x = get_global_id(2) % OUTPUT_SIZE_X; + const uint dimensions[] = { batch, feature, y, x }; + + const uint current_group = dimensions[AXIS] / GROUP_SIZE; + const uint position_in_group = dimensions[AXIS] % GROUP_SIZE; + const uint input_index = INPUT0_OFFSET + (batch * INPUT0_BATCH_PITCH) + (feature * INPUT0_FEATURE_PITCH) + (y * INPUT0_Y_PITCH) + x; + + uint output_index = OUTPUT_OFFSET; + + for (uint i = 0; i < AXIS; ++i) { + output_index += dimensions[i] * INPUT0_PITCHES[INPUT0_DIMS - i - 1]; + } + + output_index += (position_in_group * GROUPS_NUMBER + current_group) * INPUT0_PITCHES[INPUT0_DIMS - AXIS - 1]; + + for (uint i = AXIS + 1; i < INPUT0_DIMS; ++i) { + output_index += dimensions[i] * INPUT0_PITCHES[INPUT0_DIMS - i - 1]; + } + + output[output_index] = input[input_index]; +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/strided_slice_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/strided_slice_ref.cl new file mode 100644 index 0000000..1fec68a --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/strided_slice_ref.cl @@ -0,0 +1,50 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
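The shuffle_channels kernel above writes input channel c to output channel (c % GROUP_SIZE) * GROUPS_NUMBER + (c / GROUP_SIZE), which is a transpose of the GROUPS_NUMBER x GROUP_SIZE view of the shuffled axis. A standalone C++ sketch of just that index mapping — shuffle_channel is a hypothetical helper, not part of this patch:

// Channel-shuffle mapping: view the axis as a (groups x group_size)
// matrix and transpose it.
static int shuffle_channel(int c, int groups, int group_size)
{
    const int current_group     = c / group_size;  // row in the grouped view
    const int position_in_group = c % group_size;  // column in the grouped view
    return position_in_group * groups + current_group;
}
// Example: groups = 2, group_size = 3 sends input channels {0,1,2,3,4,5}
// to output channels {0,2,4,1,3,5}.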
+ + +#include "include/include_all.cl" + +KERNEL(strided_slice_ref)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output) +{ + const uint batch = get_global_id(0); + const uint feature = get_global_id(1); + +#if NEW_AXIS_MODE + // In NEW_AXIS_MODE, just copy the input to the output + const uint y_input = get_global_id(2) / INPUT0_SIZE_X; + const uint x_input = get_global_id(2) % INPUT0_SIZE_X; + const uint input_index = INPUT0_OFFSET + + batch * INPUT0_BATCH_PITCH + + feature * INPUT0_FEATURE_PITCH + + y_input * INPUT0_Y_PITCH + + x_input * INPUT0_X_PITCH; + output[input_index] = input[input_index]; +#else + const uint y = get_global_id(2) / OUTPUT_SIZE_X; + const uint x = get_global_id(2) % OUTPUT_SIZE_X; + const uint input_index = INPUT0_OFFSET + + (SLICE_BEGIN_BATCH + batch * SLICE_STEPS_BATCH) * INPUT0_BATCH_PITCH + + (SLICE_BEGIN_FEATURE + feature * SLICE_STEPS_FEATURE) * INPUT0_FEATURE_PITCH + + (SLICE_BEGIN_Y + y * SLICE_STEPS_Y) * INPUT0_Y_PITCH + + (SLICE_BEGIN_X + x * SLICE_STEPS_X) * INPUT0_X_PITCH; + + const uint output_index = OUTPUT_OFFSET + + batch * OUTPUT_BATCH_PITCH + + feature * OUTPUT_FEATURE_PITCH + + y * OUTPUT_Y_PITCH + + x * OUTPUT_X_PITCH; + + output[output_index] = input[input_index]; +#endif +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.cpp index 4a2344e..47ab153 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.cpp @@ -181,11 +181,11 @@ namespace kernel_selector { std::cout << "ERROR: dispatch data for kernel: " << kernelName << " is incorrect: GWS0: " << runInfo.gws0 << " LWS0: " << runInfo.lws0 << std::endl; } - if (runInfo.gws0 % runInfo.lws0 != 0) + if (runInfo.gws1 % runInfo.lws1 != 0) { std::cout << "ERROR: dispatch data for kernel: " << kernelName << " is incorrect: GWS1: " << runInfo.gws1 << " LWS1: " << runInfo.lws1 << std::endl; } - if (runInfo.gws0 % runInfo.lws0 != 0) + if (runInfo.gws2 % runInfo.lws2 != 0) { std::cout << "ERROR: dispatch data for kernel: " << kernelName << " is incorrect: GWS2: " << runInfo.gws2 << " LWS2: " << runInfo.lws2 << std::endl; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.h index 917b4e5..1bc50e8 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.h @@ -49,7 +49,7 @@ namespace kernel_selector std::string CreateJit(const std::string& template_name, const JitConstants& constants, const std::string& kernel_name) const; std::string GetEntryPoint(const std::string& templateName, const std::string& layerID, const optional_params& options) const; Arguments GetArgsDesc(uint32_t num_of_input, bool use_weights, bool use_bias, bool use_quantization = false, bool use_calibration = 0) const; - std::shared_ptr<KernelString> GetKernelString(const std::string& kernel_name, const std::string& jit, const std::string& entry_point, const EngineInfo& engine_info, const std::string& exe_mode = ROUND_ROBIN) const; - void FillCLKernelData(clKernelData& kernel, const CommonDispatchData& runInfo, const EngineInfo& engine_info, const std::string& kernel_map_name, const std::string& jit, const std::string& entry_point, const 
std::string& exe_mode = ROUND_ROBIN, + std::shared_ptr<KernelString> GetKernelString(const std::string& kernel_name, const std::string& jit, const std::string& entry_point, const EngineInfo& engine_info, const std::string& exe_mode = DEFAULT) const; + void FillCLKernelData(clKernelData& kernel, const CommonDispatchData& runInfo, const EngineInfo& engine_info, const std::string& kernel_map_name, const std::string& jit, const std::string& entry_point, const std::string& exe_mode = DEFAULT, bool weights = false, bool bias = false, int number_of_inputs = 1, bool quantization = false, bool calibration = false) const; }; } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp index 1a426a0..0f1cf4d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #include "jitter.h" +#include "tensor_type.h" namespace kernel_selector { @@ -23,6 +24,7 @@ namespace kernel_selector { switch (wType) { case WeightsType::INT8: return GetTypeName<char>(); + case WeightsType::UINT8: return GetTypeName<uint8_t>(); case WeightsType::F16: return "half"; case WeightsType::F32: return GetTypeName<float>(); default: return ""; @@ -58,6 +60,28 @@ namespace kernel_selector { } } + std::string toCodeString(float val) { + if (std::isinf(val)) + return std::signbit(val) ? "-INFINITY" : "INFINITY"; + std::stringstream ss; + // Workaround GCC compiler/STL bug + ss << "as_float(0x" << std::hex << *reinterpret_cast<uint32_t*>(&val) << ")"; + + ss << " /*" << std::scientific << val << "*/"; + return ss.str(); + } + + std::string toCodeString(double val) { + if (std::isinf(val)) + return std::signbit(val) ? 
"-INFINITY" : "INFINITY"; + std::stringstream ss; + // Workaround GCC compiler/STL bug + ss << "as_double(0x" << std::hex << *reinterpret_cast(&val) << ")"; + + ss << " /*" << std::scientific << val << "*/"; + return ss.str(); + } + JitDefinitions JitConstants::GetDefinitions() const { JitDefinitions definitons; @@ -70,6 +94,53 @@ namespace kernel_selector { return definitons; } + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // TensorBaseTJitConstant + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + template + class TensorBaseTJitConstant : public JitConstant + { + protected: + TensorBaseTJitConstant(const std::string& name) : JitConstant(name) {} + + public: + + JitDefinitions GetDefinitions(const Tensor::TensorBaseT& t) const + { + JitDefinitions definitions{ + { _name + "_TYPE", toCLType(t.GetDType()) }, + { _name + "_OFFSET", toCodeString(t.GetFirstElementOffset()) }, + { _name + "_VIEW_OFFSET", toCodeString(t.GetViewOffset()) }, + { _name + "_LENGTH", toCodeString(t.LogicalSize()) }, + { _name + "_DIMS", toCodeString(t.GetDims().size()) }, + { _name + "_SIMPLE", toCodeString(t.SimpleLayout()) }, + { "TO_" + _name + "_TYPE", "convert_" + toCLType(t.GetDType()) }, + { _name + "_LAYOUT_" + toString(t.GetLayout()), "1" }, + }; + + definitions.push_back({ _name + "_SIZE", toCodeString(t.GetDims().size()) }); + definitions.push_back({ _name + "_SIZES", toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 1, [](const Tensor::Dim& d) { return d.v; }) }); + definitions.push_back({ _name + "_PITCHES", toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 1, [](const Tensor::Dim& d) { return d.pitch; }) }); + definitions.push_back({ _name + "_PAD_BEFORE", toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 0, [](const Tensor::Dim& d) { return d.pad.before; }) }); + definitions.push_back({ _name + "_PAD_AFTER", toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 0, [](const Tensor::Dim& d) { return d.pad.after; }) }); + + return definitions; + } + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // DataTensorJitConstant + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + class DataTensorJitConstant : public TensorBaseTJitConstant + { + const DataTensor _tensor; + + public: + DataTensorJitConstant(const std::string& name, const DataTensor& t) : TensorBaseTJitConstant(name), _tensor(t) {} + + JitDefinitions GetDefinitions() const override; + }; + JitDefinitions DataTensorJitConstant::GetDefinitions() const { JitDefinitions baseDefinitions = TensorBaseTJitConstant::GetDefinitions(_tensor); @@ -100,19 +171,37 @@ namespace kernel_selector { return definitions; } + std::shared_ptr MakeJitConstant(const std::string& name, const DataTensor& value) + { + return std::static_pointer_cast(std::make_shared(name, value)); + } + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // WeightTensorJitConstant + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + class WeightTensorJitConstant : public TensorBaseTJitConstant + { + const WeightsTensor _tensor; + + public: + WeightTensorJitConstant(const 
std::string& name, const WeightsTensor& t) : TensorBaseTJitConstant(name), _tensor(t) {} + + JitDefinitions GetDefinitions() const override; + }; + JitDefinitions WeightTensorJitConstant::GetDefinitions() const { JitDefinitions baseDefinitions = TensorBaseTJitConstant::GetDefinitions(_tensor); JitDefinitions definitions{ - { _name + "_SIZE_X", toCodeString(_tensor.X().v) }, - { _name + "_SIZE_Y", toCodeString(_tensor.Y().v) }, - { _name + "_IFM_NUM", toCodeString(_tensor.IFM().v) }, - { _name + "_OFM_NUM", toCodeString(_tensor.OFM().v) }, - { _name + "_X_PITCH", toCodeString(_tensor.X().pitch) }, - { _name + "_Y_PITCH", toCodeString(_tensor.Y().pitch) }, - { _name + "_IFM_PITCH", toCodeString(_tensor.IFM().pitch) }, - { _name + "_OFM_PITCH", toCodeString(_tensor.OFM().pitch) }, + { _name + "_SIZE_X", toCodeString(_tensor.X().v) }, + { _name + "_SIZE_Y", toCodeString(_tensor.Y().v) }, + { _name + "_IFM_NUM", toCodeString(_tensor.IFM().v) }, + { _name + "_OFM_NUM", toCodeString(_tensor.OFM().v) }, + { _name + "_X_PITCH", toCodeString(_tensor.X().pitch) }, + { _name + "_Y_PITCH", toCodeString(_tensor.Y().pitch) }, + { _name + "_IFM_PITCH", toCodeString(_tensor.IFM().pitch) }, + { _name + "_OFM_PITCH", toCodeString(_tensor.OFM().pitch) }, }; definitions.insert(definitions.end(), baseDefinitions.begin(), baseDefinitions.end()); @@ -120,63 +209,71 @@ namespace kernel_selector { return definitions; } - std::shared_ptr<JitConstant> MakeActivationJitConstants(ActivationFunction activation_function) + std::shared_ptr<JitConstant> MakeJitConstant(const std::string& name, const WeightsTensor& value) { + return std::static_pointer_cast<JitConstant>(std::make_shared<WeightTensorJitConstant>(name, value)); + } + + std::shared_ptr<JitConstant> MakeActivationJitConstants(ActivationFunction activation_function, const std::string& suffix) + { + std::string name = "ACTIVATION" + suffix; // TODO: use native_exp and use cast for APL switch (activation_function) { case ActivationFunction::LOGISTIC: - return MakeJitConstant("ACTIVATION(input, m, n)", "(UNIT_VAL_ONE/(UNIT_VAL_ONE + exp(-input)))"); + return MakeJitConstant(name + "(input, m, n)", "(UNIT_VAL_ONE/(UNIT_VAL_ONE + exp(-input)))"); case ActivationFunction::HYPERBOLIC_TAN: - return MakeJitConstant("ACTIVATION(input, m, n)", "(tanh(input))"); + return MakeJitConstant(name + "(input, m, n)", "(tanh(input))"); case ActivationFunction::RELU: - return MakeJitConstant("ACTIVATION(input, m, n)", "(UNIT_MAX_FUNC(UNIT_VAL_ZERO, input))"); + return MakeJitConstant(name + "(input, m, n)", "(UNIT_MAX_FUNC(UNIT_VAL_ZERO, input))"); case ActivationFunction::RELU_NEGATIVE_SLOPE: - return MakeJitConstant("ACTIVATION(input, slope, n)", "isinf(TO_UNIT_TYPE(slope)) ? ((input >= UNIT_VAL_ZERO) ? \ + return MakeJitConstant(name + "(input, slope, n)", "isinf(TO_UNIT_TYPE(slope)) ? ((input >= UNIT_VAL_ZERO) ? 
\ input : -TO_UNIT_TYPE(slope)) : \ (UNIT_MAX_FUNC(input, UNIT_VAL_ZERO) + TO_UNIT_TYPE(slope) * UNIT_MIN_FUNC(input, UNIT_VAL_ZERO))"); case ActivationFunction::ELU: - return MakeJitConstant("ACTIVATION(input, alpha, n)", "(UNIT_MAX_FUNC(input, UNIT_VAL_ZERO) + \ + return MakeJitConstant(name + "(input, alpha, n)", "(UNIT_MAX_FUNC(input, UNIT_VAL_ZERO) + \ TO_UNIT_TYPE(alpha) * (exp(UNIT_MIN_FUNC(input, UNIT_VAL_ZERO)) - UNIT_VAL_ONE));"); case ActivationFunction::CLAMP: - return MakeJitConstant("ACTIVATION(input, m, n)", "(UNIT_MAX_FUNC(TO_UNIT_TYPE(m), UNIT_MIN_FUNC(TO_UNIT_TYPE(n), input)))"); + return MakeJitConstant(name + "(input, m, n)", "(UNIT_MAX_FUNC(TO_UNIT_TYPE(m), UNIT_MIN_FUNC(TO_UNIT_TYPE(n), input)))"); case ActivationFunction::SOFTRELU: - return MakeJitConstant("ACTIVATION(input, m, n)", "(log(UNIT_VAL_ONE + exp(input)))"); + return MakeJitConstant(name + "(input, m, n)", "(log(UNIT_VAL_ONE + exp(input)))"); case ActivationFunction::ABS: - return MakeJitConstant("ACTIVATION(input, m, n)", "(fabs(input))"); + return MakeJitConstant(name + "(input, m, n)", "(fabs(input))"); case ActivationFunction::LINEAR: - return MakeJitConstant("ACTIVATION(input, m, n)", "(m*input + n)"); + return MakeJitConstant(name + "(input, m, n)", "(m*input + n)"); case ActivationFunction::SQUARE: - return MakeJitConstant("ACTIVATION(input, m, n)", "(input*input)"); + return MakeJitConstant(name + "(input, m, n)", "(input*input)"); case ActivationFunction::SQRT: - return MakeJitConstant("ACTIVATION(input, m, n)", "(sqrt(input))"); + return MakeJitConstant(name + "(input, m, n)", "(sqrt(input))"); case ActivationFunction::SIN: - return MakeJitConstant("ACTIVATION(input, m, n)", "(sin(input))"); + return MakeJitConstant(name + "(input, m, n)", "(sin(input))"); case ActivationFunction::ASIN: - return MakeJitConstant("ACTIVATION(input, m, n)", "(asin(input))"); + return MakeJitConstant(name + "(input, m, n)", "(asin(input))"); case ActivationFunction::SINH: - return MakeJitConstant("ACTIVATION(input, m, n)", "(sinh(input))"); + return MakeJitConstant(name + "(input, m, n)", "(sinh(input))"); case ActivationFunction::COS: - return MakeJitConstant("ACTIVATION(input, m, n)", "(cos(input))"); + return MakeJitConstant(name + "(input, m, n)", "(cos(input))"); case ActivationFunction::ACOS: - return MakeJitConstant("ACTIVATION(input, m, n)", "(acos(input))"); + return MakeJitConstant(name + "(input, m, n)", "(acos(input))"); case ActivationFunction::COSH: - return MakeJitConstant("ACTIVATION(input, m, n)", "(cosh(input))"); + return MakeJitConstant(name + "(input, m, n)", "(cosh(input))"); case ActivationFunction::LOG: - return MakeJitConstant("ACTIVATION(input, m, n)", "(log(input))"); + return MakeJitConstant(name + "(input, m, n)", "(log(input))"); case ActivationFunction::LOG2: - return MakeJitConstant("ACTIVATION(input, m, n)", "(log2(input))"); + return MakeJitConstant(name + "(input, m, n)", "(log2(input))"); case ActivationFunction::EXP: - return MakeJitConstant("ACTIVATION(input, m, n)", "(exp(input))"); + return MakeJitConstant(name + "(input, m, n)", "(exp(input))"); + case ActivationFunction::NOT: + return MakeJitConstant(name + "(input, m, n)", "((input != 0) ? UNIT_VAL_ZERO : UNIT_VAL_ONE)"); case ActivationFunction::RELU_GRAD: - return MakeJitConstant("ACTIVATION(input_grad, input, m, n)", "(input_grad * (input > UNIT_VAL_ZERO ? TO_UNIT_TYPE(1) : TO_UNIT_TYPE(0)))"); + return MakeJitConstant(name + "(input_grad, input, m, n)", "(input_grad * (input > UNIT_VAL_ZERO ? 
TO_UNIT_TYPE(1) : TO_UNIT_TYPE(0)))"); case ActivationFunction::RELU_NEGATIVE_SLOPE_GRAD: - return MakeJitConstant("ACTIVATION(input_grad, input, slope, n)", "(input_grad * ((input > UNIT_VAL_ZERO ? TO_UNIT_TYPE(1) : TO_UNIT_TYPE(0)) + TO_UNIT_TYPE(slope) * (input <= 0 ? TO_UNIT_TYPE(1) : TO_UNIT_TYPE(0))))"); + return MakeJitConstant(name + "(input_grad, input, slope, n)", "(input_grad * ((input > UNIT_VAL_ZERO ? TO_UNIT_TYPE(1) : TO_UNIT_TYPE(0)) + TO_UNIT_TYPE(slope) * (input <= 0 ? TO_UNIT_TYPE(1) : TO_UNIT_TYPE(0))))"); case ActivationFunction::NONE_GRAD: - return MakeJitConstant("ACTIVATION(input_grad, input, m, n)", "input_grad"); + return MakeJitConstant(name + "(input_grad, input, m, n)", "input_grad"); case ActivationFunction::NONE: default: - return MakeJitConstant("ACTIVATION(input, m, n)", "input"); + return MakeJitConstant(name + "(input, m, n)", "input"); } } @@ -195,27 +292,47 @@ namespace kernel_selector { case Datatype::INT8: unit_type = "char"; unit_max_val = "CHAR_MAX"; - unit_min_val = "-UNIT_VAL_MAX"; + unit_min_val = "CHAR_MIN"; unit_val_one = "(char) 1"; unit_val_zero = "(char) 0"; to_unit_type = "convert_char(v)"; unit_max_func = "max"; unit_min_func = "min"; break; + case Datatype::UINT8: + unit_type = "uchar"; + unit_max_val = "UCHAR_MAX"; + unit_min_val = "0"; + unit_val_one = "(uchar) 1"; + unit_val_zero = "(uchar) 0"; + to_unit_type = "convert_uchar(v)"; + unit_max_func = "max"; + unit_min_func = "min"; + break; case Datatype::INT32: unit_type = "int"; unit_max_val = "INT_MAX"; - unit_min_val = "-UNIT_VAL_MAX"; + unit_min_val = "INT_MIN"; unit_val_one = "(int) 1"; unit_val_zero = "(int) 0"; to_unit_type = "convert_int(v)"; unit_max_func = "max"; unit_min_func = "min"; break; + case Datatype::UINT32: + unit_type = "uint"; + unit_max_val = "UINT_MAX"; + unit_min_val = "0"; + unit_val_one = "(uint) 1"; + unit_val_zero = "(uint) 0"; + to_unit_type = "convert_uint(v)"; + unit_max_func = "max"; + unit_min_func = "min"; + break; case Datatype::INT64: unit_type = "long"; unit_max_val = "LONG_MAX"; - unit_min_val = "-UNIT_VAL_MAX"; + unit_min_val = "LONG_MIN"; unit_val_one = "(long) 1"; unit_val_zero = "(long) 0"; to_unit_type = "convert_long(v)"; @@ -256,6 +373,16 @@ namespace kernel_selector { MakeJitConstant("UNIT_MIN_FUNC", unit_min_func), }; } + + JitConstants MakeActivationJitConstants(const base_activation_params& params, const std::string& suffix) + { + return JitConstants{ + MakeJitConstant("NL_M" + suffix, params.m), + MakeJitConstant("NL_N" + suffix, params.n), + MakeActivationJitConstants(params.function, suffix) + }; + } + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // MakeBaseParamsJitConstants //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -265,12 +392,16 @@ namespace kernel_selector { bool bInt8Used = params.output.GetDType() == Datatype::INT8; bool bInt32Used = params.output.GetDType() == Datatype::INT32; bool bInt64Used = params.output.GetDType() == Datatype::INT64; + bool bUInt8Used = params.output.GetDType() == Datatype::UINT8; + bool bUInt32Used = params.output.GetDType() == Datatype::UINT32; for (const auto& i : params.inputs) { bFP16Used |= i.GetDType() == Datatype::F16; bInt8Used |= i.GetDType() == Datatype::INT8; bInt32Used |= i.GetDType() == Datatype::INT32; bInt64Used |= i.GetDType() == Datatype::INT64; + bUInt8Used |= i.GetDType() == Datatype::UINT8; + bUInt32Used |= i.GetDType() == 
         }

         JitConstants jit{
@@ -281,16 +412,11 @@ namespace kernel_selector {
             MakeJitConstant("INT8_UNIT_USED", bInt8Used),
             MakeJitConstant("INT32_UNIT_USED", bInt32Used),
             MakeJitConstant("INT64_UNIT_USED", bInt64Used),
+            MakeJitConstant("UINT8_UNIT_USED", bUInt8Used),
+            MakeJitConstant("UINT32_UNIT_USED", bUInt32Used),
             MakeJitConstant("GRADIENT", params.gradient),
         };

-        // for activation function
-        jit.AddConstants({
-            MakeJitConstant("NL_M", params.activationParams.m),
-            MakeJitConstant("NL_N", params.activationParams.n),
-            MakeActivationJitConstants(params.activationFunc),
-        });
-
         if (bInt8Used)
         {
             jit.Merge(MakeUnitTypeJitConstants(Datatype::INT8));
@@ -307,11 +433,22 @@
         {
             jit.Merge(MakeUnitTypeJitConstants(Datatype::INT64));
         }
+        else if (bUInt8Used)
+        {
+            jit.Merge(MakeUnitTypeJitConstants(Datatype::UINT8));
+        }
+        else if (bUInt32Used)
+        {
+            jit.Merge(MakeUnitTypeJitConstants(Datatype::UINT32));
+        }
         else
         {
             jit.Merge(MakeUnitTypeJitConstants(Datatype::F32));
         }

+        // for activation function
+        jit.Merge(MakeActivationJitConstants(params.activation));
+
         for (size_t i = 0; i < params.inputs.size(); i++)
         {
             jit.AddConstant(MakeJitConstant("INPUT" + toCodeString(i), params.inputs[i]));
@@ -344,4 +481,4 @@ namespace kernel_selector {
         return jit;
     }
-}
\ No newline at end of file
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.h
index 3e65a0b..3992de9 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.h
@@ -18,8 +18,6 @@
 #pragma once

 #include "kernel_selector_common.h"
-#include "kernel_selector_params.h"
-#include "tensor_type.h"
 #include <sstream>
 #include <cmath>
@@ -27,6 +25,8 @@
 namespace kernel_selector
 {
+struct base_params;
+
 using JitDefinitions = std::vector<std::pair<std::string, std::string>>;

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -66,69 +66,20 @@ std::string getMeanOpString(MeanOp op);
 template<typename T>
 std::string toCodeString(T val) { return std::to_string(val); }

-template<>
-inline std::string toCodeString(std::string val) { return val; }
-
-template<>
-inline std::string toCodeString(const char* val) { return val; }
-
-template<>
-inline std::string toCodeString(char* val) { return val; }
-
-template<>
-inline std::string toCodeString(bool val)
-{
-    std::stringstream ss;
-    ss << static_cast<int>(val);
-    return ss.str();
-}
-
-template<>
-inline std::string toCodeString(const bool val)
-{
-    std::stringstream ss;
-    ss << static_cast<int>(val);
-    return ss.str();
-}
-
-template<>
-inline std::string toCodeString(float val) {
-    if (std::isinf(val))
-        return std::signbit(val) ? "-INFINITY" : "INFINITY";
-    std::stringstream ss;
-#ifdef __GNUC__
-    // Workaround GCC compiler/STL bug
-    ss << "as_float(0x" << std::hex << *reinterpret_cast<uint32_t*>(&val) << ")";
-#else
-    ss << std::hexfloat << val << "f";
-#endif
-    ss << " /*" << std::scientific << val << "*/";
-    return ss.str();
-}
-
-template<>
-inline std::string toCodeString(double val) {
-    if (std::isinf(val))
-        return std::signbit(val) ? "-INFINITY" : "INFINITY";
-    std::stringstream ss;
-#ifdef __GNUC__
-    // Workaround GCC compiler/STL bug
-    ss << "as_double(0x" << std::hex << *reinterpret_cast<uint64_t*>(&val) << ")";
-#else
-    ss << std::hexfloat << val;
-#endif
-    ss << " /*" << std::scientific << val << "*/";
-    return ss.str();
-}
+inline std::string toCodeString(const std::string& val) { return val; }
+inline std::string toCodeString(const char* val) { return val; }
+inline std::string toCodeString(bool val) { return val ? "1" : "0"; }
+std::string toCodeString(float val);
+std::string toCodeString(double val);

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// JitConstant
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 template<typename VecT, typename ValT, typename Func>
-inline std::string toVectorString(const VecT& vec, const std::string& vertorType, size_t maxDim, ValT padFillingVal, Func fetchFunc)
+inline std::string toVectorString(const VecT& vec, const std::string& vectorType, size_t maxDim, ValT padFillingVal, Func fetchFunc)
 {
     std::stringstream ss;
-    ss << "(" << vertorType << " []){ ";
+    ss << "(" << vectorType << " []){ ";
     for (size_t i = 0; i < vec.size(); i++)
         ss << toCodeString(fetchFunc(vec[i])) << ",";
     for (size_t i = vec.size(); i < maxDim; i++)
@@ -171,75 +122,8 @@ std::shared_ptr<JitConstant> MakeJitConstant(const std::string& name, T value)
     return std::static_pointer_cast<JitConstant>(std::make_shared<simple_jit_constant>(name, toCodeString(value)));
 }

-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// TensorBaseTJitConstant
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-template<typename DType, typename Layout>
-class TensorBaseTJitConstant : public JitConstant
-{
-protected:
-    TensorBaseTJitConstant(const std::string& name) : JitConstant(name) {}
-
-public:
-
-    JitDefinitions GetDefinitions(const Tensor::TensorBaseT<DType, Layout>& t) const
-    {
-        JitDefinitions definitions{
-            { _name + "_TYPE", toCLType(t.GetDType()) },
-            { _name + "_OFFSET", toCodeString(t.GetFirstElementOffset()) },
-            { _name + "_VIEW_OFFSET", toCodeString(t.GetViewOffset()) },
-            { _name + "_LENGTH", toCodeString(t.LogicalSize()) },
-            { _name + "_DIMS", toCodeString(t.GetDims().size()) },
-            { _name + "_SIMPLE", toCodeString(t.SimpleLayout()) },
-            { "TO_" + _name + "_TYPE", "convert_" + toCLType(t.GetDType()) },
-            { _name + "_LAYOUT_" + toString(t.GetLayout()), "1" },
-        };
-
-        definitions.push_back({ _name + "_SIZE", toCodeString(t.GetDims().size()) });
-        definitions.push_back({ _name + "_SIZES", toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 1, [](const Tensor::Dim& d) { return d.v; }) });
-        definitions.push_back({ _name + "_PITCHES", toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 1, [](const Tensor::Dim& d) { return d.pitch; }) });
-        definitions.push_back({ _name + "_PAD_BEFORE", toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 0, [](const Tensor::Dim& d) { return d.pad.before; }) });
-        definitions.push_back({ _name + "_PAD_AFTER", toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 0, [](const Tensor::Dim& d) { return d.pad.after; }) });
-
-        return definitions;
-    }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// DataTensorJitConstant
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -class DataTensorJitConstant : public TensorBaseTJitConstant -{ - const DataTensor _tensor; - -public: - DataTensorJitConstant(const std::string& name, const DataTensor& t) : TensorBaseTJitConstant(name), _tensor(t) {} - - JitDefinitions GetDefinitions() const override; -}; - -inline std::shared_ptr MakeJitConstant(const std::string& name, const DataTensor& value) -{ - return std::static_pointer_cast(std::make_shared(name, value)); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// WeightTensorJitConstant -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -class WeightTensorJitConstant : public TensorBaseTJitConstant -{ - const WeightsTensor _tensor; - -public: - WeightTensorJitConstant(const std::string& name, const WeightsTensor& t) : TensorBaseTJitConstant(name), _tensor(t) {} - - JitDefinitions GetDefinitions() const override; -}; - -inline std::shared_ptr MakeJitConstant(const std::string& name, const WeightsTensor& value) -{ - return std::static_pointer_cast(std::make_shared(name, value)); -} +std::shared_ptr MakeJitConstant(const std::string& name, const struct Tensor::DataTensor& value); +std::shared_ptr MakeJitConstant(const std::string& name, const struct Tensor::WeightsTensor& value); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // VectorDataJitConstant @@ -354,6 +238,7 @@ public: JitDefinitions GetDefinitions() const; }; +JitConstants MakeActivationJitConstants(const base_activation_params& params, const std::string& suffix=""); JitConstants MakeBaseParamsJitConstants(const base_params& params); JitConstants MakeLoopUnrollParamsJitConstants(uint32_t loopCount); JitConstants MakeUnitTypeJitConstants(Datatype dataType); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_selector_utils.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_selector_utils.cpp index 92933f8..04607dc 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_selector_utils.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_selector_utils.cpp @@ -21,93 +21,11 @@ namespace kernel_selector { - bool CheckConvolutionPaddedInputDesc(const convolution_params& params, const DataTensor& reqDesc) - { - assert(params.inputs.size() == 1); - - bool properPadding = - reqDesc.X().pad.before <= params.inputs[0].X().pad.before && - reqDesc.Y().pad.before <= params.inputs[0].Y().pad.before && - reqDesc.Feature().pad.before <= params.inputs[0].Feature().pad.before && - reqDesc.Batch().pad.before <= params.inputs[0].Batch().pad.before; - - properPadding &= - reqDesc.X().pad.after <= params.inputs[0].X().pad.after && - reqDesc.Y().pad.after <= params.inputs[0].Y().pad.after && - reqDesc.Feature().pad.after <= params.inputs[0].Feature().pad.after && - reqDesc.Batch().pad.after <= params.inputs[0].Batch().pad.after; - - properPadding &= ((params.padding.x == 0 && params.padding.y == 0) || params.inputs[0].GetPaddedVal() == 0.f); - - return properPadding; - } - - DataTensor GetConvolutionBFYXPaddedTensor(const convolution_params& cp) - { - assert(cp.inputs.size() == 1); - assert(cp.inputs[0].GetDims().size() == 4U); - - DataTensor t = cp.inputs[0]; - std::vector pad{ { 0,0 },{ 0,0 
},{ 0,0 },{ 0,0 } }; - - pad[0].before = cp.padding.x; - pad[1].before = cp.padding.y; - - const auto inputLimitX = (cp.output.X().v - 1) * cp.stride.x + (cp.filterSize.x - 1) * cp.dilation.x + 1; - const auto inputLimitY = (cp.output.Y().v - 1) * cp.stride.y + (cp.filterSize.y - 1) * cp.dilation.y + 1; - - pad[0].after = (size_t)std::max((int)inputLimitX - (int)t.X().v - (int)pad[0].before, (int)0); - pad[1].after = (size_t)std::max((int)inputLimitY - (int)t.Y().v - (int)pad[1].before, (int)0); - - Tensor::NDims dims(4); - const Tensor::NDims& orgDims = cp.inputs[0].GetDims(); - size_t pitch = 1; - for (size_t i = 0; i < dims.size(); i++) - { - dims[i].pad = pad[i]; - dims[i].v = orgDims[i].v; - dims[i].pitch = pitch; - pitch *= dims[i].LogicalDimPadded(); - } - - return{ dims, t.GetDType(), t.GetLayout() }; - } - - bool CovolutionCheckInput(const Params& p, const optional_params& o) - { - const convolution_params& params = static_cast(p); - const convolution_optional_params& optParams = static_cast(o); - - const auto req_input = GetConvolutionBFYXPaddedTensor(params); - const bool bProperInputDesc = CheckConvolutionPaddedInputDesc(params, req_input); - const bool bInputPadded = optParams.allowInputReordering || bProperInputDesc; - - if (!bInputPadded) - { - return false; - } - - return true; - } - - bool CovolutionUpdateInputParams(convolution_params& params) - { - const auto req_input = GetConvolutionBFYXPaddedTensor(params); - const bool bProperInputDesc = CheckConvolutionPaddedInputDesc(params, req_input); - - if (!bProperInputDesc) - { - params.inputs[0] = req_input; - return true; - } - - return false; - } - - WeightsType DataTypeToWeightsType(Datatype t) + static WeightsType DataTypeToWeightsType(Datatype t) { switch (t) { + case Datatype::UINT8: return WeightsType::UINT8; case Datatype::INT8: return WeightsType::INT8; case Datatype::F16: return WeightsType::F16; case Datatype::F32: return WeightsType::F32; @@ -116,9 +34,10 @@ namespace kernel_selector { } } - bool CheckWeights(const WeightsTensor& tensor, WeightsType reqType, std::vector reqLayouts) + static bool CheckWeights(const WeightsTensor& tensor, WeightsType reqType, std::vector reqLayouts, const ParamsKey& paramsKey) { - if (reqType != tensor.GetDType()) + if ((reqType != tensor.GetDType()) && + !(paramsKey.isEnabledDifferentInputWeightsTypes())) { return false; } @@ -170,7 +89,7 @@ namespace kernel_selector { return true; } - bool UpdateWeightsParams(weight_bias_params& newParams, const optional_params& options, std::vector layouts, WeightsReorderParams& weightsReorderParams) + bool UpdateWeightsParams(weight_bias_params& newParams, const optional_params& options, std::vector layouts, WeightsReorderParams& weightsReorderParams, const ParamsKey& paramsKey) { //validate if weights type is image and if device supports requested sizes for (auto& requested_layout : layouts) @@ -184,8 +103,8 @@ namespace kernel_selector { const weight_bias_optional_params& optParams = static_cast(options); const auto dtype = DataTypeToWeightsType(newParams.inputs[0].GetDType()); - bool bProperWeights = CheckWeights(newParams.weights, dtype, layouts); - + bool bProperWeights = CheckWeights( + newParams.weights, dtype, layouts, paramsKey); if (!bProperWeights) { if (!optParams.allowStaticInputReordering) @@ -274,7 +193,7 @@ namespace kernel_selector { std::vector GetOptimalLocalWorkGroupSizes(std::vector gws) { const size_t lws_max = 256; - const size_t optimal_lws_values[] = { 256, 224, 192, 160, 128, 96, 64, 32, 16, 8, 7, 6, 5, 4, 3, 2, 1 }; + 
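
Both the old table and the replacement table below feed the same greedy selection in GetOptimalLocalWorkGroupSizes: per dimension, take the first (largest) candidate that divides the global size evenly while the running product of chosen local sizes stays within lws_max. A compact sketch of that rule (a simplified re-implementation, not the exact library code):

    #include <vector>

    // Greedy local-work-group sizing: for each global dimension pick the
    // largest candidate that divides it, under a total budget of lws_max.
    std::vector<size_t> optimal_lws(const std::vector<size_t>& gws) {
        const size_t lws_max = 256;
        const size_t candidates[] = { 256, 227, 224, 192, 160, 128, 96, 64,
                                      32, 16, 8, 7, 6, 5, 4, 2, 1 };
        size_t total = 1;
        std::vector<size_t> lws;
        for (size_t g : gws) {
            size_t chosen = 1;
            for (size_t c : candidates) {
                if (total * c <= lws_max && g % c == 0) { chosen = c; break; }
            }
            total *= chosen;
            lws.push_back(chosen);
        }
        return lws;
    }

Since 227 is prime, a 227-wide global dimension previously fell all the way through to a local size of 1; the new entry lets it use 227 directly when the budget allows.
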
const size_t optimal_lws_values[] = { 256, 227, 224, 192, 160, 128, 96, 64, 32, 16, 8, 7, 6, 5, 4, 2, 1 }; size_t total_lws = 1; std::vector lws; for (size_t i = 0; i < gws.size(); ++i) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_selector_utils.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_selector_utils.h index e7cc7cf..dbd6fe4 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_selector_utils.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_selector_utils.h @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2018 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -17,22 +17,16 @@ #pragma once #include "jitter.h" -#include "tensor_type.h" namespace kernel_selector { struct weight_bias_params; - struct convolution_params; + struct optional_params; + struct WeightsReorderParams; - bool CheckConvolutionPaddedInputDesc(const convolution_params& params, const DataTensor& reqDesc); - DataTensor GetConvolutionBFYXPaddedTensor(const convolution_params& cp); - bool CovolutionCheckInput(const Params& p, const optional_params& o); - bool CovolutionUpdateInputParams(convolution_params& params); - WeightsType DataTypeToWeightsType(Datatype t); - bool CheckWeights(const WeightsTensor& tensor, WeightsType reqType, std::vector reqLayouts); std::vector GetImageSizes(const kernel_selector::WeightsTensor& dimensions, const WeightsLayout layout); bool CheckImageSize(const weight_bias_params& newParams, const WeightsLayout layout); - bool UpdateWeightsParams(weight_bias_params& newParams, const optional_params& options, std::vector layouts, WeightsReorderParams& weightsReorderParams); + bool UpdateWeightsParams(weight_bias_params& newParams, const optional_params& options, std::vector layouts, WeightsReorderParams& weightsReorderParams, const ParamsKey& paramsKey = ParamsKey()); JitConstants GetTensorFriendlyWorkGroupsJit(const DataTensor& t); std::vector GetTensorFriendlyWorkGroups(const DataTensor& t); std::vector GetOptimalLocalWorkGroupSizes(std::vector gws); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/primitive_db_gen.py b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/primitive_db_gen.py index 22b48fe..41e78f0 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/primitive_db_gen.py +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/primitive_db_gen.py @@ -58,13 +58,13 @@ class OpenCL2CHeaders(object): self.include_files[filename] = {} #kernel_name = name[:name.find('.')] kernel_name = name[:name.find('.cl')] - res = '{{"{}",\nR"__krnl(\n'.format(kernel_name) + res = '{{"{}",\n(std::string) R"__krnl(\n'.format(kernel_name) content = self.append_file_content(filename, filename) max_lines = 200 for i, line in enumerate(content.split('\n')): if i % max_lines == 0: - res += ')__krnl"\nR"__krnl(' + res += ')__krnl"\n + R"__krnl(' res += line + '\n' res += ')__krnl"}},\n\n'.format(kernel_name, self.append_file_content(filename, filename)) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.h index 80b501e..28450a5 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.h @@ 
-40,7 +40,17 @@ namespace kernel_selector return GetKernelsData(params, options); } - virtual ParamsKey GetSupportedKey() const = 0; + virtual bool Supports(const Params& params, const optional_params& options) const + { + const ParamsKey requireKey = params.GetParamsKey().Merge(options.GetSupportedKey()); + return GetSupportedKey().Support(requireKey); + } + + bool SupportsTuning() const + { + return GetSupportedKey().TuningSupport(); + } + virtual const std::string GetName() const { return kernelName; } static const primitive_db& get_db() { return db; } @@ -50,8 +60,9 @@ namespace kernel_selector const std::string kernelName; static size_t UniqeID() { return counter++; } // TODO: use interlocked + virtual ParamsKey GetSupportedKey() const = 0; private: static size_t counter; }; -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector.cpp index 6e938d0..2968c10 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector.cpp @@ -85,11 +85,9 @@ namespace kernel_selector { if (params.GetType() == kType && options.GetType() == kType) { - const ParamsKey requireKey = params.GetParamsKey().Merge(options.GetSupportedKey()); for (const auto& implementation : implementations) { - const ParamsKey implKey = implementation->GetSupportedKey(); - if (implKey.Support(requireKey)) + if (implementation->Supports(params, options)) { try { @@ -146,25 +144,23 @@ namespace kernel_selector { { KernelsData kernelsData; std::string kernelName; - if (params.GetType() == kType && options.GetType() == kType) { std::string hash = std::to_string(create_hash(params.to_string())); - ParamsKey requireKey = params.GetParamsKey().Merge(options.GetSupportedKey()); - std::tuple cachedKernelConfig; if (options.tuningParams.mode == TuningMode::TUNING_DISABLED) // Try to load kernel/config from offline cache { #if ENABLE_OFFLINE_TUNING_CACHE - cachedKernelConfig = autoTuner.LoadKernelOffline(params.engineInfo.deviceId, hash); + cachedKernelConfig = autoTuner.LoadKernelOffline(params.engineInfo.deviceCache, hash); + #else return GetNaiveBestKernel(params, options, kType); #endif } else // Try to load kernel/config from on-line cache { - cachedKernelConfig = autoTuner.LoadKernelOnline(options.tuningParams.mode, options.tuningParams.cacheFilePath, params.engineInfo.deviceId, params.engineInfo.driverVersion, params.engineInfo.hostVersion, hash); + cachedKernelConfig = autoTuner.LoadKernelOnline(options.tuningParams.mode, options.tuningParams.cacheFilePath, params.engineInfo.computeUnitsCount, hash); } bool hashFoundInCache = !std::get<0>(cachedKernelConfig).empty(); @@ -179,7 +175,7 @@ namespace kernel_selector { if (implementation->GetName().compare(cachedkernelName) == 0) { KernelsData kds = implementation->GetTunedKernelsDataByIndex(params, options, autoTuneIndex); - if (kds.size() && kds[0].kernels.size() && implementation->GetSupportedKey().Support(requireKey)) + if (kds.size() && kds[0].kernels.size() && implementation->Supports(params, options)) { kernelsData = kds; kernelsData[0].kernelName = cachedkernelName; @@ -208,9 +204,7 @@ namespace kernel_selector { for (const auto& implementation : implementations) { - - const ParamsKey implKey = implementation->GetSupportedKey(); - if (implKey.Support(requireKey) && implKey.TuningSupport()) + if (implementation->Supports(params, options) && 
implementation->SupportsTuning()) { try { @@ -219,11 +213,11 @@ namespace kernel_selector { for (size_t i = 0; i < kds.size(); i++) { - kds[i].runTime = runTimes[i]; + kds[i].runTime = runTimes[i]; if (kernelsData.size() == 0 || kds[i].runTime < kernelsData[0].runTime) { kernelsData = { kds[i] }; - kernelName = implementation->GetName(); + kernelName = implementation->GetName(); } } } @@ -240,9 +234,8 @@ namespace kernel_selector { for (const auto& implementation : implementations) { - const ParamsKey implKey = implementation->GetSupportedKey(); //this time, check only implementations that have disabled tuning - if (implKey.Support(requireKey) && !implKey.TuningSupport()) + if (implementation->Supports(params, options) && !implementation->SupportsTuning()) { try { @@ -271,10 +264,10 @@ namespace kernel_selector { { kernelsData[0].kernelName = kernelName; kernelsData[0].kernels[0].layerID = params.layerID; - autoTuner.StoreKernel(options.tuningParams.cacheFilePath, hash, kernelName, kernelsData[0].autoTuneIndex); + autoTuner.StoreKernel(options.tuningParams.cacheFilePath, hash, kernelName, kernelsData[0].autoTuneIndex, params.engineInfo.computeUnitsCount); } } return kernelsData; } -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.cpp index f441136..c35748a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.cpp @@ -17,7 +17,7 @@ #include "kernel_selector_common.h" #include -namespace kernel_selector +namespace kernel_selector { std::string GetStringEnv(const char* varName) { @@ -72,6 +72,7 @@ namespace kernel_selector case ActivationFunction::LOG: method = "LOG"; break; case ActivationFunction::LOG2: method = "LOG2"; break; case ActivationFunction::EXP: method = "EXP"; break; + case ActivationFunction::NOT: method = "NOT"; break; case ActivationFunction::NONE: method = "NONE"; break; case ActivationFunction::NONE_GRAD: method = "NONE_GRAD"; break; default: break; @@ -95,7 +96,9 @@ namespace kernel_selector case kernel_selector::DataLayout::brfyx: return "BRFYX"; case kernel_selector::DataLayout::winograd_2x3_s1_data: return "WINOGRAD_2x3_S1_DATA"; case kernel_selector::DataLayout::byxf_af32: return "BYXF_AF32"; + case kernel_selector::DataLayout::byx8_f4: return "BYX8_F4"; case kernel_selector::DataLayout::fs_bs_yx_bsv4_fsv32: return "FS_BS_YX_BSV4_FSV32"; + case kernel_selector::DataLayout::b_fs_yx_fsv4: return "B_FS_YX_FSV4"; default: return ""; } } @@ -308,6 +311,8 @@ namespace kernel_selector case WeightsLayout::iyxo: return "IYXO"; case WeightsLayout::yxio: return "YXIO"; case WeightsLayout::os_iyx_osv16: return "OS_IYX_OSV16"; + case WeightsLayout::os_iyx_osv32: return "OS_IYX_OSV32"; + case WeightsLayout::os_iyx_osv64: return "OS_IYX_OSV64"; case WeightsLayout::os_iyx_osv16_rotate_180: return "OS_IYX_OSV16_ROTATE_180"; case WeightsLayout::os_i_osv16: return "OS_I_OSV16"; case WeightsLayout::os_i_osv8__ai8: return "OS_I_OSV8__AI8"; @@ -323,7 +328,12 @@ namespace kernel_selector case WeightsLayout::image_2d_weights_winograd_6x3_s1_fbxyb: return "IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_FBXYB"; case WeightsLayout::image_2d_weights_winograd_6x3_s1_xfbyb: return "IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_XFBYB"; case WeightsLayout::os_is_yx_isa8_osv8_isv4: return "OS_IS_YX_ISA8_OSV8_ISV4"; + case 
WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4: return "OS_IS_YX_ISA8_OSV8_ISV4_SWIZZLED_BY_4"; case WeightsLayout::is_o_yx_isv32: return "IS_O_YX_ISV32"; + case WeightsLayout::is_o32_yx_isv32_swizzled_by_4: return "IS_O32_YX_ISV32_SWIZZLED_BY_4"; + case WeightsLayout::os_is_y_x8_osv8_isv4: return "OS_IS_Y_X8_OSV8_ISV4"; + case WeightsLayout::os_is_yx_osv16_isv4: return "OS_IS_YX_OSV16_ISV4"; + default: return ""; break; @@ -354,6 +364,18 @@ namespace kernel_selector } } + std::string toString(GatherAxis a) + { + switch (a) + { + case GatherAxis::X: return "X"; + case GatherAxis::Y: return "Y"; + case GatherAxis::FEATURE: return "FEATURE"; + case GatherAxis::BATCH: return "BATCH"; + default: return ""; + } + } + std::string toString(SampleType type) { switch (type) @@ -388,13 +410,6 @@ namespace kernel_selector } } - std::string toString(NonLinearParams params) - { - std::stringstream s; - s << "m" << params.m << "_n" << params.n; - return s.str(); - } - std::string toString(const Tensor::Dim& dim) { std::stringstream s; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.h index ef12a74..9f5f304 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.h @@ -17,7 +17,6 @@ #pragma once #include "kernel_selector_params.h" -#include "primitive_db.h" #include #include @@ -29,7 +28,8 @@ #include #define AGE_BASED "-cl-no-subgroup-ifp" -#define ROUND_ROBIN "" +#define DEFAULT "" +#define NO_PRERA_SCH "-cl-intel-no-prera-scheduling" namespace kernel_selector { @@ -279,9 +279,9 @@ namespace kernel_selector { std::string toString(WeightsLayout layout); std::string toString(ConcatAxis a); std::string toString(TileAxis a); + std::string toString(GatherAxis a); std::string toString(SampleType type); std::string toString(const BorderType type); - std::string toString(NonLinearParams params); std::string toString(const Tensor::Dim& dim); std::string toString(const DataTensor& tensor); std::string toString(const IndexSelectAxis& axis); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.cpp index bd718c1..fab2127 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.cpp @@ -347,6 +347,16 @@ namespace kernel_selector { } } + void ParamsKey::EnableFusedConvEltwEltwiseStride() + { + key.restrict.val.dedicated.fused_conv_eltw.stride = 1; + } + + void ParamsKey::EnableEltwiseStride() + { + key.restrict.val.dedicated.eltwise.stride = 1; + } + void ParamsKey::EnableArgMaxMinAxis(ArgMaxMinAxis a) { switch (a) @@ -400,19 +410,35 @@ namespace kernel_selector { key.restrict.val.dedicated.lookt.indicesOther = 1; } + void ParamsKey::EnableFusedConvEltwiseRWOutOpt() + { + key.restrict.val.dedicated.fused_conv_eltw.rw_out_opt = 1; + } + bool ParamsKey::Support(const ParamsKey& k) const { - return - ((key.restrict.raw & k.key.restrict.raw) == k.key.restrict.raw) && // check if this kernel supports this params - ((key.machineInfo.raw & k.key.machineInfo.raw) == key.machineInfo.raw) && // check if machine supports this kernel - ((key.inputType.raw & k.key.inputType.raw) == k.key.inputType.raw) && - ((key.outputType.raw & 
k.key.outputType.raw) == k.key.outputType.raw) && - ((key.inputWeightsType.raw & k.key.inputWeightsType.raw) == k.key.inputWeightsType.raw) && - ((key.outputWeightsType.raw & k.key.outputWeightsType.raw) == k.key.outputWeightsType.raw) && - ((key.inputLayout & k.key.inputLayout) != 0 || key.inputLayout == k.key.inputLayout) && - ((key.outputLayout & k.key.outputLayout) != 0 || key.outputLayout == k.key.outputLayout) && - ((key.weightsInputLayout & k.key.weightsInputLayout) != 0 || key.weightsInputLayout == k.key.weightsInputLayout) && - ((key.weightsOutputLayout & k.key.weightsOutputLayout) != 0 || key.weightsOutputLayout == k.key.weightsOutputLayout); + if (!((key.restrict.raw & k.key.restrict.raw) == k.key.restrict.raw)) // check if this kernel supports this params + return false; + if (!((key.machineInfo.raw & k.key.machineInfo.raw) == key.machineInfo.raw)) // check if machine supports this kernel + return false; + if (!((key.inputType.raw & k.key.inputType.raw) == k.key.inputType.raw)) + return false; + if (!((key.outputType.raw & k.key.outputType.raw) == k.key.outputType.raw)) + return false; + if (!((key.inputWeightsType.raw & k.key.inputWeightsType.raw) == k.key.inputWeightsType.raw)) + return false; + if (!((key.outputWeightsType.raw & k.key.outputWeightsType.raw) == k.key.outputWeightsType.raw)) + return false; + if (!((key.inputLayout & k.key.inputLayout) != 0 || key.inputLayout == k.key.inputLayout)) + return false; + if (!((key.outputLayout & k.key.outputLayout) != 0 || key.outputLayout == k.key.outputLayout)) + return false; + if (!((key.weightsInputLayout & k.key.weightsInputLayout) != 0 || key.weightsInputLayout == k.key.weightsInputLayout)) + return false; + if (!((key.weightsOutputLayout & k.key.weightsOutputLayout) != 0 || key.weightsOutputLayout == k.key.weightsOutputLayout)) + return false; + + return true; } ParamsKey ParamsKey::Merge(const ParamsKey& k) const @@ -542,12 +568,18 @@ namespace kernel_selector { return k; } + std::string base_activation_params::to_string() const + { + std::stringstream s; + s << "m" << m << "_n" << n << "_" << toString(function); + return s.str(); + } + std::string base_params::to_string() const { std::stringstream s; s << Params::to_string() << "_"; - s << toString(activationParams) << "_"; - s << toString(activationFunc) << "_"; + s << activation.to_string() << "_"; for (auto input : inputs) { diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h index d4351f2..d8c5199 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2018 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
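
Rewriting ParamsKey::Support as a chain of early returns makes each requirement debuggable in isolation, but every clause is still the same subset check over a raw bit mask: the kernel's key must contain every bit of the merged params/options key. Note the one inversion: for machineInfo it is the kernel's bits that must be a subset of what the machine reports. A self-contained illustration with invented flag names (not the real ParamsKey fields):

    #include <cstdint>
    #include <iostream>

    // Subset test used by each clause of ParamsKey::Support: every bit in
    // 'required' must also be present in 'supported'.
    bool supports(uint64_t supported, uint64_t required) {
        return (supported & required) == required;
    }

    int main() {
        const uint64_t dilation = 1ull << 0;
        const uint64_t split = 1ull << 1;
        const uint64_t kernel_key = dilation | split; // what the kernel can do
        const uint64_t params_key = dilation;         // what this call needs
        std::cout << supports(kernel_key, params_key) << '\n'; // 1: supported
        std::cout << supports(params_key, kernel_key) << '\n'; // 0: split missing
        return 0;
    }
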
@@ -20,8 +20,8 @@ #include #include #include "common_types.h" -#include "common_tools.h" #include "tensor_type.h" +#include "document.h" namespace kernel_selector { @@ -58,6 +58,7 @@ namespace kernel_selector struct val_t { uint32_t different_types : 1; + uint32_t different_input_weights_types : 1; uint32_t offset : 1; uint32_t pitches : 1; uint32_t batching : 1; @@ -120,15 +121,18 @@ namespace kernel_selector uint32_t fixedKenrelDivider : 1; uint32_t dynamicKenrelDivider : 1; uint32_t dynamicKenrelDividerWithPadding : 1; + uint32_t position_sensitive : 1; } pooling; struct conv_t { uint32_t split : 1; uint32_t dilation : 1; - uint32_t depthwiseSeparableOpt : 1; + uint32_t depthwise_separable_opt : 1; uint32_t transposed : 1; uint32_t quantization : 1; uint32_t calibration : 1; + uint32_t local : 1; + uint32_t grouped : 1; } conv; struct fc_t {} fc; struct softmax_t @@ -171,6 +175,11 @@ namespace kernel_selector { uint32_t winograd : 1; } reorder; + struct eltwise_t + { + uint32_t stride : 1; + uint32_t broadcast : 1; + } eltwise; struct lstm_gemm_t { uint32_t bias : 1; uint32_t hidden : 1; @@ -178,6 +187,21 @@ namespace kernel_selector struct lstm_elt_t { uint32_t cell : 1; } lstm_elt; + struct fused_conv_eltw_t { + // conv + uint32_t split : 1; + uint32_t dilation : 1; + uint32_t depthwise_separable_opt : 1; + uint32_t transposed : 1; + uint32_t quantization : 1; + uint32_t calibration : 1; + uint32_t local : 1; + uint32_t grouped : 1; + // eltw + uint32_t stride : 1; + // fused conv eltw + uint32_t rw_out_opt : 1; + } fused_conv_eltw; } dedicated; } val; uint64_t raw; @@ -233,6 +257,8 @@ namespace kernel_selector void EnableAllOutputWeightsType(); void EnableFP16Emulation() { key.restrict.val.FP16Emulation = 1; } void EnableDifferentTypes() { key.restrict.val.different_types = 1; } + void EnableDifferentInputWeightsTypes() { + key.restrict.val.different_input_weights_types = 1; } void EnableInputLayout(DataLayout l) { key.inputLayout |= (1 << l); } void EnableAllInputLayout() { key.inputLayout = 0xffffffff; } void EnableOutputLayout(DataLayout l) { key.outputLayout |= (1 << l); } @@ -261,16 +287,32 @@ namespace kernel_selector void EnablePoolKernelDividerMode(KernelDividerMode m); void EnablePoolType(PoolType t); void EnablePoolRemainder(PoolRemainder r); + void EnablePositionSensitivePooling() { key.restrict.val.dedicated.pooling.position_sensitive = 1; } void EnableSplitSupport() { key.restrict.val.dedicated.conv.split = 1; } void EnableDilation() { key.restrict.val.dedicated.conv.dilation = 1; } - void EnableDepthwiseSeparableOpt() { key.restrict.val.dedicated.conv.depthwiseSeparableOpt = 1; } + void EnableDepthwiseSeparableOpt() { key.restrict.val.dedicated.conv.depthwise_separable_opt = 1; } + void EnableLocalConvolution() { key.restrict.val.dedicated.conv.local = 1; } + void EnableGroupedConvolution() { key.restrict.val.dedicated.conv.grouped = 1; } void EnableTranspose() { key.restrict.val.dedicated.conv.transposed = 1; } void EnableInt8Quantization() { key.restrict.val.dedicated.conv.quantization = 1; } void EnableOutputCalibration() { key.restrict.val.dedicated.conv.calibration = 1; } + + void EnableFusedConvEltwSplitSupport() { key.restrict.val.dedicated.fused_conv_eltw.split = 1; } + void EnableFusedConvEltwDilation() { key.restrict.val.dedicated.fused_conv_eltw.dilation = 1; } + void EnableFusedConvEltwDepthwiseSeparableOpt() { key.restrict.val.dedicated.fused_conv_eltw.depthwise_separable_opt = 1; } + void EnableFusedConvEltwLocalConvolution() { 
key.restrict.val.dedicated.fused_conv_eltw.local = 1; } + void EnableFusedConvEltwGroupedConvolution() { key.restrict.val.dedicated.fused_conv_eltw.grouped = 1; } + void EnableFusedConvEltwTranspose() { key.restrict.val.dedicated.fused_conv_eltw.transposed = 1; } + void EnableFusedConvEltwInt8Quantization() { key.restrict.val.dedicated.fused_conv_eltw.quantization = 1; } + void EnableFusedConvEltwOutputCalibration() { key.restrict.val.dedicated.fused_conv_eltw.calibration = 1; } + void EnableFusedConvEltwEltwiseStride(); + void EnableWinogradReorder() { key.restrict.val.dedicated.reorder.winograd = 1; } void EnableSoftmaxDim(SoftmaxDim d); void EnableConcatAxis(ConcatAxis a); void EnableUpSamplingSampleType(SampleType a); + void EnableEltwiseStride(); + void EnableEltwiseBroadcast() { key.restrict.val.dedicated.eltwise.broadcast = 1; } void EnableLSTMGEMMBias() { key.restrict.val.dedicated.lstm_gemm.bias = 1; } void EnableLSTMGEMMHidden() { key.restrict.val.dedicated.lstm_gemm.hidden = 1; } void EnableLSTMEltCell() { key.restrict.val.dedicated.lstm_elt.cell = 1; } @@ -280,6 +322,7 @@ namespace kernel_selector void EnableArgMaxMinAxis(ArgMaxMinAxis a); void EnableLookUpTableIndicesFormat(Datatype a); void EnableIndexSelectAxis(IndexSelectAxis a); + void EnableFusedConvEltwiseRWOutOpt(); bool Support(const ParamsKey& k) const; bool TuningSupport() const { @@ -287,6 +330,9 @@ namespace kernel_selector return true; return false; } + bool isEnabledDifferentInputWeightsTypes() const { + return key.restrict.val.different_input_weights_types ? true : false; + } ParamsKey Merge(const ParamsKey& k) const; private: @@ -305,6 +351,7 @@ namespace kernel_selector bool bImageSupport = false; bool bIMADSupport = false; bool bIMMADSupport = false; + uint32_t computeUnitsCount = 0; uint64_t maxWorkGroupSize = 0; uint64_t maxLocalMemSize = 0; uint64_t maxImage2dWidth = 0; @@ -312,6 +359,7 @@ namespace kernel_selector std::string deviceId = ""; std::string driverVersion = ""; std::string hostVersion = ""; + std::shared_ptr deviceCache; }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -336,17 +384,31 @@ namespace kernel_selector }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // base_activation_params + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + struct base_activation_params + { + ActivationFunction function = ActivationFunction::NONE; + float m = 1.f; + float n = 0.f; + + base_activation_params() = default; + base_activation_params(const float m, const float n) : m(m), n(n) {} + + virtual std::string to_string() const; + }; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // base_params //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// struct base_params : public Params { virtual ~base_params() {} - ActivationFunction activationFunc = ActivationFunction::NONE; - NonLinearParams activationParams; - MultiDataTensor inputs; - DataTensor output; - bool gradient = false; + base_activation_params activation; + MultiDataTensor inputs; + DataTensor output; + bool gradient = false; virtual std::string to_string() const; virtual ParamsKey GetParamsKey() const; diff --git a/inference-engine/thirdparty/clDNN/src/CMakeLists.txt 
b/inference-engine/thirdparty/clDNN/src/CMakeLists.txt index 0ba989f..861a09d 100644 --- a/inference-engine/thirdparty/clDNN/src/CMakeLists.txt +++ b/inference-engine/thirdparty/clDNN/src/CMakeLists.txt @@ -77,6 +77,26 @@ file(GLOB __CLDNN_Headers__api__c "${__CLDNN_Directory__api__c}/*.hpp" ) +set(__CLDNN_Label__api_extension "api_extension") +file(GLOB __CLDNN_Headers__api_extension + "${CLDNN__API_EXTENSION_DIR}/*.h" + "${CLDNN__API_EXTENSION_DIR}/*.hpp" + ) + +set(__CLDNN_Directory__api_extension__cpp "${CLDNN__API_EXTENSION_DIR}/CPP") +set(__CLDNN_Label__api_extension__cpp "${__CLDNN_Label__api_extension}\\CPP") +file(GLOB __CLDNN_Headers__api_extension__cpp + "${__CLDNN_Directory__api_extension__cpp}/*.h" + "${__CLDNN_Directory__api_extension__cpp}/*.hpp" + ) + +set(__CLDNN_Directory__api_extension__c "${CLDNN__API_EXTENSION_DIR}/C") +set(__CLDNN_Label__api_extension__c "${__CLDNN_Label__api_extension}\\C") +file(GLOB __CLDNN_Headers__api_extension__c + "${__CLDNN_Directory__api_extension__c}/*.h" + "${__CLDNN_Directory__api_extension__c}/*.hpp" + ) + set(__CLDNN_Label__main "") file(GLOB __CLDNN_Sources__main "${CMAKE_CURRENT_SOURCE_DIR}/*.h" @@ -84,6 +104,14 @@ file(GLOB __CLDNN_Sources__main "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" ) +set(__CLDNN_Directory__graph_opt "${CMAKE_CURRENT_SOURCE_DIR}/graph_optimizer") +set(__CLDNN_Label__graph_opt "graph_optimizer") +file(GLOB __CLDNN_Sources__graph_opt + "${__CLDNN_Directory__graph_opt}/*.h" + "${__CLDNN_Directory__graph_opt}/*.hpp" + "${__CLDNN_Directory__graph_opt}/*.cpp" + ) + set(__CLDNN_Directory__include "${CMAKE_CURRENT_SOURCE_DIR}/include") set(__CLDNN_Label__include "include") file(GLOB __CLDNN_Headers__include @@ -146,10 +174,14 @@ set(__CLDNN_Directory__ks_cache "${__CLDNN_Directory__ks_core}/cache") set(__CLDNN_AllSources ${__CLDNN_Headers__api} + ${__CLDNN_Sources__graph_opt} ${__CLDNN_Headers__include} ${__CLDNN_Sources__caps} ${__CLDNN_Headers__api__cpp} ${__CLDNN_Headers__api__c} + ${__CLDNN_Headers__api_extension} + ${__CLDNN_Headers__api_extension__c} + ${__CLDNN_Headers__api_extension__cpp} ${__CLDNN_Sources__main} ${__CLDNN_Sources__gpu} ${__CLDNN_Sources__cache} @@ -161,16 +193,20 @@ set_property(SOURCE ${__CLDNN_Sources__cg_cache} PROPERTY GENERATED TRUE) # =============================================== Filters ============================================== -source_group("${__CLDNN_Label__api}" FILES ${__CLDNN_Headers__api}) -source_group("${__CLDNN_Label__api__cpp}" FILES ${__CLDNN_Headers__api__cpp}) -source_group("${__CLDNN_Label__api__c}" FILES ${__CLDNN_Headers__api__c}) -source_group("${__CLDNN_Label__include}" FILES ${__CLDNN_Headers__include}) -source_group("${__CLDNN_Label__caps}" FILES ${__CLDNN_Sources__caps}) -source_group("${__CLDNN_Label__main}" FILES ${__CLDNN_Sources__main}) -source_group("${__CLDNN_Label__gpu}" FILES ${__CLDNN_Sources__gpu}) -source_group("${__CLDNN_Label__cache}" FILES ${__CLDNN_Sources__cache}) -source_group("${__CLDNN_Label__ch_kernels}" FILES ${__CLDNN_Sources__ch_kernels}) -source_group("${__CLDNN_Label__cg_cache}" FILES ${__CLDNN_Sources__cg_cache}) +source_group("${__CLDNN_Label__api}" FILES ${__CLDNN_Headers__api}) +source_group("${__CLDNN_Label__api__cpp}" FILES ${__CLDNN_Headers__api__cpp}) +source_group("${__CLDNN_Label__api__c}" FILES ${__CLDNN_Headers__api__c}) +source_group("${__CLDNN_Label__api_extension}" FILES ${__CLDNN_Headers__api_extension}) +source_group("${__CLDNN_Label__api_extension__cpp}" FILES ${__CLDNN_Headers__api_extension__cpp}) 
+source_group("${__CLDNN_Label__api_extension__c}" FILES ${__CLDNN_Headers__api_extension__c}) +source_group("${__CLDNN_Label__include}" FILES ${__CLDNN_Headers__include}) +source_group("${__CLDNN_Label__graph_opt}" FILES ${__CLDNN_Sources__graph_opt}) +source_group("${__CLDNN_Label__caps}" FILES ${__CLDNN_Sources__caps}) +source_group("${__CLDNN_Label__main}" FILES ${__CLDNN_Sources__main}) +source_group("${__CLDNN_Label__gpu}" FILES ${__CLDNN_Sources__gpu}) +source_group("${__CLDNN_Label__cache}" FILES ${__CLDNN_Sources__cache}) +source_group("${__CLDNN_Label__ch_kernels}" FILES ${__CLDNN_Sources__ch_kernels}) +source_group("${__CLDNN_Label__cg_cache}" FILES ${__CLDNN_Sources__cg_cache}) # ===================================== Include/Link directories ======================================= diff --git a/inference-engine/thirdparty/clDNN/src/activation.cpp b/inference-engine/thirdparty/clDNN/src/activation.cpp index 503c720..f459287 100644 --- a/inference-engine/thirdparty/clDNN/src/activation.cpp +++ b/inference-engine/thirdparty/clDNN/src/activation.cpp @@ -29,6 +29,8 @@ primitive_type_id activation_type_id() layout activation_inst::calc_output_layout(activation_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for activation_node!"); return node.input().get_non_padded_output_layout(); } diff --git a/inference-engine/thirdparty/clDNN/src/activation_grad.cpp b/inference-engine/thirdparty/clDNN/src/activation_grad.cpp index 9d277c5..ecae773 100644 --- a/inference-engine/thirdparty/clDNN/src/activation_grad.cpp +++ b/inference-engine/thirdparty/clDNN/src/activation_grad.cpp @@ -29,6 +29,9 @@ primitive_type_id activation_grad_type_id() layout activation_grad_inst::calc_output_layout(activation_grad_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for " + "activation_grad_node!"); return node.input().get_non_padded_output_layout(); } diff --git a/inference-engine/thirdparty/clDNN/src/apply_adam.cpp b/inference-engine/thirdparty/clDNN/src/apply_adam.cpp index 1b0e9d4..24b659e 100644 --- a/inference-engine/thirdparty/clDNN/src/apply_adam.cpp +++ b/inference-engine/thirdparty/clDNN/src/apply_adam.cpp @@ -27,8 +27,15 @@ primitive_type_id apply_adam_type_id() return &instance; } +apply_adam_node::typed_program_node(const std::shared_ptr prim, program_impl& prog) + : parent(prim, prog) +{ + can_share_buffer(false); //apply adam's output initial val should be either 0 or use same buffer as mutable_data after it (no allocation needed) +} layout apply_adam_inst::calc_output_layout(apply_adam_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for apply_adam_node!"); return node.input().get_non_padded_output_layout(); } @@ -72,4 +79,4 @@ apply_adam_inst::typed_primitive_inst(network_impl& network, apply_adam_node con CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "beta1_power format", beta1_power_format.value, "supported beta1_power formats", format::yxfb, format::bfyx); CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "beta2_power format", beta2_power_format.value, "supported beta2_power formats", format::yxfb, format::bfyx); } -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/src/arg_max_min.cpp b/inference-engine/thirdparty/clDNN/src/arg_max_min.cpp index aa2f0e4..96cdca3 100644 --- a/inference-engine/thirdparty/clDNN/src/arg_max_min.cpp +++ 
b/inference-engine/thirdparty/clDNN/src/arg_max_min.cpp @@ -31,7 +31,10 @@ namespace cldnn layout arg_max_min_inst::calc_output_layout(arg_max_min_node const& node) { - auto desc = node.get_primitive(); + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for " + "arg_max_min_node!"); + auto desc = node.get_primitive(); auto input_layout = node.input().get_output_layout(); diff --git a/inference-engine/thirdparty/clDNN/src/average_unpooling.cpp b/inference-engine/thirdparty/clDNN/src/average_unpooling.cpp index aed36d0..4badd96 100644 --- a/inference-engine/thirdparty/clDNN/src/average_unpooling.cpp +++ b/inference-engine/thirdparty/clDNN/src/average_unpooling.cpp @@ -30,6 +30,9 @@ namespace cldnn layout average_unpooling_inst::calc_output_layout(average_unpooling_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for " + "average_unpooling_node!"); auto desc = node.get_primitive(); auto input_layout = node.input().get_output_layout(); diff --git a/inference-engine/thirdparty/clDNN/src/batch_norm.cpp b/inference-engine/thirdparty/clDNN/src/batch_norm.cpp index 0aea3e6..2b972b1 100644 --- a/inference-engine/thirdparty/clDNN/src/batch_norm.cpp +++ b/inference-engine/thirdparty/clDNN/src/batch_norm.cpp @@ -18,6 +18,7 @@ #include "primitive_type_base.h" #include "error_handler.h" #include "json_object.h" +#include "mutable_data_inst.h" namespace cldnn { @@ -29,40 +30,43 @@ primitive_type_id batch_norm_type_id() layout batch_norm_inst::calc_output_layout(batch_norm_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for batch_norm_node!"); return node.input().get_non_padded_output_layout(); } std::string batch_norm_inst::to_string(batch_norm_node const& node) { - auto desc = node.get_primitive(); - auto node_info = node.desc_to_json(); - auto& mean = node.mean(); bool variance_term = node.variance_term(); - auto& inv_var = node.inv_variance(); std::stringstream primitive_description; - json_composite batch_norm_info; if (node.use_global_stats()) { - batch_norm_info.add("mean_id", mean.id()); + batch_norm_info.add("mean_id", node.mean().id()); if (variance_term) { batch_norm_info.add("variance_id", node.variance().id()); } } + if (node.use_scale_shift()) + { + batch_norm_info.add("scale_id", node.scale().id()); + batch_norm_info.add("shift_id", node.shift().id()); + } if (node.forwad_pass()) { - batch_norm_info.add("inv_var", inv_var.id()); + batch_norm_info.add("inv_var", node.inv_variance().id()); } - batch_norm_info.add("epsilon", desc->epsilon); + batch_norm_info.add("epsilon", node.get_primitive()->epsilon); - node_info->add("batch norm info", batch_norm_info); - node_info->dump(primitive_description); + node.desc_to_json()->add("batch norm info", batch_norm_info); + node.desc_to_json()->dump(primitive_description); return primitive_description.str(); } + batch_norm_inst::typed_primitive_inst(network_impl& network, batch_norm_node const& node) :parent(network, node) { @@ -71,8 +75,27 @@ batch_norm_inst::typed_primitive_inst(network_impl& network, batch_norm_node con auto mean_format = node.mean().get_output_layout().format; auto variance_format = node.variance().get_output_layout().format; - CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "Mean format", mean_format.value, "supported mean formats", format::yxfb, format::bfyx); - CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "Variance 
format", variance_format.value, "supported variance formats", format::yxfb, format::bfyx); + CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "Mean format", mean_format.value, "supported mean formats", format::yxfb, format::bfyx, format::byxf); + CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "Variance format", variance_format.value, "supported variance formats", format::yxfb, format::bfyx, format::byxf); + + auto is_mean_mutable_data = node.mean().is_type(); + auto is_var_mutable_data = node.variance().is_type(); + + CLDNN_ERROR_BOOL(node.id(), "mean and variance are not the same type", (is_mean_mutable_data != is_var_mutable_data), ""); } + + if (use_scale_shift()) { + auto scale_format = node.scale().get_output_layout().format; + auto shift_format = node.shift().get_output_layout().format; + + CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "Scale format", scale_format.value, "supported scale formats", format::yxfb, format::bfyx, format::byxf); + CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "Shift format", shift_format.value, "supported shift formats", format::yxfb, format::bfyx, format::byxf); + } + + if (forwad_pass()) + { + auto is_inv_var_mutable_data = node.inv_variance().is_type(); + CLDNN_ERROR_BOOL(node.id(), "inv_variance is not mutable_data type", !is_inv_var_mutable_data, ""); + } +} } -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/batch_norm_grad.cpp b/inference-engine/thirdparty/clDNN/src/batch_norm_grad.cpp index d30e771..cadcb7d 100644 --- a/inference-engine/thirdparty/clDNN/src/batch_norm_grad.cpp +++ b/inference-engine/thirdparty/clDNN/src/batch_norm_grad.cpp @@ -29,6 +29,9 @@ namespace cldnn layout batch_norm_grad_inst::calc_output_layout(parent::typed_node const& node) { + assert( + (bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for batch_norm_grad_node!"); return node.input().get_non_padded_output_layout(); } diff --git a/inference-engine/thirdparty/clDNN/src/border.cpp b/inference-engine/thirdparty/clDNN/src/border.cpp index b07a1f9..2a2c5b6 100644 --- a/inference-engine/thirdparty/clDNN/src/border.cpp +++ b/inference-engine/thirdparty/clDNN/src/border.cpp @@ -30,22 +30,24 @@ primitive_type_id border_type_id() layout border_inst::calc_output_layout(border_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for border_node!"); auto input_layout = node.input().get_output_layout(); auto desc = node.get_primitive(); auto&& new_size = input_layout.size; - new_size += desc->left_top_sizes; - new_size += desc->right_bottom_sizes; + new_size += desc->left_top_sizes.sub({0, 0, 0, 0}); + new_size += desc->right_bottom_sizes.sub({0, 0, 0, 0}); - return {input_layout.data_type, input_layout.format, new_size}; + return { input_layout.data_type, input_layout.format, {new_size.batch[0], new_size.feature[0], new_size.spatial[0], new_size.spatial[1]} }; } std::string border_inst::to_string(border_node const& node) { auto desc = node.get_primitive(); - const auto& left_top_sizes = desc->left_top_sizes; - const auto& right_bottom_sizes = desc->right_bottom_sizes; + const auto& left_top_sizes = desc->left_top_sizes.sub({0, 0, 0, 0}); + const auto& right_bottom_sizes = desc->right_bottom_sizes.sub({0, 0, 0, 0}); const auto& border_value = std::to_string(desc->border_value); const char* border_type_str = "unknown"; @@ -80,8 +82,8 @@ border_inst::typed_primitive_inst(network_impl& network, border_node const& node const auto input_format = 
input_layout.format; const auto& input_sizes = input_layout.size; - auto lt_sizes = argument.left_top_sizes; - auto rb_sizes = argument.right_bottom_sizes; + auto lt_sizes = argument.left_top_sizes.sub({0, 0, 0, 0}); + auto rb_sizes = argument.right_bottom_sizes.sub({0, 0, 0, 0}); auto b_type = argument.type; CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "Input format", input_format.value, "supported border primitive input formats", diff --git a/inference-engine/thirdparty/clDNN/src/broadcast.cpp b/inference-engine/thirdparty/clDNN/src/broadcast.cpp index 4113e53..d7f8738 100644 --- a/inference-engine/thirdparty/clDNN/src/broadcast.cpp +++ b/inference-engine/thirdparty/clDNN/src/broadcast.cpp @@ -30,28 +30,39 @@ primitive_type_id broadcast_type_id() layout broadcast_inst::calc_output_layout(broadcast_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for broadcast_node!"); auto input_layout = node.input().get_output_layout(); auto desc = node.get_primitive(); - auto&& new_size = tensor::max(desc->broadcast_sizes, input_layout.size); - return {input_layout.data_type, input_layout.format, new_size}; + return {input_layout.data_type, input_layout.format, desc->broadcast_sizes}; } std::string broadcast_inst::to_string(broadcast_node const& node) { - auto desc = node.get_primitive(); + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + const auto& broadcast_sizes = desc->broadcast_sizes; + const auto& broadcast_axes = desc->broadcast_axes; + auto& input = node.input(); - const auto& broadcast_sizes = desc->broadcast_sizes; + std::stringstream primitive_description; + std::stringstream ss_broadcast_axes; + + for (size_t i = 0; i < broadcast_axes.size(); ++i) + { + ss_broadcast_axes << broadcast_axes.at(i); + i != (broadcast_axes.size() - 1) ? ss_broadcast_axes << ", " : ss_broadcast_axes << ""; + } - auto node_info = node.desc_to_json(); - json_composite broadcast_info; - broadcast_info.add("broadcast sizes", broadcast_sizes.to_string()); + broadcast_info.add("input id", input.id()); + broadcast_info.add("broadcast_sizes", broadcast_sizes.to_string()); + broadcast_info.add("broadcast axes", ss_broadcast_axes.str()); node_info->add("broadcast info", broadcast_info); - - std::stringstream primitive_description; node_info->dump(primitive_description); + return primitive_description.str(); } @@ -60,23 +71,56 @@ broadcast_inst::typed_primitive_inst(network_impl& network, broadcast_node const { auto input_layout = node.input().get_output_layout(); - const auto input_format = input_layout.format; const auto& input_sizes = input_layout.size; - - auto bc_sizes = argument.broadcast_sizes; - - CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "Input format", input_format.value, "supported broadcast primitive input formats", - format::bfyx, format::yxfb, format::byxf); - - - // Check if sizes of broadcast are in proper range. - CLDNN_ERROR_TENSOR_SIZES_LESS_THAN(node.id(), "Broadcast sizes", bc_sizes, "0 value", {1, 1, 1, 1}, - "Invalid broadcast size: non-positive value"); - - bc_sizes = tensor::max(bc_sizes, input_sizes); - - // Check if sizes of broadcast are compatible with sizes of input. 
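
The deleted checks above treated broadcast_sizes as a lower bound and maxed it with the input shape; the replacement logic that follows takes broadcast_sizes as the exact output shape. The input must carry a unit extent in each of its first broadcast_axes.size() dimensions, and validation reorders the extents into bfyx so that axes named in broadcast_axes receive those unit extents while the remaining axes receive the real data extents, which the output must then divide evenly. A compact sketch of the reordering (plain ints in bfyx order, axes assumed pre-validated):

    #include <algorithm>
    #include <vector>

    // bfyx axis indices: 0=b, 1=f, 2=y, 3=x. Axes listed in broadcast_axes
    // are the replicated ones, so they consume the input's leading (unit)
    // extents; all other axes consume the trailing, real extents in order.
    std::vector<int> reorder_for_broadcast(const std::vector<int>& input_dims,
                                           const std::vector<int>& broadcast_axes) {
        std::vector<int> reordered(4, 0);
        size_t unit_idx = 0;                      // walks the leading 1s
        size_t data_idx = broadcast_axes.size();  // walks the data extents
        for (int axis = 0; axis < 4; ++axis) {
            const bool replicated = std::find(broadcast_axes.begin(),
                                              broadcast_axes.end(),
                                              axis) != broadcast_axes.end();
            reordered[axis] = input_dims[replicated ? unit_idx++ : data_idx++];
        }
        return reordered;
    }
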
- CLDNN_ERROR_TENSOR_SIZES_NOT_DIVIDABLE(node.id(), "Broadcast sizes", bc_sizes, "input sizes", input_sizes, + const auto& output_sizes = argument.broadcast_sizes; + + std::vector input_dims = {input_sizes.batch[0], input_sizes.feature[0], + input_sizes.spatial[1], input_sizes.spatial[0]}; + std::vector reordered_input_dims(4, 0); + std::set existing; + + const auto& broadcast_axes = node.get_primitive()->broadcast_axes; + size_t broadcast_axes_size = broadcast_axes.size(); + size_t index = 0; + size_t input_index = broadcast_axes_size; + + if (broadcast_axes_size > 4) + { + CLDNN_ERROR_MESSAGE(node.id(), "Incorrect parameters configuration: broadcast_axes size should be less or equal 4."); + } + for (size_t i = 0; i < broadcast_axes_size; ++i) + { + if (broadcast_axes.at(i) >= 4) + { + CLDNN_ERROR_MESSAGE(node.id(), "Incorrect parameters configuration: broadcast_axes index should be within broadcast_sizes range."); + } + if (existing.find(broadcast_axes.at(i)) != existing.end()) + { + CLDNN_ERROR_MESSAGE(node.id(), "Incorrect parameters configuration: Duplicate axes numbers was found in broadcast_axes."); + } + existing.insert(broadcast_axes.at(i)); + } + for (size_t i = 0; i < input_index; ++i) + { + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input size on dimension number " + std::to_string(i), input_dims.at(i), "", 1, "Must be equal 1."); + } + //bfyx format + for (size_t i = 0; i < 4; ++i) + { + if (std::find(broadcast_axes.begin(), broadcast_axes.end(), i) != broadcast_axes.end()) + { + reordered_input_dims.at(i) = input_dims.at(index); + ++index; + } + else + { + reordered_input_dims.at(i) = input_dims.at(input_index); + ++input_index; + } + } + tensor input_sizes_to_compare(reordered_input_dims.at(0), reordered_input_dims.at(1), reordered_input_dims.at(3), reordered_input_dims.at(2)); + + CLDNN_ERROR_TENSOR_SIZES_NOT_DIVIDABLE(node.id(), "Broadcast sizes", output_sizes, "input sizes", input_sizes_to_compare, "Invalid broadcast size: not dividable by input size"); } -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/src/caps/public/gpu_devices.inc b/inference-engine/thirdparty/clDNN/src/caps/public/gpu_devices.inc deleted file mode 100644 index 06c1554..0000000 --- a/inference-engine/thirdparty/clDNN/src/caps/public/gpu_devices.inc +++ /dev/null @@ -1,63 +0,0 @@ -/* -// Copyright (c) 2016 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - - -//HD IRIS 5XX series -GEN_DEVICE(HD510, 0x1906, HD5XX, GEN9, GT1) -GEN_DEVICE(HD520, 0x1916, HD5XX, GEN9, GT2) -GEN_DEVICE(HD515, 0x191E, HD5XX, GEN9, GT2) -GEN_DEVICE(HD530, 0x1912, HD5XX, GEN9, GT2) -GEN_DEVICE(IRIS_540_550, 0x1926, HD5XX, GEN9, GT3) -GEN_DEVICE(HD510, 0x1902, HD5XX, GEN9, GT1) -GEN_DEVICE(IRIS_PRO_580, 0x193A, HD5XX, GEN9, GT4) -GEN_DEVICE(IRIS_PRO_580, 0x193B, HD5XX, GEN9, GT4) -GEN_DEVICE(HD530, 0x191B, HD5XX, GEN9, GT2) -GEN_DEVICE(HD_P530, 0x191D, HD5XX, GEN9, GT2) -GEN_DEVICE(IRIS_PRO_P580, 0x193D, HD5XX, GEN9, GT4) - -GEN_DEVICE(HD_500, 0x5A84, HD500_505, GEN9, GT1) -GEN_DEVICE(HD_505, 0x5A85, HD500_505, GEN9, GT1) -GEN_DEVICE(Joule_570x, 0x1A84, HD500_505, GEN9, GT1) -GEN_DEVICE(Joule_550x, 0x1A85, HD500_505, GEN9, GT1) - -//HD IRIS 6XX series -GEN_DEVICE(HD610, 0x5906, HD6XX, GEN9, GT1) -GEN_DEVICE(HD620, 0x5916, HD6XX, GEN9, GT2) -GEN_DEVICE(IRIS_PLUS_640, 0x5926, HD6XX, GEN9, GT3) -GEN_DEVICE(IRIS_PLUS_650, 0x5927, HD6XX, GEN9, GT3) -GEN_DEVICE(HD615, 0x591E, HD6XX, GEN9, GT2) -GEN_DEVICE(HD610, 0x5902, HD6XX, GEN9, GT1) -GEN_DEVICE(HD630, 0x5912, HD6XX, GEN9, GT2) -GEN_DEVICE(HD630, 0x591B, HD6XX, GEN9, GT2) -GEN_DEVICE(HD_P630, 0x591D, HD6XX, GEN9, GT2) - -//8th generation -GEN_DEVICE(HD610, 0x3E90, HD6XX, GEN9, GT1) -GEN_DEVICE(HD610, 0x3E93, HD6XX, GEN9, GT1) -GEN_DEVICE(HD620, 0x3E91, HD6XX, GEN9, GT2) -GEN_DEVICE(HD620, 0x3E92, HD6XX, GEN9, GT2) -GEN_DEVICE(HD620, 0x3E96, HD6XX, GEN9, GT2) -GEN_DEVICE(HD620, 0x5917, HD6XX, GEN9, GT2) -GEN_DEVICE(HD630, 0x3EA5, HD6XX, GEN9, GT3) -GEN_DEVICE(HD630, 0x3EA6, HD6XX, GEN9, GT3) -GEN_DEVICE(HD630, 0x3EA7, HD6XX, GEN9, GT3) -GEN_DEVICE(HD630, 0x3EA8, HD6XX, GEN9, GT3) - -GEN_DEVICE(HD605, 0x3184, HD6XX, GEN9, GT2) -GEN_DEVICE(HD600 , 0x3185, HD6XX, GEN9, GT2) - - - diff --git a/inference-engine/thirdparty/clDNN/src/caps/public/gpu_enums.inc b/inference-engine/thirdparty/clDNN/src/caps/public/gpu_enums.inc deleted file mode 100644 index b811d68..0000000 --- a/inference-engine/thirdparty/clDNN/src/caps/public/gpu_enums.inc +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -GPU_CONFIGURATION(GT0, 0) -GPU_CONFIGURATION(GT1, 10) -GPU_CONFIGURATION(GT2, 20) -GPU_CONFIGURATION(GT3, 30) -GPU_CONFIGURATION(GT4, 40) -GPU_CONFIGURATION(GT_UNKNOWN, 1000) - -GPU_MODEL(HD500_505, 505) -GPU_MODEL(HD5XX, 599) -GPU_MODEL(HD6XX, 699) -GPU_MODEL(FUTURE, 10000) - -GPU_ARCHITECTURE(GEN9, 90) -GPU_ARCHITECTURE(GEN_UNKNOWN, 1000) - diff --git a/inference-engine/thirdparty/clDNN/src/caps/public/mode.inc b/inference-engine/thirdparty/clDNN/src/caps/public/mode.inc index d2f02af..965373f 100644 --- a/inference-engine/thirdparty/clDNN/src/caps/public/mode.inc +++ b/inference-engine/thirdparty/clDNN/src/caps/public/mode.inc @@ -14,6 +14,5 @@ // limitations under the License. 
*/ -bool public_caps = true; bool is_imad_supported(int) { return false; } bool is_immad_supported(int) { return false; } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/cldnn.cpp b/inference-engine/thirdparty/clDNN/src/cldnn.cpp index 2985bef..a69069a 100644 --- a/inference-engine/thirdparty/clDNN/src/cldnn.cpp +++ b/inference-engine/thirdparty/clDNN/src/cldnn.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -107,45 +107,55 @@ void cldnn_change_input_layout(cldnn_topology topology, cldnn_primitive_id id, c }); } -void cldnn_get_primitive_ids(cldnn_topology topology, char* ids, size_t size, size_t* size_ret, cldnn_status* status) +static void primitive_id_vector_to_char_array( + char* names, + size_t size, + size_t* size_ret, + cldnn_status* status, + const std::vector& vec) { - return exception_handler(CLDNN_ERROR, status, [&]() + *size_ret = std::accumulate( + std::begin(vec), + std::end(vec), + size_t(1), // final zero symbol + [](size_t acc, const cldnn::primitive_id& id) { - SHOULD_NOT_BE_NULL(topology, "Topology"); - auto ids_size = api_cast(topology)->get_primitives().size(); - SHOULD_NOT_EQUAL_0(ids_size, "Primitives number"); - auto& primitives_ids = api_cast(topology)->get_primitives_id(); - *size_ret = std::accumulate( - std::begin(primitives_ids), - std::end(primitives_ids), - size_t(1), //final zero symbol - [](size_t acc, const cldnn::primitive_id& id) - { - return acc + id.size() + 1; // plus zero symbol - }); + return acc + id.size() + 1; // plus zero symbol + }); - if (size < *size_ret) - { - if (status) *status = CLDNN_INVALID_ARG; - return; - } + if (size < *size_ret) + { + if (status) *status = CLDNN_INVALID_ARG; + return; + } - size_t i = 0; - for (auto& id : primitives_ids) - { - // workaround for Microsoft VC++ + size_t i = 0; + for (auto& id : vec) + { + // workaround for Microsoft VC++ #if defined _MSC_VER #pragma warning(push) #pragma warning(disable: 4996) #endif - i += id.copy(ids + i, size - i - 2); + i += id.copy(names + i, size - i - 2); #if defined _MSC_VER #pragma warning(pop) #endif - ids[i++] = 0; // plus zero symbol - assert(i < size); - } - ids[i] = 0; // final zero symbol + names[i++] = 0; // plus zero symbol + assert(i < size); + } + names[i] = 0; // final zero symbol +} + +void cldnn_get_primitive_ids(cldnn_topology topology, char* ids, size_t size, size_t* size_ret, cldnn_status* status) +{ + return exception_handler(CLDNN_ERROR, status, [&]() + { + SHOULD_NOT_BE_NULL(topology, "Topology"); + auto ids_size = api_cast(topology)->get_primitives().size(); + SHOULD_NOT_EQUAL_0(ids_size, "Primitives number"); + auto&& primitives_ids = api_cast(topology)->get_primitives_id(); + primitive_id_vector_to_char_array(ids, size, size_ret, status, primitives_ids); }); } @@ -206,19 +216,19 @@ cldnn_engine cldnn_create_engine(/*cldnn_engine_type*/ int32_t type, uint32_t en void cldnn_retain_engine(cldnn_engine engine, cldnn_status* status) { - exception_handler(CLDNN_ERROR, status, [&]() - { + exception_handler(CLDNN_ERROR, status, [&]() + { SHOULD_NOT_BE_NULL(engine, "Engine"); - api_cast(engine)->add_ref(); + api_cast(engine)->add_ref(); }); } void cldnn_release_engine(cldnn_engine engine, cldnn_status* status) { - exception_handler(CLDNN_ERROR, status, [&]() - { + exception_handler(CLDNN_ERROR, status, [&]() + { 
SHOULD_NOT_BE_NULL(engine, "Engine"); - api_cast(engine)->release(); + api_cast(engine)->release(); }); } @@ -296,19 +306,19 @@ CLDNN_API int32_t cldnn_is_user_event(cldnn_event event, cldnn_status * status) void cldnn_retain_event(cldnn_event event, cldnn_status* status) { - exception_handler(CLDNN_ERROR, status, [&]() - { + exception_handler(CLDNN_ERROR, status, [&]() + { SHOULD_NOT_BE_NULL(event, "Event"); - api_cast(event)->add_ref(); + api_cast(event)->add_ref(); }); } void cldnn_release_event(cldnn_event event, cldnn_status* status) { - exception_handler(CLDNN_ERROR, status, [&]() - { + exception_handler(CLDNN_ERROR, status, [&]() + { SHOULD_NOT_BE_NULL(event, "Event"); - api_cast(event)->release(); + api_cast(event)->release(); }); } @@ -447,10 +457,10 @@ void cldnn_set_network_input(cldnn_network network, cldnn_primitive_id id, cldnn { exception_handler(CLDNN_ERROR, status, [&]() { + SHOULD_NOT_BE_NULL(mem, "Mem"); auto mem_size = api_cast(mem)->size(); SHOULD_NOT_BE_NULL(network, "Network"); SHOULD_NOT_BE_NULL(id, "Id"); - SHOULD_NOT_BE_NULL(mem, "Mem"); SHOULD_NOT_EQUAL_0(mem_size, "Memory size"); api_cast(network)->set_input_data(id, *api_cast(mem)); }); @@ -466,7 +476,7 @@ void cldnn_set_learning_rate(cldnn_network network, float lr, cldnn_status* stat float cldnn_get_learning_rate(cldnn_network network, cldnn_status* status) { - return exception_handler(CLDNN_ERROR, status, 0, [&]() + return exception_handler(CLDNN_ERROR, status, 0, [&]() { return api_cast(network)->get_learning_rate(); }); @@ -485,7 +495,7 @@ cldnn_engine cldnn_get_network_engine(cldnn_network network, cldnn_status* statu cldnn_program cldnn_get_network_program(cldnn_network network, cldnn_status* status) { return exception_handler(CLDNN_ERROR, status, nullptr, [&]() - { + { SHOULD_NOT_BE_NULL(network, "Network"); refcounted_obj_ptr ptr{const_cast(&api_cast(network)->get_program())}; return api_cast(ptr.detach()); @@ -509,7 +519,7 @@ void cldnn_get_primitive_info(cldnn_network network, cldnn_primitive_id prim_id, size_t i = 0; for (const auto c : prim_info) { - info[i++] = c; + info[i++] = c; assert(i < size); } info[i] = 0; // final zero symbol @@ -520,41 +530,10 @@ void cldnn_get_network_output_names(cldnn_network network, char* names, size_t s { exception_handler(CLDNN_ERROR, status, [&]() { - auto output_size = api_cast(network)->get_output_ids().size(); - SHOULD_NOT_BE_NULL(network, "Network"); - SHOULD_NOT_EQUAL_0(output_size, "Output size"); + SHOULD_NOT_BE_NULL(network, "Network"); auto&& output_ids = api_cast(network)->get_output_ids(); - *size_ret = std::accumulate( - std::begin(output_ids), - std::end(output_ids), - size_t(1), // final zero symbol - [](size_t acc, const cldnn::primitive_id& id) - { - return acc + id.size() + 1; // plus zero symbol - }); - - if(size < *size_ret) - { - if (status) *status = CLDNN_INVALID_ARG; - return; - } - - size_t i = 0; - for(auto& id: output_ids) - { -// workaround for Microsoft VC++ -#if defined _MSC_VER -#pragma warning(push) -#pragma warning(disable: 4996) -#endif - i += id.copy(names + i, size - i - 2); -#if defined _MSC_VER -#pragma warning(pop) -#endif - names[i++] = 0; // plus zero symbol - assert(i < size); - } - names[i] = 0; // final zero symbol + SHOULD_NOT_EQUAL_0(output_ids.size(), "Output size"); + primitive_id_vector_to_char_array(names, size, size_ret, status, output_ids); }); } @@ -562,41 +541,10 @@ void cldnn_get_network_executed_primitive_names(cldnn_network network, char* nam { exception_handler(CLDNN_ERROR, status, [&]() { - auto 
primitives_size = api_cast(network)->get_executed_primitive_ids().size(); SHOULD_NOT_BE_NULL(network, "Network"); - SHOULD_NOT_EQUAL_0(primitives_size, "Primitives size"); auto&& primitive_ids = api_cast(network)->get_executed_primitive_ids(); - *size_ret = std::accumulate( - std::begin(primitive_ids), - std::end(primitive_ids), - size_t(1), // final zero symbol - [](size_t acc, const cldnn::primitive_id& id) - { - return acc + id.size() + 1; // plus zero symbol - }); - - if (size < *size_ret) - { - if (status) *status = CLDNN_INVALID_ARG; - return; - } - - size_t i = 0; - for (auto& id : primitive_ids) - { - // workaround for Microsoft VC++ -#if defined _MSC_VER -#pragma warning(push) -#pragma warning(disable: 4996) -#endif - i += id.copy(names + i, size - i - 2); -#if defined _MSC_VER -#pragma warning(pop) -#endif - names[i++] = 0; // plus zero symbol - assert(i < size); - } - names[i] = 0; // final zero symbol + SHOULD_NOT_EQUAL_0(primitive_ids.size(), "Primitives size"); + primitive_id_vector_to_char_array(names, size, size_ret, status, primitive_ids); }); } @@ -604,41 +552,10 @@ void cldnn_get_network_all_primitive_names(cldnn_network network, char* names, s { exception_handler(CLDNN_ERROR, status, [&]() { - auto primitives_size = api_cast(network)->get_all_primitive_ids().size(); SHOULD_NOT_BE_NULL(network, "Network"); - SHOULD_NOT_EQUAL_0(primitives_size, "Primitives size"); auto&& primitive_ids = api_cast(network)->get_all_primitive_ids(); - *size_ret = std::accumulate( - std::begin(primitive_ids), - std::end(primitive_ids), - size_t(1), // final zero symbol - [](size_t acc, const cldnn::primitive_id& id) - { - return acc + id.size() + 1; // plus zero symbol - }); - - if (size < *size_ret) - { - if (status) *status = CLDNN_INVALID_ARG; - return; - } - - size_t i = 0; - for (auto& id : primitive_ids) - { - // workaround for Microsoft VC++ -#if defined _MSC_VER -#pragma warning(push) -#pragma warning(disable: 4996) -#endif - i += id.copy(names + i, size - i - 2); -#if defined _MSC_VER -#pragma warning(pop) -#endif - names[i++] = 0; // plus zero symbol - assert(i < size); - } - names[i] = 0; // final zero symbol + SHOULD_NOT_EQUAL_0(primitive_ids.size(), "Primitives size"); + primitive_id_vector_to_char_array(names, size, size_ret, status, primitive_ids); }); } @@ -646,41 +563,10 @@ void cldnn_get_network_all_primitive_org_names(cldnn_network network, char* name { exception_handler(CLDNN_ERROR, status, [&]() { - auto primitives_size = api_cast(network)->get_all_primitive_org_ids().size(); SHOULD_NOT_BE_NULL(network, "Network"); - SHOULD_NOT_EQUAL_0(primitives_size, "Primitives size"); auto&& primitive_ids = api_cast(network)->get_all_primitive_org_ids(); - *size_ret = std::accumulate( - std::begin(primitive_ids), - std::end(primitive_ids), - size_t(1), // final zero symbol - [](size_t acc, const cldnn::primitive_id& id) - { - return acc + id.size() + 1; // plus zero symbol - }); - - if (size < *size_ret) - { - if (status) *status = CLDNN_INVALID_ARG; - return; - } - - size_t i = 0; - for (auto& id : primitive_ids) - { - // workaround for Microsoft VC++ -#if defined _MSC_VER -#pragma warning(push) -#pragma warning(disable: 4996) -#endif - i += id.copy(names + i, size - i - 2); -#if defined _MSC_VER -#pragma warning(pop) -#endif - names[i++] = 0; // plus zero symbol - assert(i < size); - } - names[i] = 0; // final zero symbol + SHOULD_NOT_EQUAL_0(primitive_ids.size(), "Primitives size"); + primitive_id_vector_to_char_array(names, size, size_ret, status, primitive_ids); }); } @@ -770,7 
+656,7 @@ cldnn_memory cldnn_attach_memory(cldnn_layout layout, void* pointer, size_t size return exception_handler(CLDNN_ERROR, status, nullptr, [&]() { cldnn::layout layout_obj(layout); - if (layout_obj.bytes_count() > size) + if (layout_obj.bytes_count() > size) throw std::invalid_argument("buffer size does not match layout size"); return api_cast(new cldnn::simple_attached_memory(layout_obj, pointer)); }); @@ -914,6 +800,8 @@ PRIMITIVE_TYPE_ID_CALL_IMPL(deconvolution) PRIMITIVE_TYPE_ID_CALL_IMPL(concatenation) PRIMITIVE_TYPE_ID_CALL_IMPL(eltwise) PRIMITIVE_TYPE_ID_CALL_IMPL(fully_connected) +PRIMITIVE_TYPE_ID_CALL_IMPL(fused_conv_bn_scale) +PRIMITIVE_TYPE_ID_CALL_IMPL(fused_conv_eltwise) PRIMITIVE_TYPE_ID_CALL_IMPL(input_layout) PRIMITIVE_TYPE_ID_CALL_IMPL(lookup_table) PRIMITIVE_TYPE_ID_CALL_IMPL(lrn) @@ -932,6 +820,7 @@ PRIMITIVE_TYPE_ID_CALL_IMPL(proposal) PRIMITIVE_TYPE_ID_CALL_IMPL(roi_pooling) PRIMITIVE_TYPE_ID_CALL_IMPL(prior_box) PRIMITIVE_TYPE_ID_CALL_IMPL(detection_output) +PRIMITIVE_TYPE_ID_CALL_IMPL(detection_output_sort) PRIMITIVE_TYPE_ID_CALL_IMPL(normalize) PRIMITIVE_TYPE_ID_CALL_IMPL(generic_layer) PRIMITIVE_TYPE_ID_CALL_IMPL(custom_gpu_primitive) @@ -950,3 +839,12 @@ PRIMITIVE_TYPE_ID_CALL_IMPL(tile) PRIMITIVE_TYPE_ID_CALL_IMPL(gemm) PRIMITIVE_TYPE_ID_CALL_IMPL(select) PRIMITIVE_TYPE_ID_CALL_IMPL(index_select) +PRIMITIVE_TYPE_ID_CALL_IMPL(condition) +PRIMITIVE_TYPE_ID_CALL_IMPL(pyramid_roi_align) +PRIMITIVE_TYPE_ID_CALL_IMPL(contract) +PRIMITIVE_TYPE_ID_CALL_IMPL(one_hot) +PRIMITIVE_TYPE_ID_CALL_IMPL(gather) +PRIMITIVE_TYPE_ID_CALL_IMPL(depth_to_space) +PRIMITIVE_TYPE_ID_CALL_IMPL(shuffle_channels) +PRIMITIVE_TYPE_ID_CALL_IMPL(strided_slice) +PRIMITIVE_TYPE_ID_CALL_IMPL(reverse_sequence) diff --git a/inference-engine/thirdparty/clDNN/src/concatenation.cpp b/inference-engine/thirdparty/clDNN/src/concatenation.cpp index a7e4452..7ab7643 100644 --- a/inference-engine/thirdparty/clDNN/src/concatenation.cpp +++ b/inference-engine/thirdparty/clDNN/src/concatenation.cpp @@ -29,6 +29,8 @@ primitive_type_id concatenation_type_id() layout concatenation_inst::calc_output_layout(concatenation_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for concatenation_node!"); auto desc = node.get_primitive(); auto input_layout = node.input(0).get_output_layout(); diff --git a/inference-engine/thirdparty/clDNN/src/condition.cpp b/inference-engine/thirdparty/clDNN/src/condition.cpp new file mode 100644 index 0000000..58be0cd --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/condition.cpp @@ -0,0 +1,85 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
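Stepping back to the cldnn.cpp refactor above: the consolidated primitive_id_vector_to_char_array packs ids as consecutive NUL-terminated strings plus one extra final NUL, reporting the required byte count through size_ret so callers can query first and fill second. A standalone sketch of the same packing over plain std::string ids (hypothetical pack_ids helper, no cldnn types):

#include <cstring>
#include <numeric>
#include <string>
#include <vector>

// Packs ids into buf as "id1\0id2\0...\0\0"; always returns the bytes required.
static size_t pack_ids(const std::vector<std::string>& ids, char* buf, size_t size)
{
    size_t required = std::accumulate(ids.begin(), ids.end(), size_t(1), // final zero symbol
        [](size_t acc, const std::string& id) { return acc + id.size() + 1; });
    if (size < required)
        return required; // caller must retry with a larger buffer

    size_t i = 0;
    for (const auto& id : ids)
    {
        std::memcpy(buf + i, id.data(), id.size());
        i += id.size();
        buf[i++] = 0; // per-id terminator
    }
    buf[i] = 0; // final terminator
    return required;
}

This mirrors the two-call pattern of the C API above: invoke once to learn size_ret, then again with an adequately sized buffer.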
+ + +#include "condition_inst.h" + +#include "error_handler.h" +#include "json_object.h" +#include "primitive_type_base.h" + + +namespace cldnn +{ +primitive_type_id condition_type_id() +{ + static primitive_type_base instance; + return &instance; +} +/* + The calc_output_layout method is called only when the output layout is invalidated. + That means it is called when: + 1) It has never been called. + 2) Dependency has changed output layout. + In both of these cases, we need to recalc branch_true and branch_false. + !* We can be sure that this method was called AT LEAST once during graph compilation.*! +*/ +layout condition_inst::calc_output_layout(condition_node const& node) +{ + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for condition_node!"); + node.set_branches(); + + auto branch_true_output = node.get_branch_true()->get_outputs(); + auto branch_false_output = node.get_branch_false()->get_outputs(); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Count of branch true outputs", branch_true_output.size(), "expected outputs size", 1, "Branch true should have one output."); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Count of branch false outputs", branch_false_output.size(), "expected outputs size", 1, "Branch false should have one output."); + + auto layout_true = branch_true_output.at(0)->get_output_layout(); + auto layout_false = branch_false_output.at(0)->get_output_layout(); + CLDNN_ERROR_LAYOUT_MISMATCH(node.id(), "Branch true output layout", layout_true, "branch false output layout", layout_false, "Layout of the branches should be the same."); + + return layout_true; +} + +std::string condition_inst::to_string(condition_node const& node) +{ + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + json_composite condition_info; + + node_info->add("condition info", condition_info); + + std::stringstream primitive_description; + node_info->dump(primitive_description); + return primitive_description.str(); +} + +/* +The condition primitive reuses memory with the input. +*/ +condition_inst::typed_primitive_inst(network_impl& network, condition_node const& node) + : parent(network, node) + , _net_true(node.get_program().get_engine().allocate_network(*node.get_branch_true(), true)) + , _net_false(node.get_program().get_engine().allocate_network(*node.get_branch_false(), true)) +{ + auto compare_tensor = node.compare().get_output_layout().size; + auto input_tensor = node.input().get_output_layout().size; + CLDNN_ERROR_TENSOR_SIZES_GREATER_THAN(node.id(), "Compare tensor", compare_tensor, "input tensor", input_tensor, "Compare primitive is too big."); + + auto compare_with_offset_tensor = compare_tensor + node.offset(); + CLDNN_ERROR_TENSOR_SIZES_GREATER_THAN(node.id(), "Offset with compare tensor", compare_with_offset_tensor, "input tensor", input_tensor, "Offset is too big."); + +} +} diff --git a/inference-engine/thirdparty/clDNN/src/constants_propagator.cpp b/inference-engine/thirdparty/clDNN/src/constants_propagator.cpp deleted file mode 100644 index 2a6cdad..0000000 --- a/inference-engine/thirdparty/clDNN/src/constants_propagator.cpp +++ /dev/null @@ -1,114 +0,0 @@ -/* -// Copyright (c) 2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License.
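The new condition primitive above enforces two invariants in calc_output_layout: each branch nets exactly one output, and both outputs carry identical layouts, which then becomes the condition's own output layout. A minimal sketch of that contract over a simplified layout struct (hypothetical, not the cldnn layout type):

#include <stdexcept>
#include <vector>

struct simple_layout
{
    int b, f, y, x;
    bool operator==(const simple_layout& o) const
    { return b == o.b && f == o.f && y == o.y && x == o.x; }
};

// Returns the common output layout of an if/else pair, enforcing the
// single-output and layout-equality rules from condition_inst::calc_output_layout.
static simple_layout merge_branch_layouts(const std::vector<simple_layout>& branch_true,
                                          const std::vector<simple_layout>& branch_false)
{
    if (branch_true.size() != 1 || branch_false.size() != 1)
        throw std::invalid_argument("each branch should have one output");
    if (!(branch_true.front() == branch_false.front()))
        throw std::invalid_argument("layout of the branches should be the same");
    return branch_true.front();
}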
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "constants_propagator.h" -#include "engine_impl.h" -#include "program_impl.h" -#include "network_impl.h" -#include "memory_impl.h" - -#include "api/CPP/input_layout.hpp" - -using namespace cldnn; - -constants_propagator::constants_propagator(program_impl::ptr program) : prog(program) -{ -} - -void constants_propagator::visit_node(program_node& node) -{ - if (node.is_constant()) - handle_constant(node); -} - -std::list> constants_propagator::calculate() -{ - if (!has_non_trivial_constants) - return{}; - - build_options bo; - bo.set_option(build_option::optimize_data(false)); - bo.set_option(build_option::outputs(const_outputs)); - network_impl::ptr net = prog->get_engine().build_network(tpl, bo, true); - for (auto& cin : const_inputs) - net->set_input_data(cin->id(), cin->get_attached_memory()); - - net->execute({}); - net->reset_execution(true); //wait for computations to complete - auto outputs = net->get_outputs(); - - std::list> ret; - for (auto& out : outputs) - ret.push_back({ out->id(), &out->output_memory() }); - - return ret; -} - -void constants_propagator::handle_constant(program_node& node) -{ - if (!node.is_type()) - { - add_constant(node); - if (node.has_non_const_user()) - const_outputs.push_back(node.id()); - } -} - -void constants_propagator::add_constant(program_node& node) -{ - if (node.is_type()) - return; - - tpl.add(node.desc); - has_non_trivial_constants = true; - - //if a node is either an endpoint or an output, always add it as an output - if (node.is_endpoint() || node.is_output()) - const_outputs.push_back(node.id()); - - //if a non-tirivial constant has a trivial input, add this input as an input for our network - add_deps_to_tpl(node.get_dependencies()); -} - -void constants_propagator::add_deps_to_tpl(const std::vector& deps) -{ - /* - Nodes can share dependencies, if we already have dep in tpl, don't add it again. - example: - C <--- shared dep - / \ - / \ - A B - */ - for (auto& dep : deps) - { - if (dep->is_type()) - { - if (is_already_in_tpl(dep->id())) continue; - tpl.add(std::make_shared(dep->id(), dep->as().get_primitive()->mem.get_layout())); - const_inputs.push_back(&dep->as()); - } - } -} - -bool constants_propagator::is_already_in_tpl(const primitive_id& id) -{ - for (auto const& id_in_tpl : tpl.get_primitives_id()) - { - if (id == id_in_tpl) return true; - } - return false; -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/contract.cpp b/inference-engine/thirdparty/clDNN/src/contract.cpp new file mode 100644 index 0000000..020f404 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/contract.cpp @@ -0,0 +1,130 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
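The new contract primitive added below drops the dimensions named in reduction_axes and right-aligns the surviving ones, padding the front with 1s. A standalone sketch of that shape rule (plain ints in bfyx order, hypothetical contract_shape helper), shown here ahead of the hunk:

#include <algorithm>
#include <vector>

// Shape rule of contract_inst::calc_output_layout below: drop reduced dims,
// right-align the rest, pad the front with 1s.
static std::vector<int> contract_shape(const std::vector<int>& input_dims,     // {b, f, y, x}
                                       const std::vector<int>& reduction_axes)
{
    std::vector<int> out(4, 0);
    int cur_dim = 3;
    for (int i = 3; i >= 0; --i)
    {
        // Skip over dims that are being reduced away.
        while (cur_dim >= 0 && std::find(reduction_axes.begin(), reduction_axes.end(), cur_dim) != reduction_axes.end())
            --cur_dim;
        out.at(i) = cur_dim >= 0 ? input_dims.at(cur_dim--) : 1;
    }
    return out;
}
// Example: input {2, 3, 4, 5} with reduction_axes {0, 2} yields {1, 1, 3, 5}.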
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "contract_inst.h" + +#include "error_handler.h" +#include "json_object.h" +#include "primitive_type_base.h" + + +namespace cldnn +{ + primitive_type_id contract_type_id() + { + static primitive_type_base instance; + return &instance; + } + + layout contract_inst::calc_output_layout(contract_node const& node) + { + auto input_layout = node.input().get_output_layout(); + const auto& input_sizes = input_layout.size; + auto desc = node.get_primitive(); + auto reduction_axes = desc->reduction_axes; + + std::vector input_dims = { input_sizes.batch[0], input_sizes.feature[0], + input_sizes.spatial[1], input_sizes.spatial[0] }; + std::vector output_sizes(4, 0); + int cur_dim = 3; + for (int i = 3; i >= 0; --i) + { + while (std::find(reduction_axes.begin(), reduction_axes.end(), cur_dim) != reduction_axes.end() && cur_dim >= 0) + --cur_dim; + output_sizes.at(i) = cur_dim >= 0 ? input_dims.at(cur_dim--) : 1; + } + + return { input_layout.data_type, input_layout.format, cldnn::tensor(output_sizes[0], output_sizes[1], output_sizes[3], output_sizes[2]) }; + } + + std::string contract_inst::to_string(contract_node const& node) + { + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + const auto& reduction_axes = desc->reduction_axes; + auto& input = node.input(); + + std::stringstream primitive_description; + std::stringstream ss_reduction_axes; + + for (size_t i = 0; i < reduction_axes.size(); ++i) + { + ss_reduction_axes << reduction_axes.at(i); + i != (reduction_axes.size() - 1) ? 
ss_reduction_axes << ", " : ss_reduction_axes << ""; + } + + std::string str_mode; + switch (desc->mode) + { + case contract_mode::sum: + str_mode = "sum"; + break; + case contract_mode::prod: + str_mode = "product"; + break; + case contract_mode::all: + str_mode = "all"; + break; + case contract_mode::any: + str_mode = "any"; + break; + case contract_mode::max: + str_mode = "max"; + break; + default: + str_mode = "not supported mode"; + break; + } + + json_composite contract_info; + contract_info.add("input id", input.id()); + contract_info.add("mode", str_mode); + contract_info.add("reduction axes", ss_reduction_axes.str()); + + node_info->add("contract info", contract_info); + node_info->dump(primitive_description); + + return primitive_description.str(); + } + + contract_inst::typed_primitive_inst(network_impl& network, contract_node const& node) + : parent(network, node) + { + std::set existing; + const auto& reduction_axes = node.get_primitive()->reduction_axes; + size_t reduction_axes_size = reduction_axes.size(); + + if (reduction_axes.empty()) + { + CLDNN_ERROR_MESSAGE(node.id(), "Incorrect parameters configuration: reduction_axes should not be empty."); + } + if (reduction_axes_size > 4) + { + CLDNN_ERROR_MESSAGE(node.id(), "Incorrect parameters configuration: reduction_axes size should be less than or equal to 4."); + } + for (size_t i = 0; i < reduction_axes_size; ++i) + { + if (reduction_axes.at(i) >= 4) + { + CLDNN_ERROR_MESSAGE(node.id(), "Incorrect parameters configuration: reduction_axes index should be within reduction_axes range."); + } + if (existing.find(reduction_axes.at(i)) != existing.end()) + { + CLDNN_ERROR_MESSAGE(node.id(), "Incorrect parameters configuration: Duplicate axes numbers were found in reduction_axes."); + } + existing.insert(reduction_axes.at(i)); + } + } +} diff --git a/inference-engine/thirdparty/clDNN/src/convolution.cpp b/inference-engine/thirdparty/clDNN/src/convolution.cpp index cdb6ff2..fcdda7f 100644 --- a/inference-engine/thirdparty/clDNN/src/convolution.cpp +++ b/inference-engine/thirdparty/clDNN/src/convolution.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2018 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -31,6 +31,8 @@ primitive_type_id convolution_type_id() layout convolution_inst::calc_output_layout(convolution_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for convolution_node!"); auto desc = node.get_primitive(); auto input_layout = node.input().get_output_layout(); @@ -103,8 +105,16 @@ layout convolution_inst::calc_output_layout(convolution_node const& node) auto output_range = calc_sliding_window_output_range( input_layout.size, filter_size, input_offset, stride, dilation, true, 1); - tensor output_size(input_layout.size.batch[0], number_of_features, - output_range.spatial[0], output_range.spatial[1]); + tensor::value_type output_features = desc->output_size.feature[0] != 0 ?
desc->output_size.feature[0] : number_of_features; + tensor output_size = tensor(input_layout.size.batch[0], output_features, + output_range.spatial[0], output_range.spatial[1]); + + // due to performance reasons, when using fs_bs_yx_bsv4_fsv32 the first convolution has 3 features, so the first conv layer will take byxf and return fs_bs_yx_bsv4_fsv32 + if (input_layout.data_type == data_types::i8 && input_layout.format == format::byx8_f4 && input_layout.size.batch[0] % 4 == 0 && input_layout.size.feature[0] == 3) + { + return layout{ input_layout.data_type, cldnn::format::fs_bs_yx_bsv4_fsv32, output_size }; + } + return { input_layout.data_type, input_layout.format, output_size }; } @@ -122,6 +132,8 @@ std::string convolution_inst::to_string(convolution_node const& node) json_composite conv_info; conv_info.add("stride", strd.to_string()); conv_info.add("input offset", desc->input_offset.to_string()); + conv_info.add("padding above", desc->padding_above.to_string()); + conv_info.add("padding below", desc->padding_below.to_string()); conv_info.add("split", split); conv_info.add("dilation", dilation.to_string()); conv_info.add("with activation", activation); @@ -148,8 +160,8 @@ convolution_inst::typed_primitive_inst(network_impl& network, convolution_node c auto output_inst = node.get_output_layout(); auto output_size = output_inst.size; - CLDNN_ERROR_NOT_EQUAL(node.id(), "Input number of dimensions", input_inst.size.raw.size(), "output number of dimensions", output_inst.size.raw.size(), "Input/output dims mismtach"); - CLDNN_ERROR_NOT_EQUAL(node.id(), "Stride number of dimensions", stride.raw.size(), "output number of dimensions", output_inst.size.raw.size(), "stride/output dims mismtach"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input number of dimensions", input_inst.size.raw.size(), "output number of dimensions", output_inst.size.raw.size(), "Input/output dims mismatch"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Stride number of dimensions", stride.raw.size(), "output number of dimensions", output_inst.size.raw.size(), "stride/output dims mismatch"); auto split = node.get_split(); for (decltype(split) j = 0; j < split; j++) @@ -162,18 +174,24 @@ convolution_inst::typed_primitive_inst(network_impl& network, convolution_node c CLDNN_ERROR_NOT_EQUAL(node.id(), "Bias feature[0]", bias_inst.size.feature[0], "expected size of feature", 1, "Biases isn't 1D vector."); CLDNN_ERROR_NOT_EQUAL(node.id(), "Bias spatial[1]", bias_inst.size.spatial[1], "expected size of spatial[1]", 1, "Biases isn't 1D vector."); - CLDNN_ERROR_NOT_EQUAL(node.id(), "Bias spatial[0]", bias_inst.size.spatial[0], "expected feature map number", output_size.feature[0] / split, "Bias/fm mismtach"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Bias spatial[0]", bias_inst.size.spatial[0], "expected feature map number", output_size.feature[0] / split, "Bias/fm mismatch"); } auto input_offset = argument.input_offset; - CLDNN_ERROR_NOT_EQUAL(node.id(), "Weights number of dimensions", filter_inst.size.raw.size(), "output number of dimensions", output_inst.size.raw.size(), "Weights/output dims mismtach"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Weights number of dimensions", filter_inst.size.raw.size(), "output number of dimensions", output_inst.size.raw.size(), "Weights/output dims mismatch"); CLDNN_ERROR_NOT_EQUAL(node.id(), "Convolution padding mode", node.get_output_layout().data_padding.filling_value(), "padding value", 0.0f, "Unknown padding mode."); - CLDNN_ERROR_NOT_EQUAL(node.id(), "Input offset number of dimensions", input_offset.raw.size(), "input number
of dimensions", input_inst.size.raw.size(), "Input offset/ input size mismtach"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input offset number of dimensions", input_offset.raw.size(), "input number of dimensions", input_inst.size.raw.size(), "Input offset/ input size mismatch"); CLDNN_ERROR_NOT_EQUAL(node.id(), "Output feature size", output_size.feature.size(), "expected feature size", 1, "Only one-dimensional features are supported"); CLDNN_ERROR_NOT_EQUAL(node.id(), "Output batch size", output_size.batch.size(), "expected output size", 1, "Only one-dimensional batch size are supported"); CLDNN_ERROR_NOT_EQUAL(node.id(), "Weights spatial size", filter_inst.size.spatial.size(), "expected weights spatial size", 2, "Weights have to have 2 dimensions in spatial domain."); - CLDNN_ERROR_LESS_THAN(node.id(), "Weights feature maps number", (input_inst.size.feature[0] - input_offset.feature[0]) / split, "input feature maps number", filter_inst.size.feature[0], "Weights/ifm mismtach"); + CLDNN_ERROR_LESS_THAN(node.id(), "Weights feature maps number", (input_inst.size.feature[0] - input_offset.feature[0]) / split, "input feature maps number", filter_inst.size.feature[0], "Weights/ifm mismatch"); + if (filter_inst.format == format::bf_lyx_yx) // local convolution + { + auto local = filter_inst.size.local; + CLDNN_ERROR_NOT_EQUAL(node.id(), "Number of local x dimension", local[0], "output x dimension", output_inst.size.spatial[0], "Weights/output dims mismatch"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Number of local y dimension", local[1], "output y dimension", output_inst.size.spatial[1], "Weights/output dims mismatch"); + } } } } diff --git a/inference-engine/thirdparty/clDNN/src/convolution_grad_weights.cpp b/inference-engine/thirdparty/clDNN/src/convolution_grad_weights.cpp index e8d7116..90be77c 100644 --- a/inference-engine/thirdparty/clDNN/src/convolution_grad_weights.cpp +++ b/inference-engine/thirdparty/clDNN/src/convolution_grad_weights.cpp @@ -31,9 +31,16 @@ primitive_type_id convolution_grad_weights_type_id() layout convolution_grad_weights_inst::calc_output_layout(convolution_grad_weights_node const& node) { - //output buffer will not be used in this primitive + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for " + "convolution_grad_weights_node!"); + //output buffer will not be used in this primitive unless output gradient weights is set auto input_grad_layout_size = node.input(0).get_output_layout(); - return{ input_grad_layout_size.data_type, input_grad_layout_size.format, { 1, 1, 1, 1 } }; + tensor output_sizes = { 1, 1, 1, 1 }; + if (node.output_grad_w()) + output_sizes = node.weights().get_output_layout().size; + + return{ input_grad_layout_size.data_type, input_grad_layout_size.format, output_sizes }; } std::string convolution_grad_weights_inst::to_string(convolution_grad_weights_node const& node) diff --git a/inference-engine/thirdparty/clDNN/src/crop.cpp b/inference-engine/thirdparty/clDNN/src/crop.cpp index 01c2e2d..e8463fa 100644 --- a/inference-engine/thirdparty/clDNN/src/crop.cpp +++ b/inference-engine/thirdparty/clDNN/src/crop.cpp @@ -30,23 +30,54 @@ primitive_type_id crop_type_id() layout crop_inst::calc_output_layout(crop_node const& node) { - auto input_layout = node.input().get_output_layout(); - auto result = layout({ input_layout.data_type, input_layout.format, node.get_primitive()->reference_input }); - return result; + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing 
is not supported for crop_node!"); + const auto& ref_in_sizes = node.get_primitive()->reference_input; + const auto in_layout = node.input().get_output_layout(); + const auto& in_sizes = in_layout.size; + const auto& offsets = node.get_primitive()->offsets; + + // Check for borders variant of crop. + if (ref_in_sizes.batch[0] < 0 || ref_in_sizes.feature[0] < 0 || + ref_in_sizes.spatial[0] < 0 || ref_in_sizes.spatial[1] < 0) + { + // Ignore not supported dimensions. + const auto rb_sizes = ref_in_sizes.negate().sub({0, 0, 0, 0}); + const auto lt_sizes = offsets.sub({0, 0, 0, 0}); + + const auto out_sizes = in_sizes - (rb_sizes + lt_sizes); + + return layout({in_layout.data_type, in_layout.format, out_sizes}); + } + return layout({in_layout.data_type, in_layout.format, ref_in_sizes}); } std::string crop_inst::to_string(crop_node const& node) { - auto desc = node.get_primitive(); - auto offsets = desc->offsets; + const auto& desc = node.get_primitive(); + auto ref_in_sizes = desc->reference_input; + const auto& offsets = desc->offsets; + const auto in_layout = node.input().get_output_layout(); + const auto& in_sizes = in_layout.size; + auto node_info = node.desc_to_json(); - auto ref_input = desc->reference_input; - + + // Check for borders variant of crop. + if (ref_in_sizes.batch[0] < 0 || ref_in_sizes.feature[0] < 0 || + ref_in_sizes.spatial[0] < 0 || ref_in_sizes.spatial[1] < 0) + { + // Ignore not supported dimensions. + const auto rb_sizes = ref_in_sizes.negate().sub({0, 0, 0, 0}); + const auto lt_sizes = offsets.sub({0, 0, 0, 0}); + + ref_in_sizes = in_sizes - (rb_sizes + lt_sizes); + } + std::stringstream primitive_description; json_composite crop_info; - crop_info.add("reference input", ref_input.to_string()); - crop_info.add("offset", offsets.to_string()); + crop_info.add("reference input size", ref_in_sizes.to_string()); + crop_info.add("offset", offsets.to_string()); node_info->add("crop info", crop_info); node_info->dump(primitive_description); @@ -55,23 +86,39 @@ std::string crop_inst::to_string(crop_node const& node) } crop_inst::typed_primitive_inst(network_impl& network, crop_node const& node) - :parent(network, node) + : parent(network, node) { - auto reference_input_sizes = argument.reference_input; - auto inp_layout = node.input().get_output_layout(); - auto input_sizes = inp_layout.size; - auto input_format = inp_layout.format; - auto offsets = argument.offsets; + const auto& ref_in_sizes = argument.reference_input; + const auto in_layout = node.input().get_output_layout(); + const auto& in_sizes = in_layout.size; + const auto in_format = in_layout.format; + const auto& offsets = argument.offsets; + + CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "Input format", in_format.value, "supported crop input formats", format::yxfb, format::bfyx, format::fyxb); + + // Check for borders variant of crop. + if (ref_in_sizes.batch[0] < 0 || ref_in_sizes.feature[0] < 0 || + ref_in_sizes.spatial[0] < 0 || ref_in_sizes.spatial[1] < 0) + { + // Ignore not supported dimensions. 
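The borders variant introduced above treats a negative reference_input as right/bottom border sizes and offsets as left/top ones; the remaining constructor checks continue below. A minimal standalone sketch of the resulting shape arithmetic (plain ints; the sub({0, 0, 0, 0}) masking of unsupported dimensions is omitted):

#include <array>
#include <cstddef>

// Borders variant of crop: out = in - (left_top + right_bottom), where
// right_bottom comes from negating the negative reference_input.
static std::array<int, 4> crop_border_shape(const std::array<int, 4>& in_sizes,
                                            const std::array<int, 4>& neg_reference_input,
                                            const std::array<int, 4>& offsets)
{
    std::array<int, 4> out{};
    for (size_t i = 0; i < 4; ++i)
    {
        int rb = -neg_reference_input[i]; // negate() in the patch
        int lt = offsets[i];
        out[i] = in_sizes[i] - (rb + lt);
    }
    return out;
}
// Example: in {1, 3, 10, 10}, reference_input {0, 0, -2, -2}, offsets {0, 0, 1, 1}
// gives an output of {1, 3, 7, 7}.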
+ const auto rb_sizes = ref_in_sizes.negate().sub({0, 0, 0, 0}); + const auto lt_sizes = offsets.sub({0, 0, 0, 0}); + + const auto out_sizes = in_sizes - (rb_sizes + lt_sizes); - CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "Input format", input_format.value, "supported crop input formats", format::yxfb, format::bfyx ); + CLDNN_ERROR_TENSOR_SIZES_LESS_THAN(node.id(), "Left/top/lower borders", lt_sizes, "0 value", {}, "Invalid border size: negative"); + CLDNN_ERROR_TENSOR_SIZES_LESS_THAN(node.id(), "Right/bottom/upper borders", rb_sizes, "0 value", {}, "Invalid border size: negative"); + + CLDNN_ERROR_TENSOR_SIZES_LESS_THAN(node.id(), "Input sizes - border sizes", out_sizes, "1 value", {1, 1, 1, 1}, "Invalid border sizes: greater-equal input sizes"); + } //check if output sizes matches reference input sizes - CLDNN_ERROR_TENSOR_SIZES_GREATER_THAN(node.id(), "Reference input", reference_input_sizes, "input sizes", input_sizes, "Reference input tensor/ input tensor mismtach"); - + CLDNN_ERROR_TENSOR_SIZES_GREATER_THAN(node.id(), "Reference input", ref_in_sizes, "input sizes", in_sizes, "Reference input tensor/ input tensor mismatch"); + //check if offsets do not extend input sizes and if match the output sizes - CLDNN_ERROR_TENSOR_SIZES_LESS_THAN(node.id(), "Batch offsets", offsets, "0 value", { 0, 0, 0, 0 }, "Invalid Batch offset: negative value"); - auto input_size_sub_offsets = input_sizes - offsets; - CLDNN_ERROR_TENSOR_SIZES_LESS_THAN(node.id(), "input sizes - offsets", input_size_sub_offsets, "reference input sizes", reference_input_sizes, "Invalid Batch offset: exceeds data for output!"); + CLDNN_ERROR_TENSOR_SIZES_LESS_THAN(node.id(), "Batch offsets", offsets, "0 value", {}, "Invalid Batch offset: negative value"); + auto input_size_sub_offsets = in_sizes - offsets; + CLDNN_ERROR_TENSOR_SIZES_LESS_THAN(node.id(), "input sizes - offsets", input_size_sub_offsets, "reference input sizes", ref_in_sizes, "Invalid Batch offset: exceeds data for output!"); if (node.can_be_optimized()) { @@ -96,4 +143,4 @@ void crop_inst::reuse_input() { _output = _network.get_engine().reinterpret_buffer(input_memory(), node.get_output_layout()); } -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/src/data.cpp b/inference-engine/thirdparty/clDNN/src/data.cpp index cccb4e9..7b7702e 100644 --- a/inference-engine/thirdparty/clDNN/src/data.cpp +++ b/inference-engine/thirdparty/clDNN/src/data.cpp @@ -48,6 +48,7 @@ data_node::typed_program_node(const std::shared_ptr dprim, program_impl& p : parent(dprim, prog), mem(api_cast(dprim->mem.get())) { constant = true; + can_share_buffer(false); recalc_output_layout(false); } diff --git a/inference-engine/thirdparty/clDNN/src/deconvolution.cpp b/inference-engine/thirdparty/clDNN/src/deconvolution.cpp index 563ff8f..6c7dad9 100644 --- a/inference-engine/thirdparty/clDNN/src/deconvolution.cpp +++ b/inference-engine/thirdparty/clDNN/src/deconvolution.cpp @@ -31,6 +31,8 @@ primitive_type_id deconvolution_type_id() layout deconvolution_inst::calc_output_layout(deconvolution_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for deconvolution_node!"); auto desc = node.get_primitive(); auto input_layout = node.input().get_output_layout(); diff --git a/inference-engine/thirdparty/clDNN/src/depth_to_space.cpp b/inference-engine/thirdparty/clDNN/src/depth_to_space.cpp new file mode 100644 index 0000000..7c0b5f0 --- /dev/null +++ 
b/inference-engine/thirdparty/clDNN/src/depth_to_space.cpp @@ -0,0 +1,78 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "depth_to_space_inst.h" + +#include "primitive_type_base.h" +#include "error_handler.h" +#include "json_object.h" + +namespace cldnn +{ +primitive_type_id depth_to_space_type_id() +{ + static primitive_type_base instance; + return &instance; +} + +layout depth_to_space_inst::calc_output_layout(depth_to_space_node const& node) +{ + auto desc = node.get_primitive(); + + auto input_layout = node.input(0).get_output_layout(); + auto input_format = input_layout.format; + + const size_t block_size = desc->block_size; + + if (block_size < 2) + CLDNN_ERROR_MESSAGE(node.id(), "Invalid depthToSpace block_size value (should equal at least two). Actual block size is" + + std::to_string(block_size)); + + if (input_layout.size.feature[0] % (block_size * block_size) != 0) + CLDNN_ERROR_MESSAGE(node.id(), "The depth of the input tensor must be divisible by squared block size. Actual block size is " + + std::to_string(block_size)); + + const size_t feature = input_layout.size.feature[0] / block_size / block_size; + const size_t y = input_layout.size.spatial[1] * block_size; + const size_t x = input_layout.size.spatial[0] * block_size; + + return layout{input_layout.data_type, input_format, tensor(TensorValue(input_layout.size.batch[0]), TensorValue(feature), TensorValue(x), TensorValue(y))}; +} + +std::string depth_to_space_inst::to_string(depth_to_space_node const& node) +{ + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + auto& input = node.input(); + + std::stringstream primitive_description; + + json_composite depth_to_space_info; + depth_to_space_info.add("input id", input.id()); + depth_to_space_info.add("block size", desc->block_size); + + node_info->add("depth_to_space info", depth_to_space_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +depth_to_space_inst::typed_primitive_inst(network_impl& network, depth_to_space_node const& node) + : parent(network, node) +{ +} + +} diff --git a/inference-engine/thirdparty/clDNN/src/detection_output.cpp b/inference-engine/thirdparty/clDNN/src/detection_output.cpp index e8fa392..4d121df 100644 --- a/inference-engine/thirdparty/clDNN/src/detection_output.cpp +++ b/inference-engine/thirdparty/clDNN/src/detection_output.cpp @@ -30,16 +30,48 @@ primitive_type_id detection_output_type_id() layout detection_output_inst::calc_output_layout(detection_output_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for " + "detection_output_node!"); CLDNN_ERROR_NOT_EQUAL(node.id(), "Detection output layer input number", node.get_dependencies().size(), "expected number of inputs", static_cast(3), ""); auto input_layout = node.location().get_output_layout(); // Batch size and feature size are 1. 
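The new depth_to_space primitive above moves channel data into the spatial dims: features shrink by block_size squared while y and x each grow by block_size. A minimal sketch of that shape rule (hypothetical dts_shape struct, not the cldnn layout type):

#include <stdexcept>

struct dts_shape { int b, f, y, x; };

// Output shape of depth_to_space: f / bs^2, y * bs, x * bs (batch unchanged).
static dts_shape depth_to_space_shape(const dts_shape& in, int block_size)
{
    if (block_size < 2)
        throw std::invalid_argument("block_size should equal at least two");
    if (in.f % (block_size * block_size) != 0)
        throw std::invalid_argument("input depth must be divisible by squared block size");
    return { in.b, in.f / (block_size * block_size), in.y * block_size, in.x * block_size };
}
// Example: {1, 8, 3, 4} with block_size 2 becomes {1, 2, 6, 8}.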
- // Number of bounding boxes to be kept is set to keep_top_k*batch size. - // If number of detections is lower than keep_top_k, will write dummy results at the end with image_id=-1. + // Number of bounding boxes to be kept is set to keep_top_k*batch size. + // If number of detections is lower than top_k, will write dummy results at the end with image_id=-1. // Each row is a 7 dimension vector, which stores: // [image_id, label, confidence, xmin, ymin, xmax, ymax] - return{ input_layout.data_type, cldnn::format::bfyx, cldnn::tensor(1, 1, DETECTION_OUTPUT_ROW_SIZE, node.get_primitive()->keep_top_k * input_layout.size.batch[0]) }; + int output_size = (int)input_layout.get_linear_size() / PRIOR_BOX_SIZE; + int num_classes = node.get_primitive()->num_classes; + + if (node.get_primitive()->share_location) + { + num_classes = (node.get_primitive()->background_label_id == 0) ? node.get_primitive()->num_classes - 1 : node.get_primitive()->num_classes; + output_size *= num_classes; + } + + if (node.get_primitive()->top_k != -1) + { + int top_k = node.get_primitive()->top_k * num_classes * input_layout.size.batch[0]; + if (top_k < output_size) + { + output_size = top_k; + } + } + + output_size *= DETECTION_OUTPUT_ROW_SIZE; + // Add space for number of output results per image - needed in the next detection output step + output_size += ((input_layout.size.batch[0] + 15) / 16) * 16; + + if (node.get_program().get_options().get()->enabled()) + { + return{ input_layout.data_type, cldnn::format::bfyx, cldnn::tensor(1, 1, 1, output_size) }; + } + else + { + return{ input_layout.data_type, cldnn::format::bfyx, cldnn::tensor(1, 1, DETECTION_OUTPUT_ROW_SIZE, node.get_primitive()->keep_top_k * input_layout.size.batch[0]) }; + } } std::string detection_output_inst::to_string(detection_output_node const& node) @@ -50,12 +82,13 @@ std::string detection_output_inst::to_string(detection_output_node const& node) auto variance_encoded = desc->variance_encoded_in_target ? "true" : "false"; auto prior_is_normalized = desc->prior_is_normalized ? "true" : "false"; auto decrease_label_id = desc->decrease_label_id ? "true" : "false"; - auto clip = desc->clip ? "true" : "false"; + auto clip_before_nms = desc->clip_before_nms ? "true" : "false"; + auto clip_after_nms = desc->clip_after_nms ? "true" : "false"; auto& input_location = node.location(); auto& input_prior_box = node.prior_box(); auto& input_confidence = node.confidence(); - + std::stringstream primitive_description; std::string str_code_type; @@ -74,7 +107,7 @@ std::string detection_output_inst::to_string(detection_output_node const& node) str_code_type = "not supported code type"; break; } - + json_composite detec_out_info; detec_out_info.add("input location id", input_location.id()); detec_out_info.add("input confidence id", input_confidence.id()); @@ -95,7 +128,8 @@ std::string detection_output_inst::to_string(detection_output_node const& node) detec_out_info.add("input_width", desc->input_width); detec_out_info.add("input_height", desc->input_height); detec_out_info.add("decrease_label_id", decrease_label_id); - detec_out_info.add("clip", clip); + detec_out_info.add("clip_before_nms", clip_before_nms); + detec_out_info.add("clip_after_nms", clip_after_nms); detec_out_info.dump(primitive_description); node_info->add("dection output info", detec_out_info); @@ -125,11 +159,77 @@ detection_output_inst::typed_primitive_inst(network_impl& network, detection_out auto desc = node.get_primitive(); int prior_feature_size = desc->variance_encoded_in_target ? 
1 : 2; tensor prior_box_size = prior_box_layout.size; - CLDNN_ERROR_NOT_EQUAL(node.id(), "Prior box batch size", prior_box_size.batch[0], "expected value", 1, ""); CLDNN_ERROR_NOT_EQUAL(node.id(), "Prior box spatial X", prior_box_size.spatial[0], "expected value", 1, ""); CLDNN_ERROR_NOT_EQUAL(node.id(), "Prior box feature size", prior_box_size.feature[0], "expected value", prior_feature_size, ""); CLDNN_ERROR_BOOL(node.id(), "Detection output layer padding", node.is_padded(), "Detection output layer doesn't support output padding."); CLDNN_ERROR_BOOL(node.id(), "Detection output layer Prior-box input padding", node.get_dependency(2).is_padded(), "Detection output layer doesn't support input padding in Prior-Box input"); } + +/************************ Detection Output keep_top_k part ************************/ + +primitive_type_id detection_output_sort_type_id() +{ + static primitive_type_base instance; + return &instance; +} + +layout detection_output_sort_inst::calc_output_layout(detection_output_sort_node const& node) +{ + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for " + "detection_output_sort_node!"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Detection output layer input number", node.get_dependencies().size(), "expected number of inputs", static_cast(1), ""); + + auto input_layout = node.input().get_output_layout(); + int keep_top_k = node.as().get_primitive()->keep_top_k; + int num_images = node.as().get_primitive()->num_images; + + // If detection output sort is used as the second part of detection output, get proper info from the detection output node + if (num_images == 0) + { + CLDNN_ERROR_BOOL(node.id(), "node.get_dependency(0).is_type()", !node.get_dependency(0).is_type(), "Cannot calculate output layout."); + input_layout = node.get_dependency(0).as().location().get_output_layout(); + keep_top_k = node.get_dependency(0).as().get_primitive()->keep_top_k; + num_images = input_layout.size.batch[0]; + } + // Batch size and feature size are 1. + // Number of bounding boxes to be kept is set to keep_top_k*batch size. + // If number of detections is lower than keep_top_k, will write dummy results at the end with image_id=-1.
+ // Each row is a 7 dimension vector, which stores: + // [image_id, label, confidence, xmin, ymin, xmax, ymax] + return{ input_layout.data_type, cldnn::format::bfyx, cldnn::tensor(1, 1, DETECTION_OUTPUT_ROW_SIZE, keep_top_k * num_images) }; +} + +std::string detection_output_sort_inst::to_string(detection_output_sort_node const& node) +{ + auto node_info = node.desc_to_json(); + auto desc = node.get_primitive(); + + auto& input_bboxes = node.input(); + + std::stringstream primitive_description; + + json_composite detec_out_info; + detec_out_info.add("input bboxes id", input_bboxes.id()); + detec_out_info.add("num_images:", desc->num_images); + detec_out_info.add("num_classes:", desc->num_classes); + detec_out_info.add("keep_top_k", desc->keep_top_k); + detec_out_info.add("share_location", desc->share_location); + detec_out_info.add("top_k", desc->top_k); + detec_out_info.dump(primitive_description); + + node_info->add("detection output info", detec_out_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +detection_output_sort_inst::typed_primitive_inst(network_impl& network, detection_output_sort_node const& node) + :parent(network, node) +{ + CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "Input memory format", node.get_dependency(0).get_output_layout().format.value, "expected bfyx input format", format::bfyx); + + CLDNN_ERROR_BOOL(node.id(), "Detection output layer padding", node.is_padded(), "Detection output layer doesn't support output padding."); +} } diff --git a/inference-engine/thirdparty/clDNN/src/eltwise.cpp b/inference-engine/thirdparty/clDNN/src/eltwise.cpp index 1ee22cc..2a6835b 100644 --- a/inference-engine/thirdparty/clDNN/src/eltwise.cpp +++ b/inference-engine/thirdparty/clDNN/src/eltwise.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License.
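Looking back at the first-stage detection_output sizing above: the box-slot count is derived from the location input, multiplied by the shared-location class count where applicable, capped by top_k, expanded to 7-element rows, and padded with a 16-aligned per-image counter block. A sketch with hypothetical numbers, assuming PRIOR_BOX_SIZE is 4 (the four box coordinates) and DETECTION_OUTPUT_ROW_SIZE is 7:

// Mirrors the first-stage detection_output sizing above; the constants 4 and 7
// stand in for the patch's PRIOR_BOX_SIZE and DETECTION_OUTPUT_ROW_SIZE.
static int detection_output_size(int linear_size, int batch, int num_classes,
                                 bool share_location, int background_label_id, int top_k)
{
    int output_size = linear_size / 4; // PRIOR_BOX_SIZE
    int classes = num_classes;
    if (share_location)
    {
        classes = (background_label_id == 0) ? num_classes - 1 : num_classes;
        output_size *= classes;
    }
    if (top_k != -1)
    {
        int capped = top_k * classes * batch;
        if (capped < output_size)
            output_size = capped;
    }
    output_size *= 7; // DETECTION_OUTPUT_ROW_SIZE
    output_size += ((batch + 15) / 16) * 16; // per-image result counters
    return output_size;
}
// Example: linear_size 160, batch 1, num_classes 21, share_location, top_k 7
// -> min(800, 140) * 7 + 16 = 996 elements.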
@@ -30,22 +30,53 @@ primitive_type_id eltwise_type_id() layout eltwise_inst::calc_output_layout(eltwise_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for eltwise_inst_node!"); + auto input_node_layout = node.input().get_non_padded_output_layout(); + + auto size = input_node_layout.size; + for (size_t i = 1; i < node.inputs_count(); i++) + { + size = tensor::max(size, node.input(i).get_non_padded_output_layout().size); + } + auto output_layout = layout(input_node_layout.data_type, input_node_layout.format, size); + auto mode = node.get_primitive()->mode; //list of operations supported for integer types if (input_node_layout.data_type == data_types::i8 || input_node_layout.data_type == data_types::i32 || input_node_layout.data_type == data_types::i64) { - auto mode = node.get_primitive()->mode; - std::vector eltwise_int_modes = { eltwise_mode::sum, eltwise_mode::sub, eltwise_mode::prod, eltwise_mode::div }; + std::vector eltwise_int_modes = { eltwise_mode::sum, eltwise_mode::sub, eltwise_mode::prod, eltwise_mode::div, eltwise_mode::min, eltwise_mode::max, eltwise_mode::mod, + eltwise_mode::eq, eltwise_mode::ne, eltwise_mode::lt, eltwise_mode::le, eltwise_mode::gt, eltwise_mode::ge, + eltwise_mode::logic_and, eltwise_mode::logic_or, eltwise_mode::logic_xor }; if (std::find(eltwise_int_modes.begin(), eltwise_int_modes.end(), mode) == eltwise_int_modes.end()) CLDNN_ERROR_MESSAGE(node.id(), "Requested eltwise mode is not supported for integer types."); } - return input_node_layout; + // Logic and comparison operations should return i8 for any inputs + std::vector eltwise_bool_modes = { eltwise_mode::eq, eltwise_mode::ne, eltwise_mode::lt, eltwise_mode::le, + eltwise_mode::gt, eltwise_mode::ge, + eltwise_mode::logic_and, eltwise_mode::logic_or, eltwise_mode::logic_xor }; + if (std::find(eltwise_bool_modes.begin(), eltwise_bool_modes.end(), mode) != eltwise_bool_modes.end()) + { + output_layout.data_type = data_types::i8; + if (node.get_primitive()->with_activation) + CLDNN_ERROR_MESSAGE(node.id(), "Activations are not supported for logical operations."); + } + + auto eltw = std::static_pointer_cast((node.get_primitive())); + if (!eltw->stride.empty()) + { + // we can safely use only first stride, since we're using first input, and input / stride should give exact same value for every input + input_node_layout.size.spatial[0] /= eltw->stride[0].spatial[0]; + input_node_layout.size.spatial[1] /= eltw->stride[0].spatial[1]; + return input_node_layout; + } + return output_layout; } -static inline std::string stringify_vector(std::vector v) +static inline std::string stringify_vector(const std::vector& v) { std::stringstream s; @@ -90,13 +121,43 @@ std::string eltwise_inst::to_string(eltwise_node const& node) break; case eltwise_mode::min: str_mode = "min"; - break; + break; case eltwise_mode::pow: str_mode = "pow"; break; + case eltwise_mode::squared_diff: + str_mode = "squared_diff"; + break; case eltwise_mode::mod: str_mode = "mod"; break; + case eltwise_mode::eq: + str_mode = "equal"; + break; + case eltwise_mode::ne: + str_mode = "not equal"; + break; + case eltwise_mode::lt: + str_mode = "less"; + break; + case eltwise_mode::le: + str_mode = "less-or-equal"; + break; + case eltwise_mode::gt: + str_mode = "greater"; + break; + case eltwise_mode::ge: + str_mode = "greater-or-equal"; + break; + case eltwise_mode::logic_and: + str_mode = "and"; + break; + case eltwise_mode::logic_or: + str_mode = "or"; + break; + 
+ case eltwise_mode::logic_xor: + str_mode = "xor"; + break; default: str_mode = "not supported mode"; break; @@ -126,21 +187,78 @@ std::string eltwise_inst::to_string(eltwise_node const& node) eltwise_inst::typed_primitive_inst(network_impl& network, eltwise_node const& node) :parent(network, node) { - auto input_layout = node.input().get_output_layout(); - auto batch_size = input_layout.size.batch[0]; - auto feature_size = input_layout.size.feature[0]; + check_inputs_count(node); + // check for stride + auto prim = node.get_primitive(); + if (!prim->stride.empty()) + { + // number of strides must match number of inputs + CLDNN_ERROR_NOT_EQUAL(node.id(), "Eltwise inputs count", node.inputs_count(), "Eltwise strides count", prim->stride.size(), ""); - auto input_batch_size = input_layout.size.batch[0]; - auto input_feature_size = input_layout.size.feature[0]; + const auto out_x = node.get_output_layout().size.spatial[0]; + const auto out_y = node.get_output_layout().size.spatial[1]; + // check if strides are correctly set, i.e. INPUT_SIZE_X / STRIDE_X = OUTPUT_SIZE_X; same for the Y dimension + for (size_t i = 0; i < node.inputs_count(); i++) + { + const auto& in_layout = node.input(i).get_output_layout(); + auto stride = prim->stride[i]; - if (batch_size != 1) + const auto in_x_div_stride_x = in_layout.size.spatial[0] / stride.spatial[0]; + if (in_x_div_stride_x != out_x) + CLDNN_ERROR_NOT_EQUAL(node.id(), "Eltwise input_x / stride_x", in_x_div_stride_x, "Eltwise output_x", out_x, ""); + + const auto in_y_div_stride_y = in_layout.size.spatial[1] / stride.spatial[1]; + if (in_y_div_stride_y != out_y) + CLDNN_ERROR_NOT_EQUAL(node.id(), "Eltwise input_y / stride_y", in_y_div_stride_y, "Eltwise output_y", out_y, ""); + } + } + else { - CLDNN_ERROR_NOT_EQUAL(node.id(), "Eltwise batch size", batch_size, "input batch size", input_batch_size, ""); + std::vector input0_size = node.input().get_output_layout().size.raw.vector(); + for (size_t i = 1; i < node.inputs_count(); i++) + { + std::vector input_size = node.input(i).get_output_layout().size.raw.vector(); + for (size_t d = 0; d < input0_size.size(); d++) + { + bool sizes_equal = input0_size[d] == input_size[d]; + bool broadcast = (input0_size[d] == 1 || input_size[d] == 1) && (input0_size[d] != 1 || input_size[d] != 1); + CLDNN_ERROR_BOOL(node.id(), "Sizes equal or broadcast is possible", !(sizes_equal || broadcast), "Invalid input shapes"); + } + } } +} - if (feature_size != 1) +void eltwise_inst::check_inputs_count(eltwise_node const &node) +{ + const size_t inputs_number = node.get_primitive()->input.size(); + const eltwise_mode mode = node.get_primitive()->mode; + + switch (mode) { - CLDNN_ERROR_NOT_EQUAL(node.id(), "Eltwise feature size", feature_size, "input feature size", input_feature_size, ""); + case eltwise_mode::sum: + case eltwise_mode::sub: + case eltwise_mode::div: + case eltwise_mode::prod: + case eltwise_mode::max: + case eltwise_mode::min: + case eltwise_mode::mod: + case eltwise_mode::logic_and: + case eltwise_mode::logic_or: + case eltwise_mode::logic_xor: + if (inputs_number < 2) + CLDNN_ERROR_MESSAGE(node.id(), "Invalid eltwise inputs number (should be at least 2). 
Actual: " + std::to_string(inputs_number)); + break; + case eltwise_mode::eq: + case eltwise_mode::ne: + case eltwise_mode::lt: + case eltwise_mode::le: + case eltwise_mode::gt: + case eltwise_mode::ge: + case eltwise_mode::squared_diff: + case eltwise_mode::pow: + if (inputs_number != 2) + CLDNN_ERROR_MESSAGE(node.id(), "Invalid eltwise inputs number (should be equal to 2). Actual: " + std::to_string(inputs_number)); + break; } } } diff --git a/inference-engine/thirdparty/clDNN/src/embed.cpp b/inference-engine/thirdparty/clDNN/src/embed.cpp index b2087b0..b1c6199 100644 --- a/inference-engine/thirdparty/clDNN/src/embed.cpp +++ b/inference-engine/thirdparty/clDNN/src/embed.cpp @@ -31,11 +31,13 @@ namespace cldnn layout embed_inst::calc_output_layout(embed_node const& node) { - auto input_layout = node.input().get_output_layout(); + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for embed_node!"); + auto input_layout = node.input().get_output_layout(); auto desc = node.get_primitive(); auto weights_layout = node.weights().get_output_layout(); - auto result = layout(input_layout.data_type, format::bfyx, tensor(input_layout.size.batch[0], input_layout.size.spatial[0] * input_layout.size.spatial[1], weights_layout.size.batch[0], 1)); + auto result = layout(input_layout.data_type, format::bfyx, tensor(input_layout.size.batch[0], input_layout.size.spatial[0], weights_layout.size.batch[0], 1)); return result; } @@ -66,5 +68,8 @@ namespace cldnn auto output_size = output_memory().get_layout(); CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "input format", input_size.format.value, "expected format", format::yxfb, format::bfyx); CLDNN_ERROR_NOT_EQUAL(node.id(), "Input size", input_size.size.raw.size(), "output size", output_size.size.raw.size(), ""); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input batch", input_size.size.batch[0], "output batch", output_size.size.batch[0], ""); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input feature", input_size.size.feature[0], "size 1", 1, ""); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input y size ", input_size.size.spatial[1], "size 1", 1, ""); } } diff --git a/inference-engine/thirdparty/clDNN/src/engine.cpp b/inference-engine/thirdparty/clDNN/src/engine.cpp index f0e6a53..f883938 100644 --- a/inference-engine/thirdparty/clDNN/src/engine.cpp +++ b/inference-engine/thirdparty/clDNN/src/engine.cpp @@ -40,6 +40,8 @@ gpu_toolkit_config convert_configuration(const engine_configuration conf) result.ocl_sources_dumps_dir = conf.sources_dumps_dir; result.priority_mode = static_cast(conf.priority_mode); result.throttle_mode = static_cast(conf.throttle_mode); + result.user_context = static_cast(conf.context); + result.tuning_cache_path = conf.tuning_cache_path; return result; } @@ -49,6 +51,15 @@ engine_impl::engine_impl(const engine_configuration& conf) , _memory_pool(*this) { } +engine_impl::~engine_impl() +{ + /* + Engine, which is main owner of context deallocate events pool manually, because + of the event_impl <-> gpu_toolkit dependencies. 
+ */ + _context->release_events_pool(); +} + memory_impl::ptr engine_impl::allocate_memory(layout layout) { return _memory_pool.get_memory(layout); @@ -96,7 +107,7 @@ bool engine_impl::is_the_same_buffer(const memory_impl& mem1, const memory_impl& event_impl::ptr engine_impl::create_user_event(bool set) { try { - return{ new gpu::user_event(get_context(), set), false }; + return _context->create_user_event(set); } catch (cl::Error const& err) { throw gpu::ocl_error(err); @@ -113,19 +124,29 @@ void engine_impl::release_pending_memory() get_context()->release_pending_memory(); } -program_impl::ptr engine_impl::build_program(const topology_impl& topology, const build_options& options, bool is_internal) +program_impl::ptr engine_impl::build_program(const topology_impl& topology, const build_options& options, bool is_internal, bool no_optimizations) +{ + return{ new program_impl(*this, topology, options, is_internal, no_optimizations), false }; +} + +program_impl::ptr engine_impl::build_program(const std::set>& nodes, const build_options& options, bool is_internal) +{ + return{ new program_impl(*this, nodes, options, is_internal), false }; +} + +network_impl::ptr engine_impl::build_network(const topology_impl& topology, const build_options& options, bool is_internal) { - return{ new program_impl(*this, topology, options, is_internal), false }; + return{ new network_impl(*this, topology, options, is_internal), false }; } -network_impl::ptr engine_impl::build_network(const topology_impl& topology, const build_options& options, bool internal_network) +network_impl::ptr engine_impl::build_network(const std::set>& nodes, const build_options& options, bool is_internal) { - return{ new network_impl(*this, topology, options, internal_network), false }; + return{ new network_impl(*this, nodes, options, is_internal), false }; } -network_impl::ptr engine_impl::allocate_network(const program_impl& program) +network_impl::ptr engine_impl::allocate_network(const program_impl& program, bool is_internal) { - return{ new network_impl(program), false }; + return{ new network_impl(program, is_internal), false }; } void engine_impl::wait_for_events(std::vector const & events) diff --git a/inference-engine/thirdparty/clDNN/src/error_handler.cpp b/inference-engine/thirdparty/clDNN/src/error_handler.cpp index 6a23ca1..74b3652 100644 --- a/inference-engine/thirdparty/clDNN/src/error_handler.cpp +++ b/inference-engine/thirdparty/clDNN/src/error_handler.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2018 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
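The engine factory methods above all return { new ..., false } rather than a bare pointer. A minimal sketch of the intrusive-refcount convention this implies (illustrative only; the names refcounted and ref_ptr are invented here, and clDNN's actual refcounting types differ in detail):

#include <atomic>

struct refcounted
{
    std::atomic<int> count{1}; // a freshly new-ed object already holds one reference
    virtual ~refcounted() = default;
    void retain() { ++count; }
    void release() { if (--count == 0) delete this; }
};

template <typename T>
struct ref_ptr
{
    T* p = nullptr;
    // add_ref == false "adopts" the reference created by new, which is why the
    // factories above pass false when wrapping a freshly constructed object.
    ref_ptr(T* raw, bool add_ref) : p(raw) { if (add_ref && p) p->retain(); }
    ref_ptr(const ref_ptr& other) : p(other.p) { if (p) p->retain(); }
    ref_ptr& operator=(const ref_ptr&) = delete; // omitted to keep the sketch short
    ~ref_ptr() { if (p) p->release(); }
    T* operator->() const { return p; }
};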
@@ -43,16 +43,16 @@ void err_details::cldnn_print_error_message(const std::string& file, int line, c void error_message(const std::string& file, int line, const std::string& instance_id, const std::string& message) { - std::stringstream error_msg; - error_msg << message << std::endl; - err_details::cldnn_print_error_message(file, line, instance_id, error_msg); + std::stringstream error_msg; + error_msg << message << std::endl; + err_details::cldnn_print_error_message(file, line, instance_id, error_msg); } void error_on_not_supported_fp16(const std::string& file, int line, const std::string& instance_id, uint8_t supp_fp16, bool fp16_used) { - std::stringstream error_msg; if (!supp_fp16 && fp16_used) { + std::stringstream error_msg; error_msg << "GPU device does not support half precision floating-point formats (cl_khr_fp16 extension)" << std::endl; err_details::cldnn_print_error_message(file, line, instance_id, error_msg); } @@ -60,20 +60,23 @@ void error_on_not_supported_fp16(const std::string& file, int line, const std::s void error_on_bool(const std::string& file, int line, const std::string& instance_id, const std::string& condition_id, bool condition, const std::string& additional_message) { - std::stringstream error_msg; if (condition) { + std::stringstream error_msg; auto condition_to_string = [](const bool& condi)->std::string { return condi ? "true" : "false"; }; error_msg << condition_id << "(" << condition_to_string(condition) << ") should be " << condition_to_string(!condition) << std::endl; err_details::cldnn_print_error_message(file, line, instance_id, error_msg, additional_message); } } -void error_on_mismatching_data_types(const std::string& file, int line, const std::string& instance_id, const std::string& data_format_1_id, data_types data_format_1, const std::string& data_format_2_id, data_types data_format_2, const std::string& additional_message) +void error_on_mismatching_data_types(const std::string& file, int line, const std::string& instance_id, const std::string& data_format_1_id, data_types data_format_1, const std::string& data_format_2_id, data_types data_format_2, const std::string& additional_message, bool ignore_sign) { - std::stringstream error_msg; - if (data_format_1 != data_format_2) + if (data_format_1 != data_format_2 && + !ignore_sign && + ((data_format_1 == data_types::i8 && data_format_2 == data_types::u8) || + (data_format_1 == data_types::u8 && data_format_2 == data_types::i8))) { + std::stringstream error_msg; error_msg << "Data formats are incompatible." << std::endl; error_msg << data_format_1_id << " format is: " << data_type_traits::name(data_format_1) << ", " << data_format_2_id << " is: " << data_type_traits::name(data_format_2) << std::endl; error_msg << "Data formats should be the same!" << std::endl; @@ -101,18 +104,18 @@ void error_on_tensor_dims_less_than_other_tensor_dims(const std::string& file, i errors.push_back("Spatial y"); } - std::stringstream error_msg; if (!errors.empty()) { - error_msg << tensor_id << " sizes: " << tens << std::endl; - error_msg << tensor_to_compare_to_id << " sizes: " << tens_to_compre << std::endl; - error_msg << "All " << tensor_id << " dimensions should not be less than " << tensor_to_compare_to_id << " dimensions." 
<< std::endl; - error_msg << "Mismatching dimensions: "; - for (size_t i = 0; i < errors.size(); i++) - { - error_msg << errors.at(i) << std::endl; - } - err_details::cldnn_print_error_message(file, line, instance_id, error_msg, additional_message); + std::stringstream error_msg; + error_msg << tensor_id << " sizes: " << tens << std::endl; + error_msg << tensor_to_compare_to_id << " sizes: " << tens_to_compre << std::endl; + error_msg << "All " << tensor_id << " dimensions should not be less than " << tensor_to_compare_to_id << " dimensions." << std::endl; + error_msg << "Mismatching dimensions: "; + for (size_t i = 0; i < errors.size(); i++) + { + error_msg << errors.at(i) << std::endl; + } + err_details::cldnn_print_error_message(file, line, instance_id, error_msg, additional_message); } } @@ -136,9 +139,9 @@ void error_on_tensor_dims_greater_than_other_tensor_dims(const std::string& file errors.push_back("Spatial y"); } - std::stringstream error_msg; if (!errors.empty()) { + std::stringstream error_msg; error_msg << tensor_id << " sizes: " << tens << std::endl; error_msg << tensor_to_compare_to_id << " sizes: " << tens_to_compre << std::endl; error_msg << "All " << tensor_id << " dimensions should not be greater than " << tensor_to_compare_to_id << std::endl; @@ -171,9 +174,9 @@ void error_on_tensor_dims_not_dividable_by_other_tensor_dims(const std::string& errors.push_back("Spatial y"); } - std::stringstream error_msg; if (!errors.empty()) { + std::stringstream error_msg; error_msg << tensor_id << " sizes: " << tens << std::endl; error_msg << tensor_to_compare_to_id << " sizes: " << tens_to_compre << std::endl; error_msg << "All " << tensor_id << " dimensions must be dividable by corresponding dimensions from " << tensor_to_compare_to_id << std::endl; diff --git a/inference-engine/thirdparty/clDNN/src/fully_connected.cpp b/inference-engine/thirdparty/clDNN/src/fully_connected.cpp index bb960eb..cba38f2 100644 --- a/inference-engine/thirdparty/clDNN/src/fully_connected.cpp +++ b/inference-engine/thirdparty/clDNN/src/fully_connected.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
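The error_handler changes above consistently move the std::stringstream declaration inside the error branch, so the common success path never constructs a stream. The same pattern in isolation (function and variable names here are illustrative, not from the file):

#include <sstream>
#include <string>
#include <vector>

std::string describe_mismatches(const std::vector<std::string>& errors)
{
    if (errors.empty())
        return {};             // fast path: no stream is ever created
    std::stringstream msg;     // constructed only when there is something to report
    msg << "Mismatching dimensions:";
    for (const auto& e : errors)
        msg << ' ' << e;
    return msg.str();
}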
@@ -58,6 +58,9 @@ bool is_batch_after_spatial(const std::string order) layout fully_connected_inst::calc_output_layout(fully_connected_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for " + "fully_connected_node!"); auto desc = node.get_primitive(); auto input_layout = node.input().get_output_layout(); @@ -104,7 +107,7 @@ fully_connected_inst::typed_primitive_inst(network_impl& network, fully_connecte auto input_layout = node.input().get_output_layout(); auto output_layout = node.get_output_layout(); - CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "input format", input_layout.format.value, "expected format", format::yxfb, format::bfyx, format::byxf_af32, format::fs_bs_yx_bsv4_fsv32); + CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "input format", input_layout.format.value, "expected format", format::yxfb, format::bfyx, format::byxf_af32, format::fs_bs_yx_bsv4_fsv32, format::b_fs_yx_fsv4); CLDNN_ERROR_NOT_EQUAL(node.id(), "Input size", input_layout.size.raw.size(), "output size", output_layout.size.raw.size(), ""); } } diff --git a/inference-engine/thirdparty/clDNN/src/fully_connected_grad_input.cpp b/inference-engine/thirdparty/clDNN/src/fully_connected_grad_input.cpp index 6a13c2e..d5d8196 100644 --- a/inference-engine/thirdparty/clDNN/src/fully_connected_grad_input.cpp +++ b/inference-engine/thirdparty/clDNN/src/fully_connected_grad_input.cpp @@ -30,6 +30,9 @@ primitive_type_id fully_connected_grad_input_type_id() layout fully_connected_grad_input_inst::calc_output_layout(fully_connected_grad_input_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for " + "fully_connected_grad_input_node!"); auto desc = node.get_primitive(); auto input_layout = node.input().get_output_layout(); diff --git a/inference-engine/thirdparty/clDNN/src/fully_connected_grad_weights.cpp b/inference-engine/thirdparty/clDNN/src/fully_connected_grad_weights.cpp index 8332eaa..378a3c7 100644 --- a/inference-engine/thirdparty/clDNN/src/fully_connected_grad_weights.cpp +++ b/inference-engine/thirdparty/clDNN/src/fully_connected_grad_weights.cpp @@ -30,6 +30,9 @@ primitive_type_id fully_connected_grad_weights_type_id() layout fully_connected_grad_weights_inst::calc_output_layout(fully_connected_grad_weights_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for " + "fully_connected_grad_weights_node!"); //output buffer will not be used in this primitive auto input_grad_layout_size = node.input().get_output_layout(); return{ input_grad_layout_size.data_type, input_grad_layout_size.format,{ 1, 1, 1, 1 } }; diff --git a/inference-engine/thirdparty/clDNN/src/fused_conv_bn_scale.cpp b/inference-engine/thirdparty/clDNN/src/fused_conv_bn_scale.cpp new file mode 100644 index 0000000..d8e36a1 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/fused_conv_bn_scale.cpp @@ -0,0 +1,131 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#include "fused_conv_bn_scale_inst.h" +#include "primitive_type_base.h" +#include "sliding_window_utils.h" +#include "error_handler.h" +#include "json_object.h" + +namespace cldnn +{ +primitive_type_id fused_conv_bn_scale_type_id() +{ + static primitive_type_base instance; + return &instance; +} +// TODO: unify this code with regular convolution. +layout fused_conv_bn_scale_inst::calc_output_layout(fused_conv_bn_scale_node const& node) +{ + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for " + "fused_conv_bn_scale_node!"); + auto desc = node.get_primitive(); + + auto input_layout = node.input().get_output_layout(); + auto weights_layout = node.weights(0).get_output_layout(); //weights are stored after inputs + + auto input_offset = desc->input_offset; + auto stride = desc->stride; + auto split = desc->weights.size(); + auto dilation = desc->dilation; + + // compute how many outputs in rows and columns will be generate by filter. + // outp <= (input_size - (2*input_offset) - kernel_size)/ stride + auto filter_size = weights_layout.size; + + CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(), "Stride spatial X", stride.spatial[0], "value", 0, "Stride spatial X must be positive (>= 1)"); + CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(), "Stride spatial Y", stride.spatial[1], "value", 0, "Stride spatial Y must be positive (>= 1)"); + CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(), "Dilatation spatial X", dilation.spatial[0], "value", 0, "Dilatation patial X must be positive (>= 1)"); + CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(), "Dilatation spatial Y", dilation.spatial[1], "value", 0, "Dilatation spatial Y must be positive (>= 1)"); + CLDNN_ERROR_GREATER_THAN(node.id(), "Input offset spatial X", 2 * input_offset.spatial[0], "input layout spatial X", input_layout.size.spatial[0], "There is no input data to process"); + CLDNN_ERROR_GREATER_THAN(node.id(), "Input offset spatial Y", 2 * input_offset.spatial[1], "input layout spatial Y", input_layout.size.spatial[1], "There is no input data to process"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input offset feature", input_offset.feature[0], "", 0, "Input offset in feature is not supported"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input offset batch", input_offset.batch[0], "", 0, "Input offset in batch is not supported"); + + // get output feature map from weights. It should be the same as number of biases. Will be verified in convolution::create() + auto number_of_features = weights_layout.size.batch[0] * static_cast(split); + + auto output_range = calc_sliding_window_output_range( + input_layout.size, filter_size, input_offset, stride, { 1, 1, 1, 1 }, true, 1); + + tensor output_size(input_layout.size.batch[0], number_of_features, + output_range.spatial[0], output_range.spatial[1]); + return { input_layout.data_type, input_layout.format, output_size }; +} + +std::string fused_conv_bn_scale_inst::to_string(fused_conv_bn_scale_node const& node) +{ + auto desc = node.get_primitive(); + auto strd = desc->stride; + auto split = node.get_split(); + auto node_info = node.desc_to_json(); + auto activation = desc->with_activation ? 
" true" : "false"; + + std::stringstream primitive_description; + + json_composite fuse_info; + fuse_info.add("stride", strd.to_string()); + fuse_info.add("input offset", desc->input_offset.to_string()); + fuse_info.add("split", split); + fuse_info.add("with activation", activation); + fuse_info.add("slope", desc->activation_negative_slope); + + node_info->add("fused_conv_bn_scale info", fuse_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +fused_conv_bn_scale_inst::typed_primitive_inst(network_impl& network, fused_conv_bn_scale_node const& node) + : parent(network, node) +{ + auto stride = argument.stride; + + auto input_inst = node.input().get_output_layout(); + auto output_inst = node.get_output_layout(); + auto output_size = output_inst.size; + + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input number of dimensions", input_inst.size.raw.size(), "output number of dimensions", output_inst.size.raw.size(), "Input/output dims mismtach"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Stride number of dimensions", stride.raw.size(), "output number of dimensions", output_inst.size.raw.size(), "stride/output dims mismtach"); + + auto split = node.get_split(); + for (decltype(split) j = 0; j < split; j++) + { + auto filter_inst = node.weights(j).get_output_layout(); //convolution filter + if (bias_term()) + { + auto bias_inst = node.bias(j).get_output_layout(); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Bias batch[0]", bias_inst.size.batch[0], "expected size of batch", 1, "Biases isn't 1D vector."); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Bias feature[0]", bias_inst.size.feature[0], "expected size of feature", 1, "Biases isn't 1D vector."); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Bias spatial[1]", bias_inst.size.spatial[1], "expected size of spatial[1]", 1, "Biases isn't 1D vector."); + + CLDNN_ERROR_NOT_EQUAL(node.id(), "Bias spatial[0]", bias_inst.size.spatial[0], "expected feature map number", output_size.feature[0] / split, "Bias/fm mismtach"); + } + + auto input_offset = argument.input_offset; + + CLDNN_ERROR_NOT_EQUAL(node.id(), "Weights number of dimensions", filter_inst.size.raw.size(), "output number of dimensions", output_inst.size.raw.size(), "Weights/output dims mismtach"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Convolution padding mode", node.get_output_layout().data_padding.filling_value(), "padding value", 0.0f, "Unknown padding mode."); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input offset number of dimensions", input_offset.raw.size(), "input number of dimensions", input_inst.size.raw.size(), "Input offset/ input size mismtach"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Output feature size", output_size.feature.size(), "expected feature size", 1, "Only one-dimensional features are supported"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Output batch size", output_size.batch.size(), "expected output size", 1, "Only one-dimensional batch size are supported"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Weights spatial size", filter_inst.size.spatial.size(), "expected weights spatial size", 2, "Weights have to have 2 dimensions in spatial domain."); + CLDNN_ERROR_LESS_THAN(node.id(), "Weights feature maps number", (input_inst.size.feature[0] - input_offset.feature[0]) / split, "input feature maps number", filter_inst.size.feature[0], "Weights/ifm mismtach"); + } +} +} diff --git a/inference-engine/thirdparty/clDNN/src/fused_conv_eltwise.cpp b/inference-engine/thirdparty/clDNN/src/fused_conv_eltwise.cpp new file mode 100644 index 0000000..b1b436f --- /dev/null +++ 
b/inference-engine/thirdparty/clDNN/src/fused_conv_eltwise.cpp @@ -0,0 +1,196 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#include "fused_conv_eltwise_inst.h" +#include "primitive_type_base.h" +#include "sliding_window_utils.h" +#include "error_handler.h" +#include "json_object.h" + +namespace cldnn +{ +primitive_type_id fused_conv_eltwise_type_id() +{ + static primitive_type_base instance; + return &instance; +} + +layout fused_conv_eltwise_inst::calc_output_layout(fused_conv_eltwise_node const& node) +{ + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for " + "fused_conv_eltwise_node!"); + auto desc = node.get_primitive(); + + auto input_layout = node.input().get_output_layout(); + auto weights_layout = node.weights(0).get_output_layout(); //weights are stored after inputs + + auto input_offset = desc->conv.input_offset; + auto stride = desc->conv.stride; + auto dilation = desc->conv.dilation; + auto split = desc->conv.weights.size(); + + // compute how many outputs in rows and columns will be generate by filter. + // outp <= (input_size - (2*input_offset) - kernel_size)/ stride + auto filter_size = weights_layout.size; + + // TODO: Consider moving general parameter verification to arguments constructor. + CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(), "Stride spatial X", stride.spatial[0], "value", 0, "Stride spatial X must be positive (>= 1)"); + CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(), "Stride spatial Y", stride.spatial[1], "value", 0, "Stride spatial Y must be positive (>= 1)"); + CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(), "Dilatation spatial X", dilation.spatial[0], "value", 0, "Dilatation patial X must be positive (>= 1)"); + CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(), "Dilatation spatial Y", dilation.spatial[1], "value", 0, "Dilatation spatial Y must be positive (>= 1)"); + CLDNN_ERROR_GREATER_THAN(node.id(), "Input offset spatial X", 2 * input_offset.spatial[0], "input layout spatial X", input_layout.size.spatial[0], "There is no input data to process"); + CLDNN_ERROR_GREATER_THAN(node.id(), "Input offset spatial Y", 2 * input_offset.spatial[1], "input layout spatial Y", input_layout.size.spatial[1], "There is no input data to process"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input offset feature", input_offset.feature[0], "", 0, "Input offset in feature is not supported"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input offset batch", input_offset.batch[0], "", 0, "Input offset in batch is not supported"); + + // TODO: FCN and SSD used offset larger than convolution size. does it make sense to support it? do we support it on the ref kernels? +// CLDNN_ERROR_GREATER_THAN(node.id(), "Negate input offset spatial X", -input_offset.spatial[0], "input window size spatial X", filter_size.spatial[0], "First convolution is outside of image. 
please reduce input offset X"); +// CLDNN_ERROR_GREATER_THAN(node.id(), "Negate input offset spatial Y", -input_offset.spatial[1], "input window size spatial Y", filter_size.spatial[1], "First convolution is outside of image. please reduce input offset Y"); + + if (input_layout.format == format::winograd_2x3_s1_weights || input_layout.format == format::winograd_2x3_s1_fused_weights || + input_layout.format == format::winograd_6x3_s1_fused_weights || input_layout.format == format::image_2d_weights_winograd_6x3_s1_fbxyb || input_layout.format == format::image_2d_weights_winograd_6x3_s1_xfbyb) + CLDNN_ERROR_MESSAGE(node.id(), "Input for convolution should not be in windograd weights format - it is reserved for weights only"); + + if (input_layout.format == format::winograd_2x3_s1_data) + { + CLDNN_ERROR_NOT_EQUAL(node.id(), "convolution split", split, "expected value", 1, "Convolution with winograd input only supports split == 1"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "stride spatial X", stride.spatial[0], "expected value", 1, "Convolution's input in winograd_2x3_s1_data format can only be used with stride 1x1"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "stride spatial Y", stride.spatial[1], "expected value", 1, "Convolution's input in winograd_2x3_s1_data format can only be used with stride 1x1"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Dilatation spatial X", dilation.spatial[0], "expected value", 1, "Winograd 2x3 convolution does not support dilatation"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Dilatation spatial Y", dilation.spatial[1], "expected value", 1, "Winograd 2x3 convolution does not support dilatation"); + if (input_layout.size.feature[0] % 32 != 0) + CLDNN_ERROR_MESSAGE(node.id(), "Input for winograd 2x3 convolution should have features count divisable by 32"); + if (weights_layout.size.batch[0] % 32 != 0) + CLDNN_ERROR_MESSAGE(node.id(), "Number of filters (OFM) for winograd 2x3 convolution should be divisable by 32"); + + if (node.get_primitive()->conv.with_activation) + CLDNN_ERROR_MESSAGE(node.id(), "Winograd 2x3 convolution should not have activation fused - activation should be performed at transformation from winograd domain stage"); + + CLDNN_ERROR_LESS_THAN(node.id(), "input width", input_layout.size.spatial[0], "filter width", 3, "Convolution input is smaller than weights"); + CLDNN_ERROR_LESS_THAN(node.id(), "input height", input_layout.size.spatial[1], "filter height", 3, "Convolution input is smaller than weights"); + + constexpr tensor::value_type filter_height = 3; //by definition of format::winograd_2x3_s1_data (our assumption) + constexpr tensor::value_type winograd_filter_height = filter_height; //for this format, winograd filter is considered to be a set of 1d filters so its height should remain the same as original filter's + + return layout{ input_layout.data_type, input_layout.format, tensor{ input_layout.size.batch[0], weights_layout.size.batch[0], input_layout.size.spatial[0], input_layout.size.spatial[1] - winograd_filter_height + 1 }, input_layout.data_padding }; + } + + // get output feature map from weights. It should be the same as number of biases. 
Will be verified in convolution::create() + auto number_of_features = weights_layout.size.batch[0] * static_cast(split); + + if (desc->conv.with_output_size) + { + CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(), "User defined output spatial X", desc->conv.output_size.spatial[0], "value", 0, "must be positive(>= 1)"); + CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(), "User defined output spatial Y", desc->conv.output_size.spatial[1], "value", 0, "must be positive(>= 1)"); + + tensor output_size(input_layout.size.batch[0], number_of_features, + desc->conv.output_size.spatial[0], desc->conv.output_size.spatial[1]); + return { input_layout.data_type, input_layout.format, output_size }; + } + + auto output_range = calc_sliding_window_output_range( + input_layout.size, filter_size, input_offset, stride, dilation, true, 1); + + tensor output_size(input_layout.size.batch[0], number_of_features, + output_range.spatial[0], output_range.spatial[1]); + + + // for performance reasons, when using fs_bs_yx_bsv4_fsv32 the first convolution has 3 features, so the first conv layer will take byxf and return fs_bs_yx_bsv4_fsv32 + if (input_layout.data_type == data_types::i8 && input_layout.format == format::byx8_f4 && input_layout.size.batch[0] % 4 == 0 && input_layout.size.feature[0] == 3) + { + return layout{ input_layout.data_type, cldnn::format::fs_bs_yx_bsv4_fsv32, output_size }; + } + + return { input_layout.data_type, input_layout.format, output_size }; +} + +std::string fused_conv_eltwise_inst::to_string(fused_conv_eltwise_node const& node) +{ + auto desc = node.get_primitive(); + auto strd = desc->conv.stride; + auto split = node.get_split(); + auto dilation = desc->conv.dilation; + auto node_info = node.desc_to_json(); + auto activation = desc->conv.with_activation ? " true" : "false"; + + std::stringstream primitive_description; + + json_composite conv_info; + conv_info.add("stride", strd.to_string()); + conv_info.add("input offset", desc->conv.input_offset.to_string()); + conv_info.add("split", split); + conv_info.add("dilation", dilation.to_string()); + conv_info.add("with activation", activation); + conv_info.add("slope", desc->conv.activation_negative_slope); + if (desc->conv.with_output_size) + { + json_composite ud_out_size_info; + ud_out_size_info.add("size", desc->conv.output_size.to_string()); + conv_info.add("with user defined output size", ud_out_size_info); + } + + node_info->add("convolution info", conv_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +fused_conv_eltwise_inst::typed_primitive_inst(network_impl& network, fused_conv_eltwise_node const& node) + : parent(network, node) +{ + auto stride = argument.conv.stride; + + auto input_inst = node.input().get_output_layout(); + auto output_inst = node.get_output_layout(); + auto output_size = output_inst.size; + + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input number of dimensions", input_inst.size.raw.size(), "output number of dimensions", output_inst.size.raw.size(), "Input/output dims mismatch"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Stride number of dimensions", stride.raw.size(), "output number of dimensions", output_inst.size.raw.size(), "stride/output dims mismatch"); + + auto split = node.get_split(); + for (decltype(split) j = 0; j < split; j++) + { + auto filter_inst = node.weights(j).get_output_layout(); //convolution filter + if (bias_term()) + { + auto bias_inst = node.bias(j).get_output_layout(); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Bias batch[0]", bias_inst.size.batch[0], "expected size of batch", 1, 
"Biases isn't 1D vector."); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Bias feature[0]", bias_inst.size.feature[0], "expected size of feature", 1, "Biases isn't 1D vector."); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Bias spatial[1]", bias_inst.size.spatial[1], "expected size of spatial[1]", 1, "Biases isn't 1D vector."); + + CLDNN_ERROR_NOT_EQUAL(node.id(), "Bias spatial[0]", bias_inst.size.spatial[0], "expected feature map number", output_size.feature[0] / split, "Bias/fm mismatch"); + } + + auto input_offset = argument.conv.input_offset; + + CLDNN_ERROR_NOT_EQUAL(node.id(), "Weights number of dimensions", filter_inst.size.raw.size(), "output number of dimensions", output_inst.size.raw.size(), "Weights/output dims mismatch"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Convolution padding mode", node.get_output_layout().data_padding.filling_value(), "padding value", 0.0f, "Unknown padding mode."); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input offset number of dimensions", input_offset.raw.size(), "input number of dimensions", input_inst.size.raw.size(), "Input offset/ input size mismatch"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Output feature size", output_size.feature.size(), "expected feature size", 1, "Only one-dimensional features are supported"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Output batch size", output_size.batch.size(), "expected output size", 1, "Only one-dimensional batch size are supported"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Weights spatial size", filter_inst.size.spatial.size(), "expected weights spatial size", 2, "Weights have to have 2 dimensions in spatial domain."); + CLDNN_ERROR_LESS_THAN(node.id(), "Weights feature maps number", (input_inst.size.feature[0] - input_offset.feature[0]) / split, "input feature maps number", filter_inst.size.feature[0], "Weights/ifm mismatch"); + if (filter_inst.format == format::bf_lyx_yx) // local convolution + { + auto local = filter_inst.size.local; + CLDNN_ERROR_NOT_EQUAL(node.id(), "Number of local x dimension", local[0], "output x dimension", output_inst.size.spatial[0], "Weights/output dims mismatch"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Number of local y dimension", local[1], "output y dimension", output_inst.size.spatial[1], "Weights/output dims mismatch"); + } + } +} +} diff --git a/inference-engine/thirdparty/clDNN/src/gather.cpp b/inference-engine/thirdparty/clDNN/src/gather.cpp new file mode 100644 index 0000000..121d573 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/gather.cpp @@ -0,0 +1,68 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "gather_inst.h" + +#include "primitive_type_base.h" +#include "error_handler.h" +#include "json_object.h" + +namespace cldnn +{ +primitive_type_id gather_type_id() +{ + static primitive_type_base instance; + return &instance; +} + +layout gather_inst::calc_output_layout(gather_node const& node) +{ + auto desc = node.get_primitive(); + + auto input_layout = node.input(1).get_output_layout(); + auto input_format = input_layout.format; + + auto input_shape = node.get_primitive()->output_shape; + + + return layout{input_layout.data_type, input_format, input_shape}; +} + +std::string gather_inst::to_string(gather_node const& node) +{ + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + auto& input = node.input(); + + std::stringstream primitive_description; + + json_composite gather_info; + gather_info.add("input id", input.id()); + gather_info.add("axis", desc->axis); + gather_info.add("output shape", desc->output_shape.to_string()); + + node_info->add("gather info", gather_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +gather_inst::typed_primitive_inst(network_impl& network, gather_node const& node) + : parent(network, node) +{ +} + +} diff --git a/inference-engine/thirdparty/clDNN/src/gemm.cpp b/inference-engine/thirdparty/clDNN/src/gemm.cpp index a8072bc..49f8cc7 100644 --- a/inference-engine/thirdparty/clDNN/src/gemm.cpp +++ b/inference-engine/thirdparty/clDNN/src/gemm.cpp @@ -31,6 +31,8 @@ primitive_type_id gemm_type_id() layout gemm_inst::calc_output_layout(gemm_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for gemm_node!"); auto input1_layout = node.input(0).get_output_layout(); auto input2_layout = node.input(1).get_output_layout(); bool transpose_input1 = node.get_primitive()->transpose_input1; @@ -89,8 +91,8 @@ gemm_inst::typed_primitive_inst(network_impl& network, gemm_node const& node) if (node.inputs_count() > 2) { auto input3_layout = node.input(2).get_output_layout(); - CLDNN_ERROR_NOT_EQUAL(node.id(), "Input1 Columns count", input3_layout.size.spatial[0], "Input2 Rows count", input_layout.size.spatial[1], ""); - CLDNN_ERROR_NOT_EQUAL(node.id(), "Input1 Columns count", input3_layout.size.spatial[1], "Input2 Rows count", input2_layout.size.spatial[0], ""); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input3 Columns count", input3_layout.size.spatial[0], "Input2 Columns count", input2_layout.size.spatial[0], ""); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input3 Rows count", input3_layout.size.spatial[1], "Input1 Rows count", input_layout.size.spatial[1], ""); } } @@ -100,8 +102,8 @@ gemm_inst::typed_primitive_inst(network_impl& network, gemm_node const& node) if (node.inputs_count() > 2) { auto input3_layout = node.input(2).get_output_layout(); - CLDNN_ERROR_NOT_EQUAL(node.id(), "Input1 Columns count", input3_layout.size.spatial[0], "Input2 Rows count", input_layout.size.spatial[0], ""); - CLDNN_ERROR_NOT_EQUAL(node.id(), "Input1 Columns count", input3_layout.size.spatial[1], "Input2 Rows count", input2_layout.size.spatial[0], ""); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input3 Columns count", input3_layout.size.spatial[0], "Input2 Rows count", input2_layout.size.spatial[1], ""); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input3 Rows count", input3_layout.size.spatial[1], "Input1 Rows count", input_layout.size.spatial[1], ""); } } else if (transpose_input1 && !transpose_input2) @@ -110,8 +112,8 @@ 
gemm_inst::typed_primitive_inst(network_impl& network, gemm_node const& node) if (node.inputs_count() > 2) { auto input3_layout = node.input(2).get_output_layout(); - CLDNN_ERROR_NOT_EQUAL(node.id(), "Input1 Columns count", input3_layout.size.spatial[0], "Input2 Rows count", input_layout.size.spatial[1], ""); - CLDNN_ERROR_NOT_EQUAL(node.id(), "Input1 Columns count", input3_layout.size.spatial[1], "Input2 Rows count", input2_layout.size.spatial[1], ""); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input3 Columns count", input3_layout.size.spatial[0], "Input2 Columns count", input2_layout.size.spatial[0], ""); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input3 Rows count", input3_layout.size.spatial[1], "Input1 Columns count", input_layout.size.spatial[0], ""); } } else @@ -120,8 +122,8 @@ gemm_inst::typed_primitive_inst(network_impl& network, gemm_node const& node) if (node.inputs_count() > 2) { auto input3_layout = node.input(2).get_output_layout(); - CLDNN_ERROR_NOT_EQUAL(node.id(), "Input1 Columns count", input3_layout.size.spatial[0], "Input2 Rows count", input_layout.size.spatial[0], ""); - CLDNN_ERROR_NOT_EQUAL(node.id(), "Input1 Columns count", input3_layout.size.spatial[1], "Input2 Rows count", input2_layout.size.spatial[1], ""); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input3 Columns count", input3_layout.size.spatial[0], "Input2 Rows count", input2_layout.size.spatial[1], ""); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Input3 Rows count", input3_layout.size.spatial[1], "Input1 Columns count", input_layout.size.spatial[0], ""); } } diff --git a/inference-engine/thirdparty/clDNN/src/generic_layer.cpp b/inference-engine/thirdparty/clDNN/src/generic_layer.cpp index 8b5cb6f..6d1c3c6 100644 --- a/inference-engine/thirdparty/clDNN/src/generic_layer.cpp +++ b/inference-engine/thirdparty/clDNN/src/generic_layer.cpp @@ -31,6 +31,12 @@ primitive_type_id generic_layer_type_id() return &instance; } +generic_layer_node::typed_program_node(const std::shared_ptr prim, program_impl& prog) + : parent(prim, prog) +{ + can_share_buffer(false); +} + generic_layer_inst::typed_primitive_inst(network_impl& network, generic_layer_node const& node) : parent(network, node) { diff --git a/inference-engine/thirdparty/clDNN/src/gpu/activation_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/activation_gpu.cpp index a6d46a6..d4b40a4 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/activation_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/activation_gpu.cpp @@ -48,14 +48,14 @@ struct activation_gpu : typed_primitive_gpu_impl auto activation_params = get_default_params(arg); auto activation_optional_params = get_default_optional_params(arg.get_program()); - convert_new_activation_func(arg.get_primitive(), activation_params); + convert_new_activation_func(arg.get_primitive(), activation_params.activation); if (arg.is_parameterized()) { const auto& slope_layout = arg.slope_input().get_output_layout(); const auto& output_layout = arg.get_output_layout(); - const auto params_num = kernel_selector::GetActivationAdditionalParamsNumber(activation_params.activationFunc); + const auto params_num = kernel_selector::GetActivationAdditionalParamsNumber(activation_params.activation.function); CLDNN_ERROR_LESS_THAN(arg.id(), "Slope layout size count", slope_layout.size.count(), "output_layout.size.feature[0] * params_num", static_cast(output_layout.size.feature[0] * params_num), "Error - not enough data inside additional params buffer"); diff --git a/inference-engine/thirdparty/clDNN/src/gpu/activation_grad_gpu.cpp 
b/inference-engine/thirdparty/clDNN/src/gpu/activation_grad_gpu.cpp index fefd536..a599f0b 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/activation_grad_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/activation_grad_gpu.cpp @@ -52,16 +52,16 @@ struct activation_grad_gpu : typed_primitive_gpu_impl activation_grad_params.gradient = true; activation_grad_params.inputs.push_back(convert_data_tensor(arg.get_dependency(1).get_output_layout())); - activation_grad_params.activationFunc = get_kernel_selector_activation_grad_param(primitive->activation_grad_func); - activation_grad_params.activationParams.m = primitive->additional_params.a; - activation_grad_params.activationParams.n = primitive->additional_params.b; + activation_grad_params.activation.function = get_kernel_selector_activation_grad_param(primitive->activation_grad_func); + activation_grad_params.activation.m = primitive->additional_params.a; + activation_grad_params.activation.n = primitive->additional_params.b; if (arg.is_parameterized()) { const auto& slope_layout = arg.slope_input().get_output_layout(); const auto& output_layout = arg.get_output_layout(); - const auto params_num = kernel_selector::GetActivationAdditionalParamsNumber(activation_grad_params.activationFunc); + const auto params_num = kernel_selector::GetActivationAdditionalParamsNumber(activation_grad_params.activation.function); CLDNN_ERROR_LESS_THAN(arg.id(), "Slope layout size count", slope_layout.size.count(), "output_layout.size.feature[0] * params_num", static_cast(output_layout.size.feature[0] * params_num), "Error - not enough data inside additional params buffer"); diff --git a/inference-engine/thirdparty/clDNN/src/gpu/arg_max_min_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/arg_max_min_gpu.cpp index ec4249e..a14e193 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/arg_max_min_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/arg_max_min_gpu.cpp @@ -33,13 +33,6 @@ namespace cldnn { protected: - virtual bool validate(typed_primitive_inst& instance) const override - { - bool res = parent::validate(instance); - - return res; - } - virtual kernel::kernel_arguments_data get_arguments(typed_primitive_inst& instance, int32_t) const override { kernel::kernel_arguments_data args = parent::get_arguments(instance, 0); diff --git a/inference-engine/thirdparty/clDNN/src/gpu/batch_norm_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/batch_norm_gpu.cpp index 8adb888..f5364ad 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/batch_norm_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/batch_norm_gpu.cpp @@ -37,17 +37,20 @@ protected: { kernel::kernel_arguments_data args; - - if (!instance.use_global_stats()) - { - args.inputs = { &instance.input_memory() }; - if (instance.forwad_pass()) - args.inputs.push_back(&instance.inv_variance_memory()); - } - else - { - args.inputs = { &instance.input_memory(), &instance.mean_memory(), &instance.variance_memory() }; - } + args.inputs = { &instance.input_memory() }; + + if (instance.use_global_stats()) { + args.inputs.push_back(&instance.mean_memory()); + args.inputs.push_back(&instance.variance_memory()); + } + + if (instance.use_scale_shift()) { + args.inputs.push_back(&instance.scale_memory()); + args.inputs.push_back(&instance.shift_memory()); + } + + if (instance.forwad_pass()) + args.inputs.push_back(&instance.inv_variance_memory()); args.output = &instance.output_memory(); @@ -58,13 +61,17 @@ public: static primitive_impl* create(const batch_norm_node &arg) { - if 
(!arg.use_global_stats()) + if (!arg.use_global_stats() + || arg.calc_mean_var() ) { auto norm_params = get_default_params(arg); auto norm_optional_params = get_default_optional_params(arg.get_program()); norm_params.batchNormParams.epsilon = arg.get_primitive()->epsilon; norm_params.batchNormParams.with_inv_var = arg.forwad_pass(); + norm_params.batchNormParams.with_scale_shift = arg.use_scale_shift(); + if (arg.calc_mean_var()) + norm_params.batchNormParams.with_mean_var_out = arg.calc_mean_var(); auto& kernel_selector = kernel_selector::batch_norm_kernel_selector::Instance(); auto best_kernels = kernel_selector.GetBestKernels(norm_params, norm_optional_params); @@ -86,7 +93,7 @@ public: ew_params.inputs.push_back(convert_data_tensor(arg.mean().get_output_layout())); ew_params.inputs.push_back(convert_data_tensor(arg.variance().get_output_layout())); - + ew_params.operations.push_back({ { kernel_selector::eltwise_params::InputType::Buffer(0), kernel_selector::eltwise_params::InputType::Buffer(1) }, kernel_selector::eltwise_mode::SUB }); @@ -103,6 +110,19 @@ public: { kernel_selector::eltwise_params::InputType::Intermediate(0), kernel_selector::eltwise_params::InputType::Intermediate(2) }, kernel_selector::eltwise_mode::MUL }); + if (arg.use_scale_shift()) { + ew_params.inputs.push_back(convert_data_tensor(arg.scale().get_output_layout())); + ew_params.inputs.push_back(convert_data_tensor(arg.shift().get_output_layout())); + + ew_params.operations.push_back({ + { kernel_selector::eltwise_params::InputType::Intermediate(3), kernel_selector::eltwise_params::InputType::Buffer(3) }, + kernel_selector::eltwise_mode::MUL }); + + ew_params.operations.push_back({ + { kernel_selector::eltwise_params::InputType::Intermediate(4), kernel_selector::eltwise_params::InputType::Buffer(4) }, + kernel_selector::eltwise_mode::ADD }); + } + ew_params.layoutBased = true; auto& kernel_selector = kernel_selector::eltwise_kernel_selector::Instance(); diff --git a/inference-engine/thirdparty/clDNN/src/gpu/broadcast_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/broadcast_gpu.cpp index 8c72bdc..fc3667a 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/broadcast_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/broadcast_gpu.cpp @@ -35,6 +35,25 @@ struct broadcast_gpu : typed_primitive_gpu_impl auto bc_params = get_default_params(arg, 1); auto bc_optional_params = get_default_optional_params(arg.get_program()); + const auto& broadcast_axes = arg.get_primitive()->broadcast_axes; + uint16_t index = (uint16_t) 0; + uint16_t input_index = (uint16_t) broadcast_axes.size(); + + //bfyx format + for (size_t i = 0; i < 4; ++i) + { + if (std::find(broadcast_axes.begin(), broadcast_axes.end(), i) != broadcast_axes.end()) + { + bc_params.input_order.push_back(index); + ++index; + } + else + { + bc_params.input_order.push_back(input_index); + ++input_index; + } + } + auto& kernel_selector = kernel_selector::broadcast_kernel_selector::Instance(); auto best_kernels = kernel_selector.GetBestKernels(bc_params, bc_optional_params); @@ -49,20 +68,12 @@ namespace { attach() { auto val_fw = broadcast_gpu::create; - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::yxfb), val_fw); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::yxfb), val_fw); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::yxfb), val_fw); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::yxfb), val_fw); - 
implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfyx), val_fw); - - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), val_fw); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), val_fw); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf), val_fw); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i32, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i64, format::bfyx), val_fw); } ~attach() = default; }; diff --git a/inference-engine/thirdparty/clDNN/src/gpu/command_queues_builder.cpp b/inference-engine/thirdparty/clDNN/src/gpu/command_queues_builder.cpp new file mode 100644 index 0000000..d8db570 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/gpu/command_queues_builder.cpp @@ -0,0 +1,151 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#include "command_queues_builder.h" +#include "error_handler.h" + +namespace cldnn { namespace gpu{ + + command_queues_builder::command_queues_builder(const cl::Context& context, const cl::Device& device, const cl_platform_id& platform_id) + : _context(context) + , _device(device) + , _platform_id(platform_id) + , _priority_mode(cldnn_priority_disabled) + , _throttle_mode(cldnn_throttle_disabled) + {} + + cl_command_queue_properties command_queues_builder::get_properties() + { + cl_command_queue_properties ret = ((_profiling ? CL_QUEUE_PROFILING_ENABLE : 0) | (_out_of_order ? 
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE : 0)); + return ret; + } + + void command_queues_builder::build() + { + auto properties = get_properties(); + + if (_priority_mode == cldnn_priority_disabled && + _throttle_mode == cldnn_throttle_disabled) + { + _queue = cl::CommandQueue(_context, _device, properties); + return; + } + + unsigned cl_queue_priority_value = CL_QUEUE_PRIORITY_MED_KHR; + + switch (_priority_mode) + { + case cldnn_priority_high: + cl_queue_priority_value = CL_QUEUE_PRIORITY_HIGH_KHR; + break; + case cldnn_priority_low: + cl_queue_priority_value = CL_QUEUE_PRIORITY_LOW_KHR; + break; + default: + break; + } + + unsigned cl_queue_throttle_value = CL_QUEUE_THROTTLE_MED_KHR; + + switch (_throttle_mode) + { + case cldnn_throttle_high: + cl_queue_throttle_value = CL_QUEUE_THROTTLE_HIGH_KHR; + break; + case cldnn_throttle_low: + cl_queue_throttle_value = CL_QUEUE_THROTTLE_LOW_KHR; + break; + default: + break; + } + + cl_int error_code = CL_SUCCESS; + + if (_priority_mode != cldnn_priority_disabled && + _throttle_mode != cldnn_throttle_disabled) + { + cl_queue_properties properties_low[] = { + CL_QUEUE_PRIORITY_KHR, cl_queue_priority_value, + CL_QUEUE_THROTTLE_KHR, cl_queue_throttle_value, + CL_QUEUE_PROPERTIES, properties, + 0 }; + + _queue = clCreateCommandQueueWithProperties( + _context.get(), + _device.get(), + properties_low, + &error_code); + } + else if (_priority_mode != cldnn_priority_disabled) + { + cl_queue_properties properties_low[] = { + CL_QUEUE_PRIORITY_KHR, cl_queue_priority_value, + CL_QUEUE_PROPERTIES, properties, + 0 }; + + _queue = clCreateCommandQueueWithProperties( + _context.get(), + _device.get(), + properties_low, + &error_code); + } + else if (_throttle_mode != cldnn_throttle_disabled) + { + cl_queue_properties properties_low[] = { + CL_QUEUE_THROTTLE_KHR, cl_queue_throttle_value, + CL_QUEUE_PROPERTIES, properties, + 0 }; + + _queue = clCreateCommandQueueWithProperties( + _context.get(), + _device.get(), + properties_low, + &error_code); + } + + if (error_code != CL_SUCCESS) { + CLDNN_ERROR_MESSAGE("Command queues builders", "clCreateCommandQueueWithPropertiesINTEL error " + std::to_string(error_code)); + } + } + + void command_queues_builder::set_priority_mode(cldnn_priority_mode_type priority, bool extension_support) + { + if (priority != cldnn_priority_disabled && !extension_support) + { + CLDNN_ERROR_MESSAGE( + "Command queues builders - priority_mode", + "The param priority_mode is set in engine_configuration,\ + but cl_khr_priority_hints or cl_khr_create_command_queue\ + is not supported by current OpenCL implementation."); + } + _priority_mode = priority; + } + + void command_queues_builder::set_throttle_mode(cldnn_throttle_mode_type throttle, bool extension_support) + { + if (throttle != cldnn_throttle_disabled && !extension_support) + { + CLDNN_ERROR_MESSAGE( + "Command queues builders - throttle_mode", + "The param throttle_mode is set in engine_configuration,\ + but cl_khr_throttle_hints is not supported by current OpenCL implementation."); + } + _throttle_mode = throttle; + } +} +} + diff --git a/inference-engine/thirdparty/clDNN/src/gpu/command_queues_builder.h b/inference-engine/thirdparty/clDNN/src/gpu/command_queues_builder.h new file mode 100644 index 0000000..4d375cb --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/gpu/command_queues_builder.h @@ -0,0 +1,46 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with 
the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include "ocl_toolkit.h" + +namespace cldnn { namespace gpu { + class command_queues_builder + { + public: + command_queues_builder(const cl::Context& context, const cl::Device& device, const cl_platform_id& platform_id); + void build(); + void set_throttle_mode(cldnn_throttle_mode_type throttle, bool extension_support); + void set_priority_mode(cldnn_priority_mode_type priority, bool extension_support); + void set_profiling(bool flag) { _profiling = flag; } + void set_out_of_order(bool flag) { _out_of_order = flag; } + cl::CommandQueue& queue() { return _queue; } + cl::CommandQueue queue() const { return _queue; } + + private: + cl::CommandQueue _queue; + cl::Context _context; + cl::Device _device; + cl_platform_id _platform_id; + bool _profiling; + bool _out_of_order; + cldnn_priority_mode_type _priority_mode; + cldnn_throttle_mode_type _throttle_mode; + + cl_command_queue_properties get_properties(); + }; +}} diff --git a/inference-engine/thirdparty/clDNN/src/gpu/concatenation_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/concatenation_gpu.cpp index 98d8be2..032fa63 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/concatenation_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/concatenation_gpu.cpp @@ -115,6 +115,8 @@ namespace { { std::make_tuple(engine_types::ocl, data_types::i8, format::byxf), concatenation_gpu::create }, { std::make_tuple(engine_types::ocl, data_types::i32, format::byxf), concatenation_gpu::create }, { std::make_tuple(engine_types::ocl, data_types::i64, format::byxf), concatenation_gpu::create }, + { std::make_tuple(engine_types::ocl, data_types::f32, format::fyxb), concatenation_gpu::create }, + { std::make_tuple(engine_types::ocl, data_types::f16, format::fyxb), concatenation_gpu::create }, // MMAD { std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), concatenation_gpu::create }, diff --git a/inference-engine/thirdparty/clDNN/src/gpu/condition_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/condition_gpu.cpp new file mode 100644 index 0000000..30d7ead --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/gpu/condition_gpu.cpp @@ -0,0 +1,144 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
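[Editor's note] command_queues_builder::build() above encodes the priority and throttle hints as the zero-terminated key/value array that clCreateCommandQueueWithProperties expects. Below is a minimal sketch of that call pattern, not the builder itself; it assumes `ctx` and `dev` were obtained from the usual platform/device query and that the device advertises the cl_khr_priority_hints / cl_khr_throttle_hints extensions checked by the setters above:

    #include <CL/cl.h>
    #include <CL/cl_ext.h>  // CL_QUEUE_PRIORITY_KHR / CL_QUEUE_THROTTLE_KHR

    cl_command_queue make_hinted_queue(cl_context ctx, cl_device_id dev, cl_int* err)
    {
        // Key/value pairs terminated by 0 -- the same layout build() assembles.
        cl_queue_properties props[] = {
            CL_QUEUE_PRIORITY_KHR, CL_QUEUE_PRIORITY_LOW_KHR,
            CL_QUEUE_THROTTLE_KHR, CL_QUEUE_THROTTLE_LOW_KHR,
            CL_QUEUE_PROPERTIES,   CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
            0
        };
        return clCreateCommandQueueWithProperties(ctx, dev, props, err);
    }

As in build(), a non-CL_SUCCESS error code from the call is the only failure signal; the caller decides how to report it.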
+ +#include "condition_inst.h" +#include "network_impl.h" +#include "implementation_map.h" +#include "math_utils.h" + +#include + +namespace cldnn { namespace gpu { + +struct condition_gpu : typed_primitive_impl +{ + const condition_node& outer; + + condition_gpu(const condition_node& outer) + : outer(outer) + {} + + event_impl::ptr execute_impl(const std::vector& events, condition_inst& instance) override + { + for (auto& a : events) + { + a->wait(); + } + auto ev = instance.get_network().get_engine().create_user_event(false); + + bool exec_branch = choose_branch_to_exec(instance); + memory_impl::ptr memory_to_copy; + if (exec_branch) + memory_to_copy = &execute_branch(instance.get_net_true(), instance.result_id(), instance.input_memory()); + else + memory_to_copy = &execute_branch(instance.get_net_false(), instance.result_id(), instance.input_memory()); + // just copy memory + mem_lock inp_ptr{ memory_to_copy }; + mem_lock out_ptr{ instance.output_memory() }; + std::copy(inp_ptr.begin(), inp_ptr.end(), out_ptr.begin()); + dynamic_cast(ev.get())->set(); // set as complete + return ev; + } + + static primitive_impl* create(const condition_node& arg) + { + return new condition_gpu(arg); + } + +private: + /* + Add functions here. + */ + bool check_condition(const float value_1, const float value_2, const cond_functions& func) const + { + switch (func) + { + case cond_functions::EQUAL: return value_1 == value_2; + break; + case cond_functions::GREATER: return value_1 > value_2; + break; + case cond_functions::LESS: return value_1 < value_2; + break; + default: + throw("Unknown comparison function for: " + outer.id()); + break; + } + } + + /* + Loop over memory and check condition. + Returns a boolean flag that says which branch should be executed. + */ + bool choose_branch_to_exec(condition_inst& instance) const + { + mem_lock lock_compare_data{ instance.compare_memory() }; + auto compare_layout = instance.compare_memory().get_layout(); + auto compare_ptr = lock_compare_data.begin(); + + mem_lock lock_input{ instance.input_memory() }; + auto input_layout = instance.input_memory().get_layout(); + auto input_ptr = lock_input.begin(); + + auto function = instance.argument.function; + auto& offset = instance.argument.offset; + auto& range = compare_layout.size; + + for (auto b = 0; b < range.batch[0]; b++) + { + for (auto f = 0; f < range.feature[0]; f++) + { + for (auto y = 0; y < range.spatial[1]; y++) + { + for (auto x = 0; x < range.spatial[0]; x++) + { + auto input_idx = input_layout.get_linear_offset({ + b + offset.batch[0], + f + offset.feature[0], + x + offset.spatial[0], + y + offset.spatial[1] + }); + auto compare_idx = compare_layout.get_linear_offset({ b, f, x, y }); + if (!check_condition(input_ptr[input_idx], compare_ptr[compare_idx], function)) return false; + } + } + } + } + return true; + } + + + + memory_impl& execute_branch(network_impl::ptr branch, const primitive_id& input_id, memory_impl& input_memory) const + { + branch->set_input_data(input_id, input_memory); + branch->execute({}); + return branch->get_outputs().at(0)->output_memory(); + } + +}; + +namespace { + struct attach { + attach() { + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), + condition_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::yxfb), + condition_gpu::create); + } + ~attach() = default; + }; + attach attach_impl; +} +} +} diff --git a/inference-engine/thirdparty/clDNN/src/gpu/configuration.cpp
b/inference-engine/thirdparty/clDNN/src/gpu/configuration.cpp index e9b4b47..c1702a0 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/configuration.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/configuration.cpp @@ -15,7 +15,7 @@ */ /////////////////////////////////////////////////////////////////////////////////////////////////// -#include "ocl_toolkit.h" +#include "confiugration.h" namespace cldnn { namespace gpu { @@ -30,6 +30,8 @@ namespace cldnn { , host_out_of_order(false) , log("") , ocl_sources_dumps_dir("") + , user_context(nullptr) + , tuning_cache_path("cache.json") {} } } diff --git a/inference-engine/thirdparty/clDNN/src/gpu/confiugration.h b/inference-engine/thirdparty/clDNN/src/gpu/confiugration.h new file mode 100644 index 0000000..3f7b258 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/gpu/confiugration.h @@ -0,0 +1,50 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include +#include "api/C/cldnn.h" + +namespace cl +{ +class Context; +} +namespace cldnn { + namespace gpu { + struct configuration + { + enum device_types { default_device = 0, cpu, gpu, accelerator }; + + configuration(); + + bool enable_profiling; + bool meaningful_kernels_names; + bool dump_custom_program; + device_types device_type; + uint32_t device_vendor; + std::string compiler_options; + std::string single_kernel_name; + bool host_out_of_order; + std::string log; + std::string ocl_sources_dumps_dir; + cldnn_priority_mode_type priority_mode; + cldnn_throttle_mode_type throttle_mode; + cl::Context* user_context; + std::string tuning_cache_path; + }; + } +} diff --git a/inference-engine/thirdparty/clDNN/src/gpu/contract_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/contract_gpu.cpp new file mode 100644 index 0000000..b7f1c22 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/gpu/contract_gpu.cpp @@ -0,0 +1,88 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
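[Editor's note] contract_gpu below, like condition_gpu above and depth_to_space_gpu later in this patch, registers its factory through a file-scope `attach` object whose constructor runs during static initialization. A minimal sketch of that self-registration idiom follows; the enum and map names here are illustrative stand-ins, not the real implementation_map API:

    #include <functional>
    #include <map>
    #include <tuple>

    // Illustrative stand-ins for clDNN's engine/data-type/format enums.
    enum class engine_t { ocl };
    enum class dtype_t { f32, f16 };
    enum class format_t { bfyx };

    using impl_key = std::tuple<engine_t, dtype_t, format_t>;

    // Function-local static avoids the static-initialization-order problem.
    static std::map<impl_key, std::function<void*()>>& impl_map()
    {
        static std::map<impl_key, std::function<void*()>> m;
        return m;
    }

    namespace {
        struct attach
        {
            attach()
            {
                // Runs before main(): the factory becomes visible to any
                // later lookup keyed by (engine, data type, format).
                impl_map()[{engine_t::ocl, dtype_t::f32, format_t::bfyx}] =
                    []() -> void* { return nullptr; /* would build the impl */ };
            }
        };
        attach attach_impl;  // one instance per translation unit
    }

The anonymous-namespace instance gives each primitive file its own registrar without exposing any symbol, which is exactly how the `attach attach_impl;` objects in these diffs behave.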
+ + +#include "contract_inst.h" + +#include "primitive_gpu_base.h" +#include "implementation_map.h" +#include "kernel_selector_helper.h" +#include "error_handler.h" +#include "contract/contract_kernel_selector.h" +#include "contract/contract_kernel_base.h" + +namespace cldnn { + namespace gpu { + + namespace + { + inline kernel_selector::ContractMode convert_to_contract_mode(contract_mode mode) + { + switch (mode) + { + case contract_mode::sum: return kernel_selector::ContractMode::SUM; + case contract_mode::prod: return kernel_selector::ContractMode::PRODUCT; + case contract_mode::all: return kernel_selector::ContractMode::ALL; + case contract_mode::any: return kernel_selector::ContractMode::ANY; + case contract_mode::max: return kernel_selector::ContractMode::MAX; + + default: + return kernel_selector::ContractMode::SUM; + } + } + } + + struct contract_gpu : typed_primitive_gpu_impl + { + using parent = typed_primitive_gpu_impl; + using parent::parent; + + + static primitive_impl* create(const contract_node& arg) + { + auto c_params = get_default_params(arg, 1); + auto c_optional_params = get_default_optional_params(arg.get_program()); + + c_params.reduction_axes = arg.get_primitive()->reduction_axes; + c_params.mode = convert_to_contract_mode(arg.get_primitive()->mode); + + auto& kernel_selector = kernel_selector::contract_kernel_selector::Instance(); + auto best_kernels = kernel_selector.GetBestKernels(c_params, c_optional_params); + + CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with these arguments"); + + return new contract_gpu(arg, best_kernels[0]); + } + }; + + namespace { + struct attach { + attach() { + auto val_fw = contract_gpu::create; + + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i32, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i64, format::bfyx), val_fw); + } + ~attach() = default; + }; + + attach attach_impl; + + } + } +} diff --git a/inference-engine/thirdparty/clDNN/src/gpu/convolution_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/convolution_gpu.cpp index dd5a004..54e63a7 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/convolution_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/convolution_gpu.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2018 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -32,13 +32,17 @@ struct convolution_gpu : typed_primitive_gpu_impl protected: - virtual bool validate(typed_primitive_inst& instance) const override + virtual bool validate_impl(const typed_primitive_inst& instance) const override { - bool res = parent::validate(instance); + bool res = true; + + auto outer_id = _outer.id(); + auto data_type = instance.node.input().get_output_layout().data_type; // Check whether all memory elements use the same unit type (FP16 or FP32).
- CLDNN_ERROR_DATA_TYPES_MISMATCH(_outer.id(), "Input memory", instance.node.input().get_output_layout().data_type, "output memory", instance.node.get_output_layout().data_type, ""); - CLDNN_ERROR_DATA_TYPES_MISMATCH(_outer.id(), "Input memory", instance.node.input().get_output_layout().data_type, "filter memory", instance.weights_memory(0).get_layout().data_type, ""); + CLDNN_ERROR_DATA_TYPES_MISMATCH(outer_id, "Input memory", data_type, "output memory", instance.node.get_output_layout().data_type, ""); + // Integer signed/unsigned is ok for convolution + CLDNN_ERROR_DATA_TYPES_MISMATCH_IGNORE_SIGN(outer_id, "Input memory", data_type, "filter memory", instance.weights_memory(0).get_layout().data_type, ""); return res; } @@ -59,6 +63,11 @@ protected: return _outer.get_split(); } + virtual uint32_t get_groups() const override + { + return _outer.get_groups(); + } + public: static primitive_impl* create(const convolution_node &arg) @@ -72,6 +81,7 @@ public: const auto& stride = primitive->stride; const auto& dilation = primitive->dilation; const auto& input_offset = primitive->input_offset; + const auto& groups = primitive->groups; const auto depthwise_separable_opt = arg.get_depthwise_sep_opt(); const auto actual_split = depthwise_separable_opt ? (decltype(split))1 : split; @@ -80,22 +90,24 @@ public: assert(arg.get_output_layout().size.feature[0] / primitive->split() == weights_layout.size.batch[0]); - auto conv_params = get_weights_bias_default_params(arg, actual_split); + auto conv_params = get_weights_bias_default_params(arg, (groups > 1 && !depthwise_separable_opt) ? groups : actual_split, groups); auto conv_optional_params = get_default_weights_bias_optional_params(arg.get_program()); const auto additional_offset = tensor::max(input_offset, 0); if (additional_offset != 0) { - conv_params.inputs[0] = convert_data_tensor(input_layout, actual_split, additional_offset); + conv_params.inputs[0] = convert_data_tensor(input_layout, (groups > 1 && !depthwise_separable_opt) ?
groups : actual_split, additional_offset); } if(primitive->with_activation) - convert_activation_func_params(primitive, conv_params); + convert_activation_func_params(primitive, conv_params.activation); - conv_params.depthwiseSeparableOpt = depthwise_separable_opt; + conv_params.depthwise_separable_opt = depthwise_separable_opt; conv_params.transposed = transposed; + conv_params.local_convolution = weights_size.local[0] > 1 || weights_size.local[1] > 1; conv_params.split = split; + conv_params.groups = groups; conv_params.filterSize = { (uint32_t)weights_size.spatial[0], (uint32_t)weights_size.spatial[1], @@ -141,8 +153,7 @@ public: kernel_selector::KernelsData best_kernels = kernel_selector.GetBestKernels(conv_params, conv_optional_params); - CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with this arguments"); - + CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with these arguments"); auto conv = new convolution_gpu(arg, best_kernels[0]); return conv; @@ -165,7 +176,12 @@ namespace{ implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), convolution_gpu::create); // MMAD implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), convolution_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byx8_f4), convolution_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), convolution_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf), convolution_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), convolution_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), convolution_gpu::create); } ~attach() {} }; diff --git a/inference-engine/thirdparty/clDNN/src/gpu/convolution_grad_weights_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/convolution_grad_weights_gpu.cpp index b8bc157..5f39cac 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/convolution_grad_weights_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/convolution_grad_weights_gpu.cpp @@ -31,9 +31,9 @@ struct convolution_grad_weights_gpu : typed_primitive_gpu_impl& instance) const override + virtual bool validate_impl(const typed_primitive_inst& instance) const override { - bool res = parent::validate(instance); + bool res = true; CLDNN_ERROR_NOT_EQUAL(_outer.id(), "convolution_grad_weights filling value", _outer.get_output_layout().data_padding.filling_value(), "padding mode", 0.0f, "Unknown padding mode in convolution_grad_weights."); // Check whether all memory elements use the same unit type (FP16 or FP32). @@ -96,13 +96,15 @@ public: const tensor dilation = {0,0,1,1}; #endif const auto depthwise_separable_opt = arg.get_depthwise_sep_opt(); + const auto output_grad_w = arg.output_grad_w(); const auto& input_offset = primitive->input_offset; auto conv_grad_weights_params = get_default_learning_params(arg, depthwise_separable_opt ? 
1 : split); auto conv_grad_weights_optional_params = get_default_learning_optional_params(arg.get_program()); - conv_grad_weights_params.depthwiseSeparableOpt = depthwise_separable_opt; + conv_grad_weights_params.depthwise_separable_opt = depthwise_separable_opt; + conv_grad_weights_params.output_grad_w = output_grad_w; conv_grad_weights_params.gradient = true; conv_grad_weights_params.inputs.push_back(convert_data_tensor(arg.get_dependency(1).get_output_layout())); diff --git a/inference-engine/thirdparty/clDNN/src/gpu/crop_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/crop_gpu.cpp index d5638ce..86a0255 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/crop_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/crop_gpu.cpp @@ -67,10 +67,28 @@ namespace { implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::yxfb), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::yxfb), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i64, format::yxfb), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i32, format::yxfb), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::yxfb), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::yxfb), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i64, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i32, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfyx), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i64, format::byxf), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i32, format::byxf), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::fyxb), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::fyxb), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i64, format::fyxb), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i32, format::fyxb), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::fyxb), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::fyxb), val_fw); } ~attach() {} }; diff --git a/inference-engine/thirdparty/clDNN/src/gpu/custom_gpu_primitive_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/custom_gpu_primitive_gpu.cpp index a4c940d..d4256de 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/custom_gpu_primitive_gpu.cpp +++ 
b/inference-engine/thirdparty/clDNN/src/gpu/custom_gpu_primitive_gpu.cpp @@ -98,6 +98,7 @@ static void add_layout_to_jit(kernel_selector::jit_constants& mem_consts, const // #define INPUT0_TYPE float static const std::map dataTypeToIndex{ { data_types::i8 ,"char" }, + { data_types::u8 ,"uchar" }, { data_types::i32 ,"int" }, { data_types::i64 ,"long" }, { data_types::f16 ,"half" }, diff --git a/inference-engine/thirdparty/clDNN/src/gpu/deconvolution_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/deconvolution_gpu.cpp index 68ffdbe..7ec6291 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/deconvolution_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/deconvolution_gpu.cpp @@ -32,9 +32,9 @@ struct deconvolution_gpu : typed_primitive_gpu_impl protected: // TODO: share it with convolution and fully connected - virtual bool validate(typed_primitive_inst& instance) const override + virtual bool validate_impl(const typed_primitive_inst& instance) const override { - bool res = parent::validate(instance); + bool res = true; CLDNN_ERROR_NOT_EQUAL(_outer.id(), "deconvolution filling value", _outer.get_output_layout().data_padding.filling_value(), "padding mode", 0.0f, "Unknown padding mode in deconvolution."); // Check whether all memory elements use the same unit type (FP16 or FP32). @@ -64,6 +64,11 @@ protected: return _outer.get_split(); } + virtual uint32_t get_groups() const override + { + return _outer.get_groups(); + } + public: static primitive_impl* create(const deconvolution_node& arg) @@ -93,18 +98,21 @@ public: const tensor dilation = {0,0,1,1}; #endif const auto depthwise_separable_opt = arg.get_depthwise_sep_opt(); + const auto actual_split = depthwise_separable_opt ? (decltype(split))1 : split; const auto& input_offset = primitive->input_offset; + const auto& groups = primitive->groups; - auto deconv_params = get_weights_bias_default_params(arg, depthwise_separable_opt ? 1 : split); + auto deconv_params = get_weights_bias_default_params(arg, (groups > 1 && !depthwise_separable_opt) ? 
groups : actual_split, groups); auto deconv_optional_params = get_default_weights_bias_optional_params(arg.get_program()); if(primitive->with_activation) - convert_activation_func_params(primitive, deconv_params); + convert_activation_func_params(primitive, deconv_params.activation); - deconv_params.depthwiseSeparableOpt = depthwise_separable_opt; + deconv_params.depthwise_separable_opt = depthwise_separable_opt; deconv_params.split = split; + deconv_params.groups = groups; deconv_params.filterSize = { (uint32_t)weights_size.spatial[0], (uint32_t)weights_size.spatial[1], @@ -136,8 +144,7 @@ public: auto& kernel_selector = kernel_selector::deconvolution_kernel_selector::Instance(); auto best_kernels = kernel_selector.GetBestKernels(deconv_params, deconv_optional_params); - CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with this arguments"); - + CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with these arguments"); auto deconv = new deconvolution_gpu(arg, best_kernels[0]); return deconv; diff --git a/inference-engine/thirdparty/clDNN/src/gpu/depth_to_space_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/depth_to_space_gpu.cpp new file mode 100644 index 0000000..bc29029 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/gpu/depth_to_space_gpu.cpp @@ -0,0 +1,72 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "depth_to_space_inst.h" +#include "primitive_gpu_base.h" +#include "implementation_map.h" +#include "kernel_selector_helper.h" +#include "depth_to_space/depth_to_space_kernel_selector.h" +#include "depth_to_space/depth_to_space_kernel_ref.h" +#include "error_handler.h" + +using namespace cldnn; + +namespace cldnn +{ + namespace gpu + { + struct depth_to_space_gpu : typed_primitive_gpu_impl + { + using parent = typed_primitive_gpu_impl; + using parent::parent; + + public: + + static primitive_impl* create(const depth_to_space_node& arg) + { + auto depth_to_space_params = get_default_params(arg); + auto depth_to_space_optional_params = + get_default_optional_params(arg.get_program()); + + depth_to_space_params.block_size = arg.get_primitive()->block_size; + + auto& kernel_selector = kernel_selector::depth_to_space_kernel_selector::Instance(); + auto best_kernels = kernel_selector.GetBestKernels(depth_to_space_params, depth_to_space_optional_params); + + CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with these arguments"); + + auto depth_to_space = new depth_to_space_gpu(arg, best_kernels[0]); + + return depth_to_space; + } + }; + + namespace + { + struct attach + { + attach() + { + auto val_fw = depth_to_space_gpu::create; + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw); + } + ~attach() = default; + }; + attach attach_impl; + } + } //namespace gpu +} //namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/src/gpu/detection_output_cpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/detection_output_cpu.cpp new file mode 100644 index 0000000..dab69d1 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/gpu/detection_output_cpu.cpp @@ -0,0 +1,652 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "detection_output_inst.h" +#include "kernel.h" +#include "network_impl.h" +#include "implementation_map.h" +#include "math_utils.h" + +#include +#include +#include +#include +#include + +#ifdef FIX_OPENMP_RELEASE_ISSUE +#ifdef OPENMP_FOUND +#include +#endif +#endif + +namespace cldnn { namespace gpu { + +namespace { + struct bounding_box + { + float xmin; + float ymin; + float xmax; + float ymax; + + bounding_box() : xmin(0), ymin(0), xmax(0), ymax(0) {} + + bounding_box(const float xmin, const float ymin, const float xmax, const float ymax) : + xmin(xmin), ymin(ymin), xmax(xmax), ymax(ymax) {} + + // Computes the area of a bounding box.
+ float area() const + { + return (xmax - xmin) * (ymax - ymin); + } + }; +} + +/************************ Detection Output CPU ************************/ +struct detection_output_cpu : typed_primitive_impl +{ + const detection_output_node& outer; + + detection_output_cpu(const detection_output_node& outer) + : outer(outer) + {} + + static void decode_bounding_box( + const bounding_box& prior_bbox, const std::array& prior_variance, + const prior_box_code_type code_type, const bool variance_encoded_in_target, + const bounding_box& bbox, bounding_box* decoded_bbox, + const bool prior_is_normalized, const size_t image_width, const size_t image_height, const bool clip_before_nms) + { + float prior_bbox_xmin = prior_bbox.xmin; + float prior_bbox_ymin = prior_bbox.ymin; + float prior_bbox_xmax = prior_bbox.xmax; + float prior_bbox_ymax = prior_bbox.ymax; + + float bbox_xmin = bbox.xmin; + float bbox_ymin = bbox.ymin; + float bbox_xmax = bbox.xmax; + float bbox_ymax = bbox.ymax; + + if (!prior_is_normalized) { + prior_bbox_xmin /= image_width; + prior_bbox_ymin /= image_height; + prior_bbox_xmax /= image_width; + prior_bbox_ymax /= image_height; + } + + switch (code_type) + { + case prior_box_code_type::corner: + { + if (variance_encoded_in_target) + { + // variance is encoded in target, we simply need to add the offset predictions. + decoded_bbox->xmin = prior_bbox_xmin + bbox_xmin; + decoded_bbox->ymin = prior_bbox_ymin + bbox_ymin; + decoded_bbox->xmax = prior_bbox_xmax + bbox_xmax; + decoded_bbox->ymax = prior_bbox_ymax + bbox_ymax; + } + else + { + // variance is encoded in bbox, we need to scale the offset accordingly. + decoded_bbox->xmin = prior_bbox_xmin + prior_variance[0] * bbox_xmin; + decoded_bbox->ymin = prior_bbox_ymin + prior_variance[1] * bbox_ymin; + decoded_bbox->xmax = prior_bbox_xmax + prior_variance[2] * bbox_xmax; + decoded_bbox->ymax = prior_bbox_ymax + prior_variance[3] * bbox_ymax; + } + break; + } + case prior_box_code_type::center_size: + { + const float prior_width = prior_bbox_xmax - prior_bbox_xmin; + assert(prior_width > 0); + const float prior_height = prior_bbox_ymax - prior_bbox_ymin; + assert(prior_height > 0); + const float prior_center_x = (prior_bbox_xmin + prior_bbox_xmax) / 2.f; + const float prior_center_y = (prior_bbox_ymin + prior_bbox_ymax) / 2.f; + float decode_bbox_center_x, decode_bbox_center_y; + float decode_bbox_width, decode_bbox_height; + if (variance_encoded_in_target) + { + // variance is encoded in target, we simply need to restore the offset predictions. + decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x; + decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y; + decode_bbox_width = (exp(bbox_xmax) * prior_width); + decode_bbox_height = (exp(bbox_ymax) * prior_height); + } + else + { + // variance is encoded in bbox, we need to scale the offset accordingly. 
+ decode_bbox_center_x = prior_variance[0] * bbox_xmin * prior_width + prior_center_x; + decode_bbox_center_y = prior_variance[1] * bbox_ymin * prior_height + prior_center_y; + decode_bbox_width = (exp(prior_variance[2] * bbox_xmax) * prior_width); + decode_bbox_height = (exp(prior_variance[3] * bbox_ymax) * prior_height); + } + decoded_bbox->xmin = decode_bbox_center_x - decode_bbox_width / 2.0f; + decoded_bbox->ymin = decode_bbox_center_y - decode_bbox_height / 2.0f; + decoded_bbox->xmax = decode_bbox_center_x + decode_bbox_width / 2.0f; + decoded_bbox->ymax = decode_bbox_center_y + decode_bbox_height / 2.0f; + break; + } + case prior_box_code_type::corner_size: + { + const float prior_width = prior_bbox_xmax - prior_bbox_xmin; + assert(prior_width > 0); + const float prior_height = prior_bbox_ymax - prior_bbox_ymin; + assert(prior_height > 0); + if (variance_encoded_in_target) + { + // variance is encoded in target, we simply need to add the offset predictions. + decoded_bbox->xmin = prior_bbox_xmin + bbox_xmin * prior_width; + decoded_bbox->ymin = prior_bbox_ymin + bbox_ymin * prior_height; + decoded_bbox->xmax = prior_bbox_xmax + bbox_xmax * prior_width; + decoded_bbox->ymax = prior_bbox_ymax + bbox_ymax * prior_height; + } + else + { + // variance is encoded in bbox, we need to scale the offset accordingly. + decoded_bbox->xmin = prior_bbox_xmin + prior_variance[0] * bbox_xmin * prior_width; + decoded_bbox->ymin = prior_bbox_ymin + prior_variance[1] * bbox_ymin * prior_height; + decoded_bbox->xmax = prior_bbox_xmax + prior_variance[2] * bbox_xmax * prior_width; + decoded_bbox->ymax = prior_bbox_ymax + prior_variance[3] * bbox_ymax * prior_height; + } + break; + } + default: + { + assert(0); + } + } + + if (clip_before_nms) + { + decoded_bbox->xmin = std::max(0.0f, std::min(1.0f, decoded_bbox->xmin)); + decoded_bbox->ymin = std::max(0.0f, std::min(1.0f, decoded_bbox->ymin)); + decoded_bbox->xmax = std::max(0.0f, std::min(1.0f, decoded_bbox->xmax)); + decoded_bbox->ymax = std::max(0.0f, std::min(1.0f, decoded_bbox->ymax)); + } + } + + static void apply_nms(const std::vector& bboxes, + std::vector>& scores, + const float nms_threshold, const float eta, const int top_k) + { + // Sort the scores in descending order and keep top_k scores if needed. 
+ if ((top_k != -1) && ((int)scores.size() > top_k)) + { + std::partial_sort(scores.begin(), scores.begin() + top_k, scores.end(), [](const std::pair& p1, const std::pair& p2) { return (p1.first > p2.first) || (p1.first == p2.first && p1.second < p2.second); }); + scores.resize(top_k); + } + else + { + std::stable_sort(scores.begin(), scores.end(), [](const std::pair& p1, const std::pair& p2) { return p1.first > p2.first; }); + } + + // NMS + float adaptive_threshold = nms_threshold; + int post_nms_count = 0; + + for (auto score_index : scores) + { + const int idx = score_index.second; + bounding_box box1(bboxes[idx]); + bool keep = true; + for (int i = 0; i < post_nms_count; ++i) + { + if (!keep) + { + break; + } + bounding_box box2(bboxes[scores[i].second]); + bool intersecting = (box1.xmin < box2.xmax) & (box2.xmin < box1.xmax) & (box1.ymin < box2.ymax) & (box2.ymin < box1.ymax); + float overlap = 0.0f; + if (intersecting) + { + const float intersect_width = std::min(box1.xmax, box2.xmax) - std::max(box1.xmin, box2.xmin); + const float intersect_height = std::min(box1.ymax, box2.ymax) - std::max(box1.ymin, box2.ymin); + const float intersect_size = intersect_width * intersect_height; + overlap = intersect_size / (box1.area() + box2.area() - intersect_size); + } + keep = (overlap <= adaptive_threshold); + } + if (keep) + { + scores[post_nms_count] = score_index; + ++post_nms_count; + } + if (keep && eta < 1 && adaptive_threshold > 0.5) + { + adaptive_threshold *= eta; + } + } + scores.resize(post_nms_count); // scores holds only the items that were kept after the NMS. + } + + template + void generate_detections(const detection_output_inst& instance, const int num_of_images, const std::vector>>& all_bboxes, std::vector>>>& confidences) + { + mem_lock lock{ instance.output_memory() }; + auto out_ptr = lock.begin(); + + const auto& args = instance.argument; + std::vector>>> final_detections; // Per image -> For each label: Pair (score, prior index) + for (int image = 0; image < num_of_images; ++image) + { + const std::vector >& bboxes_per_image = all_bboxes[image]; + std::vector>>& conf_per_image = confidences[image]; + int num_det = 0; +#ifdef FIX_OPENMP_RELEASE_ISSUE +#ifdef OPENMP_FOUND + int num_available_threads = omp_get_max_threads(); + //half available threads usage shows the best perf results for both SKL (4c8t) and APL (4c4t) for this part of detection output + int num_threads_to_use = (omp_in_parallel() == 0) ? num_available_threads/2 : 1; + #pragma omp parallel for num_threads(num_threads_to_use) reduction(+:num_det) +#endif +#endif + for (int cls = 0; cls < (int)args.num_classes; ++cls) + { + if ((int)cls == args.background_label_id) + { + conf_per_image[cls].clear(); + continue; // Skip background class. + } + std::vector>& scores = conf_per_image[cls]; + const int label = args.share_location ? 0 : cls; + apply_nms(bboxes_per_image[label], scores, args.nms_threshold, args.eta, args.top_k); + num_det += (int)scores.size(); + } + if (num_det > args.keep_top_k) + { + std::vector>> score_index_pairs; + score_index_pairs.reserve(num_det); + for (int label = 0; label < (int)args.num_classes; ++label) + { + std::vector>& scores = confidences[image][label]; + for (std::pair score_index : scores) + { + score_index_pairs.emplace_back(score_index.first, std::make_pair(label, score_index.second)); + } + } + + // Keep top k results per image. 
+ auto sort_function = [](const std::pair>& p1, const std::pair>& p2) { return p1.first > p2.first; }; + if ((int)score_index_pairs.size() > args.keep_top_k) + { + std::partial_sort(score_index_pairs.begin(), score_index_pairs.begin() + args.keep_top_k, score_index_pairs.end(), sort_function); + score_index_pairs.resize(args.keep_top_k); + } + else + { + std::sort(score_index_pairs.begin(), score_index_pairs.end(), sort_function); + } + + // Store the new indices. + std::vector>> new_indices(args.num_classes); + for (int j = 0; j < (int)score_index_pairs.size(); ++j) + { + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + new_indices[label].emplace_back(score_index_pairs[j].first, idx); + } + final_detections.emplace_back(new_indices); + } + else + { + final_detections.emplace_back(confidences[image]); + } + } + + int count = 0; + for (int image = 0; image < num_of_images; ++image) + { + const std::vector >& bboxes_per_image = all_bboxes[image]; + auto& final_detections_per_image = final_detections[image]; + for (int label = 0; label < (int)final_detections_per_image.size(); ++label) + { + int loc_label = args.share_location ? 0 : label; + const std::vector& bboxes = bboxes_per_image[loc_label]; + const std::vector>& label_detections = final_detections_per_image[label]; + for (std::pair score_prior : label_detections) + { + out_ptr[count * DETECTION_OUTPUT_ROW_SIZE] = (dtype)(float)image; + out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 1] = args.decrease_label_id ? ((dtype)((float)label - 1.0f)) + : (dtype)(float)label; + out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 2] = (dtype)score_prior.first; + const bounding_box& bbox = bboxes[score_prior.second]; + float xmin = bbox.xmin; + float ymin = bbox.ymin; + float xmax = bbox.xmax; + float ymax = bbox.ymax; + + if (args.clip_after_nms) + { + xmin = std::max(0.0f, std::min(1.0f, xmin)); + ymin = std::max(0.0f, std::min(1.0f, ymin)); + xmax = std::max(0.0f, std::min(1.0f, xmax)); + ymax = std::max(0.0f, std::min(1.0f, ymax)); + } + + out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 3] = (dtype)xmin; + out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 4] = (dtype)ymin; + out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 5] = (dtype)xmax; + out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 6] = (dtype)ymax; + ++count; + } + } + } + + // In case the number of detections is smaller than keep_top_k, fill the rest of the buffer with an invalid image id (-1). + while (count < num_of_images*args.keep_top_k) + { + out_ptr[count * DETECTION_OUTPUT_ROW_SIZE] = (dtype)-1.f; + out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 1] = (dtype)0.f; + out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 2] = (dtype)0.f; + out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 3] = (dtype)0.f; + out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 4] = (dtype)0.f; + out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 5] = (dtype)0.f; + out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 6] = (dtype)0.f; + ++count; + } + } + + // Compute the linear index taking the padding into account. + static inline int get_linear_feature_index(const int batch_id, const int feature_id, const int input_buffer_size_f, const int input_buffer_size_y, + const int input_buffer_size_x, const int input_padding_lower_y, const int input_padding_lower_x) + { + // This helper function assumes input layout with x_size = 1 and y_size = 1; + // Location and confidence inputs should be tensors with size {b,f,1,1}. + // This is validated in detection output primitive instance creation.
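+ // [Editor's note, kept as a comment since this lands inside the function:
+ // a worked instance of the formula below, with hypothetical sizes. With
+ // input_buffer_size_f = 8, input_buffer_size_y = input_buffer_size_x = 1
+ // and zero padding, get_linear_feature_index(2, 3, 8, 1, 1, 0, 0) yields
+ // (2 * 8 + 3) * 1 * 1 + (0 * 1 + 0) = 19, i.e. plain batch-major b*F + f
+ // indexing; the padding terms only skip past a padded border when present.]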
+ + int input_idx = (batch_id * input_buffer_size_f + feature_id) * input_buffer_size_y * input_buffer_size_x; + input_idx += input_padding_lower_y * input_buffer_size_x + input_padding_lower_x; + + return input_idx; + } + + template + void extract_locations_per_image(const detection_output_inst& instance, std::vector>>& locations, const int num_of_priors, const int num_loc_classes) + { + const bool share_location = instance.argument.share_location; + auto& input_location = instance.location_memory(); + const int num_of_images = (int)locations.size(); + + mem_lock lock{ input_location }; + auto location_data = lock.begin(); + + assert(num_of_priors * num_loc_classes * PRIOR_BOX_SIZE == input_location.get_layout().size.feature[0]); + + const auto& input_buffer_size = input_location.get_layout().get_buffer_size(); + const int input_buffer_size_x = input_buffer_size.spatial[0]; + const int input_buffer_size_y = input_buffer_size.spatial[1]; + const int input_buffer_size_f = input_buffer_size.feature[0]; + const auto& input_padding = input_location.get_layout().data_padding; + const int input_padding_lower_x = input_padding.lower_size().spatial[0]; + const int input_padding_lower_y = input_padding.lower_size().spatial[1]; + + for (int image = 0; image < num_of_images; ++image) + { + std::vector>& label_to_bbox = locations[image]; + label_to_bbox.resize(num_loc_classes); + for (int cls = 0; cls < num_loc_classes; ++cls) + { + int label = share_location ? 0 : cls; + auto & bboxes = label_to_bbox[label]; + bboxes.resize(num_of_priors); + + for (int prior = 0; prior < num_of_priors; ++prior) + { + int idx = prior * num_loc_classes * PRIOR_BOX_SIZE; + bboxes[prior].xmin = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE, input_buffer_size_f, input_buffer_size_y, + input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]); + bboxes[prior].ymin = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE + 1, input_buffer_size_f, input_buffer_size_y, + input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]); + bboxes[prior].xmax = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE + 2, input_buffer_size_f, input_buffer_size_y, + input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]); + bboxes[prior].ymax = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE + 3, input_buffer_size_f, input_buffer_size_y, + input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]); + } + } + } + } + + template + void extract_prior_boxes_and_variances(const detection_output_inst& instance, const bool variance_encoded_in_target, + const int32_t prior_info_size, const int32_t prior_coordinates_offset, const int32_t images_count, + std::vector& prior_bboxes, std::vector>& prior_variances) + { + auto& input_prior_box = instance.prior_box_memory(); + const int num_of_priors = (int)prior_bboxes.size() / images_count; + + mem_lock lock{ input_prior_box }; + for (int i = 0; i < images_count; i++) + { + auto prior_box_data = lock.begin() + i*num_of_priors*prior_info_size * (variance_encoded_in_target ? 
1 : 2); + + for (int prior = 0; prior < num_of_priors; ++prior) + { + int idx = prior * prior_info_size + prior_coordinates_offset; + prior_bboxes[i*num_of_priors + prior] = bounding_box((float)(prior_box_data[idx]), (float)(prior_box_data[idx + 1]), (float)(prior_box_data[idx + 2]), (float)(prior_box_data[idx + 3])); + idx += num_of_priors * prior_info_size; + for (int j = 0; j < PRIOR_BOX_SIZE; ++j) + { + prior_variances[i*num_of_priors + prior][j] = variance_encoded_in_target ? 0.0f : (float)(prior_box_data[idx + j]); + } + } + + } + } + + template + void extract_confidences_per_image(const detection_output_inst& instance, std::vector>>>& confidences, const int num_of_priors) + { + const int num_classes = instance.argument.num_classes; + + const int num_of_images = (int)confidences.size(); + auto& input_confidence = instance.confidence_memory(); + const float confidence_threshold = instance.argument.confidence_threshold; + + mem_lock lock{ &input_confidence }; + auto confidence_data = lock.begin(); + + assert(num_of_priors * num_classes == input_confidence.get_layout().size.feature[0]); + + const auto& input_buffer_size = input_confidence.get_layout().get_buffer_size(); + const int input_buffer_size_x = input_buffer_size.spatial[0]; + const int input_buffer_size_y = input_buffer_size.spatial[1]; + const int input_buffer_size_f = input_buffer_size.feature[0]; + const auto& input_padding = input_confidence.get_layout().data_padding; + const int input_padding_lower_x = input_padding.lower_size().spatial[0]; + const int input_padding_lower_y = input_padding.lower_size().spatial[1]; + const int stride = input_buffer_size_y * input_buffer_size_x; + + for (int image = 0; image < num_of_images; ++image) + { + std::vector>>& label_to_scores = confidences[image]; + label_to_scores.resize(num_classes); + int idx = get_linear_feature_index(image, 0, input_buffer_size_f, input_buffer_size_y, + input_buffer_size_x, input_padding_lower_y, input_padding_lower_x); + + if (stride == 1 && std::is_same::value) + { + float const* confidence_ptr_float = (float const*)(&(*confidence_data)); + confidence_ptr_float += idx; + __m128 threshold = _mm_load_ps1(&confidence_threshold); + for (int prior = 0; prior < num_of_priors; ++prior) + { + int cls = 0; + for (; cls + 3 < num_classes; cls += 4) + { + __m128 scores = _mm_loadu_ps(confidence_ptr_float); + confidence_ptr_float += 4; + __m128i mask128 = _mm_castps_si128(_mm_cmpgt_ps(scores, threshold)); + if (_mm_testz_si128(mask128, mask128)) + { + continue; + } + int mask = _mm_movemask_ps(_mm_castsi128_ps(mask128)); + if (mask & 1) + { + label_to_scores[cls + 0].emplace_back(_mm_cvtss_f32(scores), prior); + } + if (mask & 2) + { + int score = _mm_extract_ps(scores, 1); + float s = reinterpret_cast(score); + label_to_scores[cls + 1].emplace_back(s, prior); + } + if (mask & 4) + { + int score = _mm_extract_ps(scores, 2); + float s = reinterpret_cast(score); + label_to_scores[cls + 2].emplace_back(s, prior); + } + if (mask & 8) + { + int score = _mm_extract_ps(scores, 3); + float s = reinterpret_cast(score); + label_to_scores[cls + 3].emplace_back(s, prior); + } + } + for (; cls < num_classes; ++cls) + { + float score = *confidence_ptr_float; + if (score > confidence_threshold) + { + label_to_scores[cls].emplace_back(score, prior); + } + ++confidence_ptr_float; + } + } + } + else + { + for (int prior = 0; prior < num_of_priors; ++prior) + { + for (int cls = 0; cls < num_classes; ++cls) + { + float score = (float)confidence_data[idx]; + if (score > 
confidence_threshold) + { + label_to_scores[cls].emplace_back(score, prior); + } + idx += stride; + } + } + } + } + } + + template + void prepare_data(const detection_output_inst& instance, std::vector>> &bboxes, std::vector>>>& confidences) + { + assert(bboxes.size() == confidences.size()); + + const auto& args = instance.argument; + + const int num_of_images = (int)bboxes.size(); + const int num_of_priors = instance.prior_box_memory().get_layout().size.spatial[1] / args.prior_info_size; + const int num_loc_classes = args.share_location ? 1 : args.num_classes; + + // Extract locations per image. + std::vector>> locations(num_of_images); // Per image : label -> bounding boxes. + extract_locations_per_image(instance, locations, num_of_priors, num_loc_classes); + + int32_t batches_in_prior_boxes = instance.prior_box_memory().get_layout().size.batch[0]; + std::vector prior_bboxes(batches_in_prior_boxes*num_of_priors); // Prior-Boxes (identical for all images since we assume all images in a batch are of same dimension). + std::vector> prior_variances(batches_in_prior_boxes*num_of_priors); // Variances per prior-box (identical for all images since we assume all images in a batch are of same dimension). + extract_prior_boxes_and_variances(instance, args.variance_encoded_in_target, + args.prior_info_size, args.prior_coordinates_offset, batches_in_prior_boxes, + prior_bboxes, prior_variances); + + // Create the decoded bounding boxes according to locations predictions and prior-boxes. + for (int image = 0; image < num_of_images; ++image) + { + std::vector>& bboxes_per_image = bboxes[image]; + bboxes_per_image.resize(num_loc_classes); + locations[image].resize(num_loc_classes); + for (int cls = 0; cls < num_loc_classes; ++cls) + { + const int label = args.share_location ? 0 : cls; + if (!args.share_location && label == args.background_label_id) + { + continue; // Skip background class. + } + const std::vector& label_loc_preds = locations[image][label]; + int label_loc_preds_size = (int)label_loc_preds.size(); + + bboxes_per_image[label].clear(); + + for (int i = 0; i < label_loc_preds_size; ++i) + { + bounding_box decoded_bbox; + int32_t pb_offset = (batches_in_prior_boxes > 1) ? (image*num_of_priors + i) : i; + int32_t var_offset = (batches_in_prior_boxes > 1) ? (image*num_of_priors + i) : i; + decode_bounding_box(prior_bboxes[pb_offset], prior_variances[var_offset], + args.code_type, args.variance_encoded_in_target, label_loc_preds[i], &decoded_bbox, + args.prior_is_normalized, args.input_width, args.input_height, args.clip_before_nms); + bboxes_per_image[label].emplace_back(decoded_bbox); + } + } + } + + // Extract confidences per image. + extract_confidences_per_image(instance, confidences, num_of_priors); + } + + event_impl::ptr execute_impl(const std::vector& events, detection_output_inst& instance) override + { + for (auto& a : events) + { + a->wait(); + } + + auto ev = instance.get_network().get_engine().create_user_event(false); + + const int num_of_images = instance.location_memory().get_layout().size.batch[0]; //batch size + + std::vector>> bboxes(num_of_images); // Per image : label -> decoded bounding boxes. + std::vector>>> confidences(num_of_images); // Per image : class -> confidences per bounding box. 
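+ // [Editor's note: the branch below dispatches on the runtime element type.
+ // The stripped template arguments here originally spelled out a
+ // data_type_to_type<...>::type trait, resolving to float for f32 and to
+ // clDNN's half wrapper for f16, so prepare_data() and generate_detections()
+ // are each instantiated once per element type and the matching pair is
+ // picked at run time.]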
+ + if (instance.location_memory().get_layout().data_type == data_types::f32) + { + prepare_data::type>(instance, bboxes, confidences); + + generate_detections::type>(instance, num_of_images, bboxes, confidences); + } + else + { + prepare_data::type>(instance, bboxes, confidences); + + generate_detections::type>(instance, num_of_images, bboxes, confidences); + } + + dynamic_cast(ev.get())->set(); // set as complete + // TODO: consider refactoring create_user_event() to return cldnn::user_event* + return ev; + } + + static primitive_impl* create(const detection_output_node& arg) + { + return new detection_output_cpu(arg); + } +}; + +primitive_impl* runDetectOutCpu(const detection_output_node& arg) +{ + return new detection_output_cpu(arg); +} + +}} diff --git a/inference-engine/thirdparty/clDNN/src/gpu/detection_output_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/detection_output_gpu.cpp index 55754a8..bfafd18 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/detection_output_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/detection_output_gpu.cpp @@ -15,17 +15,11 @@ */ #include "detection_output_inst.h" -#include "kernel.h" -#include "kd_selector.h" -#include "network_impl.h" -#include "implementation_map.h" -#include "math_utils.h" - -#include -#include -#include -#include -#include +#include "primitive_gpu_base.h" +#include "error_handler.h" +#include "kernel_selector_helper.h" +#include "detection_output/detection_output_kernel_base.h" +#include "detection_output/detection_output_kernel_selector.h" #ifdef FIX_OPENMP_RELEASE_ISSUE #ifdef OPENMP_FOUND @@ -35,606 +29,134 @@ namespace cldnn { namespace gpu { -namespace { - struct bounding_box - { - float xmin; - float ymin; - float xmax; - float ymax; - - bounding_box() : xmin(0), ymin(0), xmax(0), ymax(0) {} - - bounding_box(const float xmin, const float ymin, const float xmax, const float ymax) : - xmin(xmin), ymin(ymin), xmax(xmax), ymax(ymax) {} - - // Computes the area of a bounding box. - float area() const - { - return (xmax - xmin) * (ymax - ymin); - } - }; -} - -struct detection_output_gpu : typed_primitive_impl +struct detection_output_gpu : typed_primitive_gpu_impl { - const detection_output_node& outer; - - detection_output_gpu(const detection_output_node& outer) - : outer(outer) - {} + using parent = typed_primitive_gpu_impl; + using parent::parent; - static void decode_bounding_box( - const bounding_box& prior_bbox, const std::array& prior_variance, - const prior_box_code_type code_type, const bool variance_encoded_in_target, - const bounding_box& bbox, bounding_box* decoded_bbox, - const bool prior_is_normalized, const size_t image_width, const size_t image_height, const bool clip) +private: + static void setDetectOutSpecificParams(kernel_selector::detection_output_params::DedicatedParams& detectOutParams, const detection_output_node& arg) { - float prior_bbox_xmin = prior_bbox.xmin; - float prior_bbox_ymin = prior_bbox.ymin; - float prior_bbox_xmax = prior_bbox.xmax; - float prior_bbox_ymax = prior_bbox.ymax; - - float bbox_xmin = bbox.xmin; - float bbox_ymin = bbox.ymin; - float bbox_xmax = bbox.xmax; - float bbox_ymax = bbox.ymax; - - if (!prior_is_normalized) { - prior_bbox_xmin /= image_width; - prior_bbox_ymin /= image_height; - prior_bbox_xmax /= image_width; - prior_bbox_ymax /= image_height; - } - - switch (code_type) - { - case prior_box_code_type::corner: - { - if (variance_encoded_in_target) - { - // variance is encoded in target, we simply need to add the offset predictions. 
- decoded_bbox->xmin = prior_bbox_xmin + bbox_xmin; - decoded_bbox->ymin = prior_bbox_ymin + bbox_ymin; - decoded_bbox->xmax = prior_bbox_xmax + bbox_xmax; - decoded_bbox->ymax = prior_bbox_ymax + bbox_ymax; - } - else - { - // variance is encoded in bbox, we need to scale the offset accordingly. - decoded_bbox->xmin = prior_bbox_xmin + prior_variance[0] * bbox_xmin; - decoded_bbox->ymin = prior_bbox_ymin + prior_variance[1] * bbox_ymin; - decoded_bbox->xmax = prior_bbox_xmax + prior_variance[2] * bbox_xmax; - decoded_bbox->ymax = prior_bbox_ymax + prior_variance[3] * bbox_ymax; - } - break; - } - case prior_box_code_type::center_size: - { - const float prior_width = prior_bbox_xmax - prior_bbox_xmin; - assert(prior_width > 0); - const float prior_height = prior_bbox_ymax - prior_bbox_ymin; - assert(prior_height > 0); - const float prior_center_x = (prior_bbox_xmin + prior_bbox_xmax) / 2.f; - const float prior_center_y = (prior_bbox_ymin + prior_bbox_ymax) / 2.f; - float decode_bbox_center_x, decode_bbox_center_y; - float decode_bbox_width, decode_bbox_height; - if (variance_encoded_in_target) - { - // variance is encoded in target, we simply need to restore the offset predictions. - decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x; - decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y; - decode_bbox_width = (exp(bbox_xmax) * prior_width); - decode_bbox_height = (exp(bbox_ymax) * prior_height); - } - else - { - // variance is encoded in bbox, we need to scale the offset accordingly. - decode_bbox_center_x = prior_variance[0] * bbox_xmin * prior_width + prior_center_x; - decode_bbox_center_y = prior_variance[1] * bbox_ymin * prior_height + prior_center_y; - decode_bbox_width = (exp(prior_variance[2] * bbox_xmax) * prior_width); - decode_bbox_height = (exp(prior_variance[3] * bbox_ymax) * prior_height); - } - decoded_bbox->xmin = decode_bbox_center_x - decode_bbox_width / 2.0f; - decoded_bbox->ymin = decode_bbox_center_y - decode_bbox_height / 2.0f; - decoded_bbox->xmax = decode_bbox_center_x + decode_bbox_width / 2.0f; - decoded_bbox->ymax = decode_bbox_center_y + decode_bbox_height / 2.0f; - break; - } - case prior_box_code_type::corner_size: - { - const float prior_width = prior_bbox_xmax - prior_bbox_xmin; - assert(prior_width > 0); - const float prior_height = prior_bbox_ymax - prior_bbox_ymin; - assert(prior_height > 0); - if (variance_encoded_in_target) - { - // variance is encoded in target, we simply need to add the offset predictions. - decoded_bbox->xmin = prior_bbox_xmin + bbox_xmin * prior_width; - decoded_bbox->ymin = prior_bbox_ymin + bbox_ymin * prior_height; - decoded_bbox->xmax = prior_bbox_xmax + bbox_xmax * prior_width; - decoded_bbox->ymax = prior_bbox_ymax + bbox_ymax * prior_height; - } - else - { - // variance is encoded in bbox, we need to scale the offset accordingly. 
- decoded_bbox->xmin = prior_bbox_xmin + prior_variance[0] * bbox_xmin * prior_width; - decoded_bbox->ymin = prior_bbox_ymin + prior_variance[1] * bbox_ymin * prior_height; - decoded_bbox->xmax = prior_bbox_xmax + prior_variance[2] * bbox_xmax * prior_width; - decoded_bbox->ymax = prior_bbox_ymax + prior_variance[3] * bbox_ymax * prior_height; - } - break; - } - default: - { - assert(0); - } - } - - if (clip) - { - decoded_bbox->xmin = std::max(0.0f, std::min(1.0f, decoded_bbox->xmin)); - decoded_bbox->ymin = std::max(0.0f, std::min(1.0f, decoded_bbox->ymin)); - decoded_bbox->xmax = std::max(0.0f, std::min(1.0f, decoded_bbox->xmax)); - decoded_bbox->ymax = std::max(0.0f, std::min(1.0f, decoded_bbox->ymax)); - } + auto primitive = arg.get_primitive(); + detectOutParams.keep_top_k = primitive->keep_top_k; + detectOutParams.num_classes = primitive->num_classes; + detectOutParams.top_k = primitive->top_k; + detectOutParams.background_label_id = primitive->background_label_id; + detectOutParams.code_type = (int32_t)primitive->code_type; + detectOutParams.share_location = primitive->share_location; + detectOutParams.variance_encoded_in_target = primitive->variance_encoded_in_target; + detectOutParams.nms_threshold = primitive->nms_threshold; + detectOutParams.eta = primitive->eta; + detectOutParams.confidence_threshold = primitive->confidence_threshold; + detectOutParams.prior_coordinates_offset = primitive->prior_coordinates_offset; + detectOutParams.prior_info_size = primitive->prior_info_size; + detectOutParams.prior_is_normalized = primitive->prior_is_normalized; + detectOutParams.input_width = primitive->input_width; + detectOutParams.input_heigh = primitive->input_height; + detectOutParams.conf_size_x = arg.confidence().get_output_layout().get_buffer_size().spatial[0]; + detectOutParams.conf_size_y = arg.confidence().get_output_layout().get_buffer_size().spatial[1]; + detectOutParams.conf_padding_x = arg.confidence().get_output_layout().data_padding.lower_size().spatial[0]; + detectOutParams.conf_padding_y = arg.confidence().get_output_layout().data_padding.lower_size().spatial[1]; } - static void apply_nms(const std::vector& bboxes, - std::vector>& scores, - const float nms_threshold, const float eta, const int top_k) - { - // Sort the scores in descending order and keep top_k scores if needed. 
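
The sort that follows keeps only the top_k highest-scoring (score, prior index) pairs before NMS; std::partial_sort avoids ordering the tail that is about to be discarded. A compact sketch of that step (illustrative signature, not the clDNN API):

    #include <algorithm>
    #include <utility>
    #include <vector>

    void keep_top_k(std::vector<std::pair<float, int>>& scores, int top_k)
    {
        auto by_score_desc = [](const std::pair<float, int>& a,
                                const std::pair<float, int>& b)
        { return a.first > b.first || (a.first == b.first && a.second < b.second); };

        if (top_k != -1 && static_cast<int>(scores.size()) > top_k)
        {
            // Only the first top_k elements need ordering; the tail is dropped.
            std::partial_sort(scores.begin(), scores.begin() + top_k, scores.end(), by_score_desc);
            scores.resize(top_k);
        }
        else
        {
            std::stable_sort(scores.begin(), scores.end(), by_score_desc);
        }
    }

    int main()
    {
        std::vector<std::pair<float, int>> scores = { {0.3f, 0}, {0.9f, 1}, {0.7f, 2}, {0.8f, 3} };
        keep_top_k(scores, 2); // leaves (0.9, 1) and (0.8, 3)
        return 0;
    }
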
- if ((top_k != -1) && ((int)scores.size() > top_k)) - { - std::partial_sort(scores.begin(), scores.begin() + top_k, scores.end(), [](const std::pair& p1, const std::pair& p2) { return (p1.first > p2.first) || (p1.first == p2.first && p1.second < p2.second); }); - scores.resize(top_k); - } - else - { - std::stable_sort(scores.begin(), scores.end(), [](const std::pair& p1, const std::pair& p2) { return p1.first > p2.first; }); - } - - // NMS - float adaptive_threshold = nms_threshold; - int post_nms_count = 0; - for (auto score_index : scores) - { - const int idx = score_index.second; - bounding_box box1(bboxes[idx]); - bool keep = true; - for (int i = 0; i < post_nms_count; ++i) - { - if (!keep) - { - break; - } - bounding_box box2(bboxes[scores[i].second]); - bool intersecting = (box1.xmin < box2.xmax) & (box2.xmin < box1.xmax) & (box1.ymin < box2.ymax) & (box2.ymin < box1.ymax); - float overlap = 0.0f; - if (intersecting) - { - const float intersect_width = std::min(box1.xmax, box2.xmax) - std::max(box1.xmin, box2.xmin); - const float intersect_height = std::min(box1.ymax, box2.ymax) - std::max(box1.ymin, box2.ymin); - const float intersect_size = intersect_width * intersect_height; - overlap = intersect_size / (box1.area() + box2.area() - intersect_size); - } - keep = (overlap <= adaptive_threshold); - } - if (keep) - { - scores[post_nms_count] = score_index; - ++post_nms_count; - } - if (keep && eta < 1 && adaptive_threshold > 0.5) - { - adaptive_threshold *= eta; - } - } - scores.resize(post_nms_count); // scores holds only the items that were kept after the NMS. - } +public: - template - void generate_detections(const detection_output_inst& instance, const int num_of_images, const std::vector>>& all_bboxes, std::vector>>>& confidences) + static primitive_impl* create(const detection_output_node& arg) { - mem_lock lock{ instance.output_memory() }; - auto out_ptr = lock.begin(); - - const auto& args = instance.argument; - std::vector>>> final_detections; // Per image -> For each label: Pair (score, prior index) - for (int image = 0; image < num_of_images; ++image) - { - const std::vector >& bboxes_per_image = all_bboxes[image]; - std::vector>>& conf_per_image = confidences[image]; - int num_det = 0; -#ifdef FIX_OPENMP_RELEASE_ISSUE -#ifdef OPENMP_FOUND - int num_available_threads = omp_get_max_threads(); - //half available threads usage shows the best perf results for both SKL (4c8t) and APL (4c4t) for this part of detection output - int num_threads_to_use = (omp_in_parallel() == 0) ? num_available_threads/2 : 1; - #pragma omp parallel for num_threads(num_threads_to_use) reduction(+:num_det) -#endif -#endif - for (int cls = 0; cls < (int)args.num_classes; ++cls) - { - if ((int)cls == args.background_label_id) - { - conf_per_image[cls].clear(); - continue; // Skip background class. - } - std::vector>& scores = conf_per_image[cls]; - const int label = args.share_location ? 0 : cls; - apply_nms(bboxes_per_image[label], scores, args.nms_threshold, args.eta, args.top_k); - num_det += (int)scores.size(); - } - if (num_det > args.keep_top_k) - { - std::vector>> score_index_pairs; - score_index_pairs.reserve(num_det); - for (int label = 0; label < (int)args.num_classes; ++label) - { - std::vector>& scores = confidences[image][label]; - for (std::pair score_index : scores) - { - score_index_pairs.emplace_back(score_index.first, std::make_pair(label, score_index.second)); - } - } - - // Keep top k results per image. 
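
The loop above is a standard greedy NMS: candidates are visited in descending score order and kept only if their IoU against every already-kept box stays at or below the (possibly eta-adapted) threshold. A minimal equivalent without the adaptive-threshold detail:

    #include <algorithm>
    #include <utility>
    #include <vector>

    struct Box
    {
        float xmin, ymin, xmax, ymax;
        float area() const { return (xmax - xmin) * (ymax - ymin); }
    };

    float iou(const Box& a, const Box& b)
    {
        const float w = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin);
        const float h = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin);
        if (w <= 0.f || h <= 0.f) return 0.f;   // no overlap
        const float inter = w * h;
        return inter / (a.area() + b.area() - inter);
    }

    // scores: (score, box index) pairs already sorted by descending score.
    void nms(const std::vector<Box>& boxes, std::vector<std::pair<float, int>>& scores, float threshold)
    {
        int kept = 0;
        for (const auto& s : scores)
        {
            bool keep = true;
            for (int i = 0; i < kept && keep; ++i)
                keep = iou(boxes[s.second], boxes[scores[i].second]) <= threshold;
            if (keep) scores[kept++] = s;   // compact the survivors in place
        }
        scores.resize(kept);
    }

    int main()
    {
        std::vector<Box> boxes = { {0, 0, 2, 2}, {0.1f, 0.1f, 2, 2}, {5, 5, 6, 6} };
        std::vector<std::pair<float, int>> scores = { {0.9f, 0}, {0.8f, 1}, {0.7f, 2} };
        nms(boxes, scores, 0.5f); // drops box 1 (high IoU with box 0), keeps 0 and 2
        return 0;
    }
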
- auto sort_function = [](const std::pair>& p1, const std::pair>& p2) { return p1.first > p2.first; }; - if ((int)score_index_pairs.size() > args.keep_top_k) - { - std::partial_sort(score_index_pairs.begin(), score_index_pairs.begin() + args.keep_top_k, score_index_pairs.end(), sort_function); - score_index_pairs.resize(args.keep_top_k); - } - else - { - std::sort(score_index_pairs.begin(), score_index_pairs.end(), sort_function); - } - - // Store the new indices. - std::vector>> new_indices(args.num_classes); - for (int j = 0; j < (int)score_index_pairs.size(); ++j) - { - int label = score_index_pairs[j].second.first; - int idx = score_index_pairs[j].second.second; - new_indices[label].emplace_back(score_index_pairs[j].first, idx); - } - final_detections.emplace_back(new_indices); - } - else - { - final_detections.emplace_back(confidences[image]); - } - } - - int count = 0; - for (int image = 0; image < num_of_images; ++image) - { - const std::vector >& bboxes_per_image = all_bboxes[image]; - auto& final_detections_per_image = final_detections[image]; - for (int label = 0; label < (int)final_detections_per_image.size(); ++label) - { - int loc_label = args.share_location ? 0 : label; - const std::vector& bboxes = bboxes_per_image[loc_label]; - const std::vector>& label_detections = final_detections_per_image[label]; - for (std::pair score_prior : label_detections) - { - out_ptr[count * DETECTION_OUTPUT_ROW_SIZE] = (dtype)(float)image; - out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 1] = args.decrease_label_id ? ((dtype)((float)label - 1.0f)) - : (dtype)(float)label; - out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 2] = (dtype)score_prior.first; - const bounding_box& bbox = bboxes[score_prior.second]; - out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 3] = (dtype)bbox.xmin; - out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 4] = (dtype)bbox.ymin; - out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 5] = (dtype)bbox.xmax; - out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 6] = (dtype)bbox.ymax; - ++count; - } - } - } - - //In case number of detections is smaller than keep_top_k fill the rest of the buffer with invalid image id (-1). - while (count < num_of_images*args.keep_top_k) + if (!arg.get_program().get_options().get()->enabled()) { - out_ptr[count * DETECTION_OUTPUT_ROW_SIZE] = (dtype)-1.f; - out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 1] = (dtype)0.f; - out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 2] = (dtype)0.f; - out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 3] = (dtype)0.f; - out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 4] = (dtype)0.f; - out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 5] = (dtype)0.f; - out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 6] = (dtype)0.f; - ++count; + return runDetectOutCpu(arg); } - } - // Compute the linear index taking the padding into account. - static inline int get_linear_feature_index(const int batch_id, const int feature_id, const int input_buffer_size_f, const int input_buffer_size_y, - const int input_buffer_size_x, const int input_padding_lower_y, const int input_padding_lower_x) - { - // This helper function assumes input layout with x_size = 1 and y_size = 1; - // Location and confidence inputs should be tensors with size {b,f,1,1}. - // This is validated in detection output primitive instance creation. 
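
Each emitted detection above occupies one row of DETECTION_OUTPUT_ROW_SIZE (7) floats, and leftover rows up to num_images * keep_top_k are marked invalid with image_id = -1. A small illustrative writer for that layout (helper names here are hypothetical):

    #include <vector>

    constexpr int ROW_SIZE = 7; // image_id, label, score, xmin, ymin, xmax, ymax

    void write_row(std::vector<float>& out, int row, float image, float label,
                   float score, float xmin, float ymin, float xmax, float ymax)
    {
        float* p = out.data() + row * ROW_SIZE;
        p[0] = image; p[1] = label; p[2] = score;
        p[3] = xmin;  p[4] = ymin;  p[5] = xmax;  p[6] = ymax;
    }

    // Remaining rows get image_id = -1 so consumers know to stop reading there.
    void pad_unused(std::vector<float>& out, int first_unused_row, int total_rows)
    {
        for (int row = first_unused_row; row < total_rows; ++row)
            write_row(out, row, -1.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f);
    }

    int main()
    {
        const int keep_top_k = 4;                 // one image, keep_top_k rows
        std::vector<float> out(keep_top_k * ROW_SIZE);
        write_row(out, 0, 0.f, 1.f, 0.92f, 0.1f, 0.2f, 0.4f, 0.6f);
        pad_unused(out, 1, keep_top_k);
        return 0;
    }
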
+ auto detect_out_params = get_default_params(arg); + auto detect_out_optional_params = get_default_optional_params(arg.get_program()); - int input_idx = (batch_id * input_buffer_size_f + feature_id) * input_buffer_size_y * input_buffer_size_x; - input_idx += input_padding_lower_y * input_buffer_size_x + input_padding_lower_x; + setDetectOutSpecificParams(detect_out_params.detectOutParams, arg); - return input_idx; - } + auto& kernel_selector = kernel_selector::detection_output_kernel_selector::Instance(); + auto best_kernels = kernel_selector.GetBestKernels(detect_out_params, detect_out_optional_params); - template - void extract_locations_per_image(const detection_output_inst& instance, std::vector>>& locations, const int num_of_priors, const int num_loc_classes) - { - const bool share_location = instance.argument.share_location; - auto& input_location = instance.location_memory(); - const int num_of_images = (int)locations.size(); + CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with this arguments"); - mem_lock lock{ input_location }; - auto location_data = lock.begin(); + auto detect_out = new detection_output_gpu(arg, best_kernels[0]); - assert(num_of_priors * num_loc_classes * PRIOR_BOX_SIZE == input_location.get_layout().size.feature[0]); - - const auto& input_buffer_size = input_location.get_layout().get_buffer_size(); - const int input_buffer_size_x = input_buffer_size.spatial[0]; - const int input_buffer_size_y = input_buffer_size.spatial[1]; - const int input_buffer_size_f = input_buffer_size.feature[0]; - const auto& input_padding = input_location.get_layout().data_padding; - const int input_padding_lower_x = input_padding.lower_size().spatial[0]; - const int input_padding_lower_y = input_padding.lower_size().spatial[1]; - - for (int image = 0; image < num_of_images; ++image) - { - std::vector>& label_to_bbox = locations[image]; - label_to_bbox.resize(num_loc_classes); - for (int cls = 0; cls < num_loc_classes; ++cls) - { - int label = share_location ? 
0 : cls; - auto & bboxes = label_to_bbox[label]; - bboxes.resize(num_of_priors); - - for (int prior = 0; prior < num_of_priors; ++prior) - { - int idx = prior * num_loc_classes * PRIOR_BOX_SIZE; - bboxes[prior].xmin = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE, input_buffer_size_f, input_buffer_size_y, - input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]); - bboxes[prior].ymin = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE + 1, input_buffer_size_f, input_buffer_size_y, - input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]); - bboxes[prior].xmax = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE + 2, input_buffer_size_f, input_buffer_size_y, - input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]); - bboxes[prior].ymax = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE + 3, input_buffer_size_f, input_buffer_size_y, - input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]); - } - } - } + return detect_out; } +}; - template - void extract_prior_boxes_and_variances(const detection_output_inst& instance, const bool variance_encoded_in_target, - const int32_t prior_info_size, const int32_t prior_coordinates_offset, - std::vector& prior_bboxes, std::vector>& prior_variances) - { - auto& input_prior_box = instance.prior_box_memory(); - const int num_of_priors = (int)prior_bboxes.size(); +primitive_impl* runDetectOutGpu(const detection_output_node& arg, kernel_selector::KernelData kernel) +{ + return new detection_output_gpu(arg, kernel); +} - mem_lock lock{ input_prior_box }; - auto prior_box_data = lock.begin(); +/************************ Detection Output keep_top_k part ************************/ - for (int prior = 0; prior < num_of_priors; ++prior) - { - int idx = prior * prior_info_size + prior_coordinates_offset; - prior_bboxes[prior] = bounding_box((float)(prior_box_data[idx]), (float)(prior_box_data[idx + 1]), (float)(prior_box_data[idx + 2]), (float)(prior_box_data[idx + 3])); - idx += num_of_priors * prior_info_size; - for (int j = 0; j < PRIOR_BOX_SIZE; ++j) - { - prior_variances[prior][j] = variance_encoded_in_target ? 
0.0f : (float)(prior_box_data[idx + j]); - } - } - } +struct detection_output_sort_gpu : typed_primitive_gpu_impl +{ + using parent = typed_primitive_gpu_impl; + using parent::parent; - template - void extract_confidences_per_image(const detection_output_inst& instance, std::vector>>>& confidences, const int num_of_priors) +private: + static void setDetectOutSpecificParams(kernel_selector::detection_output_params::DedicatedParams& detectOutParams, const detection_output_sort_node& arg) { - const int num_classes = instance.argument.num_classes; - - const int num_of_images = (int)confidences.size(); - auto& input_confidence = instance.confidence_memory(); - const float confidence_threshold = instance.argument.confidence_threshold; - - mem_lock lock{ &input_confidence }; - auto confidence_data = lock.begin(); - - assert(num_of_priors * num_classes == input_confidence.get_layout().size.feature[0]); - - const auto& input_buffer_size = input_confidence.get_layout().get_buffer_size(); - const int input_buffer_size_x = input_buffer_size.spatial[0]; - const int input_buffer_size_y = input_buffer_size.spatial[1]; - const int input_buffer_size_f = input_buffer_size.feature[0]; - const auto& input_padding = input_confidence.get_layout().data_padding; - const int input_padding_lower_x = input_padding.lower_size().spatial[0]; - const int input_padding_lower_y = input_padding.lower_size().spatial[1]; - const int stride = input_buffer_size_y * input_buffer_size_x; - - for (int image = 0; image < num_of_images; ++image) + if (arg.get_dependency(0).is_type()) { - std::vector>>& label_to_scores = confidences[image]; - label_to_scores.resize(num_classes); - int idx = get_linear_feature_index(image, 0, input_buffer_size_f, input_buffer_size_y, - input_buffer_size_x, input_padding_lower_y, input_padding_lower_x); - - if (stride == 1 && std::is_same::value) - { - float const* confidence_ptr_float = (float const*)(&(*confidence_data)); - confidence_ptr_float += idx; - __m128 threshold = _mm_load_ps1(&confidence_threshold); - for (int prior = 0; prior < num_of_priors; ++prior) - { - int cls = 0; - for (; cls + 3 < num_classes; cls += 4) - { - __m128 scores = _mm_loadu_ps(confidence_ptr_float); - confidence_ptr_float += 4; - __m128i mask128 = _mm_castps_si128(_mm_cmpgt_ps(scores, threshold)); - if (_mm_testz_si128(mask128, mask128)) - { - continue; - } - int mask = _mm_movemask_ps(_mm_castsi128_ps(mask128)); - if (mask & 1) - { - label_to_scores[cls + 0].emplace_back(_mm_cvtss_f32(scores), prior); - } - if (mask & 2) - { - int score = _mm_extract_ps(scores, 1); - float s = reinterpret_cast(score); - label_to_scores[cls + 1].emplace_back(s, prior); - } - if (mask & 4) - { - int score = _mm_extract_ps(scores, 2); - float s = reinterpret_cast(score); - label_to_scores[cls + 2].emplace_back(s, prior); - } - if (mask & 8) - { - int score = _mm_extract_ps(scores, 3); - float s = reinterpret_cast(score); - label_to_scores[cls + 3].emplace_back(s, prior); - } - } - for (; cls < num_classes; ++cls) - { - float score = *confidence_ptr_float; - if (score > confidence_threshold) - { - label_to_scores[cls].emplace_back(score, prior); - } - ++confidence_ptr_float; - } - } - } - else - { - for (int prior = 0; prior < num_of_priors; ++prior) - { - for (int cls = 0; cls < num_classes; ++cls) - { - float score = (float)confidence_data[idx]; - if (score > confidence_threshold) - { - label_to_scores[cls].emplace_back(score, prior); - } - idx += stride; - } - } - } + auto primitive = arg.get_dependency(0).as().get_primitive(); + 
detectOutParams.keep_top_k = primitive->keep_top_k; + detectOutParams.num_classes = primitive->num_classes; + detectOutParams.num_images = arg.get_dependency(0).as().location().get_output_layout().size.batch[0]; + detectOutParams.top_k = primitive->top_k; + detectOutParams.share_location = primitive->share_location; + detectOutParams.background_label_id = primitive->background_label_id; } - } - - template - void prepare_data(const detection_output_inst& instance, std::vector>> &bboxes, std::vector>>>& confidences) - { - assert(bboxes.size() == confidences.size()); - - const auto& args = instance.argument; - - const int num_of_images = (int)bboxes.size(); - const int num_of_priors = instance.prior_box_memory().get_layout().size.spatial[1] / args.prior_info_size; - const int num_loc_classes = args.share_location ? 1 : args.num_classes; - - // Extract locations per image. - std::vector>> locations(num_of_images); // Per image : label -> bounding boxes. - extract_locations_per_image(instance, locations, num_of_priors, num_loc_classes); - - // Extract prior boxes - same within a batch. - std::vector prior_bboxes(num_of_priors); // Prior-Boxes (identical for all images since we assume all images in a batch are of same dimension). - std::vector> prior_variances(num_of_priors); // Variances per prior-box (identical for all images since we assume all images in a batch are of same dimension). - extract_prior_boxes_and_variances(instance, args.variance_encoded_in_target, - args.prior_info_size, args.prior_coordinates_offset, - prior_bboxes, prior_variances); - - // Create the decoded bounding boxes according to locations predictions and prior-boxes. - for (int image = 0; image < num_of_images; ++image) + else { - std::vector>& bboxes_per_image = bboxes[image]; - bboxes_per_image.resize(num_loc_classes); - locations[image].resize(num_loc_classes); - for (int cls = 0; cls < num_loc_classes; ++cls) - { - const int label = args.share_location ? 0 : cls; - if (!args.share_location && label == args.background_label_id) - { - continue; // Skip background class. - } - const std::vector& label_loc_preds = locations[image][label]; - int label_loc_preds_size = (int)label_loc_preds.size(); - assert((int)prior_bboxes.size() == label_loc_preds_size); - - bboxes_per_image[label].clear(); - - for (int i = 0; i < label_loc_preds_size; ++i) - { - bounding_box decoded_bbox; - decode_bounding_box(prior_bboxes[i], prior_variances[i], args.code_type, args.variance_encoded_in_target, label_loc_preds[i], &decoded_bbox, - args.prior_is_normalized, args.input_width, args.input_height, args.clip); - bboxes_per_image[label].emplace_back(decoded_bbox); - } - } + auto primitive = arg.get_primitive(); + detectOutParams.keep_top_k = primitive->keep_top_k; + detectOutParams.num_classes = primitive->num_classes; + detectOutParams.num_images = primitive->num_images; + detectOutParams.top_k = primitive->top_k; + detectOutParams.share_location = primitive->share_location; + detectOutParams.background_label_id = primitive->background_label_id; } - - // Extract confidences per image. 
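
The confidence extraction removed above (including its SSE fast path) reduces to this scalar scan: for each prior and class, keep a (score, prior) pair whenever the score clears the confidence threshold. A sketch assuming a dense prior-major layout with stride 1:

    #include <utility>
    #include <vector>

    using ScoresPerClass = std::vector<std::vector<std::pair<float, int>>>;

    void gather_confidences(const float* conf, int num_priors, int num_classes,
                            float threshold, ScoresPerClass& label_to_scores)
    {
        label_to_scores.clear();
        label_to_scores.resize(num_classes);
        for (int prior = 0; prior < num_priors; ++prior)
            for (int cls = 0; cls < num_classes; ++cls)
            {
                const float score = conf[prior * num_classes + cls];
                if (score > threshold)
                    label_to_scores[cls].emplace_back(score, prior);
            }
    }

    int main()
    {
        const float conf[] = { 0.1f, 0.9f,   // prior 0: score per class
                               0.6f, 0.2f }; // prior 1
        ScoresPerClass per_class;
        gather_confidences(conf, 2, 2, 0.5f, per_class);
        // per_class[0] holds (0.6, prior 1); per_class[1] holds (0.9, prior 0)
        return 0;
    }
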
-        extract_confidences_per_image(instance, confidences, num_of_priors);
-    }
-
-    event_impl::ptr execute_impl(const std::vector& events, detection_output_inst& instance) override
+public:
+    static primitive_impl* create(const detection_output_sort_node& arg)
     {
-        for (auto& a : events)
-        {
-            a->wait();
-        }
-
-        auto ev = instance.get_network().get_engine().create_user_event(false);
-
-        const int num_of_images = instance.location_memory().get_layout().size.batch[0]; //batch size
+        auto detect_out_params = get_default_params(arg);
+        auto detect_out_optional_params = get_default_optional_params(arg.get_program());
 
-        std::vector>> bboxes(num_of_images); // Per image : label -> decoded bounding boxes.
-        std::vector>>> confidences(num_of_images); // Per image : class -> confidences per bounding box.
+        setDetectOutSpecificParams(detect_out_params.detectOutParams, arg);
 
-        if (instance.location_memory().get_layout().data_type == data_types::f32)
-        {
-            prepare_data::type>(instance, bboxes, confidences);
-
-            generate_detections::type>(instance, num_of_images, bboxes, confidences);
-        }
-        else
-        {
-            prepare_data::type>(instance, bboxes, confidences);
+        auto& kernel_selector = kernel_selector::detection_output_sort_kernel_selector::Instance();
+        auto best_kernels = kernel_selector.GetBestKernels(detect_out_params, detect_out_optional_params);
 
-            generate_detections::type>(instance, num_of_images, bboxes, confidences);
-        }
+        CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with this arguments");
 
-        dynamic_cast(ev.get())->set(); // set as complete
-        // TODO: consider refactoring create_user_event() to return cldnn::user_event*
-        return ev;
-    }
+        auto detect_out = new detection_output_sort_gpu(arg, best_kernels[0]);
 
-    static primitive_impl* create(const detection_output_node& arg)
-    {
-        return new detection_output_gpu(arg);
+        return detect_out;
     }
 };
 
+primitive_impl* runDetectOutSortGpu(const detection_output_sort_node& arg, kernel_selector::KernelData kernel)
+{
+    return new detection_output_sort_gpu(arg, kernel);
+}
+
 namespace {
     struct attach {
-        attach()
-        {
+        attach() {
             implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), detection_output_gpu::create);
             implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), detection_output_gpu::create);
+            implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), detection_output_sort_gpu::create);
+            implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), detection_output_sort_gpu::create);
         }
-        ~attach() {}
     };
     attach attach_impl;
 }
+}}
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/eltwise_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/eltwise_gpu.cpp
index 4833983..5219fe9 100644
--- a/inference-engine/thirdparty/clDNN/src/gpu/eltwise_gpu.cpp
+++ b/inference-engine/thirdparty/clDNN/src/gpu/eltwise_gpu.cpp
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2019 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
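
The attach structs used throughout these files register factory functions into a static table keyed by (engine, data type, format) during static initialization. A self-contained sketch of the pattern with stand-in types (not the real implementation_map):

    #include <map>
    #include <tuple>

    enum class engine_t { ocl };
    enum class dtype_t  { f16, f32 };
    enum class format_t { bfyx };

    struct node;            // stand-in for the clDNN program node
    struct primitive_impl;  // stand-in for the produced implementation
    using factory_t = primitive_impl* (*)(const node&);
    using key_t = std::tuple<engine_t, dtype_t, format_t>;

    // Meyers singleton so the table exists before any attach object runs.
    std::map<key_t, factory_t>& impl_map()
    {
        static std::map<key_t, factory_t> m;
        return m;
    }

    namespace {
        primitive_impl* create_stub(const node&) { return nullptr; } // placeholder factory

        struct attach
        {
            attach()
            {
                impl_map().emplace(std::make_tuple(engine_t::ocl, dtype_t::f32, format_t::bfyx), create_stub);
                impl_map().emplace(std::make_tuple(engine_t::ocl, dtype_t::f16, format_t::bfyx), create_stub);
            }
        };
        attach attach_impl; // registrations happen during static initialization
    }

    int main() { return impl_map().size() == 2 ? 0 : 1; }
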
@@ -26,7 +30,7 @@ namespace cldnn { namespace gpu { namespace { - inline kernel_selector::eltwise_mode convect_to_eltwise_mode(eltwise_mode mode) + inline kernel_selector::eltwise_mode convert_to_eltwise_mode(eltwise_mode mode) { switch (mode) { @@ -38,6 +42,16 @@ namespace case eltwise_mode::min: return kernel_selector::eltwise_mode::MIN; case eltwise_mode::pow: return kernel_selector::eltwise_mode::POW; case eltwise_mode::mod: return kernel_selector::eltwise_mode::MODULU; + case eltwise_mode::eq: return kernel_selector::eltwise_mode::EQ; + case eltwise_mode::ne: return kernel_selector::eltwise_mode::NE; + case eltwise_mode::lt: return kernel_selector::eltwise_mode::LT; + case eltwise_mode::le: return kernel_selector::eltwise_mode::LE; + case eltwise_mode::gt: return kernel_selector::eltwise_mode::GT; + case eltwise_mode::ge: return kernel_selector::eltwise_mode::GE; + case eltwise_mode::logic_and: return kernel_selector::eltwise_mode::LOGIC_AND; + case eltwise_mode::logic_or: return kernel_selector::eltwise_mode::LOGIC_OR; + case eltwise_mode::logic_xor: return kernel_selector::eltwise_mode::LOGIC_XOR; + case eltwise_mode::squared_diff: return kernel_selector::eltwise_mode::SQUARED_DIFF; default: return kernel_selector::eltwise_mode::ADD; } @@ -58,8 +72,8 @@ protected: } public: - static primitive_impl* create(const eltwise_node& arg) - { + static primitive_impl* create(const eltwise_node& arg) + { auto ew_params = get_default_params(arg); auto ew_optional_params = get_default_optional_params(arg.get_program()); @@ -70,17 +84,17 @@ public: const auto& primitive = arg.get_primitive(); if(primitive->with_activation) - convert_activation_func_params(primitive, ew_params); + convert_activation_func_params(primitive, ew_params.activation); - ew_params.operations.push_back({ + ew_params.operations.push_back({ { kernel_selector::eltwise_params::InputType::Buffer(0), kernel_selector::eltwise_params::InputType::Buffer(1) }, - convect_to_eltwise_mode(primitive->mode) }); + convert_to_eltwise_mode(primitive->mode) }); for (uint32_t i = 2; i < static_cast(arg.inputs_count()); i++) { ew_params.operations.push_back({{ kernel_selector::eltwise_params::InputType::Intermediate(i-2), kernel_selector::eltwise_params::InputType::Buffer(i) }, - convect_to_eltwise_mode(primitive->mode) }); + convert_to_eltwise_mode(primitive->mode) }); } if (primitive->mode == eltwise_mode::sum) @@ -91,7 +105,53 @@ public: for (size_t i = 0; i < ew_params.inputs.size(); i++) { if (!ew_params.inputs[i].SameDims(ew_params.output)) - ew_params.layoutBased = true; + { + std::vector input_size = arg.input(i).get_output_layout().size.raw.vector(); + std::vector output_size = arg.get_output_layout().size.raw.vector(); + bool broadcast = false; + for (size_t d = 0; d < output_size.size(); d++) + { + if (output_size[d] != 1 || input_size[d] == 1) + broadcast = true; + } + if (broadcast) + { + ew_params.broadcast = true; + break; + } + else + { + ew_params.layoutBased = true; + break; + } + + } + } + + // stride + if (!primitive->stride.empty()) + { + const auto& stride = primitive->stride; + ew_params.stride.resize(stride.size()); + for (size_t i = 0; i < primitive->stride.size(); i++) + { + ew_params.stride[i] = { (uint32_t)stride[i].spatial[0], (uint32_t)stride[i].spatial[1] }; + } + } + + // check if strides are the same + if(!ew_params.stride.empty()) + { + const auto& stride = ew_params.stride[0]; + for (size_t i = 1; i < ew_params.stride.size(); i++) + { + if (stride.x != ew_params.stride[i].x || stride.y != 
ew_params.stride[i].y) + ew_params.layoutBased = true; + } + } + else if (!ew_params.inputs[0].SameDimsSizes(ew_params.inputs[1])) + { + ew_params.broadcast = true; } if (primitive->output_calibration_factors.size() > 0 || primitive->output_quantization_factor != 1.0f) @@ -139,7 +199,9 @@ namespace { { std::make_tuple(engine_types::ocl, data_types::i64, format::byxf), eltwise_gpu::create }, // MMAD { std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), eltwise_gpu::create }, - { std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), eltwise_gpu::create } + { std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), eltwise_gpu::create }, + { std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), eltwise_gpu::create }, + { std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), eltwise_gpu::create } }); } ~attach() {} diff --git a/inference-engine/thirdparty/clDNN/src/gpu/engine_info.cpp b/inference-engine/thirdparty/clDNN/src/gpu/engine_info.cpp index 73d20d6..e693bef 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/engine_info.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/engine_info.cpp @@ -18,6 +18,10 @@ #include #include #include +#include +#include +#include +#include "istreamwrapper.h" #ifdef _WIN32 #define WIN32_LEAN_AND_MEAN @@ -25,10 +29,18 @@ #include #include #include -#elif defined(__linux__) -#include +#else +#include +#include +#include +#include #endif + +#include #include +#include + + namespace cldnn { namespace gpu{ namespace { @@ -118,40 +130,55 @@ std::string to_string_hex(int val) return std::string("0x") + &buf[i]; } -struct device_info -{ - engine_info_internal::models model; - engine_info_internal::architectures arch; - engine_info_internal::configurations config; - std::string code; -}; - #include "mode.inc" -const device_info& get_device_info(int device_id) -{ -#define GEN_DEVICE(code, dev_id, model, arch, conf) { dev_id, {engine_info_internal::model, engine_info_internal::arch, engine_info_internal::conf, #code} }, - static const std::unordered_map device_map +std::shared_ptr get_cache_from_file(uint32_t compute_units_count, const gpu_toolkit& context) { + std::string tuning_cache_path = context.get_configuration().tuning_cache_path; + if (tuning_cache_path.compare("cache.json") == 0) { -#include "gpu_devices.inc" - }; -#undef GEN_DEVICE - - auto it = device_map.find(device_id); - if (it == device_map.end()) +#ifdef _WIN32 + char path[MAX_PATH]; + HMODULE hm = NULL; + GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | + GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, + (LPCSTR)&get_cache_from_file, &hm); + GetModuleFileName(hm, path, sizeof(path)); + std::string bin_path(path); + tuning_cache_path = bin_path.substr(0, bin_path.find_last_of("\\")) + "\\cache.json"; +#else + Dl_info dl_info; + dladdr((void*)device_info_failed_msg, &dl_info); + std::string bin_path(dl_info.dli_fname); + tuning_cache_path = bin_path.substr(0, bin_path.find_last_of("/")) + "/cache.json"; +#endif + } + rapidjson::Document cacheFile; + rapidjson::Document cacheDeviceData; + auto computeUnits = std::to_string(compute_units_count); + std::ifstream f(tuning_cache_path); + if (f.good()) { - if (public_caps) + rapidjson::IStreamWrapper isw{ f }; + cacheFile.ParseStream(isw); + auto errorCode = cacheFile.GetParseError(); + if (!cacheFile.HasMember(computeUnits.c_str()) && errorCode == 0) + { + computeUnits = "24"; + } + if (cacheFile.HasMember(computeUnits.c_str()) && 
errorCode == 0) { - throw std::runtime_error(std::string(device_info_failed_msg) + " - unsupported device id: " + to_string_hex(device_id) + ". Note: HD5xx+ devices are supported"); + cacheDeviceData.CopyFrom(cacheFile[computeUnits.c_str()], cacheDeviceData.GetAllocator()); } else { - std::cerr << "[WARNING]. Device ID (" << to_string_hex(device_id) << ") not supported. Pretending to behave like SKL GT2." << std::endl; - int new_device_id = 6433; - return device_map.at(new_device_id); + cacheDeviceData.Parse("{}"); } } - return device_map.at(device_id); + else + { + cacheDeviceData.Parse("{}"); + } + return std::make_shared < rapidjson::Document>(std::move(cacheDeviceData)); } } // namespace @@ -160,13 +187,17 @@ engine_info_internal::engine_info_internal(const gpu_toolkit& context) { auto device_id = get_gpu_device_id(); if (0 == device_id) throw std::runtime_error(device_info_failed_msg); - auto& dev_info = get_device_info(device_id); - model = dev_info.model; - architecture = dev_info.arch; - configuration = dev_info.config; dev_id = to_string_hex(device_id); driver_version = context.device().getInfo(); + compute_units_count = context.device().getInfo(); + try { + device_cache = get_cache_from_file(compute_units_count, context); + } + catch (...){ + std::cout << "[WARNING] error during parsing cache file, tuning data won't be used" << std::endl; + device_cache->Parse("{}"); + } cores_count = static_cast(context.device().getInfo()); core_frequency = static_cast(context.device().getInfo()); diff --git a/inference-engine/thirdparty/clDNN/src/gpu/engine_info.h b/inference-engine/thirdparty/clDNN/src/gpu/engine_info.h index 4ad7d46..384eae8 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/engine_info.h +++ b/inference-engine/thirdparty/clDNN/src/gpu/engine_info.h @@ -15,69 +15,22 @@ */ #pragma once #include +#include #include "api/CPP/engine.hpp" +#include "document.h" -namespace cldnn { namespace gpu { + +namespace cldnn { + namespace gpu { class gpu_toolkit; struct engine_info_internal : cldnn::engine_info { - #ifdef GPU_CONFIGURATION - #undef GPU_CONFIGURATION - #endif - #ifdef GPU_MODEL - #undef GPU_MODEL - #endif - #ifdef GPU_ARCHITECTURE - #undef GPU_ARCHITECTURE - #endif - - - enum configurations - { - #define GPU_CONFIGURATION(enum_name, enum_value) enum_name = enum_value, - #define GPU_MODEL(enum_name, enum_value) - #define GPU_ARCHITECTURE(enum_name, enum_value) - #include "gpu_enums.inc" - #undef GPU_CONFIGURATION - #undef GPU_MODEL - #undef GPU_ARCHITECTURE - }; - - - - enum models - { - #define GPU_CONFIGURATION(enum_name, enum_value) - #define GPU_MODEL(enum_name, enum_value) enum_name = enum_value, - #define GPU_ARCHITECTURE(enum_name, enum_value) - #include "gpu_enums.inc" - #undef GPU_CONFIGURATION - #undef GPU_MODEL - #undef GPU_ARCHITECTURE - }; - - - - enum architectures - { - #define GPU_CONFIGURATION(enum_name, enum_value) - #define GPU_MODEL(enum_name, enum_value) - #define GPU_ARCHITECTURE(enum_name, enum_value) enum_name = enum_value, - #include "gpu_enums.inc" - #undef GPU_CONFIGURATION - #undef GPU_MODEL - #undef GPU_ARCHITECTURE - }; - - #undef GPU_CONFIGURATION - - - configurations configuration; - models model; - architectures architecture; std::string dev_id; std::string driver_version; + std::uint32_t compute_units_count; + std::shared_ptr device_cache; + private: friend class gpu_toolkit; explicit engine_info_internal(const gpu_toolkit& context); diff --git a/inference-engine/thirdparty/clDNN/src/gpu/events_pool.h 
b/inference-engine/thirdparty/clDNN/src/gpu/events_pool.h new file mode 100644 index 0000000..11a0e37 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/gpu/events_pool.h @@ -0,0 +1,139 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#include "refcounted_obj.h" +#include "event_impl.h" +#include "meta_utils.h" +#include + +namespace cldnn { + namespace gpu { + + class gpu_toolkit; + + template::value>::type> + class event_pool_impl + { + protected: + event_pool_impl() = default; + + using type = Type; + + event_impl::ptr get_from_pool(std::shared_ptr& ctx) + { + for (auto& ev : _events) + { + if (!ev->is_valid()) + return ev; + } + return allocate({ new Type(ctx), false }); + } + + void reset_events() + { + for (auto& ev : _events) + ev->reset(); + } + + private: + std::vector _events; + + event_impl::ptr allocate(const event_impl::ptr& obj) + { + _events.emplace_back(obj); + return _events.back(); + } + }; + + struct base_event_pool : event_pool_impl + { + event_impl::ptr get(std::shared_ptr& ctx, const cl::Event& ev, const uint64_t q_stamp) + { + auto ret = get_from_pool(ctx); + dynamic_cast(ret.get())->attach_ocl_event(ev, q_stamp); + return ret; + } + void reset() + { + reset_events(); + } + }; + + struct user_event_pool : event_pool_impl + { + event_impl::ptr get(std::shared_ptr& ctx, bool set = false) + { + auto ret = get_from_pool(ctx); + dynamic_cast(ret.get())->attach_event(set); + return ret; + } + void reset() + { + reset_events(); + } + }; + + struct group_event_pool : event_pool_impl + { + event_impl::ptr get(std::shared_ptr& ctx, const std::vector& deps) + { + auto ret_ev = get_from_pool(ctx); + dynamic_cast(ret_ev.get())->attach_events(deps); + return ret_ev; + } + void reset() + { + reset_events(); + } + }; + + class events_pool + { + public: + events_pool() = default; + + event_impl::ptr get_from_base_pool(std::shared_ptr ctx, const cl::Event& ev, const uint64_t q_stamp) + { + return _base_pool.get(ctx, ev, q_stamp); + } + + event_impl::ptr get_from_user_pool(std::shared_ptr ctx, bool set = false) + { + return _user_pool.get(ctx, set); + } + + event_impl::ptr get_from_group_pool(std::shared_ptr ctx, const std::vector& deps) + { + return _group_pool.get(ctx, deps); + } + + void reset_events() + { + _base_pool.reset(); + _user_pool.reset(); + _group_pool.reset(); + } + + private: + base_event_pool _base_pool; + user_event_pool _user_pool; + group_event_pool _group_pool; + }; + } +} diff --git a/inference-engine/thirdparty/clDNN/src/gpu/events_waiter.h b/inference-engine/thirdparty/clDNN/src/gpu/events_waiter.h index ca3a8ac..d16b56e 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/events_waiter.h +++ b/inference-engine/thirdparty/clDNN/src/gpu/events_waiter.h @@ -16,9 +16,8 @@ /////////////////////////////////////////////////////////////////////////////////////////////////// #pragma once -#include 
"api/CPP/profiling.hpp" -#include "ocl_user_event.h" #include "ocl_toolkit.h" +#include "event_impl.h" namespace cldnn { namespace gpu { class events_waiter : public context_holder @@ -29,12 +28,6 @@ public: event_impl::ptr run(const std::vector& dependencies) { - if (dependencies.size() == 0) - { - auto ev = new gpu::user_event(context(), true); - return{ ev, false }; - } - if (dependencies.size() == 1) return dependencies[0]; diff --git a/inference-engine/thirdparty/clDNN/src/gpu/fully_connected_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/fully_connected_gpu.cpp index 517a732..6494601 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/fully_connected_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/fully_connected_gpu.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -92,7 +92,7 @@ public: fc_optional_params.allowInputReordering = true; if(arg.get_primitive()->with_activation) - convert_activation_func_params(arg.get_primitive(), fc_params); + convert_activation_func_params(arg.get_primitive(), fc_params.activation); fc_params.output = fc_params.output.FlattenFeatureAndSpatials(); @@ -154,6 +154,9 @@ namespace { // MMAD { std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), val_fw }, { std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), val_fw }, + // IMAD + { std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), val_fw }, + { std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), val_fw }, }); } ~attach() {} diff --git a/inference-engine/thirdparty/clDNN/src/gpu/fully_connected_grad_weights_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/fully_connected_grad_weights_gpu.cpp index fb22ef0..89a32e1 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/fully_connected_grad_weights_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/fully_connected_grad_weights_gpu.cpp @@ -33,9 +33,9 @@ struct fully_connected_grad_weights_gpu : typed_primitive_gpu_impl& instance) const override + virtual bool validate_impl(const typed_primitive_inst& instance) const override { - bool res = parent::validate(instance); + bool res = true; if (instance.use_momentum()) { diff --git a/inference-engine/thirdparty/clDNN/src/gpu/fused_conv_bn_scale_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/fused_conv_bn_scale_gpu.cpp new file mode 100644 index 0000000..f51ae8e --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/gpu/fused_conv_bn_scale_gpu.cpp @@ -0,0 +1,166 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "fused_conv_bn_scale_inst.h" +#include "primitive_gpu_base.h" +#include "implementation_map.h" +#include "error_handler.h" +#include "kernel_selector_helper.h" +#include "kernel_runner.h" +#include "fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.h" +#include "fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.h" + +namespace cldnn { namespace gpu { + +struct fused_conv_bn_scale_gpu : typed_primitive_gpu_impl +{ + using parent = typed_primitive_gpu_impl; + using parent::parent; + +protected: + + virtual bool validate_impl(const typed_primitive_inst& instance) const override + { + bool res = true; + + // Check whether all memory elements use the same unit type (FP16 or FP32). + CLDNN_ERROR_DATA_TYPES_MISMATCH(_outer.id(), "Input memory", instance.node.input().get_output_layout().data_type, "output memory", instance.node.get_output_layout().data_type, ""); + CLDNN_ERROR_DATA_TYPES_MISMATCH(_outer.id(), "Input memory", instance.node.input().get_output_layout().data_type, "filter memory", instance.weights_memory(0).get_layout().data_type, ""); + + return res; + } + + virtual kernel::kernel_arguments_data get_arguments(typed_primitive_inst& instance, int32_t split) const override + { + kernel::kernel_arguments_data args = parent::get_arguments(instance, split); + auto desc = std::static_pointer_cast(instance.desc()); + + args.weights = &instance.weights_memory(split); + args.bias = instance.bias_term() ? &instance.bias_memory(split) : nullptr; + + if (!desc->scale_bias.empty()) + { + if (instance.is_fused_in_training()) + { + args.inputs.push_back(&instance.dep_memory(instance.dependencies().size() - 4)); + args.inputs.push_back(&instance.dep_memory(instance.dependencies().size() - 3)); + args.inputs.push_back(&instance.dep_memory(instance.dependencies().size() - 2)); + args.inputs.push_back(&instance.dep_memory(instance.dependencies().size() - 1)); + } + else + { + args.inputs.push_back(&instance.dep_memory(instance.dependencies().size() - 1)); + } + } + else if (instance.is_fused_in_training()) + { + args.inputs.push_back(&instance.dep_memory(instance.dependencies().size() - 3)); + args.inputs.push_back(&instance.dep_memory(instance.dependencies().size() - 2)); + args.inputs.push_back(&instance.dep_memory(instance.dependencies().size() - 1)); + } + + return args; + } + + virtual int32_t get_split() const override + { + return _outer.get_split(); + } + +public: + + static primitive_impl* create(const fused_conv_bn_scale_node &arg) + { + const auto& primitive = arg.get_primitive(); + const auto& input_layout = arg.input().get_output_layout(); + const auto& weights_layout = arg.weights(0).get_output_layout(); + const auto& weights_size = weights_layout.size; + + const auto& split = primitive->split(); + const auto& stride = primitive->stride; + const auto& input_offset = primitive->input_offset; + const auto& dilation = primitive->dilation; + + assert(arg.get_output_layout().size.feature[0] / primitive->split() == weights_layout.size.batch[0]); + + auto fuse_params = get_weights_bias_default_params(arg, split); + auto fuse_optional_params = get_default_weights_bias_optional_params(arg.get_program()); + + const auto additional_offset = tensor::max(input_offset, 0); + if (additional_offset != 0) + { + fuse_params.inputs[0] = convert_data_tensor(input_layout, split, additional_offset); + } + + fuse_params.epsilon = arg.get_primitive()->epsilon; + + fuse_params.fused_in_training = arg.is_fused_in_training(); + fuse_params.scale_bias = arg.scale_bias_term(); + + 
if(primitive->with_activation) + convert_activation_func_params(primitive, fuse_params.activation); + + fuse_params.split = split; + fuse_params.filterSize = { + (uint32_t)weights_size.spatial[0], + (uint32_t)weights_size.spatial[1], + }; + + fuse_params.padding = { + (uint32_t)std::max(-input_offset.spatial[0], 0), + (uint32_t)std::max(-input_offset.spatial[1], 0) + }; + + fuse_params.stride = { + (uint32_t)stride.spatial[0], + (uint32_t)stride.spatial[1] + }; + + fuse_params.dilation = { + (uint32_t)dilation.spatial[0], + (uint32_t)dilation.spatial[1] + }; + + auto& kernel_selector = kernel_selector::fused_conv_bn_scale_kernel_selector::Instance(); + + const auto& tuning_config = arg.get_program().get_options().get(); + + if (tuning_config->config.mode == tuning_mode::tuning_tune_and_cache) + { + fuse_optional_params.tuningParams.runner = std::make_shared(arg.get_program().get_engine(), true); + } + + kernel_selector::KernelsData best_kernels = kernel_selector.GetBestKernels(fuse_params, fuse_optional_params); + + CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with this arguments"); + + auto fuse = new fused_conv_bn_scale_gpu(arg, best_kernels[0]); + + return fuse; + } +}; + +namespace{ + struct attach { + attach() { + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), fused_conv_bn_scale_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), fused_conv_bn_scale_gpu::create); + } + ~attach() {} + }; + attach attach_impl; +} +} } diff --git a/inference-engine/thirdparty/clDNN/src/gpu/fused_conv_eltwise_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/fused_conv_eltwise_gpu.cpp new file mode 100644 index 0000000..ea619ba --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/gpu/fused_conv_eltwise_gpu.cpp @@ -0,0 +1,214 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "fused_conv_eltwise_inst.h" +#include "primitive_gpu_base.h" +#include "implementation_map.h" +#include "error_handler.h" +#include "kernel_selector_helper.h" +#include "kernel_runner.h" +#include "fused_conv_eltwise/fused_conv_eltwise_kernel_selector.h" +#include "fused_conv_eltwise/fused_conv_eltwise_kernel_base.h" + +namespace cldnn { namespace gpu { + +struct fused_conv_eltwise_gpu : typed_primitive_gpu_impl +{ + using parent = typed_primitive_gpu_impl; + using parent::parent; + +protected: + + virtual bool validate_impl(const typed_primitive_inst& instance) const override + { + bool res = true; + + auto outer_id = _outer.id(); + auto data_type = instance.node.input().get_output_layout().data_type; + + // Check whether all memory elements use the same unit type (FP16 or FP32). 
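
Both fused kernels here convert clDNN's signed input_offset into the non-negative padding the kernel selector expects: a negative offset of -n along an axis means n padded border pixels. A tiny sketch of that conversion (illustrative names):

    #include <algorithm>
    #include <cstdint>

    struct pad2d { uint32_t x, y; };

    // A negative offset means the filter window starts outside the input,
    // i.e. that many padded pixels on each border; positive offsets need no pad.
    pad2d offset_to_padding(int offset_x, int offset_y)
    {
        return { static_cast<uint32_t>(std::max(-offset_x, 0)),
                 static_cast<uint32_t>(std::max(-offset_y, 0)) };
    }

    int main()
    {
        const pad2d p = offset_to_padding(-1, -2); // 1 pixel of x-pad, 2 of y-pad
        return (p.x == 1 && p.y == 2) ? 0 : 1;
    }
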
+ CLDNN_ERROR_DATA_TYPES_MISMATCH(outer_id, "Input memory", data_type, "output memory", instance.node.get_output_layout().data_type, ""); + CLDNN_ERROR_DATA_TYPES_MISMATCH(outer_id, "Input memory", data_type, "filter memory", instance.weights_memory(0).get_layout().data_type, ""); + + return res; + } + + virtual kernel::kernel_arguments_data get_arguments(typed_primitive_inst& instance, int32_t split) const override + { + kernel::kernel_arguments_data args = parent::get_arguments(instance, split); + + args.weights = &instance.weights_memory(split); + args.bias = instance.bias_term() ? &instance.bias_memory(split) : nullptr; + args.weights_quantization_factors = instance.weights_quantization_factors_term() ? &instance.weights_quantization_factors_memory(split) : nullptr; + args.output_calibration_factors = instance.conv_output_calibration_factors_term() ? &instance.output_calibration_factors_memory(split) : nullptr; + if (instance.eltw_output_calibration_factors_term()) + args.fused_op_calibration_factors.push_back(&instance.eltw_output_calibration_factors_memory()); + return args; + } + + virtual int32_t get_split() const override + { + return _outer.get_split(); + } + +public: + + static primitive_impl* create(const fused_conv_eltwise_node &arg) + { + const auto& primitive = arg.get_primitive(); + const auto& input_layout = arg.input().get_output_layout(); + const auto& weights_layout = arg.weights(0).get_output_layout(); + const auto& weights_size = weights_layout.size; + + const auto& split = primitive->split(); + const auto& stride = primitive->conv.stride; + const auto& dilation = primitive->conv.dilation; + const auto& input_offset = primitive->conv.input_offset; + + const auto depthwise_separable_opt = arg.get_depthwise_sep_opt(); + const auto actual_split = depthwise_separable_opt ? 
(decltype(split))1 : split; + + const auto transposed = arg.get_transposed(); + + assert(arg.get_output_layout().size.feature[0] / primitive->split() == weights_layout.size.batch[0]); + + // conv params + auto fused_params = get_weights_bias_default_params(arg, actual_split); + // add second input for eltwise + if (!static_cast(arg.get_primitive().get())->second_input_in_output) + { + fused_params.inputs.push_back(convert_data_tensor(arg.input(1).get_output_layout())); + } + + auto& conv_params = fused_params.conv; + auto& eltw_params = fused_params.eltw; + + auto conv_optional_params = get_default_weights_bias_optional_params(arg.get_program()); + + const auto additional_offset = tensor::max(input_offset, 0); + if (additional_offset != 0) + { + fused_params.inputs[0] = convert_data_tensor(input_layout, actual_split, additional_offset); + } + + if (primitive->conv.with_activation) + { + convert_activation_func_params(&primitive->conv, fused_params.activation); + } + if (primitive->eltw.with_activation) + { + convert_activation_func_params(&primitive->eltw, fused_params.eltw.activation); + } + + fused_params.conv.depthwise_separable_opt = depthwise_separable_opt; + fused_params.conv.transposed = transposed; + + fused_params.second_input_in_output = primitive->second_input_in_output; + + conv_params.local_convolution = weights_size.local[0] > 1 || weights_size.local[1] > 1; + conv_params.split = split; + conv_params.filterSize = { + (uint32_t)weights_size.spatial[0], + (uint32_t)weights_size.spatial[1], + }; + + conv_params.padding = { + (uint32_t)std::max(-input_offset.spatial[0], 0), + (uint32_t)std::max(-input_offset.spatial[1], 0) + }; + + conv_params.stride = { + (uint32_t)stride.spatial[0], + (uint32_t)stride.spatial[1] + }; + conv_params.dilation = { + (uint32_t)dilation.spatial[0], + (uint32_t)dilation.spatial[1] + }; + + if (primitive->conv.weights_quantization_factors.size() > 0) + { + conv_params.int8_quantization = true; + conv_params.weights_quantization_factors.push_back(convert_data_tensor(arg.weights_quantization_factors().get_output_layout()).FlattenFeatureAndSpatials()); + conv_params.input_quantization_factor = arg.get_conv_input_qf(); + + if (primitive->conv.output_calibration_factors.size() > 0) + { + conv_params.output_calibration = true; + conv_params.output_calibration_factors.push_back(convert_data_tensor(arg.conv_output_calibration_factors().get_output_layout()).FlattenFeatureAndSpatials()); + } + else + conv_params.output_quantization_factor = arg.get_conv_output_qf(); + } + + // eltw params + if (primitive->eltw.output_calibration_factors.size() > 0 || primitive->eltw.output_quantization_factor != 1.0f) + { + eltw_params.int8_quantization = true; + + if (primitive->eltw.output_calibration_factors.size() > 0) + { + eltw_params.output_calibration = true; + eltw_params.output_calibration_factors.push_back(convert_data_tensor(arg.eltw_output_calibration_factors().get_output_layout()).FlattenFeatureAndSpatials()); + } + else + eltw_params.output_quantization_factor = arg.get_eltw_output_qf(); + } + + // stride + if (!primitive->eltw.stride.empty()) + { + const auto& eltw_stride = primitive->eltw.stride; + eltw_params.stride.resize(eltw_stride.size()); + for (size_t i = 0; i < primitive->eltw.stride.size(); i++) + { + eltw_params.stride[i] = { (uint32_t)eltw_stride[i].spatial[0], (uint32_t)eltw_stride[i].spatial[1] }; + } + } + + auto& kernel_selector = kernel_selector::fused_conv_eltwise_kernel_selector::Instance(); + + const auto& tuning_config = 
arg.get_program().get_options().get(); + + if (tuning_config->config.mode == tuning_mode::tuning_tune_and_cache) + { + conv_optional_params.tuningParams.runner = std::make_shared(arg.get_program().get_engine(), true); + } + + kernel_selector::KernelsData best_kernels = kernel_selector.GetBestKernels(fused_params, conv_optional_params); + + CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with this arguments"); + + auto conv = new fused_conv_eltwise_gpu(arg, best_kernels[0]); + + return conv; + } +}; + +namespace{ + struct attach { + attach() { + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), fused_conv_eltwise_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::yxfb), fused_conv_eltwise_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), fused_conv_eltwise_gpu::create); + // MMAD + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), fused_conv_eltwise_gpu::create); + } + ~attach() {} + }; + attach attach_impl; +} +} } diff --git a/inference-engine/thirdparty/clDNN/src/gpu/gather_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/gather_gpu.cpp new file mode 100644 index 0000000..776246f --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/gpu/gather_gpu.cpp @@ -0,0 +1,86 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
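
For orientation before the implementation below: the gather primitive copies input slices selected by an index tensor along one axis (X, Y, feature, or batch after convert_axis). A 1-D sketch of the semantics only, not the clDNN kernel:

    #include <cstdio>
    #include <vector>

    std::vector<float> gather_rows(const std::vector<float>& input, int row_len,
                                   const std::vector<int>& indices)
    {
        std::vector<float> out;
        out.reserve(indices.size() * row_len);
        for (int idx : indices)
            for (int c = 0; c < row_len; ++c)
                out.push_back(input[idx * row_len + c]); // copy the selected row
        return out;
    }

    int main()
    {
        const std::vector<float> input = { 0, 1, 2, 3, 4, 5 }; // 3 rows of 2
        const std::vector<int> indices = { 2, 0 };
        for (float v : gather_rows(input, 2, indices))
            std::printf("%g ", v); // prints: 4 5 0 1
        return 0;
    }
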
+*/ + +#include "gather_inst.h" +#include "primitive_gpu_base.h" +#include "implementation_map.h" +#include "kernel_selector_helper.h" +#include "gather/gather_kernel_selector.h" +#include "gather/gather_kernel_ref.h" +#include "error_handler.h" + +using namespace cldnn; + +namespace cldnn +{ +namespace gpu +{ + kernel_selector::gather_axis convert_axis(gather::gather_axis axis) + { + switch (axis) + { + case gather::along_x: return kernel_selector::gather_axis::X; + case gather::along_y: return kernel_selector::gather_axis::Y; + case gather::along_f: return kernel_selector::gather_axis::FEATURE; + case gather::along_b: return kernel_selector::gather_axis::BATCH; + default: + return kernel_selector::gather_axis::X; + } + } + + struct gather_gpu : typed_primitive_gpu_impl + { + using parent = typed_primitive_gpu_impl; + using parent::parent; + + public: + + static primitive_impl* create(const gather_node& arg) + { + auto gather_params = get_default_params(arg); + auto gather_optional_params = get_default_optional_params(arg.get_program()); + + gather_params.axis = convert_axis(arg.get_primitive()->axis); + + gather_params.inputs.push_back(convert_data_tensor(arg.input(1).get_output_layout())); + + auto& kernel_selector = kernel_selector::gather_kernel_selector::Instance(); + auto best_kernels = kernel_selector.GetBestKernels(gather_params, gather_optional_params); + + CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with this arguments"); + + auto gather = new gather_gpu(arg, best_kernels[0]); + + return gather; + } + }; + + namespace + { + struct attach + { + attach() + { + auto val_fw = gather_gpu::create; + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw); + } + ~attach() = default; + }; + attach attach_impl; + } +} //namespace cldnn +} //namespace gpu diff --git a/inference-engine/thirdparty/clDNN/src/gpu/index_select_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/index_select_gpu.cpp index 0dab915..41f826a 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/index_select_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/index_select_gpu.cpp @@ -26,17 +26,22 @@ namespace cldnn { namespace gpu { namespace { - inline kernel_selector::IndexSelectAxis convect_to_index_select_axis(index_select_axis_name axis) + inline std::vector convert_to_index_select_axis(std::vector axes) { - switch (axis) + std::vector axes_names = {}; + for (size_t i = 0; i < axes.size(); i++) { - case index_select_axis_name::along_b: return kernel_selector::IndexSelectAxis::BATCH; - case index_select_axis_name::along_f: return kernel_selector::IndexSelectAxis::FEATURE; - case index_select_axis_name::along_x: return kernel_selector::IndexSelectAxis::X; - case index_select_axis_name::along_y: return kernel_selector::IndexSelectAxis::Y; - default: - return kernel_selector::IndexSelectAxis::BATCH; + switch (axes[i]) + { + case index_select_axis_name::along_b: axes_names.push_back(kernel_selector::IndexSelectAxis::BATCH); break; + case index_select_axis_name::along_f: axes_names.push_back(kernel_selector::IndexSelectAxis::FEATURE); break; + case index_select_axis_name::along_x: axes_names.push_back(kernel_selector::IndexSelectAxis::X); break; + case index_select_axis_name::along_y: axes_names.push_back(kernel_selector::IndexSelectAxis::Y); break; + default: + 
axes_names.push_back(kernel_selector::IndexSelectAxis::BATCH); break; + } } + return axes_names; } } @@ -50,8 +55,11 @@ struct index_select_gpu : typed_primitive_gpu_impl auto index_select_params = get_default_params(arg, 1); auto index_select_optional_params = get_default_optional_params(arg.get_program()); - index_select_params.inputs.push_back(convert_data_tensor(arg.indices().get_output_layout())); - index_select_params.axis = convect_to_index_select_axis(arg.get_axis()); + if (!arg.get_reverse()) + index_select_params.inputs.push_back(convert_data_tensor(arg.indices().get_output_layout())); + + index_select_params.axes = convert_to_index_select_axis(arg.get_axes()); + index_select_params.reverse = arg.get_reverse(); auto& kernel_selector = kernel_selector::index_select_kernel_selector::Instance(); auto best_kernels = kernel_selector.GetBestKernels(index_select_params, index_select_optional_params); diff --git a/inference-engine/thirdparty/clDNN/src/gpu/kernel.cpp b/inference-engine/thirdparty/clDNN/src/gpu/kernel.cpp index ad97670..ca7c24d 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/kernel.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/kernel.cpp @@ -115,10 +115,22 @@ namespace { } break; case kernel_selector::kernel_argument_types::OUTPUT_CALIBRATION_FACTORS: - if (data.output_calibration_factors) + if (args[i].index == 0) { - status = kernel.setArg(i, dynamic_cast(*data.output_calibration_factors).get_buffer()); + if (data.output_calibration_factors) + { + status = kernel.setArg(i, dynamic_cast(*data.output_calibration_factors).get_buffer()); + } } + else + { + size_t new_idx = args[i].index - 1; + if (new_idx < data.fused_op_calibration_factors.size() && data.fused_op_calibration_factors[new_idx]) + { + status = kernel.setArg(i, dynamic_cast(*data.fused_op_calibration_factors[new_idx]).get_buffer()); + } + } + break; case kernel_selector::kernel_argument_types::SCALE_TABLE: if (data.scale_table) diff --git a/inference-engine/thirdparty/clDNN/src/gpu/kernel.h b/inference-engine/thirdparty/clDNN/src/gpu/kernel.h index 4088b12..67a5cf8 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/kernel.h +++ b/inference-engine/thirdparty/clDNN/src/gpu/kernel.h @@ -17,7 +17,8 @@ /////////////////////////////////////////////////////////////////////////////////////////////////// #pragma once -#include "memory_gpu.h" +#include "ocl_toolkit.h" +#include "memory_impl.h" #include "kernels_cache.h" #include "event_impl.h" @@ -69,6 +70,8 @@ public: memory_impl::cptr slope; memory_impl::cptr prev_weights_grad; memory_impl::cptr prev_bias_grad; + // used for fused primitives + std::vector fused_op_calibration_factors; int32_t split = 0; float lr; const kernel_selector::kernel_scalar_arguments* scalars = nullptr; diff --git a/inference-engine/thirdparty/clDNN/src/gpu/kernel_runner.h b/inference-engine/thirdparty/clDNN/src/gpu/kernel_runner.h index e3d7375..24fe6db 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/kernel_runner.h +++ b/inference-engine/thirdparty/clDNN/src/gpu/kernel_runner.h @@ -35,7 +35,7 @@ public: private: const int compilation_batch_size = 50; - const int runs_per_kernel = 10; + const int runs_per_kernel = 3; void prepare_kernel_args(const kernel_selector::KernelsData& kernels_data, gpu::kernel::kernel_arguments_data& args); diff --git a/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.cpp b/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.cpp index ed3c6aa..8f33b63 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.cpp +++ 
b/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.cpp @@ -354,3 +354,4 @@ void kernels_cache::build_all() } }} + \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/gpu/lookup_table_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/lookup_table_gpu.cpp index 890a8b6..e6ddab6 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/lookup_table_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/lookup_table_gpu.cpp @@ -33,9 +33,9 @@ namespace cldnn { protected: - virtual bool validate(typed_primitive_inst& instance) const override + virtual bool validate_impl(const typed_primitive_inst& instance) const override { - bool res = parent::validate(instance); + bool res = true; // Check whether all memory elements use the same unit type (FP16 or FP32). CLDNN_ERROR_DATA_TYPES_MISMATCH(_outer.id(), "Input memory", instance.input_memory(1).get_layout().data_type, "output memory", instance.output_memory().get_layout().data_type, ""); diff --git a/inference-engine/thirdparty/clDNN/src/gpu/lstm_elt_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/lstm_elt_gpu.cpp index 69baa64..b9f8eef 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/lstm_elt_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/lstm_elt_gpu.cpp @@ -47,12 +47,6 @@ protected: return args; } - virtual bool validate(typed_primitive_inst& instance) const override - { - bool res = parent::validate(instance); - - return res; - } public: static primitive_impl* create(const lstm_elt_node& arg) @@ -64,11 +58,16 @@ public: { const auto& cell_layout = arg.cell().get_output_layout(); lstm_elt_params.SetCell(convert_data_tensor(cell_layout)); + // TODO: make a generic function to get the direction + if (cell_layout.size.spatial[1] > 1) { + lstm_elt_params.cell_direction = arg.direction(); + } } lstm_elt_params.SetOffsetOrder(arg.offset_order()); lstm_elt_params.clip = arg.clip(); lstm_elt_params.input_forget = arg.input_forget(); + lstm_elt_params.direction = arg.direction(); auto& kernel_selector = kernel_selector::lstm_elt_kernel_selector::Instance(); auto best_kernels = kernel_selector.GetBestKernels(lstm_elt_params, lstm_elt_optional_params); @@ -90,6 +89,8 @@ namespace { implementation_map::add({ { std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw }, { std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw }, + { std::make_tuple(engine_types::ocl, data_types::f32, format::fyxb), val_fw }, + { std::make_tuple(engine_types::ocl, data_types::f16, format::fyxb), val_fw }, }); } ~attach() {} diff --git a/inference-engine/thirdparty/clDNN/src/gpu/lstm_gemm_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/lstm_gemm_gpu.cpp index 7cb6b11..40d601a 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/lstm_gemm_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/lstm_gemm_gpu.cpp @@ -50,12 +50,6 @@ protected: return args; } - virtual bool validate(typed_primitive_inst& instance) const override - { - bool res = parent::validate(instance); - - return res; - } public: static primitive_impl* create(const lstm_gemm_node& arg) @@ -78,8 +72,25 @@ public: const auto& hidden_layout = arg.hidden().get_output_layout(); lstm_gemm_params.SetHidden(convert_data_tensor(hidden_layout)); + // TODO: make a generic function to get the direction + if (hidden_layout.size.spatial[1] > 1) { + lstm_gemm_params.hidden_direction = arg.direction(); + } } lstm_gemm_params.direction = arg.direction(); + + // Update the direction of the input for the gemm kernel + const auto& 
input_layout = arg.input().get_output_layout(); + size_t input_directions = input_layout.size.spatial[1]; + + if (input_directions > 1) // For bidirectional input, input direction can be 1 or 0 + { + lstm_gemm_params.input_direction = arg.direction(); + } + else // For unidirectional input + { + lstm_gemm_params.input_direction = 0; + } auto lstm_gemm_optional_params = get_default_optional_params(arg.get_program()); @@ -103,6 +114,8 @@ namespace { implementation_map::add({ { std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw }, { std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw }, + { std::make_tuple(engine_types::ocl, data_types::f32, format::fyxb), val_fw }, + { std::make_tuple(engine_types::ocl, data_types::f16, format::fyxb), val_fw }, }); } ~attach() {} diff --git a/inference-engine/thirdparty/clDNN/src/gpu/memory_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/memory_gpu.cpp index e497807..c50f631 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/memory_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/memory_gpu.cpp @@ -22,7 +22,7 @@ namespace cldnn { namespace gpu { gpu_buffer::gpu_buffer(const refcounted_obj_ptr& engine, const layout& layout) - : memory_impl(engine, layout) + : memory_impl(engine, layout, false) , _context(engine->get_context()) , _lock_count(0) , _buffer(_context->context(), CL_MEM_READ_WRITE, size()) @@ -34,7 +34,7 @@ gpu_buffer::gpu_buffer(const refcounted_obj_ptr& engine, const layo } gpu_buffer::gpu_buffer(const refcounted_obj_ptr& engine, const layout& new_layout, const cl::Buffer& buffer) - : memory_impl(engine, new_layout) + : memory_impl(engine, new_layout, true) , _context(engine->get_context()) , _lock_count(0) , _buffer(buffer) @@ -67,7 +67,7 @@ void gpu_buffer::fill(unsigned char pattern, event_impl::ptr ev) { } gpu_image2d::gpu_image2d(const refcounted_obj_ptr& engine, const layout& layout) - : memory_impl(engine, layout) + : memory_impl(engine, layout, false) , _context(engine->get_context()) , _lock_count(0) , _mapped_ptr(nullptr) @@ -110,7 +110,7 @@ gpu_image2d::gpu_image2d(const refcounted_obj_ptr& engine, const la } gpu_image2d::gpu_image2d(const refcounted_obj_ptr& engine, const layout& new_layout, const cl::Image2D& buffer) - : memory_impl(engine, new_layout) + : memory_impl(engine, new_layout, true) , _context(engine->get_context()) , _lock_count(0) , _buffer(buffer) diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.h b/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.h index 8e015ab..7c9f820 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.h +++ b/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.h @@ -14,7 +14,11 @@ struct profiling_period_ocl_start_stop struct ocl_base_event : virtual public event_impl { public: - ocl_base_event(uint64_t queue_stamp = 0) : _queue_stamp(queue_stamp) {} + ocl_base_event(uint64_t queue_stamp = 0, bool valid = false) + : _queue_stamp(queue_stamp) + { + _attached = valid; + } uint64_t get_queue_stamp() const { return _queue_stamp; } protected: uint64_t _queue_stamp = 0; @@ -23,20 +27,31 @@ protected: struct base_event : virtual public ocl_base_event { public: - base_event(std::shared_ptr ctx, cl::Event const& ev, uint64_t queue_stamp = 0) : ocl_base_event(queue_stamp), _ctx(ctx), _event(ev) + base_event(std::shared_ptr ctx, cl::Event const& ev, uint64_t queue_stamp = 0) + : ocl_base_event(queue_stamp, true) + , _ctx(ctx) + , _event(ev) + {} + + base_event(std::shared_ptr ctx) + :
ocl_base_event(0, false) + , _ctx(ctx) {} + void attach_ocl_event(const cl::Event& ev, const uint64_t q_stamp) + { + _event = ev; + _queue_stamp = q_stamp; + _attached = true; + } + std::shared_ptr get_context() const { return _ctx; } cl::Event get() { return _event; } - private: std::shared_ptr _ctx; - cl::Event _event; bool _callback_set = false; - void set_ocl_callback(); - static void CL_CALLBACK ocl_event_completion_callback(cl_event, cl_int, void* me); private: @@ -46,26 +61,50 @@ private: bool get_profiling_info_impl(std::list& info) override; friend struct base_events; + +protected: + cl::Event _event; }; struct base_events : virtual public ocl_base_event { public: - base_events(std::shared_ptr ctx, std::vector const &ev) : ocl_base_event(0), _ctx(ctx), _events(ev) + base_events(std::shared_ptr ctx, std::vector const &ev) + : ocl_base_event(0, true) + , _ctx(ctx) + , _events(ev) + { + set_queue_stamp(); + } + + base_events(std::shared_ptr ctx) + : ocl_base_event(0, false) + , _ctx(ctx) + {} + + void attach_events(const std::vector& ev) + { + if (_attached) + throw std::runtime_error("Trying to attach events to a valid event object."); + _events = ev; + _attached = true; + set_queue_stamp(); + } + + std::shared_ptr get_context() const { return _ctx; } + +private: + void set_queue_stamp() { uint64_t _queue_stamp_max = 0; - for (size_t i = 0; i < ev.size(); i++) + for (size_t i = 0; i < _events.size(); i++) { - auto * _base_event = dynamic_cast(ev[i].get()); + auto * _base_event = dynamic_cast(_events[i].get()); if (_base_event->get_queue_stamp() > _queue_stamp_max) _queue_stamp_max = _base_event->get_queue_stamp(); } _queue_stamp = _queue_stamp_max; } - - std::shared_ptr get_context() const { return _ctx; } - -private: void wait_impl() override; bool is_set_impl() override; diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_builder.cpp b/inference-engine/thirdparty/clDNN/src/gpu/ocl_builder.cpp new file mode 100644 index 0000000..46ba2a1 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/gpu/ocl_builder.cpp @@ -0,0 +1,178 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#include "ocl_builder.h" +#include "confiugration.h" + +// NOTE: Due to buggy scope transition of warnings we need to disable warning in place of use/instantiation +// of some types (even though we already disabled them in scope of definition of these types). +// Moreover, this warning is now pretty much only an annoyance: it is generated due to lack +// of proper support for mangling of custom GCC attributes into type name (usually when used +// with templates, even from standard library). +#if defined __GNUC__ && __GNUC__ >= 6 +#pragma GCC diagnostic ignored "-Wignored-attributes" +#endif + +namespace cldnn { namespace gpu { + + ocl_builder::ocl_builder(const configuration& config) + : _is_user_context(config.user_context != nullptr ?
true : false) + { + if (_is_user_context) + { + _context = *config.user_context; + build_device_from_user_context(config); + } + else + { + build_device(config); + build_context(); + } + build_platform_id(); + } + + void ocl_builder::build_device_from_user_context(const configuration& config) + { + auto all_devices = _context.getInfo(); + auto num_devices = _context.getInfo(); + if (num_devices != 1) + { + throw std::runtime_error("[ERROR]. Number of devices from user context is not equal to 1."); + } + auto device = all_devices.at(0); + auto dev_type = device.getInfo(); + if (dev_type != CL_DEVICE_TYPE_GPU) + { + throw std::runtime_error("[ERROR]. User defined device is not a GPU device!"); + } + + std::list reasons; + if (does_device_match_config(config, device, reasons)) + { + _device = device; + return; + } + else + { + std::string error_msg = "No OpenCL device found which would match provided configuration:"; + for (const auto& reason : reasons) + error_msg += "\n " + reason; + throw std::invalid_argument(std::move(error_msg)); + } + + } + + void ocl_builder::build_device(const configuration& config) + { + std::list reasons; + cl_uint n = 0; + + // Get number of platforms available + cl_int err = clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + throw std::runtime_error("clGetPlatformIDs error " + std::to_string(err)); + } + + // Get platform list + std::vector platform_ids(n); + err = clGetPlatformIDs(n, platform_ids.data(), NULL); + if (err != CL_SUCCESS) { + throw std::runtime_error("clGetPlatformIDs error " + std::to_string(err)); + } + + for (auto& id : platform_ids) + { + cl::Platform platform = cl::Platform(id); + std::vector devices; + platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); + for (auto& d : devices) + { + if (does_device_match_config(config, d, reasons)) + { + _device = d; + return; + } + } + } + + if (reasons.empty()) + throw std::runtime_error("Could not find any OpenCL device"); + + std::string error_msg = "No OpenCL device found which would match provided configuration:"; + for (const auto& reason : reasons) + error_msg += "\n " + reason; + + throw std::invalid_argument(std::move(error_msg)); + } + + void ocl_builder::build_context() + { + _context = cl::Context(_device); + } + + bool ocl_builder::does_device_match_config(const configuration& config, const cl::Device& dev, std::list& reasons) + { + auto dev_name = dev.getInfo(); + bool ok = true; + + auto dev_type = dev.getInfo(); + + cl_device_type device_types[] = { + CL_DEVICE_TYPE_DEFAULT, + CL_DEVICE_TYPE_CPU, + CL_DEVICE_TYPE_GPU, + CL_DEVICE_TYPE_ACCELERATOR }; + + if (dev_type != device_types[config.device_type]) + { + reasons.push_back(dev_name + ": invalid device type"); + ok = false; + } + + auto vendor_id = dev.getInfo(); + if (vendor_id != config.device_vendor) + { + reasons.push_back(dev_name + ": invalid vendor type"); + ok = false; + } + + if (config.host_out_of_order) + { + auto queue_properties = dev.getInfo(); + using cmp_t = std::common_type::type>::type; + if (!(static_cast(queue_properties) & static_cast(cl::QueueProperties::OutOfOrder))) + { + reasons.push_back(dev_name + ": missing out of order support"); + ok = false; + } + } + return ok; + } + + void ocl_builder::build_platform_id() + { + cl_int err; + _platform_id = _device.getInfo(&err); + if (err != CL_SUCCESS) + { + throw std::runtime_error("Error getting OpenCL platform_id from device!"); + } + } + +} +} + diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_builder.h
b/inference-engine/thirdparty/clDNN/src/gpu/ocl_builder.h new file mode 100644 index 0000000..0f6f6e0 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/gpu/ocl_builder.h @@ -0,0 +1,54 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +// we want exceptions +#define CL_HPP_ENABLE_EXCEPTIONS +#define CL_HPP_MINIMUM_OPENCL_VERSION 120 +#define CL_HPP_TARGET_OPENCL_VERSION 120 +#include +#include + +namespace cldnn { +namespace gpu { + struct configuration; + + class ocl_builder + { + public: + ocl_builder(const configuration& config); + cl::Context get_context() const { return _context; } + const cl::Device &get_device() const { return _device; } + cl_platform_id get_platform_id() const { return _platform_id; } + bool is_user_context() const { return _is_user_context; } + + private: + cl::Context _context; + cl::Device _device; + cl_platform_id _platform_id; + bool _is_user_context; + + void build_device_from_user_context(const configuration& config); + void build_device(const configuration& config); + void build_context(); + bool does_device_match_config(const configuration& config, const cl::Device& dev, std::list& reasons); + void build_platform_id(); + }; + +} +} + diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp b/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp index d74a036..0044ec3 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp @@ -18,6 +18,8 @@ #include "ocl_toolkit.h" #include "ocl_base_event.h" #include "ocl_user_event.h" +#include "command_queues_builder.h" +#include "events_pool.h" #include #include @@ -70,96 +72,6 @@ ocl_error::ocl_error(cl::Error const & err) : error(err.what() + std::string(", { } -namespace { - - cl_device_type convert_configuration_device_type(configuration::device_types device_type) - { - cl_device_type device_types[] = { - CL_DEVICE_TYPE_DEFAULT, - CL_DEVICE_TYPE_CPU, - CL_DEVICE_TYPE_GPU, - CL_DEVICE_TYPE_ACCELERATOR }; - return device_types[device_type]; - } - - bool does_device_match_config(cl::Device const& dev, configuration const& config, std::list& reasons) - { - auto dev_name = dev.getInfo(); - bool ok = true; - - auto dev_type = dev.getInfo(); - - if (dev_type != convert_configuration_device_type(config.device_type)) - { - reasons.push_back(dev_name + ": invalid device type"); - ok = false; - } - - auto vendor_id = dev.getInfo(); - if (vendor_id != config.device_vendor) - { - reasons.push_back(dev_name + ": invalid vendor type"); - ok = false; - } - - if (config.host_out_of_order) - { - auto queue_properties = dev.getInfo(); - using cmp_t = std::common_type::type>::type; - if (!(static_cast(queue_properties) & static_cast(cl::QueueProperties::OutOfOrder))) - { - reasons.push_back(dev_name + ": missing out of order support"); - ok = false; - } - } - - 
return ok; - } -} - -cl::Device get_gpu_device(const configuration& config, cl_platform_id& platform_id) -{ - std::list reasons; - cl_uint n = 0; - - // Get number of platforms availible - cl_int err = clGetPlatformIDs(0, NULL, &n); - if (err != CL_SUCCESS) { - throw std::runtime_error("clGetPlatformIDs error " + std::to_string(err)); - } - - // Get platform list - std::vector platform_ids(n); - err = clGetPlatformIDs(n, platform_ids.data(), NULL); - if (err != CL_SUCCESS) { - throw std::runtime_error("clGetPlatformIDs error " + std::to_string(err)); - } - - for (auto& id : platform_ids) - { - cl::Platform platform = cl::Platform(id); - std::vector devices; - platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); - for (auto& d : devices) - { - if (does_device_match_config(d, config, reasons)) - { - platform_id = id; - return d; - } - } - } - - if (reasons.empty()) - throw std::runtime_error("Could not find any OpenCL device"); - - std::string error_msg = "No OpenCL device found which would match provided configuration:"; - for (const auto& reason : reasons) - error_msg += "\n " + reason; - - throw std::invalid_argument(std::move(error_msg)); -} - std::shared_ptr gpu_toolkit::create(const configuration & cfg) { struct make_shared_wa : public gpu_toolkit { make_shared_wa(const configuration& cfg) : gpu_toolkit(cfg) {} }; @@ -176,116 +88,21 @@ struct gpu_toolkit::ocl_logger std::ofstream _log_file; }; -gpu_toolkit::gpu_toolkit(const configuration& config) +gpu_toolkit::gpu_toolkit(const configuration& config) : _configuration(config) - , _device(get_gpu_device(config, _platform_id)) + , _ocl_builder(config) + , _user_context(_ocl_builder.is_user_context()) , _neo_driver(strstr(get_device_version().c_str(), "NEO") ? true : false) - , _context(_device) - , _command_queue(_context, - _device, - (config.enable_profiling - ? cl::QueueProperties::Profiling - : cl::QueueProperties::None) | - (config.host_out_of_order && _neo_driver - ? cl::QueueProperties::OutOfOrder - : cl::QueueProperties::None)) + , _context(_ocl_builder.get_context()) + , _platform_id(_ocl_builder.get_platform_id()) , _engine_info(*this) , _kernels_cache(*this) + , _events_pool(new events_pool()) { - _device.getInfo(CL_DEVICE_EXTENSIONS, &_extensions); - - cl_command_queue_properties queue_properties = - ((config.enable_profiling) ? - CL_QUEUE_PROFILING_ENABLE : - 0) | - ((config.host_out_of_order && - _neo_driver) ? - CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE : - 0); - - if (_configuration.priority_mode != cldnn_priority_disabled) - { - if (extension_supported("cl_khr_priority_hints") && - extension_supported("cl_intelx_create_command_queue")) - // TODO add check when caps will be availible (instead of cl_intelx_create_command_queue) - //&& extension_supported("cl_khr_create_command_queue")) - { - // TODO: When cl_khr_create_command_queue will be availible the - // function name will change to clCreateCommandQueueWithPropertiesKHR - // in place of clCreateCommandQueueWithPropertiesINTEL. 
-#ifndef WIN32 - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wpedantic" -#endif - pfn_clCreateCommandQueueWithPropertiesINTEL clCreateCommandQueueWithPropertiesINTEL = - (pfn_clCreateCommandQueueWithPropertiesINTEL)clGetExtensionFunctionAddressForPlatform( - _platform_id, - "clCreateCommandQueueWithPropertiesINTEL"); -#ifndef WIN32 - #pragma GCC diagnostic pop -#endif - unsigned cl_queue_priority_value = CL_QUEUE_PRIORITY_MED_KHR; - - switch (_configuration.priority_mode) - { - case cldnn_priority_high: - cl_queue_priority_value = CL_QUEUE_PRIORITY_HIGH_KHR; - break; - case cldnn_priority_low: - cl_queue_priority_value = CL_QUEUE_PRIORITY_LOW_KHR; - break; - default: - break; - } - - cl_int error_code = CL_SUCCESS; - cl_queue_properties properties_low[] = { - CL_QUEUE_PRIORITY_KHR, cl_queue_priority_value, - CL_QUEUE_PROPERTIES, queue_properties, - 0 }; - - _command_queue = clCreateCommandQueueWithPropertiesINTEL( - _context.get(), - _device.get(), - properties_low, - &error_code); - - if (error_code != CL_SUCCESS) { - throw std::runtime_error("clCreateCommandQueueWithPropertiesINTEL error " + std::to_string(error_code)); - } - } - else - { - throw std::invalid_argument( - "The param priority_mode is set in engine_configuration,\ - but cl_khr_priority_hints or cl_khr_create_command_queue\ - is not supported by current OpenCL implementation."); - } - } - else - { - _command_queue = cl::CommandQueue(_context, _device, queue_properties); - } - - if (_configuration.throttle_mode != cldnn_throttle_disabled) - { - if (extension_supported("cl_khr_throttle_hints")) - { - throw std::invalid_argument( - "The param throttle_mode is set in engine_configuration,\ - but it is placeholder for future use. It has no effect for now\ - and should be set to cldnn_throttle_disabled"); - } - else - { - throw std::invalid_argument( - "The param throttle_mode is set in engine_configuration,\ - but cl_khr_throttle_hints is not supported by current OpenCL implementation."); - } - } + _ocl_builder.get_device().getInfo(CL_DEVICE_EXTENSIONS, &_extensions); + build_command_queues(config); _logger = std::unique_ptr(new ocl_logger()); - if (logging_enabled()) { open_log() @@ -303,9 +120,7 @@ gpu_toolkit::gpu_toolkit(const configuration& config) << " engine log: " << _configuration.log << "\n" << " sources dumps: " << _configuration.ocl_sources_dumps_dir << "\n" << "\nEngine info:\n" - << " configuration: " << std::to_string(_engine_info.configuration) << "\n" - << " model: " << std::to_string(_engine_info.model) << "\n" - << " architecture: " << std::to_string(_engine_info.architecture) << "\n" + << " device id: " << _engine_info.dev_id << "\n" << " cores count: " << _engine_info.cores_count << "\n" << " core frequencey: " << _engine_info.core_frequency << "\n" << " max work group size: " << _engine_info.max_work_group_size << "\n" @@ -313,10 +128,28 @@ << " fp16: " << std::boolalpha << (_engine_info.supports_fp16 != 0) << "\n" << " fp16 denorms: " << std::boolalpha << (_engine_info.supports_fp16_denorms != 0) << "\n" << " subgroups short: " << std::boolalpha << (_engine_info.supports_subgroups_short != 0) << "\n" + << " user defined context: " << std::boolalpha << _user_context << "\n" << std::endl; } } +void gpu_toolkit::build_command_queues(const configuration& config) +{ + command_queues_builder queue_builder(_context, _ocl_builder.get_device(), _platform_id); + queue_builder.set_profiling(config.enable_profiling); +
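// request an out-of-order queue only when the host asks for it and the NEO driver is present
+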
queue_builder.set_out_of_order((config.host_out_of_order && _neo_driver)); + + bool priority_extensions = extension_supported("cl_khr_priority_hints") && extension_supported("cl_khr_create_command_queue"); + queue_builder.set_priority_mode(config.priority_mode, priority_extensions); + + bool throttle_extensions = extension_supported("cl_khr_throttle_hints") && extension_supported("cl_khr_create_command_queue"); + queue_builder.set_throttle_mode(config.throttle_mode, throttle_extensions); + + queue_builder.build(); + + _command_queue = queue_builder.queue(); +} + event_impl::ptr gpu_toolkit::enqueue_kernel(cl::Kernel const& kern, cl::NDRange const& global, cl::NDRange const& local, std::vector const & deps) { std::vector dep_events; @@ -358,14 +191,13 @@ event_impl::ptr gpu_toolkit::enqueue_kernel(cl::Kernel const& kern, cl::NDRange log(_queue_counter + 1, msg); } - - return{ new base_event(shared_from_this(), ret_ev, ++_queue_counter), false }; + return _events_pool->get_from_base_pool(shared_from_this(), ret_ev, ++_queue_counter); } event_impl::ptr gpu_toolkit::enqueue_marker(std::vector const& deps) { if (deps.empty()) - return{ new user_event(shared_from_this(), true), false }; + return _events_pool->get_from_user_pool(shared_from_this(), true); if (!_configuration.host_out_of_order) { @@ -379,7 +211,7 @@ event_impl::ptr gpu_toolkit::enqueue_marker(std::vector const& try { _command_queue.enqueueMarkerWithWaitList(&dep_events, &ret_ev); - } + } catch (cl::Error const& err) { throw ocl_error(err); } @@ -396,19 +228,33 @@ event_impl::ptr gpu_toolkit::enqueue_marker(std::vector const& if (logging_enabled()) log(_queue_counter + 1, "Marker with dependencies: " + events_list_to_string(deps)); - - return{ new base_event(shared_from_this(), ret_ev, ++_queue_counter), false }; + return _events_pool->get_from_base_pool(shared_from_this(), ret_ev, ++_queue_counter); } else { sync_events(deps); - return{ new base_event(shared_from_this(), _last_barrier_ev, _last_barrier), false }; + return _events_pool->get_from_base_pool(shared_from_this(), _last_barrier_ev, _last_barrier); } } event_impl::ptr gpu_toolkit::group_events(std::vector const& deps) { - return{ new base_events(shared_from_this(), deps), false }; + return _events_pool->get_from_group_pool(shared_from_this(), deps); +} + +event_impl::ptr gpu_toolkit::create_user_event(bool set) +{ + return _events_pool->get_from_user_pool(shared_from_this(), set); +} + +void gpu_toolkit::reset_events() +{ + _events_pool->reset_events(); +} + +void gpu_toolkit::release_events_pool() +{ + _events_pool.reset(); } void gpu_toolkit::flush() @@ -419,7 +265,7 @@ void gpu_toolkit::flush() } void gpu_toolkit::release_pending_memory() { - /* + /* TODO: Temp. solution, until proper API calls from OpenCL are released.
*/ void* ptr = nullptr; @@ -483,14 +329,14 @@ void gpu_toolkit::sync_events(std::vector const & deps) { try { if (_output_event) - { + { _command_queue.enqueueBarrierWithWaitList(nullptr, &_last_barrier_ev); } else { _command_queue.enqueueBarrierWithWaitList(nullptr, nullptr); } - + } catch (cl::Error const& err) { throw ocl_error(err); diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.h b/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.h index 50c7460..1a69bd4 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.h +++ b/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.h @@ -17,25 +17,20 @@ /////////////////////////////////////////////////////////////////////////////////////////////////// #pragma once -// we want exceptions -#define CL_HPP_ENABLE_EXCEPTIONS -#define CL_HPP_MINIMUM_OPENCL_VERSION 120 -#define CL_HPP_TARGET_OPENCL_VERSION 120 -#include +#include "ocl_builder.h" -#include "api/CPP/profiling.hpp" #include "kernels_cache.h" #include "engine_info.h" #include "event_impl.h" +#include "confiugration.h" #include #include -namespace cldnn { +namespace cldnn { typedef cl::vector> kernels_binaries_vector; - typedef cl::vector kernels_binaries_container; + typedef cl::vector kernels_binaries_container; namespace gpu { - typedef CL_API_ENTRY cl_command_queue(CL_API_CALL *pfn_clCreateCommandQueueWithPropertiesINTEL)( cl_context context, cl_device_id device, @@ -48,26 +43,7 @@ public: ocl_error(cl::Error const& err); }; -struct configuration -{ - enum device_types { default_device = 0, cpu, gpu, accelerator }; - - configuration(); - - bool enable_profiling; - bool meaningful_kernels_names; - bool dump_custom_program; - device_types device_type; - uint32_t device_vendor; - std::string compiler_options; - std::string single_kernel_name; - bool host_out_of_order; - std::string log; - std::string ocl_sources_dumps_dir; - cldnn_priority_mode_type priority_mode; - cldnn_throttle_mode_type throttle_mode; -}; - +class events_pool; class gpu_toolkit; class context_holder @@ -82,42 +58,18 @@ protected: }; -struct profiling_period_event : instrumentation::profiling_period -{ - profiling_period_event(const cl::Event& event, cl_profiling_info start, cl_profiling_info end) - : _event(event) - , _start(start) - , _end(end) - {} - - std::chrono::nanoseconds value() const override - { - cl_ulong start_nanoseconds; - _event.getProfilingInfo(_start, &start_nanoseconds); - cl_ulong end_nanoseconds; - _event.getProfilingInfo(_end, &end_nanoseconds); - return std::chrono::nanoseconds(static_cast(end_nanoseconds - start_nanoseconds)); - } - -private: - cl::Event _event; - cl_profiling_info _start; - cl_profiling_info _end; -}; - class gpu_toolkit : public std::enable_shared_from_this { friend class context_holder; protected: gpu_toolkit(const configuration& aconfiguration = configuration()); - public: static std::shared_ptr create(const configuration& cfg = configuration()); const cl::Context& context() const { return _context; } - const cl::Device& device() const { return _device; } + const cl::Device& device() const { return _ocl_builder.get_device(); } const cl::CommandQueue& queue() const { return _command_queue; } - + const configuration& get_configuration() const { return _configuration; } engine_info_internal get_engine_info() const { return _engine_info; } kernels_cache& get_kernels_cache() { return _kernels_cache; } @@ -125,7 +77,7 @@ public: void store_binaries(kernels_binaries_vector binaries) { _binaries.push_back(binaries); } bool get_serialization_flag() { 
return _serialize; } void set_serialization_flag(bool serialization_flag) { _serialize = serialization_flag; } - + inline bool extension_supported(const std::string ext) { return _extensions.find(ext) != std::string::npos; } gpu_toolkit(const gpu_toolkit& other) = delete; @@ -139,6 +91,9 @@ public: event_impl::ptr enqueue_kernel(cl::Kernel const& kern, cl::NDRange const& global, cl::NDRange const& local, std::vector const& deps); event_impl::ptr enqueue_marker(std::vector const& deps); event_impl::ptr group_events(std::vector const& deps); + void reset_events(); + event_impl::ptr create_user_event(bool set); + void release_events_pool(); void flush(); void release_pending_memory(); @@ -147,10 +102,10 @@ public: void log(uint64_t id, std::string const& msg); bool logging_enabled() const { return !_configuration.log.empty(); } bool is_neo_driver() { return _neo_driver; } - private: configuration _configuration; - cl::Device _device; + ocl_builder _ocl_builder; + bool _user_context = false; bool _neo_driver = false; cl::Context _context; cl::CommandQueue _command_queue; @@ -162,6 +117,7 @@ private: std::atomic _queue_counter{ 0 }; std::atomic _last_barrier{ 0 }; + std::unique_ptr _events_pool; cl::Event _last_barrier_ev; std::string _extensions; @@ -174,7 +130,9 @@ private: bool _output_event = false; std::ofstream& open_log(); - std::string get_device_version() { return _device.getInfo(); } + std::string get_device_version() { return _ocl_builder.get_device().getInfo(); } + + void build_command_queues(const configuration& config); }; }} diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_user_event.cpp b/inference-engine/thirdparty/clDNN/src/gpu/ocl_user_event.cpp index 5769193..c357134 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/ocl_user_event.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/ocl_user_event.cpp @@ -1,5 +1,22 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + #include "ocl_user_event.h" + using namespace cldnn::gpu; void user_event::set_impl() @@ -10,6 +27,7 @@ void user_event::set_impl() static_cast(get()).setStatus(CL_COMPLETE); _duration = std::unique_ptr( new cldnn::instrumentation::profiling_period_basic(_timer.uptime())); + _attached = true; } bool user_event::get_profiling_info_impl(std::list& info) { @@ -20,4 +38,4 @@ bool user_event::get_profiling_info_impl(std::list& in info.push_back({ "duration", static_cast(_duration->value().count()) }); return true; -} +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_user_event.h b/inference-engine/thirdparty/clDNN/src/gpu/ocl_user_event.h index 8fe2692..6346aad 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/ocl_user_event.h +++ b/inference-engine/thirdparty/clDNN/src/gpu/ocl_user_event.h @@ -1,6 +1,24 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + + #pragma once #include "ocl_base_event.h" +#include "api/CPP/profiling.hpp" #ifdef _WIN32 #pragma warning(push) @@ -11,14 +29,23 @@ namespace cldnn { namespace gpu { struct user_event : public base_event, public cldnn::user_event { - user_event(std::shared_ptr ctx, bool auto_set = false) : base_event(ctx, cl::UserEvent(ctx->context())), cldnn::user_event(auto_set) - { - if (auto_set) - user_event::set_impl(); - } + user_event(std::shared_ptr ctx) + : base_event(ctx) + , cldnn::user_event(false) + {} void set_impl() override; - + void attach_event(bool set) + { + _event = cl::UserEvent(get_context()->context()); + // we need to reset the timer (since attach_event is called only when this object is being reused) + _timer = cldnn::instrumentation::timer<>(); + if (set) + { + set_impl(); + _set = set; + } + } bool get_profiling_info_impl(std::list& info) override; protected: diff --git a/inference-engine/thirdparty/clDNN/src/gpu/one_hot_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/one_hot_gpu.cpp new file mode 100644 index 0000000..8b7c4f1 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/gpu/one_hot_gpu.cpp @@ -0,0 +1,72 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
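+
+// one_hot_gpu: GPU implementation of the one_hot primitive. create() below reads the
+// one-hot axis from the primitive and derives the one-hot limit from the output layout.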
+ + +#include "one_hot_inst.h" + +#include "primitive_gpu_base.h" +#include "implementation_map.h" +#include "kernel_selector_helper.h" +#include "one_hot/one_hot_kernel_selector.h" +#include "one_hot/one_hot_kernel_base.h" +#include "error_handler.h" + +namespace cldnn { + namespace gpu { + + struct one_hot_gpu : typed_primitive_gpu_impl + { + using parent = typed_primitive_gpu_impl; + using parent::parent; + + + static primitive_impl* create(const one_hot_node& arg) + { + auto oh_params = get_default_params(arg, 1); + auto oh_optional_params = get_default_optional_params(arg.get_program()); + + oh_params.one_hot_axis = arg.get_primitive()->one_hot_axis; + + auto output_sizes = arg.get_output_layout().size; + std::vector output_dims = { output_sizes.batch[0], output_sizes.feature[0], + output_sizes.spatial[1], output_sizes.spatial[0] }; + oh_params.one_hot_limit = output_dims[oh_params.one_hot_axis]; + + auto& kernel_selector = kernel_selector::one_hot_kernel_selector::Instance(); + auto best_kernels = kernel_selector.GetBestKernels(oh_params, oh_optional_params); + + CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with these arguments"); + + return new one_hot_gpu(arg, best_kernels[0]); + } + }; + + namespace { + struct attach { + attach() { + auto val_fw = one_hot_gpu::create; + + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i32, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i64, format::bfyx), val_fw); + } + ~attach() = default; + }; + + attach attach_impl; + + } + } +} diff --git a/inference-engine/thirdparty/clDNN/src/gpu/permute_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/permute_gpu.cpp index 6bf0208..8865625 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/permute_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/permute_gpu.cpp @@ -36,13 +36,8 @@ struct permute_gpu : typed_primitive_gpu_impl auto permute_params = get_default_params(arg); auto permute_optional_params = get_default_optional_params(arg.get_program()); - uint16_t max_input_index = (uint16_t)(permute_params.inputs[0].GetDims().size() - 1); const auto& permute_order = arg.get_primitive()->permute_order; - for (size_t i = 0; i < permute_order.size(); i++) - { - auto order = permute_order[permute_order.size() - 1 - i]; - permute_params.order.push_back(max_input_index - order); - } + permute_params.order = permute_order; auto& kernel_selector = kernel_selector::permute_kernel_selector::Instance(); auto best_kernels = kernel_selector.GetBestKernels(permute_params, permute_optional_params); @@ -65,4 +60,4 @@ namespace { }; attach attach_impl; } -} } \ No newline at end of file +} } diff --git a/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp index e21df51..401b716 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -113,6 +113,11 @@ public: pp.poolType = cldnn_2_pool_type(primitive->mode); pp.remainderAction = kernel_selector::pool_remainder::CEIL; + if (primitive->global_pooling) { + primitive->size.spatial[0] = input_sizes.spatial[0]; + primitive->size.spatial[1] = input_sizes.spatial[1]; + } + //check if last pooling window goes outside of input size + padding. If so the avg pooling size will be adjusted to that. auto dynamic_mode = (((output_sizes.spatial[0] - 1) * stride.spatial[0]) + primitive->size.spatial[0]) > -2 * input_offset.spatial[0] + input_sizes.spatial[0] || (((output_sizes.spatial[1] - 1) * stride.spatial[1]) + primitive->size.spatial[1]) > -2 * input_offset.spatial[1] + input_sizes.spatial[1]; @@ -174,6 +179,8 @@ namespace { // MMAD implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), pooling_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), pooling_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), pooling_gpu::create); } ~attach() {} }; diff --git a/inference-engine/thirdparty/clDNN/src/gpu/primitive_gpu_base.cpp b/inference-engine/thirdparty/clDNN/src/gpu/primitive_gpu_base.cpp index 3128f2a..f11a8ec 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/primitive_gpu_base.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/primitive_gpu_base.cpp @@ -16,10 +16,6 @@ #include "primitive_gpu_base.h" -#include "detection_output_inst.h" -#include "proposal_inst.h" -#include "prior_box_inst.h" - namespace cldnn { namespace gpu { @@ -27,12 +23,8 @@ namespace cldnn { { for (const auto& user : users) { - if (user->type() == detection_output::type_id() || - user->type() == prior_box::type_id() || - user->type() == proposal::type_id()) - { + if (user->get_selected_impl()->is_cpu()) return true; - } } return false; } diff --git a/inference-engine/thirdparty/clDNN/src/gpu/primitive_gpu_base.h b/inference-engine/thirdparty/clDNN/src/gpu/primitive_gpu_base.h index 8343147..704b83e 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/primitive_gpu_base.h +++ b/inference-engine/thirdparty/clDNN/src/gpu/primitive_gpu_base.h @@ -18,6 +18,7 @@ #pragma once #include "primitive_inst.h" +#include "program_impl.h" #include "kernel.h" #include "events_waiter.h" #include "error_handler.h" @@ -30,8 +31,8 @@ namespace cldnn { namespace gpu bool is_any_user_cpu(const std::list& users); /* -Base class for all implementation of specified primitive type. -For example, all convolution implementations should derive from typed_primitive_impl. +Base class for all GPU implementations of a specified primitive type. +For example, all GPU convolution implementations should derive from typed_primitive_gpu_impl.
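+
+A typical derived implementation follows the pattern repeated throughout this patch
+(a sketch with a hypothetical primitive "my_prim"; the real ones live in the *_gpu.cpp files):
+
+    struct my_prim_gpu : typed_primitive_gpu_impl<my_prim>
+    {
+        using parent = typed_primitive_gpu_impl<my_prim>;
+        using parent::parent;
+
+        static primitive_impl* create(const my_prim_node& arg)
+        {
+            // translate the node into kernel-selector parameters
+            auto params = get_default_params<kernel_selector::my_prim_params>(arg);
+            auto optional_params = get_default_optional_params<kernel_selector::my_prim_optional_params>(arg.get_program());
+            // ask the kernel selector for the best kernel (callers check that the list is
+            // not empty via CLDNN_ERROR_BOOL) and wrap it; an attach struct in an anonymous
+            // namespace registers create() per (engine, data type, format) tuple
+            auto best_kernels = kernel_selector::my_prim_kernel_selector::Instance().GetBestKernels(params, optional_params);
+            return new my_prim_gpu(arg, best_kernels[0]);
+        }
+    };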
*/ template struct typed_primitive_gpu_impl : public typed_primitive_impl @@ -67,13 +68,11 @@ struct typed_primitive_gpu_impl : public typed_primitive_impl auto& eimpl = arg.get_program().get_engine(); _intermediates_memory.push_back(eimpl.allocate_memory(expected_layout)); } - } -protected: - virtual bool validate(typed_primitive_inst&) const - { - return true; } + bool is_cpu() const override { return false; } + +protected: virtual bool optimized_out(typed_primitive_inst&) const { @@ -99,6 +98,11 @@ protected: return 1; } + virtual uint32_t get_groups() const + { + return 1; + } + event_impl::ptr aggregate_events(const std::vector& events, bool group=false) const { if (events.size() == 1) @@ -112,9 +116,6 @@ protected: virtual event_impl::ptr execute_impl(const std::vector& events, typed_primitive_inst& instance) override { - const bool validated = validate(instance); - CLDNN_ERROR_NOT_EQUAL(_outer.id(), "validate", validated, "", true, "not a valid instance."); - if (optimized_out(instance)) { return aggregate_events(events); @@ -124,6 +125,9 @@ protected: // TODO - split should be handle in kernel selector by providing multiple kernels. auto split = get_split(); + auto groups = get_groups(); + if (split == 1) + split = groups; // we iterate over split first in order to be able parallelism with OOOQ mechanism. for (size_t k = 0; k < _kernels.size(); ++k) diff --git a/inference-engine/thirdparty/clDNN/src/gpu/proposal_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/proposal_gpu.cpp index 6eb7393..d172f84 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/proposal_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/proposal_gpu.cpp @@ -16,7 +16,6 @@ #include "proposal_inst.h" #include "kernel.h" -#include "kd_selector.h" #include "implementation_map.h" #include "network_impl.h" #include "engine_impl.h" @@ -38,7 +37,7 @@ namespace { * * ****************************************************************************/ - inline const float & clamp(const float & v, const float & lower, const float & upper) + inline const float& clamp(const float & v, const float & lower, const float & upper) { return std::max(lower, std::min(v, upper)); } @@ -54,22 +53,22 @@ namespace { { float x0, y0, x1, y1; - inline float area() const - { - return std::max(0.f, y1 - y0 + 1.f) * std::max(0.f, x1 - x0 + 1.f); + inline float area() const + { + return std::max(0.f, y1 - y0 + 1.f) * std::max(0.f, x1 - x0 + 1.f); } }; struct delta_t { float shift_x, shift_y, log_w, log_h; }; - struct proposal_t - { + struct proposal_t + { proposal_t() = default; proposal_t(const roi_t& r, const float c, const size_t& o) : roi(r), confidence(c), ord(o) {} - roi_t roi; - float confidence; - size_t ord; + roi_t roi; + float confidence; + size_t ord; }; inline float float_read_helper(const float* mem) @@ -124,7 +123,8 @@ namespace { int img_w, int img_h, float coordinates_offset, - bool initial_clip) + bool initial_clip, + bool clip_before_nms) { float x0 = box.start_x + anchor_shift_x; float y0 = box.start_y + anchor_shift_y; @@ -149,10 +149,20 @@ namespace { const float half_pred_w = std::exp(delta.log_w) * anchor_w * .5f; const float half_pred_h = std::exp(delta.log_h) * anchor_h * .5f; - return { clamp(pred_center_x - half_pred_w, 0.f, img_w - coordinates_offset), - clamp(pred_center_y - half_pred_h, 0.f, img_h - coordinates_offset), - clamp(pred_center_x + half_pred_w, 0.f, img_w - coordinates_offset), - clamp(pred_center_y + half_pred_h, 0.f, img_h - coordinates_offset) }; + float new_x0 = pred_center_x - half_pred_w; 
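+            // note: clamping of the box corners is deferred; it is applied just below, and only when clip_before_nms is set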
+ float new_y0 = pred_center_y - half_pred_h; + float new_x1 = pred_center_x + half_pred_w; + float new_y1 = pred_center_y + half_pred_h; + + if (clip_before_nms) + { + new_x0 = clamp(new_x0, 0.f, img_w - coordinates_offset); + new_y0 = clamp(new_y0, 0.f, img_h - coordinates_offset); + new_x1 = clamp(new_x1, 0.f, img_w - coordinates_offset); + new_y1 = clamp(new_y1, 0.f, img_h - coordinates_offset); + } + + return { new_x0, new_y0, new_x1, new_y1 }; } std::vector perform_nms( @@ -242,11 +252,13 @@ struct proposal_gpu : typed_primitive_impl bool swap_xy = instance.argument.swap_xy; bool initial_clip = instance.argument.initial_clip; + bool clip_before_nms = instance.argument.clip_before_nms; + bool clip_after_nms = instance.argument.clip_after_nms; float coordinates_offset = instance.argument.coordinates_offset; float box_coordinate_scale = instance.argument.box_coordinate_scale; float box_size_scale = instance.argument.box_size_scale; - if (image_info.get_layout().count() == 4) + if (image_info.get_layout().size.feature[0] == 4) { img_w = static_cast(float_read_helper(image_info_mem + proposal_inst::image_info_width_index) + EPSILON); img_h = static_cast(float_read_helper(image_info_mem + proposal_inst::image_info_height_index) + EPSILON); @@ -262,13 +274,13 @@ struct proposal_gpu : typed_primitive_impl scaled_min_bbox_size *= img_z; min_bbox_x = scaled_min_bbox_size; - if (image_info.get_layout().count() > proposal_inst::image_info_scale_min_bbox_x) + if (image_info.get_layout().size.feature[0] > proposal_inst::image_info_scale_min_bbox_x) { min_bbox_x = static_cast(min_bbox_x * float_read_helper(image_info_mem + proposal_inst::image_info_scale_min_bbox_x)); } min_bbox_y = scaled_min_bbox_size; - if (image_info.get_layout().count() > proposal_inst::image_info_scale_min_bbox_y) + if (image_info.get_layout().size.feature[0] > proposal_inst::image_info_scale_min_bbox_y) { min_bbox_y = static_cast(min_bbox_y * float_read_helper(image_info_mem + proposal_inst::image_info_scale_min_bbox_y)); } @@ -291,67 +303,80 @@ struct proposal_gpu : typed_primitive_impl const dtype* cls_scores_mem = cls_scores_ptr.data(); const dtype* bbox_pred_mem = bbox_pred_ptr.data(); - std::vector sorted_proposals_confidence; - sorted_proposals_confidence.reserve(fm_h * fm_w * anchors_num); - for (int y = 0; y < fm_h; ++y) + for (int n = 0; n < score_size.batch[0]; n++) { - for (int x = 0; x < fm_w; ++x) + std::vector sorted_proposals_confidence; + size_t num_proposals = fm_h * fm_w * anchors_num; + sorted_proposals_confidence.reserve(num_proposals); + for (int y = 0; y < fm_h; ++y) { - const int anchor_shift_x = (swap_xy ? y : x) * instance.argument.feature_stride; - const int anchor_shift_y = (swap_xy ? 
x : y) * instance.argument.feature_stride; - const int location_index = y * fm_w + x; - - // we assume proposals are grouped by window location - for (unsigned int anchor_index = 0; anchor_index < anchors_num ; anchor_index++) + for (int x = 0; x < fm_w; ++x) { - float dx0 = float_read_helper(bbox_pred_mem + location_index + fm_sz * (anchor_index * 4 + 0)) / box_coordinate_scale; - float dy0 = float_read_helper(bbox_pred_mem + location_index + fm_sz * (anchor_index * 4 + 1)) / box_coordinate_scale; - float dx1 = float_read_helper(bbox_pred_mem + location_index + fm_sz * (anchor_index * 4 + 2)) / box_size_scale; - float dy1 = float_read_helper(bbox_pred_mem + location_index + fm_sz * (anchor_index * 4 + 3)) / box_size_scale; - - delta_t bbox_delta { dx0, dy0, dx1, dy1 }; - - const roi_t& roi = gen_bbox(anchors[anchor_index], bbox_delta, anchor_shift_x, anchor_shift_y, - img_w, img_h, coordinates_offset, initial_clip); - - int bbox_w = (int)(roi.x1 - roi.x0 + coordinates_offset); - int bbox_h = (int)(roi.y1 - roi.y0 + coordinates_offset); - - unsigned int scores_index = location_index + fm_sz * (anchor_index + (unsigned int)anchors_num); - float proposal_confidence = (min_bbox_x <= bbox_w)* (min_bbox_y <= bbox_h) * float_read_helper(cls_scores_mem + scores_index); - sorted_proposals_confidence.emplace_back(roi, proposal_confidence, sorted_proposals_confidence.size()); + const int anchor_shift_x = (swap_xy ? y : x) * instance.argument.feature_stride; + const int anchor_shift_y = (swap_xy ? x : y) * instance.argument.feature_stride; + const int location_index = y * fm_w + x; + + // we assume proposals are grouped by window location + for (unsigned int anchor_index = 0; anchor_index < anchors_num ; anchor_index++) + { + float dx0 = float_read_helper(bbox_pred_mem + n*num_proposals*4 + location_index + fm_sz * (anchor_index * 4 + 0)) / box_coordinate_scale; + float dy0 = float_read_helper(bbox_pred_mem + n*num_proposals*4 + location_index + fm_sz * (anchor_index * 4 + 1)) / box_coordinate_scale; + float dx1 = float_read_helper(bbox_pred_mem + n*num_proposals*4 + location_index + fm_sz * (anchor_index * 4 + 2)) / box_size_scale; + float dy1 = float_read_helper(bbox_pred_mem + n*num_proposals*4 + location_index + fm_sz * (anchor_index * 4 + 3)) / box_size_scale; + + delta_t bbox_delta { dx0, dy0, dx1, dy1 }; + + const roi_t& roi = gen_bbox(anchors[anchor_index], bbox_delta, anchor_shift_x, anchor_shift_y, + img_w, img_h, coordinates_offset, initial_clip, clip_before_nms); + + int bbox_w = (int)(roi.x1 - roi.x0 + coordinates_offset); + int bbox_h = (int)(roi.y1 - roi.y0 + coordinates_offset); + + size_t scores_index = n*num_proposals * 2 + location_index + fm_sz * (anchor_index + anchors_num); + float proposal_confidence = (min_bbox_x <= bbox_w)* (min_bbox_y <= bbox_h) * float_read_helper(cls_scores_mem + scores_index); + sorted_proposals_confidence.emplace_back(roi, proposal_confidence, sorted_proposals_confidence.size()); + } } } - } - size_t pre_nms = std::min(instance.argument.pre_nms_topn, (int)sorted_proposals_confidence.size()); - sort_and_keep_n_items(sorted_proposals_confidence, pre_nms); - const std::vector& res = perform_nms(sorted_proposals_confidence, instance.argument.iou_threshold, - instance.argument.post_nms_topn, coordinates_offset); + size_t pre_nms = std::min(instance.argument.pre_nms_topn, (int)sorted_proposals_confidence.size()); + sort_and_keep_n_items(sorted_proposals_confidence, pre_nms); + std::vector res = perform_nms(sorted_proposals_confidence, 
instance.argument.iou_threshold, + instance.argument.post_nms_topn, coordinates_offset); - auto& output = instance.output_memory(); + auto& output = instance.output_memory(); - mem_lock output_ptr{ output }; - dtype* top_data = output_ptr.data(); + mem_lock output_ptr{ output }; + dtype* top_data = output_ptr.data() + n*instance.argument.post_nms_topn*5; - size_t res_num_rois = res.size(); + size_t res_num_rois = res.size(); - for (size_t i = 0; i < res_num_rois; ++i) - { - float_write_helper(top_data + 5 * i + 0, 0.0f); - float_write_helper(top_data + 5 * i + 1, res[i].x0); - float_write_helper(top_data + 5 * i + 2, res[i].y0); - float_write_helper(top_data + 5 * i + 3, res[i].x1); - float_write_helper(top_data + 5 * i + 4, res[i].y1); - } - for (size_t i = res_num_rois; i < (size_t)instance.argument.post_nms_topn; i++) - { - float_write_helper(top_data + 5*i + 0, -1.0f); - float_write_helper(top_data + 5*i + 1, 0.0f); - float_write_helper(top_data + 5*i + 2, 0.0f); - float_write_helper(top_data + 5*i + 3, 0.0f); - float_write_helper(top_data + 5*i + 4, 0.0f); + for (size_t i = 0; i < res_num_rois; ++i) + { + if (clip_after_nms) + { + res[i].x0 = clamp(res[i].x0, 0.0f, float(img_w)); + res[i].y0 = clamp(res[i].y0, 0.0f, float(img_h)); + res[i].x1 = clamp(res[i].x1, 0.0f, float(img_w)); + res[i].y1 = clamp(res[i].y1, 0.0f, float(img_h)); + } + + float_write_helper(top_data + 5 * i + 0, float(n)); + float_write_helper(top_data + 5 * i + 1, res[i].x0 / (instance.argument.normalize ? img_w : 1.0f)); + float_write_helper(top_data + 5 * i + 2, res[i].y0 / (instance.argument.normalize ? img_h : 1.0f)); + float_write_helper(top_data + 5 * i + 3, res[i].x1 / (instance.argument.normalize ? img_w : 1.0f)); + float_write_helper(top_data + 5 * i + 4, res[i].y1 / (instance.argument.normalize ? 
img_h : 1.0f)); + } + + for (size_t i = res_num_rois; i < (size_t)instance.argument.post_nms_topn; i++) + { + float_write_helper(top_data + 5*i + 0, -1.0f); + float_write_helper(top_data + 5*i + 1, 0.0f); + float_write_helper(top_data + 5*i + 2, 0.0f); + float_write_helper(top_data + 5*i + 3, 0.0f); + float_write_helper(top_data + 5*i + 4, 0.0f); + } } } @@ -380,17 +405,15 @@ struct proposal_gpu : typed_primitive_impl static primitive_impl* create(const proposal_node& arg) { const layout & l = arg.image_info().get_output_layout(); - const size_t count = l.size.count(); + const size_t count = static_cast(l.size.feature[0]); //Supported image_info sizes and components meaning: // - image_info[3] = { img_height, img_width, img_depth } // - image_info[4] = { img_height, img_width, scale_min_bbox_y, scale_min_bbox_x } // - image_info[6] = { img_height, img_width, img_depth, scale_min_bbox_y, scale_min_bbox_x, scale_depth_index } - if ((size_t)l.size.feature[0] != count || (count != 3 && count != 4 && count != 6)) { + if (count != 3 && count != 4 && count != 6) { CLDNN_ERROR_MESSAGE(arg.id(), "image_info must have either 3, 4 or 6 items"); } - CLDNN_ERROR_BOOL(arg.id(), "Batching", !hasSingleBatchOutput(arg.bbox_pred()), "Proposal doesn't support batching."); - CLDNN_ERROR_BOOL(arg.id(), "Batching", !hasSingleBatchOutput(arg.cls_score()), "Proposal doesn't support batching."); return new proposal_gpu(arg); } diff --git a/inference-engine/thirdparty/clDNN/src/gpu/pyramid_roi_align_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/pyramid_roi_align_gpu.cpp new file mode 100644 index 0000000..d5164a1 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/gpu/pyramid_roi_align_gpu.cpp @@ -0,0 +1,76 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "pyramid_roi_align_inst.h"
+
+#include "primitive_gpu_base.h"
+#include "implementation_map.h"
+#include "kernel_selector_helper.h"
+#include "pyramid_roi_align/pyramid_roi_align_kernel_selector.h"
+#include "pyramid_roi_align/pyramid_roi_align_kernel_base.h"
+#include "error_handler.h"
+#include "pyramid_roi_align_inst.h"
+#include "network_impl.h"
+
+
+#define DEPTH_OF_FEATURE_MAP 4
+#define NUM_COORDINATES 4
+#define META_OFFSET_X 4
+#define META_OFFSET_Y 5
+
+namespace cldnn { namespace gpu {
+
+struct pyramid_roi_align_gpu : typed_primitive_gpu_impl<pyramid_roi_align>
+{
+    using parent = typed_primitive_gpu_impl<pyramid_roi_align>;
+    using parent::parent;
+
+    static primitive_impl* create(const pyramidROIAlign_node& arg)
+    {
+        auto pyramidROIAlign_params = get_default_params<kernel_selector::PyramidROIAlign_params>(arg, 1);
+        auto pyramidROIAlign_optional_params = get_default_optional_params<kernel_selector::PyramidROIAlign_optional_params>(arg.get_program());
+
+        pyramidROIAlign_params.inputs.push_back(convert_data_tensor(arg.image_meta().get_output_layout()));
+        pyramidROIAlign_params.inputs.push_back(convert_data_tensor(arg.P2().get_output_layout()));
+        pyramidROIAlign_params.inputs.push_back(convert_data_tensor(arg.P3().get_output_layout()));
+        pyramidROIAlign_params.inputs.push_back(convert_data_tensor(arg.P4().get_output_layout()));
+        pyramidROIAlign_params.inputs.push_back(convert_data_tensor(arg.P5().get_output_layout()));
+        pyramidROIAlign_params.inputs.push_back(convert_data_tensor(arg.pool_size().get_output_layout()));
+
+
+        auto& kernel_selector = kernel_selector::PyramidROIAlign_kernel_selector::Instance();
+        auto best_kernels = kernel_selector.GetBestKernels(pyramidROIAlign_params, pyramidROIAlign_optional_params);
+
+        CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with these arguments");
+
+        return new pyramid_roi_align_gpu(arg, best_kernels[0]);
+    }
+
+};
+
+namespace {
+    struct attach {
+        attach() {
+            auto val_fw = pyramid_roi_align_gpu::create;
+            implementation_map<pyramid_roi_align>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw);
+            implementation_map<pyramid_roi_align>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw);
+        }
+        ~attach() = default;
+
+    };
+
+    attach attach_impl;
+
+}
+}}
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/reverse_sequence_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/reverse_sequence_gpu.cpp
new file mode 100644
index 0000000..146a864
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/gpu/reverse_sequence_gpu.cpp
@@ -0,0 +1,71 @@
+/*
+// Copyright (c) 2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "reverse_sequence_inst.h"
+#include "primitive_gpu_base.h"
+#include "implementation_map.h"
+#include "kernel_selector_helper.h"
+#include "reverse_sequence/reverse_sequence_kernel_selector.h"
+#include "reverse_sequence/reverse_sequence_kernel_ref.h"
+#include "error_handler.h"
+
+using namespace cldnn;
+
+namespace cldnn { namespace gpu
+{
+struct reverse_sequence_gpu : typed_primitive_gpu_impl<reverse_sequence>
+{
+    using parent = typed_primitive_gpu_impl<reverse_sequence>;
+    using parent::parent;
+
+public:
+
+    static primitive_impl* create(const reverse_sequence_node& arg)
+    {
+        auto reverse_sequence_params = get_default_params<kernel_selector::reverse_sequence_params>(arg);
+        auto reverse_sequence_optional_params = get_default_optional_params<kernel_selector::reverse_sequence_optional_params>(arg.get_program());
+
+        reverse_sequence_params.seq_axis = arg.get_primitive()->seq_axis;
+        reverse_sequence_params.batch_axis = arg.get_primitive()->batch_axis;
+
+        reverse_sequence_params.inputs.push_back(convert_data_tensor(arg.input(1).get_output_layout()));
+
+        auto& kernel_selector = kernel_selector::reverse_sequence_kernel_selector::Instance();
+        auto best_kernels = kernel_selector.GetBestKernels(reverse_sequence_params, reverse_sequence_optional_params);
+
+        CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with these arguments");
+
+        auto reverse_sequence = new reverse_sequence_gpu(arg, best_kernels[0]);
+
+        return reverse_sequence;
+    }
+};
+
+namespace
+{
+    struct attach
+    {
+        attach()
+        {
+            auto val_fw = reverse_sequence_gpu::create;
+            implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw);
+            implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw);
+        }
+        ~attach() = default;
+    };
+    attach attach_impl;
+}
+} }
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/roi_pooling_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/roi_pooling_gpu.cpp
index 3ff7df6..d4d5dd6 100644
--- a/inference-engine/thirdparty/clDNN/src/gpu/roi_pooling_gpu.cpp
+++ b/inference-engine/thirdparty/clDNN/src/gpu/roi_pooling_gpu.cpp
@@ -83,30 +83,22 @@ public:
         CLDNN_ERROR_NOT_EQUAL(arg.id(), "roi_pooling padding filling value", padding_filling_value, "padding mode", 0.0f, "Unknown padding mode in roi_pooling.");
         CLDNN_ERROR_NOT_PROPER_FORMAT(arg.id(), "Input_layout.format", input_layout.format.value, "output_layout.format", output_layout.format);
 
-        auto group_sz = primitive->group_sz;
-        auto in_feat = input_layout.get_buffer_size().feature[0];
-        auto out_feat = output_layout.get_buffer_size().feature[0];
-
-        CLDNN_ERROR_LESS_THAN(arg.id(), "Group size", group_sz, "value", 0, "");
-        if (group_sz) {
-            CLDNN_ERROR_NOT_EQUAL(arg.id(), "input feture map", in_feat, "group_sz * group_sz * out_feat", group_sz * group_sz * out_feat, "");
-        }
-        CLDNN_ERROR_BOOL(arg.id(), "Batching", !hasSingleBatchOutput(arg.input()), "PS/ RoI Pooling doesn't support batching.");
-
         auto roi_params = get_default_params(arg);
         auto roi_optional_params = get_default_optional_params(arg.get_program());
 
         const auto& out = roi_params.output;
-
+
         const auto roi_bfyx = convert_data_tensor(rois_layout);
         const auto roi_bf = roi_bfyx.FlattenFeatureAndSpatials();
         roi_params.inputs.push_back(roi_bf);
         roi_params.output = { out.GetDims(), out.GetDType(), kernel_selector::data_layout::brfyx, out.GetViewOffset(), out.PhysicalSize(), out.GetPaddedVal() };   // TODO: it's a hack - cldnn doesn't support roi pooling with batching
 
-        roi_params.mode = cldnn_2_pool_type(primitive->mode);
-        roi_params.pooledWidth = primitive->pooled_width;
-        roi_params.pooledHeight = primitive->pooled_height;
-        roi_params.spatialScale = primitive->spatial_scale;
-        roi_params.groupSize = group_sz;
+        roi_params.mode = cldnn_2_pool_type(primitive->mode);
+        roi_params.position_sensitive = primitive->position_sensitive;
+        roi_params.pooledWidth = primitive->pooled_width;
+        roi_params.pooledHeight = primitive->pooled_height;
+        roi_params.spatialScale = primitive->spatial_scale;
+        roi_params.spatial_bins_x = primitive->spatial_bins_x;
+        roi_params.spatial_bins_y = primitive->spatial_bins_y;
 
         auto& kernel_selector = kernel_selector::roi_pooling_kernel_selector::Instance();
         auto best_kernels = kernel_selector.GetBestKernels(roi_params, roi_optional_params);
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/shuffle_channels_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/shuffle_channels_gpu.cpp
new file mode 100644
index 0000000..454810f
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/gpu/shuffle_channels_gpu.cpp
@@ -0,0 +1,75 @@
+/*
+// Copyright (c) 2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "shuffle_channels_inst.h"
+#include "primitive_gpu_base.h"
+#include "implementation_map.h"
+#include "kernel_selector_helper.h"
+#include "shuffle_channels/shuffle_channels_kernel_selector.h"
+#include "shuffle_channels/shuffle_channels_kernel_ref.h"
+#include "error_handler.h"
+
+using namespace cldnn;
+
+namespace cldnn { namespace gpu {
+
+struct shuffle_channels_gpu : typed_primitive_gpu_impl<shuffle_channels>
+{
+    using parent = typed_primitive_gpu_impl<shuffle_channels>;
+    using parent::parent;
+
+public:
+
+    static primitive_impl* create(const shuffle_channels_node& arg)
+    {
+        auto shuffle_channels_params = get_default_params<kernel_selector::shuffle_channels_params>(arg);
+        auto shuffle_channels_optional_params = get_default_optional_params<kernel_selector::shuffle_channels_optional_params>(arg.get_program());
+
+        const int32_t number_of_dims = 4;
+        int32_t axis = arg.get_primitive()->axis;
+
+        if (axis < 0)
+            axis += number_of_dims;
+
+        shuffle_channels_params.group = arg.get_primitive()->group;
+        shuffle_channels_params.axis = axis;
+
+        auto& kernel_selector = kernel_selector::shuffle_channels_kernel_selector::Instance();
+        auto best_kernels = kernel_selector.GetBestKernels(shuffle_channels_params, shuffle_channels_optional_params);
+
+        CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with these arguments");
+
+        auto shuffle_channels = new shuffle_channels_gpu(arg, best_kernels[0]);
+
+        return shuffle_channels;
+    }
+};
+
+namespace
+{
+    struct attach
+    {
+        attach()
+        {
+            auto val_fw = shuffle_channels_gpu::create;
+            implementation_map<shuffle_channels>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw);
+            implementation_map<shuffle_channels>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw);
+        }
+        ~attach() = default;
+    };
+    attach attach_impl;
+}
+} }
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/strided_slice_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/strided_slice_gpu.cpp
new file mode 100644
index 0000000..b093ca2
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/gpu/strided_slice_gpu.cpp
@@ -0,0 +1,99 @@
+/*
+// Copyright (c) 2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "strided_slice_inst.h"
+#include "primitive_gpu_base.h"
+#include "implementation_map.h"
+#include "kernel_selector_helper.h"
+#include "strided_slice/strided_slice_kernel_ref.h"
+#include "strided_slice/strided_slice_kernel_selector.h"
+#include "error_handler.h"
+#include "data_inst.h"
+
+using namespace cldnn;
+
+namespace cldnn
+{
+namespace gpu
+{
+
+struct strided_slice_gpu : typed_primitive_gpu_impl<strided_slice>
+{
+    using parent = typed_primitive_gpu_impl<strided_slice>;
+    using parent::parent;
+public:
+    static primitive_impl* create(const strided_slice_node& arg)
+    {
+        auto strided_slice_params = get_default_params<kernel_selector::strided_slice_params>(arg);
+        auto strided_slice_optional_params = get_default_optional_params<kernel_selector::strided_slice_optional_params>(arg.get_program());
+        const int32_t numberOfDims = 4;
+
+        auto complete_strided_slice_params = [&](std::vector<int32_t>& param) {
+            for (size_t i = param.size(); i < numberOfDims; ++i)
+                param.push_back(1);
+        };
+
+        auto completeStridedSliceMasks = [&](std::vector<uint8_t>& mask) {
+            for (size_t i = mask.size(); i < numberOfDims; ++i)
+                mask.push_back(0);
+        };
+
+        // Getting data from constant inputs. There are 3 args: Begin, End, Stride
+        for (size_t i = 1; i < arg.get_dependencies().size(); ++i) {
+            auto& input = arg.get_dependency(i).as<data>();
+            auto& mem = input.get_attached_memory();
+            int32_t* data = static_cast<int32_t*>(mem.lock());
+            std::vector<int32_t> vData = std::vector<int32_t>(data, data + input.get_output_layout().count());
+            complete_strided_slice_params(vData);
+            strided_slice_params.striding_params.push_back(vData);
+            mem.unlock();
+        }
+
+        strided_slice_params.end_mask = arg.get_primitive()->end_mask;
+        completeStridedSliceMasks(strided_slice_params.end_mask);
+        strided_slice_params.begin_mask = arg.get_primitive()->begin_mask;
+        completeStridedSliceMasks(strided_slice_params.begin_mask);
+        strided_slice_params.new_axis_mask = arg.get_primitive()->new_axis_mask;
+        strided_slice_params.shrink_axis_mask = arg.get_primitive()->shrink_axis_mask;
+        completeStridedSliceMasks(strided_slice_params.shrink_axis_mask);
+
+        auto& kernel_selector = kernel_selector::strided_slice_kernel_selector::Instance();
+        auto best_kernels = kernel_selector.GetBestKernels(strided_slice_params, strided_slice_optional_params);
+
+        CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with these arguments");
+
+        auto strided_slice = new strided_slice_gpu(arg, best_kernels[0]);
+
+        return strided_slice;
+    }
+};
+
+namespace
+{
+    struct attach
+    {
+        attach()
+        {
+            auto val_fw = strided_slice_gpu::create;
+            implementation_map<strided_slice>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw);
+            implementation_map<strided_slice>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw);
+        }
+        ~attach() = default;
+    };
+    attach attach_impl;
+}
+} //namespace gpu
+} //namespace cldnn
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/upsampling_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/upsampling_gpu.cpp
index 423c58e..aa37305 100644
--- a/inference-engine/thirdparty/clDNN/src/gpu/upsampling_gpu.cpp
+++ b/inference-engine/thirdparty/clDNN/src/gpu/upsampling_gpu.cpp
@@ -50,7 +50,7 @@ struct upsampling_gpu : typed_primitive_gpu_impl
         const auto& primitive = arg.get_primitive();
 
         if(primitive->with_activation)
-            convert_activation_func_params(primitive, us_params);
+            convert_activation_func_params(primitive, us_params.activation);
 
         us_params.scale = primitive->scale;
         us_params.num_filter = primitive->num_filter;
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/wait_for_events_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/wait_for_events_gpu.cpp
index c116e2a..30bbf7e 100644
--- a/inference-engine/thirdparty/clDNN/src/gpu/wait_for_events_gpu.cpp
+++ b/inference-engine/thirdparty/clDNN/src/gpu/wait_for_events_gpu.cpp
@@ -36,6 +36,11 @@ public:
         return events_waiter.run(events);
     }
 
+    bool validate(const primitive_inst&) const override
+    {
+        return true;
+    }
+
     static primitive_impl* create_data(const data_node& data)
     {
         return new wait_for_events_gpu(data);
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/add_required_reorders.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/add_required_reorders.cpp
new file mode 100644
index 0000000..d903f5c
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/add_required_reorders.cpp
@@ -0,0 +1,143 @@
+/*
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <algorithm>
+
+#include "pass_manager.h"
+#include "program_node.h"
+#include "mutable_data_inst.h"
+#include "concatenation_inst.h"
+#include "scale_inst.h"
+#include "tensor_type.h"
+
+/*
+This pass checks if data formats (layouts) of output/input in hidden layers match.
+If not, then the required reorder is added to the network.
+*/
+
+/*
+Add a reorder in between node and usr with reorder_layout as layout
+*/
+void add_required_reorders::add_reorder(program_impl& p, program_node* node, program_node* usr, layout reorder_layout)
+{
+
+    auto new_reorder = std::make_shared<reorder>(node->id() + "_reorder_" + usr->id(),
+                                                 node->id(),
+                                                 reorder_layout);
+    auto& new_reorder_node = p.get_or_create(new_reorder);
+
+    //ToDo: add a method to program_impl class which adds an intermediate node given a node and its user
+    auto it = std::find(usr->get_dependencies().begin(), usr->get_dependencies().end(), node);
+    if (it == usr->get_dependencies().end())
+    {
+        throw error("Inconsistency in topology description: user of a node is not present among its dependencies.", CLDNN_ERROR);
+    }
+    auto idx = it - usr->get_dependencies().begin();
+    if (idx < 0 || (size_t)idx >= usr->get_dependencies().size())
+    {
+        throw error("Internal Error: container index out of range exception.", CLDNN_ERROR);
+    }
+    p.add_intermediate(new_reorder_node, *usr, idx);
+}
+
+void add_required_reorders::run(program_impl& p)
+{
+    auto usr_itr = p.get_processing_order().begin();
+    while (usr_itr != p.get_processing_order().end())
+    {
+        auto& usr = *usr_itr++;
+        if (usr->get_dependencies().size() == 0)
+            continue;  // only nodes with dependencies
+        if (usr->is_type() || usr->is_type())
+            continue;
+        if (usr->type()->does_an_implementation_exist(p.get_engine(), *usr))
+            continue;
+
+        /*
+        First check if there are non data flow dependencies for the primitive;
+        if so, then choose the same output format as the data
+        */
+        bool correct_layout_selected = false;
+        for (auto& node : usr->get_dependencies())
+        {
+            if (!node->is_in_data_flow())
+            {
+                /*
+                ToDo: Here we should also handle the situation where primitive usr has data inputs in different formats
+                */
+                layout current_layout(usr->get_output_layout().data_type,
+                                      node->get_output_layout().format,
+                                      usr->get_output_layout().size);
+                usr->set_output_layout(current_layout);
+                if (usr->type()->does_possible_implementation_exist(p.get_engine(), *usr))
+                {
+                    correct_layout_selected = true;
+                    break;
+                }
+                else
+                {
+                    throw error("Internal Error: no layout format available for " + usr->id() +
+                                " compatible with " + node->id(), CLDNN_ERROR);
+                }
+            }
+        }
+
+        if (!correct_layout_selected) {
+            //This list of preferred layouts has been chosen arbitrarily, based on developers' experience
+            cldnn::format preferred_layout_formats[]{
+                cldnn::format::bfyx,
+                cldnn::format::yxfb,
+                cldnn::format::byxf,
+            };
+
+            for (auto new_layout_format : preferred_layout_formats)
+            {
+                layout current_layout(usr->get_output_layout().data_type,
+                                      new_layout_format,
+                                      usr->get_output_layout().size);
+                usr->set_output_layout(current_layout);
+                if (usr->type()->does_possible_implementation_exist(p.get_engine(), *usr))
+                {
+                    correct_layout_selected = true;
+                    break;
+                }
+            }
+
+            if (!correct_layout_selected) {
+                throw error("Internal Error: no implementation for " + usr->id() + " kernel which satisfies output format dependencies.", CLDNN_ERROR);
+            }
+        }
+
+        // layout is selected; now add the required reorders
+        auto dep_itr = usr->get_dependencies().begin();
+        while (dep_itr != usr->get_dependencies().end())
+        {
+            auto node = *dep_itr++;
+            //do not add a reorder if usr or node are reorders or do not belong to data_flow
+            if (!usr->is_type<reorder>() &&
+                !node->is_type<reorder>() &&
+                node->is_in_data_flow())
+            {
+                if ((usr->get_output_layout() != node->get_output_layout()))
+                {
+                    add_reorder(p, node, usr, usr->get_output_layout());
+                }
+            }
+        }
+    }
+}
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/add_reshape_to_primitives.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/add_reshape_to_primitives.cpp
new file mode 100644
index 0000000..e78cf86
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/add_reshape_to_primitives.cpp
@@ -0,0 +1,120 @@
+/*
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "pass_manager.h"
+#include "batch_norm_inst.h"
+#include "reshape_inst.h"
+
+using namespace cldnn;
+
+//Some primitives require a specific shape for their inputs/parameters.
+//We should check this and add a reshape to be compliant with it.
+//
+//Example: batch_norm primitive requires that mean/variance/scale/shift has shape {1, X, 1, 1}
+void add_reshape_to_primitives::run(program_impl& p)
+{
+    auto processing_order = p.get_processing_order();
+
+    for (auto& node : processing_order)
+    {
+        //if node is batch_norm and mean/var are given (i.e. use eltwise kernel to calculate batch_norm)
+        if (node->is_type<batch_norm>() &&
+            (!node->as<batch_norm>().calc_mean_var() && node->as<batch_norm>().use_global_stats()))
+        {
+            auto mean_layout = node->as<batch_norm>().mean().get_output_layout();
+            auto mean_size = mean_layout.size;
+            auto mean_x = mean_size.spatial[0];
+            auto mean_y = mean_size.spatial[1];
+            auto mean_b = mean_size.batch[0];
+
+            if (mean_x != 1
+                || mean_y != 1
+                || mean_b != 1)
+            {
+                auto mean_name = node->as<batch_norm>().mean().id();
+                std::vector<int32_t> mean_sizes = mean_size.sizes();
+                int32_t mean_max_size = *std::max_element(std::begin(mean_sizes), std::end(mean_sizes));
+
+                auto r_prim = std::make_shared<reshape>("reshape_" + mean_name + "_" + node->id(), mean_name, tensor(1, mean_max_size, 1, 1));
+                auto& r_prim_node = p.get_or_create(r_prim);
+
+                p.add_intermediate(r_prim_node, *node, 1, true);
+            }
+
+            auto variance_size = node->as<batch_norm>().variance().get_output_layout().size;
+            auto variance_x = variance_size.spatial[0];
+            auto variance_y = variance_size.spatial[1];
+            auto variance_b = variance_size.batch[0];
+
+            if (variance_x != 1
+                || variance_y != 1
+                || variance_b != 1)
+            {
+                auto variance_name = node->as<batch_norm>().variance().id();
+                std::vector<int32_t> variance_sizes = variance_size.sizes();
+                int32_t variance_max_size = *std::max_element(std::begin(variance_sizes), std::end(variance_sizes));
+
+                auto r_prim = std::make_shared<reshape>("reshape_" + variance_name + "_" + node->id(), variance_name, tensor(1, variance_max_size, 1, 1));
+                auto& r_prim_node = p.get_or_create(r_prim);
+
+                p.add_intermediate(r_prim_node, *node, 2, true);
+            }
+
+            if (node->as<batch_norm>().use_scale_shift())
+            {
+                auto scale_size = node->as<batch_norm>().scale().get_output_layout().size;
+                auto scale_x = scale_size.spatial[0];
+                auto scale_y = scale_size.spatial[1];
+                auto scale_b = scale_size.batch[0];
+
+                if (scale_x != 1
+                    || scale_y != 1
+                    || scale_b != 1)
+                {
+                    auto scale_name = node->as<batch_norm>().scale().id();
+                    std::vector<int32_t> scale_sizes = scale_size.sizes();
+                    int32_t scale_max_size = *std::max_element(std::begin(scale_sizes), std::end(scale_sizes));
+
+                    auto r_prim = std::make_shared<reshape>("reshape_" + scale_name + "_" + node->id(), scale_name, tensor(1, scale_max_size, 1, 1));
+                    auto& r_prim_node = p.get_or_create(r_prim);
+
+                    p.add_intermediate(r_prim_node, *node, 3, true);
+                }
+
+                auto shift_size = node->as<batch_norm>().shift().get_output_layout().size;
+                auto shift_x = shift_size.spatial[0];
+                auto shift_y = shift_size.spatial[1];
+                auto shift_b = shift_size.batch[0];
+
+                if (shift_x != 1
+                    || shift_y != 1
+                    || shift_b != 1)
+                {
+                    auto shift_name = node->as<batch_norm>().shift().id();
+                    std::vector<int32_t> shift_sizes = shift_size.sizes();
+                    int32_t shift_max_size = *std::max_element(std::begin(shift_sizes), std::end(shift_sizes));
+
+                    auto r_prim = std::make_shared<reshape>("reshape_" + shift_name + "_" + node->id(), shift_name, tensor(1, shift_max_size, 1, 1));
+                    auto& r_prim_node = p.get_or_create(r_prim);
+
+                    p.add_intermediate(r_prim_node, *node, 4, true);
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/calculate_prior_boxes.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/calculate_prior_boxes.cpp
new file mode 100644
index 0000000..c7e9079
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/calculate_prior_boxes.cpp
@@ -0,0 +1,47 @@
+/*
+// Copyright (c) 2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "pass_manager.h" +#include "prior_box_inst.h" +#include "program_node.h" +#include "program_impl.h" + +using namespace cldnn; + +void calculate_prior_boxes::run(program_impl& p) +{ + auto itr = p.get_processing_order().begin(); + while (itr != p.get_processing_order().end()) + { + auto& node = (*itr++); + if (!node->is_type()) + continue; + + auto& pb_node = node->as(); + + pb_node.calc_result(); + p.remove_connection(pb_node.input(), pb_node); + + auto& result = pb_node.get_result_buffer(); + result.add_ref(); // need to inc ref count since we will be assigning this memory as cldnn_memory in next line that is not ref_count_obj + auto cpp_mem = details::memory_c_to_cpp_converter::convert(api_cast(&result)); + + auto& data_node = p.get_or_create(std::make_shared("_cldnn_tmp_" + pb_node.id() + "_result", cpp_mem)); + p.replace(pb_node, data_node); + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/compile_graph.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/compile_graph.cpp new file mode 100644 index 0000000..db7c659 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/compile_graph.cpp @@ -0,0 +1,39 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "pass_manager.h" +#include "internal_primitive.h" +#include "data_inst.h" +#include "mutable_data_inst.h" +#include "program_node.h" +#include "engine_impl.h" + +using namespace cldnn; + +void compile_graph::run(program_impl& p) +{ + for (auto& node : p.get_processing_order()) + { + if (!node->is_type() && !node->is_type()) + { + node->get_output_layout(); + if (!node->is_type() && !(node->is_type() && node->get_dependencies().empty())) + node->selected_impl = node->type()->choose_impl(p.get_engine(), *node); + } + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/eltwise_remove_stride.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/eltwise_remove_stride.cpp new file mode 100644 index 0000000..eea35ee --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/eltwise_remove_stride.cpp @@ -0,0 +1,105 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "api/CPP/tensor.hpp" + +#include "pass_manager.h" + +#include "convolution_inst.h" +#include "eltwise_inst.h" + +#include + +using namespace cldnn; + +void eltwise_remove_stride::conv_stride_extend(program_impl& p, program_node& node, cldnn::tensor& tensor) +{ + // make sure we have only 1 user + if (node.get_users().size() > 1) + return; + + const auto conv = std::static_pointer_cast(node.get_primitive()); + auto weights_node_ptr = p.get_node_ptr(conv->weights[0]); + auto filter_size = weights_node_ptr->get_output_layout().size; + // make sure this is conv 1x1 + if (filter_size.spatial[0] == 1 && filter_size.spatial[1] == 1) + { + auto deps = node.get_dependencies(); + for (auto dep : deps) + { + if (dep->is_type()) + { + conv_stride_extend(p, *dep, tensor); + dep->recalc_output_layout(true); + break; + } + } + auto c = const_cast(&(*conv)); + c->with_output_size = false; + node.recalc_output_layout(true); + } + else + { + bool can_shrink_x = (filter_size.spatial[0] - (conv->stride.spatial[0] + (tensor.spatial[0] - 1))) >= 0; + bool can_shrink_y = (filter_size.spatial[1] - (conv->stride.spatial[1] + (tensor.spatial[1] - 1))) >= 0; + if (can_shrink_x && can_shrink_y) + { + auto c = const_cast(&(*conv)); + c->stride.spatial[0] += tensor.spatial[0] - 1; + c->stride.spatial[1] += tensor.spatial[1] - 1; + c->with_output_size = false; + node.recalc_output_layout(true); + tensor.spatial[0] = 1; + tensor.spatial[1] = 1; + } + } +} + +void eltwise_remove_stride::run(program_impl& p) +{ + for (auto& node : p.get_processing_order()) + { + if (node->is_type()) + { + // TODO: make fp16 work + if (node->get_output_layout().data_type != data_types::i8 && node->get_output_layout().data_type != data_types::f32) + { + if (node->get_output_layout().data_type != data_types::f16 || 
node->get_output_layout().format != format::yxfb)
+                {
+                    continue;
+                }
+            }
+
+            const auto eltw = std::static_pointer_cast<const eltwise>(node->get_primitive());
+            if (!eltw->stride.empty())
+            {
+                auto deps = node->get_dependencies();
+                for (size_t i = 0; i < deps.size(); i++)
+                {
+                    auto dep = deps[i];
+                    // TODO: add other primitives beside convolution here
+                    if (dep->is_type<convolution>())
+                    {
+                        auto e = const_cast<eltwise*>(&(*eltw));
+                        conv_stride_extend(p, *dep, e->stride[i]);
+                    }
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/eltwise_shrinking.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/eltwise_shrinking.cpp
new file mode 100644
index 0000000..b3d0e00
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/eltwise_shrinking.cpp
@@ -0,0 +1,132 @@
+/*
+// Copyright (c) 2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "pass_manager.h"
+#include "eltwise_inst.h"
+
+using namespace cldnn;
+
+void eltwise_shrinking::run(program_impl& p)
+{
+    std::vector<program_node*> convs_to_shrink;
+
+    for (auto& node : p.get_processing_order())
+    {
+        if (node->is_type<eltwise>())
+        {
+            // TODO: make fp16 work
+            if (node->get_output_layout().data_type != data_types::i8 && node->get_output_layout().data_type != data_types::f32)
+            {
+                if (node->get_output_layout().data_type != data_types::f16 || node->get_output_layout().format != format::yxfb)
+                {
+                    continue;
+                }
+            }
+
+            const auto eltw = std::static_pointer_cast<const eltwise>(node->get_primitive());
+            // TODO: support cases which already have stride!
+            if (eltw->stride.empty())
+            {
+                bool can_shrink = true;
+                int32_t stride_x = 0;
+                int32_t stride_y = 0;
+                convs_to_shrink.clear();
+                auto users = node->get_users();
+                for (auto user : users)
+                {
+                    // currently we can shrink only if users are convolutions
+                    if (!user->is_type<convolution>())
+                    {
+                        can_shrink = false;
+                        break;
+                    }
+
+                    if (user->get_output_layout().format == format::b_fs_yx_fsv4)
+                    {
+                        // Workaround for VIS-1079
+                        // Currently, we don't have the "conv + eltwise" optimization for
+                        // IMAD, and this blocks us from running the whole ResNet-50.i8 topology on IMAD.
+                        // As a workaround, this optimization is temporarily switched off for
+                        // "format == b_fs_yx_fsv4" (the IMAD-specific data layout).
+                        // TODO: Please remove this code when VIS-1079 is done.
+                        can_shrink = false;
+                        break;
+                    }
+
+                    const auto conv = std::static_pointer_cast<const convolution>(user->get_primitive());
+                    if (conv->weights.size() != 1)
+                    {
+                        can_shrink = false;
+                        break;
+                    }
+
+                    auto weights_node_ptr = p.get_node_ptr(conv->weights[0]);
+                    auto filter_size = weights_node_ptr->get_output_layout().size;
+                    // make sure this is conv 1x1
+                    if (filter_size.spatial[0] != 1 || filter_size.spatial[1] != 1)
+                    {
+                        can_shrink = false;
+                        break;
+                    }
+
+                    // make sure the convolution can accept the shrunk input by modifying its stride
+                    if (conv->stride.spatial[0] > 1 || conv->stride.spatial[1] > 1)
+                    {
+                        if (stride_x == 0)
+                            stride_x = conv->stride.spatial[0];
+                        if (stride_y == 0)
+                            stride_y = conv->stride.spatial[1];
+
+                        // make sure stride across all eltwise's convolution users is the same
+                        if (conv->stride.spatial[0] != stride_x || conv->stride.spatial[1] != stride_y)
+                        {
+                            can_shrink = false;
+                            break;
+                        }
+                        convs_to_shrink.push_back(user);
+                    }
+                    else
+                    {
+                        can_shrink = false;
+                        break;
+                    }
+                }
+                if (can_shrink)
+                {
+                    // add stride for each of eltwise's inputs to produce the shrunk output
+                    auto e = const_cast<eltwise*>(&(*eltw));
+                    for (size_t user = 0; user < node->get_users().size(); user++)
+                    {
+                        e->stride.push_back({ 0,0,stride_x,stride_y });
+                    }
+                    node->recalc_output_layout();
+
+                    // change stride on every convolution
+                    for (size_t i = 0; i < convs_to_shrink.size(); i++)
+                    {
+                        const auto conv = std::static_pointer_cast<const convolution>(convs_to_shrink[i]->get_primitive());
+                        auto c = const_cast<convolution*>(&(*conv));
+                        c->stride.spatial[0] = 1;
+                        c->stride.spatial[1] = 1;
+                        // TODO: remove forcing "false" with_output_size if not needed
+                        c->with_output_size = false;
+                        convs_to_shrink[i]->recalc_output_layout();
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/graph_initializations.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/graph_initializations.cpp
new file mode 100644
index 0000000..64e3853
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/graph_initializations.cpp
@@ -0,0 +1,641 @@
+/*
+// Copyright (c) 2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "pass_manager.h"
+#include "program_node.h"
+
+#include "split_inst.h"
+#include "convolution_inst.h"
+#include "crop_inst.h"
+#include "lstm_inst.h"
+#include "reshape_inst.h"
+#include "upsampling_inst.h"
+
+#include <iomanip>
+
+using namespace cldnn;
+
+namespace cldnn
+{
+    std::string get_id_string(size_t i) {
+        std::stringstream ss;
+        ss << std::setw(5) << std::setfill('0') << i;
+        return ss.str();
+    }
+
+    // ToDo: rewrite the methods in this class in the same style (maybe: handle_<primitive>()?);
+    // is it possible to avoid iterating over all nodes several times?
+    // do we have any repeated code here, can we make it more readable?
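+
+    // Illustrative sketch of what replace_nodes() below does for split (sizes
+    // are assumed for the example, not taken from a real topology): a split
+    // over the feature axis of an input with 10 features and
+    // output_offsets = {0, 4, 7} becomes three crops whose reference sizes are
+    //     crop0: offsets[1] - offsets[0] = 4 features
+    //     crop1: offsets[2] - offsets[1] = 3 features
+    //     crop2: input_size - offsets[2] = 3 features
+    // so the generated crops always cover the split input exactly.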
+    void graph_initializations::replace_nodes(program_impl& p)
+    {
+        auto itr = p.nodes_map.begin();
+        while (itr != p.nodes_map.end())
+        {
+            auto node_itr = itr++;
+            auto& node = (*node_itr).second;
+
+            if (node->is_type<split>())
+            {
+                //check if split is not used by any primitive, as it will be optimized
+                if (node->get_users().size() != 0)
+                    throw std::logic_error("Split layer cannot be used directly! Please use split output \"" + node->id() + ":\"!");
+
+                //get_output size and validate split primitive inputs
+                auto output_layout = node->get_output_layout();
+                auto output_layout_size = output_layout.size;
+
+                auto split_prim = node->as<split>().typed_desc();
+                primitive_id input_id = split_prim->input[0];
+                auto split_num = split_prim->output_offsets.size();
+
+                //create crop for each split output provided
+                for (decltype(split_num) i = 0; i < split_num; i++)
+                {
+                    primitive_id output_id = node->id() + ":" + split_prim->output_ids[i];
+
+                    auto node_ptr = p.nodes_map.find(output_id)->second;
+
+                    //calculate crop reference input size
+                    tensor reference_input_size;
+
+                    // For all the split offsets before the last split offset, the size can be calculated
+                    // size_of_offset[n] = offset[n + 1] - offset[n];
+                    if (i != (split_num - 1))
+                    {
+                        reference_input_size += split_prim->output_offsets[i + 1] - split_prim->output_offsets[i];
+                    }
+                    // For the last split, i.e. size[split_num - 1] = split_input.size - offsets[n];
+                    else
+                    {
+                        reference_input_size += output_layout_size - split_prim->output_offsets[i];
+                    }
+
+                    // For all the other dimensions, copy from the split_input
+                    for (int dimension = 0; dimension < CLDNN_TENSOR_DIM_MAX; dimension++)
+                    {
+                        reference_input_size.raw[dimension]
+                            = (reference_input_size.raw[dimension] == 0) ? output_layout_size.raw[dimension] : reference_input_size.raw[dimension];
+                    }
+
+                    //update crop primitive
+                    node_ptr->set_output_padding(output_layout.data_padding);
+                    auto crop_prim = node_ptr->as<crop>().typed_desc();
+                    crop_prim->reference_input = reference_input_size;
+                }
+
+                //remove input->split connection and remove original split node
+                p.remove_connection(node->get_dependency(0), *node);
+                p.optimized_out.push_back(node->id());
+                p.nodes_map.erase(node->id());
+                continue;
+            }
+
+            //find upsampling primitives with bilinear filtering and create deconvolution with proper weights instead
+            if (node->is_type<upsampling>())
+            {
+                auto upsampling_prim = node->as<upsampling>().typed_desc();
+
+                if (upsampling_prim->sample_type != upsampling_sample_type::bilinear)
+                    continue;
+
+                //check if num_filter is not 0 (required for bilinear upsampling)
+                if (upsampling_prim->num_filter == 0)
+                    throw std::logic_error("num_filter in upsampling cannot be 0 in bilinear filtering mode in \"" + node->id() + "\"!");
+
+                primitive_id upsampling_id = node->id();
+                auto& input_node = node->get_dependency(0);
+
+                primitive_id input_id = upsampling_prim->input[0];
+                auto num_filter = upsampling_prim->num_filter;
+
+                //setting deconvolution parameters based on upsampling input
+                auto scale = static_cast<int32_t>(upsampling_prim->scale);
+                tensor stride(1, 1, scale, scale);
+                auto offset = static_cast<int32_t>(std::ceil((scale - 1) / 2.f));
+                tensor input_offset(0, 0, -offset, -offset);
+
+                //setting weights for deconvolution
+                auto kernel_size = static_cast<int32_t>((2 * scale) - (scale % 2));
+                layout weights_layout(data_types::f32, format::bfyx, tensor(1, 1, kernel_size, kernel_size));
+
+                std::vector<primitive_id> weights_vec;
+                for (uint32_t weights_idx = 0; weights_idx < num_filter; weights_idx++)
+                {
+                    memory_impl::ptr data_to_allocate = p.get_engine().allocate_memory(weights_layout);
+                    mem_lock<float> dst{ data_to_allocate };
+                    float *dst_data = dst.data();
+                    //initialize with bilinear weights data
+                    auto f = static_cast<int32_t>(std::ceil(kernel_size / 2.0f));
+                    float c = (2 * f - 1 - f % 2) / (2.f * f);
+                    float x = 0.f;
+                    float y = 0.f;
+                    for (size_t i = 0; i < weights_layout.count(); ++i) {
+                        x = static_cast<float>(i % kernel_size);
+                        y = static_cast<float>((i / kernel_size) % kernel_size);
+                        dst_data[i] = (1 - std::abs(x / f - c)) * (1 - std::abs(y / f - c));
+                    }
+
+                    //create weights primitive, with dummy memory which will be replaced in a further step
+                    primitive_id weights_id = upsampling_id + "_deconvolution_weights" + std::to_string(weights_idx);
+                    layout dummy_layout(data_types::f32, format::bfyx, tensor(1, 1, 1, 1));
+                    float zero = 0.f;
+                    auto weights_prim = std::make_shared<data>(weights_id, memory::attach(dummy_layout, &zero, 1));
+                    p.get_or_create(weights_prim);
+
+                    weights_vec.push_back(weights_id);
+
+                    auto weights_node_ptr = p.nodes_map.find(weights_id)->second;
+
+                    //attach weights buffer
+                    auto& data_node = weights_node_ptr->as<data>();
+                    data_node.attach_memory(*data_to_allocate, false);
+                }
+
+                //remove upsampling node, rename it and move to the optimized list
+                p.remove_connection(node->get_dependency(0), *node);
+                auto rename_id = upsampling_id + "_tmp";
+                p.rename(*node, rename_id);
+
+                //create deconvolution primitive
+                auto deconv_prim = std::make_shared<deconvolution>(upsampling_id, input_id, weights_vec, stride, input_offset);
+                p.get_or_create(deconv_prim);
+
+                auto deconv_node_ptr = p.nodes_map.find(upsampling_id)->second;
+
+                auto upsampling_node_ptr = p.nodes_map.find(rename_id)->second;
+                p.replace_all_usages(*upsampling_node_ptr, *deconv_node_ptr);
+                p.optimized_out.push_back(rename_id);
+                p.nodes_map.erase(rename_id);
+
+                //add connections input->deconvolution and weights->deconvolution
+                p.add_connection(input_node, *deconv_node_ptr);
+
+                for (uint32_t weights_idx = 0; weights_idx < num_filter; weights_idx++)
+                {
+                    auto weights_node_ptr = p.nodes_map.find(weights_vec[weights_idx])->second;
+                    p.add_connection(*weights_node_ptr, *deconv_node_ptr);
+                }
+                continue;
+            }
+
+            //find deconvolution primitives with stride 1 and change them to convolution with transposed weights
+            if (node->is_type<deconvolution>())
+            {
+                if (!p.get_options().get()->enabled())
+                    continue;
+
+                auto deconv_prim = node->as<deconvolution>().typed_desc();
+
+                //limit optimization to stride = 1
+                if (deconv_prim->stride.spatial[0] != 1 || deconv_prim->stride.spatial[1] != 1 || deconv_prim->gradient())
+                    continue;
+
+                primitive_id deconv_id = node->id();
+                auto& input_node = node->get_dependency(0);
+
+                primitive_id input_id = deconv_prim->input[0];
+
+                //setting convolution parameters based on deconvolution params
+                auto stride = deconv_prim->stride;
+                auto weights = deconv_prim->weights;
+                std::vector<primitive_id> weights_vec;
+                for (auto& weights_id : weights)
+                    weights_vec.push_back(weights_id);
+                auto biases = deconv_prim->bias;
+                std::vector<primitive_id> bias_vec;
+                for (auto& bias_id : biases)
+                    bias_vec.push_back(bias_id);
+                auto input_offset = deconv_prim->input_offset;
+                auto with_activation = deconv_prim->with_activation;
+                auto activation_negative_slope = deconv_prim->activation_negative_slope;
+                auto output_padding = deconv_prim->output_padding;
+
+                //remove deconvolution node and its connections to weights and biases, rename it and move to the optimized list
+                tensor filter_size = { 1, 1, 1, 1 };
+                p.remove_connection(node->get_dependency(0), *node);
+                for (auto& weights_id : weights_vec)
+                {
+                    auto weights_node_ptr = p.nodes_map.find(weights_id)->second;
+                    p.remove_connection(*weights_node_ptr, *node);
+                    //get filter spatial sizes for input offset adjustment; perform this only once as all filters should have the same size
+                    if (weights_id == weights_vec[0])
+                        filter_size = weights_node_ptr->get_output_layout().size;
+                }
+
+                input_offset.spatial[0] = std::abs(input_offset.spatial[0]) - (filter_size.spatial[0] - 1);
+                input_offset.spatial[1] = std::abs(input_offset.spatial[1]) - (filter_size.spatial[1] - 1);
+
+                if (!bias_vec.empty())
+                {
+                    for (auto& bias_id : bias_vec)
+                    {
+                        auto bias_id_node_ptr = p.nodes_map.find(bias_id)->second;
+                        p.remove_connection(*bias_id_node_ptr, *node);
+                    }
+                }
+                auto rename_id = deconv_id + "_tmp";
+                p.rename(*node, rename_id);
+
+                //create convolution primitive
+                if (biases.size() != 0)
+                {
+                    auto conv_prim = std::make_shared<convolution>(deconv_id, input_id, weights_vec, bias_vec,
+                        stride, input_offset, tensor{ 1, 1, 1, 1 }, with_activation, activation_negative_slope, output_padding);
+                    p.get_or_create(conv_prim);
+                }
+                else
+                {
+                    auto conv_prim = std::make_shared<convolution>(deconv_id, input_id, weights_vec,
+                        stride, input_offset, tensor{ 1, 1, 1, 1 }, with_activation, activation_negative_slope, output_padding);
+                    p.get_or_create(conv_prim);
+                }
+
+                auto conv_node_ptr = p.nodes_map.find(deconv_id)->second;
+                auto conv_node = &conv_node_ptr->as<convolution>();
+                conv_node->set_transposed(true);
+
+                //add connections input->convolution, weights->convolution and bias->convolution
+                p.add_connection(input_node, *conv_node_ptr);
+
+                for (auto& weights_id : weights_vec)
+                {
+                    auto weights_node_ptr = p.nodes_map.find(weights_id)->second;
+                    p.add_connection(*weights_node_ptr, *conv_node_ptr);
+                }
+
+                if (!bias_vec.empty())
+                {
+                    for (auto& bias_id : bias_vec)
+                    {
+                        auto bias_id_node_ptr = p.nodes_map.find(bias_id)->second;
+                        p.add_connection(*bias_id_node_ptr, *conv_node_ptr);
+                    }
+                }
+
+                auto deconv_node_ptr = p.nodes_map.find(rename_id)->second;
+                p.replace_all_usages(*deconv_node_ptr, *conv_node_ptr);
+                p.optimized_out.push_back(rename_id);
+                p.nodes_map.erase(rename_id);
+
+                continue;
+            }
+        }
+    }
+
+    void graph_initializations::handle_detection_output(program_impl& p)
+    {
+        auto itr = p.nodes_map.begin(); //note we need to use iterators since the currently processed element can be removed
+        while (itr != p.nodes_map.end())
+        {
+            auto node_itr = itr++;
+            auto& node = *(*node_itr).second;
+            // Create the second part of the detection output primitive and replace node names - do it only once
+            if ((p.get_options().get()->enabled()) &&
+                (node.is_type<detection_output>()) &&
+                (node.id().find("_pre") == std::string::npos))  //ToDo: this will fail if the user names the primitive with a _pre suffix (like do_pre);
+                                                                // we need to use node mark() or some other idea to prevent it
+            {
+                // rename detection output
+                const primitive_id detect_out_node_name = node.id();
+                const primitive_id new_primitive_id = detect_out_node_name + "_pre";
+                p.rename(node, new_primitive_id);
+
+                auto detect_out_prim = node.as<detection_output>().typed_desc();
+                // Create new primitive, the "keep top k" part of detection output
+                // ToDo: add default parameters to the detection_output_sort class constructor to get rid of this initialization from here
+                auto detect_out_sort_prim = std::make_shared<detection_output_sort>(
+                    detect_out_node_name,
+                    node.id(),
+                    // not important params here - they will be set during the "primitive_impl* create" func in "detection_output_sort_gpu"
+                    0,      // num_images
+                    0,      // num_classes
+                    0,      // keep_top_k
+                    false,  // share_location
+                    0,      // top_k
+                    -1,     // background_label_id
+                    detect_out_prim->output_padding);
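+
+                // For illustration (the primitive names here are hypothetical): a
+                // detection output the user declared as "det_out" is renamed above to
+                // "det_out_pre" (the compute stage), while the detection_output_sort
+                // node created here takes over the original id "det_out" (the
+                // "keep top k" stage), so existing references to "det_out" now
+                // resolve to the sorted result.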
detect_out_prim->output_padding); + + p.get_or_create(detect_out_sort_prim); + + auto sort_node = p.nodes_map.find(detect_out_node_name)->second; + + // Add connection to second part of detection output + if (node.get_users().size()) + { + p.add_intermediate(*sort_node, *(node.get_users().front()), 0, false); + } + else + { + p.add_connection(node, *sort_node); + } + } + } + } + + void graph_initializations::handle_lstm(program_impl& p) + { + bool has_lstm_children; + auto itr = p.nodes_map.begin(); //note we need to use iterators since currently processed element can be removed + while (itr != p.nodes_map.end()) + { + auto node_itr = itr++; + auto& node = (*node_itr).second; + has_lstm_children = false; + // replace lstm node with lstm_gemm and lstm_elt nodes + if (node->is_type()) { + bool initial_hidden_term = node->as().initial_hidden_term(); + bool initial_cell_term = node->as().initial_cell_term(); + bool bias_term = node->as().bias_term(); + auto lstm_prim = node->as().typed_desc(); + primitive_id weights_id = lstm_prim->weights; + primitive_id recurrent_id = lstm_prim->recurrent; + primitive_id bias_id = bias_term ? lstm_prim->bias : ""; + primitive_id initial_hidden_id = initial_hidden_term ? lstm_prim->initial_hidden : ""; + primitive_id initial_cell_id = initial_cell_term ? lstm_prim->initial_cell : ""; + + //removing connection with weights to get proper dependency order for next operations + p.remove_connection(*p.nodes_map.at(weights_id), *node); + p.remove_connection(*p.nodes_map.at(recurrent_id), *node); + if (bias_term) + p.remove_connection(*p.nodes_map.at(bias_id), *node); + if (initial_hidden_term) + p.remove_connection(*p.nodes_map.at(initial_hidden_id), *node); + if (initial_cell_term) + p.remove_connection(*p.nodes_map.at(initial_cell_id), *node); + + //calculating sizes + auto input_size = node->get_dependency(0).get_output_layout().size; + auto recurrent_size = p.nodes_map.at(recurrent_id)->get_output_layout().size; + + // hidden tensor size = [batch, seq, hidden_size, direction] + // the output of the element wise operation is cropped and used in the next time step + // sequence_len = 1 and direction = 1. The backward pass is separated from the forward pass + auto hidden_size = tensor(input_size.batch[0], 1, recurrent_size.spatial[0], 1); + + size_t directions = recurrent_size.feature[0]; + size_t input_directions = input_size.spatial[1]; + size_t num_input_dependencies = node->get_dependencies().size(); + size_t input_vector_size = node->as().sequence_len(); + size_t sequence_len = input_vector_size; + + // Calculate the input sequence length for the lstm node + // Case 1: If the input comes in as a concatenated input i.e. the + // input is not divided into sequence elements + if (input_vector_size == 1 && num_input_dependencies == 1) + { + // Either the input actually has 1 sequence element + auto& input = node->get_dependency(0); + auto input_layout = input.get_output_layout(); + tensor input_tensor = input_layout.size; + + // Get the sequence length from the input to LSTM + sequence_len = input_layout.size.feature[0]; + + // If the input's feature/sequence length field is > 1, i.e. 
If + // the sequence elements are concatenated into one single input + // then it has to be split into individual sequence elements + if (sequence_len > 1) + { + for (size_t sequence_element = 0; sequence_element < sequence_len; sequence_element++) + { + primitive_id crop_id = input.id() + ":crop:" + get_id_string(sequence_element); + tensor crop_tensor{ input_tensor.batch[0], 1, input_tensor.spatial[0], input_tensor.spatial[1] }; + tensor offset_tensor{ 0, static_cast(sequence_element), 0, 0 }; + auto input_crop = std::make_shared(crop_id, input.id(), crop_tensor, offset_tensor); + auto& input_crop_node = p.get_or_create(input_crop); + + // Add the crop nodes as user for input + p.add_connection(node->get_dependency(0), input_crop_node); + + // Connect crop with lstm + p.add_connection(input_crop_node, *node); + } + + // We have the sequence elements (cropped inputs) as input to LSTM. + // The original input is no longer a dependency to LSTM. + // Remove the input node as a dependency to LSTM + p.remove_connection(node->get_dependency(0), *node); + + // Update the total no. of input dependecies + num_input_dependencies = node->get_dependencies().size(); + } + } + + //if the sequence has a single element but it has multiple inputs then + //the parent of this lstm is an lstm node. If this is a bidirectional lstm + //then the sequence length is the number of dependencies divided by 2. + else if (input_vector_size == 1 && num_input_dependencies > 1) + { + sequence_len = (directions == 1) ? num_input_dependencies : num_input_dependencies / 2; + } + + //check if this lstm node has an lstm child + for (auto& user : node->get_users()) + { + if (user->is_type()) + { + has_lstm_children = true; + } + } + + bool emit_last_cell = lstm_prim->output_selection == cldnn_lstm_output_hidden_cell || + lstm_prim->output_selection == cldnn_lstm_output_sequence_cell; + bool emit_sequence = lstm_prim->output_selection == cldnn_lstm_output_sequence_cell || + lstm_prim->output_selection == cldnn_lstm_output_sequence; + + std::vector cell_list(directions * sequence_len); + std::vector hidden_list(directions * sequence_len); + std::map> output_map; + auto dependencies = node->get_dependencies(); + + //lstm expanding + for (size_t dir = 0; dir < directions; ++dir) { + auto hidden_id = initial_hidden_id; + auto cell_id = initial_cell_id; + for (size_t i = 0; i < sequence_len; ++i) { + size_t idx = i + dir * sequence_len; + primitive_id lstm_gemm_id = node->id() + ":lstm_gemm" + get_id_string(idx); + primitive_id lstm_elt_id = node->id() + ":lstm_elt" + get_id_string(idx); + primitive_id crop_id = node->id() + ":crop" + get_id_string(idx); + + size_t input_idx = i; + //for bidirectional lstms, if first LSTM layer then reverse input + //for subsequent stacked layers the input is strided on the dir dimension + if (directions > 0) { + if (num_input_dependencies > sequence_len) { // stacked layer + input_idx = dir * sequence_len + i; + } + else + { + if ((input_directions < 2) && dir > 0) { // first layer + input_idx = sequence_len - i - 1; + } + } + } + + //primitive_id lstm_gemm_input_id = node->get_dependency(input_idx).get_primitive()->id; + //the line below requires an attention: get_org_primitive_id() might not be an actual id of a node (see rename method) + //ToDO: ensure that get_org_primitive_id() is suitable here + primitive_id lstm_gemm_input_id = node->get_dependency(input_idx).get_org_primitive_id(); + + auto lstm_gemm_node = std::make_shared(lstm_gemm_id, lstm_gemm_input_id, weights_id, recurrent_id, 
bias_id, hidden_id, (uint32_t)dir); + auto &n1 = p.get_or_create(lstm_gemm_node); + + auto lstm_elt_node = std::make_shared(lstm_elt_id, lstm_gemm_id, cell_id, lstm_prim->clip, lstm_prim->input_forget, + lstm_prim->activations, lstm_prim->activation_params, lstm_prim->offset_order, (uint32_t)dir); + auto &n2 = p.get_or_create(lstm_elt_node); + //adding lstm_elt as user + p.add_connection(n1, n2); + //adding dependecy to lstm_gemm node + //input + p.add_connection(node->get_dependency(input_idx), n1); + //adding weights and initial values to lstm_gemm + p.add_connection(*p.nodes_map.at(weights_id), n1); + p.add_connection(*p.nodes_map.at(recurrent_id), n1); + if (bias_term) + p.add_connection(*p.nodes_map.at(bias_id), n1); + + //adding cell and hiddens as dependencies + if (i > 0) + { + p.add_connection(*cell_list[size_t(i - 1) * directions + dir], n2); + p.add_connection(*hidden_list[size_t(i - 1) * directions + dir], n1); + } + //if initial values are present + else + { + if (initial_hidden_term) + p.add_connection(*p.nodes_map.at(hidden_id), n1); + if (initial_cell_term) + p.add_connection(*p.nodes_map.at(cell_id), n2); + } + + //lstm_hidden + { + hidden_id = crop_id + ":hidden"; + auto crop_hidden = std::make_shared(hidden_id, lstm_elt_id, hidden_size, tensor{ 0,0,0,0 }); + auto &n3 = p.get_or_create(crop_hidden); + //adding eltwise as dependency to hidden + p.add_connection(n2, n3); + + //if parent is lstm adding hiddens as dependency + if (has_lstm_children) + { + for (auto& user : node->get_users()) + { + p.add_connection(n3, *user); + } + } + hidden_list[i * directions + dir] = &n3; + if (i == sequence_len - 1 || emit_sequence) + { + output_map[i * directions + dir] = { hidden_id, &n3 }; + } + } + + //lstm_cell + if (i < sequence_len - 1 || emit_last_cell) + { + cell_id = crop_id + ":cell"; + auto crop_cell = std::make_shared(cell_id, lstm_elt_id, hidden_size, tensor{ 0,1,0,0 }); + auto &n4 = p.get_or_create(crop_cell); + p.add_connection(n2, n4); + cell_list[i * directions + dir] = &n4; + if (i == sequence_len - 1) + { + output_map[sequence_len * directions + dir] = { cell_id, &n4 }; + } + } + } + } + //if there is no next lstm, concatenation is created + if (!has_lstm_children) + { + std::vector output_ids_offsets; + for (auto& e : output_map) + { + output_ids_offsets.push_back(e.second.first); + } + primitive_id original_id = node->id(); + primitive_id concatenation_id = original_id + ":concat"; + auto concatenation_primitive = std::make_shared(concatenation_id, output_ids_offsets, concatenation::along_f); + auto &concatenation_node = p.get_or_create(concatenation_primitive); + for (auto& e : output_map) + { + p.add_connection(*e.second.second, concatenation_node); + } + if (directions == 2) { + // bidirectional support requires concatenations along the direction and sequence axis + // instead we can concatenate along the sequence axis and reshape the tensor to the account + // for the direction + size_t concatenate_len = emit_sequence ? 
sequence_len : 1;
+                if (emit_last_cell) concatenate_len++;
+
+                tensor output_size{ input_size.batch[0], static_cast<int32_t>(concatenate_len), hidden_size.spatial[0], (int32_t)directions };
+                primitive_id reshape_id = original_id + ":reshape";
+                auto reshape_primitive = std::make_shared<reshape>(reshape_id, concatenation_id, output_size);
+                auto &reshape_node = p.get_or_create(reshape_primitive);
+                p.add_connection(concatenation_node, reshape_node);
+                p.replace_all_usages(*node, reshape_node);
+            }
+            else
+            {
+                p.replace_all_usages(*node, concatenation_node);
+            }
+        }
+        // removing the expanded node
+        p.remove_all_connections(*node);
+        p.nodes_map.erase(node->id());
+        continue;
+    }
+}
+
+}
+
+void graph_initializations::set_outputs(program_impl& p)
+{
+    auto outputs_option = p.get_options().get<build_option_type::outputs>();
+    if (!outputs_option->outputs.empty())
+    {
+        for (auto const& output : outputs_option->outputs)
+        {
+            auto o_node = p.nodes_map.at(output);
+            o_node->set_output(true);
+            p.outputs.push_back(o_node.get());
+        }
+    }
+    else
+    {
+        for (auto& node : p.nodes_map)
+            if (node.second->is_endpoint())
+            {
+                node.second->set_output(true);
+                p.outputs.push_back(node.second.get());
+            }
+    }
+}
+
+void graph_initializations::run(program_impl& p)
+{
+    replace_nodes(p);
+    handle_detection_output(p);
+    handle_lstm(p);
+    set_outputs(p);
+    p.get_processing_order().calc_processing_order(p);
+}
+}
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/handle_input_padding.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/handle_input_padding.cpp
new file mode 100644
index 0000000..d11dce2
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/handle_input_padding.cpp
@@ -0,0 +1,94 @@
+/*
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "pass_manager.h"
+#include "border_inst.h"
+#include "convolution_inst.h"
+#include "error_handler.h"
+
+using namespace cldnn;
+
+// Some primitives support padding for input.
+// There are 2 types of padding: symmetric and asymmetric.
+// Symmetric padding can be done using the input_offset parameter of the primitives.
+// Asymmetric padding can be done by adding a border primitive before them. It is a safe way that avoids modifying the optimized kernels.
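For illustration, here is a minimal standalone sketch of the normalization rule this pass applies (the struct and function names below are invented for this example and are not part of the clDNN API; the real pass mutates the convolution primitive in place, as the code that follows shows):

```cpp
// Sketch: symmetric padding folds into the convolution's input_offset,
// asymmetric padding is materialized as an explicit border primitive.
#include <cstdio>

struct Pad { int x, y; };

struct NormalizedPadding {
    bool needs_border;   // true -> insert an explicit border primitive
    Pad above, below;    // border sizes (asymmetric case)
    Pad input_offset;    // adjusted convolution offset (symmetric case)
};

NormalizedPadding normalize_conv_padding(Pad above, Pad below, Pad input_offset) {
    NormalizedPadding r{};
    if (above.x != below.x || above.y != below.y) {
        // asymmetric: keep the convolution unpadded, pad with a border node
        r.needs_border = true;
        r.above = above;
        r.below = below;
        r.input_offset = input_offset;
    } else {
        // symmetric: fold the padding into input_offset, mirroring
        // 'input_offset = padding_above.negate().add(input_offset)' in the pass
        r.input_offset = { input_offset.x - above.x, input_offset.y - above.y };
    }
    return r;
}

int main() {
    auto r = normalize_conv_padding({ 1, 1 }, { 2, 1 }, { 0, 0 });
    std::printf("border needed: %d\n", r.needs_border);  // prints 1 (asymmetric in x)
}
```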
+void handle_input_padding::run(program_impl& p)
+{
+    auto processing_order = p.get_processing_order();
+
+    for (auto& node : processing_order)
+    {
+        if (node->is_type<convolution>()
+            && (node->as<convolution>().get_primitive()->padding_above.spatial[0] != 0 || node->as<convolution>().get_primitive()->padding_above.spatial[1] != 0
+                || node->as<convolution>().get_primitive()->padding_below.spatial[0] != 0 || node->as<convolution>().get_primitive()->padding_below.spatial[1] != 0))
+        {
+            auto conv = node->as<convolution>().get_primitive();
+            auto conv_primitive = const_cast<convolution*>(&(*conv));
+
+            // Asymmetric padding
+            if (node->as<convolution>().get_primitive()->padding_above.spatial[0] != node->as<convolution>().get_primitive()->padding_below.spatial[0]
+                || node->as<convolution>().get_primitive()->padding_above.spatial[1] != node->as<convolution>().get_primitive()->padding_below.spatial[1])
+            {
+                primitive_id conv_id = conv_primitive->id;
+                primitive_id input_id = conv_primitive->input[0];
+
+                auto padding_above = conv_primitive->padding_above;
+                auto padding_below = conv_primitive->padding_below;
+
+                CLDNN_ERROR_NOT_EQUAL(node->as<convolution>().id(), "Padding above feature", padding_above.feature[0], "", 0, "Padding above in feature is not supported");
+                CLDNN_ERROR_NOT_EQUAL(node->as<convolution>().id(), "Padding above batch", padding_above.batch[0], "", 0, "Padding above in batch is not supported");
+                CLDNN_ERROR_NOT_EQUAL(node->as<convolution>().id(), "Padding below feature", padding_below.feature[0], "", 0, "Padding below in feature is not supported");
+                CLDNN_ERROR_NOT_EQUAL(node->as<convolution>().id(), "Padding below batch", padding_below.batch[0], "", 0, "Padding below in batch is not supported");
+
+                CLDNN_ERROR_LESS_THAN(node->as<convolution>().id(), "Padding above X", padding_above.spatial[0], "", 0, "Padding above in X cannot be negative");
+                CLDNN_ERROR_LESS_THAN(node->as<convolution>().id(), "Padding above Y", padding_above.spatial[1], "", 0, "Padding above in Y cannot be negative");
+                CLDNN_ERROR_LESS_THAN(node->as<convolution>().id(), "Padding below X", padding_below.spatial[0], "", 0, "Padding below in X cannot be negative");
+                CLDNN_ERROR_LESS_THAN(node->as<convolution>().id(), "Padding below Y", padding_below.spatial[1], "", 0, "Padding below in Y cannot be negative");
+
+                // set padding_above/padding_below to zeros - the border primitive does the job
+                conv_primitive->padding_above = tensor(0, 0, 0, 0);
+                conv_primitive->padding_below = tensor(0, 0, 0, 0);
+
+                // create the border primitive
+                primitive_id border_id = input_id + "_border_" + conv_id;
+                auto b_prim = std::make_shared<border>(border_id, input_id,
+                    padding_above,
+                    padding_below,
+                    border_type::constant, 0.0f);
+
+                auto& b_prim_node = p.get_or_create(b_prim);
+
+                p.add_intermediate(b_prim_node, *node, 0, true);
+
+                continue;
+            }
+            // Symmetric padding
+            else
+            {
+                // set input_offset
+                conv_primitive->input_offset = conv_primitive->padding_above.negate().add(conv_primitive->input_offset);
+
+                // set padding_above/padding_below to zeros - input_offset does the job
+                conv_primitive->padding_above = tensor(0, 0, 0, 0);
+                conv_primitive->padding_below = tensor(0, 0, 0, 0);
+
+                node->as<convolution>().recalc_output_layout(true);
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/mark_nodes.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/mark_nodes.cpp
new file mode 100644
index 0000000..29c17a4
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/mark_nodes.cpp
@@ -0,0 +1,43 @@
+/*
+// Copyright (c) 2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "pass_manager.h" +#include "program_impl.h" + +using namespace cldnn; + +void mark_nodes::run(program_impl& p) { + mark_constants(p); + mark_data_flow(p); +} + +void mark_nodes::mark_constants(program_impl& p) +{ + for (auto& node : p.get_processing_order()) + { + p.mark_if_constant(*node); + } +} + +void mark_nodes::mark_data_flow(program_impl& p) +{ + for (auto const& node : p.get_processing_order()) + { + p.mark_if_data_flow(*node); + } +} diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/post_optimize_weights.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/post_optimize_weights.cpp new file mode 100644 index 0000000..0a13dc3 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/post_optimize_weights.cpp @@ -0,0 +1,131 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#include "pass_manager.h"
+#include "program_helpers.h"
+#include "api_extension/CPP/fused_conv_eltwise.hpp"
+#include "include/fused_conv_eltwise_inst.h"
+
+namespace cldnn
+{
+
+post_optimize_weights::post_optimize_weights(layout_optimizer& lo_ref) : base_pass("post_optimize_weights"), _lo(lo_ref) {}
+
+void post_optimize_weights::run(program_impl& p) {
+    run(p, _lo);
+}
+
+// function which prepares a given primitive for weights optimization
+template <typename T>
+void post_optimize_weights::optimize_weights(T& node, layout_optimizer& lo, program_impl& p)
+{
+    auto weights_offset = node.get_primitive()->input.size();
+    auto bias_offset = weights_offset + program_helpers::wrap_if_single(node.get_primitive()->weights).size();
+    for (auto i = weights_offset; i < bias_offset; i++)
+    {
+        auto& weights = node.get_dependency(i);
+        auto* impl = node.get_selected_impl().get();
+        auto output_layout = node.get_output_layout();
+        auto& weights_node = node.get_dependency(1);
+        auto weights_layout = weights_node.get_output_layout();
+        const auto weights_type = layout_optimizer::data_type::weights;
+
+        auto reorders = lo.get_generic_layer(
+            impl->_weights_reorder_params,
+            weights.id(),
+            weights_layout,
+            weights_type);
+
+        for (auto& reorder : reorders)
+        {
+            // insert a new generic_layer node into the topology
+            p.add_intermediate(reorder.first, node, i, !reorder.second);
+            // set the generic_layer node's output layout and implementation
+            auto& g_node = node.get_dependency(i);
+            g_node.get_output_layout(false);
+            g_node.selected_impl = g_node.type()->choose_impl(p.get_engine(), g_node);
+        }
+        // restore the old output layout and do not invalidate users, as a change of weights will not affect the output layout
+        node.set_output_layout(output_layout, false);
+    }
+}
+
+// specialization which prepares the fused convolution-eltwise primitive for weights optimization
+template <>
+void post_optimize_weights::optimize_weights(fused_conv_eltwise_node& node, layout_optimizer& lo, program_impl& p)
+{
+    auto weights_offset = node.get_primitive()->input.size();
+    auto bias_offset = weights_offset + program_helpers::wrap_if_single(node.get_primitive()->conv.weights).size();
+    for (auto i = weights_offset; i < bias_offset; i++)
+    {
+        auto& weights = node.get_dependency(i);
+        auto* impl = node.get_selected_impl().get();
+        auto output_layout = node.get_output_layout();
+        auto& weights_node = node.get_dependency(1);
+        auto weights_layout = weights_node.get_output_layout();
+        const auto weights_type = layout_optimizer::data_type::weights;
+
+        auto reorders = lo.get_generic_layer(
+            impl->_weights_reorder_params,
+            weights.id(),
+            weights_layout,
+            weights_type);
+
+        for (auto& reorder : reorders)
+        {
+            // insert a new generic_layer node into the topology
+            p.add_intermediate(reorder.first, node, i, !reorder.second);
+            // set the generic_layer node's output layout and implementation
+            auto& g_node = node.get_dependency(i);
+            g_node.get_output_layout(false);
+            g_node.selected_impl = g_node.type()->choose_impl(p.get_engine(), g_node);
+        }
+        // restore the old output layout and do not invalidate users, as a change of weights will not affect the output layout
+        node.set_output_layout(output_layout, false);
+    }
+}
+
+template void post_optimize_weights::optimize_weights(convolution_node& node, layout_optimizer& lo, program_impl& p);
+template void post_optimize_weights::optimize_weights(deconvolution_node& node, layout_optimizer& lo, program_impl& p);
+template void post_optimize_weights::optimize_weights(fully_connected_node& node, layout_optimizer& lo, program_impl& p);
+
+void post_optimize_weights::run(program_impl& p, layout_optimizer& lo)
+{
+    for (auto& node : p.get_processing_order())
+    {
+        if (node->type() == convolution::type_id())
+        {
+            optimize_weights(node->as<convolution>(), lo, p);
+        }
+        else if (node->type() == deconvolution::type_id())
+        {
+            optimize_weights(node->as<deconvolution>(), lo, p);
+        }
+        else if (node->type() == fully_connected::type_id())
+        {
+            optimize_weights(node->as<fully_connected>(), lo, p);
+        }
+        else if (node->type() == fused_conv_eltwise::type_id())
+        {
+            optimize_weights(node->as<fused_conv_eltwise>(), lo, p);
+        }
+    }
+}
+
+}
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/pre_optimize_bias.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/pre_optimize_bias.cpp
new file mode 100644
index 0000000..95e102e
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/pre_optimize_bias.cpp
@@ -0,0 +1,87 @@
+/*
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "pass_manager.h"
+#include "program_node.h"
+#include "layout_optimizer.h"
+#include "program_impl.h"
+#include "program_helpers.h"
+#include "fully_connected_inst.h"
+
+using namespace cldnn;
+
+pre_optimize_bias::pre_optimize_bias(layout_optimizer& lo_ref) : base_pass("pre_optimize_bias"), _lo(lo_ref) {}
+
+void pre_optimize_bias::run(program_impl& p) {
+    run(p, _lo);
+}
+
+// function which prepares a given primitive for bias optimization
+template <typename T>
+void pre_optimize_bias::optimize_bias(T& node, layout_optimizer& lo, program_impl& p)
+{
+    layout output_layout = node.get_output_layout();
+
+    size_t weights_offset = node.get_primitive()->input.size();
+    size_t bias_offset = weights_offset + program_helpers::wrap_if_single(node.get_primitive()->weights).size();
+    for (size_t i = bias_offset; i < node.get_dependencies().size(); ++i)
+    {
+        // find the bias primitive with the given primitive_id and add it to the layout optimizer
+        const program_node& bias = node.get_dependency(i);
+        const auto bias_type = layout_optimizer::data_type::bias;
+        auto reorder = lo.get_reorder(
+            bias.get_output_layout(),
+            bias.id(),
+            bias_type,
+            node,
+            output_layout);
+
+        if (reorder.first)
+            p.add_intermediate(reorder.first, node, i, !reorder.second);
+    }
+}
+template void pre_optimize_bias::optimize_bias(convolution_node& node, layout_optimizer& lo, program_impl& p);
+template void pre_optimize_bias::optimize_bias(deconvolution_node& node, layout_optimizer& lo, program_impl& p);
+template void pre_optimize_bias::optimize_bias(fully_connected_node& node, layout_optimizer& lo, program_impl& p);
+template void pre_optimize_bias::optimize_bias(embed_node& node, layout_optimizer& lo, program_impl& p);
+
+
+void pre_optimize_bias::run(program_impl& p, layout_optimizer& lo)
+{
+    for (auto& prim : p.get_processing_order())
+    {
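+        // dispatch by primitive type; convolutions and fully connected layers
+        // with quantized weights keep their bias untouched (see the checks below)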
+        if (prim->type() == convolution::type_id())
+        {
+            if (!prim->as<convolution>().weights_quantization_term())
+                optimize_bias(prim->as<convolution>(), lo, p);
+        }
+        else if (prim->type() == deconvolution::type_id())
+        {
+            optimize_bias(prim->as<deconvolution>(), lo, p);
+        }
+        else if (prim->type() == fully_connected::type_id())
+        {
+            if (!prim->as<fully_connected>().weights_quantization_term())
+                optimize_bias(prim->as<fully_connected>(), lo, p);
+        }
+        else if (prim->type() == embed::type_id())
+        {
+            optimize_bias(prim->as<embed>(), lo, p);
+        }
+    }
+}
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prep_opt_depthwise_sep_post.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prep_opt_depthwise_sep_post.cpp
new file mode 100644
index 0000000..0f04577
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prep_opt_depthwise_sep_post.cpp
@@ -0,0 +1,100 @@
+/*
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "pass_manager.h"
+#include "program_helpers.h"
+
+
+template <typename T>
+void prep_opt_depthwise_sep_post::optimize_depthwise_sep_pre(program_impl& p, T& node)
+{
+    if (!node.get_depthwise_sep_opt())
+        return;
+
+    if (node.get_groups() > 1) {
+        if (node.get_groups() >= 16) {
+            node.set_groups(1); // use one kernel
+        }
+        return; // no concatenations required
+    }
+
+    const auto& split = node.get_primitive()->split();
+
+    auto dependency_offset = node.get_primitive()->input.size();
+    // concatenate weights
+    {
+        // if weights were optimized, the sizes after optimization have to be used
+        auto target_layout = program_helpers::get_weights_layout(node.get_dependency(dependency_offset), split);
+        program_helpers::merge_buffers(p.get_engine(), node, target_layout, dependency_offset, dependency_offset + split);
+        dependency_offset++;
+    }
+
+    // concatenate biases
+    if (node.get_primitive()->bias.size() != 0)
+    {
+        const auto& bias_layout = node.get_dependency(dependency_offset).get_output_layout();
+        auto target_layout = layout(bias_layout.data_type, cldnn::format::bfyx, { 1, 1, bias_layout.size.spatial[0] * split, 1 });
+        program_helpers::merge_buffers(p.get_engine(), node, target_layout, dependency_offset, dependency_offset + split);
+        dependency_offset++;
+    }
+
+    if (node.template is_type<convolution>())
+    {
+        auto& prim_node = node.template as<convolution>();
+        const auto& prim = prim_node.get_primitive();
+
+        // concatenate weights quantization factors
+        if (prim->weights_quantization_factors.size() != 0)
+        {
+            const auto& weights_quantization_layout = node.get_dependency(dependency_offset).get_output_layout();
+            auto target_layout = layout(weights_quantization_layout.data_type, cldnn::format::bfyx, { 1, 1, weights_quantization_layout.size.batch[0] * split, 1 });
+            program_helpers::merge_buffers(p.get_engine(), node, target_layout, dependency_offset, dependency_offset + split);
+            dependency_offset++;
+        }
+        // concatenate output calibration factors
+        if (prim->output_calibration_factors.size() != 0)
+        {
+            const auto& output_calibration_layout = node.get_dependency(dependency_offset).get_output_layout();
+            auto target_layout = layout(output_calibration_layout.data_type, cldnn::format::bfyx, { 1, 1, output_calibration_layout.size.batch[0] * split, 1 });
+            program_helpers::merge_buffers(p.get_engine(), node, target_layout, dependency_offset, dependency_offset + split);
+            dependency_offset++;
+        }
+    }
+
+    if (node.get_primitive())
+        // override the node's split, as only one kernel will be executed
+        node.set_split(1);
+}
+template void prep_opt_depthwise_sep_post::optimize_depthwise_sep_pre(program_impl& p, convolution_node& node);
+template void prep_opt_depthwise_sep_post::optimize_depthwise_sep_pre(program_impl& p, deconvolution_node& node);
+
+void prep_opt_depthwise_sep_post::run(program_impl& p)
+{
+    // depthwise separable convolution/deconvolution optimization
+    for (auto& prim : p.get_processing_order())
+    {
+        if (prim->type() == convolution::type_id())
+        {
+            optimize_depthwise_sep_pre(p, prim->as<convolution>());
+        }
+        else if (prim->type() == deconvolution::type_id())
+        {
+            optimize_depthwise_sep_pre(p, prim->as<deconvolution>());
+        }
+    }
+}
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_buffer_fusing.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_buffer_fusing.cpp
new file mode 100644
index 0000000..500a6fe
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_buffer_fusing.cpp
@@ -0,0 +1,321 @@
+/*
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "api/CPP/eltwise.hpp"
+#include "api/CPP/pooling.hpp"
+#include "api/CPP/upsampling.hpp"
+#include "primitive_inst.h"
+#include "activation_inst.h"
+#include "concatenation_inst.h"
+#include "crop_inst.h"
+#include "eltwise_inst.h"
+#include "reshape_inst.h"
+#include "scale_inst.h"
+
+#include "pass_manager.h"
+#include "program_helpers.h"
+
+
+using namespace cldnn;
+
+// TODO: remove friendship relation from program_node
+
+void prepare_buffer_fusing::run(program_impl& p)
+{
+    bool is_debug = p.get_options().get<build_option_type::debug>()->enabled();
+    /*
+    We need to take care of proper ordering by types:
+    1. Concats
+    2. Crops
+    3. Others
+    Handling concats before crops is needed because of the crop fusing padding requirements.
+    If a crop were handled before a concat, there could be a padding mismatch, since concat changes padding.
+    */
+    auto can_optimize = [](const program_node* node)
+    {
+        if (node->is_output() ||
+            (node->get_fused_activation_func() != cldnn_activation_func_t::activation_none))
+        {
+            return false;
+        }
+        return true;
+    };
+
+    // [1] First try to optimize all concats
+    auto node_itr = p.get_processing_order().begin();
+    while (node_itr != p.get_processing_order().end())
+    {
+        auto& node = (*node_itr++);
+        if (!can_optimize(node))
+            continue;
+        program_helpers::do_for_types<concatenation_node>(*node, [&p, is_debug](concatenation_node& node)
+        {
+            // we need to avoid mixing padded and unpadded buffers
+            bool all_dependencies_padded = true;
+            bool all_dependencies_unpadded = true;
+            for (auto& input : node.get_dependencies()) {
+                layout l = input->get_output_layout();
+                if (static_cast<bool>(l.data_padding))
+                    all_dependencies_unpadded = false;
+                else
+                    all_dependencies_padded = false;
+            }
+            auto concat_axis = node.get_primitive()->axis;
+            auto padd = node.get_output_layout().data_padding;
+
+            tensor lower_padd = padd.lower_size();
+            tensor upper_padd = padd.upper_size();
+
+            auto upper_padd_val = node.get_output_layout().get_buffer_size().raw[concat_axis] - lower_padd.raw[concat_axis];
+            tensor lower_padd_offset = lower_padd;
+
+            std::list<std::pair<std::vector<program_node*>, tensor>> stack = { std::make_pair(node.get_dependencies(), tensor{ 0, 0, 0, 0 }) };
+            while (!stack.empty())
+            {
+                auto nodes_list = stack.front();
+                stack.pop_front();
+
+                auto cascade_adjustment = nodes_list.second;
+                upper_padd.raw[concat_axis] = upper_padd_val;
+                lower_padd = lower_padd_offset;
+
+                // check if in-place concatenation can be applied for the inputs set
+                for (auto input : nodes_list.first)
+                {
+                    // if any of this node's inputs is used by more than one primitive and is not an optimized concatenation, do not fuse buffers;
+                    // also, if an input is marked as a network output, prevent optimizations which would affect the form of its output (unless the debug flag is set)
+                    // todo: in the future, if this case is a problem, it can be optimized further to enable buffer fusing
+                    //       per single input rather than all/none
+                    //       + restrict input types to those which support padding on x, y, b and f
+                    if (!input->support_padding() ||
+                        (input->is_output() && !is_debug) ||
+                        input->get_users().size() > 2)
+                        return;
+
+                    if (input->get_users().size() > 1)
+                    {
+                        auto user_count = input->get_users().size();
+                        for (auto& user : input->get_users())
+                            if (user->is_type<concatenation>())
+                                user_count--;
+                        if (user_count != 1) // user_count == 0 means that the input will be used only by concatenations, so we cannot apply in-place concat for it
+                            return;
+                    }
+                }
+
+                // apply the in-place concatenation optimization
+                for (auto input : nodes_list.first)
+                {
+                    auto input_length = input->get_output_layout().size.raw[concat_axis];
+
+                    bool optimized_concat_input = false;
+                    if (input->type() == concatenation::type_id() && input->can_be_optimized())
+                    {
+                        if (input->as<concatenation>().get_primitive()->axis != node.get_primitive()->axis)
+                            return;
+                        optimized_concat_input = true;
+                    }
+
+                    // shrink the upper pad so it points at the end of the input's buffer
+                    //
+                    //   |--- lower padd ---|                    |---------- upper padd -----------|
+                    //   |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --|
+                    upper_padd.raw[concat_axis] -= input_length;
+
+                    // adjust padding sizes for cascade concatenations
+                    auto lower_padd_tmp = lower_padd;
+                    lower_padd_tmp.raw[concat_axis] += cascade_adjustment.raw[concat_axis];
+                    auto upper_padd_tmp = upper_padd;
+                    upper_padd_tmp.raw[concat_axis] -= cascade_adjustment.raw[concat_axis];
+
+                    // set new padding for the input
+                    input->set_output_padding(padding(lower_padd_tmp.sizes(), upper_padd_tmp.sizes()));
+
+                    // move the lower pad further
+                    //
+                    //   |-------------- lower padd -------------|---------- upper padd -----------|
+                    //   |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --|
+
+                    lower_padd.raw[concat_axis] += input_length;
+
+                    if (optimized_concat_input && !input->get_dependencies().empty())
+                        stack.push_back(std::make_pair(input->get_dependencies(), input->get_output_layout().data_padding.lower_size()));
+                }
+            }
+
+            node.can_be_optimized(true);
+            for (auto dep : node.get_users())
+            {
+                dep->can_share_buffer(false);
+            }
+            if (!all_dependencies_padded && !all_dependencies_unpadded)
+                node.can_share_buffer(false);
+        });
+    }
+
+    // [2] Then try to optimize all crops
+    node_itr = p.get_processing_order().begin();
+    while (node_itr != p.get_processing_order().end())
+    {
+        auto& node = (*node_itr++);
+        if (!can_optimize(node))
+            continue;
+        // zero copy
+        program_helpers::do_for_types<crop_node>(*node, [&p, is_debug](crop_node& node)
+        {
+            // if the node is marked as a network output, prevent optimizations which would affect the form of its output, unless the debug flag is set
+            if (node.is_output() && !is_debug)
+                return;
+
+            // do not optimize when the next node is a concatenation which is not an output
+            if (node.get_users().size() == 1 && node.get_users().front()->is_type<concatenation>() && !node.get_users().front()->is_output())
+                return;
+
+            if (node.get_dependencies().size() == 1 &&
+                node.get_users().size() > 0)
+            {
+                // the optimization is available for cropping across depth (features) only;
+                // if the output padding already has padding defined across features, it wouldn't
+                // work, because it expects zeros in the padded area
+                const auto& crop_layout = node.get_output_layout();
+                auto format = crop_layout.format;
+                auto crop_prim = node.get_primitive();
+                auto input_layout = node.get_dependency(0).get_output_layout();
+                const auto& crop_size = crop_layout.size;
+                const auto& out_padd = crop_layout.data_padding;
+                if (format == format::bfyx &&
+                    crop_size.batch[0] == input_layout.size.batch[0] &&
+                    crop_size.spatial[0] == input_layout.size.spatial[0] &&
+                    crop_size.spatial[1] == input_layout.size.spatial[1] &&
+                    out_padd.lower_size().feature[0] == 0 &&
+                    out_padd.upper_size().feature[0] == 0 &&
+                    out_padd.lower_size().batch[0] == 0 &&
+                    out_padd.upper_size().batch[0] == 0 &&
+                    out_padd.lower_size().spatial[0] == 0 &&
+                    out_padd.lower_size().spatial[1] == 0 &&
+                    out_padd.upper_size().spatial[0] == 0 &&
+                    out_padd.upper_size().spatial[1] == 0)
+                {
+                    //  Regular crop
+                    //  crop input buffer
+                    //  |___________data____________|
+                    //
+                    //  crop output buffer
+                    //  |-------->| offsets[f]  |<--|
+                    //            |_____data____|
+                    //             <------------>
+                    //             reference size
+                    //
+                    //  In-place crop
+                    //  crop output buffer
+                    //  |_low_pad_|__data_size__|___|<-upper pad
+
+                    node.set_output_padding(padding(
+                        { out_padd.lower_size().batch[0], crop_prim->offsets.feature[0], out_padd.lower_size().spatial[0], out_padd.lower_size().spatial[1] },
+                        { out_padd.upper_size().batch[0], input_layout.size.feature[0] - crop_prim->offsets.feature[0] - crop_size.feature[0],
+                          out_padd.upper_size().spatial[0], out_padd.upper_size().spatial[1] }));
+                    node.can_be_optimized(true);
+                }
+            }
+        });
+    }
+
+    // [3] Optimize all other primitives
+    node_itr = p.get_processing_order().begin();
+    while (node_itr != p.get_processing_order().end())
+    {
+        auto& node = (*node_itr++);
+        if (!can_optimize(node))
+            continue;
+        program_helpers::do_for_types<reshape_node>(*node, [&p](reshape_node& node)
+        {
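+            // make sure the output layout is computed (and cached) before the in-place check below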
node.get_output_layout(); + if (node.is_in_place() + && node.get_fused_activation_func() == activation_none) + node.can_be_optimized(true); + }); + program_helpers::do_for_types(*node, [&p](reorder_node& node) + { + auto& input = node.input(); + auto output_layout = node.get_output_layout(); + //This is WA for topologies that due to additional reorders added perform worse with conv1x1 optimization + auto remove_bf8_xy_opt = ((input.is_type() || input.is_type()) && + output_layout.format == format::bf8_xy16 && input.get_users().size() == 1); + //Remove reorder from convolution 1x1 to bfyx in some conditions + auto remove_byxf_opt = (input.is_type() && + input.get_users().size() == 1 && + input.get_output_layout().format == format::byxf); + //check if all inputs user have the same format + auto all_users_same_format = true; + auto input_user_layout_format = input.get_users().front()->get_output_layout().format; + for (auto const& user : input.get_users()) + { + if (user->get_output_layout().format != input_user_layout_format) + { + all_users_same_format = false; + break; + } + } + auto same_data_type = input.get_output_layout().data_type == output_layout.data_type; + //Optimization only available in case of layers that support different input and output formats. + //todo: new api needs to be created to read such caps + if (!(input.is_type() && (output_layout.format == format::bfyx || output_layout.format == format::yxfb || output_layout.format == format::byxf) && all_users_same_format && same_data_type) && + !remove_bf8_xy_opt && + !(input.is_type() && input.get_output_layout().format == format::bf8_xy16) && + !(input.is_type() && (output_layout.format == format::bfyx || output_layout.format == format::yxfb || output_layout.format == format::byxf) && all_users_same_format && same_data_type) && + !(remove_byxf_opt && (node.get_users().front()->is_type() || node.get_users().front()->is_type()))) + return; + + if (remove_bf8_xy_opt) + { + auto users_user_layout = node.get_users().front()->get_users().front()->get_output_layout(); + // if users_user_layout is still bf8_yx16 (stacked convolutions) then leave the reorder + if (users_user_layout.format == format::bf8_xy16) + return; + auto input_layout = input.get_output_layout(); + auto target_layout = layout(input_layout.data_type, users_user_layout.format, input_layout.size, input_layout.data_padding); + input.set_output_layout(target_layout, false); + } + else if (remove_byxf_opt) + { + auto user = node.get_users().front(); + auto users_users = node.get_users().front()->get_users(); + + for (auto const& users_user : users_users) + { + if (users_user->get_output_layout().format != format::byxf && !users_user->is_type()) + { + remove_byxf_opt = false; + break; + } + } + + if (remove_byxf_opt) + { + auto input_layout = input.get_output_layout(); + user->set_output_layout(input_layout, false); + } + } + else + input.set_output_layout(output_layout, false); + + node.can_be_optimized(true); + p.extract_and_remove(node); //try to remove redundant reorders + }); + } +} diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_depthwise_sep_opt.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_depthwise_sep_opt.cpp new file mode 100644 index 0000000..dd288a2 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_depthwise_sep_opt.cpp @@ -0,0 +1,70 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this 
file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "pass_manager.h"
+#include "program_helpers.h"
+
+
+template <typename T>
+void prepare_depthwise_sep_opt::optimize_depthwise_sep_pre(T& node)
+{
+    if (node.get_groups() == 1) {
+        // enable the optimization only when IFM / split <= 8 (otherwise scheduling multiple opt kernels is better) and split >= 16
+        if (!(node.get_dependency(0).get_output_layout().size.feature[0] / node.get_primitive()->split() <= 8) ||
+            !(node.get_primitive()->split() >= 16))
+            return;
+
+        // make sure the weights and biases are data primitives and
+        // are not reused in other primitives, as they will be overridden with concatenated ones
+        for (size_t i = 1; i < node.get_dependencies().size(); i++)
+        {
+            auto& weights_or_biases = node.get_dependency(i);
+            if (weights_or_biases.get_users().size() > 1 || weights_or_biases.type() != data::type_id())
+                return;
+        }
+    }
+    else {
+        // enable the optimization only when IFM / groups <= 8 (otherwise scheduling multiple opt kernels is better) and groups >= 16
+        if (!(node.get_dependency(0).get_output_layout().size.feature[0] / node.get_groups() <= 8) ||
+            !(node.get_groups() >= 16))
+            return;
+    }
+
+    node.set_depthwise_sep_opt(true);
+}
+
+template void prepare_depthwise_sep_opt::optimize_depthwise_sep_pre(convolution_node& node);
+template void prepare_depthwise_sep_opt::optimize_depthwise_sep_pre(deconvolution_node& node);
+
+void prepare_depthwise_sep_opt::run(program_impl& p)
+{
+    // depthwise separable convolution/deconvolution optimization
+    for (auto& prim : p.get_processing_order())
+    {
+        if (prim->type() == convolution::type_id())
+        {
+            optimize_depthwise_sep_pre(prim->as<convolution>());
+        }
+        else if (prim->type() == deconvolution::type_id())
+        {
+            optimize_depthwise_sep_pre(prim->as<deconvolution>());
+        }
+    }
+}
+
+
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_padding.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_padding.cpp
new file mode 100644
index 0000000..8c536cc
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_padding.cpp
@@ -0,0 +1,146 @@
+/*
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "pooling_inst.h"
+#include "program_node.h"
+#include "pass_manager.h"
+#include "convolution_inst.h"
+#include "sliding_window_utils.h"
+
+using namespace cldnn;
+
+void prepare_padding::run(program_impl& p)
+{
+    if (output_size_handling_enabled)
+    {
+        // Prepare upper padding for primitives that support the output_size parameter.
+        for (const auto& node : p.get_processing_order())
+        {
+            if (node->is_type<convolution>())
+            {
+                auto& prim_node = node->as<convolution>();
+                const auto& prim = prim_node.get_primitive();
+
+                if (!prim->with_output_size)
+                    continue;
+
+                auto filter_size = prim_node.weights(0).get_output_layout().size;
+
+                auto needed_padding = calc_sliding_window_needed_input_padding(
+                    prim_node.input().get_output_layout(),
+                    prim->output_size, filter_size, prim->input_offset, prim->stride, prim->dilation, false, 1);
+                p.apply_needed_padding(prim_node, prim_node.input(), needed_padding);
+            }
+            else if (node->is_type<deconvolution>())
+            {
+                auto& prim_node = node->as<deconvolution>();
+                const auto& prim = prim_node.get_primitive();
+
+                if (!prim->with_output_size)
+                    continue;
+
+                auto filter_size = prim_node.weights(0).get_output_layout().size;
+
+                auto needed_padding = calc_sliding_window_needed_input_padding(
+                    prim_node.input().get_output_layout(),
+                    prim->output_size, filter_size, prim->input_offset, prim->stride, { 1, 1, 1, 1 }, true, 1);
+
+                p.apply_needed_padding(prim_node, prim_node.input(), needed_padding);
+            }
+            else if (node->is_type<pooling>())
+            {
+                auto& prim_node = node->as<pooling>();
+                const auto& prim = prim_node.get_primitive();
+
+                if (!prim->with_output_size)
+                    continue;
+
+                // NOTE: Currently there is no pooling implementation/pooling mode which does not check the input data range.
+                //       There is no need to add padding requirements on pooling inputs.
+                // auto needed_padding = calc_sliding_window_needed_input_padding(
+                //     prim_node.input().get_output_layout(),
+                //     prim->output_size, prim->size, prim->input_offset, prim->stride, {1, 1, 1, 1}, false, 1);
+                auto needed_padding = prim_node.input().get_output_layout().data_padding;
+
+                p.apply_needed_padding(prim_node, prim_node.input(), needed_padding);
+            }
+        }
+    }
+
+    // Prepare optimized padding for bfyx convolution.
+    for (auto& pair : p.nodes_map)
+    {
+        if (pair.second->type() != convolution::type_id())
+            continue;
+
+        auto& node = pair.second->as<convolution>();
+        if (node.get_dependencies().empty())
+            continue;
+
+        auto conv = node.get_primitive();
+        auto& conv_input_node = node.get_dependency(0);
+        auto conv_layout = node.get_output_layout();
+
+        // right now the output padding optimization is only available for the bfyx format and data type = float32
+        if (conv_layout.format != cldnn::format::bfyx
+            && conv_layout.format != cldnn::format::bf8_xy16
+            && conv_layout.format != cldnn::format::byxf_af32
+            && conv_layout.format != cldnn::format::fs_bs_yx_bsv4_fsv32
+            && conv_layout.format != cldnn::format::b_fs_yx_fsv4)
+        {
+            continue;
+        }
+
+        // We shouldn't apply any padding to nodes which are marked as outputs
+        if (conv_input_node.is_output())
+            continue;
+
+        // Calculate the input padding needed for the convolution
+        auto& filter_node = node.as<convolution>().weights(0);
+        auto filter_prim = filter_node.get_primitive();
+
+        layout filter_layout = filter_node.get_output_layout();
+
+        // the convolution has only one input primitive
+        auto prev_prim_output_layout = conv_input_node.get_output_layout();
+
+        // Compute the initial required paddings for the primitive used as input for the convolution.
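+        // For example, for a 3x3 filter with stride 1, dilation 1 and input_offset -1
+        // over a W-wide input producing a W-wide output:
+        //     input_limit_x = -1 + (W - 1) * 1 + (3 - 1) * 1 + 1 = W + 1,
+        // so left_padding = 1 and right_padding = (W + 1) - W = 1, i.e. one extra
+        // column of padding is needed on each side of the input.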
+ auto input_offset = conv->input_offset; + auto stride = conv->stride; + auto dilation = conv->dilation; + + auto input_limit_x = input_offset.spatial[0] + (conv_layout.size.spatial[0] - 1) * stride.spatial[0] + (filter_layout.size.spatial[0] - 1) * dilation.spatial[0] + 1; + auto input_limit_y = input_offset.spatial[1] + (conv_layout.size.spatial[1] - 1) * stride.spatial[1] + (filter_layout.size.spatial[1] - 1) * dilation.spatial[1] + 1; + + auto left_padding = std::max(-input_offset.spatial[0], 0); + auto top_padding = std::max(-input_offset.spatial[1], 0); + auto right_padding = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0); + auto bottom_padding = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0); + + // Adjust right padding, so entire buffer size in X dimension is properly aligned. + // TODO: NOTE: Will be reenabled with next check-in once heuristic for line-aligned algorithm will be added. + //auto needed_buffer_size_x = static_cast( + // round_up_to(left_padding + prev_prim_output_layout.size.spatial[0] + right_padding, 16)); + //right_padding = needed_buffer_size_x - left_padding - prev_prim_output_layout.size.spatial[0]; + + cldnn::padding needed_padding({ 0, 0, left_padding, top_padding }, { 0, 0, right_padding, bottom_padding }, 0); + needed_padding = padding::max(prev_prim_output_layout.data_padding, needed_padding); + + p.apply_needed_padding(node, conv_input_node, needed_padding); + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp new file mode 100644 index 0000000..e204b05 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp @@ -0,0 +1,542 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "api/CPP/pooling.hpp"
+#include "api/CPP/proposal.hpp"
+#include "api/CPP/roi_pooling.hpp"
+
+#include "program_helpers.h"
+#include "pass_manager.h"
+
+#include "activation_inst.h"
+#include "batch_norm_inst.h"
+#include "batch_norm_grad_inst.h"
+#include "crop_inst.h"
+#include "eltwise_inst.h"
+#include "fused_conv_bn_scale_inst.h"
+#include "fused_conv_eltwise_inst.h"
+#include "lrn_inst.h"
+#include "mutable_data_inst.h"
+#include "mvn_inst.h"
+#include "normalize_inst.h"
+#include "permute_inst.h"
+#include "reshape_inst.h"
+#include "softmax_inst.h"
+#include "scale_inst.h"
+#include "scale_grad_weights_inst.h"
+#include "upsampling_inst.h"
+
+
+void prepare_primitive_fusing::fuse_skip_layers(program_impl& p, program_node* node)
+{
+    program_helpers::do_for_types<eltwise_node>(*node, [&p](eltwise_node& node)
+    {
+        if (node.get_primitive()->mode != eltwise_mode::sum || node.inputs_count() != 2)
+            return;
+
+        // both inputs should be deconvolutions
+        if (!(node.input(0).is_type<deconvolution>() && node.input(1).is_type<deconvolution>()))
+        {
+            return;
+        }
+
+        auto& to_fuse_with = node.input(0);
+        int to_fuse_index = 1;
+
+        // remove the dependencies and users of the eltwise that is going to be extracted
+        p.add_connection(node.input(to_fuse_index), to_fuse_with);
+        p.remove_connection(node.input(to_fuse_index), node);
+
+        p.get_processing_order().erase(&to_fuse_with);
+        p.get_processing_order().insert(&node, &to_fuse_with);
+
+        if (node.get_fused_activation_func() != activation_none)
+            to_fuse_with.set_fused_activation(node.get_fused_activation_func(), node.get_fused_activation_params());
+        to_fuse_with.set_output_padding(node.get_output_layout().data_padding);
+
+        p.extract_and_remove(node);
+    });
+}
+
+template <typename T>
+static bool node_is_type(program_node* n)
+{
+    return n->is_type<T>();
+}
+
+void prepare_primitive_fusing::fuse_conv_bn_scale(program_impl& p, program_node* node)
+{
+    program_helpers::do_for_types<convolution_node>(*node, [&p](convolution_node& node)
+    {
+        if (node.get_users().size() > 2)
+            return;
+
+        auto found_bn = std::find_if(node.get_users().begin(), node.get_users().end(), node_is_type<batch_norm>);
+        auto bn_node = found_bn != node.get_users().end() ? *found_bn : nullptr;
+        if (bn_node != nullptr)
+        {
+            if (bn_node->get_users().size() > 2)
+                return;
+
+            auto found_scale = std::find_if(bn_node->get_users().begin(), bn_node->get_users().end(), node_is_type<scale>);
+            auto sc_node = found_scale != bn_node->get_users().end() ? *found_scale : nullptr;
+            if (sc_node != nullptr)
+            {
+                int bn_index = int(std::distance(node.get_users().begin(), found_bn));
+                int sc_index = int(std::distance(bn_node->get_users().begin(), found_scale));
+                auto scale_prim = std::static_pointer_cast<const scale>(sc_node->get_primitive());
+                auto bn_prim = std::static_pointer_cast<const batch_norm>(bn_node->get_primitive());
+                auto prim = node.get_primitive();
+                bool training = false;
+
+                if (node.get_users().size() == 2)
+                {
+                    training = true;
+                    float zero = 0.0f;
+                    layout dummy_layout(data_types::f32, format::bfyx, tensor(1, 1, 1, 1));
+
+                    auto bn_backw = node.get_users().begin();
+                    std::advance(bn_backw, bn_index == 0 ? 1 : 0);
+                    if (!(*bn_backw)->is_type<batch_norm_grad>())
+                        return;
+                    auto sc_backw = bn_node->get_users().begin();
+                    std::advance(sc_backw, sc_index == 0 ? 1 : 0);
+                    if (!(*sc_backw)->is_type<scale_grad_weights>())
+                        return;
+
+                    auto conv_out_prim = std::make_shared<mutable_data>(prim->id + "_fused_conv_out", memory::attach(dummy_layout, &zero, 1));
+                    auto& conv_out_node = p.get_or_create(conv_out_prim);
+                    auto conv_out_mem = p.get_engine().allocate_memory(node.get_output_layout());
+                    conv_out_node.as<mutable_data>().attach_memory(*conv_out_mem, false);
+                    p.add_intermediate(conv_out_node, **bn_backw, 1, true);
+
+                    auto bn_out_prim = std::make_shared<mutable_data>(prim->id + "_fused_bn_out", memory::attach(dummy_layout, &zero, 1));
+                    auto& bn_out_node = p.get_or_create(bn_out_prim);
+                    auto bn_out_mem = p.get_engine().allocate_memory(bn_node->get_output_layout());
+                    bn_out_node.as<mutable_data>().attach_memory(*bn_out_mem, false);
+                    p.add_intermediate(bn_out_node, **sc_backw, 0, true);
+                }
+
+                auto new_conv = std::make_shared<fused_conv_bn_scale>(prim->id + "_fused", prim->input[0], prim->weights.ref(), prim->bias.ref(), bn_prim->epsilon,
+                    scale_prim->input[1], scale_prim->bias, prim->stride, prim->dilation, prim->input_offset, bn_prim->inv_variance,
+                    prim->with_activation, prim->activation_negative_slope, prim->output_padding);
+                auto& new_node = p.get_or_create(new_conv);
+                p.replace(node, new_node);
+
+                while (sc_node->get_dependencies().size() > 1) // ToDo: here we modify users and dependencies;
+                                                               // it should be done through public methods in program_node/program_impl
+                                                               // to avoid friend declarations
+                {
+                    auto& dep = sc_node->get_dependency(sc_node->get_dependencies().size() - 1);
+                    p.remove_connection(dep, *sc_node);
+                    dep.users.push_back(&new_node);
+                    if (sc_node->get_dependencies().size() == 1)
+                        new_node.dependencies.insert(new_node.dependencies.begin() + 1, &dep);
+                    else
+                        new_node.dependencies.push_back(&dep);
+                }
+                p.extract_and_remove(*sc_node);
+                while (bn_node->get_dependencies().size() > 1)
+                {
+                    auto& dep = bn_node->get_dependency(bn_node->get_dependencies().size() - 1);
+                    p.remove_connection(dep, *bn_node);
+                    new_node.dependencies.push_back(&dep);
+                }
+                p.extract_and_remove(*bn_node);
+                auto inv_var_node = std::find_if(new_node.dependencies.begin(), new_node.dependencies.end(),
+                    [&new_conv](const program_node* node) { return node->id().find(new_conv->inv_variance) != std::string::npos; });
+                (*inv_var_node)->users.push_back(&new_node);
+
+                if (training)
+                {
+                    auto user = std::find_if(new_node.get_users().begin(), new_node.get_users().end(),
+                        [](const program_node* node) { return node->id().find("_fused_conv_out") != std::string::npos; });
+                    p.reverse_connection(new_node, **user);
+                    user = std::find_if(new_node.get_users().begin(), new_node.get_users().end(),
+                        [](const program_node* node) { return node->id().find("_fused_bn_out") != std::string::npos; });
+                    p.reverse_connection(new_node, **user);
+                    p.get_processing_order().calculate_BFS_processing_order();
+                }
+            }
+        }
+    });
+}
+
+void prepare_conv_eltw_fusing::fuse_conv_eltwise(program_impl& p, program_node* node)
+{
+    // make sure this convolution has only 1 user and that user is an eltwise
+    // make sure the convolution is not an output
+    if (node->users.size() != 1 ||
+        node->is_output())
+        return;
+
+    if (!(*(node->users.begin()))->is_type<eltwise>())
+        return;
+
+    convolution_node* conv_node = static_cast<convolution_node*>(node);
+    convolution& conv = const_cast<convolution&>(*conv_node->get_primitive());
+
+    // currently works only for these format/data type combinations
+    if ((conv_node->get_output_layout().format != cldnn::format::fs_bs_yx_bsv4_fsv32 || conv_node->get_output_layout().data_type != cldnn::data_types::i8) &&
+        (conv_node->get_output_layout().format != cldnn::format::bfyx || conv_node->get_output_layout().data_type != cldnn::data_types::f32) &&
+        (conv_node->get_output_layout().format != cldnn::format::yxfb || conv_node->get_output_layout().data_type != cldnn::data_types::f16)
+        )
+        return;
+
+    auto weights_node_ptr = p.nodes_map.find(conv.weights[0])->second;
+    auto filter_size = weights_node_ptr->get_output_layout().size;
+
+    // make sure that if this is a 1x1 convolution its stride is 1x1
+    if (filter_size.spatial[0] == 1 && filter_size.spatial[1] == 1)
+    {
+        if (conv.stride.spatial[0] != 1 || conv.stride.spatial[1] != 1)
+            return;
+    }
+    else
+        return;
+
+    eltwise_node* eltw_node = static_cast<eltwise_node*>(*(node->users.begin()));
+
+    // make sure the eltwise has only 2 inputs
+    // make sure the eltwise is not an output
+    if (eltw_node->inputs_count() != 2 ||
+        eltw_node->is_output())
+        return;
+
+    // only a single ADD operation is currently supported
+    // TODO: enable more
+    eltwise& eltw = const_cast<eltwise&>(*eltw_node->get_primitive());
+    if (eltw.mode != eltwise_mode::sum)
+        return;
+
+    if (eltw_node->get_fused_activation_func() == activation_relu_negative_slope)
+    {
+        eltw.with_activation = true;
+        eltw.activation_negative_slope = eltw_node->get_fused_activation_params().a;
+    }
+    else
+    {
+        return;
+    }
+
+    int eltw_fused_input_idx;   // <-- this input gets fused with the eltwise
+    int eltw_second_input_idx;  // <-- this input is not fused, so we add it in the kernel
+    // here we check which input executes last, and fuse that one
+    if (p.processing_order.get_processing_number(&eltw_node->input(0)) < p.processing_order.get_processing_number(&eltw_node->input(1)))
+    {
+        eltw_fused_input_idx = 1;
+        eltw_second_input_idx = 0;
+    }
+    else
+    {
+        eltw_fused_input_idx = 0;
+        eltw_second_input_idx = 1;
+    }
+
+    // we check whether the input to fuse is the convolution we are processing right now
+    if (eltw_node->input(eltw_fused_input_idx).id() != conv.id)
+        return;
+
+    primitive_id conv_id = conv_node->id();
+
+    // get strides for the input other than our conv input
+    std::vector<tensor> new_eltw_strides;
+    // conv strides modified by the eltwise stride
+    tensor new_conv_stride = conv.stride;
+
+    if (eltw.stride.size() == eltw_node->inputs_count())
+    {
+        // for cases when the stride from the eltwise must be applied to the fused convolution
+        new_conv_stride.spatial[0] *= eltw.stride[eltw_fused_input_idx].spatial[0];
+        new_conv_stride.spatial[1] *= eltw.stride[eltw_fused_input_idx].spatial[1];
+        // stride from the non-fused eltwise input
+        new_eltw_strides.push_back(eltw.stride[eltw_second_input_idx]);
+    }
+
+    auto fused_conv_eltw = std::make_shared<fused_conv_eltwise>(
+        conv.id + "_fused_" + eltw.id,
+        conv_node->input().id(),
+        eltw_node->input(eltw_second_input_idx).id(),
+        eltw.mode,
+        conv.weights.ref(),
+        conv.bias.ref(),
+        conv.weights_quantization_factors.ref(),
+        conv.output_calibration_factors.ref(),
+        conv.input_quantization_factor,
+        eltw.output_calibration_factors,
+        new_eltw_strides,
+        new_conv_stride,
+        conv.input_offset,
+        conv.dilation,
+        conv.with_activation,
+        conv.activation_negative_slope,
+        eltw.with_activation,
+        eltw.activation_negative_slope
+    );
+
+    auto& new_node = p.get_or_create(fused_conv_eltw);
+    p.replace(*conv_node, new_node);
+
+    // right now the new node's user is the eltwise; clear the users and take over the eltwise's users
+    new_node.users.clear();
+    p.replace_all_usages(*eltw_node, new_node);
+
+    // TODO: do it better; right now it is done in a very ugly way to get a good dependency order
+    std::vector<program_node*> updated_deps;
+    updated_deps.push_back(new_node.dependencies[0]);
+
+    // add the second input
+    updated_deps.push_back(&eltw_node->input(eltw_second_input_idx));
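+    // the final dependency order for the fused node is: the convolution input first,
+    // then the non-fused eltwise input, then the remaining convolution dependencies
+    // (weights, bias, quantization/calibration factors)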
eltw_node->input(eltw_second_input_idx).users.push_back(&new_node); + + for (size_t d = 1; d < new_node.dependencies.size(); d++) + { + updated_deps.push_back(new_node.dependencies[d]); + } + + if (eltw_node->output_calibration_term()) + { + updated_deps.push_back(&eltw_node->output_calibration_factors()); + eltw_node->output_calibration_factors().users.push_back(&new_node); + } + + new_node.dependencies = updated_deps; + + while (eltw_node->dependencies.size() > 1) + { + auto& dep = eltw_node->get_dependency(eltw_node->get_dependencies().size() - 1); + p.remove_connection(dep, *eltw_node); + } + + p.extract_and_remove(*eltw_node); + new_node.recalc_output_layout(); +} + +void prepare_primitive_fusing::run(program_impl& p) +{ + bool is_debug = p.get_options().get()->enabled(); + + std::list conv_nodes; + auto itr = p.get_processing_order().begin(); //note we need to use iterators since currently processed element can be removed + while (itr != p.get_processing_order().end()) + { + auto node_itr = itr++; + if ((*node_itr)->is_type()) + conv_nodes.push_back(*node_itr); + } + + // Disabled due to kernel being not optimized + //itr = conv_nodes.begin(); + //while (itr != conv_nodes.end()) + //{ + // auto node_itr = itr++; + // auto& node = (*node_itr); + + // fuse_conv_bn_scale(p, node); + //} + + //This loop tries fusing several reorders one by one (if present) into one reorder + itr = p.get_processing_order().begin(); + while (itr != p.get_processing_order().end()) + { + auto node_itr = itr++; + auto& node = (*node_itr); + + if (node->is_output()) + continue; + + program_helpers::do_for_types(*node, [&p, is_debug](reorder_node& node) + { + auto& input = node.input(); + + //Restrictions: + // - inputs cannot be padded + // - primitives input cannot be output + // - input was optimized + if (node.has_padded_dependency() || (input.is_output() && !is_debug) || node.get_dependencies().size() != 1 || + input.can_be_optimized()) + return; + + // - check if previous node is reorder with 1 user (and if the layouts are the same - remove reorder) + // - do not fuse if current node has mean subtract + if (input.get_users().size() != 1 || + (!input.is_type() && input.get_output_layout() != node.get_users().front()->get_output_layout()) || + node.has_mean() || !node.get_primitive()->subtract_per_feature.empty()) + return; + + input.set_output_layout(node.get_output_layout(), false); + p.extract_and_remove(node); + }); + } + + itr = p.processing_order.begin(); + while (itr != p.processing_order.end()) + { + auto node_itr = itr++; + auto& node = (*node_itr); + + program_helpers::do_for_types(*node, [&p, is_debug](activation_node& node) + { + auto& input = node.input(); + + //Restrictions: + // - inputs cannot be padded + // - primitives input cannot be output + // - no activation additional input + // - input was optimized + if (node.has_padded_dependency() || (input.is_output() && !is_debug) || node.is_output() || + node.get_dependencies().size() != 1 || input.can_be_optimized()) + return; + + // - check if there is no activation fused already + // - limit to primitives which implementations support activation fusing + if (input.get_users().size() != 1 || input.get_fused_activation_func() != activation_none || + //TODO: new api needs to be created to read such caps + //right now use whitelist so no new primitives will be affected in case of lack of fused activation support + (!input.is_type() && !input.is_type() && !input.is_type() && + !input.is_type() && !input.is_type() && !input.is_type() && + 
!input.is_type() && !input.is_type() && !input.is_type() && + !input.is_type() && !input.is_type() && !input.is_type() && + !input.is_type() && !input.is_type() && !input.is_type() && + !input.is_type() && !input.is_type() && !input.is_type())) + return; + + input.set_fused_activation(node.get_primitive()->activation_func, node.get_primitive()->additional_params); + input.set_output_padding(node.get_output_layout().data_padding); + + p.extract_and_remove(node); + }); + } + + //This loop tries fusing eltwise (sum) with deconvolution + itr = p.get_processing_order().begin(); + while (itr != p.get_processing_order().end()) + { + auto node_itr = itr++; + auto& node = (*node_itr); + + fuse_skip_layers(p, node); + } +} + +void prepare_conv_eltw_fusing::run(program_impl& p) +{ + std::list conv_nodes; + auto itr = p.get_processing_order().begin(); //note we need to use iterators since currently processed element can be removed + while (itr != p.get_processing_order().end()) + { + auto node_itr = itr++; + if ((*node_itr)->is_type()) + conv_nodes.push_back(*node_itr); + } + + //fuse conv + eltwise after activations + itr = conv_nodes.begin(); + while (itr != conv_nodes.end()) + { + auto node_itr = itr++; + auto& node = (*node_itr); + + fuse_conv_eltwise(p, node); + } +} + +void prepare_conv_eltw_read_write_opt::conv_eltwise_read_write_opt(program_impl& p, program_node* node) +{ + fused_conv_eltwise_node * fused_conv_eltw_node = static_cast(node); + program_node * second_input_node = &fused_conv_eltw_node->get_dependency(1); + // output layouts must match + if (fused_conv_eltw_node->get_output_layout() != second_input_node->get_output_layout()) // check whole layout + { + return; + } + + // buffer shared between primitives, if second input is mutable data, then we can reuse this memory + auto shared_buffer_mem = second_input_node->is_type() ? 
+        second_input_node->as<mutable_data>().get_attached_memory_ptr() : p.get_engine().allocate_memory(node->get_output_layout());
+
+    float zero = 0.0f;
+    layout dummy_layout(data_types::f32, format::bfyx, tensor(1, 1, 1, 1));
+
+    // this one is the first one to write data to
+    auto rw_output_prim0 = std::make_shared<mutable_data>(fused_conv_eltw_node->id() + "_RW_OPT_use", memory::attach(dummy_layout, &zero, 1));
+    // this one already expects data to be inside
+    auto rw_output_prim1 = std::make_shared<mutable_data>(fused_conv_eltw_node->id() + "_RW_OPT_reuse", memory::attach(dummy_layout, &zero, 1));
+
+    auto& rw_output_node0 = p.get_or_create(rw_output_prim0);
+    auto& rw_output_node1 = p.get_or_create(rw_output_prim1);
+
+    rw_output_node0.as<mutable_data>().attach_memory(*shared_buffer_mem, false);
+    rw_output_node1.as<mutable_data>().attach_memory(*shared_buffer_mem, false);
+
+    // add connection between second input node -> rw_output_node0 -> node
+    p.add_intermediate(rw_output_node0, *node, 1, true);
+    // replace other connections with rw_output_node0
+    auto itr = second_input_node->users.begin();
+    while (itr != second_input_node->users.end())
+    {
+        auto& usage = (*itr++);
+        if (usage->id() != rw_output_node0.id() && usage->id() != node->id())
+        {
+            usage->replace_dependency(*second_input_node, rw_output_node0);
+        }
+    }
+    // add connection between node -> rw_output_node1 -> the nodes after it
+    //first find the index in our first user's dependencies
+    size_t dep_idx = 0;
+    for (auto dep : (*(node->users.begin()))->dependencies)
+    {
+        if (dep->id() == node->id())
+            break;
+        dep_idx++;
+    }
+    p.add_intermediate(rw_output_node1, **(node->users.begin()), dep_idx, true);
+    // replace other connections with rw_output_node1
+    itr = node->users.begin();
+    while (itr != node->users.end())
+    {
+        auto& usage = (*itr++);
+        if (usage->id() != rw_output_node1.id() && usage->id() != node->id())
+        {
+            usage->replace_dependency(*node, rw_output_node1);
+        }
+    }
+    fused_conv_eltwise* prim = const_cast<fused_conv_eltwise*>((fused_conv_eltw_node->get_primitive().get()));
+    prim->second_input_in_output = true;
+}
+
+void prepare_conv_eltw_read_write_opt::run(program_impl& p)
+{
+    std::list<program_node*> fused_conv_eltw_nodes;
+    auto itr = p.get_processing_order().begin(); //note: we need to use iterators since the currently processed element can be removed
+    while (itr != p.get_processing_order().end())
+    {
+        auto node_itr = itr++;
+        if ((*node_itr)->is_type<fused_conv_eltwise>())
+            fused_conv_eltw_nodes.push_back(*node_itr);
+    }
+
+    //apply the read-write optimization to each fused conv + eltwise node
+    itr = fused_conv_eltw_nodes.begin();
+    while (itr != fused_conv_eltw_nodes.end())
+    {
+        auto node_itr = itr++;
+        auto& node = (*node_itr);
+
+        conv_eltwise_read_write_opt(p, node);
+    }
+}
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/propagate_constants.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/propagate_constants.cpp
new file mode 100644
index 0000000..3b7fd33
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/propagate_constants.cpp
@@ -0,0 +1,194 @@
+/*
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "pass_manager.h"
+#include "program_node.h"
+#include "engine_impl.h"
+#include "program_impl.h"
+#include "network_impl.h"
+#include "data_inst.h"
+
+
+using namespace cldnn;
+
+//ToDo: remove the friendship relation between program_node and program_impl
+void propagate_constants::run(program_impl& p)
+{
+    for (auto& node : p.get_processing_order())
+    {
+        if (node->is_constant())
+            handle_constant(p, *node);
+    }
+
+    auto&& to_replace = calculate(p.get_engine());
+
+    //remove all nodes which are no longer relevant, i.e. nodes which:
+    // 1. are constants, and
+    // 2. do not have a non-const user (so their data are not used during inference), and
+    // 3. are not marked as outputs.
+    // if a node has either a non-const user or is marked as an output, it should be replaced with cldnn::data rather than removed (see the next loop)
+    auto proc_itr = p.get_processing_order().begin();
+    while (proc_itr != p.get_processing_order().end())
+    {
+        auto& node = (*proc_itr++);
+        if (!node->is_constant())
+            continue;
+        if (has_non_const_user(*node) || (node->is_output() && !node->is_type<data>()))
+            continue;
+
+        auto& users = node->users;
+        auto& deps = node->dependencies;
+
+        for (size_t idx = 0; idx < deps.size(); idx++)
+        {
+            deps.at(idx)->users.remove(node);
+        }
+        deps.clear();
+
+        for (auto& usr : users) {
+            auto& usr_deps = usr->dependencies;
+            usr_deps.erase(std::remove(usr_deps.begin(), usr_deps.end(), node), usr_deps.end());
+        }
+        users.clear();
+
+        if (!node->is_output())
+        {
+            auto rem = p.remove_if_dangling(*node);
+            assert(rem && "Non-output constant node which has only constant users should have been removed during constants propagation pass");
+            (void)rem;
+        }
+    }
+
+    //replace all constant nodes which are relevant for inference (either used by a non-const user or marked as output) with recomputed cldnn::data
+    for (auto& cout : to_replace)
+    {
+        auto& id_to_replace = cout.first;
+
+        //TODO: do not use API primitives internally and get rid of this last 'cldnn::memory' internal usage
+        memory api_memory = details::memory_c_to_cpp_converter::convert(api_cast(cout.second.get()));
+        //the c-cpp converter does not retain since normally that is done inside the API-impl layer (cldnn.cpp), so we need to do it manually
+        cout.second->add_ref();
+
+        auto const_data = std::make_shared<data>("_cldnn_const_prop_" + id_to_replace, api_memory /* <<< REMOVE ME WHEN POSSIBLE */);
+        auto& new_node = p.get_or_create(const_data);
+        auto& curr_node = p.get_node(id_to_replace);
+
+        if (!curr_node.is_type())
+        {
+            auto curr_node_deps = curr_node.get_dependencies();
+            for (auto& dep : curr_node_deps)
+            {
+                auto dep_users = dep->get_users();
+                for (auto& dep_user : dep_users)
+                {
+                    if (dep_user == &curr_node)
+                        p.remove_connection(*dep, curr_node);
+                }
+            }
+        }
+
+        curr_node.dependencies.clear();
+        //remove all constant users (as they will be either removed or replaced by cldnn::data which does not have any dependencies)
+        curr_node.users.erase(
+            std::remove_if(curr_node.users.begin(), curr_node.users.end(), [](program_node* node) { return node->is_constant(); }),
+            curr_node.users.end()
+        );
+        p.replace(curr_node, new_node);
+    }
+}
+
+bool propagate_constants::has_non_const_user(program_node& node) const {
+    if (!node.is_constant()) return true;
+    for (auto &user : node.get_users())
+    {
+        if (!user->is_constant()) return true;
+    }
+    return false;
+}
+
+std::list<std::pair<primitive_id, memory_impl::ptr>> propagate_constants::calculate(engine_impl &engine)
+{
+    if (!has_non_trivial_constants)
+        return {};
+
+    build_options bo;
+    bo.set_option(build_option::optimize_data(false));
+    bo.set_option(build_option::outputs(const_outputs));
+    network_impl::ptr net = engine.build_network(nodes, bo, true);
+    for (auto& cin : const_inputs)
+        net->set_input_data(cin->id(), cin->get_attached_memory());
+
+    net->execute({});
+    net->reset_execution(true); //wait for computations to complete
+    auto outputs = net->get_outputs();
+
+    std::list<std::pair<primitive_id, memory_impl::ptr>> ret;
+    for (auto& out : outputs)
+        ret.push_back({ out->id(), &out->output_memory() });
+
+    return ret;
+}
+
+void propagate_constants::handle_constant(program_impl& prog, program_node& node)
+{
+    if (!node.is_type<data>())
+    {
+        add_constant(prog, node);
+        if (has_non_const_user(node))
+            const_outputs.push_back(node.id());
+    }
+}
+
+void propagate_constants::add_constant(program_impl& prog, program_node& node)
+{
+    if (node.is_type<data>())
+        return;
+    nodes.insert(prog.get_node_ptr(node.get_primitive()->id));
+    has_non_trivial_constants = true;
+
+    //if a node is either an endpoint or an output, always add it as an output
+    if (node.is_endpoint() || node.is_output())
+        const_outputs.push_back(node.id());
+
+    //if a non-trivial constant has a trivial input, add this input as an input for our network
+    add_deps_to_tpl(prog, node.get_dependencies());
+}
+
+void propagate_constants::add_deps_to_tpl(program_impl& prog, const std::vector<program_node*>& deps)
+{
+    /*
+    Nodes can share dependencies; if we already have the dep in tpl, don't add it again.
+    example:
+           C <--- shared dep
+          / \
+         /   \
+        A     B
+    */
+    for (auto& dep : deps)
+    {
+        if (dep->is_type<data>())
+        {
+            auto dep_ptr = prog.get_node_ptr(dep->get_primitive()->id);
+            if (nodes.find(dep_ptr) == nodes.end())
+            {
+                nodes.insert(prog.get_node_ptr(dep->get_primitive()->id));
+                const_inputs.push_back(&dep->as<data>());
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp
new file mode 100644
index 0000000..bc36609
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp
@@ -0,0 +1,92 @@
+/*
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "pass_manager.h"
+#include "program_helpers.h"
+
+using namespace cldnn;
+
+void remove_redundant_reorders::run(program_impl& p)
+{
+    auto itr = p.get_processing_order().begin(); //note: we need to use iterators since the currently processed element can be removed
+    while (itr != p.get_processing_order().end())
+    {
+        auto& node = (*itr++); //post-inc to avoid invalidation due to possible erase
+        if (!node->is_type<reorder>()) //only care about reorders
+            continue;
+
+        program_node* current_node = node;
+        std::vector<program_node*> r_nodes_to_remove;
+
+        auto optimize = true;
+        while (current_node)
+        {
+            auto& r_node = current_node->as<reorder>();
+            current_node = nullptr;
+
+            if (r_node.has_mean() || !r_node.get_primitive()->subtract_per_feature.empty() //do not optimize if mean or subtract values are present
+                || r_node.is_output()) //do not optimize when the reorder itself is an output
+            {
+                optimize = false;
+                break;
+            }
+
+            r_nodes_to_remove.push_back(&r_node);
+
+            if (r_node.get_dependency(0).is_type<reorder>() && r_node.get_dependencies().size() == 1 && r_node.get_users().size() == 1 && r_node.get_dependency(0).get_users().size() == 1)
+                current_node = &r_node.get_dependency(0);
+        }
+        if (!optimize)
+            continue;
+
+        assert(node->get_dependencies().size() == 1 && "reorder without mean should have exactly one dependency (input)");
+        auto& r_output = r_nodes_to_remove.front();
+        auto& r_input = r_nodes_to_remove.back()->get_dependency(0);
+        auto o_layout = r_output->get_output_layout();
+        auto i_layout = r_input.get_output_layout();
+
+        auto ident = program_helpers::are_layouts_identical(o_layout, i_layout);
+        if (!ident.second)
+            continue;
+
+        for (auto remove_reorder_node : r_nodes_to_remove)
+        {
+            auto& r_node = remove_reorder_node->as<reorder>();
+
+            if (ident.first && ident.second && r_node.is_output() && r_node.get_dependency(0).is_input()) //do not optimize when the reorder is an output and the layer before it is an input
+            {
+                optimize = false;
+                break;
+            }
+        }
+        if (!optimize)
+            continue;
+
+        auto rem_itr = r_nodes_to_remove.begin();
+        while (rem_itr != r_nodes_to_remove.end())
+        {
+            auto remove_reorder_node = *rem_itr++;
+            auto& r_node = remove_reorder_node->as<reorder>();
+            //mark as optimized
+            r_node.can_be_optimized(true);
+            r_node.requires_reinterpret(!ident.first);
+            if (ident.first) //no need for a reshape
+                p.extract_and_remove(r_node); //try to remove if possible (with respect to r_node not being marked as output)
+        }
+    }
+}
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/reorder_inputs.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/reorder_inputs.cpp
new file mode 100644
index 0000000..7570881
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/reorder_inputs.cpp
@@ -0,0 +1,269 @@
+/*
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#include "api/CPP/proposal.hpp"
+#include "api/CPP/roi_pooling.hpp"
+#include "api/CPP/reorg_yolo.hpp"
+#include "api/CPP/eltwise.hpp"
+#include "upsampling_inst.h"
+#include "pass_manager.h"
+#include "program_node.h"
+#include "layout_optimizer.h"
+#include "program_impl.h"
+#include "program_helpers.h"
+
+using namespace cldnn;
+
+//ToDo: remove the friendship relation from program_impl
+
+reorder_inputs::reorder_inputs(layout_optimizer& lo_ref) : base_pass("reorder_inputs"), _lo(lo_ref) {}
+
+void reorder_inputs::run(program_impl& p) {
+    run(p, _lo);
+}
+
+void reorder_inputs::run(program_impl& p, layout_optimizer& lo)
+{
+    //first pass to set the layout optimization_attributes for the topology
+    for (auto& node : p.get_processing_order())
+    {
+        auto& prim = *node;
+        if (prim.type() == cldnn::convolution::type_id())
+        {
+            if (prim.as<convolution>().get_primitive()->split() > 1)
+                lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::splitted_convolution, 1);
+        }
+
+        //list of layers that do not support yxfb or perform worse than in bfyx
+        if (prim.type() == cldnn::detection_output::type_id() || prim.type() == cldnn::proposal::type_id() ||
+            prim.type() == cldnn::roi_pooling::type_id() || prim.type() == cldnn::deconvolution::type_id() ||
+            prim.type() == cldnn::upsampling::type_id() || prim.type() == cldnn::reorg_yolo::type_id())
+            lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::bfyx_only_layer, 1);
+    }
+
+    const auto reorder_input = [&p, &lo](typed_program_node<convolution>& conv_node)
+    {
+        auto conv_prim = conv_node.get_primitive();
+        auto& input_node = conv_node.get_dependency(0);
+        auto&& weights_layout = conv_node.weights(0).get_output_layout();
+        auto&& input_layout = input_node.get_output_layout();
+
+        std::shared_ptr<reorder> new_input = nullptr;
+
+        if (input_node.type() == reorder::type_id()) //the convolution's input is a reorder
+        {
+            auto reorder_prim = input_node.as<reorder>().typed_desc();
+            auto& reorder_input = input_node.get_dependency(0);
+            auto reorder_layout = input_node.get_output_layout();
+            reorder_layout.data_type = *reorder_prim->output_data_type;
+            new_input = lo.get_reorder(
+                reorder_layout,
+                reorder_prim->id,
+                layout_optimizer::data_type::input,
+                conv_node,
+                weights_layout).first;
+
+            auto reorder_removed = false;
+            if (new_input && new_input->output_format != format::winograd_2x3_s1_data && new_input->output_format != format::bf8_xy16 && new_input->output_format != format::byxf) //the output format is not optimal
+            {
+                auto reorder_input_layout = reorder_input.get_output_layout();
+
+                auto opt_layout = layout(*new_input->output_data_type, new_input->output_format, reorder_input_layout.size);
+                if (reorder_input_layout == opt_layout) //the reorder 'breaks' the optimal format
+                {
+                    if (reorder_prim->subtract_per_feature.empty() &&
+                        reorder_prim->mean.empty() &&
+                        !reorder_prim->output_padding) //just a plain reorder
+                    {
+                        conv_node.replace_dependency(0, reorder_input);
+                        if (input_node.get_users().size() == 0 && !input_node.is_output())
+                        {
+                            reorder_removed = p.extract_and_remove(input_node);
+                        }
+                        new_input = nullptr;
+                    }
+                    else //change the reorder's output layout
+                    {
+                        reorder_prim->output_format = opt_layout.format;
+                        reorder_prim->output_data_type = opt_layout.data_type;
+                        new_input = nullptr;
+                    }
+                }
+                else //the current reorder gives a bad output, simply change it
+                {
+                    reorder_prim->output_format = opt_layout.format;
+                    reorder_prim->output_data_type =
+                        opt_layout.data_type;
+                    new_input = nullptr;
+                }
+            }
+
+            if (!reorder_removed)
+                input_node.recalc_output_layout();
+            else
+                conv_node.recalc_output_layout();
+        }
+        else
+        {
+            new_input = lo.get_reorder(
+                input_node.get_output_layout(),
+                input_node.id(),
+                layout_optimizer::data_type::input,
+                conv_node,
+                weights_layout).first;
+        }
+
+        if (new_input && new_input->output_format == format::winograd_2x3_s1_data)
+        {
+            auto lower_size = (conv_prim->input_offset.negate() + input_layout.size);
+
+            tensor upper_input_padding = tensor{ 0 };
+            upper_input_padding.spatial[0] = (2 - (lower_size.spatial[0] % 2)) % 2; //winograd conv requires the input's x to be in the form 4 + 2n, with the restriction that x >= 3; we can shorten it to x % 2 == 0
+            upper_input_padding.spatial[1] = (8 - ((lower_size.spatial[1] - 2) % 8)) % 8; //for y, y - 2 % 8 == 0 must hold
+
+            p.apply_needed_padding(conv_node, input_node, padding{ conv_prim->input_offset.negate().sizes(), upper_input_padding.sizes() });
+
+            auto winograd_output = std::make_shared<reorder>("_winograd_" + conv_node.id(), conv_node.id(), input_layout.format,
+                input_layout.data_type, std::vector<float>{}, cldnn_reorder_mean_mode::mean_subtract, conv_node.output_layout.data_padding);
+            conv_node.output_layout.data_padding = padding{};
+            program_node& back_node = p.get_or_create(winograd_output);
+            p.get_processing_order().insert_next(&conv_node, &back_node);
+
+            auto bias_term = conv_node.bias_term();
+            //create an additional eltwise node after the reorder to compute the bias
+            if (bias_term)
+            {
+                auto& bias_node = conv_node.get_dependency(2);
+                std::vector<primitive_id> inputs = { back_node.id(), bias_node.id() };
+                auto winograd_output_biases = std::make_shared<eltwise>(back_node.id() + "_bias", inputs,
+                    cldnn::eltwise_mode::sum, conv_prim->with_activation, conv_prim->activation_negative_slope,
+                    back_node.get_output_layout().data_padding);
+                back_node.get_output_layout().data_padding = padding{};
+                auto& back_bias_node = p.get_or_create(winograd_output_biases);
+                p.get_processing_order().insert_next(&back_node, &back_bias_node);
+                p.replace_all_usages(back_node, back_bias_node);
+                p.add_connection(back_node, back_bias_node);
+                p.add_connection(bias_node, back_bias_node);
+                conv_node.invalidate_users();
+                p.replace_all_usages(conv_node, back_bias_node);
+            }
+
+            if (conv_prim->with_activation)
+            {
+                conv_node.typed_desc()->with_activation = false;
+                if (!bias_term)
+                    back_node.set_fused_activation(activation_relu_negative_slope, cldnn_activation_additional_params_t{ conv_prim->activation_negative_slope });
+            }
+
+            if (!bias_term)
+            {
+                conv_node.invalidate_users();
+                p.replace_all_usages(conv_node, back_node);
+            }
+            p.add_connection(conv_node, back_node);
+
+            auto& r_node = p.get_or_create(new_input);
+            r_node.as<reorder>().set_input_offset(conv_prim->input_offset);
+
+            if (!bias_term)
+            {
+                p.swap_names(conv_node, back_node);
+                if (conv_node.is_output())
+                {
+                    conv_node.set_output(false);
+                    back_node.set_output(true);
+                    for (auto& output : p.get_outputs())
+                    {
+                        if (output == &conv_node)
+                        {
+                            output = &back_node;
+                            break;
+                        }
+                    }
+                }
+            }
+            else
+            {
+                conv_node.remove_dependency(2);
+                auto& back_bias_node = *(p.nodes_map.find(back_node.id() + "_bias")->second);
+                p.swap_names(conv_node, back_bias_node);
+                if (conv_node.is_output())
+                {
+                    conv_node.set_output(false);
+                    back_bias_node.set_output(true);
+                    for (auto& output : p.get_outputs())
+                    {
+                        if (output == &conv_node)
+                        {
+                            output = &back_bias_node;
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+
+        if (new_input && (new_input->output_format == format::bf8_xy16 ||
+            new_input->output_format == format::byxf))
+        {
+            auto conv1x1_output = std::make_shared<reorder>("_conv1x1_reorder_back_" + conv_node.id(), conv_node.id(), input_layout.format, input_layout.data_type);
+            auto& back_node = p.get_or_create(conv1x1_output);
+            p.get_processing_order().insert_next(&conv_node, &back_node);
+            conv_node.invalidate_users();
+            p.replace_all_usages(conv_node, back_node);
+            p.add_connection(conv_node, back_node);
+        }
+
+        if (new_input)
+        {
+            auto& r_node = p.get_or_create(new_input);
+            p.add_intermediate(r_node, conv_node, 0, r_node.get_dependencies().empty());
+            conv_node.recalc_output_layout();
+        }
+    };
+
+    const auto reorder_input_detection_output = [&p, &lo](typed_program_node<detection_output>& detection_output_node)
+    {
+        auto detection_output_prim = detection_output_node.get_primitive();
+
+        for (size_t i = 0; i < detection_output_node.get_dependencies().size(); i++)
+        {
+            auto& input = detection_output_node.get_dependency(i);
+            std::shared_ptr<reorder> new_input = lo.get_reorder(
+                input.get_output_layout(),
+                input.id(),
+                layout_optimizer::data_type::input,
+                detection_output_node,
+                layout{ data_types::f32, format::bfyx, tensor{} }).first;
+
+            if (new_input)
+            {
+                p.add_intermediate(new_input, detection_output_node, i);
+            }
+        }
+    };
+
+    for (auto& prim : p.get_processing_order())
+    {
+        //there's an assumption that only convolution will take data/input_layout as input
+        //an exception to that rule would be a convolution which takes a reorder as input - see reorder_input above
+        program_helpers::do_for_types<convolution, detection_output>(*prim,
+            reorder_input,                  //case for convolution
+            reorder_input_detection_output  //case for detection-output
+        );
+    }
+}
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/trim_to_outputs.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/trim_to_outputs.cpp
new file mode 100644
index 0000000..f9ff2f6
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/trim_to_outputs.cpp
@@ -0,0 +1,76 @@
+/*
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "pass_manager.h"
+
+//ToDo: remove these includes together with the appropriate code below once we have support for multiple outputs of a primitive
+#include "batch_norm_inst.h"
+#include "max_unpooling_inst.h"
+#include "pooling_inst.h"
+
+using namespace cldnn;
+
+//This pass optimizes out nodes which have no impact on outputs
+void trim_to_outputs::run(program_impl& p)
+{
+    const size_t actual_nodes = p.get_processing_order().size();
+    if (!actual_nodes) //degenerated case, but it can happen
+        return;
+
+    if (p.get_outputs().size() == actual_nodes)
+        return;
+
+    //do a backward bfs starting from all outputs
+    std::list<const std::vector<program_node*>*> stack = { &(p.get_outputs()) };
+
+    std::vector<program_node*> special_nodes;
+    for (auto& node : p.get_processing_order())
+    {
+        if (node->is_type<input_layout>() ||  //an input layout may become disconnected during prior box calculations, so it may not have been marked at this point, but we don't want to remove it
+            node->is_type<batch_norm>() ||    // ToDo: remove this after support for multi-outputs in primitives is implemented.
+            node->is_type<max_unpooling>() ||
+            (node->is_type<pooling>() && node->as<pooling>().get_primitive()->mode == pooling_mode::max_with_argmax))
+            special_nodes.push_back(node);
+    }
+    stack.push_back(&special_nodes);
+
+    while (!stack.empty())
+    {
+        auto nodes_list = stack.front();
+        stack.pop_front();
+
+        for (auto& node : *nodes_list)
+        {
+            if (!node->is_marked())
+            {
+                node->mark();
+                if (!node->get_dependencies().empty())
+                    stack.push_back(&node->get_dependencies());
+            }
+        }
+    }
+
+    //all unmarked nodes should be removed
+    std::list<program_node*> to_rem;
+    for (auto& node : p.get_processing_order())
+    {
+        if (!node->is_marked())
+            to_rem.push_back(node);
+    }
+    p.remove_nodes(to_rem);
+}
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/src/include/activation_inst.h b/inference-engine/thirdparty/clDNN/src/include/activation_inst.h
index 80a5647..7ff10cf 100644
--- a/inference-engine/thirdparty/clDNN/src/include/activation_inst.h
+++ b/inference-engine/thirdparty/clDNN/src/include/activation_inst.h
@@ -26,6 +26,7 @@ template <>
 struct typed_program_node<activation> : public typed_program_node_base<activation>
 {
     using parent = typed_program_node_base<activation>;
+    typed_program_node(const std::shared_ptr<activation> prim, program_impl& prog) : parent(prim, prog) { support_padding(true); }
 
 public:
     using parent::parent;
diff --git a/inference-engine/thirdparty/clDNN/src/include/apply_adam_inst.h b/inference-engine/thirdparty/clDNN/src/include/apply_adam_inst.h
index f9c7358..0ca4cda 100644
--- a/inference-engine/thirdparty/clDNN/src/include/apply_adam_inst.h
+++ b/inference-engine/thirdparty/clDNN/src/include/apply_adam_inst.h
@@ -25,6 +25,7 @@ namespace cldnn
 template <>
 struct typed_program_node<apply_adam> : public typed_program_node_base<apply_adam>
 {
+    typed_program_node(const std::shared_ptr<apply_adam> prim, program_impl& prog);
     using parent = typed_program_node_base<apply_adam>;
 
 public:
diff --git a/inference-engine/thirdparty/clDNN/src/include/batch_norm_inst.h b/inference-engine/thirdparty/clDNN/src/include/batch_norm_inst.h
index 9569527..175dc8e 100644
--- a/inference-engine/thirdparty/clDNN/src/include/batch_norm_inst.h
+++ b/inference-engine/thirdparty/clDNN/src/include/batch_norm_inst.h
@@ -18,6 +18,7 @@
 #pragma once
 #include "api/CPP/batch_norm.hpp"
 #include "primitive_inst.h"
+#include "mutable_data_inst.h"
 
 namespace cldnn
 {
@@ -33,10 +34,34 @@ public:
 
     program_node& input() const { return get_dependency(0); }
     program_node& mean() const { return get_dependency(1); }
     program_node& variance() const { return get_dependency(2); }
-    program_node& inv_variance() const { return get_dependency(1); };
+    program_node& scale() const
+    {
+        if (get_dependencies().size() >= 5)
+            return get_dependency(3);
+        else
+            return get_dependency(1);
+    }
+    program_node& shift() const
+    {
+        if (get_dependencies().size() >= 5)
+            return get_dependency(4);
+        else
+            return get_dependency(2);
+    }
+    program_node& inv_variance() const
+    {
+        if (get_dependencies().size() == 2)
+            return get_dependency(1);
+        else if (get_dependencies().size() == 6)
+            return get_dependency(5);
+        else
+            return get_dependency(3);
+    };
 
     bool variance_term() const { return !get_primitive()->variance.empty(); }
     bool use_global_stats() const { return !get_primitive()->mean.empty() && !get_primitive()->variance.empty(); };
+    bool use_scale_shift() const { return !get_primitive()->scale.empty() && !get_primitive()->shift.empty(); };
     bool forwad_pass() const { return !get_primitive()->inv_variance.empty(); };
+    bool calc_mean_var() const { return (use_global_stats() && mean().is_type<mutable_data>() && variance().is_type<mutable_data>()); };
 };
 
@@ -56,9 +81,33 @@ public:
 
     memory_impl& mean_memory() const { return dep_memory(1); }
     memory_impl& variance_memory() const { return dep_memory(2); }
-    memory_impl& inv_variance_memory() const { return dep_memory(1); };
+    memory_impl& scale_memory() const
+    {
+        if (dependencies().size() >= 5)
+            return dep_memory(3);
+        else
+            return dep_memory(1);
+    }
+    memory_impl& shift_memory() const
+    {
+        if (dependencies().size() >= 5)
+            return dep_memory(4);
+        else
+            return dep_memory(2);
+    }
+    memory_impl& inv_variance_memory() const
+    {
+        if (dependencies().size() == 2)
+            return dep_memory(1);
+        else if (dependencies().size() == 6)
+            return dep_memory(5);
+        else
+            return dep_memory(3);
+    };
 
     bool use_global_stats() const { return !argument.mean.empty() && !argument.variance.empty(); };
+    bool use_scale_shift() const { return !argument.scale.empty() && !argument.shift.empty(); };
     bool forwad_pass() const { return !argument.inv_variance.empty(); };
+    bool calc_mean_var() const { return node.calc_mean_var(); };
 };
 
 using batch_norm_inst = typed_primitive_inst<batch_norm>;
diff --git a/inference-engine/thirdparty/clDNN/src/include/border_inst.h b/inference-engine/thirdparty/clDNN/src/include/border_inst.h
index ff3b28b..1190bfc 100644
--- a/inference-engine/thirdparty/clDNN/src/include/border_inst.h
+++ b/inference-engine/thirdparty/clDNN/src/include/border_inst.h
@@ -28,16 +28,15 @@ struct typed_program_node<border> : typed_program_node_base<border>
 private:
     using parent = typed_program_node_base<border>;
-
 public:
     using parent::parent;
+    typed_program_node(const std::shared_ptr<border> prim, program_impl& prog) : parent(prim, prog) { support_padding(true); }
 
     program_node& input() const { return get_dependency(0); }
 };
 
 using border_node = typed_program_node<border>;
-
 template <>
 class typed_primitive_inst<border> : public typed_primitive_inst_base<border>
 {
diff --git a/inference-engine/thirdparty/clDNN/src/include/broadcast_inst.h b/inference-engine/thirdparty/clDNN/src/include/broadcast_inst.h
index f10b562..0cc920e 100644
--- a/inference-engine/thirdparty/clDNN/src/include/broadcast_inst.h
+++ b/inference-engine/thirdparty/clDNN/src/include/broadcast_inst.h
@@ -28,10 +28,10 @@ struct typed_program_node<broadcast> : typed_program_node_base<broadcast>
 private:
     using parent = typed_program_node_base<broadcast>;
-
 public:
     using parent::parent;
+    typed_program_node(const std::shared_ptr<broadcast> prim, program_impl& prog) : parent(prim, prog) { support_padding(true); }
 
     program_node& input() const { return get_dependency(0); }
 };
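
The batch_norm hunk above packs the optional scale, shift, and inv_variance inputs into the same dependency vector as input, mean, and variance, and the accessors decode the layout purely from the dependency count. The standalone sketch below mirrors that convention so the index arithmetic can be sanity-checked in isolation; it is plain C++ that compiles on its own, and node_deps with its integer stand-ins is an illustrative assumption, not a clDNN type.

// Minimal sketch of the dependency-index convention used by the batch_norm
// accessors above. node_deps is hypothetical; ints model program_node* slots.
#include <cassert>
#include <vector>

struct node_deps
{
    // full form: {input, mean, variance, scale, shift, inv_variance}
    std::vector<int> deps;

    int scale() const { return deps.size() >= 5 ? deps[3] : deps[1]; }
    int shift() const { return deps.size() >= 5 ? deps[4] : deps[2]; }
    int inv_variance() const
    {
        if (deps.size() == 2) return deps[1];  // {input, inv_variance}
        if (deps.size() == 6) return deps[5];  // full form incl. scale/shift
        return deps[3];                        // {input, mean, variance, inv_variance}
    }
};

int main()
{
    node_deps full{ { 0, 1, 2, 3, 4, 5 } };    // input, mean, variance, scale, shift, inv_variance
    assert(full.scale() == 3 && full.shift() == 4 && full.inv_variance() == 5);

    node_deps forward{ { 0, 1 } };             // forward pass: only input + inv_variance
    assert(forward.inv_variance() == 1);
    return 0;
}

The convention only works because each legal dependency count (2 through 6) maps to exactly one combination of optional inputs; adding another optional input would make the counts ambiguous, which is presumably why the accessors branch on size rather than on named flags.
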
diff --git a/inference-engine/thirdparty/clDNN/src/include/concatenation_inst.h b/inference-engine/thirdparty/clDNN/src/include/concatenation_inst.h index 2ef3b1b..dfd0dd5 100644 --- a/inference-engine/thirdparty/clDNN/src/include/concatenation_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/concatenation_inst.h @@ -26,6 +26,7 @@ template <> struct typed_program_node : public typed_program_node_base { using parent = typed_program_node_base; + typed_program_node(const std::shared_ptr prim, program_impl& prog) : parent(prim, prog) { support_padding(true); } public: using parent::parent; diff --git a/inference-engine/thirdparty/clDNN/src/include/condition_inst.h b/inference-engine/thirdparty/clDNN/src/include/condition_inst.h new file mode 100644 index 0000000..1d8c1d6 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/include/condition_inst.h @@ -0,0 +1,127 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once + +#include + +#include "network_impl.h" +#include "primitive_inst.h" + +namespace cldnn +{ +namespace details +{ + +} + +template <> +struct typed_program_node : public typed_program_node_base +{ +private: + using parent = typed_program_node_base; + + class branch + { + public: + branch(topology_impl& tpl) : _topology(tpl) {} + + void set(const program_node& node) + { + add_or_change_input_layout(node); + _program = node.get_program().get_engine().build_program(_topology, node.get_program().get_options(), true); //rebuild program + } + program_impl::ptr get() const { return _program; } + + private: + topology_impl & _topology; + program_impl::ptr _program = nullptr; + + void add_or_change_input_layout(const program_node& node) + { + auto layout = node.get_dependency(0).get_output_layout(); + auto input_id = node.as().result_id(); + if (_program == nullptr) //if first run, create input_layout + { + _topology.add(std::make_shared(input_id, layout)); + for (auto& prim : _topology.get_primitives()) + { + for (auto& inp : prim.second->input) + { + if (inp == node.id()) + inp = input_id; + } + } + } + else + { + _topology.change_input_layout(input_id, layout); + } + } + }; + +public: + using parent::parent; + + typed_program_node(std::shared_ptr prim, program_impl& prog) + : parent(prim, prog) + , _branch_true(*api_cast(this->get_primitive()->topology_true.get())) + , _branch_false(*api_cast(this->get_primitive()->topology_false.get())) + { + } + + program_node& input() const { return get_dependency(0); } + program_node& compare() const { return get_dependency(1); } + cond_functions func() const { return get_primitive()->function; } + tensor offset() const { return get_primitive()->offset; } + void set_branches() const + { + _branch_true.set(*this); + _branch_false.set(*this); + } + program_impl::ptr get_branch_true() const { return _branch_true.get(); } + program_impl::ptr get_branch_false() const{ return 
_branch_false.get(); } + primitive_id result_id() const { return id() + ":result"; } + +private: + mutable branch _branch_true; + mutable branch _branch_false; +}; + +using condition_node = typed_program_node; + + +template <> +class typed_primitive_inst : public typed_primitive_inst_base +{ + using parent = typed_primitive_inst_base; + +public: + static layout calc_output_layout(condition_node const& node); + static std::string to_string(condition_node const& node); + typed_primitive_inst(network_impl& network, condition_node const& node); + + memory_impl& input_memory() const { return dep_memory(0); } + memory_impl& compare_memory() const { return dep_memory(1); } + network_impl::ptr get_net_true() const { return _net_true; } + network_impl::ptr get_net_false() const { return _net_false; } + primitive_id result_id() const { return node.result_id(); } +private: + network_impl::ptr _net_true; + network_impl::ptr _net_false; +}; + +using condition_inst = typed_primitive_inst; +} diff --git a/inference-engine/thirdparty/clDNN/src/include/constants_propagator.h b/inference-engine/thirdparty/clDNN/src/include/constants_propagator.h deleted file mode 100644 index 7b402f3..0000000 --- a/inference-engine/thirdparty/clDNN/src/include/constants_propagator.h +++ /dev/null @@ -1,48 +0,0 @@ -/* -// Copyright (c) 2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// -#pragma once - -#include "program_impl.h" -#include "data_inst.h" - -namespace cldnn -{ - -class constants_propagator -{ -public: - constants_propagator(program_impl::ptr program); - - void visit_node(program_node& node); - - std::list> calculate(); - -private: - program_impl::ptr prog; - topology_impl tpl; - std::list*> const_inputs; - std::vector const_outputs; - bool has_non_trivial_constants = false; - - void handle_constant(program_node& node); - void add_constant(program_node& node); - void add_deps_to_tpl(const std::vector& node); - bool is_already_in_tpl(const primitive_id& id); -}; - -} diff --git a/inference-engine/thirdparty/clDNN/src/include/contract_inst.h b/inference-engine/thirdparty/clDNN/src/include/contract_inst.h new file mode 100644 index 0000000..bc783bc --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/include/contract_inst.h @@ -0,0 +1,53 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once + +#include + +#include "primitive_inst.h" + + +namespace cldnn +{ + template <> + struct typed_program_node : typed_program_node_base + { + private: + using parent = typed_program_node_base; + + public: + using parent::parent; + + typed_program_node(const std::shared_ptr prim, program_impl& prog) : parent(prim, prog) { support_padding(true); } + program_node& input() const { return get_dependency(0); } + }; + + using contract_node = typed_program_node; + + + template <> + class typed_primitive_inst : public typed_primitive_inst_base + { + using parent = typed_primitive_inst_base; + + public: + static layout calc_output_layout(contract_node const& node); + static std::string to_string(contract_node const& node); + typed_primitive_inst(network_impl& network, contract_node const& node); + }; + + using contract_inst = typed_primitive_inst; +} diff --git a/inference-engine/thirdparty/clDNN/src/include/convolution_grad_weights_inst.h b/inference-engine/thirdparty/clDNN/src/include/convolution_grad_weights_inst.h index ed32f0c..cd1571d 100644 --- a/inference-engine/thirdparty/clDNN/src/include/convolution_grad_weights_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/convolution_grad_weights_inst.h @@ -90,6 +90,11 @@ public: return false; } + bool output_grad_w() const + { + return get_primitive()->output_grad_w; + } + private: int32_t split; bool depthwise_sep_opt; @@ -165,6 +170,11 @@ public: else return false; } + + bool output_grad_w() const + { + return argument.output_grad_w; + } }; using convolution_grad_weights_inst = typed_primitive_inst; diff --git a/inference-engine/thirdparty/clDNN/src/include/convolution_inst.h b/inference-engine/thirdparty/clDNN/src/include/convolution_inst.h index b47e09f..8366839 100644 --- a/inference-engine/thirdparty/clDNN/src/include/convolution_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/convolution_inst.h @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2018 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -37,7 +37,9 @@ public: , transposed(false) , input_qf(this->get_primitive()->input_quantization_factor) , output_qf(this->get_primitive()->output_quantization_factor) + , groups(this->get_primitive()->groups) { + support_padding(true); } void set_split(int32_t node_split) { split = node_split; } @@ -49,6 +51,9 @@ public: void set_transposed(bool node_transposed) { transposed = node_transposed; } bool get_transposed() const { return transposed; } + void set_groups(uint32_t node_groups) { groups = node_groups; } + uint32_t get_groups() const { return groups; } + program_node& input() const { return get_dependency(0); } program_node& weights(size_t idx = 0) const @@ -107,6 +112,7 @@ private: bool transposed; float input_qf; float output_qf; + uint32_t groups; }; using convolution_node = typed_program_node; @@ -125,34 +131,50 @@ public: memory_impl& weights_memory(size_t index) const { - if (static_cast(index) >= node.get_split()) - throw std::range_error("weights offset too big"); - - return dep_memory(1 + index); + if (node.get_groups() == 1) { + if (static_cast(index) >= node.get_split()) + throw std::range_error("weights offset too big"); + return dep_memory(1 + index); + } + else { // all weights are in one buffer + return dep_memory(1); + } } memory_impl& bias_memory(size_t index) const { - if (static_cast(index) >= node.get_split()) - throw std::range_error("bias offset too big"); - - return dep_memory(1 + node.get_split() + index); + if (node.get_groups() == 1) { + if (static_cast(index) >= node.get_split()) + throw std::range_error("bias offset too big"); + return dep_memory(1 + node.get_split() + index); + } + else { // all bias are in one buffer + return dep_memory(2); + } } memory_impl& weights_quantization_factors_memory(size_t index) const { - if (static_cast(index) >= node.get_split()) - throw std::range_error("quantization factors offset too big"); - - return dep_memory(1 + 2*node.get_split() + index); + if (node.get_groups() == 1) { + if (static_cast(index) >= node.get_split()) + throw std::range_error("quantization factors offset too big"); + return dep_memory(1 + 2 * node.get_split() + index); + } + else { // all quantization_factors are in one buffer + return dep_memory(3); + }; } memory_impl& output_calibration_factors_memory(size_t index) const { - if (static_cast(index) >= node.get_split()) - throw std::range_error("quantization factors offset too big"); - - return dep_memory(1 + 3 * node.get_split() + index); + if (node.get_groups() == 1) { + if (static_cast(index) >= node.get_split()) + throw std::range_error("quantization factors offset too big"); + return dep_memory(1 + 3 * node.get_split() + index); + } + else { // all calibration_factors are in one buffer + return dep_memory(4); + } } bool bias_term() const diff --git a/inference-engine/thirdparty/clDNN/src/include/crop_inst.h b/inference-engine/thirdparty/clDNN/src/include/crop_inst.h index ef4260f..d845aac 100644 --- a/inference-engine/thirdparty/clDNN/src/include/crop_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/crop_inst.h @@ -23,13 +23,15 @@ namespace cldnn { template <> -class typed_program_node : public typed_program_node_base +struct typed_program_node : public typed_program_node_base { +private: using parent = typed_program_node_base; public: using parent::parent; + typed_program_node(const std::shared_ptr prim, program_impl& prog) : parent(prim, prog) { support_padding(true); } program_node& input() const { return get_dependency(0); } }; diff --git 
a/inference-engine/thirdparty/clDNN/src/include/custom_gpu_primitive_inst.h b/inference-engine/thirdparty/clDNN/src/include/custom_gpu_primitive_inst.h index 377be1d..0e19a23 100644 --- a/inference-engine/thirdparty/clDNN/src/include/custom_gpu_primitive_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/custom_gpu_primitive_inst.h @@ -42,6 +42,9 @@ class typed_primitive_inst : public typed_primitive_inst_b public: static layout calc_output_layout(custom_gpu_primitive_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for " + "custom_gpu_primitive_node!"); layout output_layout = node.get_primitive()->output_layout; // if the output layout format was set to any, it means the layer output format will be the same as the first input diff --git a/inference-engine/thirdparty/clDNN/src/include/deconvolution_inst.h b/inference-engine/thirdparty/clDNN/src/include/deconvolution_inst.h index a2e1516..adfe356 100644 --- a/inference-engine/thirdparty/clDNN/src/include/deconvolution_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/deconvolution_inst.h @@ -32,7 +32,9 @@ public: : parent(prim, prog) , split(this->get_primitive()->split()) , depthwise_sep_opt(false) + , groups(this->get_primitive()->groups) { + support_padding(true); } @@ -42,6 +44,9 @@ public: void set_depthwise_sep_opt(bool node_depthwise_sep_opt) { depthwise_sep_opt = node_depthwise_sep_opt; } bool get_depthwise_sep_opt() const { return depthwise_sep_opt; } + void set_groups(uint32_t node_groups) { groups = node_groups; } + uint32_t get_groups() const { return groups; } + program_node& input() const { return get_dependency(0); } program_node& weights(size_t idx = 0) const @@ -73,21 +78,22 @@ public: if (static_cast(idx) > 0) throw std::range_error("Only one input for fused sum is supported"); - int d_idx = 1 + this->get_split() + idx; + size_t d_idx = 1 + this->get_split() + idx; d_idx += bias_term() ? this->get_split() : 0; return get_dependency(d_idx); } bool has_fused_sum() const { - int d_idx = 1 + this->get_split(); + size_t d_idx = 1 + this->get_split(); d_idx += bias_term() ? 
this->get_split() : 0; - return static_cast(dependencies.size()) == (d_idx + 1); + return dependencies.size() == (d_idx + 1); } private: int32_t split; bool depthwise_sep_opt; + uint32_t groups; }; using deconvolution_node = typed_program_node; @@ -106,21 +112,28 @@ public: memory_impl& weights_memory(size_t index) const { - if (static_cast(index) >= node.get_split()) - throw std::range_error("weights offset too big"); - - return dep_memory(1 + index); + if (node.get_groups() == 1) { + if (static_cast(index) >= node.get_split()) + throw std::range_error("weights offset too big"); + return dep_memory(1 + index); + } + else { // all weights are in one buffer + return dep_memory(1); + } } memory_impl& bias_memory(size_t index) const { - if (argument.bias.size() == 0 && static_cast(index) >= node.get_split()) - throw std::range_error("no bias data"); - - if (static_cast(index) > node.get_split()) - throw std::range_error("bias offset too big"); - - return dep_memory(1 + node.get_split() + index); + if (node.get_groups() == 1) { + if (argument.bias.size() == 0 && static_cast(index) >= node.get_split()) + throw std::range_error("no bias data"); + if (static_cast(index) > node.get_split()) + throw std::range_error("bias offset too big"); + return dep_memory(1 + node.get_split() + index); + } + else { // all bias are in one buffer + return dep_memory(2); + } } bool bias_term() const diff --git a/inference-engine/thirdparty/clDNN/src/include/depth_to_space_inst.h b/inference-engine/thirdparty/clDNN/src/include/depth_to_space_inst.h new file mode 100644 index 0000000..5dda8d4 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/include/depth_to_space_inst.h @@ -0,0 +1,51 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include "api/CPP/depth_to_space.hpp" +#include "primitive_inst.h" + +namespace cldnn +{ +template <> +struct typed_program_node : public typed_program_node_base +{ + using parent = typed_program_node_base; + +public: + using parent::parent; + + program_node& input(size_t index = 0) const { return get_dependency(index); } +}; + +using depth_to_space_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base +{ + using parent = typed_primitive_inst_base; + +public: + static layout calc_output_layout(depth_to_space_node const& node); + static std::string to_string(depth_to_space_node const& node); + +public: + typed_primitive_inst(network_impl& network, depth_to_space_node const& desc); +}; + +using depth_to_space_inst = typed_primitive_inst; +} diff --git a/inference-engine/thirdparty/clDNN/src/include/detection_output_inst.h b/inference-engine/thirdparty/clDNN/src/include/detection_output_inst.h index f918b6d..d1d24a7 100644 --- a/inference-engine/thirdparty/clDNN/src/include/detection_output_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/detection_output_inst.h @@ -34,6 +34,7 @@ class typed_program_node : public typed_program_node_base; +template <> +class typed_program_node : public typed_program_node_base +{ + using parent = typed_program_node_base; + +public: + using parent::parent; + + program_node& input() const { return get_dependency(0); } +}; + +using detection_output_sort_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base +{ + using parent = typed_primitive_inst_base; + +public: + static layout calc_output_layout(detection_output_sort_node const& node); + static std::string to_string(detection_output_sort_node const& node); + +public: + typed_primitive_inst(network_impl& network, detection_output_sort_node const& node); +}; + +using detection_output_sort_inst = typed_primitive_inst; + +namespace gpu { + primitive_impl* runDetectOutCpu(const detection_output_node& arg); + primitive_impl* runDetectOutGpu(const detection_output_node& arg, kernel_selector::KernelData kernel); + primitive_impl* runDetectOutSortGpu(const detection_output_sort_node& arg, kernel_selector::KernelData kernel); } + +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/include/eltwise_inst.h b/inference-engine/thirdparty/clDNN/src/include/eltwise_inst.h index f6d8f61..7b6e6cf 100644 --- a/inference-engine/thirdparty/clDNN/src/include/eltwise_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/eltwise_inst.h @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -54,6 +54,7 @@ template <> class typed_primitive_inst : public typed_primitive_inst_base { using parent = typed_primitive_inst_base; + static void check_inputs_count(eltwise_node const &node); public: static layout calc_output_layout(eltwise_node const& node); diff --git a/inference-engine/thirdparty/clDNN/src/include/embed_inst.h b/inference-engine/thirdparty/clDNN/src/include/embed_inst.h index 0455226..11bdc24 100644 --- a/inference-engine/thirdparty/clDNN/src/include/embed_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/embed_inst.h @@ -55,4 +55,4 @@ namespace cldnn using embed_inst = typed_primitive_inst; -} +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/include/engine_impl.h b/inference-engine/thirdparty/clDNN/src/include/engine_impl.h index ea1234a..3f81b76 100644 --- a/inference-engine/thirdparty/clDNN/src/include/engine_impl.h +++ b/inference-engine/thirdparty/clDNN/src/include/engine_impl.h @@ -22,7 +22,6 @@ #include "refcounted_obj.h" #include "implementation_map.h" #include "memory_pool.h" - #include "gpu/engine_info.h" #include @@ -41,6 +40,7 @@ struct event_impl; struct topology_impl; struct program_impl; struct network_impl; +struct program_node; template struct typed_program_node; @@ -49,9 +49,8 @@ struct engine_impl : public refcounted_obj { public: engine_impl(const engine_configuration& conf); - + ~engine_impl(); engine_types type() const { return engine_types::ocl; } - refcounted_obj_ptr allocate_memory(layout layout); refcounted_obj_ptr allocate_memory(layout layout, primitive_id, uint32_t, std::set, bool reusable = true); refcounted_obj_ptr reinterpret_buffer(const memory_impl& memory, layout new_layout); @@ -60,11 +59,13 @@ public: refcounted_obj_ptr create_user_event(bool set = false); void wait_for_events(std::vector const& events); - refcounted_obj_ptr build_program(const topology_impl& topology, const build_options& options, bool is_internal = false); + refcounted_obj_ptr build_program(const topology_impl& topology, const build_options& options, bool is_internal = false, bool no_optimizations = false); + refcounted_obj_ptr build_program(const std::set>& nodes, const build_options & options, bool is_internal); void compile_program(program_impl& prog); - refcounted_obj_ptr allocate_network(const program_impl& program); - refcounted_obj_ptr build_network(const topology_impl& topology, const build_options& options, bool internal_network = false); + refcounted_obj_ptr allocate_network(const program_impl& program, bool is_internal = false); + refcounted_obj_ptr build_network(const topology_impl& topology, const build_options& options, bool is_internal = false); + refcounted_obj_ptr build_network(const std::set>& nodes, const build_options & options, bool is_internal); void flush_network(); void release_pending_memory(); @@ -77,7 +78,23 @@ public: auto factory = implementation_map::get(type(), node); return std::move(std::unique_ptr(factory(node))); } - + + template + bool does_an_implementation_exist(typed_program_node const& node) + { + if (&node.get_program().get_engine() != this) + throw std::invalid_argument("engine_impl::create_primitive_impl: program's engine does not match called engine"); + return implementation_map::check(type(), node); + } + + template + bool does_possible_implementation_exist(typed_program_node const& node) + { + if (&node.get_program().get_engine() != this) + throw std::invalid_argument("engine_impl::create_primitive_impl: program's engine does not match called engine"); + return 
implementation_map::check_io_eq(type(), node); + } + const engine_configuration& configuration() const { return _configuration; } void set_mem_pool(bool flag) { _configuration.enable_memory_pool = flag; } std::shared_ptr get_context() const { return _context; } @@ -97,4 +114,4 @@ private: }; } -API_CAST(::cldnn_engine, cldnn::engine_impl) +API_CAST(::cldnn_engine, cldnn::engine_impl) \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/include/error_handler.h b/inference-engine/thirdparty/clDNN/src/include/error_handler.h index 36f6bd7..5126d01 100644 --- a/inference-engine/thirdparty/clDNN/src/include/error_handler.h +++ b/inference-engine/thirdparty/clDNN/src/include/error_handler.h @@ -1,5 +1,5 @@ /* -// Copyright (c) 2017 Intel Corporation +// Copyright (c) 2017-2018 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -34,23 +34,33 @@ namespace err_details template inline void error_on_not_equal(const std::string& file, int line, const std::string& instance_id, const std::string& number_id, N1 number, const std::string& compare_to_id, N2 number_to_compare_to, const std::string& additional_message = "") { - std::stringstream error_msg; + if (number != static_cast(number_to_compare_to)) { - if (number != static_cast(number_to_compare_to)) - { - error_msg << number_id << "(=" << number << ") is not equal to: " << compare_to_id << "(=" << number_to_compare_to << ")" << std::endl; - err_details::cldnn_print_error_message(file, line, instance_id, error_msg, additional_message); - } + std::stringstream error_msg; + error_msg << number_id << "(=" << number << ") is not equal to: " << compare_to_id << "(=" << number_to_compare_to << ")" << std::endl; + err_details::cldnn_print_error_message(file, line, instance_id, error_msg, additional_message); } } #define CLDNN_ERROR_NOT_EQUAL(instance_id, number_id, number, compare_to_id, number_to_compare_to, add_msg) error_on_not_equal(__FILE__, __LINE__, instance_id, number_id, number, compare_to_id, number_to_compare_to, add_msg) template +inline void error_on_equal(const std::string& file, int line, const std::string& instance_id, const std::string& number_id, N1 number, const std::string& compare_to_id, N2 number_to_compare_to, const std::string& additional_message = "") +{ + if (number == static_cast(number_to_compare_to)) + { + std::stringstream error_msg; + error_msg << number_id << "(=" << number << ") is equal to: " << compare_to_id << "(=" << number_to_compare_to << ")" << std::endl; + err_details::cldnn_print_error_message(file, line, instance_id, error_msg, additional_message); + } +} +#define CLDNN_ERROR_EQUAL(instance_id, number_id, number, compare_to_id, number_to_compare_to, add_msg) error_on_equal(__FILE__, __LINE__, instance_id, number_id, number, compare_to_id, number_to_compare_to, add_msg) + +template inline void error_on_greater_than(const std::string& file, int line, const std::string& instance_id, const std::string& number_id, N1 number, const std::string& compare_to_id, N2 number_to_compare_to, const std::string& additional_message = "") { - std::stringstream error_msg; if (number > static_cast(number_to_compare_to)) { + std::stringstream error_msg; error_msg << number_id << "(=" << number << ") is greater than: " << compare_to_id << "(=" << number_to_compare_to << ")" << std::endl; err_details::cldnn_print_error_message(file, line, instance_id, error_msg, additional_message); } @@ -60,9 +70,9 @@ 
inline void error_on_greater_than(const std::string& file, int line, const std:: template inline void error_on_less_than(const std::string& file, int line, const std::string& instance_id, const std::string& number_id, N1 number, const std::string& compare_to_id, N2 number_to_compare_to, const std::string& additional_message = "") { - std::stringstream error_msg; if (number < static_cast(number_to_compare_to)) { + std::stringstream error_msg; error_msg << number_id << "(=" << number << ") is less than: " << compare_to_id << "(=" << number_to_compare_to << ")" << std::endl; err_details::cldnn_print_error_message(file, line, instance_id, error_msg, additional_message); } @@ -72,9 +82,9 @@ inline void error_on_less_than(const std::string& file, int line, const std::str template inline void error_on_less_or_equal_than(const std::string& file, int line, const std::string& instance_id, const std::string& number_id, N1 number, const std::string& compare_to_id, N2 number_to_compare_to, const std::string& additional_message = "") { - std::stringstream error_msg; if (number <= static_cast(number_to_compare_to)) { + std::stringstream error_msg; error_msg << number_id << "(=" << number << ") is less or equal than: " << compare_to_id << "(=" << number_to_compare_to << ")" << std::endl; err_details::cldnn_print_error_message(file, line, instance_id, error_msg, additional_message); } @@ -84,9 +94,9 @@ inline void error_on_less_or_equal_than(const std::string& file, int line, const template inline void error_on_greater_or_equal_than(const std::string& file, int line, const std::string& instance_id, const std::string& number_id, N1 number, const std::string& compare_to_id, N2 number_to_compare_to, const std::string& additional_message = "") { - std::stringstream error_msg; if (number >= static_cast(number_to_compare_to)) { + std::stringstream error_msg; error_msg << number_id << "(=" << number << ") is greater or equal than: " << compare_to_id << "(=" << number_to_compare_to << ")" << std::endl; err_details::cldnn_print_error_message(file, line, instance_id, error_msg, additional_message); } @@ -96,9 +106,9 @@ inline void error_on_greater_or_equal_than(const std::string& file, int line, co template inline void error_on_nullptr(const std::string& file, int line, const std::string& instance_id, const std::string& condition_id, ptr condition, const std::string& additional_message = "") { - std::stringstream error_msg; if (condition == nullptr) { + std::stringstream error_msg; error_msg << condition_id << " should not be null" << std::endl; err_details::cldnn_print_error_message(file, line, instance_id, error_msg, additional_message); } @@ -108,7 +118,6 @@ inline void error_on_nullptr(const std::string& file, int line, const std::strin template inline void error_on_not_proper_enum_values(const std::string& file, int line, const std::string& instance_id, const std::string& mode_id, M mode, const std::string& modes_id, Ms... modes_to_compare_to) { - std::stringstream error_msg; auto enum_value_string = [](const M& mode)->std::string { if (std::is_same::value) { @@ -119,6 +128,7 @@ inline void error_on_not_proper_enum_values(const std::string& file, int line, c const std::array modes{ std::forward(modes_to_compare_to)... }; if (std::all_of(modes.begin(), modes.end(), [&](const M& m)->int {return mode != m; })) { + std::stringstream error_msg; error_msg << mode_id << "( " << enum_value_string(mode) << " ) is incompatible with " << modes_id << ". 
Should be one of: "; for (const auto& ms : modes) { @@ -142,8 +152,9 @@ void error_on_mismatch_layout(const std::string& file, int line, const std::stri void error_on_bool(const std::string& file, int line, const std::string& instance_id, const std::string& condition_id, bool condition, const std::string& additional_message = ""); #define CLDNN_ERROR_BOOL(instance_id, condition_id, condition, add_msg) error_on_bool(__FILE__, __LINE__, instance_id, condition_id, condition, add_msg) -void error_on_mismatching_data_types(const std::string& file, int line, const std::string& instance_id, const std::string& data_format_1_id, data_types data_format_1, const std::string& data_format_2_id, data_types data_format_2, const std::string& additional_message = ""); +void error_on_mismatching_data_types(const std::string& file, int line, const std::string& instance_id, const std::string& data_format_1_id, data_types data_format_1, const std::string& data_format_2_id, data_types data_format_2, const std::string& additional_message = "", bool ignore_sign = false); #define CLDNN_ERROR_DATA_TYPES_MISMATCH(instance_id, data_format_1_id, data_format_1, data_format_2_id, data_format_2, add_msg) error_on_mismatching_data_types(__FILE__, __LINE__, instance_id, data_format_1_id, data_format_1, data_format_2_id, data_format_2, add_msg) +#define CLDNN_ERROR_DATA_TYPES_MISMATCH_IGNORE_SIGN(instance_id, data_format_1_id, data_format_1, data_format_2_id, data_format_2, add_msg) error_on_mismatching_data_types(__FILE__, __LINE__, instance_id, data_format_1_id, data_format_1, data_format_2_id, data_format_2, add_msg, true) void error_on_tensor_dims_less_than_other_tensor_dims(const std::string& file, int line, const std::string& instance_id, const std::string& tensor_id, const tensor& tens, const std::string& tensor_to_compare_to_id, const tensor& tens_to_compre, const std::string& additional_message = ""); #define CLDNN_ERROR_TENSOR_SIZES_LESS_THAN(instance_id, tensor_id, tensor_1, compare_to_id, tensor_to_compare_to, ...) error_on_tensor_dims_less_than_other_tensor_dims(__FILE__, __LINE__, instance_id, tensor_id, tensor_1, compare_to_id, tensor_to_compare_to, __VA_ARGS__) diff --git a/inference-engine/thirdparty/clDNN/src/include/event_impl.h b/inference-engine/thirdparty/clDNN/src/include/event_impl.h index a8adc74..4e696e2 100644 --- a/inference-engine/thirdparty/clDNN/src/include/event_impl.h +++ b/inference-engine/thirdparty/clDNN/src/include/event_impl.h @@ -33,7 +33,8 @@ public: void wait(); bool is_set(); - + virtual bool is_valid() const { return _attached; } + virtual void reset() { _attached = false; } //returns true if handler has been successfully added bool add_event_handler(cldnn_event_handler handler, void* data); @@ -48,7 +49,7 @@ private: protected: bool _set = false; - + bool _attached = false; //because ocl event can be attached later, we need mechanism to check if such event was attached void call_handlers(); virtual void wait_impl() = 0; diff --git a/inference-engine/thirdparty/clDNN/src/include/fused_conv_bn_scale_inst.h b/inference-engine/thirdparty/clDNN/src/include/fused_conv_bn_scale_inst.h new file mode 100644 index 0000000..7e7b572 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/include/fused_conv_bn_scale_inst.h @@ -0,0 +1,149 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include "api_extension/CPP/fused_conv_bn_scale.hpp" +#include "primitive_inst.h" + +#include + +namespace cldnn +{ + +template <> +struct typed_program_node : public typed_program_node_base +{ + using parent = typed_program_node_base; + +public: + typed_program_node(std::shared_ptr prim, program_impl& prog) + : parent(prim, prog) + , split(this->get_primitive()->split()) + { + } + + void set_split(int32_t node_split) { split = node_split; } + int32_t get_split() const { return split; } + + program_node& input(size_t idx = 0) const + { + if (static_cast(idx) >= static_cast(desc->input.size())) + throw std::range_error("input index too big"); + + return get_dependency(idx); + } + + program_node& weights(size_t idx = 0) const + { + if (static_cast(idx) >= this->get_split()) + throw std::range_error("weights offset too big"); + + return get_dependency(desc->input.size() + idx); + } + + program_node& bias(size_t idx = 0) const + { + if (static_cast(idx) >= this->get_split()) + throw std::range_error("bias offset too big"); + + return get_dependency(desc->input.size() + this->get_split() + idx); + } + + program_node& weights_quantization_factors(size_t idx = 0) const + { + if (static_cast(idx) >= this->get_split()) + throw std::range_error("quantization factor offset too big"); + + return get_dependency(desc->input.size() + 2*this->get_split() + idx); + } + + program_node& output_calibration_factors(size_t idx = 0) const + { + if (static_cast(idx) >= this->get_split()) + throw std::range_error("calibration factor offset too big"); + + return get_dependency(desc->input.size() + 3 * this->get_split() + idx); + } + + bool bias_term() const + { + return get_primitive()->bias.size() > 0; + } + + bool scale_bias_term() const + { + return !get_primitive()->scale_bias.empty(); + } + + bool is_fused_in_training() const + { + return !get_primitive()->inv_variance.empty(); + } + +private: + int32_t split; +}; + +using fused_conv_bn_scale_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base +{ + using parent = typed_primitive_inst_base; + +public: + static layout calc_output_layout(fused_conv_bn_scale_node const& node); + static std::string to_string(fused_conv_bn_scale_node const& node); + +public: + typed_primitive_inst(network_impl& network, fused_conv_bn_scale_node const& node); + + memory_impl& weights_memory(size_t index) const + { + if (static_cast(index) >= node.get_split()) + throw std::range_error("weights offset too big"); + + return dep_memory(inputs_memory_count() + index); + } + + memory_impl& bias_memory(size_t index) const + { + if (static_cast(index) >= node.get_split()) + throw std::range_error("bias offset too big"); + + return dep_memory(inputs_memory_count() + node.get_split() + index); + } + + bool bias_term() const + { + return node.bias_term(); + } + + bool scale_bias_term() const + { + return node.scale_bias_term(); + } + + bool is_fused_in_training() const + { + return 
node.is_fused_in_training(); + } +}; + +using fused_conv_bn_scale_inst = typed_primitive_inst; + +} diff --git a/inference-engine/thirdparty/clDNN/src/include/fused_conv_eltwise_inst.h b/inference-engine/thirdparty/clDNN/src/include/fused_conv_eltwise_inst.h new file mode 100644 index 0000000..051ec11 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/include/fused_conv_eltwise_inst.h @@ -0,0 +1,204 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include "api_extension/CPP/fused_conv_eltwise.hpp" +#include "primitive_inst.h" + +#include + +namespace cldnn +{ + +template <> +struct typed_program_node : public typed_program_node_base +{ + using parent = typed_program_node_base; + +public: + typed_program_node(std::shared_ptr prim, program_impl& prog) + : parent(prim, prog) + , split(this->get_primitive()->split()) + , depthwise_sep_opt(false) + , transposed(false) + , conv_input_qf(this->get_primitive()->conv.input_quantization_factor) + , conv_output_qf(this->get_primitive()->conv.output_quantization_factor) + { + } + + void set_split(int32_t node_split) { split = node_split; } + int32_t get_split() const { return split; } + + void set_depthwise_sep_opt(bool node_depthwise_sep_opt) { depthwise_sep_opt = node_depthwise_sep_opt; } + bool get_depthwise_sep_opt() const { return depthwise_sep_opt; } + + void set_transposed(bool node_transposed) { transposed = node_transposed; } + bool get_transposed() const { return transposed; } + + program_node& input(size_t idx = 0) const + { + if (static_cast(idx) >= static_cast(desc->input.size())) + throw std::range_error("input index too big"); + + return get_dependency(idx); + } + + program_node& weights(size_t idx = 0) const + { + if (static_cast(idx) >= this->get_split()) + throw std::range_error("weights offset too big"); + + return get_dependency(desc->input.size() + idx); + } + + program_node& bias(size_t idx = 0) const + { + if (static_cast(idx) >= this->get_split()) + throw std::range_error("bias offset too big"); + + return get_dependency(desc->input.size() + this->get_split() + idx); + } + + program_node& weights_quantization_factors(size_t idx = 0) const + { + if (static_cast(idx) >= this->get_split()) + throw std::range_error("quantization factor offset too big"); + + return get_dependency(desc->input.size() + 2 * this->get_split() + idx); + } + + program_node& conv_output_calibration_factors(size_t idx = 0) const + { + if (static_cast(idx) >= this->get_split()) + throw std::range_error("calibration factor offset too big"); + + return get_dependency(desc->input.size() + 3 * this->get_split() + idx); + } + + program_node& eltw_output_calibration_factors() const + { + return get_dependency(desc->input.size() + 4 * this->get_split()); + } + + bool bias_term() const + { + return get_primitive()->conv.bias.size() > 0; + } + + bool weights_quantization_term() const + 
{ + return get_primitive()->conv.weights_quantization_factors.size() > 0; + } + + bool conv_output_calibration_term() const + { + return get_primitive()->conv.output_calibration_factors.size() > 0; + } + + bool eltw_output_calibration_term() const + { + return get_primitive()->eltw.output_calibration_factors.size() > 0; + } + + float get_conv_input_qf() const { return conv_input_qf; } + float get_conv_output_qf() const { return conv_output_qf; } + float get_eltw_output_qf() const { return eltw_output_qf; } + +private: + int32_t split; + bool depthwise_sep_opt; + bool transposed; + float conv_input_qf; + float conv_output_qf; + float eltw_output_qf; +}; + +using fused_conv_eltwise_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base +{ + using parent = typed_primitive_inst_base; + +public: + static layout calc_output_layout(fused_conv_eltwise_node const& node); + static std::string to_string(fused_conv_eltwise_node const& node); + +public: + typed_primitive_inst(network_impl& network, fused_conv_eltwise_node const& node); + + memory_impl& weights_memory(size_t index) const + { + if (static_cast(index) >= node.get_split()) + throw std::range_error("weights offset too big"); + + return dep_memory(2 + index); + } + + memory_impl& bias_memory(size_t index) const + { + if (static_cast(index) >= node.get_split()) + throw std::range_error("bias offset too big"); + + return dep_memory(2 + node.get_split() + index); + } + + memory_impl& weights_quantization_factors_memory(size_t index) const + { + if (static_cast(index) >= node.get_split()) + throw std::range_error("quantization factors offset too big"); + + return dep_memory(2 + 2*node.get_split() + index); + } + + memory_impl& output_calibration_factors_memory(size_t index) const + { + if (static_cast(index) >= node.get_split()) + throw std::range_error("quantization factors offset too big"); + + return dep_memory(2 + 3 * node.get_split() + index); + } + + memory_impl& eltw_output_calibration_factors_memory() const + { + return dep_memory(2 + 4 * node.get_split()); + } + + bool bias_term() const + { + return node.bias_term(); + } + + bool weights_quantization_factors_term() const + { + return node.weights_quantization_term(); + } + + bool conv_output_calibration_factors_term() const + { + return node.conv_output_calibration_term(); + } + + bool eltw_output_calibration_factors_term() const + { + return node.eltw_output_calibration_term(); + } +}; + +using fused_conv_eltwise_inst = typed_primitive_inst; + +} diff --git a/inference-engine/thirdparty/clDNN/src/include/gather_inst.h b/inference-engine/thirdparty/clDNN/src/include/gather_inst.h new file mode 100644 index 0000000..a2ee829 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/include/gather_inst.h @@ -0,0 +1,51 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
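// The fused_conv_bn_scale and fused_conv_eltwise nodes above address their extra
// dependencies by fixed offsets: after the primary inputs come one block per
// split of weights, biases, weight quantization factors and calibration factors.
// A minimal sketch of that indexing scheme (inputs and split are assumed values):
#include <cstddef>
inline std::size_t weights_dep(std::size_t inputs, std::size_t /*split*/, std::size_t i) { return inputs + i; }
inline std::size_t bias_dep(std::size_t inputs, std::size_t split, std::size_t i)        { return inputs + split + i; }
inline std::size_t wquant_dep(std::size_t inputs, std::size_t split, std::size_t i)      { return inputs + 2 * split + i; }
inline std::size_t conv_calib_dep(std::size_t inputs, std::size_t split, std::size_t i)  { return inputs + 3 * split + i; }
inline std::size_t eltw_calib_dep(std::size_t inputs, std::size_t split)                 { return inputs + 4 * split; }
// e.g. with two inputs and split == 1 the eltwise calibration factors are dependency 6.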
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include "api/CPP/gather.hpp" +#include "primitive_inst.h" + +namespace cldnn +{ +template <> +struct typed_program_node : public typed_program_node_base +{ + using parent = typed_program_node_base; + +public: + using parent::parent; + + program_node& input(size_t index = 0) const { return get_dependency(index); } +}; + +using gather_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base +{ + using parent = typed_primitive_inst_base; + +public: + static layout calc_output_layout(gather_node const& node); + static std::string to_string(gather_node const& node); + +public: + typed_primitive_inst(network_impl& network, gather_node const& desc); +}; + +using gather_inst = typed_primitive_inst; +} diff --git a/inference-engine/thirdparty/clDNN/src/include/generic_layer_inst.h b/inference-engine/thirdparty/clDNN/src/include/generic_layer_inst.h index 468591b..2bba001 100644 --- a/inference-engine/thirdparty/clDNN/src/include/generic_layer_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/generic_layer_inst.h @@ -26,6 +26,7 @@ template <> struct typed_program_node : public typed_program_node_base { using parent = typed_program_node_base; + typed_program_node(const std::shared_ptr prim, program_impl& prog); public: using parent::parent; diff --git a/inference-engine/thirdparty/clDNN/src/include/implementation_map.h b/inference-engine/thirdparty/clDNN/src/include/implementation_map.h index 5fc2710..7472038 100644 --- a/inference-engine/thirdparty/clDNN/src/include/implementation_map.h +++ b/inference-engine/thirdparty/clDNN/src/include/implementation_map.h @@ -57,6 +57,10 @@ struct implementation_key { return std::make_tuple(engine_type, primitive.get_dependency(0).get_output_layout().data_type, primitive.get_dependency(0).get_output_layout().format); } + type operator()(engine_types engine_type, const layout& proposed_layout) + { + return std::make_tuple(engine_type, proposed_layout.data_type, proposed_layout.format); + } }; template<> @@ -67,6 +71,10 @@ struct implementation_key { return engine_type; } + type operator()(engine_types engine_type, const layout&) + { + return engine_type; + } }; template<> @@ -77,6 +85,11 @@ struct implementation_key { return engine_type; } + type operator()(engine_types engine_type, const layout&) + { + return engine_type; + } + }; template<> @@ -87,6 +100,11 @@ struct implementation_key { return engine_type; } + type operator()(engine_types engine_type, const layout&) + { + return engine_type; + } + }; template<> @@ -97,6 +115,11 @@ struct implementation_key { return engine_type; } + type operator()(engine_types engine_type, const layout&) + { + return engine_type; + } + }; template<> @@ -107,6 +130,11 @@ struct implementation_key { return engine_type; } + type operator()(engine_types engine_type, const layout&) + { + return engine_type; + } + }; template<> @@ -117,6 +145,11 @@ struct implementation_key { return engine_type; } + type operator()(engine_types engine_type, const layout&) + { + return engine_type; + } + }; template<> @@ -127,6 +160,10 @@ struct implementation_key { return engine_type; } + type operator()(engine_types engine_type, const layout&) + { + return engine_type; + } }; template<> @@ -137,6 +174,11 @@ struct implementation_key { return engine_type; } + type operator()(engine_types engine_type, const layout&) + { + return engine_type; + } + }; template<> @@ -147,6 
+189,10 @@ struct implementation_key { return engine_type; } + type operator()(engine_types engine_type, const layout&) + { + return engine_type; + } }; template @@ -162,12 +208,35 @@ public: auto key = key_builder()(engine_type, primitive); auto it = map_type::instance().find(key); if (it == std::end(map_type::instance())) - throw std::runtime_error(std::string("implementation_map for ")+typeid(primitive_kind).name() +" could not find any implementation to match key"); - + throw std::runtime_error( + std::string("implementation_map for ") + typeid(primitive_kind).name() + + " could not find any implementation to match key"); // create implementation & attach it to result return it->second; } + //check if an implementation exists for a given engine and type + static bool check(engine_types engine_type, const typed_program_node& primitive) + { + auto key = key_builder()(engine_type, primitive); + auto it = map_type::instance().find(key); + if (it == std::end(map_type::instance())) + return false; + else + return true; + } + + //check if there exists a kernel implementation of the primitive with output set to the primitive's output layout + static bool check_io_eq(engine_types engine_type, const typed_program_node& primitive) + { + auto key = key_builder()(engine_type, primitive.get_output_layout()); + auto it = map_type::instance().find(key); + if (it == std::end(map_type::instance())) + return false; + else + return true; + } + static void add(typename map_type::key_type key, factory_type factory) { map_type::instance().insert({ key, factory }); } diff --git a/inference-engine/thirdparty/clDNN/src/include/index_select_inst.h b/inference-engine/thirdparty/clDNN/src/include/index_select_inst.h index 0d775f6..b414849 100644 --- a/inference-engine/thirdparty/clDNN/src/include/index_select_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/index_select_inst.h @@ -36,7 +36,8 @@ namespace cldnn } program_node& input() const { return get_dependency(0); } program_node& indices() const { return get_dependency(1); } - index_select_axis_name get_axis() const { return get_primitive()->axis; } + bool get_reverse() const { return get_primitive()->reverse; } + std::vector get_axes() const { return get_primitive()->axis; } }; using index_select_node = typed_program_node; @@ -53,7 +54,8 @@ namespace cldnn memory_impl& input() const { return dep_memory(0); } memory_impl& indices() const { return dep_memory(1); } - index_select_axis_name get_axis() const { return node.get_axis(); } + bool get_reverse() const { return node.get_reverse(); } + std::vector get_axes() const { return node.get_axes(); } }; using index_select_inst = typed_primitive_inst; diff --git a/inference-engine/thirdparty/clDNN/src/include/input_layout_inst.h b/inference-engine/thirdparty/clDNN/src/include/input_layout_inst.h index 64d9ba7..ef4bbe1 100644 --- a/inference-engine/thirdparty/clDNN/src/include/input_layout_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/input_layout_inst.h @@ -24,10 +24,12 @@ namespace cldnn struct memory_impl; template <> -struct typed_program_node : public typed_program_node_base +struct typed_program_node : public typed_program_node_base { using parent = typed_program_node_base; using parent::parent; + + typed_program_node(const std::shared_ptr prim, program_impl& prog); }; using input_layout_node = typed_program_node; diff --git a/inference-engine/thirdparty/clDNN/src/include/kernel_selector_helper.h b/inference-engine/thirdparty/clDNN/src/include/kernel_selector_helper.h index
6030ccd..b21729e 100644 --- a/inference-engine/thirdparty/clDNN/src/include/kernel_selector_helper.h +++ b/inference-engine/thirdparty/clDNN/src/include/kernel_selector_helper.h @@ -15,10 +15,7 @@ #pragma once #include "api/C/cldnn.h" -#include "api/CPP/program.hpp" - -#include "gpu/ocl_toolkit.h" -#include "program_impl.h" +#include "api/CPP/tensor.hpp" #include "kernel_selector_params.h" #include "kernel_selector_common.h" @@ -28,6 +25,16 @@ using namespace cldnn; +namespace cldnn +{ + enum class data_types : size_t; + enum class tuning_mode; + struct format; + struct layout; + struct program_impl; + struct program_node; +} + namespace kernel_selector { using n_dims = kernel_selector::Tensor::NDims; @@ -63,6 +70,7 @@ namespace kernel_selector using tuning_mode = kernel_selector::TuningMode; using sample_type = kernel_selector::SampleType; using border_type = kernel_selector::BorderType; + using gather_axis = kernel_selector::GatherAxis; using data_tensor = kernel_selector::DataTensor; using weights_tensor = kernel_selector::WeightsTensor; @@ -74,6 +82,8 @@ namespace kernel_selector using params = kernel_selector::Params; using weights_reorder_params = kernel_selector::WeightsReorderParams; using generic_kernel_params = kernel_selector::GenericKernelParams; + + struct training_params; } kernel_selector::data_type to_data_type(data_types dt); @@ -104,59 +114,45 @@ kernel_selector::dim_tensor convert_dim_vector(const tensor& t) } template -inline void convert_activation_func_params(const p_type primitive, kernel_selector::base_params& params) +inline void convert_activation_func_params(const p_type primitive, kernel_selector::base_activation_params& params) { const float negative_slope = primitive->activation_negative_slope; if (negative_slope != 0.0f) { - params.activationParams.m = negative_slope; - params.activationFunc = kernel_selector::activation_function::RELU_NEGATIVE_SLOPE; + params.m = negative_slope; + params.function = kernel_selector::activation_function::RELU_NEGATIVE_SLOPE; } else { - params.activationFunc = kernel_selector::activation_function::RELU; + params.function = kernel_selector::activation_function::RELU; } } template -inline void convert_fused_activation_func_params(const arg_t& arg, kernel_selector::base_params& params) +inline void convert_fused_activation_func_params(const arg_t& arg, kernel_selector::base_activation_params& params) { - params.activationParams.m = arg.get_fused_activation_params().a; - params.activationParams.n = arg.get_fused_activation_params().b; - params.activationFunc = get_kernel_selector_activation_param(arg.get_fused_activation_func()); + params.m = arg.get_fused_activation_params().a; + params.n = arg.get_fused_activation_params().b; + params.function = get_kernel_selector_activation_param(arg.get_fused_activation_func()); } template -inline void convert_new_activation_func(const p_type primitive, kernel_selector::base_params& params) +inline void convert_new_activation_func(const p_type primitive, kernel_selector::base_activation_params& params) { - params.activationFunc = get_kernel_selector_activation_param(primitive->activation_func); - params.activationParams.m = primitive->additional_params.a; - params.activationParams.n = primitive->additional_params.b; + params.function = get_kernel_selector_activation_param(primitive->activation_func); + params.m = primitive->additional_params.a; + params.n = primitive->additional_params.b; } +void set_params(const program_node& node, kernel_selector::params& params); + template inline params_t 
get_default_params(const arg_t& arg, uint32_t split = 1) { params_t params; - const auto& context = arg.get_program().get_engine().get_context(); - const auto& engine_info = context->get_engine_info(); - - params.engineInfo.bSubGroupSupport = context->extension_supported("cl_intel_subgroups"); - params.engineInfo.bSubGroupShortSupport = context->extension_supported("cl_intel_subgroups_short"); - params.engineInfo.bFP16Support = context->extension_supported("cl_khr_fp16"); - params.engineInfo.bFP64Support = context->extension_supported("cl_khr_fp64"); - params.engineInfo.bIMADSupport = engine_info.supports_imad != 0; - params.engineInfo.bIMMADSupport = engine_info.supports_immad != 0; - params.engineInfo.bImageSupport = engine_info.supports_image != 0; - params.engineInfo.maxWorkGroupSize = engine_info.max_work_group_size; - params.engineInfo.maxLocalMemSize = engine_info.max_local_mem_size; - params.engineInfo.maxImage2dWidth = engine_info.max_image2d_width; - params.engineInfo.maxImage2dHeight = engine_info.max_image2d_height; - params.engineInfo.deviceId = engine_info.dev_id; - params.engineInfo.driverVersion = engine_info.driver_version; - params.engineInfo.hostVersion = to_host_version(cldnn::get_version()); - + set_params(arg, params); + const auto& input_layout = arg.input().get_output_layout(); const auto& output_layout = arg.get_output_layout(); @@ -165,63 +161,61 @@ inline params_t get_default_params(const arg_t& arg, uint32_t split = 1) params.layerID = arg.id(); - convert_fused_activation_func_params(arg, params); + convert_fused_activation_func_params(arg, params.activation); return params; } template -inline params_t get_weights_bias_default_params(const arg_t& arg, uint32_t split = 1) +inline params_t get_weights_bias_default_params(const arg_t& arg, uint32_t split = 1, uint32_t groups = 1) { params_t params = get_default_params(arg, split); - const auto& weights_layout = arg.weights().get_output_layout(); - params.weights = convert_weights_tensor(weights_layout); + if (groups == 1) { + params.weights = convert_weights_tensor(weights_layout); + } + else { + params.weights = convert_weights_tensor(layout(weights_layout.data_type, weights_layout.format, + { weights_layout.size.batch[0]/(int)groups, weights_layout.size.feature[0], weights_layout.size.spatial[0], weights_layout.size.spatial[1] } + )); + } if (arg.bias_term()) { const auto& bias_layout = arg.bias().get_output_layout(); // bias per output is not supported on cldnn - params.bias.push_back(convert_data_tensor(bias_layout).FlattenFeatureAndSpatials()); + if (groups == 1) { + params.bias.push_back(convert_data_tensor(bias_layout).FlattenFeatureAndSpatials()); } + else { + params.bias.push_back(convert_data_tensor( + layout( + bias_layout.data_type, bias_layout.format, + { bias_layout.size.batch[0], bias_layout.size.feature[0], bias_layout.size.spatial[0]/(int)groups, bias_layout.size.spatial[1] } + )).FlattenFeatureAndSpatials() + ); + } } return params; } +void set_learning_params(const program_node& node, kernel_selector::training_params& params, bool use_momentum); + template inline params_t get_default_learning_params(const arg_t& arg, uint32_t split = 1) { params_t params = get_weights_bias_default_params(arg, split); - - const auto learning_params = arg.get_program().get_options().template get()->params; - - if (arg.use_momentum()) - { - params.use_momentum = true; - } - - params.momentum_factor = learning_params.momentum; - params.weights_decay = learning_params.weights_decay; - + set_learning_params(arg, 
params, arg.use_momentum()); return params; } +void set_optional_params(const program_impl& program, kernel_selector::optional_params& params); + template inline optional_params_t get_default_optional_params(const program_impl& program) { optional_params_t params; - - const auto& context = program.get_engine().get_context(); - - params.meaningfulKernelsNames = context->get_configuration().meaningful_kernels_names; - params.allowStaticInputReordering = program.get_options().get()->enabled(); - params.allowInputReordering = false; - params.allowOutputReordering = false; - - const auto& tuning_config = program.get_options().get(); - params.tuningParams.mode = to_tuning_mode(tuning_config->config.mode); - params.tuningParams.cacheFilePath = tuning_config->config.cache_file_path; - + set_optional_params(program, params); return params; } diff --git a/inference-engine/thirdparty/clDNN/src/include/lstm_elt_inst.h b/inference-engine/thirdparty/clDNN/src/include/lstm_elt_inst.h index 9530b78..4d69dda 100644 --- a/inference-engine/thirdparty/clDNN/src/include/lstm_elt_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/lstm_elt_inst.h @@ -40,6 +40,7 @@ public: return clip_val; } bool input_forget() const { return get_primitive()->input_forget; } + int32_t direction() const { return get_primitive()->direction; } }; using lstm_elt_node = typed_program_node; @@ -66,6 +67,7 @@ public: return clip_val; } bool input_forget() const { return argument.input_forget; } + uint32_t direction() const { return argument.direction; } }; using lstm_elt_inst = typed_primitive_inst; diff --git a/inference-engine/thirdparty/clDNN/src/include/max_unpooling_inst.h b/inference-engine/thirdparty/clDNN/src/include/max_unpooling_inst.h index 14c449b..f03c4fb 100644 --- a/inference-engine/thirdparty/clDNN/src/include/max_unpooling_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/max_unpooling_inst.h @@ -26,7 +26,7 @@ template <> struct typed_program_node : public typed_program_node_base { using parent = typed_program_node_base; - + typed_program_node(const std::shared_ptr prim, program_impl& prog); public: using parent::parent; program_node& input() const { return get_dependency(0); } diff --git a/inference-engine/thirdparty/clDNN/src/include/memory_impl.h b/inference-engine/thirdparty/clDNN/src/include/memory_impl.h index 515f6dc..5c18b7b 100644 --- a/inference-engine/thirdparty/clDNN/src/include/memory_impl.h +++ b/inference-engine/thirdparty/clDNN/src/include/memory_impl.h @@ -27,10 +27,15 @@ namespace cldnn struct memory_impl : refcounted_obj { - memory_impl(const engine_impl::ptr& engine, layout layout): _engine(engine), _layout(layout){} + memory_impl(const engine_impl::ptr& engine, layout layout, bool reused=false) + : _engine(engine) + , _layout(layout) + , _reused(reused) + {} + virtual ~memory_impl() { - if (_engine != nullptr) + if (_engine != nullptr && !_reused) { _engine->get_memory_pool().subtract_memory_used(_layout.bytes_count()); } @@ -45,6 +50,8 @@ struct memory_impl : refcounted_obj protected: const engine_impl::ptr _engine; const layout _layout; +private: + bool _reused; }; struct simple_attached_memory : memory_impl diff --git a/inference-engine/thirdparty/clDNN/src/include/memory_pool.h b/inference-engine/thirdparty/clDNN/src/include/memory_pool.h index b5135eb..1e835f8 100644 --- a/inference-engine/thirdparty/clDNN/src/include/memory_pool.h +++ b/inference-engine/thirdparty/clDNN/src/include/memory_pool.h @@ -110,7 +110,7 @@ struct padded_pool_comparer class memory_pool { memory_pool(); 
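// A self-contained sketch of the accounting rule that the new memory_impl
// `_reused` flag above enforces: several handles may wrap one allocation, but
// only the owning handle returns the bytes to the pool counter on destruction.
// The types below are simplified stand-ins, not the cldnn API.
#include <cassert>
#include <cstdint>
struct pool_counter { std::uint64_t used = 0; };
struct mem_handle
{
    pool_counter* pool; std::uint64_t bytes; bool reused;
    mem_handle(pool_counter* p, std::uint64_t b, bool r) : pool(p), bytes(b), reused(r)
    { if (!reused) pool->used += bytes; }
    ~mem_handle() { if (!reused) pool->used -= bytes; }  // reused views never double-subtract
};
int main()
{
    pool_counter pc;
    {
        mem_handle owner(&pc, 1024, /*reused=*/false);   // counted once
        mem_handle view(&pc, 1024, /*reused=*/true);     // e.g. a reinterpreted buffer
        assert(pc.used == 1024);
    }
    assert(pc.used == 0);                                // subtracted exactly once
    return 0;
}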
- + refcounted_obj_ptr alloc_memory(const layout& layout); static bool has_conflict(const memory_set&, const std::set&, uint32_t); @@ -122,7 +122,7 @@ class memory_pool uint64_t _max_peak_memory_used; public: memory_pool(engine_impl& engine); - + ~memory_pool(); refcounted_obj_ptr get_memory(const layout& layout, const primitive_id& id, uint32_t network_id, const std::set& restrictions, bool reusable = true); // get from pool or create memory allocation refcounted_obj_ptr get_memory(const layout& layout); refcounted_obj_ptr get_from_non_padded_pool(const layout& layout, const primitive_id& id, uint32_t network_id, const std::set&); diff --git a/inference-engine/thirdparty/clDNN/src/include/meta_utils.h b/inference-engine/thirdparty/clDNN/src/include/meta_utils.h index ad18786..de1c55a 100644 --- a/inference-engine/thirdparty/clDNN/src/include/meta_utils.h +++ b/inference-engine/thirdparty/clDNN/src/include/meta_utils.h @@ -62,4 +62,4 @@ struct is_internal_primitive : public std::integral_constant::type>::value> {}; } -} +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/include/network_impl.h b/inference-engine/thirdparty/clDNN/src/include/network_impl.h index 4874d37..b9b26e0 100644 --- a/inference-engine/thirdparty/clDNN/src/include/network_impl.h +++ b/inference-engine/thirdparty/clDNN/src/include/network_impl.h @@ -39,6 +39,7 @@ struct network_impl : public refcounted_obj public: network_impl(const program_impl& program, bool is_internal = false); network_impl(engine_impl& engine, const topology_impl& topo, const build_options& options = build_options(), bool is_internal = false); + network_impl(engine_impl& engine, const std::set>& nodes, const build_options & options, bool is_internal); const program_impl& get_program() const { return *_program; } engine_impl& get_engine() const { return _program->get_engine(); } @@ -61,19 +62,19 @@ public: std::vector get_all_primitive_ids() const; std::vector get_all_primitive_org_ids() const; void execute(const std::vector& events); - + void validate_primitives(); // Implementation specific calls std::shared_ptr get_primitive(const primitive_id& id); std::string get_primitive_info(const primitive_id& id) const; const event_impl::ptr& get_primitive_event(const primitive_id& id) const { return _events.at(id); } std::vector> get_primitives(const std::vector& ids); std::vector> get_primitives(const std::vector& nodes); - event_impl::ptr execute_primitive(const std::shared_ptr& primitive, const std::vector& events); + void execute_primitive(const std::shared_ptr& primitive, const std::vector& events); void allocate_primitives(); void build_insts_deps(); uint32_t get_id() const { return net_id; } + void build_exec_order(); bool is_internal() const { return _internal; } - private: uint32_t net_id = 0; const program_impl::cptr _program; @@ -89,6 +90,10 @@ private: std::unordered_map _events; void allocate_primitive_instance(program_node const& node); + void add_to_exec_order(const primitive_id& id); + std::shared_ptr find_in_internal_networks(const primitive_id& id); + std::shared_ptr find_primitive(const primitive_id& id); + void check_names(); }; } diff --git a/inference-engine/thirdparty/clDNN/src/include/one_hot_inst.h b/inference-engine/thirdparty/clDNN/src/include/one_hot_inst.h new file mode 100644 index 0000000..21157be --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/include/one_hot_inst.h @@ -0,0 +1,53 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the 
"License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once + +#include + +#include "primitive_inst.h" + + +namespace cldnn +{ + template <> + struct typed_program_node : typed_program_node_base + { + private: + using parent = typed_program_node_base; + + public: + using parent::parent; + + typed_program_node(const std::shared_ptr prim, program_impl& prog) : parent(prim, prog) { support_padding(true); } + program_node& input() const { return get_dependency(0); } + }; + + using one_hot_node = typed_program_node; + + + template <> + class typed_primitive_inst : public typed_primitive_inst_base + { + using parent = typed_primitive_inst_base; + + public: + static layout calc_output_layout(one_hot_node const& node); + static std::string to_string(one_hot_node const& node); + typed_primitive_inst(network_impl& network, one_hot_node const& node); + }; + + using one_hot_inst = typed_primitive_inst; +} diff --git a/inference-engine/thirdparty/clDNN/src/include/pass_manager.h b/inference-engine/thirdparty/clDNN/src/include/pass_manager.h new file mode 100644 index 0000000..f295d1d --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/include/pass_manager.h @@ -0,0 +1,276 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#pragma once + +#include "program_impl.h" +#include "layout_optimizer.h" + +namespace cldnn +{ + class base_pass + { + friend class pass_manager; + public: + base_pass(const std::string& pass_name) : name(pass_name) {} + virtual void run(program_impl& p) = 0; + std::string get_name() { return name; } + void clean_marks(program_impl& p) { + for (auto& node : p.get_processing_order()) + { + node->unmark(); + } + } + private: + const std::string name; + }; + + class pass_manager + { + public: + pass_manager() + { + pass_count = 0; + } + void run(program_impl& p, base_pass& pass) + { + pass.run(p); + std::string dump_file_name; + if (pass_count < 10) + dump_file_name += "0"; + dump_file_name += std::to_string(pass_count) + "_" + pass.get_name(); + p.dump_program(dump_file_name.c_str(), true); + pass.clean_marks(p); + pass_count++; + } + uint32_t get_pass_count() { return pass_count; } + uint32_t inc_pass_count() { return ++pass_count; } + ~pass_manager() {} + private: + uint32_t pass_count; + }; + + class add_required_reorders : public base_pass + { + public: + add_required_reorders() : base_pass("add_required_reorders") {} + private: + virtual void run(program_impl& p) override; + void add_reorder(program_impl& p, program_node* node, program_node* usr, layout reorder_layout); + }; + + class add_reshape_to_primitives : public base_pass + { + public: + add_reshape_to_primitives() : base_pass("add_reshape_to_primitives_pass") {} + private: + virtual void run(program_impl& p) override; + }; + + class calculate_prior_boxes : public base_pass + { + public: + calculate_prior_boxes() : base_pass("calculated_prior_boxes") {} + private: + virtual void run(program_impl& p) override; + }; + + class compile_graph: public base_pass + { + public: + compile_graph() : base_pass("compile_graph") {} + private: + virtual void run(program_impl& p) override; + }; + + class eltwise_shrinking : public base_pass + { + public: + eltwise_shrinking() : base_pass("eltwise_shrinking") {} + private: + virtual void run(program_impl& p) override; + }; + + class eltwise_remove_stride : public base_pass + { + public: + eltwise_remove_stride() : base_pass("eltwise_remove_stride") {} + private: + virtual void run(program_impl& p) override; + void conv_stride_extend(program_impl & p, program_node & node, cldnn::tensor & tensor); + }; + + class graph_initializations : public base_pass + { + public: + graph_initializations() : base_pass("init") {} + private: + virtual void run(program_impl& p) override; + void replace_nodes(program_impl& p); + void handle_detection_output(program_impl& p); + void handle_lstm(program_impl& p); + void set_outputs(program_impl& p); + }; + + class handle_input_padding : public base_pass + { + public: + handle_input_padding() : base_pass("handle_input_padding") {} + private: + virtual void run(program_impl& p) override; + }; + + class mark_nodes : public base_pass + { + public: + mark_nodes() : base_pass("analyzed_graph") {} + private: + virtual void run(program_impl& p) override; + void mark_constants(program_impl& p); + void mark_data_flow(program_impl& p); + }; + + class prepare_buffer_fusing : public base_pass + { + public: + prepare_buffer_fusing() : base_pass("prepare_buffer_fusing") {} + private: + virtual void run(program_impl& p) override; + }; + + class prepare_conv_eltw_fusing : public base_pass + { + public: + prepare_conv_eltw_fusing() : base_pass("prepare_conv_eltw_fusing") {} + private: + virtual void run(program_impl& p) override; + void fuse_conv_eltwise(program_impl& p, 
program_node* node); + }; + + class prepare_conv_eltw_read_write_opt : public base_pass + { + public: + prepare_conv_eltw_read_write_opt() : base_pass("prepare_conv_eltw_read_write_opt") {} + private: + virtual void run(program_impl& p) override; + void conv_eltwise_read_write_opt(program_impl& p, program_node* node); + }; + + class prepare_depthwise_sep_opt : public base_pass + { + public: + prepare_depthwise_sep_opt() : base_pass("prepare_depthwise_sep_opt") {} + private: + virtual void run(program_impl& p) override; + template void optimize_depthwise_sep_pre(T& node); + }; + + class prep_opt_depthwise_sep_post : public base_pass + { + public: + prep_opt_depthwise_sep_post() : base_pass("prep_opt_depthwise_sep_post") {} + private: + virtual void run(program_impl& p) override; + template void optimize_depthwise_sep_pre(program_impl& p, T& node); + }; + + class prepare_primitive_fusing : public base_pass + { + public: + prepare_primitive_fusing() : base_pass("prepare_primitive_fusing") {} + private: + virtual void run(program_impl& p) override; + void fuse_skip_layers(program_impl& p, program_node* node); + void fuse_conv_bn_scale(program_impl& p, program_node* node); + }; + + class pre_optimize_bias : public base_pass + { + public: + pre_optimize_bias(layout_optimizer& lo_ref); + private: + virtual void run(program_impl& p) override; + virtual void run(program_impl& p, layout_optimizer& lo); + template + void optimize_bias(T& node, layout_optimizer& lo, program_impl& p); + layout_optimizer& _lo; + }; + + class prepare_padding : public base_pass + { + public: + prepare_padding(bool output_size_handling_enabled_switch) : base_pass("prepare_padding"), + output_size_handling_enabled(output_size_handling_enabled_switch) {} + private: + virtual void run(program_impl& p) override; + bool output_size_handling_enabled; + }; + + class post_optimize_weights : public base_pass + { + public: + post_optimize_weights(layout_optimizer& lo_ref); + private: + virtual void run(program_impl& p) override; + virtual void run(program_impl& p, layout_optimizer& lo); + template + void optimize_weights(T& node, layout_optimizer& lo, program_impl& p); + layout_optimizer& _lo; + }; + + class propagate_constants : public base_pass + { + public: + propagate_constants() : base_pass("propagate_constants") {} + private: + virtual void run(program_impl& p) override; + std::list> calculate(engine_impl &engine); + bool has_non_const_user(program_node& node) const; + void handle_constant(program_impl& prog, program_node& node); + void add_constant(program_impl& prog, program_node& node); + void add_deps_to_tpl(program_impl& prog, const std::vector& node); + + bool has_non_trivial_constants = false; + std::list*> const_inputs; + std::vector const_outputs; + std::set> nodes; + }; + + class remove_redundant_reorders : public base_pass + { + public: + remove_redundant_reorders() : base_pass("remove_redundant_reorders") {} + virtual void run(program_impl& p) override; + }; + + class reorder_inputs : public base_pass + { + public: + reorder_inputs(layout_optimizer& lo_ref); + private: + virtual void run(program_impl& p) override; + virtual void run(program_impl& p, layout_optimizer& lo); + layout_optimizer& _lo; + }; + + class trim_to_outputs : public base_pass + { + public: + trim_to_outputs() : base_pass("trimmed") {} + private: + virtual void run(program_impl& p) override; + }; +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/include/permute_inst.h 
b/inference-engine/thirdparty/clDNN/src/include/permute_inst.h index bb76c9a..e538eda 100644 --- a/inference-engine/thirdparty/clDNN/src/include/permute_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/permute_inst.h @@ -26,6 +26,7 @@ template <> struct typed_program_node : public typed_program_node_base { using parent = typed_program_node_base; + typed_program_node(const std::shared_ptr prim, program_impl& prog) : parent(prim, prog) { support_padding(true); } public: using parent::parent; diff --git a/inference-engine/thirdparty/clDNN/src/include/pooling_inst.h b/inference-engine/thirdparty/clDNN/src/include/pooling_inst.h index 2956667..2bdc444 100644 --- a/inference-engine/thirdparty/clDNN/src/include/pooling_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/pooling_inst.h @@ -26,6 +26,7 @@ template <> struct typed_program_node : public typed_program_node_base { using parent = typed_program_node_base; + typed_program_node(const std::shared_ptr prim, program_impl& prog) : parent(prim, prog) { support_padding(true); } public: using parent::parent; diff --git a/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h b/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h index 563e6d1..0a7b9f3 100644 --- a/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h @@ -21,7 +21,6 @@ #include "api/CPP/concatenation.hpp" #include "event_impl.h" -#include "program_impl.h" #include "memory_impl.h" #include "meta_utils.h" #include "kernel_selector_helper.h" @@ -53,17 +52,18 @@ struct primitive_impl // A special member function is user-provided if it is user-declared and not explicitly defaulted or deleted // on its first declaration. primitive_impl() : _weights_reorder_params() {} - primitive_impl(const kernel_selector::weights_reorder_params& params, std::string kernel_name = "") : _weights_reorder_params(params), kernel_name(kernel_name) {} + primitive_impl(const kernel_selector::weights_reorder_params& params, std::string kernel_name = "") : _weights_reorder_params(params), _kernel_name(kernel_name) {} virtual ~primitive_impl() = default; virtual event_impl::ptr execute(const std::vector& events, primitive_inst& instance) = 0; - - std::string get_kernel_name() { return kernel_name; }; - + virtual bool validate(const primitive_inst& instance) const = 0; + std::string get_kernel_name() const { return _kernel_name; }; // TODO: added a derived class for weights reordering (maybe for all static data reordering) const kernel_selector::weights_reorder_params _weights_reorder_params; + // class typed_primitive_gpu_impl overrides this to return false; + virtual bool is_cpu() const { return true; } private: - std::string kernel_name; + std::string _kernel_name; }; /* @@ -92,12 +92,12 @@ public: primitive_id id() const { return _node.id(); } primitive_id org_id() const { return _node.get_org_primitive_id(); } bool can_be_optimized() const { return _node.can_be_optimized(); } - const std::shared_ptr desc() const { return _node.get_primitive(); } + std::shared_ptr desc() const { return _node.get_primitive(); } network_impl& get_network() const { return _network; } uint32_t get_network_id() const; //return pointer to const to prevent arbitrary 'execute' call -> use primitive_inst.execute() instead - const primitive_impl* get_impl() const { return _impl.get(); } + primitive_impl* get_impl() const { return _impl.get(); } memory_impl& input_memory(size_t index = 0) const { @@ -107,7 +107,7 @@ public: }
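// The pass_manager/base_pass framework introduced in pass_manager.h above turns
// each graph transformation into a named pass; pass_manager::run executes it,
// dumps the program under a numbered file name, and clears node marks. A hedged
// sketch of a custom pass plugging into that contract (the body is illustrative
// only, and assumes program_node exposes a mark() matching the unmark() used by
// clean_marks):
class mark_all_nodes : public cldnn::base_pass
{
public:
    mark_all_nodes() : base_pass("mark_all_nodes") {}
private:
    void run(cldnn::program_impl& p) override
    {
        // marks set here are temporary: pass_manager::run calls clean_marks afterwards
        for (auto& node : p.get_processing_order())
            node->mark();
    }
};
// usage: cldnn::pass_manager pm; mark_all_nodes pass; pm.run(program, pass);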
event_impl::ptr execute(const std::vector& events); - + bool validate() const { return _impl->validate(*this); } bool output_changed() const { return _output_changed; } void reset_output_change() { _output_changed = false; } @@ -150,7 +150,8 @@ protected: /* Base class for all implementation of specified primitive type. -For example, all convolution implementations should derive from typed_primitive_impl. +For example, all CPU convolution implementations should derive directly from typed_primitive_impl. +GPU implementations should derive from typed_primitive_gpu_impl. */ template struct typed_primitive_impl : public primitive_impl @@ -158,7 +159,6 @@ struct typed_primitive_impl : public primitive_impl static_assert(meta::is_primitive::value, "PType should be a non-const, non-volatile class derived from primitive"); using primitive_impl::primitive_impl; - private: event_impl::ptr execute(const std::vector>& event, primitive_inst& instance) override { @@ -169,8 +169,23 @@ private: return execute_impl(event, reinterpret_cast&>(instance)); } - virtual event_impl::ptr execute_impl(const std::vector& event, typed_primitive_inst& instance) = 0; + + virtual bool validate(const primitive_inst& instance) const override + { + if (instance.type() != PType::type_id()) + throw std::invalid_argument("Implementation type does not match primitive type"); + if (instance.get_impl() != this) + throw std::invalid_argument("Trying to validate primitive implementation with mismatching primitive instance"); + + return validate_impl(reinterpret_cast&>(instance)); + } + virtual bool validate_impl(const typed_primitive_inst&) const + { + return true; + } + + }; namespace details diff --git a/inference-engine/thirdparty/clDNN/src/include/primitive_type.h b/inference-engine/thirdparty/clDNN/src/include/primitive_type.h index 2b19e4a..1347a44 100644 --- a/inference-engine/thirdparty/clDNN/src/include/primitive_type.h +++ b/inference-engine/thirdparty/clDNN/src/include/primitive_type.h @@ -40,6 +40,8 @@ struct cldnn_primitive_type virtual std::shared_ptr create_node(cldnn::program_impl& program, const std::shared_ptr prim) const = 0; virtual std::shared_ptr create_instance(cldnn::network_impl& network, const cldnn::program_node& node) const = 0; virtual std::unique_ptr choose_impl(cldnn::engine_impl& engine, const cldnn::program_node& node) const = 0; + virtual bool does_an_implementation_exist(cldnn::engine_impl& engine, const cldnn::program_node& node) const = 0; + virtual bool does_possible_implementation_exist(cldnn::engine_impl& engine, const cldnn::program_node& node) const = 0; virtual cldnn::layout calc_output_layout(const cldnn::program_node& node) const = 0; virtual std::string to_string(const cldnn::program_node& node) const = 0; diff --git a/inference-engine/thirdparty/clDNN/src/include/primitive_type_base.h b/inference-engine/thirdparty/clDNN/src/include/primitive_type_base.h index 2f4f745..91a9dec 100644 --- a/inference-engine/thirdparty/clDNN/src/include/primitive_type_base.h +++ b/inference-engine/thirdparty/clDNN/src/include/primitive_type_base.h @@ -63,6 +63,20 @@ struct primitive_type_base : ::cldnn_primitive_type return engine.create_primitive_impl(node.as()); } + bool does_an_implementation_exist(engine_impl& engine, const cldnn::program_node& node) const override + { + if (node.type() != this) + throw std::invalid_argument("primitive_type_base::does_an_implementation_exist: primitive type mismatch"); + return engine.does_an_implementation_exist(node.as()); + } + + bool
does_possible_implementation_exist(engine_impl& engine, const cldnn::program_node& node) const override + { + if (node.type() != this) + throw std::invalid_argument("primitive_type_base::does_possible_implementation_exist: primitive type mismatch"); + return engine.does_possible_implementation_exist(node.as()); + } + cldnn::layout calc_output_layout(const cldnn::program_node& node) const override { if (node.type() != this) diff --git a/inference-engine/thirdparty/clDNN/src/include/program_dump_graph.h b/inference-engine/thirdparty/clDNN/src/include/program_dump_graph.h index 0ee0e67..2e61ffd 100644 --- a/inference-engine/thirdparty/clDNN/src/include/program_dump_graph.h +++ b/inference-engine/thirdparty/clDNN/src/include/program_dump_graph.h @@ -18,7 +18,7 @@ #include "program_impl.h" #include "program_node.h" -#include "data_inst.h" +#include "gpu/ocl_toolkit.h" #include namespace cldnn @@ -30,7 +30,4 @@ namespace cldnn void dump_graph_processing_order(std::ofstream&, const program_impl&); void dump_graph_init(std::ofstream&, const program_impl&, std::function const&); void dump_graph_info(std::ofstream&, const program_impl&, std::function const&); - void dump_to_xml(std::ofstream& graph, const program_impl& program, std::function const& filter, std::vector& offsets, std::vector& data_names); - void dump_kernels(kernels_binaries_container program_binaries, std::vector& offsets, std::vector& data_names, std::ofstream& file_stream); - void dump_data(memory_impl& mem, std::ofstream& stream, unsigned long long& total_offset, unsigned long long type); } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/include/program_helpers.h b/inference-engine/thirdparty/clDNN/src/include/program_helpers.h new file mode 100644 index 0000000..1c7cb1e --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/include/program_helpers.h @@ -0,0 +1,114 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "program_node.h" +#include "engine_impl.h" +#include "program_impl.h" + +namespace cldnn +{ + struct program_helpers + { + //helper function which creates single-element array if it's given anything + //other than std::vector. + //It should be used in generic code when there's a need to force vector usage + //in a foreach loop over a variable which can be a vector in one context and a scalar + //in another. + //example: + // T t; + // for (auto& string : wrap_if_single(t.dump())) + //depending on type T, t.dump() may return either std::string or std::vector; + //to ensure compatibility between these cases, wrap_if_single will create a single-element + //container when t.dump() returns a plain std::string.
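// The two queries added to cldnn_primitive_type above bottom out in
// implementation_map's check and check_io_eq: the first keys off the node's
// current input layout, the second builds the key from the node's output
// layout. A hedged sketch of how a caller might combine them before deciding to
// insert a reorder (engine and node are assumed to come from an existing program):
inline bool has_usable_impl(cldnn::engine_impl& engine, cldnn::program_node& node)
{
    if (node.type()->does_an_implementation_exist(engine, node))
        return true;  // a kernel matches the node as-is
    // otherwise: does any kernel accept the node's output layout, i.e. could a
    // preceding reorder make the node implementable?
    return node.type()->does_possible_implementation_exist(engine, node);
}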
+ // + // T& case -> returns container which holds T& + template + static program_impl::single_element_container wrap_if_single(T& t) + { + return program_impl::single_element_container(t); + } + + //helper function which creates single-element array if it's given anything + //other than std::vector. + // T const& case -> returns container which holds T const& + template + static program_impl::single_element_container wrap_if_single(T const& t) + { + return program_impl::single_element_container(t); + } + + //helper function which creates single-element array if it's given anything + //other than std::vector. + // T&& case -> returns container which holds new instance of T created by moving given param + template + static program_impl::single_element_container wrap_if_single(T&& t) + { + static_assert(meta::always_false::value, "Wrapping temporary object into single_element_container is an error (requires valid reference)"); + return program_impl::single_element_container(t); + } + + //helper function which creates single-element array if it's given anything + //other than std::vector. + // std::vector case -> does not wrap, returns t as-is + static const primitive::fixed_size_vector_ref& wrap_if_single(primitive::fixed_size_vector_ref const& t) + { + return t; + } + + //helper function for selecting a function based on the type of the given primitive + //this is the termination case for parameter pack recursion, see overload below for logic + template + static void do_for_types(program_node&) + { + return; + } + + //helper function for selecting a function based on the type of the given primitive + //this function should be explicitly given a set of types and implicitly a set of functions. + //both sets should have equal size. The first function will be called if the type of the given + //primitive matches the first explicitly given type, the second if it matches the second explicitly + //given type, etc. + //Functions given as arguments should themselves take std::shared_ptr as argument + //where T is the type that should match if this function is to be called + // + //example: + // do_for_types< + // convolution, + // pooling + // >(primitive, + // [](typed_program_node&){ do something if 'primitive' is a convolution }, + // [](typed_program_node&) { do something if 'primitive' is a pooling } + // ); + template + static decltype(static_cast(std::declval()(std::declval&>()))) do_for_types( + program_node& node, + Func const& func, + RestOfFuncs const&... rest) + { + if (node.type() == T::type_id()) + func(node.as()); + else + do_for_types(node, rest...); + } + static void merge_buffers(engine_impl &engine, program_node &node, layout target_layout, size_t begin_offset, size_t end_offset); + static layout get_weights_layout(typed_program_node &data_node, int32_t split); + static std::pair are_layouts_identical(layout const& l1, layout const& l2); + }; +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/include/program_impl.h b/inference-engine/thirdparty/clDNN/src/include/program_impl.h index c3cb673..c518d9c 100644 --- a/inference-engine/thirdparty/clDNN/src/include/program_impl.h +++ b/inference-engine/thirdparty/clDNN/src/include/program_impl.h @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License.
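Taken together, wrap_if_single and do_for_types let a generic pass iterate a value that may or may not be a vector and branch on a node's runtime primitive type without manual casts. A rough usage sketch, expanding the comment example above (the surrounding pass code and the 't' object are hypothetical):

// Iterates uniformly whether t.dump() yields std::string or std::vector<std::string>.
for (auto& line : program_helpers::wrap_if_single(t.dump()))
    stream << line << '\n';

// Dispatches on the runtime type of 'node'; exactly one lambda is invoked.
program_helpers::do_for_types<convolution, pooling>(node,
    [](typed_program_node<convolution>& conv) { /* convolution-specific work */ },
    [](typed_program_node<pooling>& pool)     { /* pooling-specific work */ });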
@@ -15,136 +15,206 @@ */ /////////////////////////////////////////////////////////////////////////////////////////////////// + #pragma once + #include "api/CPP/program.hpp" #include "refcounted_obj.h" -#include "topology_impl.h" #include "engine_impl.h" -#include "program_node.h" -#include "memory_impl.h" #include -#include namespace cldnn { +struct topology_impl; struct primitive_impl; +struct program_node; class layout_optimizer; -class constants_propagator; - +class pass_manager; +class program_impl_wrapper; /* cldnn_program implementation */ struct program_impl : public refcounted_obj { - friend struct program_node; - + friend class calculate_prior_boxes; // to be removed when possible + friend class graph_initializations; // to be removed when possible + friend class prepare_padding; // to be removed when possible + friend class propagate_constants; // to be removed when possible + friend class prepare_primitive_fusing; // to be removed when possible + friend class prepare_conv_eltw_fusing; // to be removed when possible + friend class prepare_conv_eltw_read_write_opt; // to be removed when possible + friend class reorder_inputs; // to be removed when possible + friend class program_impl_wrapper; // this class is intended to extend the interface of program_impl for + // the usage within tests_core_internal project only public: - program_impl(engine_impl& engine_ref, topology_impl const& topology, build_options const& options, bool is_internal); - - void dump_memory_pool() const; - - engine_impl& get_engine() const { return *engine; } - build_options get_options() const { return options; } - bool is_debug_build() const { return options.get()->enabled(); } - - std::list> get_nodes() const; - std::list get_processing_order() const { return processing_order; } - std::list get_optimized_out() const { return optimized_out; } - program_node& get_node(primitive_id const& id) + struct nodes_ordering { - try + public: + typedef std::list list_of_nodes; + typedef list_of_nodes::const_iterator const_iterator; + typedef list_of_nodes::iterator node_iterator; + const_iterator begin() const { return _processing_order.begin(); } + const_iterator end() const { return _processing_order.end(); } + + void calc_processing_order_visit(program_node* node); + void calc_processing_order(program_impl& p); + int32_t get_processing_number(program_node* node) const { return get_processing_number(get_processing_iterator(*node)); } + // int32_t get_processing_number(const_iterator iter) const { return 1+(int32_t)std::distance(begin(), iter); } + int32_t get_processing_number(node_iterator iter) const { return 1 + (int32_t)std::distance(_processing_order.begin(), const_iterator(iter)); } + void calculate_BFS_processing_order(); + size_t size() { return _processing_order.size(); } + bool is_correct(program_node* node); + + node_iterator get_processing_iterator(program_node& node) const { - return *nodes_map.at(id); + return processing_order_iterators.at(&node); } - catch (...) 
+ void clear() { - throw std::runtime_error("Program doesn't contain primtive node: " + id); + processing_order_iterators.clear(); + _processing_order.clear(); } - } - bool has_node(const primitive_id& prim) const - { - return nodes_map.count(prim) > 0; - } + void insert(program_node* key_node, program_node* node) + { + node_iterator _where = processing_order_iterators.at(key_node); + processing_order_iterators[node] = _processing_order.insert(_where, node); + } - program_node const& get_node(primitive_id const& id) const - { - try + void insert_next(program_node* key_node, program_node* node) { - return *nodes_map.at(id); + node_iterator _where = std::next(processing_order_iterators.at(key_node)); + processing_order_iterators[node] = _processing_order.insert(_where, node); } - catch (...) + + void erase(program_node* key_node) { - throw std::runtime_error("Program doesn't contain primtive node: " + id); + node_iterator i = processing_order_iterators.at(key_node); + processing_order_iterators.erase(key_node); + _processing_order.erase(i); } - } + + private: + list_of_nodes _processing_order; + std::map processing_order_iterators; + }; + + template + struct single_element_container + { + single_element_container(T& t) : elem(&t) + {} + constexpr size_t size() const { return 1; } + single_element_container begin() const { return single_element_container(elem); } + single_element_container end() const { return single_element_container(nullptr); } + single_element_container& operator ++() { elem = nullptr; return *this; } + bool operator !=(single_element_container const& sec) { return elem != sec.elem; } + + T operator *() { return *elem; } + + private: + single_element_container(T* t) : elem(t) + {} + + T* elem; + }; + program_impl(engine_impl& engine_ref, topology_impl const& topology, build_options const& options, bool is_internal, bool no_optimizations=false); + /* constructor used to build a program from subset of nodes of other program (used in propagate_constants) */ + program_impl(engine_impl& engine_ref, std::set> const &nodes, build_options const& options, bool is_internal); + ~program_impl(); + engine_impl& get_engine() const { return *engine; } + const build_options& get_options() const { return options; } + std::list& get_inputs() { return inputs; } // ToDo: redesign trim to output pass to make it const as well as get_engine and get options + std::vector& get_outputs() { return outputs; } // ToDo: redesign reorder-inputs pass to make it const as well as get_engine and get options + bool is_debug_build() const { return options.get()->enabled(); } + const nodes_ordering& get_processing_order() const; + nodes_ordering& get_processing_order(); + const std::list& get_optimized_out() const { return optimized_out; } + bool has_node(const primitive_id& prim) const { return nodes_map.count(prim) > 0; } + program_node& get_node(primitive_id const& id); + program_node const& get_node(primitive_id const& id) const; + std::shared_ptr get_node_ptr(const primitive_id& prim) { return nodes_map.at(prim); } + std::shared_ptr get_node_ptr(const primitive_id& prim) const { return nodes_map.at(prim); } + void dump_memory_pool() const; +
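The builder API declared just below (get_or_create plus the add_intermediate overloads) is what the friend passes listed above are built on. As an illustrative fragment only (the primitive id, the exact reorder constructor arguments, and the pass context are assumptions, not code from this patch), a pass that needs to change the layout feeding a node might do:

// Sketch: splice a reorder between 'node' and its dependency at index 0.
auto prim = std::make_shared<reorder>("node_reorder", node.get_dependency(0).id(), expected_layout);
auto& reorder_node = program.get_or_create(prim); // reuses the node if it already exists
program.add_intermediate(reorder_node, node, 0);  // rewires users and dependencies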
+ //returns already existing program_node for given primitive 'prim' (lookup in 'nodes_map') + //if it was previously created, otherwise creates and then returns program_node + program_node& get_or_create(std::shared_ptr prim); + + // Inserts given program_node 'node' as an intermediate node between 'next' and its + // dependency at 'prev_idx' index. + void add_intermediate(program_node& node, program_node& next, size_t prev_idx, + bool connect_int_node_with_old_dep = true, + bool move_usrs_of_prev_to_node = false); + + // Gets or creates program_node for given primitive 'prim' and inserts it as an intermediate + // node between 'next' and its dependency at 'prev_idx' index. + void add_intermediate(std::shared_ptr prim, program_node& next, size_t prev_idx, + bool connect_int_node_with_old_dep = true, + bool move_usrs_of_prev_to_node = false); + + //removes a node from the graph and deletes it afterwards, + //prereq: node cannot be marked as output and has to have exactly one dependency + //returns whether 'node' has been extracted and removed successfully + bool extract_and_remove(program_node& node); + + //returns whether 'node' has been removed + bool remove_if_dangling(program_node& node); + + void mark_if_constant(program_node& node); + // mark if the node is in data flow assuming that all dependencies are marked properly + void mark_if_data_flow(program_node& node); + //Reverses connection - user becomes dependency. + + void remove_nodes(std::list& to_remove); + void dump_program(const char* stage, bool with_full_info, std::function const& filter = nullptr) const; private: uint32_t prog_id = 0; - engine_impl::ptr engine; build_options options; - std::list inputs; std::vector outputs; - std::list processing_order; + nodes_ordering processing_order; + std::unique_ptr pm; std::map> nodes_map; - std::list optimized_out; - // TODO: Remove once we will get full support for input/output padding in all primitive implementations. - bool output_size_handling_enabled; - /* ** High-level functions, in order of usage */ - void init_graph(topology_impl const& topology); - void pre_optimize_graph(); - void post_optimize_graph(); - void compile_graph(); + /* build nodes internal structure based on topology */ + void prepare_nodes(topology_impl const& topology); + /* build nodes internal structure based on the subset of nodes of other program (used in propagate_constants) */ + void prepare_nodes(std::set> const& nodes); + void add_node_dependencies(program_node* node_ptr); + void copy_node_dependencies(program_node* dest, program_node* src); + void build_program(bool is_internal); + void init_graph(); + void set_options(); + + void run_graph_compilation(); + void pre_optimize_graph(bool is_internal); + void post_optimize_graph(bool is_internal); void cleanup(); /* - ** Initialization functions - */ - void set_outputs(); - void calc_processing_order(); - void calc_prior_boxes(); - - /* ** Analysis functions */ - void mark_constants(); - void mark_data_flow(); // TODO: Remove once we will get full support for input/output padding in all primitive implementations.
- void analyze_output_size_handling_need(); - void replace_nodes_pre(); - void replace_nodes_post(); - void handle_lstm(); + bool analyze_output_size_handling_need(); + + // handle split, deconvolution and upsampling void handle_reshape(); /* ** Optimization functions */ - void trim_to_outputs(); - void remove_redundant_reorders(); - void calculate_BFS_processing_order(); - void reorder_inputs(layout_optimizer& lo); - void pre_optimize_bias(layout_optimizer& lo); - void post_optimize_weights(layout_optimizer& lo); void apply_needed_padding(program_node& node, program_node& prev_node, const padding& needed_padding); - void prepare_padding(); - void propagate_constants(); - void prepare_buffer_fusing(); - void fuse_skip_layers(program_node* node); - void prepare_primitive_fusing(); - void prepare_depthwise_sep_opt(); - void prep_opt_depthwise_sep_post(); - void update_processing_numbers(); /* ** Memory pool functions @@ -158,57 +228,15 @@ private: /* ** Utilities */ + void add_split_outputs(); + // mark if the node is constant assuming that all dependencies are marked properly + void reverse_connection(program_node& dep_node, program_node& user_node); - //returns already existing program_node for given primitive 'prim' (lookup in 'nodes_map') - //if it was previously created, otherwise creates and then returns program_node - program_node& get_or_create(std::shared_ptr prim); - - // Inserts given program_node 'node' as an intermediate node between 'next' and it's - // dependency at 'prev_idx' index. - void add_intermediate(program_node& node, program_node& next, size_t prev_idx, bool connect_int_node_with_old_dep = true); - - // Gets or creates program_node for given primitive 'prim' and inserts it as an intermediate - // node between 'next' and it's dependency at 'prev_idx' index. 
- void add_intermediate(std::shared_ptr prim, program_node& next, size_t prev_idx, bool connect_int_node_with_old_dep = true) - { - add_intermediate(get_or_create(prim), next, prev_idx, connect_int_node_with_old_dep); - } - - void add_connection(program_node& prev, program_node& next) - { - prev.users.push_back(&next); - next.dependencies.push_back(&prev); - } + void add_connection(program_node& prev, program_node& next); - void remove_connection(program_node& prev, program_node& next) - { - prev.users.remove(&next); - next.dependencies.erase(std::remove(next.dependencies.begin(), next.dependencies.end(), &prev), next.dependencies.end()); - } - - void remove_all_connections(program_node& node) { - // since the graph is not topological sorted, we need to remove the node from both dependencies and users - for (auto &e : node.users) { - e->dependencies.erase(std::remove(e->dependencies.begin(), e->dependencies.end(), &node), e->dependencies.end()); - } - for(auto &e : node.dependencies) { - e->users.remove(&node); - } - node.dependencies.clear(); - node.users.clear(); - } + void remove_connection(program_node& prev, program_node& next); - bool processing_order_is_correct(program_node* node) - { - for (auto& dep : node->get_dependencies()) - { - if (node->processing_num < dep->processing_num) - { - return false; - } - } - return true; - } + void remove_all_connections(program_node& node); void rename(program_node & node, primitive_id const & new_id); void swap_names(program_node& node1, program_node& node2); @@ -216,37 +244,9 @@ private: //old_node - node which will be replaced //new_node - node which will replace the old one - //replace_whole_branch - if set to true, 'old_node' will be replaced with all its dependencies and new_node will retain its dependencies - // old's dependencies which are post-dominates by 'old_node' will also be removed - void replace(program_node& old_node, program_node& new_node, bool replace_whole_branch, bool check_output_layouts_integrity = true); - - //returns if 'node' has been removed - bool remove_if_dangling(program_node& node, bool detach_whole_branch = false); - - //removes a node from the graph and deletes it afterwards, - //prereq: node cannot be marked as output and has to have exactly one dependency - //returns if 'node' has been extracted and removed successfully - bool extract_and_remove(program_node& node); - void replace_data_with_optimized(std::map const& replace_map); - void dump_program(const char* stage, bool with_full_info, std::function const& filter = nullptr) const; - //Dumps weights and biasses in serialization process, not working yet, in progress. - void dump_weights_and_biasses(std::vector& offsets, std::vector& data_names, std::ofstream& file_stream) const; - //Makes serialization with given name. - //Placeholder, not working yet, in progress. 
- void serialize(std::string network_name, std::function const& filter = nullptr) const; - - template - void optimize_bias(T& node, layout_optimizer& lo); - - template - void optimize_weights(T& node, layout_optimizer& lo); - - template - void optimize_depthwise_sep_pre(T& node); - - template - void optimize_depthwise_sep_post(T& node); + void replace(program_node& old_node, program_node& new_node); }; + } API_CAST(::cldnn_program, cldnn::program_impl) diff --git a/inference-engine/thirdparty/clDNN/src/include/program_node.h b/inference-engine/thirdparty/clDNN/src/include/program_node.h index e9df77a..42cec07 100644 --- a/inference-engine/thirdparty/clDNN/src/include/program_node.h +++ b/inference-engine/thirdparty/clDNN/src/include/program_node.h @@ -22,12 +22,12 @@ #include "meta_utils.h" -#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) - namespace cldnn { struct program_impl; +class reorder_inputs; +class graph_initializations; template struct typed_program_node; @@ -51,8 +51,14 @@ class xml_composite; */ struct program_node { - friend struct program_impl; - friend class constants_propagator; + friend struct program_impl; // to be removed when possible + friend class compile_graph; // to be removed when possible + friend class graph_initializations; // to be removed when possible + friend class prepare_primitive_fusing; // to be removed when possible + friend class prepare_conv_eltw_fusing; // to be removed when possible + friend class prepare_conv_eltw_read_write_opt; // to be removed when possible + friend class propagate_constants; // to be removed when possible + friend class post_optimize_weights; // to be removed when possible - requires an access to selected_impl template friend struct typed_program_node; @@ -82,10 +88,10 @@ public: std::vector const& get_dependencies() const { return dependencies; } program_node& get_dependency(size_t idx) const { return *dependencies.at(idx); } - //replaces idx-th dependency of 'this' with 'new_dep', calls program::remove_if_dangling(old_dep, detach_whole_branch) - void replace_dependency(size_t idx, program_node& new_dep, bool detach_whole_branch = false); - //searches for 'old_dep' in dependencies list of 'this' and replaces it with 'new_dep', calls program::remove_if_dangling(old_dep, detach_whole_branch) - void replace_dependency(program_node const& old_dep, program_node& new_dep, bool detach_whole_branch = false); + //replaces idx-th dependency of 'this' with 'new_dep', calls program::remove_if_dangling(old_dep) + void replace_dependency(size_t idx, program_node& new_dep); + //searches for 'old_dep' in dependencies list of 'this' and replaces it with 'new_dep', calls program::remove_if_dangling(old_dep) + void replace_dependency(program_node const& old_dep, program_node& new_dep); std::vector get_dependencies_ids() const; @@ -113,8 +119,7 @@ public: std::list const& get_users() const { return reinterpret_cast&>(users); } std::unique_ptr desc_to_json() const; - std::unique_ptr desc_to_xml() const; - //do not modify primitive directly to keep synchronisation wit graph + //do not modify primitive directly to keep synchronisation with graph std::shared_ptr get_primitive() const { return desc; } //primitive modification functions void set_output_padding(padding const& padd) @@ -132,7 +137,7 @@ public: //only calculated output layout (for external usage), does not modify/use cached output layout nor invalidate users layout calc_output_layout() const; - //uses cached output layout if vlid, if not calls 
'calc_output_layout' and stores its result + invalidate all users if layout has changed and @p invalidate_users_if_changed is set to true + //uses cached output layout if valid, if not calls 'calc_output_layout' and stores its result + invalidate all users if layout has changed and @p invalidate_users_if_changed is set to true layout get_output_layout(bool invalidate_users_if_changed = true); //returns cached output layout if valid, otherwise throws an exception layout get_output_layout() const; @@ -159,7 +164,6 @@ public: bool is_output() const { return output; } bool is_valid_output_layout() const { return valid_output_layout; } - uint32_t get_processing_num() const { return processing_num; } uint8_t mark(uint8_t val = 1) { uint8_t ret = user_mark; user_mark = val; return ret; } void unmark() { user_mark = 0; } @@ -183,19 +187,25 @@ public: return fused_activation.additional_params; } + // check/set if the node can be optimized out (removed from the network) bool can_be_optimized() const { return optimized; } void can_be_optimized(bool opt) { optimized = opt; } + // check/set if the node's buffer can be shared during the memory pool optimization + bool can_share_buffer() const { return share_buffer; } + void can_share_buffer(bool share) { share_buffer = share; } + + // check/set if the node support padding in x,y,b and f + bool support_padding() const { return _support_padding; } + void support_padding(bool support) { _support_padding = support; } + primitive_id get_org_primitive_id() const { return org_id; } - void set_org_primitive_id(primitive_id org_prim_id) - { - org_id = org_prim_id; - } bool is_constant() const { return constant; } - bool has_non_const_user() const { return (!constant || constant_frontier); } - //returns true if this node is within main data flow of the network (i.e. it does not describe helper data like convolution's weights etc.) + + // returns true if this node is within main data flow of the network (i.e. it does not describe helper data like convolution's weights etc.) 
bool is_in_data_flow() const { return data_flow; } + //conversion from generic to specific template ::value>::type> typed_program_node& as() @@ -248,28 +258,22 @@ protected: std::vector dependencies; std::list users; -#if defined(__GNUC__) && (GCC_VERSION < 40900) - std::list::iterator processing_itr; -#else - std::list::const_iterator processing_itr; -#endif - uint32_t processing_num = 0; - // list of primitives that can reuse same memory buffers due to execution order conflicts - std::set memory_dependencies; + std::set memory_dependencies; bool constant = false; - bool constant_frontier = false; bool data_flow = false; bool output = false; uint8_t user_mark = 0; bool optimized = false; + bool share_buffer = true; + bool _support_padding = false; mutable bool has_reused_memory = false; mutable uint32_t reused_memory_color = 0; - primitive_id org_id = ""; + const primitive_id org_id; struct fused_activation_params { @@ -288,8 +292,9 @@ namespace details struct api_typed_program_node_base : public program_node { static_assert(meta::is_api_primitive::value, "PType should name a non-const, non-volatile type derived from cldnn::primitive but not from cldnn::internal_primitive"); + friend class cldnn::graph_initializations; friend struct cldnn::program_impl; - + friend class cldnn::reorder_inputs; public: using program_node::program_node; @@ -369,4 +374,4 @@ struct typed_program_node : public typed_program_node_base program_node& input() const { return program_node::get_dependency(0); } }; -} +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/include/pyramid_roi_align_inst.h b/inference-engine/thirdparty/clDNN/src/include/pyramid_roi_align_inst.h new file mode 100644 index 0000000..f87b3f4 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/include/pyramid_roi_align_inst.h @@ -0,0 +1,64 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
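The share_buffer and _support_padding members introduced in program_node above are plain per-node opt-ins that typed nodes flip in their constructors; this patch does exactly that for reshape, scale and upsampling (support_padding(true), see the *_inst.h hunks below) and for input_layout (can_share_buffer(false) in input_layout.cpp). The pattern, condensed, with reshape standing in for any such node type:

// A typed node opting in to padded input/output at construction time.
typed_program_node(const std::shared_ptr<reshape> prim, program_impl& prog)
    : parent(prim, prog)
{
    support_padding(true);       // node tolerates padding in x, y, b and f
    // an input node instead opts out of memory-pool buffer sharing:
    // can_share_buffer(false);
}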
+ +#pragma once +#include "api/CPP/pyramid_roi_align.hpp" +#include "primitive_inst.h" + +#include + +namespace cldnn { + template <> + struct typed_program_node : public typed_program_node_base + { + using parent = typed_program_node_base; + + public: + typed_program_node(std::shared_ptr prim, program_impl& prog) + : parent(prim, prog) + {} + + program_node& input() const { return get_dependency(0); } + program_node& boxes() const { return get_dependency(0); } + program_node& image_meta() const { return get_dependency(1); } + program_node& P2() const { return get_dependency(2); } + program_node& P3() const { return get_dependency(3); } + program_node& P4() const { return get_dependency(4); } + program_node& P5() const { return get_dependency(5); } + program_node& pool_size() const { return get_dependency(6); } + }; + + using pyramidROIAlign_node = typed_program_node; + + template <> + class typed_primitive_inst : public typed_primitive_inst_base + { + using parent = typed_primitive_inst_base; + + public: + static layout calc_output_layout(pyramidROIAlign_node const& node); + static std::string to_string(pyramidROIAlign_node const& node); + typed_primitive_inst(network_impl& network, pyramidROIAlign_node const& node); + + memory_impl& boxes() const { return dep_memory(0); } + memory_impl& image_meta() const { return dep_memory(1); } + memory_impl& P2() const { return dep_memory(2); } + memory_impl& P3() const { return dep_memory(3); } + memory_impl& P4() const { return dep_memory(4); } + memory_impl& P5() const { return dep_memory(5); } + memory_impl& pool_size() const { return dep_memory(6); } + }; + + using pyramid_roi_align_inst = typed_primitive_inst; +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/include/reshape_inst.h b/inference-engine/thirdparty/clDNN/src/include/reshape_inst.h index 1ac9fdf..a97153a 100644 --- a/inference-engine/thirdparty/clDNN/src/include/reshape_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/reshape_inst.h @@ -26,6 +26,7 @@ template <> struct typed_program_node : public typed_program_node_base { using parent = typed_program_node_base; + typed_program_node(const std::shared_ptr prim, program_impl& prog) : parent(prim, prog) { support_padding(true); } public: using parent::parent; diff --git a/inference-engine/thirdparty/clDNN/src/include/reverse_sequence_inst.h b/inference-engine/thirdparty/clDNN/src/include/reverse_sequence_inst.h new file mode 100644 index 0000000..ac02b8e --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/include/reverse_sequence_inst.h @@ -0,0 +1,51 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include "api/CPP/reverse_sequence.hpp" +#include "primitive_inst.h" + +namespace cldnn +{ + template <> + struct typed_program_node : public typed_program_node_base + { + using parent = typed_program_node_base; + + public: + using parent::parent; + + program_node& input(size_t index = 0) const { return get_dependency(index); } + }; + + using reverse_sequence_node = typed_program_node; + + template <> + class typed_primitive_inst : public typed_primitive_inst_base + { + using parent = typed_primitive_inst_base; + + public: + static layout calc_output_layout(reverse_sequence_node const& node); + static std::string to_string(reverse_sequence_node const& node); + + public: + typed_primitive_inst(network_impl& network, reverse_sequence_node const& desc); + }; + + using reverse_sequence_inst = typed_primitive_inst; +} diff --git a/inference-engine/thirdparty/clDNN/src/include/scale_inst.h b/inference-engine/thirdparty/clDNN/src/include/scale_inst.h index 405507a..b239ef1 100644 --- a/inference-engine/thirdparty/clDNN/src/include/scale_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/scale_inst.h @@ -25,11 +25,13 @@ namespace cldnn template <> struct typed_program_node : public typed_program_node_base { +private: using parent = typed_program_node_base; public: using parent::parent; + typed_program_node(const std::shared_ptr prim, program_impl& prog) : parent(prim, prog) { support_padding(true); } program_node& input() const { return get_dependency(0); } program_node& scale_in() const { return get_dependency(1); } program_node& bias() const { return get_dependency(2); } diff --git a/inference-engine/thirdparty/clDNN/src/include/shuffle_channels_inst.h b/inference-engine/thirdparty/clDNN/src/include/shuffle_channels_inst.h new file mode 100644 index 0000000..5a633a5 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/include/shuffle_channels_inst.h @@ -0,0 +1,51 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include "api/CPP/shuffle_channels.hpp" +#include "primitive_inst.h" + +namespace cldnn +{ +template <> +struct typed_program_node : public typed_program_node_base +{ + using parent = typed_program_node_base; + +public: + using parent::parent; + + program_node& input(size_t index = 0) const { return get_dependency(index); } +}; + +using shuffle_channels_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base +{ + using parent = typed_primitive_inst_base; + +public: + static layout calc_output_layout(shuffle_channels_node const& node); + static std::string to_string(shuffle_channels_node const& node); + +public: + typed_primitive_inst(network_impl& network, shuffle_channels_node const& desc); +}; + +using shuffle_channels_inst = typed_primitive_inst; +} diff --git a/inference-engine/thirdparty/clDNN/src/include/strided_slice_inst.h b/inference-engine/thirdparty/clDNN/src/include/strided_slice_inst.h new file mode 100644 index 0000000..a12e536 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/include/strided_slice_inst.h @@ -0,0 +1,51 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include "api/CPP/strided_slice.hpp" +#include "primitive_inst.h" + +namespace cldnn +{ +template <> +struct typed_program_node : public typed_program_node_base +{ + using parent = typed_program_node_base; + +public: + using parent::parent; + + program_node& input(size_t index = 0) const { return get_dependency(index); } +}; + +using strided_slice_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base +{ + using parent = typed_primitive_inst_base; + +public: + static layout calc_output_layout(strided_slice_node const& node); + static std::string to_string(strided_slice_node const& node); + +public: + typed_primitive_inst(network_impl& network, strided_slice_node const& desc); +}; + +using strided_slice_inst = typed_primitive_inst; +} diff --git a/inference-engine/thirdparty/clDNN/src/include/to_string_utils.h b/inference-engine/thirdparty/clDNN/src/include/to_string_utils.h index 5d83c5b..f274381 100644 --- a/inference-engine/thirdparty/clDNN/src/include/to_string_utils.h +++ b/inference-engine/thirdparty/clDNN/src/include/to_string_utils.h @@ -1,5 +1,5 @@ /* -// Copyright (c) 2017 Intel Corporation +// Copyright (c) 2017-2018 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -26,33 +26,15 @@ inline std::string bool_to_str(bool cond) return cond ? 
"true" : "false"; } -inline std::string get_extr_type(const char* str) +inline std::string get_extr_type(const std::string& str) { - if (!str) - { - return{}; - } + auto begin = str.find('<'); + auto end = str.find('>'); - while (*str && *str != '<') - { - ++str; - } - if (!*str) - { - return{}; - } + if (begin == std::string::npos || end == std::string::npos) + return {}; - auto end = str; - while (*end && *end != '>') - { - ++end; - } - if (!*end) - { - return{}; - } - - return{ str + 1, end }; + return str.substr(begin + 1, (end - begin) -1); } inline std::string dt_to_str(data_types dt) @@ -60,6 +42,7 @@ inline std::string dt_to_str(data_types dt) switch (dt) { case data_types::i8: return "i8"; + case data_types::u8: return "u8"; case data_types::i32: return "i32"; case data_types::i64: return "i64"; case data_types::f16: return "f16"; @@ -73,18 +56,36 @@ inline std::string fmt_to_str(format fmt) { switch (fmt.value) { - case format::bfyx: return "bfyx"; - case format::byxf: return "byxf"; case format::yxfb: return "yxfb"; + case format::byxf: return "byxf"; + case format::bfyx: return "bfyx"; case format::fyxb: return "fyxb"; - case format::bs_x_bsv16: return "bs_x_bsv16"; + case format::os_iyx_osv16: return "os_iyx_osv16"; + case format::os_iyx_osv32: return "os_iyx_osv32"; + case format::os_iyx_osv64: return "os_iyx_osv64"; case format::bs_xs_xsv8_bsv8: return "bs_xs_xsv8_bsv8"; case format::bs_xs_xsv8_bsv16: return "bs_xs_xsv8_bsv16"; - case format::os_iyx_osv16: return "os_iyx_osv16"; + case format::bs_x_bsv16: return "bs_x_bsv16"; + case format::bf8_xy16: return "bf8_xy16"; + case format::image_2d_weights_c4_fyx_b: return "image_2d_weights_c4_fyx_b"; + case format::image_2d_weights_c1_b_fyx: return "image_2d_weights_c1_b_fyx"; + case format::winograd_2x3_s1_data: return "winograd_2x3_s1_data"; + case format::winograd_2x3_s1_weights: return "winograd_2x3_s1_weights"; + case format::winograd_2x3_s1_fused_weights: return "winograd_2x3_s1_fused_weights"; + case format::winograd_6x3_s1_fused_weights: return "winograd_6x3_s1_fused_weights"; + case format::image_2d_weights_winograd_6x3_s1_fbxyb: return "image_2d_weights_winograd_6x3_s1_fbxyb"; + case format::image_2d_weights_winograd_6x3_s1_xfbyb: return "image_2d_weights_winograd_6x3_s1_xfbyb"; case format::os_is_yx_isa8_osv8_isv4: return "os_is_yx_isa8_osv8_isv4"; + case format::os_is_yx_isa8_osv8_isv4_swizzled_by_4: return "os_is_yx_isa8_osv8_isv4_swizzled_by_4"; case format::is_o_yx_isv32: return "is_o_yx_isv32"; + case format::is_o32_yx_isv32_swizzled_by_4: return "is_o32_yx_isv32_swizzled_by_4"; + case format::os_is_y_x8_osv8_isv4: return "os_is_y_x8_osv8_isv4"; case format::byxf_af32: return "byxf_af32"; + case format::byx8_f4: return "byx8_f4"; case format::fs_bs_yx_bsv4_fsv32: return "fs_bs_yx_bsv4_fsv32"; + case format::bf_lyx_yx: return "bf_lyx_yx"; + case format::b_fs_yx_fsv4: return "b_fs_yx_fs4"; break; + case format::os_is_yx_osv16_isv4: return "os_is_yx_osv16_isv4"; break; default: return "unknown (" + std::to_string(fmt.value) + ")"; } diff --git a/inference-engine/thirdparty/clDNN/src/include/upsampling_inst.h b/inference-engine/thirdparty/clDNN/src/include/upsampling_inst.h index 2cf4d47..dd1d390 100644 --- a/inference-engine/thirdparty/clDNN/src/include/upsampling_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/upsampling_inst.h @@ -27,6 +27,7 @@ template <> struct typed_program_node : public typed_program_node_base { using parent = typed_program_node_base; + typed_program_node(const std::shared_ptr prim, 
program_impl& prog) : parent(prim, prog) { support_padding(true); } public: using parent::parent; diff --git a/inference-engine/thirdparty/clDNN/src/include/xml_object.h b/inference-engine/thirdparty/clDNN/src/include/xml_object.h deleted file mode 100644 index c32eddd..0000000 --- a/inference-engine/thirdparty/clDNN/src/include/xml_object.h +++ /dev/null @@ -1,129 +0,0 @@ -/* -// Copyright (c) 2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ -#pragma once -#include -#include -#include -#include -#include - -namespace cldnn -{ - class xml_base; - using xml_key = std::string; - using xml_base_ptr = std::shared_ptr; - using xml_map = std::unordered_map; - - class xml_base - { - public: - virtual void dump(std::ostream& out, int offset) = 0; - }; - - template - class xml_leaf : public xml_base - { - private: - Type value; - public: - xml_leaf(const Type& val) : value(val) {} - xml_leaf(Type&& val) : value(std::move(val)) {} - void dump(std::ostream& out, int) override - { - out << value; - } - }; - - template - class xml_basic_array : public xml_base - { - private: - std::vector values; - public: - xml_basic_array(const std::vector& arr) : values(arr) {} - xml_basic_array(std::vector&& arr) : values(std::move(arr)) {} - void dump(std::ostream& out, int) override - { - const char* delim = ""; - for (size_t i = 0; i < values.size(); i++) - { - out << delim << values[i]; - delim = ","; - } - } - }; - - class xml_composite : public xml_base - { - private: - xml_map children; - public: - void dump(std::ostream& out, int offset = -1) override - { - offset++; - bool first = true; - static int offset_temp; - std::string spaces(offset * 4, ' '); - if (offset!=0) out << "\n"; - for (const auto& it : children) - { - if (first) - { - out << spaces << "<" << it.first << ">"; - first = false; - } - else - out << "\n" << spaces << "<" << it.first << ">"; - - offset_temp = offset; - it.second->dump(out, offset); - - std::string spaces_behind(0, ' '); - if (offset_temp != offset) - spaces_behind = spaces; - out << spaces_behind << ""; - if (offset == 1) - { - out << spaces << "\n"; - } - }; - - if (offset > 0) - { - out << spaces << "\n"; - offset--; - } - } - - template - void add(xml_key key, Type value) - { - children[key] = std::make_shared>(value); - } - void add(xml_key key, xml_composite comp) - { - children[key] = std::make_shared(comp); - } - template - void add(xml_key key, std::vector array) - { - children[key] = std::make_shared>(array); - } - }; - - -} - diff --git a/inference-engine/thirdparty/clDNN/src/index_select.cpp b/inference-engine/thirdparty/clDNN/src/index_select.cpp index 9c14470..88acded 100644 --- a/inference-engine/thirdparty/clDNN/src/index_select.cpp +++ b/inference-engine/thirdparty/clDNN/src/index_select.cpp @@ -30,36 +30,44 @@ namespace cldnn layout index_select_inst::calc_output_layout(index_select_node const& node) { - auto desc = node.get_primitive(); + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type 
forcing is not supported for " + "index_select_node!"); + auto desc = node.get_primitive(); auto input_layout = node.input().get_output_layout(); - auto indices_layout = node.indices().get_output_layout(); - auto indices_size = indices_layout.size.spatial[0]; - - auto axis = node.get_axis(); + int32_t output_b = input_layout.size.batch[0]; int32_t output_f = input_layout.size.feature[0]; int32_t output_x = input_layout.size.spatial[0]; int32_t output_y = input_layout.size.spatial[1]; - switch (axis) - { - case index_select_axis_name::along_b: - output_b = indices_size; - break; - case index_select_axis_name::along_f: - output_f = indices_size; - break; - case index_select_axis_name::along_x: - output_x = indices_size; - break; - case index_select_axis_name::along_y: - output_y = indices_size; - break; - default: - CLDNN_ERROR_MESSAGE(node.id(), "UNSPORTTED AXIS"); - break; + if (!node.get_reverse()) { + auto indices_layout = node.indices().get_output_layout(); + auto indices_size = indices_layout.size.spatial[0]; + auto axes = node.get_axes(); + for (size_t i = 0; i < axes.size(); i++) + { + switch (axes[i]) + { + case index_select_axis_name::along_b: + output_b = indices_size; + break; + case index_select_axis_name::along_f: + output_f = indices_size; + break; + case index_select_axis_name::along_x: + output_x = indices_size; + break; + case index_select_axis_name::along_y: + output_y = indices_size; + break; + default: + CLDNN_ERROR_MESSAGE(node.id(), "UNSUPPORTED AXIS"); + break; + } + } } return layout{ input_layout.data_type, input_layout.format, { output_b, output_f, output_x, output_y } }; } @@ -71,27 +79,30 @@ namespace cldnn std::stringstream primitive_description; std::string axis_str = ""; - switch (desc->axis) + for (size_t i = 0; i < desc->axis.size(); i++) { - case index_select_axis_name::along_b: - axis_str = "along_b"; - break; - case index_select_axis_name::along_f: - axis_str = "along_f"; - break; - case index_select_axis_name::along_y: - axis_str = "along_y"; - break; - case index_select_axis_name::along_x: - axis_str = "along_x"; - break; - default: - axis_str = "not supported axis"; - break; + switch (desc->axis.at(i)) + { + case index_select_axis_name::along_b: + axis_str += "along_b, "; + break; + case index_select_axis_name::along_f: + axis_str += "along_f, "; + break; + case index_select_axis_name::along_y: + axis_str += "along_y, "; + break; + case index_select_axis_name::along_x: + axis_str += "along_x, "; + break; + default: + axis_str += "not supported axis, "; + break; + } } json_composite index_select_info; - index_select_info.add("axis", axis_str); + index_select_info.add("axes", axis_str); node_info->add("index_select_info", index_select_info); node_info->dump(primitive_description); @@ -104,17 +115,21 @@ namespace cldnn { auto& input = node.input(); auto input_layout = input.get_output_layout(); - auto& indices = node.indices(); - auto indices_layout = indices.get_output_layout(); auto const node_id = node.id(); - CLDNN_ERROR_DATA_TYPES_MISMATCH(node_id, "indicies data_type", indices_layout.data_type, "i32 data_type ", data_types::i32, ""); CLDNN_ERROR_NOT_PROPER_FORMAT(node_id, "input_format", input_layout.format, "supported input format", format::bfyx, format::yxfb); - CLDNN_ERROR_NOT_PROPER_FORMAT(node_id, "input_format", indices_layout.format, "supported indicies format", format::bfyx, format::yxfb); - CLDNN_ERROR_NOT_EQUAL(node_id, "indicies batch_size", indices_layout.size.batch[0], "expected size", 1, ""); - CLDNN_ERROR_NOT_EQUAL(node_id, "indicies 
feature_size", indices_layout.size.feature[0], "expected size", 1, ""); - CLDNN_ERROR_NOT_EQUAL(node_id, "indicies y_size", indices_layout.size.spatial[1], "expected size", 1, ""); - CLDNN_ERROR_LESS_THAN(node_id, "indicies x_size", indices_layout.size.spatial[0], "expected size", 1, ""); + + if (!node.get_reverse()) + { + auto& indices = node.indices(); + auto indices_layout = indices.get_output_layout(); + CLDNN_ERROR_DATA_TYPES_MISMATCH(node_id, "indicies data_type", indices_layout.data_type, "i32 data_type ", data_types::i32, ""); + CLDNN_ERROR_NOT_EQUAL(node_id, "indicies batch_size", indices_layout.size.batch[0], "expected size", 1, ""); + CLDNN_ERROR_NOT_EQUAL(node_id, "indicies feature_size", indices_layout.size.feature[0], "expected size", 1, ""); + CLDNN_ERROR_NOT_EQUAL(node_id, "indicies y_size", indices_layout.size.spatial[1], "expected size", 1, ""); + CLDNN_ERROR_LESS_THAN(node_id, "indicies x_size", indices_layout.size.spatial[0], "expected size", 1, ""); + CLDNN_ERROR_NOT_PROPER_FORMAT(node_id, "input_format", indices_layout.format, "supported indicies format", format::bfyx, format::yxfb); + } } } diff --git a/inference-engine/thirdparty/clDNN/src/input_layout.cpp b/inference-engine/thirdparty/clDNN/src/input_layout.cpp index 8ec055f..6fa5861 100644 --- a/inference-engine/thirdparty/clDNN/src/input_layout.cpp +++ b/inference-engine/thirdparty/clDNN/src/input_layout.cpp @@ -29,6 +29,12 @@ primitive_type_id input_layout_type_id() return &instance; } +input_layout_node::typed_program_node(const std::shared_ptr dprim, program_impl& prog) + : parent(dprim, prog) +{ + can_share_buffer(false); +} + input_layout_inst::typed_primitive_inst(network_impl& network, input_layout_node const& node) : parent(network, node) { diff --git a/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp b/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp index 67be2ce..d073d8b 100644 --- a/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp +++ b/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2016-2018 Intel Corporation +// Copyright (c) 2016-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,6 +13,14 @@ // limitations under the License. 
#include "kernel_selector_helper.h" +#include "kernel_selector_params.h" + +#include "gpu/ocl_toolkit.h" + +#include "program_node.h" +#include "program_impl.h" + +#include "training_params.h" kernel_selector::data_type to_data_type(data_types dt) { @@ -51,6 +59,7 @@ kernel_selector::weights_type to_weights_type(data_types dt) switch (dt) { case cldnn::data_types::i8: return kernel_selector::weights_type::INT8; + case cldnn::data_types::u8: return kernel_selector::weights_type::UINT8; case cldnn::data_types::f16: return kernel_selector::weights_type::F16; case cldnn::data_types::f32: return kernel_selector::weights_type::F32; default: @@ -64,6 +73,7 @@ data_types from_weights_type(kernel_selector::weights_type dt) switch (dt) { case kernel_selector::weights_type::INT8: return data_types::i8; + case kernel_selector::weights_type::UINT8: return data_types::u8; case kernel_selector::weights_type::F16: return data_types::f16; case kernel_selector::weights_type::F32: return data_types::f32; default: @@ -86,8 +96,10 @@ kernel_selector::data_layout to_data_layout(format f) case format::bf8_xy16: return kernel_selector::data_layout::bf8_xy16; case format::winograd_2x3_s1_data: return kernel_selector::data_layout::winograd_2x3_s1_data; case format::byxf_af32: return kernel_selector::data_layout::byxf_af32; + case format::byx8_f4: return kernel_selector::data_layout::byx8_f4; case format::fs_bs_yx_bsv4_fsv32: return kernel_selector::data_layout::fs_bs_yx_bsv4_fsv32; // case format::brfyx: return kernel_selector::data_layout::brfyx; + case format::b_fs_yx_fsv4: return kernel_selector::data_layout::b_fs_yx_fsv4; default: return kernel_selector::data_layout::bfyx; } @@ -109,6 +121,7 @@ cldnn::format from_data_layout(kernel_selector::data_layout l) case kernel_selector::data_layout::brfyx: return cldnn::format::bfyx; case kernel_selector::data_layout::winograd_2x3_s1_data: return cldnn::format::winograd_2x3_s1_data; case kernel_selector::data_layout::byxf_af32: return cldnn::format::byxf_af32; + case kernel_selector::data_layout::byx8_f4: return cldnn::format::byx8_f4; case kernel_selector::data_layout::fs_bs_yx_bsv4_fsv32: return cldnn::format::fs_bs_yx_bsv4_fsv32; default: return cldnn::format::bfyx; @@ -125,6 +138,8 @@ kernel_selector::weights_layout to_weights_layout(format f) case format::byxf: return kernel_selector::weights_layout::oyxi; case format::yxfb: return kernel_selector::weights_layout::yxio; case format::os_iyx_osv16: return kernel_selector::weights_layout::os_iyx_osv16; + case format::os_iyx_osv32: return kernel_selector::weights_layout::os_iyx_osv32; + case format::os_iyx_osv64: return kernel_selector::weights_layout::os_iyx_osv64; case format::bs_xs_xsv8_bsv8: return kernel_selector::weights_layout::os_i_osv8__ai8; case format::bs_xs_xsv8_bsv16: return kernel_selector::weights_layout::os_i_osv16__ai8; case format::bs_x_bsv16: return kernel_selector::weights_layout::os_i_osv16; @@ -135,8 +150,13 @@ kernel_selector::weights_layout to_weights_layout(format f) case format::winograd_6x3_s1_fused_weights: return kernel_selector::weights_layout::winograd_6x3_s1_fused_weights; case format::image_2d_weights_winograd_6x3_s1_fbxyb: return kernel_selector::weights_layout::image_2d_weights_winograd_6x3_s1_fbxyb; case format::image_2d_weights_winograd_6x3_s1_xfbyb: return kernel_selector::weights_layout::image_2d_weights_winograd_6x3_s1_xfbyb; - case format::os_is_yx_isa8_osv8_isv4: return kernel_selector::weights_layout::os_is_yx_isa8_osv8_isv4; + case format::os_is_yx_isa8_osv8_isv4: return 
kernel_selector::weights_layout::os_is_yx_isa8_osv8_isv4; + case format::os_is_yx_isa8_osv8_isv4_swizzled_by_4: return kernel_selector::weights_layout::os_is_yx_isa8_osv8_isv4_swizzled_by_4; case format::is_o_yx_isv32: return kernel_selector::weights_layout::is_o_yx_isv32; + case format::is_o32_yx_isv32_swizzled_by_4: return kernel_selector::weights_layout::is_o32_yx_isv32_swizzled_by_4; + case format::os_is_y_x8_osv8_isv4: return kernel_selector::weights_layout::os_is_y_x8_osv8_isv4; + case format::bf_lyx_yx: return kernel_selector::weights_layout::bf_lyx_yx; + case format::os_is_yx_osv16_isv4: return kernel_selector::weights_layout::os_is_yx_osv16_isv4; default: return kernel_selector::weights_layout::oi; } @@ -147,24 +167,30 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) switch (l) { case kernel_selector::weights_layout::oi: - case kernel_selector::weights_layout::oiyx: return cldnn::format::bfyx; - case kernel_selector::weights_layout::oyxi: return cldnn::format::byxf; + case kernel_selector::weights_layout::oiyx: return cldnn::format::bfyx; + case kernel_selector::weights_layout::oyxi: return cldnn::format::byxf; case kernel_selector::weights_layout::io: - case kernel_selector::weights_layout::iyxo: return cldnn::format::fyxb; - case kernel_selector::weights_layout::yxio: return cldnn::format::yxfb; - case kernel_selector::weights_layout::os_iyx_osv16: return cldnn::format::os_iyx_osv16; - case kernel_selector::weights_layout::os_i_osv16: return cldnn::format::bs_x_bsv16; - case kernel_selector::weights_layout::os_i_osv8__ai8: return cldnn::format::bs_xs_xsv8_bsv8; - case kernel_selector::weights_layout::os_i_osv16__ai8: return cldnn::format::bs_xs_xsv8_bsv16; - case kernel_selector::weights_layout::image_2d_weights_c4_fyx_b: return cldnn::format::image_2d_weights_c4_fyx_b; - case kernel_selector::weights_layout::image_2d_weights_c1_b_fyx: return cldnn::format::image_2d_weights_c1_b_fyx; - case kernel_selector::weights_layout::winograd_2x3_s1_weights: return cldnn::format::winograd_2x3_s1_weights; - case kernel_selector::weights_layout::winograd_2x3_s1_fused_weights: return cldnn::format::winograd_2x3_s1_fused_weights; - case kernel_selector::weights_layout::winograd_6x3_s1_fused_weights: return cldnn::format::winograd_6x3_s1_fused_weights; - case kernel_selector::weights_layout::image_2d_weights_winograd_6x3_s1_fbxyb: return cldnn::format::image_2d_weights_winograd_6x3_s1_fbxyb; - case kernel_selector::weights_layout::image_2d_weights_winograd_6x3_s1_xfbyb: return cldnn::format::image_2d_weights_winograd_6x3_s1_xfbyb; - case kernel_selector::weights_layout::os_is_yx_isa8_osv8_isv4: return cldnn::format::os_is_yx_isa8_osv8_isv4; - case kernel_selector::weights_layout::is_o_yx_isv32: return cldnn::format::is_o_yx_isv32; + case kernel_selector::weights_layout::iyxo: return cldnn::format::fyxb; + case kernel_selector::weights_layout::yxio: return cldnn::format::yxfb; + case kernel_selector::weights_layout::os_iyx_osv16: return cldnn::format::os_iyx_osv16; + case kernel_selector::weights_layout::os_iyx_osv32: return cldnn::format::os_iyx_osv32; + case kernel_selector::weights_layout::os_iyx_osv64: return cldnn::format::os_iyx_osv64; + case kernel_selector::weights_layout::os_i_osv16: return cldnn::format::bs_x_bsv16; + case kernel_selector::weights_layout::os_i_osv8__ai8: return cldnn::format::bs_xs_xsv8_bsv8; + case kernel_selector::weights_layout::os_i_osv16__ai8: return cldnn::format::bs_xs_xsv8_bsv16; + case 
kernel_selector::weights_layout::image_2d_weights_c4_fyx_b: return cldnn::format::image_2d_weights_c4_fyx_b; + case kernel_selector::weights_layout::image_2d_weights_c1_b_fyx: return cldnn::format::image_2d_weights_c1_b_fyx; + case kernel_selector::weights_layout::winograd_2x3_s1_weights: return cldnn::format::winograd_2x3_s1_weights; + case kernel_selector::weights_layout::winograd_2x3_s1_fused_weights: return cldnn::format::winograd_2x3_s1_fused_weights; + case kernel_selector::weights_layout::winograd_6x3_s1_fused_weights: return cldnn::format::winograd_6x3_s1_fused_weights; + case kernel_selector::weights_layout::image_2d_weights_winograd_6x3_s1_fbxyb: return cldnn::format::image_2d_weights_winograd_6x3_s1_fbxyb; + case kernel_selector::weights_layout::image_2d_weights_winograd_6x3_s1_xfbyb: return cldnn::format::image_2d_weights_winograd_6x3_s1_xfbyb; + case kernel_selector::weights_layout::os_is_yx_isa8_osv8_isv4: return cldnn::format::os_is_yx_isa8_osv8_isv4; + case kernel_selector::weights_layout::os_is_yx_isa8_osv8_isv4_swizzled_by_4: return cldnn::format::os_is_yx_isa8_osv8_isv4_swizzled_by_4; + case kernel_selector::weights_layout::is_o_yx_isv32: return cldnn::format::is_o_yx_isv32; + case kernel_selector::weights_layout::is_o32_yx_isv32_swizzled_by_4: return cldnn::format::is_o32_yx_isv32_swizzled_by_4; + case kernel_selector::weights_layout::os_is_y_x8_osv8_isv4: return cldnn::format::os_is_y_x8_osv8_isv4; + case kernel_selector::weights_layout::bf_lyx_yx: return cldnn::format::bf_lyx_yx; default: return cldnn::format::bfyx; } @@ -213,6 +239,11 @@ kernel_selector::data_tensor convert_data_tensor(const layout& l, uint32_t split new_vals[3] = align_to(vals[3], 32); new_vals[2] = align_to(vals[2], 4); } + if (ks_layout == kernel_selector::Tensor::byx8_f4) + { + new_vals[3] = align_to(vals[3], 4); + new_vals[2] = align_to(vals[2], 8); + } for (size_t i = 0; i < vec.size(); i++) { @@ -245,9 +276,8 @@ kernel_selector::data_tensor convert_data_tensor(const layout& l, uint32_t split kernel_selector::weights_tensor convert_weights_tensor(const layout& l) { - assert(l.format.dimension() == 4); - const auto& t = l.size.sizes(format::bfyx); - const auto base_layout = kernel_selector::weights_layout::oiyx; + const auto& t = l.size.sizes(l.format); + const auto base_layout = to_weights_layout(l.format); const auto ks_type = to_weights_type(l.data_type); const auto ks_layout = to_weights_layout(l.format); std::vector vec(kernel_selector::WeightsTensor::ChannelsCount(base_layout)); @@ -307,10 +337,12 @@ kernel_selector::activation_function get_kernel_selector_activation_param(cldnn_ return kernel_selector::activation_function::COSH; case activation_log: return kernel_selector::activation_function::LOG; - case activation_log2: - return kernel_selector::activation_function::LOG2; + case activation_log2: + return kernel_selector::activation_function::LOG2; case activation_exp: return kernel_selector::activation_function::EXP; + case activation_not: + return kernel_selector::activation_function::NOT; default: throw std::runtime_error("Unknown activation function"); break; @@ -331,4 +363,54 @@ kernel_selector::activation_function get_kernel_selector_activation_grad_param(c throw std::runtime_error("Unknown activation_grad function"); break; } +} + +void set_params(const program_node& node, kernel_selector::params& params) +{ + const auto& context = node.get_program().get_engine().get_context(); + const auto& engine_info = context->get_engine_info(); + + params.engineInfo.bSubGroupSupport = 
context->extension_supported("cl_intel_subgroups"); + params.engineInfo.bSubGroupShortSupport = context->extension_supported("cl_intel_subgroups_short"); + params.engineInfo.bFP16Support = context->extension_supported("cl_khr_fp16"); + params.engineInfo.bFP64Support = context->extension_supported("cl_khr_fp64"); + params.engineInfo.bIMADSupport = engine_info.supports_imad != 0; + params.engineInfo.bIMMADSupport = engine_info.supports_immad != 0; + params.engineInfo.bImageSupport = engine_info.supports_image != 0; + params.engineInfo.maxWorkGroupSize = engine_info.max_work_group_size; + params.engineInfo.maxLocalMemSize = engine_info.max_local_mem_size; + params.engineInfo.maxImage2dWidth = engine_info.max_image2d_width; + params.engineInfo.maxImage2dHeight = engine_info.max_image2d_height; + params.engineInfo.deviceId = engine_info.dev_id; + params.engineInfo.computeUnitsCount = engine_info.compute_units_count; + params.engineInfo.deviceCache = engine_info.device_cache; + params.engineInfo.driverVersion = engine_info.driver_version; + params.engineInfo.hostVersion = to_host_version(cldnn::get_version()); +} + +void set_learning_params(const program_node& node, kernel_selector::training_params& params, bool use_momentum) +{ + const auto learning_params = node.get_program().get_options().template get()->params; + + if (use_momentum) + { + params.use_momentum = true; + } + + params.momentum_factor = learning_params.momentum; + params.weights_decay = learning_params.weights_decay; +} + +void set_optional_params(const program_impl& program, kernel_selector::optional_params& params) +{ + const auto& context = program.get_engine().get_context(); + + params.meaningfulKernelsNames = context->get_configuration().meaningful_kernels_names; + params.allowStaticInputReordering = program.get_options().get()->enabled(); + params.allowInputReordering = false; + params.allowOutputReordering = false; + + const auto& tuning_config = program.get_options().get(); + params.tuningParams.mode = to_tuning_mode(tuning_config->config.mode); + params.tuningParams.cacheFilePath = tuning_config->config.cache_file_path; } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp index e2723fd..8adf3d5 100644 --- a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp +++ b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp @@ -201,6 +201,12 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout, data_ expected_tensor = current_layout.size; expected_format = cldnn::format::byxf; } + // IMAD case + else if (current_layout.format == format::b_fs_yx_fsv4 || + current_layout.format == format::os_is_yx_osv16_isv4) + { + // Nothing to do here; leave the layout as-is.
+ } // MMAD case else if (current_layout.data_type == data_types::i8) { @@ -211,7 +217,8 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout, data_ || (_output_size_handling_enabled && prim->with_output_size) || node.get_transposed()) { - if (current_layout.data_type == data_types::f32 && + // commented out due to performance reasons, may be enabled again in the future + /*if (current_layout.data_type == data_types::f32 && current_layout.size.batch[0] % 16 == 0 && current_layout.format == format::bfyx && output_or_weights_layout.size.spatial[0] == 1 && output_or_weights_layout.size.spatial[1] == 1 && @@ -226,7 +233,7 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout, data_ expected_format = cldnn::format::bf8_xy16; } } - else + else*/ { expected_tensor = current_layout.size; expected_format = cldnn::format::bfyx; diff --git a/inference-engine/thirdparty/clDNN/src/lookup_table.cpp b/inference-engine/thirdparty/clDNN/src/lookup_table.cpp index 432bc44..22cd517 100644 --- a/inference-engine/thirdparty/clDNN/src/lookup_table.cpp +++ b/inference-engine/thirdparty/clDNN/src/lookup_table.cpp @@ -31,6 +31,9 @@ namespace cldnn layout lookup_table_inst::calc_output_layout(lookup_table_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for " + "lookup_table_node!"); auto desc = node.get_primitive(); auto input_data_layout = node.input().get_output_layout(); diff --git a/inference-engine/thirdparty/clDNN/src/lrn.cpp b/inference-engine/thirdparty/clDNN/src/lrn.cpp index b25b6cb..1fe2b26 100644 --- a/inference-engine/thirdparty/clDNN/src/lrn.cpp +++ b/inference-engine/thirdparty/clDNN/src/lrn.cpp @@ -29,6 +29,8 @@ primitive_type_id lrn_type_id() layout lrn_inst::calc_output_layout(lrn_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for lrn_node!"); return node.input().get_non_padded_output_layout(); } diff --git a/inference-engine/thirdparty/clDNN/src/lstm.cpp b/inference-engine/thirdparty/clDNN/src/lstm.cpp index 7c80782..fae374a 100644 --- a/inference-engine/thirdparty/clDNN/src/lstm.cpp +++ b/inference-engine/thirdparty/clDNN/src/lstm.cpp @@ -31,18 +31,21 @@ primitive_type_id lstm_type_id() layout lstm_inst::calc_output_layout(lstm_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for lstm_node!"); auto input_layout = node.input().get_output_layout(); auto hidden_layout = node.inital_hidden().get_output_layout(); - // input = [ 1, sequence, batch, input_size ] - // weights = [ 1, direction, 4 * hidden_size, input_size ] - // recurrent = [ 1, direction, 4 * hidden_size, hidden_size ] - // biases = [ 1, 1, direction, 4 * hidden_size ] - // hidden = [ 1, direction, batch, hidden_size ] - // cell = [ 1, direction, batch, hidden_size ] - // output = [ sequence, direction, batch, hidden_size ] + // input = [ batch, sequence, direction, input_size ] + // weights = [ 1, direction, 4 * hidden_size, input_size ] + // recurrent = [ 1, direction, 4 * hidden_size, hidden_size ] + // biases = [ 1, 1, direction, 4 * hidden_size ] + // hidden = [ batch, 1, direction, hidden_size ] + // cell = [ batch, 1, direction, hidden_size ] + // output = [ batch, sequence, direction, hidden_size ] auto result = layout(input_layout.data_type, format::bfyx, - tensor(hidden_layout.size.feature[0], input_layout.size.feature[0],
hidden_layout.size.spatial[0], hidden_layout.size.spatial[1])); + tensor(hidden_layout.size.feature[0], input_layout.size.feature[0], + hidden_layout.size.spatial[0], hidden_layout.size.spatial[1])); return result; } @@ -75,10 +78,8 @@ std::string lstm_inst::to_string(lstm_node const& node) lstm_inst::typed_primitive_inst(network_impl& network, lstm_node const& node) :parent(network, node) { - // [ARIEL] TODO: That do we need to check here?? - auto input_size = node.input().get_output_layout(); - // auto output_size = output_memory().get_layout(); - CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "input format", input_size.format.value, "expected format", format::bfyx); - //CLDNN_ERROR_NOT_EQUAL(node.id(), "Input size", input_size.size.raw.size(), "output size", output_size.size.raw.size(), ""); + auto input_layout = node.input().get_output_layout(); + CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "input format", input_layout.format.value, "expected format", format::bfyx); } + } diff --git a/inference-engine/thirdparty/clDNN/src/lstm_elt.cpp b/inference-engine/thirdparty/clDNN/src/lstm_elt.cpp index 718939d..d809f86 100644 --- a/inference-engine/thirdparty/clDNN/src/lstm_elt.cpp +++ b/inference-engine/thirdparty/clDNN/src/lstm_elt.cpp @@ -30,6 +30,8 @@ primitive_type_id lstm_elt_type_id() layout lstm_elt_inst::calc_output_layout(lstm_elt_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for lstm_elt_node!"); auto desc = node.get_primitive(); auto input_layout = node.input().get_output_layout(); @@ -38,7 +40,7 @@ layout lstm_elt_inst::calc_output_layout(lstm_elt_node const& node) // output{bfyx} = [b: batch, f: 2, x: direction, y: hidden_size ] output // The output of the lstm_elt node is the concatenation of the intermediate [hidden, cell] tensors. 
// A crop/split node is needed to extract each individual tensor - auto result = layout(input_layout.data_type, format::bfyx, + auto result = layout(input_layout.data_type, input_layout.format, tensor(input_layout.size.batch[0], 2, input_layout.size.spatial[0] / 4, input_layout.size.feature[0])); return result; } @@ -63,6 +65,6 @@ lstm_elt_inst::typed_primitive_inst(network_impl& network, lstm_elt_node const& :parent(network, node) { auto input_size = node.input().get_output_layout(); - CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "input format", input_size.format.value, "expected format", format::bfyx); + CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "input format", input_size.format.value, "expected format", format::bfyx, format::fyxb); } } diff --git a/inference-engine/thirdparty/clDNN/src/lstm_gemm.cpp b/inference-engine/thirdparty/clDNN/src/lstm_gemm.cpp index 31d36fa..e39a271 100644 --- a/inference-engine/thirdparty/clDNN/src/lstm_gemm.cpp +++ b/inference-engine/thirdparty/clDNN/src/lstm_gemm.cpp @@ -31,6 +31,8 @@ primitive_type_id lstm_gemm_type_id() layout lstm_gemm_inst::calc_output_layout(lstm_gemm_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for lstm_gemm_node!"); auto desc = node.get_primitive(); auto input_layout = node.input().get_output_layout(); auto weights_layout = node.weights().get_output_layout(); @@ -41,8 +43,7 @@ layout lstm_gemm_inst::calc_output_layout(lstm_gemm_node const& node) // biases{bfyx} = [b: 1, f:1 , x: direction, y: 4 * hidden_size ] // hidden{bfyx} = [b: batch, f: direction, x: 1 , y: hidden_size ] optional // tempGEMM{bfyx} = [b: batch, f: direction, x: 4*hidden_size, y: 1] output - - auto result = layout(input_layout.data_type, format::bfyx, tensor(input_layout.size.batch[0], weights_layout.size.feature[0], weights_layout.size.spatial[1], 1)); + auto result = layout(input_layout.data_type, input_layout.format, tensor(input_layout.size.batch[0], weights_layout.size.feature[0], weights_layout.size.spatial[1], 1)); return result; } @@ -71,7 +72,7 @@ std::string lstm_gemm_inst::to_string(lstm_gemm_node const& node) lstm_gemm_inst::typed_primitive_inst(network_impl& network, lstm_gemm_node const& node) :parent(network, node) { - auto input_size = node.input().get_output_layout(); - CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "input format", input_size.format.value, "expected format", format::bfyx); + auto input_layout = node.input().get_output_layout(); + CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "input format", input_layout.format.value, "expected format", format::bfyx, format::fyxb); } } diff --git a/inference-engine/thirdparty/clDNN/src/max_unpooling.cpp b/inference-engine/thirdparty/clDNN/src/max_unpooling.cpp index da67f02..5e2d99b 100644 --- a/inference-engine/thirdparty/clDNN/src/max_unpooling.cpp +++ b/inference-engine/thirdparty/clDNN/src/max_unpooling.cpp @@ -28,8 +28,16 @@ primitive_type_id max_unpooling_type_id() return &instance; } +max_unpooling_node::typed_program_node(const std::shared_ptr prim, program_impl& prog) + : parent(prim, prog) +{ + can_share_buffer(false); // for max_unpooling initial zero values are significant +} + layout max_unpooling_inst::calc_output_layout(max_unpooling_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for max_unpooling_node!"); auto desc = node.get_primitive(); auto input_layout = node.input().get_output_layout(); diff --git
a/inference-engine/thirdparty/clDNN/src/memory_pool.cpp b/inference-engine/thirdparty/clDNN/src/memory_pool.cpp index d339492..2a36ee1 100644 --- a/inference-engine/thirdparty/clDNN/src/memory_pool.cpp +++ b/inference-engine/thirdparty/clDNN/src/memory_pool.cpp @@ -24,6 +24,8 @@ #include "memory_impl.h" #include "program_impl.h" +#include "program_node.h" + #include "gpu/memory_gpu.h" namespace cldnn { @@ -69,6 +71,8 @@ namespace cldnn } } } + memory_pool::~memory_pool() + { } bool memory_pool::has_conflict(const memory_set& a, const std::set& b, uint32_t b_network_id) { diff --git a/inference-engine/thirdparty/clDNN/src/mutable_data.cpp b/inference-engine/thirdparty/clDNN/src/mutable_data.cpp index d2deb02..9ad7fef 100644 --- a/inference-engine/thirdparty/clDNN/src/mutable_data.cpp +++ b/inference-engine/thirdparty/clDNN/src/mutable_data.cpp @@ -49,6 +49,7 @@ mutable_data_node::typed_program_node(const std::shared_ptr dprim, : parent(dprim, prog), mem(api_cast(dprim->mem.get())) { recalc_output_layout(false); + can_share_buffer(false); fill_memory(); } diff --git a/inference-engine/thirdparty/clDNN/src/mvn.cpp b/inference-engine/thirdparty/clDNN/src/mvn.cpp index 2674376..d0460a9 100644 --- a/inference-engine/thirdparty/clDNN/src/mvn.cpp +++ b/inference-engine/thirdparty/clDNN/src/mvn.cpp @@ -28,6 +28,8 @@ primitive_type_id mvn_type_id() layout mvn_inst::calc_output_layout(mvn_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for mvn_node!"); return node.input().get_non_padded_output_layout(); } diff --git a/inference-engine/thirdparty/clDNN/src/network.cpp b/inference-engine/thirdparty/clDNN/src/network.cpp index cca47a6..07ade9a 100644 --- a/inference-engine/thirdparty/clDNN/src/network.cpp +++ b/inference-engine/thirdparty/clDNN/src/network.cpp @@ -26,10 +26,16 @@ #include "error_handler.h" #include "primitive_inst.h" #include "input_layout_inst.h" +#include "condition_inst.h" #include "kernel_selector_helper.h" #include +#include "gpu/ocl_toolkit.h" + + //#define DEBUG_DUMP_PATH "/tmp/dump/" + + #ifdef DEBUG_DUMP_PATH #include #include @@ -41,7 +47,6 @@ namespace cldnn { - #ifdef DEBUG_DUMP_PATH static float convert_half_to_float(half_t val, bool flush_denorm_to_zero = false) { @@ -142,6 +147,7 @@ static float convert_half_to_float(half_t val, bool flush_denorm_to_zero = false std::replace(filename.begin(), filename.end(), '\\', '_'); std::replace(filename.begin(), filename.end(), '/', '_'); std::replace(filename.begin(), filename.end(), ' ', '_'); + std::replace(filename.begin(), filename.end(), ':', '_'); filename = DEBUG_DUMP_PATH + filename + ".txt"; std::ofstream file_stream(filename); @@ -151,9 +157,8 @@ static float convert_half_to_float(half_t val, bool flush_denorm_to_zero = false dump(mem, file_stream); } #endif - /* -Network_impl will always have net_id = 0 when it will be cldnn internal micronetwork (created i.e by const. propagator). +Network_impl will always have net_id = 0 when it is a cldnn internal micronetwork (created e.g. by the propagate_constants opt pass).
*/ network_impl::network_impl(const program_impl& program, bool is_internal) : _program(&program) @@ -166,8 +171,10 @@ network_impl::network_impl(const program_impl& program, bool is_internal) } allocate_primitives(); + check_names(); build_insts_deps(); - + build_exec_order(); + validate_primitives(); _program->dump_memory_pool(); } @@ -176,6 +183,20 @@ network_impl::network_impl(engine_impl& engine, const topology_impl& topo, const { } +network_impl::network_impl(engine_impl& engine, const std::set>& nodes, const build_options& options, bool is_internal) + : network_impl(*engine.build_program(nodes, options, is_internal), is_internal) +{ +} + +void network_impl::validate_primitives() +{ + for (auto const& prim : _exec_order) + { + bool valid = prim->validate(); + CLDNN_ERROR_NOT_EQUAL(prim->id(), "validate", valid, "", true, "does not have a valid instance."); + } +} + void network_impl::reset_execution(bool wait) { if (wait && _events.size() > 0) @@ -198,13 +219,12 @@ void network_impl::reset_execution(bool wait) void network_impl::set_input_data(const primitive_id& id, memory_impl& data) { std::shared_ptr primitive_inst; - try { - primitive_inst = _primitives.at(id); - } - catch (...) - { + + primitive_inst = find_primitive(id); + + if (primitive_inst == nullptr) throw std::runtime_error("topology doesn't contain primitive: " + id); - } + if (primitive_inst->type() != input_layout::type_id()) { CLDNN_ERROR_MESSAGE(id, "primitive " + id + " is not an input"); @@ -217,6 +237,46 @@ void network_impl::set_input_data(const primitive_id& id, memory_impl& data) input->set_data(data); } +void cldnn::network_impl::check_names() +{ + for (auto const& prim : _primitives) + { + if (find_in_internal_networks(prim.first) != nullptr) + CLDNN_ERROR_MESSAGE("Network_impl", "Found primitive with id: " + prim.first + + " in another network."); + } +} + +std::shared_ptr cldnn::network_impl::find_primitive(const primitive_id& id) +{ + std::shared_ptr ret; + + if (_primitives.find(id) != _primitives.end()) + return _primitives.at(id); + + return find_in_internal_networks(id); +} + +std::shared_ptr cldnn::network_impl::find_in_internal_networks(const primitive_id& id) +{ + std::shared_ptr ret; + + for (auto const& prim : _primitives) + { + if (prim.second->type() == condition::type_id()) //currently only condition inst contains mini networks + { + auto cond_inst = std::static_pointer_cast(prim.second); + ret = cond_inst->get_net_true()->find_primitive(id); + if (ret != nullptr) + return ret; + ret = cond_inst->get_net_false()->find_primitive(id); + if (ret != nullptr) + return ret; + } + } + return nullptr; +} + void network_impl::set_learning_rate(const float lr) { _learning_rate = lr; @@ -228,16 +288,18 @@ float network_impl::get_learning_rate() } std::string network_impl::get_primitive_info(const primitive_id& id) const -{ + const auto& node = _program->get_node(id); return node.type()->to_string(node); } void network_impl::allocate_primitives() { - auto nodes = _program->get_nodes(); std::vector> nodes_to_allocate{}; - nodes_to_allocate.insert(nodes_to_allocate.begin(), nodes.begin(), nodes.end()); + for (auto node : _program->get_processing_order()) + { + nodes_to_allocate.push_back(_program->get_node_ptr(node->id())); + } std::sort(nodes_to_allocate.begin(), nodes_to_allocate.end(), [](std::shared_ptr const& lhs, std::shared_ptr const& rhs) { @@ -250,7 +312,6 @@ void network_impl::allocate_primitives() } } - void network_impl::build_insts_deps() { for (auto& inst : _primitives) @@ -259,18 +320,32 @@ void
network_impl::build_insts_deps() } } +void network_impl::build_exec_order() +{ + for (auto& node : _program->get_processing_order()) + { + if (!node->is_type() && + !(node->is_type() && node->get_dependencies().empty())) + { + add_to_exec_order(node->id()); + } + } +} +void network_impl::add_to_exec_order(const primitive_id& id) +{ + auto inst = get_primitive(id); + _exec_order.push_back(inst); +} + void network_impl::execute(const std::vector>& events) { //Wait for previous execution completion reset_execution(false); - for (auto& inst : _program->get_processing_order()) + for (auto& inst : _exec_order) { - if (!inst->is_type() && - !(inst->is_type() && inst->get_dependencies().empty())) - { #ifdef DEBUG_DUMP_PATH - auto& node = _program->get_node(inst->id()); + auto& node = _program->get_node(inst->id()); std::string layer_name = node.id(); #if DUMP_VERBOSE @@ -287,10 +362,9 @@ void network_impl::execute(const std::vector>& ev } } #endif - execute_primitive(get_primitive(inst->id()), events); - _exec_order.push_back(get_primitive(inst->id())); + execute_primitive(inst, events); #ifdef DEBUG_DUMP_PATH - #if DUMP_SINGLE_LAYER +#if DUMP_SINGLE_LAYER if (layer_name == DUMP_LAYER_NAME) #endif { @@ -298,7 +372,6 @@ void network_impl::execute(const std::vector>& ev } get_engine().flush_network(); #endif - } } for (auto& inst : _program->get_processing_order()) @@ -307,10 +380,10 @@ void network_impl::execute(const std::vector>& ev //the mutable_data can be updated when it is both a user and a dependency. if (inst->is_type()) { - decltype(inst->get_processing_num()) proc_num = 0; + decltype(_program->get_processing_order().get_processing_number(inst)) proc_num = 0; for (auto& user : inst->get_users()) { - auto user_proc_num = user->get_processing_num(); + auto user_proc_num = _program->get_processing_order().get_processing_number(user); if (user_proc_num > proc_num) { _events[inst->id()] = _events[user->id()]; @@ -322,7 +395,7 @@ void network_impl::execute(const std::vector>& ev { for (auto& dep : inst->get_dependencies()) { - auto dep_proc_num = dep->get_processing_num(); + auto dep_proc_num = _program->get_processing_order().get_processing_number(dep); if (dep_proc_num > proc_num) { _events[inst->id()] = _events[dep->id()]; @@ -343,8 +416,10 @@ void network_impl::execute(const std::vector>& ev prim.second->reset_output_change(); } - // Using output of previouse network as input to another one may cause hazard (in OOOQ mode) if user would not - // provide proper event to execution. Flushing pipeline should prevent this kind of issues. + get_engine().get_context()->reset_events(); + + // Using the output of a previous network as input to another one may cause a hazard (in OOOQ mode) if the user does not + // provide a proper event for execution. Flushing the pipeline should prevent this kind of issue. // In scenarios with a big number of very small networks it can cause a performance drop.
get_engine().flush_network(); } @@ -363,7 +438,9 @@ std::vector network_impl::get_executed_primitive_ids() const std::vector ret; ret.reserve(_exec_order.size()); for (auto const& executed_primitive : _exec_order) + { ret.push_back(executed_primitive->id()); + } return ret; } @@ -410,7 +487,7 @@ std::vector> network_impl::get_primitives(const return result; } -refcounted_obj_ptr network_impl::execute_primitive(const std::shared_ptr& primitive, const std::vector>& events) +void network_impl::execute_primitive(const std::shared_ptr& primitive, const std::vector>& events) { auto id = primitive->id(); auto it = _events.find(id); @@ -422,9 +499,7 @@ refcounted_obj_ptr network_impl::execute_primitive(const std::shared ev = primitive->execute(events); else ev = get_engine().create_user_event(true); - _events.insert({ id, ev }); - return ev; } void network_impl::allocate_primitive_instance(program_node const& node) @@ -443,5 +518,4 @@ void network_impl::allocate_primitive_instance(program_node const& node) _data_outputs.push_back(inst); } } - } diff --git a/inference-engine/thirdparty/clDNN/src/nodes_ordering.cpp b/inference-engine/thirdparty/clDNN/src/nodes_ordering.cpp new file mode 100644 index 0000000..ae19ac5 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/nodes_ordering.cpp @@ -0,0 +1,119 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "program_impl.h" +#include "program_node.h" +#include "error_handler.h" + +namespace cldnn +{ + // helper method for calc_processing order + void program_impl::nodes_ordering::calc_processing_order_visit(program_node* node) + { + if (node->is_marked()) + return; + for (auto user : node->users) + { + calc_processing_order_visit(user); + } + node->mark(); + _processing_order.push_front(node); + processing_order_iterators[node] = _processing_order.begin(); + return; + } + + //DFS to sort nodes topologically + //any topological sort of nodes is required for further optimizations + void program_impl::nodes_ordering::calc_processing_order(program_impl& p) + { + _processing_order.clear(); + for (auto input : p.get_inputs()) + { + calc_processing_order_visit(input); + } + for (auto& node : _processing_order) + { + node->unmark(); + } + return; + } + + /* + recalculate processing_order + algorithm based on: CLRS 24.5 (critical path in DAG) + modifications: adjust for multiple inputs + input: any topological order in processing order + output: BFS topological order. 
+ */ + void program_impl::nodes_ordering::calculate_BFS_processing_order() + { + std::map distances; + for (auto itr : _processing_order) + { + distances[itr] = -1; + } + int max_distance = 0; + for (auto itr : _processing_order) + { + //Init + if (distances[itr] == -1) { // this must be an input + distances[itr] = 0; // initialize input + } + // RELAX + for (auto& user : itr->get_users()) + { + distances[user] = std::max(distances[user], distances[itr] + 1); + max_distance = std::max(max_distance, distances[user]); + } + } + + //bucket sort nodes based on their max distance from input + std::vector> dist_lists; + dist_lists.resize(max_distance + 1); + for (auto itr : _processing_order) + { + dist_lists[distances[itr]].push_back(itr); + } + + //replace the old processing order by the new one, still topological. + _processing_order.clear(); + for (auto& dist : dist_lists) + { + for (auto& node : dist) + { + _processing_order.push_back(node); + processing_order_iterators[node] = _processing_order.end(); + processing_order_iterators[node]--; + } + } + return; + } + + //verifies if a given node will be processed before all its dependent nodes + bool program_impl::nodes_ordering::is_correct(program_node* node) + { + for (auto& dep : node->get_dependencies()) + { + if (get_processing_number(node) < get_processing_number(dep)) + { + return false; + } + } + return true; + } +} diff --git a/inference-engine/thirdparty/clDNN/src/normalize.cpp b/inference-engine/thirdparty/clDNN/src/normalize.cpp index d9ec578..e364575 100644 --- a/inference-engine/thirdparty/clDNN/src/normalize.cpp +++ b/inference-engine/thirdparty/clDNN/src/normalize.cpp @@ -29,6 +29,8 @@ primitive_type_id normalize_type_id() layout normalize_inst::calc_output_layout(normalize_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for normalize_node!"); return node.input().get_non_padded_output_layout(); } diff --git a/inference-engine/thirdparty/clDNN/src/one_hot.cpp b/inference-engine/thirdparty/clDNN/src/one_hot.cpp new file mode 100644 index 0000000..a7c1539 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/one_hot.cpp @@ -0,0 +1,97 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
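// --- Editor's illustration (not part of the patch) ---
// calculate_BFS_processing_order in nodes_ordering.cpp above relaxes each node's
// longest distance from the inputs (CLRS 24.5, critical path in a DAG) and then
// bucket-sorts nodes by that distance. A minimal standalone sketch of the same
// scheme, assuming a DAG stored as integer adjacency lists already in topological
// order; clDNN itself operates on program_node pointers instead.
#include <algorithm>
#include <iostream>
#include <vector>

int main()
{
    // users[i] lists the nodes that consume node i's output.
    std::vector<std::vector<int>> users = { { 2 }, { 2, 3 }, { 3 }, {} };
    std::vector<int> distance(users.size(), -1);
    int max_distance = 0;
    for (size_t n = 0; n < users.size(); ++n)
    {
        if (distance[n] == -1)
            distance[n] = 0; // never relaxed -> no predecessors -> an input
        for (int user : users[n]) // RELAX: keep the longest path from any input
        {
            distance[user] = std::max(distance[user], distance[n] + 1);
            max_distance = std::max(max_distance, distance[user]);
        }
    }
    // Bucket-sort by distance; concatenating the buckets yields a BFS-like
    // topological order, since every edge increases the distance by at least 1.
    std::vector<std::vector<int>> buckets(max_distance + 1);
    for (size_t n = 0; n < users.size(); ++n)
        buckets[distance[n]].push_back(static_cast<int>(n));
    for (int d = 0; d <= max_distance; ++d)
        for (int n : buckets[d])
            std::cout << "node " << n << " at distance " << d << "\n";
    return 0;
}
// --- End editor's illustration ---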
+ + +#include "one_hot_inst.h" + +#include "error_handler.h" +#include "json_object.h" +#include "primitive_type_base.h" + + +namespace cldnn +{ + primitive_type_id one_hot_type_id() + { + static primitive_type_base instance; + return &instance; + } + + layout one_hot_inst::calc_output_layout(one_hot_node const& node) + { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for one_hot_node!"); + auto input_layout = node.input().get_output_layout(); + auto desc = node.get_primitive(); + + if (desc->one_hot_axis > 3) + { + CLDNN_ERROR_MESSAGE(node.id(), "Incorrect parameters configuration: one_hot_axis should be less or equal to 3."); + } + + return{ input_layout.data_type, input_layout.format, desc->shape }; + } + + std::string one_hot_inst::to_string(one_hot_node const& node) + { + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + const auto& shape = desc->shape; + const auto& one_hot_axis = desc->one_hot_axis; + auto& input = node.input(); + + std::stringstream primitive_description; + + json_composite one_hot_info; + one_hot_info.add("input id", input.id()); + one_hot_info.add("output shape", shape.to_string()); + one_hot_info.add("one-hot axis", one_hot_axis); + + node_info->add("one_hot info", one_hot_info); + node_info->dump(primitive_description); + + return primitive_description.str(); + } + + one_hot_inst::typed_primitive_inst(network_impl& network, one_hot_node const& node) + : parent(network, node) + { + auto input_layout = node.input().get_output_layout(); + + const auto& input_sizes = input_layout.size; + const auto& output_sizes = argument.shape; + + std::vector input_dims = { input_sizes.batch[0], input_sizes.feature[0], + input_sizes.spatial[1], input_sizes.spatial[0] }; + std::vector output_dims = { output_sizes.batch[0], output_sizes.feature[0], + output_sizes.spatial[1], output_sizes.spatial[0] }; + + const auto& one_hot_axis = node.get_primitive()->one_hot_axis; + if (input_dims[0] != 1) + { + CLDNN_ERROR_MESSAGE(node.id(), "Incorrect parameters configuration: input batch size should be equal to 1."); + } + + //bfyx format + for (int i = 3, j = 3; i > 0; --i, --j) + { + if (j == one_hot_axis) + --j; + if (input_dims[i] != output_dims[j]) + { + CLDNN_ERROR_MESSAGE(node.id(), "Incorrect parameters configuration: shape does not fit input size."); + } + } + } +} diff --git a/inference-engine/thirdparty/clDNN/src/permute.cpp b/inference-engine/thirdparty/clDNN/src/permute.cpp index af32597..38e684e 100644 --- a/inference-engine/thirdparty/clDNN/src/permute.cpp +++ b/inference-engine/thirdparty/clDNN/src/permute.cpp @@ -31,42 +31,18 @@ primitive_type_id permute_type_id() return &instance; } -static std::vector get_permute_order(permute_node const& node, format::type fmt) -{ - - CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "node format", fmt, "byxf, yxfb, bfyx, fyxb", format::byxf, format::yxfb, format::bfyx, format::fyxb); - switch (fmt) - { - // For input formats: - // 0 - batch (b), 1 - feature (f), 2, 3 - spatial (x -> 2, y -> 3) - case format::byxf: - return{ 0, 3, 2, 1 }; - - case format::yxfb: - return{ 3, 2, 1, 0 }; - - case format::bfyx: - return{ 0, 1, 3, 2 }; - - case format::fyxb: - return{ 1, 3, 2, 0 }; - - default: - throw std::invalid_argument("This format is not supported in GPU permute_inst"); - } -} layout permute_inst::calc_output_layout(permute_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for 
permute_node!"); auto input_layout = node.input().get_output_layout(); auto permute_order = node.get_primitive()->permute_order; - auto input_sizes_ordered = input_layout.size.sizes(input_layout.format); - - const auto& fmt_2_bfxy = get_permute_order(node, input_layout.format); std::vector output_sizes; - for (auto i : fmt_2_bfxy) + + for (size_t x = 0; x < permute_order.size(); x++) { - output_sizes.push_back(input_sizes_ordered[permute_order[i]]); + output_sizes.push_back(input_layout.size.raw[permute_order[x]]); } auto input_size = tensor(output_sizes); diff --git a/inference-engine/thirdparty/clDNN/src/pooling.cpp b/inference-engine/thirdparty/clDNN/src/pooling.cpp index 6006d14..18ecebc 100644 --- a/inference-engine/thirdparty/clDNN/src/pooling.cpp +++ b/inference-engine/thirdparty/clDNN/src/pooling.cpp @@ -30,6 +30,8 @@ primitive_type_id pooling_type_id() layout pooling_inst::calc_output_layout(parent::typed_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for pooling_node!"); auto desc = node.get_primitive(); auto input_layout = node.input().get_output_layout(); @@ -50,6 +52,11 @@ layout pooling_inst::calc_output_layout(parent::typed_node const& node) CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "Input_layout.format", input_layout.format.value, "argmax_layout.format", argmax_layout.format); } + if (desc->global_pooling) { + window_size.spatial[0] = input_layout.size.spatial[0]; + window_size.spatial[1] = input_layout.size.spatial[1]; + } + // TODO: Consider moving general parameter verification to arguments constructor. CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(), "stride spatial X", stride.spatial[0], "", 0, "Stride spatial X must be positive (>= 1)"); CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(), "stride spatial Y", stride.spatial[1], "", 0, "Stride spatial Y must be positive (>= 1)"); diff --git a/inference-engine/thirdparty/clDNN/src/primitive_inst.cpp b/inference-engine/thirdparty/clDNN/src/primitive_inst.cpp index 32c7861..30ff836 100644 --- a/inference-engine/thirdparty/clDNN/src/primitive_inst.cpp +++ b/inference-engine/thirdparty/clDNN/src/primitive_inst.cpp @@ -22,6 +22,7 @@ #include "input_layout_inst.h" #include "max_unpooling_inst.h" #include "apply_adam_inst.h" +#include "fused_conv_eltwise_inst.h" #include "network_impl.h" #include "engine_impl.h" @@ -40,11 +41,12 @@ uint32_t primitive_inst::get_network_id() const event_impl::ptr primitive_inst::execute(const std::vector& events) { - CLDNN_ERROR_BOOL(id(), "Invalid/unset input", !_has_valid_input, "Cannot execute primitive " + id() + " with invalid/unset input"); + const auto primitive_id = id(); + CLDNN_ERROR_BOOL(primitive_id, "Invalid/unset input", !_has_valid_input, "Cannot execute primitive " + primitive_id + " with invalid/unset input"); on_execute(); if (_exec_deps.size() == 0) - return _impl->execute(events, *this); + return _impl->execute(events, *this); std::vector dependencies; dependencies.reserve(_exec_deps.size()); @@ -53,15 +55,15 @@ event_impl::ptr primitive_inst::execute(const std::vector& even auto id = input->id(); try { // if the requested event deos not exits it means that it has not been executed, so the processing_order is wrong or synchronization failed. 
- auto ev = get_network().get_primitive_event(id); + auto ev = get_network().get_primitive_event(id); dependencies.emplace_back(ev); - } + } catch (const std::out_of_range& oor) { - std::string temp = std::string("internal CLDNN error: execution order corrupted.") + std::string("\n") + std::string(oor.what() + std::string("\n")); + std::string temp = std::string("internal CLDNN error: execution order corrupted.") + std::string("\n") + std::string(oor.what() + std::string("\n")); CLDNN_ERROR_MESSAGE(id, temp); } } - return _impl->execute(dependencies, *this); + return _impl->execute(dependencies, *this); } void primitive_inst::build_deps() @@ -95,6 +97,16 @@ primitive_inst::primitive_inst(network_impl& network, program_node const& node, //For certain primitives, it is known which dependency is used for synchronization only else if (user->is_type() && (user->as().has_additional_dep()) && (user->as().additional_dep().id() == node.id())) user_count--; + else if (user->is_type()) + { + if ((*user->as().get_users().begin())->is_type()) + { + if (user->as().get_dependency(1).id() == node.id()) + { + user_count--; + } + } + } } if (user_count == 1 && mutable_data_count == 1) @@ -119,15 +131,9 @@ memory_impl::ptr primitive_inst::allocate_output() return get_network().get_engine().allocate_memory(layout, _node.id(), get_network_id(), _node.get_memory_dependencies(), false); } else if (_network.is_internal() || - _node.is_type() || - _node.is_type() || - _node.is_type() || - //for max_unpooling initial zero values are significant - _node.is_type() || - //apply adam's output initial val should be either 0 or use same buffer as mutable_data after it (no allocation needed) - _node.is_type() || - _node.can_be_optimized() || - _node.is_output()) + (!_node.can_share_buffer()) || + _node.can_be_optimized() || + _node.is_output()) { return get_network().get_engine().allocate_memory(layout); } diff --git a/inference-engine/thirdparty/clDNN/src/prior_box.cpp b/inference-engine/thirdparty/clDNN/src/prior_box.cpp index 6f3678b..d4a53fb 100644 --- a/inference-engine/thirdparty/clDNN/src/prior_box.cpp +++ b/inference-engine/thirdparty/clDNN/src/prior_box.cpp @@ -194,6 +194,8 @@ void prior_box_node::calc_result() layout prior_box_inst::calc_output_layout(prior_box_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for prior_box_node!"); auto desc = node.get_primitive(); auto input_layout = node.input().get_output_layout(); assert(input_layout.size.spatial.size() == 2); diff --git a/inference-engine/thirdparty/clDNN/src/program.cpp b/inference-engine/thirdparty/clDNN/src/program.cpp index 005e883..731da08 100644 --- a/inference-engine/thirdparty/clDNN/src/program.cpp +++ b/inference-engine/thirdparty/clDNN/src/program.cpp @@ -16,55 +16,38 @@ /////////////////////////////////////////////////////////////////////////////////////////////////// -#include "program_impl.h" -#include "primitive_inst.h" -#include "layout_optimizer.h" -#include "constants_propagator.h" - -#include "primitive_type.h" -#include "api/CPP/activation.hpp" -#include "api/CPP/eltwise.hpp" -#include "api/CPP/input_layout.hpp" -#include "api/CPP/pooling.hpp" -#include "api/CPP/proposal.hpp" -#include "api/CPP/roi_pooling.hpp" -#include "api/CPP/reorg_yolo.hpp" - -#include "activation_inst.h" -#include "batch_norm_inst.h" +#include "error_handler.h" +#include "kernel_selector_helper.h" #include "internal_primitive.h" #include "internal_primitive_type_base.h" +#include 
"layout_optimizer.h" +#include "pass_manager.h" +#include "primitive_type.h" +#include "program_dump_graph.h" +#include "program_helpers.h" +#include "program_impl.h" +#include "sliding_window_utils.h" + #include "convolution_inst.h" #include "concatenation_inst.h" #include "crop_inst.h" #include "data_inst.h" -#include "mutable_data_inst.h" #include "deconvolution_inst.h" #include "detection_output_inst.h" -#include "lrn_inst.h" -#include "normalize_inst.h" -#include "permute_inst.h" +#include "input_layout_inst.h" +#include "lstm_inst.h" +#include "lstm_elt_inst.h" +#include "lstm_gemm_inst.h" +#include "mutable_data_inst.h" +#include "pooling_inst.h" +#include "primitive_inst.h" #include "prior_box_inst.h" +#include "proposal_inst.h" #include "reorder_inst.h" #include "reshape_inst.h" -#include "scale_inst.h" -#include "embed_inst.h" -#include "softmax_inst.h" #include "split_inst.h" -#include "program_dump_graph.h" -#include "upsampling_inst.h" -#include "eltwise_inst.h" -#include "fully_connected_inst.h" -#include "mvn_inst.h" -#include "lstm_inst.h" -#include "lstm_gemm_inst.h" -#include "lstm_elt_inst.h" -#include "embed_inst.h" -#include "network_impl.h" -#include "kernel_selector_helper.h" -#include "sliding_window_utils.h" -#include "error_handler.h" +#include "gpu/ocl_toolkit.h" #include #include @@ -72,214 +55,55 @@ #include #include #include +#include +program_impl::program_impl(engine_impl& engine_ref, topology_impl const& topology, build_options const& options, bool is_internal, bool no_optimizations) + : engine(&engine_ref), options(options), processing_order(* new nodes_ordering), pm(std::unique_ptr(new pass_manager())) +{ + set_options(); + prepare_nodes(topology); + if (no_optimizations) + init_graph(); + else + build_program(is_internal); +} -namespace { - - //helper function for selecting function basing on the type of the given primitive - //this is the termination case for parameter pack recurrence, see overload below for logic - template - void do_for_types(program_node&) - { - return; - } - - //helper function for selecting function basing on the type of the given primitive - //this function should be explicitly given set of types and implicitly set of functions. - //both sets should have equal size. First function will be called if type of the given primitive - //will match first explicitly given type, second will be called if it matches second explicitly given - //type etc. - //Functions given as arguments should themselves take std::shared_ptr as argument - //where T is the type that should be match if this function should be called - // - //example: - // do_for_types< - // convolution, - // pooling - // >(primitive, - // [](typed_program_node&){ do something if 'primitive' is a convolution }, - // [](typed_program_node&) { do something if 'primitive' is a pooling } - // ); - template - decltype(static_cast(std::declval()(std::declval&>()))) do_for_types( - program_node& node, - Func const& func, - RestOfFuncs const&... 
rest) - { - if (node.type() == T::type_id()) - func(node.as()); - else - do_for_types(node, rest...); - } - - template - struct single_element_container - { - single_element_container(T& t) : elem(&t) - {} - - constexpr size_t size() const { return 1; } - single_element_container begin() const { return single_element_container(elem); } - single_element_container end() const { return single_element_container(nullptr); } - single_element_container& operator ++() { elem = nullptr; return *this; } - bool operator !=(single_element_container const& sec) { return elem != sec.elem; } - - T operator *() { return *elem; } - - private: - single_element_container(T* t) : elem(t) - {} - - T* elem; - }; - - //helper function which creates single-element array if it's given anything - //other than std::vector. - //It should be used in generic code when there's a need to force vector usage - //in foreach loop over variable which can in one context be a vector or a scalar - //in another. - //example: - // T t; - // for (auto& string : wrap_if_single(t.dump())) - //depending on type T, t.dump() may return either std::string or std::vector, - //to ensure compatibility between these cases, wrap_if_single will create single-element - //container in case t.dump() would return plain std::string. - // - // T& case -> returns container which holds T& - template - single_element_container wrap_if_single(T& t) - { - return single_element_container(t); - } - - //helper function which creates single-element array if it's given anything - //other than std::vector. - // T const& case -> returns container which holds T const& - template - single_element_container wrap_if_single(T const& t) - { - return single_element_container(t); - } - - //helper function which creates single-element array if it's given anything - //other than std::vector. - // T&& case -> returns container which holds new instance of T created by moving given param - template - single_element_container wrap_if_single(T&& t) - { - static_assert(meta::always_false::value, "Wrapping temporary object into single_element_container is an error (requires valid reference)"); - return single_element_container(t); - } - - //helper function which creates single-element array if it's given anything - //other than std::vector. 
- // std::vector case -> does not wrap, returns t as-is - primitive::fixed_size_vector_ref const& wrap_if_single(primitive::fixed_size_vector_ref const& t) - { - return t; - } - - //helper function for merging the weights/biases buffers on cpu side for depthwise separable convolution optimization - void merge_buffers(engine_impl::ptr engine, program_node &node, layout target_layout, size_t begin_offset, size_t end_offset) - { - memory_impl::ptr data_to_allocate = engine->allocate_memory(target_layout); - - for (size_t i = begin_offset; i < end_offset; i++) - { - auto& weights = node.get_dependency(i).as(); - mem_lock src{ weights.get_attached_memory() }; - mem_lock dst{ data_to_allocate }; - std::copy(src.begin(), src.end(), dst.begin() + (i - begin_offset)*src.size()); - } - - for (size_t i = 0; i < end_offset - begin_offset - 1; i++) - node.remove_dependency(begin_offset + 1); +program_impl::program_impl(engine_impl& engine_ref, std::set> const& nodes, build_options const& options, bool is_internal) + : engine(&engine_ref), options(options), processing_order(*new nodes_ordering), pm(std::unique_ptr(new pass_manager())) +{ + set_options(); + prepare_nodes(nodes); + build_program(is_internal); +} - auto& data_node = node.get_dependency(begin_offset).as(); - data_node.attach_memory(*data_to_allocate, false); - } +program_impl::~program_impl() = default; - //helper function for getting target layout used in depthwise sep optimization - layout get_weights_layout(typed_program_node &data_node, int32_t split) +program_node& program_impl::get_node(primitive_id const& id) +{ + try { - auto mem_layout = data_node.get_output_layout(); - - return layout(mem_layout.data_type, mem_layout.format, { split * mem_layout.size.batch[0], mem_layout.size.feature[0], mem_layout.size.spatial[0], mem_layout.size.spatial[1] }); + return *nodes_map.at(id); } - - // pair.first tells whether l1 and l2 are absolutely identical - // pair.second tells whether l1 and l2 can be reinterpreted to each other without need of reordering - // note: layouts can only be considered identical if data size described by both layouts match (so no data are genereted nor dropped) - // note: if layouts describe two buffers with different size, consider them not to be identical even if smaller buffer can be considered to hold subsequence of larger buffer, - // this behavior is required to force buffer allocation for smaller buffer which, currently, should always be performed - std::pair are_layouts_identical(layout const& l1, layout const& l2) + catch (...) 
{ - if (l1 == l2) - return{ true, true }; - if (l1.data_type != l2.data_type) - return{ false, false }; - if (l1.size != l2.size) - return{ false, false }; - if (l1.get_linear_size() != l2.get_linear_size()) - return{ false, false }; - if ((l1.format == format::bf8_xy16 && l2.format != format::bf8_xy16) || - (l2.format == format::bf8_xy16 && l1.format != format::bf8_xy16)) - return{ false, false }; - - auto l1_pitch = l1.get_pitches(); - auto l2_pitch = l2.get_pitches(); - - //ignore pitches which will never be used (for dims with size == 1) - for (size_t i = 0; i < CLDNN_TENSOR_DIM_MAX; ++i) - if (l1.size.raw[i] == 1) - l1_pitch.raw[i] = 0; - for (size_t i = 0; i < CLDNN_TENSOR_DIM_MAX; ++i) - if (l2.size.raw[i] == 1) - l2_pitch.raw[i] = 0; - - auto l1_offset = l1.get_linear_offset(); - auto l2_offset = l2.get_linear_offset(); - if (l1_pitch == l2_pitch && l1_offset == l2_offset) - return{ false, true }; - - return{ false, false }; + throw std::runtime_error("Program doesn't contain primitive node: " + id); } } -program_impl::program_impl(engine_impl& engine_ref, topology_impl const& topology, build_options const& options, bool is_internal) - : engine(&engine_ref), options(options), output_size_handling_enabled(true) +program_node const& program_impl::get_node(primitive_id const& id) const { - static std::atomic id_gen{ 0 }; - prog_id = ++id_gen; - assert(prog_id != 0); - - if ((options.get()->config.mode == tuning_mode::tuning_tune_and_cache) && - !engine->configuration().enable_profiling) + try { - throw std::invalid_argument("Engine must be created with profiling enabled in tune_and_cache mode!"); + return *nodes_map.at(id); } - - init_graph(topology); - pre_optimize_graph(); - compile_graph(); - post_optimize_graph(); - - engine->compile_program(*this); - this->dump_program("13_finished", true); - - //Makes serialization with given name. - //Placeholder, not working yet, in progress. - auto serialization_network_name = get_serialization_network_name(options); - if (!serialization_network_name.empty() && !is_internal) + catch (...) { - this->serialize(serialization_network_name); + throw std::runtime_error("Program doesn't contain primitive node: " + id); } - - cleanup(); } // TODO: Remove once we will get full support for input/output padding in all primitive implementations. -void program_impl::analyze_output_size_handling_need() +bool program_impl::analyze_output_size_handling_need() { bool handling_needed = false; @@ -344,72 +168,181 @@ void program_impl::analyze_output_size_handling_need() } } - output_size_handling_enabled = handling_needed; + return handling_needed; } -std::list> program_impl::get_nodes() const +// create new nodes for a program based on the set of nodes +// method created to be used by propagate_constants to build a sub-program from constant nodes +void program_impl::prepare_nodes(std::set>const &nodes) { - std::list> ret; - - for (auto& node : processing_order) - ret.push_back(nodes_map.at(node->id())); - return ret; + for (const auto& itr : nodes) + { + if (itr.get()->is_type()) + { + get_or_create( + std::make_shared(itr.get()->id(), itr.get()->as().get_primitive()->mem.get_layout()) + ); + } + else + { + get_or_create(itr->desc); + } + } + for (const auto& node : nodes_map) + { + auto node_ptr = node.second; + if (node_ptr == nullptr) + throw error("NULL pointer in nodes_map.", CLDNN_ERROR); + //ToDo: avoid O(n^2) run time here (pass map instead of set?)
+ bool found = false; + for (const auto& src_node : nodes) + { + if (src_node == nullptr) + throw error("NULL pointer in nodes_map.", CLDNN_ERROR); + if (node.first == src_node->get_primitive()->id) + { + copy_node_dependencies(node_ptr.get(), src_node.get()); + found = true; + break; + } + } + if (!found) + { + add_node_dependencies(node_ptr.get()); + } + if (node_ptr->dependencies.size() == 0) + inputs.push_back(node_ptr.get()); + } } -void program_impl::init_graph(topology_impl const& topology) +// create all nodes from topology primitives, add dependencies among them and create inputs list +void program_impl::prepare_nodes(topology_impl const &topology) { auto const& topo_map = topology.get_primitives(); - for (auto const& prim : topo_map) + for (const auto& prim : topo_map) { - auto& n = get_or_create(prim.second); - inputs.push_back(&n); + get_or_create(prim.second); + } + add_split_outputs(); + for (const auto& node : nodes_map) + { + auto node_ptr = node.second.get(); + if (node_ptr == nullptr) + throw error("NULL pointer in nodes_map.", CLDNN_ERROR); + add_node_dependencies(node_ptr); + if (node_ptr->dependencies.size()==0) + { + inputs.push_back(node_ptr); + } } - replace_nodes_pre(); +} - for (auto itr = inputs.begin(); itr != inputs.end(); ) +// add node's dependencies from its primitive dependencies +void program_impl::add_node_dependencies(program_node* node) +{ + auto deps = node->get_primitive()->dependencies(); + //add pointers to node's dependencies + for (auto& dep : deps) { - auto node_itr = itr++; - auto& node = (*node_itr); - auto deps = node->get_primitive()->dependencies(); - if (deps.empty()) - continue; + try { + auto dep_node = nodes_map.at(dep); + node->dependencies.push_back(dep_node.get()); + dep_node->users.push_back(node); + } + catch (...) { + throw std::runtime_error("Program doesn't contain primitive: " + dep + + " that is input to: " + node->get_primitive()->id); + } + } +} - //add pointers to node's dependencies - for (auto& dep : deps) - { - try { - auto dep_node = nodes_map.at(dep); - node->dependencies.push_back(dep_node.get()); - dep_node->users.push_back(node); - } - catch (...) { - throw std::runtime_error("Program doesn't contain primitive: " + dep + - " that is input to: " + node->get_primitive()->id); - } +/* helper method for program_impl constructor from list of nodes which + copies src_node dependencies to the destination node dest_node dependencies. + But only to those which appear in this program implementation's nodes_map */ +void program_impl::copy_node_dependencies(program_node* dest_node, program_node* src_node) +{ + if (dest_node->get_primitive()->id != src_node->get_primitive()->id) + { + throw std::runtime_error("Node " + src_node->get_primitive()->id + " and its copy " + dest_node->get_primitive()->id + " do not match."); + } + auto src_deps = src_node->get_dependencies(); + //add pointers to node's dependencies + for (auto& src_dep : src_deps) + { + // do not copy dependencies to nodes which do not belong to the new (subgraph) topology + if (nodes_map.find(src_dep->get_primitive()->id) == nodes_map.end()) continue; + + try { + auto dest_dep = nodes_map.at(src_dep->get_primitive()->id); + dest_node->dependencies.push_back(dest_dep.get()); + dest_dep->users.push_back(dest_node); } - - //primitive has dependencies so remove it from 'inputs' - inputs.erase(node_itr); + catch (...)
{ + throw std::runtime_error("Program doesn't contain primitive: " + src_dep->get_primitive()->id + + " that is input to: " + src_node->get_primitive()->id); + } + } +} + +void program_impl::set_options() +{ + static std::atomic id_gen{ 0 }; + prog_id = ++id_gen; + assert(prog_id != 0); - //primitive has dependencies so remove it from 'inputs' - inputs.erase(node_itr); + if ((options.get()->config.mode == tuning_mode::tuning_tune_and_cache) && + !engine->configuration().enable_profiling) + { + throw std::invalid_argument("Engine must be created with profiling enabled in tune_and_cache mode!"); } +} - replace_nodes_post(); - handle_lstm(); - set_outputs(); - calc_processing_order(); +void program_impl::build_program(bool is_internal) +{ + init_graph(); + { + pre_optimize_graph(is_internal); + } + run_graph_compilation(); + { + post_optimize_graph(is_internal); + } + engine->compile_program(*this); + this->dump_program("finished", true); + cleanup(); +} - dump_program("0_init", true); +void program_impl::init_graph() +{ + graph_initializations graph_initializations_pass; + pm->run(*this, graph_initializations_pass); + + calculate_prior_boxes calculate_prior_boxes_pass; + pm->run(*this, calculate_prior_boxes_pass); + + mark_nodes mark_nodes_pass; + pm->run(*this, mark_nodes_pass); +} - calc_prior_boxes(); dump_program("1_calculated_prior_boxes", true); - mark_constants(); - mark_data_flow(); - dump_program("2_analyzed_graph", true); +void program_impl::run_graph_compilation() { + compile_graph compile_graph_pass; + pm->run(*this, compile_graph_pass); } -void program_impl::pre_optimize_graph() +void program_impl::pre_optimize_graph(bool is_internal) { - trim_to_outputs(); dump_program("3_trimmed", true); - calculate_BFS_processing_order(); - analyze_output_size_handling_need(); + trim_to_outputs trim_pass; //trim to outputs + pm->run(*this, trim_pass); // ToDo remove hidden dependencies from trimm pass + + handle_input_padding handle_input_padding; // handle symmetric and asymmetric padding for input + pm->run(*this, handle_input_padding); + + add_reshape_to_primitives add_reshape_to_primitives_pass; // add reshape to input/parameters for some primitives + pm->run(*this, add_reshape_to_primitives_pass); + + processing_order.calculate_BFS_processing_order(); // this method makes sense only for OOOQ (out of order execution queue) + + bool output_size_handling_enabled = analyze_output_size_handling_need(); for (auto& node : processing_order) { if (!node->is_type() && !node->is_type()) @@ -418,56 +351,125 @@ void program_impl::pre_optimize_graph() if (options.get()->enabled()) { - prepare_primitive_fusing(); + prepare_primitive_fusing prepare_primitive_fusing_pass; + pm->run(*this, prepare_primitive_fusing_pass); + layout_optimizer lo(output_size_handling_enabled); - reorder_inputs(lo); - // this code should move to post compilation after kernel selector will support handling reorder bias - pre_optimize_bias(lo); - dump_program("4_reordered_inputs", true); + reorder_inputs reorder_inputs_pass(lo); + pm->run(*this, reorder_inputs_pass); + + // this code should be moved to post compilation after kernel selector will support handling reorder bias + pre_optimize_bias pre_optimize_bias_pass(lo); + pm->run(*this, pre_optimize_bias_pass); + + // passes regarding conv + eltwise optimizations + + // shrinking eltwise if users are conv 1x1 with stride > 1 optimization + eltwise_shrinking eltwise_shrinking_pass; + pm->run(*this, eltwise_shrinking_pass); + + // trying to set stride to 1x1 by shrinking 
convolutions before eltwise if doable + eltwise_remove_stride eltwise_remove_stride_pass; + pm->run(*this, eltwise_remove_stride_pass); + + prepare_conv_eltw_fusing prepare_conv_eltw_fusing_pass; + pm->run(*this, prepare_conv_eltw_fusing_pass); + + prepare_conv_eltw_read_write_opt prepare_conv_eltw_read_write_opt_pass; + pm->run(*this, prepare_conv_eltw_read_write_opt_pass); } handle_reshape(); - remove_redundant_reorders(); dump_program("5_removed_redundant_reorders", true); - prepare_padding(); - prepare_depthwise_sep_opt(); - propagate_constants(); dump_program("6_propagated_constants", true); + remove_redundant_reorders remove_redundant_reorders_pass; + pm->run(*this, remove_redundant_reorders_pass); + + prepare_padding prepare_padding_pass(output_size_handling_enabled); + pm->run(*this, prepare_padding_pass); + + prepare_depthwise_sep_opt prepare_depthwise_sep_opt_pass; + pm->run(*this, prepare_depthwise_sep_opt_pass); + + if (!is_internal) + { + propagate_constants propagate_constants_pass; // ToDo remove hidden dependencies from propagate_constants pass + pm->run(*this, propagate_constants_pass); + } //try to fuse buffers (i.e. depth_concat in bfyx format) after padding calculations if (options.get()->enabled()) { - prepare_buffer_fusing(); + prepare_buffer_fusing prepare_buffer_fusing_pass; + pm->run(*this, prepare_buffer_fusing_pass); } - dump_program("7_pre_optimized", true); + //check if there exist any layout incompatibilities and add a reorder node if required + add_required_reorders add_required_reorders_pass; + pm->run(*this, add_required_reorders_pass); } -void program_impl::compile_graph() +void program_impl::post_optimize_graph(bool is_internal) { - for (auto& node : processing_order) + layout_optimizer lo; + post_optimize_weights post_optimize_weights_pass(lo); + pm->run(*this, post_optimize_weights_pass); + + remove_redundant_reorders remove_redundant_reorders_pass; + pm->run(*this, remove_redundant_reorders_pass); //TODO: do we need it at this place also? + + if (!is_internal) { - if (!node->is_type() && !node->is_type()) - { - node->get_output_layout(); - if (!node->is_type() && !(node->is_type() && node->get_dependencies().empty())) - node->selected_impl = node->type()->choose_impl(*engine, *node); - } + propagate_constants propagate_constants_pass; // ToDo remove hidden dependencies from propagate_constants pass + pm->run(*this, propagate_constants_pass); } - dump_program("8_compiled", true); + prep_opt_depthwise_sep_post prep_opt_depthwise_sep_post_pass; + pm->run(*this, prep_opt_depthwise_sep_post_pass); + + prepare_memory_dependencies(); } -void program_impl::post_optimize_graph() +// mark if the node is constant assuming that all dependencies are marked properly +void program_impl::mark_if_constant(program_node& node) { - layout_optimizer lo; - post_optimize_weights(lo); dump_program("9_reordered_weights", true); - remove_redundant_reorders(); dump_program("10_removed_redundant_reorders", true); //TODO: do we need it at this place also?
- propagate_constants(); dump_program("11_propagated_constants", true); - prep_opt_depthwise_sep_post(); - update_processing_numbers(); dump_program("12_validated_processing_order", true); - prepare_memory_dependencies(); + if (node.get_dependencies().empty()) + return; + if (node.is_type()) + return; + node.constant = true; + for (auto& dep : node.get_dependencies()) + { + if (!dep->constant) + { + node.constant = false; + break; + } + } } +// mark if the node is in data flow assuming that all dependencies are marked properly +void program_impl::mark_if_data_flow(program_node& node) +{ + if (node.is_type() || node.is_type()) + { + node.data_flow = true; + } + else + { + node.data_flow = false; + size_t inputs_count = node.get_dependencies().size(); + if (node.is_type() || node.is_type()) + inputs_count = 2; //ignore third input as it is related to prior boxes (i.e. concat of prior-boxes) + for (size_t idx = 0; idx < inputs_count; idx++) + { + if (node.get_dependency(idx).is_in_data_flow()) + { + node.data_flow = true; + return; + } + } + } +} void program_impl::cleanup() { for (auto& node : processing_order) @@ -488,21 +490,13 @@ void program_impl::cleanup() } } -std::string get_id_string(size_t i) { - std::stringstream ss; - ss << std::setw(5) << std::setfill('0') << i; - return ss.str(); -} - -void program_impl::replace_nodes_pre() -{ +void program_impl::add_split_outputs() { auto itr = nodes_map.begin(); while (itr != nodes_map.end()) { auto node_itr = itr++; auto& node = (*node_itr).second; - //find split primitives and create crop primitives out of them if (node->is_type()) { auto split_prim = node->as().typed_desc(); @@ -522,2175 +516,339 @@ void program_impl::replace_nodes_pre() } } +program_impl::nodes_ordering& program_impl::get_processing_order() +{ + return processing_order; +} -void program_impl::replace_nodes_post() +const program_impl::nodes_ordering& program_impl::get_processing_order() const { - auto itr = nodes_map.begin(); //note we need to use iterators since currently processed element can be removed - while (itr != nodes_map.end()) - { - auto node_itr = itr++; - auto& node = (*node_itr).second; + return processing_order; +} - //find split primitives and create crop primitives out of them - if (node->is_type()) +void add_memory_dependency(program_node* node, program_node* dep) +{ + if (node->can_be_optimized() || + !dep->can_be_optimized()) + { + node->add_memory_dependency(dep->id()); + } + else + { + if (node->id() == dep->id()) + { + return; + } + for (auto subdep : dep->get_dependencies()) { - //check if split is not used by any primitive, as it will be optimized - if (node->get_users().size() != 0) - throw std::logic_error("Split layer cannot be used directly! 
Please use split output \"" + node->id() + ":\"!"); + add_memory_dependency(node, subdep); + add_memory_dependency(subdep, node); + } + } +} - //get_output size and validate split primitive inputs - auto output_layout = node->get_output_layout(); - auto output_layout_size = output_layout.size; +void program_impl::basic_memory_dependencies() +{ + auto itr = processing_order.begin(); + std::vector past_outputs; + while (itr != processing_order.end()) + { + auto& node = *itr; + itr++; - auto split_prim = node->as().typed_desc(); - primitive_id input_id = split_prim->input[0]; - auto split_num = split_prim->output_offsets.size(); + //data primitive can't be reused + if (node->is_type()) + continue; - //create crop for each split ouptut provided - for (decltype(split_num) i = 0; i < split_num; i++) - { - primitive_id output_id = node->id() + ":" + split_prim->output_ids[i]; + // add my dependencies to restriction list (can't share input/output buffers) + for (auto it : node->get_dependencies()) + { + add_memory_dependency(node, it); + add_memory_dependency(it, node); + } - auto node_ptr = nodes_map.find(output_id)->second; + // Note we iterate over the processing order; if a primitive has a processing num greater than that of an output, that output + // has to land on the primitive's restriction list. Otherwise memory reuse can corrupt final results. + node->add_memory_dependency(past_outputs); + // if the current node is an output, add it to the outputs list after the restriction. + if (node->is_output()) + past_outputs.push_back(node->id()); + } +} - //calculate crop reference input size - tensor reference_input_size; - for (decltype(split_num) j = 0; j < i; j++) - reference_input_size += split_prim->output_offsets[j + 1] - split_prim->output_offsets[j]; +void program_impl::skipped_branch_memory_dependencies() +{ + // Primitive A can't use primitive B's buffer if processing_num(B) < processing_num(A) and there is a user usr of B with processing_num(usr) > processing_num(A). + // Otherwise it could override data that has to be used in the future. 
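The rule in the comment above is easiest to see on a toy order A, B, C with an edge from B to a node D that executes after C: C must not reuse B's buffer, because B's result is still live while C runs. A self-contained sketch of that check, using the same quadratic formulation as the old code that this patch removes further down (toy node type, not the real program_node):

    #include <set>
    #include <string>
    #include <vector>

    struct toy_node
    {
        std::string id;
        int processing_num = 0;          // position in the processing order
        std::vector<toy_node*> users;    // nodes consuming this node's output
        std::set<std::string> mem_deps;  // buffers this node must not share
    };

    // B's buffer is off-limits for A when B runs before A but one of B's
    // users runs after A, i.e. B's data is still live while A executes
    void skipped_branch_deps(std::vector<toy_node*>& order)
    {
        for (auto* a : order)
            for (auto* b : order)
            {
                if (b->processing_num >= a->processing_num)
                    continue;
                for (auto* usr : b->users)
                    if (usr->processing_num > a->processing_num)
                    {
                        a->mem_deps.insert(b->id);
                        b->mem_deps.insert(a->id);
                        break;
                    }
            }
    }

The new implementation below avoids the full pairwise scan: it finds only the last user of each node and forbids sharing for every node processed between the two, which is what the itrA/lastUsr loop does.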
+ auto itrB = processing_order.begin(); + while (itrB != processing_order.end()) + { + auto& nodeB = *itrB; + auto itrA = ++itrB; + if (nodeB->get_users().size()==0) + continue; - for (decltype(split_num) j = i; j < split_num - 1; j++) - reference_input_size += split_prim->output_offsets[j + 1] - split_prim->output_offsets[j]; - - reference_input_size = output_layout_size - reference_input_size; - - //update crop primitive and add connections - node_ptr->set_output_padding(output_layout.data_padding); - auto crop_prim = node_ptr->as().typed_desc(); - crop_prim->reference_input = reference_input_size; - - add_connection(node->get_dependency(0), *node_ptr); - } - - //remove input->split connection and remove original split node - remove_connection(node->get_dependency(0), *node); - optimized_out.push_back(node->id()); - nodes_map.erase(node->id()); - continue; - } - - //find upsampling primitives with bilinear filtering and create deconvolution with proper weights instead - if (node->is_type()) - { - auto upsampling_prim = node->as().typed_desc(); - - if (upsampling_prim->sample_type != upsampling_sample_type::bilinear) - continue; - - //check if num_filter is not 0 (required for bilinear upsampling) - if (upsampling_prim->num_filter == 0) - throw std::logic_error("num_filter in upsampling cannot be 0 in bilinear filtering mode in \"" + node->id() + "\"!"); - - primitive_id upsampling_id = node->id(); - auto& input_node = node->get_dependency(0); - - primitive_id input_id = upsampling_prim->input[0]; - auto num_filter = upsampling_prim->num_filter; - - //setting deconvolution parameters based on upsampling input - auto scale = static_cast(upsampling_prim->scale); - tensor stride(1, 1, scale, scale); - auto offset = static_cast(std::ceil((scale - 1) / 2.f)); - tensor input_offset(0, 0, -offset, -offset); - - //setting weights for deconvolution - auto kernel_size = static_cast((2 * scale) - (scale % 2)); - layout weights_layout(data_types::f32, format::bfyx, tensor(1, 1, kernel_size, kernel_size)); - - std::vector weights_vec; - for (uint32_t weights_idx = 0; weights_idx < num_filter; weights_idx++) - { - memory_impl::ptr data_to_allocate = engine->allocate_memory(weights_layout); - mem_lock dst{ data_to_allocate }; - float *dst_data = dst.data(); - //initialize with bilinear weights data - auto f = static_cast(std::ceil(kernel_size / 2.0f)); - float c = (2 * f - 1 - f % 2) / (2.f * f); - float x = 0.f; - float y = 0.f; - for (size_t i = 0; i < weights_layout.count(); ++i) { - x = static_cast(i % kernel_size); - y = static_cast((i / kernel_size) % kernel_size); - dst_data[i] = (1 - std::abs(x / f - c)) * (1 - std::abs(y / f - c)); - } - - //create weights primitive, with dummy memory which will be replaced in firther step - primitive_id weights_id = upsampling_id + "_deconvolution_weights" + std::to_string(weights_idx); - layout dummy_layout(data_types::f32, format::bfyx, tensor(1, 1, 1, 1)); - float zero = 0.f; - auto weights_prim = std::make_shared(weights_id, memory::attach(dummy_layout, &zero, 1)); - get_or_create(weights_prim); - - weights_vec.push_back(weights_id); - - auto weights_node_ptr = nodes_map.find(weights_id)->second; - - //attach weights buffer - auto& data_node = weights_node_ptr->as(); - data_node.attach_memory(*data_to_allocate, false); - } - - //remove upsampling node, rename it and move to the optimized list - remove_connection(node->get_dependency(0), *node); - auto rename_id = upsampling_id + "_tmp"; - rename(*node, rename_id); - - //create deconvolution primitive - 
auto deconv_prim = std::make_shared(upsampling_id, input_id, weights_vec, stride, input_offset); - get_or_create(deconv_prim); - - auto deconv_node_ptr = nodes_map.find(upsampling_id)->second; - - auto upsampling_node_ptr = nodes_map.find(rename_id)->second; - replace_all_usages(*upsampling_node_ptr, *deconv_node_ptr); - optimized_out.push_back(rename_id); - nodes_map.erase(rename_id); - - //add connections input->deconvolution and weights->deconvolution - add_connection(input_node, *deconv_node_ptr); - - for (uint32_t weights_idx = 0; weights_idx < num_filter; weights_idx++) - { - auto weights_node_ptr = nodes_map.find(weights_vec[weights_idx])->second; - add_connection(*weights_node_ptr, *deconv_node_ptr); - } - continue; - } - - //find deconvolution primitives with stride 1 and change them to convolution with trasposed weights - if (node->is_type()) - { - if (!options.get()->enabled()) - continue; - - auto deconv_prim = node->as().typed_desc(); - - //limit optimization to stride = 1 - if (deconv_prim->stride.spatial[0] != 1 || deconv_prim->stride.spatial[1] != 1 || deconv_prim->gradient()) - continue; - - primitive_id deconv_id = node->id(); - auto& input_node = node->get_dependency(0); - - primitive_id input_id = deconv_prim->input[0]; - - //setting convolution parameters based on deconvolution params - auto stride = deconv_prim->stride; - auto weights = deconv_prim->weights; - std::vector weights_vec; - for (auto& weights_id : weights) - weights_vec.push_back(weights_id); - auto biases = deconv_prim->bias; - std::vector bias_vec; - for (auto& bias_id : biases) - bias_vec.push_back(bias_id); - auto input_offset = deconv_prim->input_offset; - auto with_activation = deconv_prim->with_activation; - auto activation_negative_slope = deconv_prim->activation_negative_slope; - auto output_padding = deconv_prim->output_padding; - - //remove deconvolution node and its connections to weights and biases, rename it and move to the optimized list - tensor filter_size = { 1, 1, 1, 1 }; - remove_connection(node->get_dependency(0), *node); - for (auto& weights_id : weights_vec) - { - auto weights_node_ptr = nodes_map.find(weights_id)->second; - remove_connection(*weights_node_ptr, *node); - //get filter spatial sizes for input offset adjustment, perform this only once as all filters shouls have same size - if (weights_id == weights_vec[0]) - filter_size = weights_node_ptr->get_output_layout().size; - } - - input_offset.spatial[0] = std::abs(input_offset.spatial[0]) - (filter_size.spatial[0] - 1); - input_offset.spatial[1] = std::abs(input_offset.spatial[1]) - (filter_size.spatial[1] - 1); - - if (!bias_vec.empty()) - { - for (auto& bias_id : bias_vec) - { - auto bias_id_node_ptr = nodes_map.find(bias_id)->second; - remove_connection(*bias_id_node_ptr, *node); - } - } - auto rename_id = deconv_id + "_tmp"; - rename(*node, rename_id); - - //create convolution primitive - if (biases.size() != 0) - { - auto conv_prim = std::make_shared(deconv_id, input_id, weights_vec, bias_vec, - stride, input_offset, tensor{ 1, 1, 1, 1 }, with_activation, activation_negative_slope, output_padding); - get_or_create(conv_prim); - } - else - { - auto conv_prim = std::make_shared(deconv_id, input_id, weights_vec, - stride, input_offset, tensor{ 1, 1, 1, 1 }, with_activation, activation_negative_slope, output_padding); - get_or_create(conv_prim); - } - - auto conv_node_ptr = nodes_map.find(deconv_id)->second; - auto conv_node = &conv_node_ptr->as(); - conv_node->set_transposed(true); - - //add connections input->convolution, 
weights->convolution and bias->convolution - add_connection(input_node, *conv_node_ptr); - - for (auto& weights_id : weights_vec) - { - auto weights_node_ptr = nodes_map.find(weights_id)->second; - add_connection(*weights_node_ptr, *conv_node_ptr); - } - - if (!bias_vec.empty()) - { - for (auto& bias_id : bias_vec) - { - auto bias_id_node_ptr = nodes_map.find(bias_id)->second; - add_connection(*bias_id_node_ptr, *conv_node_ptr); - } - } - - auto deconv_node_ptr = nodes_map.find(rename_id)->second; - replace_all_usages(*deconv_node_ptr, *conv_node_ptr); - optimized_out.push_back(rename_id); - nodes_map.erase(rename_id); - - continue; - } - } -} - -void program_impl::handle_lstm() -{ - bool has_lstm_children; - auto itr = nodes_map.begin(); //note we need to use iterators since currently processed element can be removed - while (itr != nodes_map.end()) - { - auto node_itr = itr++; - auto& node = (*node_itr).second; - has_lstm_children = false; - // replace lstm node with lstm_gemm and lstm_elt nodes - if (node->is_type()) { - bool initial_hidden_term = node->as().initial_hidden_term(); - bool initial_cell_term = node->as().initial_cell_term(); - bool bias_term = node->as().bias_term(); - auto lstm_prim = node->as().typed_desc(); - primitive_id weights_id = lstm_prim->weights; - primitive_id recurrent_id = lstm_prim->recurrent; - primitive_id bias_id = bias_term ? lstm_prim->bias : ""; - primitive_id initial_hidden_id = initial_hidden_term ? lstm_prim->initial_hidden : ""; - primitive_id initial_cell_id = initial_cell_term ? lstm_prim->initial_cell : ""; - //removing connection with weights to get proper dependency order for next operations - remove_connection(*nodes_map.at(weights_id), *node); - remove_connection(*nodes_map.at(recurrent_id), *node); - if (bias_term) - remove_connection(*nodes_map.at(bias_id), *node); - if (initial_hidden_term) - remove_connection(*nodes_map.at(initial_hidden_id), *node); - if (initial_cell_term) - remove_connection(*nodes_map.at(initial_cell_id), *node); - - //calculating sizes - auto input_size = node->get_dependency(0).get_output_layout().size; - auto recurrent_size = nodes_map.at(recurrent_id)->get_output_layout().size; - auto hidden_size = tensor(input_size.batch[0], 1, recurrent_size.spatial[0], input_size.feature[0]); - size_t directions = recurrent_size.feature[0]; - size_t input_dependencies = node->get_dependencies().size(); - size_t sequence_len = node->as().sequence_len(); - - //if the sequence has a single element but it has multiple inputs then - //the parent of this lstm is an lstm node. If this is a bidirectional lstm - //then the sequence length is the number of dependencies divided by 2. - if (sequence_len == 1 && input_dependencies > 1) - sequence_len = (directions == 1) ? 
input_dependencies : input_dependencies / 2; - - //check if this lstm node has an lstm child - for (auto& user : node->get_users()) - { - if (user->is_type()) - { - has_lstm_children = true; - } - } - - std::vector cell_list(directions * sequence_len); - std::vector concat_depends(directions * sequence_len); - std::vector output_ids_offsets(directions * sequence_len); - - primitive_id hidden_fwd_id = initial_hidden_id; - primitive_id hidden_bwd_id = initial_hidden_id; - primitive_id cell_fwd_id = initial_cell_id; - primitive_id cell_bwd_id = initial_cell_id; - - auto split_direction = [&](const std::string gate, bool initial_term, primitive_id& fwd_id, primitive_id& bwd_id) { - if (initial_term) { - primitive_id initial_id = fwd_id; - fwd_id = node->id() + ":" + gate + "_fwd"; - auto fwd_node = std::make_shared(fwd_id, initial_id, hidden_size, tensor{ 0,0,0,0 }); - auto &n1 = get_or_create(fwd_node); - add_connection(*nodes_map.at(initial_id), n1); - bwd_id = node->id() + ":" + gate + "_bwd"; - auto bwd_node = std::make_shared(bwd_id, initial_id, hidden_size, tensor{ 0,1,0,0 }); - auto &n2 = get_or_create(bwd_node); - add_connection(*nodes_map.at(initial_id), n2); - } - }; - - //if bidirectional lstm then initial_hidden and initial_cell terms need to be split - if (directions > 1) { - split_direction("hidden", initial_hidden_term, hidden_fwd_id, hidden_bwd_id); - split_direction("cell", initial_cell_term, cell_fwd_id, cell_bwd_id); - } - - //lstm expanding - for (size_t dir = 0; dir < directions; ++dir) { - auto hidden_id = dir == 0 ? hidden_fwd_id : hidden_bwd_id; - auto cell_id = dir == 0 ? cell_fwd_id : cell_bwd_id; - for (size_t i = 0; i < sequence_len; ++i) { - size_t idx = i + dir * sequence_len; - primitive_id lstm_gemm_id = node->id() + ":lstm_gemm" + get_id_string(idx); - primitive_id lstm_elt_id = node->id() + ":lstm_elt" + get_id_string(idx); - primitive_id crop_id = node->id() + ":crop" + get_id_string(idx); - - size_t input_idx = i; - //for bidirectional lstms, if first LSTM layer then reverse input - //for subsequent stacked layers the input is strided on the dir dimension - if (directions > 0) { - if (input_dependencies > sequence_len) { // stacked layer - input_idx = dir * sequence_len + i; - } - else - { - if (dir > 0) { // first layer - input_idx = sequence_len - i - 1; - } - } - } - primitive_id lstm_gemm_input_id = node->get_dependency(input_idx).get_org_primitive_id(); - - auto lstm_gemm_node = std::make_shared(lstm_gemm_id, lstm_gemm_input_id, weights_id, recurrent_id, bias_id, hidden_id, (uint32_t)dir); - auto &n1 = get_or_create(lstm_gemm_node); - - auto lstm_elt_node = std::make_shared(lstm_elt_id, lstm_gemm_id, cell_id, lstm_prim->clip, lstm_prim->input_forget, - lstm_prim->activations, lstm_prim->activation_params, lstm_prim->offset_order); - auto &n2 = get_or_create(lstm_elt_node); - //adding lstm_elt as user - add_connection(n1, n2); - //adding dependecy to lstm_gemm node - //input - add_connection(node->get_dependency(input_idx), n1); - //adding weights and initial values to lstm_gemm - add_connection(*nodes_map.at(weights_id), n1); - add_connection(*nodes_map.at(recurrent_id), n1); - if (bias_term) - add_connection(*nodes_map.at(bias_id), n1); - - //adding cell and hiddens as dependencies - if (i > 0) - { - add_connection(*cell_list[size_t(i - 1) * directions + dir], n2); - add_connection(*(concat_depends[size_t(i - 1) * directions + dir]), n1); - } - //if initial values are present - else - { - if (initial_hidden_term) - 
add_connection(*nodes_map.at(hidden_id), n1); - if (initial_cell_term) - add_connection(*nodes_map.at(cell_id), n2); - } - - //lstm_hidden - hidden_id = crop_id + ":hidden"; - auto crop_hidden = std::make_shared(hidden_id, lstm_elt_id, hidden_size, tensor{ 0,0,0,0 }); - auto &n3 = get_or_create(crop_hidden); - //adding eltwise as dependency to hidden - add_connection(n2, n3); - - //if parent is lstm adding hiddens as dependency - if (has_lstm_children) - { - for (auto& user : node->get_users()) - { - add_connection(n3, *user); - } - } - concat_depends[i * directions + dir] = &n3; - - //lstm_cell - if (i < sequence_len - 1) { - cell_id = crop_id + ":cell"; - auto crop_cell = std::make_shared(cell_id, lstm_elt_id, hidden_size, tensor{ 0,1,0,0 }); - auto &n4 = get_or_create(crop_cell); - add_connection(n2, n4); - cell_list[i * directions + dir] = &n4; - } - output_ids_offsets[i * directions + dir] = hidden_id; - } - } - - //if there is no next lstm, concatenation is created - if (!has_lstm_children) - { - primitive_id original_id = node->id(); - primitive_id concatenation_id = original_id + ":concat"; - auto concatenation_primitive = std::make_shared(concatenation_id, output_ids_offsets, concatenation::along_f); - auto &concatenation_node = get_or_create(concatenation_primitive); - for (auto sub_dependency : concat_depends) - { - add_connection(*sub_dependency, concatenation_node); - } - if (directions == 2) { - // bidirectional support requires concatenations along the direction and sequence axis - // instead we can concatenate along the sequence axis and reshape the tensor to the account - // for the direction - tensor output_size {input_size.batch[0], (int32_t)sequence_len, hidden_size.spatial[0], (int32_t)directions}; - primitive_id reshape_id = original_id + ":reshape"; - auto reshape_primitive = std::make_shared(reshape_id, concatenation_id, output_size); - auto &reshape_node = get_or_create(reshape_primitive); - add_connection(concatenation_node, reshape_node); - for (auto& user : node->get_users()) - { - add_connection(reshape_node, *user); - } - } - } - - //removing expanded node - remove_all_connections(*node); - nodes_map.erase(node->id()); - continue; - } - } - -} - -void program_impl::set_outputs() -{ - auto outputs_option = options.get(); - if (!outputs_option->outputs.empty()) - { - for (auto const& output : outputs_option->outputs) - { - auto o_node = nodes_map.at(output); - o_node->set_output(true); - outputs.push_back(o_node.get()); - } - } - else - { - for (auto& node : nodes_map) - if (node.second->is_endpoint()) - { - node.second->set_output(true); - outputs.push_back(node.second.get()); - } - } -} - -void program_impl::calc_processing_order() -{ - processing_order.clear(); - - //run dfs to sort nodes topologically - for (auto input : inputs) - { - if (input->is_marked()) - continue; - - input->mark(); - std::list::const_iterator>> stack = { std::make_pair(input, input->users.begin()) }; - - while (!stack.empty()) //imitate call stack + // find the last user of B in processing order + auto itrUsr = nodeB->get_users().begin(); + auto lastUsr = itrUsr++; + while (itrUsr != nodeB->get_users().end()) { - new_frame: - auto& frame = stack.back(); - - while (frame.second != frame.first->users.end()) - { - auto successor = *frame.second; - ++frame.second; - - if (!successor->is_marked()) - { - successor->mark(); - - //recurrence call - stack.push_back(std::make_pair(successor, successor->users.begin())); - goto new_frame; - } - } - - //we have finished processing one node so add 
it to the processing queue - processing_order.push_front(frame.first); - frame.first->processing_itr = processing_order.begin(); - - //return from call - stack.pop_back(); + if (processing_order.get_processing_number(*lastUsr) < processing_order.get_processing_number(*itrUsr)) + lastUsr = itrUsr; + itrUsr++; } - } - - uint32_t idx = 0; - for (auto& node : processing_order) - { - node->processing_num = ++idx; - node->unmark(); - } -} - -void program_impl::update_processing_numbers() -{ - uint32_t idx = 0; - for (auto& node : processing_order) - { - node->processing_num = ++idx; - } - for (auto& node : processing_order) - { - if (!processing_order_is_correct(node)) + //mark all nodes in between B and lastUsr of B as forbidden to share buffer with B + while (itrA != processing_order.get_processing_iterator(**lastUsr)) { - CLDNN_ERROR_MESSAGE(node->id(), "Incorrect processing order"); - return; + auto& nodeA = *itrA; + itrA++; + add_memory_dependency(nodeA, nodeB); + add_memory_dependency(nodeB, nodeA); } } } -void program_impl::calc_prior_boxes() +void program_impl::oooq_memory_dependencies() { auto itr = processing_order.begin(); - while (itr != processing_order.end()) - { - auto& node = (*itr++); - if (!node->is_type()) - continue; - - auto& pb_node = node->as(); - - pb_node.calc_result(); - remove_connection(pb_node.input(), pb_node); - - auto& result = pb_node.get_result_buffer(); - result.add_ref(); // need to inc ref count since we will be assigning this memory as cldnn_memory in next line that is not ref_count_obj - auto cpp_mem = details::memory_c_to_cpp_converter::convert(api_cast(&result)); - - auto& data_node = get_or_create(std::make_shared("_cldnn_tmp_" + pb_node.id() + "_result", cpp_mem)); - replace(pb_node, data_node, false, false); - } -} - - + // This order lets us build dependencies based on syncing points. + // The set of nodes between two syncing points will be called a sync_region. + // The major rule is: a node can't share resources with nodes in its sync_region -void program_impl::mark_constants() -{ - for (auto& node : processing_order) + int32_t last_barrier = 0; + bool needs_barrier = false; + std::vector sync_region; + while (itr != processing_order.end()) { - if (node->dependencies.empty()) - continue; - if (node->is_type()) - continue; + auto& node = *itr; + itr++; - node->constant = true; - for (auto& dep : node->get_dependencies()) + // if any dependency has a processing num after the barrier -> needs a barrier + for (auto dep : node->get_dependencies()) { - if (!dep->constant) + if (processing_order.get_processing_number(dep) >= last_barrier) { - node->constant = false; + needs_barrier = true; break; } } - if (!node->constant) - for (auto& dep : node->get_dependencies()) - if (dep->constant) - dep->constant_frontier = true; - } -} - -void program_impl::mark_data_flow() -{ - std::list stack; - for (auto const& node : processing_order) - { - if ((node->is_endpoint() && !node->constant) || node->is_type()) + if (needs_barrier) { - stack.push_back(node); - node->data_flow = true; - node->mark(); - } - } - - while (!stack.empty()) - { - auto node = stack.front(); - stack.pop_front(); - - size_t dep_idx = 0; - size_t inputs_count = (node->is_type() ? node->get_dependencies().size() : node->get_primitive()->input.size()); - //TODO: remove this hack after addition of constants propagation pass - if (node->is_type() || node->is_type()) - inputs_count = 2; //ignore third input as it is related to prior boxes (i.e. 
concat of prior-boxes) + last_barrier = processing_order.get_processing_number(node); + needs_barrier = false; + // add a bi-directional dependency for each pair + for (auto nd1 = sync_region.begin(); nd1 + 1 != sync_region.end(); nd1++) + { + for (auto nd2 = nd1 + 1; nd2 != sync_region.end(); nd2++) + { + add_memory_dependency(*nd1, *nd2); + add_memory_dependency(*nd2, *nd1); + } + } - for (auto dep : node->get_dependencies()) - { - bool data_flow = (dep_idx < inputs_count && !dep->constant); - ++dep_idx; - if (!data_flow) - continue; + // collect dependencies of every node in sync region + std::vector deps; + for (auto& nd_in_region : sync_region) + for (auto& dep : nd_in_region->get_dependencies()) + deps.emplace_back(dep); - dep->data_flow = data_flow; - if (dep->is_marked()) - continue; + for (auto& nd_in_region : sync_region) + for (auto& dep : deps) + { + add_memory_dependency(nd_in_region, dep); + add_memory_dependency(dep, nd_in_region); + } - stack.push_back(dep); - dep->mark(); + sync_region.clear(); } - } - - for (auto& node : processing_order) - { - assert(!node->constant || !node->data_flow); //node which is constant cannot be marked as data flow - node->unmark(); + sync_region.push_back(node); } } -void program_impl::trim_to_outputs() +void program_impl::prepare_memory_dependencies() { - size_t actual_nodes = processing_order.size(); - if (!actual_nodes) //degenerated case but can happen - return; - - if (outputs.size() == actual_nodes) - return; - - //do backward bfs starting from all outputs - std::list*> stack = { &outputs }; - while (!stack.empty()) - { - auto nodes_list = stack.front(); - stack.pop_front(); - - for (auto node : *nodes_list) - { - if (!node->is_marked()) - { - node->mark(); - if (!node->get_dependencies().empty()) - stack.push_back(&node->get_dependencies()); - } - } - } - - //all not-marked nodes should be removed - std::list to_rem; - for (auto node : processing_order) - { - if (node->is_type()) //input layout may become disconnected during prior boxes calculations so it may have not been marked at this place but we don't want to remove it - node->mark(); - else if (!node->is_marked()) - to_rem.push_back(node); - } - - for (auto const& node : to_rem) - { - if (node->is_input()) - inputs.remove(node); - else - { - for (auto dep : node->dependencies) - if (dep->is_marked()) - dep->users.remove(node); - } - - for (auto user : node->users) - if (user->is_marked()) - user->dependencies.erase(std::remove(user->dependencies.begin(), user->dependencies.end(), node), user->dependencies.end()); - - optimized_out.push_back(node->id()); - nodes_map.erase(node->id()); - } -} -void add_memory_dependency(program_node* node, program_node* dep) -{ - if (node->can_be_optimized() || - !dep->can_be_optimized()) - { - node->add_memory_dependency(dep->id()); - } - else - { - if (node->id() == dep->id()) - { - return; - } - for (auto subdep : dep->get_dependencies()) - { - add_memory_dependency(node, subdep); - add_memory_dependency(subdep, node); - } - } -} -void program_impl::basic_memory_dependencies() -{ - auto itr = processing_order.begin(); - std::vector past_outputs; - while (itr != processing_order.end()) - { - auto& node = *itr; - itr++; - - //data primitive can't be reused - if (node->is_type()) - continue; - - // add my dependencies to restriction list (can't share input.output buffers) - for (auto it : node->get_dependencies()) - { - add_memory_dependency(node, it); - add_memory_dependency(it, node); - } - - // Note we iterate over processing order, it means if 
primitve has processing num greater than any of outputs, this output - // has to land on the primitve restriction list. Otherwise memory reuse can corrupt final results. - node->add_memory_dependency(past_outputs); - // if current node is an output add it to the outputs list after restriction. - if (node->is_output()) - past_outputs.push_back(node->id()); - } -} - -void program_impl::skipped_branch_memory_dependencies() -{ - auto itr = processing_order.begin(); - // Primitive A can't use primitive B buffer if B->processing_num < A->processing_num and any of B users processing_num > A->processing_num - // Otherwise it could override data that has to be used in the future. - // TODO: improve algorithm to to O(n*log(n)) - while (itr != processing_order.end()) - { - auto& node = *itr; - itr++; - auto itr2 = processing_order.begin(); - if (itr2 == itr) - continue; - while (itr2 != processing_order.end()) - { - auto& node2 = *itr2; - itr2++; - if (node2->get_processing_num() < node->get_processing_num()) - { - // if at least one user will be processed after 'node', node2 has to be added to forbiden list - for (auto usr : node2->get_users()) - { - if (usr->get_processing_num() > node->get_processing_num()) - { - add_memory_dependency(node, node2); - add_memory_dependency(node2, node); - break; - } - } - } - } - } -} - -void program_impl::oooq_memory_dependencies() -{ - auto itr = processing_order.begin(); - // This order let us build dependencies based on syncing points. - // Set of nodes between two syncing points will be called sync_region. - // Major rules is: can't share resource with nodes in my sync_region - - uint32_t last_barrier = 0; - bool needs_barrier = false; - std::vector sync_region; - while (itr != processing_order.end()) - { - auto& node = *itr; - itr++; - - // if any of dep has proccess num after barrier -> needs barrier - for (auto dep : node->get_dependencies()) - { - if (dep->get_processing_num() >= last_barrier) - { - needs_barrier = true; - break; - } - } - - if (needs_barrier) - { - last_barrier = node->get_processing_num(); - needs_barrier = false; - // add each pair bi-direction dependency - for (auto nd1 = sync_region.begin(); nd1 + 1 != sync_region.end(); nd1++) - { - for (auto nd2 = nd1 + 1; nd2 != sync_region.end(); nd2++) - { - add_memory_dependency(*nd1, *nd2); - add_memory_dependency(*nd2, *nd1); - } - } - - // collect dependencies of every node in sync region - std::vector deps; - for (auto& nd_in_region : sync_region) - for (auto& dep : nd_in_region->get_dependencies()) - deps.emplace_back(dep); - - - for (auto& nd_in_region : sync_region) - for (auto& dep : deps) - { - add_memory_dependency(nd_in_region, dep); - add_memory_dependency(dep, nd_in_region); - } - - sync_region.clear(); - } - sync_region.push_back(node); - } -} - -void program_impl::prepare_memory_dependencies() -{ - if (!get_engine().configuration().enable_memory_pool) - return; - - basic_memory_dependencies(); - skipped_branch_memory_dependencies(); - oooq_memory_dependencies(); -} - -std::string program_impl::get_memory_dependencies_string() const -{ - std::string mem_dep = "Memory dependencies/restrictions:\n"; - auto itr = processing_order.begin(); - while (itr != processing_order.end()) - { - auto& node = *itr; - itr++; - mem_dep = mem_dep.append("primitive: ").append(node->id()).append(" restricted list: "); - for (auto it : node->get_memory_dependencies()) - mem_dep == mem_dep.append(it).append(", "); - mem_dep = mem_dep.append("\n"); - } - return mem_dep; -} - -void 
program_impl::remove_redundant_reorders() -{ - auto itr = processing_order.begin(); //note we need to use iterators since currently processed element can be removed - while (itr != processing_order.end()) - { - auto& node = (*itr++); //post-inc to avoid invalidation due to possible erase - if (!node->is_type()) //only care for reorders - continue; - - program_node* current_node = node; - std::vector r_nodes_to_remove; - - auto optimize = true; - while (current_node) - { - auto& r_node = current_node->as(); - current_node = nullptr; - - if (r_node.has_mean() || !r_node.get_primitive()->subtract_per_feature.empty() || //do not optimize if mean of subtract are present - (r_node.is_output() && r_node.get_dependency(0).is_output())) //do not optimize when both reorder and layer before are outputs - { - optimize = false; - break; - } - - r_nodes_to_remove.push_back(&r_node); - - if (r_node.get_dependency(0).is_type() && r_node.get_dependencies().size() == 1 && r_node.get_users().size() == 1 && r_node.get_dependency(0).get_users().size() == 1) - current_node = &r_node.get_dependency(0); - } - if (!optimize) - continue; - - assert(node->dependencies.size() == 1 && "reorder without mean should have exactly one dependecy (input)"); - auto& r_output = r_nodes_to_remove.front(); - auto& r_input = r_nodes_to_remove.back()->get_dependency(0); - auto o_layout = r_output->get_output_layout(); - auto i_layout = r_input.get_output_layout(); - - auto ident = are_layouts_identical(o_layout, i_layout); - if (!ident.second) - continue; - - for (auto remove_reorder_node : r_nodes_to_remove) - { - auto& r_node = remove_reorder_node->as(); - - if (ident.first && ident.second && r_node.is_output() && r_node.get_dependency(0).is_input()) //do not optimize when reorder is output and layer before is input - { - optimize = false; - break; - } - } - if (!optimize) - continue; - - for (auto remove_reorder_node : r_nodes_to_remove) - { - auto& r_node = remove_reorder_node->as(); - - //mark as optimized - r_node.can_be_optimized(true); - r_node.requires_reinterpret(!ident.first); - if (ident.first) //no need of reshape - extract_and_remove(r_node); //try to remove if possible (with respect to r_node not being marked as output) - } - } -} - -/* - recalculate processing_order - algorithm based on: CLRS 24.5 (critical path in DAG) - modifications: adjust for multiple inputs - input: any topological order in processing order - output: BFS topological order. -*/ - -void program_impl::calculate_BFS_processing_order() { - std::map distances; - for (auto itr : processing_order) - { - distances[itr] = -1; - } - int max_distance = 0; - for (auto itr : processing_order) - { - //Init - if (distances[itr] == -1) { // this must be an input - distances[itr] = 0; // initialize input - } - // RELAX - for (auto& user : itr->get_users()) - { - distances[user] = std::max(distances[user], distances[itr] + 1); - max_distance = std::max(max_distance, distances[user]); - } - } - - //bucket sort nodes based on their max distance from input - std::vector> dist_lists; - dist_lists.resize(max_distance + 1); - for (auto itr : processing_order) - { - dist_lists[distances[itr]].push_back(itr); - } - - //replace the old processing order by the new one, still topological. 
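The helper above (its replacement now lives behind processing_order.calculate_BFS_processing_order()) is a longest-path levelization: each node's distance is the length of the longest path from any input, and a bucket sort by distance yields the "BFS topological order" the comment describes. A compact sketch over a plain adjacency list, assuming nodes are already indexed in some topological order:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // users[i] holds the indices of node i's users; the indexing is assumed
    // topological, so every edge goes from a lower to a higher index
    std::vector<std::vector<int>> bfs_levels(const std::vector<std::vector<int>>& users)
    {
        std::vector<int> dist(users.size(), 0);  // longest distance from any input
        int max_dist = 0;
        for (std::size_t i = 0; i < users.size(); ++i)  // relax edges in topological order
            for (int u : users[i])
            {
                dist[u] = std::max(dist[u], dist[i] + 1);
                max_dist = std::max(max_dist, dist[u]);
            }

        // bucket sort by distance: concatenating the buckets gives a
        // topological order in which whole levels stay adjacent
        std::vector<std::vector<int>> buckets(max_dist + 1);
        for (std::size_t i = 0; i < users.size(); ++i)
            buckets[dist[i]].push_back(static_cast<int>(i));
        return buckets;
    }

Two nodes in the same bucket can have no edge between them (an edge always increases the distance), which is why this ordering suits the out-of-order execution queue mentioned in pre_optimize_graph.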
- processing_order.clear(); - for (auto& dist : dist_lists) - { - for (auto& node : dist) - { - processing_order.push_back(node); - node->processing_itr = processing_order.end(); - node->processing_itr--; - } - } - update_processing_numbers(); - return; -} - -void program_impl::reorder_inputs(layout_optimizer& lo) -{ - //first pass to set layout optimization_attributes for topology - for (auto& p : nodes_map) - { - auto& prim = *p.second; - if (prim.type() == cldnn::convolution::type_id()) - { - if (prim.as().get_primitive()->split() > 1) - lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::splitted_convolution, 1); - } - - //list of layers that do not support yxfb or perform worse than bfyx - if (prim.type() == cldnn::detection_output::type_id() || prim.type() == cldnn::proposal::type_id() || - prim.type() == cldnn::roi_pooling::type_id() || prim.type() == cldnn::deconvolution::type_id() || - prim.type() == cldnn::upsampling::type_id() || prim.type() == cldnn::reorg_yolo::type_id()) - lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::bfyx_only_layer, 1); - } - - const auto reorder_input = [this, &lo](typed_program_node& conv_node) - { - auto conv_prim = conv_node.get_primitive(); - auto& input_node = conv_node.get_dependency(0); - auto&& weights_layout = conv_node.weights(0).get_output_layout(); - auto&& input_layout = input_node.get_output_layout(); - - std::shared_ptr new_input = nullptr; - - if (input_node.type() == reorder::type_id()) //convolution's input is a reorder - { - auto reorder_prim = input_node.as().typed_desc(); - auto& reorder_input = input_node.get_dependency(0); - auto reorder_layout = input_node.get_output_layout(); - reorder_layout.data_type = reorder_prim->output_data_type; - new_input = lo.get_reorder( - reorder_layout, - reorder_prim->id, - layout_optimizer::data_type::input, - conv_node, - weights_layout).first; - - auto reorder_removed = false; - if (new_input && new_input->output_format != format::winograd_2x3_s1_data && new_input->output_format != format::bf8_xy16 && new_input->output_format != format::byxf) //output format is not optimal - { - auto reorder_input_layout = reorder_input.get_output_layout(); - - auto opt_layout = layout(new_input->output_data_type, new_input->output_format, reorder_input_layout.size); - if (reorder_input_layout == opt_layout) //reorder 'breaks' optimal format - { - if (reorder_prim->subtract_per_feature.empty() && - reorder_prim->mean.empty() && - !reorder_prim->output_padding) //just plain reorder - { - conv_node.replace_dependency(0, reorder_input); - if (input_node.get_users().size() == 0 && !input_node.is_output()) - { - reorder_removed = extract_and_remove(input_node); - } - new_input = nullptr; - } - else //change reorder's output layout - { - reorder_prim->output_format = opt_layout.format; - reorder_prim->output_data_type = opt_layout.data_type; - new_input = nullptr; - } - } - else //current reorder gives bad output, simply change it - { - reorder_prim->output_format = opt_layout.format; - reorder_prim->output_data_type = opt_layout.data_type; - new_input = nullptr; - } - } - - if (!reorder_removed) - input_node.recalc_output_layout(); - else - conv_node.recalc_output_layout(); - } - else - { - new_input = lo.get_reorder( - input_node.get_output_layout(), - input_node.id(), - layout_optimizer::data_type::input, - conv_node, - weights_layout).first; - } - - if (new_input && new_input->output_format == format::winograd_2x3_s1_data) - { - auto lower_size = 
(conv_prim->input_offset.negate() + input_layout.size); - - tensor upper_input_padding = tensor{ 0 }; - upper_input_padding.spatial[0] = (2 - (lower_size.spatial[0] % 2)) % 2; //winograd conv requires input's x to be in form 4 + 2n, with restriction that x >= 3, we can shortage it to x % 2 == 0 - upper_input_padding.spatial[1] = (8 - ((lower_size.spatial[1] - 2) % 8)) % 8; //for y, y - 2 % 8 == 0 must hold - - apply_needed_padding(conv_node, input_node, padding{ conv_prim->input_offset.negate().sizes(), upper_input_padding.sizes() }); - - auto winograd_output = std::make_shared("_winograd_" + conv_node.id(), conv_node.id(), input_layout.format, - input_layout.data_type, std::vector{}, cldnn_reorder_mean_mode::mean_subtract, conv_node.output_layout.data_padding); - conv_node.output_layout.data_padding = padding{}; - auto& back_node = get_or_create(winograd_output); - back_node.processing_itr = processing_order.insert(std::next(conv_node.processing_itr), &back_node); - - auto bias_term = conv_node.bias_term(); - //create additional eltwise node after reorder to compute bias - if (bias_term) - { - auto& bias_node = conv_node.get_dependency(2); - std::vector inputs = { back_node.id(), bias_node.id() }; - auto winograd_output_biases = std::make_shared(back_node.id() + "_bias", inputs, - cldnn::eltwise_mode::sum, conv_prim->with_activation, conv_prim->activation_negative_slope, - back_node.output_layout.data_padding); - back_node.output_layout.data_padding = padding{}; - auto& back_bias_node = get_or_create(winograd_output_biases); - back_bias_node.processing_itr = processing_order.insert(std::next(back_node.processing_itr), &back_bias_node); - replace_all_usages(back_node, back_bias_node); - add_connection(back_node, back_bias_node); - add_connection(bias_node, back_bias_node); - conv_node.invalidate_users(); - replace_all_usages(conv_node, back_bias_node); - } - - if (conv_prim->with_activation) - { - conv_node.typed_desc()->with_activation = false; - if (!bias_term) - back_node.set_fused_activation(activation_relu_negative_slope, cldnn_activation_additional_params_t{ conv_prim->activation_negative_slope, 0.0f }); - } - - if (!bias_term) - { - conv_node.invalidate_users(); - replace_all_usages(conv_node, back_node); - } - add_connection(conv_node, back_node); - - auto& r_node = get_or_create(new_input); - r_node.as().set_input_offset(conv_prim->input_offset); - - if (!bias_term) - { - swap_names(conv_node, back_node); - if (conv_node.is_output()) - { - conv_node.set_output(false); - back_node.set_output(true); - for (auto& output : outputs) - { - if (output == &conv_node) - { - output = &back_node; - break; - } - } - } - } - else - { - conv_node.remove_dependency(2); - auto& back_bias_node = *nodes_map.find(back_node.id() + "_bias")->second; - swap_names(conv_node, back_bias_node); - if (conv_node.is_output()) - { - conv_node.set_output(false); - back_bias_node.set_output(true); - for (auto& output : outputs) - { - if (output == &conv_node) - { - output = &back_bias_node; - break; - } - } - } - } - } - - if (new_input && (new_input->output_format == format::bf8_xy16 || new_input->output_format == format::byxf)) - { - auto conv1x1_output = std::make_shared("_conv1x1_reorder_back_" + conv_node.id(), conv_node.id(), input_layout.format, input_layout.data_type); - auto& back_node = get_or_create(conv1x1_output); - back_node.processing_itr = processing_order.insert(std::next(conv_node.processing_itr), &back_node); - - conv_node.invalidate_users(); - replace_all_usages(conv_node, back_node); - 
add_connection(conv_node, back_node); - } - - if (new_input) - { - auto& r_node = get_or_create(new_input); - add_intermediate(r_node, conv_node, 0, r_node.dependencies.empty()); - conv_node.recalc_output_layout(); - } - }; - - const auto reorder_input_detection_output = [this, &lo](typed_program_node& detection_output_node) - { - auto detection_output_prim = detection_output_node.get_primitive(); - - for (size_t i = 0; i < detection_output_node.get_dependencies().size(); i++) - { - auto& input = detection_output_node.get_dependency(i); - std::shared_ptr new_input = lo.get_reorder( - input.get_output_layout(), - input.id(), - layout_optimizer::data_type::input, - detection_output_node, - layout{ data_types::f32, format::bfyx, tensor{} }).first; - - if (new_input) - { - add_intermediate(new_input, detection_output_node, i); - } - } - }; - - for (auto& prim : processing_order) - { - //there's an assumption that only convolution will take data/input_layout as input - //exception to that rule would be a convolution which takes a reorder as input - see reoder_input above - do_for_types(*prim, - reorder_input, //case for convolution - reorder_input_detection_output //case for detection-output - ); - } -} - -//function which prepares given primitive for weights optimization -template -void program_impl::optimize_bias(T& node, layout_optimizer& lo) -{ - layout output_layout = node.get_output_layout(); - - size_t weights_offset = node.get_primitive()->input.size(); - size_t bias_offset = weights_offset + wrap_if_single(node.get_primitive()->weights).size(); - for (size_t i = bias_offset; i < node.get_dependencies().size(); ++i) - { - //find weights primitive with given pimitive_id and add it to weights_optimizer - const program_node& bias = node.get_dependency(i); - const auto bias_type = layout_optimizer::data_type::bias; - auto reorder = lo.get_reorder( - bias.get_output_layout(), - bias.id(), - bias_type, - node, - output_layout); - - if (reorder.first) - this->add_intermediate(reorder.first, node, i, !reorder.second); - } -} -template void program_impl::optimize_bias(convolution_node& node, layout_optimizer& lo); -template void program_impl::optimize_bias(deconvolution_node& node, layout_optimizer& lo); -template void program_impl::optimize_bias(fully_connected_node& node, layout_optimizer& lo); -template void program_impl::optimize_bias(embed_node& node, layout_optimizer& lo); - -void program_impl::pre_optimize_bias(layout_optimizer& lo) -{ - for (auto& p : nodes_map) - { - auto& prim = *p.second; - if (prim.type() == convolution::type_id()) - { - if (!prim.as().weights_quantization_term()) - optimize_bias(prim.as(), lo); - } - else if (prim.type() == deconvolution::type_id()) - { - optimize_bias(prim.as(), lo); - } - else if (prim.type() == fully_connected::type_id()) - { - if (!prim.as().weights_quantization_term()) - optimize_bias(prim.as(), lo); - } - else if (prim.type() == embed::type_id()) - { - optimize_bias(prim.as(), lo); - } - } -} - -template -void program_impl::optimize_depthwise_sep_pre(T& node) -{ - //enable optimization only when IFM / split <= 8 (otherwise scheduling multiple opt kernels is better) and split >= 16 - if (!(node.get_dependency(0).get_output_layout().size.feature[0] / node.get_primitive()->split() <= 8) || - !(node.get_primitive()->split() >= 16)) - return; - - //make sure the weights and biases are data type and - //are not reused in other primitives as they will be overriden with concatenated ones - for (size_t i = 1; i < node.get_dependencies().size(); i++) - 
{ - auto& weights_or_biases = node.get_dependency(i); - if (weights_or_biases.get_users().size() > 1 || weights_or_biases.type() != data::type_id()) - return; - } - - node.set_depthwise_sep_opt(true); -} -template void program_impl::optimize_depthwise_sep_pre(convolution_node& node); -template void program_impl::optimize_depthwise_sep_pre(deconvolution_node& node); - -void program_impl::prepare_depthwise_sep_opt() -{ - //depthiwise separated convolution/deconvolution optimization - for (auto& p : nodes_map) - { - auto& prim = *p.second; - if (prim.type() == convolution::type_id()) - { - optimize_depthwise_sep_pre(prim.as()); - } - else if (prim.type() == deconvolution::type_id()) - { - optimize_depthwise_sep_pre(prim.as()); - } - } -} - -void program_impl::handle_reshape() -{ - //reshape primitive by definition does not change underlying data, only shape description - //however during graph initialization and data optimization the layouts can be changed without user's knowledge, - //when reshape is followed by reorder, it is likely that reorder's output will not be as expected (for example reshape with flattened shape) - //this pass resolved the issue by changing graph in the following way - //- in case reshape has multiple users with reshape->reorder sequence, it will be splitted to multiple reshape primitives with single user - //- in case of reshape->reorder sequence, the additional reorder before reshape will be added, - // if last reorder does not contain padding or mean subtract, it will be removed later in the graph - - for (const auto& node : processing_order) - { - if (node->is_type()) - { - auto& input_node = node->get_dependency(0); - - if (input_node.is_type()) - continue; - - //vector for storing nodes that are reorder type, for which splitted primitives are needed (except for the first one where orginal reshape will be used) - std::vector reorder_node_to_split; - - //find the users of reshape that are reorder type, if none present then skip the current node - for (const auto& user : node->get_users()) - { - if (user->is_type()) - reorder_node_to_split.push_back(user); - } - - if (!reorder_node_to_split.empty()) - { - auto& prim_node = node->as(); - const auto& prim = prim_node.get_primitive(); - auto output_shape = prim->output_shape; - - //vector for storing reshape nodes to connect to new reorder nodes (if needed) - std::vector reorder_reshape_nodes; - - bool skip_first_user = false; - auto reshape_users = node->get_users(); - for (const auto& user : reshape_users) - { - //reshape node for first user will be the orginal reshape from the graph - if (!skip_first_user) - { - if (std::find(reorder_node_to_split.begin(), reorder_node_to_split.end(), user) != reorder_node_to_split.end()) - reorder_reshape_nodes.push_back(node); - skip_first_user = true; - continue; - } - - //other reshapes will be clones of the orginal one connected to reshape->reorder sequences - if (std::find(reorder_node_to_split.begin(), reorder_node_to_split.end(), user) != reorder_node_to_split.end()) - { - auto new_reshape = std::make_shared("_reshape_split_" + user->id() + "_" + node->id(), input_node.id(), output_shape); - auto& new_reshape_node = get_or_create(new_reshape); - add_connection(input_node, new_reshape_node); - user->replace_dependency(0, new_reshape_node); - new_reshape_node.processing_itr = processing_order.insert(std::next(input_node.processing_itr), &new_reshape_node); - reorder_reshape_nodes.push_back(&new_reshape_node); - } - } - - //add new reorder nodes to proper reshape node - auto 
reshape_reorder_id = 0; - for (const auto& reorder_node : reorder_node_to_split) - { - auto& reorder_reshape_node = reorder_reshape_nodes[reshape_reorder_id]; - auto reshape_in_layout = reorder_node->get_output_layout(); - auto reshape_input = std::make_shared("_reshape_input_" + reorder_node->id() + "_" + reorder_reshape_node->id(), input_node.id(), reshape_in_layout.format, reshape_in_layout.data_type); - auto& reshape_input_node = get_or_create(reshape_input); - add_intermediate(reshape_input_node, *reorder_reshape_node, 0, reshape_input_node.dependencies.empty()); - reshape_reorder_id++; - } - } - - auto reshape_layout = node->get_output_layout(); - if (!(node->is_output()) && (reshape_layout.format != cldnn::format::bfyx)) - { - auto bfyx_layout = layout({ reshape_layout.data_type, cldnn::format::bfyx, reshape_layout.size }); - //when some primitive does an implicit reorder to some other format then we lose the info about pitches in reshape stage - //we assume user provides the input vector in bfyx - if (!are_layouts_identical(reshape_layout, bfyx_layout).second) - { - auto reshape_input = std::make_shared("_reshape_input_" + node->id(), input_node.id(), cldnn::format::bfyx, reshape_layout.data_type); - auto& reshape_input_node = get_or_create(reshape_input); - add_intermediate(reshape_input_node, *node, 0, reshape_input_node.dependencies.empty()); - - auto reshape_users = node->get_users(); - for (const auto& user : reshape_users) - { - size_t idx = 0; - for (size_t i = 0; i < user->get_dependencies().size(); i++) - { - auto& input = user->get_dependency(i); - if (input.id() == node->id()) { - idx = i; - break; - } - } - auto reshape_output = std::make_shared("_reshape_output_" + node->id(), user->id(), reshape_layout.format, reshape_layout.data_type); - auto& reshape_output_node = get_or_create(reshape_output); - add_intermediate(reshape_output_node, *user, idx, reshape_output_node.dependencies.empty()); - } - } - } - } - } -} - -//function which prepares given primitive for weights optimization -template -void program_impl::optimize_weights(T& node, layout_optimizer& lo) -{ - auto weights_offset = node.get_primitive()->input.size(); - auto bias_offset = weights_offset + wrap_if_single(node.get_primitive()->weights).size(); - for (auto i = weights_offset; i < bias_offset; i++) - { - auto& weights = node.get_dependency(i); - auto* impl = node.get_selected_impl().get(); - auto output_layout = node.get_output_layout(); - auto& weights_node = node.get_dependency(1); - auto weights_layout = weights_node.get_output_layout(); - const auto weights_type = layout_optimizer::data_type::weights; - - auto reorders = lo.get_generic_layer( - impl->_weights_reorder_params, - weights.id(), - weights_layout, - weights_type); - - for (auto& reorder : reorders) - { - //insert new generic_layer node to topology - this->add_intermediate(reorder.first, node, i, !reorder.second); - //set generic_layer's node output layout and implementation - auto& g_node = node.get_dependency(i); - g_node.get_output_layout(false); - g_node.selected_impl = g_node.type()->choose_impl(*engine, g_node); - } - //set the old output layout and do not invalidate users as change of weights will not affect output layout - node.set_output_layout(output_layout, false); - } -} -template void program_impl::optimize_weights(convolution_node& node, layout_optimizer& lo); -template void program_impl::optimize_weights(deconvolution_node& node, layout_optimizer& lo); -template void program_impl::optimize_weights(fully_connected_node& node, 
layout_optimizer& lo); - -void program_impl::post_optimize_weights(layout_optimizer& lo) -{ - for (auto& p : nodes_map) - { - auto& prim = *p.second; - if (prim.type() == convolution::type_id()) - { - optimize_weights(prim.as(), lo); - } - else if (prim.type() == deconvolution::type_id()) - { - optimize_weights(prim.as(), lo); - } - else if (prim.type() == fully_connected::type_id()) - { - optimize_weights(prim.as(), lo); - } - //else if (prim.type() == lstm_gemm::type_id())//TODO: Enable postoptimize weights for lstm - //{ - // prep_opt(prim.as()); //we should take care of weights and reccurent - //} - } -} - -template -void program_impl::optimize_depthwise_sep_post(T& node) -{ - if (!node.get_depthwise_sep_opt()) - return; - - const auto& split = node.get_primitive()->split(); - - auto dependency_offset = node.get_primitive()->input.size(); - //concatenate weights - { - //if weights were optimized it is needed to use the sizes after optimization - auto target_layout = get_weights_layout(node.get_dependency(dependency_offset), split); - merge_buffers(engine, node, target_layout, dependency_offset, dependency_offset + split); - dependency_offset++; - } - - //concatenate biases - if (node.get_primitive()->bias.size() != 0) - { - const auto& bias_layout = node.get_dependency(dependency_offset).get_output_layout(); - auto target_layout = layout(bias_layout.data_type, cldnn::format::bfyx, { 1, 1, bias_layout.size.spatial[0] * split, 1 }); - merge_buffers(engine, node, target_layout, dependency_offset, dependency_offset + split); - dependency_offset++; - } - - if (node.template is_type()) - { - auto& prim_node = node.template as(); - const auto& prim = prim_node.get_primitive(); - - // concatenate weights quantization factors - if (prim->weights_quantization_factors.size() != 0) - { - const auto& weights_quantization_layout = node.get_dependency(dependency_offset).get_output_layout(); - auto target_layout = layout(weights_quantization_layout.data_type, cldnn::format::bfyx, { 1, 1, weights_quantization_layout.size.batch[0] * split, 1 }); - merge_buffers(engine, node, target_layout, dependency_offset, dependency_offset + split); - dependency_offset++; - } - // concatenate output callibration factors - if (prim->output_calibration_factors.size() != 0) - { - const auto& output_callibration_layout = node.get_dependency(dependency_offset).get_output_layout(); - auto target_layout = layout(output_callibration_layout.data_type, cldnn::format::bfyx, { 1, 1, output_callibration_layout.size.batch[0] * split, 1 }); - merge_buffers(engine, node, target_layout, dependency_offset, dependency_offset + split); - dependency_offset++; - } - } - - if (node.get_primitive()) - //override node split, as only one kernel will be executed - node.set_split(1); -} -template void program_impl::optimize_depthwise_sep_post(convolution_node& node); -template void program_impl::optimize_depthwise_sep_post(deconvolution_node& node); - -void program_impl::prep_opt_depthwise_sep_post() -{ - //depthiwise separated convolution/deconvolution optimization - for (auto& p : nodes_map) - { - auto& prim = *p.second; - if (prim.type() == convolution::type_id()) - { - optimize_depthwise_sep_post(prim.as()); - } - else if (prim.type() == deconvolution::type_id()) - { - optimize_depthwise_sep_post(prim.as()); - } - } -} - -void program_impl::apply_needed_padding(program_node& node, program_node& prev_node, - const padding& needed_padding) -{ - auto target_layout = prev_node.get_output_layout(); - - // Short circuit if padding did not change. 
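optimize_depthwise_sep_post above collapses the per-split weight and bias buffers into single concatenated buffers and then calls set_split(1), so one kernel launch covers all groups. A toy version of the buffer merge, with plain float vectors standing in for the cldnn memory objects that merge_buffers actually handles:

    #include <cstddef>
    #include <vector>

    // concatenate per-split buffers into one contiguous buffer; afterwards
    // the node can be executed as a single kernel instead of `split` kernels
    std::vector<float> merge_split_buffers(const std::vector<std::vector<float>>& per_split)
    {
        std::size_t total = 0;
        for (const auto& buf : per_split)
            total += buf.size();

        std::vector<float> merged;
        merged.reserve(total);
        for (const auto& buf : per_split)
            merged.insert(merged.end(), buf.begin(), buf.end());
        return merged;
    }

This is also why the pre-pass only marks nodes whose weights are data primitives with a single user: the merge overwrites those buffers with the concatenated contents, and sharing them with another primitive would corrupt it.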
- if (target_layout.data_padding == needed_padding) - return; - - // Special handling for input nodes. - if (prev_node.is_type() || prev_node.is_type()) - { - target_layout.data_padding = needed_padding; - - auto r_prim = std::make_shared("reorder_input_" + node.id(), prev_node.id(), target_layout); - add_intermediate(r_prim, node, 0); - return; - } - - prev_node.merge_output_padding(needed_padding); -} - -void program_impl::prepare_padding() -{ - if (output_size_handling_enabled) - { - // Prepare upper padding for primitives that support output_size parameter. - for (const auto& node : processing_order) - { - if (node->is_type()) - { - auto& prim_node = node->as(); - const auto& prim = prim_node.get_primitive(); - - if (!prim->with_output_size) - continue; - - auto filter_size = prim_node.weights(0).get_output_layout().size; - - auto needed_padding = calc_sliding_window_needed_input_padding( - prim_node.input().get_output_layout(), - prim->output_size, filter_size, prim->input_offset, prim->stride, prim->dilation, false, 1); - apply_needed_padding(prim_node, prim_node.input(), needed_padding); - } - else if (node->is_type()) - { - auto& prim_node = node->as(); - const auto& prim = prim_node.get_primitive(); - - if (!prim->with_output_size) - continue; - - auto filter_size = prim_node.weights(0).get_output_layout().size; - - auto needed_padding = calc_sliding_window_needed_input_padding( - prim_node.input().get_output_layout(), - prim->output_size, filter_size, prim->input_offset, prim->stride, { 1, 1, 1, 1 }, true, 1); - - apply_needed_padding(prim_node, prim_node.input(), needed_padding); - } - else if (node->is_type()) - { - auto& prim_node = node->as(); - const auto& prim = prim_node.get_primitive(); - - if (!prim->with_output_size) - continue; - - // NOTE: Currently there is no pooling implementation/pooling mode which does not check input data range. - // There is no need to add padding requirements on pooling inputs. - //auto needed_padding = calc_sliding_window_needed_input_padding( - // prim_node.input().get_output_layout(), - // prim->output_size, prim->size, prim->input_offset, prim->stride, {1, 1, 1, 1}, false, 1); - auto needed_padding = prim_node.input().get_output_layout().data_padding; - - apply_needed_padding(prim_node, prim_node.input(), needed_padding); - } - } - } - - // Prepare optimized padding for bfyx convolution. - for (auto& pair : nodes_map) - { - if (pair.second->type() != convolution::type_id()) - continue; - - auto& node = pair.second->as(); - if (node.get_dependencies().empty()) - continue; - - auto conv = node.get_primitive(); - auto& conv_input_node = node.get_dependency(0); - auto conv_layout = node.get_output_layout(); - - // right now output padding optimization is only available for bfyx format and data type = float32 - if (conv_layout.format != cldnn::format::bfyx - && conv_layout.format != cldnn::format::bf8_xy16 - && conv_layout.format != cldnn::format::byxf_af32 - && conv_layout.format != cldnn::format::fs_bs_yx_bsv4_fsv32) - { - continue; - } - - // Calculating input padding needed for convolution - auto& filter_node = node.as().weights(0); - auto filter_prim = filter_node.get_primitive(); - - layout filter_layout = filter_node.get_output_layout(); - - // convolution have only one input primitive - auto prev_prim_output_layout = conv_input_node.get_output_layout(); - - // Compute initial required paddings for primitive used as input for convolution. 
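The arithmetic that follows works out how far the convolution window reaches into its input: along x the window touches samples up to input_offset + (output_x - 1) * stride + (filter_x - 1) * dilation, and whatever falls outside the real input extent must be supplied as padding. An x-axis-only restatement with a worked example (hypothetical helper, not part of the patch):

    #include <algorithm>

    struct axis_padding { int left; int right; };

    // mirrors the input_limit_x / left_padding / right_padding computation below
    axis_padding conv_input_padding_x(int input_x, int output_x, int input_offset,
                                      int stride, int dilation, int filter_x)
    {
        // one past the last input column the sliding window touches
        const int input_limit = input_offset + (output_x - 1) * stride
                              + (filter_x - 1) * dilation + 1;
        const int left  = std::max(-input_offset, 0);
        const int right = std::max(input_limit - input_x, 0);
        return { left, right };
    }

    // example: 5-wide input and output, 3-wide filter, stride 1, dilation 1,
    // input_offset -1  ->  input_limit = 6, so { left = 1, right = 1 }

The per-axis results are then merged with the producer's existing padding via padding::max, so previously applied requirements are never shrunk.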
- auto input_offset = conv->input_offset; - auto stride = conv->stride; - auto dilation = conv->dilation; - - auto input_limit_x = input_offset.spatial[0] + (conv_layout.size.spatial[0] - 1) * stride.spatial[0] + (filter_layout.size.spatial[0] - 1) * dilation.spatial[0] + 1; - auto input_limit_y = input_offset.spatial[1] + (conv_layout.size.spatial[1] - 1) * stride.spatial[1] + (filter_layout.size.spatial[1] - 1) * dilation.spatial[1] + 1; - - auto left_padding = std::max(-input_offset.spatial[0], 0); - auto top_padding = std::max(-input_offset.spatial[1], 0); - auto right_padding = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0); - auto bottom_padding = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0); - - // Adjust right padding, so entire buffer size in X dimension is properly aligned. - // TODO: NOTE: Will be reenabled with next check-in once heuristic for line-aligned algorithm will be added. - //auto needed_buffer_size_x = static_cast( - // round_up_to(left_padding + prev_prim_output_layout.size.spatial[0] + right_padding, 16)); - //right_padding = needed_buffer_size_x - left_padding - prev_prim_output_layout.size.spatial[0]; - - cldnn::padding needed_padding({ 0, 0, left_padding, top_padding }, { 0, 0, right_padding, bottom_padding }, 0); - needed_padding = padding::max(prev_prim_output_layout.data_padding, needed_padding); - - apply_needed_padding(node, conv_input_node, needed_padding); - } -} - -void program_impl::propagate_constants() -{ - constants_propagator prop(this); - - for (auto& node : processing_order) - prop.visit_node(*node); - - auto&& to_replace = prop.calculate(); - - //remove all nodes which are no longer relevant, i.e. nodes which: - // 1. are constants, and - // 2. do not have non-const user (so their data are not used during inference), and - // 3. are not marked as outputs. 
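For reference, the padding arithmetic above can be checked in isolation: the index one past the last input element touched by the sliding window is input_offset + (output_size - 1) * stride + (filter_size - 1) * dilation + 1, and whatever falls outside the buffer becomes padding. A self-contained sketch for a single spatial axis (needed_right_padding is an illustrative name, not clDNN API):

#include <algorithm>
#include <cstdio>

// One axis of the sliding-window padding computation used above.
int needed_right_padding(int input_size, int output_size, int input_offset,
                         int stride, int dilation, int filter_size)
{
    // One past the last input element the window touches for the last output.
    int input_limit = input_offset + (output_size - 1) * stride
                    + (filter_size - 1) * dilation + 1;
    return std::max(input_limit - input_size, 0);
}

int main()
{
    // 224-wide input, 3-wide filter, stride 1, dilation 1, offset -1:
    // limit = -1 + 223 + 2 + 1 = 225, so one column of right padding is needed
    // (and left padding is max(-input_offset, 0) = 1).
    std::printf("%d\n", needed_right_padding(224, 224, -1, 1, 1, 3));
    return 0;
}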
- // in case if node has either non-const user or is marked as output, it should be replace with cldnn::data rather than removed (see next loop) - auto proc_itr = processing_order.begin(); - while (proc_itr != processing_order.end()) - { - auto& node = (*proc_itr++); - if (!node->is_constant()) - continue; - if (node->has_non_const_user() || (node->is_output() && !node->is_type())) - continue; - - auto& users = node->users; - auto& deps = node->dependencies; - - for (size_t idx = 0; idx < deps.size(); idx++) - { - deps.at(idx)->users.remove(node); - } - deps.clear(); - - for (auto& usr : users) { - auto& usr_deps = usr->dependencies; - usr_deps.erase(std::remove(usr_deps.begin(), usr_deps.end(), node), usr_deps.end()); - } - users.clear(); - - if (!node->is_output()) - { - auto rem = remove_if_dangling(*node); - assert(rem && "Non-output constant node which has only constant users should have been removed during constants propagation pass"); - (void)rem; - } - } - - //replace all constant nodes which are relevant for inference (either used by non-const user or marked as output) with recomputed cldnn::data - for (auto& cout : to_replace) - { - auto& id_to_replace = cout.first; - - //TODO: do not use API primitives internally and get rid of this last 'cldnn::memory' internal usage - memory api_memory = details::memory_c_to_cpp_converter::convert(api_cast(cout.second.get())); - //c-cpp converter does not retain since normally it is done inside API-impl layer (cldnn.cpp) so we need to do it manually - cout.second->add_ref(); - - auto const_data = std::make_shared("_cldnn_const_prop_" + id_to_replace, api_memory /* <<< REMOVE ME WHEN POSSIBLE */); - auto& new_node = get_or_create(const_data); - auto& curr_node = *nodes_map.at(id_to_replace); - - if (!curr_node.is_type()) - { - auto curr_node_deps = curr_node.get_dependencies(); - for (auto& dep : curr_node_deps) - { - auto dep_users = dep->get_users(); - for (auto& dep_user : dep_users) - { - if (dep_user == &curr_node) - remove_connection(*dep, curr_node); - } - } - } + if (!get_engine().configuration().enable_memory_pool) + return; - curr_node.dependencies.clear(); - //remove all constant users (as they will be either removed or replaced by cldnn::data which does not have any dependencies) - curr_node.users.erase( - std::remove_if(curr_node.users.begin(), curr_node.users.end(), [](program_node* node) { return node->is_constant(); }), - curr_node.users.end() - ); - replace(curr_node, new_node, false, false); - } + basic_memory_dependencies(); + skipped_branch_memory_dependencies(); + oooq_memory_dependencies(); } -void program_impl::prepare_buffer_fusing() +std::string program_impl::get_memory_dependencies_string() const { - bool is_debug = options.get()->enabled(); + std::string mem_dep = "Memory dependencies/restrictions:\n"; auto itr = processing_order.begin(); while (itr != processing_order.end()) { - auto& node = (*itr++); + auto& node = *itr; + itr++; + mem_dep = mem_dep.append("primitive: ").append(node->id()).append(" restricted list: "); + for (auto it : node->get_memory_dependencies()) + mem_dep = mem_dep.append(it).append(", "); + mem_dep = mem_dep.append("\n"); + } + return mem_dep; +} - // TODO: Move fused activation to previous layer when possible - if (node->fused_activation.activation_func != cldnn_activation_func_t::activation_none) - continue; +void program_impl::handle_reshape() +{ + //reshape primitive by definition does not change underlying data, only shape description + //however during graph initialization and data
optimization the layouts can be changed without user's knowledge, + //when reshape is followed by reorder, it is likely that reorder's output will not be as expected (for example reshape with flattened shape) + //this pass resolves the issue by changing graph in the following way + //- in case reshape has multiple users with reshape->reorder sequence, it will be split into multiple reshape primitives with single user + //- in case of reshape->reorder sequence, the additional reorder before reshape will be added, + // if last reorder does not contain padding or mean subtract, it will be removed later in the graph - do_for_types(*node, [this, is_debug](concatenation_node& node) + for (const auto& node : processing_order) + { + if (node->is_type()) { - // buffer fusing should not be performed if one of inputs produces padded output since - // it could break desired memory alignment. On the other hand, if this node uses all inputs - // exclusively (see check above) they should not have output padding set since concatenation - // does not ask for any. - if (node.has_padded_dependency()) - return; + auto& input_node = node->get_dependency(0); - auto concat_axis = node.get_primitive()->axis; - auto padd = node.get_output_layout().data_padding; + if (input_node.is_type()) + continue; + + node->get_output_layout(); + if (node->as().is_in_place()) + node->optimized = true; - tensor lower_padd = padd.lower_size(); - tensor upper_padd = padd.upper_size(); + //vector for storing nodes that are reorder type, for which split primitives are needed (except for the first one where original reshape will be used) + std::vector reorder_node_to_split; - auto upper_padd_val = node.get_output_layout().get_buffer_size().raw[concat_axis] - lower_padd.raw[concat_axis]; - tensor lower_padd_offset = lower_padd; + //find the users of reshape that are reorder type, if none present then skip the current node + for (const auto& user : node->get_users()) + { + if (user->is_type()) + reorder_node_to_split.push_back(user); + } - std::list, tensor>> stack = { std::make_pair(node.get_dependencies(), tensor{ 0, 0, 0, 0 }) }; - while (!stack.empty()) + if (!reorder_node_to_split.empty()) { - auto nodes_list = stack.front(); - stack.pop_front(); + auto& prim_node = node->as(); + const auto& prim = prim_node.get_primitive(); + auto output_shape = prim->output_shape; - auto cascade_adjustment = nodes_list.second; - upper_padd.raw[concat_axis] = upper_padd_val; - lower_padd = lower_padd_offset; + //vector for storing reshape nodes to connect to new reorder nodes (if needed) + std::vector reorder_reshape_nodes; - //check if concatenation in place can be applied for inputs set - for (auto input : nodes_list.first) + bool skip_first_user = false; + auto reshape_users = node->get_users(); + for (const auto& user : reshape_users) { - //if any of this node's inputs is used by more than one primitive and is not optimized concatenation then do not fuse buffers, - //also, if an input is marked as network output, prevent optimizations which would affect a form of its output (unless debug flag is set) - // todo: in future, if this case is problem, it can be optimized further to enable buffer fusing - // per single input rather than all/none - // + restrict input types to pooling, convolution and activation only due to problems with output padding on b and f - if ((!input->is_type() && !input->is_type() && !input->is_type() && !input->is_type() && !input->is_type() && !input->is_type()) || - (input->is_output() && !is_debug) || -
input->get_users().size() > 2) - return; - - if (input->get_users().size() > 1) + //reshape node for first user will be the orginal reshape from the graph + if (!skip_first_user) { - auto user_count = input->get_users().size(); - for (auto& user : input->get_users()) - if (user->is_type()) - user_count--; - if (user_count > 1) - return; + if (std::find(reorder_node_to_split.begin(), reorder_node_to_split.end(), user) != reorder_node_to_split.end()) + reorder_reshape_nodes.push_back(node); + skip_first_user = true; + continue; } - //check only for spatial paddings. Accept feature and batch - if (input->get_output_layout().data_padding.lower_size().spatial[0] != 0 || - input->get_output_layout().data_padding.upper_size().spatial[0] != 0 || - input->get_output_layout().data_padding.lower_size().spatial[1] != 0 || - input->get_output_layout().data_padding.upper_size().spatial[1] != 0) - return; - } - - //apply concatenation in place optimization - for (auto input : nodes_list.first) - { - auto input_lenght = input->get_output_layout().size.raw[concat_axis]; - - // shrink upper pad so it points at the end of the input's buffer - // - // |--- lower padd ---| |---------- upper padd -----------| - // |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --| - upper_padd.raw[concat_axis] -= input_lenght; - - //adjust padding sizes for cascade concatenations - auto lower_padd_tmp = lower_padd; - lower_padd_tmp.raw[concat_axis] += cascade_adjustment.raw[concat_axis]; - auto upper_padd_tmp = upper_padd; - upper_padd_tmp.raw[concat_axis] -= cascade_adjustment.raw[concat_axis]; - - // set new padding for input - input->set_output_padding(padding(lower_padd_tmp.sizes(), upper_padd_tmp.sizes())); - - // move lower padd further - // - // |-------------- lower padd -------------|---------- upper padd -----------| - // |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --| - - lower_padd.raw[concat_axis] += input_lenght; - - if (input->type() == concatenation::type_id() && input->can_be_optimized()) + //other reshapes will be clones of the orginal one connected to reshape->reorder sequences + if (std::find(reorder_node_to_split.begin(), reorder_node_to_split.end(), user) != reorder_node_to_split.end()) { - if (input->as().get_primitive()->axis != node.get_primitive()->axis) - return; - - if (!input->get_dependencies().empty()) - stack.push_back(std::make_pair(input->get_dependencies(), input->get_output_layout().data_padding.lower_size())); + auto new_reshape = std::make_shared("_reshape_split_" + user->id() + "_" + node->id(), input_node.id(), output_shape); + auto& new_reshape_node = get_or_create(new_reshape); + add_connection(input_node, new_reshape_node); + user->replace_dependency(0, new_reshape_node); + processing_order.insert_next(&input_node, &new_reshape_node); + reorder_reshape_nodes.push_back(&new_reshape_node); } } - } - - node.can_be_optimized(true); - }); - - // zero copy - do_for_types(*node, [this, is_debug](crop_node& node) - { - //if the node is marked as network output, prevent optimizations which would affect a form of its output, unless debug flag is set - if (node.is_output() && !is_debug) - return; - - //do not optimize when next node is concatenation which is not output - if (node.get_users().size() == 1 && node.get_users().front()->is_type() && !node.get_users().front()->is_output()) - return; - if (node.get_dependencies().size() == 1 && - node.get_users().size() > 0) - { - // optimization is avaiable for croping across depth(features) only - 
// if output padding has defined padding accross featuers already it wouldn't - // work because it expect to have zeros in the padded area. - auto format = node.get_output_layout().format; - auto crop_prim = node.get_primitive(); - auto input_layout = node.get_dependency(0).get_output_layout(); - auto out_padd = node.get_output_layout().data_padding; - if (format == format::bfyx && - crop_prim->reference_input.batch[0] == input_layout.size.batch[0] && - crop_prim->reference_input.spatial[0] == input_layout.size.spatial[0] && - crop_prim->reference_input.spatial[1] == input_layout.size.spatial[1] && - out_padd.lower_size().feature[0] == 0 && - out_padd.upper_size().feature[0] == 0 && - out_padd.lower_size().batch[0] == 0 && - out_padd.upper_size().batch[0] == 0 && - out_padd.lower_size().spatial[0] == 0 && - out_padd.lower_size().spatial[1] == 0 && - out_padd.upper_size().spatial[0] == 0 && - out_padd.upper_size().spatial[1] == 0) + //add new reorder nodes to proper reshape node + auto reshape_reorder_id = 0; + for (const auto& reorder_node : reorder_node_to_split) { - // Regular crop - // crop input buffer - // |___________data____________| - // - // crop output buffer - // |-------->| offsets[f] |<--| - // |_____data____| - // <------------> - // reference size - // - // Inplace crop - // crop output buffer - // |_low_pad_|__data_size__|___|<-upper pad - - node.set_output_padding(padding( - { out_padd.lower_size().batch[0], crop_prim->offsets.feature[0], out_padd.lower_size().spatial[0], out_padd.lower_size().spatial[1] }, - { out_padd.upper_size().batch[0], input_layout.size.feature[0] - crop_prim->offsets.feature[0] - crop_prim->reference_input.feature[0], - out_padd.upper_size().spatial[0], out_padd.upper_size().spatial[1] })); - node.can_be_optimized(true); + /* + auto new_reshape = std::make_shared("_reshape_split_" + user->id() + "_" + node->id(), input_node.id(), output_shape); + auto& new_reshape_node = get_or_create(new_reshape); + add_connection(input_node, new_reshape_node); + user->replace_dependency(0, new_reshape_node); + processing_order.insert(std::next(processing_order.get_processing_iterator(input_node)), &new_reshape_node); + reorder_reshape_nodes.push_back(&new_reshape_node); + */ + auto& reorder_reshape_node = reorder_reshape_nodes[reshape_reorder_id]; + auto reshape_in_layout = reorder_node->get_output_layout(); + auto reshape_input = std::make_shared("_reshape_input_" + reorder_node->id() + "_" + reorder_reshape_node->id(), input_node.id(), + reshape_in_layout.format, reshape_in_layout.data_type); + auto& reshape_input_node = get_or_create(reshape_input); + add_intermediate(reshape_input_node, *reorder_reshape_node, 0, reshape_input_node.dependencies.empty()); + reshape_reorder_id++; } } - }); - do_for_types(*node, [this](reshape_node& node) - { - node.get_output_layout(); - if (node.is_in_place() && node.get_fused_activation_func() == activation_none) - node.can_be_optimized(true); - }); - do_for_types(*node, [this](reorder_node& node) - { - auto& input = node.input(); - auto output_layout = node.get_output_layout(); - //This is WA for topologies that due to additional reorders added perform worse with conv1x1 optimization - auto remove_bf8_xy_opt = ((input.is_type() || input.is_type()) && - output_layout.format == format::bf8_xy16 && input.get_users().size() == 1); - //Remove reorder from convolution 1x1 to bfyx in some conditions - auto remove_byxf_opt = (input.is_type() && - input.get_users().size() == 1 && - input.get_output_layout().format == format::byxf); - 
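The in-place crop case above never moves data; it only re-describes the producer's buffer through lower/upper padding on the feature axis, exactly as the diagram shows. A small sketch of the same arithmetic (hypothetical names, plain ints in place of cldnn::tensor):

#include <cassert>

struct crop_padding { int lower_f; int upper_f; };

// For a bfyx crop that narrows only the feature axis, the crop can stay in
// place: the consumer simply reads a padded view of the producer's buffer.
crop_padding inplace_feature_crop(int input_features, int offset_f, int crop_features)
{
    assert(offset_f >= 0 && offset_f + crop_features <= input_features);
    return { offset_f, input_features - offset_f - crop_features };
}
// e.g. cropping features [32, 96) out of 128 yields lower = 32, upper = 32.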
//check if all inputs user have the same format - auto all_users_same_format = true; - auto input_user_layout_format = input.get_users().front()->get_output_layout().format; - for (auto const& user : input.get_users()) + auto reshape_layout = node->get_output_layout(); + if (!(node->is_output()) && (reshape_layout.format != cldnn::format::bfyx)) { - if (user->get_output_layout().format != input_user_layout_format) + auto bfyx_layout = layout({ reshape_layout.data_type, cldnn::format::bfyx, reshape_layout.size }); + //when some primitive does an implicit reorder to some other format then we lose the info about pitches in reshape stage + //we assume user provides the input vector in bfyx + if (!program_helpers::are_layouts_identical(reshape_layout, bfyx_layout).second) { - all_users_same_format = false; - break; - } - } - auto same_data_type = input.get_output_layout().data_type == output_layout.data_type; - //Optimization only available in case of layers that support different input and output formats. - //todo: new api needs to be created to read such caps - if (!(input.is_type() && (output_layout.format == format::bfyx || output_layout.format == format::yxfb || output_layout.format == format::byxf) && all_users_same_format && same_data_type) && - !remove_bf8_xy_opt && - !(input.is_type() && input.get_output_layout().format == format::bf8_xy16) && - !(input.is_type() && (output_layout.format == format::bfyx || output_layout.format == format::yxfb || output_layout.format == format::byxf) && all_users_same_format && same_data_type) && - !(remove_byxf_opt && (node.get_users().front()->is_type() || node.get_users().front()->is_type()))) - return; - - if (remove_bf8_xy_opt) - { - auto users_user_layout = node.get_users().front()->get_users().front()->get_output_layout(); - // if users_user_layout is still bf8_yx16 (stacked convolutions) then leave the reorder - if (users_user_layout.format == format::bf8_xy16) - return; - auto input_layout = input.get_output_layout(); - auto target_layout = layout(input_layout.data_type, users_user_layout.format, input_layout.size, input_layout.data_padding); - input.set_output_layout(target_layout, false); - } - else if (remove_byxf_opt) - { - auto user = node.get_users().front(); - auto users_users = node.get_users().front()->get_users(); + auto reshape_input = std::make_shared("_reshape_input_" + node->id(), input_node.id(), cldnn::format::bfyx, reshape_layout.data_type); + auto& reshape_input_node = get_or_create(reshape_input); + add_intermediate(reshape_input_node, *node, 0, reshape_input_node.dependencies.empty()); - for (auto const& users_user : users_users) - { - if (users_user->get_output_layout().format != format::byxf && !users_user->is_type()) + auto reshape_users = node->get_users(); + for (const auto& user : reshape_users) { - remove_byxf_opt = false; - break; + size_t idx = 0; + for (size_t i = 0; i < user->get_dependencies().size(); i++) + { + auto& input = user->get_dependency(i); + if (input.id() == node->id()) { + idx = i; + break; + } + } + auto reshape_output = std::make_shared("_reshape_output_" + node->id(), user->id(), reshape_layout.format, reshape_layout.data_type); + auto& reshape_output_node = get_or_create(reshape_output); + add_intermediate(reshape_output_node, *user, idx, reshape_output_node.dependencies.empty()); } } - - if (remove_byxf_opt) - { - auto input_layout = input.get_output_layout(); - user->set_output_layout(input_layout, false); - } } - else - input.set_output_layout(output_layout, false); - - 
node.can_be_optimized(true); - extract_and_remove(node); //try to remove redundant reorders - }); + } } } -void program_impl::fuse_skip_layers(program_node* node) +void program_impl::apply_needed_padding(program_node& node, program_node& prev_node, + const padding& needed_padding) { - do_for_types(*node, [this](eltwise_node& node) - { - bool skippable = false; - int index = 0; - if (node.get_primitive()->mode != eltwise_mode::sum || node.inputs_count() != 2) - return; - - if (node.input(0).is_type()) - { - skippable = true; - } - else if (node.input(1).is_type()) - { - skippable = true; - index = 1; - } - - if (!skippable) - return; - - auto& to_fuse_with = node.input(index).as(); - int to_fuse_index = index == 0 ? 1 : 0; - - // check that node doesn't have fused eltwise already - if (to_fuse_with.has_fused_sum()) - return; - - //remove dependencies and users of elwtise that is going to be extracted - add_connection(node.input(to_fuse_index), to_fuse_with); - remove_connection(node.input(to_fuse_index), node); - - //replace processing_num of the node where fusing take place and eltwise - auto new_processing_num = node.processing_num; - processing_order.erase(to_fuse_with.processing_itr); - to_fuse_with.processing_itr = processing_order.insert(node.processing_itr, &to_fuse_with); - to_fuse_with.processing_num = new_processing_num; - - //make sure that new fused node's users have higher processing_num than fused node - for (auto user : to_fuse_with.get_users()) - { - if (user->processing_num < new_processing_num) - { - processing_order.erase(user->processing_itr); - user->processing_itr = processing_order.insert(std::next(to_fuse_with.processing_itr), user); - user->processing_num = new_processing_num + 1; - } - } - - if (node.get_fused_activation_func() != activation_none) - to_fuse_with.set_fused_activation(node.get_fused_activation_func(), node.get_fused_activation_params()); - to_fuse_with.set_output_padding(node.get_output_layout().data_padding); - - extract_and_remove(node); - }); -} + auto target_layout = prev_node.get_output_layout(); -void program_impl::prepare_primitive_fusing() -{ - bool is_debug = options.get()->enabled(); + // Short circuit if padding did not change. + if (target_layout.data_padding == needed_padding) + return; - auto itr = processing_order.begin(); //note we need to use iterators since currently processed element can be removed - while (itr != processing_order.end()) + // Special handling for input nodes. 
+ if (prev_node.is_type() || prev_node.is_type()) { - auto node_itr = itr++; - auto& node = (*node_itr); - - do_for_types(*node, [this, is_debug](activation_node& node) - { - - auto& input = node.input(); - - //Restrictions: - // - inputs cannot be padded - // - primitives input cannot be output - // - no activation additional input - // - input was optimized - if (node.has_padded_dependency() || (input.is_output() && !is_debug) || node.is_output() || - node.get_dependencies().size() != 1 || input.can_be_optimized()) - return; - - // - check if there is no activation fused already - // - limit to primitives which implementations support activation fusing - if (input.get_users().size() != 1 || input.get_fused_activation_func() != activation_none || - //TODO: new api needs to be created to read such caps - //right now use whitelist so no new primitives will be affected in case of lack of fused activation support - (!input.is_type() && !input.is_type() && !input.is_type() && - !input.is_type() && !input.is_type() && !input.is_type() && - !input.is_type() && !input.is_type() && !input.is_type() && - !input.is_type() && !input.is_type() && !input.is_type() && - !input.is_type() && !input.is_type() && - !input.is_type() && !input.is_type() && !input.is_type())) - return; - - input.set_fused_activation(node.get_primitive()->activation_func, node.get_primitive()->additional_params); - input.set_output_padding(node.get_output_layout().data_padding); + target_layout.data_padding = needed_padding; - extract_and_remove(node); - }); + auto r_prim = std::make_shared("reorder_input_" + node.id(), prev_node.id(), target_layout); + add_intermediate(r_prim, node, 0); + return; } - //Second loop tries fusing several reorders one by one (if present) into one reorder - itr = processing_order.begin(); - while (itr != processing_order.end()) - { - auto node_itr = itr++; - auto& node = (*node_itr); - - do_for_types(*node, [this, is_debug](reorder_node& node) - { - auto& input = node.input(); - - //Restrictions: - // - inputs cannot be padded - // - primitives input cannot be output - // - input was optimized - if (node.has_padded_dependency() || (input.is_output() && !is_debug) || node.get_dependencies().size() != 1 || - input.can_be_optimized()) - return; - - // - check if previous node is reorder with 1 user - // - do not fuse if current node has mean subtract - if (input.get_users().size() != 1 || !input.is_type() || - node.has_mean() || !node.get_primitive()->subtract_per_feature.empty()) - return; + prev_node.merge_output_padding(needed_padding); +} - input.set_output_layout(node.get_output_layout(), false); - extract_and_remove(node); - }); - } - //Third loop tries fusing eltwise (sum) with deconvolution - itr = processing_order.begin(); - while (itr != processing_order.end()) +void program_impl::reverse_connection(program_node& dep_node, program_node& user_node) +{ + if (std::find(dep_node.users.begin(), dep_node.users.end(), &user_node) != dep_node.users.end()) { - auto node_itr = itr++; - auto& node = (*node_itr); - - fuse_skip_layers(node); + remove_connection(dep_node, user_node); + add_connection(user_node, dep_node); } + else + throw std::runtime_error("Trying to reverse connection, but nodes are wrongly or not connected."); } program_node& program_impl::get_or_create(std::shared_ptr prim) @@ -2700,37 +858,78 @@ program_node& program_impl::get_or_create(std::shared_ptr prim) return *itr->second; auto new_node = prim->type->create_node(*this, prim); - new_node->set_org_primitive_id(new_node->id()); 
nodes_map.insert(itr, { prim->id, new_node }); return *new_node; } -void program_impl::add_intermediate(program_node& node, program_node& next, size_t prev_idx, bool connect_int_node_with_old_dep) +void program_impl::add_intermediate(program_node& node, program_node& next, size_t prev_idx, + bool connect_int_node_with_old_dep, bool move_usrs_of_prev_to_node) { if (connect_int_node_with_old_dep && !node.dependencies.empty()) - throw std::invalid_argument("Node which is about to be added inbetween two other nodes should not have any existing dependencies"); + throw std::invalid_argument("Node which is about to be added in between two other nodes should not have any existing dependencies"); auto& prev = next.get_dependency(prev_idx); //firstly add connection, later replace dependency, so 'prev' won't become dangling and therefore removed if (connect_int_node_with_old_dep) { add_connection(prev, node); - if (node.processing_itr != processing_order.end()) - processing_order.erase(node.processing_itr); + if (processing_order.size() != 0) + { + processing_order.insert_next(&prev, &node); + } + } - auto itr = prev.processing_itr; - node.processing_itr = processing_order.insert(++itr, &node); - node.processing_num = prev.processing_num; + if (move_usrs_of_prev_to_node) { + auto itr = prev.get_users().begin(); + while (itr != prev.get_users().end()) + { + auto usr = *itr; + itr++; + if (usr->id() != node.id()) + usr->replace_dependency(prev, node); + } + mark_if_constant(prev); + mark_if_constant(node); + mark_if_data_flow(prev); + mark_if_data_flow(node); + } + else { + next.replace_dependency(prev_idx, node); + node.constant = prev.constant; + node.data_flow = prev.data_flow; } +} + +void program_impl::add_intermediate(std::shared_ptr prim, program_node& next, size_t prev_idx, + bool connect_int_node_with_old_dep, bool move_usrs_of_prev_to_node) +{ + add_intermediate(get_or_create(prim), next, prev_idx, connect_int_node_with_old_dep, move_usrs_of_prev_to_node); +} - next.replace_dependency(prev_idx, node); - node.constant = prev.constant; - node.data_flow = prev.data_flow; - if (prev.constant_frontier) +void program_impl::add_connection(program_node& prev, program_node& next) +{ + prev.users.push_back(&next); + next.dependencies.push_back(&prev); +} + +void program_impl::remove_connection(program_node& prev, program_node& next) +{ + prev.users.remove(&next); + next.dependencies.erase(std::remove(next.dependencies.begin(), next.dependencies.end(), &prev), next.dependencies.end()); +} + +void program_impl::remove_all_connections(program_node& node) { + // since the graph is not topologically sorted, we need to remove the node from both dependencies and users + for (auto &e : node.users) + { + e->dependencies.erase(std::remove(e->dependencies.begin(), e->dependencies.end(), &node), e->dependencies.end()); + } + for (auto &e : node.dependencies) { - node.constant_frontier = true; - prev.constant_frontier = false; + e->users.remove(&node); } + node.dependencies.clear(); + node.users.clear(); } void program_impl::rename(program_node & node, primitive_id const & new_id) @@ -2776,9 +975,9 @@ void program_impl::replace_all_usages(program_node & old_node, program_node & ne } } -void program_impl::replace(program_node& old_node, program_node& new_node, bool replace_whole_branch, bool check_output_layouts_integrity) +void program_impl::replace(program_node& old_node, program_node& new_node) { - if ((!new_node.dependencies.empty() && !replace_whole_branch) || !new_node.users.empty()) + if
(!new_node.dependencies.empty() || !new_node.users.empty()) throw std::invalid_argument("Node which is about to replace other node should be detached"); if (new_node.is_output()) @@ -2788,15 +987,13 @@ void program_impl::replace(program_node& old_node, program_node& new_node, bool new_node.output_layout = old_node.get_output_layout(); new_node.valid_output_layout = old_node.valid_output_layout; - if (!replace_whole_branch) + + //copy old's dependencies + while (!old_node.dependencies.empty()) { - //copy old's dependencies - while (!old_node.dependencies.empty()) - { - auto& dep = old_node.dependencies.back(); - add_connection(*dep, new_node); - remove_connection(*dep, old_node); - } + auto& dep = old_node.dependencies.front(); + add_connection(*dep, new_node); + remove_connection(*dep, old_node); } //append users @@ -2815,9 +1012,6 @@ void program_impl::replace(program_node& old_node, program_node& new_node, bool old_node.users.clear(); - if (check_output_layouts_integrity && new_node.valid_output_layout) - new_node.recalc_output_layout(); - bool old_was_output = false; //copy node's state if (old_node.is_output()) @@ -2832,17 +1026,11 @@ void program_impl::replace(program_node& old_node, program_node& new_node, bool inputs.remove(&old_node); new_node.constant = old_node.constant; - new_node.constant_frontier = old_node.constant_frontier; new_node.user_mark = old_node.user_mark; - auto old_news_pos = new_node.processing_itr; - new_node.processing_itr = processing_order.insert(old_node.processing_itr, &new_node); - new_node.processing_num = old_node.processing_num; - if (old_news_pos != processing_order.end()) - processing_order.erase(old_news_pos); - if (old_node.processing_itr != processing_order.end()) - processing_order.erase(old_node.processing_itr); - + processing_order.insert(&old_node, &new_node); + if (processing_order.get_processing_iterator(old_node) != processing_order.end()) + processing_order.erase(&old_node); nodes_map.erase(id); rename(new_node, id); @@ -2854,65 +1042,23 @@ void program_impl::replace(program_node& old_node, program_node& new_node, bool } } -bool program_impl::remove_if_dangling(program_node& node, bool detach_whole_branch) +bool program_impl::remove_if_dangling(program_node& node) { if (!node.users.empty()) return false; - if (!detach_whole_branch && !node.dependencies.empty()) + if (!node.dependencies.empty()) return false; - std::list to_remove; - std::list marked; - if (detach_whole_branch) - { - node.mark(); - std::list queue = { &node }; - while (!queue.empty()) - { - auto curr = queue.front(); - queue.pop_front(); - marked.push_back(curr); - - //remove only if all users also has been marked - bool rem = !std::any_of(curr->get_users().begin(), curr->get_users().end(), [](program_node* node) { return !node->is_marked(); }); - if (rem) - to_remove.push_back(curr); - - for (auto dep : curr->get_dependencies()) - { - if (!dep->is_marked()) - { - dep->mark(); - queue.push_back(dep); - } - } - } - } - else - to_remove.push_back(&node); - - for (auto n : marked) - n->unmark(); - - for (auto rem : to_remove) + if (!node.is_output() || is_debug_build()) { - if (!rem->is_output() || is_debug_build()) - { - if (detach_whole_branch) - { - for (auto& user : rem->get_users()) - user->remove_dependency(*rem); - } - if (rem->is_input()) - inputs.remove(rem); + if (node.is_input()) + inputs.remove(&node); - if (std::find(processing_order.begin(), processing_order.end(), rem) != processing_order.end()) - processing_order.erase(rem->processing_itr); - 
optimized_out.push_back(rem->id()); - nodes_map.erase(rem->id()); - } + if (std::find(processing_order.begin(), processing_order.end(), &node) != processing_order.end()) + processing_order.erase(&node); + optimized_out.push_back(node.id()); + nodes_map.erase(node.id()); } - return true; } @@ -2943,13 +1089,6 @@ bool program_impl::extract_and_remove(program_node& node) node.dependencies.clear(); input.users.remove(&node); - if (node.constant_frontier) - { - assert(node.constant && "Constant frontier should also, by definition, be constant"); - assert(input.constant && "Input for constant forontier should, by definition, be constant"); - input.constant_frontier = true; - } - if (!node.is_endpoint()) replace_all_usages(node, input); else @@ -2958,14 +1097,26 @@ bool program_impl::extract_and_remove(program_node& node) return true; } -void program_impl::replace_data_with_optimized(std::map const & replace_map) +void program_impl::remove_nodes(std::list& to_remove) { - for (auto& result : replace_map) + for (auto const& node : to_remove) { - auto& node = *nodes_map.at(result.first); - assert(node.is_type() && "Optimized primitive is not a cldnn::data"); - assert(result.second != nullptr && "Memory which handles result of optimization should not be nullptr"); - node.as().attach_memory(*result.second, false); + if (node->is_input()) + get_inputs().remove(node); + else + { + for (auto& dep : node->dependencies) + dep->users.remove(node); + } + for (auto& user : node->users) + { + user->dependencies.erase(std::remove(user->dependencies.begin(), + user->dependencies.end(), node), + user->dependencies.end()); + } + get_processing_order().erase(node); + optimized_out.push_back(node->id()); + nodes_map.erase(node->id()); } } @@ -2978,17 +1129,17 @@ void program_impl::dump_memory_pool() const { return; } - path += "cldnn_memory_pool.log"; auto dep = get_memory_dependencies_string(); get_engine().dump_memory_pool(*this, path, dep); - dump_program("14_memory_pool", true); + std::string dump_file_name = std::to_string(pm->get_pass_count()+1) + "_memory_pool"; + dump_program(dump_file_name.c_str(), true); } //TODO: break this function into number of smaller ones + add per-primitive fields (possibly use primitive_inst::to_string?) void program_impl::dump_program(const char* stage, bool with_full_info, std::function const& filter) const { - auto path = get_dir_path(options); + std::string path = get_dir_path(options); if (path.empty()) { return; @@ -3012,41 +1163,4 @@ void program_impl::dump_program(const char* stage, bool with_full_info, std::fun dump_graph_optimized(graph, *this); } -//Dumps weights and biasses in serialization process, not working yet, in progress. -void program_impl::dump_weights_and_biasses(std::vector& offsets, std::vector& data_names, std::ofstream& file_stream) const -{ - for (auto const& n : nodes_map) - { - auto dependency_count = (unsigned int)n.second.get()->get_dependencies().size(); - for (unsigned int dp = 0; dp < dependency_count; dp++) - { - auto& dependency = n.second.get()->get_dependency(dp); - if (dependency.is_type()) - { - offsets.push_back(offsets.empty() ? 0ull : offsets.back()); - auto& mem = dependency.as().get_attached_memory(); - if (mem.get_layout().data_type == data_types::f32) - dump_data(mem, file_stream, offsets.back(), sizeof(float)); - else - dump_data(mem, file_stream, offsets.back(), sizeof(short)); - data_names.push_back(dependency.as().id()); - } - } - } - file_stream.close(); -} - -//Makes serialization with given name. 
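The new remove_nodes pass above severs a node from the graph by erasing it from its dependencies' user lists and from its users' dependency lists before dropping it. A stripped-down sketch of that detach step (node_t is an assumed stand-in for program_node; the bookkeeping for processing_order, inputs and nodes_map is omitted):

#include <algorithm>
#include <list>
#include <vector>

struct node_t
{
    std::list<node_t*> users;
    std::vector<node_t*> dependencies;
};

// Sever every edge touching n, in both directions.
void detach(node_t* n)
{
    for (auto* dep : n->dependencies)
        dep->users.remove(n);                      // forget n as a user
    for (auto* usr : n->users)
        usr->dependencies.erase(
            std::remove(usr->dependencies.begin(), usr->dependencies.end(), n),
            usr->dependencies.end());              // forget n as a dependency
    n->dependencies.clear();
    n->users.clear();
}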
-//Placeholder, not working yet, in progress. -void program_impl::serialize(std::string network_name, std::function const& filter) const -{ - std::vector offsets; - std::vector data_names; - - std::ofstream file_stream(network_name + "_" + "serialization" + ".bin", std::ios::binary); - dump_kernels(engine->get_context().get()->get_kernels_cache().get_context().get_binaries(), offsets, data_names, file_stream); - dump_weights_and_biasses(offsets, data_names, file_stream); - std::ofstream graph(network_name + "_" + "serialization" + ".xml"); - dump_to_xml(graph, *this, filter, offsets, data_names); -} diff --git a/inference-engine/thirdparty/clDNN/src/program_dump_graph.cpp b/inference-engine/thirdparty/clDNN/src/program_dump_graph.cpp index 7e4c739..b82dd0e 100644 --- a/inference-engine/thirdparty/clDNN/src/program_dump_graph.cpp +++ b/inference-engine/thirdparty/clDNN/src/program_dump_graph.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2018 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -18,7 +18,13 @@ #include "program_dump_graph.h" #include "to_string_utils.h" -#include "xml_object.h" +#include "data_inst.h" +#include "condition_inst.h" + +#include "gpu/ocl_toolkit.h" + +#include "to_string_utils.h" + #include #include @@ -152,12 +158,12 @@ namespace cldnn graph.close(); } - std::string get_node_id(program_node* ptr) + std::string get_node_id(const program_node* ptr) { return "node_" + std::to_string(reinterpret_cast(ptr)); } - void dump_full_node(std::ofstream& out, program_node* node) + void dump_full_node(std::ofstream& out, const program_node* node) { out << node->type()->to_string(*node); } @@ -193,31 +199,7 @@ namespace cldnn { const auto extr_oformat = [](program_node* ptr) { - std::string out = ""; - switch (ptr->get_output_layout().format) - { - case format::yxfb: out = "yxfb"; break; - case format::byxf: out = "byxf"; break; - case format::bfyx: out = "bfyx"; break; - case format::fyxb: out = "fyxb"; break; - case format::os_iyx_osv16: out = "os_iyx_osv16"; break; - case format::bs_xs_xsv8_bsv8: out = "bs_xs_xsv8_bsv8"; break; - case format::bs_xs_xsv8_bsv16: out = "bs_xs_xsv8_bsv16"; break; - case format::bs_x_bsv16: out = "bs_x_bsv16"; break; - case format::bf8_xy16: out = "bf8_xy16"; break; - case format::image_2d_weights_c1_b_fyx: out = "image_2d_weights_c1_b_fyx"; break; - case format::image_2d_weights_c4_fyx_b: out = "image_2d_weights_c4_fyx_b"; break; - case format::image_2d_weights_winograd_6x3_s1_fbxyb: out = "image_2d_weights_winograd_6x3_s1_fbxyb"; break; - case format::image_2d_weights_winograd_6x3_s1_xfbyb: out = "image_2d_weights_winograd_6x3_s1_xfbyb"; break; - case format::os_is_yx_isa8_osv8_isv4: out = "os_is_yx_isa8_osv8_isv4"; break; - case format::is_o_yx_isv32: out = "is_o_yx_isv32"; break; - case format::byxf_af32: out = "byxf_af32"; break; - case format::fs_bs_yx_bsv4_fsv32: out = "fs_bs_yx_bsv4_fsv32"; break; - case format::any: out = "any"; break; - default: - out = "unk format"; - break; - } + std::string out = fmt_to_str(ptr->get_output_layout().format); if (!ptr->is_valid_output_layout()) out += " (invalid)"; @@ -225,22 +207,6 @@ namespace cldnn return out; }; - const auto extr_data_type = [](program_node* ptr) - { - std::string out = ""; - switch (ptr->get_output_layout().data_type) - { - case data_types::i8: out = "i8"; break; - case data_types::u8: out = "u8"; break; - case data_types::f16: out = 
"f16"; break; - case data_types::f32: out = "f32"; break; - default: - out = "unknown data_type"; - break; - } - return out; - }; - const auto dump_mem_info = [](program_node* ptr) { std::string out = "size_info: "; @@ -262,7 +228,7 @@ namespace cldnn }; graph << "digraph cldnn_program {\n"; - for (auto& node : program.get_nodes()) + for (auto& node : program.get_processing_order()) { if (filter && !filter(*node)) { @@ -272,23 +238,36 @@ namespace cldnn #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wpotentially-evaluated-expression" #endif - std::string node_type = get_extr_type(typeid(*node).name()); - graph << " " << get_node_id(node.get()) << "[label=\"" << node->id() << ":\n" << node_type << "\n out format: " + extr_oformat(node.get()) - << "\n out data_type: " + extr_data_type(node.get()) - << "\\nprocessing number: " << node->get_processing_num() << "\\n color:" << (node->is_reusing_memory() ? std::to_string(node->get_reused_memory_color()) : "none") + auto& node_type = typeid(*node); + std::string node_type_name = get_extr_type(node_type.name()); + graph << " " << get_node_id(node) << "[label=\"" << node->id() << ":\n" << node_type_name << "\n out format: " + extr_oformat(node) + << "\n out data_type: " + dt_to_str(node->get_output_layout().data_type) + << "\\nprocessing number: " << program.get_processing_order().get_processing_number(node) << "\\n color:" << (node->is_reusing_memory() ? std::to_string(node->get_reused_memory_color()) : "none") << (node->can_be_optimized() ? "\\n optimized out" : ""); - if (node_type != "struct cldnn::data" && node_type != "struct cldnn::input_layout" && !node->can_be_optimized()) + + if (node_type_name != "struct cldnn::data" && node_type_name != "struct cldnn::input_layout" && !node->can_be_optimized()) + { graph << "\\n Selected kernel: " << (node->get_selected_impl() == nullptr ? 
"none" : node->get_selected_impl().get()->get_kernel_name() - + "\n" + dump_mem_info(node.get())); + + "\n" + dump_mem_info(node)); + } graph << "\""; #ifdef __clang__ #pragma clang diagnostic pop #endif + if (node->is_type()) + { + graph << ", shape=diamond"; + } if (node->is_type() || node->is_constant()) + { graph << ", shape=box"; + } if (node->is_type()) + { graph << ", color=blue"; + } + if (node->is_reusing_memory()) { graph << ", fillcolor=\"" << colors[node->get_reused_memory_color() % colors.size()] << "\" "; @@ -303,9 +282,9 @@ namespace cldnn continue; } bool doubled = true; - if (std::find(user->get_dependencies().begin(), user->get_dependencies().end(), node.get()) == user->get_dependencies().end()) + if (std::find(user->get_dependencies().begin(), user->get_dependencies().end(), node) == user->get_dependencies().end()) doubled = false; - graph << " " << get_node_id(node.get()) << " -> " << get_node_id(user); + graph << " " << get_node_id(node) << " -> " << get_node_id(user); bool data_flow = node->is_in_data_flow() && user->is_in_data_flow(); if (data_flow) @@ -330,12 +309,12 @@ namespace cldnn continue; } - if (std::find(dep->get_users().begin(), dep->get_users().end(), node.get()) != dep->get_users().end()) + if (std::find(dep->get_users().begin(), dep->get_users().end(), node) != dep->get_users().end()) { continue; } - graph << " " << get_node_id(node.get()) << " -> " << get_node_id(dep) << " [style=dashed, label=\"dep\", constraint=false];\n"; + graph << " " << get_node_id(node) << " -> " << get_node_id(dep) << " [style=dashed, label=\"dep\", constraint=false];\n"; } } graph << "}\n"; @@ -361,101 +340,16 @@ namespace cldnn void dump_graph_info(std::ofstream& graph, const program_impl& program, std::function const& filter) { - for (auto& node : program.get_nodes()) + for (auto& node : program.get_processing_order()) { if (filter && !filter(*node)) continue; - dump_full_node(graph, node.get()); + dump_full_node(graph, node); graph << std::endl << std::endl; } close_stream(graph); } - - //Function used by serialization. Not working yet, in progress. - void dump_to_xml(std::ofstream& graph, const program_impl& program, std::function const& filter, std::vector& offsets, std::vector& data_names) - { - xml_composite data_container, node_container; - auto node_number = 1; - auto kernels_number = 1; - auto postion = 0u; - auto offset = 0ull; - auto size = offsets.at(0); - for (auto& node : program.get_nodes()) - { - if (filter && !filter(*node)) - continue; - - std::string package_name = "node_" + std::to_string(node_number); - auto node_info = node.get()->desc_to_xml(); - auto id = node->id(); - for (auto p = postion; p < (unsigned int)data_names.size(); p++) - { - if (p != 0) - { - offset = offsets.at(p - 1); - size = offsets.at(p) - offsets.at(p - 1); - } - if (data_names.at(p).find("kernels") != std::string::npos) - { - node_info.reset(new xml_composite()); - node_info->add("id", data_names.at(p)); - id = "kernels"; - package_name = "kernels_" + std::to_string(kernels_number); - - postion++; - kernels_number++; - node_number--; - } - if (data_names.at(p).find(id) != std::string::npos) - { - node_info->add("data_offset", std::to_string(offset)); - node_info->add("data_size", std::to_string(size)); - node_number++; - break; - } - } - node_container.add(package_name, node_info.get()); - } - data_container.add("data", node_container); - data_container.dump(graph); - close_stream(graph); - } - - //Function used by serialization. Not working yet, in progress. 
- void dump_kernels(kernels_binaries_container program_binaries, std::vector& offsets, std::vector& data_names, std::ofstream& file_stream) - { - auto offset_temp = 0ull; - for (unsigned int i = 0; i < (unsigned int)program_binaries.size(); i++) - { - for (unsigned int j = 0; j < (unsigned int)program_binaries.at(i).size(); j++) - { - for (unsigned int k = 0; k < (unsigned int)program_binaries.at(i).at(j).size(); k++) - { - char* p = (char*)&program_binaries.at(i).at(j).at(k); - file_stream.write(p, sizeof(char)); - offset_temp += sizeof(char); - } - } - offsets.push_back(offset_temp); - std::string offset_name = "kernels_part_" + std::to_string(i+1); - data_names.push_back(offset_name); - } - } - - //Function used by serialization. Not working yet, in progress. - void dump_data(memory_impl& mem, std::ofstream& stream, unsigned long long& total_offset, unsigned long long type) - { - auto offset = 0ull; - char * ptr = (char*)mem.lock(); - for (unsigned int x = 0; x < (unsigned int)mem.get_layout().count(); x++) - { - stream.write(ptr + offset, type); - offset += type; - } - mem.unlock(); - total_offset += offset; - } } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/program_helpers.cpp b/inference-engine/thirdparty/clDNN/src/program_helpers.cpp new file mode 100644 index 0000000..4565c0b --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/program_helpers.cpp @@ -0,0 +1,92 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "program_helpers.h" +#include "program_impl.h" +#include "data_inst.h" + +namespace cldnn +{ + //helper function for merging the weights/biases buffers on cpu side for depthwise separable convolution optimization + void program_helpers::merge_buffers(engine_impl &engine, program_node &node, layout target_layout, size_t begin_offset, size_t end_offset) + { + memory_impl::ptr data_to_allocate = engine.allocate_memory(target_layout); + + for (size_t i = begin_offset; i < end_offset; i++) + { + auto& weights = node.get_dependency(i).as(); + mem_lock src{ weights.get_attached_memory() }; + mem_lock dst{ data_to_allocate }; + std::copy(src.begin(), src.end(), dst.begin() + (i - begin_offset)*src.size()); + } + + for (size_t i = 0; i < end_offset - begin_offset - 1; i++) + node.remove_dependency(begin_offset + 1); + + auto& data_node = node.get_dependency(begin_offset).as(); + data_node.attach_memory(*data_to_allocate, false); + } + + //helper function for getting target layout used in depthwise sep optimization + layout program_helpers::get_weights_layout(typed_program_node &data_node, int32_t split) + { + auto mem_layout = data_node.get_output_layout(); + + return layout(mem_layout.data_type, mem_layout.format, { split * mem_layout.size.batch[0], mem_layout.size.feature[0], mem_layout.size.spatial[0], mem_layout.size.spatial[1] }); + } + + // pair.first tells whether l1 and l2 are absolutely identical + // pair.second tells whether l1 and l2 can be reinterpreted to each other without need of reordering + // note: layouts can only be considered identical if data size described by both layouts match (so no data are generated nor dropped) + // note: if layouts describe two buffers with different size, consider them not to be identical even if smaller buffer can be considered to hold subsequence of larger buffer, + // this behavior is required to force buffer allocation for smaller buffer which, currently, should always be performed + std::pair program_helpers::are_layouts_identical(layout const& l1, layout const& l2) + { + if (l1 == l2) + return{ true, true }; + if (l1.data_type != l2.data_type) + return{ false, false }; + if (l1.size != l2.size) + return{ false, false }; + if (l1.get_linear_size() != l2.get_linear_size()) + return{ false, false }; + if ((l1.format == format::bf8_xy16 && l2.format != format::bf8_xy16) || + (l2.format == format::bf8_xy16 && l1.format != format::bf8_xy16) || + (l1.format == format::b_fs_yx_fsv4 && l2.format != format::b_fs_yx_fsv4) || + (l2.format == format::b_fs_yx_fsv4 && l1.format != format::b_fs_yx_fsv4)) + return{ false, false }; + + auto l1_pitch = l1.get_pitches(); + auto l2_pitch = l2.get_pitches(); + + //ignore pitches which will never be used (for dims with size == 1) + for (size_t i = 0; i < CLDNN_TENSOR_DIM_MAX; ++i) + if (l1.size.raw[i] == 1) + l1_pitch.raw[i] = 0; + for (size_t i = 0; i < CLDNN_TENSOR_DIM_MAX; ++i) + if (l2.size.raw[i] == 1) + l2_pitch.raw[i] = 0; + + auto l1_offset = l1.get_linear_offset(); + auto l2_offset = l2.get_linear_offset(); + if (l1_pitch == l2_pitch && l1_offset == l2_offset) + return{ false, true }; + + return{ false, false }; + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/src/program_node.cpp b/inference-engine/thirdparty/clDNN/src/program_node.cpp index 078c4f5..7ed4546 100644 --- a/inference-engine/thirdparty/clDNN/src/program_node.cpp +++
b/inference-engine/thirdparty/clDNN/src/program_node.cpp @@ -18,21 +18,18 @@ #include "program_impl.h" #include "primitive_inst.h" #include "to_string_utils.h" - #include "json_object.h" -#include "xml_object.h" + using namespace cldnn; -program_node::program_node(std::shared_ptr prim, program_impl & prog) : desc(prim), myprog(prog) +program_node::program_node(std::shared_ptr prim, program_impl & prog) : desc(prim), myprog(prog), org_id(prim->id) { if (prim) output_layout.data_padding = prim->output_padding; - - processing_itr = prog.processing_order.end(); } -void program_node::replace_dependency(size_t idx, program_node& new_dep, bool detach_whole_branch) +void program_node::replace_dependency(size_t idx, program_node& new_dep) { if (idx >= dependencies.size()) return; @@ -40,17 +37,17 @@ void program_node::replace_dependency(size_t idx, program_node& new_dep, bool de return; dependencies[idx]->users.remove(this); - myprog.remove_if_dangling(*dependencies[idx], detach_whole_branch); + myprog.remove_if_dangling(*dependencies[idx]); dependencies[idx] = &new_dep; new_dep.users.push_back(this); } -void program_node::replace_dependency(program_node const& old_dep, program_node& new_dep, bool detach_whole_branch) +void program_node::replace_dependency(program_node const& old_dep, program_node& new_dep) { for (size_t i = 0; i < dependencies.size(); ++i) if (dependencies[i] == &old_dep) - return replace_dependency(i, new_dep, detach_whole_branch); + return replace_dependency(i, new_dep); } std::vector program_node::get_dependencies_ids() const @@ -86,68 +83,6 @@ void program_node::add_memory_dependency(std::vector prim_list) memory_dependencies.insert(prim_list.begin(),prim_list.end()); } -//Function used by serialization. Not working yet, in progress. -std::unique_ptr program_node::desc_to_xml() const -{ - std::unique_ptr node_info = std::unique_ptr(new xml_composite()); - node_info->add("id", id()); - node_info->add("valid_output_layout", bool_to_str(valid_output_layout)); - - xml_composite output_layout_info; - output_layout_info.add("data_type", dt_to_str(output_layout.data_type)); - output_layout_info.add("format", fmt_to_str(output_layout.format)); - output_layout_info.add("size", output_layout.size.to_string()); - - xml_composite padding_info; - padding_info.add("lower_size", output_layout.data_padding.lower_size().to_string()); - padding_info.add("upper_size", output_layout.data_padding.upper_size().to_string()); - output_layout_info.add("padding_info", padding_info); - - node_info->add("output_layout", output_layout_info); - node_info->add("processing_number", processing_num); - node_info->add("constant", bool_to_str(constant)); - node_info->add("output", bool_to_str(output)); - - std::vector deps_ptrs; - { - bool empty = true; - auto itr = dependencies.begin(); - while (itr != dependencies.end()) - { - if (empty) - { - empty = false; - } - deps_ptrs.push_back(std::to_string(reinterpret_cast(*itr++))); - } - if (deps_ptrs.empty()) - { - deps_ptrs.push_back("null"); - } - } - node_info->add("dependencies", deps_ptrs); - - std::vector users_ptrs; - { - bool empty = true; - auto itr = users.begin(); - while (itr != users.end()) - { - if (empty) - { - empty = false; - } - users_ptrs.push_back(std::to_string(reinterpret_cast(*itr++))); - } - if (users_ptrs.empty()) - { - users_ptrs.push_back("null"); - } - } - node_info->add("users", users_ptrs); - return node_info; - } - std::unique_ptr program_node::desc_to_json() const { std::unique_ptr node_info = std::unique_ptr(new json_composite()); 
@@ -169,7 +104,6 @@ std::unique_ptr program_node::desc_to_json() const node_info->add("output layout", output_layout_info); - node_info->add("processing number", processing_num); node_info->add("in data flow", bool_to_str(data_flow)); node_info->add("constant", bool_to_str(constant)); node_info->add("in data flow", bool_to_str(data_flow)); @@ -334,3 +268,4 @@ void details::internal_program_node_base::set_implementation(std::unique_ptroutput_data_type == false + && "Output data type forcing is not supported for proposal_node!"); auto desc = node.get_primitive(); layout input_layout = node.get_dependency(cls_scores_index).get_output_layout(); - return layout(input_layout.data_type, format::bfyx, { desc->post_nms_topn, CLDNN_ROI_VECTOR_SIZE, 1, 1 }); + return layout(input_layout.data_type, format::bfyx, { input_layout.size.batch[0] * desc->post_nms_topn, CLDNN_ROI_VECTOR_SIZE, 1, 1 }); } static inline std::string stringify_vector(std::vector v) @@ -81,10 +83,12 @@ std::string proposal_inst::to_string(proposal_node const& node) std::stringstream primitive_description; - auto swap_xy = desc->swap_xy ? "true" : "false"; - auto initial_clip = desc->initial_clip ? "true" : "false"; - auto round_ratios = desc->round_ratios ? "true" : "false"; - auto shift_anchors = desc->shift_anchors ? "true" : "false"; + auto swap_xy = desc->swap_xy ? "true" : "false"; + auto initial_clip = desc->initial_clip ? "true" : "false"; + auto round_ratios = desc->round_ratios ? "true" : "false"; + auto shift_anchors = desc->shift_anchors ? "true" : "false"; + auto clip_before_nms = desc->clip_before_nms ? "true" : "false"; + auto clip_after_nms = desc->clip_after_nms ? "true" : "false"; json_composite proposal_info; proposal_info.add("cls score", stringify_port(node.cls_score())); @@ -107,6 +111,8 @@ std::string proposal_inst::to_string(proposal_node const& node) params.add("initial clip", initial_clip); params.add("round ratios", round_ratios); params.add("shift anchors", shift_anchors); + params.add("clip_before_nms", clip_before_nms); + params.add("clip_after_nms", clip_after_nms); proposal_info.add("params", params); node_info->add("proposal info", proposal_info); diff --git a/inference-engine/thirdparty/clDNN/src/pyramid_roi_align.cpp b/inference-engine/thirdparty/clDNN/src/pyramid_roi_align.cpp new file mode 100644 index 0000000..a9b82c1 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/pyramid_roi_align.cpp @@ -0,0 +1,63 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/////////////////////////////////////////////////////////////////////////////////////////////////// +#include "pyramid_roi_align_inst.h" +#include "primitive_type_base.h" +#include "error_handler.h" +#include "json_object.h" + +namespace cldnn { + primitive_type_id pyramid_roi_align_type_id() + { + static primitive_type_base instance; + return &instance; + } + + layout pyramid_roi_align_inst::calc_output_layout(pyramidROIAlign_node const &node) + { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for " + "pyramidROIAlign_node!"); + + auto desc = node.get_primitive(); + + auto boxes_layout = node.boxes().get_output_layout(); + auto P2_layout = node.P2().get_output_layout(); + auto pool_size_layout = node.pool_size().get_output_layout(); + + int32_t output_b = boxes_layout.size.spatial[1]; + int32_t output_f = P2_layout.size.feature[0]; + + int32_t output_x = pool_size_layout.size.spatial[0]; + int32_t output_y = pool_size_layout.size.spatial[1]; + + return layout{ P2_layout.data_type, P2_layout.format, { output_b, output_f, output_x, output_y } }; + } + + std::string pyramid_roi_align_inst::to_string(pyramidROIAlign_node const& node) + { + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + std::stringstream primitive_description; + json_composite pyramid_roi_align_info; + node_info->add("pyramid_roi_align_info", pyramid_roi_align_info); + node_info->dump(primitive_description); + return primitive_description.str(); + } + + pyramid_roi_align_inst::typed_primitive_inst(network_impl& network, pyramidROIAlign_node const& node) + : parent(network, node) + { } +} diff --git a/inference-engine/thirdparty/clDNN/src/region_yolo.cpp b/inference-engine/thirdparty/clDNN/src/region_yolo.cpp index 3fe079f..4bec7a0 100644 --- a/inference-engine/thirdparty/clDNN/src/region_yolo.cpp +++ b/inference-engine/thirdparty/clDNN/src/region_yolo.cpp @@ -28,6 +28,9 @@ namespace cldnn layout region_yolo_inst::calc_output_layout(region_yolo_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for " + "region_yolo_node!"); auto input_layout = node.input().get_output_layout(); auto desc = node.get_primitive(); diff --git a/inference-engine/thirdparty/clDNN/src/reorder.cpp b/inference-engine/thirdparty/clDNN/src/reorder.cpp index e7aab42..c9428b3 100644 --- a/inference-engine/thirdparty/clDNN/src/reorder.cpp +++ b/inference-engine/thirdparty/clDNN/src/reorder.cpp @@ -36,7 +36,7 @@ layout reorder_inst::calc_output_layout(reorder_node const& node) auto input_layout = node.input().get_output_layout(); auto ifmt = input_layout.format; - auto odt = node.get_primitive()->output_data_type; + auto odt = *node.get_primitive()->output_data_type; auto ofmt = node.get_primitive()->output_format; auto op = node.get_primitive()->output_padding; diff --git a/inference-engine/thirdparty/clDNN/src/reorg_yolo.cpp b/inference-engine/thirdparty/clDNN/src/reorg_yolo.cpp index 29ceb9f..9c1e85c 100644 --- a/inference-engine/thirdparty/clDNN/src/reorg_yolo.cpp +++ b/inference-engine/thirdparty/clDNN/src/reorg_yolo.cpp @@ -28,6 +28,9 @@ namespace cldnn layout reorg_yolo_inst::calc_output_layout(reorg_yolo_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for " + "reorg_yolo_node!"); auto input_layout = node.input().get_output_layout(); auto desc = node.get_primitive(); auto stride = desc->stride; diff 
--git a/inference-engine/thirdparty/clDNN/src/reshape.cpp b/inference-engine/thirdparty/clDNN/src/reshape.cpp index 1825375..0cc6870 100644 --- a/inference-engine/thirdparty/clDNN/src/reshape.cpp +++ b/inference-engine/thirdparty/clDNN/src/reshape.cpp @@ -32,8 +32,31 @@ primitive_type_id reshape_type_id() layout reshape_inst::calc_output_layout(reshape_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for reshape_node!"); auto input_layout = node.input().get_non_padded_output_layout(); - input_layout.size = node.get_primitive()->output_shape; + auto sizes = node.get_primitive()->output_shape.sizes(); + auto input_sizes = input_layout.size.sizes(); + size_t need_recalc = 0; + uint32_t shape_count = 1; + + for (size_t i = 0; i < sizes.size(); i++) { + if (sizes[i] == -1) { + if (need_recalc) { + CLDNN_ERROR_MESSAGE(node.id(), "Only one dimension of the new shape can be -1"); + } + need_recalc = i; + continue; + } + if (sizes[i] == 0) { + sizes[i] = input_sizes[i]; + } + shape_count *= sizes[i]; + } + if (need_recalc) + sizes[need_recalc] = (int)input_layout.size.count() / shape_count; + + input_layout.size = tensor(sizes); return input_layout; } @@ -61,7 +84,7 @@ reshape_inst::typed_primitive_inst(network_impl& network, reshape_node const& no auto input_layout = node.input().get_output_layout(); auto output_layout = node.get_output_layout(); CLDNN_ERROR_DATA_TYPES_MISMATCH(node.id(), "Input layout data typr", input_layout.data_type, "output layout data type", output_layout.data_type, ""); - CLDNN_ERROR_NOT_EQUAL(node.id(), "Output layout count", output_layout.count(), "input layout count", input_layout.count(), "Output layout of reshape pirmitive changes size of input buffer"); + CLDNN_ERROR_NOT_EQUAL(node.id(), "Output layout count", output_layout.count(), "input layout count", input_layout.count(), "Output layout of reshape primitive changes size of input buffer"); //if reshape operated in-place, postpone creation of the output until network run, //then create new memory object as the reinterpreted output of the previous primitive @@ -88,4 +111,4 @@ void reshape_inst::reuse_input() _output = _network.get_engine().reinterpret_buffer(input_memory(), node.get_output_layout()); } -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/src/reverse_sequence.cpp b/inference-engine/thirdparty/clDNN/src/reverse_sequence.cpp new file mode 100644 index 0000000..8673c20 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/reverse_sequence.cpp @@ -0,0 +1,65 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "reverse_sequence_inst.h" + +#include "primitive_type_base.h" +#include "error_handler.h" +#include "json_object.h" + +namespace cldnn +{ +primitive_type_id reverse_sequence_type_id() +{ + static primitive_type_base instance; + return &instance; +} + +layout reverse_sequence_inst::calc_output_layout(reverse_sequence_node const& node) +{ + auto desc = node.get_primitive(); + + auto input_layout = node.input(0).get_output_layout(); + auto input_format = input_layout.format; + + return layout{input_layout.data_type, input_format, input_layout.size}; +} + +std::string reverse_sequence_inst::to_string(reverse_sequence_node const& node) +{ + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + + std::stringstream primitive_description; + + json_composite reverse_sequence_info; + reverse_sequence_info.add("input id", node.input(0).id()); + reverse_sequence_info.add("sequence lengths id", node.input(1).id()); + reverse_sequence_info.add("sequence axis", desc->seq_axis); + reverse_sequence_info.add("batch axis", desc->batch_axis); + + node_info->add("reverse_sequence info", reverse_sequence_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +reverse_sequence_inst::typed_primitive_inst(network_impl& network, reverse_sequence_node const& node) +: parent(network, node) +{ +} + +} diff --git a/inference-engine/thirdparty/clDNN/src/roi_pooling.cpp b/inference-engine/thirdparty/clDNN/src/roi_pooling.cpp index 0d45548..cbaca7b 100644 --- a/inference-engine/thirdparty/clDNN/src/roi_pooling.cpp +++ b/inference-engine/thirdparty/clDNN/src/roi_pooling.cpp @@ -29,44 +29,35 @@ primitive_type_id roi_pooling_type_id() layout roi_pooling_inst::calc_output_layout(roi_pooling_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for roi_pooling_node!"); auto desc = node.get_primitive(); layout data_layout = node.input().get_output_layout(); - int fm = data_layout.size.feature[0]; - layout rois_layout = node.rois().get_output_layout(); int num_rois = rois_layout.size.batch[0]; + int out_fm = desc->position_sensitive ? desc->output_dim : data_layout.size.feature[0]; - int gss = desc->group_sz * desc->group_sz; - - - CLDNN_ERROR_LESS_THAN(node.id(), "Group size", desc->group_sz, "value", 0, ""); - if (gss && fm % gss != 0) - { - CLDNN_ERROR_MESSAGE(node.id(), "group_sz must be either 0 (For RoIPooling) or satisfy fm % (group_sz^2) == 0"); - } - - if (gss) - { - fm /= gss; - } - - return layout(data_layout.data_type, format::bfyx, { num_rois, fm, desc->pooled_width, desc->pooled_height }); + return layout(data_layout.data_type, format::bfyx, { num_rois, out_fm, desc->pooled_width, desc->pooled_height }); } std::string roi_pooling_inst::to_string(roi_pooling_node const& node) { auto desc = node.get_primitive(); auto mode = desc->mode == pooling_mode::max ? "max" : desc->mode == pooling_mode::bilinear ? "bilinear" : "average"; + auto is_ps = desc->position_sensitive ? 
"true" : "false"; auto node_info = node.desc_to_json(); std::stringstream primitive_description; json_composite roi_info; roi_info.add("mode", mode); + roi_info.add("position sensitive", is_ps); roi_info.add("pooled_w", desc->pooled_width); roi_info.add("pooled_h", desc->pooled_height); roi_info.add("spatial_scale", desc->spatial_scale); - roi_info.add("group_sz", desc->group_sz); + roi_info.add("output_dim", desc->output_dim); + roi_info.add("spatial_bins_x", desc->spatial_bins_x); + roi_info.add("spatial_bins_y", desc->spatial_bins_y); node_info->add("roi info", roi_info); node_info->dump(primitive_description); diff --git a/inference-engine/thirdparty/clDNN/src/scale.cpp b/inference-engine/thirdparty/clDNN/src/scale.cpp index 1c71f0d..c95fcf7 100644 --- a/inference-engine/thirdparty/clDNN/src/scale.cpp +++ b/inference-engine/thirdparty/clDNN/src/scale.cpp @@ -29,6 +29,8 @@ primitive_type_id scale_type_id() layout scale_inst::calc_output_layout(scale_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for scale_node!"); auto result = node.input().get_non_padded_output_layout(); auto scale_sizes = node.scale_in().get_non_padded_output_layout().size; diff --git a/inference-engine/thirdparty/clDNN/src/scale_grad_input.cpp b/inference-engine/thirdparty/clDNN/src/scale_grad_input.cpp index 8f2716b..9adcbe7 100644 --- a/inference-engine/thirdparty/clDNN/src/scale_grad_input.cpp +++ b/inference-engine/thirdparty/clDNN/src/scale_grad_input.cpp @@ -29,6 +29,9 @@ namespace cldnn layout scale_grad_input_inst::calc_output_layout(scale_grad_input_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for " + "scale_grad_input_node!"); auto result = node.input().get_non_padded_output_layout(); auto scale_in_sizes = node.scale_in().get_non_padded_output_layout().size; diff --git a/inference-engine/thirdparty/clDNN/src/scale_grad_weights.cpp b/inference-engine/thirdparty/clDNN/src/scale_grad_weights.cpp index 3d4a7b2..13a0110 100644 --- a/inference-engine/thirdparty/clDNN/src/scale_grad_weights.cpp +++ b/inference-engine/thirdparty/clDNN/src/scale_grad_weights.cpp @@ -29,6 +29,9 @@ primitive_type_id scale_grad_weights_type_id() layout scale_grad_weights_inst::calc_output_layout(scale_grad_weights_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for " + "scale_grad_weights_node!"); //output buffer will not be used in this primitive auto input_grad_layout_size = node.input().get_output_layout(); return{ input_grad_layout_size.data_type, input_grad_layout_size.format,{ 1, 1, 1, 1 } }; diff --git a/inference-engine/thirdparty/clDNN/src/select.cpp b/inference-engine/thirdparty/clDNN/src/select.cpp index df5aaa8..da799e0 100644 --- a/inference-engine/thirdparty/clDNN/src/select.cpp +++ b/inference-engine/thirdparty/clDNN/src/select.cpp @@ -30,6 +30,8 @@ primitive_type_id select_type_id() layout select_inst::calc_output_layout(select_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for select_node!"); return node.input().get_non_padded_output_layout(); } diff --git a/inference-engine/thirdparty/clDNN/src/shuffle_channels.cpp b/inference-engine/thirdparty/clDNN/src/shuffle_channels.cpp new file mode 100644 index 0000000..e89654a --- /dev/null +++ 
b/inference-engine/thirdparty/clDNN/src/shuffle_channels.cpp
@@ -0,0 +1,83 @@
+/*
+// Copyright (c) 2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "shuffle_channels_inst.h"
+
+#include "primitive_type_base.h"
+#include "error_handler.h"
+#include "json_object.h"
+
+namespace cldnn
+{
+primitive_type_id shuffle_channels_type_id()
+{
+    static primitive_type_base<shuffle_channels> instance;
+    return &instance;
+}
+
+layout shuffle_channels_inst::calc_output_layout(shuffle_channels_node const& node)
+{
+    auto desc = node.get_primitive();
+
+    auto input_layout = node.input(0).get_output_layout();
+    auto input_format = input_layout.format;
+
+    const int32_t number_of_dims = 4;
+    const int32_t group = desc->group;
+    int32_t axis = desc->axis;
+
+    if (axis < 0)
+        axis += number_of_dims;
+
+    if (axis < 0 || axis >= number_of_dims)
+        CLDNN_ERROR_MESSAGE(node.id(), "Incorrect axis value! Actual axis is " + std::to_string(axis));
+
+    if (group < 1)
+        CLDNN_ERROR_MESSAGE(node.id(), "Invalid group size value (should be at least 1). Actual group size is "
+            + std::to_string(group));
+
+    if (input_layout.size.sizes(format::bfyx)[axis] % group != 0)
+        CLDNN_ERROR_MESSAGE(node.id(), "Group parameter must evenly divide the channel dimension. 
Actual group size is " + + std::to_string(group)); + + return layout{input_layout.data_type, input_format, input_layout.size}; +} + +std::string shuffle_channels_inst::to_string(shuffle_channels_node const& node) +{ + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + auto& input = node.input(); + + std::stringstream primitive_description; + + json_composite shuffle_channels_info; + shuffle_channels_info.add("input id", input.id()); + shuffle_channels_info.add("groups number", desc->group); + shuffle_channels_info.add("axis", desc->axis); + + node_info->add("shuffle_channels info", shuffle_channels_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +shuffle_channels_inst::typed_primitive_inst(network_impl& network, shuffle_channels_node const& node) +: parent(network, node) +{ +} + +} diff --git a/inference-engine/thirdparty/clDNN/src/softmax.cpp b/inference-engine/thirdparty/clDNN/src/softmax.cpp index 1096b87..70c5688 100644 --- a/inference-engine/thirdparty/clDNN/src/softmax.cpp +++ b/inference-engine/thirdparty/clDNN/src/softmax.cpp @@ -28,6 +28,8 @@ primitive_type_id softmax_type_id() layout softmax_inst::calc_output_layout(softmax_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for softmax_node!"); return node.input().get_output_layout(); } diff --git a/inference-engine/thirdparty/clDNN/src/softmax_loss_grad.cpp b/inference-engine/thirdparty/clDNN/src/softmax_loss_grad.cpp index df94b60..41069f5 100644 --- a/inference-engine/thirdparty/clDNN/src/softmax_loss_grad.cpp +++ b/inference-engine/thirdparty/clDNN/src/softmax_loss_grad.cpp @@ -28,6 +28,9 @@ primitive_type_id softmax_loss_grad_type_id() layout softmax_loss_grad_inst::calc_output_layout(softmax_loss_grad_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for " + "softmax_loss_grad_node!"); return node.input().get_non_padded_output_layout(); } diff --git a/inference-engine/thirdparty/clDNN/src/split.cpp b/inference-engine/thirdparty/clDNN/src/split.cpp index 01dc4cf..4b5d366 100644 --- a/inference-engine/thirdparty/clDNN/src/split.cpp +++ b/inference-engine/thirdparty/clDNN/src/split.cpp @@ -30,6 +30,8 @@ primitive_type_id split_type_id() layout split_inst::calc_output_layout(split_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for split_node!"); auto output_ids = node.get_primitive()->output_ids; auto output_offsets = node.get_primitive()->output_offsets; auto param_num = output_ids.size(); @@ -81,4 +83,4 @@ split_inst::typed_primitive_inst(network_impl& network, split_node const& node) CLDNN_ERROR_MESSAGE(node.id(), "Split primitive instance should not be created!"); } -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/src/strided_slice.cpp b/inference-engine/thirdparty/clDNN/src/strided_slice.cpp new file mode 100644 index 0000000..9a2390a --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/strided_slice.cpp @@ -0,0 +1,141 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "strided_slice_inst.h" +#include "primitive_type_base.h" +#include "error_handler.h" +#include "json_object.h" +#include "data_inst.h" + +namespace cldnn +{ +primitive_type_id strided_slice_type_id() +{ + static primitive_type_base instance; + return &instance; +} + +layout strided_slice_inst::calc_output_layout(strided_slice_node const& node) { + const size_t numberOfDims = 4; + auto desc = node.get_primitive(); + auto input_layout = node.input(0).get_output_layout(); + auto input_format = input_layout.format; + + auto completeStridedSliceParams = [&](std::vector& param) { + for (size_t i = param.size(); i < numberOfDims; ++i) + param.push_back(1); + }; + + auto completeStridedSliceMasks = [&](std::vector& mask) { + for (size_t i = mask.size(); i < numberOfDims; ++i) + mask.push_back(0); + }; + + auto maskStridedSliceParams = [&](std::vector& param, const std::vector& mask) { + for (size_t i = 0; i < param.size(); ++i) + if (mask[i]) + param[i] = input_layout.size.sizes(format::bfyx)[i]; + }; + + // Getting data from constant inputs. There are 3 args: Begin, End, Stride + std::vector> stridedSliceArgs; + for (size_t i = 1; i < node.get_dependencies().size(); ++i) { + auto& input = node.get_dependency(i).as(); + auto& mem = input.get_attached_memory(); + int32_t* data = static_cast(mem.lock()); + std::vector vData = std::vector(data, data + input.get_output_layout().count()); + completeStridedSliceParams(vData); + stridedSliceArgs.push_back(vData); + mem.unlock(); + } + + std::vector beginMask(desc->begin_mask); + completeStridedSliceMasks(beginMask); + std::vector endMask(desc->end_mask); + completeStridedSliceMasks(endMask); + + auto& begin = stridedSliceArgs[0]; + auto& end = stridedSliceArgs[1]; + const auto& strides = stridedSliceArgs[2]; + std::vector outputDimsSizes; + + // If the ith bit of begin_mask is set, begin[i] is ignored and the fullest possible range in that dimension is used instead. 
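+    // Worked example (illustrative numbers): for bfyx input sizes [1, 8, 4, 4]
+    // with begin = [0, 0, 1, 1], end = [1, 8, 3, 3], strides = [1, 1, 1, 1] and
+    // no mask bits set, the size loop below gives
+    // outputDimSize_i = ceil((end_i - begin_i) / strides_i) = [1, 8, 2, 2].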
+ maskStridedSliceParams(begin, beginMask); + // end_mask works analogously + maskStridedSliceParams(end, endMask); + + auto isShiftPossible = [] (std::vector& dims) -> bool { + if (dims[dims.size() - 1] == 1) + return true; + else + return false; + }; + + // If the new_axis_mask is set, then begin, end, and stride are ignored + if (std::find(desc->new_axis_mask.begin(), desc->new_axis_mask.end(), 1) == desc->new_axis_mask.end()) { + for (size_t i = 0; i < numberOfDims; ++i) { + int32_t outputDimSize = (end[i] - begin[i]) / strides[i]; + if ((end[i] - begin[i]) % strides[i] != 0) + outputDimSize++; + outputDimsSizes.push_back(outputDimSize); + } + } else { + outputDimsSizes = input_layout.size.sizes(format::bfyx); + for (size_t i = 0; i < desc->new_axis_mask.size(); ++i) + if (desc->new_axis_mask[desc->new_axis_mask.size() - i - 1] == 1) + if (isShiftPossible(outputDimsSizes)) { + for (size_t j = outputDimsSizes.size() - 1; j > i; --j) + outputDimsSizes[j] = outputDimsSizes[j - 1]; + outputDimsSizes[i] = 1; + } + } + + return layout{input_layout.data_type, input_format, tensor(outputDimsSizes[0], outputDimsSizes[1], outputDimsSizes[3], outputDimsSizes[2])}; +} + +std::string strided_slice_inst::to_string(strided_slice_node const& node) +{ + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + auto& input = node.input(); + + std::stringstream primitive_description; + + json_composite strided_slice_info; + strided_slice_info.add("input id", input.id()); + strided_slice_info.add("begin_param id", node.get_dependency(1).id()); + strided_slice_info.add("end_param id", node.get_dependency(2).id()); + strided_slice_info.add("stride_param id", node.get_dependency(3).id()); + strided_slice_info.add("begin mask", node.get_primitive()->begin_mask); + strided_slice_info.add("end mask", node.get_primitive()->end_mask); + strided_slice_info.add("new axis mask", node.get_primitive()->new_axis_mask); + strided_slice_info.add("shrink axis mask", node.get_primitive()->shrink_axis_mask); + strided_slice_info.add("begin_param shape", node.get_dependency(1).get_output_layout().size.to_string()); + strided_slice_info.add("end_param shape", node.get_dependency(2).get_output_layout().size.to_string()); + strided_slice_info.add("stride_param shape", node.get_dependency(3).get_output_layout().size.to_string()); + + node_info->add("strided_slice info", strided_slice_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +strided_slice_inst::typed_primitive_inst(network_impl& network, strided_slice_node const& node) + : parent(network, node) +{ +} + +} diff --git a/inference-engine/thirdparty/clDNN/src/tile.cpp b/inference-engine/thirdparty/clDNN/src/tile.cpp index 9e47b70..c592aa9 100644 --- a/inference-engine/thirdparty/clDNN/src/tile.cpp +++ b/inference-engine/thirdparty/clDNN/src/tile.cpp @@ -30,6 +30,8 @@ primitive_type_id tile_type_id() layout tile_inst::calc_output_layout(tile_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for tile_node!"); auto desc = node.get_primitive(); auto input_layout = node.input().get_output_layout(); diff --git a/inference-engine/thirdparty/clDNN/src/upsampling.cpp b/inference-engine/thirdparty/clDNN/src/upsampling.cpp index 75ca2f9..fa57f88 100644 --- a/inference-engine/thirdparty/clDNN/src/upsampling.cpp +++ b/inference-engine/thirdparty/clDNN/src/upsampling.cpp @@ -29,6 +29,8 @@ primitive_type_id upsampling_type_id() layout 
upsampling_inst::calc_output_layout(upsampling_node const& node) { + assert((bool)node.get_primitive()->output_data_type == false + && "Output data type forcing is not supported for upsampling_node!"); auto desc = node.get_primitive(); auto input_layout = node.input().get_output_layout(); auto scale = desc->scale; diff --git a/inference-engine/thirdparty/clDNN/tests/CMakeLists.txt b/inference-engine/thirdparty/clDNN/tests/CMakeLists.txt index 9ceaa22..7f906cd 100644 --- a/inference-engine/thirdparty/clDNN/tests/CMakeLists.txt +++ b/inference-engine/thirdparty/clDNN/tests/CMakeLists.txt @@ -15,15 +15,15 @@ # ========================================= Name / Output settings ===================================== -set(CLDNN_BUILD__PROJ "tests") +set(CLDNN_BUILD__PROJ "clDNN_unit_tests") set(CLDNN_BUILD__PROJ_LABEL "${CLDNN_BUILD__PROJ}") set(CLDNN_BUILD__PROJ_OUTPUT_NAME "${CLDNN_BUILD__PROJ}${CLDNN__OUT_CPU_SUFFIX}") # =========================================== Compiler options ========================================= - intel_config_flag_apply_settings(CompilerOptions CMAKE_CXX_FLAGS ALL_PATTERN "" SET WarnLevel3 + StandardCxx11 ) if (NOT MSVC) intel_config_flag_apply_settings(CompilerOptions CMAKE_CXX_FLAGS ALL_PATTERN "" diff --git a/inference-engine/thirdparty/clDNN/tests/module_tests/events_pool_test.cpp b/inference-engine/thirdparty/clDNN/tests/module_tests/events_pool_test.cpp new file mode 100644 index 0000000..c7509ff --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests/module_tests/events_pool_test.cpp @@ -0,0 +1,65 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + + + +#include +#include "api/CPP/engine.hpp" +#include "test_utils/test_utils.h" +#include "api/CPP/input_layout.hpp" +#include "api/CPP/network.hpp" + +using namespace tests; +using namespace cldnn; + +TEST(events_pool, DISABLED_basic_test) +{ + /* + This tests if the events pool works and there's no memory leak. 
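+    The three nested loops below stress it in depth: 20 engines are created, each
+    engine builds 20 networks, and each network executes 20 times. After every
+    engine's run the test expects get_max_used_device_memory_size() to still be
+    the same 80 bytes, i.e. repeated event and network churn must not grow the
+    device allocations.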
+ */ + auto batch_num = 1; + auto feature_num = 4; + auto x_size = 1; + auto y_size = 1; + + topology topology; + topology.add(input_layout("input", { data_types::f32, format::bfyx,{ tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num))}})); + topology.add(activation("relu", "input", activation_relu)); + topology.add(activation("relu1", "relu", activation_relu)); + topology.add(activation("relu2", "relu1", activation_relu)); + topology.add(activation("relu3", "relu2", activation_relu)); + topology.add(activation("relu4", "relu3", activation_relu)); + topology.add(activation("relu5", "relu4", activation_relu)); + + build_options bo; + bo.set_option(build_option::optimize_data(true)); + + for (int i = 0; i < 20; i++) + { + engine eng;// here we build new engine i times + auto input = memory::allocate(eng, { data_types::f32, format::bfyx,{ tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } }); + std::vector input_vec = { -1.f, 2.f, -3.f, 4.f }; + for (int j = 0; j < 20; j++) //then we build network j times + { + network network(eng, topology, bo); + network.set_input_data("input", input); + for(int k = 0; k < 20; k++) //and execute that network k times + network.execute(); + } + EXPECT_EQ(eng.get_max_used_device_memory_size(), (uint64_t)80); + eng.~engine(); + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/tests/module_tests/gpu_toolkit_test.cpp b/inference-engine/thirdparty/clDNN/tests/module_tests/gpu_toolkit_test.cpp index 7f6bbc0..e0e28a8 100644 --- a/inference-engine/thirdparty/clDNN/tests/module_tests/gpu_toolkit_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/module_tests/gpu_toolkit_test.cpp @@ -14,17 +14,123 @@ // limitations under the License. */ - - #include #include "api/CPP/engine.hpp" +#include "test_utils/test_utils.h" +#include "api/CPP/network.hpp" +#include "api/CPP/topology.hpp" +#include "api/CPP/input_layout.hpp" +#include "api/CPP/activation.hpp" +#include "api/C/input_layout.h" +#include "api/C/activation.h" +#include "api/C/cldnn.h" + +#include "test_utils.h" + +#define CL_HPP_ENABLE_EXCEPTIONS +#define CL_HPP_MINIMUM_OPENCL_VERSION 120 +#define CL_HPP_TARGET_OPENCL_VERSION 120 + +#if defined __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wmissing-braces" +#elif defined __GNUC__ && __GNUC__ >= 6 +#pragma GCC diagnostic ignored "-Wignored-attributes" +#endif + +#include using namespace cldnn; +class user_gpu_toolkit +{ +public: + user_gpu_toolkit() + { + get_platform_and_device(get_plaftorm()); + create_context_from_one_device(); + } + + cl_context get_gpu_context() const { return _gpu_context; } + +private: + cl_platform_id _platform_id; + cl_device_id _gpu_device; + cl_context _gpu_context; + + void create_context_from_one_device() + { + cl_int error = 0; + _gpu_context = clCreateContext(0, 1, &_gpu_device, 0, 0, &error); + if (error != CL_SUCCESS) + { + throw std::runtime_error("error creating context"); + } + } + + cl_platform_id get_plaftorm() + { + cl_uint n = 0; + cl_int err = clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + throw std::runtime_error("clGetPlatformIDs error " + std::to_string(err)); + } + + // Get platform list + std::vector platform_ids(n); + err = clGetPlatformIDs(n, platform_ids.data(), NULL); + if (err != CL_SUCCESS) { + throw std::runtime_error("clGetPlatformIDs error " + std::to_string(err)); + } + return platform_ids[0]; + } + + void get_platform_and_device(cl_platform_id platform_id) + { + _platform_id = platform_id; + 
cl_int err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &_gpu_device, 0); + if (err != CL_SUCCESS) { + throw std::runtime_error("clGetDeviceIDs error " + std::to_string(err)); + } + } +}; + TEST(gpu_engine, engine_info) { - engine engine; + const auto& engine = tests::get_test_engine(); auto info = engine.get_info(); EXPECT_GT(info.cores_count, 0u); EXPECT_GT(info.core_frequency, 0u); +} + +TEST(gpu_engine, DISABLED_user_context) +{ + user_gpu_toolkit gpu_toolkit; + cl_context user_context = gpu_toolkit.get_gpu_context(); + + //[0] Check if the user engine config works. + auto engine_config = cldnn::engine_configuration(false, false, false, "", "", true, "", "", cldnn::priority_mode_types::disabled, cldnn::throttle_mode_types::disabled, true, &user_context); + + //[1]Check if the engine creation works. + engine engine(engine_config); + auto info = engine.get_info(); + EXPECT_GT(info.cores_count, 0u); + EXPECT_GT(info.core_frequency, 0u); + + //[2]Now check if the queues works (run simple network). + topology topo; + auto inp_lay = cldnn::layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1,1,2,2}); + auto input_mem = cldnn::memory::allocate(engine, inp_lay); + tests::set_values(input_mem, { 1.0f, 2.0f, 3.0f, 4.0f }); + auto inp = input_layout("input", inp_lay); + auto activ = activation("this_needs_queue", "input", cldnn_activation_func::activation_abs); + topo.add(inp, activ); + network net(engine, topo); + + net.set_input_data("input", input_mem); + auto out = net.execute(); + auto out_ptr = out.at("this_needs_queue").get_memory().pointer(); + EXPECT_EQ(out.size(), size_t(1)); + for(uint32_t i = 0;i < 4; i++) + EXPECT_EQ(out_ptr[i], float(i+1)); } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/activation_grad_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/activation_grad_gpu_test.cpp index d051356..7cc7865 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/activation_grad_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/activation_grad_gpu_test.cpp @@ -43,7 +43,7 @@ TEST(activation_grad_f16_fw_gpu, basic_bfyx_all_functions) // a: 0.5, b: 2.5 // - engine engine; + const auto& engine = get_test_engine(); auto input_grad = memory::allocate(engine, { data_types::f16, format::bfyx,{ 1, 1, 5, 4 } }); auto input = memory::allocate(engine, { data_types::f16, format::bfyx,{ 1, 1, 5, 4 } }); @@ -142,7 +142,7 @@ TEST(activation_grad_f32_fw_gpu, basic_bfyx_all_functions) // a: 0.5, b: 2.5 // - engine engine; + const auto& engine = get_test_engine(); auto input_grad = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 5, 4 } }); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 5, 4 } }); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/activation_simple_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/activation_simple_gpu_test.cpp index e40de23..9ec8de1 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/activation_simple_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/activation_simple_gpu_test.cpp @@ -34,6 +34,61 @@ using namespace cldnn; using namespace tests; +TEST(activation_f32_fw_gpu, not_basic_yxfb) { + // Input: + // 1 0 -3 4 5 + // 0 2 3 4 -6 + // 3 -3 3 0 1 + // 1 1 1 -1 0 + // + // Output: + // 0, 1, 0, 0, 0, + // 1, 0, 0, 0, 0, + // 0, 0, 0, 1, 0, + // 0, 0, 0, 0, 1 + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, 
format::yxfb, { 1, 1, 5, 4 } }); + set_values(input, + { 1.0f, 0.0f, -3.0f, 4.0f, 5.0f, + 0.0f, 2.0f, 3.0f, 4.0f, -6.0f, + 3.0f, -3.0f, 3.0f, 0.0f, 1.0f, + 1.0f, 1.0f, 1.0f, -1.0f, 0.0f }); + VF output_vec = { + 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, + 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 1.0f }; + + topology topology( + input_layout("input", input.get_layout()), + activation("not", "input", activation_not)); + network network(engine, topology); + network.set_input_data("input", input); + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "not"); + + auto output_memory = outputs.at("not").get_memory(); + auto output_layout = output_memory.get_layout(); + auto output_ptr = output_memory.pointer(); + + int y_size = output_layout.size.spatial[1]; + int x_size = output_layout.size.spatial[0]; + int f_size = output_layout.size.feature[0]; + int b_size = output_layout.size.batch[0]; + EXPECT_EQ(output_layout.format, format::yxfb); + EXPECT_EQ(y_size, 4); + EXPECT_EQ(x_size, 5); + EXPECT_EQ(f_size, 1); + EXPECT_EQ(b_size, 1); + + for (size_t i = 0; i < output_vec.size(); ++i) { + EXPECT_FLOAT_EQ(output_vec[i], output_ptr[i]); + } +} + TEST(activation_f32_fw_gpu, relu_basic_yxfb) { // Input: // 1 -2 -3 4 5 @@ -49,7 +104,7 @@ TEST(activation_f32_fw_gpu, relu_basic_yxfb) { // 3 -1.5 3 5 1 // 1 1 1 -0.5 1 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 5, 4 } }); set_values(input, @@ -102,7 +157,7 @@ TEST(activation_f32_fw_gpu, basic_yxfb_all_functions) // a: 0.5, b: 2.5 // - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 5, 4 } }); auto input_params = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 1 } }); @@ -130,7 +185,8 @@ TEST(activation_f32_fw_gpu, basic_yxfb_all_functions) activation_cos, activation_cosh, activation_exp, - activation_log2, + activation_not, + activation_log2, }; cldnn_activation_additional_params params = { 0.5f, 2.5f }; @@ -229,12 +285,15 @@ TEST(activation_f32_fw_gpu, basic_yxfb_all_functions) case activation_exp: EXPECT_FLOAT_EQ(std::exp((float)input_ptr[i]), output_ptr[i]); break; - case activation_log2: - if (input_ptr[i] > 0) //logarithm exist only for positive real values + case activation_not: + EXPECT_FLOAT_EQ((float)(!input_ptr[i]), output_ptr[i]); + break; + case activation_log2: + if (input_ptr[i] > 0) //logarithm exist only for positive real values { - EXPECT_FLOAT_EQ(std::log2((float)input_ptr[i]), output_ptr[i]); + EXPECT_FLOAT_EQ(std::log2((float)input_ptr[i]), output_ptr[i]); } - break; + break; default: break; } @@ -245,7 +304,7 @@ TEST(activation_f32_fw_gpu, basic_yxfb_all_functions) TEST(activation_f32_fw_gpu, basic_yxfb_asin_acos_log) { - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 4 } }); set_values(input, { 0.12f, 0.56f, 0.45f, 0.789f, 0.546f, 0.999f, 0.7899f, 0.6677f}); @@ -328,7 +387,7 @@ TEST(activation_f32_fw_gpu, relu_basic_input_padding_yxfb) { // 3 -1.5 3 5 1 // 1 1 1 -0.5 1 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 5, 4 } }); @@ -394,7 +453,7 @@ TEST(activation_f32_fw_gpu, relu_basic_output_padding_yxfb) { // 0 0 0 0 0 0 0 0 0 0 0 // 0 0 0 0 0 0 0 0 0 0 0 - engine 
engine;
+    const auto& engine = get_test_engine();
 
     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 5, 4 } });
 
     set_values(input,
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/add_reorders_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/add_reorders_gpu_test.cpp
new file mode 100644
index 0000000..952f2c1
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/add_reorders_gpu_test.cpp
@@ -0,0 +1,213 @@
+/*
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#include
+#include "api/CPP/memory.hpp"
+#include
+#include
+#include
+#include "test_utils/test_utils.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+using namespace cldnn;
+using namespace tests;
+
+/*
+These tests are intended to check if additional reorders are being added properly during
+add_reorders optimization pass.
+*/
+
+//Input has incompatible format
+TEST(add_reorders_gpu, basic1) {
+    const auto& engine = get_test_engine();
+
+    auto input = memory::allocate(engine, { data_types::f32, format::fyxb,{ 2, 2, 3, 2 } }); //format unsupported by batch_norm!
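+    // fyxb is unsupported by batch_norm (see the comment above), so the add_reorders
+    // pass must inject a reorder in front of it; the primitive-count check below
+    // (5 = the 4 primitives declared here + 1 inserted reorder) verifies exactly that.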
+ auto mean = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto variance = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + + set_values(input, { + 1.f, 2.f, -10.f, + 3.f, 4.f, -14.f, + 5.f, 6.f, -12.f, + 7.f, 8.f, -16.f, + 0.f, 0.f, -11.f, + 0.5f, -0.5f, -15.f, + 1.5f, 5.2f, -13.f, + 12.f, 9.f, -17.f + }); + + set_values(mean, { 0.1f, 0.2f }); + set_values(variance, { 0.4f, 0.5f }); + + float epsilon = 1e-3f; + float expected_out[] = { + 1.42125f, 3.00042f, + -0.28256f, -0.28256f, + -15.94960f, 4.57958f, + -15.82340f, 0.42384f, + 6.15875f,-22.26620f, + -0.98896f,-21.47460f, + 7.73791f, 9.31708f, + 1.83664f, 7.06401f, + -19.1079f, 10.8962f, + -18.6490f, 16.6711f, + 12.4754f, -25.4246f, + 12.4327f, -24.3002f}; + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("mean", mean)); + topology.add(data("variance", variance)); + topology.add(batch_norm("batch_norm", "input", "mean", "variance", epsilon)); + + network network(engine, topology); // without additional reorders we would get an exception here + network.set_input_data("input", input); + + EXPECT_EQ(network.get_all_primitive_org_ids().size(), size_t(5)); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory().pointer(); + for (int i = 0; i < 2 * 2 * 3 * 2; i++) + { + EXPECT_NEAR(expected_out[i], output[i], epsilon); + } +} + +//concatenation of incompatible convolutions +TEST(add_reorders_gpu, two_convolutions_and_concatenation) { + const auto& engine = get_test_engine(); + build_options build_opt; + build_opt.set_option(build_option::optimize_data(false)); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 2 } }); + auto weights1 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 1, 2 } }); + auto weights2 = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 1, 1, 2 } }); + + set_values(input, { 1.1f, 1.2f, 1.3f, 1.4f }); + set_values(weights1, { 2.1f, 3.1f}); + set_values(weights2, { 1.1f, 0.1f}); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("weights1", weights1)); + topology.add(data("weights2", weights2)); + + topology.add(cldnn::convolution("conv1", { "input" }, { "weights1" })); + topology.add(cldnn::reorder("reorder", "input", cldnn::layout(data_types::f32, format::byxf, 4))); + topology.add(cldnn::convolution("conv2", { "reorder" }, { "weights2" })); + + topology.add(cldnn::concatenation("concat", { "conv1", "conv2" }, cldnn::concatenation::along_f)); + + network network(engine, topology, build_opt); + network.set_input_data("input", input); + + //concatenation accepts inputs in different formats, so no reorders should be added here + EXPECT_EQ(network.get_all_primitive_org_ids().size(), size_t(7)); + auto outputs = network.execute(); + + float expected_out[] = { 6.34f, 1.34f, 6.86f, 1.46f }; + float epsilon = 1e-3f; + + for (auto& it : outputs) + { + auto output = it.second.get_memory().pointer(); + for (size_t cntr = 0; cntr < 2 * 2; cntr++) + { + EXPECT_NEAR(expected_out[cntr], output[cntr], epsilon); + } + } +} + +template +void tile_ref(const memory& input, memory& output, tile::tile_axis axis, int num_tiles) +{ + auto get_sizes = [](const tensor& size, tile::tile_axis axis) -> std::pair + { + switch (axis) + { + case tile::along_b: return std::make_pair(1, size.batch[0] * size.feature[0] * size.spatial[1] * size.spatial[0]); + case tile::along_f: return 
std::make_pair(size.batch[0], size.feature[0] * size.spatial[1] * size.spatial[0]); + case tile::along_y: return std::make_pair(size.batch[0] * size.feature[0], size.spatial[1] * size.spatial[0]); + case tile::along_x: return std::make_pair(size.batch[0] * size.feature[0] * size.spatial[1], size.spatial[0]); + default: throw std::invalid_argument("Invalid axis(" + std::to_string(static_cast(axis)) + ") in tile ref version"); + } + }; + + const pointer src = input.pointer(); + pointer dst = output.pointer(); + + const data_t* psrc = src.data(); + data_t* pdst = dst.data(); + + auto sizes = get_sizes(input.get_layout().size, axis); + int outer_dim = sizes.first; + int inner_dim = sizes.second; + + for (int i = 0; i < outer_dim; i++) + { + for (int t = 0; t < num_tiles; t++) + { + for (int j = 0; j < inner_dim; j++) + { + pdst[j] = psrc[j]; + } + pdst += inner_dim; + } + psrc += inner_dim; + } +} + +TEST(add_reorders_gpu, basic_reshape_and_tile) { + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 2, 2, 1 } }); + auto output_ref = memory::allocate(engine, { data_types::f32, format::byxf,{ 2, 1, 4, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(reshape("reshape", "input", tensor(2, 1, 2, 1))); + topology.add(tile("tile", "reshape", tile::along_y, 4)); + + std::vector input_vec = { 1.f, 0.f, 5.f, 1.5f }; + set_values(input, input_vec); + tile_ref(input, output_ref, tile::along_y, 4); + + network network(engine, topology); + network.set_input_data("input", input); + + //reorder is required as tile accepts only bfyx format + EXPECT_EQ(network.get_all_primitive_org_ids().size(), size_t(4)); + auto outputs = network.execute(); + + auto output = outputs.at("tile").get_memory(); + auto output_ptr = output.pointer(); + auto output_ref_ptr = output_ref.pointer(); + + for (unsigned int i = 0; i < output_ref.count(); ++i) { + EXPECT_EQ(output_ptr[i], output_ref_ptr[i]); + } +} diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/apply_adam_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/apply_adam_gpu_test.cpp index d0af068..6d2250c 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/apply_adam_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/apply_adam_gpu_test.cpp @@ -34,7 +34,7 @@ using namespace tests; TEST(apply_adam_gpu, basic_in2x2x3x2_bfyx) { // Test creates topology with two apply adam primitives (t = [0, 1]) with the same output variable which is updated. 
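For readers unfamiliar with the primitive under test: one standard Adam update step, as a sketch (textbook formulation with illustrative defaults; clDNN's kernels and parameter plumbing differ):

// Sketch of a single Adam step for one scalar variable; t counts from 0 as in the test.
#include <cmath>

struct AdamState { float m = 0.f, v = 0.f; };

inline float adam_step(float var, float grad, AdamState& s, int t,
                       float lr = 0.001f, float beta1 = 0.9f,
                       float beta2 = 0.999f, float eps = 1e-8f)
{
    s.m = beta1 * s.m + (1.f - beta1) * grad;        // first-moment estimate
    s.v = beta2 * s.v + (1.f - beta2) * grad * grad; // second-moment estimate
    const float m_hat = s.m / (1.f - std::pow(beta1, t + 1)); // bias correction
    const float v_hat = s.v / (1.f - std::pow(beta2, t + 1));
    return var - lr * m_hat / (std::sqrt(v_hat) + eps);
}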
- engine engine; + const auto& engine = get_test_engine(); auto input_grad = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); auto var = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/arg_max_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/arg_max_gpu_test.cpp index 42471e1..b66cfae 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/arg_max_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/arg_max_gpu_test.cpp @@ -33,7 +33,7 @@ using namespace tests; TEST(arg_max_gpu, base) { // Input : 2x3x2x2 static const int32_t x_size = 2, y_size = 2, feature_num = 3, batch_num = 2; - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ batch_num, feature_num, x_size , y_size } }); topology topology; @@ -85,7 +85,7 @@ TEST(arg_max_gpu, base) { TEST(arg_max_gpu_batch_one, base) { // Input : 2x3x2x2 static const int32_t x_size = 2, y_size = 2, feature_num = 5, batch_num = 1, top_k = 8; - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ batch_num, feature_num, x_size , y_size } }); topology topology; @@ -164,7 +164,7 @@ TEST(arg_max_gpu_batch_one, base) { TEST(arg_max_gpu_top_k, base) { // Input : 2x3x2x2 static const int32_t x_size = 2, y_size = 2, feature_num = 5, batch_num = 2; - engine engine; + const auto& engine = get_test_engine(); const int top_k = 8; auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ batch_num, feature_num, x_size , y_size } }); topology topology; @@ -249,7 +249,7 @@ TEST(arg_max_gpu_min, base) { // Input : 2x3x2x2 static const int32_t x_size = 2, y_size = 2, feature_num = 4, batch_num = 2; - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ batch_num, feature_num, x_size , y_size } }); topology topology; @@ -303,7 +303,7 @@ TEST(arg_max_gpu_min, base) { TEST(arg_max_gpu_min_top_k, base) { // Input : 2x3x2x2 static const int32_t x_size = 2, y_size = 2, feature_num = 4, batch_num = 2; - engine engine; + const auto& engine = get_test_engine(); const int top_k = 3; auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ batch_num, feature_num, x_size , y_size } }); topology topology; @@ -385,7 +385,7 @@ TEST(arg_max_gpu_min_top_k, base) { TEST(arg_max_gpu_min_axis_batch, base) { // Input : 2x3x2x2 static const int32_t x_size = 2, y_size = 2, feature_num = 4, batch_num = 2; - engine engine; + const auto& engine = get_test_engine(); const int top_k = 2; auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ batch_num, feature_num, x_size , y_size } }); topology topology; diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/average_unpooling_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/average_unpooling_gpu_test.cpp index bb30293..d537c89 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/average_unpooling_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/average_unpooling_gpu_test.cpp @@ -52,7 +52,7 @@ TEST(average_unpooling_gpu, basic_in2x2x2x1) { // f1: b0: 1.5 2.5 1 b1: 1.75 2.9375 1.1875 // f1: b0: 1.5 2.5 1 b1: 1.75 2.9375 1.1875 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 
2, 1 } }); @@ -119,7 +119,7 @@ TEST(average_unpooling_gpu, basic_in2x2x3x2_with_average_pooling_unpooling) { // f1: b0: 1.5 1.5 0.5 b1: 1.75 1.75 1 // f1: b0: 1.5 1.5 0.5 b1: 1.75 1.75 1 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } }); @@ -191,7 +191,7 @@ TEST(average_unpooling_gpu, basic_in2x2x2x1_output_padding) { // f0: b0: 0.625 -0.5 -1.125 b1: 0 -1.6875 -1.6875 // f1: b0: 1.5 2.5 1 b1: 1.75 2.9375 1.1875 // f1: b0: 1.5 2.5 1 b1: 1.75 2.9375 1.1875 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 1 } }); @@ -272,7 +272,7 @@ TEST(average_unpooling_gpu, basic_in2x2x2x1_fp16) { // f1: b0: 1.5 2.5 1 b1: 1.75 2.9375 1.1875 // f1: b0: 1.5 2.5 1 b1: 1.75 2.9375 1.1875 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f16, format::bfyx,{ 2, 2, 2, 1 } }); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/batch_norm_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/batch_norm_gpu_test.cpp index ddc19ff..0de6c66 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/batch_norm_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/batch_norm_gpu_test.cpp @@ -25,6 +25,7 @@ #include "test_utils/test_utils.h" #include #include +#include using namespace cldnn; using namespace tests; @@ -49,7 +50,7 @@ TEST(batch_normalization_gpu, basic_in2x3x2x2) { // f1: 107.0624 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 3, 2 } }); auto mean = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 2, 1, 1 } }); @@ -103,6 +104,102 @@ TEST(batch_normalization_gpu, basic_in2x3x2x2) { } } +TEST(batch_normalization_gpu, basic_in2x3x2x2_scale_shift) { + // Mean : 3x2x2 + // Input : 2x3x2x2 + // Output : 2x3x2x2 + + // Input: + // f0: b0: 1 2 -10 b1: 0 0 -11 + // f0: b0: 3 4 -14 b1: 0.5 -0.5 -15 + // f1: b0: 5 6 -12 b1: 1.5 5.2 -13 + // f1: b0: 7 8 -16 b1: 12 9 -17 + // + // Mean + // f0: -3.3333 + // f1: -0.3583 + // + // Variance + // f0: 44.9305 + // f1: 107.0624 + // + // Scale + // f0: 2.0 + // f1: 1.0 + // + // Shift + // f0: 0.0 + // f1: 5.0 + + + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 3, 2 } }); + auto mean = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto variance = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto scale = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto shift = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + + float epsilon = 0.0001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("mean", mean)); + topology.add(data("variance", variance)); + topology.add(data("scale", scale)); + topology.add(data("shift", shift)); + topology.add(batch_norm("batch_norm", "input", "mean", "variance", "scale", "shift", epsilon)); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + -10.f, -11.f, -12.f, -13.f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 9.f, + -14.f, -15.f, -16.f, -17.f + }); + + set_values(mean, { -3.3333f, -0.3583f }); + set_values(variance, { 44.9305f, 107.0624f }); + set_values(scale, { 2.f, 1.f }); + 
set_values(shift, { 0.f, 5.f }); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + for (int j = 0; j < 2; ++j) { //F + float sum = 0, var = 0; + + auto scalep = scale.pointer(); + auto shiftp = shift.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + + for (int i = 0; i < 2; ++i) { //B + for (int k = 0; k < 2; ++k) { //Y + for (int l = 0; l < 3; ++l) { //X + float data = output_ptr[i + 2 * j + 2 * 2 * l + 2 * 2 * 3 * k]; + data = (data - shiftf) / scalef; + sum += data; + var += data * data; + } + } + } + sum /= 2 * 3 * 2; + var /= 2 * 3 * 2; + + EXPECT_NEAR(sum, 0, 1e-03F); + EXPECT_NEAR(var, 1, 1e-03F); + } +} + TEST(batch_normalization_gpu, basic_in2x3x2x2_with_var_mean_calc) { // Mean : 3x2x2 // Input : 2x3x2x2 @@ -123,17 +220,16 @@ TEST(batch_normalization_gpu, basic_in2x3x2x2_with_var_mean_calc) { // f1: 107.0624 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 3, 2 } }); - auto mean = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); auto inv_variance = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); float epsilon = 0.0001f; topology topology; topology.add(input_layout("input", input.get_layout())); - topology.add(data("inv_variance", inv_variance)); + topology.add(mutable_data("inv_variance", inv_variance)); topology.add(batch_norm("batch_norm", "input", epsilon, "inv_variance")); set_values(input, { @@ -173,7 +269,7 @@ TEST(batch_normalization_gpu, basic_in2x3x2x2_with_var_mean_calc) { } } -TEST(batch_normalization_gpu, basic_in2x2x3x2_bfyx) { +TEST(batch_normalization_gpu, basic_in2x3x2x2_with_var_mean_calc_no_inv_var) { // Mean : 3x2x2 // Input : 2x3x2x2 // Output : 2x3x2x2 @@ -193,36 +289,198 @@ TEST(batch_normalization_gpu, basic_in2x2x3x2_bfyx) { // f1: 107.0624 - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } }); - auto mean = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 1, 1 } }); - auto variance = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 1, 1 } }); + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 3, 2 } }); float epsilon = 0.0001f; topology topology; topology.add(input_layout("input", input.get_layout())); - topology.add(data("mean", mean)); - topology.add(data("variance", variance)); - topology.add(batch_norm("batch_norm", "input", "mean", "variance", epsilon)); + topology.add(batch_norm("batch_norm", "input", epsilon)); set_values(input, { - 1.f, 2.f, -10.f, 3.f, - 4.f, -14.f, 5.f, 6.f, - -12.f, 7.f, 8.f, -16.f, - 0.f, 0.f, -11.f, 0.5f, - -0.5f, -15.f, 1.5f, 5.2f, - -13.f, 12.f, 9.f, -17.f + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + -10.f, -11.f, -12.f, -13.f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 9.f, + -14.f, -15.f, -16.f, -17.f }); + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + for (int j = 0; j < 2; ++j) { //F + float sum = 0, var = 0; + for (int i = 0; i < 2; ++i) { //B + for (int k = 0; k < 2; ++k) { //Y + for (int l = 0; l < 3; ++l) { //X + float data = output_ptr[i + 2 * j + 2 * 2 * l + 2 * 2 * 
3 * k]; + sum += data; + var += data * data; + } + } + } + sum /= 2 * 3 * 2; + var /= 2 * 3 * 2; + + EXPECT_NEAR(sum, 0, 1e-03F); + EXPECT_NEAR(var, 1, 1e-03F); + } +} + +TEST(batch_normalization_gpu, basic_in2x3x2x2_with_var_mean_calc_scale_shift) { + // Mean : 3x2x2 + // Input : 2x3x2x2 + // Output : 2x3x2x2 + + // Input: + // f0: b0: 1 2 -10 b1: 0 0 -11 + // f0: b0: 3 4 -14 b1: 0.5 -0.5 -15 + // f1: b0: 5 6 -12 b1: 1.5 5.2 -13 + // f1: b0: 7 8 -16 b1: 12 9 -17 + // + // Mean + // f0: -3.3333 + // f1: -0.3583 + // + // Variance + // f0: 44.9305 + // f1: 107.0624 + // + // Scale + // f0: 2.0 + // f1: 1.0 + // + // Shift + // f0: 0.0 + // f1: 5.0 + + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 3, 2 } }); + auto mean = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto scale = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto shift = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto inv_variance = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + + float epsilon = 0.0001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("scale", scale)); + topology.add(data("shift", shift)); + topology.add(mutable_data("inv_variance", inv_variance)); + topology.add(batch_norm("batch_norm", "input", epsilon, "scale", "shift", "inv_variance")); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + -10.f, -11.f, -12.f, -13.f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 9.f, + -14.f, -15.f, -16.f, -17.f + }); + + set_values(scale, { 2.f, 1.f }); + set_values(shift, { 0.f, 5.f }); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + for (int j = 0; j < 2; ++j) { //F + float sum = 0, var = 0; + + auto scalep = scale.pointer(); + auto shiftp = shift.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + + for (int i = 0; i < 2; ++i) { //B + for (int k = 0; k < 2; ++k) { //Y + for (int l = 0; l < 3; ++l) { //X + float data = output_ptr[i + 2 * j + 2 * 2 * l + 2 * 2 * 3 * k]; + data = (data - shiftf) / scalef; + sum += data; + var += data * data; + } + } + } + sum /= 2 * 3 * 2; + var /= 2 * 3 * 2; + + EXPECT_NEAR(sum, 0, 1e-03F); + EXPECT_NEAR(var, 1, 1e-03F); + } +} + +TEST(batch_normalization_gpu, basic_in2x3x2x2_with_var_mean_calc_scale_shift_no_inv_var) { + // Mean : 3x2x2 + // Input : 2x3x2x2 + // Output : 2x3x2x2 + + // Input: // f0: b0: 1 2 -10 b1: 0 0 -11 // f0: b0: 3 4 -14 b1: 0.5 -0.5 -15 // f1: b0: 5 6 -12 b1: 1.5 5.2 -13 // f1: b0: 7 8 -16 b1: 12 9 -17 + // + // Mean + // f0: -3.3333 + // f1: -0.3583 + // + // Variance + // f0: 44.9305 + // f1: 107.0624 + // + // Scale + // f0: 2.0 + // f1: 1.0 + // + // Shift + // f0: 0.0 + // f1: 5.0 - set_values(mean, { -3.3333f, -0.3583f }); - set_values(variance, { 44.9305f, 107.0624f }); + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 3, 2 } }); + auto scale = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto shift = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + + float epsilon = 0.0001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + 
topology.add(data("scale", scale)); + topology.add(data("shift", shift)); + topology.add(batch_norm("batch_norm", "input", epsilon, "scale", "shift")); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + -10.f, -11.f, -12.f, -13.f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 9.f, + -14.f, -15.f, -16.f, -17.f + }); + + set_values(scale, { 2.f, 1.f }); + set_values(shift, { 0.f, 5.f }); network network(engine, topology); @@ -235,10 +493,17 @@ TEST(batch_normalization_gpu, basic_in2x2x3x2_bfyx) { for (int j = 0; j < 2; ++j) { //F float sum = 0, var = 0; + + auto scalep = scale.pointer(); + auto shiftp = shift.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + for (int i = 0; i < 2; ++i) { //B for (int k = 0; k < 2; ++k) { //Y for (int l = 0; l < 3; ++l) { //X - float data = output_ptr[l + k * 3 + j * 2 * 3 + i * 2 * 2 * 3]; + float data = output_ptr[i + 2 * j + 2 * 2 * l + 2 * 2 * 3 * k]; + data = (data - shiftf) / scalef; sum += data; var += data * data; } @@ -252,12 +517,269 @@ TEST(batch_normalization_gpu, basic_in2x2x3x2_bfyx) { } } -TEST(batch_normalization_gpu, basic_in2x2x3x2_bfyx_padding) { +TEST(batch_normalization_gpu, basic_in2x3x2x2_with_var_mean_outputs) { + // Mean : 3x2x2 + // Input : 2x3x2x2 + // Output : 2x3x2x2 + + // Input: + // f0: b0: 1 2 -10 b1: 0 0 -11 + // f0: b0: 3 4 -14 b1: 0.5 -0.5 -15 + // f1: b0: 5 6 -12 b1: 1.5 5.2 -13 + // f1: b0: 7 8 -16 b1: 12 9 -17 + // + // Mean (to be calculated) + // f0: -3.3333 + // f1: -0.3583 + // + // Variance (to be calculated) + // f0: 44.9305 + // f1: 107.0624 + // + // Scale + // f0: 2.0 + // f1: 1.0 + // + // Shift + // f0: 0.0 + // f1: 5.0 + + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 3, 2 } }); + auto mean_out = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto variance_out = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto scale = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto shift = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto inv_variance = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + + float epsilon = 0.0001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("scale", scale)); + topology.add(data("shift", shift)); + topology.add(mutable_data("mean_out", mean_out)); + topology.add(mutable_data("variance_out", variance_out)); + topology.add(mutable_data("inv_variance", inv_variance)); + topology.add(batch_norm("batch_norm", "input", epsilon, "mean_out", "variance_out", "scale", "shift", "inv_variance")); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + -10.f, -11.f, -12.f, -13.f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 9.f, + -14.f, -15.f, -16.f, -17.f + }); + + set_values(scale, { 2.f, 1.f }); + set_values(shift, { 0.f, 5.f }); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + std::vector mean_ref = { -3.3333f, -0.3583f }; + std::vector val_ref = { 44.9305f, 107.0624f }; + + for (int j = 0; j < 2; ++j) { //F + float sum = 0, var = 0; + + auto scalep = scale.pointer(); + auto shiftp = shift.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + + auto meanp = mean_out.pointer(); + 
auto varp = variance_out.pointer(); + float meanf = meanp[j]; + float varf = varp[j]; + + for (int i = 0; i < 2; ++i) { //B + for (int k = 0; k < 2; ++k) { //Y + for (int l = 0; l < 3; ++l) { //X + float data = output_ptr[i + 2 * j + 2 * 2 * l + 2 * 2 * 3 * k]; + data = (data - shiftf) / scalef; + sum += data; + var += data * data; + } + } + } + sum /= 2 * 3 * 2; + var /= 2 * 3 * 2; + + EXPECT_NEAR(sum, 0, 1e-03F); + EXPECT_NEAR(var, 1, 1e-03F); + + EXPECT_NEAR(meanf, mean_ref[j], 1e-03F); + EXPECT_NEAR(varf, val_ref[j], 1e-03F); + } +} + +TEST(batch_normalization_gpu, basic_in2x3x2x2_with_var_mean_outputs_no_inv_var) { + // Mean : 3x2x2 + // Input : 2x3x2x2 + // Output : 2x3x2x2 + + // Input: + // f0: b0: 1 2 -10 b1: 0 0 -11 + // f0: b0: 3 4 -14 b1: 0.5 -0.5 -15 + // f1: b0: 5 6 -12 b1: 1.5 5.2 -13 + // f1: b0: 7 8 -16 b1: 12 9 -17 + // + // Mean (to be calculated) + // f0: -3.3333 + // f1: -0.3583 + // + // Variance (to be calculated) + // f0: 44.9305 + // f1: 107.0624 + // + // Scale + // f0: 2.0 + // f1: 1.0 + // + // Shift + // f0: 0.0 + // f1: 5.0 + + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 3, 2 } }); + auto mean_out = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto variance_out = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto scale = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto shift = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + + float epsilon = 0.0001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("scale", scale)); + topology.add(data("shift", shift)); + topology.add(mutable_data("mean_out", mean_out)); + topology.add(mutable_data("variance_out", variance_out)); + topology.add(batch_norm("batch_norm", "input", epsilon, "mean_out", "variance_out", "scale", "shift")); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + -10.f, -11.f, -12.f, -13.f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 9.f, + -14.f, -15.f, -16.f, -17.f + }); + + set_values(scale, { 2.f, 1.f }); + set_values(shift, { 0.f, 5.f }); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + std::vector mean_ref = { -3.3333f, -0.3583f }; + std::vector val_ref = { 44.9305f, 107.0624f }; + + for (int j = 0; j < 2; ++j) { //F + float sum = 0, var = 0; + + auto scalep = scale.pointer(); + auto shiftp = shift.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + + auto meanp = mean_out.pointer(); + auto varp = variance_out.pointer(); + float meanf = meanp[j]; + float varf = varp[j]; + + for (int i = 0; i < 2; ++i) { //B + for (int k = 0; k < 2; ++k) { //Y + for (int l = 0; l < 3; ++l) { //X + float data = output_ptr[i + 2 * j + 2 * 2 * l + 2 * 2 * 3 * k]; + data = (data - shiftf) / scalef; + sum += data; + var += data * data; + } + } + } + sum /= 2 * 3 * 2; + var /= 2 * 3 * 2; + + EXPECT_NEAR(sum, 0, 1e-03F); + EXPECT_NEAR(var, 1, 1e-03F); + + EXPECT_NEAR(meanf, mean_ref[j], 1e-03F); + EXPECT_NEAR(varf, val_ref[j], 1e-03F); + } +} + +TEST(batch_normalization_gpu, basic_in2x3x2x2_with_var_mean_outputs_error_out_type) { + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 3, 2 } }); + 
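+ // Negative test: mean_out, variance_out and inv_variance are added below as
+ // read-only data() primitives rather than mutable_data(), so constructing
+ // the network is expected to fail (see the EXPECT_ANY_THROW at the end).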
auto mean_out = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto variance_out = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto scale = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto shift = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto inv_variance = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + + float epsilon = 0.0001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("scale", scale)); + topology.add(data("shift", shift)); + topology.add(data("mean_out", mean_out)); + topology.add(data("variance_out", variance_out)); + topology.add(data("inv_variance", inv_variance)); + topology.add(batch_norm("batch_norm", "input", epsilon, "mean_out", "variance_out", "scale", "shift", "inv_variance")); + + EXPECT_ANY_THROW(network(engine, topology)); +} + +TEST(batch_normalization_gpu, basic_in2x3x2x2_with_var_mean_outputs_error_non_equal_types) { + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 3, 2 } }); + auto mean_out = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto variance_out = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto scale = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto shift = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto inv_variance = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + + float epsilon = 0.0001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("scale", scale)); + topology.add(data("shift", shift)); + topology.add(data("mean_out", mean_out)); + topology.add(mutable_data("variance_out", variance_out)); + topology.add(mutable_data("inv_variance", inv_variance)); + topology.add(batch_norm("batch_norm", "input", epsilon, "mean_out", "variance_out", "scale", "shift", "inv_variance")); + + EXPECT_ANY_THROW(network(engine, topology)); +} + + +TEST(batch_normalization_gpu, basic_in2x2x3x2_bfyx) { // Mean : 3x2x2 // Input : 2x3x2x2 // Output : 2x3x2x2 - // Input padding : 1x2 - // Output padding : 2x1 // Input: // f0: b0: 1 2 -10 b1: 0 0 -11 @@ -274,7 +796,7 @@ TEST(batch_normalization_gpu, basic_in2x2x3x2_bfyx_padding) { // f1: 107.0624 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } }); auto mean = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 1, 1 } }); @@ -286,8 +808,7 @@ TEST(batch_normalization_gpu, basic_in2x2x3x2_bfyx_padding) { topology.add(input_layout("input", input.get_layout())); topology.add(data("mean", mean)); topology.add(data("variance", variance)); - topology.add(reorder("reorder", "input", input.get_layout().with_padding({ { 0, 0, 1, 2 }, 0 }))); - topology.add(batch_norm("batch_norm", "reorder", "mean", "variance", epsilon, padding({ 0, 0, 2, 1 }, 0))); + topology.add(batch_norm("batch_norm", "input", "mean", "variance", epsilon)); set_values(input, { 1.f, 2.f, -10.f, 3.f, @@ -320,7 +841,7 @@ TEST(batch_normalization_gpu, basic_in2x2x3x2_bfyx_padding) { for (int i = 0; i < 2; ++i) { //B for (int k = 0; k < 2; ++k) { //Y for (int l = 0; l < 3; ++l) { //X - float data = output_ptr[l + 2 + 7 * (k + 1 + 4 * (j + 2 * i))]; + float 
data = output_ptr[l + k * 3 + j * 2 * 3 + i * 2 * 2 * 3]; sum += data; var += data * data; } @@ -332,4 +853,1854 @@ TEST(batch_normalization_gpu, basic_in2x2x3x2_bfyx_padding) { EXPECT_NEAR(sum, 0, 1e-03F); EXPECT_NEAR(var, 1, 1e-03F); } -} \ No newline at end of file +} + +TEST(batch_normalization_gpu, basic_in2x2x3x2_bfyx_padding) { + // Mean : 3x2x2 + // Input : 2x3x2x2 + // Output : 2x3x2x2 + // Input padding : 1x2 + // Output padding : 2x1 + + // Input: + // f0: b0: 1 2 -10 b1: 0 0 -11 + // f0: b0: 3 4 -14 b1: 0.5 -0.5 -15 + // f1: b0: 5 6 -12 b1: 1.5 5.2 -13 + // f1: b0: 7 8 -16 b1: 12 9 -17 + // + // Mean + // f0: -3.3333 + // f1: -0.3583 + // + // Variance + // f0: 44.9305 + // f1: 107.0624 + + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } }); + auto mean = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 1, 1 } }); + auto variance = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 1, 1 } }); + + float epsilon = 0.0001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("mean", mean)); + topology.add(data("variance", variance)); + topology.add(reorder("reorder", "input", input.get_layout().with_padding({ { 0, 0, 1, 2 }, 0 }))); + topology.add(batch_norm("batch_norm", "reorder", "mean", "variance", epsilon, padding({ 0, 0, 2, 1 }, 0))); + + set_values(input, { + 1.f, 2.f, -10.f, 3.f, + 4.f, -14.f, 5.f, 6.f, + -12.f, 7.f, 8.f, -16.f, + 0.f, 0.f, -11.f, 0.5f, + -0.5f, -15.f, 1.5f, 5.2f, + -13.f, 12.f, 9.f, -17.f + }); + + // f0: b0: 1 2 -10 b1: 0 0 -11 + // f0: b0: 3 4 -14 b1: 0.5 -0.5 -15 + // f1: b0: 5 6 -12 b1: 1.5 5.2 -13 + // f1: b0: 7 8 -16 b1: 12 9 -17 + + set_values(mean, { -3.3333f, -0.3583f }); + set_values(variance, { 44.9305f, 107.0624f }); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + for (int j = 0; j < 2; ++j) { //F + float sum = 0, var = 0; + for (int i = 0; i < 2; ++i) { //B + for (int k = 0; k < 2; ++k) { //Y + for (int l = 0; l < 3; ++l) { //X + float data = output_ptr[l + 2 + 7 * (k + 1 + 4 * (j + 2 * i))]; + sum += data; + var += data * data; + } + } + } + sum /= 2 * 3 * 2; + var /= 2 * 3 * 2; + + EXPECT_NEAR(sum, 0, 1e-03F); + EXPECT_NEAR(var, 1, 1e-03F); + } +} + +TEST(batch_normalization_gpu, basic_to_string) { + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 3, 2 } }); + + auto mean = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto variance = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + + auto scale = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto shift = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + + auto inv_variance = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + + auto mean_out = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto variance_out = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + + float epsilon = 0.0001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + + topology.add(data("mean", mean)); + topology.add(data("variance", variance)); + + topology.add(data("scale", scale)); + 
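+ // The batch_norm0..batch_norm7 primitives below cover the eight batch_norm
+ // constructor variants; the test only asserts that get_primitive_info()
+ // returns a non-empty description for each of them.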
topology.add(data("shift", shift)); + + topology.add(mutable_data("inv_variance", inv_variance)); + + topology.add(mutable_data("mean_out", mean_out)); + topology.add(mutable_data("variance_out", variance_out)); + + topology.add(batch_norm("batch_norm0", "input", "mean", "variance", epsilon)); + topology.add(batch_norm("batch_norm1", "input", "mean", "variance", "scale", "shift", epsilon)); + topology.add(batch_norm("batch_norm2", "input", epsilon)); + topology.add(batch_norm("batch_norm3", "input", epsilon, "inv_variance")); + topology.add(batch_norm("batch_norm4", "input", epsilon, "scale", "shift")); + topology.add(batch_norm("batch_norm5", "input", epsilon, "scale", "shift", "inv_variance")); + topology.add(batch_norm("batch_norm6", "input", epsilon, "mean_out", "variance_out", "scale", "shift" )); + topology.add(batch_norm("batch_norm7", "input", epsilon, "mean_out", "variance_out", "scale", "shift", "inv_variance")); + + network network(engine, topology); + + size_t zero_length = 0; + + EXPECT_NE(network.get_primitive_info("batch_norm0").length(), zero_length); + EXPECT_NE(network.get_primitive_info("batch_norm1").length(), zero_length); + EXPECT_NE(network.get_primitive_info("batch_norm2").length(), zero_length); + EXPECT_NE(network.get_primitive_info("batch_norm3").length(), zero_length); + EXPECT_NE(network.get_primitive_info("batch_norm4").length(), zero_length); + EXPECT_NE(network.get_primitive_info("batch_norm5").length(), zero_length); + EXPECT_NE(network.get_primitive_info("batch_norm6").length(), zero_length); + EXPECT_NE(network.get_primitive_info("batch_norm7").length(), zero_length); +} + + +TEST(batch_normalization_gpu, basic_in2x3x2x2_yxfb_scale_shift_different_shapes) { + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 3, 2 } }); + auto mean = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 1, 1, 1 } }); + auto variance = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto scale = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 1 } }); + auto shift = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 1, 2 } }); + + float epsilon = 0.0001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("mean", mean)); + topology.add(data("variance", variance)); + topology.add(data("scale", scale)); + topology.add(data("shift", shift)); + topology.add(batch_norm("batch_norm", "input", "mean", "variance", "scale", "shift", epsilon)); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + -10.f, -11.f, -12.f, -13.f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 9.f, + -14.f, -15.f, -16.f, -17.f + }); + + set_values(mean, { -3.3333f, -0.3583f }); + set_values(variance, { 44.9305f, 107.0624f }); + set_values(scale, { 2.f, 1.f }); + set_values(shift, { 0.f, 5.f }); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + for (int j = 0; j < 2; ++j) { //F + float sum = 0, var = 0; + + auto scalep = scale.pointer(); + auto shiftp = shift.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + + for (int i = 0; i < 2; ++i) { //B + for (int k = 0; k < 2; ++k) { //Y + for (int l = 0; l < 3; ++l) { //X + float data = output_ptr[i + 2 * j + 2 * 2 * l + 2 * 2 * 3 * k]; + data = (data - shiftf) / scalef; + 
sum += data; + var += data * data; + } + } + } + sum /= 2 * 3 * 2; + var /= 2 * 3 * 2; + + EXPECT_NEAR(sum, 0, 1e-03F); + EXPECT_NEAR(var, 1, 1e-03F); + } +} + +TEST(batch_normalization_gpu, basic_in2x3x2x2_yxfb_scale_shift_different_shapes_input_layouts) { + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 3, 2 } }); + auto mean = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 1, 1, 1 } }); + auto variance = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto scale = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 1 } }); + auto shift = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 1, 2 } }); + + float epsilon = 0.0001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("mean", mean.get_layout())); + topology.add(input_layout("variance", variance.get_layout())); + topology.add(input_layout("scale", scale.get_layout())); + topology.add(input_layout("shift", shift.get_layout())); + topology.add(batch_norm("batch_norm", "input", "mean", "variance", "scale", "shift", epsilon)); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + -10.f, -11.f, -12.f, -13.f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 9.f, + -14.f, -15.f, -16.f, -17.f + }); + + set_values(mean, { -3.3333f, -0.3583f }); + set_values(variance, { 44.9305f, 107.0624f }); + set_values(scale, { 2.f, 1.f }); + set_values(shift, { 0.f, 5.f }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("mean", mean); + network.set_input_data("variance", variance); + network.set_input_data("scale", scale); + network.set_input_data("shift", shift); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + for (int j = 0; j < 2; ++j) { //F + float sum = 0, var = 0; + + auto scalep = scale.pointer(); + auto shiftp = shift.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + + for (int i = 0; i < 2; ++i) { //B + for (int k = 0; k < 2; ++k) { //Y + for (int l = 0; l < 3; ++l) { //X + float data = output_ptr[i + 2 * j + 2 * 2 * l + 2 * 2 * 3 * k]; + data = (data - shiftf) / scalef; + sum += data; + var += data * data; + } + } + } + sum /= 2 * 3 * 2; + var /= 2 * 3 * 2; + + EXPECT_NEAR(sum, 0, 1e-03F); + EXPECT_NEAR(var, 1, 1e-03F); + } +} + +TEST(batch_normalization_gpu, basic_in2x3x2x2_yxfb_with_var_mean_outputs_no_inv_var_different_shapes) { + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 3, 2 } }); + auto mean_out = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 1, 1, 1 } }); + auto variance_out = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 1, 1 } }); + auto scale = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 1 } }); + auto shift = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 1, 2 } }); + + float epsilon = 0.0001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("scale", scale)); + topology.add(data("shift", shift)); + topology.add(mutable_data("mean_out", mean_out)); + topology.add(mutable_data("variance_out", variance_out)); + topology.add(batch_norm("batch_norm", "input", epsilon, "mean_out", "variance_out", "scale", "shift")); + + set_values(input, { + 1.f, 
0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + -10.f, -11.f, -12.f, -13.f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 9.f, + -14.f, -15.f, -16.f, -17.f + }); + + set_values(scale, { 2.f, 1.f }); + set_values(shift, { 0.f, 5.f }); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + std::vector mean_ref = { -3.3333f, -0.3583f }; + std::vector val_ref = { 44.9305f, 107.0624f }; + + for (int j = 0; j < 2; ++j) { //F + float sum = 0, var = 0; + + auto scalep = scale.pointer(); + auto shiftp = shift.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + + auto meanp = mean_out.pointer(); + auto varp = variance_out.pointer(); + float meanf = meanp[j]; + float varf = varp[j]; + + for (int i = 0; i < 2; ++i) { //B + for (int k = 0; k < 2; ++k) { //Y + for (int l = 0; l < 3; ++l) { //X + float data = output_ptr[i + 2 * j + 2 * 2 * l + 2 * 2 * 3 * k]; + data = (data - shiftf) / scalef; + sum += data; + var += data * data; + } + } + } + sum /= 2 * 3 * 2; + var /= 2 * 3 * 2; + + EXPECT_NEAR(sum, 0, 1e-03F); + EXPECT_NEAR(var, 1, 1e-03F); + + EXPECT_NEAR(meanf, mean_ref[j], 1e-03F); + EXPECT_NEAR(varf, val_ref[j], 1e-03F); + } +} + +TEST(batch_normalization_gpu, basic_in2x2x3x2_byxf_scale_shift_different_shapes) { + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::byxf,{ 2, 2, 3, 2 } }); + auto mean = memory::allocate(engine, { data_types::f32, format::byxf,{ 2, 1, 1, 1 } }); + auto variance = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 2, 1, 1 } }); + auto scale = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 1, 2, 1 } }); + auto shift = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 1, 1, 2 } }); + + float epsilon = 0.0001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("mean", mean)); + topology.add(data("variance", variance)); + topology.add(data("scale", scale)); + topology.add(data("shift", shift)); + topology.add(batch_norm("batch_norm", "input", "mean", "variance", "scale", "shift", epsilon)); + + set_values(input, { + 1.f, 5.f, 2.f, 6.f, -10.f, -12.f, + 3.f, 7.f, 4.f, 8.f, -14.f, -16.f, + 0.f, 1.5f, 0.f, 5.2f, -11.f, -13.f, + 0.5f, 12.f, -0.5f, 9.f, -15.f, -17.f + }); + + set_values(mean, { -3.3333f, -0.3583f }); + set_values(variance, { 44.9305f, 107.0624f }); + set_values(scale, { 2.f, 1.f }); + set_values(shift, { 0.f, 5.f }); + + std::vector expected_result{ + 0.646469f, 0.517855f, 0.795655f, 0.614501f, -0.99458f, -1.12512f, + 0.944842f, 0.711146f, 1.09403f, 0.807792f, -1.59133f, -1.5117f, + 0.497283f, 0.179596f, 0.497283f, 0.537184f, -1.14377f, -1.22176f, + 0.571876f, 1.19437f, 0.42269f, 0.904437f, -1.74051f, -1.60834f + }; + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + for (int j = 0; j < 2; ++j) { //F + float sum = 0, var = 0; + + auto scalep = scale.pointer(); + auto shiftp = shift.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + + for (int i = 0; i < 2; ++i) { //B + for (int k = 0; k < 2; ++k) { //Y + for (int l = 0; l < 3; ++l) { //X + auto index = 12 * i + 6 * k + 2 * l + j; + float data = output_ptr[index]; + data = (data - shiftf) / scalef; + 
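+ // byxf flat index for b=2, f=2, x=3, y=2:
+ // ((b * size_y + y) * size_x + x) * size_f + f = 12*i + 6*k + 2*l + j.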
EXPECT_NEAR(data, expected_result[index], 1e-3F); + sum += data; + var += data * data; + } + } + } + sum /= 2 * 3 * 2; + var /= 2 * 3 * 2; + + EXPECT_NEAR(sum, 0, 1e-03F); + EXPECT_NEAR(var, 1, 1e-03F); + } +} + +TEST(batch_normalization_gpu, basic_in2x2x3x2_byxf_with_var_mean_outputs_no_inv_var_different_shapes) { + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::byxf,{ 2, 2, 3, 2 } }); + auto mean_out = memory::allocate(engine, { data_types::f32, format::byxf,{ 2, 1, 1, 1 } }); + auto variance_out = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 2, 1, 1 } }); + auto scale = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 1, 2, 1 } }); + auto shift = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 1, 1, 2 } }); + + float epsilon = 0.0001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("scale", scale)); + topology.add(data("shift", shift)); + topology.add(mutable_data("mean_out", mean_out)); + topology.add(mutable_data("variance_out", variance_out)); + topology.add(batch_norm("batch_norm", "input", epsilon, "mean_out", "variance_out", "scale", "shift")); + + set_values(input, { + 1.f, 5.f, 2.f, 6.f, -10.f, -12.f, + 3.f, 7.f, 4.f, 8.f, -14.f, -16.f, + 0.f, 1.5f, 0.f, 5.2f, -11.f, -13.f, + 0.5f, 12.f, -0.5f, 9.f, -15.f, -17.f + }); + + set_values(scale, { 2.f, 1.f }); + set_values(shift, { 0.f, 5.f }); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + std::vector mean_ref = { -3.3333f, -0.3583f }; + std::vector val_ref = { 44.9305f, 107.0624f }; + + std::vector expected_result{ + 0.646469f, 0.517855f, 0.795655f, 0.614501f, -0.99458f, -1.12512f, + 0.944842f, 0.711146f, 1.09403f, 0.807792f, -1.59133f, -1.5117f, + 0.497283f, 0.179596f, 0.497283f, 0.537184f, -1.14377f, -1.22176f, + 0.571876f, 1.19437f, 0.42269f, 0.904437f, -1.74051f, -1.60834f + }; + + for (int j = 0; j < 2; ++j) { //F + float sum = 0, var = 0; + + auto scalep = scale.pointer(); + auto shiftp = shift.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + + auto meanp = mean_out.pointer(); + auto varp = variance_out.pointer(); + float meanf = meanp[j]; + float varf = varp[j]; + + for (int i = 0; i < 2; ++i) { //B + for (int k = 0; k < 2; ++k) { //Y + for (int l = 0; l < 3; ++l) { //X + auto index = 12 * i + 6 * k + 2 * l + j; + float data = output_ptr[index]; + data = (data - shiftf) / scalef; + EXPECT_NEAR(data, expected_result[index], 1e-3F); + sum += data; + var += data * data; + } + } + } + sum /= 2 * 3 * 2; + var /= 2 * 3 * 2; + + EXPECT_NEAR(sum, 0, 1e-03F); + EXPECT_NEAR(var, 1, 1e-03F); + + EXPECT_NEAR(meanf, mean_ref[j], 1e-03F); + EXPECT_NEAR(varf, val_ref[j], 1e-03F); + } +} + + +TEST(batch_normalization_gpu, basic_in2x3x5x2_yxfb_scale_shift_different_shapes) { + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 5, 3, 2 } }); + auto mean = memory::allocate(engine, { data_types::f32, format::yxfb,{ 5, 1, 1, 1 } }); + auto variance = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 5, 1, 1 } }); + auto scale = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 5, 1 } }); + auto shift = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 1, 5 } }); + + float epsilon = 0.0001f; 
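+ // Formula under test: y = scale * (x - mean) / sqrt(variance + epsilon) + shift;
+ // epsilon keeps the denominator non-zero when a feature has near-zero variance.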
+ + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("mean", mean)); + topology.add(data("variance", variance)); + topology.add(data("scale", scale)); + topology.add(data("shift", shift)); + topology.add(batch_norm("batch_norm", "input", "mean", "variance", "scale", "shift", epsilon)); + + set_values(input, { + // y0x0 + 1.f, 0.f, // f0 + 5.f, 1.5f, // f1 + 1.f, 0.f, // f2 + 5.f, 1.5f, // f3 + 1.f, 0.f, // f4 + + // y0x1 + 2.f, 0.f, + 6.f, 5.2f, + 2.f, 0.f, + 6.f, 5.2f, + 2.f, 0.f, + + // y0x2 + -10.f, -11.f, + -12.f, -13.f, + -10.f, -11.f, + -12.f, -13.f, + -10.f, -11.f, + + // y1x0 + 3.f, 0.5f, + 7.f, 12.f, + 3.f, 0.5f, + 7.f, 12.f, + 3.f, 0.5f, + + // y1x1 + 4.f, -0.5f, + 8.f, 9.f, + 4.f, -0.5f, + 8.f, 9.f, + 4.f, -0.5f, + + // y1x2 + -14.f, -15.f, + -16.f, -17.f, + -14.f, -15.f, + -16.f, -17.f, + - 14.f, -15.f + }); + + set_values(mean, { -3.3333f, -0.3583f, -3.3333f, -0.3583f, -3.3333f }); + set_values(variance, { 44.9305f, 107.0624f, 44.9305f, 107.0624f, 44.9305f }); + set_values(scale, { 2.f, 1.f, 3.f, 4.f, 5.f }); + set_values(shift, { 0.f, 5.f, -5.f, -15.f, 0.5f }); + + std::vector expected_result{ + 0.646469f, 0.497283f, + 0.517855f, 0.179596f, + 0.646469f, 0.497283f, + 0.517855f, 0.179596f, + 0.646469f, 0.497283f, + + 0.795655f, 0.497283f, + 0.614501f, 0.537184f, + 0.795655f, 0.497283f, + 0.614501f, 0.537184f, + 0.795655f, 0.497283f, + + -0.99458f, -1.14377f, + -1.12512f, -1.22176f, + -0.99458f, -1.14377f, + -1.12512f, -1.22176f, + -0.99458f, -1.14377f, + + 0.944842f, 0.571876f, + 0.711146f, 1.19437f, + 0.944842f, 0.571876f, + 0.711146f, 1.19437f, + 0.944842f, 0.571876f, + + 1.09403f, 0.42269f, + 0.807792f, 0.904437f, + 1.09403f, 0.42269f, + 0.807792f, 0.904437f, + 1.09403f, 0.42269f, + + -1.59133f, -1.74051f, + -1.5117f, -1.60834f, + -1.59133f, -1.74051f, + -1.5117f, -1.60834f, + -1.59133f, -1.74051f + }; + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + for (int j = 0; j < 5; ++j) { //F + float sum = 0, var = 0; + + auto scalep = scale.pointer(); + auto shiftp = shift.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + + for (int i = 0; i < 2; ++i) { //B + for (int k = 0; k < 2; ++k) { //Y + for (int l = 0; l < 3; ++l) { //X + int index = 30 * k + 10 * l + 2 * j + i; + float data = output_ptr[index]; + data = (data - shiftf) / scalef; + EXPECT_NEAR(data, expected_result[index], 1e-3F); + sum += data; + var += data * data; + } + } + } + sum /= 2 * 3 * 2; + var /= 2 * 3 * 2; + + EXPECT_NEAR(sum, 0, 1e-03F); + EXPECT_NEAR(var, 1, 1e-03F); + } +} + +TEST(batch_normalization_gpu, basic_in2x3x5x2_yxfb_with_var_mean_outputs_no_inv_var_different_shapes) { + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 5, 3, 2 } }); + auto mean_out = memory::allocate(engine, { data_types::f32, format::yxfb,{ 5, 1, 1, 1 } }); + auto variance_out = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 5, 1, 1 } }); + auto scale = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 5, 1 } }); + auto shift = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 1, 5 } }); + + float epsilon = 0.0001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("scale", scale)); + topology.add(data("shift", shift)); + 
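+ // Note the deliberately mismatched per-feature shapes below: {5,1,1,1},
+ // {1,5,1,1}, {1,1,5,1} and {1,1,1,5} all carry exactly one value per
+ // feature; the *_different_shapes tests appear to check that each such
+ // placement is accepted.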
topology.add(mutable_data("mean_out", mean_out)); + topology.add(mutable_data("variance_out", variance_out)); + topology.add(batch_norm("batch_norm", "input", epsilon, "mean_out", "variance_out", "scale", "shift")); + + set_values(input, { + // y0x0 + 1.f, 0.f, // f0 + 5.f, 1.5f, // f1 + 1.f, 0.f, // f2 + 5.f, 1.5f, // f3 + 1.f, 0.f, // f4 + + // y0x1 + 2.f, 0.f, + 6.f, 5.2f, + 2.f, 0.f, + 6.f, 5.2f, + 2.f, 0.f, + + // y0x2 + -10.f, -11.f, + -12.f, -13.f, + -10.f, -11.f, + -12.f, -13.f, + -10.f, -11.f, + + // y1x0 + 3.f, 0.5f, + 7.f, 12.f, + 3.f, 0.5f, + 7.f, 12.f, + 3.f, 0.5f, + + // y1x1 + 4.f, -0.5f, + 8.f, 9.f, + 4.f, -0.5f, + 8.f, 9.f, + 4.f, -0.5f, + + // y1x2 + -14.f, -15.f, + -16.f, -17.f, + -14.f, -15.f, + -16.f, -17.f, + -14.f, -15.f + }); + + set_values(scale, { 2.f, 1.f, 3.f, 4.f, 5.f }); + set_values(shift, { 0.f, 5.f, -5.f, -15.f, 0.5f }); + + std::vector expected_result{ + 0.646469f, 0.497283f, + 0.517855f, 0.179596f, + 0.646469f, 0.497283f, + 0.517855f, 0.179596f, + 0.646469f, 0.497283f, + + 0.795655f, 0.497283f, + 0.614501f, 0.537184f, + 0.795655f, 0.497283f, + 0.614501f, 0.537184f, + 0.795655f, 0.497283f, + + -0.99458f, -1.14377f, + -1.12512f, -1.22176f, + -0.99458f, -1.14377f, + -1.12512f, -1.22176f, + -0.99458f, -1.14377f, + + 0.944842f, 0.571876f, + 0.711146f, 1.19437f, + 0.944842f, 0.571876f, + 0.711146f, 1.19437f, + 0.944842f, 0.571876f, + + 1.09403f, 0.42269f, + 0.807792f, 0.904437f, + 1.09403f, 0.42269f, + 0.807792f, 0.904437f, + 1.09403f, 0.42269f, + + -1.59133f, -1.74051f, + -1.5117f, -1.60834f, + -1.59133f, -1.74051f, + -1.5117f, -1.60834f, + -1.59133f, -1.74051f + }; + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + std::vector mean_ref = { -3.3333f, -0.3583f, -3.3333f, -0.3583f, -3.3333f }; + std::vector val_ref = { 44.9305f, 107.0624f, 44.9305f, 107.0624f, 44.9305f }; + + for (int j = 0; j < 5; ++j) { //F + float sum = 0, var = 0; + + auto scalep = scale.pointer(); + auto shiftp = shift.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + + auto meanp = mean_out.pointer(); + auto varp = variance_out.pointer(); + float meanf = meanp[j]; + float varf = varp[j]; + + for (int i = 0; i < 2; ++i) { //B + for (int k = 0; k < 2; ++k) { //Y + for (int l = 0; l < 3; ++l) { //X + int index = 30 * k + 10 * l + 2 * j + i; + float data = output_ptr[index]; + data = (data - shiftf) / scalef; + EXPECT_NEAR(data, expected_result[index], 1e-3F); + sum += data; + var += data * data; + } + } + } + sum /= 2 * 3 * 2; + var /= 2 * 3 * 2; + + EXPECT_NEAR(sum, 0, 1e-03F); + EXPECT_NEAR(var, 1, 1e-03F); + + EXPECT_NEAR(meanf, mean_ref[j], 1e-03F); + EXPECT_NEAR(varf, val_ref[j], 1e-03F); + } +} + +TEST(batch_normalization_gpu, basic_in2x2x3x5_byxf_scale_shift_different_shapes) { + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::byxf,{ 2, 5, 3, 2 } }); + auto mean = memory::allocate(engine, { data_types::f32, format::byxf,{ 5, 1, 1, 1 } }); + auto variance = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 5, 1, 1 } }); + auto scale = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 1, 5, 1 } }); + auto shift = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 1, 1, 5 } }); + + float epsilon = 0.0001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + 
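+ // byxf counterpart of the yxfb case above: batch is outermost and the five
+ // feature values are interleaved innermost, as the b0y0/b0y1 groupings in
+ // set_values below illustrate.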
topology.add(data("mean", mean)); + topology.add(data("variance", variance)); + topology.add(data("scale", scale)); + topology.add(data("shift", shift)); + topology.add(batch_norm("batch_norm", "input", "mean", "variance", "scale", "shift", epsilon)); + + set_values(input, { + // b0y0 + 1.f, 5.f, 1.f, 5.f, 1.f, // x0 + 2.f, 6.f, 2.f, 6.f, 2.f, // x1 + -10.f, -12.f, -10.f, -12.f, -10.f, //x2 + + // b0y1 + 3.f, 7.f, 3.f, 7.f, 3.f, + 4.f, 8.f, 4.f, 8.f, 4.f, + -14.f, -16.f, -14.f, -16.f, -14.f, + + // b1y0 + 0.f, 1.5f, 0.f, 1.5f, 0.f, + 0.f, 5.2f, 0.f, 5.2f, 0.f, + -11.f, -13.f, -11.f, -13.f, -11.f, + + // b1y1 + 0.5f, 12.f, 0.5f, 12.f, 0.5f, + -0.5f, 9.f, -0.5f, 9.f, -0.5f, + -15.f, -17.f, -15.f, -17.f, -15.f + }); + + set_values(mean, { -3.3333f, -0.3583f, -3.3333f, -0.3583f, -3.3333f }); + set_values(variance, { 44.9305f, 107.0624f, 44.9305f, 107.0624f, 44.9305f }); + set_values(scale, { 2.f, 1.f, 3.f, 4.f, 5.f }); + set_values(shift, { 0.f, 5.f, -5.f, -15.f, 0.5f }); + + std::vector expected_result{ + 0.646469f, 0.517855f, 0.646469f, 0.517855f, 0.646469f, + 0.795655f, 0.614501f, 0.795655f, 0.614501f, 0.795655f, + -0.99458f, -1.12512f, -0.99458f, -1.12512f, -0.99458f, + + 0.944842f, 0.711146f, 0.944842f, 0.711146f, 0.944842f, + 1.09403f, 0.807792f, 1.09403f, 0.807792f, 1.09403f, + -1.59133f, -1.5117f, -1.59133f, -1.5117f, -1.59133f, + + 0.497283f, 0.179596f, 0.497283f, 0.179596f, 0.497283f, + 0.497283f, 0.537184f, 0.497283f, 0.537184f, 0.497283f, + -1.14377f, -1.22176f, -1.14377f, -1.22176f, -1.14377f, + + 0.571876f, 1.19437f, 0.571876f, 1.19437f, 0.571876f, + 0.42269f, 0.904437f, 0.42269f, 0.904437f, 0.42269f, + -1.74051f, -1.60834f, -1.74051f, -1.60834f, -1.74051f + }; + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + for (int j = 0; j < 5; ++j) { //F + float sum = 0, var = 0; + + auto scalep = scale.pointer(); + auto shiftp = shift.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + + for (int i = 0; i < 2; ++i) { //B + for (int k = 0; k < 2; ++k) { //Y + for (int l = 0; l < 3; ++l) { //X + auto index = 30 * i + 15 * k + 5 * l + j; + float data = output_ptr[index]; + data = (data - shiftf) / scalef; + EXPECT_NEAR(data, expected_result[index], 1e-3F); + sum += data; + var += data * data; + } + } + } + sum /= 2 * 3 * 2; + var /= 2 * 3 * 2; + + EXPECT_NEAR(sum, 0, 1e-03F); + EXPECT_NEAR(var, 1, 1e-03F); + } +} + +TEST(batch_normalization_gpu, basic_in2x2x3x5_byxf_with_var_mean_outputs_no_inv_var_different_shapes) { + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::byxf,{ 2, 5, 3, 2 } }); + auto mean_out = memory::allocate(engine, { data_types::f32, format::byxf,{ 5, 1, 1, 1 } }); + auto variance_out = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 5, 1, 1 } }); + auto scale = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 1, 5, 1 } }); + auto shift = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 1, 1, 5 } }); + + float epsilon = 0.0001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("scale", scale)); + topology.add(data("shift", shift)); + topology.add(mutable_data("mean_out", mean_out)); + topology.add(mutable_data("variance_out", variance_out)); + topology.add(batch_norm("batch_norm", "input", epsilon, "mean_out", "variance_out", "scale", 
"shift")); + + set_values(input, { + // b0y0 + 1.f, 5.f, 1.f, 5.f, 1.f, // x0 + 2.f, 6.f, 2.f, 6.f, 2.f, // x1 + -10.f, -12.f, -10.f, -12.f, -10.f, //x2 + + // b0y1 + 3.f, 7.f, 3.f, 7.f, 3.f, + 4.f, 8.f, 4.f, 8.f, 4.f, + -14.f, -16.f, -14.f, -16.f, -14.f, + + // b1y0 + 0.f, 1.5f, 0.f, 1.5f, 0.f, + 0.f, 5.2f, 0.f, 5.2f, 0.f, + -11.f, -13.f, -11.f, -13.f, -11.f, + + // b1y1 + 0.5f, 12.f, 0.5f, 12.f, 0.5f, + -0.5f, 9.f, -0.5f, 9.f, -0.5f, + -15.f, -17.f, -15.f, -17.f, -15.f + }); + + set_values(scale, { 2.f, 1.f, 3.f, 4.f, 5.f }); + set_values(shift, { 0.f, 5.f, -5.f, -15.f, 0.5f }); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + std::vector mean_ref = { -3.3333f, -0.3583f, -3.3333f, -0.3583f, -3.3333f }; + std::vector val_ref = { 44.9305f, 107.0624f, 44.9305f, 107.0624f, 44.9305f }; + + std::vector expected_result{ + 0.646469f, 0.517855f, 0.646469f, 0.517855f, 0.646469f, + 0.795655f, 0.614501f, 0.795655f, 0.614501f, 0.795655f, + -0.99458f, -1.12512f, -0.99458f, -1.12512f, -0.99458f, + + 0.944842f, 0.711146f, 0.944842f, 0.711146f, 0.944842f, + 1.09403f, 0.807792f, 1.09403f, 0.807792f, 1.09403f, + -1.59133f, -1.5117f, -1.59133f, -1.5117f, -1.59133f, + + 0.497283f, 0.179596f, 0.497283f, 0.179596f, 0.497283f, + 0.497283f, 0.537184f, 0.497283f, 0.537184f, 0.497283f, + -1.14377f, -1.22176f, -1.14377f, -1.22176f, -1.14377f, + + 0.571876f, 1.19437f, 0.571876f, 1.19437f, 0.571876f, + 0.42269f, 0.904437f, 0.42269f, 0.904437f, 0.42269f, + -1.74051f, -1.60834f, -1.74051f, -1.60834f, -1.74051f + }; + + for (int j = 0; j < 5; ++j) { //F + float sum = 0, var = 0; + + auto scalep = scale.pointer(); + auto shiftp = shift.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + + auto meanp = mean_out.pointer(); + auto varp = variance_out.pointer(); + float meanf = meanp[j]; + float varf = varp[j]; + + for (int i = 0; i < 2; ++i) { //B + for (int k = 0; k < 2; ++k) { //Y + for (int l = 0; l < 3; ++l) { //X + auto index = 30 * i + 15 * k + 5 * l + j; + float data = output_ptr[index]; + data = (data - shiftf) / scalef; + EXPECT_NEAR(data, expected_result[index], 1e-3F); + sum += data; + var += data * data; + } + } + } + sum /= 2 * 3 * 2; + var /= 2 * 3 * 2; + + EXPECT_NEAR(sum, 0, 1e-03F); + EXPECT_NEAR(var, 1, 1e-03F); + + EXPECT_NEAR(meanf, mean_ref[j], 1e-03F); + EXPECT_NEAR(varf, val_ref[j], 1e-03F); + } +} + + +TEST(ngraph_batch_normalization_gpu, batchnorm_fprop_b1c2h2w2) +{ + const auto& engine = get_test_engine(); + + tensor input_shape = { 1, 2, 2, 2 }; + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, input_shape }); + tensor mean_shape = { feature(2) }; + auto mean = memory::allocate(engine, { data_types::f32, format::bfyx, mean_shape }); + tensor var_shape = { feature(2) }; + auto variance = memory::allocate(engine, { data_types::f32, format::bfyx, var_shape }); + tensor gamma_shape = { feature(2) }; + auto gamma = memory::allocate(engine, { data_types::f32, format::bfyx, gamma_shape }); + tensor beta_shape = { feature(2) }; + auto beta = memory::allocate(engine, { data_types::f32, format::bfyx, beta_shape }); + + float eps = 0.001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("gamma", gamma)); + topology.add(data("beta", beta)); + topology.add(mutable_data("mean", mean)); + topology.add(mutable_data("variance", variance)); + 
topology.add(batch_norm("batch_norm", "input", eps, "mean", "variance", "gamma", "beta")); + + set_values(input, { + 0.54881352f, + 0.71518934f, + 0.60276335f, + 0.54488319f, + + 0.42365479f, + 0.64589411f, + 0.4375872f, + 0.89177299f + }); + + set_values(gamma, { 1.f, 1.f }); + set_values(beta, { 0.f, 0.f }); + + std::vector expected_result { + -0.71498716f, + 1.48388731f, + -0.00196938f, + -0.76693159f, + + -0.91316032f, + 0.23943391f, + -0.84090298f, + 1.51462936f + }; + + std::vector expected_mean = { 0.602912f, 0.599727f }; + std::vector expected_variance = { 0.00472505f, 0.0361782f }; + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + for (int j = 0; j < 2; ++j) { //F + float sum = 0; + + auto scalep = gamma.pointer(); + auto shiftp = beta.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + + auto meanp = mean.pointer(); + auto varp = variance.pointer(); + float meanf = meanp[j]; + float varf = varp[j]; + + for (int k = 0; k < 2; ++k) { //Y + for (int l = 0; l < 2; ++l) { //X + int index = 4 * j + 2 * k + l; + float data = output_ptr[index]; + data = (data - shiftf) / scalef; + EXPECT_NEAR(data, expected_result[index], 1e-5F); + sum += data; + } + } + + sum /= 2 * 2; + + EXPECT_NEAR(sum, 0, 1e-5F); + + EXPECT_NEAR(meanf, expected_mean[j], 1e-5F); + EXPECT_NEAR(varf, expected_variance[j], 1e-5F); + } +} + +TEST(ngraph_batch_normalization_gpu, batchnorm_fprop_b2c2h2w1) +{ + const auto& engine = get_test_engine(); + + tensor input_shape = { 2, 2, 1, 2 }; + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, input_shape }); + tensor mean_shape = { feature(2) }; + auto mean = memory::allocate(engine, { data_types::f32, format::bfyx, mean_shape }); + tensor var_shape = { feature(2) }; + auto variance = memory::allocate(engine, { data_types::f32, format::bfyx, var_shape }); + tensor gamma_shape = { feature(2) }; + auto gamma = memory::allocate(engine, { data_types::f32, format::bfyx, gamma_shape }); + tensor beta_shape = { feature(2) }; + auto beta = memory::allocate(engine, { data_types::f32, format::bfyx, beta_shape }); + + float eps = 0.001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("gamma", gamma)); + topology.add(data("beta", beta)); + topology.add(mutable_data("mean", mean)); + topology.add(mutable_data("variance", variance)); + topology.add(batch_norm("batch_norm", "input", eps, "mean", "variance", "gamma", "beta")); + + + set_values(input, { + 0.54881352f, + 0.71518934f, + + 0.60276335f, + 0.54488319f, + + + + 0.42365479f, + 0.64589411f, + + 0.4375872f, + 0.89177299f + }); + + set_values(gamma, { 1.f, 1.f }); + set_values(beta, { 0.f, 0.f }); + + std::vector expected_result{ + -0.30327f, + 1.1561f, + + -0.0963782f, + -0.434702f, + + + + -1.4011f, + 0.548275f, + + -1.06187f, + 1.59295f }; + + std::vector expected_mean = { 0.583388f, 0.619252f }; + std::vector expected_variance = { 0.0119972f, 0.0282681f }; + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + for (int j = 0; j < 2; ++j) { //F + float sum = 0; + + auto scalep = gamma.pointer(); + auto shiftp = beta.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + + auto meanp = mean.pointer(); 
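+ // The expected_mean / expected_variance references are the plain batch
+ // moments; e.g. for f0:
+ //   mean = (0.54881352 + 0.71518934 + 0.42365479 + 0.64589411) / 4 = 0.583388
+ //   var  = mean of the squared deviations (biased, N = 4) = 0.0119972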
+ auto varp = variance.pointer(); + float meanf = meanp[j]; + float varf = varp[j]; + + for (int k = 0; k < 2; ++k) { //B + for (int l = 0; l < 2; ++l) { //Y + int index = 4 * k + 2 * j + l; + float data = output_ptr[index]; + data = (data - shiftf) / scalef; + EXPECT_NEAR(data, expected_result[index], 1e-5F); + sum += data; + } + } + + sum /= 2 * 2; + + EXPECT_NEAR(sum, 0, 1e-5F); + + EXPECT_NEAR(meanf, expected_mean[j], 1e-5F); + EXPECT_NEAR(varf, expected_variance[j], 1e-5F); + } +} + +TEST(ngraph_batch_normalization_gpu, batchnorm_fprop_inference_b2c2h2w1) +{ + const auto& engine = get_test_engine(); + + tensor input_shape = { 2, 2, 1, 2 }; + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, input_shape }); + tensor mean_shape = { feature(2) }; + auto mean = memory::allocate(engine, { data_types::f32, format::bfyx, mean_shape }); + tensor var_shape = { feature(2) }; + auto variance = memory::allocate(engine, { data_types::f32, format::bfyx, var_shape }); + tensor gamma_shape = { feature(2) }; + auto gamma = memory::allocate(engine, { data_types::f32, format::bfyx, gamma_shape }); + tensor beta_shape = { feature(2) }; + auto beta = memory::allocate(engine, { data_types::f32, format::bfyx, beta_shape }); + + float eps = 0.001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("gamma", gamma)); + topology.add(data("beta", beta)); + topology.add(data("mean", mean)); + topology.add(data("variance", variance)); + topology.add(batch_norm("batch_norm", "input", eps, "mean", "variance", "gamma", "beta")); + + + set_values(input, { + 0.54881352f, + 0.71518934f, + + 0.60276335f, + 0.54488319f, + + + + 0.42365479f, + 0.64589411f, + + 0.4375872f, + 0.89177299f + }); + + set_values(gamma, { 1.f, 1.f }); + set_values(beta, { 0.f, 0.f }); + + set_values(mean, { 0.583388f, 0.619252f }); + set_values(variance, { 0.0119972f, 0.0282681f }); + + std::vector expected_result{ + -0.30327f, + 1.1561f, + + -0.0963782f, + -0.434702f, + + + -1.4011f, + 0.548275f, + + -1.06187f, + 1.59295f }; + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + for (int j = 0; j < 2; ++j) { //F + float sum = 0; + + auto scalep = gamma.pointer(); + auto shiftp = beta.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + + for (int k = 0; k < 2; ++k) { //B + for (int l = 0; l < 2; ++l) { //Y + int index = 4 * k + 2 * j + l; + float data = output_ptr[index]; + data = (data - shiftf) / scalef; + EXPECT_NEAR(data, expected_result[index], 1e-5F); + sum += data; + } + } + + sum /= 2 * 2; + + EXPECT_NEAR(sum, 0, 1e-5F); + } +} + +TEST(ngraph_batch_normalization_gpu, batchnorm_fprop_b2c2h2w1_different_shapes) +{ + const auto& engine = get_test_engine(); + + tensor input_shape = { 2, 2, 1, 2 }; + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, input_shape }); + tensor mean_shape = { 2, 1, 1, 1 }; + auto mean = memory::allocate(engine, { data_types::f32, format::bfyx, mean_shape }); + tensor var_shape = { 1, 2, 1, 1 }; + auto variance = memory::allocate(engine, { data_types::f32, format::bfyx, var_shape }); + tensor gamma_shape = { 1, 1, 2, 1 }; + auto gamma = memory::allocate(engine, { data_types::f32, format::bfyx, gamma_shape }); + tensor beta_shape = { 1, 1, 1, 2 }; + auto beta = memory::allocate(engine, { data_types::f32, format::bfyx, beta_shape }); + + float 
eps = 0.001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("gamma", gamma)); + topology.add(data("beta", beta)); + topology.add(mutable_data("mean", mean)); + topology.add(mutable_data("variance", variance)); + topology.add(batch_norm("batch_norm", "input", eps, "mean", "variance", "gamma", "beta")); + + + set_values(input, { + 0.54881352f, + 0.71518934f, + + 0.60276335f, + 0.54488319f, + + + + 0.42365479f, + 0.64589411f, + + 0.4375872f, + 0.89177299f + }); + + set_values(gamma, { 2.f, 3.f }); + set_values(beta, { 5.f, 10.f }); + + std::vector expected_result{ + -0.30327f, + 1.1561f, + + -0.0963782f, + -0.434702f, + + + + -1.4011f, + 0.548275f, + + -1.06187f, + 1.59295f }; + + std::vector expected_mean = { 0.583388f, 0.619252f }; + std::vector expected_variance = { 0.0119972f, 0.0282681f }; + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + for (int j = 0; j < 2; ++j) { //F + float sum = 0; + + auto scalep = gamma.pointer(); + auto shiftp = beta.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + + auto meanp = mean.pointer(); + auto varp = variance.pointer(); + float meanf = meanp[j]; + float varf = varp[j]; + + for (int k = 0; k < 2; ++k) { //B + for (int l = 0; l < 2; ++l) { //Y + int index = 4 * k + 2 * j + l; + float data = output_ptr[index]; + data = (data - shiftf) / scalef; + EXPECT_NEAR(data, expected_result[index], 1e-5F); + sum += data; + } + } + + sum /= 2 * 2; + + EXPECT_NEAR(sum, 0, 1e-5F); + + EXPECT_NEAR(meanf, expected_mean[j], 1e-5F); + EXPECT_NEAR(varf, expected_variance[j], 1e-5F); + } +} + +TEST(ngraph_batch_normalization_gpu, batchnorm_fprop_inference_b2c2h2w1_different_shapes) +{ + const auto& engine = get_test_engine(); + + tensor input_shape = { 2, 2, 1, 2 }; + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, input_shape }); + tensor mean_shape = { 2, 1, 1, 1 }; + auto mean = memory::allocate(engine, { data_types::f32, format::bfyx, mean_shape }); + tensor var_shape = { 1, 1, 2, 1 }; + auto variance = memory::allocate(engine, { data_types::f32, format::bfyx, var_shape }); + tensor gamma_shape = { 1, 1, 2, 1 }; + auto gamma = memory::allocate(engine, { data_types::f32, format::bfyx, gamma_shape }); + tensor beta_shape = { 1, 1, 1, 2 }; + auto beta = memory::allocate(engine, { data_types::f32, format::bfyx, beta_shape }); + + float eps = 0.001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("gamma", gamma)); + topology.add(data("beta", beta)); + topology.add(data("mean", mean)); + topology.add(data("variance", variance)); + topology.add(batch_norm("batch_norm", "input", eps, "mean", "variance", "gamma", "beta")); + + + set_values(input, { + 0.54881352f, + 0.71518934f, + + 0.60276335f, + 0.54488319f, + + + + 0.42365479f, + 0.64589411f, + + 0.4375872f, + 0.89177299f + }); + + set_values(gamma, { 2.f, 3.f }); + set_values(beta, { 5.f, 10.f }); + + set_values(mean, { 0.583388f, 0.619252f }); + set_values(variance, { 0.0119972f, 0.0282681f }); + + std::vector expected_result{ + -0.30327f, + 1.1561f, + + -0.0963782f, + -0.434702f, + + + -1.4011f, + 0.548275f, + + -1.06187f, + 1.59295f }; + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = 
outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + for (int j = 0; j < 2; ++j) { //F + float sum = 0; + + auto scalep = gamma.pointer(); + auto shiftp = beta.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + + for (int k = 0; k < 2; ++k) { //B + for (int l = 0; l < 2; ++l) { //Y + int index = 4 * k + 2 * j + l; + float data = output_ptr[index]; + data = (data - shiftf) / scalef; + EXPECT_NEAR(data, expected_result[index], 1e-5F); + sum += data; + } + } + + sum /= 2 * 2; + + EXPECT_NEAR(sum, 0, 1e-5F); + } +} + +TEST(ngraph_batch_normalization_gpu, batchnorm_fprop_b2c5h2w1_different_shapes) +{ + const auto& engine = get_test_engine(); + + tensor input_shape = { 2, 5, 1, 2 }; + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, input_shape }); + tensor mean_shape = { 5, 1, 1, 1 }; + auto mean = memory::allocate(engine, { data_types::f32, format::bfyx, mean_shape }); + tensor var_shape = { 1, 5, 1, 1 }; + auto variance = memory::allocate(engine, { data_types::f32, format::bfyx, var_shape }); + tensor gamma_shape = { 1, 1, 5, 1 }; + auto gamma = memory::allocate(engine, { data_types::f32, format::bfyx, gamma_shape }); + tensor beta_shape = { 1, 1, 1, 5 }; + auto beta = memory::allocate(engine, { data_types::f32, format::bfyx, beta_shape }); + + float eps = 0.001f; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("gamma", gamma)); + topology.add(data("beta", beta)); + topology.add(mutable_data("mean", mean)); + topology.add(mutable_data("variance", variance)); + topology.add(batch_norm("batch_norm", "input", eps, "mean", "variance", "gamma", "beta")); + + + set_values(input, { + 0.54881352f, + 0.71518934f, + + 0.60276335f, + 0.54488319f, + + 0.54881352f, + 0.71518934f, + + 0.60276335f, + 0.54488319f, + + 0.54881352f, + 0.71518934f, + + + + 0.42365479f, + 0.64589411f, + + 0.4375872f, + 0.89177299f, + + 0.42365479f, + 0.64589411f, + + 0.4375872f, + 0.89177299f, + + 0.42365479f, + 0.64589411f + }); + + set_values(gamma, { 2.f, 3.f, 4.f, 5.f, 1.f }); + set_values(beta, { 5.f, 10.f, -10.f, -15.f, 0.f }); + + std::vector expected_result{ + -0.30327f, + 1.1561f, + + -0.0963782f, + -0.434702f, + + -0.30327f, + 1.1561f, + + -0.0963782f, + -0.434702f, + + -0.30327f, + 1.1561f, + + + + + -1.4011f, + 0.548275f, + + -1.06187f, + 1.59295f, + + -1.4011f, + 0.548275f, + + -1.06187f, + 1.59295f, + + -1.4011f, + 0.548275f + }; + + std::vector expected_mean = { 0.583388f, 0.619252f, 0.583388f, 0.619252f, 0.583388f }; + std::vector expected_variance = { 0.0119972f, 0.0282681f, 0.0119972f, 0.0282681f, 0.0119972f }; + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("batch_norm").get_memory(); + auto output_ptr = output.pointer(); + + for (int j = 0; j < 5; ++j) { //F + float sum = 0; + + auto scalep = gamma.pointer(); + auto shiftp = beta.pointer(); + float scalef = scalep[j]; + float shiftf = shiftp[j]; + + auto meanp = mean.pointer(); + auto varp = variance.pointer(); + float meanf = meanp[j]; + float varf = varp[j]; + + for (int k = 0; k < 2; ++k) { //B + for (int l = 0; l < 2; ++l) { //Y + int index = 10 * k + 2 * j + l; + float data = output_ptr[index]; + data = (data - shiftf) / scalef; + EXPECT_NEAR(data, expected_result[index], 1e-5F); + sum += data; + } + } + + sum /= 2 * 2; + + EXPECT_NEAR(sum, 0, 1e-5F); + + EXPECT_NEAR(meanf, expected_mean[j], 1e-5F); + EXPECT_NEAR(varf, 
expected_variance[j], 1e-5F);
+    }
+}
+
+TEST(ngraph_batch_normalization_gpu, batchnorm_fprop_inference_b2c5h2w1_different_shapes)
+{
+    const auto& engine = get_test_engine();
+
+    tensor input_shape = { 2, 5, 1, 2 };
+    auto input = memory::allocate(engine, { data_types::f32, format::bfyx, input_shape });
+    tensor mean_shape = { 5, 1, 1, 1 };
+    auto mean = memory::allocate(engine, { data_types::f32, format::bfyx, mean_shape });
+    tensor var_shape = { 1, 5, 1, 1 };
+    auto variance = memory::allocate(engine, { data_types::f32, format::bfyx, var_shape });
+    tensor gamma_shape = { 1, 1, 5, 1 };
+    auto gamma = memory::allocate(engine, { data_types::f32, format::bfyx, gamma_shape });
+    tensor beta_shape = { 1, 1, 1, 5 };
+    auto beta = memory::allocate(engine, { data_types::f32, format::bfyx, beta_shape });
+
+    float eps = 0.001f;
+
+    topology topology;
+    topology.add(input_layout("input", input.get_layout()));
+    topology.add(data("gamma", gamma));
+    topology.add(data("beta", beta));
+    topology.add(data("mean", mean));
+    topology.add(data("variance", variance));
+    topology.add(batch_norm("batch_norm", "input", eps, "mean", "variance", "gamma", "beta"));
+
+    set_values(input, {
+        0.54881352f, 0.71518934f,
+        0.60276335f, 0.54488319f,
+        0.54881352f, 0.71518934f,
+        0.60276335f, 0.54488319f,
+        0.54881352f, 0.71518934f,
+
+        0.42365479f, 0.64589411f,
+        0.4375872f,  0.89177299f,
+        0.42365479f, 0.64589411f,
+        0.4375872f,  0.89177299f,
+        0.42365479f, 0.64589411f
+    });
+
+    set_values(gamma, { 2.f, 3.f, 4.f, 5.f, 1.f });
+    set_values(beta, { 5.f, 10.f, -10.f, -15.f, 0.f });
+
+    std::vector<float> expected_result{
+        -0.30327f,    1.1561f,
+        -0.0963782f, -0.434702f,
+        -0.30327f,    1.1561f,
+        -0.0963782f, -0.434702f,
+        -0.30327f,    1.1561f,
+
+        -1.4011f,     0.548275f,
+        -1.06187f,    1.59295f,
+        -1.4011f,     0.548275f,
+        -1.06187f,    1.59295f,
+        -1.4011f,     0.548275f
+    };
+
+    set_values(mean, { 0.583388f, 0.619252f, 0.583388f, 0.619252f, 0.583388f });
+    set_values(variance, { 0.0119972f, 0.0282681f, 0.0119972f, 0.0282681f, 0.0119972f });
+
+    network network(engine, topology);
+
+    network.set_input_data("input", input);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("batch_norm").get_memory();
+    auto output_ptr = output.pointer<float>();
+
+    for (int j = 0; j < 5; ++j) {  // F
+        float sum = 0;
+
+        auto scalep = gamma.pointer<float>();
+        auto shiftp = beta.pointer<float>();
+        float scalef = scalep[j];
+        float shiftf = shiftp[j];
+
+        for (int k = 0; k < 2; ++k) {      // B
+            for (int l = 0; l < 2; ++l) {  // Y
+                int index = 10 * k + 2 * j + l;
+                float data = output_ptr[index];
+                data = (data - shiftf) / scalef;
+                EXPECT_NEAR(data, expected_result[index], 1e-5F);
+                sum += data;
+            }
+        }
+
+        sum /= 2 * 2;
+
+        EXPECT_NEAR(sum, 0, 1e-5F);
+    }
+}
+
+TEST(ngraph_batch_normalization_gpu, batchnorm_fprop_b1c2h2w2_no_bn_output)
+{
+    engine engine;
+
+    tensor input_shape = { 1, 2, 2, 2 };
+    auto input = memory::allocate(engine, { data_types::f32, format::bfyx, input_shape });
+    tensor mean_shape = { feature(2) };
+    auto mean = memory::allocate(engine, { data_types::f32, format::bfyx, mean_shape });
+    tensor var_shape = { feature(2) };
+    auto variance = memory::allocate(engine, { data_types::f32, format::bfyx, var_shape });
+    tensor gamma_shape = { feature(2) };
+    auto gamma = memory::allocate(engine, { data_types::f32, format::bfyx, gamma_shape });
+    tensor beta_shape = { feature(2) };
+    auto beta = memory::allocate(engine, { data_types::f32, format::bfyx, beta_shape });
+
+    float eps = 0.001f;
+
+    topology topology;
+    topology.add(input_layout("input", input.get_layout()));
+    topology.add(data("gamma", gamma));
+    topology.add(data("beta", beta));
+    topology.add(mutable_data("mean", mean));
+    topology.add(mutable_data("variance", variance));
+    topology.add(batch_norm("batch_norm", "input", eps, "mean", "variance", "gamma", "beta"));
+
+    set_values(input, {
+        0.54881352f, 0.71518934f,
+        0.60276335f, 0.54488319f,
+
+        0.42365479f, 0.64589411f,
+        0.4375872f,  0.89177299f
+    });
+
+    set_values(gamma, { 1.f, 1.f });
+    set_values(beta, { 0.f, 0.f });
+
+    std::vector<float> expected_mean = { 0.602912f, 0.599727f };
+    std::vector<float> expected_variance = { 0.00472505f, 0.0361782f };
+
+    build_options bo;
+    bo.set_option(build_option::outputs({ "mean", "variance" }));
+    network network(engine, topology, bo);
+
+    network.set_input_data("input", input);
+
+    auto outputs = network.execute();
+
+    for (int j = 0; j < 2; ++j) {  // F
+        auto meanp = mean.pointer<float>();
+        auto varp = variance.pointer<float>();
+        float meanf = meanp[j];
+        float varf = varp[j];
+
+        EXPECT_NEAR(meanf, expected_mean[j], 1e-5F);
+        EXPECT_NEAR(varf, expected_variance[j], 1e-5F);
+    }
+}
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/batch_norm_grad_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/batch_norm_grad_gpu_test.cpp
index f6c364b..f9b820b 100644
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/batch_norm_grad_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/batch_norm_grad_gpu_test.cpp
@@ -51,7 +51,7 @@ TEST(batch_normalization_backward_gpu, basic_in2x2x2x3) {
     //  f0: 0.1491862
     //  f1: 0.0966454
 
-    engine engine;
+    const auto& engine = get_test_engine();
 
     auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } });
     auto grad_input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } });
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/border_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/border_gpu_test.cpp
index 7a25399..f1b3da3 100644
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/border_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/border_gpu_test.cpp
@@ -73,7 +73,7 @@ TEST(border_gpu, basic_yxfb_0x0x1x2_0x0x3x4_border_constant) {
     constexpr auto out_size_y = in_size_y + blt_size_y + brb_size_y;
     constexpr auto out_size_x = in_size_x + blt_size_x + brb_size_x;
 
-    engine engine;
+    const auto& engine = get_test_engine();
     auto input = memory::allocate(engine, {data_types::f32, format::yxfb, {in_size_b, in_size_f, in_size_x, in_size_y}});
 
     topology topology;
@@ -149,7 +149,7 @@ TEST(border_gpu, basic_yxfb_0x0x1x2_0x0x3x4_border_constant_non_constant) {
     constexpr auto out_size_y = in_size_y + blt_size_y + brb_size_y;
     constexpr auto out_size_x = in_size_x + blt_size_x + brb_size_x;
 
-    engine engine;
+    const auto& engine = get_test_engine();
     auto input = memory::allocate(engine, {data_types::f32, format::yxfb, {in_size_b, in_size_f, in_size_x, in_size_y}});
 
     topology topology;
@@ -225,7 +225,7 @@ TEST(border_gpu, basic_yxfb_0x0x1x2_0x0x3x4_border_mirror) {
     constexpr auto out_size_y = in_size_y + blt_size_y + brb_size_y;
     constexpr auto out_size_x = in_size_x + blt_size_x + brb_size_x;
 
-    engine engine;
+    const auto& engine = get_test_engine();
     auto input = memory::allocate(engine, {data_types::f32, format::yxfb, {in_size_b, in_size_f, in_size_x, in_size_y}});
 
     topology topology;
@@ -301,7 +301,7 @@ TEST(border_gpu, basic_yxfb_0x0x1x2_0x0x3x4_border_mirror_101)
 {
     constexpr auto out_size_y = in_size_y + blt_size_y + brb_size_y;
     constexpr auto out_size_x = in_size_x + blt_size_x + brb_size_x;
 
-    engine engine;
+    const auto& engine = get_test_engine();
     auto input = memory::allocate(engine, {data_types::f32, format::yxfb, {in_size_b, in_size_f, in_size_x, in_size_y}});
 
     topology topology;
@@ -379,7 +379,7 @@ TEST(border_gpu, basic_yxfb_0x0x1x2_0x0x3x4_border_edge) {
     constexpr auto out_size_y = in_size_y + blt_size_y + brb_size_y;
     constexpr auto out_size_x = in_size_x + blt_size_x + brb_size_x;
 
-    engine engine;
+    const auto& engine = get_test_engine();
     auto input = memory::allocate(engine, {data_types::f32, format::yxfb, {in_size_b, in_size_f, in_size_x, in_size_y}});
 
     topology topology;
@@ -454,7 +454,7 @@ TEST(border_gpu, basic_bfyx_2x1x2x3_1x2x3x4_border_constant) {
     constexpr auto out_size_y = in_size_y + blt_size_y + brb_size_y;
     constexpr auto out_size_x = in_size_x + blt_size_x + brb_size_x;
 
-    engine engine;
+    const auto& engine = get_test_engine();
     auto input = memory::allocate(engine, {data_types::f32, format::bfyx, {in_size_b, in_size_f, in_size_x, in_size_y}});
 
     topology topology;
@@ -524,7 +524,7 @@ TEST(border_gpu, basic_bfyx_2x1x2x3_1x2x3x4_border_mirror) {
     constexpr auto out_size_y = in_size_y + blt_size_y + brb_size_y;
     constexpr auto out_size_x = in_size_x + blt_size_x + brb_size_x;
 
-    engine engine;
+    const auto& engine = get_test_engine();
     auto input = memory::allocate(engine, {data_types::f32, format::bfyx, {in_size_b, in_size_f, in_size_x, in_size_y}});
 
     topology topology;
@@ -590,7 +590,7 @@ TEST(border_gpu, basic_bfyx_2x1x2x3_1x2x3x4_border_mirror_101) {
     constexpr auto out_size_y = in_size_y + blt_size_y + brb_size_y;
     constexpr auto out_size_x = in_size_x + blt_size_x + brb_size_x;
 
-    engine engine;
+    const auto& engine = get_test_engine();
     auto input = memory::allocate(engine, {data_types::f32, format::bfyx, {in_size_b, in_size_f, in_size_x, in_size_y}});
 
     topology topology;
@@ -656,7 +656,7 @@ TEST(border_gpu, basic_bfyx_2x1x2x3_1x2x3x4_border_edge) {
     constexpr auto out_size_y = in_size_y + blt_size_y + brb_size_y;
     constexpr auto out_size_x = in_size_x + blt_size_x + brb_size_x;
 
-    engine engine;
+    const auto& engine = get_test_engine();
     auto input = memory::allocate(engine, {data_types::f32, format::bfyx, {in_size_b, in_size_f, in_size_x, in_size_y}});
 
     topology topology;
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/broadcast_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/broadcast_gpu_test.cpp
index 965a65e..548ef3d 100644
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/broadcast_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/broadcast_gpu_test.cpp
@@ -31,62 +31,44 @@
 using namespace cldnn;
 using namespace ::tests;
 
-
 template<typename T>
-static std::vector<T> generate_rnd_real_input(
-    const std::size_t b, const std::size_t f, const std::size_t y, const std::size_t x,
-    const T min = static_cast<T>(0), const T max = static_cast<T>(1), const unsigned rnd_bits = 9)
+void start_broadcast_test(data_types cldnn_data_type, std::vector<size_t> output_shape,
+                          std::vector<size_t> input_shape, std::vector<size_t> broadcast_axes,
+                          std::vector<T> golden_data)
 {
-    static std::default_random_engine rnd_gen(random_seed);
-    cldnn::tests::distributions::uniform_quantized_real_distribution<T> rnd_dist(min, max, rnd_bits);
-
-    std::vector<T> data;
-    data.reserve(b * f * y * x);
-    for (size_t i = 0; i < b * f * y * x; ++i)
-        data.push_back(rnd_dist(rnd_gen));
-
-    return data;
-}
-
-
-TEST(broadcast_gpu, basic_yxfb_1x1x2x3_to_1x2x2x9) {
-    // Input (BF:XY) : 1x1:3x2
-    // Output (BF:XY): 1x2:9x2
-
-    constexpr auto in_size_b = 1;
-    constexpr auto in_size_f = 1;
-    constexpr auto in_size_y = 2;
-    constexpr auto in_size_x = 3;
+    size_t input_data_size = accumulate(input_shape.rbegin(), input_shape.rend(), (size_t)1, std::multiplies<size_t>());
+    EXPECT_GE(input_data_size, (size_t)1);
+    std::vector<T> input_data = {};
+    for (size_t i = 1; i <= input_data_size; ++i) {
+        input_data.push_back((T)i);
+    }
 
-    constexpr auto bc_scale_b = 1;
-    constexpr auto bc_scale_f = 2;
-    constexpr auto bc_scale_y = 1;
-    constexpr auto bc_scale_x = 3;
+    EXPECT_EQ(golden_data.size(), accumulate(output_shape.rbegin(), output_shape.rend(), (size_t)1, std::multiplies<size_t>()));
 
-    constexpr auto out_size_b = bc_scale_b * in_size_b;
-    constexpr auto out_size_f = bc_scale_f * in_size_f;
-    constexpr auto out_size_y = bc_scale_y * in_size_y;
-    constexpr auto out_size_x = bc_scale_x * in_size_x;
+    std::vector<tensor::value_type> output_4d(4, 1);
+    for (size_t i = 0; i < output_shape.size(); ++i) {
+        output_4d.at(4 - output_shape.size() + i) = (tensor::value_type)output_shape.at(i);
+    }
+    std::vector<tensor::value_type> input_4d(4, 1);
+    for (size_t i = 0; i < input_shape.size(); ++i) {
+        input_4d.at(4 - input_shape.size() + i) = (tensor::value_type)input_shape.at(i);
+    }
+    std::vector<uint16_t> fixed_b_axes;
+    size_t shift = 4 - output_shape.size();
+    for (size_t i = 0; i < shift; ++i) {
+        fixed_b_axes.push_back((uint16_t) i);
+    }
+    for (size_t i = 0; i < broadcast_axes.size(); ++i) {
+        fixed_b_axes.push_back((uint16_t) (broadcast_axes.at(i) + shift));
+    }
 
-    engine engine;
-    auto input = memory::allocate(engine, {data_types::f32, format::yxfb, {in_size_b, in_size_f, in_size_x, in_size_y}});
+    const auto& engine = get_test_engine();
+    auto input = memory::allocate(engine, {cldnn_data_type, format::bfyx, {input_4d.at(0), input_4d.at(1), input_4d.at(3), input_4d.at(2)}});
 
     topology topology;
-    topology.add(
-        input_layout("input", input.get_layout())
-    );
-    topology.add(
-        broadcast("output", "input", {out_size_b, out_size_f, out_size_x, out_size_y})
-    );
-
-    std::vector<float> input_data = {
-        41, -11, 13,
-        107, -66, 0,
-    };
-    std::vector<float> out_data = {
-        41, 41, -11, -11, 13, 13, 41, 41, -11, -11, 13, 13, 41, 41, -11, -11, 13, 13,
-        107, 107, -66, -66, 0, 0, 107, 107, -66, -66, 0, 0, 107, 107, -66, -66, 0, 0,
-    };
+    topology.add(input_layout("input", input.get_layout()));
+    topology.add(broadcast("output", "input", {output_4d.at(0), output_4d.at(1), output_4d.at(3), output_4d.at(2)}, fixed_b_axes));
+
     set_values(input, input_data);
 
     network network(engine, topology);
@@ -94,375 +76,972 @@ TEST(broadcast_gpu, basic_yxfb_1x1x2x3_to_1x2x2x9) {
     network.set_input_data("input", input);
     auto outputs = network.execute();
 
     auto output = outputs.at("output").get_memory();
-    auto output_ptr = output.pointer<float>();
-
-    ASSERT_EQ(out_data.size(), static_cast<std::size_t>(out_size_b * out_size_f * out_size_y * out_size_x));
-
-    for (auto b = 0; b < out_size_b; ++b) {             // B
-        for (auto f = 0; f < out_size_f; ++f) {         // F
-            for (auto y = 0; y < out_size_y; ++y) {     // Y
-                for (auto x = 0; x < out_size_x; ++x) { // X
-                    auto output_off = ((y * out_size_x + x) * out_size_f + f) * out_size_b + b; // YXFB
-
-                    EXPECT_EQ(output_ptr[output_off], out_data[output_off]);
+    auto output_ptr = output.pointer<T>();
+
+    for (tensor::value_type b = 0; b < output_4d.at(0); ++b) {
+        for (tensor::value_type f = 0; f < output_4d.at(1); ++f) {
+            for (tensor::value_type y = 0; y < output_4d.at(2); ++y) {
+                for (tensor::value_type x = 0; x < output_4d.at(3); ++x) {
+                    auto output_off = ((b * output_4d.at(1) + f) * output_4d.at(2) + y) *
output_4d.at(3) + x; + EXPECT_EQ(output_ptr[output_off], golden_data[output_off]); } } } } } -TEST(broadcast_gpu, basic_bfyx_4x2x2x2_to_8x2x6x4) { - // Input (BF:XY) : 4x2:2x2 - // Output (BF:XY): 8x2:6x4 +TEST(broadcast_gpu_float, bfyx_1_to_5_w_b_axes_0) { + std::vector golden_data = {1.0, 1.0, 1.0, 1.0, 1.0}; + start_broadcast_test(data_types::f32, {5}, {1}, {0}, golden_data); +} - constexpr auto in_size_b = 4; - constexpr auto in_size_f = 2; - constexpr auto in_size_y = 2; - constexpr auto in_size_x = 2; +TEST(broadcast_gpu_uint8_t, bfyx_1_to_5_w_b_axes_0) { + std::vector golden_data = {1, 1, 1, 1, 1}; + start_broadcast_test(data_types::u8, {5}, {1}, {0}, golden_data); +} - constexpr auto bc_scale_b = 2; - constexpr auto bc_scale_f = 1; - constexpr auto bc_scale_y = 3; - constexpr auto bc_scale_x = 2; +TEST(broadcast_gpu_float, bfyx_1_to_4x5_w_b_axes_0x1) { + std::vector golden_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + start_broadcast_test(data_types::f32, {4, 5}, {1}, {0, 1}, golden_data); +} - constexpr auto out_size_b = bc_scale_b * in_size_b; - constexpr auto out_size_f = bc_scale_f * in_size_f; - constexpr auto out_size_y = bc_scale_y * in_size_y; - constexpr auto out_size_x = bc_scale_x * in_size_x; +TEST(broadcast_gpu_uint8_t, bfyx_1_to_4x5_w_b_axes_0x1) { + std::vector golden_data = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + start_broadcast_test(data_types::u8, {4, 5}, {1}, {0, 1}, golden_data); +} - engine engine; - auto input = memory::allocate(engine, {data_types::f32, format::bfyx, {in_size_b, in_size_f, in_size_x, in_size_y}}); +TEST(broadcast_gpu_float, bfyx_1_to_3x4x5_w_b_axes_0x1x2) { + std::vector golden_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + start_broadcast_test(data_types::f32, {3, 4, 5}, {1}, {0, 1, 2}, golden_data); +} - topology topology; - topology.add( - input_layout("input", input.get_layout()) - ); - topology.add( - broadcast("output", "input", {out_size_b, out_size_f, out_size_x, out_size_y}) - ); - - std::vector input_data = { - 11, 12, - 21, 22, - - -11, -12, - -21, -22, - - - 13, 14, - 23, 24, - - -13, -14, - -23, -24, - - - 15, 16, - 25, 26, - - -15, -16, - -25, -26, - - - 17, 18, - 27, 28, - - -17, -18, - -27, -28, - }; - std::vector out_data = { - 11, 12, 11, 12, - 21, 22, 21, 22, - 11, 12, 11, 12, - 21, 22, 21, 22, - 11, 12, 11, 12, - 21, 22, 21, 22, - - -11, -12, -11, -12, - -21, -22, -21, -22, - -11, -12, -11, -12, - -21, -22, -21, -22, - -11, -12, -11, -12, - -21, -22, -21, -22, - - - 13, 14, 13, 14, - 23, 24, 23, 24, - 13, 14, 13, 14, - 23, 24, 23, 24, - 13, 14, 13, 14, - 23, 24, 23, 24, - - -13, -14, -13, -14, - -23, -24, -23, -24, - -13, -14, -13, -14, - -23, -24, -23, -24, - -13, -14, -13, -14, - -23, -24, -23, -24, - - - 15, 16, 15, 16, - 25, 26, 25, 26, - 15, 16, 15, 16, - 25, 26, 25, 26, - 15, 16, 15, 16, - 25, 26, 25, 26, - - -15, -16, -15, -16, - -25, -26, -25, -26, - -15, -16, -15, -16, - -25, -26, -25, -26, - -15, -16, -15, -16, - -25, -26, -25, -26, - - - 17, 18, 17, 18, - 27, 28, 27, 28, - 17, 18, 17, 18, - 27, 28, 27, 28, - 17, 18, 17, 18, - 27, 28, 27, 28, - - -17, -18, -17, -18, - -27, -28, -27, -28, - -17, -18, -17, -18, - -27, -28, -27, -28, - -17, -18, -17, -18, 
- -27, -28, -27, -28, - - - 11, 12, 11, 12, - 21, 22, 21, 22, - 11, 12, 11, 12, - 21, 22, 21, 22, - 11, 12, 11, 12, - 21, 22, 21, 22, - - -11, -12, -11, -12, - -21, -22, -21, -22, - -11, -12, -11, -12, - -21, -22, -21, -22, - -11, -12, -11, -12, - -21, -22, -21, -22, - - - 13, 14, 13, 14, - 23, 24, 23, 24, - 13, 14, 13, 14, - 23, 24, 23, 24, - 13, 14, 13, 14, - 23, 24, 23, 24, - - -13, -14, -13, -14, - -23, -24, -23, -24, - -13, -14, -13, -14, - -23, -24, -23, -24, - -13, -14, -13, -14, - -23, -24, -23, -24, - - - 15, 16, 15, 16, - 25, 26, 25, 26, - 15, 16, 15, 16, - 25, 26, 25, 26, - 15, 16, 15, 16, - 25, 26, 25, 26, - - -15, -16, -15, -16, - -25, -26, -25, -26, - -15, -16, -15, -16, - -25, -26, -25, -26, - -15, -16, -15, -16, - -25, -26, -25, -26, - - - 17, 18, 17, 18, - 27, 28, 27, 28, - 17, 18, 17, 18, - 27, 28, 27, 28, - 17, 18, 17, 18, - 27, 28, 27, 28, - - -17, -18, -17, -18, - -27, -28, -27, -28, - -17, -18, -17, -18, - -27, -28, -27, -28, - -17, -18, -17, -18, - -27, -28, -27, -28, - }; - set_values(input, input_data); +TEST(broadcast_gpu_uint8_t, bfyx_1_to_3x4x5_w_b_axes_0x1x2) { + std::vector golden_data = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + start_broadcast_test(data_types::u8, {3, 4, 5}, {1}, {0, 1, 2}, golden_data); +} - network network(engine, topology); - network.set_input_data("input", input); - auto outputs = network.execute(); +TEST(broadcast_gpu_float, bfyx_1_to_2x3x4x5_w_b_axes_0x1x2x3) { + std::vector golden_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + start_broadcast_test(data_types::f32, {2, 3, 4, 5}, {1}, {0, 1, 2, 3}, golden_data); +} - auto output = outputs.at("output").get_memory(); - auto output_ptr = output.pointer(); +TEST(broadcast_gpu_uint8_t, bfyx_1_to_2x3x4x5_w_b_axes_0x1x2x3) { + std::vector golden_data = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + start_broadcast_test(data_types::u8, {2, 3, 4, 5}, {1}, {0, 1, 2, 3}, golden_data); +} - ASSERT_EQ(out_data.size(), static_cast(out_size_b * out_size_f * out_size_y * out_size_x)); +TEST(broadcast_gpu_float, bfyx_1_to_5_w_o_b_axes) { + std::vector golden_data = {1.0, 1.0, 1.0, 1.0, 1.0}; + start_broadcast_test(data_types::f32, {5}, {1}, {}, golden_data); +} - for (auto b = 0; b < out_size_b; ++b) { // B - for (auto f = 0; f < out_size_f; ++f) { // F - for (auto y = 0; y < out_size_y; ++y) { // Y - for (auto x = 0; x < out_size_x; ++x) { // X - auto output_off = ((b * out_size_f + f) * out_size_y + y) * out_size_x + x; // BFYX 
+TEST(broadcast_gpu_uint8_t, bfyx_1_to_5_w_o_b_axes) { + std::vector golden_data = {1, 1, 1, 1, 1}; + start_broadcast_test(data_types::u8, {5}, {1}, {}, golden_data); +} - EXPECT_EQ(output_ptr[output_off], out_data[output_off]); - } - } - } - } +TEST(broadcast_gpu_float, bfyx_3_to_12_w_o_b_axes) { + std::vector golden_data = {1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0}; + start_broadcast_test(data_types::f32, {12}, {3}, {}, golden_data); } -TEST(broadcast_gpu, basic_byxf_2x3x4x5_to_10x12x12x10) { - // Input (BF:XY) : 2x3:5x4 - // Output (BF:XY): 10x12:10x12 +TEST(broadcast_gpu_uint8_t, bfyx_3_to_12_w_o_b_axes) { + std::vector golden_data = {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3}; + start_broadcast_test(data_types::u8, {12}, {3}, {}, golden_data); +} - constexpr auto in_size_b = 2; - constexpr auto in_size_f = 3; - constexpr auto in_size_y = 4; - constexpr auto in_size_x = 5; +TEST(broadcast_gpu_float, bfyx_1x1_to_4x5_w_o_b_axes) { + std::vector golden_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + start_broadcast_test(data_types::f32, {4, 5}, {1, 1}, {}, golden_data); +} - constexpr auto bc_scale_b = 5; - constexpr auto bc_scale_f = 4; - constexpr auto bc_scale_y = 3; - constexpr auto bc_scale_x = 2; +TEST(broadcast_gpu_uint8_t, bfyx_1x1_to_4x5_w_o_b_axes) { + std::vector golden_data = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + start_broadcast_test(data_types::u8, {4, 5}, {1, 1}, {}, golden_data); +} - constexpr auto out_size_b = bc_scale_b * in_size_b; - constexpr auto out_size_f = bc_scale_f * in_size_f; - constexpr auto out_size_y = bc_scale_y * in_size_y; - constexpr auto out_size_x = bc_scale_x * in_size_x; +TEST(broadcast_gpu_float, bfyx_2x3_to_8x6_w_o_b_axes) { + std::vector golden_data = {1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 4.0, 5.0, 6.0, + 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 4.0, 5.0, 6.0, + 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 4.0, 5.0, 6.0, + 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 4.0, 5.0, 6.0}; + start_broadcast_test(data_types::f32, {8, 6}, {2, 3}, {}, golden_data); +} - engine engine; - auto input = memory::allocate(engine, {data_types::f32, format::byxf, {in_size_b, in_size_f, in_size_x, in_size_y}}); +TEST(broadcast_gpu_uint8_t, bfyx_2x3_to_8x6_w_o_b_axes) { + std::vector golden_data = {1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6, + 1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6, + 1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6, + 1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6}; + start_broadcast_test(data_types::u8, {8, 6}, {2, 3}, {}, golden_data); +} - topology topology; - topology.add( - input_layout("input", input.get_layout()) - ); - topology.add( - broadcast("output", "input", {out_size_b, out_size_f, out_size_x, out_size_y}) - ); - - std::vector input_data = generate_rnd_real_input(in_size_b, in_size_f, in_size_y, in_size_x, -8.0f, 8.0f); - set_values(input, input_data); +TEST(broadcast_gpu_float, bfyx_2x3x4_to_6x6x4_w_o_b_axes) { + std::vector golden_data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, + 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, + 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, + 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, + 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 
20.0, 21.0, 22.0, 23.0, 24.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, + 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, + 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + start_broadcast_test(data_types::f32, {6, 6, 4}, {2, 3, 4}, {}, golden_data); +} - network network(engine, topology); - network.set_input_data("input", input); - auto outputs = network.execute(); +TEST(broadcast_gpu_uint8_t, bfyx_2x3x4_to_6x6x4_w_o_b_axes) { + std::vector golden_data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}; + start_broadcast_test(data_types::u8, {6, 6, 4}, {2, 3, 4}, {}, golden_data); +} - auto output = outputs.at("output").get_memory(); - auto output_ptr = output.pointer(); +TEST(broadcast_gpu_float, bfyx_2x3x4x5_to_2x9x8x5_w_o_b_axes) { + std::vector golden_data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, + 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, + 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, + 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, + 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, + 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, + 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, + 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, + 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, + 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, + 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, + 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, + 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, + 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, + 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, + 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, + 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, + 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, + 51.0, 52.0, 53.0, 54.0, 55.0, 
56.0, 57.0, 58.0, 59.0, 60.0, + 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, + 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, + 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, + 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, + 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, + 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, + 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, + 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, + 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, + 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, 120.0, + 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, + 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, 120.0, + 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, + 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, + 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, + 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, + 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, + 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, + 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, + 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, + 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, + 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, 120.0, + 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, + 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, 120.0, + 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, + 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, + 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, + 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, + 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, + 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, + 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, + 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, + 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, + 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, 120.0, + 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, + 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, 120.0}; + start_broadcast_test(data_types::f32, {2, 9, 8, 5}, {2, 3, 4, 5}, {}, golden_data); +} - for (auto b = 0; b < out_size_b; ++b) { // B - for (auto f = 0; f < out_size_f; ++f) { // F - for (auto y = 0; y < out_size_y; ++y) { // Y - for (auto x = 0; x < out_size_x; ++x) { // X - auto output_off = ((b * out_size_y + y) * out_size_x + x) * out_size_f + f; // BYXF +TEST(broadcast_gpu_uint8_t, bfyx_2x3x4x5_to_2x9x8x5_w_o_b_axes) { + std::vector golden_data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 
38, 39, 40, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, + 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, + 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, + 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, + 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, + 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, + 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, + 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, + 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, + 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, + 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, + 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, + 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, + 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, + 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, + 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, + 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, + 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, + 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, + 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, + 111, 112, 113, 114, 115, 116, 117, 118, 119, 120}; + start_broadcast_test(data_types::u8, {2, 9, 8, 5}, {2, 3, 4, 5}, {}, golden_data); +} - auto in_b = b % in_size_b; - auto in_f = f % in_size_f; - auto in_y = y % in_size_y; - auto in_x = x % in_size_x; +TEST(broadcast_gpu_float, bfyx_3_to_2x3_w_b_axes_0) { + std::vector golden_data = {1.0, 2.0, 3.0, 1.0, 2.0, 3.0}; + start_broadcast_test(data_types::f32, {2, 3}, {3}, {0}, golden_data); +} - auto input_off = ((in_b * in_size_y + in_y) * in_size_x + in_x) * in_size_f + in_f; // BYXF +TEST(broadcast_gpu_uint8_t, bfyx_3_to_2x3_w_b_axes_0) { + std::vector golden_data = {1, 2, 3, 1, 2, 3}; + start_broadcast_test(data_types::u8, {2, 3}, {3}, {0}, golden_data); +} +TEST(broadcast_gpu_float, bfyx_3_to_2x6_w_b_axes_0) { + std::vector golden_data = {1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0}; + start_broadcast_test(data_types::f32, {2, 6}, {3}, {0}, golden_data); +} - EXPECT_EQ(output_ptr[output_off], input_data[input_off]); - } - } - } - } +TEST(broadcast_gpu_uint8_t, bfyx_3_to_2x6_w_b_axes_0) { + std::vector golden_data = {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3}; + start_broadcast_test(data_types::u8, {2, 6}, {3}, {0}, golden_data); +} + 
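The start_broadcast_test helper introduced above does the real work in all of these cases: clDNN tensors are fixed at four dimensions (bfyx), so an N-dimensional broadcast specification is left-padded with singleton dimensions, every padded position becomes an implicit broadcast axis, and the caller's axes are shifted right by the padding amount. A minimal standalone sketch of that axis mapping (plain C++, independent of clDNN; the function name and shapes are illustrative only):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Left-pad an N-D broadcast spec (N <= 4) to a fixed 4-D bfyx layout:
// every padded leading dimension becomes an implicit broadcast axis, and
// the caller's axes are shifted right by the amount of padding.
static std::vector<uint16_t> to_4d_axes(size_t output_rank,
                                        const std::vector<uint16_t>& broadcast_axes) {
    const size_t shift = 4 - output_rank;
    std::vector<uint16_t> fixed;
    for (size_t i = 0; i < shift; ++i)
        fixed.push_back(static_cast<uint16_t>(i));          // implicit axes from padding
    for (uint16_t a : broadcast_axes)
        fixed.push_back(static_cast<uint16_t>(a + shift));  // user axes, shifted
    return fixed;
}

int main() {
    // A rank-2 {2, 3} output with user broadcast axis 1 maps to 4-D axes {0, 1, 3}.
    for (uint16_t a : to_4d_axes(2, {1}))
        std::cout << a << ' ';
    std::cout << '\n';  // prints: 0 1 3
}

So broadcasting a {2} input to a {2, 3} output over axis 1, as in bfyx_2_to_2x3_w_b_axes_1 above, becomes a {1, 1, 2, 3} bfyx output broadcast over b, f, and x.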
+TEST(broadcast_gpu_float, bfyx_2_to_2x3_w_b_axes_1) { + std::vector golden_data = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0}; + start_broadcast_test(data_types::f32, {2, 3}, {2}, {1}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_2_to_2x3_w_b_axes_1) { + std::vector golden_data = {1, 1, 1, 2, 2, 2}; + start_broadcast_test(data_types::u8, {2, 3}, {2}, {1}, golden_data); +} + +TEST(broadcast_gpu_float, bfyx_2_to_6x3_w_b_axes_1) { + std::vector golden_data = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, + 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0}; + start_broadcast_test(data_types::f32, {6, 3}, {2}, {1}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_2_to_6x3_w_b_axes_1) { + std::vector golden_data = {1, 1, 1, 2, 2, 2, 1, 1, 1, + 2, 2, 2, 1, 1, 1, 2, 2, 2}; + start_broadcast_test(data_types::u8, {6, 3}, {2}, {1}, golden_data); } -TEST(broadcast_gpu, basic_bfyx_2x1x1x5_to_2x13x11x5) { - // Input (BF:XY) : 2x1:5x1 - // Output (BF:XY): 2x13:5x11 +TEST(broadcast_gpu_float, bfyx_3x4_to_2x3x4_w_b_axes_0) { + std::vector golden_data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; + start_broadcast_test(data_types::f32, {2, 3, 4}, {3, 4}, {0}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_3x4_to_2x3x4_w_b_axes_0) { + std::vector golden_data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + start_broadcast_test(data_types::u8, {2, 3, 4}, {3, 4}, {0}, golden_data); +} + +TEST(broadcast_gpu_float, bfyx_2x4_to_2x3x4_w_b_axes_1) { + std::vector golden_data = {1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, + 5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0}; + start_broadcast_test(data_types::f32, {2, 3, 4}, {2, 4}, {1}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_2x4_to_2x3x4_w_b_axes_1) { + std::vector golden_data = {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, + 5, 6, 7, 8, 5, 6, 7, 8, 5, 6, 7, 8}; + start_broadcast_test(data_types::u8, {2, 3, 4}, {2, 4}, {1}, golden_data); +} + +TEST(broadcast_gpu_float, bfyx_2x3_to_2x3x4_w_b_axes_2) { + std::vector golden_data = {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, + 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0}; + start_broadcast_test(data_types::f32, {2, 3, 4}, {2, 3}, {2}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_2x3_to_2x3x4_w_b_axes_2) { + std::vector golden_data = {1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, + 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6}; + start_broadcast_test(data_types::u8, {2, 3, 4}, {2, 3}, {2}, golden_data); +} - constexpr auto in_size_b = 2; - constexpr auto in_size_f = 1; - constexpr auto in_size_y = 1; - constexpr auto in_size_x = 5; +TEST(broadcast_gpu_float, bfyx_4_to_2x3x4_w_b_axes_0_1) { + std::vector golden_data = {1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, + 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0}; + start_broadcast_test(data_types::f32, {2, 3, 4}, {4}, {0, 1}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_4_to_2x3x4_w_b_axes_0_1) { + std::vector golden_data = {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4}; + start_broadcast_test(data_types::u8, {2, 3, 4}, {4}, {0, 1}, golden_data); +} - constexpr auto bc_scale_b = 1; - constexpr auto bc_scale_f = 13; - constexpr auto bc_scale_y = 11; - constexpr auto bc_scale_x = 1; +TEST(broadcast_gpu_float, bfyx_3_to_2x3x4_w_b_axes_0_2) { + std::vector golden_data = {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, + 1.0, 1.0, 
1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0}; + start_broadcast_test(data_types::f32, {2, 3, 4}, {3}, {0, 2}, golden_data); +} - constexpr auto out_size_b = bc_scale_b * in_size_b; - constexpr auto out_size_f = bc_scale_f * in_size_f; - constexpr auto out_size_y = bc_scale_y * in_size_y; - constexpr auto out_size_x = bc_scale_x * in_size_x; +TEST(broadcast_gpu_uint8_t, bfyx_3_to_2x3x4_w_b_axes_0_2) { + std::vector golden_data = {1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, + 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3}; + start_broadcast_test(data_types::u8, {2, 3, 4}, {3}, {0, 2}, golden_data); +} - engine engine; - auto input = memory::allocate(engine, {data_types::f32, format::bfyx, {in_size_b, in_size_f, in_size_x, in_size_y}}); +TEST(broadcast_gpu_float, bfyx_2_to_2x3x4_w_b_axes_1_2) { + std::vector golden_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0}; + start_broadcast_test(data_types::f32, {2, 3, 4}, {2}, {1, 2}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_2_to_2x3x4_w_b_axes_1_2) { + std::vector golden_data = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; + start_broadcast_test(data_types::u8, {2, 3, 4}, {2}, {1, 2}, golden_data); +} + +TEST(broadcast_gpu_float, bfyx_3x4x5_to_2x3x4x5_w_b_axes_0) { + std::vector golden_data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, + 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, + 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, + 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, + 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, + 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, + 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, + 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, + 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0}; + start_broadcast_test(data_types::f32, {2, 3, 4, 5}, {3, 4, 5}, {0}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_3x4x5_to_2x3x4x5_w_b_axes_0) { + std::vector golden_data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, + 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, + 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60}; + start_broadcast_test(data_types::u8, {2, 3, 4, 5}, {3, 4, 5}, {0}, golden_data); +} + +TEST(broadcast_gpu_float, bfyx_2x4x5_to_2x3x4x5_w_b_axes_1) { + std::vector golden_data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, + 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, + 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, + 21.0, 22.0, 23.0, 24.0, 25.0, 
26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0}; + start_broadcast_test(data_types::f32, {2, 3, 4, 5}, {2, 4, 5}, {1}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_2x4x5_to_2x3x4x5_w_b_axes_1) { + std::vector golden_data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40}; + start_broadcast_test(data_types::u8, {2, 3, 4, 5}, {2, 4, 5}, {1}, golden_data); +} + +TEST(broadcast_gpu_float, bfyx_2x3x5_to_2x3x4x5_w_b_axes_2) { + std::vector golden_data = {1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 6.0, 7.0, 8.0, 9.0, 10.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 6.0, 7.0, 8.0, 9.0, 10.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 16.0, 17.0, 18.0, 19.0, 20.0, 16.0, 17.0, 18.0, 19.0, 20.0, + 16.0, 17.0, 18.0, 19.0, 20.0, 16.0, 17.0, 18.0, 19.0, 20.0, + 21.0, 22.0, 23.0, 24.0, 25.0, 21.0, 22.0, 23.0, 24.0, 25.0, + 21.0, 22.0, 23.0, 24.0, 25.0, 21.0, 22.0, 23.0, 24.0, 25.0, + 26.0, 27.0, 28.0, 29.0, 30.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 26.0, 27.0, 28.0, 29.0, 30.0, 26.0, 27.0, 28.0, 29.0, 30.0}; + start_broadcast_test(data_types::f32, {2, 3, 4, 5}, {2, 3, 5}, {2}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_2x3x5_to_2x3x4x5_w_b_axes_2) { + std::vector golden_data = {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 6, 7, 8, 9, 10, + 6, 7, 8, 9, 10, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 11, 12, 13, 14, 15, + 11, 12, 13, 14, 15, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 16, 17, 18, 19, 20, + 16, 17, 18, 19, 20, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 21, 22, 23, 24, 25, + 21, 22, 23, 24, 25, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 26, 27, 28, 29, 30, + 26, 27, 28, 29, 30, 26, 27, 28, 29, 30}; + start_broadcast_test(data_types::u8, {2, 3, 4, 5}, {2, 3, 5}, {2}, golden_data); +} + +TEST(broadcast_gpu_float, bfyx_2x3x4_to_2x3x4x5_w_b_axes_3) { + std::vector golden_data = {1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, + 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, + 7.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 8.0, 8.0, + 9.0, 9.0, 9.0, 9.0, 9.0, 10.0, 10.0, 10.0, 10.0, 10.0, + 11.0, 11.0, 11.0, 11.0, 11.0, 12.0, 12.0, 12.0, 12.0, 12.0, + 13.0, 13.0, 13.0, 13.0, 13.0, 14.0, 14.0, 14.0, 14.0, 14.0, + 15.0, 15.0, 15.0, 15.0, 15.0, 16.0, 16.0, 16.0, 16.0, 16.0, + 17.0, 17.0, 17.0, 17.0, 17.0, 18.0, 18.0, 18.0, 18.0, 18.0, + 19.0, 19.0, 19.0, 19.0, 19.0, 20.0, 20.0, 20.0, 20.0, 20.0, + 21.0, 21.0, 21.0, 21.0, 21.0, 22.0, 22.0, 22.0, 22.0, 22.0, + 23.0, 23.0, 23.0, 23.0, 23.0, 24.0, 24.0, 24.0, 24.0, 24.0}; + start_broadcast_test(data_types::f32, {2, 3, 4, 5}, {2, 3, 4}, {3}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_2x3x4_to_2x3x4x5_w_b_axes_3) { + std::vector golden_data = {1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, + 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 
+ 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, + 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, + 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, + 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, + 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, + 23, 23, 23, 23, 23, 24, 24, 24, 24, 24}; + start_broadcast_test(data_types::u8, {2, 3, 4, 5}, {2, 3, 4}, {3}, golden_data); +} + +TEST(broadcast_gpu_float, bfyx_4x5_to_2x3x4x5_w_b_axes_0_1) { + std::vector golden_data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0}; + start_broadcast_test(data_types::f32, {2, 3, 4, 5}, {4, 5}, {0, 1}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_4x5_to_2x3x4x5_w_b_axes_0_1) { + std::vector golden_data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; + start_broadcast_test(data_types::u8, {2, 3, 4, 5}, {4, 5}, {0, 1}, golden_data); +} + +TEST(broadcast_gpu_float, bfyx_3x5_to_2x3x4x5_w_b_axes_0_2) { + std::vector golden_data = {1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 6.0, 7.0, 8.0, 9.0, 10.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 6.0, 7.0, 8.0, 9.0, 10.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 6.0, 7.0, 8.0, 9.0, 10.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 6.0, 7.0, 8.0, 9.0, 10.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 11.0, 12.0, 13.0, 14.0, 15.0}; + start_broadcast_test(data_types::f32, {2, 3, 4, 5}, {3, 5}, {0, 2}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_3x5_to_2x3x4x5_w_b_axes_0_2) { + std::vector golden_data = {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 6, 7, 8, 9, 10, + 6, 7, 8, 9, 10, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 11, 12, 13, 14, 15, + 11, 12, 13, 14, 15, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 6, 7, 8, 9, 10, + 6, 7, 8, 9, 10, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 11, 12, 13, 14, 15, + 11, 12, 13, 14, 15, 11, 12, 13, 14, 15}; + start_broadcast_test(data_types::u8, {2, 3, 4, 5}, {3, 5}, {0, 2}, golden_data); +} + +TEST(broadcast_gpu_float, bfyx_3x4_to_2x3x4x5_w_b_axes_0_3) { + std::vector golden_data = {1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, + 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, + 
7.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 8.0, 8.0, + 9.0, 9.0, 9.0, 9.0, 9.0, 10.0, 10.0, 10.0, 10.0, 10.0, + 11.0, 11.0, 11.0, 11.0, 11.0, 12.0, 12.0, 12.0, 12.0, 12.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, + 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, + 7.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 8.0, 8.0, + 9.0, 9.0, 9.0, 9.0, 9.0, 10.0, 10.0, 10.0, 10.0, 10.0, + 11.0, 11.0, 11.0, 11.0, 11.0, 12.0, 12.0, 12.0, 12.0, 12.0}; + start_broadcast_test(data_types::f32, {2, 3, 4, 5}, {3, 4}, {0, 3}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_3x4_to_2x3x4x5_w_b_axes_0_3) { + std::vector golden_data = {1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, + 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, + 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, + 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, 12, 12, 12, 12, 12}; + start_broadcast_test(data_types::u8, {2, 3, 4, 5}, {3, 4}, {0, 3}, golden_data); +} + +TEST(broadcast_gpu_float, bfyx_2x5_to_2x3x4x5_w_b_axes_1_2) { + std::vector golden_data = {1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 6.0, 7.0, 8.0, 9.0, 10.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 6.0, 7.0, 8.0, 9.0, 10.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 6.0, 7.0, 8.0, 9.0, 10.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 6.0, 7.0, 8.0, 9.0, 10.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 6.0, 7.0, 8.0, 9.0, 10.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 6.0, 7.0, 8.0, 9.0, 10.0, 6.0, 7.0, 8.0, 9.0, 10.0}; + start_broadcast_test(data_types::f32, {2, 3, 4, 5}, {2, 5}, {1, 2}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_2x5_to_2x3x4x5_w_b_axes_1_2) { + std::vector golden_data = {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 6, 7, 8, 9, 10, + 6, 7, 8, 9, 10, 6, 7, 8, 9, 10, + 6, 7, 8, 9, 10, 6, 7, 8, 9, 10, + 6, 7, 8, 9, 10, 6, 7, 8, 9, 10, + 6, 7, 8, 9, 10, 6, 7, 8, 9, 10, + 6, 7, 8, 9, 10, 6, 7, 8, 9, 10}; + start_broadcast_test(data_types::u8, {2, 3, 4, 5}, {2, 5}, {1, 2}, golden_data); +} + +TEST(broadcast_gpu_float, bfyx_2x4_to_2x3x4x5_w_b_axes_1_3) { + std::vector golden_data = {1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, + 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, + 7.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 8.0, 8.0, + 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, + 7.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 8.0, 8.0, + 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, + 7.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 8.0, 8.0}; + start_broadcast_test(data_types::f32, {2, 3, 4, 5}, {2, 4}, {1, 3}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_2x4_to_2x3x4x5_w_b_axes_1_3) { + std::vector golden_data = {1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, + 1, 1, 1, 1, 1, 2, 
2, 2, 2, 2, + 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, + 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, + 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, + 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 8, 8, 8, 8, 8}; + start_broadcast_test(data_types::u8, {2, 3, 4, 5}, {2, 4}, {1, 3}, golden_data); +} + +TEST(broadcast_gpu_float, bfyx_2x3_to_2x3x4x5_w_b_axes_2_3) { + std::vector golden_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, + 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, + 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, + 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, + 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0}; + start_broadcast_test(data_types::f32, {2, 3, 4, 5}, {2, 3}, {2, 3}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_2x3_to_2x3x4x5_w_b_axes_2_3) { + std::vector golden_data = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; + start_broadcast_test(data_types::u8, {2, 3, 4, 5}, {2, 3}, {2, 3}, golden_data); +} + +TEST(broadcast_gpu_float, bfyx_5_to_2x3x4x5_w_b_axes_0_1_2) { + std::vector golden_data = {1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, + 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0}; + start_broadcast_test(data_types::f32, {2, 3, 4, 5}, {5}, {0, 1, 2}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_5_to_2x3x4x5_w_b_axes_0_1_2) { + std::vector golden_data = {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5}; + start_broadcast_test(data_types::u8, {2, 3, 4, 5}, {5}, {0, 1, 2}, golden_data); +} + +TEST(broadcast_gpu_float, bfyx_4_to_2x3x4x5_w_b_axes_0_1_3) { + std::vector golden_data = {1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, + 1.0, 1.0, 1.0, 1.0, 
1.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0}; + start_broadcast_test(data_types::f32, {2, 3, 4, 5}, {4}, {0, 1, 3}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_4_to_2x3x4x5_w_b_axes_0_1_3) { + std::vector golden_data = {1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, + 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, + 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, + 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, + 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, + 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 4, 4, 4, 4, 4}; + start_broadcast_test(data_types::u8, {2, 3, 4, 5}, {4}, {0, 1, 3}, golden_data); +} + +TEST(broadcast_gpu_float, bfyx_3_to_2x3x4x5_w_b_axes_0_2_3) { + std::vector golden_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0}; + start_broadcast_test(data_types::f32, {2, 3, 4, 5}, {3}, {0, 2, 3}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_3_to_2x3x4x5_w_b_axes_0_2_3) { + std::vector golden_data = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; + start_broadcast_test(data_types::u8, {2, 3, 4, 5}, {3}, {0, 2, 3}, golden_data); +} + +TEST(broadcast_gpu_float, bfyx_2_to_2x3x4x5_w_b_axes_1_2_3) { + std::vector golden_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0}; + start_broadcast_test(data_types::f32, {2, 3, 4, 5}, {2}, {1, 2, 3}, golden_data); +} + +TEST(broadcast_gpu_uint8_t, bfyx_2_to_2x3x4x5_w_b_axes_1_2_3) { + std::vector golden_data = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; + start_broadcast_test(data_types::u8, {2, 3, 4, 5}, {2}, {1, 2, 3}, golden_data); +} + + +TEST(broadcast_gpu, basic_error_wrong_b_axes_size) { + + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, 1, 1}}); topology topology; - topology.add( - input_layout("input", input.get_layout()) - ); - topology.add( - broadcast("output", "input", {out_size_b, out_size_f, out_size_x, out_size_y}) - ); - - std::vector input_data = generate_rnd_real_input(in_size_b, in_size_f, in_size_y, in_size_x, -8.0f, 8.0f); - set_values(input, input_data); + topology.add(input_layout("input", input.get_layout())); + topology.add(broadcast("output", "input", {2, 3, 4, 5}, {0, 1, 2, 3, 4})); - network network(engine, topology); - network.set_input_data("input", input); - auto outputs = network.execute(); + std::string msg_to_find = "Incorrect parameters configuration: broadcast_axes size should be less or equal 4."; + EXPECT_ANY_THROW(check_exception_massage(engine, topology, msg_to_find)); +} - auto output = outputs.at("output").get_memory(); - auto output_ptr = output.pointer(); +TEST(broadcast_gpu, basic_error_wrong_b_axis_value) { - for (auto b = 0; b < out_size_b; ++b) { // B - for (auto f = 0; f < out_size_f; ++f) { // F - for (auto y = 0; y < out_size_y; ++y) { // Y - for (auto x = 0; x < out_size_x; ++x) { // X - auto output_off = ((b * out_size_f + f) * out_size_y + y) * out_size_x + x; // BFYX + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, 1, 1}}); - auto in_b = b % in_size_b; - auto in_f = f % in_size_f; - auto in_y = y % in_size_y; - auto in_x = x % in_size_x; + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(broadcast("output", "input", {2, 3, 4, 5}, {0, 4})); - auto input_off = ((in_b * in_size_f + in_f) * in_size_y + in_y) * in_size_x + in_x; // BFYX + std::string msg_to_find = "Incorrect parameters configuration: broadcast_axes index should be within broadcast_sizes range."; + EXPECT_ANY_THROW(check_exception_massage(engine, topology, msg_to_find)); +} +TEST(broadcast_gpu, basic_error_duplicate_b_axis_values) { - EXPECT_EQ(output_ptr[output_off], input_data[input_off]); - } - } - } - } + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, 1, 1}}); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(broadcast("output", "input", {2, 3, 4, 5}, {0, 1, 1})); + + std::string msg_to_find = "Incorrect parameters configuration: Duplicate axes numbers was found in broadcast_axes."; + EXPECT_ANY_THROW(check_exception_massage(engine, topology, msg_to_find)); } -TEST(broadcast_gpu, basic_error_on_nondiv_bc_size) { - // Input (BF:XY) : 2x1:5x1 - // Output (BF:XY): 2x13:5x11 +TEST(broadcast_gpu, basic_error_wrong_input_dimension_0) { - constexpr auto in_size_b = 2; - constexpr auto in_size_f = 1; - constexpr auto in_size_y = 1; - constexpr auto in_size_x = 5; + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, {data_types::f32, format::bfyx, {2, 3, 4, 5}}); - constexpr auto out_size_b = in_size_b; - constexpr auto out_size_f = in_size_f; - constexpr auto out_size_y = in_size_y; - constexpr auto out_size_x = 7; + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(broadcast("output", "input", {2, 3, 4, 5}, 
-    engine engine;
-    auto input = memory::allocate(engine, {data_types::f32, format::yxfb, {in_size_b, in_size_f, in_size_x, in_size_y}});
+    std::string msg_to_find = "Input size on dimension number 0(=2) is not equal to: (=1)";
+    EXPECT_ANY_THROW(check_exception_massage(engine, topology, msg_to_find));
+}
+
+TEST(broadcast_gpu, basic_error_not_dividable_2x3x4x5_to_3x3x4x5) {
+
+    const auto& engine = get_test_engine();
+    auto input = memory::allocate(engine, {data_types::f32, format::bfyx, {2, 3, 4, 5}});
     topology topology;
-    topology.add(
-        input_layout("input", input.get_layout())
-    );
-    topology.add(
-        broadcast("output", "input", {out_size_b, out_size_f, out_size_x, out_size_y})
-    );
-
-    std::vector<float> input_data = generate_rnd_real_input(in_size_b, in_size_f, in_size_y, in_size_x, -8.0f, 8.0f);
-    set_values(input, input_data);
+    topology.add(input_layout("input", input.get_layout()));
+    topology.add(broadcast("output", "input", {3, 3, 4, 5}, {}));
-    EXPECT_ANY_THROW(network(engine, topology));
+    std::string msg_to_find = "Invalid broadcast size: not dividable by input size";
+    EXPECT_ANY_THROW(check_exception_massage(engine, topology, msg_to_find));
 }
+TEST(broadcast_gpu, basic_error_not_dividable_3_to_2x3x4x5_w_b_axes_0x1x3) {
+
+    const auto& engine = get_test_engine();
+    auto input = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, 3, 1}});
+
+    topology topology;
+    topology.add(input_layout("input", input.get_layout()));
+    topology.add(broadcast("output", "input", {2, 3, 4, 5}, {0, 1, 3}));
+
+    std::string msg_to_find = "Invalid broadcast size: not dividable by input size";
+    EXPECT_ANY_THROW(check_exception_massage(engine, topology, msg_to_find));
+}
+
+TEST(broadcast_gpu, basic_error_not_dividable_4x5_to_3x4x5_w_b_axes_1) {
+
+    const auto& engine = get_test_engine();
+    auto input = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 3, 5, 4}});
+
+    topology topology;
+    topology.add(input_layout("input", input.get_layout()));
+    topology.add(broadcast("output", "input", {2, 3, 4, 5}, {1}));
+
+    std::string msg_to_find = "Invalid broadcast size: not dividable by input size";
+    EXPECT_ANY_THROW(check_exception_massage(engine, topology, msg_to_find));
+}
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/command_queue_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/command_queue_test.cpp
new file mode 100644
index 0000000..ade14c5
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/command_queue_test.cpp
@@ -0,0 +1,165 @@
+/*
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#include
+#include
+#include
+#include
+#include "test_utils/test_utils.h"
+#include "api/CPP/arg_max_min.hpp"
+
+using namespace cldnn;
+using namespace tests;
+using namespace std;
+
+#ifdef _WIN32
+#include
+#include
+static int g_run_once = 1;
+static int g_qpc_available;
+static LARGE_INTEGER g_qpc_freq;
+
+// Function for future use to measure performance
+unsigned long GetMilliseconds(void)
+{
+    unsigned long ms;
+    LARGE_INTEGER qpc_ticks;
+
+    if (g_run_once) {
+        g_qpc_available = QueryPerformanceFrequency(&g_qpc_freq);
+        // QPC returns a nonzero value if the HW supports a high-resolution perf counter
+        EXPECT_NE(g_qpc_available, 0);
+        g_run_once = 0;
+    }
+
+    if (g_qpc_available) {
+        QueryPerformanceCounter(&qpc_ticks);
+        ms = (unsigned long)(1000.0 * ((double)(qpc_ticks.QuadPart)) / ((double)(g_qpc_freq.QuadPart)));
+    }
+    // fall back if the high-resolution timer is not available
+    else ms = GetTickCount();
+
+    return ms;
+}
+#endif
+
+
+// Run a small topology to see if the command queue works correctly.
+// Copied from the arg_max_gpu.base test.
+void execute_network(cldnn::engine engine)
+{
+    // Input : 2x3x2x2
+    static const int32_t x_size = 2, y_size = 2, feature_num = 3, batch_num = 2;
+
+    auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ batch_num, feature_num, x_size, y_size } });
+    topology topology;
+    topology.add(input_layout("input", input.get_layout()));
+    topology.add(arg_max_min("arg_max", "input", arg_max_min::max));
+
+    vector<float> input_vec = {
+        //y0x0 y0x1 y1x0 y1x1
+        /*b0f0*/0.1f, -0.1f, 0.9f, 1.5f,
+        /*b0f1*/0.2f, 0.2f, -10.f, 5.2f,
+        /*b0f2*/0.2f, 0.2f, -10.f, 5.2f,
+
+        /*b1f0*/3.f, 0.5f, 7.f, 10.f,
+        /*b1f1*/4.f, 0.5f, 8.f, 8.2f,
+        /*b1f2*/0.2f, 0.2f, -10.f, 5.2f
+    };
+    set_values(input, input_vec);
+
+    network network(engine, topology);
+
+    network.set_input_data("input", input);
+
+    auto outputs = network.execute();
+
+    EXPECT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "arg_max");
+
+    auto output = outputs.at("arg_max").get_memory();
+    auto output_ptr = output.pointer<float>();
+    float out_buffer[batch_num];
+    for (uint32_t i = 0; i < batch_num; i++)
+    {
+        out_buffer[i] = get_value<float>(output_ptr, i);
+    }
+    int size = x_size * y_size * feature_num;
+    int index;
+    float value;
+    for (int i = 0; i < batch_num; i++) {
+        EXPECT_GE(out_buffer[i], 0);
+        EXPECT_LT(out_buffer[i], size);
+        index = (int)out_buffer[i];
+        value = input_vec[i*size + (int)index];
+        for (int j = 0; j < size; j++)
+        {
+            EXPECT_LE(input_vec[i*size + j], value);
+        }
+    }
+}
+
+TEST(command_queue_test, test_priority_hints) {
+    engine_configuration configuration =
+        engine_configuration(
+            false,                             // profiling
+            false,                             // decorate_kernel_names
+            false,                             // dump_custom_program
+            "",                                // options
+            "",                                // single_kernel
+            true,                              // primitives_parallelisation
+            "",                                // engine_log
+            "",                                // sources_dumps_dir
+            priority_mode_types::low,
+            throttle_mode_types::disabled);
+    cldnn::engine engine(configuration);
+    execute_network(engine);
+}
+
+TEST(command_queue_test, test_throttle_hints) {
+    engine_configuration configuration =
+        engine_configuration(
+            false,                             // profiling
+            false,                             // decorate_kernel_names
+            false,                             // dump_custom_program
+            "",                                // options
+            "",                                // single_kernel
+            true,                              // primitives_parallelisation
+            "",                                // engine_log
+            "",                                // sources_dumps_dir
+            priority_mode_types::disabled,
+            throttle_mode_types::high);
+    cldnn::engine engine(configuration);
+    execute_network(engine);
+}
+
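+// The two tests above exercise the priority and throttle hints in isolation; the test
+// below combines them. As a minimal illustrative sketch (assuming only the positional
+// engine_configuration constructor already used above), a combined configuration can
+// be built and run like this:
+//
+//   engine_configuration cfg(false, false, false, "", "", true, "", "",
+//                            priority_mode_types::low, throttle_mode_types::high);
+//   cldnn::engine engine(cfg);
+//   execute_network(engine);
+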
+TEST(command_queue_test, test_priority_and_throttle_hints) {
+    engine_configuration configuration =
+        engine_configuration(
+            false,                             // profiling
+            false,                             // decorate_kernel_names
+            false,                             // dump_custom_program
+            "",                                // options
+            "",                                // single_kernel
+            true,                              // primitives_parallelisation
+            "",                                // engine_log
+            "",                                // sources_dumps_dir
+            priority_mode_types::high,
+            throttle_mode_types::low);
+    cldnn::engine engine(configuration);
+    execute_network(engine);
+}
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/condition_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/condition_gpu_test.cpp
new file mode 100644
index 0000000..09e299e
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/condition_gpu_test.cpp
@@ -0,0 +1,617 @@
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "test_utils/test_utils.h"
+
+#include
+
+
+using namespace cldnn;
+using namespace ::tests;
+
+
+bool is_output_equal(const cldnn::memory& mem, const std::vector<float>& ref)
+{
+    auto ptr = mem.pointer<float>();
+    for (size_t i = 0; i < mem.get_layout().count(); i++)
+    {
+        if (!are_equal(ptr[i], ref[i])) return false;
+    }
+    return true;
+}
+
+topology generate_simple_branch(bool branch_true_false, const primitive_id& input_id)
+{
+    topology branch;
+    if (branch_true_false)
+    {
+        branch.add(
+            pooling(input_id + "_when_true", input_id, cldnn::pooling_mode::max, { 0, 0, 2, 1 }, { 0, 0, 2, 1 })
+        );
+    }
+    else
+    {
+        branch.add(
+            pooling(input_id + "_when_false", input_id, cldnn::pooling_mode::average, { 0, 0, 2, 1 }, { 0, 0, 2, 1 })
+        );
+    }
+    return branch;
+}
+
+
+TEST(condition_gpu, basic_equal_comp) {
+    const auto& engine = get_test_engine();
+    build_options bs;
+    bs.set_option(build_option::optimize_data(true));
+    auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 1 } });
+    auto compare = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } });
+    auto scale_mem = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } });
+
+    topology branch_true = generate_simple_branch(true, "condi");
+    topology branch_false = generate_simple_branch(false, "condi");
+
+    topology topology;
+    topology.add(
+        input_layout("input", input.get_layout())
+    );
+    topology.add(
+        input_layout("compare", compare.get_layout())
+    );
+    topology.add(
+        input_layout("scale_data", scale_mem.get_layout())
+    );
+    topology.add(
+        condition("condi", "input", branch_true, branch_false, "compare", cond_functions::EQUAL)
+    );
+    topology.add(
+        scale("output", "condi", "scale_data")
+    );
+
+    network net(engine, topology, bs);
+    set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f });
+    set_values(scale_mem, { 10.0f });
+    net.set_input_data("input", input);
net.set_input_data("scale_data", scale_mem); + + decltype(net.execute()) out; + + //WHEN TRUE + set_values(compare, { 1.0f }); + net.set_input_data("compare", compare); + out = net.execute(); + auto out_data_true = out.at("output").get_memory(); + EXPECT_TRUE(is_output_equal(out_data_true, {20.0f, 40.0f})); + + //WHEN FALSE + set_values(compare, { 4.0f }); + net.set_input_data("compare", compare); + out = net.execute(); + auto out_data_false = out.at("output").get_memory(); + EXPECT_TRUE(is_output_equal(out_data_false, { 15.0f, 35.0f })); + +} + +TEST(condition_gpu, basic_range_equal_comp) { + + const auto& engine = get_test_engine(); + build_options bs; + bs.set_option(build_option::optimize_data(true)); + auto input0 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 1 } }); + auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 1 } }); + + auto compare = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 3, 1 } }); + + topology branch_true = generate_simple_branch(true, "condi"); + topology branch_false = generate_simple_branch(false, "condi"); + + topology topology; + topology.add( + input_layout("input0", input0.get_layout()) + ); + topology.add( + input_layout("input1", input1.get_layout()) + ); + topology.add( + input_layout("compare", compare.get_layout()) + ); + topology.add( + concatenation("concat", { "input0", "input1" }, concatenation::along_x) + ); + topology.add( + condition("condi", "concat", branch_true, branch_false, "compare", cond_functions::EQUAL) + ); + + std::vector input0_data = { + 1, 2, 3, 4 + }; + std::vector input1_data = { + 5, 6, 7, 8 + }; + std::vector compare_data_true = { + 1, 2, 3 + }; + std::vector pooling_when_true_data = { + 2, 4, 6, 8 + }; + std::vector compare_data_false = { + 1, 2, 10 + }; + std::vector pooling_when_false_data = { + 1.5, 3.5, 5.5, 7.5 + }; + + set_values(input0, input0_data); + set_values(input1, input1_data); + network net(engine, topology, bs); + net.set_input_data("input0", input0); + net.set_input_data("input1", input1); + + decltype(net.execute()) outputs; + + //CHECK TRUE + set_values(compare, compare_data_true); + net.set_input_data("compare", compare); + outputs = net.execute(); + + auto out_data_true = outputs.at("condi").get_memory(); + EXPECT_TRUE(is_output_equal(out_data_true, pooling_when_true_data)); + + //CHECK FALSE + set_values(compare, compare_data_false); + net.set_input_data("compare", compare); + outputs = net.execute(); + + auto out_data_false = outputs.at("condi").get_memory(); + EXPECT_TRUE(is_output_equal(out_data_false, pooling_when_false_data)); +} + +std::pair, std::vector> get_values_to_compare(const cldnn::tensor& offset, const cldnn::tensor& range, const std::vector& values, const cldnn::layout& input_lay, const cond_functions& func) +{ + std::vector ret_true; + std::vector ret_false; + auto mem_desc = generic_test::get_linear_memory_desc(input_lay); + for (int32_t b = 0; b < range.batch[0]; b++) + { + for (int32_t f = 0; f < range.feature[0]; f++) + { + for (int32_t y = 0; y < range.spatial[1]; y++) + { + for (int32_t x = 0; x < range.spatial[0]; x++) + { + auto linear_idx = generic_test::get_linear_index( + input_lay, + offset.batch[0] + b, + offset.feature[0] + f, + offset.spatial[1] + y, + offset.spatial[0] + x, + mem_desc); + + switch (func) + { + case cond_functions::EQUAL: + ret_true.push_back(values.at(linear_idx)); + ret_false.push_back(-1.0f); + break; + case cond_functions::GREATER: + ret_true.push_back(values.at(linear_idx) - 
+                        ret_false.push_back(99.0f);
+                        break;
+                    case cond_functions::LESS:
+                        ret_true.push_back(values.at(linear_idx) + 1.0f);
+                        ret_false.push_back(-1.0f);
+                        break;
+                    }
+                }
+            }
+        }
+    }
+    return { ret_true, ret_false };
+}
+
+TEST(DISABLED_condition_gpu, generic_test_true_false) {
+
+    const auto& engine = get_test_engine();
+    build_options bs;
+    bs.set_option(build_option::optimize_data(true));
+    auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 5, 2, 5, 1 } });
+    std::vector<float> input_data(50);
+    std::iota(input_data.begin(), input_data.end(), 0.0f);
+
+    std::vector<cond_functions> functions = {
+        cond_functions::EQUAL,
+        cond_functions::GREATER,
+        cond_functions::LESS,
+    };
+
+    // ranges, with data when condition is true or false
+    std::vector<cldnn::tensor> ranges = {
+        {1, 1, 1, 1},
+        {1, 1, 3, 1},
+        {2, 1, 1, 1},
+        {2, 1, 1, 1}
+    };
+
+    std::vector<cldnn::tensor> offsets = {
+        { 0, 0, 0, 0},
+        { 0, 0, 1, 0},
+        { 0, 0, 2, 0},
+        { 2, 0, 0, 0},
+        { 2, 1, 1, 0}
+    };
+
+    std::vector<float> pooling_when_true_data = {
+        2, 4, 7, 9, 12, 14, 17,
+        19, 22, 24, 27, 29, 32,
+        34, 37, 39, 42, 44, 47, 49
+    };
+
+    std::vector<float> pooling_when_false_data = {
+        1, 3, 6, 8, 11, 13, 16,
+        18, 21, 23, 26, 28, 31,
+        33, 36, 38, 41, 43, 46, 48
+    };
+
+    for (auto const& func : functions)
+    {
+        for (auto const& range : ranges)
+        {
+            for (auto const& offset : offsets)
+            {
+                auto comp_values = get_values_to_compare(offset, range, input_data, input.get_layout(), func);
+                auto comp_values_true = comp_values.first;
+                auto comp_values_false = comp_values.second;
+
+                auto compare = memory::allocate(engine, { data_types::f32, format::bfyx, range });
+
+                topology branch_true;
+                topology branch_false;
+                branch_true.add(
+                    pooling("pooling_when_true", "condi", cldnn::pooling_mode::max, { 1, 1, 3, 1 }, { 1, 1, 2, 1 })
+                );
+                branch_false.add(
+                    pooling("pooling_when_false", "condi", cldnn::pooling_mode::average, { 1, 1, 3, 1 }, { 1, 1, 2, 1 })
+                );
+
+                topology topology;
+                topology.add(
+                    input_layout("input", input.get_layout())
+                );
+                topology.add(
+                    input_layout("compare", compare.get_layout())
+                );
+                topology.add(
+                    condition("condi", "input", branch_true, branch_false, "compare", func, offset)
+                );
+
+                set_values(input, input_data);
+                network net(engine, topology, bs);
+                net.set_input_data("input", input);
+
+                decltype(net.execute()) outputs;
+
+                //CHECK TRUE
+                set_values(compare, comp_values_true);
+                net.set_input_data("compare", compare);
+                outputs = net.execute();
+
+                auto out_data_true = outputs.at("condi").get_memory();
+                EXPECT_TRUE(is_output_equal(out_data_true, pooling_when_true_data));
+
+                //CHECK FALSE
+                set_values(compare, comp_values_false);
+                net.set_input_data("compare", compare);
+                outputs = net.execute();
+
+                auto out_data_false = outputs.at("condi").get_memory();
+                EXPECT_TRUE(is_output_equal(out_data_false, pooling_when_false_data));
+
+            }
+        }
+    }
+}
+
+TEST(condition_gpu, basic_stacked_ifs) {
+
+    /*
+        <prims...>
+        <if>
+        <...>
+        <end_if>
+        <...>
+        <if>
+        <...>
+        <end_if>
+        <prims...>
+    */
+    const auto& engine = get_test_engine();
+    build_options bs;
+    bs.set_option(build_option::optimize_data(true));
+    auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 1 } });
+    auto compare = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } });
+    auto compare2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 1 } });
+
+
+    topology condi_1_true = generate_simple_branch(true, "condi");
+    topology condi_1_false = generate_simple_branch(false, "condi");
+    topology condi_2_true;
+    condi_2_true.add(
+        activation("activ_when_true",
"condi2", cldnn_activation_func::activation_log2) + ); + topology condi_2_false; + condi_2_false.add( + activation("activ_when_false", "condi2", cldnn_activation_func::activation_relu) + ); + + topology topology; + topology.add( + input_layout("input", input.get_layout()) + ); + topology.add( + input_layout("compare", compare.get_layout()) + ); + topology.add( + condition("condi", "input", condi_1_true, condi_1_false, "compare", cond_functions::EQUAL) + ); + topology.add( + input_layout("compare2", compare2.get_layout()) + ); + topology.add( + condition("condi2", "condi", condi_2_true, condi_2_false, "compare2", cond_functions::GREATER) + ); + + std::vector input_data = { + 1, 2, 3, 4 + }; + std::vector compare_data = { + 1 + }; + std::vector compare_2_data = { + 0.0f, 0.0f + }; + set_values(input, input_data); + set_values(compare, compare_data); + set_values(compare2, compare_2_data); + + network net(engine, topology, bs); + net.set_input_data("input", input); + net.set_input_data("compare", compare); + net.set_input_data("compare2", compare2); + auto outputs = net.execute(); + + auto out_data = outputs.at("condi2").get_memory(); + EXPECT_TRUE(is_output_equal(out_data, {1.0f, 2.0f})); +} + +TEST(condition_gpu, basic_nested_ifs) { + + /* + + + <...> + + <...> + + <...> + + + */ + const auto& engine = get_test_engine(); + build_options bs; + bs.set_option(build_option::optimize_data(true)); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 1 } }); + auto compare = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + auto compare2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 1 } }); + auto scale_5_mem = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + set_values(scale_5_mem, { 5.0f }); + auto scale_10_mem = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + set_values(scale_10_mem, { 10.0f }); + + + topology nested_true; + { + nested_true.add(scale("scale_5", "condi_nested", "scale_5_data"), + data("scale_5_data", scale_5_mem)); + } + topology nested_false; + { + nested_false.add(scale("scale_10", "condi_nested", "scale_10_data"), + data("scale_10_data", scale_10_mem)); + } + + topology branch_true; + branch_true.add( + pooling("pooling_when_true", "condi", cldnn::pooling_mode::max, { 0, 0, 2, 1 }, { 0, 0, 2, 1 }) + ); + branch_true.add( + input_layout("compare2", compare2.get_layout()) + ); + + branch_true.add( + condition( + "condi_nested", + "pooling_when_true", + nested_true, + nested_false, + "compare2", + cond_functions::EQUAL) + ); + + topology branch_false; + branch_false.add( + pooling("pooling_when_false", "condi", cldnn::pooling_mode::average, { 0, 0, 2, 1 }, { 0, 0, 2, 1 }) + ); + + topology topology; + topology.add( + input_layout("input", input.get_layout()) + ); + + topology.add( + input_layout("compare", compare.get_layout()) + ); + + topology.add( + condition("condi", "input", branch_true, branch_false, "compare", cond_functions::EQUAL) + ); + + std::vector input_data = { + 1.0f, 2.0f, 3.0f, 4.0f + }; + std::vector compare_data = { + 1.0f + }; + std::vector compare_2_data = { + 2.0f, 4.0f + }; + set_values(input, input_data); + set_values(compare, compare_data); + set_values(compare2, compare_2_data); + + network net(engine, topology, bs); + net.set_input_data("input", input); + net.set_input_data("compare", compare); + net.set_input_data("compare2", compare2); + auto outputs = net.execute(); + + auto out_data = 
outputs.at("condi").get_memory(); + EXPECT_TRUE(is_output_equal(out_data, { 10.0f, 20.0f })); +} + + +TEST(condition_gpu, negative_compare_wrong_layout) { + const auto& engine = get_test_engine(); + build_options bs; + bs.set_option(build_option::optimize_data(true)); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 1 } }); + auto compare = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 5, 1 } }); + + topology branch_true = generate_simple_branch(true, "condi"); + topology branch_false = generate_simple_branch(false, "condi"); + + topology topology; + topology.add( + input_layout("input", input.get_layout()) + ); + topology.add( + input_layout("compare", compare.get_layout()) + ); + topology.add( + condition("condi", "input", branch_true, branch_false, "compare", cond_functions::EQUAL) + ); + + EXPECT_ANY_THROW(network net(engine, topology, bs);); +} + +TEST(condition_gpu, negative_too_big_offset) { + const auto& engine = get_test_engine(); + build_options bs; + bs.set_option(build_option::optimize_data(true)); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 1 } }); + auto compare = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 3, 1 } }); + + topology branch_true = generate_simple_branch(true, "condi"); + topology branch_false = generate_simple_branch(false, "condi"); + + topology topology; + topology.add( + input_layout("input", input.get_layout()) + ); + topology.add( + input_layout("compare", compare.get_layout()) + ); + topology.add( + condition("condi", "input", branch_true, branch_false, "compare", cond_functions::EQUAL, {1, 1, 2, 1}) + ); + + EXPECT_ANY_THROW(network net(engine, topology, bs);); +} + +TEST(condition_gpu, negative_not_same_layouts) { + const auto& engine = get_test_engine(); + build_options bs; + bs.set_option(build_option::optimize_data(true)); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 1 } }); + auto compare = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + + topology branch_true; + branch_true.add( + pooling("pooling_when_true", "condi", cldnn::pooling_mode::max, { 0, 0, 2, 1 }, { 0, 0, 2, 1 }) + ); + + topology branch_false; + branch_false.add( + pooling("pooling_when_false", "condi", cldnn::pooling_mode::max, { 0, 0, 4, 1 }, { 0, 0, 4, 1 }) + ); + + topology topology; + topology.add( + input_layout("input", input.get_layout()) + ); + topology.add( + input_layout("compare", compare.get_layout()) + ); + topology.add( + condition("condi", "input", branch_true, branch_false, "compare", cond_functions::EQUAL) + ); + + EXPECT_ANY_THROW(network net(engine, topology, bs);); +} + +TEST(condition_gpu, negative_same_names_within_different_networks) { + const auto& engine = get_test_engine(); + build_options bs; + bs.set_option(build_option::optimize_data(true)); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 1 } }); + auto compare = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + + topology branch_true; + branch_true.add( + pooling("pooling_check_name", "condi", cldnn::pooling_mode::max, { 0, 0, 2, 1 }, { 0, 0, 2, 1 }) + ); + + topology branch_false; + branch_false.add( + pooling("pooling_when_false", "condi", cldnn::pooling_mode::max, { 0, 0, 2, 1 }, { 0, 0, 2, 1 }) + ); + + topology topology; + topology.add( + input_layout("input", input.get_layout()) + ); + topology.add( + input_layout("compare", compare.get_layout()) + ); + 
topology.add( + condition("condi", "input", branch_true, branch_false, "compare", cond_functions::EQUAL) + ); + topology.add( + pooling("pooling_check_name", "condi", cldnn::pooling_mode::max, { 0, 0, 2, 1 }, { 0, 0, 2, 1 }) + ); + + EXPECT_ANY_THROW(network net(engine, topology, bs);); +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/contract_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/contract_gpu_test.cpp new file mode 100644 index 0000000..1a2c671 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/contract_gpu_test.cpp @@ -0,0 +1,352 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#include + +#include +#include +#include +#include +#include +#include + +#include "test_utils/test_utils.h" +#include "test_utils/uniform_quantized_real_distribution.hpp" + +#include + +using namespace cldnn; +using namespace ::tests; + +template +T reduce_execute(cldnn::contract_mode mode, T x, T y) { + switch (mode) { + case contract_mode::sum: + return x + y; + case contract_mode::prod: + return x * y; + case contract_mode::all: + return x && y; + case contract_mode::any: + return x || y; + case contract_mode::max: + return x > y ? 
x : y; + default: + return (T)0; + } +} + +template +VVVVF reduce_dim(VVVVF &input, + cldnn::contract_mode mode, uint16_t axis, + int input_padding_y = 0, int input_padding_x = 0, + int output_padding_y = 0, int output_padding_x = 0) { + + size_t padding_y = input_padding_y + output_padding_y; + size_t padding_x = input_padding_x + output_padding_x; + size_t out_sizes[4]; + out_sizes[0] = input.size(); + out_sizes[1] = input[0].size(); + out_sizes[2] = input[0][0].size() + 2 * padding_y; + out_sizes[3] = input[0][0][0].size() + 2 * padding_x; + if (axis == 0) + out_sizes[0] = 1; + else + for (uint16_t i = axis; i > 0; --i) + { + out_sizes[i] = out_sizes[i - 1]; + out_sizes[i - 1] = 1; + } + VVVVF output(out_sizes[0], VVVF(out_sizes[1], VVF(out_sizes[2], VF(out_sizes[3])))); + + switch (axis) { + case 0: + for (size_t f = 0; f < out_sizes[1]; ++f) + for (size_t y = 0; y < out_sizes[2]; ++y) + for (size_t x = 0; x < out_sizes[3]; ++x) + { + T res = input[0][f][y][x]; + size_t orig_b = input.size(); + for (size_t b = 1; b < orig_b; ++b) + res = reduce_execute(mode, res, input[b][f][y][x]); + output[0][f][y][x] = res; + } + break; + case 1: + for (size_t b = 0; b < out_sizes[1]; ++b) + for (size_t y = 0; y < out_sizes[2]; ++y) + for (size_t x = 0; x < out_sizes[3]; ++x) + { + T res = input[b][0][y][x]; + size_t orig_f = input[0].size(); + for (size_t f = 1; f < orig_f; ++f) + res = reduce_execute(mode, res, input[b][f][y][x]); + output[0][b][y][x] = res; + } + break; + case 2: + for (size_t b = 0; b < out_sizes[1]; ++b) + for (size_t f = 0; f < out_sizes[2]; ++f) + for (size_t x = 0; x < out_sizes[3]; ++x) + { + T res = input[b][f][0][x]; + size_t orig_y = input[0][0].size(); + for (size_t y = 1; y < orig_y; ++y) + res = reduce_execute(mode, res, input[b][f][y][x]); + output[0][b][f][x] = res; + } + break; + case 3: + for (size_t b = 0; b < out_sizes[1]; ++b) + for (size_t f = 0; f < out_sizes[2]; ++f) + for (size_t y = 0; y < out_sizes[3]; ++y) + { + T res = input[b][f][y][0]; + size_t orig_x = input[0][0][0].size(); + for (size_t x = 1; x < orig_x; ++x) + res = reduce_execute(mode, res, input[b][f][y][x]); + output[0][b][f][y] = res; + } + break; + default: break; + } + return output; +} + +template +VVVVF reduce_input(VVVVF &input, + cldnn::contract_mode mode, std::vector reduction_axes, + int input_padding_y = 0, int input_padding_x = 0, + int output_padding_y = 0, int output_padding_x = 0) { + VVVVF output(input); + for (size_t i = 0; i < reduction_axes.size(); ++i) + output = reduce_dim(output, mode, reduction_axes[i], input_padding_y, input_padding_x, output_padding_y, output_padding_x); + return output; +} + +std::string print_axes(std::vector reduction_axes) +{ + std::stringstream res; + res << "["; + for (size_t i = 0; i < reduction_axes.size(); ++i) + { + if (i != 0) + res << ", "; + res << reduction_axes[i]; + } + res << "]"; + return res.str(); +} + +template +void generic_contract_test_float(cldnn::format test_input_fmt, int input_b, int input_f, int input_y, int input_x, cldnn::contract_mode mode, + std::vector reduction_axes, int input_padding_y = 0, int input_padding_x = 0, int output_padding_y = 0, int output_padding_x = 0) { + + int min_random = -2, max_random = 2; + VVVVF input_rnd = generate_random_4d(input_b, input_f, input_y, input_x, min_random, max_random); + VF input_rnd_vec = flatten_4d(test_input_fmt, input_rnd); + + const auto& engine = get_test_engine(); + tensor input_tensor(input_b, input_f, input_x, input_y); + auto input = memory::allocate(engine, { 
type_to_data_type::value, test_input_fmt, input_tensor }); + set_values(input, input_rnd_vec); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(contract("output", "input", mode, reduction_axes)); + + network network(engine, topology); + network.set_input_data("input", input); + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "output"); + + auto output_memory = outputs.at("output").get_memory(); + auto output_layout = output_memory.get_layout(); + auto output_ptr = output_memory.pointer(); + + VVVVF output_cpu = reduce_input(input_rnd, mode, reduction_axes, input_padding_y, input_padding_x, output_padding_y, output_padding_x); + EXPECT_EQ(output_layout.format.value, test_input_fmt.value); + tensor output_tensor = output_layout.get_buffer_size(); + int y_size = output_tensor.spatial[1]; + int x_size = output_tensor.spatial[0]; + int f_size = output_tensor.feature[0]; + int b_size = output_tensor.batch[0]; + EXPECT_EQ(y_size, (int)output_cpu[0][0].size()); + EXPECT_EQ(x_size, (int)output_cpu[0][0][0].size()); + EXPECT_EQ(f_size, (int)output_cpu[0].size()); + EXPECT_EQ(b_size, (int)output_cpu.size()); + + + bool test_is_correct = true; + VF output_cpu_vec = flatten_4d(test_input_fmt, output_cpu); + for (size_t i = 0; i < output_cpu_vec.size(); ++i) { + if (!floating_point_equal(output_cpu_vec[i], output_ptr[i]) && !(std::isnan((float)output_cpu_vec[i]) && std::isnan((float)output_ptr[i]))) { + test_is_correct = false; + break; + } + } + EXPECT_EQ(test_is_correct, true) << std::endl + << "failing test parameters:" << std::endl + << "input_b = " << input_b << std::endl + << "input_f = " << input_f << std::endl + << "input_y = " << input_y << std::endl + << "input_x = " << input_x << std::endl + << "contract_mode = " << (int)mode << std::endl + << "axes = " << print_axes(reduction_axes) << std::endl + << "input_padding_y = " << input_padding_y << std::endl + << "input_padding_x = " << input_padding_x << std::endl + << "output_padding_y = " << output_padding_y << std::endl + << "output_padding_x = " << output_padding_x << std::endl; +} + +template +void generic_contract_test_int(cldnn::format test_input_fmt, int input_b, int input_f, int input_y, int input_x, cldnn::contract_mode mode, + std::vector reduction_axes, int input_padding_y = 0, int input_padding_x = 0, int output_padding_y = 0, int output_padding_x = 0) { + + int min_random = -2, max_random = 2; + VVVVF input_rnd = generate_random_4d(input_b, input_f, input_y, input_x, min_random, max_random); + VF input_rnd_vec = flatten_4d(test_input_fmt, input_rnd); + + const auto& engine = get_test_engine(); + tensor input_tensor(input_b, input_f, input_x, input_y); + auto input = memory::allocate(engine, { type_to_data_type::value, test_input_fmt, input_tensor }); + set_values(input, input_rnd_vec); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(contract("output", "input", mode, reduction_axes)); + + network network(engine, topology); + network.set_input_data("input", input); + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "output"); + + auto output_memory = outputs.at("output").get_memory(); + auto output_layout = output_memory.get_layout(); + auto output_ptr = output_memory.pointer(); + + VVVVF output_cpu = reduce_input(input_rnd, mode, reduction_axes, input_padding_y, input_padding_x, output_padding_y, output_padding_x); + 
EXPECT_EQ(output_layout.format.value, test_input_fmt.value); + tensor output_tensor = output_layout.get_buffer_size(); + int y_size = output_tensor.spatial[1]; + int x_size = output_tensor.spatial[0]; + int f_size = output_tensor.feature[0]; + int b_size = output_tensor.batch[0]; + EXPECT_EQ(y_size, (int)output_cpu[0][0].size()); + EXPECT_EQ(x_size, (int)output_cpu[0][0][0].size()); + EXPECT_EQ(f_size, (int)output_cpu[0].size()); + EXPECT_EQ(b_size, (int)output_cpu.size()); + + + bool test_is_correct = true; + VF output_cpu_vec = flatten_4d(test_input_fmt, output_cpu); + + for (size_t i = 0; i < output_cpu_vec.size(); ++i) { + if (output_cpu_vec[i] != output_ptr[i]) { + test_is_correct = false; + break; + } + } + EXPECT_EQ(test_is_correct, true) << std::endl + << "failing test parameters:" << std::endl + << "input_b = " << input_b << std::endl + << "input_f = " << input_f << std::endl + << "input_y = " << input_y << std::endl + << "input_x = " << input_x << std::endl + << "contract_mode = " << (int)mode << std::endl + << "axes = " << print_axes(reduction_axes) << std::endl + << "input_padding_y = " << input_padding_y << std::endl + << "input_padding_x = " << input_padding_x << std::endl + << "output_padding_y = " << output_padding_y << std::endl + << "output_padding_x = " << output_padding_x << std::endl; +} + +TEST(contract_gpu_f32, generic_y_sum) { + generic_contract_test_float(format::bfyx, 5, 5, 5, 5, contract_mode::sum, { 2 }); +} + +TEST(contract_gpu_f32, generic_fx_prod) { + generic_contract_test_float(format::bfyx, 5, 5, 5, 5, contract_mode::sum, { 1, 3 }); +} + +TEST(contract_gpu_i32, generic_f_all) { + generic_contract_test_int(format::bfyx, 5, 5, 5, 5, contract_mode::all, { 1 }); +} + +TEST(contract_gpu_i32, generic_bfyx_any) { + generic_contract_test_int(format::bfyx, 5, 5, 5, 5, contract_mode::any, { 0, 1, 2, 3 }); +} + +TEST(contract_gpu_f32, generic_f_max) { + generic_contract_test_float(format::bfyx, 5, 5, 5, 5, contract_mode::max, { 1 }); +} + +TEST(contract_gpu_i32, generic_f_max) { + generic_contract_test_int(format::bfyx, 5, 5, 5, 5, contract_mode::max, { 1 }); +} + +TEST(contract_error, basic_error_empty_r_axes) { + + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(contract("output", "input", contract_mode::sum, { })); + + std::string msg_to_find = "Incorrect parameters configuration: reduction_axes should not be empty."; + EXPECT_ANY_THROW(check_exception_massage(engine, topology, msg_to_find)); +} + +TEST(contract_error, basic_error_wrong_r_axes_size) { + + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(contract("output", "input", contract_mode::sum, { 0, 1, 2, 3, 4 })); + + std::string msg_to_find = "Incorrect parameters configuration: reduction_axes size should be less or equal 4."; + EXPECT_ANY_THROW(check_exception_massage(engine, topology, msg_to_find)); +} + +TEST(contract_error, basic_error_wrong_r_axis_value) { + + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(contract("output", "input", contract_mode::sum, { 0, 4 })); + + 
std::string msg_to_find = "Incorrect parameters configuration: reduction_axes index should be within reduction_axes range."; + EXPECT_ANY_THROW(check_exception_massage(engine, topology, msg_to_find)); +} + +TEST(contract_error, basic_error_duplicate_r_axis_values) { + + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(contract("output", "input", contract_mode::sum, { 0, 1, 1 })); + + std::string msg_to_find = "Incorrect parameters configuration: Duplicate axes numbers was found in reduction_axes."; + EXPECT_ANY_THROW(check_exception_massage(engine, topology, msg_to_find)); +} diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp index b651444..c37493b 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ /////////////////////////////////////////////////////////////////////////////////////////////////// #include +#include #include "api/CPP/memory.hpp" #include #include "api/CPP/convolution.hpp" @@ -60,7 +61,7 @@ T kahan_summation(std::vector &input) { template VVF reference_convolve(VVVF &input, VVVF &filter, int stride_y, int stride_x, float bias, int dilation_y = 1, int dilation_x = 1, - int input_padding_y = 0, int input_padding_x = 0, int output_padding_y = 0, + int input_padding_y = 0, int input_padding_x = 0, int output_padding_y = 0, int output_padding_x = 0, size_t f_begin = 0) { size_t kernel_extent_y = dilation_y * (filter[0].size() - 1) + 1; @@ -144,7 +145,7 @@ TEST(convolution_f32_fw_gpu, basic_convolution_no_bias) { // 21 28 39 // 18 20 20 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32,format::yxfb,{ 1, 1, 5, 4 } }); auto weights = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1, 1, 3, 2 } }); @@ -216,7 +217,7 @@ TEST(convolution_f32_fw_gpu, basic_convolution_int8_no_bias) { // 21 28 39 // 18 20 20 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1, 1, 5, 4 } }); auto weights = memory::allocate(engine, { data_types::i8,format::bfyx,{ 1, 1, 3, 2 } }); @@ -263,7 +264,7 @@ TEST(convolution_f32_fw_gpu, basic_convolution_int8_no_bias) { TEST(convolution_f32_fw_gpu, with_output_size_same_input) { - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 4, 320, 320 } }); auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 64, 4, 7, 7 } }); @@ -275,7 +276,7 @@ TEST(convolution_f32_fw_gpu, with_output_size_same_input) { data("weights2", weights2), convolution::create_with_output_size("conv1", "input", { "weights" }, {1, 64, 160, 160}, {1, 1, 2, 2}, {0, 0, -3, -3}), convolution::create_with_output_size("conv2", "input", { "weights2" }, {1, 64, 320, 320}, {1, 1, 1, 1}, {0, 0, -3, -3}) - ); + ); network network(engine, topology); network.set_input_data("input", input); @@ 
-294,16 +295,16 @@ TEST(convolution_f32_fw_gpu, three_convolutions_same_weights) { // Input: // 1 1 1 1 // 1 1 1 1 - // + // // Filter: - // 1 - // + // 1 + // // Output: // 8 8 8 8 // 8 8 8 8 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx, {1,2,2,2} }); auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 2,2,1,1 } }); @@ -373,7 +374,7 @@ TEST(convolution_f32_fw_gpu, basic_convolution) { // Bias: // 1 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 5, 4 } }); auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 3, 2 } }); @@ -421,7 +422,7 @@ TEST(convolution_f32_fw_gpu, basic_convolution) { TEST(convolution_f32_fw_gpu, basic_convolution_bfyx_weights_as_input_layout) { //Same params as convolution_f32_fw_gpu, basic_convolution but with bfyx optimized data and weights set as input_layout - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 5, 4 } }); @@ -518,7 +519,7 @@ TEST(convolution_f32_fw_gpu, basic_convolution_input_padding) { // Bias: // 1 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 3 } }); auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } }); @@ -588,12 +589,12 @@ TEST(convolution_f32_fw_gpu, basic_convolution_input_padding) { //print_2d(temp_vec); } -TEST(convolution_f32_fw_gpu, basic_convolution_input_and_output_padding) { +TEST(convolution_f32_fw_gpu, basic_convolution_sym_input_padding) { // Filter : 2x2 // Stride : 1x1 // Input : 3x4 - // Input padding : 2x1 - // Output : 8x9 + // Input padding : above 2x1, below 2x1 + // Output : 6x5 // Padding: Zero // // Input: @@ -610,39 +611,33 @@ TEST(convolution_f32_fw_gpu, basic_convolution_input_and_output_padding) { // 1 1 // // Output: - // 1 1 1 1 1 1 1 1 1 - // 1 1 1 1 1 1 1 1 1 - // 1 1 2 4 6 8 5 1 1 - // 1 1 4 8 11 15 9 1 1 - // 1 1 6 11 12 16 10 1 1 - // 1 1 4 7 7 9 6 1 1 - // 1 1 1 1 1 1 1 1 1 - // 1 1 1 1 1 1 1 1 1 + // 1 1 1 1 1 + // 2 4 6 8 5 + // 4 8 11 15 9 + // 6 11 12 16 10 + // 4 7 7 9 6 + // 1 1 1 1 1 // // Bias: // 1 - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 3 } }); - auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } }); - auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } }); + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 4, 3 } }); + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); + auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 2.0f, 3.0f, 4.0f, 3.0f, 3.0f, 3.0f, 5.0f }); set_values(weights, { 1.0f, 1.0f, 1.0f, 1.0f }); set_values(biases, { 1.0f }); VVF output_vec = { - { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, - { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, - { 1.0f, 1.0f, 2.0f, 4.0f, 6.0f, 8.0f, 5.0f, 1.0f, 1.0f }, - { 1.0f, 1.0f, 4.0f, 8.0f, 11.0f, 15.0f, 9.0f, 1.0f, 1.0f }, - { 1.0f, 1.0f, 6.0f, 11.0f, 12.0f, 16.0f, 10.0f, 1.0f, 1.0f }, - { 1.0f, 1.0f, 4.0f, 7.0f, 7.0f, 9.0f, 6.0f, 1.0f, 1.0f }, - { 1.0f, 1.0f, 
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, - { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f } }; + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, + { 2.0f, 4.0f, 6.0f, 8.0f, 5.0f }, + { 4.0f, 8.0f, 11.0f, 15.0f, 9.0f }, + { 6.0f, 11.0f, 12.0f, 16.0f, 10.0f }, + { 4.0f, 7.0f, 7.0f, 9.0f, 6.0f }, + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f } }; - const int x_pad = 2; - const int y_pad = 1; topology topology( input_layout("input", input.get_layout()), data("weights", weights), @@ -653,11 +648,13 @@ TEST(convolution_f32_fw_gpu, basic_convolution_input_and_output_padding) { { "weights" }, { "biases" }, { 1,1,1,1 }, - { 0,0,-1,-2 }, + { 0,0,0,0 }, { 1, 1, 1, 1 }, + { 0,0,1,2 }, + { 0,0,1,2 }, false, 0, - padding{ { 0,0,-x_pad,-y_pad }, 0 }) + padding{ { 0,0,0,0 }, 0 }) ); network network(engine, topology); @@ -669,90 +666,94 @@ TEST(convolution_f32_fw_gpu, basic_convolution_input_and_output_padding) { auto output_memory = outputs.at("conv").get_memory(); auto output_layout = output_memory.get_layout(); - auto output_size = output_layout.get_buffer_size(); auto output_ptr = output_memory.pointer(); - int y_size = output_size.spatial[1]; - int x_size = output_size.spatial[0]; - int f_size = output_size.feature[0]; - int b_size = output_size.batch[0]; + int y_size = output_layout.size.spatial[1]; + int x_size = output_layout.size.spatial[0]; + int f_size = output_layout.size.feature[0]; + int b_size = output_layout.size.batch[0]; EXPECT_EQ(output_layout.format, format::yxfb); - EXPECT_EQ(y_size, 8); - EXPECT_EQ(x_size, 9); + EXPECT_EQ(y_size, 6); + EXPECT_EQ(x_size, 5); EXPECT_EQ(f_size, 1); EXPECT_EQ(b_size, 1); - for (int y = y_pad; y < y_size - y_pad; ++y) - { - for (int x = x_pad; x < x_size - x_pad; ++x) - { + for (int y = 0; y < y_size; ++y) { + for (int x = 0; x < x_size; ++x) { EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]); } } - - //VVF temp_vec(y_size, VF(x_size, 0.0f)); - //for (int y = 0; y < y_size; ++y) { - // for (int x = 0; x < x_size; ++x) { - // temp_vec[y][x] = output_ptr[y * x_size + x]; - // } - //} - //print_2d(temp_vec); } -TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x1x1_nopad_random) { +TEST(convolution_f32_fw_gpu, basic_convolution_asym_input_padding) { // Filter : 2x2 - // Stride : 2x2 - // Input : 4x4 - // Output : 2x2 + // Stride : 1x1 + // Input : 3x4 + // Input padding : above 2x1, below 3x2 + // Output : 7x6 + // Padding: Zero // // Input: - // rnd rnd rnd rnd - // rnd rnd rnd rnd - // rnd rnd rnd rnd - // rnd rnd rnd rnd - // - // Filter - // rnd rnd - // rnd rnd + // z z z z z z z + // z z z z z z z + // z 1 2 3 4 z z + // z 2 2 3 4 z z + // z 3 3 3 5 z z + // z z z z z z z + // z z z z z z z + // z z z z z z z // - // Bias - // rnd + // Filter: + // 1 1 + // 1 1 // // Output: - // rnd rnd - // rnd rnd - - size_t batch = 1, input_f = 1, input_y = 4, input_x = 4; - - VVVVF input_rnd = generate_random_4d(batch, input_f, input_y, input_x, -10, 10); - VF input_rnd_vec = flatten_4d(format::yxfb, input_rnd); - VVVVF filter_rnd = generate_random_4d(1, 1, 2, 2, -10, 10); - VF filter_rnd_vec = flatten_4d(format::bfyx, filter_rnd); - VF bias_rnd = generate_random_1d(1, -10, 10); - VVVVF output_rnd(batch, VVVF(filter_rnd.size())); - for (size_t b = 0; b < output_rnd.size(); ++b) { - for (size_t of = 0; of < filter_rnd.size(); ++of) { - output_rnd[b][of] = reference_convolve(input_rnd[b], filter_rnd[of], 2, 2, bias_rnd[of]); - } - } - VF output_rnd_vec = flatten_4d(format::yxfb, output_rnd); + // 1 1 1 1 1 1 + // 2 4 6 8 5 1 + // 4 8 11 15 9 1 + // 6 11 12 16 10 1 + // 
4 7 7 9 6 1 + // 1 1 1 1 1 1 + // 1 1 1 1 1 1 + // + // Bias: + // 1 - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 4 } }); - //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1,{ 2, 2 }, 1 } }); - auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } }); - auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } }); + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 4, 3 } }); + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); + auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); - set_values(input, input_rnd_vec); - set_values(weights, filter_rnd_vec); - set_values(biases, bias_rnd); + set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 2.0f, 3.0f, 4.0f, 3.0f, 3.0f, 3.0f, 5.0f }); + set_values(weights, { 1.0f, 1.0f, 1.0f, 1.0f }); + set_values(biases, { 1.0f }); + VVF output_vec = { + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, + { 2.0f, 4.0f, 6.0f, 8.0f, 5.0f, 1.0f }, + { 4.0f, 8.0f, 11.0f, 15.0f, 9.0f, 1.0f }, + { 6.0f, 11.0f, 12.0f, 16.0f, 10.0f, 1.0f }, + { 4.0f, 7.0f, 7.0f, 9.0f, 6.0f, 1.0f }, + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f } }; topology topology( input_layout("input", input.get_layout()), data("weights", weights), data("biases", biases), - convolution("conv", "input", {"weights"}, {"biases"}, {1,1,2,2}) + convolution( + "conv", + "input", + { "weights" }, + { "biases" }, + { 1,1,1,1 }, + { 0,0,0,0 }, + { 1, 1, 1, 1 }, + { 0,0,1,2 }, + { 0,0,2,3 }, + false, + 0, + padding{ { 0,0,0,0 }, 0 }) ); network network(engine, topology); @@ -762,67 +763,106 @@ TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x1x1_nopad_random) { EXPECT_EQ(outputs.size(), size_t(1)); EXPECT_EQ(outputs.begin()->first, "conv"); - auto output_prim = outputs.begin()->second.get_memory(); + auto output_memory = outputs.at("conv").get_memory(); + auto output_layout = output_memory.get_layout(); + auto output_ptr = output_memory.pointer(); - auto output_ptr = output_prim.pointer(); + int y_size = output_layout.size.spatial[1]; + int x_size = output_layout.size.spatial[0]; + int f_size = output_layout.size.feature[0]; + int b_size = output_layout.size.batch[0]; + EXPECT_EQ(output_layout.format, format::yxfb); + EXPECT_EQ(y_size, 7); + EXPECT_EQ(x_size, 6); + EXPECT_EQ(f_size, 1); + EXPECT_EQ(b_size, 1); - for (size_t i = 0; i < output_rnd.size(); ++i) { - float x = float_round(output_rnd_vec[i]), y = float_round(output_ptr[i]); - EXPECT_FLOAT_EQ(x, y) << "random seed = " << random_seed << std::endl; + for (int y = 0; y < y_size; ++y) { + for (int x = 0; x < x_size; ++x) { + EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]); + } } } -TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in2x2x1x2_nopad_random) { +TEST(convolution_f32_fw_gpu, basic_convolution_sym_input_padding_with_input_offset) { // Filter : 2x2 - // Stride : 2x2 - // Input : 2x2x1x2 - // Output : 1x1x1x2 + // Stride : 1x1 + // Input : 3x4 + // Input padding : above 2x1, below 2x1 + // Input offset: 2x1 + // Output : 10x7 + // Padding: Zero // // Input: - // rnd rnd rnd rnd - // rnd rnd rnd rnd + // z z z z z z z z + // z z z z z z z z + // z z z z z z z z + // z z z z z z z z + // z z 1 2 3 4 z z + // z z 2 2 3 4 z z + // z z 3 3 3 5 z z + // z z z z z z z z + // z z z z z z z z + // z z z z z z z z + // z z z z z z z z 
// // Filter: - // rnd rnd - // rnd rnd - // - // Bias: - // rnd + // 1 1 + // 1 1 // // Output: - // rnd rnd - - size_t batch = 2, input_f = 1, input_y = 2, input_x = 2; - - VVVVF input_rnd = generate_random_4d(batch, input_f, input_y, input_x, -10, 10); - VF input_rnd_vec = flatten_4d(format::yxfb, input_rnd); - VVVVF filter_rnd = generate_random_4d(1, 1, 2, 2, -10, 10); - VF filter_rnd_vec = flatten_4d(format::bfyx, filter_rnd); - VF bias_rnd = generate_random_1d(1, -10, 10); - VVVVF output_rnd(batch, VVVF(filter_rnd.size())); - for (size_t b = 0; b < output_rnd.size(); ++b) { - for (size_t of = 0; of < filter_rnd.size(); ++of) { - output_rnd[b][of] = reference_convolve(input_rnd[b], filter_rnd[of], 2, 2, bias_rnd[of]); - } - } - VF output_rnd_vec = flatten_4d(format::yxfb, output_rnd); + // 1 1 1 1 1 1 1 + // 1 1 1 1 1 1 1 + // 1 1 1 1 1 1 1 + // 1 2 4 6 8 5 1 + // 1 4 8 11 15 9 1 + // 1 6 11 12 16 10 1 + // 1 4 7 7 9 6 1 + // 1 1 1 1 1 1 1 + // 1 1 1 1 1 1 1 + // 1 1 1 1 1 1 1 + // + // Bias: + // 1 - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 1, 2, 2 } }); - //auto output = memory::allocate({ memory::format::yxfb_f32,{ 2,{ 1, 1 }, 1 } }); - auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } }); - auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } }); + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 4, 3 } }); + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); + auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); - set_values(input, input_rnd_vec); - set_values(weights, filter_rnd_vec); - set_values(biases, bias_rnd); + set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 2.0f, 3.0f, 4.0f, 3.0f, 3.0f, 3.0f, 5.0f }); + set_values(weights, { 1.0f, 1.0f, 1.0f, 1.0f }); + set_values(biases, { 1.0f }); + VVF output_vec = { + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, + { 1.0f, 2.0f, 4.0f, 6.0f, 8.0f, 5.0f, 1.0f }, + { 1.0f, 4.0f, 8.0f, 11.0f, 15.0f, 9.0f, 1.0f }, + { 1.0f, 6.0f, 11.0f, 12.0f, 16.0f, 10.0f, 1.0f }, + { 1.0f, 4.0f, 7.0f, 7.0f, 9.0f, 6.0f, 1.0f }, + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f } }; topology topology( input_layout("input", input.get_layout()), data("weights", weights), data("biases", biases), - convolution("conv", "input", { "weights" }, { "biases" }, { 1,1,2,2 }) + convolution( + "conv", + "input", + { "weights" }, + { "biases" }, + { 1,1,1,1 }, + { 0,0,-1,-2 }, + { 1, 1, 1, 1 }, + { 0,0,1,2 }, + { 0,0,1,2 }, + false, + 0, + padding{ { 0,0,0,0 }, 0 }) ); network network(engine, topology); @@ -832,40 +872,417 @@ TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in2x2x1x2_nopad_random) { EXPECT_EQ(outputs.size(), size_t(1)); EXPECT_EQ(outputs.begin()->first, "conv"); - auto output_prim = outputs.begin()->second.get_memory(); + auto output_memory = outputs.at("conv").get_memory(); + auto output_layout = output_memory.get_layout(); + auto output_ptr = output_memory.pointer(); - auto output_ptr = output_prim.pointer(); + int y_size = output_layout.size.spatial[1]; + int x_size = output_layout.size.spatial[0]; + int f_size = output_layout.size.feature[0]; + int b_size = output_layout.size.batch[0]; + 
EXPECT_EQ(output_layout.format, format::yxfb); + EXPECT_EQ(y_size, 10); + EXPECT_EQ(x_size, 7); + EXPECT_EQ(f_size, 1); + EXPECT_EQ(b_size, 1); - for (size_t i = 0; i < output_rnd.size(); ++i) { - float x = float_round(output_rnd_vec[i]), y = float_round(output_ptr[i]); - EXPECT_FLOAT_EQ(x, y) << "random seed = " << random_seed << std::endl; + for (int y = 0; y < y_size; ++y) { + for (int x = 0; x < x_size; ++x) { + EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]); + } } } -TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x1x1_nopad) { +TEST(convolution_f32_fw_gpu, basic_convolution_asym_input_padding_with_input_offset) { // Filter : 2x2 - // Stride : 2x2 - // Input : 4x4 - // Output : 2x2 + // Stride : 1x1 + // Input : 3x4 + // Input padding : above 2x1, below 3x2 + // Input offset: 2x1 + // Output : 11x8 + // Padding: Zero // // Input: - // -0.5 1 0.5 2 - // 1.5 -0.5 0 -1 - // 0.5 0.5 -1 1 - // 0.5 2 1.5 -0.5 - // - // Filter - // -2 0.5 - // 3.5 1.5 + // z z z z z z z z z + // z z z z z z z z z + // z z z z z z z z z + // z z z z z z z z z + // z z 1 2 3 4 z z z + // z z 2 2 3 4 z z z + // z z 3 3 3 5 z z z + // z z z z z z z z z + // z z z z z z z z z + // z z z z z z z z z + // z z z z z z z z z + // z z z z z z z z z // - // Bias + // Filter: + // 1 1 + // 1 1 + // + // Output: + // 1 1 1 1 1 1 1 1 + // 1 1 1 1 1 1 1 1 + // 1 1 1 1 1 1 1 1 + // 1 2 4 6 8 5 1 1 + // 1 4 8 11 15 9 1 1 + // 1 6 11 12 16 10 1 1 + // 1 4 7 7 9 6 1 1 + // 1 1 1 1 1 1 1 1 + // 1 1 1 1 1 1 1 1 + // 1 1 1 1 1 1 1 1 + // 1 1 1 1 1 1 1 1 + // + // Bias: + // 1 + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 4, 3 } }); + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); + auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + + set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 2.0f, 3.0f, 4.0f, 3.0f, 3.0f, 3.0f, 5.0f }); + set_values(weights, { 1.0f, 1.0f, 1.0f, 1.0f }); + set_values(biases, { 1.0f }); + VVF output_vec = { + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, + { 1.0f, 2.0f, 4.0f, 6.0f, 8.0f, 5.0f, 1.0f, 1.0f }, + { 1.0f, 4.0f, 8.0f, 11.0f, 15.0f, 9.0f, 1.0f, 1.0f }, + { 1.0f, 6.0f, 11.0f, 12.0f, 16.0f, 10.0f, 1.0f, 1.0f }, + { 1.0f, 4.0f, 7.0f, 7.0f, 9.0f, 6.0f, 1.0f, 1.0f }, + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f } }; + + topology topology( + input_layout("input", input.get_layout()), + data("weights", weights), + data("biases", biases), + convolution( + "conv", + "input", + { "weights" }, + { "biases" }, + { 1,1,1,1 }, + { 0,0,-1,-2 }, + { 1, 1, 1, 1 }, + { 0,0,1,2 }, + { 0,0,2,3 }, + false, + 0, + padding{ { 0,0,0,0 }, 0 }) + ); + + network network(engine, topology); + network.set_input_data("input", input); + + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "conv"); + + auto output_memory = outputs.at("conv").get_memory(); + auto output_layout = output_memory.get_layout(); + auto output_ptr = output_memory.pointer(); + + int y_size = output_layout.size.spatial[1]; + int x_size = output_layout.size.spatial[0]; + int f_size = output_layout.size.feature[0]; + int b_size = 
output_layout.size.batch[0]; + EXPECT_EQ(output_layout.format, format::yxfb); + EXPECT_EQ(y_size, 11); + EXPECT_EQ(x_size, 8); + EXPECT_EQ(f_size, 1); + EXPECT_EQ(b_size, 1); + + for (int y = 0; y < y_size; ++y) { + for (int x = 0; x < x_size; ++x) { + EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]); + } + } +} + +TEST(convolution_f32_fw_gpu, basic_convolution_input_and_output_padding) { + // Filter : 2x2 + // Stride : 1x1 + // Input : 3x4 + // Input padding : 2x1 + // Output : 8x9 + // Padding: Zero + // + // Input: + // z z z z z z + // z z z z z z + // z 1 2 3 4 z + // z 2 2 3 4 z + // z 3 3 3 5 z + // z z z z z z + // z z z z z z + // + // Filter: + // 1 1 + // 1 1 + // + // Output: + // 1 1 1 1 1 1 1 1 1 + // 1 1 1 1 1 1 1 1 1 + // 1 1 2 4 6 8 5 1 1 + // 1 1 4 8 11 15 9 1 1 + // 1 1 6 11 12 16 10 1 1 + // 1 1 4 7 7 9 6 1 1 + // 1 1 1 1 1 1 1 1 1 + // 1 1 1 1 1 1 1 1 1 + // + // Bias: + // 1 + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 3 } }); + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } }); + auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } }); + + set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 2.0f, 3.0f, 4.0f, 3.0f, 3.0f, 3.0f, 5.0f }); + set_values(weights, { 1.0f, 1.0f, 1.0f, 1.0f }); + set_values(biases, { 1.0f }); + VVF output_vec = { + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, + { 1.0f, 1.0f, 2.0f, 4.0f, 6.0f, 8.0f, 5.0f, 1.0f, 1.0f }, + { 1.0f, 1.0f, 4.0f, 8.0f, 11.0f, 15.0f, 9.0f, 1.0f, 1.0f }, + { 1.0f, 1.0f, 6.0f, 11.0f, 12.0f, 16.0f, 10.0f, 1.0f, 1.0f }, + { 1.0f, 1.0f, 4.0f, 7.0f, 7.0f, 9.0f, 6.0f, 1.0f, 1.0f }, + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }, + { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f } }; + + const int x_pad = 2; + const int y_pad = 1; + topology topology( + input_layout("input", input.get_layout()), + data("weights", weights), + data("biases", biases), + convolution( + "conv", + "input", + { "weights" }, + { "biases" }, + { 1,1,1,1 }, + { 0,0,-1,-2 }, + { 1, 1, 1, 1 }, + false, + 0, + padding{ { 0,0,-x_pad,-y_pad }, 0 }) + ); + + network network(engine, topology); + network.set_input_data("input", input); + + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "conv"); + + auto output_memory = outputs.at("conv").get_memory(); + auto output_layout = output_memory.get_layout(); + auto output_size = output_layout.get_buffer_size(); + auto output_ptr = output_memory.pointer(); + + int y_size = output_size.spatial[1]; + int x_size = output_size.spatial[0]; + int f_size = output_size.feature[0]; + int b_size = output_size.batch[0]; + EXPECT_EQ(output_layout.format, format::yxfb); + EXPECT_EQ(y_size, 8); + EXPECT_EQ(x_size, 9); + EXPECT_EQ(f_size, 1); + EXPECT_EQ(b_size, 1); + + for (int y = y_pad; y < y_size - y_pad; ++y) + { + for (int x = x_pad; x < x_size - x_pad; ++x) + { + EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]); + } + } + + //VVF temp_vec(y_size, VF(x_size, 0.0f)); + //for (int y = 0; y < y_size; ++y) { + // for (int x = 0; x < x_size; ++x) { + // temp_vec[y][x] = output_ptr[y * x_size + x]; + // } + //} + //print_2d(temp_vec); +} + +TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x1x1_nopad_random) { + // Filter : 2x2 + // Stride : 2x2 + // Input : 4x4 + // Output : 2x2 + // + // Input: + // rnd 
rnd rnd rnd + // rnd rnd rnd rnd + // rnd rnd rnd rnd + // rnd rnd rnd rnd + // + // Filter + // rnd rnd + // rnd rnd + // + // Bias + // rnd + // + // Output: + // rnd rnd + // rnd rnd + + size_t batch = 1, input_f = 1, input_y = 4, input_x = 4; + + VVVVF input_rnd = generate_random_4d(batch, input_f, input_y, input_x, -10, 10); + VF input_rnd_vec = flatten_4d(format::yxfb, input_rnd); + VVVVF filter_rnd = generate_random_4d(1, 1, 2, 2, -10, 10); + VF filter_rnd_vec = flatten_4d(format::bfyx, filter_rnd); + VF bias_rnd = generate_random_1d(1, -10, 10); + VVVVF output_rnd(batch, VVVF(filter_rnd.size())); + for (size_t b = 0; b < output_rnd.size(); ++b) { + for (size_t of = 0; of < filter_rnd.size(); ++of) { + output_rnd[b][of] = reference_convolve(input_rnd[b], filter_rnd[of], 2, 2, bias_rnd[of]); + } + } + VF output_rnd_vec = flatten_4d(format::yxfb, output_rnd); + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 4 } }); + //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1,{ 2, 2 }, 1 } }); + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } }); + auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } }); + + set_values(input, input_rnd_vec); + set_values(weights, filter_rnd_vec); + set_values(biases, bias_rnd); + + topology topology( + input_layout("input", input.get_layout()), + data("weights", weights), + data("biases", biases), + convolution("conv", "input", {"weights"}, {"biases"}, {1,1,2,2}) + ); + + network network(engine, topology); + network.set_input_data("input", input); + + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "conv"); + + auto output_prim = outputs.begin()->second.get_memory(); + + auto output_ptr = output_prim.pointer(); + + for (size_t i = 0; i < output_rnd.size(); ++i) { + float x = float_round(output_rnd_vec[i]), y = float_round(output_ptr[i]); + EXPECT_FLOAT_EQ(x, y) << "random seed = " << random_seed << std::endl; + } +} + +TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in2x2x1x2_nopad_random) { + // Filter : 2x2 + // Stride : 2x2 + // Input : 2x2x1x2 + // Output : 1x1x1x2 + // + // Input: + // rnd rnd rnd rnd + // rnd rnd rnd rnd + // + // Filter: + // rnd rnd + // rnd rnd + // + // Bias: + // rnd + // + // Output: + // rnd rnd + + size_t batch = 2, input_f = 1, input_y = 2, input_x = 2; + + VVVVF input_rnd = generate_random_4d(batch, input_f, input_y, input_x, -10, 10); + VF input_rnd_vec = flatten_4d(format::yxfb, input_rnd); + VVVVF filter_rnd = generate_random_4d(1, 1, 2, 2, -10, 10); + VF filter_rnd_vec = flatten_4d(format::bfyx, filter_rnd); + VF bias_rnd = generate_random_1d(1, -10, 10); + VVVVF output_rnd(batch, VVVF(filter_rnd.size())); + for (size_t b = 0; b < output_rnd.size(); ++b) { + for (size_t of = 0; of < filter_rnd.size(); ++of) { + output_rnd[b][of] = reference_convolve(input_rnd[b], filter_rnd[of], 2, 2, bias_rnd[of]); + } + } + VF output_rnd_vec = flatten_4d(format::yxfb, output_rnd); + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 1, 2, 2 } }); + //auto output = memory::allocate({ memory::format::yxfb_f32,{ 2,{ 1, 1 }, 1 } }); + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } }); + auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } }); + + set_values(input, 
input_rnd_vec); + set_values(weights, filter_rnd_vec); + set_values(biases, bias_rnd); + + topology topology( + input_layout("input", input.get_layout()), + data("weights", weights), + data("biases", biases), + convolution("conv", "input", { "weights" }, { "biases" }, { 1,1,2,2 }) + ); + + network network(engine, topology); + network.set_input_data("input", input); + + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "conv"); + + auto output_prim = outputs.begin()->second.get_memory(); + + auto output_ptr = output_prim.pointer(); + + for (size_t i = 0; i < output_rnd.size(); ++i) { + float x = float_round(output_rnd_vec[i]), y = float_round(output_ptr[i]); + EXPECT_FLOAT_EQ(x, y) << "random seed = " << random_seed << std::endl; + } +} + +TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x1x1_nopad) { + // Filter : 2x2 + // Stride : 2x2 + // Input : 4x4 + // Output : 2x2 + // + // Input: + // -0.5 1 0.5 2 + // 1.5 -0.5 0 -1 + // 0.5 0.5 -1 1 + // 0.5 2 1.5 -0.5 + // + // Filter + // -2 0.5 + // 3.5 1.5 + // + // Bias // 2 // // Output: // 8 0.5 // 6 9 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 4 } }); //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1,{ 2, 2 }, 1 } }); @@ -919,7 +1336,7 @@ TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in2x2x1x2_nopad) { // // Output: // 3.65 -5.36 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 1, 2, 2 } }); //auto output = memory::allocate({ memory::format::yxfb_f32,{ 2,{ 1, 1 }, 1 } }); @@ -971,7 +1388,7 @@ TEST(convolution_f32_fw_gpu, basic_ofm_wsiz2x1x2x1_in1x2x1_nopad) { // 5.1 f=0 // -5.2 f=1 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 1, 2 } }); //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1 ,{ 1, 1 }, 2 } }); @@ -1030,7 +1447,7 @@ TEST(convolution_f32_fw_gpu, basic_ofm_wsiz3x2x2x1_in2x2x1_nopad) { // 64,0 f=1 // 103.0 f=2 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 2, 1, 2 } }); //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1 ,{ 1, 1 }, 3 } }); @@ -1086,7 +1503,7 @@ TEST(convolution_f32_fw_gpu, basic_wsiz2x2x1x3_wstr2x2_in2x2x1x1_nopad) { // 2.12 // 3.08 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } }); //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1 ,{ 1, 1 }, 3 } }); @@ -1142,7 +1559,7 @@ TEST(convolution_f32_fw_gpu, wsiz3x3_wstr2x2_in2x2x1x1_zeropad) { // // Output: // 12.25 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } }); //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1,{ 1, 1 }, 1 } }); @@ -1199,7 +1616,7 @@ TEST(convolution_f32_fw_gpu, offsets_wsiz3x3_wstr2x2_in2x2x1x1_zeropad) { // Output: // rnd rnd // rnd 2.0 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } }); //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1 ,{ 2, 2 }, 1 } }); @@ -1276,7 +1693,7 @@ TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x1_nopad_split2) { // 
8 3.65 0.5 -5.36 // 6 3.65 9 -5.36 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 2, 4, 4 } }); //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1,{ 2, 2 }, 2 } }); @@ -1333,83 +1750,262 @@ TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x1_nopad_split2) { EXPECT_FLOAT_EQ(-5.36f, get_value(output_ptr, 7)); } -TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2) { - // 2x Filter : 2x2 - // Stride : 2x2 - // Input : 2x4x4x2 - // Output : 2x2x2x2 - // - // Input: - // f0b0: -0.5 1 0.5 2 - // 1.5 -0.5 0 -1 - // 0.5 0.5 -1 1 - // 0.5 2 1.5 -0.5 - // - // f0b1: -0.5 1 0.5 2 - // 1.5 -0.5 0 -1 - // 0.5 0.5 -1 1 - // 0.5 2 1.5 -0.5 - // - // f1b0: 0.5 1.5 2.3 -0.4 - // 2.0 -4.0 1.0 3.0 - // 0.5 1.5 2.3 -0.4 - // 2.0 -4.0 1.0 3.0 - // - // f1b1: 0.5 1.5 2.3 -0.4 - // 2.0 -4.0 1.0 3.0 - // 0.5 1.5 2.3 -0.4 - // 2.0 -4.0 1.0 3.0 - // - // - // Filter1: - // -2 0.5 - // 3.5 1.5 - // - // Bias1: - // 2 - // - // Filter2: - // -1.2 1.5 - // 0.5 -0.5 - // - // Bias2: - // -1 - - // Output: - // 8 8 3.65 3.65 0.5 0.5 -5.36 -5.36 - // 6 6 3.65 3.65 9 9 -5.36 -5.36 +TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2) { + // 2x Filter : 2x2 + // Stride : 2x2 + // Input : 2x4x4x2 + // Output : 2x2x2x2 + // + // Input: + // f0b0: -0.5 1 0.5 2 + // 1.5 -0.5 0 -1 + // 0.5 0.5 -1 1 + // 0.5 2 1.5 -0.5 + // + // f0b1: -0.5 1 0.5 2 + // 1.5 -0.5 0 -1 + // 0.5 0.5 -1 1 + // 0.5 2 1.5 -0.5 + // + // f1b0: 0.5 1.5 2.3 -0.4 + // 2.0 -4.0 1.0 3.0 + // 0.5 1.5 2.3 -0.4 + // 2.0 -4.0 1.0 3.0 + // + // f1b1: 0.5 1.5 2.3 -0.4 + // 2.0 -4.0 1.0 3.0 + // 0.5 1.5 2.3 -0.4 + // 2.0 -4.0 1.0 3.0 + // + // + // Filter1: + // -2 0.5 + // 3.5 1.5 + // + // Bias1: + // 2 + // + // Filter2: + // -1.2 1.5 + // 0.5 -0.5 + // + // Bias2: + // -1 + + // Output: + // 8 8 3.65 3.65 0.5 0.5 -5.36 -5.36 + // 6 6 3.65 3.65 9 9 -5.36 -5.36 + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 4, 4 } }); + //auto output = memory::allocate({ memory::format::yxfb_f32,{ 2,{ 2, 2 }, 2 } }); + auto weights1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } }); + auto biases1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } }); + auto weights2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } }); + auto biases2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } }); + + set_values(input, { + -0.5f, -0.5f, 0.5f, 0.5f, 1.0f, 1.0f, 1.5f, 1.5f, 0.5f, 0.5f, 2.3f, 2.3f, 2.0f, 2.0f, -0.4f, -0.4f, + 1.5f, 1.5f, 2.0f, 2.0f, -0.5f, -0.5f, -4.0f, -4.0f, 0.0f, 0.0f, 1.0f, 1.0f, -1.0f, -1.0f, 3.0f, 3.0f, + 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 1.5f, 1.5f, -1.0f, -1.0f, 2.3f, 2.3f, 1.0f, 1.0f, -0.4f, -0.4f, + 0.5f, 0.5f, 2.0f, 2.0f, 2.0f, 2.0f, -4.0f, -4.0f, 1.5f, 1.5f, 1.0f, 1.0f, -0.5f, -0.5f, 3.0f, 3.0f, + }); + set_values(weights1, { -2.0f, 0.5f, 3.5f, 1.5f }); + set_values(biases1, { 2.0f }); + set_values(weights2, { -1.2f, 1.5f, 0.5f, -0.5f }); + set_values(biases2, { -1.0f }); + + topology topology( + input_layout("input", input.get_layout()), + data("weights1", weights1), + data("biases1", biases1), + data("weights2", weights2), + data("biases2", biases2), + convolution( + "conv", + "input", + { "weights1", "weights2" }, + { "biases1", "biases2" }, + { 1,1,2,2 }, + { 0,0,0,0 }, + { 1,1,1,1 }) + ); + + network network(engine, topology); 
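+    // A hedged reading of the expectations below: two weight/bias pairs make
+    // this a split = 2 convolution, so each 1(ofm) x 1(ifm) x 2x2 filter
+    // consumes one of the two input feature maps; and because the yxfb input
+    // is duplicated across the batch, every expected value appears twice in
+    // a row in the checks that follow.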
+ network.set_input_data("input", input); + + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "conv"); + + auto output_prim = outputs.begin()->second.get_memory(); + + auto output_ptr = output_prim.pointer(); + + EXPECT_FLOAT_EQ(8.0f, get_value(output_ptr, 0)); + EXPECT_FLOAT_EQ(8.0f, get_value(output_ptr, 1)); + EXPECT_FLOAT_EQ(3.65f, get_value(output_ptr, 2)); + EXPECT_FLOAT_EQ(3.65f, get_value(output_ptr, 3)); + EXPECT_FLOAT_EQ(0.5f, get_value(output_ptr, 4)); + EXPECT_FLOAT_EQ(0.5f, get_value(output_ptr, 5)); + EXPECT_FLOAT_EQ(-5.36f, get_value(output_ptr, 6)); + EXPECT_FLOAT_EQ(-5.36f, get_value(output_ptr, 7)); + EXPECT_FLOAT_EQ(6.0f, get_value(output_ptr, 8)); + EXPECT_FLOAT_EQ(6.0f, get_value(output_ptr, 9)); + EXPECT_FLOAT_EQ(3.65f, get_value(output_ptr, 10)); + EXPECT_FLOAT_EQ(3.65f, get_value(output_ptr, 11)); + EXPECT_FLOAT_EQ(9.0f, get_value(output_ptr, 12)); + EXPECT_FLOAT_EQ(9.0f, get_value(output_ptr, 13)); + EXPECT_FLOAT_EQ(-5.36f, get_value(output_ptr, 14)); + EXPECT_FLOAT_EQ(-5.36f, get_value(output_ptr, 15)); +} + +TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x1_nopad_group2) { + // data is similar as in basic_wsiz2x2_wstr2x2_in4x4x2x1_nopad_split2 + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 4, 4 } }); + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 2, 2 } }); + auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 1 } }); + + set_values(input, { + -0.5f, 0.5f, 1.0f, 1.5f, 0.5f, 2.3f, 2.0f, -0.4f, + 1.5f, 2.0f, -0.5f, -4.0f, 0.0f, 1.0f, -1.0f, 3.0f, + 0.5f, 0.5f, 0.5f, 1.5f, -1.0f, 2.3f, 1.0f, -0.4f, + 0.5f, 2.0f, 2.0f, -4.0f, 1.5f, 1.0f, -0.5f, 3.0f + }); + set_values(weights, { + -2.0f, 0.5f, 3.5f, 1.5f, + -1.2f, 1.5f, 0.5f, -0.5f + }); + set_values(biases, { 2.0f, -1.0f }); + + topology topology( + input_layout("input", input.get_layout()), + data("weights", weights), + data("biases", biases), + convolution( + "conv", + "input", + { "weights" }, + { "biases" }, + 2, // number of groups + { 0,0,2,2 }, + { 0,0,0,0 }, + { 1,1,1,1 }) + ); + + network network(engine, topology); + network.set_input_data("input", input); + + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "conv"); + + auto output_prim = outputs.begin()->second.get_memory(); + + auto output_ptr = output_prim.pointer(); + + EXPECT_FLOAT_EQ(8.0f, get_value(output_ptr, 0)); + EXPECT_FLOAT_EQ(3.65f, get_value(output_ptr, 1)); + EXPECT_FLOAT_EQ(0.5f, get_value(output_ptr, 2)); + EXPECT_FLOAT_EQ(-5.36f, get_value(output_ptr, 3)); + EXPECT_FLOAT_EQ(6.0f, get_value(output_ptr, 4)); + EXPECT_FLOAT_EQ(3.65f, get_value(output_ptr, 5)); + EXPECT_FLOAT_EQ(9.0f, get_value(output_ptr, 6)); + EXPECT_FLOAT_EQ(-5.36f, get_value(output_ptr, 7)); +} + +TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x1_nopad_group2_bfyx) { + // data is similar as in basic_wsiz2x2_wstr2x2_in4x4x2x1_nopad_split2 + + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 4, 4 } }); + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 2, 2 } }); + auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 1 } }); + + set_values(input, { + -0.5f, 0.5f, 1.0f, 1.5f, 0.5f, 2.3f, 2.0f, -0.4f, + 1.5f, 2.0f, -0.5f, -4.0f, 0.0f, 1.0f, -1.0f, 3.0f, + 0.5f, 0.5f, 0.5f, 1.5f, -1.0f, 2.3f, 1.0f, -0.4f, + 
0.5f, 2.0f, 2.0f, -4.0f, 1.5f, 1.0f, -0.5f, 3.0f + }); + set_values(weights, { + -2.0f, 0.5f, 3.5f, 1.5f, + -1.2f, 1.5f, 0.5f, -0.5f + }); + set_values(biases, { 2.0f, -1.0f }); + + topology topology( + input_layout("input", input.get_layout()), + reorder("input_1", "input", { data_types::f32,format::bfyx,{ 1, 2, 4, 4 } }), + data("weights", weights), + data("biases", biases), + convolution( + "conv", + "input_1", + { "weights" }, + { "biases" }, + 2, // number of groups + { 0,0,2,2 }, + { 0,0,0,0 }, + { 1,1,1,1 }) + ); + + network network(engine, topology); + network.set_input_data("input", input); + + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "conv"); + + auto output_prim = outputs.begin()->second.get_memory(); + + auto output_ptr = output_prim.pointer(); + + EXPECT_FLOAT_EQ(8.0f, get_value(output_ptr, 0)); + EXPECT_FLOAT_EQ(0.5f, get_value(output_ptr, 1)); + EXPECT_FLOAT_EQ(6.0f, get_value(output_ptr, 2)); + EXPECT_FLOAT_EQ(9.0f, get_value(output_ptr, 3)); + EXPECT_FLOAT_EQ(3.65f, get_value(output_ptr, 4)); + EXPECT_FLOAT_EQ(-5.36f, get_value(output_ptr, 5)); + EXPECT_FLOAT_EQ(3.65f, get_value(output_ptr, 6)); + EXPECT_FLOAT_EQ(-5.36f, get_value(output_ptr, 7)); +} + +TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_group2) { + // data is similar as in basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2 engine engine; - auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 4, 4 } }); - //auto output = memory::allocate({ memory::format::yxfb_f32,{ 2,{ 2, 2 }, 2 } }); - auto weights1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } }); - auto biases1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } }); - auto weights2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } }); - auto biases2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } }); + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 4, 4 } }); + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 2, 2 } }); + auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 1 } }); set_values(input, { - -0.5f, -0.5f, 0.5f, 0.5f, 1.0f, 1.0f, 1.5f, 1.5f, 0.5f, 0.5f, 2.3f, 2.3f, 2.0f, 2.0f, -0.4f, -0.4f, + -0.5f, -0.5f, 0.5f, 0.5f, 1.0f, 1.0f, 1.5f, 1.5f, 0.5f, 0.5f, 2.3f, 2.3f, 2.0f, 2.0f, -0.4f, -0.4f, 1.5f, 1.5f, 2.0f, 2.0f, -0.5f, -0.5f, -4.0f, -4.0f, 0.0f, 0.0f, 1.0f, 1.0f, -1.0f, -1.0f, 3.0f, 3.0f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 1.5f, 1.5f, -1.0f, -1.0f, 2.3f, 2.3f, 1.0f, 1.0f, -0.4f, -0.4f, 0.5f, 0.5f, 2.0f, 2.0f, 2.0f, 2.0f, -4.0f, -4.0f, 1.5f, 1.5f, 1.0f, 1.0f, -0.5f, -0.5f, 3.0f, 3.0f, }); - set_values(weights1, { -2.0f, 0.5f, 3.5f, 1.5f }); - set_values(biases1, { 2.0f }); - set_values(weights2, { -1.2f, 1.5f, 0.5f, -0.5f }); - set_values(biases2, { -1.0f }); + set_values(weights, { + -2.0f, 0.5f, 3.5f, 1.5f, + -1.2f, 1.5f, 0.5f, -0.5f + }); + set_values(biases, { 2.0f, -1.0f }); topology topology( input_layout("input", input.get_layout()), - data("weights1", weights1), - data("biases1", biases1), - data("weights2", weights2), - data("biases2", biases2), + data("weights", weights), + data("biases", biases), convolution( "conv", "input", - { "weights1", "weights2" }, - { "biases1", "biases2" }, + { "weights" }, + { "biases" }, + 2, // number of groups { 1,1,2,2 }, { 0,0,0,0 }, { 1,1,1,1 }) @@ -1426,20 +2022,20 @@ TEST(convolution_f32_fw_gpu, 
basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2) { auto output_ptr = output_prim.pointer(); - EXPECT_FLOAT_EQ(8.0f, get_value(output_ptr, 0)); - EXPECT_FLOAT_EQ(8.0f, get_value(output_ptr, 1)); - EXPECT_FLOAT_EQ(3.65f, get_value(output_ptr, 2)); - EXPECT_FLOAT_EQ(3.65f, get_value(output_ptr, 3)); - EXPECT_FLOAT_EQ(0.5f, get_value(output_ptr, 4)); - EXPECT_FLOAT_EQ(0.5f, get_value(output_ptr, 5)); + EXPECT_FLOAT_EQ(8.0f, get_value(output_ptr, 0)); + EXPECT_FLOAT_EQ(8.0f, get_value(output_ptr, 1)); + EXPECT_FLOAT_EQ(3.65f, get_value(output_ptr, 2)); + EXPECT_FLOAT_EQ(3.65f, get_value(output_ptr, 3)); + EXPECT_FLOAT_EQ(0.5f, get_value(output_ptr, 4)); + EXPECT_FLOAT_EQ(0.5f, get_value(output_ptr, 5)); EXPECT_FLOAT_EQ(-5.36f, get_value(output_ptr, 6)); EXPECT_FLOAT_EQ(-5.36f, get_value(output_ptr, 7)); - EXPECT_FLOAT_EQ(6.0f, get_value(output_ptr, 8)); - EXPECT_FLOAT_EQ(6.0f, get_value(output_ptr, 9)); - EXPECT_FLOAT_EQ(3.65f, get_value(output_ptr, 10)); - EXPECT_FLOAT_EQ(3.65f, get_value(output_ptr, 11)); - EXPECT_FLOAT_EQ(9.0f, get_value(output_ptr, 12)); - EXPECT_FLOAT_EQ(9.0f, get_value(output_ptr, 13)); + EXPECT_FLOAT_EQ(6.0f, get_value(output_ptr, 8)); + EXPECT_FLOAT_EQ(6.0f, get_value(output_ptr, 9)); + EXPECT_FLOAT_EQ(3.65f, get_value(output_ptr, 10)); + EXPECT_FLOAT_EQ(3.65f, get_value(output_ptr, 11)); + EXPECT_FLOAT_EQ(9.0f, get_value(output_ptr, 12)); + EXPECT_FLOAT_EQ(9.0f, get_value(output_ptr, 13)); EXPECT_FLOAT_EQ(-5.36f, get_value(output_ptr, 14)); EXPECT_FLOAT_EQ(-5.36f, get_value(output_ptr, 15)); } @@ -1448,7 +2044,7 @@ TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2_depthw // Test for depthwise separable optimization, there are 16 weights and biases (split 16) // data is similar as in basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2 but with batch 1 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 16, 4, 4 } }); @@ -1511,8 +2107,204 @@ TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2_depthw convolution( "conv", "input", - weights_vec, - bias_vec, + weights_vec, + bias_vec, + { 1,1,2,2 }, + { 0,0,0,0 }, + { 1,1,1,1 }) + ); + + network network(engine, topology); + network.set_input_data("input", input); + + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "conv"); + + auto output_prim = outputs.begin()->second.get_memory(); + + auto output_ptr = output_prim.pointer(); + + std::vector expected_output_vec = { + 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, + 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, + 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, + 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, + }; + + for (unsigned int i = 0; i < expected_output_vec.size(); i++) + { + EXPECT_FLOAT_EQ(expected_output_vec[i], 
output_ptr[i]); + } +} + +TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2_depthwise_sep_opt_bfyx) { + // Test for depthwise separable optimization, there are 16 weights and biases (split 16) + // data is similar as in basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2 but with batch 1 + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 16, 4, 4 } }); + + set_values(input, { + -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, + 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, + -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, + 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, + -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, + 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, + -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, + 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, + -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, + 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, + -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, + 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, + -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, + 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, + -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, + 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, + }); + + topology topology(input_layout("input", input.get_layout())); + + std::vector weights_vec; + std::vector bias_vec; + + for (uint32_t i = 0; i < 8; i++) + { + auto weights1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); + auto biases1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + auto weights2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); + auto biases2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + + set_values(weights1, { -2.0f, 0.5f, 3.5f, 1.5f }); + set_values(biases1, { 2.0f }); + set_values(weights2, { -1.2f, 1.5f, 0.5f, -0.5f }); + set_values(biases2, { -1.0f }); + + primitive_id weights_id = "weights_" + std::to_string(i); + primitive_id weights2_id = "weights2_" + std::to_string(i); + primitive_id bias_id = "biases_" + std::to_string(i); + primitive_id bias2_id = "biases2_" + std::to_string(i); + + weights_vec.push_back(weights_id); + weights_vec.push_back(weights2_id); + bias_vec.push_back(bias_id); + bias_vec.push_back(bias2_id); + + topology.add( + data(weights_id, weights1), + data(bias_id, biases1), + data(weights2_id, weights2), + data(bias2_id, biases2) + ); + + } + + topology.add( + convolution( + "conv", + "input", + weights_vec, + bias_vec, + { 1,1,2,2 }, + { 0,0,0,0 }, + { 1,1,1,1 }) + ); + + network network(engine, topology); + network.set_input_data("input", input); 
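+    // Hedged note on the expected data: the loop above registered 8 identical
+    // (weights1, weights2) pairs, i.e. split = 16 with only two distinct 2x2
+    // filters, so the depthwise-separable path should simply repeat the
+    // split-2 pattern { 8, 0.5, 6, 9, 3.65, -5.36, 3.65, -5.36 } for every
+    // row of the bfyx output checked below.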
+ + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "conv"); + + auto output_prim = outputs.begin()->second.get_memory(); + + auto output_ptr = output_prim.pointer(); + + std::vector expected_output_vec = { + 8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f, + 8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f, + 8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f, + 8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f, + 8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f, + 8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f, + 8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f, + 8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f, + }; + + for (unsigned int i = 0; i < expected_output_vec.size(); i++) + { + EXPECT_FLOAT_EQ(expected_output_vec[i], output_ptr[i]); + } +} + +TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_group16) { + // Test for grouped convolution, there are 16 joined weights and biases (group 16) + // data is similar as in basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2_depthwise_sep_opt + + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 16, 4, 4 } }); + + set_values(input, { + -0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f, + 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, + 0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f, + 2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f, + 1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f, + -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, + 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, + -1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f, + 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, + 0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f, + -1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f, + 1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f, 
1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f, + 0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f, + 2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f, + 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, + -0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f, + }); + + topology topology(input_layout("input", input.get_layout())); + + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 16, 1, 2, 2 } }); + auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 16, 1 } }); + + set_values(weights, + { + -2.0f, 0.5f, 3.5f, 1.5f, + -1.2f, 1.5f, 0.5f, -0.5f, + -2.0f, 0.5f, 3.5f, 1.5f, + -1.2f, 1.5f, 0.5f, -0.5f, + -2.0f, 0.5f, 3.5f, 1.5f, + -1.2f, 1.5f, 0.5f, -0.5f, + -2.0f, 0.5f, 3.5f, 1.5f, + -1.2f, 1.5f, 0.5f, -0.5f, + -2.0f, 0.5f, 3.5f, 1.5f, + -1.2f, 1.5f, 0.5f, -0.5f, + -2.0f, 0.5f, 3.5f, 1.5f, + -1.2f, 1.5f, 0.5f, -0.5f, + -2.0f, 0.5f, 3.5f, 1.5f, + -1.2f, 1.5f, 0.5f, -0.5f, + -2.0f, 0.5f, 3.5f, 1.5f, + -1.2f, 1.5f, 0.5f, -0.5f + } + ); + set_values(biases, { 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f}); + + topology.add( + data("weights", weights), + data("bias", biases) + ); + + topology.add( + convolution( + "conv", + "input", + { "weights" }, + { "bias" }, + 16, { 1,1,2,2 }, { 0,0,0,0 }, { 1,1,1,1 }) @@ -1542,9 +2334,9 @@ TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2_depthw } } -TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2_depthwise_sep_opt_bfyx) { - // Test for depthwise separable optimization, there are 16 weights and biases (split 16) - // data is similar as in basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2 but with batch 1 +TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_group16_bfyx) { + // Test for grouped convolution, there are 16 joined weights and biases (group 16) + // data is similar as in basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2_depthwise_sep_opt_bfyx engine engine; auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 16, 4, 4 } }); @@ -1570,46 +2362,44 @@ TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2_depthw topology topology(input_layout("input", input.get_layout())); - std::vector weights_vec; - std::vector bias_vec; - - for (uint32_t i = 0; i < 8; i++) - { - auto weights1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); - auto biases1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); - auto weights2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); - auto biases2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); - - set_values(weights1, { -2.0f, 0.5f, 3.5f, 1.5f }); - set_values(biases1, { 2.0f }); - set_values(weights2, { -1.2f, 1.5f, 0.5f, -0.5f }); - 
set_values(biases2, { -1.0f }); - - primitive_id weights_id = "weights_" + std::to_string(i); - primitive_id weights2_id = "weights2_" + std::to_string(i); - primitive_id bias_id = "biases_" + std::to_string(i); - primitive_id bias2_id = "biases2_" + std::to_string(i); + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 16, 1, 2, 2 } }); + auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 16, 1 } }); - weights_vec.push_back(weights_id); - weights_vec.push_back(weights2_id); - bias_vec.push_back(bias_id); - bias_vec.push_back(bias2_id); + set_values(weights, + { + -2.0f, 0.5f, 3.5f, 1.5f, + -1.2f, 1.5f, 0.5f, -0.5f, + -2.0f, 0.5f, 3.5f, 1.5f, + -1.2f, 1.5f, 0.5f, -0.5f, + -2.0f, 0.5f, 3.5f, 1.5f, + -1.2f, 1.5f, 0.5f, -0.5f, + -2.0f, 0.5f, 3.5f, 1.5f, + -1.2f, 1.5f, 0.5f, -0.5f, + -2.0f, 0.5f, 3.5f, 1.5f, + -1.2f, 1.5f, 0.5f, -0.5f, + -2.0f, 0.5f, 3.5f, 1.5f, + -1.2f, 1.5f, 0.5f, -0.5f, + -2.0f, 0.5f, 3.5f, 1.5f, + -1.2f, 1.5f, 0.5f, -0.5f, + -2.0f, 0.5f, 3.5f, 1.5f, + -1.2f, 1.5f, 0.5f, -0.5f + } + ); - topology.add( - data(weights_id, weights1), - data(bias_id, biases1), - data(weights2_id, weights2), - data(bias2_id, biases2) - ); + set_values(biases, { 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f}); - } + topology.add( + data("weights", weights), + data("bias", biases) + ); topology.add( convolution( "conv", "input", - weights_vec, - bias_vec, + { "weights" }, + { "bias" }, + 16, { 1,1,2,2 }, { 0,0,0,0 }, { 1,1,1,1 }) @@ -1659,7 +2449,7 @@ TEST(convolution_f32_fw_gpu, basic_wsiz1x1_wstr2x2_in1x1x4x1_nopad_split2) { // // Filter1: // -2 -0.5 ofm=0 - // 1 2 ofm=1 + // 1 2 ofm=1 // Bias1: // 1 5 // @@ -1671,13 +2461,13 @@ TEST(convolution_f32_fw_gpu, basic_wsiz1x1_wstr2x2_in1x1x4x1_nopad_split2) { // -1 2.5 // // Output: - // -2.25 + // -2.25 // 7.5 // // -1.75 // 2.25 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 4, 1, 1 } }); //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1,{ 1, 1 }, 4 } }); @@ -1740,7 +2530,7 @@ TEST(convolution_f32_fw_gpu, basic_wsiz1x1_wstr2x2_in1x1x2x1_nopad_split2) { // // Filter1: // -2 ofm=0 - // 1 ofm=1 + // 1 ofm=1 // Bias1: // 1 5 // @@ -1752,14 +2542,14 @@ TEST(convolution_f32_fw_gpu, basic_wsiz1x1_wstr2x2_in1x1x2x1_nopad_split2) { // -1 2.5 // // Output: - // -2 + // -2 // 6.5 // // 1 // 3.5 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 2, 1, 1 } }); //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1,{ 1, 1 }, 4 } }); @@ -1838,7 +2628,7 @@ TEST(convolution_f32_fw_gpu, basic_wsiz1x1_wstr2x2_in1x1x4x1_filter_1x3x2x1x1_no // -1 2.5 2 // // Output: - // -1.5 + // -1.5 // 8 // 7.75 // @@ -1847,7 +2637,7 @@ TEST(convolution_f32_fw_gpu, basic_wsiz1x1_wstr2x2_in1x1x4x1_filter_1x3x2x1x1_no // -2 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 4, 1, 1 } }); //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1,{ 1, 1 }, 6 } }); @@ -1924,7 +2714,7 @@ TEST(convolution_gpu, trivial_convolution_relu) { // 4 0.0 // 2 5 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 4 } }); //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1 ,{ 2, 2 }, 1 } }); @@ -1998,7 +2788,7 @@ 
TEST(convolution_gpu, relu_with_negative_slope) { // 4 -0.35 // 2 5 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 4 } }); //auto output = memory::allocate({ memory::format::yxfb_f32,{ 1 ,{ 2, 2 }, 1 } }); @@ -2049,7 +2839,7 @@ TEST(convolution_gpu, relu_with_negative_slope) { TEST(convolution_gpu, DISABLED_two_1x1_kernels_after_each_other) { - engine engine; + const auto& engine = get_test_engine(); extern const std::vector conv_1x1_output; @@ -2091,7 +2881,7 @@ TEST(convolution_gpu, DISABLED_two_1x1_kernels_after_each_other) { auto output_ptr = output_prim.pointer(); auto output_layout = output_prim.get_layout(); - + int y_size = output_layout.size.spatial[1]; int x_size = output_layout.size.spatial[0]; int f_size = output_layout.size.feature[0]; @@ -2140,7 +2930,7 @@ TEST(convolution_gpu, basic_yxfb_4_4_yxfb_2_2_b16_if2_of16_st2_2_p0_sp1_fp32) const int32_t output_x = (input_x - weights_x) / stride_x + 1; const int32_t output_y = (input_y - weights_y) / stride_y + 1; - engine engine; + const auto& engine = get_test_engine(); auto input_size = tensor( batch_size, input_feature_count, input_x, input_y ); auto input = memory::allocate(engine, { data_types::f32, input_format, input_size }); @@ -2311,7 +3101,7 @@ void quantize_weights(cldnn::memory& weights, cldnn::memory& w_qf) for (int w = 0; w < batch_pitch; w++) if (max < abs(ptr[ofm* batch_pitch + w])) max = abs(ptr[ofm* batch_pitch + w]); - + if (max == (T)0) max = (T)1; // do not quantize @@ -2429,7 +3219,7 @@ TEST(convolution_f32_fw_gpu, byte_activation) { engine_configuration eng_conf(false, false, false, "", "", true, "", "kernels"); engine engine{ eng_conf }; auto input = memory::allocate(engine, { data_types::i8, format::bfyx,{ 1, 1, 5, 4 } }); - + VVVF output_vec = { { { 11, 0, 15 }, @@ -2505,7 +3295,7 @@ TEST(convolution_f32_fw_gpu, quantized_convolution_low_prec_single_ofq) { // Bias: // 1 -8 - engine engine; + const auto& engine = get_test_engine(); auto input_f = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 5, 4 } }); auto weights_f = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 3, 2 } }); @@ -2518,11 +3308,11 @@ TEST(convolution_f32_fw_gpu, quantized_convolution_low_prec_single_ofq) { set_values(biases, { 1.0f, -8.0f }); VVVF output_vec = { - { + { { 21.0f, 28.0f, 39.0f }, { 18.0f, 20.0f, 20.0f } }, - { + { { 155.0f, 245.0f, 348.0f }, { 142.0f, 140.0f, 178.0f } } }; @@ -2546,7 +3336,7 @@ TEST(convolution_f32_fw_gpu, quantized_convolution_low_prec_single_ofq) { auto input = memory::allocate(engine, { data_types::i8, format::bfyx,{ 1, 1, 5, 4 } }); auto weights = memory::allocate(engine, { data_types::i8, format::bfyx,{ 2, 1, 3, 2 } }); - float i_qf = 1.0f; + float i_qf = 1.0f; float o_qf = 127.0f / max_abs(output_memory_f); std::vector weights_values = { 1, 2, 1, 2, 1, 2, 19, 17, -1, -10, 32, 23 }; @@ -2618,7 +3408,7 @@ TEST(convolution_f32_fw_gpu, quantized_convolution_high_prec_calib_per_ofm) { // // Bias: // 1 -8 - engine engine; + const auto& engine = get_test_engine(); auto input_f = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 5, 4 } }); auto weights_f = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 3, 2 } }); @@ -2656,10 +3446,10 @@ TEST(convolution_f32_fw_gpu, quantized_convolution_high_prec_calib_per_ofm) { auto output_memory_f = outputs_f.at("conv_f").get_memory(); auto output_ptr_f = output_memory_f.pointer(); - + auto input = 
memory::allocate(engine, { data_types::i8, format::bfyx,{ 1, 1, 5, 4 } }); auto weights = memory::allocate(engine, { data_types::i8, format::bfyx,{ 2, 1, 3, 2 } }); - float i_qf = 1.0f; + float i_qf = 1.0f; std::vector weights_values = { 1, 2, 1, 2, 1, 2, 19, 17, -1, -10, 32, 23 }; set_values(input, { 1, 2, 3, 4, 5, 2, 2, 3, 4, 6, 3, 3, 3, 5, 1, 1, 1, 1, 1, 1 }); @@ -2751,7 +3541,7 @@ TEST(convolution_f32_fw_gpu, calibration_advance) { // Bias2: // 2 4 0 - engine engine; + const auto& engine = get_test_engine(); auto input_f = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 5, 4 } }); auto weights_f = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 3, 2 } }); @@ -2762,10 +3552,10 @@ TEST(convolution_f32_fw_gpu, calibration_advance) { auto w_qf_2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 3, 1 } }); std::vector weights_values_f = { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.9f, 1.7f, -1.0f, -1.0f, 3.2f, 2.3f }; - std::vector weights_values_f_2 = { + std::vector weights_values_f_2 = { 1.5f, 2.3f, -1.0f, 3.0f, 5.6f, -1.0f, 3.0f, 5.6f, -1.0f, 1.0f, 2.0f, 3.0f, - + 1.9f, 1.7f, -1.0f, 1.9f, 1.7f, -1.0f, -1.0f, 3.2f, 2.3f, -1.0f, 3.2f, 2.3f, @@ -2835,19 +3625,380 @@ TEST(convolution_f32_fw_gpu, calibration_advance) { auto o_qf = output_calibrations_2.pointer(); for (int f = 0; f < out_size.feature[0]; f++) - for (int y = 0; y < out_size.spatial[1]; ++y) { - for (int x = 0; x < out_size.spatial[0]; ++x) { - EXPECT_NEAR(ref_ptr[x + out_size.spatial[0] * (y + out_size.spatial[1]*f)], ((float)test_ptr[x + out_size.spatial[0] * (y + out_size.spatial[1] * f)]) / o_qf[f], 3.0f); + { + for (int y = 0; y < out_size.spatial[1]; ++y) + { + for (int x = 0; x < out_size.spatial[0]; ++x) + { + EXPECT_NEAR(ref_ptr[x + out_size.spatial[0] + * (y + out_size.spatial[1] * f)], ((float)test_ptr[x + out_size.spatial[0] + * (y + out_size.spatial[1] * f)]) / o_qf[f], 3.0f); } } + } + +} +TEST(convolution_f32_fw_gpu, local_basic) { + // Filter : 3x3x2x2 - local sizes + // Stride : 1x1 + // Input : 4x4 + // Output : 3x3 + // + // Input: + // 1 1 1 1 + // 1 1 1 1 + // 2 2 2 2 + // 2 2 2 2 + // + // + // Filter: + // 0 0 1 1 2 2 + // 0 0 1 1 2 2 + // + // 3 3 4 4 5 5 + // 3 3 4 4 5 5 + // + // 6 6 7 7 8 8 + // 6 6 7 7 8 8 + // + // Output: + // 0 4 8 + // 18 24 30 + // 48 56 64 + // + + const auto& engine = get_test_engine(); + tensor local_size = tensor(1,1,2,2,3,3); + auto input_f = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 4 } }); + auto weights_f = memory::allocate(engine, { data_types::f32, format::bf_lyx_yx, local_size }); + cldnn::memory biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + + std::vector weights_values_f = { + 0.0, 0.0, 0.0, 0.0, + 1.0, 1.0, 1.0, 1.0, + 2.0, 2.0, 2.0, 2.0, + + 3.0, 3.0, 3.0, 3.0, + 4.0, 4.0, 4.0, 4.0, + 5.0, 5.0, 5.0, 5.0, + + 6.0, 6.0, 6.0, 6.0, + 7.0, 7.0, 7.0, 7.0, + 8.0, 8.0, 8.0, 8.0, + }; + set_values(input_f, { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0 }); + set_values(weights_f, weights_values_f); + set_values(biases, { 0.0f }); + std::vector output_vec = + { + 0.0f, 4.0f, 8.0f, + 18.0f, 24.0f, 30.0f, + 48.0f, 56.0f, 64.0f + }; + + topology topology_f( + input_layout("input_f", input_f.get_layout()), + data("weights_f", weights_f), + data("biases", biases), + convolution("conv_f", "input_f", { "weights_f" }, { "biases" }, { 0, 0, 1, 1 })); + + build_options opts; + opts.set_option(build_option::optimize_data(true)); + network 
network_f(engine, topology_f, opts); + network_f.set_input_data("input_f", input_f); + + auto outputs_f = network_f.execute(); + EXPECT_EQ(outputs_f.begin()->first, "conv_f"); + + auto output_memory_f = outputs_f.at("conv_f").get_memory(); + auto output_ptr_f = output_memory_f.pointer(); + unsigned int cntr = 0; + for (auto fl : output_ptr_f) + EXPECT_FLOAT_EQ(fl, output_vec[cntr++]); +} + + +TEST(convolution_f32_fw_gpu, local_multi_out_features) { + // Filter : 3x1x3x3x2x2 - local sizes + // Stride : 1x1 + // Input : 4x4 + // Output : 3x3x3 + // + // Input: + // 1 1 1 1 + // 1 1 1 1 + // 2 2 2 2 + // 2 2 2 2 + // + // + // Filter: + // 0 0 1 1 2 2 --- 1 ofm + // 0 0 1 1 2 2 + // + // 3 3 4 4 5 5 + // 3 3 4 4 5 5 + // + // 6 6 7 7 8 8 + // 6 6 7 7 8 8 + // + // 0 0 0 0 0 0 --- 2 ofm + // 0 0 0 0 0 0 + // + // 0 0 0 0 0 0 + // 0 0 0 0 0 0 + // + // 0 0 0 0 0 0 + // 0 0 0 0 0 0 + // + // 0 0 2 2 4 4 --- 3 ofm + // 0 0 2 2 4 4 + // + // 6 6 8 8 1 1 + // 6 6 8 8 1 1 + // + // 3 3 5 5 7 7 + // 3 3 5 5 7 7 + // + + // + // Output: + // 0 4 8 + // 18 24 30 + // 48 56 64 + // + // 0 0 0 + // 0 0 0 + // 0 0 0 + // + // 0 8 16 + // 36 48 6 + // 24 40 56 + // + + const auto& engine = get_test_engine(); + tensor local_size = tensor(3,1,2,2,3,3); + auto input_f = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 4 } }); + auto weights_f = memory::allocate(engine, { data_types::f32, format::bf_lyx_yx, local_size }); + cldnn::memory biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 3, 1 } }); + + std::vector weights_values_f = { + 0.0, 0.0, 0.0, 0.0, + 1.0, 1.0, 1.0, 1.0, + 2.0, 2.0, 2.0, 2.0, + + 3.0, 3.0, 3.0, 3.0, + 4.0, 4.0, 4.0, 4.0, + 5.0, 5.0, 5.0, 5.0, + + 6.0, 6.0, 6.0, 6.0, + 7.0, 7.0, 7.0, 7.0, + 8.0, 8.0, 8.0, 8.0, + + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + + 0.0, 0.0, 0.0, 0.0, + 2.0, 2.0, 2.0, 2.0, + 4.0, 4.0, 4.0, 4.0, + + 6.0, 6.0, 6.0, 6.0, + 8.0, 8.0, 8.0, 8.0, + 1.0, 1.0, 1.0, 1.0, + + 3.0, 3.0, 3.0, 3.0, + 5.0, 5.0, 5.0, 5.0, + 7.0, 7.0, 7.0, 7.0, + }; + set_values(input_f, { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0 }); + set_values(weights_f, weights_values_f); + set_values(biases, { 0.0f, 0.0f, 0.0f }); + std::vector output_vec = + { + 0.0f, 4.0f, 8.0f, + 18.0f, 24.0f, 30.0f, + 48.0f, 56.0f, 64.0f, + + 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, + + 0.0f, 8.0f, 16.0f, + 36.0f, 48.0f, 6.0f, + 24.0f, 40.0f, 56.0f, + }; + + topology topology_f( + input_layout("input_f", input_f.get_layout()), + data("weights_f", weights_f), + data("biases", biases), + convolution("conv_f", "input_f", { "weights_f" }, { "biases" }, { 0, 0, 1, 1 })); + + build_options opts; + opts.set_option(build_option::optimize_data(true)); + network network_f(engine, topology_f, opts); + network_f.set_input_data("input_f", input_f); + + auto outputs_f = network_f.execute(); + EXPECT_EQ(outputs_f.begin()->first, "conv_f"); + + auto output_memory_f = outputs_f.at("conv_f").get_memory(); + auto output_ptr_f = output_memory_f.pointer(); + unsigned int cntr = 0; + for (auto fl : output_ptr_f) + { + EXPECT_FLOAT_EQ(fl, output_vec[cntr++]); + } +} + +TEST(convolution_f32_fw_gpu, local_multi_input_features) { + // Filter : 1x3x3x3x2x2 - local sizes + // Stride : 1x1 + // Input : 3x4x4 + // Output : 3x3 + // + // Input: + // 0 0 0 0 + // 0 0 0 0 + // 0 0 0 0 + // 0 0 0 0 + 
// + // 1 1 1 1 + // 1 1 1 1 + // 1 1 1 1 + // 1 1 1 1 + // + // 2 2 2 2 + // 2 2 2 2 + // 2 2 2 2 + // 2 2 2 2 + // + // + // Filter: + // 0 0 1 1 2 2 + // 0 0 1 1 2 2 + // + // 3 3 4 4 5 5 + // 3 3 4 4 5 5 + // + // 6 6 7 7 8 8 + // 6 6 7 7 8 8 + // + // 0 0 1 1 2 2 + // 0 0 1 1 2 2 + // + // 3 3 4 4 5 5 + // 3 3 4 4 5 5 + // + // 6 6 7 7 8 8 + // 6 6 7 7 8 8 + // + // 0 0 1 1 2 2 + // 0 0 1 1 2 2 + // + // 3 3 4 4 5 5 + // 3 3 4 4 5 5 + // + // 6 6 7 7 8 8 + // 6 6 7 7 8 8 + // + // Output: + // 0 4 8 + // 18 24 30 + // 48 56 64 + // + + const auto& engine = get_test_engine(); + tensor local_size = tensor(1,3,2,2,3,3); + auto input_f = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 3, 4, 4 } }); + auto weights_f = memory::allocate(engine, { data_types::f32, format::bf_lyx_yx, local_size }); + cldnn::memory biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + + std::vector weights_values_f = { + 0.0, 0.0, 0.0, 0.0, + 1.0, 1.0, 1.0, 1.0, + 2.0, 2.0, 2.0, 2.0, + + 3.0, 3.0, 3.0, 3.0, + 4.0, 4.0, 4.0, 4.0, + 5.0, 5.0, 5.0, 5.0, + + 6.0, 6.0, 6.0, 6.0, + 7.0, 7.0, 7.0, 7.0, + 8.0, 8.0, 8.0, 8.0, + + 0.0, 0.0, 0.0, 0.0, + 1.0, 1.0, 1.0, 1.0, + 2.0, 2.0, 2.0, 2.0, + + 3.0, 3.0, 3.0, 3.0, + 4.0, 4.0, 4.0, 4.0, + 5.0, 5.0, 5.0, 5.0, + 6.0, 6.0, 6.0, 6.0, + 7.0, 7.0, 7.0, 7.0, + 8.0, 8.0, 8.0, 8.0, + + 0.0, 0.0, 0.0, 0.0, + 1.0, 1.0, 1.0, 1.0, + 2.0, 2.0, 2.0, 2.0, + + 3.0, 3.0, 3.0, 3.0, + 4.0, 4.0, 4.0, 4.0, + 5.0, 5.0, 5.0, 5.0, + + 6.0, 6.0, 6.0, 6.0, + 7.0, 7.0, 7.0, 7.0, + 8.0, 8.0, 8.0, 8.0, + }; + set_values(input_f, { + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0 }); + set_values(weights_f, weights_values_f); + set_values(biases, { 0.0f }); + std::vector output_vec = + { + 60.0f, 72.0f, 84.0f, + 24.0f, 36.0f, 48.0f, + 24.0f, 36.0f, 48.0f + }; + + topology topology_f( + input_layout("input_f", input_f.get_layout()), + data("weights_f", weights_f), + data("biases", biases), + convolution("conv_f", "input_f", { "weights_f" }, { "biases" }, { 0, 0, 1, 1 })); + + build_options opts; + opts.set_option(build_option::optimize_data(true)); + network network_f(engine, topology_f, opts); + network_f.set_input_data("input_f", input_f); + + auto outputs_f = network_f.execute(); + EXPECT_EQ(outputs_f.begin()->first, "conv_f"); + + auto output_memory_f = outputs_f.at("conv_f").get_memory(); + auto output_ptr_f = output_memory_f.pointer(); + unsigned int cntr = 0; + for (auto fl : output_ptr_f) + EXPECT_FLOAT_EQ(fl, output_vec[cntr++]); } + TEST(convolution_gpu, basic_yxfb_4_4_yxfb_2_2_b16_if2_of16_st2_2_p0_sp1_fp16) { #define USE_OLD_WEIGHTS_FORMAT 0 - engine engine; + const auto& engine = get_test_engine(); if (!engine.get_info().supports_fp16) { @@ -3053,12 +4204,228 @@ TEST(convolution_gpu, basic_yxfb_4_4_yxfb_2_2_b16_if2_of16_st2_2_p0_sp1_fp16) #undef USE_OLD_WEIGHTS_FORMAT } +using TestParamType_convolution_gpu = ::testing::tuple; // 4 - With bias + +struct convolution_gpu : public ::testing::TestWithParam +{ + static std::string + PrintToStringParamName(testing::TestParamInfo param_info) + { + // construct a readable name + return std::to_string(testing::get<0>(param_info.param)) + + 'x' + std::to_string(testing::get<0>(param_info.param)) + + "_f" + std::to_string(testing::get<1>(param_info.param)) + + "_stride" + 
std::to_string(testing::get<2>(param_info.param)) + + "_pad" + std::to_string(testing::get<3>(param_info.param)) + + (testing::get<4>(param_info.param) ? "_bias" : ""); + } +}; + +TEST_P(convolution_gpu, b_fs_yx_fsv4) +{ + const int in_B = 2; + const int in_X = 56; + const int in_Y = 56; + const int _OuD = 32; + const int W_B = _OuD; + + // Kernel sizes + int W_X = testing::get<0>(GetParam()); + int W_Y = W_X; + + // Convolution offset + int offSet = -(W_X / 2); + + // Features + int in_F = testing::get<1>(GetParam()); + int W_F = in_F; + + // Stride + int stride = testing::get<2>(GetParam()); + + // Output padding + int output_padding = testing::get<3>(GetParam()); + + // Biases + bool with_bias = testing::get<4>(GetParam()); + + engine engine; + + // Input data init + std::vector<char> Data(in_B * in_F * in_X * in_Y); + std::iota(Data.begin(), Data.end(), 0); + auto input = memory::allocate(engine, {data_types::i8, format::bfyx, {in_B, in_F, in_X, in_Y}}); + set_values(input, std::move(Data)); + + // Create a topology + topology topology(input_layout("input", input.get_layout())); + + // Reorder + topology.add(reorder("reorder_in", + "input", + layout(data_types::i8, format::b_fs_yx_fsv4, {in_B, in_F, in_X, in_Y}))); + + // Weights init + std::vector<char> Weights(W_B * W_F * W_X * W_Y); + std::iota(Weights.begin(), Weights.end(), 0); + auto weights_gold = + memory::allocate(engine, {data_types::i8, format::bfyx, {W_B, W_F, W_X, W_Y}}); + auto weights_imad = + memory::allocate(engine, {data_types::i8, format::bfyx, {W_B, W_F, W_X, W_Y}}); + set_values(weights_gold, Weights); + set_values(weights_imad, std::move(Weights)); + topology.add(data("weights_gold", weights_gold), data("weights_imad", weights_imad)); + + if (with_bias) + { + // Bias, Calibration, Quantization + std::vector<float> vB(_OuD), vC(_OuD), vQ(_OuD); + float x = 0.1f; + std::generate(vB.begin(), vB.end(), [x]() mutable { + x += 0.01f; + if (x >= 0.9f) + x = 0.1f; + return x; + }); + x = 0.2f; + std::generate(vC.begin(), vC.end(), [x]() mutable { + x += 0.01f; + if (x >= 0.9f) + x = 0.2f; + return x; + }); + x = 0.3f; + std::generate(vQ.begin(), vQ.end(), [x]() mutable { + x += 0.01f; + if (x >= 0.9f) + x = 0.3f; + return x; + }); + auto bias_gold = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, _OuD, 1}}); + auto bias_imad = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, _OuD, 1}}); + auto callib_gold = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, _OuD, 1}}); + auto callib_imad = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, _OuD, 1}}); + auto quant_gold = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, _OuD, 1}}); + auto quant_imad = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, _OuD, 1}}); + set_values(bias_gold, vB); + set_values(bias_imad, std::move(vB)); + set_values(callib_gold, vC); + set_values(callib_imad, std::move(vC)); + set_values(quant_gold, vQ); + set_values(quant_imad, std::move(vQ)); + topology.add(data("bias_gold", bias_gold), + data("callib_gold", callib_gold), + data("quant_gold", quant_gold)); + topology.add(data("bias_imad", bias_imad), + data("callib_imad", callib_imad), + data("quant_imad", quant_imad)); + + // Convolutions + convolution conv_gold("conv_gold", + "input", + {"weights_gold"}, + {"bias_gold"}, + {"quant_gold"}, + {"callib_gold"}, + 1.0f, + {1, 1, stride, stride}, + {0, 0, offSet, offSet}); + convolution conv_imad("conv_imad", + "reorder_in", + {"weights_imad"}, + {"bias_imad"}, + {"quant_imad"}, +
{"callib_imad"}, + 1.0f, + {1, 1, stride, stride}, + {0, 0, offSet, offSet}); + conv_gold.output_padding = padding({0, 0, output_padding, output_padding}, 0.0f); + conv_imad.output_padding = padding({0, 0, output_padding, output_padding}, 0.0f); + topology.add(conv_gold, conv_imad); + } + else + { + // Convolutions + convolution conv_gold( + "conv_gold", "input", {"weights_gold"}, {1, 1, stride, stride}, {0, 0, offSet, offSet}); + convolution conv_imad( + "conv_imad", "reorder_in", {"weights_imad"}, {1, 1, stride, stride}, {0, 0, offSet, offSet}); + conv_gold.output_padding = padding({0, 0, output_padding, output_padding}, 0.0f); + conv_imad.output_padding = padding({0, 0, output_padding, output_padding}, 0.0f); + topology.add(conv_gold, conv_imad); + } + + // Reorder + topology.add(reorder("reorder_out", + "conv_imad", + layout(data_types::i8, + format::bfyx, + {in_B, W_B, (in_X + stride - 1) / stride, (in_Y + stride - 1) / stride}, + padding({0, 0, output_padding, output_padding}, 0.0f)))); + + // Network build + build_options build_opt; + build_opt.set_option(build_option::optimize_data(true)); + network network(engine, topology, build_opt); + + // Network execuiton + network.set_input_data("input", input); + auto outputs = network.execute(); + + auto out_gold = outputs.find("conv_gold"); + auto out_test = outputs.find("reorder_out"); + ASSERT_NE(out_gold, outputs.end()); + ASSERT_NE(out_test, outputs.end()); + + auto gold_ptr = out_gold->second.get_memory().pointer(); + auto test_ptr = out_test->second.get_memory().pointer(); + + ASSERT_EQ(gold_ptr.size(), test_ptr.size()); + for (size_t i = 0; i < gold_ptr.size(); i++) + { + ASSERT_EQ(gold_ptr[i], test_ptr[i]); + } +} + +// Select particular test cases +INSTANTIATE_TEST_CASE_P(convolution_gpu_imad, + convolution_gpu, + ::testing::Values( + // Filter size, Input features, Stride, Output padding, With bias + TestParamType_convolution_gpu(1, 32, 1, 0, false), + TestParamType_convolution_gpu(3, 32, 1, 0, false), + TestParamType_convolution_gpu(7, 3, 1, 0, false), + TestParamType_convolution_gpu(1, 32, 1, 0, true), + TestParamType_convolution_gpu(3, 32, 1, 0, true), + TestParamType_convolution_gpu(7, 3, 1, 0, true), + TestParamType_convolution_gpu(1, 32, 1, 1, false), + TestParamType_convolution_gpu(3, 32, 1, 1, false), + TestParamType_convolution_gpu(7, 3, 1, 1, false), + TestParamType_convolution_gpu(1, 32, 2, 0, false), + TestParamType_convolution_gpu(3, 32, 2, 0, false), + TestParamType_convolution_gpu(7, 3, 2, 0, false)), + convolution_gpu::PrintToStringParamName); +//// or test all combinations +//INSTANTIATE_TEST_CASE_P(convolution_gpu_imad, +// convolution_gpu, +// ::testing::Combine(::testing::Values(1, 3, 7), // Filter size +// ::testing::Values(3, 32), // Input features +// ::testing::Values(1, 2), // Stride +// ::testing::Values(0, 1), // Output padding +// ::testing::Values(false, true) // With bias +// ), +// convolution_gpu::PrintToStringParamName); + class convolution_test : public tests::generic_test { public: - static void TearDownTestCase() + static void TearDownTestCase() { for (auto generic_params : all_generic_params) { @@ -3073,9 +4440,9 @@ public: static std::vector generate_specific_test_params() { - // TODO: check split + // TODO: check split - // TODO: check convolution without bias + // TODO: check convolution without bias const std::vector& weights = { "input1" }; const std::vector& bias = { "input2" }; @@ -3120,7 +4487,9 @@ public: std::vector input_tensor_size = { tensor(1, 5, 59, 72), tensor(8, 3, 63, 56), 
tensor(16, 2, 50, 50), tensor(32, 1, 44, 62) }; - for (cldnn::data_types data_type : test_data_types()) + auto data_types = test_data_types(); + + for (cldnn::data_types data_type : data_types) { for (cldnn::format input_format : input_formats) { @@ -3232,7 +4601,7 @@ public: const cldnn::convolution* convolution = (cldnn::convolution*)layer_params; data_types dt = inputs[0].get_layout().data_type; - + tensor input_size = inputs[0].get_layout().size; tensor dilation = convolution->dilation; tensor stride = convolution->stride; @@ -3261,7 +4630,7 @@ public: // Initialized output with zeros. std::fill(output_mem.begin(), output_mem.end(), static_cast(0)); - + // Add the bias for (int b = 0; b < input_size.batch[0]; b++) { @@ -3377,7 +4746,7 @@ TEST_P(convolution_test, CONVOLUTION) run_single_test(); } -INSTANTIATE_TEST_CASE_P(DISABLED_CONVOLUTION, - convolution_test, +INSTANTIATE_TEST_CASE_P(DISABLED_CONVOLUTION, + convolution_test, ::testing::ValuesIn(convolution_test::generate_all_test_params()), tests::generic_test::custom_param_name_functor()); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_grad_input_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_grad_input_gpu_test.cpp index a3cbc0a..6f8b9d4 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_grad_input_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_grad_input_gpu_test.cpp @@ -48,7 +48,7 @@ TEST(convolution_grad_input_f32_fw_gpu, basic_wsiz2x2_in2x2x1x2_bfyx_stride2_pad // -4 3.5 -0.5 21 // 12 -18 4 -9 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 2, 2 } }); auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); @@ -103,7 +103,7 @@ TEST(convolution_grad_input_f32_fw_gpu, basic_wsiz2x2_in2x2x1x2_bfyx_stride2_pad // -4 3.5 -0.5 21 // 12 -18 4 -9 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 2, 2 } }); auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); @@ -139,7 +139,7 @@ TEST(convolution_grad_input_f32_fw_gpu, basic_wsiz2x2_in2x2x1x2_bfyx_stride2_pad } } -TEST(convolution_grad_input_f32_fw_gpu, basic_wsiz2x2_in2x2x1x2_bfyx_stride2_fusion) { +TEST(convolution_grad_input_f32_fw_gpu, DISABLED_basic_wsiz2x2_in2x2x1x2_bfyx_stride2_fusion) { // Filter : 2x2 // Input : 2x2x1x2 // Output : 2x2x1x2 @@ -157,7 +157,7 @@ TEST(convolution_grad_input_f32_fw_gpu, basic_wsiz2x2_in2x2x1x2_bfyx_stride2_fus // -4 3.5 -0.5 21 // 12 -18 4 -9 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 2, 2 } }); auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); @@ -198,8 +198,8 @@ TEST(convolution_grad_input_f32_fw_gpu, basic_wsiz2x2_in2x2x1x2_bfyx_stride2_fus auto output_ptr = output_prim.pointer(); std::vector expected_output_vec = { - -3.f, 5.5f, 15.f, -14.f, - 4.5f, 27.f, 11.f, 0.f + -3.f, 5.5f, 14.f, -15.f, + 4.5f, 27.f, 10.f, -1.f }; for (unsigned int i = 0; i < expected_output_vec.size(); i++) diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_grad_weights_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_grad_weights_gpu_test.cpp index 0857fba..1a7cd2a 100644 --- 
a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_grad_weights_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_grad_weights_gpu_test.cpp @@ -33,6 +33,22 @@ using namespace cldnn; using namespace tests; +void validate_output(std::vector<float> expected_weights_vec, std::map<primitive_id, network_output> outputs) +{ + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "conv_grad_weights"); + + auto output_prim = outputs.begin()->second.get_memory(); + auto output_ptr = output_prim.pointer<float>(); + + for (unsigned int i = 0; i < expected_weights_vec.size(); i++) + { + float x = float_round(expected_weights_vec[i]); + float y = float_round(output_ptr[i]); + EXPECT_FLOAT_EQ(x, y) << "on weights verification" << random_seed << std::endl; + } +} + TEST(convolution_grad_weights_f32_fw_gpu, basic_wsiz2x2_in2x2x1x2_bfyx_stride2_pad1) { // Filter : 2x2 // Input grad : 1x2x2x2 @@ -47,7 +63,7 @@ TEST(convolution_grad_weights_f32_fw_gpu, basic_wsiz2x2_in2x2x1x2_bfyx_stride2_p // 8 0.5 // 6 9 - engine engine; + const auto& engine = get_test_engine(); float lr = 0.00001f; auto input_grad = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 2, 2 } }); auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } }); @@ -123,7 +139,7 @@ TEST(convolution_grad_weights_f32_fw_gpu, basic_wsiz2x2_in8x1x2x2_bfyx_stride2_p // 8 0.5 // 6 9 - engine engine; + const auto& engine = get_test_engine(); float lr = 0.00001f; auto input_grad = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } }); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 2, 2 } }); @@ -195,7 +211,7 @@ TEST(convolution_grad_weights_f32_fw_gpu, basic_wsiz2x2_in2x2x1x2_bfyx_stride2_p // 8 0.5 // 6 9 - engine engine; + const auto& engine = get_test_engine(); float lr = 0.00001f; auto input_grad = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 2, 2 } }); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); @@ -257,7 +273,7 @@ TEST(convolution_grad_weights_f32_fw_gpu, basic_wsiz2x2_in2x2x1x2_bfyx_stride2_p // Bias: // 0 - engine engine; + const auto& engine = get_test_engine(); float lr = 0.001f; auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); @@ -275,10 +291,12 @@ TEST(convolution_grad_weights_f32_fw_gpu, basic_wsiz2x2_in2x2x1x2_bfyx_stride2_p mutable_data("biases", biases), convolution("conv", "input_reordered", { "weights" }, { "biases" }, { 1, 1, 1, 1 }, { 0, 0, -1, -1 }), convolution_grad_input("conv_grad_input", "conv", { "weights" }, { 1, 1, 1, 1 }, { 0, 0, -1, -1 }), - convolution_grad_weights("conv_grad_weights", "conv", "input_reordered", { "weights" }, { "biases" }, { 1, 1, 1, 1 }, { 0, 0, -1, -1 }) + convolution_grad_weights("conv_grad_weights", "conv", "input_reordered", { "weights" }, { "biases" }, { 1, 1, 1, 1 }, + { 0, 0, -1, -1 }, { 1,1,1,1 }, "conv_grad_input") ); - - network network(engine, topology); + build_options opt; + opt.set_option(build_option::outputs({ "conv_grad_input", "conv_grad_weights" })); + network network(engine, topology, opt); network.set_input_data("input", input); network.set_learning_rate(lr); @@ -329,7 +347,7 @@ TEST(convolution_grad_weights_f32_fw_gpu, basic_wsiz2x2_in2x2x1x2_bfyx_stride2_p // 8 0.5 1 2 // 6 9 3 4 - engine engine; + const auto& engine = get_test_engine(); float lr = 0.00001f; auto input_grad = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 2, 2 } }); auto input =
memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 2, 2 } }); @@ -424,7 +442,7 @@ TEST(convolution_grad_weights_f32_fw_gpu, basic_wsiz1x1_in1x2x5x5_bfyx_stride2_p // 5 6 7 8 // 9 10 11 11 - engine engine; + const auto& engine = get_test_engine(); float lr = 0.00001f; auto input_grad = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 5, 5 } }); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 4 } }); @@ -515,7 +533,7 @@ TEST(convolution_grad_weights_f32_fw_gpu, basic_wsiz2x2_in32x1x2x2_yxfb_stride1) // y2: x1: 0.5 0.6 0.7 0.9 1 1.1 0.7 0.9 0.1 1.9 0.6 0.5 0.4 0.1 0.1 1.7 0.5 0.4 0.5 0.6 0.7 0.8 0.8 1.7 1.8 1.2 2.1 0.5 0.2 0.9 1.5 1.6 // y2: x2: 0.5 0.6 0.7 0.9 1 1.1 0.7 0.9 0.1 1.9 0.1 1.7 0.5 0.4 0.4 0.1 0.1 1.7 0.5 0.4 0.5 0.6 1.2 2.1 0.5 0.2 0.9 0.4 0.1 1.2 1.7 1.8 - engine engine; + const auto& engine = get_test_engine(); float lr = 0.00001f; auto input_grad = memory::allocate(engine, { data_types::f32, format::yxfb,{ 32, 1, 2, 2 } }); auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 32, 1, 3, 3 } }); @@ -597,7 +615,7 @@ TEST(convolution_grad_weights_f32_fw_gpu, basic_wsiz3x3_in2x1x3x3_bfyx_stride1_p // 0.5 0.6 0.7 0.9 1 1.1 0.7 0.9 0.1 // 0.7 0.8 0.8 1.7 1.8 1.2 2.1 0.5 0.2 - engine engine; + const auto& engine = get_test_engine(); float lr = 0.00001f; auto input_grad = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 3 } }); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 3, 3 } }); @@ -679,7 +697,7 @@ TEST(convolution_grad_weights_f32_fw_gpu, basic_wsiz3x3_in2x1x3x3_bfyx_stride1_p // 0.5 0.6 0.7 0.9 1 1.1 0.7 0.9 0.1 // 0.7 0.8 0.8 1.7 1.8 1.2 2.1 0.5 0.2 - engine engine; + const auto& engine = get_test_engine(); float lr = 0.00001f; auto input_grad = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 3 } }); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 3, 3 } }); @@ -781,7 +799,7 @@ TEST(convolution_grad_weights_f32_fw_gpu, basic_wsiz7x7_in2x1x7x7_bfyx_stride1_p // b0:f0: 0.7 0.8 0.8 0.7 0.8 0.2 0.1 b0:f1: 0.4 0.6 0.1 0.2 0.1 0.1 0.7 // b0:f0: 0.5 0.6 0.7 0.9 0. 0.1 0.7 b0:f1: 0.5 0.3 0.7 0.5 0.4 0.1 0.7 - engine engine; + const auto& engine = get_test_engine(); float lr = 0.00001f; auto input_grad = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 7, 7 } }); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 7, 7 } }); @@ -927,7 +945,7 @@ TEST(convolution_grad_weights_f32_fw_gpu, basic_wsiz7x7_in2x1x7x7_bfyx_stride1_p // b0:f0: 0.7 0.8 0.8 0.7 0.8 0.2 0.1 b0:f1: 0.4 0.6 0.1 0.2 0.1 0.1 0.7 // b0:f0: 0.5 0.6 0.7 0.9 0. 
0.1 0.7 b0:f1: 0.5 0.3 0.7 0.5 0.4 0.1 0.7 - engine engine; + const auto& engine = get_test_engine(); float lr = 0.00001f; auto input_grad = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 7, 7 } }); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 7, 7 } }); @@ -1044,3 +1062,52 @@ TEST(convolution_grad_weights_f32_fw_gpu, basic_wsiz7x7_in2x1x7x7_bfyx_stride1_p EXPECT_FLOAT_EQ(x, -y) << "on biases verification" << random_seed << std::endl; } } + +TEST(convolution_grad_weights_f32_fw_gpu, ngraph_2d_1item_2iterations) { + // Filter : 2x1x2x2 + // Input grad : 1x2x4x2 + // Input : 1x1x5x3 + // Stride : 1x1 + + const auto& engine = get_test_engine(); + auto input_grad = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 4, 2 } }); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 5, 3 } }); + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 2, 2 } }); + + + topology topology( + input_layout("input_grad", input_grad.get_layout()), + data("input", input), + mutable_data("weights", weights), + convolution_grad_weights("conv_grad_weights", "input_grad", "input", { "weights" }, { 1,1,1,1 }, { 0,0,0,0 }, { 1,1,1,1 }, true) + ); + + build_options bo; + bo.set_option(build_option::optimize_data(true)); + network network(engine, topology, bo); + + + // set values for first iteration + set_values(input, + { 0.671875f, 0.546875f, -0.5625f, -0.359375f, -0.09375f, 0.546875f, -0.546875f, 0.890625f, 0.828125f, -0.546875f, 1.f, -0.078125f, -0.890625f, 0.40625f, -0.359375f }); + set_values(input_grad, + { 1.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f }); + network.set_input_data("input_grad", input_grad); + std::vector expected_weights_vec = + { 0.671875f, 0.546875f, 0.546875f, -0.546875f, + 0.f, 0.f, 0.f, 0.f }; + auto outputs = network.execute(); + validate_output(expected_weights_vec, outputs); + + // set values for second iteration + set_values(input_grad, + { 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 1.f }); + network.set_input_data("input_grad", input_grad); + expected_weights_vec = + { 0.f, 0.f, 0.f, 0.f, + 0.828125f, -0.546875f, 0.40625f, -0.359375f }; + outputs = network.execute(); + validate_output(expected_weights_vec, outputs); +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/crop_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/crop_gpu_test.cpp index 8ab62ad..fb8b6e3 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/crop_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/crop_gpu_test.cpp @@ -45,7 +45,7 @@ TEST(crop_gpu, basic_in2x3x2x2_crop_all) { // Input : 2x3x4x5 // Output : 1x2x2x3 - engine engine; + const auto& engine = get_test_engine(); auto batch_num = 2; auto feature_num = 3; @@ -88,12 +88,60 @@ TEST(crop_gpu, basic_in2x3x2x2_crop_all) { } } +TEST(crop_gpu, basic_int_in2x3x2x2_crop_all) { + // Reference : 1x2x2x2 + // Input : 2x3x4x5 + // Output : 1x2x2x3 + + const auto& engine = get_test_engine(); + + auto batch_num = 2; + auto feature_num = 3; + auto x_size = 4; + auto y_size = 5; + + auto crop_batch_num = batch_num - 1; + auto crop_feature_num = feature_num - 1; + auto crop_x_size = x_size - 2; + auto crop_y_size = y_size - 2; + + auto input = memory::allocate(engine, { data_types::i32, format::yxfb,{ batch_num, feature_num, x_size, y_size } }); + + topology topology; + 
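An aside on the index arithmetic the new integer crop test relies on (the verification loop a few lines below uses exactly this expression): in format::yxfb the batch index varies fastest, then feature, then x, then y. A standalone sketch of that linearization, with a helper name of our choosing rather than the patch's:

// Linear offset of element (b, f, x, y) in a yxfb-laid-out buffer:
// batch varies fastest, then feature, then x, then y.
inline int yxfb_linear_index(int b, int f, int x, int y,
                             int batch_num, int feature_num, int x_size) {
    return b + batch_num * (f + feature_num * (x + x_size * y));
}

The bfyx variants later in this file invert the nesting (x fastest, then y, then feature, then batch), which is why their linear_id formula reads x + x_size * (y + y_size * (f + feature_num * b)).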
topology.add(input_layout("input", input.get_layout())); + topology.add(crop("crop", "input", { crop_batch_num, crop_feature_num, crop_x_size, crop_y_size }, { 0, 0, 0, 0 })); + + std::vector input_vec = generate_random_input(batch_num, feature_num, y_size, x_size, -10, 10); + set_values(input, input_vec); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("crop").get_memory(); + auto output_ptr = output.pointer(); + + for (int b = 0; b < crop_batch_num; ++b) { //B + for (int f = 0; f < crop_feature_num; ++f) { //F + for (int y = 0; y < crop_y_size; ++y) { //Y + for (int x = 0; x < crop_x_size; ++x) { //X + int linear_id = b + batch_num * (f + feature_num * (x + x_size * y)); + int output_linear_id = b + crop_batch_num * (f + crop_feature_num * (x + crop_x_size * y)); + EXPECT_EQ(output_ptr[output_linear_id], input_vec[linear_id]); + } + } + } + } +} + TEST(crop_gpu, basic_in2x3x2x2_crop_all_bfyx) { // Reference : 3x1x2x2 // Input : 6x2x4x3 // Output : 3x1x2x2 - engine engine; + const auto& engine = get_test_engine(); auto batch_num = 6; auto feature_num = 2; @@ -137,6 +185,149 @@ TEST(crop_gpu, basic_in2x3x2x2_crop_all_bfyx) { } } +TEST(crop_gpu, basic_int_in2x3x2x2_crop_all_bfyx) { + // Reference : 3x1x2x2 + // Input : 6x2x4x3 + // Output : 3x1x2x2 + + const auto& engine = get_test_engine(); + + auto batch_num = 6; + auto feature_num = 2; + auto x_size = 4; + auto y_size = 3; + + auto crop_batch_num = batch_num - 3; + auto crop_feature_num = feature_num - 1; + auto crop_x_size = x_size - 2; + auto crop_y_size = y_size - 1; + + auto input = memory::allocate(engine, { data_types::i32,format::bfyx,{ batch_num, feature_num, x_size, y_size } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(crop("crop", "input", { crop_batch_num, crop_feature_num, crop_x_size, crop_y_size }, { 0, 0, 0, 0 })); + + std::vector input_vec = generate_random_input(batch_num, feature_num, y_size, x_size, -10, 10); + set_values(input, input_vec); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("crop").get_memory(); + auto output_ptr = output.pointer(); + std::vector a; + for (int b = 0; b < crop_batch_num; ++b) { //B + for (int f = 0; f < crop_feature_num; ++f) { //F + for (int y = 0; y < crop_y_size; ++y) { //Y + for (int x = 0; x < crop_x_size; ++x) { //X + int linear_id = x + x_size * (y + y_size * (f + feature_num * b)); + int output_linear_id = x + crop_x_size * (y + crop_y_size * (f + crop_feature_num * b)); + a.push_back(output_ptr[output_linear_id]); + EXPECT_EQ(output_ptr[output_linear_id], input_vec[linear_id]); + } + } + } + } +} + +TEST(crop_gpu, basic_in2x3x2x2_crop_all_fyxb) { + // Reference : 3x1x2x2 + // Input : 6x2x4x3 + // Output : 3x1x2x2 + + const auto& engine = get_test_engine(); + + auto batch_num = 6; + auto feature_num = 2; + auto x_size = 4; + auto y_size = 3; + + auto crop_batch_num = batch_num - 3; + auto crop_feature_num = feature_num - 1; + auto crop_x_size = x_size - 2; + auto crop_y_size = y_size - 1; + + auto input = memory::allocate(engine, { data_types::f32,format::fyxb,{ batch_num, feature_num, x_size, y_size } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(crop("crop", "input", { crop_batch_num, crop_feature_num, crop_x_size, crop_y_size }, {0, 0, 0, 0} )); + + std::vector 
input_vec = generate_random_input(batch_num, feature_num, y_size, x_size, -10, 10); + set_values(input, input_vec); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("crop").get_memory(); + auto output_ptr = output.pointer(); + for (int b = 0; b < crop_batch_num; ++b) { //B + for (int f = 0; f < crop_feature_num; ++f) { //F + for (int y = 0; y < crop_y_size; ++y) { //Y + for (int x = 0; x < crop_x_size; ++x) { //X + int linear_id = b + batch_num * (x + x_size * (y + y_size * f)); + int output_linear_id = b + crop_batch_num * (x + crop_x_size * (y + crop_y_size * f)); + EXPECT_EQ(output_ptr[output_linear_id], input_vec[linear_id]); + } + } + } + } +} + +TEST(crop_gpu, basic_int_in2x3x2x2_crop_all_fyxb) { + // Reference : 3x1x2x2 + // Input : 6x2x4x3 + // Output : 3x1x2x2 + + const auto& engine = get_test_engine(); + + auto batch_num = 6; + auto feature_num = 2; + auto x_size = 4; + auto y_size = 3; + + auto crop_batch_num = batch_num - 3; + auto crop_feature_num = feature_num - 1; + auto crop_x_size = x_size - 2; + auto crop_y_size = y_size - 1; + + auto input = memory::allocate(engine, { data_types::i32,format::fyxb,{ batch_num, feature_num, x_size, y_size } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(crop("crop", "input", { crop_batch_num, crop_feature_num, crop_x_size, crop_y_size }, { 0, 0, 0, 0 })); + + std::vector input_vec = generate_random_input(batch_num, feature_num, y_size, x_size, -10, 10); + set_values(input, input_vec); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("crop").get_memory(); + auto output_ptr = output.pointer(); + for (int b = 0; b < crop_batch_num; ++b) { //B + for (int f = 0; f < crop_feature_num; ++f) { //F + for (int y = 0; y < crop_y_size; ++y) { //Y + for (int x = 0; x < crop_x_size; ++x) { //X + int linear_id = b + batch_num * (x + x_size * (y + y_size * f)); + int output_linear_id = b + crop_batch_num * (x + crop_x_size * (y + crop_y_size * f)); + EXPECT_EQ(output_ptr[output_linear_id], input_vec[linear_id]); + } + } + } + } +} + TEST(crop_gpu, basic_in2x3x2x2_crop_offsets) { // Reference : 1x2x2x1 // Offsets : 1x0x1x1 @@ -145,11 +336,11 @@ TEST(crop_gpu, basic_in2x3x2x2_crop_offsets) { // Input: // f0: b0: 1 2 -10 b1: 0 0 -11 - // f0: b0: 3 4 -14 b1: 0.5 -0.5 -15 - // f1: b0: 5 6 -12 b1: 1.5 5.2 -13 + // f0: b0: 3 4 -14 b1: 0.5 -0.5 -15 + // f1: b0: 5 6 -12 b1: 1.5 5.2 -13 // f1: b0: 7 8 -16 b1: 12 8 -17 - engine engine; + const auto& engine = get_test_engine(); auto batch_num = 2; auto feature_num = 2; @@ -202,11 +393,76 @@ TEST(crop_gpu, basic_in2x3x2x2_crop_offsets) { } } +TEST(crop_gpu, basic_int_in2x3x2x2_crop_offsets) { + // Reference : 1x2x2x1 + // Offsets : 1x0x1x1 + // Input : 2x2x3x2 + // Output : 1x2x2x1 + + // Input: + // f0: b0: 1 2 -10 b1: 0 0 -11 + // f0: b0: 3 4 -14 b1: 50 -5 -15 + // f1: b0: 5 6 -12 b1: 15 52 -13 + // f1: b0: 7 8 -16 b1: 12 8 -17 + + const auto& engine = get_test_engine(); + + auto batch_num = 2; + auto feature_num = 2; + auto x_size = 3; + auto y_size = 2; + + auto crop_batch_num = batch_num - 1; + auto crop_feature_num = feature_num; + auto crop_x_size = x_size - 1; + auto crop_y_size = y_size - 1; + + auto batch_offset = 1; + auto feature_offset = 0; + auto x_offset = 1; + auto y_offset = 1; + + auto input = memory::allocate(engine, { data_types::i32, 
format::yxfb,{ tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(crop("crop", "input", tensor(batch(crop_batch_num), spatial(crop_x_size, crop_y_size), feature(crop_feature_num)), { tensor(feature(0)) })); + + std::vector input_vec = { 1, 0, 5, 15, + 2, 0, 6, 52, + -10, -11, -12, -13, + 3, 50, 7, 12, + 4, -5, 8, 8, + -14, -15, -16, -17 }; + set_values(input, input_vec); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("crop").get_memory(); + auto output_ptr = output.pointer(); + + for (int b = 0; b < crop_batch_num; ++b) { //B + for (int f = 0; f < crop_feature_num; ++f) { //F + for (int y = 0; y < crop_y_size; ++y) { //Y + for (int x = 0; x < crop_x_size; ++x) { //X + int linear_id = (b + batch_offset) + batch_num * ((f + feature_offset) + feature_num * ((x + x_offset) + x_size * (y + y_offset))); + int output_linear_id = b + crop_batch_num * (f + crop_feature_num * (x + crop_x_size * y)); + EXPECT_EQ(output_ptr[output_linear_id], input_vec[linear_id]); + } + } + } + } +} + TEST(crop_gpu, basic_in1x4x1x1_split) { // Tests split with crop implementation // _CROP_1(1x3x1x1,offset(0x0x0x0)) // | - // INPUT(1x4x1x1) + // INPUT(1x4x1x1) // |_ // CROP_2(1x1x1x1,offset(0x3x0x0)) // @@ -231,7 +487,7 @@ TEST(crop_gpu, basic_in1x4x1x1_split) { // Out2: // f0: 4.0 - engine engine; + const auto& engine = get_test_engine(); auto batch_num = 1; auto feature_num = 4; @@ -278,11 +534,87 @@ TEST(crop_gpu, basic_in1x4x1x1_split) { EXPECT_EQ(output_ptr_2[i], out2[i]); } +TEST(crop_gpu, basic_int_in1x4x1x1_split) { + // Tests split with crop implementation + // _CROP_1(1x3x1x1,offset(0x0x0x0)) + // | + // INPUT(1x4x1x1) + // |_ + // CROP_2(1x1x1x1,offset(0x3x0x0)) + // + // Reference1 : 1x3x1x1 + // Offsets1 : 0x0x0x0 + // Reference2 : 1x1x1x1 + // Offsets2 : 0x3x0x0 + // Input : 1x4x1x1 + // Output1 : 1x3x1x1 + // Output2 : 1x1x1x1 + + // Input: + // f0: -1 + // f1: 2 + // f2: -3 + // f3: 4 + + // Out1: + // f0: -1 + // f1: 2 + // f2: -3 + + // Out2: + // f0: 4 + const auto& engine = get_test_engine(); + + auto batch_num = 1; + auto feature_num = 4; + auto x_size = 1; + auto y_size = 1; + + auto crop_batch_num = 1; + auto crop_feature_num_1 = 3; + auto crop_feature_num_2 = 1; + auto crop_x_size = 1; + auto crop_y_size = 1; + auto feature_offset_1 = 0; + auto feature_offset_2 = 3; + auto input = memory::allocate(engine, { data_types::i32, format::bfyx,{ tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(crop("crop1", "input", tensor(batch(crop_batch_num), spatial(crop_x_size, crop_y_size), feature(crop_feature_num_1)), { tensor(feature(feature_offset_1), spatial(0,0),batch(0)) })); + topology.add(crop("crop2", "input", tensor(batch(crop_batch_num), spatial(crop_x_size, crop_y_size), feature(crop_feature_num_2)), { tensor(feature(feature_offset_2), spatial(0,0),batch(0)) })); + + std::vector input_vec = { -1, 2, -3, 4 }; + std::vector out1 = { -1, 2,-3 }; + std::vector out2 = { 4, }; + set_values(input, input_vec); + build_options bo; + bo.set_option(build_option::optimize_data(true)); + bo.set_option(build_option::outputs(topology.get_primitive_ids())); + + network network(engine, topology, bo); + network.set_input_data("input", input); + auto outputs = 
network.execute(); + + auto output = outputs.at("crop1").get_memory(); + auto output_ptr = output.pointer(); + + for (size_t i = 0; i < out1.size(); i++) + EXPECT_EQ(output_ptr[i], out1[i]); + + std::cout << std::endl; + auto output_2 = outputs.at("crop2").get_memory(); + auto output_ptr_2 = output_2.pointer(); + + for (size_t i = 0; i < out2.size(); i++) + EXPECT_EQ(output_ptr_2[i], out2[i]); +} + TEST(crop_gpu, basic_in1x4x1x1_split_w_relu) { // Tests split with crop implementation // _ CROP_1(1x3x1x1,offset(0x0x0x0)) --> RELU // | - // INPUT(1x4x1x1)--RELU + // INPUT(1x4x1x1)--RELU // |_ // CROP_2(1x1x1x1,offset(0x3x0x0)) --> RELU // diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/custom_gpu_primitive_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/custom_gpu_primitive_test.cpp index a3ad88f..f74a5f9 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/custom_gpu_primitive_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/custom_gpu_primitive_test.cpp @@ -60,7 +60,7 @@ TEST(custom_gpu_primitive_f32, add_basic_in2x2x2x2) { // f1: b0: 15 16.5 b1: 22 16.5 // - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); @@ -153,7 +153,7 @@ void add_basic_in2x2x2x2_with_reorder() // f1: b0: 15 16.5 b1: 22 16.5 // - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); @@ -266,7 +266,7 @@ TEST(custom_gpu_primitive_f32, eltwise_add_basic_in2x2x2x2) { // f1: b0: 15 16.5 b1: 22 16.5 // - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); @@ -359,7 +359,7 @@ TEST(custom_gpu_primitive_f32, add_eltwise_basic_in2x2x2x2) { // f1: b0: 15 16.5 b1: 22 16.5 // - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); @@ -452,7 +452,7 @@ TEST(custom_gpu_primitive_f32, two_kernels_with_same_entry_point_basic_in2x2x2x2 // f1: b0: 15 16.5 b1: 22 16.5 // - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); @@ -523,3 +523,74 @@ TEST(custom_gpu_primitive_f32, two_kernels_with_same_entry_point_basic_in2x2x2x2 EXPECT_TRUE(are_equal(input_ptr[i] + 7, output_ptr[i])); } } + +TEST(custom_gpu_primitive_u8, add_basic_in2x2x2x2) { + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::u8, format::yxfb,{ 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::u8, format::yxfb,{ 2, 2, 2, 2 } }); + + std::string kernel_code = + R"__krnl( + __kernel void add_kernel(const __global uchar* input0, const __global uchar* input1, __global uchar* output) + { + const unsigned idx = get_global_id(0); + output[idx] = input0[idx] + input1[idx]; + } + )__krnl"; + std::string entry_point = "add_kernel"; + std::vector parameters = { { arg_input, 0 },{ arg_input, 1 },{ arg_output, 0 } }; + layout output_layout = { 
data_types::u8, format::yxfb,{ 2, 2, 2, 2 } }; + std::vector gws = { output_layout.count() }; + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(custom_gpu_primitive( + "user_kernel", + { "input", "input2" }, + { kernel_code }, + entry_point, + parameters, + "-cl-mad-enable", + output_layout, + gws)); + + set_values(input, { + 1, 0, 5, 1, + 200, 100, 160, 150, + 130, 0, 175, 12, + 4, 100, 8, 180 + }); + + set_values(input2, { + 0, 2, 0, 2, + 55, 75, 20, 4, + 15, 17, 80, 10, + 2, 60, 0, 20 + }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "user_kernel"); + + auto output = outputs.at("user_kernel").get_memory(); + + unsigned char answers[16] = { + 1, 2, 5, 3, + 255, 175, 180, 154, + 145, 17, 255, 22, + 6, 160, 8, 200 + }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/deconvolution_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/deconvolution_gpu_test.cpp index 2f3ffe5..f546e37 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/deconvolution_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/deconvolution_gpu_test.cpp @@ -53,7 +53,7 @@ TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in2x2x1x1_nopad) { // 18 0.75 7.25 // 23 42.5 15.5 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } }); auto weights = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } }); @@ -115,7 +115,7 @@ TEST(deconvolution_f32_fw_gpu, no_bias_basic_wsiz2x2_in2x2x1x1_nopad) { // 18 0.75 7.25 // 23 42.5 15.5 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 2 } }); auto weights = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 2 } }); @@ -173,7 +173,7 @@ TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in2x2x1x1_nopad_bfyx) { // Filt // 18 0.75 7.25 // 23 42.5 15.5 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } }); auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } }); @@ -233,7 +233,7 @@ TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in2x2x1x1_pad1) { // Output: // 0.75 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } }); auto weights = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } }); @@ -284,7 +284,7 @@ TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in2x2x1x1_stride2_nopad) { // Output: // 0.75 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } }); auto weights = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } }); @@ -349,7 +349,7 @@ TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in2x2x1x1_stride4_pad2) { // 0 0 0 // 6 0 -18 - engine engine; + const auto& engine = get_test_engine(); auto input = 
memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } }); auto weights = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 3, 3 } }); @@ -411,7 +411,7 @@ TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in2x2x1x2_stride2_pad1) { // -3 4.5 0.5 22 // 13 -17 5 -7 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 1, 2, 2 } }); auto weights = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } }); @@ -476,7 +476,7 @@ TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2x2_in2x2x1x1_stride2_pad1) { // f1: 1 8.5 // f1: 17 - 13 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } }); auto weights = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 1, 2, 2 } }); @@ -537,7 +537,7 @@ TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in2x2x1x2_bfyx_stride2_pad1) { // -3 4.5 0.5 22 // 13 -17 5 -7 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 2, 2 } }); auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } }); @@ -599,7 +599,7 @@ TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in2x2x1x2_bfyx_stride2_pad1_input_p // -3 4.5 0.5 22 // 13 -17 5 -7 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 2, 2 } }); auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); @@ -666,7 +666,7 @@ TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2x2_in2x2x1x1_stride2_pad1_input_padd // f1: 1 8.5 // f1: 17 - 13 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 2 } }); auto weights = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 1, 2, 2 } }); @@ -728,7 +728,7 @@ TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in2x2x1x2_bfyx_yxfb_stride2_pad1) { // -3 4.5 0.5 22 // 13 -17 5 -7 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 2, 2 } }); auto weights = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } }); @@ -789,7 +789,7 @@ TEST(deconvolution_f16_fw_gpu, basic_wsiz2x2_in2x2x1x2_bfyx_yxfb_stride2_pad1) { // -3 4.5 0.5 22 // 13 -17 5 -7 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f16, format::bfyx,{ 2, 1, 2, 2 } }); auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); @@ -861,7 +861,7 @@ TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in1x2x2x2_bfyx_stride2_pad1_split2) // -3 4.5 -8 -28 // 13 -17 1 -17 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 2, 2 } }); auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); @@ -906,11 +906,56 @@ TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in1x2x2x2_bfyx_stride2_pad1_split2) } } +TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in1x2x2x2_bfyx_stride2_pad1_group2) { + // data is similar as in basic_wsiz2x2_in1x2x2x2_bfyx_stride2_pad1_split2 + + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 2, 2 } }); + auto weights = memory::allocate(engine, { 
data_types::f32, format::bfyx,{ 2, 1, 2, 2 } }); + auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 1 } }); + + set_values(input, { 8.f, 0.5f, 6.f, 9.f, 1.f, 3.f, 2.f, 4.f }); + set_values(weights, { + -2.f, 2.f, 7.f, -0.5f, + -4.f, 1.f, -9.f, -7.f + }); + set_values(biases, { 1.0f, -1.0f }); + + topology topology( + input_layout("input", input.get_layout()), + data("weights", weights), + data("biases", biases), + deconvolution("deconv", "input", { "weights" }, { "biases" }, 2, { 1, 1, 2, 2 }, { 0, 0, -1, -1 }) + ); + + network network(engine, topology); + network.set_input_data("input", input); + + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "deconv"); + + auto output_prim = outputs.begin()->second.get_memory(); + + auto output_ptr = output_prim.pointer(); + + std::vector expected_output_vec = { + -3.f, 4.5f, 13.f, -17.f, + -8.f, -28.f, 1.f, -17.f + }; + + for (unsigned int i = 0; i < expected_output_vec.size(); i++) + { + EXPECT_FLOAT_EQ(expected_output_vec[i], output_ptr[i]); + } +} + TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in1x2x2x2_bfyx_stride2_pad1_split2_depthwise_sep_opt) { // Test for depthwise separable optimization, there are 16 weights and biases (split 16) // data is similar as in basic_wsiz2x2_in1x2x2x2_bfyx_stride2_pad1_split2 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 16, 2, 2 } }); set_values(input, @@ -989,11 +1034,93 @@ TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in1x2x2x2_bfyx_stride2_pad1_split2_ } } +TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in1x2x2x2_bfyx_stride2_pad1_group16) { + // Test for depthwise separable optimization, there are 16 joined weights and biases (group 16) + // data is similar as in basic_wsiz2x2_in1x2x2x2_bfyx_stride2_pad1_split2_depthwise_sep_opt + + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 16, 2, 2 } }); + set_values(input, + { 8.f, 0.5f, 6.f, 9.f, 1.f, 3.f, 2.f, 4.f, + 8.f, 0.5f, 6.f, 9.f, 1.f, 3.f, 2.f, 4.f, + 8.f, 0.5f, 6.f, 9.f, 1.f, 3.f, 2.f, 4.f, + 8.f, 0.5f, 6.f, 9.f, 1.f, 3.f, 2.f, 4.f, + 8.f, 0.5f, 6.f, 9.f, 1.f, 3.f, 2.f, 4.f, + 8.f, 0.5f, 6.f, 9.f, 1.f, 3.f, 2.f, 4.f, + 8.f, 0.5f, 6.f, 9.f, 1.f, 3.f, 2.f, 4.f, + 8.f, 0.5f, 6.f, 9.f, 1.f, 3.f, 2.f, 4.f + }); + + topology topology(input_layout("input", input.get_layout())); + + std::vector weights_vec; + std::vector bias_vec; + + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 16, 1, 2, 2 } }); + auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 16, 1 } }); + + set_values(weights, + { + -2.f, 2.f, 7.f, -0.5f, + -4.f, 1.f, -9.f, -7.f, + -2.f, 2.f, 7.f, -0.5f, + -4.f, 1.f, -9.f, -7.f, + -2.f, 2.f, 7.f, -0.5f, + -4.f, 1.f, -9.f, -7.f, + -2.f, 2.f, 7.f, -0.5f, + -4.f, 1.f, -9.f, -7.f, + -2.f, 2.f, 7.f, -0.5f, + -4.f, 1.f, -9.f, -7.f, + -2.f, 2.f, 7.f, -0.5f, + -4.f, 1.f, -9.f, -7.f, + -2.f, 2.f, 7.f, -0.5f, + -4.f, 1.f, -9.f, -7.f, + -2.f, 2.f, 7.f, -0.5f, + -4.f, 1.f, -9.f, -7.f + } + ); + set_values(biases, { 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f }); + topology.add( + data("weights", weights), + data("bias", biases) + ); + + topology.add(deconvolution("deconv", "input", { "weights" }, { "bias" }, 16, { 1, 1, 2, 2 }, { 0, 0, -1, -1 })); + + network network(engine, topology); + network.set_input_data("input", input); + + auto 
outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "deconv"); + + auto output_prim = outputs.begin()->second.get_memory(); + + auto output_ptr = output_prim.pointer(); + + std::vector expected_output_vec = { + -3.f, 4.5f, 13.f, -17.f, -8.f, -28.f, 1.f, -17.f, + -3.f, 4.5f, 13.f, -17.f, -8.f, -28.f, 1.f, -17.f, + -3.f, 4.5f, 13.f, -17.f, -8.f, -28.f, 1.f, -17.f, + -3.f, 4.5f, 13.f, -17.f, -8.f, -28.f, 1.f, -17.f, + -3.f, 4.5f, 13.f, -17.f, -8.f, -28.f, 1.f, -17.f, + -3.f, 4.5f, 13.f, -17.f, -8.f, -28.f, 1.f, -17.f, + -3.f, 4.5f, 13.f, -17.f, -8.f, -28.f, 1.f, -17.f, + -3.f, 4.5f, 13.f, -17.f, -8.f, -28.f, 1.f, -17.f, + }; + + for (unsigned int i = 0; i < expected_output_vec.size(); i++) + { + EXPECT_FLOAT_EQ(expected_output_vec[i], output_ptr[i]); + } +} + TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in1x2x2x2_bfyx_stride2_pad1_split2_depthwise_sep_opt_ofm2) { // Test for depthwise separable optimization, there are 16 weights and biases (split 16) // data is similar as in basic_wsiz2x2_in1x2x2x2_bfyx_stride2_pad1_split2 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 16, 2, 2 } }); set_values(input, @@ -1072,6 +1199,96 @@ TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in1x2x2x2_bfyx_stride2_pad1_split2_ } } +TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in1x2x2x2_bfyx_stride2_pad1_group16_ofm2) { + // Test for depthwise separable optimization, there are 16 joined weights and biases (group 16) + // data is similar as in basic_wsiz2x2_in1x2x2x2_bfyx_stride2_pad1_split2_depthwise_sep_opt_ofm2 + + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 16, 2, 2 } }); + set_values(input, + { 8.f, 0.5f, 6.f, 9.f, 1.f, 3.f, 2.f, 4.f, + 8.f, 0.5f, 6.f, 9.f, 1.f, 3.f, 2.f, 4.f, + 8.f, 0.5f, 6.f, 9.f, 1.f, 3.f, 2.f, 4.f, + 8.f, 0.5f, 6.f, 9.f, 1.f, 3.f, 2.f, 4.f, + 8.f, 0.5f, 6.f, 9.f, 1.f, 3.f, 2.f, 4.f, + 8.f, 0.5f, 6.f, 9.f, 1.f, 3.f, 2.f, 4.f, + 8.f, 0.5f, 6.f, 9.f, 1.f, 3.f, 2.f, 4.f, + 8.f, 0.5f, 6.f, 9.f, 1.f, 3.f, 2.f, 4.f + }); + + topology topology(input_layout("input", input.get_layout())); + + std::vector weights_vec; + std::vector bias_vec; + + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 32, 1, 2, 2 } }); + auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 32, 1 } }); + + set_values(weights, + { + -2.f, 2.f, 7.f, -0.5f, -2.f, 2.f, 7.f, -0.5f, + -4.f, 1.f, -9.f, -7.f, -4.f, 1.f, -9.f, -7.f, + -2.f, 2.f, 7.f, -0.5f, -2.f, 2.f, 7.f, -0.5f, + -4.f, 1.f, -9.f, -7.f, -4.f, 1.f, -9.f, -7.f, + -2.f, 2.f, 7.f, -0.5f, -2.f, 2.f, 7.f, -0.5f, + -4.f, 1.f, -9.f, -7.f, -4.f, 1.f, -9.f, -7.f, + -2.f, 2.f, 7.f, -0.5f, -2.f, 2.f, 7.f, -0.5f, + -4.f, 1.f, -9.f, -7.f, -4.f, 1.f, -9.f, -7.f, + -2.f, 2.f, 7.f, -0.5f, -2.f, 2.f, 7.f, -0.5f, + -4.f, 1.f, -9.f, -7.f, -4.f, 1.f, -9.f, -7.f, + -2.f, 2.f, 7.f, -0.5f, -2.f, 2.f, 7.f, -0.5f, + -4.f, 1.f, -9.f, -7.f, -4.f, 1.f, -9.f, -7.f, + -2.f, 2.f, 7.f, -0.5f, -2.f, 2.f, 7.f, -0.5f, + -4.f, 1.f, -9.f, -7.f, -4.f, 1.f, -9.f, -7.f, + -2.f, 2.f, 7.f, -0.5f, -2.f, 2.f, 7.f, -0.5f, + -4.f, 1.f, -9.f, -7.f, -4.f, 1.f, -9.f, -7.f, + } + ); + + set_values(biases, + { + 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, + 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f + } + ); + + topology.add( + data("weights", weights), + 
data("bias", biases) + ); + + topology.add(deconvolution("deconv", "input", { "weights" }, { "bias" }, 16, { 1, 1, 2, 2 }, { 0, 0, -1, -1 })); + + network network(engine, topology); + network.set_input_data("input", input); + + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "deconv"); + + auto output_prim = outputs.begin()->second.get_memory(); + + auto output_ptr = output_prim.pointer(); + + std::vector expected_output_vec = { + -3.f, 4.5f, 13.f, -17.f,-3.f, 4.5f, 13.f, -17.f, -8.f, -28.f, 1.f, -17.f, -8.f, -28.f, 1.f, -17.f, + -3.f, 4.5f, 13.f, -17.f,-3.f, 4.5f, 13.f, -17.f, -8.f, -28.f, 1.f, -17.f, -8.f, -28.f, 1.f, -17.f, + -3.f, 4.5f, 13.f, -17.f,-3.f, 4.5f, 13.f, -17.f, -8.f, -28.f, 1.f, -17.f, -8.f, -28.f, 1.f, -17.f, + -3.f, 4.5f, 13.f, -17.f,-3.f, 4.5f, 13.f, -17.f, -8.f, -28.f, 1.f, -17.f, -8.f, -28.f, 1.f, -17.f, + -3.f, 4.5f, 13.f, -17.f,-3.f, 4.5f, 13.f, -17.f, -8.f, -28.f, 1.f, -17.f, -8.f, -28.f, 1.f, -17.f, + -3.f, 4.5f, 13.f, -17.f,-3.f, 4.5f, 13.f, -17.f, -8.f, -28.f, 1.f, -17.f, -8.f, -28.f, 1.f, -17.f, + -3.f, 4.5f, 13.f, -17.f,-3.f, 4.5f, 13.f, -17.f, -8.f, -28.f, 1.f, -17.f, -8.f, -28.f, 1.f, -17.f, + -3.f, 4.5f, 13.f, -17.f,-3.f, 4.5f, 13.f, -17.f, -8.f, -28.f, 1.f, -17.f, -8.f, -28.f, 1.f, -17.f, + }; + + for (unsigned int i = 0; i < expected_output_vec.size(); i++) + { + EXPECT_FLOAT_EQ(expected_output_vec[i], output_ptr[i]); + } +} + + TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in1x6x1x1_bfyx_stride2_pad1_split2_ofm3) { // Filter : 1x1 // Stride : 1x1 @@ -1109,7 +1326,7 @@ TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in1x6x1x1_bfyx_stride2_pad1_split2_ // 6 // -2 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 4, 1, 1 } }); auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 3, 2, 1, 1 } }); @@ -1152,4 +1369,52 @@ TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in1x6x1x1_bfyx_stride2_pad1_split2_ { EXPECT_FLOAT_EQ(expected_output_vec[i], output_ptr[i]); } -} \ No newline at end of file +} + +TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in1x6x1x1_bfyx_stride2_pad1_group2_ofm3) { + // data is similar as in basic_wsiz2x2_in1x6x1x1_bfyx_stride2_pad1_split2_ofm3 + + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 4, 1, 1 } }); + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 6, 2, 1, 1 } }); + auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 6, 1 } }); + + set_values(input, { + 1.5f, 0.5f, 2.0f, -1.0f + }); + set_values(weights, { + -2.0f, 1.0f, 1.0f, 3.0f, 0.5f, 8.0f, + 4.0f, -4.0f, 2.0f, 0.5f, -0.5f, 3.0f + }); + set_values(biases, { + 1.0f, 5.0f, 3.0f, + -1.0f, 2.5f, 2.0f + }); + + topology topology( + input_layout("input", input.get_layout()), + data("weights", weights), + data("biases", biases), + deconvolution("deconv", "input", { "weights" }, { "biases" }, 2, { 1, 1, 1, 1 }, { 0, 0, 0, 0 }) + ); + + network network(engine, topology); + network.set_input_data("input", input); + + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "deconv"); + + auto output_prim = outputs.begin()->second.get_memory(); + + auto output_ptr = output_prim.pointer(); + + std::vector expected_output_vec = { + -1.5f, 8.0f, 7.75f, 11.0f, 6.0f, -2.0f + }; + for (unsigned int i = 0; i < expected_output_vec.size(); i++) + { + EXPECT_FLOAT_EQ(expected_output_vec[i], 
output_ptr[i]); + } +} diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/depth_concatenate_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/depth_concatenate_gpu_test.cpp index e0f08ab..9ffa10c 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/depth_concatenate_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/depth_concatenate_gpu_test.cpp @@ -20,6 +20,10 @@ #include "api/CPP/memory.hpp" #include #include "api/CPP/concatenation.hpp" +#include "api/CPP/convolution.hpp" +#include "api/CPP/data.hpp" +#include "api/CPP/pooling.hpp" +#include "api/CPP/upsampling.hpp" #include #include #include @@ -63,7 +67,7 @@ TEST(depth_concatenate_f32_gpu, test01) { // 0 -0.2 :f4 // - engine engine; + const auto& engine = get_test_engine(); auto input1 = memory::allocate(engine, {data_types::f32, format::yxfb, { 2,2,1,1 }}); auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2,3,1,1 }}); @@ -123,7 +127,7 @@ void concat_basic_with_reorder() // 0 0 :f4 // - engine engine; + const auto& engine = get_test_engine(); auto input1 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2,2,1,1 } }); auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2,3,1,1 } }); auto outs = { 2.0f, 3.0f, 0.0f, 1.0f, 1.0f, 4.0f, -4.0f, -7.0f, 0.0f, 0.0f }; @@ -200,7 +204,7 @@ TEST(depth_concatenate_f32_gpu, test02) { // 0 -0.2 :f7 // - engine engine; + const auto& engine = get_test_engine(); auto input1 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2,2,1,1 } }); auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2,3,1,1 } }); auto input3 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2,3,1,1 } }); @@ -246,12 +250,47 @@ TEST(depth_concatenate_f32_gpu, test02) { EXPECT_FLOAT_EQ(-0.2f, output_ptr[15]); } +TEST(concatenate_f32_gpu, test_concatenation_of_pool_and_unpool) +{ + engine engine; + auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); + auto weights = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1, 1, 2, 1 } }); + + set_values(input1, { 16.0f, 32.0f, 128.0f, 256.0f }); + set_values(weights, { .1f, .2f }); + topology topology; + topology.add(input_layout("input1", input1.get_layout())); + topology.add(pooling("pool1", "input1", + cldnn::pooling_mode::max, + { 1,1,2,1 }, /*kernel*/ + { 1,1,1,1 } /*stride*/ + )); + topology.add(upsampling("unpool1", "input1", 1, 0, upsampling_sample_type::nearest)); + topology.add(concatenation("concat1", { "pool1", "unpool1" }, cldnn::concatenation::along_x)); + topology.add(data("weights", weights)), + topology.add(convolution("conv", "concat1", { "weights" })); + + cldnn::build_options options; + options.set_option(cldnn::build_option::optimize_data(true)); + network network(engine, topology, options); + network.set_input_data("input1", input1); + + auto outputs = network.execute({}); + auto output = outputs.at("conv").get_memory(); + std::vector out_ref = { 6.4f, 8.f, 51.2f, 64.f }; + auto output_ptr = output.pointer(); + for (int i=0; i<4; i++) + { + EXPECT_NEAR(output_ptr[i], out_ref[i], 1e-3); + } +} + TEST(depth_concatenate_f32_gpu, test03_cascade_concat_opt) { // Test for cascade concatenation optimization. // Despite having concatenations one after another and connected to different non padded activation primitives, // graph should remove all concatenations from execution. 
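For context, the usual way such a test proves the concatenations were folded away is to inspect which primitives actually executed. A minimal sketch, assuming clDNN's network::get_executed_primitives() (the hunk itself does not show the assertion):

// After network.execute(), no concatenation node should remain in the executed graph.
auto executed = network.get_executed_primitives();
for (const auto& p : executed)
    EXPECT_TRUE(p.first.find("concat") == std::string::npos);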
- engine engine; + const auto& engine = get_test_engine(); auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1,2,2,1 } }); set_values(input1, { 16.0f, 32.0f, 128.0f, 256.0f }); @@ -305,7 +344,7 @@ TEST(depth_concatenate_f32_gpu, test03_cascade_concat_opt) { TEST(depth_concatenate_f32_gpu, test04_fused_relu) { // 2 inputs of size 3x10x10 concatenated on f axis with fused relu - engine engine; + const auto& engine = get_test_engine(); auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1,3,10,10 } }); auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1,3,10,10 } }); @@ -345,11 +384,10 @@ TEST(depth_concatenate_f32_gpu, test04_fused_relu) { } } - TEST(depth_concatenate_f32_gpu, test05_different_formats) { // 2 inputs of size 3x10x10 concatenated on f axis - engine engine; + const auto& engine = get_test_engine(); auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1,3,2,2 } }); auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1,3,2,2 } }); @@ -397,6 +435,237 @@ TEST(depth_concatenate_f32_gpu, test05_different_formats) { } +TEST(depth_concatenate_i32_gpu, optimize_data01) { + + const auto& engine = get_test_engine(); + build_options build_opt; + auto input = memory::allocate(engine, { data_types::i32, format::bfyx,{ 1,1,1,1 } }); + + topology topology; + topology.add( + input_layout("input", input.get_layout()) + ); + topology.add(cldnn::concatenation("int1", { "input" }, cldnn::concatenation::along_f)); + topology.add(cldnn::concatenation("result1", { "int1" }, cldnn::concatenation::along_f)); + topology.add(cldnn::concatenation("result2", { "int1" }, cldnn::concatenation::along_f)); + + + std::vector input_data = { 4 }; + std::vector out_data = { 4 }; + set_values(input, input_data); + + build_opt.set_option(build_option::optimize_data(true)); + network network(engine, topology, build_opt); + network.set_input_data("input", input); + auto outputs = network.execute(); + + for (auto& it : outputs) + { + auto output_ptr = it.second.get_memory().pointer(); + EXPECT_EQ(output_ptr[0], out_data[0]); + } +} + +TEST(depth_concatenate_i32_gpu, optimize_data02) { + + const auto& engine = get_test_engine(); + build_options build_opt; + auto input1 = memory::allocate(engine, { data_types::i32, format::bfyx,{ 1,1,2,2 } }); + auto input2 = memory::allocate(engine, { data_types::i32, format::bfyx,{ 1,1,2,2 } }); + auto input3 = memory::allocate(engine, { data_types::i32, format::bfyx,{ 1,1,2,2 } }); + auto input4 = memory::allocate(engine, { data_types::i32, format::bfyx,{ 1,1,2,2 } }); + + topology topology; + topology.add( + input_layout("input1", input1.get_layout()) + ); + topology.add( + input_layout("input2", input2.get_layout()) + ); + topology.add( + input_layout("input3", input3.get_layout()) + ); + topology.add( + input_layout("input4", input4.get_layout()) + ); + + topology.add(cldnn::concatenation("concat1", { "input1", "input2" }, cldnn::concatenation::along_x)); + topology.add(cldnn::concatenation("concat2", { "input3", "input4" }, cldnn::concatenation::along_x)); + topology.add(cldnn::concatenation("concat3", { "input2", "input4" }, cldnn::concatenation::along_x)); + + topology.add(cldnn::concatenation("concat4", { "concat1", "concat2" }, cldnn::concatenation::along_x)); + topology.add(cldnn::concatenation("concat5", { "concat2", "concat3" }, cldnn::concatenation::along_x)); + + topology.add(cldnn::concatenation("concat6", { "concat4", "concat5" }, cldnn::concatenation::along_x)); + 
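To see why c6_data (a few lines below) looks the way it does, trace the along_x composition by hand for the first row of each buffer, using the input vectors assigned just after this point: concat1 = input1|input2 = [1 2 5 6], concat2 = input3|input4 = [9 10 12 14], concat3 = input2|input4 = [5 6 12 14]; then concat4 = concat1|concat2 = [1 2 5 6 9 10 12 14], concat5 = concat2|concat3 = [9 10 12 14 5 6 12 14], and concat6 = concat4|concat5 stitches those into the 16-element first row of c6_data. The second row follows the same pattern from each input's second row.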
+ std::vector input_data1 = + { 1, 2, + 3, 4 }; + + std::vector input_data2 = + { 5, 6, + 7, 8 }; + + std::vector input_data3 = + { 9, 10, + 11, 12 }; + + std::vector input_data4 = + { 12, 14, + 15, 16 }; + + std::vector c6_data = + { 1, 2, 5, 6, 9, 10, 12, 14, 9, 10, 12, 14, 5, 6, 12, 14, + 3, 4, 7, 8, 11, 12, 15, 16, 11, 12, 15, 16, 7, 8, 15, 16 }; + + set_values(input1, input_data1); + set_values(input2, input_data2); + set_values(input3, input_data3); + set_values(input4, input_data4); + + build_opt.set_option(build_option::optimize_data(true)); + network network(engine, topology, build_opt); + network.set_input_data("input1", input1); + network.set_input_data("input2", input2); + network.set_input_data("input3", input3); + network.set_input_data("input4", input4); + auto outputs = network.execute(); + + auto output_concat6 = outputs.at("concat6").get_memory().pointer(); + + for (size_t i = 0; i < output_concat6.size(); i++) { + EXPECT_EQ(output_concat6[i], c6_data[i]); + } +} + +TEST(depth_concatenate_i32_gpu, optimize_data03) { + + const auto& engine = get_test_engine(); + build_options build_opt; + auto input1 = memory::allocate(engine, { data_types::i32, format::bfyx,{ 1,1,2,2 } }); + + topology topology; + topology.add( + input_layout("input1", input1.get_layout()) + ); + + topology.add(cldnn::concatenation("concat1", { "input1" }, cldnn::concatenation::along_x)); + + topology.add(cldnn::concatenation("concat2", { "concat1" }, cldnn::concatenation::along_x)); + topology.add(cldnn::concatenation("concat3", { "concat1" }, cldnn::concatenation::along_x)); + + topology.add(cldnn::concatenation("concat4", { "concat3" }, cldnn::concatenation::along_x)); + + std::vector input_data1 = + { 1, 2, + 3, 4 }; + + std::vector output_data = + { 1, 2, + 3, 4 }; + + set_values(input1, input_data1); + + build_opt.set_option(build_option::optimize_data(true)); + network network(engine, topology, build_opt); + network.set_input_data("input1", input1); + + auto outputs = network.execute(); + + for (auto& it : outputs) + { + auto output_ptr = it.second.get_memory().pointer(); + for (size_t i = 0; i < output_ptr.size(); i++) { + EXPECT_EQ(output_ptr[i], output_data[i]); + } + } +} + +TEST(depth_concatenate_i32_gpu, optimize_data04) { + + const auto& engine = get_test_engine(); + build_options build_opt; + auto input1 = memory::allocate(engine, { data_types::i32, format::bfyx,{ 1,1,2,2 } }); + + topology topology; + topology.add( + input_layout("input1", input1.get_layout()) + ); + + topology.add(cldnn::concatenation("concat1", { "input1" }, cldnn::concatenation::along_x)); + + topology.add(cldnn::concatenation("concat2", { "concat1" }, cldnn::concatenation::along_x)); + topology.add(cldnn::concatenation("concat3", { "concat1" }, cldnn::concatenation::along_x)); + + topology.add(cldnn::concatenation("concat4", { "concat2", "concat3" }, cldnn::concatenation::along_x)); + + std::vector input_data1 = + { 1, 2, + 3, 4 }; + + std::vector output_data = + { 1, 2, 1, 2, + 3, 4, 3, 4 }; + + set_values(input1, input_data1); + + build_opt.set_option(build_option::optimize_data(true)); + network network(engine, topology, build_opt); + network.set_input_data("input1", input1); + + auto outputs = network.execute(); + + for (auto& it : outputs) + { + auto output_ptr = it.second.get_memory().pointer(); + for (size_t i = 0; i < output_ptr.size(); i++) { + EXPECT_EQ(output_ptr[i], output_data[i]); + } + } +} + +TEST(depth_concatenate_i32_gpu, optimize_data05) { + + const auto& engine = get_test_engine(); + build_options 
build_opt; + auto input1 = memory::allocate(engine, { data_types::i32, format::bfyx,{ 1,1,2,2 } }); + + topology topology; + topology.add( + input_layout("input1", input1.get_layout()) + ); + + topology.add(cldnn::concatenation("concat1", { "input1" }, cldnn::concatenation::along_x)); + + topology.add(cldnn::concatenation("concat2", { "concat1" }, cldnn::concatenation::along_x)); + topology.add(cldnn::concatenation("concat3", { "concat1" }, cldnn::concatenation::along_x)); + + topology.add(cldnn::concatenation("concat4", { "concat2", "concat3" }, cldnn::concatenation::along_x)); + topology.add(cldnn::concatenation("concat5", { "concat1", "concat4" }, cldnn::concatenation::along_x)); + + std::vector input_data1 = + { 1, 2, + 3, 4 }; + + std::vector c5_data = + { 1, 2, 1, 2, 1, 2, + 3, 4, 3, 4, 3, 4 }; + + set_values(input1, input_data1); + + build_opt.set_option(build_option::optimize_data(true)); + network network(engine, topology, build_opt); + network.set_input_data("input1", input1); + + auto outputs = network.execute(); + + auto output_concat5 = outputs.at("concat5").get_memory().pointer(); + + for (size_t i = 0; i < output_concat5.size(); i++) { + EXPECT_EQ(output_concat5[i], c5_data[i]); + } +} + ////////////////////////////////////////////////////////////////////////////// // // // Exhaustive Negative Matrix tests // @@ -409,7 +678,7 @@ static network setup_depth_concatatenate_network(const std::vector d assert(dts.size() == ts.size()); const size_t sz = ts.size(); - engine engine; + const auto& engine = get_test_engine(); topology topology; std::vector input_names; @@ -504,7 +773,9 @@ public: { std::vector all_generic_params; - for (cldnn::data_types dt : test_data_types()) + auto data_types = test_data_types(); + + for (cldnn::data_types dt : data_types) for (int32_t b : test_batch_sizes) for (tensor & t : test_input_sizes) { diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/depth_to_space_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/depth_to_space_gpu_test.cpp new file mode 100644 index 0000000..49e8dcb --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/depth_to_space_gpu_test.cpp @@ -0,0 +1,308 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
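The new file adds depth_to_space cases for fp16 and fp32. As orientation for the expected vectors below: with block size bs, depth_to_space maps an input of shape (b, f, y, x) to (b, f/(bs*bs), y*bs, x*bs). A reference sketch of the element mapping, inferred from the expected outputs in these tests rather than taken from the patch:

// For block size bs: out[b][c][y][x] = in[b][src_feature(c, y, x, bs)][y / bs][x / bs]
inline size_t src_feature(size_t c, size_t y, size_t x, size_t bs) {
    // interleaved input channels supply the bs x bs sub-blocks of each output tile
    return c * bs * bs + (y % bs) * bs + (x % bs);
}

Checking d1411_bs2 against this: output positions (0,0), (0,1), (1,0), (1,1) pull features 0, 1, 2, 3 of the 1x4x1x1 input, which is exactly the expected 0 1 2 3.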
+
+
+
+///////////////////////////////////////////////////////////////////////////////
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+using namespace cldnn;
+using namespace ::tests;
+
+TEST(depth_to_space_fp16_gpu, d1411_bs2) {
+    // Input : 1x4x1x1
+    // Block size : 2
+    // Output : 1x1x2x2
+    // Input values in fp16
+
+    engine engine;
+
+    auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 1, 4, 1, 1 } });
+    size_t block_size = 2;
+
+    set_values(input1, {
+        FLOAT16(0.0f), FLOAT16(1.0f),
+        FLOAT16(2.0f), FLOAT16(3.0f)
+    });
+
+    topology topology;
+    topology.add(input_layout("Input0", input1.get_layout()));
+    topology.add(
+        depth_to_space("depth_to_space", "Input0", block_size)
+    );
+
+    network network(engine, topology);
+
+    network.set_input_data("Input0", input1);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("depth_to_space").get_memory();
+    auto output_ptr = output.pointer<uint16_t>();
+
+    std::vector<float> expected_results = {
+        0.f, 1.f, 2.f, 3.f
+    };
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
+    }
+}
+
+TEST(depth_to_space_fp16_gpu, d1421_bs2) {
+    // Input : 1x4x2x1
+    // Block size : 2
+    // Output : 1x1x4x2
+    // Input values in fp16
+
+    engine engine;
+
+    auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 1, 4, 1, 2 } });
+    size_t block_size = 2;
+
+    set_values(input1, {
+        FLOAT16(0.0f), FLOAT16(1.0f),
+        FLOAT16(2.0f), FLOAT16(3.0f),
+        FLOAT16(4.0f), FLOAT16(5.0f),
+        FLOAT16(6.0f), FLOAT16(7.0f)
+    });
+
+    topology topology;
+    topology.add(input_layout("Input0", input1.get_layout()));
+    topology.add(
+        depth_to_space("depth_to_space", "Input0", block_size)
+    );
+
+    network network(engine, topology);
+
+    network.set_input_data("Input0", input1);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("depth_to_space").get_memory();
+    auto output_ptr = output.pointer<uint16_t>();
+
+    std::vector<float> expected_results = {
+        0.0f, 2.0f, 4.0f, 6.0f, 1.0f, 3.0f, 5.0f, 7.0f
+    };
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
+    }
+}
+
+TEST(depth_to_space_fp16_gpu, d1933_bs3) {
+    // Input : 1x9x3x3
+    // Block size : 3
+    // Output : 1x1x9x9
+    // Input values in fp16
+
+    engine engine;
+
+    auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 1, 9, 3, 3 } });
+    size_t block_size = 3;
+
+    set_values(input1, {
+        FLOAT16(0.0f), FLOAT16(1.0f), FLOAT16(2.0f), FLOAT16(3.0f), FLOAT16(4.0f),
+        FLOAT16(5.0f), FLOAT16(6.0f), FLOAT16(7.0f), FLOAT16(8.0f), FLOAT16(9.0f),
+        FLOAT16(10.0f), FLOAT16(11.0f), FLOAT16(12.0f), FLOAT16(13.0f), FLOAT16(14.0f),
+        FLOAT16(15.0f), FLOAT16(16.0f), FLOAT16(17.0f), FLOAT16(18.0f), FLOAT16(19.0f),
+        FLOAT16(20.0f), FLOAT16(21.0f), FLOAT16(22.0f), FLOAT16(23.0f), FLOAT16(24.0f),
+        FLOAT16(25.0f), FLOAT16(26.0f), FLOAT16(27.0f), FLOAT16(28.0f), FLOAT16(29.0f),
+        FLOAT16(30.0f), FLOAT16(31.0f), FLOAT16(32.0f), FLOAT16(33.0f), FLOAT16(34.0f),
+        FLOAT16(35.0f), FLOAT16(36.0f), FLOAT16(37.0f), FLOAT16(38.0f), FLOAT16(39.0f),
+        FLOAT16(40.0f), FLOAT16(41.0f), FLOAT16(42.0f), FLOAT16(43.0f), FLOAT16(44.0f),
+        FLOAT16(45.0f), FLOAT16(46.0f), FLOAT16(47.0f), FLOAT16(48.0f), FLOAT16(49.0f),
+        FLOAT16(50.0f), FLOAT16(51.0f), FLOAT16(52.0f), FLOAT16(53.0f), FLOAT16(54.0f),
+        FLOAT16(55.0f), FLOAT16(56.0f), FLOAT16(57.0f), FLOAT16(58.0f), FLOAT16(59.0f),
+        FLOAT16(60.0f), FLOAT16(61.0f), FLOAT16(62.0f), FLOAT16(63.0f), FLOAT16(64.0f),
+        FLOAT16(65.0f), FLOAT16(66.0f), FLOAT16(67.0f), FLOAT16(68.0f), FLOAT16(69.0f),
+        FLOAT16(70.0f), FLOAT16(71.0f), FLOAT16(72.0f), FLOAT16(73.0f), FLOAT16(74.0f),
+        FLOAT16(75.0f), FLOAT16(76.0f), FLOAT16(77.0f), FLOAT16(78.0f), FLOAT16(79.0f),
+        FLOAT16(80.0f)
+    });
+
+    topology topology;
+    topology.add(input_layout("Input0", input1.get_layout()));
+    topology.add(
+        depth_to_space("depth_to_space", "Input0", block_size)
+    );
+
+    network network(engine, topology);
+
+    network.set_input_data("Input0", input1);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("depth_to_space").get_memory();
+    auto output_ptr = output.pointer<uint16_t>();
+
+    std::vector<float> expected_results = {
+        0.0f, 9.0f, 18.0f, 1.0f, 10.0f, 19.0f, 2.0f, 11.0f, 20.0f, 27.0f,
+        36.0f, 45.0f, 28.0f, 37.0f, 46.0f, 29.0f, 38.0f, 47.0f, 54.0f, 63.0f,
+        72.0f, 55.0f, 64.0f, 73.0f, 56.0f, 65.0f, 74.0f, 3.0f, 12.0f, 21.0f,
+        4.0f, 13.0f, 22.0f, 5.0f, 14.0f, 23.0f, 30.0f, 39.0f, 48.0f, 31.0f,
+        40.0f, 49.0f, 32.0f, 41.0f, 50.0f, 57.0f, 66.0f, 75.0f, 58.0f, 67.0f,
+        76.0f, 59.0f, 68.0f, 77.0f, 6.0f, 15.0f, 24.0f, 7.0f, 16.0f, 25.0f,
+        8.0f, 17.0f, 26.0f, 33.0f, 42.0f, 51.0f, 34.0f, 43.0f, 52.0f, 35.0f,
+        44.0f, 53.0f, 60.0f, 69.0f, 78.0f, 61.0f, 70.0f, 79.0f, 62.0f, 71.0f,
+        80.0f
+    };
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
+    }
+}
+
+TEST(depth_to_space_fp32_gpu, d1411_bs2) {
+    // Input : 1x4x1x1
+    // Block size : 2
+    // Output : 1x1x2x2
+    // Input values in fp32
+
+    engine engine;
+
+    auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 4, 1, 1 } });
+    size_t block_size = 2;
+
+    set_values(input1, {
+        0.f, 1.f, 2.f, 3.f
+    });
+
+    topology topology;
+    topology.add(input_layout("Input0", input1.get_layout()));
+    topology.add(
+        depth_to_space("depth_to_space", "Input0", block_size)
+    );
+
+    network network(engine, topology);
+
+    network.set_input_data("Input0", input1);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("depth_to_space").get_memory();
+    auto output_ptr = output.pointer<float>();
+
+    std::vector<float> expected_results = {
+        0.f, 1.f, 2.f, 3.f
+    };
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        EXPECT_EQ(expected_results[i], output_ptr[i]);
+    }
+}
+
+TEST(depth_to_space_fp32_gpu, d1421_bs2) {
+    // Input : 1x4x2x1
+    // Block size : 2
+    // Output : 1x1x4x2
+    // Input values in fp32
+
+    engine engine;
+
+    auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 4, 1, 2 } });
+    size_t block_size = 2;
+
+    set_values(input1, {
+        0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f
+    });
+
+    topology topology;
+    topology.add(input_layout("Input0", input1.get_layout()));
+    topology.add(
+        depth_to_space("depth_to_space", "Input0", block_size)
+    );
+
+    network network(engine, topology);
+
+    network.set_input_data("Input0", input1);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("depth_to_space").get_memory();
+    auto output_ptr = output.pointer<float>();
+
+    std::vector<float> expected_results = {
+        0.f, 2.f, 4.f, 6.f, 1.f, 3.f, 5.f, 7.f
+    };
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        EXPECT_EQ(expected_results[i], output_ptr[i]);
+    }
+}
+
+TEST(depth_to_space_fp32_gpu, d1933_bs3) {
+    // Input : 1x9x3x3
+    // Block size : 3
+    // Output : 1x1x9x9
+    // Input values in fp32
+
+    engine engine;
+
+    auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 9, 3, 3 } });
+    size_t block_size = 3;
+
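+    // 81 consecutive values fill the 1x9x3x3 input; with block size 3 the
+    // nine 3x3 feature planes are interleaved into a single 9x9 plane,
+    // which is what expected_results below encodes.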
+    set_values(input1, {
+        0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f,
+        10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f,
+        20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f,
+        30.0f, 31.0f, 32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f,
+        40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, 48.0f, 49.0f,
+        50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f,
+        60.0f, 61.0f, 62.0f, 63.0f, 64.0f, 65.0f, 66.0f, 67.0f, 68.0f, 69.0f,
+        70.0f, 71.0f, 72.0f, 73.0f, 74.0f, 75.0f, 76.0f, 77.0f, 78.0f, 79.0f,
+        80.0f
+    });
+
+    topology topology;
+    topology.add(input_layout("Input0", input1.get_layout()));
+    topology.add(
+        depth_to_space("depth_to_space", "Input0", block_size)
+    );
+
+    network network(engine, topology);
+
+    network.set_input_data("Input0", input1);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("depth_to_space").get_memory();
+    auto output_ptr = output.pointer<float>();
+
+    std::vector<float> expected_results = {
+        0.0f, 9.0f, 18.0f, 1.0f, 10.0f, 19.0f, 2.0f, 11.0f, 20.0f, 27.0f,
+        36.0f, 45.0f, 28.0f, 37.0f, 46.0f, 29.0f, 38.0f, 47.0f, 54.0f, 63.0f,
+        72.0f, 55.0f, 64.0f, 73.0f, 56.0f, 65.0f, 74.0f, 3.0f, 12.0f, 21.0f,
+        4.0f, 13.0f, 22.0f, 5.0f, 14.0f, 23.0f, 30.0f, 39.0f, 48.0f, 31.0f,
+        40.0f, 49.0f, 32.0f, 41.0f, 50.0f, 57.0f, 66.0f, 75.0f, 58.0f, 67.0f,
+        76.0f, 59.0f, 68.0f, 77.0f, 6.0f, 15.0f, 24.0f, 7.0f, 16.0f, 25.0f,
+        8.0f, 17.0f, 26.0f, 33.0f, 42.0f, 51.0f, 34.0f, 43.0f, 52.0f, 35.0f,
+        44.0f, 53.0f, 60.0f, 69.0f, 78.0f, 61.0f, 70.0f, 79.0f, 62.0f, 71.0f,
+        80.0f
+    };
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        EXPECT_EQ(expected_results[i], output_ptr[i]);
+    }
+}
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/detection_output_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/detection_output_test.cpp
index df2799a..ff920a9 100644
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/detection_output_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/detection_output_test.cpp
@@ -123,6 +123,33 @@ public:
         }
     }
 
+    void init_buffer_sort(cldnn::memory input_buff)
+    {
+        auto input_data_ptr = input_buff.pointer<T>();
+
+        EXPECT_EQ((int)input_buff.count(), 128);
+
+        T* input_data = input_data_ptr.data();
+        input_data[0] = 8;
+        input_data[1] = 3;
+        input_data[16] = 0; input_data[17] = 0; input_data[18] = 0.6f; input_data[19] = 0.55f; input_data[20] = 0.55f; input_data[21] = 0.85f; input_data[22] = 0.85f;
+        input_data[23] = 0; input_data[24] = 0; input_data[25] = 0.4f; input_data[26] = 0.15f; input_data[27] = 0.55f; input_data[28] = 0.45f; input_data[29] = 0.85f;
+        input_data[30] = 0; input_data[31] = 0; input_data[32] = 0.2f; input_data[33] = 0.55f; input_data[34] = 0.15f; input_data[35] = 0.85f; input_data[36] = 0.45f;
+        input_data[37] = 0; input_data[38] = 0; input_data[39] = 0.0f; input_data[40] = 0.15f; input_data[41] = 0.15f; input_data[42] = 0.45f; input_data[43] = 0.45f;
+        input_data[44] = 0; input_data[45] = 1; input_data[46] = 1.0f; input_data[47] = 0.20f; input_data[48] = 0.20f; input_data[49] = 0.50f; input_data[50] = 0.50f;
+        input_data[51] = 0; input_data[52] = 1; input_data[53] = 0.8f; input_data[54] = 0.50f; input_data[55] = 0.20f; input_data[56] = 0.80f; input_data[57] = 0.50f;
+        input_data[58] = 0; input_data[59] = 1; input_data[60] = 0.6f; input_data[61] = 0.20f; input_data[62] = 0.50f; input_data[63] = 0.50f; input_data[64] = 0.80f;
+        input_data[65] = 0; input_data[66] = 1; input_data[67] = 0.4f; input_data[68] = 0.50f; input_data[69] = 0.50f; input_data[70] = 0.80f; input_data[71] = 0.80f;
+        input_data[72] = 1; input_data[73] = 0; input_data[74] = 1.0f; input_data[75] = 0.25f; input_data[76] = 0.25f; input_data[77] = 0.55f; input_data[78] = 0.55f;
+        input_data[79] = 1; input_data[80] = 0; input_data[81] = 0.4f; input_data[82] = 0.45f; input_data[83] = 0.45f; input_data[84] = 0.75f; input_data[85] = 0.75f;
+        input_data[86] = -1; input_data[87] = 0; input_data[88] = 0; input_data[89] = 0; input_data[90] = 0; input_data[91] = 0; input_data[92] = 0;
+        input_data[93] = -1; input_data[94] = 0; input_data[95] = 0; input_data[96] = 0; input_data[97] = 0; input_data[98] = 0; input_data[99] = 0;
+        input_data[100] = 1; input_data[101] = 1; input_data[102] = 0.6f; input_data[103] = 0.40f; input_data[104] = 0.40f; input_data[105] = 0.70f; input_data[106] = 0.70f;
+        input_data[107] = -1; input_data[108] = 0; input_data[109] = 0; input_data[110] = 0; input_data[111] = 0; input_data[112] = 0; input_data[113] = 0;
+        input_data[114] = -1; input_data[115] = 0; input_data[116] = 0; input_data[117] = 0; input_data[118] = 0; input_data[119] = 0; input_data[120] = 0;
+        input_data[121] = -1; input_data[122] = 0; input_data[123] = 0; input_data[124] = 0; input_data[125] = 0; input_data[126] = 0; input_data[127] = 0;
+    }
+
     void check_results(const memory& output, const int num, const std::string values)
     {
         assert(num < output.get_layout().size.spatial[1]);
@@ -145,526 +172,817 @@ public:
             EXPECT_TRUE(floating_point_equal(data[num * output.get_layout().size.spatial[0] + i], (T)(float)atof(items[i].c_str())));
         }
     }
-    static const int num_of_images = 2;
-    static const int num_classes = 2;
-    static const int num_priors = 4;
-    static const int img_size = 300;
-    const float nms_threshold;
-};
-typedef ::testing::Types detection_output_test_types;
-TYPED_TEST_CASE(detection_output_test, detection_output_test_types);
 
+    void setup_basic(bool runOnGPU)
+    {
+        const bool share_location = true;
+        const int num_loc_classes = share_location ? 1 : this->num_classes;
+        const int keep_top_k = 150;
+        const auto& engine = get_test_engine();
+        cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
+        cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
+        cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
 
-TYPED_TEST(detection_output_test, test_setup_basic)
-{
-    const bool share_location = true;
-    const int num_loc_classes = share_location ? 1 : this->num_classes;
-    const int keep_top_k = 150;
+        topology topology;
+        topology.add(input_layout("input_location", input_location.get_layout()));
+        topology.add(input_layout("input_confidence", input_confidence.get_layout()));
+        topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
 
-    cldnn::engine engine;
-    cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
-    cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
-    cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4} });
+        topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k));
 
-    topology topology;
-    topology.add(input_layout("input_location", input_location.get_layout()));
-    topology.add(input_layout("input_confidence", input_confidence.get_layout()));
-    topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+        build_options opts;
+        if (runOnGPU)
+        {
+            opts.set_option(build_option::detection_output_gpu(true));
+        }
 
-    topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k));
-    network network(engine, topology);
-    network.set_input_data("input_location", input_location);
-    network.set_input_data("input_confidence", input_confidence);
-    network.set_input_data("input_prior_box", input_prior_box);
+        network network(engine, topology, opts);
+        network.set_input_data("input_location", input_location);
+        network.set_input_data("input_confidence", input_confidence);
+        network.set_input_data("input_prior_box", input_prior_box);
 
-    auto outputs = network.execute();
-
-    EXPECT_EQ(outputs.size(), size_t(1));
-    EXPECT_EQ(outputs.begin()->first, "detection_output");
-
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
-}
+        auto outputs = network.execute();
 
-TYPED_TEST(detection_output_test, test_forward_share_location)
-{
-    const bool share_location = true;
-    const int num_loc_classes = share_location ? 1 : this->num_classes;
-    const int keep_top_k = 4;
-    const int background_label_id = 0;
+        EXPECT_EQ(outputs.size(), size_t(1));
+        EXPECT_EQ(outputs.begin()->first, "detection_output");
 
-    cldnn::engine engine;
-    cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
-    cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
-    cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4} });
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+    }
 
-    this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
+    void setup_two_layers(bool runOnGPU)
+    {
+        const bool share_location = true;
+        const int num_loc_classes = share_location ? 1 : this->num_classes;
+        const int keep_top_k = 150;
 
-    topology topology;
-    topology.add(input_layout("input_location", input_location.get_layout()));
-    topology.add(input_layout("input_confidence", input_confidence.get_layout()));
-    topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+        const auto& engine = get_test_engine();
+        cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
+        cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
+        cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
 
-    topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
-    network network(engine, topology);
-    network.set_input_data("input_location", input_location);
-    network.set_input_data("input_confidence", input_confidence);
-    network.set_input_data("input_prior_box", input_prior_box);
+        topology topology;
+        topology.add(input_layout("input_location", input_location.get_layout()));
+        topology.add(input_layout("input_confidence", input_confidence.get_layout()));
+        topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
 
-    auto outputs = network.execute();
+        topology.add(detection_output("detection_output_1", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k));
+        topology.add(detection_output("detection_output_2", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k));
 
-    EXPECT_EQ(outputs.size(), size_t(1));
-    EXPECT_EQ(outputs.begin()->first, "detection_output");
+        build_options opts;
+        if (runOnGPU)
+        {
+            opts.set_option(build_option::detection_output_gpu(true));
+        }
 
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+        network network(engine, topology, opts);
+        network.set_input_data("input_location", input_location);
+        network.set_input_data("input_confidence", input_confidence);
+        network.set_input_data("input_prior_box", input_prior_box);
 
-    auto output_prim = outputs.begin()->second.get_memory();
-
-    this->check_results(output_prim, 0, "0 1 1.0 0.15 0.15 0.45 0.45");
-    this->check_results(output_prim, 1, "0 1 0.8 0.55 0.15 0.85 0.45");
-    this->check_results(output_prim, 2, "0 1 0.6 0.15 0.55 0.45 0.85");
-    this->check_results(output_prim, 3, "0 1 0.4 0.55 0.55 0.85 0.85");
-    this->check_results(output_prim, 4, "1 1 0.6 0.45 0.45 0.75 0.75");
-    this->check_results(output_prim, 5, "1 1 0.0 0.25 0.25 0.55 0.55");
-    this->check_results(output_prim, 6, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 7, "-1 0 0 0 0 0 0");
-}
+        auto outputs = network.execute();
 
-TYPED_TEST(detection_output_test, test_forward_num_detections_greater_than_keep_top_k)
-{
-    const bool share_location = true;
-    const int num_loc_classes = share_location ? 1 : this->num_classes;
-    const int keep_top_k = 1;
-    const int background_label_id = 0;
+        EXPECT_EQ(outputs.size(), size_t(2));
+        unsigned i = 1;
+        for (auto it = outputs.begin(); it != outputs.end(); it++)
+        {
 
-    cldnn::engine engine;
-    cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
-    cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
-    cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4} });
+            EXPECT_EQ(it->first, "detection_output_" + std::to_string(i));
 
-    this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
+            EXPECT_EQ(it->second.get_memory().get_layout().size.batch[0], 1);
+            EXPECT_EQ(it->second.get_memory().get_layout().size.feature[0], 1);
+            EXPECT_EQ(it->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
+            EXPECT_EQ(it->second.get_memory().get_layout().size.spatial[0], 7);
+            i++;
+        }
+    }
 
-    topology topology;
-    topology.add(input_layout("input_location", input_location.get_layout()));
-    topology.add(input_layout("input_confidence", input_confidence.get_layout()));
-    topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+    void forward_share_location(bool runOnGPU)
+    {
+        const bool share_location = true;
+        const int num_loc_classes = share_location ?
1 : this->num_classes; + const int keep_top_k = 4; + const int background_label_id = 0; - topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold)); - network network(engine, topology); - network.set_input_data("input_location", input_location); - network.set_input_data("input_confidence", input_confidence); - network.set_input_data("input_prior_box", input_prior_box); + const auto& engine = get_test_engine(); + cldnn::memory input_location = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } }); + cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } }); + cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } }); - auto outputs = network.execute(); + this->init_buffers(input_prior_box, input_confidence, input_location, share_location); - EXPECT_EQ(outputs.size(), size_t(1)); - EXPECT_EQ(outputs.begin()->first, "detection_output"); + topology topology; + topology.add(input_layout("input_location", input_location.get_layout())); + topology.add(input_layout("input_confidence", input_confidence.get_layout())); + topology.add(input_layout("input_prior_box", input_prior_box.get_layout())); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7); + topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold)); - auto output_prim = outputs.begin()->second.get_memory(); + build_options opts; + if (runOnGPU) + { + opts.set_option(build_option::detection_output_gpu(true)); + } - this->check_results(output_prim, 0, "0 1 1.0 0.15 0.15 0.45 0.45"); - this->check_results(output_prim, 1, "1 1 0.6 0.45 0.45 0.75 0.75"); -} + network network(engine, topology, opts); + network.set_input_data("input_location", input_location); + network.set_input_data("input_confidence", input_confidence); + network.set_input_data("input_prior_box", input_prior_box); -TYPED_TEST(detection_output_test, test_forward_num_detections_smaller_than_keep_top_k) -{ - const bool share_location = true; - const int num_loc_classes = share_location ? 
1 : this->num_classes; - const int keep_top_k = 6; - const int background_label_id = 0; + auto outputs = network.execute(); - cldnn::engine engine; - cldnn::memory input_location = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } }); - cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } }); - cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4} }); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "detection_output"); - this->init_buffers(input_prior_box, input_confidence, input_location, share_location); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7); - topology topology; - topology.add(input_layout("input_location", input_location.get_layout())); - topology.add(input_layout("input_confidence", input_confidence.get_layout())); - topology.add(input_layout("input_prior_box", input_prior_box.get_layout())); + auto output_prim = outputs.begin()->second.get_memory(); - topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold)); - network network(engine, topology); - network.set_input_data("input_location", input_location); - network.set_input_data("input_confidence", input_confidence); - network.set_input_data("input_prior_box", input_prior_box); + check_results(output_prim, 0, "0 1 1.0 0.15 0.15 0.45 0.45"); + check_results(output_prim, 1, "0 1 0.8 0.55 0.15 0.85 0.45"); + check_results(output_prim, 2, "0 1 0.6 0.15 0.55 0.45 0.85"); + check_results(output_prim, 3, "0 1 0.4 0.55 0.55 0.85 0.85"); + check_results(output_prim, 4, "1 1 0.6 0.45 0.45 0.75 0.75"); + check_results(output_prim, 5, "1 1 0.0 0.25 0.25 0.55 0.55"); + check_results(output_prim, 6, "-1 0 0 0 0 0 0"); + check_results(output_prim, 7, "-1 0 0 0 0 0 0"); + } - auto outputs = network.execute(); + void forward_num_detections_greater_than_keep_top_k(bool runOnGPU) + { + const bool share_location = true; + const int num_loc_classes = share_location ? 
1 : this->num_classes; + const int keep_top_k = 1; + const int background_label_id = 0; - EXPECT_EQ(outputs.size(), size_t(1)); - EXPECT_EQ(outputs.begin()->first, "detection_output"); + const auto& engine = get_test_engine(); + cldnn::memory input_location = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } }); + cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } }); + cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } }); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7); + this->init_buffers(input_prior_box, input_confidence, input_location, share_location); - auto output_prim = outputs.begin()->second.get_memory(); + topology topology; + topology.add(input_layout("input_location", input_location.get_layout())); + topology.add(input_layout("input_confidence", input_confidence.get_layout())); + topology.add(input_layout("input_prior_box", input_prior_box.get_layout())); - this->check_results(output_prim, 0, "0 1 1.0 0.15 0.15 0.45 0.45"); - this->check_results(output_prim, 1, "0 1 0.8 0.55 0.15 0.85 0.45"); - this->check_results(output_prim, 2, "0 1 0.6 0.15 0.55 0.45 0.85"); - this->check_results(output_prim, 3, "0 1 0.4 0.55 0.55 0.85 0.85"); - this->check_results(output_prim, 4, "1 1 0.6 0.45 0.45 0.75 0.75"); - this->check_results(output_prim, 5, "1 1 0.0 0.25 0.25 0.55 0.55"); - this->check_results(output_prim, 6, "-1 0 0 0 0 0 0"); - this->check_results(output_prim, 7, "-1 0 0 0 0 0 0"); - this->check_results(output_prim, 8, "-1 0 0 0 0 0 0"); - this->check_results(output_prim, 9, "-1 0 0 0 0 0 0"); - this->check_results(output_prim, 10, "-1 0 0 0 0 0 0"); - this->check_results(output_prim, 11, "-1 0 0 0 0 0 0"); -} + topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold)); -TYPED_TEST(detection_output_test, test_forward_share_location_top_k) -{ - const bool share_location = true; - const int num_loc_classes = share_location ? 
1 : this->num_classes; - const int keep_top_k = 2; - const int top_k = 2; - const int background_label_id = 0; + build_options opts; + if (runOnGPU) + { + opts.set_option(build_option::detection_output_gpu(true)); + } - cldnn::engine engine; - cldnn::memory input_location = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } }); - cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } }); - cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } }); + network network(engine, topology, opts); + network.set_input_data("input_location", input_location); + network.set_input_data("input_confidence", input_confidence); + network.set_input_data("input_prior_box", input_prior_box); - this->init_buffers(input_prior_box, input_confidence, input_location, share_location); + auto outputs = network.execute(); - topology topology; - topology.add(input_layout("input_location", input_location.get_layout())); - topology.add(input_layout("input_confidence", input_confidence.get_layout())); - topology.add(input_layout("input_prior_box", input_prior_box.get_layout())); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "detection_output"); - topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k)); - network network(engine, topology); - network.set_input_data("input_location", input_location); - network.set_input_data("input_confidence", input_confidence); - network.set_input_data("input_prior_box", input_prior_box); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7); - auto outputs = network.execute(); + auto output_prim = outputs.begin()->second.get_memory(); - EXPECT_EQ(outputs.size(), size_t(1)); - EXPECT_EQ(outputs.begin()->first, "detection_output"); + check_results(output_prim, 0, "0 1 1.0 0.15 0.15 0.45 0.45"); + check_results(output_prim, 1, "1 1 0.6 0.45 0.45 0.75 0.75"); + } - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7); + void forward_num_detections_smaller_than_keep_top_k(bool runOnGPU) + { + const bool share_location = true; + const int num_loc_classes = share_location ? 
1 : this->num_classes; + const int keep_top_k = 6; + const int background_label_id = 0; - auto output_prim = outputs.begin()->second.get_memory(); + const auto& engine = get_test_engine(); + cldnn::memory input_location = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } }); + cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } }); + cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } }); - this->check_results(output_prim, 0, "0 1 1.0 0.15 0.15 0.45 0.45"); - this->check_results(output_prim, 1, "0 1 0.8 0.55 0.15 0.85 0.45"); - this->check_results(output_prim, 2, "1 1 0.6 0.45 0.45 0.75 0.75"); - this->check_results(output_prim, 3, "-1 0 0 0 0 0 0"); -} + this->init_buffers(input_prior_box, input_confidence, input_location, share_location); -TYPED_TEST(detection_output_test, test_forward_no_share_location) -{ - const bool share_location = false; - const int num_loc_classes = share_location ? 1 : this->num_classes; - const int keep_top_k = 10; - const int background_label_id = -1; + topology topology; + topology.add(input_layout("input_location", input_location.get_layout())); + topology.add(input_layout("input_confidence", input_confidence.get_layout())); + topology.add(input_layout("input_prior_box", input_prior_box.get_layout())); - cldnn::engine engine; - cldnn::memory input_location = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } }); - cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } }); - cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } }); + topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold)); - this->init_buffers(input_prior_box, input_confidence, input_location, share_location); + build_options opts; + if (runOnGPU) + { + opts.set_option(build_option::detection_output_gpu(true)); + } - topology topology; - topology.add(input_layout("input_location", input_location.get_layout())); - topology.add(input_layout("input_confidence", input_confidence.get_layout())); - topology.add(input_layout("input_prior_box", input_prior_box.get_layout())); + network network(engine, topology, opts); + network.set_input_data("input_location", input_location); + network.set_input_data("input_confidence", input_confidence); + network.set_input_data("input_prior_box", input_prior_box); + + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "detection_output"); + + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7); + + auto output_prim = outputs.begin()->second.get_memory(); + + check_results(output_prim, 0, "0 1 1.0 0.15 0.15 0.45 0.45"); + 
check_results(output_prim, 1, "0 1 0.8 0.55 0.15 0.85 0.45"); + check_results(output_prim, 2, "0 1 0.6 0.15 0.55 0.45 0.85"); + check_results(output_prim, 3, "0 1 0.4 0.55 0.55 0.85 0.85"); + check_results(output_prim, 4, "1 1 0.6 0.45 0.45 0.75 0.75"); + check_results(output_prim, 5, "1 1 0.0 0.25 0.25 0.55 0.55"); + check_results(output_prim, 6, "-1 0 0 0 0 0 0"); + check_results(output_prim, 7, "-1 0 0 0 0 0 0"); + check_results(output_prim, 8, "-1 0 0 0 0 0 0"); + check_results(output_prim, 9, "-1 0 0 0 0 0 0"); + check_results(output_prim, 10, "-1 0 0 0 0 0 0"); + check_results(output_prim, 11, "-1 0 0 0 0 0 0"); + } - topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold)); - network network(engine, topology); - network.set_input_data("input_location", input_location); - network.set_input_data("input_confidence", input_confidence); - network.set_input_data("input_prior_box", input_prior_box); + void test_forward_share_location_top_k(bool runOnGPU) + { + const bool share_location = true; + const int num_loc_classes = share_location ? 1 : this->num_classes; + const int keep_top_k = 2; + const int top_k = 2; + const int background_label_id = 0; - auto outputs = network.execute(); + const auto& engine = get_test_engine(); + cldnn::memory input_location = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } }); + cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } }); + cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } }); - EXPECT_EQ(outputs.size(), size_t(1)); - EXPECT_EQ(outputs.begin()->first, "detection_output"); + this->init_buffers(input_prior_box, input_confidence, input_location, share_location); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7); + topology topology; + topology.add(input_layout("input_location", input_location.get_layout())); + topology.add(input_layout("input_confidence", input_confidence.get_layout())); + topology.add(input_layout("input_prior_box", input_prior_box.get_layout())); - auto output_prim = outputs.begin()->second.get_memory(); + topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k)); - this->check_results(output_prim, 0, "0 0 0.6 0.55 0.55 0.85 0.85"); - this->check_results(output_prim, 1, "0 0 0.4 0.15 0.55 0.45 0.85"); - this->check_results(output_prim, 2, "0 0 0.2 0.55 0.15 0.85 0.45"); - this->check_results(output_prim, 3, "0 0 0.0 0.15 0.15 0.45 0.45"); - this->check_results(output_prim, 4, "0 1 1.0 0.20 0.20 0.50 0.50"); - this->check_results(output_prim, 5, "0 1 0.8 0.50 0.20 0.80 0.50"); - this->check_results(output_prim, 6, "0 1 0.6 0.20 0.50 0.50 0.80"); - this->check_results(output_prim, 7, "0 1 0.4 0.50 0.50 0.80 0.80"); - this->check_results(output_prim, 8, "1 0 1.0 0.25 
0.25 0.55 0.55"); - this->check_results(output_prim, 9, "1 0 0.4 0.45 0.45 0.75 0.75"); - this->check_results(output_prim, 10, "1 1 0.6 0.40 0.40 0.70 0.70"); - this->check_results(output_prim, 11, "-1 0 0 0 0 0 0"); - this->check_results(output_prim, 12, "-1 0 0 0 0 0 0"); - this->check_results(output_prim, 13, "-1 0 0 0 0 0 0"); - this->check_results(output_prim, 14, "-1 0 0 0 0 0 0"); - this->check_results(output_prim, 15, "-1 0 0 0 0 0 0"); - this->check_results(output_prim, 16, "-1 0 0 0 0 0 0"); - this->check_results(output_prim, 17, "-1 0 0 0 0 0 0"); - this->check_results(output_prim, 18, "-1 0 0 0 0 0 0"); - this->check_results(output_prim, 19, "-1 0 0 0 0 0 0"); -} + build_options opts; + if (runOnGPU) + { + opts.set_option(build_option::detection_output_gpu(true)); + } -TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k) -{ - const bool share_location = false; - const int num_loc_classes = share_location ? 1 : this->num_classes; - const int keep_top_k = 4; - const int background_label_id = -1; - const int top_k = 2; + network network(engine, topology, opts); + network.set_input_data("input_location", input_location); + network.set_input_data("input_confidence", input_confidence); + network.set_input_data("input_prior_box", input_prior_box); - cldnn::engine engine; - cldnn::memory input_location = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } }); - cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } }); - cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } }); + auto outputs = network.execute(); - this->init_buffers(input_prior_box, input_confidence, input_location, share_location); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "detection_output"); - topology topology; - topology.add(input_layout("input_location", input_location.get_layout())); - topology.add(input_layout("input_confidence", input_confidence.get_layout())); - topology.add(input_layout("input_prior_box", input_prior_box.get_layout())); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7); - topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k)); - network network(engine, topology); - network.set_input_data("input_location", input_location); - network.set_input_data("input_confidence", input_confidence); - network.set_input_data("input_prior_box", input_prior_box); + auto output_prim = outputs.begin()->second.get_memory(); - auto outputs = network.execute(); + check_results(output_prim, 0, "0 1 1.0 0.15 0.15 0.45 0.45"); + check_results(output_prim, 1, "0 1 0.8 0.55 0.15 0.85 0.45"); + check_results(output_prim, 2, "1 1 0.6 0.45 0.45 0.75 0.75"); + check_results(output_prim, 3, "-1 0 0 0 0 0 0"); + } - EXPECT_EQ(outputs.size(), size_t(1)); - EXPECT_EQ(outputs.begin()->first, "detection_output"); + void forward_no_share_location(bool 
runOnGPU) + { + const bool share_location = false; + const int num_loc_classes = share_location ? 1 : this->num_classes; + const int keep_top_k = 10; + const int background_label_id = -1; - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7); + const auto& engine = get_test_engine(); + cldnn::memory input_location = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } }); + cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } }); + cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } }); - auto output_prim = outputs.begin()->second.get_memory(); + this->init_buffers(input_prior_box, input_confidence, input_location, share_location); - this->check_results(output_prim, 0, "0 0 0.6 0.55 0.55 0.85 0.85"); - this->check_results(output_prim, 1, "0 0 0.4 0.15 0.55 0.45 0.85"); - this->check_results(output_prim, 2, "0 1 1.0 0.20 0.20 0.50 0.50"); - this->check_results(output_prim, 3, "0 1 0.8 0.50 0.20 0.80 0.50"); - this->check_results(output_prim, 4, "1 0 1.0 0.25 0.25 0.55 0.55"); - this->check_results(output_prim, 5, "1 1 0.6 0.40 0.40 0.70 0.70"); - this->check_results(output_prim, 6, "-1 0 0 0 0 0 0"); - this->check_results(output_prim, 7, "-1 0 0 0 0 0 0"); -} + topology topology; + topology.add(input_layout("input_location", input_location.get_layout())); + topology.add(input_layout("input_confidence", input_confidence.get_layout())); + topology.add(input_layout("input_prior_box", input_prior_box.get_layout())); -TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0) -{ - const bool share_location = false; - const int num_loc_classes = share_location ? 
1 : this->num_classes; - const int keep_top_k = 5; - const int background_label_id = 0; + topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold)); - cldnn::engine engine; - cldnn::memory input_location = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } }); - cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } }); - cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } }); + build_options opts; + if (runOnGPU) + { + opts.set_option(build_option::detection_output_gpu(true)); + } - this->init_buffers(input_prior_box, input_confidence, input_location, share_location); + network network(engine, topology, opts); + network.set_input_data("input_location", input_location); + network.set_input_data("input_confidence", input_confidence); + network.set_input_data("input_prior_box", input_prior_box); + + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "detection_output"); + + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7); + + auto output_prim = outputs.begin()->second.get_memory(); + + check_results(output_prim, 0, "0 0 0.6 0.55 0.55 0.85 0.85"); + check_results(output_prim, 1, "0 0 0.4 0.15 0.55 0.45 0.85"); + check_results(output_prim, 2, "0 0 0.2 0.55 0.15 0.85 0.45"); + check_results(output_prim, 3, "0 0 0.0 0.15 0.15 0.45 0.45"); + check_results(output_prim, 4, "0 1 1.0 0.20 0.20 0.50 0.50"); + check_results(output_prim, 5, "0 1 0.8 0.50 0.20 0.80 0.50"); + check_results(output_prim, 6, "0 1 0.6 0.20 0.50 0.50 0.80"); + check_results(output_prim, 7, "0 1 0.4 0.50 0.50 0.80 0.80"); + check_results(output_prim, 8, "1 0 1.0 0.25 0.25 0.55 0.55"); + check_results(output_prim, 9, "1 0 0.4 0.45 0.45 0.75 0.75"); + check_results(output_prim, 10, "1 1 0.6 0.40 0.40 0.70 0.70"); + check_results(output_prim, 11, "-1 0 0 0 0 0 0"); + check_results(output_prim, 12, "-1 0 0 0 0 0 0"); + check_results(output_prim, 13, "-1 0 0 0 0 0 0"); + check_results(output_prim, 14, "-1 0 0 0 0 0 0"); + check_results(output_prim, 15, "-1 0 0 0 0 0 0"); + check_results(output_prim, 16, "-1 0 0 0 0 0 0"); + check_results(output_prim, 17, "-1 0 0 0 0 0 0"); + check_results(output_prim, 18, "-1 0 0 0 0 0 0"); + check_results(output_prim, 19, "-1 0 0 0 0 0 0"); + } - topology topology; - topology.add(input_layout("input_location", input_location.get_layout())); - topology.add(input_layout("input_confidence", input_confidence.get_layout())); - topology.add(input_layout("input_prior_box", input_prior_box.get_layout())); + void forward_no_share_location_top_k(bool runOnGPU) + { + const bool share_location = false; + const int num_loc_classes = share_location ? 
1 : this->num_classes; + const int keep_top_k = 4; + const int background_label_id = -1; + const int top_k = 2; - topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold)); - network network(engine, topology); - network.set_input_data("input_location", input_location); - network.set_input_data("input_confidence", input_confidence); - network.set_input_data("input_prior_box", input_prior_box); + const auto& engine = get_test_engine(); + cldnn::memory input_location = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } }); + cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } }); + cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } }); - auto outputs = network.execute(); + this->init_buffers(input_prior_box, input_confidence, input_location, share_location); - EXPECT_EQ(outputs.size(), size_t(1)); - EXPECT_EQ(outputs.begin()->first, "detection_output"); + topology topology; + topology.add(input_layout("input_location", input_location.get_layout())); + topology.add(input_layout("input_confidence", input_confidence.get_layout())); + topology.add(input_layout("input_prior_box", input_prior_box.get_layout())); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images); - EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7); + topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k)); - auto output_prim = outputs.begin()->second.get_memory(); + build_options opts; + if (runOnGPU) + { + opts.set_option(build_option::detection_output_gpu(true)); + } - this->check_results(output_prim, 0, "0 1 1.0 0.20 0.20 0.50 0.50"); - this->check_results(output_prim, 1, "0 1 0.8 0.50 0.20 0.80 0.50"); - this->check_results(output_prim, 2, "0 1 0.6 0.20 0.50 0.50 0.80"); - this->check_results(output_prim, 3, "0 1 0.4 0.50 0.50 0.80 0.80"); - this->check_results(output_prim, 4, "1 1 0.6 0.40 0.40 0.70 0.70"); - this->check_results(output_prim, 5, "-1 0 0 0 0 0 0"); - this->check_results(output_prim, 6, "-1 0 0 0 0 0 0"); - this->check_results(output_prim, 7, "-1 0 0 0 0 0 0"); - this->check_results(output_prim, 8, "-1 0 0 0 0 0 0"); - this->check_results(output_prim, 9, "-1 0 0 0 0 0 0"); + network network(engine, topology, opts); + network.set_input_data("input_location", input_location); + network.set_input_data("input_confidence", input_confidence); + network.set_input_data("input_prior_box", input_prior_box); + + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "detection_output"); + + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * 
this->num_of_images); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7); + + auto output_prim = outputs.begin()->second.get_memory(); + + check_results(output_prim, 0, "0 0 0.6 0.55 0.55 0.85 0.85"); + check_results(output_prim, 1, "0 0 0.4 0.15 0.55 0.45 0.85"); + check_results(output_prim, 2, "0 1 1.0 0.20 0.20 0.50 0.50"); + check_results(output_prim, 3, "0 1 0.8 0.50 0.20 0.80 0.50"); + check_results(output_prim, 4, "1 0 1.0 0.25 0.25 0.55 0.55"); + check_results(output_prim, 5, "1 1 0.6 0.40 0.40 0.70 0.70"); + check_results(output_prim, 6, "-1 0 0 0 0 0 0"); + check_results(output_prim, 7, "-1 0 0 0 0 0 0"); + } + + void forward_no_share_location_neg_0(bool runOnGPU) + { + const bool share_location = false; + const int num_loc_classes = share_location ? 1 : this->num_classes; + const int keep_top_k = 5; + const int background_label_id = 0; + + const auto& engine = get_test_engine(); + cldnn::memory input_location = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } }); + cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } }); + cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } }); + + this->init_buffers(input_prior_box, input_confidence, input_location, share_location); + + topology topology; + topology.add(input_layout("input_location", input_location.get_layout())); + topology.add(input_layout("input_confidence", input_confidence.get_layout())); + topology.add(input_layout("input_prior_box", input_prior_box.get_layout())); + + topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold)); + + build_options opts; + if (runOnGPU) + { + opts.set_option(build_option::detection_output_gpu(true)); + } + + network network(engine, topology, opts); + network.set_input_data("input_location", input_location); + network.set_input_data("input_confidence", input_confidence); + network.set_input_data("input_prior_box", input_prior_box); + + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "detection_output"); + + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images); + EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7); + + auto output_prim = outputs.begin()->second.get_memory(); + + check_results(output_prim, 0, "0 1 1.0 0.20 0.20 0.50 0.50"); + check_results(output_prim, 1, "0 1 0.8 0.50 0.20 0.80 0.50"); + check_results(output_prim, 2, "0 1 0.6 0.20 0.50 0.50 0.80"); + check_results(output_prim, 3, "0 1 0.4 0.50 0.50 0.80 0.80"); + check_results(output_prim, 4, "1 1 0.6 0.40 0.40 0.70 0.70"); + check_results(output_prim, 5, "-1 0 0 0 0 0 0"); + check_results(output_prim, 6, "-1 0 0 0 0 0 0"); + check_results(output_prim, 7, "-1 0 0 0 0 0 0"); + check_results(output_prim, 8, "-1 0 0 0 0 0 0"); + check_results(output_prim, 9, "-1 0 0 0 0 0 0"); + } + + void forward_no_share_location_neg_0_top_k(bool runOnGPU) + { + const bool share_location = 
+        const int num_loc_classes = share_location ? 1 : this->num_classes;
+        const int keep_top_k = 2;
+        const int background_label_id = 0;
+        const int top_k = 2;
+
+        const auto& engine = get_test_engine();
+        cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
+        cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
+        cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
+
+        this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
+
+        topology topology;
+        topology.add(input_layout("input_location", input_location.get_layout()));
+        topology.add(input_layout("input_confidence", input_confidence.get_layout()));
+        topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+
+        topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
+
+        build_options opts;
+        if (runOnGPU)
+        {
+            opts.set_option(build_option::detection_output_gpu(true));
+        }
+
+        network network(engine, topology, opts);
+        network.set_input_data("input_location", input_location);
+        network.set_input_data("input_confidence", input_confidence);
+        network.set_input_data("input_prior_box", input_prior_box);
+
+        auto outputs = network.execute();
+
+        EXPECT_EQ(outputs.size(), size_t(1));
+        EXPECT_EQ(outputs.begin()->first, "detection_output");
+
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+
+        auto output_prim = outputs.begin()->second.get_memory();
+
+        check_results(output_prim, 0, "0 1 1.0 0.20 0.20 0.50 0.50");
+        check_results(output_prim, 1, "0 1 0.8 0.50 0.20 0.80 0.50");
+        check_results(output_prim, 2, "1 1 0.6 0.40 0.40 0.70 0.70");
+        check_results(output_prim, 3, "-1 0 0 0 0 0 0");
+    }
+
+    void forward_no_share_location_top_k_input_padding(bool runOnGPU)
+    {
+        const bool share_location = false;
+        const int num_loc_classes = share_location ? 1 : this->num_classes;
+        const int keep_top_k = 4;
+        const int background_label_id = -1;
+        const int top_k = 2;
+
+        const auto& engine = get_test_engine();
+        cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
+        cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
+        cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
+
+        this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
+        topology topology;
+        topology.add(input_layout("input_location", input_location.get_layout()));
+        topology.add(input_layout("input_confidence", input_confidence.get_layout()));
+        topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+        topology.add(reorder("input_location_padded", "input_location", input_location.get_layout().with_padding({ { 0, 0, 12, 3 },{ 0, 0, 5, 11 } })));
+        topology.add(reorder("input_confidence_padded", "input_confidence", input_location.get_layout().with_padding({ { 0, 0, 2, 7 },{ 0, 0, 13, 1 } })));
+
+        topology.add(detection_output("detection_output", "input_location_padded", "input_confidence_padded", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
+
+        build_options opts;
+        if (runOnGPU)
+        {
+            opts.set_option(build_option::detection_output_gpu(true));
+        }
+
+        network network(engine, topology, opts);
+        network.set_input_data("input_location", input_location);
+        network.set_input_data("input_confidence", input_confidence);
+        network.set_input_data("input_prior_box", input_prior_box);
+
+        auto outputs = network.execute();
+
+        EXPECT_EQ(outputs.size(), size_t(1));
+        EXPECT_EQ(outputs.begin()->first, "detection_output");
+
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+
+        auto output_prim = outputs.begin()->second.get_memory();
+
+        check_results(output_prim, 0, "0 0 0.6 0.55 0.55 0.85 0.85");
+        check_results(output_prim, 1, "0 0 0.4 0.15 0.55 0.45 0.85");
+        check_results(output_prim, 2, "0 1 1.0 0.20 0.20 0.50 0.50");
+        check_results(output_prim, 3, "0 1 0.8 0.50 0.20 0.80 0.50");
+        check_results(output_prim, 4, "1 0 1.0 0.25 0.25 0.55 0.55");
+        check_results(output_prim, 5, "1 1 0.6 0.40 0.40 0.70 0.70");
+        check_results(output_prim, 6, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 7, "-1 0 0 0 0 0 0");
+    }
+
+    void test_forward_no_share_location_top_k_faster_rcnn_case(bool runOnGPU)
+    {
+        const bool share_location = false;
+        const int num_loc_classes = share_location ? 1 : this->num_classes;
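+        // Faster R-CNN style priors: variances already encoded in the targets,
+        // five values per prior, coordinates starting at offset 1, normalized.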
+        const int keep_top_k = 4;
+        const int background_label_id = -1;
+        const int top_k = 2;
+        const float eta = 1.0f;
+        const prior_box_code_type code_type = prior_box_code_type::corner;
+        const bool variance_encoded_in_target = true;
+        const float confidence_threshold = -std::numeric_limits<float>::max();
+        const int32_t prior_info_size = 5;
+        const int32_t prior_coordinates_offset = 1;
+        const bool prior_is_normalized = true;
+
+        const auto& engine = get_test_engine();
+        cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
+        cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
+        cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 1, 1, this->num_priors * prior_info_size } });
+
+        this->init_buffers(input_prior_box, input_confidence, input_location, share_location, variance_encoded_in_target,
+            prior_info_size, prior_coordinates_offset, prior_is_normalized);
+
+        topology topology;
+        topology.add(input_layout("input_location", input_location.get_layout()));
+        topology.add(input_layout("input_confidence", input_confidence.get_layout()));
+        topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+        topology.add(reorder("input_location_padded", "input_location", input_location.get_layout().with_padding({ { 0, 0, 12, 3 },{ 0, 0, 5, 11 } })));
+        topology.add(reorder("input_confidence_padded", "input_confidence", input_location.get_layout().with_padding({ { 0, 0, 2, 7 },{ 0, 0, 13, 1 } })));
+
+        topology.add(detection_output("detection_output", "input_location_padded", "input_confidence_padded", "input_prior_box",
+            this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k,
+            eta, code_type, variance_encoded_in_target, confidence_threshold, prior_info_size, prior_coordinates_offset,
+            prior_is_normalized, this->img_size, this->img_size
+        ));
+
+        build_options opts;
+        if (runOnGPU)
+        {
+            opts.set_option(build_option::detection_output_gpu(true));
+        }
+
+        network network(engine, topology, opts);
+        network.set_input_data("input_location", input_location);
+        network.set_input_data("input_confidence", input_confidence);
+        network.set_input_data("input_prior_box", input_prior_box);
+
+        auto outputs = network.execute();
+
+        EXPECT_EQ(outputs.size(), size_t(1));
+        EXPECT_EQ(outputs.begin()->first, "detection_output");
+
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
+        EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+
+        auto output_prim = outputs.begin()->second.get_memory();
+
+        check_results(output_prim, 0, "0 0 0.6 0.55 0.55 0.85 0.85");
+        check_results(output_prim, 1, "0 0 0.4 0.15 0.55 0.45 0.85");
+        check_results(output_prim, 2, "0 1 1.0 0.20 0.20 0.50 0.50");
+        check_results(output_prim, 3, "0 1 0.8 0.50 0.20 0.80 0.50");
+        check_results(output_prim, 4, "1 0 1.0 0.25 0.25 0.55 0.55");
+        check_results(output_prim, 5, "1 1 0.6 0.40 0.40 0.70 0.70");
+        check_results(output_prim, 6, "-1 0 0 0 0 0 0");
+        check_results(output_prim, 7, "-1 0 0 0 0 0 0");
+    }
+
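+    // Dimensions shared by all scenarios above: 2 images, 2 classes,
+    // 4 priors and a 300x300 input.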
+    static const int num_of_images = 2;
+    static const int num_classes = 2;
+    static const int num_priors = 4;
+    static const int img_size = 300;
+    const float nms_threshold;
+};
+
+typedef ::testing::Types detection_output_test_types;
+TYPED_TEST_CASE(detection_output_test, detection_output_test_types);
+
+
+TYPED_TEST(detection_output_test, test_setup_basic)
+{
+    this->setup_basic(false);
 }
 
-TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0_top_k)
+TYPED_TEST(detection_output_test, test_setup_basic_gpu)
 {
-    const bool share_location = false;
-    const int num_loc_classes = share_location ? 1 : this->num_classes;
-    const int keep_top_k = 2;
-    const int background_label_id = 0;
-    const int top_k = 2;
+    this->setup_basic(true);
+}
 
-    cldnn::engine engine;
-    cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
-    cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
-    cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, 2, 1, this->num_priors * 4 } });
+TYPED_TEST(detection_output_test, test_setup_two_layers)
+{
+    this->setup_two_layers(false);
+}
 
-    this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
+TYPED_TEST(detection_output_test, test_setup_two_layers_gpu)
+{
+    this->setup_two_layers(true);
+}
 
-    topology topology;
-    topology.add(input_layout("input_location", input_location.get_layout()));
-    topology.add(input_layout("input_confidence", input_confidence.get_layout()));
-    topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
+TYPED_TEST(detection_output_test, test_forward_share_location)
+{
+    this->forward_share_location(false);
+}
 
-    topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
-    network network(engine, topology);
-    network.set_input_data("input_location", input_location);
-    network.set_input_data("input_confidence", input_confidence);
-    network.set_input_data("input_prior_box", input_prior_box);
+TYPED_TEST(detection_output_test, test_forward_share_location_gpu)
+{
+    this->forward_share_location(true);
+}
 
-    auto outputs = network.execute();
+TYPED_TEST(detection_output_test, test_forward_num_detections_greater_than_keep_top_k)
+{
+    this->forward_num_detections_greater_than_keep_top_k(false);
+}
 
-    EXPECT_EQ(outputs.size(), size_t(1));
-    EXPECT_EQ(outputs.begin()->first, "detection_output");
+TYPED_TEST(detection_output_test, test_forward_num_detections_greater_than_keep_top_k_gpu)
+{
+    this->forward_num_detections_greater_than_keep_top_k(true);
+}
 
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
+TYPED_TEST(detection_output_test, test_forward_num_detections_smaller_than_keep_top_k)
+{
+    this->forward_num_detections_smaller_than_keep_top_k(false);
+}
 
-    auto output_prim = outputs.begin()->second.get_memory();
+TYPED_TEST(detection_output_test, test_forward_num_detections_smaller_than_keep_top_k_gpu)
+{
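+    // Re-runs the same expectations with the GPU detection_output primitive
+    // selected through build_option::detection_output_gpu(true).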
+    this->forward_num_detections_smaller_than_keep_top_k(true);
+}
 
-    this->check_results(output_prim, 0, "0 1 1.0 0.20 0.20 0.50 0.50");
-    this->check_results(output_prim, 1, "0 1 0.8 0.50 0.20 0.80 0.50");
-    this->check_results(output_prim, 2, "1 1 0.6 0.40 0.40 0.70 0.70");
-    this->check_results(output_prim, 3, "-1 0 0 0 0 0 0");
+TYPED_TEST(detection_output_test, test_forward_share_location_top_k)
+{
+    this->test_forward_share_location_top_k(false);
 }
 
-TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_input_padding)
+TYPED_TEST(detection_output_test, test_forward_share_location_top_k_gpu)
 {
-    const bool share_location = false;
-    const int num_loc_classes = share_location ? 1 : this->num_classes;
-    const int keep_top_k = 4;
-    const int background_label_id = -1;
-    const int top_k = 2;
+    this->test_forward_share_location_top_k(true);
+}
 
-    cldnn::engine engine;
-    cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
-    cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
-    cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 2, 1, this->num_priors * 4 } });
+TYPED_TEST(detection_output_test, test_forward_no_share_location)
+{
+    this->forward_no_share_location(false);
+}
 
-    this->init_buffers(input_prior_box, input_confidence, input_location, share_location);
-    topology topology;
-    topology.add(input_layout("input_location", input_location.get_layout()));
-    topology.add(input_layout("input_confidence", input_confidence.get_layout()));
-    topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
-    topology.add(reorder("input_location_padded", "input_location", input_location.get_layout().with_padding({ { 0, 0, 12, 3 },{ 0, 0, 5, 11 } })));
-    topology.add(reorder("input_confidence_padded", "input_confidence", input_location.get_layout().with_padding({ { 0, 0, 2, 7 },{ 0, 0, 13, 1 } })));
+TYPED_TEST(detection_output_test, test_forward_no_share_location_gpu)
+{
+    this->forward_no_share_location(true);
+}
 
-    topology.add(detection_output("detection_output", "input_location_padded", "input_confidence_padded", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
-    network network(engine, topology);
-    network.set_input_data("input_location", input_location);
-    network.set_input_data("input_confidence", input_confidence);
-    network.set_input_data("input_prior_box", input_prior_box);
+TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k)
+{
+    this->forward_no_share_location_top_k(false);
+}
 
-    auto outputs = network.execute();
+TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_gpu)
+{
+    this->forward_no_share_location_top_k(true);
+}
 
-    EXPECT_EQ(outputs.size(), size_t(1));
-    EXPECT_EQ(outputs.begin()->first, "detection_output");
+TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0)
+{
+    this->forward_no_share_location_neg_0(false);
+}
 
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
-    EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
-
EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7); +TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0_gpu) +{ + this->forward_no_share_location_neg_0(true); +} - auto output_prim = outputs.begin()->second.get_memory(); +TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0_top_k) +{ + this->forward_no_share_location_neg_0_top_k(false); +} - this->check_results(output_prim, 0, "0 0 0.6 0.55 0.55 0.85 0.85"); - this->check_results(output_prim, 1, "0 0 0.4 0.15 0.55 0.45 0.85"); - this->check_results(output_prim, 2, "0 1 1.0 0.20 0.20 0.50 0.50"); - this->check_results(output_prim, 3, "0 1 0.8 0.50 0.20 0.80 0.50"); - this->check_results(output_prim, 4, "1 0 1.0 0.25 0.25 0.55 0.55"); - this->check_results(output_prim, 5, "1 1 0.6 0.40 0.40 0.70 0.70"); - this->check_results(output_prim, 6, "-1 0 0 0 0 0 0"); - this->check_results(output_prim, 7, "-1 0 0 0 0 0 0"); +TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0_top_k_gpu) +{ + this->forward_no_share_location_neg_0_top_k(true); +} + +TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_input_padding) +{ + this->forward_no_share_location_top_k_input_padding(false); +} + +TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_input_padding_gpu) +{ + this->forward_no_share_location_top_k_input_padding(true); } TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_faster_rcnn_case) { + this->test_forward_no_share_location_top_k_faster_rcnn_case(false); +} + +TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_faster_rcnn_case_gpu) +{ + this->test_forward_no_share_location_top_k_faster_rcnn_case(true); +} + +TYPED_TEST(detection_output_test, test_detection_output_sort_gpu) +{ const bool share_location = false; const int num_loc_classes = share_location ? 
1 : this->num_classes;
-    const int keep_top_k = 4;
+    const int keep_top_k = 10;
     const int background_label_id = -1;
-    const int top_k = 2;
-    const float eta = 1.0f;
-    const prior_box_code_type code_type = prior_box_code_type::corner;
-    const bool variance_encoded_in_target = true;
-    const float confidence_threshold = -std::numeric_limits<float>::max();
-    const int32_t prior_info_size = 5;
-    const int32_t prior_coordinates_offset = 1;
-    const bool prior_is_normalized = true;
-
-    cldnn::engine engine;
-    cldnn::memory input_location = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * num_loc_classes * 4, 1, 1 } });
-    cldnn::memory input_confidence = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ this->num_of_images, this->num_priors * this->num_classes, 1, 1 } });
-    cldnn::memory input_prior_box = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 1, 1, this->num_priors * prior_info_size } });
-
-    this->init_buffers(input_prior_box, input_confidence, input_location, share_location, variance_encoded_in_target,
-        prior_info_size, prior_coordinates_offset, prior_is_normalized);
+    const int top_k = -1;
+
+    const unsigned out_row_size = 7;
+    const unsigned score_space = ((this->num_of_images + 15) / 16) * 16;
+    int input_size = this->num_of_images * num_loc_classes * this->num_priors * out_row_size + score_space;
+
+    const auto& engine = get_test_engine();
+    cldnn::memory input_buff = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 1, 1, input_size } });
+
+    this->init_buffer_sort(input_buff);
 
     topology topology;
-    topology.add(input_layout("input_location", input_location.get_layout()));
-    topology.add(input_layout("input_confidence", input_confidence.get_layout()));
-    topology.add(input_layout("input_prior_box", input_prior_box.get_layout()));
-    topology.add(reorder("input_location_padded", "input_location", input_location.get_layout().with_padding({ { 0, 0, 12, 3 },{ 0, 0, 5, 11 } })));
-    topology.add(reorder("input_confidence_padded", "input_confidence", input_location.get_layout().with_padding({ { 0, 0, 2, 7 },{ 0, 0, 13, 1 } })));
-
-    topology.add(detection_output("detection_output", "input_location_padded", "input_confidence_padded", "input_prior_box",
-        this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k,
-        eta, code_type, variance_encoded_in_target, confidence_threshold, prior_info_size, prior_coordinates_offset,
-        prior_is_normalized, this->img_size, this->img_size
-    ));
+    topology.add(input_layout("input_location", input_buff.get_layout()));
+
+    topology.add(detection_output_sort("detection_output_sort", "input_location", this->num_of_images, this->num_classes, keep_top_k, share_location, top_k, background_label_id));
 
     network network(engine, topology);
-    network.set_input_data("input_location", input_location);
-    network.set_input_data("input_confidence", input_confidence);
-    network.set_input_data("input_prior_box", input_prior_box);
+    network.set_input_data("input_location", input_buff);
 
     auto outputs = network.execute();
 
     EXPECT_EQ(outputs.size(), size_t(1));
-    EXPECT_EQ(outputs.begin()->first, "detection_output");
+    EXPECT_EQ(outputs.begin()->first, "detection_output_sort");
 
     EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
     EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
@@ -675,11 +993,23 @@ TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_faster_rc
     this->check_results(output_prim, 0, "0 0 0.6 0.55 0.55 0.85 0.85");
     this->check_results(output_prim, 1, "0 0 0.4 0.15 0.55 0.45 0.85");
-    this->check_results(output_prim, 2, "0 1 1.0 0.20 0.20 0.50 0.50");
-    this->check_results(output_prim, 3, "0 1 0.8 0.50 0.20 0.80 0.50");
-    this->check_results(output_prim, 4, "1 0 1.0 0.25 0.25 0.55 0.55");
-    this->check_results(output_prim, 5, "1 1 0.6 0.40 0.40 0.70 0.70");
-    this->check_results(output_prim, 6, "-1 0 0 0 0 0 0");
-    this->check_results(output_prim, 7, "-1 0 0 0 0 0 0");
+    this->check_results(output_prim, 2, "0 0 0.2 0.55 0.15 0.85 0.45");
+    this->check_results(output_prim, 3, "0 0 0.0 0.15 0.15 0.45 0.45");
+    this->check_results(output_prim, 4, "0 1 1.0 0.20 0.20 0.50 0.50");
+    this->check_results(output_prim, 5, "0 1 0.8 0.50 0.20 0.80 0.50");
+    this->check_results(output_prim, 6, "0 1 0.6 0.20 0.50 0.50 0.80");
+    this->check_results(output_prim, 7, "0 1 0.4 0.50 0.50 0.80 0.80");
+    this->check_results(output_prim, 8, "1 0 1.0 0.25 0.25 0.55 0.55");
+    this->check_results(output_prim, 9, "1 0 0.4 0.45 0.45 0.75 0.75");
+    this->check_results(output_prim, 10, "1 1 0.6 0.40 0.40 0.70 0.70");
+    this->check_results(output_prim, 11, "-1 0 0 0 0 0 0");
+    this->check_results(output_prim, 12, "-1 0 0 0 0 0 0");
+    this->check_results(output_prim, 13, "-1 0 0 0 0 0 0");
+    this->check_results(output_prim, 14, "-1 0 0 0 0 0 0");
+    this->check_results(output_prim, 15, "-1 0 0 0 0 0 0");
+    this->check_results(output_prim, 16, "-1 0 0 0 0 0 0");
+    this->check_results(output_prim, 17, "-1 0 0 0 0 0 0");
+    this->check_results(output_prim, 18, "-1 0 0 0 0 0 0");
+    this->check_results(output_prim, 19, "-1 0 0 0 0 0 0");
 }
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/eltwise_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/eltwise_gpu_test.cpp
index 750aaa5..417ab07 100644
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/eltwise_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/eltwise_gpu_test.cpp
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2019 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include
 #include "test_utils/test_utils.h"
 
 namespace cldnn
@@ -97,7 +98,7 @@ void generic_eltwise_test(cldnn::format test_input_fmt, int input_b, int input_f
     VF<T> input1_rnd_vec = flatten_4d<T>(test_input_fmt, input1_rnd);
     VF<T> input2_rnd_vec = flatten_4d<T>(test_input_fmt, input2_rnd);
 
-    engine engine;
+    const auto& engine = get_test_engine();
     tensor input_tensor( input_b, input_f, input_x, input_y );
     auto input1 = memory::allocate(engine, { type_to_data_type<T>::value, test_input_fmt, input_tensor });
     auto input2 = memory::allocate(engine, { type_to_data_type<T>::value, test_input_fmt, input_tensor });
@@ -157,707 +158,2283 @@ void generic_eltwise_test(cldnn::format test_input_fmt, int input_b, int input_f
         << "type = " << (sizeof(T) == 2 ? "float16" : "float32") << std::endl;
 }
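+// The rewritten tests below exercise the comparison and logic eltwise modes
+// (eq, ne, lt, le, gt, ge, logic_and, logic_or, logic_xor): f32 inputs are
+// compared element-wise and the output holds 0/1 flags.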
"float16" : "float32") << std::endl; } -TEST(eltwise_gpu_f32, add_basic_in4x4x2x2) { - // Input2 : 2x2x2 +TEST(eltwise_gpu_f32, equal_in2_float_out1_int) { + // Input2 : 2x2x2x2 // Input : 2x2x2x2 // Output : 2x2x2x2 // Input: - // f0: b0: 1 2 b1: 0 0 - // f0: b0: 3 4 b1: 0.5 -0.5 - // f1: b0: 5 6 b1: 1.5 5.2 - // f1: b0: 7 8 b1: 12 8 + // 1.f, 2.5f, 5.f, 1.5f, + // 2.f, 0.f, 6.f, 5.2f, + // 3.f, 0.5f, 7.f, 12.f, + // 4.f, 0.f, 8.f, 8.f // // Input2 - // f0: b0: 0.5 5 b1: 2.5 7 - // f0: b0: 15 -2 b1: 17 6.5 - // f1: b0: 0.5 2 b1: 2.5 4 - // f1: b0: 8 -0.5 b1: 10 -2.5 + // 0.5f, 2.5f, 0.5f, 1.5f, + // 5.f, 7.f, 6.f, 4.f, + // 15.f, 17.f, 8.f, 10.f, + // -2.f, 0.f, -0.5f, -2.5f // // Output: - // f0: b0: 1.5 7 b1: 2.5 7 - // f0: b0: 18 2 b1: 17.5 6 - // f1: b0: 5.5 8 b1: 4 9.2 - // f1: b0: 15 16.5 b1: 22 16.5 - // + // 0, 1, 0, 1, + // 0, 0, 1, 0, + // 0, 0, 0, 0, + // 0, 1, 0, 0 - engine engine; - - auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + const auto& engine = get_test_engine(); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::sum)); + auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); - set_values(input, { - 1.f, 0.f, 5.f, 1.5f, - 2.f, 0.f, 6.f, 5.2f, - 3.f, 0.5f, 7.f, 12.f, - 4.f, -0.5f, 8.f, 8.f + set_values(input1, { + 1.f, 2.5f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, 0.f, 8.f, 8.f }); set_values(input2, { - 0.5f, 2.5f, 0.5f, 2.5f, - 5.f, 7.f, 2.f, 4.f, - 15.f, 17.f, 8.f, 10.f, - -2.f, 6.5f, -0.5f, -2.5f }); + 0.5f, 2.5f, 0.5f, 1.5f, + 5.f, 7.f, 6.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + -2.f, 0.f, -0.5f, -2.5f + }); + + topology topology; + topology.add(input_layout("input", input1.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::eq)); network network(engine, topology); - network.set_input_data("input", input); + network.set_input_data("input", input1); network.set_input_data("input2", input2); + auto outputs = network.execute(); EXPECT_EQ(outputs.size(), size_t(1)); EXPECT_EQ(outputs.begin()->first, "eltwise"); auto output = outputs.at("eltwise").get_memory(); + auto output_ptr = output.pointer(); - float answers[16] = { 1.5f, 2.5f, 5.5f, 4.f, - 7.f, 7.f, 8.f, 9.2f, - 18.f,17.5f, 15.f, 22.f, - 2.f, 6.f, 7.5f, 5.5f }; - - auto output_ptr = output.pointer(); + std::vector answers = { 0, 1, 0, 1, + 0, 0, 1, 0, + 0, 0, 0, 0, + 0, 1, 0, 0 }; - for (int i = 0; i < 16; i++) - { - EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + for (size_t i = 0; i < answers.size(); ++i) { + EXPECT_EQ(answers[i], output_ptr[i]); } } -TEST(eltwise_gpu_f32, max_basic_in4x4x4x4) { - // Input2 : 2x2x2 +TEST(eltwise_gpu_f32, not_equal_in2_float_out1_int) { + // Input2 : 2x2x2x2 // Input : 2x2x2x2 // Output : 2x2x2x2 // Input: - // f0: b0: 1 2 b1: 0 0 - // f0: b0: 3 4 b1: 0.5 -0.5 - // f1: b0: 5 6 b1: 1.5 5.2 - // f1: b0: 7 8 b1: 12 8 + // 1.f, 2.5f, 5.f, 1.5f, + // 2.f, 0.f, 6.f, 5.2f, + // 3.f, 0.5f, 7.f, 12.f, + // 4.f, 0.f, 8.f, 8.f // // Input2 - // f0: b0: 0.5 5 b1: 2.5 7 - // f0: b0: 15 6 b1: 17 8 - // f1: b0: 0.5 2 b1: 2.5 4 - // f1: b0: 8 -0.5 b1: 10 -2.5 + 
// 0.5f, 2.5f, 0.5f, 1.5f, + // 5.f, 7.f, 6.f, 4.f, + // 15.f, 17.f, 8.f, 10.f, + // -2.f, 0.f, -0.5f, -2.5f // // Output: - // f0: b0: 1 5 b1: 2.5 7 - // f0: b0: 15 6 b1: 17 8 - // f1: b0: 5 6 b1: 2.5 5.2 - // f1: b0: 8 8 b1: 12 8 - // - engine engine; + // 1, 0, 1, 0, + // 1, 1, 0, 1, + // 1, 1, 1, 1, + // 1, 0, 1, 1 - auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + const auto& engine = get_test_engine(); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::max)); + auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); - set_values(input, { - 1.f, 0.f, 5.f, 1.5f, - 2.f, 0.f, 6.f, 5.2f, - 3.f, 0.5f, 7.f, 12.f, - 4.f, -0.5f, 8.f, 8.f + set_values(input1, { + 1.f, 2.5f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, 0.f, 8.f, 8.f }); set_values(input2, { - 0.5f, 2.5f, 0.5f, 2.5f, - 5.f, 7.f, 2.f, 4.f, - 15.f, 17.f, 8.f, 10.f, - 6.f, 8.f, -0.5f, -2.5f }); + 0.5f, 2.5f, 0.5f, 1.5f, + 5.f, 7.f, 6.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + -2.f, 0.f, -0.5f, -2.5f + }); + + topology topology; + topology.add(input_layout("input", input1.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::ne)); network network(engine, topology); - network.set_input_data("input", input); + network.set_input_data("input", input1); network.set_input_data("input2", input2); + auto outputs = network.execute(); EXPECT_EQ(outputs.size(), size_t(1)); EXPECT_EQ(outputs.begin()->first, "eltwise"); auto output = outputs.at("eltwise").get_memory(); + auto output_ptr = output.pointer(); - float answers[16] = { - 1.f, 2.5f, 5.f, 2.5f, - 5.f, 7.f, 6.f, 5.2f, - 15.f, 17.f, 8.f, 12.f, - 6.f, 8.f, 8.f, 8.f }; - - auto output_ptr = output.pointer(); + std::vector answers = { 1, 0, 1, 0, + 1, 1, 0, 1, + 1, 1, 1, 1, + 1, 0, 1, 1 }; - for (int i = 0; i < 16; i++) - { - EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + for (size_t i = 0; i < answers.size(); ++i) { + EXPECT_EQ(answers[i], output_ptr[i]); } } -TEST(eltwise_gpu_f32, sub_basic_in4x4x4x4) { - // Input2 : 2x2x2 +TEST(eltwise_gpu_f32, less_in2_float_out1_int) { + // Input2 : 2x2x2x2 // Input : 2x2x2x2 // Output : 2x2x2x2 // Input: - // f0: b0: 1 2 b1: 0 0 - // f0: b0: 3 4 b1: 0.5 -0.5 - // f1: b0: 5 6 b1: 1.5 5.2 - // f1: b0: 7 8 b1: 12 8 + // 1.f, 2.5f, 5.f, 1.5f, + // 2.f, 0.f, 6.f, 5.2f, + // 3.f, 0.5f, 7.f, 12.f, + // 4.f, 0.f, 8.f, 8.f // // Input2 - // f0: b0: 0.5 5 b1: 2.5 7 - // f0: b0: 15 6 b1: 17 8 - // f1: b0: 0.5 2 b1: -1 2 - // f1: b0: 8 -0.5 b1: 8.5 10.5 + // 0.5f, 2.5f, 0.5f, 1.5f, + // 5.f, 7.f, 6.f, 4.f, + // 15.f, 17.f, 8.f, 10.f, + // -2.f, 0.f, -0.5f, -2.5f // // Output: - // f0: b0: 0.5 -3 b1: -2.5 -7 - // f0: b0: -12 -2 b1: -16.5 -8.5 - // f1: b0: 4.5 4 b1: 2.5 3.2 - // f1: b0: -1 8.5 b1: 3.5 -2.5 - // + // 0, 0, 0, 0, + // 1, 1, 0, 0, + // 1, 1, 1, 0, + // 0, 0, 0, 0 - engine engine; - auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + const auto& engine = get_test_engine(); - topology topology; - 
topology;
-    topology.add(input_layout("input", input.get_layout()));
-    topology.add(input_layout("input2", input2.get_layout()));
-    topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::sub));
+    auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } });
+    auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } });
 
-    set_values(input, {
-        1.f, 0.f, 5.f, 1.5f,
-        2.f, 0.f, 6.f, 5.2f,
-        3.f, 0.5f, 7.f, 12.f,
-        4.f, -0.5f, 8.f, 8.f
+    set_values(input1, {
+        1.f, 2.5f, 5.f, 1.5f,
+        2.f, 0.f, 6.f, 5.2f,
+        3.f, 0.5f, 7.f, 12.f,
+        4.f, 0.f, 8.f, 8.f
     });
 
     set_values(input2, {
-        0.5f, 2.5f, 0.5f, -1.f,
-        5.f, 7.f, 2.f, 2.f,
-        15.f, 17.f, 8.f, 8.5f,
-        6.f, 8.f, -0.5f, 10.5f });
+        0.5f, 2.5f, 0.5f, 1.5f,
+        5.f, 7.f, 6.f, 4.f,
+        15.f, 17.f, 8.f, 10.f,
+        -2.f, 0.f, -0.5f, -2.5f
+    });
+
+    topology topology;
+    topology.add(input_layout("input", input1.get_layout()));
+    topology.add(input_layout("input2", input2.get_layout()));
+    topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::lt));
 
     network network(engine, topology);
 
-    network.set_input_data("input", input);
+    network.set_input_data("input", input1);
     network.set_input_data("input2", input2);
+
     auto outputs = network.execute();
 
     EXPECT_EQ(outputs.size(), size_t(1));
     EXPECT_EQ(outputs.begin()->first, "eltwise");
 
     auto output = outputs.at("eltwise").get_memory();
+    auto output_ptr = output.pointer<int8_t>();
 
-    float answers[16] = {
-        0.5f, -2.5f, 4.5f, 2.5f,
-        -3.f, -7.f, 4.f, 3.2f,
-        -12.f, -16.5f, -1.f, 3.5f,
-        -2.f, -8.5f, 8.5f, -2.5f };
-
-    auto output_ptr = output.pointer<float>();
+    std::vector<int8_t> answers = { 0, 0, 0, 0,
+                                    1, 1, 0, 0,
+                                    1, 1, 1, 0,
+                                    0, 0, 0, 0 };
 
-    for (int i = 0; i < 16; i++)
-    {
-        EXPECT_TRUE(are_equal(answers[i], output_ptr[i]));
+    for (size_t i = 0; i < answers.size(); ++i) {
+        EXPECT_EQ(answers[i], output_ptr[i]);
     }
 }
 
-TEST(eltwise_gpu_int, basic_in4x4x4x4) {
-    // Same params as in eltwise_gpu_f32, sub_basic_in4x4x4x4 but using int types instead
+TEST(eltwise_gpu_f32, less_equal_in2_float_out1_int) {
+    // Input2 : 2x2x2x2
+    // Input : 2x2x2x2
+    // Output : 2x2x2x2
 
-    std::vector<data_types> data_types_to_test = { data_types::i8, data_types::i32, data_types::i64 };
-    std::vector<eltwise_mode> eltwise_ops_to_test = { eltwise_mode::sum, eltwise_mode::sub, eltwise_mode::div, eltwise_mode::prod };
+    // Input:
+    // 1.f, 2.5f, 5.f, 1.5f,
+    // 2.f, 0.f, 6.f, 5.2f,
+    // 3.f, 0.5f, 7.f, 12.f,
+    // 4.f, 0.f, 8.f, 8.f
+    //
+    // Input2
+    // 0.5f, 2.5f, 0.5f, 1.5f,
+    // 5.f, 7.f, 6.f, 4.f,
+    // 15.f, 17.f, 8.f, 10.f,
+    // -2.f, 0.f, -0.5f, -2.5f
+    //
+    // Output:
+    // 0, 1, 0, 1,
+    // 1, 1, 1, 0,
+    // 1, 1, 1, 0,
+    // 0, 1, 0, 0
 
-    for (auto& data_type : data_types_to_test)
-    {
-        for (auto& mode : eltwise_ops_to_test)
-        {
-            engine engine;
-            auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } });
-            auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } });
+    const auto& engine = get_test_engine();
 
-            topology topology;
-            topology.add(input_layout("input", input.get_layout()));
-            topology.add(input_layout("input2", input2.get_layout()));
-            topology.add(reorder("input_reorder", "input", { data_type, format::yxfb,{ 2, 2, 2, 2 } }));
-            topology.add(reorder("input2_reorder", "input2", { data_type, format::yxfb,{ 2, 2, 2, 2 } }));
-            topology.add(eltwise("eltwise", { "input_reorder", "input2_reorder" }, mode));
-            topology.add(reorder("eltwise_reorder", "eltwise", { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }));
+    auto input1 = memory::allocate(engine, {
data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); - std::vector input_1_vec = { - 1.f, 0.f, 5.f, 1.f, - 2.f, 0.f, 6.f, 5.f, - 3.f, 0.f, 7.f, 12.f, - 4.f, 0.f, 8.f, 8.f - }; - set_values(input, input_1_vec); + set_values(input1, { + 1.f, 2.5f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, 0.f, 8.f, 8.f + }); - std::vector input_2_vec = { - 0.f, 2.f, 0.f, -1.f, - 5.f, 7.f, 2.f, 2.f, - 15.f, 17.f, 8.f, 8.f, - 6.f, 8.f, 0.f, 10.f }; - set_values(input2, input_2_vec); + set_values(input2, { + 0.5f, 2.5f, 0.5f, 1.5f, + 5.f, 7.f, 6.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + -2.f, 0.f, -0.5f, -2.5f + }); - network network(engine, topology); - network.set_input_data("input", input); - network.set_input_data("input2", input2); - auto outputs = network.execute(); + topology topology; + topology.add(input_layout("input", input1.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::le)); - ASSERT_EQ(outputs.size(), size_t(1)); - EXPECT_EQ(outputs.begin()->first, "eltwise_reorder"); + network network(engine, topology); - auto output = outputs.at("eltwise_reorder").get_memory(); + network.set_input_data("input", input1); + network.set_input_data("input2", input2); - auto output_ptr = output.pointer(); + auto outputs = network.execute(); - for (int i = 0; i < 16; i++) - { - float expected = 0.f; - if (mode == eltwise_mode::sum) - expected = input_1_vec[i] + input_2_vec[i]; - else if (mode == eltwise_mode::sub) - expected = input_1_vec[i] - input_2_vec[i]; - else if (mode == eltwise_mode::prod) - expected = input_1_vec[i] * input_2_vec[i]; - else if (mode == eltwise_mode::div) - expected = input_1_vec[i] / input_2_vec[i]; - else if (mode == eltwise_mode::min) - expected = std::min(input_1_vec[i], input_2_vec[i]); - else if (mode == eltwise_mode::max) - expected = std::max(input_1_vec[i], input_2_vec[i]); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); - EXPECT_TRUE(are_equal(std::floor(expected), output_ptr[i])); - } - } + auto output = outputs.at("eltwise").get_memory(); + auto output_ptr = output.pointer(); + + std::vector answers = { 0, 1, 0, 1, + 1, 1, 1, 0, + 1, 1, 1, 0, + 0, 1, 0, 0 }; + + for (size_t i = 0; i < answers.size(); ++i) { + EXPECT_EQ(answers[i], output_ptr[i]); } } -TEST(eltwise_gpu_f32, prod_basic_in4x4x4x4) { - // Input2 : 2x2x2 +TEST(eltwise_gpu_f32, greater_in2_float_out1_int) { + // Input2 : 2x2x2x2 // Input : 2x2x2x2 // Output : 2x2x2x2 // Input: - // f0: b0: 1 2 b1: 0 0 - // f0: b0: 3 4 b1: 0.5 -0.5 - // f1: b0: 5 6 b1: 1 5.2 - // f1: b0: 7 8 b1: 12 7.5 + // 1.f, 2.5f, 5.f, 1.5f, + // 2.f, 0.f, 6.f, 5.2f, + // 3.f, 0.5f, 7.f, 12.f, + // 4.f, 0.f, 8.f, 8.f // // Input2 - // f0: b0: 0.5 0.5 b1: 5 2 - // f0: b0: 2.5 2.5 b1: 7 4 - // f1: b0: 15 8 b1: 6 -0.5 - // f1: b0: 17 10 b1: 8 -2.5 + // 0.5f, 2.5f, 0.5f, 1.5f, + // 5.f, 7.f, 6.f, 4.f, + // 15.f, 17.f, 8.f, 10.f, + // -2.f, 0.f, -0.5f, -2.5f // // Output: - // f0: b0: 0.5 1 b1: 0 0 - // f0: b0: 7.5 10 b1: 3.5 -2 - // f1: b0: 75 48 b1: 6 -2.6 - // f1: b0: 119 80 b1: 96 -18.75 - // + // 1, 0, 1, 0, + // 0, 0, 0, 1, + // 0, 0, 0, 1, + // 1, 0, 1, 1 + const auto& engine = get_test_engine(); - engine engine; - auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); - topology 
topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::prod)); - - set_values(input, { - 1.f, 0.f, 5.f, 1.f, - 2.f, 0.f, 6.f, 5.2f, - 3.f, 0.5f, 7.f, 12.f, - 4.f, -0.5f, 8.f, 7.5f + auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + + set_values(input1, { + 1.f, 2.5f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, 0.f, 8.f, 8.f }); set_values(input2, { - 0.5f, 5.f, 15.f, 6.f, - 0.5f, 2.f, 8.f, -0.5f, - 2.5f, 7.f, 17.f, 8.f, - 2.5f, 4.f, 10.f, -2.5f }); + 0.5f, 2.5f, 0.5f, 1.5f, + 5.f, 7.f, 6.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + -2.f, 0.f, -0.5f, -2.5f + }); + + topology topology; + topology.add(input_layout("input", input1.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::gt)); network network(engine, topology); - network.set_input_data("input", input); + network.set_input_data("input", input1); network.set_input_data("input2", input2); + auto outputs = network.execute(); EXPECT_EQ(outputs.size(), size_t(1)); EXPECT_EQ(outputs.begin()->first, "eltwise"); auto output = outputs.at("eltwise").get_memory(); + auto output_ptr = output.pointer(); - float answers[16] = { - 0.5f, 0.0f, 75.f, 6.0f, - 1.0f, 0.0f, 48.f, -2.6f, - 7.5f, 3.5f, 119.f, 96.0f, - 10.0f, -2.0f, 80.f, -18.75f }; - - auto output_ptr = output.pointer(); + std::vector answers = { 1, 0, 1, 0, + 0, 0, 0, 1, + 0, 0, 0, 1, + 1, 0, 1, 1 }; - for (int i = 0; i < 16; i++) - { - EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + for (size_t i = 0; i < answers.size(); ++i) { + EXPECT_EQ(answers[i], output_ptr[i]); } } -TEST(eltwise_gpu_f32, max_basic_in4x4x4x4_input_padding) { - // Input2 : 2x2x2 +TEST(eltwise_gpu_f32, greater_equal_in2_float_out1_int) { + // Input2 : 2x2x2x2 // Input : 2x2x2x2 // Output : 2x2x2x2 - // Input Padding: 2x1 (with reorder) // Input: - // f0: b0: 1 2 b1: 0 0 - // f0: b0: 3 4 b1: 0.5 -0.5 - // f1: b0: 5 6 b1: 1.5 5.2 - // f1: b0: 7 8 b1: 12 8 + // 1.f, 2.5f, 5.f, 1.5f, + // 2.f, 0.f, 6.f, 5.2f, + // 3.f, 0.5f, 7.f, 12.f, + // 4.f, 0.f, 8.f, 8.f // // Input2 - // f0: b0: 0.5 5 b1: 2.5 7 - // f0: b0: 15 6 b1: 17 8 - // f1: b0: 0.5 2 b1: 2.5 4 - // f1: b0: 8 -0.5 b1: 10 -2.5 + // 0.5f, 2.5f, 0.5f, 1.5f, + // 5.f, 7.f, 6.f, 4.f, + // 15.f, 17.f, 8.f, 10.f, + // -2.f, 0.f, -0.5f, -2.5f // // Output: - // f0: b0: 1 5 b1: 2.5 7 - // f0: b0: 15 6 b1: 17 8 - // f1: b0: 5 6 b1: 2.5 5.2 - // f1: b0: 8 8 b1: 12 8 - // - engine engine; + // 1, 1, 1, 1, + // 0, 0, 1, 1, + // 0, 0, 0, 1, + // 1, 1, 1, 1 - auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } }); + const auto& engine = get_test_engine(); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(reorder("reorder", "input", input.get_layout().with_padding({ { 0, 0, 2, 1 }, 0 }))); - topology.add(reorder("reorder2", "input2", input.get_layout().with_padding({ { 0, 0, 2, 1 }, 0 }))); - topology.add(eltwise("eltwise", {"reorder", "reorder2"}, eltwise_mode::max)); + auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto 
input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); - set_values(input, { - 1.f, 0.f, 5.f, 1.5f, - 2.f, 0.f, 6.f, 5.2f, - 3.f, 0.5f, 7.f, 12.f, - 4.f, -0.5f, 8.f, 8.f + set_values(input1, { + 1.f, 2.5f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, 0.f, 8.f, 8.f }); set_values(input2, { - 0.5f, 2.5f, 0.5f, 2.5f, - 5.f, 7.f, 2.f, 4.f, - 15.f, 17.f, 8.f, 10.f, - 6.f, 8.f, -0.5f, -2.5f }); + 0.5f, 2.5f, 0.5f, 1.5f, + 5.f, 7.f, 6.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + -2.f, 0.f, -0.5f, -2.5f + }); + + topology topology; + topology.add(input_layout("input", input1.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::ge)); network network(engine, topology); - network.set_input_data("input", input); + network.set_input_data("input", input1); network.set_input_data("input2", input2); + auto outputs = network.execute(); EXPECT_EQ(outputs.size(), size_t(1)); EXPECT_EQ(outputs.begin()->first, "eltwise"); auto output = outputs.at("eltwise").get_memory(); + auto output_ptr = output.pointer(); - float answers[16] = { - 1.f, 2.5f, 5.f, 2.5f, - 5.f, 7.f, 6.f, 5.2f, - 15.f, 17.f, 8.f, 12.f, - 6.f, 8.f, 8.f, 8.f }; - - auto output_ptr = output.pointer(); + std::vector answers = { 1, 1, 1, 1, + 0, 0, 1, 1, + 0, 0, 0, 1, + 1, 1, 1, 1 }; - for (int i = 0; i < 16; i++) - { - EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + for (size_t i = 0; i < answers.size(); ++i) { + EXPECT_EQ(answers[i], output_ptr[i]); } } -TEST(eltwise_gpu_f32, add_basic_in4x4x2x2_with_coefficients) { - // Input2 : 2x2x2 +TEST(eltwise_gpu_f32, logicalAND_in2_float_out1_int) { + // Input2 : 2x2x2x2 // Input : 2x2x2x2 // Output : 2x2x2x2 // Input: - // f0: b0: 1 2 b1: 0 0 - // f0: b0: 3 4 b1: 0.5 -0.5 - // f1: b0: 5 6 b1: 1.5 5.2 - // f1: b0: 7 8 b1: 12 8 + // 1.f, 2.5f, 5.f, 1.5f, + // 2.f, 0.f, 6.f, 5.2f, + // 3.f, 0.5f, 7.f, 12.f, + // 4.f, 0.f, 8.f, 8.f // // Input2 - // f0: b0: 0.5 5 b1: 2.5 7 - // f0: b0: 15 -2 b1: 17 6.5 - // f1: b0: 0.5 2 b1: 2.5 4 - // f1: b0: 8 -0.5 b1: 10 -2.5 + // 0.5f, 2.5f, 0.5f, 1.5f, + // 5.f, 7.f, 6.f, 4.f, + // 15.f, 17.f, 8.f, 10.f, + // -2.f, 0.f, -0.5f, -2.5f // // Output: - // f0: b0: 0.75 3.5 b1: 1.25 3.5 - // f0: b0: 9 1 b1: 8.75 3 - // f1: b0: 2.75 4 b1: 2 4.6 - // f1: b0: 7.5 8.25 b1: 11 8.25 - // + // 1, 1, 1, 1, + // 1, 0, 1, 1, + // 1, 1, 1, 1, + // 1, 0, 1, 1 - engine engine; - - auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + const auto& engine = get_test_engine(); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::sum, {0.5f, 0.5f})); + auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); - set_values(input, { - 1.f, 0.f, 5.f, 1.5f, + set_values(input1, { + 1.f, 2.5f, 5.f, 1.5f, 2.f, 0.f, 6.f, 5.2f, 3.f, 0.5f, 7.f, 12.f, - 4.f, -0.5f, 8.f, 8.f + 4.f, 0.f, 8.f, 8.f }); set_values(input2, { - 0.5f, 2.5f, 0.5f, 2.5f, - 5.f, 7.f, 2.f, 4.f, + 0.5f, 2.5f, 0.5f, 1.5f, + 5.f, 7.f, 6.f, 4.f, 15.f, 17.f, 8.f, 10.f, - -2.f, 6.5f, -0.5f, -2.5f }); + -2.f, 0.f, -0.5f, -2.5f + }); + + topology topology; + topology.add(input_layout("input", 
input1.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::logic_and)); network network(engine, topology); - network.set_input_data("input", input); + network.set_input_data("input", input1); network.set_input_data("input2", input2); + auto outputs = network.execute(); EXPECT_EQ(outputs.size(), size_t(1)); EXPECT_EQ(outputs.begin()->first, "eltwise"); auto output = outputs.at("eltwise").get_memory(); + auto output_ptr = output.pointer(); - float answers[16] = { 0.75f, 1.25f, 2.75f, 2.f, - 3.5f, 3.5f, 4.f, 4.6f, - 9.f, 8.75f, 7.5f, 11.f, - 1.f, 3.f, 3.75f, 2.75f }; + std::vector answers = { 1, 1, 1, 1, + 1, 0, 1, 1, + 1, 1, 1, 1, + 1, 0, 1, 1 }; - auto output_ptr = output.pointer(); - - for (int i = 0; i < 16; i++) - { - EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + for (size_t i = 0; i < answers.size(); ++i) { + EXPECT_EQ(answers[i], output_ptr[i]); } } -TEST(eltwise_gpu_f32, coefficients_count_check) { - engine engine; +TEST(eltwise_gpu_f32, logicalAND_in3_float_out1_int) { + // Input2 : 2x2x2x2 + // Input3 : 2x2x2x2 + // Input : 2x2x2x2 + // Output : 2x2x2x2 - auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); - auto input3 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + // Input: + // 1.f, 2.5f, 5.f, 1.5f, + // 2.f, 0.f, 6.f, 5.2f, + // 3.f, 0.5f, 7.f, 12.f, + // 4.f, 0.f, 8.f, 8.f + // + // Input2 + // 0.5f, 2.5f, 0.5f, 1.5f, + // 5.f, 7.f, 6.f, 4.f, + // 15.f, 17.f, 8.f, 10.f, + // -2.f, 0.f, -0.5f, -2.5f + // + // Input3 + // 0.f, 0.f, 0.f, 0.f, + // 0.f, 0.f, 0.f, 0.f, + // 1.f, 1.f, 1.f, 1.f, + // 1.f, 1.f, 1.f, 1.f + // + // Output: + // 0, 0, 0, 0, + // 0, 0, 0, 0, + // 1, 1, 1, 1, + // 1, 0, 1, 1 + + const auto& engine = get_test_engine(); + + auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto input3 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + + set_values(input1, { + 1.f, 2.5f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, 0.f, 8.f, 8.f + }); + + set_values(input2, { + 0.5f, 2.5f, 0.5f, 1.5f, + 5.f, 7.f, 6.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + -2.f, 0.f, -0.5f, -2.5f + }); + + set_values(input3, { + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, + 1.f, 1.f, 1.f, 1.f, + 1.f, 1.f, 1.f, 1.f + }); topology topology; - topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input", input1.get_layout())); topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("input3", input3.get_layout())); + topology.add(input_layout("input3", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2", "input3"}, eltwise_mode::logic_and)); - std::vector coeffs0 = {}; - std::vector coeffs1 = {0.5f}; - std::vector coeffs2 = {0.5f, 0.5f}; - std::vector coeffs3 = {0.5f, 0.5f, 0.5f}; + network network(engine, topology); - EXPECT_THROW(topology.add(eltwise("eltwise1", {"input", "input2"}, eltwise_mode::sum, coeffs1)), std::invalid_argument); - EXPECT_THROW(topology.add(eltwise("eltwise2", {"input", "input2"}, eltwise_mode::sum, coeffs3)), std::invalid_argument); + network.set_input_data("input", input1); + network.set_input_data("input2", input2); + network.set_input_data("input3", 
input3); - EXPECT_THROW(topology.add(eltwise("eltwise3", {"input", "input2", "input3"}, eltwise_mode::sum, coeffs1)), std::invalid_argument); - EXPECT_THROW(topology.add(eltwise("eltwise4", {"input", "input2", "input3"}, eltwise_mode::sum, coeffs2)), std::invalid_argument); + auto outputs = network.execute(); - EXPECT_NO_THROW(topology.add(eltwise("eltwise5", {"input", "input2"}, eltwise_mode::sum, coeffs0))); - EXPECT_NO_THROW(topology.add(eltwise("eltwise6", {"input", "input2"}, eltwise_mode::sum, coeffs2))); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); - EXPECT_NO_THROW(topology.add(eltwise("eltwise7", {"input", "input2", "input3"}, eltwise_mode::sum, coeffs0))); - EXPECT_NO_THROW(topology.add(eltwise("eltwise8", {"input", "input2", "input3"}, eltwise_mode::sum, coeffs3))); + auto output = outputs.at("eltwise").get_memory(); + auto output_ptr = output.pointer(); + + std::vector answers = { 0, 0, 0, 0, + 0, 0, 0, 0, + 1, 1, 1, 1, + 1, 0, 1, 1 }; + + for (size_t i = 0; i < answers.size(); ++i) { + EXPECT_EQ(answers[i], output_ptr[i]); + } } -TEST(eltwise_gpu_f32, add_basic_in4x4x2x2_with_coefficients_3inputs) { - // Input3 : 2x2x2 - // Input2 : 2x2x2 +TEST(eltwise_gpu_f32, logicalOR_in2_float_out1_int) { + // Input2 : 2x2x2x2 // Input : 2x2x2x2 // Output : 2x2x2x2 // Input: - // f0: b0: 1 2 b1: 0 0 - // f0: b0: 3 4 b1: 0.5 -0.5 - // f1: b0: 5 6 b1: 1.5 5.2 - // f1: b0: 7 8 b1: 12 8 + // 1.f, 2.5f, 5.f, 1.5f, + // 2.f, 0.f, 6.f, 5.2f, + // 3.f, 0.5f, 7.f, 12.f, + // 4.f, 0.f, 8.f, 8.f // // Input2 - // f0: b0: 0.5 5 b1: 2.5 7 - // f0: b0: 15 -2 b1: 17 6.5 - // f1: b0: 0.5 2 b1: 2.5 4 - // f1: b0: 8 -0.5 b1: 10 -2.5 - // - // Input3 - // f0: b0: 8 7 b1: 0 1 - // f0: b0: 6 5 b1: 0 1 - // f1: b0: 4 3 b1: 0 1 - // f1: b0: 2 1 b1: 0 1 + // 0.5f, 2.5f, 0.5f, 1.5f, + // 5.f, 7.f, 6.f, 4.f, + // 15.f, 17.f, 8.f, 10.f, + // -2.f, 0.f, -0.5f, -2.5f // // Output: - // f0: b0: 4.75 7 b1: 1.25 4 - // f0: b0: 12 3.5 b1: 8.75 3.5 - // f1: b0: 4.75 5.5 b1: 2 5.1 - // f1: b0: 8.5 8.75 b1: 11 8.75 - // - - engine engine; + // 1, 1, 1, 1, + // 1, 1, 1, 1, + // 1, 1, 1, 1, + // 1, 0, 1, 1 - auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); - auto input3 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + const auto& engine = get_test_engine(); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("input3", input3.get_layout())); - topology.add(eltwise("eltwise", {"input", "input2", "input3"}, eltwise_mode::sum, {0.5f, 0.5f, 0.5f})); + auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); - set_values(input, { - 1.f, 0.f, 5.f, 1.5f, + set_values(input1, { + 1.f, 2.5f, 5.f, 1.5f, 2.f, 0.f, 6.f, 5.2f, 3.f, 0.5f, 7.f, 12.f, - 4.f, -0.5f, 8.f, 8.f + 4.f, 0.f, 8.f, 8.f }); set_values(input2, { - 0.5f, 2.5f, 0.5f, 2.5f, - 5.f, 7.f, 2.f, 4.f, + 0.5f, 2.5f, 0.5f, 1.5f, + 5.f, 7.f, 6.f, 4.f, 15.f, 17.f, 8.f, 10.f, - -2.f, 6.5f, -0.5f, -2.5f }); + -2.f, 0.f, -0.5f, -2.5f + }); - set_values(input3, { - 8.f, 0.f, 4.f, 0.f, - 7.f, 1.f, 3.f, 1.f, - 6.f, 0.f, 2.f, 0.f, - 5.f, 1.f, 1.f, 1.f }); + topology topology; + topology.add(input_layout("input", input1.get_layout())); + 
topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::logic_or)); network network(engine, topology); - network.set_input_data("input", input); + network.set_input_data("input", input1); network.set_input_data("input2", input2); - network.set_input_data("input3", input3); + auto outputs = network.execute(); EXPECT_EQ(outputs.size(), size_t(1)); EXPECT_EQ(outputs.begin()->first, "eltwise"); auto output = outputs.at("eltwise").get_memory(); + auto output_ptr = output.pointer(); - float answers[16] = { 4.75f, 1.25f, 4.75f, 2.f, - 7.0f, 4.0f, 5.5f, 5.1f, - 12.f, 8.75f, 8.5f, 11.f, - 3.5f, 3.5f, 4.25f, 3.25f }; - - auto output_ptr = output.pointer(); + std::vector answers = { 1, 1, 1, 1, + 1, 1, 1, 1, + 1, 1, 1, 1, + 1, 0, 1, 1 }; - for (int i = 0; i < 16; i++) - { - EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + for (size_t i = 0; i < answers.size(); ++i) { + EXPECT_EQ(answers[i], output_ptr[i]); } } -TEST(eltwise_gpu_f32, max_3inputs_in4x4x4x4_input_padding) { - // Input : 2x2x2x2 - // Input2 : 2x2x2x2 +TEST(eltwise_gpu_f32, logicalOR_in3_float_out1_int) { // Input3 : 2x2x2x2 + // Input2 : 2x2x2x2 + // Input : 2x2x2x2 // Output : 2x2x2x2 - // Input Padding: 2x1 (with reorder) // Input: - // f0: b0: 1 2 b1: 0 0 - // f0: b0: 3 4 b1: 0.5 -0.5 - // f1: b0: 5 6 b1: 1.5 5.2 - // f1: b0: 7 8 b1: 12 8 + // 1.f, 2.5f, 5.f, 1.5f, + // 2.f, 0.f, 6.f, 5.2f, + // 3.f, 0.5f, 7.f, 12.f, + // 4.f, 0.f, 8.f, 8.f // // Input2 - // f0: b0: 0.5 5 b1: 2.5 7 - // f0: b0: 15 6 b1: 17 8 - // f1: b0: 0.5 2 b1: 2.5 4 - // f1: b0: 8 -0.5 b1: 10 -2.5 + // 0.5f, 2.5f, 0.5f, 1.5f, + // 5.f, 7.f, 6.f, 4.f, + // 15.f, 17.f, 8.f, 10.f, + // -2.f, 0.f, -0.5f, -2.5f // // Input3 - // f0: b0: 1.1 1 b1: 4 0 - // f0: b0: 15 -1 b1: 3 6 - // f1: b0: 1.5 2 b1: 2 7 - // f1: b0: 9 0.5 b1: 1 8 + // 0.f, 1.f, 1.f, 1.f, + // 0.f, 1.f, 1.f, 0.f, + // 1.f, 1.f, 1.f, 1.f, + // 1.f, 1.f, 1.f, 1.f // // Output: - // f0: b0: 1.1 5 b1: 4 7 - // f0: b0: 15 6 b1: 17 8 - // f1: b0: 5 6 b1: 2.5 7 - // f1: b0: 9 8 b1: 12 8 - // - engine engine; + // 1, 1, 1, 1, + // 1, 1, 1, 1, + // 1, 1, 1, 1, + // 1, 1, 1, 1 - auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } }); - auto input3 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } }); + const auto& engine = get_test_engine(); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("input3", input3.get_layout())); - topology.add(reorder("reorder", "input", input.get_layout().with_padding({ { 0, 0, 2, 1 }, 0 }))); - topology.add(reorder("reorder2", "input2", input.get_layout().with_padding({ { 0, 0, 2, 1 }, 0 }))); - topology.add(reorder("reorder3", "input3", input.get_layout().with_padding({ { 0, 0, 2, 1 }, 0 }))); - topology.add(eltwise("eltwise", {"reorder", "reorder2", "reorder3"}, eltwise_mode::max)); + auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto input3 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); - set_values(input, { - 1.f, 0.f, 5.f, 1.5f, - 2.f, 0.f, 6.f, 5.2f, - 3.f, 0.5f, 7.f, 12.f, - 4.f, -0.5f, 8.f, 8.f + set_values(input1, { + 1.f, 2.5f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 
0.5f, 7.f, 12.f, + 4.f, 0.f, 8.f, 8.f }); set_values(input2, { - 0.5f, 2.5f, 0.5f, 2.5f, - 5.f, 7.f, 2.f, 4.f, - 15.f, 17.f, 8.f, 10.f, - 6.f, 8.f, -0.5f, -2.5f }); + 0.5f, 2.5f, 0.5f, 1.5f, + 5.f, 7.f, 6.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + -2.f, 0.f, -0.5f, -2.5f + }); set_values(input3, { - 1.1f, 4.f, 1.5f, 2.f, - 1.f, 0.f, 2.f, 7.f, - 15.f, 3.f, 9.f, 1.f, - -1.f, 6.f, 0.5f, 8.f }); + 0.f, 1.f, 1.f, 1.f, + 0.f, 1.f, 1.f, 0.f, + 1.f, 1.f, 1.f, 1.f, + 1.f, 1.f, 1.f, 1.f + }); + + topology topology; + topology.add(input_layout("input", input1.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("input3", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2", "input3"}, eltwise_mode::logic_or)); network network(engine, topology); - network.set_input_data("input", input); + network.set_input_data("input", input1); network.set_input_data("input2", input2); network.set_input_data("input3", input3); + auto outputs = network.execute(); EXPECT_EQ(outputs.size(), size_t(1)); EXPECT_EQ(outputs.begin()->first, "eltwise"); auto output = outputs.at("eltwise").get_memory(); + auto output_ptr = output.pointer(); - float answers[16] = { - 1.1f, 4.f, 5.f, 2.5f, - 5.f, 7.f, 6.f, 7.f, - 15.f, 17.f, 9.f, 12.f, - 6.f, 8.f, 8.f, 8.f }; - - auto output_ptr = output.pointer(); + std::vector answers = { 1, 1, 1, 1, + 1, 1, 1, 1, + 1, 1, 1, 1, + 1, 1, 1, 1 }; + + for (size_t i = 0; i < answers.size(); ++i) { + EXPECT_EQ(answers[i], output_ptr[i]); + } +} + +TEST(eltwise_gpu_f32, logicalXOR_in2_float_out1_int) { + // Input2 : 2x2x2x2 + // Input : 2x2x2x2 + // Output : 2x2x2x2 + + // Input: + // 1.f, 2.5f, 5.f, 1.5f, + // 2.f, 0.f, 6.f, 5.2f, + // 3.f, 0.5f, 7.f, 12.f, + // 4.f, 0.f, 8.f, 8.f + // + // Input2 + // 0.5f, 2.5f, 0.5f, 1.5f, + // 5.f, 7.f, 6.f, 4.f, + // 15.f, 17.f, 8.f, 10.f, + // -2.f, 0.f, -0.5f, -2.5f + // + // Output: + // 0, 0, 0, 0, + // 0, 1, 0, 0, + // 0, 0, 0, 0, + // 0, 0, 0, 0 + + const auto& engine = get_test_engine(); + + auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + + set_values(input1, { + 1.f, 2.5f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, 0.f, 8.f, 8.f + }); + + set_values(input2, { + 0.5f, 2.5f, 0.5f, 1.5f, + 5.f, 7.f, 6.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + -2.f, 0.f, -0.5f, -2.5f + }); + + topology topology; + topology.add(input_layout("input", input1.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::logic_xor)); + + network network(engine, topology); + + network.set_input_data("input", input1); + network.set_input_data("input2", input2); + + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); + + auto output = outputs.at("eltwise").get_memory(); + auto output_ptr = output.pointer(); + + std::vector answers = { 0, 0, 0, 0, + 0, 1, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 }; + + for (size_t i = 0; i < answers.size(); ++i) { + EXPECT_EQ(answers[i], output_ptr[i]); + } +} + +TEST(eltwise_gpu_f32, add_basic_in4x4x2x2) { + // Input2 : 2x2x2 + // Input : 2x2x2x2 + // Output : 2x2x2x2 + + // Input: + // f0: b0: 1 2 b1: 0 0 + // f0: b0: 3 4 b1: 0.5 -0.5 + // f1: b0: 5 6 b1: 1.5 5.2 + // f1: b0: 7 8 b1: 12 8 + // + // Input2 + // f0: b0: 0.5 5 b1: 2.5 7 + // f0: b0: 15 -2 b1: 17 6.5 + // f1: b0: 0.5 2 b1: 
2.5 4 + // f1: b0: 8 -0.5 b1: 10 -2.5 + // + // Output: + // f0: b0: 1.5 7 b1: 2.5 7 + // f0: b0: 18 2 b1: 17.5 6 + // f1: b0: 5.5 8 b1: 4 9.2 + // f1: b0: 15 16.5 b1: 22 16.5 + // + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::sum)); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 8.f + }); + + set_values(input2, { + 0.5f, 2.5f, 0.5f, 2.5f, + 5.f, 7.f, 2.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + -2.f, 6.5f, -0.5f, -2.5f }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); + + auto output = outputs.at("eltwise").get_memory(); + + float answers[16] = { 1.5f, 2.5f, 5.5f, 4.f, + 7.f, 7.f, 8.f, 9.2f, + 18.f,17.5f, 15.f, 22.f, + 2.f, 6.f, 7.5f, 5.5f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(eltwise_gpu_f32, add_in2x2x2x2_broadcast_channel) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::sum)); + + set_values(input, { + 1.f, 0.f, + 2.f, 0.f, + + 3.f, 0.5f, + 4.f, -0.5f, + }); + + set_values(input2, { + 0.5f, 2.5f, + 0.5f, 2.5f, + + 5.f, 7.f, + 2.f, 4.f, + + 15.f, 17.f, + 8.f, 10.f, + + -2.f, 6.5f, + -0.5f, -2.5f }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); + + auto output = outputs.at("eltwise").get_memory(); + + float answers[16] = { 1.5f, 2.5f, + 2.5f, 2.5f, + + 6.f, 7.f, + 4.f, 4.f, + + 18.f, 17.5f, + 12.f, 9.5f, + + 1.f, 7.f, + 3.5f, -3.f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(eltwise_gpu_f32, add_in2x2x2x2_broadcast_x) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 1, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::sum)); + + set_values(input, { + 0.5f, 2.5f, + 0.5f, 2.5f, + + 5.f, 7.f, + 2.f, 4.f, + + 15.f, 17.f, + 8.f, 10.f, + + -2.f, 6.5f, + -0.5f, -2.5f }); + + + set_values(input2, { + 1.f, + 0.f, + + 2.f, + 0.f, + + 3.f, + 0.5f, + + 4.f, + -0.5f, + }); + + network network(engine, topology); + + network.set_input_data("input", input); + 
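+    // [Editorial note, not part of the original patch: input2 has x-size 1,
+    //  so eltwise broadcasts it along x. Assuming plain bfyx indexing, the
+    //  expectation is out[b][f][y][x] = in1[b][f][y][x] + in2[b][f][y][0];
+    //  e.g. the first output row below is 0.5f + 1.f = 1.5f, 2.5f + 1.f = 3.5f.]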
network.set_input_data("input2", input2); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); + + auto output = outputs.at("eltwise").get_memory(); + + float answers[16] = { 1.5f, 3.5f, + 0.5f, 2.5f, + + 7.f, 9.f, + 2.f, 4.f, + + 18.f, 20.f, + 8.5f, 10.5f, + + 2.f, 10.5f, + -1.f, -3.f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(eltwise_gpu_f32, add_in2x2x2x2_broadcast_y) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 2, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::sum)); + + set_values(input, { + 0.5f, 2.5f, + 0.5f, 2.5f, + + 5.f, 7.f, + 2.f, 4.f, + + 15.f, 17.f, + 8.f, 10.f, + + -2.f, 6.5f, + -0.5f, -2.5f }); + + + set_values(input2, { + 1.f, 0.f, + 2.f, 0.f, + + 3.f, 0.5f, + 4.f, -0.5f, + }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); + + auto output = outputs.at("eltwise").get_memory(); + + float answers[16] = { 1.5f, 2.5f, + 2.5f, 2.5f, + + 8.f, 7.5f, + 6.f, 3.5f, + + 16.f, 17.f, + 10.f, 10.f, + + 1.f, 7.f, + 3.5f, -3.f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(eltwise_gpu_f32, add_in2x2x2x2_broadcast_batch) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 1 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::sum)); + + set_values(input, { + 0.5f, 2.5f, + 0.5f, 2.5f, + + 5.f, 7.f, + 2.f, 4.f, + + 15.f, 17.f, + 8.f, 10.f, + + -2.f, 6.5f, + -0.5f, -2.5f }); + + + set_values(input2, { + 1.f, 0.f, + + 2.f, 0.f, + + 3.f, 0.5f, + + 4.f, -0.5f, + }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); + + auto output = outputs.at("eltwise").get_memory(); + + float answers[16] = { 1.5f, 2.5f, + 1.5f, 2.5f, + + 7.f, 7.f, + 4.f, 4.f, + + 18.f, 17.5f, + 11.f, 10.5f, + + 2.f, 6.f, + 3.5f, -3.f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(eltwise_gpu_f32, add_in2x2x2x2_broadcast_multiple_dims) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 1, 1 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2"}, 
eltwise_mode::sum)); + + set_values(input, { + 0.5f, 2.5f, + 0.5f, 2.5f, + + 5.f, 7.f, + 2.f, 4.f, + + 15.f, 17.f, + 8.f, 10.f, + + -2.f, 6.5f, + -0.5f, -2.5f }); + + set_values(input2, { + 1.f, + 2.f }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); + + auto output = outputs.at("eltwise").get_memory(); + + float answers[16] = { 1.5f, 3.5f, + 1.5f, 3.5f, + + 7.f, 9.f, + 4.f, 6.f, + + 16.f, 18.f, + 9.f, 11.f, + + 0.f, 8.5f, + 1.5f, -0.5f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(eltwise_gpu_f32, pow_in2x2x2x2_broadcast_all) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::pow)); + + set_values(input, { + 1.f, 2.f, + 3.f, 4.f, + + 5.f, 6.f, + 7.f, 8.f, + + 9.f, 10.f, + 11.f, 12.f, + + 13.f, 14.f, + 15.f, 16.f }); + + + set_values(input2, { 2.0f }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); + + auto output = outputs.at("eltwise").get_memory(); + + float answers[16] = { 1.f, 4.f, + 9.f, 16.f, + + 25.f, 36.f, + 49.f, 64.f, + + 81.f, 100.f, + 121.f, 144.f, + + 169.f, 196.f, + 225.f, 256.f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(eltwise_gpu_f32, add_basic_in2x2x2x2_broadcast_2_inputs_same_dim) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 1 } }); + auto input3 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 1 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("input3", input3.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2", "input3"}, eltwise_mode::sum)); + + set_values(input, { + 0.5f, 2.5f, + 0.5f, 2.5f, + + 5.f, 7.f, + 2.f, 4.f, + + 15.f, 17.f, + 8.f, 10.f, + + -2.f, 6.5f, + -0.5f, -2.5f }); + + + set_values(input2, { + 1.f, 0.f, + + 2.f, 0.f, + + 3.f, 0.5f, + + 4.f, -0.5f, + }); + + set_values(input3, { + 3.f, 2.f, + + 1.f, 2.f, + + -2.f, 1.5f, + + -4.f, 0.5f, + }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("input3", input3); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); + + auto output = outputs.at("eltwise").get_memory(); + + float answers[16] = { 4.5f, 4.5f, + 4.5f, 4.5f, + + 8.f, 9.f, + 5.f, 6.f, + + 16.f, 19.f, + 9.f, 12.f, + + -2.f, 6.5f, + -0.5f, -2.5f }; + + auto output_ptr = output.pointer(); + + for 
(int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(eltwise_gpu_f32, add_basic_in2x2x2x2_broadcast_2_inputs_diff_dim) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 1 } }); + auto input3 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 2, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("input3", input3.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2", "input3"}, eltwise_mode::sum)); + + set_values(input, { + 0.5f, 2.5f, + 0.5f, 2.5f, + + 5.f, 7.f, + 2.f, 4.f, + + 15.f, 17.f, + 8.f, 10.f, + + -2.f, 6.5f, + -0.5f, -2.5f }); + + + set_values(input2, { + 1.f, 0.f, + + 2.f, 0.f, + + 3.f, 0.5f, + + 4.f, -0.5f, + }); + + set_values(input3, { + 3.f, 2.f, + 1.f, 2.f, + + -2.f, 1.5f, + -4.f, 0.5f, + }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("input3", input3); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); + + auto output = outputs.at("eltwise").get_memory(); + + float answers[16] = { 4.5f, 4.5f, + 2.5f, 4.5f, + + 10.f, 9.f, + 5.f, 6.f, + + 16.f, 19.f, + 7.f, 11.f, + + 0.f, 7.5f, + -0.5f, -2.5f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(eltwise_gpu_f32, max_basic_in4x4x4x4) { + // Input2 : 2x2x2 + // Input : 2x2x2x2 + // Output : 2x2x2x2 + + // Input: + // f0: b0: 1 2 b1: 0 0 + // f0: b0: 3 4 b1: 0.5 -0.5 + // f1: b0: 5 6 b1: 1.5 5.2 + // f1: b0: 7 8 b1: 12 8 + // + // Input2 + // f0: b0: 0.5 5 b1: 2.5 7 + // f0: b0: 15 6 b1: 17 8 + // f1: b0: 0.5 2 b1: 2.5 4 + // f1: b0: 8 -0.5 b1: 10 -2.5 + // + // Output: + // f0: b0: 1 5 b1: 2.5 7 + // f0: b0: 15 6 b1: 17 8 + // f1: b0: 5 6 b1: 2.5 5.2 + // f1: b0: 8 8 b1: 12 8 + // + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::max)); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 8.f + }); + + set_values(input2, { + 0.5f, 2.5f, 0.5f, 2.5f, + 5.f, 7.f, 2.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + 6.f, 8.f, -0.5f, -2.5f }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); + + auto output = outputs.at("eltwise").get_memory(); + + float answers[16] = { + 1.f, 2.5f, 5.f, 2.5f, + 5.f, 7.f, 6.f, 5.2f, + 15.f, 17.f, 8.f, 12.f, + 6.f, 8.f, 8.f, 8.f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(eltwise_gpu_f32, sub_basic_in4x4x4x4) { + // Input2 : 2x2x2 + // Input : 2x2x2x2 + 
// Output : 2x2x2x2 + + // Input: + // f0: b0: 1 2 b1: 0 0 + // f0: b0: 3 4 b1: 0.5 -0.5 + // f1: b0: 5 6 b1: 1.5 5.2 + // f1: b0: 7 8 b1: 12 8 + // + // Input2 + // f0: b0: 0.5 5 b1: 2.5 7 + // f0: b0: 15 6 b1: 17 8 + // f1: b0: 0.5 2 b1: -1 2 + // f1: b0: 8 -0.5 b1: 8.5 10.5 + // + // Output: + // f0: b0: 0.5 -3 b1: -2.5 -7 + // f0: b0: -12 -2 b1: -16.5 -8.5 + // f1: b0: 4.5 4 b1: 2.5 3.2 + // f1: b0: -1 8.5 b1: 3.5 -2.5 + // + + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::sub)); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 8.f + }); + + set_values(input2, { + 0.5f, 2.5f, 0.5f, -1.f, + 5.f, 7.f, 2.f, 2.f, + 15.f, 17.f, 8.f, 8.5f, + 6.f, 8.f, -0.5f, 10.5f }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); + + auto output = outputs.at("eltwise").get_memory(); + + float answers[16] = { + 0.5f, -2.5f, 4.5f, 2.5f, + -3.f, -7.f, 4.f, 3.2f, + -12.f, -16.5f, -1.f, 3.5f, + -2.f, -8.5f, 8.5f, -2.5f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(eltwise_gpu_int, basic_in4x4x4x4) { + // Same params as in eltwise_gpu_f32, sub_basic_in4x4x4x4 but using int types instead + + std::vector data_types_to_test = { data_types::i8, data_types::i32, data_types::i64 }; + std::vector eltwise_ops_to_test = { eltwise_mode::sum, eltwise_mode::sub, eltwise_mode::div, eltwise_mode::prod, eltwise_mode::min, eltwise_mode::max, eltwise_mode::mod }; + + for (auto& data_type : data_types_to_test) + { + for (auto& mode : eltwise_ops_to_test) + { + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(reorder("input_reorder", "input", { data_type, format::yxfb,{ 2, 2, 2, 2 } })); + topology.add(reorder("input2_reorder", "input2", { data_type, format::yxfb,{ 2, 2, 2, 2 } })); + topology.add(eltwise("eltwise", { "input_reorder", "input2_reorder" }, mode)); + topology.add(reorder("eltwise_reorder", "eltwise", { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } })); + + std::vector input_1_vec = { + 1.f, 0.f, 5.f, 1.f, + 2.f, 0.f, 6.f, 5.f, + 3.f, 0.f, 7.f, 12.f, + 4.f, 0.f, 8.f, 8.f + }; + set_values(input, input_1_vec); + + std::vector input_2_vec = { + 0.f, 2.f, 0.f, -1.f, + 5.f, 7.f, 2.f, 2.f, + 15.f, 17.f, 8.f, 8.f, + 6.f, 8.f, 0.f, 10.f }; + set_values(input2, input_2_vec); + + network network(engine, topology); + network.set_input_data("input", input); + network.set_input_data("input2", input2); + auto outputs = network.execute(); + + ASSERT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise_reorder"); + + auto output = 
outputs.at("eltwise_reorder").get_memory(); + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + float expected = 0.f; + if (mode == eltwise_mode::sum) + expected = input_1_vec[i] + input_2_vec[i]; + else if (mode == eltwise_mode::sub) + expected = input_1_vec[i] - input_2_vec[i]; + else if (mode == eltwise_mode::prod) + expected = input_1_vec[i] * input_2_vec[i]; + else if (mode == eltwise_mode::div) + expected = input_1_vec[i] / input_2_vec[i]; + else if (mode == eltwise_mode::min) + expected = std::min(input_1_vec[i], input_2_vec[i]); + else if (mode == eltwise_mode::max) + expected = std::max(input_1_vec[i], input_2_vec[i]); + else if (mode == eltwise_mode::mod) { + expected = std::fmod(input_1_vec[i], input_2_vec[i]); + } + + + EXPECT_TRUE(are_equal(std::floor(expected), output_ptr[i])); + } + } + } +} + +TEST(eltwise_gpu_f32_int, basic_in4x4x4x4) { + // Same params as in eltwise_gpu_f32, sub_basic_in4x4x4x4 but using int types for first input. + // + // Eltwise supports mixed inputs, but only first input can be set as intX. + + std::vector data_types_to_test = { data_types::i8, data_types::i32, data_types::i64 }; + std::vector eltwise_ops_to_test = { eltwise_mode::sum, eltwise_mode::sub, eltwise_mode::div, eltwise_mode::prod, eltwise_mode::min, eltwise_mode::max, eltwise_mode::mod }; + + for (auto& data_type : data_types_to_test) + { + for (auto& mode : eltwise_ops_to_test) + { + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(reorder("input_reorder", "input", { data_type, format::yxfb,{ 2, 2, 2, 2 } })); + topology.add(eltwise("eltwise", { "input_reorder", "input2" }, mode)); + topology.add(reorder("eltwise_reorder", "eltwise", { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } })); + + std::vector input_1_vec = { + 1.f, 0.f, 5.f, 1.f, + 2.f, 0.f, 6.f, 5.f, + 3.f, 0.f, 7.f, 12.f, + 4.f, 0.f, 8.f, 8.f + }; + set_values(input, input_1_vec); + + std::vector input_2_vec = { + 0.f, 2.f, 0.f, -1.f, + 5.f, 7.f, 2.f, 2.f, + 15.f, 17.f, 8.f, 8.f, + 6.f, 8.f, 0.f, 10.f }; + set_values(input2, input_2_vec); + + network network(engine, topology); + network.set_input_data("input", input); + network.set_input_data("input2", input2); + auto outputs = network.execute(); + + ASSERT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise_reorder"); + + auto output = outputs.at("eltwise_reorder").get_memory(); + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + float expected = 0.f; + if (mode == eltwise_mode::sum) + expected = input_1_vec[i] + input_2_vec[i]; + else if (mode == eltwise_mode::sub) + expected = input_1_vec[i] - input_2_vec[i]; + else if (mode == eltwise_mode::prod) + expected = input_1_vec[i] * input_2_vec[i]; + else if (mode == eltwise_mode::div) + expected = input_1_vec[i] / input_2_vec[i]; + else if (mode == eltwise_mode::min) + expected = std::min(input_1_vec[i], input_2_vec[i]); + else if (mode == eltwise_mode::max) + expected = std::max(input_1_vec[i], input_2_vec[i]); + else if (mode == eltwise_mode::mod) + expected = std::fmod(input_1_vec[i], input_2_vec[i]); + + EXPECT_TRUE(are_equal(std::floor(expected), output_ptr[i])); + } + } + } +} + +TEST(eltwise_gpu_f32, prod_basic_in4x4x4x4) 
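+// [Editorial note, not part of the original patch: prod is a plain
+//  elementwise multiply, out[i] = in1[i] * in2[i]; e.g. for the data
+//  below, 1.f * 0.5f = 0.5f and 5.f * 15.f = 75.f, matching answers[].]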
{ + // Input2 : 2x2x2 + // Input : 2x2x2x2 + // Output : 2x2x2x2 + + // Input: + // f0: b0: 1 2 b1: 0 0 + // f0: b0: 3 4 b1: 0.5 -0.5 + // f1: b0: 5 6 b1: 1 5.2 + // f1: b0: 7 8 b1: 12 7.5 + // + // Input2 + // f0: b0: 0.5 0.5 b1: 5 2 + // f0: b0: 2.5 2.5 b1: 7 4 + // f1: b0: 15 8 b1: 6 -0.5 + // f1: b0: 17 10 b1: 8 -2.5 + // + // Output: + // f0: b0: 0.5 1 b1: 0 0 + // f0: b0: 7.5 10 b1: 3.5 -2 + // f1: b0: 75 48 b1: 6 -2.6 + // f1: b0: 119 80 b1: 96 -18.75 + // + + + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::prod)); + + set_values(input, { + 1.f, 0.f, 5.f, 1.f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 7.5f + }); + + set_values(input2, { + 0.5f, 5.f, 15.f, 6.f, + 0.5f, 2.f, 8.f, -0.5f, + 2.5f, 7.f, 17.f, 8.f, + 2.5f, 4.f, 10.f, -2.5f }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); + + auto output = outputs.at("eltwise").get_memory(); + + float answers[16] = { + 0.5f, 0.0f, 75.f, 6.0f, + 1.0f, 0.0f, 48.f, -2.6f, + 7.5f, 3.5f, 119.f, 96.0f, + 10.0f, -2.0f, 80.f, -18.75f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(eltwise_gpu_f32, max_basic_in4x4x4x4_input_padding) { + // Input2 : 2x2x2 + // Input : 2x2x2x2 + // Output : 2x2x2x2 + // Input Padding: 2x1 (with reorder) + + // Input: + // f0: b0: 1 2 b1: 0 0 + // f0: b0: 3 4 b1: 0.5 -0.5 + // f1: b0: 5 6 b1: 1.5 5.2 + // f1: b0: 7 8 b1: 12 8 + // + // Input2 + // f0: b0: 0.5 5 b1: 2.5 7 + // f0: b0: 15 6 b1: 17 8 + // f1: b0: 0.5 2 b1: 2.5 4 + // f1: b0: 8 -0.5 b1: 10 -2.5 + // + // Output: + // f0: b0: 1 5 b1: 2.5 7 + // f0: b0: 15 6 b1: 17 8 + // f1: b0: 5 6 b1: 2.5 5.2 + // f1: b0: 8 8 b1: 12 8 + // + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(reorder("reorder", "input", input.get_layout().with_padding({ { 0, 0, 2, 1 }, 0 }))); + topology.add(reorder("reorder2", "input2", input.get_layout().with_padding({ { 0, 0, 2, 1 }, 0 }))); + topology.add(eltwise("eltwise", {"reorder", "reorder2"}, eltwise_mode::max)); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 8.f + }); + + set_values(input2, { + 0.5f, 2.5f, 0.5f, 2.5f, + 5.f, 7.f, 2.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + 6.f, 8.f, -0.5f, -2.5f }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); + + auto output = outputs.at("eltwise").get_memory(); + + float answers[16] = { + 1.f, 2.5f, 
5.f, 2.5f, + 5.f, 7.f, 6.f, 5.2f, + 15.f, 17.f, 8.f, 12.f, + 6.f, 8.f, 8.f, 8.f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(eltwise_gpu_f32, add_basic_in4x4x2x2_with_coefficients) { + // Input2 : 2x2x2 + // Input : 2x2x2x2 + // Output : 2x2x2x2 + + // Input: + // f0: b0: 1 2 b1: 0 0 + // f0: b0: 3 4 b1: 0.5 -0.5 + // f1: b0: 5 6 b1: 1.5 5.2 + // f1: b0: 7 8 b1: 12 8 + // + // Input2 + // f0: b0: 0.5 5 b1: 2.5 7 + // f0: b0: 15 -2 b1: 17 6.5 + // f1: b0: 0.5 2 b1: 2.5 4 + // f1: b0: 8 -0.5 b1: 10 -2.5 + // + // Output: + // f0: b0: 0.75 3.5 b1: 1.25 3.5 + // f0: b0: 9 1 b1: 8.75 3 + // f1: b0: 2.75 4 b1: 2 4.6 + // f1: b0: 7.5 8.25 b1: 11 8.25 + // + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2"}, eltwise_mode::sum, {0.5f, 0.5f})); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 8.f + }); + + set_values(input2, { + 0.5f, 2.5f, 0.5f, 2.5f, + 5.f, 7.f, 2.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + -2.f, 6.5f, -0.5f, -2.5f }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); + + auto output = outputs.at("eltwise").get_memory(); + + float answers[16] = { 0.75f, 1.25f, 2.75f, 2.f, + 3.5f, 3.5f, 4.f, 4.6f, + 9.f, 8.75f, 7.5f, 11.f, + 1.f, 3.f, 3.75f, 2.75f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(eltwise_gpu_f32, coefficients_count_check) { + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + auto input3 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("input3", input3.get_layout())); + + std::vector coeffs0 = {}; + std::vector coeffs1 = {0.5f}; + std::vector coeffs2 = {0.5f, 0.5f}; + std::vector coeffs3 = {0.5f, 0.5f, 0.5f}; + + EXPECT_THROW(topology.add(eltwise("eltwise1", {"input", "input2"}, eltwise_mode::sum, coeffs1)), std::invalid_argument); + EXPECT_THROW(topology.add(eltwise("eltwise2", {"input", "input2"}, eltwise_mode::sum, coeffs3)), std::invalid_argument); + + EXPECT_THROW(topology.add(eltwise("eltwise3", {"input", "input2", "input3"}, eltwise_mode::sum, coeffs1)), std::invalid_argument); + EXPECT_THROW(topology.add(eltwise("eltwise4", {"input", "input2", "input3"}, eltwise_mode::sum, coeffs2)), std::invalid_argument); + + EXPECT_NO_THROW(topology.add(eltwise("eltwise5", {"input", "input2"}, eltwise_mode::sum, coeffs0))); + EXPECT_NO_THROW(topology.add(eltwise("eltwise6", {"input", "input2"}, eltwise_mode::sum, coeffs2))); + + EXPECT_NO_THROW(topology.add(eltwise("eltwise7", 
{"input", "input2", "input3"}, eltwise_mode::sum, coeffs0))); + EXPECT_NO_THROW(topology.add(eltwise("eltwise8", {"input", "input2", "input3"}, eltwise_mode::sum, coeffs3))); +} + +TEST(eltwise_gpu_f32, add_basic_in4x4x2x2_with_coefficients_3inputs) { + // Input3 : 2x2x2 + // Input2 : 2x2x2 + // Input : 2x2x2x2 + // Output : 2x2x2x2 + + // Input: + // f0: b0: 1 2 b1: 0 0 + // f0: b0: 3 4 b1: 0.5 -0.5 + // f1: b0: 5 6 b1: 1.5 5.2 + // f1: b0: 7 8 b1: 12 8 + // + // Input2 + // f0: b0: 0.5 5 b1: 2.5 7 + // f0: b0: 15 -2 b1: 17 6.5 + // f1: b0: 0.5 2 b1: 2.5 4 + // f1: b0: 8 -0.5 b1: 10 -2.5 + // + // Input3 + // f0: b0: 8 7 b1: 0 1 + // f0: b0: 6 5 b1: 0 1 + // f1: b0: 4 3 b1: 0 1 + // f1: b0: 2 1 b1: 0 1 + // + // Output: + // f0: b0: 4.75 7 b1: 1.25 4 + // f0: b0: 12 3.5 b1: 8.75 3.5 + // f1: b0: 4.75 5.5 b1: 2 5.1 + // f1: b0: 8.5 8.75 b1: 11 8.75 + // + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + auto input3 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("input3", input3.get_layout())); + topology.add(eltwise("eltwise", {"input", "input2", "input3"}, eltwise_mode::sum, {0.5f, 0.5f, 0.5f})); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 8.f + }); + + set_values(input2, { + 0.5f, 2.5f, 0.5f, 2.5f, + 5.f, 7.f, 2.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + -2.f, 6.5f, -0.5f, -2.5f }); + + set_values(input3, { + 8.f, 0.f, 4.f, 0.f, + 7.f, 1.f, 3.f, 1.f, + 6.f, 0.f, 2.f, 0.f, + 5.f, 1.f, 1.f, 1.f }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("input3", input3); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); + + auto output = outputs.at("eltwise").get_memory(); + + float answers[16] = { 4.75f, 1.25f, 4.75f, 2.f, + 7.0f, 4.0f, 5.5f, 5.1f, + 12.f, 8.75f, 8.5f, 11.f, + 3.5f, 3.5f, 4.25f, 3.25f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(eltwise_gpu_f32, max_3inputs_in4x4x4x4_input_padding) { + // Input : 2x2x2x2 + // Input2 : 2x2x2x2 + // Input3 : 2x2x2x2 + // Output : 2x2x2x2 + // Input Padding: 2x1 (with reorder) + + // Input: + // f0: b0: 1 2 b1: 0 0 + // f0: b0: 3 4 b1: 0.5 -0.5 + // f1: b0: 5 6 b1: 1.5 5.2 + // f1: b0: 7 8 b1: 12 8 + // + // Input2 + // f0: b0: 0.5 5 b1: 2.5 7 + // f0: b0: 15 6 b1: 17 8 + // f1: b0: 0.5 2 b1: 2.5 4 + // f1: b0: 8 -0.5 b1: 10 -2.5 + // + // Input3 + // f0: b0: 1.1 1 b1: 4 0 + // f0: b0: 15 -1 b1: 3 6 + // f1: b0: 1.5 2 b1: 2 7 + // f1: b0: 9 0.5 b1: 1 8 + // + // Output: + // f0: b0: 1.1 5 b1: 4 7 + // f0: b0: 15 6 b1: 17 8 + // f1: b0: 5 6 b1: 2.5 7 + // f1: b0: 9 8 b1: 12 8 + // + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } }); + auto input3 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } }); + + topology topology; + 
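+    // [Editorial note, not part of the original patch: the reorder nodes
+    //  below attach a 2x1 spatial padding to each input buffer; the test
+    //  then checks that eltwise computes max(in1, in2, in3) over the
+    //  logical, un-padded 2x2x2x2 window only.]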
topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("input3", input3.get_layout())); + topology.add(reorder("reorder", "input", input.get_layout().with_padding({ { 0, 0, 2, 1 }, 0 }))); + topology.add(reorder("reorder2", "input2", input.get_layout().with_padding({ { 0, 0, 2, 1 }, 0 }))); + topology.add(reorder("reorder3", "input3", input.get_layout().with_padding({ { 0, 0, 2, 1 }, 0 }))); + topology.add(eltwise("eltwise", {"reorder", "reorder2", "reorder3"}, eltwise_mode::max)); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 8.f + }); + + set_values(input2, { + 0.5f, 2.5f, 0.5f, 2.5f, + 5.f, 7.f, 2.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + 6.f, 8.f, -0.5f, -2.5f }); + + set_values(input3, { + 1.1f, 4.f, 1.5f, 2.f, + 1.f, 0.f, 2.f, 7.f, + 15.f, 3.f, 9.f, 1.f, + -1.f, 6.f, 0.5f, 8.f }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("input3", input3); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); + + auto output = outputs.at("eltwise").get_memory(); + + float answers[16] = { + 1.1f, 4.f, 5.f, 2.5f, + 5.f, 7.f, 6.f, 7.f, + 15.f, 17.f, 9.f, 12.f, + 6.f, 8.f, 8.f, 8.f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + + +TEST(eltwise_gpu_f32, stride_test_2x2) { + // Input : 2x2x2x2 + // Input2 : 2x2x4x4 + // Output : 2x2x2x2 + + // Input: + // f0: b0: 1 2 b1: 0 0 + // f0: b0: 3 4 b1: 0.5 -0.5 + // f1: b0: 5 6 b1: 1.5 5.2 + // f1: b0: 7 8 b1: 12 8 + // + // Input2 + // f0: b0: 1 2 3 4 b1: 17 18 19 20 + // f0: b0: 5 6 7 8 b1: 21 22 23 24 + // f0: b0: 9 10 11 12 b1: 25 26 27 28 + // f0: b0: 13 14 15 16 b1: 29 30 31 32 + + // f1: b0: 33 34 35 36 b1: 49 50 51 52 + // f1: b0: 37 38 39 40 b1: 53 54 55 56 + // f1: b0: 41 42 43 44 b1: 57 58 59 60 + // f1: b0: 45 46 47 48 b1: 61 62 63 64 + + // + // Output: + // f0: b0: 1 3 b1: 17 19 + // f0: b0: 9 11 b1: 25 27 + // f1: b0: 33 35 b1: 49 51 + // f1: b0: 41 43 b1: 57 59 + // + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 4, 4 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(eltwise("eltwise", "input", "input2", { {0,0,1,1}, {0,0,2,2} }, eltwise_mode::max)); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 8.f + }); + + set_values(input2, { + 1, 17, 33, 49, + 2, 18, 33, 50, + 3, 19, 35, 51, + 4, 20, 36, 52, + 5, 21, 37, 53, + 6, 22, 38, 54, + 7, 23, 39, 55, + 8, 24, 40, 56, + 9, 25, 41, 57, + 10, 26, 42, 58, + 11, 27, 43, 59, + 12, 28, 44, 60, + 13, 29, 45, 61, + 14, 30, 46, 62, + 15, 31, 47, 63, + 16, 32, 48, 64 }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "eltwise"); + + auto output = outputs.at("eltwise").get_memory(); + + float answers[16] = { + 1, 17, 33, 49, + 3, 19, 35, 51, + 9, 25, 41, 57, + 
+        11,  27,  43,  59 };
+
+    auto output_ptr = output.pointer<float>();
+
+    for (int i = 0; i < 16; i++)
+    {
+        EXPECT_TRUE(are_equal(answers[i], output_ptr[i]));
+    }
+}
+
+TEST(eltwise_gpu_f32, broadcast_test_in4x4x2x2) {
+    //  Input2   : 2x1x1
+    //  Input    : 2x2x2x2
+    //  Output   : 2x2x2x2
+
+    //  Input:
+    //  f0: b0:  1    2  b1:   0    0
+    //  f0: b0:  3    4  b1:   0.5 -0.5
+    //  f1: b0:  5    6  b1:   1.5  5.2
+    //  f1: b0:  7    8  b1:  12    8
+    //
+    //  Input2
+    //  f0: b0: 0.5  b1: 2.5
+    //  f1: b0: 0.5  b1: 2.5
+    //
+    //  Output:
+    //  f0: b0:  1.5   7    b1:  2.5   7
+    //  f0: b0: 18     2    b1: 17.5   6
+    //  f1: b0:  5.5   8    b1:  4     9.2
+    //  f1: b0: 15    16.5  b1: 22    16.5
+    //
+
+    const auto& engine = get_test_engine();
+
+    auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } });
+    auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 1, 1 } });
+
+    topology topology;
+    topology.add(input_layout("input", input.get_layout()));
+    topology.add(input_layout("input2", input2.get_layout()));
+    topology.add(eltwise("eltwise", { "input", "input2" }, eltwise_mode::sum));
+
+    set_values(input, {
+        1.f,   0.f, 5.f,  1.5f,
+        2.f,   0.f, 6.f,  5.2f,
+        3.f,   0.5f, 7.f, 12.f,
+        4.f,  -0.5f, 8.f,  8.f
+    });
+
+    set_values(input2, {
+        0.5f, 2.5f, 0.5f, 2.5f
+    });
+
+    network network(engine, topology);
+
+    network.set_input_data("input", input);
+    network.set_input_data("input2", input2);
+    auto outputs = network.execute();
+
+    EXPECT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "eltwise");
+
+    auto output = outputs.at("eltwise").get_memory();
+
+    float answers[16] = {
+        1.5f, 2.5f, 5.5f,  4.f,
+        2.5f, 2.5f, 6.5f,  7.7f,
+        3.5f, 3.f,  7.5f, 14.5f,
+        4.5f, 2.f,  8.5f, 10.5f };
+
+    auto output_ptr = output.pointer<float>();
 
     for (int i = 0; i < 16; i++)
     {
@@ -866,12 +2443,174 @@ TEST(eltwise_gpu_f32, max_3inputs_in4x4x4x4_input_padding) {
     }
 }
 
+template <typename T>
+int8_t eltwise_bool_execute(cldnn::eltwise_mode mode, T x, T y) {
+    switch (mode) {
+    case eltwise_mode::eq:
+        return x == y;
+    case eltwise_mode::ne:
+        return x != y;
+    case eltwise_mode::lt:
+        return x < y;
+    case eltwise_mode::le:
+        return x <= y;
+    case eltwise_mode::gt:
+        return x > y;
+    case eltwise_mode::ge:
+        return x >= y;
+    case eltwise_mode::logic_and:
+        return x && y;
+    case eltwise_mode::logic_or:
+        return x || y;
+    default:
+        return (int8_t)0;
+    }
+}
+
+template <typename T>
+VVVVF<T> eltwise_bool_reference(VVVVF<T> &input1, VVVVF<T> &input2,
+    cldnn::eltwise_mode mode, int input_padding_y = 0,
+    int input_padding_x = 0, int output_padding_y = 0,
+    int output_padding_x = 0) {
+
+    size_t padding_y = input_padding_y + output_padding_y;
+    size_t padding_x = input_padding_x + output_padding_x;
+    size_t output_b = input1.size();
+    size_t output_f = input1[0].size();
+    size_t output_y = input1[0][0].size() + 2 * padding_y;
+    size_t output_x = input1[0][0][0].size() + 2 * padding_x;
+    VVVVF<T> output(output_b, VVVF<T>(output_f, VVF<T>(output_y, VF<T>(output_x))));
+
+    T res;
+    for (size_t b = 0; b < output_b; ++b) {
+        for (size_t f = 0; f < output_f; ++f) {
+            for (size_t y = 0; y < input1[0][0].size(); ++y) {
+                for (size_t x = 0; x < input1[0][0][0].size(); ++x) {
+                    res = eltwise_bool_execute<T>(mode, input1[b][f][y][x], input2[b][f][y][x]);
+                    output[b][f][y + padding_y][x + padding_x] = res;
+                }
+            }
+        }
+    }
+    return output;
+}
+
+template <typename T>
+void generic_eltwise_bool_test(cldnn::format test_input_fmt, int input_b, int input_f, int input_y, int input_x, cldnn::eltwise_mode mode,
+    int input_padding_y, int input_padding_x, int output_padding_y, int output_padding_x) {
+
+    int min_random = -2, max_random = 2;
+    VVVVF<T> input1_rnd = generate_random_4d<T>(input_b, input_f, input_y, input_x, min_random, max_random);
+    VVVVF<T> input2_rnd = generate_random_4d<T>(input_b, input_f, input_y, input_x, min_random, max_random);
+    VF<T> input1_rnd_vec = flatten_4d<T>(test_input_fmt, input1_rnd);
+    VF<T> input2_rnd_vec = flatten_4d<T>(test_input_fmt, input2_rnd);
+
+    const auto& engine = get_test_engine();
+    tensor input_tensor( input_b, input_f, input_x, input_y );
+    auto input1 = memory::allocate(engine, { type_to_data_type<T>::value, test_input_fmt, input_tensor });
+    auto input2 = memory::allocate(engine, { type_to_data_type<T>::value, test_input_fmt, input_tensor });
+    set_values(input1, input1_rnd_vec);
+    set_values(input2, input2_rnd_vec);
+
+    topology topology;
+    topology.add(input_layout("input1", input1.get_layout()));
+    topology.add(input_layout("input2", input2.get_layout()));
+    topology.add(reorder("reorder1", "input1", input1.get_layout().with_padding({{ 0, 0, input_padding_x, input_padding_y }, 0 })));
+    topology.add(eltwise("eltwise", {"reorder1", "input2"}, mode, false, 0.f, { { 0, 0, output_padding_x, output_padding_y }, 0 }));
+
+    network network(engine, topology);
+    network.set_input_data("input1", input1);
+    network.set_input_data("input2", input2);
+    auto outputs = network.execute();
+    EXPECT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "eltwise");
+
+    auto output_memory = outputs.at("eltwise").get_memory();
+    auto output_layout = output_memory.get_layout();
+    auto output_ptr = output_memory.pointer<int8_t>();
+
+    VVVVF<T> output_cpu = eltwise_bool_reference<T>(input1_rnd, input2_rnd, mode, input_padding_y, input_padding_x, output_padding_y, output_padding_x);
+    EXPECT_EQ(output_layout.format.value, test_input_fmt.value);
+    tensor output_tensor = output_layout.get_buffer_size();
+    int y_size = output_tensor.spatial[1];
+    int x_size = output_tensor.spatial[0];
+    int f_size = output_tensor.feature[0];
+    int b_size = output_tensor.batch[0];
+    EXPECT_EQ(y_size, (int)output_cpu[0][0].size());
+    EXPECT_EQ(x_size, (int)output_cpu[0][0][0].size());
+    EXPECT_EQ(f_size, (int)output_cpu[0].size());
+    EXPECT_EQ(b_size, (int)output_cpu.size());
+
+    bool test_is_correct = true;
+    VF<T> output_cpu_vec = flatten_4d<T>(test_input_fmt, output_cpu);
+    for (size_t i = 0; i < output_cpu_vec.size(); ++i) {
+        if (output_cpu_vec[i] != output_ptr[i]) {
+            test_is_correct = false;
+            break;
+        }
+    }
+    EXPECT_EQ(test_is_correct, true) << std::endl
+        << "failing test parameters:" << std::endl
+        << "input_b = " << input_b << std::endl
+        << "input_f = " << input_f << std::endl
+        << "input_y = " << input_y << std::endl
+        << "input_x = " << input_x << std::endl
+        << "eltwise_mode = " << (int)mode << std::endl
+        << "input_padding_y = " << input_padding_y << std::endl
+        << "input_padding_x = " << input_padding_x << std::endl
+        << "output_padding_y = " << output_padding_y << std::endl
+        << "output_padding_x = " << output_padding_x << std::endl
+        << "type = " << (sizeof(T) == 1 ? "int8" : "int32") << std::endl;
+}
+
+void run_eltwise_bool_generic_test(cldnn::eltwise_mode mode)
+{
+    cldnn::format test_inputs_fmt = cldnn::format::bfyx;
+    std::pair<int, int> input_size = { 227, 227 };
+
+    generic_eltwise_bool_test<int8_t>(test_inputs_fmt, 1, 1, input_size.first, input_size.second, mode, 0, 0, 0, 0);
+    generic_eltwise_bool_test<int32_t>(test_inputs_fmt, 1, 1, input_size.first, input_size.second, mode, 0, 0, 0, 0);
+}
+
+TEST(eltwise_gpu_bool, eltwise_eq) {
+    run_eltwise_bool_generic_test(cldnn::eltwise_mode::eq);
+}
+
+TEST(eltwise_gpu_bool, eltwise_ne) {
+    run_eltwise_bool_generic_test(cldnn::eltwise_mode::ne);
+}
+
+TEST(eltwise_gpu_bool, eltwise_lt) {
+    run_eltwise_bool_generic_test(cldnn::eltwise_mode::lt);
+}
+
+TEST(eltwise_gpu_bool, eltwise_le) {
+    run_eltwise_bool_generic_test(cldnn::eltwise_mode::le);
+}
+
+TEST(eltwise_gpu_bool, eltwise_gt) {
+    run_eltwise_bool_generic_test(cldnn::eltwise_mode::gt);
+}
+
+TEST(eltwise_gpu_bool, eltwise_ge) {
+    run_eltwise_bool_generic_test(cldnn::eltwise_mode::ge);
+}
+
+TEST(eltwise_gpu_bool, eltwise_and) {
+    run_eltwise_bool_generic_test(cldnn::eltwise_mode::logic_and);
+}
+
+TEST(eltwise_gpu_bool, eltwise_or) {
+    run_eltwise_bool_generic_test(cldnn::eltwise_mode::logic_or);
+}
+
+
 void run_eltwise_generic_test(cldnn::eltwise_mode mode)
 {
     cldnn::format test_inputs_fmt = cldnn::format::bfyx;
     std::pair<int, int> input_size = { 227, 227 };
 
-    engine engine;
+    const auto& engine = get_test_engine();
     bool f16_supported = !!engine.get_info().supports_fp16;
     if (!f16_supported) {
         std::cout << "[ SKIPPED ] float16 combinations are skipped (cl_khr_fp16 is not supported)." << std::endl;
@@ -898,6 +2637,295 @@ TEST(eltwise_gpu, eltwise_mod) {
     run_eltwise_generic_test(cldnn::eltwise_mode::mod);
 }
 
+
+TEST(eltwise_gpu, b_fs_yx_fsv4_w_callib) {
+    int B_array[] = {   1,   4,   16,   32, 0 };  // Batch
+    int F_array[] = { 256, 512, 1024, 2048, 0 };  // Features
+    int I_array[] = {  56,  28,   14,   14, 0 };  // Input MxM data sizes
+
+    for (int j = 0; F_array[j]; j++) {
+        const auto& engine = get_test_engine();
+
+        int in_B = B_array[j];
+        int in_F = F_array[j];
+
+        int in_X = I_array[j],
+            in_Y = in_X;
+
+        // Input data init
+        std::vector<char> Data(in_B * in_F * in_X * in_Y);
+        for (size_t i = 0; i < Data.size(); i++)
+            Data[i] = static_cast<char>(i);
+        std::vector<char> DataGold(Data);
+
+        // Expected "gold" output and IMAD output.
+        std::vector<char> vGoldOutput;
+        std::vector<char> vTestOutput;
+
+        // Mem initialization
+        // This is user data, no kernels here
+        auto input1 = memory::allocate(engine,
+                                       { data_types::i8,
+                                         format::bfyx,
+                                         { in_B, in_F, in_X, in_Y } });
+        std::vector<char> data_i1(DataGold);
+        set_values(input1, std::move(data_i1));
+        auto input2 = memory::allocate(engine,
+                                       { data_types::i8,
+                                         format::bfyx,
+                                         { in_B, in_F, in_X, in_Y } });
+        std::vector<char> data_i2(DataGold);
+        set_values(input2, std::move(data_i2));
+
+        auto callib = memory::allocate(engine,
+                                       { data_types::f32,
+                                         format::bfyx,
+                                         { 1, in_F, 1, 1 } });
+        std::vector<float> data_c(in_F);
+        float sign = 1;
+        for (size_t i = 0; i < data_c.size(); i++) {
+            data_c[i] = ((i + 1) % 7) ?
sign : -sign; + sign *= (float)1.0123; + } + set_values(callib, std::move(data_c)); + + // "Golden" Eltwise + { + topology topology; + + auto eltw = eltwise("eltw_GOLD", + "input1", "input2", + "callib", + eltwise_mode::sum, true); + + // Create a topology + topology.add(input_layout("input1", input1.get_layout()), + input_layout("input2", input2.get_layout()), + eltw); + + topology.add(data("callib", callib)); + + // Network processing + network network(engine, topology); + network.set_input_data("input1", input1); + network.set_input_data("input2", input2); + auto outputs = network.execute(); + + // Validation + auto searchC = outputs.find("eltw_GOLD"); + EXPECT_NE(searchC, outputs.end()); + auto output = outputs.begin()->second.get_memory(); + auto output_ptr = output.pointer(); + vGoldOutput.reserve(output_ptr.size()); + for (size_t i = 0; i < output_ptr.size(); i++) + vGoldOutput.push_back(output_ptr[i]); + } + + // "IMAD" Eltwise + { + topology topology; + + // Reorder (a-ka swizzelling) input to MMAD/IMAD Pooling format + topology.add(reorder("reorder1_Swizzelled", + "input1", + layout(data_types::i8, + format::b_fs_yx_fsv4, + { in_B, in_F, in_X, in_Y })), + reorder("reorder2_Swizzelled", + "input2", + layout(data_types::i8, + format::b_fs_yx_fsv4, + { in_B, in_F, in_X, in_Y }))); + + auto eltw = eltwise("eltw_IMAD", + "reorder1_Swizzelled", "reorder2_Swizzelled", + "callib", + eltwise_mode::sum, true); + + topology.add(input_layout("input1", input1.get_layout()), + input_layout("input2", input2.get_layout()), + eltw); + + topology.add(data("callib", callib)); + + // Back reordering (a-ka unswizzelling) output from MMAD/IMAD pooling + topology.add(reorder("reorder_UnSwizzelled", + "eltw_IMAD", + layout(data_types::i8, + format::bfyx, + { in_B, in_F, in_X, in_Y }))); + + // Network processing + network network(engine, topology); + network.set_input_data("input1", input1); + network.set_input_data("input2", input2); + auto outputs = network.execute(); + + // Validation + auto searchC = outputs.find("reorder_UnSwizzelled"); + EXPECT_NE(searchC, outputs.end()); + auto output = outputs.begin()->second.get_memory(); + auto output_ptr = output.pointer(); + vTestOutput.reserve(output_ptr.size()); + for (size_t i = 0; i < output_ptr.size(); i++) + vTestOutput.push_back(output_ptr[i]); + } + + // Result validation + ASSERT_TRUE(vGoldOutput.size() == vTestOutput.size()); + for (size_t i = 0; i < vGoldOutput.size(); i++) + ASSERT_TRUE(vTestOutput[i] == vGoldOutput[i]); + } // for (int j = 0; F_array[j]; j++) +} + +TEST(eltwise_gpu, b_fs_yx_fsv4_wo_callib) { + // + // Input data + const int BATCH = 1; + const int in_B = BATCH; + + const auto& engine = get_test_engine(); + + int in_F = 256; + + int in_X = 56, + in_Y = in_X; + + // Input data init + std::vector Data(in_B * in_F * in_X * in_Y); + for (size_t i = 0; i < Data.size(); i++) + Data[i] = static_cast(i); + std::vector DataGold(Data); + + // Mem initialization + // This is user data, no kernels here + auto input1 = memory::allocate(engine, + { data_types::i8, + format::bfyx, + { in_B, in_F, in_X, in_Y } }); + std::vector data_i1(DataGold); + for (size_t i = 0; i < data_i1.size(); i++) data_i1[i] = data_i1[i] + 1; + set_values(input1, std::move(data_i1)); + + auto input2 = memory::allocate(engine, + { data_types::i8, + format::bfyx, + { in_B, in_F, in_X, in_Y } }); + std::vector data_i2(DataGold); + for (size_t i = 0; i < data_i2.size(); i++) data_i2[i] = data_i2[i] + 2; + set_values(input2, std::move(data_i2)); + + auto input3 = 
memory::allocate(engine, + { data_types::i8, + format::bfyx, + { in_B, in_F, in_X, in_Y } }); + std::vector data_i3(DataGold); + for (size_t i = 0; i < data_i3.size(); i++) data_i3[i] = data_i3[i] + 3; + set_values(input3, std::move(data_i3)); + + cldnn::eltwise_mode mode[] = { cldnn::eltwise_mode::min, + cldnn::eltwise_mode::max, + cldnn::eltwise_mode::sum }; + + for (int i = 0; i < 3; i++) { + // Expected "gold" output and IMAD output. + std::vector vGoldOutput; + std::vector vTestOutput; + + // "Golden" Eltwise + { + topology topology; + + auto eltw = eltwise("eltw_GOLD", + { "input1", "input2", "input3" }, + mode[i], true); + + // Create a topology + topology.add(input_layout("input1", input1.get_layout()), + input_layout("input2", input2.get_layout()), + input_layout("input3", input3.get_layout()), + eltw); + + // Network processing + network network(engine, topology); + network.set_input_data("input1", input1); + network.set_input_data("input2", input2); + network.set_input_data("input3", input3); + auto outputs = network.execute(); + + // Validation + auto searchC = outputs.find("eltw_GOLD"); + EXPECT_NE(searchC, outputs.end()); + auto output = outputs.begin()->second.get_memory(); + auto output_ptr = output.pointer(); + vGoldOutput.reserve(output_ptr.size()); + for (size_t i = 0; i < output_ptr.size(); i++) + vGoldOutput.push_back(output_ptr[i]); + } + + // "IMAD" Eltwise + { + topology topology; + + // Reorder (a-ka swizzelling) input to MMAD/IMAD Pooling format + topology.add(reorder("reorder1_Swizzelled", + "input1", + layout(data_types::i8, + format::b_fs_yx_fsv4, + { in_B, in_F, in_X, in_Y })), + reorder("reorder2_Swizzelled", + "input2", + layout(data_types::i8, + format::b_fs_yx_fsv4, + { in_B, in_F, in_X, in_Y })), + reorder("reorder3_Swizzelled", + "input3", + layout(data_types::i8, + format::b_fs_yx_fsv4, + { in_B, in_F, in_X, in_Y }))); + + auto eltw = eltwise("eltw_IMAD", + { "reorder1_Swizzelled", + "reorder2_Swizzelled", + "reorder3_Swizzelled" }, + mode[i], true); + + topology.add(input_layout("input1", input1.get_layout()), + input_layout("input2", input2.get_layout()), + input_layout("input3", input3.get_layout()), + eltw); + + // Back reordering (a-ka unswizzelling) output from MMAD/IMAD pooling + topology.add(reorder("reorder_UnSwizzelled", + "eltw_IMAD", + layout(data_types::i8, + format::bfyx, + { in_B, in_F, in_X, in_Y }))); + + // Network processing + network network(engine, topology); + network.set_input_data("input1", input1); + network.set_input_data("input2", input2); + network.set_input_data("input3", input3); + auto outputs = network.execute(); + + // Validation + auto searchC = outputs.find("reorder_UnSwizzelled"); + EXPECT_NE(searchC, outputs.end()); + auto output = outputs.begin()->second.get_memory(); + auto output_ptr = output.pointer(); + vTestOutput.reserve(output_ptr.size()); + for (size_t i = 0; i < output_ptr.size(); i++) + vTestOutput.push_back(output_ptr[i]); + } + + // Result validation + ASSERT_TRUE(vGoldOutput.size() == vTestOutput.size()); + for (size_t i = 0; i < vGoldOutput.size(); i++) + ASSERT_TRUE(vTestOutput[i] == vGoldOutput[i]); + } +} + TEST(DISABLED_eltwise_gpu, generic_random) { VF test_inputs_fmts = { cldnn::format::bfyx, cldnn::format::yxfb }; VF modes = { cldnn::eltwise_mode::sum, cldnn::eltwise_mode::sub, cldnn::eltwise_mode::max, cldnn::eltwise_mode::prod }; @@ -905,7 +2933,7 @@ TEST(DISABLED_eltwise_gpu, generic_random) { VF slopes = { 0.0f, -0.0f, -17.19f, 1028.8f, std::numeric_limits::infinity(), 
-std::numeric_limits<float>::infinity() };
     std::vector<std::pair<int, int>> input_sizes = { { 100, 100 },{ 227, 227 },{ 400, 600 } };
 
-    engine engine;
+    const auto& engine = get_test_engine();
     bool f16_supported = !!engine.get_info().supports_fp16;
     if (!f16_supported) {
         std::cout << "[ SKIPPED ] float16 combinations are skipped (cl_khr_fp16 is not supported)." << std::endl;
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/embed_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/embed_gpu_test.cpp
index be00e84..1ed4515 100644
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/embed_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/embed_gpu_test.cpp
@@ -53,12 +53,12 @@ TEST(embed_gpu, seq3num4) {
     //  0.0  0.0  0.0  0.0
     //  6.0  8.0 -2.0 -2.0
 
-    engine engine;
+    const auto& engine = get_test_engine();
     auto batch = 1;
     auto sequence_length = 3;
     auto num_output_size = 4;
     auto vocab_size = 3;
-    auto input_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ batch, 1, 1, sequence_length } });
+    auto input_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ batch, 1, sequence_length, 1 } });
     auto weights_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ num_output_size, 1, vocab_size, 1 } });
     auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ batch, 1, 1, num_output_size } });
     auto output_ref = memory::allocate(engine, { data_types::f32,format::bfyx,{ batch, sequence_length, num_output_size, 1 } });
@@ -119,12 +119,12 @@ TEST(embed_gpu, b2seq2num3) {
     // -1.0  0.0  1.0 -1.0  4.0  4.0
     // 10.0 18.0 19.0 -1.0  0.0  1.0
 
-    engine engine;
+    const auto& engine = get_test_engine();
     auto batch = 2;
     auto sequence_length = 2;
     auto num_output_size = 3;
     auto vocab_size = 3;
-    auto input_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ batch, 1, 1, sequence_length } });
+    auto input_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ batch, 1, sequence_length, 1 } });
     auto weights_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ num_output_size, 1, vocab_size, 1 } });
     auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1, 1, 1, num_output_size } });
     auto output_ref = memory::allocate(engine, { data_types::f32,format::bfyx,{ batch, sequence_length, num_output_size, 1 } });
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_gpu_test.cpp
index f07fa00..4882d9a 100644
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_gpu_test.cpp
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2019 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
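// [Editorial aside, not part of the original patch] The hunks that follow
// rename output_x to output_f in the fully connected reference helper,
// since an FC layer's output size is its feature count rather than a
// spatial width. A minimal sketch of the reference computation under that
// naming (bfyx input, one accumulator per batch b and output feature n):
//
//     for (size_t b = 0; b < output_b; ++b)
//         for (size_t n = 0; n < output_f; ++n) {
//             float acc = bias[n];
//             for (size_t f = 0; f < input_f; ++f)
//                 for (size_t y = 0; y < input_y; ++y)
//                     for (size_t x = 0; x < input_x; ++x)
//                         acc += input[b][f][y][x] * weights[n][f][y][x];
//             output[b][0][0][n] = acc;
//         }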
@@ -54,11 +54,11 @@ VVVVF fully_connected_reference(VVVVF &input, VVVVF &weights, VF &bi size_t input_y = input[0][0].size(); size_t input_x = input[0][0][0].size(); size_t output_b = input.size(); // input is assumed to be bfyx - size_t output_x = weights.size(); // weights is assumed to be bfyx - VVVVF output(output_b, VVVF(1, VVF(1, VF(output_x)))); + size_t output_f = weights.size(); // weights is assumed to be bfyx + VVVVF output(output_b, VVVF(1, VVF(1, VF(output_f)))); float res; for (size_t b = 0; b < output_b; ++b) { - for (size_t n = 0; n < output_x; ++n) { + for (size_t n = 0; n < output_f; ++n) { res = bias[n]; for (size_t f = 0; f < input_f; ++f) { for (size_t y = 0; y < input_y; ++y) { @@ -76,20 +76,20 @@ VVVVF fully_connected_reference(VVVVF &input, VVVVF &weights, VF &bi } template -void generic_fully_connected_test(cldnn::format test_input_fmt, cldnn::format test_weights_fmt, int input_b, int f, int y, int x, int output_x, bool relu, T slope = 0) { +void generic_fully_connected_test(cldnn::format test_input_fmt, cldnn::format test_weights_fmt, int input_b, int f, int y, int x, int output_f, bool relu, T slope = 0) { int min_random = -2, max_random = 2; VVVVF input_rnd = generate_random_4d(input_b, f, y, x, min_random, max_random); - VVVVF weights_rnd = generate_random_4d(output_x, f, y, x, min_random, max_random); - VF bias_rnd_vec = generate_random_1d(output_x, min_random, max_random); + VVVVF weights_rnd = generate_random_4d(output_f, f, y, x, min_random, max_random); + VF bias_rnd_vec = generate_random_1d(output_f, min_random, max_random); VF input_rnd_vec = flatten_4d(test_input_fmt, input_rnd); VF weights_rnd_vec = flatten_4d(test_weights_fmt, weights_rnd); - engine engine; + const auto& engine = get_test_engine(); tensor input_tensor(input_b, f, x, y); - tensor weights_tensor(output_x, f, x, y); + tensor weights_tensor(output_f, f, x, y); auto input = memory::allocate(engine, { type_to_data_type::value, test_input_fmt, input_tensor }); auto weights = memory::allocate(engine, { type_to_data_type::value, test_weights_fmt, weights_tensor }); - auto bias = memory::allocate(engine, { type_to_data_type::value, format::bfyx, { 1,1,output_x,1 } }); + auto bias = memory::allocate(engine, { type_to_data_type::value, format::bfyx, { 1,1,output_f,1 } }); set_values(input, input_rnd_vec); set_values(weights, weights_rnd_vec); set_values(bias, bias_rnd_vec); @@ -115,9 +115,9 @@ void generic_fully_connected_test(cldnn::format test_input_fmt, cldnn::format te //EXPECT_EQ(output_layout.format.value, test_input_fmt); tensor output_tensor = output_layout.size; int b_size = output_tensor.batch[0]; - int x_size = output_tensor.spatial[0]; + int x_size = output_tensor.feature[0]; EXPECT_EQ(b_size, input_b); - EXPECT_EQ(x_size, output_x); + EXPECT_EQ(x_size, output_f); unsigned num_of_operations = f * x * y * 2; float ulp = (1.0f / 1024.0f) * num_of_operations; bool test_is_correct = true; @@ -139,7 +139,7 @@ void generic_fully_connected_test(cldnn::format test_input_fmt, cldnn::format te << "f = " << f << std::endl << "y = " << y << std::endl << "x = " << x << std::endl - << "output_x = " << output_x << std::endl + << "output_f = " << output_f << std::endl << "relu = " << relu << std::endl << "slope = " << (float)slope << std::endl << "type = " << (sizeof(T) == 2 ? 
"float16" : "float32") << std::endl; @@ -154,7 +154,7 @@ TEST(DISABLED_fully_connected_gpu, generic_random_short) { std::vector> input_sizes = { {28, 28}, {64, 64}, {100, 100}, {227, 227}, {1000, 1}, {1, 4096} }; VF outputs_x = { 5, 16 }; - engine engine; + const auto& engine = get_test_engine(); bool f16_supported = !!engine.get_info().supports_fp16; if (!f16_supported) { std::cout << "[ SKIPPED ] float16 combinations are skipped (cl_khr_fp16 is not supported)." << std::endl; @@ -165,18 +165,18 @@ TEST(DISABLED_fully_connected_gpu, generic_random_short) { for (const auto& b : batches) { for(const auto& f : features) { for (const auto& sizes : input_sizes) { - for (int output_x : outputs_x) { + for (int output_f : outputs_x) { for (bool relu_activated : relu) { - generic_fully_connected_test(test_input_fmt, test_weights_fmt, b, f, sizes.second, sizes.first, output_x, relu_activated); + generic_fully_connected_test(test_input_fmt, test_weights_fmt, b, f, sizes.second, sizes.first, output_f, relu_activated); if (!f16_supported) continue; - generic_fully_connected_test(test_input_fmt, test_weights_fmt, b, f, sizes.second, sizes.first, output_x, relu_activated); + generic_fully_connected_test(test_input_fmt, test_weights_fmt, b, f, sizes.second, sizes.first, output_f, relu_activated); } } } } } } - } + } } TEST(fully_connected_gpu, no_biases) { @@ -203,7 +203,7 @@ TEST(fully_connected_gpu, no_biases) { const int32_t input_x = 3, input_b = 1, // size of whole input buffer weight_b = 4, weight_x = 3; // size of whole weights buffer - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32,format::yxfb,{ input_b, 1, input_x, 1} }); auto weights_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ weight_b, 1, weight_x, 1 } }); @@ -261,7 +261,7 @@ TEST(fully_connected_gpu, no_biases_int8) { const int32_t input_x = 3, input_b = 1, // size of whole input buffer weight_b = 4, weight_x = 3; // size of whole weights buffer - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ input_b, 1, input_x, 1 } }); auto weights_prim = memory::allocate(engine, { data_types::i8,format::bfyx,{ weight_b, 1, weight_x, 1 } }); @@ -319,16 +319,15 @@ TEST(fully_connected_gpu, xb_f32_batch_1) { // Output: // 2.5 2.75 0.75 7 - const int32_t output_x = 4, // size of whole output buffer + const int32_t output_f = 4, // size of whole output buffer input_x = 3, input_b = 1, // size of whole input buffer weight_b = 4, weight_x = 3; // size of whole weights buffer - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate( engine, { data_types::f32, format::yxfb, { input_b, 1, input_x, 1 } }); - //auto output_prim = memory::allocate({ memory::format::xb_f32,{ output_b,{ { output_x } },{ 1 } } }); auto weights_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ weight_b, 1, weight_x, 1 } }); - auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx, { 1,1,output_x, 1} }); + auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx, { 1,1,output_f, 1} }); set_values(input_prim, { -0.5f, 2.0f, 0.5f }); set_values(weights_prim, { 1.5f, 1.0f, 0.5f, -1.0f, 0.0f, 0.5f, 0.5f, -0.5f, -2.0f, -0.5f, 1.0f, 1.5f }); @@ -380,16 +379,15 @@ TEST(fully_connected_gpu, xb_f32_batch_2) { // 2.5 2.75 0.75 7 // 4 1 2.75 5 - const int32_t output_x = 4, // size of whole output buffer + const int32_t output_f = 4, // size 
of whole output buffer input_x = 3, input_b = 2, // size of whole input buffer weight_b = 4, weight_x = 3; // size of whole weights buffer - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32,format::yxfb,{ input_b,1,input_x, 1 } }); - //auto output_prim = memory::allocate({ memory::format::xb_f32,{ output_b,{ { output_x } },{ 1 } } }); auto weights_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ weight_b, 1, weight_x, 1 } }); - auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,output_x,1 } }); + auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,output_f,1 } }); set_values(input_prim, { -0.5f, 1.0f, 2.0f, 1.5f, 0.5f, 0.0f }); set_values(weights_prim, { 1.5f, 1.0f, 0.5f, -1.0f, 0.0f, 0.5f, 0.5f, -0.5f, -2.0f, -0.5f, 1.0f, 1.5f }); @@ -442,16 +440,16 @@ TEST(fully_connected_gpu, x_f32) { // Output: // 2.5 2.75 0.75 7 - const int32_t output_x = 4, // size of whole output buffer + const int32_t output_f = 4, // size of whole output buffer input_x = 3, // size of whole input buffer weight_b = 4, weight_x = 3; // size of whole weights buffer - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32,format::bfyx, { 1,1,input_x,1 } }); - //auto output_prim = memory::allocate({ memory::format::xb_f32,{ output_b,{ { output_x } },{ 1 } } }); + //auto output_prim = memory::allocate({ memory::format::xb_f32,{ output_b,{ { output_f } },{ 1 } } }); auto weights_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ weight_b, 1, weight_x, 1 } }); - auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,output_x,1 } }); + auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,output_f,1 } }); set_values(input_prim, { -0.5f, 2.0f, 0.5f }); set_values(weights_prim, { 1.5f, 1.0f, 0.5f, -1.0f, 0.0f, 0.5f, 0.5f, -0.5f, -2.0f, -0.5f, 1.0f, 1.5f }); @@ -492,7 +490,7 @@ TEST(fully_connected_gpu, yxfn_f32) { // 3 -4 f1: b0 // Weights: - // 1 -1 n0: fm0 + // 1 -1 n0: fm0 // 2 0 n0: fm1 // 3 4 n1: fm0 // 0.5 5 n1: fm1 @@ -503,7 +501,7 @@ TEST(fully_connected_gpu, yxfn_f32) { // Output: // 10 -28.5 - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 2, 2, 1 } }); //auto output_prim = memory::allocate({ memory::format::xb_f32,{ 2 ,{ { 1 } }, 1 } }); @@ -557,16 +555,16 @@ TEST(fully_connected_gpu, xb_f32_batch_1_relu) { // Output: // 2.5 0 0.75 0 - const int32_t output_x = 4, // size of whole output buffer + const int32_t output_f = 4, // size of whole output buffer input_x = 3, input_b = 1, // size of whole input buffer weight_b = 4, weight_x = 3; // size of whole weights buffer - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32,format::yxfb,{ input_b, 1, input_x, 1 } }); - //auto output_prim = memory::allocate({ memory::format::xb_f32,{ output_b,{ { output_x } },{ 1 } } }); + //auto output_prim = memory::allocate({ memory::format::xb_f32,{ output_b,{ { output_f } },{ 1 } } }); auto weights_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ weight_b, 1, weight_x, 1 } }); - auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,output_x, 1 } }); + auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,output_f, 1 } }); set_values(input_prim, { 
-0.5f, 2.0f, 0.5f }); set_values(weights_prim, { 1.5f, 1.0f, 0.5f, -1.0f, 0.0f, 0.5f, 0.5f, -0.5f, -2.0f, -0.5f, 1.0f, 1.5f }); @@ -618,16 +616,16 @@ TEST(fully_connected_gpu, xb_f32_batch_2_relu) { // 2.5 0 0.75 0 // 4 0 2.75 0 - const int32_t output_x = 4, // size of whole output buffer + const int32_t output_f = 4, // size of whole output buffer input_x = 3, input_b = 2, // size of whole input buffer weight_b = 4, weight_x = 3; // size of whole weights buffer - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32,format::yxfb,{ input_b, 1, input_x, 1 } }); - //auto output_prim = memory::allocate({ memory::format::xb_f32,{ output_b,{ { output_x } },{ 1 } } }); + //auto output_prim = memory::allocate({ memory::format::xb_f32,{ output_b,{ { output_f } },{ 1 } } }); auto weights_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ weight_b, 1, weight_x, 1 } }); - auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,output_x,1 } }); + auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,output_f,1 } }); set_values(input_prim, { -0.5f, 1.0f, 2.0f, 1.5f, 0.5f, 0.0f }); set_values(weights_prim, { 1.5f, 1.0f, 0.5f, -1.0f, 0.0f, 0.5f, 0.5f, -0.5f, -2.0f, -0.5f, 1.0f, 1.5f }); @@ -680,16 +678,16 @@ TEST(fully_connected_gpu, x_f32_relu) { // Output: // 2.5 0 0.75 0 - const int32_t output_x = 4, // size of whole output buffer + const int32_t output_f = 4, // size of whole output buffer input_x = 3, // size of whole input buffer weight_b = 4, weight_x = 3; // size of whole weights buffer - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,input_x,1 } }); - //auto output_prim = memory::allocate({ memory::format::x_f32,{ 1 ,{ { output_x } }, 1 } }); + //auto output_prim = memory::allocate({ memory::format::x_f32,{ 1 ,{ { output_f } }, 1 } }); auto weights_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ weight_b, 1, weight_x, 1 } }); - auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,output_x,1 } }); + auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,output_f,1 } }); set_values(input_prim, { -0.5f, 2.0f, 0.5f }); set_values(weights_prim, { 1.5f, 1.0f, 0.5f, -1.0f, 0.0f, 0.5f, 0.5f, -0.5f, -2.0f, -0.5f, 1.0f, 1.5f }); @@ -739,16 +737,16 @@ TEST(fully_connected_gpu, x_f32_relu_with_negative_slope) { // Output: // 2.5 -0.125 0.75 -0.1 - const int32_t output_x = 4, // size of whole output buffer + const int32_t output_f = 4, // size of whole output buffer input_x = 3, // size of whole input buffer weight_b = 4, weight_x = 3; // size of whole weights buffer - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,input_x,1 } }); - //auto output_prim = memory::allocate({ memory::format::x_f32,{ 1 ,{ { output_x } }, 1 } }); + //auto output_prim = memory::allocate({ memory::format::x_f32,{ 1 ,{ { output_f } }, 1 } }); auto weights_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ weight_b, 1, weight_x, 1 } }); - auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,output_x,1 } }); + auto bias_prim = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1,1,output_f,1 } }); set_values(input_prim, { -0.5f, 2.0f, 0.5f }); set_values(weights_prim, { 1.5f, 1.0f, 0.5f, -1.0f, 0.0f, 0.5f, 0.5f, -0.5f, 
-2.0f, -0.5f, 1.0f, 1.5f }); @@ -777,3 +775,126 @@ TEST(fully_connected_gpu, x_f32_relu_with_negative_slope) { EXPECT_EQ(0.75f, output_ptr[2]); EXPECT_EQ(-0.1f, output_ptr[3]); } + +TEST(fully_connected_gpu, b_fs_yx_fsv4) +{ + const auto& engine = get_test_engine(); + + const int in_B = 2; + const int in_F = 2048; + const int in_Y = 1; + const int in_X = 1; + + const int W_B = 1000; + const int W_F = in_F; + const int W_Y = in_Y; + const int W_X = in_X; + + // Input data + std::vector<char> Data(in_F * in_B); // in_X=in_Y=1 + int i = 0; + std::generate(Data.begin(), Data.end(), [i]() mutable { return i++ % 9; }); + auto input = memory::allocate(engine, {data_types::i8, format::bfyx, {in_B, in_F, in_X, in_Y}}); + set_values(input, std::move(Data)); + + // Create a topology + topology topology(input_layout("input", input.get_layout())); + + // Reorder + topology.add(reorder("reorder_in", + "input", + layout(data_types::i8, format::b_fs_yx_fsv4, {in_B, in_F, in_X, in_Y}))); + + // Weights + std::vector<char> Weights(W_B * W_F); + i = 0; + std::generate(Weights.begin(), Weights.end(), [W_F, i]() mutable { + return i % 2 ? -(i++) / W_F - 1 : (i++) / W_F + 1; + }); + auto weights_gold = + memory::allocate(engine, {data_types::i8, format::bfyx, {W_B, W_F, W_X, W_Y}}); + auto weights_imad = + memory::allocate(engine, {data_types::i8, format::bfyx, {W_B, W_F, W_X, W_Y}}); + set_values(weights_gold, Weights); + set_values(weights_imad, std::move(Weights)); + topology.add(data("weights_gold", weights_gold), data("weights_imad", weights_imad)); + + // Bias, Calibration, Quantization + std::vector<float> vB(in_F), vC(in_F), vQ(in_F); + float x = 0.1f; + std::generate(vB.begin(), vB.end(), [x]() mutable { + x += 0.01f; + if (x >= 0.9f) + x = 0.1f; + return x; + }); + x = 0.2f; + std::generate(vC.begin(), vC.end(), [x]() mutable { + x += 0.01f; + if (x >= 0.9f) + x = 0.2f; + return x; + }); + x = 0.3f; + std::generate(vQ.begin(), vQ.end(), [x]() mutable { + x += 0.01f; + if (x >= 0.9f) + x = 0.3f; + return x; + }); + auto bias_gold = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, in_F, 1}}); + auto bias_imad = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, in_F, 1}}); + auto callib_gold = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, in_F, 1}}); + auto callib_imad = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, in_F, 1}}); + auto quant_gold = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, in_F, 1}}); + auto quant_imad = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, in_F, 1}}); + set_values(bias_gold, vB); + set_values(bias_imad, std::move(vB)); + set_values(callib_gold, vC); + set_values(callib_imad, std::move(vC)); + set_values(quant_gold, vQ); + set_values(quant_imad, std::move(vQ)); + topology.add(data("bias_gold", bias_gold), + data("callib_gold", callib_gold), + data("quant_gold", quant_gold)); + topology.add(data("bias_imad", bias_imad), + data("callib_imad", callib_imad), + data("quant_imad", quant_imad)); + + // Fully connected + fully_connected fullc_gold( + "fullc_gold", "input", "weights_gold", {"bias_gold"}, {"quant_gold"}, {"callib_gold"}, 1.0f); + fully_connected fullc_imad( + "fullc_imad", "reorder_in", "weights_imad", {"bias_imad"}, {"quant_imad"}, {"callib_imad"}, 1.0f); + topology.add(fullc_gold, fullc_imad); + + // Output reorder + auto reorder_gold = + reorder("reorder_gold", fullc_gold, layout(data_types::i8, format::bfyx, {in_B, W_B, 1, 1})); + auto reorder_imad = + reorder("reorder_imad",
fullc_imad, layout(data_types::i8, format::bfyx, {in_B, W_B, 1, 1})); + topology.add(reorder_gold, reorder_imad); + + // Network build + build_options build_opt; + build_opt.set_option(build_option::optimize_data(true)); + network network(engine, topology, build_opt); + + // Network execution + network.set_input_data("input", input); + auto outputs = network.execute(); + + auto out_gold = outputs.find("reorder_gold"); + auto out_test = outputs.find("reorder_imad"); + + ASSERT_NE(out_gold, outputs.end()); + ASSERT_NE(out_test, outputs.end()); + auto gold_ptr = out_gold->second.get_memory().pointer<char>(); + auto test_ptr = out_test->second.get_memory().pointer<char>(); + + ASSERT_EQ(gold_ptr.size(), test_ptr.size()); + for (size_t i = 0; i < gold_ptr.size(); i++) + { + ASSERT_EQ(gold_ptr[i], test_ptr[i]); + } +} diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_grad_input_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_grad_input_gpu_test.cpp index 71a107d..2737576 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_grad_input_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_grad_input_gpu_test.cpp @@ -51,7 +51,7 @@ TEST(fully_connected_grad_input_gpu, basic_bfyx) { // -1.125 5.625 10.125 - engine engine; + const auto& engine = get_test_engine(); auto input_grad = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 1 } }); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 3, 1 } }); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_grad_weights_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_grad_weights_gpu_test.cpp index 7edb631..b470bda 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_grad_weights_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_grad_weights_gpu_test.cpp @@ -44,7 +44,7 @@ TEST(fully_connected_grad_weights_gpu, basic_bfyx) { // Input_grad: // 1.5 0.75 -2.25 3 - engine engine; + const auto& engine = get_test_engine(); float lr = 0.00001f; auto input_grad = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 4, 1 } }); auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 3, 1 } }); @@ -125,7 +125,7 @@ TEST(fully_connected_grad_weights_gpu, basic_bfyx_b8) { // 1.5 0.75 -2.25 3 // 1 1 1 1 - engine engine; + const auto& engine = get_test_engine(); float lr = 0.00001f; auto input_grad = memory::allocate(engine, { data_types::f32, format::bfyx,{ 8, 1, 4, 1 } }); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 8, 1, 3, 1 } }); @@ -206,7 +206,7 @@ TEST(fully_connected_grad_weights_gpu, basic_bfyx_no_bias) { // Input_grad: // 1.5 0.75 -2.25 3 - engine engine; + const auto& engine = get_test_engine(); float lr = 0.00001f; auto input_grad = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 1 } }); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 3, 1 } }); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/fused_conv_eltwise_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/fused_conv_eltwise_gpu_test.cpp new file mode 100644 index 0000000..a46c6b9 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/fused_conv_eltwise_gpu_test.cpp @@ -0,0 +1,112 @@ +/* +// Copyright (c) 2016 Intel Corporation +// +// Licensed under the Apache License,
Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#include +#include "api/CPP/memory.hpp" +#include +#include "api/CPP/convolution.hpp" +#include "api/CPP/eltwise.hpp" +#include "api/CPP/reorder.hpp" +#include +#include +#include +#include "test_utils/test_utils.h" +#include + +#include +#include +#include + +using namespace cldnn; +using namespace tests; +using namespace testing; + +TEST(fused_conv_eltwise, basic_0) +{ + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 4, 5 } }); + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } }); + + set_values(input, { + 1.0f, 2.0f, -15.f, 3.0f, 4.0f, -15.f, 5.0f, 6.0f, -15.f, 7.0f, + -15.f, 0.0f, 0.0f, -15.f, 0.5f, -0.5f, -15.f, 8.0f, 1.5f, 5.2f + }); + + topology topology( + input_layout("input", input.get_layout()), + data("weights", weights), + convolution("conv", "input", { "weights" }), + eltwise("eltwise", "input", "conv", eltwise_mode::sum), + reorder("out", "eltwise", format::bfyx, data_types::f32)); + + build_options opt; + opt.set_option(build_option::optimize_data(true)); + network network(engine, topology, opt); + network.set_input_data("input", input); + + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "out"); + + auto output = outputs.begin()->second.get_memory(); + auto&& out_layout = output.get_layout(); + + EXPECT_EQ(out_layout.format, format::bfyx); + EXPECT_EQ(out_layout.size.batch[0], 1); + EXPECT_EQ(out_layout.size.feature[0], 1); + EXPECT_EQ(out_layout.size.spatial[0], 4); + EXPECT_EQ(out_layout.size.spatial[1], 5); +} + + +TEST(fused_conv_eltwise, dont_fuse_if_conv_elt_are_outputs) +{ + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 5 } }); + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + + set_values(input, { + 1.0f, 2.0f, -15.f, 3.0f, 4.0f, -15.f, 5.0f, 6.0f, -15.f, 7.0f, + -15.f, 0.0f, 0.0f, -15.f, 0.5f, -0.5f, -15.f, 8.0f, 1.5f, 5.2f + }); + + topology topology( + input_layout("input", input.get_layout()), + data("weights", weights), + convolution("conv", "input", { "weights" }), + eltwise("out", "input", "conv", eltwise_mode::sum)); + + build_options opt; + opt.set_option(build_option::optimize_data(true)); + network network(engine, topology, opt); + network.set_input_data("input", input); + + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "out"); + + auto output = outputs.begin()->second.get_memory(); + auto&& out_layout = output.get_layout(); + + EXPECT_EQ(out_layout.format, format::bfyx); + EXPECT_EQ(out_layout.size.batch[0], 1); + EXPECT_EQ(out_layout.size.feature[0], 1); + EXPECT_EQ(out_layout.size.spatial[0], 4); + EXPECT_EQ(out_layout.size.spatial[1], 5); +} \ No newline at end of file 
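The two tests in the new fused_conv_eltwise_gpu_test.cpp probe the conv+eltwise fusion pass enabled by build_option::optimize_data: basic_0 feeds the eltwise result into a final reorder, so the pair may be fused away, while dont_fuse_if_conv_elt_are_outputs makes the eltwise itself the network output, which must block fusion. A sketch of the condition the second test name describes; this is an assumption about the rule under test, not the clDNN optimizer code:

struct node_info { bool is_network_output; };

// Fusing a convolution into the eltwise that consumes it is only legal when
// neither node has to remain addressable as an explicit network output.
bool may_fuse_conv_eltwise(const node_info& conv, const node_info& eltwise) {
    return !conv.is_network_output && !eltwise.is_network_output;
}

Both tests then assert the same 1x1x4x5 output layout, so the fused and the unfused path are distinguished only by which primitive ends up producing "out".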
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/gather_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/gather_gpu_test.cpp new file mode 100644 index 0000000..90b4e80 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/gather_gpu_test.cpp @@ -0,0 +1,513 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#include + +#include +#include +#include +#include +#include + +#include +#include + +using namespace cldnn; +using namespace ::tests; + +TEST(gather_gpu_fp16, d14_axisB) { + // Indexes : 2x2x1x1 + // Dictionary : 1x4x1x1 + // Axis : 0 + // Output : 1x4x2x1 + // Input values in fp16 + + // Indexes: + // 0.f, 1.f, 1.f, 0.f + // + // Dictionary: + // 1.f, 2.f, 3.f, 4.f + // + // Output: + // 1.f, 2.f, 3.f, 4.f, 3.f, 4.f, 1.f, 2.f + + engine engine; + + auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 2, 1, 1 } }); // Dictionary + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 4, 1, 1 } }); // Indexes + auto axis = cldnn::gather::gather_axis::along_b; + + set_values(input1, { + FLOAT16(1.0f), FLOAT16(2.0f), + FLOAT16(3.0f), FLOAT16(4.0f) + }); + + set_values(input2, { + 0.f, 1.f, + 1.f, 0.f + }); + + topology topology; + topology.add(input_layout("InputDictionary", input1.get_layout())); + topology.add(input_layout("InputText", input2.get_layout())); + topology.add( + gather("gather", "InputDictionary", "InputText", axis, tensor(1, 4, 1, 2)) + ); + + network network(engine, topology); + + network.set_input_data("InputDictionary", input1); + network.set_input_data("InputText", input2); + + auto outputs = network.execute(); + + auto output = outputs.at("gather").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 1.f, 2.f, 3.f, 4.f, 3.f, 4.f, 1.f, 2.f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i])); + } +} + +TEST(gather_gpu_fp16, d222_axisB) { + // Indexes : 3x2x2x1 + // Dictionary : 2x2x1x1 + // Axis : 0 + // Output : 2x2x2x2 + // Input values in fp16 + + // Indexes: + // 0.f, 1.f, 2.f, 1.f + // + // Dictionary: + // 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, + // 7.f, 8.f, 9.f, 10.f, 11.f, 12.f + // + // Output: + // 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 5.f, 6.f, 7.f, 8.f + + engine engine; + + auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 3, 2, 1, 2 } }); // Dictionary + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 1, 1 } }); // Indexes + auto axis = cldnn::gather::gather_axis::along_b; + + set_values(input1, { + FLOAT16(1.f), FLOAT16(2.f), FLOAT16(3.f), + FLOAT16(4.f), FLOAT16(5.f), FLOAT16(6.f), + + FLOAT16(7.f), FLOAT16(8.f), FLOAT16(9.f), + FLOAT16(10.f), FLOAT16(11.f), FLOAT16(12.f) + }); + + set_values(input2, { + 0.f, 1.f, + 
2.f, 1.f + }); + + topology topology; + topology.add(input_layout("InputDictionary", input1.get_layout())); + topology.add(input_layout("InputText", input2.get_layout())); + topology.add( + gather("gather", "InputDictionary", "InputText", axis, tensor(2, 2, 2, 2)) + ); + + network network(engine, topology); + + network.set_input_data("InputDictionary", input1); + network.set_input_data("InputText", input2); + + auto outputs = network.execute(); + + auto output = outputs.at("gather").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 5.f, 6.f, 7.f, 8.f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i])); + } +} + +TEST(gather_gpu_fp16, d22_axisY) { + // Indexes : 2x2x3x1 + // Dictionary : 2x2x1x1 + // Axis : 2 + // Output : 2x2x2x2 + // Input values in fp16 + + // Indexes: + // 0.f, 1.f, 2.f, 1.f + // + // Dictionary: + // 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, + // 7.f, 8.f, 9.f, 10.f, 11.f, 12.f + // + // Output: + // 1.f, 2.f, 3.f, 2.f, 4.f, 5.f, 6.f, 5.f, 7.f, 8.f, 9.f, 8.f, 10.f, 11.f, 12.f, 11.f + + engine engine; + + auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 2, 1, 3 } }); // Dictionary + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 1, 1 } }); // Indexes + auto axis = cldnn::gather::gather_axis::along_y; + + set_values(input1, { + FLOAT16(1.f), FLOAT16(2.f), FLOAT16(3.f), + FLOAT16(4.f), FLOAT16(5.f), FLOAT16(6.f), + + FLOAT16(7.f), FLOAT16(8.f), FLOAT16(9.f), + FLOAT16(10.f), FLOAT16(11.f), FLOAT16(12.f) + }); + + set_values(input2, { + 0.f, 1.f, 2.f, 1.f + }); + + topology topology; + topology.add(input_layout("InputDictionary", input1.get_layout())); + topology.add(input_layout("InputText", input2.get_layout())); + topology.add( + gather("gather", "InputDictionary", "InputText", axis, tensor(2, 2, 2, 2)) + ); + + network network(engine, topology); + + network.set_input_data("InputDictionary", input1); + network.set_input_data("InputText", input2); + + auto outputs = network.execute(); + + auto output = outputs.at("gather").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 1.f, 2.f, 3.f, 2.f, 4.f, 5.f, 6.f, 5.f, 7.f, 8.f, 9.f, 8.f, 10.f, 11.f, 12.f, 11.f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i])); + } +} + +TEST(gather_gpu_fp16, d22_axisF) { + // Indexes : 2x3x2x1 + // Dictionary : 2x2x1x1 + // Axis : 2 + // Output : 2x2x2x2 + // Input values in fp16 + + // Indexes: + // 0.f, 1.f, 2.f, 1.f + // + // Dictionary: + // 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, + // 7.f, 8.f, 9.f, 10.f, 11.f, 12.f + // + // Output: + // 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 3.f, 4.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 9.f, 10.f + + engine engine; + + auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 3, 1, 2 } }); // Dictionary + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 1, 1 } }); // Indexes + auto axis = cldnn::gather::gather_axis::along_f; + + set_values(input1, { + FLOAT16(1.f), FLOAT16(2.f), FLOAT16(3.f), + FLOAT16(4.f), FLOAT16(5.f), FLOAT16(6.f), + + FLOAT16(7.f), FLOAT16(8.f), FLOAT16(9.f), + FLOAT16(10.f), FLOAT16(11.f), FLOAT16(12.f) + }); + + set_values(input2, { + 0.f, 1.f, 2.f, 1.f + }); + + topology topology; + topology.add(input_layout("InputDictionary", input1.get_layout())); + 
topology.add(input_layout("InputText", input2.get_layout())); + topology.add( + gather("gather", "InputDictionary", "InputText", axis, tensor(2, 2, 2, 2)) + ); + + network network(engine, topology); + + network.set_input_data("InputDictionary", input1); + network.set_input_data("InputText", input2); + + auto outputs = network.execute(); + + auto output = outputs.at("gather").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 3.f, 4.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 9.f, 10.f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i])); + } +} + +TEST(gather_gpu_fp32, d14_axisB) { + // Indexes : 2x2x1x1 + // Dictionary : 1x4x1x1 + // Axis : 0 + // Output : 1x4x2x1 + // Input values in fp32 + + // Indexes: + // 0.f, 1.f, 1.f, 0.f + // + // Dictionary: + // 1.f, 2.f, 3.f, 4.f + // + // Output: + // 1.f, 2.f, 3.f, 4.f, 3.f, 4.f, 1.f, 2.f + + engine engine; + + auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 1, 1 } }); // Dictionary + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 4, 1, 1 } }); // Indexes + auto axis = cldnn::gather::gather_axis::along_b; + + set_values(input1, { + 1.0f, 2.0f, + 3.0f, 4.0f + }); + + set_values(input2, { + 0.f, 1.f, + 1.f, 0.f + }); + + topology topology; + topology.add(input_layout("InputDictionary", input1.get_layout())); + topology.add(input_layout("InputText", input2.get_layout())); + topology.add( + gather("gather", "InputDictionary", "InputText", axis, tensor(1, 4, 1, 2)) + ); + + network network(engine, topology); + + network.set_input_data("InputDictionary", input1); + network.set_input_data("InputText", input2); + + auto outputs = network.execute(); + + auto output = outputs.at("gather").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 1.f, 2.f, 3.f, 4.f, 3.f, 4.f, 1.f, 2.f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} + +TEST(gather_gpu_fp32, d222_axisB) { + // Indexes : 3x2x2x1 + // Dictionary : 2x2x1x1 + // Axis : 0 + // Output : 2x2x2x2 + // Input values in fp32 + + // Indexes: + // 0.f, 1.f, 2.f, 1.f + // + // Dictionary: + // 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, + // 7.f, 8.f, 9.f, 10.f, 11.f, 12.f + // + // Output: + // 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 5.f, 6.f, 7.f, 8.f + + engine engine; + + auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 2, 1, 2 } }); // Dictionary + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 1, 1 } }); // Indexes + auto axis = cldnn::gather::gather_axis::along_b; + + set_values(input1, { + 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, + + 7.f, 8.f, 9.f, + 10.f, 11.f, 12.f + }); + + set_values(input2, { + 0.f, 1.f, 2.f, 1.f + }); + + topology topology; + topology.add(input_layout("InputDictionary", input1.get_layout())); + topology.add(input_layout("InputText", input2.get_layout())); + topology.add( + gather("gather", "InputDictionary", "InputText", axis, tensor(2, 2, 2, 2)) + ); + + network network(engine, topology); + + network.set_input_data("InputDictionary", input1); + network.set_input_data("InputText", input2); + + auto outputs = network.execute(); + + auto output = outputs.at("gather").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 
12.f, 5.f, 6.f, 7.f, 8.f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} + +TEST(gather_gpu_fp32, d22_axisY) { + // Indexes : 2x2x3x1 + // Dictionary : 2x2x1x1 + // Axis : 2 + // Output : 2x2x2x2 + // Input values in fp32 + + // Indexes: + // 0.f, 1.f, 2.f, 1.f + // + // Dictionary: + // 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, + // 7.f, 8.f, 9.f, 10.f, 11.f, 12.f + // + // Output: + // 1.f, 2.f, 3.f, 2.f, 4.f, 5.f, 6.f, 5.f, 7.f, 8.f, 9.f, 8.f, 10.f, 11.f, 12.f, 11.f + + engine engine; + + auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 1, 3 } }); // Dictionary + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 1, 1 } }); // Indexes + auto axis = cldnn::gather::gather_axis::along_y; + + set_values(input1, { + 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, + + 7.f, 8.f, 9.f, + 10.f, 11.f, 12.f + }); + + set_values(input2, { + 0.f, 1.f, 2.f, 1.f + }); + + topology topology; + topology.add(input_layout("InputDictionary", input1.get_layout())); + topology.add(input_layout("InputText", input2.get_layout())); + topology.add( + gather("gather", "InputDictionary", "InputText", axis, tensor(2, 2, 2, 2)) + ); + + network network(engine, topology); + + network.set_input_data("InputDictionary", input1); + network.set_input_data("InputText", input2); + + auto outputs = network.execute(); + + auto output = outputs.at("gather").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 1.f, 2.f, 3.f, 2.f, 4.f, 5.f, 6.f, 5.f, 7.f, 8.f, 9.f, 8.f, 10.f, 11.f, 12.f, 11.f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} + +TEST(gather_gpu_fp32, d22_axisF) { + // Indexes : 2x3x2x1 + // Dictionary : 2x2x1x1 + // Axis : 1 + // Output : 2x2x2x2 + // Input values in fp32 + + // Indexes: + // 0.f, 1.f, 2.f, 1.f + // + // Dictionary: + // 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, + // 7.f, 8.f, 9.f, 10.f, 11.f, 12.f + // + // Output: + // 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 3.f, 4.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 9.f, 10.f + + engine engine; + + auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 3, 1, 2 } }); // Dictionary + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 1, 1 } }); // Indexes + auto axis = cldnn::gather::gather_axis::along_f; + + set_values(input1, { + 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, + + 7.f, 8.f, 9.f, + 10.f, 11.f, 12.f + }); + + set_values(input2, { + 0.f, 1.f, 2.f, 1.f + }); + + topology topology; + topology.add(input_layout("InputDictionary", input1.get_layout())); + topology.add(input_layout("InputText", input2.get_layout())); + topology.add( + gather("gather", "InputDictionary", "InputText", axis, tensor(2, 2, 2, 2)) + ); + + network network(engine, topology); + + network.set_input_data("InputDictionary", input1); + network.set_input_data("InputText", input2); + + auto outputs = network.execute(); + + auto output = outputs.at("gather").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 3.f, 4.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 9.f, 10.f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/gemm_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/gemm_gpu_test.cpp index ad39dcc..1a77be6 100644 --- 
a/inference-engine/thirdparty/clDNN/tests/test_cases/gemm_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/gemm_gpu_test.cpp @@ -32,7 +32,7 @@ using namespace cldnn; using namespace ::tests; TEST(gemm_gpu, basic_bfyx_t1) { - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 3, 4 } }); auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 4 } }); @@ -83,7 +83,7 @@ TEST(gemm_gpu, basic_bfyx_t1) { } } TEST(gemm_gpu, basic_bfyx_t2) { - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 3 } }); auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 1 } }); @@ -131,7 +131,7 @@ TEST(gemm_gpu, basic_bfyx_t2) { } TEST(gemm_gpu, basic_bfyx_t1t2) { - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 3, 4 } }); auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 4, 1 } }); @@ -188,7 +188,7 @@ TEST(gemm_gpu, basic_bfyx_t1t2) { } TEST(gemm_gpu, basic_input3) { - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 3, 2 } }); auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 3 } }); auto input3 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); @@ -252,10 +252,10 @@ TEST(gemm_gpu, basic_input3) { } TEST(gemm_gpu, basic_input3_t1t2) { - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 3 } }); auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 3, 2 } }); - auto input3 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 2 } }); + auto input3 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 4 } }); float alpha = 2.f; float beta = 3.f; @@ -272,8 +272,10 @@ TEST(gemm_gpu, basic_input3_t1t2) { }; std::vector input3_data = { - 1.0f, 0.0f, 1.0f, 0.0f, - 2.0f, 2.0f, 1.0f, 1.0f, + 1.0f, 0.0f, + 1.0f, 0.0f, + 2.0f, 2.0f, + 1.0f, 1.0f, }; set_values(input, input_data); @@ -281,8 +283,10 @@ TEST(gemm_gpu, basic_input3_t1t2) { set_values(input3, input3_data); std::vector out_data = { - 15.0f, 12.0f, 27.0f, 24.0f, - 12.0f, 14.0f, 17.0f, 19.0f, + 15.0f, 6.0f, + 15.0f, 8.0f, + 30.0f, 20.0f, + 27.0f, 19.0f }; topology topology; @@ -314,8 +318,217 @@ TEST(gemm_gpu, basic_input3_t1t2) { EXPECT_FLOAT_EQ(output_ptr[i], out_data[i]); } } +TEST(gemm_gpu, basic_input3_1) { + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 3, 4 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 3 } }); + auto input3 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 4 } }); + float alpha = 2.f; + float beta = 3.f; + + std::vector input_data = { + 1.0f, 1.0f, 0.0f, + 2.0f, 0.0f, 0.0f, + 3.0f, 1.0f, 0.0f, + 4.0f, 0.0f, 0.0f + }; + + std::vector input_data2 = { + 3.0f, 2.0f, + 3.0f, 1.0f, + 1.0f, 2.0f, + }; + + std::vector input3_data = { + 1.0f, 0.0f, + 1.0f, 0.0f, + 2.0f, 2.0f, + 1.0f, 1.0f, + }; + + set_values(input, input_data); + set_values(input2, input_data2); + set_values(input3, input3_data); + + std::vector out_data = { + 15.0f, 6.0f, + 15.0f, 8.0f, + 30.0f, 20.0f, + 
27.0f, 19.0f + }; + + topology topology; + topology.add( + input_layout("input", input.get_layout()) + ); + topology.add( + input_layout("input2", input2.get_layout()) + ); + topology.add( + input_layout("input3", input3.get_layout()) + ); + topology.add( + gemm("output", "input", "input2", "input3", false, false, alpha, beta) + + ); + + network network(engine, topology); + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("input3", input3); + auto outputs = network.execute(); + + auto output = outputs.at("output").get_memory(); + auto output_ptr = output.pointer(); + + EXPECT_EQ(output_ptr.size(), (uint32_t)8); + + for (uint32_t i = 0; i < out_data.size(); ++i) { + EXPECT_FLOAT_EQ(output_ptr[i], out_data[i]); + } +} + +TEST(gemm_gpu, basic_input3_t2) { + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 3, 4 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 3, 2 } }); + auto input3 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 4 } }); + float alpha = 2.f; + float beta = 3.f; + + + std::vector input_data = { + 1.0f, 1.0f, 0.0f, + 2.0f, 0.0f, 0.0f, + 3.0f, 1.0f, 0.0f, + 4.0f, 0.0f, 0.0f + }; + + + std::vector input_data2 = { + 3.0f, 3.0f, 1.0f, + 2.0f, 1.0f, 2.0f, + }; + + std::vector input3_data = { + 1.0f, 0.0f, + 1.0f, 0.0f, + 2.0f, 2.0f, + 1.0f, 1.0f, + }; + + set_values(input, input_data); + set_values(input2, input_data2); + set_values(input3, input3_data); + + std::vector out_data = { + 15.0f, 6.0f, + 15.0f, 8.0f, + 30.0f, 20.0f, + 27.0f, 19.0f, + }; + + topology topology; + topology.add( + input_layout("input", input.get_layout()) + ); + topology.add( + input_layout("input2", input2.get_layout()) + ); + topology.add( + input_layout("input3", input3.get_layout()) + ); + topology.add( + gemm("output", "input", "input2", "input3", false, true, alpha, beta) + ); + + network network(engine, topology); + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("input3", input3); + auto outputs = network.execute(); + + auto output = outputs.at("output").get_memory(); + auto output_ptr = output.pointer(); + + EXPECT_EQ(output_ptr.size(), (uint32_t)8); + + for (uint32_t i = 0; i < out_data.size(); ++i) { + EXPECT_FLOAT_EQ(output_ptr[i], out_data[i]); + } +} + +TEST(gemm_gpu, basic_input3_t1) { + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 4, 3 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 3 } }); + auto input3 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 4 } }); + float alpha = 2.f; + float beta = 3.f; + + + std::vector input_data = { + 1.0f, 2.0f, 3.0f, 4.0f, + 1.0f, 0.0f, 1.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f + }; + + std::vector input_data2 = { + 3.0f, 2.0f, + 3.0f, 1.0f, + 1.0f, 2.0f + }; + + std::vector input3_data = { + 1.0f, 0.0f, + 1.0f, 0.0f, + 2.0f, 2.0f, + 1.0f, 1.0f, + }; + + set_values(input, input_data); + set_values(input2, input_data2); + set_values(input3, input3_data); + + std::vector out_data = { + 15.0f, 6.0f, + 15.0f, 8.0f, + 30.0f, 20.0f, + 27.0f, 19.0f, + }; + + topology topology; + topology.add( + input_layout("input", input.get_layout()) + ); + topology.add( + input_layout("input2", input2.get_layout()) + ); + topology.add( + input_layout("input3", input3.get_layout()) + ); + 
topology.add( + gemm("output", "input", "input2", "input3", true, false, alpha, beta) + ); + + network network(engine, topology); + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("input3", input3); + auto outputs = network.execute(); + + auto output = outputs.at("output").get_memory(); + auto output_ptr = output.pointer(); + + EXPECT_EQ(output_ptr.size(), (uint32_t)8); + + for (uint32_t i = 0; i < out_data.size(); ++i) { + EXPECT_FLOAT_EQ(output_ptr[i], out_data[i]); + } +} + TEST(gemm_gpu, basic_bfyx) { - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 4, 3 } }); auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 1, 4 } }); @@ -373,7 +586,7 @@ TEST(gemm_gpu, basic_bfyx) { } TEST(gemm_gpu, basic3_bfyx) { - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 5, 1, 500, 9 } }); auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 5, 1, 1, 500 } }); @@ -2979,7 +3192,7 @@ TEST(gemm_gpu, basic3_bfyx) { } TEST(gemm_gpu, basic_smarcink2) { - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 3, 2 } }); auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 2, 3 } }); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/index_select_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/index_select_gpu_test.cpp index 7a0e389..218cac0 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/index_select_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/index_select_gpu_test.cpp @@ -223,7 +223,7 @@ TEST(index_select_gpu, basic_along_b_3_executes_bfyx) indices: {1, 1, 4, 1} output: {4, 2, 3, 4} */ - engine engine; + const auto& engine = get_test_engine(); constexpr auto in_size_b = 5; constexpr auto in_size_f = 2; constexpr auto in_size_x = 3; @@ -299,7 +299,7 @@ TEST(index_select_gpu, basic_along_f_3_executes_bfyx) indices: {1, 1, 10, 1} output: {2, 10, 3, 3} */ - engine engine; + const auto& engine = get_test_engine(); constexpr auto in_size_b = 2; constexpr auto in_size_f = 5; constexpr auto in_size_x = 3; @@ -375,7 +375,7 @@ TEST(index_select_gpu, basic_along_x_3_executes_bfyx) indices: {1, 1, 3, 1} output: {3, 4, 3, 5} */ - engine engine; + const auto& engine = get_test_engine(); constexpr auto in_size_b = 3; constexpr auto in_size_f = 4; constexpr auto in_size_x = 6; @@ -451,7 +451,7 @@ TEST(index_select_gpu, basic_along_y_3_executes_bfyx) indices: {1, 1, 5, 1} output: {2, 4, 4, 5} */ - engine engine; + const auto& engine = get_test_engine(); constexpr auto in_size_b = 2; constexpr auto in_size_f = 4; constexpr auto in_size_x = 4; @@ -527,7 +527,7 @@ TEST(index_select_gpu, basic_along_b_3_executes_yxfb) indices: {1, 1, 4, 1} output: {4, 2, 3, 4} */ - engine engine; + const auto& engine = get_test_engine(); constexpr auto in_size_b = 5; constexpr auto in_size_f = 2; constexpr auto in_size_x = 3; @@ -604,7 +604,7 @@ TEST(index_select_gpu, basic_along_f_3_executes_yxfb) indices: {1, 1, 10, 1} output: {2, 10, 3, 3} */ - engine engine; + const auto& engine = get_test_engine(); constexpr auto in_size_b = 2; constexpr auto in_size_f = 5; constexpr auto in_size_x = 3; @@ -681,7 +681,7 @@ TEST(index_select_gpu, basic_along_x_3_executes_yxfb) indices: {1, 1, 3, 1} output: {3, 4, 
3, 5} */ - engine engine; + const auto& engine = get_test_engine(); constexpr auto in_size_b = 3; constexpr auto in_size_f = 4; constexpr auto in_size_x = 6; @@ -757,7 +757,7 @@ TEST(index_select_gpu, basic_along_y_3_executes_yxfb) indices: {1, 1, 5, 1} output: {2, 4, 4, 5} */ - engine engine; + const auto& engine = get_test_engine(); constexpr auto in_size_b = 2; constexpr auto in_size_f = 4; constexpr auto in_size_x = 4; @@ -826,3 +826,862 @@ TEST(index_select_gpu, basic_along_y_3_executes_yxfb) } } } + +TEST(index_select_gpu, reverse_along_b_bfyx) +{ + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 4, 2 } }); + + std::vector input_data = { + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + + 8.f, 9.f, 10.f, 11.f, + 12.f, 13.f, 14.f, 15.f, + + + + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f, + + 24.f, 25.f, 26.f, 27.f, + 28.f, 29.f, 30.f, 31.f, + }; + + std::vector out_data = { + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f, + + 24.f, 25.f, 26.f, 27.f, + 28.f, 29.f, 30.f, 31.f, + + + + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + + 8.f, 9.f, 10.f, 11.f, + 12.f, 13.f, 14.f, 15.f, + }; + + constexpr auto axis = index_select_axis_name::along_b; + + topology topo; + topo.add( + input_layout("input", input.get_layout()) + ); + topo.add( + index_select("index_select", "input", axis) + ); + + network net(engine, topo); + + set_values(input, input_data); + net.set_input_data("input", input); + + auto outputs = net.execute(); + auto output_mem = outputs.at("index_select").get_memory(); + auto output_ptr = output_mem.pointer(); + + for (size_t i = 0; i < output_ptr.size(); i++) + { + EXPECT_EQ(output_ptr[i], out_data[i]); + } +} + +TEST(index_select_gpu, reverse_along_f_bfyx) +{ + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 3, 4 } }); + + std::vector input_data = { + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, + + 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f + }; + + std::vector out_data = { + 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f, + + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f + }; + + constexpr auto axis = index_select_axis_name::along_f; + + topology topo; + topo.add( + input_layout("input", input.get_layout()) + ); + topo.add( + index_select("index_select", "input", axis) + ); + + network net(engine, topo); + + set_values(input, input_data); + net.set_input_data("input", input); + + auto outputs = net.execute(); + auto output_mem = outputs.at("index_select").get_memory(); + auto output_ptr = output_mem.pointer(); + + for (size_t i = 0; i < output_ptr.size(); i++) + { + EXPECT_EQ(output_ptr[i], out_data[i]); + } +} + +TEST(index_select_gpu, reverse_along_y_bfyx) +{ + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 4, 3 } }); + + std::vector input_data = { + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, + + 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f + }; + + std::vector out_data = { + 8.f, 9.f, 10.f, 11.f, + 4.f, 5.f, 6.f, 7.f, + 0.f, 1.f, 2.f, 3.f, + + 20.f, 21.f, 22.f, 23.f, + 16.f, 17.f, 18.f, 19.f, + 12.f, 13.f, 14.f, 15.f + }; + + constexpr auto axis = index_select_axis_name::along_y; + + topology topo; + topo.add( + input_layout("input", input.get_layout()) + ); + topo.add( + index_select("index_select", 
"input", axis) + ); + + network net(engine, topo); + + set_values(input, input_data); + net.set_input_data("input", input); + + auto outputs = net.execute(); + auto output_mem = outputs.at("index_select").get_memory(); + auto output_ptr = output_mem.pointer(); + + for (size_t i = 0; i < output_ptr.size(); i++) + { + EXPECT_EQ(output_ptr[i], out_data[i]); + } +} + +TEST(index_select_gpu, reverse_along_x_bfyx) +{ + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 4, 3 } }); + + std::vector input_data = { + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, + + 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f + }; + + std::vector out_data = { + 3.f, 2.f, 1.f, 0.f, + 7.f, 6.f, 5.f, 4.f, + 11.f, 10.f, 9.f, 8.f, + + 15.f, 14.f, 13.f, 12.f, + 19.f, 18.f, 17.f, 16.f, + 23.f, 22.f, 21.f, 20.f + }; + + constexpr auto axis = index_select_axis_name::along_x; + + topology topo; + topo.add( + input_layout("input", input.get_layout()) + ); + topo.add( + index_select("index_select", "input", axis) + ); + + network net(engine, topo); + + set_values(input, input_data); + net.set_input_data("input", input); + + auto outputs = net.execute(); + auto output_mem = outputs.at("index_select").get_memory(); + auto output_ptr = output_mem.pointer(); + + for (size_t i = 0; i < output_ptr.size(); i++) + { + EXPECT_EQ(output_ptr[i], out_data[i]); + } +} + + +TEST(index_select_gpu, reverse_along_y_yxfb) +{ + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 4, 2, 2, 2 } }); + + std::vector input_data = { + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + + 8.f, 9.f, 10.f, 11.f, + 12.f, 13.f, 14.f, 15.f, + + + + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f, + + 24.f, 25.f, 26.f, 27.f, + 28.f, 29.f, 30.f, 31.f, + }; + + std::vector out_data = { + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f, + + 24.f, 25.f, 26.f, 27.f, + 28.f, 29.f, 30.f, 31.f, + + + + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + + 8.f, 9.f, 10.f, 11.f, + 12.f, 13.f, 14.f, 15.f, + }; + + constexpr auto axis = index_select_axis_name::along_y; + + topology topo; + topo.add( + input_layout("input", input.get_layout()) + ); + topo.add( + index_select("index_select", "input", axis) + ); + + network net(engine, topo); + + set_values(input, input_data); + net.set_input_data("input", input); + + auto outputs = net.execute(); + auto output_mem = outputs.at("index_select").get_memory(); + auto output_ptr = output_mem.pointer(); + + for (size_t i = 0; i < output_ptr.size(); i++) + { + EXPECT_EQ(output_ptr[i], out_data[i]); + } +} + +TEST(index_select_gpu, reverse_along_x_yxfb) +{ + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 3, 4, 2, 1 } }); + + std::vector input_data = { + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, + + 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f + }; + + std::vector out_data = { + 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f, + + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f + }; + + constexpr auto axis = index_select_axis_name::along_x; + + topology topo; + topo.add( + input_layout("input", input.get_layout()) + ); + topo.add( + index_select("index_select", "input", axis) + ); + + network net(engine, topo); + + set_values(input, input_data); + net.set_input_data("input", input); + + auto outputs = 
net.execute(); + auto output_mem = outputs.at("index_select").get_memory(); + auto output_ptr = output_mem.pointer(); + + for (size_t i = 0; i < output_ptr.size(); i++) + { + EXPECT_EQ(output_ptr[i], out_data[i]); + } +} + +TEST(index_select_gpu, reverse_along_f_yxfb) +{ + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 4, 3, 2, 1 } }); + + std::vector input_data = { + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, + + 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f + }; + + std::vector out_data = { + 8.f, 9.f, 10.f, 11.f, + 4.f, 5.f, 6.f, 7.f, + 0.f, 1.f, 2.f, 3.f, + + 20.f, 21.f, 22.f, 23.f, + 16.f, 17.f, 18.f, 19.f, + 12.f, 13.f, 14.f, 15.f + }; + + constexpr auto axis = index_select_axis_name::along_f; + + topology topo; + topo.add( + input_layout("input", input.get_layout()) + ); + topo.add( + index_select("index_select", "input", axis) + ); + + network net(engine, topo); + + set_values(input, input_data); + net.set_input_data("input", input); + + auto outputs = net.execute(); + auto output_mem = outputs.at("index_select").get_memory(); + auto output_ptr = output_mem.pointer(); + + for (size_t i = 0; i < output_ptr.size(); i++) + { + EXPECT_EQ(output_ptr[i], out_data[i]); + } +} + +TEST(index_select_gpu, reverse_along_b_yxfb) +{ + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 4, 3, 2, 1 } }); + + std::vector input_data = { + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, + + 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f + }; + + std::vector out_data = { + 3.f, 2.f, 1.f, 0.f, + 7.f, 6.f, 5.f, 4.f, + 11.f, 10.f, 9.f, 8.f, + + 15.f, 14.f, 13.f, 12.f, + 19.f, 18.f, 17.f, 16.f, + 23.f, 22.f, 21.f, 20.f + }; + + constexpr auto axis = index_select_axis_name::along_b; + + topology topo; + topo.add( + input_layout("input", input.get_layout()) + ); + topo.add( + index_select("index_select", "input", axis) + ); + + network net(engine, topo); + + set_values(input, input_data); + net.set_input_data("input", input); + + auto outputs = net.execute(); + auto output_mem = outputs.at("index_select").get_memory(); + auto output_ptr = output_mem.pointer(); + + for (size_t i = 0; i < output_ptr.size(); i++) + { + EXPECT_EQ(output_ptr[i], out_data[i]); + } +} + + +TEST(index_select_gpu, reverse_along_yx_bfyx) +{ + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 4, 3 } }); + + std::vector input_data = { + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, + + 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f + }; + + std::vector out_data = { + 11.f, 10.f, 9.f, 8.f, + 7.f, 6.f, 5.f, 4.f, + 3.f, 2.f, 1.f, 0.f, + + 23.f, 22.f, 21.f, 20.f, + 19.f, 18.f, 17.f, 16.f, + 15.f, 14.f, 13.f, 12.f + }; + + std::vector axis = { index_select_axis_name::along_y, index_select_axis_name::along_x }; + + topology topo; + topo.add( + input_layout("input", input.get_layout()) + ); + topo.add( + index_select("index_select", "input", axis) + ); + + network net(engine, topo); + + set_values(input, input_data); + net.set_input_data("input", input); + + auto outputs = net.execute(); + auto output_mem = outputs.at("index_select").get_memory(); + auto output_ptr = output_mem.pointer(); + + for (size_t i = 0; i < output_ptr.size(); i++) + { + EXPECT_EQ(output_ptr[i], out_data[i]); + } +} + 
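reverse_along_yx above composes two single-axis flips, and the tests that follow extend the same pattern to f and to the full bfyx set: index_select given axes but no index input acts as a pure reversal, mirroring each selected coordinate independently. A reference for the flattened bfyx cases, illustrative rather than the clDNN implementation:

#include <array>
#include <cstddef>
#include <vector>

// Reference multi-axis reversal: every selected axis has its coordinate
// mirrored (i -> dim - 1 - i); the axes compose independently.
std::vector<float> reverse_reference(const std::vector<float>& in,
                                     std::array<std::size_t, 4> dims,  // {b, f, y, x}
                                     std::array<bool, 4> flip) {       // axes to reverse
    std::vector<float> out(in.size());
    for (std::size_t b = 0; b < dims[0]; ++b)
        for (std::size_t f = 0; f < dims[1]; ++f)
            for (std::size_t y = 0; y < dims[2]; ++y)
                for (std::size_t x = 0; x < dims[3]; ++x) {
                    std::array<std::size_t, 4> s = {b, f, y, x};
                    for (int a = 0; a < 4; ++a)
                        if (flip[a]) s[a] = dims[a] - 1 - s[a];
                    out[((b * dims[1] + f) * dims[2] + y) * dims[3] + x] =
                        in[((s[0] * dims[1] + s[1]) * dims[2] + s[2]) * dims[3] + s[3]];
                }
    return out;
}

With dims {1, 2, 3, 4} and flip {false, true, true, true} the first output element comes from in[0][1][2][3] = 23.f, which is exactly how reverse_along_fyx_bfyx's out_data begins.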
+TEST(index_select_gpu, reverse_along_fyx_bfyx) +{ + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 4, 3 } }); + + std::vector input_data = { + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, + + 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f + }; + + std::vector out_data = { + 23.f, 22.f, 21.f, 20.f, + 19.f, 18.f, 17.f, 16.f, + 15.f, 14.f, 13.f, 12.f, + + 11.f, 10.f, 9.f, 8.f, + 7.f, 6.f, 5.f, 4.f, + 3.f, 2.f, 1.f, 0.f + }; + + std::vector axis = { index_select_axis_name::along_f, index_select_axis_name::along_y, index_select_axis_name::along_x }; + + topology topo; + topo.add( + input_layout("input", input.get_layout()) + ); + topo.add( + index_select("index_select", "input", axis) + ); + + network net(engine, topo); + + set_values(input, input_data); + net.set_input_data("input", input); + + auto outputs = net.execute(); + auto output_mem = outputs.at("index_select").get_memory(); + auto output_ptr = output_mem.pointer(); + + for (size_t i = 0; i < output_ptr.size(); i++) + { + EXPECT_EQ(output_ptr[i], out_data[i]); + } +} + +TEST(index_select_gpu, reverse_along_bfyx_bfyx) +{ + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 3, 3, 4, 3 } }); + + std::vector input_data = { + // b0f0 + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, + // f1 + 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f, + // f2 + 24.f, 25.f, 26.f, 27.f, + 28.f, 29.f, 30.f, 31.f, + 32.f, 33.f, 34.f, 35.f, + + // b1f0 + 36.f, 37.f, 38.f, 39.f, + 40.f, 41.f, 42.f, 43.f, + 44.f, 45.f, 46.f, 47.f, + // f1 + 48.f, 49.f, 50.f, 51.f, + 52.f, 53.f, 54.f, 55.f, + 56.f, 57.f, 58.f, 59.f, + // f2 + 60.f, 61.f, 62.f, 63.f, + 64.f, 65.f, 66.f, 67.f, + 68.f, 69.f, 70.f, 71.f, + + // b2f0 + 72.f, 73.f, 74.f, 75.f, + 76.f, 77.f, 78.f, 79.f, + 80.f, 81.f, 82.f, 83.f, + // f1 + 84.f, 85.f, 86.f, 87.f, + 88.f, 89.f, 90.f, 91.f, + 92.f, 93.f, 94.f, 95.f, + // f2 + 96.f, 97.f, 98.f, 99.f, + 100.f, 101.f, 102.f, 103.f, + 104.f, 105.f, 106.f, 107.f + }; + + std::vector out_data = { + 107.f, 106.f, 105.f, 104.f, + 103.f, 102.f, 101.f, 100.f, + 99.f, 98.f, 97.f, 96.f, + + 95.f, 94.f, 93.f, 92.f, + 91.f, 90.f, 89.f, 88.f, + 87.f, 86.f, 85.f, 84.f, + + 83.f, 82.f, 81.f, 80.f, + 79.f, 78.f, 77.f, 76.f, + 75.f, 74.f, 73.f, 72.f, + + + 71.f, 70.f, 69.f, 68.f, + 67.f, 66.f, 65.f, 64.f, + 63.f, 62.f, 61.f, 60.f, + + 59.f, 58.f, 57.f, 56.f, + 55.f, 54.f, 53.f, 52.f, + 51.f, 50.f, 49.f, 48.f, + + 47.f, 46.f, 45.f, 44.f, + 43.f, 42.f, 41.f, 40.f, + 39.f, 38.f, 37.f, 36.f, + + + 35.f, 34.f, 33.f, 32.f, + 31.f, 30.f, 29.f, 28.f, + 27.f, 26.f, 25.f, 24.f, + + 23.f, 22.f, 21.f, 20.f, + 19.f, 18.f, 17.f, 16.f, + 15.f, 14.f, 13.f, 12.f, + + 11.f, 10.f, 9.f, 8.f, + 7.f, 6.f, 5.f, 4.f, + 3.f, 2.f, 1.f, 0.f + }; + + std::vector axis = { index_select_axis_name::along_b, index_select_axis_name::along_f, index_select_axis_name::along_y, index_select_axis_name::along_x }; + + topology topo; + topo.add( + input_layout("input", input.get_layout()) + ); + topo.add( + index_select("index_select", "input", axis) + ); + + network net(engine, topo); + + set_values(input, input_data); + net.set_input_data("input", input); + + auto outputs = net.execute(); + auto output_mem = outputs.at("index_select").get_memory(); + auto output_ptr = output_mem.pointer(); + + for (size_t i = 0; i < output_ptr.size(); i++) + { + EXPECT_EQ(output_ptr[i], 
out_data[i]); + } +} + +TEST(index_select_gpu, reverse_along_bfx_yxfb) +{ + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 4, 3, 3, 3 } }); + + std::vector input_data = { + // y0x0 + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, + // x1 + 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f, + // x2 + 24.f, 25.f, 26.f, 27.f, + 28.f, 29.f, 30.f, 31.f, + 32.f, 33.f, 34.f, 35.f, + + // y1x0 + 36.f, 37.f, 38.f, 39.f, + 40.f, 41.f, 42.f, 43.f, + 44.f, 45.f, 46.f, 47.f, + // x1 + 48.f, 49.f, 50.f, 51.f, + 52.f, 53.f, 54.f, 55.f, + 56.f, 57.f, 58.f, 59.f, + // x2 + 60.f, 61.f, 62.f, 63.f, + 64.f, 65.f, 66.f, 67.f, + 68.f, 69.f, 70.f, 71.f, + + // y2x0 + 72.f, 73.f, 74.f, 75.f, + 76.f, 77.f, 78.f, 79.f, + 80.f, 81.f, 82.f, 83.f, + // x1 + 84.f, 85.f, 86.f, 87.f, + 88.f, 89.f, 90.f, 91.f, + 92.f, 93.f, 94.f, 95.f, + // x2 + 96.f, 97.f, 98.f, 99.f, + 100.f, 101.f, 102.f, 103.f, + 104.f, 105.f, 106.f, 107.f + }; + + std::vector out_data = { + 35.f, 34.f, 33.f, 32.f, + 31.f, 30.f, 29.f, 28.f, + 27.f, 26.f, 25.f, 24.f, + + 23.f, 22.f, 21.f, 20.f, + 19.f, 18.f, 17.f, 16.f, + 15.f, 14.f, 13.f, 12.f, + + 11.f, 10.f, 9.f, 8.f, + 7.f, 6.f, 5.f, 4.f, + 3.f, 2.f, 1.f, 0.f, + + + 71.f, 70.f, 69.f, 68.f, + 67.f, 66.f, 65.f, 64.f, + 63.f, 62.f, 61.f, 60.f, + + 59.f, 58.f, 57.f, 56.f, + 55.f, 54.f, 53.f, 52.f, + 51.f, 50.f, 49.f, 48.f, + + 47.f, 46.f, 45.f, 44.f, + 43.f, 42.f, 41.f, 40.f, + 39.f, 38.f, 37.f, 36.f, + + + 107.f, 106.f, 105.f, 104.f, + 103.f, 102.f, 101.f, 100.f, + 99.f, 98.f, 97.f, 96.f, + + 95.f, 94.f, 93.f, 92.f, + 91.f, 90.f, 89.f, 88.f, + 87.f, 86.f, 85.f, 84.f, + + 83.f, 82.f, 81.f, 80.f, + 79.f, 78.f, 77.f, 76.f, + 75.f, 74.f, 73.f, 72.f + }; + + std::vector axis = { index_select_axis_name::along_f, index_select_axis_name::along_b, index_select_axis_name::along_x }; + + topology topo; + topo.add( + input_layout("input", input.get_layout()) + ); + topo.add( + index_select("index_select", "input", axis) + ); + + network net(engine, topo); + + set_values(input, input_data); + net.set_input_data("input", input); + + auto outputs = net.execute(); + auto output_mem = outputs.at("index_select").get_memory(); + auto output_ptr = output_mem.pointer(); + + for (size_t i = 0; i < output_ptr.size(); i++) + { + EXPECT_EQ(output_ptr[i], out_data[i]); + } +} + +TEST(index_select_gpu, reverse_along_bfyx_yxfb) +{ + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 4, 3, 3, 3 } }); + + std::vector input_data = { + // y0x0 + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, + // x1 + 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f, + // x2 + 24.f, 25.f, 26.f, 27.f, + 28.f, 29.f, 30.f, 31.f, + 32.f, 33.f, 34.f, 35.f, + + // y1x0 + 36.f, 37.f, 38.f, 39.f, + 40.f, 41.f, 42.f, 43.f, + 44.f, 45.f, 46.f, 47.f, + // x1 + 48.f, 49.f, 50.f, 51.f, + 52.f, 53.f, 54.f, 55.f, + 56.f, 57.f, 58.f, 59.f, + // x2 + 60.f, 61.f, 62.f, 63.f, + 64.f, 65.f, 66.f, 67.f, + 68.f, 69.f, 70.f, 71.f, + + // y2x0 + 72.f, 73.f, 74.f, 75.f, + 76.f, 77.f, 78.f, 79.f, + 80.f, 81.f, 82.f, 83.f, + // x1 + 84.f, 85.f, 86.f, 87.f, + 88.f, 89.f, 90.f, 91.f, + 92.f, 93.f, 94.f, 95.f, + // x2 + 96.f, 97.f, 98.f, 99.f, + 100.f, 101.f, 102.f, 103.f, + 104.f, 105.f, 106.f, 107.f + }; + + std::vector out_data = { + 107.f, 106.f, 105.f, 104.f, + 103.f, 102.f, 101.f, 100.f, + 99.f, 98.f, 97.f, 96.f, + + 95.f, 94.f, 93.f, 92.f, + 91.f, 90.f, 
89.f, 88.f, + 87.f, 86.f, 85.f, 84.f, + + 83.f, 82.f, 81.f, 80.f, + 79.f, 78.f, 77.f, 76.f, + 75.f, 74.f, 73.f, 72.f, + + + 71.f, 70.f, 69.f, 68.f, + 67.f, 66.f, 65.f, 64.f, + 63.f, 62.f, 61.f, 60.f, + + 59.f, 58.f, 57.f, 56.f, + 55.f, 54.f, 53.f, 52.f, + 51.f, 50.f, 49.f, 48.f, + + 47.f, 46.f, 45.f, 44.f, + 43.f, 42.f, 41.f, 40.f, + 39.f, 38.f, 37.f, 36.f, + + + 35.f, 34.f, 33.f, 32.f, + 31.f, 30.f, 29.f, 28.f, + 27.f, 26.f, 25.f, 24.f, + + 23.f, 22.f, 21.f, 20.f, + 19.f, 18.f, 17.f, 16.f, + 15.f, 14.f, 13.f, 12.f, + + 11.f, 10.f, 9.f, 8.f, + 7.f, 6.f, 5.f, 4.f, + 3.f, 2.f, 1.f, 0.f + }; + + std::vector axis = { index_select_axis_name::along_b, index_select_axis_name::along_f, index_select_axis_name::along_y, index_select_axis_name::along_x }; + + topology topo; + topo.add( + input_layout("input", input.get_layout()) + ); + topo.add( + index_select("index_select", "input", axis) + ); + + network net(engine, topo); + + set_values(input, input_data); + net.set_input_data("input", input); + + auto outputs = net.execute(); + auto output_mem = outputs.at("index_select").get_memory(); + auto output_ptr = output_mem.pointer(); + + for (size_t i = 0; i < output_ptr.size(); i++) + { + EXPECT_EQ(output_ptr[i], out_data[i]); + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/lookup_table_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/lookup_table_test.cpp index 72dd2fc..45f0408 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/lookup_table_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/lookup_table_test.cpp @@ -32,7 +32,7 @@ using namespace tests; TEST(lookup_table_base, base) { // Input : 2x3x2x2 static const int32_t x_size = 2, y_size = 2, feature_num = 3, batch_num = 2; - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { batch_num, feature_num, x_size , y_size } }); auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, {2, 1, 1, 1} }); @@ -83,7 +83,7 @@ TEST(lookup_table_base, base) { TEST(lookup_table_num, base) { // Input : 2x3x2x2 static const int32_t x_size = 2, y_size = 2, feature_num = 3, batch_num = 2, number_of_values = 3; - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ batch_num, feature_num, x_size , y_size } }); auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 3, 1 } }); @@ -160,7 +160,7 @@ TEST(lookup_table_num, base) { TEST(lookup_table_with_arg_max, base) { // Input : 2x3x2x2 static const int32_t x_size = 2, y_size = 2, feature_num = 3, batch_num = 2; - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ batch_num, feature_num, x_size , y_size } }); topology topology; @@ -207,7 +207,7 @@ TEST(lookup_table_with_arg_max, base) { TEST(lookup_table_axis, base) { // Input : 2x3x2x2 static const int32_t x_size = 2, y_size = 2, feature_num = 3, batch_num = 2, number_of_values = 2; - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ batch_num, feature_num, x_size , y_size } }); auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 3, 2, 2 } }); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/lstm_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/lstm_gpu_test.cpp index 
13c6c93..ba109f0 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/lstm_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/lstm_gpu_test.cpp @@ -21,6 +21,7 @@ #include "api/CPP/lstm.hpp" #include #include +#include #include #include #include @@ -29,10 +30,14 @@ #include "test_utils/test_utils.h" #include #include "instrumentation.h" +#include #include #include +#ifdef WIN32 +#pragma warning(disable: 4503) +#endif using namespace cldnn; using namespace tests; @@ -88,7 +93,7 @@ VVVVF lstm_gemm_reference(VVVVF& input, VVVVF& weights, VVVVF& recur } if (hasHidden) { for (size_t x = 0; x < hidden_size; ++x) { - res += (T)recurrent[0][dir][y][x] * (T)hidden[b][dir][0][x]; + res += (T)recurrent[0][dir][y][x] * (T)hidden[b][0][dir][x]; } } if (hasBias) { @@ -102,7 +107,9 @@ VVVVF lstm_gemm_reference(VVVVF& input, VVVVF& weights, VVVVF& recur template VVVVF lstm_elt_reference(VVVVF& tempGEMM, VVVVF& cell, - bool hasCell = true, float clip_threshold = 0, bool input_forget = false, size_t dir = 0) { + bool hasCell = true, float clip_threshold = 0, + bool input_forget = false, size_t dir = 0) +{ size_t hidden_size = tempGEMM[0][0][0].size() / 4; size_t batch_size = tempGEMM.size(); VVVVF tempOut(batch_size, VVVF(2, VVF(1, VF(hidden_size)))); @@ -113,16 +120,28 @@ VVVVF lstm_elt_reference(VVVVF& tempGEMM, VVVVF& cell, T *ot = &tempGEMM[b][0][0][off.ot]; T *ft = &tempGEMM[b][0][0][off.ft]; T *zt = &tempGEMM[b][0][0][off.zt]; + for (size_t h = 0; h < hidden_size; ++h) { - T val = sigmoid(clip(it[h], clip_threshold)) * std::tanh((float)clip(zt[h], clip_threshold)); + + // Convert all inputs to float for all the elementwise operations. This is done to imitate + // how the lstm kernel performs the elementwise operations. + float fp32_it = (float)it[h]; + float fp32_ot = (float)ot[h]; + float fp32_ft = (float)ft[h]; + float fp32_zt = (float)zt[h]; + float val = sigmoid(clip(fp32_it, clip_threshold)) * std::tanh(clip(fp32_zt, clip_threshold)); + if (input_forget) { - val *= (1 - ft[h]); + val *= (1 - fp32_ft); } if (hasCell) { - val += cell[b][dir][0][h] * sigmoid(clip(ft[h], clip_threshold)); + val += (float)cell[b][0][dir][h] * sigmoid(clip(fp32_ft, clip_threshold)); } - tempOut[b][0][0][h] = std::tanh((float)val) * sigmoid(ot[h]); - tempOut[b][1][0][h] = val; + + // Convert back to output data type before storing it into the output buffer.
Currently, the output + // data type may be float or FLOAT16 (half) + tempOut[b][0][0][h] = (T)(std::tanh(val) * sigmoid(fp32_ot)); + tempOut[b][1][0][h] = (T)val; + } + } + return tempOut; + } @@ -154,10 +173,14 @@ void print(const std::string& s, VVVVF& input) { // tempGEMM = [ batch, 1, 1, 4 * hidden_size ] temporary output // output = [ batch, sequence, direction, hidden_size ] output template -void lstm_reference(VVVVF& input, VVVVF& hidden, VVVVF& cell, VVVVF& weights, VVVVF& recurrent, VVVVF& bias, - VVVVF& output, VVVVF& last_hidden, VVVVF& last_cell, - bool hasBias = true, bool hasInitialHidden = true, bool hasInitialCell = true, - float clip_threshold = 0, bool input_forget = false, bool scramble_input = true) { +void lstm_reference(VVVVF& input, VVVVF& hidden, VVVVF& cell, + VVVVF& weights, VVVVF& recurrent, VVVVF& bias, + VVVVF& output, VVVVF& last_hidden, + VVVVF& last_cell, bool hasBias = true, + bool hasInitialHidden = true, bool hasInitialCell = true, + float clip_threshold = 0, bool input_forget = false, + bool scramble_input = true) +{ size_t sequence_len = input[0].size(); size_t dir_len = weights[0].size(); size_t batch = input.size(); @@ -179,8 +202,8 @@ void lstm_reference(VVVVF& input, VVVVF& hidden, VVVVF& cell, VVVVF& // tempOutput[batch][0] = hidden and tempOutput[batch][1] = cell for (size_t i = 0; i < batch; i++) { output[i][seq][dir] = tempOutput[i][0][0]; - hidden[i][dir] = tempOutput[i][0]; - cell[i][dir] = tempOutput[i][1]; + hidden[i][0][dir] = tempOutput[i][0][0]; + cell[i][0][dir] = tempOutput[i][1][0]; } tempHasInitialHidden = true; tempHasInitialCell = true; @@ -210,12 +233,23 @@ void generic_lstm_gemm_gpu_test(int sequence_len, int direction, int batch_size, VVVVF ref_output = lstm_gemm_reference(ref_input, ref_weights, ref_recurrent, ref_bias, ref_hidden, 0, hasBias, hasHidden); - engine engine; - memory input = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ batch_size, sequence_len, input_size, 1 } }); - memory weights = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ 1, direction, input_size, 4 * hidden_size } }); - memory recurrent = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ 1, direction, hidden_size, 4 * hidden_size } }); - memory biases = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ 1, 1, 4 * hidden_size, direction } }); - memory hidden = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ batch_size, direction, hidden_size, 1 } }); + constexpr auto dt = std::is_same::value ? 
data_types::f32 : data_types::f16; + const auto& engine = get_test_engine(); + + // Skip the fp16 case if the engine does not support fp16, since the buffers below would be allocated with that type + if (!engine.get_info().supports_fp16) + { + if (dt == data_types::f16) + { + return; + } + } + + memory input = memory::allocate(engine, { dt, format::bfyx, { batch_size, sequence_len, input_size, 1 } }); + memory weights = memory::allocate(engine, { dt, format::bfyx, { 1, direction, input_size, 4 * hidden_size } }); + memory recurrent = memory::allocate(engine, { dt, format::bfyx, { 1, direction, hidden_size, 4 * hidden_size } }); + memory biases = memory::allocate(engine, { dt, format::bfyx, { 1, 1, 4 * hidden_size, direction } }); + memory hidden = memory::allocate(engine, { dt, format::bfyx, { batch_size, direction, hidden_size, 1 } }); set_values(input, ref_input_vec); set_values(weights, ref_weights_vec); @@ -250,13 +284,13 @@ void generic_lstm_gemm_gpu_test(int sequence_len, int direction, int batch_size, int i = 0; for (int b = 0; b < batch_size; ++b) { for (int x = 0; x < 4 * hidden_size; ++x) - EXPECT_EQ(ref_output[b][0][0][x], output_ptr[i++]); + EXPECT_FLOAT_EQ(ref_output[b][0][0][x], output_ptr[i++]); } } template void generic_lstm_elt_gpu_test(int sequence_len, int direction, int batch_size, int input_size, int hidden_size, bool hasCell = true, - float clip_threshold = 0.f, bool input_forget = false) { + T clip_threshold = (T)0.f, bool input_forget = false) { // tempGEMM = [ 1, direction, batch, 4 * hidden_size ] input // cell = [ 1, direction, batch, hidden_size ] optional // output = [ 2, direction, batch, hidden_size ] output concat[hidden, cell] @@ -269,9 +303,25 @@ void generic_lstm_elt_gpu_test(int sequence_len, int direction, int batch_size, VVVVF ref_output = lstm_elt_reference(ref_tempGEMM, ref_cell, hasCell, clip_threshold, input_forget); - engine engine; - memory tempGEMM = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ batch_size, direction, 4 * hidden_size, 1 } }); - memory cell = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ batch_size, direction, hidden_size, 1 } }); + // We observe some mismatch in down-converting from fp32 to fp16 + // between the reference implementation and the opencl kernel. This can be + // a simple rounding error. Thus, for fp16 we are increasing our error + // tolerance from 1E-4 to 1E-2 + constexpr float ferror = std::is_same::value ? (float)1E-4 : (float)1E-2; + constexpr auto dt = std::is_same::value ? 
data_types::f32 : data_types::f16; + const auto& engine = get_test_engine(); + + // Skip the fp16 case if the engine does not support fp16, since the buffers below would be allocated with that type + if (!engine.get_info().supports_fp16) + { + if (dt == data_types::f16) + { + return; + } + } + + memory tempGEMM = memory::allocate(engine, { dt, format::bfyx,{ batch_size, direction, 4 * hidden_size, 1 } }); + memory cell = memory::allocate(engine, { dt, format::bfyx,{ batch_size, direction, hidden_size, 1 } }); set_values(tempGEMM, ref_tempGEMM_vec); set_values(cell, ref_cell_vec); @@ -298,7 +348,7 @@ void generic_lstm_elt_gpu_test(int sequence_len, int direction, int batch_size, for (int x = 0; x < hidden_size; ++x) { auto idx = b * 2 * hidden_size + j * hidden_size + x; - EXPECT_NEAR(ref_output[b][j][0][x], output_ptr[idx], FERROR); + ASSERT_NEAR(ref_output[b][j][0][x], output_ptr[idx], ferror); } } } @@ -388,7 +438,7 @@ void generic_lstm_custom_gpu_test(int sequence_len, int direction, int batch_siz lstm_reference(ref_input, ref_hidden, ref_cell, ref_weights, ref_recurrent, ref_bias, ref_output, last_hidden, last_cell, hasBias, hasInitialHidden, hasInitialCell); - engine engine; + const auto& engine = get_test_engine(); memory input = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ batch_size, sequence_len, input_size, 1 } }); memory weights = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ 1, direction, input_size, 4 * hidden_size } }); memory recurrent = memory::allocate(engine, { type_to_data_type::value, format::bfyx,{ 1, direction, hidden_size, 4 * hidden_size } }); @@ -434,7 +484,7 @@ void generic_lstm_custom_gpu_test(int sequence_len, int direction, int batch_siz template void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batch_size, int input_size, int hidden_size, bool hasBias = true, bool hasInitialHidden = true, bool hasInitialCell = true, - float clip_threshold = 0, bool input_forget = false) { + T clip_threshold = 0, bool input_forget = false) { std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size << " Sequence Len = " << sequence_len << " Direction = " << direction << " Batch Size = " << batch_size << std::endl; int min_random = -2, max_random = 2; @@ -452,8 +502,8 @@ void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batc ref_weights.push_back(generate_random_4d(1, direction, 4 * hidden_size, i==0 ? 
input_size : hidden_size, min_random, max_random)); ref_recurrent.push_back(generate_random_4d(1, direction, 4 * hidden_size, hidden_size, min_random, max_random)); ref_bias.push_back(generate_random_4d(1, 1, direction, 4 * hidden_size, min_random, max_random)); - ref_hidden.push_back(generate_random_4d(batch_size, direction, 1, hidden_size, min_random, max_random)); - ref_cell.push_back(generate_random_4d(batch_size, direction, 1, hidden_size, min_random, max_random)); + ref_hidden.push_back(generate_random_4d(batch_size, 1, direction, hidden_size, min_random, max_random)); + ref_cell.push_back(generate_random_4d(batch_size, 1, direction, hidden_size, min_random, max_random)); ref_output.push_back(VVVVF(batch_size, VVVF(sequence_len, VVF(direction, VF(hidden_size))))); } @@ -471,8 +521,8 @@ void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batc ref_cell_vec.push_back(flatten_4d(cldnn::format::bfyx, ref_cell[i])); } - VVVVF last_hidden(batch_size, VVVF(direction, VVF(1, VF(hidden_size)))); - VVVVF last_cell(batch_size, VVVF(direction, VVF(1, VF(hidden_size)))); + VVVVF last_hidden(batch_size, VVVF(1, VVF(direction, VF(hidden_size)))); + VVVVF last_cell(batch_size, VVVF(1, VVF(direction, VF(hidden_size)))); lstm_reference(ref_input, ref_hidden[0], ref_cell[0], ref_weights[0], ref_recurrent[0], ref_bias[0], ref_output[0], last_hidden, last_cell, hasBias, hasInitialHidden, hasInitialCell, @@ -485,9 +535,24 @@ void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batc clip_threshold, input_forget, false); } - engine engine; + // We observe some mismatch in down-converting from fp32 to fp16 + // between the reference implementation and the opencl kernel. This can be + // a simple rounding error. Thus, for fp16 we are increasing our error + // tolerance from 1E-4 to 1E-2 + constexpr float ferror = std::is_same::value ? (float)1E-4 : (float)1E-2; + constexpr auto dt = std::is_same::value ? data_types::f32 : data_types::f16; + const auto& engine = get_test_engine(); - memory input = memory::allocate(engine, { type_to_data_type::value, format::bfyx, {batch_size, sequence_len, input_size, 1} }); + // Skip the fp16 case if the engine does not support fp16, since the buffers below would be allocated with that type + if (!engine.get_info().supports_fp16) + { + if (dt == data_types::f16) + { + return; + } + } + + memory input = memory::allocate(engine, { dt, format::bfyx, {batch_size, sequence_len, input_size, 1} }); set_values(input, ref_input_vec); std::vector weights; @@ -496,20 +561,20 @@ void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batc std::vector hidden; std::vector cell; for(int i = 0; i < layers; ++i) { - weights.push_back(memory::allocate(engine, { type_to_data_type::value, format::bfyx, { 1, direction, i==0 ? 
input_size : hidden_size, 4 * hidden_size } })); set_values(weights[i], ref_weights_vec[i]); - recurrent.push_back(memory::allocate(engine, { type_to_data_type::value, format::bfyx, { 1, direction, hidden_size, 4 * hidden_size } })); + recurrent.push_back(memory::allocate(engine, { dt, format::bfyx, { 1, direction, hidden_size, 4 * hidden_size } })); set_values(recurrent[i], ref_recurrent_vec[i]); if (hasBias) { - biases.push_back(memory::allocate(engine, { type_to_data_type::value, format::bfyx, { 1, 1, 4 * hidden_size, direction } })); + biases.push_back(memory::allocate(engine, { dt, format::bfyx, { 1, 1, 4 * hidden_size, direction } })); set_values(biases[i], ref_bias_vec[i]); } if (hasInitialHidden) { - hidden.push_back(memory::allocate(engine, { type_to_data_type::value, format::bfyx, { batch_size, direction, hidden_size, 1 } })); + hidden.push_back(memory::allocate(engine, { dt, format::bfyx, { batch_size, 1, hidden_size, direction } })); set_values(hidden[i], ref_hidden_vec[i]); } if (hasInitialCell) { - cell.push_back(memory::allocate(engine, { type_to_data_type::value, format::bfyx, { batch_size, direction, hidden_size, 1 } })); + cell.push_back(memory::allocate(engine, { dt, format::bfyx, { batch_size, 1, hidden_size, direction} })); set_values(cell[i], ref_cell_vec[i]); } } @@ -543,12 +608,14 @@ void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batc if (i == 0) { topology.add(lstm(lstm_id, lstm_inputs, weights_id, recurrent_id, hasBias ? biases_id : "", hasInitialHidden ? hidden_id : "", hasInitialCell ? cell_id : "", "", - clip_threshold, input_forget, {}, {}, default_offset_type)); + clip_threshold, input_forget, {}, {}, + cldnn_lstm_output::cldnn_lstm_output_sequence, default_offset_type)); } else { topology.add(lstm(lstm_id, { prev_lstm_id }, weights_id, recurrent_id, hasBias ? biases_id : "", hasInitialHidden ? hidden_id : "", hasInitialCell ? 
cell_id : "", "", - clip_threshold, input_forget, {}, {}, default_offset_type)); + clip_threshold, input_forget, {}, {}, + cldnn_lstm_output::cldnn_lstm_output_sequence, default_offset_type)); } prev_lstm_id = lstm_id; } @@ -567,17 +634,17 @@ void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batc ASSERT_EQ(output_size, size_t(hidden_size * sequence_len * batch_size * direction)); auto output = outputs.begin()->second.get_memory(); - + // Get the output tensor cldnn::layout output_layout = output.get_layout(); - cldnn::tensor output_tensor = output_layout.size; - + cldnn::tensor output_tensor = output_layout.size; + // Compare the output tensor configuration against the reference value // Output tensor is configured in bfyx format ASSERT_EQ(batch_size, output_tensor.batch[0]); ASSERT_EQ(sequence_len, output_tensor.feature[0]); ASSERT_EQ(direction, output_tensor.spatial[1]); - ASSERT_EQ(hidden_size, output_tensor.spatial[0]); + ASSERT_EQ(hidden_size, output_tensor.spatial[0]); auto output_ptr = output.pointer(); int32_t i = 0; @@ -585,7 +652,998 @@ void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batc for (int32_t s = 0; s < sequence_len; ++s) { for (int32_t d = 0; d < direction; ++d) { for (int32_t x = 0; x < hidden_size; ++x) { - ASSERT_NEAR(ref_output[layers-1][b][s][d][x], output_ptr[i++], FERROR); + ASSERT_NEAR(ref_output[layers - 1][b][s][d][x], output_ptr[i++], ferror); + } + } + } + } + } +} + +// ------------------------------------------------------- +template +void lstm_gpu_output_test(const cldnn_lstm_output& output_selection, int directions) { + int layers = 1; + int sequence_len = 4; + int batch_size = 3; + int input_size = 3; + int hidden_size = 4; + + std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size + << " Sequence Len = " << sequence_len << " Directions = " << directions << " Batch Size = " << batch_size + << " Output selection: " << output_selection << std::endl; + int min_random = -2, max_random = 2; + + VVVVF ref_input = generate_random_4d(batch_size, sequence_len, 1, input_size, min_random, max_random); + VVVVF ref_weights = generate_random_4d(1, directions, 4 * hidden_size, input_size, min_random, max_random); + VVVVF ref_recurrent = generate_random_4d(1, directions, 4 * hidden_size, hidden_size, min_random, max_random); + VVVVF ref_bias = generate_random_4d(1, 1, directions, 4 * hidden_size, min_random, max_random); + VVVVF ref_hidden = generate_random_4d(batch_size, 1, directions, hidden_size, min_random, max_random); + VVVVF ref_cell = generate_random_4d(batch_size, 1, directions, hidden_size, min_random, max_random); + VVVVF ref_output = VVVVF(batch_size, VVVF(sequence_len, VVF(directions, VF(hidden_size)))); + + VF ref_input_vec = flatten_4d(cldnn::format::bfyx, ref_input); + VF ref_weights_vec = flatten_4d(cldnn::format::bfyx, ref_weights); + VF ref_recurrent_vec = flatten_4d(cldnn::format::bfyx, ref_recurrent); + VF ref_bias_vec = flatten_4d(cldnn::format::bfyx, ref_bias); + VF ref_hidden_vec = flatten_4d(cldnn::format::bfyx, ref_hidden); + VF ref_cell_vec = flatten_4d(cldnn::format::bfyx, ref_cell); + + VVVVF last_hidden(batch_size, VVVF(1, VVF(directions, VF(hidden_size)))); + VVVVF last_cell(batch_size, VVVF(1, VVF(directions, VF(hidden_size)))); + + lstm_reference(ref_input, ref_hidden, ref_cell, ref_weights, ref_recurrent, ref_bias, ref_output, + last_hidden, last_cell, true, true, true, + (T)0, false, true); + + const auto& engine = 
get_test_engine(); + + memory input = memory::allocate(engine, { type_to_data_type::value, format::bfyx, {batch_size, sequence_len, input_size, 1} }); + memory weights = memory::allocate(engine, { type_to_data_type::value, format::bfyx, { 1, directions, input_size , 4 * hidden_size } }); + memory recurrent = memory::allocate(engine, { type_to_data_type::value, format::bfyx, { 1, directions, hidden_size, 4 * hidden_size } }); + memory biases = memory::allocate(engine, { type_to_data_type::value, format::bfyx, { 1, 1, 4 * hidden_size, directions } }); + memory hidden = memory::allocate(engine, { type_to_data_type::value, format::bfyx, { batch_size, 1, hidden_size, directions } }); + memory cell = memory::allocate(engine, { type_to_data_type::value, format::bfyx, { batch_size, 1, hidden_size, directions } }); + + set_values(input, ref_input_vec); + set_values(weights, ref_weights_vec); + set_values(recurrent, ref_recurrent_vec); + set_values(biases, ref_bias_vec); + set_values(hidden, ref_hidden_vec); + set_values(cell, ref_cell_vec); + + bool emit_last_cell = output_selection == cldnn_lstm_output_hidden_cell || + output_selection == cldnn_lstm_output_sequence_cell; + bool emit_last_hidden = output_selection == cldnn_lstm_output_hidden || + output_selection == cldnn_lstm_output_hidden_cell; + + topology topology; + std::vector> input_ids_offsets; + std::vector lstm_inputs; + std::vector output_ids_offsets; + + topology.add(input_layout("input", input.get_layout())); + for (int i = 0; i < sequence_len; ++i) + { + input_ids_offsets.push_back({get_string_id(i), {0, i, 0, 0}}); + lstm_inputs.push_back("inputSplit:"+get_string_id(i)); + } + topology.add(split("inputSplit", "input", input_ids_offsets)); + topology.add(data("weights", weights)); + topology.add(data("recurrent", recurrent)); + topology.add(data("biases", biases)); + topology.add(input_layout("hidden", hidden.get_layout())); + topology.add(input_layout("cell", cell.get_layout())); + topology.add(lstm("lstm", lstm_inputs, "weights", "recurrent", + "biases", "hidden", "cell", "", 0, false, {}, {}, + output_selection, default_offset_type)); + if (emit_last_cell) + { + int32_t concatenation_len = emit_last_hidden ? 2 : sequence_len + 1; + tensor hidden_tensor {batch_size, concatenation_len - 1, hidden_size, directions}; + tensor cell_tensor {batch_size, 1, hidden_size, directions}; + topology.add(crop(emit_last_hidden ? 
"crop:last_hidden" : "crop:sequence", "lstm", hidden_tensor, tensor {0, 0, 0, 0})); + topology.add(crop("crop:last_cell", "lstm", cell_tensor, tensor {0, concatenation_len - 1, 0, 0})); + } + + network network(engine, topology); + network.set_input_data("input", input); + network.set_input_data("hidden", hidden); + network.set_input_data("cell", cell); + + auto outputs = network.execute(); + uint32_t ref_num_output_primitives = 1; // Output will return atleast 1 primitive + + if (emit_last_cell) { + // add another primitve to account for cell state if the output selection includes cell state + ref_num_output_primitives += 1; + } + + // check if the number of returned primitives match the expected number of output primitives + ASSERT_EQ(ref_num_output_primitives, outputs.size()); + + for (auto itr = outputs.begin(); itr != outputs.end(); itr++) + { + auto output_tensor = itr->second.get_memory().get_layout().size; + primitive_id primitive_name = itr->first; + + cldnn::memory output_memory = itr->second.get_memory(); + int32_t output_size = (int32_t)(itr->second.get_memory().size() / sizeof(T)); + cldnn::tensor ref_output_tensor; + VVVVF ref_primitive_output; + + int32_t ref_batch_size = batch_size; + int32_t ref_hidden_size = hidden_size; + int32_t ref_directions = directions; + + int32_t ref_seq_len = 1; + // Set the reference output against which the primitive's output will be compared + if (primitive_name.find("crop:last_cell") != std::string::npos) + { + ref_primitive_output = last_cell; + } + else if (emit_last_hidden || primitive_name.find("crop:last_hidden") != std::string::npos) + { + ref_primitive_output = last_hidden; + } + else + { + ref_seq_len = sequence_len; + ref_primitive_output = ref_output; + } + + ref_output_tensor = { ref_batch_size, ref_seq_len, ref_hidden_size, ref_directions }; + int32_t ref_output_size = ref_batch_size * ref_seq_len * ref_hidden_size * ref_directions; + + // The number of elements in reference should match the number of elements in the primitive's output + ASSERT_EQ(ref_output_size , output_size); + + // Compare the output tensor configuration against the reference value + // Output tensor is configured in bfyx format + ASSERT_EQ(ref_batch_size, output_tensor.batch[0]); + ASSERT_EQ(ref_seq_len, output_tensor.feature[0]); // Sequence length should match + ASSERT_EQ(ref_directions, output_tensor.spatial[1]); // directions should match + ASSERT_EQ(ref_hidden_size, output_tensor.spatial[0]); // input size should match + + auto output_ptr = output_memory.pointer(); + + int32_t i = 0; + for (int32_t b = 0; b < ref_batch_size; ++b) { + for (int32_t s = 0; s < ref_seq_len; ++s) { + for (int32_t d = 0; d < ref_directions; ++d) { + for (int32_t x = 0; x < ref_hidden_size; ++x) { + ASSERT_NEAR(ref_primitive_output[b][s][d][x], output_ptr[i++], FERROR); + } + } + } + } + } +} + + +// ------------------------------------------------------- +template +void lstm_gpu_format_test(const cldnn::format& format, int directions) { + int layers = 1; + int sequence_len = 6; + int batch_size = 3; + int input_size = 4; + int hidden_size = 5; + + cldnn_lstm_output output_selection = cldnn_lstm_output::cldnn_lstm_output_sequence; + + std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size + << " Sequence Len = " << sequence_len << " Directions = " << directions << " Batch Size = " << batch_size + << " Output selection: " << output_selection << std::endl; + int min_random = -2, max_random = 2; + + VVVVF ref_input = 
generate_random_4d(batch_size, sequence_len, 1, input_size, min_random, max_random); + VVVVF ref_weights = generate_random_4d(1, directions, 4 * hidden_size, input_size, min_random, max_random); + VVVVF ref_recurrent = generate_random_4d(1, directions, 4 * hidden_size, hidden_size, min_random, max_random); + VVVVF ref_bias = generate_random_4d(1, 1, directions, 4 * hidden_size, min_random, max_random); + VVVVF ref_hidden = generate_random_4d(batch_size, 1, directions, hidden_size, min_random, max_random); + VVVVF ref_cell = generate_random_4d(batch_size, 1, directions, hidden_size, min_random, max_random); + VVVVF ref_output = VVVVF(batch_size, VVVF(sequence_len, VVF(directions, VF(hidden_size)))); + + VF ref_input_vec = flatten_4d(format, ref_input); + VF ref_weights_vec = flatten_4d(cldnn::format::bfyx, ref_weights); + VF ref_recurrent_vec = flatten_4d(cldnn::format::bfyx, ref_recurrent); + VF ref_bias_vec = flatten_4d(cldnn::format::bfyx, ref_bias); + VF ref_hidden_vec = flatten_4d(format, ref_hidden); + VF ref_cell_vec = flatten_4d(format, ref_cell); + + VVVVF last_hidden(batch_size, VVVF(1, VVF(directions, VF(hidden_size)))); + VVVVF last_cell(batch_size, VVVF(1, VVF(directions, VF(hidden_size)))); + + lstm_reference(ref_input, ref_hidden, ref_cell, ref_weights, ref_recurrent, ref_bias, ref_output, + last_hidden, last_cell, true, true, true, + (T)0, false, true); + + const auto& engine = get_test_engine(); + + memory input = memory::allocate(engine, { type_to_data_type::value,format, {batch_size, sequence_len, input_size, 1} }); + memory weights = memory::allocate(engine, { type_to_data_type::value, format::bfyx, { 1, directions, input_size , 4 * hidden_size } }); + memory recurrent = memory::allocate(engine, { type_to_data_type::value, format::bfyx, { 1, directions, hidden_size, 4 * hidden_size } }); + memory biases = memory::allocate(engine, { type_to_data_type::value, format::bfyx, { 1, 1, 4 * hidden_size, directions } }); + memory hidden = memory::allocate(engine, { type_to_data_type::value, format, { batch_size, 1, hidden_size, directions } }); + memory cell = memory::allocate(engine, { type_to_data_type::value, format, { batch_size, 1, hidden_size, directions } }); + + set_values(input, ref_input_vec); + set_values(weights, ref_weights_vec); + set_values(recurrent, ref_recurrent_vec); + set_values(biases, ref_bias_vec); + set_values(hidden, ref_hidden_vec); + set_values(cell, ref_cell_vec); + + bool emit_last_cell = output_selection == cldnn_lstm_output_hidden_cell || + output_selection == cldnn_lstm_output_sequence_cell; + bool emit_last_hidden = output_selection == cldnn_lstm_output_hidden || + output_selection == cldnn_lstm_output_hidden_cell; + + topology topology; + std::vector> input_ids_offsets; + std::vector lstm_inputs; + std::vector output_ids_offsets; + + topology.add(input_layout("input", input.get_layout())); + for (int i = 0; i < sequence_len; ++i) + { + input_ids_offsets.push_back({get_string_id(i), {0, i, 0, 0}}); + lstm_inputs.push_back("inputSplit:"+get_string_id(i)); + } + topology.add(split("inputSplit", "input", input_ids_offsets)); + topology.add(data("weights", weights)); + topology.add(data("recurrent", recurrent)); + topology.add(data("biases", biases)); + topology.add(input_layout("hidden", hidden.get_layout())); + topology.add(input_layout("cell", cell.get_layout())); + topology.add(lstm("lstm"+get_string_id(0), lstm_inputs, "weights", "recurrent", + "biases", "hidden", "cell", "", 0, false, {}, {}, + output_selection, default_offset_type)); + + if 
(emit_last_cell) + { + int32_t concatenation_len = emit_last_hidden ? 2 : sequence_len + 1; + tensor hidden_tensor {batch_size, concatenation_len - 1, hidden_size, directions}; + tensor cell_tensor {batch_size, 1, hidden_size, directions}; + topology.add(crop(emit_last_hidden ? "crop:last_hidden" : "crop:sequence", "lstm", hidden_tensor, tensor {0, 0, 0, 0})); + topology.add(crop("crop:last_cell", "lstm", cell_tensor, tensor {0, concatenation_len - 1, 0, 0})); + } + + network network(engine, topology); + std::map outputs; + + network.set_input_data("input", input); + network.set_input_data("hidden", hidden); + network.set_input_data("cell", cell); + outputs = network.execute(); + + uint32_t ref_num_output_primitives = 1; // Output will return at least 1 primitive + + if (emit_last_cell) { + // add another primitive to account for cell state if the output selection includes cell state + ref_num_output_primitives += 1; + } + + // check if the number of returned primitives matches the expected number of output primitives + ASSERT_EQ(ref_num_output_primitives, outputs.size()); + + for (auto itr = outputs.begin(); itr != outputs.end(); itr++) + { + auto output_tensor = itr->second.get_memory().get_layout().size; + primitive_id primitive_name = itr->first; + + cldnn::memory output_memory = itr->second.get_memory(); + int32_t output_size = (int32_t)(itr->second.get_memory().size() / sizeof(T)); + cldnn::tensor ref_output_tensor; + VVVVF ref_primitive_output; + + int32_t ref_batch_size = batch_size; + int32_t ref_hidden_size = hidden_size; + int32_t ref_directions = directions; + + int32_t ref_seq_len = 1; + // Set the reference output against which the primitive's output will be compared + if (primitive_name.find("crop:last_cell") != std::string::npos) + { + ref_primitive_output = last_cell; + } + else if (emit_last_hidden || primitive_name.find("crop:last_hidden") != std::string::npos) + { + ref_primitive_output = last_hidden; + } + else + { + ref_seq_len = sequence_len; + ref_primitive_output = ref_output; + } + + ref_output_tensor = { ref_batch_size, ref_seq_len, ref_hidden_size, ref_directions }; + int32_t ref_output_size = ref_batch_size * ref_seq_len * ref_hidden_size * ref_directions; + + // The number of elements in reference should match the number of elements in the primitive's output + ASSERT_EQ(ref_output_size, output_size); + + // Compare the output tensor configuration against the reference value + // Output tensor is configured in bfyx format + ASSERT_EQ(ref_batch_size, output_tensor.batch[0]); + ASSERT_EQ(ref_seq_len, output_tensor.feature[0]); // Sequence length should match + ASSERT_EQ(ref_directions, output_tensor.spatial[1]); // directions should match + ASSERT_EQ(ref_hidden_size, output_tensor.spatial[0]); // input size should match + + auto output_ptr = output_memory.pointer(); + + int32_t i = 0; + if (format == cldnn::format::bfyx) { + for (int32_t b = 0; b < ref_batch_size; ++b) { + for (int32_t s = 0; s < ref_seq_len; ++s) { + for (int32_t d = 0; d < ref_directions; ++d) { + for (int32_t x = 0; x < ref_hidden_size; ++x) { + ASSERT_NEAR(ref_primitive_output[b][s][d][x], output_ptr[i++], FERROR); + } + } + } + } + } + else if (format == cldnn::format::fyxb) + { + for (int32_t s = 0; s < ref_seq_len; ++s) { + for (int32_t d = 0; d < ref_directions; ++d) { + for (int32_t x = 0; x < ref_hidden_size; ++x) { + for (int32_t b = 0; b < ref_batch_size; ++b) { + ASSERT_NEAR(ref_primitive_output[b][s][d][x], output_ptr[i++], FERROR); + } + } + } + } + } + + } +} + +// 
------------------------------------------------------- +template +void lstm_gpu_users_test() { + int sequence_len = 2; + int batch_size = 1; + int input_size = 1; + int hidden_size = 1; + int directions = 1; + int min_random = -2, max_random = 2; + + // The following test is designed to test the user dependencies of an LSTM node when replaced by subcomponents + // by the graph compiler. + // The output of an LSTM node is set to last_hidden only. Then we concatenate the last_hidden with the initial_hidden tensor: + // (input, weights, recurrent, bias, initial_hidden, initial_cell) -> LSTM -> last_hidden + // concatenation(last_hidden, initial_hidden) + // If the replacement is done correctly then the initial_hidden tensor should match the output of the concatenation + // by an offset along the sequence. + + VVVVF ref_input = generate_random_4d(batch_size, sequence_len, 1, input_size, min_random, max_random); + VVVVF ref_weights = generate_random_4d(1, directions, 4 * hidden_size, input_size, min_random, max_random); + VVVVF ref_recurrent = generate_random_4d(1, directions, 4 * hidden_size, hidden_size, min_random, max_random); + VVVVF ref_bias = generate_random_4d(1, 1, directions, 4 * hidden_size, min_random, max_random); + VVVVF ref_hidden = generate_random_4d(batch_size, 1, directions, hidden_size, min_random, max_random); + VVVVF ref_cell = generate_random_4d(batch_size, 1, directions, hidden_size, min_random, max_random); + VVVVF ref_output = VVVVF(batch_size, VVVF(sequence_len, VVF(directions, VF(hidden_size)))); + + VF ref_input_vec = flatten_4d(format::bfyx, ref_input); + VF ref_weights_vec = flatten_4d(format::bfyx, ref_weights); + VF ref_recurrent_vec = flatten_4d(format::bfyx, ref_recurrent); + VF ref_bias_vec = flatten_4d(format::bfyx, ref_bias); + VF ref_hidden_vec = flatten_4d(format::bfyx, ref_hidden); + VF ref_cell_vec = flatten_4d(format::bfyx, ref_cell); + + VVVVF last_hidden(batch_size, VVVF(1, VVF(directions, VF(hidden_size)))); + VVVVF last_cell(batch_size, VVVF(1, VVF(directions, VF(hidden_size)))); + + const auto& engine = get_test_engine(); + + memory input = memory::allocate(engine, { type_to_data_type::value, format::bfyx, {batch_size, sequence_len, input_size, 1} }); + memory weights = memory::allocate(engine, { type_to_data_type::value, format::bfyx, { 1, directions, input_size, 4 * hidden_size } }); + memory recurrent = memory::allocate(engine, { type_to_data_type::value, format::bfyx, { 1, directions, hidden_size, 4 * hidden_size } }); + memory biases = memory::allocate(engine, { type_to_data_type::value, format::bfyx, { 1, 1, 4 * hidden_size, directions } }); + memory hidden = memory::allocate(engine, { type_to_data_type::value, format::bfyx, { batch_size, 1, hidden_size, directions } }); + memory cell = memory::allocate(engine, { type_to_data_type::value, format::bfyx, { batch_size, 1, hidden_size, directions } }); + + set_values(input, ref_input_vec); + set_values(weights, ref_weights_vec); + set_values(recurrent, ref_recurrent_vec); + set_values(biases, ref_bias_vec); + set_values(hidden, ref_hidden_vec); + set_values(cell, ref_cell_vec); + + topology topology; + std::vector> input_ids_offsets; + std::vector lstm_inputs; + + topology.add(input_layout("input", input.get_layout())); + for (int i = 0; i < sequence_len; ++i) + { + input_ids_offsets.push_back({get_string_id(i), {0, i, 0, 0}}); + lstm_inputs.push_back("inputSplit:"+get_string_id(i)); + } + topology.add(split("inputSplit", "input", input_ids_offsets)); + topology.add(data("weights", weights)); + 
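// Note the split of roles here: weights, recurrent and biases are constant data primitives, + // while hidden and cell are input_layouts that are set per execution. The concatenation added + // below consumes "hidden" directly, which is what exercises the LSTM node's user dependencies + // once the graph compiler replaces the node with its subcomponents. +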
topology.add(data("recurrent", recurrent)); + topology.add(data("biases", biases)); + topology.add(input_layout("hidden", hidden.get_layout())); + topology.add(input_layout("cell", cell.get_layout())); + topology.add(lstm("lstm", lstm_inputs, "weights", "recurrent", + "biases", "hidden", "cell", "", 0, false, {}, {}, + cldnn_lstm_output::cldnn_lstm_output_hidden, default_offset_type)); + std::vector output_ids_offsets {"lstm", "hidden"}; + topology.add(concatenation("concatenation", output_ids_offsets, concatenation::along_f)); + + network network(engine, topology); + std::map outputs; + + network.set_input_data("input", input); + network.set_input_data("hidden", hidden); + network.set_input_data("cell", cell); + outputs = network.execute(); + + // check if the number of returned primitives match the expected number of output primitives + ASSERT_EQ(size_t(1), outputs.size()); + cldnn::memory output_memory = outputs.begin()->second.get_memory(); + auto output_ptr = output_memory.pointer(); + + int32_t i = 0; + for (int32_t b = 0; b < batch_size; ++b) { + for (int32_t s = 0; s < 1; ++s) { + for (int32_t d = 0; d < directions; ++d) { + for (int32_t x = 0; x < hidden_size; ++x) { + int32_t idx = x + hidden_size * (d + directions * ((s+1) + sequence_len * b)); + ASSERT_NEAR(ref_hidden[b][s][d][x], output_ptr[idx], FERROR); + } + } + } + } +} + +// ------------------------------------------------------- +template +void lstm_gpu_concatenated_input_test(int layers, int sequence_len, int direction, + int batch_size, int input_size, int hidden_size, + bool has_bias = true, bool has_initial_hidden = true, + bool has_initial_cell = true, float clip_threshold = 0, + bool input_forget = false) +{ + std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size + << " Sequence Len = " << sequence_len << " Direction = " << direction << " Batch Size = " << batch_size << std::endl; + int min_random = -2, max_random = 2; + + VVVVF ref_input = generate_random_4d(batch_size, sequence_len, 1, input_size, min_random, max_random); + + std::vector> ref_weights; + std::vector> ref_recurrent; + std::vector> ref_bias; + std::vector> ref_hidden; + std::vector> ref_cell; + std::vector> ref_output; + + for (int i = 0; i < layers; ++i) { + ref_weights.push_back(generate_random_4d(1, direction, 4 * hidden_size, i == 0 ? 
input_size : hidden_size, min_random, max_random)); + ref_recurrent.push_back(generate_random_4d(1, direction, 4 * hidden_size, hidden_size, min_random, max_random)); + ref_bias.push_back(generate_random_4d(1, 1, direction, 4 * hidden_size, min_random, max_random)); + ref_hidden.push_back(generate_random_4d(batch_size, 1, direction, hidden_size, min_random, max_random)); + ref_cell.push_back(generate_random_4d(batch_size, 1, direction, hidden_size, min_random, max_random)); + ref_output.push_back(VVVVF(batch_size, VVVF(sequence_len, VVF(direction, VF(hidden_size))))); + } + + VF ref_input_vec = flatten_4d(cldnn::format::bfyx, ref_input); + + std::vector> ref_weights_vec; + std::vector> ref_recurrent_vec; + std::vector> ref_bias_vec; + std::vector> ref_hidden_vec; + std::vector> ref_cell_vec; + for (int i = 0; i < layers; ++i) { + ref_weights_vec.push_back(flatten_4d(cldnn::format::bfyx, ref_weights[i])); + ref_recurrent_vec.push_back(flatten_4d(cldnn::format::bfyx, ref_recurrent[i])); + ref_bias_vec.push_back(flatten_4d(cldnn::format::bfyx, ref_bias[i])); + ref_hidden_vec.push_back(flatten_4d(cldnn::format::bfyx, ref_hidden[i])); + ref_cell_vec.push_back(flatten_4d(cldnn::format::bfyx, ref_cell[i])); + } + + VVVVF last_hidden(batch_size, VVVF(1, VVF(direction, VF(hidden_size)))); + VVVVF last_cell(batch_size, VVVF(1, VVF(direction, VF(hidden_size)))); + + lstm_reference(ref_input, ref_hidden[0], ref_cell[0], ref_weights[0], ref_recurrent[0], ref_bias[0], ref_output[0], + last_hidden, last_cell, has_bias, has_initial_hidden, has_initial_cell, + clip_threshold, input_forget, true); + + for (int i = 1; i < layers; ++i) { + lstm_reference(ref_output[i - 1], ref_hidden[i], ref_cell[i], ref_weights[i], ref_recurrent[i], + ref_bias[i], ref_output[i], + last_hidden, last_cell, has_bias, has_initial_hidden, has_initial_cell, + clip_threshold, input_forget, false); + } + + const auto& engine = get_test_engine(); + + memory input = memory::allocate(engine, { type_to_data_type::value, format::bfyx, {batch_size, sequence_len, input_size, 1} }); + set_values(input, ref_input_vec); + + std::vector weights; + std::vector recurrent; + std::vector biases; + std::vector hidden; + std::vector cell; + for (int i = 0; i < layers; ++i) { + weights.push_back(memory::allocate(engine, { type_to_data_type::value, format::bfyx, { 1, direction, i == 0 ? 
input_size : hidden_size, 4 * hidden_size } })); + set_values(weights[i], ref_weights_vec[i]); + recurrent.push_back(memory::allocate(engine, { type_to_data_type::value, format::bfyx, { 1, direction, hidden_size, 4 * hidden_size } })); + set_values(recurrent[i], ref_recurrent_vec[i]); + if (has_bias) { + biases.push_back(memory::allocate(engine, { type_to_data_type::value, format::bfyx, { 1, 1, 4 * hidden_size, direction } })); + set_values(biases[i], ref_bias_vec[i]); + } + if (has_initial_hidden) { + hidden.push_back(memory::allocate(engine, { type_to_data_type::value, format::bfyx, { batch_size, 1, hidden_size, direction } })); + set_values(hidden[i], ref_hidden_vec[i]); + } + if (has_initial_cell) { + cell.push_back(memory::allocate(engine, { type_to_data_type::value, format::bfyx, { batch_size, 1, hidden_size, direction} })); + set_values(cell[i], ref_cell_vec[i]); + } + } + + topology topology; + std::vector> input_ids_offsets; + std::vector lstm_inputs; + std::vector output_ids_offsets; + + topology.add(input_layout("input", input.get_layout())); + cldnn::primitive_id prev_node_id; + + for (int i = 0; i < layers; ++i) { + std::string sid = get_string_id(i); + std::string lstm_id = "lstm" + sid; + std::string weights_id = "weights" + sid; + std::string recurrent_id = "recurrent" + sid; + std::string biases_id = "biases" + sid; + std::string hidden_id = "hidden" + sid; + std::string cell_id = "cell" + sid; + std::string output_crop_id = "crop:sequence:" + sid; + + topology.add(data(weights_id, weights[i])); + topology.add(data(recurrent_id, recurrent[i])); + if (has_bias) topology.add(data(biases_id, biases[i])); + if (has_initial_hidden) topology.add(input_layout(hidden_id, hidden[i].get_layout())); + if (has_initial_cell) topology.add(input_layout(cell_id, cell[i].get_layout())); + if (i == 0) { + topology.add(lstm(lstm_id, { "input" }, weights_id, recurrent_id, + has_bias ? biases_id : "", has_initial_hidden ? hidden_id : "", has_initial_cell ? cell_id : "", "", + clip_threshold, input_forget, {}, {}, + cldnn_lstm_output::cldnn_lstm_output_sequence_cell, default_offset_type)); + } + else { + topology.add(lstm(lstm_id, { prev_node_id }, weights_id, recurrent_id, + has_bias ? biases_id : "", has_initial_hidden ? hidden_id : "", has_initial_cell ? 
cell_id : "", "", + clip_threshold, input_forget, {}, {}, + cldnn_lstm_output::cldnn_lstm_output_sequence_cell, default_offset_type)); + } + + // Crop out the whole output sequence element + topology.add(crop(output_crop_id, lstm_id, {batch_size, sequence_len, hidden_size, direction}, {0, 0, 0, 0})); + + // Save the node id to provide it as input to the next lstm layer + prev_node_id = output_crop_id; + } + + network network(engine, topology); + network.set_input_data("input", input); + for (int i = 0; i < layers; ++i) { + std::string sid = get_string_id(i); + if (has_initial_hidden) network.set_input_data("hidden" + sid, hidden[i]); + if (has_initial_cell) network.set_input_data("cell" + sid, cell[i]); + } + auto outputs = network.execute(); + { + ASSERT_EQ(outputs.size(), size_t(1)); + size_t output_size = outputs.begin()->second.get_memory().size() / sizeof(T); + ASSERT_EQ(output_size, size_t(hidden_size * sequence_len * batch_size * direction)); + + auto output = outputs.begin()->second.get_memory(); + + // Get the output tensor + cldnn::layout output_layout = output.get_layout(); + cldnn::tensor output_tensor = output_layout.size; + + // Compare the output tensor configuration against the reference value + // Output tensor is configured in bfyx format + ASSERT_EQ(batch_size, output_tensor.batch[0]); + ASSERT_EQ(sequence_len, output_tensor.feature[0]); + ASSERT_EQ(direction, output_tensor.spatial[1]); + ASSERT_EQ(hidden_size, output_tensor.spatial[0]); + + auto output_ptr = output.pointer(); + int32_t i = 0; + for (int32_t b = 0; b < batch_size; ++b) { + for (int32_t s = 0; s < sequence_len; ++s) { + for (int32_t d = 0; d < direction; ++d) { + for (int32_t x = 0; x < hidden_size; ++x) { + ASSERT_NEAR(ref_output[layers - 1][b][s][d][x], output_ptr[i++], FERROR); + } + } + } + } + } +} + +// This test checks chained and stacked LSTM topology. The configuration allows to create +// LSTM topology with multiple layers and can also be chained together. +template +void lstm_gpu_chain_test(int batch_size, int input_size, int hidden_size, + int directions, size_t layers, size_t chains, int sequence_len, + const cldnn_lstm_output& output_selection) +{ + int min_random = -2, max_random = 2; + bool has_bias = false; + bool has_initial_hidden = false; + bool has_initial_cell = false; + float clip_threshold = 0; + bool input_forget = false; + + std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size + << " Sequence Len = " << sequence_len << " Directions = " << directions << " Batch Size = " << batch_size + << " Output selection: " << output_selection << std::endl; + + VVVVF ref_input = generate_random_4d(batch_size, sequence_len, 1, input_size, min_random, max_random); + std::vector>> ref_weights; + std::vector>> ref_recurrent; + std::vector>> ref_bias; + std::vector>> ref_hidden; + std::vector>> ref_cell; + std::vector>> ref_output; + + // Create the 4 dimensional weight, bias, hidden, cell state and output vectors + for (size_t chain = 0; chain < chains; chain++) { + + std::vector> per_chain_ref_weights; + std::vector> per_chain_ref_recurrent; + std::vector> per_chain_ref_bias; + std::vector> per_chain_ref_hidden; + std::vector> per_chain_ref_cell; + std::vector> per_chain_ref_output; + + for (size_t layer = 0; layer < layers; layer++) { + per_chain_ref_weights.push_back(generate_random_4d(1, directions, 4 * hidden_size, (layer == 0) ? 
input_size : hidden_size, min_random, max_random)); + per_chain_ref_recurrent.push_back(generate_random_4d(1, directions, 4 * hidden_size, hidden_size, min_random, max_random)); + per_chain_ref_bias.push_back(generate_random_4d(1, 1, directions, 4 * hidden_size, min_random, max_random)); + per_chain_ref_hidden.push_back(generate_random_4d(batch_size, 1, directions, hidden_size, min_random, max_random)); + per_chain_ref_cell.push_back(generate_random_4d(batch_size, 1, directions, hidden_size, min_random, max_random)); + per_chain_ref_output.push_back(VVVVF(batch_size, VVVF(sequence_len, VVF(directions, VF(hidden_size))))); + } + + ref_weights.push_back(per_chain_ref_weights); + ref_recurrent.push_back(per_chain_ref_recurrent); + ref_bias.push_back(per_chain_ref_bias); + ref_hidden.push_back(per_chain_ref_hidden); + ref_cell.push_back(per_chain_ref_cell); + ref_output.push_back(per_chain_ref_output); + } + + VF ref_input_vec; + std::vector>> ref_weights_vec; + std::vector>> ref_recurrent_vec; + std::vector>> ref_bias_vec; + std::vector>> ref_hidden_vec; + std::vector>> ref_cell_vec; + std::vector>> ref_output_vec; + + ref_input_vec = flatten_4d(cldnn::format::bfyx, ref_input); + + // flatten all the 4 dimensional vectors across chains and layers + for (size_t chain = 0; chain < chains; chain++) { + + std::vector> per_chain_ref_weights; + std::vector> per_chain_ref_recurrent; + std::vector> per_chain_ref_bias; + std::vector> per_chain_ref_hidden; + std::vector> per_chain_ref_cell; + std::vector> per_chain_ref_output; + + for (size_t layer = 0; layer < layers; layer++) { + per_chain_ref_weights.push_back(flatten_4d(cldnn::format::bfyx, ref_weights[chain][layer])); + per_chain_ref_recurrent.push_back(flatten_4d(cldnn::format::bfyx, ref_recurrent[chain][layer])); + per_chain_ref_bias.push_back(flatten_4d(cldnn::format::bfyx, ref_bias[chain][layer])); + per_chain_ref_hidden.push_back(flatten_4d(cldnn::format::bfyx, ref_hidden[chain][layer])); + per_chain_ref_cell.push_back(flatten_4d(cldnn::format::bfyx, ref_cell[chain][layer])); + per_chain_ref_output.push_back(flatten_4d(cldnn::format::bfyx, ref_output[chain][layer])); + } + + ref_weights_vec.push_back(per_chain_ref_weights); + ref_recurrent_vec.push_back(per_chain_ref_recurrent); + ref_bias_vec.push_back(per_chain_ref_bias); + ref_hidden_vec.push_back(per_chain_ref_hidden); + ref_cell_vec.push_back(per_chain_ref_cell); + ref_output_vec.push_back(per_chain_ref_output); + } + + std::vector>> last_hidden(chains, std::vector >(layers, VVVVF(batch_size, VVVF(1, VVF(directions, VF(hidden_size)))))); + std::vector>> last_cell(chains, std::vector >(layers, VVVVF(batch_size, VVVF(1, VVF(directions, VF(hidden_size)))))); + + for (size_t chain = 0; chain < chains; chain++) { + lstm_reference(ref_input, ref_hidden[chain][0], ref_cell[chain][0], ref_weights[chain][0], + ref_recurrent[chain][0], ref_bias[chain][0], ref_output[chain][0], + last_hidden[chain][0], last_cell[chain][0], has_bias, + chain == 0 ? has_initial_hidden : true, + chain == 0 ? 
has_initial_cell : true, + clip_threshold, input_forget, true); + + if (chain < chains - 1) + { + ref_hidden[chain + 1][0] = last_hidden[chain][0]; + ref_cell[chain + 1][0] = last_cell[chain][0]; + } + } + + for (size_t layer = 1; layer < layers; ++layer) { + for (size_t chain = 0; chain < chains; chain++) { + lstm_reference(ref_output[chain][layer - 1], ref_hidden[chain][layer], ref_cell[chain][layer], + ref_weights[chain][layer], ref_recurrent[chain][layer], ref_bias[chain][layer], + ref_output[chain][layer], last_hidden[chain][layer], last_cell[chain][layer], has_bias, + chain == 0 ? has_initial_hidden : true, + chain == 0 ? has_initial_cell : true, + clip_threshold, input_forget, + false); + + if (chain < chains - 1) + { + ref_hidden[chain + 1][layer] = last_hidden[chain][layer]; + ref_cell[chain + 1][layer] = last_cell[chain][layer]; + } + } + } + + const auto& engine = get_test_engine(); + tensor input_tensor = { batch_size, sequence_len, input_size, 1 }; + layout layout = { type_to_data_type::value, cldnn::format::bfyx, input_tensor }; + + memory input = memory::allocate(engine, layout); + set_values(input, ref_input_vec); + + // 2-dim vectors to support chain and layers + std::vector> weights; + std::vector> recurrent; + std::vector> biases; + std::vector> hidden; + std::vector> cell; + + for (size_t chain = 0; chain < chains; chain++) { + std::vector per_chain_weights; + std::vector per_chain_recurrent; + std::vector per_chain_biases; + std::vector per_chain_hidden; + std::vector per_chain_cell; + + for (size_t layer = 0; layer < layers; layer++) { + per_chain_weights.push_back(memory::allocate(engine, { type_to_data_type::value, format::bfyx, {1, directions, layer == 0 ? input_size : hidden_size, 4 * hidden_size} })); + set_values(per_chain_weights[layer], ref_weights_vec[chain][layer]); + + per_chain_recurrent.push_back(memory::allocate(engine, { type_to_data_type::value, format::bfyx, {1, directions, hidden_size, 4 * hidden_size} })); + set_values(per_chain_recurrent[layer], ref_recurrent_vec[chain][layer]); + + if (has_bias) + { + per_chain_biases.push_back(memory::allocate(engine, { type_to_data_type::value, format::bfyx, {1, 1, 4 * hidden_size, directions} })); + set_values(per_chain_biases[layer], ref_bias_vec[chain][layer]); + } + + if (has_initial_hidden) + { + per_chain_hidden.push_back(memory::allocate(engine, { type_to_data_type::value, format::bfyx, {1, 1, hidden_size, directions} })); + set_values(per_chain_hidden[layer], ref_hidden_vec[chain][layer]); + } + + if (has_initial_cell) + { + per_chain_cell.push_back(memory::allocate(engine, { type_to_data_type::value, format::bfyx, {1, 1, hidden_size, directions} })); + set_values(per_chain_cell[layer], ref_cell_vec[chain][layer]); + } + } + + weights.push_back(per_chain_weights); + recurrent.push_back(per_chain_recurrent); + biases.push_back(per_chain_biases); + hidden.push_back(per_chain_hidden); + cell.push_back(per_chain_cell); + } + + // Start creating the topology + cldnn::topology topology; + std::vector> input_ids_offsets; + std::vector lstm_inputs; + std::vector output_ids_offsets; + + topology.add(input_layout("input", input.get_layout())); + + for (int feature = 0; feature < sequence_len; feature++) { + input_ids_offsets.push_back({ get_string_id(feature), {0, feature, 0, 0} }); + lstm_inputs.push_back("inputSplit:" + get_string_id(feature)); + } + topology.add(split("inputSplit", "input", input_ids_offsets)); + + bool emit_last_hidden = output_selection == cldnn_lstm_output_hidden + || output_selection == 
cldnn_lstm_output_hidden_cell; + + std::vector output_sequence_ids; + std::vector last_hidden_ids; + std::vector last_cell_ids; + + for (size_t chain = 0; chain < chains; chain++) { + + // Add all the primitives to the network + std::vector prev_output_sequence_ids(output_sequence_ids); + std::vector prev_last_hidden_ids(last_hidden_ids); + std::vector prev_last_cell_ids(last_cell_ids); + + // Erase all the temporary primitive id containers + output_sequence_ids.clear(); + last_cell_ids.clear(); + last_hidden_ids.clear(); + + for (size_t layer = 0; layer < layers; layer++) { + std::string chain_id = get_string_id(chain); + std::string layer_id = get_string_id(layer); + std::string lstm_id = "lstm:" + chain_id + ":" + layer_id; + std::string weights_id = "weights:" + chain_id + ":" + layer_id; + std::string recurrent_id = "recurrent:" + chain_id + ":" + layer_id; + std::string biases_id = "biases:" + chain_id + ":" + layer_id; + std::string hidden_id = "hidden:" + chain_id + ":" + layer_id; + std::string cell_id = "cell:" + chain_id + ":" + layer_id; + std::string crop_seq_id = "crop:sequence:" + chain_id + ":" + layer_id; + std::string crop_last_cell_id = "crop:last_cell:" + chain_id + ":" + layer_id; + std::string crop_last_hidden_id = "crop:last_hidden:" + chain_id + ":" + layer_id; + + primitive_id initial_hidden_id; + primitive_id initial_cell_id; + cldnn_lstm_output output_selection_per_layer; + + topology.add(data(weights_id, weights[chain][layer])); + topology.add(data(recurrent_id, recurrent[chain][layer])); + if (has_bias) topology.add(data(biases_id, biases[chain][layer])); + + if (chain == 0 && layer == 0) + { + if (has_initial_hidden) topology.add(input_layout(hidden_id, hidden[chain][layer].get_layout())); + if (has_initial_cell) topology.add(input_layout(cell_id, cell[chain][layer].get_layout())); + } + + // Get the initial hidden and initial cell for each layer for each chain link + if (chain == 0) + { + initial_hidden_id = has_initial_hidden ? hidden_id : ""; + initial_cell_id = has_initial_cell ? cell_id : ""; + } + else + { + initial_hidden_id = prev_last_hidden_ids[layer]; + initial_cell_id = prev_last_cell_ids[layer]; + } + + // Output selection for all the layers except the last layer has to have the sequence, + // last hidden and last cell + if (layer < layers - 1) + { + output_selection_per_layer = cldnn_lstm_output::cldnn_lstm_output_sequence_cell; + } + else + { + // For the last layer, use the output selection provided by the user + output_selection_per_layer = output_selection; + } + + if (layer == 0) + { + topology.add(lstm(lstm_id, lstm_inputs, weights_id, recurrent_id, + has_bias ? biases_id : "", + initial_hidden_id, initial_cell_id, + "", clip_threshold, input_forget, {}, {}, + output_selection_per_layer, default_offset_type)); + } + else + { + topology.add(lstm(lstm_id, { output_sequence_ids[layer - 1] }, weights_id, recurrent_id, + has_bias ? biases_id : "", + initial_hidden_id, initial_cell_id, + "", clip_threshold, input_forget, {}, {}, + output_selection_per_layer, default_offset_type)); + } + + tensor sequence_tensor{ batch_size, sequence_len, hidden_size, directions }; + tensor cell_tensor{ batch_size, 1, hidden_size, directions }; + tensor last_hidden_tensor{ batch_size, 1, hidden_size, directions }; + + // For all the layers except the last layer, we need to crop output sequence, + // last hidden and last cell. 
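+ // (With cldnn_lstm_output_sequence_cell the LSTM node's output holds the hidden sequence at feature offsets 0..sequence_len-1 and the last cell state at feature offset sequence_len, which is what the crop offsets below rely on.)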
+ // The output sequence goes into the next layer of lstm in a chain link + // The last cell state and last hidden go to the lstm node in the same layer + // next in chain + topology.add(crop(crop_seq_id, lstm_id, sequence_tensor, tensor{ 0, 0, 0, 0 })); // Add crop to get the sequence + topology.add(crop(crop_last_hidden_id, lstm_id, last_hidden_tensor, tensor{ 0, sequence_len - 1, 0, 0 })); // Add crop to get the last hidden element + topology.add(crop(crop_last_cell_id, lstm_id, cell_tensor, tensor{ 0, sequence_len, 0, 0 })); // Add crop to get the last cell element + + // Keep a copy of the sequence, last hidden and last cell primitive id for each layer + output_sequence_ids.push_back(crop_seq_id); + last_hidden_ids.push_back(crop_last_hidden_id); + last_cell_ids.push_back(crop_last_cell_id); + } + } + + // Create the network from the topology designed above + cldnn::network network(engine, topology); + network.set_input_data("input", input); + for (size_t layer = 0; layer < layers; layer++) { + std::string sid = get_string_id(layer); + if (has_initial_hidden) network.set_input_data("hidden:000:" + sid, hidden[0][layer]); // 0 is the chain link index + if (has_initial_cell) network.set_input_data("cell:000:" + sid, cell[0][layer]); // 0 is the chain link index + } + + auto outputs = network.execute(); + for (auto itr = outputs.begin(); itr != outputs.end(); itr++) + { + auto output_tensor = itr->second.get_memory().get_layout().size; + primitive_id primitive_name = itr->first; + + // Split the primitive id to get the chain id + // E.g. primitive id: crop:last_cell:XXX:YYY + // XXX is the chain id + // YYY is the layer id + std::string chain_str = primitive_name.substr(primitive_name.find(":", primitive_name.find(":") + 1) + 1, 5); + std::string layer_str = primitive_name.substr(primitive_name.find(":", primitive_name.find(":", primitive_name.find(":") + 1) + 1) + 1, 5); + size_t chain_id = stoi(chain_str); + size_t layer_id = stoi(layer_str); + + cldnn::memory output_memory = itr->second.get_memory(); + int32_t output_size = (int32_t)(itr->second.get_memory().size() / sizeof(T)); + cldnn::tensor ref_output_tensor; + VVVVF ref_primitive_output; + + int32_t ref_batch_size = batch_size; + int32_t ref_hidden_size = hidden_size; + int32_t ref_directions = directions; + + int32_t ref_seq_len = 1; + + // Set the reference output against which the primitive's output will be compared + if (primitive_name.find("crop:last_cell") != std::string::npos) + { + ref_primitive_output = last_cell[chain_id][layer_id]; + } + else if (emit_last_hidden || primitive_name.find("crop:last_hidden") != std::string::npos) + { + ref_primitive_output = last_hidden[chain_id][layer_id]; + } + else + { + ref_seq_len = sequence_len; + ref_primitive_output = ref_output[chain_id][layers - 1]; + } + + ref_output_tensor = { ref_batch_size, ref_seq_len, ref_hidden_size, ref_directions }; + int32_t ref_output_size = ref_batch_size * ref_seq_len * ref_hidden_size * ref_directions; + + // The number of elements in reference should match the number of elements in the primitive's output + ASSERT_EQ(ref_output_size, output_size); + + // Compare the output tensor configuration against the reference value + // Output tensor is configured in bfyx format + ASSERT_EQ(ref_batch_size, output_tensor.batch[0]); + ASSERT_EQ(ref_seq_len, output_tensor.feature[0]); // Sequence length should match + ASSERT_EQ(ref_directions, output_tensor.spatial[1]); // Directions should match + ASSERT_EQ(ref_hidden_size, output_tensor.spatial[0]); // hidden 
size should match + + auto output_ptr = output_memory.pointer(); + + int32_t i = 0; + for (int32_t b = 0; b < ref_batch_size; ++b) { + for (int32_t s = 0; s < ref_seq_len; ++s) { + for (int32_t d = 0; d < ref_directions; ++d) { + for (int32_t x = 0; x < ref_hidden_size; ++x) { + ASSERT_NEAR(ref_primitive_output[b][s][d][x], output_ptr[i++], FERROR); } } } @@ -593,6 +1651,7 @@ void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batc } } + TEST(lstm_gemm_gpu, generic_lstm_gemm_test_f32) { generic_lstm_gemm_gpu_test(1, 1, 3, 6, 2, true, true); } @@ -609,6 +1668,24 @@ TEST(lstm_gemm_gpu, generic_lstm_gemm_no_hidden_bias_f32) { generic_lstm_gemm_gpu_test(1, 1, 3, 6, 2, false, false); } +// LSTM GEMM tests to test LSTM GEMMV kernel implementation +TEST(lstm_gemm_gpu, gemv_bfyx_1x64_lstm_gemm_test_f32) { + generic_lstm_gemm_gpu_test(5, 1, 1, 1024, 1024, true, true); +} + +TEST(lstm_gemm_gpu, gemv_bfyx_1x64_lstm_gemm_no_bias_f32) { + generic_lstm_gemm_gpu_test(1, 1, 1, 256, 2, false, true); +} + +TEST(lstm_gemm_gpu, gemv_bfyx_1x64_lstm_gemm_no_hidden_f32) { + generic_lstm_gemm_gpu_test(1, 1, 1, 64, 2, true, false); +} + +TEST(lstm_gemm_gpu, gemv_bfyx_1x64_lstm_gemm_no_hidden_bias_f32) { + generic_lstm_gemm_gpu_test(1, 1, 1, 64, 2, false, false); +} + +// LSTM ELT Tests TEST(lstm_elt_gpu, generic_lstm_elt_test_clip_f32) { generic_lstm_elt_gpu_test(1, 1, 4, 6, 3, true, 0.3f); } @@ -751,9 +1828,234 @@ TEST(lstm_gpu, generic_lstm_stacked_seq_bi_f32) { generic_lstm_gpu_test(4, 7, 2, 3, 3, 2, true, true, true); } +// optional outputs support +TEST(lstm_gpu, output_test_sequence_f32) { + lstm_gpu_output_test(cldnn_lstm_output::cldnn_lstm_output_sequence, 1); +} + +TEST(lstm_gpu, output_test_hidden_f32) { + lstm_gpu_output_test(cldnn_lstm_output::cldnn_lstm_output_hidden, 1); +} + +TEST(lstm_gpu, output_test_hidden_cell_f32) { + lstm_gpu_output_test(cldnn_lstm_output::cldnn_lstm_output_hidden_cell, 1); +} + +TEST(lstm_gpu, output_test_sequence_cell_f32) { + lstm_gpu_output_test(cldnn_lstm_output::cldnn_lstm_output_sequence_cell, 1); +} + +TEST(lstm_gpu, output_test_sequence_bi_f32) { + lstm_gpu_output_test(cldnn_lstm_output::cldnn_lstm_output_sequence, 2); +} + +TEST(lstm_gpu, output_test_hidden_bi_f32) { + lstm_gpu_output_test(cldnn_lstm_output::cldnn_lstm_output_hidden, 2); +} + +TEST(lstm_gpu, output_test_hidden_cell_bi_f32) { + lstm_gpu_output_test(cldnn_lstm_output::cldnn_lstm_output_hidden_cell, 2); +} + +TEST(lstm_gpu, output_test_sequence_cell_bi_f32) { + lstm_gpu_output_test(cldnn_lstm_output::cldnn_lstm_output_sequence_cell, 2); +} + +// format tests +TEST(lstm_gpu, lstm_gpu_format_bfyx_f32) { + lstm_gpu_format_test(cldnn::format::bfyx, 1); +} + +TEST(lstm_gpu, lstm_gpu_format_bfyx_bi_f32) { + lstm_gpu_format_test(cldnn::format::bfyx, 2); +} + +TEST(lstm_gpu, lstm_gpu_format_fyxb_f32) { + lstm_gpu_format_test(cldnn::format::fyxb, 1); +} + +TEST(lstm_gpu, lstm_gpu_format_fyxb_bi_f32) { + lstm_gpu_format_test(cldnn::format::fyxb, 2); +} + +// test for LSTM users' dependencies +TEST(lstm_gpu, lstm_users_f32) { + lstm_gpu_users_test(); +} + +// Test for LSTM with concatenated input +TEST(lstm_gpu, generic_lstm_concatenated_input) { + lstm_gpu_concatenated_input_test(1, 2, 2, 1, 1, 1, true, true, true); +} + +TEST(lstm_gpu, generic_lstm_concatenated_input_multi_layer) { + lstm_gpu_concatenated_input_test(5, 5, 2, 1, 1, 4, true, true, true); +} + +// test for LSTM with chain and stack (multilayer) +TEST(lstm_gpu, generic_lstm_chained_unidirectional_f32) { + // batch size = 1 + 
// input size = 2 + // hidden size = 4 + // directions = 1 + // layers = 1 + // chains = 2 + // sequence length = 1 + // output selection = output sequence and cell + lstm_gpu_chain_test(1, 2, 4, 1, 1, 2, 1, cldnn_lstm_output::cldnn_lstm_output_sequence_cell); +} + +TEST(lstm_gpu, generic_lstm_chained_bidirectional_f32) { + // batch size = 1 + // input size = 2 + // hidden size = 4 + // directions = 2 + // layers = 1 + // chains = 1 + // sequence length = 1 + // output selection = output sequence and cell + lstm_gpu_chain_test(1, 2, 4, 2, 1, 1, 1, cldnn_lstm_output::cldnn_lstm_output_sequence_cell); +} + +TEST(lstm_gpu, generic_lstm_chained_no_stack_bidirectional_f32) { + // batch size = 2 + // input size = 2 + // hidden size = 4 + // directions = 2 + // layers = 1 + // chains = 2 + // sequence length = 5 + // output selection = output sequence and cell + lstm_gpu_chain_test(2, 2, 4, 2, 1, 2, 5, cldnn_lstm_output::cldnn_lstm_output_sequence_cell); +} + +TEST(lstm_gpu, generic_lstm_chained_stacked_bidirectional_f32) { + // batch size = 2 + // input size = 2 + // hidden size = 4 + // directions = 2 + // layers = 4 + // chains = 2 + // sequence length = 5 + // output selection = output sequence and cell + lstm_gpu_chain_test(2, 2, 4, 2, 4, 2, 5, cldnn_lstm_output::cldnn_lstm_output_sequence_cell); +} + +// FP16 half-precision tests +TEST(lstm_gemm_gpu, generic_lstm_gemm_test_f16) { + generic_lstm_gemm_gpu_test(1, 1, 3, 6, 2, true, true); +} + +TEST(lstm_gemm_gpu, generic_lstm_gemm_no_bias_f16) { + generic_lstm_gemm_gpu_test(1, 1, 3, 6, 2, false, true); +} + +TEST(lstm_gemm_gpu, generic_lstm_gemm_no_hidden_f16) { + generic_lstm_gemm_gpu_test(1, 1, 3, 6, 2, true, false); +} + +TEST(lstm_gemm_gpu, generic_lstm_gemm_no_hidden_bias_f16) { + generic_lstm_gemm_gpu_test(1, 1, 3, 6, 2, false, false); +} + +TEST(lstm_elt_gpu, generic_lstm_elt_test_clip_f16) { + generic_lstm_elt_gpu_test(1, 1, 4, 6, 3, true, 0.3f); +} + +TEST(lstm_elt_gpu, generic_lstm_elt_test_input_forget_f16) { + generic_lstm_elt_gpu_test(1, 1, 4, 6, 3, true, 0.f, 1); +} + +TEST(lstm_elt_gpu, generic_lstm_elt_test_clip_input_forget_f16) { + generic_lstm_elt_gpu_test(1, 1, 4, 6, 3, true, 0.5f, 1); +} + +TEST(lstm_elt_gpu, generic_lstm_elt_test_f16) { + generic_lstm_elt_gpu_test(1, 1, 4, 6, 3, true); +} + +TEST(lstm_elt_gpu, generic_lstm_elt_no_cell_f16) { + generic_lstm_elt_gpu_test(1, 1, 4, 6, 3, false); +} + +TEST(lstm_gpu, generic_lstm_f16) { + generic_lstm_gpu_test(1, 7, 1, 3, 3, 2, true, true, true); +} + +TEST(lstm_gpu, generic_lstm_no_bias_f16) { + generic_lstm_gpu_test(1, 7, 1, 3, 3, 2, false, true, true); +} + +TEST(lstm_gpu, generic_lstm_no_hidden_f16) { + generic_lstm_gpu_test(1, 7, 1, 5, 4, 3, true, false, true); +} + +TEST(lstm_gpu, generic_lstm_no_bias_hidden_f16) { + generic_lstm_gpu_test(1, 7, 1, 5, 4, 3, false, false, true); +} + +TEST(lstm_gpu, generic_lstm_no_cell_f16) { + generic_lstm_gpu_test(1, 7, 1, 5, 4, 3, true, true, false); +} + +TEST(lstm_gpu, generic_lstm_no_bias_cell_f16) { + generic_lstm_gpu_test(1, 7, 1, 5, 4, 3, false, true, false); +} + +TEST(lstm_gpu, generic_lstm_no_hidden_cell_f16) { + generic_lstm_gpu_test(1, 7, 1, 5, 4, 3, true, false, false); +} + +TEST(lstm_gpu, generic_lstm_no_bias_hidden_cell_f16) { + generic_lstm_gpu_test(1, 7, 1, 5, 4, 3, false, false, false); +} + +TEST(lstm_gpu, generic_lstm_clip_f16) { + generic_lstm_gpu_test(1, 7, 1, 3, 3, 2, true, true, true, 0.3f, 0); +} + +TEST(lstm_gpu, generic_lstm_input_forget_f16) { + generic_lstm_gpu_test(1, 7, 1, 3, 3, 2, true, true, true, 0.f, 
1); +} + +TEST(lstm_gpu, generic_lstm_clip_input_forget_f16) { + generic_lstm_gpu_test(1, 7, 1, 3, 3, 2, true, true, true, 0.3f, 1); +} + +TEST(lstm_gpu, generic_lstm_offset_order_ifoz_f16) { + default_offset_type = cldnn_lstm_offset_order_ifoz; + generic_lstm_gpu_test(1, 7, 1, 3, 3, 2, true, true, true); + default_offset_type = cldnn_lstm_offset_order_iofz; +} + +TEST(lstm_gpu, generic_lstm_canonical_f16) { + generic_lstm_gpu_test(1, 1, 1, 1, 1, 1, true, true, true); +} + +// bidirectional support +TEST(lstm_gpu, generic_lstm_bi_bias_f16) { + generic_lstm_gpu_test(1, 7, 2, 2, 3, 4, true, false, false); +} + +TEST(lstm_gpu, generic_lstm_bi_bias_hidden_f16) { + generic_lstm_gpu_test(1, 7, 2, 2, 3, 4, true, true, false); +} + +TEST(lstm_gpu, generic_lstm_bi_bias_hidden_cell_f16) { + generic_lstm_gpu_test(1, 7, 2, 2, 3, 4, true, true, true); +} + +// multi-layer support +TEST(lstm_gpu, generic_lstm_stacked_seq_f16) { + generic_lstm_gpu_test(4, 7, 1, 3, 3, 2, true, true, true); +} + +TEST(lstm_gpu, generic_lstm_stacked_bi_f16) { + generic_lstm_gpu_test(4, 7, 2, 3, 3, 2, true, true, true); +} + // TODO: Add tests for the following: -// optional concatenate output -// optional last hidden -// optional last cell +// integration testing using multi-layer and chained LSTMs +// LSTMs single input // optional activation list diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/max_unpooling_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/max_unpooling_gpu_test.cpp index ec78a6c..afade14 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/max_unpooling_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/max_unpooling_gpu_test.cpp @@ -57,7 +57,7 @@ TEST(max_unpooling_gpu, basic_in2x3x2x2) { // f1: b0: 0 0 0 b1: 0 0 0 // f1: b0: 0 8 16 b1: 12 0 17 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 1 } }); auto arg_max = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 1 } }); @@ -139,7 +139,7 @@ TEST(max_unpooling_gpu, basic_in2x3x2x2_output_padding) { // f1: b0: 0 0 0 b1: 0 0 0 // f1: b0: 0 8 16 b1: 12 0 17 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 1 } }); auto arg_max = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 1 } }); @@ -230,7 +230,7 @@ TEST(max_unpooling_gpu, basic_in2x3x2x2_output_size) { // f1: b0: 0 0 0 b1: 0 0 0 // f1: b0: 0 8 16 b1: 12 0 17 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 1 } }); auto arg_max = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 1 } }); @@ -311,7 +311,7 @@ TEST(max_unpooling_gpu, basic_in2x3x2x2_fp16) { // f1: b0: 0 0 0 b1: 0 0 0 // f1: b0: 0 8 16 b1: 12 0 17 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f16, format::bfyx,{ 2, 2, 2, 1 } }); auto arg_max = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 1 } }); @@ -392,7 +392,7 @@ TEST(max_unpooling_gpu, basic_in2x2x3x2_max_with_argmax_pooling_unpooling) { // f1: b0: 0 8 16 b1: 12 0 17 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } }); auto arg_max = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 1 } }); diff --git 
a/inference-engine/thirdparty/clDNN/tests/test_cases/memory_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/memory_test.cpp index 6bca8f2..75821bf 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/memory_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/memory_test.cpp @@ -26,6 +26,9 @@ #include #include #include +#include +#include +#include #include "test_utils/test_utils.h" @@ -72,7 +75,7 @@ TEST(memory_tests, DISABLED_network_creation_loop) #endif TEST(memory_pool, basic_non_padded_relu_pipe) { // 5 relu's of size 1x4x1x1 - engine engine; + const cldnn::engine engine;// here we need new engine auto batch_num = 1; auto feature_num = 4; auto x_size = 1; @@ -106,7 +109,7 @@ TEST(memory_pool, basic_non_padded_relu_and_pooling_pipe) { // uncomment this line to disable memory pool /*engine_configuration cfg{ false, false, false, std::string(), std::string(), true, std::string(),std::string(), 0, false }; engine engine{ cfg };*/ - engine engine; + const cldnn::engine engine;// here we need new engine auto batch_num = 1; auto feature_num = 4; auto x_size = 4; @@ -144,7 +147,7 @@ TEST(memory_pool, multi_outputs_network) { // uncomment this line to disable memory pool /*engine_configuration cfg{ false, false, false, std::string(), std::string(), true, std::string(),std::string(), 0, false }; engine engine{ cfg };*/ - engine engine; + const cldnn::engine engine;// here we need new engine auto batch_num = 1; auto feature_num = 4; auto x_size = 4; @@ -173,11 +176,8 @@ TEST(memory_pool, multi_outputs_network) { EXPECT_EQ(engine.get_max_used_device_memory_size(), (uint64_t)2048); } -// Disabled since ref values seems to be incorrect. -// Test passes when Relu4 is fused with concat1 and then concat1 is optimized out, -// but this optimizations order is invalid. -// TODO: fix the test -TEST(memory_pool, DISABLED_oooq) { + +TEST(memory_pool, oooq) { /* -- relu1 - concat1- relu4 -- input< -- relu2 / >-- concat2 -- relu6 -- relu3 -- relu5 --------- @@ -210,14 +210,10 @@ TEST(memory_pool, DISABLED_oooq) { network.set_input_data("input", input); auto outputs = network.execute(); - EXPECT_EQ(engine.get_max_used_device_memory_size(), (uint64_t) 2304); + EXPECT_EQ(engine.get_max_used_device_memory_size(), (uint64_t) 2816); } -// Disabled since ref values seems to be incorrect. -// Test passes when Relu4 is fused with concat1 and then concat1 is optimized out, -// but this optimizations order is invalid. 
-// TODO: fix the test -TEST(memory_pool, DISABLED_shared_mem_pool_same_topology_twice) { +TEST(memory_pool, shared_mem_pool_same_topology_twice) { /* -- relu1 - concat1- relu4 -- input< -- relu2 | >-- concat2 -- relu6 -- relu3 -- relu5 --------- @@ -261,7 +257,7 @@ TEST(memory_pool, DISABLED_shared_mem_pool_same_topology_twice) { auto output_layout_first = output_memory_first.get_layout(); auto output_ptr_first = output_memory_first.pointer(); - EXPECT_EQ(engine.get_max_used_device_memory_size(), (uint64_t) 2304); + EXPECT_EQ(engine.get_max_used_device_memory_size(), (uint64_t) 2816); network network_second(engine, topology, bo); network_second.set_input_data("input", input); @@ -271,7 +267,7 @@ TEST(memory_pool, DISABLED_shared_mem_pool_same_topology_twice) { auto output_layout_second = output_memory_second.get_layout(); auto output_ptr_second = output_memory_second.pointer(); - EXPECT_EQ(engine.get_max_used_device_memory_size(), (uint64_t)3072); + EXPECT_EQ(engine.get_max_used_device_memory_size(), (uint64_t) 3584); EXPECT_EQ(output_layout_first, output_layout_second); int y_size = output_layout_first.size.spatial[1]; @@ -461,3 +457,112 @@ TEST(memory_pool, shared_dep_two_output) { auto outputs = network.execute(); EXPECT_EQ(engine.get_max_used_device_memory_size(), (uint64_t)256); } + +TEST(memory_pool, non_opt_intermidate_opt_after) { + + engine_configuration cfg{ false, false, false, std::string(), std::string(), true /*oooq*/, std::string(),std::string(), priority_mode_types::disabled, throttle_mode_types::disabled, true /*mem_pool*/ }; + engine engine{ cfg }; + auto input_layout1 = layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 2, 2 }); + auto input_layout2 = layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 2, 2 }); + + auto input_memory1 = cldnn::memory::allocate(engine, input_layout1); + auto input_memory2 = cldnn::memory::allocate(engine, input_layout2); + auto scale_memory = cldnn::memory::allocate(engine, layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1,1,1,1 })); + auto data_memory = cldnn::data("scale_mem", scale_memory); + + set_values(input_memory1, { 1.0f, 2.0f, 3.0f, 4.0f }); + set_values(input_memory2, { 5.0f, 6.0f, 7.0f, 8.0f }); + set_values(scale_memory, { 1.0f}); + + auto reshape_tensor = cldnn::tensor(8, 1, 1, 1); + auto input = cldnn::input_layout("input1", input_layout1); + auto input2 = cldnn::input_layout("input2", input_layout2); + auto concat = cldnn::concatenation("concat", { "input1", "input2" }, cldnn::concatenation::along_b); + auto reshape = cldnn::reshape("reshape", "concat", reshape_tensor); + auto crop1 = cldnn::crop("crop1", "reshape", { 1,1,1,1 }, { 0, 0, 0, 0 }); + auto crop2 = cldnn::crop("crop2", "reshape", { 1,1,1,1 }, { 1, 0, 0, 0 }); + auto eltwise1 = cldnn::scale("elt1", "crop1", "scale_mem"); + auto eltwise2 = cldnn::scale("elt2", "crop2", "scale_mem"); + + auto topology = cldnn::topology( + input, input2, + concat, + reshape, + crop1, crop2, + eltwise1, eltwise2, + data_memory + ); + + build_options bo; + bo.set_option(build_option::optimize_data(false)); + network network(engine, topology, bo); + network.set_input_data("input1", input_memory1); + network.set_input_data("input2", input_memory2); + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), static_cast(2)); + + auto out1 = outputs.at("elt1"); + auto out2 = outputs.at("elt2"); + + auto out1_ptr = out1.get_memory().pointer(); + auto out2_ptr = out2.get_memory().pointer(); + EXPECT_EQ(out1_ptr[0], 1.0f); + EXPECT_EQ(out2_ptr[0], 2.0f); +} + 
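+// Illustrative sketch, not part of the original change: the memory_pool tests above and below build their engine from an explicit engine_configuration so that the out-of-order queue and the memory pool are both enabled. Assuming the same constructor as used in these tests, flipping the final /*mem_pool*/ argument to false would give a pool-less engine for comparing get_max_used_device_memory_size() with and without pooling: +// engine_configuration no_pool_cfg{ false, false, false, std::string(), std::string(), true /*oooq*/, std::string(), std::string(), priority_mode_types::disabled, throttle_mode_types::disabled, false /*mem_pool*/ }; +// engine no_pool_engine{ no_pool_cfg }; +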
+TEST(memory_pool, add_mem_dep_test) { + + engine_configuration cfg{ false, false, false, std::string(), std::string(), true /*oooq*/, std::string(),std::string(), priority_mode_types::disabled, throttle_mode_types::disabled, true /*mem_pool*/ }; + engine engine{ cfg }; + auto input_layout1 = layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 2, 2, 2 }); + + auto input_memory1 = cldnn::memory::allocate(engine, input_layout1); + auto scale_memory = cldnn::memory::allocate(engine, layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1,1,1,1 })); + auto data_memory = cldnn::data("scale_mem", scale_memory); + + set_values(input_memory1, { 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f}); + set_values(scale_memory, { 1.0f }); + + + auto input = cldnn::input_layout("input1", input_layout1); + auto actv1 = cldnn::activation("input_activ1", "input1", cldnn_activation_func::activation_abs); + auto actv2 = cldnn::activation("input_activ2", "input1", cldnn_activation_func::activation_abs); + auto crop1 = cldnn::crop("crop1", "input_activ1", { 1,1,2,2 }, { 0, 0, 0, 0 }); + auto crop2 = cldnn::crop("crop2", "input_activ2", { 1,1,2,2 }, { 0, 1, 0, 0 }); + auto eltwise1 = cldnn::scale("elt1", "crop1", "scale_mem"); + auto eltwise2 = cldnn::scale("elt2", "crop2", "scale_mem"); + auto actv3 = cldnn::activation("out3", "elt1", cldnn_activation_func::activation_abs); + auto actv4 = cldnn::activation("out4", "elt2", cldnn_activation_func::activation_abs); + + auto topology = cldnn::topology( + input, + crop1, crop2, + actv1, actv2, + eltwise1, eltwise2, + data_memory, + actv3, actv4 + ); + + build_options bo; + bo.set_option(build_option::optimize_data(true)); + network network(engine, topology, bo); + network.set_input_data("input1", input_memory1); + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), static_cast(2)); + + auto out1 = outputs.at("out3"); + auto out2 = outputs.at("out4"); + + auto out1_ptr = out1.get_memory().pointer(); + auto out2_ptr = out2.get_memory().pointer(); + EXPECT_EQ(out1_ptr[0], 1.0f); + EXPECT_EQ(out1_ptr[1], 2.0f); + EXPECT_EQ(out1_ptr[2], 3.0f); + EXPECT_EQ(out1_ptr[3], 4.0f); + + EXPECT_EQ(out2_ptr[0], 5.0f); + EXPECT_EQ(out2_ptr[1], 6.0f); + EXPECT_EQ(out2_ptr[2], 7.0f); + EXPECT_EQ(out2_ptr[3], 8.0f); +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/mvn_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/mvn_gpu_test.cpp index b63bbe6..da2cc38 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/mvn_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/mvn_gpu_test.cpp @@ -139,7 +139,7 @@ TEST(mvn_gpu_test, mvn_test_across_channels_bfyx) using namespace cldnn; using namespace tests; - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 7, 10, 17, 13 } }); @@ -167,7 +167,7 @@ TEST(mvn_gpu_test, mvn_test_across_channels_bfyx_fp16) using namespace cldnn; using namespace tests; - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f16, format::bfyx,{ 7, 10, 17, 13 } }); @@ -195,7 +195,7 @@ TEST(mvn_gpu_test, mvn_test_across_channels_bfyx_normalize_variance) using namespace cldnn; using namespace tests; - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 7, 10, 17, 13 } }); @@ -223,7 +223,7 @@ TEST(mvn_gpu_test, mvn_test_across_channels_bfyx_normalize_variance_fp16) 
using namespace cldnn; using namespace tests; - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f16, format::bfyx,{ 7, 10, 17, 13 } }); @@ -251,7 +251,7 @@ TEST(mvn_gpu_test, mvn_test_within_channels_bfyx) using namespace cldnn; using namespace tests; - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 7, 10, 17, 13 } }); @@ -279,7 +279,7 @@ TEST(mvn_gpu_test, mvn_test_within_channels_bfyx_fp16) using namespace cldnn; using namespace tests; - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f16, format::bfyx,{ 7, 10, 17, 13 } }); @@ -307,7 +307,7 @@ TEST(mvn_gpu_test, mvn_test_within_channels_bfyx_normalize_variance) using namespace cldnn; using namespace tests; - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 7, 10, 17, 13 } }); @@ -335,7 +335,7 @@ TEST(mvn_gpu_test, mvn_test_within_channels_bfyx_normalize_variance_fp16) using namespace cldnn; using namespace tests; - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f16, format::bfyx,{ 7, 10, 17, 13 } }); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/one_hot_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/one_hot_gpu_test.cpp new file mode 100644 index 0000000..8c02717 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/one_hot_gpu_test.cpp @@ -0,0 +1,193 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#include + +#include +#include +#include +#include +#include +#include + +#include "test_utils/test_utils.h" +#include "test_utils/uniform_quantized_real_distribution.hpp" + +#include + +using namespace cldnn; +using namespace ::tests; + +template +VVVVF one_hot_cpu(VVVVF &input, uint16_t axis, + int32_t one_hot_limit, int input_padding_y = 0, + int input_padding_x = 0, int output_padding_y = 0, + int output_padding_x = 0) { + + size_t padding_y = input_padding_y + output_padding_y; + size_t padding_x = input_padding_x + output_padding_x; + size_t out_sizes[4]; + out_sizes[0] = input.size(); + out_sizes[1] = input[0].size(); + out_sizes[2] = input[0][0].size() + 2 * padding_y; + out_sizes[3] = input[0][0][0].size() + 2 * padding_x; + for (uint16_t i = 0; i < axis; ++i) + out_sizes[i] = out_sizes[i + 1]; + out_sizes[axis] = one_hot_limit; + VVVVF output(out_sizes[0], VVVF(out_sizes[1], VVF(out_sizes[2], VF(out_sizes[3])))); + + switch (axis) { + case 0: + for (size_t b = 0; b < out_sizes[0]; ++b) + for (size_t f = 0; f < out_sizes[1]; ++f) + for (size_t y = 0; y < out_sizes[2]; ++y) + for (size_t x = 0; x < out_sizes[3]; ++x) + output[b][f][y][x] = input[0][f][y][x] == (T)b ? 
1 : 0; + break; + case 1: + for (size_t b = 0; b < out_sizes[0]; ++b) + for (size_t f = 0; f < out_sizes[1]; ++f) + for (size_t y = 0; y < out_sizes[2]; ++y) + for (size_t x = 0; x < out_sizes[3]; ++x) + output[b][f][y][x] = input[0][b][y][x] == (T)f ? 1 : 0; + break; + case 2: + for (size_t b = 0; b < out_sizes[0]; ++b) + for (size_t f = 0; f < out_sizes[1]; ++f) + for (size_t y = 0; y < out_sizes[2]; ++y) + for (size_t x = 0; x < out_sizes[3]; ++x) + output[b][f][y][x] = input[0][b][f][x] == (T)y ? 1 : 0; + break; + case 3: + for (size_t b = 0; b < out_sizes[0]; ++b) + for (size_t f = 0; f < out_sizes[1]; ++f) + for (size_t y = 0; y < out_sizes[2]; ++y) + for (size_t x = 0; x < out_sizes[3]; ++x) + output[b][f][y][x] = input[0][b][f][y] == (T)x ? 1 : 0; + break; + default: break; + } + return output; +} + +template +void generic_one_hot_test_int(cldnn::format test_input_fmt, int input_b, int input_f, int input_y, int input_x, tensor shape, + uint16_t one_hot_axis, int input_padding_y = 0, int input_padding_x = 0, int output_padding_y = 0, int output_padding_x = 0) { + std::vector output_dims = { shape.batch[0], shape.feature[0], + shape.spatial[1], shape.spatial[0] }; + int32_t one_hot_limit = output_dims[one_hot_axis]; + + int min_random = -2, max_random = one_hot_limit + 2; + VVVVF input_rnd = generate_random_4d(input_b, input_f, input_y, input_x, min_random, max_random); + VF input_rnd_vec = flatten_4d(test_input_fmt, input_rnd); + + const auto& engine = get_test_engine(); + tensor input_tensor(input_b, input_f, input_x, input_y); + auto input = memory::allocate(engine, { type_to_data_type::value, test_input_fmt, input_tensor }); + set_values(input, input_rnd_vec); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(one_hot("output", "input", shape, one_hot_axis)); + + network network(engine, topology); + network.set_input_data("input", input); + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "output"); + + auto output_memory = outputs.at("output").get_memory(); + auto output_layout = output_memory.get_layout(); + auto output_ptr = output_memory.pointer(); + + VVVVF output_cpu = one_hot_cpu(input_rnd, one_hot_axis, one_hot_limit, input_padding_y, input_padding_x, output_padding_y, output_padding_x); + EXPECT_EQ(output_layout.format.value, test_input_fmt.value); + tensor output_tensor = output_layout.get_buffer_size(); + int y_size = output_tensor.spatial[1]; + int x_size = output_tensor.spatial[0]; + int f_size = output_tensor.feature[0]; + int b_size = output_tensor.batch[0]; + EXPECT_EQ(y_size, (int)output_cpu[0][0].size()); + EXPECT_EQ(x_size, (int)output_cpu[0][0][0].size()); + EXPECT_EQ(f_size, (int)output_cpu[0].size()); + EXPECT_EQ(b_size, (int)output_cpu.size()); + + + bool test_is_correct = true; + VF output_cpu_vec = flatten_4d(test_input_fmt, output_cpu); + + for (size_t i = 0; i < output_cpu_vec.size(); ++i) { + if (output_cpu_vec[i] != output_ptr[i]) { + test_is_correct = false; + break; + } + } + EXPECT_EQ(test_is_correct, true) << std::endl + << "failing test parameters:" << std::endl + << "input_b = " << input_b << std::endl + << "input_f = " << input_f << std::endl + << "input_y = " << input_y << std::endl + << "input_x = " << input_x << std::endl + << "one_hot_limit = " << one_hot_limit << std::endl + << "one_hot_axis = " << one_hot_axis << std::endl + << "input_padding_y = " << input_padding_y << std::endl + << "input_padding_x = " << input_padding_x << 
std::endl + << "output_padding_y = " << output_padding_y << std::endl + << "output_padding_x = " << output_padding_x << std::endl; +} + +TEST(one_hot_gpu_i32, generic_y_in10_oh5) { + generic_one_hot_test_int(format::bfyx, 1, 10, 10, 10, tensor(10, 10, 10, 5), 2); +} + + +TEST(one_hot_error, basic_error_wrong_batch_size) { + + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::i32, format::bfyx, { 10, 1, 1, 1 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(one_hot("output", "input", tensor(10, 1, 1, 50), 2)); + + std::string msg_to_find = "Incorrect parameters configuration: input batch size should be equal to 1."; + EXPECT_ANY_THROW(check_exception_massage(engine, topology, msg_to_find)); +} + +TEST(one_hot_error, basic_error_wrong_axis) { + + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::i32, format::bfyx,{ 1, 1, 1, 1 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(one_hot("output", "input", tensor(1, 1, 1, 50), 4)); + + std::string msg_to_find = "Incorrect parameters configuration: one_hot_axis should be less or equal to 3."; + EXPECT_ANY_THROW(check_exception_massage(engine, topology, msg_to_find)); +} + +TEST(one_hot_error, basic_error_bad_shape) { + + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::i32, format::bfyx,{ 1, 1, 1, 1 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(one_hot("output", "input", tensor(1, 5, 1, 50), 2)); + + std::string msg_to_find = "Incorrect parameters configuration: shape does not fit input size."; + EXPECT_ANY_THROW(check_exception_massage(engine, topology, msg_to_find)); +} diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/permute_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/permute_gpu_test.cpp index 8f455ae..80657c1 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/permute_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/permute_gpu_test.cpp @@ -25,7 +25,9 @@ #include #include "test_utils/test_utils.h" #include - +#include +#include +#include #include #include #include @@ -34,7 +36,66 @@ using namespace cldnn; using namespace tests; using namespace testing; -TEST(permute_gpu_f32, basic_bfyx_permute_0_1_3_2) + +TEST(permute_gpu_f32, output_ordering_test) +{ + const auto& engine = get_test_engine(); + + + std::vector> input_tensors = + { + { 10, 5, 15, 2 },{ 2, 4, 6, 8 },{ 2, 2, 3, 2 },{ 9, 8, 7, 4 } + }; + std::vector> permutations = + { + { 0, 1, 2, 3 }, //do nothing + { 0, 1, 3, 2 }, //replace x with y + { 1, 0, 3, 2 }, //replace b with f + { 0, 2, 3, 1 } //big permutation + }; + std::vector input_formats = { format::bfyx, format::yxfb }; + + auto get_permutation = [&](const std::vector& inp1, const std::vector& order) + { + EXPECT_EQ(inp1.size(), order.size()); + std::vector output; + for (auto const& o : order) + { + output.push_back(inp1.at(o)); + } + return output; + }; + + for (auto const& fr : input_formats) + { + for (auto const& inp_t : input_tensors) + { + for (auto const& perm : permutations) + { + + auto input = memory::allocate(engine, { data_types::f32, fr, tensor(inp_t) }); + topology topology( + input_layout("input", input.get_layout()), + permute("permute", "input", perm)); + + network network(engine, topology); + network.set_input_data("input", input); + auto outputs = 
network.execute(); + auto output = outputs.at("permute"); + auto output_mem = output.get_memory(); + EXPECT_EQ(outputs.size(), size_t(1)); + auto ref_tensor = get_permutation(inp_t, perm); + auto out_tensor = output_mem.get_layout().size; + EXPECT_EQ(out_tensor.batch[0], ref_tensor[0]); + EXPECT_EQ(out_tensor.feature[0], ref_tensor[1]); + EXPECT_EQ(out_tensor.spatial[0], ref_tensor[2]); + EXPECT_EQ(out_tensor.spatial[1], ref_tensor[3]); + } + } + } +} + +TEST(permute_gpu_f32, basic_bfyx_permute_0_1_2_3) { // Input : bfyx:2x2x3x2 // Permute order : { 0,1,3,2 } @@ -45,7 +106,64 @@ TEST(permute_gpu_f32, basic_bfyx_permute_0_1_3_2) // f1: b0: 5 6 -15 b1: 1.5 5.2 -15 // f1: b0: 7 8 -15 b1: 12 8 -15 // + // Output = input + + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } }); + + std::vector values = + { + 1.0f, 2.0f, -15.f, + 3.0f, 4.0f, -15.f, + + 5.0f, 6.0f, -15.f, + 7.0f, 8.0f, -15.f, + + 0.0f, 0.0f, -15.f, + 0.5f, -0.5f, -15.f, + + 1.5f, 5.2f, -15.f, + 12.0f, 8.0f, -15.f + }; + + set_values(input, values); + + topology topology( + input_layout("input", input.get_layout()), + permute("permute", "input", { 0, 1, 2, 3 })); + + network network(engine, topology); + network.set_input_data("input", input); + + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "permute"); + + auto output = outputs.begin()->second.get_memory(); + + + auto output_ptr = output.pointer(); + for (int i = 0; i < 24; i++) + { + EXPECT_FLOAT_EQ(values[i], output_ptr[i]); + } + +} + +TEST(permute_gpu_f32, basic_bfyx_permute_0_1_3_2) +{ + // Input : bfyx:2x2x3x2 + // Permute order : { 0,1,3,2 } + // // Input: + // f0: b0: 1 2 -15 b1: 0 0 -15 + // f0: b0: 3 4 -15 b1: 0.5 -0.5 -15 + // f1: b0: 5 6 -15 b1: 1.5 5.2 -15 + // f1: b0: 7 8 -15 b1: 12 8 -15 + // + // Output // f0: b0: 1 3 b1: 0 0.5 // f0: b0: 2 4 b1: 0 -0.5 // f0: b0: -15 -15 b1: -15 -15 @@ -54,9 +172,9 @@ TEST(permute_gpu_f32, basic_bfyx_permute_0_1_3_2) // f1: b0: -15 -15 b1: -15 -15 // - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 3, 2 } }); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } }); set_values(input, { 1.0f, 2.0f, -15.f, @@ -70,7 +188,7 @@ TEST(permute_gpu_f32, basic_bfyx_permute_0_1_3_2) 1.5f, 5.2f, -15.f, 12.0f, 8.0f, -15.f, - }); + }); topology topology( input_layout("input", input.get_layout()), @@ -111,56 +229,20 @@ TEST(permute_gpu_f32, basic_bfyx_permute_0_1_3_2) } -TEST(permute_gpu_f32, basic_yxfb_permute_3_2_0_1) +TEST(permute_gpu_f32, basic_yxfb_permute_1_0_2_3) { - // Input : yxfb:2x2x2x2 - // Permute order : { 3,2,0,1 } - // Output padding : 0x1 - // - // Input: - // f0: b0: 1 2 b1: 0 0 - // f0: b0: 3 4 b1: 0.5 -0.5 - // f1: b0: 5 6 b1: 1.5 5.2 - // f1: b0: 7 8 b1: 12 8 - // - // Output: - // b0 f0: 1 2 - // b0 f0: 3 4 - // - // b0 f1: 5 6 - // b0 f1: 7 8 - // - // b1 f0: 0 0 - // b1 f0: 0.5 -0.5 - // - // b1 f1: 1.5 5.2 - // b1 f1: 12 8 - // - - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto input_mem = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 100, 64, 1 } }); - set_values(input, { - 1.f, 0.f, - 5.f, 1.5f, - - 2.f, 0.f, - 6.f, 5.2f, - - 3.f, 0.5f, - 7.f, 12.f, - - 4.f, -0.5f, - 8.f, 8.f - }); + tests::set_random_values(input_mem); topology 
topology( - input_layout("input", input.get_layout()), - permute("permute", "input", { 3, 2, 0, 1 }, { { 0, 0, 1, 0}, 0 })); + input_layout("input", input_mem.get_layout()), + permute("permute", "input", { 1, 0, 2, 3 })); network network(engine, topology); - network.set_input_data("input", input); + network.set_input_data("input", input_mem); auto outputs = network.execute(); EXPECT_EQ(outputs.size(), size_t(1)); @@ -168,22 +250,11 @@ TEST(permute_gpu_f32, basic_yxfb_permute_3_2_0_1) auto output = outputs.begin()->second.get_memory(); - float answers[32] = { - 0.0f, 0.0f, 0.0f, 0.0f, - 1.0f, 2.0f, 3.0f, 4.0f, - 5.0f, 6.0f, 7.0f, 8.0f, - 0.0f, 0.0f, 0.0f, 0.0f, - - 0.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 0.5f, -0.5f, - 1.5f, 5.2f, 12.0f, 8.0f, - 0.0f, 0.0f, 0.0f, 0.0f - }; - auto output_ptr = output.pointer(); - for (int i = 0; i < 32; i++) + auto input_ptr = input_mem.pointer(); + for (int i = 0; i < 6400; i++) { - EXPECT_FLOAT_EQ(answers[i], output_ptr[i]); + EXPECT_FLOAT_EQ(input_ptr[i], output_ptr[i]); } } @@ -209,7 +280,7 @@ TEST(permute_gpu_f32, basic_bfyx_permute_0_1_3_2_input_padding) // f1: b0: -15 -15 b1: -15 -15 // - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } }); @@ -225,7 +296,7 @@ TEST(permute_gpu_f32, basic_bfyx_permute_0_1_3_2_input_padding) 1.5f, 5.2f, -15.f, 12.0f, 8.0f, -15.f, - }); + }); topology topology( input_layout("input", input.get_layout()), @@ -267,10 +338,120 @@ TEST(permute_gpu_f32, basic_bfyx_permute_0_1_3_2_input_padding) } +TEST(permute_gpu_f32, basic_yxfb_permute_batch_with_feature) +{ + // Input : yxfb:8x2x1x1 + // Permute order : { 1, 0, 2, 3 } + // Output : yxfb:2x8x1x1 + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 8, 2, 1, 1 } }); + + set_values(input, { + //b0 - b7 for f=0 + 1.f, 0.f, 5.f, 1.5f, 2.f, 0.f, 6.f, 5.2f, + + //b0 - b7 for f=1 + 3.f, 0.5f, 7.f, 12.f, 4.f, -0.5f, 8.f, 8.f + }); + + topology topology( + input_layout("input", input.get_layout()), + permute("permute", "input", { 1, 0, 2, 3 })); + + network network(engine, topology); + network.set_input_data("input", input); + + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "permute"); + + auto output = outputs.begin()->second.get_memory(); + auto out_tensor = output.get_layout().size; + EXPECT_EQ(out_tensor.batch[0], 2); + EXPECT_EQ(out_tensor.feature[0], 8); + EXPECT_EQ(out_tensor.spatial[0], 1); + EXPECT_EQ(out_tensor.spatial[1], 1); + + float answers[16] = { + 1.0f, 3.0f, + 0.0f, 0.5f, + 5.f, 7.f, + 1.5f, 12.f, + 2.f, 4.f, + 0.f, -0.5f, + 6.f, 8.f, + 5.2f, 8.f + }; + + auto output_ptr = output.pointer(); + for (int i = 0; i < 16; i++) + { + EXPECT_FLOAT_EQ(answers[i], output_ptr[i]); + } + +} + +TEST(permute_gpu_f32, basic_bfyx_permute_batch_with_feature) +{ + // Input : bfyx:2x8x1x1 + // Permute order : { 1, 0, 2, 3 } + // Output : bfyx:8x2x1x1 + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 8, 1, 1 } }); + + set_values(input, { + //f0 - f7 for b=0 + 1.f, 0.f, 5.f, 1.5f, 2.f, 0.f, 6.f, 5.2f, + + //f0 - f7 for b=1 + 3.f, 0.5f, 7.f, 12.f, 4.f, -0.5f, 8.f, 8.f + }); + + topology topology( + input_layout("input", input.get_layout()), + permute("permute", "input", { 1, 0, 2, 3 })); + + network network(engine, topology); + network.set_input_data("input", input); + + auto outputs = 
network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "permute"); + + auto output = outputs.begin()->second.get_memory(); + auto out_tensor = output.get_layout().size; + EXPECT_EQ(out_tensor.batch[0], 8); + EXPECT_EQ(out_tensor.feature[0], 2); + EXPECT_EQ(out_tensor.spatial[0], 1); + EXPECT_EQ(out_tensor.spatial[1], 1); + + float answers[16] = { + 1.0f, 3.0f, + 0.0f, 0.5f, + 5.f, 7.f, + 1.5f, 12.f, + 2.f, 4.f, + 0.f, -0.5f, + 6.f, 8.f, + 5.2f, 8.f + }; + + auto output_ptr = output.pointer(); + for (int i = 0; i < 16; i++) + { + EXPECT_FLOAT_EQ(answers[i], output_ptr[i]); + } + +} + template void permute_test_with_reorder() { - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } }); @@ -286,7 +467,7 @@ void permute_test_with_reorder() 1.0f, 5.0f, -15.f, 12.0f, 8.0f, -15.f, - }); + }); topology topology( input_layout("input", input.get_layout()), @@ -338,4 +519,102 @@ TEST(permute_gpu_i32, basic_bfyx_permute_0_1_3_2) { TEST(permute_gpu_i64, basic_bfyx_permute_0_1_3_2) { permute_test_with_reorder(); -} \ No newline at end of file +} + +TEST(fc_permute_crop_gpu, basic_permute_yxfb) +{ + const auto& engine = get_test_engine(); + + auto input_mem = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 5, 1, 512 } }); + + //Topology creates a permute which "replaces" the batch with the feature. + topology topology( + input_layout("input", input_mem.get_layout()), // yxfb {1, 5, 1, 512 }} + permute("permute", "input", { 1, 0, 2, 3 }) // yxfb {5, 1, 1, 512} --- without permute fix yxfb {1, 5, 512, 1} + ); + + network network(engine, topology); + network.set_input_data("input", input_mem); + + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "permute"); + + auto output = outputs.begin()->second.get_memory(); + auto out_tensor = output.get_layout().size; + EXPECT_EQ(out_tensor.batch[0], 5); + EXPECT_EQ(out_tensor.feature[0], 1); + EXPECT_EQ(out_tensor.spatial[0], 1); + EXPECT_EQ(out_tensor.spatial[1], 512); + EXPECT_EQ(output.get_layout().format, cldnn::format::yxfb); +} + +TEST(fc_permute_crop_gpu, basic_0) +{ + + const auto& engine = get_test_engine(); + + auto input_mem = memory::allocate(engine, { data_types::f32, format::bfyx,{ 5, 11264, 1, 1 } }); + auto weights_mem = memory::allocate(engine, { data_types::f32, format::bfyx,{ 512, 11264, 1, 1 } }); + auto bias_mem = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 512, 1 } }); + + topology topology( + input_layout("input", input_mem.get_layout()), // bfyx {5, 11264, 1, 1}} + data("weights", weights_mem), + data("bias", bias_mem), + fully_connected("fully_connected", "input", "weights", "bias"), // yxfb {5, 512, 1, 1} + reshape("reshape", "fully_connected", { 1, 5, 1, 512 }), // yxfb {1, 5, 1, 512} + permute("permute", "reshape", { 1, 0, 2, 3 }), // yxfb {5, 1, 1, 512} --- without permute fix yxfb {1, 5, 512, 1} + crop("crop", "permute", { 1, 1, 1, 512 }, { 4, 0, 0 ,0 }) // without permute fix it will fail "Tensor pitches didn't set correctly" + ); + + network network(engine, topology); + network.set_input_data("input", input_mem); + + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "crop"); + + auto output = outputs.begin()->second.get_memory(); + auto out_tensor = output.get_layout().size; + EXPECT_EQ(out_tensor.batch[0], 1); + EXPECT_EQ(out_tensor.feature[0], 1); + 
EXPECT_EQ(out_tensor.spatial[0], 1); + EXPECT_EQ(out_tensor.spatial[1], 512); + EXPECT_EQ(output.get_layout().format, cldnn::format::yxfb); +} + +TEST(fc_permute_gpu, basic_permute_bfyx) +{ + const auto& engine = get_test_engine(); + + auto input_mem = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 5, 1, 256 } }); + + tests::set_random_values(input_mem); + + //Topology creates a permute which "replaces" the batch with the feature. + topology topology( + input_layout("input", input_mem.get_layout()), + permute("permute", "input", { 1, 0, 2, 3 }) + ); + + network network(engine, topology); + network.set_input_data("input", input_mem); + + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "permute"); + + auto output = outputs.begin()->second.get_memory(); + auto out_tensor = output.get_layout().size; + EXPECT_EQ(out_tensor.batch[0], 5); + EXPECT_EQ(out_tensor.feature[0], 1); + EXPECT_EQ(out_tensor.spatial[0], 1); + EXPECT_EQ(out_tensor.spatial[1], 256); + EXPECT_EQ(output.get_layout().format, cldnn::format::bfyx); + + auto input_ptr = input_mem.pointer(); + auto output_ptr = output.pointer(); + for (int i = 0; i < 5 * 256; i++) + EXPECT_NEAR(input_ptr[i], output_ptr[i], 1e-3f); +} diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp index 711a5ec..3bcd271 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ TEST(pooling_forward_gpu, basic_max_byxf_f32_wsiz3x3_wstr1x1_i1x3x3x8_nopad) { // Expected output: // [ 8.0, 0.0, 0.0, 4,0, 0,5, -0.5, -0.5, -0.5 ] - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 8, 3, 3 } }); @@ -99,7 +99,7 @@ TEST(pooling_forward_gpu, basic_max_yxfb_f32_wsiz3x3_wstr1x1_i3x3x1x1_nopad) { // Expected output: // [ 2.0] - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 3, 3 } }); @@ -122,9 +122,47 @@ TEST(pooling_forward_gpu, basic_max_yxfb_f32_wsiz3x3_wstr1x1_i3x3x1x1_nopad) { EXPECT_EQ(2.0f, output_ptr[0]); } +TEST(pooling_forward_gpu, basic_max_yxfb_f32_global_i3x3x1x1_nopad) { + // Brief test description. 
+ // + // Pool mode: max + // Global pooling: true + // Padding: none + // + // Input data: + // [-0.5, 1.0, 0.5] + // [ 2.0, 1.5, -0.5] + // [ 0.0, -1.0, 0.5] + // + // Expected output: + // [ 2.0] + + const auto& engine = get_test_engine(); + + auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 3, 3 } }); + + topology topology; + topology.add(input_layout("input_prim", input_prim.get_layout())); + topology.add(pooling("pool_prim", "input_prim", pooling_mode::max)); + + network network(engine, topology); + set_values(input_prim, { -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f }); + network.set_input_data("input_prim", input_prim); + + auto outputs = network.execute(); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "pool_prim"); + + auto output_prim = outputs.begin()->second.get_memory(); + + auto output_ptr = output_prim.pointer(); + + EXPECT_EQ(2.0f, output_ptr[0]); +} + TEST(pooling_forward_gpu, basic_max_pooling_int8) { - engine engine; + const auto& engine = get_test_engine(); layout in_layout = { type_to_data_type::value,format::byxf,{ 1,1,3,3 } }; layout out_layout = { type_to_data_type::value,format::byxf,{ 1,1,1,1 } }; layout byte_layout = { type_to_data_type::value, format::bfyx,{ 1,1,3,3 } }; @@ -171,7 +209,7 @@ TEST(pooling_forward_gpu, basic_max_pooling_int8) { TEST(pooling_forward_gpu, basic_avg_pooling_int8) { - engine engine; + const auto& engine = get_test_engine(); layout in_layout = { type_to_data_type::value,format::byxf,{ 1,1,3,3 } }; layout out_layout = { type_to_data_type::value,format::byxf,{ 1,1,1,1 } }; layout byte_layout = { type_to_data_type::value, format::bfyx,{ 1,1,3,3 } }; @@ -235,7 +273,7 @@ TEST(pooling_forward_gpu, basic_max_yxfb_f32_wsiz2x2_wstr1x1_i3x3x1x1_nopad) { // [ 2.0, 1.5] // [ 2.0, 1.5] - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 3, 3 } }); @@ -279,7 +317,7 @@ TEST(pooling_forward_gpu, basic_max_yxfb_f32_wsiz2x2_wstr2x2_i4x4x1x1_nopad) { // [ 2.0, 0.5] // [ 0.5, 0.5] - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 4 } }); @@ -333,7 +371,7 @@ TEST(pooling_forward_gpu, basic_max_yxfb_f32_wsiz2x2_wstr1x1_i3x3x2x2_nopad) { // [ 0.5, 1.0] [ 1.0, 0.5] // [-0.5, 1.5] [ 1.0, 0.0] - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 3, 3 } }); @@ -383,7 +421,7 @@ TEST(pooling_forward_gpu, offsets_max_yxfb_f32_wsiz2x2_wstr2x2_i2x2x1x1_zeropad) // [ 1.5, -0.5] // [ -1, 0.5] - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } }); @@ -428,7 +466,7 @@ TEST(pooling_forward_gpu, offsets_max_yxfb_f32_wsiz2x2_wstr2x2_i3x3x1x1_zeropad) // [ 1.5, -0.5] // [ 1, -0.5] - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 3, 3 } }); @@ -477,7 +515,7 @@ TEST(pooling_forward_gpu, basic_avg_yxfb_f32_wsiz2x2_wstr1x1_i3x3x1x1_nopad) { // [ 1.0, 0.625] // [ 1.625, 0.875] - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 3, 3 } }); @@ -522,7 +560,7 @@ TEST(pooling_forward_gpu, offsets_avg_yxfb_f32_wsiz2x2_wstr2x2_i2x2x1x1_zeropad) // [ 0.375, 
-0.125] // [ -0.25, 0.125] - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } }); @@ -567,7 +605,7 @@ TEST(pooling_forward_gpu, offsets_avg_bfyx_f32_wsiz3x3_wstr3x3_i1x1x3x3_zeropad) // [ 0.177777, -0.133333] // [ 0.333333, 0.55] - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 3, 3 } }); @@ -615,7 +653,7 @@ TEST(pooling_forward_gpu, offsets_avg_yxfb_f32_wsiz2x2_wstr2x2_i3x3x1x1_zeropad) // [ 0.375, 0.5] // [ -0.125, -1.125] - engine engine; + const auto& engine = get_test_engine(); auto input_prim = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 3, 3 } }); @@ -664,7 +702,7 @@ TEST(pooling_forward_gpu, offsets_avg_yxfb_bfyx_f32_wsiz2x2_wstr2x2_i2x2x1x1_out // [0, 0, 0, 0, 0, 0] // [0, 0, 0, 0, 0, 0] - engine engine; + const auto& engine = get_test_engine(); std::vector formats_to_test = { format::yxfb , format::bfyx }; for (std::vector::iterator it = formats_to_test.begin(); it != formats_to_test.end(); ++it) @@ -725,7 +763,7 @@ TEST(pooling_forward_gpu, offsets_max_yxfb_bfyx_f32_wsiz2x2_wstr2x2_i3x3x1x1_out // [0, 1, -0.5, 0, 0] // [0, 0, 0, 0, 0] - engine engine; + const auto& engine = get_test_engine(); std::vector formats_to_test = { format::yxfb , format::bfyx }; for (std::vector::iterator it = formats_to_test.begin(); it != formats_to_test.end(); ++it) @@ -795,7 +833,7 @@ TEST(pooling_forward_gpu, offsets_avg_yxfb_bfyx_f32_wsiz2x2_wstr2x2_i2x2x1x1_inp // [0, 0, 0, 0, 0, 0] // [0, 0, 0, 0, 0, 0] - engine engine; + const auto& engine = get_test_engine(); std::vector formats_to_test = { format::yxfb , format::bfyx }; for (std::vector::iterator it = formats_to_test.begin(); it != formats_to_test.end(); ++it) @@ -858,7 +896,7 @@ TEST(pooling_forward_gpu, offsets_max_yxfb_bfyx_f32_wsiz2x2_wstr2x2_i3x3x1x1_inp // [0, 1, -0.5, 0] // [0, 0, 0, 0, 0] - engine engine; + const auto& engine = get_test_engine(); std::vector formats_to_test = { format::yxfb , format::bfyx }; for (std::vector::iterator it = formats_to_test.begin(); it != formats_to_test.end(); ++it) @@ -929,7 +967,7 @@ TEST(pooling_forward_gpu, avg_yxfb_bfyx_f32_wsiz2x2_wstr2x2_i2x2x1x1_inpad2x1_ou // [0, 0, 0, 0, 0, 0] // [0, 0, 0, 0, 0, 0] - engine engine; + const auto& engine = get_test_engine(); std::vector formats_to_test = { format::yxfb , format::bfyx }; for (std::vector::iterator it = formats_to_test.begin(); it != formats_to_test.end(); ++it) @@ -997,7 +1035,7 @@ TEST(pooling_forward_gpu, max_yxfb_bfyx_f32_wsiz2x2_wstr2x2_i3x3x1x1_inpad2x1_ou // [0, 12, 14, 16, 0] // [0, 0, 0, 0, 0] - engine engine; + const auto& engine = get_test_engine(); std::vector formats_to_test = { format::yxfb , format::bfyx }; for (std::vector::iterator it = formats_to_test.begin(); it != formats_to_test.end(); ++it) @@ -1067,7 +1105,7 @@ TEST(pooling_forward_gpu, basic_in2x2x3x2_max_with_argmax) { // f1: b0: 10 11 b1: 21 23 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } }); auto arg_max = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 1 } }); @@ -1146,7 +1184,7 @@ TEST(pooling_forward_gpu, basic_in2x2x3x2_max_with_argmax_input_padding) { // f1: b0: 10 11 b1: 21 23 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } }); auto arg_max = 
memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 1 } }); @@ -1226,7 +1264,7 @@ TEST(pooling_forward_gpu, basic_in2x2x3x2_max_with_argmax_output_padding) { // f1: b0: 10 11 b1: 21 23 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } }); auto arg_max = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 1 } }); @@ -1316,7 +1354,7 @@ TEST(pooling_forward_gpu, basic_in2x2x3x2_max_with_argmax_with_output_size) { // f1: b0: 10 11 b1: 21 23 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } }); auto arg_max = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 1 } }); @@ -1500,6 +1538,133 @@ TEST(pooling_forward_gpu, yxfb_average_without_padding_i1x1_w3x3_s1x1_o1x1_fp16) generic_average_wo_padding_test(format::yxfb, spatial(1, 1), spatial(1, 1), spatial(3, 3), tensor{ 0,0,1,1 }, tensor{ 0,0,-1,-1 }); } +TEST(pooling_forward_gpu, b_fs_yx_fsv4) +{ + int B_array[] = { 16, 4, 0 }; // Batch + int F_array[] = { 64, 2048, 0 }; // Features + int I_array[] = { 112, 7, 0 }; // Input MxM data sizes + int W_array[] = { 7, 3, 0 }; // Filter (a.k.a. weights) sizes + int S_array[] = { 1, 2, 0 }; // Strides + for (int j = 0; F_array[j]; j++) { + int in_B = B_array[j]; + + int in_F = F_array[j]; + + int in_X = I_array[j], + in_Y = in_X; + + int W_X = W_array[j], + W_Y = W_X; + + int S_X = S_array[j], + S_Y = S_X; + + // Input data init + std::vector<char> Data(in_B * in_F * in_X * in_Y); + for (size_t i = 0; i < Data.size(); i++) + Data[i] = static_cast<char>(i); + std::vector<char> DataGold(Data); + + // Expected "gold" output and IMAD output. + std::vector<char> vGoldOutput; + std::vector<char> vTestOutput; + + engine engine; + + // "Golden" Pooling + { + // Mem initialization + // This is user data, no kernels here + auto input = memory::allocate(engine, + { data_types::i8, + format::bfyx, + { in_B, in_F, in_X, in_Y } }); + set_values(input, std::move(DataGold)); + + auto pool = pooling("pool_GOLD", + "input", + pooling_mode::max, + { 1, 1, W_X, W_Y }, // kernel_size + { 1, 1, S_X, S_Y }); // stride + + // Create a topology with a simple pooling layer + topology topology(input_layout("input", input.get_layout()), + pool); + + // Network processing + network network(engine, topology); + network.set_input_data("input", input); + //network_exe(network, vGoldOutput, "pool_GOLD"); + auto outputs = network.execute(); + auto searchC = outputs.find("pool_GOLD"); + ASSERT_FALSE(searchC == outputs.end()); + auto output = outputs.begin()->second.get_memory(); + auto output_ptr = output.pointer<char>(); + vGoldOutput.reserve(output_ptr.size()); + for (size_t i = 0; i < output_ptr.size(); i++) + vGoldOutput.push_back(output_ptr[i]); + } + + // + // IMAD Pooling + // + { + topology topology; + + // Mem initialization + // This is user data, no kernels here + auto input = memory::allocate(engine, + { data_types::i8, + format::bfyx, + { in_B, in_F, in_X, in_Y } }); + set_values(input, std::move(Data)); + + // Add input to topology + topology.add( + input_layout("input", input.get_layout())); + + // Reorder (a.k.a. swizzling) input to MMAD/IMAD pooling format + topology.add(reorder("reorder_Swizzelled", + "input", + layout(data_types::i8, + format::b_fs_yx_fsv4, + { in_B, in_F, in_X, in_Y }))); + + // Add pooling to topology + topology.add(pooling("pool_IMAD", + "reorder_Swizzelled", + pooling_mode::max, + { 1, 1, W_X, W_Y
}, // kernel_size + { 1, 1, S_X, S_Y })); // stride + + // Back reordering (a.k.a. unswizzling) output from MMAD/IMAD pooling + topology.add(reorder("reorder_UnSwizzelled", + "pool_IMAD", + layout(data_types::i8, + format::bfyx, + { in_B, in_F, in_X, in_Y }))); + + network network(engine, topology); + network.set_input_data("input", input); + //network_exe(network, vTestOutput, "reorder_UnSwizzelled"); + auto outputs = network.execute(); + auto searchC = outputs.find("reorder_UnSwizzelled"); + ASSERT_FALSE(searchC == outputs.end()); + auto output = outputs.begin()->second.get_memory(); + auto output_ptr = output.pointer<char>(); + vTestOutput.reserve(output_ptr.size()); + for (size_t i = 0; i < output_ptr.size(); i++) + vTestOutput.push_back(output_ptr[i]); + } + + // Result validation + ASSERT_TRUE(vGoldOutput.size() == vTestOutput.size()); + for (size_t i = 0; i < vGoldOutput.size(); i++) + ASSERT_TRUE(vTestOutput[i] == vGoldOutput[i]); + + } // for (int j = 0; F_array[j]; j++) +} +
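The b_fs_yx_fsv4 test above runs the same max pooling twice — once in plain bfyx as a "golden" reference and once through a reorder into the IMAD-friendly b_fs_yx_fsv4 layout — and compares the two outputs element-wise. Both branches repeat the same run-and-copy-output step, and both carry a commented-out call to a network_exe helper. A minimal sketch of what such a helper could look like, assuming only the clDNN test API already used in this file; the body is a guess at the intent behind the commented-out calls, not code from the patch:

// Hypothetical helper matching the commented-out network_exe(...) calls above:
// executes the network and copies the named i8 output into 'out'.
static void network_exe(network& net, std::vector<char>& out, const std::string& output_id)
{
    auto outputs = net.execute();
    auto it = outputs.find(output_id);
    ASSERT_FALSE(it == outputs.end());
    auto mem = it->second.get_memory();
    auto ptr = mem.pointer<char>();   // i8 data, hence char
    out.assign(ptr.begin(), ptr.end());
}

With such a helper, each branch of the test would reduce to building its topology, calling network_exe(network, vGoldOutput, "pool_GOLD") or network_exe(network, vTestOutput, "reorder_UnSwizzelled"), and comparing the two vectors.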
class pooling_test : public tests::generic_test { diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/propagate_constants_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/propagate_constants_gpu_test.cpp new file mode 100644 index 0000000..673e7ea --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/propagate_constants_gpu_test.cpp @@ -0,0 +1,69 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include "api/CPP/memory.hpp" +#include +#include +#include +#include +#include "test_utils/test_utils.h" +#include +#include +#include +#include + +using namespace cldnn; +using namespace tests; + +//We expect additional reorder to be added in between "weights1" and "reshape1". +//This situation should be handled properly by the propagate-constants optimization phase +TEST(propagate_constants, copy_dependecies_from_nodes) { + const auto& engine = get_test_engine(); + build_options build_opt; + build_opt.set_option(build_option::optimize_data(true)); + + auto input = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); + auto weights1 = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 1 } }); + auto weights2 = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 1, 1, 2 } }); + + set_values(input, { FLOAT16(1.1f), FLOAT16(1.2f), FLOAT16(1.3f), FLOAT16(1.4f) }); + set_values(weights1, { FLOAT16(2.1f), FLOAT16(3.1f) }); + set_values(weights2, { 1.1f, 0.1f }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("weights1", weights1)); + topology.add(data("weights2", weights2)); + topology.add(reshape("reshape1", "weights1", tensor(spatial(1, 2)))); + topology.add(reorder("reorder2", "input", layout(data_types::f32, format::byxf, 4))); + topology.add(reorder("reorder1", "reshape1", layout(data_types::f32, format::byxf, 4))); + topology.add(concatenation("concat", { "reorder1", "weights2" }, concatenation::along_x)); + topology.add(convolution("conv2", { "reorder2" }, { "concat" })); + network network(engine, topology, build_opt); + network.set_input_data("input", input); + + auto outputs = network.execute(); + + float epsilon = 1e-2f; + for (auto& it : outputs) + { + auto output = it.second.get_memory().pointer<float>(); + EXPECT_NEAR(7.8f, output[0], epsilon); + } +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/proposal_cpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/proposal_cpu_test.cpp index 93e36a6..c1b818d 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/proposal_cpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/proposal_cpu_test.cpp @@ -94,7 +94,7 @@ template TestRunnerProposal::TestRunnerProposal() : _cls_scores_layout(cldnn::type_to_data_type::value, format::bfyx, { 1, 18, 23, 14 } ), _bbox_pred_layout(cldnn::type_to_data_type::value, format::bfyx, { 1, 36, 23, 14 } ), - _image_info_layout(cldnn::type_to_data_type::value, format::bfyx, { 1, 1, 3, 1 } ), + _image_info_layout(cldnn::type_to_data_type::value, format::bfyx, { 1, 3, 1, 1 } ), _test_layer(layer_name, cls_scores_name, bbox_pred_name, diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/pyramid_roi_align_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/pyramid_roi_align_gpu_test.cpp new file mode 100644 index 0000000..db7a9d2 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/pyramid_roi_align_gpu_test.cpp @@ -0,0 +1,191 @@ +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
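The proposal_cpu_test hunk above moves the three image-info values from a spatial dimension onto the feature axis: clDNN layouts pair a data type and a format with a 4D tensor whose bfyx order puts batch and feature ahead of the spatial sizes, so a per-image vector of three scalars is naturally a 1-batch, 3-feature tensor. A minimal sketch of the corrected layout, assuming the cldnn layout constructor and header path used throughout these tests (the helper name is illustrative only):

// Sketch: the corrected image_info layout from the proposal hunk above.
// In bfyx order, { 1, 3, 1, 1 } reads: 1 batch, 3 features, 1x1 spatial,
// i.e. one scalar per feature rather than three values along a spatial axis.
#include <api/CPP/layout.hpp>

cldnn::layout make_image_info_layout()   // hypothetical helper, not in the patch
{
    return cldnn::layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 3, 1, 1 });
}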
+ +/////////////////////////////////////////////////////////////////////////////////////////////////// +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "test_utils/test_utils.h" + + +using namespace cldnn; +using namespace tests; + +enum Test_index { //order the same as test_data table + BOXES = 0, + IMAGE_META, + P5, + P4, + P3, + P2, + POOL +}; + +struct Test_data +{ + const char *parameter_name; + int32_t b, f, x, y; + std::vector dataTMP; +}; + +Test_data test_data [] = //order the same as enum Test_index +{ + { + "boxes", + 1, 1, 1, 4, + { 0.274695277f, 0.39985016f, 0.751607299f, 0.649529517f } + }, + { + "image_meta", + 1, 1, 1, 93, + { 0, 415, 640, 3, 1024, 1024, 3, 180, 0, 844, 1024, 1.6f } + }, + { + "P5", + 1, 1, 32, 32, + { + -2.33415818f, -1.46765602f, -0.998123348f, -0.945146739f, -0.721071541f, -1.19279253f, -1.37023795f, -1.61545324f, -2.03868198f, -1.72659981f, -1.5598495f, -1.55309856f, -1.53211606f, -1.86645496f, -1.84540808f, -1.68674099f, -1.60733783f, -1.43271363f, -1.37408626f, -1.35044777f, -1.25868618f, -0.965965867f, -0.881696165f, -0.709434509f, -0.494760394f, -0.482933104f, -1.26238084f, -1.45486391f, -1.00801146f, -0.840218246f, -0.420806766f, 0.635412455f, + -5.15252113f, -4.81609535f, -4.33736563f, -4.5069356f, -4.69305611f, -5.35324192f, -5.4090085f, -5.18345022f, -5.57966137f, -6.08182287f, -6.4237361f, -6.63587379f, -6.60395145f, -6.99704218f, -7.26061678f, -7.13621283f, -6.92309761f, -6.54043388f, -6.0931859f, -5.95154953f, -5.92886162f, -5.60794735f, -5.39521217f, -5.24937916f, -4.93126583f, -5.03314447f, -6.35518694f, -5.97401428f, -4.61507177f, -3.88595009f, -3.10539627f, -1.12507141f, + -4.58263206f, -4.23551846f, -3.71995449f, -3.9303925f, -4.22284889f, -4.90389252f, -4.90515423f, -4.35046101f, -4.93061686f, -5.62805653f, -7.13111687f, -8.04961014f, -8.61973f, -8.91128826f, -9.59987259f, -9.77626991f, -9.34930134f, -8.41235256f, -7.99330997f, -7.56377172f, -7.41074753f, -7.68792772f, -7.52159262f, -7.23604727f, -6.43461895f, -6.30558538f, -7.42862511f, -6.57217264f, -4.36673212f, -3.42791319f, -2.78279519f, -1.13899291f, + -4.05928659f, -3.63066411f, -3.08045626f, -3.49022269f, -3.33089471f, -3.98461342f, -3.60927105f, -3.47735429f, -4.22189903f, -5.61483288f, -6.73310328f, -7.82119894f, -7.76680946f, -7.81351185f, -8.53846359f, -8.85490894f, -8.87630653f, -8.05196667f, -7.37027693f, -6.48965073f, -6.0011878f, -6.49297428f, -6.87221718f, -6.6889801f, -5.67975998f, -5.48370981f, -6.48479271f, -5.99923038f, -4.15075731f, -3.24771428f, -2.38959575f, -0.802779257f, + -3.8221159f, -3.2125051f, -2.67735672f, -3.35456967f, -2.42953777f, -1.97508657f, -0.0455740131f, 0.200172856f, -1.73673642f, -4.14228773f, -6.05798674f, -6.92922974f, -6.31088972f, -5.24032164f, -5.8104291f, -6.21769142f, -6.71948385f, -6.34254694f, -5.40050459f, -3.83635306f, -2.84016895f, -3.47709227f, -4.53029394f, -4.79398346f, -4.15029287f, -4.34026718f, -5.05020094f, -4.96476984f, -3.85935163f, -3.06635952f, -2.21780515f, -0.550920606f, + -3.38425207f, -2.47040701f, -1.75725257f, -2.67789435f, -1.93510687f, -0.023562137f, 3.12235284f, 3.195858f, -0.502758205f, -3.64130497f, -4.92483091f, -5.37235212f, -4.44142771f, -3.01087427f, -2.56460142f, -3.36131048f, -4.67883253f, -4.97649288f, -4.15489054f, -3.05888772f, -2.53061557f, -2.89280939f, -3.89569187f, -3.85883617f, -3.85448074f, -3.72637963f, -4.17853975f, -3.72458243f, -3.2028439f, -2.26282644f, -1.57095635f, -0.0362351872f, + -2.86179805f, -1.77212584f, -1.01908028f, -2.22856259f, 
-2.04378486f, -0.389851034f, 2.5954473f, 3.546386f, -0.572356939f, -3.22942686f, -4.71709538f, -5.06511068f, -4.19580078f, -2.62281418f, -1.84743559f, -1.72474909f, -2.85398459f, -3.05193329f, -2.1715126f, -1.87324941f, -2.42470956f, -3.27851868f, -4.05942631f, -3.64058971f, -3.65105247f, -3.37935495f, -3.88859773f, -3.24483466f, -2.69226313f, -1.51380038f, -0.803811312f, 0.575846195f, + -2.44617772f, -1.21146309f, -0.406607807f, -1.79564178f, -2.15529561f, -1.86219978f, -0.642769337f, -0.119694829f, -3.55873179f, -6.07527542f, -7.34461832f, -7.5732069f, -5.2650032f, -2.78443551f, -2.01951551f, -2.20919466f, -3.48502755f, -3.39159703f, -2.84414029f, -3.01556158f, -4.17538118f, -4.6715436f, -4.51803017f, -3.98833418f, -4.03647232f, -3.56217432f, -4.35153055f, -3.35357046f, -2.34758973f, -0.991552889f, -0.410246134f, 0.853578329f, + -2.32879257f, -0.983750522f, -0.21862191f, -1.63332736f, -2.70467782f, -3.79070854f, -3.12105083f, -3.37172794f, -5.87286377f, -7.56662798f, -8.18826008f, -7.51929522f, -5.9531951f, -4.06868601f, -2.65765858f, -2.80148482f, -4.28907013f, -4.32930136f, -4.3640132f, -4.59029436f, -5.4193697f, -5.89368916f, -5.6321454f, -5.52998543f, -5.09114599f, -3.59506583f, -3.95068288f, -3.30025363f, -2.04802871f, -0.637728035f, -0.245602071f, 0.879402578f, + -2.35637832f, -0.938572884f, -0.137476623f, -1.41782618f, -2.65590358f, -4.25014019f, -4.0826478f, -4.17878771f, -5.6027894f, -7.31306791f, -7.89162493f, -7.03756762f, -6.09949017f, -5.60607052f, -4.94666481f, -4.39400244f, -4.67201567f, -4.2205472f, -4.38528776f, -4.6779213f, -4.83282471f, -4.84141684f, -4.65654802f, -4.24497604f, -3.85145688f, -2.74431086f, -3.78755236f, -3.00524449f, -1.81372464f, -0.552992642f, -0.150228053f, 0.944489419f, + -2.39807153f, -0.961493254f, -0.207601368f, -1.41579533f, -2.26456952f, -3.31752872f, -2.37754416f, -2.27816534f, -3.3359437f, -4.83316755f, -4.82455635f, -5.1267004f, -4.75627851f, -6.18640566f, -7.98392439f, -9.12876225f, -8.12104893f, -7.43801117f, -5.90858698f, -3.8132503f, -2.49779272f, -2.64403725f, -2.50610948f, -2.27564049f, -2.08231401f, -2.0385685f, -3.72143364f, -3.04797244f, -1.76300609f, -0.521960258f, -0.0881003886f, 0.961502016f, + -2.44038081f, -1.01705039f, -0.289608359f, -1.37090492f, -1.93311131f, -2.47754407f, -1.31518912f, -0.804416537f, -0.930097163f, -0.780354142f, -0.834263086f, -1.50460267f, -3.63839531f, -4.60880566f, -6.8964262f, -8.66131878f, -9.60757637f, -8.79116344f, -6.86388493f, -4.30527639f, -1.8283174f, -1.4908253f, -1.37629032f, -1.22827542f, -1.60703599f, -2.33176303f, -3.86254454f, -2.99731207f, -1.65976918f, -0.461797535f, 0.0194968097f, 0.998998225f, + -2.46240711f, -1.03391945f, -0.35371244f, -1.40552509f, -1.92847848f, -2.80441093f, -1.44593406f, -0.652132452f, -0.4637236f, -0.377687186f, -0.223660469f, -1.29031694f, -2.68966746f, -3.15799189f, -3.18843555f, -4.4910984f, -6.69606543f, -8.33802032f, -8.19927311f, -6.32680511f, -3.98862648f, -2.22264123f, -1.55090904f, -1.1854068f, -1.3106786f, -1.90384912f, -3.67234707f, -2.88272882f, -1.53641987f, -0.362456888f, 0.0893754214f, 1.02051163f, + -2.48206067f, -1.02961993f, -0.368244141f, -1.42910719f, -1.93446803f, -2.968822f, -1.83339584f, -1.077631f, -1.20465982f, -1.57803464f, -1.41360343f, -1.76699162f, -2.31551576f, -2.05016136f, -0.0285568349f, 1.02111804f, -1.09839404f, -3.57055283f, -6.42463684f, -6.38169003f, -6.04913425f, -3.92720795f, -2.87601185f, -2.27725315f, -1.91104662f, -1.94828415f, -3.19035602f, -2.59298229f, -1.44278193f, -0.386298746f, 0.0836858153f, 0.999346912f, + 
-2.48712945f, -1.01729345f, -0.474304944f, -1.67669559f, -2.10705042f, -3.42592764f, -2.34152699f, -1.83562672f, -1.90750253f, -2.23259664f, -1.80318487f, -2.05461431f, -2.2218473f, -1.68138134f, 1.89481843f, 4.749331f, 4.48664188f, 1.76011801f, -2.80741739f, -5.01609373f, -6.86733389f, -4.95238161f, -3.11620855f, -2.35959673f, -2.14903998f, -2.22679043f, -3.25020576f, -2.55579758f, -1.45884585f, -0.450649738f, 0.0580532737f, 0.980433941f, + -2.5185082f, -1.06924045f, -0.577468932f, -1.7359041f, -2.2522819f, -3.44346404f, -2.27338934f, -1.50737846f, -1.4048748f, -1.7626915f, -1.77618313f, -2.55145335f, -2.72144723f, -1.09168231f, 3.47705436f, 7.27473307f, 7.77128983f, 4.76851988f, -0.231550142f, -4.59473372f, -7.91270256f, -5.9186945f, -3.17887211f, -1.95729899f, -2.12510371f, -2.66853952f, -3.79930806f, -2.93926597f, -1.47657454f, -0.51107496f, 0.0374269597f, 0.9673509f, + -2.57245374f, -1.16771162f, -0.721676588f, -1.80981266f, -2.38730669f, -3.6522367f, -2.01576495f, -0.8515746f, -0.121799529f, -1.13752592f, -1.98465598f, -3.21510339f, -3.90218043f, -1.90408611f, 3.62870288f, 9.53359127f, 12.2969809f, 9.25624657f, 3.08819818f, -3.57391787f, -8.53378582f, -6.41586733f, -3.14953685f, -1.97396016f, -2.7328465f, -3.78186893f, -4.93579912f, -3.55470729f, -1.54245102f, -0.482002735f, 0.0237279348f, 0.970623732f, + -2.6402328f, -1.25508213f, -0.813264728f, -1.85111022f, -2.31478047f, -3.37323236f, -1.72119153f, -0.622631073f, 0.275214434f, -1.74099112f, -3.82077885f, -5.72362041f, -7.07592487f, -5.2477479f, 1.65343058f, 9.84803104f, 13.9755783f, 12.027339f, 6.53266191f, 0.243630022f, -4.9232049f, -4.36105299f, -1.71283042f, -1.22028506f, -2.47615337f, -3.96648002f, -4.9211669f, -3.52139068f, -1.58175361f, -0.453389883f, 0.0172070079f, 0.974586606f, + -2.69985747f, -1.30426204f, -0.813042939f, -1.84938121f, -2.33455706f, -3.75564861f, -2.54689479f, -2.26757884f, -1.79824364f, -2.93493605f, -4.15734148f, -4.67264462f, -5.97829533f, -6.07628202f, -0.634435117f, 7.86048698f, 13.385828f, 13.8827438f, 9.38942051f, 3.89634967f, -1.39140749f, -2.39509726f, -1.62092125f, -1.5939455f, -2.25631547f, -3.52288079f, -4.53593159f, -3.25450349f, -1.60031211f, -0.435814232f, 0.0219062977f, 0.986854315f, + -2.74063468f, -1.31302822f, -0.820956767f, -1.81994605f, -2.28283525f, -3.5440836f, -2.51103139f, -2.81304479f, -3.26139283f, -3.37517047f, -3.98655128f, -4.15412378f, -4.92545223f, -5.78675413f, -3.06408238f, 3.01499391f, 8.77478504f, 10.6144304f, 8.11615849f, 4.45580721f, 0.623039126f, -1.10865057f, -1.95774138f, -2.36074567f, -2.57845926f, -3.33297563f, -3.97079587f, -2.93356919f, -1.50071633f, -0.443875313f, 0.0236797072f, 0.991317093f, + -2.77299833f, -1.32691216f, -0.831916511f, -1.82886219f, -2.0734787f, -3.13335371f, -1.50032151f, -1.46733963f, -2.72959828f, -3.5253818f, -4.29566097f, -5.57419872f, -6.24431992f, -6.32591867f, -5.26826477f, -3.04502487f, 0.449693143f, 3.47979259f, 3.50362659f, 2.58046269f, 0.579684913f, -0.919588447f, -2.08200479f, -2.6678884f, -2.59757757f, -3.0013814f, -3.42182064f, -2.75994992f, -1.48684669f, -0.477065891f, 0.0327885784f, 0.994787097f, + -2.7904563f, -1.33298481f, -0.825692832f, -1.78411806f, -1.98032236f, -2.94529605f, -1.540254f, -1.03917682f, -1.87087965f, -2.15394163f, -2.24386406f, -1.56417131f, -1.79924405f, -2.09344101f, -3.65430427f, -4.66693974f, -4.27157164f, -1.08878291f, -0.221785039f, -0.0799107477f, -0.684955359f, -1.22172666f, -1.90416121f, -2.04627061f, -2.09932423f, -2.7114203f, -3.33123398f, -2.65206981f, -1.4748162f, -0.431342453f, 0.0863730982f, 
1.03362691f, + -2.80970526f, -1.32318377f, -0.788406253f, -1.62803352f, -1.83336627f, -2.71299958f, -1.29830825f, -0.898415565f, -1.27306414f, -1.4642626f, -1.53942132f, -0.524312437f, -1.13679814f, -2.15964532f, -3.81581545f, -6.19301414f, -6.9342289f, -4.5518117f, -4.05187798f, -3.89661026f, -2.73003149f, -1.90081847f, -1.18712986f, -1.05476069f, -1.45352709f, -2.40461349f, -3.57806826f, -2.67894101f, -1.34701252f, -0.292546421f, 0.223820776f, 1.15115368f, + -2.83941913f, -1.31946158f, -0.752137005f, -1.59541857f, -1.98224044f, -3.13006711f, -2.87664342f, -2.74078941f, -2.44921613f, -1.53203559f, -1.11937928f, -0.268255889f, -1.06444466f, -2.87781739f, -4.91630268f, -8.23729324f, -10.6890593f, -10.1742487f, -8.88589478f, -7.06334209f, -4.42162704f, -2.8048737f, -0.9670524f, -0.169980749f, -0.62598449f, -1.46366549f, -3.44733119f, -2.70727062f, -1.12550855f, 0.0431886837f, 0.491125584f, 1.39527845f, + -2.88625073f, -1.36332977f, -0.782323718f, -1.70872879f, -2.29862785f, -3.65832949f, -3.41763759f, -2.27270484f, -1.15727568f, -0.485867918f, -0.534794629f, -0.99851644f, -1.86469233f, -3.56163645f, -6.06065321f, -8.93986511f, -11.1936483f, -11.16537f, -9.42015839f, -7.1612606f, -4.54605007f, -3.13340139f, -1.05612564f, -0.218226328f, -0.347539067f, -0.917124569f, -3.23879743f, -2.66016054f, -1.1019274f, 0.280594468f, 0.802835882f, 1.70916617f, + -2.95432734f, -1.55732679f, -0.9671579f, -1.87740719f, -2.52375722f, -3.9269383f, -3.63090515f, -2.16633034f, -1.57592404f, -1.65385628f, -2.63003421f, -3.4876802f, -4.29189682f, -4.7487464f, -5.76429272f, -6.65200949f, -7.45039988f, -7.22736359f, -6.15258741f, -5.31453133f, -3.85754275f, -3.2067554f, -1.73008275f, -1.35701323f, -1.16924942f, -1.25322843f, -3.28507686f, -2.95321226f, -1.38456213f, 0.187379554f, 0.978641272f, 1.96348953f, + -3.11177945f, -1.80547488f, -1.13023674f, -1.9582721f, -2.37351155f, -3.67039227f, -3.1937058f, -2.27774191f, -2.11655211f, -2.92763114f, -3.51109672f, -4.43897057f, -4.60774946f, -5.22836876f, -5.26246691f, -5.41725492f, -5.64507723f, -5.44532156f, -5.25552511f, -5.40288162f, -4.75492859f, -4.50234127f, -3.85268068f, -3.71338868f, -3.31360817f, -3.09147811f, -4.54734945f, -3.58751845f, -1.86106849f, -0.0580402128f, 0.987123847f, 2.12943125f, + -3.36467028f, -2.15916252f, -1.39851403f, -2.21555972f, -2.6277256f, -3.89018989f, -3.28536391f, -2.4179709f, -2.31355095f, -3.14865518f, -3.84860849f, -4.44453287f, -4.50857449f, -4.88197565f, -4.95770359f, -5.04250717f, -4.74955845f, -4.8034606f, -4.87089396f, -5.45653677f, -5.71883726f, -5.90324974f, -5.92616558f, -5.50277519f, -5.18182898f, -5.07875252f, -6.3301487f, -4.71556807f, -2.65147376f, -0.510522306f, 0.768599629f, 2.15899801f, + -3.76996517f, -2.77193499f, -2.04029584f, -2.67725992f, -3.11456323f, -4.35716057f, -3.96405196f, -3.11866283f, -2.89303422f, -3.84127808f, -4.63507318f, -5.34559536f, -5.6741724f, -5.9913516f, -5.89291143f, -6.14835787f, -5.75908613f, -5.48700523f, -5.17146826f, -5.74538183f, -6.23743486f, -6.26235199f, -6.18846273f, -5.73266459f, -5.36256504f, -5.36837292f, -6.48104477f, -4.97722006f, -3.1608839f, -1.36612868f, -0.0857250318f, 1.66240442f, + -4.26793671f, -3.59083676f, -2.84308076f, -3.14333463f, -3.37969398f, -4.55007124f, -4.40458679f, -3.53423572f, -2.78584123f, -3.32700229f, -3.87822628f, -5.09642506f, -6.15807199f, -6.88138437f, -7.01429605f, -7.22634697f, -7.04120684f, -6.64636993f, -5.79211712f, -5.76786995f, -5.5597887f, -5.01553154f, -4.96951723f, -4.92054939f, -4.69466639f, -4.54826736f, -5.57798719f, -4.50945187f, 
-3.42488861f, -2.2323885f, -1.17007399f, 0.706006825f, + -4.58093643f, -4.09917927f, -3.6026299f, -3.76272631f, -4.10116673f, -5.68298769f, -5.67115974f, -5.20354462f, -4.87026978f, -5.25120115f, -5.51101351f, -6.41377878f, -7.30511761f, -8.20695019f, -8.15464306f, -8.10768127f, -7.72227478f, -7.57483578f, -6.83547497f, -6.92473555f, -6.26031017f, -5.44693089f, -4.98586988f, -4.71777868f, -4.84076738f, -4.88040304f, -5.76190281f, -4.94208717f, -3.97660065f, -3.09410763f, -2.34518123f, -0.401388973f, + -4.02035284f, -4.02879238f, -4.01832962f, -4.46334934f, -5.42945766f, -7.13510704f, -7.44949913f, -7.30862284f, -7.11234093f, -7.16781139f, -7.24586773f, -7.57177401f, -7.80264711f, -7.91191673f, -7.63455296f, -7.31139612f, -7.24533272f, -7.21524429f, -7.19505501f, -7.53508186f, -7.19776154f, -6.55349255f, -6.06127691f, -5.76581764f, -6.03102398f, -6.42573166f, -7.26578999f, -6.23923731f, -5.2162056f, -4.30992317f, -3.86889744f, -2.24626088f + } + }, + { + "P4", + 1, 1, 64, 64, + {/*Intentionally 0 elements. The test does not use this element*/} + }, + { + "P3", + 1, 1, 128, 128, + {/*Intentionally 0 elements. The test does not use this element*/} + }, + { + "P2", + 1, 1, 128, 128, + {/*Intentionally 0 elements. The test does not use this element*/} + }, + { + "pool", + 1, 1, 7, 7, + {/*Intentionally 0 elements. Values not important - only layout*/} + } +}; + +memory allocate_memory(Test_index key, const engine &engine) +{ + auto ret = memory::allocate(engine, { data_types::f32, format::bfyx, { test_data[key].b, test_data[key].f, test_data[key].y, test_data[key].x } }); + set_values(ret, test_data[key].dataTMP); + return ret; +} + +TEST(pyramidROIAlign_gpu, basic_functionality) +{ + const auto& engine = get_test_engine(); + + std::vector<float> answer = + { + -5.56710863f, -4.15980053f, -3.62781334f, -4.4299016f, -4.32974339f, -4.59520054f, -5.14869022f, + -4.04856586f, -6.20199442f, -8.62770653f, -9.3613081f, -7.69766426f, -4.6893239f, -1.79761052f, + -2.1207974f, -0.0283275843f, 2.62955427f, 0.693355441f, -3.21296549f, -5.62806273f, -6.13721943f, + -3.01667213f, 1.90189886f, 9.18445969f, 11.0731812f, 5.476161f, -2.67103052f, -8.19120693f, + -5.73783922f, -2.93177485f, 5.87217808f, 11.9360819f, 10.5841255f, 4.8481946f, -0.81512779f, + -2.63171887f, -3.56354189f, -4.38874054f, -2.65824175f, 0.0660879612f, 0.36207819f, -0.571367621f, + -2.00750613f, -4.5745883f, -8.36942673f, -10.7424393f, -9.67979145f, -7.39468241f, -4.24828815f + }; + + auto boxes = allocate_memory(BOXES, engine); + auto image_meta = allocate_memory(IMAGE_META, engine); + auto P5_tensor = allocate_memory(P5, engine); + auto P4_tensor = allocate_memory(P4, engine); + auto P3_tensor = allocate_memory(P3, engine); + auto P2_tensor = allocate_memory(P2, engine); + auto pool_size = allocate_memory(POOL, engine); + + topology topo; + topo.add(input_layout(test_data[BOXES].parameter_name, boxes.get_layout())); + topo.add(input_layout(test_data[IMAGE_META].parameter_name, image_meta.get_layout())); + topo.add(input_layout(test_data[P2].parameter_name, P2_tensor.get_layout())); + topo.add(input_layout(test_data[P3].parameter_name, P3_tensor.get_layout())); + topo.add(input_layout(test_data[P4].parameter_name, P4_tensor.get_layout())); + topo.add(input_layout(test_data[P5].parameter_name, P5_tensor.get_layout())); + topo.add(input_layout(test_data[POOL].parameter_name, pool_size.get_layout())); + + topo.add(pyramid_roi_align("pyramidROIAlign", + test_data[BOXES].parameter_name, + test_data[IMAGE_META].parameter_name, +
test_data[P2].parameter_name, + test_data[P3].parameter_name, + test_data[P4].parameter_name, + test_data[P5].parameter_name, + test_data[POOL].parameter_name)); + + network net(engine, topo); + net.set_input_data(test_data[BOXES].parameter_name, boxes); + net.set_input_data(test_data[IMAGE_META].parameter_name, image_meta); + net.set_input_data(test_data[P2].parameter_name, P2_tensor); + net.set_input_data(test_data[P3].parameter_name, P3_tensor); + net.set_input_data(test_data[P4].parameter_name, P4_tensor); + net.set_input_data(test_data[P5].parameter_name, P5_tensor); + net.set_input_data(test_data[POOL].parameter_name, pool_size); + + auto outputs = net.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "pyramidROIAlign"); + + auto output_mem = outputs.at("pyramidROIAlign").get_memory(); + auto output_ptr = output_mem.pointer<float>(); + + int k = 0; + for (float val1 : output_ptr) + { + EXPECT_NEAR(val1, answer[k++], 1e-5); + } +} diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp index 5bb2857..03996ba 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp @@ -59,7 +59,7 @@ TEST(reorder_gpu_f32, basic) // b1 f1: 12 8 // - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); layout output_layout(data_types::f32, format::bfyx,{ 2,2,2,2 }); @@ -145,7 +145,7 @@ TEST(reorder_gpu_f32, basic_subtract) { // b1 f1: 10 7 // - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); layout output_layout( data_types::f32, format::bfyx, {2,2,2,2} ); @@ -234,7 +234,7 @@ TEST(reorder_gpu_f32, basic_subtract_value) { // b1 f1: 9.5 5.5 // - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); layout output_layout(data_types::f32, format::bfyx,{ 2,2,2,2 }); @@ -318,7 +318,7 @@ TEST(reorder_gpu_f16, basic_subtract_f32_output_f32) { // b1 f1: 10 7 // - engine engine; + const auto& engine = get_test_engine(); if (!engine.get_info().supports_fp16) { @@ -413,7 +413,7 @@ TEST(reorder_gpu_f16, basic_subtract_value) { // b1 f1: 9.5 5.5 // - engine engine; + const auto& engine = get_test_engine(); if (!engine.get_info().supports_fp16) { std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl; @@ -482,7 +482,7 @@ TEST(reorder_gpu, basic_convert_f16_f32_f16) { // Output is expected to contain the same value as input in range of indices from 0x0000 to 0xF801.
// - engine engine; + const auto& engine = get_test_engine(); if (!engine.get_info().supports_fp16) { @@ -562,7 +562,7 @@ TEST(reorder_gpu, basic_convert_f16_f32_f16) { TEST(reorder_gpu, basic_convert_int8) { - engine engine; + const auto& engine = get_test_engine(); layout in_layout = { type_to_data_type<float>::value,format::byxf,{ 1,1,3,3 } }; layout byte_layout = { type_to_data_type<char>::value, format::bfyx,{ 1,1,3,3 } }; std::initializer_list<float> input_f = { 1.0f, -2.5f, 3.1f, -4.0f, 5.03f, -6.99f, 7.0f, -8.0f, 9.0f }; @@ -620,7 +620,7 @@ TEST(reorder_gpu, basic_convert_uint8rgbabyxf_to_fp32_bfyx) { // const int kernel_size = 5; const int feature_size = 4; - engine engine; + const auto& engine = get_test_engine(); if (!engine.get_info().supports_fp16) { @@ -751,7 +751,7 @@ TEST(reorder_gpu_f32, basic_yxfb_to_bfyx_input_padding) // f1: b0: 5 6 b1: 1.5 5.2 // f1: b0: 7 8 b1: 12 8 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); layout output_layout(data_types::f32, format::bfyx, { 2,2,2,2 }); @@ -830,7 +830,7 @@ TEST(reorder_gpu_f32, basic_bfyx_to_yxfb_input_padding) // b1 f1: 12 8 // - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } }); layout output_layout(data_types::f32, format::yxfb, { 2,2,2,2 }); @@ -910,7 +910,34 @@ TEST(reorder_gpu_opt, basic_remove_redundant) EXPECT_TRUE(outputs.at("r2").get_memory().get_layout().format == format::yxfb); } -TEST(reorder_gpu_opt, basic_remove_redundant_due_to_implicit_reorders) +TEST(reorder_gpu_opt, remove_redundant_activation_fuse) +{ + engine eng; + + memory in = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{ 1, 1, 2, 1 } }); + set_values(in, { -1.0f, -1.0f }); + memory scale_mem = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{1, 1, 1, 1 } }); + set_values(scale_mem, { 2.0f }); + topology tpl{ + input_layout("in", in.get_layout()), + reorder("r1", "in", format::bfyx, data_types::f32), + activation("relu", "r1", cldnn_activation_func::activation_relu_negative_slope, {0.01f, 0.0f}), + data("scale_data", scale_mem), + scale("output", "relu", "scale_data") + }; + + build_options opts; + opts.set_option(build_option::optimize_data(true)); + + network net(eng, tpl, opts); + net.set_input_data("in", in); + auto outputs = net.execute(); + auto out_ptr = outputs.begin()->second.get_memory().pointer<float>(); + EXPECT_FLOAT_EQ(out_ptr[0], -0.02f); + EXPECT_FLOAT_EQ(out_ptr[1], -0.02f); +} + +TEST(reorder_gpu_opt, basic_do_not_remove_redundant_due_it_is_output) { engine eng; @@ -920,7 +947,7 @@ TEST(reorder_gpu_opt, basic_remove_redundant_due_to_implicit_reorders) input_layout("in", in.get_layout()), convolution("conv", "in", { "weights" }), data("weights", weights), - reorder("r1", "conv", format::bfyx, data_types::f32) //optimize data should add conversion from yxfb to bfyx and 'conv' should output data in bfyx as well (IE case) + reorder("r1", "conv", format::bfyx, data_types::f32) //reorder is output - do not optimize }; build_options opts; @@ -931,8 +958,10 @@ TEST(reorder_gpu_opt, basic_remove_redundant_due_to_implicit_reorders) auto outputs = net.execute(); auto executed_primitives = net.get_executed_primitives(); - //remove redundant reorder optimization should replace redundant reorder node with convolution - EXPECT_TRUE(executed_primitives.count("conv") == 0); + //all primitives in this test need to be executed + EXPECT_TRUE(executed_primitives.count("conv") == 1); + EXPECT_TRUE(executed_primitives.count("in") == 1); + EXPECT_TRUE(executed_primitives.count("r1") == 1); ASSERT_TRUE(outputs.count("r1") == 1); EXPECT_TRUE(outputs.at("r1").get_memory().get_layout().format == format::bfyx); }
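The hunks above and below pivot on whether graph optimization may fold a reorder node away: a reorder that feeds another primitive can be removed or fused, while a reorder that is itself a network output must stay executable. A compact way to express the check these tests share — a sketch only, assuming the clDNN test API already used above, where network::get_executed_primitives() returns a map keyed by primitive id; was_executed is a hypothetical helper, not part of clDNN:

// Sketch: a primitive folded away by build_option::optimize_data(true)
// never appears among the executed primitives, which is what these tests assert.
#include <string>

template <typename ExecutedMap>
bool was_executed(const ExecutedMap& executed_primitives, const std::string& id)
{
    return executed_primitives.count(id) == 1;
}

Expected here: was_executed(executed_primitives, "r1") is true when the reorder is the network output (hunk above) and false when it is made redundant by an implicit reorder (hunk below).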
@@ -965,6 +994,35 @@ TEST(reorder_gpu_opt, basic_remove_redundant_output_due_to_implicit_reorders) EXPECT_TRUE(outputs.at("r1").get_memory().get_layout().format == format::bfyx); } +TEST(reorder_gpu_opt, basic_remove_redundant_due_to_implicit_reorders) +{ + engine eng; + + memory in = memory::allocate(eng, { data_types::f32, format::yxfb, tensor{ 1, 2, 2, 1 } }); + memory weights = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{ 1, 2, 2, 1 } }); + topology tpl{ + input_layout("in", in.get_layout()), + convolution("conv", "in",{ "weights" }), + data("weights", weights), + reorder("r1", "conv", format::bfyx, data_types::f32), //optimize data should add conversion from yxfb to bfyx and 'conv' should output data in bfyx as well (IE case) + softmax("output", "r1") + }; + + build_options opts; + opts.set_option(build_option::optimize_data(true)); + + network net(eng, tpl, opts); + net.set_input_data("in", in); + auto outputs = net.execute(); + auto executed_primitives = net.get_executed_primitives(); + + //remove redundant reorder optimization should remove r1 node + EXPECT_TRUE(executed_primitives.count("r1") == 0); + //all primitives in this test need to be executed + ASSERT_TRUE(outputs.count("output") == 1); + EXPECT_TRUE(outputs.at("output").get_memory().get_layout().format == format::bfyx); +} + TEST(reorder_gpu_opt, non_trivial_remove_redundant) { engine eng; @@ -987,7 +1045,7 @@ TEST(reorder_gpu_opt, non_trivial_remove_redundant) ASSERT_TRUE(executed_primitives.count("in") == 1); //ASSERT_TRUE(all_primitives.at("r1") == "_optimized_"); - EXPECT_TRUE(executed_primitives.at("in") == outputs.at("r1").get_event()); + EXPECT_TRUE(executed_primitives.at("in") != outputs.at("r1").get_event()); ASSERT_TRUE(outputs.count("r1") == 1); EXPECT_TRUE(outputs.at("r1").get_memory().get_layout().format == format::bfyx); } @@ -1129,7 +1187,7 @@ TEST(reorder_gpu_opt, mean_mul_val_float_to_int) TEST(reorder_gpu_i32, basic) { // Test for converting data types f32->i32 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } }); layout output_layout(data_types::i32, format::bfyx, { 2,2,2,2 }); @@ -1170,7 +1228,7 @@ TEST(reorder_gpu_i32, basic) TEST(reorder_gpu_i64, basic) { // Test for converting data types f32->i64 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } }); layout output_layout(data_types::i64, format::bfyx, { 2,2,2,2 }); @@ -1232,6 +1290,8 @@ public: static std::vector> generate_specific_test_params() { generic_test::generate_generic_test_params(all_generic_params); + + const auto data_types = test_data_types(); for (const auto& test_param : all_generic_params) { @@ -1239,7 +1299,7 @@ public: std::vector<cldnn::layout> output_layouts = {}; - for (const auto& dt : data_types) { for (const auto& fmt : generic_test::test_input_formats) { @@ -1280,7 +1340,7 @@ public: assert(mean == ""); assert(subtract_per_feature.size() == 0); - auto output =
memory::allocate(engine, cldnn::layout(*reorder->output_data_type, inputs[0].get_layout().format, inputs[0].get_layout().size)); cldnn::pointer input_mem = inputs[0].pointer(); cldnn::pointer output_mem = output.pointer(); @@ -1299,7 +1359,7 @@ public: { if (generic_params->data_type == data_types::f32) { - if (((cldnn::reorder*)layer_params)->output_data_type == data_types::f32) + if (*layer_params->output_data_type == data_types::f32) { return generate_reference_typed(inputs); } @@ -1310,7 +1370,7 @@ public: } else { - if (((cldnn::reorder*)layer_params)->output_data_type == data_types::f32) + if (*layer_params->output_data_type == data_types::f32) { return generate_reference_typed(inputs); } diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/reshape_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/reshape_gpu_test.cpp index b7a6852..d75d9ee 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/reshape_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/reshape_gpu_test.cpp @@ -43,7 +43,7 @@ void verify_int(const int32_t &output_value, const int32_t &value) template void generic_reshape_test(format fmt, tensor const& input_size, tensor const& reshape_size, bool in_place, padding const& input_padd = padding(), padding const& output_padd = padding()) { - engine engine; + const auto& engine = get_test_engine(); //allocate input memory auto data_type = data_types::f32; @@ -501,7 +501,7 @@ TEST(reshape_gpu_f32, multiple_users_with_reorder) { // b1f0: 0.0 // b1f1: 4.0 - engine engine; + const auto& engine = get_test_engine(); auto batch_num = 2; auto feature_num = 2; auto x_size = 1; @@ -536,4 +536,49 @@ TEST(reshape_gpu_f32, multiple_users_with_reorder) { for (size_t i = 0; i < out2.size(); i++) EXPECT_EQ(output_ptr_2[i], out2[i]); -} \ No newline at end of file +} + +TEST(reshape_gpu_f32, calc_output_shape) { + + // INPUT(bfyx,2x2x1x1) -- RESHAPE(1, 1, 0, -1) + + // Input: + // b0f0: -1.0 + // b0f1: 2.0 + // b1f0: -3.0 + // b1f1: 4.0 + // + // output_shape (1, 1, 1, 4) + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 1, 1 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(reshape("reshape", "input", tensor(1, 1, 0, -1))); + + set_values(input, { -1.f, 2.f, -3.f, 4.f }); + + network network(engine, topology); + network.set_input_data("input", input); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "reshape"); + + auto output = outputs.at("reshape").get_memory(); + + EXPECT_TRUE(output.get_layout().data_type == input.get_layout().data_type); + EXPECT_TRUE(output.get_layout().format == input.get_layout().format); + + ASSERT_TRUE(output.get_layout().size == tensor(1, 1, 1, 4)); + + float answers[4] = { -1.f, 2.f, -3.f, 4.f }; + + auto output_ptr = output.pointer(); + for (int i = 0; i < 4; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/reverse_sequence_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/reverse_sequence_gpu_test.cpp new file mode 100644 index 0000000..441b558 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/reverse_sequence_gpu_test.cpp @@ -0,0 +1,580 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in 
compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#include + +#include +#include +#include +#include +#include + +#include +#include + +using namespace cldnn; +using namespace ::tests; + +TEST(reverese_sequence_gpu_test, fp32_d2_2_ba1_sa0) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 1, 1 } }); + auto seq_lengths = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 1, 1 } }); + size_t batch_axis = 1; + size_t seq_axis = 0; + + set_values(input, { + 0.0f, 1.0f, 2.0f, 3.0f + }); + + set_values(seq_lengths, { + 1.0f, 2.0f + }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("seq_lengths", seq_lengths.get_layout())); + topology.add( + reverse_sequence("reverse_sequence", "input", "seq_lengths", seq_axis, batch_axis) + ); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("seq_lengths", seq_lengths); + + auto outputs = network.execute(); + + auto output = outputs.at("reverse_sequence").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 0.0f, 3.0f, 2.0f, 1.0f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} + +TEST(reverese_sequence_gpu_test, fp32_d3_3_3_ba0_sa1) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 3, 1, 3 } }); + auto seq_lengths = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 1, 1, 1 } }); + size_t batch_axis = 0; + size_t seq_axis = 1; + + set_values(input, { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, + 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f + }); + + set_values(seq_lengths, { + 2.0f, 2.0f, 2.0f + }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("seq_lengths", seq_lengths.get_layout())); + topology.add( + reverse_sequence("reverse_sequence", "input", "seq_lengths", seq_axis, batch_axis) + ); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("seq_lengths", seq_lengths); + + auto outputs = network.execute(); + + auto output = outputs.at("reverse_sequence").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 3.0f, 4.0f, 5.0f, 0.0f, 1.0f, 2.0f, 6.0f, 7.0f, 8.0f, + 12.0f, 13.0f, 14.0f, 9.0f, 10.0f, 11.0f, 15.0f, 16.0f, 17.0f, + 21.0f, 22.0f, 23.0f, 18.0f, 19.0f, 20.0f, 24.0f, 25.0f, 26.0f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} + +TEST(reverese_sequence_gpu_test, fp32_d3_3_3_ba2_sa0) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 3, 1, 3 } }); + auto seq_lengths = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 1, 1, 1 
} }); + size_t batch_axis = 2; + size_t seq_axis = 0; + + set_values(input, { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, + 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f + }); + + set_values(seq_lengths, { + 2.0f, 2.0f, 2.0f + }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("seq_lengths", seq_lengths.get_layout())); + topology.add( + reverse_sequence("reverse_sequence", "input", "seq_lengths", seq_axis, batch_axis) + ); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("seq_lengths", seq_lengths); + + auto outputs = network.execute(); + + auto output = outputs.at("reverse_sequence").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, + 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} + +TEST(reverese_sequence_gpu_test, fp32_d2_2_3_2ba0_sa3) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 3 } }); + auto seq_lengths = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 1, 1 } }); + size_t batch_axis = 0; + size_t seq_axis = 3; + + set_values(input, { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, + 20.0f, 21.0f, 22.0f, 23.0f + }); + + set_values(seq_lengths, { + 1.0f, 2.0f + }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("seq_lengths", seq_lengths.get_layout())); + topology.add( + reverse_sequence("reverse_sequence", "input", "seq_lengths", seq_axis, batch_axis) + ); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("seq_lengths", seq_lengths); + + auto outputs = network.execute(); + + auto output = outputs.at("reverse_sequence").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, + 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, + 13.0f, 12.0f, 15.0f, 14.0f, 17.0f, 16.0f, + 19.0f, 18.0f, 21.0f, 20.0f, 23.0f, 22.0f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} + +TEST(reverese_sequence_gpu_test, fp32_d2_2_3_2ba0_sa2) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 3 } }); + auto seq_lengths = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 1, 1 } }); + size_t batch_axis = 0; + size_t seq_axis = 2; + + set_values(input, { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, + 20.0f, 21.0f, 22.0f, 23.0f + }); + + set_values(seq_lengths, { + 2.0f, 2.0f + }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("seq_lengths", seq_lengths.get_layout())); + topology.add( + reverse_sequence("reverse_sequence", "input", "seq_lengths", seq_axis, batch_axis) + ); + + network network(engine, topology); + + network.set_input_data("input", input); + 
network.set_input_data("seq_lengths", seq_lengths); + + auto outputs = network.execute(); + + auto output = outputs.at("reverse_sequence").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 2.0f, 3.0f, 0.0f, 1.0f, 4.0f, 5.0f, + 8.0f, 9.0f, 6.0f, 7.0f, 10.0f, 11.0f, + 14.0f, 15.0f, 12.0f, 13.0f, 16.0f, 17.0f, + 20.0f, 21.0f, 18.0f, 19.0f, 22.0f, 23.0f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} + +TEST(reverese_sequence_gpu_test, fp32_d2_2_3_2ba2_sa0) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 3 } }); + auto seq_lengths = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 1, 1, 1 } }); + size_t batch_axis = 2; + size_t seq_axis = 0; + + set_values(input, { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, + 20.0f, 21.0f, 22.0f, 23.0f + }); + + set_values(seq_lengths, { + 1.0f, 1.0f, 2.0f + }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("seq_lengths", seq_lengths.get_layout())); + topology.add( + reverse_sequence("reverse_sequence", "input", "seq_lengths", seq_axis, batch_axis) + ); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("seq_lengths", seq_lengths); + + auto outputs = network.execute(); + + auto output = outputs.at("reverse_sequence").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 0.0f, 1.0f, 2.0f, 3.0f, 16.0f, 17.0f, + 6.0f, 7.0f, 8.0f, 9.0f, 22.0f, 23.0f, + 12.0f, 13.0f, 14.0f, 15.0f, 4.0f, 5.0f, + 18.0f, 19.0f, 20.0f, 21.0f, 10.0f, 11.0f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} + +TEST(reverese_sequence_gpu_test, fp16_d2_2_ba1_sa0) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 2, 1, 1 } }); + auto seq_lengths = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 1, 1 } }); + size_t batch_axis = 1; + size_t seq_axis = 0; + + set_values(input, { + FLOAT16(0.0f), FLOAT16(1.0f), FLOAT16(2.0f), FLOAT16(3.0f) + }); + + set_values(seq_lengths, { + 1.0f, 2.0f + }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("seq_lengths", seq_lengths.get_layout())); + topology.add( + reverse_sequence("reverse_sequence", "input", "seq_lengths", seq_axis, batch_axis) + ); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("seq_lengths", seq_lengths); + + auto outputs = network.execute(); + + auto output = outputs.at("reverse_sequence").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 0.0f, 3.0f, 2.0f, 1.0f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i])); + } +} + +TEST(reverese_sequence_gpu_test, fp16_d3_3_3_ba0_sa1) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f16, format::bfyx, { 3, 3, 1, 3 } }); + auto seq_lengths = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 1, 1, 1 } }); + size_t batch_axis = 0; + size_t seq_axis = 1; + + set_values(input, { + FLOAT16(0.0f), FLOAT16(1.0f), FLOAT16(2.0f), FLOAT16(3.0f), FLOAT16(4.0f), 
FLOAT16(5.0f), FLOAT16(6.0f), FLOAT16(7.0f), FLOAT16(8.0f), FLOAT16(9.0f), + FLOAT16(10.0f), FLOAT16(11.0f), FLOAT16(12.0f), FLOAT16(13.0f), FLOAT16(14.0f), FLOAT16(15.0f), FLOAT16(16.0f), FLOAT16(17.0f), FLOAT16(18.0f), FLOAT16(19.0f), + FLOAT16(20.0f), FLOAT16(21.0f), FLOAT16(22.0f), FLOAT16(23.0f), FLOAT16(24.0f), FLOAT16(25.0f), FLOAT16(26.0f) + }); + + set_values(seq_lengths, { + 2.0f, 2.0f, 2.0f + }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("seq_lengths", seq_lengths.get_layout())); + topology.add( + reverse_sequence("reverse_sequence", "input", "seq_lengths", seq_axis, batch_axis) + ); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("seq_lengths", seq_lengths); + + auto outputs = network.execute(); + + auto output = outputs.at("reverse_sequence").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 3.0f, 4.0f, 5.0f, 0.0f, 1.0f, 2.0f, 6.0f, 7.0f, 8.0f, + 12.0f, 13.0f, 14.0f, 9.0f, 10.0f, 11.0f, 15.0f, 16.0f, 17.0f, + 21.0f, 22.0f, 23.0f, 18.0f, 19.0f, 20.0f, 24.0f, 25.0f, 26.0f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i])); + } +} + +TEST(reverese_sequence_gpu_test, fp16_d3_3_3_ba2_sa0) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f16, format::bfyx, { 3, 3, 1, 3 } }); + auto seq_lengths = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 1, 1, 1 } }); + size_t batch_axis = 2; + size_t seq_axis = 0; + + set_values(input, { + FLOAT16(0.0f), FLOAT16(1.0f), FLOAT16(2.0f), FLOAT16(3.0f), FLOAT16(4.0f), FLOAT16(5.0f), FLOAT16(6.0f), FLOAT16(7.0f), FLOAT16(8.0f), FLOAT16(9.0f), + FLOAT16(10.0f), FLOAT16(11.0f), FLOAT16(12.0f), FLOAT16(13.0f), FLOAT16(14.0f), FLOAT16(15.0f), FLOAT16(16.0f), FLOAT16(17.0f), FLOAT16(18.0f), FLOAT16(19.0f), + FLOAT16(20.0f), FLOAT16(21.0f), FLOAT16(22.0f), FLOAT16(23.0f), FLOAT16(24.0f), FLOAT16(25.0f), FLOAT16(26.0f) + }); + + set_values(seq_lengths, { + 2.0f, 2.0f, 2.0f + }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("seq_lengths", seq_lengths.get_layout())); + topology.add( + reverse_sequence("reverse_sequence", "input", "seq_lengths", seq_axis, batch_axis) + ); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("seq_lengths", seq_lengths); + + auto outputs = network.execute(); + + auto output = outputs.at("reverse_sequence").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, + 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i])); + } +} + +TEST(reverese_sequence_gpu_test, fp16_d2_2_3_2ba0_sa3) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 2, 2, 3 } }); + auto seq_lengths = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 1, 1 } }); + size_t batch_axis = 0; + size_t seq_axis = 3; + + set_values(input, { + FLOAT16(0.0f), FLOAT16( 1.0f), FLOAT16( 2.0f), FLOAT16( 3.0f), FLOAT16( 4.0f), FLOAT16( 5.0f), FLOAT16( 6.0f), FLOAT16( 7.0f), FLOAT16( 8.0f), FLOAT16( 9.0f), + FLOAT16(10.0f), 
FLOAT16( 11.0f), FLOAT16( 12.0f), FLOAT16( 13.0f), FLOAT16( 14.0f), FLOAT16( 15.0f), FLOAT16( 16.0f), FLOAT16( 17.0f), FLOAT16( 18.0f), FLOAT16( 19.0f), + FLOAT16(20.0f), FLOAT16( 21.0f), FLOAT16( 22.0f), FLOAT16( 23.0f) + }); + + set_values(seq_lengths, { + 1.0f, 2.0f + }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("seq_lengths", seq_lengths.get_layout())); + topology.add( + reverse_sequence("reverse_sequence", "input", "seq_lengths", seq_axis, batch_axis) + ); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("seq_lengths", seq_lengths); + + auto outputs = network.execute(); + + auto output = outputs.at("reverse_sequence").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, + 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, + 13.0f, 12.0f, 15.0f, 14.0f, 17.0f, 16.0f, + 19.0f, 18.0f, 21.0f, 20.0f, 23.0f, 22.0f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i])); + } +} + +TEST(reverese_sequence_gpu_test, fp16_d2_2_3_2ba0_sa2) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 2, 2, 3 } }); + auto seq_lengths = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 1, 1 } }); + size_t batch_axis = 0; + size_t seq_axis = 2; + + set_values(input, { + FLOAT16(0.0f), FLOAT16(1.0f), FLOAT16(2.0f), FLOAT16(3.0f), FLOAT16(4.0f), FLOAT16(5.0f), FLOAT16(6.0f), FLOAT16(7.0f), FLOAT16(8.0f), FLOAT16(9.0f), + FLOAT16(10.0f), FLOAT16(11.0f), FLOAT16(12.0f), FLOAT16(13.0f), FLOAT16(14.0f), FLOAT16(15.0f), FLOAT16(16.0f), FLOAT16(17.0f), FLOAT16(18.0f), FLOAT16(19.0f), + FLOAT16(20.0f), FLOAT16(21.0f), FLOAT16(22.0f), FLOAT16(23.0f) + }); + + set_values(seq_lengths, { + 2.0f, 2.0f + }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("seq_lengths", seq_lengths.get_layout())); + topology.add( + reverse_sequence("reverse_sequence", "input", "seq_lengths", seq_axis, batch_axis) + ); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("seq_lengths", seq_lengths); + + auto outputs = network.execute(); + + auto output = outputs.at("reverse_sequence").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 2.0f, 3.0f, 0.0f, 1.0f, 4.0f, 5.0f, + 8.0f, 9.0f, 6.0f, 7.0f, 10.0f, 11.0f, + 14.0f, 15.0f, 12.0f, 13.0f, 16.0f, 17.0f, + 20.0f, 21.0f, 18.0f, 19.0f, 22.0f, 23.0f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i])); + } +} + +TEST(reverese_sequence_gpu_test, fp16_d2_2_3_2ba2_sa0) { + engine engine; + + auto input = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 2, 2, 3 } }); + auto seq_lengths = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 1, 1, 1 } }); + size_t batch_axis = 2; + size_t seq_axis = 0; + + set_values(input, { + FLOAT16(0.0f), FLOAT16(1.0f), FLOAT16(2.0f), FLOAT16(3.0f), FLOAT16(4.0f), FLOAT16(5.0f), FLOAT16(6.0f), FLOAT16(7.0f), FLOAT16(8.0f), FLOAT16(9.0f), + FLOAT16(10.0f), FLOAT16(11.0f), FLOAT16(12.0f), FLOAT16(13.0f), FLOAT16(14.0f), FLOAT16(15.0f), FLOAT16(16.0f), FLOAT16(17.0f), FLOAT16(18.0f), FLOAT16(19.0f), + FLOAT16(20.0f), FLOAT16(21.0f), FLOAT16(22.0f), FLOAT16(23.0f) + }); + + 
+TEST(reverese_sequence_gpu_test, fp16_d2_2_3_2ba2_sa0) {
+    engine engine;
+
+    auto input = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 2, 2, 3 } });
+    auto seq_lengths = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 1, 1, 1 } });
+    size_t batch_axis = 2;
+    size_t seq_axis = 0;
+
+    set_values(input, {
+        FLOAT16(0.0f), FLOAT16(1.0f), FLOAT16(2.0f), FLOAT16(3.0f), FLOAT16(4.0f), FLOAT16(5.0f), FLOAT16(6.0f), FLOAT16(7.0f), FLOAT16(8.0f), FLOAT16(9.0f),
+        FLOAT16(10.0f), FLOAT16(11.0f), FLOAT16(12.0f), FLOAT16(13.0f), FLOAT16(14.0f), FLOAT16(15.0f), FLOAT16(16.0f), FLOAT16(17.0f), FLOAT16(18.0f), FLOAT16(19.0f),
+        FLOAT16(20.0f), FLOAT16(21.0f), FLOAT16(22.0f), FLOAT16(23.0f)
+    });
+
+    set_values(seq_lengths, {
+        1.0f, 1.0f, 2.0f
+    });
+
+    topology topology;
+    topology.add(input_layout("input", input.get_layout()));
+    topology.add(input_layout("seq_lengths", seq_lengths.get_layout()));
+    topology.add(
+        reverse_sequence("reverse_sequence", "input", "seq_lengths", seq_axis, batch_axis)
+    );
+
+    network network(engine, topology);
+
+    network.set_input_data("input", input);
+    network.set_input_data("seq_lengths", seq_lengths);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("reverse_sequence").get_memory();
+    auto output_ptr = output.pointer<uint16_t>();
+
+    std::vector<float> expected_results = {
+        0.0f, 1.0f, 2.0f, 3.0f, 16.0f, 17.0f,
+        6.0f, 7.0f, 8.0f, 9.0f, 22.0f, 23.0f,
+        12.0f, 13.0f, 14.0f, 15.0f, 4.0f, 5.0f,
+        18.0f, 19.0f, 20.0f, 21.0f, 10.0f, 11.0f
+    };
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
+    }
+}
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/scale_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/scale_gpu_test.cpp
index 6aecf63..f5b9a7a 100644
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/scale_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/scale_gpu_test.cpp
@@ -47,7 +47,7 @@ TEST(scale_gpu, basic_in2x3x2x2_scale_same_size) {
     // f1: b0: 1.1 1.2 1.25 b1: 1.3 1.4 1.5
     // f1: b0: 1.6 1.7 1.75 b1: 1.8 1.9 2
 
-    engine engine;
+    const auto& engine = get_test_engine();
 
     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 3, 2 } });
     auto scale_input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 3, 2 } });
@@ -107,7 +107,7 @@ TEST(scale_gpu, basic_in2x3x2x2_scale_same_size_bfyx) {
     // f1: b0: 1.1 1.2 1.25 b1: 1.3 1.4 1.5
     // f1: b0: 1.6 1.7 1.75 b1: 1.8 1.9 2
 
-    engine engine;
+    const auto& engine = get_test_engine();
 
     auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 3, 2 } });
     auto scale_input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 3, 2 } });
@@ -165,7 +165,7 @@ TEST(scale_gpu, basic_in2x3x2x2_scale_same_size_scale_bfyx) {
     // f1: b0: 1.1 1.2 1.25 b1: 1.3 1.4 1.5
     // f1: b0: 1.6 1.7 1.75 b1: 1.8 1.9 2
 
-    engine engine;
+    const auto& engine = get_test_engine();
 
     auto batch_num = 2;
     auto feature_num = 2;
@@ -243,7 +243,7 @@ TEST(scale_gpu, basic_in2x3x2x2_scale_same_size_bias_term) {
     // f1: b0: 3.1 3.2 3.25 b1: 3.3 3.4 3.5
     // f1: b0: 4.6 4.7 4.75 b1: 4.8 4.9 4
 
-    engine engine;
+    const auto& engine = get_test_engine();
 
     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 3, 2 } });
     auto scale_input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 3, 2 } });
@@ -313,7 +313,7 @@ TEST(scale_gpu, basic_in2x3x2x2_scale_scalar) {
     // Scale:
     // 0.1 0.2
 
-    engine engine;
+    const auto& engine = get_test_engine();
 
     auto batch_num = 2;
     auto feature_num = 2;
@@ -378,7 +378,7 @@ TEST(scale_gpu, basic_in2x3x2x2_scale_y) {
     // Scale:
     // 0.1 0.2
 
-    engine engine;
+    const auto& engine = get_test_engine();
 
     auto batch_num = 2;
     auto feature_num = 2;
@@ -445,7 +445,7 @@ TEST(scale_gpu, basic_in2x3x2x2_scale_fb) {
     // f0b0: 0.1 f0b1: 0.2
     // f1b0: 0.5 f1b1: 2.0
 
-    engine engine;
+    const auto& engine = get_test_engine();
 
     auto batch_num = 2;
     auto feature_num = 2;
@@ -511,7 +511,7 @@ TEST(scale_gpu, basic_in2x3x2x2_scale_f) {
     // Scale: per feature
     // f0bx: 0.1 f1bx: 0.2
 
-    engine engine;
+    const auto& engine = get_test_engine();
 
     auto batch_num = 2;
     auto feature_num = 2;
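Every scale_gpu variant touched here checks the same elementwise contract: output = input * scale_input, with a per-element bias added afterwards when the bias input is provided (the bias_term test), and with the scale tensor broadcast across every dimension where its extent is 1 (scalar, per-feature, per-y, per-fb, and so on, as the test names spell out). A small model of the per-feature case from basic_in2x3x2x2_scale_f, using assumed bfyx-style flat indexing purely for illustration:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main() {
        const size_t batches = 2, features = 2, spatial = 6;  // y*x = 2*3
        std::vector<float> scale = { 0.1f, 0.2f };            // f0, f1
        std::vector<float> in(batches * features * spatial, 3.0f);
        std::vector<float> out(in.size());
        for (size_t b = 0; b < batches; ++b)
            for (size_t f = 0; f < features; ++f)
                for (size_t s = 0; s < spatial; ++s) {
                    size_t i = (b * features + f) * spatial + s;
                    out[i] = in[i] * scale[f];                // broadcast over b, y, x
                }
        std::cout << out[0] << " " << out[spatial] << "\n";   // 0.3 0.6
    }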
@@ -578,7 +578,7 @@ TEST(scale_gpu, basic_in2x3x2x2_scale_x) {
     // Scale:
     // 0.1 0.2 0.25
 
-    engine engine;
+    const auto& engine = get_test_engine();
 
     auto batch_num = 2;
     auto feature_num = 2;
@@ -646,7 +646,7 @@ TEST(scale_gpu, basic_in2x3x2x2_scale_xy) {
     // f0: 0.1 0.2 0.25
     // f0: 0.6 0.7 0.75
 
-    engine engine;
+    const auto& engine = get_test_engine();
 
     auto batch_num = 2;
     auto feature_num = 2;
@@ -719,7 +719,7 @@ TEST(scale_gpu, basic_in2x3x2x2_scale_batch1) {
     // f1: b0: 1.1 1.2 1.25
     // f1: b0: 1.6 1.7 1.75
 
-    engine engine;
+    const auto& engine = get_test_engine();
 
     auto batch_num = 2;
     auto feature_num = 2;
@@ -793,7 +793,7 @@ TEST(scale_gpu, basic_in2x3_scale_same_size_bx) {
     // b0: -0.1 3.2 7
     // b1: 0 1 -1
 
-    engine engine;
+    const auto& engine = get_test_engine();
 
     auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 3, 1 } });
     auto scale_input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 3, 1 } });
@@ -857,7 +857,7 @@ TEST(scale_gpu, basic_in2x3_scale_same_size_xb) {
     // x0: -0.1 3.2 7
     // x1: 0 1 -1
 
-    engine engine;
+    const auto& engine = get_test_engine();
 
     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 3, 1, 2, 1 } });
     auto scale_input = memory::allocate(engine, { data_types::f32, format::yxfb, { 3, 1, 2, 1 } });
@@ -919,7 +919,7 @@ TEST(scale_gpu, basic_in2x3_scale_single_value_bx) {
     // Bias:
     // -0.1
 
-    engine engine;
+    const auto& engine = get_test_engine();
 
     auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 3, 1 } });
     auto scale_input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
@@ -979,7 +979,7 @@ TEST(scale_gpu, basic_in2x3_scale_single_value_xb) {
     // Bias:
     // -0.1
 
-    engine engine;
+    const auto& engine = get_test_engine();
 
     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 3, 1, 2, 1 } });
     auto scale_input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 1, 1 } });
@@ -1036,7 +1036,7 @@ TEST(scale_gpu, basic_in2x3_scale_same_size_no_bias_bx) {
     // b0: 3.1 0.2 0.17
     // b1: 10 -3 1
 
-    engine engine;
+    const auto& engine = get_test_engine();
 
     auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 3, 1 } });
     auto scale_input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 3, 1 } });
@@ -1086,7 +1086,7 @@ TEST(scale_gpu, basic_in2x3_scale_same_size_no_bias_xb) {
     // x0: 3.1 0.2 0.17
     // x1: 10 -3 1
 
-    engine engine;
+    const auto& engine = get_test_engine();
 
     auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 3, 1, 2, 1 } });
     auto scale_input = memory::allocate(engine, { data_types::f32, format::yxfb, { 3, 1, 2, 1 } });
@@ -1139,7 +1139,7 @@ TEST(scale_gpu, basic_in2x3x2x2_scale_yxfb_bfyx_same_size_padding) {
     // 0.1 0.2
     // 0.6 0.5
 
-    engine engine;
+    const auto& engine = get_test_engine();
 
     std::vector<format> formats_to_test = { format::yxfb , format::bfyx };
     for (std::vector<format>::iterator it = formats_to_test.begin(); it != formats_to_test.end(); ++it)
@@ -1204,7 +1204,7 @@ static network setup_scale_network(
     bool pass_bias //TODO: a WA for lack of std::optional bias
 )
 {
-    engine engine;
+    const auto& engine = get_test_engine();
 
     topology topology;
     auto input_mem = memory::allocate(engine, { dt, f, input_tensor });
@@ -1327,7 +1327,9 @@ public:
 
         std::vector all_generic_params;
 
-        for (cldnn::data_types dt : test_data_types())
+        auto data_types = test_data_types();
+
+        for (cldnn::data_types dt : data_types)
         for (tensor & t : test_input_sizes)
         {
             std::vector> attempted_dims;
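The change repeated across scale_gpu_test.cpp (and the test files that follow) swaps each test's private engine engine; for a shared const auto& engine = get_test_engine();. The helper's definition sits outside this excerpt; assuming the intent is to pay the OpenCL context construction cost once per test binary rather than once per TEST, a plausible shape for it is a function-local static:

    #include <iostream>

    struct engine {                        // stand-in for cldnn::engine
        engine() { std::cout << "engine constructed\n"; }
    };

    // Assumed shape of tests::get_test_engine(): build one engine lazily on
    // first use and hand every caller a reference to that same instance.
    const engine& get_test_engine() {
        static engine instance;            // constructed on the first call only
        return instance;
    }

    int main() {
        const auto& e1 = get_test_engine();
        const auto& e2 = get_test_engine();
        std::cout << (&e1 == &e2) << "\n"; // 1: both callers see one engine
    }

Under that reading, returning a const reference also keeps individual tests from reconfiguring the shared instance.
diff --git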
a/inference-engine/thirdparty/clDNN/tests/test_cases/scale_grad_input_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/scale_grad_input_test.cpp index 8b9a22c..c7057d5 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/scale_grad_input_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/scale_grad_input_test.cpp @@ -46,7 +46,7 @@ TEST(scale_grad_input_gpu, basic_in2x3x2x2_scale_same_size) { // f1: b0: 1.1 1.2 1.25 b1: 1.3 1.4 1.5 // f1: b0: 1.6 1.7 1.75 b1: 1.8 1.9 2 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 3, 2 } }); auto scale_input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 3, 2 } }); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/scale_grad_weights_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/scale_grad_weights_test.cpp index 79d7501..680c68e 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/scale_grad_weights_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/scale_grad_weights_test.cpp @@ -52,7 +52,7 @@ TEST(scale_grad_weights_gpu, basic_in2x3x2x2) { // f0: 0.1 // f1: 0.6 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 3, 2 } }); auto grad_input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 3, 2 } }); @@ -137,7 +137,7 @@ TEST(scale_grad_weights_gpu, basic_in2x3x2x2_bias) { // f0: 1 // f1: 0.5 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } }); auto grad_input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } }); @@ -237,7 +237,7 @@ TEST(scale_grad_weights_gpu, basic_in2x3x2x2_bias_momentum) { // f0: 1 // f1: 0.5 - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } }); auto grad_input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 3, 2 } }); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/select_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/select_gpu_test.cpp index 228664b..abd2cff 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/select_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/select_gpu_test.cpp @@ -29,17 +29,17 @@ using namespace tests; // select_gpu_f32 TEST(select_gpu_f32, select_basic) { - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); topology topology; topology.add(input_layout("input", input.get_layout())); topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); set_values(input, { 1.f, 0.f, 5.f, 1.5f, @@ -54,17 +54,17 @@ TEST(select_gpu_f32, select_basic) { 15.f, 17.f, 8.f, 10.f, -2.f, 6.5f, -0.5f, -2.5f }); - set_values(mask, { 
- 0.f, 0.f, 0.f, 0.f, - 1.f, 1.f, 1.f, 1.f, - 0.f, 1.f, 0.f, 1.f, - 1.f, 0.f, 1.f, 0.f }); + set_values(mask, { + 0.f, 0.f, 0.f, 0.f, + 1.f, 1.f, 1.f, 1.f, + 0.f, 1.f, 0.f, 1.f, + 1.f, 0.f, 1.f, 0.f }); network network(engine, topology); network.set_input_data("input", input); network.set_input_data("input2", input2); - network.set_input_data("mask", mask); + network.set_input_data("mask", mask); auto outputs = network.execute(); auto output = outputs.at("select").get_memory(); @@ -83,1137 +83,1137 @@ TEST(select_gpu_f32, select_basic) { } TEST(select_gpu_f32, select_basic_negative) { - engine engine; - - auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); - - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); - - set_values(input, { - 1.f, 0.f, 5.f, 1.5f, - 2.f, 0.f, 6.f, 5.2f, - 3.f, 0.5f, 7.f, 12.f, - 4.f, -0.5f, 8.f, 8.f - }); - - set_values(input2, { - 0.5f, 2.5f, 0.5f, 2.5f, - 5.f, 7.f, 2.f, 4.f, - 15.f, 17.f, 8.f, 10.f, - -2.f, 6.5f, -0.5f, -2.5f }); - - set_values(mask, { - -0.f, -0.f, -0.f, -0.f, - -1.f, -1.f, -1.f, -1.f, - -0.f, -1.f, -0.f, -1.f, - -1.f, -0.f, -1.f, -0.f }); - - network network(engine, topology); - - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); - - auto output = outputs.at("select").get_memory(); - - float answers[16] = { 0.5f, 2.5f, 0.5f, 2.5f, - 2.f, 0.f, 6.f, 5.2f, - 15.f, 0.5f, 8.f, 12.f, - 4.f, 6.5f, 8.f, -2.5f }; - - auto output_ptr = output.pointer(); - - for (int i = 0; i < 16; i++) - { - EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); - } + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 8.f + }); + + set_values(input2, { + 0.5f, 2.5f, 0.5f, 2.5f, + 5.f, 7.f, 2.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + -2.f, 6.5f, -0.5f, -2.5f }); + + set_values(mask, { + -0.f, -0.f, -0.f, -0.f, + -1.f, -1.f, -1.f, -1.f, + -0.f, -1.f, -0.f, -1.f, + -1.f, -0.f, -1.f, -0.f }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); + + auto output = outputs.at("select").get_memory(); + + float answers[16] = { 0.5f, 2.5f, 0.5f, 2.5f, + 2.f, 0.f, 6.f, 5.2f, + 15.f, 0.5f, 8.f, 12.f, + 4.f, 6.5f, 8.f, -2.5f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } } TEST(select_gpu_f32, 
select_basic_comma) { - engine engine; - - auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); - - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); - - set_values(input, { - 1.f, 0.f, 5.f, 1.5f, - 2.f, 0.f, 6.f, 5.2f, - 3.f, 0.5f, 7.f, 12.f, - 4.f, -0.5f, 8.f, 8.f - }); - - set_values(input2, { - 0.5f, 2.5f, 0.5f, 2.5f, - 5.f, 7.f, 2.f, 4.f, - 15.f, 17.f, 8.f, 10.f, - -2.f, 6.5f, -0.5f, -2.5f }); - - set_values(mask, { - 0.f, 0.f, 0.f, 0.f, - 0.1f, 0.3f, 0.5f, 0.7f, - -0.f, -0.1f, -0.f, -0.5f, - -0.7f, -0.f, -1.5f, -0.f }); - - network network(engine, topology); - - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); - - auto output = outputs.at("select").get_memory(); - - float answers[16] = { 0.5f, 2.5f, 0.5f, 2.5f, - 2.f, 0.f, 6.f, 5.2f, - 15.f, 0.5f, 8.f, 12.f, - 4.f, 6.5f, 8.f, -2.5f }; - - auto output_ptr = output.pointer(); - - for (int i = 0; i < 16; i++) - { - EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); - } + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 8.f + }); + + set_values(input2, { + 0.5f, 2.5f, 0.5f, 2.5f, + 5.f, 7.f, 2.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + -2.f, 6.5f, -0.5f, -2.5f }); + + set_values(mask, { + 0.f, 0.f, 0.f, 0.f, + 0.1f, 0.3f, 0.5f, 0.7f, + -0.f, -0.1f, -0.f, -0.5f, + -0.7f, -0.f, -1.5f, -0.f }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); + + auto output = outputs.at("select").get_memory(); + + float answers[16] = { 0.5f, 2.5f, 0.5f, 2.5f, + 2.f, 0.f, 6.f, 5.2f, + 15.f, 0.5f, 8.f, 12.f, + 4.f, 6.5f, 8.f, -2.5f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } } TEST(select_gpu_f32, select_basic_error_input_sizes) { - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 3, 4, 5, 6 } }); - auto mask = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 3, 4, 5, 6 } }); + auto mask = 
memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); - EXPECT_ANY_THROW(network(engine, topology)); + EXPECT_ANY_THROW(network(engine, topology)); } TEST(select_gpu_f32, select_basic_error_mask_sizes) { - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::f32, format::yxfb,{ 3, 4, 5, 6 } }); + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::f32, format::yxfb,{ 3, 4, 5, 6 } }); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); - EXPECT_ANY_THROW(network(engine, topology)); + EXPECT_ANY_THROW(network(engine, topology)); } TEST(select_gpu_f32, select_basic_error_input_types) { - engine engine; - - auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::i8, format::yxfb,{ 2, 2, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); - - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); - EXPECT_ANY_THROW(network(engine, topology)); + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::i8, format::yxfb,{ 2, 2, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); + EXPECT_ANY_THROW(network(engine, topology)); } TEST(select_gpu_f32, select_basic_error_input_formats) { - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, 
format::yxfb,{ 2, 2, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } }); + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } }); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); - EXPECT_ANY_THROW(network(engine, topology)); + EXPECT_ANY_THROW(network(engine, topology)); } TEST(select_gpu_f32, select_basic_byxf) { - engine engine; - - auto input = memory::allocate(engine, { data_types::f32, format::byxf,{ 2, 2, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::byxf,{ 2, 2, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::f32, format::byxf,{ 2, 2, 2, 2 } }); - - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); - - set_values(input, { - 1.f, 0.f, 5.f, 1.5f, - 2.f, 0.f, 6.f, 5.2f, - 3.f, 0.5f, 7.f, 12.f, - 4.f, -0.5f, 8.f, 8.f - }); - - set_values(input2, { - 0.5f, 2.5f, 0.5f, 2.5f, - 5.f, 7.f, 2.f, 4.f, - 15.f, 17.f, 8.f, 10.f, - -2.f, 6.5f, -0.5f, -2.5f }); - - set_values(mask, { - 0.f, 0.f, 0.f, 0.f, - 1.f, 1.f, 1.f, 1.f, - 0.f, 1.f, 0.f, 1.f, - 1.f, 0.f, 1.f, 0.f }); - - network network(engine, topology); - - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); - - auto output = outputs.at("select").get_memory(); - - float answers[16] = { 0.5f, 2.5f, 0.5f, 2.5f, - 2.f, 0.f, 6.f, 5.2f, - 15.f, 0.5f, 8.f, 12.f, - 4.f, 6.5f, 8.f, -2.5f }; - - auto output_ptr = output.pointer(); - - for (int i = 0; i < 16; i++) - { - EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); - } + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::byxf,{ 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::byxf,{ 2, 2, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::f32, format::byxf,{ 2, 2, 2, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 8.f + }); + + set_values(input2, { + 0.5f, 2.5f, 0.5f, 2.5f, + 5.f, 7.f, 2.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + -2.f, 6.5f, -0.5f, -2.5f }); + + set_values(mask, { + 0.f, 0.f, 0.f, 0.f, + 1.f, 1.f, 1.f, 1.f, + 0.f, 1.f, 0.f, 1.f, + 1.f, 0.f, 1.f, 0.f }); + + network network(engine, topology); + + network.set_input_data("input", input); + 
network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); + + auto output = outputs.at("select").get_memory(); + + float answers[16] = { 0.5f, 2.5f, 0.5f, 2.5f, + 2.f, 0.f, 6.f, 5.2f, + 15.f, 0.5f, 8.f, 12.f, + 4.f, 6.5f, 8.f, -2.5f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } } TEST(select_gpu_f32, select_basic_mask_f16) { - engine engine; - - auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::f16, format::yxfb,{ 2, 2, 2, 2 } }); - - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); - - set_values(input, { - 1.f, 0.f, 5.f, 1.5f, - 2.f, 0.f, 6.f, 5.2f, - 3.f, 0.5f, 7.f, 12.f, - 4.f, -0.5f, 8.f, 8.f - }); - - set_values(input2, { - 0.5f, 2.5f, 0.5f, 2.5f, - 5.f, 7.f, 2.f, 4.f, - 15.f, 17.f, 8.f, 10.f, - -2.f, 6.5f, -0.5f, -2.5f }); - - set_values(mask, { - 0, 0, 0, 0, - 1, 1, 1, 1, - 0, 1, 0, 1, - 1, 0, 1, 0 }); - - network network(engine, topology); - - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); - - auto output = outputs.at("select").get_memory(); - - float answers[16] = { 0.5f, 2.5f, 0.5f, 2.5f, - 2.f, 0.f, 6.f, 5.2f, - 15.f, 0.5f, 8.f, 12.f, - 4.f, 6.5f, 8.f, -2.5f }; - - auto output_ptr = output.pointer(); - - for (int i = 0; i < 16; i++) - { - EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); - } + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::f16, format::yxfb,{ 2, 2, 2, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 8.f + }); + + set_values(input2, { + 0.5f, 2.5f, 0.5f, 2.5f, + 5.f, 7.f, 2.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + -2.f, 6.5f, -0.5f, -2.5f }); + + set_values(mask, { + 0, 0, 0, 0, + 1, 1, 1, 1, + 0, 1, 0, 1, + 1, 0, 1, 0 }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); + + auto output = outputs.at("select").get_memory(); + + float answers[16] = { 0.5f, 2.5f, 0.5f, 2.5f, + 2.f, 0.f, 6.f, 5.2f, + 15.f, 0.5f, 8.f, 12.f, + 4.f, 6.5f, 8.f, -2.5f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } } TEST(select_gpu_f32, select_basic_mask_i8) { - engine engine; - - auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 
2, 2, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::i8, format::yxfb,{ 2, 2, 2, 2 } }); - - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); - - set_values(input, { - 1.f, 0.f, 5.f, 1.5f, - 2.f, 0.f, 6.f, 5.2f, - 3.f, 0.5f, 7.f, 12.f, - 4.f, -0.5f, 8.f, 8.f - }); - - set_values(input2, { - 0.5f, 2.5f, 0.5f, 2.5f, - 5.f, 7.f, 2.f, 4.f, - 15.f, 17.f, 8.f, 10.f, - -2.f, 6.5f, -0.5f, -2.5f }); - - set_values(mask, { - 0, 0, 0, 0, - 1, 1, 1, 1, - 0, 1, 0, 1, - 1, 0, 1, 0 }); - - network network(engine, topology); - - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); - - auto output = outputs.at("select").get_memory(); - - float answers[16] = { 0.5f, 2.5f, 0.5f, 2.5f, - 2.f, 0.f, 6.f, 5.2f, - 15.f, 0.5f, 8.f, 12.f, - 4.f, 6.5f, 8.f, -2.5f }; - - auto output_ptr = output.pointer(); - - for (int i = 0; i < 16; i++) - { - EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); - } + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::i8, format::yxfb,{ 2, 2, 2, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 8.f + }); + + set_values(input2, { + 0.5f, 2.5f, 0.5f, 2.5f, + 5.f, 7.f, 2.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + -2.f, 6.5f, -0.5f, -2.5f }); + + set_values(mask, { + 0, 0, 0, 0, + 1, 1, 1, 1, + 0, 1, 0, 1, + 1, 0, 1, 0 }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); + + auto output = outputs.at("select").get_memory(); + + float answers[16] = { 0.5f, 2.5f, 0.5f, 2.5f, + 2.f, 0.f, 6.f, 5.2f, + 15.f, 0.5f, 8.f, 12.f, + 4.f, 6.5f, 8.f, -2.5f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } } TEST(select_gpu_f32, select_basic_mask_u8) { - engine engine; - - auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::u8, format::yxfb,{ 2, 2, 2, 2 } }); - - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); - - set_values(input, { - 1.f, 0.f, 5.f, 1.5f, - 2.f, 0.f, 6.f, 5.2f, - 3.f, 0.5f, 7.f, 12.f, - 4.f, -0.5f, 8.f, 8.f - }); - - set_values(input2, { - 0.5f, 2.5f, 0.5f, 2.5f, - 5.f, 7.f, 2.f, 4.f, - 15.f, 17.f, 8.f, 10.f, - -2.f, 6.5f, -0.5f, -2.5f }); - - set_values(mask, { - 0, 0, 0, 0, - 
1, 1, 1, 1, - 0, 1, 0, 1, - 1, 0, 1, 0 }); - - network network(engine, topology); - - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); - - auto output = outputs.at("select").get_memory(); - - float answers[16] = { 0.5f, 2.5f, 0.5f, 2.5f, - 2.f, 0.f, 6.f, 5.2f, - 15.f, 0.5f, 8.f, 12.f, - 4.f, 6.5f, 8.f, -2.5f }; - - auto output_ptr = output.pointer(); - - for (int i = 0; i < 16; i++) - { - EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); - } + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::u8, format::yxfb,{ 2, 2, 2, 2 } }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); + + set_values(input, { + 1.f, 0.f, 5.f, 1.5f, + 2.f, 0.f, 6.f, 5.2f, + 3.f, 0.5f, 7.f, 12.f, + 4.f, -0.5f, 8.f, 8.f + }); + + set_values(input2, { + 0.5f, 2.5f, 0.5f, 2.5f, + 5.f, 7.f, 2.f, 4.f, + 15.f, 17.f, 8.f, 10.f, + -2.f, 6.5f, -0.5f, -2.5f }); + + set_values(mask, { + 0, 0, 0, 0, + 128, 210, 150, 177, + 0, 211, 0, 255, + 199, 0, 160, 0 }); + + network network(engine, topology); + + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); + + auto output = outputs.at("select").get_memory(); + + float answers[16] = { 0.5f, 2.5f, 0.5f, 2.5f, + 2.f, 0.f, 6.f, 5.2f, + 15.f, 0.5f, 8.f, 12.f, + 4.f, 6.5f, 8.f, -2.5f }; + + auto output_ptr = output.pointer(); + + for (int i = 0; i < 16; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } } TEST(select_gpu_f32, select_basic_1x1x2x2) { - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 2 } }); + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 2 } }); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); - set_values(input, { - 1.f, 0.f, 2.f, 0.f - }); + set_values(input, { + 1.f, 0.f, 2.f, 0.f + }); - set_values(input2, { - 0.5f, 2.5f, 5.f, 7.f - }); + set_values(input2, { + 0.5f, 2.5f, 5.f, 7.f + }); - set_values(mask, { - 0.f, 0.f, 1.f, 1.f - }); + set_values(mask, { + 0.f, 0.f, 1.f, 1.f + }); - network network(engine, topology); + network network(engine, 
topology); - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); - auto output = outputs.at("select").get_memory(); + auto output = outputs.at("select").get_memory(); - float answers[4] = { - 0.5f, 2.5f, 2.f, 0.f - }; + float answers[4] = { + 0.5f, 2.5f, 2.f, 0.f + }; - auto output_ptr = output.pointer(); + auto output_ptr = output.pointer(); - for (int i = 0; i < 4; i++) - { - EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); - } + for (int i = 0; i < 4; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } } TEST(select_gpu_f32, select_basic_bfyx_1x1x2x2) { - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); - set_values(input, { - 1.f, 0.f, - 2.f, 0.f - }); + set_values(input, { + 1.f, 0.f, + 2.f, 0.f + }); - set_values(input2, { - 0.5f, 2.5f, - 5.f, 7.f - }); + set_values(input2, { + 0.5f, 2.5f, + 5.f, 7.f + }); - set_values(mask, { - 0.f, 0.f, - 1.f, 1.f - }); + set_values(mask, { + 0.f, 0.f, + 1.f, 1.f + }); - network network(engine, topology); + network network(engine, topology); - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); - auto output = outputs.at("select").get_memory(); + auto output = outputs.at("select").get_memory(); - float answers[4] = { - 0.5f, 2.5f, - 2.f, 0.f - }; + float answers[4] = { + 0.5f, 2.5f, + 2.f, 0.f + }; - auto output_ptr = output.pointer(); + auto output_ptr = output.pointer(); - for (int i = 0; i < 4; i++) - { - EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); - } + for (int i = 0; i < 4; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } } TEST(select_gpu_f32, select_basic_byxf_1x1x2x2) { - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 1, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 1, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 1, 2, 2 } }); + auto 
input = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 1, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 1, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 1, 2, 2 } }); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); - set_values(input, { - 1.f, 0.f, - 2.f, 0.f - }); + set_values(input, { + 1.f, 0.f, + 2.f, 0.f + }); - set_values(input2, { - 0.5f, 2.5f, - 5.f, 7.f - }); + set_values(input2, { + 0.5f, 2.5f, + 5.f, 7.f + }); - set_values(mask, { - 0.f, 0.f, - 1.f, 1.f - }); + set_values(mask, { + 0.f, 0.f, + 1.f, 1.f + }); - network network(engine, topology); + network network(engine, topology); - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); - auto output = outputs.at("select").get_memory(); + auto output = outputs.at("select").get_memory(); - float answers[4] = { - 0.5f, 2.5f, - 2.f, 0.f - }; + float answers[4] = { + 0.5f, 2.5f, + 2.f, 0.f + }; - auto output_ptr = output.pointer(); + auto output_ptr = output.pointer(); - for (int i = 0; i < 4; i++) - { - EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); - } + for (int i = 0; i < 4; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } } // select_gpu_f16 TEST(select_gpu_f16, select_basic_1x1x2x2) { - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); + auto input = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); - set_values(input, { - 1, 0, - 2, 0 - }); + set_values(input, { + 1, 0, + 2, 0 + }); - set_values(input2, { - 0, 2, - 5, 7 - }); + set_values(input2, { + 0, 2, + 5, 7 + }); - set_values(mask, { - 0, 0, - 1, 1 - }); + set_values(mask, { + 0, 0, + 1, 1 + }); - network network(engine, topology); + network network(engine, topology); - network.set_input_data("input", input); - 
network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); - auto output = outputs.at("select").get_memory(); + auto output = outputs.at("select").get_memory(); - uint16_t answers[4] = { - 0, 2, - 2, 0 - }; + uint16_t answers[4] = { + 0, 2, + 2, 0 + }; - auto output_ptr = output.pointer(); + auto output_ptr = output.pointer(); - for (int i = 0; i < 4; i++) - { - EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); - } + for (int i = 0; i < 4; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } } TEST(select_gpu_f16, select_basic_mask_f32_1x1x2x2) { - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 2 } }); + auto input = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 2 } }); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); - set_values(input, { - 1, 0, - 2, 0 - }); + set_values(input, { + 1, 0, + 2, 0 + }); - set_values(input2, { - 0, 2, - 5, 7 - }); + set_values(input2, { + 0, 2, + 5, 7 + }); - set_values(mask, { - 0.f, 0.f, - 1.5f, 0.4f - }); + set_values(mask, { + 0.f, 0.f, + 1.5f, 0.4f + }); - network network(engine, topology); + network network(engine, topology); - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); - auto output = outputs.at("select").get_memory(); + auto output = outputs.at("select").get_memory(); - uint16_t answers[4] = { - 0, 2, - 2, 0 - }; + uint16_t answers[4] = { + 0, 2, + 2, 0 + }; - auto output_ptr = output.pointer(); + auto output_ptr = output.pointer(); - for (int i = 0; i < 4; i++) - { - EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); - } + for (int i = 0; i < 4; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } } TEST(select_gpu_f16, select_basic_mask_i8_1x1x2x2) { - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); + auto input = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); + auto input2 = 
memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); - set_values(input, { - 1, 0, - 2, 0 - }); + set_values(input, { + 1, 0, + 2, 0 + }); - set_values(input2, { - 0, 2, - 5, 7 - }); + set_values(input2, { + 0, 2, + 5, 7 + }); - set_values(mask, { - 0, 0, - 1, 1 - }); + set_values(mask, { + 0, 0, + 1, 1 + }); - network network(engine, topology); + network network(engine, topology); - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); - auto output = outputs.at("select").get_memory(); + auto output = outputs.at("select").get_memory(); - uint16_t answers[4] = { - 0, 2, - 2, 0 - }; + uint16_t answers[4] = { + 0, 2, + 2, 0 + }; - auto output_ptr = output.pointer(); + auto output_ptr = output.pointer(); - for (int i = 0; i < 4; i++) - { - EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); - } + for (int i = 0; i < 4; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } } TEST(select_gpu_f16, select_basic_mask_u8_1x1x2x2) { - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); + auto input = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); - set_values(input, { - 1, 0, - 2, 0 - }); + set_values(input, { + 1, 0, + 2, 0 + }); - set_values(input2, { - 0, 2, - 5, 7 - }); + set_values(input2, { + 0, 2, + 5, 7 + }); - set_values(mask, { - 0, 0, - 1, 1 - }); + set_values(mask, { + 0, 0, + 128, 255 + }); - network network(engine, topology); + network network(engine, topology); - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); + network.set_input_data("input", input); + 
network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); - auto output = outputs.at("select").get_memory(); + auto output = outputs.at("select").get_memory(); - uint16_t answers[4] = { - 0, 2, - 2, 0 - }; + uint16_t answers[4] = { + 0, 2, + 2, 0 + }; - auto output_ptr = output.pointer(); + auto output_ptr = output.pointer(); - for (int i = 0; i < 4; i++) - { - EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); - } + for (int i = 0; i < 4; i++) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } } // select_gpu_i8 TEST(select_gpu_i8, select_basic_1x1x2x2) { - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); + auto input = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); - set_values(input, { - 1, 0, - 2, 0 - }); + set_values(input, { + 1, 0, + 2, 0 + }); - set_values(input2, { - 0, 2, - 5, 7 - }); + set_values(input2, { + 0, 2, + 5, 7 + }); - set_values(mask, { - 0, 0, - 3, 5 - }); + set_values(mask, { + 0, 0, + 3, 5 + }); - network network(engine, topology); + network network(engine, topology); - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); - auto output = outputs.at("select").get_memory(); + auto output = outputs.at("select").get_memory(); - int answers[4] = { - 0, 2, - 2, 0 - }; + int answers[4] = { + 0, 2, + 2, 0 + }; - auto output_ptr = output.pointer(); + auto output_ptr = output.pointer(); - for (int i = 0; i < 4; i++) - { - EXPECT_EQ(answers[i], output_ptr[i]); - } + for (int i = 0; i < 4; i++) + { + EXPECT_EQ(answers[i], output_ptr[i]); + } } TEST(select_gpu_i8, select_basic_mask_f32_1x1x2x2) { - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 2 } }); + auto input = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 2 } }); - topology topology; - topology.add(input_layout("input", 
input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); - set_values(input, { - 1, 0, - 2, 0 - }); + set_values(input, { + 1, 0, + 2, 0 + }); - set_values(input2, { - 0, 2, - 5, 7 - }); + set_values(input2, { + 0, 2, + 5, 7 + }); - set_values(mask, { - 0.f, 0.f, - 1.5f, 0.4f - }); + set_values(mask, { + 0.f, 0.f, + 1.5f, 0.4f + }); - network network(engine, topology); + network network(engine, topology); - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); - auto output = outputs.at("select").get_memory(); + auto output = outputs.at("select").get_memory(); - int answers[4] = { - 0, 2, - 2, 0 - }; + int answers[4] = { + 0, 2, + 2, 0 + }; - auto output_ptr = output.pointer(); + auto output_ptr = output.pointer(); - for (int i = 0; i < 4; i++) - { - EXPECT_EQ(answers[i], output_ptr[i]); - } + for (int i = 0; i < 4; i++) + { + EXPECT_EQ(answers[i], output_ptr[i]); + } } TEST(select_gpu_i8, select_basic_mask_f16_1x1x2x2) { - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); + auto input = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); - set_values(input, { - 1, 0, - 2, 0 - }); + set_values(input, { + 1, 0, + 2, 0 + }); - set_values(input2, { - 0, 2, - 5, 7 - }); + set_values(input2, { + 0, 2, + 5, 7 + }); - set_values(mask, { - 0, 0, - 3, 5 - }); + set_values(mask, { + 0, 0, + 3, 5 + }); - network network(engine, topology); + network network(engine, topology); - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); - auto output = outputs.at("select").get_memory(); + auto output = outputs.at("select").get_memory(); - int answers[4] = { - 0, 2, - 2, 0 - }; + 
int answers[4] = { + 0, 2, + 2, 0 + }; - auto output_ptr = output.pointer(); + auto output_ptr = output.pointer(); - for (int i = 0; i < 4; i++) - { - EXPECT_EQ(answers[i], output_ptr[i]); - } + for (int i = 0; i < 4; i++) + { + EXPECT_EQ(answers[i], output_ptr[i]); + } } TEST(select_gpu_i8, select_basic_mask_u8_1x1x2x2) { - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); + auto input = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); - set_values(input, { - 1, 0, - 2, 0 - }); + set_values(input, { + 1, 0, + 2, 0 + }); - set_values(input2, { - 0, 2, - 5, 7 - }); + set_values(input2, { + 0, 2, + 5, 7 + }); - set_values(mask, { - 0, 0, - 3, 5 - }); + set_values(mask, { + 0, 0, + 128, 255 + }); - network network(engine, topology); + network network(engine, topology); - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); - auto output = outputs.at("select").get_memory(); + auto output = outputs.at("select").get_memory(); - int answers[4] = { - 0, 2, - 2, 0 - }; + int answers[4] = { + 0, 2, + 2, 0 + }; - auto output_ptr = output.pointer(); + auto output_ptr = output.pointer(); - for (int i = 0; i < 4; i++) - { - EXPECT_EQ(answers[i], output_ptr[i]); - } + for (int i = 0; i < 4; i++) + { + EXPECT_EQ(answers[i], output_ptr[i]); + } } // select_gpu_u8 TEST(select_gpu_u8, select_basic_1x1x2x2) { - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); + auto input = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); + topology topology; + topology.add(input_layout("input", input.get_layout())); + 
topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); - set_values(input, { - 1, 0, - 2, 0 - }); + set_values(input, { + 128, 0, + 255, 0 + }); - set_values(input2, { - 0, 2, - 5, 7 - }); + set_values(input2, { + 0, 255, + 205, 128 + }); - set_values(mask, { - 0, 0, - 1, 1 - }); + set_values(mask, { + 0, 0, + 128, 255 + }); - network network(engine, topology); + network network(engine, topology); - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); - auto output = outputs.at("select").get_memory(); + auto output = outputs.at("select").get_memory(); - int answers[4] = { - 0, 2, - 2, 0 - }; + unsigned char answers[4] = { + 0, 255, + 255, 0 + }; - auto output_ptr = output.pointer(); + auto output_ptr = output.pointer(); - for (int i = 0; i < 4; i++) - { - EXPECT_EQ(answers[i], output_ptr[i]); - } + for (int i = 0; i < 4; i++) + { + EXPECT_EQ(answers[i], output_ptr[i]); + } } TEST(select_gpu_u8, select_basic_mask_f32_1x1x2x2) { - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 2 } }); + auto input = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 2, 2 } }); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); - set_values(input, { - 1, 0, - 2, 0 - }); + set_values(input, { + 128, 0, + 255, 0 + }); - set_values(input2, { - 0, 2, - 5, 7 - }); + set_values(input2, { + 0, 255, + 205, 128 + }); - set_values(mask, { - 0.f, 0.f, - 1.5f, 0.4f - }); + set_values(mask, { + 0.f, 0.f, + 1.5f, 0.4f + }); - network network(engine, topology); + network network(engine, topology); - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); - auto output = outputs.at("select").get_memory(); + auto output = outputs.at("select").get_memory(); - int answers[4] = { - 0, 2, - 2, 0 - }; + int answers[4] = { + 0, 255, + 255, 0 + }; - auto output_ptr = output.pointer(); + auto output_ptr = output.pointer(); - for (int i = 0; i < 4; i++) - { - EXPECT_EQ(answers[i], output_ptr[i]); - } + for (int i = 0; i < 4; i++) + { + 
EXPECT_EQ(answers[i], output_ptr[i]); + } } TEST(select_gpu_u8, select_basic_mask_f16_1x1x2x2) { - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); + auto input = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); - set_values(input, { - 1, 0, - 2, 0 - }); + set_values(input, { + 128, 0, + 255, 0 + }); - set_values(input2, { - 0, 2, - 5, 7 - }); + set_values(input2, { + 0, 255, + 205, 128 + }); - set_values(mask, { - 0, 0, - 1, 1 - }); + set_values(mask, { + 0, 0, + 1, 1 + }); - network network(engine, topology); + network network(engine, topology); - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); - auto output = outputs.at("select").get_memory(); + auto output = outputs.at("select").get_memory(); - int answers[4] = { - 0, 2, - 2, 0 - }; + unsigned char answers[4] = { + 0, 255, + 255, 0 + }; - auto output_ptr = output.pointer(); + auto output_ptr = output.pointer(); - for (int i = 0; i < 4; i++) - { - EXPECT_EQ(answers[i], output_ptr[i]); - } + for (int i = 0; i < 4; i++) + { + EXPECT_EQ(answers[i], output_ptr[i]); + } } TEST(select_gpu_u8, select_basic_mask_i8_1x1x2x2) { - engine engine; + const auto& engine = get_test_engine(); - auto input = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); - auto input2 = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); - auto mask = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); + auto input = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); + auto input2 = memory::allocate(engine, { data_types::u8, format::yxfb,{ 1, 1, 2, 2 } }); + auto mask = memory::allocate(engine, { data_types::i8, format::yxfb,{ 1, 1, 2, 2 } }); - topology topology; - topology.add(input_layout("input", input.get_layout())); - topology.add(input_layout("input2", input2.get_layout())); - topology.add(input_layout("mask", mask.get_layout())); - topology.add(cldnn::select("select", "input", "input2", "mask")); + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(input_layout("mask", mask.get_layout())); + topology.add(cldnn::select("select", "input", "input2", "mask")); - set_values(input, { - 1, 0, - 2, 0 - }); 
+ set_values(input, { + 128, 0, + 255, 0 + }); - set_values(input2, { - 0, 2, - 5, 7 - }); + set_values(input2, { + 0, 255, + 205, 128 + }); - set_values(mask, { - 0, 0, - 1, 1 - }); + set_values(mask, { + 0, 0, + 1, 1 + }); - network network(engine, topology); + network network(engine, topology); - network.set_input_data("input", input); - network.set_input_data("input2", input2); - network.set_input_data("mask", mask); - auto outputs = network.execute(); + network.set_input_data("input", input); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + auto outputs = network.execute(); - auto output = outputs.at("select").get_memory(); + auto output = outputs.at("select").get_memory(); - int answers[4] = { - 0, 2, - 2, 0 - }; + unsigned char answers[4] = { + 0, 255, + 255, 0 + }; - auto output_ptr = output.pointer(); + auto output_ptr = output.pointer(); - for (int i = 0; i < 4; i++) - { - EXPECT_EQ(answers[i], output_ptr[i]); - } + for (int i = 0; i < 4; i++) + { + EXPECT_EQ(answers[i], output_ptr[i]); + } } diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/shuffle_channels_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/shuffle_channels_test.cpp new file mode 100644 index 0000000..630b3b8 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/shuffle_channels_test.cpp @@ -0,0 +1,386 @@ +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
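+// Note on the expected outputs below: shuffle_channels views the `axis` +// dimension of size C as a (group, C/group) block and transposes it, so with +// K = C/group the value at input channel a*K + b lands at output channel +// b*group + a, which is the reordering the expected_results vectors encode.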
+ + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#include + +#include +#include +#include +#include +#include + +#include +#include + +using namespace cldnn; +using namespace ::tests; + +TEST(shuffle_channels_fp32_gpu, d1_15_2_2_ax1_g5) { + engine engine; + + auto input0 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 15, 2, 2 } }); + int32_t axis = 1; + int32_t group = 5; + + set_values(input0, { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, + 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, + 30.0f, 31.0f, 32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, + 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, 48.0f, 49.0f, + 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f + }); + + topology topology; + topology.add(input_layout("Input0", input0.get_layout())); + topology.add( + shuffle_channels("shuffle_channels", "Input0", group, axis) + ); + + network network(engine, topology); + + network.set_input_data("Input0", input0); + + auto outputs = network.execute(); + + auto output = outputs.at("shuffle_channels").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 0.f, 1.f, 2.f, 3.f, 12.f, 13.f, 14.f, 15.f, 24.f, 25.f, 26.f, 27.f, 36.f, 37.f, 38.f, 39.f, 48.f, 49.f, 50.f, 51.f, + 4.f, 5.f, 6.f, 7.f, 16.f, 17.f, 18.f, 19.f, 28.f, 29.f, 30.f, 31.f, 40.f, 41.f, 42.f, 43.f, 52.f, 53.f, 54.f, 55.f, + 8.f, 9.f, 10.f, 11.f, 20.f, 21.f, 22.f, 23.f, 32.f, 33.f, 34.f, 35.f, 44.f, 45.f, 46.f, 47.f, 56.f, 57.f, 58.f, 59.f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} + + +TEST(shuffle_channels_fp32_gpu, d1_15_2_2_axm3_g5) { + engine engine; + + auto input0 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 15, 2, 2 } }); + int32_t axis = -3; + int32_t group = 5; + + set_values(input0, { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, + 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, + 30.0f, 31.0f, 32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, + 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, 48.0f, 49.0f, + 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f + }); + + topology topology; + topology.add(input_layout("Input0", input0.get_layout())); + topology.add( + shuffle_channels("shuffle_channels", "Input0", group, axis) + ); + + network network(engine, topology); + + network.set_input_data("Input0", input0); + + auto outputs = network.execute(); + + auto output = outputs.at("shuffle_channels").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 0.f, 1.f, 2.f, 3.f, 12.f, 13.f, 14.f, 15.f, 24.f, 25.f, 26.f, 27.f, 36.f, 37.f, 38.f, 39.f, 48.f, 49.f, 50.f, 51.f, + 4.f, 5.f, 6.f, 7.f, 16.f, 17.f, 18.f, 19.f, 28.f, 29.f, 30.f, 31.f, 40.f, 41.f, 42.f, 43.f, 52.f, 53.f, 54.f, 55.f, + 8.f, 9.f, 10.f, 11.f, 20.f, 21.f, 22.f, 23.f, 32.f, 33.f, 34.f, 35.f, 44.f, 45.f, 46.f, 47.f, 56.f, 57.f, 58.f, 59.f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} + +TEST(shuffle_channels_fp32_gpu, d15_2_2_ax0_g5) { + engine engine; + + auto input0 = memory::allocate(engine, { data_types::f32, format::bfyx, { 15, 2, 1, 2 } }); + int32_t 
axis = 0; + int32_t group = 5; + + set_values(input0, { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, + 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, + 30.0f, 31.0f, 32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, + 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, 48.0f, 49.0f, + 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f + }); + + topology topology; + topology.add(input_layout("Input0", input0.get_layout())); + topology.add( + shuffle_channels("shuffle_channels", "Input0", group, axis) + ); + + network network(engine, topology); + + network.set_input_data("Input0", input0); + + auto outputs = network.execute(); + + auto output = outputs.at("shuffle_channels").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 0.f, 1.f, 2.f, 3.f, 12.f, 13.f, 14.f, 15.f, 24.f, 25.f, 26.f, 27.f, 36.f, 37.f, 38.f, 39.f, 48.f, 49.f, 50.f, 51.f, + 4.f, 5.f, 6.f, 7.f, 16.f, 17.f, 18.f, 19.f, 28.f, 29.f, 30.f, 31.f, 40.f, 41.f, 42.f, 43.f, 52.f, 53.f, 54.f, 55.f, + 8.f, 9.f, 10.f, 11.f, 20.f, 21.f, 22.f, 23.f, 32.f, 33.f, 34.f, 35.f, 44.f, 45.f, 46.f, 47.f, 56.f, 57.f, 58.f, 59.f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} + +TEST(shuffle_channels_fp32_gpu, d15_2_2_axm4_g5) { + engine engine; + + auto input0 = memory::allocate(engine, { data_types::f32, format::bfyx, { 15, 2, 1, 2 } }); + int32_t axis = -4; + int32_t group = 5; + + set_values(input0, { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, + 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, + 30.0f, 31.0f, 32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, + 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, 48.0f, 49.0f, + 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f + }); + + topology topology; + topology.add(input_layout("Input0", input0.get_layout())); + topology.add( + shuffle_channels("shuffle_channels", "Input0", group, axis) + ); + + network network(engine, topology); + + network.set_input_data("Input0", input0); + + auto outputs = network.execute(); + + auto output = outputs.at("shuffle_channels").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 0.f, 1.f, 2.f, 3.f, 12.f, 13.f, 14.f, 15.f, 24.f, 25.f, 26.f, 27.f, 36.f, 37.f, 38.f, 39.f, 48.f, 49.f, 50.f, 51.f, + 4.f, 5.f, 6.f, 7.f, 16.f, 17.f, 18.f, 19.f, 28.f, 29.f, 30.f, 31.f, 40.f, 41.f, 42.f, 43.f, 52.f, 53.f, 54.f, 55.f, + 8.f, 9.f, 10.f, 11.f, 20.f, 21.f, 22.f, 23.f, 32.f, 33.f, 34.f, 35.f, 44.f, 45.f, 46.f, 47.f, 56.f, 57.f, 58.f, 59.f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} + +TEST(shuffle_channels_fp32_gpu, d2_2_6_axm2_g3) { + engine engine; + + auto input0 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 1, 6 } }); + int32_t axis = -2; + int32_t group = 3; + + set_values(input0, { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, + 20.0f, 21.0f, 22.0f, 23.0f + }); + + topology topology; + topology.add(input_layout("Input0", input0.get_layout())); + topology.add( + shuffle_channels("shuffle_channels", "Input0", group, axis) + ); + + network network(engine, 
topology); + + network.set_input_data("Input0", input0); + + auto outputs = network.execute(); + + auto output = outputs.at("shuffle_channels").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 0.f, 2.f, 4.f, 1.f, 3.f, 5.f, 6.f, 8.f, 10.f, 7.f, 9.f, 11.f, + 12.f, 14.f, 16.f, 13.f, 15.f, 17.f, 18.f, 20.f, 22.f, 19.f, 21.f, 23.f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} + +TEST(shuffle_channels_fp32_gpu, d2_6_2_axm3_g3) { + engine engine; + + auto input0 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 6, 1, 2 } }); + int32_t axis = -3; + int32_t group = 3; + + set_values(input0, { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, + 20.0f, 21.0f, 22.0f, 23.0f + }); + + topology topology; + topology.add(input_layout("Input0", input0.get_layout())); + topology.add( + shuffle_channels("shuffle_channels", "Input0", group, axis) + ); + + network network(engine, topology); + + network.set_input_data("Input0", input0); + + auto outputs = network.execute(); + + auto output = outputs.at("shuffle_channels").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 0.f, 1.f, 4.f, 5.f, 8.f, 9.f, 2.f, 3.f, 6.f, 7.f, 10.f, 11.f, + 12.f, 13.f, 16.f, 17.f, 20.f, 21.f, 14.f, 15.f, 18.f, 19.f, 22.f, 23.f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} + +TEST(shuffle_channels_fp32_gpu, d2_2_6_axm2_g2) { + engine engine; + + auto input0 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 1, 6 } }); + int32_t axis = -2; + int32_t group = 2; + + set_values(input0, { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, + 20.0f, 21.0f, 22.0f, 23.0f + }); + + topology topology; + topology.add(input_layout("Input0", input0.get_layout())); + topology.add( + shuffle_channels("shuffle_channels", "Input0", group, axis) + ); + + network network(engine, topology); + + network.set_input_data("Input0", input0); + + auto outputs = network.execute(); + + auto output = outputs.at("shuffle_channels").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 0.f, 3.f, 1.f, 4.f, 2.f, 5.f, 6.f, 9.f, 7.f, 10.f, 8.f, 11.f, + 12.f, 15.f, 13.f, 16.f, 14.f, 17.f, 18.f, 21.f, 19.f, 22.f, 20.f, 23.f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} + +TEST(shuffle_channels_fp32_gpu, d2_6_2_axm3_g2) { + engine engine; + + auto input0 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 6, 1, 2 } }); + int32_t axis = -3; + int32_t group = 2; + + set_values(input0, { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, + 20.0f, 21.0f, 22.0f, 23.0f + }); + + topology topology; + topology.add(input_layout("Input0", input0.get_layout())); + topology.add( + shuffle_channels("shuffle_channels", "Input0", group, axis) + ); + + network network(engine, topology); + + network.set_input_data("Input0", input0); + + auto outputs = network.execute(); + + auto output = outputs.at("shuffle_channels").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 0.f, 1.f, 6.f, 7.f, 2.f, 3.f, 8.f, 9.f, 4.f, 5.f, 10.f, 11.f, 
+ 12.f, 13.f, 18.f, 19.f, 14.f, 15.f, 20.f, 21.f, 16.f, 17.f, 22.f, 23.f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} + +TEST(shuffle_channels_fp32_gpu, d6_axm0_g2) { + engine engine; + + auto input0 = memory::allocate(engine, { data_types::f32, format::bfyx, { 6, 1, 1, 1 } }); + int32_t axis = 0; + int32_t group = 2; + + set_values(input0, { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f + }); + + topology topology; + topology.add(input_layout("Input0", input0.get_layout())); + topology.add( + shuffle_channels("shuffle_channels", "Input0", group, axis) + ); + + network network(engine, topology); + + network.set_input_data("Input0", input0); + + auto outputs = network.execute(); + + auto output = outputs.at("shuffle_channels").get_memory(); + auto output_ptr = output.pointer(); + + std::vector expected_results = { + 0.f, 3.f, 1.f, 4.f, 2.f, 5.f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], output_ptr[i]); + } +} diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/softmax_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/softmax_gpu_test.cpp index 2a1802c..a685008 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/softmax_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/softmax_gpu_test.cpp @@ -41,12 +41,13 @@ public: float out_buffer[out_size]; float expected_buffer[out_size]; - cldnn::engine engine; + const cldnn::engine& engine; cldnn::memory input; + //neural::primitive output = memory::allocate({ memory::format::xb_f32, {output_b, {{output_x}}, 1}}); softmax_gpu_xb_f32_test_fixture() - :engine() + : engine(get_test_engine()) ,input(memory::allocate(engine, { data_types::f32, format::yxfb, { input_b, 1, input_x, 1}})) {} @@ -191,7 +192,7 @@ TEST(softmax_gpu_bfyx_f32, normalize_fyx) { // Input : 2x3x2x2 static const int32_t x_size = 2, y_size = 2, feature_num = 3, batch_num = 2, buf_size = x_size*y_size * batch_num * feature_num; - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ batch_num, feature_num, x_size , y_size } }); topology topology; @@ -264,7 +265,7 @@ TEST(softmax_gpu_bfyx_f32, normalize_y) { // Input : 2x3x2x2 static const int32_t x_size = 2, y_size = 2, feature_num = 3, batch_num = 2, buf_size = x_size*y_size * batch_num * feature_num; - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ batch_num, feature_num, x_size , y_size } }); topology topology; @@ -359,7 +360,7 @@ TEST(softmax_gpu_bfyx_f32, normalize_f) { // Input : 2x3x2x2 static const int32_t x_size = 2, y_size = 2, feature_num = 3, batch_num = 2, buf_size = x_size*y_size * batch_num * feature_num; - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ batch_num, feature_num, x_size , y_size } }); topology topology; @@ -447,7 +448,7 @@ TEST(softmax_gpu_yxfb_f32, normalize_f) { static const int32_t x_size = 1, y_size = 2, feature_num = 1, batch_num = 12, buf_size = x_size*y_size * batch_num * feature_num; - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ batch_num, feature_num, y_size , x_size } }); topology topology; diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/softmax_loss_grad_gpu_test.cpp 
b/inference-engine/thirdparty/clDNN/tests/test_cases/softmax_loss_grad_gpu_test.cpp index 6c5a5ef..302ca0b 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/softmax_loss_grad_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/softmax_loss_grad_gpu_test.cpp @@ -31,7 +31,7 @@ using namespace tests; TEST(softmax_loss_grad_f32_fw_gpu, basic1) { - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 4, 1 } }); auto labels = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 1, 1, 1 } }); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/split_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/split_gpu_test.cpp index 2df0018..921c382 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/split_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/split_gpu_test.cpp @@ -23,8 +23,12 @@ #include #include #include +#include #include "test_utils/test_utils.h" +#include +#include + using namespace cldnn; using namespace tests; @@ -55,6 +59,300 @@ void check_feature_map(cldnn::pointer output_ptr, std::vector &input_vec, } } +template +void split_test(int batch_num, int feature_num, int x_size, int y_size, std::vector split_offsets) +{ + const auto& engine = get_test_engine(); + cldnn::tensor reference_input_size = { batch_num, feature_num, x_size, y_size }; + + cldnn::memory input = memory::allocate(engine, { type_to_data_type::value, format::bfyx, reference_input_size }); + std::vector > input_ids_offsets; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + + // lambda expression to create the primitive id for the splits + auto create_split_id = [](size_t splitNum) { + std::stringstream ss; + ss << std::setw(5) << std::setfill('0') << splitNum; + + return ss.str(); + }; + + // Create the splits with the split ids for the topology + for (size_t splitNum = 0; splitNum < split_offsets.size(); splitNum++) + { + input_ids_offsets.push_back({ create_split_id(splitNum), split_offsets[splitNum]}); + } + + topology.add(split("split", "input", input_ids_offsets)); + + std::vector input_vec = generate_random_input(batch_num, feature_num, y_size, x_size, -10, 10); + set_values(input, input_vec); + + network network(engine, topology); + network.set_input_data("input", input); + + auto outputs = network.execute(); + + // The number of splits should match the expected number of splits + EXPECT_EQ(outputs.size(), size_t(split_offsets.size())); + + std::vector expected_sizes; + for (size_t splitNum = 0; splitNum < split_offsets.size(); splitNum++) // Calculate the expected sizes + { + cldnn::tensor size; + + if (splitNum < (split_offsets.size() - 1)) + { + size = split_offsets[splitNum + 1] - split_offsets[splitNum]; + } + else + { + size = reference_input_size - split_offsets[splitNum]; + } + + // For all the other dimensions, copy from the split_input + for (int dimension = 0; dimension < CLDNN_TENSOR_DIM_MAX; dimension++) + { + size.raw[dimension] + = (size.raw[dimension] == 0) ?
reference_input_size.raw[dimension] : size.raw[dimension]; + } + + expected_sizes.push_back(size); + } + + pointer input_ptr = input.pointer(); + + for (size_t splitNum = 0; splitNum < split_offsets.size(); splitNum++) + { + primitive_id split_id = "split:" + create_split_id(splitNum); + cldnn::memory output = outputs.at(split_id).get_memory(); + auto prim = output.get_layout(); + EXPECT_EQ(prim.size, expected_sizes[splitNum]); + auto output_ptr = output.pointer(); + + // Output tensor size + auto output_batch = prim.size.batch[0]; + auto output_feature = prim.size.feature[0]; + auto output_x = prim.size.spatial[0]; + auto output_y = prim.size.spatial[1]; + + // Input offsets, starting from which we will compare the output + auto input_batch_offset = split_offsets[splitNum].batch[0]; + auto input_feature_offset = split_offsets[splitNum].feature[0]; + auto input_y_offset = split_offsets[splitNum].spatial[1]; + auto input_x_offset = split_offsets[splitNum].spatial[0]; + + // iterator to iterate through input buffer + auto input_batch_itr = input_batch_offset; + auto input_feature_itr = input_feature_offset; + auto input_y_itr = input_y_offset; + auto input_x_itr = input_x_offset; + + for (auto b = 0; b < output_batch; ++b) { // B + + // reset the input feature iterator + input_feature_itr = input_feature_offset; + for (auto f = 0; f < output_feature; f++) { // F + + // reset the input y iterator + input_y_itr = input_y_offset; + for (auto y = 0; y < output_y; y++) { // Y + + // reset the input x iterator + input_x_itr = input_x_offset; + for (auto x = 0; x < output_x; x++) { // X + auto linear_id = input_x_itr + x_size * (input_y_itr + y_size * (input_feature_itr + feature_num * input_batch_itr)); // index in input + auto output_linear_id = x + output_x * (y + output_y * (f + output_feature * b)); // index in output + EXPECT_EQ(output_ptr[output_linear_id], input_vec[linear_id]); + input_x_itr++; // update the input x iterator + } + input_y_itr++; // update the input y iterator + } + input_feature_itr++; // update the input feature iterator + } + input_batch_itr++; // update the input batch iterator + } + } +} + +TEST(split_gpu, split_1d_uneven_2_splits) { + + // Input : 2x4x3x3 + // Output1 : 2x1x3x3 + // Output2 : 2x3x3x3 + // Split params: + // id: "out0", offsets: { 0, 0, 0, 0 } + // id: "out1", offsets: { 0, 1, 0, 0 } + + auto batch_num = 2; + auto feature_num = 4; + auto x_size = 3; + auto y_size = 3; + std::vector split_offsets = { + {0, 0, 0, 0}, + {0, 1, 0, 0} + }; + + split_test(batch_num, feature_num, x_size, y_size, split_offsets); +} + + +TEST(split_gpu, basic_split_concat_optimization) { + + const auto& engine = get_test_engine(); + + auto input = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1, 25, 1, 256 } }); + tests::set_random_values(input); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + std::vector> offsets; + std::vector ids; + for (int i = 0; i < 25; i++) + { + auto id = "crop_" + std::to_string(i); + ids.push_back("split:" + id); + offsets.push_back({ id, {0, i, 0, 0} }); + } + + topology.add(split("split", "input", offsets)); + topology.add(concatenation("concat", ids, concatenation::along_f)); + topology.add(reorder("output", "concat", format::bfyx, data_types::f32)); + + build_options opts; + opts.set_option(build_option::optimize_data(true)); + network network(engine, topology, opts); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + auto output = outputs.at("output").get_memory(); 
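+ // With optimize_data(true) the split/crop and concatenation pair is expected + // to be optimized away, so the output must be an element-for-element copy of + // the 1x25x1x256 input: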
+ auto output_ptr = output.pointer(); + auto input_ptr = input.pointer(); + + for (int i = 0; i < 25*256; ++i) + { + EXPECT_EQ(output_ptr[i], input_ptr[i]); + } +} + +TEST(split_gpu, split_1d_uneven_3_splits) { + + // Input : 2x8x3x3 + // Output1 : 2x1x3x3 + // Output2 : 2x3x3x3 + // Output3 : 2x4x3x3 + // Split params: + // id: "out0", offsets: { 0, 0, 0, 0 } + // id: "out1", offsets: { 0, 1, 0, 0 } + // id: "out2", offsets: { 0, 4, 0, 0 } + + auto batch_num = 2; + auto feature_num = 8; + auto x_size = 3; + auto y_size = 3; + std::vector split_offsets = { + {0, 0, 0, 0}, + {0, 1, 0, 0}, + {0, 4, 0, 0}, + }; + + split_test(batch_num, feature_num, x_size, y_size, split_offsets); +} + +TEST(split_gpu, split_2d_uneven_2_splits) { + + // Input : 2x8x10x3 + // Output1 : 2x1x4x3 + // Output2 : 2x7x6x3 + // Split params: + // id: "out0", offsets: { 0, 0, 0, 0 } + // id: "out1", offsets: { 0, 1, 4, 0 } + + auto batch_num = 2; + auto feature_num = 8; + auto x_size = 10; + auto y_size = 3; + std::vector split_offsets = { + {0, 0, 0, 0}, + {0, 1, 4, 0} + }; + + split_test(batch_num, feature_num, x_size, y_size, split_offsets); +} + +TEST(split_gpu, split_2d_uneven_3_split3) { + + // Input : 2x8x10x3 + // Output1 : 2x1x4x3 + // Output2 : 2x3x3x3 + // Output3 : 2x4x3x3 + // Split params: + // id: "out0", offsets: { 0, 0, 0, 0 } + // id: "out1", offsets: { 0, 1, 4, 0 } + // id: "out2", offsets: { 0, 4, 7, 0 } + + auto batch_num = 2; + auto feature_num = 8; + auto x_size = 10; + auto y_size = 3; + std::vector split_offsets = { + {0, 0, 0, 0}, + {0, 1, 4, 0}, + {0, 4, 7, 0}, + }; + + split_test(batch_num, feature_num, x_size, y_size, split_offsets); +} + +TEST(split_gpu, split_3d_uneven_2_splits) { + + // Input : 2x8x10x3 + // Output1 : 2x1x4x1 + // Output2 : 2x7x6x2 + // Split params: + // id: "out0", offsets: { 0, 0, 0, 0 } + // id: "out1", offsets: { 0, 1, 4, 1 } + + auto batch_num = 2; + auto feature_num = 8; + auto x_size = 10; + auto y_size = 3; + std::vector split_offsets = { + {0, 0, 0, 0}, + {0, 1, 4, 1} + }; + + split_test(batch_num, feature_num, x_size, y_size, split_offsets); +} + +TEST(split_gpu, split_3d_uneven_3_splits) { + + // Input : 2x8x10x3 + // Output1 : 2x1x4x1 + // Output2 : 2x6x4x1 + // Output3 : 2x1x2x1 + // Split params: + // id: "out0", offsets: { 0, 0, 0, 0 } + // id: "out1", offsets: { 0, 1, 4, 1 } + // id: "out2", offsets: { 0, 7, 8, 2 } + + auto batch_num = 2; + auto feature_num = 8; + auto x_size = 10; + auto y_size = 3; + std::vector split_offsets = { + {0, 0, 0, 0}, + {0, 1, 4, 1}, + {0, 7, 8, 2} + }; + + split_test(batch_num, feature_num, x_size, y_size, split_offsets); +} + TEST(split_gpu, basic_in2x3x2x2_split_feature_bfyx) { // Input : 6x3x4x3 // 3 x Outputs: 6x1x4x3 @@ -63,7 +361,7 @@ TEST(split_gpu, basic_in2x3x2x2_split_feature_bfyx) { // id: "out1", offsets: { 0, 1, 0, 0 } // id: "out2", offsets: { 0, 2, 0, 0 } - engine engine; + const auto& engine = get_test_engine(); auto batch_num = 6; auto feature_num = 3; @@ -110,7 +408,7 @@ TEST(split_gpu, basic_in2x3x2x2_split_scale_feature_bfyx) { // id: "out2", offsets: { 0, 2, 0, 0 } // Additional scale layer at the end - engine engine; + const auto& engine = get_test_engine(); auto batch_num = 6; auto feature_num = 3; @@ -143,7 +441,7 @@ TEST(split_gpu, basic_in2x3x2x2_split_scale_feature_bfyx) { set_values(scale_input1, scale_input_vec1); std::vector scale_input_vec2 = { 3.f }; set_values(scale_input2, scale_input_vec2); - + std::vector input_vec = generate_random_input(batch_num, feature_num, y_size, x_size, -10, 10);
set_values(input, input_vec); @@ -165,4 +463,4 @@ TEST(split_gpu, basic_in2x3x2x2_split_scale_feature_bfyx) { auto output_ptr = output.pointer(); check_feature_map(output_ptr, input_vec, batch_num, feature_num, y_size, x_size, i, i + 1); } -} \ No newline at end of file +} diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/strided_slice_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/strided_slice_gpu_test.cpp new file mode 100644 index 0000000..c673071 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/strided_slice_gpu_test.cpp @@ -0,0 +1,375 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#include +#include +#include "api/CPP/strided_slice.hpp" +#include +#include +#include +#include "test_utils/test_utils.h" +#include + + +using namespace cldnn; +using namespace tests; + + +TEST(strided_slice_gpu_f32, test_2x2x2x2) { + // Input (BFYX): 2x2x2x2 + // Begin (BFYX): 0x0x0x0 + // End (BFYX): 2x2x2x2 + // Stride (BFYX): 1x1x1x1 + // Output (BFYX): 2x2x2x2 + + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto begin = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto end = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto strides = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + + set_values(input, { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, + 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f + }); + set_values(begin, { + 0, 0, 0, 0 + }); + set_values(end, { + 2, 2, 2, 2 + }); + set_values(strides, { + 1, 1, 1, 1 + }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("input2", begin)); + topology.add(data("input3", end)); + topology.add(data("input4", strides)); + topology.add(strided_slice("strided_slice", "input", "input2", "input3", "input4", {}, {}, {}, {}, {})); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "strided_slice"); + + auto output = outputs.at("strided_slice").get_memory(); + + std::vector answers = { + 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f }; + + auto output_ptr = output.pointer(); + + for (size_t i = 0; i < answers.size(); ++i) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(strided_slice_gpu_f32, test_2x2x2x2_2) { + // Input (BFYX): 2x2x2x2 + // Begin (BFYX): 1x1x1x1 + // End (BFYX): 2x2x2x2 + // Stride (BFYX): 1x1x1x1 + // Output (BFYX): 1x1x1x1 + + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); + auto begin = 
memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto end = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto strides = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + + set_values(input, { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, + 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f + }); + set_values(begin, { + 1, 1, 1, 1 + }); + set_values(end, { + 2, 2, 2, 2 + }); + set_values(strides, { + 1, 1, 1, 1 + }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("input2", begin)); + topology.add(data("input3", end)); + topology.add(data("input4", strides)); + topology.add(strided_slice("strided_slice", "input", "input2", "input3", "input4", {}, {}, {}, {}, {})); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "strided_slice"); + + auto output = outputs.at("strided_slice").get_memory(); + + std::vector answers = { 15.f }; + + auto output_ptr = output.pointer(); + + for (size_t i = 0; i < answers.size(); ++i) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(strided_slice_gpu_f32, test_2x2x4x3) { + // Input (BFYX): 2x2x4x3 + // Begin (BFYX): 0x0x0x0 + // End (BFYX): 2x2x4x3 + // Stride (BFYX): 1x1x2x1 + // Output (BFYX): 2x2x2x3 + + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 3, 4 } }); + auto begin = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto end = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto strides = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + + set_values(input, { + 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, + 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f, + 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f, 25.f, 26.f, + 27.f, 28.f, 29.f, 30.f, 31.f, 32.f, 33.f, 34.f, 35.f, + 36.f, 37.f, 38.f, 39.f, 40.f, 41.f, 42.f, 43.f, 44.f, + 45.f, 46.f, 47.f + }); + set_values(begin, { + 0, 0, 0, 0 + }); + set_values(end, { + 2, 2, 4, 3 + }); + set_values(strides, { + 1, 1, 2, 1 + }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("input2", begin)); + topology.add(data("input3", end)); + topology.add(data("input4", strides)); + topology.add(strided_slice("strided_slice", "input", "input2", "input3", "input4", {}, {}, {}, {}, {})); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "strided_slice"); + + auto output = outputs.at("strided_slice").get_memory(); + + std::vector answers = { + 0.f, 1.f, 2.f, 6.f, 7.f, 8.f, 12.f, 13.f, 14.f, 18.f, 19.f, 20.f, + 24.f, 25.f, 26.f, 30.f, 31.f, 32.f, 36.f, 37.f, 38.f, 42.f, 43.f, 44.f + }; + + auto output_ptr = output.pointer(); + + for (size_t i = 0; i < answers.size(); ++i) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(strided_slice_gpu_f32, test_2x2x4x4) { + // Input (BFYX): 2x2x4x4 + // Begin (BFYX): 1x0x0x1 + // End (BFYX): 2x2x4x4 + // Stride (BFYX): 1x1x2x1 + // Output (BFYX): 1x2x2x3 + + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 4, 4 } }); +
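+ // begin/end/strides are passed as three extra i32 inputs holding one value per + // input dimension; the stride of 2 over the third dimension keeps every other + // row of the selected range, which is what shrinks the sliced 1x2x4x3 region + // down to the 1x2x2x3 output checked below.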
auto begin = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto end = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto strides = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + + set_values(input, { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, + 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, + 30.0f, 31.0f, 32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, + 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, 48.0f, 49.0f, + 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f, + 60.0f, 61.0f, 62.0f, 63.0f + }); + set_values(begin, { + 1, 0, 0, 1 + }); + set_values(end, { + 2, 2, 4, 4 + }); + set_values(strides, { + 1, 1, 2, 1 + }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("input2", begin)); + topology.add(data("input3", end)); + topology.add(data("input4", strides)); + topology.add(strided_slice("strided_slice", "input", "input2", "input3", "input4", {}, {}, {}, {}, {})); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "strided_slice"); + + auto output = outputs.at("strided_slice").get_memory(); + + std::vector answers = { + 33.f, 34.f, 35.f, 41.f, 42.f, 43.f, 49.f, 50.f, 51.f, 57.f, 58.f, 59.f + }; + + auto output_ptr = output.pointer(); + + for (size_t i = 0; i < answers.size(); ++i) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(strided_slice_gpu_f32, test_2x2x4x1_new_axis_mask) { + // Input (BFYX): 2x2x4x1 + // New_axis_mask: 1 + // Output (BFYX): 1x2x2x4 + + const auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 1, 4 } }); + auto begin = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto end = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto strides = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + + set_values(input, { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f + }); + set_values(begin, { + 1, 0, 1, 0 + }); + set_values(end, { + 2, 2, 4, 4 + }); + set_values(strides, { + 1, 1, 1, 2 + }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("input2", begin)); + topology.add(data("input3", end)); + topology.add(data("input4", strides)); + topology.add(strided_slice("strided_slice", "input", "input2", "input3", "input4", {}, {}, { 1 }, {})); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "strided_slice"); + + auto output = outputs.at("strided_slice").get_memory(); + + std::vector answers = { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f + }; + + auto output_ptr = output.pointer(); + + for (size_t i = 0; i < answers.size(); ++i) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} + +TEST(strided_slice_gpu_f32, test_2x2x1x1_new_axis_mask_2) { + // Input (BFYX): 2x2x1x1 + // New_axis_mask: 101 + // Output (BFYX): 1x2x1x2 + + const 
auto& engine = get_test_engine(); + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 1, 1 } }); + auto begin = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto end = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + auto strides = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } }); + + set_values(input, { + 0.0f, 1.0f, 2.0f, 3.0f + }); + set_values(begin, { + 1, 0, 1, 0 + }); + set_values(end, { + 2, 2, 4, 4 + }); + set_values(strides, { + 1, 1, 1, 2 + }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("input2", begin)); + topology.add(data("input3", end)); + topology.add(data("input4", strides)); + topology.add(strided_slice("strided_slice", "input", "input2", "input3", "input4", {}, {}, { 1, 0, 1 }, {})); + + network network(engine, topology); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "strided_slice"); + + auto output = outputs.at("strided_slice").get_memory(); + + std::vector answers = { + 0.0f, 1.0f, 2.0f, 3.0f + }; + + auto output_ptr = output.pointer(); + + for (size_t i = 0; i < answers.size(); ++i) + { + EXPECT_TRUE(are_equal(answers[i], output_ptr[i])); + } +} diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/tile_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/tile_gpu_test.cpp index 1edf8a9..0d49b04 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/tile_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/tile_gpu_test.cpp @@ -69,7 +69,7 @@ void tile_ref(const memory& input, memory& output, tile::tile_axis axis, int num } TEST(tile_gpu, basic_in1x2x2x2_axis_b) { - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 2, 2 } }); auto output_ref = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } }); @@ -99,7 +99,7 @@ TEST(tile_gpu, basic_in1x2x2x2_axis_b) { } TEST(tile_gpu, basic_in1x2x2x2_axis_f) { - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 2, 2 } }); auto output_ref = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 4, 2, 2 } }); @@ -132,7 +132,7 @@ TEST(tile_gpu, basic_in1x2x2x2_axis_f) { } TEST(tile_gpu, basic_in1x2x2x2_axis_y) { - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 2, 2 } }); auto output_ref = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 4, 2 } }); @@ -165,7 +165,7 @@ TEST(tile_gpu, basic_in1x2x2x2_axis_y) { } TEST(tile_gpu, basic_in1x2x2x2_axis_x) { - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 2, 2 } }); auto output_ref = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 2, 4 } }); @@ -197,7 +197,7 @@ TEST(tile_gpu, basic_in1x2x2x2_axis_x) { } TEST(tile_gpu, basic_in1x2x2x2_axis_x_dense) { - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 2, 1 } }); auto output_ref = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 2, 4 } }); diff --git 
a/inference-engine/thirdparty/clDNN/tests/test_cases/topology_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/topology_test.cpp index a5c06f2..3491933 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/topology_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/topology_test.cpp @@ -445,11 +445,11 @@ protected: cldnn::layout* output_layout; std::vector generator; - static cldnn::engine engine; + static const cldnn::engine& engine; static std::vector all_output_layouts;//just for tear-down }; -cldnn::engine topology_test::engine; +const cldnn::engine& topology_test::engine = tests::get_test_engine(); std::vector topology_test::all_output_layouts = {}; std::vector topology_test::topology_generator::layer_types = { diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/trim_to_outputs_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/trim_to_outputs_gpu_test.cpp new file mode 100644 index 0000000..428881f --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/trim_to_outputs_gpu_test.cpp @@ -0,0 +1,200 @@ +/* +// Copyright (c) 2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include "api/CPP/memory.hpp" +#include +#include "api/CPP/concatenation.hpp" +#include +#include +#include +#include +#include "test_utils/test_utils.h" + +using namespace cldnn; +using namespace tests; + +/* + This set of tests has been designed to check the correctness of trim_to_outputs optimization pass +*/ + + +/* + In this test we check if the convolution conv2 will be eliminated from the network. 
This is expected to be done in trim_to_outputs optimization pass + + Network structure: input -> conv1 (output) + \ + ---> conv2 (to be eliminated) +*/ +TEST(trim_to_outputs, one_node_to_eliminate_case1) { + const auto& engine = get_test_engine(); + build_options build_opt; + build_opt.set_option(cldnn::build_option::outputs({ "conv1" })); + build_opt.set_option(build_option::optimize_data(false)); // to avoid adding reorders + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 1, 1 } }); + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } }); + auto bias = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } }); + + set_values(input, { 1.1f }); + set_values(weights, { 2.1f }); + set_values(bias, { 1.6f }); + + std::vector out_data = { 3.91f }; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("weights", weights)); + topology.add(data("bias", bias)); + topology.add(cldnn::convolution("conv1", { "input" }, { "weights" }, { "bias" })); + topology.add(cldnn::convolution("conv2", { "input" }, { "weights" }, { "bias" })); + + network network(engine, topology, build_opt); + network.set_input_data("input", input); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), (size_t)1); // there is only one output + EXPECT_EQ(network.get_executed_primitives().size(), (size_t)2); // input and conv1 were executed + EXPECT_EQ(network.get_all_primitive_ids().size(), (size_t)4); // also bias and weights still exist + + for (auto& it : outputs) + { + auto output_ptr = it.second.get_memory().pointer(); + for (size_t cntr = 0; cntr < out_data.size(); cntr++) + { + EXPECT_NEAR(output_ptr[cntr], out_data[cntr], 1e-4); + } + EXPECT_EQ(it.first, "conv1"); + } +} + +/* +In this test we check if the convolution conv2 will be eliminated from the network.
This is expected to be done in trim_to_outputs optimization pass + +Network structure: input -> conv1 (output) + \ + ---> conv2 (to be eliminated along with its weights and bias) +*/ +TEST(trim_to_outputs, one_node_to_eliminate_case2) { + const auto& engine = get_test_engine(); + build_options build_opt; + build_opt.set_option(cldnn::build_option::outputs({ "conv1" })); + build_opt.set_option(build_option::optimize_data(false)); // to avoid adding reorders + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 1, 1 } }); + auto weights1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + auto weights2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + auto bias1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + auto bias2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + + set_values(input, { 1.1f }); + set_values(weights1, { 2.1f }); + set_values(bias1, { 1.6f }); + set_values(weights2, { 0.3f }); + set_values(bias2, { 0.2f }); + + std::vector out_data = { 3.91f }; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("weights1", weights1)); + topology.add(data("bias1", bias1)); + topology.add(cldnn::convolution("conv1", { "input" }, { "weights1" }, { "bias1" })); + topology.add(data("weights2", weights2)); + topology.add(data("bias2", bias2)); + topology.add(cldnn::convolution("conv2", { "input" }, { "weights2" }, { "bias2" })); + + network network(engine, topology, build_opt); + network.set_input_data("input", input); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), (size_t)1); // there is only one output + EXPECT_EQ(network.get_executed_primitives().size(), (size_t)2); // input and conv1 were executed + EXPECT_EQ(network.get_all_primitive_ids().size(), (size_t)4); // also bias1 and weights1 still exist + + for (auto& it : outputs) + { + auto output_ptr = it.second.get_memory().pointer(); + + for (size_t cntr = 0; cntr < out_data.size(); cntr++) + { + EXPECT_NEAR(output_ptr[cntr], out_data[cntr], 1e-4); + } + EXPECT_EQ(it.first, "conv1"); + } +} + +/* +In this test we check if the convolutions conv2 and conv3 will be eliminated from the network. This is expected to be done in trim_to_outputs optimization pass + +Network structure: input ---> conv1 --- ---> conv4 (output) + \ + ---> conv2 ---> conv3 +Convolutions conv2, conv3 should be optimized out along with weights23 shared by conv2 and conv3.
+*/ +TEST(trim_to_outputs, two_nodes_to_eliminate_case1) { + const auto& engine = get_test_engine(); + build_options build_opt; + build_opt.set_option(cldnn::build_option::outputs({ "conv4" })); + build_opt.set_option(build_option::optimize_data(false)); // to avoid adding reorders + + auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 1, 1 } }); + auto weights1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + auto weights23 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + auto weights4 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + auto bias = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); + + set_values(input, { 1.1f }); + set_values(weights1, { 2.1f }); + set_values(weights23, { 3.0f }); + set_values(weights4, { 2.0f }); + set_values(bias, { 1.6f }); + + std::vector out_data = { 9.42f }; + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("weights1", weights1)); + topology.add(data("bias", bias)); + topology.add(cldnn::convolution("conv1", { "input" }, { "weights1" }, { "bias" })); + topology.add(data("weights23", weights23)); + topology.add(cldnn::convolution("conv2", { "input" }, { "weights23" }, { "bias" })); + topology.add(cldnn::convolution("conv3", { "conv2" }, { "weights23" }, { "bias" })); + topology.add(data("weights4", weights4)); + topology.add(cldnn::convolution("conv4", { "conv1" }, { "weights4" }, { "bias" })); + + network network(engine, topology, build_opt); + network.set_input_data("input", input); + auto outputs = network.execute(); + + EXPECT_EQ(outputs.size(), (size_t)1); // there is only one output + EXPECT_EQ(network.get_executed_primitives().size(), (size_t)3); // input, conv1 and conv4 were executed + EXPECT_EQ(network.get_all_primitive_ids().size(), (size_t)6); // also bias, weights1 and weights4 still exist + + for (auto& it : outputs) + { + auto output_ptr = it.second.get_memory().pointer(); + + for (size_t cntr = 0; cntr < out_data.size(); cntr++) + { + EXPECT_NEAR(output_ptr[cntr], out_data[cntr], 1e-4); + } + EXPECT_EQ(it.first, "conv4"); + } +} + diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/upsampling_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/upsampling_gpu_test.cpp index eceece2..d0d76a8 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/upsampling_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/upsampling_gpu_test.cpp @@ -41,7 +41,7 @@ TEST(upsampling_gpu, basic_in2x3x2x2_nearest) { // f1: b0: 7 8 -16 b1: 12 9 -17 // - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 3, 2 } }); @@ -112,7 +112,7 @@ TEST(upsampling_gpu, basic_in2x3x2x2_bilinear) { // f0: b0: 3 4 // - engine engine; + const auto& engine = get_test_engine(); auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } }); diff --git a/inference-engine/thirdparty/clDNN/tests/test_utils/instrumentation.cpp b/inference-engine/thirdparty/clDNN/tests/test_utils/instrumentation.cpp index e9c5ba2..e90744f 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_utils/instrumentation.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_utils/instrumentation.cpp @@ -22,6 +22,7 @@ #include #include + namespace instrumentation { // initalize dumping directory for whole run const std::string logger::dump_dir =
DUMP_DIRECTORY; @@ -303,7 +304,7 @@ namespace instrumentation { auto i_size = mem_arg.size.batch[0]; //batch = input feature map auto x_size = mem_arg.size.spatial[0]; // spatial_x = output feature map auto weights_size = mem_arg.size.count(); - int xsv = 8, bsv = 8; + int xsv = 8, bsv = 8; unsigned int input_it = 0, input_i_it= 0 , input_o_it = 0; for (cldnn::tensor::value_type it = 0; it < weights_size; it++) { @@ -371,9 +372,10 @@ namespace instrumentation { } template - void dump(const cldnn::memory& mem, std::vector<std::vector<std::stringstream>>& streams) + void dump(const cldnn::memory& mem, std::vector<std::vector<std::string>>& dump_strings) { auto mem_ptr = mem.pointer(); + std::stringstream stream; auto&& pitches = mem.get_layout().get_pitches(); auto&& size = mem.get_layout().size; @@ -386,39 +388,40 @@ namespace instrumentation { for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x) { unsigned int input_it = b*pitches.batch[0] + f*pitches.feature[0] + y*pitches.spatial[1] + x*pitches.spatial[0]; - streams[b][f] << convert_element(mem_ptr[input_it]) << " "; + stream << convert_element(mem_ptr[input_it]) << " "; input_it++; } - streams[b][f] << std::endl; + stream << std::endl; + dump_strings[b][f] = stream.str(); + stream.str(""); // reset the shared stream so each [b][f] slot receives only its own data } } } } void logger::log_memory_to_file(const cldnn::memory& mem, std::string prefix, bool single_batch, cldnn::tensor::value_type batch_id, bool single_feature, cldnn::tensor::value_type feature_id) - { + { auto batch = mem.get_layout().size.batch[0]; auto feature = mem.get_layout().size.feature[0]; auto eng_type = "gpu" ; - std::vector<std::vector<std::stringstream>> streams(batch); + std::vector<std::vector<std::string>> dump_strings(batch); for(cldnn::tensor::value_type b = 0; b < batch; b++) { - streams[b].resize(feature); + dump_strings[b].resize(feature); } if (mem.get_layout().data_type == cldnn::data_types::f32) - dump(mem, streams); + dump(mem, dump_strings); else - dump(mem, streams); + dump(mem, dump_strings); for (cldnn::tensor::value_type b = 0; b < batch; b++) for (cldnn::tensor::value_type f = 0; f < feature; f++) { - if ((!single_batch || b == batch_id) && (!single_feature || f == feature_id)) + if (!single_batch || (b == batch_id && f == feature_id)) { std::string filename((dump_dir + "/" + prefix + "_" + eng_type + "_b" + std::to_string(b) + "_f" + std::to_string(f) + ".txt")); - std::ofstream file_stream = std::ofstream(filename, std::ios::out); - file_stream << streams[b][f].str(); + std::ofstream file_stream(filename); + file_stream << dump_strings[b][f]; file_stream.close(); } } diff --git a/inference-engine/thirdparty/clDNN/tests/test_utils/test_utils.cpp b/inference-engine/thirdparty/clDNN/tests/test_utils/test_utils.cpp index ddc7467..b46bedf 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_utils/test_utils.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_utils/test_utils.cpp @@ -77,7 +77,7 @@ namespace tests { values.push_back(static_cast(multipler + j)); } - tests::set_values_per_batch_and_feature(input_mems[i], generic_params->input_layouts[i], values); + tests::set_values_per_batch_and_feature(input_mems[i], values); multipler = values.size(); } else @@ -87,7 +87,7 @@ namespace tests { values.push_back(FLOAT16(static_cast(multipler + j))); } - tests::set_values_per_batch_and_feature(input_mems[i], generic_params->input_layouts[i], values); + tests::set_values_per_batch_and_feature(input_mems[i], values); multipler = values.size(); } } @@ -276,7 +276,7 @@ namespace tests return{ p, calc_offfset(layout, p) }; } - size_t generic_test::get_linear_index(const layout & layout, size_t b, size_t f, size_t y, size_t x, const
memory_desc& desc) + size_t generic_test::get_linear_index(const layout&, size_t b, size_t f, size_t y, size_t x, const memory_desc& desc) { return desc.offset + @@ -309,7 +309,9 @@ namespace tests //{ format::yx,{ 8,8 } } , { format::yx,{ 9,9 } } , { format::yx,{ 10,10 } } , { format::yx,{ 11,11 } } , { format::yx,{ 12,12 } } , { format::yx,{ 13,13 } } , //{ format::yx,{ 14,14 } } , { format::yx,{ 15,15 } } , { format::yx,{ 16,16 } } }; - for (cldnn::data_types data_type : test_data_types()) + auto data_types = test_data_types(); + + for (cldnn::data_types data_type : data_types) { for (cldnn::format fmt : test_input_formats) { @@ -329,6 +331,12 @@ namespace tests return all_generic_params; } + const cldnn::engine & get_test_engine() + { + static const cldnn::engine engine; + return engine; + } + const std::string test_dump::name() const { std::string temp = name_str; @@ -377,8 +385,7 @@ namespace tests std::vector result; result.push_back(cldnn::data_types::f32); - cldnn::engine temp; - if(temp.get_info().supports_fp16) + if(get_test_engine().get_info().supports_fp16) { result.push_back(cldnn::data_types::f16); } @@ -390,4 +397,4 @@ namespace tests std::vector generic_test::test_feature_sizes = { 1, 2 };// , 3, 15}; std::vector generic_test::test_input_sizes = { { 1, 1, 100, 100 } ,{ 1, 1, 277, 277 } ,{ 1, 1, 400, 600 } }; -} +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/tests/test_utils/test_utils.h b/inference-engine/thirdparty/clDNN/tests/test_utils/test_utils.h index aed2146..62892df 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_utils/test_utils.h +++ b/inference-engine/thirdparty/clDNN/tests/test_utils/test_utils.h @@ -21,6 +21,7 @@ #include "api/CPP/memory.hpp" #include "api/CPP/tensor.hpp" #include "api/CPP/program.hpp" +#include "api/CPP/network.hpp" #include #include #include @@ -40,6 +41,8 @@ #include "api/CPP/activation.hpp" #include "api/CPP/pooling.hpp" +#include + #define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) namespace tests { @@ -79,7 +82,7 @@ inline VF flatten_4d(cldnn::format input_format, VVVVF &data) { size_t b = data[0].size(); size_t c = data[0][0].size(); size_t d = data[0][0][0].size(); - VF vec(a * b * c * d, 0.0f); + VF vec(a * b * c * d, (T)(0.0f)); size_t idx = 0; switch (input_format.value) { @@ -91,6 +94,14 @@ inline VF flatten_4d(cldnn::format input_format, VVVVF &data) { vec[idx++] = data[bi][fi][yi][xi]; break; + case cldnn::format::fyxb: + for (size_t fi = 0; fi < b; ++fi) + for (size_t yi = 0; yi < c; ++yi) + for (size_t xi = 0; xi < d; ++xi) + for (size_t bi = 0; bi < a; ++bi) + vec[idx++] = data[bi][fi][yi][xi]; + break; + case cldnn::format::bfyx: for (size_t bi = 0; bi < a; ++bi) for (size_t fi = 0; fi < b; ++fi) @@ -183,7 +194,7 @@ void set_values(const cldnn::memory& mem, std::vector args) { } template -void set_values_per_batch_and_feature(const cldnn::memory& mem, const cldnn::layout& layout, std::vector args) +void set_values_per_batch_and_feature(const cldnn::memory& mem, std::vector args) { auto mem_ptr = mem.pointer(); auto&& pitches = mem.get_layout().get_pitches(); @@ -219,6 +230,24 @@ void set_random_values(const cldnn::memory& mem, bool sign = false, unsigned sig } +// Tries to construct a network, checking if an expected error appears +inline void check_exception_massage(const cldnn::engine& engine, cldnn::topology& topology, std::string msg_to_find) +{ + try { + cldnn::network(engine, topology); + } + catch (std::exception & exc) { + std::string msg(exc.what()); + if (msg.find(msg_to_find) 
!= std::string::npos) { + throw; + } + else { + printf("%s\n", exc.what()); + } + } +} + + // Checks equality of floats. // For values less than absoulte_error_limit, absolute error will be counted // for others, the relatve error will be counted. @@ -318,6 +347,8 @@ struct memory_desc size_t offset; }; +const cldnn::engine & get_test_engine(); + struct test_dump { const std::string name() const; @@ -358,7 +389,7 @@ public: }; protected: - cldnn::engine engine; + const cldnn::engine& engine = get_test_engine(); test_params* generic_params; test_dump test_info; cldnn::primitive* layer_params; @@ -422,7 +453,9 @@ inline void PrintTupleTo(const std::tuplepooled_width << " Pooled height: " << p->pooled_height << " Spatial scale: " << p->spatial_scale - << " Group size: " << p->group_sz; + << " Spatial bins x: " << p->spatial_bins_x + << " Spatial bins y: " << p->spatial_bins_y + << " Output dim: " << p->output_dim; } else if(primitive->type == cldnn::scale::type_id()) { @@ -437,7 +470,7 @@ inline void PrintTupleTo(const std::tupletype == cldnn::reorder::type_id()) { auto reorder = static_cast(primitive); - str << "Output data type: " << cldnn::data_type_traits::name(reorder->output_data_type) << " Mean: " << reorder->mean << "Subtract per feature: " << "TODO" /*std::vector subtract_per_feature*/; + str << "Output data type: " << cldnn::data_type_traits::name(*reorder->output_data_type) << " Mean: " << reorder->mean << "Subtract per feature: " << "TODO" /*std::vector subtract_per_feature*/; } else if (primitive->type == cldnn::normalize::type_id()) { diff --git a/inference-engine/thirdparty/clDNN/tests_core_internal/CMakeLists.txt b/inference-engine/thirdparty/clDNN/tests_core_internal/CMakeLists.txt new file mode 100644 index 0000000..cdc5811 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests_core_internal/CMakeLists.txt @@ -0,0 +1,311 @@ +# Copyright (c) 2019 Intel Corporation + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ====================================== Helper constant variables ===================================== + +# Order of scan for special capabilities files (.inc files with capabilities description). 
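+# The first directory in this order that contains at least one .inc file is
+# selected (see the scan loop below), so "private" capabilities override
+# "internal", which in turn overrides "public".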
+set(CLDNN__CAPS_SCAN_ORDER + "private" + "internal" + "public" + ) + +# ========================================= Name / Output settings ===================================== + +set(CLDNN_BUILD__PROJ "tests_core_internal") +set(CLDNN_BUILD__PROJ_LABEL "${CLDNN_BUILD__PROJ}") +set(CLDNN_BUILD__PROJ_OUTPUT_NAME "${CLDNN_BUILD__PROJ}${CLDNN__OUT_CPU_SUFFIX}") + +# =========================================== Compiler options ========================================= +intel_config_flag_apply_settings(CompilerOptions CMAKE_CXX_FLAGS ALL_PATTERN "" + SET + StandardCxx11 + RttiEnabled + ) + +if (NOT MSVC) + intel_config_flag_apply_settings(CompilerOptions CMAKE_CXX_FLAGS ALL_PATTERN "" + SET_RAW + "-Wno-error=conversion-null" + "-Wno-error=type-limits" + "-Wno-error=unused-variable" + ) +endif () + +find_package(OpenMP) +if (OPENMP_FOUND) + add_definitions(-DOPENMP_FOUND) + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +endif() + +# ================================== Compiler preprocessor definitions ================================= + +set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS + CLDNN_EXPORTS + EXPORT_NEURAL_SYMBOLS + "CLDNN_VERSION_MAJOR=${CLDNN__VERSION_MAJOR}" + "CLDNN_VERSION_MINOR=${CLDNN__VERSION_MINOR}" + "CLDNN_VERSION_BUILD=${CLDNN__VERSION_BUILD}" + "CLDNN_VERSION_REVISION=${CLDNN__VERSION_REVISION}" + ) + + +# ========================================= Source/Header files ======================================== + +set(__CLDNN_Directory__clDNN_copy "${CMAKE_CURRENT_SOURCE_DIR}/../src") +set(__CLDNN_Label__clDNN_copy "clDNN") +file(GLOB __CLDNN_Sources__clDNN_copy + "${__CLDNN_Directory__clDNN_copy}/*.h" + "${__CLDNN_Directory__clDNN_copy}/*.hpp" + "${__CLDNN_Directory__clDNN_copy}/*.cpp" + "${__CLDNN_Directory__clDNN_copy}/*.inc" + ) + +set(__CLDNN_Label__api "${__CLDNN_Label__clDNN_copy}\\api") +file(GLOB __CLDNN_Headers__api + "${CLDNN__API_DIR}/*.h" + "${CLDNN__API_DIR}/*.hpp" + ) + +set(__CLDNN_Directory__api__cpp "${CLDNN__API_DIR}/CPP") +set(__CLDNN_Label__api__cpp "${__CLDNN_Label__api}\\CPP") +file(GLOB __CLDNN_Headers__api__cpp + "${__CLDNN_Directory__api__cpp}/*.h" + "${__CLDNN_Directory__api__cpp}/*.hpp" + ) + +set(__CLDNN_Directory__api__c "${CLDNN__API_DIR}/C") +set(__CLDNN_Label__api__c "${__CLDNN_Label__api}\\C") +file(GLOB __CLDNN_Headers__api__c + "${__CLDNN_Directory__api__c}/*.h" + "${__CLDNN_Directory__api__c}/*.hpp" + ) + +set(__CLDNN_Label__api_extension "${__CLDNN_Label__clDNN_copy}\\api_extension") +file(GLOB __CLDNN_Headers__api_extension + "${CLDNN__API_EXTENSION_DIR}/*.h" + "${CLDNN__API_EXTENSION_DIR}/*.hpp" + ) + +set(__CLDNN_Directory__api_extension__cpp "${CLDNN__API_EXTENSION_DIR}/CPP") +set(__CLDNN_Label__api_extension__cpp "${__CLDNN_Label__api_extension}\\CPP") +file(GLOB __CLDNN_Headers__api_extension__cpp + "${__CLDNN_Directory__api_extension__cpp}/*.h" + "${__CLDNN_Directory__api_extension__cpp}/*.hpp" + ) + +set(__CLDNN_Directory__api_extension__c "${CLDNN__API_EXTENSION_DIR}/C") +set(__CLDNN_Label__api_extension__c "${__CLDNN_Label__api_extension}\\C") +file(GLOB __CLDNN_Headers__api_extension__c + "${__CLDNN_Directory__api_extension__c}/*.h" + "${__CLDNN_Directory__api_extension__c}/*.hpp" + ) + +set(__CLDNN_Label__main "") +file(GLOB __CLDNN_Sources__main + "${CMAKE_CURRENT_SOURCE_DIR}/*.h" + "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp" + "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" + ) + +set(__CLDNN_Directory__graph_opt 
"${CMAKE_CURRENT_SOURCE_DIR}/../src/graph_optimizer") +set(__CLDNN_Label__graph_opt "${__CLDNN_Label__clDNN_copy}\\graph_optimizer") +file(GLOB __CLDNN_Sources__graph_opt + "${__CLDNN_Directory__graph_opt}/*.h" + "${__CLDNN_Directory__graph_opt}/*.hpp" + "${__CLDNN_Directory__graph_opt}/*.cpp" + ) + +set(__CLDNN_Directory__include "${CMAKE_CURRENT_SOURCE_DIR}/../src/include") +set(__CLDNN_Label__include "${__CLDNN_Label__clDNN_copy}\\include") +file(GLOB __CLDNN_Headers__include + "${__CLDNN_Directory__include}/*.h" + "${__CLDNN_Directory__include}/*.hpp" + ) + +set(__CLDNN_Directory__test_cases "${CMAKE_CURRENT_SOURCE_DIR}/test_cases") +set(__CLDNN_Label__test_cases "test cases") +file(GLOB __CLDNN_Sources__test_cases + "${__CLDNN_Directory__test_cases}/*.h" + "${__CLDNN_Directory__test_cases}/*.hpp" + "${__CLDNN_Directory__test_cases}/*.cpp" + ) + +set(__CLDNN_Directory__test_utils "${CMAKE_CURRENT_SOURCE_DIR}/../tests/test_utils") +set(__CLDNN_Label__test_utils "test utils") +file(GLOB __CLDNN_Sources__test_utils + "${__CLDNN_Directory__test_utils}/*.h" + "${__CLDNN_Directory__test_utils}/*.hpp" + "${__CLDNN_Directory__test_utils}/*.cpp" + ) + +set(__CLDNN_Directory__gtest "${CLDNN__GTEST_DIR}") +set(__CLDNN_Label__gtest "google test framework") +file(GLOB __CLDNN_Sources__gtest + "${__CLDNN_Directory__gtest}/*.cc" + ) + +# Special handling of capabilities files. +set(__CLDNN_Directory__caps "${CMAKE_CURRENT_SOURCE_DIR}/../src/caps") +set(__CLDNN_Label__caps "${__CLDNN_Label__clDNN_copy}\\caps") +foreach(__CLDNN_CapsScanDir ${CLDNN__CAPS_SCAN_ORDER}) + string(REPLACE ";" "\;" __CLDNN_CapsScanDir "${__CLDNN_CapsScanDir}") # [WA#1] Must escape ; again if occurred in item. + file(GLOB __CLDNN_Sources__caps "${__CLDNN_Directory__caps}/${__CLDNN_CapsScanDir}/*.inc") + list(LENGTH __CLDNN_Sources__caps __CLDNN_CapsScanDirFileCount) + if(__CLDNN_CapsScanDirFileCount GREATER 0) + set(__CLDNN_IncDirectory__caps "${__CLDNN_Directory__caps}/${__CLDNN_CapsScanDir}") + message(STATUS "[clDNN] Selected capabilities: ${__CLDNN_CapsScanDir}") + break() + endif() +endforeach() +if(NOT (__CLDNN_CapsScanDirFileCount GREATER 0)) + message(FATAL_ERROR "[clDNN] Cannot locate any capabilities files in \"${__CLDNN_Directory__caps}\" subdirectories.") +endif() +unset(__CLDNN_CapsScanDir) +unset(__CLDNN_CapsScanDirFileCount) + +set(__CLDNN_Directory__gpu "${CMAKE_CURRENT_SOURCE_DIR}/../src/gpu") +set(__CLDNN_Label__gpu "${__CLDNN_Label__clDNN_copy}\\gpu") +file(GLOB __CLDNN_Sources__gpu + "${__CLDNN_Directory__gpu}/*.h" + "${__CLDNN_Directory__gpu}/*.hpp" + "${__CLDNN_Directory__gpu}/*.cpp" + "${__CLDNN_Directory__gpu}/*.inc" + ) + +set(__CLDNN_Directory__cache "${__CLDNN_Directory__gpu}/cache") +set(__CLDNN_Label__cache "${__CLDNN_Label__gpu}\\cache") +file(GLOB __CLDNN_Sources__cache + "${__CLDNN_Directory__cache}/*.h" + "${__CLDNN_Directory__cache}/*.hpp" + "${__CLDNN_Directory__cache}/*.cpp" + ) + +set(__CLDNN_Directory__ch_kernels "${__CLDNN_Directory__cache}/kernels") +set(__CLDNN_Label__ch_kernels "${__CLDNN_Label__cache}\\kernels") +file(GLOB __CLDNN_Sources__ch_kernels + "${__CLDNN_Directory__ch_kernels}/*.cl" + ) + +set(__CLDNN_Directory__cg_cache "${CLDNN__CODEGEN_INCDIR}") +set(__CLDNN_CGDirectory__cg_cache "${CLDNN__CODEGEN_DIR}/cache") +set(__CLDNN_Label__cg_cache "${__CLDNN_Label__cache}\\codegen") + +set(__CLDNN_Directory__ks_main "${CLDNN__KERNEL_SELECTOR_DIR}") +set(__CLDNN_Directory__ks_core "${CLDNN__KERNEL_SELECTOR_DIR}/core") +set(__CLDNN_Directory__ks_common 
"${CLDNN__KERNEL_SELECTOR_DIR}/common") +set(__CLDNN_Directory__ks_core_common "${__CLDNN_Directory__ks_core}/common") +set(__CLDNN_Directory__ks_actual_kernels "${__CLDNN_Directory__ks_core}/actual_kernels") +set(__CLDNN_Directory__ks_cache "${__CLDNN_Directory__ks_core}/cache") + + +set(__CLDNN_AllSources + ${__CLDNN_Sources__clDNN_copy} + ${__CLDNN_Headers__api} + ${__CLDNN_Sources__graph_opt} + ${__CLDNN_Headers__include} + ${__CLDNN_Sources__caps} + ${__CLDNN_Headers__api__cpp} + ${__CLDNN_Headers__api__c} + ${__CLDNN_Headers__api_extension} + ${__CLDNN_Headers__api_extension__c} + ${__CLDNN_Headers__api_extension__cpp} + ${__CLDNN_Sources__main} + ${__CLDNN_Sources__gpu} + ${__CLDNN_Sources__cache} + ${__CLDNN_Sources__ch_kernels} + ${__CLDNN_Sources__cg_cache} + ${__CLDNN_Sources__test_cases} + ${__CLDNN_Sources__test_utils} + ${__CLDNN_Sources__gtest} + ) +# Helping with some generators. +set_property(SOURCE ${__CLDNN_Sources__cg_cache} PROPERTY GENERATED TRUE) + + +# =============================================== Filters ============================================== + +source_group("${__CLDNN_Label__api}" FILES ${__CLDNN_Headers__api}) +source_group("${__CLDNN_Label__api__cpp}" FILES ${__CLDNN_Headers__api__cpp}) +source_group("${__CLDNN_Label__api__c}" FILES ${__CLDNN_Headers__api__c}) +source_group("${__CLDNN_Label__api_extension}" FILES ${__CLDNN_Headers__api_extension}) +source_group("${__CLDNN_Label__api_extension__cpp}" FILES ${__CLDNN_Headers__api_extension__cpp}) +source_group("${__CLDNN_Label__api_extension__c}" FILES ${__CLDNN_Headers__api_extension__c}) +source_group("${__CLDNN_Label__include}" FILES ${__CLDNN_Headers__include}) +source_group("${__CLDNN_Label__graph_opt}" FILES ${__CLDNN_Sources__graph_opt}) +source_group("${__CLDNN_Label__caps}" FILES ${__CLDNN_Sources__caps}) +source_group("${__CLDNN_Label__main}" FILES ${__CLDNN_Sources__main}) +source_group("${__CLDNN_Label__gpu}" FILES ${__CLDNN_Sources__gpu}) +source_group("${__CLDNN_Label__cache}" FILES ${__CLDNN_Sources__cache}) +source_group("${__CLDNN_Label__ch_kernels}" FILES ${__CLDNN_Sources__ch_kernels}) +source_group("${__CLDNN_Label__cg_cache}" FILES ${__CLDNN_Sources__cg_cache}) +source_group("${__CLDNN_Label__test_cases}" FILES ${__CLDNN_Sources__test_cases}) +source_group("${__CLDNN_Label__test_utils}" FILES ${__CLDNN_Sources__test_utils}) +source_group("${__CLDNN_Label__gtest}" FILES ${__CLDNN_Sources__gtest}) + + +# ===================================== Include/Link directories ======================================= + +include_directories( + "${CLDNN__MAIN_DIR}" + "${CLDNN__MAIN_DIR}/src" + "${CLDNN__GTEST_DIR}" + "${__CLDNN_Directory__test_utils}" + "${CMAKE_CURRENT_SOURCE_DIR}" + "${__CLDNN_Directory__include}" + "${__CLDNN_IncDirectory__caps}" + "${__CLDNN_Directory__ks_core}" + "${__CLDNN_Directory__ks_core}/common" + "${__CLDNN_Directory__ks_actual_kernels}" + "${__CLDNN_Directory__ks_common}" + "${__CLDNN_Directory__gpu}" + ) + +# =================================== Link targets and dependencies ==================================== + +# Tests executable. 
+add_executable("${CLDNN_BUILD__PROJ}" + ${__CLDNN_AllSources} + ) + +set_property(TARGET "${CLDNN_BUILD__PROJ}" PROPERTY PROJECT_LABEL "${CLDNN_BUILD__PROJ_LABEL}") +set_property(TARGET "${CLDNN_BUILD__PROJ}" PROPERTY OUTPUT_NAME "${CLDNN_BUILD__PROJ_OUTPUT_NAME}") + + +# Set library dependencies +target_link_libraries("${CLDNN_BUILD__PROJ}" + # "${CLDNN_BUILD__PROJ__clDNN}" + OpenCL + cldnn_kernel_selector + ) + +if(WIN32) + target_link_libraries("${CLDNN_BUILD__PROJ}" setupapi) +elseif((NOT ANDROID) AND (UNIX)) + target_link_libraries("${CLDNN_BUILD__PROJ}" pthread) +endif() +target_link_libraries("${CLDNN_BUILD__PROJ}" ${CLDNN__SYSTEM_LINK_LIBRARIES}) + +# =================================== Custom pre- and post-steps ======================================= + +if(CLDNN__RUN_TESTS) + add_custom_command(TARGET "${CLDNN_BUILD__PROJ}" POST_BUILD + WORKING_DIRECTORY "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}" + COMMAND "${CLDNN_BUILD__PROJ}" + COMMENT "Executing tests..." + ) +endif() + +# ====================================================================================================== diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_ICL.cpp b/inference-engine/thirdparty/clDNN/tests_core_internal/main.cpp similarity index 70% rename from inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_ICL.cpp rename to inference-engine/thirdparty/clDNN/tests_core_internal/main.cpp index a5e90ad..02fbc7d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cache/cache_ICL.cpp +++ b/inference-engine/thirdparty/clDNN/tests_core_internal/main.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2019 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,13 +14,10 @@ // limitations under the License. */ -#include "auto_tuner.h" -#include "auto_tuner_offline.h" -namespace kernel_selector +#include "gtest/gtest.h" + +int main(int argc, char* argv[]) { - // ICL_GT2 - void tuning_cache_8A52(tuning_data& td) - { - tuning_cache_8A52_B1_B16(td); - } + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); } \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/tests_core_internal/program_impl_wrapper.h b/inference-engine/thirdparty/clDNN/tests_core_internal/program_impl_wrapper.h new file mode 100644 index 0000000..ddb9a1c --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests_core_internal/program_impl_wrapper.h @@ -0,0 +1,32 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +namespace cldnn +{ + struct program_node; + struct program_impl; + // This class is intended to allow using private methods of program_impl within the tests_core_internal project. + // More method wrappers should be added here as needed.
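+ // A sketch of the intended use from a test (see graph_manipulation_gpu_test.cpp),
+ // assuming prog, weights_node and new_conv have already been obtained via
+ // build_program() and std::make_shared beforehand:
+ // program_impl_wrapper::add_connection(*prog,
+ //                                      prog->get_or_create(weights_node),
+ //                                      prog->get_or_create(new_conv));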
+ class program_impl_wrapper + { + public: + static void add_connection(program_impl& p, program_node& prev, program_node& next) + { + p.add_connection(prev, next); + } + }; + +} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/tests_core_internal/test_cases/graph_manipulation_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests_core_internal/test_cases/graph_manipulation_gpu_test.cpp new file mode 100644 index 0000000..4a02d5c --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests_core_internal/test_cases/graph_manipulation_gpu_test.cpp @@ -0,0 +1,203 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include + +#include + +#include "program_impl.h" +#include "api_impl.h" +#include "topology_impl.h" +#include "engine_impl.h" +#include "memory_impl.h" +#include "data_inst.h" +#include "activation_inst.h" +#include "convolution_inst.h" +#include "crop_inst.h" +#include "network_impl.h" +#include "reshape_inst.h" +#include "pass_manager.h" + +#include "test_utils.h" +#include "program_impl_wrapper.h" + +using namespace cldnn; +using namespace ::tests; + +/* Basic test to show how the program can be built and run within internal tests + in a similar way as it is done in tests utilizing the clDNN API */ +TEST(basic, test1) { + const auto& engine = get_test_engine(); + build_options build_opt; + build_opt.set_option(build_option::optimize_data(true)); + + auto input = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 2 } }); + auto weights1 = memory::allocate(engine, { data_types::f16, format::yxfb,{ 1, 1, 2, 1 } }); + auto weights2 = memory::allocate(engine, { data_types::f32, format::byxf,{ 1, 1, 1, 2 } }); + + set_values(input, { FLOAT16(1.1f), FLOAT16(1.2f), FLOAT16(1.3f), FLOAT16(1.4f) }); + set_values(weights1, { FLOAT16(2.1f), FLOAT16(3.1f) }); + set_values(weights2, { 1.1f, 0.1f }); + + topology topology; + topology.add(input_layout("input", input.get_layout())); + topology.add(data("weights1", weights1)); + topology.add(data("weights2", weights2)); + topology.add(reshape("reshape1", "weights1", tensor(spatial(1, 2)))); + topology.add(reorder("reorder2", "input", layout(data_types::f32, format::byxf, 4))); + topology.add(reorder("reorder1", "reshape1", layout(data_types::f32, format::byxf, 4))); + topology.add(concatenation("concat", { "reorder1", "weights2" }, concatenation::along_x)); + topology.add(convolution("conv2", { "reorder2" }, { "concat" })); + + program_impl::ptr prog = api_cast(engine.get())->build_program(*api_cast(topology.get()), build_opt, false); + cldnn::refcounted_obj_ptr<network_impl> net = api_cast(engine.get())->allocate_network(*prog); + network network = api_cast(net.get()); + + network.set_input_data("input", input); + + auto outputs = network.execute(); + + float epsilon = 1e-2f; + for (auto& it : outputs) + { + auto output = it.second.get_memory().pointer<float>(); + EXPECT_NEAR(7.8f, output[0], epsilon); + } +} + +/* + This test creates a program without optimization passes,
even the compilation is being run manually. + Thus, a single method from program_impl like add_intermediate might be tested separately. +*/ +TEST(add_intermediate_gpu, test1) +{ + build_options build_opt; + topology topology; + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx, {2, 2, 2, 2} }); + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, {2, 2, 2, 2} }); + auto weights2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 1, 1 } }); + + set_values(input, { (1.1f), (1.2f), (1.3f), (1.4f), + (2.1f), (2.2f), (2.3f), (2.4f), + (3.1f), (3.2f), (3.3f), (3.4f), + (4.1f), (4.2f), (4.3f), (4.4f) }); + set_values(weights, { (1.5f), (1.6f), (1.7f), (1.8f), + (2.5f), (2.6f), (2.7f), (2.8f), + (3.5f), (3.6f), (3.7f), (3.8f), + (4.5f), (4.6f), (4.7f), (4.8f) }); + + set_values(weights2, { (5.5f), (5.6f), (5.7f), (5.8f) }); + topology.add(input_layout("input", input.get_layout())); + topology.add(data("weights", weights)); + topology.add(data("weights2", weights2)); + topology.add(cldnn::convolution("conv1a", { "input" }, { "weights" })); + topology.add(cldnn::convolution("conv1b", { "input" }, { "weights" })); + topology.add(cldnn::convolution("conv2a", { "conv1a" }, { "weights2" })); + auto new_reorder = std::make_shared<reorder>("reorder", "nothing", input.get_layout()); + program_impl::ptr prog = api_cast(engine.get())->build_program(*api_cast(topology.get()), build_opt, false, true); + prog->add_intermediate(new_reorder, prog->get_node("conv1a"), 0); + prog->dump_program("custom_dump", true); + + pass_manager pm; + compile_graph compile_graph_pass; + pm.run(*prog, compile_graph_pass); + + cldnn::refcounted_obj_ptr<network_impl> net = api_cast(engine.get())->allocate_network(*prog); + network network = api_cast(net.get()); + network.set_input_data("input", input); + auto outputs = network.execute(); + + std::vector<float> expected_output_vec = { + 32.2f, 60.2f, 66.6f, 126.6f, + 514.22f, 532.7f, 1075.26f, 1113.9f + }; + + uint32_t output_size = 4; + uint32_t output_index = 0; + for (auto& it : outputs) + { + auto output = it.second.get_memory().pointer<float>(); + for (uint32_t x = 0; x < output_size; x++) + { + EXPECT_FLOAT_EQ(expected_output_vec[x+output_size*output_index], output[x]); + } + output_index++; + } +} + +/* This test shows how to use private members (here: add_connection) of program_impl using program_impl_wrapper */ +TEST(add_intermediate_gpu, test2) +{ + build_options build_opt; + topology topology; + engine engine; + + auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } }); + auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 2, 2 } }); + auto weights2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 2, 1, 1 } }); + + set_values(input, { (1.1f), (1.2f), (1.3f), (1.4f), + (2.1f), (2.2f), (2.3f), (2.4f), + (3.1f), (3.2f), (3.3f), (3.4f), + (4.1f), (4.2f), (4.3f), (4.4f) }); + set_values(weights, { (1.5f), (1.6f), (1.7f), (1.8f), + (2.5f), (2.6f), (2.7f), (2.8f), + (3.5f), (3.6f), (3.7f), (3.8f), + (4.5f), (4.6f), (4.7f), (4.8f) }); + + set_values(weights2, { (5.5f), (5.6f), (5.7f), (5.8f) }); + + topology.add(input_layout("input", input.get_layout())); + topology.add(data("weights2", weights2)); + + topology.add(cldnn::convolution("conv2a", { "input" }, { "weights2" })); + topology.add(cldnn::convolution("conv2b", { "input" }, { "weights2" })); + + std::vector<primitive_id> w_vec; + w_vec.push_back("weights"); + auto new_conv = std::make_shared<convolution>("conv1a", "input", w_vec); + auto
weights_node = std::make_shared<data>("weights", weights); + program_impl::ptr prog = api_cast(engine.get())->build_program(*api_cast(topology.get()), build_opt, false, true); + + prog->add_intermediate(new_conv, prog->get_node("conv2a"), 0, true, true); + program_impl_wrapper::add_connection(*prog, prog->get_or_create(weights_node), prog->get_or_create(new_conv)); + prog->dump_program("custom_dump", true); + + pass_manager pm; + compile_graph compile_graph_pass; + pm.run(*prog, compile_graph_pass); + + cldnn::refcounted_obj_ptr<network_impl> net = api_cast(engine.get())->allocate_network(*prog); + network network = api_cast(net.get()); + network.set_input_data("input", input); + auto outputs = network.execute(); + + std::vector<float> expected_output_vec = { + 514.22f, 532.7f, 1075.26f, 1113.9f + }; + + uint32_t output_size = 4; + for (auto& it : outputs) + { + auto output = it.second.get_memory().pointer<float>(); + for (uint32_t x = 0; x < output_size; x++) + { + EXPECT_FLOAT_EQ(expected_output_vec[x], output[x]); + } + } +} diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/allocators.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/allocators.h new file mode 100644 index 0000000..06b3420 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/allocators.h @@ -0,0 +1,284 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef RAPIDJSON_ALLOCATORS_H_ +#define RAPIDJSON_ALLOCATORS_H_ + +#include "rapidjson.h" + +RAPIDJSON_NAMESPACE_BEGIN + +/////////////////////////////////////////////////////////////////////////////// +// Allocator + +/*! \class rapidjson::Allocator + \brief Concept for allocating, resizing and freeing memory block. + + Note that Malloc() and Realloc() are non-static but Free() is static. + + So if an allocator needs to support Free(), it needs to put its pointer in + the header of memory block. + +\code +concept Allocator { + static const bool kNeedFree; //!< Whether this allocator needs to call Free(). + + // Allocate a memory block. + // \param size of the memory block in bytes. + // \returns pointer to the memory block. + void* Malloc(size_t size); + + // Resize a memory block. + // \param originalPtr The pointer to current memory block. Null pointer is permitted. + // \param originalSize The current size in bytes. (Design issue: since some allocator may not book-keep this, explicitly pass to it can save memory.) + // \param newSize the new size in bytes. + void* Realloc(void* originalPtr, size_t originalSize, size_t newSize); + + // Free a memory block. + // \param pointer to the memory block. Null pointer is permitted. + static void Free(void *ptr); +}; +\endcode +*/ + + +/*! \def RAPIDJSON_ALLOCATOR_DEFAULT_CHUNK_CAPACITY + \ingroup RAPIDJSON_CONFIG + \brief User-defined kDefaultChunkCapacity definition. + + User can define this as any \c size that is a power of 2.
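+ For example (a sketch; any power of 2 is valid), a project can override the
+ default before including any RapidJSON header:
+\code
+#define RAPIDJSON_ALLOCATOR_DEFAULT_CHUNK_CAPACITY (256 * 1024)
+#include "rapidjson/document.h"
+\endcode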
+*/ + +#ifndef RAPIDJSON_ALLOCATOR_DEFAULT_CHUNK_CAPACITY +#define RAPIDJSON_ALLOCATOR_DEFAULT_CHUNK_CAPACITY (64 * 1024) +#endif + + +/////////////////////////////////////////////////////////////////////////////// +// CrtAllocator + +//! C-runtime library allocator. +/*! This class is just wrapper for standard C library memory routines. + \note implements Allocator concept +*/ +class CrtAllocator { +public: + static const bool kNeedFree = true; + void* Malloc(size_t size) { + if (size) // behavior of malloc(0) is implementation defined. + return std::malloc(size); + else + return NULL; // standardize to returning NULL. + } + void* Realloc(void* originalPtr, size_t originalSize, size_t newSize) { + (void)originalSize; + if (newSize == 0) { + std::free(originalPtr); + return NULL; + } + return std::realloc(originalPtr, newSize); + } + static void Free(void *ptr) { std::free(ptr); } +}; + +/////////////////////////////////////////////////////////////////////////////// +// MemoryPoolAllocator + +//! Default memory allocator used by the parser and DOM. +/*! This allocator allocate memory blocks from pre-allocated memory chunks. + + It does not free memory blocks. And Realloc() only allocate new memory. + + The memory chunks are allocated by BaseAllocator, which is CrtAllocator by default. + + User may also supply a buffer as the first chunk. + + If the user-buffer is full then additional chunks are allocated by BaseAllocator. + + The user-buffer is not deallocated by this allocator. + + \tparam BaseAllocator the allocator type for allocating memory chunks. Default is CrtAllocator. + \note implements Allocator concept +*/ +template +class MemoryPoolAllocator { +public: + static const bool kNeedFree = false; //!< Tell users that no need to call Free() with this allocator. (concept Allocator) + + //! Constructor with chunkSize. + /*! \param chunkSize The size of memory chunk. The default is kDefaultChunkSize. + \param baseAllocator The allocator for allocating memory chunks. + */ + MemoryPoolAllocator(size_t chunkSize = kDefaultChunkCapacity, BaseAllocator* baseAllocator = 0) : + chunkHead_(0), chunk_capacity_(chunkSize), userBuffer_(0), baseAllocator_(baseAllocator), ownBaseAllocator_(0) + { + } + + //! Constructor with user-supplied buffer. + /*! The user buffer will be used firstly. When it is full, memory pool allocates new chunk with chunk size. + + The user buffer will not be deallocated when this allocator is destructed. + + \param buffer User supplied buffer. + \param size Size of the buffer in bytes. It must at least larger than sizeof(ChunkHeader). + \param chunkSize The size of memory chunk. The default is kDefaultChunkSize. + \param baseAllocator The allocator for allocating memory chunks. + */ + MemoryPoolAllocator(void *buffer, size_t size, size_t chunkSize = kDefaultChunkCapacity, BaseAllocator* baseAllocator = 0) : + chunkHead_(0), chunk_capacity_(chunkSize), userBuffer_(buffer), baseAllocator_(baseAllocator), ownBaseAllocator_(0) + { + RAPIDJSON_ASSERT(buffer != 0); + RAPIDJSON_ASSERT(size > sizeof(ChunkHeader)); + chunkHead_ = reinterpret_cast(buffer); + chunkHead_->capacity = size - sizeof(ChunkHeader); + chunkHead_->size = 0; + chunkHead_->next = 0; + } + + //! Destructor. + /*! This deallocates all memory chunks, excluding the user-supplied buffer. + */ + ~MemoryPoolAllocator() { + Clear(); + RAPIDJSON_DELETE(ownBaseAllocator_); + } + + //! Deallocates all memory chunks, excluding the user-supplied buffer. 
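+ /*! \note After Clear(), Size() returns 0 and memory previously returned by
+ Malloc() or Realloc() must no longer be used.
+ */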
+ void Clear() { + while (chunkHead_ && chunkHead_ != userBuffer_) { + ChunkHeader* next = chunkHead_->next; + baseAllocator_->Free(chunkHead_); + chunkHead_ = next; + } + if (chunkHead_ && chunkHead_ == userBuffer_) + chunkHead_->size = 0; // Clear user buffer + } + + //! Computes the total capacity of allocated memory chunks. + /*! \return total capacity in bytes. + */ + size_t Capacity() const { + size_t capacity = 0; + for (ChunkHeader* c = chunkHead_; c != 0; c = c->next) + capacity += c->capacity; + return capacity; + } + + //! Computes the memory blocks allocated. + /*! \return total used bytes. + */ + size_t Size() const { + size_t size = 0; + for (ChunkHeader* c = chunkHead_; c != 0; c = c->next) + size += c->size; + return size; + } + + //! Allocates a memory block. (concept Allocator) + void* Malloc(size_t size) { + if (!size) + return NULL; + + size = RAPIDJSON_ALIGN(size); + if (chunkHead_ == 0 || chunkHead_->size + size > chunkHead_->capacity) + if (!AddChunk(chunk_capacity_ > size ? chunk_capacity_ : size)) + return NULL; + + void *buffer = reinterpret_cast(chunkHead_) + RAPIDJSON_ALIGN(sizeof(ChunkHeader)) + chunkHead_->size; + chunkHead_->size += size; + return buffer; + } + + //! Resizes a memory block (concept Allocator) + void* Realloc(void* originalPtr, size_t originalSize, size_t newSize) { + if (originalPtr == 0) + return Malloc(newSize); + + if (newSize == 0) + return NULL; + + originalSize = RAPIDJSON_ALIGN(originalSize); + newSize = RAPIDJSON_ALIGN(newSize); + + // Do not shrink if new size is smaller than original + if (originalSize >= newSize) + return originalPtr; + + // Simply expand it if it is the last allocation and there is sufficient space + if (originalPtr == reinterpret_cast(chunkHead_) + RAPIDJSON_ALIGN(sizeof(ChunkHeader)) + chunkHead_->size - originalSize) { + size_t increment = static_cast(newSize - originalSize); + if (chunkHead_->size + increment <= chunkHead_->capacity) { + chunkHead_->size += increment; + return originalPtr; + } + } + + // Realloc process: allocate and copy memory, do not free original buffer. + if (void* newBuffer = Malloc(newSize)) { + if (originalSize) + std::memcpy(newBuffer, originalPtr, originalSize); + return newBuffer; + } + else + return NULL; + } + + //! Frees a memory block (concept Allocator) + static void Free(void *ptr) { (void)ptr; } // Do nothing + +private: + //! Copy constructor is not permitted. + MemoryPoolAllocator(const MemoryPoolAllocator& rhs) /* = delete */; + //! Copy assignment operator is not permitted. + MemoryPoolAllocator& operator=(const MemoryPoolAllocator& rhs) /* = delete */; + + //! Creates a new chunk. + /*! \param capacity Capacity of the chunk in bytes. + \return true if success. + */ + bool AddChunk(size_t capacity) { + if (!baseAllocator_) + ownBaseAllocator_ = baseAllocator_ = RAPIDJSON_NEW(BaseAllocator)(); + if (ChunkHeader* chunk = reinterpret_cast(baseAllocator_->Malloc(RAPIDJSON_ALIGN(sizeof(ChunkHeader)) + capacity))) { + chunk->capacity = capacity; + chunk->size = 0; + chunk->next = chunkHead_; + chunkHead_ = chunk; + return true; + } + else + return false; + } + + static const int kDefaultChunkCapacity = RAPIDJSON_ALLOCATOR_DEFAULT_CHUNK_CAPACITY; //!< Default chunk capacity. + + //! Chunk header for perpending to each chunk. + /*! Chunks are stored as a singly linked list. + */ + struct ChunkHeader { + size_t capacity; //!< Capacity of the chunk in bytes (excluding the header itself). + size_t size; //!< Current size of allocated memory in bytes. 
+ ChunkHeader *next; //!< Next chunk in the linked list. + }; + + ChunkHeader *chunkHead_; //!< Head of the chunk linked-list. Only the head chunk serves allocation. + size_t chunk_capacity_; //!< The minimum capacity of chunk when they are allocated. + void *userBuffer_; //!< User supplied buffer. + BaseAllocator* baseAllocator_; //!< base allocator for allocating memory chunks. + BaseAllocator* ownBaseAllocator_; //!< base allocator created by this object. +}; + +RAPIDJSON_NAMESPACE_END + +#endif // RAPIDJSON_ENCODINGS_H_ diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/cursorstreamwrapper.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/cursorstreamwrapper.h new file mode 100644 index 0000000..52c11a7 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/cursorstreamwrapper.h @@ -0,0 +1,78 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef RAPIDJSON_CURSORSTREAMWRAPPER_H_ +#define RAPIDJSON_CURSORSTREAMWRAPPER_H_ + +#include "stream.h" + +#if defined(__GNUC__) +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(effc++) +#endif + +#if defined(_MSC_VER) && _MSC_VER <= 1800 +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(4702) // unreachable code +RAPIDJSON_DIAG_OFF(4512) // assignment operator could not be generated +#endif + +RAPIDJSON_NAMESPACE_BEGIN + + +//! Cursor stream wrapper for counting line and column number if error exists. +/*! + \tparam InputStream Any stream that implements Stream Concept +*/ +template > +class CursorStreamWrapper : public GenericStreamWrapper { +public: + typedef typename Encoding::Ch Ch; + + CursorStreamWrapper(InputStream& is): + GenericStreamWrapper(is), line_(1), col_(0) {} + + // counting line and column number + Ch Take() { + Ch ch = this->is_.Take(); + if(ch == '\n') { + line_ ++; + col_ = 0; + } else { + col_ ++; + } + return ch; + } + + //! Get the error line number, if error exists. + size_t GetLine() const { return line_; } + //! Get the error column number, if error exists. + size_t GetColumn() const { return col_; } + +private: + size_t line_; //!< Current Line + size_t col_; //!< Current Column +}; + +#if defined(_MSC_VER) && _MSC_VER <= 1800 +RAPIDJSON_DIAG_POP +#endif + +#if defined(__GNUC__) +RAPIDJSON_DIAG_POP +#endif + +RAPIDJSON_NAMESPACE_END + +#endif // RAPIDJSON_CURSORSTREAMWRAPPER_H_ diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/document.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/document.h new file mode 100644 index 0000000..dfa499e --- /dev/null +++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/document.h @@ -0,0 +1,2643 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. 
+// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef RAPIDJSON_DOCUMENT_H_ +#define RAPIDJSON_DOCUMENT_H_ + +/*! \file document.h */ + +#include "reader.h" +#include "internal/meta.h" +#include "internal/strfunc.h" +#include "memorystream.h" +#include "encodedstream.h" +#include // placement new +#include + +RAPIDJSON_DIAG_PUSH +#ifdef __clang__ +RAPIDJSON_DIAG_OFF(padded) +RAPIDJSON_DIAG_OFF(switch-enum) +RAPIDJSON_DIAG_OFF(c++98-compat) +#elif defined(_MSC_VER) +RAPIDJSON_DIAG_OFF(4127) // conditional expression is constant +RAPIDJSON_DIAG_OFF(4244) // conversion from kXxxFlags to 'uint16_t', possible loss of data +#endif + +#ifdef __GNUC__ +RAPIDJSON_DIAG_OFF(effc++) +#endif // __GNUC__ + +#ifndef RAPIDJSON_NOMEMBERITERATORCLASS +#include // std::random_access_iterator_tag +#endif + +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS +#include // std::move +#endif + +RAPIDJSON_NAMESPACE_BEGIN + +// Forward declaration. +template +class GenericValue; + +template +class GenericDocument; + +//! Name-value pair in a JSON object value. +/*! + This class was internal to GenericValue. It used to be a inner struct. + But a compiler (IBM XL C/C++ for AIX) have reported to have problem with that so it moved as a namespace scope struct. + https://code.google.com/p/rapidjson/issues/detail?id=64 +*/ +template +struct GenericMember { + GenericValue name; //!< name of member (must be a string) + GenericValue value; //!< value of member. +}; + +/////////////////////////////////////////////////////////////////////////////// +// GenericMemberIterator + +#ifndef RAPIDJSON_NOMEMBERITERATORCLASS + +//! (Constant) member iterator for a JSON object value +/*! + \tparam Const Is this a constant iterator? + \tparam Encoding Encoding of the value. (Even non-string values need to have the same encoding in a document) + \tparam Allocator Allocator type for allocating memory of object, array and string. + + This class implements a Random Access Iterator for GenericMember elements + of a GenericValue, see ISO/IEC 14882:2003(E) C++ standard, 24.1 [lib.iterator.requirements]. + + \note This iterator implementation is mainly intended to avoid implicit + conversions from iterator values to \c NULL, + e.g. from GenericValue::FindMember. + + \note Define \c RAPIDJSON_NOMEMBERITERATORCLASS to fall back to a + pointer-based implementation, if your platform doesn't provide + the C++ header. + + \see GenericMember, GenericValue::MemberIterator, GenericValue::ConstMemberIterator + */ +template +class GenericMemberIterator { + + friend class GenericValue; + template friend class GenericMemberIterator; + + typedef GenericMember PlainType; + typedef typename internal::MaybeAddConst::Type ValueType; + +public: + //! Iterator type itself + typedef GenericMemberIterator Iterator; + //! Constant iterator type + typedef GenericMemberIterator ConstIterator; + //! 
Non-constant iterator type + typedef GenericMemberIterator NonConstIterator; + + /** \name std::iterator_traits support */ + //@{ + typedef ValueType value_type; + typedef ValueType * pointer; + typedef ValueType & reference; + typedef std::ptrdiff_t difference_type; + typedef std::random_access_iterator_tag iterator_category; + //@} + + //! Pointer to (const) GenericMember + typedef pointer Pointer; + //! Reference to (const) GenericMember + typedef reference Reference; + //! Signed integer type (e.g. \c ptrdiff_t) + typedef difference_type DifferenceType; + + //! Default constructor (singular value) + /*! Creates an iterator pointing to no element. + \note All operations, except for comparisons, are undefined on such values. + */ + GenericMemberIterator() : ptr_() {} + + //! Iterator conversions to more const + /*! + \param it (Non-const) iterator to copy from + + Allows the creation of an iterator from another GenericMemberIterator + that is "less const". Especially, creating a non-constant iterator + from a constant iterator are disabled: + \li const -> non-const (not ok) + \li const -> const (ok) + \li non-const -> const (ok) + \li non-const -> non-const (ok) + + \note If the \c Const template parameter is already \c false, this + constructor effectively defines a regular copy-constructor. + Otherwise, the copy constructor is implicitly defined. + */ + GenericMemberIterator(const NonConstIterator & it) : ptr_(it.ptr_) {} + Iterator& operator=(const NonConstIterator & it) { ptr_ = it.ptr_; return *this; } + + //! @name stepping + //@{ + Iterator& operator++(){ ++ptr_; return *this; } + Iterator& operator--(){ --ptr_; return *this; } + Iterator operator++(int){ Iterator old(*this); ++ptr_; return old; } + Iterator operator--(int){ Iterator old(*this); --ptr_; return old; } + //@} + + //! @name increment/decrement + //@{ + Iterator operator+(DifferenceType n) const { return Iterator(ptr_+n); } + Iterator operator-(DifferenceType n) const { return Iterator(ptr_-n); } + + Iterator& operator+=(DifferenceType n) { ptr_+=n; return *this; } + Iterator& operator-=(DifferenceType n) { ptr_-=n; return *this; } + //@} + + //! @name relations + //@{ + bool operator==(ConstIterator that) const { return ptr_ == that.ptr_; } + bool operator!=(ConstIterator that) const { return ptr_ != that.ptr_; } + bool operator<=(ConstIterator that) const { return ptr_ <= that.ptr_; } + bool operator>=(ConstIterator that) const { return ptr_ >= that.ptr_; } + bool operator< (ConstIterator that) const { return ptr_ < that.ptr_; } + bool operator> (ConstIterator that) const { return ptr_ > that.ptr_; } + //@} + + //! @name dereference + //@{ + Reference operator*() const { return *ptr_; } + Pointer operator->() const { return ptr_; } + Reference operator[](DifferenceType n) const { return ptr_[n]; } + //@} + + //! Distance + DifferenceType operator-(ConstIterator that) const { return ptr_-that.ptr_; } + +private: + //! Internal constructor from plain pointer + explicit GenericMemberIterator(Pointer p) : ptr_(p) {} + + Pointer ptr_; //!< raw pointer +}; + +#else // RAPIDJSON_NOMEMBERITERATORCLASS + +// class-based member iterator implementation disabled, use plain pointers + +template +struct GenericMemberIterator; + +//! non-const GenericMemberIterator +template +struct GenericMemberIterator { + //! use plain pointer as iterator type + typedef GenericMember* Iterator; +}; +//! const GenericMemberIterator +template +struct GenericMemberIterator { + //! 
use plain const pointer as iterator type + typedef const GenericMember* Iterator; +}; + +#endif // RAPIDJSON_NOMEMBERITERATORCLASS + +/////////////////////////////////////////////////////////////////////////////// +// GenericStringRef + +//! Reference to a constant string (not taking a copy) +/*! + \tparam CharType character type of the string + + This helper class is used to automatically infer constant string + references for string literals, especially from \c const \b (!) + character arrays. + + The main use is for creating JSON string values without copying the + source string via an \ref Allocator. This requires that the referenced + string pointers have a sufficient lifetime, which exceeds the lifetime + of the associated GenericValue. + + \b Example + \code + Value v("foo"); // ok, no need to copy & calculate length + const char foo[] = "foo"; + v.SetString(foo); // ok + + const char* bar = foo; + // Value x(bar); // not ok, can't rely on bar's lifetime + Value x(StringRef(bar)); // lifetime explicitly guaranteed by user + Value y(StringRef(bar, 3)); // ok, explicitly pass length + \endcode + + \see StringRef, GenericValue::SetString +*/ +template +struct GenericStringRef { + typedef CharType Ch; //!< character type of the string + + //! Create string reference from \c const character array +#ifndef __clang__ // -Wdocumentation + /*! + This constructor implicitly creates a constant string reference from + a \c const character array. It has better performance than + \ref StringRef(const CharType*) by inferring the string \ref length + from the array length, and also supports strings containing null + characters. + + \tparam N length of the string, automatically inferred + + \param str Constant character array, lifetime assumed to be longer + than the use of the string in e.g. a GenericValue + + \post \ref s == str + + \note Constant complexity. + \note There is a hidden, private overload to disallow references to + non-const character arrays to be created via this constructor. + By this, e.g. function-scope arrays used to be filled via + \c snprintf are excluded from consideration. + In such cases, the referenced string should be \b copied to the + GenericValue instead. + */ +#endif + template + GenericStringRef(const CharType (&str)[N]) RAPIDJSON_NOEXCEPT + : s(str), length(N-1) {} + + //! Explicitly create string reference from \c const character pointer +#ifndef __clang__ // -Wdocumentation + /*! + This constructor can be used to \b explicitly create a reference to + a constant string pointer. + + \see StringRef(const CharType*) + + \param str Constant character pointer, lifetime assumed to be longer + than the use of the string in e.g. a GenericValue + + \post \ref s == str + + \note There is a hidden, private overload to disallow references to + non-const character arrays to be created via this constructor. + By this, e.g. function-scope arrays used to be filled via + \c snprintf are excluded from consideration. + In such cases, the referenced string should be \b copied to the + GenericValue instead. + */ +#endif + explicit GenericStringRef(const CharType* str) + : s(str), length(NotNullStrLen(str)) {} + + //! Create constant string reference from pointer and length +#ifndef __clang__ // -Wdocumentation + /*! \param str constant string, lifetime assumed to be longer than the use of the string in e.g. a GenericValue + \param len length of the string, excluding the trailing NULL terminator + + \post \ref s == str && \ref length == len + \note Constant complexity. 
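+ \b Example
+ \code
+ Value v(StringRef("a\0b", 3)); // a string containing an embedded null character
+ \endcode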
+ */ +#endif + GenericStringRef(const CharType* str, SizeType len) + : s(RAPIDJSON_LIKELY(str) ? str : emptyString), length(len) { RAPIDJSON_ASSERT(str != 0 || len == 0u); } + + GenericStringRef(const GenericStringRef& rhs) : s(rhs.s), length(rhs.length) {} + + //! implicit conversion to plain CharType pointer + operator const Ch *() const { return s; } + + const Ch* const s; //!< plain CharType pointer + const SizeType length; //!< length of the string (excluding the trailing NULL terminator) + +private: + SizeType NotNullStrLen(const CharType* str) { + RAPIDJSON_ASSERT(str != 0); + return internal::StrLen(str); + } + + /// Empty string - used when passing in a NULL pointer + static const Ch emptyString[]; + + //! Disallow construction from non-const array + template + GenericStringRef(CharType (&str)[N]) /* = delete */; + //! Copy assignment operator not permitted - immutable type + GenericStringRef& operator=(const GenericStringRef& rhs) /* = delete */; +}; + +template +const CharType GenericStringRef::emptyString[] = { CharType() }; + +//! Mark a character pointer as constant string +/*! Mark a plain character pointer as a "string literal". This function + can be used to avoid copying a character string to be referenced as a + value in a JSON GenericValue object, if the string's lifetime is known + to be valid long enough. + \tparam CharType Character type of the string + \param str Constant string, lifetime assumed to be longer than the use of the string in e.g. a GenericValue + \return GenericStringRef string reference object + \relatesalso GenericStringRef + + \see GenericValue::GenericValue(StringRefType), GenericValue::operator=(StringRefType), GenericValue::SetString(StringRefType), GenericValue::PushBack(StringRefType, Allocator&), GenericValue::AddMember +*/ +template +inline GenericStringRef StringRef(const CharType* str) { + return GenericStringRef(str); +} + +//! Mark a character pointer as constant string +/*! Mark a plain character pointer as a "string literal". This function + can be used to avoid copying a character string to be referenced as a + value in a JSON GenericValue object, if the string's lifetime is known + to be valid long enough. + + This version has better performance with supplied length, and also + supports string containing null characters. + + \tparam CharType character type of the string + \param str Constant string, lifetime assumed to be longer than the use of the string in e.g. a GenericValue + \param length The length of source string. + \return GenericStringRef string reference object + \relatesalso GenericStringRef +*/ +template +inline GenericStringRef StringRef(const CharType* str, size_t length) { + return GenericStringRef(str, SizeType(length)); +} + +#if RAPIDJSON_HAS_STDSTRING +//! Mark a string object as constant string +/*! Mark a string object (e.g. \c std::string) as a "string literal". + This function can be used to avoid copying a string to be referenced as a + value in a JSON GenericValue object, if the string's lifetime is known + to be valid long enough. + + \tparam CharType character type of the string + \param str Constant string, lifetime assumed to be longer than the use of the string in e.g. a GenericValue + \return GenericStringRef string reference object + \relatesalso GenericStringRef + \note Requires the definition of the preprocessor symbol \ref RAPIDJSON_HAS_STDSTRING. 
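+ \b Example (assuming \ref RAPIDJSON_HAS_STDSTRING is defined and \c s outlives the value):
+ \code
+ std::string s = "foo";
+ Value v(StringRef(s)); // referenced, not copied
+ \endcode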
+*/
+template<typename CharType>
+inline GenericStringRef<CharType> StringRef(const std::basic_string<CharType>& str) {
+    return GenericStringRef<CharType>(str.data(), SizeType(str.size()));
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// GenericValue type traits
+namespace internal {
+
+template <typename T, typename Encoding = void, typename Allocator = void>
+struct IsGenericValueImpl : FalseType {};
+
+// select candidates according to nested encoding and allocator types
+template <typename T> struct IsGenericValueImpl<T, typename Void<typename T::EncodingType>::Type, typename Void<typename T::AllocatorType>::Type>
+    : IsBaseOf<GenericValue<typename T::EncodingType, typename T::AllocatorType>, T>::Type {};
+
+// helper to match arbitrary GenericValue instantiations, including derived classes
+template <typename T> struct IsGenericValue : IsGenericValueImpl<T>::Type {};
+
+} // namespace internal
+
+///////////////////////////////////////////////////////////////////////////////
+// TypeHelper
+
+namespace internal {
+
+template <typename ValueType, typename T>
+struct TypeHelper {};
+
+template<typename ValueType>
+struct TypeHelper<ValueType, bool> {
+    static bool Is(const ValueType& v) { return v.IsBool(); }
+    static bool Get(const ValueType& v) { return v.GetBool(); }
+    static ValueType& Set(ValueType& v, bool data) { return v.SetBool(data); }
+    static ValueType& Set(ValueType& v, bool data, typename ValueType::AllocatorType&) { return v.SetBool(data); }
+};
+
+template<typename ValueType>
+struct TypeHelper<ValueType, int> {
+    static bool Is(const ValueType& v) { return v.IsInt(); }
+    static int Get(const ValueType& v) { return v.GetInt(); }
+    static ValueType& Set(ValueType& v, int data) { return v.SetInt(data); }
+    static ValueType& Set(ValueType& v, int data, typename ValueType::AllocatorType&) { return v.SetInt(data); }
+};
+
+template<typename ValueType>
+struct TypeHelper<ValueType, unsigned> {
+    static bool Is(const ValueType& v) { return v.IsUint(); }
+    static unsigned Get(const ValueType& v) { return v.GetUint(); }
+    static ValueType& Set(ValueType& v, unsigned data) { return v.SetUint(data); }
+    static ValueType& Set(ValueType& v, unsigned data, typename ValueType::AllocatorType&) { return v.SetUint(data); }
+};
+
+#ifdef _MSC_VER
+RAPIDJSON_STATIC_ASSERT(sizeof(long) == sizeof(int));
+template<typename ValueType>
+struct TypeHelper<ValueType, long> {
+    static bool Is(const ValueType& v) { return v.IsInt(); }
+    static long Get(const ValueType& v) { return v.GetInt(); }
+    static ValueType& Set(ValueType& v, long data) { return v.SetInt(data); }
+    static ValueType& Set(ValueType& v, long data, typename ValueType::AllocatorType&) { return v.SetInt(data); }
+};
+
+RAPIDJSON_STATIC_ASSERT(sizeof(unsigned long) == sizeof(unsigned));
+template<typename ValueType>
+struct TypeHelper<ValueType, unsigned long> {
+    static bool Is(const ValueType& v) { return v.IsUint(); }
+    static unsigned long Get(const ValueType& v) { return v.GetUint(); }
+    static ValueType& Set(ValueType& v, unsigned long data) { return v.SetUint(data); }
+    static ValueType& Set(ValueType& v, unsigned long data, typename ValueType::AllocatorType&) { return v.SetUint(data); }
+};
+#endif
+
+template<typename ValueType>
+struct TypeHelper<ValueType, int64_t> {
+    static bool Is(const ValueType& v) { return v.IsInt64(); }
+    static int64_t Get(const ValueType& v) { return v.GetInt64(); }
+    static ValueType& Set(ValueType& v, int64_t data) { return v.SetInt64(data); }
+    static ValueType& Set(ValueType& v, int64_t data, typename ValueType::AllocatorType&) { return v.SetInt64(data); }
+};
+
+template<typename ValueType>
+struct TypeHelper<ValueType, uint64_t> {
+    static bool Is(const ValueType& v) { return v.IsUint64(); }
+    static uint64_t Get(const ValueType& v) { return v.GetUint64(); }
+    static ValueType& Set(ValueType& v, uint64_t data) { return v.SetUint64(data); }
+    static ValueType& Set(ValueType& v, uint64_t data, typename ValueType::AllocatorType&) { return v.SetUint64(data); }
+};
+
+template<typename ValueType>
+struct TypeHelper<ValueType, double> {
+    static bool Is(const ValueType& v) { return v.IsDouble(); }
+    static double Get(const ValueType& v) { return v.GetDouble(); }
+    static ValueType& Set(ValueType& v, double data) { return v.SetDouble(data); }
+    static ValueType& Set(ValueType& v, double data, typename ValueType::AllocatorType&) { return v.SetDouble(data); }
+};
+
+template<typename ValueType>
+struct TypeHelper<ValueType, float> {
+    static bool Is(const ValueType& v) { return v.IsFloat(); }
+    static float Get(const ValueType& v) { return v.GetFloat(); }
+    static ValueType& Set(ValueType& v, float data) { return v.SetFloat(data); }
+    static ValueType& Set(ValueType& v, float data, typename ValueType::AllocatorType&) { return v.SetFloat(data); }
+};
+
+template<typename ValueType>
+struct TypeHelper<ValueType, const typename ValueType::Ch*> {
+    typedef const typename ValueType::Ch* StringType;
+    static bool Is(const ValueType& v) { return v.IsString(); }
+    static StringType Get(const ValueType& v) { return v.GetString(); }
+    static ValueType& Set(ValueType& v, const StringType data) { return v.SetString(typename ValueType::StringRefType(data)); }
+    static ValueType& Set(ValueType& v, const StringType data, typename ValueType::AllocatorType& a) { return v.SetString(data, a); }
+};
+
+#if RAPIDJSON_HAS_STDSTRING
+template<typename ValueType>
+struct TypeHelper<ValueType, std::basic_string<typename ValueType::Ch> > {
+    typedef std::basic_string<typename ValueType::Ch> StringType;
+    static bool Is(const ValueType& v) { return v.IsString(); }
+    static StringType Get(const ValueType& v) { return StringType(v.GetString(), v.GetStringLength()); }
+    static ValueType& Set(ValueType& v, const StringType& data, typename ValueType::AllocatorType& a) { return v.SetString(data, a); }
+};
+#endif
+
+template<typename ValueType>
+struct TypeHelper<ValueType, typename ValueType::Array> {
+    typedef typename ValueType::Array ArrayType;
+    static bool Is(const ValueType& v) { return v.IsArray(); }
+    static ArrayType Get(ValueType& v) { return v.GetArray(); }
+    static ValueType& Set(ValueType& v, ArrayType data) { return v = data; }
+    static ValueType& Set(ValueType& v, ArrayType data, typename ValueType::AllocatorType&) { return v = data; }
+};
+
+template<typename ValueType>
+struct TypeHelper<ValueType, typename ValueType::ConstArray> {
+    typedef typename ValueType::ConstArray ArrayType;
+    static bool Is(const ValueType& v) { return v.IsArray(); }
+    static ArrayType Get(const ValueType& v) { return v.GetArray(); }
+};
+
+template<typename ValueType>
+struct TypeHelper<ValueType, typename ValueType::Object> {
+    typedef typename ValueType::Object ObjectType;
+    static bool Is(const ValueType& v) { return v.IsObject(); }
+    static ObjectType Get(ValueType& v) { return v.GetObject(); }
+    static ValueType& Set(ValueType& v, ObjectType data) { return v = data; }
+    static ValueType& Set(ValueType& v, ObjectType data, typename ValueType::AllocatorType&) { return v = data; }
+};
+
+template<typename ValueType>
+struct TypeHelper<ValueType, typename ValueType::ConstObject> {
+    typedef typename ValueType::ConstObject ObjectType;
+    static bool Is(const ValueType& v) { return v.IsObject(); }
+    static ObjectType Get(const ValueType& v) { return v.GetObject(); }
+};
+
+} // namespace internal
+
+// Forward declarations
+template <bool, typename> class GenericArray;
+template <bool, typename> class GenericObject;
+
+///////////////////////////////////////////////////////////////////////////////
+// GenericValue
+
+//! Represents a JSON value. Use Value for UTF8 encoding and default allocator.
+/*!
+    A JSON value can be one of 7 types. This class is a variant type supporting
+    these types.
+
+    Use \ref Value if you use UTF8 encoding and the default allocator.
+
+    \tparam Encoding    Encoding of the value. (Even non-string values need to have the same encoding in a document)
+    \tparam Allocator   Allocator type for allocating memory of object, array and string.
+*/
+template <typename Encoding, typename Allocator = MemoryPoolAllocator<> >
+class GenericValue {
+public:
+    //! Name-value pair in an object.
+ typedef GenericMember Member; + typedef Encoding EncodingType; //!< Encoding type from template parameter. + typedef Allocator AllocatorType; //!< Allocator type from template parameter. + typedef typename Encoding::Ch Ch; //!< Character type derived from Encoding. + typedef GenericStringRef StringRefType; //!< Reference to a constant string + typedef typename GenericMemberIterator::Iterator MemberIterator; //!< Member iterator for iterating in object. + typedef typename GenericMemberIterator::Iterator ConstMemberIterator; //!< Constant member iterator for iterating in object. + typedef GenericValue* ValueIterator; //!< Value iterator for iterating in array. + typedef const GenericValue* ConstValueIterator; //!< Constant value iterator for iterating in array. + typedef GenericValue ValueType; //!< Value type of itself. + typedef GenericArray Array; + typedef GenericArray ConstArray; + typedef GenericObject Object; + typedef GenericObject ConstObject; + + //!@name Constructors and destructor. + //@{ + + //! Default constructor creates a null value. + GenericValue() RAPIDJSON_NOEXCEPT : data_() { data_.f.flags = kNullFlag; } + +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + //! Move constructor in C++11 + GenericValue(GenericValue&& rhs) RAPIDJSON_NOEXCEPT : data_(rhs.data_) { + rhs.data_.f.flags = kNullFlag; // give up contents + } +#endif + +private: + //! Copy constructor is not permitted. + GenericValue(const GenericValue& rhs); + +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + //! Moving from a GenericDocument is not permitted. + template + GenericValue(GenericDocument&& rhs); + + //! Move assignment from a GenericDocument is not permitted. + template + GenericValue& operator=(GenericDocument&& rhs); +#endif + +public: + + //! Constructor with JSON value type. + /*! This creates a Value of specified type with default content. + \param type Type of the value. + \note Default content for number is zero. + */ + explicit GenericValue(Type type) RAPIDJSON_NOEXCEPT : data_() { + static const uint16_t defaultFlags[] = { + kNullFlag, kFalseFlag, kTrueFlag, kObjectFlag, kArrayFlag, kShortStringFlag, + kNumberAnyFlag + }; + RAPIDJSON_NOEXCEPT_ASSERT(type >= kNullType && type <= kNumberType); + data_.f.flags = defaultFlags[type]; + + // Use ShortString to store empty string. + if (type == kStringType) + data_.ss.SetLength(0); + } + + //! Explicit copy constructor (with allocator) + /*! Creates a copy of a Value by using the given Allocator + \tparam SourceAllocator allocator of \c rhs + \param rhs Value to copy from (read-only) + \param allocator Allocator for allocating copied elements and buffers. Commonly use GenericDocument::GetAllocator(). + \param copyConstStrings Force copying of constant strings (e.g. 
referencing an in-situ buffer) + \see CopyFrom() + */ + template + GenericValue(const GenericValue& rhs, Allocator& allocator, bool copyConstStrings = false) { + switch (rhs.GetType()) { + case kObjectType: { + SizeType count = rhs.data_.o.size; + Member* lm = reinterpret_cast(allocator.Malloc(count * sizeof(Member))); + const typename GenericValue::Member* rm = rhs.GetMembersPointer(); + for (SizeType i = 0; i < count; i++) { + new (&lm[i].name) GenericValue(rm[i].name, allocator, copyConstStrings); + new (&lm[i].value) GenericValue(rm[i].value, allocator, copyConstStrings); + } + data_.f.flags = kObjectFlag; + data_.o.size = data_.o.capacity = count; + SetMembersPointer(lm); + } + break; + case kArrayType: { + SizeType count = rhs.data_.a.size; + GenericValue* le = reinterpret_cast(allocator.Malloc(count * sizeof(GenericValue))); + const GenericValue* re = rhs.GetElementsPointer(); + for (SizeType i = 0; i < count; i++) + new (&le[i]) GenericValue(re[i], allocator, copyConstStrings); + data_.f.flags = kArrayFlag; + data_.a.size = data_.a.capacity = count; + SetElementsPointer(le); + } + break; + case kStringType: + if (rhs.data_.f.flags == kConstStringFlag && !copyConstStrings) { + data_.f.flags = rhs.data_.f.flags; + data_ = *reinterpret_cast(&rhs.data_); + } + else + SetStringRaw(StringRef(rhs.GetString(), rhs.GetStringLength()), allocator); + break; + default: + data_.f.flags = rhs.data_.f.flags; + data_ = *reinterpret_cast(&rhs.data_); + break; + } + } + + //! Constructor for boolean value. + /*! \param b Boolean value + \note This constructor is limited to \em real boolean values and rejects + implicitly converted types like arbitrary pointers. Use an explicit cast + to \c bool, if you want to construct a boolean JSON value in such cases. + */ +#ifndef RAPIDJSON_DOXYGEN_RUNNING // hide SFINAE from Doxygen + template + explicit GenericValue(T b, RAPIDJSON_ENABLEIF((internal::IsSame))) RAPIDJSON_NOEXCEPT // See #472 +#else + explicit GenericValue(bool b) RAPIDJSON_NOEXCEPT +#endif + : data_() { + // safe-guard against failing SFINAE + RAPIDJSON_STATIC_ASSERT((internal::IsSame::Value)); + data_.f.flags = b ? kTrueFlag : kFalseFlag; + } + + //! Constructor for int value. + explicit GenericValue(int i) RAPIDJSON_NOEXCEPT : data_() { + data_.n.i64 = i; + data_.f.flags = (i >= 0) ? (kNumberIntFlag | kUintFlag | kUint64Flag) : kNumberIntFlag; + } + + //! Constructor for unsigned value. + explicit GenericValue(unsigned u) RAPIDJSON_NOEXCEPT : data_() { + data_.n.u64 = u; + data_.f.flags = (u & 0x80000000) ? kNumberUintFlag : (kNumberUintFlag | kIntFlag | kInt64Flag); + } + + //! Constructor for int64_t value. + explicit GenericValue(int64_t i64) RAPIDJSON_NOEXCEPT : data_() { + data_.n.i64 = i64; + data_.f.flags = kNumberInt64Flag; + if (i64 >= 0) { + data_.f.flags |= kNumberUint64Flag; + if (!(static_cast(i64) & RAPIDJSON_UINT64_C2(0xFFFFFFFF, 0x00000000))) + data_.f.flags |= kUintFlag; + if (!(static_cast(i64) & RAPIDJSON_UINT64_C2(0xFFFFFFFF, 0x80000000))) + data_.f.flags |= kIntFlag; + } + else if (i64 >= static_cast(RAPIDJSON_UINT64_C2(0xFFFFFFFF, 0x80000000))) + data_.f.flags |= kIntFlag; + } + + //! Constructor for uint64_t value. 
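+    /*! \note The narrower kInt64Flag/kUintFlag/kIntFlag flags are additionally
+        set below whenever the value also fits the corresponding narrower type,
+        so e.g. a value constructed from uint64_t(42) is also retrievable via
+        GetInt(), GetUint() and GetInt64().
+    */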
+    explicit GenericValue(uint64_t u64) RAPIDJSON_NOEXCEPT : data_() {
+        data_.n.u64 = u64;
+        data_.f.flags = kNumberUint64Flag;
+        if (!(u64 & RAPIDJSON_UINT64_C2(0x80000000, 0x00000000)))
+            data_.f.flags |= kInt64Flag;
+        if (!(u64 & RAPIDJSON_UINT64_C2(0xFFFFFFFF, 0x00000000)))
+            data_.f.flags |= kUintFlag;
+        if (!(u64 & RAPIDJSON_UINT64_C2(0xFFFFFFFF, 0x80000000)))
+            data_.f.flags |= kIntFlag;
+    }
+
+    //! Constructor for double value.
+    explicit GenericValue(double d) RAPIDJSON_NOEXCEPT : data_() { data_.n.d = d; data_.f.flags = kNumberDoubleFlag; }
+
+    //! Constructor for float value.
+    explicit GenericValue(float f) RAPIDJSON_NOEXCEPT : data_() { data_.n.d = static_cast<double>(f); data_.f.flags = kNumberDoubleFlag; }
+
+    //! Constructor for constant string (i.e. do not make a copy of string)
+    GenericValue(const Ch* s, SizeType length) RAPIDJSON_NOEXCEPT : data_() { SetStringRaw(StringRef(s, length)); }
+
+    //! Constructor for constant string (i.e. do not make a copy of string)
+    explicit GenericValue(StringRefType s) RAPIDJSON_NOEXCEPT : data_() { SetStringRaw(s); }
+
+    //! Constructor for copy-string (i.e. do make a copy of string)
+    GenericValue(const Ch* s, SizeType length, Allocator& allocator) : data_() { SetStringRaw(StringRef(s, length), allocator); }
+
+    //! Constructor for copy-string (i.e. do make a copy of string)
+    GenericValue(const Ch* s, Allocator& allocator) : data_() { SetStringRaw(StringRef(s), allocator); }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Constructor for copy-string from a string object (i.e. do make a copy of string)
+    /*! \note Requires the definition of the preprocessor symbol \ref RAPIDJSON_HAS_STDSTRING.
+    */
+    GenericValue(const std::basic_string<Ch>& s, Allocator& allocator) : data_() { SetStringRaw(StringRef(s), allocator); }
+#endif
+
+    //! Constructor for Array.
+    /*!
+        \param a An array obtained by \c GetArray().
+        \note \c Array is always pass-by-value.
+        \note The source array is moved into this value and the source array becomes empty.
+    */
+    GenericValue(Array a) RAPIDJSON_NOEXCEPT : data_(a.value_.data_) {
+        a.value_.data_ = Data();
+        a.value_.data_.f.flags = kArrayFlag;
+    }
+
+    //! Constructor for Object.
+    /*!
+        \param o An object obtained by \c GetObject().
+        \note \c Object is always pass-by-value.
+        \note The source object is moved into this value and the source object becomes empty.
+    */
+    GenericValue(Object o) RAPIDJSON_NOEXCEPT : data_(o.value_.data_) {
+        o.value_.data_ = Data();
+        o.value_.data_.f.flags = kObjectFlag;
+    }
+
+    //! Destructor.
+    /*! Need to destruct elements of array, members of object, or copy-string.
+    */
+    ~GenericValue() {
+        if (Allocator::kNeedFree) { // Shortcut by Allocator's trait
+            switch(data_.f.flags) {
+            case kArrayFlag:
+                {
+                    GenericValue* e = GetElementsPointer();
+                    for (GenericValue* v = e; v != e + data_.a.size; ++v)
+                        v->~GenericValue();
+                    Allocator::Free(e);
+                }
+                break;
+
+            case kObjectFlag:
+                for (MemberIterator m = MemberBegin(); m != MemberEnd(); ++m)
+                    m->~Member();
+                Allocator::Free(GetMembersPointer());
+                break;
+
+            case kCopyStringFlag:
+                Allocator::Free(const_cast<Ch*>(GetStringPointer()));
+                break;
+
+            default:
+                break; // Do nothing for other types.
+            }
+        }
+    }
+
+    //@}
+
+    //!@name Assignment operators
+    //@{
+
+    //! Assignment with move semantics.
+    /*! \param rhs Source of the assignment. It will become a null value after assignment.
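+        \b Example (a minimal sketch of the destructive-move semantics):
+        \code
+        Value a(123), b;
+        b = a; // b now holds 123; a has become Null
+        \endcode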
+ */ + GenericValue& operator=(GenericValue& rhs) RAPIDJSON_NOEXCEPT { + if (RAPIDJSON_LIKELY(this != &rhs)) { + this->~GenericValue(); + RawAssign(rhs); + } + return *this; + } + +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + //! Move assignment in C++11 + GenericValue& operator=(GenericValue&& rhs) RAPIDJSON_NOEXCEPT { + return *this = rhs.Move(); + } +#endif + + //! Assignment of constant string reference (no copy) + /*! \param str Constant string reference to be assigned + \note This overload is needed to avoid clashes with the generic primitive type assignment overload below. + \see GenericStringRef, operator=(T) + */ + GenericValue& operator=(StringRefType str) RAPIDJSON_NOEXCEPT { + GenericValue s(str); + return *this = s; + } + + //! Assignment with primitive types. + /*! \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t + \param value The value to be assigned. + + \note The source type \c T explicitly disallows all pointer types, + especially (\c const) \ref Ch*. This helps avoiding implicitly + referencing character strings with insufficient lifetime, use + \ref SetString(const Ch*, Allocator&) (for copying) or + \ref StringRef() (to explicitly mark the pointer as constant) instead. + All other pointer types would implicitly convert to \c bool, + use \ref SetBool() instead. + */ + template + RAPIDJSON_DISABLEIF_RETURN((internal::IsPointer), (GenericValue&)) + operator=(T value) { + GenericValue v(value); + return *this = v; + } + + //! Deep-copy assignment from Value + /*! Assigns a \b copy of the Value to the current Value object + \tparam SourceAllocator Allocator type of \c rhs + \param rhs Value to copy from (read-only) + \param allocator Allocator to use for copying + \param copyConstStrings Force copying of constant strings (e.g. referencing an in-situ buffer) + */ + template + GenericValue& CopyFrom(const GenericValue& rhs, Allocator& allocator, bool copyConstStrings = false) { + RAPIDJSON_ASSERT(static_cast(this) != static_cast(&rhs)); + this->~GenericValue(); + new (this) GenericValue(rhs, allocator, copyConstStrings); + return *this; + } + + //! Exchange the contents of this value with those of other. + /*! + \param other Another value. + \note Constant complexity. + */ + GenericValue& Swap(GenericValue& other) RAPIDJSON_NOEXCEPT { + GenericValue temp; + temp.RawAssign(*this); + RawAssign(other); + other.RawAssign(temp); + return *this; + } + + //! free-standing swap function helper + /*! + Helper function to enable support for common swap implementation pattern based on \c std::swap: + \code + void swap(MyClass& a, MyClass& b) { + using std::swap; + swap(a.value, b.value); + // ... + } + \endcode + \see Swap() + */ + friend inline void swap(GenericValue& a, GenericValue& b) RAPIDJSON_NOEXCEPT { a.Swap(b); } + + //! Prepare Value for move semantics + /*! \return *this */ + GenericValue& Move() RAPIDJSON_NOEXCEPT { return *this; } + //@} + + //!@name Equal-to and not-equal-to operators + //@{ + //! Equal-to operator + /*! + \note If an object contains duplicated named member, comparing equality with any object is always \c false. + \note Linear time complexity (number of all values in the subtree and total lengths of all strings). 
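+        \b Example (sketch; numbers compare by value across int/double
+            representations):
+        \code
+        Value a(1), b(1.0);
+        bool eq = (a == b); // true: both are kNumberType and 1 == 1.0
+        \endcode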
+ */ + template + bool operator==(const GenericValue& rhs) const { + typedef GenericValue RhsType; + if (GetType() != rhs.GetType()) + return false; + + switch (GetType()) { + case kObjectType: // Warning: O(n^2) inner-loop + if (data_.o.size != rhs.data_.o.size) + return false; + for (ConstMemberIterator lhsMemberItr = MemberBegin(); lhsMemberItr != MemberEnd(); ++lhsMemberItr) { + typename RhsType::ConstMemberIterator rhsMemberItr = rhs.FindMember(lhsMemberItr->name); + if (rhsMemberItr == rhs.MemberEnd() || lhsMemberItr->value != rhsMemberItr->value) + return false; + } + return true; + + case kArrayType: + if (data_.a.size != rhs.data_.a.size) + return false; + for (SizeType i = 0; i < data_.a.size; i++) + if ((*this)[i] != rhs[i]) + return false; + return true; + + case kStringType: + return StringEqual(rhs); + + case kNumberType: + if (IsDouble() || rhs.IsDouble()) { + double a = GetDouble(); // May convert from integer to double. + double b = rhs.GetDouble(); // Ditto + return a >= b && a <= b; // Prevent -Wfloat-equal + } + else + return data_.n.u64 == rhs.data_.n.u64; + + default: + return true; + } + } + + //! Equal-to operator with const C-string pointer + bool operator==(const Ch* rhs) const { return *this == GenericValue(StringRef(rhs)); } + +#if RAPIDJSON_HAS_STDSTRING + //! Equal-to operator with string object + /*! \note Requires the definition of the preprocessor symbol \ref RAPIDJSON_HAS_STDSTRING. + */ + bool operator==(const std::basic_string& rhs) const { return *this == GenericValue(StringRef(rhs)); } +#endif + + //! Equal-to operator with primitive types + /*! \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t, \c double, \c true, \c false + */ + template RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr,internal::IsGenericValue >), (bool)) operator==(const T& rhs) const { return *this == GenericValue(rhs); } + + //! Not-equal-to operator + /*! \return !(*this == rhs) + */ + template + bool operator!=(const GenericValue& rhs) const { return !(*this == rhs); } + + //! Not-equal-to operator with const C-string pointer + bool operator!=(const Ch* rhs) const { return !(*this == rhs); } + + //! Not-equal-to operator with arbitrary types + /*! \return !(*this == rhs) + */ + template RAPIDJSON_DISABLEIF_RETURN((internal::IsGenericValue), (bool)) operator!=(const T& rhs) const { return !(*this == rhs); } + + //! Equal-to operator with arbitrary types (symmetric version) + /*! \return (rhs == lhs) + */ + template friend RAPIDJSON_DISABLEIF_RETURN((internal::IsGenericValue), (bool)) operator==(const T& lhs, const GenericValue& rhs) { return rhs == lhs; } + + //! Not-Equal-to operator with arbitrary types (symmetric version) + /*! 
\return !(rhs == lhs) + */ + template friend RAPIDJSON_DISABLEIF_RETURN((internal::IsGenericValue), (bool)) operator!=(const T& lhs, const GenericValue& rhs) { return !(rhs == lhs); } + //@} + + //!@name Type + //@{ + + Type GetType() const { return static_cast(data_.f.flags & kTypeMask); } + bool IsNull() const { return data_.f.flags == kNullFlag; } + bool IsFalse() const { return data_.f.flags == kFalseFlag; } + bool IsTrue() const { return data_.f.flags == kTrueFlag; } + bool IsBool() const { return (data_.f.flags & kBoolFlag) != 0; } + bool IsObject() const { return data_.f.flags == kObjectFlag; } + bool IsArray() const { return data_.f.flags == kArrayFlag; } + bool IsNumber() const { return (data_.f.flags & kNumberFlag) != 0; } + bool IsInt() const { return (data_.f.flags & kIntFlag) != 0; } + bool IsUint() const { return (data_.f.flags & kUintFlag) != 0; } + bool IsInt64() const { return (data_.f.flags & kInt64Flag) != 0; } + bool IsUint64() const { return (data_.f.flags & kUint64Flag) != 0; } + bool IsDouble() const { return (data_.f.flags & kDoubleFlag) != 0; } + bool IsString() const { return (data_.f.flags & kStringFlag) != 0; } + + // Checks whether a number can be losslessly converted to a double. + bool IsLosslessDouble() const { + if (!IsNumber()) return false; + if (IsUint64()) { + uint64_t u = GetUint64(); + volatile double d = static_cast(u); + return (d >= 0.0) + && (d < static_cast((std::numeric_limits::max)())) + && (u == static_cast(d)); + } + if (IsInt64()) { + int64_t i = GetInt64(); + volatile double d = static_cast(i); + return (d >= static_cast((std::numeric_limits::min)())) + && (d < static_cast((std::numeric_limits::max)())) + && (i == static_cast(d)); + } + return true; // double, int, uint are always lossless + } + + // Checks whether a number is a float (possible lossy). + bool IsFloat() const { + if ((data_.f.flags & kDoubleFlag) == 0) + return false; + double d = GetDouble(); + return d >= -3.4028234e38 && d <= 3.4028234e38; + } + // Checks whether a number can be losslessly converted to a float. + bool IsLosslessFloat() const { + if (!IsNumber()) return false; + double a = GetDouble(); + if (a < static_cast(-(std::numeric_limits::max)()) + || a > static_cast((std::numeric_limits::max)())) + return false; + double b = static_cast(static_cast(a)); + return a >= b && a <= b; // Prevent -Wfloat-equal + } + + //@} + + //!@name Null + //@{ + + GenericValue& SetNull() { this->~GenericValue(); new (this) GenericValue(); return *this; } + + //@} + + //!@name Bool + //@{ + + bool GetBool() const { RAPIDJSON_ASSERT(IsBool()); return data_.f.flags == kTrueFlag; } + //!< Set boolean value + /*! \post IsBool() == true */ + GenericValue& SetBool(bool b) { this->~GenericValue(); new (this) GenericValue(b); return *this; } + + //@} + + //!@name Object + //@{ + + //! Set this value as an empty object. + /*! \post IsObject() == true */ + GenericValue& SetObject() { this->~GenericValue(); new (this) GenericValue(kObjectType); return *this; } + + //! Get the number of members in the object. + SizeType MemberCount() const { RAPIDJSON_ASSERT(IsObject()); return data_.o.size; } + + //! Get the capacity of object. + SizeType MemberCapacity() const { RAPIDJSON_ASSERT(IsObject()); return data_.o.capacity; } + + //! Check whether the object is empty. + bool ObjectEmpty() const { RAPIDJSON_ASSERT(IsObject()); return data_.o.size == 0; } + + //! Get a value from an object associated with the name. + /*! 
+        \pre IsObject() == true
+        \tparam T Either \c Ch or \c const \c Ch (template used for disambiguation with \ref operator[](SizeType))
+        \note In version 0.1x, if the member was not found, this function returned a null value; this was the source of issue 7.
+            Since 0.2, it asserts if the member is not found.
+            If you are unsure whether a member exists, use HasMember() first.
+            A better approach is to use FindMember().
+        \note Linear time complexity.
+    */
+    template <typename T>
+    RAPIDJSON_DISABLEIF_RETURN((internal::NotExpr<internal::IsSame<typename internal::RemoveConst<T>::Type, Ch> >),(GenericValue&)) operator[](T* name) {
+        GenericValue n(StringRef(name));
+        return (*this)[n];
+    }
+    template <typename T>
+    RAPIDJSON_DISABLEIF_RETURN((internal::NotExpr<internal::IsSame<typename internal::RemoveConst<T>::Type, Ch> >),(const GenericValue&)) operator[](T* name) const { return const_cast<GenericValue&>(*this)[name]; }
+
+    //! Get a value from an object associated with the name.
+    /*! \pre IsObject() == true
+        \tparam SourceAllocator Allocator of the \c name value
+
+        \note Compared to \ref operator[](T*), this version is faster because it does not need a StrLen().
+            And it can also handle strings with embedded null characters.
+
+        \note Linear time complexity.
+    */
+    template <typename SourceAllocator>
+    GenericValue& operator[](const GenericValue<Encoding, SourceAllocator>& name) {
+        MemberIterator member = FindMember(name);
+        if (member != MemberEnd())
+            return member->value;
+        else {
+            RAPIDJSON_ASSERT(false); // see above note
+
+            // This will generate -Wexit-time-destructors in clang
+            // static GenericValue NullValue;
+            // return NullValue;
+
+            // Use static buffer and placement-new to prevent destruction
+            static char buffer[sizeof(GenericValue)];
+            return *new (buffer) GenericValue();
+        }
+    }
+    template <typename SourceAllocator>
+    const GenericValue& operator[](const GenericValue<Encoding, SourceAllocator>& name) const { return const_cast<GenericValue&>(*this)[name]; }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Get a value from an object associated with name (string object).
+    GenericValue& operator[](const std::basic_string<Ch>& name) { return (*this)[GenericValue(StringRef(name))]; }
+    const GenericValue& operator[](const std::basic_string<Ch>& name) const { return (*this)[GenericValue(StringRef(name))]; }
+#endif
+
+    //! Const member iterator
+    /*! \pre IsObject() == true */
+    ConstMemberIterator MemberBegin() const { RAPIDJSON_ASSERT(IsObject()); return ConstMemberIterator(GetMembersPointer()); }
+    //! Const \em past-the-end member iterator
+    /*! \pre IsObject() == true */
+    ConstMemberIterator MemberEnd() const { RAPIDJSON_ASSERT(IsObject()); return ConstMemberIterator(GetMembersPointer() + data_.o.size); }
+    //! Member iterator
+    /*! \pre IsObject() == true */
+    MemberIterator MemberBegin() { RAPIDJSON_ASSERT(IsObject()); return MemberIterator(GetMembersPointer()); }
+    //! \em Past-the-end member iterator
+    /*! \pre IsObject() == true */
+    MemberIterator MemberEnd() { RAPIDJSON_ASSERT(IsObject()); return MemberIterator(GetMembersPointer() + data_.o.size); }
+
+    //! Request the object to have enough capacity to store members.
+    /*! \param newCapacity The capacity that the object at least needs to have.
+        \param allocator Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator().
+        \return The value itself for fluent API.
+        \note Linear time complexity.
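+        \b Example (a minimal sketch using a Document's allocator):
+        \code
+        Document d;
+        d.SetObject();
+        d.MemberReserve(3, d.GetAllocator()); // no reallocation for the next 3 AddMember() calls
+        \endcode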
+    */
+    GenericValue& MemberReserve(SizeType newCapacity, Allocator &allocator) {
+        RAPIDJSON_ASSERT(IsObject());
+        if (newCapacity > data_.o.capacity) {
+            SetMembersPointer(reinterpret_cast<Member*>(allocator.Realloc(GetMembersPointer(), data_.o.capacity * sizeof(Member), newCapacity * sizeof(Member))));
+            data_.o.capacity = newCapacity;
+        }
+        return *this;
+    }
+
+    //! Check whether a member exists in the object.
+    /*!
+        \param name Member name to be searched.
+        \pre IsObject() == true
+        \return Whether a member with that name exists.
+        \note It is better to use FindMember() directly if you need to obtain the value as well.
+        \note Linear time complexity.
+    */
+    bool HasMember(const Ch* name) const { return FindMember(name) != MemberEnd(); }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Check whether a member exists in the object with string object.
+    /*!
+        \param name Member name to be searched.
+        \pre IsObject() == true
+        \return Whether a member with that name exists.
+        \note It is better to use FindMember() directly if you need to obtain the value as well.
+        \note Linear time complexity.
+    */
+    bool HasMember(const std::basic_string<Ch>& name) const { return FindMember(name) != MemberEnd(); }
+#endif
+
+    //! Check whether a member exists in the object with GenericValue name.
+    /*!
+        This version is faster because it does not need a StrLen(). It can also handle strings with null characters.
+        \param name Member name to be searched.
+        \pre IsObject() == true
+        \return Whether a member with that name exists.
+        \note It is better to use FindMember() directly if you need to obtain the value as well.
+        \note Linear time complexity.
+    */
+    template <typename SourceAllocator>
+    bool HasMember(const GenericValue<Encoding, SourceAllocator>& name) const { return FindMember(name) != MemberEnd(); }
+
+    //! Find member by name.
+    /*!
+        \param name Member name to be searched.
+        \pre IsObject() == true
+        \return Iterator to member, if it exists.
+            Otherwise returns \ref MemberEnd().
+
+        \note Earlier versions of Rapidjson returned a \c NULL pointer, in case
+            the requested member doesn't exist. For consistency with e.g.
+            \c std::map, this has been changed to MemberEnd() now.
+        \note Linear time complexity.
+    */
+    MemberIterator FindMember(const Ch* name) {
+        GenericValue n(StringRef(name));
+        return FindMember(n);
+    }
+
+    ConstMemberIterator FindMember(const Ch* name) const { return const_cast<GenericValue&>(*this).FindMember(name); }
+
+    //! Find member by name.
+    /*!
+        This version is faster because it does not need a StrLen(). It can also handle strings with null characters.
+        \param name Member name to be searched.
+        \pre IsObject() == true
+        \return Iterator to member, if it exists.
+            Otherwise returns \ref MemberEnd().
+
+        \note Earlier versions of Rapidjson returned a \c NULL pointer, in case
+            the requested member doesn't exist. For consistency with e.g.
+            \c std::map, this has been changed to MemberEnd() now.
+        \note Linear time complexity.
+    */
+    template <typename SourceAllocator>
+    MemberIterator FindMember(const GenericValue<Encoding, SourceAllocator>& name) {
+        RAPIDJSON_ASSERT(IsObject());
+        RAPIDJSON_ASSERT(name.IsString());
+        MemberIterator member = MemberBegin();
+        for ( ; member != MemberEnd(); ++member)
+            if (name.StringEqual(member->name))
+                break;
+        return member;
+    }
+    template <typename SourceAllocator> ConstMemberIterator FindMember(const GenericValue<Encoding, SourceAllocator>& name) const { return const_cast<GenericValue&>(*this).FindMember(name); }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Find member by string object name.
+    /*!
+        \param name Member name to be searched.
+        \pre IsObject() == true
+        \return Iterator to member, if it exists.
+            Otherwise returns \ref MemberEnd().
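+        \b Example (sketch; \c d is assumed to be an object and \c Consume() a
+            hypothetical user function):
+        \code
+        Value::ConstMemberIterator itr = d.FindMember(std::string("name"));
+        if (itr != d.MemberEnd())
+            Consume(itr->value);
+        \endcode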
+ */ + MemberIterator FindMember(const std::basic_string& name) { return FindMember(GenericValue(StringRef(name))); } + ConstMemberIterator FindMember(const std::basic_string& name) const { return FindMember(GenericValue(StringRef(name))); } +#endif + + //! Add a member (name-value pair) to the object. + /*! \param name A string value as name of member. + \param value Value of any type. + \param allocator Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \note The ownership of \c name and \c value will be transferred to this object on success. + \pre IsObject() && name.IsString() + \post name.IsNull() && value.IsNull() + \note Amortized Constant time complexity. + */ + GenericValue& AddMember(GenericValue& name, GenericValue& value, Allocator& allocator) { + RAPIDJSON_ASSERT(IsObject()); + RAPIDJSON_ASSERT(name.IsString()); + + ObjectData& o = data_.o; + if (o.size >= o.capacity) + MemberReserve(o.capacity == 0 ? kDefaultObjectCapacity : (o.capacity + (o.capacity + 1) / 2), allocator); + Member* members = GetMembersPointer(); + members[o.size].name.RawAssign(name); + members[o.size].value.RawAssign(value); + o.size++; + return *this; + } + + //! Add a constant string value as member (name-value pair) to the object. + /*! \param name A string value as name of member. + \param value constant string reference as value of member. + \param allocator Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \pre IsObject() + \note This overload is needed to avoid clashes with the generic primitive type AddMember(GenericValue&,T,Allocator&) overload below. + \note Amortized Constant time complexity. + */ + GenericValue& AddMember(GenericValue& name, StringRefType value, Allocator& allocator) { + GenericValue v(value); + return AddMember(name, v, allocator); + } + +#if RAPIDJSON_HAS_STDSTRING + //! Add a string object as member (name-value pair) to the object. + /*! \param name A string value as name of member. + \param value constant string reference as value of member. + \param allocator Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \pre IsObject() + \note This overload is needed to avoid clashes with the generic primitive type AddMember(GenericValue&,T,Allocator&) overload below. + \note Amortized Constant time complexity. + */ + GenericValue& AddMember(GenericValue& name, std::basic_string& value, Allocator& allocator) { + GenericValue v(value, allocator); + return AddMember(name, v, allocator); + } +#endif + + //! Add any primitive value as member (name-value pair) to the object. + /*! \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t + \param name A string value as name of member. + \param value Value of primitive type \c T as value of member + \param allocator Allocator for reallocating memory. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \pre IsObject() + + \note The source type \c T explicitly disallows all pointer types, + especially (\c const) \ref Ch*. This helps avoiding implicitly + referencing character strings with insufficient lifetime, use + \ref AddMember(StringRefType, GenericValue&, Allocator&) or \ref + AddMember(StringRefType, StringRefType, Allocator&). 
+ All other pointer types would implicitly convert to \c bool, + use an explicit cast instead, if needed. + \note Amortized Constant time complexity. + */ + template + RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr, internal::IsGenericValue >), (GenericValue&)) + AddMember(GenericValue& name, T value, Allocator& allocator) { + GenericValue v(value); + return AddMember(name, v, allocator); + } + +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + GenericValue& AddMember(GenericValue&& name, GenericValue&& value, Allocator& allocator) { + return AddMember(name, value, allocator); + } + GenericValue& AddMember(GenericValue&& name, GenericValue& value, Allocator& allocator) { + return AddMember(name, value, allocator); + } + GenericValue& AddMember(GenericValue& name, GenericValue&& value, Allocator& allocator) { + return AddMember(name, value, allocator); + } + GenericValue& AddMember(StringRefType name, GenericValue&& value, Allocator& allocator) { + GenericValue n(name); + return AddMember(n, value, allocator); + } +#endif // RAPIDJSON_HAS_CXX11_RVALUE_REFS + + + //! Add a member (name-value pair) to the object. + /*! \param name A constant string reference as name of member. + \param value Value of any type. + \param allocator Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \note The ownership of \c value will be transferred to this object on success. + \pre IsObject() + \post value.IsNull() + \note Amortized Constant time complexity. + */ + GenericValue& AddMember(StringRefType name, GenericValue& value, Allocator& allocator) { + GenericValue n(name); + return AddMember(n, value, allocator); + } + + //! Add a constant string value as member (name-value pair) to the object. + /*! \param name A constant string reference as name of member. + \param value constant string reference as value of member. + \param allocator Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \pre IsObject() + \note This overload is needed to avoid clashes with the generic primitive type AddMember(StringRefType,T,Allocator&) overload below. + \note Amortized Constant time complexity. + */ + GenericValue& AddMember(StringRefType name, StringRefType value, Allocator& allocator) { + GenericValue v(value); + return AddMember(name, v, allocator); + } + + //! Add any primitive value as member (name-value pair) to the object. + /*! \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t + \param name A constant string reference as name of member. + \param value Value of primitive type \c T as value of member + \param allocator Allocator for reallocating memory. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \pre IsObject() + + \note The source type \c T explicitly disallows all pointer types, + especially (\c const) \ref Ch*. This helps avoiding implicitly + referencing character strings with insufficient lifetime, use + \ref AddMember(StringRefType, GenericValue&, Allocator&) or \ref + AddMember(StringRefType, StringRefType, Allocator&). + All other pointer types would implicitly convert to \c bool, + use an explicit cast instead, if needed. + \note Amortized Constant time complexity. 
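+        \b Example (a minimal sketch):
+        \code
+        Document d;
+        d.SetObject();
+        d.AddMember("answer", 42, d.GetAllocator()); // literal name -> StringRefType, 42 -> T = int
+        \endcode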
+    */
+    template <typename T>
+    RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (GenericValue&))
+    AddMember(StringRefType name, T value, Allocator& allocator) {
+        GenericValue n(name);
+        return AddMember(n, value, allocator);
+    }
+
+    //! Remove all members in the object.
+    /*! This function does not deallocate memory in the object, i.e. the capacity is unchanged.
+        \note Linear time complexity.
+    */
+    void RemoveAllMembers() {
+        RAPIDJSON_ASSERT(IsObject());
+        for (MemberIterator m = MemberBegin(); m != MemberEnd(); ++m)
+            m->~Member();
+        data_.o.size = 0;
+    }
+
+    //! Remove a member in object by its name.
+    /*! \param name Name of member to be removed.
+        \return Whether the member existed.
+        \note This function may reorder the object members. Use \ref
+            EraseMember(ConstMemberIterator) if you need to preserve the
+            relative order of the remaining members.
+        \note Linear time complexity.
+    */
+    bool RemoveMember(const Ch* name) {
+        GenericValue n(StringRef(name));
+        return RemoveMember(n);
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    bool RemoveMember(const std::basic_string<Ch>& name) { return RemoveMember(GenericValue(StringRef(name))); }
+#endif
+
+    template <typename SourceAllocator>
+    bool RemoveMember(const GenericValue<Encoding, SourceAllocator>& name) {
+        MemberIterator m = FindMember(name);
+        if (m != MemberEnd()) {
+            RemoveMember(m);
+            return true;
+        }
+        else
+            return false;
+    }
+
+    //! Remove a member in object by iterator.
+    /*! \param m member iterator (obtained by FindMember() or MemberBegin()).
+        \return the new iterator after removal.
+        \note This function may reorder the object members. Use \ref
+            EraseMember(ConstMemberIterator) if you need to preserve the
+            relative order of the remaining members.
+        \note Constant time complexity.
+    */
+    MemberIterator RemoveMember(MemberIterator m) {
+        RAPIDJSON_ASSERT(IsObject());
+        RAPIDJSON_ASSERT(data_.o.size > 0);
+        RAPIDJSON_ASSERT(GetMembersPointer() != 0);
+        RAPIDJSON_ASSERT(m >= MemberBegin() && m < MemberEnd());
+
+        MemberIterator last(GetMembersPointer() + (data_.o.size - 1));
+        if (data_.o.size > 1 && m != last)
+            *m = *last; // Move the last one to this place
+        else
+            m->~Member(); // Only one left, just destroy
+        --data_.o.size;
+        return m;
+    }
+
+    //! Remove a member from an object by iterator.
+    /*! \param pos iterator to the member to remove
+        \pre IsObject() == true && \ref MemberBegin() <= \c pos < \ref MemberEnd()
+        \return Iterator following the removed element.
+            If the iterator \c pos refers to the last element, the \ref MemberEnd() iterator is returned.
+        \note This function preserves the relative order of the remaining object
+            members. If you do not need this, use the more efficient \ref RemoveMember(MemberIterator).
+        \note Linear time complexity.
+    */
+    MemberIterator EraseMember(ConstMemberIterator pos) {
+        return EraseMember(pos, pos + 1);
+    }
+
+    //! Remove members in the range [first, last) from an object.
+    /*! \param first iterator to the first member to remove
+        \param last iterator following the last member to remove
+        \pre IsObject() == true && \ref MemberBegin() <= \c first <= \c last <= \ref MemberEnd()
+        \return Iterator following the last removed element.
+        \note This function preserves the relative order of the remaining object
+            members.
+        \note Linear time complexity.
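+        \b Example (sketch: erase the first two members while keeping the order
+            of the rest; \c d is assumed to be an object with at least 2 members):
+        \code
+        Value::MemberIterator itr = d.EraseMember(d.MemberBegin(), d.MemberBegin() + 2);
+        \endcode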
+    */
+    MemberIterator EraseMember(ConstMemberIterator first, ConstMemberIterator last) {
+        RAPIDJSON_ASSERT(IsObject());
+        RAPIDJSON_ASSERT(data_.o.size > 0);
+        RAPIDJSON_ASSERT(GetMembersPointer() != 0);
+        RAPIDJSON_ASSERT(first >= MemberBegin());
+        RAPIDJSON_ASSERT(first <= last);
+        RAPIDJSON_ASSERT(last <= MemberEnd());
+
+        MemberIterator pos = MemberBegin() + (first - MemberBegin());
+        for (MemberIterator itr = pos; itr != last; ++itr)
+            itr->~Member();
+        std::memmove(static_cast<void*>(&*pos), &*last, static_cast<size_t>(MemberEnd() - last) * sizeof(Member));
+        data_.o.size -= static_cast<SizeType>(last - first);
+        return pos;
+    }
+
+    //! Erase a member in object by its name.
+    /*! \param name Name of member to be removed.
+        \return Whether the member existed.
+        \note Linear time complexity.
+    */
+    bool EraseMember(const Ch* name) {
+        GenericValue n(StringRef(name));
+        return EraseMember(n);
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    bool EraseMember(const std::basic_string<Ch>& name) { return EraseMember(GenericValue(StringRef(name))); }
+#endif
+
+    template <typename SourceAllocator>
+    bool EraseMember(const GenericValue<Encoding, SourceAllocator>& name) {
+        MemberIterator m = FindMember(name);
+        if (m != MemberEnd()) {
+            EraseMember(m);
+            return true;
+        }
+        else
+            return false;
+    }
+
+    Object GetObject() { RAPIDJSON_ASSERT(IsObject()); return Object(*this); }
+    ConstObject GetObject() const { RAPIDJSON_ASSERT(IsObject()); return ConstObject(*this); }
+
+    //@}
+
+    //!@name Array
+    //@{
+
+    //! Set this value as an empty array.
+    /*! \post IsArray() == true */
+    GenericValue& SetArray() { this->~GenericValue(); new (this) GenericValue(kArrayType); return *this; }
+
+    //! Get the number of elements in array.
+    SizeType Size() const { RAPIDJSON_ASSERT(IsArray()); return data_.a.size; }
+
+    //! Get the capacity of array.
+    SizeType Capacity() const { RAPIDJSON_ASSERT(IsArray()); return data_.a.capacity; }
+
+    //! Check whether the array is empty.
+    bool Empty() const { RAPIDJSON_ASSERT(IsArray()); return data_.a.size == 0; }
+
+    //! Remove all elements in the array.
+    /*! This function does not deallocate memory in the array, i.e. the capacity is unchanged.
+        \note Linear time complexity.
+    */
+    void Clear() {
+        RAPIDJSON_ASSERT(IsArray());
+        GenericValue* e = GetElementsPointer();
+        for (GenericValue* v = e; v != e + data_.a.size; ++v)
+            v->~GenericValue();
+        data_.a.size = 0;
+    }
+
+    //! Get an element from array by index.
+    /*! \pre IsArray() == true
+        \param index Zero-based index of element.
+        \see operator[](T*)
+    */
+    GenericValue& operator[](SizeType index) {
+        RAPIDJSON_ASSERT(IsArray());
+        RAPIDJSON_ASSERT(index < data_.a.size);
+        return GetElementsPointer()[index];
+    }
+    const GenericValue& operator[](SizeType index) const { return const_cast<GenericValue&>(*this)[index]; }
+
+    //! Element iterator
+    /*! \pre IsArray() == true */
+    ValueIterator Begin() { RAPIDJSON_ASSERT(IsArray()); return GetElementsPointer(); }
+    //! \em Past-the-end element iterator
+    /*! \pre IsArray() == true */
+    ValueIterator End() { RAPIDJSON_ASSERT(IsArray()); return GetElementsPointer() + data_.a.size; }
+    //! Constant element iterator
+    /*! \pre IsArray() == true */
+    ConstValueIterator Begin() const { return const_cast<GenericValue&>(*this).Begin(); }
+    //! Constant \em past-the-end element iterator
+    /*! \pre IsArray() == true */
+    ConstValueIterator End() const { return const_cast<GenericValue&>(*this).End(); }
+
+    //! Request the array to have enough capacity to store elements.
+    /*! \param newCapacity The capacity that the array at least needs to have.
+        \param allocator Allocator for reallocating memory.
+            It must be the same one as used before. Commonly use GenericDocument::GetAllocator().
+        \return The value itself for fluent API.
+        \note Linear time complexity.
+    */
+    GenericValue& Reserve(SizeType newCapacity, Allocator &allocator) {
+        RAPIDJSON_ASSERT(IsArray());
+        if (newCapacity > data_.a.capacity) {
+            SetElementsPointer(reinterpret_cast<GenericValue*>(allocator.Realloc(GetElementsPointer(), data_.a.capacity * sizeof(GenericValue), newCapacity * sizeof(GenericValue))));
+            data_.a.capacity = newCapacity;
+        }
+        return *this;
+    }
+
+    //! Append a GenericValue at the end of the array.
+    /*! \param value        Value to be appended.
+        \param allocator    Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator().
+        \pre IsArray() == true
+        \post value.IsNull() == true
+        \return The value itself for fluent API.
+        \note The ownership of \c value will be transferred to this array on success.
+        \note If the number of elements to be appended is known, calling Reserve() once first may be more efficient.
+        \note Amortized constant time complexity.
+    */
+    GenericValue& PushBack(GenericValue& value, Allocator& allocator) {
+        RAPIDJSON_ASSERT(IsArray());
+        if (data_.a.size >= data_.a.capacity)
+            Reserve(data_.a.capacity == 0 ? kDefaultArrayCapacity : (data_.a.capacity + (data_.a.capacity + 1) / 2), allocator);
+        GetElementsPointer()[data_.a.size++].RawAssign(value);
+        return *this;
+    }
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    GenericValue& PushBack(GenericValue&& value, Allocator& allocator) {
+        return PushBack(value, allocator);
+    }
+#endif // RAPIDJSON_HAS_CXX11_RVALUE_REFS
+
+    //! Append a constant string reference at the end of the array.
+    /*! \param value        Constant string reference to be appended.
+        \param allocator    Allocator for reallocating memory. It must be the same one used previously. Commonly use GenericDocument::GetAllocator().
+        \pre IsArray() == true
+        \return The value itself for fluent API.
+        \note If the number of elements to be appended is known, calling Reserve() once first may be more efficient.
+        \note Amortized constant time complexity.
+        \see GenericStringRef
+    */
+    GenericValue& PushBack(StringRefType value, Allocator& allocator) {
+        return (*this).template PushBack<StringRefType>(value, allocator);
+    }
+
+    //! Append a primitive value at the end of the array.
+    /*! \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t
+        \param value Value of primitive type T to be appended.
+        \param allocator Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator().
+        \pre IsArray() == true
+        \return The value itself for fluent API.
+        \note If the number of elements to be appended is known, calling Reserve() once first may be more efficient.
+
+        \note The source type \c T explicitly disallows all pointer types,
+            especially (\c const) \ref Ch*. This helps avoiding implicitly
+            referencing character strings with insufficient lifetime, use
+            \ref PushBack(GenericValue&, Allocator&) or \ref
+            PushBack(StringRefType, Allocator&).
+            All other pointer types would implicitly convert to \c bool,
+            use an explicit cast instead, if needed.
+        \note Amortized constant time complexity.
+    */
+    template <typename T>
+    RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (GenericValue&))
+    PushBack(T value, Allocator& allocator) {
+        GenericValue v(value);
+        return PushBack(v, allocator);
+    }
+
+    //! Remove the last element in the array.
+    /*!
+        \note Constant time complexity.
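+        \b Example (sketch; \c arr is assumed to be an array value):
+        \code
+        while (!arr.Empty())
+            arr.PopBack(); // destroys elements from the back; capacity is kept
+        \endcode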
+    */
+    GenericValue& PopBack() {
+        RAPIDJSON_ASSERT(IsArray());
+        RAPIDJSON_ASSERT(!Empty());
+        GetElementsPointer()[--data_.a.size].~GenericValue();
+        return *this;
+    }
+
+    //! Remove an element of array by iterator.
+    /*!
+        \param pos iterator to the element to remove
+        \pre IsArray() == true && \ref Begin() <= \c pos < \ref End()
+        \return Iterator following the removed element. If the iterator pos refers to the last element, the End() iterator is returned.
+        \note Linear time complexity.
+    */
+    ValueIterator Erase(ConstValueIterator pos) {
+        return Erase(pos, pos + 1);
+    }
+
+    //! Remove elements in the range [first, last) of the array.
+    /*!
+        \param first iterator to the first element to remove
+        \param last iterator following the last element to remove
+        \pre IsArray() == true && \ref Begin() <= \c first <= \c last <= \ref End()
+        \return Iterator following the last removed element.
+        \note Linear time complexity.
+    */
+    ValueIterator Erase(ConstValueIterator first, ConstValueIterator last) {
+        RAPIDJSON_ASSERT(IsArray());
+        RAPIDJSON_ASSERT(data_.a.size > 0);
+        RAPIDJSON_ASSERT(GetElementsPointer() != 0);
+        RAPIDJSON_ASSERT(first >= Begin());
+        RAPIDJSON_ASSERT(first <= last);
+        RAPIDJSON_ASSERT(last <= End());
+        ValueIterator pos = Begin() + (first - Begin());
+        for (ValueIterator itr = pos; itr != last; ++itr)
+            itr->~GenericValue();
+        std::memmove(static_cast<void*>(pos), last, static_cast<size_t>(End() - last) * sizeof(GenericValue));
+        data_.a.size -= static_cast<SizeType>(last - first);
+        return pos;
+    }
+
+    Array GetArray() { RAPIDJSON_ASSERT(IsArray()); return Array(*this); }
+    ConstArray GetArray() const { RAPIDJSON_ASSERT(IsArray()); return ConstArray(*this); }
+
+    //@}
+
+    //!@name Number
+    //@{
+
+    int GetInt() const { RAPIDJSON_ASSERT(data_.f.flags & kIntFlag); return data_.n.i.i; }
+    unsigned GetUint() const { RAPIDJSON_ASSERT(data_.f.flags & kUintFlag); return data_.n.u.u; }
+    int64_t GetInt64() const { RAPIDJSON_ASSERT(data_.f.flags & kInt64Flag); return data_.n.i64; }
+    uint64_t GetUint64() const { RAPIDJSON_ASSERT(data_.f.flags & kUint64Flag); return data_.n.u64; }
+
+    //! Get the value as double type.
+    /*! \note If the value is 64-bit integer type, it may lose precision. Use \c IsLosslessDouble() to check whether the conversion is lossless.
+    */
+    double GetDouble() const {
+        RAPIDJSON_ASSERT(IsNumber());
+        if ((data_.f.flags & kDoubleFlag) != 0) return data_.n.d;   // exact type, no conversion.
+        if ((data_.f.flags & kIntFlag) != 0)    return data_.n.i.i; // int -> double
+        if ((data_.f.flags & kUintFlag) != 0)   return data_.n.u.u; // unsigned -> double
+        if ((data_.f.flags & kInt64Flag) != 0)  return static_cast<double>(data_.n.i64); // int64_t -> double (may lose precision)
+        RAPIDJSON_ASSERT((data_.f.flags & kUint64Flag) != 0); return static_cast<double>(data_.n.u64); // uint64_t -> double (may lose precision)
+    }
+
+    //! Get the value as float type.
+    /*! \note If the value is 64-bit integer type, it may lose precision. Use \c IsLosslessFloat() to check whether the conversion is lossless.
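+        \b Example (sketch of a guarded narrowing read):
+        \code
+        float f = v.IsLosslessFloat() ? v.GetFloat() : 0.0f; // fall back if narrowing loses precision
+        \endcode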
+ */ + float GetFloat() const { + return static_cast(GetDouble()); + } + + GenericValue& SetInt(int i) { this->~GenericValue(); new (this) GenericValue(i); return *this; } + GenericValue& SetUint(unsigned u) { this->~GenericValue(); new (this) GenericValue(u); return *this; } + GenericValue& SetInt64(int64_t i64) { this->~GenericValue(); new (this) GenericValue(i64); return *this; } + GenericValue& SetUint64(uint64_t u64) { this->~GenericValue(); new (this) GenericValue(u64); return *this; } + GenericValue& SetDouble(double d) { this->~GenericValue(); new (this) GenericValue(d); return *this; } + GenericValue& SetFloat(float f) { this->~GenericValue(); new (this) GenericValue(static_cast(f)); return *this; } + + //@} + + //!@name String + //@{ + + const Ch* GetString() const { RAPIDJSON_ASSERT(IsString()); return (data_.f.flags & kInlineStrFlag) ? data_.ss.str : GetStringPointer(); } + + //! Get the length of string. + /*! Since rapidjson permits "\\u0000" in the json string, strlen(v.GetString()) may not equal to v.GetStringLength(). + */ + SizeType GetStringLength() const { RAPIDJSON_ASSERT(IsString()); return ((data_.f.flags & kInlineStrFlag) ? (data_.ss.GetLength()) : data_.s.length); } + + //! Set this value as a string without copying source string. + /*! This version has better performance with supplied length, and also support string containing null character. + \param s source string pointer. + \param length The length of source string, excluding the trailing null terminator. + \return The value itself for fluent API. + \post IsString() == true && GetString() == s && GetStringLength() == length + \see SetString(StringRefType) + */ + GenericValue& SetString(const Ch* s, SizeType length) { return SetString(StringRef(s, length)); } + + //! Set this value as a string without copying source string. + /*! \param s source string reference + \return The value itself for fluent API. + \post IsString() == true && GetString() == s && GetStringLength() == s.length + */ + GenericValue& SetString(StringRefType s) { this->~GenericValue(); SetStringRaw(s); return *this; } + + //! Set this value as a string by copying from source string. + /*! This version has better performance with supplied length, and also support string containing null character. + \param s source string. + \param length The length of source string, excluding the trailing null terminator. + \param allocator Allocator for allocating copied buffer. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \post IsString() == true && GetString() != s && strcmp(GetString(),s) == 0 && GetStringLength() == length + */ + GenericValue& SetString(const Ch* s, SizeType length, Allocator& allocator) { return SetString(StringRef(s, length), allocator); } + + //! Set this value as a string by copying from source string. + /*! \param s source string. + \param allocator Allocator for allocating copied buffer. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \post IsString() == true && GetString() != s && strcmp(GetString(),s) == 0 && GetStringLength() == length + */ + GenericValue& SetString(const Ch* s, Allocator& allocator) { return SetString(StringRef(s), allocator); } + + //! Set this value as a string by copying from source string. + /*! \param s source string reference + \param allocator Allocator for allocating copied buffer. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. 
+ \post IsString() == true && GetString() != s.s && strcmp(GetString(),s) == 0 && GetStringLength() == length + */ + GenericValue& SetString(StringRefType s, Allocator& allocator) { this->~GenericValue(); SetStringRaw(s, allocator); return *this; } + +#if RAPIDJSON_HAS_STDSTRING + //! Set this value as a string by copying from source string. + /*! \param s source string. + \param allocator Allocator for allocating copied buffer. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \post IsString() == true && GetString() != s.data() && strcmp(GetString(),s.data() == 0 && GetStringLength() == s.size() + \note Requires the definition of the preprocessor symbol \ref RAPIDJSON_HAS_STDSTRING. + */ + GenericValue& SetString(const std::basic_string& s, Allocator& allocator) { return SetString(StringRef(s), allocator); } +#endif + + //@} + + //!@name Array + //@{ + + //! Templated version for checking whether this value is type T. + /*! + \tparam T Either \c bool, \c int, \c unsigned, \c int64_t, \c uint64_t, \c double, \c float, \c const \c char*, \c std::basic_string + */ + template + bool Is() const { return internal::TypeHelper::Is(*this); } + + template + T Get() const { return internal::TypeHelper::Get(*this); } + + template + T Get() { return internal::TypeHelper::Get(*this); } + + template + ValueType& Set(const T& data, AllocatorType& allocator) { return internal::TypeHelper::Set(*this, data, allocator); } + + //@} + + //! Generate events of this value to a Handler. + /*! This function adopts the GoF visitor pattern. + Typical usage is to output this JSON value as JSON text via Writer, which is a Handler. + It can also be used to deep clone this value via GenericDocument, which is also a Handler. + \tparam Handler type of handler. + \param handler An object implementing concept Handler. + */ + template + bool Accept(Handler& handler) const { + switch(GetType()) { + case kNullType: return handler.Null(); + case kFalseType: return handler.Bool(false); + case kTrueType: return handler.Bool(true); + + case kObjectType: + if (RAPIDJSON_UNLIKELY(!handler.StartObject())) + return false; + for (ConstMemberIterator m = MemberBegin(); m != MemberEnd(); ++m) { + RAPIDJSON_ASSERT(m->name.IsString()); // User may change the type of name by MemberIterator. 
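+                // Emit the member name as a Key event; the third argument tells the
+                // handler whether it receives a copied string (kCopyFlag set) or a
+                // merely referenced one.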
+ if (RAPIDJSON_UNLIKELY(!handler.Key(m->name.GetString(), m->name.GetStringLength(), (m->name.data_.f.flags & kCopyFlag) != 0))) + return false; + if (RAPIDJSON_UNLIKELY(!m->value.Accept(handler))) + return false; + } + return handler.EndObject(data_.o.size); + + case kArrayType: + if (RAPIDJSON_UNLIKELY(!handler.StartArray())) + return false; + for (const GenericValue* v = Begin(); v != End(); ++v) + if (RAPIDJSON_UNLIKELY(!v->Accept(handler))) + return false; + return handler.EndArray(data_.a.size); + + case kStringType: + return handler.String(GetString(), GetStringLength(), (data_.f.flags & kCopyFlag) != 0); + + default: + RAPIDJSON_ASSERT(GetType() == kNumberType); + if (IsDouble()) return handler.Double(data_.n.d); + else if (IsInt()) return handler.Int(data_.n.i.i); + else if (IsUint()) return handler.Uint(data_.n.u.u); + else if (IsInt64()) return handler.Int64(data_.n.i64); + else return handler.Uint64(data_.n.u64); + } + } + +private: + template friend class GenericValue; + template friend class GenericDocument; + + enum { + kBoolFlag = 0x0008, + kNumberFlag = 0x0010, + kIntFlag = 0x0020, + kUintFlag = 0x0040, + kInt64Flag = 0x0080, + kUint64Flag = 0x0100, + kDoubleFlag = 0x0200, + kStringFlag = 0x0400, + kCopyFlag = 0x0800, + kInlineStrFlag = 0x1000, + + // Initial flags of different types. + kNullFlag = kNullType, + kTrueFlag = kTrueType | kBoolFlag, + kFalseFlag = kFalseType | kBoolFlag, + kNumberIntFlag = kNumberType | kNumberFlag | kIntFlag | kInt64Flag, + kNumberUintFlag = kNumberType | kNumberFlag | kUintFlag | kUint64Flag | kInt64Flag, + kNumberInt64Flag = kNumberType | kNumberFlag | kInt64Flag, + kNumberUint64Flag = kNumberType | kNumberFlag | kUint64Flag, + kNumberDoubleFlag = kNumberType | kNumberFlag | kDoubleFlag, + kNumberAnyFlag = kNumberType | kNumberFlag | kIntFlag | kInt64Flag | kUintFlag | kUint64Flag | kDoubleFlag, + kConstStringFlag = kStringType | kStringFlag, + kCopyStringFlag = kStringType | kStringFlag | kCopyFlag, + kShortStringFlag = kStringType | kStringFlag | kCopyFlag | kInlineStrFlag, + kObjectFlag = kObjectType, + kArrayFlag = kArrayType, + + kTypeMask = 0x07 + }; + + static const SizeType kDefaultArrayCapacity = 16; + static const SizeType kDefaultObjectCapacity = 16; + + struct Flag { +#if RAPIDJSON_48BITPOINTER_OPTIMIZATION + char payload[sizeof(SizeType) * 2 + 6]; // 2 x SizeType + lower 48-bit pointer +#elif RAPIDJSON_64BIT + char payload[sizeof(SizeType) * 2 + sizeof(void*) + 6]; // 6 padding bytes +#else + char payload[sizeof(SizeType) * 2 + sizeof(void*) + 2]; // 2 padding bytes +#endif + uint16_t flags; + }; + + struct String { + SizeType length; + SizeType hashcode; //!< reserved + const Ch* str; + }; // 12 bytes in 32-bit mode, 16 bytes in 64-bit mode + + // implementation detail: ShortString can represent zero-terminated strings up to MaxSize chars + // (excluding the terminating zero) and store a value to determine the length of the contained + // string in the last character str[LenPos] by storing "MaxSize - length" there. If the string + // to store has the maximal length of MaxSize then str[LenPos] will be 0 and therefore act as + // the string terminator as well. For getting the string length back from that value just use + // "MaxSize - str[LenPos]". + // This allows to store 13-chars strings in 32-bit mode, 21-chars strings in 64-bit mode, + // 13-chars strings for RAPIDJSON_48BITPOINTER_OPTIMIZATION=1 inline (for `UTF8`-encoded strings). 
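+    // Worked example of the length encoding described above — an illustrative sketch
+    // assuming the 32-bit layout (MaxChars == 14, MaxSize == LenPos == 13):
+    /*! \code
+    char str[14];
+    std::memcpy(str, "hello", 6);     // 5 chars plus terminator
+    str[13] = 13 - 5;                 // SetLength(5): store MaxSize - length in the last slot
+    SizeType len = 13 - str[13];      // GetLength(): recovers 5
+    // A maximal 13-char string stores 0 in str[13], which doubles as its terminator.
+    \endcode */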
+ struct ShortString { + enum { MaxChars = sizeof(static_cast(0)->payload) / sizeof(Ch), MaxSize = MaxChars - 1, LenPos = MaxSize }; + Ch str[MaxChars]; + + inline static bool Usable(SizeType len) { return (MaxSize >= len); } + inline void SetLength(SizeType len) { str[LenPos] = static_cast(MaxSize - len); } + inline SizeType GetLength() const { return static_cast(MaxSize - str[LenPos]); } + }; // at most as many bytes as "String" above => 12 bytes in 32-bit mode, 16 bytes in 64-bit mode + + // By using proper binary layout, retrieval of different integer types do not need conversions. + union Number { +#if RAPIDJSON_ENDIAN == RAPIDJSON_LITTLEENDIAN + struct I { + int i; + char padding[4]; + }i; + struct U { + unsigned u; + char padding2[4]; + }u; +#else + struct I { + char padding[4]; + int i; + }i; + struct U { + char padding2[4]; + unsigned u; + }u; +#endif + int64_t i64; + uint64_t u64; + double d; + }; // 8 bytes + + struct ObjectData { + SizeType size; + SizeType capacity; + Member* members; + }; // 12 bytes in 32-bit mode, 16 bytes in 64-bit mode + + struct ArrayData { + SizeType size; + SizeType capacity; + GenericValue* elements; + }; // 12 bytes in 32-bit mode, 16 bytes in 64-bit mode + + union Data { + String s; + ShortString ss; + Number n; + ObjectData o; + ArrayData a; + Flag f; + }; // 16 bytes in 32-bit mode, 24 bytes in 64-bit mode, 16 bytes in 64-bit with RAPIDJSON_48BITPOINTER_OPTIMIZATION + + RAPIDJSON_FORCEINLINE const Ch* GetStringPointer() const { return RAPIDJSON_GETPOINTER(Ch, data_.s.str); } + RAPIDJSON_FORCEINLINE const Ch* SetStringPointer(const Ch* str) { return RAPIDJSON_SETPOINTER(Ch, data_.s.str, str); } + RAPIDJSON_FORCEINLINE GenericValue* GetElementsPointer() const { return RAPIDJSON_GETPOINTER(GenericValue, data_.a.elements); } + RAPIDJSON_FORCEINLINE GenericValue* SetElementsPointer(GenericValue* elements) { return RAPIDJSON_SETPOINTER(GenericValue, data_.a.elements, elements); } + RAPIDJSON_FORCEINLINE Member* GetMembersPointer() const { return RAPIDJSON_GETPOINTER(Member, data_.o.members); } + RAPIDJSON_FORCEINLINE Member* SetMembersPointer(Member* members) { return RAPIDJSON_SETPOINTER(Member, data_.o.members, members); } + + // Initialize this value as array with initial data, without calling destructor. + void SetArrayRaw(GenericValue* values, SizeType count, Allocator& allocator) { + data_.f.flags = kArrayFlag; + if (count) { + GenericValue* e = static_cast(allocator.Malloc(count * sizeof(GenericValue))); + SetElementsPointer(e); + std::memcpy(static_cast(e), values, count * sizeof(GenericValue)); + } + else + SetElementsPointer(0); + data_.a.size = data_.a.capacity = count; + } + + //! Initialize this value as object with initial data, without calling destructor. + void SetObjectRaw(Member* members, SizeType count, Allocator& allocator) { + data_.f.flags = kObjectFlag; + if (count) { + Member* m = static_cast(allocator.Malloc(count * sizeof(Member))); + SetMembersPointer(m); + std::memcpy(static_cast(m), members, count * sizeof(Member)); + } + else + SetMembersPointer(0); + data_.o.size = data_.o.capacity = count; + } + + //! Initialize this value as constant string, without calling destructor. + void SetStringRaw(StringRefType s) RAPIDJSON_NOEXCEPT { + data_.f.flags = kConstStringFlag; + SetStringPointer(s); + data_.s.length = s.length; + } + + //! Initialize this value as copy string with initial data, without calling destructor. 
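+    // Usage sketch of the copy/no-copy behavior the SetStringRaw overloads implement
+    // (assumes a Value v and a Document doc; illustrative only):
+    /*! \code
+    Value v;
+    v.SetString("literal");                     // constant string: pointer stored, no copy made
+    v.SetString("copied", doc.GetAllocator());  // copied: short strings go inline (ShortString),
+                                                // longer ones into an allocator-owned buffer
+    \endcode */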
+ void SetStringRaw(StringRefType s, Allocator& allocator) { + Ch* str = 0; + if (ShortString::Usable(s.length)) { + data_.f.flags = kShortStringFlag; + data_.ss.SetLength(s.length); + str = data_.ss.str; + } else { + data_.f.flags = kCopyStringFlag; + data_.s.length = s.length; + str = static_cast(allocator.Malloc((s.length + 1) * sizeof(Ch))); + SetStringPointer(str); + } + std::memcpy(str, s, s.length * sizeof(Ch)); + str[s.length] = '\0'; + } + + //! Assignment without calling destructor + void RawAssign(GenericValue& rhs) RAPIDJSON_NOEXCEPT { + data_ = rhs.data_; + // data_.f.flags = rhs.data_.f.flags; + rhs.data_.f.flags = kNullFlag; + } + + template + bool StringEqual(const GenericValue& rhs) const { + RAPIDJSON_ASSERT(IsString()); + RAPIDJSON_ASSERT(rhs.IsString()); + + const SizeType len1 = GetStringLength(); + const SizeType len2 = rhs.GetStringLength(); + if(len1 != len2) { return false; } + + const Ch* const str1 = GetString(); + const Ch* const str2 = rhs.GetString(); + if(str1 == str2) { return true; } // fast path for constant string + + return (std::memcmp(str1, str2, sizeof(Ch) * len1) == 0); + } + + Data data_; +}; + +//! GenericValue with UTF8 encoding +typedef GenericValue > Value; + +/////////////////////////////////////////////////////////////////////////////// +// GenericDocument + +//! A document for parsing JSON text as DOM. +/*! + \note implements Handler concept + \tparam Encoding Encoding for both parsing and string storage. + \tparam Allocator Allocator for allocating memory for the DOM + \tparam StackAllocator Allocator for allocating memory for stack during parsing. + \warning Although GenericDocument inherits from GenericValue, the API does \b not provide any virtual functions, especially no virtual destructor. To avoid memory leaks, do not \c delete a GenericDocument object via a pointer to a GenericValue. +*/ +template , typename StackAllocator = CrtAllocator> +class GenericDocument : public GenericValue { +public: + typedef typename Encoding::Ch Ch; //!< Character type derived from Encoding. + typedef GenericValue ValueType; //!< Value type of the document. + typedef Allocator AllocatorType; //!< Allocator type from template parameter. + + //! Constructor + /*! Creates an empty document of specified type. + \param type Mandatory type of object to create. + \param allocator Optional allocator for allocating memory. + \param stackCapacity Optional initial capacity of stack in bytes. + \param stackAllocator Optional allocator for allocating memory for stack. + */ + explicit GenericDocument(Type type, Allocator* allocator = 0, size_t stackCapacity = kDefaultStackCapacity, StackAllocator* stackAllocator = 0) : + GenericValue(type), allocator_(allocator), ownAllocator_(0), stack_(stackAllocator, stackCapacity), parseResult_() + { + if (!allocator_) + ownAllocator_ = allocator_ = RAPIDJSON_NEW(Allocator)(); + } + + //! Constructor + /*! Creates an empty document which type is Null. + \param allocator Optional allocator for allocating memory. + \param stackCapacity Optional initial capacity of stack in bytes. + \param stackAllocator Optional allocator for allocating memory for stack. + */ + GenericDocument(Allocator* allocator = 0, size_t stackCapacity = kDefaultStackCapacity, StackAllocator* stackAllocator = 0) : + allocator_(allocator), ownAllocator_(0), stack_(stackAllocator, stackCapacity), parseResult_() + { + if (!allocator_) + ownAllocator_ = allocator_ = RAPIDJSON_NEW(Allocator)(); + } + +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + //! 
Move constructor in C++11 + GenericDocument(GenericDocument&& rhs) RAPIDJSON_NOEXCEPT + : ValueType(std::forward(rhs)), // explicit cast to avoid prohibited move from Document + allocator_(rhs.allocator_), + ownAllocator_(rhs.ownAllocator_), + stack_(std::move(rhs.stack_)), + parseResult_(rhs.parseResult_) + { + rhs.allocator_ = 0; + rhs.ownAllocator_ = 0; + rhs.parseResult_ = ParseResult(); + } +#endif + + ~GenericDocument() { + Destroy(); + } + +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + //! Move assignment in C++11 + GenericDocument& operator=(GenericDocument&& rhs) RAPIDJSON_NOEXCEPT + { + // The cast to ValueType is necessary here, because otherwise it would + // attempt to call GenericValue's templated assignment operator. + ValueType::operator=(std::forward(rhs)); + + // Calling the destructor here would prematurely call stack_'s destructor + Destroy(); + + allocator_ = rhs.allocator_; + ownAllocator_ = rhs.ownAllocator_; + stack_ = std::move(rhs.stack_); + parseResult_ = rhs.parseResult_; + + rhs.allocator_ = 0; + rhs.ownAllocator_ = 0; + rhs.parseResult_ = ParseResult(); + + return *this; + } +#endif + + //! Exchange the contents of this document with those of another. + /*! + \param rhs Another document. + \note Constant complexity. + \see GenericValue::Swap + */ + GenericDocument& Swap(GenericDocument& rhs) RAPIDJSON_NOEXCEPT { + ValueType::Swap(rhs); + stack_.Swap(rhs.stack_); + internal::Swap(allocator_, rhs.allocator_); + internal::Swap(ownAllocator_, rhs.ownAllocator_); + internal::Swap(parseResult_, rhs.parseResult_); + return *this; + } + + // Allow Swap with ValueType. + // Refer to Effective C++ 3rd Edition/Item 33: Avoid hiding inherited names. + using ValueType::Swap; + + //! free-standing swap function helper + /*! + Helper function to enable support for common swap implementation pattern based on \c std::swap: + \code + void swap(MyClass& a, MyClass& b) { + using std::swap; + swap(a.doc, b.doc); + // ... + } + \endcode + \see Swap() + */ + friend inline void swap(GenericDocument& a, GenericDocument& b) RAPIDJSON_NOEXCEPT { a.Swap(b); } + + //! Populate this document by a generator which produces SAX events. + /*! \tparam Generator A functor with bool f(Handler) prototype. + \param g Generator functor which sends SAX events to the parameter. + \return The document itself for fluent API. + */ + template + GenericDocument& Populate(Generator& g) { + ClearStackOnExit scope(*this); + if (g(*this)) { + RAPIDJSON_ASSERT(stack_.GetSize() == sizeof(ValueType)); // Got one and only one root object + ValueType::operator=(*stack_.template Pop(1));// Move value from stack to document + } + return *this; + } + + //!@name Parse from stream + //!@{ + + //! Parse JSON text from an input stream (with Encoding conversion) + /*! \tparam parseFlags Combination of \ref ParseFlag. + \tparam SourceEncoding Encoding of input stream + \tparam InputStream Type of input stream, implementing Stream concept + \param is Input stream to be parsed. + \return The document itself for fluent API. + */ + template + GenericDocument& ParseStream(InputStream& is) { + GenericReader reader( + stack_.HasAllocator() ? &stack_.GetAllocator() : 0); + ClearStackOnExit scope(*this); + parseResult_ = reader.template Parse(is, *this); + if (parseResult_) { + RAPIDJSON_ASSERT(stack_.GetSize() == sizeof(ValueType)); // Got one and only one root object + ValueType::operator=(*stack_.template Pop(1));// Move value from stack to document + } + return *this; + } + + //! Parse JSON text from an input stream + /*! 
\tparam parseFlags Combination of \ref ParseFlag. + \tparam InputStream Type of input stream, implementing Stream concept + \param is Input stream to be parsed. + \return The document itself for fluent API. + */ + template + GenericDocument& ParseStream(InputStream& is) { + return ParseStream(is); + } + + //! Parse JSON text from an input stream (with \ref kParseDefaultFlags) + /*! \tparam InputStream Type of input stream, implementing Stream concept + \param is Input stream to be parsed. + \return The document itself for fluent API. + */ + template + GenericDocument& ParseStream(InputStream& is) { + return ParseStream(is); + } + //!@} + + //!@name Parse in-place from mutable string + //!@{ + + //! Parse JSON text from a mutable string + /*! \tparam parseFlags Combination of \ref ParseFlag. + \param str Mutable zero-terminated string to be parsed. + \return The document itself for fluent API. + */ + template + GenericDocument& ParseInsitu(Ch* str) { + GenericInsituStringStream s(str); + return ParseStream(s); + } + + //! Parse JSON text from a mutable string (with \ref kParseDefaultFlags) + /*! \param str Mutable zero-terminated string to be parsed. + \return The document itself for fluent API. + */ + GenericDocument& ParseInsitu(Ch* str) { + return ParseInsitu(str); + } + //!@} + + //!@name Parse from read-only string + //!@{ + + //! Parse JSON text from a read-only string (with Encoding conversion) + /*! \tparam parseFlags Combination of \ref ParseFlag (must not contain \ref kParseInsituFlag). + \tparam SourceEncoding Transcoding from input Encoding + \param str Read-only zero-terminated string to be parsed. + */ + template + GenericDocument& Parse(const typename SourceEncoding::Ch* str) { + RAPIDJSON_ASSERT(!(parseFlags & kParseInsituFlag)); + GenericStringStream s(str); + return ParseStream(s); + } + + //! Parse JSON text from a read-only string + /*! \tparam parseFlags Combination of \ref ParseFlag (must not contain \ref kParseInsituFlag). + \param str Read-only zero-terminated string to be parsed. + */ + template + GenericDocument& Parse(const Ch* str) { + return Parse(str); + } + + //! Parse JSON text from a read-only string (with \ref kParseDefaultFlags) + /*! \param str Read-only zero-terminated string to be parsed. + */ + GenericDocument& Parse(const Ch* str) { + return Parse(str); + } + + template + GenericDocument& Parse(const typename SourceEncoding::Ch* str, size_t length) { + RAPIDJSON_ASSERT(!(parseFlags & kParseInsituFlag)); + MemoryStream ms(reinterpret_cast(str), length * sizeof(typename SourceEncoding::Ch)); + EncodedInputStream is(ms); + ParseStream(is); + return *this; + } + + template + GenericDocument& Parse(const Ch* str, size_t length) { + return Parse(str, length); + } + + GenericDocument& Parse(const Ch* str, size_t length) { + return Parse(str, length); + } + +#if RAPIDJSON_HAS_STDSTRING + template + GenericDocument& Parse(const std::basic_string& str) { + // c_str() is constant complexity according to standard. Should be faster than Parse(const char*, size_t) + return Parse(str.c_str()); + } + + template + GenericDocument& Parse(const std::basic_string& str) { + return Parse(str.c_str()); + } + + GenericDocument& Parse(const std::basic_string& str) { + return Parse(str); + } +#endif // RAPIDJSON_HAS_STDSTRING + + //!@} + + //!@name Handling parse errors + //!@{ + + //! Whether a parse error has occurred in the last parsing. + bool HasParseError() const { return parseResult_.IsError(); } + + //! Get the \ref ParseErrorCode of last parsing. 
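+    // Usage sketch for the ParseInsitu() overloads declared above (illustrative;
+    // the buffer is modified in place and must outlive the document):
+    /*! \code
+    char buffer[] = "{\"hello\":\"world\"}";  // mutable copy of the JSON text
+    Document d;
+    d.ParseInsitu(buffer);                     // strings are unescaped inside the buffer, no copies
+    \endcode */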
+ ParseErrorCode GetParseError() const { return parseResult_.Code(); } + + //! Get the position of last parsing error in input, 0 otherwise. + size_t GetErrorOffset() const { return parseResult_.Offset(); } + + //! Implicit conversion to get the last parse result +#ifndef __clang // -Wdocumentation + /*! \return \ref ParseResult of the last parse operation + + \code + Document doc; + ParseResult ok = doc.Parse(json); + if (!ok) + printf( "JSON parse error: %s (%u)\n", GetParseError_En(ok.Code()), ok.Offset()); + \endcode + */ +#endif + operator ParseResult() const { return parseResult_; } + //!@} + + //! Get the allocator of this document. + Allocator& GetAllocator() { + RAPIDJSON_ASSERT(allocator_); + return *allocator_; + } + + //! Get the capacity of stack in bytes. + size_t GetStackCapacity() const { return stack_.GetCapacity(); } + +private: + // clear stack on any exit from ParseStream, e.g. due to exception + struct ClearStackOnExit { + explicit ClearStackOnExit(GenericDocument& d) : d_(d) {} + ~ClearStackOnExit() { d_.ClearStack(); } + private: + ClearStackOnExit(const ClearStackOnExit&); + ClearStackOnExit& operator=(const ClearStackOnExit&); + GenericDocument& d_; + }; + + // callers of the following private Handler functions + // template friend class GenericReader; // for parsing + template friend class GenericValue; // for deep copying + +public: + // Implementation of Handler + bool Null() { new (stack_.template Push()) ValueType(); return true; } + bool Bool(bool b) { new (stack_.template Push()) ValueType(b); return true; } + bool Int(int i) { new (stack_.template Push()) ValueType(i); return true; } + bool Uint(unsigned i) { new (stack_.template Push()) ValueType(i); return true; } + bool Int64(int64_t i) { new (stack_.template Push()) ValueType(i); return true; } + bool Uint64(uint64_t i) { new (stack_.template Push()) ValueType(i); return true; } + bool Double(double d) { new (stack_.template Push()) ValueType(d); return true; } + + bool RawNumber(const Ch* str, SizeType length, bool copy) { + if (copy) + new (stack_.template Push()) ValueType(str, length, GetAllocator()); + else + new (stack_.template Push()) ValueType(str, length); + return true; + } + + bool String(const Ch* str, SizeType length, bool copy) { + if (copy) + new (stack_.template Push()) ValueType(str, length, GetAllocator()); + else + new (stack_.template Push()) ValueType(str, length); + return true; + } + + bool StartObject() { new (stack_.template Push()) ValueType(kObjectType); return true; } + + bool Key(const Ch* str, SizeType length, bool copy) { return String(str, length, copy); } + + bool EndObject(SizeType memberCount) { + typename ValueType::Member* members = stack_.template Pop(memberCount); + stack_.template Top()->SetObjectRaw(members, memberCount, GetAllocator()); + return true; + } + + bool StartArray() { new (stack_.template Push()) ValueType(kArrayType); return true; } + + bool EndArray(SizeType elementCount) { + ValueType* elements = stack_.template Pop(elementCount); + stack_.template Top()->SetArrayRaw(elements, elementCount, GetAllocator()); + return true; + } + +private: + //! Prohibit copying + GenericDocument(const GenericDocument&); + //! 
Prohibit assignment + GenericDocument& operator=(const GenericDocument&); + + void ClearStack() { + if (Allocator::kNeedFree) + while (stack_.GetSize() > 0) // Here assumes all elements in stack array are GenericValue (Member is actually 2 GenericValue objects) + (stack_.template Pop(1))->~ValueType(); + else + stack_.Clear(); + stack_.ShrinkToFit(); + } + + void Destroy() { + RAPIDJSON_DELETE(ownAllocator_); + } + + static const size_t kDefaultStackCapacity = 1024; + Allocator* allocator_; + Allocator* ownAllocator_; + internal::Stack stack_; + ParseResult parseResult_; +}; + +//! GenericDocument with UTF8 encoding +typedef GenericDocument > Document; + +//! Helper class for accessing Value of array type. +/*! + Instance of this helper class is obtained by \c GenericValue::GetArray(). + In addition to all APIs for array type, it provides range-based for loop if \c RAPIDJSON_HAS_CXX11_RANGE_FOR=1. +*/ +template +class GenericArray { +public: + typedef GenericArray ConstArray; + typedef GenericArray Array; + typedef ValueT PlainType; + typedef typename internal::MaybeAddConst::Type ValueType; + typedef ValueType* ValueIterator; // This may be const or non-const iterator + typedef const ValueT* ConstValueIterator; + typedef typename ValueType::AllocatorType AllocatorType; + typedef typename ValueType::StringRefType StringRefType; + + template + friend class GenericValue; + + GenericArray(const GenericArray& rhs) : value_(rhs.value_) {} + GenericArray& operator=(const GenericArray& rhs) { value_ = rhs.value_; return *this; } + ~GenericArray() {} + + SizeType Size() const { return value_.Size(); } + SizeType Capacity() const { return value_.Capacity(); } + bool Empty() const { return value_.Empty(); } + void Clear() const { value_.Clear(); } + ValueType& operator[](SizeType index) const { return value_[index]; } + ValueIterator Begin() const { return value_.Begin(); } + ValueIterator End() const { return value_.End(); } + GenericArray Reserve(SizeType newCapacity, AllocatorType &allocator) const { value_.Reserve(newCapacity, allocator); return *this; } + GenericArray PushBack(ValueType& value, AllocatorType& allocator) const { value_.PushBack(value, allocator); return *this; } +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + GenericArray PushBack(ValueType&& value, AllocatorType& allocator) const { value_.PushBack(value, allocator); return *this; } +#endif // RAPIDJSON_HAS_CXX11_RVALUE_REFS + GenericArray PushBack(StringRefType value, AllocatorType& allocator) const { value_.PushBack(value, allocator); return *this; } + template RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr, internal::IsGenericValue >), (const GenericArray&)) PushBack(T value, AllocatorType& allocator) const { value_.PushBack(value, allocator); return *this; } + GenericArray PopBack() const { value_.PopBack(); return *this; } + ValueIterator Erase(ConstValueIterator pos) const { return value_.Erase(pos); } + ValueIterator Erase(ConstValueIterator first, ConstValueIterator last) const { return value_.Erase(first, last); } + +#if RAPIDJSON_HAS_CXX11_RANGE_FOR + ValueIterator begin() const { return value_.Begin(); } + ValueIterator end() const { return value_.End(); } +#endif + +private: + GenericArray(); + GenericArray(ValueType& value) : value_(value) {} + ValueType& value_; +}; + +//! Helper class for accessing Value of object type. +/*! + Instance of this helper class is obtained by \c GenericValue::GetObject(). + In addition to all APIs for array type, it provides range-based for loop if \c RAPIDJSON_HAS_CXX11_RANGE_FOR=1. 
+*/ +template +class GenericObject { +public: + typedef GenericObject ConstObject; + typedef GenericObject Object; + typedef ValueT PlainType; + typedef typename internal::MaybeAddConst::Type ValueType; + typedef GenericMemberIterator MemberIterator; // This may be const or non-const iterator + typedef GenericMemberIterator ConstMemberIterator; + typedef typename ValueType::AllocatorType AllocatorType; + typedef typename ValueType::StringRefType StringRefType; + typedef typename ValueType::EncodingType EncodingType; + typedef typename ValueType::Ch Ch; + + template + friend class GenericValue; + + GenericObject(const GenericObject& rhs) : value_(rhs.value_) {} + GenericObject& operator=(const GenericObject& rhs) { value_ = rhs.value_; return *this; } + ~GenericObject() {} + + SizeType MemberCount() const { return value_.MemberCount(); } + SizeType MemberCapacity() const { return value_.MemberCapacity(); } + bool ObjectEmpty() const { return value_.ObjectEmpty(); } + template ValueType& operator[](T* name) const { return value_[name]; } + template ValueType& operator[](const GenericValue& name) const { return value_[name]; } +#if RAPIDJSON_HAS_STDSTRING + ValueType& operator[](const std::basic_string& name) const { return value_[name]; } +#endif + MemberIterator MemberBegin() const { return value_.MemberBegin(); } + MemberIterator MemberEnd() const { return value_.MemberEnd(); } + GenericObject MemberReserve(SizeType newCapacity, AllocatorType &allocator) const { value_.MemberReserve(newCapacity, allocator); return *this; } + bool HasMember(const Ch* name) const { return value_.HasMember(name); } +#if RAPIDJSON_HAS_STDSTRING + bool HasMember(const std::basic_string& name) const { return value_.HasMember(name); } +#endif + template bool HasMember(const GenericValue& name) const { return value_.HasMember(name); } + MemberIterator FindMember(const Ch* name) const { return value_.FindMember(name); } + template MemberIterator FindMember(const GenericValue& name) const { return value_.FindMember(name); } +#if RAPIDJSON_HAS_STDSTRING + MemberIterator FindMember(const std::basic_string& name) const { return value_.FindMember(name); } +#endif + GenericObject AddMember(ValueType& name, ValueType& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } + GenericObject AddMember(ValueType& name, StringRefType value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } +#if RAPIDJSON_HAS_STDSTRING + GenericObject AddMember(ValueType& name, std::basic_string& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } +#endif + template RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr, internal::IsGenericValue >), (ValueType&)) AddMember(ValueType& name, T value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + GenericObject AddMember(ValueType&& name, ValueType&& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } + GenericObject AddMember(ValueType&& name, ValueType& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } + GenericObject AddMember(ValueType& name, ValueType&& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } + GenericObject AddMember(StringRefType name, ValueType&& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return 
*this; } +#endif // RAPIDJSON_HAS_CXX11_RVALUE_REFS + GenericObject AddMember(StringRefType name, ValueType& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } + GenericObject AddMember(StringRefType name, StringRefType value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } + template RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr, internal::IsGenericValue >), (GenericObject)) AddMember(StringRefType name, T value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } + void RemoveAllMembers() { value_.RemoveAllMembers(); } + bool RemoveMember(const Ch* name) const { return value_.RemoveMember(name); } +#if RAPIDJSON_HAS_STDSTRING + bool RemoveMember(const std::basic_string& name) const { return value_.RemoveMember(name); } +#endif + template bool RemoveMember(const GenericValue& name) const { return value_.RemoveMember(name); } + MemberIterator RemoveMember(MemberIterator m) const { return value_.RemoveMember(m); } + MemberIterator EraseMember(ConstMemberIterator pos) const { return value_.EraseMember(pos); } + MemberIterator EraseMember(ConstMemberIterator first, ConstMemberIterator last) const { return value_.EraseMember(first, last); } + bool EraseMember(const Ch* name) const { return value_.EraseMember(name); } +#if RAPIDJSON_HAS_STDSTRING + bool EraseMember(const std::basic_string& name) const { return EraseMember(ValueType(StringRef(name))); } +#endif + template bool EraseMember(const GenericValue& name) const { return value_.EraseMember(name); } + +#if RAPIDJSON_HAS_CXX11_RANGE_FOR + MemberIterator begin() const { return value_.MemberBegin(); } + MemberIterator end() const { return value_.MemberEnd(); } +#endif + +private: + GenericObject(); + GenericObject(ValueType& value) : value_(value) {} + ValueType& value_; +}; + +RAPIDJSON_NAMESPACE_END +RAPIDJSON_DIAG_POP + +#endif // RAPIDJSON_DOCUMENT_H_ diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/encodedstream.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/encodedstream.h new file mode 100644 index 0000000..223601c --- /dev/null +++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/encodedstream.h @@ -0,0 +1,299 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef RAPIDJSON_ENCODEDSTREAM_H_ +#define RAPIDJSON_ENCODEDSTREAM_H_ + +#include "stream.h" +#include "memorystream.h" + +#ifdef __GNUC__ +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(effc++) +#endif + +#ifdef __clang__ +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(padded) +#endif + +RAPIDJSON_NAMESPACE_BEGIN + +//! Input byte stream wrapper with a statically bound encoding. +/*! + \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE. + \tparam InputByteStream Type of input byte stream. 
For example, FileReadStream.
+*/
+template <typename Encoding, typename InputByteStream>
+class EncodedInputStream {
+    RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
+public:
+    typedef typename Encoding::Ch Ch;
+
+    EncodedInputStream(InputByteStream& is) : is_(is) {
+        current_ = Encoding::TakeBOM(is_);
+    }
+
+    Ch Peek() const { return current_; }
+    Ch Take() { Ch c = current_; current_ = Encoding::Take(is_); return c; }
+    size_t Tell() const { return is_.Tell(); }
+
+    // Not implemented
+    void Put(Ch) { RAPIDJSON_ASSERT(false); }
+    void Flush() { RAPIDJSON_ASSERT(false); }
+    Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
+    size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
+
+private:
+    EncodedInputStream(const EncodedInputStream&);
+    EncodedInputStream& operator=(const EncodedInputStream&);
+
+    InputByteStream& is_;
+    Ch current_;
+};
+
+//! Specialized for UTF8 MemoryStream.
+template <>
+class EncodedInputStream<UTF8<>, MemoryStream> {
+public:
+    typedef UTF8<>::Ch Ch;
+
+    EncodedInputStream(MemoryStream& is) : is_(is) {
+        if (static_cast<unsigned char>(is_.Peek()) == 0xEFu) is_.Take();
+        if (static_cast<unsigned char>(is_.Peek()) == 0xBBu) is_.Take();
+        if (static_cast<unsigned char>(is_.Peek()) == 0xBFu) is_.Take();
+    }
+    Ch Peek() const { return is_.Peek(); }
+    Ch Take() { return is_.Take(); }
+    size_t Tell() const { return is_.Tell(); }
+
+    // Not implemented
+    void Put(Ch) {}
+    void Flush() {}
+    Ch* PutBegin() { return 0; }
+    size_t PutEnd(Ch*) { return 0; }
+
+    MemoryStream& is_;
+
+private:
+    EncodedInputStream(const EncodedInputStream&);
+    EncodedInputStream& operator=(const EncodedInputStream&);
+};
+
+//! Output byte stream wrapper with statically bound encoding.
+/*!
+    \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
+    \tparam OutputByteStream Type of output byte stream. For example, FileWriteStream.
+*/
+template <typename Encoding, typename OutputByteStream>
+class EncodedOutputStream {
+    RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
+public:
+    typedef typename Encoding::Ch Ch;
+
+    EncodedOutputStream(OutputByteStream& os, bool putBOM = true) : os_(os) {
+        if (putBOM)
+            Encoding::PutBOM(os_);
+    }
+
+    void Put(Ch c) { Encoding::Put(os_, c); }
+    void Flush() { os_.Flush(); }
+
+    // Not implemented
+    Ch Peek() const { RAPIDJSON_ASSERT(false); return 0;}
+    Ch Take() { RAPIDJSON_ASSERT(false); return 0;}
+    size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; }
+    Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
+    size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
+
+private:
+    EncodedOutputStream(const EncodedOutputStream&);
+    EncodedOutputStream& operator=(const EncodedOutputStream&);
+
+    OutputByteStream& os_;
+};
+
+#define RAPIDJSON_ENCODINGS_FUNC(x) UTF8<Ch>::x, UTF16LE<Ch>::x, UTF16BE<Ch>::x, UTF32LE<Ch>::x, UTF32BE<Ch>::x
+
+//! Input stream wrapper with dynamically bound encoding and automatic encoding detection.
+/*!
+    \tparam CharType Type of character for reading.
+    \tparam InputByteStream type of input byte stream to be wrapped.
+*/
+template <typename CharType, typename InputByteStream>
+class AutoUTFInputStream {
+    RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
+public:
+    typedef CharType Ch;
+
+    //! Constructor.
+    /*!
+        \param is input stream to be wrapped.
+        \param type UTF encoding type if it is not detected from the stream.
+ */ + AutoUTFInputStream(InputByteStream& is, UTFType type = kUTF8) : is_(&is), type_(type), hasBOM_(false) { + RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE); + DetectType(); + static const TakeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Take) }; + takeFunc_ = f[type_]; + current_ = takeFunc_(*is_); + } + + UTFType GetType() const { return type_; } + bool HasBOM() const { return hasBOM_; } + + Ch Peek() const { return current_; } + Ch Take() { Ch c = current_; current_ = takeFunc_(*is_); return c; } + size_t Tell() const { return is_->Tell(); } + + // Not implemented + void Put(Ch) { RAPIDJSON_ASSERT(false); } + void Flush() { RAPIDJSON_ASSERT(false); } + Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } + size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } + +private: + AutoUTFInputStream(const AutoUTFInputStream&); + AutoUTFInputStream& operator=(const AutoUTFInputStream&); + + // Detect encoding type with BOM or RFC 4627 + void DetectType() { + // BOM (Byte Order Mark): + // 00 00 FE FF UTF-32BE + // FF FE 00 00 UTF-32LE + // FE FF UTF-16BE + // FF FE UTF-16LE + // EF BB BF UTF-8 + + const unsigned char* c = reinterpret_cast(is_->Peek4()); + if (!c) + return; + + unsigned bom = static_cast(c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24)); + hasBOM_ = false; + if (bom == 0xFFFE0000) { type_ = kUTF32BE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); } + else if (bom == 0x0000FEFF) { type_ = kUTF32LE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); } + else if ((bom & 0xFFFF) == 0xFFFE) { type_ = kUTF16BE; hasBOM_ = true; is_->Take(); is_->Take(); } + else if ((bom & 0xFFFF) == 0xFEFF) { type_ = kUTF16LE; hasBOM_ = true; is_->Take(); is_->Take(); } + else if ((bom & 0xFFFFFF) == 0xBFBBEF) { type_ = kUTF8; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); } + + // RFC 4627: Section 3 + // "Since the first two characters of a JSON text will always be ASCII + // characters [RFC0020], it is possible to determine whether an octet + // stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking + // at the pattern of nulls in the first four octets." + // 00 00 00 xx UTF-32BE + // 00 xx 00 xx UTF-16BE + // xx 00 00 00 UTF-32LE + // xx 00 xx 00 UTF-16LE + // xx xx xx xx UTF-8 + + if (!hasBOM_) { + int pattern = (c[0] ? 1 : 0) | (c[1] ? 2 : 0) | (c[2] ? 4 : 0) | (c[3] ? 8 : 0); + switch (pattern) { + case 0x08: type_ = kUTF32BE; break; + case 0x0A: type_ = kUTF16BE; break; + case 0x01: type_ = kUTF32LE; break; + case 0x05: type_ = kUTF16LE; break; + case 0x0F: type_ = kUTF8; break; + default: break; // Use type defined by user. + } + } + + // Runtime check whether the size of character type is sufficient. It only perform checks with assertion. + if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2); + if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4); + } + + typedef Ch (*TakeFunc)(InputByteStream& is); + InputByteStream* is_; + UTFType type_; + Ch current_; + TakeFunc takeFunc_; + bool hasBOM_; +}; + +//! Output stream wrapper with dynamically bound encoding and automatic encoding detection. +/*! + \tparam CharType Type of character for writing. + \tparam OutputByteStream type of output byte stream to be wrapped. +*/ +template +class AutoUTFOutputStream { + RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); +public: + typedef CharType Ch; + + //! Constructor. + /*! + \param os output stream to be wrapped. + \param type UTF encoding type. 
+ \param putBOM Whether to write BOM at the beginning of the stream. + */ + AutoUTFOutputStream(OutputByteStream& os, UTFType type, bool putBOM) : os_(&os), type_(type) { + RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE); + + // Runtime check whether the size of character type is sufficient. It only perform checks with assertion. + if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2); + if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4); + + static const PutFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Put) }; + putFunc_ = f[type_]; + + if (putBOM) + PutBOM(); + } + + UTFType GetType() const { return type_; } + + void Put(Ch c) { putFunc_(*os_, c); } + void Flush() { os_->Flush(); } + + // Not implemented + Ch Peek() const { RAPIDJSON_ASSERT(false); return 0;} + Ch Take() { RAPIDJSON_ASSERT(false); return 0;} + size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; } + Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } + size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } + +private: + AutoUTFOutputStream(const AutoUTFOutputStream&); + AutoUTFOutputStream& operator=(const AutoUTFOutputStream&); + + void PutBOM() { + typedef void (*PutBOMFunc)(OutputByteStream&); + static const PutBOMFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(PutBOM) }; + f[type_](*os_); + } + + typedef void (*PutFunc)(OutputByteStream&, Ch); + + OutputByteStream* os_; + UTFType type_; + PutFunc putFunc_; +}; + +#undef RAPIDJSON_ENCODINGS_FUNC + +RAPIDJSON_NAMESPACE_END + +#ifdef __clang__ +RAPIDJSON_DIAG_POP +#endif + +#ifdef __GNUC__ +RAPIDJSON_DIAG_POP +#endif + +#endif // RAPIDJSON_FILESTREAM_H_ diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/encodings.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/encodings.h new file mode 100644 index 0000000..0b24467 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/encodings.h @@ -0,0 +1,716 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef RAPIDJSON_ENCODINGS_H_ +#define RAPIDJSON_ENCODINGS_H_ + +#include "rapidjson.h" + +#if defined(_MSC_VER) && !defined(__clang__) +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(4244) // conversion from 'type1' to 'type2', possible loss of data +RAPIDJSON_DIAG_OFF(4702) // unreachable code +#elif defined(__GNUC__) +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(effc++) +RAPIDJSON_DIAG_OFF(overflow) +#endif + +RAPIDJSON_NAMESPACE_BEGIN + +/////////////////////////////////////////////////////////////////////////////// +// Encoding + +/*! \class rapidjson::Encoding + \brief Concept for encoding of Unicode characters. + +\code +concept Encoding { + typename Ch; //! Type of character. A "character" is actually a code unit in unicode's definition. + + enum { supportUnicode = 1 }; // or 0 if not supporting unicode + + //! \brief Encode a Unicode codepoint to an output stream. + //! 
\param os Output stream. + //! \param codepoint An unicode codepoint, ranging from 0x0 to 0x10FFFF inclusively. + template + static void Encode(OutputStream& os, unsigned codepoint); + + //! \brief Decode a Unicode codepoint from an input stream. + //! \param is Input stream. + //! \param codepoint Output of the unicode codepoint. + //! \return true if a valid codepoint can be decoded from the stream. + template + static bool Decode(InputStream& is, unsigned* codepoint); + + //! \brief Validate one Unicode codepoint from an encoded stream. + //! \param is Input stream to obtain codepoint. + //! \param os Output for copying one codepoint. + //! \return true if it is valid. + //! \note This function just validating and copying the codepoint without actually decode it. + template + static bool Validate(InputStream& is, OutputStream& os); + + // The following functions are deal with byte streams. + + //! Take a character from input byte stream, skip BOM if exist. + template + static CharType TakeBOM(InputByteStream& is); + + //! Take a character from input byte stream. + template + static Ch Take(InputByteStream& is); + + //! Put BOM to output byte stream. + template + static void PutBOM(OutputByteStream& os); + + //! Put a character to output byte stream. + template + static void Put(OutputByteStream& os, Ch c); +}; +\endcode +*/ + +/////////////////////////////////////////////////////////////////////////////// +// UTF8 + +//! UTF-8 encoding. +/*! http://en.wikipedia.org/wiki/UTF-8 + http://tools.ietf.org/html/rfc3629 + \tparam CharType Code unit for storing 8-bit UTF-8 data. Default is char. + \note implements Encoding concept +*/ +template +struct UTF8 { + typedef CharType Ch; + + enum { supportUnicode = 1 }; + + template + static void Encode(OutputStream& os, unsigned codepoint) { + if (codepoint <= 0x7F) + os.Put(static_cast(codepoint & 0xFF)); + else if (codepoint <= 0x7FF) { + os.Put(static_cast(0xC0 | ((codepoint >> 6) & 0xFF))); + os.Put(static_cast(0x80 | ((codepoint & 0x3F)))); + } + else if (codepoint <= 0xFFFF) { + os.Put(static_cast(0xE0 | ((codepoint >> 12) & 0xFF))); + os.Put(static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + os.Put(static_cast(0x80 | (codepoint & 0x3F))); + } + else { + RAPIDJSON_ASSERT(codepoint <= 0x10FFFF); + os.Put(static_cast(0xF0 | ((codepoint >> 18) & 0xFF))); + os.Put(static_cast(0x80 | ((codepoint >> 12) & 0x3F))); + os.Put(static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + os.Put(static_cast(0x80 | (codepoint & 0x3F))); + } + } + + template + static void EncodeUnsafe(OutputStream& os, unsigned codepoint) { + if (codepoint <= 0x7F) + PutUnsafe(os, static_cast(codepoint & 0xFF)); + else if (codepoint <= 0x7FF) { + PutUnsafe(os, static_cast(0xC0 | ((codepoint >> 6) & 0xFF))); + PutUnsafe(os, static_cast(0x80 | ((codepoint & 0x3F)))); + } + else if (codepoint <= 0xFFFF) { + PutUnsafe(os, static_cast(0xE0 | ((codepoint >> 12) & 0xFF))); + PutUnsafe(os, static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + PutUnsafe(os, static_cast(0x80 | (codepoint & 0x3F))); + } + else { + RAPIDJSON_ASSERT(codepoint <= 0x10FFFF); + PutUnsafe(os, static_cast(0xF0 | ((codepoint >> 18) & 0xFF))); + PutUnsafe(os, static_cast(0x80 | ((codepoint >> 12) & 0x3F))); + PutUnsafe(os, static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + PutUnsafe(os, static_cast(0x80 | (codepoint & 0x3F))); + } + } + + template + static bool Decode(InputStream& is, unsigned* codepoint) { +#define RAPIDJSON_COPY() c = is.Take(); *codepoint = (*codepoint << 6) | (static_cast(c) & 0x3Fu) +#define 
RAPIDJSON_TRANS(mask) result &= ((GetRange(static_cast(c)) & mask) != 0) +#define RAPIDJSON_TAIL() RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x70) + typename InputStream::Ch c = is.Take(); + if (!(c & 0x80)) { + *codepoint = static_cast(c); + return true; + } + + unsigned char type = GetRange(static_cast(c)); + if (type >= 32) { + *codepoint = 0; + } else { + *codepoint = (0xFFu >> type) & static_cast(c); + } + bool result = true; + switch (type) { + case 2: RAPIDJSON_TAIL(); return result; + case 3: RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); return result; + case 4: RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x50); RAPIDJSON_TAIL(); return result; + case 5: RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x10); RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); return result; + case 6: RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); return result; + case 10: RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x20); RAPIDJSON_TAIL(); return result; + case 11: RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x60); RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); return result; + default: return false; + } +#undef RAPIDJSON_COPY +#undef RAPIDJSON_TRANS +#undef RAPIDJSON_TAIL + } + + template + static bool Validate(InputStream& is, OutputStream& os) { +#define RAPIDJSON_COPY() os.Put(c = is.Take()) +#define RAPIDJSON_TRANS(mask) result &= ((GetRange(static_cast(c)) & mask) != 0) +#define RAPIDJSON_TAIL() RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x70) + Ch c; + RAPIDJSON_COPY(); + if (!(c & 0x80)) + return true; + + bool result = true; + switch (GetRange(static_cast(c))) { + case 2: RAPIDJSON_TAIL(); return result; + case 3: RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); return result; + case 4: RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x50); RAPIDJSON_TAIL(); return result; + case 5: RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x10); RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); return result; + case 6: RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); return result; + case 10: RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x20); RAPIDJSON_TAIL(); return result; + case 11: RAPIDJSON_COPY(); RAPIDJSON_TRANS(0x60); RAPIDJSON_TAIL(); RAPIDJSON_TAIL(); return result; + default: return false; + } +#undef RAPIDJSON_COPY +#undef RAPIDJSON_TRANS +#undef RAPIDJSON_TAIL + } + + static unsigned char GetRange(unsigned char c) { + // Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + // With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types. 
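+        // Sketch of the AND test enabled by that remapping (illustrative): a UTF-8
+        // continuation byte falls in class 0x10, 0x20 or 0x40, so RAPIDJSON_TAIL()
+        // can accept any of the three with a single mask:
+        /*! \code
+        unsigned char c = 0x9F;               // 10011111: a continuation byte
+        bool ok = (GetRange(c) & 0x70) != 0;  // true — 0x9F maps to class 0x40
+        \endcode */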
+ static const unsigned char type[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10, + 0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40, + 0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20, + 0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, + }; + return type[c]; + } + + template + static CharType TakeBOM(InputByteStream& is) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); + typename InputByteStream::Ch c = Take(is); + if (static_cast(c) != 0xEFu) return c; + c = is.Take(); + if (static_cast(c) != 0xBBu) return c; + c = is.Take(); + if (static_cast(c) != 0xBFu) return c; + c = is.Take(); + return c; + } + + template + static Ch Take(InputByteStream& is) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); + return static_cast(is.Take()); + } + + template + static void PutBOM(OutputByteStream& os) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); + os.Put(static_cast(0xEFu)); + os.Put(static_cast(0xBBu)); + os.Put(static_cast(0xBFu)); + } + + template + static void Put(OutputByteStream& os, Ch c) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); + os.Put(static_cast(c)); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// UTF16 + +//! UTF-16 encoding. +/*! http://en.wikipedia.org/wiki/UTF-16 + http://tools.ietf.org/html/rfc2781 + \tparam CharType Type for storing 16-bit UTF-16 data. Default is wchar_t. C++11 may use char16_t instead. + \note implements Encoding concept + + \note For in-memory access, no need to concern endianness. The code units and code points are represented by CPU's endianness. + For streaming, use UTF16LE and UTF16BE, which handle endianness. 
+*/ +template +struct UTF16 { + typedef CharType Ch; + RAPIDJSON_STATIC_ASSERT(sizeof(Ch) >= 2); + + enum { supportUnicode = 1 }; + + template + static void Encode(OutputStream& os, unsigned codepoint) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2); + if (codepoint <= 0xFFFF) { + RAPIDJSON_ASSERT(codepoint < 0xD800 || codepoint > 0xDFFF); // Code point itself cannot be surrogate pair + os.Put(static_cast(codepoint)); + } + else { + RAPIDJSON_ASSERT(codepoint <= 0x10FFFF); + unsigned v = codepoint - 0x10000; + os.Put(static_cast((v >> 10) | 0xD800)); + os.Put(static_cast((v & 0x3FF) | 0xDC00)); + } + } + + + template + static void EncodeUnsafe(OutputStream& os, unsigned codepoint) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2); + if (codepoint <= 0xFFFF) { + RAPIDJSON_ASSERT(codepoint < 0xD800 || codepoint > 0xDFFF); // Code point itself cannot be surrogate pair + PutUnsafe(os, static_cast(codepoint)); + } + else { + RAPIDJSON_ASSERT(codepoint <= 0x10FFFF); + unsigned v = codepoint - 0x10000; + PutUnsafe(os, static_cast((v >> 10) | 0xD800)); + PutUnsafe(os, static_cast((v & 0x3FF) | 0xDC00)); + } + } + + template + static bool Decode(InputStream& is, unsigned* codepoint) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 2); + typename InputStream::Ch c = is.Take(); + if (c < 0xD800 || c > 0xDFFF) { + *codepoint = static_cast(c); + return true; + } + else if (c <= 0xDBFF) { + *codepoint = (static_cast(c) & 0x3FF) << 10; + c = is.Take(); + *codepoint |= (static_cast(c) & 0x3FF); + *codepoint += 0x10000; + return c >= 0xDC00 && c <= 0xDFFF; + } + return false; + } + + template + static bool Validate(InputStream& is, OutputStream& os) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 2); + RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2); + typename InputStream::Ch c; + os.Put(static_cast(c = is.Take())); + if (c < 0xD800 || c > 0xDFFF) + return true; + else if (c <= 0xDBFF) { + os.Put(c = is.Take()); + return c >= 0xDC00 && c <= 0xDFFF; + } + return false; + } +}; + +//! UTF-16 little endian encoding. +template +struct UTF16LE : UTF16 { + template + static CharType TakeBOM(InputByteStream& is) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); + CharType c = Take(is); + return static_cast(c) == 0xFEFFu ? Take(is) : c; + } + + template + static CharType Take(InputByteStream& is) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); + unsigned c = static_cast(is.Take()); + c |= static_cast(static_cast(is.Take())) << 8; + return static_cast(c); + } + + template + static void PutBOM(OutputByteStream& os) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); + os.Put(static_cast(0xFFu)); + os.Put(static_cast(0xFEu)); + } + + template + static void Put(OutputByteStream& os, CharType c) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); + os.Put(static_cast(static_cast(c) & 0xFFu)); + os.Put(static_cast((static_cast(c) >> 8) & 0xFFu)); + } +}; + +//! UTF-16 big endian encoding. +template +struct UTF16BE : UTF16 { + template + static CharType TakeBOM(InputByteStream& is) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); + CharType c = Take(is); + return static_cast(c) == 0xFEFFu ? 
Take(is) : c; + } + + template + static CharType Take(InputByteStream& is) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); + unsigned c = static_cast(static_cast(is.Take())) << 8; + c |= static_cast(static_cast(is.Take())); + return static_cast(c); + } + + template + static void PutBOM(OutputByteStream& os) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); + os.Put(static_cast(0xFEu)); + os.Put(static_cast(0xFFu)); + } + + template + static void Put(OutputByteStream& os, CharType c) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); + os.Put(static_cast((static_cast(c) >> 8) & 0xFFu)); + os.Put(static_cast(static_cast(c) & 0xFFu)); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// UTF32 + +//! UTF-32 encoding. +/*! http://en.wikipedia.org/wiki/UTF-32 + \tparam CharType Type for storing 32-bit UTF-32 data. Default is unsigned. C++11 may use char32_t instead. + \note implements Encoding concept + + \note For in-memory access, no need to concern endianness. The code units and code points are represented by CPU's endianness. + For streaming, use UTF32LE and UTF32BE, which handle endianness. +*/ +template +struct UTF32 { + typedef CharType Ch; + RAPIDJSON_STATIC_ASSERT(sizeof(Ch) >= 4); + + enum { supportUnicode = 1 }; + + template + static void Encode(OutputStream& os, unsigned codepoint) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 4); + RAPIDJSON_ASSERT(codepoint <= 0x10FFFF); + os.Put(codepoint); + } + + template + static void EncodeUnsafe(OutputStream& os, unsigned codepoint) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 4); + RAPIDJSON_ASSERT(codepoint <= 0x10FFFF); + PutUnsafe(os, codepoint); + } + + template + static bool Decode(InputStream& is, unsigned* codepoint) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 4); + Ch c = is.Take(); + *codepoint = c; + return c <= 0x10FFFF; + } + + template + static bool Validate(InputStream& is, OutputStream& os) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 4); + Ch c; + os.Put(c = is.Take()); + return c <= 0x10FFFF; + } +}; + +//! UTF-32 little endian enocoding. +template +struct UTF32LE : UTF32 { + template + static CharType TakeBOM(InputByteStream& is) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); + CharType c = Take(is); + return static_cast(c) == 0x0000FEFFu ? Take(is) : c; + } + + template + static CharType Take(InputByteStream& is) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); + unsigned c = static_cast(is.Take()); + c |= static_cast(static_cast(is.Take())) << 8; + c |= static_cast(static_cast(is.Take())) << 16; + c |= static_cast(static_cast(is.Take())) << 24; + return static_cast(c); + } + + template + static void PutBOM(OutputByteStream& os) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); + os.Put(static_cast(0xFFu)); + os.Put(static_cast(0xFEu)); + os.Put(static_cast(0x00u)); + os.Put(static_cast(0x00u)); + } + + template + static void Put(OutputByteStream& os, CharType c) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); + os.Put(static_cast(c & 0xFFu)); + os.Put(static_cast((c >> 8) & 0xFFu)); + os.Put(static_cast((c >> 16) & 0xFFu)); + os.Put(static_cast((c >> 24) & 0xFFu)); + } +}; + +//! UTF-32 big endian encoding. 
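+// Byte-order sketch (illustrative): writing code unit 0x0001F600 (U+1F600) as UTF-32,
+// the little endian variant above emits the low byte first, while the big endian
+// variant below emits the high byte first:
+/*! \code
+    UTF32LE<>::Put(os, 0x0001F600u);  // emits 00 F6 01 00
+    UTF32BE<>::Put(os, 0x0001F600u);  // emits 00 01 F6 00
+\endcode */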
+template +struct UTF32BE : UTF32 { + template + static CharType TakeBOM(InputByteStream& is) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); + CharType c = Take(is); + return static_cast(c) == 0x0000FEFFu ? Take(is) : c; + } + + template + static CharType Take(InputByteStream& is) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); + unsigned c = static_cast(static_cast(is.Take())) << 24; + c |= static_cast(static_cast(is.Take())) << 16; + c |= static_cast(static_cast(is.Take())) << 8; + c |= static_cast(static_cast(is.Take())); + return static_cast(c); + } + + template + static void PutBOM(OutputByteStream& os) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); + os.Put(static_cast(0x00u)); + os.Put(static_cast(0x00u)); + os.Put(static_cast(0xFEu)); + os.Put(static_cast(0xFFu)); + } + + template + static void Put(OutputByteStream& os, CharType c) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); + os.Put(static_cast((c >> 24) & 0xFFu)); + os.Put(static_cast((c >> 16) & 0xFFu)); + os.Put(static_cast((c >> 8) & 0xFFu)); + os.Put(static_cast(c & 0xFFu)); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// ASCII + +//! ASCII encoding. +/*! http://en.wikipedia.org/wiki/ASCII + \tparam CharType Code unit for storing 7-bit ASCII data. Default is char. + \note implements Encoding concept +*/ +template +struct ASCII { + typedef CharType Ch; + + enum { supportUnicode = 0 }; + + template + static void Encode(OutputStream& os, unsigned codepoint) { + RAPIDJSON_ASSERT(codepoint <= 0x7F); + os.Put(static_cast(codepoint & 0xFF)); + } + + template + static void EncodeUnsafe(OutputStream& os, unsigned codepoint) { + RAPIDJSON_ASSERT(codepoint <= 0x7F); + PutUnsafe(os, static_cast(codepoint & 0xFF)); + } + + template + static bool Decode(InputStream& is, unsigned* codepoint) { + uint8_t c = static_cast(is.Take()); + *codepoint = c; + return c <= 0X7F; + } + + template + static bool Validate(InputStream& is, OutputStream& os) { + uint8_t c = static_cast(is.Take()); + os.Put(static_cast(c)); + return c <= 0x7F; + } + + template + static CharType TakeBOM(InputByteStream& is) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); + uint8_t c = static_cast(Take(is)); + return static_cast(c); + } + + template + static Ch Take(InputByteStream& is) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); + return static_cast(is.Take()); + } + + template + static void PutBOM(OutputByteStream& os) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); + (void)os; + } + + template + static void Put(OutputByteStream& os, Ch c) { + RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); + os.Put(static_cast(c)); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// AutoUTF + +//! Runtime-specified UTF encoding type of a stream. +enum UTFType { + kUTF8 = 0, //!< UTF-8. + kUTF16LE = 1, //!< UTF-16 little endian. + kUTF16BE = 2, //!< UTF-16 big endian. + kUTF32LE = 3, //!< UTF-32 little endian. + kUTF32BE = 4 //!< UTF-32 big endian. +}; + +//! Dynamically select encoding according to stream's runtime-specified UTF encoding type. +/*! \note This class can be used with AutoUTFInputtStream and AutoUTFOutputStream, which provides GetType(). 
+
+///////////////////////////////////////////////////////////////////////////////
+// AutoUTF
+
+//! Runtime-specified UTF encoding type of a stream.
+enum UTFType {
+    kUTF8 = 0,      //!< UTF-8.
+    kUTF16LE = 1,   //!< UTF-16 little endian.
+    kUTF16BE = 2,   //!< UTF-16 big endian.
+    kUTF32LE = 3,   //!< UTF-32 little endian.
+    kUTF32BE = 4    //!< UTF-32 big endian.
+};
+
+//! Dynamically select encoding according to stream's runtime-specified UTF encoding type.
+/*! \note This class can be used with AutoUTFInputStream and AutoUTFOutputStream, which provide GetType().
+*/
+template<typename CharType>
+struct AutoUTF {
+    typedef CharType Ch;
+
+    enum { supportUnicode = 1 };
+
+#define RAPIDJSON_ENCODINGS_FUNC(x) UTF8<Ch>::x, UTF16LE<Ch>::x, UTF16BE<Ch>::x, UTF32LE<Ch>::x, UTF32BE<Ch>::x
+
+    template<typename OutputStream>
+    static RAPIDJSON_FORCEINLINE void Encode(OutputStream& os, unsigned codepoint) {
+        typedef void (*EncodeFunc)(OutputStream&, unsigned);
+        static const EncodeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Encode) };
+        (*f[os.GetType()])(os, codepoint);
+    }
+
+    template<typename OutputStream>
+    static RAPIDJSON_FORCEINLINE void EncodeUnsafe(OutputStream& os, unsigned codepoint) {
+        typedef void (*EncodeFunc)(OutputStream&, unsigned);
+        static const EncodeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(EncodeUnsafe) };
+        (*f[os.GetType()])(os, codepoint);
+    }
+
+    template <typename InputStream>
+    static RAPIDJSON_FORCEINLINE bool Decode(InputStream& is, unsigned* codepoint) {
+        typedef bool (*DecodeFunc)(InputStream&, unsigned*);
+        static const DecodeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Decode) };
+        return (*f[is.GetType()])(is, codepoint);
+    }
+
+    template <typename InputStream, typename OutputStream>
+    static RAPIDJSON_FORCEINLINE bool Validate(InputStream& is, OutputStream& os) {
+        typedef bool (*ValidateFunc)(InputStream&, OutputStream&);
+        static const ValidateFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Validate) };
+        return (*f[is.GetType()])(is, os);
+    }
+
+#undef RAPIDJSON_ENCODINGS_FUNC
+};
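+
+// Editorial note (not part of upstream RapidJSON): every AutoUTF call is one
+// indirect call through a table indexed by the stream's UTFType, so the stream
+// must expose GetType(); the library's AutoUTFInputStream (encodedstream.h)
+// does. A decoding loop then works for any of the five encodings:
+//
+//     unsigned codepoint;
+//     while (eis.Peek() != '\0' && rapidjson::AutoUTF<unsigned>::Decode(eis, &codepoint))
+//         Handle(codepoint);  // eis: a stream whose encoding is chosen at runtime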
+
+///////////////////////////////////////////////////////////////////////////////
+// Transcoder
+
+//! Encoding conversion.
+template<typename SourceEncoding, typename TargetEncoding>
+struct Transcoder {
+    //! Take one Unicode codepoint from source encoding, convert it to target encoding and put it to the output stream.
+    template<typename InputStream, typename OutputStream>
+    static RAPIDJSON_FORCEINLINE bool Transcode(InputStream& is, OutputStream& os) {
+        unsigned codepoint;
+        if (!SourceEncoding::Decode(is, &codepoint))
+            return false;
+        TargetEncoding::Encode(os, codepoint);
+        return true;
+    }
+
+    template<typename InputStream, typename OutputStream>
+    static RAPIDJSON_FORCEINLINE bool TranscodeUnsafe(InputStream& is, OutputStream& os) {
+        unsigned codepoint;
+        if (!SourceEncoding::Decode(is, &codepoint))
+            return false;
+        TargetEncoding::EncodeUnsafe(os, codepoint);
+        return true;
+    }
+
+    //! Validate one Unicode codepoint from an encoded stream.
+    template<typename InputStream, typename OutputStream>
+    static RAPIDJSON_FORCEINLINE bool Validate(InputStream& is, OutputStream& os) {
+        return Transcode(is, os);   // Since the source and target encodings differ, must transcode.
+    }
+};
+
+// Forward declaration.
+template<typename Stream>
+inline void PutUnsafe(Stream& stream, typename Stream::Ch c);
+
+//! Specialization of Transcoder with same source and target encoding.
+template<typename Encoding>
+struct Transcoder<Encoding, Encoding> {
+    template<typename InputStream, typename OutputStream>
+    static RAPIDJSON_FORCEINLINE bool Transcode(InputStream& is, OutputStream& os) {
+        os.Put(is.Take());  // Just copy one code unit. This semantic is different from the primary template class.
+        return true;
+    }
+
+    template<typename InputStream, typename OutputStream>
+    static RAPIDJSON_FORCEINLINE bool TranscodeUnsafe(InputStream& is, OutputStream& os) {
+        PutUnsafe(os, is.Take());   // Just copy one code unit. This semantic is different from the primary template class.
+        return true;
+    }
+
+    template<typename InputStream, typename OutputStream>
+    static RAPIDJSON_FORCEINLINE bool Validate(InputStream& is, OutputStream& os) {
+        return Encoding::Validate(is, os);  // source/target encoding are the same
+    }
+};
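+
+// Editorial sketch (not part of upstream RapidJSON): transcoding a whole stream
+// is a loop over Transcode(); for example, converting UTF-8 input to UTF-16:
+//
+//     typedef rapidjson::Transcoder<rapidjson::UTF8<>, rapidjson::UTF16<> > T;
+//     while (is.Peek() != '\0')
+//         if (!T::Transcode(is, os))
+//             break;  // invalid sequence in the source encoding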
+
+RAPIDJSON_NAMESPACE_END
+
+#if defined(__GNUC__) || (defined(_MSC_VER) && !defined(__clang__))
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_ENCODINGS_H_
diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/error/en.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/error/en.h
new file mode 100644
index 0000000..2db838b
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/error/en.h
@@ -0,0 +1,74 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_ERROR_EN_H_
+#define RAPIDJSON_ERROR_EN_H_
+
+#include "error.h"
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(switch-enum)
+RAPIDJSON_DIAG_OFF(covered-switch-default)
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+//! Maps error code of parsing into error message.
+/*!
+    \ingroup RAPIDJSON_ERRORS
+    \param parseErrorCode Error code obtained in parsing.
+    \return the error message.
+    \note User can make a copy of this function for localization.
+        Using switch-case is safer for future modification of error codes.
+*/
+inline const RAPIDJSON_ERROR_CHARTYPE* GetParseError_En(ParseErrorCode parseErrorCode) {
+    switch (parseErrorCode) {
+        case kParseErrorNone: return RAPIDJSON_ERROR_STRING("No error.");
+
+        case kParseErrorDocumentEmpty: return RAPIDJSON_ERROR_STRING("The document is empty.");
+        case kParseErrorDocumentRootNotSingular: return RAPIDJSON_ERROR_STRING("The document root must not be followed by other values.");
+
+        case kParseErrorValueInvalid: return RAPIDJSON_ERROR_STRING("Invalid value.");
+
+        case kParseErrorObjectMissName: return RAPIDJSON_ERROR_STRING("Missing a name for object member.");
+        case kParseErrorObjectMissColon: return RAPIDJSON_ERROR_STRING("Missing a colon after a name of object member.");
+        case kParseErrorObjectMissCommaOrCurlyBracket: return RAPIDJSON_ERROR_STRING("Missing a comma or '}' after an object member.");
+
+        case kParseErrorArrayMissCommaOrSquareBracket: return RAPIDJSON_ERROR_STRING("Missing a comma or ']' after an array element.");
+
+        case kParseErrorStringUnicodeEscapeInvalidHex: return RAPIDJSON_ERROR_STRING("Incorrect hex digit after \\u escape in string.");
+        case kParseErrorStringUnicodeSurrogateInvalid: return RAPIDJSON_ERROR_STRING("The surrogate pair in string is invalid.");
+        case kParseErrorStringEscapeInvalid: return RAPIDJSON_ERROR_STRING("Invalid escape character in string.");
+        case kParseErrorStringMissQuotationMark: return RAPIDJSON_ERROR_STRING("Missing a closing quotation mark in string.");
+        case kParseErrorStringInvalidEncoding: return RAPIDJSON_ERROR_STRING("Invalid encoding in string.");
+
+        case kParseErrorNumberTooBig: return RAPIDJSON_ERROR_STRING("Number too big to be stored in double.");
+        case kParseErrorNumberMissFraction: return RAPIDJSON_ERROR_STRING("Missing fraction part in number.");
+        case kParseErrorNumberMissExponent: return RAPIDJSON_ERROR_STRING("Missing exponent in number.");
+
+        case kParseErrorTermination: return RAPIDJSON_ERROR_STRING("Parsing terminated due to Handler error.");
+        case kParseErrorUnspecificSyntaxError: return RAPIDJSON_ERROR_STRING("Unspecific syntax error.");
+
+        default: return RAPIDJSON_ERROR_STRING("Unknown error.");
+    }
+}
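+
+// Editorial sketch (not part of upstream RapidJSON): typical use together with
+// ParseResult from error/error.h:
+//
+//     rapidjson::Document doc;
+//     rapidjson::ParseResult ok = doc.Parse(json);
+//     if (!ok)
+//         fprintf(stderr, "%s (offset %u)\n",
+//                 rapidjson::GetParseError_En(ok.Code()),
+//                 static_cast<unsigned>(ok.Offset()));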
+
+RAPIDJSON_NAMESPACE_END
+
+#ifdef __clang__
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_ERROR_EN_H_
diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/error/error.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/error/error.h
new file mode 100644
index 0000000..9311d2f
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/error/error.h
@@ -0,0 +1,161 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_ERROR_ERROR_H_
+#define RAPIDJSON_ERROR_ERROR_H_
+
+#include "../rapidjson.h"
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(padded)
+#endif
+
+/*! \file error.h */
+
+/*! \defgroup RAPIDJSON_ERRORS RapidJSON error handling */
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_ERROR_CHARTYPE
+
+//! Character type of error messages.
+/*! \ingroup RAPIDJSON_ERRORS
+    The default character type is \c char.
+    On Windows, user can define this macro as \c TCHAR for supporting both
+    unicode/non-unicode settings.
+*/
+#ifndef RAPIDJSON_ERROR_CHARTYPE
+#define RAPIDJSON_ERROR_CHARTYPE char
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_ERROR_STRING
+
+//! Macro for converting string literal to \ref RAPIDJSON_ERROR_CHARTYPE[].
+/*! \ingroup RAPIDJSON_ERRORS
+    By default this conversion macro does nothing.
+    On Windows, user can define this macro as \c _T(x) for supporting both
+    unicode/non-unicode settings.
+*/
+#ifndef RAPIDJSON_ERROR_STRING
+#define RAPIDJSON_ERROR_STRING(x) x
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////////////
+// ParseErrorCode
+
+//! Error code of parsing.
+/*! \ingroup RAPIDJSON_ERRORS
+    \see GenericReader::Parse, GenericReader::GetParseErrorCode
+*/
+enum ParseErrorCode {
+    kParseErrorNone = 0,                        //!< No error.
+
+    kParseErrorDocumentEmpty,                   //!< The document is empty.
+    kParseErrorDocumentRootNotSingular,         //!< The document root must not be followed by other values.
+
+    kParseErrorValueInvalid,                    //!< Invalid value.
+
+    kParseErrorObjectMissName,                  //!< Missing a name for object member.
+    kParseErrorObjectMissColon,                 //!< Missing a colon after a name of object member.
+    kParseErrorObjectMissCommaOrCurlyBracket,   //!< Missing a comma or '}' after an object member.
+
+    kParseErrorArrayMissCommaOrSquareBracket,   //!< Missing a comma or ']' after an array element.
+
+    kParseErrorStringUnicodeEscapeInvalidHex,   //!< Incorrect hex digit after \\u escape in string.
+    kParseErrorStringUnicodeSurrogateInvalid,   //!< The surrogate pair in string is invalid.
+    kParseErrorStringEscapeInvalid,             //!< Invalid escape character in string.
+    kParseErrorStringMissQuotationMark,         //!< Missing a closing quotation mark in string.
+    kParseErrorStringInvalidEncoding,           //!< Invalid encoding in string.
+
+    kParseErrorNumberTooBig,                    //!< Number too big to be stored in double.
+    kParseErrorNumberMissFraction,              //!< Missing fraction part in number.
+    kParseErrorNumberMissExponent,              //!< Missing exponent in number.
+
+    kParseErrorTermination,                     //!< Parsing was terminated.
+    kParseErrorUnspecificSyntaxError            //!< Unspecific syntax error.
+};
+
+//! Result of parsing (wraps ParseErrorCode)
+/*!
+    \ingroup RAPIDJSON_ERRORS
+    \code
+        Document doc;
+        ParseResult ok = doc.Parse("[42]");
+        if (!ok) {
+            fprintf(stderr, "JSON parse error: %s (%u)",
+                    GetParseError_En(ok.Code()), ok.Offset());
+            exit(EXIT_FAILURE);
+        }
+    \endcode
+    \see GenericReader::Parse, GenericDocument::Parse
+*/
+struct ParseResult {
+    //!! Unspecified boolean type
+    typedef bool (ParseResult::*BooleanType)() const;
+public:
+    //! Default constructor, no error.
+    ParseResult() : code_(kParseErrorNone), offset_(0) {}
+    //! Constructor to set an error.
+    ParseResult(ParseErrorCode code, size_t offset) : code_(code), offset_(offset) {}
+
+    //! Get the error code.
+    ParseErrorCode Code() const { return code_; }
+    //! Get the error offset, if \ref IsError(), 0 otherwise.
+    size_t Offset() const { return offset_; }
+
+    //! Explicit conversion to \c bool, returns \c true, iff !\ref IsError().
+    operator BooleanType() const { return !IsError() ? &ParseResult::IsError : NULL; }
+    //! Whether the result is an error.
+    bool IsError() const { return code_ != kParseErrorNone; }
+
+    bool operator==(const ParseResult& that) const { return code_ == that.code_; }
+    bool operator==(ParseErrorCode code) const { return code_ == code; }
+    friend bool operator==(ParseErrorCode code, const ParseResult & err) { return code == err.code_; }
+
+    bool operator!=(const ParseResult& that) const { return !(*this == that); }
+    bool operator!=(ParseErrorCode code) const { return !(*this == code); }
+    friend bool operator!=(ParseErrorCode code, const ParseResult & err) { return err != code; }
+
+    //! Reset error code.
+    void Clear() { Set(kParseErrorNone); }
+    //! Update error code and offset.
+    void Set(ParseErrorCode code, size_t offset = 0) { code_ = code; offset_ = offset; }
+
+private:
+    ParseErrorCode code_;
+    size_t offset_;
+};
+
+//! Function pointer type of GetParseError().
+/*! \ingroup RAPIDJSON_ERRORS
+
+    This is the prototype for \c GetParseError_X(), where \c X is a locale.
+    User can dynamically change locale in runtime, e.g.:
+\code
+    GetParseErrorFunc GetParseError = GetParseError_En; // or whatever
+    const RAPIDJSON_ERROR_CHARTYPE* s = GetParseError(document.GetParseErrorCode());
+\endcode
+*/
+typedef const RAPIDJSON_ERROR_CHARTYPE* (*GetParseErrorFunc)(ParseErrorCode);
+
+RAPIDJSON_NAMESPACE_END
+
+#ifdef __clang__
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_ERROR_ERROR_H_
diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/filereadstream.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/filereadstream.h
new file mode 100644
index 0000000..f1bfb7d
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/filereadstream.h
@@ -0,0 +1,99 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_FILEREADSTREAM_H_
+#define RAPIDJSON_FILEREADSTREAM_H_
+
+#include "stream.h"
+#include <cstdio>
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(padded)
+RAPIDJSON_DIAG_OFF(unreachable-code)
+RAPIDJSON_DIAG_OFF(missing-noreturn)
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+//! File byte stream for input using fread().
+/*!
+    \note implements Stream concept
+*/
+class FileReadStream {
+public:
+    typedef char Ch; //!< Character type (byte).
+
+    //! Constructor.
+    /*!
+        \param fp File pointer opened for read.
+        \param buffer user-supplied buffer.
+        \param bufferSize size of buffer in bytes. Must be >= 4 bytes.
+    */
+    FileReadStream(std::FILE* fp, char* buffer, size_t bufferSize) : fp_(fp), buffer_(buffer), bufferSize_(bufferSize), bufferLast_(0), current_(buffer_), readCount_(0), count_(0), eof_(false) {
+        RAPIDJSON_ASSERT(fp_ != 0);
+        RAPIDJSON_ASSERT(bufferSize >= 4);
+        Read();
+    }
+
+    Ch Peek() const { return *current_; }
+    Ch Take() { Ch c = *current_; Read(); return c; }
+    size_t Tell() const { return count_ + static_cast<size_t>(current_ - buffer_); }
+
+    // Not implemented
+    void Put(Ch) { RAPIDJSON_ASSERT(false); }
+    void Flush() { RAPIDJSON_ASSERT(false); }
+    Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
+    size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
+
+    // For encoding detection only.
+    const Ch* Peek4() const {
+        return (current_ + 4 <= bufferLast_) ? current_ : 0;
+    }
+
+private:
+    void Read() {
+        if (current_ < bufferLast_)
+            ++current_;
+        else if (!eof_) {
+            count_ += readCount_;
+            readCount_ = std::fread(buffer_, 1, bufferSize_, fp_);
+            bufferLast_ = buffer_ + readCount_ - 1;
+            current_ = buffer_;
+
+            if (readCount_ < bufferSize_) {
+                buffer_[readCount_] = '\0';
+                ++bufferLast_;
+                eof_ = true;
+            }
+        }
+    }
+
+    std::FILE* fp_;
+    Ch *buffer_;
+    size_t bufferSize_;
+    Ch *bufferLast_;
+    Ch *current_;
+    size_t readCount_;
+    size_t count_;  //!< Number of characters read
+    bool eof_;
+};
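+
+// Editorial sketch (not part of upstream RapidJSON): the stream does not own
+// the buffer; the caller supplies it and keeps it alive for the stream's
+// lifetime:
+//
+//     std::FILE* fp = std::fopen("input.json", "rb");  // "rb" avoids CRLF translation
+//     char buffer[65536];
+//     rapidjson::FileReadStream is(fp, buffer, sizeof(buffer));
+//     rapidjson::Document doc;  // declared in document.h
+//     doc.ParseStream(is);
+//     std::fclose(fp);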
+
+RAPIDJSON_NAMESPACE_END
+
+#ifdef __clang__
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_FILEREADSTREAM_H_
diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/filewritestream.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/filewritestream.h
new file mode 100644
index 0000000..8b48fee
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/filewritestream.h
@@ -0,0 +1,104 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_FILEWRITESTREAM_H_
+#define RAPIDJSON_FILEWRITESTREAM_H_
+
+#include "stream.h"
+#include <cstdio>
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(unreachable-code)
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+//! Wrapper of C file stream for output using fwrite().
+/*!
+    \note implements Stream concept
+*/
+class FileWriteStream {
+public:
+    typedef char Ch; //!< Character type. Only supports char.
+
+    FileWriteStream(std::FILE* fp, char* buffer, size_t bufferSize) : fp_(fp), buffer_(buffer), bufferEnd_(buffer + bufferSize), current_(buffer_) {
+        RAPIDJSON_ASSERT(fp_ != 0);
+    }
+
+    void Put(char c) {
+        if (current_ >= bufferEnd_)
+            Flush();
+
+        *current_++ = c;
+    }
+
+    void PutN(char c, size_t n) {
+        size_t avail = static_cast<size_t>(bufferEnd_ - current_);
+        while (n > avail) {
+            std::memset(current_, c, avail);
+            current_ += avail;
+            Flush();
+            n -= avail;
+            avail = static_cast<size_t>(bufferEnd_ - current_);
+        }
+
+        if (n > 0) {
+            std::memset(current_, c, n);
+            current_ += n;
+        }
+    }
+
+    void Flush() {
+        if (current_ != buffer_) {
+            size_t result = std::fwrite(buffer_, 1, static_cast<size_t>(current_ - buffer_), fp_);
+            if (result < static_cast<size_t>(current_ - buffer_)) {
+                // failure deliberately ignored at this time
+                // added to avoid warn_unused_result build errors
+            }
+            current_ = buffer_;
+        }
+    }
+
+    // Not implemented
+    char Peek() const { RAPIDJSON_ASSERT(false); return 0; }
+    char Take() { RAPIDJSON_ASSERT(false); return 0; }
+    size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; }
+    char* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
+    size_t PutEnd(char*) { RAPIDJSON_ASSERT(false); return 0; }
+
+private:
+    // Prohibit copy constructor & assignment operator.
+    FileWriteStream(const FileWriteStream&);
+    FileWriteStream& operator=(const FileWriteStream&);
+
+    std::FILE* fp_;
+    char *buffer_;
+    char *bufferEnd_;
+    char *current_;
+};
+
+//! Implement specialized version of PutN() with memset() for better performance.
+template<>
+inline void PutN(FileWriteStream& stream, char c, size_t n) {
+    stream.PutN(c, n);
+}
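+
+// Editorial sketch (not part of upstream RapidJSON): the mirror image of
+// FileReadStream, usually driven through a Writer:
+//
+//     std::FILE* fp = std::fopen("output.json", "wb");
+//     char buffer[65536];
+//     rapidjson::FileWriteStream os(fp, buffer, sizeof(buffer));
+//     rapidjson::Writer<rapidjson::FileWriteStream> writer(os);  // writer.h
+//     doc.Accept(writer);
+//     os.Flush();  // harmless even if the writer already flushed at the root value
+//     std::fclose(fp);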
+
+RAPIDJSON_NAMESPACE_END
+
+#ifdef __clang__
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_FILEWRITESTREAM_H_
diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/fwd.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/fwd.h
new file mode 100644
index 0000000..e8104e8
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/fwd.h
@@ -0,0 +1,151 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_FWD_H_
+#define RAPIDJSON_FWD_H_
+
+#include "rapidjson.h"
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+// encodings.h
+
+template<typename CharType> struct UTF8;
+template<typename CharType> struct UTF16;
+template<typename CharType> struct UTF16BE;
+template<typename CharType> struct UTF16LE;
+template<typename CharType> struct UTF32;
+template<typename CharType> struct UTF32BE;
+template<typename CharType> struct UTF32LE;
+template<typename CharType> struct ASCII;
+template<typename CharType> struct AutoUTF;
+
+template<typename SourceEncoding, typename TargetEncoding>
+struct Transcoder;
+
+// allocators.h
+
+class CrtAllocator;
+
+template <typename BaseAllocator>
+class MemoryPoolAllocator;
+
+// stream.h
+
+template <typename Encoding>
+struct GenericStringStream;
+
+typedef GenericStringStream<UTF8<char> > StringStream;
+
+template <typename Encoding>
+struct GenericInsituStringStream;
+
+typedef GenericInsituStringStream<UTF8<char> > InsituStringStream;
+
+// stringbuffer.h
+
+template <typename Encoding, typename Allocator>
+class GenericStringBuffer;
+
+typedef GenericStringBuffer<UTF8<char>, CrtAllocator> StringBuffer;
+
+// filereadstream.h
+
+class FileReadStream;
+
+// filewritestream.h
+
+class FileWriteStream;
+
+// memorybuffer.h
+
+template <typename Allocator>
+struct GenericMemoryBuffer;
+
+typedef GenericMemoryBuffer<CrtAllocator> MemoryBuffer;
+
+// memorystream.h
+
+struct MemoryStream;
+
+// reader.h
+
+template<typename Encoding, typename Derived>
+struct BaseReaderHandler;
+
+template <typename SourceEncoding, typename TargetEncoding, typename StackAllocator>
+class GenericReader;
+
+typedef GenericReader<UTF8<char>, UTF8<char>, CrtAllocator> Reader;
+
+// writer.h
+
+template<typename OutputStream, typename SourceEncoding, typename TargetEncoding, typename StackAllocator, unsigned writeFlags>
+class Writer;
+
+// prettywriter.h
+
+template<typename OutputStream, typename SourceEncoding, typename TargetEncoding, typename StackAllocator, unsigned writeFlags>
+class PrettyWriter;
+
+// document.h
+
+template <typename Encoding, typename Allocator>
+struct GenericMember;
+
+template <bool Const, typename Encoding, typename Allocator>
+class GenericMemberIterator;
+
+template<typename CharType>
+struct GenericStringRef;
+
+template <typename Encoding, typename Allocator>
+class GenericValue;
+
+typedef GenericValue<UTF8<char>, MemoryPoolAllocator<CrtAllocator> > Value;
+
+template <typename Encoding, typename Allocator, typename StackAllocator>
+class GenericDocument;
+
+typedef GenericDocument<UTF8<char>, MemoryPoolAllocator<CrtAllocator>, CrtAllocator> Document;
+
+// pointer.h
+
+template <typename ValueType, typename Allocator>
+class GenericPointer;
+
+typedef GenericPointer<Value, CrtAllocator> Pointer;
+
+// schema.h
+
+template <typename SchemaDocumentType>
+class IGenericRemoteSchemaDocumentProvider;
+
+template <typename ValueT, typename Allocator>
+class GenericSchemaDocument;
+
+typedef GenericSchemaDocument<Value, CrtAllocator> SchemaDocument;
+typedef IGenericRemoteSchemaDocumentProvider<SchemaDocument> IRemoteSchemaDocumentProvider;
+
+template <
+    typename SchemaDocumentType,
+    typename OutputHandler,
+    typename StateAllocator>
+class GenericSchemaValidator;
+
+typedef GenericSchemaValidator<SchemaDocument, BaseReaderHandler<UTF8<char>, void>, CrtAllocator> SchemaValidator;
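+
+// Editorial note (not part of upstream RapidJSON): this header lets user code
+// name RapidJSON types in its own headers without pulling in the full
+// definitions, e.g.:
+//
+//     #include "rapidjson/fwd.h"
+//     void Serialize(rapidjson::Document& doc);  // declaration compiles here;
+//     // the .cpp that defines Serialize() includes rapidjson/document.h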
+
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_RAPIDJSONFWD_H_
diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/biginteger.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/biginteger.h
new file mode 100644
index 0000000..a31c8a8
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/biginteger.h
@@ -0,0 +1,290 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_BIGINTEGER_H_
+#define RAPIDJSON_BIGINTEGER_H_
+
+#include "../rapidjson.h"
+
+#if defined(_MSC_VER) && !__INTEL_COMPILER && defined(_M_AMD64)
+#include <intrin.h> // for _umul128
+#pragma intrinsic(_umul128)
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+namespace internal {
+
+class BigInteger {
+public:
+    typedef uint64_t Type;
+
+    BigInteger(const BigInteger& rhs) : count_(rhs.count_) {
+        std::memcpy(digits_, rhs.digits_, count_ * sizeof(Type));
+    }
+
+    explicit BigInteger(uint64_t u) : count_(1) {
+        digits_[0] = u;
+    }
+
+    BigInteger(const char* decimals, size_t length) : count_(1) {
+        RAPIDJSON_ASSERT(length > 0);
+        digits_[0] = 0;
+        size_t i = 0;
+        const size_t kMaxDigitPerIteration = 19;  // 2^64 = 18446744073709551616 > 10^19
+        while (length >= kMaxDigitPerIteration) {
+            AppendDecimal64(decimals + i, decimals + i + kMaxDigitPerIteration);
+            length -= kMaxDigitPerIteration;
+            i += kMaxDigitPerIteration;
+        }
+
+        if (length > 0)
+            AppendDecimal64(decimals + i, decimals + i + length);
+    }
+
+    BigInteger& operator=(const BigInteger &rhs)
+    {
+        if (this != &rhs) {
+            count_ = rhs.count_;
+            std::memcpy(digits_, rhs.digits_, count_ * sizeof(Type));
+        }
+        return *this;
+    }
+
+    BigInteger& operator=(uint64_t u) {
+        digits_[0] = u;
+        count_ = 1;
+        return *this;
+    }
+
+    BigInteger& operator+=(uint64_t u) {
+        Type backup = digits_[0];
+        digits_[0] += u;
+        for (size_t i = 0; i < count_ - 1; i++) {
+            if (digits_[i] >= backup)
+                return *this; // no carry
+            backup = digits_[i + 1];
+            digits_[i + 1] += 1;
+        }
+
+        // Last carry
+        if (digits_[count_ - 1] < backup)
+            PushBack(1);
+
+        return *this;
+    }
+
+    BigInteger& operator*=(uint64_t u) {
+        if (u == 0) return *this = 0;
+        if (u == 1) return *this;
+        if (*this == 1) return *this = u;
+
+        uint64_t k = 0;
+        for (size_t i = 0; i < count_; i++) {
+            uint64_t hi;
+            digits_[i] = MulAdd64(digits_[i], u, k, &hi);
+            k = hi;
+        }
+
+        if (k > 0)
+            PushBack(k);
+
+        return *this;
+    }
+
+    BigInteger& operator*=(uint32_t u) {
+        if (u == 0) return *this = 0;
+        if (u == 1) return *this;
+        if (*this == 1) return *this = u;
+
+        uint64_t k = 0;
+        for (size_t i = 0; i < count_; i++) {
+            const uint64_t c = digits_[i] >> 32;
+            const uint64_t d = digits_[i] & 0xFFFFFFFF;
+            const uint64_t uc = u * c;
+            const uint64_t ud = u * d;
+            const uint64_t p0 = ud + k;
+            const uint64_t p1 = uc + (p0 >> 32);
+            digits_[i] = (p0 & 0xFFFFFFFF) | (p1 << 32);
+            k = p1 >> 32;
+        }
+
+        if (k > 0)
+            PushBack(k);
+
+        return *this;
+    }
+
+    BigInteger& operator<<=(size_t shift) {
+        if (IsZero() || shift == 0) return *this;
+
+        size_t offset = shift / kTypeBit;
+        size_t interShift = shift % kTypeBit;
+        RAPIDJSON_ASSERT(count_ + offset <= kCapacity);
+
+        if (interShift == 0) {
+            std::memmove(digits_ + offset, digits_, count_ * sizeof(Type));
+            count_ += offset;
+        }
+        else {
+            digits_[count_] = 0;
+            for (size_t i = count_; i > 0; i--)
+                digits_[i + offset] = (digits_[i] << interShift) | (digits_[i - 1] >> (kTypeBit - interShift));
+            digits_[offset] = digits_[0] << interShift;
+            count_ += offset;
+            if (digits_[count_])
+                count_++;
+        }
+
+        std::memset(digits_, 0, offset * sizeof(Type));
+
+        return *this;
+    }
+
+    bool operator==(const BigInteger& rhs) const {
+        return count_ == rhs.count_ && std::memcmp(digits_, rhs.digits_, count_ * sizeof(Type)) == 0;
+    }
+
+    bool operator==(const Type rhs) const {
+        return count_ == 1 && digits_[0] == rhs;
+    }
+
+    BigInteger& MultiplyPow5(unsigned exp) {
+        static const uint32_t kPow5[12] = {
+            5,
+            5 * 5,
+            5 * 5 * 5,
+            5 * 5 * 5 * 5,
+            5 * 5 * 5 * 5 * 5,
+            5 * 5 * 5 * 5 * 5 * 5,
+            5 * 5 * 5 * 5 * 5 * 5 * 5,
+            5 * 5 * 5 * 5 * 5 * 5 * 5 * 5,
+            5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5,
+            5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5,
+            5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5,
+            5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5 * 5
+        };
+        if (exp == 0) return *this;
+        for (; exp >= 27; exp -= 27) *this *= RAPIDJSON_UINT64_C2(0X6765C793, 0XFA10079D); // 5^27
+        for (; exp >= 13; exp -= 13) *this *= static_cast<uint32_t>(1220703125u); // 5^13
+        if (exp > 0) *this *= kPow5[exp - 1];
+        return *this;
+    }
+
+    // Compute absolute difference of this and rhs.
+    // Assume this != rhs
+    bool Difference(const BigInteger& rhs, BigInteger* out) const {
+        int cmp = Compare(rhs);
+        RAPIDJSON_ASSERT(cmp != 0);
+        const BigInteger *a, *b;  // Makes a > b
+        bool ret;
+        if (cmp < 0) { a = &rhs; b = this; ret = true; }
+        else         { a = this; b = &rhs; ret = false; }
+
+        Type borrow = 0;
+        for (size_t i = 0; i < a->count_; i++) {
+            Type d = a->digits_[i] - borrow;
+            if (i < b->count_)
+                d -= b->digits_[i];
+            borrow = (d > a->digits_[i]) ? 1 : 0;
+            out->digits_[i] = d;
+            if (d != 0)
+                out->count_ = i + 1;
+        }
+
+        return ret;
+    }
+
+    int Compare(const BigInteger& rhs) const {
+        if (count_ != rhs.count_)
+            return count_ < rhs.count_ ? -1 : 1;
+
+        for (size_t i = count_; i-- > 0;)
+            if (digits_[i] != rhs.digits_[i])
+                return digits_[i] < rhs.digits_[i] ? -1 : 1;
+
+        return 0;
+    }
+
+    size_t GetCount() const { return count_; }
+    Type GetDigit(size_t index) const { RAPIDJSON_ASSERT(index < count_); return digits_[index]; }
+    bool IsZero() const { return count_ == 1 && digits_[0] == 0; }
+
+private:
+    void AppendDecimal64(const char* begin, const char* end) {
+        uint64_t u = ParseUint64(begin, end);
+        if (IsZero())
+            *this = u;
+        else {
+            unsigned exp = static_cast<unsigned>(end - begin);
+            (MultiplyPow5(exp) <<= exp) += u;   // *this = *this * 10^exp + u
+        }
+    }
+
+    void PushBack(Type digit) {
+        RAPIDJSON_ASSERT(count_ < kCapacity);
+        digits_[count_++] = digit;
+    }
+
+    static uint64_t ParseUint64(const char* begin, const char* end) {
+        uint64_t r = 0;
+        for (const char* p = begin; p != end; ++p) {
+            RAPIDJSON_ASSERT(*p >= '0' && *p <= '9');
+            r = r * 10u + static_cast<unsigned>(*p - '0');
+        }
+        return r;
+    }
+
+    // Assume a * b + k < 2^128
+    static uint64_t MulAdd64(uint64_t a, uint64_t b, uint64_t k, uint64_t* outHigh) {
+#if defined(_MSC_VER) && defined(_M_AMD64)
+        uint64_t low = _umul128(a, b, outHigh) + k;
+        if (low < k)
+            (*outHigh)++;
+        return low;
+#elif (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) && defined(__x86_64__)
+        __extension__ typedef unsigned __int128 uint128;
+        uint128 p = static_cast<uint128>(a) * static_cast<uint128>(b);
+        p += k;
+        *outHigh = static_cast<uint64_t>(p >> 64);
+        return static_cast<uint64_t>(p);
+#else
+        const uint64_t a0 = a & 0xFFFFFFFF, a1 = a >> 32, b0 = b & 0xFFFFFFFF, b1 = b >> 32;
+        uint64_t x0 = a0 * b0, x1 = a0 * b1, x2 = a1 * b0, x3 = a1 * b1;
+        x1 += (x0 >> 32); // can't give carry
+        x1 += x2;
+        if (x1 < x2)
+            x3 += (static_cast<uint64_t>(1) << 32);
+        uint64_t lo = (x1 << 32) + (x0 & 0xFFFFFFFF);
+        uint64_t hi = x3 + (x1 >> 32);
+
+        lo += k;
+        if (lo < k)
+            hi++;
+        *outHigh = hi;
+        return lo;
+#endif
+    }
+
+    static const size_t kBitCount = 3328;  // 64bit * 54 > 10^1000
+    static const size_t kCapacity = kBitCount / sizeof(Type);
+    static const size_t kTypeBit = sizeof(Type) * 8;
+
+    Type digits_[kCapacity];
+    size_t count_;
+};
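+
+// Editorial sketch (not part of upstream RapidJSON): BigInteger gives the parser
+// exact decimal arithmetic when a significand is too long for uint64_t:
+//
+//     BigInteger a("123456789012345678901234567890", 30);  // from 30 decimal digits
+//     a *= 10u;   // exact; no rounding at any step
+//     a += 5u;
+//     int cmp = a.Compare(BigInteger(0));  // > 0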
+
+} // namespace internal
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_BIGINTEGER_H_
diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/diyfp.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/diyfp.h
new file mode 100644
index 0000000..b6c2cf5
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/diyfp.h
@@ -0,0 +1,271 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+// This is a C++ header-only implementation of Grisu2 algorithm from the publication:
+// Loitsch, Florian. "Printing floating-point numbers quickly and accurately with
+// integers." ACM Sigplan Notices 45.6 (2010): 233-243.
+
+#ifndef RAPIDJSON_DIYFP_H_
+#define RAPIDJSON_DIYFP_H_
+
+#include "../rapidjson.h"
+#include <limits>
+
+#if defined(_MSC_VER) && defined(_M_AMD64) && !defined(__INTEL_COMPILER)
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse64)
+#pragma intrinsic(_umul128)
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+namespace internal {
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(effc++)
+#endif
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(padded)
+#endif
+
+struct DiyFp {
+    DiyFp() : f(), e() {}
+
+    DiyFp(uint64_t fp, int exp) : f(fp), e(exp) {}
+
+    explicit DiyFp(double d) {
+        union {
+            double d;
+            uint64_t u64;
+        } u = { d };
+
+        int biased_e = static_cast<int>((u.u64 & kDpExponentMask) >> kDpSignificandSize);
+        uint64_t significand = (u.u64 & kDpSignificandMask);
+        if (biased_e != 0) {
+            f = significand + kDpHiddenBit;
+            e = biased_e - kDpExponentBias;
+        }
+        else {
+            f = significand;
+            e = kDpMinExponent + 1;
+        }
+    }
+
+    DiyFp operator-(const DiyFp& rhs) const {
+        return DiyFp(f - rhs.f, e);
+    }
+
+    DiyFp operator*(const DiyFp& rhs) const {
+#if defined(_MSC_VER) && defined(_M_AMD64)
+        uint64_t h;
+        uint64_t l = _umul128(f, rhs.f, &h);
+        if (l & (uint64_t(1) << 63)) // rounding
+            h++;
+        return DiyFp(h, e + rhs.e + 64);
+#elif (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) && defined(__x86_64__)
+        __extension__ typedef unsigned __int128 uint128;
+        uint128 p = static_cast<uint128>(f) * static_cast<uint128>(rhs.f);
+        uint64_t h = static_cast<uint64_t>(p >> 64);
+        uint64_t l = static_cast<uint64_t>(p);
+        if (l & (uint64_t(1) << 63)) // rounding
+            h++;
+        return DiyFp(h, e + rhs.e + 64);
+#else
+        const uint64_t M32 = 0xFFFFFFFF;
+        const uint64_t a = f >> 32;
+        const uint64_t b = f & M32;
+        const uint64_t c = rhs.f >> 32;
+        const uint64_t d = rhs.f & M32;
+        const uint64_t ac = a * c;
+        const uint64_t bc = b * c;
+        const uint64_t ad = a * d;
+        const uint64_t bd = b * d;
+        uint64_t tmp = (bd >> 32) + (ad & M32) + (bc & M32);
+        tmp += 1U << 31;  /// mult_round
+        return DiyFp(ac + (ad >> 32) + (bc >> 32) + (tmp >> 32), e + rhs.e + 64);
+#endif
+    }
+
+    DiyFp Normalize() const {
+        RAPIDJSON_ASSERT(f != 0); // https://stackoverflow.com/a/26809183/291737
+#if defined(_MSC_VER) && defined(_M_AMD64)
+        unsigned long index;
+        _BitScanReverse64(&index, f);
+        return DiyFp(f << (63 - index), e - (63 - index));
+#elif defined(__GNUC__) && __GNUC__ >= 4
+        int s = __builtin_clzll(f);
+        return DiyFp(f << s, e - s);
+#else
+        DiyFp res = *this;
+        while (!(res.f & (static_cast<uint64_t>(1) << 63))) {
+            res.f <<= 1;
+            res.e--;
+        }
+        return res;
+#endif
+    }
+
+    DiyFp NormalizeBoundary() const {
+        DiyFp res = *this;
+        while (!(res.f & (kDpHiddenBit << 1))) {
+            res.f <<= 1;
+            res.e--;
+        }
+        res.f <<= (kDiySignificandSize - kDpSignificandSize - 2);
+        res.e = res.e - (kDiySignificandSize - kDpSignificandSize - 2);
+        return res;
+    }
+
+    void NormalizedBoundaries(DiyFp* minus, DiyFp* plus) const {
+        DiyFp pl = DiyFp((f << 1) + 1, e - 1).NormalizeBoundary();
+        DiyFp mi = (f == kDpHiddenBit) ? DiyFp((f << 2) - 1, e - 2) : DiyFp((f << 1) - 1, e - 1);
+        mi.f <<= mi.e - pl.e;
+        mi.e = pl.e;
+        *plus = pl;
+        *minus = mi;
+    }
+
+    double ToDouble() const {
+        union {
+            double d;
+            uint64_t u64;
+        } u;
+        RAPIDJSON_ASSERT(f <= kDpHiddenBit + kDpSignificandMask);
+        if (e < kDpDenormalExponent) {
+            // Underflow.
+            return 0.0;
+        }
+        if (e >= kDpMaxExponent) {
+            // Overflow.
+            return std::numeric_limits<double>::infinity();
+        }
+        const uint64_t be = (e == kDpDenormalExponent && (f & kDpHiddenBit) == 0) ? 0 :
+            static_cast<uint64_t>(e + kDpExponentBias);
+        u.u64 = (f & kDpSignificandMask) | (be << kDpSignificandSize);
+        return u.d;
+    }
+
+    static const int kDiySignificandSize = 64;
+    static const int kDpSignificandSize = 52;
+    static const int kDpExponentBias = 0x3FF + kDpSignificandSize;
+    static const int kDpMaxExponent = 0x7FF - kDpExponentBias;
+    static const int kDpMinExponent = -kDpExponentBias;
+    static const int kDpDenormalExponent = -kDpExponentBias + 1;
+    static const uint64_t kDpExponentMask = RAPIDJSON_UINT64_C2(0x7FF00000, 0x00000000);
+    static const uint64_t kDpSignificandMask = RAPIDJSON_UINT64_C2(0x000FFFFF, 0xFFFFFFFF);
+    static const uint64_t kDpHiddenBit = RAPIDJSON_UINT64_C2(0x00100000, 0x00000000);
+
+    uint64_t f;
+    int e;
+};
+
+inline DiyFp GetCachedPowerByIndex(size_t index) {
+    // 10^-348, 10^-340, ..., 10^340
+    static const uint64_t kCachedPowers_F[] = {
+        RAPIDJSON_UINT64_C2(0xfa8fd5a0, 0x081c0288), RAPIDJSON_UINT64_C2(0xbaaee17f, 0xa23ebf76),
+        RAPIDJSON_UINT64_C2(0x8b16fb20, 0x3055ac76), RAPIDJSON_UINT64_C2(0xcf42894a, 0x5dce35ea),
+        RAPIDJSON_UINT64_C2(0x9a6bb0aa, 0x55653b2d), RAPIDJSON_UINT64_C2(0xe61acf03, 0x3d1a45df),
+        RAPIDJSON_UINT64_C2(0xab70fe17, 0xc79ac6ca), RAPIDJSON_UINT64_C2(0xff77b1fc, 0xbebcdc4f),
+        RAPIDJSON_UINT64_C2(0xbe5691ef, 0x416bd60c), RAPIDJSON_UINT64_C2(0x8dd01fad, 0x907ffc3c),
+        RAPIDJSON_UINT64_C2(0xd3515c28, 0x31559a83), RAPIDJSON_UINT64_C2(0x9d71ac8f, 0xada6c9b5),
+        RAPIDJSON_UINT64_C2(0xea9c2277, 0x23ee8bcb), RAPIDJSON_UINT64_C2(0xaecc4991, 0x4078536d),
+        RAPIDJSON_UINT64_C2(0x823c1279, 0x5db6ce57), RAPIDJSON_UINT64_C2(0xc2109436, 0x4dfb5637),
+        RAPIDJSON_UINT64_C2(0x9096ea6f, 0x3848984f), RAPIDJSON_UINT64_C2(0xd77485cb, 0x25823ac7),
+        RAPIDJSON_UINT64_C2(0xa086cfcd, 0x97bf97f4), RAPIDJSON_UINT64_C2(0xef340a98, 0x172aace5),
+        RAPIDJSON_UINT64_C2(0xb23867fb, 0x2a35b28e), RAPIDJSON_UINT64_C2(0x84c8d4df, 0xd2c63f3b),
+        RAPIDJSON_UINT64_C2(0xc5dd4427, 0x1ad3cdba), RAPIDJSON_UINT64_C2(0x936b9fce, 0xbb25c996),
+        RAPIDJSON_UINT64_C2(0xdbac6c24, 0x7d62a584), RAPIDJSON_UINT64_C2(0xa3ab6658, 0x0d5fdaf6),
+        RAPIDJSON_UINT64_C2(0xf3e2f893, 0xdec3f126), RAPIDJSON_UINT64_C2(0xb5b5ada8, 0xaaff80b8),
+        RAPIDJSON_UINT64_C2(0x87625f05, 0x6c7c4a8b), RAPIDJSON_UINT64_C2(0xc9bcff60, 0x34c13053),
+        RAPIDJSON_UINT64_C2(0x964e858c, 0x91ba2655), RAPIDJSON_UINT64_C2(0xdff97724, 0x70297ebd),
+        RAPIDJSON_UINT64_C2(0xa6dfbd9f, 0xb8e5b88f), RAPIDJSON_UINT64_C2(0xf8a95fcf, 0x88747d94),
+        RAPIDJSON_UINT64_C2(0xb9447093, 0x8fa89bcf), RAPIDJSON_UINT64_C2(0x8a08f0f8, 0xbf0f156b),
+        RAPIDJSON_UINT64_C2(0xcdb02555, 0x653131b6), RAPIDJSON_UINT64_C2(0x993fe2c6, 0xd07b7fac),
+        RAPIDJSON_UINT64_C2(0xe45c10c4, 0x2a2b3b06), RAPIDJSON_UINT64_C2(0xaa242499, 0x697392d3),
+        RAPIDJSON_UINT64_C2(0xfd87b5f2, 0x8300ca0e), RAPIDJSON_UINT64_C2(0xbce50864, 0x92111aeb),
+        RAPIDJSON_UINT64_C2(0x8cbccc09, 0x6f5088cc), RAPIDJSON_UINT64_C2(0xd1b71758, 0xe219652c),
+        RAPIDJSON_UINT64_C2(0x9c400000, 0x00000000), RAPIDJSON_UINT64_C2(0xe8d4a510, 0x00000000),
+        RAPIDJSON_UINT64_C2(0xad78ebc5, 0xac620000), RAPIDJSON_UINT64_C2(0x813f3978, 0xf8940984),
+        RAPIDJSON_UINT64_C2(0xc097ce7b, 0xc90715b3), RAPIDJSON_UINT64_C2(0x8f7e32ce, 0x7bea5c70),
+        RAPIDJSON_UINT64_C2(0xd5d238a4, 0xabe98068), RAPIDJSON_UINT64_C2(0x9f4f2726, 0x179a2245),
+        RAPIDJSON_UINT64_C2(0xed63a231, 0xd4c4fb27), RAPIDJSON_UINT64_C2(0xb0de6538, 0x8cc8ada8),
+        RAPIDJSON_UINT64_C2(0x83c7088e, 0x1aab65db), RAPIDJSON_UINT64_C2(0xc45d1df9, 0x42711d9a),
+        RAPIDJSON_UINT64_C2(0x924d692c, 0xa61be758), RAPIDJSON_UINT64_C2(0xda01ee64, 0x1a708dea),
+        RAPIDJSON_UINT64_C2(0xa26da399, 0x9aef774a), RAPIDJSON_UINT64_C2(0xf209787b, 0xb47d6b85),
+        RAPIDJSON_UINT64_C2(0xb454e4a1, 0x79dd1877), RAPIDJSON_UINT64_C2(0x865b8692, 0x5b9bc5c2),
+        RAPIDJSON_UINT64_C2(0xc83553c5, 0xc8965d3d), RAPIDJSON_UINT64_C2(0x952ab45c, 0xfa97a0b3),
+        RAPIDJSON_UINT64_C2(0xde469fbd, 0x99a05fe3), RAPIDJSON_UINT64_C2(0xa59bc234, 0xdb398c25),
+        RAPIDJSON_UINT64_C2(0xf6c69a72, 0xa3989f5c), RAPIDJSON_UINT64_C2(0xb7dcbf53, 0x54e9bece),
+        RAPIDJSON_UINT64_C2(0x88fcf317, 0xf22241e2), RAPIDJSON_UINT64_C2(0xcc20ce9b, 0xd35c78a5),
+        RAPIDJSON_UINT64_C2(0x98165af3, 0x7b2153df), RAPIDJSON_UINT64_C2(0xe2a0b5dc, 0x971f303a),
+        RAPIDJSON_UINT64_C2(0xa8d9d153, 0x5ce3b396), RAPIDJSON_UINT64_C2(0xfb9b7cd9, 0xa4a7443c),
+        RAPIDJSON_UINT64_C2(0xbb764c4c, 0xa7a44410), RAPIDJSON_UINT64_C2(0x8bab8eef, 0xb6409c1a),
+        RAPIDJSON_UINT64_C2(0xd01fef10, 0xa657842c), RAPIDJSON_UINT64_C2(0x9b10a4e5, 0xe9913129),
+        RAPIDJSON_UINT64_C2(0xe7109bfb, 0xa19c0c9d), RAPIDJSON_UINT64_C2(0xac2820d9, 0x623bf429),
+        RAPIDJSON_UINT64_C2(0x80444b5e, 0x7aa7cf85), RAPIDJSON_UINT64_C2(0xbf21e440, 0x03acdd2d),
+        RAPIDJSON_UINT64_C2(0x8e679c2f, 0x5e44ff8f), RAPIDJSON_UINT64_C2(0xd433179d, 0x9c8cb841),
+        RAPIDJSON_UINT64_C2(0x9e19db92, 0xb4e31ba9), RAPIDJSON_UINT64_C2(0xeb96bf6e, 0xbadf77d9),
+        RAPIDJSON_UINT64_C2(0xaf87023b, 0x9bf0ee6b)
+    };
+    static const int16_t kCachedPowers_E[] = {
+        -1220, -1193, -1166, -1140, -1113, -1087, -1060, -1034, -1007, -980,
+        -954,  -927,  -901,  -874,  -847,  -821,  -794,  -768,  -741,  -715,
+        -688,  -661,  -635,  -608,  -582,  -555,  -529,  -502,  -475,  -449,
+        -422,  -396,  -369,  -343,  -316,  -289,  -263,  -236,  -210,  -183,
+        -157,  -130,  -103,  -77,   -50,   -24,    3,     30,    56,    83,
+        109,   136,   162,   189,   216,   242,   269,   295,   322,   348,
+        375,   402,   428,   455,   481,   508,   534,   561,   588,   614,
+        641,   667,   694,   720,   747,   774,   800,   827,   853,   880,
+        907,   933,   960,   986,   1013,  1039,  1066
+    };
+    RAPIDJSON_ASSERT(index < 87);
+    return DiyFp(kCachedPowers_F[index], kCachedPowers_E[index]);
+}
+
+inline DiyFp GetCachedPower(int e, int* K) {
+
+    //int k = static_cast<int>(ceil((-61 - e) * 0.30102999566398114)) + 374;
+    double dk = (-61 - e) * 0.30102999566398114 + 347;  // dk must be positive, so can do ceiling in positive
+    int k = static_cast<int>(dk);
+    if (dk - k > 0.0)
+        k++;
+
+    unsigned index = static_cast<unsigned>((k >> 3) + 1);
+    *K = -(-348 + static_cast<int>(index << 3));    // decimal exponent no need lookup table
+
+    return GetCachedPowerByIndex(index);
+}
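+
+// Editorial note (not part of upstream RapidJSON): Grisu2 (internal/dtoa.h)
+// multiplies the normalized input by one of these cached powers of 10 so the
+// scaled exponent lands in the algorithm's small working range:
+//
+//     int K;
+//     DiyFp w = DiyFp(0.3).Normalize();
+//     DiyFp c_mk = GetCachedPower(w.e, &K);  // roughly 10^-K as a DiyFp
+//     DiyFp W = w * c_mk;                    // exponent now in Grisu2's target range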
+
+inline DiyFp GetCachedPower10(int exp, int *outExp) {
+    RAPIDJSON_ASSERT(exp >= -348);
+    unsigned index = static_cast<unsigned>(exp + 348) / 8u;
+    *outExp = -348 + static_cast<int>(index) * 8;
+    return GetCachedPowerByIndex(index);
+}
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_POP
+#endif
+
+#ifdef __clang__
+RAPIDJSON_DIAG_POP
+RAPIDJSON_DIAG_OFF(padded)
+#endif
+
+} // namespace internal
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_DIYFP_H_
diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/dtoa.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/dtoa.h
new file mode 100644
index 0000000..bf2e9b2
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/dtoa.h
@@ -0,0 +1,245 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+// This is a C++ header-only implementation of Grisu2 algorithm from the publication:
+// Loitsch, Florian. "Printing floating-point numbers quickly and accurately with
+// integers." ACM Sigplan Notices 45.6 (2010): 233-243.
+
+#ifndef RAPIDJSON_DTOA_
+#define RAPIDJSON_DTOA_
+
+#include "itoa.h" // GetDigitsLut()
+#include "diyfp.h"
+#include "ieee754.h"
+
+RAPIDJSON_NAMESPACE_BEGIN
+namespace internal {
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(effc++)
+RAPIDJSON_DIAG_OFF(array-bounds) // some gcc versions generate wrong warnings https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124
+#endif
+
+inline void GrisuRound(char* buffer, int len, uint64_t delta, uint64_t rest, uint64_t ten_kappa, uint64_t wp_w) {
+    while (rest < wp_w && delta - rest >= ten_kappa &&
+           (rest + ten_kappa < wp_w ||  /// closer
+            wp_w - rest > rest + ten_kappa - wp_w)) {
+        buffer[len - 1]--;
+        rest += ten_kappa;
+    }
+}
+
+inline int CountDecimalDigit32(uint32_t n) {
+    // Simple pure C++ implementation was faster than __builtin_clz version in this situation.
+    if (n < 10) return 1;
+    if (n < 100) return 2;
+    if (n < 1000) return 3;
+    if (n < 10000) return 4;
+    if (n < 100000) return 5;
+    if (n < 1000000) return 6;
+    if (n < 10000000) return 7;
+    if (n < 100000000) return 8;
+    // Will not reach 10 digits in DigitGen()
+    //if (n < 1000000000) return 9;
+    //return 10;
+    return 9;
+}
+
+inline void DigitGen(const DiyFp& W, const DiyFp& Mp, uint64_t delta, char* buffer, int* len, int* K) {
+    static const uint32_t kPow10[] = { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000 };
+    const DiyFp one(uint64_t(1) << -Mp.e, Mp.e);
+    const DiyFp wp_w = Mp - W;
+    uint32_t p1 = static_cast<uint32_t>(Mp.f >> -one.e);
+    uint64_t p2 = Mp.f & (one.f - 1);
+    int kappa = CountDecimalDigit32(p1); // kappa in [0, 9]
+    *len = 0;
+
+    while (kappa > 0) {
+        uint32_t d = 0;
+        switch (kappa) {
+            case 9: d = p1 / 100000000; p1 %= 100000000; break;
+            case 8: d = p1 / 10000000; p1 %= 10000000; break;
+            case 7: d = p1 / 1000000; p1 %= 1000000; break;
+            case 6: d = p1 / 100000; p1 %= 100000; break;
+            case 5: d = p1 / 10000; p1 %= 10000; break;
+            case 4: d = p1 / 1000; p1 %= 1000; break;
+            case 3: d = p1 / 100; p1 %= 100; break;
+            case 2: d = p1 / 10; p1 %= 10; break;
+            case 1: d = p1; p1 = 0; break;
+            default:;
+        }
+        if (d || *len)
+            buffer[(*len)++] = static_cast<char>('0' + static_cast<char>(d));
+        kappa--;
+        uint64_t tmp = (static_cast<uint64_t>(p1) << -one.e) + p2;
+        if (tmp <= delta) {
+            *K += kappa;
+            GrisuRound(buffer, *len, delta, tmp, static_cast<uint64_t>(kPow10[kappa]) << -one.e, wp_w.f);
+            return;
+        }
+    }
+
+    // kappa = 0
+    for (;;) {
+        p2 *= 10;
+        delta *= 10;
+        char d = static_cast<char>(p2 >> -one.e);
+        if (d || *len)
+            buffer[(*len)++] = static_cast<char>('0' + d);
+        p2 &= one.f - 1;
+        kappa--;
+        if (p2 < delta) {
+            *K += kappa;
+            int index = -kappa;
+            GrisuRound(buffer, *len, delta, p2, one.f, wp_w.f * (index < 9 ? kPow10[index] : 0));
+            return;
+        }
+    }
+}
+
+inline void Grisu2(double value, char* buffer, int* length, int* K) {
+    const DiyFp v(value);
+    DiyFp w_m, w_p;
+    v.NormalizedBoundaries(&w_m, &w_p);
+
+    const DiyFp c_mk = GetCachedPower(w_p.e, K);
+    const DiyFp W = v.Normalize() * c_mk;
+    DiyFp Wp = w_p * c_mk;
+    DiyFp Wm = w_m * c_mk;
+    Wm.f++;
+    Wp.f--;
+    DigitGen(W, Wp, Wp.f - Wm.f, buffer, length, K);
+}
+
+inline char* WriteExponent(int K, char* buffer) {
+    if (K < 0) {
+        *buffer++ = '-';
+        K = -K;
+    }
+
+    if (K >= 100) {
+        *buffer++ = static_cast<char>('0' + static_cast<char>(K / 100));
+        K %= 100;
+        const char* d = GetDigitsLut() + K * 2;
+        *buffer++ = d[0];
+        *buffer++ = d[1];
+    }
+    else if (K >= 10) {
+        const char* d = GetDigitsLut() + K * 2;
+        *buffer++ = d[0];
+        *buffer++ = d[1];
+    }
+    else
+        *buffer++ = static_cast<char>('0' + static_cast<char>(K));
+
+    return buffer;
+}
+
+inline char* Prettify(char* buffer, int length, int k, int maxDecimalPlaces) {
+    const int kk = length + k;  // 10^(kk-1) <= v < 10^kk
+
+    if (0 <= k && kk <= 21) {
+        // 1234e7 -> 12340000000
+        for (int i = length; i < kk; i++)
+            buffer[i] = '0';
+        buffer[kk] = '.';
+        buffer[kk + 1] = '0';
+        return &buffer[kk + 2];
+    }
+    else if (0 < kk && kk <= 21) {
+        // 1234e-2 -> 12.34
+        std::memmove(&buffer[kk + 1], &buffer[kk], static_cast<size_t>(length - kk));
+        buffer[kk] = '.';
+        if (0 > k + maxDecimalPlaces) {
+            // When maxDecimalPlaces = 2, 1.2345 -> 1.23, 1.102 -> 1.1
+            // Remove extra trailing zeros (at least one) after truncation.
+            for (int i = kk + maxDecimalPlaces; i > kk + 1; i--)
+                if (buffer[i] != '0')
+                    return &buffer[i + 1];
+            return &buffer[kk + 2]; // Reserve one zero
+        }
+        else
+            return &buffer[length + 1];
+    }
+    else if (-6 < kk && kk <= 0) {
+        // 1234e-6 -> 0.001234
+        const int offset = 2 - kk;
+        std::memmove(&buffer[offset], &buffer[0], static_cast<size_t>(length));
+        buffer[0] = '0';
+        buffer[1] = '.';
+        for (int i = 2; i < offset; i++)
+            buffer[i] = '0';
+        if (length - kk > maxDecimalPlaces) {
+            // When maxDecimalPlaces = 2, 0.123 -> 0.12, 0.102 -> 0.1
+            // Remove extra trailing zeros (at least one) after truncation.
+            for (int i = maxDecimalPlaces + 1; i > 2; i--)
+                if (buffer[i] != '0')
+                    return &buffer[i + 1];
+            return &buffer[3]; // Reserve one zero
+        }
+        else
+            return &buffer[length + offset];
+    }
+    else if (kk < -maxDecimalPlaces) {
+        // Truncate to zero
+        buffer[0] = '0';
+        buffer[1] = '.';
+        buffer[2] = '0';
+        return &buffer[3];
+    }
+    else if (length == 1) {
+        // 1e30
+        buffer[1] = 'e';
+        return WriteExponent(kk - 1, &buffer[2]);
+    }
+    else {
+        // 1234e30 -> 1.234e33
+        std::memmove(&buffer[2], &buffer[1], static_cast<size_t>(length - 1));
+        buffer[1] = '.';
+        buffer[length + 1] = 'e';
+        return WriteExponent(kk - 1, &buffer[0 + length + 2]);
+    }
+}
+
+inline char* dtoa(double value, char* buffer, int maxDecimalPlaces = 324) {
+    RAPIDJSON_ASSERT(maxDecimalPlaces >= 1);
+    Double d(value);
+    if (d.IsZero()) {
+        if (d.Sign())
+            *buffer++ = '-';     // -0.0, Issue #289
+        buffer[0] = '0';
+        buffer[1] = '.';
+        buffer[2] = '0';
+        return &buffer[3];
+    }
+    else {
+        if (value < 0) {
+            *buffer++ = '-';
+            value = -value;
+        }
+        int length, K;
+        Grisu2(value, buffer, &length, &K);
+        return Prettify(buffer, length, K, maxDecimalPlaces);
+    }
+}
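+
+// Editorial sketch (not part of upstream RapidJSON): dtoa() writes the shortest
+// round-trip representation and returns the end pointer; it does not
+// null-terminate:
+//
+//     char buf[32];
+//     char* end = rapidjson::internal::dtoa(0.1, buf);
+//     *end = '\0';  // buf now holds "0.1"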
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_POP
+#endif
+
+} // namespace internal
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_DTOA_
diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/ieee754.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/ieee754.h
new file mode 100644
index 0000000..c2684ba
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/ieee754.h
@@ -0,0 +1,78 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_IEEE754_
+#define RAPIDJSON_IEEE754_
+
+#include "../rapidjson.h"
+
+RAPIDJSON_NAMESPACE_BEGIN
+namespace internal {
+
+class Double {
+public:
+    Double() {}
+    Double(double d) : d_(d) {}
+    Double(uint64_t u) : u_(u) {}
+
+    double Value() const { return d_; }
+    uint64_t Uint64Value() const { return u_; }
+
+    double NextPositiveDouble() const {
+        RAPIDJSON_ASSERT(!Sign());
+        return Double(u_ + 1).Value();
+    }
+
+    bool Sign() const { return (u_ & kSignMask) != 0; }
+    uint64_t Significand() const { return u_ & kSignificandMask; }
+    int Exponent() const { return static_cast<int>(((u_ & kExponentMask) >> kSignificandSize) - kExponentBias); }
+
+    bool IsNan() const { return (u_ & kExponentMask) == kExponentMask && Significand() != 0; }
+    bool IsInf() const { return (u_ & kExponentMask) == kExponentMask && Significand() == 0; }
+    bool IsNanOrInf() const { return (u_ & kExponentMask) == kExponentMask; }
+    bool IsNormal() const { return (u_ & kExponentMask) != 0 || Significand() == 0; }
+    bool IsZero() const { return (u_ & (kExponentMask | kSignificandMask)) == 0; }
+
+    uint64_t IntegerSignificand() const { return IsNormal() ? Significand() | kHiddenBit : Significand(); }
+    int IntegerExponent() const { return (IsNormal() ? Exponent() : kDenormalExponent) - kSignificandSize; }
+    uint64_t ToBias() const { return (u_ & kSignMask) ? ~u_ + 1 : u_ | kSignMask; }
+
+    static int EffectiveSignificandSize(int order) {
+        if (order >= -1021)
+            return 53;
+        else if (order <= -1074)
+            return 0;
+        else
+            return order + 1074;
+    }
+
+private:
+    static const int kSignificandSize = 52;
+    static const int kExponentBias = 0x3FF;
+    static const int kDenormalExponent = 1 - kExponentBias;
+    static const uint64_t kSignMask = RAPIDJSON_UINT64_C2(0x80000000, 0x00000000);
+    static const uint64_t kExponentMask = RAPIDJSON_UINT64_C2(0x7FF00000, 0x00000000);
+    static const uint64_t kSignificandMask = RAPIDJSON_UINT64_C2(0x000FFFFF, 0xFFFFFFFF);
+    static const uint64_t kHiddenBit = RAPIDJSON_UINT64_C2(0x00100000, 0x00000000);
+
+    union {
+        double d_;
+        uint64_t u_;
+    };
+};
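+
+// Editorial sketch (not part of upstream RapidJSON): Double exposes the
+// IEEE-754 bit layout through the union, e.g.:
+//
+//     Double d(1.0);
+//     uint64_t bits = d.Uint64Value();       // 0x3FF0000000000000
+//     double next = d.NextPositiveDouble();  // 1.0 + 2^-52, the next representable value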
+
+} // namespace internal
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_IEEE754_
diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/itoa.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/itoa.h
new file mode 100644
index 0000000..9b1c45c
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/itoa.h
@@ -0,0 +1,308 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_ITOA_
+#define RAPIDJSON_ITOA_
+
+#include "../rapidjson.h"
+
+RAPIDJSON_NAMESPACE_BEGIN
+namespace internal {
+
+inline const char* GetDigitsLut() {
+    static const char cDigitsLut[200] = {
+        '0','0','0','1','0','2','0','3','0','4','0','5','0','6','0','7','0','8','0','9',
+        '1','0','1','1','1','2','1','3','1','4','1','5','1','6','1','7','1','8','1','9',
+        '2','0','2','1','2','2','2','3','2','4','2','5','2','6','2','7','2','8','2','9',
+        '3','0','3','1','3','2','3','3','3','4','3','5','3','6','3','7','3','8','3','9',
+        '4','0','4','1','4','2','4','3','4','4','4','5','4','6','4','7','4','8','4','9',
+        '5','0','5','1','5','2','5','3','5','4','5','5','5','6','5','7','5','8','5','9',
+        '6','0','6','1','6','2','6','3','6','4','6','5','6','6','6','7','6','8','6','9',
+        '7','0','7','1','7','2','7','3','7','4','7','5','7','6','7','7','7','8','7','9',
+        '8','0','8','1','8','2','8','3','8','4','8','5','8','6','8','7','8','8','8','9',
+        '9','0','9','1','9','2','9','3','9','4','9','5','9','6','9','7','9','8','9','9'
+    };
+    return cDigitsLut;
+}
+
+inline char* u32toa(uint32_t value, char* buffer) {
+    RAPIDJSON_ASSERT(buffer != 0);
+
+    const char* cDigitsLut = GetDigitsLut();
+
+    if (value < 10000) {
+        const uint32_t d1 = (value / 100) << 1;
+        const uint32_t d2 = (value % 100) << 1;
+
+        if (value >= 1000)
+            *buffer++ = cDigitsLut[d1];
+        if (value >= 100)
+            *buffer++ = cDigitsLut[d1 + 1];
+        if (value >= 10)
+            *buffer++ = cDigitsLut[d2];
+        *buffer++ = cDigitsLut[d2 + 1];
+    }
+    else if (value < 100000000) {
+        // value = bbbbcccc
+        const uint32_t b = value / 10000;
+        const uint32_t c = value % 10000;
+
+        const uint32_t d1 = (b / 100) << 1;
+        const uint32_t d2 = (b % 100) << 1;
+
+        const uint32_t d3 = (c / 100) << 1;
+        const uint32_t d4 = (c % 100) << 1;
+
+        if (value >= 10000000)
+            *buffer++ = cDigitsLut[d1];
+        if (value >= 1000000)
+            *buffer++ = cDigitsLut[d1 + 1];
+        if (value >= 100000)
+            *buffer++ = cDigitsLut[d2];
+        *buffer++ = cDigitsLut[d2 + 1];
+
+        *buffer++ = cDigitsLut[d3];
+        *buffer++ = cDigitsLut[d3 + 1];
+        *buffer++ = cDigitsLut[d4];
+        *buffer++ = cDigitsLut[d4 + 1];
+    }
+    else {
+        // value = aabbbbcccc in decimal
+
+        const uint32_t a = value / 100000000; // 1 to 42
+        value %= 100000000;
+
+        if (a >= 10) {
+            const unsigned i = a << 1;
+            *buffer++ = cDigitsLut[i];
+            *buffer++ = cDigitsLut[i + 1];
+        }
+        else
+            *buffer++ = static_cast<char>('0' + static_cast<char>(a));
+
+        const uint32_t b = value / 10000; // 0 to 9999
+        const uint32_t c = value % 10000; // 0 to 9999
+
+        const uint32_t d1 = (b / 100) << 1;
+        const uint32_t d2 = (b % 100) << 1;
+
+        const uint32_t d3 = (c / 100) << 1;
+        const uint32_t d4 = (c % 100) << 1;
+
+        *buffer++ = cDigitsLut[d1];
+        *buffer++ = cDigitsLut[d1 + 1];
+        *buffer++ = cDigitsLut[d2];
+        *buffer++ = cDigitsLut[d2 + 1];
+        *buffer++ = cDigitsLut[d3];
+        *buffer++ = cDigitsLut[d3 + 1];
+        *buffer++ = cDigitsLut[d4];
+        *buffer++ = cDigitsLut[d4 + 1];
+    }
+    return buffer;
+}
+
+inline char* i32toa(int32_t value, char* buffer) {
+    RAPIDJSON_ASSERT(buffer != 0);
+    uint32_t u = static_cast<uint32_t>(value);
+    if (value < 0) {
+        *buffer++ = '-';
+        u = ~u + 1;
+    }
+
+    return u32toa(u, buffer);
+}
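+
+// Editorial note (not part of upstream RapidJSON): the two-character lookup
+// table emits digits in pairs, halving the number of divisions; like dtoa(),
+// these helpers return the end pointer and do not null-terminate:
+//
+//     char buf[11];  // u32toa writes at most 10 digits
+//     char* end = rapidjson::internal::u32toa(2019u, buf);
+//     // end - buf == 4, buf begins with "2019"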
kTen8 * 1000000; + const uint64_t kTen15 = kTen8 * 10000000; + const uint64_t kTen16 = kTen8 * kTen8; + + if (value < kTen8) { + uint32_t v = static_cast(value); + if (v < 10000) { + const uint32_t d1 = (v / 100) << 1; + const uint32_t d2 = (v % 100) << 1; + + if (v >= 1000) + *buffer++ = cDigitsLut[d1]; + if (v >= 100) + *buffer++ = cDigitsLut[d1 + 1]; + if (v >= 10) + *buffer++ = cDigitsLut[d2]; + *buffer++ = cDigitsLut[d2 + 1]; + } + else { + // value = bbbbcccc + const uint32_t b = v / 10000; + const uint32_t c = v % 10000; + + const uint32_t d1 = (b / 100) << 1; + const uint32_t d2 = (b % 100) << 1; + + const uint32_t d3 = (c / 100) << 1; + const uint32_t d4 = (c % 100) << 1; + + if (value >= 10000000) + *buffer++ = cDigitsLut[d1]; + if (value >= 1000000) + *buffer++ = cDigitsLut[d1 + 1]; + if (value >= 100000) + *buffer++ = cDigitsLut[d2]; + *buffer++ = cDigitsLut[d2 + 1]; + + *buffer++ = cDigitsLut[d3]; + *buffer++ = cDigitsLut[d3 + 1]; + *buffer++ = cDigitsLut[d4]; + *buffer++ = cDigitsLut[d4 + 1]; + } + } + else if (value < kTen16) { + const uint32_t v0 = static_cast(value / kTen8); + const uint32_t v1 = static_cast(value % kTen8); + + const uint32_t b0 = v0 / 10000; + const uint32_t c0 = v0 % 10000; + + const uint32_t d1 = (b0 / 100) << 1; + const uint32_t d2 = (b0 % 100) << 1; + + const uint32_t d3 = (c0 / 100) << 1; + const uint32_t d4 = (c0 % 100) << 1; + + const uint32_t b1 = v1 / 10000; + const uint32_t c1 = v1 % 10000; + + const uint32_t d5 = (b1 / 100) << 1; + const uint32_t d6 = (b1 % 100) << 1; + + const uint32_t d7 = (c1 / 100) << 1; + const uint32_t d8 = (c1 % 100) << 1; + + if (value >= kTen15) + *buffer++ = cDigitsLut[d1]; + if (value >= kTen14) + *buffer++ = cDigitsLut[d1 + 1]; + if (value >= kTen13) + *buffer++ = cDigitsLut[d2]; + if (value >= kTen12) + *buffer++ = cDigitsLut[d2 + 1]; + if (value >= kTen11) + *buffer++ = cDigitsLut[d3]; + if (value >= kTen10) + *buffer++ = cDigitsLut[d3 + 1]; + if (value >= kTen9) + *buffer++ = cDigitsLut[d4]; + + *buffer++ = cDigitsLut[d4 + 1]; + *buffer++ = cDigitsLut[d5]; + *buffer++ = cDigitsLut[d5 + 1]; + *buffer++ = cDigitsLut[d6]; + *buffer++ = cDigitsLut[d6 + 1]; + *buffer++ = cDigitsLut[d7]; + *buffer++ = cDigitsLut[d7 + 1]; + *buffer++ = cDigitsLut[d8]; + *buffer++ = cDigitsLut[d8 + 1]; + } + else { + const uint32_t a = static_cast(value / kTen16); // 1 to 1844 + value %= kTen16; + + if (a < 10) + *buffer++ = static_cast('0' + static_cast(a)); + else if (a < 100) { + const uint32_t i = a << 1; + *buffer++ = cDigitsLut[i]; + *buffer++ = cDigitsLut[i + 1]; + } + else if (a < 1000) { + *buffer++ = static_cast('0' + static_cast(a / 100)); + + const uint32_t i = (a % 100) << 1; + *buffer++ = cDigitsLut[i]; + *buffer++ = cDigitsLut[i + 1]; + } + else { + const uint32_t i = (a / 100) << 1; + const uint32_t j = (a % 100) << 1; + *buffer++ = cDigitsLut[i]; + *buffer++ = cDigitsLut[i + 1]; + *buffer++ = cDigitsLut[j]; + *buffer++ = cDigitsLut[j + 1]; + } + + const uint32_t v0 = static_cast(value / kTen8); + const uint32_t v1 = static_cast(value % kTen8); + + const uint32_t b0 = v0 / 10000; + const uint32_t c0 = v0 % 10000; + + const uint32_t d1 = (b0 / 100) << 1; + const uint32_t d2 = (b0 % 100) << 1; + + const uint32_t d3 = (c0 / 100) << 1; + const uint32_t d4 = (c0 % 100) << 1; + + const uint32_t b1 = v1 / 10000; + const uint32_t c1 = v1 % 10000; + + const uint32_t d5 = (b1 / 100) << 1; + const uint32_t d6 = (b1 % 100) << 1; + + const uint32_t d7 = (c1 / 100) << 1; + const uint32_t d8 = (c1 % 100) << 1; + + *buffer++ = 
cDigitsLut[d1]; + *buffer++ = cDigitsLut[d1 + 1]; + *buffer++ = cDigitsLut[d2]; + *buffer++ = cDigitsLut[d2 + 1]; + *buffer++ = cDigitsLut[d3]; + *buffer++ = cDigitsLut[d3 + 1]; + *buffer++ = cDigitsLut[d4]; + *buffer++ = cDigitsLut[d4 + 1]; + *buffer++ = cDigitsLut[d5]; + *buffer++ = cDigitsLut[d5 + 1]; + *buffer++ = cDigitsLut[d6]; + *buffer++ = cDigitsLut[d6 + 1]; + *buffer++ = cDigitsLut[d7]; + *buffer++ = cDigitsLut[d7 + 1]; + *buffer++ = cDigitsLut[d8]; + *buffer++ = cDigitsLut[d8 + 1]; + } + + return buffer; +} + +inline char* i64toa(int64_t value, char* buffer) { + RAPIDJSON_ASSERT(buffer != 0); + uint64_t u = static_cast(value); + if (value < 0) { + *buffer++ = '-'; + u = ~u + 1; + } + + return u64toa(u, buffer); +} + +} // namespace internal +RAPIDJSON_NAMESPACE_END + +#endif // RAPIDJSON_ITOA_ diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/meta.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/meta.h new file mode 100644 index 0000000..d401edf --- /dev/null +++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/meta.h @@ -0,0 +1,186 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#ifndef RAPIDJSON_INTERNAL_META_H_ +#define RAPIDJSON_INTERNAL_META_H_ + +#include "../rapidjson.h" + +#ifdef __GNUC__ +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(effc++) +#endif + +#if defined(_MSC_VER) && !defined(__clang__) +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(6334) +#endif + +#if RAPIDJSON_HAS_CXX11_TYPETRAITS +#include +#endif + +//@cond RAPIDJSON_INTERNAL +RAPIDJSON_NAMESPACE_BEGIN +namespace internal { + +// Helper to wrap/convert arbitrary types to void, useful for arbitrary type matching +template struct Void { typedef void Type; }; + +/////////////////////////////////////////////////////////////////////////////// +// BoolType, TrueType, FalseType +// +template struct BoolType { + static const bool Value = Cond; + typedef BoolType Type; +}; +typedef BoolType TrueType; +typedef BoolType FalseType; + + +/////////////////////////////////////////////////////////////////////////////// +// SelectIf, BoolExpr, NotExpr, AndExpr, OrExpr +// + +template struct SelectIfImpl { template struct Apply { typedef T1 Type; }; }; +template <> struct SelectIfImpl { template struct Apply { typedef T2 Type; }; }; +template struct SelectIfCond : SelectIfImpl::template Apply {}; +template struct SelectIf : SelectIfCond {}; + +template struct AndExprCond : FalseType {}; +template <> struct AndExprCond : TrueType {}; +template struct OrExprCond : TrueType {}; +template <> struct OrExprCond : FalseType {}; + +template struct BoolExpr : SelectIf::Type {}; +template struct NotExpr : SelectIf::Type {}; +template struct AndExpr : AndExprCond::Type {}; +template struct OrExpr : OrExprCond::Type {}; + + +/////////////////////////////////////////////////////////////////////////////// +// AddConst, MaybeAddConst, RemoveConst +template struct AddConst { typedef const T Type; }; +template struct MaybeAddConst : SelectIfCond {}; +template struct RemoveConst { typedef T Type; }; +template struct RemoveConst { typedef T Type; }; + + +/////////////////////////////////////////////////////////////////////////////// +// IsSame, IsConst, IsMoreConst, IsPointer +// +template struct IsSame : FalseType {}; +template struct IsSame : TrueType {}; + +template struct IsConst : FalseType {}; +template struct IsConst : TrueType {}; + +template +struct IsMoreConst + : AndExpr::Type, typename RemoveConst::Type>, + BoolType::Value >= IsConst::Value> >::Type {}; + +template struct IsPointer : FalseType {}; +template struct IsPointer : TrueType {}; + +/////////////////////////////////////////////////////////////////////////////// +// IsBaseOf +// +#if RAPIDJSON_HAS_CXX11_TYPETRAITS + +template struct IsBaseOf + : BoolType< ::std::is_base_of::value> {}; + +#else // simplified version adopted from Boost + +template struct IsBaseOfImpl { + RAPIDJSON_STATIC_ASSERT(sizeof(B) != 0); + RAPIDJSON_STATIC_ASSERT(sizeof(D) != 0); + + typedef char (&Yes)[1]; + typedef char (&No) [2]; + + template + static Yes Check(const D*, T); + static No Check(const B*, int); + + struct Host { + operator const B*() const; + operator const D*(); + }; + + enum { Value = (sizeof(Check(Host(), 0)) == sizeof(Yes)) }; +}; + +template struct IsBaseOf + : OrExpr, BoolExpr > >::Type {}; + +#endif // RAPIDJSON_HAS_CXX11_TYPETRAITS + + +////////////////////////////////////////////////////////////////////////// +// EnableIf / DisableIf +// +template struct EnableIfCond { typedef T Type; }; +template struct EnableIfCond { /* empty */ }; + +template struct DisableIfCond { typedef T Type; }; +template struct DisableIfCond { /* empty */ }; + +template +struct EnableIf : 
EnableIfCond {}; + +template +struct DisableIf : DisableIfCond {}; + +// SFINAE helpers +struct SfinaeTag {}; +template struct RemoveSfinaeTag; +template struct RemoveSfinaeTag { typedef T Type; }; + +#define RAPIDJSON_REMOVEFPTR_(type) \ + typename ::RAPIDJSON_NAMESPACE::internal::RemoveSfinaeTag \ + < ::RAPIDJSON_NAMESPACE::internal::SfinaeTag&(*) type>::Type + +#define RAPIDJSON_ENABLEIF(cond) \ + typename ::RAPIDJSON_NAMESPACE::internal::EnableIf \ + ::Type * = NULL + +#define RAPIDJSON_DISABLEIF(cond) \ + typename ::RAPIDJSON_NAMESPACE::internal::DisableIf \ + ::Type * = NULL + +#define RAPIDJSON_ENABLEIF_RETURN(cond,returntype) \ + typename ::RAPIDJSON_NAMESPACE::internal::EnableIf \ + ::Type + +#define RAPIDJSON_DISABLEIF_RETURN(cond,returntype) \ + typename ::RAPIDJSON_NAMESPACE::internal::DisableIf \ + ::Type + +} // namespace internal +RAPIDJSON_NAMESPACE_END +//@endcond + +#if defined(_MSC_VER) && !defined(__clang__) +RAPIDJSON_DIAG_POP +#endif + +#ifdef __GNUC__ +RAPIDJSON_DIAG_POP +#endif + +#endif // RAPIDJSON_INTERNAL_META_H_ diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/pow10.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/pow10.h new file mode 100644 index 0000000..02f475d --- /dev/null +++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/pow10.h @@ -0,0 +1,55 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef RAPIDJSON_POW10_ +#define RAPIDJSON_POW10_ + +#include "../rapidjson.h" + +RAPIDJSON_NAMESPACE_BEGIN +namespace internal { + +//! Computes integer powers of 10 in double (10.0^n). +/*! This function uses lookup table for fast and accurate results. + \param n non-negative exponent. Must <= 308. 
+ \return 10.0^n +*/ +inline double Pow10(int n) { + static const double e[] = { // 1e-0...1e308: 309 * 8 bytes = 2472 bytes + 1e+0, + 1e+1, 1e+2, 1e+3, 1e+4, 1e+5, 1e+6, 1e+7, 1e+8, 1e+9, 1e+10, 1e+11, 1e+12, 1e+13, 1e+14, 1e+15, 1e+16, 1e+17, 1e+18, 1e+19, 1e+20, + 1e+21, 1e+22, 1e+23, 1e+24, 1e+25, 1e+26, 1e+27, 1e+28, 1e+29, 1e+30, 1e+31, 1e+32, 1e+33, 1e+34, 1e+35, 1e+36, 1e+37, 1e+38, 1e+39, 1e+40, + 1e+41, 1e+42, 1e+43, 1e+44, 1e+45, 1e+46, 1e+47, 1e+48, 1e+49, 1e+50, 1e+51, 1e+52, 1e+53, 1e+54, 1e+55, 1e+56, 1e+57, 1e+58, 1e+59, 1e+60, + 1e+61, 1e+62, 1e+63, 1e+64, 1e+65, 1e+66, 1e+67, 1e+68, 1e+69, 1e+70, 1e+71, 1e+72, 1e+73, 1e+74, 1e+75, 1e+76, 1e+77, 1e+78, 1e+79, 1e+80, + 1e+81, 1e+82, 1e+83, 1e+84, 1e+85, 1e+86, 1e+87, 1e+88, 1e+89, 1e+90, 1e+91, 1e+92, 1e+93, 1e+94, 1e+95, 1e+96, 1e+97, 1e+98, 1e+99, 1e+100, + 1e+101,1e+102,1e+103,1e+104,1e+105,1e+106,1e+107,1e+108,1e+109,1e+110,1e+111,1e+112,1e+113,1e+114,1e+115,1e+116,1e+117,1e+118,1e+119,1e+120, + 1e+121,1e+122,1e+123,1e+124,1e+125,1e+126,1e+127,1e+128,1e+129,1e+130,1e+131,1e+132,1e+133,1e+134,1e+135,1e+136,1e+137,1e+138,1e+139,1e+140, + 1e+141,1e+142,1e+143,1e+144,1e+145,1e+146,1e+147,1e+148,1e+149,1e+150,1e+151,1e+152,1e+153,1e+154,1e+155,1e+156,1e+157,1e+158,1e+159,1e+160, + 1e+161,1e+162,1e+163,1e+164,1e+165,1e+166,1e+167,1e+168,1e+169,1e+170,1e+171,1e+172,1e+173,1e+174,1e+175,1e+176,1e+177,1e+178,1e+179,1e+180, + 1e+181,1e+182,1e+183,1e+184,1e+185,1e+186,1e+187,1e+188,1e+189,1e+190,1e+191,1e+192,1e+193,1e+194,1e+195,1e+196,1e+197,1e+198,1e+199,1e+200, + 1e+201,1e+202,1e+203,1e+204,1e+205,1e+206,1e+207,1e+208,1e+209,1e+210,1e+211,1e+212,1e+213,1e+214,1e+215,1e+216,1e+217,1e+218,1e+219,1e+220, + 1e+221,1e+222,1e+223,1e+224,1e+225,1e+226,1e+227,1e+228,1e+229,1e+230,1e+231,1e+232,1e+233,1e+234,1e+235,1e+236,1e+237,1e+238,1e+239,1e+240, + 1e+241,1e+242,1e+243,1e+244,1e+245,1e+246,1e+247,1e+248,1e+249,1e+250,1e+251,1e+252,1e+253,1e+254,1e+255,1e+256,1e+257,1e+258,1e+259,1e+260, + 1e+261,1e+262,1e+263,1e+264,1e+265,1e+266,1e+267,1e+268,1e+269,1e+270,1e+271,1e+272,1e+273,1e+274,1e+275,1e+276,1e+277,1e+278,1e+279,1e+280, + 1e+281,1e+282,1e+283,1e+284,1e+285,1e+286,1e+287,1e+288,1e+289,1e+290,1e+291,1e+292,1e+293,1e+294,1e+295,1e+296,1e+297,1e+298,1e+299,1e+300, + 1e+301,1e+302,1e+303,1e+304,1e+305,1e+306,1e+307,1e+308 + }; + RAPIDJSON_ASSERT(n >= 0 && n <= 308); + return e[n]; +} + +} // namespace internal +RAPIDJSON_NAMESPACE_END + +#endif // RAPIDJSON_POW10_ diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/regex.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/regex.h new file mode 100644 index 0000000..377f86c --- /dev/null +++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/regex.h @@ -0,0 +1,737 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#ifndef RAPIDJSON_INTERNAL_REGEX_H_ +#define RAPIDJSON_INTERNAL_REGEX_H_ + +#include "../allocators.h" +#include "../stream.h" +#include "stack.h" + +#ifdef __clang__ +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(padded) +RAPIDJSON_DIAG_OFF(switch-enum) +RAPIDJSON_DIAG_OFF(implicit-fallthrough) +#elif defined(_MSC_VER) +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(4512) // assignment operator could not be generated +#endif + +#ifdef __GNUC__ +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(effc++) +#if __GNUC__ >= 7 +RAPIDJSON_DIAG_OFF(implicit-fallthrough) +#endif +#endif + +#ifndef RAPIDJSON_REGEX_VERBOSE +#define RAPIDJSON_REGEX_VERBOSE 0 +#endif + +RAPIDJSON_NAMESPACE_BEGIN +namespace internal { + +/////////////////////////////////////////////////////////////////////////////// +// DecodedStream + +template +class DecodedStream { +public: + DecodedStream(SourceStream& ss) : ss_(ss), codepoint_() { Decode(); } + unsigned Peek() { return codepoint_; } + unsigned Take() { + unsigned c = codepoint_; + if (c) // No further decoding when '\0' + Decode(); + return c; + } + +private: + void Decode() { + if (!Encoding::Decode(ss_, &codepoint_)) + codepoint_ = 0; + } + + SourceStream& ss_; + unsigned codepoint_; +}; + +/////////////////////////////////////////////////////////////////////////////// +// GenericRegex + +static const SizeType kRegexInvalidState = ~SizeType(0); //!< Represents an invalid index in GenericRegex::State::out, out1 +static const SizeType kRegexInvalidRange = ~SizeType(0); + +template +class GenericRegexSearch; + +//! Regular expression engine with subset of ECMAscript grammar. +/*! + Supported regular expression syntax: + - \c ab Concatenation + - \c a|b Alternation + - \c a? Zero or one + - \c a* Zero or more + - \c a+ One or more + - \c a{3} Exactly 3 times + - \c a{3,} At least 3 times + - \c a{3,5} 3 to 5 times + - \c (ab) Grouping + - \c ^a At the beginning + - \c a$ At the end + - \c . Any character + - \c [abc] Character classes + - \c [a-c] Character class range + - \c [a-z0-9_] Character class combination + - \c [^abc] Negated character classes + - \c [^a-c] Negated character class range + - \c [\b] Backspace (U+0008) + - \c \\| \\\\ ... Escape characters + - \c \\f Form feed (U+000C) + - \c \\n Line feed (U+000A) + - \c \\r Carriage return (U+000D) + - \c \\t Tab (U+0009) + - \c \\v Vertical tab (U+000B) + + \note This is a Thompson NFA engine, implemented with reference to + Cox, Russ. "Regular Expression Matching Can Be Simple And Fast (but is slow in Java, Perl, PHP, Python, Ruby,...).", + https://swtch.com/~rsc/regexp/regexp1.html +*/ +template +class GenericRegex { +public: + typedef Encoding EncodingType; + typedef typename Encoding::Ch Ch; + template friend class GenericRegexSearch; + + GenericRegex(const Ch* source, Allocator* allocator = 0) : + ownAllocator_(allocator ? 0 : RAPIDJSON_NEW(Allocator)()), allocator_(allocator ? allocator : ownAllocator_), + states_(allocator_, 256), ranges_(allocator_, 256), root_(kRegexInvalidState), stateCount_(), rangeCount_(), + anchorBegin_(), anchorEnd_() + { + GenericStringStream ss(source); + DecodedStream, Encoding> ds(ss); + Parse(ds); + } + + ~GenericRegex() + { + RAPIDJSON_DELETE(ownAllocator_); + } + + bool IsValid() const { + return root_ != kRegexInvalidState; + } + +private: + enum Operator { + kZeroOrOne, + kZeroOrMore, + kOneOrMore, + kConcatenation, + kAlternation, + kLeftParenthesis + }; + + static const unsigned kAnyCharacterClass = 0xFFFFFFFF; //!< For '.' 
+ static const unsigned kRangeCharacterClass = 0xFFFFFFFE; + static const unsigned kRangeNegationFlag = 0x80000000; + + struct Range { + unsigned start; // + unsigned end; + SizeType next; + }; + + struct State { + SizeType out; //!< Equals to kInvalid for matching state + SizeType out1; //!< Equals to non-kInvalid for split + SizeType rangeStart; + unsigned codepoint; + }; + + struct Frag { + Frag(SizeType s, SizeType o, SizeType m) : start(s), out(o), minIndex(m) {} + SizeType start; + SizeType out; //!< link-list of all output states + SizeType minIndex; + }; + + State& GetState(SizeType index) { + RAPIDJSON_ASSERT(index < stateCount_); + return states_.template Bottom()[index]; + } + + const State& GetState(SizeType index) const { + RAPIDJSON_ASSERT(index < stateCount_); + return states_.template Bottom()[index]; + } + + Range& GetRange(SizeType index) { + RAPIDJSON_ASSERT(index < rangeCount_); + return ranges_.template Bottom()[index]; + } + + const Range& GetRange(SizeType index) const { + RAPIDJSON_ASSERT(index < rangeCount_); + return ranges_.template Bottom()[index]; + } + + template + void Parse(DecodedStream& ds) { + Stack operandStack(allocator_, 256); // Frag + Stack operatorStack(allocator_, 256); // Operator + Stack atomCountStack(allocator_, 256); // unsigned (Atom per parenthesis) + + *atomCountStack.template Push() = 0; + + unsigned codepoint; + while (ds.Peek() != 0) { + switch (codepoint = ds.Take()) { + case '^': + anchorBegin_ = true; + break; + + case '$': + anchorEnd_ = true; + break; + + case '|': + while (!operatorStack.Empty() && *operatorStack.template Top() < kAlternation) + if (!Eval(operandStack, *operatorStack.template Pop(1))) + return; + *operatorStack.template Push() = kAlternation; + *atomCountStack.template Top() = 0; + break; + + case '(': + *operatorStack.template Push() = kLeftParenthesis; + *atomCountStack.template Push() = 0; + break; + + case ')': + while (!operatorStack.Empty() && *operatorStack.template Top() != kLeftParenthesis) + if (!Eval(operandStack, *operatorStack.template Pop(1))) + return; + if (operatorStack.Empty()) + return; + operatorStack.template Pop(1); + atomCountStack.template Pop(1); + ImplicitConcatenation(atomCountStack, operatorStack); + break; + + case '?': + if (!Eval(operandStack, kZeroOrOne)) + return; + break; + + case '*': + if (!Eval(operandStack, kZeroOrMore)) + return; + break; + + case '+': + if (!Eval(operandStack, kOneOrMore)) + return; + break; + + case '{': + { + unsigned n, m; + if (!ParseUnsigned(ds, &n)) + return; + + if (ds.Peek() == ',') { + ds.Take(); + if (ds.Peek() == '}') + m = kInfinityQuantifier; + else if (!ParseUnsigned(ds, &m) || m < n) + return; + } + else + m = n; + + if (!EvalQuantifier(operandStack, n, m) || ds.Peek() != '}') + return; + ds.Take(); + } + break; + + case '.': + PushOperand(operandStack, kAnyCharacterClass); + ImplicitConcatenation(atomCountStack, operatorStack); + break; + + case '[': + { + SizeType range; + if (!ParseRange(ds, &range)) + return; + SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, kRangeCharacterClass); + GetState(s).rangeStart = range; + *operandStack.template Push() = Frag(s, s, s); + } + ImplicitConcatenation(atomCountStack, operatorStack); + break; + + case '\\': // Escape character + if (!CharacterEscape(ds, &codepoint)) + return; // Unsupported escape character + // fall through to default + + default: // Pattern character + PushOperand(operandStack, codepoint); + ImplicitConcatenation(atomCountStack, operatorStack); + } + } + + while 
(!operatorStack.Empty()) + if (!Eval(operandStack, *operatorStack.template Pop(1))) + return; + + // Link the operand to matching state. + if (operandStack.GetSize() == sizeof(Frag)) { + Frag* e = operandStack.template Pop(1); + Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0)); + root_ = e->start; + +#if RAPIDJSON_REGEX_VERBOSE + printf("root: %d\n", root_); + for (SizeType i = 0; i < stateCount_ ; i++) { + State& s = GetState(i); + printf("[%2d] out: %2d out1: %2d c: '%c'\n", i, s.out, s.out1, (char)s.codepoint); + } + printf("\n"); +#endif + } + } + + SizeType NewState(SizeType out, SizeType out1, unsigned codepoint) { + State* s = states_.template Push(); + s->out = out; + s->out1 = out1; + s->codepoint = codepoint; + s->rangeStart = kRegexInvalidRange; + return stateCount_++; + } + + void PushOperand(Stack& operandStack, unsigned codepoint) { + SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint); + *operandStack.template Push() = Frag(s, s, s); + } + + void ImplicitConcatenation(Stack& atomCountStack, Stack& operatorStack) { + if (*atomCountStack.template Top()) + *operatorStack.template Push() = kConcatenation; + (*atomCountStack.template Top())++; + } + + SizeType Append(SizeType l1, SizeType l2) { + SizeType old = l1; + while (GetState(l1).out != kRegexInvalidState) + l1 = GetState(l1).out; + GetState(l1).out = l2; + return old; + } + + void Patch(SizeType l, SizeType s) { + for (SizeType next; l != kRegexInvalidState; l = next) { + next = GetState(l).out; + GetState(l).out = s; + } + } + + bool Eval(Stack& operandStack, Operator op) { + switch (op) { + case kConcatenation: + RAPIDJSON_ASSERT(operandStack.GetSize() >= sizeof(Frag) * 2); + { + Frag e2 = *operandStack.template Pop(1); + Frag e1 = *operandStack.template Pop(1); + Patch(e1.out, e2.start); + *operandStack.template Push() = Frag(e1.start, e2.out, Min(e1.minIndex, e2.minIndex)); + } + return true; + + case kAlternation: + if (operandStack.GetSize() >= sizeof(Frag) * 2) { + Frag e2 = *operandStack.template Pop(1); + Frag e1 = *operandStack.template Pop(1); + SizeType s = NewState(e1.start, e2.start, 0); + *operandStack.template Push() = Frag(s, Append(e1.out, e2.out), Min(e1.minIndex, e2.minIndex)); + return true; + } + return false; + + case kZeroOrOne: + if (operandStack.GetSize() >= sizeof(Frag)) { + Frag e = *operandStack.template Pop(1); + SizeType s = NewState(kRegexInvalidState, e.start, 0); + *operandStack.template Push() = Frag(s, Append(e.out, s), e.minIndex); + return true; + } + return false; + + case kZeroOrMore: + if (operandStack.GetSize() >= sizeof(Frag)) { + Frag e = *operandStack.template Pop(1); + SizeType s = NewState(kRegexInvalidState, e.start, 0); + Patch(e.out, s); + *operandStack.template Push() = Frag(s, s, e.minIndex); + return true; + } + return false; + + default: + RAPIDJSON_ASSERT(op == kOneOrMore); + if (operandStack.GetSize() >= sizeof(Frag)) { + Frag e = *operandStack.template Pop(1); + SizeType s = NewState(kRegexInvalidState, e.start, 0); + Patch(e.out, s); + *operandStack.template Push() = Frag(e.start, s, e.minIndex); + return true; + } + return false; + } + } + + bool EvalQuantifier(Stack& operandStack, unsigned n, unsigned m) { + RAPIDJSON_ASSERT(n <= m); + RAPIDJSON_ASSERT(operandStack.GetSize() >= sizeof(Frag)); + + if (n == 0) { + if (m == 0) // a{0} not support + return false; + else if (m == kInfinityQuantifier) + Eval(operandStack, kZeroOrMore); // a{0,} -> a* + else { + Eval(operandStack, kZeroOrOne); // a{0,5} -> a? 
+ for (unsigned i = 0; i < m - 1; i++) + CloneTopOperand(operandStack); // a{0,5} -> a? a? a? a? a? + for (unsigned i = 0; i < m - 1; i++) + Eval(operandStack, kConcatenation); // a{0,5} -> a?a?a?a?a? + } + return true; + } + + for (unsigned i = 0; i < n - 1; i++) // a{3} -> a a a + CloneTopOperand(operandStack); + + if (m == kInfinityQuantifier) + Eval(operandStack, kOneOrMore); // a{3,} -> a a a+ + else if (m > n) { + CloneTopOperand(operandStack); // a{3,5} -> a a a a + Eval(operandStack, kZeroOrOne); // a{3,5} -> a a a a? + for (unsigned i = n; i < m - 1; i++) + CloneTopOperand(operandStack); // a{3,5} -> a a a a? a? + for (unsigned i = n; i < m; i++) + Eval(operandStack, kConcatenation); // a{3,5} -> a a aa?a? + } + + for (unsigned i = 0; i < n - 1; i++) + Eval(operandStack, kConcatenation); // a{3} -> aaa, a{3,} -> aaa+, a{3.5} -> aaaa?a? + + return true; + } + + static SizeType Min(SizeType a, SizeType b) { return a < b ? a : b; } + + void CloneTopOperand(Stack& operandStack) { + const Frag src = *operandStack.template Top(); // Copy constructor to prevent invalidation + SizeType count = stateCount_ - src.minIndex; // Assumes top operand contains states in [src->minIndex, stateCount_) + State* s = states_.template Push(count); + memcpy(s, &GetState(src.minIndex), count * sizeof(State)); + for (SizeType j = 0; j < count; j++) { + if (s[j].out != kRegexInvalidState) + s[j].out += count; + if (s[j].out1 != kRegexInvalidState) + s[j].out1 += count; + } + *operandStack.template Push() = Frag(src.start + count, src.out + count, src.minIndex + count); + stateCount_ += count; + } + + template + bool ParseUnsigned(DecodedStream& ds, unsigned* u) { + unsigned r = 0; + if (ds.Peek() < '0' || ds.Peek() > '9') + return false; + while (ds.Peek() >= '0' && ds.Peek() <= '9') { + if (r >= 429496729 && ds.Peek() > '5') // 2^32 - 1 = 4294967295 + return false; // overflow + r = r * 10 + (ds.Take() - '0'); + } + *u = r; + return true; + } + + template + bool ParseRange(DecodedStream& ds, SizeType* range) { + bool isBegin = true; + bool negate = false; + int step = 0; + SizeType start = kRegexInvalidRange; + SizeType current = kRegexInvalidRange; + unsigned codepoint; + while ((codepoint = ds.Take()) != 0) { + if (isBegin) { + isBegin = false; + if (codepoint == '^') { + negate = true; + continue; + } + } + + switch (codepoint) { + case ']': + if (start == kRegexInvalidRange) + return false; // Error: nothing inside [] + if (step == 2) { // Add trailing '-' + SizeType r = NewRange('-'); + RAPIDJSON_ASSERT(current != kRegexInvalidRange); + GetRange(current).next = r; + } + if (negate) + GetRange(start).start |= kRangeNegationFlag; + *range = start; + return true; + + case '\\': + if (ds.Peek() == 'b') { + ds.Take(); + codepoint = 0x0008; // Escape backspace character + } + else if (!CharacterEscape(ds, &codepoint)) + return false; + // fall through to default + + default: + switch (step) { + case 1: + if (codepoint == '-') { + step++; + break; + } + // fall through to step 0 for other characters + + case 0: + { + SizeType r = NewRange(codepoint); + if (current != kRegexInvalidRange) + GetRange(current).next = r; + if (start == kRegexInvalidRange) + start = r; + current = r; + } + step = 1; + break; + + default: + RAPIDJSON_ASSERT(step == 2); + GetRange(current).end = codepoint; + step = 0; + } + } + } + return false; + } + + SizeType NewRange(unsigned codepoint) { + Range* r = ranges_.template Push(); + r->start = r->end = codepoint; + r->next = kRegexInvalidRange; + return rangeCount_++; + } + + 
template + bool CharacterEscape(DecodedStream& ds, unsigned* escapedCodepoint) { + unsigned codepoint; + switch (codepoint = ds.Take()) { + case '^': + case '$': + case '|': + case '(': + case ')': + case '?': + case '*': + case '+': + case '.': + case '[': + case ']': + case '{': + case '}': + case '\\': + *escapedCodepoint = codepoint; return true; + case 'f': *escapedCodepoint = 0x000C; return true; + case 'n': *escapedCodepoint = 0x000A; return true; + case 'r': *escapedCodepoint = 0x000D; return true; + case 't': *escapedCodepoint = 0x0009; return true; + case 'v': *escapedCodepoint = 0x000B; return true; + default: + return false; // Unsupported escape character + } + } + + Allocator* ownAllocator_; + Allocator* allocator_; + Stack states_; + Stack ranges_; + SizeType root_; + SizeType stateCount_; + SizeType rangeCount_; + + static const unsigned kInfinityQuantifier = ~0u; + + // For SearchWithAnchoring() + bool anchorBegin_; + bool anchorEnd_; +}; + +template +class GenericRegexSearch { +public: + typedef typename RegexType::EncodingType Encoding; + typedef typename Encoding::Ch Ch; + + GenericRegexSearch(const RegexType& regex, Allocator* allocator = 0) : + regex_(regex), allocator_(allocator), ownAllocator_(0), + state0_(allocator, 0), state1_(allocator, 0), stateSet_() + { + RAPIDJSON_ASSERT(regex_.IsValid()); + if (!allocator_) + ownAllocator_ = allocator_ = RAPIDJSON_NEW(Allocator)(); + stateSet_ = static_cast(allocator_->Malloc(GetStateSetSize())); + state0_.template Reserve(regex_.stateCount_); + state1_.template Reserve(regex_.stateCount_); + } + + ~GenericRegexSearch() { + Allocator::Free(stateSet_); + RAPIDJSON_DELETE(ownAllocator_); + } + + template + bool Match(InputStream& is) { + return SearchWithAnchoring(is, true, true); + } + + bool Match(const Ch* s) { + GenericStringStream is(s); + return Match(is); + } + + template + bool Search(InputStream& is) { + return SearchWithAnchoring(is, regex_.anchorBegin_, regex_.anchorEnd_); + } + + bool Search(const Ch* s) { + GenericStringStream is(s); + return Search(is); + } + +private: + typedef typename RegexType::State State; + typedef typename RegexType::Range Range; + + template + bool SearchWithAnchoring(InputStream& is, bool anchorBegin, bool anchorEnd) { + DecodedStream ds(is); + + state0_.Clear(); + Stack *current = &state0_, *next = &state1_; + const size_t stateSetSize = GetStateSetSize(); + std::memset(stateSet_, 0, stateSetSize); + + bool matched = AddState(*current, regex_.root_); + unsigned codepoint; + while (!current->Empty() && (codepoint = ds.Take()) != 0) { + std::memset(stateSet_, 0, stateSetSize); + next->Clear(); + matched = false; + for (const SizeType* s = current->template Bottom(); s != current->template End(); ++s) { + const State& sr = regex_.GetState(*s); + if (sr.codepoint == codepoint || + sr.codepoint == RegexType::kAnyCharacterClass || + (sr.codepoint == RegexType::kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint))) + { + matched = AddState(*next, sr.out) || matched; + if (!anchorEnd && matched) + return true; + } + if (!anchorBegin) + AddState(*next, regex_.root_); + } + internal::Swap(current, next); + } + + return matched; + } + + size_t GetStateSetSize() const { + return (regex_.stateCount_ + 31) / 32 * 4; + } + + // Return whether the added states is a match state + bool AddState(Stack& l, SizeType index) { + RAPIDJSON_ASSERT(index != kRegexInvalidState); + + const State& s = regex_.GetState(index); + if (s.out1 != kRegexInvalidState) { // Split + bool matched = AddState(l, 
s.out); + return AddState(l, s.out1) || matched; + } + else if (!(stateSet_[index >> 5] & (1u << (index & 31)))) { + stateSet_[index >> 5] |= (1u << (index & 31)); + *l.template PushUnsafe() = index; + } + return s.out == kRegexInvalidState; // by using PushUnsafe() above, we can ensure s is not validated due to reallocation. + } + + bool MatchRange(SizeType rangeIndex, unsigned codepoint) const { + bool yes = (regex_.GetRange(rangeIndex).start & RegexType::kRangeNegationFlag) == 0; + while (rangeIndex != kRegexInvalidRange) { + const Range& r = regex_.GetRange(rangeIndex); + if (codepoint >= (r.start & ~RegexType::kRangeNegationFlag) && codepoint <= r.end) + return yes; + rangeIndex = r.next; + } + return !yes; + } + + const RegexType& regex_; + Allocator* allocator_; + Allocator* ownAllocator_; + Stack state0_; + Stack state1_; + uint32_t* stateSet_; +}; + +typedef GenericRegex > Regex; +typedef GenericRegexSearch RegexSearch; + +} // namespace internal +RAPIDJSON_NAMESPACE_END + +#ifdef __GNUC__ +RAPIDJSON_DIAG_POP +#endif + +#if defined(__clang__) || defined(_MSC_VER) +RAPIDJSON_DIAG_POP +#endif + +#endif // RAPIDJSON_INTERNAL_REGEX_H_ diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/stack.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/stack.h new file mode 100644 index 0000000..89558d0 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/stack.h @@ -0,0 +1,231 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef RAPIDJSON_INTERNAL_STACK_H_ +#define RAPIDJSON_INTERNAL_STACK_H_ + +#include "../allocators.h" +#include "swap.h" + +#if defined(__clang__) +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(c++98-compat) +#endif + +RAPIDJSON_NAMESPACE_BEGIN +namespace internal { + +/////////////////////////////////////////////////////////////////////////////// +// Stack + +//! A type-unsafe stack for storing different types of data. +/*! \tparam Allocator Allocator for allocating stack memory. +*/ +template +class Stack { +public: + // Optimization note: Do not allocate memory for stack_ in constructor. + // Do it lazily when first Push() -> Expand() -> Resize(). 
+ Stack(Allocator* allocator, size_t stackCapacity) : allocator_(allocator), ownAllocator_(0), stack_(0), stackTop_(0), stackEnd_(0), initialCapacity_(stackCapacity) { + } + +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + Stack(Stack&& rhs) + : allocator_(rhs.allocator_), + ownAllocator_(rhs.ownAllocator_), + stack_(rhs.stack_), + stackTop_(rhs.stackTop_), + stackEnd_(rhs.stackEnd_), + initialCapacity_(rhs.initialCapacity_) + { + rhs.allocator_ = 0; + rhs.ownAllocator_ = 0; + rhs.stack_ = 0; + rhs.stackTop_ = 0; + rhs.stackEnd_ = 0; + rhs.initialCapacity_ = 0; + } +#endif + + ~Stack() { + Destroy(); + } + +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + Stack& operator=(Stack&& rhs) { + if (&rhs != this) + { + Destroy(); + + allocator_ = rhs.allocator_; + ownAllocator_ = rhs.ownAllocator_; + stack_ = rhs.stack_; + stackTop_ = rhs.stackTop_; + stackEnd_ = rhs.stackEnd_; + initialCapacity_ = rhs.initialCapacity_; + + rhs.allocator_ = 0; + rhs.ownAllocator_ = 0; + rhs.stack_ = 0; + rhs.stackTop_ = 0; + rhs.stackEnd_ = 0; + rhs.initialCapacity_ = 0; + } + return *this; + } +#endif + + void Swap(Stack& rhs) RAPIDJSON_NOEXCEPT { + internal::Swap(allocator_, rhs.allocator_); + internal::Swap(ownAllocator_, rhs.ownAllocator_); + internal::Swap(stack_, rhs.stack_); + internal::Swap(stackTop_, rhs.stackTop_); + internal::Swap(stackEnd_, rhs.stackEnd_); + internal::Swap(initialCapacity_, rhs.initialCapacity_); + } + + void Clear() { stackTop_ = stack_; } + + void ShrinkToFit() { + if (Empty()) { + // If the stack is empty, completely deallocate the memory. + Allocator::Free(stack_); // NOLINT (+clang-analyzer-unix.Malloc) + stack_ = 0; + stackTop_ = 0; + stackEnd_ = 0; + } + else + Resize(GetSize()); + } + + // Optimization note: try to minimize the size of this function for force inline. + // Expansion is run very infrequently, so it is moved to another (probably non-inline) function. 
+ template + RAPIDJSON_FORCEINLINE void Reserve(size_t count = 1) { + // Expand the stack if needed + if (RAPIDJSON_UNLIKELY(stackTop_ + sizeof(T) * count > stackEnd_)) + Expand(count); + } + + template + RAPIDJSON_FORCEINLINE T* Push(size_t count = 1) { + Reserve(count); + return PushUnsafe(count); + } + + template + RAPIDJSON_FORCEINLINE T* PushUnsafe(size_t count = 1) { + RAPIDJSON_ASSERT(stackTop_); + RAPIDJSON_ASSERT(stackTop_ + sizeof(T) * count <= stackEnd_); + T* ret = reinterpret_cast(stackTop_); + stackTop_ += sizeof(T) * count; + return ret; + } + + template + T* Pop(size_t count) { + RAPIDJSON_ASSERT(GetSize() >= count * sizeof(T)); + stackTop_ -= count * sizeof(T); + return reinterpret_cast(stackTop_); + } + + template + T* Top() { + RAPIDJSON_ASSERT(GetSize() >= sizeof(T)); + return reinterpret_cast(stackTop_ - sizeof(T)); + } + + template + const T* Top() const { + RAPIDJSON_ASSERT(GetSize() >= sizeof(T)); + return reinterpret_cast(stackTop_ - sizeof(T)); + } + + template + T* End() { return reinterpret_cast(stackTop_); } + + template + const T* End() const { return reinterpret_cast(stackTop_); } + + template + T* Bottom() { return reinterpret_cast(stack_); } + + template + const T* Bottom() const { return reinterpret_cast(stack_); } + + bool HasAllocator() const { + return allocator_ != 0; + } + + Allocator& GetAllocator() { + RAPIDJSON_ASSERT(allocator_); + return *allocator_; + } + + bool Empty() const { return stackTop_ == stack_; } + size_t GetSize() const { return static_cast(stackTop_ - stack_); } + size_t GetCapacity() const { return static_cast(stackEnd_ - stack_); } + +private: + template + void Expand(size_t count) { + // Only expand the capacity if the current stack exists. Otherwise just create a stack with initial capacity. + size_t newCapacity; + if (stack_ == 0) { + if (!allocator_) + ownAllocator_ = allocator_ = RAPIDJSON_NEW(Allocator)(); + newCapacity = initialCapacity_; + } else { + newCapacity = GetCapacity(); + newCapacity += (newCapacity + 1) / 2; + } + size_t newSize = GetSize() + sizeof(T) * count; + if (newCapacity < newSize) + newCapacity = newSize; + + Resize(newCapacity); + } + + void Resize(size_t newCapacity) { + const size_t size = GetSize(); // Backup the current size + stack_ = static_cast(allocator_->Realloc(stack_, GetCapacity(), newCapacity)); + stackTop_ = stack_ + size; + stackEnd_ = stack_ + newCapacity; + } + + void Destroy() { + Allocator::Free(stack_); + RAPIDJSON_DELETE(ownAllocator_); // Only delete if it is owned by the stack + } + + // Prohibit copy constructor & assignment operator. + Stack(const Stack&); + Stack& operator=(const Stack&); + + Allocator* allocator_; + Allocator* ownAllocator_; + char *stack_; + char *stackTop_; + char *stackEnd_; + size_t initialCapacity_; +}; + +} // namespace internal +RAPIDJSON_NAMESPACE_END + +#if defined(__clang__) +RAPIDJSON_DIAG_POP +#endif + +#endif // RAPIDJSON_STACK_H_ diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/strfunc.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/strfunc.h new file mode 100644 index 0000000..226439a --- /dev/null +++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/strfunc.h @@ -0,0 +1,69 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef RAPIDJSON_INTERNAL_STRFUNC_H_ +#define RAPIDJSON_INTERNAL_STRFUNC_H_ + +#include "../stream.h" +#include + +RAPIDJSON_NAMESPACE_BEGIN +namespace internal { + +//! Custom strlen() which works on different character types. +/*! \tparam Ch Character type (e.g. char, wchar_t, short) + \param s Null-terminated input string. + \return Number of characters in the string. + \note This has the same semantics as strlen(), the return value is not number of Unicode codepoints. +*/ +template +inline SizeType StrLen(const Ch* s) { + RAPIDJSON_ASSERT(s != 0); + const Ch* p = s; + while (*p) ++p; + return SizeType(p - s); +} + +template <> +inline SizeType StrLen(const char* s) { + return SizeType(std::strlen(s)); +} + +template <> +inline SizeType StrLen(const wchar_t* s) { + return SizeType(std::wcslen(s)); +} + +//! Returns number of code points in a encoded string. +template +bool CountStringCodePoint(const typename Encoding::Ch* s, SizeType length, SizeType* outCount) { + RAPIDJSON_ASSERT(s != 0); + RAPIDJSON_ASSERT(outCount != 0); + GenericStringStream is(s); + const typename Encoding::Ch* end = s + length; + SizeType count = 0; + while (is.src_ < end) { + unsigned codepoint; + if (!Encoding::Decode(is, &codepoint)) + return false; + count++; + } + *outCount = count; + return true; +} + +} // namespace internal +RAPIDJSON_NAMESPACE_END + +#endif // RAPIDJSON_INTERNAL_STRFUNC_H_ diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/strtod.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/strtod.h new file mode 100644 index 0000000..dfca22b --- /dev/null +++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/strtod.h @@ -0,0 +1,290 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#ifndef RAPIDJSON_STRTOD_ +#define RAPIDJSON_STRTOD_ + +#include "ieee754.h" +#include "biginteger.h" +#include "diyfp.h" +#include "pow10.h" +#include +#include + +RAPIDJSON_NAMESPACE_BEGIN +namespace internal { + +inline double FastPath(double significand, int exp) { + if (exp < -308) + return 0.0; + else if (exp >= 0) + return significand * internal::Pow10(exp); + else + return significand / internal::Pow10(-exp); +} + +inline double StrtodNormalPrecision(double d, int p) { + if (p < -308) { + // Prevent expSum < -308, making Pow10(p) = 0 + d = FastPath(d, -308); + d = FastPath(d, p + 308); + } + else + d = FastPath(d, p); + return d; +} + +template +inline T Min3(T a, T b, T c) { + T m = a; + if (m > b) m = b; + if (m > c) m = c; + return m; +} + +inline int CheckWithinHalfULP(double b, const BigInteger& d, int dExp) { + const Double db(b); + const uint64_t bInt = db.IntegerSignificand(); + const int bExp = db.IntegerExponent(); + const int hExp = bExp - 1; + + int dS_Exp2 = 0, dS_Exp5 = 0, bS_Exp2 = 0, bS_Exp5 = 0, hS_Exp2 = 0, hS_Exp5 = 0; + + // Adjust for decimal exponent + if (dExp >= 0) { + dS_Exp2 += dExp; + dS_Exp5 += dExp; + } + else { + bS_Exp2 -= dExp; + bS_Exp5 -= dExp; + hS_Exp2 -= dExp; + hS_Exp5 -= dExp; + } + + // Adjust for binary exponent + if (bExp >= 0) + bS_Exp2 += bExp; + else { + dS_Exp2 -= bExp; + hS_Exp2 -= bExp; + } + + // Adjust for half ulp exponent + if (hExp >= 0) + hS_Exp2 += hExp; + else { + dS_Exp2 -= hExp; + bS_Exp2 -= hExp; + } + + // Remove common power of two factor from all three scaled values + int common_Exp2 = Min3(dS_Exp2, bS_Exp2, hS_Exp2); + dS_Exp2 -= common_Exp2; + bS_Exp2 -= common_Exp2; + hS_Exp2 -= common_Exp2; + + BigInteger dS = d; + dS.MultiplyPow5(static_cast(dS_Exp5)) <<= static_cast(dS_Exp2); + + BigInteger bS(bInt); + bS.MultiplyPow5(static_cast(bS_Exp5)) <<= static_cast(bS_Exp2); + + BigInteger hS(1); + hS.MultiplyPow5(static_cast(hS_Exp5)) <<= static_cast(hS_Exp2); + + BigInteger delta(0); + dS.Difference(bS, &delta); + + return delta.Compare(hS); +} + +inline bool StrtodFast(double d, int p, double* result) { + // Use fast path for string-to-double conversion if possible + // see http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/ + if (p > 22 && p < 22 + 16) { + // Fast Path Cases In Disguise + d *= internal::Pow10(p - 22); + p = 22; + } + + if (p >= -22 && p <= 22 && d <= 9007199254740991.0) { // 2^53 - 1 + *result = FastPath(d, p); + return true; + } + else + return false; +} + +// Compute an approximation and see if it is within 1/2 ULP +inline bool StrtodDiyFp(const char* decimals, int dLen, int dExp, double* result) { + uint64_t significand = 0; + int i = 0; // 2^64 - 1 = 18446744073709551615, 1844674407370955161 = 0x1999999999999999 + for (; i < dLen; i++) { + if (significand > RAPIDJSON_UINT64_C2(0x19999999, 0x99999999) || + (significand == RAPIDJSON_UINT64_C2(0x19999999, 0x99999999) && decimals[i] > '5')) + break; + significand = significand * 10u + static_cast(decimals[i] - '0'); + } + + if (i < dLen && decimals[i] >= '5') // Rounding + significand++; + + int remaining = dLen - i; + const int kUlpShift = 3; + const int kUlp = 1 << kUlpShift; + int64_t error = (remaining == 0) ? 
0 : kUlp / 2; + + DiyFp v(significand, 0); + v = v.Normalize(); + error <<= -v.e; + + dExp += remaining; + + int actualExp; + DiyFp cachedPower = GetCachedPower10(dExp, &actualExp); + if (actualExp != dExp) { + static const DiyFp kPow10[] = { + DiyFp(RAPIDJSON_UINT64_C2(0xa0000000, 0x00000000), -60), // 10^1 + DiyFp(RAPIDJSON_UINT64_C2(0xc8000000, 0x00000000), -57), // 10^2 + DiyFp(RAPIDJSON_UINT64_C2(0xfa000000, 0x00000000), -54), // 10^3 + DiyFp(RAPIDJSON_UINT64_C2(0x9c400000, 0x00000000), -50), // 10^4 + DiyFp(RAPIDJSON_UINT64_C2(0xc3500000, 0x00000000), -47), // 10^5 + DiyFp(RAPIDJSON_UINT64_C2(0xf4240000, 0x00000000), -44), // 10^6 + DiyFp(RAPIDJSON_UINT64_C2(0x98968000, 0x00000000), -40) // 10^7 + }; + int adjustment = dExp - actualExp; + RAPIDJSON_ASSERT(adjustment >= 1 && adjustment < 8); + v = v * kPow10[adjustment - 1]; + if (dLen + adjustment > 19) // has more digits than decimal digits in 64-bit + error += kUlp / 2; + } + + v = v * cachedPower; + + error += kUlp + (error == 0 ? 0 : 1); + + const int oldExp = v.e; + v = v.Normalize(); + error <<= oldExp - v.e; + + const int effectiveSignificandSize = Double::EffectiveSignificandSize(64 + v.e); + int precisionSize = 64 - effectiveSignificandSize; + if (precisionSize + kUlpShift >= 64) { + int scaleExp = (precisionSize + kUlpShift) - 63; + v.f >>= scaleExp; + v.e += scaleExp; + error = (error >> scaleExp) + 1 + kUlp; + precisionSize -= scaleExp; + } + + DiyFp rounded(v.f >> precisionSize, v.e + precisionSize); + const uint64_t precisionBits = (v.f & ((uint64_t(1) << precisionSize) - 1)) * kUlp; + const uint64_t halfWay = (uint64_t(1) << (precisionSize - 1)) * kUlp; + if (precisionBits >= halfWay + static_cast(error)) { + rounded.f++; + if (rounded.f & (DiyFp::kDpHiddenBit << 1)) { // rounding overflows mantissa (issue #340) + rounded.f >>= 1; + rounded.e++; + } + } + + *result = rounded.ToDouble(); + + return halfWay - static_cast(error) >= precisionBits || precisionBits >= halfWay + static_cast(error); +} + +inline double StrtodBigInteger(double approx, const char* decimals, int dLen, int dExp) { + RAPIDJSON_ASSERT(dLen >= 0); + const BigInteger dInt(decimals, static_cast(dLen)); + Double a(approx); + int cmp = CheckWithinHalfULP(a.Value(), dInt, dExp); + if (cmp < 0) + return a.Value(); // within half ULP + else if (cmp == 0) { + // Round towards even + if (a.Significand() & 1) + return a.NextPositiveDouble(); + else + return a.Value(); + } + else // adjustment + return a.NextPositiveDouble(); +} + +inline double StrtodFullPrecision(double d, int p, const char* decimals, size_t length, size_t decimalPosition, int exp) { + RAPIDJSON_ASSERT(d >= 0.0); + RAPIDJSON_ASSERT(length >= 1); + + double result = 0.0; + if (StrtodFast(d, p, &result)) + return result; + + RAPIDJSON_ASSERT(length <= INT_MAX); + int dLen = static_cast(length); + + RAPIDJSON_ASSERT(length >= decimalPosition); + RAPIDJSON_ASSERT(length - decimalPosition <= INT_MAX); + int dExpAdjust = static_cast(length - decimalPosition); + + RAPIDJSON_ASSERT(exp >= INT_MIN + dExpAdjust); + int dExp = exp - dExpAdjust; + + // Make sure length+dExp does not overflow + RAPIDJSON_ASSERT(dExp <= INT_MAX - dLen); + + // Trim leading zeros + while (dLen > 0 && *decimals == '0') { + dLen--; + decimals++; + } + + // Trim trailing zeros + while (dLen > 0 && decimals[dLen - 1] == '0') { + dLen--; + dExp++; + } + + if (dLen == 0) { // Buffer only contains zeros. 
+ return 0.0; + } + + // Trim right-most digits + const int kMaxDecimalDigit = 767 + 1; + if (dLen > kMaxDecimalDigit) { + dExp += dLen - kMaxDecimalDigit; + dLen = kMaxDecimalDigit; + } + + // If too small, underflow to zero. + // Any x <= 10^-324 is interpreted as zero. + if (dLen + dExp <= -324) + return 0.0; + + // If too large, overflow to infinity. + // Any x >= 10^309 is interpreted as +infinity. + if (dLen + dExp > 309) + return std::numeric_limits::infinity(); + + if (StrtodDiyFp(decimals, dLen, dExp, &result)) + return result; + + // Use approximation from StrtodDiyFp and make adjustment with BigInteger comparison + return StrtodBigInteger(result, decimals, dLen, dExp); +} + +} // namespace internal +RAPIDJSON_NAMESPACE_END + +#endif // RAPIDJSON_STRTOD_ diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/swap.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/swap.h new file mode 100644 index 0000000..666e49f --- /dev/null +++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/internal/swap.h @@ -0,0 +1,46 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef RAPIDJSON_INTERNAL_SWAP_H_ +#define RAPIDJSON_INTERNAL_SWAP_H_ + +#include "../rapidjson.h" + +#if defined(__clang__) +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(c++98-compat) +#endif + +RAPIDJSON_NAMESPACE_BEGIN +namespace internal { + +//! Custom swap() to avoid dependency on C++ header +/*! \tparam T Type of the arguments to swap, should be instantiated with primitive C++ types only. + \note This has the same semantics as std::swap(). +*/ +template +inline void Swap(T& a, T& b) RAPIDJSON_NOEXCEPT { + T tmp = a; + a = b; + b = tmp; +} + +} // namespace internal +RAPIDJSON_NAMESPACE_END + +#if defined(__clang__) +RAPIDJSON_DIAG_POP +#endif + +#endif // RAPIDJSON_INTERNAL_SWAP_H_ diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/istreamwrapper.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/istreamwrapper.h new file mode 100644 index 0000000..5f81698 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/istreamwrapper.h @@ -0,0 +1,113 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#ifndef RAPIDJSON_ISTREAMWRAPPER_H_ +#define RAPIDJSON_ISTREAMWRAPPER_H_ + +#include "stream.h" +#include + +#ifdef __clang__ +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(padded) +#elif defined(_MSC_VER) +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(4351) // new behavior: elements of array 'array' will be default initialized +#endif + +RAPIDJSON_NAMESPACE_BEGIN + +//! Wrapper of \c std::basic_istream into RapidJSON's Stream concept. +/*! + The classes can be wrapped including but not limited to: + + - \c std::istringstream + - \c std::stringstream + - \c std::wistringstream + - \c std::wstringstream + - \c std::ifstream + - \c std::fstream + - \c std::wifstream + - \c std::wfstream + + \tparam StreamType Class derived from \c std::basic_istream. +*/ + +template +class BasicIStreamWrapper { +public: + typedef typename StreamType::char_type Ch; + BasicIStreamWrapper(StreamType& stream) : stream_(stream), count_(), peekBuffer_() {} + + Ch Peek() const { + typename StreamType::int_type c = stream_.peek(); + return RAPIDJSON_LIKELY(c != StreamType::traits_type::eof()) ? static_cast(c) : static_cast('\0'); + } + + Ch Take() { + typename StreamType::int_type c = stream_.get(); + if (RAPIDJSON_LIKELY(c != StreamType::traits_type::eof())) { + count_++; + return static_cast(c); + } + else + return '\0'; + } + + // tellg() may return -1 when failed. So we count by ourself. + size_t Tell() const { return count_; } + + Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } + void Put(Ch) { RAPIDJSON_ASSERT(false); } + void Flush() { RAPIDJSON_ASSERT(false); } + size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } + + // For encoding detection only. + const Ch* Peek4() const { + RAPIDJSON_ASSERT(sizeof(Ch) == 1); // Only usable for byte stream. + int i; + bool hasError = false; + for (i = 0; i < 4; ++i) { + typename StreamType::int_type c = stream_.get(); + if (c == StreamType::traits_type::eof()) { + hasError = true; + stream_.clear(); + break; + } + peekBuffer_[i] = static_cast(c); + } + for (--i; i >= 0; --i) + stream_.putback(peekBuffer_[i]); + return !hasError ? peekBuffer_ : 0; + } + +private: + BasicIStreamWrapper(const BasicIStreamWrapper&); + BasicIStreamWrapper& operator=(const BasicIStreamWrapper&); + + StreamType& stream_; + size_t count_; //!< Number of characters read. Note: + mutable Ch peekBuffer_[4]; +}; + +typedef BasicIStreamWrapper IStreamWrapper; +typedef BasicIStreamWrapper WIStreamWrapper; + +#if defined(__clang__) || defined(_MSC_VER) +RAPIDJSON_DIAG_POP +#endif + +RAPIDJSON_NAMESPACE_END + +#endif // RAPIDJSON_ISTREAMWRAPPER_H_ diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/memorybuffer.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/memorybuffer.h new file mode 100644 index 0000000..39bee1d --- /dev/null +++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/memorybuffer.h @@ -0,0 +1,70 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/memorybuffer.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/memorybuffer.h
new file mode 100644
index 0000000..39bee1d
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/memorybuffer.h
@@ -0,0 +1,70 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_MEMORYBUFFER_H_
+#define RAPIDJSON_MEMORYBUFFER_H_
+
+#include "stream.h"
+#include "internal/stack.h"
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+//! Represents an in-memory output byte stream.
+/*!
+    This class is mainly for being wrapped by EncodedOutputStream or AutoUTFOutputStream.
+
+    It is similar to FileWriteBuffer but the destination is an in-memory buffer instead of a file.
+
+    Differences between MemoryBuffer and StringBuffer:
+    1. StringBuffer has Encoding but MemoryBuffer is only a byte buffer.
+    2. StringBuffer::GetString() returns a null-terminated string. MemoryBuffer::GetBuffer() returns a buffer without terminator.
+
+    \tparam Allocator type for allocating memory buffer.
+    \note implements Stream concept
+*/
+template <typename Allocator = CrtAllocator>
+struct GenericMemoryBuffer {
+    typedef char Ch; // byte
+
+    GenericMemoryBuffer(Allocator* allocator = 0, size_t capacity = kDefaultCapacity) : stack_(allocator, capacity) {}
+
+    void Put(Ch c) { *stack_.template Push<Ch>() = c; }
+    void Flush() {}
+
+    void Clear() { stack_.Clear(); }
+    void ShrinkToFit() { stack_.ShrinkToFit(); }
+    Ch* Push(size_t count) { return stack_.template Push<Ch>(count); }
+    void Pop(size_t count) { stack_.template Pop<Ch>(count); }
+
+    const Ch* GetBuffer() const {
+        return stack_.template Bottom<Ch>();
+    }
+
+    size_t GetSize() const { return stack_.GetSize(); }
+
+    static const size_t kDefaultCapacity = 256;
+    mutable internal::Stack<Allocator> stack_;
+};
+
+typedef GenericMemoryBuffer<> MemoryBuffer;
+
+//! Implement specialized version of PutN() with memset() for better performance.
+template<>
+inline void PutN(MemoryBuffer& memoryBuffer, char c, size_t n) {
+    std::memset(memoryBuffer.stack_.Push<char>(n), c, n * sizeof(c));
+}
+
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_MEMORYBUFFER_H_
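Because GetBuffer() is not null-terminated, MemoryBuffer is normally fed by a Writer and consumed via GetBuffer() and GetSize() together. A minimal sketch (include paths illustrative):

    #include "rapidjson/memorybuffer.h"
    #include "rapidjson/writer.h"
    #include <cstdio>

    int main() {
        rapidjson::MemoryBuffer buf;
        rapidjson::Writer<rapidjson::MemoryBuffer> writer(buf);
        writer.StartObject();
        writer.Key("ok");
        writer.Bool(true);
        writer.EndObject();
        // Always pass the size alongside the bytes; there is no terminator.
        std::fwrite(buf.GetBuffer(), 1, buf.GetSize(), stdout);
        return 0;
    }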
diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/memorystream.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/memorystream.h
new file mode 100644
index 0000000..1d71d8a
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/memorystream.h
@@ -0,0 +1,71 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_MEMORYSTREAM_H_
+#define RAPIDJSON_MEMORYSTREAM_H_
+
+#include "stream.h"
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(unreachable-code)
+RAPIDJSON_DIAG_OFF(missing-noreturn)
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+//! Represents an in-memory input byte stream.
+/*!
+    This class is mainly for being wrapped by EncodedInputStream or AutoUTFInputStream.
+
+    It is similar to FileReadBuffer but the source is an in-memory buffer instead of a file.
+
+    Differences between MemoryStream and StringStream:
+    1. StringStream has encoding but MemoryStream is a byte stream.
+    2. MemoryStream needs the size of the source buffer, and the buffer does not need to be null-terminated. StringStream assumes a null-terminated string as its source.
+    3. MemoryStream supports Peek4() for encoding detection. StringStream is specified with an encoding so it should not have Peek4().
+    \note implements Stream concept
+*/
+struct MemoryStream {
+    typedef char Ch; // byte
+
+    MemoryStream(const Ch *src, size_t size) : src_(src), begin_(src), end_(src + size), size_(size) {}
+
+    Ch Peek() const { return RAPIDJSON_UNLIKELY(src_ == end_) ? '\0' : *src_; }
+    Ch Take() { return RAPIDJSON_UNLIKELY(src_ == end_) ? '\0' : *src_++; }
+    size_t Tell() const { return static_cast<size_t>(src_ - begin_); }
+
+    Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
+    void Put(Ch) { RAPIDJSON_ASSERT(false); }
+    void Flush() { RAPIDJSON_ASSERT(false); }
+    size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
+
+    // For encoding detection only.
+    const Ch* Peek4() const {
+        return Tell() + 4 <= size_ ? src_ : 0;
+    }
+
+    const Ch* src_;     //!< Current read position.
+    const Ch* begin_;   //!< Original head of the string.
+    const Ch* end_;     //!< End of stream.
+    size_t size_;       //!< Size of the stream.
+};
+
+RAPIDJSON_NAMESPACE_END
+
+#ifdef __clang__
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_MEMORYSTREAM_H_
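MemoryStream lets the parser consume a fixed-size, not necessarily null-terminated buffer; for plain UTF-8 input it can feed Document::ParseStream() directly. A minimal sketch (include paths illustrative):

    #include "rapidjson/document.h"
    #include "rapidjson/memorystream.h"

    int main() {
        const char json[] = "{\"n\":1}";
        rapidjson::MemoryStream ms(json, sizeof(json) - 1);  // size given explicitly
        rapidjson::Document d;
        d.ParseStream(ms);
        return d.HasParseError() ? 1 : 0;
    }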
diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/msinttypes/inttypes.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/msinttypes/inttypes.h
new file mode 100644
index 0000000..1811128
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/msinttypes/inttypes.h
@@ -0,0 +1,316 @@
+// ISO C9x compliant inttypes.h for Microsoft Visual Studio
+// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
+//
+// Copyright (c) 2006-2013 Alexander Chemeris
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the product nor the names of its contributors may
+// be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+// The above software in this distribution may have been modified by
+// THL A29 Limited ("Tencent Modifications").
+// All Tencent Modifications are Copyright (C) 2015 THL A29 Limited.
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef _MSC_INTTYPES_H_ // [
+#define _MSC_INTTYPES_H_
+
+#if _MSC_VER > 1000
+#pragma once
+#endif
+
+#include "stdint.h"
+
+// miloyip: VC supports inttypes.h since VC2013
+#if _MSC_VER >= 1800
+#include <inttypes.h>
+#else
+
+// 7.8 Format conversion of integer types
+
+typedef struct {
+   intmax_t quot;
+   intmax_t rem;
+} imaxdiv_t;
+
+// 7.8.1 Macros for format specifiers
+
+#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [ See footnote 185 at page 198
+
+// The fprintf macros for signed integers are:
+#define PRId8 "d"
+#define PRIi8 "i"
+#define PRIdLEAST8 "d"
+#define PRIiLEAST8 "i"
+#define PRIdFAST8 "d"
+#define PRIiFAST8 "i"
+
+#define PRId16 "hd"
+#define PRIi16 "hi"
+#define PRIdLEAST16 "hd"
+#define PRIiLEAST16 "hi"
+#define PRIdFAST16 "hd"
+#define PRIiFAST16 "hi"
+
+#define PRId32 "I32d"
+#define PRIi32 "I32i"
+#define PRIdLEAST32 "I32d"
+#define PRIiLEAST32 "I32i"
+#define PRIdFAST32 "I32d"
+#define PRIiFAST32 "I32i"
+
+#define PRId64 "I64d"
+#define PRIi64 "I64i"
+#define PRIdLEAST64 "I64d"
+#define PRIiLEAST64 "I64i"
+#define PRIdFAST64 "I64d"
+#define PRIiFAST64 "I64i"
+
+#define PRIdMAX "I64d"
+#define PRIiMAX "I64i"
+
+#define PRIdPTR "Id"
+#define PRIiPTR "Ii"
+
+// The fprintf macros for unsigned integers are:
+#define PRIo8 "o"
+#define PRIu8 "u"
+#define PRIx8 "x"
+#define PRIX8 "X"
+#define PRIoLEAST8 "o"
+#define PRIuLEAST8 "u"
+#define PRIxLEAST8 "x"
+#define PRIXLEAST8 "X"
+#define PRIoFAST8 "o"
+#define PRIuFAST8 "u"
+#define PRIxFAST8 "x"
+#define PRIXFAST8 "X"
+
+#define PRIo16 "ho"
+#define PRIu16 "hu"
+#define PRIx16 "hx"
+#define PRIX16 "hX"
+#define PRIoLEAST16 "ho"
+#define PRIuLEAST16 "hu"
+#define PRIxLEAST16 "hx"
+#define PRIXLEAST16 "hX"
+#define PRIoFAST16 "ho"
+#define PRIuFAST16 "hu"
+#define PRIxFAST16 "hx"
+#define PRIXFAST16 "hX"
+
+#define PRIo32 "I32o"
+#define PRIu32 "I32u"
+#define PRIx32 "I32x"
+#define PRIX32 "I32X"
+#define PRIoLEAST32 "I32o"
+#define PRIuLEAST32 "I32u"
+#define PRIxLEAST32 "I32x"
+#define PRIXLEAST32 "I32X"
+#define PRIoFAST32 "I32o"
+#define PRIuFAST32 "I32u"
+#define PRIxFAST32 "I32x"
+#define PRIXFAST32 "I32X"
+
+#define PRIo64 "I64o"
+#define PRIu64 "I64u"
+#define PRIx64 "I64x"
+#define PRIX64 "I64X"
+#define PRIoLEAST64 "I64o"
+#define PRIuLEAST64 "I64u"
+#define PRIxLEAST64 "I64x"
+#define PRIXLEAST64 "I64X"
+#define PRIoFAST64 "I64o"
+#define PRIuFAST64 "I64u"
+#define PRIxFAST64 "I64x"
+#define PRIXFAST64 "I64X"
+
+#define PRIoMAX "I64o"
+#define PRIuMAX "I64u"
+#define PRIxMAX "I64x"
+#define PRIXMAX "I64X"
+
+#define PRIoPTR "Io"
+#define PRIuPTR "Iu"
+#define PRIxPTR "Ix"
+#define PRIXPTR "IX"
+
+// The fscanf macros for signed integers are:
+#define SCNd8 "d"
+#define SCNi8 "i"
+#define SCNdLEAST8 "d"
+#define SCNiLEAST8 "i"
+#define SCNdFAST8 "d"
+#define SCNiFAST8 "i"
+
+#define SCNd16 "hd"
+#define SCNi16 "hi"
+#define SCNdLEAST16 "hd"
+#define SCNiLEAST16 "hi"
+#define SCNdFAST16 "hd"
+#define SCNiFAST16 "hi"
+
+#define SCNd32 "ld"
+#define SCNi32 "li"
+#define SCNdLEAST32 "ld"
+#define SCNiLEAST32 "li"
+#define SCNdFAST32 "ld"
+#define SCNiFAST32 "li"
+
+#define SCNd64 "I64d"
+#define SCNi64 "I64i"
+#define SCNdLEAST64 "I64d"
+#define SCNiLEAST64 "I64i"
+#define SCNdFAST64 "I64d"
+#define SCNiFAST64 "I64i"
+
+#define SCNdMAX "I64d"
+#define SCNiMAX "I64i"
+
+#ifdef _WIN64 // [
+# define SCNdPTR "I64d"
+# define SCNiPTR "I64i"
+#else // _WIN64 ][
+# define SCNdPTR "ld"
+# define SCNiPTR "li"
+#endif // _WIN64 ]
+
+// The fscanf macros for unsigned integers are:
+#define SCNo8 "o"
+#define SCNu8 "u"
+#define SCNx8 "x"
+#define SCNX8 "X"
+#define SCNoLEAST8 "o"
+#define SCNuLEAST8 "u"
+#define SCNxLEAST8 "x"
+#define SCNXLEAST8 "X"
+#define SCNoFAST8 "o"
+#define SCNuFAST8 "u"
+#define SCNxFAST8 "x"
+#define SCNXFAST8 "X"
+
+#define SCNo16 "ho"
+#define SCNu16 "hu"
+#define SCNx16 "hx"
+#define SCNX16 "hX"
+#define SCNoLEAST16 "ho"
+#define SCNuLEAST16 "hu"
+#define SCNxLEAST16 "hx"
+#define SCNXLEAST16 "hX"
+#define SCNoFAST16 "ho"
+#define SCNuFAST16 "hu"
+#define SCNxFAST16 "hx"
+#define SCNXFAST16 "hX"
+
+#define SCNo32 "lo"
+#define SCNu32 "lu"
+#define SCNx32 "lx"
+#define SCNX32 "lX"
+#define SCNoLEAST32 "lo"
+#define SCNuLEAST32 "lu"
+#define SCNxLEAST32 "lx"
+#define SCNXLEAST32 "lX"
+#define SCNoFAST32 "lo"
+#define SCNuFAST32 "lu"
+#define SCNxFAST32 "lx"
+#define SCNXFAST32 "lX"
+
+#define SCNo64 "I64o"
+#define SCNu64 "I64u"
+#define SCNx64 "I64x"
+#define SCNX64 "I64X"
+#define SCNoLEAST64 "I64o"
+#define SCNuLEAST64 "I64u"
+#define SCNxLEAST64 "I64x"
+#define SCNXLEAST64 "I64X"
+#define SCNoFAST64 "I64o"
+#define SCNuFAST64 "I64u"
+#define SCNxFAST64 "I64x"
+#define SCNXFAST64 "I64X"
+
+#define SCNoMAX "I64o"
+#define SCNuMAX "I64u"
+#define SCNxMAX "I64x"
+#define SCNXMAX "I64X"
+
+#ifdef _WIN64 // [
+# define SCNoPTR "I64o"
+# define SCNuPTR "I64u"
+# define SCNxPTR "I64x"
+# define SCNXPTR "I64X"
+#else // _WIN64 ][
+# define SCNoPTR "lo"
+# define SCNuPTR "lu"
+# define SCNxPTR "lx"
+# define SCNXPTR "lX"
+#endif // _WIN64 ]
+
+#endif // __STDC_FORMAT_MACROS ]
+
+// 7.8.2 Functions for greatest-width integer types
+
+// 7.8.2.1 The imaxabs function
+#define imaxabs _abs64
+
+// 7.8.2.2 The imaxdiv function
+
+// This is a modified version of the div() function from Microsoft's div.c found
+// in %MSVC.NET%\crt\src\div.c
+#ifdef STATIC_IMAXDIV // [
+static
+#else // STATIC_IMAXDIV ][
+_inline
+#endif // STATIC_IMAXDIV ]
+imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom)
+{
+   imaxdiv_t result;
+
+   result.quot = numer / denom;
+   result.rem = numer % denom;
+
+   if (numer < 0 && result.rem > 0) {
+      // did division wrong; must fix up
+      ++result.quot;
+      result.rem -= denom;
+   }
+
+   return result;
+}
+
+// 7.8.2.3 The strtoimax and strtoumax functions
+#define strtoimax _strtoi64
+#define strtoumax _strtoui64
+
+// 7.8.2.4 The wcstoimax and wcstoumax functions
+#define wcstoimax _wcstoi64
+#define wcstoumax _wcstoui64
+
+#endif // _MSC_VER >= 1800
+
+#endif // _MSC_INTTYPES_H_ ]
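These PRI*/SCN* macros reproduce the C99 <inttypes.h> format specifiers on pre-2013 MSVC, e.g. mapping PRId64 to the MSVC-specific "I64d". A quick usage sketch; on modern compilers the standard header supplies the same macros, so the vendored msinttypes path shown in the comment is only needed on old MSVC:

    #include <cstdio>
    #include <cinttypes>   // old MSVC would use the vendored msinttypes/inttypes.h instead

    int main() {
        int64_t v = 42;
        std::printf("%" PRId64 "\n", v);  // expands to the right specifier per platform
        return 0;
    }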
diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/msinttypes/stdint.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/msinttypes/stdint.h
new file mode 100644
index 0000000..3d4477b
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/msinttypes/stdint.h
@@ -0,0 +1,300 @@
+// ISO C9x compliant stdint.h for Microsoft Visual Studio
+// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
+//
+// Copyright (c) 2006-2013 Alexander Chemeris
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the product nor the names of its contributors may
+// be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+// The above software in this distribution may have been modified by
+// THL A29 Limited ("Tencent Modifications").
+// All Tencent Modifications are Copyright (C) 2015 THL A29 Limited.
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef _MSC_STDINT_H_ // [
+#define _MSC_STDINT_H_
+
+#if _MSC_VER > 1000
+#pragma once
+#endif
+
+// miloyip: Originally Visual Studio 2010 uses its own stdint.h. However it generates warning with INT64_C(), so change to use this file for vs2010.
+#if _MSC_VER >= 1600 // [
+#include <stdint.h>
+
+#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260
+
+#undef INT8_C
+#undef INT16_C
+#undef INT32_C
+#undef INT64_C
+#undef UINT8_C
+#undef UINT16_C
+#undef UINT32_C
+#undef UINT64_C
+
+// 7.18.4.1 Macros for minimum-width integer constants
+
+#define INT8_C(val) val##i8
+#define INT16_C(val) val##i16
+#define INT32_C(val) val##i32
+#define INT64_C(val) val##i64
+
+#define UINT8_C(val) val##ui8
+#define UINT16_C(val) val##ui16
+#define UINT32_C(val) val##ui32
+#define UINT64_C(val) val##ui64
+
+// 7.18.4.2 Macros for greatest-width integer constants
+// These #ifndef's are needed to prevent collisions with <stdint.h>.
+// Check out Issue 9 for the details.
+#ifndef INTMAX_C // [
+# define INTMAX_C INT64_C
+#endif // INTMAX_C ]
+#ifndef UINTMAX_C // [
+# define UINTMAX_C UINT64_C
+#endif // UINTMAX_C ]
+
+#endif // __STDC_CONSTANT_MACROS ]
+
+#else // ] _MSC_VER >= 1600 [
+
+#include <limits.h>
+
+// For Visual Studio 6 in C++ mode and for many Visual Studio versions when
+// compiling for ARM we have to wrap <wchar.h> include with 'extern "C++" {}'
+// or compiler would give many errors like this:
+//   error C2733: second C linkage of overloaded function 'wmemchr' not allowed
+#if defined(__cplusplus) && !defined(_M_ARM)
+extern "C" {
+#endif
+#  include <wchar.h>
+#if defined(__cplusplus) && !defined(_M_ARM)
+}
+#endif
+
+// Define _W64 macros to mark types changing their size, like intptr_t.
+#ifndef _W64
+# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300
+#   define _W64 __w64
+# else
+#   define _W64
+# endif
+#endif
+
+
+// 7.18.1 Integer types
+
+// 7.18.1.1 Exact-width integer types
+
+// Visual Studio 6 and Embedded Visual C++ 4 don't
+// realize that, e.g. char has the same size as __int8
+// so we give up on __intX for them.
+#if (_MSC_VER < 1300)
+   typedef signed char int8_t;
+   typedef signed short int16_t;
+   typedef signed int int32_t;
+   typedef unsigned char uint8_t;
+   typedef unsigned short uint16_t;
+   typedef unsigned int uint32_t;
+#else
+   typedef signed __int8 int8_t;
+   typedef signed __int16 int16_t;
+   typedef signed __int32 int32_t;
+   typedef unsigned __int8 uint8_t;
+   typedef unsigned __int16 uint16_t;
+   typedef unsigned __int32 uint32_t;
+#endif
+typedef signed __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+
+
+// 7.18.1.2 Minimum-width integer types
+typedef int8_t int_least8_t;
+typedef int16_t int_least16_t;
+typedef int32_t int_least32_t;
+typedef int64_t int_least64_t;
+typedef uint8_t uint_least8_t;
+typedef uint16_t uint_least16_t;
+typedef uint32_t uint_least32_t;
+typedef uint64_t uint_least64_t;
+
+// 7.18.1.3 Fastest minimum-width integer types
+typedef int8_t int_fast8_t;
+typedef int16_t int_fast16_t;
+typedef int32_t int_fast32_t;
+typedef int64_t int_fast64_t;
+typedef uint8_t uint_fast8_t;
+typedef uint16_t uint_fast16_t;
+typedef uint32_t uint_fast32_t;
+typedef uint64_t uint_fast64_t;
+
+// 7.18.1.4 Integer types capable of holding object pointers
+#ifdef _WIN64 // [
+   typedef signed __int64 intptr_t;
+   typedef unsigned __int64 uintptr_t;
+#else // _WIN64 ][
+   typedef _W64 signed int intptr_t;
+   typedef _W64 unsigned int uintptr_t;
+#endif // _WIN64 ]
+
+// 7.18.1.5 Greatest-width integer types
+typedef int64_t intmax_t;
+typedef uint64_t uintmax_t;
+
+
+// 7.18.2 Limits of specified-width integer types
+
+#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259
+
+// 7.18.2.1 Limits of exact-width integer types
+#define INT8_MIN ((int8_t)_I8_MIN)
+#define INT8_MAX _I8_MAX
+#define INT16_MIN ((int16_t)_I16_MIN)
+#define INT16_MAX _I16_MAX
+#define INT32_MIN ((int32_t)_I32_MIN)
+#define INT32_MAX _I32_MAX
+#define INT64_MIN ((int64_t)_I64_MIN)
+#define INT64_MAX _I64_MAX
+#define UINT8_MAX _UI8_MAX
+#define UINT16_MAX _UI16_MAX
+#define UINT32_MAX _UI32_MAX
+#define UINT64_MAX _UI64_MAX
+
+// 7.18.2.2 Limits of minimum-width integer types
+#define INT_LEAST8_MIN INT8_MIN
+#define INT_LEAST8_MAX INT8_MAX
+#define INT_LEAST16_MIN INT16_MIN
+#define INT_LEAST16_MAX INT16_MAX
+#define INT_LEAST32_MIN INT32_MIN
+#define INT_LEAST32_MAX INT32_MAX
+#define INT_LEAST64_MIN INT64_MIN
+#define INT_LEAST64_MAX INT64_MAX
+#define UINT_LEAST8_MAX UINT8_MAX
+#define UINT_LEAST16_MAX UINT16_MAX
+#define UINT_LEAST32_MAX UINT32_MAX
+#define UINT_LEAST64_MAX UINT64_MAX
+
+// 7.18.2.3 Limits of fastest minimum-width integer types
+#define INT_FAST8_MIN INT8_MIN
+#define INT_FAST8_MAX INT8_MAX
+#define INT_FAST16_MIN INT16_MIN
+#define INT_FAST16_MAX INT16_MAX
+#define INT_FAST32_MIN INT32_MIN
+#define INT_FAST32_MAX INT32_MAX
+#define INT_FAST64_MIN INT64_MIN
+#define INT_FAST64_MAX INT64_MAX
+#define UINT_FAST8_MAX UINT8_MAX
+#define UINT_FAST16_MAX UINT16_MAX
+#define UINT_FAST32_MAX UINT32_MAX
+#define UINT_FAST64_MAX UINT64_MAX
+
+// 7.18.2.4 Limits of integer types capable of holding object pointers
+#ifdef _WIN64 // [
+# define INTPTR_MIN INT64_MIN
+# define INTPTR_MAX INT64_MAX
+# define UINTPTR_MAX UINT64_MAX
+#else // _WIN64 ][
+# define INTPTR_MIN INT32_MIN
+# define INTPTR_MAX INT32_MAX
+# define UINTPTR_MAX UINT32_MAX
+#endif // _WIN64 ]
+
+// 7.18.2.5 Limits of greatest-width integer types
+#define INTMAX_MIN INT64_MIN
+#define INTMAX_MAX INT64_MAX
+#define UINTMAX_MAX UINT64_MAX
+
+// 7.18.3 Limits of other integer types
+
+#ifdef _WIN64 // [
+# define PTRDIFF_MIN _I64_MIN
+# define PTRDIFF_MAX _I64_MAX
+#else // _WIN64 ][
+# define PTRDIFF_MIN _I32_MIN
+# define PTRDIFF_MAX _I32_MAX
+#endif // _WIN64 ]
+
+#define SIG_ATOMIC_MIN INT_MIN
+#define SIG_ATOMIC_MAX INT_MAX
+
+#ifndef SIZE_MAX // [
+# ifdef _WIN64 // [
+#   define SIZE_MAX _UI64_MAX
+# else // _WIN64 ][
+#   define SIZE_MAX _UI32_MAX
+# endif // _WIN64 ]
+#endif // SIZE_MAX ]
+
+// WCHAR_MIN and WCHAR_MAX are also defined in <wchar.h>
+#ifndef WCHAR_MIN // [
+# define WCHAR_MIN 0
+#endif // WCHAR_MIN ]
+#ifndef WCHAR_MAX // [
+# define WCHAR_MAX _UI16_MAX
+#endif // WCHAR_MAX ]
+
+#define WINT_MIN 0
+#define WINT_MAX _UI16_MAX
+
+#endif // __STDC_LIMIT_MACROS ]
+
+
+// 7.18.4 Limits of other integer types
+
+#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260
+
+// 7.18.4.1 Macros for minimum-width integer constants
+
+#define INT8_C(val) val##i8
+#define INT16_C(val) val##i16
+#define INT32_C(val) val##i32
+#define INT64_C(val) val##i64
+
+#define UINT8_C(val) val##ui8
+#define UINT16_C(val) val##ui16
+#define UINT32_C(val) val##ui32
+#define UINT64_C(val) val##ui64
+
+// 7.18.4.2 Macros for greatest-width integer constants
+// These #ifndef's are needed to prevent collisions with <stdint.h>.
+// Check out Issue 9 for the details.
+#ifndef INTMAX_C // [
+# define INTMAX_C INT64_C
+#endif // INTMAX_C ]
+#ifndef UINTMAX_C // [
+# define UINTMAX_C UINT64_C
+#endif // UINTMAX_C ]
+
+#endif // __STDC_CONSTANT_MACROS ]
+
+#endif // _MSC_VER >= 1600 ]
+
+#endif // _MSC_STDINT_H_ ]
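The INT*_C macros above stamp out correctly suffixed integer constants (i64/ui64 on old MSVC). A small sketch; note the header honours __STDC_CONSTANT_MACROS when compiled as C++, and modern toolchains get the same names from the standard header:

    #define __STDC_CONSTANT_MACROS
    #include <cstdint>   // old MSVC would use the vendored msinttypes/stdint.h instead

    int main() {
        int64_t big = INT64_C(9000000000);  // suffixed so the literal is 64-bit even on 32-bit targets
        return big > INT32_MAX ? 0 : 1;
    }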
diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/ostreamwrapper.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/ostreamwrapper.h
new file mode 100644
index 0000000..6f4667c
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/ostreamwrapper.h
@@ -0,0 +1,81 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_OSTREAMWRAPPER_H_
+#define RAPIDJSON_OSTREAMWRAPPER_H_
+
+#include "stream.h"
+#include <iosfwd>
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(padded)
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+//! Wrapper of \c std::basic_ostream into RapidJSON's Stream concept.
+/*!
+    The classes that can be wrapped include, but are not limited to:
+
+    - \c std::ostringstream
+    - \c std::stringstream
+    - \c std::wostringstream
+    - \c std::wstringstream
+    - \c std::ofstream
+    - \c std::fstream
+    - \c std::wofstream
+    - \c std::wfstream
+
+    \tparam StreamType Class derived from \c std::basic_ostream.
+*/
+
+template <typename StreamType>
+class BasicOStreamWrapper {
+public:
+    typedef typename StreamType::char_type Ch;
+    BasicOStreamWrapper(StreamType& stream) : stream_(stream) {}
+
+    void Put(Ch c) {
+        stream_.put(c);
+    }
+
+    void Flush() {
+        stream_.flush();
+    }
+
+    // Not implemented
+    char Peek() const { RAPIDJSON_ASSERT(false); return 0; }
+    char Take() { RAPIDJSON_ASSERT(false); return 0; }
+    size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; }
+    char* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
+    size_t PutEnd(char*) { RAPIDJSON_ASSERT(false); return 0; }
+
+private:
+    BasicOStreamWrapper(const BasicOStreamWrapper&);
+    BasicOStreamWrapper& operator=(const BasicOStreamWrapper&);
+
+    StreamType& stream_;
+};
+
+typedef BasicOStreamWrapper<std::ostream> OStreamWrapper;
+typedef BasicOStreamWrapper<std::wostream> WOStreamWrapper;
+
+#ifdef __clang__
+RAPIDJSON_DIAG_POP
+#endif
+
+RAPIDJSON_NAMESPACE_END
+
+#endif // RAPIDJSON_OSTREAMWRAPPER_H_
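OStreamWrapper is the output-side counterpart: wrap any std::ostream and hand it to a Writer. A minimal sketch (out.json and the include paths are illustrative):

    #include "rapidjson/document.h"
    #include "rapidjson/ostreamwrapper.h"
    #include "rapidjson/writer.h"
    #include <fstream>

    int main() {
        rapidjson::Document d;
        d.Parse("{\"a\":1}");
        std::ofstream ofs("out.json");
        rapidjson::OStreamWrapper osw(ofs);
        rapidjson::Writer<rapidjson::OStreamWrapper> writer(osw);
        d.Accept(writer);  // serializes the DOM through the wrapper
        return 0;
    }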
diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/pointer.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/pointer.h
new file mode 100644
index 0000000..3d339f2
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/pointer.h
@@ -0,0 +1,1357 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_POINTER_H_
+#define RAPIDJSON_POINTER_H_
+
+#include "document.h"
+#include "internal/itoa.h"
+
+#ifdef __clang__
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(switch-enum)
+#elif defined(_MSC_VER)
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(4512) // assignment operator could not be generated
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+static const SizeType kPointerInvalidIndex = ~SizeType(0);  //!< Represents an invalid index in GenericPointer::Token
+
+//! Error code of parsing.
+/*! \ingroup RAPIDJSON_ERRORS
+    \see GenericPointer::GenericPointer, GenericPointer::GetParseErrorCode
+*/
+enum PointerParseErrorCode {
+    kPointerParseErrorNone = 0,                     //!< The parse is successful
+
+    kPointerParseErrorTokenMustBeginWithSolidus,    //!< A token must begin with a '/'
+    kPointerParseErrorInvalidEscape,                //!< Invalid escape
+    kPointerParseErrorInvalidPercentEncoding,       //!< Invalid percent encoding in URI fragment
+    kPointerParseErrorCharacterMustPercentEncode    //!< A character must be percent encoded in URI fragment
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// GenericPointer
+
+//! Represents a JSON Pointer. Use Pointer for UTF8 encoding and default allocator.
+/*!
+    This class implements RFC 6901 "JavaScript Object Notation (JSON) Pointer"
+    (https://tools.ietf.org/html/rfc6901).
+
+    A JSON pointer is for identifying a specific value in a JSON document
+    (GenericDocument). It can simplify coding of DOM tree manipulation, because it
+    can access values multiple levels deep in the DOM tree with a single API call.
+
+    After it parses a string representation (e.g. "/foo/0") or URI fragment
+    representation (e.g. "#/foo/0") into its internal representation (tokens),
+    it can be used to resolve a specific value in multiple documents, or a sub-tree
+    of documents.
+
+    Contrary to GenericValue, Pointer can be copy constructed and copy assigned.
+    Apart from assignment, a Pointer cannot be modified after construction.
+
+    Although Pointer is very convenient, please be aware that constructing Pointer
+    involves parsing and dynamic memory allocation. A special constructor with user-
+    supplied tokens eliminates these.
+
+    GenericPointer depends on GenericDocument and GenericValue.
+
+    \tparam ValueType The value type of the DOM tree. E.g. GenericValue<UTF8<> >
+    \tparam Allocator The allocator type for allocating memory for internal representation.
+
+    \note GenericPointer uses the same encoding as ValueType.
+    However, the Allocator of GenericPointer is independent of the Allocator of Value.
+*/
+template <typename ValueType, typename Allocator = CrtAllocator>
+class GenericPointer {
+public:
+    typedef typename ValueType::EncodingType EncodingType;  //!< Encoding type from Value
+    typedef typename ValueType::Ch Ch;                      //!< Character type from Value
+
+    //! A token is the basic unit of internal representation.
+    /*!
+        A JSON pointer string representation "/foo/123" is parsed to two tokens:
+        "foo" and 123. 123 will be represented in both numeric form and string form.
+        They are resolved according to the actual value type (object or array).
+
+        For tokens that are not numbers, or whose numeric value is out of bound
+        (greater than the limits of SizeType), only the string form is used
+        (i.e. the token's index will be equal to kPointerInvalidIndex).
+
+        This struct is public so that users can create a Pointer without parsing and
+        allocation, using a special constructor.
+    */
+    struct Token {
+        const Ch* name;     //!< Name of the token. It is null-terminated but may also contain null characters within.
+        SizeType length;    //!< Length of the name.
+        SizeType index;     //!< A valid array index, if it is not equal to kPointerInvalidIndex.
+    };
+
+    //!@name Constructors and destructor.
+    //@{
+
+    //! Default constructor.
+    GenericPointer(Allocator* allocator = 0) : allocator_(allocator), ownAllocator_(), nameBuffer_(), tokens_(), tokenCount_(), parseErrorOffset_(), parseErrorCode_(kPointerParseErrorNone) {}
+
+    //! Constructor that parses a string or URI fragment representation.
+    /*!
+        \param source A null-terminated string or URI fragment representation of a JSON pointer.
+        \param allocator User supplied allocator for this pointer. If no allocator is provided, it creates a self-owned one.
+    */
+    explicit GenericPointer(const Ch* source, Allocator* allocator = 0) : allocator_(allocator), ownAllocator_(), nameBuffer_(), tokens_(), tokenCount_(), parseErrorOffset_(), parseErrorCode_(kPointerParseErrorNone) {
+        Parse(source, internal::StrLen(source));
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Constructor that parses a string or URI fragment representation.
+    /*!
+        \param source A string or URI fragment representation of a JSON pointer.
+        \param allocator User supplied allocator for this pointer. If no allocator is provided, it creates a self-owned one.
+        \note Requires the definition of the preprocessor symbol \ref RAPIDJSON_HAS_STDSTRING.
+    */
+    explicit GenericPointer(const std::basic_string<Ch>& source, Allocator* allocator = 0) : allocator_(allocator), ownAllocator_(), nameBuffer_(), tokens_(), tokenCount_(), parseErrorOffset_(), parseErrorCode_(kPointerParseErrorNone) {
+        Parse(source.c_str(), source.size());
+    }
+#endif
+
+    //! Constructor that parses a string or URI fragment representation, with length of the source string.
+    /*!
+        \param source A string or URI fragment representation of a JSON pointer.
+        \param length Length of source.
+        \param allocator User supplied allocator for this pointer. If no allocator is provided, it creates a self-owned one.
+        \note Slightly faster than the overload without length.
+    */
+    GenericPointer(const Ch* source, size_t length, Allocator* allocator = 0) : allocator_(allocator), ownAllocator_(), nameBuffer_(), tokens_(), tokenCount_(), parseErrorOffset_(), parseErrorCode_(kPointerParseErrorNone) {
+        Parse(source, length);
+    }
+
+    //! Constructor with user-supplied tokens.
+    /*!
+        This constructor lets the user supply a const array of tokens.
+        This skips the parsing process and eliminates allocation.
+        This is preferred for memory constrained environments.
+
+        \param tokens A constant array of tokens representing the JSON pointer.
+        \param tokenCount Number of tokens.
+
+        \b Example
+        \code
+        #define NAME(s) { s, sizeof(s) / sizeof(s[0]) - 1, kPointerInvalidIndex }
+        #define INDEX(i) { #i, sizeof(#i) - 1, i }
+
+        static const Pointer::Token kTokens[] = { NAME("foo"), INDEX(123) };
+        static const Pointer p(kTokens, sizeof(kTokens) / sizeof(kTokens[0]));
+        // Equivalent to static const Pointer p("/foo/123");
+
+        #undef NAME
+        #undef INDEX
+        \endcode
+    */
+    GenericPointer(const Token* tokens, size_t tokenCount) : allocator_(), ownAllocator_(), nameBuffer_(), tokens_(const_cast<Token*>(tokens)), tokenCount_(tokenCount), parseErrorOffset_(), parseErrorCode_(kPointerParseErrorNone) {}
+
+    //! Copy constructor.
+    GenericPointer(const GenericPointer& rhs) : allocator_(rhs.allocator_), ownAllocator_(), nameBuffer_(), tokens_(), tokenCount_(), parseErrorOffset_(), parseErrorCode_(kPointerParseErrorNone) {
+        *this = rhs;
+    }
+
+    //! Copy constructor.
+    GenericPointer(const GenericPointer& rhs, Allocator* allocator) : allocator_(allocator), ownAllocator_(), nameBuffer_(), tokens_(), tokenCount_(), parseErrorOffset_(), parseErrorCode_(kPointerParseErrorNone) {
+        *this = rhs;
+    }
+
+    //! Destructor.
+    ~GenericPointer() {
+        if (nameBuffer_) // If the user-supplied tokens constructor was used, nameBuffer_ is nullptr and tokens_ are not deallocated.
+            Allocator::Free(tokens_);
+        RAPIDJSON_DELETE(ownAllocator_);
+    }
+
+    //! Assignment operator.
+    GenericPointer& operator=(const GenericPointer& rhs) {
+        if (this != &rhs) {
+            // Do not delete ownAllocator
+            if (nameBuffer_)
+                Allocator::Free(tokens_);
+
+            tokenCount_ = rhs.tokenCount_;
+            parseErrorOffset_ = rhs.parseErrorOffset_;
+            parseErrorCode_ = rhs.parseErrorCode_;
+
+            if (rhs.nameBuffer_)
+                CopyFromRaw(rhs); // Normally parsed tokens.
+            else {
+                tokens_ = rhs.tokens_; // User supplied const tokens.
+                nameBuffer_ = 0;
+            }
+        }
+        return *this;
+    }
+
+    //@}
+
+    //!@name Append token
+    //@{
+
+    //! Append a token and return a new Pointer
+    /*!
+        \param token Token to be appended.
+        \param allocator Allocator for the newly returned Pointer.
+        \return A new Pointer with appended token.
+    */
+    GenericPointer Append(const Token& token, Allocator* allocator = 0) const {
+        GenericPointer r;
+        r.allocator_ = allocator;
+        Ch *p = r.CopyFromRaw(*this, 1, token.length + 1);
+        std::memcpy(p, token.name, (token.length + 1) * sizeof(Ch));
+        r.tokens_[tokenCount_].name = p;
+        r.tokens_[tokenCount_].length = token.length;
+        r.tokens_[tokenCount_].index = token.index;
+        return r;
+    }
+
+    //! Append a name token with length, and return a new Pointer
+    /*!
+        \param name Name to be appended.
+        \param length Length of name.
+        \param allocator Allocator for the newly returned Pointer.
+        \return A new Pointer with appended token.
+    */
+    GenericPointer Append(const Ch* name, SizeType length, Allocator* allocator = 0) const {
+        Token token = { name, length, kPointerInvalidIndex };
+        return Append(token, allocator);
+    }
+
+    //! Append a name token without length, and return a new Pointer
+    /*!
+        \param name Name (const Ch*) to be appended.
+        \param allocator Allocator for the newly returned Pointer.
+        \return A new Pointer with appended token.
+    */
+    template <typename T>
+    RAPIDJSON_DISABLEIF_RETURN((internal::NotExpr<internal::IsSame<typename internal::RemoveConst<T>::Type, Ch> >), (GenericPointer))
+    Append(T* name, Allocator* allocator = 0) const {
+        return Append(name, internal::StrLen(name), allocator);
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Append a name token, and return a new Pointer
+    /*!
+        \param name Name to be appended.
+        \param allocator Allocator for the newly returned Pointer.
+        \return A new Pointer with appended token.
+    */
+    GenericPointer Append(const std::basic_string<Ch>& name, Allocator* allocator = 0) const {
+        return Append(name.c_str(), static_cast<SizeType>(name.size()), allocator);
+    }
+#endif
+
+    //! Append an index token, and return a new Pointer
+    /*!
+        \param index Index to be appended.
+        \param allocator Allocator for the newly returned Pointer.
+        \return A new Pointer with appended token.
+    */
+    GenericPointer Append(SizeType index, Allocator* allocator = 0) const {
+        char buffer[21];
+        char* end = sizeof(SizeType) == 4 ? internal::u32toa(index, buffer) : internal::u64toa(index, buffer);
+        SizeType length = static_cast<SizeType>(end - buffer);
+        buffer[length] = '\0';
+
+        if (sizeof(Ch) == 1) {
+            Token token = { reinterpret_cast<Ch*>(buffer), length, index };
+            return Append(token, allocator);
+        }
+        else {
+            Ch name[21];
+            for (size_t i = 0; i <= length; i++)
+                name[i] = static_cast<Ch>(buffer[i]);
+            Token token = { name, length, index };
+            return Append(token, allocator);
+        }
+    }
+
+    //! Append a token by value, and return a new Pointer
+    /*!
+        \param token Token to be appended.
+        \param allocator Allocator for the newly returned Pointer.
+        \return A new Pointer with appended token.
+    */
+    GenericPointer Append(const ValueType& token, Allocator* allocator = 0) const {
+        if (token.IsString())
+            return Append(token.GetString(), token.GetStringLength(), allocator);
+        else {
+            RAPIDJSON_ASSERT(token.IsUint64());
+            RAPIDJSON_ASSERT(token.GetUint64() <= SizeType(~0));
+            return Append(static_cast<SizeType>(token.GetUint64()), allocator);
+        }
+    }
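Since a Pointer is immutable after construction, the Append() overloads compose by returning a new Pointer with one extra token; name, index and generic-value tokens all funnel into Append(const Token&). A small sketch of the resulting style:

    #include "rapidjson/pointer.h"

    int main() {
        using rapidjson::Pointer;
        // "/foo" + "bar" + 0  ->  "/foo/bar/0"
        Pointer p = Pointer("/foo").Append("bar").Append(rapidjson::SizeType(0));
        return (p.IsValid() && p.GetTokenCount() == 3) ? 0 : 1;
    }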
+
+    //!@name Handling Parse Error
+    //@{
+
+    //! Check whether this is a valid pointer.
+    bool IsValid() const { return parseErrorCode_ == kPointerParseErrorNone; }
+
+    //! Get the parsing error offset in code unit.
+    size_t GetParseErrorOffset() const { return parseErrorOffset_; }
+
+    //! Get the parsing error code.
+    PointerParseErrorCode GetParseErrorCode() const { return parseErrorCode_; }
+
+    //@}
+
+    //! Get the allocator of this pointer.
+    Allocator& GetAllocator() { return *allocator_; }
+
+    //!@name Tokens
+    //@{
+
+    //! Get the token array (const version only).
+    const Token* GetTokens() const { return tokens_; }
+
+    //! Get the number of tokens.
+    size_t GetTokenCount() const { return tokenCount_; }
+
+    //@}
+
+    //!@name Equality/inequality operators
+    //@{
+
+    //! Equality operator.
+    /*!
+        \note When any pointers are invalid, always returns false.
+    */
+    bool operator==(const GenericPointer& rhs) const {
+        if (!IsValid() || !rhs.IsValid() || tokenCount_ != rhs.tokenCount_)
+            return false;
+
+        for (size_t i = 0; i < tokenCount_; i++) {
+            if (tokens_[i].index != rhs.tokens_[i].index ||
+                tokens_[i].length != rhs.tokens_[i].length ||
+                (tokens_[i].length != 0 && std::memcmp(tokens_[i].name, rhs.tokens_[i].name, sizeof(Ch) * tokens_[i].length) != 0))
+            {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    //! Inequality operator.
+    /*!
+        \note When any pointers are invalid, always returns true.
+    */
+    bool operator!=(const GenericPointer& rhs) const { return !(*this == rhs); }
+
+    //@}
+
+    //!@name Stringify
+    //@{
+
+    //! Stringify the pointer into string representation.
+    /*!
+        \tparam OutputStream Type of output stream.
+        \param os The output stream.
+    */
+    template <typename OutputStream>
+    bool Stringify(OutputStream& os) const {
+        return Stringify<false, OutputStream>(os);
+    }
+
+    //! Stringify the pointer into URI fragment representation.
+    /*!
+        \tparam OutputStream Type of output stream.
+        \param os The output stream.
+    */
+    template <typename OutputStream>
+    bool StringifyUriFragment(OutputStream& os) const {
+        return Stringify<true, OutputStream>(os);
+    }
+
+    //@}
+
+    //!@name Create value
+    //@{
+
+    //! Create a value in a subtree.
+    /*!
+        If the value does not exist, it creates all parent values and a JSON Null value.
+        So it always succeeds and returns the newly created or existing value.
+
+        Bear in mind that it may change the types of parents according to the tokens, so it
+        can potentially remove previously stored values. For example, if a document
+        was an array, and "/foo" is used to create a value, then the document
+        will be changed to an object, and all existing array elements are lost.
+
+        \param root Root value of a DOM subtree to be resolved. It can be any value other than document root.
+        \param allocator Allocator for creating the values if the specified value or its parents do not exist.
+        \param alreadyExist If non-null, it stores whether the resolved value already exists.
+        \return The resolved newly created (a JSON Null value), or already existing value.
+    */
+    ValueType& Create(ValueType& root, typename ValueType::AllocatorType& allocator, bool* alreadyExist = 0) const {
+        RAPIDJSON_ASSERT(IsValid());
+        ValueType* v = &root;
+        bool exist = true;
+        for (const Token *t = tokens_; t != tokens_ + tokenCount_; ++t) {
+            if (v->IsArray() && t->name[0] == '-' && t->length == 1) {
+                v->PushBack(ValueType().Move(), allocator);
+                v = &((*v)[v->Size() - 1]);
+                exist = false;
+            }
+            else {
+                if (t->index == kPointerInvalidIndex) { // must be object name
+                    if (!v->IsObject())
+                        v->SetObject(); // Change to Object
+                }
+                else { // object name or array index
+                    if (!v->IsArray() && !v->IsObject())
+                        v->SetArray(); // Change to Array
+                }
+
+                if (v->IsArray()) {
+                    if (t->index >= v->Size()) {
+                        v->Reserve(t->index + 1, allocator);
+                        while (t->index >= v->Size())
+                            v->PushBack(ValueType().Move(), allocator);
+                        exist = false;
+                    }
+                    v = &((*v)[t->index]);
+                }
+                else {
+                    typename ValueType::MemberIterator m = v->FindMember(GenericStringRef<Ch>(t->name, t->length));
+                    if (m == v->MemberEnd()) {
+                        v->AddMember(ValueType(t->name, t->length, allocator).Move(), ValueType().Move(), allocator);
+                        v = &(--v->MemberEnd())->value; // Assumes AddMember() appends at the end
+                        exist = false;
+                    }
+                    else
+                        v = &m->value;
+                }
+            }
+        }
+
+        if (alreadyExist)
+            *alreadyExist = exist;
+
+        return *v;
+    }
+
+    //! Creates a value in a document.
+    /*!
+        \param document A document to be resolved.
+        \param alreadyExist If non-null, it stores whether the resolved value already exists.
+        \return The resolved newly created, or already existing value.
+    */
+    template <typename stackAllocator>
+    ValueType& Create(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, bool* alreadyExist = 0) const {
+        return Create(document, document.GetAllocator(), alreadyExist);
+    }
+
+    //@}
+
+    //!@name Query value
+    //@{
+
+    //! Query a value in a subtree.
+    /*!
+        \param root Root value of a DOM sub-tree to be resolved. It can be any value other than document root.
+        \param unresolvedTokenIndex If the pointer cannot resolve a token in the pointer, this parameter can obtain the index of the unresolved token.
+        \return Pointer to the value if it can be resolved. Otherwise null.
+
+        \note
+        There are only 3 situations when a value cannot be resolved:
+        1. A value in the path is neither an array nor an object.
+        2. An object value does not contain the token.
+        3. A token is out of range of an array value.
+
+        Use unresolvedTokenIndex to retrieve the token index.
+    */
+    ValueType* Get(ValueType& root, size_t* unresolvedTokenIndex = 0) const {
+        RAPIDJSON_ASSERT(IsValid());
+        ValueType* v = &root;
+        for (const Token *t = tokens_; t != tokens_ + tokenCount_; ++t) {
+            switch (v->GetType()) {
+            case kObjectType:
+                {
+                    typename ValueType::MemberIterator m = v->FindMember(GenericStringRef<Ch>(t->name, t->length));
+                    if (m == v->MemberEnd())
+                        break;
+                    v = &m->value;
+                }
+                continue;
+            case kArrayType:
+                if (t->index == kPointerInvalidIndex || t->index >= v->Size())
+                    break;
+                v = &((*v)[t->index]);
+                continue;
+            default:
+                break;
+            }
+
+            // Error: unresolved token
+            if (unresolvedTokenIndex)
+                *unresolvedTokenIndex = static_cast<size_t>(t - tokens_);
+            return 0;
+        }
+        return v;
+    }
+
+    //! Query a const value in a const subtree.
+    /*!
+        \param root Root value of a DOM sub-tree to be resolved. It can be any value other than document root.
+        \return Pointer to the value if it can be resolved. Otherwise null.
+    */
+    const ValueType* Get(const ValueType& root, size_t* unresolvedTokenIndex = 0) const {
+        return Get(const_cast<ValueType&>(root), unresolvedTokenIndex);
+    }
+
+    //@}
+
+    //!@name Query a value with default
+    //@{
+
+    //! Query a value in a subtree with default value.
+    /*!
+        Similar to Get(), but if the specified value does not exist, it creates all parents and clones the default value,
+        so this function always succeeds.
+
+        \param root Root value of a DOM sub-tree to be resolved. It can be any value other than document root.
+        \param defaultValue Default value to be cloned if the value does not exist.
+        \param allocator Allocator for creating the values if the specified value or its parents do not exist.
+        \see Create()
+    */
+    ValueType& GetWithDefault(ValueType& root, const ValueType& defaultValue, typename ValueType::AllocatorType& allocator) const {
+        bool alreadyExist;
+        ValueType& v = Create(root, allocator, &alreadyExist);
+        return alreadyExist ? v : v.CopyFrom(defaultValue, allocator);
+    }
+
+    //! Query a value in a subtree with default null-terminated string.
+    ValueType& GetWithDefault(ValueType& root, const Ch* defaultValue, typename ValueType::AllocatorType& allocator) const {
+        bool alreadyExist;
+        ValueType& v = Create(root, allocator, &alreadyExist);
+        return alreadyExist ? v : v.SetString(defaultValue, allocator);
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Query a value in a subtree with default std::basic_string.
+    ValueType& GetWithDefault(ValueType& root, const std::basic_string<Ch>& defaultValue, typename ValueType::AllocatorType& allocator) const {
+        bool alreadyExist;
+        ValueType& v = Create(root, allocator, &alreadyExist);
+        return alreadyExist ? v : v.SetString(defaultValue, allocator);
+    }
+#endif
+
+    //! Query a value in a subtree with default primitive value.
+    /*!
+        \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t, \c bool
+    */
+    template <typename T>
+    RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (ValueType&))
+    GetWithDefault(ValueType& root, T defaultValue, typename ValueType::AllocatorType& allocator) const {
+        return GetWithDefault(root, ValueType(defaultValue).Move(), allocator);
+    }
+
+    //! Query a value in a document with default value.
+    template <typename stackAllocator>
+    ValueType& GetWithDefault(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, const ValueType& defaultValue) const {
+        return GetWithDefault(document, defaultValue, document.GetAllocator());
+    }
+
+    //! Query a value in a document with default null-terminated string.
+    template <typename stackAllocator>
+    ValueType& GetWithDefault(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, const Ch* defaultValue) const {
+        return GetWithDefault(document, defaultValue, document.GetAllocator());
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Query a value in a document with default std::basic_string.
+    template <typename stackAllocator>
+    ValueType& GetWithDefault(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, const std::basic_string<Ch>& defaultValue) const {
+        return GetWithDefault(document, defaultValue, document.GetAllocator());
+    }
+#endif
+
+    //! Query a value in a document with default primitive value.
+    /*!
+        \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t, \c bool
+    */
+    template <typename T, typename stackAllocator>
+    RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (ValueType&))
+    GetWithDefault(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, T defaultValue) const {
+        return GetWithDefault(document, defaultValue, document.GetAllocator());
+    }
+
+    //@}
+
+    //!@name Set a value
+    //@{
+
+    //! Set a value in a subtree, with move semantics.
+    /*!
+        It creates all parents if they do not exist or their types mismatch the tokens,
+        so this function always succeeds but can potentially remove existing values.
+
+        \param root Root value of a DOM sub-tree to be resolved. It can be any value other than document root.
+        \param value Value to be set.
+        \param allocator Allocator for creating the values if the specified value or its parents do not exist.
+        \see Create()
+    */
+    ValueType& Set(ValueType& root, ValueType& value, typename ValueType::AllocatorType& allocator) const {
+        return Create(root, allocator) = value;
+    }
+
+    //! Set a value in a subtree, with copy semantics.
+    ValueType& Set(ValueType& root, const ValueType& value, typename ValueType::AllocatorType& allocator) const {
+        return Create(root, allocator).CopyFrom(value, allocator);
+    }
+
+    //! Set a null-terminated string in a subtree.
+    ValueType& Set(ValueType& root, const Ch* value, typename ValueType::AllocatorType& allocator) const {
+        return Create(root, allocator) = ValueType(value, allocator).Move();
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Set a std::basic_string in a subtree.
+    ValueType& Set(ValueType& root, const std::basic_string<Ch>& value, typename ValueType::AllocatorType& allocator) const {
+        return Create(root, allocator) = ValueType(value, allocator).Move();
+    }
+#endif
+
+    //! Set a primitive value in a subtree.
+    /*!
+        \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t, \c bool
+    */
+    template <typename T>
+    RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (ValueType&))
+    Set(ValueType& root, T value, typename ValueType::AllocatorType& allocator) const {
+        return Create(root, allocator) = ValueType(value).Move();
+    }
+
+    //! Set a value in a document, with move semantics.
+    template <typename stackAllocator>
+    ValueType& Set(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, ValueType& value) const {
+        return Create(document) = value;
+    }
+
+    //! Set a value in a document, with copy semantics.
+    template <typename stackAllocator>
+    ValueType& Set(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, const ValueType& value) const {
+        return Create(document).CopyFrom(value, document.GetAllocator());
+    }
+
+    //! Set a null-terminated string in a document.
+    template <typename stackAllocator>
+    ValueType& Set(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, const Ch* value) const {
+        return Create(document) = ValueType(value, document.GetAllocator()).Move();
+    }
+
+#if RAPIDJSON_HAS_STDSTRING
+    //! Sets a std::basic_string in a document.
+    template <typename stackAllocator>
+    ValueType& Set(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, const std::basic_string<Ch>& value) const {
+        return Create(document) = ValueType(value, document.GetAllocator()).Move();
+    }
+#endif
+
+    //! Set a primitive value in a document.
+    /*!
+        \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t, \c bool
+    */
+    template <typename T, typename stackAllocator>
+    RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (ValueType&))
+    Set(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, T value) const {
+        return Create(document) = value;
+    }
+
+    //@}
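Taken together, Set() resolves-or-creates the whole path, while GetWithDefault() only clones the default when the value is missing. A short sketch of both against a Document (names and values illustrative):

    #include "rapidjson/document.h"
    #include "rapidjson/pointer.h"

    int main() {
        rapidjson::Document d;
        d.Parse("{}");
        rapidjson::Pointer("/project/stars").Set(d, 10);  // creates "project" on the way
        const rapidjson::Value* v = rapidjson::Pointer("/project/stars").Get(d);
        rapidjson::Value& name = rapidjson::Pointer("/project/name").GetWithDefault(d, "clDNN");
        return (v && v->GetInt() == 10 && name.IsString()) ? 0 : 1;
    }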
+
+    //!@name Swap a value
+    //@{
+
+    //! Swap a value with a value in a subtree.
+    /*!
+        It creates all parents if they do not exist or their types mismatch the tokens,
+        so this function always succeeds but can potentially remove existing values.
+
+        \param root Root value of a DOM sub-tree to be resolved. It can be any value other than document root.
+        \param value Value to be swapped.
+        \param allocator Allocator for creating the values if the specified value or its parents do not exist.
+        \see Create()
+    */
+    ValueType& Swap(ValueType& root, ValueType& value, typename ValueType::AllocatorType& allocator) const {
+        return Create(root, allocator).Swap(value);
+    }
+
+    //! Swap a value with a value in a document.
+    template <typename stackAllocator>
+    ValueType& Swap(GenericDocument<EncodingType, typename ValueType::AllocatorType, stackAllocator>& document, ValueType& value) const {
+        return Create(document).Swap(value);
+    }
+
+    //@}
+
+    //! Erase a value in a subtree.
+    /*!
+        \param root Root value of a DOM sub-tree to be resolved. It can be any value other than document root.
+        \return Whether the resolved value is found and erased.
+
+        \note Erasing with an empty pointer \c Pointer(""), i.e. the root, always fails and returns false.
+    */
+    bool Erase(ValueType& root) const {
+        RAPIDJSON_ASSERT(IsValid());
+        if (tokenCount_ == 0) // Cannot erase the root
+            return false;
+
+        ValueType* v = &root;
+        const Token* last = tokens_ + (tokenCount_ - 1);
+        for (const Token *t = tokens_; t != last; ++t) {
+            switch (v->GetType()) {
+            case kObjectType:
+                {
+                    typename ValueType::MemberIterator m = v->FindMember(GenericStringRef<Ch>(t->name, t->length));
+                    if (m == v->MemberEnd())
+                        return false;
+                    v = &m->value;
+                }
+                break;
+            case kArrayType:
+                if (t->index == kPointerInvalidIndex || t->index >= v->Size())
+                    return false;
+                v = &((*v)[t->index]);
+                break;
+            default:
+                return false;
+            }
+        }
+
+        switch (v->GetType()) {
+        case kObjectType:
+            return v->EraseMember(GenericStringRef<Ch>(last->name, last->length));
+        case kArrayType:
+            if (last->index == kPointerInvalidIndex || last->index >= v->Size())
+                return false;
+            v->Erase(v->Begin() + last->index);
+            return true;
+        default:
+            return false;
+        }
+    }
+
+private:
+    //! Clone the content from rhs to this.
+    /*!
+        \param rhs Source pointer.
+        \param extraToken Extra tokens to be allocated.
+        \param extraNameBufferSize Extra name buffer size (in number of Ch) to be allocated.
+        \return Start of non-occupied name buffer, for storing extra names.
+    */
+    Ch* CopyFromRaw(const GenericPointer& rhs, size_t extraToken = 0, size_t extraNameBufferSize = 0) {
+        if (!allocator_) // allocator is independently owned.
+            ownAllocator_ = allocator_ = RAPIDJSON_NEW(Allocator)();
+
+        size_t nameBufferSize = rhs.tokenCount_; // null terminators for tokens
+        for (Token *t = rhs.tokens_; t != rhs.tokens_ + rhs.tokenCount_; ++t)
+            nameBufferSize += t->length;
+
+        tokenCount_ = rhs.tokenCount_ + extraToken;
+        tokens_ = static_cast<Token*>(allocator_->Malloc(tokenCount_ * sizeof(Token) + (nameBufferSize + extraNameBufferSize) * sizeof(Ch)));
+        nameBuffer_ = reinterpret_cast<Ch*>(tokens_ + tokenCount_);
+        if (rhs.tokenCount_ > 0) {
+            std::memcpy(tokens_, rhs.tokens_, rhs.tokenCount_ * sizeof(Token));
+        }
+        if (nameBufferSize > 0) {
+            std::memcpy(nameBuffer_, rhs.nameBuffer_, nameBufferSize * sizeof(Ch));
+        }
+
+        // Adjust pointers to name buffer
+        std::ptrdiff_t diff = nameBuffer_ - rhs.nameBuffer_;
+        for (Token *t = tokens_; t != tokens_ + rhs.tokenCount_; ++t)
+            t->name += diff;
+
+        return nameBuffer_ + nameBufferSize;
+    }
+
+    //! Check whether a character should be percent-encoded.
+    /*!
+        According to RFC 3986 2.3 Unreserved Characters.
+        \param c The character (code unit) to be tested.
+    */
+    bool NeedPercentEncode(Ch c) const {
+        return !((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '-' || c == '.' || c == '_' || c == '~');
+    }
+
+    //! Parse a JSON String or its URI fragment representation into tokens.
+#ifndef __clang__ // -Wdocumentation
+    /*!
+        \param source Either a JSON Pointer string, or its URI fragment representation. It need not be null-terminated.
+        \param length Length of the source string.
+        \note Source cannot be JSON String Representation of JSON Pointer, e.g. in "/\u0000", \u0000 will not be unescaped.
+    */
+#endif
+    void Parse(const Ch* source, size_t length) {
+        RAPIDJSON_ASSERT(source != NULL);
+        RAPIDJSON_ASSERT(nameBuffer_ == 0);
+        RAPIDJSON_ASSERT(tokens_ == 0);
+
+        // Create own allocator if user did not supply.
+        if (!allocator_)
+            ownAllocator_ = allocator_ = RAPIDJSON_NEW(Allocator)();
+
+        // Count number of '/' as tokenCount
+        tokenCount_ = 0;
+        for (const Ch* s = source; s != source + length; s++)
+            if (*s == '/')
+                tokenCount_++;
+
+        Token* token = tokens_ = static_cast<Token*>(allocator_->Malloc(tokenCount_ * sizeof(Token) + length * sizeof(Ch)));
+        Ch* name = nameBuffer_ = reinterpret_cast<Ch*>(tokens_ + tokenCount_);
+        size_t i = 0;
+
+        // Detect if it is a URI fragment
+        bool uriFragment = false;
+        if (source[i] == '#') {
+            uriFragment = true;
+            i++;
+        }
+
+        if (i != length && source[i] != '/') {
+            parseErrorCode_ = kPointerParseErrorTokenMustBeginWithSolidus;
+            goto error;
+        }
+
+        while (i < length) {
+            RAPIDJSON_ASSERT(source[i] == '/');
+            i++; // consumes '/'
+
+            token->name = name;
+            bool isNumber = true;
+
+            while (i < length && source[i] != '/') {
+                Ch c = source[i];
+                if (uriFragment) {
+                    // Decoding percent-encoding for URI fragment
+                    if (c == '%') {
+                        PercentDecodeStream is(&source[i], source + length);
+                        GenericInsituStringStream<EncodingType> os(name);
+                        Ch* begin = os.PutBegin();
+                        if (!Transcoder<UTF8<>, EncodingType>().Validate(is, os) || !is.IsValid()) {
+                            parseErrorCode_ = kPointerParseErrorInvalidPercentEncoding;
+                            goto error;
+                        }
+                        size_t len = os.PutEnd(begin);
+                        i += is.Tell() - 1;
+                        if (len == 1)
+                            c = *name;
+                        else {
+                            name += len;
+                            isNumber = false;
+                            i++;
+                            continue;
+                        }
+                    }
+                    else if (NeedPercentEncode(c)) {
+                        parseErrorCode_ = kPointerParseErrorCharacterMustPercentEncode;
+                        goto error;
+                    }
+                }
+
+                i++;
+
+                // Escaping "~0" -> '~', "~1" -> '/'
+                if (c == '~') {
+                    if (i < length) {
+                        c = source[i];
+                        if (c == '0') c = '~';
+                        else if (c == '1') c = '/';
+                        else {
+                            parseErrorCode_ = kPointerParseErrorInvalidEscape;
+                            goto error;
+                        }
+                        i++;
+                    }
+                    else {
+                        parseErrorCode_ = kPointerParseErrorInvalidEscape;
+                        goto error;
+                    }
+                }
+
+                // First check for index: all of characters are digit
+                if (c < '0' || c > '9')
+                    isNumber = false;
+
+                *name++ = c;
+            }
+            token->length = static_cast<SizeType>(name - token->name);
+            if (token->length == 0)
+                isNumber = false;
+            *name++ = '\0'; // Null terminator
+
+            // Second check for index: more than one digit cannot have leading zero
+            if (isNumber && token->length > 1 && token->name[0] == '0')
+                isNumber = false;
+
+            // String to SizeType conversion
+            SizeType n = 0;
+            if (isNumber) {
+                for (size_t j = 0; j < token->length; j++) {
+                    SizeType m = n * 10 + static_cast<SizeType>(token->name[j] - '0');
+                    if (m < n) { // overflow detection
+                        isNumber = false;
+                        break;
+                    }
+                    n = m;
+                }
+            }
+
+            token->index = isNumber ? n : kPointerInvalidIndex;
+            token++;
+        }
+
+        RAPIDJSON_ASSERT(name <= nameBuffer_ + length); // Should not overflow buffer
+        parseErrorCode_ = kPointerParseErrorNone;
+        return;
+
+    error:
+        Allocator::Free(tokens_);
+        nameBuffer_ = 0;
+        tokens_ = 0;
+        tokenCount_ = 0;
+        parseErrorOffset_ = i;
+        return;
+    }
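Parse() implements the RFC 6901 escapes, so "~1" in a token decodes to '/' and "~0" to '~'. A short sketch showing a pointer that addresses members whose names contain those characters (the JSON literal is illustrative):

    #include "rapidjson/document.h"
    #include "rapidjson/pointer.h"

    int main() {
        rapidjson::Document d;
        d.Parse("{\"a/b\":{\"m~n\":1}}");
        // Tokens decode to "a/b" and "m~n".
        const rapidjson::Value* v = rapidjson::Pointer("/a~1b/m~0n").Get(d);
        return (v && v->GetInt() == 1) ? 0 : 1;
    }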
+ */ + template + bool Stringify(OutputStream& os) const { + RAPIDJSON_ASSERT(IsValid()); + + if (uriFragment) + os.Put('#'); + + for (Token *t = tokens_; t != tokens_ + tokenCount_; ++t) { + os.Put('/'); + for (size_t j = 0; j < t->length; j++) { + Ch c = t->name[j]; + if (c == '~') { + os.Put('~'); + os.Put('0'); + } + else if (c == '/') { + os.Put('~'); + os.Put('1'); + } + else if (uriFragment && NeedPercentEncode(c)) { + // Transcode to UTF8 sequence + GenericStringStream source(&t->name[j]); + PercentEncodeStream target(os); + if (!Transcoder >().Validate(source, target)) + return false; + j += source.Tell() - 1; + } + else + os.Put(c); + } + } + return true; + } + + //! A helper stream for decoding a percent-encoded sequence into code unit. + /*! + This stream decodes %XY triplet into code unit (0-255). + If it encounters invalid characters, it sets output code unit as 0 and + mark invalid, and to be checked by IsValid(). + */ + class PercentDecodeStream { + public: + typedef typename ValueType::Ch Ch; + + //! Constructor + /*! + \param source Start of the stream + \param end Past-the-end of the stream. + */ + PercentDecodeStream(const Ch* source, const Ch* end) : src_(source), head_(source), end_(end), valid_(true) {} + + Ch Take() { + if (*src_ != '%' || src_ + 3 > end_) { // %XY triplet + valid_ = false; + return 0; + } + src_++; + Ch c = 0; + for (int j = 0; j < 2; j++) { + c = static_cast(c << 4); + Ch h = *src_; + if (h >= '0' && h <= '9') c = static_cast(c + h - '0'); + else if (h >= 'A' && h <= 'F') c = static_cast(c + h - 'A' + 10); + else if (h >= 'a' && h <= 'f') c = static_cast(c + h - 'a' + 10); + else { + valid_ = false; + return 0; + } + src_++; + } + return c; + } + + size_t Tell() const { return static_cast(src_ - head_); } + bool IsValid() const { return valid_; } + + private: + const Ch* src_; //!< Current read position. + const Ch* head_; //!< Original head of the string. + const Ch* end_; //!< Past-the-end position. + bool valid_; //!< Whether the parsing is valid. + }; + + //! A helper stream to encode character (UTF-8 code unit) into percent-encoded sequence. + template + class PercentEncodeStream { + public: + PercentEncodeStream(OutputStream& os) : os_(os) {} + void Put(char c) { // UTF-8 must be byte + unsigned char u = static_cast(c); + static const char hexDigits[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' }; + os_.Put('%'); + os_.Put(static_cast(hexDigits[u >> 4])); + os_.Put(static_cast(hexDigits[u & 15])); + } + private: + OutputStream& os_; + }; + + Allocator* allocator_; //!< The current allocator. It is either user-supplied or equal to ownAllocator_. + Allocator* ownAllocator_; //!< Allocator owned by this Pointer. + Ch* nameBuffer_; //!< A buffer containing all names in tokens. + Token* tokens_; //!< A list of tokens. + size_t tokenCount_; //!< Number of tokens in tokens_. + size_t parseErrorOffset_; //!< Offset in code unit when parsing fail. + PointerParseErrorCode parseErrorCode_; //!< Parsing error code. +}; + +//! GenericPointer for Value (UTF-8, default allocator). 
+typedef GenericPointer Pointer; + +//!@name Helper functions for GenericPointer +//@{ + +////////////////////////////////////////////////////////////////////////////// + +template +typename T::ValueType& CreateValueByPointer(T& root, const GenericPointer& pointer, typename T::AllocatorType& a) { + return pointer.Create(root, a); +} + +template +typename T::ValueType& CreateValueByPointer(T& root, const CharType(&source)[N], typename T::AllocatorType& a) { + return GenericPointer(source, N - 1).Create(root, a); +} + +// No allocator parameter + +template +typename DocumentType::ValueType& CreateValueByPointer(DocumentType& document, const GenericPointer& pointer) { + return pointer.Create(document); +} + +template +typename DocumentType::ValueType& CreateValueByPointer(DocumentType& document, const CharType(&source)[N]) { + return GenericPointer(source, N - 1).Create(document); +} + +////////////////////////////////////////////////////////////////////////////// + +template +typename T::ValueType* GetValueByPointer(T& root, const GenericPointer& pointer, size_t* unresolvedTokenIndex = 0) { + return pointer.Get(root, unresolvedTokenIndex); +} + +template +const typename T::ValueType* GetValueByPointer(const T& root, const GenericPointer& pointer, size_t* unresolvedTokenIndex = 0) { + return pointer.Get(root, unresolvedTokenIndex); +} + +template +typename T::ValueType* GetValueByPointer(T& root, const CharType (&source)[N], size_t* unresolvedTokenIndex = 0) { + return GenericPointer(source, N - 1).Get(root, unresolvedTokenIndex); +} + +template +const typename T::ValueType* GetValueByPointer(const T& root, const CharType(&source)[N], size_t* unresolvedTokenIndex = 0) { + return GenericPointer(source, N - 1).Get(root, unresolvedTokenIndex); +} + +////////////////////////////////////////////////////////////////////////////// + +template +typename T::ValueType& GetValueByPointerWithDefault(T& root, const GenericPointer& pointer, const typename T::ValueType& defaultValue, typename T::AllocatorType& a) { + return pointer.GetWithDefault(root, defaultValue, a); +} + +template +typename T::ValueType& GetValueByPointerWithDefault(T& root, const GenericPointer& pointer, const typename T::Ch* defaultValue, typename T::AllocatorType& a) { + return pointer.GetWithDefault(root, defaultValue, a); +} + +#if RAPIDJSON_HAS_STDSTRING +template +typename T::ValueType& GetValueByPointerWithDefault(T& root, const GenericPointer& pointer, const std::basic_string& defaultValue, typename T::AllocatorType& a) { + return pointer.GetWithDefault(root, defaultValue, a); +} +#endif + +template +RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr, internal::IsGenericValue >), (typename T::ValueType&)) +GetValueByPointerWithDefault(T& root, const GenericPointer& pointer, T2 defaultValue, typename T::AllocatorType& a) { + return pointer.GetWithDefault(root, defaultValue, a); +} + +template +typename T::ValueType& GetValueByPointerWithDefault(T& root, const CharType(&source)[N], const typename T::ValueType& defaultValue, typename T::AllocatorType& a) { + return GenericPointer(source, N - 1).GetWithDefault(root, defaultValue, a); +} + +template +typename T::ValueType& GetValueByPointerWithDefault(T& root, const CharType(&source)[N], const typename T::Ch* defaultValue, typename T::AllocatorType& a) { + return GenericPointer(source, N - 1).GetWithDefault(root, defaultValue, a); +} + +#if RAPIDJSON_HAS_STDSTRING +template +typename T::ValueType& GetValueByPointerWithDefault(T& root, const CharType(&source)[N], const std::basic_string& 
defaultValue, typename T::AllocatorType& a) { + return GenericPointer(source, N - 1).GetWithDefault(root, defaultValue, a); +} +#endif + +template +RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr, internal::IsGenericValue >), (typename T::ValueType&)) +GetValueByPointerWithDefault(T& root, const CharType(&source)[N], T2 defaultValue, typename T::AllocatorType& a) { + return GenericPointer(source, N - 1).GetWithDefault(root, defaultValue, a); +} + +// No allocator parameter + +template +typename DocumentType::ValueType& GetValueByPointerWithDefault(DocumentType& document, const GenericPointer& pointer, const typename DocumentType::ValueType& defaultValue) { + return pointer.GetWithDefault(document, defaultValue); +} + +template +typename DocumentType::ValueType& GetValueByPointerWithDefault(DocumentType& document, const GenericPointer& pointer, const typename DocumentType::Ch* defaultValue) { + return pointer.GetWithDefault(document, defaultValue); +} + +#if RAPIDJSON_HAS_STDSTRING +template +typename DocumentType::ValueType& GetValueByPointerWithDefault(DocumentType& document, const GenericPointer& pointer, const std::basic_string& defaultValue) { + return pointer.GetWithDefault(document, defaultValue); +} +#endif + +template +RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr, internal::IsGenericValue >), (typename DocumentType::ValueType&)) +GetValueByPointerWithDefault(DocumentType& document, const GenericPointer& pointer, T2 defaultValue) { + return pointer.GetWithDefault(document, defaultValue); +} + +template +typename DocumentType::ValueType& GetValueByPointerWithDefault(DocumentType& document, const CharType(&source)[N], const typename DocumentType::ValueType& defaultValue) { + return GenericPointer(source, N - 1).GetWithDefault(document, defaultValue); +} + +template +typename DocumentType::ValueType& GetValueByPointerWithDefault(DocumentType& document, const CharType(&source)[N], const typename DocumentType::Ch* defaultValue) { + return GenericPointer(source, N - 1).GetWithDefault(document, defaultValue); +} + +#if RAPIDJSON_HAS_STDSTRING +template +typename DocumentType::ValueType& GetValueByPointerWithDefault(DocumentType& document, const CharType(&source)[N], const std::basic_string& defaultValue) { + return GenericPointer(source, N - 1).GetWithDefault(document, defaultValue); +} +#endif + +template +RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr, internal::IsGenericValue >), (typename DocumentType::ValueType&)) +GetValueByPointerWithDefault(DocumentType& document, const CharType(&source)[N], T2 defaultValue) { + return GenericPointer(source, N - 1).GetWithDefault(document, defaultValue); +} + +////////////////////////////////////////////////////////////////////////////// + +template +typename T::ValueType& SetValueByPointer(T& root, const GenericPointer& pointer, typename T::ValueType& value, typename T::AllocatorType& a) { + return pointer.Set(root, value, a); +} + +template +typename T::ValueType& SetValueByPointer(T& root, const GenericPointer& pointer, const typename T::ValueType& value, typename T::AllocatorType& a) { + return pointer.Set(root, value, a); +} + +template +typename T::ValueType& SetValueByPointer(T& root, const GenericPointer& pointer, const typename T::Ch* value, typename T::AllocatorType& a) { + return pointer.Set(root, value, a); +} + +#if RAPIDJSON_HAS_STDSTRING +template +typename T::ValueType& SetValueByPointer(T& root, const GenericPointer& pointer, const std::basic_string& value, typename T::AllocatorType& a) { + return pointer.Set(root, value, a); +} +#endif 
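+
+/*! A minimal usage sketch of these pointer helpers, assuming a \c Document
+    \c d that was parsed elsewhere (the "/project/0" path and the values are
+    illustrative only, not part of the library):
+    \code
+    Document d; // populated by d.Parse(...) beforehand
+    Value& a = CreateValueByPointer(d, "/project/0");                 // creates it as null if absent
+    Value& b = GetValueByPointerWithDefault(d, "/project/0", "none"); // resolved value or default
+    SetValueByPointer(d, "/project/0", 123);
+    EraseValueByPointer(d, "/a~1b"); // "~1" unescapes to '/', so this erases member "a/b"
+    \endcode
+*/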
+ +template +RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr, internal::IsGenericValue >), (typename T::ValueType&)) +SetValueByPointer(T& root, const GenericPointer& pointer, T2 value, typename T::AllocatorType& a) { + return pointer.Set(root, value, a); +} + +template +typename T::ValueType& SetValueByPointer(T& root, const CharType(&source)[N], typename T::ValueType& value, typename T::AllocatorType& a) { + return GenericPointer(source, N - 1).Set(root, value, a); +} + +template +typename T::ValueType& SetValueByPointer(T& root, const CharType(&source)[N], const typename T::ValueType& value, typename T::AllocatorType& a) { + return GenericPointer(source, N - 1).Set(root, value, a); +} + +template +typename T::ValueType& SetValueByPointer(T& root, const CharType(&source)[N], const typename T::Ch* value, typename T::AllocatorType& a) { + return GenericPointer(source, N - 1).Set(root, value, a); +} + +#if RAPIDJSON_HAS_STDSTRING +template +typename T::ValueType& SetValueByPointer(T& root, const CharType(&source)[N], const std::basic_string& value, typename T::AllocatorType& a) { + return GenericPointer(source, N - 1).Set(root, value, a); +} +#endif + +template +RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr, internal::IsGenericValue >), (typename T::ValueType&)) +SetValueByPointer(T& root, const CharType(&source)[N], T2 value, typename T::AllocatorType& a) { + return GenericPointer(source, N - 1).Set(root, value, a); +} + +// No allocator parameter + +template +typename DocumentType::ValueType& SetValueByPointer(DocumentType& document, const GenericPointer& pointer, typename DocumentType::ValueType& value) { + return pointer.Set(document, value); +} + +template +typename DocumentType::ValueType& SetValueByPointer(DocumentType& document, const GenericPointer& pointer, const typename DocumentType::ValueType& value) { + return pointer.Set(document, value); +} + +template +typename DocumentType::ValueType& SetValueByPointer(DocumentType& document, const GenericPointer& pointer, const typename DocumentType::Ch* value) { + return pointer.Set(document, value); +} + +#if RAPIDJSON_HAS_STDSTRING +template +typename DocumentType::ValueType& SetValueByPointer(DocumentType& document, const GenericPointer& pointer, const std::basic_string& value) { + return pointer.Set(document, value); +} +#endif + +template +RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr, internal::IsGenericValue >), (typename DocumentType::ValueType&)) +SetValueByPointer(DocumentType& document, const GenericPointer& pointer, T2 value) { + return pointer.Set(document, value); +} + +template +typename DocumentType::ValueType& SetValueByPointer(DocumentType& document, const CharType(&source)[N], typename DocumentType::ValueType& value) { + return GenericPointer(source, N - 1).Set(document, value); +} + +template +typename DocumentType::ValueType& SetValueByPointer(DocumentType& document, const CharType(&source)[N], const typename DocumentType::ValueType& value) { + return GenericPointer(source, N - 1).Set(document, value); +} + +template +typename DocumentType::ValueType& SetValueByPointer(DocumentType& document, const CharType(&source)[N], const typename DocumentType::Ch* value) { + return GenericPointer(source, N - 1).Set(document, value); +} + +#if RAPIDJSON_HAS_STDSTRING +template +typename DocumentType::ValueType& SetValueByPointer(DocumentType& document, const CharType(&source)[N], const std::basic_string& value) { + return GenericPointer(source, N - 1).Set(document, value); +} +#endif + +template 
+RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr, internal::IsGenericValue >), (typename DocumentType::ValueType&)) +SetValueByPointer(DocumentType& document, const CharType(&source)[N], T2 value) { + return GenericPointer(source, N - 1).Set(document, value); +} + +////////////////////////////////////////////////////////////////////////////// + +template +typename T::ValueType& SwapValueByPointer(T& root, const GenericPointer& pointer, typename T::ValueType& value, typename T::AllocatorType& a) { + return pointer.Swap(root, value, a); +} + +template +typename T::ValueType& SwapValueByPointer(T& root, const CharType(&source)[N], typename T::ValueType& value, typename T::AllocatorType& a) { + return GenericPointer(source, N - 1).Swap(root, value, a); +} + +template +typename DocumentType::ValueType& SwapValueByPointer(DocumentType& document, const GenericPointer& pointer, typename DocumentType::ValueType& value) { + return pointer.Swap(document, value); +} + +template +typename DocumentType::ValueType& SwapValueByPointer(DocumentType& document, const CharType(&source)[N], typename DocumentType::ValueType& value) { + return GenericPointer(source, N - 1).Swap(document, value); +} + +////////////////////////////////////////////////////////////////////////////// + +template +bool EraseValueByPointer(T& root, const GenericPointer& pointer) { + return pointer.Erase(root); +} + +template +bool EraseValueByPointer(T& root, const CharType(&source)[N]) { + return GenericPointer(source, N - 1).Erase(root); +} + +//@} + +RAPIDJSON_NAMESPACE_END + +#if defined(__clang__) || defined(_MSC_VER) +RAPIDJSON_DIAG_POP +#endif + +#endif // RAPIDJSON_POINTER_H_ diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/prettywriter.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/prettywriter.h new file mode 100644 index 0000000..45afb69 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/prettywriter.h @@ -0,0 +1,277 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef RAPIDJSON_PRETTYWRITER_H_ +#define RAPIDJSON_PRETTYWRITER_H_ + +#include "writer.h" + +#ifdef __GNUC__ +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(effc++) +#endif + +#if defined(__clang__) +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(c++98-compat) +#endif + +RAPIDJSON_NAMESPACE_BEGIN + +//! Combination of PrettyWriter format flags. +/*! \see PrettyWriter::SetFormatOptions + */ +enum PrettyFormatOptions { + kFormatDefault = 0, //!< Default pretty formatting. + kFormatSingleLineArray = 1 //!< Format arrays on a single line. +}; + +//! Writer with indentation and spacing. +/*! + \tparam OutputStream Type of output os. + \tparam SourceEncoding Encoding of source string. + \tparam TargetEncoding Encoding of output stream. + \tparam StackAllocator Type of allocator for allocating memory of stack. 
+*/
+template<typename OutputStream, typename SourceEncoding = UTF8<>, typename TargetEncoding = UTF8<>, typename StackAllocator = CrtAllocator, unsigned writeFlags = kWriteDefaultFlags>
+class PrettyWriter : public Writer<OutputStream, SourceEncoding, TargetEncoding, StackAllocator, writeFlags> {
+public:
+    typedef Writer<OutputStream, SourceEncoding, TargetEncoding, StackAllocator, writeFlags> Base;
+    typedef typename Base::Ch Ch;
+
+    //! Constructor
+    /*! \param os Output stream.
+        \param allocator User supplied allocator. If it is null, it will create a private one.
+        \param levelDepth Initial capacity of stack.
+    */
+    explicit PrettyWriter(OutputStream& os, StackAllocator* allocator = 0, size_t levelDepth = Base::kDefaultLevelDepth) :
+        Base(os, allocator, levelDepth), indentChar_(' '), indentCharCount_(4), formatOptions_(kFormatDefault) {}
+
+    explicit PrettyWriter(StackAllocator* allocator = 0, size_t levelDepth = Base::kDefaultLevelDepth) :
+        Base(allocator, levelDepth), indentChar_(' '), indentCharCount_(4), formatOptions_(kFormatDefault) {}
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    PrettyWriter(PrettyWriter&& rhs) :
+        Base(std::forward<PrettyWriter>(rhs)), indentChar_(rhs.indentChar_), indentCharCount_(rhs.indentCharCount_), formatOptions_(rhs.formatOptions_) {}
+#endif
+
+    //! Set custom indentation.
+    /*! \param indentChar       Character for indentation. Must be a whitespace character (' ', '\\t', '\\n', '\\r').
+        \param indentCharCount  Number of indent characters for each indentation level.
+        \note The default indentation is 4 spaces.
+    */
+    PrettyWriter& SetIndent(Ch indentChar, unsigned indentCharCount) {
+        RAPIDJSON_ASSERT(indentChar == ' ' || indentChar == '\t' || indentChar == '\n' || indentChar == '\r');
+        indentChar_ = indentChar;
+        indentCharCount_ = indentCharCount;
+        return *this;
+    }
+
+    //! Set pretty writer formatting options.
+    /*! \param options Formatting options.
+    */
+    PrettyWriter& SetFormatOptions(PrettyFormatOptions options) {
+        formatOptions_ = options;
+        return *this;
+    }
+
+    /*! @name Implementation of Handler
+        \see Handler
+    */
+    //@{
+
+    bool Null()                 { PrettyPrefix(kNullType);   return Base::EndValue(Base::WriteNull()); }
+    bool Bool(bool b)           { PrettyPrefix(b ?
kTrueType : kFalseType); return Base::EndValue(Base::WriteBool(b)); } + bool Int(int i) { PrettyPrefix(kNumberType); return Base::EndValue(Base::WriteInt(i)); } + bool Uint(unsigned u) { PrettyPrefix(kNumberType); return Base::EndValue(Base::WriteUint(u)); } + bool Int64(int64_t i64) { PrettyPrefix(kNumberType); return Base::EndValue(Base::WriteInt64(i64)); } + bool Uint64(uint64_t u64) { PrettyPrefix(kNumberType); return Base::EndValue(Base::WriteUint64(u64)); } + bool Double(double d) { PrettyPrefix(kNumberType); return Base::EndValue(Base::WriteDouble(d)); } + + bool RawNumber(const Ch* str, SizeType length, bool copy = false) { + RAPIDJSON_ASSERT(str != 0); + (void)copy; + PrettyPrefix(kNumberType); + return Base::EndValue(Base::WriteString(str, length)); + } + + bool String(const Ch* str, SizeType length, bool copy = false) { + RAPIDJSON_ASSERT(str != 0); + (void)copy; + PrettyPrefix(kStringType); + return Base::EndValue(Base::WriteString(str, length)); + } + +#if RAPIDJSON_HAS_STDSTRING + bool String(const std::basic_string& str) { + return String(str.data(), SizeType(str.size())); + } +#endif + + bool StartObject() { + PrettyPrefix(kObjectType); + new (Base::level_stack_.template Push()) typename Base::Level(false); + return Base::WriteStartObject(); + } + + bool Key(const Ch* str, SizeType length, bool copy = false) { return String(str, length, copy); } + +#if RAPIDJSON_HAS_STDSTRING + bool Key(const std::basic_string& str) { + return Key(str.data(), SizeType(str.size())); + } +#endif + + bool EndObject(SizeType memberCount = 0) { + (void)memberCount; + RAPIDJSON_ASSERT(Base::level_stack_.GetSize() >= sizeof(typename Base::Level)); // not inside an Object + RAPIDJSON_ASSERT(!Base::level_stack_.template Top()->inArray); // currently inside an Array, not Object + RAPIDJSON_ASSERT(0 == Base::level_stack_.template Top()->valueCount % 2); // Object has a Key without a Value + + bool empty = Base::level_stack_.template Pop(1)->valueCount == 0; + + if (!empty) { + Base::os_->Put('\n'); + WriteIndent(); + } + bool ret = Base::EndValue(Base::WriteEndObject()); + (void)ret; + RAPIDJSON_ASSERT(ret == true); + if (Base::level_stack_.Empty()) // end of json text + Base::Flush(); + return true; + } + + bool StartArray() { + PrettyPrefix(kArrayType); + new (Base::level_stack_.template Push()) typename Base::Level(true); + return Base::WriteStartArray(); + } + + bool EndArray(SizeType memberCount = 0) { + (void)memberCount; + RAPIDJSON_ASSERT(Base::level_stack_.GetSize() >= sizeof(typename Base::Level)); + RAPIDJSON_ASSERT(Base::level_stack_.template Top()->inArray); + bool empty = Base::level_stack_.template Pop(1)->valueCount == 0; + + if (!empty && !(formatOptions_ & kFormatSingleLineArray)) { + Base::os_->Put('\n'); + WriteIndent(); + } + bool ret = Base::EndValue(Base::WriteEndArray()); + (void)ret; + RAPIDJSON_ASSERT(ret == true); + if (Base::level_stack_.Empty()) // end of json text + Base::Flush(); + return true; + } + + //@} + + /*! @name Convenience extensions */ + //@{ + + //! Simpler but slower overload. + bool String(const Ch* str) { return String(str, internal::StrLen(str)); } + bool Key(const Ch* str) { return Key(str, internal::StrLen(str)); } + + //@} + + //! Write a raw JSON value. + /*! + For user to write a stringified JSON as a value. + + \param json A well-formed JSON value. It should not contain null character within [0, length - 1] range. + \param length Length of the json. + \param type Type of the root of json. 
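+
+        \par
+        A sketch of a possible call sequence (the \c StringBuffer sink and the
+        embedded array literal are assumptions for illustration):
+        \code
+        StringBuffer sb;
+        PrettyWriter<StringBuffer> writer(sb);
+        writer.SetIndent(' ', 2);                   // two-space indent
+        writer.StartObject();
+        writer.Key("data");
+        writer.RawValue("[1,2,3]", 7, kArrayType);  // splice pre-serialized JSON
+        writer.EndObject();
+        \endcode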
+        \note When using PrettyWriter::RawValue(), the result json may not be indented correctly.
+    */
+    bool RawValue(const Ch* json, size_t length, Type type) {
+        RAPIDJSON_ASSERT(json != 0);
+        PrettyPrefix(type);
+        return Base::EndValue(Base::WriteRawValue(json, length));
+    }
+
+protected:
+    void PrettyPrefix(Type type) {
+        (void)type;
+        if (Base::level_stack_.GetSize() != 0) { // this value is not at root
+            typename Base::Level* level = Base::level_stack_.template Top<typename Base::Level>();
+
+            if (level->inArray) {
+                if (level->valueCount > 0) {
+                    Base::os_->Put(','); // add comma if it is not the first element in array
+                    if (formatOptions_ & kFormatSingleLineArray)
+                        Base::os_->Put(' ');
+                }
+
+                if (!(formatOptions_ & kFormatSingleLineArray)) {
+                    Base::os_->Put('\n');
+                    WriteIndent();
+                }
+            }
+            else { // in object
+                if (level->valueCount > 0) {
+                    if (level->valueCount % 2 == 0) {
+                        Base::os_->Put(',');
+                        Base::os_->Put('\n');
+                    }
+                    else {
+                        Base::os_->Put(':');
+                        Base::os_->Put(' ');
+                    }
+                }
+                else
+                    Base::os_->Put('\n');
+
+                if (level->valueCount % 2 == 0)
+                    WriteIndent();
+            }
+            if (!level->inArray && level->valueCount % 2 == 0)
+                RAPIDJSON_ASSERT(type == kStringType); // if it's in object, then even number should be a name
+            level->valueCount++;
+        }
+        else {
+            RAPIDJSON_ASSERT(!Base::hasRoot_); // Should have one and only one root.
+            Base::hasRoot_ = true;
+        }
+    }
+
+    void WriteIndent() {
+        size_t count = (Base::level_stack_.GetSize() / sizeof(typename Base::Level)) * indentCharCount_;
+        PutN(*Base::os_, static_cast<typename OutputStream::Ch>(indentChar_), count);
+    }
+
+    Ch indentChar_;
+    unsigned indentCharCount_;
+    PrettyFormatOptions formatOptions_;
+
+private:
+    // Prohibit copy constructor & assignment operator.
+    PrettyWriter(const PrettyWriter&);
+    PrettyWriter& operator=(const PrettyWriter&);
+};
+
+RAPIDJSON_NAMESPACE_END
+
+#if defined(__clang__)
+RAPIDJSON_DIAG_POP
+#endif
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_PRETTYWRITER_H_
diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/rapidjson.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/rapidjson.h
new file mode 100644
index 0000000..065c8bb
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/rapidjson.h
@@ -0,0 +1,654 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_RAPIDJSON_H_
+#define RAPIDJSON_RAPIDJSON_H_
+
+/*!\file rapidjson.h
+    \brief common definitions and configuration
+
+    \see RAPIDJSON_CONFIG
+ */
+
+/*! \defgroup RAPIDJSON_CONFIG RapidJSON configuration
+    \brief Configuration macros for library features
+
+    Some RapidJSON features are configurable to adapt the library to a wide
+    variety of platforms, environments and usage scenarios. Most of the
+    features can be configured in terms of overridden or predefined
+    preprocessor macros at compile-time.
+ + Some additional customization is available in the \ref RAPIDJSON_ERRORS APIs. + + \note These macros should be given on the compiler command-line + (where applicable) to avoid inconsistent values when compiling + different translation units of a single application. + */ + +#include // malloc(), realloc(), free(), size_t +#include // memset(), memcpy(), memmove(), memcmp() + +/////////////////////////////////////////////////////////////////////////////// +// RAPIDJSON_VERSION_STRING +// +// ALWAYS synchronize the following 3 macros with corresponding variables in /CMakeLists.txt. +// + +//!@cond RAPIDJSON_HIDDEN_FROM_DOXYGEN +// token stringification +#define RAPIDJSON_STRINGIFY(x) RAPIDJSON_DO_STRINGIFY(x) +#define RAPIDJSON_DO_STRINGIFY(x) #x + +// token concatenation +#define RAPIDJSON_JOIN(X, Y) RAPIDJSON_DO_JOIN(X, Y) +#define RAPIDJSON_DO_JOIN(X, Y) RAPIDJSON_DO_JOIN2(X, Y) +#define RAPIDJSON_DO_JOIN2(X, Y) X##Y +//!@endcond + +/*! \def RAPIDJSON_MAJOR_VERSION + \ingroup RAPIDJSON_CONFIG + \brief Major version of RapidJSON in integer. +*/ +/*! \def RAPIDJSON_MINOR_VERSION + \ingroup RAPIDJSON_CONFIG + \brief Minor version of RapidJSON in integer. +*/ +/*! \def RAPIDJSON_PATCH_VERSION + \ingroup RAPIDJSON_CONFIG + \brief Patch version of RapidJSON in integer. +*/ +/*! \def RAPIDJSON_VERSION_STRING + \ingroup RAPIDJSON_CONFIG + \brief Version of RapidJSON in ".." string format. +*/ +#define RAPIDJSON_MAJOR_VERSION 1 +#define RAPIDJSON_MINOR_VERSION 1 +#define RAPIDJSON_PATCH_VERSION 0 +#define RAPIDJSON_VERSION_STRING \ + RAPIDJSON_STRINGIFY(RAPIDJSON_MAJOR_VERSION.RAPIDJSON_MINOR_VERSION.RAPIDJSON_PATCH_VERSION) + +/////////////////////////////////////////////////////////////////////////////// +// RAPIDJSON_NAMESPACE_(BEGIN|END) +/*! \def RAPIDJSON_NAMESPACE + \ingroup RAPIDJSON_CONFIG + \brief provide custom rapidjson namespace + + In order to avoid symbol clashes and/or "One Definition Rule" errors + between multiple inclusions of (different versions of) RapidJSON in + a single binary, users can customize the name of the main RapidJSON + namespace. + + In case of a single nesting level, defining \c RAPIDJSON_NAMESPACE + to a custom name (e.g. \c MyRapidJSON) is sufficient. If multiple + levels are needed, both \ref RAPIDJSON_NAMESPACE_BEGIN and \ref + RAPIDJSON_NAMESPACE_END need to be defined as well: + + \code + // in some .cpp file + #define RAPIDJSON_NAMESPACE my::rapidjson + #define RAPIDJSON_NAMESPACE_BEGIN namespace my { namespace rapidjson { + #define RAPIDJSON_NAMESPACE_END } } + #include "rapidjson/..." + \endcode + + \see rapidjson + */ +/*! \def RAPIDJSON_NAMESPACE_BEGIN + \ingroup RAPIDJSON_CONFIG + \brief provide custom rapidjson namespace (opening expression) + \see RAPIDJSON_NAMESPACE +*/ +/*! \def RAPIDJSON_NAMESPACE_END + \ingroup RAPIDJSON_CONFIG + \brief provide custom rapidjson namespace (closing expression) + \see RAPIDJSON_NAMESPACE +*/ +#ifndef RAPIDJSON_NAMESPACE +#define RAPIDJSON_NAMESPACE rapidjson +#endif +#ifndef RAPIDJSON_NAMESPACE_BEGIN +#define RAPIDJSON_NAMESPACE_BEGIN namespace RAPIDJSON_NAMESPACE { +#endif +#ifndef RAPIDJSON_NAMESPACE_END +#define RAPIDJSON_NAMESPACE_END } +#endif + +/////////////////////////////////////////////////////////////////////////////// +// RAPIDJSON_HAS_STDSTRING + +#ifndef RAPIDJSON_HAS_STDSTRING +#ifdef RAPIDJSON_DOXYGEN_RUNNING +#define RAPIDJSON_HAS_STDSTRING 1 // force generation of documentation +#else +#define RAPIDJSON_HAS_STDSTRING 0 // no std::string support by default +#endif +/*! 
\def RAPIDJSON_HAS_STDSTRING + \ingroup RAPIDJSON_CONFIG + \brief Enable RapidJSON support for \c std::string + + By defining this preprocessor symbol to \c 1, several convenience functions for using + \ref rapidjson::GenericValue with \c std::string are enabled, especially + for construction and comparison. + + \hideinitializer +*/ +#endif // !defined(RAPIDJSON_HAS_STDSTRING) + +#if RAPIDJSON_HAS_STDSTRING +#include +#endif // RAPIDJSON_HAS_STDSTRING + +/////////////////////////////////////////////////////////////////////////////// +// RAPIDJSON_NO_INT64DEFINE + +/*! \def RAPIDJSON_NO_INT64DEFINE + \ingroup RAPIDJSON_CONFIG + \brief Use external 64-bit integer types. + + RapidJSON requires the 64-bit integer types \c int64_t and \c uint64_t types + to be available at global scope. + + If users have their own definition, define RAPIDJSON_NO_INT64DEFINE to + prevent RapidJSON from defining its own types. +*/ +#ifndef RAPIDJSON_NO_INT64DEFINE +//!@cond RAPIDJSON_HIDDEN_FROM_DOXYGEN +#if defined(_MSC_VER) && (_MSC_VER < 1800) // Visual Studio 2013 +#include "msinttypes/stdint.h" +#include "msinttypes/inttypes.h" +#else +// Other compilers should have this. +#include +#include +#endif +//!@endcond +#ifdef RAPIDJSON_DOXYGEN_RUNNING +#define RAPIDJSON_NO_INT64DEFINE +#endif +#endif // RAPIDJSON_NO_INT64TYPEDEF + +/////////////////////////////////////////////////////////////////////////////// +// RAPIDJSON_FORCEINLINE + +#ifndef RAPIDJSON_FORCEINLINE +//!@cond RAPIDJSON_HIDDEN_FROM_DOXYGEN +#if defined(_MSC_VER) && defined(NDEBUG) +#define RAPIDJSON_FORCEINLINE __forceinline +#elif defined(__GNUC__) && __GNUC__ >= 4 && defined(NDEBUG) +#define RAPIDJSON_FORCEINLINE __attribute__((always_inline)) +#else +#define RAPIDJSON_FORCEINLINE +#endif +//!@endcond +#endif // RAPIDJSON_FORCEINLINE + +/////////////////////////////////////////////////////////////////////////////// +// RAPIDJSON_ENDIAN +#define RAPIDJSON_LITTLEENDIAN 0 //!< Little endian machine +#define RAPIDJSON_BIGENDIAN 1 //!< Big endian machine + +//! Endianness of the machine. +/*! + \def RAPIDJSON_ENDIAN + \ingroup RAPIDJSON_CONFIG + + GCC 4.6 provided macro for detecting endianness of the target machine. But other + compilers may not have this. User can define RAPIDJSON_ENDIAN to either + \ref RAPIDJSON_LITTLEENDIAN or \ref RAPIDJSON_BIGENDIAN. + + Default detection implemented with reference to + \li https://gcc.gnu.org/onlinedocs/gcc-4.6.0/cpp/Common-Predefined-Macros.html + \li http://www.boost.org/doc/libs/1_42_0/boost/detail/endian.hpp +*/ +#ifndef RAPIDJSON_ENDIAN +// Detect with GCC 4.6's macro +# ifdef __BYTE_ORDER__ +# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +# define RAPIDJSON_ENDIAN RAPIDJSON_LITTLEENDIAN +# elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# define RAPIDJSON_ENDIAN RAPIDJSON_BIGENDIAN +# else +# error Unknown machine endianness detected. User needs to define RAPIDJSON_ENDIAN. +# endif // __BYTE_ORDER__ +// Detect with GLIBC's endian.h +# elif defined(__GLIBC__) +# include +# if (__BYTE_ORDER == __LITTLE_ENDIAN) +# define RAPIDJSON_ENDIAN RAPIDJSON_LITTLEENDIAN +# elif (__BYTE_ORDER == __BIG_ENDIAN) +# define RAPIDJSON_ENDIAN RAPIDJSON_BIGENDIAN +# else +# error Unknown machine endianness detected. User needs to define RAPIDJSON_ENDIAN. 
+# endif // __GLIBC__
+// Detect with _LITTLE_ENDIAN and _BIG_ENDIAN macro
+# elif defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN)
+# define RAPIDJSON_ENDIAN RAPIDJSON_LITTLEENDIAN
+# elif defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN)
+# define RAPIDJSON_ENDIAN RAPIDJSON_BIGENDIAN
+// Detect with architecture macros
+# elif defined(__sparc) || defined(__sparc__) || defined(_POWER) || defined(__powerpc__) || defined(__ppc__) || defined(__hpux) || defined(__hppa) || defined(_MIPSEB) || defined(_POWER) || defined(__s390__)
+# define RAPIDJSON_ENDIAN RAPIDJSON_BIGENDIAN
+# elif defined(__i386__) || defined(__alpha__) || defined(__ia64) || defined(__ia64__) || defined(_M_IX86) || defined(_M_IA64) || defined(_M_ALPHA) || defined(__amd64) || defined(__amd64__) || defined(_M_AMD64) || defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) || defined(__bfin__)
+# define RAPIDJSON_ENDIAN RAPIDJSON_LITTLEENDIAN
+# elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
+# define RAPIDJSON_ENDIAN RAPIDJSON_LITTLEENDIAN
+# elif defined(RAPIDJSON_DOXYGEN_RUNNING)
+# define RAPIDJSON_ENDIAN
+# else
+# error Unknown machine endianness detected. User needs to define RAPIDJSON_ENDIAN.
+# endif
+#endif // RAPIDJSON_ENDIAN
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_64BIT
+
+//! Whether using 64-bit architecture
+#ifndef RAPIDJSON_64BIT
+#if defined(__LP64__) || (defined(__x86_64__) && defined(__ILP32__)) || defined(_WIN64) || defined(__EMSCRIPTEN__)
+#define RAPIDJSON_64BIT 1
+#else
+#define RAPIDJSON_64BIT 0
+#endif
+#endif // RAPIDJSON_64BIT
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_ALIGN
+
+//! Data alignment of the machine.
+/*! \ingroup RAPIDJSON_CONFIG
+    \param x pointer to align
+
+    Some machines require strict data alignment. The default is 8 bytes.
+    User can customize it by defining the RAPIDJSON_ALIGN function macro.
+*/
+#ifndef RAPIDJSON_ALIGN
+#define RAPIDJSON_ALIGN(x) (((x) + static_cast<size_t>(7u)) & ~static_cast<size_t>(7u))
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_UINT64_C2
+
+//! Construct a 64-bit literal by a pair of 32-bit integers.
+/*!
+    A 64-bit literal with or without the ULL suffix is prone to compiler warnings.
+    UINT64_C() is a C macro which can cause compilation problems.
+    Use this macro to define 64-bit constants by a pair of 32-bit integers.
+*/
+#ifndef RAPIDJSON_UINT64_C2
+#define RAPIDJSON_UINT64_C2(high32, low32) ((static_cast<uint64_t>(high32) << 32) | static_cast<uint64_t>(low32))
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_48BITPOINTER_OPTIMIZATION
+
+//! Use only lower 48-bit address for some pointers.
+/*!
+    \ingroup RAPIDJSON_CONFIG
+
+    This optimization uses the fact that the current X86-64 architecture only implements the lower 48 bits of a virtual address.
+    The higher 16 bits can be used for storing other data.
+    \c GenericValue uses this optimization to reduce its size from 24 bytes to 16 bytes in 64-bit architecture.
+*/
+#ifndef RAPIDJSON_48BITPOINTER_OPTIMIZATION
+#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
+#define RAPIDJSON_48BITPOINTER_OPTIMIZATION 1
+#else
+#define RAPIDJSON_48BITPOINTER_OPTIMIZATION 0
+#endif
+#endif // RAPIDJSON_48BITPOINTER_OPTIMIZATION
+
+#if RAPIDJSON_48BITPOINTER_OPTIMIZATION == 1
+#if RAPIDJSON_64BIT != 1
+#error RAPIDJSON_48BITPOINTER_OPTIMIZATION can only be set to 1 when RAPIDJSON_64BIT=1
+#endif
+#define RAPIDJSON_SETPOINTER(type, p, x) (p = reinterpret_cast<type *>((reinterpret_cast<uintptr_t>(p) & static_cast<uintptr_t>(RAPIDJSON_UINT64_C2(0xFFFF0000, 0x00000000))) | reinterpret_cast<uintptr_t>(reinterpret_cast<const void*>(x))))
+#define RAPIDJSON_GETPOINTER(type, p) (reinterpret_cast<type *>(reinterpret_cast<uintptr_t>(p) & static_cast<uintptr_t>(RAPIDJSON_UINT64_C2(0x0000FFFF, 0xFFFFFFFF))))
+#else
+#define RAPIDJSON_SETPOINTER(type, p, x) (p = (x))
+#define RAPIDJSON_GETPOINTER(type, p) (p)
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_SSE2/RAPIDJSON_SSE42/RAPIDJSON_NEON/RAPIDJSON_SIMD
+
+/*! \def RAPIDJSON_SIMD
+    \ingroup RAPIDJSON_CONFIG
+    \brief Enable SSE2/SSE4.2/NEON optimization.
+
+    RapidJSON supports optimized implementations for some parsing operations
+    based on the SSE2, SSE4.2 or NEON SIMD extensions on modern Intel
+    or ARM compatible processors.
+
+    To enable these optimizations, three different symbols can be defined:
+    \code
+    // Enable SSE2 optimization.
+    #define RAPIDJSON_SSE2
+
+    // Enable SSE4.2 optimization.
+    #define RAPIDJSON_SSE42
+
+    // Enable ARM NEON optimization.
+    #define RAPIDJSON_NEON
+    \endcode
+
+    \c RAPIDJSON_SSE42 takes precedence over SSE2, if both are defined.
+
+    If any of these symbols is defined, RapidJSON defines the macro
+    \c RAPIDJSON_SIMD to indicate the availability of the optimized code.
+*/
+#if defined(RAPIDJSON_SSE2) || defined(RAPIDJSON_SSE42) \
+    || defined(RAPIDJSON_NEON) || defined(RAPIDJSON_DOXYGEN_RUNNING)
+#define RAPIDJSON_SIMD
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_NO_SIZETYPEDEFINE
+
+#ifndef RAPIDJSON_NO_SIZETYPEDEFINE
+/*! \def RAPIDJSON_NO_SIZETYPEDEFINE
+    \ingroup RAPIDJSON_CONFIG
+    \brief User-provided \c SizeType definition.
+
+    In order to avoid using 32-bit size types for indexing strings and arrays,
+    define this preprocessor symbol and provide the type rapidjson::SizeType
+    before including RapidJSON:
+    \code
+    #define RAPIDJSON_NO_SIZETYPEDEFINE
+    namespace rapidjson { typedef ::std::size_t SizeType; }
+    #include "rapidjson/..."
+    \endcode
+
+    \see rapidjson::SizeType
+*/
+#ifdef RAPIDJSON_DOXYGEN_RUNNING
+#define RAPIDJSON_NO_SIZETYPEDEFINE
+#endif
+RAPIDJSON_NAMESPACE_BEGIN
+//! Size type (for string lengths, array sizes, etc.)
+/*! RapidJSON uses 32-bit array/string indices even on 64-bit platforms,
+    instead of using \c size_t. Users may override the SizeType by defining
+    \ref RAPIDJSON_NO_SIZETYPEDEFINE.
+*/
+typedef unsigned SizeType;
+RAPIDJSON_NAMESPACE_END
+#endif
+
+// always import std::size_t to rapidjson namespace
+RAPIDJSON_NAMESPACE_BEGIN
+using std::size_t;
+RAPIDJSON_NAMESPACE_END
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_ASSERT
+
+//! Assertion.
+/*! \ingroup RAPIDJSON_CONFIG
+    By default, rapidjson uses C \c assert() for internal assertions.
+    User can override it by defining the RAPIDJSON_ASSERT(x) macro.
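+
+    \par
+    One possible override, as a sketch (throwing instead of aborting; the
+    exception type used here is an assumption, not part of RapidJSON):
+    \code
+    #include <stdexcept>
+    #define RAPIDJSON_ASSERT(x) \
+        ((x) ? (void)0 : throw std::logic_error("rapidjson assert: " #x))
+    #include "rapidjson/rapidjson.h" // the override must precede any RapidJSON include
+    \endcode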
+
+    \note Parsing errors are handled and can be customized by the
+    \ref RAPIDJSON_ERRORS APIs.
+*/
+#ifndef RAPIDJSON_ASSERT
+#include <cassert>
+#define RAPIDJSON_ASSERT(x) assert(x)
+#endif // RAPIDJSON_ASSERT
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_STATIC_ASSERT
+
+// Prefer C++11 static_assert, if available
+#ifndef RAPIDJSON_STATIC_ASSERT
+#if __cplusplus >= 201103L || ( defined(_MSC_VER) && _MSC_VER >= 1800 )
+#define RAPIDJSON_STATIC_ASSERT(x) \
+   static_assert(x, RAPIDJSON_STRINGIFY(x))
+#endif // C++11
+#endif // RAPIDJSON_STATIC_ASSERT
+
+// Adopt C++03 implementation from boost
+#ifndef RAPIDJSON_STATIC_ASSERT
+#ifndef __clang__
+//!@cond RAPIDJSON_HIDDEN_FROM_DOXYGEN
+#endif
+RAPIDJSON_NAMESPACE_BEGIN
+template <bool x> struct STATIC_ASSERTION_FAILURE;
+template <> struct STATIC_ASSERTION_FAILURE<true> { enum { value = 1 }; };
+template <size_t x> struct StaticAssertTest {};
+RAPIDJSON_NAMESPACE_END
+
+#if defined(__GNUC__) || defined(__clang__)
+#define RAPIDJSON_STATIC_ASSERT_UNUSED_ATTRIBUTE __attribute__((unused))
+#else
+#define RAPIDJSON_STATIC_ASSERT_UNUSED_ATTRIBUTE
+#endif
+#ifndef __clang__
+//!@endcond
+#endif
+
+/*! \def RAPIDJSON_STATIC_ASSERT
+    \brief (Internal) macro to check for conditions at compile-time
+    \param x compile-time condition
+    \hideinitializer
+ */
+#define RAPIDJSON_STATIC_ASSERT(x) \
+    typedef ::RAPIDJSON_NAMESPACE::StaticAssertTest< \
+      sizeof(::RAPIDJSON_NAMESPACE::STATIC_ASSERTION_FAILURE<bool(x)>)> \
+    RAPIDJSON_JOIN(StaticAssertTypedef, __LINE__) RAPIDJSON_STATIC_ASSERT_UNUSED_ATTRIBUTE
+#endif // RAPIDJSON_STATIC_ASSERT
+
+///////////////////////////////////////////////////////////////////////////////
+// RAPIDJSON_LIKELY, RAPIDJSON_UNLIKELY
+
+//! Compiler branching hint for expression with high probability to be true.
+/*!
+    \ingroup RAPIDJSON_CONFIG
+    \param x Boolean expression likely to be true.
+*/
+#ifndef RAPIDJSON_LIKELY
+#if defined(__GNUC__) || defined(__clang__)
+#define RAPIDJSON_LIKELY(x) __builtin_expect(!!(x), 1)
+#else
+#define RAPIDJSON_LIKELY(x) (x)
+#endif
+#endif
+
+//! Compiler branching hint for expression with low probability to be true.
+/*!
+    \ingroup RAPIDJSON_CONFIG
+    \param x Boolean expression unlikely to be true.
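+
+    \par
+    For example, a hot loop may annotate its rare error branch
+    (a sketch; \c s and \c HandleError() are hypothetical):
+    \code
+    if (RAPIDJSON_UNLIKELY(s.Peek() == '\0'))
+        return HandleError();
+    \endcode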
+*/ +#ifndef RAPIDJSON_UNLIKELY +#if defined(__GNUC__) || defined(__clang__) +#define RAPIDJSON_UNLIKELY(x) __builtin_expect(!!(x), 0) +#else +#define RAPIDJSON_UNLIKELY(x) (x) +#endif +#endif + +/////////////////////////////////////////////////////////////////////////////// +// Helpers + +//!@cond RAPIDJSON_HIDDEN_FROM_DOXYGEN + +#define RAPIDJSON_MULTILINEMACRO_BEGIN do { +#define RAPIDJSON_MULTILINEMACRO_END \ +} while((void)0, 0) + +// adopted from Boost +#define RAPIDJSON_VERSION_CODE(x,y,z) \ + (((x)*100000) + ((y)*100) + (z)) + +/////////////////////////////////////////////////////////////////////////////// +// RAPIDJSON_DIAG_PUSH/POP, RAPIDJSON_DIAG_OFF + +#if defined(__GNUC__) +#define RAPIDJSON_GNUC \ + RAPIDJSON_VERSION_CODE(__GNUC__,__GNUC_MINOR__,__GNUC_PATCHLEVEL__) +#endif + +#if defined(__clang__) || (defined(RAPIDJSON_GNUC) && RAPIDJSON_GNUC >= RAPIDJSON_VERSION_CODE(4,2,0)) + +#define RAPIDJSON_PRAGMA(x) _Pragma(RAPIDJSON_STRINGIFY(x)) +#define RAPIDJSON_DIAG_PRAGMA(x) RAPIDJSON_PRAGMA(GCC diagnostic x) +#define RAPIDJSON_DIAG_OFF(x) \ + RAPIDJSON_DIAG_PRAGMA(ignored RAPIDJSON_STRINGIFY(RAPIDJSON_JOIN(-W,x))) + +// push/pop support in Clang and GCC>=4.6 +#if defined(__clang__) || (defined(RAPIDJSON_GNUC) && RAPIDJSON_GNUC >= RAPIDJSON_VERSION_CODE(4,6,0)) +#define RAPIDJSON_DIAG_PUSH RAPIDJSON_DIAG_PRAGMA(push) +#define RAPIDJSON_DIAG_POP RAPIDJSON_DIAG_PRAGMA(pop) +#else // GCC >= 4.2, < 4.6 +#define RAPIDJSON_DIAG_PUSH /* ignored */ +#define RAPIDJSON_DIAG_POP /* ignored */ +#endif + +#elif defined(_MSC_VER) + +// pragma (MSVC specific) +#define RAPIDJSON_PRAGMA(x) __pragma(x) +#define RAPIDJSON_DIAG_PRAGMA(x) RAPIDJSON_PRAGMA(warning(x)) + +#define RAPIDJSON_DIAG_OFF(x) RAPIDJSON_DIAG_PRAGMA(disable: x) +#define RAPIDJSON_DIAG_PUSH RAPIDJSON_DIAG_PRAGMA(push) +#define RAPIDJSON_DIAG_POP RAPIDJSON_DIAG_PRAGMA(pop) + +#else + +#define RAPIDJSON_DIAG_OFF(x) /* ignored */ +#define RAPIDJSON_DIAG_PUSH /* ignored */ +#define RAPIDJSON_DIAG_POP /* ignored */ + +#endif // RAPIDJSON_DIAG_* + +/////////////////////////////////////////////////////////////////////////////// +// C++11 features + +#ifndef RAPIDJSON_HAS_CXX11_RVALUE_REFS +#if defined(__clang__) +#if __has_feature(cxx_rvalue_references) && \ + (defined(_MSC_VER) || defined(_LIBCPP_VERSION) || defined(__GLIBCXX__) && __GLIBCXX__ >= 20080306) +#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 1 +#else +#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 0 +#endif +#elif (defined(RAPIDJSON_GNUC) && (RAPIDJSON_GNUC >= RAPIDJSON_VERSION_CODE(4,3,0)) && defined(__GXX_EXPERIMENTAL_CXX0X__)) || \ + (defined(_MSC_VER) && _MSC_VER >= 1600) || \ + (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x5140 && defined(__GXX_EXPERIMENTAL_CXX0X__)) + +#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 1 +#else +#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 0 +#endif +#endif // RAPIDJSON_HAS_CXX11_RVALUE_REFS + +#ifndef RAPIDJSON_HAS_CXX11_NOEXCEPT +#if defined(__clang__) +#define RAPIDJSON_HAS_CXX11_NOEXCEPT __has_feature(cxx_noexcept) +#elif (defined(RAPIDJSON_GNUC) && (RAPIDJSON_GNUC >= RAPIDJSON_VERSION_CODE(4,6,0)) && defined(__GXX_EXPERIMENTAL_CXX0X__)) || \ + (defined(_MSC_VER) && _MSC_VER >= 1900) || \ + (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x5140 && defined(__GXX_EXPERIMENTAL_CXX0X__)) +#define RAPIDJSON_HAS_CXX11_NOEXCEPT 1 +#else +#define RAPIDJSON_HAS_CXX11_NOEXCEPT 0 +#endif +#endif +#if RAPIDJSON_HAS_CXX11_NOEXCEPT +#define RAPIDJSON_NOEXCEPT noexcept +#else +#define RAPIDJSON_NOEXCEPT /* noexcept */ +#endif // RAPIDJSON_HAS_CXX11_NOEXCEPT + +// no automatic 
detection, yet +#ifndef RAPIDJSON_HAS_CXX11_TYPETRAITS +#if (defined(_MSC_VER) && _MSC_VER >= 1700) +#define RAPIDJSON_HAS_CXX11_TYPETRAITS 1 +#else +#define RAPIDJSON_HAS_CXX11_TYPETRAITS 0 +#endif +#endif + +#ifndef RAPIDJSON_HAS_CXX11_RANGE_FOR +#if defined(__clang__) +#define RAPIDJSON_HAS_CXX11_RANGE_FOR __has_feature(cxx_range_for) +#elif (defined(RAPIDJSON_GNUC) && (RAPIDJSON_GNUC >= RAPIDJSON_VERSION_CODE(4,6,0)) && defined(__GXX_EXPERIMENTAL_CXX0X__)) || \ + (defined(_MSC_VER) && _MSC_VER >= 1700) || \ + (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x5140 && defined(__GXX_EXPERIMENTAL_CXX0X__)) +#define RAPIDJSON_HAS_CXX11_RANGE_FOR 1 +#else +#define RAPIDJSON_HAS_CXX11_RANGE_FOR 0 +#endif +#endif // RAPIDJSON_HAS_CXX11_RANGE_FOR + +//!@endcond + +//! Assertion (in non-throwing contexts). + /*! \ingroup RAPIDJSON_CONFIG + Some functions provide a \c noexcept guarantee, if the compiler supports it. + In these cases, the \ref RAPIDJSON_ASSERT macro cannot be overridden to + throw an exception. This macro adds a separate customization point for + such cases. + + Defaults to C \c assert() (as \ref RAPIDJSON_ASSERT), if \c noexcept is + supported, and to \ref RAPIDJSON_ASSERT otherwise. + */ + +/////////////////////////////////////////////////////////////////////////////// +// RAPIDJSON_NOEXCEPT_ASSERT + +#ifdef RAPIDJSON_ASSERT_THROWS +#if RAPIDJSON_HAS_CXX11_NOEXCEPT +#define RAPIDJSON_NOEXCEPT_ASSERT(x) +#else +#define RAPIDJSON_NOEXCEPT_ASSERT(x) RAPIDJSON_ASSERT(x) +#endif // RAPIDJSON_HAS_CXX11_NOEXCEPT +#else +#define RAPIDJSON_NOEXCEPT_ASSERT(x) RAPIDJSON_ASSERT(x) +#endif // RAPIDJSON_ASSERT_THROWS + +/////////////////////////////////////////////////////////////////////////////// +// new/delete + +#ifndef RAPIDJSON_NEW +///! customization point for global \c new +#define RAPIDJSON_NEW(TypeName) new TypeName +#endif +#ifndef RAPIDJSON_DELETE +///! customization point for global \c delete +#define RAPIDJSON_DELETE(x) delete x +#endif + +/////////////////////////////////////////////////////////////////////////////// +// Type + +/*! \namespace rapidjson + \brief main RapidJSON namespace + \see RAPIDJSON_NAMESPACE +*/ +RAPIDJSON_NAMESPACE_BEGIN + +//! Type of JSON value +enum Type { + kNullType = 0, //!< null + kFalseType = 1, //!< false + kTrueType = 2, //!< true + kObjectType = 3, //!< object + kArrayType = 4, //!< array + kStringType = 5, //!< string + kNumberType = 6 //!< number +}; + +RAPIDJSON_NAMESPACE_END + +#endif // RAPIDJSON_RAPIDJSON_H_ diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/reader.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/reader.h new file mode 100644 index 0000000..44a6bcd --- /dev/null +++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/reader.h @@ -0,0 +1,2230 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#ifndef RAPIDJSON_READER_H_ +#define RAPIDJSON_READER_H_ + +/*! \file reader.h */ + +#include "allocators.h" +#include "stream.h" +#include "encodedstream.h" +#include "internal/meta.h" +#include "internal/stack.h" +#include "internal/strtod.h" +#include + +#if defined(RAPIDJSON_SIMD) && defined(_MSC_VER) +#include +#pragma intrinsic(_BitScanForward) +#endif +#ifdef RAPIDJSON_SSE42 +#include +#elif defined(RAPIDJSON_SSE2) +#include +#elif defined(RAPIDJSON_NEON) +#include +#endif + +#ifdef __clang__ +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(old-style-cast) +RAPIDJSON_DIAG_OFF(padded) +RAPIDJSON_DIAG_OFF(switch-enum) +#elif defined(_MSC_VER) +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(4127) // conditional expression is constant +RAPIDJSON_DIAG_OFF(4702) // unreachable code +#endif + +#ifdef __GNUC__ +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(effc++) +#endif + +//!@cond RAPIDJSON_HIDDEN_FROM_DOXYGEN +#define RAPIDJSON_NOTHING /* deliberately empty */ +#ifndef RAPIDJSON_PARSE_ERROR_EARLY_RETURN +#define RAPIDJSON_PARSE_ERROR_EARLY_RETURN(value) \ + RAPIDJSON_MULTILINEMACRO_BEGIN \ + if (RAPIDJSON_UNLIKELY(HasParseError())) { return value; } \ + RAPIDJSON_MULTILINEMACRO_END +#endif +#define RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID \ + RAPIDJSON_PARSE_ERROR_EARLY_RETURN(RAPIDJSON_NOTHING) +//!@endcond + +/*! \def RAPIDJSON_PARSE_ERROR_NORETURN + \ingroup RAPIDJSON_ERRORS + \brief Macro to indicate a parse error. + \param parseErrorCode \ref rapidjson::ParseErrorCode of the error + \param offset position of the error in JSON input (\c size_t) + + This macros can be used as a customization point for the internal + error handling mechanism of RapidJSON. + + A common usage model is to throw an exception instead of requiring the + caller to explicitly check the \ref rapidjson::GenericReader::Parse's + return value: + + \code + #define RAPIDJSON_PARSE_ERROR_NORETURN(parseErrorCode,offset) \ + throw ParseException(parseErrorCode, #parseErrorCode, offset) + + #include // std::runtime_error + #include "rapidjson/error/error.h" // rapidjson::ParseResult + + struct ParseException : std::runtime_error, rapidjson::ParseResult { + ParseException(rapidjson::ParseErrorCode code, const char* msg, size_t offset) + : std::runtime_error(msg), ParseResult(code, offset) {} + }; + + #include "rapidjson/reader.h" + \endcode + + \see RAPIDJSON_PARSE_ERROR, rapidjson::GenericReader::Parse + */ +#ifndef RAPIDJSON_PARSE_ERROR_NORETURN +#define RAPIDJSON_PARSE_ERROR_NORETURN(parseErrorCode, offset) \ + RAPIDJSON_MULTILINEMACRO_BEGIN \ + RAPIDJSON_ASSERT(!HasParseError()); /* Error can only be assigned once */ \ + SetParseError(parseErrorCode, offset); \ + RAPIDJSON_MULTILINEMACRO_END +#endif + +/*! \def RAPIDJSON_PARSE_ERROR + \ingroup RAPIDJSON_ERRORS + \brief (Internal) macro to indicate and handle a parse error. + \param parseErrorCode \ref rapidjson::ParseErrorCode of the error + \param offset position of the error in JSON input (\c size_t) + + Invokes RAPIDJSON_PARSE_ERROR_NORETURN and stops the parsing. + + \see RAPIDJSON_PARSE_ERROR_NORETURN + \hideinitializer + */ +#ifndef RAPIDJSON_PARSE_ERROR +#define RAPIDJSON_PARSE_ERROR(parseErrorCode, offset) \ + RAPIDJSON_MULTILINEMACRO_BEGIN \ + RAPIDJSON_PARSE_ERROR_NORETURN(parseErrorCode, offset); \ + RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; \ + RAPIDJSON_MULTILINEMACRO_END +#endif + +#include "error/error.h" // ParseErrorCode, ParseResult + +RAPIDJSON_NAMESPACE_BEGIN + +/////////////////////////////////////////////////////////////////////////////// +// ParseFlag + +/*! 
\def RAPIDJSON_PARSE_DEFAULT_FLAGS + \ingroup RAPIDJSON_CONFIG + \brief User-defined kParseDefaultFlags definition. + + User can define this as any \c ParseFlag combinations. +*/ +#ifndef RAPIDJSON_PARSE_DEFAULT_FLAGS +#define RAPIDJSON_PARSE_DEFAULT_FLAGS kParseNoFlags +#endif + +//! Combination of parseFlags +/*! \see Reader::Parse, Document::Parse, Document::ParseInsitu, Document::ParseStream + */ +enum ParseFlag { + kParseNoFlags = 0, //!< No flags are set. + kParseInsituFlag = 1, //!< In-situ(destructive) parsing. + kParseValidateEncodingFlag = 2, //!< Validate encoding of JSON strings. + kParseIterativeFlag = 4, //!< Iterative(constant complexity in terms of function call stack size) parsing. + kParseStopWhenDoneFlag = 8, //!< After parsing a complete JSON root from stream, stop further processing the rest of stream. When this flag is used, parser will not generate kParseErrorDocumentRootNotSingular error. + kParseFullPrecisionFlag = 16, //!< Parse number in full precision (but slower). + kParseCommentsFlag = 32, //!< Allow one-line (//) and multi-line (/**/) comments. + kParseNumbersAsStringsFlag = 64, //!< Parse all numbers (ints/doubles) as strings. + kParseTrailingCommasFlag = 128, //!< Allow trailing commas at the end of objects and arrays. + kParseNanAndInfFlag = 256, //!< Allow parsing NaN, Inf, Infinity, -Inf and -Infinity as doubles. + kParseDefaultFlags = RAPIDJSON_PARSE_DEFAULT_FLAGS //!< Default parse flags. Can be customized by defining RAPIDJSON_PARSE_DEFAULT_FLAGS +}; + +/////////////////////////////////////////////////////////////////////////////// +// Handler + +/*! \class rapidjson::Handler + \brief Concept for receiving events from GenericReader upon parsing. + The functions return true if no error occurs. If they return false, + the event publisher should terminate the process. +\code +concept Handler { + typename Ch; + + bool Null(); + bool Bool(bool b); + bool Int(int i); + bool Uint(unsigned i); + bool Int64(int64_t i); + bool Uint64(uint64_t i); + bool Double(double d); + /// enabled via kParseNumbersAsStringsFlag, string is not null-terminated (use length) + bool RawNumber(const Ch* str, SizeType length, bool copy); + bool String(const Ch* str, SizeType length, bool copy); + bool StartObject(); + bool Key(const Ch* str, SizeType length, bool copy); + bool EndObject(SizeType memberCount); + bool StartArray(); + bool EndArray(SizeType elementCount); +}; +\endcode +*/ +/////////////////////////////////////////////////////////////////////////////// +// BaseReaderHandler + +//! Default implementation of Handler. +/*! This can be used as base class of any reader handler. 
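+
+    \par
+    A minimal sketch of a derived handler (the \c StringCounter name is
+    illustrative, not part of the library):
+    \code
+    struct StringCounter : BaseReaderHandler<UTF8<>, StringCounter> {
+        StringCounter() : count(0) {}
+        bool String(const Ch*, SizeType, bool) { ++count; return true; }
+        size_t count; // every other event falls through to Default()
+    };
+    \endcode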
+    \note implements Handler concept
+*/
+template<typename Encoding = UTF8<>, typename Derived = void>
+struct BaseReaderHandler {
+    typedef typename Encoding::Ch Ch;
+
+    typedef typename internal::SelectIf<internal::IsSame<Derived, void>, BaseReaderHandler, Derived>::Type Override;
+
+    bool Default() { return true; }
+    bool Null() { return static_cast<Override&>(*this).Default(); }
+    bool Bool(bool) { return static_cast<Override&>(*this).Default(); }
+    bool Int(int) { return static_cast<Override&>(*this).Default(); }
+    bool Uint(unsigned) { return static_cast<Override&>(*this).Default(); }
+    bool Int64(int64_t) { return static_cast<Override&>(*this).Default(); }
+    bool Uint64(uint64_t) { return static_cast<Override&>(*this).Default(); }
+    bool Double(double) { return static_cast<Override&>(*this).Default(); }
+    /// enabled via kParseNumbersAsStringsFlag, string is not null-terminated (use length)
+    bool RawNumber(const Ch* str, SizeType len, bool copy) { return static_cast<Override&>(*this).String(str, len, copy); }
+    bool String(const Ch*, SizeType, bool) { return static_cast<Override&>(*this).Default(); }
+    bool StartObject() { return static_cast<Override&>(*this).Default(); }
+    bool Key(const Ch* str, SizeType len, bool copy) { return static_cast<Override&>(*this).String(str, len, copy); }
+    bool EndObject(SizeType) { return static_cast<Override&>(*this).Default(); }
+    bool StartArray() { return static_cast<Override&>(*this).Default(); }
+    bool EndArray(SizeType) { return static_cast<Override&>(*this).Default(); }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// StreamLocalCopy
+
+namespace internal {
+
+template<typename Stream, int = StreamTraits<Stream>::copyOptimization>
+class StreamLocalCopy;
+
+//! Do copy optimization.
+template<typename Stream>
+class StreamLocalCopy<Stream, 1> {
+public:
+    StreamLocalCopy(Stream& original) : s(original), original_(original) {}
+    ~StreamLocalCopy() { original_ = s; }
+
+    Stream s;
+
+private:
+    StreamLocalCopy& operator=(const StreamLocalCopy&) /* = delete */;
+
+    Stream& original_;
+};
+
+//! Keep reference.
+template<typename Stream>
+class StreamLocalCopy<Stream, 0> {
+public:
+    StreamLocalCopy(Stream& original) : s(original) {}
+
+    Stream& s;
+
+private:
+    StreamLocalCopy& operator=(const StreamLocalCopy&) /* = delete */;
+};
+
+} // namespace internal
+
+///////////////////////////////////////////////////////////////////////////////
+// SkipWhitespace
+
+//! Skip the JSON white spaces in a stream.
+/*! \param is An input stream for skipping white spaces.
+    \note This function has SSE2/SSE4.2 specialization.
+*/
+template<typename InputStream>
+void SkipWhitespace(InputStream& is) {
+    internal::StreamLocalCopy<InputStream> copy(is);
+    InputStream& s(copy.s);
+
+    typename InputStream::Ch c;
+    while ((c = s.Peek()) == ' ' || c == '\n' || c == '\r' || c == '\t')
+        s.Take();
+}
+
+inline const char* SkipWhitespace(const char* p, const char* end) {
+    while (p != end && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t'))
+        ++p;
+    return p;
+}
+
+#ifdef RAPIDJSON_SSE42
+//! Skip whitespace with SSE 4.2 pcmpistrm instruction, testing 16 8-bit characters at once.
+inline const char *SkipWhitespace_SIMD(const char* p) { + // Fast return for single non-whitespace + if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t') + ++p; + else + return p; + + // 16-byte align to the next boundary + const char* nextAligned = reinterpret_cast((reinterpret_cast(p) + 15) & static_cast(~15)); + while (p != nextAligned) + if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t') + ++p; + else + return p; + + // The rest of string using SIMD + static const char whitespace[16] = " \n\r\t"; + const __m128i w = _mm_loadu_si128(reinterpret_cast(&whitespace[0])); + + for (;; p += 16) { + const __m128i s = _mm_load_si128(reinterpret_cast(p)); + const int r = _mm_cmpistri(w, s, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT | _SIDD_NEGATIVE_POLARITY); + if (r != 16) // some of characters is non-whitespace + return p + r; + } +} + +inline const char *SkipWhitespace_SIMD(const char* p, const char* end) { + // Fast return for single non-whitespace + if (p != end && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t')) + ++p; + else + return p; + + // The middle of string using SIMD + static const char whitespace[16] = " \n\r\t"; + const __m128i w = _mm_loadu_si128(reinterpret_cast(&whitespace[0])); + + for (; p <= end - 16; p += 16) { + const __m128i s = _mm_loadu_si128(reinterpret_cast(p)); + const int r = _mm_cmpistri(w, s, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT | _SIDD_NEGATIVE_POLARITY); + if (r != 16) // some of characters is non-whitespace + return p + r; + } + + return SkipWhitespace(p, end); +} + +#elif defined(RAPIDJSON_SSE2) + +//! Skip whitespace with SSE2 instructions, testing 16 8-byte characters at once. +inline const char *SkipWhitespace_SIMD(const char* p) { + // Fast return for single non-whitespace + if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t') + ++p; + else + return p; + + // 16-byte align to the next boundary + const char* nextAligned = reinterpret_cast((reinterpret_cast(p) + 15) & static_cast(~15)); + while (p != nextAligned) + if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t') + ++p; + else + return p; + + // The rest of string + #define C16(c) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c } + static const char whitespaces[4][16] = { C16(' '), C16('\n'), C16('\r'), C16('\t') }; + #undef C16 + + const __m128i w0 = _mm_loadu_si128(reinterpret_cast(&whitespaces[0][0])); + const __m128i w1 = _mm_loadu_si128(reinterpret_cast(&whitespaces[1][0])); + const __m128i w2 = _mm_loadu_si128(reinterpret_cast(&whitespaces[2][0])); + const __m128i w3 = _mm_loadu_si128(reinterpret_cast(&whitespaces[3][0])); + + for (;; p += 16) { + const __m128i s = _mm_load_si128(reinterpret_cast(p)); + __m128i x = _mm_cmpeq_epi8(s, w0); + x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w1)); + x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w2)); + x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w3)); + unsigned short r = static_cast(~_mm_movemask_epi8(x)); + if (r != 0) { // some of characters may be non-whitespace +#ifdef _MSC_VER // Find the index of first non-whitespace + unsigned long offset; + _BitScanForward(&offset, r); + return p + offset; +#else + return p + __builtin_ffs(r) - 1; +#endif + } + } +} + +inline const char *SkipWhitespace_SIMD(const char* p, const char* end) { + // Fast return for single non-whitespace + if (p != end && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t')) + ++p; + else + return p; + + // The rest of string + #define C16(c) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c } + static const char whitespaces[4][16] 
= { C16(' '), C16('\n'), C16('\r'), C16('\t') }; + #undef C16 + + const __m128i w0 = _mm_loadu_si128(reinterpret_cast(&whitespaces[0][0])); + const __m128i w1 = _mm_loadu_si128(reinterpret_cast(&whitespaces[1][0])); + const __m128i w2 = _mm_loadu_si128(reinterpret_cast(&whitespaces[2][0])); + const __m128i w3 = _mm_loadu_si128(reinterpret_cast(&whitespaces[3][0])); + + for (; p <= end - 16; p += 16) { + const __m128i s = _mm_loadu_si128(reinterpret_cast(p)); + __m128i x = _mm_cmpeq_epi8(s, w0); + x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w1)); + x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w2)); + x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w3)); + unsigned short r = static_cast(~_mm_movemask_epi8(x)); + if (r != 0) { // some of characters may be non-whitespace +#ifdef _MSC_VER // Find the index of first non-whitespace + unsigned long offset; + _BitScanForward(&offset, r); + return p + offset; +#else + return p + __builtin_ffs(r) - 1; +#endif + } + } + + return SkipWhitespace(p, end); +} + +#elif defined(RAPIDJSON_NEON) + +//! Skip whitespace with ARM Neon instructions, testing 16 8-byte characters at once. +inline const char *SkipWhitespace_SIMD(const char* p) { + // Fast return for single non-whitespace + if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t') + ++p; + else + return p; + + // 16-byte align to the next boundary + const char* nextAligned = reinterpret_cast((reinterpret_cast(p) + 15) & static_cast(~15)); + while (p != nextAligned) + if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t') + ++p; + else + return p; + + const uint8x16_t w0 = vmovq_n_u8(' '); + const uint8x16_t w1 = vmovq_n_u8('\n'); + const uint8x16_t w2 = vmovq_n_u8('\r'); + const uint8x16_t w3 = vmovq_n_u8('\t'); + + for (;; p += 16) { + const uint8x16_t s = vld1q_u8(reinterpret_cast(p)); + uint8x16_t x = vceqq_u8(s, w0); + x = vorrq_u8(x, vceqq_u8(s, w1)); + x = vorrq_u8(x, vceqq_u8(s, w2)); + x = vorrq_u8(x, vceqq_u8(s, w3)); + + x = vmvnq_u8(x); // Negate + x = vrev64q_u8(x); // Rev in 64 + uint64_t low = vgetq_lane_u64(reinterpret_cast(x), 0); // extract + uint64_t high = vgetq_lane_u64(reinterpret_cast(x), 1); // extract + + if (low == 0) { + if (high != 0) { + int lz =__builtin_clzll(high);; + return p + 8 + (lz >> 3); + } + } else { + int lz = __builtin_clzll(low);; + return p + (lz >> 3); + } + } +} + +inline const char *SkipWhitespace_SIMD(const char* p, const char* end) { + // Fast return for single non-whitespace + if (p != end && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t')) + ++p; + else + return p; + + const uint8x16_t w0 = vmovq_n_u8(' '); + const uint8x16_t w1 = vmovq_n_u8('\n'); + const uint8x16_t w2 = vmovq_n_u8('\r'); + const uint8x16_t w3 = vmovq_n_u8('\t'); + + for (; p <= end - 16; p += 16) { + const uint8x16_t s = vld1q_u8(reinterpret_cast(p)); + uint8x16_t x = vceqq_u8(s, w0); + x = vorrq_u8(x, vceqq_u8(s, w1)); + x = vorrq_u8(x, vceqq_u8(s, w2)); + x = vorrq_u8(x, vceqq_u8(s, w3)); + + x = vmvnq_u8(x); // Negate + x = vrev64q_u8(x); // Rev in 64 + uint64_t low = vgetq_lane_u64(reinterpret_cast(x), 0); // extract + uint64_t high = vgetq_lane_u64(reinterpret_cast(x), 1); // extract + + if (low == 0) { + if (high != 0) { + int lz = __builtin_clzll(high); + return p + 8 + (lz >> 3); + } + } else { + int lz = __builtin_clzll(low); + return p + (lz >> 3); + } + } + + return SkipWhitespace(p, end); +} + +#endif // RAPIDJSON_NEON + +#ifdef RAPIDJSON_SIMD +//! 
 Template function specialization for InsituStringStream
+template<> inline void SkipWhitespace(InsituStringStream& is) {
+    is.src_ = const_cast<char*>(SkipWhitespace_SIMD(is.src_));
+}
+
+//! Template function specialization for StringStream
+template<> inline void SkipWhitespace(StringStream& is) {
+    is.src_ = SkipWhitespace_SIMD(is.src_);
+}
+
+template<> inline void SkipWhitespace(EncodedInputStream<UTF8<>, MemoryStream>& is) {
+    is.is_.src_ = SkipWhitespace_SIMD(is.is_.src_, is.is_.end_);
+}
+#endif // RAPIDJSON_SIMD
+
+///////////////////////////////////////////////////////////////////////////////
+// GenericReader
+
+//! SAX-style JSON parser. Use \ref Reader for UTF8 encoding and default allocator.
+/*! GenericReader parses JSON text from a stream, and sends events synchronously to an
+    object implementing the Handler concept.
+
+    It needs to allocate a stack for storing a single decoded string during
+    non-destructive parsing.
+
+    For in-situ parsing, the decoded string is directly written to the source
+    text string; no temporary buffer is required.
+
+    A GenericReader object can be reused for parsing multiple JSON texts.
+
+    \tparam SourceEncoding Encoding of the input stream.
+    \tparam TargetEncoding Encoding of the parse output.
+    \tparam StackAllocator Allocator type for stack.
+*/
+template <typename SourceEncoding, typename TargetEncoding, typename StackAllocator = CrtAllocator>
+class GenericReader {
+public:
+    typedef typename SourceEncoding::Ch Ch; //!< SourceEncoding character type
+
+    //! Constructor.
+    /*! \param stackAllocator Optional allocator for allocating stack memory. (Only use for non-destructive parsing)
+        \param stackCapacity stack capacity in bytes for storing a single decoded string. (Only use for non-destructive parsing)
+    */
+    GenericReader(StackAllocator* stackAllocator = 0, size_t stackCapacity = kDefaultStackCapacity) :
+        stack_(stackAllocator, stackCapacity), parseResult_(), state_(IterativeParsingStartState) {}
+
+    //! Parse JSON text.
+    /*! \tparam parseFlags Combination of \ref ParseFlag.
+        \tparam InputStream Type of input stream, implementing Stream concept.
+        \tparam Handler Type of handler, implementing Handler concept.
+        \param is Input stream to be parsed.
+        \param handler The handler to receive events.
+        \return Whether the parsing is successful.
+    */
+    template <unsigned parseFlags, typename InputStream, typename Handler>
+    ParseResult Parse(InputStream& is, Handler& handler) {
+        if (parseFlags & kParseIterativeFlag)
+            return IterativeParse<parseFlags>(is, handler);
+
+        parseResult_.Clear();
+
+        ClearStackOnExit scope(*this);
+
+        SkipWhitespaceAndComments<parseFlags>(is);
+        RAPIDJSON_PARSE_ERROR_EARLY_RETURN(parseResult_);
+
+        if (RAPIDJSON_UNLIKELY(is.Peek() == '\0')) {
+            RAPIDJSON_PARSE_ERROR_NORETURN(kParseErrorDocumentEmpty, is.Tell());
+            RAPIDJSON_PARSE_ERROR_EARLY_RETURN(parseResult_);
+        }
+        else {
+            ParseValue<parseFlags>(is, handler);
+            RAPIDJSON_PARSE_ERROR_EARLY_RETURN(parseResult_);
+
+            if (!(parseFlags & kParseStopWhenDoneFlag)) {
+                SkipWhitespaceAndComments<parseFlags>(is);
+                RAPIDJSON_PARSE_ERROR_EARLY_RETURN(parseResult_);
+
+                if (RAPIDJSON_UNLIKELY(is.Peek() != '\0')) {
+                    RAPIDJSON_PARSE_ERROR_NORETURN(kParseErrorDocumentRootNotSingular, is.Tell());
+                    RAPIDJSON_PARSE_ERROR_EARLY_RETURN(parseResult_);
+                }
+            }
+        }
+
+        return parseResult_;
+    }
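+
+    /*! An illustrative sketch, not part of the original header, of calling the
+        flagged overload above; the input text is made up:
+    \code
+    GenericReader<UTF8<>, UTF8<> > reader;
+    StringStream ss("[1, 2, 3, ] // tail comment");
+    BaseReaderHandler<> handler;   // accepts every event
+    ParseResult ok =
+        reader.Parse<kParseTrailingCommasFlag | kParseCommentsFlag>(ss, handler);
+    // ok succeeds here; without the two flags the same input would fail with
+    // kParseErrorArrayMissCommaOrSquareBracket.
+    \endcode
+    */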
+
+    //! Parse JSON text (with \ref kParseDefaultFlags)
+    /*! \tparam InputStream Type of input stream, implementing Stream concept
+        \tparam Handler Type of handler, implementing Handler concept.
+        \param is Input stream to be parsed.
+        \param handler The handler to receive events.
+        \return Whether the parsing is successful.
+    */
+    template <typename InputStream, typename Handler>
+    ParseResult Parse(InputStream& is, Handler& handler) {
+        return Parse<kParseDefaultFlags>(is, handler);
+    }
+
+    //! Initialize JSON text token-by-token parsing
+    /*!
+     */
+    void IterativeParseInit() {
+        parseResult_.Clear();
+        state_ = IterativeParsingStartState;
+    }
+
+    //! Parse one token from JSON text
+    /*! \tparam InputStream Type of input stream, implementing Stream concept
+        \tparam Handler Type of handler, implementing Handler concept.
+        \param is Input stream to be parsed.
+        \param handler The handler to receive events.
+        \return Whether the parsing is successful.
+    */
+    template <unsigned parseFlags, typename InputStream, typename Handler>
+    bool IterativeParseNext(InputStream& is, Handler& handler) {
+        while (RAPIDJSON_LIKELY(is.Peek() != '\0')) {
+            SkipWhitespaceAndComments<parseFlags>(is);
+
+            Token t = Tokenize(is.Peek());
+            IterativeParsingState n = Predict(state_, t);
+            IterativeParsingState d = Transit<parseFlags>(state_, t, n, is, handler);
+
+            // If we've finished or hit an error...
+            if (RAPIDJSON_UNLIKELY(IsIterativeParsingCompleteState(d))) {
+                // Report errors.
+                if (d == IterativeParsingErrorState) {
+                    HandleError(state_, is);
+                    return false;
+                }
+
+                // Transition to the finish state.
+                RAPIDJSON_ASSERT(d == IterativeParsingFinishState);
+                state_ = d;
+
+                // If StopWhenDone is not set...
+                if (!(parseFlags & kParseStopWhenDoneFlag)) {
+                    // ... and extra non-whitespace data is found...
+                    SkipWhitespaceAndComments<parseFlags>(is);
+                    if (is.Peek() != '\0') {
+                        // ... this is considered an error.
+                        HandleError(state_, is);
+                        return false;
+                    }
+                }
+
+                // Success! We are done!
+                return true;
+            }
+
+            // Transition to the new state.
+            state_ = d;
+
+            // If we parsed anything other than a delimiter, we invoked the handler, so we can return true now.
+            if (!IsIterativeParsingDelimiterState(n))
+                return true;
+        }
+
+        // We reached the end of file.
+        stack_.Clear();
+
+        if (state_ != IterativeParsingFinishState) {
+            HandleError(state_, is);
+            return false;
+        }
+
+        return true;
+    }
+
+    //! Check if token-by-token parsing JSON text is complete
+    /*! \return Whether the JSON has been fully decoded.
+     */
+    RAPIDJSON_FORCEINLINE bool IterativeParseComplete() const {
+        return IsIterativeParsingCompleteState(state_);
+    }
+
+    //! Whether a parse error has occurred in the last parsing.
+    bool HasParseError() const { return parseResult_.IsError(); }
+
+    //! Get the \ref ParseErrorCode of last parsing.
+    ParseErrorCode GetParseErrorCode() const { return parseResult_.Code(); }
+
+    //! Get the position of last parsing error in input, 0 otherwise.
+    size_t GetErrorOffset() const { return parseResult_.Offset(); }
+
+protected:
+    void SetParseError(ParseErrorCode code, size_t offset) { parseResult_.Set(code, offset); }
+
+private:
+    // Prohibit copy constructor & assignment operator.
+    GenericReader(const GenericReader&);
+    GenericReader& operator=(const GenericReader&);
+
+    void ClearStack() { stack_.Clear(); }
+
+    // clear stack on any exit from ParseStream, e.g.
due to exception + struct ClearStackOnExit { + explicit ClearStackOnExit(GenericReader& r) : r_(r) {} + ~ClearStackOnExit() { r_.ClearStack(); } + private: + GenericReader& r_; + ClearStackOnExit(const ClearStackOnExit&); + ClearStackOnExit& operator=(const ClearStackOnExit&); + }; + + template + void SkipWhitespaceAndComments(InputStream& is) { + SkipWhitespace(is); + + if (parseFlags & kParseCommentsFlag) { + while (RAPIDJSON_UNLIKELY(Consume(is, '/'))) { + if (Consume(is, '*')) { + while (true) { + if (RAPIDJSON_UNLIKELY(is.Peek() == '\0')) + RAPIDJSON_PARSE_ERROR(kParseErrorUnspecificSyntaxError, is.Tell()); + else if (Consume(is, '*')) { + if (Consume(is, '/')) + break; + } + else + is.Take(); + } + } + else if (RAPIDJSON_LIKELY(Consume(is, '/'))) + while (is.Peek() != '\0' && is.Take() != '\n') {} + else + RAPIDJSON_PARSE_ERROR(kParseErrorUnspecificSyntaxError, is.Tell()); + + SkipWhitespace(is); + } + } + } + + // Parse object: { string : value, ... } + template + void ParseObject(InputStream& is, Handler& handler) { + RAPIDJSON_ASSERT(is.Peek() == '{'); + is.Take(); // Skip '{' + + if (RAPIDJSON_UNLIKELY(!handler.StartObject())) + RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell()); + + SkipWhitespaceAndComments(is); + RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; + + if (Consume(is, '}')) { + if (RAPIDJSON_UNLIKELY(!handler.EndObject(0))) // empty object + RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell()); + return; + } + + for (SizeType memberCount = 0;;) { + if (RAPIDJSON_UNLIKELY(is.Peek() != '"')) + RAPIDJSON_PARSE_ERROR(kParseErrorObjectMissName, is.Tell()); + + ParseString(is, handler, true); + RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; + + SkipWhitespaceAndComments(is); + RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; + + if (RAPIDJSON_UNLIKELY(!Consume(is, ':'))) + RAPIDJSON_PARSE_ERROR(kParseErrorObjectMissColon, is.Tell()); + + SkipWhitespaceAndComments(is); + RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; + + ParseValue(is, handler); + RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; + + SkipWhitespaceAndComments(is); + RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; + + ++memberCount; + + switch (is.Peek()) { + case ',': + is.Take(); + SkipWhitespaceAndComments(is); + RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; + break; + case '}': + is.Take(); + if (RAPIDJSON_UNLIKELY(!handler.EndObject(memberCount))) + RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell()); + return; + default: + RAPIDJSON_PARSE_ERROR(kParseErrorObjectMissCommaOrCurlyBracket, is.Tell()); break; // This useless break is only for making warning and coverage happy + } + + if (parseFlags & kParseTrailingCommasFlag) { + if (is.Peek() == '}') { + if (RAPIDJSON_UNLIKELY(!handler.EndObject(memberCount))) + RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell()); + is.Take(); + return; + } + } + } + } + + // Parse array: [ value, ... 
] + template + void ParseArray(InputStream& is, Handler& handler) { + RAPIDJSON_ASSERT(is.Peek() == '['); + is.Take(); // Skip '[' + + if (RAPIDJSON_UNLIKELY(!handler.StartArray())) + RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell()); + + SkipWhitespaceAndComments(is); + RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; + + if (Consume(is, ']')) { + if (RAPIDJSON_UNLIKELY(!handler.EndArray(0))) // empty array + RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell()); + return; + } + + for (SizeType elementCount = 0;;) { + ParseValue(is, handler); + RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; + + ++elementCount; + SkipWhitespaceAndComments(is); + RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; + + if (Consume(is, ',')) { + SkipWhitespaceAndComments(is); + RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; + } + else if (Consume(is, ']')) { + if (RAPIDJSON_UNLIKELY(!handler.EndArray(elementCount))) + RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell()); + return; + } + else + RAPIDJSON_PARSE_ERROR(kParseErrorArrayMissCommaOrSquareBracket, is.Tell()); + + if (parseFlags & kParseTrailingCommasFlag) { + if (is.Peek() == ']') { + if (RAPIDJSON_UNLIKELY(!handler.EndArray(elementCount))) + RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell()); + is.Take(); + return; + } + } + } + } + + template + void ParseNull(InputStream& is, Handler& handler) { + RAPIDJSON_ASSERT(is.Peek() == 'n'); + is.Take(); + + if (RAPIDJSON_LIKELY(Consume(is, 'u') && Consume(is, 'l') && Consume(is, 'l'))) { + if (RAPIDJSON_UNLIKELY(!handler.Null())) + RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell()); + } + else + RAPIDJSON_PARSE_ERROR(kParseErrorValueInvalid, is.Tell()); + } + + template + void ParseTrue(InputStream& is, Handler& handler) { + RAPIDJSON_ASSERT(is.Peek() == 't'); + is.Take(); + + if (RAPIDJSON_LIKELY(Consume(is, 'r') && Consume(is, 'u') && Consume(is, 'e'))) { + if (RAPIDJSON_UNLIKELY(!handler.Bool(true))) + RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell()); + } + else + RAPIDJSON_PARSE_ERROR(kParseErrorValueInvalid, is.Tell()); + } + + template + void ParseFalse(InputStream& is, Handler& handler) { + RAPIDJSON_ASSERT(is.Peek() == 'f'); + is.Take(); + + if (RAPIDJSON_LIKELY(Consume(is, 'a') && Consume(is, 'l') && Consume(is, 's') && Consume(is, 'e'))) { + if (RAPIDJSON_UNLIKELY(!handler.Bool(false))) + RAPIDJSON_PARSE_ERROR(kParseErrorTermination, is.Tell()); + } + else + RAPIDJSON_PARSE_ERROR(kParseErrorValueInvalid, is.Tell()); + } + + template + RAPIDJSON_FORCEINLINE static bool Consume(InputStream& is, typename InputStream::Ch expect) { + if (RAPIDJSON_LIKELY(is.Peek() == expect)) { + is.Take(); + return true; + } + else + return false; + } + + // Helper function to parse four hexadecimal digits in \uXXXX in ParseString(). 
+ template + unsigned ParseHex4(InputStream& is, size_t escapeOffset) { + unsigned codepoint = 0; + for (int i = 0; i < 4; i++) { + Ch c = is.Peek(); + codepoint <<= 4; + codepoint += static_cast(c); + if (c >= '0' && c <= '9') + codepoint -= '0'; + else if (c >= 'A' && c <= 'F') + codepoint -= 'A' - 10; + else if (c >= 'a' && c <= 'f') + codepoint -= 'a' - 10; + else { + RAPIDJSON_PARSE_ERROR_NORETURN(kParseErrorStringUnicodeEscapeInvalidHex, escapeOffset); + RAPIDJSON_PARSE_ERROR_EARLY_RETURN(0); + } + is.Take(); + } + return codepoint; + } + + template + class StackStream { + public: + typedef CharType Ch; + + StackStream(internal::Stack& stack) : stack_(stack), length_(0) {} + RAPIDJSON_FORCEINLINE void Put(Ch c) { + *stack_.template Push() = c; + ++length_; + } + + RAPIDJSON_FORCEINLINE void* Push(SizeType count) { + length_ += count; + return stack_.template Push(count); + } + + size_t Length() const { return length_; } + + Ch* Pop() { + return stack_.template Pop(length_); + } + + private: + StackStream(const StackStream&); + StackStream& operator=(const StackStream&); + + internal::Stack& stack_; + SizeType length_; + }; + + // Parse string and generate String event. Different code paths for kParseInsituFlag. + template + void ParseString(InputStream& is, Handler& handler, bool isKey = false) { + internal::StreamLocalCopy copy(is); + InputStream& s(copy.s); + + RAPIDJSON_ASSERT(s.Peek() == '\"'); + s.Take(); // Skip '\"' + + bool success = false; + if (parseFlags & kParseInsituFlag) { + typename InputStream::Ch *head = s.PutBegin(); + ParseStringToStream(s, s); + RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; + size_t length = s.PutEnd(head) - 1; + RAPIDJSON_ASSERT(length <= 0xFFFFFFFF); + const typename TargetEncoding::Ch* const str = reinterpret_cast(head); + success = (isKey ? handler.Key(str, SizeType(length), false) : handler.String(str, SizeType(length), false)); + } + else { + StackStream stackStream(stack_); + ParseStringToStream(s, stackStream); + RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; + SizeType length = static_cast(stackStream.Length()) - 1; + const typename TargetEncoding::Ch* const str = stackStream.Pop(); + success = (isKey ? handler.Key(str, length, true) : handler.String(str, length, true)); + } + if (RAPIDJSON_UNLIKELY(!success)) + RAPIDJSON_PARSE_ERROR(kParseErrorTermination, s.Tell()); + } + + // Parse string to an output is + // This function handles the prefix/suffix double quotes, escaping, and optional encoding validation. + template + RAPIDJSON_FORCEINLINE void ParseStringToStream(InputStream& is, OutputStream& os) { +//!@cond RAPIDJSON_HIDDEN_FROM_DOXYGEN +#define Z16 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + static const char escape[256] = { + Z16, Z16, 0, 0,'\"', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,'/', + Z16, Z16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,'\\', 0, 0, 0, + 0, 0,'\b', 0, 0, 0,'\f', 0, 0, 0, 0, 0, 0, 0,'\n', 0, + 0, 0,'\r', 0,'\t', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + Z16, Z16, Z16, Z16, Z16, Z16, Z16, Z16 + }; +#undef Z16 +//!@endcond + + for (;;) { + // Scan and copy string before "\\\"" or < 0x20. This is an optional optimzation. 
+ if (!(parseFlags & kParseValidateEncodingFlag)) + ScanCopyUnescapedString(is, os); + + Ch c = is.Peek(); + if (RAPIDJSON_UNLIKELY(c == '\\')) { // Escape + size_t escapeOffset = is.Tell(); // For invalid escaping, report the initial '\\' as error offset + is.Take(); + Ch e = is.Peek(); + if ((sizeof(Ch) == 1 || unsigned(e) < 256) && RAPIDJSON_LIKELY(escape[static_cast(e)])) { + is.Take(); + os.Put(static_cast(escape[static_cast(e)])); + } + else if (RAPIDJSON_LIKELY(e == 'u')) { // Unicode + is.Take(); + unsigned codepoint = ParseHex4(is, escapeOffset); + RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; + if (RAPIDJSON_UNLIKELY(codepoint >= 0xD800 && codepoint <= 0xDBFF)) { + // Handle UTF-16 surrogate pair + if (RAPIDJSON_UNLIKELY(!Consume(is, '\\') || !Consume(is, 'u'))) + RAPIDJSON_PARSE_ERROR(kParseErrorStringUnicodeSurrogateInvalid, escapeOffset); + unsigned codepoint2 = ParseHex4(is, escapeOffset); + RAPIDJSON_PARSE_ERROR_EARLY_RETURN_VOID; + if (RAPIDJSON_UNLIKELY(codepoint2 < 0xDC00 || codepoint2 > 0xDFFF)) + RAPIDJSON_PARSE_ERROR(kParseErrorStringUnicodeSurrogateInvalid, escapeOffset); + codepoint = (((codepoint - 0xD800) << 10) | (codepoint2 - 0xDC00)) + 0x10000; + } + TEncoding::Encode(os, codepoint); + } + else + RAPIDJSON_PARSE_ERROR(kParseErrorStringEscapeInvalid, escapeOffset); + } + else if (RAPIDJSON_UNLIKELY(c == '"')) { // Closing double quote + is.Take(); + os.Put('\0'); // null-terminate the string + return; + } + else if (RAPIDJSON_UNLIKELY(static_cast(c) < 0x20)) { // RFC 4627: unescaped = %x20-21 / %x23-5B / %x5D-10FFFF + if (c == '\0') + RAPIDJSON_PARSE_ERROR(kParseErrorStringMissQuotationMark, is.Tell()); + else + RAPIDJSON_PARSE_ERROR(kParseErrorStringInvalidEncoding, is.Tell()); + } + else { + size_t offset = is.Tell(); + if (RAPIDJSON_UNLIKELY((parseFlags & kParseValidateEncodingFlag ? 
+ !Transcoder::Validate(is, os) : + !Transcoder::Transcode(is, os)))) + RAPIDJSON_PARSE_ERROR(kParseErrorStringInvalidEncoding, offset); + } + } + } + + template + static RAPIDJSON_FORCEINLINE void ScanCopyUnescapedString(InputStream&, OutputStream&) { + // Do nothing for generic version + } + +#if defined(RAPIDJSON_SSE2) || defined(RAPIDJSON_SSE42) + // StringStream -> StackStream + static RAPIDJSON_FORCEINLINE void ScanCopyUnescapedString(StringStream& is, StackStream& os) { + const char* p = is.src_; + + // Scan one by one until alignment (unaligned load may cross page boundary and cause crash) + const char* nextAligned = reinterpret_cast((reinterpret_cast(p) + 15) & static_cast(~15)); + while (p != nextAligned) + if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast(*p) < 0x20)) { + is.src_ = p; + return; + } + else + os.Put(*p++); + + // The rest of string using SIMD + static const char dquote[16] = { '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"' }; + static const char bslash[16] = { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' }; + static const char space[16] = { 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F }; + const __m128i dq = _mm_loadu_si128(reinterpret_cast(&dquote[0])); + const __m128i bs = _mm_loadu_si128(reinterpret_cast(&bslash[0])); + const __m128i sp = _mm_loadu_si128(reinterpret_cast(&space[0])); + + for (;; p += 16) { + const __m128i s = _mm_load_si128(reinterpret_cast(p)); + const __m128i t1 = _mm_cmpeq_epi8(s, dq); + const __m128i t2 = _mm_cmpeq_epi8(s, bs); + const __m128i t3 = _mm_cmpeq_epi8(_mm_max_epu8(s, sp), sp); // s < 0x20 <=> max(s, 0x1F) == 0x1F + const __m128i x = _mm_or_si128(_mm_or_si128(t1, t2), t3); + unsigned short r = static_cast(_mm_movemask_epi8(x)); + if (RAPIDJSON_UNLIKELY(r != 0)) { // some of characters is escaped + SizeType length; + #ifdef _MSC_VER // Find the index of first escaped + unsigned long offset; + _BitScanForward(&offset, r); + length = offset; + #else + length = static_cast(__builtin_ffs(r) - 1); + #endif + if (length != 0) { + char* q = reinterpret_cast(os.Push(length)); + for (size_t i = 0; i < length; i++) + q[i] = p[i]; + + p += length; + } + break; + } + _mm_storeu_si128(reinterpret_cast<__m128i *>(os.Push(16)), s); + } + + is.src_ = p; + } + + // InsituStringStream -> InsituStringStream + static RAPIDJSON_FORCEINLINE void ScanCopyUnescapedString(InsituStringStream& is, InsituStringStream& os) { + RAPIDJSON_ASSERT(&is == &os); + (void)os; + + if (is.src_ == is.dst_) { + SkipUnescapedString(is); + return; + } + + char* p = is.src_; + char *q = is.dst_; + + // Scan one by one until alignment (unaligned load may cross page boundary and cause crash) + const char* nextAligned = reinterpret_cast((reinterpret_cast(p) + 15) & static_cast(~15)); + while (p != nextAligned) + if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast(*p) < 0x20)) { + is.src_ = p; + is.dst_ = q; + return; + } + else + *q++ = *p++; + + // The rest of string using SIMD + static const char dquote[16] = { '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"' }; + static const char bslash[16] = { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' }; + static const char space[16] = { 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 
0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F }; + const __m128i dq = _mm_loadu_si128(reinterpret_cast(&dquote[0])); + const __m128i bs = _mm_loadu_si128(reinterpret_cast(&bslash[0])); + const __m128i sp = _mm_loadu_si128(reinterpret_cast(&space[0])); + + for (;; p += 16, q += 16) { + const __m128i s = _mm_load_si128(reinterpret_cast(p)); + const __m128i t1 = _mm_cmpeq_epi8(s, dq); + const __m128i t2 = _mm_cmpeq_epi8(s, bs); + const __m128i t3 = _mm_cmpeq_epi8(_mm_max_epu8(s, sp), sp); // s < 0x20 <=> max(s, 0x1F) == 0x1F + const __m128i x = _mm_or_si128(_mm_or_si128(t1, t2), t3); + unsigned short r = static_cast(_mm_movemask_epi8(x)); + if (RAPIDJSON_UNLIKELY(r != 0)) { // some of characters is escaped + size_t length; +#ifdef _MSC_VER // Find the index of first escaped + unsigned long offset; + _BitScanForward(&offset, r); + length = offset; +#else + length = static_cast(__builtin_ffs(r) - 1); +#endif + for (const char* pend = p + length; p != pend; ) + *q++ = *p++; + break; + } + _mm_storeu_si128(reinterpret_cast<__m128i *>(q), s); + } + + is.src_ = p; + is.dst_ = q; + } + + // When read/write pointers are the same for insitu stream, just skip unescaped characters + static RAPIDJSON_FORCEINLINE void SkipUnescapedString(InsituStringStream& is) { + RAPIDJSON_ASSERT(is.src_ == is.dst_); + char* p = is.src_; + + // Scan one by one until alignment (unaligned load may cross page boundary and cause crash) + const char* nextAligned = reinterpret_cast((reinterpret_cast(p) + 15) & static_cast(~15)); + for (; p != nextAligned; p++) + if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast(*p) < 0x20)) { + is.src_ = is.dst_ = p; + return; + } + + // The rest of string using SIMD + static const char dquote[16] = { '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"' }; + static const char bslash[16] = { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' }; + static const char space[16] = { 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F }; + const __m128i dq = _mm_loadu_si128(reinterpret_cast(&dquote[0])); + const __m128i bs = _mm_loadu_si128(reinterpret_cast(&bslash[0])); + const __m128i sp = _mm_loadu_si128(reinterpret_cast(&space[0])); + + for (;; p += 16) { + const __m128i s = _mm_load_si128(reinterpret_cast(p)); + const __m128i t1 = _mm_cmpeq_epi8(s, dq); + const __m128i t2 = _mm_cmpeq_epi8(s, bs); + const __m128i t3 = _mm_cmpeq_epi8(_mm_max_epu8(s, sp), sp); // s < 0x20 <=> max(s, 0x1F) == 0x1F + const __m128i x = _mm_or_si128(_mm_or_si128(t1, t2), t3); + unsigned short r = static_cast(_mm_movemask_epi8(x)); + if (RAPIDJSON_UNLIKELY(r != 0)) { // some of characters is escaped + size_t length; +#ifdef _MSC_VER // Find the index of first escaped + unsigned long offset; + _BitScanForward(&offset, r); + length = offset; +#else + length = static_cast(__builtin_ffs(r) - 1); +#endif + p += length; + break; + } + } + + is.src_ = is.dst_ = p; + } +#elif defined(RAPIDJSON_NEON) + // StringStream -> StackStream + static RAPIDJSON_FORCEINLINE void ScanCopyUnescapedString(StringStream& is, StackStream& os) { + const char* p = is.src_; + + // Scan one by one until alignment (unaligned load may cross page boundary and cause crash) + const char* nextAligned = reinterpret_cast((reinterpret_cast(p) + 15) & static_cast(~15)); + while (p != nextAligned) + if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || 
RAPIDJSON_UNLIKELY(static_cast(*p) < 0x20)) { + is.src_ = p; + return; + } + else + os.Put(*p++); + + // The rest of string using SIMD + const uint8x16_t s0 = vmovq_n_u8('"'); + const uint8x16_t s1 = vmovq_n_u8('\\'); + const uint8x16_t s2 = vmovq_n_u8('\b'); + const uint8x16_t s3 = vmovq_n_u8(32); + + for (;; p += 16) { + const uint8x16_t s = vld1q_u8(reinterpret_cast(p)); + uint8x16_t x = vceqq_u8(s, s0); + x = vorrq_u8(x, vceqq_u8(s, s1)); + x = vorrq_u8(x, vceqq_u8(s, s2)); + x = vorrq_u8(x, vcltq_u8(s, s3)); + + x = vrev64q_u8(x); // Rev in 64 + uint64_t low = vgetq_lane_u64(reinterpret_cast(x), 0); // extract + uint64_t high = vgetq_lane_u64(reinterpret_cast(x), 1); // extract + + SizeType length = 0; + bool escaped = false; + if (low == 0) { + if (high != 0) { + unsigned lz = (unsigned)__builtin_clzll(high);; + length = 8 + (lz >> 3); + escaped = true; + } + } else { + unsigned lz = (unsigned)__builtin_clzll(low);; + length = lz >> 3; + escaped = true; + } + if (RAPIDJSON_UNLIKELY(escaped)) { // some of characters is escaped + if (length != 0) { + char* q = reinterpret_cast(os.Push(length)); + for (size_t i = 0; i < length; i++) + q[i] = p[i]; + + p += length; + } + break; + } + vst1q_u8(reinterpret_cast(os.Push(16)), s); + } + + is.src_ = p; + } + + // InsituStringStream -> InsituStringStream + static RAPIDJSON_FORCEINLINE void ScanCopyUnescapedString(InsituStringStream& is, InsituStringStream& os) { + RAPIDJSON_ASSERT(&is == &os); + (void)os; + + if (is.src_ == is.dst_) { + SkipUnescapedString(is); + return; + } + + char* p = is.src_; + char *q = is.dst_; + + // Scan one by one until alignment (unaligned load may cross page boundary and cause crash) + const char* nextAligned = reinterpret_cast((reinterpret_cast(p) + 15) & static_cast(~15)); + while (p != nextAligned) + if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast(*p) < 0x20)) { + is.src_ = p; + is.dst_ = q; + return; + } + else + *q++ = *p++; + + // The rest of string using SIMD + const uint8x16_t s0 = vmovq_n_u8('"'); + const uint8x16_t s1 = vmovq_n_u8('\\'); + const uint8x16_t s2 = vmovq_n_u8('\b'); + const uint8x16_t s3 = vmovq_n_u8(32); + + for (;; p += 16, q += 16) { + const uint8x16_t s = vld1q_u8(reinterpret_cast(p)); + uint8x16_t x = vceqq_u8(s, s0); + x = vorrq_u8(x, vceqq_u8(s, s1)); + x = vorrq_u8(x, vceqq_u8(s, s2)); + x = vorrq_u8(x, vcltq_u8(s, s3)); + + x = vrev64q_u8(x); // Rev in 64 + uint64_t low = vgetq_lane_u64(reinterpret_cast(x), 0); // extract + uint64_t high = vgetq_lane_u64(reinterpret_cast(x), 1); // extract + + SizeType length = 0; + bool escaped = false; + if (low == 0) { + if (high != 0) { + unsigned lz = (unsigned)__builtin_clzll(high); + length = 8 + (lz >> 3); + escaped = true; + } + } else { + unsigned lz = (unsigned)__builtin_clzll(low); + length = lz >> 3; + escaped = true; + } + if (RAPIDJSON_UNLIKELY(escaped)) { // some of characters is escaped + for (const char* pend = p + length; p != pend; ) { + *q++ = *p++; + } + break; + } + vst1q_u8(reinterpret_cast(q), s); + } + + is.src_ = p; + is.dst_ = q; + } + + // When read/write pointers are the same for insitu stream, just skip unescaped characters + static RAPIDJSON_FORCEINLINE void SkipUnescapedString(InsituStringStream& is) { + RAPIDJSON_ASSERT(is.src_ == is.dst_); + char* p = is.src_; + + // Scan one by one until alignment (unaligned load may cross page boundary and cause crash) + const char* nextAligned = reinterpret_cast((reinterpret_cast(p) + 15) & static_cast(~15)); + for (; p != 
nextAligned; p++) + if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast(*p) < 0x20)) { + is.src_ = is.dst_ = p; + return; + } + + // The rest of string using SIMD + const uint8x16_t s0 = vmovq_n_u8('"'); + const uint8x16_t s1 = vmovq_n_u8('\\'); + const uint8x16_t s2 = vmovq_n_u8('\b'); + const uint8x16_t s3 = vmovq_n_u8(32); + + for (;; p += 16) { + const uint8x16_t s = vld1q_u8(reinterpret_cast(p)); + uint8x16_t x = vceqq_u8(s, s0); + x = vorrq_u8(x, vceqq_u8(s, s1)); + x = vorrq_u8(x, vceqq_u8(s, s2)); + x = vorrq_u8(x, vcltq_u8(s, s3)); + + x = vrev64q_u8(x); // Rev in 64 + uint64_t low = vgetq_lane_u64(reinterpret_cast(x), 0); // extract + uint64_t high = vgetq_lane_u64(reinterpret_cast(x), 1); // extract + + if (low == 0) { + if (high != 0) { + int lz = __builtin_clzll(high); + p += 8 + (lz >> 3); + break; + } + } else { + int lz = __builtin_clzll(low); + p += lz >> 3; + break; + } + } + + is.src_ = is.dst_ = p; + } +#endif // RAPIDJSON_NEON + + template + class NumberStream; + + template + class NumberStream { + public: + typedef typename InputStream::Ch Ch; + + NumberStream(GenericReader& reader, InputStream& s) : is(s) { (void)reader; } + + RAPIDJSON_FORCEINLINE Ch Peek() const { return is.Peek(); } + RAPIDJSON_FORCEINLINE Ch TakePush() { return is.Take(); } + RAPIDJSON_FORCEINLINE Ch Take() { return is.Take(); } + RAPIDJSON_FORCEINLINE void Push(char) {} + + size_t Tell() { return is.Tell(); } + size_t Length() { return 0; } + const char* Pop() { return 0; } + + protected: + NumberStream& operator=(const NumberStream&); + + InputStream& is; + }; + + template + class NumberStream : public NumberStream { + typedef NumberStream Base; + public: + NumberStream(GenericReader& reader, InputStream& is) : Base(reader, is), stackStream(reader.stack_) {} + + RAPIDJSON_FORCEINLINE Ch TakePush() { + stackStream.Put(static_cast(Base::is.Peek())); + return Base::is.Take(); + } + + RAPIDJSON_FORCEINLINE void Push(char c) { + stackStream.Put(c); + } + + size_t Length() { return stackStream.Length(); } + + const char* Pop() { + stackStream.Put('\0'); + return stackStream.Pop(); + } + + private: + StackStream stackStream; + }; + + template + class NumberStream : public NumberStream { + typedef NumberStream Base; + public: + NumberStream(GenericReader& reader, InputStream& is) : Base(reader, is) {} + + RAPIDJSON_FORCEINLINE Ch Take() { return Base::TakePush(); } + }; + + template + void ParseNumber(InputStream& is, Handler& handler) { + internal::StreamLocalCopy copy(is); + NumberStream s(*this, copy.s); + + size_t startOffset = s.Tell(); + double d = 0.0; + bool useNanOrInf = false; + + // Parse minus + bool minus = Consume(s, '-'); + + // Parse int: zero / ( digit1-9 *DIGIT ) + unsigned i = 0; + uint64_t i64 = 0; + bool use64bit = false; + int significandDigit = 0; + if (RAPIDJSON_UNLIKELY(s.Peek() == '0')) { + i = 0; + s.TakePush(); + } + else if (RAPIDJSON_LIKELY(s.Peek() >= '1' && s.Peek() <= '9')) { + i = static_cast(s.TakePush() - '0'); + + if (minus) + while (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) { + if (RAPIDJSON_UNLIKELY(i >= 214748364)) { // 2^31 = 2147483648 + if (RAPIDJSON_LIKELY(i != 214748364 || s.Peek() > '8')) { + i64 = i; + use64bit = true; + break; + } + } + i = i * 10 + static_cast(s.TakePush() - '0'); + significandDigit++; + } + else + while (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) { + if (RAPIDJSON_UNLIKELY(i >= 429496729)) { // 2^32 - 1 = 4294967295 + if (RAPIDJSON_LIKELY(i != 429496729 || 
s.Peek() > '5')) { + i64 = i; + use64bit = true; + break; + } + } + i = i * 10 + static_cast(s.TakePush() - '0'); + significandDigit++; + } + } + // Parse NaN or Infinity here + else if ((parseFlags & kParseNanAndInfFlag) && RAPIDJSON_LIKELY((s.Peek() == 'I' || s.Peek() == 'N'))) { + if (Consume(s, 'N')) { + if (Consume(s, 'a') && Consume(s, 'N')) { + d = std::numeric_limits::quiet_NaN(); + useNanOrInf = true; + } + } + else if (RAPIDJSON_LIKELY(Consume(s, 'I'))) { + if (Consume(s, 'n') && Consume(s, 'f')) { + d = (minus ? -std::numeric_limits::infinity() : std::numeric_limits::infinity()); + useNanOrInf = true; + + if (RAPIDJSON_UNLIKELY(s.Peek() == 'i' && !(Consume(s, 'i') && Consume(s, 'n') + && Consume(s, 'i') && Consume(s, 't') && Consume(s, 'y')))) { + RAPIDJSON_PARSE_ERROR(kParseErrorValueInvalid, s.Tell()); + } + } + } + + if (RAPIDJSON_UNLIKELY(!useNanOrInf)) { + RAPIDJSON_PARSE_ERROR(kParseErrorValueInvalid, s.Tell()); + } + } + else + RAPIDJSON_PARSE_ERROR(kParseErrorValueInvalid, s.Tell()); + + // Parse 64bit int + bool useDouble = false; + if (use64bit) { + if (minus) + while (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) { + if (RAPIDJSON_UNLIKELY(i64 >= RAPIDJSON_UINT64_C2(0x0CCCCCCC, 0xCCCCCCCC))) // 2^63 = 9223372036854775808 + if (RAPIDJSON_LIKELY(i64 != RAPIDJSON_UINT64_C2(0x0CCCCCCC, 0xCCCCCCCC) || s.Peek() > '8')) { + d = static_cast(i64); + useDouble = true; + break; + } + i64 = i64 * 10 + static_cast(s.TakePush() - '0'); + significandDigit++; + } + else + while (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) { + if (RAPIDJSON_UNLIKELY(i64 >= RAPIDJSON_UINT64_C2(0x19999999, 0x99999999))) // 2^64 - 1 = 18446744073709551615 + if (RAPIDJSON_LIKELY(i64 != RAPIDJSON_UINT64_C2(0x19999999, 0x99999999) || s.Peek() > '5')) { + d = static_cast(i64); + useDouble = true; + break; + } + i64 = i64 * 10 + static_cast(s.TakePush() - '0'); + significandDigit++; + } + } + + // Force double for big integer + if (useDouble) { + while (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) { + d = d * 10 + (s.TakePush() - '0'); + } + } + + // Parse frac = decimal-point 1*DIGIT + int expFrac = 0; + size_t decimalPosition; + if (Consume(s, '.')) { + decimalPosition = s.Length(); + + if (RAPIDJSON_UNLIKELY(!(s.Peek() >= '0' && s.Peek() <= '9'))) + RAPIDJSON_PARSE_ERROR(kParseErrorNumberMissFraction, s.Tell()); + + if (!useDouble) { +#if RAPIDJSON_64BIT + // Use i64 to store significand in 64-bit architecture + if (!use64bit) + i64 = i; + + while (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) { + if (i64 > RAPIDJSON_UINT64_C2(0x1FFFFF, 0xFFFFFFFF)) // 2^53 - 1 for fast path + break; + else { + i64 = i64 * 10 + static_cast(s.TakePush() - '0'); + --expFrac; + if (i64 != 0) + significandDigit++; + } + } + + d = static_cast(i64); +#else + // Use double to store significand in 32-bit architecture + d = static_cast(use64bit ? i64 : i); +#endif + useDouble = true; + } + + while (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) { + if (significandDigit < 17) { + d = d * 10.0 + (s.TakePush() - '0'); + --expFrac; + if (RAPIDJSON_LIKELY(d > 0.0)) + significandDigit++; + } + else + s.TakePush(); + } + } + else + decimalPosition = s.Length(); // decimal position at the end of integer. + + // Parse exp = e [ minus / plus ] 1*DIGIT + int exp = 0; + if (Consume(s, 'e') || Consume(s, 'E')) { + if (!useDouble) { + d = static_cast(use64bit ? 
i64 : i); + useDouble = true; + } + + bool expMinus = false; + if (Consume(s, '+')) + ; + else if (Consume(s, '-')) + expMinus = true; + + if (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) { + exp = static_cast(s.Take() - '0'); + if (expMinus) { + // (exp + expFrac) must not underflow int => we're detecting when -exp gets + // dangerously close to INT_MIN (a pessimistic next digit 9 would push it into + // underflow territory): + // + // -(exp * 10 + 9) + expFrac >= INT_MIN + // <=> exp <= (expFrac - INT_MIN - 9) / 10 + RAPIDJSON_ASSERT(expFrac <= 0); + int maxExp = (expFrac + 2147483639) / 10; + + while (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) { + exp = exp * 10 + static_cast(s.Take() - '0'); + if (RAPIDJSON_UNLIKELY(exp > maxExp)) { + while (RAPIDJSON_UNLIKELY(s.Peek() >= '0' && s.Peek() <= '9')) // Consume the rest of exponent + s.Take(); + } + } + } + else { // positive exp + int maxExp = 308 - expFrac; + while (RAPIDJSON_LIKELY(s.Peek() >= '0' && s.Peek() <= '9')) { + exp = exp * 10 + static_cast(s.Take() - '0'); + if (RAPIDJSON_UNLIKELY(exp > maxExp)) + RAPIDJSON_PARSE_ERROR(kParseErrorNumberTooBig, startOffset); + } + } + } + else + RAPIDJSON_PARSE_ERROR(kParseErrorNumberMissExponent, s.Tell()); + + if (expMinus) + exp = -exp; + } + + // Finish parsing, call event according to the type of number. + bool cont = true; + + if (parseFlags & kParseNumbersAsStringsFlag) { + if (parseFlags & kParseInsituFlag) { + s.Pop(); // Pop stack no matter if it will be used or not. + typename InputStream::Ch* head = is.PutBegin(); + const size_t length = s.Tell() - startOffset; + RAPIDJSON_ASSERT(length <= 0xFFFFFFFF); + // unable to insert the \0 character here, it will erase the comma after this number + const typename TargetEncoding::Ch* const str = reinterpret_cast(head); + cont = handler.RawNumber(str, SizeType(length), false); + } + else { + SizeType numCharsToCopy = static_cast(s.Length()); + StringStream srcStream(s.Pop()); + StackStream dstStream(stack_); + while (numCharsToCopy--) { + Transcoder, TargetEncoding>::Transcode(srcStream, dstStream); + } + dstStream.Put('\0'); + const typename TargetEncoding::Ch* str = dstStream.Pop(); + const SizeType length = static_cast(dstStream.Length()) - 1; + cont = handler.RawNumber(str, SizeType(length), true); + } + } + else { + size_t length = s.Length(); + const char* decimal = s.Pop(); // Pop stack no matter if it will be used or not. + + if (useDouble) { + int p = exp + expFrac; + if (parseFlags & kParseFullPrecisionFlag) + d = internal::StrtodFullPrecision(d, p, decimal, length, decimalPosition, exp); + else + d = internal::StrtodNormalPrecision(d, p); + + // Use > max, instead of == inf, to fix bogus warning -Wfloat-equal + if (d > (std::numeric_limits::max)()) { + // Overflow + // TODO: internal::StrtodX should report overflow (or underflow) + RAPIDJSON_PARSE_ERROR(kParseErrorNumberTooBig, startOffset); + } + + cont = handler.Double(minus ? 
-d : d); + } + else if (useNanOrInf) { + cont = handler.Double(d); + } + else { + if (use64bit) { + if (minus) + cont = handler.Int64(static_cast(~i64 + 1)); + else + cont = handler.Uint64(i64); + } + else { + if (minus) + cont = handler.Int(static_cast(~i + 1)); + else + cont = handler.Uint(i); + } + } + } + if (RAPIDJSON_UNLIKELY(!cont)) + RAPIDJSON_PARSE_ERROR(kParseErrorTermination, startOffset); + } + + // Parse any JSON value + template + void ParseValue(InputStream& is, Handler& handler) { + switch (is.Peek()) { + case 'n': ParseNull (is, handler); break; + case 't': ParseTrue (is, handler); break; + case 'f': ParseFalse (is, handler); break; + case '"': ParseString(is, handler); break; + case '{': ParseObject(is, handler); break; + case '[': ParseArray (is, handler); break; + default : + ParseNumber(is, handler); + break; + + } + } + + // Iterative Parsing + + // States + enum IterativeParsingState { + IterativeParsingFinishState = 0, // sink states at top + IterativeParsingErrorState, // sink states at top + IterativeParsingStartState, + + // Object states + IterativeParsingObjectInitialState, + IterativeParsingMemberKeyState, + IterativeParsingMemberValueState, + IterativeParsingObjectFinishState, + + // Array states + IterativeParsingArrayInitialState, + IterativeParsingElementState, + IterativeParsingArrayFinishState, + + // Single value state + IterativeParsingValueState, + + // Delimiter states (at bottom) + IterativeParsingElementDelimiterState, + IterativeParsingMemberDelimiterState, + IterativeParsingKeyValueDelimiterState, + + cIterativeParsingStateCount + }; + + // Tokens + enum Token { + LeftBracketToken = 0, + RightBracketToken, + + LeftCurlyBracketToken, + RightCurlyBracketToken, + + CommaToken, + ColonToken, + + StringToken, + FalseToken, + TrueToken, + NullToken, + NumberToken, + + kTokenCount + }; + + RAPIDJSON_FORCEINLINE Token Tokenize(Ch c) const { + +//!@cond RAPIDJSON_HIDDEN_FROM_DOXYGEN +#define N NumberToken +#define N16 N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N + // Maps from ASCII to Token + static const unsigned char tokenMap[256] = { + N16, // 00~0F + N16, // 10~1F + N, N, StringToken, N, N, N, N, N, N, N, N, N, CommaToken, N, N, N, // 20~2F + N, N, N, N, N, N, N, N, N, N, ColonToken, N, N, N, N, N, // 30~3F + N16, // 40~4F + N, N, N, N, N, N, N, N, N, N, N, LeftBracketToken, N, RightBracketToken, N, N, // 50~5F + N, N, N, N, N, N, FalseToken, N, N, N, N, N, N, N, NullToken, N, // 60~6F + N, N, N, N, TrueToken, N, N, N, N, N, N, LeftCurlyBracketToken, N, RightCurlyBracketToken, N, N, // 70~7F + N16, N16, N16, N16, N16, N16, N16, N16 // 80~FF + }; +#undef N +#undef N16 +//!@endcond + + if (sizeof(Ch) == 1 || static_cast(c) < 256) + return static_cast(tokenMap[static_cast(c)]); + else + return NumberToken; + } + + RAPIDJSON_FORCEINLINE IterativeParsingState Predict(IterativeParsingState state, Token token) const { + // current state x one lookahead token -> new state + static const char G[cIterativeParsingStateCount][kTokenCount] = { + // Finish(sink state) + { + IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, + IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, + IterativeParsingErrorState + }, + // Error(sink state) + { + IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, + 
IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, + IterativeParsingErrorState + }, + // Start + { + IterativeParsingArrayInitialState, // Left bracket + IterativeParsingErrorState, // Right bracket + IterativeParsingObjectInitialState, // Left curly bracket + IterativeParsingErrorState, // Right curly bracket + IterativeParsingErrorState, // Comma + IterativeParsingErrorState, // Colon + IterativeParsingValueState, // String + IterativeParsingValueState, // False + IterativeParsingValueState, // True + IterativeParsingValueState, // Null + IterativeParsingValueState // Number + }, + // ObjectInitial + { + IterativeParsingErrorState, // Left bracket + IterativeParsingErrorState, // Right bracket + IterativeParsingErrorState, // Left curly bracket + IterativeParsingObjectFinishState, // Right curly bracket + IterativeParsingErrorState, // Comma + IterativeParsingErrorState, // Colon + IterativeParsingMemberKeyState, // String + IterativeParsingErrorState, // False + IterativeParsingErrorState, // True + IterativeParsingErrorState, // Null + IterativeParsingErrorState // Number + }, + // MemberKey + { + IterativeParsingErrorState, // Left bracket + IterativeParsingErrorState, // Right bracket + IterativeParsingErrorState, // Left curly bracket + IterativeParsingErrorState, // Right curly bracket + IterativeParsingErrorState, // Comma + IterativeParsingKeyValueDelimiterState, // Colon + IterativeParsingErrorState, // String + IterativeParsingErrorState, // False + IterativeParsingErrorState, // True + IterativeParsingErrorState, // Null + IterativeParsingErrorState // Number + }, + // MemberValue + { + IterativeParsingErrorState, // Left bracket + IterativeParsingErrorState, // Right bracket + IterativeParsingErrorState, // Left curly bracket + IterativeParsingObjectFinishState, // Right curly bracket + IterativeParsingMemberDelimiterState, // Comma + IterativeParsingErrorState, // Colon + IterativeParsingErrorState, // String + IterativeParsingErrorState, // False + IterativeParsingErrorState, // True + IterativeParsingErrorState, // Null + IterativeParsingErrorState // Number + }, + // ObjectFinish(sink state) + { + IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, + IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, + IterativeParsingErrorState + }, + // ArrayInitial + { + IterativeParsingArrayInitialState, // Left bracket(push Element state) + IterativeParsingArrayFinishState, // Right bracket + IterativeParsingObjectInitialState, // Left curly bracket(push Element state) + IterativeParsingErrorState, // Right curly bracket + IterativeParsingErrorState, // Comma + IterativeParsingErrorState, // Colon + IterativeParsingElementState, // String + IterativeParsingElementState, // False + IterativeParsingElementState, // True + IterativeParsingElementState, // Null + IterativeParsingElementState // Number + }, + // Element + { + IterativeParsingErrorState, // Left bracket + IterativeParsingArrayFinishState, // Right bracket + IterativeParsingErrorState, // Left curly bracket + IterativeParsingErrorState, // Right curly bracket + IterativeParsingElementDelimiterState, // Comma + IterativeParsingErrorState, // Colon + IterativeParsingErrorState, // String + IterativeParsingErrorState, // False + IterativeParsingErrorState, // True 
+ IterativeParsingErrorState, // Null + IterativeParsingErrorState // Number + }, + // ArrayFinish(sink state) + { + IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, + IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, + IterativeParsingErrorState + }, + // Single Value (sink state) + { + IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, + IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, IterativeParsingErrorState, + IterativeParsingErrorState + }, + // ElementDelimiter + { + IterativeParsingArrayInitialState, // Left bracket(push Element state) + IterativeParsingArrayFinishState, // Right bracket + IterativeParsingObjectInitialState, // Left curly bracket(push Element state) + IterativeParsingErrorState, // Right curly bracket + IterativeParsingErrorState, // Comma + IterativeParsingErrorState, // Colon + IterativeParsingElementState, // String + IterativeParsingElementState, // False + IterativeParsingElementState, // True + IterativeParsingElementState, // Null + IterativeParsingElementState // Number + }, + // MemberDelimiter + { + IterativeParsingErrorState, // Left bracket + IterativeParsingErrorState, // Right bracket + IterativeParsingErrorState, // Left curly bracket + IterativeParsingObjectFinishState, // Right curly bracket + IterativeParsingErrorState, // Comma + IterativeParsingErrorState, // Colon + IterativeParsingMemberKeyState, // String + IterativeParsingErrorState, // False + IterativeParsingErrorState, // True + IterativeParsingErrorState, // Null + IterativeParsingErrorState // Number + }, + // KeyValueDelimiter + { + IterativeParsingArrayInitialState, // Left bracket(push MemberValue state) + IterativeParsingErrorState, // Right bracket + IterativeParsingObjectInitialState, // Left curly bracket(push MemberValue state) + IterativeParsingErrorState, // Right curly bracket + IterativeParsingErrorState, // Comma + IterativeParsingErrorState, // Colon + IterativeParsingMemberValueState, // String + IterativeParsingMemberValueState, // False + IterativeParsingMemberValueState, // True + IterativeParsingMemberValueState, // Null + IterativeParsingMemberValueState // Number + }, + }; // End of G + + return static_cast(G[state][token]); + } + + // Make an advance in the token stream and state based on the candidate destination state which was returned by Transit(). + // May return a new state on state pop. + template + RAPIDJSON_FORCEINLINE IterativeParsingState Transit(IterativeParsingState src, Token token, IterativeParsingState dst, InputStream& is, Handler& handler) { + (void)token; + + switch (dst) { + case IterativeParsingErrorState: + return dst; + + case IterativeParsingObjectInitialState: + case IterativeParsingArrayInitialState: + { + // Push the state(Element or MemeberValue) if we are nested in another array or value of member. + // In this way we can get the correct state on ObjectFinish or ArrayFinish by frame pop. + IterativeParsingState n = src; + if (src == IterativeParsingArrayInitialState || src == IterativeParsingElementDelimiterState) + n = IterativeParsingElementState; + else if (src == IterativeParsingKeyValueDelimiterState) + n = IterativeParsingMemberValueState; + // Push current state. 
+ *stack_.template Push(1) = n; + // Initialize and push the member/element count. + *stack_.template Push(1) = 0; + // Call handler + bool hr = (dst == IterativeParsingObjectInitialState) ? handler.StartObject() : handler.StartArray(); + // On handler short circuits the parsing. + if (!hr) { + RAPIDJSON_PARSE_ERROR_NORETURN(kParseErrorTermination, is.Tell()); + return IterativeParsingErrorState; + } + else { + is.Take(); + return dst; + } + } + + case IterativeParsingMemberKeyState: + ParseString(is, handler, true); + if (HasParseError()) + return IterativeParsingErrorState; + else + return dst; + + case IterativeParsingKeyValueDelimiterState: + RAPIDJSON_ASSERT(token == ColonToken); + is.Take(); + return dst; + + case IterativeParsingMemberValueState: + // Must be non-compound value. Or it would be ObjectInitial or ArrayInitial state. + ParseValue(is, handler); + if (HasParseError()) { + return IterativeParsingErrorState; + } + return dst; + + case IterativeParsingElementState: + // Must be non-compound value. Or it would be ObjectInitial or ArrayInitial state. + ParseValue(is, handler); + if (HasParseError()) { + return IterativeParsingErrorState; + } + return dst; + + case IterativeParsingMemberDelimiterState: + case IterativeParsingElementDelimiterState: + is.Take(); + // Update member/element count. + *stack_.template Top() = *stack_.template Top() + 1; + return dst; + + case IterativeParsingObjectFinishState: + { + // Transit from delimiter is only allowed when trailing commas are enabled + if (!(parseFlags & kParseTrailingCommasFlag) && src == IterativeParsingMemberDelimiterState) { + RAPIDJSON_PARSE_ERROR_NORETURN(kParseErrorObjectMissName, is.Tell()); + return IterativeParsingErrorState; + } + // Get member count. + SizeType c = *stack_.template Pop(1); + // If the object is not empty, count the last member. + if (src == IterativeParsingMemberValueState) + ++c; + // Restore the state. + IterativeParsingState n = static_cast(*stack_.template Pop(1)); + // Transit to Finish state if this is the topmost scope. + if (n == IterativeParsingStartState) + n = IterativeParsingFinishState; + // Call handler + bool hr = handler.EndObject(c); + // On handler short circuits the parsing. + if (!hr) { + RAPIDJSON_PARSE_ERROR_NORETURN(kParseErrorTermination, is.Tell()); + return IterativeParsingErrorState; + } + else { + is.Take(); + return n; + } + } + + case IterativeParsingArrayFinishState: + { + // Transit from delimiter is only allowed when trailing commas are enabled + if (!(parseFlags & kParseTrailingCommasFlag) && src == IterativeParsingElementDelimiterState) { + RAPIDJSON_PARSE_ERROR_NORETURN(kParseErrorValueInvalid, is.Tell()); + return IterativeParsingErrorState; + } + // Get element count. + SizeType c = *stack_.template Pop(1); + // If the array is not empty, count the last element. + if (src == IterativeParsingElementState) + ++c; + // Restore the state. + IterativeParsingState n = static_cast(*stack_.template Pop(1)); + // Transit to Finish state if this is the topmost scope. + if (n == IterativeParsingStartState) + n = IterativeParsingFinishState; + // Call handler + bool hr = handler.EndArray(c); + // On handler short circuits the parsing. + if (!hr) { + RAPIDJSON_PARSE_ERROR_NORETURN(kParseErrorTermination, is.Tell()); + return IterativeParsingErrorState; + } + else { + is.Take(); + return n; + } + } + + default: + // This branch is for IterativeParsingValueState actually. + // Use `default:` rather than + // `case IterativeParsingValueState:` is for code coverage. 
+
+    template <typename InputStream>
+    void HandleError(IterativeParsingState src, InputStream& is) {
+        if (HasParseError()) {
+            // Error flag has been set.
+            return;
+        }
+
+        switch (src) {
+        case IterativeParsingStartState:            RAPIDJSON_PARSE_ERROR(kParseErrorDocumentEmpty, is.Tell()); return;
+        case IterativeParsingFinishState:           RAPIDJSON_PARSE_ERROR(kParseErrorDocumentRootNotSingular, is.Tell()); return;
+        case IterativeParsingObjectInitialState:
+        case IterativeParsingMemberDelimiterState:  RAPIDJSON_PARSE_ERROR(kParseErrorObjectMissName, is.Tell()); return;
+        case IterativeParsingMemberKeyState:        RAPIDJSON_PARSE_ERROR(kParseErrorObjectMissColon, is.Tell()); return;
+        case IterativeParsingMemberValueState:      RAPIDJSON_PARSE_ERROR(kParseErrorObjectMissCommaOrCurlyBracket, is.Tell()); return;
+        case IterativeParsingKeyValueDelimiterState:
+        case IterativeParsingArrayInitialState:
+        case IterativeParsingElementDelimiterState: RAPIDJSON_PARSE_ERROR(kParseErrorValueInvalid, is.Tell()); return;
+        default: RAPIDJSON_ASSERT(src == IterativeParsingElementState); RAPIDJSON_PARSE_ERROR(kParseErrorArrayMissCommaOrSquareBracket, is.Tell()); return;
+        }
+    }
+
+    RAPIDJSON_FORCEINLINE bool IsIterativeParsingDelimiterState(IterativeParsingState s) const {
+        return s >= IterativeParsingElementDelimiterState;
+    }
+
+    RAPIDJSON_FORCEINLINE bool IsIterativeParsingCompleteState(IterativeParsingState s) const {
+        return s <= IterativeParsingErrorState;
+    }
+
+    template <unsigned parseFlags, typename InputStream, typename Handler>
+    ParseResult IterativeParse(InputStream& is, Handler& handler) {
+        parseResult_.Clear();
+        ClearStackOnExit scope(*this);
+        IterativeParsingState state = IterativeParsingStartState;
+
+        SkipWhitespaceAndComments<parseFlags>(is);
+        RAPIDJSON_PARSE_ERROR_EARLY_RETURN(parseResult_);
+        while (is.Peek() != '\0') {
+            Token t = Tokenize(is.Peek());
+            IterativeParsingState n = Predict(state, t);
+            IterativeParsingState d = Transit<parseFlags>(state, t, n, is, handler);
+
+            if (d == IterativeParsingErrorState) {
+                HandleError(state, is);
+                break;
+            }
+
+            state = d;
+
+            // Do not consume the stream further once the root JSON value has been parsed.
+            if ((parseFlags & kParseStopWhenDoneFlag) && state == IterativeParsingFinishState)
+                break;
+
+            SkipWhitespaceAndComments<parseFlags>(is);
+            RAPIDJSON_PARSE_ERROR_EARLY_RETURN(parseResult_);
+        }
+
+        // Handle the end of file.
+        if (state != IterativeParsingFinishState)
+            HandleError(state, is);
+
+        return parseResult_;
+    }
+
+    static const size_t kDefaultStackCapacity = 256;    //!< Default stack capacity in bytes for storing a single decoded string.
+    internal::Stack<StackAllocator> stack_;  //!< A stack for storing decoded string temporarily during non-destructive parsing.
+    ParseResult parseResult_;
+    IterativeParsingState state_;
+}; // class GenericReader
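+
+// Illustrative usage sketch (not part of the upstream header). Parsing with
+// kParseIterativeFlag routes GenericReader::Parse() through IterativeParse()
+// above, so nesting depth is bounded by the heap-allocated stack_ rather than
+// the call stack:
+#if 0
+StringStream ss(json);                       // json: a NUL-terminated string (assumed)
+BaseReaderHandler<UTF8<> > handler;          // no-op SAX handler
+Reader reader;                               // typedef declared just below
+ParseResult ok = reader.Parse<kParseIterativeFlag | kParseStopWhenDoneFlag>(ss, handler);
+if (!ok)
+    fprintf(stderr, "Parse error %d at offset %zu\n", static_cast<int>(ok.Code()), ok.Offset());
+#endif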
+
+//! Reader with UTF8 encoding and default allocator.
+typedef GenericReader<UTF8<>, UTF8<> > Reader;
+
+RAPIDJSON_NAMESPACE_END
+
+#if defined(__clang__) || defined(_MSC_VER)
+RAPIDJSON_DIAG_POP
+#endif
+
+
+#ifdef __GNUC__
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_READER_H_
diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/schema.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/schema.h
new file mode 100644
index 0000000..57ec797
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/schema.h
@@ -0,0 +1,2496 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_SCHEMA_H_
+#define RAPIDJSON_SCHEMA_H_
+
+#include "document.h"
+#include "pointer.h"
+#include "stringbuffer.h"
+#include <cmath> // abs, floor
+
+#if !defined(RAPIDJSON_SCHEMA_USE_INTERNALREGEX)
+#define RAPIDJSON_SCHEMA_USE_INTERNALREGEX 1
+#else
+#define RAPIDJSON_SCHEMA_USE_INTERNALREGEX 0
+#endif
+
+#if !RAPIDJSON_SCHEMA_USE_INTERNALREGEX && defined(RAPIDJSON_SCHEMA_USE_STDREGEX) && (__cplusplus >=201103L || (defined(_MSC_VER) && _MSC_VER >= 1800))
+#define RAPIDJSON_SCHEMA_USE_STDREGEX 1
+#else
+#define RAPIDJSON_SCHEMA_USE_STDREGEX 0
+#endif
+
+#if RAPIDJSON_SCHEMA_USE_INTERNALREGEX
+#include "internal/regex.h"
+#elif RAPIDJSON_SCHEMA_USE_STDREGEX
+#include <regex>
+#endif
+
+#if RAPIDJSON_SCHEMA_USE_INTERNALREGEX || RAPIDJSON_SCHEMA_USE_STDREGEX
+#define RAPIDJSON_SCHEMA_HAS_REGEX 1
+#else
+#define RAPIDJSON_SCHEMA_HAS_REGEX 0
+#endif
+
+#ifndef RAPIDJSON_SCHEMA_VERBOSE
+#define RAPIDJSON_SCHEMA_VERBOSE 0
+#endif
+
+#if RAPIDJSON_SCHEMA_VERBOSE
+#include "stringbuffer.h"
+#endif
+
+RAPIDJSON_DIAG_PUSH
+
+#if defined(__GNUC__)
+RAPIDJSON_DIAG_OFF(effc++)
+#endif
+
+#ifdef __clang__
+RAPIDJSON_DIAG_OFF(weak-vtables)
+RAPIDJSON_DIAG_OFF(exit-time-destructors)
+RAPIDJSON_DIAG_OFF(c++98-compat-pedantic)
+RAPIDJSON_DIAG_OFF(variadic-macros)
+#elif defined(_MSC_VER)
+RAPIDJSON_DIAG_OFF(4512) // assignment operator could not be generated
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////////////
+// Verbose Utilities
+
+#if RAPIDJSON_SCHEMA_VERBOSE
+
+namespace internal {
+
+inline void PrintInvalidKeyword(const char* keyword) {
+    printf("Fail keyword: %s\n", keyword);
+}
+
+inline void PrintInvalidKeyword(const wchar_t* keyword) {
+    wprintf(L"Fail keyword: %ls\n", keyword);
+}
+
+inline void PrintInvalidDocument(const char* document) {
+    printf("Fail document: %s\n\n", document);
+}
+
+inline void PrintInvalidDocument(const wchar_t* document) {
+    wprintf(L"Fail document: %ls\n\n", document);
+}
+
+inline void PrintValidatorPointers(unsigned depth, const char* s, const char* d) {
+    printf("S: %*s%s\nD: %*s%s\n\n", depth * 4, " ", s, depth * 4, " ", d);
+}
+
+inline void PrintValidatorPointers(unsigned depth, const wchar_t* s, const wchar_t* d) {
+    wprintf(L"S: %*ls%ls\nD: %*ls%ls\n\n", depth * 4, L" ", s, depth * 4,
L" ", d); +} + +} // namespace internal + +#endif // RAPIDJSON_SCHEMA_VERBOSE + +/////////////////////////////////////////////////////////////////////////////// +// RAPIDJSON_INVALID_KEYWORD_RETURN + +#if RAPIDJSON_SCHEMA_VERBOSE +#define RAPIDJSON_INVALID_KEYWORD_VERBOSE(keyword) internal::PrintInvalidKeyword(keyword) +#else +#define RAPIDJSON_INVALID_KEYWORD_VERBOSE(keyword) +#endif + +#define RAPIDJSON_INVALID_KEYWORD_RETURN(keyword)\ +RAPIDJSON_MULTILINEMACRO_BEGIN\ + context.invalidKeyword = keyword.GetString();\ + RAPIDJSON_INVALID_KEYWORD_VERBOSE(keyword.GetString());\ + return false;\ +RAPIDJSON_MULTILINEMACRO_END + +/////////////////////////////////////////////////////////////////////////////// +// Forward declarations + +template +class GenericSchemaDocument; + +namespace internal { + +template +class Schema; + +/////////////////////////////////////////////////////////////////////////////// +// ISchemaValidator + +class ISchemaValidator { +public: + virtual ~ISchemaValidator() {} + virtual bool IsValid() const = 0; +}; + +/////////////////////////////////////////////////////////////////////////////// +// ISchemaStateFactory + +template +class ISchemaStateFactory { +public: + virtual ~ISchemaStateFactory() {} + virtual ISchemaValidator* CreateSchemaValidator(const SchemaType&) = 0; + virtual void DestroySchemaValidator(ISchemaValidator* validator) = 0; + virtual void* CreateHasher() = 0; + virtual uint64_t GetHashCode(void* hasher) = 0; + virtual void DestroryHasher(void* hasher) = 0; + virtual void* MallocState(size_t size) = 0; + virtual void FreeState(void* p) = 0; +}; + +/////////////////////////////////////////////////////////////////////////////// +// IValidationErrorHandler + +template +class IValidationErrorHandler { +public: + typedef typename SchemaType::Ch Ch; + typedef typename SchemaType::SValue SValue; + + virtual ~IValidationErrorHandler() {} + + virtual void NotMultipleOf(int64_t actual, const SValue& expected) = 0; + virtual void NotMultipleOf(uint64_t actual, const SValue& expected) = 0; + virtual void NotMultipleOf(double actual, const SValue& expected) = 0; + virtual void AboveMaximum(int64_t actual, const SValue& expected, bool exclusive) = 0; + virtual void AboveMaximum(uint64_t actual, const SValue& expected, bool exclusive) = 0; + virtual void AboveMaximum(double actual, const SValue& expected, bool exclusive) = 0; + virtual void BelowMinimum(int64_t actual, const SValue& expected, bool exclusive) = 0; + virtual void BelowMinimum(uint64_t actual, const SValue& expected, bool exclusive) = 0; + virtual void BelowMinimum(double actual, const SValue& expected, bool exclusive) = 0; + + virtual void TooLong(const Ch* str, SizeType length, SizeType expected) = 0; + virtual void TooShort(const Ch* str, SizeType length, SizeType expected) = 0; + virtual void DoesNotMatch(const Ch* str, SizeType length) = 0; + + virtual void DisallowedItem(SizeType index) = 0; + virtual void TooFewItems(SizeType actualCount, SizeType expectedCount) = 0; + virtual void TooManyItems(SizeType actualCount, SizeType expectedCount) = 0; + virtual void DuplicateItems(SizeType index1, SizeType index2) = 0; + + virtual void TooManyProperties(SizeType actualCount, SizeType expectedCount) = 0; + virtual void TooFewProperties(SizeType actualCount, SizeType expectedCount) = 0; + virtual void StartMissingProperties() = 0; + virtual void AddMissingProperty(const SValue& name) = 0; + virtual bool EndMissingProperties() = 0; + virtual void PropertyViolations(ISchemaValidator** subvalidators, SizeType 
count) = 0;
+    virtual void DisallowedProperty(const Ch* name, SizeType length) = 0;
+
+    virtual void StartDependencyErrors() = 0;
+    virtual void StartMissingDependentProperties() = 0;
+    virtual void AddMissingDependentProperty(const SValue& targetName) = 0;
+    virtual void EndMissingDependentProperties(const SValue& sourceName) = 0;
+    virtual void AddDependencySchemaError(const SValue& sourceName, ISchemaValidator* subvalidator) = 0;
+    virtual bool EndDependencyErrors() = 0;
+
+    virtual void DisallowedValue() = 0;
+    virtual void StartDisallowedType() = 0;
+    virtual void AddExpectedType(const typename SchemaType::ValueType& expectedType) = 0;
+    virtual void EndDisallowedType(const typename SchemaType::ValueType& actualType) = 0;
+    virtual void NotAllOf(ISchemaValidator** subvalidators, SizeType count) = 0;
+    virtual void NoneOf(ISchemaValidator** subvalidators, SizeType count) = 0;
+    virtual void NotOneOf(ISchemaValidator** subvalidators, SizeType count) = 0;
+    virtual void Disallowed() = 0;
+};
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Hasher
+
+// For comparison of compound values
+template<typename Encoding, typename Allocator>
+class Hasher {
+public:
+    typedef typename Encoding::Ch Ch;
+
+    Hasher(Allocator* allocator = 0, size_t stackCapacity = kDefaultSize) : stack_(allocator, stackCapacity) {}
+
+    bool Null() { return WriteType(kNullType); }
+    bool Bool(bool b) { return WriteType(b ? kTrueType : kFalseType); }
+    bool Int(int i) { Number n; n.u.i = i; n.d = static_cast<double>(i); return WriteNumber(n); }
+    bool Uint(unsigned u) { Number n; n.u.u = u; n.d = static_cast<double>(u); return WriteNumber(n); }
+    bool Int64(int64_t i) { Number n; n.u.i = i; n.d = static_cast<double>(i); return WriteNumber(n); }
+    bool Uint64(uint64_t u) { Number n; n.u.u = u; n.d = static_cast<double>(u); return WriteNumber(n); }
+    bool Double(double d) {
+        Number n;
+        if (d < 0) n.u.i = static_cast<int64_t>(d);
+        else       n.u.u = static_cast<uint64_t>(d);
+        n.d = d;
+        return WriteNumber(n);
+    }
+
+    bool RawNumber(const Ch* str, SizeType len, bool) {
+        WriteBuffer(kNumberType, str, len * sizeof(Ch));
+        return true;
+    }
+
+    bool String(const Ch* str, SizeType len, bool) {
+        WriteBuffer(kStringType, str, len * sizeof(Ch));
+        return true;
+    }
+
+    bool StartObject() { return true; }
+    bool Key(const Ch* str, SizeType len, bool copy) { return String(str, len, copy); }
+    bool EndObject(SizeType memberCount) {
+        uint64_t h = Hash(0, kObjectType);
+        uint64_t* kv = stack_.template Pop<uint64_t>(memberCount * 2);
+        for (SizeType i = 0; i < memberCount; i++)
+            h ^= Hash(kv[i * 2], kv[i * 2 + 1]);  // Use xor to achieve member order insensitive
+        *stack_.template Push<uint64_t>() = h;
+        return true;
+    }
+
+    bool StartArray() { return true; }
+    bool EndArray(SizeType elementCount) {
+        uint64_t h = Hash(0, kArrayType);
+        uint64_t* e = stack_.template Pop<uint64_t>(elementCount);
+        for (SizeType i = 0; i < elementCount; i++)
+            h = Hash(h, e[i]); // Use hash to achieve element order sensitive
+        *stack_.template Push<uint64_t>() = h;
+        return true;
+    }
+
+    bool IsValid() const { return stack_.GetSize() == sizeof(uint64_t); }
+
+    uint64_t GetHashCode() const {
+        RAPIDJSON_ASSERT(IsValid());
+        return *stack_.template Top<uint64_t>();
+    }
+
+private:
+    static const size_t kDefaultSize = 256;
+    struct Number {
+        union U {
+            uint64_t u;
+            int64_t i;
+        } u;
+        double d;
+    };
+
+    bool WriteType(Type type) { return WriteBuffer(type, 0, 0); }
+
+    bool WriteNumber(const Number& n) { return WriteBuffer(kNumberType, &n, sizeof(n)); }
+
+    bool WriteBuffer(Type type, const void* data, size_t len) {
+        // FNV-1a from http://isthe.com/chongo/tech/comp/fnv/
+        uint64_t h = Hash(RAPIDJSON_UINT64_C2(0x84222325, 0xcbf29ce4), type);
+        const unsigned char* d = static_cast<const unsigned char*>(data);
+        for (size_t i = 0; i < len; i++)
+            h = Hash(h, d[i]);
+        *stack_.template Push<uint64_t>() = h;
+        return true;
+    }
+
+    static uint64_t Hash(uint64_t h, uint64_t d) {
+        static const uint64_t kPrime = RAPIDJSON_UINT64_C2(0x00000100, 0x000001b3);
+        h ^= d;
+        h *= kPrime;
+        return h;
+    }
+
+    Stack<Allocator> stack_;
+};
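+
+// Illustrative note (not part of the upstream header): Hasher folds an entire
+// JSON value into a single 64-bit FNV-1a code so the validator can compare
+// compound values cheaply, e.g. for "enum" matching and "uniqueItems" duplicate
+// detection. XOR-combining members makes object hashes order-insensitive, while
+// chaining makes array hashes order-sensitive:
+#if 0
+// {"a":1,"b":2} and {"b":2,"a":1}  -> equal hash codes
+// [1,2]         and [2,1]          -> different hash codes
+Hasher<UTF8<>, CrtAllocator> h;              // encoding/allocator choices are assumptions
+value.Accept(h);                             // value: some rapidjson Document/Value (assumed)
+uint64_t code = h.GetHashCode();
+#endif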
+
+///////////////////////////////////////////////////////////////////////////////
+// SchemaValidationContext
+
+template <typename SchemaDocumentType>
+struct SchemaValidationContext {
+    typedef Schema<SchemaDocumentType> SchemaType;
+    typedef ISchemaStateFactory<SchemaType> SchemaValidatorFactoryType;
+    typedef IValidationErrorHandler<SchemaType> ErrorHandlerType;
+    typedef typename SchemaType::ValueType ValueType;
+    typedef typename ValueType::Ch Ch;
+
+    enum PatternValidatorType {
+        kPatternValidatorOnly,
+        kPatternValidatorWithProperty,
+        kPatternValidatorWithAdditionalProperty
+    };
+
+    SchemaValidationContext(SchemaValidatorFactoryType& f, ErrorHandlerType& eh, const SchemaType* s) :
+        factory(f),
+        error_handler(eh),
+        schema(s),
+        valueSchema(),
+        invalidKeyword(),
+        hasher(),
+        arrayElementHashCodes(),
+        validators(),
+        validatorCount(),
+        patternPropertiesValidators(),
+        patternPropertiesValidatorCount(),
+        patternPropertiesSchemas(),
+        patternPropertiesSchemaCount(),
+        valuePatternValidatorType(kPatternValidatorOnly),
+        propertyExist(),
+        inArray(false),
+        valueUniqueness(false),
+        arrayUniqueness(false)
+    {
+    }
+
+    ~SchemaValidationContext() {
+        if (hasher)
+            factory.DestroryHasher(hasher);
+        if (validators) {
+            for (SizeType i = 0; i < validatorCount; i++)
+                factory.DestroySchemaValidator(validators[i]);
+            factory.FreeState(validators);
+        }
+        if (patternPropertiesValidators) {
+            for (SizeType i = 0; i < patternPropertiesValidatorCount; i++)
+                factory.DestroySchemaValidator(patternPropertiesValidators[i]);
+            factory.FreeState(patternPropertiesValidators);
+        }
+        if (patternPropertiesSchemas)
+            factory.FreeState(patternPropertiesSchemas);
+        if (propertyExist)
+            factory.FreeState(propertyExist);
+    }
+
+    SchemaValidatorFactoryType& factory;
+    ErrorHandlerType& error_handler;
+    const SchemaType* schema;
+    const SchemaType* valueSchema;
+    const Ch* invalidKeyword;
+    void* hasher; // Only validator access
+    void* arrayElementHashCodes; // Only validator access this
+    ISchemaValidator** validators;
+    SizeType validatorCount;
+    ISchemaValidator** patternPropertiesValidators;
+    SizeType patternPropertiesValidatorCount;
+    const SchemaType** patternPropertiesSchemas;
+    SizeType patternPropertiesSchemaCount;
+    PatternValidatorType valuePatternValidatorType;
+    PatternValidatorType objectPatternValidatorType;
+    SizeType arrayElementIndex;
+    bool* propertyExist;
+    bool inArray;
+    bool valueUniqueness;
+    bool arrayUniqueness;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Schema
+
+template <typename SchemaDocumentType>
+class Schema {
+public:
+    typedef typename SchemaDocumentType::ValueType ValueType;
+    typedef typename SchemaDocumentType::AllocatorType AllocatorType;
+    typedef typename SchemaDocumentType::PointerType PointerType;
+    typedef typename ValueType::EncodingType EncodingType;
+    typedef typename EncodingType::Ch Ch;
+    typedef SchemaValidationContext<SchemaDocumentType> Context;
+    typedef Schema<SchemaDocumentType> SchemaType;
+    typedef GenericValue<EncodingType, AllocatorType> SValue;
+    typedef IValidationErrorHandler<Schema> ErrorHandler;
+    friend class GenericSchemaDocument<ValueType, AllocatorType>;
+
+    Schema(SchemaDocumentType* schemaDocument, const PointerType& p, const ValueType& value,
const ValueType& document, AllocatorType* allocator) : + allocator_(allocator), + uri_(schemaDocument->GetURI(), *allocator), + pointer_(p), + typeless_(schemaDocument->GetTypeless()), + enum_(), + enumCount_(), + not_(), + type_((1 << kTotalSchemaType) - 1), // typeless + validatorCount_(), + notValidatorIndex_(), + properties_(), + additionalPropertiesSchema_(), + patternProperties_(), + patternPropertyCount_(), + propertyCount_(), + minProperties_(), + maxProperties_(SizeType(~0)), + additionalProperties_(true), + hasDependencies_(), + hasRequired_(), + hasSchemaDependencies_(), + additionalItemsSchema_(), + itemsList_(), + itemsTuple_(), + itemsTupleCount_(), + minItems_(), + maxItems_(SizeType(~0)), + additionalItems_(true), + uniqueItems_(false), + pattern_(), + minLength_(0), + maxLength_(~SizeType(0)), + exclusiveMinimum_(false), + exclusiveMaximum_(false), + defaultValueLength_(0) + { + typedef typename SchemaDocumentType::ValueType ValueType; + typedef typename ValueType::ConstValueIterator ConstValueIterator; + typedef typename ValueType::ConstMemberIterator ConstMemberIterator; + + if (!value.IsObject()) + return; + + if (const ValueType* v = GetMember(value, GetTypeString())) { + type_ = 0; + if (v->IsString()) + AddType(*v); + else if (v->IsArray()) + for (ConstValueIterator itr = v->Begin(); itr != v->End(); ++itr) + AddType(*itr); + } + + if (const ValueType* v = GetMember(value, GetEnumString())) + if (v->IsArray() && v->Size() > 0) { + enum_ = static_cast(allocator_->Malloc(sizeof(uint64_t) * v->Size())); + for (ConstValueIterator itr = v->Begin(); itr != v->End(); ++itr) { + typedef Hasher > EnumHasherType; + char buffer[256u + 24]; + MemoryPoolAllocator<> hasherAllocator(buffer, sizeof(buffer)); + EnumHasherType h(&hasherAllocator, 256); + itr->Accept(h); + enum_[enumCount_++] = h.GetHashCode(); + } + } + + if (schemaDocument) { + AssignIfExist(allOf_, *schemaDocument, p, value, GetAllOfString(), document); + AssignIfExist(anyOf_, *schemaDocument, p, value, GetAnyOfString(), document); + AssignIfExist(oneOf_, *schemaDocument, p, value, GetOneOfString(), document); + } + + if (const ValueType* v = GetMember(value, GetNotString())) { + schemaDocument->CreateSchema(¬_, p.Append(GetNotString(), allocator_), *v, document); + notValidatorIndex_ = validatorCount_; + validatorCount_++; + } + + // Object + + const ValueType* properties = GetMember(value, GetPropertiesString()); + const ValueType* required = GetMember(value, GetRequiredString()); + const ValueType* dependencies = GetMember(value, GetDependenciesString()); + { + // Gather properties from properties/required/dependencies + SValue allProperties(kArrayType); + + if (properties && properties->IsObject()) + for (ConstMemberIterator itr = properties->MemberBegin(); itr != properties->MemberEnd(); ++itr) + AddUniqueElement(allProperties, itr->name); + + if (required && required->IsArray()) + for (ConstValueIterator itr = required->Begin(); itr != required->End(); ++itr) + if (itr->IsString()) + AddUniqueElement(allProperties, *itr); + + if (dependencies && dependencies->IsObject()) + for (ConstMemberIterator itr = dependencies->MemberBegin(); itr != dependencies->MemberEnd(); ++itr) { + AddUniqueElement(allProperties, itr->name); + if (itr->value.IsArray()) + for (ConstValueIterator i = itr->value.Begin(); i != itr->value.End(); ++i) + if (i->IsString()) + AddUniqueElement(allProperties, *i); + } + + if (allProperties.Size() > 0) { + propertyCount_ = allProperties.Size(); + properties_ = 
static_cast(allocator_->Malloc(sizeof(Property) * propertyCount_)); + for (SizeType i = 0; i < propertyCount_; i++) { + new (&properties_[i]) Property(); + properties_[i].name = allProperties[i]; + properties_[i].schema = typeless_; + } + } + } + + if (properties && properties->IsObject()) { + PointerType q = p.Append(GetPropertiesString(), allocator_); + for (ConstMemberIterator itr = properties->MemberBegin(); itr != properties->MemberEnd(); ++itr) { + SizeType index; + if (FindPropertyIndex(itr->name, &index)) + schemaDocument->CreateSchema(&properties_[index].schema, q.Append(itr->name, allocator_), itr->value, document); + } + } + + if (const ValueType* v = GetMember(value, GetPatternPropertiesString())) { + PointerType q = p.Append(GetPatternPropertiesString(), allocator_); + patternProperties_ = static_cast(allocator_->Malloc(sizeof(PatternProperty) * v->MemberCount())); + patternPropertyCount_ = 0; + + for (ConstMemberIterator itr = v->MemberBegin(); itr != v->MemberEnd(); ++itr) { + new (&patternProperties_[patternPropertyCount_]) PatternProperty(); + patternProperties_[patternPropertyCount_].pattern = CreatePattern(itr->name); + schemaDocument->CreateSchema(&patternProperties_[patternPropertyCount_].schema, q.Append(itr->name, allocator_), itr->value, document); + patternPropertyCount_++; + } + } + + if (required && required->IsArray()) + for (ConstValueIterator itr = required->Begin(); itr != required->End(); ++itr) + if (itr->IsString()) { + SizeType index; + if (FindPropertyIndex(*itr, &index)) { + properties_[index].required = true; + hasRequired_ = true; + } + } + + if (dependencies && dependencies->IsObject()) { + PointerType q = p.Append(GetDependenciesString(), allocator_); + hasDependencies_ = true; + for (ConstMemberIterator itr = dependencies->MemberBegin(); itr != dependencies->MemberEnd(); ++itr) { + SizeType sourceIndex; + if (FindPropertyIndex(itr->name, &sourceIndex)) { + if (itr->value.IsArray()) { + properties_[sourceIndex].dependencies = static_cast(allocator_->Malloc(sizeof(bool) * propertyCount_)); + std::memset(properties_[sourceIndex].dependencies, 0, sizeof(bool)* propertyCount_); + for (ConstValueIterator targetItr = itr->value.Begin(); targetItr != itr->value.End(); ++targetItr) { + SizeType targetIndex; + if (FindPropertyIndex(*targetItr, &targetIndex)) + properties_[sourceIndex].dependencies[targetIndex] = true; + } + } + else if (itr->value.IsObject()) { + hasSchemaDependencies_ = true; + schemaDocument->CreateSchema(&properties_[sourceIndex].dependenciesSchema, q.Append(itr->name, allocator_), itr->value, document); + properties_[sourceIndex].dependenciesValidatorIndex = validatorCount_; + validatorCount_++; + } + } + } + } + + if (const ValueType* v = GetMember(value, GetAdditionalPropertiesString())) { + if (v->IsBool()) + additionalProperties_ = v->GetBool(); + else if (v->IsObject()) + schemaDocument->CreateSchema(&additionalPropertiesSchema_, p.Append(GetAdditionalPropertiesString(), allocator_), *v, document); + } + + AssignIfExist(minProperties_, value, GetMinPropertiesString()); + AssignIfExist(maxProperties_, value, GetMaxPropertiesString()); + + // Array + if (const ValueType* v = GetMember(value, GetItemsString())) { + PointerType q = p.Append(GetItemsString(), allocator_); + if (v->IsObject()) // List validation + schemaDocument->CreateSchema(&itemsList_, q, *v, document); + else if (v->IsArray()) { // Tuple validation + itemsTuple_ = static_cast(allocator_->Malloc(sizeof(const Schema*) * v->Size())); + SizeType index = 0; + for 
(ConstValueIterator itr = v->Begin(); itr != v->End(); ++itr, index++) + schemaDocument->CreateSchema(&itemsTuple_[itemsTupleCount_++], q.Append(index, allocator_), *itr, document); + } + } + + AssignIfExist(minItems_, value, GetMinItemsString()); + AssignIfExist(maxItems_, value, GetMaxItemsString()); + + if (const ValueType* v = GetMember(value, GetAdditionalItemsString())) { + if (v->IsBool()) + additionalItems_ = v->GetBool(); + else if (v->IsObject()) + schemaDocument->CreateSchema(&additionalItemsSchema_, p.Append(GetAdditionalItemsString(), allocator_), *v, document); + } + + AssignIfExist(uniqueItems_, value, GetUniqueItemsString()); + + // String + AssignIfExist(minLength_, value, GetMinLengthString()); + AssignIfExist(maxLength_, value, GetMaxLengthString()); + + if (const ValueType* v = GetMember(value, GetPatternString())) + pattern_ = CreatePattern(*v); + + // Number + if (const ValueType* v = GetMember(value, GetMinimumString())) + if (v->IsNumber()) + minimum_.CopyFrom(*v, *allocator_); + + if (const ValueType* v = GetMember(value, GetMaximumString())) + if (v->IsNumber()) + maximum_.CopyFrom(*v, *allocator_); + + AssignIfExist(exclusiveMinimum_, value, GetExclusiveMinimumString()); + AssignIfExist(exclusiveMaximum_, value, GetExclusiveMaximumString()); + + if (const ValueType* v = GetMember(value, GetMultipleOfString())) + if (v->IsNumber() && v->GetDouble() > 0.0) + multipleOf_.CopyFrom(*v, *allocator_); + + // Default + if (const ValueType* v = GetMember(value, GetDefaultValueString())) + if (v->IsString()) + defaultValueLength_ = v->GetStringLength(); + + } + + ~Schema() { + AllocatorType::Free(enum_); + if (properties_) { + for (SizeType i = 0; i < propertyCount_; i++) + properties_[i].~Property(); + AllocatorType::Free(properties_); + } + if (patternProperties_) { + for (SizeType i = 0; i < patternPropertyCount_; i++) + patternProperties_[i].~PatternProperty(); + AllocatorType::Free(patternProperties_); + } + AllocatorType::Free(itemsTuple_); +#if RAPIDJSON_SCHEMA_HAS_REGEX + if (pattern_) { + pattern_->~RegexType(); + AllocatorType::Free(pattern_); + } +#endif + } + + const SValue& GetURI() const { + return uri_; + } + + const PointerType& GetPointer() const { + return pointer_; + } + + bool BeginValue(Context& context) const { + if (context.inArray) { + if (uniqueItems_) + context.valueUniqueness = true; + + if (itemsList_) + context.valueSchema = itemsList_; + else if (itemsTuple_) { + if (context.arrayElementIndex < itemsTupleCount_) + context.valueSchema = itemsTuple_[context.arrayElementIndex]; + else if (additionalItemsSchema_) + context.valueSchema = additionalItemsSchema_; + else if (additionalItems_) + context.valueSchema = typeless_; + else { + context.error_handler.DisallowedItem(context.arrayElementIndex); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetItemsString()); + } + } + else + context.valueSchema = typeless_; + + context.arrayElementIndex++; + } + return true; + } + + RAPIDJSON_FORCEINLINE bool EndValue(Context& context) const { + if (context.patternPropertiesValidatorCount > 0) { + bool otherValid = false; + SizeType count = context.patternPropertiesValidatorCount; + if (context.objectPatternValidatorType != Context::kPatternValidatorOnly) + otherValid = context.patternPropertiesValidators[--count]->IsValid(); + + bool patternValid = true; + for (SizeType i = 0; i < count; i++) + if (!context.patternPropertiesValidators[i]->IsValid()) { + patternValid = false; + break; + } + + if (context.objectPatternValidatorType == Context::kPatternValidatorOnly) { + if 
(!patternValid) { + context.error_handler.PropertyViolations(context.patternPropertiesValidators, count); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetPatternPropertiesString()); + } + } + else if (context.objectPatternValidatorType == Context::kPatternValidatorWithProperty) { + if (!patternValid || !otherValid) { + context.error_handler.PropertyViolations(context.patternPropertiesValidators, count + 1); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetPatternPropertiesString()); + } + } + else if (!patternValid && !otherValid) { // kPatternValidatorWithAdditionalProperty) + context.error_handler.PropertyViolations(context.patternPropertiesValidators, count + 1); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetPatternPropertiesString()); + } + } + + if (enum_) { + const uint64_t h = context.factory.GetHashCode(context.hasher); + for (SizeType i = 0; i < enumCount_; i++) + if (enum_[i] == h) + goto foundEnum; + context.error_handler.DisallowedValue(); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetEnumString()); + foundEnum:; + } + + if (allOf_.schemas) + for (SizeType i = allOf_.begin; i < allOf_.begin + allOf_.count; i++) + if (!context.validators[i]->IsValid()) { + context.error_handler.NotAllOf(&context.validators[allOf_.begin], allOf_.count); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetAllOfString()); + } + + if (anyOf_.schemas) { + for (SizeType i = anyOf_.begin; i < anyOf_.begin + anyOf_.count; i++) + if (context.validators[i]->IsValid()) + goto foundAny; + context.error_handler.NoneOf(&context.validators[anyOf_.begin], anyOf_.count); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetAnyOfString()); + foundAny:; + } + + if (oneOf_.schemas) { + bool oneValid = false; + for (SizeType i = oneOf_.begin; i < oneOf_.begin + oneOf_.count; i++) + if (context.validators[i]->IsValid()) { + if (oneValid) { + context.error_handler.NotOneOf(&context.validators[oneOf_.begin], oneOf_.count); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetOneOfString()); + } else + oneValid = true; + } + if (!oneValid) { + context.error_handler.NotOneOf(&context.validators[oneOf_.begin], oneOf_.count); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetOneOfString()); + } + } + + if (not_ && context.validators[notValidatorIndex_]->IsValid()) { + context.error_handler.Disallowed(); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetNotString()); + } + + return true; + } + + bool Null(Context& context) const { + if (!(type_ & (1 << kNullSchemaType))) { + DisallowedType(context, GetNullString()); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetTypeString()); + } + return CreateParallelValidator(context); + } + + bool Bool(Context& context, bool) const { + if (!(type_ & (1 << kBooleanSchemaType))) { + DisallowedType(context, GetBooleanString()); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetTypeString()); + } + return CreateParallelValidator(context); + } + + bool Int(Context& context, int i) const { + if (!CheckInt(context, i)) + return false; + return CreateParallelValidator(context); + } + + bool Uint(Context& context, unsigned u) const { + if (!CheckUint(context, u)) + return false; + return CreateParallelValidator(context); + } + + bool Int64(Context& context, int64_t i) const { + if (!CheckInt(context, i)) + return false; + return CreateParallelValidator(context); + } + + bool Uint64(Context& context, uint64_t u) const { + if (!CheckUint(context, u)) + return false; + return CreateParallelValidator(context); + } + + bool Double(Context& context, double d) const { + if (!(type_ & (1 << kNumberSchemaType))) { + DisallowedType(context, GetNumberString()); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetTypeString()); + } + + if 
(!minimum_.IsNull() && !CheckDoubleMinimum(context, d)) + return false; + + if (!maximum_.IsNull() && !CheckDoubleMaximum(context, d)) + return false; + + if (!multipleOf_.IsNull() && !CheckDoubleMultipleOf(context, d)) + return false; + + return CreateParallelValidator(context); + } + + bool String(Context& context, const Ch* str, SizeType length, bool) const { + if (!(type_ & (1 << kStringSchemaType))) { + DisallowedType(context, GetStringString()); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetTypeString()); + } + + if (minLength_ != 0 || maxLength_ != SizeType(~0)) { + SizeType count; + if (internal::CountStringCodePoint(str, length, &count)) { + if (count < minLength_) { + context.error_handler.TooShort(str, length, minLength_); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetMinLengthString()); + } + if (count > maxLength_) { + context.error_handler.TooLong(str, length, maxLength_); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetMaxLengthString()); + } + } + } + + if (pattern_ && !IsPatternMatch(pattern_, str, length)) { + context.error_handler.DoesNotMatch(str, length); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetPatternString()); + } + + return CreateParallelValidator(context); + } + + bool StartObject(Context& context) const { + if (!(type_ & (1 << kObjectSchemaType))) { + DisallowedType(context, GetObjectString()); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetTypeString()); + } + + if (hasDependencies_ || hasRequired_) { + context.propertyExist = static_cast(context.factory.MallocState(sizeof(bool) * propertyCount_)); + std::memset(context.propertyExist, 0, sizeof(bool) * propertyCount_); + } + + if (patternProperties_) { // pre-allocate schema array + SizeType count = patternPropertyCount_ + 1; // extra for valuePatternValidatorType + context.patternPropertiesSchemas = static_cast(context.factory.MallocState(sizeof(const SchemaType*) * count)); + context.patternPropertiesSchemaCount = 0; + std::memset(context.patternPropertiesSchemas, 0, sizeof(SchemaType*) * count); + } + + return CreateParallelValidator(context); + } + + bool Key(Context& context, const Ch* str, SizeType len, bool) const { + if (patternProperties_) { + context.patternPropertiesSchemaCount = 0; + for (SizeType i = 0; i < patternPropertyCount_; i++) + if (patternProperties_[i].pattern && IsPatternMatch(patternProperties_[i].pattern, str, len)) { + context.patternPropertiesSchemas[context.patternPropertiesSchemaCount++] = patternProperties_[i].schema; + context.valueSchema = typeless_; + } + } + + SizeType index; + if (FindPropertyIndex(ValueType(str, len).Move(), &index)) { + if (context.patternPropertiesSchemaCount > 0) { + context.patternPropertiesSchemas[context.patternPropertiesSchemaCount++] = properties_[index].schema; + context.valueSchema = typeless_; + context.valuePatternValidatorType = Context::kPatternValidatorWithProperty; + } + else + context.valueSchema = properties_[index].schema; + + if (context.propertyExist) + context.propertyExist[index] = true; + + return true; + } + + if (additionalPropertiesSchema_) { + if (additionalPropertiesSchema_ && context.patternPropertiesSchemaCount > 0) { + context.patternPropertiesSchemas[context.patternPropertiesSchemaCount++] = additionalPropertiesSchema_; + context.valueSchema = typeless_; + context.valuePatternValidatorType = Context::kPatternValidatorWithAdditionalProperty; + } + else + context.valueSchema = additionalPropertiesSchema_; + return true; + } + else if (additionalProperties_) { + context.valueSchema = typeless_; + return true; + } + + if (context.patternPropertiesSchemaCount == 0) { 
// patternProperties are not additional properties + context.error_handler.DisallowedProperty(str, len); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetAdditionalPropertiesString()); + } + + return true; + } + + bool EndObject(Context& context, SizeType memberCount) const { + if (hasRequired_) { + context.error_handler.StartMissingProperties(); + for (SizeType index = 0; index < propertyCount_; index++) + if (properties_[index].required && !context.propertyExist[index]) + if (properties_[index].schema->defaultValueLength_ == 0 ) + context.error_handler.AddMissingProperty(properties_[index].name); + if (context.error_handler.EndMissingProperties()) + RAPIDJSON_INVALID_KEYWORD_RETURN(GetRequiredString()); + } + + if (memberCount < minProperties_) { + context.error_handler.TooFewProperties(memberCount, minProperties_); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetMinPropertiesString()); + } + + if (memberCount > maxProperties_) { + context.error_handler.TooManyProperties(memberCount, maxProperties_); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetMaxPropertiesString()); + } + + if (hasDependencies_) { + context.error_handler.StartDependencyErrors(); + for (SizeType sourceIndex = 0; sourceIndex < propertyCount_; sourceIndex++) { + const Property& source = properties_[sourceIndex]; + if (context.propertyExist[sourceIndex]) { + if (source.dependencies) { + context.error_handler.StartMissingDependentProperties(); + for (SizeType targetIndex = 0; targetIndex < propertyCount_; targetIndex++) + if (source.dependencies[targetIndex] && !context.propertyExist[targetIndex]) + context.error_handler.AddMissingDependentProperty(properties_[targetIndex].name); + context.error_handler.EndMissingDependentProperties(source.name); + } + else if (source.dependenciesSchema) { + ISchemaValidator* dependenciesValidator = context.validators[source.dependenciesValidatorIndex]; + if (!dependenciesValidator->IsValid()) + context.error_handler.AddDependencySchemaError(source.name, dependenciesValidator); + } + } + } + if (context.error_handler.EndDependencyErrors()) + RAPIDJSON_INVALID_KEYWORD_RETURN(GetDependenciesString()); + } + + return true; + } + + bool StartArray(Context& context) const { + if (!(type_ & (1 << kArraySchemaType))) { + DisallowedType(context, GetArrayString()); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetTypeString()); + } + + context.arrayElementIndex = 0; + context.inArray = true; + + return CreateParallelValidator(context); + } + + bool EndArray(Context& context, SizeType elementCount) const { + context.inArray = false; + + if (elementCount < minItems_) { + context.error_handler.TooFewItems(elementCount, minItems_); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetMinItemsString()); + } + + if (elementCount > maxItems_) { + context.error_handler.TooManyItems(elementCount, maxItems_); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetMaxItemsString()); + } + + return true; + } + + // Generate functions for string literal according to Ch +#define RAPIDJSON_STRING_(name, ...) 
\ + static const ValueType& Get##name##String() {\ + static const Ch s[] = { __VA_ARGS__, '\0' };\ + static const ValueType v(s, static_cast(sizeof(s) / sizeof(Ch) - 1));\ + return v;\ + } + + RAPIDJSON_STRING_(Null, 'n', 'u', 'l', 'l') + RAPIDJSON_STRING_(Boolean, 'b', 'o', 'o', 'l', 'e', 'a', 'n') + RAPIDJSON_STRING_(Object, 'o', 'b', 'j', 'e', 'c', 't') + RAPIDJSON_STRING_(Array, 'a', 'r', 'r', 'a', 'y') + RAPIDJSON_STRING_(String, 's', 't', 'r', 'i', 'n', 'g') + RAPIDJSON_STRING_(Number, 'n', 'u', 'm', 'b', 'e', 'r') + RAPIDJSON_STRING_(Integer, 'i', 'n', 't', 'e', 'g', 'e', 'r') + RAPIDJSON_STRING_(Type, 't', 'y', 'p', 'e') + RAPIDJSON_STRING_(Enum, 'e', 'n', 'u', 'm') + RAPIDJSON_STRING_(AllOf, 'a', 'l', 'l', 'O', 'f') + RAPIDJSON_STRING_(AnyOf, 'a', 'n', 'y', 'O', 'f') + RAPIDJSON_STRING_(OneOf, 'o', 'n', 'e', 'O', 'f') + RAPIDJSON_STRING_(Not, 'n', 'o', 't') + RAPIDJSON_STRING_(Properties, 'p', 'r', 'o', 'p', 'e', 'r', 't', 'i', 'e', 's') + RAPIDJSON_STRING_(Required, 'r', 'e', 'q', 'u', 'i', 'r', 'e', 'd') + RAPIDJSON_STRING_(Dependencies, 'd', 'e', 'p', 'e', 'n', 'd', 'e', 'n', 'c', 'i', 'e', 's') + RAPIDJSON_STRING_(PatternProperties, 'p', 'a', 't', 't', 'e', 'r', 'n', 'P', 'r', 'o', 'p', 'e', 'r', 't', 'i', 'e', 's') + RAPIDJSON_STRING_(AdditionalProperties, 'a', 'd', 'd', 'i', 't', 'i', 'o', 'n', 'a', 'l', 'P', 'r', 'o', 'p', 'e', 'r', 't', 'i', 'e', 's') + RAPIDJSON_STRING_(MinProperties, 'm', 'i', 'n', 'P', 'r', 'o', 'p', 'e', 'r', 't', 'i', 'e', 's') + RAPIDJSON_STRING_(MaxProperties, 'm', 'a', 'x', 'P', 'r', 'o', 'p', 'e', 'r', 't', 'i', 'e', 's') + RAPIDJSON_STRING_(Items, 'i', 't', 'e', 'm', 's') + RAPIDJSON_STRING_(MinItems, 'm', 'i', 'n', 'I', 't', 'e', 'm', 's') + RAPIDJSON_STRING_(MaxItems, 'm', 'a', 'x', 'I', 't', 'e', 'm', 's') + RAPIDJSON_STRING_(AdditionalItems, 'a', 'd', 'd', 'i', 't', 'i', 'o', 'n', 'a', 'l', 'I', 't', 'e', 'm', 's') + RAPIDJSON_STRING_(UniqueItems, 'u', 'n', 'i', 'q', 'u', 'e', 'I', 't', 'e', 'm', 's') + RAPIDJSON_STRING_(MinLength, 'm', 'i', 'n', 'L', 'e', 'n', 'g', 't', 'h') + RAPIDJSON_STRING_(MaxLength, 'm', 'a', 'x', 'L', 'e', 'n', 'g', 't', 'h') + RAPIDJSON_STRING_(Pattern, 'p', 'a', 't', 't', 'e', 'r', 'n') + RAPIDJSON_STRING_(Minimum, 'm', 'i', 'n', 'i', 'm', 'u', 'm') + RAPIDJSON_STRING_(Maximum, 'm', 'a', 'x', 'i', 'm', 'u', 'm') + RAPIDJSON_STRING_(ExclusiveMinimum, 'e', 'x', 'c', 'l', 'u', 's', 'i', 'v', 'e', 'M', 'i', 'n', 'i', 'm', 'u', 'm') + RAPIDJSON_STRING_(ExclusiveMaximum, 'e', 'x', 'c', 'l', 'u', 's', 'i', 'v', 'e', 'M', 'a', 'x', 'i', 'm', 'u', 'm') + RAPIDJSON_STRING_(MultipleOf, 'm', 'u', 'l', 't', 'i', 'p', 'l', 'e', 'O', 'f') + RAPIDJSON_STRING_(DefaultValue, 'd', 'e', 'f', 'a', 'u', 'l', 't') + +#undef RAPIDJSON_STRING_ + +private: + enum SchemaValueType { + kNullSchemaType, + kBooleanSchemaType, + kObjectSchemaType, + kArraySchemaType, + kStringSchemaType, + kNumberSchemaType, + kIntegerSchemaType, + kTotalSchemaType + }; + +#if RAPIDJSON_SCHEMA_USE_INTERNALREGEX + typedef internal::GenericRegex RegexType; +#elif RAPIDJSON_SCHEMA_USE_STDREGEX + typedef std::basic_regex RegexType; +#else + typedef char RegexType; +#endif + + struct SchemaArray { + SchemaArray() : schemas(), count() {} + ~SchemaArray() { AllocatorType::Free(schemas); } + const SchemaType** schemas; + SizeType begin; // begin index of context.validators + SizeType count; + }; + + template + void AddUniqueElement(V1& a, const V2& v) { + for (typename V1::ConstValueIterator itr = a.Begin(); itr != a.End(); ++itr) + if (*itr == v) + return; + V1 c(v, 
*allocator_); + a.PushBack(c, *allocator_); + } + + static const ValueType* GetMember(const ValueType& value, const ValueType& name) { + typename ValueType::ConstMemberIterator itr = value.FindMember(name); + return itr != value.MemberEnd() ? &(itr->value) : 0; + } + + static void AssignIfExist(bool& out, const ValueType& value, const ValueType& name) { + if (const ValueType* v = GetMember(value, name)) + if (v->IsBool()) + out = v->GetBool(); + } + + static void AssignIfExist(SizeType& out, const ValueType& value, const ValueType& name) { + if (const ValueType* v = GetMember(value, name)) + if (v->IsUint64() && v->GetUint64() <= SizeType(~0)) + out = static_cast(v->GetUint64()); + } + + void AssignIfExist(SchemaArray& out, SchemaDocumentType& schemaDocument, const PointerType& p, const ValueType& value, const ValueType& name, const ValueType& document) { + if (const ValueType* v = GetMember(value, name)) { + if (v->IsArray() && v->Size() > 0) { + PointerType q = p.Append(name, allocator_); + out.count = v->Size(); + out.schemas = static_cast(allocator_->Malloc(out.count * sizeof(const Schema*))); + memset(out.schemas, 0, sizeof(Schema*)* out.count); + for (SizeType i = 0; i < out.count; i++) + schemaDocument.CreateSchema(&out.schemas[i], q.Append(i, allocator_), (*v)[i], document); + out.begin = validatorCount_; + validatorCount_ += out.count; + } + } + } + +#if RAPIDJSON_SCHEMA_USE_INTERNALREGEX + template + RegexType* CreatePattern(const ValueType& value) { + if (value.IsString()) { + RegexType* r = new (allocator_->Malloc(sizeof(RegexType))) RegexType(value.GetString(), allocator_); + if (!r->IsValid()) { + r->~RegexType(); + AllocatorType::Free(r); + r = 0; + } + return r; + } + return 0; + } + + static bool IsPatternMatch(const RegexType* pattern, const Ch *str, SizeType) { + GenericRegexSearch rs(*pattern); + return rs.Search(str); + } +#elif RAPIDJSON_SCHEMA_USE_STDREGEX + template + RegexType* CreatePattern(const ValueType& value) { + if (value.IsString()) + RegexType *r = static_cast(allocator_->Malloc(sizeof(RegexType))); + try { + return new (r) RegexType(value.GetString(), std::size_t(value.GetStringLength()), std::regex_constants::ECMAScript); + } + catch (const std::regex_error&) { + AllocatorType::Free(r); + } + return 0; + } + + static bool IsPatternMatch(const RegexType* pattern, const Ch *str, SizeType length) { + std::match_results r; + return std::regex_search(str, str + length, r, *pattern); + } +#else + template + RegexType* CreatePattern(const ValueType&) { return 0; } + + static bool IsPatternMatch(const RegexType*, const Ch *, SizeType) { return true; } +#endif // RAPIDJSON_SCHEMA_USE_STDREGEX + + void AddType(const ValueType& type) { + if (type == GetNullString() ) type_ |= 1 << kNullSchemaType; + else if (type == GetBooleanString()) type_ |= 1 << kBooleanSchemaType; + else if (type == GetObjectString() ) type_ |= 1 << kObjectSchemaType; + else if (type == GetArrayString() ) type_ |= 1 << kArraySchemaType; + else if (type == GetStringString() ) type_ |= 1 << kStringSchemaType; + else if (type == GetIntegerString()) type_ |= 1 << kIntegerSchemaType; + else if (type == GetNumberString() ) type_ |= (1 << kNumberSchemaType) | (1 << kIntegerSchemaType); + } + + bool CreateParallelValidator(Context& context) const { + if (enum_ || context.arrayUniqueness) + context.hasher = context.factory.CreateHasher(); + + if (validatorCount_) { + RAPIDJSON_ASSERT(context.validators == 0); + context.validators = static_cast(context.factory.MallocState(sizeof(ISchemaValidator*) * 
validatorCount_)); + context.validatorCount = validatorCount_; + + if (allOf_.schemas) + CreateSchemaValidators(context, allOf_); + + if (anyOf_.schemas) + CreateSchemaValidators(context, anyOf_); + + if (oneOf_.schemas) + CreateSchemaValidators(context, oneOf_); + + if (not_) + context.validators[notValidatorIndex_] = context.factory.CreateSchemaValidator(*not_); + + if (hasSchemaDependencies_) { + for (SizeType i = 0; i < propertyCount_; i++) + if (properties_[i].dependenciesSchema) + context.validators[properties_[i].dependenciesValidatorIndex] = context.factory.CreateSchemaValidator(*properties_[i].dependenciesSchema); + } + } + + return true; + } + + void CreateSchemaValidators(Context& context, const SchemaArray& schemas) const { + for (SizeType i = 0; i < schemas.count; i++) + context.validators[schemas.begin + i] = context.factory.CreateSchemaValidator(*schemas.schemas[i]); + } + + // O(n) + bool FindPropertyIndex(const ValueType& name, SizeType* outIndex) const { + SizeType len = name.GetStringLength(); + const Ch* str = name.GetString(); + for (SizeType index = 0; index < propertyCount_; index++) + if (properties_[index].name.GetStringLength() == len && + (std::memcmp(properties_[index].name.GetString(), str, sizeof(Ch) * len) == 0)) + { + *outIndex = index; + return true; + } + return false; + } + + bool CheckInt(Context& context, int64_t i) const { + if (!(type_ & ((1 << kIntegerSchemaType) | (1 << kNumberSchemaType)))) { + DisallowedType(context, GetIntegerString()); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetTypeString()); + } + + if (!minimum_.IsNull()) { + if (minimum_.IsInt64()) { + if (exclusiveMinimum_ ? i <= minimum_.GetInt64() : i < minimum_.GetInt64()) { + context.error_handler.BelowMinimum(i, minimum_, exclusiveMinimum_); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetMinimumString()); + } + } + else if (minimum_.IsUint64()) { + context.error_handler.BelowMinimum(i, minimum_, exclusiveMinimum_); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetMinimumString()); // i <= max(int64_t) < minimum.GetUint64() + } + else if (!CheckDoubleMinimum(context, static_cast(i))) + return false; + } + + if (!maximum_.IsNull()) { + if (maximum_.IsInt64()) { + if (exclusiveMaximum_ ? i >= maximum_.GetInt64() : i > maximum_.GetInt64()) { + context.error_handler.AboveMaximum(i, maximum_, exclusiveMaximum_); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetMaximumString()); + } + } + else if (maximum_.IsUint64()) { } + /* do nothing */ // i <= max(int64_t) < maximum_.GetUint64() + else if (!CheckDoubleMaximum(context, static_cast(i))) + return false; + } + + if (!multipleOf_.IsNull()) { + if (multipleOf_.IsUint64()) { + if (static_cast(i >= 0 ? i : -i) % multipleOf_.GetUint64() != 0) { + context.error_handler.NotMultipleOf(i, multipleOf_); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetMultipleOfString()); + } + } + else if (!CheckDoubleMultipleOf(context, static_cast(i))) + return false; + } + + return true; + } + + bool CheckUint(Context& context, uint64_t i) const { + if (!(type_ & ((1 << kIntegerSchemaType) | (1 << kNumberSchemaType)))) { + DisallowedType(context, GetIntegerString()); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetTypeString()); + } + + if (!minimum_.IsNull()) { + if (minimum_.IsUint64()) { + if (exclusiveMinimum_ ? 
i <= minimum_.GetUint64() : i < minimum_.GetUint64()) { + context.error_handler.BelowMinimum(i, minimum_, exclusiveMinimum_); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetMinimumString()); + } + } + else if (minimum_.IsInt64()) + /* do nothing */; // i >= 0 > minimum.Getint64() + else if (!CheckDoubleMinimum(context, static_cast(i))) + return false; + } + + if (!maximum_.IsNull()) { + if (maximum_.IsUint64()) { + if (exclusiveMaximum_ ? i >= maximum_.GetUint64() : i > maximum_.GetUint64()) { + context.error_handler.AboveMaximum(i, maximum_, exclusiveMaximum_); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetMaximumString()); + } + } + else if (maximum_.IsInt64()) { + context.error_handler.AboveMaximum(i, maximum_, exclusiveMaximum_); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetMaximumString()); // i >= 0 > maximum_ + } + else if (!CheckDoubleMaximum(context, static_cast(i))) + return false; + } + + if (!multipleOf_.IsNull()) { + if (multipleOf_.IsUint64()) { + if (i % multipleOf_.GetUint64() != 0) { + context.error_handler.NotMultipleOf(i, multipleOf_); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetMultipleOfString()); + } + } + else if (!CheckDoubleMultipleOf(context, static_cast(i))) + return false; + } + + return true; + } + + bool CheckDoubleMinimum(Context& context, double d) const { + if (exclusiveMinimum_ ? d <= minimum_.GetDouble() : d < minimum_.GetDouble()) { + context.error_handler.BelowMinimum(d, minimum_, exclusiveMinimum_); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetMinimumString()); + } + return true; + } + + bool CheckDoubleMaximum(Context& context, double d) const { + if (exclusiveMaximum_ ? d >= maximum_.GetDouble() : d > maximum_.GetDouble()) { + context.error_handler.AboveMaximum(d, maximum_, exclusiveMaximum_); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetMaximumString()); + } + return true; + } + + bool CheckDoubleMultipleOf(Context& context, double d) const { + double a = std::abs(d), b = std::abs(multipleOf_.GetDouble()); + double q = std::floor(a / b); + double r = a - q * b; + if (r > 0.0) { + context.error_handler.NotMultipleOf(d, multipleOf_); + RAPIDJSON_INVALID_KEYWORD_RETURN(GetMultipleOfString()); + } + return true; + } + + void DisallowedType(Context& context, const ValueType& actualType) const { + ErrorHandler& eh = context.error_handler; + eh.StartDisallowedType(); + + if (type_ & (1 << kNullSchemaType)) eh.AddExpectedType(GetNullString()); + if (type_ & (1 << kBooleanSchemaType)) eh.AddExpectedType(GetBooleanString()); + if (type_ & (1 << kObjectSchemaType)) eh.AddExpectedType(GetObjectString()); + if (type_ & (1 << kArraySchemaType)) eh.AddExpectedType(GetArrayString()); + if (type_ & (1 << kStringSchemaType)) eh.AddExpectedType(GetStringString()); + + if (type_ & (1 << kNumberSchemaType)) eh.AddExpectedType(GetNumberString()); + else if (type_ & (1 << kIntegerSchemaType)) eh.AddExpectedType(GetIntegerString()); + + eh.EndDisallowedType(actualType); + } + + struct Property { + Property() : schema(), dependenciesSchema(), dependenciesValidatorIndex(), dependencies(), required(false) {} + ~Property() { AllocatorType::Free(dependencies); } + SValue name; + const SchemaType* schema; + const SchemaType* dependenciesSchema; + SizeType dependenciesValidatorIndex; + bool* dependencies; + bool required; + }; + + struct PatternProperty { + PatternProperty() : schema(), pattern() {} + ~PatternProperty() { + if (pattern) { + pattern->~RegexType(); + AllocatorType::Free(pattern); + } + } + const SchemaType* schema; + RegexType* pattern; + }; + + AllocatorType* allocator_; + SValue uri_; + PointerType 
pointer_;
+    const SchemaType* typeless_;
+    uint64_t* enum_;
+    SizeType enumCount_;
+    SchemaArray allOf_;
+    SchemaArray anyOf_;
+    SchemaArray oneOf_;
+    const SchemaType* not_;
+    unsigned type_; // bitmask of kSchemaType
+    SizeType validatorCount_;
+    SizeType notValidatorIndex_;
+
+    Property* properties_;
+    const SchemaType* additionalPropertiesSchema_;
+    PatternProperty* patternProperties_;
+    SizeType patternPropertyCount_;
+    SizeType propertyCount_;
+    SizeType minProperties_;
+    SizeType maxProperties_;
+    bool additionalProperties_;
+    bool hasDependencies_;
+    bool hasRequired_;
+    bool hasSchemaDependencies_;
+
+    const SchemaType* additionalItemsSchema_;
+    const SchemaType* itemsList_;
+    const SchemaType** itemsTuple_;
+    SizeType itemsTupleCount_;
+    SizeType minItems_;
+    SizeType maxItems_;
+    bool additionalItems_;
+    bool uniqueItems_;
+
+    RegexType* pattern_;
+    SizeType minLength_;
+    SizeType maxLength_;
+
+    SValue minimum_;
+    SValue maximum_;
+    SValue multipleOf_;
+    bool exclusiveMinimum_;
+    bool exclusiveMaximum_;
+
+    SizeType defaultValueLength_;
+};
+
+template<typename Stack, typename Ch>
+struct TokenHelper {
+    RAPIDJSON_FORCEINLINE static void AppendIndexToken(Stack& documentStack, SizeType index) {
+        *documentStack.template Push<Ch>() = '/';
+        char buffer[21];
+        size_t length = static_cast<size_t>((sizeof(SizeType) == 4 ? u32toa(index, buffer) : u64toa(index, buffer)) - buffer);
+        for (size_t i = 0; i < length; i++)
+            *documentStack.template Push<Ch>() = static_cast<Ch>(buffer[i]);
+    }
+};
+
+// Partial specialized version for char to prevent buffer copying.
+template <typename Stack>
+struct TokenHelper<Stack, char> {
+    RAPIDJSON_FORCEINLINE static void AppendIndexToken(Stack& documentStack, SizeType index) {
+        if (sizeof(SizeType) == 4) {
+            char *buffer = documentStack.template Push<char>(1 + 10); // '/' + uint
+            *buffer++ = '/';
+            const char* end = internal::u32toa(index, buffer);
+            documentStack.template Pop<char>(static_cast<size_t>(10 - (end - buffer)));
+        }
+        else {
+            char *buffer = documentStack.template Push<char>(1 + 20); // '/' + uint64
+            *buffer++ = '/';
+            const char* end = internal::u64toa(index, buffer);
+            documentStack.template Pop<char>(static_cast<size_t>(20 - (end - buffer)));
+        }
+    }
+};
+
+} // namespace internal
+
+///////////////////////////////////////////////////////////////////////////////
+// IGenericRemoteSchemaDocumentProvider
+
+template <typename SchemaDocumentType>
+class IGenericRemoteSchemaDocumentProvider {
+public:
+    typedef typename SchemaDocumentType::Ch Ch;
+
+    virtual ~IGenericRemoteSchemaDocumentProvider() {}
+    virtual const SchemaDocumentType* GetRemoteDocument(const Ch* uri, SizeType length) = 0;
+};
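+
+// Illustrative usage sketch (not part of the upstream header): compile a schema
+// once with SchemaDocument, then validate any number of documents against it
+// (SchemaDocument and SchemaValidator are typedefs declared further below):
+#if 0
+Document sd;
+sd.Parse("{\"type\":\"object\",\"required\":[\"id\"]}");
+SchemaDocument schema(sd);                   // sd can be discarded after this
+
+Document d;
+d.Parse(inputJson);                          // inputJson: the document to validate (assumed)
+SchemaValidator validator(schema);
+if (!d.Accept(validator)) {
+    StringBuffer sb;                         // report which subschema/keyword failed
+    validator.GetInvalidSchemaPointer().StringifyUriFragment(sb);
+}
+#endif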
+
+///////////////////////////////////////////////////////////////////////////////
+// GenericSchemaDocument
+
+//! JSON schema document.
+/*!
+    A JSON schema document is a compiled version of a JSON schema.
+    It is basically a tree of internal::Schema.
+
+    \note This is an immutable class (i.e. its instance cannot be modified after construction).
+    \tparam ValueT Type of JSON value (e.g. \c Value ), which also determine the encoding.
+    \tparam Allocator Allocator type for allocating memory of this document.
+*/
+template <typename ValueT, typename Allocator = CrtAllocator>
+class GenericSchemaDocument {
+public:
+    typedef ValueT ValueType;
+    typedef IGenericRemoteSchemaDocumentProvider<GenericSchemaDocument> IRemoteSchemaDocumentProviderType;
+    typedef Allocator AllocatorType;
+    typedef typename ValueType::EncodingType EncodingType;
+    typedef typename EncodingType::Ch Ch;
+    typedef internal::Schema<GenericSchemaDocument> SchemaType;
+    typedef GenericPointer<ValueType, Allocator> PointerType;
+    typedef GenericValue<EncodingType, Allocator> URIType;
+    friend class internal::Schema<GenericSchemaDocument>;
+    template <typename, typename, typename>
+    friend class GenericSchemaValidator;
+
+    //! Constructor.
+    /*!
+        Compile a JSON document into schema document.
+
+        \param document A JSON document as source.
+        \param uri The base URI of this schema document for purposes of violation reporting.
+        \param uriLength Length of \c uri, in code points.
+        \param remoteProvider An optional remote schema document provider for resolving remote reference. Can be null.
+        \param allocator An optional allocator instance for allocating memory. Can be null.
+    */
+    explicit GenericSchemaDocument(const ValueType& document, const Ch* uri = 0, SizeType uriLength = 0,
+        IRemoteSchemaDocumentProviderType* remoteProvider = 0, Allocator* allocator = 0) :
+        remoteProvider_(remoteProvider),
+        allocator_(allocator),
+        ownAllocator_(),
+        root_(),
+        typeless_(),
+        schemaMap_(allocator, kInitialSchemaMapSize),
+        schemaRef_(allocator, kInitialSchemaRefSize)
+    {
+        if (!allocator_)
+            ownAllocator_ = allocator_ = RAPIDJSON_NEW(Allocator)();
+
+        Ch noUri[1] = {0};
+        uri_.SetString(uri ? uri : noUri, uriLength, *allocator_);
+
+        typeless_ = static_cast<SchemaType*>(allocator_->Malloc(sizeof(SchemaType)));
+        new (typeless_) SchemaType(this, PointerType(), ValueType(kObjectType).Move(), ValueType(kObjectType).Move(), allocator_);
+
+        // Generate the root schema; it calls CreateSchema() to create sub-schemas
+        // and AddRefSchema() whenever a $ref is encountered.
+        CreateSchemaRecursive(&root_, PointerType(), document, document);
+
+        // Resolve $ref
+        while (!schemaRef_.Empty()) {
+            SchemaRefEntry* refEntry = schemaRef_.template Pop<SchemaRefEntry>(1);
+            if (const SchemaType* s = GetSchema(refEntry->target)) {
+                if (refEntry->schema)
+                    *refEntry->schema = s;
+
+                // Create entry in map if not exist
+                if (!GetSchema(refEntry->source)) {
+                    new (schemaMap_.template Push<SchemaEntry>()) SchemaEntry(refEntry->source, const_cast<SchemaType*>(s), false, allocator_);
+                }
+            }
+            else if (refEntry->schema)
+                *refEntry->schema = typeless_;
+
+            refEntry->~SchemaRefEntry();
+        }
+
+        RAPIDJSON_ASSERT(root_ != 0);
+
+        schemaRef_.ShrinkToFit(); // Deallocate all memory for ref
+    }
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    //! Move constructor in C++11
+    GenericSchemaDocument(GenericSchemaDocument&& rhs) RAPIDJSON_NOEXCEPT :
+        remoteProvider_(rhs.remoteProvider_),
+        allocator_(rhs.allocator_),
+        ownAllocator_(rhs.ownAllocator_),
+        root_(rhs.root_),
+        typeless_(rhs.typeless_),
+        schemaMap_(std::move(rhs.schemaMap_)),
+        schemaRef_(std::move(rhs.schemaRef_)),
+        uri_(std::move(rhs.uri_))
+    {
+        rhs.remoteProvider_ = 0;
+        rhs.allocator_ = 0;
+        rhs.ownAllocator_ = 0;
+        rhs.typeless_ = 0;
+    }
+#endif
+
+    //! Destructor
+    ~GenericSchemaDocument() {
+        while (!schemaMap_.Empty())
+            schemaMap_.template Pop<SchemaEntry>(1)->~SchemaEntry();
+
+        if (typeless_) {
+            typeless_->~SchemaType();
+            Allocator::Free(typeless_);
+        }
+
+        RAPIDJSON_DELETE(ownAllocator_);
+    }
+
+    const URIType& GetURI() const { return uri_; }
+
+    //! Get the root schema.
+    const SchemaType& GetRoot() const { return *root_; }
+
+private:
+    //! Prohibit copying
+    GenericSchemaDocument(const GenericSchemaDocument&);
+    //!
Prohibit assignment + GenericSchemaDocument& operator=(const GenericSchemaDocument&); + + struct SchemaRefEntry { + SchemaRefEntry(const PointerType& s, const PointerType& t, const SchemaType** outSchema, Allocator *allocator) : source(s, allocator), target(t, allocator), schema(outSchema) {} + PointerType source; + PointerType target; + const SchemaType** schema; + }; + + struct SchemaEntry { + SchemaEntry(const PointerType& p, SchemaType* s, bool o, Allocator* allocator) : pointer(p, allocator), schema(s), owned(o) {} + ~SchemaEntry() { + if (owned) { + schema->~SchemaType(); + Allocator::Free(schema); + } + } + PointerType pointer; + SchemaType* schema; + bool owned; + }; + + void CreateSchemaRecursive(const SchemaType** schema, const PointerType& pointer, const ValueType& v, const ValueType& document) { + if (schema) + *schema = typeless_; + + if (v.GetType() == kObjectType) { + const SchemaType* s = GetSchema(pointer); + if (!s) + CreateSchema(schema, pointer, v, document); + + for (typename ValueType::ConstMemberIterator itr = v.MemberBegin(); itr != v.MemberEnd(); ++itr) + CreateSchemaRecursive(0, pointer.Append(itr->name, allocator_), itr->value, document); + } + else if (v.GetType() == kArrayType) + for (SizeType i = 0; i < v.Size(); i++) + CreateSchemaRecursive(0, pointer.Append(i, allocator_), v[i], document); + } + + void CreateSchema(const SchemaType** schema, const PointerType& pointer, const ValueType& v, const ValueType& document) { + RAPIDJSON_ASSERT(pointer.IsValid()); + if (v.IsObject()) { + if (!HandleRefSchema(pointer, schema, v, document)) { + SchemaType* s = new (allocator_->Malloc(sizeof(SchemaType))) SchemaType(this, pointer, v, document, allocator_); + new (schemaMap_.template Push()) SchemaEntry(pointer, s, true, allocator_); + if (schema) + *schema = s; + } + } + } + + bool HandleRefSchema(const PointerType& source, const SchemaType** schema, const ValueType& v, const ValueType& document) { + static const Ch kRefString[] = { '$', 'r', 'e', 'f', '\0' }; + static const ValueType kRefValue(kRefString, 4); + + typename ValueType::ConstMemberIterator itr = v.FindMember(kRefValue); + if (itr == v.MemberEnd()) + return false; + + if (itr->value.IsString()) { + SizeType len = itr->value.GetStringLength(); + if (len > 0) { + const Ch* s = itr->value.GetString(); + SizeType i = 0; + while (i < len && s[i] != '#') // Find the first # + i++; + + if (i > 0) { // Remote reference, resolve immediately + if (remoteProvider_) { + if (const GenericSchemaDocument* remoteDocument = remoteProvider_->GetRemoteDocument(s, i)) { + PointerType pointer(&s[i], len - i, allocator_); + if (pointer.IsValid()) { + if (const SchemaType* sc = remoteDocument->GetSchema(pointer)) { + if (schema) + *schema = sc; + new (schemaMap_.template Push()) SchemaEntry(source, const_cast(sc), false, allocator_); + return true; + } + } + } + } + } + else if (s[i] == '#') { // Local reference, defer resolution + PointerType pointer(&s[i], len - i, allocator_); + if (pointer.IsValid()) { + if (const ValueType* nv = pointer.Get(document)) + if (HandleRefSchema(source, schema, *nv, document)) + return true; + + new (schemaRef_.template Push()) SchemaRefEntry(source, pointer, schema, allocator_); + return true; + } + } + } + } + return false; + } + + const SchemaType* GetSchema(const PointerType& pointer) const { + for (const SchemaEntry* target = schemaMap_.template Bottom(); target != schemaMap_.template End(); ++target) + if (pointer == target->pointer) + return target->schema; + return 0; + } + + PointerType 
GetPointer(const SchemaType* schema) const { + for (const SchemaEntry* target = schemaMap_.template Bottom(); target != schemaMap_.template End(); ++target) + if (schema == target->schema) + return target->pointer; + return PointerType(); + } + + const SchemaType* GetTypeless() const { return typeless_; } + + static const size_t kInitialSchemaMapSize = 64; + static const size_t kInitialSchemaRefSize = 64; + + IRemoteSchemaDocumentProviderType* remoteProvider_; + Allocator *allocator_; + Allocator *ownAllocator_; + const SchemaType* root_; //!< Root schema. + SchemaType* typeless_; + internal::Stack schemaMap_; // Stores created Pointer -> Schemas + internal::Stack schemaRef_; // Stores Pointer from $ref and schema which holds the $ref + URIType uri_; +}; + +//! GenericSchemaDocument using Value type. +typedef GenericSchemaDocument SchemaDocument; +//! IGenericRemoteSchemaDocumentProvider using SchemaDocument. +typedef IGenericRemoteSchemaDocumentProvider IRemoteSchemaDocumentProvider; + +/////////////////////////////////////////////////////////////////////////////// +// GenericSchemaValidator + +//! JSON Schema Validator. +/*! + A SAX style JSON schema validator. + It uses a \c GenericSchemaDocument to validate SAX events. + It delegates the incoming SAX events to an output handler. + The default output handler does nothing. + It can be reused multiple times by calling \c Reset(). + + \tparam SchemaDocumentType Type of schema document. + \tparam OutputHandler Type of output handler. Default handler does nothing. + \tparam StateAllocator Allocator for storing the internal validation states. +*/ +template < + typename SchemaDocumentType, + typename OutputHandler = BaseReaderHandler, + typename StateAllocator = CrtAllocator> +class GenericSchemaValidator : + public internal::ISchemaStateFactory, + public internal::ISchemaValidator, + public internal::IValidationErrorHandler +{ +public: + typedef typename SchemaDocumentType::SchemaType SchemaType; + typedef typename SchemaDocumentType::PointerType PointerType; + typedef typename SchemaType::EncodingType EncodingType; + typedef typename SchemaType::SValue SValue; + typedef typename EncodingType::Ch Ch; + typedef GenericStringRef StringRefType; + typedef GenericValue ValueType; + + //! Constructor without output handler. + /*! + \param schemaDocument The schema document to conform to. + \param allocator Optional allocator for storing internal validation states. + \param schemaStackCapacity Optional initial capacity of schema path stack. + \param documentStackCapacity Optional initial capacity of document path stack. + */ + GenericSchemaValidator( + const SchemaDocumentType& schemaDocument, + StateAllocator* allocator = 0, + size_t schemaStackCapacity = kDefaultSchemaStackCapacity, + size_t documentStackCapacity = kDefaultDocumentStackCapacity) + : + schemaDocument_(&schemaDocument), + root_(schemaDocument.GetRoot()), + stateAllocator_(allocator), + ownStateAllocator_(0), + schemaStack_(allocator, schemaStackCapacity), + documentStack_(allocator, documentStackCapacity), + outputHandler_(0), + error_(kObjectType), + currentError_(), + missingDependents_(), + valid_(true) +#if RAPIDJSON_SCHEMA_VERBOSE + , depth_(0) +#endif + { + } + + //! Constructor with output handler. + /*! + \param schemaDocument The schema document to conform to. + \param allocator Optional allocator for storing internal validation states. + \param schemaStackCapacity Optional initial capacity of schema path stack. 
+ \param documentStackCapacity Optional initial capacity of document path stack. + */ + GenericSchemaValidator( + const SchemaDocumentType& schemaDocument, + OutputHandler& outputHandler, + StateAllocator* allocator = 0, + size_t schemaStackCapacity = kDefaultSchemaStackCapacity, + size_t documentStackCapacity = kDefaultDocumentStackCapacity) + : + schemaDocument_(&schemaDocument), + root_(schemaDocument.GetRoot()), + stateAllocator_(allocator), + ownStateAllocator_(0), + schemaStack_(allocator, schemaStackCapacity), + documentStack_(allocator, documentStackCapacity), + outputHandler_(&outputHandler), + error_(kObjectType), + currentError_(), + missingDependents_(), + valid_(true) +#if RAPIDJSON_SCHEMA_VERBOSE + , depth_(0) +#endif + { + } + + //! Destructor. + ~GenericSchemaValidator() { + Reset(); + RAPIDJSON_DELETE(ownStateAllocator_); + } + + //! Reset the internal states. + void Reset() { + while (!schemaStack_.Empty()) + PopSchema(); + documentStack_.Clear(); + error_.SetObject(); + currentError_.SetNull(); + missingDependents_.SetNull(); + valid_ = true; + } + + //! Checks whether the current state is valid. + // Implementation of ISchemaValidator + virtual bool IsValid() const { return valid_; } + + //! Gets the error object. + ValueType& GetError() { return error_; } + const ValueType& GetError() const { return error_; } + + //! Gets the JSON pointer pointed to the invalid schema. + PointerType GetInvalidSchemaPointer() const { + return schemaStack_.Empty() ? PointerType() : CurrentSchema().GetPointer(); + } + + //! Gets the keyword of invalid schema. + const Ch* GetInvalidSchemaKeyword() const { + return schemaStack_.Empty() ? 0 : CurrentContext().invalidKeyword; + } + + //! Gets the JSON pointer pointed to the invalid value. + PointerType GetInvalidDocumentPointer() const { + if (documentStack_.Empty()) { + return PointerType(); + } + else { + return PointerType(documentStack_.template Bottom(), documentStack_.GetSize() / sizeof(Ch)); + } + } + + void NotMultipleOf(int64_t actual, const SValue& expected) { + AddNumberError(SchemaType::GetMultipleOfString(), ValueType(actual).Move(), expected); + } + void NotMultipleOf(uint64_t actual, const SValue& expected) { + AddNumberError(SchemaType::GetMultipleOfString(), ValueType(actual).Move(), expected); + } + void NotMultipleOf(double actual, const SValue& expected) { + AddNumberError(SchemaType::GetMultipleOfString(), ValueType(actual).Move(), expected); + } + void AboveMaximum(int64_t actual, const SValue& expected, bool exclusive) { + AddNumberError(SchemaType::GetMaximumString(), ValueType(actual).Move(), expected, + exclusive ? &SchemaType::GetExclusiveMaximumString : 0); + } + void AboveMaximum(uint64_t actual, const SValue& expected, bool exclusive) { + AddNumberError(SchemaType::GetMaximumString(), ValueType(actual).Move(), expected, + exclusive ? &SchemaType::GetExclusiveMaximumString : 0); + } + void AboveMaximum(double actual, const SValue& expected, bool exclusive) { + AddNumberError(SchemaType::GetMaximumString(), ValueType(actual).Move(), expected, + exclusive ? &SchemaType::GetExclusiveMaximumString : 0); + } + void BelowMinimum(int64_t actual, const SValue& expected, bool exclusive) { + AddNumberError(SchemaType::GetMinimumString(), ValueType(actual).Move(), expected, + exclusive ? &SchemaType::GetExclusiveMinimumString : 0); + } + void BelowMinimum(uint64_t actual, const SValue& expected, bool exclusive) { + AddNumberError(SchemaType::GetMinimumString(), ValueType(actual).Move(), expected, + exclusive ? 
&SchemaType::GetExclusiveMinimumString : 0); + } + void BelowMinimum(double actual, const SValue& expected, bool exclusive) { + AddNumberError(SchemaType::GetMinimumString(), ValueType(actual).Move(), expected, + exclusive ? &SchemaType::GetExclusiveMinimumString : 0); + } + + void TooLong(const Ch* str, SizeType length, SizeType expected) { + AddNumberError(SchemaType::GetMaxLengthString(), + ValueType(str, length, GetStateAllocator()).Move(), SValue(expected).Move()); + } + void TooShort(const Ch* str, SizeType length, SizeType expected) { + AddNumberError(SchemaType::GetMinLengthString(), + ValueType(str, length, GetStateAllocator()).Move(), SValue(expected).Move()); + } + void DoesNotMatch(const Ch* str, SizeType length) { + currentError_.SetObject(); + currentError_.AddMember(GetActualString(), ValueType(str, length, GetStateAllocator()).Move(), GetStateAllocator()); + AddCurrentError(SchemaType::GetPatternString()); + } + + void DisallowedItem(SizeType index) { + currentError_.SetObject(); + currentError_.AddMember(GetDisallowedString(), ValueType(index).Move(), GetStateAllocator()); + AddCurrentError(SchemaType::GetAdditionalItemsString(), true); + } + void TooFewItems(SizeType actualCount, SizeType expectedCount) { + AddNumberError(SchemaType::GetMinItemsString(), + ValueType(actualCount).Move(), SValue(expectedCount).Move()); + } + void TooManyItems(SizeType actualCount, SizeType expectedCount) { + AddNumberError(SchemaType::GetMaxItemsString(), + ValueType(actualCount).Move(), SValue(expectedCount).Move()); + } + void DuplicateItems(SizeType index1, SizeType index2) { + ValueType duplicates(kArrayType); + duplicates.PushBack(index1, GetStateAllocator()); + duplicates.PushBack(index2, GetStateAllocator()); + currentError_.SetObject(); + currentError_.AddMember(GetDuplicatesString(), duplicates, GetStateAllocator()); + AddCurrentError(SchemaType::GetUniqueItemsString(), true); + } + + void TooManyProperties(SizeType actualCount, SizeType expectedCount) { + AddNumberError(SchemaType::GetMaxPropertiesString(), + ValueType(actualCount).Move(), SValue(expectedCount).Move()); + } + void TooFewProperties(SizeType actualCount, SizeType expectedCount) { + AddNumberError(SchemaType::GetMinPropertiesString(), + ValueType(actualCount).Move(), SValue(expectedCount).Move()); + } + void StartMissingProperties() { + currentError_.SetArray(); + } + void AddMissingProperty(const SValue& name) { + currentError_.PushBack(ValueType(name, GetStateAllocator()).Move(), GetStateAllocator()); + } + bool EndMissingProperties() { + if (currentError_.Empty()) + return false; + ValueType error(kObjectType); + error.AddMember(GetMissingString(), currentError_, GetStateAllocator()); + currentError_ = error; + AddCurrentError(SchemaType::GetRequiredString()); + return true; + } + void PropertyViolations(ISchemaValidator** subvalidators, SizeType count) { + for (SizeType i = 0; i < count; ++i) + MergeError(static_cast(subvalidators[i])->GetError()); + } + void DisallowedProperty(const Ch* name, SizeType length) { + currentError_.SetObject(); + currentError_.AddMember(GetDisallowedString(), ValueType(name, length, GetStateAllocator()).Move(), GetStateAllocator()); + AddCurrentError(SchemaType::GetAdditionalPropertiesString(), true); + } + + void StartDependencyErrors() { + currentError_.SetObject(); + } + void StartMissingDependentProperties() { + missingDependents_.SetArray(); + } + void AddMissingDependentProperty(const SValue& targetName) { + missingDependents_.PushBack(ValueType(targetName, 
GetStateAllocator()).Move(), GetStateAllocator()); + } + void EndMissingDependentProperties(const SValue& sourceName) { + if (!missingDependents_.Empty()) + currentError_.AddMember(ValueType(sourceName, GetStateAllocator()).Move(), + missingDependents_, GetStateAllocator()); + } + void AddDependencySchemaError(const SValue& sourceName, ISchemaValidator* subvalidator) { + currentError_.AddMember(ValueType(sourceName, GetStateAllocator()).Move(), + static_cast(subvalidator)->GetError(), GetStateAllocator()); + } + bool EndDependencyErrors() { + if (currentError_.ObjectEmpty()) + return false; + ValueType error(kObjectType); + error.AddMember(GetErrorsString(), currentError_, GetStateAllocator()); + currentError_ = error; + AddCurrentError(SchemaType::GetDependenciesString()); + return true; + } + + void DisallowedValue() { + currentError_.SetObject(); + AddCurrentError(SchemaType::GetEnumString()); + } + void StartDisallowedType() { + currentError_.SetArray(); + } + void AddExpectedType(const typename SchemaType::ValueType& expectedType) { + currentError_.PushBack(ValueType(expectedType, GetStateAllocator()).Move(), GetStateAllocator()); + } + void EndDisallowedType(const typename SchemaType::ValueType& actualType) { + ValueType error(kObjectType); + error.AddMember(GetExpectedString(), currentError_, GetStateAllocator()); + error.AddMember(GetActualString(), ValueType(actualType, GetStateAllocator()).Move(), GetStateAllocator()); + currentError_ = error; + AddCurrentError(SchemaType::GetTypeString()); + } + void NotAllOf(ISchemaValidator** subvalidators, SizeType count) { + for (SizeType i = 0; i < count; ++i) { + MergeError(static_cast(subvalidators[i])->GetError()); + } + } + void NoneOf(ISchemaValidator** subvalidators, SizeType count) { + AddErrorArray(SchemaType::GetAnyOfString(), subvalidators, count); + } + void NotOneOf(ISchemaValidator** subvalidators, SizeType count) { + AddErrorArray(SchemaType::GetOneOfString(), subvalidators, count); + } + void Disallowed() { + currentError_.SetObject(); + AddCurrentError(SchemaType::GetNotString()); + } + +#define RAPIDJSON_STRING_(name, ...) 
\ + static const StringRefType& Get##name##String() {\ + static const Ch s[] = { __VA_ARGS__, '\0' };\ + static const StringRefType v(s, static_cast(sizeof(s) / sizeof(Ch) - 1)); \ + return v;\ + } + + RAPIDJSON_STRING_(InstanceRef, 'i', 'n', 's', 't', 'a', 'n', 'c', 'e', 'R', 'e', 'f') + RAPIDJSON_STRING_(SchemaRef, 's', 'c', 'h', 'e', 'm', 'a', 'R', 'e', 'f') + RAPIDJSON_STRING_(Expected, 'e', 'x', 'p', 'e', 'c', 't', 'e', 'd') + RAPIDJSON_STRING_(Actual, 'a', 'c', 't', 'u', 'a', 'l') + RAPIDJSON_STRING_(Disallowed, 'd', 'i', 's', 'a', 'l', 'l', 'o', 'w', 'e', 'd') + RAPIDJSON_STRING_(Missing, 'm', 'i', 's', 's', 'i', 'n', 'g') + RAPIDJSON_STRING_(Errors, 'e', 'r', 'r', 'o', 'r', 's') + RAPIDJSON_STRING_(Duplicates, 'd', 'u', 'p', 'l', 'i', 'c', 'a', 't', 'e', 's') + +#undef RAPIDJSON_STRING_ + +#if RAPIDJSON_SCHEMA_VERBOSE +#define RAPIDJSON_SCHEMA_HANDLE_BEGIN_VERBOSE_() \ +RAPIDJSON_MULTILINEMACRO_BEGIN\ + *documentStack_.template Push() = '\0';\ + documentStack_.template Pop(1);\ + internal::PrintInvalidDocument(documentStack_.template Bottom());\ +RAPIDJSON_MULTILINEMACRO_END +#else +#define RAPIDJSON_SCHEMA_HANDLE_BEGIN_VERBOSE_() +#endif + +#define RAPIDJSON_SCHEMA_HANDLE_BEGIN_(method, arg1)\ + if (!valid_) return false; \ + if (!BeginValue() || !CurrentSchema().method arg1) {\ + RAPIDJSON_SCHEMA_HANDLE_BEGIN_VERBOSE_();\ + return valid_ = false;\ + } + +#define RAPIDJSON_SCHEMA_HANDLE_PARALLEL_(method, arg2)\ + for (Context* context = schemaStack_.template Bottom(); context != schemaStack_.template End(); context++) {\ + if (context->hasher)\ + static_cast(context->hasher)->method arg2;\ + if (context->validators)\ + for (SizeType i_ = 0; i_ < context->validatorCount; i_++)\ + static_cast(context->validators[i_])->method arg2;\ + if (context->patternPropertiesValidators)\ + for (SizeType i_ = 0; i_ < context->patternPropertiesValidatorCount; i_++)\ + static_cast(context->patternPropertiesValidators[i_])->method arg2;\ + } + +#define RAPIDJSON_SCHEMA_HANDLE_END_(method, arg2)\ + return valid_ = EndValue() && (!outputHandler_ || outputHandler_->method arg2) + +#define RAPIDJSON_SCHEMA_HANDLE_VALUE_(method, arg1, arg2) \ + RAPIDJSON_SCHEMA_HANDLE_BEGIN_ (method, arg1);\ + RAPIDJSON_SCHEMA_HANDLE_PARALLEL_(method, arg2);\ + RAPIDJSON_SCHEMA_HANDLE_END_ (method, arg2) + + bool Null() { RAPIDJSON_SCHEMA_HANDLE_VALUE_(Null, (CurrentContext()), ( )); } + bool Bool(bool b) { RAPIDJSON_SCHEMA_HANDLE_VALUE_(Bool, (CurrentContext(), b), (b)); } + bool Int(int i) { RAPIDJSON_SCHEMA_HANDLE_VALUE_(Int, (CurrentContext(), i), (i)); } + bool Uint(unsigned u) { RAPIDJSON_SCHEMA_HANDLE_VALUE_(Uint, (CurrentContext(), u), (u)); } + bool Int64(int64_t i) { RAPIDJSON_SCHEMA_HANDLE_VALUE_(Int64, (CurrentContext(), i), (i)); } + bool Uint64(uint64_t u) { RAPIDJSON_SCHEMA_HANDLE_VALUE_(Uint64, (CurrentContext(), u), (u)); } + bool Double(double d) { RAPIDJSON_SCHEMA_HANDLE_VALUE_(Double, (CurrentContext(), d), (d)); } + bool RawNumber(const Ch* str, SizeType length, bool copy) + { RAPIDJSON_SCHEMA_HANDLE_VALUE_(String, (CurrentContext(), str, length, copy), (str, length, copy)); } + bool String(const Ch* str, SizeType length, bool copy) + { RAPIDJSON_SCHEMA_HANDLE_VALUE_(String, (CurrentContext(), str, length, copy), (str, length, copy)); } + + bool StartObject() { + RAPIDJSON_SCHEMA_HANDLE_BEGIN_(StartObject, (CurrentContext())); + RAPIDJSON_SCHEMA_HANDLE_PARALLEL_(StartObject, ()); + return valid_ = !outputHandler_ || outputHandler_->StartObject(); + } + + bool Key(const Ch* str, SizeType len, bool 
copy) { + if (!valid_) return false; + AppendToken(str, len); + if (!CurrentSchema().Key(CurrentContext(), str, len, copy)) return valid_ = false; + RAPIDJSON_SCHEMA_HANDLE_PARALLEL_(Key, (str, len, copy)); + return valid_ = !outputHandler_ || outputHandler_->Key(str, len, copy); + } + + bool EndObject(SizeType memberCount) { + if (!valid_) return false; + RAPIDJSON_SCHEMA_HANDLE_PARALLEL_(EndObject, (memberCount)); + if (!CurrentSchema().EndObject(CurrentContext(), memberCount)) return valid_ = false; + RAPIDJSON_SCHEMA_HANDLE_END_(EndObject, (memberCount)); + } + + bool StartArray() { + RAPIDJSON_SCHEMA_HANDLE_BEGIN_(StartArray, (CurrentContext())); + RAPIDJSON_SCHEMA_HANDLE_PARALLEL_(StartArray, ()); + return valid_ = !outputHandler_ || outputHandler_->StartArray(); + } + + bool EndArray(SizeType elementCount) { + if (!valid_) return false; + RAPIDJSON_SCHEMA_HANDLE_PARALLEL_(EndArray, (elementCount)); + if (!CurrentSchema().EndArray(CurrentContext(), elementCount)) return valid_ = false; + RAPIDJSON_SCHEMA_HANDLE_END_(EndArray, (elementCount)); + } + +#undef RAPIDJSON_SCHEMA_HANDLE_BEGIN_VERBOSE_ +#undef RAPIDJSON_SCHEMA_HANDLE_BEGIN_ +#undef RAPIDJSON_SCHEMA_HANDLE_PARALLEL_ +#undef RAPIDJSON_SCHEMA_HANDLE_VALUE_ + + // Implementation of ISchemaStateFactory + virtual ISchemaValidator* CreateSchemaValidator(const SchemaType& root) { + return new (GetStateAllocator().Malloc(sizeof(GenericSchemaValidator))) GenericSchemaValidator(*schemaDocument_, root, documentStack_.template Bottom(), documentStack_.GetSize(), +#if RAPIDJSON_SCHEMA_VERBOSE + depth_ + 1, +#endif + &GetStateAllocator()); + } + + virtual void DestroySchemaValidator(ISchemaValidator* validator) { + GenericSchemaValidator* v = static_cast(validator); + v->~GenericSchemaValidator(); + StateAllocator::Free(v); + } + + virtual void* CreateHasher() { + return new (GetStateAllocator().Malloc(sizeof(HasherType))) HasherType(&GetStateAllocator()); + } + + virtual uint64_t GetHashCode(void* hasher) { + return static_cast(hasher)->GetHashCode(); + } + + virtual void DestroryHasher(void* hasher) { + HasherType* h = static_cast(hasher); + h->~HasherType(); + StateAllocator::Free(h); + } + + virtual void* MallocState(size_t size) { + return GetStateAllocator().Malloc(size); + } + + virtual void FreeState(void* p) { + StateAllocator::Free(p); + } + +private: + typedef typename SchemaType::Context Context; + typedef GenericValue, StateAllocator> HashCodeArray; + typedef internal::Hasher HasherType; + + GenericSchemaValidator( + const SchemaDocumentType& schemaDocument, + const SchemaType& root, + const char* basePath, size_t basePathSize, +#if RAPIDJSON_SCHEMA_VERBOSE + unsigned depth, +#endif + StateAllocator* allocator = 0, + size_t schemaStackCapacity = kDefaultSchemaStackCapacity, + size_t documentStackCapacity = kDefaultDocumentStackCapacity) + : + schemaDocument_(&schemaDocument), + root_(root), + stateAllocator_(allocator), + ownStateAllocator_(0), + schemaStack_(allocator, schemaStackCapacity), + documentStack_(allocator, documentStackCapacity), + outputHandler_(0), + error_(kObjectType), + currentError_(), + missingDependents_(), + valid_(true) +#if RAPIDJSON_SCHEMA_VERBOSE + , depth_(depth) +#endif + { + if (basePath && basePathSize) + memcpy(documentStack_.template Push(basePathSize), basePath, basePathSize); + } + + StateAllocator& GetStateAllocator() { + if (!stateAllocator_) + stateAllocator_ = ownStateAllocator_ = RAPIDJSON_NEW(StateAllocator)(); + return *stateAllocator_; + } + + bool BeginValue() { + if 
(schemaStack_.Empty()) + PushSchema(root_); + else { + if (CurrentContext().inArray) + internal::TokenHelper, Ch>::AppendIndexToken(documentStack_, CurrentContext().arrayElementIndex); + + if (!CurrentSchema().BeginValue(CurrentContext())) + return false; + + SizeType count = CurrentContext().patternPropertiesSchemaCount; + const SchemaType** sa = CurrentContext().patternPropertiesSchemas; + typename Context::PatternValidatorType patternValidatorType = CurrentContext().valuePatternValidatorType; + bool valueUniqueness = CurrentContext().valueUniqueness; + RAPIDJSON_ASSERT(CurrentContext().valueSchema); + PushSchema(*CurrentContext().valueSchema); + + if (count > 0) { + CurrentContext().objectPatternValidatorType = patternValidatorType; + ISchemaValidator**& va = CurrentContext().patternPropertiesValidators; + SizeType& validatorCount = CurrentContext().patternPropertiesValidatorCount; + va = static_cast(MallocState(sizeof(ISchemaValidator*) * count)); + for (SizeType i = 0; i < count; i++) + va[validatorCount++] = CreateSchemaValidator(*sa[i]); + } + + CurrentContext().arrayUniqueness = valueUniqueness; + } + return true; + } + + bool EndValue() { + if (!CurrentSchema().EndValue(CurrentContext())) + return false; + +#if RAPIDJSON_SCHEMA_VERBOSE + GenericStringBuffer sb; + schemaDocument_->GetPointer(&CurrentSchema()).Stringify(sb); + + *documentStack_.template Push() = '\0'; + documentStack_.template Pop(1); + internal::PrintValidatorPointers(depth_, sb.GetString(), documentStack_.template Bottom()); +#endif + + uint64_t h = CurrentContext().arrayUniqueness ? static_cast(CurrentContext().hasher)->GetHashCode() : 0; + + PopSchema(); + + if (!schemaStack_.Empty()) { + Context& context = CurrentContext(); + if (context.valueUniqueness) { + HashCodeArray* a = static_cast(context.arrayElementHashCodes); + if (!a) + CurrentContext().arrayElementHashCodes = a = new (GetStateAllocator().Malloc(sizeof(HashCodeArray))) HashCodeArray(kArrayType); + for (typename HashCodeArray::ConstValueIterator itr = a->Begin(); itr != a->End(); ++itr) + if (itr->GetUint64() == h) { + DuplicateItems(static_cast(itr - a->Begin()), a->Size()); + RAPIDJSON_INVALID_KEYWORD_RETURN(SchemaType::GetUniqueItemsString()); + } + a->PushBack(h, GetStateAllocator()); + } + } + + // Remove the last token of document pointer + while (!documentStack_.Empty() && *documentStack_.template Pop(1) != '/') + ; + + return true; + } + + void AppendToken(const Ch* str, SizeType len) { + documentStack_.template Reserve(1 + len * 2); // worst case all characters are escaped as two characters + *documentStack_.template PushUnsafe() = '/'; + for (SizeType i = 0; i < len; i++) { + if (str[i] == '~') { + *documentStack_.template PushUnsafe() = '~'; + *documentStack_.template PushUnsafe() = '0'; + } + else if (str[i] == '/') { + *documentStack_.template PushUnsafe() = '~'; + *documentStack_.template PushUnsafe() = '1'; + } + else + *documentStack_.template PushUnsafe() = str[i]; + } + } + + RAPIDJSON_FORCEINLINE void PushSchema(const SchemaType& schema) { new (schemaStack_.template Push()) Context(*this, *this, &schema); } + + RAPIDJSON_FORCEINLINE void PopSchema() { + Context* c = schemaStack_.template Pop(1); + if (HashCodeArray* a = static_cast(c->arrayElementHashCodes)) { + a->~HashCodeArray(); + StateAllocator::Free(a); + } + c->~Context(); + } + + void AddErrorLocation(ValueType& result, bool parent) { + GenericStringBuffer sb; + PointerType instancePointer = GetInvalidDocumentPointer(); + ((parent && instancePointer.GetTokenCount() > 0) + ? 
PointerType(instancePointer.GetTokens(), instancePointer.GetTokenCount() - 1) + : instancePointer).StringifyUriFragment(sb); + ValueType instanceRef(sb.GetString(), static_cast(sb.GetSize() / sizeof(Ch)), + GetStateAllocator()); + result.AddMember(GetInstanceRefString(), instanceRef, GetStateAllocator()); + sb.Clear(); + memcpy(sb.Push(CurrentSchema().GetURI().GetStringLength()), + CurrentSchema().GetURI().GetString(), + CurrentSchema().GetURI().GetStringLength() * sizeof(Ch)); + GetInvalidSchemaPointer().StringifyUriFragment(sb); + ValueType schemaRef(sb.GetString(), static_cast(sb.GetSize() / sizeof(Ch)), + GetStateAllocator()); + result.AddMember(GetSchemaRefString(), schemaRef, GetStateAllocator()); + } + + void AddError(ValueType& keyword, ValueType& error) { + typename ValueType::MemberIterator member = error_.FindMember(keyword); + if (member == error_.MemberEnd()) + error_.AddMember(keyword, error, GetStateAllocator()); + else { + if (member->value.IsObject()) { + ValueType errors(kArrayType); + errors.PushBack(member->value, GetStateAllocator()); + member->value = errors; + } + member->value.PushBack(error, GetStateAllocator()); + } + } + + void AddCurrentError(const typename SchemaType::ValueType& keyword, bool parent = false) { + AddErrorLocation(currentError_, parent); + AddError(ValueType(keyword, GetStateAllocator(), false).Move(), currentError_); + } + + void MergeError(ValueType& other) { + for (typename ValueType::MemberIterator it = other.MemberBegin(), end = other.MemberEnd(); it != end; ++it) { + AddError(it->name, it->value); + } + } + + void AddNumberError(const typename SchemaType::ValueType& keyword, ValueType& actual, const SValue& expected, + const typename SchemaType::ValueType& (*exclusive)() = 0) { + currentError_.SetObject(); + currentError_.AddMember(GetActualString(), actual, GetStateAllocator()); + currentError_.AddMember(GetExpectedString(), ValueType(expected, GetStateAllocator()).Move(), GetStateAllocator()); + if (exclusive) + currentError_.AddMember(ValueType(exclusive(), GetStateAllocator()).Move(), true, GetStateAllocator()); + AddCurrentError(keyword); + } + + void AddErrorArray(const typename SchemaType::ValueType& keyword, + ISchemaValidator** subvalidators, SizeType count) { + ValueType errors(kArrayType); + for (SizeType i = 0; i < count; ++i) + errors.PushBack(static_cast(subvalidators[i])->GetError(), GetStateAllocator()); + currentError_.SetObject(); + currentError_.AddMember(GetErrorsString(), errors, GetStateAllocator()); + AddCurrentError(keyword); + } + + const SchemaType& CurrentSchema() const { return *schemaStack_.template Top()->schema; } + Context& CurrentContext() { return *schemaStack_.template Top(); } + const Context& CurrentContext() const { return *schemaStack_.template Top(); } + + static const size_t kDefaultSchemaStackCapacity = 1024; + static const size_t kDefaultDocumentStackCapacity = 256; + const SchemaDocumentType* schemaDocument_; + const SchemaType& root_; + StateAllocator* stateAllocator_; + StateAllocator* ownStateAllocator_; + internal::Stack schemaStack_; //!< stack to store the current path of schema (BaseSchemaType *) + internal::Stack documentStack_; //!< stack to store the current path of validating document (Ch) + OutputHandler* outputHandler_; + ValueType error_; + ValueType currentError_; + ValueType missingDependents_; + bool valid_; +#if RAPIDJSON_SCHEMA_VERBOSE + unsigned depth_; +#endif +}; + +typedef GenericSchemaValidator SchemaValidator; + 
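+
+// Example usage (editorial sketch, not upstream code): 'schema' is a SchemaDocument
+// built as in the GenericSchemaDocument docs above, and 'inputJson' is an
+// illustrative name for the JSON text being validated.
+//
+//     Document d;
+//     d.Parse(inputJson);
+//     SchemaValidator validator(schema);
+//     if (!d.Accept(validator)) {
+//         // The input violates the schema; report where and why.
+//         StringBuffer sb;
+//         validator.GetInvalidDocumentPointer().StringifyUriFragment(sb);
+//         printf("Invalid document: %s\n", sb.GetString());
+//         printf("Invalid keyword: %s\n", validator.GetInvalidSchemaKeyword());
+//     }
+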
+///////////////////////////////////////////////////////////////////////////////
+// SchemaValidatingReader
+
+//! A helper class for parsing with validation.
+/*!
+    This helper class is a functor, designed as a parameter of \ref GenericDocument::Populate().
+
+    \tparam parseFlags Combination of \ref ParseFlag.
+    \tparam InputStream Type of input stream, implementing Stream concept.
+    \tparam SourceEncoding Encoding of the input stream.
+    \tparam SchemaDocumentType Type of schema document.
+    \tparam StackAllocator Allocator type for stack.
+*/
+template <
+    unsigned parseFlags,
+    typename InputStream,
+    typename SourceEncoding,
+    typename SchemaDocumentType = SchemaDocument,
+    typename StackAllocator = CrtAllocator>
+class SchemaValidatingReader {
+public:
+    typedef typename SchemaDocumentType::PointerType PointerType;
+    typedef typename InputStream::Ch Ch;
+    typedef GenericValue<SourceEncoding, StackAllocator> ValueType;
+
+    //! Constructor
+    /*!
+        \param is Input stream.
+        \param sd Schema document.
+    */
+    SchemaValidatingReader(InputStream& is, const SchemaDocumentType& sd) : is_(is), sd_(sd), invalidSchemaKeyword_(), error_(kObjectType), isValid_(true) {}
+
+    template <typename Handler>
+    bool operator()(Handler& handler) {
+        GenericReader<SourceEncoding, typename SchemaDocumentType::EncodingType, StackAllocator> reader;
+        GenericSchemaValidator<SchemaDocumentType, Handler> validator(sd_, handler);
+        parseResult_ = reader.template Parse<parseFlags>(is_, validator);
+
+        isValid_ = validator.IsValid();
+        if (isValid_) {
+            invalidSchemaPointer_ = PointerType();
+            invalidSchemaKeyword_ = 0;
+            invalidDocumentPointer_ = PointerType();
+            error_.SetObject();
+        }
+        else {
+            invalidSchemaPointer_ = validator.GetInvalidSchemaPointer();
+            invalidSchemaKeyword_ = validator.GetInvalidSchemaKeyword();
+            invalidDocumentPointer_ = validator.GetInvalidDocumentPointer();
+            error_.CopyFrom(validator.GetError(), allocator_);
+        }
+
+        return parseResult_;
+    }
+
+    const ParseResult& GetParseResult() const { return parseResult_; }
+    bool IsValid() const { return isValid_; }
+    const PointerType& GetInvalidSchemaPointer() const { return invalidSchemaPointer_; }
+    const Ch* GetInvalidSchemaKeyword() const { return invalidSchemaKeyword_; }
+    const PointerType& GetInvalidDocumentPointer() const { return invalidDocumentPointer_; }
+    const ValueType& GetError() const { return error_; }
+
+private:
+    InputStream& is_;
+    const SchemaDocumentType& sd_;
+
+    ParseResult parseResult_;
+    PointerType invalidSchemaPointer_;
+    const Ch* invalidSchemaKeyword_;
+    PointerType invalidDocumentPointer_;
+    StackAllocator allocator_;
+    ValueType error_;
+    bool isValid_;
+};
+
+RAPIDJSON_NAMESPACE_END
+RAPIDJSON_DIAG_POP
+
+#endif // RAPIDJSON_SCHEMA_H_
diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/stream.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/stream.h
new file mode 100644
index 0000000..7f2643e
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/stream.h
@@ -0,0 +1,223 @@
+// Tencent is pleased to support the open source community by making RapidJSON available.
+//
+// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations under the License. + +#include "rapidjson.h" + +#ifndef RAPIDJSON_STREAM_H_ +#define RAPIDJSON_STREAM_H_ + +#include "encodings.h" + +RAPIDJSON_NAMESPACE_BEGIN + +/////////////////////////////////////////////////////////////////////////////// +// Stream + +/*! \class rapidjson::Stream + \brief Concept for reading and writing characters. + + For read-only stream, no need to implement PutBegin(), Put(), Flush() and PutEnd(). + + For write-only stream, only need to implement Put() and Flush(). + +\code +concept Stream { + typename Ch; //!< Character type of the stream. + + //! Read the current character from stream without moving the read cursor. + Ch Peek() const; + + //! Read the current character from stream and moving the read cursor to next character. + Ch Take(); + + //! Get the current read cursor. + //! \return Number of characters read from start. + size_t Tell(); + + //! Begin writing operation at the current read pointer. + //! \return The begin writer pointer. + Ch* PutBegin(); + + //! Write a character. + void Put(Ch c); + + //! Flush the buffer. + void Flush(); + + //! End the writing operation. + //! \param begin The begin write pointer returned by PutBegin(). + //! \return Number of characters written. + size_t PutEnd(Ch* begin); +} +\endcode +*/ + +//! Provides additional information for stream. +/*! + By using traits pattern, this type provides a default configuration for stream. + For custom stream, this type can be specialized for other configuration. + See TEST(Reader, CustomStringStream) in readertest.cpp for example. +*/ +template +struct StreamTraits { + //! Whether to make local copy of stream for optimization during parsing. + /*! + By default, for safety, streams do not use local copy optimization. + Stream that can be copied fast should specialize this, like StreamTraits. + */ + enum { copyOptimization = 0 }; +}; + +//! Reserve n characters for writing to a stream. +template +inline void PutReserve(Stream& stream, size_t count) { + (void)stream; + (void)count; +} + +//! Write character to a stream, presuming buffer is reserved. +template +inline void PutUnsafe(Stream& stream, typename Stream::Ch c) { + stream.Put(c); +} + +//! Put N copies of a character to a stream. +template +inline void PutN(Stream& stream, Ch c, size_t n) { + PutReserve(stream, n); + for (size_t i = 0; i < n; i++) + PutUnsafe(stream, c); +} + +/////////////////////////////////////////////////////////////////////////////// +// GenericStreamWrapper + +//! A Stream Wrapper +/*! \tThis string stream is a wrapper for any stream by just forwarding any + \treceived message to the origin stream. 
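+
+    A subclassing sketch (editorial addition; upstream's CursorStreamWrapper
+    follows this pattern, while the counting class here is purely illustrative):
+    \code
+    template <typename InputStream>
+    class CountingStreamWrapper : public GenericStreamWrapper<InputStream> {
+    public:
+        CountingStreamWrapper(InputStream& is) :
+            GenericStreamWrapper<InputStream>(is), count_(0) {}
+        typename InputStream::Ch Take() { ++count_; return this->is_.Take(); }
+        size_t count_; // characters consumed so far
+    };
+    \endcode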
+ \note implements Stream concept +*/ + +#if defined(_MSC_VER) && _MSC_VER <= 1800 +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(4702) // unreachable code +RAPIDJSON_DIAG_OFF(4512) // assignment operator could not be generated +#endif + +template > +class GenericStreamWrapper { +public: + typedef typename Encoding::Ch Ch; + GenericStreamWrapper(InputStream& is): is_(is) {} + + Ch Peek() const { return is_.Peek(); } + Ch Take() { return is_.Take(); } + size_t Tell() { return is_.Tell(); } + Ch* PutBegin() { return is_.PutBegin(); } + void Put(Ch ch) { is_.Put(ch); } + void Flush() { is_.Flush(); } + size_t PutEnd(Ch* ch) { return is_.PutEnd(ch); } + + // wrapper for MemoryStream + const Ch* Peek4() const { return is_.Peek4(); } + + // wrapper for AutoUTFInputStream + UTFType GetType() const { return is_.GetType(); } + bool HasBOM() const { return is_.HasBOM(); } + +protected: + InputStream& is_; +}; + +#if defined(_MSC_VER) && _MSC_VER <= 1800 +RAPIDJSON_DIAG_POP +#endif + +/////////////////////////////////////////////////////////////////////////////// +// StringStream + +//! Read-only string stream. +/*! \note implements Stream concept +*/ +template +struct GenericStringStream { + typedef typename Encoding::Ch Ch; + + GenericStringStream(const Ch *src) : src_(src), head_(src) {} + + Ch Peek() const { return *src_; } + Ch Take() { return *src_++; } + size_t Tell() const { return static_cast(src_ - head_); } + + Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } + void Put(Ch) { RAPIDJSON_ASSERT(false); } + void Flush() { RAPIDJSON_ASSERT(false); } + size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } + + const Ch* src_; //!< Current read position. + const Ch* head_; //!< Original head of the string. +}; + +template +struct StreamTraits > { + enum { copyOptimization = 1 }; +}; + +//! String stream with UTF8 encoding. +typedef GenericStringStream > StringStream; + +/////////////////////////////////////////////////////////////////////////////// +// InsituStringStream + +//! A read-write string stream. +/*! This string stream is particularly designed for in-situ parsing. + \note implements Stream concept +*/ +template +struct GenericInsituStringStream { + typedef typename Encoding::Ch Ch; + + GenericInsituStringStream(Ch *src) : src_(src), dst_(0), head_(src) {} + + // Read + Ch Peek() { return *src_; } + Ch Take() { return *src_++; } + size_t Tell() { return static_cast(src_ - head_); } + + // Write + void Put(Ch c) { RAPIDJSON_ASSERT(dst_ != 0); *dst_++ = c; } + + Ch* PutBegin() { return dst_ = src_; } + size_t PutEnd(Ch* begin) { return static_cast(dst_ - begin); } + void Flush() {} + + Ch* Push(size_t count) { Ch* begin = dst_; dst_ += count; return begin; } + void Pop(size_t count) { dst_ -= count; } + + Ch* src_; + Ch* dst_; + Ch* head_; +}; + +template +struct StreamTraits > { + enum { copyOptimization = 1 }; +}; + +//! Insitu string stream with UTF8 encoding. +typedef GenericInsituStringStream > InsituStringStream; + +RAPIDJSON_NAMESPACE_END + +#endif // RAPIDJSON_STREAM_H_ diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/stringbuffer.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/stringbuffer.h new file mode 100644 index 0000000..4e38b82 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/stringbuffer.h @@ -0,0 +1,121 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. 
+//
+// Licensed under the MIT License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// http://opensource.org/licenses/MIT
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef RAPIDJSON_STRINGBUFFER_H_
+#define RAPIDJSON_STRINGBUFFER_H_
+
+#include "stream.h"
+#include "internal/stack.h"
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+#include <utility> // std::move
+#endif
+
+#include "internal/stack.h"
+
+#if defined(__clang__)
+RAPIDJSON_DIAG_PUSH
+RAPIDJSON_DIAG_OFF(c++98-compat)
+#endif
+
+RAPIDJSON_NAMESPACE_BEGIN
+
+//! Represents an in-memory output stream.
+/*!
+    \tparam Encoding Encoding of the stream.
+    \tparam Allocator type for allocating memory buffer.
+    \note implements Stream concept
+*/
+template <typename Encoding, typename Allocator = CrtAllocator>
+class GenericStringBuffer {
+public:
+    typedef typename Encoding::Ch Ch;
+
+    GenericStringBuffer(Allocator* allocator = 0, size_t capacity = kDefaultCapacity) : stack_(allocator, capacity) {}
+
+#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
+    GenericStringBuffer(GenericStringBuffer&& rhs) : stack_(std::move(rhs.stack_)) {}
+    GenericStringBuffer& operator=(GenericStringBuffer&& rhs) {
+        if (&rhs != this)
+            stack_ = std::move(rhs.stack_);
+        return *this;
+    }
+#endif
+
+    void Put(Ch c) { *stack_.template Push<Ch>() = c; }
+    void PutUnsafe(Ch c) { *stack_.template PushUnsafe<Ch>() = c; }
+    void Flush() {}
+
+    void Clear() { stack_.Clear(); }
+    void ShrinkToFit() {
+        // Push and pop a null terminator. This is safe.
+        *stack_.template Push<Ch>() = '\0';
+        stack_.ShrinkToFit();
+        stack_.template Pop<Ch>(1);
+    }
+
+    void Reserve(size_t count) { stack_.template Reserve<Ch>(count); }
+    Ch* Push(size_t count) { return stack_.template Push<Ch>(count); }
+    Ch* PushUnsafe(size_t count) { return stack_.template PushUnsafe<Ch>(count); }
+    void Pop(size_t count) { stack_.template Pop<Ch>(count); }
+
+    const Ch* GetString() const {
+        // Push and pop a null terminator. This is safe.
+        *stack_.template Push<Ch>() = '\0';
+        stack_.template Pop<Ch>(1);
+
+        return stack_.template Bottom<Ch>();
+    }
+
+    //! Get the size of string in bytes in the string buffer.
+    size_t GetSize() const { return stack_.GetSize(); }
+
+    //! Get the length of string in Ch in the string buffer.
+    size_t GetLength() const { return stack_.GetSize() / sizeof(Ch); }
+
+    static const size_t kDefaultCapacity = 256;
+    mutable internal::Stack<Allocator> stack_;
+
+private:
+    // Prohibit copy constructor & assignment operator.
+    GenericStringBuffer(const GenericStringBuffer&);
+    GenericStringBuffer& operator=(const GenericStringBuffer&);
+};
+
+//! String buffer with UTF8 encoding
+typedef GenericStringBuffer<UTF8<> > StringBuffer;
+
+template<typename Encoding, typename Allocator>
+inline void PutReserve(GenericStringBuffer<Encoding, Allocator>& stream, size_t count) {
+    stream.Reserve(count);
+}
+
+template<typename Encoding, typename Allocator>
+inline void PutUnsafe(GenericStringBuffer<Encoding, Allocator>& stream, typename Encoding::Ch c) {
+    stream.PutUnsafe(c);
+}
+
+//! Implement specialized version of PutN() with memset() for better performance.
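+// (Editorial note) The generic PutN() in stream.h writes the character n times in
+// a loop; for the char-based StringBuffer below, the run is instead reserved once
+// and filled with a single std::memset, e.g. PutN(sb, ' ', 4) emits four spaces.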
+template<> +inline void PutN(GenericStringBuffer >& stream, char c, size_t n) { + std::memset(stream.stack_.Push(n), c, n * sizeof(c)); +} + +RAPIDJSON_NAMESPACE_END + +#if defined(__clang__) +RAPIDJSON_DIAG_POP +#endif + +#endif // RAPIDJSON_STRINGBUFFER_H_ diff --git a/inference-engine/thirdparty/clDNN/utils/rapidjson/writer.h b/inference-engine/thirdparty/clDNN/utils/rapidjson/writer.h new file mode 100644 index 0000000..6f5b690 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/utils/rapidjson/writer.h @@ -0,0 +1,709 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef RAPIDJSON_WRITER_H_ +#define RAPIDJSON_WRITER_H_ + +#include "stream.h" +#include "internal/meta.h" +#include "internal/stack.h" +#include "internal/strfunc.h" +#include "internal/dtoa.h" +#include "internal/itoa.h" +#include "stringbuffer.h" +#include // placement new + +#if defined(RAPIDJSON_SIMD) && defined(_MSC_VER) +#include +#pragma intrinsic(_BitScanForward) +#endif +#ifdef RAPIDJSON_SSE42 +#include +#elif defined(RAPIDJSON_SSE2) +#include +#elif defined(RAPIDJSON_NEON) +#include +#endif + +#ifdef __clang__ +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(padded) +RAPIDJSON_DIAG_OFF(unreachable-code) +RAPIDJSON_DIAG_OFF(c++98-compat) +#elif defined(_MSC_VER) +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(4127) // conditional expression is constant +#endif + +RAPIDJSON_NAMESPACE_BEGIN + +/////////////////////////////////////////////////////////////////////////////// +// WriteFlag + +/*! \def RAPIDJSON_WRITE_DEFAULT_FLAGS + \ingroup RAPIDJSON_CONFIG + \brief User-defined kWriteDefaultFlags definition. + + User can define this as any \c WriteFlag combinations. +*/ +#ifndef RAPIDJSON_WRITE_DEFAULT_FLAGS +#define RAPIDJSON_WRITE_DEFAULT_FLAGS kWriteNoFlags +#endif + +//! Combination of writeFlags +enum WriteFlag { + kWriteNoFlags = 0, //!< No flags are set. + kWriteValidateEncodingFlag = 1, //!< Validate encoding of JSON strings. + kWriteNanAndInfFlag = 2, //!< Allow writing of Infinity, -Infinity and NaN. + kWriteDefaultFlags = RAPIDJSON_WRITE_DEFAULT_FLAGS //!< Default write flags. Can be customized by defining RAPIDJSON_WRITE_DEFAULT_FLAGS +}; + +//! JSON writer +/*! Writer implements the concept Handler. + It generates JSON text by events to an output os. + + User may programmatically calls the functions of a writer to generate JSON text. + + On the other side, a writer can also be passed to objects that generates events, + + for example Reader::Parse() and Document::Accept(). + + \tparam OutputStream Type of output stream. + \tparam SourceEncoding Encoding of source string. + \tparam TargetEncoding Encoding of output stream. + \tparam StackAllocator Type of allocator for allocating memory of stack. 
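+
+    A minimal usage sketch (editorial addition):
+    \code
+    StringBuffer sb;
+    Writer<StringBuffer> writer(sb);
+    writer.StartObject();
+    writer.Key("hello");
+    writer.String("world");
+    writer.EndObject();
+    // sb.GetString() now contains {"hello":"world"}
+    \endcode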
+ \note implements Handler concept +*/ +template, typename TargetEncoding = UTF8<>, typename StackAllocator = CrtAllocator, unsigned writeFlags = kWriteDefaultFlags> +class Writer { +public: + typedef typename SourceEncoding::Ch Ch; + + static const int kDefaultMaxDecimalPlaces = 324; + + //! Constructor + /*! \param os Output stream. + \param stackAllocator User supplied allocator. If it is null, it will create a private one. + \param levelDepth Initial capacity of stack. + */ + explicit + Writer(OutputStream& os, StackAllocator* stackAllocator = 0, size_t levelDepth = kDefaultLevelDepth) : + os_(&os), level_stack_(stackAllocator, levelDepth * sizeof(Level)), maxDecimalPlaces_(kDefaultMaxDecimalPlaces), hasRoot_(false) {} + + explicit + Writer(StackAllocator* allocator = 0, size_t levelDepth = kDefaultLevelDepth) : + os_(0), level_stack_(allocator, levelDepth * sizeof(Level)), maxDecimalPlaces_(kDefaultMaxDecimalPlaces), hasRoot_(false) {} + +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + Writer(Writer&& rhs) : + os_(rhs.os_), level_stack_(std::move(rhs.level_stack_)), maxDecimalPlaces_(rhs.maxDecimalPlaces_), hasRoot_(rhs.hasRoot_) { + rhs.os_ = 0; + } +#endif + + //! Reset the writer with a new stream. + /*! + This function reset the writer with a new stream and default settings, + in order to make a Writer object reusable for output multiple JSONs. + + \param os New output stream. + \code + Writer writer(os1); + writer.StartObject(); + // ... + writer.EndObject(); + + writer.Reset(os2); + writer.StartObject(); + // ... + writer.EndObject(); + \endcode + */ + void Reset(OutputStream& os) { + os_ = &os; + hasRoot_ = false; + level_stack_.Clear(); + } + + //! Checks whether the output is a complete JSON. + /*! + A complete JSON has a complete root object or array. + */ + bool IsComplete() const { + return hasRoot_ && level_stack_.Empty(); + } + + int GetMaxDecimalPlaces() const { + return maxDecimalPlaces_; + } + + //! Sets the maximum number of decimal places for double output. + /*! + This setting truncates the output with specified number of decimal places. + + For example, + + \code + writer.SetMaxDecimalPlaces(3); + writer.StartArray(); + writer.Double(0.12345); // "0.123" + writer.Double(0.0001); // "0.0" + writer.Double(1.234567890123456e30); // "1.234567890123456e30" (do not truncate significand for positive exponent) + writer.Double(1.23e-4); // "0.0" (do truncate significand for negative exponent) + writer.EndArray(); + \endcode + + The default setting does not truncate any decimal places. You can restore to this setting by calling + \code + writer.SetMaxDecimalPlaces(Writer::kDefaultMaxDecimalPlaces); + \endcode + */ + void SetMaxDecimalPlaces(int maxDecimalPlaces) { + maxDecimalPlaces_ = maxDecimalPlaces; + } + + /*!@name Implementation of Handler + \see Handler + */ + //@{ + + bool Null() { Prefix(kNullType); return EndValue(WriteNull()); } + bool Bool(bool b) { Prefix(b ? kTrueType : kFalseType); return EndValue(WriteBool(b)); } + bool Int(int i) { Prefix(kNumberType); return EndValue(WriteInt(i)); } + bool Uint(unsigned u) { Prefix(kNumberType); return EndValue(WriteUint(u)); } + bool Int64(int64_t i64) { Prefix(kNumberType); return EndValue(WriteInt64(i64)); } + bool Uint64(uint64_t u64) { Prefix(kNumberType); return EndValue(WriteUint64(u64)); } + + //! Writes the given \c double value to the stream + /*! + \param d The value to be written. + \return Whether it is succeed. 
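+        \note (Editorial addition) Writing NaN or Infinity returns \c false unless
+        \c kWriteNanAndInfFlag is set in \c writeFlags; see WriteDouble() below.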
+ */ + bool Double(double d) { Prefix(kNumberType); return EndValue(WriteDouble(d)); } + + bool RawNumber(const Ch* str, SizeType length, bool copy = false) { + RAPIDJSON_ASSERT(str != 0); + (void)copy; + Prefix(kNumberType); + return EndValue(WriteString(str, length)); + } + + bool String(const Ch* str, SizeType length, bool copy = false) { + RAPIDJSON_ASSERT(str != 0); + (void)copy; + Prefix(kStringType); + return EndValue(WriteString(str, length)); + } + +#if RAPIDJSON_HAS_STDSTRING + bool String(const std::basic_string& str) { + return String(str.data(), SizeType(str.size())); + } +#endif + + bool StartObject() { + Prefix(kObjectType); + new (level_stack_.template Push()) Level(false); + return WriteStartObject(); + } + + bool Key(const Ch* str, SizeType length, bool copy = false) { return String(str, length, copy); } + +#if RAPIDJSON_HAS_STDSTRING + bool Key(const std::basic_string& str) + { + return Key(str.data(), SizeType(str.size())); + } +#endif + + bool EndObject(SizeType memberCount = 0) { + (void)memberCount; + RAPIDJSON_ASSERT(level_stack_.GetSize() >= sizeof(Level)); // not inside an Object + RAPIDJSON_ASSERT(!level_stack_.template Top()->inArray); // currently inside an Array, not Object + RAPIDJSON_ASSERT(0 == level_stack_.template Top()->valueCount % 2); // Object has a Key without a Value + level_stack_.template Pop(1); + return EndValue(WriteEndObject()); + } + + bool StartArray() { + Prefix(kArrayType); + new (level_stack_.template Push()) Level(true); + return WriteStartArray(); + } + + bool EndArray(SizeType elementCount = 0) { + (void)elementCount; + RAPIDJSON_ASSERT(level_stack_.GetSize() >= sizeof(Level)); + RAPIDJSON_ASSERT(level_stack_.template Top()->inArray); + level_stack_.template Pop(1); + return EndValue(WriteEndArray()); + } + //@} + + /*! @name Convenience extensions */ + //@{ + + //! Simpler but slower overload. + bool String(const Ch* const& str) { return String(str, internal::StrLen(str)); } + bool Key(const Ch* const& str) { return Key(str, internal::StrLen(str)); } + + //@} + + //! Write a raw JSON value. + /*! + For user to write a stringified JSON as a value. + + \param json A well-formed JSON value. It should not contain null character within [0, length - 1] range. + \param length Length of the json. + \param type Type of the root of json. + */ + bool RawValue(const Ch* json, size_t length, Type type) { + RAPIDJSON_ASSERT(json != 0); + Prefix(type); + return EndValue(WriteRawValue(json, length)); + } + + //! Flush the output stream. + /*! + Allows the user to flush the output stream immediately. + */ + void Flush() { + os_->Flush(); + } + +protected: + //! 
Information for each nested level + struct Level { + Level(bool inArray_) : valueCount(0), inArray(inArray_) {} + size_t valueCount; //!< number of values in this level + bool inArray; //!< true if in array, otherwise in object + }; + + static const size_t kDefaultLevelDepth = 32; + + bool WriteNull() { + PutReserve(*os_, 4); + PutUnsafe(*os_, 'n'); PutUnsafe(*os_, 'u'); PutUnsafe(*os_, 'l'); PutUnsafe(*os_, 'l'); return true; + } + + bool WriteBool(bool b) { + if (b) { + PutReserve(*os_, 4); + PutUnsafe(*os_, 't'); PutUnsafe(*os_, 'r'); PutUnsafe(*os_, 'u'); PutUnsafe(*os_, 'e'); + } + else { + PutReserve(*os_, 5); + PutUnsafe(*os_, 'f'); PutUnsafe(*os_, 'a'); PutUnsafe(*os_, 'l'); PutUnsafe(*os_, 's'); PutUnsafe(*os_, 'e'); + } + return true; + } + + bool WriteInt(int i) { + char buffer[11]; + const char* end = internal::i32toa(i, buffer); + PutReserve(*os_, static_cast(end - buffer)); + for (const char* p = buffer; p != end; ++p) + PutUnsafe(*os_, static_cast(*p)); + return true; + } + + bool WriteUint(unsigned u) { + char buffer[10]; + const char* end = internal::u32toa(u, buffer); + PutReserve(*os_, static_cast(end - buffer)); + for (const char* p = buffer; p != end; ++p) + PutUnsafe(*os_, static_cast(*p)); + return true; + } + + bool WriteInt64(int64_t i64) { + char buffer[21]; + const char* end = internal::i64toa(i64, buffer); + PutReserve(*os_, static_cast(end - buffer)); + for (const char* p = buffer; p != end; ++p) + PutUnsafe(*os_, static_cast(*p)); + return true; + } + + bool WriteUint64(uint64_t u64) { + char buffer[20]; + char* end = internal::u64toa(u64, buffer); + PutReserve(*os_, static_cast(end - buffer)); + for (char* p = buffer; p != end; ++p) + PutUnsafe(*os_, static_cast(*p)); + return true; + } + + bool WriteDouble(double d) { + if (internal::Double(d).IsNanOrInf()) { + if (!(writeFlags & kWriteNanAndInfFlag)) + return false; + if (internal::Double(d).IsNan()) { + PutReserve(*os_, 3); + PutUnsafe(*os_, 'N'); PutUnsafe(*os_, 'a'); PutUnsafe(*os_, 'N'); + return true; + } + if (internal::Double(d).Sign()) { + PutReserve(*os_, 9); + PutUnsafe(*os_, '-'); + } + else + PutReserve(*os_, 8); + PutUnsafe(*os_, 'I'); PutUnsafe(*os_, 'n'); PutUnsafe(*os_, 'f'); + PutUnsafe(*os_, 'i'); PutUnsafe(*os_, 'n'); PutUnsafe(*os_, 'i'); PutUnsafe(*os_, 't'); PutUnsafe(*os_, 'y'); + return true; + } + + char buffer[25]; + char* end = internal::dtoa(d, buffer, maxDecimalPlaces_); + PutReserve(*os_, static_cast(end - buffer)); + for (char* p = buffer; p != end; ++p) + PutUnsafe(*os_, static_cast(*p)); + return true; + } + + bool WriteString(const Ch* str, SizeType length) { + static const typename OutputStream::Ch hexDigits[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' }; + static const char escape[256] = { +#define Z16 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + //0 1 2 3 4 5 6 7 8 9 A B C D E F + 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'b', 't', 'n', 'u', 'f', 'r', 'u', 'u', // 00 + 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', // 10 + 0, 0, '"', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20 + Z16, Z16, // 30~4F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,'\\', 0, 0, 0, // 50 + Z16, Z16, Z16, Z16, Z16, Z16, Z16, Z16, Z16, Z16 // 60~FF +#undef Z16 + }; + + if (TargetEncoding::supportUnicode) + PutReserve(*os_, 2 + length * 6); // "\uxxxx..." + else + PutReserve(*os_, 2 + length * 12); // "\uxxxx\uyyyy..." 
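+        // (Editorial note) Worst-case reservation lets the loop below use PutUnsafe:
+        // each input unit may expand to "\uXXXX" (6 output units), or to a surrogate
+        // pair "\uXXXX\uYYYY" (12 units) when the target encoding cannot represent
+        // the code point directly; the +2 covers the enclosing quotes.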
+ + PutUnsafe(*os_, '\"'); + GenericStringStream is(str); + while (ScanWriteUnescapedString(is, length)) { + const Ch c = is.Peek(); + if (!TargetEncoding::supportUnicode && static_cast(c) >= 0x80) { + // Unicode escaping + unsigned codepoint; + if (RAPIDJSON_UNLIKELY(!SourceEncoding::Decode(is, &codepoint))) + return false; + PutUnsafe(*os_, '\\'); + PutUnsafe(*os_, 'u'); + if (codepoint <= 0xD7FF || (codepoint >= 0xE000 && codepoint <= 0xFFFF)) { + PutUnsafe(*os_, hexDigits[(codepoint >> 12) & 15]); + PutUnsafe(*os_, hexDigits[(codepoint >> 8) & 15]); + PutUnsafe(*os_, hexDigits[(codepoint >> 4) & 15]); + PutUnsafe(*os_, hexDigits[(codepoint ) & 15]); + } + else { + RAPIDJSON_ASSERT(codepoint >= 0x010000 && codepoint <= 0x10FFFF); + // Surrogate pair + unsigned s = codepoint - 0x010000; + unsigned lead = (s >> 10) + 0xD800; + unsigned trail = (s & 0x3FF) + 0xDC00; + PutUnsafe(*os_, hexDigits[(lead >> 12) & 15]); + PutUnsafe(*os_, hexDigits[(lead >> 8) & 15]); + PutUnsafe(*os_, hexDigits[(lead >> 4) & 15]); + PutUnsafe(*os_, hexDigits[(lead ) & 15]); + PutUnsafe(*os_, '\\'); + PutUnsafe(*os_, 'u'); + PutUnsafe(*os_, hexDigits[(trail >> 12) & 15]); + PutUnsafe(*os_, hexDigits[(trail >> 8) & 15]); + PutUnsafe(*os_, hexDigits[(trail >> 4) & 15]); + PutUnsafe(*os_, hexDigits[(trail ) & 15]); + } + } + else if ((sizeof(Ch) == 1 || static_cast(c) < 256) && RAPIDJSON_UNLIKELY(escape[static_cast(c)])) { + is.Take(); + PutUnsafe(*os_, '\\'); + PutUnsafe(*os_, static_cast(escape[static_cast(c)])); + if (escape[static_cast(c)] == 'u') { + PutUnsafe(*os_, '0'); + PutUnsafe(*os_, '0'); + PutUnsafe(*os_, hexDigits[static_cast(c) >> 4]); + PutUnsafe(*os_, hexDigits[static_cast(c) & 0xF]); + } + } + else if (RAPIDJSON_UNLIKELY(!(writeFlags & kWriteValidateEncodingFlag ? + Transcoder::Validate(is, *os_) : + Transcoder::TranscodeUnsafe(is, *os_)))) + return false; + } + PutUnsafe(*os_, '\"'); + return true; + } + + bool ScanWriteUnescapedString(GenericStringStream& is, size_t length) { + return RAPIDJSON_LIKELY(is.Tell() < length); + } + + bool WriteStartObject() { os_->Put('{'); return true; } + bool WriteEndObject() { os_->Put('}'); return true; } + bool WriteStartArray() { os_->Put('['); return true; } + bool WriteEndArray() { os_->Put(']'); return true; } + + bool WriteRawValue(const Ch* json, size_t length) { + PutReserve(*os_, length); + GenericStringStream is(json); + while (RAPIDJSON_LIKELY(is.Tell() < length)) { + RAPIDJSON_ASSERT(is.Peek() != '\0'); + if (RAPIDJSON_UNLIKELY(!(writeFlags & kWriteValidateEncodingFlag ? + Transcoder::Validate(is, *os_) : + Transcoder::TranscodeUnsafe(is, *os_)))) + return false; + } + return true; + } + + void Prefix(Type type) { + (void)type; + if (RAPIDJSON_LIKELY(level_stack_.GetSize() != 0)) { // this value is not at root + Level* level = level_stack_.template Top(); + if (level->valueCount > 0) { + if (level->inArray) + os_->Put(','); // add comma if it is not the first element in array + else // in object + os_->Put((level->valueCount % 2 == 0) ? ',' : ':'); + } + if (!level->inArray && level->valueCount % 2 == 0) + RAPIDJSON_ASSERT(type == kStringType); // if it's in object, then even number should be a name + level->valueCount++; + } + else { + RAPIDJSON_ASSERT(!hasRoot_); // Should only has one and only one root. + hasRoot_ = true; + } + } + + // Flush the value if it is the top level one. 
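+    // (Editorial note) Invoked after every complete value; the stream is flushed
+    // only when the level stack is empty, i.e. when the root value has just ended,
+    // so nested values incur no per-value flush cost.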
+    // Flush the value if it is the top level one.
+    bool EndValue(bool ret) {
+        if (RAPIDJSON_UNLIKELY(level_stack_.Empty()))   // end of json text
+            Flush();
+        return ret;
+    }
+
+    OutputStream* os_;
+    internal::Stack<StackAllocator> level_stack_;
+    int maxDecimalPlaces_;
+    bool hasRoot_;
+
+private:
+    // Prohibit copy constructor & assignment operator.
+    Writer(const Writer&);
+    Writer& operator=(const Writer&);
+};
+
+// Full specialization for StringStream to prevent memory copying
+
+template<>
+inline bool Writer<StringBuffer>::WriteInt(int i) {
+    char *buffer = os_->Push(11);
+    const char* end = internal::i32toa(i, buffer);
+    os_->Pop(static_cast<size_t>(11 - (end - buffer)));
+    return true;
+}
+
+template<>
+inline bool Writer<StringBuffer>::WriteUint(unsigned u) {
+    char *buffer = os_->Push(10);
+    const char* end = internal::u32toa(u, buffer);
+    os_->Pop(static_cast<size_t>(10 - (end - buffer)));
+    return true;
+}
+
+template<>
+inline bool Writer<StringBuffer>::WriteInt64(int64_t i64) {
+    char *buffer = os_->Push(21);
+    const char* end = internal::i64toa(i64, buffer);
+    os_->Pop(static_cast<size_t>(21 - (end - buffer)));
+    return true;
+}
+
+template<>
+inline bool Writer<StringBuffer>::WriteUint64(uint64_t u) {
+    char *buffer = os_->Push(20);
+    const char* end = internal::u64toa(u, buffer);
+    os_->Pop(static_cast<size_t>(20 - (end - buffer)));
+    return true;
+}
+
+template<>
+inline bool Writer<StringBuffer>::WriteDouble(double d) {
+    if (internal::Double(d).IsNanOrInf()) {
+        // Note: This code path can only be reached if (RAPIDJSON_WRITE_DEFAULT_FLAGS & kWriteNanAndInfFlag).
+        if (!(kWriteDefaultFlags & kWriteNanAndInfFlag))
+            return false;
+        if (internal::Double(d).IsNan()) {
+            PutReserve(*os_, 3);
+            PutUnsafe(*os_, 'N'); PutUnsafe(*os_, 'a'); PutUnsafe(*os_, 'N');
+            return true;
+        }
+        if (internal::Double(d).Sign()) {
+            PutReserve(*os_, 9);
+            PutUnsafe(*os_, '-');
+        }
+        else
+            PutReserve(*os_, 8);
+        PutUnsafe(*os_, 'I'); PutUnsafe(*os_, 'n'); PutUnsafe(*os_, 'f');
+        PutUnsafe(*os_, 'i'); PutUnsafe(*os_, 'n'); PutUnsafe(*os_, 'i'); PutUnsafe(*os_, 't'); PutUnsafe(*os_, 'y');
+        return true;
+    }
+
+    char *buffer = os_->Push(25);
+    char* end = internal::dtoa(d, buffer, maxDecimalPlaces_);
+    os_->Pop(static_cast<size_t>(25 - (end - buffer)));
+    return true;
+}
+
+#if defined(RAPIDJSON_SSE2) || defined(RAPIDJSON_SSE42)
+template<>
+inline bool Writer<StringBuffer>::ScanWriteUnescapedString(StringStream& is, size_t length) {
+    if (length < 16)
+        return RAPIDJSON_LIKELY(is.Tell() < length);
+
+    if (!RAPIDJSON_LIKELY(is.Tell() < length))
+        return false;
+
+    const char* p = is.src_;
+    const char* end = is.head_ + length;
+    const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15));
+    const char* endAligned = reinterpret_cast<const char*>(reinterpret_cast<size_t>(end) & static_cast<size_t>(~15));
+    if (nextAligned > end)
+        return true;
+
+    while (p != nextAligned)
+        if (*p < 0x20 || *p == '\"' || *p == '\\') {
+            is.src_ = p;
+            return RAPIDJSON_LIKELY(is.Tell() < length);
+        }
+        else
+            os_->PutUnsafe(*p++);
+
+    // The rest of string using SIMD
+    static const char dquote[16] = { '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"' };
+    static const char bslash[16] = { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' };
+    static const char space[16]  = { 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F };
+    const __m128i dq = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&dquote[0]));
+    const __m128i bs = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&bslash[0]));
+    const __m128i sp = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&space[0]));
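// [Editor's note - not part of the patch] How the SSE2 scan below works: each
// aligned 16-byte block is compared against '"', '\\' and the 0x1F threshold
// (max(s, 0x1F) == 0x1F detects bytes < 0x20, i.e. control characters).
// _mm_movemask_epi8 packs the per-byte comparison results into the 16-bit mask
// r; r == 0 means the whole block can be copied verbatim. Otherwise
// _BitScanForward/__builtin_ffs locates the first set bit - the index of the
// first character that still needs escaping - and only the clean prefix is
// bulk-copied before control returns to the scalar escaping path.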
+    for (; p != endAligned; p += 16) {
+        const __m128i s = _mm_load_si128(reinterpret_cast<const __m128i *>(p));
+        const __m128i t1 = _mm_cmpeq_epi8(s, dq);
+        const __m128i t2 = _mm_cmpeq_epi8(s, bs);
+        const __m128i t3 = _mm_cmpeq_epi8(_mm_max_epu8(s, sp), sp); // s < 0x20 <=> max(s, 0x1F) == 0x1F
+        const __m128i x = _mm_or_si128(_mm_or_si128(t1, t2), t3);
+        unsigned short r = static_cast<unsigned short>(_mm_movemask_epi8(x));
+        if (RAPIDJSON_UNLIKELY(r != 0)) {   // some of characters is escaped
+            SizeType len;
+#ifdef _MSC_VER         // Find the index of first escaped
+            unsigned long offset;
+            _BitScanForward(&offset, r);
+            len = offset;
+#else
+            len = static_cast<SizeType>(__builtin_ffs(r) - 1);
+#endif
+            char* q = reinterpret_cast<char*>(os_->PushUnsafe(len));
+            for (size_t i = 0; i < len; i++)
+                q[i] = p[i];
+
+            p += len;
+            break;
+        }
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(os_->PushUnsafe(16)), s);
+    }
+
+    is.src_ = p;
+    return RAPIDJSON_LIKELY(is.Tell() < length);
+}
+#elif defined(RAPIDJSON_NEON)
+template<>
+inline bool Writer<StringBuffer>::ScanWriteUnescapedString(StringStream& is, size_t length) {
+    if (length < 16)
+        return RAPIDJSON_LIKELY(is.Tell() < length);
+
+    if (!RAPIDJSON_LIKELY(is.Tell() < length))
+        return false;
+
+    const char* p = is.src_;
+    const char* end = is.head_ + length;
+    const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15));
+    const char* endAligned = reinterpret_cast<const char*>(reinterpret_cast<size_t>(end) & static_cast<size_t>(~15));
+    if (nextAligned > end)
+        return true;
+
+    while (p != nextAligned)
+        if (*p < 0x20 || *p == '\"' || *p == '\\') {
+            is.src_ = p;
+            return RAPIDJSON_LIKELY(is.Tell() < length);
+        }
+        else
+            os_->PutUnsafe(*p++);
+
+    // The rest of string using SIMD
+    const uint8x16_t s0 = vmovq_n_u8('"');
+    const uint8x16_t s1 = vmovq_n_u8('\\');
+    const uint8x16_t s2 = vmovq_n_u8('\b');
+    const uint8x16_t s3 = vmovq_n_u8(32);
+
+    for (; p != endAligned; p += 16) {
+        const uint8x16_t s = vld1q_u8(reinterpret_cast<const uint8_t *>(p));
+        uint8x16_t x = vceqq_u8(s, s0);
+        x = vorrq_u8(x, vceqq_u8(s, s1));
+        x = vorrq_u8(x, vceqq_u8(s, s2));
+        x = vorrq_u8(x, vcltq_u8(s, s3));
+
+        x = vrev64q_u8(x);                     // Rev in 64
+        uint64_t low = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 0);   // extract
+        uint64_t high = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 1);  // extract
+
+        SizeType len = 0;
+        bool escaped = false;
+        if (low == 0) {
+            if (high != 0) {
+                unsigned lz = (unsigned)__builtin_clzll(high);
+                len = 8 + (lz >> 3);
+                escaped = true;
+            }
+        } else {
+            unsigned lz = (unsigned)__builtin_clzll(low);
+            len = lz >> 3;
+            escaped = true;
+        }
+        if (RAPIDJSON_UNLIKELY(escaped)) {   // some of characters is escaped
+            char* q = reinterpret_cast<char*>(os_->PushUnsafe(len));
+            for (size_t i = 0; i < len; i++)
+                q[i] = p[i];
+
+            p += len;
+            break;
+        }
+        vst1q_u8(reinterpret_cast<uint8_t *>(os_->PushUnsafe(16)), s);
+    }
+
+    is.src_ = p;
+    return RAPIDJSON_LIKELY(is.Tell() < length);
+}
+#endif // RAPIDJSON_NEON
+
+RAPIDJSON_NAMESPACE_END
+
+#if defined(_MSC_VER) || defined(__clang__)
+RAPIDJSON_DIAG_POP
+#endif
+
+#endif // RAPIDJSON_RAPIDJSON_H_
diff --git a/inference-engine/thirdparty/clDNN/version.json b/inference-engine/thirdparty/clDNN/version.json
index a26804e..9bb5352 100644
--- a/inference-engine/thirdparty/clDNN/version.json
+++ b/inference-engine/thirdparty/clDNN/version.json
@@ -3,7 +3,7 @@
 {
     "major": 1,       # clDNN major version (major version of API).
     "minor": 4,       # clDNN minor version (correlated with major API version of Inference Engine).
-    "build": 14,      # clDNN build version (correlated with ordinal numeber of public release of clDNN).
+ "build": 23, # clDNN build version (correlated with ordinal numeber of public release of clDNN). "revision_base": 0, # Offset that will be subtracted from environment variable provided by build system. "revision_min": -1 # Minumum value of revision. Computed value of revision will be clamped from below by this value. } \ No newline at end of file diff --git a/inference-engine/thirdparty/fluid/checksum.txt b/inference-engine/thirdparty/fluid/checksum.txt index d912ec0..ba34e30 100644 --- a/inference-engine/thirdparty/fluid/checksum.txt +++ b/inference-engine/thirdparty/fluid/checksum.txt @@ -1 +1 @@ -5d28798fbe1b11d9c9d6fcd28c02f07e +b4a07b700b3cd4537289644b593edbc4 diff --git a/inference-engine/thirdparty/fluid/modules/gapi/CMakeLists.txt b/inference-engine/thirdparty/fluid/modules/gapi/CMakeLists.txt index ec05b38..cc4cef7 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/CMakeLists.txt +++ b/inference-engine/thirdparty/fluid/modules/gapi/CMakeLists.txt @@ -23,6 +23,7 @@ file(GLOB gapi_ext_hdrs "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/util/*.hpp" "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/cpu/*.hpp" "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/gpu/*.hpp" + "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/ocl/*.hpp" "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/fluid/*.hpp" "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/own/*.hpp" ) @@ -72,11 +73,11 @@ set(gapi_srcs src/backends/fluid/gfluidimgproc_func.dispatch.cpp src/backends/fluid/gfluidcore.cpp - # GPU Backend (currently built-in) - src/backends/gpu/ggpubackend.cpp - src/backends/gpu/ggpukernel.cpp - src/backends/gpu/ggpuimgproc.cpp - src/backends/gpu/ggpucore.cpp + # OCL Backend (currently built-in) + src/backends/ocl/goclbackend.cpp + src/backends/ocl/goclkernel.cpp + src/backends/ocl/goclimgproc.cpp + src/backends/ocl/goclcore.cpp # Compound src/backends/common/gcompoundbackend.cpp diff --git a/inference-engine/thirdparty/fluid/modules/gapi/cmake/init.cmake b/inference-engine/thirdparty/fluid/modules/gapi/cmake/init.cmake index 9f6ebef..12e2212 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/cmake/init.cmake +++ b/inference-engine/thirdparty/fluid/modules/gapi/cmake/init.cmake @@ -1,3 +1,9 @@ +OCV_OPTION(WITH_ADE "Enable ADE framework (required for Graph API module)" ON) + +if(NOT WITH_ADE) + return() +endif() + if (ade_DIR) # if ade_DIR is set, use ADE-supplied CMake script # to set up variables to the prebuilt ADE diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi.hpp index a043a83..b8f31e9 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/core.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/core.hpp index 9af3620..597d251 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/core.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/core.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_CORE_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/core.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/core.hpp index ec76fe5..6dbe8b0 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/core.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/core.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_CPU_CORE_API_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/gcpukernel.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/gcpukernel.hpp index facaab6..d44a995 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/gcpukernel.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/gcpukernel.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GCPUKERNEL_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/imgproc.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/imgproc.hpp index 0b96db0..c25ae61 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/imgproc.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/imgproc.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_CPU_IMGPROC_API_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/core.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/core.hpp index 8c21f57..d5a49e8 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/core.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/core.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_FLUID_CORE_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/gfluidbuffer.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/gfluidbuffer.hpp index 8965ec7..aaf2f4d 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/gfluidbuffer.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/gfluidbuffer.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_FLUID_BUFFER_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/gfluidkernel.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/gfluidkernel.hpp index c71c5aa..d6480e3 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/gfluidkernel.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/gfluidkernel.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_FLUID_KERNEL_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/imgproc.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/imgproc.hpp index dedfa9d..c83da86 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/imgproc.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/imgproc.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_FLUID_IMGPROC_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/garg.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/garg.hpp index f8a3170..7867ea3 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/garg.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/garg.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GARG_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/garray.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/garray.hpp index 87d0015..7a91127 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/garray.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/garray.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GARRAY_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcall.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcall.hpp index baf4f44..777d30d 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcall.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcall.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GCALL_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcommon.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcommon.hpp index 6a3f51f..3066a33 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcommon.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcommon.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GCOMMON_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcompiled.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcompiled.hpp index ad491b7..227f663 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcompiled.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcompiled.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GCOMPILED_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcompoundkernel.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcompoundkernel.hpp index c5ac8a7..a3df713 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcompoundkernel.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcompoundkernel.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GCOMPOUNDKERNEL_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcomputation.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcomputation.hpp index e89b9ae..d4d0ba1 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcomputation.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcomputation.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GCOMPUTATION_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gkernel.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gkernel.hpp index adc7da3..956e96d 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gkernel.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gkernel.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GKERNEL_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gmat.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gmat.hpp index 0fa5342..e1ef637 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gmat.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gmat.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GMAT_HPP @@ -142,7 +142,7 @@ namespace gapi { namespace own { GAPI_EXPORTS GMatDesc descr_of(const Mat &mat); }}//gapi::own -std::ostream& operator<<(std::ostream& os, const cv::GMatDesc &desc); +GAPI_EXPORTS std::ostream& operator<<(std::ostream& os, const cv::GMatDesc &desc); } // namespace cv diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gmetaarg.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gmetaarg.hpp index 473be34..75179c3 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gmetaarg.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gmetaarg.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation


 #ifndef OPENCV_GAPI_GMETAARG_HPP
@@ -37,7 +37,7 @@ using GMetaArg = util::variant
     , GScalarDesc
     , GArrayDesc
     >;
-std::ostream& operator<<(std::ostream& os, const GMetaArg &);
+GAPI_EXPORTS std::ostream& operator<<(std::ostream& os, const GMetaArg &);

 using GMetaArgs = std::vector<GMetaArg>;

@@ -61,6 +61,15 @@ namespace detail
 } // namespace detail

+class Mat;
+class UMat;
+GAPI_EXPORTS cv::GMetaArgs descr_of(const std::vector<cv::Mat> &vec);
+GAPI_EXPORTS cv::GMetaArgs descr_of(const std::vector<cv::UMat> &vec);
+namespace gapi { namespace own {
+    class Mat;
+    GAPI_EXPORTS cv::GMetaArgs descr_of(const std::vector<Mat> &vec);
+}} // namespace gapi::own
+
 } // namespace cv

 #endif // OPENCV_GAPI_GMETAARG_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gproto.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gproto.hpp
index 8b53d9b..8c89879 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gproto.hpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gproto.hpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation


 #ifndef OPENCV_GAPI_GPROTO_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/core.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/core.hpp
index 98d49b5..5651020 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/core.hpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/core.hpp
@@ -2,22 +2,22 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation


 #ifndef OPENCV_GAPI_GPU_CORE_API_HPP
 #define OPENCV_GAPI_GPU_CORE_API_HPP
+/** @file
+* @deprecated Use "opencv2/gapi/ocl/core.hpp" instead.
+*/

-#include <opencv2/gapi/own/exports.hpp> // GAPI_EXPORTS
-#include <opencv2/gapi/gkernel.hpp>     // GKernelPackage
+#include "opencv2/gapi/ocl/core.hpp"

 namespace cv {
 namespace gapi {
 namespace core {
 namespace gpu {
-
-GAPI_EXPORTS GKernelPackage kernels();
-
+    using namespace ocl;
 } // namespace gpu
 } // namespace core
 } // namespace gapi
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/ggpukernel.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/ggpukernel.hpp
index e5a6215..34a18b8 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/ggpukernel.hpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/ggpukernel.hpp
@@ -2,243 +2,17 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation


 #ifndef OPENCV_GAPI_GGPUKERNEL_HPP
 #define OPENCV_GAPI_GGPUKERNEL_HPP
+/** @file
+* @deprecated Use "opencv2/gapi/ocl/goclkernel.hpp" instead.
+*/

-#include <vector>
-#include <functional>
-#include <map>
-#include <unordered_map>
+#include "opencv2/gapi/ocl/goclkernel.hpp"
+#define GAPI_GPU_KERNEL GAPI_OCL_KERNEL

-#include <opencv2/core/mat.hpp>
-#include <opencv2/gapi/gcommon.hpp>
-#include <opencv2/gapi/gkernel.hpp>
-#include <opencv2/gapi/garg.hpp>
-
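// [Editor's note - not part of the patch] These gpu/* headers survive only as
// compatibility shims: the #define above forwards GAPI_GPU_KERNEL to
// GAPI_OCL_KERNEL, and in gpu/core.hpp the "using namespace ocl;" inside
// namespace cv::gapi::core::gpu means that an existing call such as
//
//     auto pkg = cv::gapi::core::gpu::kernels();   // old API spelling
//
// now resolves to cv::gapi::core::ocl::kernels() without any source changes.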
-// FIXME: namespace scheme for backends?
-namespace cv {
-
-namespace gimpl
-{
-    // Forward-declare an internal class
-    class GGPUExecutable;
-} // namespace gimpl
-
-namespace gapi
-{
-namespace gpu
-{
-    /**
-     * \addtogroup gapi_std_backends G-API Standard backends
-     * @{
-     */
-    /**
-     * @brief Get a reference to GPU backend.
-     *
-     * At the moment, the GPU backend is built atop of OpenCV
-     * "Transparent API" (T-API), see cv::UMat for details.
-     *
-     * @sa gapi_std_backends
-     */
-    GAPI_EXPORTS cv::gapi::GBackend backend();
-    /** @} */
-} // namespace gpu
-} // namespace gapi
-
-
-// Represents arguments which are passed to a wrapped GPU function
-// FIXME: put into detail?
-class GAPI_EXPORTS GGPUContext
-{
-public:
-    // Generic accessor API
-    template<typename T>
-    const T& inArg(int input) { return m_args.at(input).get<T>(); }
-
-    // Syntax sugar
-    const cv::UMat&  inMat(int input);
-    cv::UMat&  outMatR(int output); // FIXME: Avoid cv::Mat m = ctx.outMatR()
-
-    const cv::gapi::own::Scalar& inVal(int input);
-    cv::gapi::own::Scalar& outValR(int output); // FIXME: Avoid cv::gapi::own::Scalar s = ctx.outValR()
-    template<typename T> std::vector<T>& outVecR(int output) // FIXME: the same issue
-    {
-        return outVecRef(output).wref<T>();
-    }
-
-protected:
-    detail::VectorRef& outVecRef(int output);
-
-    std::vector<GArg> m_args;
-    std::unordered_map<std::size_t, GRunArgP> m_results;
-
-
-    friend class gimpl::GGPUExecutable;
-};
-
-class GAPI_EXPORTS GGPUKernel
-{
-public:
-    // This function is kernel's execution entry point (does the processing work)
-    using F = std::function<void(GGPUContext &)>;
-
-    GGPUKernel();
-    explicit GGPUKernel(const F& f);
-
-    void apply(GGPUContext &ctx);
-
-protected:
-    F m_f;
-};
-
-// FIXME: This is an ugly ad-hoc imlpementation. TODO: refactor
-
-namespace detail
-{
-template<class T> struct gpu_get_in;
-template<> struct gpu_get_in<cv::GMat>
-{
-    static cv::UMat get(GGPUContext &ctx, int idx) { return ctx.inMat(idx); }
-};
-template<> struct gpu_get_in<cv::GScalar>
-{
-    static cv::Scalar get(GGPUContext &ctx, int idx) { return to_ocv(ctx.inVal(idx)); }
-};
-template<typename U> struct gpu_get_in<cv::GArray<U> >
-{
-    static const std::vector<U>& get(GGPUContext &ctx, int idx) { return ctx.inArg<VectorRef>(idx).rref<U>(); }
-};
-template<class T> struct gpu_get_in
-{
-    static T get(GGPUContext &ctx, int idx) { return ctx.inArg<T>(idx); }
-};
-
-struct tracked_cv_umat{
-    //TODO Think if T - API could reallocate UMat to a proper size - how do we handle this ?
-    //tracked_cv_umat(cv::UMat& m) : r{(m)}, original_data{m.getMat(ACCESS_RW).data} {}
-    tracked_cv_umat(cv::UMat& m) : r{ (m) }, original_data{ nullptr } {}
-    cv::UMat r;
-    uchar* original_data;
-
-    operator cv::UMat& (){ return r;}
-    void validate() const{
-        //if (r.getMat(ACCESS_RW).data != original_data)
-        //{
-        //    util::throw_error
-        //        (std::logic_error
-        //         ("OpenCV kernel output parameter was reallocated. \n"
-        //          "Incorrect meta data was provided ?"));
-        //}
-
-    }
-};
-
-struct scalar_wrapper_gpu
-{
-    //FIXME reuse CPU (OpenCV) plugin code
-    scalar_wrapper_gpu(cv::gapi::own::Scalar& s) : m_s{cv::gapi::own::to_ocv(s)}, m_org_s(s) {};
-    operator cv::Scalar& () { return m_s; }
-    void writeBack() const  { m_org_s = to_own(m_s); }
-
-    cv::Scalar m_s;
-    cv::gapi::own::Scalar& m_org_s;
-};
-
-template<typename... Outputs>
-void postprocess_gpu(Outputs&... outs)
-{
-    struct
-    {
-        void operator()(tracked_cv_umat* bm) { bm->validate(); }
-        void operator()(scalar_wrapper_gpu* sw) { sw->writeBack(); }
-        void operator()(...)                  {                  }
-
-    } validate;
-    //dummy array to unfold parameter pack
-    int dummy[] = { 0, (validate(&outs), 0)... };
-    cv::util::suppress_unused_warning(dummy);
-}
-
-template<typename T> struct gpu_get_out;
-template<> struct gpu_get_out<cv::GMat>
-{
-    static tracked_cv_umat get(GGPUContext &ctx, int idx)
-    {
-        auto& r = ctx.outMatR(idx);
-        return{ r };
-    }
-};
-template<> struct gpu_get_out<cv::GScalar>
-{
-    static scalar_wrapper_gpu get(GGPUContext &ctx, int idx)
-    {
-        auto& s = ctx.outValR(idx);
-        return{ s };
-    }
-};
-template<typename U> struct gpu_get_out<cv::GArray<U> >
-{
-    static std::vector<U>& get(GGPUContext &ctx, int idx) { return ctx.outVecR<U>(idx); }
-};
-
-template<typename, typename, typename>
-struct GPUCallHelper;
-
-// FIXME: probably can be simplified with std::apply or analogue.
-template<typename Impl, typename... Ins, typename... Outs>
-struct GPUCallHelper<Impl, std::tuple<Ins...>, std::tuple<Outs...> >
-{
-    template<typename... Inputs>
-    struct call_and_postprocess
-    {
-        template<typename... Outputs>
-        static void call(Inputs&&... ins, Outputs&&... outs)
-        {
-            //not using a std::forward on outs is deliberate in order to
-            //cause compilation error, by tring to bind rvalue references to lvalue references
-            Impl::run(std::forward<Inputs>(ins)..., outs...);
-
-            postprocess_gpu(outs...);
-        }
-    };
-
-    template<int... IIs, int... OIs>
-    static void call_impl(GGPUContext &ctx, detail::Seq<IIs...>, detail::Seq<OIs...>)
-    {
-        //TODO: Make sure that OpenCV kernels do not reallocate memory for output parameters
-        //by comparing it's state (data ptr) before and after the call.
-        //Convert own::Scalar to cv::Scalar before call kernel and run kernel
-        //convert cv::Scalar to own::Scalar after call kernel and write back results
-        call_and_postprocess<decltype(gpu_get_in<Ins>::get(ctx, IIs))...>::call(gpu_get_in<Ins>::get(ctx, IIs)..., gpu_get_out<Outs>::get(ctx, OIs)...);
-    }
-
-    static void call(GGPUContext &ctx)
-    {
-        call_impl(ctx,
-                  typename detail::MkSeq<sizeof...(Ins)>::type(),
-                  typename detail::MkSeq<sizeof...(Outs)>::type());
-    }
-};
-
-} // namespace detail
-
-template<class Impl, class K>
-class GGPUKernelImpl: public detail::GPUCallHelper<Impl, typename K::InArgs, typename K::OutArgs>
-{
-    using P = detail::GPUCallHelper<Impl, typename K::InArgs, typename K::OutArgs>;
-
-public:
-    using API = K;
-
-    static cv::gapi::GBackend backend()  { return cv::gapi::gpu::backend(); }
-    static cv::GGPUKernel     kernel()   { return GGPUKernel(&P::call);     }
-};
-
-#define GAPI_GPU_KERNEL(Name, API) struct Name: public cv::GGPUKernelImpl<Name, API>
-
-} // namespace cv

 #endif // OPENCV_GAPI_GGPUKERNEL_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/imgproc.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/imgproc.hpp
index 6071dda..d83081d 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/imgproc.hpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/imgproc.hpp
@@ -2,22 +2,23 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation


 #ifndef OPENCV_GAPI_GPU_IMGPROC_API_HPP
 #define OPENCV_GAPI_GPU_IMGPROC_API_HPP
+/** @file
+* @deprecated Use "opencv2/gapi/ocl/imgproc.hpp" instead.
+*/
+
+#include "opencv2/gapi/ocl/imgproc.hpp"

-#include <opencv2/gapi/own/exports.hpp> // GAPI_EXPORTS
-#include <opencv2/gapi/gkernel.hpp>     // GKernelPackage

 namespace cv {
 namespace gapi {
 namespace imgproc {
 namespace gpu {
-
-GAPI_EXPORTS GKernelPackage kernels();
-
+    using namespace ocl;
 } // namespace gpu
 } // namespace imgproc
 } // namespace gapi
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gscalar.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gscalar.hpp
index dd1205b..ee2237d 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gscalar.hpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gscalar.hpp
@@ -3,7 +3,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation


 #ifndef OPENCV_GAPI_GSCALAR_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gtype_traits.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gtype_traits.hpp
index d05e02e..09b4910 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gtype_traits.hpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gtype_traits.hpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation


 #ifndef OPENCV_GAPI_GTYPE_TRAITS_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gtyped.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gtyped.hpp
index a966f26..f32d050 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gtyped.hpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gtyped.hpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation


 #ifndef OPENCV_GAPI_GTYPED_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/imgproc.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/imgproc.hpp
index aeed9fa..73b92d2 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/imgproc.hpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/imgproc.hpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation


 #ifndef OPENCV_GAPI_IMGPROC_HPP
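// [Editor's note - illustrative usage, not part of the patch] The new
// cv::gapi::core::ocl::kernels() / cv::gapi::imgproc::ocl::kernels() functions
// added below return a GKernelPackage; a graph opts in to the OCL backend by
// passing such a package through compile arguments, e.g. (the graph itself is
// a hypothetical example):
//
//     cv::GMat in;
//     cv::GComputation graph(in, cv::gapi::blur(in, cv::Size(3, 3)));
//     cv::Mat input = cv::Mat::zeros(480, 640, CV_8UC1), output;
//     graph.apply(input, output,
//                 cv::compile_args(cv::gapi::imgproc::ocl::kernels()));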
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/ocl/core.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/ocl/core.hpp
new file mode 100644
index 0000000..784ee20
--- /dev/null
+++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/ocl/core.hpp
@@ -0,0 +1,27 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2019 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OCL_CORE_API_HPP
+#define OPENCV_GAPI_OCL_CORE_API_HPP
+
+#include <opencv2/gapi/own/exports.hpp> // GAPI_EXPORTS
+#include <opencv2/gapi/gkernel.hpp>     // GKernelPackage
+
+namespace cv {
+namespace gapi {
+namespace core {
+namespace ocl {
+
+    GAPI_EXPORTS GKernelPackage kernels();
+
+} // namespace ocl
+} // namespace core
+} // namespace gapi
+} // namespace cv
+
+
+#endif // OPENCV_GAPI_OCL_CORE_API_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/ocl/goclkernel.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/ocl/goclkernel.hpp
new file mode 100644
index 0000000..8f5c867
--- /dev/null
+++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/ocl/goclkernel.hpp
@@ -0,0 +1,244 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2019 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GOCLKERNEL_HPP
+#define OPENCV_GAPI_GOCLKERNEL_HPP
+
+#include <vector>
+#include <functional>
+#include <map>
+#include <unordered_map>
+
+#include <opencv2/core/mat.hpp>
+#include <opencv2/gapi/gcommon.hpp>
+#include <opencv2/gapi/gkernel.hpp>
+#include <opencv2/gapi/garg.hpp>
+
+// FIXME: namespace scheme for backends?
+namespace cv {
+
+namespace gimpl
+{
+    // Forward-declare an internal class
+    class GOCLExecutable;
+} // namespace gimpl
+
+namespace gapi
+{
+namespace ocl
+{
+    /**
+     * \addtogroup gapi_std_backends G-API Standard backends
+     * @{
+     */
+    /**
+     * @brief Get a reference to OCL backend.
+     *
+     * At the moment, the OCL backend is built atop of OpenCV
+     * "Transparent API" (T-API), see cv::UMat for details.
+     *
+     * @sa gapi_std_backends
+     */
+    GAPI_EXPORTS cv::gapi::GBackend backend();
+    /** @} */
+} // namespace ocl
+} // namespace gapi
+
+
+// Represents arguments which are passed to a wrapped OCL function
+// FIXME: put into detail?
+class GAPI_EXPORTS GOCLContext
+{
+public:
+    // Generic accessor API
+    template<typename T>
+    const T& inArg(int input) { return m_args.at(input).get<T>(); }
+
+    // Syntax sugar
+    const cv::UMat&  inMat(int input);
+    cv::UMat&  outMatR(int output); // FIXME: Avoid cv::Mat m = ctx.outMatR()
+
+    const cv::gapi::own::Scalar& inVal(int input);
+    cv::gapi::own::Scalar& outValR(int output); // FIXME: Avoid cv::gapi::own::Scalar s = ctx.outValR()
+    template<typename T> std::vector<T>& outVecR(int output) // FIXME: the same issue
+    {
+        return outVecRef(output).wref<T>();
+    }
+
+protected:
+    detail::VectorRef& outVecRef(int output);
+
+    std::vector<GArg> m_args;
+    std::unordered_map<std::size_t, GRunArgP> m_results;
+
+
+    friend class gimpl::GOCLExecutable;
+};
+
+class GAPI_EXPORTS GOCLKernel
+{
+public:
+    // This function is kernel's execution entry point (does the processing work)
+    using F = std::function<void(GOCLContext &)>;
+
+    GOCLKernel();
+    explicit GOCLKernel(const F& f);
+
+    void apply(GOCLContext &ctx);
+
+protected:
+    F m_f;
+};
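// [Editor's note - illustrative sketch, not part of the patch] GOCLContext is
// what the detail:: wrappers below feed into a user kernel: inputs arrive as
// cv::UMat / own::Scalar, and outputs are written back through
// outMatR()/outValR(). With the GAPI_OCL_KERNEL macro defined at the end of
// this header, a minimal kernel for a hypothetical GMat->GMat operation GMyOp
// would look like:
//
//     GAPI_OCL_KERNEL(GOCLMyOp, GMyOp)
//     {
//         static void run(const cv::UMat &in, cv::UMat &out)
//         {
//             cv::bitwise_not(in, out);   // any T-API-aware OpenCV call
//         }
//     };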
+
+// FIXME: This is an ugly ad-hoc imlpementation. TODO: refactor
+
+namespace detail
+{
+template<class T> struct ocl_get_in;
+template<> struct ocl_get_in<cv::GMat>
+{
+    static cv::UMat get(GOCLContext &ctx, int idx) { return ctx.inMat(idx); }
+};
+template<> struct ocl_get_in<cv::GScalar>
+{
+    static cv::Scalar get(GOCLContext &ctx, int idx) { return to_ocv(ctx.inVal(idx)); }
+};
+template<typename U> struct ocl_get_in<cv::GArray<U> >
+{
+    static const std::vector<U>& get(GOCLContext &ctx, int idx) { return ctx.inArg<VectorRef>(idx).rref<U>(); }
+};
+template<class T> struct ocl_get_in
+{
+    static T get(GOCLContext &ctx, int idx) { return ctx.inArg<T>(idx); }
+};
+
+struct tracked_cv_umat{
+    //TODO Think if T - API could reallocate UMat to a proper size - how do we handle this ?
+    //tracked_cv_umat(cv::UMat& m) : r{(m)}, original_data{m.getMat(ACCESS_RW).data} {}
+    tracked_cv_umat(cv::UMat& m) : r{ (m) }, original_data{ nullptr } {}
+    cv::UMat r;
+    uchar* original_data;
+
+    operator cv::UMat& (){ return r;}
+    void validate() const{
+        //if (r.getMat(ACCESS_RW).data != original_data)
+        //{
+        //    util::throw_error
+        //        (std::logic_error
+        //         ("OpenCV kernel output parameter was reallocated. \n"
+        //          "Incorrect meta data was provided ?"));
+        //}
+
+    }
+};
+
+struct scalar_wrapper_ocl
+{
+    //FIXME reuse CPU (OpenCV) plugin code
+    scalar_wrapper_ocl(cv::gapi::own::Scalar& s) : m_s{cv::gapi::own::to_ocv(s)}, m_org_s(s) {};
+    operator cv::Scalar& () { return m_s; }
+    void writeBack() const  { m_org_s = to_own(m_s); }
+
+    cv::Scalar m_s;
+    cv::gapi::own::Scalar& m_org_s;
+};
+
+template<typename... Outputs>
+void postprocess_ocl(Outputs&... outs)
+{
+    struct
+    {
+        void operator()(tracked_cv_umat* bm) { bm->validate(); }
+        void operator()(scalar_wrapper_ocl* sw) { sw->writeBack(); }
+        void operator()(...)                  {                  }
+
+    } validate;
+    //dummy array to unfold parameter pack
+    int dummy[] = { 0, (validate(&outs), 0)... };
+    cv::util::suppress_unused_warning(dummy);
+}
+
+template<typename T> struct ocl_get_out;
+template<> struct ocl_get_out<cv::GMat>
+{
+    static tracked_cv_umat get(GOCLContext &ctx, int idx)
+    {
+        auto& r = ctx.outMatR(idx);
+        return{ r };
+    }
+};
+template<> struct ocl_get_out<cv::GScalar>
+{
+    static scalar_wrapper_ocl get(GOCLContext &ctx, int idx)
+    {
+        auto& s = ctx.outValR(idx);
+        return{ s };
+    }
+};
+template<typename U> struct ocl_get_out<cv::GArray<U> >
+{
+    static std::vector<U>& get(GOCLContext &ctx, int idx) { return ctx.outVecR<U>(idx); }
+};
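// [Editor's note - illustrative sketch, not part of the patch] postprocess_ocl
// above uses the pre-C++17 pack-unfolding idiom: initializing an int array
// forces validate(&out) to run once per output, left to right, with overload
// resolution picking the right action per output type. A standalone
// equivalent (requires <cstdio>):
//
//     struct print_visitor {
//         void operator()(int* v) { std::printf("int: %d\n", *v); }
//         void operator()(...)    { std::printf("other\n");      }
//     };
//     template<typename... Ts>
//     void visit_all(Ts&... vals) {
//         print_visitor v;
//         int dummy[] = { 0, (v(&vals), 0)... };  // evaluated left-to-right
//         (void)dummy;
//     }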
+
+template<typename, typename, typename>
+struct OCLCallHelper;
+
+// FIXME: probably can be simplified with std::apply or analogue.
+template<typename Impl, typename... Ins, typename... Outs>
+struct OCLCallHelper<Impl, std::tuple<Ins...>, std::tuple<Outs...> >
+{
+    template<typename... Inputs>
+    struct call_and_postprocess
+    {
+        template<typename... Outputs>
+        static void call(Inputs&&... ins, Outputs&&... outs)
+        {
+            //not using a std::forward on outs is deliberate in order to
+            //cause compilation error, by tring to bind rvalue references to lvalue references
+            Impl::run(std::forward<Inputs>(ins)..., outs...);
+
+            postprocess_ocl(outs...);
+        }
+    };
+
+    template<int... IIs, int... OIs>
+    static void call_impl(GOCLContext &ctx, detail::Seq<IIs...>, detail::Seq<OIs...>)
+    {
+        //TODO: Make sure that OpenCV kernels do not reallocate memory for output parameters
+        //by comparing it's state (data ptr) before and after the call.
+        //Convert own::Scalar to cv::Scalar before call kernel and run kernel
+        //convert cv::Scalar to own::Scalar after call kernel and write back results
+        call_and_postprocess<decltype(ocl_get_in<Ins>::get(ctx, IIs))...>::call(ocl_get_in<Ins>::get(ctx, IIs)..., ocl_get_out<Outs>::get(ctx, OIs)...);
+    }
+
+    static void call(GOCLContext &ctx)
+    {
+        call_impl(ctx,
+                  typename detail::MkSeq<sizeof...(Ins)>::type(),
+                  typename detail::MkSeq<sizeof...(Outs)>::type());
+    }
+};
+
+} // namespace detail
+
+template<class Impl, class K>
+class GOCLKernelImpl: public detail::OCLCallHelper<Impl, typename K::InArgs, typename K::OutArgs>
+{
+    using P = detail::OCLCallHelper<Impl, typename K::InArgs, typename K::OutArgs>;
+
+public:
+    using API = K;
+
+    static cv::gapi::GBackend backend()  { return cv::gapi::ocl::backend(); }
+    static cv::GOCLKernel     kernel()   { return GOCLKernel(&P::call);     }
+};
+
+#define GAPI_OCL_KERNEL(Name, API) struct Name: public cv::GOCLKernelImpl<Name, API>
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GOCLKERNEL_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/ocl/imgproc.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/ocl/imgproc.hpp
new file mode 100644
index 0000000..2330348
--- /dev/null
+++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/ocl/imgproc.hpp
@@ -0,0 +1,27 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2019 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OCL_IMGPROC_API_HPP
+#define OPENCV_GAPI_OCL_IMGPROC_API_HPP
+
+#include <opencv2/gapi/own/exports.hpp> // GAPI_EXPORTS
+#include <opencv2/gapi/gkernel.hpp>     // GKernelPackage
+
+namespace cv {
+namespace gapi {
+namespace imgproc {
+namespace ocl {
+
+    GAPI_EXPORTS GKernelPackage kernels();
+
+} // namespace ocl
+} // namespace imgproc
+} // namespace gapi
+} // namespace cv
+
+
+#endif // OPENCV_GAPI_OCL_IMGPROC_API_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/opencv_includes.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/opencv_includes.hpp
index 5acf280..51e1318 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/opencv_includes.hpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/opencv_includes.hpp
@@ -3,7 +3,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation


 #ifndef OPENCV_GAPI_OPENCV_INCLUDES_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/operators.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/operators.hpp
index 27a1d80..2143b3a 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/operators.hpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/operators.hpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_OPERATORS_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/assert.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/assert.hpp index 8d3feff..5cdfdf8 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/assert.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/assert.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_OWN_ASSERT_HPP @@ -10,7 +10,8 @@ #if !defined(GAPI_STANDALONE) #include -#define GAPI_Assert(expr) CV_Assert(expr) +#define GAPI_Assert CV_Assert +#define GAPI_DbgAssert CV_DbgAssert #else #include @@ -30,7 +31,6 @@ namespace detail #define GAPI_Assert(expr) \ { if (!(expr)) ::detail::assert_abort(#expr, __LINE__, __FILE__, __func__); } -#endif #ifdef NDEBUG # define GAPI_DbgAssert(expr) @@ -38,4 +38,6 @@ namespace detail # define GAPI_DbgAssert(expr) GAPI_Assert(expr) #endif +#endif // GAPI_STANDALONE + #endif // OPENCV_GAPI_OWN_ASSERT_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/convert.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/convert.hpp index 8c1feb4..0fcc781 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/convert.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/convert.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_OWN_CONVERT_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/cvdefs.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/cvdefs.hpp index e110536..696a3ed 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/cvdefs.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/cvdefs.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_CV_DEFS_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/exports.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/exports.hpp index 0d955d0..3c5c4b5 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/exports.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/exports.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_OWN_TYPES_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/mat.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/mat.hpp index 73f3afc..e761a38 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/mat.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/mat.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_OWN_MAT_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/saturate.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/saturate.hpp index 207dcde..7b39e61 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/saturate.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/saturate.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_OWN_SATURATE_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/scalar.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/scalar.hpp index bda91c8..b538ba2 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/scalar.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/scalar.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GAPI_OWN_SCALAR_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/types.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/types.hpp index 20445ee..8763234 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/types.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/types.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_TYPES_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/any.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/any.hpp index 3146cb6..73087c6 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/any.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/any.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_UTIL_ANY_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/compiler_hints.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/compiler_hints.hpp index 575655e..3204b00 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/compiler_hints.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/compiler_hints.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_UTIL_COMPILER_HINTS_HPP #define OPENCV_GAPI_UTIL_COMPILER_HINTS_HPP @@ -16,6 +16,4 @@ namespace util } // namespace util } // namespace cv -#define UNUSED(x) cv::util::suppress_unused_warning(x) - #endif /* OPENCV_GAPI_UTIL_COMPILER_HINTS_HPP */ diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/optional.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/optional.hpp index 54126d6..254d7ed 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/optional.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/optional.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_UTIL_OPTIONAL_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/throw.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/throw.hpp index 689bf58..191f669 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/throw.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/throw.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_UTIL_THROW_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/util.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/util.hpp index d0378e0..0cf81e6 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/util.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/util.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation


 #ifndef OPENCV_GAPI_UTIL_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/variant.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/variant.hpp
index cb0270a..4488d84 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/variant.hpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/variant.hpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation


 #ifndef OPENCV_GAPI_UTIL_VARIANT_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests.cpp
index 2df4d88..33cbba1 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests.cpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests.cpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation


 #include "perf_precomp.hpp"
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests.hpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests.hpp
index 8af7b1a..77fe427 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests.hpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests.hpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation


 #ifndef OPENCV_GAPI_CORE_PERF_TESTS_HPP
@@ -50,9 +50,9 @@ namespace opencv_test
     class MaxPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
     class AbsDiffPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
     class AbsDiffCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
-    class SumPerfTest : public TestPerfParams<tuple<cv::Size, MatType, double, cv::GCompileArgs>> {};
-    class AddWeightedPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, double, cv::GCompileArgs>> {};
-    class NormPerfTest : public TestPerfParams<tuple<NormTypes, cv::Size, MatType, double, cv::GCompileArgs>> {};
+    class SumPerfTest : public TestPerfParams<tuple<compare_scalar_f, cv::Size, MatType, cv::GCompileArgs>> {};
+    class AddWeightedPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class NormPerfTest : public TestPerfParams<tuple<compare_scalar_f, NormTypes, cv::Size, MatType, cv::GCompileArgs>> {};
     class IntegralPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
     class ThresholdPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
    class ThresholdOTPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
index f49e061..cce548a 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_CORE_PERF_TESTS_INL_HPP @@ -900,13 +900,13 @@ PERF_TEST_P_(AbsDiffCPerfTest, TestPerformance) PERF_TEST_P_(SumPerfTest, TestPerformance) { - cv::Size sz_in = get<0>(GetParam()); - MatType type = get<1>(GetParam()); - double tolerance = get<2>(GetParam()); + compare_scalar_f cmpF = get<0>(GetParam()); + cv::Size sz_in = get<1>(GetParam()); + MatType type = get<2>(GetParam()); cv::GCompileArgs compile_args = get<3>(GetParam()); - initMatrixRandU(type, sz_in, false); + initMatrixRandU(type, sz_in, type, false); cv::Scalar out_sum; cv::Scalar out_sum_ocv; @@ -928,8 +928,7 @@ PERF_TEST_P_(SumPerfTest, TestPerformance) // Comparison //////////////////////////////////////////////////////////// { - EXPECT_LE(std::abs(out_sum[0] - out_sum_ocv[0]) / std::max(1.0, std::abs(out_sum_ocv[0])), tolerance) - << "OCV=" << out_sum_ocv[0] << " GAPI=" << out_sum[0]; + EXPECT_TRUE(cmpF(out_sum, out_sum_ocv)); } SANITY_CHECK_NOTHING(); @@ -939,10 +938,10 @@ PERF_TEST_P_(SumPerfTest, TestPerformance) PERF_TEST_P_(AddWeightedPerfTest, TestPerformance) { - cv::Size sz_in = get<0>(GetParam()); - MatType type = get<1>(GetParam()); - int dtype = get<2>(GetParam()); - double tolerance = get<3>(GetParam()); + compare_f cmpF = get<0>(GetParam()); + cv::Size sz_in = get<1>(GetParam()); + MatType type = get<2>(GetParam()); + int dtype = get<3>(GetParam()); cv::GCompileArgs compile_args = get<4>(GetParam()); auto& rng = cv::theRNG(); @@ -968,45 +967,9 @@ PERF_TEST_P_(AddWeightedPerfTest, TestPerformance) } // Comparison //////////////////////////////////////////////////////////// - // FIXIT unrealiable check - if (0) - { - // Note, that we cannot expect bitwise results for add-weighted: - // - // tmp = src1*alpha + src2*beta + gamma; - // dst = saturate( round(tmp) ); - // - // Because tmp is floating-point, dst depends on compiler optimizations - // - // However, we must expect good accuracy of tmp, and rounding correctly - - cv::Mat failures; - - if (out_mat_ocv.type() == CV_32FC1) - { - // result: float - may vary in 7th decimal digit - failures = abs(out_mat_gapi - out_mat_ocv) > abs(out_mat_ocv) * 1e-6; - } - else - { - // result: integral - rounding may vary if fractional part of tmp - // is nearly 0.5 - - cv::Mat inexact, incorrect, diff, tmp; - - inexact = out_mat_gapi != out_mat_ocv; - - // even if rounded differently, check if still rounded correctly - cv::addWeighted(in_mat1, alpha, in_mat2, beta, gamma, tmp, CV_32F); - cv::subtract(out_mat_gapi, tmp, diff, cv::noArray(), CV_32F); - incorrect = abs(diff) >= tolerance;// 0.5000005f; // relative to 6 digits - - failures = inexact & incorrect; - } - - EXPECT_EQ(0, cv::countNonZero(failures)); - EXPECT_EQ(out_mat_gapi.size(), sz_in); - } + EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv)); + EXPECT_EQ(out_mat_gapi.size(), sz_in); + SANITY_CHECK_NOTHING(); } @@ -1015,10 +978,10 @@ PERF_TEST_P_(AddWeightedPerfTest, TestPerformance) PERF_TEST_P_(NormPerfTest, TestPerformance) { - NormTypes opType = get<0>(GetParam()); - cv::Size sz = get<1>(GetParam()); - MatType type = get<2>(GetParam()); - double tolerance = get<3>(GetParam()); + compare_scalar_f cmpF = get<0>(GetParam()); + NormTypes opType = get<1>(GetParam()); + cv::Size sz = get<2>(GetParam()); + MatType type = get<3>(GetParam()); cv::GCompileArgs compile_args = get<4>(GetParam()); @@ -1051,8 +1014,7 @@ PERF_TEST_P_(NormPerfTest, TestPerformance) // Comparison 
//////////////////////////////////////////////////////////// { - EXPECT_LE(std::abs(out_norm[0] - out_norm_ocv[0]) / std::max(1.0, std::abs(out_norm_ocv[0])), tolerance) - << "OCV=" << out_norm_ocv[0] << " GAPI=" << out_norm[0]; + EXPECT_TRUE(cmpF(out_norm, out_norm_ocv)); } SANITY_CHECK_NOTHING(); diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests.cpp index 5a2ffb8..387ffb8 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "perf_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests.hpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests.hpp index 750c069..c2e65b9 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_IMGPROC_PERF_TESTS_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp index 5a13cfe..e210bd0 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
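The Sum/AddWeighted/Norm hunks above replace the ad-hoc tolerance math inside each test body with a comparison functor injected through the test parameters (cmpF = get<0>(GetParam())), which is what the reconstructed class declarations and the CPU/GPU instantiations below pass via to_compare_f(). A minimal sketch of that pattern, assuming compare_scalar_f is a std::function wrapper as in the G-API test utilities; the wrapper definition here is illustrative, not the exact one from the test suite:

#include <cmath>
#include <algorithm>
#include <functional>
#include <opencv2/core.hpp>

using compare_scalar_f = std::function<bool(const cv::Scalar&, const cv::Scalar&)>;

struct AbsToleranceScalar
{
    double tol;
    explicit AbsToleranceScalar(double t) : tol(t) {}
    bool operator()(const cv::Scalar &out, const cv::Scalar &ref) const
    {
        // same relative check the old inline EXPECT_LE performed
        return std::abs(out[0] - ref[0]) / std::max(1.0, std::abs(ref[0])) <= tol;
    }
    compare_scalar_f to_compare_f() { return *this; }
};

// In a test body this reduces to: EXPECT_TRUE(cmpF(out_sum, out_sum_ocv));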
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_IMGPROC_PERF_TESTS_INL_HPP @@ -52,7 +52,7 @@ PERF_TEST_P_(SepFilterPerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -100,7 +100,7 @@ PERF_TEST_P_(Filter2DPerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -145,7 +145,7 @@ PERF_TEST_P_(BoxFilterPerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -188,7 +188,7 @@ PERF_TEST_P_(BlurPerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -230,7 +230,7 @@ PERF_TEST_P_(GaussianBlurPerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -271,7 +271,7 @@ PERF_TEST_P_(MedianBlurPerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -314,7 +314,7 @@ PERF_TEST_P_(ErodePerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -357,7 +357,7 @@ PERF_TEST_P_(Erode3x3PerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -400,7 +400,7 @@ PERF_TEST_P_(DilatePerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -443,7 +443,7 @@ PERF_TEST_P_(Dilate3x3PerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -526,7 +526,7 @@ PERF_TEST_P_(CannyPerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -564,7 +564,7 @@ PERF_TEST_P_(EqHistPerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -830,7 +830,7 @@ PERF_TEST_P_(LUV2BGRPerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp 
b/inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp index 6957401..4a3a8c7 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "../perf_precomp.hpp" @@ -152,24 +152,24 @@ INSTANTIATE_TEST_CASE_P(AbsDiffCPerfTestCPU, AbsDiffCPerfTest, Values(cv::compile_args(CORE_CPU)))); INSTANTIATE_TEST_CASE_P(SumPerfTestCPU, SumPerfTest, - Combine(Values(szSmall128, szVGA, sz720p, sz1080p), + Combine(Values(AbsToleranceScalar(0.0).to_compare_f()), + Values(szSmall128, szVGA, sz720p, sz1080p), Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), - Values(0.0), + //Values(0.0), Values(cv::compile_args(CORE_CPU)))); -// FIXME: Comparison introduced by YL doesn't work with C3 INSTANTIATE_TEST_CASE_P(AddWeightedPerfTestCPU, AddWeightedPerfTest, - Combine(Values(szSmall128, szVGA, sz720p, sz1080p), - Values(CV_8UC1, /*CV_8UC3,*/ CV_16UC1, CV_16SC1, CV_32FC1), + Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()), + Values(szSmall128, szVGA, sz720p, sz1080p), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), Values(-1, CV_8U, CV_16U, CV_32F), - Values(0.5000005), Values(cv::compile_args(CORE_CPU)))); INSTANTIATE_TEST_CASE_P(NormPerfTestCPU, NormPerfTest, - Combine(Values(NORM_INF, NORM_L1, NORM_L2), + Combine(Values(AbsToleranceScalar(0.0).to_compare_f()), + Values(NORM_INF, NORM_L1, NORM_L2), Values(szSmall128, szVGA, sz720p, sz1080p), Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), - Values(0.0), Values(cv::compile_args(CORE_CPU)))); INSTANTIATE_TEST_CASE_P(IntegralPerfTestCPU, IntegralPerfTest, diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_cpu.cpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_cpu.cpp index ea3d753..4c84210 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_cpu.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_cpu.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "../perf_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp index a5d13e6..964a03a 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
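The repeated one-line change in the imgproc performance tests above removes std::move(compile_args) from the c.apply() call inside TEST_CYCLE(). The likely intent (an assumption, not stated in the patch) is that the graph is already compiled by an earlier warm-up apply() carrying the compile arguments, so the timed loop measures pure execution; GComputation caches its last compilation while input metadata is unchanged. A hedged sketch of that usage pattern:

#include <opencv2/core.hpp>
#include <opencv2/gapi.hpp>

static void timed_runs(cv::GComputation &c, const cv::Mat &in, cv::Mat &out,
                       cv::GCompileArgs &&args)
{
    // Warm-up run: compiles the graph with the given args and caches the result.
    c.apply(in, out, std::move(args));

    // Hot loop (what TEST_CYCLE() measures): metas are unchanged, so apply()
    // reuses the cached compiled graph instead of recompiling each iteration.
    for (int i = 0; i < 100; i++)
        c.apply(in, out);
}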
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "../perf_precomp.hpp" @@ -13,9 +13,101 @@ namespace opencv_test { - INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid, SobelPerfTest, - Combine(Values(AbsExact().to_compare_f()), - Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1), // add CV_32FC1 when ready +INSTANTIATE_TEST_CASE_P(SepFilterPerfTestFluid_8U, SepFilterPerfTest, + Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()), + Values(CV_8UC1, CV_8UC3), + Values(3), + Values(szVGA, sz720p, sz1080p), + Values(-1, CV_16S, CV_32F), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(SepFilterPerfTestFluid_other, SepFilterPerfTest, + Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()), + Values(CV_16UC1, CV_16SC1, CV_32FC1), + Values(3), + Values(szVGA, sz720p, sz1080p), + Values(-1, CV_32F), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(Filter2DPerfTestFluid, Filter2DPerfTest, + Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(3), // add 4, 5, 7 when kernel is ready + Values(szVGA, sz720p, sz1080p), + Values(cv::BORDER_DEFAULT), + Values(-1, CV_32F), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(BoxFilterPerfTestFluid, BoxFilterPerfTest, + Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(3), // add size=5, when kernel is ready + Values(szVGA, sz720p, sz1080p), + Values(cv::BORDER_DEFAULT), + Values(-1, CV_32F), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(BlurPerfTestFluid, BlurPerfTest, + Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(3), // add size=5, when kernel is ready + Values(szVGA, sz720p, sz1080p), + Values(cv::BORDER_DEFAULT), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(GaussianBlurPerfTestFluid, GaussianBlurPerfTest, + Combine(Values(ToleranceFilter(1e-3f, 0.01).to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(3), // add size=5, when kernel is ready + Values(szVGA, sz720p, sz1080p), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(MedianBlurPerfTestFluid, MedianBlurPerfTest, + Combine(Values(AbsExact().to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(3), // add size=5, when kernel is ready + Values(szVGA, sz720p, sz1080p), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(ErodePerfTestFluid, ErodePerfTest, + Combine(Values(AbsExact().to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(3), // add size=5, when kernel is ready + Values(szVGA, sz720p, sz1080p), + Values(cv::MorphShapes::MORPH_RECT, + cv::MorphShapes::MORPH_CROSS, + cv::MorphShapes::MORPH_ELLIPSE), + Values(cv::compile_args(IMGPROC_FLUID)))); + +// GAPI/fluid does not support iterations parameter for the Erode kernel +INSTANTIATE_TEST_CASE_P(DISABLED_Erode3x3PerfTestFluid, Erode3x3PerfTest, + Combine(Values(AbsExact().to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(szVGA, sz720p, sz1080p), + Values(1, 2, 4), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(DilatePerfTestFluid, DilatePerfTest, + Combine(Values(AbsExact().to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + 
Values(3), // add size=5, when kernel is ready + Values(szVGA, sz720p, sz1080p), + Values(cv::MorphShapes::MORPH_RECT, + cv::MorphShapes::MORPH_CROSS, + cv::MorphShapes::MORPH_ELLIPSE), + Values(cv::compile_args(IMGPROC_FLUID)))); + +// GAPI/fluid does not support iterations parameter for the Dilate kernel +INSTANTIATE_TEST_CASE_P(DISABLED_Dilate3x3PerfTestFluid, Dilate3x3PerfTest, + Combine(Values(AbsExact().to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(szVGA, sz720p, sz1080p), + Values(1, 2, 4), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid, SobelPerfTest, + Combine(Values(AbsExact().to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1), Values(3), // add 5x5 once supported Values(szVGA, sz720p, sz1080p), Values(-1, CV_16S, CV_32F), @@ -23,8 +115,8 @@ namespace opencv_test Values(1, 2), Values(cv::compile_args(IMGPROC_FLUID)))); - INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid32F, SobelPerfTest, - Combine(Values(ToleranceFilter(1e-3f, 0.0).to_compare_f()), +INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid32F, SobelPerfTest, + Combine(Values(ToleranceFilter(1e-3f, 0.0).to_compare_f()), Values(CV_32FC1), Values(3), // add 5x5 once supported Values(szVGA, sz720p, sz1080p), @@ -33,44 +125,44 @@ namespace opencv_test Values(1, 2), Values(cv::compile_args(IMGPROC_FLUID)))); - INSTANTIATE_TEST_CASE_P(RGB2GrayPerfTestFluid, RGB2GrayPerfTest, - Combine(Values(ToleranceColor(1e-3).to_compare_f()), - Values(szVGA, sz720p, sz1080p), - Values(cv::compile_args(IMGPROC_FLUID)))); - - INSTANTIATE_TEST_CASE_P(BGR2GrayPerfTestFluid, BGR2GrayPerfTest, - Combine(Values(ToleranceColor(1e-3).to_compare_f()), - Values(szVGA, sz720p, sz1080p), - Values(cv::compile_args(IMGPROC_FLUID)))); - - INSTANTIATE_TEST_CASE_P(RGB2YUVPerfTestFluid, RGB2YUVPerfTest, - Combine(Values(ToleranceColor(1e-3).to_compare_f()), - Values(szVGA, sz720p, sz1080p), - Values(cv::compile_args(IMGPROC_FLUID)))); - - INSTANTIATE_TEST_CASE_P(YUV2RGBPerfTestFluid, YUV2RGBPerfTest, - Combine(Values(ToleranceColor(1e-3).to_compare_f()), - Values(szVGA, sz720p, sz1080p), - Values(cv::compile_args(IMGPROC_FLUID)))); - - INSTANTIATE_TEST_CASE_P(BGR2YUVPerfTestFluid, BGR2YUVPerfTest, - Combine(Values(ToleranceColor(1e-3).to_compare_f()), - Values(szVGA, sz720p, sz1080p), - Values(cv::compile_args(IMGPROC_FLUID)))); - - INSTANTIATE_TEST_CASE_P(YUV2BGRPerfTestFluid, YUV2BGRPerfTest, - Combine(Values(ToleranceColor(1e-3).to_compare_f()), - Values(szVGA, sz720p, sz1080p), - Values(cv::compile_args(IMGPROC_FLUID)))); - - INSTANTIATE_TEST_CASE_P(BGR2LUVPerfTestFluid, BGR2LUVPerfTest, - Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()), - Values(szVGA, sz720p, sz1080p), - Values(cv::compile_args(IMGPROC_FLUID)))); - - INSTANTIATE_TEST_CASE_P(RGB2LabPerfTestFluid, RGB2LabPerfTest, - Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()), - Values(szVGA, sz720p, sz1080p), - Values(cv::compile_args(IMGPROC_FLUID)))); +INSTANTIATE_TEST_CASE_P(RGB2GrayPerfTestFluid, RGB2GrayPerfTest, + Combine(Values(ToleranceColor(1e-3).to_compare_f()), + Values(szVGA, sz720p, sz1080p), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(BGR2GrayPerfTestFluid, BGR2GrayPerfTest, + Combine(Values(ToleranceColor(1e-3).to_compare_f()), + Values(szVGA, sz720p, sz1080p), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(RGB2YUVPerfTestFluid, RGB2YUVPerfTest, + Combine(Values(ToleranceColor(1e-3).to_compare_f()), + Values(szVGA, sz720p, sz1080p), + 
Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(YUV2RGBPerfTestFluid, YUV2RGBPerfTest, + Combine(Values(ToleranceColor(1e-3).to_compare_f()), + Values(szVGA, sz720p, sz1080p), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(BGR2YUVPerfTestFluid, BGR2YUVPerfTest, + Combine(Values(ToleranceColor(1e-3).to_compare_f()), + Values(szVGA, sz720p, sz1080p), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(YUV2BGRPerfTestFluid, YUV2BGRPerfTest, + Combine(Values(ToleranceColor(1e-3).to_compare_f()), + Values(szVGA, sz720p, sz1080p), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(BGR2LUVPerfTestFluid, BGR2LUVPerfTest, + Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()), + Values(szVGA, sz720p, sz1080p), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(RGB2LabPerfTestFluid, RGB2LabPerfTest, + Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()), + Values(szVGA, sz720p, sz1080p), + Values(cv::compile_args(IMGPROC_FLUID)))); } diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp index 652cbae..b1ebc5d 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp @@ -2,12 +2,11 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "../perf_precomp.hpp" #include "../common/gapi_core_perf_tests.hpp" -#include "opencv2/gapi/gpu/core.hpp" #define CORE_GPU cv::gapi::core::gpu::kernels() @@ -153,24 +152,23 @@ INSTANTIATE_TEST_CASE_P(AbsDiffCPerfTestGPU, AbsDiffCPerfTest, Values(cv::compile_args(CORE_GPU)))); INSTANTIATE_TEST_CASE_P(SumPerfTestGPU, SumPerfTest, - Combine(Values( szSmall128, szVGA, sz720p, sz1080p ), + Combine(Values(AbsToleranceScalar(1e-5).to_compare_f()), + Values( szSmall128, szVGA, sz720p, sz1080p ), Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ), - Values(4.0), //TODO: too relaxed? Values(cv::compile_args(CORE_GPU)))); -// FIXME: Comparison introduced by YL doesn't work with C3 INSTANTIATE_TEST_CASE_P(AddWeightedPerfTestGPU, AddWeightedPerfTest, - Combine(Values( szSmall128, szVGA, sz720p, sz1080p ), - Values( CV_8UC1, /*CV_8UC3,*/ CV_16UC1, CV_16SC1, CV_32FC1 ), + Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()), + Values( szSmall128, szVGA, sz720p, sz1080p ), + Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ), Values( -1, CV_8U, CV_16U, CV_32F ), - Values(0.50005), Values(cv::compile_args(CORE_GPU)))); INSTANTIATE_TEST_CASE_P(NormPerfTestGPU, NormPerfTest, - Combine(Values(NORM_INF, NORM_L1, NORM_L2), + Combine(Values(AbsToleranceScalar(1e-5).to_compare_f()), + Values(NORM_INF, NORM_L1, NORM_L2), Values( szSmall128, szVGA, sz720p, sz1080p ), Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ), - Values(4.0), //TODO: too relaxed? 
Values(cv::compile_args(CORE_GPU)))); INSTANTIATE_TEST_CASE_P(IntegralPerfTestGPU, IntegralPerfTest, diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/gpu/gapi_imgproc_perf_tests_gpu.cpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/gpu/gapi_imgproc_perf_tests_gpu.cpp index 14ef606..0976299 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/perf/gpu/gapi_imgproc_perf_tests_gpu.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/perf/gpu/gapi_imgproc_perf_tests_gpu.cpp @@ -2,12 +2,11 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "../perf_precomp.hpp" #include "../common/gapi_imgproc_perf_tests.hpp" -#include "opencv2/gapi/gpu/imgproc.hpp" #define IMGPROC_GPU cv::gapi::imgproc::gpu::kernels() @@ -109,10 +108,20 @@ INSTANTIATE_TEST_CASE_P(Dilate3x3PerfTestGPU, Dilate3x3PerfTest, INSTANTIATE_TEST_CASE_P(SobelPerfTestGPU, SobelPerfTest, Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()), - Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1/*, CV_32FC1*/), //TODO: CV_32FC1 fails accuracy + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1), Values(3, 5), Values(szVGA, sz720p, sz1080p), - Values(-1, CV_32F), + Values(-1, CV_16S, CV_32F), + Values(0, 1), + Values(1, 2), + Values(cv::compile_args(IMGPROC_GPU)))); + +INSTANTIATE_TEST_CASE_P(SobelPerfTestGPU32F, SobelPerfTest, + Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()), + Values(CV_32FC1), + Values(3, 5), + Values(szVGA, sz720p, sz1080p), + Values(CV_32F), Values(0, 1), Values(1, 2), Values(cv::compile_args(IMGPROC_GPU)))); diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/internal/gapi_compiler_perf_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/internal/gapi_compiler_perf_tests.cpp index 48786b6..5ada23d 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/perf/internal/gapi_compiler_perf_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/perf/internal/gapi_compiler_perf_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "perf_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/perf_main.cpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/perf_main.cpp index 8d6d77e..ff8aba0 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/perf/perf_main.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/perf/perf_main.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
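For readers unfamiliar with the googletest machinery used by all of these instantiations: Combine() produces the cartesian product of its Values() lists, and each resulting tuple becomes one parameterized test run. A self-contained illustration with a hypothetical fixture (not part of this patch):

#include <gtest/gtest.h>
#include <tuple>

// Every combination of the two lists below becomes one test: 3 x 2 = 6 runs.
class GridTest : public ::testing::TestWithParam<std::tuple<int, double>> {};

TEST_P(GridTest, Runs)
{
    int n; double tol;
    std::tie(n, tol) = GetParam();
    EXPECT_GE(n * tol, 0.0);
}

INSTANTIATE_TEST_CASE_P(Demo, GridTest,
    ::testing::Combine(::testing::Values(1, 2, 3),     // 3 values
                       ::testing::Values(0.5, 1.0)));  // x 2 values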
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "perf_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/perf_precomp.hpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/perf_precomp.hpp index abd7cbe..f0eba6a 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/perf/perf_precomp.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/perf/perf_precomp.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef __OPENCV_GAPI_PERF_PRECOMP_HPP__ @@ -17,6 +17,8 @@ #include "opencv2/gapi/core.hpp" #include "opencv2/gapi/cpu/gcpukernel.hpp" #include "opencv2/gapi/gpu/ggpukernel.hpp" +#include "opencv2/gapi/gpu/imgproc.hpp" +#include "opencv2/gapi/gpu/core.hpp" #include "opencv2/gapi/operators.hpp" #include "opencv2/gapi/fluid/core.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gapi_priv.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gapi_priv.cpp index 744db16..bb865c4 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gapi_priv.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gapi_priv.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gapi_priv.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gapi_priv.hpp index edab0a0..fce1da5 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gapi_priv.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gapi_priv.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_PRIV_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/garray.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/garray.cpp index 0fd19a7..90a5d3d 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/garray.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/garray.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gbackend.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gbackend.cpp index 8144d21..37307ce 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gbackend.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gbackend.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gbackend_priv.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gbackend_priv.hpp index 1c6e297..b7f483b 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gbackend_priv.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gbackend_priv.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef GAPI_API_GBACKEND_PRIV_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcall.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcall.cpp index 2dd823d..e035052 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcall.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcall.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" @@ -28,9 +28,14 @@ cv::GCall::GCall(const cv::GKernel &k) cv::GCall::~GCall() { + // FIXME: current behavior of the destructor can cause trouble in a threaded environment. GCall + // is not supposed to be accessed for modification within multiple threads. There should be a + // way to somehow ensure that no problem occurs in the future. For now, this is a reminder that + // GCall is not supposed to be copied inside a code block that is executed in parallel. + // When a GCall object is destroyed (and GCall::Priv is likely still alive, // as there might be other references), reset m_node to break cycle. - m_priv->m_node = GNode(); + m_priv->m_node = GNode(); } void cv::GCall::setArgs(std::vector<GArg> &&args) diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcall_priv.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcall_priv.hpp index ffb122e..122303a 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcall_priv.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcall_priv.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GCALL_PRIV_HPP @@ -19,13 +19,31 @@ namespace cv { +// GCall is used to capture details (arguments) passed to an operation when the graph is +// constructed. It is, in fact, just a "serialization" of a function call (to some extent). The +// only place where new GCall objects are constructed is KernelName::on(). Note that GCall not +// only stores its input arguments, but also yields operation's pseudo-results to return +// "results". +// GCall arguments are GArgs which can wrap either our special types (like GMat) or other +// stuff user may pass according to operation's signature (opaque to us). +// If a dynamic g-object is wrapped in GArg, it has origin - something where that object comes +// from. It is either another function call (again, a GCall) or nothing (for graph's starting +// points, for example).
By using these links, we understand what the flow is and construct the +// real graph. Origin is a node in a graph, represented by GNode. +// When a GCall is created, it instantiates its appropriate GNode since we need an origin for +// objects we produce with this call. This is what is stored in m_node and then is used in every +// yield() call (the framework calls yield() according to template signature which we strip then +// - aka type erasure). +// Here comes the recursion - GNode knows it is created for GCall, and GCall stores that node +// object as origin for yield(). In order to break it, in GCall's object destructor this m_node +// pointer is reset (note - GCall::Priv remains alive). Now GCall's ownership "moves" to GNode +// and remains there until the API part is destroyed. class GCall::Priv { public: std::vector<GArg> m_args; const GKernel m_k; - // FIXME: Document that there's no recursion here. // TODO: Rename to "constructionNode" or smth to reflect its lifetime GNode m_node; diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcomputation.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcomputation.cpp index ab761ed..fe14b90 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcomputation.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcomputation.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcomputation_priv.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcomputation_priv.hpp index 13d1b9a..035f56b 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcomputation_priv.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcomputation_priv.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GCOMPUTATION_PRIV_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gkernel.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gkernel.cpp index f8c851a..ca4314d 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gkernel.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gkernel.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gmat.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gmat.cpp index e8c5285..0477c91 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gmat.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gmat.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html.
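The comment block added to gcall_priv.hpp above describes a deliberate ownership cycle (the call's Priv holds the origin node, while the node keeps the producing call alive) that is broken by resetting m_node when the API wrapper is destroyed. A minimal sketch of that shape with hypothetical stand-in types (Call, CallPriv, and Node are illustrative, not the real classes):

#include <memory>

struct Node; // stands in for the node side (GNode)

struct CallPriv // stands in for GCall::Priv
{
    std::shared_ptr<Node> m_node; // origin node, set when the call is created
};

struct Node // the node keeps its producing call alive
{
    std::shared_ptr<CallPriv> call;
};

struct Call // stands in for the GCall API wrapper
{
    std::shared_ptr<CallPriv> m_priv;
    ~Call()
    {
        // Break the CallPriv <-> Node cycle when the API wrapper dies;
        // CallPriv itself stays alive as long as some Node references it.
        if (m_priv) m_priv->m_node.reset();
    }
};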
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" @@ -33,15 +33,39 @@ const cv::GOrigin& cv::GMat::priv() const return *m_priv; } +namespace{ + template<typename T> cv::GMetaArgs vec_descr_of(const std::vector<T> &vec) + { + cv::GMetaArgs vec_descr; + vec_descr.reserve(vec.size()); + for(auto& mat : vec){ + vec_descr.emplace_back(descr_of(mat)); + } + return vec_descr; + } +} + + #if !defined(GAPI_STANDALONE) cv::GMatDesc cv::descr_of(const cv::Mat &mat) { return GMatDesc{mat.depth(), mat.channels(), {mat.cols, mat.rows}}; } + cv::GMatDesc cv::descr_of(const cv::UMat &mat) { return GMatDesc{ mat.depth(), mat.channels(),{ mat.cols, mat.rows } }; } + +cv::GMetaArgs cv::descr_of(const std::vector<cv::Mat> &vec) +{ + return vec_descr_of(vec); +} + +cv::GMetaArgs cv::descr_of(const std::vector<cv::UMat> &vec) +{ + return vec_descr_of(vec); +} #endif cv::GMatDesc cv::gapi::own::descr_of(const cv::gapi::own::Mat &mat) @@ -49,6 +73,11 @@ cv::GMatDesc cv::gapi::own::descr_of(const cv::gapi::own::Mat &mat) return GMatDesc{mat.depth(), mat.channels(), {mat.cols, mat.rows}}; } +cv::GMetaArgs cv::gapi::own::descr_of(const std::vector<cv::gapi::own::Mat> &vec) +{ + return vec_descr_of(vec); +} + namespace cv { std::ostream& operator<<(std::ostream& os, const cv::GMatDesc &desc) { diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode.cpp index efda5d5..05ee7dc 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode.hpp index bd6c790..7f0aa4a 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GNODE_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode_priv.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode_priv.hpp index 5425471..d5e3055 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode_priv.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode_priv.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html.
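The gmat.cpp hunk above adds std::vector overloads of descr_of(), producing one GMatDesc per element. A short usage sketch, assuming only the overloads introduced here:

#include <vector>
#include <opencv2/core.hpp>
#include <opencv2/gapi.hpp>

static cv::GMetaArgs describe_batch(const std::vector<cv::Mat> &frames)
{
    // One GMatDesc{depth, channels, {cols, rows}} per element, in order.
    return cv::descr_of(frames);
}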
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GNODE_PRIV_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gproto.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gproto.cpp index 2482d62..e24ca8a 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gproto.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gproto.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gproto_priv.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gproto_priv.hpp index 2684924..8df4029 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gproto_priv.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gproto_priv.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GPROTO_PRIV_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gscalar.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gscalar.cpp index 30f3dc9..8d0b066 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gscalar.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gscalar.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/kernels_core.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/kernels_core.cpp index c9fe19e..00088d0 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/kernels_core.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/kernels_core.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/kernels_imgproc.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/kernels_imgproc.cpp index 7c4b522..e1fc4cd 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/kernels_imgproc.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/kernels_imgproc.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/operators.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/operators.cpp index 44fc4fa..647f5bf 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/api/operators.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/api/operators.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gbackend.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gbackend.hpp index 613022c..1229739 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gbackend.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gbackend.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GBACKEND_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gcompoundbackend.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gcompoundbackend.cpp index 948898f..c927cd1 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gcompoundbackend.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gcompoundbackend.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gcompoundkernel.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gcompoundkernel.cpp index 89abcef..ce74c16 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gcompoundkernel.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gcompoundkernel.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpubackend.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpubackend.cpp index 5cc8bb0..c17750c 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpubackend.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpubackend.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpubackend.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpubackend.hpp index 6ce8c48..0525e3a 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpubackend.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpubackend.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GCPUBACKEND_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpucore.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpucore.cpp index c42f863..8a3b3ab 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpucore.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpucore.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpucore.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpucore.hpp index 77e9e82..b1248a2 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpucore.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpucore.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GCPUCORE_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpuimgproc.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpuimgproc.cpp index d14584b..1f0251f 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpuimgproc.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpuimgproc.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpuimgproc.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpuimgproc.hpp index 172871a..d6ea758 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpuimgproc.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpuimgproc.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GCPUIMGPROC_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpukernel.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpukernel.cpp index af13eed..cfa5257 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpukernel.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpukernel.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbackend.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbackend.cpp index e6eaaae..2c8b88e 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbackend.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbackend.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" @@ -930,9 +930,13 @@ namespace { // FIXME: ASSERT(DATA), ASSERT(FLUIDDATA) auto &fd = fg.metadata(out_data_node).get<FluidData>(); - fd.latency = out_latency; + // If the fluid node is external, it will be bound to a real image without + // fluid buffer allocation, so its latency is set to 0 to avoid confusing later latency propagation. + // Latency is used in the fluid buffer allocation process and is not used by the scheduler, + // so it doesn't affect the execution and setting it to 0 is legal. + fd.latency = fd.internal ? out_latency : 0;
fd.lpi_write = fu.k.m_lpi; - GModel::log(g, out_data_node, "Latency: " + std::to_string(out_latency)); + GModel::log(g, out_data_node, "Latency: " + std::to_string(fd.latency)); } } } @@ -1207,35 +1211,41 @@ void GFluidBackendImpl::addBackendPasses(ade::ExecutionEngineSetupContext &ectx) for (const auto& nh : gim.nodes()) { - if (gim.metadata(nh).get<NodeKind>().k == NodeKind::ISLAND) + switch (gim.metadata(nh).get<NodeKind>().k) + { + case NodeKind::ISLAND: { const auto isl = gim.metadata(nh).get<FusedIsland>().object; if (isl->backend() == cv::gapi::fluid::backend()) { - // add FluidData to all data nodes inside island + // Add FluidData to all data nodes inside island, + // set internal = true if node is not a slot in terms of higher-level GIslandModel for (const auto node : isl->contents()) { - if (g.metadata(node).get<NodeType>().t == NodeType::DATA) + if (g.metadata(node).get<NodeType>().t == NodeType::DATA && + !fg.metadata(node).contains<FluidData>()) setFluidData(node, true); } - - // add FluidData to slot if it's read/written by fluid - std::vector<ade::NodeHandle> io_handles; - for (const auto &in_op : isl->in_ops()) - { - ade::util::copy(in_op->inNodes(), std::back_inserter(io_handles)); - } - for (const auto &out_op : isl->out_ops()) - { - ade::util::copy(out_op->outNodes(), std::back_inserter(io_handles)); - } - for (const auto &io_node : io_handles) - { - if (!fg.metadata(io_node).contains<FluidData>()) - setFluidData(io_node, false); - } } // if (fluid backend) - } // if (ISLAND) + } break; // case::ISLAND + case NodeKind::SLOT: + { + // add FluidData to slot if it's read/written by fluid + // regardless of whether it is one fluid island (both writing to and reading from this object) + // or two distinct islands (both fluid) + auto isFluidIsland = [&](const ade::NodeHandle& node) { + const auto isl = gim.metadata(node).get<FusedIsland>().object; + return isl->backend() == cv::gapi::fluid::backend(); + }; + + if (ade::util::any_of(ade::util::chain(nh->inNodes(), nh->outNodes()), isFluidIsland)) + { + auto data_node = gim.metadata(nh).get<DataSlot>().original_data_node; + setFluidData(data_node, false); + } + } break; // case::SLOT + default: GAPI_Assert(false); + } // switch } // for (gim.nodes()) }); // FIXME: diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbackend.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbackend.hpp index ba8b977..d340202 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbackend.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbackend.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_FLUID_BACKEND_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbuffer.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbuffer.cpp index 6672ea2..66705f2 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbuffer.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbuffer.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html.
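The new SLOT case above checks whether any neighbouring island of a slot is a fluid island by running a single predicate over the concatenation of the slot's input and output neighbours (ade::util::chain + ade::util::any_of). A generic standard-library equivalent of that check, for illustration:

#include <algorithm>
#include <vector>

// Apply a predicate across two node lists as if they were one range;
// std::any_of over each part gives the same short-circuit semantics.
template<typename Node, typename Pred>
static bool any_of_chain(const std::vector<Node> &ins,
                         const std::vector<Node> &outs, Pred pred)
{
    return std::any_of(ins.begin(), ins.end(), pred)
        || std::any_of(outs.begin(), outs.end(), pred);
}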
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbuffer_priv.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbuffer_priv.hpp index 1f3eadc..dd6e518 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbuffer_priv.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbuffer_priv.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_FLUID_BUFFER_PRIV_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidcore.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidcore.cpp index 16a63e2..61dba02 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidcore.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidcore.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #if !defined(GAPI_STANDALONE) @@ -340,7 +340,7 @@ static void run_arithm_s3(uchar out[], const uchar in[], int width, const uchar v_store_interleave(&out[3*w], x, y, z); } #endif - UNUSED(v_op); + cv::util::suppress_unused_warning(v_op); for (; w < width; w++) { out[3*w ] = saturate( s_op(in[3*w ], scalar[0]) ); @@ -386,7 +386,7 @@ static void run_arithm_s1(uchar out[], const float in[], int width, const float v_store(&out[w], uc); } #endif - UNUSED(v_op); + cv::util::suppress_unused_warning(v_op); for (; w < width; w++) { out[w] = saturate(s_op(in[w], scalar[0]), std::roundf); diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc.cpp index e2e4c4f..2cdc573 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
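The gfluidcore.cpp hunks above touch only the scalar tails of SIMD loops: a vectorized main loop (guarded by universal-intrinsics preprocessor blocks) handles full lanes, the trailing for (; w < width; w++) finishes the remainder, and suppress_unused_warning(v_op) keeps the build quiet when the vector branch is compiled out. A schematic of that loop shape (the #if 0 guard stands in for the real SIMD condition):

static void add_scalar(const float in[], float s, float out[], int width)
{
    int w = 0;
#if 0 // stand-in for the universal-intrinsics branch
    for (; w <= width - 4; w += 4)
    { /* process 4 lanes with vector intrinsics */ }
#endif
    // Scalar tail: also the whole loop when SIMD is unavailable.
    for (; w < width; w++)
        out[w] = in[w] + s;
}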
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #if !defined(GAPI_STANDALONE) @@ -344,7 +344,7 @@ static const int maxKernelSize = 9; template static void run_boxfilter(Buffer &dst, const View &src, const cv::Size &kernelSize, - const cv::Point& /* anchor */, bool normalize) + const cv::Point& /* anchor */, bool normalize, float *buf[]) { GAPI_Assert(kernelSize.width <= maxKernelSize); GAPI_Assert(kernelSize.width == kernelSize.height); @@ -365,36 +365,53 @@ static void run_boxfilter(Buffer &dst, const View &src, const cv::Size &kernelSi int width = dst.length(); int chan = dst.meta().chan; - GAPI_DbgAssert(chan <= 4); + if (kernelSize.width == 3 && kernelSize.height == 3) + { + int y = dst.y(); + int y0 = dst.priv().writeStart(); - for (int w=0; w < width; w++) + float kx[3] = {1, 1, 1}; + float *ky = kx; + + float scale=1, delta=0; + if (normalize) + scale = 1/9.f; + + run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0); + } else { - float sum[4] = {0, 0, 0, 0}; + GAPI_DbgAssert(chan <= 4); - for (int i=0; i < kernel; i++) + for (int w=0; w < width; w++) { - for (int j=0; j < kernel; j++) + float sum[4] = {0, 0, 0, 0}; + + for (int i=0; i < kernel; i++) { - for (int c=0; c < chan; c++) - sum[c] += in[i][(w + j - border)*chan + c]; + for (int j=0; j < kernel; j++) + { + for (int c=0; c < chan; c++) + sum[c] += in[i][(w + j - border)*chan + c]; + } } - } - for (int c=0; c < chan; c++) - { - float result = normalize? sum[c]/(kernel * kernel) : sum[c]; + for (int c=0; c < chan; c++) + { + float result = normalize? sum[c]/(kernel * kernel) : sum[c]; - out[w*chan + c] = saturate(result, rintf); + out[w*chan + c] = saturate(result, rintf); + } } } } -GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, false) +GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, true) { static const int Window = 3; static void run(const View &src, const cv::Size& kernelSize, const cv::Point& anchor, - int /* borderType */, const cv::Scalar& /* borderValue */, Buffer &dst) + int /* borderType */, const cv::Scalar& /* borderValue */, Buffer &dst, + Buffer& scratch) { // TODO: support sizes 3, 5, 7, 9, ... 
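// Why the separable path below is valid (a sketch, not part of the patch):
// a 3x3 box kernel factors into an outer product of two all-ones 1-D
// kernels,
//
//     (1/9) * | 1 1 1 |     | 1 |
//             | 1 1 1 |  =  | 1 | * | 1 1 1 | * (1/9),
//             | 1 1 1 |     | 1 |
//
// so one horizontal pass with kx = {1,1,1}, one vertical pass with
// ky = {1,1,1}, and scale = 1/9.f (when normalize is true) reproduce the
// full 2-D blur; with normalize == false the scale stays 1.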
GAPI_Assert(kernelSize.width == 3 && kernelSize.height == 3); @@ -404,14 +421,46 @@ GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, false) static const bool normalize = true; + int width = src.length(); + int chan = src.meta().chan; + int length = width * chan; + + float *buf[3]; + buf[0] = scratch.OutLine(); + buf[1] = buf[0] + length; + buf[2] = buf[1] + length; + // DST SRC OP __VA_ARGS__ - UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize); - UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize); - UNARY_( short, short, run_boxfilter, dst, src, kernelSize, anchor, normalize); + UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize, buf); + UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf); + UNARY_( short, short, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf); + UNARY_( float, float, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf); CV_Error(cv::Error::StsBadArg, "unsupported combination of types"); } + static void initScratch(const GMatDesc & in, + const cv::Size & /* ksize */, + const cv::Point & /* anchor */, + int /* borderType */, + const cv::Scalar & /* borderValue */, + Buffer & scratch) + { + int width = in.size.width; + int chan = in.chan; + + int buflen = width * chan * Window; // work buffers + + cv::gapi::own::Size bufsize(buflen, 1); + GMatDesc bufdesc = {CV_32F, 1, bufsize}; + Buffer buffer(bufdesc); + scratch = std::move(buffer); + } + + static void resetScratch(Buffer& /* scratch */) + { + } + static Border getBorder(const cv::GMatDesc& /* src */, const cv::Size & /* kernelSize */, const cv::Point & /* anchor */, @@ -422,18 +471,19 @@ GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, false) } }; -GAPI_FLUID_KERNEL(GFluidBoxFilter, cv::gapi::imgproc::GBoxFilter, false) +GAPI_FLUID_KERNEL(GFluidBoxFilter, cv::gapi::imgproc::GBoxFilter, true) { static const int Window = 3; static void run(const View & src, int /* ddepth */, const cv::Size & kernelSize, - const cv::Point & anchor, + const cv::Point & anchor, bool normalize, int /* borderType */, const cv::Scalar& /* borderValue */, - Buffer& dst) + Buffer& dst, + Buffer& scratch) { // TODO: support sizes 3, 5, 7, 9, ... 
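// Scratch layout sketch (not part of the patch; Window == 3 here, as in
// GFluidBlur above): initScratch allocates one flat CV_32F line of
// width*chan*Window floats, which run() slices into three row buffers,
//
//     length = width * chan
//     buf[0] -> floats [0,        length)
//     buf[1] -> floats [length,   2*length)
//     buf[2] -> floats [2*length, 3*length)
//
// run_sepfilter3x3_impl then reuses buf[] as a ring of horizontally
// convolved rows, indexed (y - y0 + k) % 3, as the filter slides down the
// image.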
GAPI_Assert(kernelSize.width == 3 && kernelSize.height == 3);
@@ -441,17 +491,51 @@ GAPI_FLUID_KERNEL(GFluidBoxFilter, cv::gapi::imgproc::GBoxFilter, false)
// TODO: support non-trivial anchor
GAPI_Assert(anchor.x == -1 && anchor.y == -1);
+ int width = src.length();
+ int chan = src.meta().chan;
+ int length = width * chan;
+
+ float *buf[3];
+ buf[0] = scratch.OutLine<float>();
+ buf[1] = buf[0] + length;
+ buf[2] = buf[1] + length;
+
// DST SRC OP __VA_ARGS__
- UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize);
- UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize);
- UNARY_( short, short, run_boxfilter, dst, src, kernelSize, anchor, normalize);
- UNARY_( float, uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize);
- UNARY_( float, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize);
- UNARY_( float, short, run_boxfilter, dst, src, kernelSize, anchor, normalize);
+ UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+ UNARY_( float, uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+ UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+ UNARY_( float, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+ UNARY_( short, short, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+ UNARY_( float, short, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+ UNARY_( float, float, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
}
+ static void initScratch(const GMatDesc & in,
+ int /* ddepth */,
+ const cv::Size & /* kernelSize */,
+ const cv::Point & /* anchor */,
+ bool /* normalize */,
+ int /* borderType */,
+ const cv::Scalar& /* borderValue */,
+ Buffer & scratch)
+ {
+ int width = in.size.width;
+ int chan = in.chan;
+
+ int buflen = width * chan * Window; // work buffers
+
+ cv::gapi::own::Size bufsize(buflen, 1);
+ GMatDesc bufdesc = {CV_32F, 1, bufsize};
+ Buffer buffer(bufdesc);
+ scratch = std::move(buffer);
+ }
+
+ static void resetScratch(Buffer& /* scratch */)
+ {
+ }
+
static Border getBorder(const cv::GMatDesc& /* src */,
int /* ddepth */,
const cv::Size & /* kernelSize */,
@@ -510,18 +594,21 @@ static void run_sepfilter(Buffer& dst, const View& src,
const float kx[], int kxLen,
const float ky[], int kyLen,
const cv::Point& /* anchor */,
- float delta=0)
+ float scale, float delta,
+ float *buf[])
{
- static const int maxLines = 9;
- GAPI_Assert(kyLen <= maxLines);
+ constexpr int kMax = 11;
+ GAPI_Assert(kxLen <= kMax && kyLen <= kMax);

- const SRC *in[ maxLines ];
+ const SRC *in[kMax];
DST *out;

- int border = (kyLen - 1) / 2;
+ int xborder = (kxLen - 1) / 2;
+ int yborder = (kyLen - 1) / 2;
+
for (int i=0; i < kyLen; i++)
{
- in[i] = src.InLine<SRC>(i - border);
+ in[i] = src.InLine<SRC>(i - yborder);
}

out = dst.OutLine<DST>();
@@ -529,28 +616,52 @@ static void run_sepfilter(Buffer& dst, const View& src,
int width = dst.length();
int chan = dst.meta().chan;

- for (int w=0; w < width; w++)
+ // optimized 3x3 vs reference
+ if (kxLen == 3 && kyLen == 3)
{
- // TODO: make this cycle innermost
- for (int c=0; c < chan; c++)
+ int y = dst.y();
+ int y0 = dst.priv().writeStart();
+
+ int border = xborder;
+ run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
+ }
+ else
+ {
+ int length = chan * width;
+ int xshift = chan * xborder;
+
+ // horizontal pass
+
+ for (int k=0; k < kyLen;
k++) { - float sum=0; + const SRC *inp[kMax] = {nullptr}; - for (int i=0; i < kyLen; i++) + for (int j=0; j < kxLen; j++) { - float sumi=0; + inp[j] = in[k] + (j - xborder)*xshift; + } + for (int l=0; l < length; l++) + { + float sum = 0; for (int j=0; j < kxLen; j++) { - sumi += in[i][(w + j - border)*chan + c] * kx[j]; + sum += inp[j][l] * kx[j]; } - - sum += sumi * ky[i]; + buf[k][l] = sum; } + } - float result = sum + delta; + // vertical pass - out[w*chan + c] = saturate(result, rintf); + for (int l=0; l < length; l++) + { + float sum = 0; + for (int k=0; k < kyLen; k++) + { + sum += buf[k][l] * ky[k]; + } + out[l] = saturate(sum*scale + delta, rintf); } } } @@ -580,21 +691,37 @@ GAPI_FLUID_KERNEL(GFluidSepFilter, cv::gapi::imgproc::GSepFilter, true) int kxLen = kernX.rows * kernX.cols; int kyLen = kernY.rows * kernY.cols; + GAPI_Assert(kyLen == 3); + float *kx = scratch.OutLine(); float *ky = kx + kxLen; + int width = src.meta().size.width; + int chan = src.meta().chan; + int length = width * chan; + + float *buf[3]; + buf[0] = ky + kyLen; + buf[1] = buf[0] + length; + buf[2] = buf[1] + length; + + float scale = 1; float delta = static_cast(delta_[0]); // DST SRC OP __VA_ARGS__ - UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta); - UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta); - UNARY_( short, short, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta); - UNARY_( float, float, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta); + UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf); + UNARY_( short, uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf); + UNARY_( float, uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf); + UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf); + UNARY_( float, ushort, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf); + UNARY_( short, short, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf); + UNARY_( float, short, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf); + UNARY_( float, float, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf); CV_Error(cv::Error::StsBadArg, "unsupported combination of types"); } - static void initScratch(const GMatDesc& /* in */, + static void initScratch(const GMatDesc& in, int /* ddepth */, const Mat & kernX, const Mat & kernY, @@ -607,7 +734,13 @@ GAPI_FLUID_KERNEL(GFluidSepFilter, cv::gapi::imgproc::GSepFilter, true) int kxLen = kernX.rows * kernX.cols; int kyLen = kernY.rows * kernY.cols; - cv::gapi::own::Size bufsize(kxLen + kyLen, 1); + int width = in.size.width; + int chan = in.chan; + + int buflen = kxLen + kyLen + // x, y kernels + width * chan * Window; // work buffers + + cv::gapi::own::Size bufsize(buflen, 1); GMatDesc bufdesc = {CV_32F, 1, bufsize}; Buffer buffer(bufdesc); scratch = std::move(buffer); @@ -664,29 +797,47 @@ GAPI_FLUID_KERNEL(GFluidGaussBlur, cv::gapi::imgproc::GGaussBlur, true) auto *kx = scratch.OutLine(); // cached kernX data auto *ky = kx + kxsize; // cached kernY data + int width = src.meta().size.width; + int chan = src.meta().chan; + int length = width * chan; + + float *buf[3]; + buf[0] = ky + kysize; + buf[1] = buf[0] + length; + buf[2] = buf[1] + length; + auto anchor = cv::Point(-1, -1); - float delta = 0.f; + + float scale = 1; + float delta = 0; // 
DST SRC OP __VA_ARGS__ - UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta); - UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta); - UNARY_( short, short, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta); + UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf); + UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf); + UNARY_( short, short, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf); + UNARY_( float, float, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf); CV_Error(cv::Error::StsBadArg, "unsupported combination of types"); } - static void initScratch(const GMatDesc& /* in */, + static void initScratch(const GMatDesc& in, const cv::Size & ksize, double sigmaX, double sigmaY, - int /* borderType */, - const cv::Scalar & /* borderValue */, + int /* borderType */, + const cv::Scalar & /* borderValue */, Buffer & scratch) { int kxsize = ksize.width; int kysize = ksize.height; - cv::gapi::own::Size bufsize(kxsize + kysize, 1); + int width = in.size.width; + int chan = in.chan; + + int buflen = kxsize + kysize + // x, y kernels + width * chan * Window; // work buffers + + cv::gapi::own::Size bufsize(buflen, 1); GMatDesc bufdesc = {CV_32F, 1, bufsize}; Buffer buffer(bufdesc); scratch = std::move(buffer); @@ -767,7 +918,7 @@ static void run_sobel(Buffer& dst, int y0 = dst.priv().writeStart(); // int y1 = dst.priv().writeEnd(); - run_sobel_row(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0); + run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0); } GAPI_FLUID_KERNEL(GFluidSobel, cv::gapi::imgproc::GSobel, true) @@ -901,24 +1052,30 @@ static void run_filter2d(Buffer& dst, const View& src, int width = dst.length(); int chan = dst.meta().chan; + int length = width * chan; - for (int w=0; w < width; w++) + // manually optimized for 3x3 + if (k_rows == 3 && k_cols == 3) { - // TODO: make this cycle innermost - for (int c=0; c < chan; c++) - { - float sum = 0; - - for (int i=0; i < k_rows; i++) - for (int j=0; j < k_cols; j++) - { - sum += in[i][(w + j - border_x)*chan + c] * k[k_cols*i + j]; - } + float scale = 1; + run_filter2d_3x3_impl(out, in, width, chan, k, scale, delta); + return; + } - float result = sum + delta; + // reference: any kernel size + for (int l=0; l < length; l++) + { + float sum = 0; - out[w*chan + c] = saturate(result, rintf); + for (int i=0; i < k_rows; i++) + for (int j=0; j < k_cols; j++) + { + sum += in[i][l + (j - border_x)*chan] * k[k_cols*i + j]; } + + float result = sum + delta; + + out[l] = saturate(result, rintf); } } @@ -946,6 +1103,7 @@ GAPI_FLUID_KERNEL(GFluidFilter2D, cv::gapi::imgproc::GFilter2D, true) int k_rows = kernel.rows; int k_cols = kernel.cols; + const float *k = scratch.OutLine(); // copy of kernel.data // DST SRC OP __VA_ARGS__ @@ -969,7 +1127,12 @@ GAPI_FLUID_KERNEL(GFluidFilter2D, cv::gapi::imgproc::GFilter2D, true) const cv::Scalar & /* borderValue */, Buffer & scratch) { - cv::gapi::own::Size bufsize(kernel.rows * kernel.cols, 1); + int krows = kernel.rows; + int kcols = kernel.cols; + + int buflen = krows * kcols; // kernel size + + cv::gapi::own::Size bufsize(buflen, 1); GMatDesc bufdesc = {CV_32F, 1, bufsize}; Buffer buffer(bufdesc); scratch = std::move(buffer); @@ -1001,7 +1164,26 @@ GAPI_FLUID_KERNEL(GFluidFilter2D, cv::gapi::imgproc::GFilter2D, true) // 
//----------------------------- -enum Morphology { M_ERODE, M_DILATE }; +static MorphShape detect_morph3x3_shape(const uchar kernel[]) +{ + const uchar k[3][3] = { + { kernel[0], kernel[1], kernel[2]}, + { kernel[3], kernel[4], kernel[5]}, + { kernel[6], kernel[7], kernel[8]} + }; + + if (k[0][0] && k[0][1] && k[0][2] && + k[1][0] && k[1][1] && k[1][2] && + k[2][0] && k[2][1] && k[2][2]) + return M_FULL; + + if (!k[0][0] && k[0][1] && !k[0][2] && + k[1][0] && k[1][1] && k[1][2] && + !k[2][0] && k[2][1] && !k[2][2]) + return M_CROSS; + + return M_UNDEF; +} template static void run_morphology( Buffer& dst, @@ -1009,9 +1191,14 @@ static void run_morphology( Buffer& dst, const uchar k[], int k_rows, int k_cols, + MorphShape k_type, const cv::Point & /* anchor */, Morphology morphology) { + static_assert(std::is_same::value, "unsupported combination of types"); + + GAPI_Assert(M_ERODE == morphology || M_DILATE == morphology); + static const int maxLines = 9; GAPI_Assert(k_rows <= maxLines); @@ -1031,43 +1218,44 @@ static void run_morphology( Buffer& dst, int width = dst.length(); int chan = dst.meta().chan; - for (int w=0; w < width; w++) + // call optimized code, if 3x3 + if (3 == k_rows && 3 == k_cols) { - // TODO: make this cycle innermost - for (int c=0; c < chan; c++) + run_morphology3x3_impl(out, in, width, chan, k, k_type, morphology); + return; + } + + // reference: any size of k[] + int length = width * chan; + for (int l=0; l < length; l++) + { + SRC result; + if (M_ERODE == morphology) { - SRC result=0; - if (M_ERODE == morphology) - { - result = std::numeric_limits::max(); - } - else if (M_DILATE == morphology) - { - result = std::numeric_limits::min(); - } - else - CV_Error(cv::Error::StsBadArg, "unsupported morphology operation"); + result = std::numeric_limits::max(); + } + else // if (M_DILATE == morphology) + { + result = std::numeric_limits::min(); + } - for (int i=0; i < k_rows; i++) - for (int j=0; j < k_cols; j++) + for (int i=0; i < k_rows; i++) + for (int j=0; j < k_cols; j++) + { + if ( k[k_cols*i + j] ) { - if ( k[k_cols*i + j] ) + if (M_ERODE == morphology) + { + result = (std::min)(result, in[i][l + (j - border_x)*chan]); + } + else // if (M_DILATE == morphology) { - if (M_ERODE == morphology) - { - result = std::min(result, in[i][(w + j - border_x)*chan + c]); - } - else if (M_DILATE == morphology) - { - result = std::max(result, in[i][(w + j - border_x)*chan + c]); - } - else - CV_Error(cv::Error::StsBadArg, "unsupported morphology operation"); + result = (std::max)(result, in[i][l + (j - border_x)*chan]); } } - - out[w*chan + c] = saturate(result, rintf); } + + out[l] = saturate(result, rintf); } } @@ -1095,13 +1283,16 @@ GAPI_FLUID_KERNEL(GFluidErode, cv::gapi::imgproc::GErode, true) int k_rows = kernel.rows; int k_cols = kernel.cols; + int k_size = k_rows * k_cols; auto *k = scratch.OutLine(); // copy of kernel.data + auto k_type = static_cast(k[k_size]); // DST SRC OP __VA_ARGS__ - UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE); - UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE); - UNARY_( short, short, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE); + UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, k_type, anchor, M_ERODE); + UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, k_type, anchor, M_ERODE); + UNARY_( short, short, run_morphology, dst, src, k, k_rows, k_cols, k_type, anchor, M_ERODE); + UNARY_( float, float, run_morphology, 
dst, src, k, k_rows, k_cols, k_type, anchor, M_ERODE); CV_Error(cv::Error::StsBadArg, "unsupported combination of types"); } @@ -1109,15 +1300,16 @@ GAPI_FLUID_KERNEL(GFluidErode, cv::gapi::imgproc::GErode, true) static void initScratch(const GMatDesc& /* in */, const Mat & kernel, const Point & /* anchor */, - int /* iterations */, + int /* iterations */, int /* borderType */, const cv::Scalar & /* borderValue */, Buffer & scratch) { int k_rows = kernel.rows; int k_cols = kernel.cols; + int k_size = k_rows * k_cols; - cv::gapi::own::Size bufsize(k_rows * k_cols, 1); + cv::gapi::own::Size bufsize(k_size + 1, 1); GMatDesc bufdesc = {CV_8U, 1, bufsize}; Buffer buffer(bufdesc); scratch = std::move(buffer); @@ -1125,6 +1317,11 @@ GAPI_FLUID_KERNEL(GFluidErode, cv::gapi::imgproc::GErode, true) // FIXME: move to resetScratch stage ? auto *k = scratch.OutLine(); getKernel(k, kernel); + + if (3 == k_rows && 3 == k_cols) + k[k_size] = static_cast(detect_morph3x3_shape(k)); + else + k[k_size] = static_cast(M_UNDEF); } static void resetScratch(Buffer& /* scratch */) @@ -1172,13 +1369,16 @@ GAPI_FLUID_KERNEL(GFluidDilate, cv::gapi::imgproc::GDilate, true) int k_rows = kernel.rows; int k_cols = kernel.cols; + int k_size = k_rows * k_cols; auto *k = scratch.OutLine(); // copy of kernel.data + auto k_type = static_cast(k[k_size]); // DST SRC OP __VA_ARGS__ - UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE); - UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE); - UNARY_( short, short, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE); + UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, k_type, anchor, M_DILATE); + UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, k_type, anchor, M_DILATE); + UNARY_( short, short, run_morphology, dst, src, k, k_rows, k_cols, k_type, anchor, M_DILATE); + UNARY_( float, float, run_morphology, dst, src, k, k_rows, k_cols, k_type, anchor, M_DILATE); CV_Error(cv::Error::StsBadArg, "unsupported combination of types"); } @@ -1193,8 +1393,9 @@ GAPI_FLUID_KERNEL(GFluidDilate, cv::gapi::imgproc::GDilate, true) { int k_rows = kernel.rows; int k_cols = kernel.cols; + int k_size = k_rows * k_cols; - cv::gapi::own::Size bufsize(k_rows * k_cols, 1); + cv::gapi::own::Size bufsize(k_size + 1, 1); GMatDesc bufdesc = {CV_8U, 1, bufsize}; Buffer buffer(bufdesc); scratch = std::move(buffer); @@ -1202,6 +1403,11 @@ GAPI_FLUID_KERNEL(GFluidDilate, cv::gapi::imgproc::GDilate, true) // FIXME: move to resetScratch stage ? 
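// Scratch layout sketch (not part of the patch; same scheme as GFluidErode
// above): the CV_8U scratch holds k_rows*k_cols kernel bytes plus one
// trailing tag byte,
//
//     [ k[0] ... k[k_size-1] | k[k_size] ]
//                              ^-- M_FULL, M_CROSS, or M_UNDEF
//
// so run() can read the shape classified at init time straight from
// k[k_size] and pick the specialized path in run_morphology3x3_impl without
// re-scanning the kernel on every row.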
auto *k = scratch.OutLine(); getKernel(k, kernel); + + if (3 == k_rows && 3 == k_cols) + k[k_size] = static_cast(detect_morph3x3_shape(k)); + else + k[k_size] = static_cast(M_UNDEF); } static void resetScratch(Buffer& /* scratch */) @@ -1236,7 +1442,9 @@ static void run_medianblur( Buffer& dst, const View & src, int ksize) { - static const int kmax = 9; + static_assert(std::is_same::value, "unsupported combination of types"); + + constexpr int kmax = 9; GAPI_Assert(ksize <= kmax); const SRC *in[ kmax ]; @@ -1254,24 +1462,33 @@ static void run_medianblur( Buffer& dst, int width = dst.length(); int chan = dst.meta().chan; - for (int w=0; w < width; w++) + // optimized: if 3x3 + + if (3 == ksize) { - // TODO: make this cycle innermost - for (int c=0; c < chan; c++) - { - SRC neighbours[kmax * kmax]; + run_medblur3x3_impl(out, in, width, chan); + return; + } - for (int i=0; i < ksize; i++) - for (int j=0; j < ksize; j++) - { - neighbours[i*ksize + j] = in[i][(w + j - border)*chan + c]; - } + // reference: any ksize - int length = ksize * ksize; - std::nth_element(neighbours, neighbours + length/2, neighbours + length); + int length = width * chan; + int klength = ksize * ksize; + int klenhalf = klength / 2; - out[w*chan + c] = saturate(neighbours[length/2], rintf); + for (int l=0; l < length; l++) + { + SRC neighbours[kmax * kmax]; + + for (int i=0; i < ksize; i++) + for (int j=0; j < ksize; j++) + { + neighbours[i*ksize + j] = in[i][l + (j - border)*chan]; } + + std::nth_element(neighbours, neighbours + klenhalf, neighbours + klength); + + out[l] = saturate(neighbours[klenhalf], rintf); } } @@ -1290,6 +1507,7 @@ GAPI_FLUID_KERNEL(GFluidMedianBlur, cv::gapi::imgproc::GMedianBlur, false) UNARY_(uchar , uchar , run_medianblur, dst, src, ksize); UNARY_(ushort, ushort, run_medianblur, dst, src, ksize); UNARY_( short, short, run_medianblur, dst, src, ksize); + UNARY_( float, float, run_medianblur, dst, src, ksize); CV_Error(cv::Error::StsBadArg, "unsupported combination of types"); } diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp index 9b21790..3624de9 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
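// A minimal sketch of the selection step used by run_medianblur above
// (median9 is an illustrative helper, not part of the patch):
// std::nth_element partially orders the window so its middle order statistic
// lands at index klength/2 -- average O(k^2) selection, cheaper than fully
// sorting the neighbourhood.
#include <algorithm>

static float median9(float n[9])        // one 3x3 window, one channel
{
    std::nth_element(n, n + 4, n + 9);  // 4 == klenhalf when klength == 9
    return n[4];                        // median of the nine samples
}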
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #if !defined(GAPI_STANDALONE) @@ -57,34 +57,102 @@ void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef CV_CPU_DISPATCH(run_yuv2rgb_impl, (out, in, width, coef), CV_CPU_DISPATCH_MODES_ALL); } -//--------------------- +//------------------------- // -// Fluid kernels: Sobel +// Fluid kernels: sepFilter // -//--------------------- +//------------------------- + +#define RUN_SEPFILTER3X3_IMPL(DST, SRC) \ +void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \ + const float kx[], const float ky[], int border, \ + float scale, float delta, \ + float *buf[], int y, int y0) \ +{ \ + CV_CPU_DISPATCH(run_sepfilter3x3_impl, \ + (out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0), \ + CV_CPU_DISPATCH_MODES_ALL); \ +} + +RUN_SEPFILTER3X3_IMPL(uchar , uchar ) +RUN_SEPFILTER3X3_IMPL( short, uchar ) +RUN_SEPFILTER3X3_IMPL( float, uchar ) +RUN_SEPFILTER3X3_IMPL(ushort, ushort) +RUN_SEPFILTER3X3_IMPL( short, ushort) +RUN_SEPFILTER3X3_IMPL( float, ushort) +RUN_SEPFILTER3X3_IMPL( short, short) +RUN_SEPFILTER3X3_IMPL( float, short) +RUN_SEPFILTER3X3_IMPL( float, float) + +#undef RUN_SEPFILTER3X3_IMPL + +//------------------------- +// +// Fluid kernels: Filter 2D +// +//------------------------- + +#define RUN_FILTER2D_3X3_IMPL(DST, SRC) \ +void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan, \ + const float kernel[], float scale, float delta) \ +{ \ + CV_CPU_DISPATCH(run_filter2d_3x3_impl, \ + (out, in, width, chan, kernel, scale, delta), \ + CV_CPU_DISPATCH_MODES_ALL); \ +} + +RUN_FILTER2D_3X3_IMPL(uchar , uchar ) +RUN_FILTER2D_3X3_IMPL(ushort, ushort) +RUN_FILTER2D_3X3_IMPL( short, short) +RUN_FILTER2D_3X3_IMPL( float, uchar ) +RUN_FILTER2D_3X3_IMPL( float, ushort) +RUN_FILTER2D_3X3_IMPL( float, short) +RUN_FILTER2D_3X3_IMPL( float, float) + +#undef RUN_FILTER2D_3X3_IMPL + +//----------------------------- +// +// Fluid kernels: Erode, Dilate +// +//----------------------------- -#define RUN_SOBEL_ROW(DST, SRC) \ -void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \ - const float kx[], const float ky[], int border, \ - float scale, float delta, float *buf[], \ - int y, int y0) \ +#define RUN_MORPHOLOGY3X3_IMPL(T) \ +void run_morphology3x3_impl(T out[], const T *in[], int width, int chan, \ + const uchar k[], MorphShape k_type, \ + Morphology morphology) \ { \ - CV_CPU_DISPATCH(run_sobel_row, \ - (out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0), \ + CV_CPU_DISPATCH(run_morphology3x3_impl, \ + (out, in, width, chan, k, k_type, morphology), \ CV_CPU_DISPATCH_MODES_ALL); \ } -RUN_SOBEL_ROW(uchar , uchar ) -RUN_SOBEL_ROW(ushort, ushort) -RUN_SOBEL_ROW( short, uchar ) -RUN_SOBEL_ROW( short, ushort) -RUN_SOBEL_ROW( short, short) -RUN_SOBEL_ROW( float, uchar ) -RUN_SOBEL_ROW( float, ushort) -RUN_SOBEL_ROW( float, short) -RUN_SOBEL_ROW( float, float) - -#undef RUN_SOBEL_ROW +RUN_MORPHOLOGY3X3_IMPL(uchar ) +RUN_MORPHOLOGY3X3_IMPL(ushort) +RUN_MORPHOLOGY3X3_IMPL( short) +RUN_MORPHOLOGY3X3_IMPL( float) + +#undef RUN_MORPHOLOGY3X3_IMPL + +//--------------------------- +// +// Fluid kernels: Median blur +// +//--------------------------- + +#define RUN_MEDBLUR3X3_IMPL(T) \ +void run_medblur3x3_impl(T out[], const T *in[], int width, int chan) \ +{ \ + CV_CPU_DISPATCH(run_medblur3x3_impl, (out, in, width, chan), \ + CV_CPU_DISPATCH_MODES_ALL); \ +} + +RUN_MEDBLUR3X3_IMPL(uchar ) +RUN_MEDBLUR3X3_IMPL(ushort) 
+RUN_MEDBLUR3X3_IMPL( short)
+RUN_MEDBLUR3X3_IMPL( float)
+
+#undef RUN_MEDBLUR3X3_IMPL

} // namespace fluid
} // namespace gapi

diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
index 1b6f1b8..1e28dfd 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
@@ -2,7 +2,7 @@
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation

#pragma once

@@ -33,29 +33,87 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef
void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]);

-//---------------------
+//-------------------------
//
-// Fluid kernels: Sobel
+// Fluid kernels: sepFilter
//
-//---------------------
-
-#define RUN_SOBEL_ROW(DST, SRC) \
-void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \
- const float kx[], const float ky[], int border, \
- float scale, float delta, float *buf[], \
- int y, int y0);
-
-RUN_SOBEL_ROW(uchar , uchar )
-RUN_SOBEL_ROW(ushort, ushort)
-RUN_SOBEL_ROW( short, uchar )
-RUN_SOBEL_ROW( short, ushort)
-RUN_SOBEL_ROW( short, short)
-RUN_SOBEL_ROW( float, uchar )
-RUN_SOBEL_ROW( float, ushort)
-RUN_SOBEL_ROW( float, short)
-RUN_SOBEL_ROW( float, float)
-
-#undef RUN_SOBEL_ROW
+//-------------------------
+
+#define RUN_SEPFILTER3X3_IMPL(DST, SRC) \
+void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \
+ const float kx[], const float ky[], int border, \
+ float scale, float delta, \
+ float *buf[], int y, int y0);
+
+RUN_SEPFILTER3X3_IMPL(uchar , uchar )
+RUN_SEPFILTER3X3_IMPL( short, uchar )
+RUN_SEPFILTER3X3_IMPL( float, uchar )
+RUN_SEPFILTER3X3_IMPL(ushort, ushort)
+RUN_SEPFILTER3X3_IMPL( short, ushort)
+RUN_SEPFILTER3X3_IMPL( float, ushort)
+RUN_SEPFILTER3X3_IMPL( short, short)
+RUN_SEPFILTER3X3_IMPL( float, short)
+RUN_SEPFILTER3X3_IMPL( float, float)
+
+#undef RUN_SEPFILTER3X3_IMPL
+
+//-------------------------
+//
+// Fluid kernels: Filter 2D
+//
+//-------------------------
+
+#define RUN_FILTER2D_3X3_IMPL(DST, SRC) \
+void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan, \
+ const float kernel[], float scale, float delta);
+
+RUN_FILTER2D_3X3_IMPL(uchar , uchar )
+RUN_FILTER2D_3X3_IMPL(ushort, ushort)
+RUN_FILTER2D_3X3_IMPL( short, short)
+RUN_FILTER2D_3X3_IMPL( float, uchar )
+RUN_FILTER2D_3X3_IMPL( float, ushort)
+RUN_FILTER2D_3X3_IMPL( float, short)
+RUN_FILTER2D_3X3_IMPL( float, float)
+
+#undef RUN_FILTER2D_3X3_IMPL
+
+//-----------------------------
+//
+// Fluid kernels: Erode, Dilate
+//
+//-----------------------------
+
+enum Morphology { M_ERODE, M_DILATE };
+
+enum MorphShape { M_FULL, M_CROSS, M_UNDEF };
+
+#define RUN_MORPHOLOGY3X3_IMPL(T) \
+void run_morphology3x3_impl(T out[], const T *in[], int width, int chan, \
+ const uchar k[], MorphShape k_type, \
+ Morphology morphology);
+
+RUN_MORPHOLOGY3X3_IMPL(uchar )
+RUN_MORPHOLOGY3X3_IMPL(ushort)
+RUN_MORPHOLOGY3X3_IMPL( short)
+RUN_MORPHOLOGY3X3_IMPL( float)
+
+#undef RUN_MORPHOLOGY3X3_IMPL
+
+//---------------------------
+//
+// Fluid kernels: Median blur
+//
+//---------------------------
+
+#define
RUN_MEDBLUR3X3_IMPL(T) \ +void run_medblur3x3_impl(T out[], const T *in[], int width, int chan); + +RUN_MEDBLUR3X3_IMPL(uchar ) +RUN_MEDBLUR3X3_IMPL(ushort) +RUN_MEDBLUR3X3_IMPL( short) +RUN_MEDBLUR3X3_IMPL( float) + +#undef RUN_MEDBLUR3X3_IMPL } // namespace fluid } // namespace gapi diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp index c87be08..d455ae8 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp @@ -2,19 +2,26 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // NB: allow including this *.hpp several times! // #pragma once -- don't: this file is NOT once! #if !defined(GAPI_STANDALONE) +#include "gfluidimgproc_func.hpp" + #include "opencv2/gapi/own/saturate.hpp" #include "opencv2/core.hpp" #include "opencv2/core/hal/intrin.hpp" #include +#include + +#include +#include +#include #ifdef __GNUC__ # pragma GCC diagnostic push @@ -48,34 +55,120 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]); -//--------------------- +//------------------------- +// +// Fluid kernels: sepFilter +// +//------------------------- + +#define RUN_SEPFILTER3X3_IMPL(DST, SRC) \ +void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \ + const float kx[], const float ky[], int border, \ + float scale, float delta, \ + float *buf[], int y, int y0); + +RUN_SEPFILTER3X3_IMPL(uchar , uchar ) +RUN_SEPFILTER3X3_IMPL( short, uchar ) +RUN_SEPFILTER3X3_IMPL( float, uchar ) +RUN_SEPFILTER3X3_IMPL(ushort, ushort) +RUN_SEPFILTER3X3_IMPL( short, ushort) +RUN_SEPFILTER3X3_IMPL( float, ushort) +RUN_SEPFILTER3X3_IMPL( short, short) +RUN_SEPFILTER3X3_IMPL( float, short) +RUN_SEPFILTER3X3_IMPL( float, float) + +#undef RUN_SEPFILTER3X3_IMPL + +//------------------------- +// +// Fluid kernels: Filter 2D +// +//------------------------- + +#define RUN_FILTER2D_3X3_IMPL(DST, SRC) \ +void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan, \ + const float kernel[], float scale, float delta); + +RUN_FILTER2D_3X3_IMPL(uchar , uchar ) +RUN_FILTER2D_3X3_IMPL(ushort, ushort) +RUN_FILTER2D_3X3_IMPL( short, short) +RUN_FILTER2D_3X3_IMPL( float, uchar ) +RUN_FILTER2D_3X3_IMPL( float, ushort) +RUN_FILTER2D_3X3_IMPL( float, short) +RUN_FILTER2D_3X3_IMPL( float, float) + +#undef RUN_FILTER2D_3X3_IMPL + +//----------------------------- +// +// Fluid kernels: Erode, Dilate +// +//----------------------------- + +#define RUN_MORPHOLOGY3X3_IMPL(T) \ +void run_morphology3x3_impl(T out[], const T *in[], int width, int chan, \ + const uchar k[], MorphShape k_type, \ + Morphology morphology); + +RUN_MORPHOLOGY3X3_IMPL(uchar ) +RUN_MORPHOLOGY3X3_IMPL(ushort) +RUN_MORPHOLOGY3X3_IMPL( short) +RUN_MORPHOLOGY3X3_IMPL( float) + +#undef RUN_MORPHOLOGY3X3_IMPL + +//--------------------------- // -// Fluid kernels: Sobel +// Fluid kernels: Median blur // -//--------------------- - -#define RUN_SOBEL_ROW(DST, SRC) \ -void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \ - const float kx[], 
const float ky[], int border, \ - float scale, float delta, float *buf[], \ - int y, int y0); - -RUN_SOBEL_ROW(uchar , uchar ) -RUN_SOBEL_ROW(ushort, ushort) -RUN_SOBEL_ROW( short, uchar ) -RUN_SOBEL_ROW( short, ushort) -RUN_SOBEL_ROW( short, short) -RUN_SOBEL_ROW( float, uchar ) -RUN_SOBEL_ROW( float, ushort) -RUN_SOBEL_ROW( float, short) -RUN_SOBEL_ROW( float, float) - -#undef RUN_SOBEL_ROW +//--------------------------- + +#define RUN_MEDBLUR3X3_IMPL(T) \ +void run_medblur3x3_impl(T out[], const T *in[], int width, int chan); + +RUN_MEDBLUR3X3_IMPL(uchar ) +RUN_MEDBLUR3X3_IMPL(ushort) +RUN_MEDBLUR3X3_IMPL( short) +RUN_MEDBLUR3X3_IMPL( float) + +#undef RUN_MEDBLUR3X3_IMPL //---------------------------------------------------------------------- #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY +#if CV_SIMD +template +static inline v_float32 vx_load_f32(const SRC* ptr) +{ + if (std::is_same::value) + { + v_uint32 tmp = vx_load_expand_q(reinterpret_cast(ptr)); + return v_cvt_f32(v_reinterpret_as_s32(tmp)); + } + + if (std::is_same::value) + { + v_uint32 tmp = vx_load_expand(reinterpret_cast(ptr)); + return v_cvt_f32(v_reinterpret_as_s32(tmp)); + } + + if (std::is_same::value) + { + v_int32 tmp = vx_load_expand(reinterpret_cast(ptr)); + return v_cvt_f32(tmp); + } + + if (std::is_same::value) + { + v_float32 tmp = vx_load(reinterpret_cast(ptr)); + return tmp; + } + + CV_Error(cv::Error::StsBadArg, "unsupported type"); +} +#endif // CV_SIMD + //---------------------------------- // // Fluid kernels: RGB2Gray, BGR2Gray @@ -309,187 +402,359 @@ void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef } } -//--------------------- +//------------------------- // -// Fluid kernels: Sobel +// Fluid kernels: sepFilter // -//--------------------- +//------------------------- -// Sobel 3x3: vertical pass -template -static void run_sobel3x3_vert(DST out[], int length, const float ky[], - float scale, float delta, const int r[], float *buf[]) +#if CV_SIMD +// this variant not using buf[] appears 15% faster than reference any-2-float code below +template +static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width, int chan, + const float kx[], const float ky[], int border, + float scale, float delta) { - float ky0 = ky[0], - ky1 = ky[1], - ky2 = ky[2]; + const int length = width * chan; + const int shift = border * chan; - int r0 = r[0], - r1 = r[1], - r2 = r[2]; + const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2]; + const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2]; -#if CV_SIMD - // for floating-point output, - // manual vectoring may be not better than compiler's optimization -#define EXPLICIT_SIMD_32F 0 // 1=vectorize 32f case explicitly, 0=don't -#if EXPLICIT_SIMD_32F - if (std::is_same::value && length >= v_int16::nlanes) + for (int l=0; l < length; ) { - constexpr static int nlanes = v_float32::nlanes; + static const int nlanes = v_float32::nlanes; - for (int l=0; l < length; ) + // main part + for ( ; l <= length - nlanes; l += nlanes) { - for (; l <= length - nlanes; l += nlanes) + auto xsum = [l, shift, kx0, kx1, kx2](const SRC i[]) { - v_float32 sum = vx_load(&buf[r0][l]) * vx_setall_f32(ky0); - sum = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum); - sum = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum); - - if (!noscale) - { - sum = v_fma(sum, vx_setall_f32(scale), vx_setall_f32(delta)); - } - - v_store(reinterpret_cast(&out[l]), sum); - } - - if (l < length) + v_float32 t0 = vx_load_f32(&i[l - shift]); + v_float32 t1 = vx_load_f32(&i[l ]); + 
v_float32 t2 = vx_load_f32(&i[l + shift]); + v_float32 t = t0 * vx_setall_f32(kx0); + t = v_fma(t1, vx_setall_f32(kx1), t); + t = v_fma(t2, vx_setall_f32(kx2), t); + return t; + }; + + v_float32 s0 = xsum(in[0]); + v_float32 s1 = xsum(in[1]); + v_float32 s2 = xsum(in[2]); + v_float32 s = s0 * vx_setall_f32(ky0); + s = v_fma(s1, vx_setall_f32(ky1), s); + s = v_fma(s2, vx_setall_f32(ky2), s); + + if (!noscale) { - // tail: recalculate last pixels - GAPI_DbgAssert(length >= nlanes); - l = length - nlanes; + s = v_fma(s, vx_setall_f32(scale), vx_setall_f32(delta)); } + + v_store(&out[l], s); } - return; + // tail (if any) + if (l < length) + { + GAPI_DbgAssert(length >= nlanes); + l = length - nlanes; + } } -#endif +} + +// this variant with manually vectored rounding to short/ushort appears 10-40x faster +// than reference code below +template +static void run_sepfilter3x3_any2short(DST out[], const SRC *in[], int width, int chan, + const float kx[], const float ky[], int border, + float scale, float delta, + float *buf[], int y, int y0) +{ + int r[3]; + r[0] = (y - y0 ) % 3; // buf[r[0]]: previous + r[1] = (y - y0 + 1) % 3; // this + r[2] = (y - y0 + 2) % 3; // next row + + const int length = width * chan; + const int shift = border * chan; + + const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2]; + const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2]; + + // horizontal pass + + int k0 = (y == y0)? 0: 2; - if ((std::is_same::value || std::is_same::value) - && length >= v_int16::nlanes) + for (int k = k0; k < 3; k++) { - constexpr static int nlanes = v_int16::nlanes; + // previous , this , next pixel + const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift}; - for (int l=0; l < length; ) + // rely on compiler vectoring + for (int l=0; l < length; l++) { - for (; l <= length - nlanes; l += nlanes) - { - v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0); - sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0); - sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0); + buf[r[k]][l] = s[0][l]*kx0 + s[1][l]*kx1 + s[2][l]*kx2; + } + } + + // vertical pass - v_float32 sum1 = vx_load(&buf[r0][l + nlanes/2]) * vx_setall_f32(ky0); - sum1 = v_fma(vx_load(&buf[r1][l + nlanes/2]), vx_setall_f32(ky1), sum1); - sum1 = v_fma(vx_load(&buf[r2][l + nlanes/2]), vx_setall_f32(ky2), sum1); + const int r0=r[0], r1=r[1], r2=r[2]; - if (!noscale) - { - sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta)); - sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta)); - } + for (int l=0; l < length;) + { + constexpr int nlanes = v_int16::nlanes; - v_int32 isum0 = v_round(sum0), - isum1 = v_round(sum1); + // main part of row + for (; l <= length - nlanes; l += nlanes) + { + v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0); + sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0); + sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0); - if (std::is_same::value) - { - // signed short - v_int16 res = v_pack(isum0, isum1); - v_store(reinterpret_cast(&out[l]), res); - } else - { - // unsigned short - v_uint16 res = v_pack_u(isum0, isum1); - v_store(reinterpret_cast(&out[l]), res); - } + v_float32 sum1 = vx_load(&buf[r0][l + nlanes/2]) * vx_setall_f32(ky0); + sum1 = v_fma(vx_load(&buf[r1][l + nlanes/2]), vx_setall_f32(ky1), sum1); + sum1 = v_fma(vx_load(&buf[r2][l + nlanes/2]), vx_setall_f32(ky2), sum1); + + if (!noscale) + { + sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta)); + sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta)); } - if (l < 
length) + v_int32 isum0 = v_round(sum0), + isum1 = v_round(sum1); + + if (std::is_same::value) { - // tail: recalculate last pixels - GAPI_DbgAssert(length >= nlanes); - l = length - nlanes; + // signed short + v_int16 res = v_pack(isum0, isum1); + v_store(reinterpret_cast(&out[l]), res); + } else + { + // unsigned short + v_uint16 res = v_pack_u(isum0, isum1); + v_store(reinterpret_cast(&out[l]), res); } } - return; + // tail (if any) + if (l < length) + { + GAPI_DbgAssert(length >= nlanes); + l = length - nlanes; + } } +} - if (std::is_same::value && length >= v_uint8::nlanes) +// this code with manually vectored rounding to uchar is 10-40x faster than reference +template +static void run_sepfilter3x3_any2char(uchar out[], const SRC *in[], int width, int chan, + const float kx[], const float ky[], int border, + float scale, float delta, + float *buf[], int y, int y0) +{ + int r[3]; + r[0] = (y - y0 ) % 3; // buf[r[0]]: previous + r[1] = (y - y0 + 1) % 3; // this + r[2] = (y - y0 + 2) % 3; // next row + + const int length = width * chan; + const int shift = border * chan; + + const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2]; + const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2]; + + // horizontal pass + + int k0 = (y == y0)? 0: 2; + + for (int k = k0; k < 3; k++) { - constexpr static int nlanes = v_uint8::nlanes; + // previous , this , next pixel + const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift}; - for (int l=0; l < length; ) + // rely on compiler vectoring + for (int l=0; l < length; l++) { - for (; l <= length - nlanes; l += nlanes) + buf[r[k]][l] = s[0][l]*kx0 + s[1][l]*kx1 + s[2][l]*kx2; + } + } + + // vertical pass + + const int r0=r[0], r1=r[1], r2=r[2]; + + for (int l=0; l < length;) + { + constexpr int nlanes = v_uint8::nlanes; + + // main part of row + for (; l <= length - nlanes; l += nlanes) + { + v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0); + sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0); + sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0); + + v_float32 sum1 = vx_load(&buf[r0][l + nlanes/4]) * vx_setall_f32(ky0); + sum1 = v_fma(vx_load(&buf[r1][l + nlanes/4]), vx_setall_f32(ky1), sum1); + sum1 = v_fma(vx_load(&buf[r2][l + nlanes/4]), vx_setall_f32(ky2), sum1); + + v_float32 sum2 = vx_load(&buf[r0][l + 2*nlanes/4]) * vx_setall_f32(ky0); + sum2 = v_fma(vx_load(&buf[r1][l + 2*nlanes/4]), vx_setall_f32(ky1), sum2); + sum2 = v_fma(vx_load(&buf[r2][l + 2*nlanes/4]), vx_setall_f32(ky2), sum2); + + v_float32 sum3 = vx_load(&buf[r0][l + 3*nlanes/4]) * vx_setall_f32(ky0); + sum3 = v_fma(vx_load(&buf[r1][l + 3*nlanes/4]), vx_setall_f32(ky1), sum3); + sum3 = v_fma(vx_load(&buf[r2][l + 3*nlanes/4]), vx_setall_f32(ky2), sum3); + + if (!noscale) { - v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0); - sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0); - sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0); + sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta)); + sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta)); + sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta)); + sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta)); + } - v_float32 sum1 = vx_load(&buf[r0][l + nlanes/4]) * vx_setall_f32(ky0); - sum1 = v_fma(vx_load(&buf[r1][l + nlanes/4]), vx_setall_f32(ky1), sum1); - sum1 = v_fma(vx_load(&buf[r2][l + nlanes/4]), vx_setall_f32(ky2), sum1); + v_int32 isum0 = v_round(sum0), + isum1 = v_round(sum1), + isum2 = v_round(sum2), + isum3 = v_round(sum3); - v_float32 sum2 = 
vx_load(&buf[r0][l + 2*nlanes/4]) * vx_setall_f32(ky0); - sum2 = v_fma(vx_load(&buf[r1][l + 2*nlanes/4]), vx_setall_f32(ky1), sum2); - sum2 = v_fma(vx_load(&buf[r2][l + 2*nlanes/4]), vx_setall_f32(ky2), sum2); + v_int16 ires0 = v_pack(isum0, isum1), + ires1 = v_pack(isum2, isum3); - v_float32 sum3 = vx_load(&buf[r0][l + 3*nlanes/4]) * vx_setall_f32(ky0); - sum3 = v_fma(vx_load(&buf[r1][l + 3*nlanes/4]), vx_setall_f32(ky1), sum3); - sum3 = v_fma(vx_load(&buf[r2][l + 3*nlanes/4]), vx_setall_f32(ky2), sum3); + v_uint8 res = v_pack_u(ires0, ires1); + v_store(reinterpret_cast(&out[l]), res); + } - if (!noscale) - { - sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta)); - sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta)); - sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta)); - sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta)); - } + // tail (if any) + if (l < length) + { + GAPI_DbgAssert(length >= nlanes); + l = length - nlanes; + } + } +} - v_int32 isum0 = v_round(sum0), - isum1 = v_round(sum1), - isum2 = v_round(sum2), - isum3 = v_round(sum3); +// this code manually vectored for int16 not much faster than generic any-to-short code above +#define USE_SEPFILTER3X3_CHAR2SHORT 1 - v_int16 ires0 = v_pack(isum0, isum1), - ires1 = v_pack(isum2, isum3); +#if USE_SEPFILTER3X3_CHAR2SHORT +template +static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int width, int chan, + const float kx[], const float ky[], int border, + float scale, float delta, + float *buf[], int y, int y0) +{ + const schar ikx0 = saturate(kx[0], rintf); + const schar ikx1 = saturate(kx[1], rintf); + const schar ikx2 = saturate(kx[2], rintf); + + const schar iky0 = saturate(ky[0], rintf); + const schar iky1 = saturate(ky[1], rintf); + const schar iky2 = saturate(ky[2], rintf); + + const short iscale = saturate(scale * (1 << 15), rintf); + const short idelta = saturate(delta , rintf); + + // check if this code is applicable + if (ikx0 != kx[0] || ikx1 != kx[1] || ikx2 != kx[2] || + iky0 != ky[0] || iky1 != ky[1] || iky2 != ky[2] || + idelta != delta || + std::abs(scale) > 1 || std::abs(scale) < 0.01) + { + run_sepfilter3x3_any2short(out, in, width, chan, kx, ky, border, scale, delta, + buf, y, y0); + return; + } + + short *ibuf[3]; + ibuf[0] = reinterpret_cast(buf[0]); + ibuf[1] = reinterpret_cast(buf[1]); + ibuf[2] = reinterpret_cast(buf[2]); + + int r[3]; + r[0] = (y - y0 ) % 3; // buf[r[0]]: previous + r[1] = (y - y0 + 1) % 3; // this + r[2] = (y - y0 + 2) % 3; // next row + + const int length = width * chan; + const int shift = border * chan; + + // horizontal pass + + int k0 = (y == y0)? 
0: 2; + + for (int k = k0; k < 3; k++) + { + for (int l=0; l < length;) + { + constexpr int nlanes = v_int16::nlanes; - v_uint8 res = v_pack_u(ires0, ires1); - v_store(reinterpret_cast(&out[l]), res); + // main part of output row + for (; l <= length - nlanes; l += nlanes) + { + v_uint16 t0 = vx_load_expand(&in[k][l - shift]); // previous + v_uint16 t1 = vx_load_expand(&in[k][l ]); // current + v_uint16 t2 = vx_load_expand(&in[k][l + shift]); // next pixel + v_int16 t = v_reinterpret_as_s16(t0) * vx_setall_s16(ikx0) + + v_reinterpret_as_s16(t1) * vx_setall_s16(ikx1) + + v_reinterpret_as_s16(t2) * vx_setall_s16(ikx2); + v_store(&ibuf[r[k]][l], t); } + // tail (if any) if (l < length) { - // tail: recalculate last pixels GAPI_DbgAssert(length >= nlanes); l = length - nlanes; } } - - return; } -#endif - // reference code - for (int l=0; l < length; l++) + // vertical pass + + for (int l=0; l < length;) { - float sum = buf[r0][l]*ky0 + buf[r1][l]*ky1 + buf[r2][l]*ky2; + constexpr int nlanes = v_int16::nlanes; - if (!noscale) + // main part of output row + for (; l <= length - nlanes; l += nlanes) { - sum = sum*scale + delta; + v_int16 s0 = vx_load(&ibuf[r[0]][l]); // previous + v_int16 s1 = vx_load(&ibuf[r[1]][l]); // current + v_int16 s2 = vx_load(&ibuf[r[2]][l]); // next row + v_int16 s = s0 * vx_setall_s16(iky0) + + s1 * vx_setall_s16(iky1) + + s2 * vx_setall_s16(iky2); + + if (!noscale) + { + s = v_mul_hi(s << 1, vx_setall_s16(iscale)) + vx_setall_s16(idelta); + } + + v_store(&out[l], s); } - out[l] = cv::gapi::own::saturate(sum, rintf); + // tail (if any) + if (l < length) + { + GAPI_DbgAssert(length >= nlanes); + l = length - nlanes; + } } } +#endif -template -static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan, - const float kx[], const float ky[], int border, - float scale, float delta, float *buf[], - int y, int y0) +#endif // CV_SIMD + +template +static void run_sepfilter3x3_reference(DST out[], const SRC *in[], int width, int chan, + const float kx[], const float ky[], int border, + float scale, float delta, + float *buf[], int y, int y0) { int r[3]; r[0] = (y - y0) % 3; // buf[r[0]]: previous @@ -497,19 +762,21 @@ static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan, r[2] = (y - y0 + 2) % 3; // next row int length = width * chan; + int shift = border * chan; // horizontal pass // full horizontal pass is needed only if very 1st row in ROI; // for 2nd and further rows, it is enough to convolve only the // "next" row - as we can reuse buffers from previous calls to - // this kernel (note that Fluid processes rows consequently) + // this kernel (Fluid does rows consequently: y=y0, y0+1, ...) + int k0 = (y == y0)? 
0: 2; for (int k = k0; k < 3; k++) { - // previous, this , next pixel - const SRC *s[3] = {in[k] - border*chan , in[k], in[k] + border*chan}; + // previous , this , next pixel + const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift}; // rely on compiler vectoring for (int l=0; l < length; l++) @@ -519,37 +786,991 @@ static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan, } // vertical pass - if (scale == 1 && delta == 0) + + for (int l=0; l < length; l++) + { + float sum = buf[r[0]][l]*ky[0] + buf[r[1]][l]*ky[1] + buf[r[2]][l]*ky[2]; + + if (!noscale) + { + sum = sum*scale + delta; + } + + out[l] = saturate(sum, rintf); + } +} + +template +static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int chan, + const float kx[], const float ky[], int border, + float scale, float delta, + float *buf[], int y, int y0) +{ +#if CV_SIMD + int length = width * chan; + + // length variable may be unused if types do not match at 'if' statements below + (void) length; + +#if USE_SEPFILTER3X3_CHAR2SHORT + if (std::is_same::value && std::is_same::value && + length >= v_int16::nlanes) + { + // only slightly faster than more generic any-to-short (see below) + run_sepfilter3x3_char2short(reinterpret_cast(out), + reinterpret_cast(in), + width, chan, kx, ky, border, scale, delta, + buf, y, y0); + return; + } +#endif + + if (std::is_same::value && std::is_same::value && + length >= v_float32::nlanes) + { + // appears 15% faster than reference any-to-float code (called below) + run_sepfilter3x3_any2float(reinterpret_cast(out), in, + width, chan, kx, ky, border, scale, delta); + return; + } + + if (std::is_same::value && length >= v_int16::nlanes) + { + // appears 10-40x faster than reference due to much faster rounding + run_sepfilter3x3_any2short(reinterpret_cast(out), in, + width, chan, kx, ky, border, scale, delta, + buf, y, y0); + return; + } + + if (std::is_same::value && length >= v_uint16::nlanes) + { + // appears 10-40x faster than reference due to much faster rounding + run_sepfilter3x3_any2short(reinterpret_cast(out), in, + width, chan, kx, ky, border, scale, delta, + buf, y, y0); + return; + } + + if (std::is_same::value && length >= v_uint8::nlanes) { - constexpr static bool noscale = true; // omit scaling - run_sobel3x3_vert(out, length, ky, scale, delta, r, buf); - } else + // appears 10-40x faster than reference due to much faster rounding + run_sepfilter3x3_any2char(reinterpret_cast(out), in, + width, chan, kx, ky, border, scale, delta, + buf, y, y0); + return; + } +#endif // CV_SIMD + + // reference code is quite fast for any-to-float case, + // but not for any-to-integral due to very slow rounding + run_sepfilter3x3_reference(out, in, width, chan, kx, ky, border, + scale, delta, buf, y, y0); +} + +#define RUN_SEPFILTER3X3_IMPL(DST, SRC) \ +void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \ + const float kx[], const float ky[], int border, \ + float scale, float delta, \ + float *buf[], int y, int y0) \ +{ \ + if (scale == 1 && delta == 0) \ + { \ + constexpr bool noscale = true; \ + run_sepfilter3x3_code(out, in, width, chan, kx, ky, border, \ + scale, delta, buf, y, y0); \ + } \ + else \ + { \ + constexpr bool noscale = false; \ + run_sepfilter3x3_code(out, in, width, chan, kx, ky, border, \ + scale, delta, buf, y, y0); \ + } \ +} + +RUN_SEPFILTER3X3_IMPL(uchar , uchar ) +RUN_SEPFILTER3X3_IMPL( short, uchar ) +RUN_SEPFILTER3X3_IMPL( float, uchar ) +RUN_SEPFILTER3X3_IMPL(ushort, ushort) +RUN_SEPFILTER3X3_IMPL( short, ushort) 
+RUN_SEPFILTER3X3_IMPL( float, ushort) +RUN_SEPFILTER3X3_IMPL( short, short) +RUN_SEPFILTER3X3_IMPL( float, short) +RUN_SEPFILTER3X3_IMPL( float, float) + +#undef RUN_SEPFILTER3X3_IMPL + +//------------------------- +// +// Fluid kernels: Filter 2D +// +//------------------------- + +template +static void run_filter2d_3x3_reference(DST out[], const SRC *in[], int width, int chan, + const float kernel[], float scale, float delta) +{ + static constexpr int ksize = 3; + static constexpr int border = (ksize - 1) / 2; + + const int length = width * chan; + const int shift = border * chan; + + const float k[3][3] = {{ kernel[0], kernel[1], kernel[2] }, + { kernel[3], kernel[4], kernel[5] }, + { kernel[6], kernel[7], kernel[8] }}; + + for (int l=0; l < length; l++) { - constexpr static bool noscale = false; // do scaling - run_sobel3x3_vert(out, length, ky, scale, delta, r, buf); + float sum = in[0][l - shift] * k[0][0] + in[0][l] * k[0][1] + in[0][l + shift] * k[0][2] + + in[1][l - shift] * k[1][0] + in[1][l] * k[1][1] + in[1][l + shift] * k[1][2] + + in[2][l - shift] * k[2][0] + in[2][l] * k[2][1] + in[2][l + shift] * k[2][2]; + + if (!noscale) + { + sum = sum*scale + delta; + } + + out[l] = saturate(sum, rintf); } } -#define RUN_SOBEL_ROW(DST, SRC) \ -void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \ - const float kx[], const float ky[], int border, \ - float scale, float delta, float *buf[], \ - int y, int y0) \ -{ \ - run_sobel_impl(out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0); \ +#if CV_SIMD +// assume DST is short or ushort +template +static void run_filter2d_3x3_any2short(DST out[], const SRC *in[], int width, int chan, + const float kernel[], float scale, float delta) +{ + static constexpr int ksize = 3; + static constexpr int border = (ksize - 1) / 2; + + const int length = width * chan; + const int shift = border * chan; + + const float k[3][3] = { + { kernel[0], kernel[1], kernel[2] }, + { kernel[3], kernel[4], kernel[5] }, + { kernel[6], kernel[7], kernel[8] } + }; + + for (int l=0; l < length;) + { + static constexpr int nlanes = v_int16::nlanes; + + // main part of output row + for (; l <= length - nlanes; l += nlanes) + { + auto sumx = [in, shift, &k](int i, int j) + { + v_float32 s = vx_load_f32(&in[i][j - shift]) * vx_setall_f32(k[i][0]); + s = v_fma(vx_load_f32(&in[i][j ]), vx_setall_f32(k[i][1]), s); + s = v_fma(vx_load_f32(&in[i][j + shift]), vx_setall_f32(k[i][2]), s); + return s; + }; + + int l0 = l; + int l1 = l + nlanes/2; + v_float32 sum0 = sumx(0, l0) + sumx(1, l0) + sumx(2, l0); + v_float32 sum1 = sumx(0, l1) + sumx(1, l1) + sumx(2, l1); + + if (!noscale) + { + sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta)); + sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta)); + } + + v_int32 res0 = v_round(sum0); + v_int32 res1 = v_round(sum1); + + if (std::is_same::value) + { + v_uint16 res = v_pack_u(res0, res1); + v_store(reinterpret_cast(&out[l]), res); + } + else // if DST == short + { + v_int16 res = v_pack(res0, res1); + v_store(reinterpret_cast(&out[l]), res); + } + } + + // tail (if any) + if (l < length) + { + GAPI_DbgAssert(length >= nlanes); + l = length - nlanes; + } + } } -RUN_SOBEL_ROW(uchar , uchar ) -RUN_SOBEL_ROW(ushort, ushort) -RUN_SOBEL_ROW( short, uchar ) -RUN_SOBEL_ROW( short, ushort) -RUN_SOBEL_ROW( short, short) -RUN_SOBEL_ROW( float, uchar ) -RUN_SOBEL_ROW( float, ushort) -RUN_SOBEL_ROW( float, short) -RUN_SOBEL_ROW( float, float) - -#undef RUN_SOBEL_ROW +template +static void 
run_filter2d_3x3_any2char(uchar out[], const SRC *in[], int width, int chan, + const float kernel[], float scale, float delta) +{ + static constexpr int ksize = 3; + static constexpr int border = (ksize - 1) / 2; + + const int length = width * chan; + const int shift = border * chan; + + const float k[3][3] = { + { kernel[0], kernel[1], kernel[2] }, + { kernel[3], kernel[4], kernel[5] }, + { kernel[6], kernel[7], kernel[8] } + }; + + for (int l=0; l < length;) + { + static constexpr int nlanes = v_uint8::nlanes; + + // main part of output row + for (; l <= length - nlanes; l += nlanes) + { + auto sumx = [in, shift, &k](int i, int j) + { + v_float32 s = vx_load_f32(&in[i][j - shift]) * vx_setall_f32(k[i][0]); + s = v_fma(vx_load_f32(&in[i][j ]), vx_setall_f32(k[i][1]), s); + s = v_fma(vx_load_f32(&in[i][j + shift]), vx_setall_f32(k[i][2]), s); + return s; + }; + + int l0 = l; + int l1 = l + nlanes/4; + int l2 = l + 2*nlanes/4; + int l3 = l + 3*nlanes/4; + v_float32 sum0 = sumx(0, l0) + sumx(1, l0) + sumx(2, l0); + v_float32 sum1 = sumx(0, l1) + sumx(1, l1) + sumx(2, l1); + v_float32 sum2 = sumx(0, l2) + sumx(1, l2) + sumx(2, l2); + v_float32 sum3 = sumx(0, l3) + sumx(1, l3) + sumx(2, l3); + + if (!noscale) + { + sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta)); + sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta)); + sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta)); + sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta)); + } + + v_int32 res0 = v_round(sum0); + v_int32 res1 = v_round(sum1); + v_int32 res2 = v_round(sum2); + v_int32 res3 = v_round(sum3); + + v_int16 resl = v_pack(res0, res1); + v_int16 resh = v_pack(res2, res3); + v_uint8 res = v_pack_u(resl, resh); + + v_store(&out[l], res); + } + + // tail (if any) + if (l < length) + { + GAPI_DbgAssert(length >= nlanes); + l = length - nlanes; + } + } +} +#endif + +template +static void run_filter2d_3x3_code(DST out[], const SRC *in[], int width, int chan, + const float kernel[], float scale, float delta) +{ +#if CV_SIMD + int length = width * chan; + + // length variable may be unused if types do not match at 'if' statements below + (void) length; + + if (std::is_same::value && length >= v_int16::nlanes) + { + run_filter2d_3x3_any2short(reinterpret_cast(out), in, + width, chan, kernel, scale, delta); + return; + } + + if (std::is_same::value && length >= v_uint16::nlanes) + { + run_filter2d_3x3_any2short(reinterpret_cast(out), in, + width, chan, kernel, scale, delta); + return; + } + + + if (std::is_same::value && length >= v_uint8::nlanes) + { + run_filter2d_3x3_any2char(reinterpret_cast(out), in, + width, chan, kernel, scale, delta); + return; + } +#endif // CV_SIMD + + run_filter2d_3x3_reference(out, in, width, chan, kernel, scale, delta); +} + +#define RUN_FILTER2D_3X3_IMPL(DST, SRC) \ +void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan, \ + const float kernel[], float scale, float delta) \ +{ \ + if (scale == 1 && delta == 0) \ + { \ + constexpr bool noscale = true; \ + run_filter2d_3x3_code(out, in, width, chan, kernel, scale, delta); \ + } \ + else \ + { \ + constexpr bool noscale = false; \ + run_filter2d_3x3_code(out, in, width, chan, kernel, scale, delta); \ + } \ +} + +RUN_FILTER2D_3X3_IMPL(uchar , uchar ) +RUN_FILTER2D_3X3_IMPL(ushort, ushort) +RUN_FILTER2D_3X3_IMPL( short, short) +RUN_FILTER2D_3X3_IMPL( float, uchar ) +RUN_FILTER2D_3X3_IMPL( float, ushort) +RUN_FILTER2D_3X3_IMPL( float, short) +RUN_FILTER2D_3X3_IMPL( float, float) + +#undef 
RUN_FILTER2D_3X3_IMPL + +//----------------------------- +// +// Fluid kernels: Erode, Dilate +// +//----------------------------- + +template +static void run_morphology3x3_reference(T out[], const T *in[], int width, int chan, + const uchar k[], MorphShape k_type, + Morphology morphology) +{ + constexpr int k_size = 3; + constexpr int border = (k_size - 1) / 2; + + const uchar kernel[3][3] = {{k[0], k[1], k[2]}, {k[3], k[4], k[5]}, {k[6], k[7], k[8]}}; + + const int length = width * chan; + const int shift = border * chan; + + if (M_ERODE == morphology) + { + if (M_FULL == k_type) + { + for (int l=0; l < length; l++) + { + T result = std::numeric_limits::max(); + + result = (std::min)(result, in[0][l - shift]); + result = (std::min)(result, in[0][l ]); + result = (std::min)(result, in[0][l + shift]); + + result = (std::min)(result, in[1][l - shift]); + result = (std::min)(result, in[1][l ]); + result = (std::min)(result, in[1][l + shift]); + + result = (std::min)(result, in[2][l - shift]); + result = (std::min)(result, in[2][l ]); + result = (std::min)(result, in[2][l + shift]); + + out[l] = result; + } + return; + } + + if (M_CROSS == k_type) + { + for (int l=0; l < length; l++) + { + T result = std::numeric_limits::max(); + + // result = (std::min)(result, in[0][l - shift]); + result = (std::min)(result, in[0][l ]); + // result = (std::min)(result, in[0][l + shift]); + + result = (std::min)(result, in[1][l - shift]); + result = (std::min)(result, in[1][l ]); + result = (std::min)(result, in[1][l + shift]); + + // result = (std::min)(result, in[2][l - shift]); + result = (std::min)(result, in[2][l ]); + // result = (std::min)(result, in[2][l + shift]); + + out[l] = result; + } + return; + } + + for (int l=0; l < length; l++) + { + T result = std::numeric_limits::max(); + + result = kernel[0][0]? (std::min)(result, in[0][l - shift]): result; + result = kernel[0][1]? (std::min)(result, in[0][l ]): result; + result = kernel[0][2]? (std::min)(result, in[0][l + shift]): result; + + result = kernel[1][0]? (std::min)(result, in[1][l - shift]): result; + result = kernel[1][1]? (std::min)(result, in[1][l ]): result; + result = kernel[1][2]? (std::min)(result, in[1][l + shift]): result; + + result = kernel[2][0]? (std::min)(result, in[2][l - shift]): result; + result = kernel[2][1]? (std::min)(result, in[2][l ]): result; + result = kernel[2][2]? 
(std::min)(result, in[2][l + shift]): result; + + out[l] = result; + } + return; + } + + if (M_DILATE == morphology) + { + if (M_FULL == k_type) + { + for (int l=0; l < length; l++) + { + T result = std::numeric_limits::min(); + + result = (std::max)(result, in[0][l - shift]); + result = (std::max)(result, in[0][l ]); + result = (std::max)(result, in[0][l + shift]); + + result = (std::max)(result, in[1][l - shift]); + result = (std::max)(result, in[1][l ]); + result = (std::max)(result, in[1][l + shift]); + + result = (std::max)(result, in[2][l - shift]); + result = (std::max)(result, in[2][l ]); + result = (std::max)(result, in[2][l + shift]); + + out[l] = result; + } + return; + } + + if (M_CROSS == k_type) + { + for (int l=0; l < length; l++) + { + T result = std::numeric_limits::min(); + + // result = (std::max)(result, in[0][l - shift]); + result = (std::max)(result, in[0][l ]); + // result = (std::max)(result, in[0][l + shift]); + + result = (std::max)(result, in[1][l - shift]); + result = (std::max)(result, in[1][l ]); + result = (std::max)(result, in[1][l + shift]); + + // result = (std::max)(result, in[2][l - shift]); + result = (std::max)(result, in[2][l ]); + // result = (std::max)(result, in[2][l + shift]); + + out[l] = result; + } + return; + } + + for (int l=0; l < length; l++) + { + T result = std::numeric_limits::min(); + + result = kernel[0][0]? (std::max)(result, in[0][l - shift]): result; + result = kernel[0][1]? (std::max)(result, in[0][l ]): result; + result = kernel[0][2]? (std::max)(result, in[0][l + shift]): result; + + result = kernel[1][0]? (std::max)(result, in[1][l - shift]): result; + result = kernel[1][1]? (std::max)(result, in[1][l ]): result; + result = kernel[1][2]? (std::max)(result, in[1][l + shift]): result; + + result = kernel[2][0]? (std::max)(result, in[2][l - shift]): result; + result = kernel[2][1]? (std::max)(result, in[2][l ]): result; + result = kernel[2][2]? 
(std::max)(result, in[2][l + shift]): result; + + out[l] = result; + } + return; + } + + CV_Error(cv::Error::StsBadArg, "unsupported morphology"); +} + +#if CV_SIMD +template +static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan, + const uchar k[], MorphShape k_type, + Morphology morphology, + S setall) +{ + constexpr int k_size = 3; + constexpr int border = (k_size - 1) / 2; + + const uchar kernel[3][3] = {{k[0], k[1], k[2]}, {k[3], k[4], k[5]}, {k[6], k[7], k[8]}}; + + const int length = width * chan; + const int shift = border * chan; + + if (M_ERODE == morphology) + { + if (M_FULL == k_type) + { + for (int l=0; l < length;) + { + constexpr int nlanes = VT::nlanes; + + // main part of output row + for (; l <= length - nlanes; l += nlanes) + { + VT r = setall(std::numeric_limits::max()); + + r = v_min(r, vx_load(&in[0][l - shift])); + r = v_min(r, vx_load(&in[0][l ])); + r = v_min(r, vx_load(&in[0][l + shift])); + + r = v_min(r, vx_load(&in[1][l - shift])); + r = v_min(r, vx_load(&in[1][l ])); + r = v_min(r, vx_load(&in[1][l + shift])); + + r = v_min(r, vx_load(&in[2][l - shift])); + r = v_min(r, vx_load(&in[2][l ])); + r = v_min(r, vx_load(&in[2][l + shift])); + + v_store(&out[l], r); + } + + // tail (if any) + if (l < length) + { + GAPI_DbgAssert(length >= nlanes); + l = length - nlanes; + } + } + return; + } + + if (M_CROSS == k_type) + { + for (int l=0; l < length;) + { + constexpr int nlanes = VT::nlanes; + + // main part of output row + for (; l <= length - nlanes; l += nlanes) + { + VT r = setall(std::numeric_limits::max()); + + // r = v_min(r, vx_load(&in[0][l - shift])); + r = v_min(r, vx_load(&in[0][l ])); + // r = v_min(r, vx_load(&in[0][l + shift])); + + r = v_min(r, vx_load(&in[1][l - shift])); + r = v_min(r, vx_load(&in[1][l ])); + r = v_min(r, vx_load(&in[1][l + shift])); + + // r = v_min(r, vx_load(&in[2][l - shift])); + r = v_min(r, vx_load(&in[2][l ])); + // r = v_min(r, vx_load(&in[2][l + shift])); + + v_store(&out[l], r); + } + + // tail (if any) + if (l < length) + { + GAPI_DbgAssert(length >= nlanes); + l = length - nlanes; + } + } + return; + } + + for (int l=0; l < length;) + { + constexpr int nlanes = VT::nlanes; + + // main part of output row + for (; l <= length - nlanes; l += nlanes) + { + VT r = setall(std::numeric_limits::max()); + + if (kernel[0][0]) r = v_min(r, vx_load(&in[0][l - shift])); + if (kernel[0][1]) r = v_min(r, vx_load(&in[0][l ])); + if (kernel[0][2]) r = v_min(r, vx_load(&in[0][l + shift])); + + if (kernel[1][0]) r = v_min(r, vx_load(&in[1][l - shift])); + if (kernel[1][1]) r = v_min(r, vx_load(&in[1][l ])); + if (kernel[1][2]) r = v_min(r, vx_load(&in[1][l + shift])); + + if (kernel[2][0]) r = v_min(r, vx_load(&in[2][l - shift])); + if (kernel[2][1]) r = v_min(r, vx_load(&in[2][l ])); + if (kernel[2][2]) r = v_min(r, vx_load(&in[2][l + shift])); + + v_store(&out[l], r); + } + + // tail (if any) + if (l < length) + { + GAPI_DbgAssert(length >= nlanes); + l = length - nlanes; + } + } + return; + } + + if (M_DILATE == morphology) + { + if (M_FULL == k_type) + { + for (int l=0; l < length;) + { + constexpr int nlanes = VT::nlanes; + + // main part of output row + for (; l <= length - nlanes; l += nlanes) + { + VT r = setall(std::numeric_limits::min()); + + r = v_max(r, vx_load(&in[0][l - shift])); + r = v_max(r, vx_load(&in[0][l ])); + r = v_max(r, vx_load(&in[0][l + shift])); + + r = v_max(r, vx_load(&in[1][l - shift])); + r = v_max(r, vx_load(&in[1][l ])); + r = v_max(r, vx_load(&in[1][l + shift])); + + r = v_max(r, 
vx_load(&in[2][l - shift])); + r = v_max(r, vx_load(&in[2][l ])); + r = v_max(r, vx_load(&in[2][l + shift])); + + v_store(&out[l], r); + } + + // tail (if any) + if (l < length) + { + GAPI_DbgAssert(length >= nlanes); + l = length - nlanes; + } + } + return; + } + + if (M_CROSS == k_type) + { + for (int l=0; l < length;) + { + constexpr int nlanes = VT::nlanes; + + // main part of output row + for (; l <= length - nlanes; l += nlanes) + { + VT r = setall(std::numeric_limits::min()); + + // r = v_max(r, vx_load(&in[0][l - shift])); + r = v_max(r, vx_load(&in[0][l ])); + // r = v_max(r, vx_load(&in[0][l + shift])); + + r = v_max(r, vx_load(&in[1][l - shift])); + r = v_max(r, vx_load(&in[1][l ])); + r = v_max(r, vx_load(&in[1][l + shift])); + + // r = v_max(r, vx_load(&in[2][l - shift])); + r = v_max(r, vx_load(&in[2][l ])); + // r = v_max(r, vx_load(&in[2][l + shift])); + + v_store(&out[l], r); + } + + // tail (if any) + if (l < length) + { + GAPI_DbgAssert(length >= nlanes); + l = length - nlanes; + } + } + return; + } + + for (int l=0; l < length;) + { + constexpr int nlanes = VT::nlanes; + + // main part of output row + for (; l <= length - nlanes; l += nlanes) + { + VT r = setall(std::numeric_limits::min()); + + if (kernel[0][0]) r = v_max(r, vx_load(&in[0][l - shift])); + if (kernel[0][1]) r = v_max(r, vx_load(&in[0][l ])); + if (kernel[0][2]) r = v_max(r, vx_load(&in[0][l + shift])); + + if (kernel[1][0]) r = v_max(r, vx_load(&in[1][l - shift])); + if (kernel[1][1]) r = v_max(r, vx_load(&in[1][l ])); + if (kernel[1][2]) r = v_max(r, vx_load(&in[1][l + shift])); + + if (kernel[2][0]) r = v_max(r, vx_load(&in[2][l - shift])); + if (kernel[2][1]) r = v_max(r, vx_load(&in[2][l ])); + if (kernel[2][2]) r = v_max(r, vx_load(&in[2][l + shift])); + + v_store(&out[l], r); + } + + // tail (if any) + if (l < length) + { + GAPI_DbgAssert(length >= nlanes); + l = length - nlanes; + } + } + return; + } + + CV_Error(cv::Error::StsBadArg, "unsupported morphology"); +} +#endif + +template +static void run_morphology3x3_code(T out[], const T *in[], int width, int chan, + const uchar k[], MorphShape k_type, + Morphology morphology) +{ +#if CV_SIMD + int length = width * chan; + + // length variable may be unused if types do not match at 'if' statements below + (void) length; + + if (std::is_same::value && length >= v_float32::nlanes) + { + run_morphology3x3_simd(reinterpret_cast(out), + reinterpret_cast(in), + width, chan, k, k_type, morphology, + vx_setall_f32); + return; + } + + if (std::is_same::value && length >= v_int16::nlanes) + { + run_morphology3x3_simd(reinterpret_cast(out), + reinterpret_cast(in), + width, chan, k, k_type, morphology, + vx_setall_s16); + return; + } + + if (std::is_same::value && length >= v_uint16::nlanes) + { + run_morphology3x3_simd(reinterpret_cast(out), + reinterpret_cast(in), + width, chan, k, k_type, morphology, + vx_setall_u16); + return; + } + + if (std::is_same::value && length >= v_uint8::nlanes) + { + run_morphology3x3_simd(reinterpret_cast(out), + reinterpret_cast(in), + width, chan, k, k_type, morphology, + vx_setall_u8); + return; + } +#endif // CV_SIMD + + run_morphology3x3_reference(out, in, width, chan, k, k_type, morphology); +} + +#define RUN_MORPHOLOGY3X3_IMPL(T) \ +void run_morphology3x3_impl(T out[], const T *in[], int width, int chan, \ + const uchar k[], MorphShape k_type, \ + Morphology morphology) \ +{ \ + run_morphology3x3_code(out, in, width, chan, k, k_type, morphology); \ +} + +RUN_MORPHOLOGY3X3_IMPL(uchar ) +RUN_MORPHOLOGY3X3_IMPL(ushort) 
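// [Editorial sketch, not part of the patch] Every vectorized loop in this
// file handles the ragged tail the same way: when fewer than 'nlanes' pixels
// remain, the index is pulled back to 'length - nlanes' and one more full
// vector is processed over a window that overlaps already-written output.
// This is safe because each output pixel depends only on the inputs, never
// on previously written outputs. A scalar model of that control flow:

#include <cstdio>

static void double_all(int out[], const int in[], int length)
{
    constexpr int nlanes = 4;                        // stand-in vector width
    for (int l = 0; l < length;)
    {
        for (; l <= length - nlanes; l += nlanes)    // main, full-vector part
            for (int k = 0; k < nlanes; k++)
                out[l + k] = 2 * in[l + k];

        if (l < length)                              // tail: redo last window
        {
            // mirrors the GAPI_DbgAssert(length >= nlanes) guard above
            l = length - nlanes;
        }
    }
}

int main()
{
    int in[7] = {1, 2, 3, 4, 5, 6, 7}, out[7];
    double_all(out, in, 7);                          // second pass covers 3..6
    for (int v : out) std::printf("%d ", v);
    return 0;
}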
+RUN_MORPHOLOGY3X3_IMPL( short) +RUN_MORPHOLOGY3X3_IMPL( float) + +#undef RUN_MORPHOLOGY3X3_IMPL + +//--------------------------- +// +// Fluid kernels: Median blur +// +//--------------------------- + +template +static void run_medblur3x3_reference(T out[], const T *in[], int width, int chan) +{ + constexpr int ksize = 3; + constexpr int border = (ksize - 1) / 2; + + const int length = width * chan; + const int shift = border * chan; + + for (int l=0; l < length; l++) + { + T t[3][3]; + + // neighbourhood 3x3 + t[0][0] = in[0][l - shift]; t[0][1] = in[0][l]; t[0][2] = in[0][l + shift]; + t[1][0] = in[1][l - shift]; t[1][1] = in[1][l]; t[1][2] = in[1][l + shift]; + t[2][0] = in[2][l - shift]; t[2][1] = in[2][l]; t[2][2] = in[2][l + shift]; + + // sort 2 values + auto sort = [](T& a, T& b) + { + T u=a, v=b; + a = (std::min)(u, v); + b = (std::max)(u, v); + }; + + // horizontal: 3-elements bubble-sort per each row + sort(t[0][0], t[0][1]); sort(t[0][1], t[0][2]); sort(t[0][0], t[0][1]); + sort(t[1][0], t[1][1]); sort(t[1][1], t[1][2]); sort(t[1][0], t[1][1]); + sort(t[2][0], t[2][1]); sort(t[2][1], t[2][2]); sort(t[2][0], t[2][1]); + + // vertical: columns bubble-sort (although partial) + sort(t[0][0], t[1][0]); sort(t[0][1], t[1][1]); /*sort(t[0][2], t[1][2]);*/ + sort(t[1][0], t[2][0]); sort(t[1][1], t[2][1]); sort(t[1][2], t[2][2]); + /*sort(t[0][0], t[1][0]);*/ sort(t[0][1], t[1][1]); sort(t[0][2], t[1][2]); + + // diagonal: bubble-sort (in opposite order!) + sort(t[1][1], t[0][2]); sort(t[2][0], t[1][1]); sort(t[1][1], t[0][2]); + + out[l] = t[1][1]; + } +} + +#if CV_SIMD +template +static void run_medblur3x3_simd(T out[], const T *in[], int width, int chan) +{ + constexpr int ksize = 3; + constexpr int border = (ksize - 1) / 2; + + const int length = width * chan; + const int shift = border * chan; + + for (int l=0; l < length;) + { + constexpr int nlanes = VT::nlanes; + + // main part of output row + for (; l <= length - nlanes; l += nlanes) + { + VT t00, t01, t02, t10, t11, t12, t20, t21, t22; + + // neighbourhood 3x3 + + t00 = vx_load(&in[0][l - shift]); + t01 = vx_load(&in[0][l ]); + t02 = vx_load(&in[0][l + shift]); + + t10 = vx_load(&in[1][l - shift]); + t11 = vx_load(&in[1][l ]); + t12 = vx_load(&in[1][l + shift]); + + t20 = vx_load(&in[2][l - shift]); + t21 = vx_load(&in[2][l ]); + t22 = vx_load(&in[2][l + shift]); + + // sort 2 values + auto sort = [](VT& a, VT& b) + { + VT u=a, v=b; + a = v_min(u, v); + b = v_max(u, v); + }; + + // horizontal: 3-elements bubble-sort per each row + sort(t00, t01); sort(t01, t02); sort(t00, t01); + sort(t10, t11); sort(t11, t12); sort(t10, t11); + sort(t20, t21); sort(t21, t22); sort(t20, t21); + + // vertical: columns bubble-sort (although partial) + sort(t00, t10); sort(t01, t11); /*sort(t02, t12);*/ + sort(t10, t20); sort(t11, t21); sort(t12, t22); + /*sort(t00, t10);*/ sort(t01, t11); sort(t02, t12); + + // diagonal: bubble-sort (in opposite order!) 
+ sort(t11, t02); sort(t20, t11); sort(t11, t02); + + v_store(&out[l], t11); + } + + // tail (if any) + if (l < length) + { + GAPI_DbgAssert(length >= nlanes); + l = length - nlanes; + } + } +} +#endif + +template +static void run_medblur3x3_code(T out[], const T *in[], int width, int chan) +{ +#if CV_SIMD + int length = width * chan; + + // length variable may be unused if types do not match at 'if' statements below + (void) length; + + if (std::is_same::value && length >= v_float32::nlanes) + { + run_medblur3x3_simd(reinterpret_cast(out), + reinterpret_cast(in), + width, chan); + return; + } + + if (std::is_same::value && length >= v_int16::nlanes) + { + run_medblur3x3_simd(reinterpret_cast(out), + reinterpret_cast(in), + width, chan); + return; + } + + if (std::is_same::value && length >= v_uint16::nlanes) + { + run_medblur3x3_simd(reinterpret_cast(out), + reinterpret_cast(in), + width, chan); + return; + } + + if (std::is_same::value && length >= v_uint8::nlanes) + { + run_medblur3x3_simd(reinterpret_cast(out), + reinterpret_cast(in), + width, chan); + return; + } +#endif + + run_medblur3x3_reference(out, in, width, chan); +} + +#define RUN_MEDBLUR3X3_IMPL(T) \ +void run_medblur3x3_impl(T out[], const T *in[], int width, int chan) \ +{ \ + run_medblur3x3_code(out, in, width, chan); \ +} + +RUN_MEDBLUR3X3_IMPL(uchar ) +RUN_MEDBLUR3X3_IMPL(ushort) +RUN_MEDBLUR3X3_IMPL( short) +RUN_MEDBLUR3X3_IMPL( float) + +#undef RUN_MEDBLUR3X3_IMPL + +//------------------------------------------------------------------------------ #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidutils.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidutils.hpp index a38b2f1..0a54f4e 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidutils.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidutils.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef GFLUIDUTILS_HPP @@ -10,7 +10,7 @@ #include #include -#include //UNUSED +#include //suppress_unused_warning #include namespace cv { diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpubackend.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpubackend.cpp index eda6a5f..e2f4cd4 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpubackend.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpubackend.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpubackend.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpubackend.hpp index 1fb128d..5ba2ff6 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpubackend.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpubackend.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GGPUBACKEND_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpucore.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpucore.cpp index a1ee6a1..60367fe 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpucore.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpucore.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpucore.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpucore.hpp index 47cbfa6..c38d7f1 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpucore.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpucore.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GGPUCORE_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpuimgproc.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpuimgproc.cpp index 9b7aca1..c90257b 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpuimgproc.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpuimgproc.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpuimgproc.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpuimgproc.hpp index cd2e324..29bd3fc 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpuimgproc.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpuimgproc.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GGPUIMGPROC_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpukernel.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpukernel.cpp index 87e2aa9..36f96de 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpukernel.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpukernel.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclbackend.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclbackend.cpp new file mode 100644 index 0000000..7fec9d1 --- /dev/null +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclbackend.cpp @@ -0,0 +1,226 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2018-2019 Intel Corporation + + +#include "precomp.hpp" + +#include <functional> +#include <unordered_set> + +#include <ade/util/algorithm.hpp> + +#include <ade/util/range.hpp> +#include <ade/util/zip_range.hpp> +#include <ade/util/chain_range.hpp> + +#include <ade/typed_graph.hpp> + +#include "opencv2/gapi/gcommon.hpp" +#include "opencv2/gapi/util/any.hpp" +#include "opencv2/gapi/gtype_traits.hpp" + +#include "compiler/gobjref.hpp" +#include "compiler/gmodel.hpp" + +#include "backends/ocl/goclbackend.hpp" +#include "backends/ocl/goclimgproc.hpp" +#include "backends/ocl/goclcore.hpp" + +#include "api/gbackend_priv.hpp" // FIXME: Make it part of Backend SDK! + +// FIXME: Is there a way to take a typed graph (our GModel), +// and create a new typed graph _ATOP_ of that (by extending with a couple of +// new types?). +// Alternatively, is there a way to compose typed graphs? +// +// If not, we need to introduce that! +using GOCLModel = ade::TypedGraph + < cv::gimpl::Unit + , cv::gimpl::Protocol + >; + +// FIXME: Same issue with Typed and ConstTyped +using GConstGOCLModel = ade::ConstTypedGraph + < cv::gimpl::Unit + , cv::gimpl::Protocol + >; + +namespace +{ + class GOCLBackendImpl final: public cv::gapi::GBackend::Priv + { + virtual void unpackKernel(ade::Graph &graph, + const ade::NodeHandle &op_node, + const cv::GKernelImpl &impl) override + { + GOCLModel gm(graph); + auto ocl_impl = cv::util::any_cast<cv::GOCLKernel>(impl.opaque); + gm.metadata(op_node).set(cv::gimpl::Unit{ocl_impl}); + } + + virtual EPtr compile(const ade::Graph &graph, + const cv::GCompileArgs &, + const std::vector<ade::NodeHandle> &nodes) const override + { + return EPtr{new cv::gimpl::GOCLExecutable(graph, nodes)}; + } + }; +} + +cv::gapi::GBackend cv::gapi::ocl::backend() +{ + static cv::gapi::GBackend this_backend(std::make_shared<GOCLBackendImpl>()); + return this_backend; +} + +// GOCLExecutable implementation ////////////////////////////////////////////// +cv::gimpl::GOCLExecutable::GOCLExecutable(const ade::Graph &g, + const std::vector<ade::NodeHandle> &nodes) + : m_g(g), m_gm(m_g) +{ + // Convert list of operations (which is topologically sorted already) + // into an execution script.
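// [Editorial note, not part of the patch] The "script" assembled in the loop
// below is a flat std::vector of {node handle, expected output metas} pairs,
// kept in the topological order the compiler already established; run()
// later replays it linearly, so no scheduling happens per frame. DATA nodes
// are not executed, but they are scanned here for two side effects: binding
// compile-time CONST inputs into the magazine, and preallocating UMat
// buffers for INTERNAL GMAT objects whose metadata is already known.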
+ for (auto &nh : nodes) + { + switch (m_gm.metadata(nh).get<NodeType>().t) + { + case NodeType::OP: m_script.push_back({nh, GModel::collectOutputMeta(m_gm, nh)}); break; + case NodeType::DATA: + { + m_dataNodes.push_back(nh); + const auto &desc = m_gm.metadata(nh).get<Data>(); + if (desc.storage == Data::Storage::CONST) + { + auto rc = RcDesc{desc.rc, desc.shape, desc.ctor}; + magazine::bindInArg(m_res, rc, m_gm.metadata(nh).get<ConstValue>().arg); + } + //preallocate internal Mats in advance + if (desc.storage == Data::Storage::INTERNAL && desc.shape == GShape::GMAT) + { + const auto mat_desc = util::get<cv::GMatDesc>(desc.meta); + const auto type = CV_MAKETYPE(mat_desc.depth, mat_desc.chan); + m_res.slot<cv::UMat>()[desc.rc].create(mat_desc.size.height, mat_desc.size.width, type); + } + break; + } + default: util::throw_error(std::logic_error("Unsupported NodeType type")); + } + } +} + +// FIXME: Document what it does +cv::GArg cv::gimpl::GOCLExecutable::packArg(const GArg &arg) +{ + // No API placeholders allowed at this point + // FIXME: this check has to be done somewhere in compilation stage. + GAPI_Assert( arg.kind != cv::detail::ArgKind::GMAT + && arg.kind != cv::detail::ArgKind::GSCALAR + && arg.kind != cv::detail::ArgKind::GARRAY); + + if (arg.kind != cv::detail::ArgKind::GOBJREF) + { + // All other cases - pass as-is, with no transformations to GArg contents. + return arg; + } + GAPI_Assert(arg.kind == cv::detail::ArgKind::GOBJREF); + + // Wrap associated CPU object (either host or an internal one) + // FIXME: object can be moved out!!! GExecutor faced that. + const cv::gimpl::RcDesc &ref = arg.get<cv::gimpl::RcDesc>(); + switch (ref.shape) + { + case GShape::GMAT: return GArg(m_res.slot<cv::UMat>()[ref.id]); + case GShape::GSCALAR: return GArg(m_res.slot<cv::gapi::own::Scalar>()[ref.id]); + // Note: .at() is intentional for GArray as object MUST be already there + // (and constructed by either bindIn/Out or resetInternal) + case GShape::GARRAY: return GArg(m_res.slot<cv::detail::VectorRef>().at(ref.id)); + default: + util::throw_error(std::logic_error("Unsupported GShape type")); + break; + } +} + +void cv::gimpl::GOCLExecutable::run(std::vector<InObj> &&input_objs, + std::vector<OutObj> &&output_objs) +{ + // Update resources with run-time information - what this Island + // has received from user (or from another Island, or mix...) + // FIXME: Check input/output objects against GIsland protocol + + for (auto& it : input_objs) magazine::bindInArg (m_res, it.first, it.second, true); + for (auto& it : output_objs) magazine::bindOutArg(m_res, it.first, it.second, true); + + // Initialize (reset) internal data nodes with user structures + // before processing a frame (no need to do it for external data structures) + GModel::ConstGraph gm(m_g); + for (auto nh : m_dataNodes) + { + const auto &desc = gm.metadata(nh).get<Data>(); + + if ( desc.storage == Data::Storage::INTERNAL + && !util::holds_alternative<cv::util::monostate>(desc.ctor)) + { + // FIXME: Note that compile-time constant data objects (like + // a value-initialized GArray) also satisfy this condition + // and should be excluded, but now we just don't support it + magazine::resetInternalData(m_res, desc); + } + } + + // OpenCV backend execution is not rocket science at all. + // Simply invoke our kernels in the proper order. + GConstGOCLModel gcm(m_g); + for (auto &op_info : m_script) + { + const auto &op = m_gm.metadata(op_info.nh).get<Op>(); + + // Obtain our real execution unit + // TODO: Should kernels be copyable?
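// [Editorial note, not part of the patch] Each step below executes one op:
// the GOCLKernel is copied by value out of the graph metadata, a GOCLContext
// is filled with inputs (packArg() resolves every GOBJREF to the live
// UMat/Scalar/VectorRef in the magazine) and with raw pointers to the output
// slots, the kernel runs, and the produced metadata is verified against what
// the compiler predicted, turning silent shape mismatches into hard errors.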
+ GOCLKernel k = gcm.metadata(op_info.nh).get().k; + + // Initialize kernel's execution context: + // - Input parameters + GOCLContext context; + context.m_args.reserve(op.args.size()); + + using namespace std::placeholders; + ade::util::transform(op.args, + std::back_inserter(context.m_args), + std::bind(&GOCLExecutable::packArg, this, _1)); + + // - Output parameters. + // FIXME: pre-allocate internal Mats, etc, according to the known meta + for (const auto &out_it : ade::util::indexed(op.outs)) + { + // FIXME: Can the same GArg type resolution mechanism be reused here? + const auto out_port = ade::util::index(out_it); + const auto out_desc = ade::util::value(out_it); + context.m_results[out_port] = magazine::getObjPtr(m_res, out_desc, true); + } + + // Now trigger the executable unit + k.apply(context); + + for (const auto &out_it : ade::util::indexed(op_info.expected_out_metas)) + { + const auto out_index = ade::util::index(out_it); + const auto expected_meta = ade::util::value(out_it); + const auto out_meta = descr_of(context.m_results[out_index]); + + if (expected_meta != out_meta) + { + util::throw_error + (std::logic_error + ("Output meta doesn't " + "coincide with the generated meta\n" + "Expected: " + ade::util::to_string(expected_meta) + "\n" + "Actual : " + ade::util::to_string(out_meta))); + } + } + } // for(m_script) + + for (auto &it : output_objs) magazine::writeBack(m_res, it.first, it.second, true); +} diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclbackend.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclbackend.hpp new file mode 100644 index 0000000..a86f3e6 --- /dev/null +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclbackend.hpp @@ -0,0 +1,72 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2018-2019 Intel Corporation + + +#ifndef OPENCV_GAPI_GOCLBACKEND_HPP +#define OPENCV_GAPI_GOCLBACKEND_HPP + +#include // map +#include // unordered_map +#include // tuple +#include // type_list_index + +#include "opencv2/gapi/garg.hpp" +#include "opencv2/gapi/gproto.hpp" +#include "opencv2/gapi/ocl/goclkernel.hpp" + + +#include "api/gapi_priv.hpp" +#include "backends/common/gbackend.hpp" +#include "compiler/gislandmodel.hpp" + +namespace cv { namespace gimpl { + +struct Unit +{ + static const char *name() { return "OCLKernel"; } + GOCLKernel k; +}; + +class GOCLExecutable final: public GIslandExecutable +{ + const ade::Graph &m_g; + GModel::ConstGraph m_gm; + + struct OperationInfo + { + ade::NodeHandle nh; + GMetaArgs expected_out_metas; + }; + + // Execution script, currently absolutely naive + std::vector m_script; + // List of all resources in graph (both internal and external) + std::vector m_dataNodes; + + // Actual data of all resources in graph (both internal and external) + Mag m_res; + GArg packArg(const GArg &arg); + +public: + GOCLExecutable(const ade::Graph &graph, + const std::vector &nodes); + + virtual inline bool canReshape() const override { return false; } + virtual inline void reshape(ade::Graph&, const GCompileArgs&) override + { + // FIXME: OCL plugin is in fact reshapeable (as it was initially, + // even before outMeta() has been introduced), so this limitation + // should be dropped. 
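// [Editorial note, not part of the patch] canReshape() returning false above
// means the executor is expected to fall back to full recompilation when
// input metadata changes, rather than calling reshape(); the throw below is
// therefore a guard against misuse, not a path taken in a healthy pipeline.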
+ util::throw_error(std::logic_error("GOCLExecutable::reshape() should never be called")); + } + + virtual void run(std::vector &&input_objs, + std::vector &&output_objs) override; +}; + +}} + +#endif // OPENCV_GAPI_GOCLBACKEND_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclcore.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclcore.cpp new file mode 100644 index 0000000..ba80ef3 --- /dev/null +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclcore.cpp @@ -0,0 +1,582 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2018-2019 Intel Corporation + + +#include "precomp.hpp" + +#include "opencv2/gapi/core.hpp" +#include "opencv2/gapi/ocl/core.hpp" +#include "backends/ocl/goclcore.hpp" + +GAPI_OCL_KERNEL(GOCLAdd, cv::gapi::core::GAdd) +{ + static void run(const cv::UMat& a, const cv::UMat& b, int dtype, cv::UMat& out) + { + cv::add(a, b, out, cv::noArray(), dtype); + } +}; + +GAPI_OCL_KERNEL(GOCLAddC, cv::gapi::core::GAddC) +{ + static void run(const cv::UMat& a, const cv::Scalar& b, int dtype, cv::UMat& out) + { + cv::add(a, b, out, cv::noArray(), dtype); + } +}; + +GAPI_OCL_KERNEL(GOCLSub, cv::gapi::core::GSub) +{ + static void run(const cv::UMat& a, const cv::UMat& b, int dtype, cv::UMat& out) + { + cv::subtract(a, b, out, cv::noArray(), dtype); + } +}; + +GAPI_OCL_KERNEL(GOCLSubC, cv::gapi::core::GSubC) +{ + static void run(const cv::UMat& a, const cv::Scalar& b, int dtype, cv::UMat& out) + { + cv::subtract(a, b, out, cv::noArray(), dtype); + } +}; + +GAPI_OCL_KERNEL(GOCLSubRC, cv::gapi::core::GSubRC) +{ + static void run(const cv::Scalar& a, const cv::UMat& b, int dtype, cv::UMat& out) + { + cv::subtract(a, b, out, cv::noArray(), dtype); + } +}; + +GAPI_OCL_KERNEL(GOCLMul, cv::gapi::core::GMul) +{ + static void run(const cv::UMat& a, const cv::UMat& b, double scale, int dtype, cv::UMat& out) + { + cv::multiply(a, b, out, scale, dtype); + } +}; + +GAPI_OCL_KERNEL(GOCLMulCOld, cv::gapi::core::GMulCOld) +{ + static void run(const cv::UMat& a, double b, int dtype, cv::UMat& out) + { + cv::multiply(a, b, out, 1, dtype); + } +}; + +GAPI_OCL_KERNEL(GOCLMulC, cv::gapi::core::GMulC) +{ + static void run(const cv::UMat& a, const cv::Scalar& b, int dtype, cv::UMat& out) + { + cv::multiply(a, b, out, 1, dtype); + } +}; + +GAPI_OCL_KERNEL(GOCLDiv, cv::gapi::core::GDiv) +{ + static void run(const cv::UMat& a, const cv::UMat& b, double scale, int dtype, cv::UMat& out) + { + cv::divide(a, b, out, scale, dtype); + } +}; + +GAPI_OCL_KERNEL(GOCLDivC, cv::gapi::core::GDivC) +{ + static void run(const cv::UMat& a, const cv::Scalar& b, double scale, int dtype, cv::UMat& out) + { + cv::divide(a, b, out, scale, dtype); + } +}; + +GAPI_OCL_KERNEL(GOCLDivRC, cv::gapi::core::GDivRC) +{ + static void run(const cv::Scalar& a, const cv::UMat& b, double scale, int dtype, cv::UMat& out) + { + cv::divide(a, b, out, scale, dtype); + } +}; + +GAPI_OCL_KERNEL(GOCLMask, cv::gapi::core::GMask) +{ + static void run(const cv::UMat& in, const cv::UMat& mask, cv::UMat& out) + { + out = cv::UMat::zeros(in.size(), in.type()); + in.copyTo(out, mask); + } +}; + + +GAPI_OCL_KERNEL(GOCLMean, cv::gapi::core::GMean) +{ + static void run(const cv::UMat& in, cv::Scalar& out) + { + out = cv::mean(in); + } +}; + +GAPI_OCL_KERNEL(GOCLPolarToCart, cv::gapi::core::GPolarToCart) +{ + static 
void run(const cv::UMat& magn, const cv::UMat& angle, bool angleInDegrees, cv::UMat& outx, cv::UMat& outy) + { + cv::polarToCart(magn, angle, outx, outy, angleInDegrees); + } +}; + +GAPI_OCL_KERNEL(GOCLCartToPolar, cv::gapi::core::GCartToPolar) +{ + static void run(const cv::UMat& x, const cv::UMat& y, bool angleInDegrees, cv::UMat& outmagn, cv::UMat& outangle) + { + cv::cartToPolar(x, y, outmagn, outangle, angleInDegrees); + } +}; + +GAPI_OCL_KERNEL(GOCLCmpGT, cv::gapi::core::GCmpGT) +{ + static void run(const cv::UMat& a, const cv::UMat& b, cv::UMat& out) + { + cv::compare(a, b, out, cv::CMP_GT); + } +}; + +GAPI_OCL_KERNEL(GOCLCmpGE, cv::gapi::core::GCmpGE) +{ + static void run(const cv::UMat& a, const cv::UMat& b, cv::UMat& out) + { + cv::compare(a, b, out, cv::CMP_GE); + } +}; + +GAPI_OCL_KERNEL(GOCLCmpLE, cv::gapi::core::GCmpLE) +{ + static void run(const cv::UMat& a, const cv::UMat& b, cv::UMat& out) + { + cv::compare(a, b, out, cv::CMP_LE); + } +}; + +GAPI_OCL_KERNEL(GOCLCmpLT, cv::gapi::core::GCmpLT) +{ + static void run(const cv::UMat& a, const cv::UMat& b, cv::UMat& out) + { + cv::compare(a, b, out, cv::CMP_LT); + } +}; + +GAPI_OCL_KERNEL(GOCLCmpEQ, cv::gapi::core::GCmpEQ) +{ + static void run(const cv::UMat& a, const cv::UMat& b, cv::UMat& out) + { + cv::compare(a, b, out, cv::CMP_EQ); + } +}; + +GAPI_OCL_KERNEL(GOCLCmpNE, cv::gapi::core::GCmpNE) +{ + static void run(const cv::UMat& a, const cv::UMat& b, cv::UMat& out) + { + cv::compare(a, b, out, cv::CMP_NE); + } +}; + +GAPI_OCL_KERNEL(GOCLCmpGTScalar, cv::gapi::core::GCmpGTScalar) +{ + static void run(const cv::UMat& a, const cv::Scalar& b, cv::UMat& out) + { + cv::compare(a, b, out, cv::CMP_GT); + } +}; + +GAPI_OCL_KERNEL(GOCLCmpGEScalar, cv::gapi::core::GCmpGEScalar) +{ + static void run(const cv::UMat& a, const cv::Scalar& b, cv::UMat& out) + { + cv::compare(a, b, out, cv::CMP_GE); + } +}; + +GAPI_OCL_KERNEL(GOCLCmpLEScalar, cv::gapi::core::GCmpLEScalar) +{ + static void run(const cv::UMat& a, const cv::Scalar& b, cv::UMat& out) + { + cv::compare(a, b, out, cv::CMP_LE); + } +}; + +GAPI_OCL_KERNEL(GOCLCmpLTScalar, cv::gapi::core::GCmpLTScalar) +{ + static void run(const cv::UMat& a, const cv::Scalar& b, cv::UMat& out) + { + cv::compare(a, b, out, cv::CMP_LT); + } +}; + +GAPI_OCL_KERNEL(GOCLCmpEQScalar, cv::gapi::core::GCmpEQScalar) +{ + static void run(const cv::UMat& a, const cv::Scalar& b, cv::UMat& out) + { + cv::compare(a, b, out, cv::CMP_EQ); + } +}; + +GAPI_OCL_KERNEL(GOCLCmpNEScalar, cv::gapi::core::GCmpNEScalar) +{ + static void run(const cv::UMat& a, const cv::Scalar& b, cv::UMat& out) + { + cv::compare(a, b, out, cv::CMP_NE); + } +}; + +GAPI_OCL_KERNEL(GOCLAnd, cv::gapi::core::GAnd) +{ + static void run(const cv::UMat& a, const cv::UMat& b, cv::UMat& out) + { + cv::bitwise_and(a, b, out); + } +}; + +GAPI_OCL_KERNEL(GOCLAndS, cv::gapi::core::GAndS) +{ + static void run(const cv::UMat& a, const cv::Scalar& b, cv::UMat& out) + { + cv::bitwise_and(a, b, out); + } +}; + +GAPI_OCL_KERNEL(GOCLOr, cv::gapi::core::GOr) +{ + static void run(const cv::UMat& a, const cv::UMat& b, cv::UMat& out) + { + cv::bitwise_or(a, b, out); + } +}; + +GAPI_OCL_KERNEL(GOCLOrS, cv::gapi::core::GOrS) +{ + static void run(const cv::UMat& a, const cv::Scalar& b, cv::UMat& out) + { + cv::bitwise_or(a, b, out); + } +}; + +GAPI_OCL_KERNEL(GOCLXor, cv::gapi::core::GXor) +{ + static void run(const cv::UMat& a, const cv::UMat& b, cv::UMat& out) + { + cv::bitwise_xor(a, b, out); + } +}; + +GAPI_OCL_KERNEL(GOCLXorS, cv::gapi::core::GXorS) +{ + static 
void run(const cv::UMat& a, const cv::Scalar& b, cv::UMat& out) + { + cv::bitwise_xor(a, b, out); + } +}; + +GAPI_OCL_KERNEL(GOCLNot, cv::gapi::core::GNot) +{ + static void run(const cv::UMat& a, cv::UMat& out) + { + cv::bitwise_not(a, out); + } +}; + +GAPI_OCL_KERNEL(GOCLSelect, cv::gapi::core::GSelect) +{ + static void run(const cv::UMat& src1, const cv::UMat& src2, const cv::UMat& mask, cv::UMat& out) + { + src2.copyTo(out); + src1.copyTo(out, mask); + } +}; + +////TODO: doesn't compile with UMat +//GAPI_OCL_KERNEL(GOCLMin, cv::gapi::core::GMin) +//{ +// static void run(const cv::UMat& in1, const cv::UMat& in2, cv::UMat& out) +// { +// out = cv::min(in1, in2); +// } +//}; +// +////TODO: doesn't compile with UMat +//GAPI_OCL_KERNEL(GOCLMax, cv::gapi::core::GMax) +//{ +// static void run(const cv::UMat& in1, const cv::UMat& in2, cv::UMat& out) +// { +// out = cv::max(in1, in2); +// } +//}; + + +GAPI_OCL_KERNEL(GOCLAbsDiff, cv::gapi::core::GAbsDiff) +{ + static void run(const cv::UMat& in1, const cv::UMat& in2, cv::UMat& out) + { + cv::absdiff(in1, in2, out); + } +}; + +GAPI_OCL_KERNEL(GOCLAbsDiffC, cv::gapi::core::GAbsDiffC) +{ + static void run(const cv::UMat& in1, const cv::Scalar& in2, cv::UMat& out) + { + cv::absdiff(in1, in2, out); + } +}; + +GAPI_OCL_KERNEL(GOCLSum, cv::gapi::core::GSum) +{ + static void run(const cv::UMat& in, cv::Scalar& out) + { + out = cv::sum(in); + } +}; + +GAPI_OCL_KERNEL(GOCLAddW, cv::gapi::core::GAddW) +{ + static void run(const cv::UMat& in1, double alpha, const cv::UMat& in2, double beta, double gamma, int dtype, cv::UMat& out) + { + cv::addWeighted(in1, alpha, in2, beta, gamma, out, dtype); + } +}; + + +GAPI_OCL_KERNEL(GOCLNormL1, cv::gapi::core::GNormL1) +{ + static void run(const cv::UMat& in, cv::Scalar& out) + { + out = cv::norm(in, cv::NORM_L1); + } +}; + +GAPI_OCL_KERNEL(GOCLNormL2, cv::gapi::core::GNormL2) +{ + static void run(const cv::UMat& in, cv::Scalar& out) + { + out = cv::norm(in, cv::NORM_L2); + } +}; + +GAPI_OCL_KERNEL(GOCLNormInf, cv::gapi::core::GNormInf) +{ + static void run(const cv::UMat& in, cv::Scalar& out) + { + out = cv::norm(in, cv::NORM_INF); + } +}; + +GAPI_OCL_KERNEL(GOCLIntegral, cv::gapi::core::GIntegral) +{ + static void run(const cv::UMat& in, int sdepth, int sqdepth, cv::UMat& out, cv::UMat& outSq) + { + cv::integral(in, out, outSq, sdepth, sqdepth); + } +}; + +GAPI_OCL_KERNEL(GOCLThreshold, cv::gapi::core::GThreshold) +{ + static void run(const cv::UMat& in, const cv::Scalar& a, const cv::Scalar& b, int type, cv::UMat& out) + { + cv::threshold(in, out, a.val[0], b.val[0], type); + } +}; + +GAPI_OCL_KERNEL(GOCLThresholdOT, cv::gapi::core::GThresholdOT) +{ + static void run(const cv::UMat& in, const cv::Scalar& b, int type, cv::UMat& out, cv::Scalar& outScalar) + { + outScalar = cv::threshold(in, out, b.val[0], b.val[0], type); + } +}; + + +GAPI_OCL_KERNEL(GOCLInRange, cv::gapi::core::GInRange) +{ + static void run(const cv::UMat& in, const cv::Scalar& low, const cv::Scalar& up, cv::UMat& out) + { + cv::inRange(in, low, up, out); + } +}; + +GAPI_OCL_KERNEL(GOCLSplit3, cv::gapi::core::GSplit3) +{ + static void run(const cv::UMat& in, cv::UMat &m1, cv::UMat &m2, cv::UMat &m3) + { + std::vector<cv::UMat> outMats = {m1, m2, m3}; + cv::split(in, outMats); + + // Write back FIXME: Write a helper or avoid this nonsense completely!
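// [Editorial note, not part of the patch] cv::split() may reallocate the
// vector elements if their size or type do not match the input, so m1..m3
// captured into outMats above can end up pointing at stale buffers. The
// assignments below copy the UMat headers back to the output parameters;
// this is a cheap reference-counted header copy, no pixel data is moved,
// but it is what actually publishes the results to the caller.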
+ m1 = outMats[0]; + m2 = outMats[1]; + m3 = outMats[2]; + } +}; + +GAPI_OCL_KERNEL(GOCLSplit4, cv::gapi::core::GSplit4) +{ + static void run(const cv::UMat& in, cv::UMat &m1, cv::UMat &m2, cv::UMat &m3, cv::UMat &m4) + { + std::vector<cv::UMat> outMats = {m1, m2, m3, m4}; + cv::split(in, outMats); + + // Write back FIXME: Write a helper or avoid this nonsense completely! + m1 = outMats[0]; + m2 = outMats[1]; + m3 = outMats[2]; + m4 = outMats[3]; + } +}; + +GAPI_OCL_KERNEL(GOCLMerge3, cv::gapi::core::GMerge3) +{ + static void run(const cv::UMat& in1, const cv::UMat& in2, const cv::UMat& in3, cv::UMat &out) + { + std::vector<cv::UMat> inMats = {in1, in2, in3}; + cv::merge(inMats, out); + } +}; + +GAPI_OCL_KERNEL(GOCLMerge4, cv::gapi::core::GMerge4) +{ + static void run(const cv::UMat& in1, const cv::UMat& in2, const cv::UMat& in3, const cv::UMat& in4, cv::UMat &out) + { + std::vector<cv::UMat> inMats = {in1, in2, in3, in4}; + cv::merge(inMats, out); + } +}; + +GAPI_OCL_KERNEL(GOCLResize, cv::gapi::core::GResize) +{ + static void run(const cv::UMat& in, cv::Size sz, double fx, double fy, int interp, cv::UMat &out) + { + cv::resize(in, out, sz, fx, fy, interp); + } +}; + +GAPI_OCL_KERNEL(GOCLRemap, cv::gapi::core::GRemap) +{ + static void run(const cv::UMat& in, const cv::Mat& x, const cv::Mat& y, int a, int b, cv::Scalar s, cv::UMat& out) + { + cv::remap(in, out, x, y, a, b, s); + } +}; + +GAPI_OCL_KERNEL(GOCLFlip, cv::gapi::core::GFlip) +{ + static void run(const cv::UMat& in, int code, cv::UMat& out) + { + cv::flip(in, out, code); + } +}; + +GAPI_OCL_KERNEL(GOCLCrop, cv::gapi::core::GCrop) +{ + static void run(const cv::UMat& in, cv::Rect rect, cv::UMat& out) + { + cv::UMat(in, rect).copyTo(out); + } +}; + +GAPI_OCL_KERNEL(GOCLConcatHor, cv::gapi::core::GConcatHor) +{ + static void run(const cv::UMat& in1, const cv::UMat& in2, cv::UMat& out) + { + cv::hconcat(in1, in2, out); + } +}; + +GAPI_OCL_KERNEL(GOCLConcatVert, cv::gapi::core::GConcatVert) +{ + static void run(const cv::UMat& in1, const cv::UMat& in2, cv::UMat& out) + { + cv::vconcat(in1, in2, out); + } +}; + +GAPI_OCL_KERNEL(GOCLLUT, cv::gapi::core::GLUT) +{ + static void run(const cv::UMat& in, const cv::Mat& lut, cv::UMat& out) + { + cv::LUT(in, lut, out); + } +}; + +GAPI_OCL_KERNEL(GOCLConvertTo, cv::gapi::core::GConvertTo) +{ + static void run(const cv::UMat& in, int rtype, double alpha, double beta, cv::UMat& out) + { + in.convertTo(out, rtype, alpha, beta); + } +}; + +cv::gapi::GKernelPackage cv::gapi::core::ocl::kernels() +{ + static auto pkg = cv::gapi::kernels + < GOCLAdd + , GOCLAddC + , GOCLSub + , GOCLSubC + , GOCLSubRC + , GOCLMul + , GOCLMulC + , GOCLMulCOld + , GOCLDiv + , GOCLDivC + , GOCLDivRC + , GOCLMean + , GOCLMask + , GOCLPolarToCart + , GOCLCartToPolar + , GOCLCmpGT + , GOCLCmpGE + , GOCLCmpLE + , GOCLCmpLT + , GOCLCmpEQ + , GOCLCmpNE + , GOCLCmpGTScalar + , GOCLCmpGEScalar + , GOCLCmpLEScalar + , GOCLCmpLTScalar + , GOCLCmpEQScalar + , GOCLCmpNEScalar + , GOCLAnd + , GOCLAndS + , GOCLOr + , GOCLOrS + , GOCLXor + , GOCLXorS + , GOCLNot + , GOCLSelect + //, GOCLMin + //, GOCLMax + , GOCLAbsDiff + , GOCLAbsDiffC + , GOCLSum + , GOCLAddW + , GOCLNormL1 + , GOCLNormL2 + , GOCLNormInf + , GOCLIntegral + , GOCLThreshold + , GOCLThresholdOT + , GOCLInRange + , GOCLSplit3 + , GOCLSplit4 + , GOCLResize + , GOCLMerge3 + , GOCLMerge4 + , GOCLRemap + , GOCLFlip + , GOCLCrop + , GOCLConcatHor + , GOCLConcatVert + , GOCLLUT + , GOCLConvertTo + >(); + return pkg; +} diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclcore.hpp
b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclcore.hpp new file mode 100644 index 0000000..a36695b --- /dev/null +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclcore.hpp @@ -0,0 +1,24 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2018-2019 Intel Corporation + + +#ifndef OPENCV_GAPI_GOCLCORE_HPP +#define OPENCV_GAPI_GOCLCORE_HPP + +#include +#include + +#include "opencv2/gapi/ocl/goclkernel.hpp" + +namespace cv { namespace gimpl { + +// NB: This is what a "Kernel Package" from the original Wiki doc should be. +void loadOCLCore(std::map &kmap); + +} +} + +#endif // OPENCV_GAPI_GOCLCORE_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclimgproc.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclimgproc.cpp new file mode 100644 index 0000000..860ebf4 --- /dev/null +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclimgproc.cpp @@ -0,0 +1,277 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2018-2019 Intel Corporation + + +#include "precomp.hpp" + +#include "opencv2/gapi/imgproc.hpp" +#include "opencv2/gapi/ocl/imgproc.hpp" +#include "backends/ocl/goclimgproc.hpp" + + +GAPI_OCL_KERNEL(GOCLSepFilter, cv::gapi::imgproc::GSepFilter) +{ + static void run(const cv::UMat& in, int ddepth, const cv::Mat& kernX, const cv::Mat& kernY, const cv::Point& anchor, const cv::Scalar& delta, + int border, const cv::Scalar& bordVal, cv::UMat &out) + { + if( border == cv::BORDER_CONSTANT ) + { + cv::UMat temp_in; + int width_add = (kernY.cols - 1) / 2; + int height_add = (kernX.rows - 1) / 2; + cv::copyMakeBorder(in, temp_in, height_add, height_add, width_add, width_add, border, bordVal); + cv::Rect rect = cv::Rect(height_add, width_add, in.cols, in.rows); + cv::sepFilter2D(temp_in(rect), out, ddepth, kernX, kernY, anchor, delta.val[0], border); + } + else + cv::sepFilter2D(in, out, ddepth, kernX, kernY, anchor, delta.val[0], border); + } +}; + +GAPI_OCL_KERNEL(GOCLBoxFilter, cv::gapi::imgproc::GBoxFilter) +{ + static void run(const cv::UMat& in, int ddepth, const cv::Size& ksize, const cv::Point& anchor, bool normalize, int borderType, const cv::Scalar& bordVal, cv::UMat &out) + { + if( borderType == cv::BORDER_CONSTANT ) + { + cv::UMat temp_in; + int width_add = (ksize.width - 1) / 2; + int height_add = (ksize.height - 1) / 2; + cv::copyMakeBorder(in, temp_in, height_add, height_add, width_add, width_add, borderType, bordVal); + cv::Rect rect = cv::Rect(height_add, width_add, in.cols, in.rows); + cv::boxFilter(temp_in(rect), out, ddepth, ksize, anchor, normalize, borderType); + } + else + cv::boxFilter(in, out, ddepth, ksize, anchor, normalize, borderType); + } +}; + +GAPI_OCL_KERNEL(GOCLBlur, cv::gapi::imgproc::GBlur) +{ + static void run(const cv::UMat& in, const cv::Size& ksize, const cv::Point& anchor, int borderType, const cv::Scalar& bordVal, cv::UMat &out) + { + if( borderType == cv::BORDER_CONSTANT ) + { + cv::UMat temp_in; + int width_add = (ksize.width - 1) / 2; + int height_add = (ksize.height - 1) / 2; + cv::copyMakeBorder(in, temp_in, height_add, height_add, width_add, width_add, borderType, bordVal); + cv::Rect rect = 
cv::Rect(height_add, width_add, in.cols, in.rows); + cv::blur(temp_in(rect), out, ksize, anchor, borderType); + } + else + cv::blur(in, out, ksize, anchor, borderType); + } +}; + + +GAPI_OCL_KERNEL(GOCLFilter2D, cv::gapi::imgproc::GFilter2D) +{ + static void run(const cv::UMat& in, int ddepth, const cv::Mat& k, const cv::Point& anchor, const cv::Scalar& delta, int border, + const cv::Scalar& bordVal, cv::UMat &out) + { + if( border == cv::BORDER_CONSTANT ) + { + cv::UMat temp_in; + int width_add = (k.cols - 1) / 2; + int height_add = (k.rows - 1) / 2; + cv::copyMakeBorder(in, temp_in, height_add, height_add, width_add, width_add, border, bordVal ); + cv::Rect rect = cv::Rect(height_add, width_add, in.cols, in.rows); + cv::filter2D(temp_in(rect), out, ddepth, k, anchor, delta.val[0], border); + } + else + cv::filter2D(in, out, ddepth, k, anchor, delta.val[0], border); + } +}; + +GAPI_OCL_KERNEL(GOCLGaussBlur, cv::gapi::imgproc::GGaussBlur) +{ + static void run(const cv::UMat& in, const cv::Size& ksize, double sigmaX, double sigmaY, int borderType, const cv::Scalar& bordVal, cv::UMat &out) + { + if( borderType == cv::BORDER_CONSTANT ) + { + cv::UMat temp_in; + int width_add = (ksize.width - 1) / 2; + int height_add = (ksize.height - 1) / 2; + cv::copyMakeBorder(in, temp_in, height_add, height_add, width_add, width_add, borderType, bordVal ); + cv::Rect rect = cv::Rect(height_add, width_add, in.cols, in.rows); + cv::GaussianBlur(temp_in(rect), out, ksize, sigmaX, sigmaY, borderType); + } + else + cv::GaussianBlur(in, out, ksize, sigmaX, sigmaY, borderType); + } +}; + +GAPI_OCL_KERNEL(GOCLMedianBlur, cv::gapi::imgproc::GMedianBlur) +{ + static void run(const cv::UMat& in, int ksize, cv::UMat &out) + { + cv::medianBlur(in, out, ksize); + } +}; + +GAPI_OCL_KERNEL(GOCLErode, cv::gapi::imgproc::GErode) +{ + static void run(const cv::UMat& in, const cv::Mat& kernel, const cv::Point& anchor, int iterations, int borderType, const cv::Scalar& borderValue, cv::UMat &out) + { + cv::erode(in, out, kernel, anchor, iterations, borderType, borderValue); + } +}; + +GAPI_OCL_KERNEL(GOCLDilate, cv::gapi::imgproc::GDilate) +{ + static void run(const cv::UMat& in, const cv::Mat& kernel, const cv::Point& anchor, int iterations, int borderType, const cv::Scalar& borderValue, cv::UMat &out) + { + cv::dilate(in, out, kernel, anchor, iterations, borderType, borderValue); + } +}; + +GAPI_OCL_KERNEL(GOCLSobel, cv::gapi::imgproc::GSobel) +{ + static void run(const cv::UMat& in, int ddepth, int dx, int dy, int ksize, double scale, double delta, int borderType, + const cv::Scalar& bordVal, cv::UMat &out) + { + if( borderType == cv::BORDER_CONSTANT ) + { + cv::UMat temp_in; + int add = (ksize - 1) / 2; + cv::copyMakeBorder(in, temp_in, add, add, add, add, borderType, bordVal ); + cv::Rect rect = cv::Rect(add, add, in.cols, in.rows); + cv::Sobel(temp_in(rect), out, ddepth, dx, dy, ksize, scale, delta, borderType); + } + else + cv::Sobel(in, out, ddepth, dx, dy, ksize, scale, delta, borderType); + } +}; + +GAPI_OCL_KERNEL(GOCLEqualizeHist, cv::gapi::imgproc::GEqHist) +{ + static void run(const cv::UMat& in, cv::UMat &out) + { + cv::equalizeHist(in, out); + } +}; + +GAPI_OCL_KERNEL(GOCLCanny, cv::gapi::imgproc::GCanny) +{ + static void run(const cv::UMat& in, double thr1, double thr2, int apSize, bool l2gradient, cv::UMat &out) + { + cv::Canny(in, out, thr1, thr2, apSize, l2gradient); + } +}; + +GAPI_OCL_KERNEL(GOCLRGB2YUV, cv::gapi::imgproc::GRGB2YUV) +{ + static void run(const cv::UMat& in, cv::UMat &out) + { + 
cv::cvtColor(in, out, cv::COLOR_RGB2YUV); + } +}; + +GAPI_OCL_KERNEL(GOCLYUV2RGB, cv::gapi::imgproc::GYUV2RGB) +{ + static void run(const cv::UMat& in, cv::UMat &out) + { + cv::cvtColor(in, out, cv::COLOR_YUV2RGB); + } +}; + +GAPI_OCL_KERNEL(GOCLRGB2Lab, cv::gapi::imgproc::GRGB2Lab) +{ + static void run(const cv::UMat& in, cv::UMat &out) + { + cv::cvtColor(in, out, cv::COLOR_RGB2Lab); + } +}; + +GAPI_OCL_KERNEL(GOCLBGR2LUV, cv::gapi::imgproc::GBGR2LUV) +{ + static void run(const cv::UMat& in, cv::UMat &out) + { + cv::cvtColor(in, out, cv::COLOR_BGR2Luv); + } +}; + +GAPI_OCL_KERNEL(GOCLBGR2YUV, cv::gapi::imgproc::GBGR2YUV) +{ + static void run(const cv::UMat& in, cv::UMat &out) + { + cv::cvtColor(in, out, cv::COLOR_BGR2YUV); + } +}; + +GAPI_OCL_KERNEL(GOCLLUV2BGR, cv::gapi::imgproc::GLUV2BGR) +{ + static void run(const cv::UMat& in, cv::UMat &out) + { + cv::cvtColor(in, out, cv::COLOR_Luv2BGR); + } +}; + +GAPI_OCL_KERNEL(GOCLYUV2BGR, cv::gapi::imgproc::GYUV2BGR) +{ + static void run(const cv::UMat& in, cv::UMat &out) + { + cv::cvtColor(in, out, cv::COLOR_YUV2BGR); + } +}; + +GAPI_OCL_KERNEL(GOCLRGB2Gray, cv::gapi::imgproc::GRGB2Gray) +{ + static void run(const cv::UMat& in, cv::UMat &out) + { + cv::cvtColor(in, out, cv::COLOR_RGB2GRAY); + } +}; + +GAPI_OCL_KERNEL(GOCLBGR2Gray, cv::gapi::imgproc::GBGR2Gray) +{ + static void run(const cv::UMat& in, cv::UMat &out) + { + cv::cvtColor(in, out, cv::COLOR_BGR2GRAY); + } +}; + +GAPI_OCL_KERNEL(GOCLRGB2GrayCustom, cv::gapi::imgproc::GRGB2GrayCustom) +{ + //TODO: avoid copy + static void run(const cv::UMat& in, float rY, float bY, float gY, cv::UMat &out) + { + cv::Mat planes[3]; + cv::split(in.getMat(cv::ACCESS_READ), planes); + cv::Mat tmp_out = (planes[0]*rY + planes[1]*bY + planes[2]*gY); + tmp_out.copyTo(out); + } +}; + + +cv::gapi::GKernelPackage cv::gapi::imgproc::ocl::kernels() +{ + static auto pkg = cv::gapi::kernels + < GOCLFilter2D + , GOCLSepFilter + , GOCLBoxFilter + , GOCLBlur + , GOCLGaussBlur + , GOCLMedianBlur + , GOCLErode + , GOCLDilate + , GOCLSobel + , GOCLCanny + , GOCLEqualizeHist + , GOCLRGB2YUV + , GOCLYUV2RGB + , GOCLRGB2Lab + , GOCLBGR2LUV + , GOCLBGR2YUV + , GOCLYUV2BGR + , GOCLLUV2BGR + , GOCLBGR2Gray + , GOCLRGB2Gray + , GOCLRGB2GrayCustom + >(); + return pkg; +} diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclimgproc.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclimgproc.hpp new file mode 100644 index 0000000..fc8bb9b --- /dev/null +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclimgproc.hpp @@ -0,0 +1,23 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2018-2019 Intel Corporation + + +#ifndef OPENCV_GAPI_GOCLIMGPROC_HPP +#define OPENCV_GAPI_GOCLIMGPROC_HPP + +#include <map> +#include <string> + +#include "opencv2/gapi/ocl/goclkernel.hpp" + +namespace cv { namespace gimpl { + +// NB: This is what a "Kernel Package" from the original Wiki doc should be.
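// [Editorial sketch, not part of the patch] How a package like the one
// assembled above is consumed from user code. The snippet uses the core OCL
// package for brevity; header paths follow the 2019 G-API layout shown in
// this patch and may differ in other revisions.

#include <opencv2/gapi.hpp>
#include <opencv2/gapi/core.hpp>
#include <opencv2/gapi/ocl/core.hpp>

cv::Mat add_on_ocl(const cv::Mat& a, const cv::Mat& b)
{
    cv::GMat in1, in2;
    cv::GMat sum = cv::gapi::add(in1, in2);
    cv::GComputation graph(cv::GIn(in1, in2), cv::GOut(sum));

    cv::Mat result;
    // Passing the package tells the graph compiler to bind GAdd to GOCLAdd,
    // so this graph executes through the UMat/OpenCL wrappers above.
    graph.apply(cv::gin(a, b), cv::gout(result),
                cv::compile_args(cv::gapi::core::ocl::kernels()));
    return result;
}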
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclimgproc.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclimgproc.hpp
new file mode 100644
index 0000000..fc8bb9b
--- /dev/null
+++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclimgproc.hpp
@@ -0,0 +1,23 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2019 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GOCLIMGPROC_HPP
+#define OPENCV_GAPI_GOCLIMGPROC_HPP
+
+#include <map>
+#include <string>
+
+#include "opencv2/gapi/ocl/goclkernel.hpp"
+
+namespace cv { namespace gimpl {
+
+// NB: This is what a "Kernel Package" from the original Wiki doc should be.
+void loadOCLImgProc(std::map<std::string, cv::GOCLKernel> &kmap);
+
+}}
+
+#endif // OPENCV_GAPI_GOCLIMGPROC_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclkernel.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclkernel.cpp
new file mode 100644
index 0000000..2ae2e33
--- /dev/null
+++ b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/ocl/goclkernel.cpp
@@ -0,0 +1,50 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2019 Intel Corporation
+
+
+#include <cassert>
+
+#include "opencv2/gapi/ocl/goclkernel.hpp"
+
+const cv::UMat& cv::GOCLContext::inMat(int input)
+{
+    return (inArg<cv::UMat>(input));
+}
+
+cv::UMat& cv::GOCLContext::outMatR(int output)
+{
+    return (*(util::get<cv::UMat*>(m_results.at(output))));
+}
+
+const cv::gapi::own::Scalar& cv::GOCLContext::inVal(int input)
+{
+    return inArg<cv::gapi::own::Scalar>(input);
+}
+
+cv::gapi::own::Scalar& cv::GOCLContext::outValR(int output)
+{
+    return *util::get<cv::gapi::own::Scalar*>(m_results.at(output));
+}
+
+cv::detail::VectorRef& cv::GOCLContext::outVecRef(int output)
+{
+    return util::get<cv::detail::VectorRef>(m_results.at(output));
+}
+
+cv::GOCLKernel::GOCLKernel()
+{
+}
+
+cv::GOCLKernel::GOCLKernel(const GOCLKernel::F &f)
+    : m_f(f)
+{
+}
+
+void cv::GOCLKernel::apply(GOCLContext &ctx)
+{
+    CV_Assert(m_f);
+    m_f(ctx);
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiled.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiled.cpp
index 876575d..e0a6030 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiled.cpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiled.cpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 
 #include "precomp.hpp"
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiled_priv.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiled_priv.hpp
index e616b2b..82258c7 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiled_priv.hpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiled_priv.hpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 
 #ifndef OPENCV_GAPI_GCOMPILED_PRIV_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiler.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiler.cpp
index 32ce8e3..1a4eb9c 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiler.cpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiler.cpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
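For reference, this is how a graph would be routed to the OCL kernel package defined above. The header that declares cv::gapi::imgproc::ocl::kernels() is not part of this excerpt, so the include set below is an assumption; only the package function itself comes from this patch:

```cpp
#include <opencv2/opencv.hpp>
#include <opencv2/gapi.hpp>
#include <opencv2/gapi/imgproc.hpp>

// Sketch: select the OCL backend kernels via compile args.
int main()
{
    cv::GMat in;
    cv::GMat out = cv::gapi::medianBlur(in, 3);   // resolved to GOCLMedianBlur
    cv::GComputation comp(cv::GIn(in), cv::GOut(out));

    cv::Mat src = cv::Mat::eye(240, 320, CV_8UC1), dst;
    comp.apply(cv::gin(src), cv::gout(dst),
               cv::compile_args(cv::gapi::imgproc::ocl::kernels()));
    return 0;
}
```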
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" @@ -104,11 +104,13 @@ cv::gimpl::GCompiler::GCompiler(const cv::GComputation &c, // Remove GCompoundBackend to avoid calling setupBackend() with it in the list m_all_kernels.remove(cv::gapi::compound::backend()); - m_e.addPass("init", "resolve_kernels", std::bind(passes::resolveKernels, _1, + + m_e.addPassStage("kernels"); + m_e.addPass("kernels", "resolve_kernels", std::bind(passes::resolveKernels, _1, std::ref(m_all_kernels), // NB: and not copied here lookup_order)); + m_e.addPass("kernels", "check_islands_content", passes::checkIslandsContent); - m_e.addPass("init", "check_islands_content", passes::checkIslandsContent); m_e.addPassStage("meta"); m_e.addPass("meta", "initialize", std::bind(passes::initMeta, _1, std::ref(m_metas))); m_e.addPass("meta", "propagate", std::bind(passes::inferMeta, _1, false)); diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiler.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiler.hpp index b369c14..db40284 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiler.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiler.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GCOMPILER_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gislandmodel.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gislandmodel.cpp index 8e20302..2d554b1 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gislandmodel.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gislandmodel.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gislandmodel.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gislandmodel.hpp index 03b42ff..8cb247d 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gislandmodel.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gislandmodel.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GISLANDMODEL_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodel.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodel.cpp index 4b24552..b21ab9f 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodel.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodel.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
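The gcompiler.cpp hunk above moves kernel resolution out of the "init" stage into a new dedicated "kernels" stage. Stages run strictly in declaration order, so resolve_kernels and check_islands_content are now guaranteed to execute after every init pass and before the "meta" stage, regardless of registration order within a stage. A toy staged pass manager (illustrative names only, not ade's API) makes that ordering property concrete:

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Toy model of a staged pass engine: stages run in declaration order,
// passes run in registration order within their stage.
struct StagedEngine {
    std::vector<std::pair<std::string, std::vector<std::function<void()>>>> stages;
    void addPassStage(const std::string& s) { stages.push_back({s, {}}); }
    void addPass(const std::string& s, std::function<void()> p) {
        for (auto& st : stages)
            if (st.first == s) st.second.push_back(std::move(p));
    }
    void run() { for (auto& st : stages) for (auto& p : st.second) p(); }
};

int main() {
    StagedEngine e;
    e.addPassStage("init");
    e.addPassStage("kernels");   // dedicated stage, as in the patch
    e.addPassStage("meta");
    e.addPass("meta",    []{ std::cout << "meta: infer\n"; });
    e.addPass("kernels", []{ std::cout << "kernels: resolve\n"; });
    e.addPass("init",    []{ std::cout << "init: setup\n"; });
    e.run();  // prints init, kernels, meta regardless of registration order
}
```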
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" @@ -26,7 +26,7 @@ ade::NodeHandle GModel::mkOpNode(GModel::Graph &g, const GKernel &k, const std:: ade::NodeHandle op_h = g.createNode(); g.metadata(op_h).set(NodeType{NodeType::OP}); //These extra empty {} are to please GCC (-Wmissing-field-initializers) - g.metadata(op_h).set(Op{k, args, {}, {}, {}}); + g.metadata(op_h).set(Op{k, args, {}, {}}); if (!island.empty()) g.metadata(op_h).set(Island{island}); return op_h; diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodel.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodel.hpp index 003519b..5d46461 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodel.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodel.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GMODEL_HPP @@ -61,7 +61,6 @@ struct Op std::vector outs; // TODO: Introduce a new type for resource references cv::gapi::GBackend backend; - util::any opaque; }; struct Data diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodelbuilder.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodelbuilder.cpp index c9b2fbb..e90b831 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodelbuilder.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodelbuilder.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation //////////////////////////////////////////////////////////////////////////////// @@ -128,7 +128,7 @@ cv::gimpl::Unrolled cv::gimpl::unrollExpr(const GProtoArgs &ins, // then add its operands to stack to continue recursion. ops.visit(&node.priv(), node); - const cv::GCall call = origin.node.call(); + const cv::GCall& call = origin.node.call(); const cv::GCall::Priv& call_p = call.priv(); // Put the outputs object description of the node diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodelbuilder.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodelbuilder.hpp index ce12c7e..4185108 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodelbuilder.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodelbuilder.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
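The gmodelbuilder.cpp change above binds the result of origin.node.call() to a const reference instead of a by-value copy. Since cv::GCall is a thin handle over shared state, the copy was costing one shared-pointer reference-count bump and drop per visited node during unrolling. A standalone illustration with hypothetical names (Handle/SharedState are not G-API types):

```cpp
#include <iostream>
#include <memory>

struct SharedState {};
struct Handle {
    std::shared_ptr<SharedState> priv = std::make_shared<SharedState>();
};

// Pass-by-value copies the handle and co-owns its state for the call's
// duration; pass-by-const-reference borrows it with no refcount traffic.
static long countWhenCopied(const Handle h)    { return h.priv.use_count(); }
static long countWhenBorrowed(const Handle& h) { return h.priv.use_count(); }

int main() {
    Handle h;
    std::cout << countWhenCopied(h)   << "\n";  // 2: the copy is an extra owner
    std::cout << countWhenBorrowed(h) << "\n";  // 1: no extra owner
}
```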
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GMODEL_BUILDER_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gobjref.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gobjref.hpp index be365c9..9191f7e 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gobjref.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gobjref.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GMATREF_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/dump_dot.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/dump_dot.cpp index 8741089..32a30ed 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/dump_dot.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/dump_dot.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/exec.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/exec.cpp index 7119e34..f6ca64e 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/exec.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/exec.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/helpers.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/helpers.cpp index 60bf36a..160d6c2 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/helpers.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/helpers.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/helpers.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/helpers.hpp index 3aa18e6..4292923 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/helpers.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/helpers.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_COMPILER_PASSES_HELPERS_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/islands.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/islands.cpp index 942f738..8fc0b92 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/islands.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/islands.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/kernels.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/kernels.cpp index 2703149..1fe2ab3 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/kernels.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/kernels.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/meta.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/meta.cpp index 528d84c..2a98e6c 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/meta.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/meta.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/passes.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/passes.hpp index 14f6acd..ef086fc 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/passes.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/passes.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_COMPILER_PASSES_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/transactions.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/transactions.hpp index 54af8a6..ccb0a32 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/transactions.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/transactions.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_COMPILER_TRANSACTIONS_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/executor/gexecutor.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/executor/gexecutor.cpp index f117c06..8b0af2a 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/executor/gexecutor.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/executor/gexecutor.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/executor/gexecutor.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/executor/gexecutor.hpp index e4128ba..6e9be9b 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/executor/gexecutor.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/executor/gexecutor.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_GEXECUTOR_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/logger.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/logger.hpp index ff4c759..1a8f24d 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/logger.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/logger.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef __OPENCV_GAPI_LOGGER_HPP__ diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/precomp.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/precomp.hpp index eebe9d8..df59ed6 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/src/precomp.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/src/precomp.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef __OPENCV_GAPI_PRECOMP_HPP__ diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_compoundkernel_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_compoundkernel_tests.cpp index 1f5de7a..731bc87 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_compoundkernel_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_compoundkernel_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 
 // FIXME: move out from Common
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests.cpp
index eb77612..083da7d 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests.cpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests.cpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 
 #include "test_precomp.hpp"
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests.hpp
index 77a82df..7268132 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests.hpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests.hpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 
 #ifndef OPENCV_GAPI_CORE_TESTS_HPP
@@ -124,9 +124,9 @@
 struct MinTest : public TestParams<std::tuple<MatType,cv::Size,bool,cv::GCompileArgs>>{};
 struct AbsDiffTest : public TestParams<std::tuple<MatType,cv::Size,bool,cv::GCompileArgs>>{};
 struct AbsDiffCTest : public TestParams<std::tuple<MatType,cv::Size,bool,cv::GCompileArgs>> {};
-struct SumTest : public TestParams<std::tuple<MatType,cv::Size,bool,double,cv::GCompileArgs>> {};
-struct AddWeightedTest : public TestParams<std::tuple<MatType,cv::Size,int,bool,double,cv::GCompileArgs>>{};
-struct NormTest : public TestParams<std::tuple<NormTypes,MatType,cv::Size,double,cv::GCompileArgs>>{};
+struct SumTest : public TestParams<std::tuple<MatType,cv::Size,bool,compare_scalar_f,cv::GCompileArgs>> {};
+struct AddWeightedTest : public TestParams<std::tuple<MatType,cv::Size,int,bool,compare_f,cv::GCompileArgs>>{};
+struct NormTest : public TestParams<std::tuple<NormTypes,MatType,cv::Size,compare_scalar_f,cv::GCompileArgs>>{};
 struct IntegralTest : public TestWithParam<std::tuple<MatType,cv::Size,cv::GCompileArgs>> {};
 struct ThresholdTest : public TestParams<std::tuple<MatType,cv::Size,int,cv::GCompileArgs>> {};
 struct ThresholdOTTest : public TestParams<std::tuple<MatType,cv::Size,int,cv::GCompileArgs>> {};
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests_inl.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests_inl.hpp
index d33b5cc..ca4190b 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests_inl.hpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests_inl.hpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_CORE_TESTS_INL_HPP @@ -681,11 +681,11 @@ TEST_P(AbsDiffCTest, AccuracyTest) TEST_P(SumTest, AccuracyTest) { auto param = GetParam(); + compare_scalar_f cmpF = get<3>(GetParam()); + MatType type = std::get<0>(param); cv::Size sz_in = std::get<1>(param); - auto tolerance = std::get<3>(param); auto compile_args = std::get<4>(param); - //initMatrixRandU(std::get<0>(param), sz_in, std::get<2>(param)); - initMatsRandN(std::get<0>(param), sz_in, std::get<2>(param)); //TODO: workaround trying to fix SumTest failures + initMatrixRandU(type, sz_in, type, std::get<2>(param)); cv::Scalar out_sum; @@ -703,8 +703,7 @@ TEST_P(SumTest, AccuracyTest) } // Comparison ////////////////////////////////////////////////////////////// { - EXPECT_LE(std::abs(out_sum[0] - out_sum_ocv[0]) / std::max(1.0, std::abs(out_sum_ocv[0])), tolerance) - << "OCV=" << out_sum_ocv[0] << " GAPI=" << out_sum[0]; + EXPECT_TRUE(cmpF(out_sum, out_sum_ocv)); } } @@ -714,8 +713,8 @@ TEST_P(AddWeightedTest, AccuracyTest) cv::Size sz_in; bool initOut = false; cv::GCompileArgs compile_args; - double tolerance = 0.0; - std::tie(type, sz_in, dtype, initOut, tolerance, compile_args) = GetParam(); + compare_f cmpF; + std::tie(type, sz_in, dtype, initOut, cmpF, compile_args) = GetParam(); auto& rng = cv::theRNG(); double alpha = rng.uniform(0.0, 1.0); @@ -735,53 +734,19 @@ TEST_P(AddWeightedTest, AccuracyTest) cv::addWeighted(in_mat1, alpha, in_mat2, beta, gamma, out_mat_ocv, dtype); } // Comparison ////////////////////////////////////////////////////////////// - { - // Note, that we cannot expect bitwise results for add-weighted: - // - // tmp = src1*alpha + src2*beta + gamma; - // dst = saturate( round(tmp) ); - // - // Because tmp is floating-point, dst depends on compiler optimizations - // - // However, we must expect good accuracy of tmp, and rounding correctly - - cv::Mat failures; - - if (out_mat_ocv.type() == CV_32FC1) - { - // result: float - may vary in 7th decimal digit - failures = abs(out_mat_gapi - out_mat_ocv) > abs(out_mat_ocv) * 1e-6; - } - else - { - // result: integral - rounding may vary if fractional part of tmp - // is nearly 0.5 - - cv::Mat inexact, incorrect, diff, tmp; - - inexact = out_mat_gapi != out_mat_ocv; - - // even if rounded differently, check if still rounded correctly - cv::addWeighted(in_mat1, alpha, in_mat2, beta, gamma, tmp, CV_32F); - cv::subtract(out_mat_gapi, tmp, diff, cv::noArray(), CV_32F); - incorrect = abs(diff) >= tolerance;// 0.5000005f; // relative to 6 digits - - failures = inexact & incorrect; - } + EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv)); + EXPECT_EQ(out_mat_gapi.size(), sz_in); - EXPECT_EQ(0, cv::countNonZero(failures)); - EXPECT_EQ(out_mat_gapi.size(), sz_in); - } } TEST_P(NormTest, AccuracyTest) { + compare_scalar_f cmpF; NormTypes opType = NORM_INF; int type = 0; cv::Size sz; - double tolerance = 0.0; cv::GCompileArgs compile_args; - std::tie(opType, type, sz, tolerance, compile_args) = GetParam(); + std::tie(opType, type, sz, cmpF, compile_args) = GetParam(); initMatrixRandU(type, sz, type, false); cv::Scalar out_norm; @@ -803,8 +768,7 @@ TEST_P(NormTest, AccuracyTest) // Comparison ////////////////////////////////////////////////////////////// { - EXPECT_LE(std::abs(out_norm[0] - out_norm_ocv[0]) / std::max(1.0, std::abs(out_norm_ocv[0])), tolerance) - << "OCV=" << out_norm_ocv[0] << " GAPI=" << out_norm[0]; + EXPECT_TRUE(cmpF(out_norm, out_norm_ocv)); } } diff --git 
a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests.cpp index b7c0279..fcd5882 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests.hpp index c21b26b..94860bc 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_IMGPROC_TESTS_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests_inl.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests_inl.hpp index 3de4289..f13c2b1 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests_inl.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests_inl.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef OPENCV_GAPI_IMGPROC_TESTS_INL_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests.cpp index 1f6f0ce..db6dd18 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests.hpp index 9f53d36..1730eab 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 
 #ifndef OPENCV_GAPI_OPERATOR_TESTS_COMMON_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests_inl.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests_inl.hpp
index 7ec702a..fa9a269 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests_inl.hpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests_inl.hpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 
 #ifndef OPENCV_GAPI_OPERATOR_TESTS_INL_COMMON_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_tests_common.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_tests_common.hpp
index be0fc3c..f226fbb 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_tests_common.hpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_tests_common.hpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 
 #include <iostream>
@@ -115,6 +115,9 @@ class TestPerfParams: public TestFunctional, public perf::TestBaseWithParam<T>{}
 using compare_f = std::function<bool(const cv::Mat &a, const cv::Mat &b)>;
 
+using compare_scalar_f = std::function<bool(const cv::Scalar &a, const cv::Scalar &b)>;
+
+
 template<typename T>
 struct Wrappable
 {
@@ -128,6 +131,20 @@ struct Wrappable
     }
 };
 
+template<typename T>
+struct WrappableScalar
+{
+    compare_scalar_f to_compare_f()
+    {
+        T t = *static_cast<T*>(this);
+        return [t](const cv::Scalar &a, const cv::Scalar &b)
+        {
+            return t(a, b);
+        };
+    }
+};
+
+
 class AbsExact : public Wrappable<AbsExact>
 {
 public:
@@ -285,6 +302,28 @@ private:
     double _tol;
     double _inf_tol;
 };
+
+class AbsToleranceScalar : public WrappableScalar<AbsToleranceScalar>
+{
+public:
+    AbsToleranceScalar(double tol) : _tol(tol) {}
+    bool operator() (const cv::Scalar& in1, const cv::Scalar& in2) const
+    {
+        double abs_err = std::abs(in1[0] - in2[0]) / std::max(1.0, std::abs(in2[0]));
+        if (abs_err > _tol)
+        {
+            std::cout << "AbsToleranceScalar error: abs_err=" << abs_err << " tolerance=" << _tol << " in1[0]=" << in1[0] << " in2[0]=" << in2[0] << std::endl;
+            return false;
+        }
+        else
+        {
+            return true;
+        }
+    }
+private:
+    double _tol;
+};
+
 } // namespace opencv_test
 
 namespace
@@ -294,3 +333,11 @@ namespace
         return os << "compare_f";
     }
 }
+
+namespace
+{
+    inline std::ostream& operator<<(std::ostream& os, const opencv_test::compare_scalar_f&)
+    {
+        return os << "compare_scalar_f";
+    }
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_core_tests_cpu.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_core_tests_cpu.cpp
index 11e78bd..52289db 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_core_tests_cpu.cpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_core_tests_cpu.cpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
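The WrappableScalar/AbsToleranceScalar additions above follow the same CRTP pattern as the existing Wrappable: a small comparison object copies itself into a std::function so that it can travel through googletest parameter tuples by value, carrying its tolerance with it. A standalone sketch of the pattern under illustrative names (none of these come from the test framework):

```cpp
#include <algorithm>
#include <cmath>
#include <functional>
#include <iostream>

using compare_d_f = std::function<bool(double, double)>;

// CRTP base: erase the derived comparator into a std::function.
template <typename T>
struct WrappableD {
    compare_d_f to_compare_f() {
        T t = *static_cast<T*>(this);        // copy the configured comparator
        return [t](double a, double b) { return t(a, b); };
    }
};

// Relative tolerance, normalized by max(1, |b|), mirroring AbsToleranceScalar.
struct AbsToleranceD : WrappableD<AbsToleranceD> {
    explicit AbsToleranceD(double tol) : _tol(tol) {}
    bool operator()(double a, double b) const {
        return std::abs(a - b) / std::max(1.0, std::abs(b)) <= _tol;
    }
    double _tol;
};

int main() {
    compare_d_f cmp = AbsToleranceD(1e-5).to_compare_f();
    std::cout << cmp(10.0, 10.000001) << "\n";  // 1: within tolerance
}
```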
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "../test_precomp.hpp" @@ -203,7 +203,8 @@ INSTANTIATE_TEST_CASE_P(SumTestCPU, SumTest, cv::Size(640, 480), cv::Size(128, 128)), /*init output matrices or not*/ testing::Bool(), - Values(1e-5), + //Values(1e-5), + Values(AbsToleranceScalar(1e-5).to_compare_f()), Values(cv::compile_args(CORE_CPU)))); INSTANTIATE_TEST_CASE_P(AbsDiffTestCPU, AbsDiffTest, @@ -222,15 +223,14 @@ INSTANTIATE_TEST_CASE_P(AbsDiffCTestCPU, AbsDiffCTest, /*init output matrices or not*/ testing::Bool(), Values(cv::compile_args(CORE_CPU)))); -// FIXME: Comparison introduced by YL doesn't work with C3 INSTANTIATE_TEST_CASE_P(AddWeightedTestCPU, AddWeightedTest, - Combine(Values( CV_8UC1/*, CV_8UC3*/, CV_16UC1, CV_16SC1, CV_32FC1 ), + Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ), Values(cv::Size(1280, 720), cv::Size(640, 480), cv::Size(128, 128)), Values( -1, CV_8U, CV_16U, CV_32F ), /*init output matrices or not*/ testing::Bool(), - Values(0.5000005), + Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()), Values(cv::compile_args(CORE_CPU)))); INSTANTIATE_TEST_CASE_P(NormTestCPU, NormTest, @@ -239,7 +239,8 @@ INSTANTIATE_TEST_CASE_P(NormTestCPU, NormTest, Values(cv::Size(1280, 720), cv::Size(640, 480), cv::Size(128, 128)), - Values(1e-5), + //Values(1e-5), + Values(AbsToleranceScalar(1e-5).to_compare_f()), Values(cv::compile_args(CORE_CPU))), opencv_test::PrintNormCoreParams()); diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp index c65052b..ea8b070 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "../test_precomp.hpp" @@ -121,7 +121,8 @@ INSTANTIATE_TEST_CASE_P(AddWeightedTestFluid, AddWeightedTest, cv::Size(128, 128)), Values(-1, CV_8U, CV_32F), testing::Bool(), - Values(0.5000005), + Values(Tolerance_FloatRel_IntAbs(1e-5, 2).to_compare_f()), + //Values(0.5000005), Values(cv::compile_args(CORE_FLUID)))); INSTANTIATE_TEST_CASE_P(LUTTestFluid, LUTTest, diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_imgproc_tests_cpu.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_imgproc_tests_cpu.cpp index beda022..43d3dc9 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_imgproc_tests_cpu.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_imgproc_tests_cpu.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
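The instantiations above replace raw thresholds such as 0.5000005 with Tolerance_FloatRel_IntAbs, which combines a relative bound for float outputs with a small absolute (one-LSB) slack for integral outputs. The reason is that addWeighted computes tmp = src1*alpha + src2*beta + gamma in floating point, so an integral result whose fractional part lands near .5 may legitimately round differently across implementations. A quick demonstration of the rounding step (illustrative values):

```cpp
#include <opencv2/core.hpp>
#include <iostream>

int main() {
    cv::Mat a(1, 1, CV_8UC1, cv::Scalar(3));
    cv::Mat b(1, 1, CV_8UC1, cv::Scalar(4));
    cv::Mat dst;
    // tmp = 3*0.5 + 4*0.5 = 3.5 exactly; the saturate/round step decides
    // between 3 and 4, which is why a one-LSB slack is the right tolerance.
    cv::addWeighted(a, 0.5, b, 0.5, 0.0, dst);
    std::cout << (int)dst.at<uchar>(0, 0) << "\n";
    return 0;
}
```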
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "../test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_imgproc_tests_fluid.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_imgproc_tests_fluid.cpp index 5dca209..41e6725 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_imgproc_tests_fluid.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_imgproc_tests_fluid.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "../test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_operators_tests_cpu.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_operators_tests_cpu.cpp index 435c798..6d5fb66 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_operators_tests_cpu.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_operators_tests_cpu.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "../test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_operators_tests_fluid.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_operators_tests_fluid.cpp index 4179fa5..a6f8073 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_operators_tests_fluid.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_operators_tests_fluid.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_array_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_array_tests.cpp index e576562..1fa8584 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_array_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_array_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_basic_hetero_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_basic_hetero_tests.cpp index 62069d8..38e9b1e 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_basic_hetero_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_basic_hetero_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 
 #include "test_precomp.hpp"
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_desc_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_desc_tests.cpp
index 711211d..3b15844 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_desc_tests.cpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_desc_tests.cpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 
 #include "test_precomp.hpp"
@@ -40,6 +40,43 @@ TEST(GAPI_MetaDesc, MatDesc)
     EXPECT_EQ(480, desc2.size.height);
 }
 
+TEST(GAPI_MetaDesc, VecMatDesc)
+{
+    std::vector<cv::Mat> vec1 = {
+        cv::Mat(240, 320, CV_8U)};
+
+    const auto desc1 = cv::descr_of(vec1);
+    EXPECT_EQ((GMatDesc{CV_8U, 1, {320, 240}}), get<GMatDesc>(desc1[0]));
+
+    std::vector<cv::UMat> vec2 = {
+        cv::UMat(480, 640, CV_8UC3)};
+
+    const auto desc2 = cv::descr_of(vec2);
+    EXPECT_EQ((GMatDesc{CV_8U, 3, {640, 480}}), get<GMatDesc>(desc2[0]));
+}
+
+TEST(GAPI_MetaDesc, VecOwnMatDesc)
+{
+    std::vector<cv::gapi::own::Mat> vec = {
+        cv::gapi::own::Mat(240, 320, CV_8U, nullptr),
+        cv::gapi::own::Mat(480, 640, CV_8UC3, nullptr)};
+
+    const auto desc = cv::gapi::own::descr_of(vec);
+    EXPECT_EQ((GMatDesc{CV_8U, 1, {320, 240}}), get<GMatDesc>(desc[0]));
+    EXPECT_EQ((GMatDesc{CV_8U, 3, {640, 480}}), get<GMatDesc>(desc[1]));
+}
+
+TEST(GAPI_MetaDesc, AdlVecOwnMatDesc)
+{
+    std::vector<cv::gapi::own::Mat> vec = {
+        cv::gapi::own::Mat(240, 320, CV_8U, nullptr),
+        cv::gapi::own::Mat(480, 640, CV_8UC3, nullptr)};
+
+    const auto desc = descr_of(vec);
+    EXPECT_EQ((GMatDesc{CV_8U, 1, {320, 240}}), get<GMatDesc>(desc[0]));
+    EXPECT_EQ((GMatDesc{CV_8U, 3, {640, 480}}), get<GMatDesc>(desc[1]));
+}
+
 TEST(GAPI_MetaDesc, Compare_Equal_MatDesc)
 {
     const auto desc1 = cv::GMatDesc{CV_8U, 1, {64, 64}};
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_resize_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_resize_test.cpp
index bc0b991..9640536 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_resize_test.cpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_resize_test.cpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 
 #include "test_precomp.hpp"
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_roi_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_roi_test.cpp
index ee8674e..74ddd7b 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_roi_test.cpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_roi_test.cpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
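The new VecMatDesc tests above pin down what cv::descr_of produces for container inputs. The essential mapping, checked here on a single cv::Mat, is depth plus channels plus size, where GMatDesc::size is {width, height}, i.e. {cols, rows}, while the Mat constructor takes rows first:

```cpp
#include <opencv2/core.hpp>
#include <opencv2/gapi.hpp>
#include <iostream>

int main() {
    cv::Mat m(240, 320, CV_8UC3);          // rows = 240, cols = 320
    cv::GMatDesc d = cv::descr_of(m);
    // CV_8U depth prints as 0; size comes back width-first: 320x240.
    std::cout << d.depth << " " << d.chan << " "
              << d.size.width << "x" << d.size.height << "\n";
    return 0;
}
```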
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test.cpp index 5b35011..f7dac09 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test_kernels.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test_kernels.cpp index 6bd06fe..cc106ef 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test_kernels.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test_kernels.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test_kernels.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test_kernels.hpp index f5d83ed..8082916 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test_kernels.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test_kernels.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #ifndef GAPI_FLUID_TEST_KERNELS_HPP diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_gcompiled_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_gcompiled_tests.cpp index e482e2e..8cd4b0b 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_gcompiled_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_gcompiled_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_gcomputation_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_gcomputation_tests.cpp index 070cea6..ffbb05e 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_gcomputation_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_gcomputation_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 
 #include "test_precomp.hpp"
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_gpu_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_gpu_test.cpp
new file mode 100644
index 0000000..7717869
--- /dev/null
+++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_gpu_test.cpp
@@ -0,0 +1,207 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2019 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+
+#include "logger.hpp"
+#include "common/gapi_tests_common.hpp"
+#include "opencv2/gapi/gpu/ggpukernel.hpp"
+#include "opencl_kernels_test_gapi.hpp"
+
+
+namespace cv
+{
+
+#ifdef HAVE_OPENCL
+
+    static void reference_symm7x7_CPU(const cv::Mat& in, const cv::Mat& kernel_coeff, int shift, cv::Mat &out)
+    {
+        cv::Point anchor = { -1, -1 };
+        double delta = 0;
+
+        const int* ci = kernel_coeff.ptr<int>();
+
+        float c_float[10];
+        float divisor = (float)(1 << shift);
+        for (int i = 0; i < 10; i++)
+        {
+            c_float[i] = ci[i] / divisor;
+        }
+        // J & I & H & G & H & I & J
+        // I & F & E & D & E & F & I
+        // H & E & C & B & C & E & H
+        // G & D & B & A & B & D & G
+        // H & E & C & B & C & E & H
+        // I & F & E & D & E & F & I
+        // J & I & H & G & H & I & J
+
+        // A & B & C & D & E & F & G & H & I & J
+
+        // 9 & 8 & 7 & 6 & 7 & 8 & 9
+        // 8 & 5 & 4 & 3 & 4 & 5 & 8
+        // 7 & 4 & 2 & 1 & 2 & 4 & 7
+        // 6 & 3 & 1 & 0 & 1 & 3 & 6
+        // 7 & 4 & 2 & 1 & 2 & 4 & 7
+        // 8 & 5 & 4 & 3 & 4 & 5 & 8
+        // 9 & 8 & 7 & 6 & 7 & 8 & 9
+
+        float coefficients[49] =
+        {
+            c_float[9], c_float[8], c_float[7], c_float[6], c_float[7], c_float[8], c_float[9],
+            c_float[8], c_float[5], c_float[4], c_float[3], c_float[4], c_float[5], c_float[8],
+            c_float[7], c_float[4], c_float[2], c_float[1], c_float[2], c_float[4], c_float[7],
+            c_float[6], c_float[3], c_float[1], c_float[0], c_float[1], c_float[3], c_float[6],
+            c_float[7], c_float[4], c_float[2], c_float[1], c_float[2], c_float[4], c_float[7],
+            c_float[8], c_float[5], c_float[4], c_float[3], c_float[4], c_float[5], c_float[8],
+            c_float[9], c_float[8], c_float[7], c_float[6], c_float[7], c_float[8], c_float[9]
+        };
+
+        cv::Mat kernel = cv::Mat(7, 7, CV_32FC1);
+        float* cf = kernel.ptr<float>();
+        for (int i = 0; i < 49; i++)
+        {
+            cf[i] = coefficients[i];
+        }
+
+        cv::filter2D(in, out, CV_8UC1, kernel, anchor, delta, cv::BORDER_REPLICATE);
+    }
+
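reference_symm7x7_CPU above reconstructs float filter taps from integer coefficients using a fixed-point convention: each coefficient is a fraction with denominator 2^shift, which is the same convention the OpenCL build options encode as SCALE=1.f/2^shift. The conversion in isolation:

```cpp
#include <cstdio>

int main() {
    // First few symm7x7 coefficients from the test, shift = 10 => divide by 1024.
    const int ci[3] = { 1140, -118, 526 };
    const int shift = 10;
    const float divisor = (float)(1 << shift);
    for (int c : ci)
        std::printf("%d -> %f\n", c, c / divisor);  // 1.113281, -0.115234, 0.513672
    return 0;
}
```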
+    namespace gapi_test_kernels
+    {
+        G_TYPED_KERNEL(TSymm7x7_test, <GMat(GMat, Mat, int)>, "org.opencv.imgproc.symm7x7_test") {
+            static GMatDesc outMeta(GMatDesc in, Mat, int) {
+                return in.withType(CV_8U, 1);
+            }
+        };
+
+
+        GAPI_GPU_KERNEL(GGPUSymm7x7_test, TSymm7x7_test)
+        {
+            static void run(const cv::UMat& in, const cv::Mat& kernel_coeff, int shift, cv::UMat &out)
+            {
+                if (cv::ocl::isOpenCLActivated())
+                {
+                    cv::Size size = in.size();
+                    size_t globalsize[2] = { (size_t)size.width, (size_t)size.height };
+
+                    const cv::String moduleName = "gapi";
+                    cv::ocl::ProgramSource source(moduleName, "symm7x7", opencl_symm7x7_src, "");
+
+                    static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_UNDEFINED" };
+                    std::string build_options = " -D BORDER_CONSTANT_VALUE=" + std::to_string(0)
+                        + " -D " + borderMap[1]
+                        + " -D SCALE=1.f/" + std::to_string(1 << shift) + ".f";
+
+                    cv::String errmsg;
+                    cv::ocl::Program program(source, build_options, errmsg);
+                    if (program.ptr() == NULL)
+                    {
+                        CV_Error_(cv::Error::OpenCLInitError, ("symm_7x7_test Can't compile OpenCL program: = %s with build_options = %s\n", errmsg.c_str(), build_options.c_str()));
+                    }
+                    if (!errmsg.empty())
+                    {
+                        std::cout << "OpenCL program build log:" << std::endl << errmsg << std::endl;
+                    }
+
+                    cv::ocl::Kernel kernel("symm_7x7_test", program);
+                    if (kernel.empty())
+                    {
+                        CV_Error(cv::Error::OpenCLInitError, "symm_7x7_test Can't get OpenCL kernel\n");
+                    }
+
+                    cv::UMat gKer;
+                    kernel_coeff.copyTo(gKer);
+
+                    int tile_y = 0;
+
+                    int idxArg = kernel.set(0, cv::ocl::KernelArg::PtrReadOnly(in));
+                    idxArg = kernel.set(idxArg, (int)in.step);
+                    idxArg = kernel.set(idxArg, (int)size.width);
+                    idxArg = kernel.set(idxArg, (int)size.height);
+                    idxArg = kernel.set(idxArg, cv::ocl::KernelArg::PtrWriteOnly(out));
+                    idxArg = kernel.set(idxArg, (int)out.step);
+                    idxArg = kernel.set(idxArg, (int)size.height);
+                    idxArg = kernel.set(idxArg, (int)size.width);
+                    idxArg = kernel.set(idxArg, (int)tile_y);
+                    idxArg = kernel.set(idxArg, cv::ocl::KernelArg::PtrReadOnly(gKer));
+
+                    if (!kernel.run(2, globalsize, NULL, false))
+                    {
+                        CV_Error(cv::Error::OpenCLApiCallError, "symm_7x7_test OpenCL kernel run failed\n");
+                    }
+                }
+                else
+                {
+                    //CPU fallback
+                    cv::Mat in_Mat, out_Mat;
+                    in_Mat = in.getMat(ACCESS_READ);
+                    out_Mat = out.getMat(ACCESS_WRITE);
+                    reference_symm7x7_CPU(in_Mat, kernel_coeff, shift, out_Mat);
+                }
+            }
+        };
+
+        cv::gapi::GKernelPackage gpuTestPackage = cv::gapi::kernels
+            <GGPUSymm7x7_test>();
+
+    } // namespace gapi_test_kernels
+#endif //HAVE_OPENCL
+
+} // namespace cv
+
+
+namespace opencv_test
+{
+
+#ifdef HAVE_OPENCL
+
+using namespace cv::gapi_test_kernels;
+
+TEST(GPU, Symm7x7_test)
+{
+    const auto sz = cv::Size(1280, 720);
+    cv::Mat in_mat = cv::Mat::eye(sz, CV_8UC1);
+    cv::Mat out_mat_gapi(sz, CV_8UC1);
+    cv::Mat out_mat_ocv(sz, CV_8UC1);
+    cv::Scalar mean = cv::Scalar(127.0f);
+    cv::Scalar stddev = cv::Scalar(40.f);
+    cv::randn(in_mat, mean, stddev);
+
+    //Symm7x7 coefficients and shift
+    int coefficients_symm7x7[10] = { 1140, -118, 526, 290, -236, 64, -128, -5, -87, -7 };
+    int shift = 10;
+    cv::Mat kernel_coeff(10, 1, CV_32S);
+    int* ci = kernel_coeff.ptr<int>();
+    for (int i = 0; i < 10; i++)
+    {
+        ci[i] = coefficients_symm7x7[i];
+    }
+
+    // Run G-API
+    cv::GMat in;
+    auto out = TSymm7x7_test::on(in, kernel_coeff, shift);
+    cv::GComputation comp(cv::GIn(in), cv::GOut(out));
+
+    auto cc = comp.compile(cv::descr_of(in_mat), cv::compile_args(gpuTestPackage));
+    cc(cv::gin(in_mat), cv::gout(out_mat_gapi));
+
+    // Run OpenCV
+    reference_symm7x7_CPU(in_mat, kernel_coeff, shift, out_mat_ocv);
+
+    compare_f cmpF = AbsSimilarPoints(1, 0.05).to_compare_f();
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+}
+#endif
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_kernel_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_kernel_tests.cpp
index aeb4762..ee0cdfa 100644
--- a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_kernel_tests.cpp
+++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_kernel_tests.cpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_mock_kernels.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_mock_kernels.hpp index cd876ef..a7b35bc 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_mock_kernels.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_mock_kernels.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "opencv2/gapi/cpu/gcpukernel.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_sample_pipelines.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_sample_pipelines.cpp index 815aa0d..ce87ba4 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_sample_pipelines.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_sample_pipelines.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_scalar_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_scalar_tests.cpp index 7b4baa0..705fd14 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_scalar_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_scalar_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_smoke_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_smoke_test.cpp index 9ac47f6..630a8fc 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_smoke_test.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_smoke_test.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_typed_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_typed_tests.cpp index 1716b55..223a546 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_typed_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_typed_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_util_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_util_tests.cpp index 574c0ab..b2d4353 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_util_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_util_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_core_tests_gpu.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_core_tests_gpu.cpp index 6c331c0..34faddf 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_core_tests_gpu.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_core_tests_gpu.cpp @@ -2,12 +2,11 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "../test_precomp.hpp" #include "../common/gapi_core_tests.hpp" -#include "opencv2/gapi/gpu/core.hpp" #define CORE_GPU cv::gapi::core::gpu::kernels() @@ -190,7 +189,7 @@ INSTANTIATE_TEST_CASE_P(SumTestGPU, SumTest, cv::Size(640, 480), cv::Size(128, 128)), /*init output matrices or not*/ testing::Bool(), - Values(1e-3), //TODO: too relaxed? + Values(AbsToleranceScalar(1e-3).to_compare_f()),//TODO: too relaxed? Values(cv::compile_args(CORE_GPU)))); INSTANTIATE_TEST_CASE_P(AbsDiffTestGPU, AbsDiffTest, @@ -209,15 +208,14 @@ INSTANTIATE_TEST_CASE_P(AbsDiffCTestGPU, AbsDiffCTest, /*init output matrices or not*/ testing::Bool(), Values(cv::compile_args(CORE_GPU)))); -// FIXME: Comparison introduced by YL doesn't work with C3 INSTANTIATE_TEST_CASE_P(AddWeightedTestGPU, AddWeightedTest, - Combine(Values( CV_8UC1/*, CV_8UC3*/, CV_16UC1, CV_16SC1, CV_32FC1 ), + Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ), Values(cv::Size(1280, 720), cv::Size(640, 480), cv::Size(128, 128)), Values( -1, CV_8U, CV_16U, CV_32F ), /*init output matrices or not*/ testing::Bool(), - Values(0.50005), + Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()), Values(cv::compile_args(CORE_GPU)))); INSTANTIATE_TEST_CASE_P(NormTestGPU, NormTest, @@ -226,7 +224,7 @@ INSTANTIATE_TEST_CASE_P(NormTestGPU, NormTest, Values(cv::Size(1280, 720), cv::Size(640, 480), cv::Size(128, 128)), - Values(1e-3), //TODO: too relaxed? + Values(AbsToleranceScalar(1e-3).to_compare_f()), //TODO: too relaxed? Values(cv::compile_args(CORE_GPU))), opencv_test::PrintNormCoreParams()); diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_imgproc_tests_gpu.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_imgproc_tests_gpu.cpp index 65d452c..18c918c 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_imgproc_tests_gpu.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_imgproc_tests_gpu.cpp @@ -2,13 +2,12 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "../test_precomp.hpp" #include "../common/gapi_imgproc_tests.hpp" -#include "opencv2/gapi/gpu/imgproc.hpp" #define IMGPROC_GPU cv::gapi::imgproc::gpu::kernels() @@ -131,11 +130,23 @@ INSTANTIATE_TEST_CASE_P(Dilate3x3TestGPU, Dilate3x3Test, INSTANTIATE_TEST_CASE_P(SobelTestGPU, SobelTest, Combine(Values(Tolerance_FloatRel_IntAbs(1e-4, 2).to_compare_f()), - Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1/*, CV_32FC1*/), //TODO: CV_32FC1 fails accuracy + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1), Values(3, 5), Values(cv::Size(1280, 720), cv::Size(640, 480)), - Values(-1, CV_32F), + Values(-1, CV_16S, CV_32F), + Values(0, 1), + Values(1, 2), +/*init output matrices or not*/ testing::Bool(), + Values(cv::compile_args(IMGPROC_GPU)))); + +INSTANTIATE_TEST_CASE_P(SobelTestGPU32F, SobelTest, + Combine(Values(Tolerance_FloatRel_IntAbs(1e-4, 2).to_compare_f()), + Values(CV_32FC1), + Values(3, 5), + Values(cv::Size(1280, 720), + cv::Size(640, 480)), + Values(CV_32F), Values(0, 1), Values(1, 2), /*init output matrices or not*/ testing::Bool(), diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_operators_tests_gpu.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_operators_tests_gpu.cpp index 5a116bd..62c080c 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_operators_tests_gpu.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_operators_tests_gpu.cpp @@ -2,12 +2,11 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "../test_precomp.hpp" #include "../common/gapi_operators_tests.hpp" -#include "opencv2/gapi/gpu/core.hpp" #define CORE_GPU cv::gapi::core::gpu::kernels() diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_backend_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_backend_tests.cpp index 67b6273..28da490 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_backend_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_backend_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_executor_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_executor_tests.cpp index 20aad89..00d13ee 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_executor_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_executor_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_garg_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_garg_test.cpp index 67696db..602ec00 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_garg_test.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_garg_test.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_gmetaarg_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_gmetaarg_test.cpp index 6dbf777..0860a01 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_gmetaarg_test.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_gmetaarg_test.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_gmodel_builder_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_gmodel_builder_test.cpp index a815e0d..6c80a77 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_gmodel_builder_test.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_gmodel_builder_test.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_island_fusion_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_island_fusion_tests.cpp index 91e55be..c5694c8 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_island_fusion_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_island_fusion_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_island_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_island_tests.cpp index 09f1880..4ca1af8 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_island_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_island_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_recompilation_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_recompilation_test.cpp index 252af9c..d42aab1 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_recompilation_test.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_recompilation_test.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_resolve_kernel_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_resolve_kernel_test.cpp index d4b16f6..09f9ca6 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_resolve_kernel_test.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_resolve_kernel_test.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_vectorref_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_vectorref_test.cpp index 1b14e06..28702cc 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_vectorref_test.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_vectorref_test.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_transactions_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_transactions_test.cpp index f550340..24224ba 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_transactions_test.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_transactions_test.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/opencl_kernels_test_gapi.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/opencl_kernels_test_gapi.hpp new file mode 100644 index 0000000..87fdd70 --- /dev/null +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/opencl_kernels_test_gapi.hpp @@ -0,0 +1,260 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+// +// Copyright (C) 2018-2019 Intel Corporation + +#include "opencv2/core/ocl.hpp" +#include "opencv2/core/ocl_genbase.hpp" +#include "opencv2/core/opencl/ocl_defs.hpp" + +#ifdef HAVE_OPENCL +const char* opencl_symm7x7_src = +"#if BORDER_REPLICATE\n" +"#define GET_BORDER(elem) (elem)\n" +"#define SET_ALL(i, j) a0[i] = a0[j]; a1[i] = a1[j]; a2[i] = a2[j]; b[i] = b[j]; c0[i] = c0[j]; c1[i] = c1[j]; c2[i] = c2[j];\n" +"#else\n" +"#define GET_BORDER(elem) (BORDER_CONSTANT_VALUE)\n" +"#define SET_ALL(i, j) a0[i] = a1[i] = a2[i] = c0[i] = c1[i] = c2[i] = BORDER_CONSTANT_VALUE; b[i] = BORDER_CONSTANT_VALUE;\n" +"#endif\n" +"#define GET_A0(id, x, l_edge, a1) ((x) <= (l_edge + 2) ? GET_BORDER(a1) : (((const __global uchar*)(id))[-3]))\n" +"#define GET_A1(id, x, l_edge, a2) ((x) <= (l_edge + 1) ? GET_BORDER(a2) : (((const __global uchar*)(id))[-2]))\n" +"#define GET_A2(id, x, l_edge, b) ((x) <= (l_edge) ? GET_BORDER(b[0]) : (((const __global uchar*)(id))[-1]))\n" +"#define GET_C0(id, x, r_edge, b) ((x) >= (r_edge) ? GET_BORDER(b[8 - 1]) : (((const __global uchar*)(id))[8]))\n" +"#define GET_C1(id, x, r_edge, c0) ((x) >= (r_edge - 1) ? GET_BORDER(c0) : (((const __global uchar*)(id))[8 + 1]))\n" +"#define GET_C2(id, x, r_edge, c1) ((x) >= (r_edge - 2) ? GET_BORDER(c1) : (((const __global uchar*)(id))[8 + 2]))\n" +"__kernel void symm_7x7_test(\n" +"__global const uchar * srcptr,\n" +"int srcStep, int srcEndX, int srcEndY,\n" +"__global uchar * dstptr, int dstStep,\n" +"int rows, int cols,\n" +"int tile_y_coord,\n" +"__constant int * coeff)\n" +"{\n" +"int lEdge = 0, rEdge = cols - 8;\n" +"int x = (get_global_id(0) < cols/8) ? get_global_id(0) * 8: cols - 8;\n" +"int y = get_global_id(1);\n" +"int yd = min(3, tile_y_coord);\n" +"int dst_id = mad24(y, dstStep, x);\n" +"y+=yd;\n" +"int src_id = mad24(y, srcStep, x);\n" +"int y_limit = y + tile_y_coord;\n" +"y_limit-=yd;\n" +"const __global uchar* psrc = (const __global uchar*)(srcptr + src_id);\n" +"__global uchar* pdst = (__global uchar*)(dstptr + dst_id);\n" +"#define BSIZE (7)\n" +"float a0[BSIZE]; float a1[BSIZE]; float a2[BSIZE];\n" +"float8 b[BSIZE];\n" +"float c0[BSIZE]; float c1[BSIZE]; float c2[BSIZE];\n" +"b[3] = convert_float8(vload8(0, (const __global uchar*)psrc));\n" +"if( (y_limit <=2 ) || (y_limit >= srcEndY - 3) || (x >= rEdge-2) || (x <= lEdge + 2) )\n" +"{\n" +"a2[3] = GET_A2(psrc, x, lEdge, b[3]);\n" +"a1[3] = GET_A1(psrc, x, lEdge, a2[3]);\n" +"a0[3] = GET_A0(psrc, x, lEdge, a1[3]);\n" +"c0[3] = GET_C0(psrc, x, rEdge, b[3]);\n" +"c1[3] = GET_C1(psrc, x, rEdge, c0[3]);\n" +"c2[3] = GET_C2(psrc, x, rEdge, c1[3]);\n" +"if(y_limit > 0)\n" +"{\n" +"b[2] = convert_float8(vload8(0, (const __global uchar*)(psrc - srcStep)));\n" +"a2[2] = GET_A2(psrc - srcStep, x, lEdge, b[2]);\n" +"a1[2] = GET_A1(psrc - srcStep, x, lEdge, a2[2]);\n" +"a0[2] = GET_A0(psrc - srcStep, x, lEdge, a1[2]);\n" +"c0[2] = GET_C0(psrc - srcStep, x, rEdge, b[2]);\n" +"c1[2] = GET_C1(psrc - srcStep, x, rEdge, c0[2]);\n" +"c2[2] = GET_C2(psrc - srcStep, x, rEdge, c1[2]);\n" +"}\n" +"else\n" +"{\n" +"SET_ALL(2, 3);\n" +"}\n" +"if( y_limit > 1 )\n" +"{\n" +"b[1] = convert_float8(vload8(0, (const __global uchar*)(psrc - srcStep*2)));\n" +"a2[1] = GET_A2(psrc - srcStep*2, x, lEdge, b[1]);\n" +"a1[1] = GET_A1(psrc - srcStep*2, x, lEdge, a2[1]);\n" +"a0[1] = GET_A0(psrc - srcStep*2, x, lEdge, a1[1]);\n" +"c0[1] = GET_C0(psrc - srcStep*2, x, rEdge, b[1]);\n" +"c1[1] = GET_C1(psrc - srcStep*2, x, rEdge, c0[1]);\n" +"c2[1] = GET_C2(psrc - srcStep*2, x, rEdge, c1[1]);\n" +"}\n" 
+"else\n" +"{\n" +"SET_ALL(1, 2);\n" +"}\n" +"if( y_limit > 2 )\n" +"{\n" +"b[0] = convert_float8(vload8(0, (const __global uchar*)(psrc - srcStep*3)));\n" +"a2[0] = GET_A2(psrc - srcStep*3, x, lEdge, b[0]);\n" +"a1[0] = GET_A1(psrc - srcStep*3, x, lEdge, a2[0]);\n" +"a0[0] = GET_A0(psrc - srcStep*3, x, lEdge, a1[0]);\n" +"c0[0] = GET_C0(psrc - srcStep*3, x, rEdge, b[0]);\n" +"c1[0] = GET_C1(psrc - srcStep*3, x, rEdge, c0[0]);\n" +"c2[0] = GET_C2(psrc - srcStep*3, x, rEdge, c1[0]);\n" +"}\n" +"else\n" +"{\n" +"SET_ALL(0, 1);\n" +"}\n" +"if( y_limit < srcEndY - 1 )\n" +"{\n" +"b[4] = convert_float8(vload8(0, (const __global uchar*)(psrc + srcStep)));\n" +"a2[4] = GET_A2(psrc + srcStep, x, lEdge, b[4]);\n" +"a1[4] = GET_A1(psrc + srcStep, x, lEdge, a2[4]);\n" +"a0[4] = GET_A0(psrc + srcStep, x, lEdge, a1[4]);\n" +"c0[4] = GET_C0(psrc + srcStep, x, rEdge, b[4]);\n" +"c1[4] = GET_C1(psrc + srcStep, x, rEdge, c0[4]);\n" +"c2[4] = GET_C2(psrc + srcStep, x, rEdge, c1[4]);\n" +"}\n" +"else\n" +"{\n" +"SET_ALL(4, 3);\n" +"}\n" +"if( y_limit < srcEndY - 2 )\n" +"{\n" +"b[5] = convert_float8(vload8(0, (const __global uchar*)(psrc + srcStep*2)));\n" +"a2[5] = GET_A2(psrc + srcStep*2, x, lEdge, b[5]);\n" +"a1[5] = GET_A1(psrc + srcStep*2, x, lEdge, a2[5]);\n" +"a0[5] = GET_A0(psrc + srcStep*2, x, lEdge, a1[5]);\n" +"c0[5] = GET_C0(psrc + srcStep*2, x, rEdge, b[5]);\n" +"c1[5] = GET_C1(psrc + srcStep*2, x, rEdge, c0[5]);\n" +"c2[5] = GET_C2(psrc + srcStep*2, x, rEdge, c1[5]);\n" +"}\n" +"else\n" +"{\n" +"SET_ALL(5, 4);\n" +"}\n" +"if( y_limit < srcEndY - 3 )\n" +"{\n" +"b[6] = convert_float8(vload8(0, (const __global uchar*)(psrc + srcStep*3)));\n" +"a2[6] = GET_A2(psrc + srcStep*3, x, lEdge, b[6]);\n" +"a1[6] = GET_A1(psrc + srcStep*3, x, lEdge, a2[6]);\n" +"a0[6] = GET_A0(psrc + srcStep*3, x, lEdge, a1[6]);\n" +"c0[6] = GET_C0(psrc + srcStep*3, x, rEdge, b[6]);\n" +"c1[6] = GET_C1(psrc + srcStep*3, x, rEdge, c0[6]);\n" +"c2[6] = GET_C2(psrc + srcStep*3, x, rEdge, c1[6]);\n" +"}\n" +"else\n" +"{\n" +"SET_ALL(6, 5);\n" +"}\n" +"}\n" +"else\n" +"{\n" +"a2[3] = (((const __global uchar*)(psrc))[-1]);\n" +"a1[3] = (((const __global uchar*)(psrc))[-2]);\n" +"a0[3] = (((const __global uchar*)(psrc))[-3]);\n" +"c0[3] = (((const __global uchar*)(psrc))[8]);\n" +"c1[3] = (((const __global uchar*)(psrc))[8 + 1]);\n" +"c2[3] = (((const __global uchar*)(psrc))[8 + 2]);\n" +"b[2] = convert_float8(vload8(0, (const __global uchar*)(psrc - srcStep)));\n" +"a2[2] = (((const __global uchar*)(psrc - srcStep))[-1]);\n" +"a1[2] = (((const __global uchar*)(psrc - srcStep))[-2]);\n" +"a0[2] = (((const __global uchar*)(psrc - srcStep))[-3]);\n" +"c0[2] = (((const __global uchar*)(psrc - srcStep))[8]);\n" +"c1[2] = (((const __global uchar*)(psrc - srcStep))[8 + 1]);\n" +"c2[2] = (((const __global uchar*)(psrc - srcStep))[8 + 2]);\n" +"b[1] = convert_float8(vload8(0, (const __global uchar*)(psrc - srcStep*2)));\n" +"a2[1] = (((const __global uchar*)(psrc - srcStep*2))[-1]);\n" +"a1[1] = (((const __global uchar*)(psrc - srcStep*2))[-2]);\n" +"a0[1] = (((const __global uchar*)(psrc - srcStep*2))[-3]);\n" +"c0[1] = (((const __global uchar*)(psrc - srcStep*2))[8]);\n" +"c1[1] = (((const __global uchar*)(psrc - srcStep*2))[8 + 1]);\n" +"c2[1] = (((const __global uchar*)(psrc - srcStep*2))[8 + 2]);\n" +"b[0] = convert_float8(vload8(0, (const __global uchar*)(psrc - srcStep*3)));\n" +"a2[0] = (((const __global uchar*)(psrc - srcStep*3))[-1]);\n" +"a1[0] = (((const __global uchar*)(psrc - srcStep*3))[-2]);\n" +"a0[0] = (((const __global 
uchar*)(psrc - srcStep*3))[-3]);\n" +"c0[0] = (((const __global uchar*)(psrc - srcStep*3))[8]);\n" +"c1[0] = (((const __global uchar*)(psrc - srcStep*3))[8 + 1]);\n" +"c2[0] = (((const __global uchar*)(psrc - srcStep*3))[8 + 2]);\n" +"b[4] = convert_float8(vload8(0, (const __global uchar*)(psrc + srcStep)));\n" +"a2[4] = (((const __global uchar*)(psrc + srcStep))[-1]);\n" +"a1[4] = (((const __global uchar*)(psrc + srcStep))[-2]);\n" +"a0[4] = (((const __global uchar*)(psrc + srcStep))[-3]);\n" +"c0[4] = (((const __global uchar*)(psrc + srcStep))[8]);\n" +"c1[4] = (((const __global uchar*)(psrc + srcStep))[8 + 1]);\n" +"c2[4] = (((const __global uchar*)(psrc + srcStep))[8 + 2]);\n" +"b[5] = convert_float8(vload8(0, (const __global uchar*)(psrc + srcStep*2)));\n" +"a2[5] = (((const __global uchar*)(psrc + srcStep*2))[-1]);\n" +"a1[5] = (((const __global uchar*)(psrc + srcStep*2))[-2]);\n" +"a0[5] = (((const __global uchar*)(psrc + srcStep*2))[-3]);\n" +"c0[5] = (((const __global uchar*)(psrc + srcStep*2))[8]);\n" +"c1[5] = (((const __global uchar*)(psrc + srcStep*2))[8 + 1]);\n" +"c2[5] = (((const __global uchar*)(psrc + srcStep*2))[8 + 2]);\n" +"b[6] = convert_float8(vload8(0, (const __global uchar*)(psrc + srcStep*3)));\n" +"a2[6] = (((const __global uchar*)(psrc + srcStep*3))[-1]);\n" +"a1[6] = (((const __global uchar*)(psrc + srcStep*3))[-2]);\n" +"a0[6] = (((const __global uchar*)(psrc + srcStep*3))[-3]);\n" +"c0[6] = (((const __global uchar*)(psrc + srcStep*3))[8]);\n" +"c1[6] = (((const __global uchar*)(psrc + srcStep*3))[8 + 1]);\n" +"c2[6] = (((const __global uchar*)(psrc + srcStep*3))[8 + 2]);\n" +"}\n" +"float a0_sum[3]; float a1_sum[3]; float a2_sum[3];\n" +"float8 b_sum[3];\n" +"float c0_sum[3]; float c1_sum[3]; float c2_sum[3];\n" +"a0_sum[0] = a0[0] + a0[6];\n" +"a0_sum[1] = a0[1] + a0[5];\n" +"a0_sum[2] = a0[2] + a0[4];\n" +"a1_sum[0] = a1[0] + a1[6];\n" +"a1_sum[1] = a1[1] + a1[5];\n" +"a1_sum[2] = a1[2] + a1[4];\n" +"a2_sum[0] = a2[0] + a2[6];\n" +"a2_sum[1] = a2[1] + a2[5];\n" +"a2_sum[2] = a2[2] + a2[4];\n" +"c0_sum[0] = c0[0] + c0[6];\n" +"c0_sum[1] = c0[1] + c0[5];\n" +"c0_sum[2] = c0[2] + c0[4];\n" +"c1_sum[0] = c1[0] + c1[6];\n" +"c1_sum[1] = c1[1] + c1[5];\n" +"c1_sum[2] = c1[2] + c1[4];\n" +"c2_sum[0] = c2[0] + c2[6];\n" +"c2_sum[1] = c2[1] + c2[5];\n" +"c2_sum[2] = c2[2] + c2[4];\n" +"b_sum[0] = b[0] + b[6];\n" +"b_sum[1] = b[1] + b[5];\n" +"b_sum[2] = b[2] + b[4];\n" +"float8 A = b[3];\n" +"float8 intermediate = A * (float)coeff[0];\n" +"float8 B = b_sum[2] +\n" +"(float8)(a2[3], b[3].s0123, b[3].s456) +\n" +"(float8)(b[3].s123, b[3].s4567, c0[3]);\n" +"intermediate += B * (float)coeff[1];\n" +"float8 C = (float8)(a2_sum[2], b_sum[2].s0123, b_sum[2].s456) +\n" +"(float8)(b_sum[2].s123, b_sum[2].s4567, c0_sum[2]);\n" +"intermediate += C * (float)coeff[2];\n" +"float8 D = b_sum[1] +\n" +"(float8)(a1[3], a2[3], b[3].s0123, b[3].s45) +\n" +"(float8)(b[3].s23, b[3].s4567, c0[3], c1[3]);\n" +"intermediate += D * (float)coeff[3];\n" +"float8 E = (float8)(a2_sum[1], b_sum[1].s0123, b_sum[1].s456) +\n" +"(float8)( b_sum[1].s123, b_sum[1].s4567, c0_sum[1]) +\n" +"(float8)( a1_sum[2], a2_sum[2], b_sum[2].s0123, b_sum[2].s45) +\n" +"(float8)( b_sum[2].s23, b_sum[2].s4567, c0_sum[2], c1_sum[2]);\n" +"intermediate += E * (float)coeff[4];\n" +"float8 F = (float8)(a1_sum[1], a2_sum[1], b_sum[1].s0123, b_sum[1].s45) +\n" +"(float8)(b_sum[1].s23, b_sum[1].s4567, c0_sum[1], c1_sum[1]);\n" +"intermediate += F * (float)coeff[5];\n" +"float8 G = b_sum[0] +\n" +"(float8)(a0[3], a1[3], 
a2[3], b[3].s0123, b[3].s4) +\n" +"(float8)(b[3].s3, b[3].s4567, c0[3], c1[3], c2[3]);\n" +"intermediate += G * (float)coeff[6];\n" +"float8 H = (float8)(a2_sum[0], b_sum[0].s0123, b_sum[0].s456) +\n" +"(float8)(b_sum[0].s123, b_sum[0].s4567, c0_sum[0]) +\n" +"(float8)(a0_sum[2], a1_sum[2], a2_sum[2], b_sum[2].s0123, b_sum[2].s4) +\n" +"(float8)(b_sum[2].s3, b_sum[2].s4567, c0_sum[2], c1_sum[2], c2_sum[2]);\n" +"intermediate += H * (float)coeff[7];\n" +"float8 I = (float8)(a1_sum[0], a2_sum[0], b_sum[0].s0123, b_sum[0].s45) +\n" +"(float8)(b_sum[0].s23, b_sum[0].s4567, c0_sum[0], c1_sum[0]) +\n" +"(float8)(a0_sum[1], a1_sum[1], a2_sum[1], b_sum[1].s0123, b_sum[1].s4) +\n" +"(float8)(b_sum[1].s3, b_sum[1].s4567, c0_sum[1], c1_sum[1], c2_sum[1]);\n" +"intermediate += I * (float)coeff[8];\n" +"float8 J = (float8)(a0_sum[0], a1_sum[0], a2_sum[0], b_sum[0].s0123, b_sum[0].s4) +\n" +"(float8)(b_sum[0].s3, b_sum[0].s4567, c0_sum[0], c1_sum[0], c2_sum[0]);\n" +"intermediate += J * (float)coeff[9];\n" +"intermediate *= SCALE;\n" +"vstore8(convert_uchar8_sat(intermediate), 0, (__global uchar*)(pdst));\n" +"}\n" +; +#endif diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/own/gapi_types_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/own/gapi_types_tests.cpp index c254357..0b68229 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/own/gapi_types_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/own/gapi_types_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/own/mat_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/own/mat_tests.cpp index ba2cd2d..14db80d 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/own/mat_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/own/mat_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/own/scalar_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/own/scalar_tests.cpp index a9c5c01..34c6a73 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/own/scalar_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/own/scalar_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/test_main.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/test_main.cpp index fa5862f..2caee86 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/test_main.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/test_main.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // FIXME: OpenCV license header diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/test_precomp.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/test_precomp.hpp index bcab803..5e9adb0 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/test_precomp.hpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/test_precomp.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation // FIXME: OpenCV header @@ -19,6 +19,8 @@ #include "opencv2/gapi/core.hpp" #include "opencv2/gapi/cpu/gcpukernel.hpp" #include "opencv2/gapi/gpu/ggpukernel.hpp" +#include "opencv2/gapi/gpu/imgproc.hpp" +#include "opencv2/gapi/gpu/core.hpp" #include "opencv2/gapi/gcompoundkernel.hpp" #include "opencv2/gapi/operators.hpp" #include "opencv2/gapi/fluid/imgproc.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/util/any_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/util/any_tests.cpp index 60bbcc1..d562df0 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/util/any_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/util/any_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/util/optional_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/util/optional_tests.cpp index b7fabd5..df44849 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/util/optional_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/util/optional_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/util/variant_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/util/variant_tests.cpp index a95b6aa..5c736e4 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/test/util/variant_tests.cpp +++ b/inference-engine/thirdparty/fluid/modules/gapi/test/util/variant_tests.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018 Intel Corporation +// Copyright (C) 2018-2019 Intel Corporation #include "test_precomp.hpp" diff --git a/inference-engine/thirdparty/fluid/revision.txt b/inference-engine/thirdparty/fluid/revision.txt index e088afd..c118617 100644 --- a/inference-engine/thirdparty/fluid/revision.txt +++ b/inference-engine/thirdparty/fluid/revision.txt @@ -1 +1 @@ -a3df05d93b188d4e86e23ffd1e988dbec0fc9211 +master / 2019-01-28 diff --git a/inference-engine/thirdparty/mkl-dnn/CMakeLists.txt b/inference-engine/thirdparty/mkl-dnn/CMakeLists.txt index 939c81f..c522e4a 100644 --- a/inference-engine/thirdparty/mkl-dnn/CMakeLists.txt +++ b/inference-engine/thirdparty/mkl-dnn/CMakeLists.txt @@ -16,6 +16,10 @@ cmake_minimum_required(VERSION 2.8) +if(POLICY CMP0022) + cmake_policy(SET CMP0022 NEW) +endif() + if(POLICY CMP0054) cmake_policy(SET CMP0054 NEW) endif() @@ -40,7 +44,7 @@ endif() set(PROJECT_NAME "Intel(R) MKL-DNN") set(PROJECT_FULL_NAME "Intel(R) Math Kernel Library for Deep Neural Networks (Intel(R) MKL-DNN)") -set(PROJECT_VERSION "0.17") +set(PROJECT_VERSION "0.18.0") set(LIB_NAME mkldnn) @@ -64,6 +68,9 @@ set(CMAKE_SRC_CCXX_FLAGS) # SRC specifics set(CMAKE_EXAMPLE_CCXX_FLAGS) # EXAMPLE specifics set(CMAKE_TEST_CCXX_FLAGS) # TESTS specifics +include(GNUInstallDirs) +include(CMakePackageConfigHelpers) + include("cmake/utils.cmake") include("cmake/options.cmake") include("cmake/OpenMP.cmake") @@ -73,6 +80,7 @@ include("cmake/SDL.cmake") include("cmake/MKL.cmake") include("cmake/Doxygen.cmake") include("cmake/profiling.cmake") +include("cmake/version.cmake") enable_testing() @@ -82,4 +90,5 @@ add_subdirectory(src) add_subdirectory(examples) add_subdirectory(tests) -install(FILES LICENSE DESTINATION share/doc/${LIB_NAME}) +# Cannot use CMAKE_INSTALL_DOCDIR since it uses PROJECT_NAME and not LIB_NAME +install(FILES LICENSE DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/doc/${LIB_NAME}) diff --git a/inference-engine/thirdparty/mkl-dnn/LICENSE b/inference-engine/thirdparty/mkl-dnn/LICENSE index 8dada3e..fde864d 100644 --- a/inference-engine/thirdparty/mkl-dnn/LICENSE +++ b/inference-engine/thirdparty/mkl-dnn/LICENSE @@ -199,3 +199,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + + ============================================================================ + + Intel MKL-DNN includes components with separate copyright + notices and license terms. + + XByak, 3-clause BSD license + Copyright (c) 2007 MITSUNARI Shigeo + See full copyright notice and license text in src/cpu/xbyak/COPYRIGHT + + gtest, 3-clause BSD license + Copyright 2008, Google Inc. + See full copyright notice and license text in tests/gtests/gtest/LICENSE + \ No newline at end of file diff --git a/inference-engine/thirdparty/mkl-dnn/README.md b/inference-engine/thirdparty/mkl-dnn/README.md index 2a5b29e..3a453c9 100644 --- a/inference-engine/thirdparty/mkl-dnn/README.md +++ b/inference-engine/thirdparty/mkl-dnn/README.md @@ -1,42 +1,43 @@ # Intel(R) Math Kernel Library for Deep Neural Networks (Intel(R) MKL-DNN) -![v0.17 beta](https://img.shields.io/badge/v0.17-beta-orange.svg) +![v0.18 beta](https://img.shields.io/badge/v0.18-beta-orange.svg) Intel(R) Math Kernel Library for Deep Neural Networks (Intel(R) MKL-DNN) is -an open source performance library for deep learning applications. The library -accelerates deep learning applications and framework on Intel(R) architecture. 
-Intel(R) MKL-DNN contains vectorized and threaded building blocks which you can +an open-source performance library for deep-learning applications. The library +accelerates deep-learning applications and frameworks on Intel architecture. +Intel MKL-DNN contains vectorized and threaded building blocks that you can use to implement deep neural networks (DNN) with C and C++ interfaces. -DNN functionality optimized for Intel architecture is also included in -[Intel(R) Math Kernel Library (Intel(R) MKL)](https://software.intel.com/en-us/mkl/features/deep-neural-networks). -API in this implementation is not compatible with Intel MKL-DNN and does not +DNN functionality optimized for Intel architecture is also included in +[Intel Math Kernel Library (Intel MKL)](https://software.intel.com/en-us/mkl/features/deep-neural-networks). +The API in that implementation is not compatible with Intel MKL-DNN and does not include certain new and experimental features. -This release contains performance critical functions that improve performance of -of the following deep learning topologies and variations of these. +This release contains performance-critical functions that improve performance of +the following deep learning topologies and variations of these: | Application | Example topology |:--- |:--- | Image recognition | AlexNet, VGG, GoogleNet, ResNet, MobileNet -| Image segmenation | FCN, SegNet, MaskRCNN, U-Net +| Image segmentation | FCN, SegNet, MaskRCNN, U-Net | Volumetric segmentation | 3D-Unet | Object detection | SSD, Faster R-CNN, Yolo -| Neural Machine Translation (experimental) | GNMT -| Speech Recognition (experimental) | DeepSpeech -| Adversarial Networks | DCGAN, 3DGAN -| Reinforcement Learning | A3C -| Text-to-Speech | WaveNet +| Neural machine translation | GNMT +| Speech recognition | DeepSpeech +| Adversarial networks | DCGAN, 3DGAN +| Reinforcement learning | A3C +| Text-to-speech | WaveNet Intel MKL-DNN is used in the following software products: * [Caffe\* Optimized for Intel Architecture](https://github.com/intel/caffe) * [Chainer\*](https://chainer.org) * [DeepBench](https://github.com/baidu-research/DeepBench) * [PaddlePaddle\*](http://www.paddlepaddle.org) +* [PyTorch\*](https://pytorch.org/) * [Tensorflow\*](https://www.tensorflow.org) * [Microsoft\* Cognitive Toolkit (CNTK)](https://docs.microsoft.com/en-us/cognitive-toolkit) * [Apache\* MXNet](https://mxnet.apache.org) * [OpenVINO(TM) toolkit](https://01.org/openvinotoolkit) -* [Intel(R) Nervana(TM) Graph](https://github.com/NervanaSystems/ngraph) +* [Intel Nervana Graph](https://github.com/NervanaSystems/ngraph) * [Menoh\*](https://github.com/pfnet-research/menoh) * [DeepLearning4J\*](https://deeplearning4j.org) * [BigDL](https://github.com/intel-analytics/BigDL) @@ -44,49 +45,47 @@ Intel MKL-DNN is used in the following software products: ## License Intel MKL-DNN is licensed under [Apache License Version 2.0](http://www.apache.org/licenses/LICENSE-2.0). 
This -software includes the following third party components: +software includes the following third-party components: * [Xbyak](https://github.com/herumi/xbyak) distributed under [3-clause BSD licence](src/cpu/xbyak/COPYRIGHT) * [gtest](https://github.com/google/googletest) distributed under [3-clause BSD license](tests/gtests/gtest/LICENSE) ## Documentation -* [Introduction](https://intel.github.io/mkl-dnn) explains programming model +* [Introduction](https://intel.github.io/mkl-dnn) explains the programming model and basic concepts * [Reference manual](https://intel.github.io/mkl-dnn/modules.html) provides detailed functionality description -* [Examples](https://github.com/intel/mkl-dnn/tree/master/examples) -demonstrate use of C and C++ APIs in simple topologies -* [Tutorial](https://software.intel.com/en-us/articles/intel-mkl-dnn-part-1-library-overview-and-installation) -provides step by step installation instructions and an example walkthrough +* [Examples](https://github.com/intel/mkl-dnn/tree/master/examples) +demonstrates use of C and C++ APIs in simple topologies +* [Tutorial](https://software.intel.com/en-us/articles/intel-mkl-dnn-part-1-library-overview-and-installation) +provides step-by-step installation instructions and an example walkthrough ## Support -Please submit your questions, feature requests and bug reports on +Please submit your questions, feature requests, and bug reports on the [GitHub issues](https://github.com/intel/mkl-dnn/issues) page. **WARNING** The following functionality has preview status and might change without prior notification in future releases: * Convolutions with `s16` data type in source, weights or destination -* Convolutions and auxiliary primitives for 3D spatial data -* RNN, LSTM and GRU primitives -* Intel Threading Building Blocks (Intel TBB\*) support +* Threading Building Blocks (TBB) support ## How to Contribute -We welcome community contributions to Intel MKL-DNN. If you have an idea how to improve the library: +We welcome community contributions to Intel MKL-DNN. If you have an idea on how to improve the library: * Share your proposal via [GitHub issues](https://github.com/intel/mkl-dnn/issues). -* Ensure you can build the product and run all the examples with your patch -* In the case of a larger feature, create a test -* Submit a [pull request](https://github.com/intel/mkl-dnn/pulls) +* Ensure you can build the product and run all the examples with your patch. +* In the case of a larger feature, create a test. +* Submit a [pull request](https://github.com/intel/mkl-dnn/pulls). We will review your contribution and, if any additional fixes or modifications are necessary, may provide feedback to guide you. When accepted, your pull -request will be merged the repository. +request will be merged to the repository. ## System Requirements -Intel MKL-DNN supports Intel(R) 64 architecture and compatible architectures. +Intel MKL-DNN supports Intel 64 architecture and compatible architectures. 
The library is optimized for the systems based on -* Intel Atom(R) processor with Intel(R) SSE4.1 support -* 4th, 5th, 6th, 7th and 8th generation Intel(R) Core processor +* Intel Atom(R) processor with Intel SSE4.1 support +* 4th, 5th, 6th, 7th, and 8th generation Intel(R) Core(TM) processor * Intel(R) Xeon(R) processor E5 v3 family (formerly Haswell) * Intel Xeon processor E5 v4 family (formerly Broadwell) * Intel Xeon Platinum processor family (formerly Skylake) @@ -100,24 +99,24 @@ The software dependencies are: * [Doxygen](http://www.stack.nl/~dimitri/doxygen/download.html#srcbin) 1.8.5 or later * C++ compiler with C++11 standard support * Optional dependencies: - * GNU OpenMP\*, LLVM OpenMP\*, or Intel OpenMP - * Threading Building Blocks (TBB) - * Intel MKL or Intel MKL small libraries + * GNU\* OpenMP\*, LLVM OpenMP, or Intel OpenMP + * Threading Building Blocks (TBB) 2017 or later + * Intel MKL 2017 Update 1 or Intel MKL small libraries > **Note** -> Building Intel MKL-DNN with optinal dependencies may introduce additional -> runtime dependencies for the library. Please refer to corresponding -> software system requirements for details. +> Building Intel MKL-DNN with optional dependencies may introduce additional +> runtime dependencies for the library. For details, refer to the corresponding +> software system requirements. The software was validated on RedHat\* Enterprise Linux 7 with -* GNU\* Compiler Collection 4.8, 5.4, 6.1, 7.2 and 8.1 +* GNU Compiler Collection 4.8, 5.4, 6.1, 7.2, and 8.1 * Clang\* 3.8.0 -* [Intel(R) C/C++ Compiler](https://software.intel.com/en-us/intel-parallel-studio-xe) - 17.0, 18.0 and 19.0 +* [Intel C/C++ Compiler](https://software.intel.com/en-us/intel-parallel-studio-xe) + 17.0, 18.0, and 19.0 on Windows Server\* 2012 R2 with -* Microsoft\* Visual C++ 14.0 (Visual Studio 2015) -* [Intel(R) C/C++ Compiler](https://software.intel.com/en-us/intel-parallel-studio-xe) +* Microsoft Visual C++ 14.0 (Visual Studio 2015 Update 3) +* [Intel C/C++ Compiler](https://software.intel.com/en-us/intel-parallel-studio-xe) 17.0 and 19.0 on macOS\* 10.13 (High Sierra) with @@ -125,196 +124,300 @@ on macOS\* 10.13 (High Sierra) with * [Intel C/C++ Compiler](https://software.intel.com/en-us/intel-parallel-studio-xe) 18.0 and 19.0 -The implementation uses OpenMP\* 4.0 SIMD extensions. We recommend using -Intel(R) Compiler for the best performance results. +The implementation uses OpenMP 4.0 SIMD extensions. We recommend using the +Intel C++ Compiler for the best performance results. ## Installation +### Build from source + +#### Download source code Download [Intel MKL-DNN source code](https://github.com/intel/mkl-dnn/archive/master.zip) -or clone the repository to your system +or clone [the repository](https://github.com/intel/mkl-dnn.git) to your system. ``` - git clone https://github.com/intel/mkl-dnn.git +git clone https://github.com/intel/mkl-dnn.git ``` -Ensure that all software dependencies are in place and have at least minimal -supported version. +#### Configure build +Intel MKL-DNN uses a CMake-based build system. You can use CMake options to control the build. 
+Along with the standard CMake options such as `CMAKE_INSTALL_PREFIX` and `CMAKE_BUILD_TYPE`, +you can pass Intel MKL-DNN specific options: + +|Option | Possible Values (defaults in bold) | Description +|:--- |:--- | :--- +|MKLDNN_LIBRARY_TYPE | **SHARED**, STATIC | Defines the resulting library type +|MKLDNN_THREADING | **OMP**, OMP:INTEL, OMP:COMP, TBB | Defines the threading type +|WITH_EXAMPLE | **ON**, OFF | Controls building the examples +|WITH_TEST | **ON**, OFF | Controls building the tests +|ARCH_OPT_FLAGS | *compiler flags* | Specifies compiler optimization flags (see warning note below) +|VTUNEROOT | *path* | Enables integration with Intel(R) VTune(TM) Amplifier + +> **WARNING** +> +> By default, Intel MKL-DNN is built specifically for the processor type of the +> compiling machine (for example, `-march=native` in the case of GCC). While this option +> gives better performance, the resulting library can be run only on systems +> that are instruction-set compatible with the compiling machine. +> +> Therefore, if Intel MKL-DNN is to be shipped to other platforms (for example, built by +> Linux distribution maintainers), consider setting `ARCH_OPT_FLAGS` to `""`. + +For more options and details, check [cmake/options.cmake](cmake/options.cmake). -Intel MKL-DNN can take advantage of optimized -matrix-matrix multiplication (GEMM) function from Intel MKL. The dynamic -library with this functionality is included in the repository. If you choose -to build Intel MKL-DNN with the binary dependency download Intel MKL small -libraries using provided script +##### Using Intel MKL (optional) +Intel MKL-DNN includes an optimized matrix-matrix multiplication (GEMM) implementation for modern platforms. +The library can also take advantage of GEMM functions from Intel MKL to improve performance with older +versions of compilers or on older platforms. This behavior is controlled by the `MKLDNN_USE_MKL` option. -###### Linux/macOS +|Option | Possible Values (defaults in bold) | Description +|:--- |:--- | :--- +|MKLDNN_USE_MKL | **DEF**, NONE, ML, FULL, FULL:STATIC | Defines the binary dependency on Intel MKL + +The dynamic library with this functionality is included in the repository. +If you choose to build Intel MKL-DNN with the binary dependency, download the Intel MKL small +libraries using the provided script: + +*Linux/macOS* ``` - cd scripts && ./prepare_mkl.sh && cd .. +cd scripts && ./prepare_mkl.sh && cd .. ``` -###### Windows +*Windows\** ``` - cd scripts && call prepare_mkl.bat && cd .. +cd scripts && call prepare_mkl.bat && cd .. ``` -or manually from [GitHub release section](https://github.com/intel/mkl-dnn/releases) +or manually from [GitHub release section](https://github.com/intel/mkl-dnn/releases), and unpack it to the `external` directory in the repository root. Intel MKL-DNN -can also be built with full Intel MKL, if the latter is installed on the system. -You might need to set `MKLROOT` environment variable to the path where full -Intel MKL is installed to help cmake locate the library. - -You can choose to build Intel MKL-DNN without binary dependency. The resulting -version will be fully functional, however performance of convolutions relying -on GEMM-based algorithm, inner product, and mkldnn_?gemm functionality may be -suboptimal. +can also be built with full Intel MKL if the latter is installed on the system. +You might need to set the `MKLROOT` environment variable to the path where the full +Intel MKL is installed to help `cmake` locate the library. 
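As an editorial aside to the `MKLDNN_USE_MKL` discussion above: the GEMM dependency can be exercised directly from user code. The sketch below is illustrative only and is not part of this patch; it assumes the v0.x C API, in which `mkldnn_sgemm` takes Fortran-style pointer arguments (see `mkldnn.h` in this source tree for the authoritative prototype).

```cpp
// Illustrative sketch, not part of this patch: multiply two 2x2
// column-major matrices via mkldnn_sgemm, the entry point backed either
// by the built-in GEMM or by Intel MKL, depending on the MKLDNN_USE_MKL
// value chosen at configure time (v0.x C API assumed).
#include <cstdio>
#include "mkldnn.h"

int main() {
    const int m = 2, n = 2, k = 2;
    const float alpha = 1.0f, beta = 0.0f;
    const float a[] = {1.f, 2.f, 3.f, 4.f}; // 2x2, column-major
    const float b[] = {1.f, 0.f, 0.f, 1.f}; // 2x2 identity
    float c[] = {0.f, 0.f, 0.f, 0.f};
    mkldnn_status_t st = mkldnn_sgemm("N", "N", &m, &n, &k, &alpha,
                                      a, &m, b, &k, &beta, c, &m);
    std::printf("sgemm status: %d, c[0][0] = %g\n", (int)st, c[0]);
    return st == mkldnn_success ? 0 : 1;
}
```

Running `ldd` (Linux) or `otool -L` (macOS) on the resulting binary is a quick way to confirm whether the small-library or full-MKL path was actually linked in.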
> **Note**
>
-> Using Intel MKL small libraries currently work for Intel MKL-DNN built with
-> OpenMP\* only. Building with Intel TBB requires either full Intel MKL library
-> or standalone build.
+> Using Intel MKL small libraries currently works only for Intel MKL-DNN built with
+> OpenMP. Building with Intel TBB requires either the full Intel MKL library
+> or a standalone build.
>
> Using Intel MKL or Intel MKL small libraries will introduce additional
-> runtime dependencies. Please refer to Intel MKL
-> [system requirements](https://software.intel.com/en-us/articles/intel-math-kernel-library-intel-mkl-2019-system-requirements)
-> for additional information.
+> runtime dependencies. For additional information, refer to Intel MKL
+> [system requirements](https://software.intel.com/en-us/articles/intel-math-kernel-library-intel-mkl-2019-system-requirements).

-Intel MKL-DNN uses a CMake-based build system
-
-```
- mkdir -p build && cd build && cmake $CMAKE_OPTIONS .. && make
-```
-
-Here `$CMAKE_OPTIONS` are options to control the build. Along with the standard
-cmake options such as `CMAKE_INSTALL_PREFIX` or `CMAKE_BUILD_TYPE`,
-user can also pass Intel MKL-DNN specific ones:
+##### Threading
+Intel MKL-DNN is parallelized and can use the OpenMP or TBB threading runtime. OpenMP threading is the default build mode
+and is recommended for the best performance. TBB support is experimental. This behavior is controlled by the `MKLDNN_THREADING` option.

|Option | Possible Values (defaults in bold) | Description
|:--- |:--- | :---
-|MKLDNN_LIBRARY_TYPE | **SHARED**, STATIC | Defines resulting library type
-|MKLDNN_THREADING | **OMP**, OMP:INTEL, OMP:COMP, TBB | Defines threading type
-|MKLDNN_USE_MKL | **DEF**, NONE, ML, FULL, FULL:STATIC | Defines binary dependency on Intel MKL
-|WITH_EXAMPLE | **ON**, OFF | Controls building examples
-|WITH_TEST | **ON**, OFF | Controls building tests
-|ARCH_OPT_FLAGS (\*) | *compiler flags* | Specifies compiler optimization flags
-|VTUNEROOT | *path* | Enables integration with Intel(R) Vtune(tm) Amplifier
-
-Please check [cmake/options.cmake](cmake/options.cmake) for more options
-and details.
-
-> (\*) **WARNING**
->
-> By default Intel MKL-DNN is built specifically for the processor type of the
-> compiling machine (e.g. `-march=native` in case of GCC). While this option
-> gives better performance, the resulting library can only be run on systems
-> that are instruction-set compatible with the compiling machine.
+|MKLDNN_THREADING | **OMP**, OMP:INTEL, OMP:COMP, TBB | Defines the threading type
+
+##### OpenMP
+Intel MKL-DNN can use the Intel, GNU, or Clang OpenMP runtime. Because different OpenMP runtimes may not be binary compatible,
+it's important to ensure that only one OpenMP runtime is used throughout the
+application. Having more than one OpenMP runtime initialized may lead to
+undefined behavior including incorrect results or crashes.
+
+An Intel MKL-DNN library built with the binary dependency will link against the Intel OpenMP
+runtime included with the Intel MKL small libraries package. The Intel OpenMP runtime
+is binary compatible with the GNU OpenMP and Clang OpenMP runtimes and is
+recommended for the best performance results.
+
+An Intel MKL-DNN library built standalone will use the OpenMP runtime supplied by
+the compiler, so as long as both the library and the application use the
+same compiler, the correct OpenMP runtime will be used.
+
+##### TBB
+TBB support is experimental.
Intel MKL-DNN has limited optimizations done for Intel TBB and has some functional +limitations if built with Intel TBB. + +Functional limitations: +* Convolution with Winograd algorithm is not supported + +Performance limitations (mostly less parallelism than in case of OpenMP): +* Batch normalization +* Convolution backward by weights +* mkldnn_sgemm + +> **WARNING** > -> Hence if Intel MKL-DNN is to be shipped to other platforms (e.g. built by -> Linux distribution maintainers) consider setting ARCH_OPT_FLAGS to "". +> If the library is built with the full Intel MKL, the user is expected to set +> the `MKL_THREADING_LAYER` environment variable to either `tbb` or `sequential` in order +> to force Intel MKL to use Intel TBB for parallelization or to be sequential, +> respectively. Without this setting, Intel MKL (RT library) tries +> to use OpenMP for parallelization by default. + +#### Build on Linux/macOS +Ensure that all software dependencies are in place and have at least the minimal +supported version. -Intel MKL-DNN includes unit tests implemented using the googletest framework. To validate your build, run: +Configure CMake and create a makefile: ``` - make test +mkdir -p build && cd build && cmake $CMAKE_OPTIONS .. ``` -Documentation is provided inline and can be generated in HTML format with Doxygen: +Build the application: ``` - make doc +make ``` -Documentation will reside in `build/reference/html` folder. +The build can be validated with the unit-test suite: -Finally, ``` - make install +ctest ``` -will place the header files, libraries and documentation in `/usr/local`. To change -the installation path, use the option `-DCMAKE_INSTALL_PREFIX=` when invoking CMake. -## Linking your application +The reference manual is provided inline and can also be generated in HTML format with Doxygen: -Intel MKL-DNN includes several header files providing C and C++ APIs for -the functionality and one or several dynamic libraries depending on how -Intel MKL-DNN was built. The minimal installation: +``` +make doc +``` -|File | Description -|:--- |:--- -|include/mkldnn.h | C header -|include/mkldnn.hpp | C++ header -|include/mkldnn_types.h | auxiliary C header -|lib/libmkldnn.so | Intel MKL-DNN dynamic library -|lib/libmkldnn.a | Intel MKL-DNN static library (if built with `MKLDNN_LIBRARY_TYPE=STATIC`) +Documentation will reside in the `build/reference/html` folder. +Finally: -#### Intel MKL-DNN with OpenMP +``` +make install +``` -If Intel MKL-DNN is built with Intel MKL small libraries the following extra -libraries would be installed: +will place the header files, libraries, and documentation in `/usr/local`. To change +the installation path, use the option `-DCMAKE_INSTALL_PREFIX=` when invoking CMake. -|File | Description -|:--- |:--- -|lib/libiomp5.so | Intel OpenMP* runtime library -|lib/libmklml_gnu.so | Intel MKL small library for GNU* OpenMP runtime -|lib/libmklml_intel.so | Intel MKL small library for Intel(R) OpenMP runtime +#### Build on Windows +Ensure that all software dependencies are in place and have at least the minimal +supported version. -Intel MKL-DNN uses OpenMP\* for parallelism and requires an OpenMP runtime -library to work. As different OpenMP runtimes may not be binary compatible -it's important to ensure that only one OpenMP runtime is used throughout the -application. Having more than one OpenMP runtime initialized may lead to -undefined behavior resulting in incorrect results or crashes. 
+> **NOTE**
+>
+> Building Intel MKL-DNN from a terminal requires using either the Intel Parallel Studio command prompt
+> or the Microsoft\* Visual Studio\* developer command prompt instead of the default Windows command prompt.
+>
+> The Intel(R) Parallel Studio command prompt is an item in the **Start** menu in the **Intel Parallel Studio
+> \** folder that has a Windows Command Prompt icon and a name like **Compiler 18.0 Update 5…**.
+>
+> The default for building the project for the Intel C++ Compiler is to use the Intel
+> Parallel Studio developer command prompt.
+
Configure CMake and create a Microsoft Visual Studio solution:
-Intel MKL-DNN library built with binary dependency will link against Intel OpenMP
-runtime included with Intel MKL small libraries package. Intel OpenMP runtime
-is binary compatible with GNU OpenMP and CLANG OpenMP runtimes and is
-recommended for the best performance results. Here are example linklines for
-GNU C++ compiler and Intel C++ compiler.
-```
- g++ -std=c++11 -I${MKLDNNROOT}/include -L${MKLDNNROOT}/lib simple_net.cpp -lmkldnn -lmklml_intel -liomp5
-```
```
- icpc -std=c++11 -qopenmp -I${MKLDNNROOT}/include -L${MKLDNNROOT}/lib simple_net.cpp -lmkldnn -lmklml_intel
+mkdir build & cd build && cmake -G "Visual Studio 15 2017 Win64" ..
```
-Using GNU compiler with `-fopenmp` and `-liomp5` options will link the
-application with both Intel and GNU OpenMP runtime libraries. This will lead
-to undefined behavior of the application.
-Intel MKL-DNN library built standalone will use OpenMP runtime supplied by
-the compiler, so as long as both the library and the application use the
-same compiler correct OpenMP runtime will be used.
+For the solution to use the Intel C++ Compiler:
+
```
+cmake -G "Visual Studio 15 2017 Win64" -T "Intel C++ Compiler 18.0" ..
```
+
+After you have built the initial project using CMake, you can then open the project with
+Microsoft Visual Studio and build from there. You can also use the msbuild command-line tool
+to build from the command line:
+
```
+msbuild "Intel(R) MKL-DNN.sln" /p:Configuration=Release [/t:rebuild] /m
```
+where the optional argument `/t:rebuild` rebuilds the project.
+
+The build can be validated with the unit-test suite:

```
+ctest
```

## Linking Your Application

### Linux/macOS
+Intel MKL-DNN includes several header files providing C and C++ APIs for
+the functionality and one or several dynamic libraries depending on how
+Intel MKL-DNN was built.
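The linklines in this section and in the Windows section below compile a `simple_net.cpp` that is not reproduced in this patch. A minimal hypothetical stand-in, assuming the v0.x C++ API, is enough to confirm that the headers and libraries listed in the tables below resolve correctly:

```cpp
// simple_net.cpp -- hypothetical stand-in, not part of this patch.
// Constructing a CPU engine is about the smallest call that exercises
// the library at both compile and link time (v0.x C++ API assumed).
#include <iostream>
#include "mkldnn.hpp"

int main() {
    mkldnn::engine eng(mkldnn::engine::cpu, 0); // CPU engine, index 0
    std::cout << "Intel MKL-DNN CPU engine created" << std::endl;
    return 0;
}
```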
-Performance limitations (mostly less parallelism than in case of OpenMP): -* Batch normalization -* Convolution backward by weights -* mkldnn_sgemm +**Linux** + +|File | Description +|:--- |:--- +|include/mkldnn.h | C header +|include/mkldnn.hpp | C++ header +|include/mkldnn_types.h | Auxiliary C header +|lib/libmkldnn.so | Intel MKL-DNN dynamic library +|lib/libmkldnn.a | Intel MKL-DNN static library (if built with `MKLDNN_LIBRARY_TYPE=STATIC`) +|lib/libiomp5.so | Intel OpenMP\* runtime library (if built with `MKLDNN_USE_MKL=ML`) +|lib/libmklml_gnu.so | Intel MKL small library for GNU OpenMP runtime (if built with `MKLDNN_USE_MKL=ML`) +|lib/libmklml_intel.so | Intel MKL small library for Intel OpenMP runtime (if built with `MKLDNN_USE_MKL=ML`) + +**macOS** + +|File | Description +|:--- |:--- +|include/mkldnn.h | C header +|include/mkldnn.hpp | C++ header +|include/mkldnn_types.h | Auxiliary C header +|lib/libmkldnn.dylib | Intel MKL-DNN dynamic library +|lib/libmkldnn.a | Intel MKL-DNN static library (if built with `MKLDNN_LIBRARY_TYPE=STATIC`) +|lib/libiomp5.dylib | Intel OpenMP\* runtime library (if built with `MKLDNN_USE_MKL=ML`) +|lib/libmklml_gnu.dylib | Intel MKL small library for GNU OpenMP runtime (if built with `MKLDNN_USE_MKL=ML`) +|lib/libmklml_intel.dylib | Intel MKL small library for Intel OpenMP runtime (if built with `MKLDNN_USE_MKL=ML`) + +Linkline examples below assume that Intel MKL-DNN is installed in the directory +defined in the MKLDNNROOT environment variable. + +``` +g++ -std=c++11 -I${MKLDNNROOT}/include -L${MKLDNNROOT}/lib simple_net.cpp -lmkldnn +clang -std=c++11 -I${MKLDNNROOT}/include -L${MKLDNNROOT}/lib simple_net.cpp -lmkldnn +icpc -std=c++11 -I${MKLDNNROOT}/include -L${MKLDNNROOT}/lib simple_net.cpp -lmkldnn +``` > **WARNING** > -> If the library is built with full Intel MKL user is expected to set -> `MKL_THREADING_LAYER` environment variable to either `tbb` or `sequential` -> to force Intel MKL to use Intel TBB for parallelization or to be sequential -> respectively. Without this setting Intel MKL (RT library) by default would -> try to use OpenMP for parallelization. +> Using the GNU compiler with the `-fopenmp` and `-liomp5` options will link the +> application with both the Intel and GNU OpenMP runtime libraries. This will lead +> to undefined behavior in the application. + +> **NOTE** +> +> Applications linked dynamically will resolve the dependencies at runtime. +> Make sure that the dependencies are available in the standard locations +> defined by the operating system, in the locatons listed in `LD_LIBRARY_PATH` (Linux), +> `DYLD_LIBRARY_PATH` (macOS) environment variables, or `rpath` mechanism. + +### Windows +Intel MKL-DNN includes several header files providing C and C++ APIs for +the functionality and one or several dynamic libraries depending on how +Intel MKL-DNN was built. 
+
+|File | Description
+|:--- |:---
+|bin\libmkldnn.dll | Intel MKL-DNN dynamic library
+|bin\libiomp5.dll | Intel OpenMP\* runtime library (if built with `MKLDNN_USE_MKL=ML`)
+|bin\libmklml.dll | Intel MKL small library (if built with `MKLDNN_USE_MKL=ML`)
+|include\mkldnn.h | C header
+|include\mkldnn.hpp | C++ header
+|include\mkldnn_types.h | Auxiliary C header
+|lib\libmkldnn.lib | Intel MKL-DNN import library
+|lib\libiomp5.lib | Intel OpenMP\* runtime import library (if built with `MKLDNN_USE_MKL=ML`)
+|lib\libmklml.lib | Intel MKL small library import library (if built with `MKLDNN_USE_MKL=ML`)
+
+To link the application from the command line, set up the `LIB` and `INCLUDE` environment variables to point to the locations of
+the Intel MKL-DNN headers and libraries. The linkline examples below assume that Intel MKL-DNN is installed in the directory
+defined in the MKLDNNROOT environment variable.
+
+```
+set INCLUDE=%MKLDNNROOT%\include;%INCLUDE%
+set LIB=%MKLDNNROOT%\lib;%LIB%
+icl /Qstd=c++11 /qopenmp simple_net.cpp mkldnn.lib
+cl simple_net.cpp mkldnn.lib
```
+
+Refer to [Microsoft Visual Studio documentation](https://docs.microsoft.com/en-us/cpp/build/walkthrough-creating-and-using-a-dynamic-link-library-cpp?view=vs-2017)
+on linking the application using MSVS solutions.
+
+> **NOTE**
+> Applications linked dynamically will resolve the dependencies at runtime.
+> Make sure that the dependencies are available in the standard locations
+> defined by the operating system or in the locations listed in the `PATH` environment variable.

--------

diff --git a/inference-engine/thirdparty/mkl-dnn/cmake/Doxygen.cmake b/inference-engine/thirdparty/mkl-dnn/cmake/Doxygen.cmake
index b6ed79a..d23c617 100644
--- a/inference-engine/thirdparty/mkl-dnn/cmake/Doxygen.cmake
+++ b/inference-engine/thirdparty/mkl-dnn/cmake/Doxygen.cmake
@@ -35,11 +35,11 @@ if(DOXYGEN_FOUND)
        ${CMAKE_CURRENT_BINARY_DIR}/header.html
        @ONLY)
    file(GLOB_RECURSE HEADERS
-        ${CMAKE_SOURCE_DIR}/include/*.h
-        ${CMAKE_SOURCE_DIR}/include/*.hpp
+        ${PROJECT_SOURCE_DIR}/include/*.h
+        ${PROJECT_SOURCE_DIR}/include/*.hpp
        )
    file(GLOB_RECURSE DOX
-        ${CMAKE_SOURCE_DIR}/doc/*
+        ${PROJECT_SOURCE_DIR}/doc/*
        )
    add_custom_command(
        OUTPUT ${DOXYGEN_STAMP_FILE}

diff --git a/inference-engine/thirdparty/mkl-dnn/cmake/MKL.cmake b/inference-engine/thirdparty/mkl-dnn/cmake/MKL.cmake
index bb02059..554bbd3 100644
--- a/inference-engine/thirdparty/mkl-dnn/cmake/MKL.cmake
+++ b/inference-engine/thirdparty/mkl-dnn/cmake/MKL.cmake
@@ -22,6 +22,8 @@ if(MKL_cmake_included)
    return()
endif()
set(MKL_cmake_included true)
+include("cmake/utils.cmake")
+include("cmake/options.cmake")

# set SKIP_THIS_MKL to true if given configuration is not supported
function(maybe_skip_this_mkl LIBNAME)
@@ -168,33 +170,18 @@ function(detect_mkl LIBNAME)
    string(FIND "${MKLLIBPATH}" ${CMAKE_CURRENT_SOURCE_DIR}/external __idx)
    if(${__idx} EQUAL 0)
        if(WIN32)
-            if(MINGW)
-                # We need to install *.dll into bin/ instead of lib/.
- install(PROGRAMS ${MKLIOMP5DLL} DESTINATION bin) - else() - install(PROGRAMS ${MKLIOMP5DLL} DESTINATION lib) - endif() - else() - install(PROGRAMS ${MKLIOMP5LIB} DESTINATION lib) - endif() + install(PROGRAMS ${MKLLIB} ${MKLIOMP5LIB} + DESTINATION ${CMAKE_INSTALL_LIBDIR}) endif() endif() if(WIN32) # Add paths to DLL to %PATH% on Windows get_filename_component(MKLDLLPATH "${MKLDLL}" PATH) - set(CTESTCONFIG_PATH "${CTESTCONFIG_PATH}\;${MKLDLLPATH}") + append_to_windows_path_list(CTESTCONFIG_PATH "${MKLDLLPATH}") set(CTESTCONFIG_PATH "${CTESTCONFIG_PATH}" PARENT_SCOPE) endif() @@ -203,6 +190,11 @@ function(detect_mkl LIBNAME) set(MKLINC ${MKLINC} PARENT_SCOPE) set(MKLLIB "${MKLLIB}" PARENT_SCOPE) set(MKLDLL "${MKLDLL}" PARENT_SCOPE) + if(LIBNAME MATCHES "mklml") + set(MKLDNN_USES_MKL "MKLML:SHARED" PARENT_SCOPE) + else() + set(MKLDNN_USES_MKL "FULL:SHARED" PARENT_SCOPE) + endif() set(MKLIOMP5LIB "${MKLIOMP5LIB}" PARENT_SCOPE) set(MKLIOMP5DLL "${MKLIOMP5DLL}" PARENT_SCOPE) @@ -232,20 +224,25 @@ function(set_static_mkl_libs libpath) set(MKLLIB "${MKLLIB}" PARENT_SCOPE) endfunction() +set(MKLDNN_USES_MKL "") detect_mkl("mklml_intel") detect_mkl("mklml_gnu") detect_mkl("mklml") detect_mkl("mkl_rt") -if (MKLDNN_USE_MKL STREQUAL "FULL:STATIC" AND HAVE_MKL) - set(MKLDLL "") - get_filename_component(MKLLIBPATH "${MKLLIB}" PATH) - set_static_mkl_libs(${MKLLIBPATH}) -endif () if(HAVE_MKL) + if (MKLDNN_USE_MKL STREQUAL "FULL:STATIC") + set(MKLDLL "") + get_filename_component(MKLLIBPATH "${MKLLIB}" PATH) + set_static_mkl_libs(${MKLLIBPATH}) + list(APPEND EXTRA_STATIC_LIBS ${MKLLIB}) + set(MKLDNN_USES_MKL "FULL:STATIC") + else() + list(APPEND EXTRA_SHARED_LIBS ${MKLLIB}) + endif() + add_definitions(-DUSE_MKL -DUSE_CBLAS) include_directories(AFTER ${MKLINC}) - list(APPEND mkldnn_LINKER_LIBS ${MKLLIB}) set(MSG "Intel(R) MKL:") message(STATUS "${MSG} include ${MKLINC}") diff --git a/inference-engine/thirdparty/mkl-dnn/cmake/OpenMP.cmake b/inference-engine/thirdparty/mkl-dnn/cmake/OpenMP.cmake index f9c3620..086c9c2 100644 --- a/inference-engine/thirdparty/mkl-dnn/cmake/OpenMP.cmake +++ b/inference-engine/thirdparty/mkl-dnn/cmake/OpenMP.cmake @@ -21,10 +21,11 @@ if(OpenMP_cmake_included) return() endif() set(OpenMP_cmake_included true) - include("cmake/Threading.cmake") include("cmake/MKL.cmake") +set(MKLDNN_USES_INTEL_OPENMP FALSE) + if (APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang") # OSX Clang doesn't have OpenMP by default. # But we still want to build the library. @@ -33,13 +34,16 @@ else() set(_omp_severity "FATAL_ERROR") endif() - macro(forbid_link_compiler_omp_rt) if (NOT WIN32) - set_if(OpenMP_C_FOUND CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OpenMP_C_FLAGS}) - set_if(OpenMP_CXX_FOUND CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OpenMP_CXX_FLAGS}) + set_if(OpenMP_C_FOUND + CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS + "${OpenMP_C_FLAGS}") + set_if(OpenMP_CXX_FOUND + CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS + "${OpenMP_CXX_FLAGS}") if (NOT APPLE) - set (CMAKE_SHARED_LINKER_FLAGS "-Wl,--as-needed") + append(CMAKE_SHARED_LINKER_FLAGS "-Wl,--as-needed") endif() endif() endmacro() @@ -47,30 +51,33 @@ endmacro() macro(use_intel_omp_rt) # fast return if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel") + set(MKLDNN_USES_INTEL_OPENMP TRUE) return() endif() # Do not link with compiler-native OpenMP library if Intel MKL is present. # Rationale: Intel MKL comes with Intel OpenMP library which is compatible # with all libraries shipped with compilers that Intel MKL-DNN supports. 
- if(HAVE_MKL) + get_filename_component(MKLIOMP5LIB "${MKLIOMP5LIB}" PATH) + find_library(IOMP5LIB + NAMES "iomp5" "iomp5md" "libiomp5" "libiomp5md" + HINTS ${MKLIOMP5LIB} ) + if(IOMP5LIB) forbid_link_compiler_omp_rt() - if (UNIX AND NOT APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - # For some reasons Clang ignores `-fopenmp=libiomp5` switch and - # links against libomp.so anyways. - # The workaround is to set the full path to libiomp5.so - add_library(libiomp5 SHARED IMPORTED) - set_property(TARGET libiomp5 PROPERTY IMPORTED_LOCATION "${MKLIOMP5LIB}") - list(APPEND EXTRA_LIBS libiomp5) - else() - list(APPEND EXTRA_LIBS ${MKLIOMP5LIB}) + if (WIN32) + get_filename_component(MKLIOMP5DLL "${MKLIOMP5DLL}" PATH) + find_file(IOMP5DLL + NAMES "libiomp5.dll" "libiomp5md.dll" + HINTS ${MKLIOMP5DLL}) endif() + list(APPEND EXTRA_SHARED_LIBS ${IOMP5LIB}) else() if (MKLDNN_THREADING STREQUAL "OMP:INTEL") message(${_omp_severity} "Intel OpenMP runtime could not be found. " "Please either use OpenMP runtime that comes with the compiler " "(via -DMKLDNN_THREADING={OMP,OMP:COMP}), or " - "install Intel MKL / Intel MKL-ML (e.g. scripts/prepare_mkl.sh)") + "explicitly provide the path to libiomp with the " + "-DCMAKE_LIBRARY_PATH option") endif() endif() endmacro() @@ -83,7 +90,7 @@ elseif(MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang") append(CMAKE_C_FLAGS "-Xclang -fopenmp") append(CMAKE_CXX_FLAGS "-Xclang -fopenmp") set(OpenMP_CXX_FOUND true) - list(APPEND EXTRA_LIBS ${MKLIOMP5LIB}) + list(APPEND EXTRA_SHARED_LIBS ${IOMP5LIB}) else() find_package(OpenMP) #newer version for findOpenMP (>= v. 3.9) @@ -96,24 +103,29 @@ else() set(OpenMP_C_FOUND true) set(OpenMP_CXX_FOUND true) endif() - append_if(OpenMP_C_FOUND CMAKE_C_FLAGS "${OpenMP_C_FLAGS}") - append_if(OpenMP_CXX_FOUND CMAKE_CXX_FLAGS "${OpenMP_CXX_FLAGS}") + append_if(OpenMP_C_FOUND CMAKE_SRC_CCXX_FLAGS "${OpenMP_C_FLAGS}") endif() if (MKLDNN_THREADING MATCHES "OMP") if (OpenMP_CXX_FOUND) set_threading("OMP") + append(CMAKE_TEST_CCXX_FLAGS "${OpenMP_CXX_FLAGS}") + append(CMAKE_EXAMPLE_CCXX_FLAGS "${OpenMP_CXX_FLAGS}") else() message(${_omp_severity} "OpenMP library could not be found.
" "Proceeding might lead to highly sub-optimal performance.") endif() if (MKLDNN_THREADING STREQUAL "OMP:COMP") - set(MKLIOMP5LIB "") - set(MKLIOMP5DLL "") + set(IOMP5LIB "") + set(IOMP5DLL "") else() use_intel_omp_rt() endif() + + if(MKLIOMP5LIB) + set(MKLDNN_USES_INTEL_OPENMP TRUE) + endif() else() # Compilation happens with OpenMP to enable `#pragma omp simd` # but during linkage OpenMP dependency should be avoided @@ -121,9 +133,9 @@ else() return() endif() -set_ternary(_omp_lib_msg MKLIOMP5LIB "${MKLIOMP5LIB}" "provided by compiler") +set_ternary(_omp_lib_msg IOMP5LIB "${IOMP5LIB}" "provided by compiler") message(STATUS "OpenMP lib: ${_omp_lib_msg}") if(WIN32) - set_ternary(_omp_dll_msg MKLIOMP5DLL "${MKLIOMP5LIB}" "provided by compiler") + set_ternary(_omp_dll_msg IOMP5DLL "${IOMP5LIB}" "provided by compiler") message(STATUS "OpenMP dll: ${_omp_dll_msg}") endif() diff --git a/inference-engine/thirdparty/mkl-dnn/cmake/SDL.cmake b/inference-engine/thirdparty/mkl-dnn/cmake/SDL.cmake index b494a0f..c4e0ab4 100644 --- a/inference-engine/thirdparty/mkl-dnn/cmake/SDL.cmake +++ b/inference-engine/thirdparty/mkl-dnn/cmake/SDL.cmake @@ -21,16 +21,17 @@ if(SDL_cmake_included) return() endif() set(SDL_cmake_included true) +include("cmake/utils.cmake") if(UNIX) set(CMAKE_CCXX_FLAGS "-fPIC -Wformat -Wformat-security") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -D_FORTIFY_SOURCE=2") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -D_FORTIFY_SOURCE=2") + append(CMAKE_CXX_FLAGS_RELEASE "-D_FORTIFY_SOURCE=2") + append(CMAKE_C_FLAGS_RELEASE "-D_FORTIFY_SOURCE=2") if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9) - set(CMAKE_CCXX_FLAGS "${CMAKE_CCXX_FLAGS} -fstack-protector-all") + append(CMAKE_CCXX_FLAGS "-fstack-protector-all") else() - set(CMAKE_CCXX_FLAGS "${CMAKE_CCXX_FLAGS} -fstack-protector-strong") + append(CMAKE_CCXX_FLAGS "-fstack-protector-strong") endif() # GCC might be very paranoid for partial structure initialization, e.g. @@ -39,21 +40,21 @@ if(UNIX) # only. To prevent warnings on users' side who use the library and turn # this warning on, let's use it too. 
Applicable for the library sources # and interfaces only (tests currently rely on that fact heavily) - set(CMAKE_SRC_CCXX_FLAGS "${CMAKE_SRC_CCXX_FLAGS} -Wmissing-field-initializers") - set(CMAKE_EXAMPLE_CCXX_FLAGS "${CMAKE_EXAMPLE_CCXX_FLAGS} -Wmissing-field-initializers") + append(CMAKE_SRC_CCXX_FLAGS "-Wmissing-field-initializers") + append(CMAKE_EXAMPLE_CCXX_FLAGS "-Wmissing-field-initializers") elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CMAKE_CCXX_FLAGS "${CMAKE_CCXX_FLAGS} -fstack-protector-all") + append(CMAKE_CCXX_FLAGS "-fstack-protector-all") elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector") + append(CMAKE_CXX_FLAGS "-fstack-protector") endif() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_CCXX_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CCXX_FLAGS}") + append(CMAKE_C_FLAGS "${CMAKE_CCXX_FLAGS}") + append(CMAKE_CXX_FLAGS "${CMAKE_CCXX_FLAGS}") if(APPLE) - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-bind_at_load") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-bind_at_load") + append(CMAKE_SHARED_LINKER_FLAGS "-Wl,-bind_at_load") + append(CMAKE_EXE_LINKER_FLAGS "-Wl,-bind_at_load") else() - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -pie") - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now") + append(CMAKE_EXE_LINKER_FLAGS "-pie") + append(CMAKE_SHARED_LINKER_FLAGS "-Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now") + append(CMAKE_EXE_LINKER_FLAGS "-Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now") endif() endif() diff --git a/inference-engine/thirdparty/mkl-dnn/cmake/TBB.cmake b/inference-engine/thirdparty/mkl-dnn/cmake/TBB.cmake index fb0cdc1..fe24e09 100644 --- a/inference-engine/thirdparty/mkl-dnn/cmake/TBB.cmake +++ b/inference-engine/thirdparty/mkl-dnn/cmake/TBB.cmake @@ -21,6 +21,7 @@ if(TBB_cmake_included) return() endif() set(TBB_cmake_included true) +include("cmake/Threading.cmake") if(NOT MKLDNN_THREADING STREQUAL "TBB") return() @@ -43,6 +44,6 @@ elseif(UNIX) endif() set_threading("TBB") -list(APPEND mkldnn_LINKER_LIBS ${TBB_IMPORTED_TARGETS}) +list(APPEND EXTRA_SHARED_LIBS ${TBB_IMPORTED_TARGETS}) message(STATUS "Intel(R) TBB: ${TBBROOT}") diff --git a/inference-engine/thirdparty/mkl-dnn/cmake/config.cmake.in b/inference-engine/thirdparty/mkl-dnn/cmake/config.cmake.in new file mode 100644 index 0000000..53b7032 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/cmake/config.cmake.in @@ -0,0 +1,6 @@ +@PACKAGE_INIT@ +include("${CMAKE_CURRENT_LIST_DIR}/@LIB_EXPORT_NAME@.cmake") +set(MKLDNN_THREADING "@MKLDNN_THREADING@") +set(MKLDNN_USES_INTEL_OPENMP @MKLDNN_USES_INTEL_OPENMP@) +set(MKLDNN_USES_MKL "@MKLDNN_USES_MKL@") +check_required_components("@LIB_NAME@") diff --git a/inference-engine/thirdparty/mkl-dnn/cmake/options.cmake b/inference-engine/thirdparty/mkl-dnn/cmake/options.cmake index e6ff249..2f76970 100644 --- a/inference-engine/thirdparty/mkl-dnn/cmake/options.cmake +++ b/inference-engine/thirdparty/mkl-dnn/cmake/options.cmake @@ -128,6 +128,15 @@ set(VTUNEROOT "" CACHE STRING # Miscellaneous # ============= +option(BENCHDNN_USE_RDPMC + "enables rdpmc counter to report precise cpu frequency in benchdnn.
+ CAUTION: may not work on all cpus (hence disabled by default)" + OFF) # disabled by default + +# ============= +# Developer flags +# ============= + set(MKLDNN_USE_CLANG_SANITIZER "" CACHE STRING "instructs build system to use a Clang sanitizer. Possible values: Address: enables MemorySanitizer @@ -136,8 +145,7 @@ set(MKLDNN_USE_CLANG_SANITIZER "" CACHE STRING Undefined: enables UndefinedBehaviourSanitizer This feature is experimental and is only available on Linux.") - -option(BENCHDNN_USE_RDPMC - "enables rdpms counter to report precise cpu frequency in benchdnn. - CAUTION: may not work on all cpus (hence disabled by default)" - OFF) # disabled by default +option(MKLDNN_PRODUCT_BUILD_MODE + "Enables/disables product build mode. For example, + setting MKLDNN_PRODUCT_BUILD_MODE=OFF makes warnings non-fatal" + ON) diff --git a/inference-engine/thirdparty/mkl-dnn/cmake/platform.cmake b/inference-engine/thirdparty/mkl-dnn/cmake/platform.cmake index 3597970..a541215 100644 --- a/inference-engine/thirdparty/mkl-dnn/cmake/platform.cmake +++ b/inference-engine/thirdparty/mkl-dnn/cmake/platform.cmake @@ -22,6 +22,8 @@ if(platform_cmake_included) endif() set(platform_cmake_included true) +include("cmake/utils.cmake") + add_definitions(-DMKLDNN_DLL -DMKLDNN_DLL_EXPORTS) # UNIT8_MAX-like macros are a part of the C99 standard and not a part of the @@ -50,6 +52,8 @@ if(MSVC) set(DEF_ARCH_OPT_FLAGS "-QxHOST") # disable: loop was not vectorized with "simd" append(CMAKE_CCXX_NOWARN_FLAGS "-Qdiag-disable:15552") + # disable: unknown pragma + append(CMAKE_CCXX_NOWARN_FLAGS "-Qdiag-disable:3180") endif() if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") # Clang cannot vectorize some loops with #pragma omp simd and gets @@ -58,7 +62,8 @@ if(MSVC) append(CMAKE_CCXX_FLAGS "-Wno-pass-failed") endif() elseif(UNIX OR MINGW) - append(CMAKE_CCXX_FLAGS "-Wall -Werror -Wno-unknown-pragmas") + append(CMAKE_CCXX_FLAGS "-Wall -Wno-unknown-pragmas") + append_if_product(CMAKE_CCXX_FLAGS "-Werror") append(CMAKE_CCXX_FLAGS "-fvisibility=internal") append(CMAKE_C_FLAGS "-std=c99") append(CMAKE_CXX_FLAGS "-std=c++11 -fvisibility-inlines-hidden") @@ -125,11 +130,6 @@ elseif(UNIX OR MINGW) endif() endif() -if(WIN32) - string(REPLACE ";" "\;" ENV_PATH "$ENV{PATH}") - set(CTESTCONFIG_PATH "${CTESTCONFIG_PATH}\;${MKLDLLPATH}\;${ENV_PATH}") -endif() - if(UNIX OR MINGW) if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") # Link Intel libraries statically (except for iomp5) diff --git a/inference-engine/thirdparty/mkl-dnn/cmake/profiling.cmake b/inference-engine/thirdparty/mkl-dnn/cmake/profiling.cmake index c531d84..846135c 100644 --- a/inference-engine/thirdparty/mkl-dnn/cmake/profiling.cmake +++ b/inference-engine/thirdparty/mkl-dnn/cmake/profiling.cmake @@ -23,6 +23,6 @@ if("${VTUNEROOT}" STREQUAL "") message(STATUS "VTune profiling environment is unset") else() set_ternary(JITPROFLIB MSVC "jitprofiling.lib" "libjitprofiling.a") - list(APPEND EXTRA_LIBS "${VTUNEROOT}/lib64/${JITPROFLIB}") + list(APPEND EXTRA_STATIC_LIBS "${VTUNEROOT}/lib64/${JITPROFLIB}") message(STATUS "VTune profiling environment is set") endif() diff --git a/inference-engine/thirdparty/mkl-dnn/cmake/template.vcxproj.user b/inference-engine/thirdparty/mkl-dnn/cmake/template.vcxproj.user new file mode 100644 index 0000000..68b6c86 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/cmake/template.vcxproj.user @@ -0,0 +1,7 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup> + <LocalDebuggerEnvironment>PATH=@CTESTCONFIG_PATH@;$(PATH)</LocalDebuggerEnvironment> + <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor> + </PropertyGroup> +</Project> diff --git a/inference-engine/thirdparty/mkl-dnn/cmake/utils.cmake
b/inference-engine/thirdparty/mkl-dnn/cmake/utils.cmake index d8680b7..867ec08 100644 --- a/inference-engine/thirdparty/mkl-dnn/cmake/utils.cmake +++ b/inference-engine/thirdparty/mkl-dnn/cmake/utils.cmake @@ -21,6 +21,17 @@ if(utils_cmake_included) return() endif() set(utils_cmake_included true) +include("cmake/options.cmake") + +# Common configuration for tests / test cases on Windows +function(maybe_configure_windows_test name kind) + if(WIN32 OR MINGW) + string(REPLACE ";" "\;" PATH "${CTESTCONFIG_PATH};$ENV{PATH}") + set_property(${kind} ${name} PROPERTY ENVIRONMENT "PATH=${PATH}") + configure_file(${PROJECT_SOURCE_DIR}/cmake/template.vcxproj.user + ${name}.vcxproj.user @ONLY) + endif() +endfunction() # Register new executable/test # name -- name of the executable # arg4 -- (optional) list of extra library dependencies function(register_exe name srcs test) add_executable(${name} ${srcs}) - target_link_libraries(${name} ${LIB_NAME} ${EXTRA_LIBS} ${ARGV3}) + target_link_libraries(${name} ${LIB_NAME} ${EXTRA_SHARED_LIBS} ${ARGV3}) if("${test}" STREQUAL "test") add_test(${name} ${name}) - if(WIN32 OR MINGW) - set_property(TEST ${name} PROPERTY ENVIRONMENT "PATH=${CTESTCONFIG_PATH};$ENV{PATH}") - configure_file(${CMAKE_SOURCE_DIR}/config_template.vcxproj.user ${name}.vcxproj.user @ONLY) - endif() + maybe_configure_windows_test(${name} TEST) endif() endfunction() @@ -45,6 +53,20 @@ macro(append var value) set(${var} "${${var}} ${value}") endmacro() +# Append to a variable if building a product build (as opposed to a developer +# build that is detected via the MKLDNN_PRODUCT_BUILD_MODE option) +macro(append_if_product var value) + if(MKLDNN_PRODUCT_BUILD_MODE) + append(${var} "${value}") + endif() +endmacro() + +if(MKLDNN_PRODUCT_BUILD_MODE) + message(STATUS "This is a product build") +else() + message(WARNING "This is a developer build") +endif() + # Set variable depending on condition: # var = cond ? val_if_true : val_if_false macro(set_ternary var condition val_if_true val_if_false) @@ -70,3 +92,32 @@ macro(append_if condition var value) append(${var} "${value}") endif() endmacro() + +# Append a path to path_list variable (Windows-only version) +macro(append_to_windows_path_list path_list path) + file(TO_NATIVE_PATH "${path}" append_to_windows_path_list_tmp__) + if(${path_list}) + set(${path_list} + "${${path_list}};${append_to_windows_path_list_tmp__}") + else() + set(${path_list} + "${append_to_windows_path_list_tmp__}") + endif() +endmacro() + +function(target_link_libraries_private target list) + # Foreach is required for compatibility with CMake 2.8.11 + foreach(lib ${list}) + target_link_libraries(${target} LINK_PRIVATE + "$<BUILD_INTERFACE:${lib}>") + endforeach(lib) +endfunction() + +function(target_link_libraries_public target list) + # Foreach is required for compatibility with CMake 2.8.11 + foreach(lib ${list}) + get_filename_component(base "${lib}" NAME) + target_link_libraries(${target} LINK_PUBLIC + "$<BUILD_INTERFACE:${lib}>$<INSTALL_INTERFACE:${base}>") + endforeach(lib) +endfunction() diff --git a/inference-engine/thirdparty/mkl-dnn/cmake/version.cmake b/inference-engine/thirdparty/mkl-dnn/cmake/version.cmake new file mode 100644 index 0000000..4591880 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/cmake/version.cmake @@ -0,0 +1,46 @@ +#=============================================================================== +# Copyright 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#=============================================================================== + +# Control generating version file +#=============================================================================== + +if(version_cmake_included) + return() +endif() +set(version_cmake_included true) + +string(REPLACE "." ";" VERSION_LIST ${PROJECT_VERSION}) +list(GET VERSION_LIST 0 MKLDNN_VERSION_MAJOR) +list(GET VERSION_LIST 1 MKLDNN_VERSION_MINOR) +list(GET VERSION_LIST 2 MKLDNN_VERSION_PATCH) + +find_package(Git) +if (GIT_FOUND) + execute_process(COMMAND ${GIT_EXECUTABLE} log -1 --format=%H + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + RESULT_VARIABLE RESULT + OUTPUT_VARIABLE MKLDNN_VERSION_HASH + OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() + +if(NOT GIT_FOUND OR RESULT) + set(MKLDNN_VERSION_HASH "N/A") +endif() + +configure_file( + "${PROJECT_SOURCE_DIR}/include/mkldnn_version.h.in" + "${PROJECT_BINARY_DIR}/include/mkldnn_version.h" +) diff --git a/inference-engine/thirdparty/mkl-dnn/doc/Doxyfile.in b/inference-engine/thirdparty/mkl-dnn/doc/Doxyfile.in index d1c466c..8c38fd9 100644 --- a/inference-engine/thirdparty/mkl-dnn/doc/Doxyfile.in +++ b/inference-engine/thirdparty/mkl-dnn/doc/Doxyfile.in @@ -158,7 +158,7 @@ FULL_PATH_NAMES = YES # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. -STRIP_FROM_PATH = @CMAKE_SOURCE_DIR@ +STRIP_FROM_PATH = @PROJECT_SOURCE_DIR@ # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which diff --git a/inference-engine/thirdparty/mkl-dnn/doc/ex_simplenet.md b/inference-engine/thirdparty/mkl-dnn/doc/ex_simplenet.md index ef60775..ced53d8 100644 --- a/inference-engine/thirdparty/mkl-dnn/doc/ex_simplenet.md +++ b/inference-engine/thirdparty/mkl-dnn/doc/ex_simplenet.md @@ -59,7 +59,7 @@ auto conv1_src_md = memory::desc({conv1_src_tz}, /* similarly create conv_weights_md and conv_dst_md in format::any */ ~~~ -6. Create a convolution descriptor by specifying the algorithm, propagation +6. Create a convolution descriptor by specifying the algorithm (see [convolution algorithms](@ref winograd_convolution)), propagation kind, shapes of input, weights, bias, output, convolution strides, padding, and kind of padding. Propagation kind is set to *forward_inference* -optimized for inference execution and omits computations that are only necessary diff --git a/inference-engine/thirdparty/mkl-dnn/doc/mainpage.md b/inference-engine/thirdparty/mkl-dnn/doc/mainpage.md index 2a0c7a8..fdec549 100644 --- a/inference-engine/thirdparty/mkl-dnn/doc/mainpage.md +++ b/inference-engine/thirdparty/mkl-dnn/doc/mainpage.md @@ -26,9 +26,9 @@ The table below summarizes the list of supported functions and their variants.
| | 3D direct deconvolution | x | x | | | Inner Product | 2D inner product | x | x | x | | | 3D inner product | x | x | | -| RNN (experimental)| Vanilla RNN cell | x | x | | -| | LSTM cell | x | x | | -| | GRU cell | x | x | | +| RNN | Vanilla RNN | x | x | | +| | LSTM | x | x | x | +| | GRU | x | x | | | Pooling | 2D maximum pooling | x | x | x | | | 2D average pooling | x | x | x | | | 3D maximum pooling | x | x | | @@ -36,19 +36,22 @@ The table below summarizes the list of supported functions and their variants. | Normalization | 2D LRN (within channel) | x | x | | | | 2D LRN (across channels) | x | x | | | | 2D batch normalization | x | x | | -| | 3D Batch Normalization | x | x | | -| Activation | ReLU | x | x | x | -| | Tanh | | x | | -| | ELU | | x | | -| | Bounded ReLU | | x | | -| | Soft ReLU | | x | | -| | Logistic regression | | x | | +| | 3D batch normalization | x | x | | +| Activation and | ReLU | x | x | x | +| elementwise | Tanh | x | x | | +| functions | ELU | x | x | | +| | Square | x | x | | +| | Sqrt | x | x | | +| | Abs | x | x | | +| | Linear | x | x | | +| | Bounded ReLU | x | x | | +| | Soft ReLU | x | x | | +| | Logistic | x | x | | | | Softmax | x | x | | | Data manipulation | Reorder/quantization | x | x | x | | | Sum | x | x | x | | | Concat | x | x | x | -| | Elementwise operations | | x | | -| | Channel Shuffle | x | x | x | +| | Shuffle | x | x | x | ## Programming Model @@ -140,7 +143,7 @@ The following examples are available in the /examples directory and provide more - C: simple_training.c - C++: simple_training_net.cpp -* Creation of forward propagation of GNMT topology (experimental support) +* Creation of forward propagation of GNMT topology - C++: simple_rnn.cpp * Training RNN with sequences of variable length @@ -152,6 +155,7 @@ The following examples are available in the /examples directory and provide more format `any` for input or output. The memory format chosen is based on different circumstances such as hardware and convolutional parameters. +* Convolution could be executed using the [Winograd algorithm](@ref winograd_convolution) for a significant performance boost. * Operation primitives (such as ReLU, LRN, or pooling) following convolution or inner product, should have input in the same memory format as the convolution or inner-product. Reordering can be an expensive @@ -162,6 +166,7 @@ The following examples are available in the /examples directory and provide more might need workspace memory for storing results of intermediate operations that help with backward propagation. + The following link provides a guide to MKLDNN verbose mode for profiling execution: * [Performance profiling](@ref perf_profile) diff --git a/inference-engine/thirdparty/mkl-dnn/doc/perf_profile.md b/inference-engine/thirdparty/mkl-dnn/doc/perf_profile.md index 7c36ffe..d0c28bf 100644 --- a/inference-engine/thirdparty/mkl-dnn/doc/perf_profile.md +++ b/inference-engine/thirdparty/mkl-dnn/doc/perf_profile.md @@ -90,39 +90,42 @@ To dump JIT-kernels set MKLDNN_JIT_DUMP environment variable to `1`. 
For example ``` This will produce the following output files: - mkldnn_dump_jit_avx2_conv_fwd_kernel_f32.0.bin - mkldnn_dump_jit_uni_lrn_fwd_kernel_f32.2.bin + + mkldnn_dump_jit_uni_reorder_kernel_f32.0.bin + mkldnn_dump_jit_avx2_conv_fwd_kernel_f32.1.bin + mkldnn_dump_jit_uni_relu_kernel_f32.2.bin mkldnn_dump_jit_uni_lrn_fwd_kernel_f32.3.bin mkldnn_dump_jit_uni_lrn_fwd_kernel_f32.4.bin - mkldnn_dump_jit_uni_pool_kernel_f32.5.bin - mkldnn_dump_jit_uni_relu_kernel_f32.1.bin - + mkldnn_dump_jit_uni_lrn_fwd_kernel_f32.5.bin + mkldnn_dump_jit_uni_reorder_kernel_f32.6.bin + mkldnn_dump_jit_uni_pool_kernel_f32.7.bin + To open these files any disassembler can be used. For example: ``` - $ xed -ir mkldnn_dump_jit_avx2_conv_fwd_kernel_f32.0.bin - XDIS 0: PUSH BASE 53 push ebx - XDIS 1: PUSH BASE 55 push ebp - XDIS 2: BINARY BASE 41 inc ecx - XDIS 3: PUSH BASE 54 push esp - XDIS 4: BINARY BASE 41 inc ecx - XDIS 5: PUSH BASE 55 push ebp - XDIS 6: BINARY BASE 41 inc ecx - XDIS 7: PUSH BASE 56 push esi - XDIS 8: BINARY BASE 41 inc ecx - XDIS 9: PUSH BASE 57 push edi - XDIS a: BINARY BASE 48 dec eax - XDIS b: DATAXFER BASE 8B07 mov eax, dword ptr [edi] - XDIS d: BINARY BASE 48 dec eax - XDIS e: DATAXFER BASE 8B7708 mov esi, dword ptr [edi+0x8] - XDIS 11: BINARY BASE 48 dec eax - XDIS 12: DATAXFER BASE 8B5710 mov edx, dword ptr [edi+0x10] - XDIS 15: BINARY BASE 48 dec eax - XDIS 16: DATAXFER BASE 8B5F18 mov ebx, dword ptr [edi+0x18] - XDIS 19: BINARY BASE 48 dec eax - XDIS 1a: DATAXFER BASE 8B4F40 mov ecx, dword ptr [edi+0x40] - XDIS 1d: BINARY BASE 44 inc esp - XDIS 1e: DATAXFER BASE 8B6F70 mov ebp, dword ptr [edi+0x70] + $ xed -64 -ir mkldnn_dump_jit_avx2_conv_fwd_kernel_f32.1.bin + XDIS 0: PUSH BASE 53 push rbx + XDIS 1: PUSH BASE 55 push rbp + XDIS 2: PUSH BASE 4154 push r12 + XDIS 4: PUSH BASE 4155 push r13 + XDIS 6: PUSH BASE 4156 push r14 + XDIS 8: PUSH BASE 4157 push r15 + XDIS a: DATAXFER BASE 488B07 mov rax, qword ptr [rdi] + XDIS d: DATAXFER BASE 488B7708 mov rsi, qword ptr [rdi+0x8] + XDIS 11: DATAXFER BASE 488B5710 mov rdx, qword ptr [rdi+0x10] + XDIS 15: DATAXFER BASE 488B5F18 mov rbx, qword ptr [rdi+0x18] + XDIS 19: DATAXFER BASE 488B8F98000000 mov rcx, qword ptr [rdi+0x98] + XDIS 20: DATAXFER BASE 448BAF00010000 mov r13d, dword ptr [rdi+0x100] + XDIS 27: DATAXFER BASE 4C8BB7D0000000 mov r14, qword ptr [rdi+0xd0] + XDIS 2e: BINARY BASE 4983FE04 cmp r14, 0x4 + XDIS 32: COND_BR BASE 0F85EF030000 jnz 0x427 + XDIS 38: LOGICAL BASE 4D31DB xor r11, r11 + XDIS 3b: LOGICAL BASE 41F7C510000000 test r13d, 0x10 + XDIS 42: COND_BR BASE 0F8558000000 jnz 0xa0 + XDIS 48: DATAXFER AVX C5FC1006 vmovups ymm0, ymmword ptr [rsi] + XDIS 4c: DATAXFER AVX C5FC104E20 vmovups ymm1, ymmword ptr [rsi+0x20] + XDIS 51: DATAXFER AVX C5FC105640 vmovups ymm2, ymmword ptr [rsi+0x40] + XDIS 56: DATAXFER AVX C5FC109E207A0100 vmovups ymm3, ymmword ptr [rsi+0x17a20] ... ``` diff --git a/inference-engine/thirdparty/mkl-dnn/doc/winograd_convolution.md b/inference-engine/thirdparty/mkl-dnn/doc/winograd_convolution.md new file mode 100644 index 0000000..bbe3ebe --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/doc/winograd_convolution.md @@ -0,0 +1,93 @@ +Winograd Convolution {#winograd_convolution} +========================================== +## Why use a different convolution algorithm? +Executing convolution using the **Winograd algorithm** often gives a significant performance boost compared with using the **Direct algorithm**. 
+Details about the algorithm can be found in [Fast Algorithms for Convolutional Neural Networks by A. Lavin and S. Gray](https://arxiv.org/abs/1509.09308). + +## Winograd in Intel(R) MKL-DNN +Intel(R) MKL-DNN supports the **Winograd algorithm** for convolutions with the following sizes: +* 2D convolutions (i.e. spatial depth `d=1`) +* kernel sizes `kh=3, kw=3` +* strides `sh=sw=1` + +* **Inference** - Based on convolution sizes, MKLDNN chooses between two different tile sizes F(2x2, 3x3) or F(4x4, 3x3) (refer to [Winograd paper](https://arxiv.org/abs/1509.09308) for more information on tile sizes). +* **Training** - Uses F(4x4, 3x3) Winograd. + +Create a Winograd convolution by simply creating a convolution descriptor (step 6 in [SimpleNet Example](@ref ex_simplenet)) with the right algorithm. +The rest of the steps for creating the convolution are exactly the same as shown in the example. +~~~cpp +auto conv1_desc = convolution_forward::desc( + prop_kind::forward_inference, algorithm::convolution_winograd, + conv1_src_md, conv1_weights_md, conv1_bias_md, conv1_dst_md, + conv1_strides, conv1_padding, padding_kind::zero); +~~~ + +## Auto dispatching of convolution algorithm +Instead of choosing a convolution algorithm for each and every convolution in a topology, a user could simply ask MKLDNN to make the choice. + +Creating a convolution by using `convolution_auto` allows MKLDNN to dispatch the *best* algorithm. +~~~cpp +auto conv1_desc = convolution_forward::desc( + prop_kind::forward_inference, algorithm::convolution_auto, + conv1_src_md, conv1_weights_md, conv1_bias_md, conv1_dst_md, + conv1_strides, conv1_padding, padding_kind::zero); +~~~ + +MKLDNN chooses the algorithm that will potentially give the *best performance* based on +* convolution dimensions +* number of logical processors available. (For auto-dispatching to work as intended, + use the same thread affinity settings when creating the convolution as when executing the convolution.) +*The relationship between convolution sizes and the best performing algorithm is determined empirically from performance observations.* + +### Example using benchdnn +The following examples use [benchdnn](https://github.com/intel/mkl-dnn/tree/master/tests/benchdnn) to illustrate the performance benefits of using `convolution_auto`. + +On a 2-socket Intel Xeon 8180 processor with 28 cores/socket and HT off: +~~~sh +OMP_NUM_THREADS=56 KMP_AFFINITY=granularity=fine,compact numactl -l tests/benchdnn/benchdnn --mode=p --conv -v5 --alg=auto --dir=BWD_WB mb112ic64ih300oc64oh300kh3ph1n"ssd_300_voc0712:conv1_2" + +mkldnn implementation: jit_wino_4x3:avx512_core +... +mkldnn_verbose,exec,convolution,jit_wino_4x3:avx512_core,backward_weights,fsrc:nChw16c fwei:gOIhw16i16o fbia:x fdst:nChw16c,alg:convolution_winograd,mb112_g1ic64oc64_ih300oh300kh3sh1dh0ph1_iw300ow300kw3sw1dw0pw1,61.32 +... +perf,ssd_300_voc0712:conv1_2,--dir=BWD_WB --alg=auto mb112ic64ih300oc64oh300kh3ph1nssd_300_voc0712:conv1_2,739.879,0,61.332,12063.5,62.503,11837.5 +~~~ + +In the above test-case `convolution_auto` chooses Winograd convolution (using a heuristic based on the convolution sizes and number of threads), as Winograd convolution is faster than direct convolution in this case. +~~~sh +OMP_NUM_THREADS=56 KMP_AFFINITY=granularity=fine,compact numactl -l tests/benchdnn/benchdnn --mode=p --conv -v5 --alg=direct --dir=BWD_WB mb112ic64ih300oc64oh300kh3ph1n"ssd_300_voc0712:conv1_2" + +mkldnn implementation: jit:avx512_common +...
+mkldnn_verbose,exec,convolution,jit:avx512_common,backward_weights,fsrc:nchw fwei:gOhwi16o fbia:x fdst:nChw16c,alg:convolution_direct,mb112_g1ic64oc64_ih300oh300kh3sh1dh0ph1_iw300ow300kw3sw1dw0pw1,176.10 +... +perf,ssd_300_voc0712:conv1_2,--dir=BWD_WB mb112ic64ih300oc64oh300kh3ph1nssd_300_voc0712:conv1_2,739.879,0,175.422,4217.7,180.315,4103.26 +~~~ +
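The `mkldnn implementation: ...` and `mkldnn_verbose` lines in the transcripts above come from the library's verbose mode, which benchdnn enables at verbosity `-v5`. The same trace can be obtained for any application that links against Intel MKL-DNN by setting the `MKLDNN_VERBOSE` environment variable (see [Performance profiling](@ref perf_profile)). A minimal sketch, assuming the `simple-net-cpp` example binary built from this tree:

~~~sh
# Print one mkldnn_verbose line per primitive execution; the fourth field
# is the dispatched implementation, e.g. jit_wino_4x3:avx512_core.
MKLDNN_VERBOSE=1 ./examples/simple-net-cpp
~~~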
+ +In the following example, `convolution_auto` chooses direct convolution because the Winograd implementation is slower than direct convolution in this case. +~~~sh +OMP_NUM_THREADS=56 KMP_AFFINITY=granularity=fine,compact tests/benchdnn/benchdnn --mode=p --conv -v5 --alg=auto --dir=BWD_WB mb112ic64ih28oc64oh28kh3ph1n"googlenet_v2:inception_3a/3x3" + +mkldnn implementation: jit:avx512_common +... +mkldnn_verbose,exec,convolution,jit:avx512_common,backward_weights,fsrc:nChw16c fwei:gOIhw16i16o fbia:x fdst:nChw16c,alg:convolution_direct,mb112_g1ic64oc64_ih28oh28kh3sh1dh0ph1_iw28ow28kw3sw1dw0pw1,1.13 +perf,googlenet_v2:inception_3a/3x3,--dir=BWD_WB --alg=auto mb112ic64ih28oc64oh28kh3ph1ngooglenet_v2:inception_3a/3x3,6.1693,0,1.04272,5916.52,1.13284,5445.88 +~~~ +~~~sh +OMP_NUM_THREADS=56 KMP_AFFINITY=granularity=fine,compact tests/benchdnn/benchdnn --mode=p --conv -v5 --alg=wino --dir=BWD_WB mb112ic64ih28oc64oh28kh3ph1n"googlenet_v2:inception_3a/3x3" + +mkldnn implementation: jit_wino_4x3:avx512_core +... +mkldnn_verbose,exec,convolution,jit_wino_4x3:avx512_core,backward_weights,fsrc:nChw16c fwei:gOIhw16i16o fbia:x fdst:nChw16c,alg:convolution_winograd,mb112_g1ic64oc64_ih28oh28kh3sh1dh0ph1_iw28ow28kw3sw1dw0pw1,2.15 +... +perf,googlenet_v2:inception_3a/3x3,--dir=BWD_WB --alg=wino mb112ic64ih28oc64oh28kh3ph1ngooglenet_v2:inception_3a/3x3,6.1693,0,2.14404,2877.41,2.20445,2798.56 +~~~ + +## Other considerations when using Winograd +The following side-effects should be weighed against the performance boost achieved when using Winograd: +* **Memory** - Transforms are intermediate results in Winograd, which often require significant memory. Currently this memory is allocated internally by MKLDNN as scratchpad memory. As more convolutions using Winograd +are added to the topology, this memory could grow significantly. This growth is mitigated when several convolutions using Winograd are created by the same instance and executed sequentially, because then +this scratchpad can be shared between convolutions. +* **Accuracy** - In some cases Winograd can be significantly less accurate than direct convolution, as demonstrated in [Winograd paper](https://arxiv.org/abs/1509.09308).
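The implementation that `convolution_auto` dispatched can also be checked from code, without parsing the verbose output. A minimal sketch (not part of the tutorial above), assuming a primitive descriptor `conv1_prim_desc` built from one of the convolution descriptors shown earlier, is to query the implementation info string through the C API; this is the same string that `mkldnn_verbose` prints:

~~~cpp
// Query the implementation name behind a primitive descriptor,
// e.g. "jit_wino_4x3:avx512_core" for a Winograd convolution.
// conv1_prim_desc is assumed to be a convolution_forward::primitive_desc.
const char *impl_info = nullptr;
mkldnn_primitive_desc_query(conv1_prim_desc.get(),
        mkldnn_query_impl_info_str, 0, &impl_info);
std::cout << "conv1 dispatched to: " << impl_info << std::endl;
~~~

The same query works for any primitive kind, so it can be used to confirm the auto-dispatch decision for each convolution in a topology.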
diff --git a/inference-engine/thirdparty/mkl-dnn/examples/CMakeLists.txt b/inference-engine/thirdparty/mkl-dnn/examples/CMakeLists.txt index 3d05855..601ce18 100644 --- a/inference-engine/thirdparty/mkl-dnn/examples/CMakeLists.txt +++ b/inference-engine/thirdparty/mkl-dnn/examples/CMakeLists.txt @@ -26,7 +26,7 @@ append(CMAKE_CXX_FLAGS "${CMAKE_EXAMPLE_CCXX_FLAGS}") append(CMAKE_C_FLAGS "${CMAKE_CCXX_SANITIZER_FLAGS}") append(CMAKE_CXX_FLAGS "${CMAKE_CCXX_SANITIZER_FLAGS}") -include_directories(${CMAKE_SOURCE_DIR}/include) +include_directories(${PROJECT_SOURCE_DIR}/include) set_if(UNIX LIBM m) @@ -35,8 +35,6 @@ register_exe(simple-net-cpp simple_net.cpp "test") register_exe(simple-training-net-c simple_training_net.c "test" ${LIBM}) register_exe(simple-training-net-cpp simple_training_net.cpp "test" ${LIBM}) register_exe(simple-net-int8-cpp simple_net_int8.cpp "test") - -if(HAVE_MKL) - register_exe(simple-rnn-cpp simple_rnn.cpp "test") - register_exe(simple-rnn-training-cpp simple_rnn_training.cpp "test") -endif() +register_exe(simple-rnn-cpp simple_rnn.cpp "test") +register_exe(simple-rnn-int8-cpp simple_rnn_int8.cpp "test") +register_exe(simple-rnn-training-cpp simple_rnn_training.cpp "test") diff --git a/inference-engine/thirdparty/mkl-dnn/examples/simple_net.c b/inference-engine/thirdparty/mkl-dnn/examples/simple_net.c index 6a4e78a..a88d0a8 100644 --- a/inference-engine/thirdparty/mkl-dnn/examples/simple_net.c +++ b/inference-engine/thirdparty/mkl-dnn/examples/simple_net.c @@ -76,13 +76,13 @@ void _free(void *ptr) { } #endif -static size_t product(int *arr, size_t size) { +static size_t product(ptrdiff_t *arr, size_t size) { size_t prod = 1; for (size_t i = 0; i < size; ++i) prod *= arr[i]; return prod; } -static void init_data_memory(uint32_t dim, const int *dims, +static void init_data_memory(uint32_t dim, const ptrdiff_t *dims, mkldnn_memory_format_t user_fmt, mkldnn_data_type_t mkldnn_f32, mkldnn_engine_t engine, float *data, mkldnn_primitive_t *memory) { @@ -159,12 +159,12 @@ mkldnn_status_t simple_net() { * {BATCH, OC, CONV_OH, CONV_OW} * strides: {CONV_STRIDE, CONV_STRIDE} */ - int conv_user_src_sizes[4] = { BATCH, IC, CONV_IH, CONV_IW }; - int conv_user_weights_sizes[4] = { OC, IC, 11, 11 }; - int conv_bias_sizes[4] = { OC }; - int conv_user_dst_sizes[4] = { BATCH, OC, CONV_OH, CONV_OW }; - int conv_strides[2] = { CONV_STRIDE, CONV_STRIDE }; - int conv_padding[2] = { CONV_PAD, CONV_PAD }; + ptrdiff_t conv_user_src_sizes[4] = { BATCH, IC, CONV_IH, CONV_IW }; + ptrdiff_t conv_user_weights_sizes[4] = { OC, IC, 11, 11 }; + ptrdiff_t conv_bias_sizes[4] = { OC }; + ptrdiff_t conv_user_dst_sizes[4] = { BATCH, OC, CONV_OH, CONV_OW }; + ptrdiff_t conv_strides[2] = { CONV_STRIDE, CONV_STRIDE }; + ptrdiff_t conv_padding[2] = { CONV_PAD, CONV_PAD }; float *conv_src = net_src; float *conv_weights = (float *)aligned_malloc( @@ -350,10 +350,10 @@ mkldnn_status_t simple_net() { * strides: {POOL_STRIDE, POOL_STRIDE} */ - int32_t pool_dst_sizes[4] = { BATCH, OC, POOL_OH, POOL_OW }; - int32_t pool_kernel[2] = { 3, 3 }; - int32_t pool_strides[2] = { POOL_STRIDE, POOL_STRIDE }; - int32_t pool_padding[2] = { POOL_PAD, POOL_PAD }; + ptrdiff_t pool_dst_sizes[4] = { BATCH, OC, POOL_OH, POOL_OW }; + ptrdiff_t pool_kernel[2] = { 3, 3 }; + ptrdiff_t pool_strides[2] = { POOL_STRIDE, POOL_STRIDE }; + ptrdiff_t pool_padding[2] = { POOL_PAD, POOL_PAD }; /* create pooling memory descriptor on dst descriptor * from previous primitive */ diff --git a/inference-engine/thirdparty/mkl-dnn/examples/simple_net.cpp 
b/inference-engine/thirdparty/mkl-dnn/examples/simple_net.cpp index 8ebc5c5..586b6f6 100644 --- a/inference-engine/thirdparty/mkl-dnn/examples/simple_net.cpp +++ b/inference-engine/thirdparty/mkl-dnn/examples/simple_net.cpp @@ -45,7 +45,7 @@ void simple_net(int times = 100) { memory::dims conv1_bias_tz = { 96 }; memory::dims conv1_dst_tz = { batch, 96, 55, 55 }; memory::dims conv1_strides = { 4, 4 }; - auto conv1_padding = { 0, 0 }; + memory::dims conv1_padding = { 0, 0 }; /* Allocate input and output buffers for user data */ std::vector user_src(batch * 3 * 227 * 227); @@ -165,7 +165,7 @@ void simple_net(int times = 100) { memory::dims pool1_dst_tz = { batch, 96, 27, 27 }; memory::dims pool1_kernel = { 3, 3 }; memory::dims pool1_strides = { 2, 2 }; - auto pool_padding = { 0, 0 }; + memory::dims pool_padding = { 0, 0 }; auto pool1_dst_md = memory::desc( { pool1_dst_tz }, memory::data_type::f32, memory::format::any); @@ -191,7 +191,7 @@ void simple_net(int times = 100) { memory::dims conv2_bias_tz = { 256 }; memory::dims conv2_dst_tz = { batch, 256, 27, 27 }; memory::dims conv2_strides = { 1, 1 }; - auto conv2_padding = { 2, 2 }; + memory::dims conv2_padding = { 2, 2 }; std::vector conv2_weights(std::accumulate( conv2_weights_tz.begin(), conv2_weights_tz.end(), 1, @@ -300,7 +300,7 @@ void simple_net(int times = 100) { memory::dims pool2_dst_tz = { batch, 256, 13, 13 }; memory::dims pool2_kernel = { 3, 3 }; memory::dims pool2_strides = { 2, 2 }; - auto pool2_padding = { 0, 0 }; + memory::dims pool2_padding = { 0, 0 }; auto pool2_dst_md = memory::desc( { pool2_dst_tz }, memory::data_type::f32, memory::format::any); @@ -328,7 +328,7 @@ void simple_net(int times = 100) { memory::dims conv3_bias_tz = { 384 }; memory::dims conv3_dst_tz = { batch, 384, 13, 13 }; memory::dims conv3_strides = { 1, 1 }; - auto conv3_padding = { 1, 1 }; + memory::dims conv3_padding = { 1, 1 }; std::vector conv3_weights(std::accumulate( conv3_weights_tz.begin(), conv3_weights_tz.end(), 1, @@ -415,7 +415,7 @@ void simple_net(int times = 100) { memory::dims conv4_bias_tz = { 384 }; memory::dims conv4_dst_tz = { batch, 384, 13, 13 }; memory::dims conv4_strides = { 1, 1 }; - auto conv4_padding = { 1, 1 }; + memory::dims conv4_padding = { 1, 1 }; std::vector conv4_weights(std::accumulate( conv4_weights_tz.begin(), conv4_weights_tz.end(), 1, @@ -501,7 +501,7 @@ void simple_net(int times = 100) { memory::dims conv5_bias_tz = { 256 }; memory::dims conv5_dst_tz = { batch, 256, 13, 13 }; memory::dims conv5_strides = { 1, 1 }; - auto conv5_padding = { 1, 1 }; + memory::dims conv5_padding = { 1, 1 }; std::vector conv5_weights(std::accumulate( conv5_weights_tz.begin(), conv5_weights_tz.end(), 1, @@ -586,7 +586,7 @@ void simple_net(int times = 100) { memory::dims pool5_dst_tz = { batch, 256, 6, 6 }; memory::dims pool5_kernel = { 3, 3 }; memory::dims pool5_strides = { 2, 2 }; - auto pool5_padding = { 0, 0 }; + memory::dims pool5_padding = { 0, 0 }; std::vector pool5_dst(std::accumulate(pool5_dst_tz.begin(), pool5_dst_tz.end(), 1, std::multiplies())); diff --git a/inference-engine/thirdparty/mkl-dnn/examples/simple_net_int8.cpp b/inference-engine/thirdparty/mkl-dnn/examples/simple_net_int8.cpp index ec7879b..7ec0f4c 100644 --- a/inference-engine/thirdparty/mkl-dnn/examples/simple_net_int8.cpp +++ b/inference-engine/thirdparty/mkl-dnn/examples/simple_net_int8.cpp @@ -38,7 +38,7 @@ void simple_net_int8() { memory::dims conv_bias_tz = { 384 }; memory::dims conv_dst_tz = { batch, 384, 13, 13 }; memory::dims conv_strides = { 1, 1 }; - auto 
conv_padding = { 1, 1 }; + memory::dims conv_padding = { 1, 1 }; /* Set Scaling mode for int8 quantizing */ const std::vector src_scales = { 1.8f }; diff --git a/inference-engine/thirdparty/mkl-dnn/examples/simple_rnn.cpp b/inference-engine/thirdparty/mkl-dnn/examples/simple_rnn.cpp index 105979a..029e3c4 100644 --- a/inference-engine/thirdparty/mkl-dnn/examples/simple_rnn.cpp +++ b/inference-engine/thirdparty/mkl-dnn/examples/simple_rnn.cpp @@ -20,8 +20,6 @@ #include #include -#include "mkl_cblas.h" - #include "mkldnn.hpp" // MSVC doesn't support collapse clause in omp parallel @@ -49,6 +47,9 @@ std::vector alignment_model( std::vector alignments(src_seq_length_max *batch, 1.0f); std::vector exp_sums(batch, 1.0f); +const float onef = 1.0, zerof = 0.0; +const int onei = 1; + void compute_weighted_annotations(float *weighted_annotations, int src_seq_length_max, int batch, int feature_size, float *weights_annot, float *annotations) { @@ -56,10 +57,11 @@ void compute_weighted_annotations(float *weighted_annotations, // weights_annot is (2c, c) // annotation[i] = GEMM(weights_annot, enc_dst_layer[i]); - cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, feature_size, - src_seq_length_max * batch, feature_size, 1.0f, weights_annot, - feature_size, annotations, feature_size, 0.0f, weighted_annotations, - feature_size); + int num_weighted_annotations = src_seq_length_max * batch; + mkldnn_sgemm("N", "N", + &feature_size, &num_weighted_annotations, &feature_size, + &onef, weights_annot, &feature_size, annotations, &feature_size, + &zerof, weighted_annotations, &feature_size); } void compute_attention(float *context_vectors, int src_seq_length_max, @@ -77,13 +79,16 @@ void compute_attention(float *context_vectors, int src_seq_length_max, // p is (n, 1) // first we precompute the weighted_dec_src_layer - cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, feature_size, batch, - feature_size, 1.0f, weights_src_layer, feature_size, dec_src_layer, - feature_size, 0.0f, weighted_src_layer.data(), feature_size); + mkldnn_sgemm("N", "N", + &feature_size, &batch, &feature_size, &onef, + weights_src_layer, &feature_size, dec_src_layer, &feature_size, + &zerof, weighted_src_layer.data(), &feature_size); // then we compute the alignment model float *alignment_model_ptr = alignment_model.data(); +#ifdef _OPENMP #pragma omp parallel for collapse(2) +#endif for (int i = 0; i < src_seq_length_max; i++) { for (int j = 0; j < batch * feature_size; j++) alignment_model_ptr[i * batch * feature_size + j] = tanhf( @@ -92,15 +97,21 @@ void compute_attention(float *context_vectors, int src_seq_length_max, } // gemv with alignments weights. the resulting alignments are in alignments - cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, 1, - src_seq_length_max * batch, feature_size, 1.0f, weights_alignments, - 1, alignment_model_ptr, feature_size, 0.0f, alignments.data(), 1); - -// softmax on alignments. the resulting context weights are in alignments + int num_weighted_annotations = src_seq_length_max * batch; + mkldnn_sgemm("N", "N", + &onei, &num_weighted_annotations, &feature_size, &onef, + weights_alignments, &onei, alignment_model_ptr, &feature_size, + &zerof, alignments.data(), &onei); + + // softmax on alignments. 
the resulting context weights are in alignments +#ifdef _OPENMP #pragma omp parallel for +#endif for (int i = 0; i < batch; i++) exp_sums[i] = 0.0f; +#ifdef _OPENMP #pragma omp parallel for collapse(2) +#endif for (int i = 0; i < src_seq_length_max; i++) { for (int j = 0; j < batch; j++) { alignments[i * batch + j] = expf(alignments[i * batch + j]); @@ -108,20 +119,26 @@ void compute_attention(float *context_vectors, int src_seq_length_max, } } +#ifdef _OPENMP #pragma omp parallel for collapse(2) +#endif for (int i = 0; i < src_seq_length_max; i++) for (int j = 0; j < batch; j++) alignments[i * batch + j] /= exp_sums[j]; -// then we compute the context vectors + // then we compute the context vectors +#ifdef _OPENMP #pragma omp parallel for collapse(2) +#endif for (int i = 0; i < batch; i++) for (int j = 0; j < feature_size; j++) context_vectors[i * (feature_size + feature_size) + feature_size + j] = 0.0f; +#ifdef _OPENMP #pragma omp parallel for collapse(3) +#endif for (int i = 0; i < batch; i++) for (int k = 0; k < src_seq_length_max; k++) for (int j = 0; j < feature_size; j++) @@ -133,8 +150,10 @@ void compute_attention(float *context_vectors, int src_seq_length_max, void copy_context(float *src_iter, int n_layers, int n_states, int batch, int feature_size) { -// we copy the context from the first layer to all other layers + // we copy the context from the first layer to all other layers +#ifdef _OPENMP #pragma omp parallel for collapse(3) +#endif for (int k = 1; k < n_layers; k++) for (int j = 0; j < batch; j++) for (int i = 0; i < feature_size; i++) @@ -162,6 +181,7 @@ void simple_net() { for the context vectors in MKL-DNN yet */ + std::vector weights_reorders; std::vector encoder_net; std::vector decoder_net; @@ -181,8 +201,7 @@ void simple_net() { memory::dims enc_bidir_dst_layer_tz = { src_seq_length_max, batch, 2 * feature_size }; - /* GNMT encoder: 1 bidirectional layer and 7 unidirectional layers - */ + /* GNMT encoder: 1 bidirectional layer and 7 unidirectional layers */ std::vector user_enc_bidir_wei_layer( enc_bidir_n_layers * 2 * feature_size * lstm_n_gates * feature_size, @@ -193,7 +212,7 @@ void simple_net() { std::vector user_enc_bidir_bias( enc_bidir_n_layers * 2 * lstm_n_gates * feature_size, 1.0f); - // We create the memory descriptors used by the user + /* Create the memory for user data */ auto user_enc_bidir_src_layer_md = mkldnn::memory::desc( { enc_bidir_src_layer_tz }, mkldnn::memory::data_type::f32, mkldnn::memory::format::tnc); @@ -209,11 +228,6 @@ void simple_net() { auto user_enc_bidir_bias_md = mkldnn::memory::desc({ enc_bidir_bias_tz }, mkldnn::memory::data_type::f32, mkldnn::memory::format::ldgo); - auto enc_bidir_dst_layer_md = mkldnn::memory::desc( - { enc_bidir_dst_layer_tz }, mkldnn::memory::data_type::f32, - mkldnn::memory::format::tnc); - - /* We create memories */ auto user_enc_bidir_src_layer_memory = mkldnn::memory( { user_enc_bidir_src_layer_md, cpu_engine }, net_src.data()); auto user_enc_bidir_wei_layer_memory @@ -225,40 +239,57 @@ void simple_net() { auto user_enc_bidir_bias_memory = mkldnn::memory( { user_enc_bidir_bias_md, cpu_engine }, user_enc_bidir_bias.data()); -#if 0 - /// These will be null memories - /// @todo introduce predefined null_memory() ? 
- auto enc_bidir_src_iter_memory = mkldnn::memory({enc_bidir_src_iter_md, cpu_engine}); - auto enc_bidir_dst_iter_memory = mkldnn::memory({enc_bidir_dst_iter_md, cpu_engine}); -#endif + /* Create memory descriptors for RNN data w/o specified layout */ + auto enc_bidir_wei_layer_md = memory::desc({ enc_bidir_weights_layer_tz }, + memory::data_type::f32, memory::format::any); + + auto enc_bidir_wei_iter_md = memory::desc({ enc_bidir_weights_iter_tz }, + memory::data_type::f32, memory::format::any); - /// @todo fix this once cell desc is merged with rnn_desc + auto enc_bidir_dst_layer_md = memory::desc({ enc_bidir_dst_layer_tz }, + memory::data_type::f32, memory::format::any); + + /* Create bidirectional RNN */ rnn_cell::desc bi_cell(algorithm::vanilla_lstm); rnn_forward::desc bi_layer_desc(prop_kind::forward_inference, bi_cell, rnn_direction::bidirectional_concat, user_enc_bidir_src_layer_md, - zero_md(), user_enc_bidir_wei_layer_md, user_enc_bidir_wei_iter_md, + zero_md(), enc_bidir_wei_layer_md, enc_bidir_wei_iter_md, user_enc_bidir_bias_md, enc_bidir_dst_layer_md, zero_md()); auto enc_bidir_prim_desc = mkldnn::rnn_forward::primitive_desc(bi_layer_desc, cpu_engine); - // there are currently no reorders - /// @todo add a reorder when they will be available + /* Create memory primitives for input data and use reorders to reorder + * user data to internal representation + */ + auto enc_bidir_wei_layer_memory + = memory(enc_bidir_prim_desc.weights_layer_primitive_desc()); + auto enc_bidir_wei_layer_reorder_pd = reorder::primitive_desc( + user_enc_bidir_wei_layer_memory.get_primitive_desc(), + enc_bidir_wei_layer_memory.get_primitive_desc()); + weights_reorders.push_back(reorder(enc_bidir_wei_layer_reorder_pd, + user_enc_bidir_wei_layer_memory, enc_bidir_wei_layer_memory)); + + auto enc_bidir_wei_iter_memory + = memory(enc_bidir_prim_desc.weights_iter_primitive_desc()); + auto enc_bidir_wei_iter_reorder_pd = reorder::primitive_desc( + user_enc_bidir_wei_iter_memory.get_primitive_desc(), + enc_bidir_wei_iter_memory.get_primitive_desc()); + weights_reorders.push_back(reorder(enc_bidir_wei_iter_reorder_pd, + user_enc_bidir_wei_iter_memory, enc_bidir_wei_iter_memory)); auto enc_bidir_dst_layer_memory = mkldnn::memory(enc_bidir_prim_desc.dst_layer_primitive_desc()); encoder_net.push_back( rnn_forward(enc_bidir_prim_desc, user_enc_bidir_src_layer_memory, - null_memory_, user_enc_bidir_wei_layer_memory, - user_enc_bidir_wei_iter_memory, user_enc_bidir_bias_memory, + null_memory_, enc_bidir_wei_layer_memory, + enc_bidir_wei_iter_memory, user_enc_bidir_bias_memory, enc_bidir_dst_layer_memory, null_memory_, null_memory_)); - /* GNMT encoder: unidirectional layers - */ - // First unidirectinal layer, the scaling from 2*feature size features - // comming from the previous layer come - /// memories + /* GNMT encoder: unidirectional layers */ + // First unidirectional layer scales 2 * feature_size output of bidirectional + // layer to feature_size output std::vector user_enc_uni_first_wei_layer( 1 * 1 * 2 * feature_size * lstm_n_gates * feature_size, 1.0f); std::vector user_enc_uni_first_wei_iter( @@ -282,13 +313,9 @@ void simple_net() { auto user_enc_uni_first_bias_md = mkldnn::memory::desc( { user_enc_uni_first_bias_dims }, mkldnn::memory::data_type::f32, mkldnn::memory::format::ldgo); - auto enc_uni_first_dst_layer_md = mkldnn::memory::desc( - { enc_uni_first_dst_layer_dims }, mkldnn::memory::data_type::f32, - mkldnn::memory::format::tnc); auto user_enc_uni_first_wei_layer_memory = mkldnn::memory({
user_enc_uni_first_wei_layer_md, cpu_engine }, user_enc_uni_first_wei_layer.data()); - ; auto user_enc_uni_first_wei_iter_memory = mkldnn::memory({ user_enc_uni_first_wei_iter_md, cpu_engine }, user_enc_uni_first_wei_iter.data()); @@ -296,29 +323,55 @@ = mkldnn::memory({ user_enc_uni_first_bias_md, cpu_engine }, user_enc_uni_first_bias.data()); + auto enc_uni_first_wei_layer_md + = memory::desc({ user_enc_uni_first_wei_layer_dims }, + memory::data_type::f32, memory::format::any); + auto enc_uni_first_wei_iter_md + = memory::desc({ user_enc_uni_first_wei_iter_dims }, + memory::data_type::f32, memory::format::any); + auto enc_uni_first_dst_layer_md + = memory::desc({ enc_uni_first_dst_layer_dims }, + memory::data_type::f32, memory::format::any); + /// @todo add suport for residual connections /// should it be a set residual in op_desc or a field to set manually? /// should be an integer to specify at which layer to start rnn_cell::desc enc_uni_first_cell(algorithm::vanilla_lstm); rnn_forward::desc enc_uni_first_layer_desc(prop_kind::forward_inference, enc_uni_first_cell, rnn_direction::unidirectional_left2right, - enc_bidir_dst_layer_md, zero_md(), user_enc_uni_first_wei_layer_md, - user_enc_uni_first_wei_iter_md, user_enc_uni_first_bias_md, + enc_bidir_dst_layer_md, zero_md(), enc_uni_first_wei_layer_md, + enc_uni_first_wei_iter_md, user_enc_uni_first_bias_md, enc_uni_first_dst_layer_md, zero_md()); auto enc_uni_first_prim_desc = mkldnn::rnn_forward::primitive_desc( enc_uni_first_layer_desc, cpu_engine); + + auto enc_uni_first_wei_layer_memory + = memory(enc_uni_first_prim_desc.weights_layer_primitive_desc()); + auto enc_uni_first_wei_layer_reorder_pd = reorder::primitive_desc( + user_enc_uni_first_wei_layer_memory.get_primitive_desc(), + enc_uni_first_wei_layer_memory.get_primitive_desc()); + weights_reorders.push_back(reorder(enc_uni_first_wei_layer_reorder_pd, + user_enc_uni_first_wei_layer_memory, + enc_uni_first_wei_layer_memory)); + + auto enc_uni_first_wei_iter_memory + = memory(enc_uni_first_prim_desc.weights_iter_primitive_desc()); + auto enc_uni_first_wei_iter_reorder_pd = reorder::primitive_desc( + user_enc_uni_first_wei_iter_memory.get_primitive_desc(), + enc_uni_first_wei_iter_memory.get_primitive_desc()); + weights_reorders.push_back(reorder(enc_uni_first_wei_iter_reorder_pd, + user_enc_uni_first_wei_iter_memory, enc_uni_first_wei_iter_memory)); + auto enc_uni_first_dst_layer_memory = mkldnn::memory( enc_uni_first_prim_desc.dst_layer_primitive_desc()); - /// @todo add a reorder when they will be available encoder_net.push_back(rnn_forward(enc_uni_first_prim_desc, enc_bidir_dst_layer_memory, null_memory_, - user_enc_uni_first_wei_layer_memory, - user_enc_uni_first_wei_iter_memory, user_enc_uni_first_bias_memory, + enc_uni_first_wei_layer_memory, + enc_uni_first_wei_iter_memory, user_enc_uni_first_bias_memory, enc_uni_first_dst_layer_memory, null_memory_, null_memory_)); - // Remainging Unidirectional layers - /// memories + /* Remaining unidirectional layers */ std::vector user_enc_uni_wei_layer((enc_unidir_n_layers - 1) * 1 * feature_size * lstm_n_gates * feature_size, 1.0f); std::vector user_enc_uni_wei_iter((enc_unidir_n_layers - 1) * 1 @@ -341,43 +394,60 @@ void simple_net() { mkldnn::memory::format::ldigo); auto user_enc_uni_bias_md = mkldnn::memory::desc({ user_enc_uni_bias_dims }, mkldnn::memory::data_type::f32, mkldnn::memory::format::ldgo); - auto enc_dst_layer_md = mkldnn::memory::desc({ enc_dst_layer_dims }, - mkldnn::memory::data_type::f32,
mkldnn::memory::format::tnc); auto user_enc_uni_wei_layer_memory = mkldnn::memory({ user_enc_uni_wei_layer_md, cpu_engine }, user_enc_uni_wei_layer.data()); - ; auto user_enc_uni_wei_iter_memory = mkldnn::memory({ user_enc_uni_wei_iter_md, cpu_engine }, user_enc_uni_wei_iter.data()); auto user_enc_uni_bias_memory = mkldnn::memory( { user_enc_uni_bias_md, cpu_engine }, user_enc_uni_bias.data()); + auto enc_uni_wei_layer_md = memory::desc({ user_enc_uni_wei_layer_dims }, + memory::data_type::f32, memory::format::any); + auto enc_uni_wei_iter_md = memory::desc({ user_enc_uni_wei_iter_dims }, + memory::data_type::f32, memory::format::any); + auto enc_dst_layer_md = memory::desc({ enc_dst_layer_dims }, + memory::data_type::f32, memory::format::any); + /// @todo add suport for residual connections /// should it be a set residual in op_desc or a field to set manually? /// should be an integer to specify at which layer to start rnn_cell::desc enc_uni_cell(algorithm::vanilla_lstm); rnn_forward::desc enc_uni_layer_desc(prop_kind::forward_inference, enc_uni_cell, rnn_direction::unidirectional_left2right, - enc_uni_first_dst_layer_md, zero_md(), user_enc_uni_wei_layer_md, - user_enc_uni_wei_iter_md, user_enc_uni_bias_md, enc_dst_layer_md, + enc_uni_first_dst_layer_md, zero_md(), enc_uni_wei_layer_md, + enc_uni_wei_iter_md, user_enc_uni_bias_md, enc_dst_layer_md, zero_md()); auto enc_uni_prim_desc = mkldnn::rnn_forward::primitive_desc( enc_uni_layer_desc, cpu_engine); + + auto enc_uni_wei_layer_memory + = memory(enc_uni_prim_desc.weights_layer_primitive_desc()); + auto enc_uni_wei_layer_reorder_pd = reorder::primitive_desc( + user_enc_uni_wei_layer_memory.get_primitive_desc(), + enc_uni_wei_layer_memory.get_primitive_desc()); + weights_reorders.push_back(reorder(enc_uni_wei_layer_reorder_pd, + user_enc_uni_wei_layer_memory, enc_uni_wei_layer_memory)); + + auto enc_uni_wei_iter_memory + = memory(enc_uni_prim_desc.weights_iter_primitive_desc()); + auto enc_uni_wei_iter_reorder_pd = reorder::primitive_desc( + user_enc_uni_wei_iter_memory.get_primitive_desc(), + enc_uni_wei_iter_memory.get_primitive_desc()); + weights_reorders.push_back(reorder(enc_uni_wei_iter_reorder_pd, + user_enc_uni_wei_iter_memory, enc_uni_wei_iter_memory)); + auto enc_dst_layer_memory = mkldnn::memory(enc_uni_prim_desc.dst_layer_primitive_desc()); - /// @todo add a reorder when they will be available encoder_net.push_back( rnn_forward(enc_uni_prim_desc, enc_uni_first_dst_layer_memory, - null_memory_, user_enc_uni_wei_layer_memory, - user_enc_uni_wei_iter_memory, user_enc_uni_bias_memory, + null_memory_, enc_uni_wei_layer_memory, + enc_uni_wei_iter_memory, user_enc_uni_bias_memory, enc_dst_layer_memory, null_memory_, null_memory_)); - /* - * GNMT: decoder with attention mechanism - */ - // user provided memories + /* GNMT: decoder with attention mechanism */ std::vector user_dec_wei_layer( dec_n_layers * 1 * feature_size * lstm_n_gates * feature_size, 1.0f); @@ -402,8 +472,7 @@ void simple_net() { = { dec_n_layers, 1, lstm_n_gates, feature_size }; memory::dims dec_src_layer_dims = { 1, batch, feature_size }; - memory::dims dec_dst_layer_dims - = { tgt_seq_length_max, batch, feature_size }; + memory::dims dec_dst_layer_dims = { 1, batch, feature_size }; // We will use the same memory for dec_src_iter and dec_dst_iter // However, dec_src_iter has a context vector but not @@ -434,7 +503,6 @@ void simple_net() { mkldnn::memory::data_type::f32, mkldnn::memory::format::ldsnc); auto user_dec_wei_layer_memory = mkldnn::memory( { 
user_dec_wei_layer_md, cpu_engine }, user_dec_wei_layer.data()); - ; auto user_dec_wei_iter_memory = mkldnn::memory( { user_dec_wei_iter_md, cpu_engine }, user_dec_wei_iter.data()); auto user_dec_bias_memory = mkldnn::memory( @@ -444,6 +512,12 @@ void simple_net() { auto dec_src_layer_memory = mkldnn::memory({ dec_src_layer_md, cpu_engine }); + auto dec_wei_layer_md = mkldnn::memory::desc( + { user_dec_wei_layer_dims }, mkldnn::memory::data_type::f32, + mkldnn::memory::format::any); + auto dec_wei_iter_md = mkldnn::memory::desc({ user_dec_wei_iter_dims }, + mkldnn::memory::data_type::f32, mkldnn::memory::format::any); + // As mentioned above, we create a view without context out of the // memory with context. auto dec_dst_iter_memory = mkldnn::memory({ dec_dst_iter_md, cpu_engine }); @@ -457,15 +531,30 @@ void simple_net() { rnn_cell::desc dec_cell(algorithm::vanilla_lstm); rnn_forward::desc dec_ctx_desc(prop_kind::forward_inference, dec_cell, rnn_direction::unidirectional_left2right, dec_src_layer_md, - dec_dst_iter_md, user_dec_wei_layer_md, user_dec_wei_iter_md, + dec_dst_iter_md, dec_wei_layer_md, dec_wei_iter_md, user_dec_bias_md, dec_dst_layer_md, dec_dst_iter_noctx_md); auto dec_ctx_prim_desc = mkldnn::rnn_forward::primitive_desc(dec_ctx_desc, cpu_engine); - /// @todo add a reorder when they will be available + auto dec_wei_layer_memory + = memory(dec_ctx_prim_desc.weights_layer_primitive_desc()); + auto dec_wei_layer_reorder_pd = reorder::primitive_desc( + user_dec_wei_layer_memory.get_primitive_desc(), + dec_wei_layer_memory.get_primitive_desc()); + weights_reorders.push_back(reorder(dec_wei_layer_reorder_pd, + user_dec_wei_layer_memory, dec_wei_layer_memory)); + + auto dec_wei_iter_memory + = memory(dec_ctx_prim_desc.weights_iter_primitive_desc()); + auto dec_wei_iter_reorder_pd = reorder::primitive_desc( + user_dec_wei_iter_memory.get_primitive_desc(), + dec_wei_iter_memory.get_primitive_desc()); + weights_reorders.push_back(reorder(dec_wei_iter_reorder_pd, + user_dec_wei_iter_memory, dec_wei_iter_memory)); + decoder_net.push_back(rnn_forward(dec_ctx_prim_desc, dec_src_layer_memory, - dec_dst_iter_memory, user_dec_wei_layer_memory, - user_dec_wei_iter_memory, user_dec_bias_memory, + dec_dst_iter_memory, dec_wei_layer_memory, + dec_wei_iter_memory, user_dec_bias_memory, user_dec_dst_layer_memory, dec_dst_iter_memory, null_memory_)); // allocating temporary buffer for attention mechanism @@ -476,10 +565,8 @@ void simple_net() { Execution */ auto execute = [&]() { - // We save the original handle on dst_layer as we will modify it at each - // iteration - void *dst_layer_original_handle - = user_dec_dst_layer_memory.get_data_handle(); + // reorder weights to MKLDNN internal representation + stream(stream::kind::eager).submit(weights_reorders).wait(); // run encoder (1 stream) stream(stream::kind::eager).submit(encoder_net).wait(); @@ -490,43 +577,40 @@ void simple_net() { user_weights_annotation.data(), (float *)enc_dst_layer_memory.get_data_handle()); - // We initialise dst_layer[0] to the embedding of </s>, which are - // assumed to - // be 0 here - memset(dst_layer_original_handle, 0, - batch * feature_size * sizeof(float)); + // We initialise src_layer to the embedding of </s>, which + // are assumed to be 0 here + memset(dec_src_layer_memory.get_data_handle(), 0, + dec_src_layer_memory.get_primitive_desc().get_size()); + // From now on, src points to the output of the last iteration for (int i = 0; i < tgt_seq_length_max; i++) { - float *dst_layer_handle - = (float
*)user_dec_dst_layer_memory.get_data_handle(); - float *dst_iter_handle - = (float *)dec_dst_iter_memory.get_data_handle(); + float *src_att_layer_handle + = (float *) dec_src_layer_memory.get_data_handle(); + float *src_att_iter_handle + = (float *) dec_dst_iter_memory.get_data_handle(); // Compute attention context vector into the first layer src_iter - compute_attention(dst_iter_handle, src_seq_length_max, batch, + compute_attention(src_att_iter_handle, src_seq_length_max, batch, feature_size, user_weights_attention_src_layer.data(), - dst_layer_handle, + src_att_layer_handle, (float *)enc_bidir_dst_layer_memory.get_data_handle(), weighted_annotations.data(), user_weights_alignments.data()); // copy the context vectors to all layers of src_iter - copy_context(dst_iter_handle, dec_n_layers, lstm_n_states, batch, + copy_context(src_att_iter_handle, dec_n_layers, lstm_n_states, batch, feature_size); - // We set src_layer to be the previously - dec_src_layer_memory.set_data_handle(dst_layer_handle); - // run the decoder iteration stream(stream::kind::eager).submit(decoder_net).wait(); - // Move the handle on the dst layer to the next iteration + // Move the handle on the src/dst layer to the next iteration + auto dst_layer_handle = (float *) user_dec_dst_layer_memory.get_data_handle(); + dec_src_layer_memory.set_data_handle(dst_layer_handle); user_dec_dst_layer_memory.set_data_handle( dst_layer_handle + batch * feature_size); } - // we restore the handle to the begining of the buffer - user_dec_dst_layer_memory.set_data_handle(dst_layer_original_handle); - /// @todo run the softmax after each iteration or not? + }; execute(); diff --git a/inference-engine/thirdparty/mkl-dnn/examples/simple_rnn_int8.cpp b/inference-engine/thirdparty/mkl-dnn/examples/simple_rnn_int8.cpp new file mode 100644 index 0000000..7822028 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/examples/simple_rnn_int8.cpp @@ -0,0 +1,709 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include <algorithm> +#include <cstring> +#include <iostream> +#include <math.h> +#include <vector> + +#include "mkldnn.hpp" + +// MSVC doesn't support collapse clause in omp parallel +#if defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#define collapse(x) +#endif + +using namespace mkldnn; + +const int batch = 64; +const int src_seq_length_max = 25; +const int tgt_seq_length_max = 27; + +const int feature_size = 1024; + +const int enc_bidir_n_layers = 1; +const int enc_unidir_n_layers = 7; +const int dec_n_layers = 8; + +const int lstm_n_gates = 4; +const int lstm_n_states = 2; +std::vector<int32_t> weighted_src_layer(batch *feature_size, 1); +std::vector<float> alignment_model( + src_seq_length_max *batch *feature_size, 1.0f); +std::vector<float> alignments(src_seq_length_max *batch, 1.0f); +std::vector<float> exp_sums(batch, 1.0f); + +const float onef = 1.0, zerof = 0.0; +const int onei = 1; + +void compute_weighted_annotations(float *weighted_annotations, + int src_seq_length_max, int batch, int feature_size, + float *weights_annot, float *annotations) { + // annotations(aka enc_dst_layer) is (t, n, 2c) + // weights_annot is (2c, c) + + int num_weighted_annotations = src_seq_length_max * batch; + // annotation[i] = GEMM(weights_annot, enc_dst_layer[i]); + mkldnn_sgemm("N", "N", &feature_size, &num_weighted_annotations, + &feature_size, &onef, weights_annot, &feature_size, annotations, + &feature_size, &zerof, weighted_annotations, &feature_size); +} + +void compute_sum_of_rows(int8_t *a, int rows, int cols, int32_t *a_reduced) { +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int i = 0; i < cols; i++) { + a_reduced[i] = 0; + for (int j = 0; j < rows; j++) { + a_reduced[i] += (int32_t)a[i * rows + j]; + } + } +} + +void compute_attention(float *context_vectors, int src_seq_length_max, + int batch, int feature_size, int8_t *weights_src_layer, + float weights_src_layer_scale, int32_t *compensation, + uint8_t *dec_src_layer, float dec_src_layer_scale, + float dec_src_layer_shift, uint8_t *annotations, + float *weighted_annotations, float *weights_alignments) { + // dst_iter : (n, c) matrix + // src_layer: (n, c) matrix + // weighted_annotations (t, n, c) + + // weights_yi is (c, c) + // weights_ai is (c, 1) + // tmp[i] is (n, c) + // a[i] is (n, 1) + // p is (n, 1) + + // first we precompute the weighted_dec_src_layer + int8_t ao = 0; + int8_t bo = 0; + int32_t co = 0; + mkldnn_gemm_s8u8s32("N", "N", "F", &feature_size, &batch, &feature_size, + &onef, weights_src_layer, &feature_size, &ao, dec_src_layer, + &feature_size, &bo, &zerof, weighted_src_layer.data(), + &feature_size, &co); + + // then we compute the alignment model + float *alignment_model_ptr = alignment_model.data(); +#ifdef _OPENMP +#pragma omp parallel for collapse(2) +#endif + for (int i = 0; i < src_seq_length_max; i++) { + for (int j = 0; j < batch; j++) { + for (int k = 0; k < feature_size; k++) { + size_t tnc_offset + = i * batch * feature_size + j * feature_size + k; + alignment_model_ptr[tnc_offset] = tanhf( + (float)(weighted_src_layer.data()[j * feature_size + k] + - dec_src_layer_shift * compensation[k]) + / (dec_src_layer_scale + * weights_src_layer_scale) + + weighted_annotations[tnc_offset]); + } + } + } + + // gemv with alignments weights.
the resulting alignments are in alignments + int num_weighted_annotations = src_seq_length_max * batch; + mkldnn_sgemm("N", "N", &onei, &num_weighted_annotations, &feature_size, + &onef, weights_alignments, &onei, alignment_model_ptr, + &feature_size, &zerof, alignments.data(), &onei); + +// softmax on alignments. the resulting context weights are in alignments +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int i = 0; i < batch; i++) + exp_sums[i] = 0.0f; +#ifdef _OPENMP +#pragma omp parallel for collapse(2) +#endif + for (int i = 0; i < src_seq_length_max; i++) { + for (int j = 0; j < batch; j++) { + alignments[i * batch + j] = expf(alignments[i * batch + j]); + exp_sums[j] += alignments[i * batch + j]; + } + } + +#ifdef _OPENMP +#pragma omp parallel for collapse(2) +#endif + for (int i = 0; i < src_seq_length_max; i++) + for (int j = 0; j < batch; j++) + alignments[i * batch + j] /= exp_sums[j]; + +// then we compute the context vectors +#ifdef _OPENMP +#pragma omp parallel for collapse(2) +#endif + for (int i = 0; i < batch; i++) + for (int j = 0; j < feature_size; j++) + context_vectors[i * (feature_size + feature_size) + feature_size + + j] + = 0.0f; + +#ifdef _OPENMP +#pragma omp parallel for collapse(3) +#endif + for (int i = 0; i < batch; i++) + for (int k = 0; k < src_seq_length_max; k++) + for (int j = 0; j < feature_size; j++) + context_vectors[i * (feature_size + feature_size) + feature_size + + j] + += alignments[k * batch + i] + * (((float)annotations[j + + feature_size * (i + batch * k)] + - dec_src_layer_shift) + / dec_src_layer_scale); +} + +void copy_context(float *src_iter, int n_layers, int n_states, int batch, + int feature_size) { +// we copy the context from the first layer to all other layers +#ifdef _OPENMP +#pragma omp parallel for collapse(3) +#endif + for (int k = 1; k < n_layers; k++) + for (int j = 0; j < batch; j++) + for (int i = 0; i < feature_size; i++) + src_iter[(k * n_states * batch + j) + * (feature_size + feature_size) + + i] + = src_iter[j * (feature_size + feature_size) + i]; +} + +void simple_net() { + auto cpu_engine = engine(engine::cpu, 0); + auto null_memory_ = null_memory(cpu_engine); + + /* + GNMT low precision example. + Note: we do not implement residual connections yet. + For the encoder we use: + - one primitive for the bidirectional layer of the encoder + - one primitive for all remaining unidirectional layers in the encoder + For the decoder we use: + - one primitive for the first iteration + - one primitive for all subsequent iterations in the decoder. Note that + in this example, this primitive computes the states in place.
+ - the attention mechanism is implemented separately as there is no support + for the context vectors in MKL-DNN yet + */ + + std::vector<primitive> weights_reorders; + std::vector<primitive> encoder_net; + std::vector<primitive> decoder_net; + + std::vector<float> net_src(batch * src_seq_length_max * feature_size, 0.1f); + std::vector<float> net_dst(batch * tgt_seq_length_max * feature_size, 0.1f); + + /* Quantization factors for fp32 data */ + + const float data_shift = 64.; + const float data_scale = 63.; + const int weights_scale_mask = 3; // 11 for last two dimensions of ldigo + std::vector<float> weights_scales(lstm_n_gates * feature_size); + /* assign halves of vector with arbitrary values */ + const int scales_half = lstm_n_gates * feature_size / 2; + std::fill( + weights_scales.begin(), weights_scales.begin() + scales_half, 30.f); + std::fill(weights_scales.begin() + scales_half, weights_scales.end(), + 65.5f); + + /* Encoder */ + + memory::dims enc_bidir_src_layer_tz + = { src_seq_length_max, batch, feature_size }; + memory::dims enc_bidir_weights_layer_tz = { enc_bidir_n_layers, 2, + feature_size, lstm_n_gates, feature_size }; + memory::dims enc_bidir_weights_iter_tz = { enc_bidir_n_layers, 2, + feature_size, lstm_n_gates, feature_size }; + memory::dims enc_bidir_bias_tz + = { enc_bidir_n_layers, 2, lstm_n_gates, feature_size }; + memory::dims enc_bidir_dst_layer_tz + = { src_seq_length_max, batch, 2 * feature_size }; + + /* GNMT encoder: 1 bidirectional layer and 7 unidirectional layers */ + + std::vector<float> user_enc_bidir_wei_layer( + enc_bidir_n_layers * 2 * feature_size * lstm_n_gates * feature_size, + 0.3f); + std::vector<float> user_enc_bidir_wei_iter( + enc_bidir_n_layers * 2 * feature_size * lstm_n_gates * feature_size, + 0.2f); + std::vector<float> user_enc_bidir_bias( + enc_bidir_n_layers * 2 * lstm_n_gates * feature_size, 1.0f); + + /* Create the memory for user data */ + auto user_enc_bidir_src_layer_md = memory::desc({ enc_bidir_src_layer_tz }, + memory::data_type::f32, memory::format::tnc); + + auto user_enc_bidir_wei_layer_md + = memory::desc({ enc_bidir_weights_layer_tz }, + memory::data_type::f32, memory::format::ldigo); + + auto user_enc_bidir_wei_iter_md + = memory::desc({ enc_bidir_weights_iter_tz }, + memory::data_type::f32, memory::format::ldigo); + + auto user_enc_bidir_bias_md = memory::desc({ enc_bidir_bias_tz }, + memory::data_type::f32, memory::format::ldgo); + + auto user_enc_bidir_src_layer_memory = memory( + { user_enc_bidir_src_layer_md, cpu_engine }, net_src.data()); + auto user_enc_bidir_wei_layer_memory + = memory({ user_enc_bidir_wei_layer_md, cpu_engine }, + user_enc_bidir_wei_layer.data()); + auto user_enc_bidir_wei_iter_memory + = memory({ user_enc_bidir_wei_iter_md, cpu_engine }, + user_enc_bidir_wei_iter.data()); + auto user_enc_bidir_bias_memory = memory( + { user_enc_bidir_bias_md, cpu_engine }, user_enc_bidir_bias.data()); + + /* Create memory descriptors for RNN data w/o specified layout */ + auto enc_bidir_src_layer_md = memory::desc({ enc_bidir_src_layer_tz }, + memory::data_type::u8, memory::format::any); + + auto enc_bidir_wei_layer_md = memory::desc({ enc_bidir_weights_layer_tz }, + memory::data_type::s8, memory::format::any); + + auto enc_bidir_wei_iter_md = memory::desc({ enc_bidir_weights_iter_tz }, + memory::data_type::s8, memory::format::any); + + auto enc_bidir_dst_layer_md = memory::desc({ enc_bidir_dst_layer_tz }, + memory::data_type::u8, memory::format::any); + + /* Create bidirectional RNN */ + rnn_cell::desc bi_cell(algorithm::vanilla_lstm); + + /* Check if int8 RNN is supported */ +
try { + rnn_forward::desc bi_layer_desc(prop_kind::forward_inference, bi_cell, + rnn_direction::bidirectional_concat, enc_bidir_src_layer_md, + zero_md(), enc_bidir_wei_layer_md, enc_bidir_wei_iter_md, + user_enc_bidir_bias_md, enc_bidir_dst_layer_md, zero_md()); + } catch (error &e) { + if (e.status == mkldnn_unimplemented) { + std::cerr + << "Dependency on Intel(R) MKL version 2019u2 or newer is " + "required for int8 RNN" + << std::endl; + } + throw; + } + + rnn_forward::desc bi_layer_desc(prop_kind::forward_inference, bi_cell, + rnn_direction::bidirectional_concat, enc_bidir_src_layer_md, + zero_md(), enc_bidir_wei_layer_md, enc_bidir_wei_iter_md, + user_enc_bidir_bias_md, enc_bidir_dst_layer_md, zero_md()); + + /* Define RNN attributes that store quantization parameters */ + primitive_attr attr; + attr.set_int_output_round_mode(round_mode::round_nearest); + attr.set_rnn_data_qparams(data_scale, data_shift); + attr.set_rnn_weights_qparams(weights_scale_mask, weights_scales); + + auto enc_bidir_prim_desc + = rnn_forward::primitive_desc(bi_layer_desc, attr, cpu_engine); + + /* Create memory primitives for input data and use reorders to quantize + * values to int8 + * NOTE: same attributes are used when creating RNN primitive and reorders + */ + auto enc_bidir_src_layer_memory + = memory(enc_bidir_prim_desc.src_layer_primitive_desc()); + auto enc_bidir_src_layer_reorder_pd = reorder::primitive_desc( + user_enc_bidir_src_layer_memory.get_primitive_desc(), + enc_bidir_src_layer_memory.get_primitive_desc(), attr); + encoder_net.push_back(reorder(enc_bidir_src_layer_reorder_pd, + user_enc_bidir_src_layer_memory, enc_bidir_src_layer_memory)); + + auto enc_bidir_wei_layer_memory + = memory(enc_bidir_prim_desc.weights_layer_primitive_desc()); + auto enc_bidir_wei_layer_reorder_pd = reorder::primitive_desc( + user_enc_bidir_wei_layer_memory.get_primitive_desc(), + enc_bidir_wei_layer_memory.get_primitive_desc(), attr); + weights_reorders.push_back(reorder(enc_bidir_wei_layer_reorder_pd, + user_enc_bidir_wei_layer_memory, enc_bidir_wei_layer_memory)); + + auto enc_bidir_wei_iter_memory + = memory(enc_bidir_prim_desc.weights_iter_primitive_desc()); + auto enc_bidir_wei_iter_reorder_pd = reorder::primitive_desc( + user_enc_bidir_wei_iter_memory.get_primitive_desc(), + enc_bidir_wei_iter_memory.get_primitive_desc(), attr); + weights_reorders.push_back(reorder(enc_bidir_wei_iter_reorder_pd, + user_enc_bidir_wei_iter_memory, enc_bidir_wei_iter_memory)); + + auto enc_bidir_dst_layer_memory + = memory(enc_bidir_prim_desc.dst_layer_primitive_desc()); + + encoder_net.push_back( + rnn_forward(enc_bidir_prim_desc, enc_bidir_src_layer_memory, + null_memory_, enc_bidir_wei_layer_memory, + enc_bidir_wei_iter_memory, user_enc_bidir_bias_memory, + enc_bidir_dst_layer_memory, null_memory_, null_memory_)); + + /* GNMT encoder: unidirectional layers */ + // First unidirectional layer scales 2 * feature_size output of bidirectional + // layer to feature_size output + std::vector<float> user_enc_uni_first_wei_layer( + 1 * 1 * 2 * feature_size * lstm_n_gates * feature_size, 0.3f); + std::vector<float> user_enc_uni_first_wei_iter( + 1 * 1 * feature_size * lstm_n_gates * feature_size, 0.2f); + std::vector<float> user_enc_uni_first_bias( + 1 * 1 * lstm_n_gates * feature_size, 1.0f); + + memory::dims user_enc_uni_first_wei_layer_dims + = { 1, 1, 2 * feature_size, lstm_n_gates, feature_size }; + memory::dims user_enc_uni_first_wei_iter_dims + = { 1, 1, feature_size, lstm_n_gates, feature_size }; + memory::dims user_enc_uni_first_bias_dims + = {
1, 1, lstm_n_gates, feature_size }; + memory::dims enc_uni_first_dst_layer_dims + = { src_seq_length_max, batch, feature_size }; + + auto user_enc_uni_first_wei_layer_md + = memory::desc({ user_enc_uni_first_wei_layer_dims }, + memory::data_type::f32, memory::format::ldigo); + auto user_enc_uni_first_wei_iter_md + = memory::desc({ user_enc_uni_first_wei_iter_dims }, + memory::data_type::f32, memory::format::ldigo); + auto user_enc_uni_first_bias_md + = memory::desc({ user_enc_uni_first_bias_dims }, + memory::data_type::f32, memory::format::ldgo); + auto user_enc_uni_first_wei_layer_memory + = memory({ user_enc_uni_first_wei_layer_md, cpu_engine }, + user_enc_uni_first_wei_layer.data()); + auto user_enc_uni_first_wei_iter_memory + = memory({ user_enc_uni_first_wei_iter_md, cpu_engine }, + user_enc_uni_first_wei_iter.data()); + auto user_enc_uni_first_bias_memory + = memory({ user_enc_uni_first_bias_md, cpu_engine }, + user_enc_uni_first_bias.data()); + + auto enc_uni_first_wei_layer_md + = memory::desc({ user_enc_uni_first_wei_layer_dims }, + memory::data_type::s8, memory::format::any); + auto enc_uni_first_wei_iter_md + = memory::desc({ user_enc_uni_first_wei_iter_dims }, + memory::data_type::s8, memory::format::any); + auto enc_uni_first_dst_layer_md + = memory::desc({ enc_uni_first_dst_layer_dims }, + memory::data_type::u8, memory::format::any); + + rnn_cell::desc enc_uni_first_cell(algorithm::vanilla_lstm); + rnn_forward::desc enc_uni_first_layer_desc(prop_kind::forward_inference, + enc_uni_first_cell, rnn_direction::unidirectional_left2right, + enc_bidir_dst_layer_md, zero_md(), enc_uni_first_wei_layer_md, + enc_uni_first_wei_iter_md, user_enc_uni_first_bias_md, + enc_uni_first_dst_layer_md, zero_md()); + + auto enc_uni_first_prim_desc = rnn_forward::primitive_desc( + enc_uni_first_layer_desc, attr, cpu_engine); + + auto enc_uni_first_wei_layer_memory + = memory(enc_uni_first_prim_desc.weights_layer_primitive_desc()); + auto enc_uni_first_wei_layer_reorder_pd = reorder::primitive_desc( + user_enc_uni_first_wei_layer_memory.get_primitive_desc(), + enc_uni_first_wei_layer_memory.get_primitive_desc(), attr); + weights_reorders.push_back(reorder(enc_uni_first_wei_layer_reorder_pd, + user_enc_uni_first_wei_layer_memory, + enc_uni_first_wei_layer_memory)); + + auto enc_uni_first_wei_iter_memory + = memory(enc_uni_first_prim_desc.weights_iter_primitive_desc()); + auto enc_uni_first_wei_iter_reorder_pd = reorder::primitive_desc( + user_enc_uni_first_wei_iter_memory.get_primitive_desc(), + enc_uni_first_wei_iter_memory.get_primitive_desc(), attr); + weights_reorders.push_back(reorder(enc_uni_first_wei_iter_reorder_pd, + user_enc_uni_first_wei_iter_memory, enc_uni_first_wei_iter_memory)); + + auto enc_uni_first_dst_layer_memory + = memory(enc_uni_first_prim_desc.dst_layer_primitive_desc()); + + encoder_net.push_back(rnn_forward(enc_uni_first_prim_desc, + enc_bidir_dst_layer_memory, null_memory_, + enc_uni_first_wei_layer_memory, enc_uni_first_wei_iter_memory, + user_enc_uni_first_bias_memory, enc_uni_first_dst_layer_memory, + null_memory_, null_memory_)); + + /* Remaining unidirectional layers */ + std::vector<float> user_enc_uni_wei_layer((enc_unidir_n_layers - 1) * 1 + * feature_size * lstm_n_gates * feature_size, + 0.3f); + std::vector<float> user_enc_uni_wei_iter((enc_unidir_n_layers - 1) * 1 + * feature_size * lstm_n_gates * feature_size, + 0.2f); + std::vector<float> user_enc_uni_bias( + (enc_unidir_n_layers - 1) * 1 * lstm_n_gates * feature_size, 1.0f); + + memory::dims user_enc_uni_wei_layer_dims = {
(enc_unidir_n_layers - 1), 1, + feature_size, lstm_n_gates, feature_size }; + memory::dims user_enc_uni_wei_iter_dims = { (enc_unidir_n_layers - 1), 1, + feature_size, lstm_n_gates, feature_size }; + memory::dims user_enc_uni_bias_dims + = { (enc_unidir_n_layers - 1), 1, lstm_n_gates, feature_size }; + memory::dims enc_dst_layer_dims + = { src_seq_length_max, batch, feature_size }; + + auto user_enc_uni_wei_layer_md + = memory::desc({ user_enc_uni_wei_layer_dims }, + memory::data_type::f32, memory::format::ldigo); + auto user_enc_uni_wei_iter_md = memory::desc({ user_enc_uni_wei_iter_dims }, + memory::data_type::f32, memory::format::ldigo); + auto user_enc_uni_bias_md = memory::desc({ user_enc_uni_bias_dims }, + memory::data_type::f32, memory::format::ldgo); + + auto user_enc_uni_wei_layer_memory + = memory({ user_enc_uni_wei_layer_md, cpu_engine }, + user_enc_uni_wei_layer.data()); + auto user_enc_uni_wei_iter_memory + = memory({ user_enc_uni_wei_iter_md, cpu_engine }, + user_enc_uni_wei_iter.data()); + auto user_enc_uni_bias_memory = memory( + { user_enc_uni_bias_md, cpu_engine }, user_enc_uni_bias.data()); + + auto enc_uni_wei_layer_md = memory::desc({ user_enc_uni_wei_layer_dims }, + memory::data_type::s8, memory::format::any); + auto enc_uni_wei_iter_md = memory::desc({ user_enc_uni_wei_iter_dims }, + memory::data_type::s8, memory::format::any); + auto enc_dst_layer_md = memory::desc({ enc_dst_layer_dims }, + memory::data_type::f32, memory::format::any); + + rnn_cell::desc enc_uni_cell(algorithm::vanilla_lstm); + rnn_forward::desc enc_uni_layer_desc(prop_kind::forward_inference, + enc_uni_cell, rnn_direction::unidirectional_left2right, + enc_uni_first_dst_layer_md, zero_md(), enc_uni_wei_layer_md, + enc_uni_wei_iter_md, user_enc_uni_bias_md, enc_dst_layer_md, + zero_md()); + auto enc_uni_prim_desc + = rnn_forward::primitive_desc(enc_uni_layer_desc, attr, cpu_engine); + + auto enc_uni_wei_layer_memory + = memory(enc_uni_prim_desc.weights_layer_primitive_desc()); + auto enc_uni_wei_layer_reorder_pd = reorder::primitive_desc( + user_enc_uni_wei_layer_memory.get_primitive_desc(), + enc_uni_wei_layer_memory.get_primitive_desc(), attr); + weights_reorders.push_back(reorder(enc_uni_wei_layer_reorder_pd, + user_enc_uni_wei_layer_memory, enc_uni_wei_layer_memory)); + + auto enc_uni_wei_iter_memory + = memory(enc_uni_prim_desc.weights_iter_primitive_desc()); + auto enc_uni_wei_iter_reorder_pd = reorder::primitive_desc( + user_enc_uni_wei_iter_memory.get_primitive_desc(), + enc_uni_wei_iter_memory.get_primitive_desc(), attr); + weights_reorders.push_back(reorder(enc_uni_wei_iter_reorder_pd, + user_enc_uni_wei_iter_memory, enc_uni_wei_iter_memory)); + + auto enc_dst_layer_memory + = memory(enc_uni_prim_desc.dst_layer_primitive_desc()); + + encoder_net.push_back( + rnn_forward(enc_uni_prim_desc, enc_uni_first_dst_layer_memory, + null_memory_, enc_uni_wei_layer_memory, + enc_uni_wei_iter_memory, user_enc_uni_bias_memory, + enc_dst_layer_memory, null_memory_, null_memory_)); + + /* Decoder with attention mechanism */ + std::vector<float> user_dec_wei_layer( + dec_n_layers * 1 * feature_size * lstm_n_gates * feature_size, + 0.2f); + std::vector<float> user_dec_wei_iter(dec_n_layers * 1 + * (feature_size + feature_size) * lstm_n_gates + * feature_size, + 0.3f); + std::vector<float> user_dec_bias( + dec_n_layers * 1 * lstm_n_gates * feature_size, 1.0f); + std::vector<int8_t> user_weights_attention_src_layer( + feature_size * feature_size, 1); + float weights_attention_scale = 127.; + std::vector<float> user_weights_annotation(
feature_size * feature_size, 1.0f); + std::vector<float> user_weights_alignments(feature_size, 1.0f); + // Buffer to store decoder output for all iterations + std::vector<uint8_t> dec_dst(tgt_seq_length_max * batch * feature_size, 0); + + memory::dims user_dec_wei_layer_dims + = { dec_n_layers, 1, feature_size, lstm_n_gates, feature_size }; + memory::dims user_dec_wei_iter_dims = { dec_n_layers, 1, + feature_size + feature_size, lstm_n_gates, feature_size }; + memory::dims user_dec_bias_dims + = { dec_n_layers, 1, lstm_n_gates, feature_size }; + memory::dims dec_src_layer_dims = { 1, batch, feature_size }; + memory::dims dec_dst_layer_dims = { 1, batch, feature_size }; + + // We will use the same memory for dec_src_iter and dec_dst_iter + // However, dec_src_iter has a context vector but not + // dec_dst_iter. + // To resolve this we will create one memory that holds the + // context vector as well as both the hidden and cell states. + // For the dst_iter, we will use a view on this memory. + // Note that the cell state will be padded by + // feature_size values. However, we do not compute or + // access those. + memory::dims dec_dst_iter_dims = { dec_n_layers, 1, lstm_n_states, batch, + feature_size + feature_size }; + memory::dims dec_dst_iter_noctx_dims + = { dec_n_layers, 1, lstm_n_states, batch, feature_size }; + + auto user_dec_wei_layer_md = memory::desc({ user_dec_wei_layer_dims }, + memory::data_type::f32, memory::format::ldigo); + auto user_dec_wei_iter_md = memory::desc({ user_dec_wei_iter_dims }, + memory::data_type::f32, memory::format::ldigo); + auto user_dec_bias_md = memory::desc({ user_dec_bias_dims }, + memory::data_type::f32, memory::format::ldgo); + auto dec_src_layer_md = memory::desc( + { dec_src_layer_dims }, memory::data_type::u8, memory::format::tnc); + auto dec_dst_layer_md = memory::desc( + { dec_dst_layer_dims }, memory::data_type::u8, memory::format::tnc); + auto dec_dst_iter_md = memory::desc({ dec_dst_iter_dims }, + memory::data_type::f32, memory::format::ldsnc); + + auto user_dec_wei_layer_memory = memory( + { user_dec_wei_layer_md, cpu_engine }, user_dec_wei_layer.data()); + auto user_dec_wei_iter_memory = memory( + { user_dec_wei_iter_md, cpu_engine }, user_dec_wei_iter.data()); + auto user_dec_bias_memory + = memory({ user_dec_bias_md, cpu_engine }, user_dec_bias.data()); + auto dec_src_layer_memory = memory({ dec_src_layer_md, cpu_engine }); + auto dec_dst_layer_memory + = memory({ dec_dst_layer_md, cpu_engine }, dec_dst.data()); + + /* Create memory descriptors for RNN data w/o specified layout */ + auto dec_wei_layer_md = memory::desc({ user_dec_wei_layer_dims }, + memory::data_type::s8, memory::format::any); + auto dec_wei_iter_md = memory::desc({ user_dec_wei_iter_dims }, + memory::data_type::s8, memory::format::any); + + /* As mentioned above, we create a view without context out of the + memory with context.
*/ + auto dec_dst_iter_memory = memory({ dec_dst_iter_md, cpu_engine }); + auto dec_dst_iter_noctx_md + = view::primitive_desc(dec_dst_iter_memory.get_primitive_desc(), + dec_dst_iter_noctx_dims, { 0, 0, 0, 0, 0 }) + .dst_primitive_desc() + .desc(); + + rnn_cell::desc dec_cell(algorithm::vanilla_lstm); + rnn_forward::desc dec_ctx_desc(prop_kind::forward_inference, dec_cell, + rnn_direction::unidirectional_left2right, dec_src_layer_md, + dec_dst_iter_md, dec_wei_layer_md, dec_wei_iter_md, + user_dec_bias_md, dec_dst_layer_md, dec_dst_iter_noctx_md); + auto dec_ctx_prim_desc + = rnn_forward::primitive_desc(dec_ctx_desc, attr, cpu_engine); + + /* Create memory primitives for input data and use reorders to quantize + * values to int8 */ + auto dec_wei_layer_memory + = memory(dec_ctx_prim_desc.weights_layer_primitive_desc()); + auto dec_wei_layer_reorder_pd = reorder::primitive_desc( + user_dec_wei_layer_memory.get_primitive_desc(), + dec_wei_layer_memory.get_primitive_desc(), attr); + weights_reorders.push_back(reorder(dec_wei_layer_reorder_pd, + user_dec_wei_layer_memory, dec_wei_layer_memory)); + + auto dec_wei_iter_memory + = memory(dec_ctx_prim_desc.weights_iter_primitive_desc()); + auto dec_wei_iter_reorder_pd = reorder::primitive_desc( + user_dec_wei_iter_memory.get_primitive_desc(), + dec_wei_iter_memory.get_primitive_desc(), attr); + weights_reorders.push_back(reorder(dec_wei_iter_reorder_pd, + user_dec_wei_iter_memory, dec_wei_iter_memory)); + + decoder_net.push_back(rnn_forward(dec_ctx_prim_desc, dec_src_layer_memory, + dec_dst_iter_memory, dec_wei_layer_memory, dec_wei_iter_memory, + user_dec_bias_memory, dec_dst_layer_memory, dec_dst_iter_memory, + null_memory_)); + + /* Allocating temporary buffers for attention mechanism */ + std::vector<float> weighted_annotations( + src_seq_length_max * batch * feature_size, 1.0f); + std::vector<int32_t> weights_attention_sum_rows(feature_size, 1); + + /* + Execution + */ + auto execute = [&]() { + // reorder weights to MKLDNN internal representation + stream(stream::kind::eager).submit(weights_reorders).wait(); + + // run encoder (1 stream) + stream(stream::kind::eager).submit(encoder_net).wait(); + + // compute the weighted annotations once before the decoder + compute_weighted_annotations(weighted_annotations.data(), + src_seq_length_max, batch, feature_size, + user_weights_annotation.data(), + (float *)enc_dst_layer_memory.get_data_handle()); + // precompute compensation for s8u8s32 gemm in compute attention + compute_sum_of_rows(user_weights_attention_src_layer.data(), + feature_size, feature_size, weights_attention_sum_rows.data()); + + // We initialise src_layer to the embedding of </s>, which + // are assumed to be 0 here + memset(dec_src_layer_memory.get_data_handle(), 0, + dec_src_layer_memory.get_primitive_desc().get_size()); + // From now on, src points to the output of the last iteration + + for (int i = 0; i < tgt_seq_length_max; i++) { + uint8_t *src_att_layer_handle + = (uint8_t *)dec_src_layer_memory.get_data_handle(); + float *src_att_iter_handle + = (float *)dec_dst_iter_memory.get_data_handle(); + + // Compute attention context vector into the first layer src_iter + compute_attention(src_att_iter_handle, src_seq_length_max, batch, + feature_size, user_weights_attention_src_layer.data(), + weights_attention_scale, weights_attention_sum_rows.data(), + src_att_layer_handle, data_scale, data_shift, + (uint8_t *)enc_bidir_dst_layer_memory.get_data_handle(), + weighted_annotations.data(), + user_weights_alignments.data()); + + // copy the context
vectors to all layers of src_iter + copy_context(src_att_iter_handle, dec_n_layers, lstm_n_states, + batch, feature_size); + + // run the decoder iteration + stream(stream::kind::eager).submit(decoder_net).wait(); + + // Move the handle on the src/dst layer to the next iteration + auto dst_layer_handle + = (uint8_t *)dec_dst_layer_memory.get_data_handle(); + dec_src_layer_memory.set_data_handle(dst_layer_handle); + dec_dst_layer_memory.set_data_handle( + dst_layer_handle + batch * feature_size); + } + + }; + + execute(); +} + +int main(int argc, char **argv) { + try { + simple_net(); + std::cout << "ok\n"; + } catch (error &e) { + std::cerr << "status: " << e.status << std::endl; + std::cerr << "message: " << e.message << std::endl; + } + return 0; +} diff --git a/inference-engine/thirdparty/mkl-dnn/examples/simple_rnn_training.cpp b/inference-engine/thirdparty/mkl-dnn/examples/simple_rnn_training.cpp index d63e675..bde52ce 100644 --- a/inference-engine/thirdparty/mkl-dnn/examples/simple_rnn_training.cpp +++ b/inference-engine/thirdparty/mkl-dnn/examples/simple_rnn_training.cpp @@ -219,6 +219,14 @@ void simple_net() { memory::format::ldigo), cpu_engine }, user_common_weights_layer.data()); + std::vector<float> user_common_weights_iter( + tz_volume(common_weights_iter_dims), + 1.0f); + auto user_common_weights_iter_memory + = mkldnn::memory({ formatted_md(common_weights_iter_dims, + memory::format::ldigo), cpu_engine }, + user_common_weights_iter.data()); + std::vector<float> user_common_bias( tz_volume(common_bias_dims), 1.0f); @@ -325,10 +333,22 @@ void simple_net() { reorder_common_weights_layer = true; } - // Assume same memory would work for weights between leftmost and rightmost - // Allocate memory here based on the layout suggested by the primitive. - auto common_weights_iter_memory - = mkldnn::memory(leftmost_prim_desc.weights_iter_primitive_desc()); + auto common_weights_iter_memory = user_common_weights_iter_memory; + primitive common_weights_iter_reorder; + auto reorder_common_weights_iter = false; + if (memory::primitive_desc( + leftmost_prim_desc.weights_iter_primitive_desc()) + != memory::primitive_desc( + common_weights_iter_memory.get_primitive_desc()) + ) { + common_weights_iter_memory + = mkldnn::memory(leftmost_prim_desc.weights_iter_primitive_desc()); + common_weights_iter_reorder + = reorder(user_common_weights_iter_memory, + common_weights_iter_memory); + reorder_common_weights_iter = true; + } + auto common_bias_memory = user_common_bias_memory; primitive common_bias_reorder; @@ -426,6 +446,8 @@ void simple_net() { // Enqueue primitives for forward execution if (reorder_common_weights_layer) fwd_net.push_back(common_weights_layer_reorder); + if (reorder_common_weights_iter) + fwd_net.push_back(common_weights_iter_reorder); if (reorder_common_bias) fwd_net.push_back(common_bias_reorder); if (reorder_leftmost_dst_layer) diff --git a/inference-engine/thirdparty/mkl-dnn/examples/simple_training_net.c b/inference-engine/thirdparty/mkl-dnn/examples/simple_training_net.c index dbe1ac0..964308c 100644 --- a/inference-engine/thirdparty/mkl-dnn/examples/simple_training_net.c +++ b/inference-engine/thirdparty/mkl-dnn/examples/simple_training_net.c @@ -78,7 +78,7 @@ void _free(void *ptr) { } #endif -static size_t product(int *arr, size_t size) +static size_t product(ptrdiff_t *arr, size_t size) { size_t prod = 1; for (size_t i = 0; i < size; ++i) @@ -86,7 +86,7 @@ static size_t product(int *arr, size_t size) return prod; } -static void init_net_data(float *data, uint32_t dim, const int
*dims) +static void init_net_data(float *data, uint32_t dim, const ptrdiff_t *dims) { if (dim == 1) { for (int i = 0; i < dims[0]; ++i) { @@ -107,7 +107,7 @@ static void init_net_data(float *data, uint32_t dim, const int *dims) } } -static void init_data_memory(uint32_t dim, const int *dims, +static void init_data_memory(uint32_t dim, const ptrdiff_t *dims, mkldnn_memory_format_t user_fmt, mkldnn_data_type_t data_type, mkldnn_engine_t engine, float *data, @@ -177,8 +177,8 @@ mkldnn_status_t simple_net() mkldnn_engine_t engine; CHECK(mkldnn_engine_create(&engine, mkldnn_cpu, 0 /* idx */)); - int net_src_sizes[4] = { BATCH, IC, CONV_IH, CONV_IW }; - int net_dst_sizes[4] = { BATCH, OC, POOL_OH, POOL_OW }; + ptrdiff_t net_src_sizes[4] = { BATCH, IC, CONV_IH, CONV_IW }; + ptrdiff_t net_dst_sizes[4] = { BATCH, OC, POOL_OH, POOL_OW }; float *net_src = (float *)aligned_malloc(product(net_src_sizes,4)*sizeof(float), 64); @@ -195,12 +195,12 @@ mkldnn_status_t simple_net() * {BATCH, OC, CONV_OH, CONV_OW} * strides: {CONV_STRIDE, CONV_STRIDE} */ - int *conv_user_src_sizes = net_src_sizes; - int conv_user_weights_sizes[4] = { OC, IC, 11, 11 }; - int conv_bias_sizes[4] = { OC }; - int conv_user_dst_sizes[4] = { BATCH, OC, CONV_OH, CONV_OW }; - int conv_strides[2] = { CONV_STRIDE, CONV_STRIDE }; - int conv_padding[2] = { CONV_PAD, CONV_PAD }; + ptrdiff_t *conv_user_src_sizes = net_src_sizes; + ptrdiff_t conv_user_weights_sizes[4] = { OC, IC, 11, 11 }; + ptrdiff_t conv_bias_sizes[4] = { OC }; + ptrdiff_t conv_user_dst_sizes[4] = { BATCH, OC, CONV_OH, CONV_OW }; + ptrdiff_t conv_strides[2] = { CONV_STRIDE, CONV_STRIDE }; + ptrdiff_t conv_padding[2] = { CONV_PAD, CONV_PAD }; float *conv_src = net_src; float *conv_weights = (float *)aligned_malloc( @@ -394,10 +394,10 @@ mkldnn_status_t simple_net() * kernel: {3, 3} * strides: {POOL_STRIDE, POOL_STRIDE} */ - int32_t *pool_dst_sizes = net_dst_sizes; - int32_t pool_kernel[2] = { 3, 3 }; - int32_t pool_strides[2] = { POOL_STRIDE, POOL_STRIDE }; - int32_t pool_padding[2] = { POOL_PAD, POOL_PAD }; + ptrdiff_t *pool_dst_sizes = net_dst_sizes; + ptrdiff_t pool_kernel[2] = { 3, 3 }; + ptrdiff_t pool_strides[2] = { POOL_STRIDE, POOL_STRIDE }; + ptrdiff_t pool_padding[2] = { POOL_PAD, POOL_PAD }; /* create pooling src memory descriptor using dst descriptor * from previous primitive */ diff --git a/inference-engine/thirdparty/mkl-dnn/examples/simple_training_net.cpp b/inference-engine/thirdparty/mkl-dnn/examples/simple_training_net.cpp index 836a08b..070d3dd 100644 --- a/inference-engine/thirdparty/mkl-dnn/examples/simple_training_net.cpp +++ b/inference-engine/thirdparty/mkl-dnn/examples/simple_training_net.cpp @@ -44,7 +44,7 @@ void simple_net() memory::dims conv_bias_tz = { 96 }; memory::dims conv_dst_tz = { batch, 96, 55, 55 }; memory::dims conv_strides = { 4, 4 }; - auto conv_padding = { 0, 0 }; + memory::dims conv_padding = { 0, 0 }; std::vector conv_weights( std::accumulate(conv_weights_tz.begin(), conv_weights_tz.end(), 1, @@ -180,7 +180,7 @@ void simple_net() memory::dims pool_dst_tz = { batch, 96, 27, 27 }; memory::dims pool_kernel = { 3, 3 }; memory::dims pool_strides = { 2, 2 }; - auto pool_padding = { 0, 0 }; + memory::dims pool_padding = { 0, 0 }; /* create memory for pool dst data in user format */ auto pool_user_dst_memory = memory( diff --git a/inference-engine/thirdparty/mkl-dnn/include/mkldnn.h b/inference-engine/thirdparty/mkl-dnn/include/mkldnn.h index 73853ad..a0a2d1a 100644 --- a/inference-engine/thirdparty/mkl-dnn/include/mkldnn.h +++ 
b/inference-engine/thirdparty/mkl-dnn/include/mkldnn.h @@ -52,6 +52,7 @@ #endif #include "mkldnn_types.h" +#include "mkldnn_version.h" #endif /* DOXYGEN_SHOULD_SKIP_THIS */ #ifdef __cplusplus @@ -88,15 +89,15 @@ mkldnn_status_t MKLDNN_API mkldnn_primitive_desc_iterator_create_v2( const_mkldnn_primitive_desc_t hint_forward_primitive_desc); /** Iterates over primitive descriptors. Returns #mkldnn_iterator_ends if no - * more primitive descriptors are available */ + * more primitive descriptors are available. */ mkldnn_status_t MKLDNN_API mkldnn_primitive_desc_iterator_next( mkldnn_primitive_desc_iterator_t iterator); -/** Fetches current primitive descriptor. +/** Fetches the current primitive descriptor. * * @note - * fetched primitive descriptor should be deleted by user using - * mkldnn_primitive_desc_destroy() once becomes unneeded */ + * The user should delete the fetched primitive descriptor using + * mkldnn_primitive_desc_destroy() once it is no longer needed. */ mkldnn_primitive_desc_t MKLDNN_API mkldnn_primitive_desc_iterator_fetch( const_mkldnn_primitive_desc_iterator_t iterator); @@ -106,8 +107,8 @@ mkldnn_status_t MKLDNN_API mkldnn_primitive_desc_iterator_destroy( /** Creates a @p primitive_desc using @p op_desc, @p engine, and optionally a * hint primitive descriptor from forward propagation. The call is equivalent - * to create a primitive descriptor iterator, instantly fetch a primitive_desc - * and destroy the iterator. */ + * to creating a primitive descriptor iterator, immediately fetching a + * primitive descriptor, and then destroying the iterator. */ mkldnn_status_t MKLDNN_API mkldnn_primitive_desc_create( mkldnn_primitive_desc_t *primitive_desc, const_mkldnn_op_desc_t op_desc, mkldnn_engine_t engine, @@ -115,8 +116,8 @@ mkldnn_status_t MKLDNN_API mkldnn_primitive_desc_create( /** Creates a @p primitive_desc using @p op_desc, @p attr, @p engine, and * optionally a hint primitive descriptor from forward propagation. The call is - * equivalent to create a primitive descriptor iterator, instantly fetch a @p - * primitive_desc and destroy the iterator. */ + * equivalent to creating a primitive descriptor iterator, immediately fetching + * a primitive descriptor, and then destroying the iterator. */ mkldnn_status_t MKLDNN_API mkldnn_primitive_desc_create_v2( mkldnn_primitive_desc_t *primitive_desc, const_mkldnn_op_desc_t op_desc, const_mkldnn_primitive_attr_t attr, @@ -131,11 +132,12 @@ mkldnn_status_t MKLDNN_API mkldnn_primitive_desc_clone( /** Returns a constant reference to the attribute of a @p primitive_desc. * * @warning - * User should not destroy obtained @p attr + * The user should not destroy the obtained @p attr. * * @warning - * The lifetime of an @p attr is same as @p primitive_desc, so it is - * illegal to use the @p attr once @p primitive_desc is destroyed */ + * The lifetime of an @p attr is the same as that of a @p primitive_desc, + * so it is illegal to use the @p attr once @p primitive_desc has been + * destroyed. 
*/ mkldnn_status_t MKLDNN_API mkldnn_primitive_desc_get_attr( const_mkldnn_primitive_desc_t primitive_desc, const_mkldnn_primitive_attr_t *attr); @@ -147,7 +149,7 @@ mkldnn_status_t MKLDNN_API mkldnn_primitive_desc_destroy( /** Queries primitive descriptor * * One of the most typical use cases is to query a convolution primitive - * descriptor created with source, weights and destination formats equal + * descriptor created with source, weights, and destination formats equal * to #mkldnn_any about the corresponding memory primitive descriptors * (@p what equals #mkldnn_query_src_pd, #mkldnn_query_weights_pd, and * #mkldnn_query_dst_pd respectively) to be able to prepare memory and @@ -155,15 +157,15 @@ mkldnn_status_t MKLDNN_API mkldnn_primitive_desc_destroy( * * Another quite typical use case is to query an operation primitive * descriptor for a workspace (@p what equals #mkldnn_query_workspace_pd). - * Returned status #mkldnn_not_required indicates that workspace is + * The returned status #mkldnn_not_required indicates that a workspace is * not required. * - * Few other possibilities: + * A few other possibilities: * - query a memory primitive descriptor for the underlying memory * descriptor (#mkldnn_query_memory_d) * - query an operation primitive descriptor for the underlying operation * descriptor (#mkldnn_query_convolution_d, #mkldnn_query_eltwise_d, - * #mkldnn_query_rnn_d, etc) + * #mkldnn_query_rnn_d, etc.) * - query an operation primitive descriptor for the implementation * information string (#mkldnn_query_impl_info_str) * - query an operation primitive descriptor for the number of inputs and @@ -178,7 +180,7 @@ mkldnn_status_t MKLDNN_API mkldnn_primitive_desc_query( /** Queries primitive descriptor for memory descriptor * - * @returns NULL in case of any error (in particular if queried entity is + * @returns NULL in case of any error (in particular if the queried entity is * not of type mkldnn_memory_desc_t). * * This is just a specialized version of mkldnn_primitive_desc_query @@ -189,16 +191,16 @@ const mkldnn_memory_desc_t MKLDNN_API *mkldnn_primitive_desc_query_memory_d( /** Queries primitive descriptor for primitive descriptor * - * @returns NULL in case of any error (in particular if queried entity is + * @returns NULL in case of any error (in particular if the queried entity is * not of type const_mkldnn_primitive_desc_t). * * This is just a specialized version of mkldnn_primitive_desc_query * used for convenience. * - * Example: query an operation primitive descriptor for a workspace + * Example: Query an operation primitive descriptor for a workspace * (@p what equals #mkldnn_query_workspace_pd). Returned - * NULL indicates the primitive does not require a workspace. - * Otherwise a user should prepare the workspace and pass it + * NULL indicates that the primitive does not require a workspace. + * Otherwise, a user should prepare the workspace and pass it * to the corresponding primitive. */ const_mkldnn_primitive_desc_t MKLDNN_API mkldnn_primitive_desc_query_pd( @@ -207,7 +209,7 @@ const_mkldnn_primitive_desc_t MKLDNN_API mkldnn_primitive_desc_query_pd( /** Queries primitive descriptor for signed 32bit int * - * @returns 0 in case of any error (in particular if queried entity is + * @returns 0 in case of any error (in particular if the queried entity is * not of type int32_t). Note that 0 might also be the actual returned * value. * @@ -230,8 +232,8 @@ mkldnn_status_t MKLDNN_API mkldnn_primitive_create( * primitive. 
* * @warning - * Returned object must not be destroyed by user. 'const' qualifier of the - * returned object prevents such attempts. */ + * The returned object must not be destroyed by the user. The @c const + * qualifier of the returned object prevents such attempts. */ mkldnn_status_t MKLDNN_API mkldnn_primitive_get_primitive_desc( const_mkldnn_primitive_t primitive, const_mkldnn_primitive_desc_t *primitive_desc); @@ -252,7 +254,7 @@ mkldnn_status_t MKLDNN_API mkldnn_primitive_destroy( /** Creates an #mkldnn_primitive_at_t structure from a @p primitive and @p * output_index. This function only fills in the data structure - * and does not check whether parameters are correct. The actual error checking + * and does not check whether arguments are correct. The actual error checking * is done when the resulting #mkldnn_primitive_at structure is passed to a * primitive creation function. */ mkldnn_primitive_at_t MKLDNN_API mkldnn_primitive_at( @@ -264,11 +266,11 @@ mkldnn_primitive_at_t MKLDNN_API mkldnn_primitive_at( * An extension for controlling primitive behavior. * @{ */ -/** Creates an empty (default) @p attr attribute. All the parameters set to +/** Creates an empty (default) @p attr attribute. All the parameters are set to * default values. * - * An empty attribute is used in primitive descriptor creating whenever it is - * not passed explicitly, e.g. in mkldnn_primitive_desc_create. + * An empty attribute is used in primitive descriptor creation whenever it + * is not passed explicitly, e.g. in mkldnn_primitive_desc_create. */ mkldnn_status_t MKLDNN_API mkldnn_primitive_attr_create( mkldnn_primitive_attr_t *attr); @@ -295,17 +297,17 @@ mkldnn_status_t MKLDNN_API mkldnn_primitive_attr_get_int_output_round_mode( mkldnn_status_t MKLDNN_API mkldnn_primitive_attr_set_int_output_round_mode( mkldnn_primitive_attr_t attr, mkldnn_round_mode_t round_mode); -/** Returns @p count, correspondence scale @p mask, and pointer to a constant +/** Returns @p count, correspondence scale @p mask, and a pointer to a constant * floating point array of output @p scales for given @p attr, previously set * by mkldnn_primitive_attr_set_output_scales. * * @warning - * @p scales array points to the internal @p attr field, so user should - * not modify/destroy @p scales. + * The @p scales array points to the internal @p attr field, so the user + * should not modify or destroy @p scales. * * @warning - * The lifetime of @p scales is same as @p attr it belongs to, so it is - * illegal to use the @p scales after @p attr is destroyed + * The lifetime of @p scales is the same as that of the @p attr to which it + * belongs, so it is illegal to use @p scales after @p attr is destroyed. */ mkldnn_status_t MKLDNN_API mkldnn_primitive_attr_get_output_scales( const_mkldnn_primitive_attr_t attr, int *count, int *mask, @@ -314,10 +316,11 @@ mkldnn_status_t MKLDNN_API mkldnn_primitive_attr_get_output_scales( /** Sets output @p scales for primitive operations. The number of elements @p * count and correspondence scale @p mask are stored for future use. * - * The @p mask argument defines correspondence between output tensor dimensions - * and the @p scales array. Set i-th bit of @p mask to 1 to use dedicated - * scaling factor for each slice of the output tensor over i-th dimension. Set - * @p mask to 0 to use common scaling factor for the whole output tensor. + * The @p mask argument defines the correspondence between the output tensor + * dimensions and the @p scales array. 
Set the i-th bit of @p mask to 1 to use a + * dedicated scaling factor for each slice of the output tensor over the i-th + * dimension. Set @p mask to 0 to use a common scaling factor for the whole + * output tensor. * * @note * The dimension order is always native and does not depend on the actual @@ -344,8 +347,8 @@ mkldnn_status_t MKLDNN_API mkldnn_primitive_attr_get_output_scales( * * @note * There is no way to check that @p count corresponds to @p mask until an - * actual primitive descriptor is created, so it is user's responsibility - * to set proper values. The following formula must be hold: + * actual primitive descriptor is created, so it is the user's + * responsibility to set proper values. The following formula must hold: * * \f[count = \prod\limits_{d \in mask} output.dims[d]\f] */ @@ -353,31 +356,31 @@ mkldnn_status_t MKLDNN_API mkldnn_primitive_attr_set_output_scales( mkldnn_primitive_attr_t attr, int count, int mask, const float *scales); -/** Returns @p post_ops for given attr. +/** Returns @p post_ops for given @p attr. * * @warning - * @p post_ops points to the internal @p attr field, so user should not - * modify/destroy @p post_ops. Also the lifetime of @p post_ops is the - * same as @p attr it belongs to, so it is illegal to use @p post_ops once - * @p attr is destroyed. + * @p post_ops points to the internal @p attr field, so the user should not + * modify or destroy @p post_ops. Also, the lifetime of @p post_ops is the + * same as that of the @p attr it belongs to, so it is illegal to use @p + * post_ops after @p attr has been destroyed. */ mkldnn_status_t MKLDNN_API mkldnn_primitive_attr_get_post_ops( const_mkldnn_primitive_attr_t attr, const_mkldnn_post_ops_t *post_ops); /** Sets configured @p post_ops to an attribute @p attr for future use (when - * primitive descriptor is being created. + * primitive descriptor is being created). * * @note - * At this point of time there is no way to check whether primitive - * descriptor does or does not support given sequence of post operations. - * That means that user should handle an error that might happen at + * At this point in time, there is no way to check whether the primitive + * descriptor does or does not support a given sequence of post operations. + * Therefore the user should handle an error that might occur at the * mkldnn_primitive_desc_create call. */ mkldnn_status_t MKLDNN_API mkldnn_primitive_attr_set_post_ops( mkldnn_primitive_attr_t attr, const_mkldnn_post_ops_t post_ops); /** @addtogroup c_api_attributes_post_ops Sequence of post operations - * An extension for performing extra operations after base operation. + * An extension for performing extra operations after a base operation. * @{ */ /** Creates an empty sequence of post operations @p post_ops. */ @@ -390,19 +393,19 @@ mkldnn_status_t MKLDNN_API mkldnn_post_ops_destroy(mkldnn_post_ops_t post_ops); int MKLDNN_API mkldnn_post_ops_len(const_mkldnn_post_ops_t post_ops); /** Returns the type of post operation with index @p index in given - * @p post_ops. In case of error returns #mkldnn_undefined_primitive. */ + * @p post_ops. In case of error, returns #mkldnn_undefined_primitive. */ mkldnn_primitive_kind_t MKLDNN_API mkldnn_post_ops_get_kind( const_mkldnn_post_ops_t post_ops, int index); /** Appends accumulation (sum) post operation to the @p post_ops. Prior to - * accumulating the result the previous value would be multiplied by @p scale. + * accumulating the result, the previous value would be multiplied by @p scale. 
* * The kind of this post operation is #mkldnn_sum. * - * This feature might improve performance for the cases like residual learning + * This feature might improve performance for cases like residual learning * blocks, where the result of convolution is accumulated to the previously - * computed activations. Scale parameter @p scale might be extremely for the - * integer-based computations, when the result and previous activations have + * computed activations. The parameter @p scale might be extreme for the + * integer-based computations when the result and previous activations have * different logical scaling factors. * * In the simplest case when the accumulation is the only post operation, the @@ -410,9 +413,10 @@ mkldnn_primitive_kind_t MKLDNN_API mkldnn_post_ops_get_kind( * dst[] <- scale * dst[] + op(...) // instead of dst[] <- op(...) * * @note - * This post op (as well as all the others) disregards the original layout - * of dst, i.e. the layout of the original dst is expected to be the same - * as the layout of stored dst. + * This post operation (as well as all the others) disregards the original + * layout of the destination; that is, the layout of the original + * destination is expected to be the same as the layout of the stored + * destination. */ mkldnn_status_t MKLDNN_API mkldnn_post_ops_append_sum( mkldnn_post_ops_t post_ops, float scale); @@ -422,13 +426,13 @@ mkldnn_status_t MKLDNN_API mkldnn_post_ops_append_sum( * * @note * If index @p index would not correspond to the accumulation post - * operation, the function return #mkldnn_invalid_arguments. + * operation, the function returns #mkldnn_invalid_arguments. */ mkldnn_status_t MKLDNN_API mkldnn_post_ops_get_params_sum( const_mkldnn_post_ops_t post_ops, int index, float *scale); /** Appends eltwise post operation to the @p post_ops with given parameters - * @p kind, @p alpha and @p beta (@sa mkldnn_eltwise_forward_desc_init and + * @p kind, @p alpha, and @p beta (@sa mkldnn_eltwise_forward_desc_init and * mkldnn_eltwise_desc_t). * * The kind of this post operation is #mkldnn_eltwise. @@ -436,7 +440,7 @@ mkldnn_status_t MKLDNN_API mkldnn_post_ops_get_params_sum( * In the simplest case when the eltwise is the only post operation, the * computations would be: * dst[] <- scale * eltwise_op ( op(...) ) // instead of dst[] <- op(...) - * where eltwise_op is configured with given parameters. + * where eltwise_op is configured with the given parameters. */ mkldnn_status_t MKLDNN_API mkldnn_post_ops_append_eltwise( mkldnn_post_ops_t post_ops, float scale, mkldnn_alg_kind_t alg, @@ -489,6 +493,27 @@ mkldnn_status_t MKLDNN_API mkldnn_post_ops_get_params_dw_conv( int* ker_h, int* ker_w, int* str_h, int* str_w, const float** weights_data, const float** biases_data); +/** Appends binarization post operation to the @p post_ops with given parameters + * @p kind and @p weights (@sa mkldnn_binarization_forward_desc_init and + * mkldnn_binarization_desc_t). + * + * The kind of this post operation is #mkldnn_binarization. + * + * In the simplest case when the binarization is the only post operation, the + * computations would be: + * dst[] <- binarization_op ( op(...) ) // instead of dst[] <- op(...) + * where binarization_op is configured with given parameters. + */ +mkldnn_status_t MKLDNN_API mkldnn_post_ops_append_binarization( + mkldnn_post_ops_t post_ops, mkldnn_alg_kind_t alg, const float* weights_data); + +/** Gets the binarization parameters of the post operation with index @p index in + * the sequence of @p post_ops. 
+ */ +mkldnn_status_t MKLDNN_API mkldnn_post_ops_get_params_binarization( + const_mkldnn_post_ops_t post_ops, int index, + mkldnn_alg_kind_t *alg, const float** weights_data); + /** @} */ /** @} */ @@ -499,21 +524,21 @@ mkldnn_status_t MKLDNN_API mkldnn_post_ops_get_params_dw_conv( * The library supports various data types and formats. Memory hierarchy * consists of three levels of abstraction: * 1. **Memory descriptor** -- engine agnostic logical description of data - * (number of dimensions, dimensions themselves and data type), and + * (number of dimensions, dimensions themselves, and data type), and * optionally the format/layout that describes the physical representation - * of data in memory. If the format/layout is not known yet one can pass - * #mkldnn_any. This approach is used to allow compute intensive - * primitives to specify the most appropriate layout on their own with - * users required to reorder the data if the incoming layout doesn't match - * the primitive's selection. Memory descriptor can be created with + * of data in memory. If the format is not known yet, one can pass + * #mkldnn_any. This approach is used to allow compute-intensive + * primitives to specify the most appropriate format on their own with + * users required to reorder the data if the incoming format doesn't match + * the primitive's selection. Memory descriptor can be created with the * mkldnn_memory_desc_init() function or by directly filling the - * mkldnn_memory_desc_t structure. The later requires deep knowledge of + * mkldnn_memory_desc_t structure. The latter requires deep knowledge of * how the physical data representation is mapped to the structure. The * @ref understanding_memory_formats topic should shed some light on that. * 2. **Memory primitive descriptor** -- logical description of data that is - * fully defined, i.e. cannot contain #mkldnn_any as a format. It also - * has the engine specified. A memory primitive descriptor is created by - * calling mkldnn_memory_primitive_desc_create() with two arguments: an + * fully defined; that is, it cannot contain #mkldnn_any as a format. It + * also has the engine specified. A memory primitive descriptor is created + * by calling mkldnn_memory_primitive_desc_create() with two arguments: an * mkldnn_memory_desc_t and an mkldnn_engine_t. It has the same type as * other primitive descriptors and can be: * - queried to return the underlying memory descriptor using @@ -521,51 +546,52 @@ mkldnn_status_t MKLDNN_API mkldnn_post_ops_get_params_dw_conv( * mkldnn_primitive_desc_query_memory_d(). * - compared with another memory primitive descriptor using * mkldnn_memory_primitive_desc_equal(). This is especially useful when - * checking whether a primitive requires reorder from user's data layout - * to the primitive's one. + * checking whether a primitive requires reorder from the user's data + * format to the primitive's format. * - queried to return the size of the data using * mkldnn_memory_primitive_desc_get_size(). As described in - * @ref understanding_memory_formats the size of data sometimes cannot - * be computed as a product of dimensions times the size of data type. - * So users are encouraged to use this function to have better code + * @ref understanding_memory_formats, the size of data sometimes cannot + * be computed as the product of dimensions times the size of the data + * type. So users are encouraged to use this function for better code * portability. * 3. 
**Memory primitive** or simply **memory** -- a pseudo-primitive that is * defined by a memory primitive descriptor and a handle to the data - * itself (in case of CPU engine the handle is simply a pointer `void*`). - * The data handle can be queried using mkldnn_memory_get_data_handle() - * and be set using mkldnn_memory_set_data_handle(). The latter function - * always sets the memory in the padding region to zero which is the - * invariant maintained by all the primitives in Intel MKL-DNN. See + * itself. (In the case of CPU engine, the handle is simply a pointer to + * @c void.) The data handle can be queried using + * mkldnn_memory_get_data_handle() and set using + * mkldnn_memory_set_data_handle(). The latter function always sets the + * memory in the padding region to zero, which is the invariant maintained + * by all the primitives in Intel MKL-DNN. See * @ref understanding_memory_formats for more details. * A memory primitive can be created using mkldnn_primitive_create() with * empty inputs and outputs. In this case, the memory primitive's data - * handle needs to be set manually using mkldnn_memory_set_data_handle(). + * handle must be set manually using mkldnn_memory_set_data_handle(). * * Along with ordinary memory with all dimensions being positive, Intel * MKL-DNN supports *zero-volume* memory with one or more dimensions set to - * zero. This is to support NumPy\* convention. - * If a *zero-volume* memory is passed to a primitive, the primitive would + * zero. This is to support the NumPy\* convention. + * If a *zero-volume* memory is passed to a primitive, the primitive does * not perform any computations on this memory. For example: * - Convolution with `(0 batch, 3 input channels, 13 height, 13 width)` * source and `(16 output channels, 3 inputs, channel, 3 height, 3 width)` * weights would produce `(0 batch, 16 ouput channels, 11 height, 11 width)` * destination (assuming strides are `1` and paddings are zero) and perform * zero multiply-add operations. - * - Concatenation of 3 memories of shapes `(3, 4, 13, 13)`, `(3, 0, 13, 13)`, - * and `(3, 1, 13, 13)` along the second axis would produce the output of - * the shape `(3, 5, 13, 13)`, effectively ignoring the second input - * (however if user created a concatenation primitive descriptor with 3 - * inputs they should also provide all 3 memories to the concatenation - * primitive, including the one with zero second dimension). + * - Concatenation of three memories of shapes `(3, 4, 13, 13)`, + * `(3, 0, 13, 13)`, and `(3, 1, 13, 13)` along the second axis would produce + * the output of the shape `(3, 5, 13, 13)`, effectively ignoring the second + * input (however, if the user created a concatenation primitive descriptor + * with three inputs they should also provide all three memories to the + * concatenation primitive, including the one with zero second dimension). * - However, Intel MKL-DNN would return an error when attempting to create a - * convolution with *zero-volume* memory passed for weights because such + * convolution with *zero-volume* memory passed for weights because such a * convolution is not well-defined: * ~~~ * dst(1, 16, 11, 11) <-- src(1, 0, 13, 13) (*) wei(16, 0, 3, 3) * ~~~ * Should the values in the destination be zeroes or just not accessed at - * all? Moreover, backward pass w.r.t. weights in such cases is not - * well-defined as well. + * all? Moreover, backward pass w.r.t. weights in such cases is also not + * well-defined. 
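A minimal sketch of the zero-volume convention described above (eng is an assumed, already created engine; error handling omitted):

~~~
/* Sketch: a zero-volume source -- batch 0 is legal, and primitives
 * consuming it perform no computation. */
#include "mkldnn.h"

void zero_volume_sketch(mkldnn_engine_t eng) {
    mkldnn_memory_desc_t md;
    int dims[4] = { 0, 3, 13, 13 };     /* zero batch => zero volume */
    mkldnn_memory_desc_init(&md, 4, dims, mkldnn_f32, mkldnn_nchw);

    mkldnn_primitive_desc_t mpd;
    mkldnn_memory_primitive_desc_create(&mpd, &md, eng);
    /* The data handle of this memory may stay NULL: it is never read. */
    mkldnn_primitive_desc_destroy(mpd);
}
~~~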
* * Data handle of *zero-volume* memory is never accessed and hence can be * unset (NULL in case of CPU engine). @@ -581,15 +607,16 @@ mkldnn_status_t MKLDNN_API mkldnn_memory_desc_init( mkldnn_data_type_t data_type, mkldnn_memory_format_t format); /** Creates a @p memory_primitive_desc memory primitive descriptor using @p - * memory_desc and @p engine. @p memory_desc cannot be uncertain, that is, - * initialized with #mkldnn_any. */ + * memory_desc and @p engine. @p memory_desc cannot be uncertain; that is, it + * cannot be initialized with #mkldnn_any. */ mkldnn_status_t MKLDNN_API mkldnn_memory_primitive_desc_create( mkldnn_primitive_desc_t *memory_primitive_desc, const mkldnn_memory_desc_t *memory_desc, mkldnn_engine_t engine); /** Creates a @p view_primitive_desc for a given @p memory_primitive_desc, with - * @p dims sizes and @p offset offsets. May fail if layout used does not allow - * obtain desired view. In this case consider using extract primitive */ + * @p dims sizes and @p offsets offsets. May fail if the format used does not + * allow obtaining the desired view. In this case, consider using the extract + * primitive. */ mkldnn_status_t MKLDNN_API mkldnn_view_primitive_desc_create( mkldnn_primitive_desc_t *view_primitive_desc, const_mkldnn_primitive_desc_t memory_primitive_desc, @@ -660,13 +687,13 @@ mkldnn_status_t MKLDNN_API mkldnn_reorder_primitive_desc_create_v2( /** @} */ /** @addtogroup c_api_concat Concat - * A primitive to concatenate data by arbitrary dimension + * A primitive to concatenate data by arbitrary dimension. * @{ */ /** Creates out-of-place @p concat_primitive_desc for concatenation of @p n * inputs by @p concat_dimension with resulting @p output_desc memory - * descriptor. @p output_desc can be NULL or be specified with #mkldnn_any - * format -- in this case appropriate memory format would be chosen + * descriptor. @p output_desc can be NULL or specified with the #mkldnn_any + * format -- in this case, the appropriate memory format would be chosen * automatically. * * Order of inputs: @@ -684,28 +711,28 @@ mkldnn_status_t MKLDNN_API mkldnn_concat_primitive_desc_create( const_mkldnn_primitive_desc_t *input_pds); #if 0 -/** Creates in-place @p concat_primitive_desc for given @p n @p inputs memory - * primitive descriptors along @p concat_dimension. All inputs must have the - * same memory format. Output memory format would be the same. Likewise - * view_primitive_desc_create the call may fail, if memory format of inputs do - * not allow inplace concatenation for given sizes. +/** Creates in-place @p concat_primitive_desc for given @p n and @p inputs + * memory primitive descriptors along @p concat_dimension. All inputs must have + * the same memory format. Output memory format would be the same. Likewise, the + * view_primitive_desc_create call may fail if the memory format of the inputs + * does not allow in-place concatenation for the given sizes. * - * @note this primitive is more like a synchronization stub for concatenation, - * since concat_inplace does no operation during execution. + * @note This primitive is more like a synchronization stub for concatenation, + * because concat_inplace performs no operation during execution. * - * @note since not operation happens user must ensure that input */ + * @note Because no operation occurs, the user must ensure the input. 
*/ mkldnn_status_t MKLDNN_API mkldnn_concat_inplace_by_input_primitive_desc_create( mkldnn_primitive_desc_t *concat_primitive_desc, int n, int concat_dimension, const_mkldnn_primitive_desc_t *inputs); /** Creates in-place @p concat_primitive_desc for given @p output memory - * descriptor and @n inputs with @p sizes sizes along @p concat_dimension. As - * opposed to out-of-place concatenation @p output must be fully defined here. - * Likewise view_primitive_desc_create the call may fail, because given memory - * format does not allow inplace concatenation for given sizes. + * descriptor and @p n inputs with @p sizes sizes along @p concat_dimension. + * Unlike out-of-place concatenation, @p output must be fully defined here. + * Likewise, the view_primitive_desc_create call may fail if the given memory + * format does not allow in-place concatenation for the given sizes. * - * @note this primitive is more like a synchronization stub for concatenation, - * since concat_inplace does no operation during execution. */ + * @note This primitive is more like a synchronization stub for concatenation, + * because concat_inplace performs no operation during execution. */ mkldnn_status_t MKLDNN_API mkldnn_concat_inplace_by_output_primitive_desc_create( mkldnn_primitive_desc_t *concat_primitive_desc, const mkldnn_primitive_desc_t output, int n, int concat_dimension, @@ -715,13 +742,13 @@ mkldnn_status_t MKLDNN_API mkldnn_concat_inplace_by_output_primitive_desc_create /** @} */ /** @addtogroup c_api_sum Sum - * A primitive to sum data + * A primitive to sum data. * @{ */ /** Creates out-of-place @p sum_primitive_desc for sum of @p n * inputs multiplied by scale with resulting @p output_desc memory - * descriptor. @p output_desc can be NULL or be specified with #mkldnn_any - * format -- in this case appropriate memory format would be chosen + * descriptor. @p output_desc can be NULL or specified with the #mkldnn_any + * format -- in this case, the appropriate memory format would be chosen * automatically. * * Order of inputs: @@ -761,15 +788,15 @@ mkldnn_status_t MKLDNN_API mkldnn_sum_primitive_desc_create( * @{ */ /** Initializes a convolution descriptor @p conv_desc for forward propagation - * using @p prop_kind (possible values are #mkldnn_forward_training or + * using @p prop_kind (possible values are #mkldnn_forward_training and * #mkldnn_forward_inference), @p alg_kind, memory descriptors, @p strides, @p * padding_l, @p padding_r, and @p padding_kind. In order to create a - * convolution without bias, @p bias_desc should be either @c NULL or point to - * a descriptor with memory format equals to #mkldnn_format_undef. + * convolution without bias, @p bias_desc should either be @c NULL or point to + * a descriptor with memory format equal to #mkldnn_format_undef. * - * @note if @p padding_r is @c NULL, the padding is supposed to be symmetric + * @note If @p padding_r is @c NULL, the padding is supposed to be symmetric. * - * @note memory descriptors are allowed to be initialized with #mkldnn_any + * @note Memory descriptors are allowed to be initialized with #mkldnn_any * value of @p format_kind.
* * Order of inputs: @@ -791,15 +818,15 @@ mkldnn_status_t MKLDNN_API mkldnn_convolution_forward_desc_init( /** Initializes a dilated convolution descriptor @p conv_desc for forward * propagation using @p prop_kind (possible values are #mkldnn_forward_training - * or #mkldnn_forward_inference), @p alg_kind, memory descriptors, @p strides, + * and #mkldnn_forward_inference), @p alg_kind, memory descriptors, @p strides, * @p dilates, @p padding_l, @p padding_r, and @p padding_kind. * In order to create a dilated convolution without bias, @p bias_desc - * should be either @c NULL or point to a descriptor with memory format equals + * should either be @c NULL or point to a descriptor with memory format equal * to #mkldnn_format_undef. * - * @note if @p padding_r is @c NULL, the padding is supposed to be symmetric + * @note If @p padding_r is @c NULL, the padding is supposed to be symmetric. * - * @note memory descriptors are allowed to be initialized with #mkldnn_any + * @note Memory descriptors are allowed to be initialized with #mkldnn_any * value of @p format_kind. * * Order of inputs: @@ -823,7 +850,7 @@ mkldnn_status_t MKLDNN_API mkldnn_dilated_convolution_forward_desc_init( * with respect to data using @p alg_kind, memory descriptors, @p strides, @p * padding_l, @p padding_r, and @p padding_kind. * - * @note memory descriptors are allowed to be initialized with #mkldnn_any + * @note Memory descriptors are allowed to be initialized with #mkldnn_any * value of @p format_kind. * * Order of inputs: @@ -845,7 +872,7 @@ mkldnn_status_t MKLDNN_API mkldnn_convolution_backward_data_desc_init( * propagation with respect to data using @p alg_kind, memory descriptors, @p * strides, @p dilates @p padding_l, @p padding_r, and @p padding_kind. * - * @note memory descriptors are allowed to be initialized with #mkldnn_any + * @note Memory descriptors are allowed to be initialized with #mkldnn_any * value of @p format_kind. * * Order of inputs: @@ -867,7 +894,7 @@ mkldnn_status_t MKLDNN_API mkldnn_dilated_convolution_backward_data_desc_init( * with respect to weights using @p alg_kind, memory descriptors, @p strides, * @p padding_l, @p padding_r, and @p padding_kind. * - * @note memory descriptors are allowed to be initialized with #mkldnn_any + * @note Memory descriptors are allowed to be initialized with #mkldnn_any * value of @p format_kind. * * Order of inputs: @@ -891,7 +918,7 @@ mkldnn_status_t MKLDNN_API mkldnn_convolution_backward_weights_desc_init( * with respect to weights using @p alg_kind, memory descriptors, @p strides, * @p dilates @p padding_l, @p padding_r, and @p padding_kind. * - * @note memory descriptors are allowed to be initialized with #mkldnn_any + * @note Memory descriptors are allowed to be initialized with #mkldnn_any * value of @p format_kind. * * Order of inputs: @@ -920,16 +947,16 @@ mkldnn_dilated_convolution_backward_weights_desc_init( * @{ */ -/** Initializes a deconvolution descriptor @p deconv_desc for forward propagation - * using @p prop_kind (possible values are #mkldnn_forward_training or - * #mkldnn_forward_inference), @p alg_kind, memory descriptors, @p strides, @p - * padding_l, @p padding_r, and @p padding_kind. In order to create a - * deconvolution without bias, @p bias_desc should be either @c NULL or point to - * a descriptor with memory format equals to #mkldnn_format_undef. 
+/** Initializes a deconvolution descriptor @p deconv_desc for forward + * propagation using @p prop_kind (possible values are #mkldnn_forward_training + * and #mkldnn_forward_inference), @p alg_kind, memory descriptors, @p strides, + * @p padding_l, @p padding_r, and @p padding_kind. In order to create a + * deconvolution without bias, @p bias_desc should either be @c NULL or point to + * a descriptor with memory format equal to #mkldnn_format_undef. * - * @note if @p padding_r is @c NULL, the padding is supposed to be symmetric + * @note If @p padding_r is @c NULL, the padding is supposed to be symmetric. * - * @note memory descriptors are allowed to be initialized with #mkldnn_any + * @note Memory descriptors are allowed to be initialized with #mkldnn_any * value of @p format_kind. * * Order of inputs: @@ -951,15 +978,15 @@ mkldnn_status_t MKLDNN_API mkldnn_deconvolution_forward_desc_init( /** Initializes a dilated deconvolution descriptor @p deconv_desc for forward * propagation using @p prop_kind (possible values are #mkldnn_forward_training - * or #mkldnn_forward_inference), @p alg_kind, memory descriptors, @p strides, + * and #mkldnn_forward_inference), @p alg_kind, memory descriptors, @p strides, * @p dilates, @p padding_l, @p padding_r, and @p padding_kind. In order to - * create a dilated deconvolution without bias, @p bias_desc should be either - * @c NULL or point to a descriptor with memory format equals to + * create a dilated deconvolution without bias, @p bias_desc should either be + * @c NULL or point to a descriptor with memory format equal to * #mkldnn_format_undef. * - * @note if @p padding_r is @c NULL, the padding is supposed to be symmetric + * @note If @p padding_r is @c NULL, the padding is supposed to be symmetric. * - * @note memory descriptors are allowed to be initialized with #mkldnn_any + * @note Memory descriptors are allowed to be initialized with #mkldnn_any * value of @p format_kind. * * Order of inputs: @@ -983,7 +1010,7 @@ mkldnn_status_t MKLDNN_API mkldnn_dilated_deconvolution_forward_desc_init( * with respect to data using @p alg_kind, memory descriptors, @p strides, @p * padding_l, @p padding_r, and @p padding_kind. * - * @note memory descriptors are allowed to be initialized with #mkldnn_any + * @note Memory descriptors are allowed to be initialized with #mkldnn_any * value of @p format_kind. * * Order of inputs: @@ -1005,7 +1032,7 @@ mkldnn_status_t MKLDNN_API mkldnn_deconvolution_backward_data_desc_init( * propagation with respect to data using @p alg_kind, memory descriptors, @p * strides, @p dilates, @p padding_l, @p padding_r, and @p padding_kind. * - * @note memory descriptors are allowed to be initialized with #mkldnn_any + * @note Memory descriptors are allowed to be initialized with #mkldnn_any * value of @p format_kind. * * Order of inputs: @@ -1027,7 +1054,7 @@ mkldnn_status_t MKLDNN_API mkldnn_dilated_deconvolution_backward_data_desc_init( * with respect to weights using @p alg_kind, memory descriptors, @p strides, * @p padding_l, @p padding_r, and @p padding_kind. * - * @note memory descriptors are allowed to be initialized with #mkldnn_any + * @note Memory descriptors are allowed to be initialized with #mkldnn_any * value of @p format_kind. * * Order of inputs: @@ -1051,7 +1078,7 @@ mkldnn_status_t MKLDNN_API mkldnn_deconvolution_backward_weights_desc_init( * propagation with respect to weights using @p alg_kind, memory descriptors, * @p strides, @p dilates, @p padding_l, @p padding_r, and @p padding_kind. 
* - * @note memory descriptors are allowed to be initialized with #mkldnn_any + * @note Memory descriptors are allowed to be initialized with #mkldnn_any * value of @p format_kind. * * Order of inputs: @@ -1078,8 +1105,7 @@ mkldnn_status_t MKLDNN_API mkldnn_dilated_deconvolution_backward_weights_desc_in * @{ */ /** Initializes a @p shuffle_desc for forward propagation using @p prop_kind, - * @p memory descriptor @p data_desc, @p axis and @p group - * number. + * memory descriptor @p data_desc, @p axis, and @p group_size. * * Order of inputs: * - src (#mkldnn_query_src_pd, 0) @@ -1092,8 +1118,8 @@ mkldnn_status_t MKLDNN_API mkldnn_shuffle_forward_desc_init( mkldnn_shuffle_desc_t *shuffle_desc, mkldnn_prop_kind_t prop_kind, const mkldnn_memory_desc_t *data_desc, int axis, int group_size); -/** Initializes a @p shuffle_desc for backward propagation using @p memory - * descriptor @p diff_data_desc, @p axis and @p group number. +/** Initializes a @p shuffle_desc for backward propagation using memory + * descriptor @p diff_data_desc, @p axis, and @p group_size. * * * Order of inputs: @@ -1110,27 +1136,27 @@ mkldnn_status_t MKLDNN_API mkldnn_shuffle_backward_desc_init( /** @} */ /** @addtogroup c_api_eltwise Eltwise - * A primitive to compute element wise operations like parametric rectifier + * A primitive to compute element-wise operations like parametric rectifier * linear unit (ReLU). * - * Both forward and backward passes support in-place operation, i.e. src - * and dst point to the same memory for forward, and diff_dst and diff_src + * Both forward and backward passes support in-place operation; that is, src + * and dst point to the same memory for forward pass, and diff_dst and diff_src * point to the same memory for backward pass. * - * @warning Since for backward pass original src is required, in-place forward - * pass in general cannot be applied during training. However for some kinds of - * element wise operations (namely ReLU with alpha parameter equals 0) dst and - * src can be interchangeable for the backward pass, which allows performing - * in-place forward even for training. + * @warning Because the original src is required for backward pass, in-place + * forward pass in general cannot be applied during training. However, for some + * kinds of element-wise operations (namely ReLU with the alpha parameter equal + * to 0), dst and src can be interchangeable for the backward pass, which enables + * performing in-place forward even for training. * * @{ */ -/** Initializes a @p eltwise_desc for forward propagation using @p prop_kind - * (possible values are #mkldnn_forward_training or #mkldnn_forward_inference), - * @p alg_kind algorithm, memory descriptor @p data_desc, and @p alpha, +/** Initializes an @p eltwise_desc for forward propagation using @p prop_kind + * (possible values are #mkldnn_forward_training and #mkldnn_forward_inference), + * @p alg_kind algorithm, memory descriptor @p data_desc, @p alpha, and + * @p beta parameters. * - * @sa mkldnn_eltwise_desc_t for details + * @sa mkldnn_eltwise_desc_t for details. * * Order of inputs: * - src (#mkldnn_query_src_pd, 0) @@ -1143,11 +1169,11 @@ mkldnn_status_t MKLDNN_API mkldnn_eltwise_forward_desc_init( mkldnn_alg_kind_t alg_kind, const mkldnn_memory_desc_t *data_desc, float alpha, float beta); -/** Initializes a @p eltwise_desc for backward propagation using @p alg_kind - * algorithm memory descriptors @p diff_data_desc and @p data_desc, and - * @p alpha, @p beta parameters.
+/** Initializes an @p eltwise_desc for backward propagation using @p alg_kind + * algorithm, memory descriptors @p diff_data_desc and @p data_desc, and the + * @p alpha and @p beta parameters. * - * @sa mkldnn_eltwise_desc_t for details + * @sa mkldnn_eltwise_desc_t for details. * * Order of inputs: * - src (#mkldnn_query_src_pd, 0) * - diff_dst (#mkldnn_query_diff_dst_pd, 0) * * Order of outputs: * - diff_src (#mkldnn_query_diff_src_pd, 0) */ mkldnn_status_t MKLDNN_API mkldnn_eltwise_backward_desc_init( @@ -1163,52 +1189,6 @@ mkldnn_status_t MKLDNN_API mkldnn_eltwise_backward_desc_init( /** @} */ -/** @addtogroup c_api_relu ReLU (deprecated, use Eltwise instead) - * A primitive to compute a parametric rectifier linear unit (ReLU). - * - * \f[dst[n][c][h][w] = \max(src[n][c][h][w], 0) + - * \min(src[n][c][h][w], 0) \cdot negative\_slope\f] - * @{ */ - -/** Initializes a @p relu_desc for forward propagation using @p prop_kind - * (possible values are #mkldnn_forward_training or #mkldnn_forward_inference), - * @p negative_slope and memory descriptor @p data_desc. - * - * @deprecated use mkldnn_eltwise_forward_desc_init() instead, with @p alpha - * equals @p negative_slope - * - * Order of inputs: - * - src (#mkldnn_query_src_pd, 0) - * - * Order of outputs: - * - dst (#mkldnn_query_dst_pd, 0) - */ -MKLDNN_DEPRECATED -mkldnn_status_t MKLDNN_API mkldnn_relu_forward_desc_init( - mkldnn_relu_desc_t *relu_desc, mkldnn_prop_kind_t prop_kind, - const mkldnn_memory_desc_t *data_desc, float negative_slope); - -/** Initializes a @p relu_desc for backward propagation using @p negative_slope - * and memory descriptors @p diff_data_desc and @p data_desc. - * - * @deprecated use mkldnn_eltwise_backward_desc_init() instead, with @p alpha - * equals @p negative_slope - * - * Order of inputs: - * - src (#mkldnn_query_src_pd, 0) - * - diff_dst (#mkldnn_query_diff_dst_pd, 0) - * - * Order of outputs: - * - diff_src (#mkldnn_query_diff_src_pd, 0) - */ -MKLDNN_DEPRECATED -mkldnn_status_t MKLDNN_API mkldnn_relu_backward_desc_init( - mkldnn_relu_desc_t *relu_desc, - const mkldnn_memory_desc_t *diff_data_desc, - const mkldnn_memory_desc_t *data_desc, float negative_slope); - -/** @} */ - /** @addtogroup c_api_depthwise Depthwise * A primitive to compute channel wise operations like scale and shift * @{ */ @@ -1237,7 +1217,7 @@ mkldnn_status_t MKLDNN_API mkldnn_depthwise_forward_desc_init( * @{ */ /** Initializes a @p softmax_desc for forward propagation using @p prop_kind - * (possible value are #mkldnn_forward_training or #mkldnn_forward_inference) + * (possible values are #mkldnn_forward_training and #mkldnn_forward_inference) * and memory descriptor @p data_desc. * * Order of inputs: @@ -1280,25 +1260,25 @@ mkldnn_status_t MKLDNN_API mkldnn_softmax_backward_desc_init( * \frac{1}{KW \cdot KH}\sum\limits_{kw,kh} * src[n][ic][oh \cdot s_h - p_l[0] + kh][ow \cdot s_w - p_r[1] + kw],\f] * - * where \f$p_l, p_r\f$ are @p padding_l and @p padding_r - * respectively and output spatial dimensions are calculated - * similarly as done in convolution. + * where \f$p_l, p_r\f$ are @p padding_l and @p padding_r respectively, and + * output spatial dimensions are calculated similarly to how they are done in + * convolution. * - * During training max pooling requires workspace on forward + * During training, max pooling requires a workspace on forward * (#mkldnn_forward_training) and backward (#mkldnn_backward) passes to - * save indices where maximum was found. Workspace layout is opaque and - * the indices cannot be restored from it. However one can use backward + * save indices where maximum was found. The workspace layout is opaque, and + * the indices cannot be restored from it.
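Since the standalone ReLU API is removed above in favor of eltwise, a minimal sketch of the replacement (data_md is an assumed, fully defined memory descriptor; alpha plays the former negative_slope role):

~~~
/* Sketch: leaky ReLU via the eltwise primitive descriptor. */
#include "mkldnn.h"

void relu_via_eltwise_sketch(const mkldnn_memory_desc_t *data_md) {
    mkldnn_eltwise_desc_t relu_d;
    mkldnn_eltwise_forward_desc_init(&relu_d, mkldnn_forward_inference,
            mkldnn_eltwise_relu, data_md, /*alpha=*/0.01f, /*beta=*/0.f);
    /* ... create a primitive descriptor and primitive from relu_d ... */
}
~~~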
However, one can use backward * pooling to perform up-sampling (used in some detection topologies). * * @{ */ /** Initializes a pooling descriptor @p pool_desc for forward propagation using - * @p prop_kind (possible values are #mkldnn_forward_training or + * @p prop_kind (possible values are #mkldnn_forward_training and * #mkldnn_forward_inference), @p alg_kind, memory descriptors, and pooling - * parameters in spatial domain: @p strides, @p kernel sizes, @p padding_l, @p - * padding_r, and @p padding_kind. + * parameters in the spatial domain: @p strides, @p kernel sizes, @p padding_l, + * @p padding_r, and @p padding_kind. * - * @note if @p padding_r is @c NULL, the padding is supposed to be symmetric + * @note If @p padding_r is @c NULL, the padding is supposed to be symmetric. * * Order of inputs: * - src (#mkldnn_query_src_pd, 0) @@ -1317,11 +1297,11 @@ mkldnn_status_t MKLDNN_API mkldnn_pooling_forward_desc_init( const mkldnn_dims_t padding_r, mkldnn_padding_kind_t padding_kind); /** Initializes a pooling descriptor @p pool_desc for backward propagation - * using @p alg_kind, memory descriptors, and pooling parameters in spatial + * using @p alg_kind, memory descriptors, and pooling parameters in the spatial * domain: @p strides, @p kernel sizes, @p padding_l, @p padding_r, and @p * padding_kind. * - * @note if @p padding_r is @c NULL, the padding is supposed to be symmetric + * @note If @p padding_r is @c NULL, the padding is supposed to be symmetric. * * Order of inputs: * - diff_dst (#mkldnn_query_diff_dst_pd, 0) @@ -1358,21 +1338,21 @@ mkldnn_status_t MKLDNN_API mkldnn_pooling_backward_desc_init( * * where \f$n_{l}\f$ is the @p local_size. * - * During training LRN might or might not require workspace on forward + * During training, LRN might or might not require a workspace on forward * (#mkldnn_forward_training) and backward (#mkldnn_backward) passes. The * behavior is implementation specific. Optimized implementations typically - * require workspace and use it to save some intermediate results from the + * require a workspace and use it to save some intermediate results from the * forward pass that accelerate computations on the backward pass. * - * To check whether workspace is required one should query the LRN primitive - * descriptor for the workspace (#mkldnn_query_workspace_pd). Success would - * indicate the workspace is required and its description would be returned. + * To check whether a workspace is required, query the LRN primitive descriptor + * for the workspace (#mkldnn_query_workspace_pd). Success indicates that the + * workspace is required and its description will be returned. * @sa mkldnn_primitive_desc_query and mkldnn_primitive_desc_query_pd * * @{ */ /** Initializes an @p lrn_desc for forward propagation using @p prop_kind - * (possible values are #mkldnn_forward_training or #mkldnn_forward_inference), + * (possible values are #mkldnn_forward_training and #mkldnn_forward_inference), * @p alg_kind, memory descriptor @p data_desc, and regularization * parameters @p local_size, @p alpha, @p beta, and @p k. * @@ -1390,7 +1370,7 @@ mkldnn_status_t MKLDNN_API mkldnn_lrn_forward_desc_init( int local_size, float alpha, float beta, float k); /** Initializes an @p lrn_desc for backward propagation using @p alg_kind, - * memory descriptors @p data_desc, and @p diff_data_desc, and regularization + * memory descriptors @p data_desc and @p diff_data_desc, and regularization * parameters @p local_size, @p alpha, @p beta, and @p k. 
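The workspace check described for LRN (and for max pooling above) can be written directly against the query API; lrn_pd is an assumed, already created primitive descriptor:

~~~
/* Sketch: detect whether an implementation requires a workspace. */
#include "mkldnn.h"
#include <stddef.h>

int needs_workspace_sketch(const_mkldnn_primitive_desc_t lrn_pd) {
    const_mkldnn_primitive_desc_t ws_pd = mkldnn_primitive_desc_query_pd(
            lrn_pd, mkldnn_query_workspace_pd, 0);
    return ws_pd != NULL;   /* non-NULL: allocate and pass a workspace */
}
~~~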
* * Order of inputs: @@ -1422,26 +1402,26 @@ mkldnn_status_t MKLDNN_API mkldnn_lrn_backward_desc_init( * \f$\sigma[c] = \frac{1}{NHW} \sum\limits_{whn} * (src[n][c][h][w] - \mu[c])^2\f$, * - * and eps is a constant to improve numerical stability. + * and @c eps is a constant to improve numerical stability. * - * Both forward and backward passes support in-place operation, i.e. src - * and dst point to the same memory for forward, and diff_dst and diff_src + * Both forward and backward passes support in-place operation; that is, src + * and dst point to the same memory for forward pass, and diff_dst and diff_src * point to the same memory for backward pass. * * Batch normalization supports different flavors controlled by - * mkldnn_batch_normalization_desc_t. For example batch normalization can - * compute the mean and variance on its own or can take them as inputs. - * It can either perform scaling and shifting using gamma and beta parameters - * or not. Optionally it can also perform a fused ReLU, which in case of - * training would also require a workspace. + * mkldnn_batch_normalization_desc_t. For example, batch normalization can + * compute the mean and variance on its own or take them as inputs. It can + * either perform scaling and shifting using gamma and beta parameters or not. + * Optionally it can also perform a fused ReLU, which in case of training would + * also require a workspace. * * @sa mkldnn_batch_normalization_desc_t * @{ */ /** Initializes a batch normalization descriptor @p bnrm_desc for forward - * propagation using @p prop_kind, (possible values are - * #mkldnn_forward_training or #mkldnn_forward_inference), memory descriptor - * @p data_desc, normalization parameter @p epsilon and @p flags set using bit + * propagation using @p prop_kind (possible values are + * #mkldnn_forward_training and #mkldnn_forward_inference), memory descriptor + * @p data_desc, normalization parameter @p epsilon, and @p flags set using bit * flags of type mkldnn_batch_normalization_desc_t. * * Order of inputs: @@ -1465,8 +1445,8 @@ mkldnn_status_t MKLDNN_API mkldnn_lrn_backward_desc_init( * if #mkldnn_fuse_bn_relu bit-flags is set in @p flags * and @p prop_kind = #mkldnn_forward_training * - * @note in-place operation is supported, - * i.e. dst points to the same memory as src. + * @note In-place operation is supported; that is, dst points to the same memory + * as src. * * @sa mkldnn_batch_normalization_desc_t */ @@ -1477,8 +1457,8 @@ mkldnn_status_t MKLDNN_API mkldnn_batch_normalization_forward_desc_init( /** Initializes a batch normalization descriptor @p bnrm_desc for backward * propagation with respect to data and scale-shift parameters using memory - * descriptors @p data_desc and @p diff_data_desc, and normalization parameter - * @p epsilon and @p flags set using bit flags of type + * descriptors @p data_desc and @p diff_data_desc, normalization parameter + * @p epsilon, and @p flags set using bit flags of type * mkldnn_batch_normalization_desc_t. * * Order of inputs: @@ -1515,7 +1495,7 @@ mkldnn_status_t MKLDNN_API mkldnn_batch_normalization_backward_desc_init( * A primitive to compute an inner product. * * Inner product layer is also known as fully connected layer. 
- * with spatial dimension: + * With spatial dimension: * * \f[dst[n][oc] = \sum\limits_{ic, kh, kw} * src[n][ic][kh][kw] \cdot weights[oc][ic][kh][kw] @@ -1523,13 +1503,13 @@ mkldnn_status_t MKLDNN_API mkldnn_batch_normalization_backward_desc_init( * @{ */ /** Initializes an inner product descriptor @p ip_desc for forward propagation - * using @p prop_kind (possible values are #mkldnn_forward_training or + * using @p prop_kind (possible values are #mkldnn_forward_training and * #mkldnn_forward_inference) and memory descriptors. In order to create an * inner product without bias, @p bias_desc should be either @c NULL or a - * pointer to descriptor with memory format equals to #mkldnn_format_undef. + * pointer to a descriptor with memory format equal to #mkldnn_format_undef. * * @note - * memory descriptors are allowed to be initialized with #mkldnn_any value + * Memory descriptors are allowed to be initialized with #mkldnn_any value * of @p format_kind. * * Order of inputs: @@ -1551,7 +1531,7 @@ mkldnn_status_t MKLDNN_API mkldnn_inner_product_forward_desc_init( * with respect to data using memory descriptors. * * @note - * memory descriptors are allowed to be initialized with #mkldnn_any value + * Memory descriptors are allowed to be initialized with #mkldnn_any value * of @p format_kind. * * Order of inputs: @@ -1571,7 +1551,7 @@ mkldnn_status_t MKLDNN_API mkldnn_inner_product_backward_data_desc_init( * with respect to weights using memory descriptors. * * @note - * memory descriptors are allowed to be initialized with #mkldnn_any value + * Memory descriptors are allowed to be initialized with #mkldnn_any value * of @p format_kind. * * Order of inputs: @@ -1591,43 +1571,17 @@ mkldnn_status_t MKLDNN_API mkldnn_inner_product_backward_weights_desc_init( /** @} */ -/** @addtogroup c_api_convolution_relu Convolution followed by ReLU (deprecated) - * A merged primitive to compute a convolution followed by relu. - * @{ */ - -/** Initializes a merged convolution-relu descriptor @p conv_relu_desc for - * forward propagation (supported inference mode only) using convolution - * descriptor @p conv_desc and ReLU parameter @p negative slope. - * - * @deprecated use mkldnn_convolution_desc_init with - * mkldnn_post_ops_append_eltwise to append ReLU - * - * Order of inputs: - * - src (#mkldnn_query_src_pd, 0) - * - weights (#mkldnn_query_weights_pd, 0) - * - bias (#mkldnn_query_weights_pd, 1), - * if convolution is created with bias - * - * Order of outputs: - * - dst (#mkldnn_query_dst_pd, 0) - */ -mkldnn_status_t MKLDNN_API mkldnn_convolution_relu_desc_init( - mkldnn_convolution_relu_desc_t *conv_relu_desc, - const mkldnn_convolution_desc_t *conv_desc, float negative_slope); - -/** @} */ - /** @addtogroup c_api_rnn RNN - * A primitive to compute common recurrent layer. + * A primitive to compute the common recurrent layer. * @todo add additional description for the group * @{ */ /** * Initializes a recurrent cell descriptor @p rnn_cell_desc * using @p rnn_cell_desc, @p kind (possible values are - * #mkldnn_vanilla_rnn, #mkldnn_vanilla_lstm, #mkldnn_vanilla_gru, + * #mkldnn_vanilla_rnn, #mkldnn_vanilla_lstm, #mkldnn_vanilla_gru, and * #mkldnn_gru_linear_before_reset), - * @p f (possible values are #mkldnn_eltwise_relu, + * @p f (possible values are #mkldnn_eltwise_relu and * #mkldnn_eltwise_tanh), @p flags, @p alpha, and @p clipping. 
*/ mkldnn_status_t MKLDNN_API mkldnn_rnn_cell_desc_init( @@ -1643,16 +1597,94 @@ int MKLDNN_API mkldnn_rnn_cell_get_gates_count( int MKLDNN_API mkldnn_rnn_cell_get_states_count( const mkldnn_rnn_cell_desc_t *rnn_cell_desc); +/** Sets quantization @p scale and @p shift for RNN data tensors. + * For performance reasons, the low precision configuration of the RNN + * primitive expects input activations to have the unsigned int8 data type. + * The scale and shift used to quantize floating point data to unsigned + * integer must be passed to the RNN primitive using attributes. + * Example usage: + * @code + * // rnn parameters + * int l = 2, t = 2, mb = 32, sic = 32, slc = 32, dic = 32, dlc = 32; + * // activations quantization parameters + * float scale = ..., shift = ...; + * + * mkldnn_primitive_attr_t rnn_attr; + * // create default attributes + * mkldnn_primitive_attr_create(&rnn_attr); + * + * // set scale and shift for int8 quantization of activation + * mkldnn_primitive_attr_set_rnn_data_qparams(rnn_attr, scale, shift); + * + * // create & configure rnn op_desc + * mkldnn_rnn_desc_t rnn_d; + * mkldnn_primitive_desc_t rnn_pd; + * mkldnn_primitive_desc_create_v2(&rnn_pd, &rnn_d, rnn_attr, NULL); + * @endcode + * @note + * Quantization scale and shift are common for src_layer, src_iter, + * dst_iter, and dst_layer. + */ +mkldnn_status_t MKLDNN_API mkldnn_primitive_attr_set_rnn_data_qparams( + mkldnn_primitive_attr_t attr, const float scale, const float shift); + +/** Sets quantization scales @p weights_scales for RNN weights tensors. + * The low precision configuration of the RNN primitive expects input weights + * to have the signed int8 data type. The scales used to quantize floating + * point data to signed integer must be passed to the RNN primitive using + * attributes. + * The @p mask argument defines the correspondence between output tensor + * dimensions and the @p weights_scales array. Set the i-th bit of @p mask to 1 + * to use a dedicated scaling factor for each slice of the output tensor over + * the i-th dimension. Set @p mask to 0 to use a common scaling factor for the + * whole output tensor. Example usage: + * @code + * // rnn parameters + * int l = 2, t = 2, mb = 32, sic = 32, slc = 32, dic = 32, dlc = 32; + * // unique output scales per output channel + * float weights_scales[dic * n_gates] = { ... }; + * // mask that specifies last two dimensions of ldigo format + * int mask = 0x3; + * + * mkldnn_primitive_attr_t attr; + * // create default attributes + * mkldnn_primitive_attr_create(&attr); + * + * // set output channel-wise weights scales + * mkldnn_primitive_attr_set_rnn_weights_qparams(attr, dic * n_gates, mask, + * weights_scales); + * + * // create & configure rnn op_desc + * mkldnn_rnn_desc_t rnn_d; + * mkldnn_primitive_desc_t rnn_pd; + * mkldnn_primitive_desc_create_v2(&rnn_pd, &rnn_d, attr, NULL); + * @endcode + * @note + * The dimension order is always native and does not depend on the actual + * layout used. For example, 5 dimensional weights always have + * (l, d, i, g, o) logical dimension ordering. + * @note + * Quantization scales are common for weights_layer and weights_iteration. + * @note + * There is no way to check that @p count corresponds to @p mask until an + * actual primitive descriptor is created, so it is the user's + * responsibility to set proper values.
The following formula must hold: + * + * \f[count = \prod\limits_{d \in mask} output.dims[d]\f] + */ +mkldnn_status_t MKLDNN_API mkldnn_primitive_attr_set_rnn_weights_qparams( + mkldnn_primitive_attr_t attr, int count, int mask, + const float *weights_scales); + /** Initializes a rnn descriptor @p rnn_desc for forward propagation * using @p prop_kind, @p rnn_cell_desc, @p direction, and memory descriptors. - * @note if @p prop_kind equals #mkldnn_forward_training, you need to query a + * @note If @p prop_kind equals #mkldnn_forward_training, you must query a * workspace memory descriptor before creating the primitive. * - * @p src_iter_desc, @p bias_desc, and @p dst_iter_desc are allowed to be - * either NULL or point to a zero memory descriptor that would indicate + * @p src_iter_desc, @p bias_desc, and @p dst_iter_desc are allowed to either be + * @c NULL or point to a zero memory descriptor, which would indicate that the * RNN primitive should not use them. * - * @note all memory descriptors except @p src_iter_desc are allowed to be + * @note All memory descriptors except @p src_iter_desc are allowed to be * initialized with #mkldnn_any value of @p format_kind. * * Order of inputs: @@ -1682,14 +1714,14 @@ mkldnn_status_t MKLDNN_API mkldnn_rnn_forward_desc_init( /** Initializes a rnn descriptor @p rnn_desc for backward propagation * using @p prop_kind, @p rnn_cell_desc, @p direction, and memory descriptors. - * @note all memory descriptors are allowed to be initialized with + * @note All memory descriptors are allowed to be initialized with * #mkldnn_any value of @p format_kind. * * @p src_iter_desc (simultaneously with @p diff_src_iter_desc), * @p bias_desc (simultaneously with @p diff_bias_desc), and - * @p dst_iter_desc (simultaneously with @p diff_src_iter_desc) are allowed - * to be either NULL or point to a zero memory descriptor that would indicate - * RNN primitive should not use them. + * @p dst_iter_desc (simultaneously with @p diff_src_iter_desc) are allowed to + * either be @c NULL or point to a zero memory descriptor, which would indicate + * that the RNN primitive should not use them. * * Order of inputs: * - src_layer (#mkldnn_query_src_pd, 0) @@ -1747,6 +1779,50 @@ mkldnn_status_t MKLDNN_API mkldnn_roi_pooling_forward_desc_init( /** @} */ +/** @addtogroup c_api_binary_convolution Binary convolution + * A primitive to compute binary convolution using different algorithms. + * @{ */ + +/** Initializes a dilated binary convolution descriptor @p bin_conv_desc for forward + * propagation using @p prop_kind (possible values are #mkldnn_forward_training + * and #mkldnn_forward_inference), @p alg_kind, memory descriptors, @p strides, + * @p dilates, @p padding_l, @p padding_r, and @p pad_value. + * + * @note If @p padding_r is @c NULL, the padding is supposed to be symmetric. + * + * @note Memory descriptors are allowed to be initialized with #mkldnn_any + * value of @p format_kind.
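A hedged sketch of initializing the binary convolution descriptor declared just below (all memory descriptors are assumed to be prepared elsewhere; the stride, padding, and pad_value choices are illustrative only):

~~~
/* Sketch: forward binary convolution descriptor. Note that dilates = 0
 * means no dilation in the MKL-DNN convention. */
#include "mkldnn.h"

void bin_conv_sketch(const mkldnn_memory_desc_t *src_md,
        const mkldnn_memory_desc_t *wei_md,
        const mkldnn_memory_desc_t *dst_md) {
    int strides[2] = { 1, 1 };
    int dilates[2] = { 0, 0 };
    int pad[2] = { 1, 1 };
    mkldnn_binary_convolution_desc_t bcd;
    mkldnn_dilated_binary_convolution_forward_desc_init(&bcd,
            mkldnn_forward_inference, mkldnn_binary_convolution_direct,
            src_md, wei_md, dst_md, strides, dilates, pad, pad,
            /*pad_value=*/-1.f);
}
~~~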
+ * + * Order of inputs: + * - src (#mkldnn_query_src_pd, 0) + * - weights (#mkldnn_query_weights_pd, 0) + * + * Order of outputs: + * - dst (#mkldnn_query_dst_pd, 0) + */ +mkldnn_status_t MKLDNN_API mkldnn_dilated_binary_convolution_forward_desc_init( + mkldnn_binary_convolution_desc_t *bin_conv_desc, mkldnn_prop_kind_t prop_kind, + mkldnn_alg_kind_t alg_kind, const mkldnn_memory_desc_t *src_desc, + const mkldnn_memory_desc_t *weights_desc, + const mkldnn_memory_desc_t *dst_desc, const mkldnn_dims_t strides, + const mkldnn_dims_t dilates, const mkldnn_dims_t padding_l, + const mkldnn_dims_t padding_r, float pad_value); + +/** @} */ + +/** @addtogroup c_api_binarization Binarization + * A primitive to binarize input using different approaches. + * @{ */ + +/** Initializes a @p binarization_desc for forward propagation using @p prop_kind + * (possible values are #mkldnn_forward_training and #mkldnn_forward_inference), + * @p alg_kind algorithm, and memory descriptors. + * @sa mkldnn_binarization_desc_t for details. */ +mkldnn_status_t MKLDNN_API mkldnn_binarization_forward_desc_init( + mkldnn_binarization_desc_t *binarization_desc, mkldnn_prop_kind_t prop_kind, + mkldnn_alg_kind_t alg_kind, const mkldnn_memory_desc_t *src_desc, + const mkldnn_memory_desc_t *dst_desc, const mkldnn_memory_desc_t *weights_desc); + /** @} */ /** @addtogroup c_api_engine Engine operations @@ -1803,13 +1879,31 @@ mkldnn_status_t MKLDNN_API mkldnn_stream_destroy(mkldnn_stream_t stream); /** Sets verbosity level (print information to stdout). * Possible levels are: - * - 0 -- no verbose output + * - 0 -- no verbose output (default) * - 1 -- primitive information at execution * - 2 -- primitive information at creation and execution * * @note - * Dumping information might affect performance */ -mkldnn_status_t MKLDNN_API mkldnn_verbose_set(int level); + * Dumping information might affect performance. + * This setting overrides the MKLDNN_VERBOSE environment variable. */ +mkldnn_status_t MKLDNN_API mkldnn_set_verbose(int level); + +/** Sets jit dump control. + * Possible @p dump values are: + * - zero -- turn jit dump off (default) + * - non-zero -- turn jit dump on + * + * @note + * This setting overrides the MKLDNN_JIT_DUMP environment variable. */ +mkldnn_status_t MKLDNN_API mkldnn_set_jit_dump(int dump); + +/** Gets library version information. + * Version information includes: + * - major -- major version number + * - minor -- minor version number + * - patch -- patch release number + * - hash -- git commit hash */ +const mkldnn_version_t MKLDNN_API *mkldnn_version(); /** Returns cache size for specified level in bytes. * @note @@ -1820,44 +1914,60 @@ unsigned int MKLDNN_API mkldnn_get_cache_size(int level, int per_core); /** @} */ /** @addtogroup c_api_blas BLAS functions + * A subset of Basic Linear Algebra Subprograms (BLAS) functions to perform + * matrix-matrix multiplication. * @{ */ -/** SGEMM performs matrix-matrix multiplication operation - * C := alpha*op( A )*op( B ) + beta*C, - * where op( X ) is one of - * op( X ) = X or op( X ) = X**T, - * alpha and beta are scalars, and A, B and C are matrices, with op( A ) - * an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. +/** SGEMM performs a matrix-matrix multiplication operation defined as + * + * C := alpha*op( A )*op( B ) + beta*C + * + * where + * - op( X ) is one of op( X ) = X or op( X ) = X**T, + * - alpha and beta are scalars, + * - A, B and C are matrices, with op( A ) an m by k matrix, op( B ) a k by n matrix + * and C an m by n matrix.
+ * + * The matrices are assumed to be stored in column-major order (the elements + * within a matrix column are contiguous in memory). + * * @note - * API is different compared to standard BLAS routine - * as it returns mkldnn_status_t for error handling. + * The API is different from the standard BLAS routine + * because it returns mkldnn_status_t for error handling. * XERBLA is not supported: no error message will be printed - * in case of incorrect parameters */ + * in case of incorrect parameters. */ mkldnn_status_t MKLDNN_API mkldnn_sgemm(const char *transa, const char *transb, const int *M, const int *N, const int *K, const float *alpha, const float *A, const int *lda, const float *B, const int *ldb, const float *beta, float *C, const int *ldc); -/** gemm_s8u8s32 and gemm_s8s8s32 perform matrix-matrix multiplication operation - * and add the result to a scalar-matrix product. To get the final result, - * a vector is added to each row or column of the output matrix. +/** gemm_s8u8s32 and gemm_s8s8s32 perform a matrix-matrix multiplication + * operation and add the result to a scalar-matrix product. For the final + * result, a vector is added to each row or column of the output matrix. * The operation is defined as: + * * C := alpha*(op(A) + A_offset) * (op(B) + B_offset) + beta*C + C_offset - * where op( X ) = X or op( X ) = X**T, - * A_offset is an m-by-k matrix with every element equal to the value oa, - * B_offset is an k-by-n matrix with every element equal to the value ob, - * C_offset is an m-by-n matrix defined by the oc array, size len: - * if offsetc = F: len must be at least 1 - * if offsetc = C: len must be at least max(1, m) - * if offsetc = R: len must be at least max(1, n) - * alpha and beta are scalars, and A, B and C are matrices, with op( A ) - * an m-by-k matrix, op( B ) a k-by-n matrix and C an m-by-n matrix. + * + * where + * - op( X ) = X or op( X ) = X**T, + * - A_offset is an m-by-k matrix with every element equal to the value oa, + * - B_offset is a k-by-n matrix with every element equal to the value ob, + * - C_offset is an m-by-n matrix defined by the oc array, size len: + * - if offsetc = F: len must be at least 1 + * - if offsetc = C: len must be at least max(1, m) + * - if offsetc = R: len must be at least max(1, n) + * - alpha and beta are scalars, and A, B, and C are matrices, with op( A ) + * an m-by-k matrix, op( B ) a k-by-n matrix, and C an m-by-n matrix. + * + * The matrices are assumed to be stored in column-major order (the elements + * within a matrix column are contiguous in memory). + * + * @note - * API is different compared to standard BLAS routine - * as it returns mkldnn_status_t for error handling. + * The API is different from the standard BLAS routine + * because it returns mkldnn_status_t for error handling. * XERBLA is not supported: no error message will be printed - * in case of incorrect parameters */ + * in case of incorrect parameters.
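Given the column-major convention stated above, a minimal sgemm call (a 2x2 product; parameters are passed by pointer, Fortran-style):

~~~
/* Sketch: C = A * B with mkldnn_sgemm on column-major 2x2 matrices. */
#include "mkldnn.h"

void sgemm_sketch(void) {
    const int m = 2, n = 2, k = 2, lda = 2, ldb = 2, ldc = 2;
    const float alpha = 1.f, beta = 0.f;
    float A[] = { 1.f, 3.f, 2.f, 4.f };  /* A = [1 2; 3 4], column-major */
    float B[] = { 5.f, 7.f, 6.f, 8.f };  /* B = [5 6; 7 8], column-major */
    float C[4] = { 0.f };
    mkldnn_sgemm("N", "N", &m, &n, &k, &alpha, A, &lda, B, &ldb,
            &beta, C, &ldc);
    /* C = [19 22; 43 50], stored as {19, 43, 22, 50}. */
}
~~~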
*/ mkldnn_status_t MKLDNN_API mkldnn_gemm_s8u8s32(const char *transa, const char *transb, const char *offsetc, const int *M, const int *N, const int *K, const float *alpha, const int8_t *A, const int *lda, diff --git a/inference-engine/thirdparty/mkl-dnn/include/mkldnn.hpp b/inference-engine/thirdparty/mkl-dnn/include/mkldnn.hpp index b0869e7..2ce46c9 100644 --- a/inference-engine/thirdparty/mkl-dnn/include/mkldnn.hpp +++ b/inference-engine/thirdparty/mkl-dnn/include/mkldnn.hpp @@ -123,14 +123,14 @@ public: shuffle = mkldnn_shuffle, eltwise = mkldnn_eltwise, depthwise = mkldnn_depthwise, - relu = mkldnn_relu, softmax = mkldnn_softmax, pooling = mkldnn_pooling, lrn = mkldnn_lrn, batch_normalization = mkldnn_batch_normalization, inner_product = mkldnn_inner_product, - convolution_relu = mkldnn_convolution_relu, rnn = mkldnn_rnn, + binary_convolution = mkldnn_binary_convolution, + binarization = mkldnn_binarization, }; /// A wrapper structure to specify a particular output of a primitive. @@ -149,7 +149,7 @@ public: inline operator primitive() const; }; - /// Returns the descriptor of the underlying C API primitive + /// Returns the descriptor of the underlying C API primitive. inline const_mkldnn_primitive_desc_t get_primitive_desc() const; // TODO: use the C++ API wrapper structure. }; @@ -257,6 +257,7 @@ inline mkldnn_prop_kind_t convert_to_c(prop_kind kind) { enum algorithm { algorithm_undef = mkldnn_alg_kind_undef, + convolution_auto = mkldnn_convolution_auto, convolution_direct = mkldnn_convolution_direct, convolution_winograd = mkldnn_convolution_winograd, deconvolution_direct = mkldnn_deconvolution_direct, @@ -272,6 +273,8 @@ enum algorithm { eltwise_soft_relu = mkldnn_eltwise_soft_relu, eltwise_logistic = mkldnn_eltwise_logistic, eltwise_clamp = mkldnn_eltwise_clamp, + eltwise_exp = mkldnn_eltwise_exp, + eltwise_not = mkldnn_eltwise_not, depthwise_scale_shift = mkldnn_depthwise_scale_shift, depthwise_prelu = mkldnn_depthwise_prelu, lrn_across_channels = mkldnn_lrn_across_channels, @@ -285,7 +288,9 @@ enum algorithm { vanilla_gru = mkldnn_vanilla_gru, gru_linear_before_reset = mkldnn_gru_linear_before_reset, roi_pooling_max = mkldnn_roi_pooling_max, - roi_pooling_bilinear = mkldnn_roi_pooling_bilinear + roi_pooling_bilinear = mkldnn_roi_pooling_bilinear, + binary_convolution_direct = mkldnn_binary_convolution_direct, + binarization_depthwise = mkldnn_binarization_depthwise }; inline mkldnn_alg_kind_t convert_to_c(algorithm aalgorithm) { @@ -295,7 +300,6 @@ inline mkldnn_alg_kind_t convert_to_c(algorithm aalgorithm) { enum batch_normalization_flag { use_global_stats = mkldnn_use_global_stats, use_scale_shift = mkldnn_use_scaleshift, - omit_stats = mkldnn_omit_stats, fuse_bn_relu = mkldnn_fuse_bn_relu }; @@ -337,14 +341,14 @@ enum query { shuffle_d = mkldnn_query_shuffle_d, eltwise_d = mkldnn_query_eltwise_d, depthwise_d = mkldnn_query_depthwise_d, - relu_d = mkldnn_query_relu_d, softmax_d = mkldnn_query_softmax_d, pooling_d = mkldnn_query_pooling_d, lrn_d = mkldnn_query_lrn_d, batch_normalization_d = mkldnn_query_batch_normalization_d, inner_product_d = mkldnn_query_inner_product_d, - convolution_relu_d = mkldnn_query_convolution_relu_d, rnn_d = mkldnn_query_rnn_d, + binary_convolution_d = mkldnn_query_binary_convolution_d, + binarization_d = mkldnn_query_binarization_d, input_pd = mkldnn_query_input_pd, output_pd = mkldnn_query_output_pd, @@ -448,6 +452,18 @@ struct post_ops: public handle { &in_h, &in_w, &ker_h, &ker_w, &str_h, &str_w, weights_data, biases_data), "could not get dw 
conv params"); } + + void append_binarization(algorithm alg, const float* weights_data) { + error::wrap_c_api(mkldnn_post_ops_append_binarization(get(), convert_to_c(alg), weights_data), + "could not append binarization"); + } + + void get_params_binarization(int index, algorithm &alg, const float** weights_data) const { + mkldnn_alg_kind_t c_alg; + error::wrap_c_api(mkldnn_post_ops_get_params_binarization(get(), index, &c_alg, weights_data), + "could not get binarization params"); + alg = static_cast(c_alg); + } }; #ifndef DOXYGEN_SHOULD_SKIP_THIS @@ -511,12 +527,25 @@ struct primitive_attr: public handle { error::wrap_c_api(mkldnn_primitive_attr_set_post_ops(get(), ops.get()), "could not set post operation sequence"); } + + void set_rnn_data_qparams(const float scale, const float shift) + { + error::wrap_c_api(mkldnn_primitive_attr_set_rnn_data_qparams(get(), + scale, shift), "could not set rnn data int scale/shift"); + } + + void set_rnn_weights_qparams(int mask, const std::vector &scales) + { + error::wrap_c_api(mkldnn_primitive_attr_set_rnn_weights_qparams(get(), + (int)scales.size(), mask, &scales[0]), + "could not set rnn weights int scales"); + } }; /// @} /// @addtogroup cpp_api_engine Engine -/// Engine operations +/// Engine operations. /// /// @sa @ref c_api_engine in @ref c_api /// @{ @@ -532,7 +561,7 @@ struct engine: public handle { friend class primitive; // gcc bug??? using handle::handle; - /// Kinds of engines + /// Kinds of engines. enum kind { /// An unspecified engine any = mkldnn_any_engine, @@ -600,7 +629,7 @@ private: /// @addtogroup cpp_api_memory Memory /// A primitive to describe and store data. /// -/// For more information please refer to @ref c_api_memory in @ref c_api +/// For more information, refer to @ref c_api_memory in @ref c_api. /// @{ /// Memory primitive that describes the data. @@ -626,6 +655,7 @@ struct memory: public primitive { s16 = mkldnn_s16, s8 = mkldnn_s8, u8 = mkldnn_u8, + bin = mkldnn_bin, }; /// Memory format specification. 
See #mkldnn_memory_format_t @@ -642,22 +672,28 @@ struct memory: public primitive { nchw = mkldnn_nchw, nhwc = mkldnn_nhwc, chwn = mkldnn_chwn, + nCw4c = mkldnn_nCw4c, nCw8c = mkldnn_nCw8c, + nChw4c = mkldnn_nChw4c, nChw8c = mkldnn_nChw8c, nChw16c = mkldnn_nChw16c, ncdhw = mkldnn_ncdhw, ndhwc = mkldnn_ndhwc, + nCdhw4c = mkldnn_nCdhw4c, nCdhw8c = mkldnn_nCdhw8c, nCdhw16c = mkldnn_nCdhw16c, oi = mkldnn_oi, io = mkldnn_io, oiw = mkldnn_oiw, wio = mkldnn_wio, + Owi4o = mkldnn_Owi4o, + OIw4i4o = mkldnn_OIw4i4o, Owi8o = mkldnn_Owi8o, OIw8o8i = mkldnn_OIw8o8i, OIw8i8o = mkldnn_OIw8i8o, OIw16i16o = mkldnn_OIw16i16o, OIw16o16i = mkldnn_OIw16o16i, + Oiw4o = mkldnn_Oiw4o, Oiw16o = mkldnn_Oiw16o, Owi16o = mkldnn_Owi16o, OIw8i16o2i = mkldnn_OIw8i16o2i, @@ -666,20 +702,25 @@ struct memory: public primitive { oihw = mkldnn_oihw, ihwo = mkldnn_ihwo, hwio = mkldnn_hwio, + iohw = mkldnn_iohw, hwio_s8s8 = mkldnn_hwio_s8s8, dhwio = mkldnn_dhwio, oidhw = mkldnn_oidhw, + OIdhw4i4o = mkldnn_OIdhw4i4o, + Odhwi4o = mkldnn_Odhwi4o, OIdhw8i8o = mkldnn_OIdhw8i8o, OIdhw8o8i = mkldnn_OIdhw8o8i, Odhwi8o = mkldnn_Odhwi8o, OIdhw16i16o = mkldnn_OIdhw16i16o, OIdhw16o16i = mkldnn_OIdhw16o16i, + Oidhw4o = mkldnn_Oidhw4o, Oidhw16o = mkldnn_Oidhw16o, Odhwi16o = mkldnn_Odhwi16o, oIhw8i = mkldnn_oIhw8i, oIhw16i = mkldnn_oIhw16i, oIdhw8i = mkldnn_oIdhw8i, oIdhw16i = mkldnn_oIdhw16i, + OIhw4i4o = mkldnn_OIhw4i4o, OIhw8i8o = mkldnn_OIhw8i8o, OIhw16i16o = mkldnn_OIhw16i16o, OIhw8o8i = mkldnn_OIhw8o8i, @@ -691,18 +732,25 @@ struct memory: public primitive { OIhw4i16o4i = mkldnn_OIhw4i16o4i, OIhw4i16o4i_s8s8 = mkldnn_OIhw4i16o4i_s8s8, Oihw8o = mkldnn_Oihw8o, + Oihw4o = mkldnn_Oihw4o, Oihw16o = mkldnn_Oihw16o, Ohwi8o = mkldnn_Ohwi8o, + Ohwi4o = mkldnn_Ohwi4o, Ohwi16o = mkldnn_Ohwi16o, OhIw16o4i = mkldnn_OhIw16o4i, OhIw8o4i = mkldnn_OhIw8o4i, + OhIw8o32i = mkldnn_OhIw8o32i, + OhIw16o32i = mkldnn_OhIw16o32i, OhIw8o4i_s8s8 = mkldnn_OhIw8o4i_s8s8, goiw = mkldnn_goiw, + gOwi4o = mkldnn_gOwi4o, + gOIw4i4o = mkldnn_gOIw4i4o, gOwi8o = mkldnn_gOwi8o, gOIw8o8i = mkldnn_gOIw8o8i, gOIw8i8o = mkldnn_gOIw8i8o, gOIw16i16o = mkldnn_gOIw16i16o, gOIw16o16i = mkldnn_gOIw16o16i, + gOiw4o = mkldnn_gOiw4o, gOiw16o = mkldnn_gOiw16o, gOwi16o = mkldnn_gOwi16o, gOIw8i16o2i = mkldnn_gOIw8i16o2i, @@ -710,10 +758,14 @@ struct memory: public primitive { gOIw8o16i2o = mkldnn_gOIw8o16i2o, goihw = mkldnn_goihw, hwigo = mkldnn_hwigo, + giohw = mkldnn_giohw, hwigo_s8s8 = mkldnn_hwigo_s8s8, + gOIdhw4i4o = mkldnn_gOIdhw4i4o, + gOdhwi4o = mkldnn_gOdhwi4o, gOIdhw8i8o = mkldnn_gOIdhw8i8o, gOIdhw8o8i = mkldnn_gOIdhw8o8i, gOdhwi8o = mkldnn_gOdhwi8o, + gOIhw4i4o = mkldnn_gOIhw4i4o, gOIhw8i8o = mkldnn_gOIhw8i8o, gOIhw16i16o = mkldnn_gOIhw16i16o, gOIhw8i16o2i = mkldnn_gOIhw8i16o2i, @@ -721,12 +773,19 @@ struct memory: public primitive { gOIhw8o16i2o = mkldnn_gOIhw8o16i2o, gOIhw4i16o4i = mkldnn_gOIhw4i16o4i, gOIhw4i16o4i_s8s8 = mkldnn_gOIhw4i16o4i_s8s8, + gOIhw2i8o4i = mkldnn_gOIhw2i8o4i, + gOIhw2i8o4i_s8s8 = mkldnn_gOIhw2i8o4i_s8s8, gOihw8o = mkldnn_gOihw8o, + gOihw4o = mkldnn_gOihw4o, gOihw16o = mkldnn_gOihw16o, + gOhwi4o = mkldnn_gOhwi4o, gOhwi8o = mkldnn_gOhwi8o, gOhwi16o = mkldnn_gOhwi16o, Goihw8g = mkldnn_Goihw8g, Goihw16g = mkldnn_Goihw16g, + Goihw16g_s8s8 = mkldnn_Goihw16g_s8s8, + gOIhw4o4i = mkldnn_gOIhw4o4i, + gOIhw4o4i_s8s8 = mkldnn_gOIhw4o4i_s8s8, gOIhw8o8i = mkldnn_gOIhw8o8i, gOIhw16o16i = mkldnn_gOIhw16o16i, gIOhw16o16i = mkldnn_gIOhw16o16i, @@ -736,16 +795,16 @@ struct memory: public primitive { goidhw = mkldnn_goidhw, gOIdhw16i16o = mkldnn_gOIdhw16i16o, gOIdhw16o16i 
= mkldnn_gOIdhw16o16i, + gOidhw4o = mkldnn_gOidhw4o, gOidhw16o = mkldnn_gOidhw16o, gOdhwi16o = mkldnn_gOdhwi16o, ntc = mkldnn_ntc, tnc = mkldnn_tnc, ldsnc = mkldnn_ldsnc, ldigo = mkldnn_ldigo, - ldigo_p = mkldnn_ldigo_p, ldgoi = mkldnn_ldgoi, - ldgoi_p = mkldnn_ldgoi_p, ldgo = mkldnn_ldgo, + rnn_packed = mkldnn_rnn_packed, wino_fmt = mkldnn_wino_fmt, format_last = mkldnn_format_last, }; @@ -1080,7 +1139,7 @@ struct view : public primitive { /// @} /// @addtogroup cpp_api_concat Concat -/// A primitive to concatenate data by arbitrary dimension +/// A primitive to concatenate data by arbitrary dimension. /// /// @sa @ref c_api_concat in @ref c_api /// @{ @@ -1157,7 +1216,7 @@ struct concat : public primitive { /// @} /// @addtogroup cpp_api_sum Sum -/// A primitive to sum data +/// A primitive to sum data. /// /// @sa @ref c_api_sum in @ref c_api /// @{ @@ -1211,38 +1270,6 @@ struct sum : public primitive { reset(result); } - /** @deprecated: api backwards compatibility for double scales type */ - MKLDNN_DEPRECATED - primitive_desc(const memory::desc &output, std::vector scale, - std::vector inputs) { - mkldnn_primitive_desc_t result; - - auto c_api_inputs = cpp_to_c(inputs); - auto scale_f = scale_to_float(scale); - - error::wrap_c_api(mkldnn_sum_primitive_desc_create( - &result, &output.data, (int)c_api_inputs.size(), - &scale_f[0], &c_api_inputs[0]), - "could not create a sum primitive descriptor"); - reset(result); - } - - /** @deprecated: api backwards compatibility for double scales type */ - MKLDNN_DEPRECATED - primitive_desc(std::vector scale, - std::vector inputs) { - mkldnn_primitive_desc_t result; - - auto c_api_inputs = cpp_to_c(inputs); - auto scale_f = scale_to_float(scale); - - error::wrap_c_api(mkldnn_sum_primitive_desc_create( - &result, nullptr, (int)c_api_inputs.size(), &scale_f[0], - &c_api_inputs[0]), - "could not create a sum primitive descriptor"); - reset(result); - } - memory::primitive_desc dst_primitive_desc() const { memory::primitive_desc adesc; mkldnn_primitive_desc_t cdesc; @@ -1273,14 +1300,6 @@ struct sum : public primitive { "could not create a sum primitive"); reset(result); } - -private: - static std::vector scale_to_float(const std::vector &vd) { - std::vector vf(vd.size()); - std::transform(vd.begin(), vd.end(), vf.begin(), - [=](double x){return (float)x;}); - return vf; - } }; /// @} @@ -1293,7 +1312,7 @@ private: /// @addtogroup cpp_api_primitive_descriptors Primitive descriptors /// @{ -/// A base class for all primitive descriptors +/// A base class for all primitive descriptors. struct primitive_desc : public handle { primitive_desc(const_mkldnn_op_desc_t desc, const primitive_attr *attr, const engine &e, const_mkldnn_primitive_desc_t hint_fwd_pd) { @@ -1331,7 +1350,7 @@ struct primitive_desc : public handle { return res; } - /// Advances the next implementation for the given op descriptor + /// Advances the next implementation for the given op descriptor. /// /// Returns: /// - @c true on success @@ -1347,7 +1366,7 @@ struct primitive_desc : public handle { return true; } - /// Queries and returns requested memory primitive descriptor + /// Queries and returns requested memory primitive descriptor. 
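For reference, describing a tensor in one of the newly added 4c-blocked layouts above is no different from the existing 8c/16c ones. A minimal sketch with the v0.x C++ API (engine index and dimensions are illustrative):

~~~cpp
#include "mkldnn.hpp"
using namespace mkldnn;

engine eng(engine::cpu, 0);
// Logical dims stay (N, C, H, W) regardless of the physical layout.
memory::dims dims = {2, 16, 13, 13};
auto md  = memory::desc(dims, memory::data_type::f32, memory::format::nChw4c);
auto mpd = memory::primitive_desc(md, eng);
memory mem(mpd);  // allocates a buffer laid out in 4-element channel blocks
~~~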
memory::primitive_desc query_mpd(query what, int idx = 0) const { std::vector valid_w{input_pd, output_pd, src_pd, diff_src_pd, weights_pd, diff_weights_pd, dst_pd, diff_dst_pd, workspace_pd}; @@ -1727,66 +1746,6 @@ struct convolution_backward_weights : public primitive { } }; -/// A merged convolution-relu primitive for inference mode only -/// -/// @deprecated consider using convolution_forward with post_ops -/// (e.g. post_ops::append_eltwise(1.f, #eltwise_relu, negative_slope, 0.f) -struct convolution_relu_forward : public primitive { - struct desc { - mkldnn_convolution_relu_desc_t data; - - desc(const convolution_forward::desc conv_desc, - const float negative_slope) { - error::wrap_c_api(mkldnn_convolution_relu_desc_init(&data, - &conv_desc.data, negative_slope), - "could not create a convolution_relu_forward descriptor"); - } - }; - - struct primitive_desc : public mkldnn::primitive_desc { - primitive_desc(const desc &desc, const engine &e) - : mkldnn::primitive_desc(&desc.data, nullptr, e, nullptr) {} - - REG_QUERY_MPD(src, src, 0); - REG_QUERY_MPD(weights, weights, 0); - REG_QUERY_MPD(bias, weights, 1); - REG_QUERY_MPD(dst, dst, 0); - }; - - /// @deprecated consider using convolution_forward + post_ops - MKLDNN_DEPRECATED - convolution_relu_forward(const primitive_desc &aprimitive_desc, - const primitive::at &src, const primitive::at &weights, - const primitive::at &bias, const memory &dst) { - mkldnn_primitive_t result; - mkldnn_primitive_at_t inputs[] = { src.data, weights.data, - bias.data }; - const_mkldnn_primitive_t outputs[] = { dst.get() }; - check_num_parameters(aprimitive_desc.get(), 3, 1, - "convolution relu forward"); - error::wrap_c_api(mkldnn_primitive_create(&result, - aprimitive_desc.get(), inputs, outputs), - "could not create a convolution relu forward primitive"); - reset(result); - } - - /// @deprecated consider using convolution_forward + post_ops - MKLDNN_DEPRECATED - convolution_relu_forward(const primitive_desc &aprimitive_desc, - const primitive::at &src, const primitive::at &weights, - const memory &dst) { - mkldnn_primitive_t result; - mkldnn_primitive_at_t inputs[] = { src.data, weights.data }; - const_mkldnn_primitive_t outputs[] = { dst.get() }; - check_num_parameters(aprimitive_desc.get(), 2, 1, - "convolution relu forward"); - error::wrap_c_api(mkldnn_primitive_create(&result, - aprimitive_desc.get(), inputs, outputs), - "could not create a convolution relu forward primitive"); - reset(result); - } -}; - /// @} /// @addtogroup cpp_api_deconvolution Deconvolution @@ -2450,7 +2409,7 @@ struct pooling_backward : public primitive { /// @} /// @addtogroup cpp_api_eltwise Eltwise -/// A primitive to compute element wise operations like parametric rectifier +/// A primitive to compute element-wise operations like parametric rectifier /// linear unit (ReLU). 
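With convolution_relu_forward removed above, fused convolution + ReLU is expressed through attributes, exactly as its deprecation note suggested (the standalone relu_forward/relu_backward typedefs are likewise dropped below in favor of eltwise with algorithm::eltwise_relu). A sketch, assuming conv_desc, negative_slope, and eng are already set up:

~~~cpp
post_ops ops;
ops.append_eltwise(1.f, algorithm::eltwise_relu, negative_slope, 0.f);

primitive_attr attr;
attr.set_post_ops(ops);

// The attribute-aware primitive descriptor replaces convolution_relu_forward.
auto conv_pd = convolution_forward::primitive_desc(conv_desc, attr, eng);
~~~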
/// /// @sa @ref c_api_eltwise in @ref c_api @@ -2468,13 +2427,6 @@ struct eltwise_forward : public primitive { static_cast(alpha), static_cast(beta)), "could not create a eltwise forward descriptor"); } - - /** @deprecated: api backward compatibility for relu */ - template - MKLDNN_DEPRECATED - desc(prop_kind aprop_kind, const memory::desc &src_desc, - T negative_slope) - : desc(aprop_kind, eltwise_relu, src_desc, negative_slope) {} }; struct primitive_desc : public mkldnn::primitive_desc { @@ -2501,8 +2453,6 @@ struct eltwise_forward : public primitive { } }; -typedef eltwise_forward relu_forward; - struct eltwise_backward : public primitive { struct desc { mkldnn_eltwise_desc_t data; @@ -2516,13 +2466,6 @@ struct eltwise_backward : public primitive { static_cast(beta)), "could not create a eltwise backward descriptor"); } - - /** @deprecated: api backward compatibility for relu */ - template - MKLDNN_DEPRECATED - desc(const memory::desc &diff_data_desc, const memory::desc &data_desc, - T negative_slope): desc(eltwise_relu, diff_data_desc, data_desc, - negative_slope) {} }; struct primitive_desc : public mkldnn::primitive_desc { @@ -2553,8 +2496,6 @@ struct eltwise_backward : public primitive { } }; -typedef eltwise_backward relu_backward; - /// @} /// @addtogroup cpp_api_depthwise Depthwise @@ -2569,8 +2510,8 @@ struct depthwise_forward : public primitive { const memory::desc &bias_desc) { error::wrap_c_api(mkldnn_depthwise_forward_desc_init(&data, mkldnn::convert_to_c(aprop_kind), - mkldnn::convert_to_c(alg_kind), - &src_desc.data, &dst_desc.data, + mkldnn::convert_to_c(alg_kind), + &src_desc.data, &dst_desc.data, &weights_desc.data, &bias_desc.data), "could not create a depthwise forward descriptor"); } @@ -2586,16 +2527,15 @@ struct depthwise_forward : public primitive { } }; - struct primitive_desc : public handle { - primitive_desc(const desc &adesc, const engine &aengine) { - mkldnn_primitive_desc_t result; - error::wrap_c_api(mkldnn_primitive_desc_create( - &result, &adesc.data, aengine.get(), nullptr), - "could not create a depthwise forward primitive descriptor"); - reset(result); - } + struct primitive_desc : public mkldnn::primitive_desc { + primitive_desc(const desc &desc, const engine &e) + : mkldnn::primitive_desc(&desc.data, nullptr, e, nullptr) {} - engine get_engine() { return engine::query(*this); } + primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e) + : mkldnn::primitive_desc(&desc.data, &attr, e, nullptr) {} + + REG_QUERY_MPD(src, src, 0); + REG_QUERY_MPD(dst, dst, 0); }; depthwise_forward(const primitive_desc &aprimitive_desc, @@ -2787,12 +2727,12 @@ struct batch_normalization_forward : public primitive { reset(result); } - /// @warning batch_normalization_forward has 2 constructors with very + /// @warning batch_normalization_forward has two constructors with very /// similar signatures: /// - (pd, src, weights, dst, mean, variance) // 2 in, 3 out /// - (pd, src, dst, mean, variance, workspace) // 1 in, 4 out - /// The only way to distinguish between those is to explicitly - /// cast all input parameters to their type, i.e. to + /// The only way to distinguish between them is to explicitly + /// cast all input parameters to their type; that is, to /// const primitive:at &. 
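Given the warning above, selecting the two-input overload explicitly looks like this (a sketch; bn_pd and the memory objects are assumed to exist):

~~~cpp
// Wrapping the inputs in primitive::at picks (pd, src, weights, dst, mean,
// variance) over (pd, src, dst, mean, variance, workspace) unambiguously.
batch_normalization_forward bn(bn_pd,
        primitive::at(src_mem), primitive::at(scale_shift_mem),  // 2 inputs
        dst_mem, mean_mem, variance_mem);                        // 3 outputs
~~~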
batch_normalization_forward(const primitive_desc &aprimitive_desc, const primitive::at &src, const primitive::at &weights, @@ -2840,17 +2780,16 @@ struct batch_normalization_forward : public primitive { reset(result); } - /// @warning batch_normalization_forward has 2 constructors with very + /// @warning batch_normalization_forward has two constructors with very /// similar signatures: /// - (pd, src, weights, dst, mean, variance) // 2 in, 3 out /// - (pd, src, dst, mean, variance, workspace) // 1 in, 4 out - /// The only way to distinguish between those is to explicitly - /// cast all input parameters to their type, i.e. to + /// The only way to distinguish between them is to explicitly + /// cast all input parameters to their type; that is, to /// const primitive:at &. - /// @note to make users' experience a little bit better this constructor - /// checks if whether parameters match corresponding primitive - /// descriptor, and if they are not -- call the other (proper) - /// constructor. Yeah, this is still very ugly... + /// @note To make users' experience a little better, this constructor + /// checks whether parameters match the corresponding primitive + /// descriptor, and if not, calls the other (proper) constructor. batch_normalization_forward(const primitive_desc &aprimitive_desc, const primitive::at &src, const memory &dst, const memory &mean, const memory &variance, const memory &workspace) { @@ -3365,10 +3304,6 @@ struct rnn_backward : public primitive { }; struct primitive_desc : public mkldnn::primitive_desc { - MKLDNN_DEPRECATED - primitive_desc(const desc &desc, const engine &e) - : mkldnn::primitive_desc(&desc.data, nullptr, e, nullptr) {} - primitive_desc(const desc &desc, const engine &e, const rnn_forward::primitive_desc &hint_fwd_pd) : mkldnn::primitive_desc(&desc.data, nullptr, e, hint_fwd_pd.get()) {} @@ -3520,10 +3455,113 @@ struct shuffle_backward : public primitive { /// @} +/// @addtogroup cpp_api_binary_convolution Binary convolution +/// A primitive to compute binary convolution using different algorithms. 
+/// +/// @sa @ref c_api_binary_convolution in @ref c_api +/// @{ + +struct binary_convolution_forward: public primitive { + struct desc { + mkldnn_binary_convolution_desc_t data; + desc(prop_kind aprop_kind, algorithm aalgorithm, + const memory::desc &src_desc, + const memory::desc &weights_desc, + const memory::desc &dst_desc, + const memory::dims strides, + const memory::dims dilates, + const memory::dims padding_l, + const memory::dims padding_r, + const float pad_value) { + memory::validate_dims(strides); + memory::validate_dims(dilates); + memory::validate_dims(padding_l); + memory::validate_dims(padding_r); + error::wrap_c_api( + mkldnn_dilated_binary_convolution_forward_desc_init(&data, + mkldnn::convert_to_c(aprop_kind), convert_to_c(aalgorithm), + &src_desc.data, &weights_desc.data, &dst_desc.data, + &strides[0], &dilates[0], &padding_l[0], &padding_r[0], + pad_value), + "could not create a dilated binary convolution forward descriptor"); + } + }; + + struct primitive_desc : public mkldnn::primitive_desc { + primitive_desc(const desc &desc, const engine &e) + : mkldnn::primitive_desc(&desc.data, nullptr, e, nullptr) {} + + primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e) + : mkldnn::primitive_desc(&desc.data, &attr, e, nullptr) {} + + REG_QUERY_MPD(src, src, 0); + REG_QUERY_MPD(weights, weights, 0); + REG_QUERY_MPD(dst, dst, 0); + }; + + binary_convolution_forward(const primitive_desc &aprimitive_desc, + const primitive::at &src, const primitive::at &weights, const memory &dst) { + mkldnn_primitive_t result; + mkldnn_primitive_at_t inputs[] = { src.data, weights.data }; + const_mkldnn_primitive_t outputs[] = { dst.get() }; + check_num_parameters(aprimitive_desc.get(), 2, 1, + "binary convolution forward"); + error::wrap_c_api(mkldnn_primitive_create(&result, + aprimitive_desc.get(), inputs, outputs), + "could not create a binary convolution forward primitive"); + reset(result); + } +}; + +/// @} + +/// @addtogroup cpp_api_binarization Binarization +/// @{ + +struct binarization_forward : public primitive { + struct desc { + mkldnn_binarization_desc_t data; + + desc(prop_kind aprop_kind, algorithm alg_kind, + const memory::desc &src_desc, const memory::desc &weights_desc, const memory::desc &dst_desc) { + error::wrap_c_api(mkldnn_binarization_forward_desc_init(&data, + mkldnn::convert_to_c(aprop_kind), + mkldnn::convert_to_c(alg_kind), + &src_desc.data, &dst_desc.data, + &weights_desc.data), + "could not create a binarization forward descriptor"); + } + }; + + struct primitive_desc : public handle { + primitive_desc(const desc &adesc, const engine &aengine) { + mkldnn_primitive_desc_t result; + error::wrap_c_api(mkldnn_primitive_desc_create( + &result, &adesc.data, aengine.get(), nullptr), + "could not create a binarization forward primitive descriptor"); + reset(result); + } + + engine get_engine() { return engine::query(*this); } + }; + + binarization_forward(const primitive_desc &aprimitive_desc, + const primitive::at &src, const primitive::at &weights, const memory &dst) { + mkldnn_primitive_t result; + mkldnn_primitive_at_t inputs[] = { src.data, weights.data }; + const_mkldnn_primitive_t outputs[] = { dst.get() }; + error::wrap_c_api(mkldnn_primitive_create(&result, aprimitive_desc.get(), inputs, outputs), + "could not create a binarization forward primitive"); + reset(result); + } +}; + +/// @} + /// @} Primitives /// @addtogroup cpp_api_stream Stream -/// Execution stream operations +/// Execution stream operations. 
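Putting the new classes together, creating the binary convolution primitive mirrors the regular convolution flow. A sketch built from the constructors above, assuming the C++ algorithm enum mirrors the C entry mkldnn_binary_convolution_direct and that descriptors, memories, and eng exist (pad_value is illustrative):

~~~cpp
binary_convolution_forward::desc bconv_d(prop_kind::forward_inference,
        algorithm::binary_convolution_direct,
        src_md, weights_md, dst_md,
        strides, dilates, padding_l, padding_r,
        /* pad_value = */ 0.f);
binary_convolution_forward::primitive_desc bconv_pd(bconv_d, eng);
binary_convolution_forward bconv(bconv_pd, src_mem, weights_mem, dst_mem);
~~~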
/// /// @sa @ref c_api_stream in @ref c_api /// @{ @@ -3580,8 +3618,8 @@ struct stream: public handle { /// Waits for all computations submitted to the stream to complete. /// - /// @param block Specifies whether the operation should wait indefinitely or return - /// immediately. + /// @param block Specifies whether the operation should wait indefinitely or + /// return immediately. /// @returns @c true if all computations completed. /// @returns @c false if not all computations completed. bool wait(bool block = true) { diff --git a/inference-engine/thirdparty/mkl-dnn/include/mkldnn_debug.h b/inference-engine/thirdparty/mkl-dnn/include/mkldnn_debug.h index 568e91f..7ccba0c 100644 --- a/inference-engine/thirdparty/mkl-dnn/include/mkldnn_debug.h +++ b/inference-engine/thirdparty/mkl-dnn/include/mkldnn_debug.h @@ -67,6 +67,7 @@ const char MKLDNN_API *mkldnn_fmt2str(mkldnn_memory_format_t v); const char MKLDNN_API *mkldnn_prop_kind2str(mkldnn_prop_kind_t v); const char MKLDNN_API *mkldnn_prim_kind2str(mkldnn_primitive_kind_t v); const char MKLDNN_API *mkldnn_alg_kind2str(mkldnn_alg_kind_t v); +const char MKLDNN_API *mkldnn_rnn_direction2str(mkldnn_rnn_direction_t v); #ifdef __cplusplus } diff --git a/inference-engine/thirdparty/mkl-dnn/include/mkldnn_types.h b/inference-engine/thirdparty/mkl-dnn/include/mkldnn_types.h index b0ea527..a86eb66 100644 --- a/inference-engine/thirdparty/mkl-dnn/include/mkldnn_types.h +++ b/inference-engine/thirdparty/mkl-dnn/include/mkldnn_types.h @@ -35,6 +35,14 @@ extern "C" { * @addtogroup c_api_types_generic Generic * @{ */ +/** Intel(R) MKL-DNN Version type */ +typedef struct { + int major; + int minor; + int patch; + const char *hash; +} mkldnn_version_t; + /** Status values returned by Intel(R) MKL-DNN functions. */ typedef enum { /** The operation was successful */ @@ -72,6 +80,8 @@ typedef enum { mkldnn_s8 = 5, /** 8-bit unsigned integer. */ mkldnn_u8 = 6, + /** 1-bit integer. */ + mkldnn_bin = 7, } mkldnn_data_type_t; /** Rounding mode */ @@ -88,12 +98,12 @@ typedef enum { * is described as a sequence of the dimensions as they are laid out in the * memory (from the outer-most to the inner-most). Note that this order * doesn't affect the logical order of the dimensions that is kept in the - * `dims` field of mkldnn_memory_desc_t structure. The logical order of the + * `dims` field of the mkldnn_memory_desc_t structure. The logical order of the * dimensions is specified by the type of tensor. * - * For example, CNN 5D tensor always has its logical dimensions in order - * `(batch, channels, depth, height, width)`, while physical layout might - * be #mkldnn_ncdhw or #mkldnn_ndhwc: + * For example, CNN 5D tensor always has its logical dimensions in the order + * `(batch, channels, depth, height, width)`, while the physical layout might be + * #mkldnn_ncdhw or #mkldnn_ndhwc: * * ~~~cpp * int batch = 2, channels = 16, depth = 13, height = 13, width = 13; @@ -109,7 +119,7 @@ typedef enum { * mkldnn_memory_desc_init(&data_in_ndhwc, 5, dims, mlkdnn_ndhwc); * ~~~ * - * The following notation for memory format names: + * The following notation applies to memory format names: * - @c 'n' denotes the mini-batch dimension * - @c 'c' denotes a channels dimension * - When there are multiple channel dimensions (for example, in convolution @@ -119,14 +129,14 @@ typedef enum { * respectively * - Upper-case letters indicate that the data is laid out in blocks * for a particular dimension. 
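Concretely, with 8-element channel blocks the physical offset of logical element (n, c, h, w) can be sketched as follows (dense blocking only, C divisible by 8, no padded channels):

~~~cpp
// Offset of (n, c, h, w) in a blocked layout such as the one named below:
size_t off = ((((size_t)n * (C / 8) + c / 8) * H + h) * W + w) * 8 + c % 8;
~~~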
In such cases, the format name contains both - * upper- and lower-case letters for that dimension with lower-case letter + * upper- and lower-case letters for that dimension with a lower-case letter * preceded by the block size. For example: @c 'mkldnn_nChw8c' describes a * format where the outermost dimension is mini-batch, followed by the * channel block number, followed by the spatial height and width, and * finally followed by 8-element channel blocks. * * @note - * Channel designations can be different. For example: both the @c + * Channel designations can be different. For example, both the @c * 'mkldnn_nc' and @c 'mkldnn_io' formats can be used to describe a 2D * tensor. * @@ -188,6 +198,9 @@ typedef enum { /** 4D weights tensor with physical layout @c ihwo. * Logical dimensions come in the order: (o, i, h, w) */ mkldnn_ihwo, + /** 4D weights tensor with physical layout @c iohw. + * Logical dimensions come in the order: (o, i, h, w) */ + mkldnn_iohw, /** 5D weights tensor with physical layout @c iodhw, used in Caffe. * Logical dimensions come in the order: (o, i, d, h, w) */ mkldnn_oidhw, @@ -205,6 +218,9 @@ typedef enum { * used in TensorFlow. * Logical dimensions come in the order: (g, o, i, h, w) */ mkldnn_hwigo, + /** 5D grouped weights tensor with the physical layout @c giohw. + * Logical dimensions come in the order: (g, o, i, h, w) */ + mkldnn_giohw, /** 6D grouped weights tensor with the physical layout @c goidhw, * used in Caffe. * Logical dimensions come in the order: (g, o, i, d, h, w) */ @@ -235,25 +251,31 @@ typedef enum { * * - For LSTM cells, the gates order is input, forget, candidate * and output gate. - * - For GRU cells, the gates order is update, reset and output gate. */ + * - For GRU cells, the gates order is update, reset and output gate. 
*/ mkldnn_ldgo, /* Opaque data types, are not to be used explicitly */ /* data */ + mkldnn_nCw4c /** blocked data format */, mkldnn_nCw8c /** blocked data format */, mkldnn_nCw16c /** blocked data format */, + mkldnn_nChw4c /** blocked data format */, mkldnn_nChw8c /** blocked data format */, mkldnn_nChw16c /** blocked data format */, + mkldnn_nCdhw4c /** blocked data format */, mkldnn_nCdhw8c /** blocked data format */, mkldnn_nCdhw16c /** blocked data format */, /* weights, 3D */ + mkldnn_Owi4o /** blocked weights format */, + mkldnn_OIw4i4o /** blocked weights format */, mkldnn_Owi8o /** blocked weights format */, mkldnn_OIw8i8o /** blocked weights format */, mkldnn_OIw8o8i /** blocked weights format */, mkldnn_OIw16i16o /** blocked weights format */, mkldnn_OIw16o16i /** blocked weights format */, + mkldnn_Oiw4o /** blocked weights format */, mkldnn_Oiw16o /** blocked weights format */, mkldnn_Owi16o /** blocked weights format */, mkldnn_OIw8i16o2i /** blocked weights format */, @@ -268,6 +290,7 @@ typedef enum { mkldnn_hwio_s8s8, mkldnn_oIhw8i /** blocked weights format */, mkldnn_oIhw16i /** blocked weights format */, + mkldnn_OIhw4i4o /** blocked weights format */, mkldnn_OIhw8i8o /** blocked weights format */, mkldnn_OIhw16i16o /** blocked weights format */, mkldnn_OIhw4i16o4i /** blocked weights format */, @@ -282,8 +305,10 @@ typedef enum { mkldnn_OIhw16o16i /** blocked weights format */, mkldnn_IOhw16o16i /** blocked weights format */, mkldnn_Oihw8o /** blocked weights format */, + mkldnn_Oihw4o /** blocked weights format */, mkldnn_Oihw16o /** blocked weights format */, mkldnn_Ohwi8o /** blocked weights format */, + mkldnn_Ohwi4o /** blocked weights format */, mkldnn_Ohwi16o /** blocked weights format */, mkldnn_OhIw16o4i /** blocked weights format */, mkldnn_OhIw8o4i /** blocked weights format */, @@ -292,25 +317,33 @@ typedef enum { * and containing the values: * O[i:0,OC] = -128 * SUM(j:0,IC;h:0,H;w:0,W)(weights(i,j,h,w))*/ mkldnn_OhIw8o4i_s8s8, + mkldnn_OhIw8o32i /** blocked weights format */, + mkldnn_OhIw16o32i /** blocked weights format */, /* weights, 5D */ mkldnn_oIdhw8i /** blocked weights format */, mkldnn_oIdhw16i /** blocked weights format */, + mkldnn_OIdhw4i4o /** blocked weights format */, + mkldnn_Odhwi4o /** blocked weights format */, mkldnn_OIdhw8i8o /** blocked weights format */, mkldnn_OIdhw8o8i /** blocked weights format */, mkldnn_Odhwi8o /** blocked weights format */, mkldnn_OIdhw16i16o /** blocked weights format */, mkldnn_OIdhw16o16i /** blocked weights format */, + mkldnn_Oidhw4o /** blocked weights format */, mkldnn_Oidhw16o /** blocked weights format */, mkldnn_Odhwi16o /** blocked weights format */, mkldnn_OIdhw8i16o2i /** blocked weights format */, /* weights w/ groups, 4D */ + mkldnn_gOwi4o /** blocked weights format */, + mkldnn_gOIw4i4o /** blocked weights format */, mkldnn_gOwi8o /** blocked weights format */, mkldnn_gOIw8o8i /** blocked weights format */, mkldnn_gOIw8i8o /** blocked weights format */, mkldnn_gOIw16i16o /** blocked weights format */, mkldnn_gOIw16o16i /** blocked weights format */, + mkldnn_gOiw4o /** blocked weights format */, mkldnn_gOiw16o /** blocked weights format */, mkldnn_gOwi16o /** blocked weights format */, mkldnn_gOIw8i16o2i /** blocked weights format */, @@ -323,6 +356,7 @@ typedef enum { * multiplied by number of groups and containing the values: * O[i:0,G*OC] = -128 * SUM(j:0,IC;h:0,H;w:0,W)(weights(i,j,h,w))*/ mkldnn_hwigo_s8s8, + mkldnn_gOIhw4i4o /** blocked weights format */, mkldnn_gOIhw8i8o /** blocked 
weights format */, mkldnn_gOIhw16i16o /** blocked weights format */, mkldnn_gOIhw4i16o4i /** blocked weights format */, @@ -331,17 +365,35 @@ typedef enum { * multiplied by number of groups and containing the values: * O[i:0,G*OC] = -128 * SUM(j:0,IC;h:0,H;w:0,W)(weights(i,j,h,w))*/ mkldnn_gOIhw4i16o4i_s8s8, + mkldnn_gOIhw2i8o4i /** blocked weights format */, + /** blocked weights format with additional buffer + * with size equal to the number of output channels + * multiplied by number of groups and containing the values: + * O[i:0,G*OC] = -128 * SUM(j:0,IC;h:0,H;w:0,W)(weights(i,j,h,w))*/ + mkldnn_gOIhw2i8o4i_s8s8, mkldnn_gOIhw8i16o2i /** blocked weights format */, mkldnn_gOIhw8o16i2o /** blocked weights format */, + mkldnn_gOIhw4o4i /** blocked weights format */, + /** blocked weights format with additional buffer + * with size equal to the number of output channels + * and containing the values: + * O[i:0,OC] = -128 * SUM(j:0,IC;h:0,H;w:0,W)(weights(i,j,h,w))*/ + mkldnn_gOIhw4o4i_s8s8 /** blocked weights format */, mkldnn_gOIhw8o8i /** blocked weights format */, mkldnn_gOIhw16o16i /** blocked weights format */, mkldnn_gIOhw16o16i /** blocked weights format */, mkldnn_gOihw8o /** blocked weights format */, + mkldnn_gOihw4o /** blocked weights format */, mkldnn_gOihw16o /** blocked weights format */, mkldnn_gOhwi8o /** blocked weights format */, + mkldnn_gOhwi4o /** blocked weights format */, mkldnn_gOhwi16o /** blocked weights format */, mkldnn_Goihw8g /** blocked weights format */, mkldnn_Goihw16g /** blocked weights format */, + /** blocked weights format with additional buffer + * with size equal to the number of groups and containing the values: + * O[i:0,G] = -128 * SUM(h:0,H;w:0,W)(weights(i,i,h,w))*/ + mkldnn_Goihw16g_s8s8, mkldnn_gOhIw16o4i /** blocked weights format */, mkldnn_gOhIw8o4i /** blocked weights format */, /** blocked weights format with additional buffer @@ -351,20 +403,21 @@ typedef enum { mkldnn_gOhIw8o4i_s8s8, /* weights w/ groups, 6D */ + mkldnn_gOIdhw4i4o /** blocked weights format */, + mkldnn_gOdhwi4o /** blocked weights format */, mkldnn_gOIdhw8i8o /** blocked weights format */, mkldnn_gOIdhw8o8i /** blocked weights format */, mkldnn_gOdhwi8o /** blocked weights format */, mkldnn_gOIdhw8i16o2i /** blocked weights format */, mkldnn_gOIdhw16i16o /** blocked weights format */, mkldnn_gOIdhw16o16i /** blocked weights format */, + mkldnn_gOidhw4o /** blocked weights format */, mkldnn_gOidhw16o /** blocked weights format */, mkldnn_gOdhwi16o /** blocked weights format */, mkldnn_wino_fmt /** Weights format used in 8bit Winograd convolution */, - /* RNN packed weights */ - mkldnn_ldigo_p /** RNN packed weights (unused) */, - mkldnn_ldgoi_p /** RNN packed weights (unused) */, + mkldnn_rnn_packed /** Packed weights format used in RNN */, /** Just a sentinel, not real memory format. Must be changed after new * format is added. */ @@ -385,9 +438,9 @@ typedef enum { /** Forward data propagation (training mode). In this mode primitives * perform computations necessary for subsequent backward propagation. */ mkldnn_forward_training = 64, - /** Forward data propagation (inference mode). In this mode primitives only - * perform computations that are necessary for inference and omit - * computations that are only necessary for backward propagation. */ + /** Forward data propagation (inference mode). In this mode primitives + * perform only computations that are necessary for inference and omit + * computations that are necessary only for backward propagation. 
*/ mkldnn_forward_inference = 96, /** Forward data propagation (alias for @c mkldnn_forward_inference) */ mkldnn_forward_scoring = mkldnn_forward_inference, @@ -428,8 +481,6 @@ typedef enum { mkldnn_deconvolution, /** An element-wise primitive. */ mkldnn_eltwise, - /** A ReLU primitive. @deprecated */ - mkldnn_relu = mkldnn_eltwise, /** A Softmax primitive. */ mkldnn_softmax, /** A pooling primitive. */ @@ -440,83 +491,95 @@ typedef enum { mkldnn_batch_normalization, /** An inner product primitive. */ mkldnn_inner_product, - /** A convolution primitive merged with ReLU. @deprecated */ - mkldnn_convolution_relu, /** A rnn primitive. */ mkldnn_rnn, /** A ROI pooling primitive. */ mkldnn_roi_pooling, /** An channel-wise primitive. */ mkldnn_depthwise, + /** A binary convolution primitive. */ + mkldnn_binary_convolution, + /** A binarization primitive. */ + mkldnn_binarization, } mkldnn_primitive_kind_t; /** Kinds of algorithms. */ typedef enum { mkldnn_alg_kind_undef, /** Direct convolution */ - mkldnn_convolution_direct = 1, + mkldnn_convolution_direct = 0x1, /** Winograd convolution */ - mkldnn_convolution_winograd = 2, + mkldnn_convolution_winograd = 0x2, + /** Convolution algorithm(either direct or Winograd) is chosen just in time **/ + mkldnn_convolution_auto = 0x3, + /** Direct deconvolution */ + mkldnn_deconvolution_direct = 0xa, + /** Winograd deconvolution */ + mkldnn_deconvolution_winograd = 0xb, /** Eltwise: ReLU */ - mkldnn_eltwise_relu = 8, + mkldnn_eltwise_relu = 0x1f, /** Eltwise: hyperbolic tangent non-linearity (tanh) */ - mkldnn_eltwise_tanh = 9, + mkldnn_eltwise_tanh = 0x2f, /** Eltwise: parametric exponential linear unit (elu) */ - mkldnn_eltwise_elu = 10, + mkldnn_eltwise_elu = 0x3f, /** Eltwise: square */ - mkldnn_eltwise_square = 11, + mkldnn_eltwise_square = 0x4f, /** Eltwise: abs */ - mkldnn_eltwise_abs = 12, + mkldnn_eltwise_abs = 0x5f, /** Eltwise: square root */ - mkldnn_eltwise_sqrt = 13, + mkldnn_eltwise_sqrt = 0x6f, /** Eltwise: linear */ - mkldnn_eltwise_linear = 14, + mkldnn_eltwise_linear = 0x7f, /** Eltwise: bounded_relu */ - mkldnn_eltwise_bounded_relu = 15, + mkldnn_eltwise_bounded_relu = 0x8f, /** Eltwise: soft_relu */ - mkldnn_eltwise_soft_relu = 16, + mkldnn_eltwise_soft_relu = 0x9f, /** Eltwise: logistic */ - mkldnn_eltwise_logistic = 17, + mkldnn_eltwise_logistic = 0xaf, /** Eltwise: clamp */ - mkldnn_eltwise_clamp = 18, + mkldnn_eltwise_clamp = 0xbf, + /** Eltwise: exp */ + mkldnn_eltwise_exp = 0xcf, + /** Eltwise: not */ + mkldnn_eltwise_not = 0xdf, /** Max pooling */ - mkldnn_pooling_max = 34, + mkldnn_pooling_max = 0x1ff, /** Average pooling include padding */ - mkldnn_pooling_avg_include_padding = 40, + mkldnn_pooling_avg_include_padding = 0x2ff, /** Average pooling exclude padding */ - mkldnn_pooling_avg_exclude_padding = 41, + mkldnn_pooling_avg_exclude_padding = 0x3ff, mkldnn_pooling_avg = mkldnn_pooling_avg_exclude_padding, /** Local response normalization (LRN) across multiple channels */ - mkldnn_lrn_across_channels = 65, + mkldnn_lrn_across_channels = 0xaff, /** LRN within a single channel */ - mkldnn_lrn_within_channel = 66, - /** Direct deconvolution */ - mkldnn_deconvolution_direct = 71, - /** Winograd deconvolution */ - mkldnn_deconvolution_winograd = 72, + mkldnn_lrn_within_channel = 0xbff, /** RNN cell */ - mkldnn_vanilla_rnn = 80, + mkldnn_vanilla_rnn = 0x1fff, /** LSTM cell */ - mkldnn_vanilla_lstm = 81, + mkldnn_vanilla_lstm = 0x2fff, /** GRU cell */ - mkldnn_vanilla_gru = 82, + mkldnn_vanilla_gru = 0x3fff, /** GRU cell with 
linear before reset * * Modification of original GRU cell. Differs from #mkldnn_vanilla_gru * in how the new memory gate is calculated: - * \f[ c_t = tanh(W_c*x_t + b_{c_h} + r_t*(U_c*h_{t-1}+b_{c_h})) \f] + * \f[ c_t = tanh(W_c*x_t + b_{c_x} + r_t*(U_c*h_{t-1}+b_{c_h})) \f] * Primitive expects 4 biases on input: * \f$[b_{u}, b_{r}, b_{c_x}, b_{c_h}]\f$ * */ - mkldnn_gru_linear_before_reset = 83, - /** Depthwise: scale_shift */ - mkldnn_depthwise_scale_shift = 100, - /** Depthwise: prelu */ - mkldnn_depthwise_prelu = 101, + mkldnn_gru_linear_before_reset = 0x4fff, /** ROI max pooling **/ - mkldnn_roi_pooling_max = 128, + mkldnn_roi_pooling_max = 0xafff, /** ROI pooling with bilinear interpolation**/ - mkldnn_roi_pooling_bilinear = 129 + mkldnn_roi_pooling_bilinear = 0xbfff, + /** Depthwise: scale_shift */ + mkldnn_depthwise_scale_shift = 0x1ffff, + /** Depthwise: prelu */ + mkldnn_depthwise_prelu = 0x2ffff, + /** Direct binary convolution */ + mkldnn_binary_convolution_direct = 0x1fffff, + /** Depthwise binarization */ + mkldnn_binarization_depthwise = 0xafffff } mkldnn_alg_kind_t; /** Flags for batch-normalization primititve. */ @@ -547,15 +610,6 @@ typedef enum { * same behavior as prop_kind == #mkldnn_backward */ mkldnn_use_scaleshift = 0x2U, - /** Omit statistics - * - * @deprecated use #mkldnn_use_global_stats instead - * - * For time being had an affect on backward propagation only which allowed - * skipping some computations (the same semantics as - * #mkldnn_use_global_stats) - */ - mkldnn_omit_stats = mkldnn_use_global_stats, /** Fuse with ReLU * * If specified: @@ -578,7 +632,7 @@ typedef enum { #define TENSOR_MAX_DIMS 12 /** A type to describe tensor dimensions. */ -typedef int mkldnn_dims_t[TENSOR_MAX_DIMS]; +typedef ptrdiff_t mkldnn_dims_t[TENSOR_MAX_DIMS]; /** A type to describe strides within a tensor. */ typedef ptrdiff_t mkldnn_strides_t[TENSOR_MAX_DIMS]; @@ -627,6 +681,27 @@ typedef struct { size_t size; } mkldnn_wino_desc_t; +typedef enum { + mkldnn_packed_format_undef = 0, + mkldnn_ldigo_p, + mkldnn_ldgoi_p +} mkldnn_rnn_packed_memory_format_t; + +/* Maximum number of parts of RNN weights tensor that require separate + * computation. */ +#define MKLDNN_RNN_MAX_N_PARTS 4 + +/** Description of tensor of packed weights for rnn. */ +typedef struct { + mkldnn_rnn_packed_memory_format_t format; + int n_parts; + int n; + int parts[MKLDNN_RNN_MAX_N_PARTS]; + size_t part_pack_size[MKLDNN_RNN_MAX_N_PARTS]; + size_t offset_compensation; + size_t size; +} mkldnn_rnn_packed_desc_t; + /** @addtogroup c_api_types_op_descs Operation descriptors * @{*/ @@ -640,7 +715,7 @@ typedef const void *const_mkldnn_op_desc_t; * format. Additionally, contains format-specific descriptions of the data * layout. */ typedef struct { - /** The kind of primitive. Used for self identifying the primitive + /** The kind of primitive. Used for self-identifying the primitive * descriptor. Must be #mkldnn_memory. */ mkldnn_primitive_kind_t primitive_kind; /** Number of dimensions */ @@ -657,8 +732,8 @@ typedef struct { * * @note * The order of dimensions does not depend on the memory format, so - * no matter whether the data is laid in #mkldnn_nchw or #mkldnn_nhwc - * the dims for 4D CN data tensor would be {N, C, H, W} + * whether the data is laid out in #mkldnn_nchw or #mkldnn_nhwc + * the dims for 4D CN data tensor would be {N, C, H, W}. */ mkldnn_dims_t dims; /** Data type of the tensor elements. 
*/ @@ -671,6 +746,8 @@ typedef struct { mkldnn_blocking_desc_t blocking; /** Tensor of weights for integer 8bit winograd convolution. */ mkldnn_wino_desc_t wino_desc; + /** Tensor of packed weights for RNN. */ + mkldnn_rnn_packed_desc_t rnn_packed_desc; /* ... other descriptions possible */ } layout_desc; } mkldnn_memory_desc_t; @@ -679,7 +756,7 @@ typedef struct { /** A descriptor of a convolution operation. */ typedef struct { - /** The kind of primitive. Used for self identifying the primitive + /** The kind of primitive. Used for self-identifying the primitive * descriptor. Must be #mkldnn_convolution. */ mkldnn_primitive_kind_t primitive_kind; /** The kind of propagation. Possible values: #mkldnn_forward_training, @@ -724,13 +801,13 @@ typedef mkldnn_convolution_desc_t mkldnn_deconvolution_desc_t; /** A descriptor of a shuffle operation. */ typedef struct { - /** The kind of primitive. Used for self identifying the primitive + /** The kind of primitive. Used for self-identifying the primitive * descriptor. Must be #mkldnn_convolution. */ mkldnn_primitive_kind_t primitive_kind; /** The kind of propagation. Possible values: #mkldnn_forward_training, - * #mkldnn_forward_inference, #mkldnn_backward_data*/ + * #mkldnn_forward_inference, and #mkldnn_backward_data. */ mkldnn_prop_kind_t prop_kind; - /** Source and destination memory descriptor. + /** Source and destination memory descriptor, * and source and destination gradient memory descriptor. */ mkldnn_memory_desc_t data_desc; /** axis for shuffling. */ @@ -741,7 +818,7 @@ typedef struct { /** A descriptor of a element-wise operation. */ typedef struct { - /** The kind of primitive. Used for self identifying the primitive + /** The kind of primitive. Used for self-identifying the primitive * descriptor. Must be #mkldnn_eltwise. */ mkldnn_primitive_kind_t primitive_kind; /** The kind of propagation. Possible values: #mkldnn_forward_training, @@ -751,7 +828,7 @@ typedef struct { /** The kind of eltwise algorithm. Possible values: #mkldnn_eltwise_relu, * #mkldnn_eltwise_tanh, #mkldnn_eltwise_elu, #mkldnn_eltwise_square, * #mkldnn_eltwise_abs, #mkldnn_eltwise_sqrt, #mkldnn_eltwise_linear, - * #mkldnn_eltwise_bounded_relu, #mkldnn_eltwise_soft_relu, + * #mkldnn_eltwise_bounded_relu, #mkldnn_eltwise_soft_relu, and * #mkldnn_eltwise_logistic. */ mkldnn_alg_kind_t alg_kind; /** Source and destination memory descriptor. */ @@ -772,10 +849,6 @@ typedef struct { * - #mkldnn_eltwise_logistic: @p alpha and @p beta ignored */ float alpha, beta; - /** ReLU scaling factor for negative values. - * @deprecated: use alpha instead - * @warning: read-only value */ - float negative_slope; } mkldnn_eltwise_desc_t; /** A descriptor of a channel-wise operation. */ @@ -790,25 +863,22 @@ typedef struct { /** The kind of depthwise algorithm. Possible values: #mkldnn_depthwise_scale_shift * #mkldnn_depthwise_prelu */ mkldnn_alg_kind_t alg_kind; - /** Source memory descriptor. */ - mkldnn_memory_desc_t src_desc; - /** Destination memory descriptor. */ - mkldnn_memory_desc_t dst_desc; + /** Source memory descriptor. */ + mkldnn_memory_desc_t src_desc; + /** Destination memory descriptor. */ + mkldnn_memory_desc_t dst_desc; /** Weights memory descriptor. */ mkldnn_memory_desc_t weights_desc; /** Bias memory descriptor. */ mkldnn_memory_desc_t bias_desc; } mkldnn_depthwise_desc_t; -/* @deprecated: use mkldnn_eltwise_desc_t */ -typedef mkldnn_eltwise_desc_t mkldnn_relu_desc_t; - /** A descriptor of a Softmax operation. */ typedef struct { - /** The kind of primitive. 
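With the read-only negative_slope field gone from mkldnn_eltwise_desc_t above, C callers pass the slope through alpha at init time. A sketch, with src_md and slope assumed and error handling elided:

~~~cpp
mkldnn_eltwise_desc_t ed;
/* alpha now carries what negative_slope used to; beta is ignored for relu */
mkldnn_status_t s = mkldnn_eltwise_forward_desc_init(&ed,
        mkldnn_forward_inference, mkldnn_eltwise_relu, &src_md, slope, 0.f);
~~~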
Used for self identifying the primitive + /** The kind of primitive. Used for self-identifying the primitive * descriptor. Must be #mkldnn_softmax. */ mkldnn_primitive_kind_t primitive_kind; - /** The kind of propagation. Possible values: #mkldnn_forward_training, + /** The kind of propagation. Possible values: #mkldnn_forward_training and * #mkldnn_forward_inference. */ mkldnn_prop_kind_t prop_kind; /** Source and destination memory descriptor. */ @@ -821,14 +891,14 @@ typedef struct { /** A descriptor of a pooling operation. */ typedef struct { - /** The kind of primitive. Used for self identifying the primitive + /** The kind of primitive. Used for self-identifying the primitive * descriptor. Must be #mkldnn_pooling. */ mkldnn_primitive_kind_t primitive_kind; /** The kind of propagation. Possible values: #mkldnn_forward_training, * #mkldnn_forward_inference, #mkldnn_backward, and #mkldnn_backward_data. */ mkldnn_prop_kind_t prop_kind; - /** The kind of pooling algorithm. Possible values: #mkldnn_pooling_max, + /** The kind of pooling algorithm. Possible values: #mkldnn_pooling_max and * #mkldnn_pooling_avg. */ mkldnn_alg_kind_t alg_kind; /** Source memory descriptor. */ @@ -855,14 +925,14 @@ typedef struct { /** A descriptor of a Local Response Normalization (LRN) operation. */ typedef struct { - /** The kind of primitive. Used for self identifying the primitive + /** The kind of primitive. Used for self-identifying the primitive * descriptor. Must be #mkldnn_lrn. */ mkldnn_primitive_kind_t primitive_kind; /** The kind of propagation. Possible values: #mkldnn_forward_training, * #mkldnn_forward_inference, #mkldnn_backward, and #mkldnn_backward_data. */ mkldnn_prop_kind_t prop_kind; - /** LRN algorithm. Possible values #mkldnn_lrn_within_channel or + /** LRN algorithm. Possible values: #mkldnn_lrn_within_channel and * #mkldnn_lrn_across_channels. */ mkldnn_alg_kind_t alg_kind; /** Source and destination memory descriptor. */ @@ -882,7 +952,7 @@ typedef struct { /** A descriptor of a Batch Normalization operation. */ typedef struct { - /** The kind of primitive. Used for self identifying the primitive + /** The kind of primitive. Used for self-identifying the primitive * descriptor. Must be #mkldnn_batch_normalization. */ mkldnn_primitive_kind_t primitive_kind; /** The kind of propagation. Possible values: #mkldnn_forward_training, @@ -913,7 +983,7 @@ typedef struct { /** A descriptor of an inner product operation. */ typedef struct { - /** The kind of primitive. Used for self identifying the primitive + /** The kind of primitive. Used for self-identifying the primitive * descriptor. Must be #mkldnn_inner_product. */ mkldnn_primitive_kind_t primitive_kind; /** The kind of propagation. Possible values: #mkldnn_forward_training, @@ -940,18 +1010,6 @@ typedef struct { mkldnn_data_type_t accum_data_type; } mkldnn_inner_product_desc_t; -/** A descriptor of a convolution followed by relu operation. */ -typedef struct { - /** The kind of primitive. Used for self identifying the primitive - * descriptor. Must be #mkldnn_convolution_relu. */ - mkldnn_primitive_kind_t primitive_kind; - /** A descriptor of a convolution operation. */ - mkldnn_convolution_desc_t convolution_desc; - /** Scaling factor for negative values, stored as float-precision but - * interpreted in a way specific to the data type in each implementation */ - float negative_slope; -} mkldnn_convolution_relu_desc_t; - /** Flags for RNN cell. 
*/ typedef enum { mkldnn_rnn_cell_with_relu = 0x1U, @@ -960,23 +1018,23 @@ typedef enum { typedef struct { /** RNN cell kind. Must be one of #mkldnn_vanilla_rnn, - * #mkldnn_vanilla_lstm, #mkldnn_vanilla_gru + * #mkldnn_vanilla_lstm, #mkldnn_vanilla_gru, * or #mkldnn_gru_linear_before_reset. */ mkldnn_alg_kind_t cell_kind; - /** Activation function used. Must be one of #mkldnn_eltwise_relu, + /** Activation function used. Must be either #mkldnn_eltwise_relu or * #mkldnn_eltwise_tanh. */ mkldnn_alg_kind_t activation_kind; /** RNN cell flags */ unsigned int flags; - /** alpha is a negative slope parameter (used only if - * (flags & #mkldnn_rnn_cell_with_relu) != 0) */ + /** @c alpha is a negative slope parameter (used only if + * `(flags & #mkldnn_rnn_cell_with_relu) != 0`) */ float alpha; /** clipping parameter (used only if - * (flags & #mkldnn_rnn_cell_with_clipping) != 0) */ + * `(flags & #mkldnn_rnn_cell_with_clipping) != 0`) */ float clipping; } mkldnn_rnn_cell_desc_t; -/** A direction of RNN primitive execution */ +/** A direction of RNN primitive execution. */ typedef enum { /* Unidirectional execution of RNN primitive from left to right. */ mkldnn_unidirectional_left2right, @@ -991,13 +1049,13 @@ typedef enum { mkldnn_unidirectional = mkldnn_unidirectional_left2right, } mkldnn_rnn_direction_t; -/** A descriptor for an rnn operation */ +/** A descriptor for an RNN operation. */ typedef struct { - /** The kind of primitive. Used for self identifying the primitive + /** The kind of primitive. Used for self-identifying the primitive * descriptor. Must be #mkldnn_rnn. */ mkldnn_primitive_kind_t primitive_kind; /** The kind of propagation. Possible values: #mkldnn_forward_training, - * #mkldnn_forward_inference, #mkldnn_backward. */ + * #mkldnn_forward_inference, and #mkldnn_backward. */ mkldnn_prop_kind_t prop_kind; /** The RNN cell desc. */ mkldnn_rnn_cell_desc_t cell_desc; @@ -1053,6 +1111,56 @@ typedef struct { mkldnn_alg_kind_t alg_kind; } mkldnn_roi_pooling_desc_t; +/** A descriptor of a binary convolution operation. */ +typedef struct { + /** The kind of primitive. Used for self identifying the primitive + * descriptor. Must be #mkldnn_binary_convolution. */ + mkldnn_primitive_kind_t primitive_kind; + /** The kind of propagation. Possible values: #mkldnn_forward_training, + * #mkldnn_forward_inference */ + mkldnn_prop_kind_t prop_kind; + /** The kind of the binary convolution algorithm. Possible values: + * #mkldnn_binary_convolution_direct. */ + mkldnn_alg_kind_t alg_kind; + /** Source memory descriptor. */ + mkldnn_memory_desc_t src_desc; + /** Weights memory descriptor. */ + mkldnn_memory_desc_t weights_desc; + /** Destination memory descriptor. */ + mkldnn_memory_desc_t dst_desc; + /** Convolution strides in each spatial dimension. */ + mkldnn_dims_t strides; + /** Convolution dilates in each spatial dimension. */ + mkldnn_dims_t dilates; + /** Padding in each spatial dimension. padding[0] is a padding in the + * beginning (@p padding_l), padding[1] is a padding in the end (@p + * padding_r). */ + mkldnn_dims_t padding[2]; + /** The accumulator data type. Initialized automatically. */ + mkldnn_data_type_t accum_data_type; + /** Logic value of elements in padding area */ + float pad_value; +} mkldnn_binary_convolution_desc_t; + +/** A descriptor of a binarization operation. */ +typedef struct { + /** The kind of primitive. Used for self identifying the primitive + * descriptor. Must be #mkldnn_binarization. 
*/ + mkldnn_primitive_kind_t primitive_kind; + /** The kind of propagation. Possible values: #mkldnn_forward_training, + * #mkldnn_forward_inference, #mkldnn_backward, and #mkldnn_backward_data. + */ + mkldnn_prop_kind_t prop_kind; + /** The kind of binarization algorithm. Possible values: #mkldnn_binarization_depthwise */ + mkldnn_alg_kind_t alg_kind; + /** Source memory descriptor. */ + mkldnn_memory_desc_t src_desc; + /** Destination memory descriptor. */ + mkldnn_memory_desc_t dst_desc; + /** Weights memory descriptor. */ + mkldnn_memory_desc_t weights_desc; +} mkldnn_binarization_desc_t; + /** @} */ /** @addtogroup c_api_engine_types Engine @@ -1083,7 +1191,7 @@ typedef const struct mkldnn_engine *const_mkldnn_engine_t; * @{ */ /** @struct mkldnn_primitive_desc_iterator - * @brief An opaque structure to describe a primitive descriptor iterator . */ + * @brief An opaque structure to describe a primitive descriptor iterator. */ struct mkldnn_primitive_desc_iterator; /** @brief A primitive descriptor iterator handle. */ @@ -1100,7 +1208,7 @@ typedef const struct mkldnn_primitive_desc_iterator * @{ */ /** @struct mkldnn_primitive_desc - * @brief An opaque structure to describe a primitive descriptor . */ + * @brief An opaque structure to describe a primitive descriptor. */ struct mkldnn_primitive_desc; /** @brief A primitive descriptor handle. */ @@ -1138,12 +1246,12 @@ typedef const struct mkldnn_primitive_attr *const_mkldnn_primitive_attr_t; * * Post operations might be combined together, making a chain of post * operations. For instance one can configure convolution followed by - * accumulation followed by eltwise (relu). This might be especially beneficial + * accumulation followed by eltwise. This might be especially beneficial * for residual learning blocks. * * @warning - * Of course not all the combinations are supported, so user should handle - * error accordingly. + * Of course not all combinations are supported, so the user should handle + * errors accordingly. * * Supported post operations: * - accumulation (base primitive: convolution) @@ -1185,8 +1293,8 @@ typedef struct { /** Primitive descriptor query specification * - * For generic function mkldnn_primitive_desc_query() the type of result must - * be agreed with queried argument. The correspondence table: + * For generic function mkldnn_primitive_desc_query(), the type of result must + * agree with the queried argument. The correspondence table: * Query | type of result * -------------------------------------------------------------- * #mkldnn_query_engine | mkldnn_engine_t * @@ -1205,10 +1313,10 @@ typedef struct { * reference. All numbers are returned by value. * * @warning - * All returned references point to constant objects and valid only during - * the lifetime of queried primitive descriptor. Returned objects must not - * be destroyed by user. If there is a need to keep the object longer than - * a lifetime of queried primitive descriptor use + * All returned references point to constant objects and are valid only + * during the lifetime of the queried primitive descriptor. Returned objects + * must not be destroyed by the user. If you need to keep the object longer + * than the lifetime of the queried primitive descriptor, use * mkldnn_primitive_desc_clone() to make a copy. 
*/ typedef enum { mkldnn_query_undef = 0, /**< no query */ @@ -1234,16 +1342,16 @@ typedef enum { mkldnn_query_deconvolution_d, /**< deconvolution descriptor */ mkldnn_query_shuffle_d, /**< shuffle descriptor */ mkldnn_query_eltwise_d, /**< eltwise descriptor */ - mkldnn_query_relu_d = mkldnn_query_eltwise_d, /**< @deprecated */ mkldnn_query_softmax_d, /**< softmax descriptor */ mkldnn_query_pooling_d, /**< pooling descriptor */ mkldnn_query_lrn_d, /**< lrn descriptor */ mkldnn_query_batch_normalization_d, /**< batch normalization descriptor */ mkldnn_query_inner_product_d, /**< inner product descriptor */ - mkldnn_query_convolution_relu_d, /**< @deprecated */ mkldnn_query_rnn_d, /**< rnn descriptor */ mkldnn_query_roi_pooling_d, /**< roi descriptor */ mkldnn_query_depthwise_d, /**< eltwise descriptor */ + mkldnn_query_binary_convolution_d, /**< binary convolution descriptor */ + mkldnn_query_binarization_d, /**< binarization descriptor */ /* (memory) primitive descriptor section */ mkldnn_query_some_pd = 128, /**< stub */ diff --git a/inference-engine/thirdparty/mkl-dnn/include/mkldnn_version.h.in b/inference-engine/thirdparty/mkl-dnn/include/mkldnn_version.h.in new file mode 100644 index 0000000..5ee0126 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/include/mkldnn_version.h.in @@ -0,0 +1,32 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef MKLDNN_VERSION_H +#define MKLDNN_VERSION_H + +/* Major version of MKL-DNN */ +#define MKLDNN_VERSION_MAJOR @MKLDNN_VERSION_MAJOR@ + +/* Minor version of MKL-DNN */ +#define MKLDNN_VERSION_MINOR @MKLDNN_VERSION_MINOR@ + +/* Patch version of MKL-DNN */ +#define MKLDNN_VERSION_PATCH @MKLDNN_VERSION_PATCH@ + +/* Git Commit Hash of MKL-DNN */ +#define MKLDNN_VERSION_HASH "@MKLDNN_VERSION_HASH@" + +#endif diff --git a/inference-engine/thirdparty/mkl-dnn/scripts/generate_mkldnn_debug.py b/inference-engine/thirdparty/mkl-dnn/scripts/generate_mkldnn_debug.py index 9c53536..4f6efe2 100644 --- a/inference-engine/thirdparty/mkl-dnn/scripts/generate_mkldnn_debug.py +++ b/inference-engine/thirdparty/mkl-dnn/scripts/generate_mkldnn_debug.py @@ -120,7 +120,6 @@ def maybe_skip(enum): 'mkldnn_batch_normalization_flag_t', 'mkldnn_wino_memory_format_t', 'mkldnn_rnn_cell_flags_t', - 'mkldnn_rnn_direction_t', 'mkldnn_engine_kind_t', 'mkldnn_query_t', 'mkldnn_stream_kind_t', @@ -136,6 +135,7 @@ def enum_abbrev(enum): 'mkldnn_prop_kind_t': 'prop_kind', 'mkldnn_primitive_kind_t': 'prim_kind', 'mkldnn_alg_kind_t': 'alg_kind', + 'mkldnn_rnn_direction_t': 'rnn_direction', }.get(enum, enum) diff --git a/inference-engine/thirdparty/mkl-dnn/scripts/prepare_mkl.bat b/inference-engine/thirdparty/mkl-dnn/scripts/prepare_mkl.bat index 48979c3..04939a9 100644 --- a/inference-engine/thirdparty/mkl-dnn/scripts/prepare_mkl.bat +++ b/inference-engine/thirdparty/mkl-dnn/scripts/prepare_mkl.bat @@ -18,8 +18,8 @@ rem ============================================================================ rem req: PowerShell 3.0+ powershell.exe -command "if ($PSVersionTable.PSVersion.Major -ge 3) {exit 1} else {Write-Host \"The script requires PowerShell 3.0 or above (current version: $($PSVersionTable.PSVersion.Major).$($PSVersionTable.PSVersion.Minor))\"}" && goto Error_load -set MKLURLROOT=https://github.com/intel/mkl-dnn/releases/download/v0.17-rc/ -set MKLVERSION=2019.0.1.20180928 +set MKLURLROOT=https://github.com/intel/mkl-dnn/releases/download/v0.18-rc/ +set MKLVERSION=2019.0.3.20190125 set MKLPACKAGE=mklml_win_%MKLVERSION% diff --git a/inference-engine/thirdparty/mkl-dnn/scripts/prepare_mkl.sh b/inference-engine/thirdparty/mkl-dnn/scripts/prepare_mkl.sh index 27115ef..3e2e39d 100644 --- a/inference-engine/thirdparty/mkl-dnn/scripts/prepare_mkl.sh +++ b/inference-engine/thirdparty/mkl-dnn/scripts/prepare_mkl.sh @@ -15,8 +15,8 @@ # limitations under the License. #=============================================================================== -MKLURLROOT="https://github.com/intel/mkl-dnn/releases/download/v0.17-rc/" -MKLVERSION="2019.0.1.20180928" +MKLURLROOT="https://github.com/intel/mkl-dnn/releases/download/v0.18-rc/" +MKLVERSION="2019.0.3.20190125" os=`uname` if [ "$os" = "Linux" ]; then diff --git a/inference-engine/thirdparty/mkl-dnn/src/CMakeLists.txt b/inference-engine/thirdparty/mkl-dnn/src/CMakeLists.txt index 83ed499..f10feb2 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/CMakeLists.txt +++ b/inference-engine/thirdparty/mkl-dnn/src/CMakeLists.txt @@ -14,9 +14,8 @@ # limitations under the License. 
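The generated header makes compile-time gating on the library version possible, for example:

~~~cpp
#include "mkldnn_version.h"

#if MKLDNN_VERSION_MAJOR == 0 && MKLDNN_VERSION_MINOR >= 18
/* e.g., rely on the v0.18-era format enums introduced in this patch */
#endif
~~~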
#=============================================================================== -set(TARGET_NAME ${LIB_NAME}) - file(GLOB_RECURSE HEADERS + ${PROJECT_BINARY_DIR}/include/*.h ${CMAKE_CURRENT_SOURCE_DIR}/../include/*.h ${CMAKE_CURRENT_SOURCE_DIR}/../include/*.hpp ) @@ -27,8 +26,10 @@ file(GLOB_RECURSE SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp ) include_directories( + ${PROJECT_BINARY_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/common + ${CMAKE_CURRENT_SOURCE_DIR}/cpu ${CMAKE_CURRENT_SOURCE_DIR}/cpu/xbyak ) @@ -88,28 +89,68 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") endif() endif() -add_library(${TARGET_NAME} ${MKLDNN_LIBRARY_TYPE} ${HEADERS} ${SOURCES}) +add_library(${LIB_NAME} ${MKLDNN_LIBRARY_TYPE} ${HEADERS} ${SOURCES}) +set_property(TARGET ${LIB_NAME} PROPERTY CXX_STANDARD 11) +set_property(TARGET ${LIB_NAME} PROPERTY CXX_STANDARD_REQUIRED ON) +set_property(TARGET ${LIB_NAME} PROPERTY VERSION "${PROJECT_VERSION}.0") +set_property(TARGET ${LIB_NAME} PROPERTY SOVERSION "0") +set_property(TARGET ${LIB_NAME} PROPERTY PUBLIC_HEADER ${HEADERS}) + +target_include_directories(${LIB_NAME} PUBLIC + $ + $ + # $ is required for compatibility with cmake 2.8 + $/${CMAKE_INSTALL_INCLUDEDIR}> + ) -#Add mkldnn.dll to execution PATH -if(NOT(MINGW)) - set(CTESTCONFIG_PATH "${CTESTCONFIG_PATH}\;${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}" PARENT_SCOPE) -else() - # CMake with "MSYS Makefiles" generator seems to build libmkldnn.dll in a directory without build type. - set(CTESTCONFIG_PATH "${CTESTCONFIG_PATH}\;${CMAKE_CURRENT_BINARY_DIR}" PARENT_SCOPE) +target_link_libraries_private(${LIB_NAME} + "${EXTRA_SHARED_LIBS};${EXTRA_STATIC_LIBS}") +target_link_libraries_public(${LIB_NAME} "${EXTRA_SHARED_LIBS}") +if(MKLDNN_LIBRARY_TYPE STREQUAL "STATIC") + target_link_libraries_public(${LIB_NAME} "${EXTRA_STATIC_LIBS}") endif() -target_link_libraries(${TARGET_NAME} ${${TARGET_NAME}_LINKER_LIBS} ${EXTRA_LIBS}) -set_property(TARGET ${TARGET_NAME} PROPERTY CXX_STANDARD 11) -set_property(TARGET ${TARGET_NAME} PROPERTY CXX_STANDARD_REQUIRED ON) -set_property(TARGET ${TARGET_NAME} PROPERTY VERSION "${PROJECT_VERSION}.0") -set_property(TARGET ${TARGET_NAME} PROPERTY SOVERSION "0") -if(MINGW) - # We need to install *.dll into bin/ and *.a into lib/. 
- install(TARGETS ${TARGET_NAME} - RUNTIME DESTINATION bin - ARCHIVE DESTINATION lib${LIB_SUFFIX} - ) -else() - install(TARGETS ${TARGET_NAME} DESTINATION lib${LIB_SUFFIX}) +set(LIB_EXPORT_NAME "${LIB_NAME}-targets") +install(TARGETS ${LIB_NAME} + EXPORT "${LIB_EXPORT_NAME}" + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + PUBLIC_HEADER DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}") + +# Write version and package config files +set(LIB_CONFIG_GENERATE_DIR "${CMAKE_CURRENT_BINARY_DIR}/generated") +set(LIB_CONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${LIB_NAME}") +set(LIB_VERSION_FILE + "${LIB_CONFIG_GENERATE_DIR}/${LIB_NAME}-config-version.cmake") +set(LIB_CONFIG_FILE + "${LIB_CONFIG_GENERATE_DIR}/${LIB_NAME}-config.cmake") +write_basic_package_version_file( + "${LIB_VERSION_FILE}" + VERSION ${PROJECT_VERSION} + COMPATIBILITY SameMajorVersion) +configure_package_config_file( + "../cmake/config.cmake.in" + "${LIB_CONFIG_FILE}" + INSTALL_DESTINATION ${LIB_CONFIG_INSTALL_DIR}) +install(FILES ${LIB_CONFIG_FILE} ${LIB_VERSION_FILE} + DESTINATION ${LIB_CONFIG_INSTALL_DIR}) +string(TOUPPER "${LIB_NAME}::" LIB_NAMESPACE) +install(EXPORT ${LIB_EXPORT_NAME} + NAMESPACE ${LIB_NAMESPACE} + DESTINATION ${LIB_CONFIG_INSTALL_DIR}) + +# On Windows we need to add mkldnn.dll path to CTESTCONFIG_PATH which is later +# passed to ctest and Visual Studio solutions +if(WIN32) + if(NOT(MINGW)) + foreach(BUILD_TYPE Release Debug RelWithDebInfo MinSizeRel) + append_to_windows_path_list(CTESTCONFIG_PATH + "${CMAKE_CURRENT_BINARY_DIR}/${BUILD_TYPE}") + endforeach() + else() + append_to_windows_path_list(CTESTCONFIG_PATH + "${CMAKE_CURRENT_BINARY_DIR}") + endif() + set(CTESTCONFIG_PATH "${CTESTCONFIG_PATH}" PARENT_SCOPE) endif() -install(FILES ${HEADERS} DESTINATION include) diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/batch_normalization_pd.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/batch_normalization_pd.hpp index 96f9cf9..bd04302 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/batch_normalization_pd.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/batch_normalization_pd.hpp @@ -63,7 +63,7 @@ struct batch_normalization_pd_t: public primitive_desc_t { inline bool use_scaleshift() const { return desc_.flags & mkldnn_use_scaleshift; } - inline bool omit_stats() const { return desc_.flags & mkldnn_omit_stats; } + inline bool use_global_stats() const { return desc_.flags & mkldnn_use_global_stats; } inline bool is_training() const { return desc_.prop_kind == prop_kind::forward_training; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/binarization.cpp b/inference-engine/thirdparty/mkl-dnn/src/common/binarization.cpp new file mode 100644 index 0000000..f6ab0c0 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/common/binarization.cpp @@ -0,0 +1,66 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
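With the export set installed above, downstream projects can consume the library through CMake's package mechanism instead of hand-written find modules. Assuming the default LIB_NAME of mkldnn (so the config file is mkldnn-config.cmake and LIB_NAMESPACE expands to MKLDNN::), consumption looks roughly like:

~~~cmake
find_package(mkldnn CONFIG REQUIRED)       # locates mkldnn-config.cmake
add_executable(app main.cpp)
target_link_libraries(app MKLDNN::mkldnn)  # namespace from LIB_NAMESPACE
~~~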
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include <assert.h> +#include <math.h> +#include "mkldnn.h" + +#include "c_types_map.hpp" +#include "type_helpers.hpp" +#include "utils.hpp" + +using namespace mkldnn::impl; +using namespace mkldnn::impl::utils; +using namespace mkldnn::impl::status; +using namespace mkldnn::impl::prop_kind; +using namespace mkldnn::impl::alg_kind; +using namespace mkldnn::impl::types; + +namespace { +status_t binarization_desc_init(binarization_desc_t *binarization_desc, prop_kind_t prop_kind, + alg_kind_t alg_kind, const memory_desc_t *src_desc, const memory_desc_t *dst_desc, + const memory_desc_t *weights_desc) { + bool args_ok = true + && !any_null(binarization_desc, src_desc, dst_desc, weights_desc) + && one_of(prop_kind, forward_training, forward_inference) + && one_of(alg_kind, binarization_depthwise); + if (!args_ok) return invalid_arguments; + + auto bd = binarization_desc_t(); + bd.primitive_kind = primitive_kind::binarization; + bd.prop_kind = prop_kind; + bd.alg_kind = alg_kind; + bd.src_desc = *src_desc; + bd.dst_desc = *dst_desc; + bd.weights_desc = *weights_desc; + + bool consistency = true + && memory_desc_wrapper(bd.src_desc).nelems() + && memory_desc_wrapper(bd.dst_desc).nelems(); + if (!consistency) return invalid_arguments; + + *binarization_desc = bd; + return success; +} +} + +status_t mkldnn_binarization_forward_desc_init(binarization_desc_t *binarization_desc, + prop_kind_t prop_kind, alg_kind_t alg_kind, + const memory_desc_t *src_desc, const memory_desc_t *dst_desc, const memory_desc_t *weights_desc) { + if (!one_of(prop_kind, forward_training, forward_inference)) + return invalid_arguments; + return binarization_desc_init(binarization_desc, prop_kind, alg_kind, src_desc, dst_desc, weights_desc); +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/binarization_pd.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/binarization_pd.hpp new file mode 100644 index 0000000..1450230 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/common/binarization_pd.hpp @@ -0,0 +1,89 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/
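
Editorial aside: the validation above is the whole contract of the new C entry point, so a short usage sketch may help. Everything in the sketch is illustrative (shapes, formats, the helper name) except the mkldnn_* identifiers, which this patch itself introduces:

```cpp
// Sketch, not part of the patch: describing a depthwise binarization op
// through the C API defined above. Dims and formats are assumptions.
#include "mkldnn.h"

mkldnn_status_t describe_binarization(mkldnn_binarization_desc_t *bd) {
    mkldnn_dims_t data_dims = {1, 64, 56, 56};  // assumed NCHW-shaped activations
    mkldnn_dims_t thr_dims = {64};              // one threshold per channel

    mkldnn_memory_desc_t src_md, dst_md, thr_md;
    // Error checks on the init calls omitted for brevity.
    mkldnn_memory_desc_init(&src_md, 4, data_dims, mkldnn_f32, mkldnn_nhwc);
    // mkldnn_bin is the packed 1-bit data type introduced by this patch.
    mkldnn_memory_desc_init(&dst_md, 4, data_dims, mkldnn_bin, mkldnn_nhwc);
    mkldnn_memory_desc_init(&thr_md, 1, thr_dims, mkldnn_f32, mkldnn_x);

    return mkldnn_binarization_forward_desc_init(bd, mkldnn_forward_inference,
            mkldnn_binarization_depthwise, &src_md, &dst_md, &thr_md);
}
```
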
+ +#ifndef BINARIZATION_PD_HPP +#define BINARIZATION_PD_HPP + +#include <assert.h> +#include "mkldnn.h" + +#include "c_types_map.hpp" +#include "primitive_desc.hpp" +#include "memory_pd.hpp" + +namespace mkldnn { +namespace impl { + +struct binarization_fwd_pd_t: public primitive_desc_t { + typedef binarization_fwd_pd_t base_class; + typedef binarization_fwd_pd_t hint_class; + static constexpr auto base_pkind = primitive_kind::binarization; + + binarization_fwd_pd_t(mkldnn::impl::engine_t *engine, + const binarization_desc_t *adesc, const primitive_attr_t *attr, + const binarization_fwd_pd_t *hint_fwd_pd) + : primitive_desc_t(engine, attr, primitive_kind::binarization) + , desc_(*adesc), hint_fwd_pd_(hint_fwd_pd) {} + virtual ~binarization_fwd_pd_t() {} + + const binarization_desc_t *desc() const { return &desc_; } + virtual const op_desc_t *op_desc() const override + { return reinterpret_cast<const op_desc_t *>(this->desc()); } + virtual void init_info() override { init_info_binarization(this, this->info_); } + + virtual const memory_pd_t *input_pd(int index = 0) const override { + switch (index) { + case 0: return src_pd(); + case 1: return weights_pd(index - 1); + default: return nullptr; + } + } + virtual const memory_pd_t *output_pd(int index = 0) const override + { return index == 0 ? dst_pd() : nullptr; } + + virtual int n_inputs() const override { return 2; } + virtual int n_outputs() const override { return 1; } + + virtual status_t query(query_t what, int idx, void *result) const override + { + switch (what) { + case query::binarization_d: + *(const binarization_desc_t**)result = desc(); break; + default: return primitive_desc_t::query(what, idx, result); + } + return status::success; + } + + /* common binarization aux functions */ + + inline int MB() const { return input_pd()->desc()->ndims > 0 ? input_pd()->desc()->dims[0] : 1; } + inline int C() const { return input_pd()->desc()->ndims > 1 ? input_pd()->desc()->dims[1] : 1; } + inline int D() const { return input_pd()->desc()->ndims > 4 ? input_pd()->desc()->dims[2] : 1; } + inline int H() const { return input_pd()->desc()->ndims > 4 ? input_pd()->desc()->dims[3] : + input_pd()->desc()->ndims > 2 ? input_pd()->desc()->dims[2] : 1; } + inline int W() const { return input_pd()->desc()->ndims > 4 ? input_pd()->desc()->dims[4] : + input_pd()->desc()->ndims > 3 ? input_pd()->desc()->dims[3] : 1; } + +protected: + binarization_desc_t desc_; + const binarization_fwd_pd_t *hint_fwd_pd_; +}; + +} +} + +#endif + diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/binary_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/common/binary_convolution.cpp new file mode 100644 index 0000000..76d5531 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/common/binary_convolution.cpp @@ -0,0 +1,120 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/
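
Editorial aside: the query() override in binarization_fwd_pd_t above is what makes the op descriptor recoverable from a built primitive descriptor. A hedged sketch of that round trip through the generic C query call (the helper function is hypothetical; mkldnn_query_binarization_d is the query kind added by this patch):

```cpp
// Sketch, not part of the patch: read the binarization descriptor back out
// of a primitive descriptor handle via the generic query mechanism.
#include "mkldnn.h"

const mkldnn_binarization_desc_t *get_bin_desc(
        const_mkldnn_primitive_desc_t pd) {
    const mkldnn_binarization_desc_t *d = nullptr;
    // The pd's query() override writes a pointer to its stored descriptor
    // into the result slot, exactly as the switch above shows.
    if (mkldnn_primitive_desc_query(pd, mkldnn_query_binarization_d, 0,
                (void *)&d) != mkldnn_success)
        return nullptr;
    return d;
}
```
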
+ +#include <assert.h> +#include "mkldnn.h" + +#include "c_types_map.hpp" +#include "type_helpers.hpp" +#include "utils.hpp" + +using namespace mkldnn::impl; +using namespace mkldnn::impl::utils; +using namespace mkldnn::impl::status; +using namespace mkldnn::impl::prop_kind; +using namespace mkldnn::impl::alg_kind; +using namespace mkldnn::impl::types; + +namespace mkldnn { +namespace impl { +status_t bin_conv_desc_init(binary_convolution_desc_t *bin_conv_desc, + prop_kind_t prop_kind, alg_kind_t alg_kind, + const memory_desc_t *src_desc, const memory_desc_t *weights_desc, + const memory_desc_t *dst_desc, + const dims_t strides, const dims_t dilates, + const dims_t padding_l, const dims_t padding_r, + float pad_value) { + bool args_ok = true + && !any_null(bin_conv_desc, src_desc, weights_desc, dst_desc, strides, + padding_l) + && one_of(alg_kind, binary_convolution_direct) + && one_of(pad_value, -1.f, 0.f, 1.f); + if (!args_ok) return invalid_arguments; + + if (padding_r == nullptr) padding_r = padding_l; + + auto bcd = binary_convolution_desc_t(); + bcd.primitive_kind = primitive_kind::binary_convolution; + bcd.prop_kind = prop_kind; + bcd.alg_kind = alg_kind; + + bcd.src_desc = zero_md(); + bcd.dst_desc = zero_md(); + bcd.weights_desc = zero_md(); + + const bool with_groups = weights_desc->ndims == src_desc->ndims + 1; + + bcd.src_desc = *src_desc; + bcd.dst_desc = *dst_desc; + bcd.weights_desc = *weights_desc; + + int sp_dims = src_desc->ndims - 2; + utils::array_copy(bcd.strides, strides, sp_dims); + utils::array_copy(bcd.padding[0], padding_l, sp_dims); + utils::array_copy(bcd.padding[1], padding_r, sp_dims); + if (dilates) + utils::array_copy(bcd.dilates, dilates, sp_dims); + else + utils::array_set(bcd.dilates, 0, sp_dims); + + bcd.pad_value = pad_value; + bcd.accum_data_type = types::default_accum_data_type(src_desc->data_type, + weights_desc->data_type, dst_desc->data_type, prop_kind); + + bool consistency = true + && memory_desc_wrapper(weights_desc).nelems() + && src_desc->ndims == dst_desc->ndims + && utils::one_of(src_desc->ndims, 3, 4, 5) + && utils::one_of(weights_desc->ndims, src_desc->ndims, src_desc->ndims + 1) + && src_desc->dims[0] == dst_desc->dims[0]; + for (int i = 2; i < src_desc->ndims; ++i) + { + int src = src_desc->dims[i]; + int ker = weights_desc->dims[with_groups + i]; + int dil = bcd.dilates[i - 2]; + int pad_l = padding_l[i - 2]; + int pad_r = padding_r[i - 2]; + int str = strides[i - 2]; + int dst = dst_desc->dims[i]; + int ker_range = 1 + (ker - 1) * (dil + 1); + + if (str < 1) return invalid_arguments; + consistency = consistency + && dil >= 0 + && pad_l >= 0 +// && pad_r + str > 0 // TODO: [dmitrygo] Commented as WA to support dw conv fusing + && (src - ker_range + pad_l + pad_r) / str + 1 == dst; + } + if (!consistency) return invalid_arguments; + + *bin_conv_desc = bcd; + return success; +} +} +} + +status_t mkldnn_dilated_binary_convolution_forward_desc_init( + binary_convolution_desc_t *bin_conv_desc, prop_kind_t prop_kind, + alg_kind_t alg_kind, const memory_desc_t *src_desc, + const memory_desc_t *weights_desc, const memory_desc_t *dst_desc, const dims_t strides, + const dims_t dilates, const dims_t padding_l, + const dims_t padding_r, + const float pad_value) { + if (!one_of(prop_kind, forward_training, forward_inference)) + return invalid_arguments; + return mkldnn::impl::bin_conv_desc_init(bin_conv_desc, prop_kind, alg_kind, src_desc, + weights_desc, dst_desc, strides, dilates, padding_l, padding_r, pad_value); +}
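
Editorial aside: the consistency loop above encodes the usual dilated-convolution shape relation per spatial axis. Written out as a standalone helper (illustrative only, not part of the patch):

```cpp
// Worked form of the shape check in bin_conv_desc_init: for each spatial
// axis the destination extent must satisfy the dilated-conv relation.
int expected_dst_dim(int src, int ker, int dil, int pad_l, int pad_r, int str) {
    // dil == 0 means a dense kernel; ker_range is its effective span.
    const int ker_range = 1 + (ker - 1) * (dil + 1);
    return (src - ker_range + pad_l + pad_r) / str + 1;
}
// Example: expected_dst_dim(56, 3, 0, 1, 1, 1) == 56, i.e. the familiar
// out = (in + 2*pad - kernel)/stride + 1 formula when dilation is zero.
```
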
diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/binary_convolution_pd.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/binary_convolution_pd.hpp new file mode 100644 index 0000000..22fb486 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/common/binary_convolution_pd.hpp @@ -0,0 +1,153 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef BINARY_CONVOLUTION_PD_HPP +#define BINARY_CONVOLUTION_PD_HPP + +#include "mkldnn.h" + +#include "c_types_map.hpp" +#include "primitive_desc.hpp" +#include "memory_pd.hpp" +#include "utils.hpp" + +namespace mkldnn { +namespace impl { + +status_t bin_conv_desc_init(binary_convolution_desc_t *bin_conv_desc, + prop_kind_t prop_kind, alg_kind_t alg_kind, + const memory_desc_t *src_desc, const memory_desc_t *weights_desc, + const memory_desc_t *dst_desc, + const dims_t strides, const dims_t dilates, + const dims_t padding_l, const dims_t padding_r, + float pad_value); + +struct _binary_convolution_fwd_pd_t: public primitive_desc_t { + typedef _binary_convolution_fwd_pd_t base_class; + typedef _binary_convolution_fwd_pd_t hint_class; + typedef binary_convolution_desc_t base_desc_t; + static constexpr auto base_pkind = primitive_kind::binary_convolution; + + _binary_convolution_fwd_pd_t(mkldnn::impl::engine_t *engine, + const base_desc_t *adesc, const primitive_attr_t *attr, + const _binary_convolution_fwd_pd_t *hint_fwd_pd) + : primitive_desc_t(engine, attr, base_pkind), desc_(*adesc) + , hint_fwd_pd_(hint_fwd_pd) {} + virtual ~_binary_convolution_fwd_pd_t() {} + + const base_desc_t *desc() const { return &desc_; } + inline const binary_convolution_desc_t *cdesc() const { return &cdesc_(); } + virtual const op_desc_t *op_desc() const override + { return reinterpret_cast<const op_desc_t *>(this->desc()); } + virtual void init_info() override { init_info_bin_conv(this, this->info_); } + + virtual const memory_pd_t *input_pd(int index = 0) const override { + switch (index) { + case 0: return src_pd(); + case 1: return weights_pd(index - 1); + default: return nullptr; + } + } + virtual const memory_pd_t *output_pd(int index = 0) const override + { return index == 0 ?
dst_pd() : nullptr; } + + virtual int n_inputs() const override { return 2; } + virtual int n_outputs() const override { return 1; } + + virtual status_t query(query_t what, int idx, void *result) const override + { + switch (what) { + case pkind_traits<base_pkind>::query_d: + *(const base_desc_t**)result = desc(); break; + default: return primitive_desc_t::query(what, idx, result); } + return status::success; + } + + /* common conv aux functions */ + + inline int MB() const { return input_pd()->desc()->dims[0]; } + + inline int IC() const { return input_pd()->desc()->dims[1]; } + inline int OC() const { return output_pd()->desc()->dims[1]; } + inline int G() const + { return with_groups() ? cdesc_().weights_desc.dims[0] : 1; } + + inline int ID() const { return (ndims() == 5) ? input_pd()->desc()->dims[2] : 1; } + inline int IH() const { return (ndims() == 3) ? 1 : input_pd()->desc()->dims[ndims()-2]; } + inline int IW() const { return input_pd()->desc()->dims[ndims()-1]; } + inline int OD() const { return (ndims() == 5) ? output_pd()->desc()->dims[2] : 1; } + inline int OH() const { return (ndims() == 3) ? 1 : output_pd()->desc()->dims[ndims()-2]; } + inline int OW() const { return output_pd()->desc()->dims[ndims()-1]; } + inline int KD() const { return (ndims() == 5) + ? cdesc_().weights_desc.dims[2 + with_groups()] : 1; } + inline int KH() const + { return (ndims() == 3) + ? 1 : cdesc_().weights_desc.dims[ndims() - (2 - with_groups())]; } + inline int KW() const + { return cdesc_().weights_desc.dims[ndims() - (1 - with_groups())]; } + + inline int KSD() const { return (ndims() == 5) ? cdesc_().strides[0] : 1; } + inline int KSH() const { return (ndims() == 3) + ? 1 : cdesc_().strides[ndims()-4]; } + inline int KSW() const { return cdesc_().strides[ndims()-3]; } + + inline int KDD() const { return (ndims() == 5) ? cdesc_().dilates[0] : 0; } + inline int KDH() const { return (ndims() == 3) + ? 0 : cdesc_().dilates[ndims()-4]; } + inline int KDW() const { return cdesc_().dilates[ndims()-3]; } + + inline int padFront() const + { return (ndims() == 5) ? cdesc_().padding[0][0] : 0; } + inline int padBack() const + { return (ndims() == 5) ? cdesc_().padding[1][0] : 0; } + inline int padT() const { return (ndims() == 3) + ? 0 : cdesc_().padding[0][ndims()-4]; } + inline int padB() const { return (ndims() == 3) + ?
0 : cdesc_().padding[1][ndims()-4]; } + inline int padL() const { return cdesc_().padding[0][ndims()-3]; } + inline int padR() const { return cdesc_().padding[1][ndims()-3]; } + + inline float pad_value() const { return cdesc_().pad_value; } + + inline bool with_groups() const + { return cdesc_().weights_desc.ndims == cdesc_().src_desc.ndims + 1; } + + inline int ndims() const { return cdesc_().src_desc.ndims; } + + bool has_zero_dim_memory() const { + return false + || memory_desc_wrapper(cdesc_().src_desc).has_zero_dim() + || memory_desc_wrapper(cdesc_().dst_desc).has_zero_dim(); + } + +protected: + base_desc_t desc_; + const _binary_convolution_fwd_pd_t *hint_fwd_pd_; + + inline const binary_convolution_desc_t &cdesc_() const; + + virtual status_t init() = 0; +}; + +using binary_convolution_fwd_pd_t = mkldnn::impl::_binary_convolution_fwd_pd_t; + +inline const binary_convolution_desc_t &_binary_convolution_fwd_pd_t::cdesc_() const { return desc_; } + +} +} + +#endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/c_types_map.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/c_types_map.hpp index 5bc02ae..9392890 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/c_types_map.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/c_types_map.hpp @@ -28,7 +28,7 @@ using dims_t = mkldnn_dims_t; using strides_t = mkldnn_strides_t; /* FIXME: to inference from correspoding types */ -using dim_t = int; +using dim_t = ptrdiff_t; using stride_t = ptrdiff_t; using status_t = mkldnn_status_t; @@ -60,6 +60,7 @@ namespace prop_kind { using alg_kind_t = mkldnn_alg_kind_t; namespace alg_kind { const alg_kind_t undef = mkldnn_alg_kind_undef; + const alg_kind_t convolution_auto = mkldnn_convolution_auto; const alg_kind_t convolution_direct = mkldnn_convolution_direct; const alg_kind_t convolution_winograd = mkldnn_convolution_winograd; const alg_kind_t deconvolution_direct = mkldnn_deconvolution_direct; @@ -75,6 +76,8 @@ namespace alg_kind { const alg_kind_t eltwise_soft_relu = mkldnn_eltwise_soft_relu; const alg_kind_t eltwise_logistic = mkldnn_eltwise_logistic; const alg_kind_t eltwise_clamp = mkldnn_eltwise_clamp; + const alg_kind_t eltwise_exp = mkldnn_eltwise_exp; + const alg_kind_t eltwise_not = mkldnn_eltwise_not; const alg_kind_t depthwise_scale_shift = mkldnn_depthwise_scale_shift; const alg_kind_t depthwise_prelu = mkldnn_depthwise_prelu; const alg_kind_t pooling_max = mkldnn_pooling_max; @@ -89,6 +92,8 @@ namespace alg_kind { const alg_kind_t gru_linear_before_reset = mkldnn_gru_linear_before_reset; const alg_kind_t roi_pooling_max = mkldnn_roi_pooling_max; const alg_kind_t roi_pooling_bilinear = mkldnn_roi_pooling_bilinear; + const alg_kind_t binary_convolution_direct = mkldnn_binary_convolution_direct; + const alg_kind_t binarization_depthwise = mkldnn_binarization_depthwise; } using data_type_t = mkldnn_data_type_t; @@ -99,6 +104,7 @@ namespace data_type { const data_type_t s16 = mkldnn_s16; const data_type_t s8 = mkldnn_s8; const data_type_t u8 = mkldnn_u8; + const data_type_t bin = mkldnn_bin; } using round_mode_t = mkldnn_round_mode_t; @@ -107,6 +113,13 @@ namespace round_mode { const round_mode_t down = mkldnn_round_down; } +using rnn_packed_format_t = mkldnn_rnn_packed_memory_format_t; +namespace rnn_packed_format { + const rnn_packed_format_t undef = mkldnn_packed_format_undef; + const rnn_packed_format_t ldigo_p = mkldnn_ldigo_p; + const rnn_packed_format_t ldgoi_p = mkldnn_ldgoi_p; +} + using memory_format_t = mkldnn_memory_format_t; namespace memory_format 
{ const memory_format_t undef = mkldnn_format_undef; @@ -116,27 +129,33 @@ namespace memory_format { const memory_format_t nc = mkldnn_nc; const memory_format_t ncw = mkldnn_ncw; const memory_format_t nwc = mkldnn_nwc; + const memory_format_t nCw4c = mkldnn_nCw4c; const memory_format_t nCw8c = mkldnn_nCw8c; const memory_format_t nCw16c = mkldnn_nCw16c; const memory_format_t nchw = mkldnn_nchw; const memory_format_t nhwc = mkldnn_nhwc; const memory_format_t chwn = mkldnn_chwn; + const memory_format_t nChw4c = mkldnn_nChw4c; const memory_format_t nChw8c = mkldnn_nChw8c; const memory_format_t nChw16c = mkldnn_nChw16c; const memory_format_t ncdhw = mkldnn_ncdhw; const memory_format_t ndhwc = mkldnn_ndhwc; + const memory_format_t nCdhw4c = mkldnn_nCdhw4c; const memory_format_t nCdhw8c = mkldnn_nCdhw8c; const memory_format_t nCdhw16c = mkldnn_nCdhw16c; const memory_format_t oi = mkldnn_oi; const memory_format_t io = mkldnn_io; const memory_format_t oiw = mkldnn_oiw; const memory_format_t wio = mkldnn_wio; + const memory_format_t Owi4o = mkldnn_Owi4o; + const memory_format_t OIw4i4o = mkldnn_OIw4i4o; const memory_format_t Owi8o = mkldnn_Owi8o; const memory_format_t OIw8i8o = mkldnn_OIw8i8o; const memory_format_t OIw8o8i = mkldnn_OIw8o8i; const memory_format_t OIw16i16o = mkldnn_OIw16i16o; const memory_format_t OIw16o16i = mkldnn_OIw16o16i; const memory_format_t Oiw16o = mkldnn_Oiw16o; + const memory_format_t Oiw4o = mkldnn_Oiw4o; const memory_format_t Owi16o = mkldnn_Owi16o; const memory_format_t OIw8i16o2i = mkldnn_OIw8i16o2i; const memory_format_t IOw16o16i = mkldnn_IOw16o16i; @@ -144,20 +163,25 @@ namespace memory_format { const memory_format_t oihw = mkldnn_oihw; const memory_format_t ihwo = mkldnn_ihwo; const memory_format_t hwio = mkldnn_hwio; + const memory_format_t iohw = mkldnn_iohw; const memory_format_t hwio_s8s8 = mkldnn_hwio_s8s8; const memory_format_t dhwio = mkldnn_dhwio; const memory_format_t oidhw = mkldnn_oidhw; + const memory_format_t OIdhw4i4o = mkldnn_OIdhw4i4o; + const memory_format_t Odhwi4o = mkldnn_Odhwi4o; const memory_format_t OIdhw8i8o = mkldnn_OIdhw8i8o; const memory_format_t OIdhw8o8i = mkldnn_OIdhw8o8i; const memory_format_t Odhwi8o = mkldnn_Odhwi8o; const memory_format_t OIdhw16i16o = mkldnn_OIdhw16i16o; const memory_format_t OIdhw16o16i = mkldnn_OIdhw16o16i; + const memory_format_t Oidhw4o = mkldnn_Oidhw4o; const memory_format_t Oidhw16o = mkldnn_Oidhw16o; const memory_format_t Odhwi16o = mkldnn_Odhwi16o; const memory_format_t oIhw8i = mkldnn_oIhw8i; const memory_format_t oIhw16i = mkldnn_oIhw16i; const memory_format_t oIdhw8i = mkldnn_oIdhw8i; const memory_format_t oIdhw16i = mkldnn_oIdhw16i; + const memory_format_t OIhw4i4o = mkldnn_OIhw4i4o; const memory_format_t OIhw8i8o = mkldnn_OIhw8i8o; const memory_format_t OIhw16i16o = mkldnn_OIhw16i16o; const memory_format_t OIhw4i16o4i = mkldnn_OIhw4i16o4i; @@ -168,46 +192,65 @@ namespace memory_format { const memory_format_t OIhw8o8i = mkldnn_OIhw8o8i; const memory_format_t OIhw16o16i = mkldnn_OIhw16o16i; const memory_format_t IOhw16o16i = mkldnn_IOhw16o16i; + const memory_format_t Oihw4o = mkldnn_Oihw4o; const memory_format_t Oihw16o = mkldnn_Oihw16o; const memory_format_t Ohwi8o = mkldnn_Ohwi8o; + const memory_format_t Ohwi4o = mkldnn_Ohwi4o; const memory_format_t Ohwi16o = mkldnn_Ohwi16o; const memory_format_t OhIw8o4i = mkldnn_OhIw8o4i; + const memory_format_t OhIw8o32i = mkldnn_OhIw8o32i; + const memory_format_t OhIw16o32i = mkldnn_OhIw16o32i; const memory_format_t OhIw8o4i_s8s8 = mkldnn_OhIw8o4i_s8s8; const 
memory_format_t goiw = mkldnn_goiw; + const memory_format_t gOwi4o = mkldnn_gOwi4o; + const memory_format_t gOIw4i4o = mkldnn_gOIw4i4o; const memory_format_t gOwi8o = mkldnn_gOwi8o; const memory_format_t gOIw8i8o = mkldnn_gOIw8i8o; const memory_format_t gOIw8o8i = mkldnn_gOIw8o8i; const memory_format_t gOIw16i16o = mkldnn_gOIw16i16o; const memory_format_t gOIw16o16i = mkldnn_gOIw16o16i; const memory_format_t gOiw16o = mkldnn_gOiw16o; + const memory_format_t gOiw4o = mkldnn_gOiw4o; const memory_format_t gOwi16o = mkldnn_gOwi16o; const memory_format_t gOIw8i16o2i = mkldnn_gOIw8i16o2i; const memory_format_t gIOw16o16i = mkldnn_gIOw16o16i; const memory_format_t gOIw8o16i2o = mkldnn_gOIw8o16i2o; const memory_format_t goihw = mkldnn_goihw; const memory_format_t hwigo = mkldnn_hwigo; + const memory_format_t giohw = mkldnn_giohw; const memory_format_t hwigo_s8s8 = mkldnn_hwigo_s8s8; + const memory_format_t gOIhw4i4o = mkldnn_gOIhw4i4o; const memory_format_t gOIhw8i8o = mkldnn_gOIhw8i8o; const memory_format_t gOIhw16i16o = mkldnn_gOIhw16i16o; const memory_format_t gOIhw4i16o4i = mkldnn_gOIhw4i16o4i; const memory_format_t gOIhw4i16o4i_s8s8 = mkldnn_gOIhw4i16o4i_s8s8; + const memory_format_t gOIhw2i8o4i = mkldnn_gOIhw2i8o4i; + const memory_format_t gOIhw2i8o4i_s8s8 = mkldnn_gOIhw2i8o4i_s8s8; const memory_format_t gOIhw8i16o2i = mkldnn_gOIhw8i16o2i; const memory_format_t gOIdhw8i16o2i = mkldnn_gOIdhw8i16o2i; const memory_format_t gOIhw8o16i2o = mkldnn_gOIhw8o16i2o; + const memory_format_t gOIhw4o4i = mkldnn_gOIhw4o4i; + const memory_format_t gOIhw4o4i_s8s8 = mkldnn_gOIhw4o4i_s8s8; const memory_format_t gOIhw8o8i = mkldnn_gOIhw8o8i; const memory_format_t gOIhw16o16i = mkldnn_gOIhw16o16i; const memory_format_t gIOhw16o16i = mkldnn_gIOhw16o16i; + const memory_format_t gOihw4o = mkldnn_gOihw4o; const memory_format_t gOihw16o = mkldnn_gOihw16o; const memory_format_t gOhwi8o = mkldnn_gOhwi8o; + const memory_format_t gOhwi4o = mkldnn_gOhwi4o; const memory_format_t gOhwi16o = mkldnn_gOhwi16o; const memory_format_t Goihw8g = mkldnn_Goihw8g; const memory_format_t Goihw16g = mkldnn_Goihw16g; + const memory_format_t Goihw16g_s8s8 = mkldnn_Goihw16g_s8s8; const memory_format_t goidhw = mkldnn_goidhw; + const memory_format_t gOIdhw4i4o = mkldnn_gOIdhw4i4o; + const memory_format_t gOdhwi4o = mkldnn_gOdhwi4o; const memory_format_t gOIdhw8i8o = mkldnn_gOIdhw8i8o; const memory_format_t gOIdhw8o8i = mkldnn_gOIdhw8o8i; const memory_format_t gOdhwi8o = mkldnn_gOdhwi8o; const memory_format_t gOIdhw16i16o = mkldnn_gOIdhw16i16o; const memory_format_t gOIdhw16o16i = mkldnn_gOIdhw16o16i; + const memory_format_t gOidhw4o = mkldnn_gOidhw4o; const memory_format_t gOidhw16o = mkldnn_gOidhw16o; const memory_format_t gOdhwi16o = mkldnn_gOdhwi16o; const memory_format_t gOhIw8o4i = mkldnn_gOhIw8o4i; @@ -216,11 +259,10 @@ namespace memory_format { const memory_format_t tnc = mkldnn_tnc; const memory_format_t ldsnc = mkldnn_ldsnc; const memory_format_t ldigo = mkldnn_ldigo; - const memory_format_t ldigo_p = mkldnn_ldigo_p; const memory_format_t ldgoi = mkldnn_ldgoi; - const memory_format_t ldgoi_p = mkldnn_ldgoi_p; const memory_format_t ldgo = mkldnn_ldgo; const memory_format_t wino_fmt = mkldnn_wino_fmt; + const memory_format_t rnn_packed = mkldnn_rnn_packed; } using padding_kind_t = mkldnn_padding_kind_t; @@ -253,9 +295,10 @@ namespace primitive_kind { const primitive_kind_t lrn = mkldnn_lrn; const primitive_kind_t batch_normalization = mkldnn_batch_normalization; const primitive_kind_t inner_product = mkldnn_inner_product; - const 
primitive_kind_t convolution_relu = mkldnn_convolution_relu; const primitive_kind_t rnn = mkldnn_rnn; const primitive_kind_t roi_pooling = mkldnn_roi_pooling; + const primitive_kind_t binary_convolution = mkldnn_binary_convolution; + const primitive_kind_t binarization = mkldnn_binarization; } using query_t = mkldnn_query_t; @@ -286,9 +329,10 @@ namespace query { const query_t lrn_d = mkldnn_query_lrn_d; const query_t batch_normalization_d = mkldnn_query_batch_normalization_d; const query_t inner_product_d = mkldnn_query_inner_product_d; - const query_t convolution_relu_d = mkldnn_query_convolution_relu_d; const query_t rnn_d = mkldnn_query_rnn_d; const query_t roi_pooling_d = mkldnn_query_roi_pooling_d; + const query_t binary_convolution_d = mkldnn_query_binary_convolution_d; + const query_t binarization_d = mkldnn_query_binarization_d; const query_t some_pd = mkldnn_query_some_pd; const query_t input_pd = mkldnn_query_input_pd; @@ -304,6 +348,7 @@ namespace query { } using blocking_desc_t = mkldnn_blocking_desc_t; +using rnn_packed_data_t = mkldnn_rnn_packed_desc_t; using wino_data_t = mkldnn_wino_desc_t; using memory_desc_t = mkldnn_memory_desc_t; using convolution_desc_t = mkldnn_convolution_desc_t; @@ -315,9 +360,10 @@ using softmax_desc_t = mkldnn_softmax_desc_t; using lrn_desc_t = mkldnn_lrn_desc_t; using batch_normalization_desc_t = mkldnn_batch_normalization_desc_t; using inner_product_desc_t = mkldnn_inner_product_desc_t; -using convolution_relu_desc_t = mkldnn_convolution_relu_desc_t; using roi_pooling_desc_t = mkldnn_roi_pooling_desc_t; using depthwise_desc_t = mkldnn_depthwise_desc_t; +using binary_convolution_desc_t = mkldnn_binary_convolution_desc_t; +using binarization_desc_t = mkldnn_binarization_desc_t; using rnn_direction_t = mkldnn_rnn_direction_t; using rnn_cell_desc_t = mkldnn_rnn_cell_desc_t; @@ -340,10 +386,11 @@ struct op_desc_t { lrn_desc_t lrn; batch_normalization_desc_t batch_normalization; inner_product_desc_t inner_product; - convolution_relu_desc_t convolution_relu; rnn_desc_t rnn; roi_pooling_desc_t roi_pooling; depthwise_desc_t depthwise; + binary_convolution_desc_t binary_convolution; + binarization_desc_t binarization; }; op_desc_t(const primitive_kind_t &_): kind(_) {} @@ -365,9 +412,10 @@ struct op_desc_t { DECL_CTOR_AND_CONVERTERS(lrn_desc_t, lrn); DECL_CTOR_AND_CONVERTERS(batch_normalization_desc_t, batch_normalization); DECL_CTOR_AND_CONVERTERS(inner_product_desc_t, inner_product); - DECL_CTOR_AND_CONVERTERS(convolution_relu_desc_t, convolution_relu); DECL_CTOR_AND_CONVERTERS(rnn_desc_t, rnn); DECL_CTOR_AND_CONVERTERS(roi_pooling_desc_t, roi_pooling); + DECL_CTOR_AND_CONVERTERS(binary_convolution_desc_t, binary_convolution); + DECL_CTOR_AND_CONVERTERS(binarization_desc_t, binarization); # undef DECL_CTOR_AND_CONVERTERS }; diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/common/convolution.cpp index 8340220..12b9569 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/convolution.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/convolution.cpp @@ -40,7 +40,7 @@ status_t conv_desc_init(convolution_desc_t *conv_desc, bool args_ok = true && !any_null(conv_desc, src_desc, weights_desc, dst_desc, strides, padding_l) - && one_of(alg_kind, convolution_direct, convolution_winograd) + && one_of(alg_kind, convolution_auto, convolution_direct, convolution_winograd) && one_of(padding_kind, padding_kind::padding_zero); if (!args_ok) return invalid_arguments; diff --git 
a/inference-engine/thirdparty/mkl-dnn/src/common/convolution_pd.cpp b/inference-engine/thirdparty/mkl-dnn/src/common/convolution_pd.cpp new file mode 100644 index 0000000..e9b5965 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/common/convolution_pd.cpp @@ -0,0 +1,56 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "utils.hpp" + +#include "convolution_pd.hpp" + +namespace mkldnn { +namespace impl { + +using namespace prop_kind; + +memory_desc_t *conv_prop_agnostic_src_d(convolution_desc_t *desc) { + return desc->prop_kind == backward_data + ? &desc->diff_src_desc : &desc->src_desc; +} + +memory_desc_t *conv_prop_agnostic_wei_d(convolution_desc_t *desc) { + return desc->prop_kind == backward_weights + ? &desc->diff_weights_desc : &desc->weights_desc; +} + +memory_desc_t *conv_prop_agnostic_bia_d(convolution_desc_t *desc) { + return desc->prop_kind == backward_weights + ? &desc->diff_bias_desc : &desc->bias_desc; +} + +memory_desc_t *conv_prop_agnostic_dst_d(convolution_desc_t *desc) { + return utils::one_of(desc->prop_kind, forward_inference, forward_training) + ? &desc->dst_desc : &desc->diff_dst_desc; } + +const memory_desc_t *conv_prop_agnostic_src_d(const convolution_desc_t *desc) +{ return conv_prop_agnostic_src_d(const_cast<convolution_desc_t *>(desc)); } +const memory_desc_t *conv_prop_agnostic_wei_d(const convolution_desc_t *desc) +{ return conv_prop_agnostic_wei_d(const_cast<convolution_desc_t *>(desc)); } +const memory_desc_t *conv_prop_agnostic_bia_d(const convolution_desc_t *desc) +{ return conv_prop_agnostic_bia_d(const_cast<convolution_desc_t *>(desc)); } +const memory_desc_t *conv_prop_agnostic_dst_d(const convolution_desc_t *desc) +{ return conv_prop_agnostic_dst_d(const_cast<convolution_desc_t *>(desc)); } + +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/convolution_pd.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/convolution_pd.hpp index 90b6629..99e6e32 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/convolution_pd.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/convolution_pd.hpp @@ -35,25 +35,28 @@ status_t conv_desc_init(convolution_desc_t *conv_desc, const dims_t padding_l, const dims_t padding_r, padding_kind_t padding_kind); -template <bool with_relu> -struct _convolution_fwd_pd_t: public primitive_desc_t { - typedef _convolution_fwd_pd_t<with_relu> base_class; - typedef _convolution_fwd_pd_t<false> hint_class; - typedef typename utils::conditional<with_relu, convolution_relu_desc_t, - convolution_desc_t>::type base_desc_t; - static constexpr auto base_pkind = - utils::conditional_v<with_relu, primitive_kind_t, - primitive_kind::convolution_relu, primitive_kind::convolution>::value; - - _convolution_fwd_pd_t(mkldnn::impl::engine_t *engine, - const base_desc_t *adesc, const primitive_attr_t *attr, - const _convolution_fwd_pd_t *hint_fwd_pd) +memory_desc_t *conv_prop_agnostic_src_d(convolution_desc_t *desc); +memory_desc_t *conv_prop_agnostic_wei_d(convolution_desc_t *desc); +memory_desc_t *conv_prop_agnostic_bia_d(convolution_desc_t *desc); +memory_desc_t *conv_prop_agnostic_dst_d(convolution_desc_t *desc); +const memory_desc_t *conv_prop_agnostic_src_d(const convolution_desc_t *desc); +const memory_desc_t *conv_prop_agnostic_wei_d(const convolution_desc_t *desc); +const memory_desc_t *conv_prop_agnostic_bia_d(const convolution_desc_t *desc); +const memory_desc_t *conv_prop_agnostic_dst_d(const convolution_desc_t *desc); + +struct convolution_fwd_pd_t: public primitive_desc_t { + typedef convolution_fwd_pd_t base_class; + typedef convolution_fwd_pd_t hint_class; + static constexpr auto base_pkind = primitive_kind::convolution; + + convolution_fwd_pd_t(mkldnn::impl::engine_t *engine, + const convolution_desc_t *adesc, const primitive_attr_t *attr, + const convolution_fwd_pd_t *hint_fwd_pd) : primitive_desc_t(engine, attr, base_pkind), desc_(*adesc) , hint_fwd_pd_(hint_fwd_pd) {} - virtual ~_convolution_fwd_pd_t() {} - - const base_desc_t *desc() const { return &desc_; } - inline const convolution_desc_t *cdesc() const { return &cdesc_(); } + virtual ~convolution_fwd_pd_t() {} + + const convolution_desc_t *desc() const { return &desc_; } virtual const op_desc_t *op_desc() const override { return reinterpret_cast<const op_desc_t *>(this->desc()); } virtual void init_info() override { init_info_conv(this, this->info_); } @@ -75,7 +78,7 @@ struct _convolution_fwd_pd_t: public primitive_desc_t { { switch (what) { case pkind_traits<base_pkind>::query_d: - *(const base_desc_t**)result = desc(); break; + *(const convolution_desc_t**)result = desc(); break; default: return primitive_desc_t::query(what, idx, result); } return status::success; @@ -88,7 +91,7 @@ struct _convolution_fwd_pd_t: public primitive_desc_t { inline int IC() const { return input_pd()->desc()->dims[1]; } inline int OC() const { return output_pd()->desc()->dims[1]; } inline int G() const - { return with_groups() ?
cdesc_().weights_desc.dims[0] : 1; } + { return with_groups() ? desc_.weights_desc.dims[0] : 1; } inline int ID() const { return (ndims() == 5) ? input_pd()->desc()->dims[2] : 1; } inline int IH() const { return (ndims() == 3) ? 1 : input_pd()->desc()->dims[ndims()-2]; } @@ -97,73 +100,61 @@ struct _convolution_fwd_pd_t: public primitive_desc_t { inline int OH() const { return (ndims() == 3) ? 1 : output_pd()->desc()->dims[ndims()-2]; } inline int OW() const { return output_pd()->desc()->dims[ndims()-1]; } inline int KD() const { return (ndims() == 5) - ? cdesc_().weights_desc.dims[2 + with_groups()] : 1; } + ? desc_.weights_desc.dims[2 + with_groups()] : 1; } inline int KH() const { return (ndims() == 3) - ? 1 : cdesc_().weights_desc.dims[ndims() - (2 - with_groups())]; } + ? 1 : desc_.weights_desc.dims[ndims() - (2 - with_groups())]; } inline int KW() const - { return cdesc_().weights_desc.dims[ndims() - (1 - with_groups())]; } + { return desc_.weights_desc.dims[ndims() - (1 - with_groups())]; } - inline int KSD() const { return (ndims() == 5) ? cdesc_().strides[0] : 1; } + inline int KSD() const { return (ndims() == 5) ? desc_.strides[0] : 1; } inline int KSH() const { return (ndims() == 3) - ? 1 : cdesc_().strides[ndims()-4]; } - inline int KSW() const { return cdesc_().strides[ndims()-3]; } + ? 1 : desc_.strides[ndims()-4]; } + inline int KSW() const { return desc_.strides[ndims()-3]; } - inline int KDD() const { return (ndims() == 5) ? cdesc_().dilates[0] : 0; } + inline int KDD() const { return (ndims() == 5) ? desc_.dilates[0] : 0; } inline int KDH() const { return (ndims() == 3) - ? 0 : cdesc_().dilates[ndims()-4]; } - inline int KDW() const { return cdesc_().dilates[ndims()-3]; } + ? 0 : desc_.dilates[ndims()-4]; } + inline int KDW() const { return desc_.dilates[ndims()-3]; } inline int padFront() const - { return (ndims() == 5) ? cdesc_().padding[0][0] : 0; } + { return (ndims() == 5) ? desc_.padding[0][0] : 0; } inline int padBack() const - { return (ndims() == 5) ? cdesc_().padding[1][0] : 0; } + { return (ndims() == 5) ? desc_.padding[1][0] : 0; } inline int padT() const { return (ndims() == 3) - ? 0 : cdesc_().padding[0][ndims()-4]; } + ? 0 : desc_.padding[0][ndims()-4]; } inline int padB() const { return (ndims() == 3) - ? 0 : cdesc_().padding[1][ndims()-4]; } - inline int padL() const { return cdesc_().padding[0][ndims()-3]; } - inline int padR() const { return cdesc_().padding[1][ndims()-3]; } - - inline float negative_slope() const; + ? 
0 : desc_.padding[1][ndims()-4]; } + inline int padL() const { return desc_.padding[0][ndims()-3]; } + inline int padR() const { return desc_.padding[1][ndims()-3]; } inline bool with_bias() const - { return !memory_desc_wrapper(cdesc_().bias_desc).is_zero(); } + { return !memory_desc_wrapper(desc_.bias_desc).is_zero(); } inline bool with_groups() const - { return cdesc_().weights_desc.ndims == cdesc_().src_desc.ndims + 1; } + { return desc_.weights_desc.ndims == desc_.src_desc.ndims + 1; } - inline int ndims() const { return cdesc_().src_desc.ndims; } + inline int ndims() const { return desc_.src_desc.ndims; } + + virtual status_t set_alg_kind(alg_kind_t alg) { + if (alg == alg_kind::undef) return status::invalid_arguments; + desc_.alg_kind = alg; + return status::success; + } bool has_zero_dim_memory() const { return false - || memory_desc_wrapper(cdesc_().src_desc).has_zero_dim() - || memory_desc_wrapper(cdesc_().dst_desc).has_zero_dim(); + || memory_desc_wrapper(desc_.src_desc).has_zero_dim() + || memory_desc_wrapper(desc_.dst_desc).has_zero_dim(); } -protected: - base_desc_t desc_; - const _convolution_fwd_pd_t *hint_fwd_pd_; - inline const convolution_desc_t &cdesc_() const; +protected: + convolution_desc_t desc_; + const convolution_fwd_pd_t *hint_fwd_pd_; virtual status_t init() = 0; }; -using convolution_fwd_pd_t = mkldnn::impl::_convolution_fwd_pd_t<false>; -using convolution_relu_fwd_pd_t = mkldnn::impl::_convolution_fwd_pd_t<true>; - -template<> inline float convolution_fwd_pd_t::negative_slope() const -{ return 0.; } -template<> inline float convolution_relu_fwd_pd_t::negative_slope() const -{ return desc()->negative_slope; } - -template <bool with_relu> inline const -convolution_desc_t &_convolution_fwd_pd_t<with_relu>::cdesc_() const -{ return desc_; } -template<> -inline const convolution_desc_t &convolution_relu_fwd_pd_t::cdesc_() const -{ return desc_.convolution_desc; } - struct convolution_bwd_data_pd_t: public primitive_desc_t { typedef convolution_bwd_data_pd_t base_class; typedef convolution_fwd_pd_t hint_class; @@ -178,7 +169,6 @@ struct convolution_bwd_data_pd_t: public primitive_desc_t { virtual ~convolution_bwd_data_pd_t() {} const convolution_desc_t *desc() const { return &desc_; } - const convolution_desc_t *cdesc() const { return desc(); } virtual const op_desc_t *op_desc() const override { return reinterpret_cast<const op_desc_t *>(this->desc()); } virtual void init_info() override { init_info_conv(this, this->info_); } @@ -257,6 +247,12 @@ struct convolution_bwd_data_pd_t: public primitive_desc_t { inline int ndims() const { return desc_.diff_src_desc.ndims; } virtual bool support_bias() const { return false; } + virtual status_t set_alg_kind(alg_kind_t alg) { + if (alg == alg_kind::undef) return status::invalid_arguments; + desc_.alg_kind = alg; + return status::success; + }
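
Editorial aside: set_alg_kind() above is the counterpart of the new convolution_auto algorithm kind accepted by conv_desc_init earlier in this patch; the implementation that wins the dispatch records the concrete algorithm (direct or winograd) back into its descriptor. From the user's side this looks roughly like the following sketch (hypothetical helper; descriptors are assumed to be prepared elsewhere, and the mkldnn_* names come from the public C API):

```cpp
// Sketch, not part of the patch: request automatic algorithm selection.
#include "mkldnn.h"
#include <cstddef>  // for NULL

mkldnn_status_t make_auto_conv(const mkldnn_memory_desc_t *src_md,
        const mkldnn_memory_desc_t *wei_md, const mkldnn_memory_desc_t *dst_md,
        const int strides[2], const int pad_l[2], const int pad_r[2],
        mkldnn_convolution_desc_t *cd) {
    // Passing mkldnn_convolution_auto defers the direct-vs-winograd choice
    // to the selected implementation, which pins it down via set_alg_kind().
    return mkldnn_convolution_forward_desc_init(cd, mkldnn_forward_inference,
            mkldnn_convolution_auto, src_md, wei_md, NULL /* no bias */,
            dst_md, strides, pad_l, pad_r, mkldnn_padding_zero);
}
```
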
+ bool has_zero_dim_memory() const { return false || memory_desc_wrapper(desc_.src_desc).has_zero_dim() diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/convolution_relu.cpp b/inference-engine/thirdparty/mkl-dnn/src/common/convolution_relu.cpp deleted file mode 100644 index 1df198f..0000000 --- a/inference-engine/thirdparty/mkl-dnn/src/common/convolution_relu.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/******************************************************************************* -* Copyright 2016-2018 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include <assert.h> -#include "mkldnn.h" - -#include "c_types_map.hpp" -#include "type_helpers.hpp" -#include "utils.hpp" - -using namespace mkldnn::impl; -using namespace mkldnn::impl::utils; -using namespace mkldnn::impl::status; -using namespace mkldnn::impl::prop_kind; -using namespace mkldnn::impl::alg_kind; - -status_t mkldnn_convolution_relu_desc_init( - convolution_relu_desc_t *conv_relu_desc, - const convolution_desc_t *conv_desc, float negative_slope) { - bool args_ok = !any_null(conv_relu_desc, conv_desc) - && utils::one_of(conv_desc->prop_kind, prop_kind::forward_training, - prop_kind::forward_inference); - if (!args_ok) return invalid_arguments; - conv_relu_desc->primitive_kind = primitive_kind::convolution_relu; - conv_relu_desc->convolution_desc = *conv_desc; - conv_relu_desc->negative_slope = negative_slope; - return success; -} - -// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/deconvolution_pd.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/deconvolution_pd.hpp index ba699c5..a98a749 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/deconvolution_pd.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/deconvolution_pd.hpp @@ -39,7 +39,6 @@ struct deconvolution_fwd_pd_t : public primitive_desc_t { virtual ~deconvolution_fwd_pd_t() {} const deconvolution_desc_t *desc() const { return &desc_; } - inline const deconvolution_desc_t *cdesc() const { return &desc_; } virtual const op_desc_t *op_desc() const override { return reinterpret_cast<const op_desc_t *>(this->desc()); } @@ -118,6 +117,12 @@ struct deconvolution_fwd_pd_t : public primitive_desc_t { } inline int ndims() const { return desc_.src_desc.ndims; } + bool has_zero_dim_memory() const { + return false + || memory_desc_wrapper(desc_.src_desc).has_zero_dim() + || memory_desc_wrapper(desc_.dst_desc).has_zero_dim(); + } + protected: deconvolution_desc_t desc_; const deconvolution_fwd_pd_t *hint_fwd_pd_; @@ -138,7 +143,6 @@ struct deconvolution_bwd_data_pd_t : public primitive_desc_t { virtual ~deconvolution_bwd_data_pd_t() {} const deconvolution_desc_t *desc() const { return &desc_; } - const deconvolution_desc_t *cdesc() const { return desc(); } virtual const op_desc_t *op_desc() const override { return reinterpret_cast<const op_desc_t *>(this->desc()); } @@ -214,7 +218,7 @@ struct deconvolution_bwd_data_pd_t : public primitive_desc_t { inline bool with_groups() const { return desc_.weights_desc.ndims == desc_.diff_src_desc.ndims + 1; } - inline int ndims() const { return desc_.src_desc.ndims; } + inline int ndims() const { return desc_.diff_src_desc.ndims; } protected: deconvolution_desc_t desc_; @@ -236,7 +240,6 @@ struct deconvolution_bwd_weights_pd_t : public primitive_desc_t { virtual ~deconvolution_bwd_weights_pd_t() {} const deconvolution_desc_t *desc() const { return &desc_; } - const deconvolution_desc_t *cdesc() const { return desc(); } virtual const op_desc_t *op_desc() const override { return reinterpret_cast<const op_desc_t *>(this->desc()); } diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/depthwise.cpp b/inference-engine/thirdparty/mkl-dnn/src/common/depthwise.cpp index 1a8220e..d206c36 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/depthwise.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/depthwise.cpp @@ -39,7 +39,7 @@ status_t depthwise_desc_init(depthwise_desc_t *depthwise_desc, prop_kind_t prop_ && one_of(alg_kind, depthwise_scale_shift, depthwise_prelu); if (!args_ok) return invalid_arguments; - depthwise_desc_t dd = {}; + auto dd = depthwise_desc_t(); dd.primitive_kind = primitive_kind::depthwise; dd.prop_kind = prop_kind; dd.alg_kind = alg_kind; @@ -62,7 +62,7 @@ status_t mkldnn_depthwise_forward_desc_init(depthwise_desc_t *depthwise_desc, prop_kind_t prop_kind, alg_kind_t alg_kind, - const memory_desc_t *src_desc, const memory_desc_t *dst_desc, const memory_desc_t *weights_desc, + const memory_desc_t *src_desc, const memory_desc_t *dst_desc, const memory_desc_t *weights_desc, const memory_desc_t *bias_desc) { if (!one_of(prop_kind, forward_training, forward_inference)) return invalid_arguments; diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/eltwise.cpp b/inference-engine/thirdparty/mkl-dnn/src/common/eltwise.cpp index 815d2d7..5d9a6dd 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/eltwise.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/eltwise.cpp @@ -39,7 +39,7 @@ status_t eltwise_desc_init(eltwise_desc_t *eltwise_desc, prop_kind_t prop_kind, && one_of(alg_kind, eltwise_relu, eltwise_tanh, eltwise_elu, eltwise_square, eltwise_abs, eltwise_sqrt, eltwise_linear, eltwise_bounded_relu, eltwise_soft_relu, eltwise_logistic, - eltwise_clamp) + eltwise_clamp, eltwise_exp, eltwise_not) && IMPLICATION(prop_kind == backward_data, diff_data_desc != nullptr); if (!args_ok) return invalid_arguments; @@ -54,7 +54,6 @@ status_t eltwise_desc_init(eltwise_desc_t *eltwise_desc, prop_kind_t prop_kind, ed.alpha = alpha; ed.beta = beta; - ed.negative_slope = ed.alpha; bool consistency = true && IMPLICATION(ed.prop_kind == backward_data, @@ -83,19 +82,4 @@ status_t mkldnn_eltwise_backward_desc_init(eltwise_desc_t *eltwise_desc, diff_data_desc, alpha, beta); } -status_t mkldnn_relu_forward_desc_init(eltwise_desc_t *relu_desc, - prop_kind_t prop_kind, const memory_desc_t *data_desc, - float negative_slope) { - return mkldnn_eltwise_forward_desc_init(relu_desc, prop_kind, eltwise_relu, - data_desc, negative_slope, 0.); -} - -status_t mkldnn_relu_backward_desc_init(eltwise_desc_t *relu_desc, - const memory_desc_t *diff_data_desc, const memory_desc_t *data_desc, - float negative_slope) { - return mkldnn_eltwise_backward_desc_init(relu_desc, eltwise_relu, - diff_data_desc, data_desc, negative_slope, 0.); -} - - // vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
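
Editorial aside: the deleted mkldnn_relu_forward_desc_init()/mkldnn_relu_backward_desc_init() shims above were one-line wrappers over the generic eltwise initializers, so existing callers can be ported mechanically, e.g.:

```cpp
// Migration sketch (editorial, not part of the patch): replicate the removed
// relu shim with the generic eltwise descriptor initializer.
#include "mkldnn.h"

mkldnn_status_t relu_fwd_desc(mkldnn_eltwise_desc_t *ed,
        mkldnn_prop_kind_t prop_kind, const mkldnn_memory_desc_t *data_md,
        float negative_slope) {
    // alpha carries the former negative_slope argument; beta is unused
    // for eltwise_relu, exactly as in the deleted wrapper.
    return mkldnn_eltwise_forward_desc_init(ed, prop_kind, mkldnn_eltwise_relu,
            data_md, negative_slope, 0.f);
}
```
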
diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/eltwise_pd.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/eltwise_pd.hpp index bf457a9..16120e5 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/eltwise_pd.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/eltwise_pd.hpp @@ -72,10 +72,8 @@ struct eltwise_fwd_pd_t: public primitive_desc_t { inline int W() const { return input_pd()->desc()->ndims == 4 ? input_pd()->desc()->dims[3] : input_pd()->desc()->dims[4]; } - inline bool is_zero_preserved() const { - return !utils::one_of(desc_.alg_kind, alg_kind::eltwise_linear, - alg_kind::eltwise_soft_relu, alg_kind::eltwise_logistic, alg_kind::eltwise_clamp); - } + inline bool is_zero_preserved() const + { return math::eltwise_fwd_preserves_zero(desc_.alg_kind); } bool has_zero_dim_memory() const { return memory_desc_wrapper(desc_.data_desc).has_zero_dim(); } diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/format_traits.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/format_traits.hpp index 0a13a33..7afe129 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/format_traits.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/format_traits.hpp @@ -35,12 +35,17 @@ enum class data_kind_t { enum class block_format_t { _, + _4c, _4i, _4o, _8c, _8g, _8i, _8o, - _8i8o, _8o8i, _8o4i, _8o4i_s8s8, - _16c, _16g, _16i, _16o, + _4i4o, _4o4i, _4o4i_s8s8, + _8i8o, _8o8i, + _8o4i, _8o4i_s8s8, + _8o32i, _16o32i, + _16c, _16g, _16g_s8s8, _16i, _16o, _16i16o, _16o16i, _8i16o2i, _8o16i2o, _4i16o4i, _4i16o4i_s8s8, + _2i8o4i, _2i8o4i_s8s8 }; template <block_format_t f> struct block_format_traits { static constexpr int levels = f == bf::_ ? 0 : utils::one_of(f, bf::_8i16o2i, bf::_8o16i2o, - bf::_4i16o4i, bf::_4i16o4i_s8s8) ? 2 : 1; + bf::_4i16o4i, bf::_4i16o4i_s8s8, + bf::_2i8o4i, bf::_2i8o4i_s8s8) ? 2 : 1; static constexpr int blk_ndims = f == bf::_ ? 0 - : utils::one_of(f, bf::_8c, bf::_8g, bf::_8i, bf::_8o, bf::_16c, - bf::_16g, bf::_16i, bf::_16o) ? 1 : 2; + : utils::one_of(f, bf::_4c, bf::_4i, bf::_4o, bf::_8c, bf::_8g, bf::_8i, bf::_8o, bf::_16c, + bf::_16g, bf::_16g_s8s8, bf::_16i, bf::_16o) ? 1 : 2; static constexpr int blk_size = f == bf::_ ? 1 - : utils::one_of(f, bf::_8c, bf::_8g, bf::_8i, bf::_8o, bf::_8i8o, - bf::_8o8i, bf::_8o4i, bf::_8o4i_s8s8) ? 8 : 16; + : (utils::one_of(f, bf::_4c, bf::_4i, bf::_4o, bf::_4i4o, bf::_4o4i, bf::_4o4i_s8s8) ? 4 + : (utils::one_of(f, bf::_8c, bf::_8g, bf::_8i, bf::_8o, + bf::_8i8o, bf::_8o8i, + bf::_8o4i, bf::_8o4i_s8s8, + bf::_2i8o4i, bf::_2i8o4i_s8s8, + bf::_8o32i) ? 8 : 16)); }; template <memory_format_t fmt> struct format_traits { @@ -64,7 +74,7 @@ template <memory_format_t fmt> struct format_traits { // block_format_t blk_fmt; -- the format of blocks (e.g.
8c or 4i16o4i) // int ndims; -- # of dimensions // int ndims_sp; -- # of spatial dimensions - // int blk_size; -- block size (1, 8, or 16) + // int blk_size; -- block size (1, 4, 8, or 16) }; #define DECL_TRAITS(_fmt, _data_kind, _blk_fmt, _ndims, _ndims_sp) \ @@ -87,6 +97,7 @@ DECL_TRAITS(nc, data, _, 2, 0); /* data: 3D */ DECL_TRAITS(ncw, data, _, 3, 1); DECL_TRAITS(nwc, data, _, 3, 1); +DECL_TRAITS(nCw4c, data, _4c, 3, 1); DECL_TRAITS(nCw8c, data, _8c, 3, 1); DECL_TRAITS(nCw16c, data, _16c, 3, 1); @@ -94,12 +105,14 @@ DECL_TRAITS(nCw16c, data, _16c, 3, 1); DECL_TRAITS(nchw, data, _, 4, 2); DECL_TRAITS(nhwc, data, _, 4, 2); DECL_TRAITS(chwn, data, _, 4, 2); +DECL_TRAITS(nChw4c, data, _4c, 4, 2); DECL_TRAITS(nChw8c, data, _8c, 4, 2); DECL_TRAITS(nChw16c, data, _16c, 4, 2); /* data: 5D */ DECL_TRAITS(ncdhw, data, _, 5, 3); DECL_TRAITS(ndhwc, data, _, 5, 3); +DECL_TRAITS(nCdhw4c, data, _4c, 5, 3); DECL_TRAITS(nCdhw8c, data, _8c, 5, 3); DECL_TRAITS(nCdhw16c, data, _16c, 5, 3); @@ -110,11 +123,14 @@ DECL_TRAITS(io, wei, _, 2, 0); /* wei: 3D */ DECL_TRAITS(oiw, wei, _, 3, 1); DECL_TRAITS(wio, wei, _, 3, 1); +DECL_TRAITS(Owi4o, wei, _4o, 3, 1); +DECL_TRAITS(OIw4i4o, wei, _4i4o, 3, 1); DECL_TRAITS(Owi8o, wei, _8o, 3, 1); DECL_TRAITS(OIw8i8o, wei, _8i8o, 3, 1); DECL_TRAITS(OIw8o8i, wei, _8o8i, 3, 1); DECL_TRAITS(OIw16i16o, wei, _16i16o, 3, 1); DECL_TRAITS(OIw16o16i, wei, _16o16i, 3, 1); +DECL_TRAITS(Oiw4o, wei, _4o, 3, 1); DECL_TRAITS(Oiw16o, wei, _16o, 3, 1); DECL_TRAITS(Owi16o, wei, _16o, 3, 1); DECL_TRAITS(OIw8i16o2i, wei, _8i16o2i, 3, 1); @@ -125,10 +141,14 @@ DECL_TRAITS(OIw8o16i2o, wei, _8o16i2o, 3, 1); DECL_TRAITS(oihw, wei, _, 4, 2); DECL_TRAITS(ihwo, wei, _, 4, 2); DECL_TRAITS(hwio, wei, _, 4, 2); +DECL_TRAITS(iohw, wei, _, 4, 2); DECL_TRAITS(hwio_s8s8, wei, _, 4, 2); DECL_TRAITS(oIhw8i, wei, _8i, 4, 2); DECL_TRAITS(oIhw16i, wei, _16i, 4, 2); +DECL_TRAITS(OIhw4i4o, wei, _4i4o, 4, 2); DECL_TRAITS(OIhw8i8o, wei, _8i8o, 4, 2); +DECL_TRAITS(OhIw8o32i, wei, _8o32i, 4, 2); +DECL_TRAITS(OhIw16o32i, wei, _16o32i, 4, 2); DECL_TRAITS(OhIw8o4i, wei, _8o4i, 4, 2); DECL_TRAITS(OhIw8o4i_s8s8, wei, _8o4i_s8s8, 4, 2); DECL_TRAITS(OIhw16i16o, wei, _16i16o, 4, 2); @@ -139,18 +159,23 @@ DECL_TRAITS(OIhw8o16i2o, wei, _8o16i2o, 4, 2); DECL_TRAITS(OIhw8o8i, wei, _8o8i, 4, 2); DECL_TRAITS(OIhw16o16i, wei, _16o16i, 4, 2); DECL_TRAITS(IOhw16o16i, wei, _16o16i, 4, 2); +DECL_TRAITS(Oihw4o, wei, _4o, 4, 2); DECL_TRAITS(Oihw16o, wei, _16o, 4, 2); DECL_TRAITS(Ohwi8o, wei, _8o, 4, 2); +DECL_TRAITS(Ohwi4o, wei, _4o, 4, 2); DECL_TRAITS(Ohwi16o, wei, _16o, 4, 2); /* wei: 5D */ DECL_TRAITS(dhwio, wei, _, 5, 3); DECL_TRAITS(oidhw, wei, _, 5, 3); +DECL_TRAITS(OIdhw4i4o, wei, _4i4o, 5, 3); +DECL_TRAITS(Odhwi4o, wei, _4o, 5, 3); DECL_TRAITS(OIdhw8i8o, wei, _8i8o, 5, 3); DECL_TRAITS(OIdhw8o8i, wei, _8o8i, 5, 3); DECL_TRAITS(Odhwi8o, wei, _8o, 5, 3); DECL_TRAITS(OIdhw16i16o, wei, _16i16o, 5, 3); DECL_TRAITS(OIdhw16o16i, wei, _16o16i, 5, 3); +DECL_TRAITS(Oidhw4o, wei, _4o, 5, 3); DECL_TRAITS(Oidhw16o, wei, _16o, 5, 3); DECL_TRAITS(Odhwi16o, wei, _16o, 5, 3); DECL_TRAITS(oIdhw8i, wei, _8i, 5, 3); @@ -159,11 +184,14 @@ DECL_TRAITS(OIdhw8i16o2i, wei, _8i16o2i, 5, 3); /* gwei: 4D */ DECL_TRAITS(goiw, gwei, _, 4, 1); +DECL_TRAITS(gOwi4o, gwei, _4o, 4, 1); +DECL_TRAITS(gOIw4i4o, gwei, _4i4o, 4, 1); DECL_TRAITS(gOwi8o, gwei, _8o, 4, 1); DECL_TRAITS(gOIw8i8o, gwei, _8i8o, 4, 1); DECL_TRAITS(gOIw8o8i, gwei, _8o8i, 4, 1); DECL_TRAITS(gOIw16i16o, gwei, _16i16o, 4, 1); DECL_TRAITS(gOIw16o16i, gwei, _16o16i, 4, 1); +DECL_TRAITS(gOiw4o, gwei, 
_4o, 4, 1); DECL_TRAITS(gOiw16o, gwei, _16o, 4, 1); DECL_TRAITS(gOwi16o, gwei, _16o, 4, 1); DECL_TRAITS(gOIw8i16o2i, gwei, _8i16o2i, 4, 1); @@ -173,32 +201,43 @@ DECL_TRAITS(gOIw8o16i2o, gwei, _8o16i2o, 4, 1); /* gwei: 5D */ DECL_TRAITS(goihw, gwei, _, 5, 2); DECL_TRAITS(hwigo, gwei, _, 5, 2); +DECL_TRAITS(giohw, gwei, _, 5, 2); DECL_TRAITS(hwigo_s8s8, gwei, _, 5, 2); +DECL_TRAITS(gOIhw4i4o, gwei, _4i4o, 5, 2); DECL_TRAITS(gOIhw8i8o, gwei, _8i8o, 5, 2); DECL_TRAITS(gOhIw8o4i, gwei, _8o4i, 5, 2); DECL_TRAITS(gOhIw8o4i_s8s8, gwei, _8o4i_s8s8, 5, 2); DECL_TRAITS(gOIhw16i16o, gwei, _16i16o, 5, 2); DECL_TRAITS(gOIhw4i16o4i, gwei, _4i16o4i, 5, 2); DECL_TRAITS(gOIhw4i16o4i_s8s8, gwei, _4i16o4i_s8s8, 5, 2); +DECL_TRAITS(gOIhw2i8o4i, gwei, _2i8o4i, 5, 2); +DECL_TRAITS(gOIhw2i8o4i_s8s8, gwei, _2i8o4i_s8s8, 5, 2); DECL_TRAITS(gOIhw8i16o2i, gwei, _8i16o2i, 5, 2); DECL_TRAITS(gOIdhw8i16o2i, gwei, _8i16o2i, 5, 2); DECL_TRAITS(gOIhw8o16i2o, gwei, _8o16i2o, 5, 2); DECL_TRAITS(gOIhw8o8i, gwei, _8o8i, 5, 2); +DECL_TRAITS(gOIhw4o4i, gwei, _4o4i, 5, 2); +DECL_TRAITS(gOIhw4o4i_s8s8, gwei, _4o4i_s8s8, 5, 2); DECL_TRAITS(gOIhw16o16i, gwei, _16o16i, 5, 2); DECL_TRAITS(gIOhw16o16i, gwei, _16o16i, 5, 2); +DECL_TRAITS(gOihw4o, gwei, _4o, 5, 2); DECL_TRAITS(gOihw16o, gwei, _16o, 5, 2); DECL_TRAITS(gOhwi8o, gwei, _8o, 5, 2); +DECL_TRAITS(gOhwi4o, gwei, _4o, 5, 2); DECL_TRAITS(gOhwi16o, gwei, _16o, 5, 2); DECL_TRAITS(Goihw8g, gwei, _8g, 5, 2); DECL_TRAITS(Goihw16g, gwei, _16g, 5, 2); +DECL_TRAITS(Goihw16g_s8s8, gwei, _16g_s8s8, 5, 2); /* gwei: 6D */ DECL_TRAITS(goidhw, gwei, _, 6, 3); +DECL_TRAITS(gOIdhw4i4o, gwei, _4i4o, 6, 3); DECL_TRAITS(gOIdhw8i8o, gwei, _8i8o, 6, 3); DECL_TRAITS(gOIdhw8o8i, gwei, _8o8i, 6, 3); DECL_TRAITS(gOdhwi8o, gwei, _8o, 6, 3); DECL_TRAITS(gOIdhw16i16o, gwei, _16i16o, 6, 3); DECL_TRAITS(gOIdhw16o16i, gwei, _16o16i, 6, 3); +DECL_TRAITS(gOidhw4o, gwei, _4o, 6, 3); DECL_TRAITS(gOidhw16o, gwei, _16o, 6, 3); DECL_TRAITS(gOdhwi16o, gwei, _16o, 6, 3); @@ -216,21 +255,28 @@ DECL_TRAITS(ldgo, rnn, _, 4, 0); template <block_format_t f> constexpr int OI_blk_off(int oc, int ic) { using bf = block_format_t; - static_assert(utils::one_of(f, bf::_8i8o, bf::_8o8i, bf::_8o4i, bf::_8o4i_s8s8, - bf::_16i16o, bf::_16o16i, bf::_8i16o2i, bf::_8o16i2o, - bf::_4i16o4i, bf::_4i16o4i_s8s8), + static_assert(utils::one_of(f, bf::_4i4o, bf::_4o4i, bf::_4o4i_s8s8, + bf::_8i8o, bf::_8o8i, bf::_16i16o, + bf::_16o16i, bf::_8i16o2i, bf::_8o16i2o, + bf::_4i16o4i, bf::_4i16o4i_s8s8, + bf::_2i8o4i, bf::_2i8o4i_s8s8, + bf::_8o4i, bf::_8o4i_s8s8, + bf::_8o32i, bf::_16o32i), "unexpected blocked format"); # define blksize block_format_traits<f>::blk_size return f == bf::_8i16o2i ? (ic / 2) * blksize * 2 + 2 * oc + ic % 2 - : (f == bf::_4i16o4i || f == bf::_4i16o4i_s8s8) + : (f == bf::_4i16o4i || f == bf::_4i16o4i_s8s8 + || f == bf::_2i8o4i || f == bf::_2i8o4i_s8s8) ? (ic / 4) * blksize * 4 + oc * 4 + ic % 4 : f == bf::_8o16i2o ? (oc / 2) * blksize * 2 + 2 * ic + oc % 2 - : utils::one_of(f, bf::_8i8o, bf::_16i16o) + : utils::one_of(f, bf::_4i4o, bf::_8i8o, bf::_16i16o) ? ic * blksize + oc : (f == bf::_8o4i || f == bf::_8o4i_s8s8) ? (ic / 4) * blksize * 4 + 4 * oc + ic % 4 + : (f == bf::_8o32i || f == bf::_16o32i) + ? 32 * oc + 32 : oc * blksize + ic; # undef blksize // if only we program in C++14...
diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/math_utils.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/math_utils.hpp index 0ae7093..6e2e285 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/math_utils.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/math_utils.hpp @@ -22,6 +22,7 @@ #include "utils.hpp" #include "nstl.hpp" +#include "mkldnn_traits.hpp" namespace mkldnn { namespace impl { @@ -107,118 +108,203 @@ inline int ilog2q(size_t v) { return p; } +template <typename T, typename U = typename utils::remove_reference<T>::type> +inline U one_m_square(T x) { + return (U)(1 - x) * (1 + x); +} + +template <typename T, typename U = typename utils::remove_reference<T>::type> +inline U x_m_square(T x) { + return (U)(1 - x) * x; +} + /* activation */ -template <typename T, typename A> inline T relu_fwd(T s, A alpha) { - return s > 0 ? s : (T)(s * alpha); +template <typename T, typename A, typename U = typename utils::remove_reference<T>::type> +inline U relu_fwd(T s, A alpha) { + return s > 0 ? s : (U)(s * alpha); } -template <typename T, typename A> inline T relu_bwd(T dd, T s, A alpha) { - return s > 0 ? dd : (T)(dd * alpha); +template <typename T, typename A, typename U = typename utils::remove_reference<T>::type> +inline U relu_bwd(T dd, T s, A alpha) { + return s > 0 ? dd : (U)(dd * alpha); } -template <typename T> inline T tanh_fwd(T s) { +template <typename T, typename U = typename utils::remove_reference<T>::type> +inline U tanh_fwd(T s) { const float e = tanhf((float) s); - return (T) e; + return (U)e; } - -template <typename T> inline T tanh_bwd(T dd, T s) { + +template <typename T, typename U = typename utils::remove_reference<T>::type> +inline U tanh_bwd(T dd, T s) { const float e = tanh_fwd((float) s); - return (T)(dd * (1 - e) * (1 + e)); + return (U)(dd * (1 - e) * (1 + e)); } -template <typename T, typename A> inline T elu_fwd(T s, A alpha) { - return s > 0 ? s : (T)(alpha * (::expm1f((float)s))); +template <typename T, typename A, typename U = typename utils::remove_reference<T>::type> +inline U elu_fwd(T s, A alpha) { + return s > 0 ? s : (U)(alpha * (::expm1f((float)s))); } -template <typename T, typename A> inline T elu_bwd(T dd, T s, A alpha) { - return (T)(dd * (s > 0 ? 1 : alpha * ::expf((float)s))); +template <typename T, typename A, typename U = typename utils::remove_reference<T>::type> +inline U elu_bwd(T dd, T s, A alpha) { + return (U)(dd * (s > 0 ? 1 : alpha * ::expf((float)s))); } -template <typename T> -inline T square_fwd(T s) { +template <typename T, typename U = typename utils::remove_reference<T>::type> +inline U square_fwd(T s) { return s * s; } -template <typename T> -inline T square_bwd(T dd, T s) { - return dd * 2*s; +template <typename T, typename U = typename utils::remove_reference<T>::type> +inline U square_bwd(T dd, T s) { + return dd * 2 * s; } -template <typename T> -inline T abs_fwd(T s) { +template <typename T, typename U = typename utils::remove_reference<T>::type> +inline U abs_fwd(T s) { return s > 0 ? s : -s; } -template <typename T> -inline T abs_bwd(T dd, T s) { +template <typename T, typename U = typename utils::remove_reference<T>::type> +inline U abs_bwd(T dd, T s) { return s > 0 ? dd : s < 0 ? -dd : 0; } -template <typename T> -inline T sqrt_fwd(T s) { - return s > 0 ? (T)(::sqrtf((float)(s))) : 0; +template <typename T, typename U = typename utils::remove_reference<T>::type> +inline U sqrt_fwd(T s) { + return s > 0 ? (U)(::sqrtf((float)(s))) : 0; } -template <typename T> -inline T sqrt_bwd(T dd, T s) { +template <typename T, typename U = typename utils::remove_reference<T>::type> +inline U sqrt_bwd(T dd, T s) { return s > 0 - ? (T)(dd / (2 * ::sqrtf((float)(s)))) + ? (U)(dd / (2 * ::sqrtf((float)(s)))) : 0; } -template <typename T, typename A> -inline T linear_fwd(T s, A alpha, A beta) { - return (T)(alpha * s + beta); +template <typename T, typename A, typename U = typename utils::remove_reference<T>::type> +inline U linear_fwd(T s, A alpha, A beta) { + return (U)(alpha * s + beta); } -template <typename T, typename A> -inline T linear_bwd(T dd, T s, A alpha, A beta) { +template <typename T, typename A, typename U = typename utils::remove_reference<T>::type> +inline U linear_bwd(T dd, T s, A alpha, A beta) { (void) s; (void) beta; - return (T)(dd * alpha); + return (U)(dd * alpha); } -template <typename T, typename A> -inline T bounded_relu_fwd(T s, A alpha) { +template <typename T, typename A, typename U = typename utils::remove_reference<T>::type> +inline U bounded_relu_fwd(T s, A alpha) { s = s > 0 ? s : 0; - return s > alpha ? (T)(alpha) : s; + return s > alpha ? (U)(alpha) : s; } -template <typename T, typename A> -inline T bounded_relu_bwd(T dd, T s, A alpha) { +template <typename T, typename A, typename U = typename utils::remove_reference<T>::type> +inline U bounded_relu_bwd(T dd, T s, A alpha) { return dd * (0 < s && s < alpha ? 1 : 0); } -template <typename T> -inline T soft_relu_fwd(T s) { +template <typename T, typename U = typename utils::remove_reference<T>::type> +inline U soft_relu_fwd(T s) { float max_logf = 8.872284e+01; //::logf(FLT_MAX) - return s < max_logf ? (T)(::log1pf(::expf((float)s))) : s; + return s < max_logf ? (U)(::log1pf(::expf((float)s))) : s; } -template <typename T> -inline T soft_relu_bwd(T dd, T s) { - return (T)(dd / (1 + ::expf((float)(-s)))); +template <typename T, typename U = typename utils::remove_reference<T>::type> +inline U soft_relu_bwd(T dd, T s) { + return (U)(dd / (1 + ::expf((float)(-s)))); } -template <typename T> -inline T logistic_fwd(T s) { - T v = (T)(::expf((float) -s)); +template <typename T, typename U = typename utils::remove_reference<T>::type> +inline U logistic_fwd(T s) { + U v = (U)(::expf((float) -s)); return 1 / (1 + v); } -template <typename T> -inline T logistic_bwd(T dd, T s) { - T v = logistic_fwd(s); +template <typename T, typename U = typename utils::remove_reference<T>::type> +inline U logistic_bwd(T dd, T s) { + U v = logistic_fwd(s); return dd * v * (1 - v); } -template <typename T, typename A> -T clamp_fwd(T s, A alpha, A beta) { - return s > alpha ? (T)(alpha) : s < beta ? (T)(beta) : s; +template <typename T, typename A, typename U = typename utils::remove_reference<T>::type> +inline U clamp_fwd(T s, A alpha, A beta) { + return (U)(s > alpha ? alpha : s < beta ? beta : s); } -template <typename T, typename A> -T clamp_bwd(T dd, T s, A alpha, A beta) { +template <typename T, typename A, typename U = typename utils::remove_reference<T>::type> +inline U clamp_bwd(T dd, T s, A alpha, A beta) { return dd * (beta < s && s < alpha ? 1 : 0); } +template <typename T, typename U = typename utils::remove_reference<T>::type> +inline U exp_fwd(T s) { + return (U)(::expf((float)s)); +} + +template <typename T, typename U = typename utils::remove_reference<T>::type> +inline U exp_bwd(T dd, T s) { + return (U)(dd * ::expf((float)s)); +} + +template <typename T, typename U = typename utils::remove_reference<T>::type> +inline U not_fwd(T s) { + return (U)(!s); +} + +template <typename T, typename A, typename U = typename utils::remove_reference<T>::type> +inline U scale_shift_fwd(T s_val, A w_val, A b_val) { + return (U)(s_val*w_val + b_val); +} + +template <typename T, typename A, typename U = typename utils::remove_reference<T>::type> +inline U prelu_fwd(T s_val, A w_val) { + return (U)(s_val >= 0 ? s_val : w_val*s_val); +} + +inline bool eltwise_fwd_preserves_zero(alg_kind_t alg, bool jit_impl = false) { + using namespace alg_kind; + using namespace utils; + const bool preserves_zero = true + && !one_of(alg, eltwise_linear, eltwise_soft_relu, eltwise_logistic, eltwise_clamp, eltwise_exp, eltwise_not) + && IMPLICATION(jit_impl, !one_of(alg, eltwise_elu, eltwise_tanh, eltwise_clamp, eltwise_exp, eltwise_not)); + return preserves_zero; +} + +inline float get_bias(const char *bias, size_t offset, data_type_t data_type) +{ + if (!bias) + return 0.0f; + +#define CASE(dt) \ + case dt: return (float)((const prec_traits<dt>
::type *)bias)[offset] + + switch (data_type) { + CASE(data_type::s8); + CASE(data_type::u8); + CASE(data_type::s32); + CASE(data_type::f32); + default: assert(!"unimplemented"); + } + return 0; // never happens (should probably be a NaN) +#undef CASE +} + } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/memory.cpp b/inference-engine/thirdparty/mkl-dnn/src/common/memory.cpp index efecc5e..082901c 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/memory.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/memory.cpp @@ -40,7 +40,7 @@ bool memory_desc_sanity_check(int ndims,const dims_t dims, bool ok = true && dims != nullptr && 0 < ndims && ndims <= TENSOR_MAX_DIMS - && one_of(data_type, f32, s32, s16, s8, u8) + && one_of(data_type, f32, s32, s16, s8, u8, bin) && format != memory_format::undef; if (!ok) return false; for (int d = 0; d < ndims; ++d) @@ -77,8 +77,7 @@ status_t mkldnn_memory_desc_init(memory_desc_t *memory_desc, int ndims, md.format = format; status_t status = success; - if (one_of(format, memory_format::undef, blocked, ldigo_p, ldgoi_p, - wino_fmt)) { + if (one_of(format, memory_format::undef, blocked, wino_fmt, rnn_packed)) { status = invalid_arguments; } else if (format == any) { // nop diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/memory_desc_wrapper.cpp b/inference-engine/thirdparty/mkl-dnn/src/common/memory_desc_wrapper.cpp index 3df9295..61d1fd5 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/memory_desc_wrapper.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/memory_desc_wrapper.cpp @@ -63,7 +63,7 @@ inline void set_default_strides(strides_t strides, const dims_t dims, strides[curr_idx] = dims[curr_idx] == 0 ? 1 - : strides[prev_idx] * nstl::max(1, dims[prev_idx]); + : strides[prev_idx] * nstl::max((ptrdiff_t)1, dims[prev_idx]); } } @@ -72,10 +72,26 @@ status_t fill_nonblocked(memory_desc_t &md, const int perm[]) { blocking_desc_t &blk = md.layout_desc.blocking; array_set(blk.block_dims, 1, ndims); array_set(blk.strides[1], 1, ndims); - set_default_strides(blk.strides[0], md.dims, ndims, perm); - array_copy(blk.padding_dims, md.dims, ndims); + + if (md.format == mkldnn_nhwc && md.data_type == mkldnn_bin) { + dims_t padding_dims; + + const dims_t block_dims = {1, 8, 1, 1}; + for (int d = 0; d < ndims; ++d) { + padding_dims[d] = rnd_up(md.dims[d], block_dims[d]); + } + + set_default_strides(blk.strides[0], padding_dims, ndims, perm); + array_copy(blk.padding_dims, padding_dims, ndims); + + } else { + set_default_strides(blk.strides[0], md.dims, ndims, perm); + array_copy(blk.padding_dims, md.dims, ndims); + } + array_set(blk.offset_padding_to_data, 0, ndims); blk.offset_padding = 0; + return success; } @@ -126,6 +142,17 @@ status_t fill_nwc(memory_desc_t &md) { return fill_nonblocked(md, perm); } +status_t fill_nCw4c(memory_desc_t &md) { + if (md.ndims != 3) return invalid_arguments; + + const dims_t block_dims = { 1, 4, 1 }; + const int perm[] = { + 0, 1, 2, + 3, 4, 5 }; + return fill_contiguous_blocked(md, block_dims, perm); +} + + status_t fill_nCw8c(memory_desc_t &md) { if (md.ndims != 3) return invalid_arguments; @@ -195,6 +222,16 @@ status_t fill_chwn(memory_desc_t &md) { return fill_nonblocked(md, perm); } +status_t fill_nChw4c(memory_desc_t &md) { + if (md.ndims != 4) return invalid_arguments; + + const dims_t block_dims = { 1, 4, 1, 1 }; + const int perm[] = { + 0, 1, 2, 3, + 4, 5, 6, 7 }; + return fill_contiguous_blocked(md, block_dims, perm); +} + status_t fill_nChw8c(memory_desc_t &md) { if 
(md.ndims != 4) return invalid_arguments; @@ -225,6 +262,16 @@ status_t fill_nCdhw16c(memory_desc_t &md) { return fill_contiguous_blocked(md, block_dims, perm); } +status_t fill_nCdhw4c(memory_desc_t &md) { + if (md.ndims != 5) return invalid_arguments; + + const dims_t block_dims = { 1, 4, 1, 1, 1 }; + const int perm[] = { + 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9 }; + return fill_contiguous_blocked(md, block_dims, perm); +} + status_t fill_nCdhw8c(memory_desc_t &md) { if (md.ndims != 5) return invalid_arguments; @@ -263,6 +310,16 @@ status_t fill_wio(memory_desc_t &md) { return fill_nonblocked(md, perm); } +status_t fill_Owi4o(memory_desc_t &md) { + if (md.ndims != 3) return invalid_arguments; + + const dims_t block_dims = { 4, 1, 1 }; + const int perm[] = { + 0, 2, 1, + 3, 4, 5 }; + return fill_contiguous_blocked(md, block_dims, perm); +} + status_t fill_Owi8o(memory_desc_t &md) { if (md.ndims != 3) return invalid_arguments; @@ -283,6 +340,16 @@ status_t fill_OIw8o8i(memory_desc_t &md) { return fill_contiguous_blocked(md, block_dims, perm); } +status_t fill_OIw4i4o(memory_desc_t &md) { + if (md.ndims != 3) return invalid_arguments; + + const dims_t block_dims = { 4, 4, 1 }; + const int perm[] = { + 0, 1, 2, + 4, 3, 5 }; + return fill_contiguous_blocked(md, block_dims, perm); +} + status_t fill_OIw8i8o(memory_desc_t &md) { if (md.ndims != 3) return invalid_arguments; @@ -313,16 +380,26 @@ status_t fill_OIw16o16i(memory_desc_t &md) { return fill_contiguous_blocked(md, block_dims, perm); } -status_t fill_Oiw16o(memory_desc_t &md) { +status_t fill_Oiw4o(memory_desc_t &md) { if (md.ndims != 3) return invalid_arguments; - const dims_t block_dims = {16, 1, 1}; + const dims_t block_dims = {4, 1, 1}; const int perm[] = { 0, 1, 2, 3, 4, 5}; return fill_contiguous_blocked(md, block_dims, perm); } +status_t fill_Oiw16o(memory_desc_t &md) { + if (md.ndims != 3) return invalid_arguments; + + const dims_t block_dims = { 16, 1, 1 }; + const int perm[] = { + 0, 1, 2, + 3, 4, 5 }; + return fill_contiguous_blocked(md, block_dims, perm); +} + status_t fill_Owi16o(memory_desc_t &md) { if (md.ndims != 3) return invalid_arguments; @@ -384,6 +461,13 @@ status_t fill_hwio(memory_desc_t &md) { return fill_nonblocked(md, perm); } +status_t fill_iohw(memory_desc_t &md) { + if (md.ndims != 4) return invalid_arguments; + + const int perm[4] = {1, 0, 2, 3}; + return fill_nonblocked(md, perm); +} + status_t fill_dhwio(memory_desc_t &md) { if (md.ndims != 5) return invalid_arguments; @@ -391,6 +475,16 @@ status_t fill_dhwio(memory_desc_t &md) { return fill_nonblocked(md, perm); } +status_t fill_OIhw4i4o(memory_desc_t &md) { + if (md.ndims != 4) return invalid_arguments; + + const dims_t block_dims = { 4, 4, 1, 1 }; + const int perm[] = { + 0, 1, 2, 3, + 5, 4, 6, 7 }; + return fill_contiguous_blocked(md, block_dims, perm); +} + status_t fill_OIhw8i8o(memory_desc_t &md) { if (md.ndims != 4) return invalid_arguments; @@ -421,6 +515,16 @@ status_t fill_OIdhw16i16o(memory_desc_t &md) { return fill_contiguous_blocked(md, block_dims, perm); } +status_t fill_OIdhw4i4o(memory_desc_t &md) { + if (md.ndims != 5) return invalid_arguments; + + const dims_t block_dims = { 4, 4, 1, 1, 1 }; + const int perm[] = { + 0, 1, 2, 3, 4, + 6, 5, 7, 8, 9 }; + return fill_contiguous_blocked(md, block_dims, perm); +} + status_t fill_OIdhw8i8o(memory_desc_t &md) { if (md.ndims != 5) return invalid_arguments; @@ -451,6 +555,26 @@ status_t fill_OhIw8o4i(memory_desc_t &md) { return fill_contiguous_blocked(md, block_dims, perm); } +status_t 
fill_OhIw8o32i(memory_desc_t &md) { + if (md.ndims != 4) return invalid_arguments; + + const dims_t block_dims = {8, 32, 1, 1}; + const int perm[] = { + 0, 2, 1, 3, + 4, 5, 6, 7}; + return fill_contiguous_blocked(md, block_dims, perm); +} + +status_t fill_OhIw16o32i(memory_desc_t &md) { + if (md.ndims != 4) return invalid_arguments; + + const dims_t block_dims = {16, 32, 1, 1}; + const int perm[] = { + 0, 2, 1, 3, + 4, 5, 6, 7}; + return fill_contiguous_blocked(md, block_dims, perm); +} + status_t fill_OIhw8i16o2i(memory_desc_t &md) { if (md.ndims != 4) return invalid_arguments; @@ -531,16 +655,36 @@ status_t fill_OIhw8o16i2o(memory_desc_t &md) { return fill_contiguous_blocked(md, block_dims, perm); } -status_t fill_Oihw16o(memory_desc_t &md) { +status_t fill_Oihw4o(memory_desc_t &md) { if (md.ndims != 4) return invalid_arguments; - const dims_t block_dims = {16, 1, 1, 1}; + const dims_t block_dims = {4, 1, 1, 1}; const int perm[] = { 0, 1, 2, 3, 4, 5, 6, 7}; return fill_contiguous_blocked(md, block_dims, perm); } +status_t fill_Oihw16o(memory_desc_t &md) { + if (md.ndims != 4) return invalid_arguments; + + const dims_t block_dims = { 16, 1, 1, 1 }; + const int perm[] = { + 0, 1, 2, 3, + 4, 5, 6, 7 }; + return fill_contiguous_blocked(md, block_dims, perm); +} + +status_t fill_Oidhw4o(memory_desc_t &md) { + if (md.ndims != 5) return invalid_arguments; + + const dims_t block_dims = { 4, 1, 1, 1, 1 }; + const int perm[] = { + 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9 }; + return fill_contiguous_blocked(md, block_dims, perm); +} + status_t fill_Oidhw16o(memory_desc_t &md) { if (md.ndims != 5) return invalid_arguments; @@ -561,16 +705,26 @@ status_t fill_Ohwi8o(memory_desc_t &md) { return fill_contiguous_blocked(md, block_dims, perm); } -status_t fill_Ohwi16o(memory_desc_t &md) { +status_t fill_Ohwi4o(memory_desc_t &md) { if (md.ndims != 4) return invalid_arguments; - const dims_t block_dims = {16, 1, 1, 1}; + const dims_t block_dims = {4, 1, 1, 1}; const int perm[] = { 0, 2, 3, 1, 4, 5, 6, 7}; return fill_contiguous_blocked(md, block_dims, perm); } +status_t fill_Ohwi16o(memory_desc_t &md) { + if (md.ndims != 4) return invalid_arguments; + + const dims_t block_dims = { 16, 1, 1, 1 }; + const int perm[] = { + 0, 2, 3, 1, + 4, 5, 6, 7 }; + return fill_contiguous_blocked(md, block_dims, perm); +} + status_t fill_Odhwi16o(memory_desc_t &md) { if (md.ndims != 5) return invalid_arguments; @@ -598,23 +752,43 @@ status_t fill_goiw(memory_desc_t &md) { return fill_nonblocked(md, perm); } -status_t fill_gOwi8o(memory_desc_t &md) { +status_t fill_gOwi4o(memory_desc_t &md) { if (md.ndims != 4) return invalid_arguments; - const dims_t block_dims = {1, 8, 1, 1}; + const dims_t block_dims = {1, 4, 1, 1}; const int perm[] = { 0, 1, 3, 2, 4, 5, 6, 7}; return fill_contiguous_blocked(md, block_dims, perm); } +status_t fill_gOwi8o(memory_desc_t &md) { + if (md.ndims != 4) return invalid_arguments; + + const dims_t block_dims = { 1, 8, 1, 1 }; + const int perm[] = { + 0, 1, 3, 2, + 4, 5, 6, 7 }; + return fill_contiguous_blocked(md, block_dims, perm); +} + status_t fill_gOIw8o8i(memory_desc_t &md) { if (md.ndims != 4) return invalid_arguments; - const dims_t block_dims = {1, 8, 8, 1}; + const dims_t block_dims = { 1, 8, 8, 1 }; const int perm[] = { 0, 1, 2, 3, - 4, 5, 6, 7}; + 4, 5, 6, 7 }; + return fill_contiguous_blocked(md, block_dims, perm); +} + +status_t fill_gOIw4i4o(memory_desc_t &md) { + if (md.ndims != 4) return invalid_arguments; + + const dims_t block_dims = { 1, 4, 4, 1 }; + const int perm[] = { + 0, 1, 2, 3, + 
4, 6, 5, 7 }; return fill_contiguous_blocked(md, block_dims, perm); } @@ -648,6 +822,16 @@ status_t fill_gOIw16o16i(memory_desc_t &md) { return fill_contiguous_blocked(md, block_dims, perm); } +status_t fill_gOiw4o(memory_desc_t &md) { + if (md.ndims != 4) return invalid_arguments; + + const dims_t block_dims = { 1, 4, 1, 1 }; + const int perm[] = { + 0, 1, 2, 3, + 4, 5, 6, 7 }; + return fill_contiguous_blocked(md, block_dims, perm); +} + status_t fill_gOiw16o(memory_desc_t &md) { if (md.ndims != 4) return invalid_arguments; @@ -712,13 +896,40 @@ status_t fill_hwigo(memory_desc_t &md) { return fill_nonblocked(md, perm); } +status_t fill_giohw(memory_desc_t &md) { + if (md.ndims != 5) return invalid_arguments; + + const int perm[5] = {0, 2, 1, 3, 4}; + return fill_nonblocked(md, perm); +} + +status_t fill_gOIhw4o4i(memory_desc_t &md) { + if (md.ndims != 5) return invalid_arguments; + + const dims_t block_dims = {1, 4, 4, 1, 1}; + const int perm[] = { + 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9}; + return fill_contiguous_blocked(md, block_dims, perm); +} + +status_t fill_gOIhw4i4o(memory_desc_t &md) { + if (md.ndims != 5) return invalid_arguments; + + const dims_t block_dims = { 1, 4, 4, 1, 1 }; + const int perm[] = { + 0, 1, 2, 3, 4, + 5, 7, 6, 8, 9 }; + return fill_contiguous_blocked(md, block_dims, perm); +} + status_t fill_gOIhw8i8o(memory_desc_t &md) { if (md.ndims != 5) return invalid_arguments; - const dims_t block_dims = {1, 8, 8, 1, 1}; + const dims_t block_dims = { 1, 8, 8, 1, 1 }; const int perm[] = { 0, 1, 2, 3, 4, - 5, 7, 6, 8, 9}; + 5, 7, 6, 8, 9 }; return fill_contiguous_blocked(md, block_dims, perm); } @@ -742,36 +953,66 @@ status_t fill_gOIdhw16i16o(memory_desc_t &md) { return fill_contiguous_blocked(md, block_dims, perm); } -status_t fill_gOIdhw8i8o(memory_desc_t &md) { +status_t fill_gOIdhw4i4o(memory_desc_t &md) { if (md.ndims != 6) return invalid_arguments; - const dims_t block_dims = {1, 8, 8, 1, 1, 1}; + const dims_t block_dims = {1, 4, 4, 1, 1, 1}; const int perm[] = { 0, 1, 2, 3, 4, 5, 6, 8, 7, 9, 10, 11}; return fill_contiguous_blocked(md, block_dims, perm); } -status_t fill_gOihw16o(memory_desc_t &md) { +status_t fill_gOIdhw8i8o(memory_desc_t &md) { + if (md.ndims != 6) return invalid_arguments; + + const dims_t block_dims = { 1, 8, 8, 1, 1, 1 }; + const int perm[] = { + 0, 1, 2, 3, 4, 5, + 6, 8, 7, 9, 10, 11 }; + return fill_contiguous_blocked(md, block_dims, perm); +} + +status_t fill_gOihw4o(memory_desc_t &md) { if (md.ndims != 5) return invalid_arguments; - const dims_t block_dims = {1, 16, 1, 1, 1}; + const dims_t block_dims = {1, 4, 1, 1, 1}; const int perm[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; return fill_contiguous_blocked(md, block_dims, perm); } -status_t fill_gOidhw16o(memory_desc_t &md) { +status_t fill_gOihw16o(memory_desc_t &md) { + if (md.ndims != 5) return invalid_arguments; + + const dims_t block_dims = { 1, 16, 1, 1, 1 }; + const int perm[] = { + 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9 }; + return fill_contiguous_blocked(md, block_dims, perm); +} + +status_t fill_gOidhw4o(memory_desc_t &md) { if (md.ndims != 6) return invalid_arguments; - const dims_t block_dims = {1, 16, 1, 1, 1, 1}; + const dims_t block_dims = {1, 4, 1, 1, 1, 1}; const int perm[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; return fill_contiguous_blocked(md, block_dims, perm); } +status_t fill_gOidhw16o(memory_desc_t &md) { + if (md.ndims != 6) return invalid_arguments; + + const dims_t block_dims = { 1, 16, 1, 1, 1, 1 }; + const int perm[] = { + 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11 }; + return 
fill_contiguous_blocked(md, block_dims, perm); +} + status_t fill_gOhwi8o(memory_desc_t &md) { if (md.ndims != 5) return invalid_arguments; @@ -782,16 +1023,26 @@ status_t fill_gOhwi8o(memory_desc_t &md) { return fill_contiguous_blocked(md, block_dims, perm); } -status_t fill_gOhwi16o(memory_desc_t &md) { +status_t fill_gOhwi4o(memory_desc_t &md) { if (md.ndims != 5) return invalid_arguments; - const dims_t block_dims = {1, 16, 1, 1, 1}; + const dims_t block_dims = {1, 4, 1, 1, 1}; const int perm[] = { 0, 1, 3, 4, 2, 5, 6, 7, 8, 9}; return fill_contiguous_blocked(md, block_dims, perm); } +status_t fill_gOhwi16o(memory_desc_t &md) { + if (md.ndims != 5) return invalid_arguments; + + const dims_t block_dims = { 1, 16, 1, 1, 1 }; + const int perm[] = { + 0, 1, 3, 4, 2, + 5, 6, 7, 8, 9 }; + return fill_contiguous_blocked(md, block_dims, perm); +} + status_t fill_gOdhwi16o(memory_desc_t &md) { if (md.ndims != 6) return invalid_arguments; @@ -822,6 +1073,16 @@ status_t fill_gOIhw4i16o4i(memory_desc_t &md) { return fill_contiguous_blocked(md, block_dims, perm); } +status_t fill_gOIhw2i8o4i(memory_desc_t &md) { + if (md.ndims != 5) return invalid_arguments; + + const dims_t block_dims = {1, 8, 8, 1, 1}; + const int perm[] = { + 0, 1, 2, 3, 4, + 5, 7, 6, 8, 9}; + return fill_contiguous_blocked(md, block_dims, perm); +} + status_t fill_gOhIw8o4i(memory_desc_t &md) { if (md.ndims != 5) return invalid_arguments; @@ -983,22 +1244,27 @@ status_t memory_desc_wrapper::compute_blocking(memory_desc_t &memory_desc) case nc: return fill_nc(memory_desc); case ncw: return fill_ncw(memory_desc); case nwc: return fill_nwc(memory_desc); + case nCw4c: return fill_nCw4c(memory_desc); case nCw8c: return fill_nCw8c(memory_desc); case nCw16c: return fill_nCw16c(memory_desc); case nchw: return fill_nchw(memory_desc); case nhwc: return fill_nhwc(memory_desc); case chwn: return fill_chwn(memory_desc); + case nChw4c: return fill_nChw4c(memory_desc); case nChw8c: case oIhw8i: return fill_nChw8c(memory_desc); case nChw16c: case oIhw16i: return fill_nChw16c(memory_desc); case oi: return fill_oi(memory_desc); case io: return fill_io(memory_desc); case oiw: return fill_oiw(memory_desc); case wio: return fill_wio(memory_desc); + case Owi4o: return fill_Owi4o(memory_desc); + case OIw4i4o: return fill_OIw4i4o(memory_desc); case Owi8o: return fill_Owi8o(memory_desc); case OIw8o8i: return fill_OIw8o8i(memory_desc); case OIw8i8o: return fill_OIw8i8o(memory_desc); case OIw16i16o: return fill_OIw16i16o(memory_desc); case OIw16o16i: return fill_OIw16o16i(memory_desc); + case Oiw4o: return fill_Oiw4o(memory_desc); case Oiw16o: return fill_Oiw16o(memory_desc); case Owi16o: return fill_Owi16o(memory_desc); case OIw8i16o2i: return fill_OIw8i16o2i(memory_desc); @@ -1007,12 +1273,16 @@ status_t memory_desc_wrapper::compute_blocking(memory_desc_t &memory_desc) case oihw: return fill_oihw(memory_desc); case ihwo: return fill_ihwo(memory_desc); case hwio: return fill_hwio(memory_desc); + case iohw: return fill_iohw(memory_desc); case hwio_s8s8: return fill_hwio(memory_desc); case dhwio: return fill_dhwio(memory_desc); + case OIhw4i4o: return fill_OIhw4i4o(memory_desc); case OIhw8i8o: return fill_OIhw8i8o(memory_desc); case OIhw16i16o: return fill_OIhw16i16o(memory_desc); case OIhw4i16o4i: return fill_OIhw4i16o4i(memory_desc); case OhIw8o4i: return fill_OhIw8o4i(memory_desc); + case OhIw8o32i: return fill_OhIw8o32i(memory_desc); + case OhIw16o32i: return fill_OhIw16o32i(memory_desc); case OhIw8o4i_s8s8: return fill_OhIw8o4i(memory_desc); case 
OIhw4i16o4i_s8s8: return fill_OIhw4i16o4i(memory_desc); case OIhw8i16o2i: return fill_OIhw8i16o2i(memory_desc); @@ -1021,15 +1291,20 @@ status_t memory_desc_wrapper::compute_blocking(memory_desc_t &memory_desc) case OIhw8o8i: return fill_OIhw8o8i(memory_desc); case OIhw16o16i: return fill_OIhw16o16i(memory_desc); case IOhw16o16i: return fill_IOhw16o16i(memory_desc); + case Oihw4o: return fill_Oihw4o(memory_desc); case Oihw16o: return fill_Oihw16o(memory_desc); case Ohwi8o: return fill_Ohwi8o(memory_desc); + case Ohwi4o: return fill_Ohwi4o(memory_desc); case Ohwi16o: return fill_Ohwi16o(memory_desc); case goiw: return fill_goiw(memory_desc); + case gOwi4o: return fill_gOwi4o(memory_desc); + case gOIw4i4o: return fill_gOIw4i4o(memory_desc); case gOwi8o: return fill_gOwi8o(memory_desc); case gOIw8o8i: return fill_gOIw8o8i(memory_desc); case gOIw8i8o: return fill_gOIw8i8o(memory_desc); case gOIw16i16o: return fill_gOIw16i16o(memory_desc); case gOIw16o16i: return fill_gOIw16o16i(memory_desc); + case gOiw4o: return fill_gOiw4o(memory_desc); case gOiw16o: return fill_gOiw16o(memory_desc); case gOwi16o: return fill_gOwi16o(memory_desc); case gOIw8i16o2i: return fill_gOIw8i16o2i(memory_desc); @@ -1037,41 +1312,55 @@ status_t memory_desc_wrapper::compute_blocking(memory_desc_t &memory_desc) case gIOw16o16i: return fill_gIOw16o16i(memory_desc); case goihw: return fill_goihw(memory_desc); case hwigo: return fill_hwigo(memory_desc); + case giohw: return fill_giohw(memory_desc); case hwigo_s8s8: return fill_hwigo(memory_desc); + case gOIhw4i4o: return fill_gOIhw4i4o(memory_desc); case gOIhw8i8o: return fill_gOIhw8i8o(memory_desc); case gOIhw16i16o: return fill_gOIhw16i16o(memory_desc); case gOIhw4i16o4i: return fill_gOIhw4i16o4i(memory_desc); case gOhIw8o4i: return fill_gOhIw8o4i(memory_desc); case gOhIw8o4i_s8s8: return fill_gOhIw8o4i(memory_desc); case gOIhw4i16o4i_s8s8: return fill_gOIhw4i16o4i(memory_desc); + case gOIhw2i8o4i: return fill_gOIhw2i8o4i(memory_desc); + case gOIhw2i8o4i_s8s8: return fill_gOIhw2i8o4i(memory_desc); case gOIhw8i16o2i: return fill_gOIhw8i16o2i(memory_desc); case gOIdhw8i16o2i: return fill_gOIdhw8i16o2i(memory_desc); case gOIhw8o16i2o: return fill_gOIhw8o16i2o(memory_desc); + case gOIhw4o4i: return fill_gOIhw4o4i(memory_desc); + case gOIhw4o4i_s8s8: return fill_gOIhw4o4i(memory_desc); case gOIhw8o8i: return fill_gOIhw8o8i(memory_desc); case gOIhw16o16i: return fill_gOIhw16o16i(memory_desc); case gIOhw16o16i: return fill_gIOhw16o16i(memory_desc); + case gOihw4o: return fill_gOihw4o(memory_desc); case gOihw16o: return fill_gOihw16o(memory_desc); case gOhwi8o: return fill_gOhwi8o(memory_desc); + case gOhwi4o: return fill_gOhwi4o(memory_desc); case gOhwi16o: return fill_gOhwi16o(memory_desc); case Goihw8g: return fill_Goihw8g(memory_desc); case Goihw16g: return fill_Goihw16g(memory_desc); + case Goihw16g_s8s8: return fill_Goihw16g(memory_desc); case ncdhw: return fill_ncdhw(memory_desc); case ndhwc: return fill_ndhwc(memory_desc); case oidhw: return fill_oidhw(memory_desc); case goidhw: return fill_goidhw(memory_desc); + case nCdhw4c: return fill_nCdhw4c(memory_desc); case nCdhw8c: case oIdhw8i: return fill_nCdhw8c(memory_desc); case nCdhw16c: case oIdhw16i: return fill_nCdhw16c(memory_desc); case OIdhw16i16o: return fill_OIdhw16i16o(memory_desc); case gOIdhw16i16o: return fill_gOIdhw16i16o(memory_desc); + case OIdhw4i4o: return fill_OIdhw4i4o(memory_desc); + case gOIdhw4i4o: return fill_gOIdhw4i4o(memory_desc); case OIdhw8i8o: return fill_OIdhw8i8o(memory_desc); case 
gOIdhw8i8o: return fill_gOIdhw8i8o(memory_desc); case OIdhw16o16i: return fill_OIdhw16o16i(memory_desc); case gOIdhw16o16i: return fill_gOIdhw16o16i(memory_desc); case OIdhw8o8i: return fill_OIdhw8o8i(memory_desc); case gOIdhw8o8i: return fill_gOIdhw8o8i(memory_desc); + case Oidhw4o: return fill_Oidhw4o(memory_desc); case Oidhw16o: return fill_Oidhw16o(memory_desc); case Odhwi16o: return fill_Odhwi16o(memory_desc); case Odhwi8o: return fill_Odhwi8o(memory_desc); + case gOidhw4o: return fill_gOidhw4o(memory_desc); case gOidhw16o: return fill_gOidhw16o(memory_desc); case gOdhwi16o: return fill_gOdhwi16o(memory_desc); case gOdhwi8o: return fill_gOdhwi8o(memory_desc); @@ -1081,7 +1370,8 @@ status_t memory_desc_wrapper::compute_blocking(memory_desc_t &memory_desc) case ldigo: return fill_ldigo(memory_desc); case ldgoi: return fill_ldgoi(memory_desc); case ldgo: return fill_ldgo(memory_desc); - case wino_fmt: return success; + case wino_fmt: + case rnn_packed: return success; default: break; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/memory_desc_wrapper.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/memory_desc_wrapper.hpp index 91e18cf..7c2f8ef 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/memory_desc_wrapper.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/memory_desc_wrapper.hpp @@ -46,12 +46,16 @@ struct memory_desc_wrapper: public c_compatible { memory_format_t format() const { return _md->format; } bool is_blocking_desc() const { return (format() != memory_format::wino_fmt + && format() != memory_format::rnn_packed && format() != memory_format::any && format() != memory_format::undef); } bool is_wino_desc() const { return (format() == memory_format::wino_fmt); } + bool is_rnn_packed_desc() const { + return (format() == memory_format::rnn_packed); + } const blocking_desc_t &blocking_desc() const { assert(is_blocking_desc()); return _md->layout_desc.blocking; @@ -60,6 +64,10 @@ struct memory_desc_wrapper: public c_compatible { assert(is_wino_desc()); return _md->layout_desc.wino_desc; } + const rnn_packed_data_t &rnn_packed_desc() const { + assert(is_rnn_packed_desc()); + return _md->layout_desc.rnn_packed_desc; + } /* some useful function */ @@ -67,7 +75,7 @@ struct memory_desc_wrapper: public c_compatible { * is true, and the number of data elements otherwise */ size_t nelems(bool with_padding = false) const { if (is_zero()) return 0; - return (utils::array_product(with_padding + return (utils::array_product(with_padding ? blocking_desc().padding_dims : dims(), ndims())); } @@ -85,7 +93,11 @@ struct memory_desc_wrapper: public c_compatible { size_t additional_buffer_data_size() const { using namespace mkldnn::impl::memory_format; return (utils::one_of(format(), hwio_s8s8, hwigo_s8s8, - gOIhw4i16o4i_s8s8, OIhw4i16o4i_s8s8, OhIw8o4i_s8s8, gOhIw8o4i_s8s8)) + gOIhw4o4i_s8s8, + gOIhw4i16o4i_s8s8, OIhw4i16o4i_s8s8, + gOIhw2i8o4i_s8s8, + gOhIw8o4i_s8s8, OhIw8o4i_s8s8, + Goihw16g_s8s8)) ? sizeof(int32_t) : 0; } @@ -93,7 +105,11 @@ struct memory_desc_wrapper: public c_compatible { bool is_additional_buffer() const { using namespace mkldnn::impl::memory_format; return (utils::one_of(format(), hwio_s8s8, hwigo_s8s8, - gOIhw4i16o4i_s8s8, OIhw4i16o4i_s8s8, OhIw8o4i_s8s8, gOhIw8o4i_s8s8)) + gOIhw4o4i_s8s8, + gOIhw4i16o4i_s8s8, OIhw4i16o4i_s8s8, + gOIhw2i8o4i_s8s8, + gOhIw8o4i_s8s8, OhIw8o4i_s8s8, + Goihw16g_s8s8)) ? 
true : false; } @@ -103,10 +119,13 @@ struct memory_desc_wrapper: public c_compatible { const auto &padding_dims = blocking_desc().padding_dims; switch(format()) { case hwigo_s8s8: + case gOIhw4o4i_s8s8: + case gOIhw2i8o4i_s8s8: case gOIhw4i16o4i_s8s8: case gOhIw8o4i_s8s8: return size_t(padding_dims[0]) * size_t(padding_dims[1]) * additional_buffer_data_size(); + case Goihw16g_s8s8: case hwio_s8s8: case OIhw4i16o4i_s8s8: case OhIw8o4i_s8s8: @@ -126,11 +145,14 @@ struct memory_desc_wrapper: public c_compatible { assert((false || types::format_normalize(format()) == blocked || types::is_format_double_blocked(format()) - || format() == wino_fmt) + || format() == wino_fmt + || format() == rnn_packed) && "unknown format"); if (format() == wino_fmt) { return wino_desc().size; + } else if (format() == rnn_packed) { + return rnn_packed_desc().size; } else { if (blocking_desc().offset_padding != 0) return 0; @@ -147,7 +169,8 @@ struct memory_desc_wrapper: public c_compatible { max_size = nstl::max(max_size, size_t(block * strides[1][d])); } - return max_size * data_type_size() + additional_buffer_size();; + + return max_size * data_type_size() + additional_buffer_size(); } } @@ -231,6 +254,13 @@ struct memory_desc_wrapper: public c_compatible { const int ic_4 = pos[with_groups + 1] % 4; phys_offset += 4 * oc_16 + ic_4 - (oc_16 + 16 * ic_4); } + if (utils::one_of(format(), gOIhw2i8o4i, gOIhw2i8o4i_s8s8)) { + // TODO: Fix temporary workaround for formats with double blocking + const bool with_groups = true; + const int oc_8 = pos[with_groups + 0] % 8; + const int ic_4 = pos[with_groups + 1] % 4; + phys_offset += 4 * oc_8 + ic_4 - (oc_8 + 8 * ic_4); + } if (format() == gOIw8i16o2i || format() == OIw8i16o2i) { // TODO: Fix temporary workaround for formats with double blocking const bool with_groups = format() == gOIw8i16o2i; @@ -362,13 +392,18 @@ inline bool memory_desc_wrapper::operator==(const memory_desc_wrapper &rhs) && utils::array_cmp(dims(), rhs.dims(), ndims()) && data_type() == rhs.data_type() && ((is_blocking_desc() && rhs.is_blocking_desc()) - || (is_wino_desc() && rhs.is_wino_desc())) + || (is_wino_desc() && rhs.is_wino_desc()) + || (is_rnn_packed_desc() && rhs.is_rnn_packed_desc())) && (is_blocking_desc() ? blocking_desc_is_equal(blocking_desc(), rhs.blocking_desc(), ndims()) : true) && (is_wino_desc() ? wino_desc_is_equal( wino_desc(), rhs.wino_desc()) : - true); + true) + && (is_rnn_packed_desc() ? + rnn_packed_desc_is_equal(rnn_packed_desc(), + rhs.rnn_packed_desc()) : + true); } inline bool memory_desc_wrapper::similar_to(const memory_desc_wrapper &rhs, @@ -377,7 +412,8 @@ inline bool memory_desc_wrapper::similar_to(const memory_desc_wrapper &rhs, using namespace utils; if (utils::one_of(format(), memory_format::undef, memory_format::any)) return false; - if (is_wino_desc() || rhs.is_wino_desc()) + if (is_wino_desc() || rhs.is_wino_desc() || is_rnn_packed_desc() + || rhs.is_rnn_packed_desc()) return false; const int ds = dim_start; diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/memory_tracking.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/memory_tracking.hpp new file mode 100644 index 0000000..f47536c --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/common/memory_tracking.hpp @@ -0,0 +1,297 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef MEMORY_TRACKING_HPP +#define MEMORY_TRACKING_HPP + +#include <assert.h> +#include <unordered_map> + +#include "nstl.hpp" +#include "utils.hpp" + +namespace mkldnn { +namespace impl { +namespace memory_tracking { + +/* Memory tracking capabilities + * + * The main purpose of this header file is to provide a uniform way to register + * required memory for a scratchpad at primitive descriptor creation time + * and then easily access it having only the base address of the scratchpad. + * + * Primitives might contain multiple disjoint parts that require temporary + * buffers (known as scratchpad) during their execution. A primitive descriptor + * should summarize all the needs into one single number -- the buffer size + * that would be requested from a user. At execution time, the corresponding + * primitive will receive a base pointer to a scratchpad. It then needs to + * provide each part of the algorithm the corresponding piece of memory. Three main + * challenges here are: + * 1. Track the correct offset (from the base scratchpad address) for each piece + * 2. An algorithm might require different memory pieces to be aligned, so + * the scratchpad size is no longer just the sum of the sizes of the + * corresponding subparts. + * 3. While a primitive is responsible for its scratchpad, the implementation + * might use some other basic blocks (e.g. cpu_reducer) that also require + * scratchpad memory. So there should be a simple way of passing the + * information back and forth between the main algorithm (a primitive) and + * auxiliary stuff that lives completely separately from it (e.g. reducer). + * + * To address these challenges this header file provides 3 structures: + * 1. registry_t -- the class that stores the information about requested + * memory. The information includes the required size and desired + * alignment for each piece. This class is also responsible + * for computing the right offset to a given piece using the + * base pointer. + * This class is basically a ledger with all entries. + * Lives in primitive descriptors. + * + * 2. registrar_t -- the interface to a registry_t to book memory. Used at + * primitive descriptor creation time only. Contains a + * reference to the corresponding *mutable* registry. + * Always modifiable. + * Allows chaining (using prefixes). + * + * 3. grantor_t -- the interface to a registry_t to access memory. Used at + * primitive execution time only. Contains a reference to + * the corresponding *constant* registry and base pointer. + * Always constant. + * Allows chaining (using prefixes). + *
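+ * (In other words: a registrar_t exists only while a primitive descriptor is + * being created, and a grantor_t only while the primitive executes; both are + * thin views into the same registry_t.) + *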
+ * Both registrar_t and grantor_t allow chaining with extra prefix provided. + * The feature is useful when a primitive offloads part of its computations to + * some other primitives which require their own scratchpad space + * (e.g. reducer). Prefixes are used to avoid key collisions in cases when + * multiple sub-primitives (e.g. multiple reducers) are used. + * + * A short example below demonstrates how to use the aforementioned classes. In it + * the main primitive is convolution that uses scratchpad for keeping padded + * bias. It also needs a reducer, which needs its own space as well. + * + * ``` c++ + * struct reducer_t { + * static void init(registrar_t &scratchpad) { + * // preserve space for the reduction (one page aligned) + * scratchpad.book(key_reducer_space, sizeof(float) * 980 * 1024, 4096); + * } + * + * void exec(const grantor_t &scratchpad) { + * // get the pointer to preserved space. scratchpad came from + * // upper primitive (convolution in this example) + * auto space = scratchpad.get<float>(key_reducer_space); + * + * space[:] += ...; + * } + * }; + * + * struct conv_t { + * struct pd_t { + * void init() { + * registrar_t scratchpad(scratchpad_registry_); + * + * // preserve a space for padded bias (using default alignment) + * scratchpad.book(key_conv_padded_bias, 128); + * + * // create a proxy registrar for the reducer. All entries made + * // by the reducer would live in convolution's registry, but would + * // have their own `prefix`, so no interference with conv's + * // buffers. + * registrar_t reducer_scratchpad(scratchpad, prefix_reducer); + * + * reducer_t::init(reducer_scratchpad); + * } + * + * registry_t scratchpad_registry_; + * } + * + * void exec() { + * // get the base pointer to a scratchpad memory from a user + * void *scratchpad_ptr = this->input(MKLDNN_MEM_SCRATCHPAD); + * + * // create a grantor to the scratchpad (and provide the base + * // pointer). + * grantor_t scratchpad(pd()->scratchpad_registry_, scratchpad_ptr); + * + * // access the padded_bias (need only key name and the grantor) + * auto padded_bias = scratchpad.get<float>(key_conv_padded_bias); + * + * // to give the `right` grantor to the reducer we need to add the + * // corresponding prefix, so that the reducer would be able to access + * // its keys. The call is very similar to the one in pd_t::init + * // with the only difference in types: grantor_t vs registrar_t. + * grantor_t reducer_scratchpad(scratchpad, prefix_reducer); + * reducer->exec(reducer_scratchpad); + * } + * }; + * ``` + */ + + +/* namespace with common keys and prefixes */ +namespace names { +enum { + key_none = 0, + key_bnorm_tmp_mean, + key_bnorm_tmp_var, + key_bnorm_tmp_diff_ss, + key_bnorm_tmp_stats, + key_bnorm_reduction, + key_concat_iptrs, + key_concat_istrides, + key_concat_nelems, + key_concat_optrs, + key_conv_adjusted_scales, + key_conv_bia_reduction, + key_conv_gemm_col, + key_conv_int_dat_in_acc_dt, + key_conv_padded_bias, + key_conv_rtus_space, + key_conv_tr_diff_dst, + key_conv_tr_diff_dst_bctx, + key_conv_tr_src, + key_conv_tr_src_bctx, + key_conv_wei_reduction, + key_conv_wei_bia_reduction, + key_conv_wei_bia_reduction_bctx, + key_iprod_int_dat_in_acc_dt, + key_reducer_space, + key_reducer_space_bctx, + key_reorder_wino_plain, + key_reorder_wino_transform_space, + key_reorder_rnn_weights_quantization, + key_reorder_rnn_weights_reduction, + key_rnn_space, + key_rnn_ptrs_bia, + key_rnn_ptrs_wei_layer, + key_rnn_ptrs_wei_iter, + key_softmax_reduction, + key_wino_U, + key_wino_V, + key_wino_M, + key_barrier, + key_dw_conv_buffer, + key_dw_conv_padded_bias, + key_conv_padded_compensation, +}; + +enum { + prefix_none = 0, + prefix_reducer_bia, + prefix_reducer_wei, +}; +} + +// level 0: 00 00 00 xxx +// level 1: 00 00 aa xxx +// level 2: 00 aa bb xxx +// level 3: aa bb cc xxx +// max # of levels: 3 + 1 (base_level) +// here: +// xxx : [1 .. MAX_KEY) : key +// aa, bb, cc : [1 .. MAX_PREFIX) : prefixes for levels 1, 2, and 3
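+// For example, with a single level of prefixes (per the scheme above): +// make_prefix(prefix_none, prefix_reducer_bia) == MAX_KEY * prefix_reducer_bia, +// and make_key() then adds the local key, e.g. key_reducer_space, on top. +// Prefixed entries therefore can never collide with the parent's plain keys, +// which all live in [1 .. MAX_KEY).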
+ +using key_t = uint32_t; +enum { MAX_KEY = (1u << 10), MAX_PREFIX = (1u << 7), }; + +/// generates global key based on a prefix and a local key +inline key_t make_key(key_t prefix, key_t key) { return prefix + key; } + +/// generates global prefix based on the global parent and the local ones +inline key_t make_prefix(key_t parent_prefix, key_t prefix) +{ return MAX_PREFIX * parent_prefix + MAX_KEY * prefix; } + +struct registrar_t; +struct grantor_t; + +struct registry_t { + void book(const key_t &key, size_t size, size_t alignment) { + if (size == 0) return; + assert(offset_map_.count(key) == 0); + + size = utils::rnd_up(size, minimal_alignment); + alignment = nstl::max(alignment, minimal_alignment); + offset_map_[key] = entry_t{size_, size, alignment}; + + size_ += size + alignment - minimal_alignment; + } + + void *get(const key_t &key, void *base_ptr) const { + if (base_ptr == nullptr) { assert(size() == 0); return nullptr; } + if (offset_map_.count(key) != 1) return nullptr; + + const auto &e = offset_map_.at(key); + base_ptr = utils::align_ptr(base_ptr, minimal_alignment); + char *ptr = (char *)base_ptr + e.offset; + return utils::align_ptr(ptr, e.alignment); + } + + size_t size() const + { return size_ > 0 ? size_ + minimal_alignment - 1 : 0; } + + registrar_t registrar(); + grantor_t grantor(void *base_ptr) const; + +protected: + enum { minimal_alignment = 64 }; + struct entry_t { size_t offset, size, alignment; }; + + std::unordered_map<key_t, entry_t> offset_map_; + size_t size_ = 0; +}; + +struct registrar_t { + enum { default_alignment = 64 }; + + registrar_t(registry_t &registry): registry_(registry), prefix_(0) {} + registrar_t(registrar_t &parent, const key_t &prefix) + : registry_(parent.registry_) + , prefix_(make_prefix(parent.prefix_, prefix)) {} + + void book(const key_t &key, size_t size, + size_t alignment = default_alignment) + { registry_.book(make_key(prefix_, key), size, alignment); } + +protected: + registry_t &registry_; + const key_t prefix_; +}; + +struct grantor_t { + grantor_t(const registry_t &registry, void *base_ptr) + : registry_(registry), prefix_(0), base_ptr_(base_ptr) {} + grantor_t(const grantor_t &parent, const key_t &prefix) + : registry_(parent.registry_) + , prefix_(make_prefix(parent.prefix_, prefix)) + , base_ptr_(parent.base_ptr_) {} + + template <typename T> T *get(const key_t &key) const + { return (T *)registry_.get(make_key(prefix_, key), base_ptr_); } + +protected: + const registry_t &registry_; + const key_t prefix_; + void *base_ptr_; +}; + +inline registrar_t registry_t::registrar() { return registrar_t(*this); } +inline grantor_t registry_t::grantor(void *base_ptr) const +{ return grantor_t(*this, base_ptr); } + +} +} +} + +#endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/mkldnn_debug.cpp b/inference-engine/thirdparty/mkl-dnn/src/common/mkldnn_debug.cpp index b54848f..0784008 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/mkldnn_debug.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/mkldnn_debug.cpp @@ -42,6 +42,7 @@ const char *mkldnn_dt2str(mkldnn_data_type_t v) { if (v == mkldnn_s16) return "s16"; if (v == mkldnn_s8) return "s8"; if (v == mkldnn_u8) return "u8"; + if (v == mkldnn_bin) return "bin"; assert(!"unknown dt"); return "unknown dt"; } @@ -72,14 +73,14 @@ const char *mkldnn_fmt2str(mkldnn_memory_format_t v) { if (v == mkldnn_wio) return "wio"; if (v == mkldnn_oihw) return "oihw"; if (v == mkldnn_hwio) return "hwio"; - if (v == mkldnn_hwio_s8s8) return "hwio_s8s8"; if (v == 
mkldnn_ihwo) return "ihwo"; + if (v == mkldnn_iohw) return "iohw"; if (v == mkldnn_oidhw) return "oidhw"; if (v == mkldnn_dhwio) return "dhwio"; if (v == mkldnn_goiw) return "goiw"; if (v == mkldnn_goihw) return "goihw"; if (v == mkldnn_hwigo) return "hwigo"; - if (v == mkldnn_hwigo_s8s8) return "hwigo_s8s8"; + if (v == mkldnn_giohw) return "giohw"; if (v == mkldnn_goidhw) return "goidhw"; if (v == mkldnn_ntc) return "ntc"; if (v == mkldnn_tnc) return "tnc"; @@ -87,24 +88,32 @@ const char *mkldnn_fmt2str(mkldnn_memory_format_t v) { if (v == mkldnn_ldigo) return "ldigo"; if (v == mkldnn_ldgoi) return "ldgoi"; if (v == mkldnn_ldgo) return "ldgo"; + if (v == mkldnn_nCw4c) return "nCw4c"; if (v == mkldnn_nCw8c) return "nCw8c"; if (v == mkldnn_nCw16c) return "nCw16c"; + if (v == mkldnn_nChw4c) return "nChw4c"; if (v == mkldnn_nChw8c) return "nChw8c"; if (v == mkldnn_nChw16c) return "nChw16c"; + if (v == mkldnn_nCdhw4c) return "nCdhw4c"; if (v == mkldnn_nCdhw8c) return "nCdhw8c"; if (v == mkldnn_nCdhw16c) return "nCdhw16c"; + if (v == mkldnn_Owi4o) return "Owi4o"; + if (v == mkldnn_OIw4i4o) return "OIw4i4o"; if (v == mkldnn_Owi8o) return "Owi8o"; if (v == mkldnn_OIw8i8o) return "OIw8i8o"; if (v == mkldnn_OIw8o8i) return "OIw8o8i"; if (v == mkldnn_OIw16i16o) return "OIw16i16o"; if (v == mkldnn_OIw16o16i) return "OIw16o16i"; + if (v == mkldnn_Oiw4o) return "Oiw4o"; if (v == mkldnn_Oiw16o) return "Oiw16o"; if (v == mkldnn_Owi16o) return "Owi16o"; if (v == mkldnn_OIw8i16o2i) return "OIw8i16o2i"; if (v == mkldnn_OIw8o16i2o) return "OIw8o16i2o"; if (v == mkldnn_IOw16o16i) return "IOw16o16i"; + if (v == mkldnn_hwio_s8s8) return "hwio_s8s8"; if (v == mkldnn_oIhw8i) return "oIhw8i"; if (v == mkldnn_oIhw16i) return "oIhw16i"; + if (v == mkldnn_OIhw4i4o) return "OIhw4i4o"; if (v == mkldnn_OIhw8i8o) return "OIhw8i8o"; if (v == mkldnn_OIhw16i16o) return "OIhw16i16o"; if (v == mkldnn_OIhw4i16o4i) return "OIhw4i16o4i"; @@ -115,48 +124,69 @@ const char *mkldnn_fmt2str(mkldnn_memory_format_t v) { if (v == mkldnn_OIhw16o16i) return "OIhw16o16i"; if (v == mkldnn_IOhw16o16i) return "IOhw16o16i"; if (v == mkldnn_Oihw8o) return "Oihw8o"; + if (v == mkldnn_Oihw4o) return "Oihw4o"; if (v == mkldnn_Oihw16o) return "Oihw16o"; if (v == mkldnn_Ohwi8o) return "Ohwi8o"; + if (v == mkldnn_Ohwi4o) return "Ohwi4o"; if (v == mkldnn_Ohwi16o) return "Ohwi16o"; if (v == mkldnn_OhIw16o4i) return "OhIw16o4i"; if (v == mkldnn_OhIw8o4i) return "OhIw8o4i"; if (v == mkldnn_OhIw8o4i_s8s8) return "OhIw8o4i_s8s8"; + if (v == mkldnn_OhIw8o32i) return "OhIw8o32i"; + if (v == mkldnn_OhIw16o32i) return "OhIw16o32i"; if (v == mkldnn_oIdhw8i) return "oIdhw8i"; if (v == mkldnn_oIdhw16i) return "oIdhw16i"; + if (v == mkldnn_OIdhw4i4o) return "OIdhw4i4o"; + if (v == mkldnn_Odhwi4o) return "Odhwi4o"; if (v == mkldnn_OIdhw8i8o) return "OIdhw8i8o"; if (v == mkldnn_OIdhw8o8i) return "OIdhw8o8i"; if (v == mkldnn_Odhwi8o) return "Odhwi8o"; if (v == mkldnn_OIdhw16i16o) return "OIdhw16i16o"; if (v == mkldnn_OIdhw16o16i) return "OIdhw16o16i"; + if (v == mkldnn_Oidhw4o) return "Oidhw4o"; if (v == mkldnn_Oidhw16o) return "Oidhw16o"; if (v == mkldnn_Odhwi16o) return "Odhwi16o"; if (v == mkldnn_OIdhw8i16o2i) return "OIdhw8i16o2i"; + if (v == mkldnn_gOwi4o) return "gOwi4o"; + if (v == mkldnn_gOIw4i4o) return "gOIw4i4o"; if (v == mkldnn_gOwi8o) return "gOwi8o"; if (v == mkldnn_gOIw8o8i) return "gOIw8o8i"; if (v == mkldnn_gOIw8i8o) return "gOIw8i8o"; if (v == mkldnn_gOIw16i16o) return "gOIw16i16o"; if (v == mkldnn_gOIw16o16i) return "gOIw16o16i"; + if (v == 
mkldnn_gOiw4o) return "gOiw4o"; if (v == mkldnn_gOiw16o) return "gOiw16o"; if (v == mkldnn_gOwi16o) return "gOwi16o"; if (v == mkldnn_gOIw8i16o2i) return "gOIw8i16o2i"; if (v == mkldnn_gOIw8o16i2o) return "gOIw8o16i2o"; if (v == mkldnn_gIOw16o16i) return "gIOw16o16i"; + if (v == mkldnn_hwigo_s8s8) return "hwigo_s8s8"; + if (v == mkldnn_gOIhw4i4o) return "gOIhw4i4o"; if (v == mkldnn_gOIhw8i8o) return "gOIhw8i8o"; if (v == mkldnn_gOIhw16i16o) return "gOIhw16i16o"; if (v == mkldnn_gOIhw4i16o4i) return "gOIhw4i16o4i"; if (v == mkldnn_gOIhw4i16o4i_s8s8) return "gOIhw4i16o4i_s8s8"; + if (v == mkldnn_gOIhw2i8o4i) return "gOIhw2i8o4i"; + if (v == mkldnn_gOIhw2i8o4i_s8s8) return "gOIhw2i8o4i_s8s8"; if (v == mkldnn_gOIhw8i16o2i) return "gOIhw8i16o2i"; if (v == mkldnn_gOIhw8o16i2o) return "gOIhw8o16i2o"; + if (v == mkldnn_gOIhw4o4i) return "gOIhw4o4i"; + if (v == mkldnn_gOIhw4o4i_s8s8) return "gOIhw4o4i_s8s8"; if (v == mkldnn_gOIhw8o8i) return "gOIhw8o8i"; if (v == mkldnn_gOIhw16o16i) return "gOIhw16o16i"; if (v == mkldnn_gIOhw16o16i) return "gIOhw16o16i"; if (v == mkldnn_gOihw8o) return "gOihw8o"; + if (v == mkldnn_gOihw4o) return "gOihw4o"; if (v == mkldnn_gOihw16o) return "gOihw16o"; if (v == mkldnn_gOhwi8o) return "gOhwi8o"; + if (v == mkldnn_gOhwi4o) return "gOhwi4o"; if (v == mkldnn_gOhwi16o) return "gOhwi16o"; if (v == mkldnn_Goihw8g) return "Goihw8g"; if (v == mkldnn_Goihw16g) return "Goihw16g"; + if (v == mkldnn_Goihw16g_s8s8) return "Goihw16g_s8s8"; if (v == mkldnn_gOhIw16o4i) return "gOhIw16o4i"; + if (v == mkldnn_gOIdhw4i4o) return "gOIdhw4i4o"; + if (v == mkldnn_gOdhwi4o) return "gOdhwi4o"; if (v == mkldnn_gOhIw8o4i) return "gOhIw8o4i"; if (v == mkldnn_gOhIw8o4i_s8s8) return "gOhIw8o4i_s8s8"; if (v == mkldnn_gOIdhw8i8o) return "gOIdhw8i8o"; @@ -165,11 +195,11 @@ const char *mkldnn_fmt2str(mkldnn_memory_format_t v) { if (v == mkldnn_gOIdhw8i16o2i) return "gOIdhw8i16o2i"; if (v == mkldnn_gOIdhw16i16o) return "gOIdhw16i16o"; if (v == mkldnn_gOIdhw16o16i) return "gOIdhw16o16i"; + if (v == mkldnn_gOidhw4o) return "gOidhw4o"; if (v == mkldnn_gOidhw16o) return "gOidhw16o"; if (v == mkldnn_gOdhwi16o) return "gOdhwi16o"; if (v == mkldnn_wino_fmt) return "wino_fmt"; - if (v == mkldnn_ldigo_p) return "ldigo_p"; - if (v == mkldnn_ldgoi_p) return "ldgoi_p"; + if (v == mkldnn_rnn_packed) return "rnn_packed"; if (v == mkldnn_format_last) return "format_last"; assert(!"unknown fmt"); return "unknown fmt"; @@ -202,21 +232,22 @@ const char *mkldnn_prim_kind2str(mkldnn_primitive_kind_t v) { if (v == mkldnn_deconvolution) return "deconvolution"; if (v == mkldnn_eltwise) return "eltwise"; if (v == mkldnn_depthwise) return "depthwise"; - if (v == mkldnn_relu) return "relu"; if (v == mkldnn_softmax) return "softmax"; if (v == mkldnn_pooling) return "pooling"; if (v == mkldnn_lrn) return "lrn"; if (v == mkldnn_batch_normalization) return "batch_normalization"; if (v == mkldnn_inner_product) return "inner_product"; - if (v == mkldnn_convolution_relu) return "convolution_relu"; if (v == mkldnn_rnn) return "rnn"; if (v == mkldnn_roi_pooling) return "roi_pooling"; + if (v == mkldnn_binary_convolution) return "binary_convolution"; + if (v == mkldnn_binarization) return "binarization"; assert(!"unknown prim_kind"); return "unknown prim_kind"; } const char *mkldnn_alg_kind2str(mkldnn_alg_kind_t v) { if (v == mkldnn_alg_kind_undef) return "undef"; + if (v == mkldnn_convolution_auto) return "convolution_auto"; if (v == mkldnn_convolution_direct) return "convolution_direct"; if (v == mkldnn_convolution_winograd) return 
"convolution_winograd"; if (v == mkldnn_eltwise_relu) return "eltwise_relu"; @@ -230,6 +261,8 @@ const char *mkldnn_alg_kind2str(mkldnn_alg_kind_t v) { if (v == mkldnn_eltwise_soft_relu) return "eltwise_soft_relu"; if (v == mkldnn_eltwise_logistic) return "eltwise_logistic"; if (v == mkldnn_eltwise_clamp) return "eltwise_clamp"; + if (v == mkldnn_eltwise_exp) return "eltwise_exp"; + if (v == mkldnn_eltwise_not) return "eltwise_not"; if (v == mkldnn_pooling_max) return "pooling_max"; if (v == mkldnn_pooling_avg_include_padding) return "pooling_avg_include_padding"; if (v == mkldnn_pooling_avg_exclude_padding) return "pooling_avg_exclude_padding"; @@ -246,8 +279,20 @@ const char *mkldnn_alg_kind2str(mkldnn_alg_kind_t v) { if (v == mkldnn_depthwise_prelu) return "depthwise_prelu"; if (v == mkldnn_roi_pooling_max) return "roi_pooling_max"; if (v == mkldnn_roi_pooling_bilinear) return "roi_pooling_bilinear"; + if (v == mkldnn_binary_convolution_direct) return "binary_convolution_direct"; + if (v == mkldnn_binarization_depthwise) return "binarization_depthwise"; assert(!"unknown alg_kind"); return "unknown alg_kind"; } +const char *mkldnn_rnn_direction2str(mkldnn_rnn_direction_t v) { + if (v == mkldnn_unidirectional_left2right) return "unidirectional_left2right"; + if (v == mkldnn_unidirectional_right2left) return "unidirectional_right2left"; + if (v == mkldnn_bidirectional_concat) return "bidirectional_concat"; + if (v == mkldnn_bidirectional_sum) return "bidirectional_sum"; + if (v == mkldnn_unidirectional) return "unidirectional"; + assert(!"unknown rnn_direction"); + return "unknown rnn_direction"; +} + diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/mkldnn_thread.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/mkldnn_thread.hpp index 9741c21..b65ddb1 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/mkldnn_thread.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/mkldnn_thread.hpp @@ -43,6 +43,8 @@ inline int mkldnn_get_thread_num() { return 0; } inline int mkldnn_in_parallel() { return 0; } inline void mkldnn_thr_barrier() {} +#define PRAGMA_OMP(...) + #elif MKLDNN_THR == MKLDNN_THR_OMP #include #define MKLDNN_THR_SYNC 1 @@ -55,6 +57,8 @@ inline void mkldnn_thr_barrier() { # pragma omp barrier } +#define PRAGMA_OMP(...) PRAGMA_MACRO(CHAIN2(omp, __VA_ARGS__)) + #elif MKLDNN_THR == MKLDNN_THR_TBB #include "tbb/task_arena.h" #include "tbb/parallel_for.h" @@ -67,6 +71,9 @@ inline int mkldnn_get_thread_num() { return tbb::this_task_arena::current_thread_index(); } inline int mkldnn_in_parallel() { return 0; } inline void mkldnn_thr_barrier() { assert(!"no barrier in TBB"); } + +#define PRAGMA_OMP(...) 
diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/mkldnn_thread_parallel_nd.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/mkldnn_thread_parallel_nd.hpp index 77bf53b..4a1f487 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/mkldnn_thread_parallel_nd.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/mkldnn_thread_parallel_nd.hpp @@ -56,9 +56,9 @@ void parallel(int nthr, F f) { template <typename T0, typename F> void for_nd(const int ithr, const int nthr, const T0 &D0, F f) { - T0 d0{0}, end{0}; - balance211(D0, nthr, ithr, d0, end); - for (; d0 < end; ++d0) f(d0); + T0 start{0}, end{0}; + balance211(D0, nthr, ithr, start, end); + for (T0 d0 = start; d0 < end; ++d0) f(d0); } template <typename T0, typename T1, typename F> @@ -143,6 +143,13 @@ void for_nd(const int ithr, const int nthr, const T0 &D0, const T1 &D1, } } +// Skip a lambda function in the parameter pack. +template <typename T> +constexpr size_t get_work_amount(const T &v) { return 1; } +template <typename T, typename ...Args> +constexpr size_t get_work_amount(const T &v, Args &&...args) +{ return (size_t)v * get_work_amount(utils::forward<Args>(args)...); } + /* parallel_nd and parallel_nd_in_omp section */ #if MKLDNN_THR != MKLDNN_THR_TBB @@ -151,9 +158,13 @@ void parallel_nd(Args &&...args) { #if MKLDNN_THR == MKLDNN_THR_SEQ for_nd(0, 1, utils::forward<Args>(args)...); #elif MKLDNN_THR == MKLDNN_THR_OMP -# pragma omp parallel - for_nd(mkldnn_get_thread_num(), mkldnn_get_num_threads(), - utils::forward<Args>(args)...); + const bool do_parallel = get_work_amount(utils::forward<Args>(args)...) > 1; +# pragma omp parallel if (do_parallel) + { + const int nthr = !do_parallel ? 1 : mkldnn_get_num_threads(); + const int ithr = !do_parallel ? 0 : mkldnn_get_thread_num(); + for_nd(ithr, nthr, utils::forward<Args>(args)...); + } #endif } #else // MKLDNN_THR != MKLDNN_THR_TBB diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/mkldnn_traits.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/mkldnn_traits.hpp index f5512b8..367a02a 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/mkldnn_traits.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/mkldnn_traits.hpp @@ -39,6 +39,7 @@ template <> struct prec_traits<data_type::s32> { typedef int32_t type; }; template <> struct prec_traits<data_type::s16> { typedef int16_t type; }; template <> struct prec_traits<data_type::s8> { typedef int8_t type; }; template <> struct prec_traits<data_type::u8> { typedef uint8_t type; }; +template <> struct prec_traits<data_type::bin> { typedef uint8_t type; }; template <> struct data_traits<float> { static constexpr data_type_t data_type = data_type::f32; }; @@ -71,9 +72,10 @@ PKIND_TRAITS_INST(pooling); PKIND_TRAITS_INST(lrn); PKIND_TRAITS_INST(batch_normalization); PKIND_TRAITS_INST(inner_product); -PKIND_TRAITS_INST(convolution_relu); PKIND_TRAITS_INST(rnn); PKIND_TRAITS_INST(roi_pooling); +PKIND_TRAITS_INST(binary_convolution); +PKIND_TRAITS_INST(binarization); #undef PKIND_TRAITS_INST } diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/nstl.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/nstl.hpp index d9d03a5..5e42c3f 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/nstl.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/nstl.hpp @@ -47,8 +47,8 @@ inline const T& min(const T& a, const T& b) { template <typename T> void swap(T& t1, T& t2) { T tmp(t1); - t1=t2; - t2=tmp; + t1 = t2; + t2 = tmp; } // Rationale: MKL-DNN needs numeric limits implementation that does not diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/primitive.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/primitive.hpp index d1c4742..e91a627 100644 --- 
diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/primitive.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/primitive.hpp
index d1c4742..e91a627 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/common/primitive.hpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/common/primitive.hpp
@@ -52,11 +52,11 @@ struct mkldnn_primitive: public mkldnn::impl::c_compatible {
     mkldnn_primitive(const mkldnn::impl::primitive_desc_t *pd,
             const input_vector &inputs, const output_vector &outputs)
-        : pd_(pd)
+        : pd_(pd->clone())
         , inputs_(inputs)
         , outputs_(outputs) {}
 
-    virtual ~mkldnn_primitive() {}
+    virtual ~mkldnn_primitive() { delete pd_; }
 
     /** returns primitive's engine */
     mkldnn::impl::engine_t *engine() const { return pd_->engine(); }
@@ -79,7 +79,7 @@ struct mkldnn_primitive: public mkldnn::impl::c_compatible {
      * Suppose engine has a task pool and for some reasons submission failed.
      * In this case primitive will set @p e's state to event::error */
-    virtual void execute(mkldnn::impl::event_t *e) = 0;
+    virtual void execute(mkldnn::impl::event_t *e) const = 0;
 
     /** returns data handle. Applicable for memory primitives only. */
     virtual mkldnn::impl::status_t get_data_handle(void **handle) const {
diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/primitive_attr.cpp b/inference-engine/thirdparty/mkl-dnn/src/common/primitive_attr.cpp
index 866c934..d48ab95 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/common/primitive_attr.cpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/common/primitive_attr.cpp
@@ -49,13 +49,6 @@ status_t scales_t::set(int count, int mask, const float *scales) {
     return status::success;
 }
 
-mkldnn::impl::status_t scales_t::scale(float factor) {
-    int cnt = (count_ == 1) ? scales_buf_size : count_;
-    for (int c = 0; c < cnt; ++c)
-        scales_[c] *= factor;
-    return status::success;
-}
-
 }
 }
 
@@ -77,7 +70,7 @@ status_t post_ops_t::append_eltwise(float scale, alg_kind_t alg, float alpha,
     bool known_alg = one_of(alg, eltwise_relu, eltwise_tanh, eltwise_elu,
             eltwise_square, eltwise_abs, eltwise_sqrt, eltwise_linear,
             eltwise_bounded_relu, eltwise_soft_relu, eltwise_logistic,
-            eltwise_clamp);
+            eltwise_clamp, eltwise_exp, eltwise_not);
     if (!known_alg)
         return invalid_arguments;
@@ -136,6 +129,24 @@ status_t post_ops_t::append_dw_conv(int in_h, int in_w, int ker_h, int ker_w, in
     return success;
 }
 
+status_t post_ops_t::append_binarization(alg_kind_t alg, const float* weights_data) {
+    using namespace mkldnn::impl::alg_kind;
+    bool known_alg = one_of(alg, binarization_depthwise);
+    if (!known_alg)
+        return invalid_arguments;
+
+    if (len_ == capacity)
+        return out_of_memory;
+
+    entry_[len_].kind = primitive_kind::binarization;
+    entry_[len_].binarization.alg = alg;
+    entry_[len_].binarization.weights_data = weights_data;
+
+    len_++;
+
+    return success;
+}
+
 status_t primitive_attr_t::set_round_mode(round_mode_t round_mode) {
     using namespace mkldnn::impl::round_mode;
@@ -320,6 +331,23 @@ status_t mkldnn_post_ops_get_params_eltwise(const post_ops_t *post_ops,
     return success;
 }
 
+status_t mkldnn_primitive_attr_set_rnn_data_qparams(
+        primitive_attr_t *attr, const float scale, const float shift) {
+    if (attr == nullptr)
+        return invalid_arguments;
+
+    return attr->rnn_data_qparams_.set(scale, shift);
+}
+
+status_t mkldnn_primitive_attr_set_rnn_weights_qparams(
+        primitive_attr_t *attr, int count, int mask, const float *scales) {
+    bool ok = !any_null(attr, scales) && count > 0 && mask >= 0;
+    if (!ok)
+        return invalid_arguments;
+
+    return attr->rnn_weights_qparams_.set(count, mask, scales);
+}
+
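The pair of functions above is the C-level surface for the new RNN quantization attributes (the rnn_data_qparams_ and rnn_weights_qparams_ members declared in primitive_attr.hpp below). A hypothetical call sequence for building an int8 RNN inference attr; the scale and shift values are illustrative placeholders and error checking is elided:

    #include "mkldnn.h"

    static mkldnn_primitive_attr_t make_int8_rnn_attr() {
        mkldnn_primitive_attr_t attr;
        mkldnn_primitive_attr_create(&attr);

        // Activations are quantized as u8 = f32 * scale + shift.
        mkldnn_primitive_attr_set_rnn_data_qparams(attr, 63.f, 64.f);

        // A single scale shared by all weights: count = 1, mask = 0.
        const float wei_scale = 127.f;
        mkldnn_primitive_attr_set_rnn_weights_qparams(attr, 1, 0, &wei_scale);

        return attr; // release later with mkldnn_primitive_attr_destroy()
    }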
 status_t mkldnn_post_ops_append_depthwise(post_ops_t *post_ops,
         alg_kind_t kind, const float* weights_data, const float* biases_data) {
     if (post_ops == nullptr)
@@ -375,4 +403,26 @@ status_t mkldnn_post_ops_get_params_dw_conv(const post_ops_t *post_ops,
     *biases_data = e.biases_data;
 
     return success;
-}
\ No newline at end of file
+}
+
+status_t mkldnn_post_ops_append_binarization(post_ops_t *post_ops, alg_kind_t kind, const float* weights_data) {
+    if (post_ops == nullptr)
+        return invalid_arguments;
+
+    return post_ops->append_binarization(kind, weights_data);
+}
+
+status_t mkldnn_post_ops_get_params_binarization(const post_ops_t *post_ops, int index, alg_kind_t *alg,
+        const float** weights_data) {
+    bool ok = true
+        && simple_get_params_check(post_ops, index, primitive_kind::binarization)
+        && !any_null(alg, weights_data);
+    if (!ok)
+        return invalid_arguments;
+
+    const auto &e = post_ops->entry_[index].binarization;
+    *alg = e.alg;
+    *weights_data = e.weights_data;
+
+    return success;
+}
diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/primitive_attr.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/primitive_attr.hpp
index 3f56d99..949449f 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/common/primitive_attr.hpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/common/primitive_attr.hpp
@@ -27,6 +27,20 @@ namespace mkldnn {
 namespace impl {
 
+struct rnn_data_qparams_t : public c_compatible {
+    rnn_data_qparams_t() : scale_(1.), shift_(0.) {}
+    bool has_default_values() const { return (scale_ == 1. && shift_ == 0.); }
+
+    status_t set(float scale, float shift) {
+        scale_ = scale;
+        shift_ = shift;
+        return status::success;
+    }
+
+    float scale_;
+    float shift_;
+};
+
 struct scales_t: public c_compatible {
     scales_t(): count_(1), mask_(0), scales_(scales_buf_)
     { set(1.); }
@@ -54,7 +68,6 @@ struct scales_t: public c_compatible {
     status_t set(int count, int mask, const float *scales);
     status_t set(float single_scale) { return this->set(1, 0, &single_scale); }
-    status_t scale(float factor);
 
     int count_;
     int mask_;
@@ -79,13 +92,15 @@ private:
 struct mkldnn_post_ops: public mkldnn::impl::c_compatible {
     struct entry_t {
+        struct eltwise_t {
+            mkldnn::impl::alg_kind_t alg;
+            float scale, alpha, beta;
+        };
+
         mkldnn::impl::primitive_kind_t kind;
         union {
             struct { float scale; } sum;
-            struct {
-                mkldnn::impl::alg_kind_t alg;
-                float scale, alpha, beta;
-            } eltwise;
+            eltwise_t eltwise;
             struct {
                 mkldnn::impl::alg_kind_t alg;
                 const float* weights_data;
@@ -101,34 +116,45 @@ struct mkldnn_post_ops: public mkldnn::impl::c_compatible {
                 const float* weights_data;
                 const float* biases_data;
             } dw_conv;
+            struct {
+                mkldnn::impl::alg_kind_t alg;
+                const float* weights_data;
+            } binarization;
         };
 
+        bool is_eltwise(bool require_scale_one = true) const {
+            using namespace mkldnn::impl;
+            return kind == primitive_kind::eltwise
+                && IMPLICATION(require_scale_one, eltwise.scale == 1.f);
+        }
+
         bool is_relu(bool require_scale_one = true,
                 bool require_nslope_zero = true) const {
             using namespace mkldnn::impl;
-            return kind == primitive_kind::eltwise
-                && IMPLICATION(require_scale_one, eltwise.scale == 1.f)
+            return is_eltwise(require_scale_one)
                 && eltwise.alg == alg_kind::eltwise_relu
                 && IMPLICATION(require_nslope_zero, eltwise.alpha == 0.f);
         }
+
         bool is_sum(bool require_scale_one = true) const {
             using namespace mkldnn::impl;
             return kind == primitive_kind::sum
                 && IMPLICATION(require_scale_one, sum.scale == 1.f);
         }
 
-        bool is_eltwise(bool require_scale_one = true) const {
-            using namespace mkldnn::impl;
-            return kind == primitive_kind::eltwise
-                && IMPLICATION(require_scale_one, eltwise.scale == 1.f);
-        }
+
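And a matching usage sketch for the binarization post-op API defined above. mkldnn_post_ops_create, mkldnn_primitive_attr_set_post_ops, and mkldnn_post_ops_destroy are the stock attr entry points; the thresholds buffer is an illustrative per-output-channel array that must outlive the primitive, since append_binarization stores only the raw pointer:

    #include "mkldnn.h"

    static void attach_binarization(mkldnn_primitive_attr_t attr,
            const float *thresholds /* one per output channel, caller-owned */) {
        mkldnn_post_ops_t ops;
        mkldnn_post_ops_create(&ops);

        // binarization_depthwise compares every output value against its
        // channel's threshold and yields a binary result.
        mkldnn_post_ops_append_binarization(ops, mkldnn_binarization_depthwise,
                thresholds);

        mkldnn_primitive_attr_set_post_ops(attr, ops); // the attr keeps a copy
        mkldnn_post_ops_destroy(ops);
    }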
         bool is_depthwise() const {
             using namespace mkldnn::impl;
             return kind == primitive_kind::depthwise;
         }
+
         bool is_dw_conv() const {
             using namespace mkldnn::impl;
             return kind == primitive_kind::convolution;
         }
+
+        bool is_binarization() const {
+            using namespace mkldnn::impl;
+            return kind == primitive_kind::binarization;
+        }
     };
 
     mkldnn_post_ops(): len_(0) {}
@@ -141,6 +167,7 @@ struct mkldnn_post_ops: public mkldnn::impl::c_compatible {
     mkldnn::impl::status_t append_dw_conv(int in_h, int in_w, int ker_h, int ker_w,
             int str_h, int str_w,
             const float* weights_data, const float* biases_data);
+    mkldnn::impl::status_t append_binarization(mkldnn::impl::alg_kind_t alg, const float* weights_data);
 
     int find(mkldnn::impl::primitive_kind_t kind, int start = 0,
             int stop = -1) const {
@@ -173,7 +200,9 @@ struct mkldnn_primitive_attr: public mkldnn::impl::c_compatible {
         return true
             && round_mode_ == mkldnn::impl::round_mode::nearest
             && output_scales_.has_default_values()
-            && post_ops_.has_default_values() ;
+            && post_ops_.has_default_values()
+            && rnn_data_qparams_.has_default_values()
+            && rnn_weights_qparams_.has_default_values();
     }
 
     mkldnn::impl::status_t set_round_mode(
@@ -184,6 +213,8 @@ struct mkldnn_primitive_attr: public mkldnn::impl::c_compatible {
     mkldnn::impl::round_mode_t round_mode_;
     mkldnn::impl::scales_t output_scales_;
     mkldnn::impl::post_ops_t post_ops_;
+    mkldnn::impl::rnn_data_qparams_t rnn_data_qparams_;
+    mkldnn::impl::scales_t rnn_weights_qparams_;
 };
 
 #endif
diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/primitive_desc.cpp b/inference-engine/thirdparty/mkl-dnn/src/common/primitive_desc.cpp
index c88aaeb..c288aef 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/common/primitive_desc.cpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/common/primitive_desc.cpp
@@ -35,6 +35,9 @@ status_t primitive_desc_t::query(query_t what, int idx, void *result) const {
     case query::engine: *(engine_t**)result = engine(); break;
     case query::primitive_kind: *(primitive_kind_t*)result = kind(); break;
 
+    case query::memory_consumption_s64:
+        *(ptrdiff_t*)result = scratchpad_registry().size(); break;
+
     case query::op_d: if (idx != 0 || op_desc() == nullptr)
                           return invalid_arguments;
         *(const_c_op_desc_t *)result
diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/primitive_desc.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/primitive_desc.hpp
index e13b156..542d38d 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/common/primitive_desc.hpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/common/primitive_desc.hpp
@@ -20,6 +20,7 @@
 #include "mkldnn.h"
 
 #include "c_types_map.hpp"
+#include "memory_tracking.hpp"
 #include "nstl.hpp"
 #include "type_helpers.hpp"
 #include "primitive_attr.hpp"
@@ -47,6 +48,11 @@ struct mkldnn_primitive_desc: public mkldnn::impl::c_compatible {
     virtual void init_info() {}
     const char *info() const { return info_; }
 
+    mkldnn::impl::memory_tracking::registry_t &scratchpad_registry()
+    { return scratchpad_registry_; }
+    const mkldnn::impl::memory_tracking::registry_t &scratchpad_registry() const
+    { return scratchpad_registry_; }
+
     virtual const mkldnn::impl::op_desc_t *op_desc() const = 0;
 
 #   define DECLARE_PD_STUB(stub) \
@@ -101,6 +107,8 @@ protected:
     mkldnn::impl::primitive_kind_t kind_;
 
     char info_[MKLDNN_VERBOSE_BUF_LEN];
+
+    mkldnn::impl::memory_tracking::registry_t scratchpad_registry_;
 };
 
 #define DECLARE_COMMON_PD_t(impl_name, ...)
\ diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/rnn.cpp b/inference-engine/thirdparty/mkl-dnn/src/common/rnn.cpp index 432763b..3696743 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/rnn.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/rnn.cpp @@ -19,6 +19,7 @@ #include "c_types_map.hpp" #include "type_helpers.hpp" #include "utils.hpp" +#include "cpu/gemm/os_blas.hpp" using namespace mkldnn::impl; using namespace mkldnn::impl::status; @@ -63,7 +64,7 @@ status_t mkldnn_rnn_cell_desc_init(rnn_cell_desc_t *rnn_cell_desc, && IMPLICATION(cell_kind == vanilla_rnn, one_of(act_f, eltwise_relu, eltwise_tanh, eltwise_logistic)); if (!args_ok) - return status::invalid_arguments; + return invalid_arguments; auto rcd = mkldnn_rnn_cell_desc_t(); @@ -75,7 +76,7 @@ status_t mkldnn_rnn_cell_desc_init(rnn_cell_desc_t *rnn_cell_desc, *rnn_cell_desc = rcd; - return status::success; + return success; } int mkldnn_rnn_cell_get_gates_count(const rnn_cell_desc_t *rnn_cell_desc) { @@ -100,6 +101,161 @@ int mkldnn_rnn_cell_get_states_count(const rnn_cell_desc_t *rnn_cell_desc) { return 0; } +status_t check_data_type_consistency_fwd(const rnn_cell_desc_t *rnn_cell_desc, + prop_kind_t prop_kind, const memory_desc_t *src_layer_desc, + const memory_desc_t *src_iter_desc, + const memory_desc_t *weights_layer_desc, + const memory_desc_t *weights_iter_desc, const memory_desc_t *bias_desc, + const memory_desc_t *dst_layer_desc, + const memory_desc_t *dst_iter_desc) { + using namespace data_type; + data_type_t src_layer_dt = src_layer_desc->data_type; + data_type_t dst_layer_dt = dst_layer_desc->data_type; + data_type_t weights_iter_dt = weights_iter_desc->data_type; + data_type_t weights_layer_dt = weights_layer_desc->data_type; + + bool is_f32 = everyone_is(f32, src_layer_dt, dst_layer_dt, weights_iter_dt, + weights_layer_dt) + && IMPLICATION(!is_zero_md(src_iter_desc), + src_iter_desc->data_type == f32) + && IMPLICATION(!is_zero_md(dst_iter_desc), + dst_iter_desc->data_type == f32) + && IMPLICATION(!is_zero_md(bias_desc), bias_desc->data_type == f32); + +#if USE_MKL_PACKED_GEMM + bool is_u8u8u8 = src_layer_dt == u8 + && IMPLICATION(!is_zero_md(src_iter_desc), + src_iter_desc->data_type == u8) + && IMPLICATION(!is_zero_md(dst_iter_desc), + dst_iter_desc->data_type == u8) + && one_of(dst_layer_dt, u8, f32) + && everyone_is(s8, weights_iter_dt, weights_layer_dt) + && IMPLICATION(!is_zero_md(bias_desc), bias_desc->data_type == f32); + + bool is_f32u8f32 = src_layer_dt == u8 + && IMPLICATION(!is_zero_md(src_iter_desc), + src_iter_desc->data_type == f32) + && IMPLICATION(!is_zero_md(dst_iter_desc), + dst_iter_desc->data_type == f32) + && one_of(dst_layer_dt, u8, f32) + && everyone_is(s8, weights_iter_dt, weights_layer_dt) + && IMPLICATION(!is_zero_md(bias_desc), bias_desc->data_type == f32); + + bool is_inference = prop_kind == prop_kind::forward_inference; + bool is_lstm = rnn_cell_desc->cell_kind == mkldnn_vanilla_lstm; + + return (is_f32 || ((is_u8u8u8 || is_f32u8f32) && is_lstm && is_inference)) + ? success + : unimplemented; +#else + return is_f32 ? 
success : unimplemented; +#endif +} + +status_t check_dim_consistency(const rnn_cell_desc_t *rnn_cell_desc, + rnn_direction_t direction, int L, int D, int T, int N, int S, int G, + int SLC, int SIC, int DLC, int DIC, const memory_desc_t *src_layer_desc, + const memory_desc_t *src_iter_desc, + const memory_desc_t *weights_layer_desc, + const memory_desc_t *weights_iter_desc, const memory_desc_t *bias_desc, + const memory_desc_t *dst_layer_desc, + const memory_desc_t *dst_iter_desc) { + bool args_ok; + + // * algorithm specific + args_ok = true + && IMPLICATION(rnn_cell_desc->cell_kind == alg_kind::vanilla_gru, + DIC == SIC); + if (!args_ok) return invalid_arguments; + int extra_bias = + rnn_cell_desc->cell_kind == alg_kind::gru_linear_before_reset; + + // * on num layers + args_ok = true + && L == weights_layer_desc->dims[0] + && L == weights_iter_desc->dims[0] + && IMPLICATION(!is_zero_md(bias_desc), L == bias_desc->dims[0]) + && IMPLICATION(!is_zero_md(src_iter_desc), L == src_iter_desc->dims[0]) + && IMPLICATION(!is_zero_md(dst_iter_desc), L == dst_iter_desc->dims[0]); + if (!args_ok) return invalid_arguments; + + // * on num directions + args_ok = true + && D == weights_layer_desc->dims[1] + && D == weights_iter_desc->dims[1] + && IMPLICATION(!is_zero_md(bias_desc), D == bias_desc->dims[1]) + && IMPLICATION(!is_zero_md(src_iter_desc), D == src_iter_desc->dims[1]) + && IMPLICATION(!is_zero_md(dst_iter_desc), D == dst_iter_desc->dims[1]); + if (!args_ok) return invalid_arguments; + + // * on num iterations + args_ok = true + && T == src_layer_desc->dims[0] + && T == dst_layer_desc->dims[0]; + if (!args_ok) return invalid_arguments; + + // * on mb + args_ok = true + && N == src_layer_desc->dims[1] + && N == dst_layer_desc->dims[1] + && IMPLICATION(!is_zero_md(src_iter_desc), N == src_iter_desc->dims[3]) + && IMPLICATION(!is_zero_md(dst_iter_desc), N == dst_iter_desc->dims[3]); + if (!args_ok) return invalid_arguments; + + // * on num gates + args_ok = true + && G == mkldnn_rnn_cell_get_gates_count(rnn_cell_desc) + && G == weights_layer_desc->dims[3] + && G == weights_iter_desc->dims[3] + && IMPLICATION(!is_zero_md(bias_desc), + G + extra_bias == bias_desc->dims[2]); + if (!args_ok) return invalid_arguments; + + // * on num states + args_ok = true + && S == mkldnn_rnn_cell_get_states_count(rnn_cell_desc) + && IMPLICATION(!is_zero_md(src_iter_desc), S == src_iter_desc->dims[2]) + && IMPLICATION(!is_zero_md(dst_iter_desc), S == dst_iter_desc->dims[2]); + if (!args_ok) return invalid_arguments; + + // * on slc + args_ok = true + && SLC == weights_layer_desc->dims[2] + && SLC == src_layer_desc->dims[2]; + if (!args_ok) return invalid_arguments; + + // * on sic + args_ok = true + && SIC == weights_iter_desc->dims[2] + && IMPLICATION(!is_zero_md(src_iter_desc), + SIC == src_iter_desc->dims[4]); + if (!args_ok) return invalid_arguments; + + // * on dlc + int dlc_multiplier = (direction == mkldnn_bidirectional_concat) ? 
2 : 1; + args_ok = true + && DLC == dlc_multiplier * DIC + && DLC == dst_layer_desc->dims[2]; + if (!args_ok) return invalid_arguments; + + // * on dic + args_ok = true + && DIC == weights_layer_desc->dims[4] + && DIC == weights_iter_desc->dims[4] + && IMPLICATION(!is_zero_md(bias_desc), DIC == bias_desc->dims[3]) + && IMPLICATION(!is_zero_md(dst_iter_desc), + DIC == dst_iter_desc->dims[4]); + if (!args_ok) return invalid_arguments; + + // * unrolling/fusion conditions + args_ok = true + && IMPLICATION(L > 1, (dlc_multiplier * SLC) == DLC) + && IMPLICATION(T > 1, SIC == DIC); + if (!args_ok) return invalid_arguments; + + return success; +} + status_t MKLDNN_API mkldnn_rnn_forward_desc_init(mkldnn_rnn_desc_t *rnn_desc, prop_kind_t prop_kind, const rnn_cell_desc_t *rnn_cell_desc, const rnn_direction_t direction, const memory_desc_t *src_layer_desc, @@ -111,43 +267,33 @@ status_t MKLDNN_API mkldnn_rnn_forward_desc_init(mkldnn_rnn_desc_t *rnn_desc, bool args_ok = true && rnn_cell_desc != nullptr && !any_null(src_layer_desc, weights_layer_desc, weights_iter_desc, dst_layer_desc); - if (!args_ok) - return invalid_arguments; - - int DIC = 0, L = 0; - if (weights_layer_desc->ndims) { - DIC = weights_layer_desc->dims[4]; - L = weights_layer_desc->dims[0]; - } else if (weights_iter_desc->ndims) { - DIC = weights_iter_desc->dims[4]; - L = weights_iter_desc->dims[0]; - } else { - assert(!"cannot query cell state size"); - return unimplemented; - } + if (!args_ok) return invalid_arguments; + //check dimensions consistency + int L = weights_layer_desc->dims[0]; + int T = src_layer_desc->dims[0]; + int N = src_layer_desc->dims[1]; const int D = one_of(direction, mkldnn_unidirectional_left2right, mkldnn_unidirectional_right2left) ? 1 : 2; - const int DLC = (direction == mkldnn_bidirectional_concat ? 
2 : 1) * DIC; - - args_ok = args_ok && D == weights_layer_desc->dims[1] - && D == weights_iter_desc->dims[1] - && DIC == weights_layer_desc->dims[4] - && DIC == weights_iter_desc->dims[4] - && DLC == dst_layer_desc->dims[2] && L == weights_iter_desc->dims[0] - && IMPLICATION(!is_zero_md(dst_iter_desc), true - && DIC == dst_iter_desc->dims[4] - && L == dst_iter_desc->dims[0]) - && IMPLICATION(!is_zero_md(bias_desc), L == bias_desc->dims[0]) - && IMPLICATION( - !is_zero_md(src_iter_desc), L == src_iter_desc->dims[0]) - && IMPLICATION(rnn_cell_desc->cell_kind == alg_kind::vanilla_gru, - DIC == weights_iter_desc->dims[2]); - if (!args_ok) - return invalid_arguments; - + int G = mkldnn_rnn_cell_get_gates_count(rnn_cell_desc); + int S = mkldnn_rnn_cell_get_states_count(rnn_cell_desc); + int SLC = src_layer_desc->dims[2]; + int SIC = weights_iter_desc->dims[2]; + int DLC = dst_layer_desc->dims[2]; + int DIC = weights_layer_desc->dims[4]; + + CHECK(check_dim_consistency(rnn_cell_desc, direction, L, D, T, N, S, + G, SLC, SIC, DLC, DIC, src_layer_desc, src_iter_desc, + weights_layer_desc, weights_iter_desc, bias_desc, dst_layer_desc, + dst_iter_desc)); + + CHECK(check_data_type_consistency_fwd(rnn_cell_desc, prop_kind, + src_layer_desc, src_iter_desc, weights_layer_desc, + weights_iter_desc, bias_desc, dst_layer_desc, dst_iter_desc)); + + // Create the descriptor mkldnn_rnn_desc_t rd = zero_rnn_desc(); rd.primitive_kind = primitive_kind::rnn; @@ -179,28 +325,16 @@ status_t MKLDNN_API mkldnn_rnn_backward_desc_init(mkldnn_rnn_desc_t *rnn_desc, const memory_desc_t *diff_weights_layer_desc, const memory_desc_t *diff_weights_iter_desc, const memory_desc_t *diff_bias_desc, - const memory_desc_t *diff_dst_layer, + const memory_desc_t *diff_dst_layer_desc, const memory_desc_t *diff_dst_iter_desc) { bool args_ok = true && !any_null(src_layer_desc, weights_layer_desc, weights_iter_desc, dst_layer_desc, diff_src_layer_desc, diff_weights_layer_desc, diff_weights_iter_desc, - diff_dst_layer); + diff_dst_layer_desc); if (!args_ok) return invalid_arguments; - int DIC = 0, L = 0; - if (weights_layer_desc->ndims) { - DIC = weights_layer_desc->dims[4]; - L = weights_layer_desc->dims[0]; - } else if (weights_iter_desc->ndims) { - DIC = weights_iter_desc->dims[4]; - L = weights_iter_desc->dims[0]; - } else { - assert(!"cannot query cell state size"); - return unimplemented; - } - auto xnor_md = [=](const memory_desc_t *a_md, const memory_desc_t *b_md) { return is_zero_md(a_md) == is_zero_md(b_md); }; @@ -211,27 +345,32 @@ status_t MKLDNN_API mkldnn_rnn_backward_desc_init(mkldnn_rnn_desc_t *rnn_desc, if (!args_ok) return invalid_arguments; - int D = one_of(direction, mkldnn_unidirectional_left2right, - mkldnn_unidirectional_right2left) ? + //check dimensions consistency + int L = weights_layer_desc->dims[0]; + int T = src_layer_desc->dims[0]; + int N = src_layer_desc->dims[1]; + const int D = one_of(direction, mkldnn_unidirectional_left2right, + mkldnn_unidirectional_right2left) ? 1 : 2; - int DLC = (direction == mkldnn_bidirectional_concat ? 
2 : 1) * DIC; - - args_ok = args_ok && D == weights_layer_desc->dims[1] - && D == weights_iter_desc->dims[1] - && DIC == weights_layer_desc->dims[4] - && DIC == weights_iter_desc->dims[4] - && DLC == dst_layer_desc->dims[2] && L == weights_iter_desc->dims[0] - && IMPLICATION(!is_zero_md(dst_iter_desc), true - && DIC == dst_iter_desc->dims[4] - && L == dst_iter_desc->dims[0]) - && IMPLICATION(!is_zero_md(bias_desc), L == bias_desc->dims[0]) - && IMPLICATION( - !is_zero_md(src_iter_desc), L == src_iter_desc->dims[0]) - && IMPLICATION(rnn_cell_desc->cell_kind == alg_kind::vanilla_gru, - DIC == weights_iter_desc->dims[2]); - if (!args_ok) - return invalid_arguments; + int G = mkldnn_rnn_cell_get_gates_count(rnn_cell_desc); + int S = mkldnn_rnn_cell_get_states_count(rnn_cell_desc); + int SLC = src_layer_desc->dims[2]; + int SIC = weights_iter_desc->dims[2]; + int DLC = dst_layer_desc->dims[2]; + int DIC = weights_layer_desc->dims[4]; + + status_t st = check_dim_consistency(rnn_cell_desc, direction, L, D, T, N, S, + G, SLC, SIC, DLC, DIC, src_layer_desc, src_iter_desc, + weights_layer_desc, weights_iter_desc, bias_desc, dst_layer_desc, + dst_iter_desc); + if (st != success) return st; + + st = check_dim_consistency(rnn_cell_desc, direction, L, D, T, N, S, + G, SLC, SIC, DLC, DIC, diff_src_layer_desc, diff_src_iter_desc, + diff_weights_layer_desc, diff_weights_iter_desc, diff_bias_desc, + diff_dst_layer_desc, diff_dst_iter_desc); + if (st != success) return st; mkldnn_rnn_desc_t rd = zero_rnn_desc(); @@ -252,7 +391,7 @@ status_t MKLDNN_API mkldnn_rnn_backward_desc_init(mkldnn_rnn_desc_t *rnn_desc, rd.diff_weights_layer_desc = copy_maybe_null(diff_weights_layer_desc); rd.diff_weights_iter_desc = copy_maybe_null(diff_weights_iter_desc); rd.diff_bias_desc = copy_maybe_null(diff_bias_desc); - rd.diff_dst_layer_desc = copy_maybe_null(diff_dst_layer); + rd.diff_dst_layer_desc = copy_maybe_null(diff_dst_layer_desc); rd.diff_dst_iter_desc = copy_maybe_null(diff_dst_iter_desc); *rnn_desc = rd; diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/rnn_pd.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/rnn_pd.hpp index 5b11d5a..53facc8 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/rnn_pd.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/rnn_pd.hpp @@ -62,153 +62,6 @@ struct rnn_pd_t : public primitive_desc_t { prop_kind::forward_inference); } - inline size_t ws_states_size() { - return (size_t)(L() + 1) * D() * (T() + 1) * S() * MB() * S_GLD(); - } - - inline size_t ws_diff_states_size() { - return (size_t)(L() + 1) * D() * (T() + 1) * (S() + 1) * MB() * S_GLD(); - } - - inline size_t ws_weights_layer_size() { - size_t ld = is_fwd() ? G_GLD() : S_GLD(); - size_t not_ld = is_fwd() ? SLC() : G() * DIC(); - return (size_t)(L() * D() * ld * not_ld); - } - - inline size_t ws_weights_iter_size() { - size_t ld = is_fwd() ? G_GLD() : S_GLD(); - size_t not_ld = is_fwd() ? 
SIC() : G() * DIC(); - return (size_t)(L() * D() * ld * not_ld); - } - - inline size_t ws_diff_weights_layer_size() { - return (size_t)(L() * D() * SLC() * GC()); - } - - inline size_t ws_diff_weights_iter_size() { - return (size_t)(L() * D() * SIC() * GC()); - } - - inline size_t ws_gates_size() { - return (size_t) L() * D() * T() * MB() * GC(); - } - - inline size_t ws_cell_comp_size() { - return (size_t)is_lbr() * MB() * GC(); - } - - inline size_t ws_grid_comp_size() { - return (size_t)is_lbr() * is_training() * L() * D() * T() * MB() * DIC(); - } - - inline int ws_per_cell() { - return is_lbr() * MB() * DIC(); - } - - // returns the scratchpad size if use_workspace is true - // returns the workspace size if use_workspace is false, - // and all scratchpad boolean are false - inline size_t set_offsets( bool use_workspace, - size_t &ws_gates_offset, size_t &ws_states_offset, - size_t &ws_diff_states_offset, size_t &ws_grid_comp_offset, - bool use_ws_cell_comp, size_t &ws_cell_comp_offset, - bool copy_weights_layer_, size_t &ws_weights_layer_offset, - bool copy_weights_iter_, size_t &ws_weights_iter_offset, - bool copy_diff_weights_layer, size_t &ws_diff_weights_layer_offset, - bool copy_diff_weights_iter, size_t &ws_diff_weights_iter_offset) { - const size_t page_size = 4096; // 2097152; - size_t current_offset; - - /* Mandatory workspaces: go to workspace if use_workspace, scratchpad otherwise */ - current_offset = 0; // assumes the workspace base pointer is page aligned - ws_gates_offset = current_offset; - current_offset += ws_gates_size(); - - current_offset = utils::rnd_up(current_offset, page_size); - ws_states_offset = current_offset; - current_offset += ws_states_size(); - - current_offset = utils::rnd_up(current_offset, page_size); - ws_diff_states_offset = current_offset; - current_offset += ws_diff_states_size(); - - current_offset = utils::rnd_up(current_offset, page_size); - ws_grid_comp_offset = current_offset; - current_offset += ws_grid_comp_size(); - - // ws_cell_comp is optional - if (use_ws_cell_comp) { - current_offset = utils::rnd_up(current_offset, page_size); - ws_cell_comp_offset = current_offset; - current_offset += ws_cell_comp_size(); - } - - /* Optional scratchpads */ - // Assumes the scratchpad base pointer is page aligned. - // If use_workspace, the following goes to scratchpad alone, - // otherwise, all goes to scratchpad and continue incrementing offset - current_offset = use_workspace ? 
0 : current_offset; - - if (copy_weights_layer_) { - current_offset = utils::rnd_up(current_offset, page_size); - ws_weights_layer_offset = current_offset; - current_offset += ws_weights_layer_size(); - } - - if (copy_weights_iter_) { - current_offset = utils::rnd_up(current_offset, page_size); - ws_weights_iter_offset = current_offset; - current_offset += ws_weights_iter_size(); - } - - if (copy_diff_weights_layer) { - current_offset = utils::rnd_up(current_offset, page_size); - ws_diff_weights_layer_offset = current_offset; - current_offset += ws_diff_weights_layer_size(); - } - - if (copy_diff_weights_iter) { - current_offset = utils::rnd_up(current_offset, page_size); - ws_diff_weights_iter_offset = current_offset; - current_offset += ws_diff_weights_iter_size(); - } - - return current_offset; - } - - inline size_t get_ws_size() { - size_t ws_gates_offset, ws_states_offset, - ws_diff_states_offset,ws_grid_comp_offset, - ws_cell_comp_offset, ws_weights_layer_offset, - ws_weights_iter_offset, ws_diff_weights_layer_offset, - ws_diff_weights_iter_offset; - return set_offsets( false, - ws_gates_offset, ws_states_offset, - ws_diff_states_offset, ws_grid_comp_offset, - is_lbr(), ws_cell_comp_offset, - false, ws_weights_layer_offset, - false, ws_weights_iter_offset, - false, ws_diff_weights_layer_offset, - false, ws_diff_weights_iter_offset); - } - - inline size_t get_scratchpad_size(bool use_workspace) { - size_t ws_gates_offset, ws_states_offset, - ws_diff_states_offset,ws_grid_comp_offset, - ws_cell_comp_offset, ws_weights_layer_offset, - ws_weights_iter_offset, ws_diff_weights_layer_offset, - ws_diff_weights_iter_offset; - return set_offsets(use_workspace, - ws_gates_offset, ws_states_offset, - ws_diff_states_offset, ws_grid_comp_offset, - false, ws_cell_comp_offset, - false, ws_weights_layer_offset, - false, ws_weights_iter_offset, - false, ws_diff_weights_layer_offset, - false, ws_diff_weights_iter_offset); - } - int T() const { return desc_.src_layer_desc.dims[0]; } int MB() const { return desc_.src_layer_desc.dims[1]; } @@ -223,110 +76,6 @@ struct rnn_pd_t : public primitive_desc_t { int DLC() const { return desc_.dst_layer_desc.dims[2]; } - int get_good_ld(int dim){ - // we want matrices leading dimentions to be 64-byte aligned, - // and not divisible by 256 to avoid 4K aliasing effects - int ld = utils::rnd_up(dim, (int)(64/sizeof(float))); - return (ld % 256 == 0) ? ld + 64/sizeof(float) : ld; - } - - int WIC() { - // wic will be the leading dimension of our B matrices - return get_good_ld(nstl::max(SLC(), nstl::max(SIC(), DIC()))); - } - - int GC() { - // gc will be the leading dimension of our C matrices - return get_good_ld(G() * DIC()); - } - - /* replacement functions for meaningless WIC and GC: - - LD stands for leading dimension - - GLD stands for good leading dimension - - NLD stands for not leading dimension (so the other dim) - */ - int G_GLD() { - // good leading dimension for the gates - // C matrices for fwd, B matrices for bwd - return get_good_ld(G() * DIC()); - } - - int S_GLD() { - // good leading dimension for the states - // B matrices for fwd, B matrices for bwd_w, C matrices for bwd_d - return get_good_ld(nstl::max(SLC(), nstl::max(SIC(), DIC()))); - } - - int W_GLD() { - // good leading dimension for the weights - return is_fwd() ? G_GLD() : S_GLD(); - } - - int DW_GLD() { - // good leading dimension for the diff weights - return weights_copy_enabled() ? 
G_GLD() : G() * DIC(); - } - - int weights_copy_enabled() { return (T() > 1); } - - int get_weights_ld(int feature_dim) { - return is_fwd() ? G() * DIC() : feature_dim; - } - - int get_weights_nld(int feature_dim) { - return !(is_fwd()) ? G() * DIC() : feature_dim; - } - - int WL_LD() { - return get_weights_ld(SLC()); - } - - int WL_GLD() { - return weights_copy_enabled() ? get_good_ld(WL_LD()) : WL_LD(); - } - - int WI_LD() { - return get_weights_ld(SIC()); - } - - int WI_GLD() { - return weights_copy_enabled() ? get_good_ld(WI_LD()) : WI_LD(); - } - - int DWL_LD() { - return G() * DIC(); - } - - int DWL_GLD() { - return weights_copy_enabled() ? get_good_ld(DWL_LD()) : DWL_LD(); - } - - int DWI_LD() { - return G() * DIC(); - } - - int DWI_GLD() { - return weights_copy_enabled() ? get_good_ld(DWI_LD()) : DWI_LD(); - } - - int WL_NLD() { - return get_weights_nld(SLC()); - } - - int WI_NLD() { - return get_weights_nld(SIC()); - } - - int DWL_NLD() { - return SLC(); - } - - int DWI_NLD() { - return SIC(); - } - - int S() const { return mkldnn_rnn_cell_get_states_count(&desc_.cell_desc); } - bool with_bias() const { return !memory_desc_wrapper(desc_.bias_desc).is_zero(); } @@ -397,7 +146,7 @@ struct rnn_fwd_pd_t : public rnn_pd_t { struct rnn_bwd_pd_t : public rnn_pd_t { typedef rnn_bwd_pd_t base_class; - typedef rnn_bwd_pd_t hint_class; + typedef rnn_fwd_pd_t hint_class; using rnn_pd_t::rnn_pd_t; virtual ~rnn_bwd_pd_t() {} diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/roi_pooling.cpp b/inference-engine/thirdparty/mkl-dnn/src/common/roi_pooling.cpp index ba78dbd..f1f2334 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/roi_pooling.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/roi_pooling.cpp @@ -33,7 +33,7 @@ status_t roi_pooling_desc_init(roi_pooling_desc_t *roi_pool_desc, memory_desc_t *src_descs, int num_src, const memory_desc_t *dst_desc, int pooled_h, int pooled_w, double spatial_scale) { - roi_pooling_desc_t pd = {}; + auto pd = roi_pooling_desc_t(); pd.primitive_kind = primitive_kind::roi_pooling; pd.prop_kind = prop_kind; pd.pooled_h = pooled_h; diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/scratchpad.cpp b/inference-engine/thirdparty/mkl-dnn/src/common/scratchpad.cpp index 30de4a4..31a56c2 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/scratchpad.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/scratchpad.cpp @@ -79,14 +79,14 @@ struct global_scratchpad_t : public scratchpad_t { } private: - THREAD_LOCAL static char *scratchpad_; - THREAD_LOCAL static size_t size_; - THREAD_LOCAL static unsigned int reference_count_; + thread_local static char *scratchpad_; + thread_local static size_t size_; + thread_local static unsigned int reference_count_; }; -THREAD_LOCAL char *global_scratchpad_t::scratchpad_ = nullptr; -THREAD_LOCAL size_t global_scratchpad_t::size_ = 0; -THREAD_LOCAL unsigned int global_scratchpad_t::reference_count_ = 0; +thread_local char *global_scratchpad_t::scratchpad_ = nullptr; +thread_local size_t global_scratchpad_t::size_ = 0; +thread_local unsigned int global_scratchpad_t::reference_count_ = 0; /* diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/softmax_pd.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/softmax_pd.hpp index cb156e7..44032f7 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/softmax_pd.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/softmax_pd.hpp @@ -102,9 +102,9 @@ struct softmax_bwd_pd_t: public primitive_desc_t { virtual const memory_pd_t 
*output_pd(int index = 0) const override { return index == 0 ? diff_src_pd() : nullptr; } - virtual int n_inputs() const override { return 2; } - virtual int n_outputs() const override - { return 1 + (workspace_pd() != nullptr); } + virtual int n_inputs() const override + { return 2 + (workspace_pd() != nullptr); } + virtual int n_outputs() const override { return 1; } virtual status_t query(query_t what, int idx, void *result) const override { diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/type_helpers.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/type_helpers.hpp index a7cf1a1..06a0e2f 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/type_helpers.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/type_helpers.hpp @@ -64,6 +64,7 @@ inline size_t data_type_size(data_type_t data_type) { case s16: return sizeof(prec_traits::type); case s8: return sizeof(prec_traits::type); case u8: return sizeof(prec_traits::type); + case bin: return sizeof(prec_traits::type); case data_type::undef: default: assert(!"unknown data_type"); } @@ -94,26 +95,32 @@ inline memory_format_t format_normalize(const memory_format_t fmt) { nc, ncw, nwc, + nCw4c, nCw8c, nCw16c, nchw, nhwc, chwn, + nChw4c, nChw8c, nChw16c, ncdhw, ndhwc, + nCdhw4c, nCdhw8c, nCdhw16c, oi, io, oiw, wio, + Owi4o, + OIw4i4o, Owi8o, OIw8i8o, OIw8o8i, OIw16i16o, OIw16o16i, + Oiw4o, Oiw16o, Owi16o, OIw8i16o2i, @@ -122,20 +129,25 @@ inline memory_format_t format_normalize(const memory_format_t fmt) { oihw, ihwo, hwio, + iohw, hwio_s8s8, dhwio, oidhw, + OIdhw4i4o, + Odhwi4o, OIdhw8i8o, OIdhw8o8i, Odhwi8o, OIdhw16i16o, OIdhw16o16i, + Oidhw4o, Oidhw16o, Odhwi16o, oIhw8i, oIhw16i, oIdhw8i, oIdhw16i, + OIhw4i4o, OIhw8i8o, OIhw16i16o, OIhw4i16o4i, @@ -145,18 +157,25 @@ inline memory_format_t format_normalize(const memory_format_t fmt) { OIhw8o16i2o, OIhw8o8i, OhIw8o4i, + OhIw8o32i, + OhIw16o32i, OhIw8o4i_s8s8, OIhw16o16i, IOhw16o16i, + Oihw4o, Oihw16o, Ohwi8o, + Ohwi4o, Ohwi16o, goiw, + gOwi4o, + gOIw4i4o, gOwi8o, gOIw8i8o, gOIw8o8i, gOIw16i16o, gOIw16o16i, + gOiw4o, gOiw16o, gOwi16o, gOIw8i16o2i, @@ -164,31 +183,43 @@ inline memory_format_t format_normalize(const memory_format_t fmt) { gIOw16o16i, goihw, hwigo, + giohw, hwigo_s8s8, + gOIhw4i4o, gOIhw8i8o, gOIhw16i16o, gOIhw4i16o4i, gOIhw4i16o4i_s8s8, + gOIhw2i8o4i, + gOIhw2i8o4i_s8s8, gOIhw8i16o2i, gOIdhw8i16o2i, gOIhw8o16i2o, + gOIhw4o4i, + gOIhw4o4i_s8s8, gOIhw8o8i, gOhIw8o4i, gOhIw8o4i_s8s8, gOIhw16o16i, gIOhw16o16i, + gOihw4o, gOihw16o, gOhwi8o, + gOhwi4o, gOhwi16o, Goihw8g, Goihw16g, + Goihw16g_s8s8, goidhw, + gOIdhw4i4o, + gOdhwi4o, gOIdhw8i8o, gOIdhw8o8i, gOdhwi8o, gOIdhw16i16o, gOIdhw16o16i, gOidhw16o, + gOidhw4o, gOdhwi16o, ntc, tnc, @@ -202,9 +233,9 @@ inline memory_format_t format_normalize(const memory_format_t fmt) { inline bool is_format_double_blocked(memory_format_t fmt) { using namespace memory_format; return utils::one_of(OIw8o16i2o, OIw8i16o2i, OIhw8i16o2i, OIdhw8i16o2i, - OIhw8o16i2o, OIhw4i16o4i, OIhw4i16o4i_s8s8, gOIw8o16i2o, gOIw8i16o2i, - gOIhw8i16o2i, gOIdhw8i16o2i, gOIhw8o16i2o, gOIhw4i16o4i, - gOIhw4i16o4i_s8s8); + OIhw8o16i2o, OIhw4i16o4i, OIhw4i16o4i_s8s8, + gOIw8o16i2o, gOIw8i16o2i, gOIhw8i16o2i, gOIdhw8i16o2i, gOIhw8o16i2o, + gOIhw4i16o4i, gOIhw4i16o4i_s8s8, gOIhw2i8o4i, gOIhw2i8o4i_s8s8); } inline bool blocking_desc_is_equal(const blocking_desc_t &lhs, @@ -232,6 +263,22 @@ inline bool wino_desc_is_equal(const wino_data_t &lhs, && lhs.r == rhs.r; } +inline bool rnn_packed_desc_is_equal( + const rnn_packed_data_t &lhs, const 
rnn_packed_data_t &rhs) { + bool ok = lhs.format == rhs.format && lhs.n_parts == rhs.n_parts + && lhs.offset_compensation == rhs.offset_compensation + && lhs.size == rhs.size + && lhs.n == rhs.n; + if (!ok) + return false; + + for (int i = 0; i < rhs.n_parts; i++) + ok = ok && lhs.parts[i] == rhs.parts[i]; + for (int i = 0; i < rhs.n_parts; i++) + ok = ok && lhs.part_pack_size[i] == rhs.part_pack_size[i]; + return ok; +} + inline bool operator==(const memory_desc_t &lhs, const memory_desc_t &rhs) { assert(lhs.primitive_kind == mkldnn::impl::primitive_kind::memory); assert(rhs.primitive_kind == mkldnn::impl::primitive_kind::memory); @@ -247,6 +294,9 @@ inline bool operator==(const memory_desc_t &lhs, const memory_desc_t &rhs) { else if (lhs.format == memory_format::wino_fmt) return wino_desc_is_equal(lhs.layout_desc.wino_desc, rhs.layout_desc.wino_desc); + else if (lhs.format == memory_format::rnn_packed) + return rnn_packed_desc_is_equal(lhs.layout_desc.rnn_packed_desc, + rhs.layout_desc.rnn_packed_desc); return true; } @@ -276,6 +326,7 @@ inline data_type_t default_accum_data_type(data_type_t src_dt, if (one_of(f32, src_dt, dst_dt)) return f32; if (one_of(s32, src_dt, dst_dt)) return s32; if (one_of(s16, src_dt, dst_dt)) return s32; + if (one_of(bin, src_dt, dst_dt)) return s32; if (one_of(s8, src_dt, dst_dt) || one_of(u8, src_dt, dst_dt)) return s32; @@ -298,10 +349,13 @@ inline data_type_t default_accum_data_type(data_type_t src_dt, if ((src_dt == u8 || src_dt == s8) && wei_dt == s8 && one_of(dst_dt, f32, s32, s8, u8)) return s32; + if (src_dt == bin && wei_dt == bin && (dst_dt == f32 || dst_dt == bin)) + return s32; } else if (prop_kind == backward_data) { if (src_dt == s32 && wei_dt == s16 && dst_dt == s16) return s32; - if (one_of(src_dt, f32, s32, s8, u8) && wei_dt == s8 && dst_dt == u8) + if (one_of(src_dt, f32, s32, s8, u8) && wei_dt == s8 && + one_of(dst_dt, s8, u8)) return s32; } else if (prop_kind == backward_weights) { if (src_dt == s16 && wei_dt == s32 && dst_dt == s16) diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/utils.cpp b/inference-engine/thirdparty/mkl-dnn/src/common/utils.cpp index 055681f..dd3f21a 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/utils.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/utils.cpp @@ -25,12 +25,16 @@ #include #include #endif -#include "xmmintrin.h" +#include "mkldnn.h" #include "utils.hpp" #include "mkldnn_thread.hpp" #include "mkldnn.h" +#if defined(MKLDNN_X86_64) +#include "xmmintrin.h" +#endif + namespace mkldnn { namespace impl { @@ -66,9 +70,9 @@ int mkldnn_getenv(char *value, const char *name, int length) { } static bool dump_jit_code; +static bool initialized; bool mkldnn_jit_dump() { - static bool initialized = false; if (!initialized) { const int len = 2; char env_dump[len] = {0}; @@ -89,9 +93,10 @@ FILE *mkldnn_fopen(const char *filename, const char *mode) { #endif } -THREAD_LOCAL unsigned int mxcsr_save; +thread_local unsigned int mxcsr_save; void set_rnd_mode(round_mode_t rnd_mode) { +#if defined(MKLDNN_X86_64) mxcsr_save = _mm_getcsr(); unsigned int mxcsr = mxcsr_save & ~(3u << 13); switch (rnd_mode) { @@ -100,10 +105,15 @@ void set_rnd_mode(round_mode_t rnd_mode) { default: assert(!"unreachable"); } if (mxcsr != mxcsr_save) _mm_setcsr(mxcsr); +#else + UNUSED(rnd_mode); +#endif } void restore_rnd_mode() { +#if defined(MKLDNN_X86_64) _mm_setcsr(mxcsr_save); +#endif } void *malloc(size_t size, int alignment) { @@ -127,13 +137,22 @@ void free(void *p) { #endif } +// Atomic operations +int32_t 
mkldnn_fetch_and_add(int32_t *dst, int32_t val) { +#ifdef _WIN32 + return InterlockedExchangeAdd(reinterpret_cast(dst), val); +#else + return __sync_fetch_and_add(dst, val); +#endif +} + static Xbyak::util::Cpu cpu_; unsigned int get_cache_size(int level, bool per_core) { unsigned int l = level - 1; // Currently, if XByak is not able to fetch the cache topology // we default to 32KB of L1, 512KB of L2 and 1MB of L3 per core. - if (cpu_.data_cache_levels == 0){ + if (cpu_.getDataCacheLevels() == 0){ const int L1_cache_per_core = 32000; const int L2_cache_per_core = 512000; const int L3_cache_per_core = 1024000; @@ -145,9 +164,9 @@ unsigned int get_cache_size(int level, bool per_core) { default: return 0; } } - if (l < cpu_.data_cache_levels) { - return cpu_.data_cache_size[l] - / (per_core ? cpu_.cores_sharing_data_cache[l] : 1); + if (l < cpu_.getDataCacheLevels()) { + return cpu_.getDataCacheSize(l) + / (per_core ? cpu_.getCoresSharingDataCache(l) : 1); } else return 0; } @@ -155,7 +174,14 @@ unsigned int get_cache_size(int level, bool per_core) { } } +mkldnn_status_t mkldnn_set_jit_dump(int dump) { + using namespace mkldnn::impl::status; + if (dump < 0) return invalid_arguments; + mkldnn::impl::dump_jit_code = dump; + mkldnn::impl::initialized = true; + return success; +} + unsigned int mkldnn_get_cache_size(int level, int per_core) { return mkldnn::impl::get_cache_size(level, per_core != 0); } - diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/utils.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/utils.hpp index 01fa467..59b8add 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/utils.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/utils.hpp @@ -21,6 +21,11 @@ #include #include #include +#include + +#if defined(__x86_64__) || defined(_M_X64) +#define MKLDNN_X86_64 +#endif #define MSAN_ENABLED 0 #if defined(__has_feature) @@ -50,17 +55,10 @@ static_assert(sizeof(void*) == 8, "Intel(R) MKL-DNN supports 64 bit only"); #define IMPLICATION(cause, effect) (!(cause) || !!(effect)) -#ifdef _WIN32 +#if defined(_WIN32) && !defined(__GNUC__) #define __PRETTY_FUNCTION__ __FUNCSIG__ #endif -#ifdef __APPLE__ -// older XCode doesn't support thread_local -#define THREAD_LOCAL __thread -#else -#define THREAD_LOCAL thread_local -#endif - namespace utils { /* a bunch of std:: analogues to be compliant with any msvs version @@ -181,6 +179,9 @@ inline typename remove_reference::type rnd_dn(const T a, const U b) { return (a / b) * b; } +template T *align_ptr(T *ptr, uintptr_t alignment) +{ return (T *)(((uintptr_t)ptr + alignment - 1) & ~(alignment - 1)); } + template inline U this_block_size(const T offset, const U max, const V block_size) { assert(offset < max); @@ -245,6 +246,24 @@ inline T pick(size_t i, const T &x0, Args &&... args) { return i == 0 ? 
x0 : pick(i - 1, utils::forward(args)...); } +template +T pick_by_prop_kind(prop_kind_t prop_kind, const T &val_fwd_inference, + const T &val_fwd_training, const T &val_bwd_d, const T &val_bwd_w) { + switch (prop_kind) { + case prop_kind::forward_inference: return val_fwd_inference; + case prop_kind::forward_training: return val_fwd_training; + case prop_kind::backward_data: return val_bwd_d; + case prop_kind::backward_weights: return val_bwd_w; + default: assert(!"unsupported prop_kind"); + } + return T(); +} + +template +T pick_by_prop_kind(prop_kind_t prop_kind, + const T &val_fwd, const T &val_bwd_d, const T &val_bwd_w) +{ return pick_by_prop_kind(prop_kind, val_fwd, val_fwd, val_bwd_d, val_bwd_w); } + template struct array_offset_calculator { template @@ -287,6 +306,7 @@ private: void *malloc(size_t size, int alignment); void free(void *p); +int32_t mkldnn_fetch_and_add(int32_t *dst, int32_t val); struct c_compatible { enum { default_alignment = 64 }; diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/verbose.cpp b/inference-engine/thirdparty/mkl-dnn/src/common/verbose.cpp index e1af658..f2a0e17 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/verbose.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/verbose.cpp @@ -15,31 +15,58 @@ *******************************************************************************/ #include -#ifdef _WIN32 -#include -#else +#ifndef _WIN32 #include #endif #include "mkldnn.h" +#include "mkldnn_version.h" #include "c_types_map.hpp" #include "verbose.hpp" +#include "cpu_isa_traits.hpp" + +/* MKL-DNN CPU ISA info */ +#define ISA_ANY "No instruction set specific optimizations" +#define SSE42 "Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2)" +#define AVX "Intel(R) Advanced Vector Extensions (Intel(R) AVX)" +#define AVX2 "Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2)" +#define AVX512_COMMON "Intel(R) Advanced Vector Extensions 512 (Intel(R) " \ + "AVX-512)" +#define AVX512_CORE "Intel(R) Advanced Vector Extensions 512 (Intel(R) " \ + "AVX-512) with AVX512BW, AVX512VL, and AVX512DQ extensions" +#define AVX512_CORE_VNNI "Intel(R) AVX512-Deep Learning Boost (Intel(R) " \ + "AVX512-DL Boost)" +#define AVX512_MIC "Intel(R) Advanced Vector Extensions 512 (Intel(R) " \ + "AVX-512) with AVX512CD, AVX512ER, and AVX512PF extensions" +#define AVX512_MIC_4OPS "Intel(R) Advanced Vector Extensions 512 (Intel(R) " \ + "AVX-512) with AVX512_4FMAPS and AVX512_4VNNIW extensions" namespace mkldnn { namespace impl { static verbose_t verbose; +static bool initialized; +static bool version_printed = false; const verbose_t *mkldnn_verbose() { #if !defined(DISABLE_VERBOSE) - static int initialized = 0; if (!initialized) { const int len = 2; char val[len] = {0}; if (mkldnn_getenv(val, "MKLDNN_VERBOSE", len) == 1) verbose.level = atoi(val); - initialized = 1; + initialized = true; } + if (!version_printed && verbose.level > 0) { + printf("mkldnn_verbose,info," + "Intel(R) MKL-DNN v%d.%d.%d (Git Hash %s),%s\n", + mkldnn_version()->major, mkldnn_version()->minor, + mkldnn_version()->patch, mkldnn_version()->hash, + get_isa_info()); + version_printed = true; + } +#else + verbose.level = 0; #endif return &verbose; } @@ -59,12 +86,36 @@ double get_msec() { #endif } +const char *get_isa_info() { + using namespace mkldnn::impl::cpu; + if (mayiuse(avx512_mic_4ops)) return AVX512_MIC_4OPS; + if (mayiuse(avx512_mic)) return AVX512_MIC; + if (mayiuse(avx512_core_vnni)) return AVX512_CORE_VNNI; + if (mayiuse(avx512_core)) return AVX512_CORE; + if 
(mayiuse(avx512_common)) return AVX512_COMMON; + if (mayiuse(avx2)) return AVX2; + if (mayiuse(avx)) return AVX; + if (mayiuse(sse42)) return SSE42; + return ISA_ANY; +} + } } -mkldnn_status_t mkldnn_verbose_set(int level) { +mkldnn_status_t mkldnn_set_verbose(int level) { using namespace mkldnn::impl::status; if (level < 0 || level > 2) return invalid_arguments; mkldnn::impl::verbose.level = level; + mkldnn::impl::initialized = true; return success; } + +const mkldnn_version_t *mkldnn_version() { + static mkldnn_version_t ver = { + MKLDNN_VERSION_MAJOR, + MKLDNN_VERSION_MINOR, + MKLDNN_VERSION_PATCH, + MKLDNN_VERSION_HASH}; + return &ver; +} + diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/verbose.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/verbose.hpp index e48e94a..3e4381c 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/verbose.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/verbose.hpp @@ -31,13 +31,14 @@ struct verbose_t { const verbose_t *mkldnn_verbose(); double get_msec(); +const char *get_isa_info(); #if !defined(DISABLE_VERBOSE) #include #define MKLDNN_VERBOSE_BUF_LEN 1024 -#define MKLDNN_VERBOSE_DAT_LEN 64 +#define MKLDNN_VERBOSE_DAT_LEN 128 #define MKLDNN_VERBOSE_AUX_LEN 384 #define MKLDNN_VERBOSE_PRB_LEN 384 @@ -55,6 +56,36 @@ inline void verbose_templ(char *buffer, mkldnn_primitive_kind_t prim_kind, mkldnn_prop_kind2str(prop_kind), data_str, aux_str, prb_str); } +inline void format_mem_desc_str_generic(char *str, int len, + const memory_desc_t *md) { + auto ndims = md->ndims; + auto dims = md->dims; + int l = 0; + for (int d = 0; d < ndims - 1; ++d) + l += snprintf(str + l, len - l, "%tdx", dims[d]); + snprintf(str + l, len - l, "%td", dims[ndims - 1]); +} + +// XXX: Outputs strings corresponding to memory formats used for data tensors. 
+inline void format_mem_desc_str(char *str, int len, const memory_desc_t *md) { + auto ndims = md->ndims; + auto dims = md->dims; + if (ndims == 1) + snprintf(str, len, "x%td", dims[0]); + else if (ndims == 2) + snprintf(str, len, "mb%tdic%td", dims[0], dims[1]); + else if (ndims == 3) + snprintf(str, len, "mb%tdic%tdiw%td", dims[0], dims[1], dims[2]); + else if (ndims == 4) + snprintf(str, len, "mb%tdic%tdih%tdiw%td", + dims[0], dims[1], dims[2], dims[3]); + else if (ndims == 5) + snprintf(str, len, "mb%tdic%tdid%tdih%tdiw%td", + dims[0], dims[1], dims[2], dims[3], dims[4]); + else + format_mem_desc_str_generic(str, len, md); +} + template static void init_info_bnorm(pd_t *s, char *buffer) { DECL_DAT_AUX_PRB_STRS(); @@ -66,17 +97,7 @@ template static void init_info_bnorm(pd_t *s, char *buffer) { snprintf(aux_str, MKLDNN_VERBOSE_AUX_LEN, "flags:%u", s->desc()->flags); - if (s->ndims() == 5) - { - snprintf(prb_str, MKLDNN_VERBOSE_PRB_LEN, - "mb%dic%did%dih%diw%d", s->MB(), s->C(), s->D(), s->H(), s->W()); - } else if (s->ndims() == 4) { - snprintf(prb_str, MKLDNN_VERBOSE_PRB_LEN, - "mb%dic%dih%diw%d", s->MB(), s->C(), s->H(), s->W()); - } else if (s->ndims() == 2) { - snprintf(prb_str, MKLDNN_VERBOSE_PRB_LEN, - "mb%dic%d", s->MB(), s->C()); - } + format_mem_desc_str(prb_str, MKLDNN_VERBOSE_PRB_LEN, s->src_pd()->desc()); verbose_templ(buffer, s->kind(), s->name(), s->desc()->prop_kind, dat_str, aux_str, prb_str); @@ -85,16 +106,16 @@ template static void init_info_bnorm(pd_t *s, char *buffer) { template static void init_info_conv(pd_t *s, char *buffer) { DECL_DAT_AUX_PRB_STRS(); - auto fmt_src = (s->cdesc()->prop_kind == prop_kind::backward_data + auto fmt_src = (s->desc()->prop_kind == prop_kind::backward_data ? s->diff_src_pd() : s->src_pd())->desc()->format; - auto fmt_wei = (s->cdesc()->prop_kind == prop_kind::backward_weights + auto fmt_wei = (s->desc()->prop_kind == prop_kind::backward_weights ? s->diff_weights_pd(0) : s->weights_pd(0))->desc()->format; auto fmt_bia = s->with_bias() - ? (s->cdesc()->prop_kind == prop_kind::backward_weights + ? (s->desc()->prop_kind == prop_kind::backward_weights ? s->diff_weights_pd(1) : s->weights_pd(1))->desc()->format : memory_format::undef; - auto fmt_dst = (s->cdesc()->prop_kind == prop_kind::backward_data - || s->cdesc()->prop_kind == prop_kind::backward_weights + auto fmt_dst = (s->desc()->prop_kind == prop_kind::backward_data + || s->desc()->prop_kind == prop_kind::backward_weights ? 
s->diff_dst_pd() : s->dst_pd())->desc()->format; snprintf(dat_str, MKLDNN_VERBOSE_DAT_LEN, "fsrc:%s fwei:%s fbia:%s fdst:%s", @@ -102,29 +123,49 @@ template static void init_info_conv(pd_t *s, char *buffer) { mkldnn_fmt2str(fmt_bia), mkldnn_fmt2str(fmt_dst)); snprintf(aux_str, MKLDNN_VERBOSE_AUX_LEN, - "alg:%s", mkldnn_alg_kind2str(s->cdesc()->alg_kind)); + "alg:%s", mkldnn_alg_kind2str(s->desc()->alg_kind)); if (s->ndims() == 5) { - snprintf(prb_str, MKLDNN_VERBOSE_PRB_LEN, - "mb%d_g%dic%doc%d" - "_id%dod%dkd%dsd%ddd%dpd%d" - "_ih%doh%dkh%dsh%ddh%dph%d" - "_iw%dow%dkw%dsw%ddw%dpw%d", - s->MB(), s->G(), s->IC(), s->OC(), - s->ID(), s->OD(), s->KD(), s->KSD(), s->KDD(), s->padFront(), - s->IH(), s->OH(), s->KH(), s->KSH(), s->KDH(), s->padT(), - s->IW(), s->OW(), s->KW(), s->KSW(), s->KDW(), s->padL()); + if (s->with_groups()) + snprintf(prb_str, MKLDNN_VERBOSE_PRB_LEN, + "mb%d_g%dic%doc%d" + "_id%dod%dkd%dsd%ddd%dpd%d" + "_ih%doh%dkh%dsh%ddh%dph%d" + "_iw%dow%dkw%dsw%ddw%dpw%d", + s->MB(), s->G(), s->IC(), s->OC(), + s->ID(), s->OD(), s->KD(), s->KSD(), s->KDD(), s->padFront(), + s->IH(), s->OH(), s->KH(), s->KSH(), s->KDH(), s->padT(), + s->IW(), s->OW(), s->KW(), s->KSW(), s->KDW(), s->padL()); + else + snprintf(prb_str, MKLDNN_VERBOSE_PRB_LEN, + "mb%d_ic%doc%d" + "_id%dod%dkd%dsd%ddd%dpd%d" + "_ih%doh%dkh%dsh%ddh%dph%d" + "_iw%dow%dkw%dsw%ddw%dpw%d", + s->MB(), s->IC(), s->OC(), + s->ID(), s->OD(), s->KD(), s->KSD(), s->KDD(), s->padFront(), + s->IH(), s->OH(), s->KH(), s->KSH(), s->KDH(), s->padT(), + s->IW(), s->OW(), s->KW(), s->KSW(), s->KDW(), s->padL()); } else { - snprintf(prb_str, MKLDNN_VERBOSE_PRB_LEN, - "mb%d_g%dic%doc%d" - "_ih%doh%dkh%dsh%ddh%dph%d" - "_iw%dow%dkw%dsw%ddw%dpw%d", - s->MB(), s->G(), s->IC(), s->OC(), - s->IH(), s->OH(), s->KH(), s->KSH(), s->KDH(), s->padT(), - s->IW(), s->OW(), s->KW(), s->KSW(), s->KDW(), s->padL()); + if (s->with_groups()) + snprintf(prb_str, MKLDNN_VERBOSE_PRB_LEN, + "mb%d_g%dic%doc%d" + "_ih%doh%dkh%dsh%ddh%dph%d" + "_iw%dow%dkw%dsw%ddw%dpw%d", + s->MB(), s->G(), s->IC(), s->OC(), + s->IH(), s->OH(), s->KH(), s->KSH(), s->KDH(), s->padT(), + s->IW(), s->OW(), s->KW(), s->KSW(), s->KDW(), s->padL()); + else + snprintf(prb_str, MKLDNN_VERBOSE_PRB_LEN, + "mb%d_ic%doc%d" + "_ih%doh%dkh%dsh%ddh%dph%d" + "_iw%dow%dkw%dsw%ddw%dpw%d", + s->MB(), s->IC(), s->OC(), + s->IH(), s->OH(), s->KH(), s->KSH(), s->KDH(), s->padT(), + s->IW(), s->OW(), s->KW(), s->KSW(), s->KDW(), s->padL()); } - verbose_templ(buffer, s->kind(), s->name(), s->cdesc()->prop_kind, dat_str, + verbose_templ(buffer, s->kind(), s->name(), s->desc()->prop_kind, dat_str, aux_str, prb_str); } @@ -140,12 +181,7 @@ template static void init_info_shuffle(pd_t *s, char *buffer) { snprintf(aux_str, MKLDNN_VERBOSE_AUX_LEN, "axis:%d group_size:%d", s->axis(), s->group_size()); - int l = 0; - for (int d = 0; d < md->ndims - 1; ++d) - l += snprintf(prb_str + l, MKLDNN_VERBOSE_PRB_LEN - l, - "%dx", md->dims[d]); - snprintf(prb_str + l, MKLDNN_VERBOSE_PRB_LEN - l, - "%d", md->dims[md->ndims - 1]); + format_mem_desc_str_generic(prb_str, MKLDNN_VERBOSE_PRB_LEN, md); verbose_templ(buffer, s->kind(), s->name(), s->desc()->prop_kind, dat_str, aux_str, prb_str); @@ -163,8 +199,7 @@ template static void init_info_eltwise(pd_t *s, char *buffer) { snprintf(aux_str, MKLDNN_VERBOSE_AUX_LEN, "alg:%s", mkldnn_alg_kind2str(s->desc()->alg_kind)); - snprintf(prb_str, MKLDNN_VERBOSE_PRB_LEN, - "mb%dic%dih%diw%d", s->MB(), s->C(), s->H(), s->W()); + format_mem_desc_str(prb_str, MKLDNN_VERBOSE_PRB_LEN, 
s->src_pd()->desc()); verbose_templ(buffer, s->kind(), s->name(), s->desc()->prop_kind, dat_str, aux_str, prb_str); @@ -227,8 +262,7 @@ template static void init_info_lrn(pd_t *s, char *buffer) { snprintf(aux_str, MKLDNN_VERBOSE_AUX_LEN, "alg:%s", mkldnn_alg_kind2str(s->desc()->alg_kind)); - snprintf(prb_str, MKLDNN_VERBOSE_PRB_LEN, - "mb%dic%dih%diw%d", s->MB(), s->C(), s->H(), s->W()); + format_mem_desc_str(prb_str, MKLDNN_VERBOSE_PRB_LEN, s->src_pd()->desc()); verbose_templ(buffer, s->kind(), s->name(), s->desc()->prop_kind, dat_str, aux_str, prb_str); @@ -246,12 +280,7 @@ template static void init_info_mem(pd_t *s, char *buffer) { snprintf(aux_str, MKLDNN_VERBOSE_AUX_LEN, "num:%d", s->n_inputs()); - int l = 0; - for (int d = 0; d < o_md->ndims - 1; ++d) - l += snprintf(prb_str + l, MKLDNN_VERBOSE_PRB_LEN - l, - "%dx", o_md->dims[d]); - snprintf(prb_str + l, MKLDNN_VERBOSE_PRB_LEN - l, - "%d", o_md->dims[o_md->ndims - 1]); + format_mem_desc_str_generic(prb_str, MKLDNN_VERBOSE_PRB_LEN, o_md); verbose_templ(buffer, s->kind(), s->name(), prop_kind::undef, dat_str, aux_str, prb_str); @@ -293,15 +322,15 @@ template static void init_info_pool(pd_t *s, char *buffer) { template static void init_info_softmax(pd_t *s, char *buffer) { DECL_DAT_AUX_PRB_STRS(); - auto fmt_data = (s->desc()->prop_kind == prop_kind::backward_data - ? s->diff_src_pd() : s->src_pd())->desc()->format; + auto md = (s->desc()->prop_kind == prop_kind::backward_data + ? s->diff_src_pd() : s->src_pd())->desc(); + auto fmt_data = md->format; auto fmt_diff = s->desc()->prop_kind == prop_kind::backward_data ? s->diff_src_pd()->desc()->format : memory_format::undef; snprintf(dat_str, MKLDNN_VERBOSE_DAT_LEN, "fdata:%s fdiff:%s", mkldnn_fmt2str(fmt_data), mkldnn_fmt2str(fmt_diff)); - snprintf(prb_str, MKLDNN_VERBOSE_PRB_LEN, - "mb%dic%dih%diw%d", s->MB(), s->C(), s->H(), s->W()); + format_mem_desc_str(prb_str, MKLDNN_VERBOSE_PRB_LEN, md); verbose_templ(buffer, s->kind(), s->name(), s->desc()->prop_kind, dat_str, aux_str, prb_str); @@ -311,15 +340,50 @@ template static void init_info_softmax(pd_t *s, char *buffer) { template static void init_info_rnn(pd_t *s, char *buffer) { DECL_DAT_AUX_PRB_STRS(); - alg_kind_t alg_kind = s->desc()->cell_desc.cell_kind; + const mkldnn::impl::memory_desc_t *src_lay_md, *src_iter_md, *wei_lay_md, + *wei_iter_md, *bias_md, *dst_lay_md, *dst_iter_md; + if (s->desc()->prop_kind != prop_kind::backward_data) { + src_lay_md = s->src_pd(0)->desc(); + src_iter_md = s->src_pd(1) ? s->src_pd(1)->desc() : nullptr; + wei_lay_md = s->weights_pd(0)->desc(); + wei_iter_md = s->weights_pd(1)->desc(); + bias_md = s->weights_pd(2)->desc(); + dst_lay_md = s->dst_pd(0)->desc(); + dst_iter_md = s->dst_pd(1) ? s->dst_pd(1)->desc() : nullptr; + } else { + src_lay_md = s->diff_src_pd(0)->desc(); + src_iter_md = s->diff_src_pd(1) ? s->diff_src_pd(1)->desc() : nullptr; + wei_lay_md = s->diff_weights_pd(0)->desc(); + wei_iter_md = s->diff_weights_pd(1)->desc(); + bias_md = s->diff_weights_pd(2)->desc(); + dst_lay_md = s->diff_dst_pd(0)->desc(); + dst_iter_md = s->diff_dst_pd(1) ? 
s->diff_dst_pd(1)->desc() : nullptr; + } + + alg_kind_t alg_kind = s->cell_kind(); + rnn_direction_t rnn_dir = s->direction(); snprintf(aux_str, MKLDNN_VERBOSE_AUX_LEN, - "alg:%s", mkldnn_alg_kind2str(alg_kind)); + "alg:%s_%s", mkldnn_alg_kind2str(alg_kind), mkldnn_rnn_direction2str(rnn_dir)); + snprintf(dat_str, MKLDNN_VERBOSE_DAT_LEN, "fdata:%s-%s-%s-%s fwei:%s-%s-%s ddata:%s%s-%s%s dwei:%s%s%s", + mkldnn_fmt2str(src_lay_md->format), + mkldnn_fmt2str(src_iter_md ? src_iter_md->format : memory_format::undef), + mkldnn_fmt2str(dst_lay_md->format), + mkldnn_fmt2str(dst_iter_md ? dst_iter_md->format : memory_format::undef), + mkldnn_fmt2str(wei_lay_md->format), + mkldnn_fmt2str(wei_iter_md->format), + mkldnn_fmt2str(bias_md->format), + mkldnn_dt2str(src_lay_md->data_type), + mkldnn_dt2str(src_iter_md ? src_iter_md->data_type : data_type::undef), + mkldnn_dt2str(dst_lay_md->data_type), + mkldnn_dt2str(dst_iter_md ? dst_iter_md->data_type : data_type::undef), + mkldnn_dt2str(wei_lay_md->data_type), + mkldnn_dt2str(wei_iter_md->data_type), + mkldnn_dt2str(bias_md->data_type)); snprintf(prb_str, MKLDNN_VERBOSE_PRB_LEN, - "l%dd%dmb%dt%d_ic%dsc%doc%d_wi%dws%d", - s->L(), s->D(), s->MB(), s->T(), - s->SLC(), s->DIC(), s->DIC(), - s->SLC(), s->SIC()); + "l%dt%dmb%dsic%dslc%ddic%ddlc%d", + s->L(), s->T(), s->MB(), + s->SIC(), s->SLC(), s->DIC(), s->DLC()); verbose_templ(buffer, s->kind(), s->name(), s->desc()->prop_kind, dat_str, aux_str, prb_str); @@ -343,6 +407,63 @@ template static void init_info_roi_pooling(pd_t *s, char *buffer aux_str, prb_str); } +template static void init_info_bin_conv(pd_t *s, char *buffer) { + DECL_DAT_AUX_PRB_STRS(); + + auto fmt_src = s->src_pd()->desc()->format; + auto fmt_wei = s->weights_pd(0)->desc()->format; + auto fmt_dst = s->dst_pd()->desc()->format; + + snprintf(dat_str, MKLDNN_VERBOSE_DAT_LEN, + "fsrc:%s fwei:%s fdst:%s", + mkldnn_fmt2str(fmt_src), mkldnn_fmt2str(fmt_wei), mkldnn_fmt2str(fmt_dst)); + + snprintf(aux_str, MKLDNN_VERBOSE_AUX_LEN, + "alg:%s", mkldnn_alg_kind2str(s->cdesc()->alg_kind)); + + if (s->ndims() == 5) { + snprintf(prb_str, MKLDNN_VERBOSE_PRB_LEN, + "mb%d_g%dic%doc%d" + "_id%dod%dkd%dsd%ddd%dpd%d" + "_ih%doh%dkh%dsh%ddh%dph%d" + "_iw%dow%dkw%dsw%ddw%dpw%d", + s->MB(), s->G(), s->IC(), s->OC(), + s->ID(), s->OD(), s->KD(), s->KSD(), s->KDD(), s->padFront(), + s->IH(), s->OH(), s->KH(), s->KSH(), s->KDH(), s->padT(), + s->IW(), s->OW(), s->KW(), s->KSW(), s->KDW(), s->padL()); + } else { + snprintf(prb_str, MKLDNN_VERBOSE_PRB_LEN, + "mb%d_g%dic%doc%d" + "_ih%doh%dkh%dsh%ddh%dph%d" + "_iw%dow%dkw%dsw%ddw%dpw%d", + s->MB(), s->G(), s->IC(), s->OC(), + s->IH(), s->OH(), s->KH(), s->KSH(), s->KDH(), s->padT(), + s->IW(), s->OW(), s->KW(), s->KSW(), s->KDW(), s->padL()); + } + + verbose_templ(buffer, s->kind(), s->name(), s->cdesc()->prop_kind, dat_str, + aux_str, prb_str); +} + +template static void init_info_binarization(pd_t *s, char *buffer) { + DECL_DAT_AUX_PRB_STRS(); + + auto fmt_data = s->src_pd()->desc()->format; + auto fmt_diff = s->desc()->prop_kind == prop_kind::backward_data + ? 
s->diff_src_pd()->desc()->format : memory_format::undef; + snprintf(dat_str, MKLDNN_VERBOSE_DAT_LEN, "fdata:%s fdiff:%s", + mkldnn_fmt2str(fmt_data), mkldnn_fmt2str(fmt_diff)); + + snprintf(aux_str, MKLDNN_VERBOSE_AUX_LEN, + "alg:%s", mkldnn_alg_kind2str(s->desc()->alg_kind)); + + snprintf(prb_str, MKLDNN_VERBOSE_PRB_LEN, + "mb%dic%dih%diw%d", s->MB(), s->C(), s->H(), s->W()); + + verbose_templ(buffer, s->kind(), s->name(), s->desc()->prop_kind, dat_str, + aux_str, prb_str); +} + #else /* !defined(DISABLE_VERBOSE) */ #define MKLDNN_VERBOSE_BUF_LEN 1 @@ -361,7 +482,10 @@ DEFINE_STUB(mem); DEFINE_STUB(pool); DEFINE_STUB(softmax); DEFINE_STUB(rnn); +DEFINE_STUB(shuffle); DEFINE_STUB(roi_pooling); +DEFINE_STUB(bin_conv); +DEFINE_STUB(binarization); #undef DEFINE_STUB #endif /* !defined(DISABLE_VERBOSE) */ diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/z_magic.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/z_magic.hpp index 0c7d3c5..818a7dc 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/common/z_magic.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/common/z_magic.hpp @@ -26,8 +26,13 @@ #define STRINGIFy(s) #s #define STRINGIFY(s) STRINGIFy(s) -#define PRAGMA_MACRo(x) _Pragma(#x) -#define PRAGMA_MACRO(x) PRAGMA_MACRo(x) +#ifdef _MSC_VER +# define PRAGMA_MACRo(x) __pragma(x) +# define PRAGMA_MACRO(x) PRAGMA_MACRo(x) +#else +# define PRAGMA_MACRo(x) _Pragma(#x) +# define PRAGMA_MACRO(x) PRAGMA_MACRo(x) +#endif #endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_batch_normalization_utils.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_batch_normalization_utils.cpp index 81b91e7..370a925 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_batch_normalization_utils.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_batch_normalization_utils.cpp @@ -14,130 +14,126 @@ * limitations under the License. 
*******************************************************************************/ -#include "mkldnn_types.h" - #include "c_types_map.hpp" #include "utils.hpp" + #include "jit_generator.hpp" -#include "cpu_batch_normalization_pd.hpp" -#include "utils.hpp" +#include "cpu_batch_normalization_utils.hpp" namespace mkldnn { namespace impl { namespace cpu { - namespace bnorm_utils { - void cache_balance(size_t working_set_size, int C_blks, int &C_blks_per_iter, - int &iters) { - int nthrs = mkldnn_get_max_threads(); - int l3_size = get_cache_size(3, true) * nthrs / 2; - C_blks_per_iter = l3_size / working_set_size; +void cache_balance(size_t working_set_size, int C_blks, int &C_blks_per_iter, + int &iters) { + int nthrs = mkldnn_get_max_threads(); + int l3_size = get_cache_size(3, true) * nthrs / 2; - if (C_blks_per_iter == 0) - C_blks_per_iter = 1; - if (C_blks_per_iter > C_blks) - C_blks_per_iter = C_blks; + C_blks_per_iter = l3_size / working_set_size; - iters = (C_blks + C_blks_per_iter - 1) / C_blks_per_iter; - } + if (C_blks_per_iter == 0) + C_blks_per_iter = 1; + if (C_blks_per_iter > C_blks) + C_blks_per_iter = C_blks; - bool thread_balance(bool do_blocking, bool spatial_thr_allowed, int ithr, - int nthr, int N, int C_blks, int SP, int &C_ithr, int &C_nthr, - int &C_blk_s, int &C_blk_e, int &N_ithr, int &N_nthr, int &N_s, - int &N_e, int &S_ithr, int &S_nthr, int &S_s, int &S_e) { - if (nthr <= C_blks || !mkldnn_thr_syncable()) { - C_ithr = ithr; C_nthr = nthr; - N_ithr = 0; N_nthr = 1; - S_ithr = 0; S_nthr = 1; - N_s = 0; N_e = N; S_s = 0; S_e = SP; - balance211(C_blks, C_nthr, C_ithr, C_blk_s, C_blk_e); + iters = (C_blks + C_blks_per_iter - 1) / C_blks_per_iter; +} + +bool thread_balance(bool do_blocking, bool spatial_thr_allowed, int ithr, + int nthr, int N, int C_blks, int SP, int &C_ithr, int &C_nthr, + int &C_blk_s, int &C_blk_e, int &N_ithr, int &N_nthr, int &N_s, + int &N_e, int &S_ithr, int &S_nthr, int &S_s, int &S_e) { + if (nthr <= C_blks || !mkldnn_thr_syncable()) { + C_ithr = ithr; C_nthr = nthr; + N_ithr = 0; N_nthr = 1; + S_ithr = 0; S_nthr = 1; + N_s = 0; N_e = N; S_s = 0; S_e = SP; + balance211(C_blks, C_nthr, C_ithr, C_blk_s, C_blk_e); + } else { + if (do_blocking) { + N_nthr = nstl::min(N, nthr); + C_nthr = nstl::min(C_blks, nthr / N_nthr); + S_nthr = nstl::min(SP, nthr / (C_nthr * N_nthr)); } else { - if (do_blocking) { - N_nthr = nstl::min(N, nthr); - C_nthr = nstl::min(C_blks, nthr / N_nthr); - S_nthr = nstl::min(SP, nthr / (C_nthr * N_nthr)); - } else { - C_nthr = math::gcd(nthr, C_blks); - N_nthr = nstl::min(N, nthr / C_nthr); - S_nthr = nstl::min(SP, nthr / (C_nthr * N_nthr)); - } - - if (!spatial_thr_allowed) - S_nthr = 1; - - if (S_nthr < 1) S_nthr = 1; - if (ithr < C_nthr * N_nthr * S_nthr) { - N_ithr = (ithr / S_nthr) % N_nthr ; - C_ithr = ithr / (N_nthr * S_nthr); - S_ithr = ithr % S_nthr; - balance211(C_blks, C_nthr, C_ithr, C_blk_s, C_blk_e); - balance211(N, N_nthr, N_ithr, N_s, N_e); - balance211(SP, S_nthr, S_ithr, S_s, S_e); - } else { - S_ithr = N_ithr = C_ithr = -ithr; - S_s = S_e = N_s = N_e = C_blk_s = C_blk_e = -1; - } + C_nthr = math::gcd(nthr, C_blks); + N_nthr = nstl::min(N, nthr / C_nthr); + S_nthr = nstl::min(SP, nthr / (C_nthr * N_nthr)); } - // spatial_thr_allowed is meant to help maintain - // consistent decisions about spatial threading - // between mutiple invocations of this routine. - // It is caller's responsibility to check the - // return value and pass it as a flag to the - // next call if needed. 
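For reference, the thread_balance() routine above carves the (C_blks, N, SP) iteration space into a three-level thread grid and delegates each axis split to balance211(). A rough, self-contained sketch of the contract that helper provides (contiguous per-thread ranges whose sizes differ by at most one); the helper name and the use of std::min here are illustrative, not mkl-dnn's exact implementation:

    #include <algorithm>
    #include <cstdio>

    // Split n jobs across `team` threads; the first n % team threads
    // receive one extra job, so range sizes differ by at most one.
    static void balance211_sketch(int n, int team, int tid,
            int &t_start, int &t_end) {
        int chunk = n / team;
        int rem = n % team;
        t_start = tid * chunk + std::min(tid, rem);
        t_end = t_start + chunk + (tid < rem ? 1 : 0);
    }

    int main() {
        // e.g. 10 channel blocks over 4 threads -> [0,3) [3,6) [6,8) [8,10)
        for (int tid = 0; tid < 4; ++tid) {
            int s, e;
            balance211_sketch(10, 4, tid, s, e);
            std::printf("thr%d: C_blk range [%d, %d)\n", tid, s, e);
        }
        return 0;
    }

The spatial_thr_allowed flag mentioned in the comment above serves the same consistency goal across calls: once an invocation settles on S_nthr == 1, the caller is expected to feed the returned false back in, so later invocations do not re-enable spatial splitting.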
- if (S_nthr == 1) - spatial_thr_allowed = false; + if (!spatial_thr_allowed) + S_nthr = 1; - return spatial_thr_allowed; + if (S_nthr < 1) S_nthr = 1; + if (ithr < C_nthr * N_nthr * S_nthr) { + N_ithr = (ithr / S_nthr) % N_nthr ; + C_ithr = ithr / (N_nthr * S_nthr); + S_ithr = ithr % S_nthr; + balance211(C_blks, C_nthr, C_ithr, C_blk_s, C_blk_e); + balance211(N, N_nthr, N_ithr, N_s, N_e); + balance211(SP, S_nthr, S_ithr, S_s, S_e); + } else { + S_ithr = N_ithr = C_ithr = -ithr; + S_s = S_e = N_s = N_e = C_blk_s = C_blk_e = -1; + } } - void set_spatial_thr(const batch_normalization_pd_t *bdesc, - const int simd_w, const int data_size, int &is_spatial_thr) { - if (!mkldnn_thr_syncable()) { is_spatial_thr = 0; return; } + // spatial_thr_allowed is meant to help maintain + // consistent decisions about spatial threading + // between multiple invocations of this routine. + // It is caller's responsibility to check the + // return value and pass it as a flag to the + // next call if needed. + if (S_nthr == 1) + spatial_thr_allowed = false; - int nthr = mkldnn_get_max_threads(); - int SP = bdesc->W() * bdesc->D() * bdesc->H(); - int C_PADDED = memory_desc_wrapper(bdesc->src_pd()) - .blocking_desc().padding_dims[1]; - assert(C_PADDED % simd_w == 0); - - size_t data = bdesc->MB() * C_PADDED * SP * data_size; - size_t l3_size_ = get_cache_size(3, true) * nthr / 2; - bool do_blocking = (data >= l3_size_ / 2 && l3_size_ > 0); - int C_blks_per_iter{ 1 }, iters{ 1 }; - int C_blks = C_PADDED / simd_w; - - if (do_blocking) { - int num_tensors = bdesc->is_fwd() ? 1 : 2; - size_t working_set_size - = (bdesc->MB() * SP * simd_w * data_size) * num_tensors; - cache_balance(working_set_size, C_blks, C_blks_per_iter, iters); - } + return spatial_thr_allowed; +} - // Spatial threading decision made in this function shall be consistent - // with thread_balance() behavior. - C_blks = do_blocking ? C_blks_per_iter : C_blks; +bool is_spatial_thr(const batch_normalization_pd_t *bdesc, int simd_w, + int data_size) { + if (!mkldnn_thr_syncable()) return false; + + int nthr = mkldnn_get_max_threads(); + int SP = bdesc->W() * bdesc->D() * bdesc->H(); + int C_PADDED = memory_desc_wrapper(bdesc->src_pd()) + .blocking_desc().padding_dims[1]; + assert(C_PADDED % simd_w == 0); + + size_t data = bdesc->MB() * C_PADDED * SP * data_size; + size_t l3_size_ = get_cache_size(3, true) * nthr / 2; + bool do_blocking = (data >= l3_size_ / 2 && l3_size_ > 0); + int C_blks_per_iter{ 1 }, iters{ 1 }; + int C_blks = C_PADDED / simd_w; + + if (do_blocking) { + int num_tensors = bdesc->is_fwd() ? 1 : 2; + size_t working_set_size + = (bdesc->MB() * SP * simd_w * data_size) * num_tensors; + cache_balance(working_set_size, C_blks, C_blks_per_iter, iters); + } - if (nthr <= C_blks) { - is_spatial_thr = 0; - } else { - int S_nthr = 1; - if (do_blocking) { - int N_nthr = nstl::min(bdesc->MB(), nthr); - int C_nthr = nstl::min(C_blks, nthr / N_nthr); - S_nthr = nstl::min(SP, nthr / (C_nthr * N_nthr)); - } else { - int C_nthr = math::gcd(nthr, C_blks); - int N_nthr = nstl::min(bdesc->MB(), nthr / C_nthr); - S_nthr = nstl::min(SP, nthr / (C_nthr * N_nthr)); - } - if (S_nthr < 1) S_nthr = 1; - is_spatial_thr = (S_nthr > 1) ? 1 : 0; - } + // Spatial threading decision made in this function shall be consistent + // with thread_balance() behavior. + C_blks = do_blocking ?
C_blks_per_iter : C_blks; + + if (nthr <= C_blks) return false; + + int S_nthr = 1; + if (do_blocking) { + int N_nthr = nstl::min(bdesc->MB(), nthr); + int C_nthr = nstl::min(C_blks, nthr / N_nthr); + S_nthr = nstl::min(SP, nthr / (C_nthr * N_nthr)); + } else { + int C_nthr = math::gcd(nthr, C_blks); + int N_nthr = nstl::min(bdesc->MB(), nthr / C_nthr); + S_nthr = nstl::min(SP, nthr / (C_nthr * N_nthr)); } -}; + return S_nthr > 1; +} + +} } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_batch_normalization_utils.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_batch_normalization_utils.hpp index 5be96fc..4c83515 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_batch_normalization_utils.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_batch_normalization_utils.hpp @@ -17,29 +17,27 @@ #ifndef CPU_BATCH_NORMALIZATION_UTILS_HPP #define CPU_BATCH_NORMALIZATION_UTILS_HPP -#include "c_types_map.hpp" -#include "cpu_batch_normalization_pd.hpp" +#include "batch_normalization_pd.hpp" + namespace mkldnn { namespace impl { namespace cpu { - namespace bnorm_utils { - void cache_balance(size_t working_set_size, int C_blks, int &C_blks_per_iter, - int &iters); +void cache_balance(size_t working_set_size, int C_blks, int &C_blks_per_iter, + int &iters); - bool thread_balance(bool do_blocking, bool spatial_thr_allowed, int ithr, - int nthr, int N, int C_blks, int SP, int &C_ithr, int &C_nthr, - int &C_blk_s, int &C_blk_e, int &N_ithr, int &N_nthr, int &N_s, - int &N_e, int &S_ithr, int &S_nthr, int &S_s, int &S_e); +bool thread_balance(bool do_blocking, bool spatial_thr_allowed, int ithr, + int nthr, int N, int C_blks, int SP, int &C_ithr, int &C_nthr, + int &C_blk_s, int &C_blk_e, int &N_ithr, int &N_nthr, int &N_s, + int &N_e, int &S_ithr, int &S_nthr, int &S_s, int &S_e); - void set_spatial_thr(const batch_normalization_pd_t *bdesc, - const int simd_w, const int data_size, int &is_spatial_thr); - -}; +bool is_spatial_thr(const batch_normalization_pd_t *bdesc, int simd_w, + int data_size); } } } +} #endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_binarization_pd.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_binarization_pd.hpp new file mode 100644 index 0000000..05d1059 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_binarization_pd.hpp @@ -0,0 +1,86 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef CPU_BINARIZATION_PD_HPP +#define CPU_BINARIZATION_PD_HPP + +#include + +#include "c_types_map.hpp" +#include "binarization_pd.hpp" +#include "cpu_engine.hpp" +#include "cpu_memory.hpp" +#include "cpu_primitive.hpp" +#include "type_helpers.hpp" +#include "utils.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +struct cpu_binarization_fwd_pd_t: public binarization_fwd_pd_t { + using cpu_memory_pd_t = cpu_memory_t::pd_t; + + cpu_binarization_fwd_pd_t(engine_t *engine, const binarization_desc_t *adesc, + const primitive_attr_t *attr, const binarization_fwd_pd_t *hint_fwd_pd) + : binarization_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) + , src_pd_(engine_, &desc_.src_desc) + , dst_pd_(engine_, &desc_.dst_desc) + , weights_pd_(engine_, &desc_.weights_desc) {} + virtual ~cpu_binarization_fwd_pd_t() {} + + virtual const cpu_memory_pd_t *src_pd(int index = 0) const override + { return index == 0 ? &src_pd_ : nullptr; } + virtual const cpu_memory_pd_t *dst_pd(int index = 0) const override + { return index == 0 ? &dst_pd_ : nullptr; } + virtual const cpu_memory_pd_t *weights_pd(int index = 0) const override { + if (index == 0) return &weights_pd_; + return nullptr; + } + +protected: + cpu_memory_pd_t src_pd_, dst_pd_, weights_pd_; + + inline memory_format_t src_format() + { + using namespace memory_format; + return utils::pick(desc_.src_desc.ndims - 3, ncw, nchw, ncdhw); + } + inline memory_format_t wei_format() + { + using namespace memory_format; + return x; + } + + virtual status_t set_default_params() { + using namespace memory_format; + if (src_pd_.desc()->format == any) + CHECK(src_pd_.set_format(src_format())); + if (dst_pd_.desc()->format == any) + CHECK(dst_pd_.set_format(src_pd_.desc()->format)); + if (weights_pd_.desc()->format == any) + CHECK(weights_pd_.set_format(wei_format())); + return status::success; + } + + virtual status_t init() = 0; +}; + +} +} +} + +#endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_binary_convolution_pd.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_binary_convolution_pd.hpp new file mode 100644 index 0000000..a2474ef --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_binary_convolution_pd.hpp @@ -0,0 +1,91 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef CPU_BINARY_CONVOLUTION_FWD_PD_HPP +#define CPU_BINARY_CONVOLUTION_FWD_PD_HPP + +#include + +#include "c_types_map.hpp" +#include "binary_convolution_pd.hpp" +#include "cpu_engine.hpp" +#include "cpu_memory.hpp" +#include "cpu_primitive.hpp" +#include "type_helpers.hpp" +#include "utils.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +struct _cpu_binary_convolution_fwd_pd_t: public _binary_convolution_fwd_pd_t { + using cpu_memory_pd_t = cpu_memory_t::pd_t; + + _cpu_binary_convolution_fwd_pd_t(engine_t *engine, + const typename _cpu_binary_convolution_fwd_pd_t::base_desc_t *adesc, + const primitive_attr_t *attr, + const typename _cpu_binary_convolution_fwd_pd_t::base_class *hint_fwd_pd) + : _binary_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) + , src_pd_(this->engine_, &this->cdesc_().src_desc) + , dst_pd_(this->engine_, &this->cdesc_().dst_desc) + , weights_pd_(this->engine_, &this->cdesc_().weights_desc) {} + virtual ~_cpu_binary_convolution_fwd_pd_t() {} + + virtual const cpu_memory_pd_t *src_pd(int index = 0) const override + { return index == 0 ? &src_pd_ : nullptr; } + virtual const cpu_memory_pd_t *dst_pd(int index = 0) const override + { return index == 0 ? &dst_pd_ : nullptr; } + virtual const cpu_memory_pd_t *weights_pd(int index = 0) const override { + if (index == 0) return &weights_pd_; + return nullptr; + } + +protected: + cpu_memory_pd_t src_pd_, dst_pd_; + cpu_memory_pd_t weights_pd_; + + inline memory_format_t src_format() + { + using namespace memory_format; + return utils::pick(this->cdesc_().src_desc.ndims - 3, ncw, nchw, ncdhw); + } + inline memory_format_t wei_format() + { + using namespace memory_format; + return this->with_groups() + ? utils::pick(this->cdesc_().src_desc.ndims - 3, goiw, goihw, goidhw) + : utils::pick(this->cdesc_().src_desc.ndims - 3, oiw, oihw, oidhw); + } + + virtual status_t set_default_params() { + using namespace memory_format; + if (src_pd_.desc()->format == any) + CHECK(src_pd_.set_format(src_format())); + if (dst_pd_.desc()->format == any) + CHECK(dst_pd_.set_format(src_pd_.desc()->format)); + if (weights_pd_.desc()->format == any) + CHECK(weights_pd_.set_format(wei_format())); + return status::success; + } +}; + +using cpu_binary_convolution_fwd_pd_t = _cpu_binary_convolution_fwd_pd_t; + +} +} +} + +#endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_concat.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_concat.hpp index 477566b..edfb264 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_concat.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_concat.hpp @@ -55,7 +55,7 @@ namespace cpu { } \ return ret; \ } \ - virtual pd_t *clone() const override { return nullptr; } \ + virtual pd_t *clone() const override { return new pd_t(*this); } \ virtual const char *name() const override { return impl_name; } #define DECLARE_CPU_CONCAT_PD_T(impl_name, ...) 
\ DECLARE_CPU_CONCAT_PD_t(impl_name, __VA_ARGS__) diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_convolution_pd.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_convolution_pd.hpp index 1db3f4a..f50287a 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_convolution_pd.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_convolution_pd.hpp @@ -31,20 +31,19 @@ namespace mkldnn { namespace impl { namespace cpu { -template -struct _cpu_convolution_fwd_pd_t: public _convolution_fwd_pd_t { +struct cpu_convolution_fwd_pd_t: public convolution_fwd_pd_t { using cpu_memory_pd_t = cpu_memory_t::pd_t; - _cpu_convolution_fwd_pd_t(engine_t *engine, - const typename _cpu_convolution_fwd_pd_t::base_desc_t *adesc, + cpu_convolution_fwd_pd_t(engine_t *engine, + const convolution_desc_t *adesc, const primitive_attr_t *attr, - const typename _cpu_convolution_fwd_pd_t::base_class *hint_fwd_pd) - : _convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) - , src_pd_(this->engine_, &this->cdesc_().src_desc) - , dst_pd_(this->engine_, &this->cdesc_().dst_desc) - , weights_pd_(this->engine_, &this->cdesc_().weights_desc) - , bias_pd_(this->engine_, &this->cdesc_().bias_desc) {} - virtual ~_cpu_convolution_fwd_pd_t() {} + const typename cpu_convolution_fwd_pd_t::base_class *hint_fwd_pd) + : convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) + , src_pd_(this->engine_, &this->desc()->src_desc) + , dst_pd_(this->engine_, &this->desc()->dst_desc) + , weights_pd_(this->engine_, &this->desc()->weights_desc) + , bias_pd_(this->engine_, &this->desc()->bias_desc) {} + virtual ~cpu_convolution_fwd_pd_t() {} virtual const cpu_memory_pd_t *src_pd(int index = 0) const override { return index == 0 ? &src_pd_ : nullptr; } @@ -56,13 +55,26 @@ struct _cpu_convolution_fwd_pd_t: public _convolution_fwd_pd_t { return nullptr; } - bool want_padded_bias() const { - if (!this->with_bias()) return false; + bool has_padded_dst() const { memory_desc_wrapper dst_d(&dst_pd_); if (!dst_d.is_blocking_desc()) return false; return this->OC() != dst_d.blocking_desc().padding_dims[1]; } + bool wants_padded_bias() const { + if (!this->with_bias()) return false; + return has_padded_dst(); + } + + bool wants_zero_pad_dst(bool jit_impl = true) const { + if (!has_padded_dst()) return false; + const auto &po = this->attr()->post_ops_; + int idx; + if ((idx = po.find(primitive_kind::eltwise)) == -1) return false; + return !math::eltwise_fwd_preserves_zero(po.entry_[idx].eltwise.alg, + jit_impl); + } + protected: cpu_memory_pd_t src_pd_, dst_pd_; cpu_memory_pd_t weights_pd_, bias_pd_; @@ -70,14 +82,14 @@ protected: inline memory_format_t src_format() { using namespace memory_format; - return utils::pick(this->cdesc_().src_desc.ndims - 3, ncw, nchw, ncdhw); + return utils::pick(this->desc()->src_desc.ndims - 3, ncw, nchw, ncdhw); } inline memory_format_t wei_format() { using namespace memory_format; return this->with_groups() - ? utils::pick(this->cdesc_().src_desc.ndims - 3, goiw, goihw, goidhw) - : utils::pick(this->cdesc_().src_desc.ndims - 3, oiw, oihw, oidhw); + ? 
utils::pick(this->desc()->src_desc.ndims - 3, goiw, goihw, goidhw) + : utils::pick(this->desc()->src_desc.ndims - 3, oiw, oihw, oidhw); } virtual status_t set_default_params() { @@ -90,13 +102,12 @@ protected: CHECK(weights_pd_.set_format(wei_format())); if (bias_pd_.desc()->format == any) CHECK(bias_pd_.set_format(x)); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } }; -using cpu_convolution_fwd_pd_t = _cpu_convolution_fwd_pd_t; -using cpu_convolution_relu_fwd_pd_t = _cpu_convolution_fwd_pd_t; - struct cpu_convolution_bwd_data_pd_t: public convolution_bwd_data_pd_t { using cpu_memory_pd_t = cpu_memory_t::pd_t; @@ -148,6 +159,8 @@ protected: CHECK(weights_pd_.set_format(wei_format())); if (bias_pd_.desc()->format == any) CHECK(bias_pd_.set_format(x)); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } }; @@ -177,7 +190,7 @@ struct cpu_convolution_bwd_weights_pd_t: public convolution_bwd_weights_pd_t { return nullptr; } - bool want_padded_bias() const { + bool wants_padded_bias() const { if (!this->with_bias()) return false; memory_desc_wrapper diff_dst_d(&diff_dst_pd_); if (!diff_dst_d.is_blocking_desc()) return false; @@ -212,6 +225,8 @@ protected: CHECK(diff_weights_pd_.set_format(wei_format())); if (diff_bias_pd_.desc()->format == any) CHECK(diff_bias_pd_.set_format(x)); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } }; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_deconvolution_pd.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_deconvolution_pd.hpp index cd9cdfe..d236c23 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_deconvolution_pd.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_deconvolution_pd.hpp @@ -28,6 +28,38 @@ #include "type_helpers.hpp" #include "utils.hpp" +#define DECLARE_DECONVOLUTION_PD_t(...) \ + virtual pd_t *clone() const override { return new pd_t(*this); } \ + virtual status_t create_primitive(primitive_t **primitive, \ + const primitive_at_t *inputs, const primitive_t **outputs) \ + const override { \ + double ms = get_msec(); \ + using namespace prop_kind; \ + primitive_t::input_vector ins(inputs, inputs + this->n_inputs()); \ + primitive_t::output_vector outs(outputs, outputs + this->n_outputs()); \ + auto ret = safe_ptr_assign( \ + *primitive, new (__VA_ARGS__)(this, ins, outs)); \ + primitive_t *conv_primitive; \ + if (this->desc()->prop_kind == backward_weights) { \ + primitive_at_t conv_inputs[2]; \ + conv_inputs[0] = inputs[1]; \ + conv_inputs[1] = inputs[0]; \ + conv_pd_->create_primitive( \ + (&conv_primitive), conv_inputs, outputs); \ + } else \ + conv_pd_->create_primitive((&conv_primitive), inputs, outputs); \ + ((__VA_ARGS__ *)(*primitive))->conv_p_ = conv_primitive; \ + ms = get_msec() - ms; \ + if (mkldnn_verbose()->level >= 2) { \ + printf("mkldnn_verbose,create,%s,%g\n", this->info(), ms); \ + fflush(0); \ + } \ + return ret; \ + } \ + virtual const char *name() const override { return conv_pd_->name(); } + +#define DECLARE_DECONVOLUTION_PD_T(...) 
DECLARE_DECONVOLUTION_PD_t(__VA_ARGS__) + namespace mkldnn { namespace impl { namespace cpu { diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_engine.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_engine.cpp index 104ce88..738725d 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_engine.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_engine.cpp @@ -24,7 +24,7 @@ #include "cpu_concat.hpp" #include "cpu_sum.hpp" -#include "cpu/ref_rnn.hpp" +#include "cpu/rnn/ref_rnn.hpp" #include "cpu/jit_avx512_core_x8s8s32x_1x1_convolution.hpp" #include "cpu/jit_avx512_common_1x1_convolution.hpp" @@ -39,14 +39,15 @@ #include "cpu/gemm_convolution.hpp" #include "cpu/gemm_x8s8s32x_convolution.hpp" #include "cpu/ref_convolution.hpp" -#include "cpu/jit_avx512_core_u8s8s32x_deconvolution.hpp" +#include "cpu/jit_avx512_core_x8s8s32x_deconvolution.hpp" +#include "cpu/jit_avx512_core_x8s8s32x_1x1_deconvolution.hpp" #include "cpu/ref_deconvolution.hpp" #include "cpu/ref_shuffle.hpp" #include "cpu/jit_uni_eltwise.hpp" #include "cpu/ref_eltwise.hpp" #include "cpu/ref_softmax.hpp" #include "cpu/jit_uni_pooling.hpp" -#include "cpu/jit_avx512_core_i8i8_pooling.hpp" +#include "cpu/jit_uni_i8i8_pooling.hpp" #include "cpu/ref_pooling.hpp" #include "cpu/nchw_pooling.hpp" #include "cpu/nhwc_pooling.hpp" @@ -59,7 +60,7 @@ #include "cpu/nspc_batch_normalization.hpp" #include "cpu/ref_inner_product.hpp" #include "cpu/gemm_inner_product.hpp" -#include "cpu/gemm_u8s8s32x_inner_product.hpp" +#include "cpu/gemm_x8s8s32x_inner_product.hpp" #include "cpu/jit_uni_dw_convolution.hpp" #include "cpu/jit_avx512_core_u8s8s32x_wino_convolution.hpp" #include "cpu/jit_avx512_core_fp32_wino_conv_2x3.hpp" @@ -69,9 +70,13 @@ #include "cpu/jit_uni_depthwise.hpp" #include "cpu/ref_depthwise.hpp" #include "cpu/jit_uni_x8s8s32x_convolution.hpp" -#include "cpu/jit_uni_x8s8s32x_1x1_convolution.hpp" #include "cpu/jit_uni_x8s8s32x_dw_convolution.hpp" -#include "cpu/jit_uni_i8i8_pooling.hpp" +#include "cpu/jit_sse42_i8i8_pooling.hpp" +#include "cpu/jit_uni_planar_convolution.hpp" +#include "cpu/jit_uni_binary_convolution.hpp" +#include "cpu/ref_binary_convolution.hpp" +#include "cpu/jit_uni_binarization.hpp" +#include "cpu/ref_binarization.hpp" namespace mkldnn { namespace impl { @@ -105,9 +110,11 @@ using namespace mkldnn::impl::data_type; #define INSTANCE(...) 
&primitive_desc_t::create<__VA_ARGS__::pd_t> static const pd_create_f cpu_impl_list[] = { /* RNN */ - INSTANCE(ref_rnn_fwd_t), - INSTANCE(ref_rnn_bwd_t), + INSTANCE(ref_rnn_fwd_f32_t), + INSTANCE(ref_rnn_fwd_u8s8_t), + INSTANCE(ref_rnn_bwd_f32_t), /* conv */ + INSTANCE(jit_avx512_common_planar_convolution_fwd_t), INSTANCE(jit_avx512_common_dw_convolution_fwd_t), INSTANCE(jit_avx512_common_dw_convolution_bwd_data_t), INSTANCE(jit_avx512_common_dw_convolution_bwd_weights_t), @@ -126,6 +133,7 @@ static const pd_create_f cpu_impl_list[] = { INSTANCE(jit_avx512_common_convolution_fwd_t), INSTANCE(jit_avx512_common_convolution_bwd_data_t), INSTANCE(jit_avx512_common_convolution_bwd_weights_t), + INSTANCE(jit_avx2_planar_convolution_fwd_t), INSTANCE(jit_avx2_dw_convolution_fwd_t), INSTANCE(jit_avx2_dw_convolution_bwd_data_t), INSTANCE(jit_avx2_dw_convolution_bwd_weights_t), @@ -194,14 +202,14 @@ static const pd_create_f cpu_impl_list[] = { INSTANCE(jit_sse42_x8s8s32x_convolution_fwd_t), INSTANCE(jit_sse42_x8s8s32x_convolution_fwd_t), INSTANCE(jit_sse42_x8s8s32x_convolution_fwd_t), - INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), - INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), - INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), - INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), - INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), - INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), - INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), - INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), + INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), + INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), + INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), + INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), + INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), + INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), + INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), + INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), INSTANCE(_gemm_u8s8s32x_convolution_bwd_data_t), INSTANCE(_gemm_u8s8s32x_convolution_bwd_data_t), INSTANCE(_gemm_u8s8s32x_convolution_bwd_data_t), @@ -218,10 +226,22 @@ static const pd_create_f cpu_impl_list[] = { INSTANCE(ref_convolution_bwd_data_t), INSTANCE(ref_convolution_bwd_weights_t), /* deconv */ - INSTANCE(_jit_avx512_core_u8s8s32x_deconvolution_fwd_t), - INSTANCE(_jit_avx512_core_u8s8s32x_deconvolution_fwd_t), - INSTANCE(_jit_avx512_core_u8s8s32x_deconvolution_fwd_t), - INSTANCE(_jit_avx512_core_u8s8s32x_deconvolution_fwd_t), + INSTANCE(jit_avx512_core_x8s8s32x_1x1_deconvolution_fwd_t), + INSTANCE(jit_avx512_core_x8s8s32x_1x1_deconvolution_fwd_t), + INSTANCE(jit_avx512_core_x8s8s32x_1x1_deconvolution_fwd_t), + INSTANCE(jit_avx512_core_x8s8s32x_1x1_deconvolution_fwd_t), + INSTANCE(jit_avx512_core_x8s8s32x_1x1_deconvolution_fwd_t), + INSTANCE(jit_avx512_core_x8s8s32x_1x1_deconvolution_fwd_t), + INSTANCE(jit_avx512_core_x8s8s32x_1x1_deconvolution_fwd_t), + INSTANCE(jit_avx512_core_x8s8s32x_1x1_deconvolution_fwd_t), + INSTANCE(_jit_avx512_core_x8s8s32x_deconvolution_fwd_t), + INSTANCE(_jit_avx512_core_x8s8s32x_deconvolution_fwd_t), + INSTANCE(_jit_avx512_core_x8s8s32x_deconvolution_fwd_t), + INSTANCE(_jit_avx512_core_x8s8s32x_deconvolution_fwd_t), + INSTANCE(_jit_avx512_core_x8s8s32x_deconvolution_fwd_t), + INSTANCE(_jit_avx512_core_x8s8s32x_deconvolution_fwd_t), + INSTANCE(_jit_avx512_core_x8s8s32x_deconvolution_fwd_t), + INSTANCE(_jit_avx512_core_x8s8s32x_deconvolution_fwd_t), INSTANCE(ref_deconvolution_bwd_weights_t), INSTANCE(ref_deconvolution_bwd_data_t), INSTANCE(ref_deconvolution_fwd_t), @@ -269,9 +289,9 @@ static const pd_create_f cpu_impl_list[] = { INSTANCE(ref_pooling_fwd_t), 
INSTANCE(ref_pooling_bwd_t), /* pool (int) */ - INSTANCE(jit_avx512_core_i8i8_pooling_fwd_t), + INSTANCE(jit_uni_i8i8_pooling_fwd_t), INSTANCE(jit_uni_i8i8_pooling_fwd_t), - INSTANCE(jit_uni_i8i8_pooling_fwd_t), + INSTANCE(jit_sse42_i8i8_pooling_fwd_t), INSTANCE(ref_pooling_fwd_t), INSTANCE(ref_pooling_fwd_t), INSTANCE(ref_pooling_fwd_t), @@ -307,69 +327,35 @@ static const pd_create_f cpu_impl_list[] = { INSTANCE(ref_inner_product_bwd_data_t), INSTANCE(ref_inner_product_bwd_weights_t), /* inner product (int) */ - INSTANCE(gemm_u8s8s32x_inner_product_fwd_t), - INSTANCE(gemm_u8s8s32x_inner_product_fwd_t), - INSTANCE(gemm_u8s8s32x_inner_product_fwd_t), - INSTANCE(gemm_u8s8s32x_inner_product_fwd_t), + INSTANCE(gemm_x8s8s32x_inner_product_fwd_t), + INSTANCE(gemm_x8s8s32x_inner_product_fwd_t), + INSTANCE(gemm_x8s8s32x_inner_product_fwd_t), + INSTANCE(gemm_x8s8s32x_inner_product_fwd_t), + INSTANCE(gemm_x8s8s32x_inner_product_fwd_t), + INSTANCE(gemm_x8s8s32x_inner_product_fwd_t), + INSTANCE(gemm_x8s8s32x_inner_product_fwd_t), + INSTANCE(gemm_x8s8s32x_inner_product_fwd_t), INSTANCE(ref_inner_product_fwd_t), INSTANCE(ref_inner_product_fwd_t), INSTANCE(ref_inner_product_fwd_t), INSTANCE(ref_inner_product_fwd_t), INSTANCE(ref_inner_product_fwd_t), INSTANCE(ref_inner_product_bwd_data_t), - /* conv_eltwise */ - INSTANCE(jit_avx512_common_dw_convolution_relu_t), - INSTANCE(jit_avx512_common_convolution_winograd_relu_t), - INSTANCE(jit_avx512_common_1x1_convolution_relu_f32_t), - INSTANCE(jit_avx512_common_convolution_relu_t), - INSTANCE(jit_avx2_dw_convolution_relu_t), - INSTANCE(jit_avx2_1x1_convolution_relu_t), - INSTANCE(jit_sse42_dw_convolution_relu_t), - INSTANCE(jit_sse42_1x1_convolution_relu_t), - INSTANCE(jit_avx2_convolution_relu_t), - INSTANCE(jit_sse42_convolution_relu_t), - INSTANCE(gemm_convolution_relu_t), - INSTANCE(ref_convolution_relu_t), - /* conv_eltwise (int) */ - INSTANCE(jit_avx512_core_u8s8s32x_wino_convolution_relu_t), - INSTANCE(jit_avx512_core_u8s8s32x_wino_convolution_relu_t), - INSTANCE(jit_avx512_core_u8s8s32x_wino_convolution_relu_t), - INSTANCE(jit_avx512_core_u8s8s32x_wino_convolution_relu_t), - INSTANCE(jit_avx512_common_1x1_convolution_relu_s16s16s32_t), - INSTANCE(jit_avx512_common_convolution_relu_t), - INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_relu_t), - INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_relu_t), - INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_relu_t), - INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_relu_t), - INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_relu_t), - INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_relu_t), - INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_relu_t), - INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_relu_t), - INSTANCE(jit_avx512_core_x8s8s32x_convolution_relu_t), - INSTANCE(jit_avx512_core_x8s8s32x_convolution_relu_t), - INSTANCE(jit_avx512_core_x8s8s32x_convolution_relu_t), - INSTANCE(jit_avx512_core_x8s8s32x_convolution_relu_t), - INSTANCE(jit_avx512_core_x8s8s32x_convolution_relu_t), - INSTANCE(jit_avx512_core_x8s8s32x_convolution_relu_t), - INSTANCE(jit_avx512_core_x8s8s32x_convolution_relu_t), - INSTANCE(jit_avx512_core_x8s8s32x_convolution_relu_t), - INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), - INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), - INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), - INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), - INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), - INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), - INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), - 
INSTANCE(_gemm_x8s8s32x_convolution_fwd_t), - INSTANCE(ref_convolution_relu_t), - INSTANCE(ref_convolution_relu_t), - INSTANCE(ref_convolution_relu_t), - INSTANCE(ref_convolution_relu_t), /* roi pooling */ INSTANCE(jit_uni_roi_pooling_fwd_t), INSTANCE(jit_uni_roi_pooling_fwd_t), INSTANCE(jit_uni_roi_pooling_fwd_t), INSTANCE(ref_roi_pooling_fwd_t), + /* binary convolution */ +// INSTANCE(jit_uni_binary_convolution_fwd_t), + INSTANCE(jit_uni_binary_convolution_fwd_t), + INSTANCE(jit_uni_binary_convolution_fwd_t), + INSTANCE(ref_binary_convolution_fwd_t), + /* binarization */ + INSTANCE(jit_uni_binarization_fwd_t), + INSTANCE(jit_uni_binarization_fwd_t), + INSTANCE(jit_uni_binarization_fwd_t), + INSTANCE(ref_binarization_fwd_t), /* eol */ nullptr, }; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_memory.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_memory.cpp index 4bbff22..e1c2dd6 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_memory.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_memory.cpp @@ -63,6 +63,7 @@ typed_zero_pad_data( template typename utils::enable_if::blk_fmt == bf::_4o || format_traits::blk_fmt == bf::_8o || format_traits::blk_fmt == bf::_16o >::type typed_zero_pad_weights(const memory_desc_wrapper &m_d, @@ -234,10 +235,10 @@ void typed_zero_pad_generic_blocked(const memory_desc_wrapper &m_d, assert(step_dim >= 0 && "no zero padding is required"); if (step_dim < 0) return; - parallel_nd(nelems, [&](ptrdiff_t e) { + parallel_nd(nelems / step, [&](ptrdiff_t e1) { bool need_zero = false; - ptrdiff_t idx = e / step; + ptrdiff_t idx = e1; for (int d = step_dim; d >= 0; --d) { if (idx % pdims[d] >= dims[d]) { need_zero = true; @@ -248,14 +249,14 @@ void typed_zero_pad_generic_blocked(const memory_desc_wrapper &m_d, if (need_zero) { for (ptrdiff_t e0 = 0; e0 < step; ++e0) - data[m_d.off_l(e + e0, true)] = 0; + data[m_d.off_l(e1 * step + e0, true)] = 0; } }); } template -status_t cpu_memory_t::typed_zero_pad() { - const memory_desc_wrapper mpd(&conf_); +status_t cpu_memory_t::typed_zero_pad() const { + const memory_desc_wrapper mpd(pd()); // FIXME: guard this check for non-blocked layout if (mpd.nelems(false) == mpd.nelems(true)) @@ -267,9 +268,12 @@ status_t cpu_memory_t::typed_zero_pad() { /* data */ # define MAYBE_DATA(f) if (fmt == f) \ { typed_zero_pad_data(mpd, data); return success; } + MAYBE_DATA(nCw4c); MAYBE_DATA(nCw8c); MAYBE_DATA(nCw16c); + MAYBE_DATA(nChw4c); MAYBE_DATA(nChw8c); + MAYBE_DATA(nCdhw4c); MAYBE_DATA(nCdhw8c); MAYBE_DATA(nChw16c); MAYBE_DATA(nCdhw16c); @@ -277,10 +281,12 @@ status_t cpu_memory_t::typed_zero_pad() { /* weights */ # define MAYBE_WEIGHTS(f) if (fmt == f) \ { typed_zero_pad_weights(mpd, data); return success; } + MAYBE_WEIGHTS(OIdhw4i4o); MAYBE_WEIGHTS(OIdhw8i8o); MAYBE_WEIGHTS(OIdhw8o8i); MAYBE_WEIGHTS(OIdhw16i16o); MAYBE_WEIGHTS(OIdhw16o16i); + MAYBE_WEIGHTS(Oidhw4o); MAYBE_WEIGHTS(Oidhw16o); MAYBE_WEIGHTS(Odhwi16o); MAYBE_WEIGHTS(Odhwi8o); @@ -288,15 +294,18 @@ status_t cpu_memory_t::typed_zero_pad() { MAYBE_WEIGHTS(oIhw16i); MAYBE_WEIGHTS(oIdhw8i); MAYBE_WEIGHTS(oIdhw16i); + MAYBE_WEIGHTS(OIhw4i4o); MAYBE_WEIGHTS(OIhw8i8o); MAYBE_WEIGHTS(OIhw16i16o); MAYBE_WEIGHTS(OIhw4i16o4i); MAYBE_WEIGHTS(OIhw4i16o4i_s8s8); + MAYBE_WEIGHTS(OIw4i4o); MAYBE_WEIGHTS(Owi8o); MAYBE_WEIGHTS(OIw8i8o); MAYBE_WEIGHTS(OIw8o8i); MAYBE_WEIGHTS(OIw16i16o); MAYBE_WEIGHTS(OIw16o16i); + MAYBE_WEIGHTS(Oiw4o); MAYBE_WEIGHTS(Oiw16o); MAYBE_WEIGHTS(Owi16o); MAYBE_WEIGHTS(OIw8i16o2i); @@ -308,18 +317,27 @@ status_t 
cpu_memory_t::typed_zero_pad() { MAYBE_WEIGHTS(OIhw8o8i); MAYBE_WEIGHTS(OIhw16o16i); MAYBE_WEIGHTS(IOhw16o16i); + MAYBE_WEIGHTS(Oihw4o); MAYBE_WEIGHTS(Oihw16o); MAYBE_WEIGHTS(Ohwi8o); + MAYBE_WEIGHTS(Ohwi4o); MAYBE_WEIGHTS(Ohwi16o); + MAYBE_WEIGHTS(gOIhw4o4i_s8s8); + MAYBE_WEIGHTS(gOIhw4o4i_s8s8); + MAYBE_WEIGHTS(gOIhw4i4o); MAYBE_WEIGHTS(gOIhw8i8o); MAYBE_WEIGHTS(gOIhw16i16o); MAYBE_WEIGHTS(gOIhw4i16o4i); MAYBE_WEIGHTS(gOIhw4i16o4i_s8s8); + MAYBE_WEIGHTS(gOIhw2i8o4i); + MAYBE_WEIGHTS(gOIhw2i8o4i_s8s8); + MAYBE_WEIGHTS(gOIw4i4o); MAYBE_WEIGHTS(gOwi8o); MAYBE_WEIGHTS(gOIw8i8o); MAYBE_WEIGHTS(gOIw8o8i); MAYBE_WEIGHTS(gOIw16i16o); MAYBE_WEIGHTS(gOIw16o16i); + MAYBE_WEIGHTS(gOiw4o); MAYBE_WEIGHTS(gOiw16o); MAYBE_WEIGHTS(gOwi16o); MAYBE_WEIGHTS(gOIw8i16o2i); @@ -331,13 +349,17 @@ status_t cpu_memory_t::typed_zero_pad() { MAYBE_WEIGHTS(gOIhw8o8i); MAYBE_WEIGHTS(gOIhw16o16i); MAYBE_WEIGHTS(gIOhw16o16i); + MAYBE_WEIGHTS(gOihw4o); MAYBE_WEIGHTS(gOihw16o); MAYBE_WEIGHTS(gOhwi8o); + MAYBE_WEIGHTS(gOhwi4o); MAYBE_WEIGHTS(gOhwi16o); + MAYBE_WEIGHTS(gOIdhw4i4o); MAYBE_WEIGHTS(gOIdhw8i8o); MAYBE_WEIGHTS(gOIdhw8o8i); MAYBE_WEIGHTS(gOIdhw16i16o); MAYBE_WEIGHTS(gOIdhw16o16i); + MAYBE_WEIGHTS(gOidhw4o); MAYBE_WEIGHTS(gOidhw16o); MAYBE_WEIGHTS(gOdhwi16o); MAYBE_WEIGHTS(gOdhwi8o); @@ -354,8 +376,8 @@ status_t cpu_memory_t::typed_zero_pad() { return unimplemented; } -status_t cpu_memory_t::zero_pad() { - memory_desc_wrapper md(&conf_); +status_t cpu_memory_t::zero_pad() const { + memory_desc_wrapper md(pd()); const bool skip_zeroing = false || data_ == nullptr || md.is_zero() @@ -368,6 +390,7 @@ status_t cpu_memory_t::zero_pad() { case s16: return typed_zero_pad(); case s8: return typed_zero_pad(); case u8: return typed_zero_pad(); + case bin: return typed_zero_pad(); default: assert(!"memory is undefined"); return unimplemented; } return unimplemented; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_memory.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_memory.hpp index 9932e7b..830adcc 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_memory.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_memory.hpp @@ -49,12 +49,12 @@ struct cpu_memory_t: public cpu_primitive_t { } }; - cpu_memory_t(const pd_t *mpd) - : cpu_primitive_t(&conf_, input_vector(), output_vector(1, this)) - , conf_(*mpd), data_(nullptr) {} + cpu_memory_t(const pd_t *apd) + : cpu_primitive_t(apd, input_vector(), output_vector(1, this)) + , data_(nullptr) {} virtual ~cpu_memory_t() {} - virtual void execute(mkldnn::impl::event_t *e) + virtual void execute(mkldnn::impl::event_t *e) const { e->set_state(event_t::ready); } virtual status_t get_data_handle(void **handle) const { @@ -71,13 +71,14 @@ struct cpu_memory_t: public cpu_primitive_t { virtual const char* const_memory(size_t output_index = 0) const { assert(output_index == 0); return data_; } + mkldnn::impl::status_t zero_pad() const; + private: - pd_t conf_; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } char *data_; template - mkldnn::impl::status_t typed_zero_pad(); - mkldnn::impl::status_t zero_pad(); + mkldnn::impl::status_t typed_zero_pad() const; }; struct cpu_view_t: public cpu_primitive_t { @@ -168,12 +169,12 @@ struct cpu_view_t: public cpu_primitive_t { : view_pd_t(src_pd.engine()), src_pd_(src_pd), dst_pd_(dst_pd) {} }; - cpu_view_t(const pd_t *conf, const input_vector &inputs) - : cpu_primitive_t(&conf_, inputs, output_vector(1, this)), conf_(*conf) + cpu_view_t(const pd_t *apd, const input_vector &inputs) + : 
cpu_primitive_t(apd, inputs, output_vector(1, this)) {} virtual ~cpu_view_t() {} - virtual void execute(mkldnn::impl::event_t *e) + virtual void execute(mkldnn::impl::event_t *e) const { e->set_state(event_t::ready); } virtual char *memory(size_t output_index = 0) const @@ -182,7 +183,7 @@ struct cpu_view_t: public cpu_primitive_t { { assert(output_index == 0); return input_memory(); } private: - pd_t conf_; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; } diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_relu_forward_f32.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_primitive.cpp similarity index 60% rename from inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_relu_forward_f32.cpp rename to inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_primitive.cpp index 92e447c..80e06e7 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_relu_forward_f32.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_primitive.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2018 Intel Corporation +* Copyright 2018 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,27 +14,20 @@ * limitations under the License. *******************************************************************************/ -#include "mkldnn_test_common.hpp" -#include "gtest/gtest.h" +#include "cpu_primitive.hpp" +#include "cpu_memory.hpp" -#include "mkldnn.hpp" -#include "test_convolution_relu_forward_common.hpp" namespace mkldnn { +namespace impl { +namespace cpu { -using convolution_test = convolution_relu_test; - -TEST_P(convolution_test, TestConvolution) +const cpu_memory_t *cpu_primitive_t::output_memory_primitive(size_t index) const { + return static_cast(outputs()[index]); } -#define FP32 -#define DIRECTION_FORWARD -#include "convolution_common.h" - -#undef ELTWISE_ALPHA -#define ELTWISE_ALPHA 0.2f -#undef ELTWISE_BETA -#define ELTWISE_BETA 0.0f -#include "convolution_common.h" } +} +} + diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_primitive.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_primitive.hpp index 136aa26..13aa078 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_primitive.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_primitive.hpp @@ -21,18 +21,33 @@ #include "c_types_map.hpp" #include "event.hpp" +#include "memory_tracking.hpp" #include "primitive.hpp" +#include "scratchpad.hpp" namespace mkldnn { namespace impl { namespace cpu { +struct cpu_memory_t; + struct cpu_primitive_t: public primitive_t { cpu_primitive_t(const primitive_desc_t *pd, const input_vector &inputs, - const output_vector &outputs) - : primitive_t(pd, inputs, outputs) - {} - virtual ~cpu_primitive_t() {} + const output_vector &outputs, bool use_global_scratchpad = false) + : primitive_t(pd, inputs, outputs), scratchpad_buffer_(nullptr) + , global_scratchpad_(nullptr) + { + size_t scratchpad_size = this->pd()->scratchpad_registry().size(); + if (use_global_scratchpad) + global_scratchpad_ = create_scratchpad(scratchpad_size); + else + scratchpad_buffer_ = malloc(scratchpad_size, 64); + } + + virtual ~cpu_primitive_t() { + delete global_scratchpad_; + free(scratchpad_buffer_); + } virtual char *memory(size_t output_index = 0) const { if (output_index >= this->outputs().size()) return nullptr; @@ -54,6 +69,19 @@ struct 
cpu_primitive_t: public primitive_t { this->inputs()[index].primitive); return p->const_memory(oi); } + + const cpu_memory_t *output_memory_primitive(size_t index = 0) const; + +protected: + memory_tracking::grantor_t scratchpad() const { + return pd()->scratchpad_registry().grantor(global_scratchpad_ + ? global_scratchpad_->get() : scratchpad_buffer_); + } + +private: + /* quite ugly, but luckily both will get away in v1.0 */ + void *scratchpad_buffer_; + scratchpad_t *global_scratchpad_; }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reducer.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reducer.cpp index 116c4a8..1d41ac5 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reducer.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reducer.cpp @@ -27,6 +27,8 @@ namespace mkldnn { namespace impl { namespace cpu { +using namespace memory_tracking::names; + void reduce_balancer_t::balance() { using namespace nstl; using namespace utils; @@ -277,90 +279,88 @@ inline reducer_2d_driver_t *create_reduce_2d_drv(int n_src, /* cpu_reducer_t */ template -cpu_reducer_t::cpu_reducer_t(const reduce_balancer_t &balancer) - : balancer_(balancer), workspace_(nullptr) - , drv_(nullptr), barriers_(nullptr) -{ - allocate_workspace(); - if (balancer_.nthr_per_group_ > 1) { - barriers_ = (simple_barrier::ctx_t *)malloc( - balancer_.ngroups_ * sizeof(simple_barrier::ctx_t), 64); - for (int i = 0; i < balancer_.ngroups_; ++i) - simple_barrier::ctx_init(&barriers_[i]); - drv_ = create_reduce_2d_drv(balancer_.nthr_per_group_ - 1, - ws_per_thread(), 0, 0, false); - } -} +void cpu_reducer_t::conf_t::init_scratchpad( + memory_tracking::registrar_t &scratchpad) const { + if (balancer_.nthr_per_group_ == 1) return; -template -cpu_reducer_t::~cpu_reducer_t() { - deallocate_workspace(); - free(barriers_); - delete drv_; + const size_t space_size = balancer_.ngroups_ + * (balancer_.nthr_per_group_ - 1) + * cpu_reducer_t::space_per_thread(balancer_); + scratchpad.book(key_reducer_space, sizeof(data_t) * space_size, PAGE_4K); + scratchpad.book(key_reducer_space_bctx, + sizeof(simple_barrier::ctx_t) * balancer_.ngroups_); } template -void cpu_reducer_t::allocate_workspace() { - if (balancer_.nthr_per_group_ == 1) return; +cpu_reducer_t::cpu_reducer_t(const conf_t &conf) + : conf_(conf), drv_(nullptr) +{ + if (balancer().nthr_per_group_ == 1) return; - const size_t ws_size = balancer_.ngroups_ * (balancer_.nthr_per_group_ - 1) - * ws_per_thread(); - workspace_ = (data_t *)malloc(ws_size * sizeof(data_t), PAGE_4K); + drv_ = create_reduce_2d_drv(balancer().nthr_per_group_ - 1, + space_per_thread(balancer()), 0, 0, false); } template +cpu_reducer_t::~cpu_reducer_t() { delete drv_; } + +template typename cpu_reducer_t::data_t * -cpu_reducer_t::get_local_ptr(int ithr, data_t *dst) { - const int id_in_grp = balancer_.id_in_group(ithr); +cpu_reducer_t::get_local_ptr(int ithr, data_t *dst, + const memory_tracking::grantor_t &scratchpad) const { + const int id_in_grp = balancer().id_in_group(ithr); /* threads 0 from each group writes directly to the destination */ if (id_in_grp == 0) - return dst + balancer_.ithr_job_off(ithr) * balancer_.job_size_; + return dst + balancer().ithr_job_off(ithr) * balancer().job_size_; - const int grp_id = balancer_.group_id(ithr); - const int offset_factor = grp_id * (balancer_.nthr_per_group_ - 1) + const int grp_id = balancer().group_id(ithr); + const int offset_factor = grp_id * (balancer().nthr_per_group_ - 1) + (id_in_grp - 1); - return workspace_ + 
offset_factor * ws_per_thread(); + + auto space = scratchpad.template get(key_reducer_space); + return space + offset_factor * space_per_thread(balancer()); } template -void cpu_reducer_t::reduce_nolock(int ithr, data_t *dst) { - bool redundant_reduction = balancer_.nthr_per_group_ == 1 - || balancer_.idle(ithr); +void cpu_reducer_t::reduce_nolock(int ithr, data_t *dst, + const memory_tracking::grantor_t &scratchpad) const { + bool redundant_reduction = balancer().nthr_per_group_ == 1 + || balancer().idle(ithr); if (redundant_reduction) return; #ifdef SIMPLE_IMPL - if (balancer_.id_in_group(ithr) != 0) + if (balancer().id_in_group(ithr) != 0) return; /* only threads 0 do the reduction */ - const int njobs_in_grp = balancer_.ithr_njobs(ithr); - data_t *d = get_local_ptr(ithr, dst); + const int njobs_in_grp = balancer().ithr_njobs(ithr); + data_t *d = get_local_ptr(ithr, dst, scratchpad); for (int id_in_grp = 1; id_in_grp < balancer_.nthr_per_group_; ++id_in_grp) { - const data_t *wspace = get_local_ptr(ithr + id_in_grp, dst); - for (size_t i = 0; i < (size_t)njobs_in_grp * balancer_.job_size_; ++i) - d[i] += wspace[i]; + const data_t *space = get_local_ptr(ithr + id_in_grp, dst, scratchpad); + for (size_t i = 0; i < (size_t)njobs_in_grp * balancer().job_size_; ++i) + d[i] += space[i]; } #else using namespace utils; - const int id_in_grp = balancer_.id_in_group(ithr); - const int njobs_in_grp = balancer_.ithr_njobs(ithr); + const int id_in_grp = balancer().id_in_group(ithr); + const int njobs_in_grp = balancer().ithr_njobs(ithr); const size_t cl = 64 / sizeof(data_t); - const size_t reduction_size = njobs_in_grp * balancer_.job_size_; + const size_t reduction_size = njobs_in_grp * balancer().job_size_; size_t start{0}, end{0}; - balance211(div_up(reduction_size, cl), balancer_.nthr_per_group_, + balance211(div_up(reduction_size, cl), balancer().nthr_per_group_, id_in_grp, start, end); if (start == end) return; - data_t *d = get_local_ptr(ithr - id_in_grp, dst) + start * cl; - const data_t *wspace = get_local_ptr(ithr - id_in_grp + 1, dst) + data_t *d = get_local_ptr(ithr - id_in_grp, dst, scratchpad) + start * cl; + const data_t *space = get_local_ptr(ithr - id_in_grp + 1, dst, scratchpad) + start * cl; const size_t len = nstl::min(end * cl, reduction_size) - start * cl; - (*drv_)(d, wspace, 1, len); + (*drv_)(d, space, 1, len); #endif } @@ -370,69 +370,48 @@ template struct cpu_reducer_t; /* cpu_reducer_2d_t */ template -cpu_reducer_2d_t::cpu_reducer_2d_t( - const reduce_balancer_t &balancer, - int job_size_x, int job_size_y, int x_block, - int dst_x, int dst_y, bool master_uses_dst) - : balancer_(balancer), master_uses_dst_(master_uses_dst) - , job_size_x_(job_size_x), job_size_y_(job_size_y), x_block_(x_block) - , dst_x_(dst_x), dst_y_(dst_y), workspace_(nullptr), drv_(nullptr) - , barriers_(nullptr) -{ - allocate_workspace(); - if (balancer_.nthr_per_group_ > 1) { - barriers_ = (simple_barrier::ctx_t *)malloc( - balancer_.ngroups_ * sizeof(simple_barrier::ctx_t), 64); - for (int i = 0; i < balancer_.ngroups_; ++i) - simple_barrier::ctx_init(&barriers_[i]); - const int n_src = balancer_.nthr_per_group_ - master_uses_dst_; - drv_ = create_reduce_2d_drv(n_src, ws_per_thread(), - job_size_x_, dst_x_, !master_uses_dst_); - } -} +void cpu_reducer_2d_t::conf_t::init_scratchpad( + memory_tracking::registrar_t &scratchpad) const { + if (balancer_.nthr_per_group_ == 1) return; -template -cpu_reducer_2d_t::~cpu_reducer_2d_t() { - deallocate_workspace(); - free(barriers_); - delete drv_; + const 
size_t space_size = balancer_.ngroups_ * balancer_.nthr_per_group_ + * cpu_reducer_2d_t::space_per_thread(balancer_); + scratchpad.book(key_reducer_space, sizeof(data_t) * space_size); + scratchpad.book(key_reducer_space_bctx, + sizeof(simple_barrier::ctx_t) * balancer_.ngroups_); } template -void cpu_reducer_2d_t::allocate_workspace() { - if (balancer_.nthr_per_group_ == 1) return; +cpu_reducer_2d_t::cpu_reducer_2d_t(const conf_t &conf) + : conf_(conf), drv_(nullptr) +{ + if (balancer().nthr_per_group_ == 1) return; - const size_t ws_size = balancer_.ngroups_ - * (balancer_.nthr_per_group_ - master_uses_dst_) - * ws_per_thread(); - workspace_ = (data_t *)malloc(ws_size * sizeof(data_t), 64); + drv_ = create_reduce_2d_drv(balancer().nthr_per_group_, + space_per_thread(balancer()), conf_.job_size_x_, conf_.dst_x_, + true); } template -typename cpu_reducer_2d_t::data_t * -cpu_reducer_2d_t::get_local_ptr(int ithr, data_t *dst) { - const int id_in_grp = balancer_.id_in_group(ithr); - - /* master threads from each group should write directly to the destination - * if they are allowed to use it */ - if (master_uses_dst_ && id_in_grp == 0) { - assert(!"unsupported"); - return dst + balancer_.ithr_job_off(ithr) * balancer_.job_size_; - } +cpu_reducer_2d_t::~cpu_reducer_2d_t() { delete drv_; } - const int grp_id = balancer_.group_id(ithr); - const int offset_factor - = grp_id * (balancer_.nthr_per_group_ - master_uses_dst_) - + (id_in_grp - master_uses_dst_); - return workspace_ + offset_factor * ws_per_thread(); +template +typename cpu_reducer_2d_t::data_t *cpu_reducer_2d_t:: +get_local_ptr(int ithr, const memory_tracking::grantor_t &scratchpad) const { + const int id_in_grp = balancer().id_in_group(ithr); + const int grp_id = balancer().group_id(ithr); + const int offset_factor = grp_id * balancer().nthr_per_group_ + id_in_grp; + auto space = scratchpad.template get(key_reducer_space); + return space + offset_factor * space_per_thread(balancer()); } template int cpu_reducer_2d_t::choose_x_blocking(int nx, int ny, - int nthr_per_grp) { + int nthr_per_grp) const { // find x_blocking for better balance reducing work between threads - assert(x_block_ > 0 && nx > x_block_ && nx % x_block_ == 0); - int x_blocking = nx / x_block_; + assert(conf_.x_block_ > 0 && nx > conf_.x_block_ + && nx % conf_.x_block_ == 0); + int x_blocking = nx / conf_.x_block_; int min_x_blocking = utils::div_up(x_blocking, nstl::max(1, nthr_per_grp / ny)); while (true) { @@ -444,48 +423,49 @@ int cpu_reducer_2d_t::choose_x_blocking(int nx, int ny, break; } if (x_blocking >= min_x_blocking * 4) x_blocking = 1; - x_blocking *= x_block_; + x_blocking *= conf_.x_block_; return x_blocking; } template -void cpu_reducer_2d_t::reduce_block(const data_t* wspace_base, - data_t *dst, int job, int start_y, int start_x, - int ny_start, int nx_start, int ny_step, int nx_step) { - data_t *d = dst + (start_y + ny_start) * dst_x_ +void cpu_reducer_2d_t::reduce_block(const data_t* space_base, + data_t *dst, int job, int start_y, int start_x, + int ny_start, int nx_start, int ny_step, int nx_step) const { + data_t *d = dst + (start_y + ny_start) * conf_.dst_x_ + start_x + nx_start; - const data_t *wspace = wspace_base + job * balancer_.job_size_ - + ny_start * job_size_x_ + nx_start; + const data_t *space = space_base + job * balancer().job_size_ + + ny_start * conf_.job_size_x_ + nx_start; #ifdef SIMPLE_IMPL - const int idg_start = master_uses_dst_ ? 
1 : 0; - for (int idg = idg_start; idg < balancer_.nthr_per_group_; ++idg) { - const data_t *w = &wspace[(idg - idg_start) * ws_per_thread()]; + for (int idg = 0; idg < balancer().nthr_per_group_; ++idg) { + const data_t *w = &space[idg * space_per_thread(balancer())]; for (int y = 0; y < ny_step; ++y) for (int x = 0; x < nx_step; ++x) { - d[y * dst_x_ + x] = (idg == 0 ? 0 : d[y * dst_x_ + x]) - + w[y * job_size_x_ + x]; + d[y * conf_.dst_x_ + x] + = (idg == 0 ? 0 : d[y * conf_.dst_x_ + x]) + + w[y * conf_.job_size_x_ + x]; } } #else - (*drv_)(d, wspace, ny_step, nx_step); + (*drv_)(d, space, ny_step, nx_step); #endif } template -void cpu_reducer_2d_t::reduce_nolock(int ithr, data_t *dst) { - bool redundant_reduction = balancer_.nthr_per_group_ == 1 - || balancer_.idle(ithr); +void cpu_reducer_2d_t::reduce_nolock(int ithr, data_t *dst, + const memory_tracking::grantor_t &scratchpad) const { + bool redundant_reduction = balancer().nthr_per_group_ == 1 + || balancer().idle(ithr); if (redundant_reduction) return; - const int id_in_grp = balancer_.id_in_group(ithr); - const int njobs_in_grp = balancer_.ithr_njobs(ithr); - const int njobs_x = utils::div_up(dst_x_, job_size_x_); - const int global_job_start = balancer_.ithr_job_off(ithr); + const int id_in_grp = balancer().id_in_group(ithr); + const int njobs_in_grp = balancer().ithr_njobs(ithr); + const int njobs_x = utils::div_up(conf_.dst_x_, conf_.job_size_x_); + const int global_job_start = balancer().ithr_job_off(ithr); - const data_t *wspace_base = get_local_ptr(ithr - id_in_grp, nullptr); + const data_t *space_base = get_local_ptr(ithr - id_in_grp, scratchpad); - const int pr_grps = nstl::min(njobs_in_grp, balancer_.nthr_per_group_); - const int pr_nthr_per_grp = balancer_.nthr_per_group_ / pr_grps; + const int pr_grps = nstl::min(njobs_in_grp, balancer().nthr_per_group_); + const int pr_nthr_per_grp = balancer().nthr_per_group_ / pr_grps; if (id_in_grp >= pr_grps * pr_nthr_per_grp) return; /* idle */ @@ -500,10 +480,10 @@ void cpu_reducer_2d_t::reduce_nolock(int ithr, data_t *dst) { const int global_job = global_job_start + j; const int j_y = global_job / njobs_x; const int j_x = global_job % njobs_x; - const int start_y = j_y * job_size_y_; - const int start_x = j_x * job_size_x_; - const int ny = nstl::min(dst_y_ - start_y, job_size_y_); - const int nx = nstl::min(dst_x_ - start_x, job_size_x_); + const int start_y = j_y * conf_.job_size_y_; + const int start_x = j_x * conf_.job_size_x_; + const int ny = nstl::min(conf_.dst_y_ - start_y, conf_.job_size_y_); + const int nx = nstl::min(conf_.dst_x_ - start_x, conf_.job_size_x_); int x_blocking = choose_x_blocking(nx, ny, pr_nthr_per_grp); int nxy_start{0}, nxy_end{0}; @@ -516,18 +496,18 @@ void cpu_reducer_2d_t::reduce_nolock(int ithr, data_t *dst) { int nxy = nxy_start; if (nxy % nx != 0) { int nx_step = nstl::min(nx - nxy % nx, nxy_end - nxy); - reduce_block(wspace_base, dst, j, start_y, start_x, + reduce_block(space_base, dst, j, start_y, start_x, nxy / nx, nxy % nx, 1, nx_step); nxy += nx_step; } if ((nxy_end - nxy) > nx) { int ny_step = (nxy_end - nxy) / nx; - reduce_block(wspace_base, dst, j, start_y, start_x, + reduce_block(space_base, dst, j, start_y, start_x, nxy / nx, nxy % nx, ny_step, nx); nxy += nx * ny_step; } if ((nxy_end - nxy) > 0) { - reduce_block(wspace_base, dst, j, start_y, start_x, + reduce_block(space_base, dst, j, start_y, start_x, nxy / nx, nxy % nx, 1, nxy_end - nxy); } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reducer.hpp 
b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reducer.hpp index 6c36419..27f5939 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reducer.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reducer.hpp @@ -20,6 +20,7 @@ #include #include "c_types_map.hpp" +#include "memory_tracking.hpp" #include "mkldnn_thread.hpp" #include "mkldnn_types.h" #include "nstl.hpp" @@ -63,12 +64,23 @@ namespace cpu { * Intel(R) TBB) the # of threads per group is enforced to be 1. */ struct reduce_balancer_t { + reduce_balancer_t() { init(1, 1, 1, 1, 0); } /* trivial balance */ reduce_balancer_t(int nthr, int job_size, int njobs, int reduction_size, size_t max_buffer_size) - : syncable_(mkldnn_thr_syncable()), nthr_(nthr), job_size_(job_size) - , njobs_(njobs), reduction_size_(reduction_size) - , max_buffer_size_(max_buffer_size) - { balance(); } + { init(nthr, job_size, njobs, reduction_size, max_buffer_size); } + + reduce_balancer_t &init(int nthr, int job_size, int njobs, + int reduction_size, size_t max_buffer_size) + { + syncable_ = mkldnn_thr_syncable(); + nthr_ = nthr; + job_size_ = job_size; + njobs_ = njobs; + reduction_size_ = reduction_size; + max_buffer_size_ = max_buffer_size; + balance(); + return *this; + } bool syncable_; int nthr_; @@ -154,14 +166,29 @@ template struct cpu_reducer_t { typedef typename prec_traits::type data_t; - cpu_reducer_t(const reduce_balancer_t &balancer); + struct conf_t { + conf_t() = default; + conf_t &init(const reduce_balancer_t &balancer) + { balancer_ = balancer; return *this; } + + void init_scratchpad(memory_tracking::registrar_t &scratchpad) const; + + reduce_balancer_t balancer_; + }; + + cpu_reducer_t(const conf_t &conf); ~cpu_reducer_t(); - /** allocates internal buffer for partial computations. */ - void allocate_workspace(); + /** initializes the reducer. + * Must be called from a single thread prior to actual usage */ + void init(const memory_tracking::grantor_t &scratchpad) const { + if (balancer().nthr_per_group_ == 1) return; - /** deallocates internal buffer. */ - void deallocate_workspace() { if (workspace_) free(workspace_); } + auto bctx = scratchpad.template get<simple_barrier::ctx_t>( + memory_tracking::names::key_reducer_space_bctx); + for (int i = 0; i < balancer().ngroups_; ++i) + simple_barrier::ctx_init(&bctx[i]); + } /** for a given thread, returns the pointer where to put partial results. * Reduction destination @p dst must be provided as well (master threads @@ -172,86 +199,118 @@ * threads should start writing from the very beginning of returned * address. */ - data_t *get_local_ptr(int ithr, data_t *dst); + data_t *get_local_ptr(int ithr, data_t *dst, + const memory_tracking::grantor_t &scratchpad) const; /** performs the reduction with built-in synchronization.
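 *
 * A minimal per-thread usage sketch (hedged: `reducer`, `dst`, and
 * compute_partial() are hypothetical caller-side names; `scratchpad` is
 * the executing primitive's memory_tracking::grantor_t):
 *
 *   parallel(0, [&](int ithr, int nthr) {
 *       data_t *ws = reducer->get_local_ptr(ithr, dst, scratchpad);
 *       compute_partial(ithr, ws);              // fill the local space
 *       reducer->reduce(ithr, dst, scratchpad); // barrier, then accumulate
 *   });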
*/ - void reduce(int ithr, data_t *dst) { - bool redundant_reduction = balancer_.nthr_per_group_ == 1 - || balancer_.idle(ithr); + void reduce(int ithr, data_t *dst, + const memory_tracking::grantor_t &scratchpad) const { + bool redundant_reduction = balancer().nthr_per_group_ == 1 + || balancer().idle(ithr); if (redundant_reduction) return; - simple_barrier::barrier(&barriers_[balancer_.group_id(ithr)], - balancer_.nthr_per_group_); - reduce_nolock(ithr, dst); + auto bctx = scratchpad.template get<simple_barrier::ctx_t>( + memory_tracking::names::key_reducer_space_bctx); + simple_barrier::barrier(&bctx[balancer().group_id(ithr)], + balancer().nthr_per_group_); + + reduce_nolock(ithr, dst, scratchpad); } - reduce_balancer_t balancer_; + const reduce_balancer_t &balancer() const { return conf_.balancer_; } private: - size_t ws_per_thread() const - { return balancer_.njobs_per_group_ub_ * balancer_.job_size_; } + static size_t space_per_thread(const reduce_balancer_t &balancer) + { return balancer.njobs_per_group_ub_ * balancer.job_size_; } + + /* The scratchpad is organized as follows: + * + * data_t space[nthr_][njobs_per_group_ub_][job_size_]; + * simple_barrier::ctx_t barriers[groups_]; */ - data_t *workspace_; /** data_t[nthr_][njobs_per_group_ub_][jobs_size_] */ + const conf_t conf_; reducer_2d_driver_t *drv_; - simple_barrier::ctx_t *barriers_; /** barrier::ctx_t[groups_] */ - void reduce_nolock(int ithr, data_t *dst); + void reduce_nolock(int ithr, data_t *dst, + const memory_tracking::grantor_t &scratchpad) const; }; template struct cpu_reducer_2d_t { typedef typename prec_traits::type data_t; - cpu_reducer_2d_t(const reduce_balancer_t &balancer, int job_size_x, - int job_size_y, int x_block, int dst_x, int dst_y, - bool master_uses_dst); + struct conf_t { + conf_t() = default; + conf_t &init(const reduce_balancer_t &balancer, int job_size_x, + int job_size_y, int x_block, int dst_x, int dst_y) { + balancer_ = balancer; + job_size_x_ = job_size_x; + job_size_y_ = job_size_y; + x_block_ = x_block; + dst_x_ = dst_x; + dst_y_ = dst_y; + return *this; + } + + void init_scratchpad(memory_tracking::registrar_t &scratchpad) const; + + reduce_balancer_t balancer_; + int job_size_x_, job_size_y_, x_block_, dst_x_, dst_y_; + }; + + cpu_reducer_2d_t(const conf_t &conf); ~cpu_reducer_2d_t(); - /** allocates internal buffer for partial computations. */ - void allocate_workspace(); + /** initializes the reducer. + * Must be called from a single thread prior to actual usage */ + void init(const memory_tracking::grantor_t &scratchpad) const { + if (balancer().nthr_per_group_ == 1) return; - /** deallocates internal buffer. */ - void deallocate_workspace() { if (workspace_) free(workspace_); } + auto bctx = scratchpad.template get<simple_barrier::ctx_t>( + memory_tracking::names::key_reducer_space_bctx); + for (int i = 0; i < balancer().ngroups_; ++i) + simple_barrier::ctx_init(&bctx[i]); + } - /** for given thread returns the pointer where to put partial results. - * Depending on @p master_uses_dst_ returned pointer for master threads - * would be either equal to the destination memory or to the workspace (in - * contrast, cpu_reducer_t struct always use destination memory for master - * threads). - * - * @note: @p master_uses_dst_ == #false is unimplemented at the moment - */ - data_t *get_local_ptr(int ithr, data_t *dst); + /** for a given thread, returns the pointer where to put partial results */ + data_t *get_local_ptr(int ithr, + const memory_tracking::grantor_t &scratchpad) const; /** performs the reduction with built-in synchronization.
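 *
 * End to end, the scratchpad-driven lifecycle looks roughly like this
 * (a sketch, not verbatim library code; `registrar` and `scratchpad` are
 * assumed to come from the owning primitive descriptor and primitive):
 *
 *   conf_t conf;
 *   conf.init(balancer, job_size_x, job_size_y, x_block, dst_x, dst_y);
 *   conf.init_scratchpad(registrar);   // books key_reducer_space{,_bctx}
 *   cpu_reducer_2d_t<data_type::f32> reducer(conf);
 *   reducer.init(scratchpad);          // one-time barrier-ctx init
 *   // per thread: write partials via get_local_ptr(ithr, scratchpad),
 *   // then call reduce(ithr, dst, scratchpad)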
*/ - void reduce(int ithr, data_t *dst) { - bool redundant_reduction = balancer_.nthr_per_group_ == 1 - || balancer_.idle(ithr); + void reduce(int ithr, data_t *dst, + const memory_tracking::grantor_t &scratchpad) const { + bool redundant_reduction = balancer().nthr_per_group_ == 1 + || balancer().idle(ithr); if (redundant_reduction) return; - simple_barrier::barrier(&barriers_[balancer_.group_id(ithr)], - balancer_.nthr_per_group_); - reduce_nolock(ithr, dst); + auto bctx = scratchpad.template get<simple_barrier::ctx_t>( + memory_tracking::names::key_reducer_space_bctx); + simple_barrier::barrier(&bctx[balancer().group_id(ithr)], + balancer().nthr_per_group_); + + reduce_nolock(ithr, dst, scratchpad); } - reduce_balancer_t balancer_; - bool master_uses_dst_; + const reduce_balancer_t &balancer() const { return conf_.balancer_; } private: - int job_size_x_, job_size_y_, x_block_, dst_x_, dst_y_; + static size_t space_per_thread(const reduce_balancer_t &balancer) + { return balancer.njobs_per_group_ub_ * balancer.job_size_; } - size_t ws_per_thread() const - { return balancer_.njobs_per_group_ub_ * balancer_.job_size_; } + /* The scratchpad is organized as follows: + * + * data_t space[nthr_][njobs_per_group_ub_][job_size_]; + * simple_barrier::ctx_t barriers[groups_]; */ - data_t *workspace_; /** data_t[nthr_][njobs_per_group_ub_][jobs_size_] */ + const conf_t conf_; reducer_2d_driver_t *drv_; - simple_barrier::ctx_t *barriers_; /** barrier::ctx_t[groups_] */ - int choose_x_blocking(int nx, int ny, int nthr_per_grp); - void reduce_block(const data_t* wspace_base, - data_t *dst, int job, int start_y, int start_x, - int ny_start, int nx_start, int ny_step, int nx_step); - void reduce_nolock(int ithr, data_t *dst); + int choose_x_blocking(int nx, int ny, int nthr_per_grp) const; + void reduce_block(const data_t* space_base, data_t *dst, + int job, int start_y, int start_x, + int ny_start, int nx_start, int ny_step, int nx_step) const; + void reduce_nolock(int ithr, data_t *dst, + const memory_tracking::grantor_t &scratchpad) const; }; /** simple 1d accumulator: y[:] += x[:] */ diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reorder.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reorder.cpp index eee668b..3020847 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reorder.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reorder.cpp @@ -23,6 +23,7 @@ #include "cpu/jit_uni_reorder.hpp" #include "cpu/simple_reorder.hpp" #include "cpu/wino_reorder.hpp" +#include "cpu/rnn/rnn_reorders.hpp" namespace mkldnn { namespace impl { @@ -50,9 +51,22 @@ static const rpd_create_f cpu_reorder_impl_list[] = { wino_reorder_t::pd_t::create, wino_reorder_t::pd_t::create, + /* rnn reorders */ + rnn_data_reorder_t::pd_t::create, + rnn_weights_reorder_t::pd_t::create, + rnn_weights_reorder_t::pd_t::create, + +#if defined(__INTEL_COMPILER) || (defined(__GNUC__) && !defined(__clang__)) + /* Direct copy for icc, which is faster than jitted code; + * direct copy for gcc, which might or might not be faster than jitted + * code but is still worth it because it doesn't require jitting, i.e., + * much faster creation time. This is a tentative solution and should be + * removed later (once we cache jitted code?...).
*/ + REG_SR_DIRECT_COPY(f32, f32), +#endif + #ifdef __INTEL_COMPILER /* direct copy for icc, which is faster than jitted code */ - REG_SR_DIRECT_COPY(f32, f32), REG_SR_DIRECT_COPY(f32, s32), REG_SR_DIRECT_COPY(f32, s8), // REG_SR_DIRECT_COPY(f32, u8), FIXME: Disabled due to accuracy failure on int8 network @@ -73,10 +87,18 @@ static const rpd_create_f cpu_reorder_impl_list[] = { /* jit */ jit_uni_reorder_create, - /* fp32: flat <-> blocked with< tail */ + /* fp32: flat <-> blocked with tail */ + REG_SR_BIDIR(f32, any, f32, nCw4c), + + REG_SR_BIDIR(f32, nchw, bin, nhwc), + REG_SR_BIDIR(f32, nhwc, bin, nhwc), + REG_SR_DIRECT_COPY(bin, bin), + REG_SR_BIDIR(f32, any, f32, nCw8c), + REG_SR_BIDIR(f32, any, f32, OIw4i4o), REG_SR_BIDIR(f32, any, f32, OIw8i8o), REG_SR_BIDIR(f32, any, f32, OIw8o8i), + REG_SR_BIDIR(f32, any, f32, gOIw4i4o), REG_SR_BIDIR(f32, any, f32, gOIw8i8o), REG_SR_BIDIR(f32, any, f32, gOIw8o8i), @@ -88,46 +110,57 @@ static const rpd_create_f cpu_reorder_impl_list[] = { REG_SR_BIDIR(f32, any, f32, gOIw16i16o), REG_SR_BIDIR(f32, any, f32, gIOw16o16i), + REG_SR_BIDIR(f32, any, f32, nChw4c), REG_SR_BIDIR(f32, any, f32, nChw8c), + REG_SR_BIDIR(f32, any, f32, OIhw4i4o), REG_SR_BIDIR(f32, any, f32, Ohwi8o), REG_SR_BIDIR(f32, any, f32, OIhw8i8o), REG_SR_BIDIR(f32, any, f32, OIhw8o8i), + REG_SR_BIDIR(f32, any, f32, gOIhw4i4o), + REG_SR_BIDIR(f32, any, f32, gOIhw4o4i), REG_SR_BIDIR(f32, any, f32, gOhwi8o), REG_SR_BIDIR(f32, any, f32, gOIhw8i8o), REG_SR_BIDIR(f32, any, f32, gOIhw8o8i), REG_SR_BIDIR(f32, any, f32, nChw16c), + REG_SR_BIDIR(f32, any, f32, Oihw4o), REG_SR_BIDIR(f32, any, f32, Oihw16o), + REG_SR_BIDIR(f32, any, f32, Ohwi4o), REG_SR_BIDIR(f32, any, f32, Ohwi16o), REG_SR_BIDIR(f32, any, f32, OIhw16o16i), REG_SR_BIDIR(f32, any, f32, OIhw16i16o), REG_SR_BIDIR(f32, any, f32, IOhw16o16i), + REG_SR_BIDIR(f32, any, f32, gOihw4o), REG_SR_BIDIR(f32, any, f32, gOihw16o), + REG_SR_BIDIR(f32, any, f32, gOhwi4o), REG_SR_BIDIR(f32, any, f32, gOhwi16o), REG_SR_BIDIR(f32, any, f32, gOIhw16o16i), REG_SR_BIDIR(f32, any, f32, gOIhw16i16o), REG_SR_BIDIR(f32, any, f32, gIOhw16o16i), + REG_SR_BIDIR(f32, any, f32, nCdhw4c), REG_SR_BIDIR(f32, any, f32, nCdhw8c), + REG_SR_BIDIR(f32, any, f32, OIdhw4i4o), REG_SR_BIDIR(f32, any, f32, Odhwi8o), REG_SR_BIDIR(f32, any, f32, OIdhw8i8o), REG_SR_BIDIR(f32, any, f32, OIdhw8o8i), + REG_SR_BIDIR(f32, any, f32, gOIdhw4i4o), REG_SR_BIDIR(f32, any, f32, gOdhwi8o), REG_SR_BIDIR(f32, any, f32, gOIdhw8i8o), REG_SR_BIDIR(f32, any, f32, gOIdhw8o8i), REG_SR_BIDIR(f32, any, f32, nCdhw16c), + REG_SR_BIDIR(f32, any, f32, Oidhw4o), REG_SR_BIDIR(f32, any, f32, Oidhw16o), REG_SR_BIDIR(f32, any, f32, Odhwi16o), REG_SR_BIDIR(f32, any, f32, OIdhw16o16i), REG_SR_BIDIR(f32, any, f32, OIdhw16i16o), + REG_SR_BIDIR(f32, any, f32, gOidhw4o), REG_SR_BIDIR(f32, any, f32, gOidhw16o), REG_SR_BIDIR(f32, any, f32, gOdhwi16o), REG_SR_BIDIR(f32, any, f32, gOIdhw16o16i), REG_SR_BIDIR(f32, any, f32, gOIdhw16i16o), - REG_SR_BIDIR(f32, nChw8c, f32, nChw16c), - /* WA to prevent fallback on reference implementations */ REG_SR_DIRECT_COPY(u8, f32), REG_SR_DIRECT_COPY(u8, s8), @@ -135,6 +168,11 @@ static const rpd_create_f cpu_reorder_impl_list[] = { REG_SR_DIRECT_COPY(u8, u8), REG_SR_DIRECT_COPY(s8, s8), + /* fp32: blocked <-> blocked with tail */ + REG_SR_BIDIR(f32, nCw8c, f32, nCw16c), + REG_SR_BIDIR(f32, nChw8c, f32, nChw16c), + REG_SR_BIDIR(f32, nCdhw8c, f32, nCdhw16c), + /* int: flat <-> blocked with tail */ REG_SR(f32, nChw8c, u8, nhwc, fmt_order::keep), REG_SR(f32, nChw8c, s8, nhwc, 
fmt_order::keep), @@ -207,15 +245,27 @@ static const rpd_create_f cpu_reorder_impl_list[] = { REG_SR(f32, goihw, s8, gOhIw8o4i_s8s8, fmt_order::keep), REG_SR(s8, goihw, s8, gOhIw8o4i_s8s8, fmt_order::keep), + REG_SR(bin, any, bin, OhIw8o32i, fmt_order::keep), + REG_SR(bin, any, bin, OhIw16o32i, fmt_order::keep), + REG_SR(f32, any, s8, hwio_s8s8, fmt_order::keep), - REG_SR(s8, any, s8, hwio_s8s8, fmt_order::keep), REG_SR(f32, any, s8, hwigo_s8s8, fmt_order::keep), + REG_SR(s8, any, s8, hwio_s8s8, fmt_order::keep), REG_SR(s8, any, s8, hwigo_s8s8, fmt_order::keep), + + REG_SR(f32, goihw, s8, gOIhw4o4i_s8s8, fmt_order::keep), + REG_SR(s8, goihw, s8, gOIhw4o4i_s8s8, fmt_order::keep), + REG_SR(f32, oihw, s8, OIhw4i16o4i_s8s8, fmt_order::keep), - REG_SR(s8, oihw, s8, OIhw4i16o4i_s8s8, fmt_order::keep), REG_SR(f32, goihw, s8, gOIhw4i16o4i_s8s8, fmt_order::keep), + REG_SR(s8, oihw, s8, OIhw4i16o4i_s8s8, fmt_order::keep), REG_SR(s8, goihw, s8, gOIhw4i16o4i_s8s8, fmt_order::keep), + REG_SR(f32, goihw, s8, gOIhw2i8o4i_s8s8, fmt_order::keep), + REG_SR(s8, goihw, s8, gOIhw2i8o4i_s8s8, fmt_order::keep), + + REG_SR(f32, goihw, s8, Goihw16g_s8s8, fmt_order::keep), + REG_SR(s8, goihw, s8, Goihw16g_s8s8, fmt_order::keep), /* s16 <-> s16 */ REG_SR_DIRECT_COPY(s16, s16), diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reorder_pd.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reorder_pd.hpp index f929a9e..2fac7c7 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reorder_pd.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reorder_pd.hpp @@ -40,7 +40,7 @@ struct cpu_reorder_pd_t: public reorder_pd_t { , input_pd_(*input_pd), output_pd_(*output_pd) {} virtual ~cpu_reorder_pd_t() {} - virtual status_t init() const { + virtual status_t init() { const auto &post_ops = attr()->post_ops_; bool args_ok = true && IMPLICATION(post_ops.len_ != 0, diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_sum.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_sum.hpp index 00769ad..34a0f4f 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_sum.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_sum.hpp @@ -53,7 +53,7 @@ namespace cpu { } \ return ret; \ } \ - virtual pd_t *clone() const override { return nullptr; } \ + virtual pd_t *clone() const override { return new pd_t(*this); } \ virtual const char *name() const override { return impl_name; } #define DECLARE_CPU_SUM_PD_T(impl_name, ...) \ DECLARE_CPU_SUM_PD_t(impl_name, __VA_ARGS__) diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/gemm_utils.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/gemm_utils_f32.cpp similarity index 95% rename from inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/gemm_utils.cpp rename to inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/gemm_utils_f32.cpp index e3b6cff..a9810de 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/gemm_utils.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/gemm_utils_f32.cpp @@ -13,10 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*******************************************************************************/ -#include +#include #include "mkldnn_thread.hpp" #include "utils.hpp" +#include "gemm_utils_f32.hpp" namespace mkldnn { namespace impl { @@ -344,8 +345,9 @@ void partition_unit_diff( // Sum the m*n values from p_src into p_dst, assuming the two-dimensional // arrays have leading dimensions ld_src and ld_dst, respectively template -void sum_two_matrices( - int m, int n, data_t *p_src, int ld_src, data_t *p_dst, int ld_dst) +void sum_two_matrices(int m, int n, + data_t * __restrict p_src, dim_t ld_src, + data_t * __restrict p_dst, dim_t ld_dst) { int i, j; for (j = 0; j < n; j++) { @@ -355,11 +357,15 @@ void sum_two_matrices( } } -template void sum_two_matrices( - int m, int n, float *p_src, int ld_src, float *p_dst, int ld_dst); +template +void sum_two_matrices(int m, int n, + float * __restrict p_src, dim_t ld_src, + float * __restrict p_dst, dim_t ld_dst); -template void sum_two_matrices( - int m, int n, double *p_src, int ld_src, double *p_dst, int ld_dst); +template +void sum_two_matrices(int m, int n, + double * __restrict p_src, dim_t ld_src, + double * __restrict p_dst, dim_t ld_dst); } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/gemm_utils.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/gemm_utils_f32.hpp similarity index 89% rename from inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/gemm_utils.hpp rename to inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/gemm_utils_f32.hpp index 0888787..3352298 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/gemm_utils.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/gemm_utils_f32.hpp @@ -22,6 +22,8 @@ namespace impl { namespace cpu { namespace gemm_utils { +// Alias for any dimension related variable. +typedef ptrdiff_t dim_t; template struct gemm_traits {}; @@ -47,9 +49,10 @@ struct gemm_traits { template using unroll_factor = gemm_traits; -template -void sum_two_matrices( - int m, int n, data_type *p_src, int ld_src, data_type *p_dst, int ld_dst); +template +void sum_two_matrices(int m, int n, + data_t * __restrict p_src, dim_t ld_src, + data_t * __restrict p_dst, dim_t ld_dst); void calc_nthr_nocopy_avx512_common(int m, int n, int k, int nthrs, int *nthrs_m, int *nthrs_n, int *nthrs_k, @@ -61,8 +64,6 @@ void calc_nthr_nocopy_avx(int m, int n, int k, void partition_unit_diff( int ithr, int nthr, int n, int *t_offset, int *t_block); - -inline double saturate(double value, double min, double max); }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/jit_avx512_common_gemm_f32.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/jit_avx512_common_gemm_f32.cpp similarity index 91% rename from inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/jit_avx512_common_gemm_f32.cpp rename to inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/jit_avx512_common_gemm_f32.cpp index 8aee85f..d7be43e 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/jit_avx512_common_gemm_f32.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/jit_avx512_common_gemm_f32.cpp @@ -14,24 +14,24 @@ * limitations under the License. 
*******************************************************************************/ -#include +#include +#include #include "mkldnn_thread.hpp" #include "utils.hpp" -#include "gemm_utils.hpp" +#include "ref_gemm_f32.hpp" +#include "gemm_utils_f32.hpp" #include "jit_avx512_common_gemm_f32.hpp" -#define CACHE_LINE_SIZE 64 +#include "jit_generator.hpp" namespace mkldnn { namespace impl { namespace cpu { -using namespace mkldnn::impl::memory_format; -using namespace mkldnn::impl::utils; +#define CACHE_LINE_SIZE 64 -using namespace Xbyak; #define STACKSIZE get_size_of_abi_save_regs() #ifdef _WIN32 #define STACK_K_CAPACITY 32 @@ -45,17 +45,22 @@ using namespace Xbyak; #define UNROLL_M 48 #define UNROLL_N 8 -struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator { - xbyak_gemm(char transa, char transb, float beta, bool hasBias = false, +namespace avx512_common_gemm_f32 { +using namespace gemm_utils; + +struct xbyak_gemm : public jit_generator { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_common_gemm_f32_xbyak_gemm) + + xbyak_gemm(char isTransA, char isTransB, float beta, bool hasBias = false, void *code_ptr = nullptr, size_t code_size = 80 * Xbyak::DEFAULT_MAX_CODE_SIZE) : jit_generator(code_ptr, code_size) { + using namespace Xbyak; + enum { ver_avx512_core, ver_avx512_mic } ver = mayiuse(avx512_core) ? ver_avx512_core : ver_avx512_mic; - bool isTransA = (transa == 'T' || transa == 't'); - bool isTransB = (transb == 'T' || transb == 't'); bool isBeta0 = (beta == 0.0); bool isBetaN = (!isBeta0 && beta != 1.0); @@ -1698,34 +1703,55 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator { vzeroupper(); postamble(); - ker_ = reinterpret_cast( - const_cast(this->getCode())); + ker_ = this->getCode(); } - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_common_gemm_f32_xbyak_gemm) + typedef void (*ker_t)(dim_t m, dim_t n, dim_t k, + const float *alpha, const float *a, dim_t lda, + const float *b, dim_t ldb, const float *beta, float *c, + dim_t ldc, const float *bias, float *ws); - void operator()(long long int m, long long int n, long long int k, - const float *alpha, const float *a, long long int lda, - const float *b, long long int ldb, const float *beta, float *c, - long long int ldc, const float *bias, float *ws) + void operator()(dim_t m, dim_t n, dim_t k, + const float *alpha, const float *a, dim_t lda, + const float *b, dim_t ldb, const float *beta, float *c, + dim_t ldc, const float *bias, float *ws) const { - (*ker_)(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, bias, ws); + ker_(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, bias, ws); } private: - void (*ker_)(long long int m, long long int n, long long int k, - const float *alpha, const float *a, long long int lda, - const float *b, long long int ldb, const float *beta, float *c, - long long int ldc, const float *bias, float *ws); + ker_t ker_; }; -typedef void (*ker)(long long int, long long int, long long int, float *, - float *, long long int, float *, long long int, float *, float *, - long long int, float *, float *); -void jit_avx512_common_gemm_f32::sgemm_nocopy_driver(const char *transa, +const xbyak_gemm *get_xbyak_gemm( + bool isTransA, bool isTransB, float beta, bool hasBias) { + auto beta_idx = [](float beta) { + return (beta == 0.0) ? 0 : (beta == 1.0 ? 
1 : 2); + }; + + // Kernel table [isTransA][isTransB][hasBias][beta (0, 1, other)] + static xbyak_gemm *kernel_table[2][2][2][3]; + static std::once_flag initialized; + std::call_once(initialized, [=]{ + for (bool isTransA: {false, true}) + for (bool isTransB: {false, true}) + for (bool hasBias: {false, true}) + for (float beta: {0.0f, 1.0f, 2.0f}) { + // nocopy sgemm with bias for beta != 0.0 is not supported + if (hasBias && beta != 0.0) + continue; + kernel_table[isTransA][isTransB][hasBias][beta_idx(beta)] = + new xbyak_gemm(isTransA, isTransB, beta, hasBias); + } + }); + + return kernel_table[isTransA][isTransB][hasBias][beta_idx(beta)]; +} + +void sgemm_nocopy_driver(const char *transa, const char *transb, int m, int n, int k, const float *alpha, - const float *a, int lda, const float *b, int ldb, const float *beta, - float *c, int ldc, const float *bias, float *ws) + const float *a, dim_t lda, const float *b, dim_t ldb, const float *beta, + float *c, dim_t ldc, const float *bias, float *ws) { bool isTransA = (*transa == 'T' || *transa == 't'); bool isTransB = (*transb == 'T' || *transb == 't'); @@ -1752,6 +1778,15 @@ void jit_avx512_common_gemm_f32::sgemm_nocopy_driver(const char *transa, return; } + assert(IMPLICATION(bias != nullptr, *beta == 0.0)); + + // XXX: this happens on every thread... + bool hasBias = (bias != nullptr); + auto ker_bn = get_xbyak_gemm(isTransA, isTransB, *beta, hasBias); + auto ker_b1 = get_xbyak_gemm(isTransA, isTransB, 1.0, false); + auto ker_b0 = get_xbyak_gemm(isTransA, isTransB, 0.0, false); + assert(ker_bn && ker_b1 && ker_b0); + int BM = 4032, BN, BK; if (mayiuse(avx512_core)) { BN = isTransA ? 384 : 64; @@ -1793,14 +1828,14 @@ void jit_avx512_common_gemm_f32::sgemm_nocopy_driver(const char *transa, } if (!isTransA) { - curA = a + Bm + (size_t)Bk * lda; + curA = a + Bm + Bk * lda; } else { - curA = a + Bk + (size_t)Bm * lda; + curA = a + Bk + Bm * lda; } if (!isTransB) { - curB = b + Bk + (size_t)Bn * ldb; + curB = b + Bk + Bn * ldb; } else { - curB = b + Bn + (size_t)Bk * ldb; + curB = b + Bn + Bk * ldb; } curC = c + Bm + (size_t)Bn * ldc; if (bias != nullptr) { @@ -1812,52 +1847,54 @@ void jit_avx512_common_gemm_f32::sgemm_nocopy_driver(const char *transa, } if (Bk == 0) { if (*beta == 0.0 && bias == nullptr) - (*ker_b0_)((long long int)sizeM, (long long int)sizeN, - (long long int)sizeK, alpha, curA, - (long long int)lda, curB, (long long int)ldb, - beta, curC, (long long int)ldc, curBias, ws); + (*ker_b0)((dim_t)sizeM, (dim_t)sizeN, (dim_t)sizeK, + alpha, curA, lda, curB, ldb, beta, curC, ldc, + curBias, ws); else - (*ker_bn_)((long long int)sizeM, (long long int)sizeN, - (long long int)sizeK, alpha, curA, - (long long int)lda, curB, (long long int)ldb, - beta, curC, (long long int)ldc, curBias, ws); + (*ker_bn)((dim_t)sizeM, (dim_t)sizeN, (dim_t)sizeK, + alpha, curA, lda, curB, ldb, beta, curC, ldc, + curBias, ws); } else { - (*ker_b1_)((long long int)sizeM, (long long int)sizeN, - (long long int)sizeK, alpha, curA, - (long long int)lda, curB, (long long int)ldb, beta, - curC, (long long int)ldc, curBias, ws); + (*ker_b1)((dim_t)sizeM, (dim_t)sizeN, (dim_t)sizeK, + alpha, curA, lda, curB, ldb, beta, curC, ldc, + curBias, ws); } } } } - return; } -void jit_avx512_common_gemm_f32::sgemm(const char *transa, const char *transb, +} + +mkldnn_status_t jit_avx512_common_gemm_f32( + const char *transa, const char *transb, const int *p_m, const int *p_n, const int *p_k, const float *p_alpha, const float *A, const int *p_lda, const float *B, const int 
*p_ldb, const float *p_beta, float *C, const int *p_ldc, const float *bias) { - if (beta_ == 0. || beta_ == 1.) - assert(*p_beta == beta_); - assert((one_of(*transa, 'T', 't') == one_of(transa_, 'T', 't'))); + using namespace mkldnn::impl::utils; + using namespace avx512_common_gemm_f32; + using namespace gemm_utils; + + if (*p_beta != 0 && bias) + return ref_gemm(transa, transb, p_m, p_n, p_k, + p_alpha, A, p_lda, B, p_lda, p_beta, C, p_ldc, bias); int nthr = (mkldnn_in_parallel()) ? 1 : mkldnn_get_max_threads(); + int m = *p_m; int n = *p_n; int k = *p_k; - int lda = *p_lda; - int ldb = *p_ldb; - int ldc = *p_ldc; + dim_t lda = *p_lda; + dim_t ldb = *p_ldb; + dim_t ldc = *p_ldc; float beta = *p_beta; int MB, NB, KB; int nthr_m, nthr_n, nthr_k, nthr_mn; - assert(nthr <= nthrs_); - // Determine threading partitioning - gemm_utils::calc_nthr_nocopy_avx512_common( + calc_nthr_nocopy_avx512_common( m, n, k, nthr, &nthr_m, &nthr_n, &nthr_k, &MB, &NB, &KB); assert(IMPLICATION(!mkldnn_thr_syncable(), nthr_k == 1)); @@ -1879,6 +1916,7 @@ void jit_avx512_common_gemm_f32::sgemm(const char *transa, const char *transb, CACHE_LINE_SIZE); ompstatus = (unsigned char volatile *) ompstatus_; assert(ompstatus); + for (int i = 0; i < nthr; i++) ompstatus[i * CACHE_LINE_SIZE] = 0; @@ -1886,14 +1924,14 @@ void jit_avx512_common_gemm_f32::sgemm(const char *transa, const char *transb, * sizeof(float), PAGE_4K); } - const size_t ws_elems_per_thr = k * 48 + 64; + const size_t ws_elems_per_thr = (size_t)k * 48 + 64; const size_t ws_size_per_thr - = utils::rnd_up(ws_elems_per_thr * sizeof(float), PAGE_4K); + = rnd_up(ws_elems_per_thr * sizeof(float), PAGE_4K); if (k > STACK_K_CAPACITY) { ws_buffers = (float *)malloc(nthr * ws_size_per_thr, PAGE_4K); } - parallel(nthr, [&](const int ithr, const int nthr) { + parallel_nd(nthr, [&](const int ithr) { int ithr_m, ithr_n, ithr_k, ithr_mn; int m_from, m_to, myM; int n_from, n_to, myN; @@ -1903,7 +1941,9 @@ void jit_avx512_common_gemm_f32::sgemm(const char *transa, const char *transb, float *myC = C, myBeta; float *ws = ws_buffers ? 
ws_buffers + ithr * ws_size_per_thr / sizeof(float) : 0; - int ld = ldc; + dim_t ld = ldc; + + int sum_later = (mkldnn_get_num_threads() < nthr_m * nthr_n * nthr_k); if (ithr < nthr_m * nthr_n * nthr_k) { @@ -1955,10 +1995,10 @@ void jit_avx512_common_gemm_f32::sgemm(const char *transa, const char *transb, myC = &(C[m_from + n_from * ldc]); myBeta = beta; ld = ldc; - if (hasBias_) + if (bias) myBias = &(bias[m_from]); } else { - myC = c_buffers + MB * NB * (cbase + ithr_k - 1); + myC = c_buffers + (dim_t)MB * NB * (cbase + ithr_k - 1); myBeta = 0.0; ld = MB; myBias = nullptr; @@ -1967,40 +2007,40 @@ void jit_avx512_common_gemm_f32::sgemm(const char *transa, const char *transb, sgemm_nocopy_driver(transa, transb, myM, myN, myK, p_alpha, myA, lda, myB, ldb, &myBeta, myC, ld, myBias, ws); - if (nthr_k > 1) + if (nthr_k > 1 && !sum_later) ompstatus[(ibase + ithr_k) * CACHE_LINE_SIZE] = 1; } - if (nthr_k > 1) { + if (nthr_k > 1 && !sum_later) { // sum matrices partitioned along K dimension int n1, n2; - gemm_utils::partition_unit_diff(ithr_k, nthr_k, myN, &n1, &n2); + partition_unit_diff(ithr_k, nthr_k, myN, &n1, &n2); if (ithr_k > 0) { - myC = c_buffers + MB * NB * (cbase + ithr_k - 1); - myC = myC + n1 * MB; + myC = c_buffers + (dim_t)MB * NB * (cbase + ithr_k - 1) + + (dim_t)n1 * MB; /* need to wait until main thread finishes */ while (ompstatus[ibase * CACHE_LINE_SIZE] != 1) { }; /* my cache is hot */ - gemm_utils::sum_two_matrices(myM, n2, myC, MB, + sum_two_matrices(myM, n2, myC, MB, &C[m_from + (n_from + n1) * ldc], ldc); } for (int ik = 1; ik < nthr_k; ++ik) { if (ik != ithr_k) { - myC = c_buffers + MB * NB * (cbase + ik - 1); - myC = myC + n1 * MB; + myC = c_buffers + (dim_t)MB * NB * (cbase + ik - 1) + + (dim_t)n1 * MB; while (ompstatus[(ibase + ik) * CACHE_LINE_SIZE] != 1) { }; - gemm_utils::sum_two_matrices(myM, n2, myC, MB, + sum_two_matrices(myM, n2, myC, MB, &C[m_from + (n_from + n1) * ldc], ldc); } } @@ -2008,44 +2048,82 @@ void jit_avx512_common_gemm_f32::sgemm(const char *transa, const char *transb, } }); + + // handle C summation later + if (nthr_k > 1 && ompstatus[0] == 0) { + + parallel_nd(nthr, [&](const int ithr) { + int ithr_m, ithr_n, ithr_k, ithr_mn; + int m_from, m_to, myM; + int n_from, n_to, myN; + int cbase; + float *myC = C; + + if (ithr < nthr_m * nthr_n * nthr_k) { + + ithr_mn = ithr % nthr_mn; + ithr_m = ithr_mn % nthr_m; + ithr_n = ithr_mn / nthr_m; + ithr_k = ithr / nthr_mn; + + /* swap ithr_k for performance improvement */ + if (ithr_k == 0) + ithr_k = nthr_k - 1; + else if (ithr_k == nthr_k - 1) + ithr_k = 0; + + m_from = MB * (ithr_m); + m_to = MB * (ithr_m + 1); + if (m_to > m) + m_to = m; + myM = m_to - m_from; + + n_from = NB * (ithr_n); + n_to = NB * (ithr_n + 1); + if (n_to > n) + n_to = n; + myN = n_to - n_from; + + cbase = (ithr_m + nthr_m * ithr_n) * (nthr_k - 1); + + if (nthr_k > 1) { + // sum matrices partitioned along K dimension + int n1, n2; + + partition_unit_diff(ithr_k, nthr_k, myN, &n1, &n2); + + if (ithr_k > 0) { + + myC = c_buffers + (dim_t)MB * NB * (cbase + ithr_k - 1) + + (dim_t)n1 * MB; + + /* my cache is hot */ + sum_two_matrices(myM, n2, myC, MB, + &C[m_from + (n_from + n1) * ldc], ldc); + } + + for (int ik = 1; ik < nthr_k; ++ik) { + if (ik != ithr_k) { + + myC = c_buffers + (dim_t)MB * NB * (cbase + ik - 1) + + (dim_t)n1 * MB; + + sum_two_matrices(myM, n2, myC, MB, + &C[m_from + (n_from + n1) * ldc], ldc); + } + } + } + } + }); + } + free(c_buffers); free(ompstatus_); free(ws_buffers); -} 
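The constructor and destructor removed below used to build and own up to three xbyak_gemm kernels per jit_avx512_common_gemm_f32 object; the free function above instead draws from the process-wide table that get_xbyak_gemm() fills lazily. The following is a minimal, self-contained sketch of that std::call_once memoization pattern, with hypothetical simplified names (kernel_t stands in for xbyak_gemm):

    #include <initializer_list>
    #include <mutex>

    struct kernel_t {
        // Stand-in for the jitted GEMM kernel variant.
        kernel_t(bool ta, bool tb, float beta, bool bias) { /* jit code here */ }
    };

    // beta collapses into three classes: 0, 1, and "other".
    static int beta_idx(float beta)
    { return beta == 0.f ? 0 : (beta == 1.f ? 1 : 2); }

    const kernel_t *get_kernel(bool isTransA, bool isTransB, float beta,
            bool hasBias) {
        // [isTransA][isTransB][hasBias][beta class]; generated exactly once,
        // then read lock-free by all subsequent callers.
        static kernel_t *table[2][2][2][3];
        static std::once_flag once;
        std::call_once(once, [] {
            for (int a : {0, 1})
            for (int b : {0, 1})
            for (int h : {0, 1})
            for (float bt : {0.0f, 1.0f, 2.0f}) {
                if (h && bt != 0.0f) continue; // bias only with beta == 0
                table[a][b][h][beta_idx(bt)] = new kernel_t(a, b, bt, h);
            }
        });
        return table[isTransA][isTransB][hasBias][beta_idx(beta)];
    }

Generating every variant up front trades first-call latency for a lock-free lookup afterwards; as the "XXX: this happens on every thread" comment in sgemm_nocopy_driver notes, the lookups themselves are still repeated per thread, which is cheap next to re-jitting but not free.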
-jit_avx512_common_gemm_f32::jit_avx512_common_gemm_f32( - char transa, char transb, float beta, bool hasBias) -{ - transa_ = transa; - transb_ = transb; - beta_ = beta; - hasBias_ = hasBias; - if (hasBias) { - assert(beta == 0.0); - } - ker_bn_ = new xbyak_gemm(transa, transb, beta, hasBias); - if (beta != 1.0) { - ker_b1_ = new xbyak_gemm(transa, transb, 1.0); - } else { - ker_b1_ = ker_bn_; - } - if (beta != 0.0 || (beta == 0.0 && hasBias)) { - ker_b0_ = new xbyak_gemm(transa, transb, 0.0); - } else { - ker_b0_ = ker_bn_; - } - - nthrs_ = mkldnn_get_max_threads(); + return mkldnn_success; } -jit_avx512_common_gemm_f32::~jit_avx512_common_gemm_f32() -{ - delete ker_bn_; - if (beta_ != 1.0) - delete ker_b1_; - if (beta_ != 0.0 || (beta_ == 0.0 && hasBias_)) - delete ker_b0_; -} } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/jit_avx512_common_gemm_f32.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/jit_avx512_common_gemm_f32.hpp new file mode 100644 index 0000000..d581b7f --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/jit_avx512_common_gemm_f32.hpp @@ -0,0 +1,36 @@ +/******************************************************************************* +* Copyright 2017-2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef JIT_AVX512_COMMON_GEMM_F32_HPP +#define JIT_AVX512_COMMON_GEMM_F32_HPP + +#include "mkldnn_types.h" + +namespace mkldnn { +namespace impl { +namespace cpu { + +mkldnn_status_t jit_avx512_common_gemm_f32( + const char *transa, const char *transb, const int *M, + const int *N, const int *K, const float *alpha, const float *A, + const int *lda, const float *B, const int *ldb, const float *beta, + float *C, const int *ldc, const float *bias = nullptr); + +} +} +} + +#endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/jit_avx_gemm_f32.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/jit_avx_gemm_f32.cpp similarity index 93% rename from inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/jit_avx_gemm_f32.cpp rename to inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/jit_avx_gemm_f32.cpp index 354fa0b..60d4220 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/jit_avx_gemm_f32.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/jit_avx_gemm_f32.cpp @@ -14,23 +14,24 @@ * limitations under the License. 
*******************************************************************************/ -#include +#include +#include #include "mkldnn_thread.hpp" #include "utils.hpp" -#include "gemm_utils.hpp" + +#include "ref_gemm_f32.hpp" +#include "gemm_utils_f32.hpp" #include "jit_avx_gemm_f32.hpp" -#define CACHE_LINE_SIZE 64 +#include "jit_generator.hpp" namespace mkldnn { namespace impl { namespace cpu { -using namespace mkldnn::impl::memory_format; -using namespace mkldnn::impl::utils; +#define CACHE_LINE_SIZE 64 -using namespace Xbyak; #define STACKSIZE get_size_of_abi_save_regs() #if _WIN32 #define STACK_K_CAPACITY 128 @@ -42,22 +43,25 @@ using namespace Xbyak; #define BASE_SHIFT 2 #define SECOND_FETCH 14 -struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator { +namespace avx_gemm_f32 { +using namespace gemm_utils; + +struct xbyak_gemm : public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_gemm_f32_xbyak_gemm) - xbyak_gemm(char transa, char transb, float beta, bool hasBias = false, + xbyak_gemm(char isTransA, char isTransB, float beta, bool hasBias = false, void *code_ptr = nullptr, size_t code_size = 80 * Xbyak::DEFAULT_MAX_CODE_SIZE) : jit_generator(code_ptr, code_size) { + using namespace Xbyak; + const bool is_avx2 = mayiuse(avx2); assert(IMPLICATION(!is_avx2, mayiuse(avx))); const int UNROLL_M = is_avx2 ? 16 : 8; const int UNROLL_N = 6; - bool isTransA = (transa == 'T' || transa == 't'); - bool isTransB = (transb == 'T' || transb == 't'); bool isBeta0 = (beta == 0.0); bool isBetaN = (!isBeta0 && beta != 1.0); @@ -2275,38 +2279,60 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator { L(main999); // Restore original stack - mov(rax, ORIG_SP); - mov(rsp, rax); + mov(rsp, ORIG_SP); vzeroupper(); postamble(); - ker_ = reinterpret_cast( - const_cast(this->getCode())); + ker_ = this->getCode(); } - void operator()(long long int m, long long int n, long long int k, - const float *alpha, const float *a, long long int lda, - const float *b, long long int ldb, const float *beta, float *c, - long long int ldc, const float *bias, float *ws) + typedef void (*ker_t)(dim_t m, dim_t n, dim_t k, + const float *alpha, const float *a, dim_t lda, + const float *b, dim_t ldb, const float *beta, float *c, + dim_t ldc, const float *bias, float *ws); + + void operator()(dim_t m, dim_t n, dim_t k, + const float *alpha, const float *a, dim_t lda, + const float *b, dim_t ldb, const float *beta, float *c, + dim_t ldc, const float *bias, float *ws) const { - (*ker_)(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, bias, ws); + ker_(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, bias, ws); } private: - void (*ker_)(long long int m, long long int n, long long int k, - const float *alpha, const float *a, long long int lda, - const float *b, long long int ldb, const float *beta, float *c, - long long int ldc, const float *bias, float *ws); + ker_t ker_; }; -typedef void (*ker)(long long int, long long int, long long int, float *, - float *, long long int, float *, long long int, float *, float *, - long long int, float *); -void jit_avx_gemm_f32::sgemm_nocopy_driver(const char *transa, +const xbyak_gemm *get_xbyak_gemm( + bool isTransA, bool isTransB, float beta, bool hasBias) { + auto beta_idx = [](float beta) { + return (beta == 0.0) ? 0 : (beta == 1.0 ? 
1 : 2); + }; + + // Kernel table [isTransA][isTransB][hasBias][beta (0, 1, other)] + static xbyak_gemm *kernel_table[2][2][2][3]; + static std::once_flag initialized; + std::call_once(initialized, [=]{ + for (bool isTransA: {false, true}) + for (bool isTransB: {false, true}) + for (bool hasBias: {false, true}) + for (float beta: {0.0f, 1.0f, 2.0f}) { + // nocopy sgemm with bias for beta != 0.0 is not supported + if (hasBias && beta != 0.0) + continue; + kernel_table[isTransA][isTransB][hasBias][beta_idx(beta)] = + new xbyak_gemm(isTransA, isTransB, beta, hasBias); + } + }); + + return kernel_table[isTransA][isTransB][hasBias][beta_idx(beta)]; +} + +void sgemm_nocopy_driver(const char *transa, const char *transb, int m, int n, int k, const float *alpha, - const float *a, int lda, const float *b, int ldb, const float *beta, - float *c, int ldc, const float *bias, float *ws) + const float *a, dim_t lda, const float *b, dim_t ldb, const float *beta, + float *c, dim_t ldc, const float *bias, float *ws) { bool isTransA = (*transa == 'T' || *transa == 't'); bool isTransB = (*transb == 'T' || *transb == 't'); @@ -2333,6 +2359,15 @@ void jit_avx_gemm_f32::sgemm_nocopy_driver(const char *transa, return; } + assert(IMPLICATION(bias != nullptr, *beta == 0.0)); + + // XXX: this happens on every thread... + bool hasBias = (bias != nullptr); + auto ker_bn = get_xbyak_gemm(isTransA, isTransB, *beta, hasBias); + auto ker_b1 = get_xbyak_gemm(isTransA, isTransB, 1.0, false); + auto ker_b0 = get_xbyak_gemm(isTransA, isTransB, 0.0, false); + assert(ker_bn && ker_b1 && ker_b0); + int BM = 4032; int BN = isTransA ? 96 : 48; int BK = isTransB ? 96 : 256; @@ -2367,14 +2402,14 @@ void jit_avx_gemm_f32::sgemm_nocopy_driver(const char *transa, } if (!isTransA) { - curA = a + Bm + (size_t)Bk * lda; + curA = a + Bm + Bk * lda; } else { - curA = a + Bk + (size_t)Bm * lda; + curA = a + Bk + Bm * lda; } if (!isTransB) { - curB = b + Bk + (size_t)Bn * ldb; + curB = b + Bk + Bn * ldb; } else { - curB = b + Bn + (size_t)Bk * ldb; + curB = b + Bn + Bk * ldb; } curC = c + Bm + (size_t)Bn * ldc; if (bias != nullptr) { @@ -2386,51 +2421,54 @@ void jit_avx_gemm_f32::sgemm_nocopy_driver(const char *transa, } if (Bk == 0) { if (*beta == 0.0 && bias == nullptr) - (*ker_b0_)((long long int)sizeM, (long long int)sizeN, - (long long int)sizeK, alpha, curA, - (long long int)lda, curB, (long long int)ldb, - beta, curC, (long long int)ldc, curBias, ws); + (*ker_b0)((dim_t)sizeM, (dim_t)sizeN, (dim_t)sizeK, + alpha, curA, lda, curB, ldb, beta, curC, ldc, + curBias, ws); else - (*ker_bn_)((long long int)sizeM, (long long int)sizeN, - (long long int)sizeK, alpha, curA, - (long long int)lda, curB, (long long int)ldb, - beta, curC, (long long int)ldc, curBias, ws); + (*ker_bn)((dim_t)sizeM, (dim_t)sizeN, (dim_t)sizeK, + alpha, curA, lda, curB, ldb, beta, curC, ldc, + curBias, ws); } else { - (*ker_b1_)((long long int)sizeM, (long long int)sizeN, - (long long int)sizeK, alpha, curA, - (long long int)lda, curB, (long long int)ldb, beta, - curC, (long long int)ldc, curBias, ws); + (*ker_b1)((dim_t)sizeM, (dim_t)sizeN, (dim_t)sizeK, + alpha, curA, lda, curB, ldb, beta, curC, ldc, + curBias, ws); } } } } - return; } -void jit_avx_gemm_f32::sgemm(const char *transa, const char *transb, + +} + +mkldnn_status_t jit_avx_gemm_f32( + const char *transa, const char *transb, const int *p_m, const int *p_n, const int *p_k, const float *p_alpha, const float *A, const int *p_lda, const float *B, const int *p_ldb, const float *p_beta, float *C, const int 
*p_ldc, const float *bias) { - if (beta_ == 0. || beta_ == 1.) - assert(*p_beta == beta_); - assert((one_of(*transa, 'T', 't') == one_of(transa_, 'T', 't'))); + using namespace mkldnn::impl::utils; + using namespace avx_gemm_f32; + using namespace gemm_utils; + + if (*p_beta != 0 && bias) + return ref_gemm(transa, transb, p_m, p_n, p_k, + p_alpha, A, p_lda, B, p_lda, p_beta, C, p_ldc, bias); + + int nthr = (mkldnn_in_parallel()) ? 1 : mkldnn_get_max_threads(); - int nthr = mkldnn_in_parallel() ? 1 : mkldnn_get_max_threads(); int m = *p_m; int n = *p_n; int k = *p_k; - int lda = *p_lda; - int ldb = *p_ldb; - int ldc = *p_ldc; + dim_t lda = *p_lda; + dim_t ldb = *p_ldb; + dim_t ldc = *p_ldc; float beta = *p_beta; int MB, NB, KB; int nthr_m, nthr_n, nthr_k, nthr_mn; - assert(nthr <= nthrs_); - // Determine threading partitioning - gemm_utils::calc_nthr_nocopy_avx( + calc_nthr_nocopy_avx( m, n, k, nthr, &nthr_m, &nthr_n, &nthr_k, &MB, &NB, &KB); assert(IMPLICATION(!mkldnn_thr_syncable(), nthr_k == 1)); @@ -2460,14 +2498,14 @@ void jit_avx_gemm_f32::sgemm(const char *transa, const char *transb, * sizeof(float), PAGE_4K); } - const size_t ws_elems_per_thr = k * 16 + 64; + const size_t ws_elems_per_thr = (size_t)k * 16 + 64; const size_t ws_size_per_thr - = utils::rnd_up(ws_elems_per_thr * sizeof(float), PAGE_4K); + = rnd_up(ws_elems_per_thr * sizeof(float), PAGE_4K); if (k > STACK_K_CAPACITY) { ws_buffers = (float *)malloc(nthr * ws_size_per_thr, PAGE_4K); } - parallel(nthr, [&](const int ithr, const int nthr) { + parallel_nd(nthr, [&](const int ithr) { int ithr_m, ithr_n, ithr_k, ithr_mn; int m_from, m_to, myM; int n_from, n_to, myN; @@ -2477,7 +2515,9 @@ void jit_avx_gemm_f32::sgemm(const char *transa, const char *transb, float *myC = C, myBeta; float *ws = ws_buffers ? 
ws_buffers + ithr * ws_size_per_thr / sizeof(float) : 0; - int ld = ldc; + dim_t ld = ldc; + + int sum_later = (mkldnn_get_num_threads() < nthr_m * nthr_n * nthr_k); if (ithr < nthr_m * nthr_n * nthr_k) { @@ -2529,10 +2569,10 @@ void jit_avx_gemm_f32::sgemm(const char *transa, const char *transb, myC = &(C[m_from + n_from * ldc]); myBeta = beta; ld = ldc; - if (hasBias_) + if (bias) myBias = &(bias[m_from]); } else { - myC = c_buffers + MB * NB * (cbase + ithr_k - 1); + myC = c_buffers + (dim_t)MB * NB * (cbase + ithr_k - 1); myBeta = 0.0; ld = MB; myBias = nullptr; @@ -2541,40 +2581,40 @@ void jit_avx_gemm_f32::sgemm(const char *transa, const char *transb, sgemm_nocopy_driver(transa, transb, myM, myN, myK, p_alpha, myA, lda, myB, ldb, &myBeta, myC, ld, myBias, ws); - if (nthr_k > 1) + if (nthr_k > 1 && !sum_later) ompstatus[(ibase + ithr_k) * CACHE_LINE_SIZE] = 1; } - if (nthr_k > 1) { + if (nthr_k > 1 && !sum_later) { // sum matrices partitioned along K dimension int n1, n2; - gemm_utils::partition_unit_diff(ithr_k, nthr_k, myN, &n1, &n2); + partition_unit_diff(ithr_k, nthr_k, myN, &n1, &n2); if (ithr_k > 0) { - myC = c_buffers + MB * NB * (cbase + ithr_k - 1); - myC = myC + n1 * MB; + myC = c_buffers + (dim_t)MB * NB * (cbase + ithr_k - 1) + + (dim_t)n1 * MB; /* need to wait until main thread finishes */ while (ompstatus[ibase * CACHE_LINE_SIZE] != 1) { }; /* my cache is hot */ - gemm_utils::sum_two_matrices(myM, n2, myC, MB, + sum_two_matrices(myM, n2, myC, MB, &C[m_from + (n_from + n1) * ldc], ldc); } for (int ik = 1; ik < nthr_k; ++ik) { if (ik != ithr_k) { - myC = c_buffers + MB * NB * (cbase + ik - 1); - myC = myC + n1 * MB; + myC = c_buffers + (dim_t)MB * NB * (cbase + ik - 1) + + (dim_t)n1 * MB; while (ompstatus[(ibase + ik) * CACHE_LINE_SIZE] != 1) { }; - gemm_utils::sum_two_matrices(myM, n2, myC, MB, + sum_two_matrices(myM, n2, myC, MB, &C[m_from + (n_from + n1) * ldc], ldc); } } @@ -2582,42 +2622,80 @@ void jit_avx_gemm_f32::sgemm(const char *transa, const char *transb, } }); + // handle C summation later + if (nthr_k > 1 && ompstatus[0] == 0) { + + parallel_nd(nthr, [&](const int ithr) { + int ithr_m, ithr_n, ithr_k, ithr_mn; + int m_from, m_to, myM; + int n_from, n_to, myN; + int cbase; + float *myC = C; + + if (ithr < nthr_m * nthr_n * nthr_k) { + + ithr_mn = ithr % nthr_mn; + ithr_m = ithr_mn % nthr_m; + ithr_n = ithr_mn / nthr_m; + ithr_k = ithr / nthr_mn; + + /* swap ithr_k for performance improvement */ + if (ithr_k == 0) + ithr_k = nthr_k - 1; + else if (ithr_k == nthr_k - 1) + ithr_k = 0; + + m_from = MB * (ithr_m); + m_to = MB * (ithr_m + 1); + if (m_to > m) + m_to = m; + myM = m_to - m_from; + + n_from = NB * (ithr_n); + n_to = NB * (ithr_n + 1); + if (n_to > n) + n_to = n; + myN = n_to - n_from; + + cbase = (ithr_m + nthr_m * ithr_n) * (nthr_k - 1); + + if (nthr_k > 1) { + // sum matrices partitioned along K dimension + int n1, n2; + + partition_unit_diff(ithr_k, nthr_k, myN, &n1, &n2); + + if (ithr_k > 0) { + + myC = c_buffers + (dim_t)MB * NB * (cbase + ithr_k - 1) + + (dim_t)n1 * MB; + + /* my cache is hot */ + sum_two_matrices(myM, n2, myC, MB, + &C[m_from + (n_from + n1) * ldc], ldc); + } + + for (int ik = 1; ik < nthr_k; ++ik) { + if (ik != ithr_k) { + + myC = c_buffers + (dim_t)MB * NB * (cbase + ik - 1) + + (dim_t)n1 * MB; + + sum_two_matrices(myM, n2, myC, MB, + &C[m_from + (n_from + n1) * ldc], ldc); + } + } + } + } + }); + } + + free(c_buffers); free(ompstatus_); free(ws_buffers); -} - -jit_avx_gemm_f32::jit_avx_gemm_f32( - char transa, char transb, 
float beta, bool hasBias) -{ - transa_ = transa; - transb_ = transb; - beta_ = beta; - hasBias_ = hasBias; - if (hasBias) { - assert(beta == 0.0); - } - ker_bn_ = new xbyak_gemm(transa, transb, beta, hasBias); - if (beta != 1.0) { - ker_b1_ = new xbyak_gemm(transa, transb, 1.0); - } else { - ker_b1_ = ker_bn_; - } - if (beta != 0.0 || (beta == 0.0 && hasBias)) { - ker_b0_ = new xbyak_gemm(transa, transb, 0.0); - } else { - ker_b0_ = ker_bn_; - } - nthrs_ = mkldnn_get_max_threads(); -} -jit_avx_gemm_f32::~jit_avx_gemm_f32() -{ - delete ker_bn_; - if (beta_ != 1.0) - delete ker_b1_; - if (beta_ != 0.0 || (beta_ == 0.0 && hasBias_)) - delete ker_b0_; + return mkldnn_success; } } diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_relu_forward_s16s16s32.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/jit_avx_gemm_f32.hpp similarity index 64% rename from inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_relu_forward_s16s16s32.cpp rename to inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/jit_avx_gemm_f32.hpp index a7c720c..aabf520 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_relu_forward_s16s16s32.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/jit_avx_gemm_f32.hpp @@ -14,23 +14,24 @@ * limitations under the License. *******************************************************************************/ -#include -#include "mkldnn_test_common.hpp" -#include "gtest/gtest.h" +#ifndef JIT_AVX_GEMM_F32_HPP +#define JIT_AVX_GEMM_F32_HPP + +#include "mkldnn_types.h" -#include "mkldnn.hpp" -#include "test_convolution_relu_forward_common.hpp" namespace mkldnn { +namespace impl { +namespace cpu { -using convolution_test = convolution_relu_test; +mkldnn_status_t jit_avx_gemm_f32( + const char *transa, const char *transb, const int *M, + const int *N, const int *K, const float *alpha, const float *A, + const int *lda, const float *B, const int *ldb, const float *beta, + float *C, const int *ldc, const float *bias = nullptr); -TEST_P(convolution_test, TestConvolution) -{ -} -#define S16S16S32 -#define DIRECTION_FORWARD -#include "convolution_common.h" +} +} +} -} \ No newline at end of file +#endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/ref_gemm.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/ref_gemm_f32.cpp similarity index 80% rename from inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/ref_gemm.cpp rename to inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/ref_gemm_f32.cpp index e0331e0..5147885 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/ref_gemm.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/ref_gemm_f32.cpp @@ -14,13 +14,16 @@ * limitations under the License. 
*******************************************************************************/ +#include "mkldnn_types.h" + #include "mkldnn_thread.hpp" #include "nstl.hpp" #include "utils.hpp" -#include "../jit_generator.hpp" +#include "jit_generator.hpp" -#include "gemm_utils.hpp" +#include "gemm_utils_f32.hpp" +#include "ref_gemm_f32.hpp" namespace mkldnn { namespace impl { @@ -29,13 +32,14 @@ namespace cpu { using namespace mkldnn::impl::utils; using namespace gemm_utils; +namespace { template -static void copy_A( - bool isTransA, int K, const data_t *A, const int lda, data_t *ws) { +void copy_A( + bool isTransA, int K, const data_t *A, const dim_t lda, data_t *ws) { for (int k = 0; k < K; k++) { PRAGMA_OMP_SIMD() - for (int i = 0; i < gemm_utils::unroll_factor::m; i++) { + for (int i = 0; i < unroll_factor::m; i++) { ws[i] = isTransA ? A[i * lda + k] : A[i + k * lda]; } ws += unroll_factor::m; @@ -43,8 +47,8 @@ static void copy_A( } template -static void kernel_mxn(int K, const data_t *A, const int lda, - const data_t *B, const int ldb, data_t *C, const int ldc, +void kernel_mxn(int K, const data_t *A, const dim_t lda, + const data_t *B, const dim_t ldb, data_t *C, const dim_t ldc, const data_t alpha, const data_t beta) { data_t c[unroll_factor::m * unroll_factor::n] = { static_cast(0.) }; @@ -70,9 +74,9 @@ static void kernel_mxn(int K, const data_t *A, const int lda, } template -static void block_ker(const int M, const int N, const int K, - const data_t *A, const int lda, const data_t *B, const int ldb, - data_t *C, const int ldc, const data_t alpha, const data_t beta, +void block_ker(const int M, const int N, const int K, + const data_t *A, const dim_t lda, const data_t *B, const dim_t ldb, + data_t *C, const dim_t ldc, const data_t alpha, const data_t beta, data_t *ws, bool do_copy) { int Nu = rnd_dn(N, unroll_factor::n); int Mu = rnd_dn(M, unroll_factor::m); @@ -124,8 +128,9 @@ static void block_ker(const int M, const int N, const int K, template void gemm_ithr(const int M, const int N, const int K, const data_t alpha, - const data_t *A, const int lda, const data_t *B, const int ldb, - const data_t beta, data_t *C, const int ldc, bool do_copy, data_t *ws) { + const data_t *A, const dim_t lda, const data_t *B, const dim_t ldb, + const data_t beta, data_t *C, const dim_t ldc, bool do_copy, + data_t *ws) { constexpr int BM = gemm_traits::BM; constexpr int BN = gemm_traits::BN; constexpr int BK = gemm_traits::BK; @@ -138,12 +143,12 @@ void gemm_ithr(const int M, const int N, const int K, const data_t alpha, return; if ((K <= 0) || (alpha == static_cast(0))) { - ptrdiff_t MN = (ptrdiff_t)N * M; + dim_t MN = N * M; if (beta == static_cast(0.)) { - for (ptrdiff_t j = 0; j < MN; j++) + for (dim_t j = 0; j < MN; j++) C[j] = static_cast(0.); } else if (beta != static_cast(1.)) { - for (ptrdiff_t j = 0; j < MN; j++) + for (dim_t j = 0; j < MN; j++) C[j] *= beta; } return; @@ -171,21 +176,26 @@ void gemm_ithr(const int M, const int N, const int K, const data_t alpha, } } +} + template -void ref_gemm(const char *transa_, const char *transb_, const int *M_, +mkldnn_status_t ref_gemm( + const char *transa_, const char *transb_, const int *M_, const int *N_, const int *K_, const data_t *alpha_, const data_t *A, const int *lda_, const data_t *B, const int *ldb_, const data_t *beta_, data_t *C, const int *ldc_, const data_t *bias) { + bool isTransA = (*transa_ == 'T' || *transa_ == 't'); bool isTransB = (*transb_ == 'T' || *transb_ == 't'); - const int M = *M_, N = *N_, K = *K_, lda = *lda_, ldb = *ldb_, ldc = 
*ldc_; + const int M = *M_, N = *N_, K = *K_; + const dim_t lda = *lda_, ldb = *ldb_, ldc = *ldc_; const data_t alpha = *alpha_, beta = *beta_; int max_nthr = mkldnn_in_parallel() ? 1 : mkldnn_get_max_threads(); int nthr_m, nthr_n, nthr_k; int MB, NB, KB; // thread balancing over M, N, K & size of blocking dimensions - gemm_utils::calc_nthr_nocopy_avx( + calc_nthr_nocopy_avx( M, N, K, max_nthr, &nthr_m, &nthr_n, &nthr_k, &MB, &NB, &KB); assert(IMPLICATION(!mkldnn_thr_syncable(), nthr_k == 1)); @@ -205,14 +215,23 @@ void ref_gemm(const char *transa_, const char *transb_, const int *M_, const int nthr = nthr_mn * nthr_k; const size_t ws_elems_per_thr = K * unroll_factor::m; const size_t ws_size_per_thr - = utils::rnd_up(ws_elems_per_thr * sizeof(data_t), PAGE_4K); + = rnd_up(ws_elems_per_thr * sizeof(data_t), PAGE_4K); if (do_copy) { ws_buffers = (data_t*)malloc(nthr * ws_size_per_thr, PAGE_4K); if (!ws_buffers) do_copy = false; } - parallel(nthr, [&](const int ithr, const int nthr) { + auto get_thr_block = [&](int &from, int &to, int &myN, int NB, int N, + int ithr) { + from = NB * (ithr); + to = NB * (ithr + 1); + if (to > N) + to = N; + myN = to - from; + }; + + parallel_nd(nthr, [&](const int ithr) { int ithr_mn = ithr % nthr_mn; int ithr_m = ithr_mn % nthr_m; int ithr_n = ithr_mn / nthr_m; @@ -226,27 +245,20 @@ void ref_gemm(const char *transa_, const char *transb_, const int *M_, int m_from = 0, m_to = 0, myM = 0, n_from = 0, n_to = 0, myN = 0, k_from = 0, k_to = 0, myK = 0; - auto get_thr_block = [&](int &from, int &to, int &myN, int NB, int N, - int ithr) { - from = NB * (ithr); - to = NB * (ithr + 1); - if (to > N) - to = N; - myN = to - from; - }; + get_thr_block(m_from, m_to, myM, MB, M, ithr_m); get_thr_block(n_from, n_to, myN, NB, N, ithr_n); get_thr_block(k_from, k_to, myK, KB, K, ithr_k); if (myM > 0 && myN > 0) { data_t myBeta, *myC; - int ld; + dim_t ld; if (ithr_k == 0) { myC = &(C[m_from + n_from * ldc]); myBeta = beta; ld = ldc; } else { - myC = c_buffers + MB * NB * (cbase + ithr_k - 1); + myC = c_buffers + (dim_t)MB * NB * (cbase + ithr_k - 1); myBeta = 0.0f; ld = MB; } @@ -275,23 +287,36 @@ void ref_gemm(const char *transa_, const char *transb_, const int *M_, } } } + }); - if (nthr_k > 1) { - assert(mkldnn_thr_syncable()); - mkldnn_thr_barrier(); + if (nthr_k > 1) { + parallel_nd(nthr, [&](const int ithr) { + int ithr_mn = ithr % nthr_mn; + int ithr_m = ithr_mn % nthr_m; + int ithr_k = ithr / nthr_mn; + int ithr_n = ithr_mn / nthr_m; + + int n_from = 0, n_to = 0, myN = 0; + int m_from = 0, m_to = 0, myM = 0; + + int cbase = (ithr_m + nthr_m * ithr_n) * (nthr_k - 1); + + get_thr_block(n_from, n_to, myN, NB, N, ithr_n); + get_thr_block(m_from, m_to, myM, MB, M, ithr_m); // sum matrices partitioned along K dimension int offset = 0, block = 0; gemm_utils::partition_unit_diff(ithr_k, nthr_k, myN, &offset, &block); for (int ik = 1; ik < nthr_k; ++ik) { - data_t *myC = c_buffers + MB * (NB * (cbase + ik - 1) + offset); + data_t *myC = c_buffers + + MB * ((dim_t)NB * (cbase + ik - 1) + offset); gemm_utils::sum_two_matrices(myM, block, myC, MB, &C[m_from + (n_from + offset) * ldc], ldc); } - } - }); + }); + } if (bias) { parallel_nd(N, M, [&](int i, int j) { @@ -301,14 +326,18 @@ void ref_gemm(const char *transa_, const char *transb_, const int *M_, free(ws_buffers); free(c_buffers); + + return mkldnn_success; } -template void ref_gemm(const char *transa_, const char *transb_, +template mkldnn_status_t ref_gemm( + const char *transa_, const char *transb_, const int *M_, const 
int *N_, const int *K_, const float *alpha_, const float *A, const int *lda_, const float *B, const int *ldb_, const float *beta_, float *C, const int *ldc_, const float *bias); -template void ref_gemm(const char *transa_, const char *transb_, +template mkldnn_status_t ref_gemm( + const char *transa_, const char *transb_, const int *M_, const int *N_, const int *K_, const double *alpha_, const double *A, const int *lda_, const double *B, const int *ldb_, const double *beta_, double *C, const int *ldc_, const double *bias); diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/ref_gemm_f32.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/ref_gemm_f32.hpp new file mode 100644 index 0000000..7c90ba6 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/f32/ref_gemm_f32.hpp @@ -0,0 +1,36 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef REF_GEMM_F32_HPP +#define REF_GEMM_F32_HPP + +#include "mkldnn_types.h" + +namespace mkldnn { +namespace impl { +namespace cpu { + +template +mkldnn_status_t ref_gemm(const char *transa, const char *transb, const int *M, + const int *N, const int *K, const data_t *alpha, const data_t *A, + const int *lda, const data_t *B, const int *ldb, const data_t *beta, + data_t *C, const int *ldc, const data_t *bias); + +} +} +} + +#endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/gemm.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/gemm.cpp index 146e688..ac619b1 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/gemm.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/gemm.cpp @@ -13,20 +13,25 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*******************************************************************************/ -#include #include "mkldnn.h" -#include "verbose.hpp" +#include "mkldnn_traits.hpp" +#include "nstl.hpp" + +#include "jit_generator.hpp" -#include "jit_avx_gemm_f32.hpp" -#include "jit_avx512_common_gemm_f32.hpp" #include "gemm.hpp" -#include "../jit_generator.hpp" -#include "nstl.hpp" + +#include "f32/jit_avx512_common_gemm_f32.hpp" +#include "f32/jit_avx_gemm_f32.hpp" +#include "f32/ref_gemm_f32.hpp" + +#include "s8x8s32/jit_avx512_core_gemm_s8u8s32.hpp" +#include "s8x8s32/jit_avx512_core_gemm_s8s8s32.hpp" +#include "s8x8s32/ref_gemm_s8x8s32.hpp" + #include "os_blas.hpp" -#include "math_utils.hpp" -#include "mkldnn_traits.hpp" /* USE_MKL USE_CBLAS effect * ------- --------- ------ @@ -39,15 +44,15 @@ namespace mkldnn { namespace impl { namespace cpu { -using namespace mkldnn::impl::status; + mkldnn_status_t check_gemm_input(const char *transa, const char *transb, const int *M, const int *N, const int *K, const int *lda, const int *ldb, const int *ldc, const float *alpha, const float *beta, const bool with_bias) { if (utils::any_null(transa, transb, M, N, K, lda, ldb, ldc, alpha, beta)) - return invalid_arguments; + return mkldnn_invalid_arguments; if (with_bias && *beta != 0) - return unimplemented; + return mkldnn_unimplemented; bool consistency = true && utils::one_of(*transa, 'T', 't', 'N', 'n') && utils::one_of(*transb, 'T', 't', 'N', 'n') @@ -55,7 +60,8 @@ mkldnn_status_t check_gemm_input(const char *transa, const char *transb, && *N >= 0 && *K >= 0; - if (!consistency) return invalid_arguments; + if (!consistency) + return mkldnn_invalid_arguments; bool isTransA = utils::one_of(*transa, 'T', 't'); bool isTransB = utils::one_of(*transb, 'T', 't'); int nrowA = isTransA ? *K : *M; @@ -64,136 +70,65 @@ mkldnn_status_t check_gemm_input(const char *transa, const char *transb, && *lda >= nstl::max(1, nrowA) && *ldb >= nstl::max(1, nrowB) && *ldc >= nstl::max(1, *M); - if (!consistency) return invalid_arguments; + if (!consistency) + return mkldnn_invalid_arguments; - return success; + return mkldnn_success; } mkldnn_status_t check_gemm_x8x8x32_input(const char *offsetc, const char *transa, const char *transb, const int *M, const int *N, const int *K, const int *lda, const int *ldb, const int *ldc, const float *alpha, const float *beta, const bool with_bias) { - - if (offsetc == nullptr) return invalid_arguments; + if (offsetc == nullptr) + return mkldnn_invalid_arguments; if (!utils::one_of(*offsetc, 'F', 'f', 'C', 'c', 'R', 'r')) - return invalid_arguments; + return mkldnn_invalid_arguments; return check_gemm_input(transa, transb, M, N, K, lda, ldb, ldc, alpha, beta, with_bias); } -struct gemm_impl_t { - gemm_impl_t(char transa, char transb, bool zero_beta, bool with_bias) { - //jit kernel has three codepaths: beta is 0, 1 or arbitrary - //we will generate kernel for 0 and arbitrary beta - float zero = 0.0f, arbitrary_float = 2.0f; - if (mayiuse(avx512_common)) { - isa_ = avx512_common; - ker_ = (void *)new jit_avx512_common_gemm_f32( - transa, transb, zero_beta ? zero : arbitrary_float, - with_bias); - } - else if (mayiuse(avx)) { - isa_ = avx; - ker_ = (void *)new jit_avx_gemm_f32( - transa, transb, zero_beta ? 
zero : arbitrary_float, - with_bias); - } - } - - mkldnn_status_t call(const char *transa, const char *transb, const int *M, - const int *N, const int *K, const float *alpha, const float *A, - const int *lda, const float *B, const int *ldb, const float *beta, - float *C, const int *ldc, const float *bias = nullptr) { - switch (isa_) { - case avx: - ((jit_avx_gemm_f32*)ker_)->sgemm(transa, transb, M, N, K, - alpha, A, lda, B, ldb, beta, C, ldc, bias); - break; - case avx512_common: - ((jit_avx512_common_gemm_f32*)ker_)->sgemm(transa, transb, - M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, bias); - break; - default: - ref_gemm(transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, - C, ldc, bias); - break; - } - return mkldnn_success; - } - - void *ker_; - cpu_isa_t isa_; -}; -//Gemm implementations for: zero/nonzero beta, transA, transB -static gemm_impl_t *gemm_impl[2][2][2]; -//Gemm with bias implementations for: transA, transB -//Gemm with bias for beta!=0. is not supported -static gemm_impl_t *gemm_bias_impl[2][2]; - -void initialize() { - for (int i = 0; i < 2; ++i) { - gemm_impl[i][0][0] = new gemm_impl_t('n', 'n', (bool)i, false); - gemm_impl[i][0][1] = new gemm_impl_t('n', 't', (bool)i, false); - gemm_impl[i][1][0] = new gemm_impl_t('t', 'n', (bool)i, false); - gemm_impl[i][1][1] = new gemm_impl_t('t', 't', (bool)i, false); - } - gemm_bias_impl[0][0] = new gemm_impl_t('n', 'n', true, true); - gemm_bias_impl[0][1] = new gemm_impl_t('n', 't', true, true); - gemm_bias_impl[1][0] = new gemm_impl_t('t', 'n', true, true); - gemm_bias_impl[1][1] = new gemm_impl_t('t', 't', true, true); -} - mkldnn_status_t extended_sgemm(const char *transa, const char *transb, const int *M, const int *N, const int *K, const float *alpha, const float *A, const int *lda, const float *B, const int *ldb, const float *beta, float *C, const int *ldc, const float *bias, const bool force_jit_gemm) { - //Check input mkldnn_status_t status = check_gemm_input(transa, transb, M, N, K, lda, ldb, ldc, alpha, beta, bias != nullptr); if (status != mkldnn_success) return status; - if (*M == 0 || *N == 0 || *K == 0) - return mkldnn_success; - int trA = *transa == 't' || *transa == 'T'; - int trB = *transb == 't' || *transb == 'T'; + #ifdef USE_CBLAS if (!force_jit_gemm) { - //Call cblas + bool trA = *transa == 't' || *transa == 'T'; + bool trB = *transb == 't' || *transb == 'T'; CBLAS_TRANSPOSE Cblas_trA = trA ? CblasTrans : CblasNoTrans; CBLAS_TRANSPOSE Cblas_trB = trB ? 
CblasTrans : CblasNoTrans; cblas_sgemm(CblasColMajor, Cblas_trA, Cblas_trB, *M, *N, *K, *alpha, A, *lda, B, *ldb, *beta, C, *ldc); - //Add bias if necessary (bias is applied to columns of C) + if (bias) { + // Add bias if necessary (bias is applied to columns of C) cblas_int incx = 1, incy = 1; parallel_nd(*N, [&](int n) { - cblas_saxpy(*M, 1.0, bias, incx, C + n*(*ldc), incy); + ptrdiff_t offset = (ptrdiff_t)n * (*ldc); + cblas_saxpy(*M, 1.0, bias, incx, C + offset, incy); }); } return mkldnn_success; } #endif - //Generate jit kernel and call sgemm with bias - volatile static int initialized = 0; - if (!initialized) { - static std::mutex mtx; - std::lock_guard lock(mtx); - if (!initialized) { - mkldnn::impl::cpu::initialize(); - initialized = 1; - } - } - if (bias) - gemm_bias_impl[trA][trB]->call( - transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, - bias); - else - gemm_impl[*beta == 0.f][trA][trB]->call( - transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); - return mkldnn_success; + if (mayiuse(avx512_common)) + return jit_avx512_common_gemm_f32(transa, transb, + M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, bias); + else if (mayiuse(avx)) + return jit_avx_gemm_f32(transa, transb, + M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, bias); + else + return ref_gemm(transa, transb, + M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, bias); } template @@ -202,22 +137,20 @@ mkldnn_status_t gemm_s8x8s32(const char *transa, const char *transb, const float *alpha, const int8_t *A, const int *LDA, const int8_t *ao, const b_dt *B, const int *LDB, const int8_t *bo, const float *beta, int32_t *C, const int *LDC, const int32_t *co) { - mkldnn_status_t status = check_gemm_x8x8x32_input(offsetc, transa, transb, M, N, K, LDA, LDB, LDC, alpha, beta, false); - if (status != mkldnn_success) return status; if (*M == 0 || *N == 0 || *K == 0) return mkldnn_success; - bool OCisR = (*offsetc == 'R' || *offsetc == 'r'); - bool OCisC = (*offsetc == 'C' || *offsetc == 'c'); - bool AisN = (*transa == 'N' || *transa == 'n'); - bool BisN = (*transb == 'N' || *transb == 'n'); +#if USE_MKL_IGEMM + bool OCisR = (*offsetc == 'R' || *offsetc == 'r'); + bool OCisC = (*offsetc == 'C' || *offsetc == 'c'); + bool AisN = (*transa == 'N' || *transa == 'n'); + bool BisN = (*transb == 'N' || *transb == 'n'); -#if defined(USE_MKL) && defined(USE_CBLAS) if (data_traits::data_type == data_type::u8) { CBLAS_TRANSPOSE Cblas_trA = AisN ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE Cblas_trB = BisN ? CblasNoTrans : CblasTrans; @@ -228,64 +161,58 @@ mkldnn_status_t gemm_s8x8s32(const char *transa, const char *transb, ? CblasColOffset : CblasFixOffset; cblas_gemm_s8u8s32(CblasColMajor, Cblas_trA, Cblas_trB, Cblas_offsetc, - *M, *N, *K, *alpha, A, *LDA, *ao, (b_dt*)B, *LDB, *bo, *beta, C, *LDC, co); + *M, *N, *K, *alpha, A, *LDA, *ao, (uint8_t *)B, *LDB, *bo, + *beta, C, *LDC, co); return mkldnn_success; + } else { + assert(data_traits::data_type == data_type::s8); + // TODO CBLAS implementation of gemm_s8s8s32 goes here. + // mkldnn_gemm_s8s8s32 doesn't support non-zero ao and bo + if ((mayiuse(avx512_core) || mayiuse(avx512_core_vnni)) + && *ao == 0 && *bo == 0) { + return jit_avx512_core_gemm_s8s8s32(transa, transb, offsetc, M, + N, K, alpha, A, LDA, ao, (int8_t *)B, LDB, bo, beta, + C, LDC, co); + } else { + return ref_gemm_s8x8s32(transa, transb, offsetc, M, N, K, + alpha, A, LDA, ao, B, LDB, bo, beta, C, LDC, co); + } } -#endif - int m = *M, n = *N, k = *K, lda = *LDA, ldb = *LDB, ldc = *LDC; - size_t sizeA = AisN ? 
lda * k : lda * m; - size_t sizeB = BisN ? ldb * n : ldb * k; - size_t sizeC = ldc * n; - - double *dA = (double *)malloc(sizeA * sizeof(double), PAGE_4K); - double *dB = (double *)malloc(sizeB * sizeof(double), PAGE_4K); - double *dC = (double *)malloc(sizeC * sizeof(double), PAGE_4K); - - if (utils::any_null(dA, dB, dC)) { - free(dA); - free(dB); - free(dC); - return mkldnn_out_of_memory; +#else + cpu_isa_t isa = isa_any; + if (mayiuse(avx512_core_vnni)) { + isa = avx512_core_vnni; + } else if (mayiuse(avx512_core)) { + isa = avx512_core; } - auto da_setter = [=] (int i, int j, double v) { dA[j * lda + i] = v; }; - auto db_setter = [=] (int i, int j, double v) { dB[j * ldb + i] = v; }; - - auto ia_accessor = [=] (int i, int j) { return A[j * lda + i]; }; - auto ib_accessor = [=] (int i, int j) { return B[j * ldb + i]; }; - - const int a_rows = AisN ? m : k; - const int a_cols = AisN ? k : m; - mkldnn::impl::parallel_nd(a_cols, a_rows, [&](int j, int i) { - da_setter(i, j, - static_cast(ia_accessor(i, j)) + static_cast(ao[0])); - }); - - const int b_rows = BisN ? k : n; - const int b_cols = BisN ? n : k; - mkldnn::impl::parallel_nd(b_cols, b_rows, [&](int j, int i) { - db_setter(i, j, - static_cast(ib_accessor(i, j)) + static_cast(bo[0])); - }); - double one = 1.0, zero = 0.0; - ref_gemm(transa, transb, M, N, K, &one, dA, LDA, dB, LDB, &zero, - dC, LDC, nullptr); - - auto i2d = [=] (int32_t v) { return static_cast(v); }; - auto f2d = [=] (float v) { return static_cast(v); }; - - mkldnn::impl::parallel_nd(n, m, [&] (int j, int i) { - double coffset = OCisR ? i2d(co[j]) : OCisC ? i2d(co[i]) : i2d(co[0]); - double val = ((*beta == 0.0f) ? 0.0 : f2d(*beta) * i2d(C[i + j * ldc])) - + f2d(*alpha) * dC[i + j * ldc] + coffset; - C[i + j * ldc] = math::out_round(math::saturate(val)); - }); - - free(dA); - free(dB); - free(dC); - return mkldnn_success; + if (data_traits::data_type == data_type::u8) { + switch (isa) { + case avx512_core: + case avx512_core_vnni: + return jit_avx512_core_gemm_s8u8s32(transa, transb, offsetc, M, + N, K, alpha, A, LDA, ao, (uint8_t *)B, LDB, bo, beta, + C, LDC, co); + default: + return ref_gemm_s8x8s32(transa, transb, offsetc, M, N, K, + alpha, A, LDA, ao, B, LDB, bo, beta, C, LDC, co); + } + } else { + assert(data_traits::data_type == data_type::s8); + // mkldnn_gemm_s8s8s32 doesn't support non-zero ao and bo + if ((mayiuse(avx512_core) || mayiuse(avx512_core_vnni)) + && *ao == 0 && *bo == 0) { + return jit_avx512_core_gemm_s8s8s32(transa, transb, offsetc, M, + N, K, alpha, A, LDA, ao, (int8_t *)B, LDB, bo, beta, + C, LDC, co); + } else { + return ref_gemm_s8x8s32(transa, transb, offsetc, M, N, K, + alpha, A, LDA, ao, B, LDB, bo, beta, C, LDC, co); + } + } +#endif } + } } } @@ -305,18 +232,18 @@ mkldnn_status_t mkldnn_gemm_s8u8s32(const char *transa, const char *transb, const char *offsetc, const int *M, const int *N, const int *K, const float *alpha, const int8_t *A, const int *lda, const int8_t *ao, const uint8_t *B, const int *ldb, const int8_t *bo, const float *beta, - int32_t *c, const int *ldc, const int32_t *co) { + int32_t *C, const int *ldc, const int32_t *co) { return gemm_s8x8s32( transa, transb, offsetc, M, N, K, alpha, A, lda, ao, B, ldb, bo, - beta, c, ldc, co); + beta, C, ldc, co); } mkldnn_status_t mkldnn_gemm_s8s8s32(const char *transa, const char *transb, const char *offsetc, const int *M, const int *N, const int *K, const float *alpha, const int8_t *A, const int *lda, const int8_t *ao, const int8_t *B, const int *ldb, const int8_t *bo, const float 
*beta, - int32_t *c, const int *ldc, const int32_t *co) { + int32_t *C, const int *ldc, const int32_t *co) { return gemm_s8x8s32( transa, transb, offsetc, M, N, K, alpha, A, lda, ao, B, ldb, bo, - beta, c, ldc, co); + beta, C, ldc, co); } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/gemm.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/gemm.hpp index 3f33a37..dc15ff7 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/gemm.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/gemm.hpp @@ -13,11 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ + #ifndef GEMM_HPP #define GEMM_HPP + +#include "mkldnn_types.h" +#include "os_blas.hpp" + namespace mkldnn { namespace impl { namespace cpu { + mkldnn_status_t extended_sgemm(const char *transa, const char *transb, const int *M, const int *N, const int *K, const float *alpha, const float *A, const int *lda, const float *B, const int *ldb, @@ -31,17 +37,22 @@ mkldnn_status_t gemm_s8x8s32(const char *transa, const char *transb, const b_dt *B, const int *ldb, const int8_t *bo, const float *beta, int32_t *c, const int *ldc, const int32_t *co); -template -void ref_gemm(const char *transa, const char *transb, const int *M, - const int *N, const int *K, const data_t *alpha, const data_t *A, - const int *lda, const data_t *B, const int *ldb, const data_t *beta, - data_t *C, const int *ldc, const data_t *bias); #ifdef USE_CBLAS #define GEMM_IMPL_STR "gemm:blas" #else #define GEMM_IMPL_STR "gemm:jit" #endif + +#if USE_MKL_IGEMM +#define IGEMM_S8U8S32_IMPL_STR "igemm_s8u8s32:blas" +#define IGEMM_S8S8S32_IMPL_STR "igemm_s8s8s32:blas" +#else +#define IGEMM_S8U8S32_IMPL_STR "igemm_s8u8s32:jit" +#define IGEMM_S8S8S32_IMPL_STR "igemm_s8s8s32:jit" +#endif + } } } + #endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/jit_avx512_common_gemm_f32.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/jit_avx512_common_gemm_f32.hpp deleted file mode 100644 index c057335..0000000 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/jit_avx512_common_gemm_f32.hpp +++ /dev/null @@ -1,58 +0,0 @@ -/******************************************************************************* -* Copyright 2017-2018 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef JIT_AVX512_COMMON_GEMM_F32_HPP -#define JIT_AVX512_COMMON_GEMM_F32_HPP - -#include "c_types_map.hpp" -#include "../jit_generator.hpp" - -namespace mkldnn { -namespace impl { -namespace cpu { - -class jit_avx512_common_gemm_f32 { -public: - void sgemm(const char *transa, const char *transb, const int *M, - const int *N, const int *K, const float *alpha, const float *A, - const int *lda, const float *B, const int *ldb, const float *beta, - float *C, const int *ldc, const float *bias = NULL); - - jit_avx512_common_gemm_f32( - char transa, char transb, float beta, bool hasBias = false); - ~jit_avx512_common_gemm_f32(); - -private: - typedef void (*ker)(long long int, long long int, long long int, float *, - float *, long long int, float *, long long int, float *, float *, - long long int, float *, float *); - void sgemm_nocopy_driver(const char *transa, const char *transb, int m, - int n, int k, const float *alpha, const float *a, int lda, - const float *b, int ldb, const float *beta, float *c, int ldc, - const float *bias, float *ws); - - char transa_, transb_; - float beta_; - bool hasBias_; - struct xbyak_gemm; - xbyak_gemm *ker_bn_, *ker_b1_, *ker_b0_; - int nthrs_; -}; -} -} -} - -#endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/jit_avx_gemm_f32.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/jit_avx_gemm_f32.hpp deleted file mode 100644 index dd34e09..0000000 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/jit_avx_gemm_f32.hpp +++ /dev/null @@ -1,58 +0,0 @@ -/******************************************************************************* -* Copyright 2016-2018 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef JIT_AVX_GEMM_F32_HPP -#define JIT_AVX_GEMM_F32_HPP - -#include "c_types_map.hpp" -#include "../jit_generator.hpp" - -namespace mkldnn { -namespace impl { -namespace cpu { - -class jit_avx_gemm_f32 { -public: - void sgemm(const char *transa, const char *transb, const int *M, - const int *N, const int *K, const float *alpha, const float *A, - const int *lda, const float *B, const int *ldb, const float *beta, - float *C, const int *ldc, const float *bias = NULL); - - jit_avx_gemm_f32( - char transa, char transb, float beta, bool hasBias = false); - ~jit_avx_gemm_f32(); - -private: - typedef void (*ker)(long long int, long long int, long long int, float *, - float *, long long int, float *, long long int, float *, float *, - long long int, float *); - void sgemm_nocopy_driver(const char *transa, const char *transb, int m, - int n, int k, const float *alpha, const float *a, int lda, - const float *b, int ldb, const float *beta, float *c, int ldc, - const float *bias, float *ws); - - char transa_, transb_; - float beta_; - bool hasBias_; - struct xbyak_gemm; - xbyak_gemm *ker_bn_, *ker_b1_, *ker_b0_; - int nthrs_; -}; -} -} -} - -#endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/os_blas.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/os_blas.hpp index 6afe40d..85acfa1 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/os_blas.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/os_blas.hpp @@ -32,7 +32,7 @@ #include "mkl_version.h" -#define USE_MKL_PACKED_GEMM (INTEL_MKL_VERSION >= 20170000) +#define USE_MKL_PACKED_GEMM 0 #define USE_MKL_IGEMM \ (INTEL_MKL_VERSION >= 20180000 && __INTEL_MKL_BUILD_DATE >= 20170628) diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/common.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/common.hpp new file mode 100644 index 0000000..dde72f4 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/common.hpp @@ -0,0 +1,206 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef COMMON_H +#define COMMON_H + +#define GEMM_CODE_SIZE (4096L * 32) + +#define AVX512_UNROLL_M 48 +#define AVX512_UNROLL_N 8 +#define AVX512_UNROLL_K 1 +#define AVX512_BM 9984 +#define AVX512_BN 384 +#define AVX512_BK 768 +#define AVX512_BK_VNNI 1536 +#define AVX512_BK_TRADITIONAL 384 +#define AVX512_BLOCKING_SMALL_K 48 +#define AVX512_BN_SMALL_K 24 + + +#define PAGESIZE 4096 + +#define PADD_BYTESIZE_ONPAGE(x, size) (((x) * (size) + PAGESIZE - 1) / PAGESIZE) * PAGESIZE +#define NEXT_THR_STRIDE(x, size) (PADD_BYTESIZE_ONPAGE(x, size)) / size + +#include "jit_generator.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +enum { + PARTITION_1D_ROW, + PARTITION_1D_COL, + PARTITION_2D_COL_MAJOR, + PARTITION_2D = PARTITION_2D_COL_MAJOR, +}; + +enum { + COPY_NONE, + COPY_A, +}; + +enum { + NO_OFFSET, + FIX_OFFSET, + COL_OFFSET, + ROW_OFFSET, +}; + +// Alias for any dimension related variable. +typedef long long int dim_t; + +typedef struct { + // Interface arguments. + int transa, transb, offsetc; + dim_t m, n, k; + dim_t lda, ldb, ldc; + const int8_t *a; + const uint8_t *b; + int32_t *c; + const float *alpha, *beta; + + int8_t ao, bo; + const int32_t *co; + + // Kernel parameters. + dim_t um, un, uk, bm, bn, bk; + dim_t bn_small_k, bk_traditional, blocking_small_k; + + int (*copyA)(const dim_t *m, const dim_t *n, const int8_t *a, + const dim_t *lda, const int8_t *alpha, int8_t *b, + const dim_t *dummy1, const dim_t *dummy2, int32_t *row_col_sum); + + int (*copyB)(const dim_t *m, const dim_t *n, const uint8_t *a, + const dim_t *lda, const uint8_t *alpha, uint8_t *b, + const dim_t *dummy1, const dim_t *dummy2, int32_t *row_col_sum); + + int (*kernel)(const dim_t *m, const dim_t *n, const dim_t *k, + const float *alpha, const int8_t *a, const uint8_t *b, int32_t *c, + const dim_t ldc, const int32_t *col_offset, + const int32_t *row_offset); + + int (*kernel_b)(const dim_t *m, const dim_t *n, const dim_t *k, + const float *alpha, const int8_t *a, const uint8_t *b, int32_t *c, + const dim_t ldc, const int32_t *col_offset, + const int32_t *row_offset); + + int (*kernel_r)(const dim_t *m, const dim_t *n, const dim_t *k, + const float *alpha, const int8_t *a, const uint8_t *b, int32_t *c, + const dim_t ldc, const int32_t *col_offset, + const int32_t *row_offset); + + int (*kernel_c)(const dim_t *m, const dim_t *n, const dim_t *k, + const float *alpha, const int8_t *a, const uint8_t *b, int32_t *c, + const dim_t ldc, const int32_t *col_offset, + const int32_t *row_offset); + + int (*kernel_b0)(const dim_t *m, const dim_t *n, const dim_t *k, + const float *alpha, const int8_t *a, const uint8_t *b, int32_t *c, + const dim_t ldc, const int32_t *col_offset, + const int32_t *row_offset); + + int (*kernel_b0_b)(const dim_t *m, const dim_t *n, const dim_t *k, + const float *alpha, const int8_t *a, const uint8_t *b, int32_t *c, + const dim_t ldc, const int32_t *col_offset, + const int32_t *row_offset); + + int (*kernel_b0_r)(const dim_t *m, const dim_t *n, const dim_t *k, + const float *alpha, const int8_t *a, const uint8_t *b, int32_t *c, + const dim_t ldc, const int32_t *col_offset, + const int32_t *row_offset); + + int (*kernel_b0_c)(const dim_t *m, const dim_t *n, const dim_t *k, + const float *alpha, const int8_t *a, const uint8_t *b, int32_t *c, + const dim_t ldc, const int32_t *col_offset, + const int32_t *row_offset); + + // Gemv kernels + void (*gemv_s8u8s32_kernel)(const dim_t, const dim_t, const 
float, + const int8_t*, const dim_t, const uint8_t*, + const float, int32_t*); + + void (*gemv_u8s8s32_kernel)(const dim_t, const dim_t, const float, + const uint8_t*, const dim_t, const int8_t*, + const float, int32_t*); + + // Gemv parameters + int swap; + +} blas_t; + + +class jit_avx512_core_u8_copy_an_kern : public jit_generator { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_an_kern); + + public: + jit_avx512_core_u8_copy_an_kern(); +}; + +class jit_avx512_core_u8_copy_at_kern : public jit_generator { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_at_kern); + + public: + jit_avx512_core_u8_copy_at_kern(); +}; + +class jit_avx512_core_u8_copy_bn_kern : public jit_generator { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_bn_kern); + + public: + jit_avx512_core_u8_copy_bn_kern(); +}; + +class jit_avx512_core_u8_copy_bt_kern : public jit_generator { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_bt_kern); + + public: + jit_avx512_core_u8_copy_bt_kern(); +}; + +class jit_avx512_core_u8_copy_sum_an_kern : public jit_generator { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_sum_an_kern); + + public: + jit_avx512_core_u8_copy_sum_an_kern(); +}; + +class jit_avx512_core_u8_copy_sum_at_kern : public jit_generator { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_sum_at_kern); + + public: + jit_avx512_core_u8_copy_sum_at_kern(); +}; + +class jit_avx512_core_u8_copy_sum_bn_kern : public jit_generator { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_sum_bn_kern); + + public: + jit_avx512_core_u8_copy_sum_bn_kern(); +}; + +class jit_avx512_core_u8_copy_sum_bt_kern : public jit_generator { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_sum_bt_kern); + + public: + jit_avx512_core_u8_copy_sum_bt_kern(); +}; + +} +} +} +#endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/gemv.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/gemv.hpp new file mode 100644 index 0000000..db9dd9e --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/gemv.hpp @@ -0,0 +1,28 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "common.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +int gemm_s8u8s32_jump_to_gemv_s8u8s32(blas_t *arg); +int gemv_threading_driver(blas_t *arg); + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8s8s32.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8s8s32.cpp new file mode 100644 index 0000000..07a1396 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8s8s32.cpp @@ -0,0 +1,155 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "common.hpp" +#include "nstl.hpp" +#include "math_utils.hpp" +#include "jit_avx512_core_gemm_s8u8s32.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +void compensation_init(const char *offsetC, int32_t *compensation, int len, + const int32_t *oc) { + bool OCisC = (*offsetC == 'C' || *offsetC == 'c'); + bool OCisF = (*offsetC == 'F' || *offsetC == 'f'); + + if (OCisF && (*oc) != 0) { + for (int i = 0; i < len; i++) + compensation[i] = *oc; + } else if (OCisC) { + for (int i = 0; i < len; i++) + compensation[i] = oc[i]; + } else { + parallel_nd(len, [=](int i) { compensation[i] = 0; }); + } +} + +void compensation_compute(bool transa, int m, int k, float alpha, + const int8_t *a, int lda, int32_t *compensation) { + if (!transa) { + const int L2_cache_size = get_cache_size(2, true); + const int blocking_factor = nstl::min(k, L2_cache_size / lda + 1); + const int npanels = k / blocking_factor; + const bool has_tile = k % blocking_factor > 0; + + parallel_nd(npanels, m, [&](int j, int i) { + int32_t val = 0; + for (int jb = 0; jb < blocking_factor; jb++) { + val += a[(i + (ptrdiff_t)j * blocking_factor * lda) + + (ptrdiff_t)jb * lda]; + } + if (alpha != 1.0f) { + val = math::out_round(math::saturate( + (double)val * alpha * -128.0)); + } else { + val *= -128; + } + mkldnn_fetch_and_add(&compensation[i], val); + }); + + if (has_tile) { + parallel_nd(m, [=](int i) { + int32_t val = 0; + for (int j = npanels * blocking_factor; j < k; j++) { + val += a[i + (ptrdiff_t)j * lda]; + } + if (alpha != 1.0f) { + val = math::out_round(math::saturate( + (double)val * alpha * -128.0)); + } else { + val *= -128; + } + mkldnn_fetch_and_add(&compensation[i], val); + }); + } + } else { + parallel_nd(m, [=](int i) { + int32_t val = 0; + for (int j = 0; j < k; j++) { + val += a[j + (ptrdiff_t)i * lda]; + } + if (alpha != 1.0f) { + val = math::out_round(math::saturate( + (double)val * alpha * -128.0)); + } else { + val *= -128; + } + compensation[i] += val; + }); + } +} + +void copy_and_shift_b(bool transb, int k, int n, uint8_t *b_u8, int ldb_u8, + const int8_t *b_s8, int ldb_s8) { + const int b_cols = transb ? 
k : n; + + parallel_nd(b_cols, [=](int j) { + const int b_rows = transb ? n : k; + + uint8_t *pb_u8 = b_u8 + j * ldb_u8; + const int8_t *pb_s8 = b_s8 + j * ldb_s8; + + for (int i = 0; i < b_rows; i++) { + (*pb_u8) = (*pb_s8) + 128; + pb_u8++; + pb_s8++; + } + }); +} + +mkldnn_status_t jit_avx512_core_gemm_s8s8s32( + const char *transA, const char *transB, const char *offsetC, + const int *m, const int *n, const int *k, + const float *alpha, const int8_t *a, const int *lda, const int8_t *oa, + const int8_t *b, const int *ldb, const int8_t *ob, + const float *beta, int32_t *c, const int *ldc, const int32_t *oc) { + if (*oa != 0 || *ob != 0) return mkldnn_unimplemented; + + int M = *m, N = *n, K = *k; + bool transa = (*transA == 'T' || *transA == 't'); + bool transb = (*transB == 'T' || *transB == 't'); + int ld = transb ? N : K; + + uint8_t *b_u8 = (uint8_t *)malloc(sizeof(uint8_t) * K * N, 64); + int32_t *compensation = (int32_t *)malloc(sizeof(int32_t) * M, 64); + + if (utils::any_null(b_u8, compensation)) { + free(b_u8); + free(compensation); + return mkldnn_out_of_memory; + } + + compensation_init(offsetC, compensation, M, oc); + compensation_compute(transa, M, K, *alpha, a, *lda, compensation); + copy_and_shift_b(transb, K, N, b_u8, ld, b, *ldb); + + mkldnn_gemm_s8u8s32(transA, transB, "C", m, n, k, alpha, a, lda, oa, b_u8, + &ld, ob, beta, c, ldc, compensation); + + if ((*offsetC == 'R' || *offsetC == 'r')) + parallel_nd(M, N, + [=](int i, int j) { c[i + (ptrdiff_t)j * *ldc] += oc[j]; }); + + free(b_u8); + free(compensation); + + return mkldnn_success; +} +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8s8s32.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8s8s32.hpp new file mode 100644 index 0000000..dc9d43b --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8s8s32.hpp @@ -0,0 +1,37 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef JIT_AVX512_CORE_GEMM_S8S8S32_HPP +#define JIT_AVX512_CORE_GEMM_S8S8S32_HPP + +#include +#include "mkldnn_types.h" + +namespace mkldnn { +namespace impl { +namespace cpu { + +mkldnn_status_t jit_avx512_core_gemm_s8s8s32( + const char *transA, const char *transB, const char *offsetC, + const int *m, const int *n, const int *k, + const float *alpha, const int8_t *a, const int *lda, const int8_t *oa, + const int8_t *b, const int *ldb, const int8_t *ob, + const float *beta, int32_t *c, const int *ldc, const int32_t *oc); +} +} +} + +#endif // JIT_AVX512_CORE_GEMM_S8S8S32_HPP diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8u8s32.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8u8s32.cpp new file mode 100644 index 0000000..e4b8e1c --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8u8s32.cpp @@ -0,0 +1,1409 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include +#include + +#include "common.hpp" +#include "mkldnn_types.h" +#include "nstl.hpp" +#include "utils.hpp" + +#include "jit_avx512_core_gemm_s8u8s32.hpp" +#include "jit_avx512_core_gemm_s8u8s32_kern.hpp" +#include "jit_avx512_core_kernel_gemv_s8u8s32_kern.hpp" +#include "gemv.hpp" + +#if defined(_MSC_VER) +#include +#endif + +namespace mkldnn { +namespace impl { +namespace cpu { + +typedef struct { + int nthrs_m, nthrs_n; + int partition; + int copy_type; +} blas_thread_t; + +static inline void round_to_nearest(int32_t *rounded_val, double fp_val) { + if (fp_val >= 0.) 
{ + fp_val += 0.5; + if (fp_val > INT32_MAX) { + fp_val = INT32_MAX; + } + } else { + fp_val -= 0.5; + if (fp_val < INT32_MIN) { + fp_val = INT32_MIN; + } + } + *rounded_val = (int32_t) fp_val; +} + +static inline void add_results(const dim_t m, const dim_t n, const dim_t k, + const float alpha, const float beta, const int32_t *c_partial_sum, + const dim_t ldcp, int32_t *c_data, const dim_t ldc, + const int32_t *a_row_sum, const int32_t *b_col_sum, const int8_t ao, + const int8_t bo, const int32_t *co, const int offsetc) +{ + for (dim_t j = 0; j < n; ++j) { + for (dim_t i = 0; i < m; ++i) { + int32_t ctemp = c_partial_sum[i + j * ldcp]; + + if (alpha == 1.0f) { + if (beta == 0.0f) { + c_data[i + j * ldc] = ctemp; + } else { + double c_float = (double) beta + * (double) c_data[i + j * ldc]; + c_float += (double) ctemp; + round_to_nearest(&c_data[i + j * ldc], c_float); + } + } else if (alpha == -1.0f) { + if (beta == 0.0f) { + c_data[i + j * ldc] = -ctemp; + } else { + double c_float = (double) beta + * (double) c_data[i + j * ldc]; + c_float -= (double) ctemp; + round_to_nearest(&c_data[i + j * ldc], c_float); + } + } else { + if (beta == 0.0f) { + double c_float = alpha * (double) ctemp; + round_to_nearest(&c_data[i + j * ldc], c_float); + } else { + double c_float = alpha * (double) ctemp + + beta * (double) c_data[i + j * ldc]; + round_to_nearest(&c_data[i + j * ldc], c_float); + } + } + + if (offsetc == FIX_OFFSET) { + c_data[i + j * ldc] += co[0]; + } else if (offsetc == ROW_OFFSET) { + c_data[i + j * ldc] += co[j]; + } else if (offsetc == COL_OFFSET) { + c_data[i + j * ldc] += co[i]; + } + } + } +} + +// TODO Find a better place for those functions. +static inline dim_t ld_padd(const dim_t x) +{ + return ((x + ((2048 / sizeof(int32_t)) - 1)) / (2048 / sizeof(int32_t))) + * (2048 / sizeof(int32_t)) + (64 / sizeof(int32_t)); +} + +void igemm_inner_kernel(const dim_t m, const dim_t n, const dim_t k, + const int8_t *a, const uint8_t *b, float beta, int32_t *c, + const dim_t ldc, const int32_t *a_row_sum, const int32_t *b_col_sum, + const int32_t *co, const int offsetc, const blas_t *arg) +{ + int8_t ao = arg->ao; + int8_t bo = arg->bo; + int32_t co_0 = (offsetc == NO_OFFSET)? 
0 : co[0];
+
+    // Since m and n are limited by blocking, stack overflow should not
+    // happen; at most 32kB is used here
+#if !defined(_MSC_VER)
+    int32_t col_offset[m];
+    int32_t row_offset[n];
+#else
+    int32_t *col_offset = (int32_t *) _alloca(sizeof(*col_offset) * m);
+    int32_t *row_offset = (int32_t *) _alloca(sizeof(*row_offset) * n);
+#endif
+
+    int col_req = 0;
+    int row_req = 0;
+
+    if ((bo != 0) || (offsetc == COL_OFFSET))
+        col_req = 1;
+    if ((ao != 0) || (offsetc == ROW_OFFSET))
+        row_req = 1;
+
+    // It needs one of column or row offsets, but it doesn't need both
+    if (((ao != 0) && (bo != 0)) || ((offsetc == FIX_OFFSET) && (co_0 != 0))) {
+        if ((col_req == 0) && (row_req == 0)) {
+            if (m <= n) {
+                col_req = 1;
+            } else {
+                row_req = 1;
+            }
+        }
+    }
+
+    if (col_req) {
+        for (dim_t i = 0; i < m; i++)
+            col_offset[i] = 0;
+
+        if (offsetc == COL_OFFSET) {
+            for (dim_t i = 0; i < m; i++)
+                col_offset[i] += co[i];
+        }
+
+        if (bo != 0) {
+            for (dim_t i = 0; i < m; i++)
+                col_offset[i] += bo * a_row_sum[i];
+        }
+    }
+
+    if (row_req) {
+        for (dim_t i = 0; i < n; i++)
+            row_offset[i] = 0;
+
+        if (offsetc == ROW_OFFSET) {
+            for (dim_t i = 0; i < n; i++)
+                row_offset[i] += co[i];
+        }
+
+        if (ao != 0) {
+            for (dim_t i = 0; i < n; i++)
+                row_offset[i] += ao * b_col_sum[i];
+        }
+    }
+
+    if ((offsetc == FIX_OFFSET) && (co_0 != 0)) {
+        if (col_req) {
+            for (dim_t i = 0; i < m; i++)
+                col_offset[i] += co_0;
+        } else {
+            for (dim_t i = 0; i < n; i++)
+                row_offset[i] += co_0;
+        }
+    }
+
+    if ((ao != 0) && (bo != 0)) {
+        if (col_req) {
+            for (dim_t i = 0; i < m; i++)
+                col_offset[i] += (int32_t) k * ao * bo;
+        } else {
+            for (dim_t i = 0; i < n; i++)
+                row_offset[i] += (int32_t) k * ao * bo;
+        }
+    }
+
+    if (col_req == 0) {
+        if (row_req == 0) {
+            if (beta == 0.0) {
+                arg->kernel_b0(&m, &n, &k, NULL, a, b, c, ldc, col_offset,
+                        row_offset);
+            } else {
+                arg->kernel(&m, &n, &k, NULL, a, b, c, ldc, col_offset,
+                        row_offset);
+            }
+        } else {
+            if (beta == 0.0) {
+                arg->kernel_b0_r(&m, &n, &k, NULL, a, b, c, ldc, col_offset,
+                        row_offset);
+            } else {
+                arg->kernel_r(&m, &n, &k, NULL, a, b, c, ldc, col_offset,
+                        row_offset);
+            }
+        }
+    } else {
+        if (row_req == 0) {
+            if (beta == 0.0) {
+                arg->kernel_b0_c(&m, &n, &k, NULL, a, b, c, ldc, col_offset,
+                        row_offset);
+            } else {
+                arg->kernel_c(&m, &n, &k, NULL, a, b, c, ldc, col_offset,
+                        row_offset);
+            }
+        } else {
+            if (beta == 0.0) {
+                arg->kernel_b0_b(&m, &n, &k, NULL, a, b, c, ldc, col_offset,
+                        row_offset);
+            } else {
+                arg->kernel_b(&m, &n, &k, NULL, a, b, c, ldc, col_offset,
+                        row_offset);
+            }
+        }
+    }
+}
+
+static inline void *align(void *ptr, size_t alignment)
+{
+    return (void *) utils::rnd_up((uintptr_t) ptr, alignment);
+}
+
+static int gemm_kernel_driver(const dim_t m, const dim_t n, const dim_t k,
+        const int8_t *a, const uint8_t *b, int32_t *c, const int32_t *co,
+        const blas_t *arg)
+{
+    dim_t lda = arg->lda;
+    dim_t ldb = arg->ldb;
+    dim_t ldc = arg->ldc;
+    int8_t ao = arg->ao;
+    int8_t bo = arg->bo;
+    float alpha = *arg->alpha;
+    float beta = *arg->beta;
+
+    if (m <= 0 || n <= 0) {
+        return 0;
+    }
+
+    // Padding along K dimension.
+    dim_t k_padd = 0;
+    if (k <= arg->bk_traditional) {
+        k_padd = utils::rnd_up(k, arg->uk);
+        k_padd = nstl::max(128LL, k_padd);
+    } else if (k < 2 * arg->bk) {
+        k_padd = utils::rnd_up(k / 2, arg->uk);
+    } else {
+        k_padd = arg->bk;
+    }
+
+    // Padding along M dimension.
+    dim_t m_padd = utils::rnd_up(nstl::min(nstl::max(m, arg->um), arg->bm),
+            arg->um);
+
+    // Padding along N dimension.
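+    // (Editorial note, not in the original patch) When K is small
+    // (k < blocking_small_k) a narrower N block is chosen below, presumably
+    // so the copied B panels stay cache-resident; with the AVX512 defaults
+    // from common.hpp that means bn_small_k = 24 instead of bn = 384.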
+    dim_t n_padd = 0;
+    if (k < arg->blocking_small_k) {
+        n_padd = utils::rnd_up(nstl::min(nstl::max(n, arg->un),
+                arg->bn_small_k), arg->un);
+    } else {
+        n_padd = utils::rnd_up(nstl::min(nstl::max(n, arg->un), arg->bn),
+                arg->un);
+    }
+
+    // Padding for the temporary C buffer.
+    dim_t ldc_buf = ld_padd(m_padd);
+
+    dim_t strideAm = (arg->transa == 0)? 1 : lda;
+    dim_t strideAn = (arg->transa != 0)? 1 : lda;
+    dim_t strideBm = (arg->transb == 0)? 1 : ldb;
+    dim_t strideBn = (arg->transb != 0)? 1 : ldb;
+
+    size_t a_buf_nelems = m_padd * k_padd;
+    size_t b_buf_nelems = k_padd * n_padd;
+    size_t a_row_sum_nelems = m_padd;
+    size_t b_col_sum_nelems = n_padd;
+
+    size_t mem_size = a_buf_nelems * sizeof(*a) + PAGE_4K
+        + b_buf_nelems * sizeof(*b) + PAGE_4K
+        + a_row_sum_nelems * sizeof(*c) + PAGE_4K
+        + b_col_sum_nelems * sizeof(*c) + PAGE_4K;
+
+    bool need_c_buffer = alpha != 1.0f || (beta != 1 && beta != 0);
+    if (need_c_buffer) {
+        size_t c_buf_nelems = ldc_buf * n_padd;
+        mem_size += c_buf_nelems * sizeof(*c) + PAGE_4K;
+    }
+
+    char *mem = (char *) malloc(mem_size, 128);
+
+    if (!mem) {
+        return -1;
+    }
+
+    int8_t *bufferA = (int8_t *) align(mem, PAGE_4K);
+    uint8_t *bufferB = (uint8_t *) align(bufferA + a_buf_nelems, PAGE_4K);
+    int32_t *a_row_sum = (int32_t *) align(bufferB + b_buf_nelems, PAGE_4K);
+    int32_t *b_col_sum = (int32_t *) align(a_row_sum + a_row_sum_nelems,
+            PAGE_4K);
+
+    int32_t *bufferC = NULL;
+    if (need_c_buffer) {
+        bufferC = (int32_t *) align(b_col_sum + b_col_sum_nelems, PAGE_4K);
+    }
+
+    float beta_saved = beta;
+
+    int a_block_copied = 0;
+    dim_t sizeM = 0;
+    for (dim_t Bm = 0; Bm < m; Bm += sizeM) {
+        sizeM = m - Bm;
+        if (sizeM > m_padd)
+            sizeM = m_padd;
+
+        dim_t sizeK = 0;
+        for (dim_t Bk = 0; Bk < k; Bk += sizeK) {
+            sizeK = k - Bk;
+            if (sizeK > k_padd)
+                sizeK = k_padd;
+
+            // Scale C blocks by beta only on the first K block.
+            if (Bk == 0)
+                beta = beta_saved;
+            else
+                beta = 1.0f;
+
+            // Apply C offset to the last k-block of the partial sum.
+            int offsetc = NO_OFFSET;
+            if (Bk + sizeK == k)
+                offsetc = arg->offsetc;
+
+            dim_t sizeN = 0;
+            for (dim_t Bn = 0; Bn < n; Bn += sizeN) {
+                sizeN = n - Bn;
+                if (sizeN > n_padd)
+                    sizeN = n_padd;
+
+                const uint8_t *b_block = b + Bk * strideBm + Bn * strideBn;
+                arg->copyB(&sizeK, &sizeN, b_block, &ldb, NULL, bufferB, NULL,
+                        NULL, b_col_sum);
+
+                dim_t sizeUM = 0;
+                for (dim_t Um = 0; Um < sizeM; Um += sizeUM) {
+                    sizeUM = sizeM - Um;
+                    if (sizeUM > arg->um)
+                        sizeUM = arg->um;
+
+                    /*
+                     * Use the whole A buffer only if we have multiple B
+                     * blocks for the k-dimension; otherwise we would waste
+                     * cache storing B and C blocks.
+                     */
+                    dim_t Um_forA = 0;
+                    if (sizeN < n)
+                        Um_forA = Um;
+
+                    const int8_t *a_block = a + (Bm + Um) * strideAm
+                        + Bk * strideAn;
+                    if (!a_block_copied) {
+                        arg->copyA(&sizeK, &sizeUM, a_block, &lda, NULL,
+                                bufferA + Um_forA * sizeK, NULL, NULL,
+                                a_row_sum + Um_forA);
+                    }
+
+                    int32_t *c_block = c + (Bm + Um) + Bn * ldc;
+                    dim_t co_stride = 0;
+                    if (offsetc == FIX_OFFSET) {
+                        co_stride = 0;
+                    } else if (offsetc == ROW_OFFSET) {
+                        co_stride = Bn;
+                    } else if (offsetc == COL_OFFSET) {
+                        co_stride = Bm + Um;
+                    }
+                    if (need_c_buffer) {
+                        igemm_inner_kernel(sizeUM, sizeN, sizeK,
+                                bufferA + Um_forA * sizeK, bufferB, 0.0f,
+                                bufferC + Um, ldc_buf, a_row_sum + Um_forA,
+                                b_col_sum, NULL, NO_OFFSET, arg);
+
+                        // Finish the block adding the necessary alpha, beta
+                        // and offsets.
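+                        // (Editorial note, not in the original patch)
+                        // add_results() saturates on the way back to int32;
+                        // the temporary bufferC is only in play when alpha
+                        // and beta make the update non-trivial (see
+                        // need_c_buffer above).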
+                        add_results(sizeUM, sizeN, sizeK, alpha, beta,
+                                bufferC + Um, ldc_buf, c_block, ldc,
+                                a_row_sum + Um_forA, b_col_sum, ao, bo,
+                                co + co_stride, offsetc);
+                    } else {
+                        igemm_inner_kernel(sizeUM, sizeN, sizeK,
+                                bufferA + Um_forA * sizeK, bufferB, beta,
+                                c_block, ldc, a_row_sum + Um_forA, b_col_sum,
+                                co + co_stride, offsetc, arg);
+                    }
+                }
+                a_block_copied = 1;
+            }
+            a_block_copied = 0;
+        }
+    }
+
+    free(mem);
+
+    return 0;
+}
+
+static int kernel_driver_parallel_acopiedbcopy(const dim_t m, const dim_t n,
+        const dim_t k, const int8_t *bufferA, const uint8_t *b,
+        const float beta, int32_t *c, const int offsetc, const int32_t *co,
+        const int32_t *a_row_sum, const blas_t *arg)
+{
+    dim_t ldb = arg->ldb;
+    dim_t ldc = arg->ldc;
+    int8_t ao = arg->ao;
+    int8_t bo = arg->bo;
+    float alpha = *arg->alpha;
+
+    if (m <= 0 || n <= 0) {
+        return 0;
+    }
+
+    // Padding along N dimension.
+    dim_t n_padd = 0;
+    if (k < arg->blocking_small_k) {
+        n_padd = utils::rnd_up(nstl::min(nstl::max(n, arg->un),
+                arg->bn_small_k), arg->un);
+    } else {
+        n_padd = utils::rnd_up(nstl::min(nstl::max(n, arg->un), arg->bn),
+                arg->un);
+    }
+
+    // Padding for the temporary C buffer.
+    dim_t ldc_buf = ld_padd(m);
+
+    dim_t strideBn = (arg->transb != 0)? 1 : ldb;
+
+    size_t b_buf_nelems = k * n_padd;
+    size_t b_col_sum_nelems = n_padd;
+
+    size_t mem_size = b_buf_nelems * sizeof(*b) + PAGE_4K
+        + b_col_sum_nelems * sizeof(*c) + PAGE_4K;
+
+    bool need_c_buffer = alpha != 1.0f || (beta != 1 && beta != 0);
+    if (need_c_buffer) {
+        size_t c_buf_nelems = ldc_buf * n_padd;
+        mem_size += c_buf_nelems * sizeof(*c) + PAGE_4K;
+    }
+
+    char *mem = (char *) malloc(mem_size, 128);
+
+    if (!mem) {
+        return -1;
+    }
+
+    uint8_t *bufferB = (uint8_t *) align(mem, PAGE_4K);
+    int32_t *b_col_sum = (int32_t *) align(bufferB + b_buf_nelems, PAGE_4K);
+
+    int32_t *bufferC = NULL;
+    if (need_c_buffer) {
+        bufferC = (int32_t *) align(b_col_sum + b_col_sum_nelems, PAGE_4K);
+    }
+
+    dim_t sizeN = 0;
+    for (dim_t Bn = 0; Bn < n; Bn += sizeN) {
+        sizeN = n - Bn;
+        if (sizeN > n_padd)
+            sizeN = n_padd;
+
+        // Copy the B block and compute its column sums.
+        const uint8_t *b_block = b + Bn * strideBn;
+        arg->copyB(&k, &sizeN, b_block, &ldb, NULL, bufferB, NULL, NULL,
+                b_col_sum);
+
+        dim_t co_stride = 0;
+        if (offsetc == FIX_OFFSET) {
+            co_stride = 0;
+        } else if (offsetc == ROW_OFFSET) {
+            co_stride = Bn;
+        } else if (offsetc == COL_OFFSET) {
+            co_stride = 0;
+        }
+        int32_t *c_block = c + Bn * ldc;
+        if (need_c_buffer) {
+            igemm_inner_kernel(m, sizeN, k, bufferA, bufferB, 0.0f, bufferC,
+                    ldc_buf, a_row_sum, b_col_sum, NULL, NO_OFFSET, arg);
+
+            // Finish the block adding the necessary alpha, beta and offsets.
+            add_results(m, sizeN, k, alpha, beta, bufferC, ldc_buf, c_block,
+                    ldc, a_row_sum, b_col_sum, ao, bo, co + co_stride,
+                    offsetc);
+        } else {
+            igemm_inner_kernel(m, sizeN, k, bufferA, bufferB, beta, c_block,
+                    ldc, a_row_sum, b_col_sum, co + co_stride, offsetc, arg);
+        }
+    }
+
+    free(mem);
+
+    return 0;
+}
+
+#define N2D_MAX_AVX512 384
+#define M2D_MIN_AVX512 384
+#define VECLEN 16
+#define NCONS 1
+static inline void set_thread_opts_avx512(int *p_nthrs,
+        blas_thread_t *thread_info, const blas_t *arg)
+{
+    int nthrs = *p_nthrs;
+    dim_t m = arg->m;
+    dim_t n = arg->n;
+
+    thread_info->nthrs_m = 0;
+    thread_info->nthrs_n = 0;
+    thread_info->copy_type = COPY_NONE; // By default, don't do parallel copy.
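+    // (Editorial note, not in the original patch) The heuristics below pick
+    // between a 2D partition of C (when m and n are comparable), a parallel
+    // copy of A (condition_1D_copya, for tall cases or non-zero offsets),
+    // and a plain 1D row/column split as the fallback.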
+ + int condition_2D_bsrc = -1; + if ((256 * m > nthrs * n) && (nthrs * m < 256 * n)) { + condition_2D_bsrc = 1; + } else { + condition_2D_bsrc = 0; + } + + int condition_1D_copya = 0; + if ((m >= 1000) && (n >= nthrs * N2D_MAX_AVX512 / 4)) { + condition_2D_bsrc = 0; + condition_1D_copya = 1; + } + + // If offset is non-zero, we need to keep 1D_copya to reduce update overhead + if (arg->ao != 0 || arg->bo != 0 || arg->co[0] != 0 + || arg->offsetc != FIX_OFFSET) { + condition_2D_bsrc = 0; + condition_1D_copya = 1; + } + + if (condition_2D_bsrc == 1) { + int nthrs_m = 1; + int nthrs_n = nthrs; + + while ((nthrs_n % 2 == 0) && + (n / nthrs > N2D_MAX_AVX512 || + n / nthrs_n <= N2D_MAX_AVX512 / 2) && + (m / nthrs_m >= 2 * M2D_MIN_AVX512) && + (nthrs_m < 4)) { + nthrs_m *= 2; + nthrs_n /= 2; + } + + thread_info->nthrs_m = nthrs_m; + thread_info->nthrs_n = nthrs_n; + thread_info->partition = PARTITION_2D; + + // Reset the total number of threads that will be used. + *p_nthrs = nthrs_m * nthrs_n; + + } else if (condition_1D_copya && mkldnn_thr_syncable()) { + // Use parallel copy A algorithm + thread_info->copy_type = COPY_A; + thread_info->partition = PARTITION_1D_COL; + } else { + if ((m > n) && (m / nthrs >= VECLEN || n < NCONS * nthrs)) { + thread_info->partition = PARTITION_1D_ROW; + } else { + thread_info->partition = PARTITION_1D_COL; + } + } +} +#undef N2D_MAX_AVX512 +#undef M2D_MIN_AVX512 +#undef VECLEN +#undef NCONS + +static inline void partition_1d(const int ithr, const int nthrs, const dim_t n, + dim_t *t_offset, dim_t *t_block) +{ + dim_t band = n / nthrs; + + dim_t tail = n - (nthrs - 1) * band; + if (tail > (band + 1)) + band++; + tail = n - (nthrs - 1) * band; + + if (ithr < (nthrs - 1)) + *t_block = band; + else + *t_block = tail; + + *t_offset = ithr * band; + + if (*t_offset >= n) { + *t_block = 0; + *t_offset = 0; + } else if ((*t_offset + *t_block) > n) { + *t_block = n - *t_offset; + } +} + +static inline void partition_2d(const int ithr, int *nthrs, const int ithr_i, + const int ithr_j, const int nthrs_m, const int nthrs_n, const dim_t m, + const dim_t n, dim_t *p_m_disp, dim_t *p_m_band, dim_t *p_n_disp, + dim_t *p_n_band) +{ + dim_t m_disp = 0, n_disp = 0; + dim_t m_band = 0, n_band = 0; + + int mdiv = nthrs_m; + int ndiv = nthrs_n; + + dim_t m_bandt = m / mdiv; /* size per thread */ + dim_t n_bandt = n / ndiv; /* size per thread */ + int firstmgroup = mdiv - 1; + int firstngroup = ndiv - 1; + dim_t firstmval = m_bandt; + dim_t firstnval = n_bandt; + + int mthr_used = mdiv; + if (m - (mdiv - 1) * m_bandt > m_bandt + 1) { + if (m - (mdiv - 1) * m_bandt > mdiv) + ++m_bandt; + + firstmval = m_bandt + 1; + mthr_used = (int) (m / firstmval); + + if (mthr_used * firstmval < m) + ++mthr_used; + + firstmgroup = mthr_used - 1; + } + + int nthr_used = ndiv; + if (n - (ndiv - 1) * n_bandt > n_bandt + 1) { + firstnval = n_bandt + 1; + nthr_used = (int) (n / firstnval); + + if (nthr_used * firstnval < n) + ++nthr_used; + + firstngroup = nthr_used - 1; + } + + *nthrs = mthr_used * nthr_used; + + if (ithr < *nthrs) { + if (ithr_i < firstmgroup) { + m_band = firstmval; + m_disp = ithr_i * firstmval; + } else if (ithr_i <= mthr_used - 2) { + m_band = m_bandt; + m_disp = firstmgroup * firstmval + (ithr_i - firstmgroup) * m_bandt; + } else { + m_disp = firstmgroup * firstmval + + (mthr_used - 1 - firstmgroup) * m_bandt; + m_band = nstl::max(0LL, m - m_disp); + } + + if (ithr_j < firstngroup) { + n_band = firstnval; + n_disp = ithr_j * firstnval; + } else if (ithr_j <= nthr_used - 2) { + 
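+            // Middle n-groups take the base band size (editorial note,
+            // not in the original patch).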
n_band = n_bandt; + n_disp = firstngroup * firstnval + (ithr_j - firstngroup) * n_bandt; + } else { + n_disp = firstngroup * firstnval + + (nthr_used - 1 - firstngroup) * n_bandt; + n_band = nstl::max(0LL, n - n_disp); + } + m_disp = nstl::max(nstl::min(m_disp, m - 1), 0LL); + n_disp = nstl::max(nstl::min(n_disp, n - 1), 0LL); + } + + if (ithr < *nthrs) { + *p_m_disp = m_disp; + *p_n_disp = n_disp; + *p_m_band = m_band; + *p_n_band = n_band; + } else { + *p_m_disp = 0; + *p_n_disp = 0; + *p_m_band = 0; + *p_n_band = 0; + } + + return; +} + +static inline void decompose_matrices(const int ithr, int *nthrs, dim_t *m, + dim_t *n, dim_t *k, const int8_t **a, const uint8_t **b, int32_t **c, + const int32_t **co, const blas_thread_t *thread_info, const blas_t *arg) +{ + dim_t strideAm = (arg->transa == 0)? 1 : arg->lda; + dim_t strideBn = (arg->transb != 0)? 1 : arg->ldb; + int offsetc = arg->offsetc; + + switch (thread_info->partition) { + case PARTITION_1D_ROW: + { + dim_t offset = 0; + dim_t block = 0; + partition_1d(ithr, *nthrs, arg->m, &offset, &block); + + *m = block; + *n = arg->n; + *k = arg->k; + + // Set matrix A. + *a = arg->a + offset * strideAm; + + // Set matrix B. + *b = arg->b; + + // Set matrix C. + *c = arg->c + offset; + + // Set offset vector for C matrix + dim_t co_stride = 0; + if (offsetc == FIX_OFFSET) { + co_stride = 0; + } else if (offsetc == ROW_OFFSET) { + co_stride = 0; + } else if (offsetc == COL_OFFSET) { + co_stride = offset; + } + *co = arg->co + co_stride; + break; + } + + case PARTITION_1D_COL: + { + dim_t offset = 0; + dim_t block = 0; + partition_1d(ithr, *nthrs, arg->n, &offset, &block); + + *m = arg->m; + *n = block; + *k = arg->k; + + // Set matrix A. + *a = arg->a; + + // Set matrix B. + *b = arg->b + offset * strideBn; + + // Set matrix C. + *c = arg->c + offset * arg->ldc; + + // Set offset vector for C matrix + dim_t co_stride = 0; + if (offsetc == FIX_OFFSET) { + co_stride = 0; + } else if (offsetc == ROW_OFFSET) { + co_stride = offset; + } else if (offsetc == COL_OFFSET) { + co_stride = 0; + } + *co = arg->co + co_stride; + break; + } + + case PARTITION_2D_COL_MAJOR: + { + int nthrs_m = thread_info->nthrs_m; + int nthrs_n = thread_info->nthrs_n; + int ithr_i = ithr % nthrs_m; + int ithr_j = ithr / nthrs_m; + + dim_t m_disp = 0; + dim_t m_band = 0; + dim_t n_disp = 0; + dim_t n_band = 0; + + partition_2d(ithr, nthrs, ithr_i, ithr_j, nthrs_m, nthrs_n, + arg->m, arg->n, &m_disp, &m_band, &n_disp, &n_band); + + *m = m_band; + *n = n_band; + *k = arg->k; + + // Set matrix A. + *a = arg->a + m_disp * strideAm; + + // Set matrix B. + *b = arg->b + n_disp * strideBn; + + // Set matrix C. + *c = arg->c + m_disp + n_disp * arg->ldc; + + // Set offset vector for C matrix + dim_t co_stride = 0; + if (offsetc == FIX_OFFSET) { + co_stride = 0; + } else if (offsetc == ROW_OFFSET) { + co_stride = n_disp; + } else if (offsetc == COL_OFFSET) { + co_stride = m_disp; + } + *co = arg->co + co_stride; + break; + } + } +} + +#define MULTIPLIER 10 +static int parallel_a_copy(const int ithr, const int nthrs, const dim_t m, + const dim_t n, const dim_t k, const int8_t *a, const uint8_t *b, + int32_t *c, const int32_t *co, const blas_t *arg, + char **p_shared_mem) +{ + const dim_t lda = arg->lda; + const dim_t ldb = arg->ldb; + const dim_t strideAm = (arg->transa == 0)? 1 : lda; + const dim_t strideAn = (arg->transa != 0)? 1 : lda; + const dim_t strideBm = (arg->transb == 0)? 1 : ldb; + + // Padding along M dimension. 
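+    // (rnd_up(x, u) rounds x up to the next multiple of u; illustratively,
+    // m = 100 with um = 48 gives min(max(100, 48), bm) = 100, padded up to
+    // 144, so the copy buffer always holds a whole number of unroll blocks,
+    // assuming bm >= m as in the default blocking.)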
+ dim_t m_padd = utils::rnd_up(nstl::min(nstl::max(m, arg->um), arg->bm), + arg->um); + + // Padding along K dimension. + dim_t k_padd = 0; + if (k <= arg->bk_traditional) { + k_padd = utils::rnd_up(k, arg->uk); + k_padd = nstl::max(128LL, k_padd); + } else if (k < 2 * arg->bk) { + k_padd = utils::rnd_up(k / 2, arg->uk); + } else { + k_padd = arg->bk; + } + + m_padd *= nthrs > MULTIPLIER ? MULTIPLIER : nthrs; + if (m_padd > m) { + m_padd = utils::rnd_up(m, arg->um); + } + + size_t a_buf_nelems = m_padd * k_padd; + + // Allocate shared memory for A and its row sum buffers in master thread. + if (ithr == 0) { // If thread master + size_t a_row_sum_nelems = m_padd; + + size_t mem_size = (a_buf_nelems * sizeof(*a) + PAGE_4K) + + a_row_sum_nelems * sizeof(*c) + PAGE_4K; + + *p_shared_mem = (char *) malloc(mem_size, 128); + + } + mkldnn_thr_barrier(); + + char *mem = *p_shared_mem; + int8_t *bufferA = (int8_t *) align(mem, PAGE_4K); + int32_t *a_row_sum = (int32_t *) align(bufferA + a_buf_nelems, PAGE_4K); + + if (!mem) { + return -1; + } + + int result = 0; // Return status + + dim_t sizeK = 0; + for (dim_t Bk = 0; Bk < k; Bk += sizeK) { + sizeK = k - Bk; + if (sizeK > k_padd) + sizeK = k_padd; + + // Scale C blocks by beta only for the first term of partial sum. + float beta = 1.0f; + if (Bk == 0) + beta = *(arg->beta); + + // Apply C offset for the last k-block of the partial sum. + int offsetc = NO_OFFSET; + if (Bk + sizeK == k) + offsetc = arg->offsetc; + + dim_t sizeM = 0; + for (dim_t Bm = 0; Bm < m; Bm += sizeM) { + sizeM = m - Bm; + if (sizeM > m_padd) + sizeM = m_padd; + + if (ithr < nthrs) { + dim_t band = (sizeM + nthrs - 1) / nthrs; + band = utils::rnd_up(band, arg->um); + + dim_t offset = band * ithr; + + // If offset is too large don't use that thread for copying. + if (offset >= sizeM) { + offset = 0; + band = 0; + } + + // Handle the tail of the copy. + if (offset + band > sizeM) { + band = sizeM - offset; + } + + if (band > 0) { + const int8_t *a_block = a + (Bm + offset) * strideAm + + Bk * strideAn; + arg->copyA(&sizeK, &band, a_block, &lda, NULL, + bufferA + offset * sizeK, NULL, NULL, + a_row_sum + offset); + } + } + mkldnn_thr_barrier(); // Wait for finishing parallel copy. + + const uint8_t *b_block = b + Bk * strideBm; + int32_t *c_block = c + Bm; + dim_t co_stride = 0; + if (offsetc == FIX_OFFSET) { + co_stride = 0; + } else if (offsetc == ROW_OFFSET) { + co_stride = 0; + } else if (offsetc == COL_OFFSET) { + co_stride = Bm; + } + + result = kernel_driver_parallel_acopiedbcopy(sizeM, n, sizeK, + bufferA, b_block, beta, c_block, offsetc, co + co_stride, + a_row_sum, arg); + + mkldnn_thr_barrier(); // Wait for kernel computations to finish. 
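+            // The barrier pair above is what makes the shared bufferA safe:
+            // the first barrier delays computation until every thread has
+            // finished its slice of the copy, and this second one keeps a
+            // fast thread from refilling bufferA for the next (Bm, Bk) block
+            // while slower threads are still reading the current contents.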
+        }
+    }
+
+    // Free memory allocated in master thread
+    if (ithr == 0) {
+        free(mem);
+    }
+
+    return result;
+}
+#undef MULTIPLIER
+
+static inline void get_omp_thread_count(dim_t m, dim_t n, dim_t k,
+        double fp_per_cycle, int *nthrs)
+{
+    double omp_overhead_small_core = 3.0e+3;
+    double omp_intercept_big_core = 4.0e+3;
+    double omp_slope_big_core = 5.0e+2;
+
+    double gemm_cycles = 8.0 * m * n * k / fp_per_cycle;
+
+    int i = *nthrs;
+
+    // Use a different model for omp overheads if nthrs is <= 4
+    if (*nthrs <= 4 && omp_overhead_small_core > 0) {
+        double omp_cycles = omp_overhead_small_core;
+        if (gemm_cycles < omp_cycles) {
+            *nthrs = 1;
+            return;
+        } else {
+            while (i > 1) {
+                if (omp_cycles * i < gemm_cycles * (i - 1)) break;
+                --i;
+            }
+        }
+    } else {
+        if (gemm_cycles < (omp_intercept_big_core + 2 * omp_slope_big_core)) {
+            *nthrs = 1;
+            return;
+        }
+
+        // adaptive decrement to march faster
+        while (i > 1) {
+            double omp_cycles = omp_intercept_big_core + i * omp_slope_big_core;
+            if (omp_cycles * i < gemm_cycles * (i - 1))
+                break;
+
+            if (i < 10)
+                i -= 2;
+            else if (i < 30)
+                i -= 4;
+            else
+                i -= 8;
+        }
+    }
+
+    if (i < 1)
+        i = 1;
+
+    *nthrs = i;
+}
+
+#define CACHE_LINE_SIZE 64
+static int gemm_threading_driver(blas_t *arg)
+{
+    if ((arg->m <= 0) || (arg->n <= 0))
+        return mkldnn_success;
+
+    if (gemm_s8u8s32_jump_to_gemv_s8u8s32(arg)) {
+        return mkldnn_success;
+    }
+
+    int nthr = (mkldnn_in_parallel()) ? 1 : mkldnn_get_max_threads();
+    get_omp_thread_count(arg->m, arg->n, arg->k, 64.0, &nthr);
+
+    if (nthr == 1) {
+        return gemm_kernel_driver(arg->m, arg->n, arg->k, arg->a, arg->b,
+                arg->c, arg->co, arg);
+    }
+
+    int *results = (int *) malloc(sizeof(*results) * nthr * CACHE_LINE_SIZE,
+            PAGE_4K);
+
+    if (!results) {
+        return -1;
+    }
+
+    for (int i = 0; i < nthr; i++) {
+        results[i * CACHE_LINE_SIZE] = 0; // Initialize to success
+    }
+
+    char *shared_mem = NULL;
+
+    parallel(nthr, [&](const int ithr, const int nthr) {
+        int nthrs = nthr;
+        if (nthrs == 1) {
+            results[0] = gemm_kernel_driver(arg->m, arg->n, arg->k, arg->a,
+                    arg->b, arg->c, arg->co, arg);
+        } else {
+            blas_thread_t thread_info;
+            set_thread_opts_avx512(&nthrs, &thread_info, arg);
+
+            const int8_t *a = NULL;
+            const uint8_t *b = NULL;
+            int32_t *c = NULL;
+            const int32_t *co = NULL;
+            dim_t m = -1;
+            dim_t n = -1;
+            dim_t k = -1;
+            decompose_matrices(ithr, &nthrs, &m, &n, &k, &a, &b, &c, &co,
+                    &thread_info, arg);
+
+            if (ithr < nthrs) {
+                switch (thread_info.copy_type) {
+                case COPY_A:
+                    results[ithr * CACHE_LINE_SIZE] =
+                        parallel_a_copy(ithr, nthrs, m, n, k, a, b, c, co, arg,
+                                &shared_mem);
+                    break;
+
+                default:
+                case COPY_NONE:
+                    results[ithr * CACHE_LINE_SIZE] =
+                        gemm_kernel_driver(m, n, k, a, b, c, co, arg);
+                    break;
+                }
+            }
+        }
+    });
+
+    int result = 0; // Initialize to success
+    for (int i = 0; i < nthr; i++) {
+        // Thread results are strided by CACHE_LINE_SIZE ints to avoid false
+        // sharing, so the status must be read back with the same stride.
+        if (results[i * CACHE_LINE_SIZE] != 0) {
+            result = results[i * CACHE_LINE_SIZE];
+            break;
+        }
+    }
+
+    free(results);
+
+    return result;
+}
+#undef CACHE_LINE_SIZE
+
+static jit_avx512_core_u8_copy_an_kern *copy_an;
+static jit_avx512_core_u8_copy_at_kern *copy_at;
+static jit_avx512_core_u8_copy_bn_kern *copy_bn;
+static jit_avx512_core_u8_copy_bt_kern *copy_bt;
+static jit_avx512_core_u8_copy_sum_an_kern *copy_sum_an;
+static jit_avx512_core_u8_copy_sum_at_kern *copy_sum_at;
+static jit_avx512_core_u8_copy_sum_bn_kern *copy_sum_bn;
+static jit_avx512_core_u8_copy_sum_bt_kern *copy_sum_bt;
+static jit_avx512_core_gemm_s8u8s32_kern *kernel;
+static jit_avx512_core_gemm_s8u8s32_kern *kernel_b;
+static
jit_avx512_core_gemm_s8u8s32_kern *kernel_r; +static jit_avx512_core_gemm_s8u8s32_kern *kernel_c; +static jit_avx512_core_gemm_s8u8s32_kern *kernel_b0; +static jit_avx512_core_gemm_s8u8s32_kern *kernel_b0_b; +static jit_avx512_core_gemm_s8u8s32_kern *kernel_b0_r; +static jit_avx512_core_gemm_s8u8s32_kern *kernel_b0_c; +static jit_avx512_core_gemv_s8u8s32_kern *gemv_s8u8s32_kernel; +static jit_avx512_core_gemv_s8u8s32_kern *gemv_u8s8s32_kernel; + +static void jit_init(blas_t *arg) +{ + static int (*copyAn)(const dim_t *m, const dim_t *n, const int8_t *a, + const dim_t *lda, const int8_t *alpha, int8_t *b, + const dim_t *dummy1, const dim_t *dummy2, int32_t *row_col_sum); + + static int (*copyAt)(const dim_t *m, const dim_t *n, const int8_t *a, + const dim_t *lda, const int8_t *alpha, int8_t *b, + const dim_t *dummy1, const dim_t *dummy2, int32_t *row_col_sum); + + static int (*copyBn)(const dim_t *m, const dim_t *n, const uint8_t *a, + const dim_t *lda, const uint8_t *alpha, uint8_t *b, + const dim_t *dummy1, const dim_t *dummy2, int32_t *row_col_sum); + + static int (*copyBt)(const dim_t *m, const dim_t *n, const uint8_t *a, + const dim_t *lda, const uint8_t *alpha, uint8_t *b, + const dim_t *dummy1, const dim_t *dummy2, int32_t *row_col_sum); + + static int (*copySumAn)(const dim_t *m, const dim_t *n, const int8_t *a, + const dim_t *lda, const int8_t *alpha, int8_t *b, + const dim_t *dummy1, const dim_t *dummy2, int32_t *row_col_sum); + + static int (*copySumAt)(const dim_t *m, const dim_t *n, const int8_t *a, + const dim_t *lda, const int8_t *alpha, int8_t *b, + const dim_t *dummy1, const dim_t *dummy2, int32_t *row_col_sum); + + static int (*copySumBn)(const dim_t *m, const dim_t *n, const uint8_t *a, + const dim_t *lda, const uint8_t *alpha, uint8_t *b, + const dim_t *dummy1, const dim_t *dummy2, int32_t *row_col_sum); + + static int (*copySumBt)(const dim_t *m, const dim_t *n, const uint8_t *a, + const dim_t *lda, const uint8_t *alpha, uint8_t *b, + const dim_t *dummy1, const dim_t *dummy2, int32_t *row_col_sum); + + static int (*kern)(const dim_t *m, const dim_t *n, const dim_t *k, + const float *alpha, const int8_t *a, const uint8_t *b, int32_t *c, + const dim_t ldc, const int32_t *col_offset, + const int32_t *row_offset); + + static int (*kern_b)(const dim_t *m, const dim_t *n, const dim_t *k, + const float *alpha, const int8_t *a, const uint8_t *b, int32_t *c, + const dim_t ldc, const int32_t *col_offset, + const int32_t *row_offset); + + static int (*kern_r)(const dim_t *m, const dim_t *n, const dim_t *k, + const float *alpha, const int8_t *a, const uint8_t *b, int32_t *c, + const dim_t ldc, const int32_t *col_offset, + const int32_t *row_offset); + + static int (*kern_c)(const dim_t *m, const dim_t *n, const dim_t *k, + const float *alpha, const int8_t *a, const uint8_t *b, int32_t *c, + const dim_t ldc, const int32_t *col_offset, + const int32_t *row_offset); + + static int (*kern_b0)(const dim_t *m, const dim_t *n, const dim_t *k, + const float *alpha, const int8_t *a, const uint8_t *b, int32_t *c, + const dim_t ldc, const int32_t *col_offset, + const int32_t *row_offset); + + static int (*kern_b0_b)(const dim_t *m, const dim_t *n, const dim_t *k, + const float *alpha, const int8_t *a, const uint8_t *b, int32_t *c, + const dim_t ldc, const int32_t *col_offset, + const int32_t *row_offset); + + static int (*kern_b0_r)(const dim_t *m, const dim_t *n, const dim_t *k, + const float *alpha, const int8_t *a, const uint8_t *b, int32_t *c, + const dim_t ldc, const int32_t *col_offset, 
+ const int32_t *row_offset); + + static int (*kern_b0_c)(const dim_t *m, const dim_t *n, const dim_t *k, + const float *alpha, const int8_t *a, const uint8_t *b, int32_t *c, + const dim_t ldc, const int32_t *col_offset, + const int32_t *row_offset); + + static void (*gemv_s8u8s32_kern)(const dim_t, const dim_t, const float, + const int8_t*, const dim_t, const uint8_t*, + const float, int32_t*); + + static void (*gemv_u8s8s32_kern)(const dim_t, const dim_t, const float, + const uint8_t*, const dim_t, const int8_t*, + const float, int32_t*); + + if (mayiuse(avx512_core_vnni)) { + arg->um = AVX512_UNROLL_M; + arg->un = AVX512_UNROLL_N; + arg->uk = AVX512_UNROLL_K; + arg->bm = AVX512_BM; + arg->bn = AVX512_BN; + arg->bk = AVX512_BK_VNNI; + + arg->bk_traditional = AVX512_BK_TRADITIONAL; + arg->bn_small_k = AVX512_BN_SMALL_K; + arg->blocking_small_k = AVX512_BLOCKING_SMALL_K; + } else { + arg->um = AVX512_UNROLL_M; + arg->un = AVX512_UNROLL_N; + arg->uk = AVX512_UNROLL_K; + arg->bm = AVX512_BM; + arg->bn = AVX512_BN; + arg->bk = AVX512_BK; + + arg->bk_traditional = AVX512_BK_TRADITIONAL; + arg->bn_small_k = AVX512_BN_SMALL_K; + arg->blocking_small_k = AVX512_BLOCKING_SMALL_K; + } + + static std::once_flag initialized; + std::call_once(initialized, []{ + + copy_an = new jit_avx512_core_u8_copy_an_kern(); + copy_at = new jit_avx512_core_u8_copy_at_kern(); + copy_bn = new jit_avx512_core_u8_copy_bn_kern(); + copy_bt = new jit_avx512_core_u8_copy_bt_kern(); + + copy_sum_an = new jit_avx512_core_u8_copy_sum_an_kern(); + copy_sum_at = new jit_avx512_core_u8_copy_sum_at_kern(); + copy_sum_bn = new jit_avx512_core_u8_copy_sum_bn_kern(); + copy_sum_bt = new jit_avx512_core_u8_copy_sum_bt_kern(); + + kernel = new jit_avx512_core_gemm_s8u8s32_kern(false, false, false); + kernel_b = new jit_avx512_core_gemm_s8u8s32_kern(false, true, true); + kernel_r = new jit_avx512_core_gemm_s8u8s32_kern(false, false, true); + kernel_c = new jit_avx512_core_gemm_s8u8s32_kern(false, true, false); + kernel_b0 = new jit_avx512_core_gemm_s8u8s32_kern(true, false, false); + kernel_b0_b = new jit_avx512_core_gemm_s8u8s32_kern(true, true, true); + kernel_b0_r = new jit_avx512_core_gemm_s8u8s32_kern(true, false, true); + kernel_b0_c = new jit_avx512_core_gemm_s8u8s32_kern(true, true, false); + + gemv_s8u8s32_kernel = new jit_avx512_core_gemv_s8u8s32_kern(); + gemv_u8s8s32_kernel = new jit_avx512_core_gemv_s8u8s32_kern(); + + + copyAn = copy_an->getCode(); + + copyAt = copy_at->getCode(); + + copyBn = copy_bn->getCode(); + + copyBt = copy_bt->getCode(); + + copySumAn = copy_sum_an->getCode(); + + copySumAt = copy_sum_at->getCode(); + + copySumBn = copy_sum_bn->getCode(); + + copySumBt = copy_sum_bt->getCode(); + + kern = kernel->getCode(); + + kern_b = kernel_b->getCode(); + + kern_r = kernel_r->getCode(); + + kern_c = kernel_c->getCode(); + + kern_b0 = kernel_b0->getCode(); + + kern_b0_b = kernel_b0_b->getCode(); + + kern_b0_r = kernel_b0_r->getCode(); + + kern_b0_c = kernel_b0_c->getCode(); + + gemv_s8u8s32_kern = + gemv_s8u8s32_kernel -> generate + (mayiuse(avx512_core_vnni)); + gemv_u8s8s32_kern = + gemv_u8s8s32_kernel -> generate + (mayiuse(avx512_core_vnni)); + }); + + if (arg->bo == 0) { // No need to compute A row sum if bo is zero + if (arg->transa == 0) { + arg->copyA = copyAn; + } else { + arg->copyA = copyAt; + } + } else { + if (arg->transa == 0) { + arg->copyA = copySumAn; + } else { + arg->copyA = copySumAt; + } + } + + if (arg->ao == 0) { // No need to compute B column sum if ao is zero + if (arg->transb == 0) { + 
arg->copyB = copyBn; + } else { + arg->copyB = copyBt; + } + } else { + if (arg->transb == 0) { + arg->copyB = copySumBn; + } else { + arg->copyB = copySumBt; + } + } + + arg->kernel = kern; + arg->kernel_b = kern_b; + arg->kernel_r = kern_r; + arg->kernel_c = kern_c; + arg->kernel_b0 = kern_b0; + arg->kernel_b0_b = kern_b0_b; + arg->kernel_b0_r = kern_b0_r; + arg->kernel_b0_c = kern_b0_c; + arg -> gemv_s8u8s32_kernel = gemv_s8u8s32_kern; + arg -> gemv_u8s8s32_kernel = gemv_u8s8s32_kern; +} + +mkldnn_status_t jit_avx512_core_gemm_s8u8s32( + const char *transA, const char *transB, const char *offsetC, + const int *m, const int *n, const int *k, + const float *alpha, const int8_t *a, const int *lda, const int8_t *oa, + const uint8_t *b, const int *ldb, const int8_t *ob, + const float *beta, int32_t *c, const int *ldc, const int32_t *oc) +{ + char transa = *transA; + char transb = *transB; + char offsetc = *offsetC; + + blas_t args; + + // Initialize blas structure + args.m = *m; + args.n = *n; + args.k = *k; + args.alpha = alpha; + args.a = a; + args.lda = *lda; + args.b = b; + args.ldb = *ldb; + args.beta = beta; + args.c = c; + args.ldc = *ldc; + args.transa = (transa == 'N' || transa == 'n') ? 0 : 1; + args.transb = (transb == 'N' || transb == 'n') ? 0 : 1; + args.um = 0; + args.un = 0; + args.bm = 0; + args.bn = 0; + args.bk = 0; + args.copyA = NULL; + args.copyB = NULL; + args.kernel = NULL; + args.kernel_b0 = NULL; + args.ao = *oa; + args.bo = *ob; + args.co = oc; + + if (offsetc == 'F' || offsetc == 'f') { + args.offsetc = FIX_OFFSET; + } else if (offsetc == 'R' || offsetc == 'r') { + args.offsetc = ROW_OFFSET; + } else { // offsetc == 'C' || offsetc == 'c' + args.offsetc = COL_OFFSET; + } + + jit_init(&args); + int result = gemm_threading_driver(&args); + + return (result < 0) ? mkldnn_out_of_memory : mkldnn_success; +} + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8u8s32.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8u8s32.hpp new file mode 100644 index 0000000..b2e2902 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8u8s32.hpp @@ -0,0 +1,38 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/
+
+#ifndef JIT_AVX512_CORE_GEMM_S8U8S32_HPP
+#define JIT_AVX512_CORE_GEMM_S8U8S32_HPP
+
+#include <cstdint>
+#include "mkldnn_types.h"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+mkldnn_status_t jit_avx512_core_gemm_s8u8s32(
+    const char *transA, const char *transB, const char *offsetC,
+    const int *m, const int *n, const int *k,
+    const float *alpha, const int8_t *a, const int *lda, const int8_t *oa,
+    const uint8_t *b, const int *ldb, const int8_t *ob,
+    const float *beta, int32_t *c, const int *ldc, const int32_t *oc);
+
+}
+}
+}
+
+#endif // JIT_AVX512_CORE_GEMM_S8U8S32_HPP
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8u8s32_kern.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8u8s32_kern.cpp
new file mode 100644
index 0000000..57554a1
--- /dev/null
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8u8s32_kern.cpp
@@ -0,0 +1,539 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "jit_avx512_core_gemm_s8u8s32_kern.hpp"
+
+
+#ifdef _WIN32
+static const bool is_windows = 1;
+#else
+static const bool is_windows = 0;
+#endif
+
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+using namespace Xbyak;
+
+
+// Convert between vector register lengths.
+static inline Xmm make_xmm(const Xmm &v) { return Xmm(v.getIdx()); }
+static inline Ymm make_ymm(const Xmm &v) { return Ymm(v.getIdx()); }
+
+// Load from or store to C.
+void jit_avx512_core_gemm_s8u8s32_kern::c_load(const Xbyak::Xmm &dst,
+        const Xbyak::Address &src, int nelems)
+{
+    switch (nelems) {
+    default: vmovups(dst, src); break;
+    case 8: vmovups(make_ymm(dst), src); break;
+    case 4: vmovups(make_xmm(dst), src); break;
+    case 2: vmovlps(make_xmm(dst), src); break;
+    case 1: vmovss(make_xmm(dst), src); break;
+    }
+}
+
+void jit_avx512_core_gemm_s8u8s32_kern::c_store(const Xbyak::Address &dst,
+        const Xbyak::Xmm &src, int nelems)
+{
+    switch (nelems) {
+    default: vmovups(dst, src); break;
+    case 8: vmovups(dst, make_ymm(src)); break;
+    case 4: vmovups(dst, make_xmm(src)); break;
+    case 2: vmovsd(dst, make_xmm(src)); break;
+    case 1: vmovss(dst, make_xmm(src)); break;
+    }
+}
+
+// Perform length-4 dot product accumulations of unsigned and signed bytes
+// in parallel.
+// Use vpdpbusd if VNNI available, otherwise emulate.
+void jit_avx512_core_gemm_s8u8s32_kern::dot_product(const Xmm &dst,
+        const Xmm &src1, const Xmm &src2)
+{
+    if (vnni)
+        vpdpbusd(dst, src1, src2);
+    else {
+        vpmaddubsw(dp_scratch, src1, src2);
+        vpmaddwd(dp_scratch, ones, dp_scratch);
+        vpaddd(dst, dst, dp_scratch);
+    }
+}
+
+// Inner kernel.
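+// Register budget for the loop below (per the assignments in the
+// constructor): a 48-row A block needs um_vecs = (unroll_m + 15) / 16 = 3
+// vectors in zmm0-zmm2, B values broadcast through the rotating pair
+// zmm4/zmm5, and the 3 x 8 tile of C accumulators fills zmm8-zmm31.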
+void jit_avx512_core_gemm_s8u8s32_kern::kernel_loop(int unroll_m, int unroll_n, + bool cfetch) +{ + int um_vecs = (unroll_m + 15) >> 4; + Label label_kernel_loop; + + L_aligned(label_kernel_loop); { + for (int h = 0; h < 4; h++) { + for (int j = 0; j < unroll_n; j++) { + const Zmm b = b_regs[j & 1]; + + vpbroadcastd(b, ptr[BO + isize * + (2 * j + 2 * h * unroll_n - offset_b)]); + dot_product(c_regs[0][j], b, a_regs[0]); + + if (j == 1 && !(h & 1)) + prefetch_b(ptr[BO + isize * (prefetch_size_b + + 2 * h * unroll_n - offset_b)]); + else if (j % 3 == 0) + prefetch_a(ptr[AO + isize * (prefetch_size_a + + 32 * (j / 3) + 2 * h * unroll_m - offset_a)]); + + for (int i = 1; i < um_vecs; i++) + dot_product(c_regs[i][j], b, a_regs[i]); + + if (cfetch && (j == std::min(1, unroll_n - 1))) { + if (h == 3) + lea(CO2, ptr[CO2 + LDC]); + else if (h < um_vecs) + prefetch_c(ptr[CO2 + (16 * h * size)]); + } + + if (h == 3 && j == std::min(3, unroll_n - 1)) + lea(AA, ptr[AA + (32 * isize)]); + } + + for (int i = 0; i < um_vecs; i++) + vmovups(a_regs[i], ptr[AO + isize * + (32 * i + 2 * (h + 1) * unroll_m - offset_a)]); + + if (h == 2) + prefetch_x(ptr[AA - (offset_a * isize)]); + } + + add(AO, 8 * isize * unroll_m); + add(BO, 8 * isize * unroll_n); + sub(LoopCount, 1); + jg(label_kernel_loop, T_NEAR); + } +} + +// k remainder loop for kernel. +void jit_avx512_core_gemm_s8u8s32_kern::remainder_kernel(int unroll_m, + int unroll_n, int unroll_k, int bwidth) +{ + if ((unroll_m > IGEMM_UNROLL_M) || (unroll_n > IGEMM_UNROLL_N) + || (unroll_m < 0) || (unroll_n < 0)) + return; + + int um_vecs = (unroll_m + 15) >> 4; + + for (int h = 0; h < unroll_k; h++) { + for (int j = 0; j < unroll_n; j++) { + Zmm b = b_regs[j & 1]; + auto b_src = ptr[BO + (-isize * offset_b + + bwidth * (j + h * unroll_n))]; + + switch (bwidth) { + case 4: + vpbroadcastd(b, b_src); + break; + case 2: + vpbroadcastw(b, b_src); + break; + case 1: + vpbroadcastb(b, b_src); + break; + } + for (int i = 0; i < um_vecs; i++) + dot_product(c_regs[i][j], b, a_regs[i]); + } + + if (unroll_k > 1) { + for (int i = 0; i < um_vecs; i++) + vmovups(a_regs[i], ptr[AO + isize * (32 * i + + (h + 1) * 2 * unroll_m - offset_a)]); + } + } + + add(AO, unroll_k * unroll_m * bwidth); + add(BO, unroll_k * unroll_n * bwidth); +} + +// Inner loop. +void jit_avx512_core_gemm_s8u8s32_kern::innerloop(int unroll_m, int unroll_n) +{ + if ((unroll_m > IGEMM_UNROLL_M) || (unroll_n > IGEMM_UNROLL_N) + || (unroll_m < 0) || (unroll_n < 0)) + return; + + int um_vecs = (unroll_m + 15) >> 4; + int stage1 = unroll_n, stage2 = unroll_n; + + Label label_kernel_loop_1, label_k_main_loop_2, label_kernel_loop_2; + Label label_k_main_loop_3, label_kernel_loop_3; + Label label_k_remainder_loop_begin, label_k_rem_4, label_k_rem_2; + Label label_k_rem_1, label_update_begin; + + mov(AO, A); + for (int i = 0; i < um_vecs; i++) + vmovups(a_regs[i], ptr[AO + isize * (32 * i - offset_a)]); + + mov(LoopCount, K); + sar(LoopCount, 4); + jle(label_k_remainder_loop_begin, T_NEAR); + + // Main k loops, broken into three parts to time C prefetching. 
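+    // Roughly: the first pass runs without touching C, the second starts
+    // prefetching the C tile through CO2, and the third keeps prefetching
+    // through the last stage2 iterations so the C lines are resident when
+    // the update code after the k loops runs.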
+ sub(LoopCount, stage1 + stage2); + jle(label_k_main_loop_2, T_NEAR); + + kernel_loop(unroll_m, unroll_n, false); + + L_aligned(label_k_main_loop_2); + lea(CO2, ptr[CO1 + size * (std::min(unroll_m, 16) - 1)]); + add(LoopCount, stage1); + jle(label_k_main_loop_3, T_NEAR); + + kernel_loop(unroll_m, unroll_n, true); + + L_aligned(label_k_main_loop_3); + lea(CO2, ptr[CO1 + size * (std::min(unroll_m, 16) - 1)]); + add(LoopCount, stage2); + jle(label_k_remainder_loop_begin, T_NEAR); + + kernel_loop(unroll_m, unroll_n, true); + + // k remainder handling + L_aligned(label_k_remainder_loop_begin); + mov(LoopCount, K); + test(LoopCount, 8); + je(label_k_rem_4, T_NEAR); + + remainder_kernel(unroll_m, unroll_n, 2, 4); + + L_aligned(label_k_rem_4); + mov(LoopCount, K); + test(LoopCount, 4); + je(label_k_rem_2, T_NEAR); + + remainder_kernel(unroll_m, unroll_n, 1, 4); + + L_aligned(label_k_rem_2); + mov(LoopCount, K); + test(LoopCount, 2); + je(label_k_rem_1, T_NEAR); + + Zmm zero = zmm6; + Zmm tmp = zmm5; + + vpxorq(zero, zero, zero); + for (int i = 0; i < um_vecs; i++) { + Zmm a = a_regs[i]; + vbroadcasti64x4(a, ptr[AO + isize * (16 * i - offset_a)]); + vpunpcklwd(tmp, a, zero); + vpunpckhwd(a, a, zero); + vshufi32x4(a, tmp, a, 0x44); + vshufi32x4(a, a, a, 0xD8); + } + + remainder_kernel(unroll_m, unroll_n, 1, 2); + + L_aligned(label_k_rem_1); + mov(LoopCount, K); + test(LoopCount, 1); + je(label_update_begin, T_NEAR); + + vpxorq(zero, zero, zero); + for (int i = 0; i < um_vecs; i++) { + Zmm a = a_regs[i]; + vbroadcasti32x4(a, ptr[AO + isize * (8 * i - offset_a)]); + vpunpcklbw(tmp, a, zero); + vpunpckhbw(a, a, zero); + vinsertf128(make_ymm(a), make_ymm(tmp), make_xmm(a), 1); + vpunpcklwd(tmp, a, zero); + vpunpckhwd(a, a, zero); + vshufi32x4(a, tmp, a, 0x44); + vshufi32x4(a, a, a, 0xD8); + } + + remainder_kernel(unroll_m, unroll_n, 1, 1); + + // Add offsets and update C. + L_aligned(label_update_begin); + + if (enable_offset_r) { + // Add row offsets. + mov(rax, coffset_ry); + for (int j = 0; j < unroll_n; j++) { + Zmm row_offset = zmm0; + + vbroadcastss(row_offset, ptr[rax + size * j]); + + for (int i = 0; i < um_vecs; i++) + vpaddd(c_regs[i][j], c_regs[i][j], row_offset); + } + add(coffset_ry, size * unroll_n); + } + + if (enable_offset_c) { + // Add column offsets. + mov(rax, coffset_cy); + for (int i = 0; i < um_vecs; i++) { + Zmm col_offset = zmm0; + + c_load(col_offset, ptr[rax + size * 16 * i], unroll_m); + + for (int j = 0; j < unroll_n; j++) + vpaddd(c_regs[i][j], c_regs[i][j], col_offset); + } + } + + Reg64 LDC3 = rax; + lea(LDC3, ptr[LDC + LDC * 2]); + + // C updates. + int c_off_j = 0; + for (int j = 0; j < unroll_n; j++) { + if (j > 0 && (j & 3) == 0) { + lea(CO1, ptr[CO1 + LDC * 4]); + c_off_j += 4; + } + + int jj = j - c_off_j; + + for (int i = 0; i < um_vecs; i++) { + Zmm c = c_regs[i][j]; + Zmm c_old = zmm0; + decltype(LDC * jj) ldc_mult = (jj == 3) ? LDC3 : LDC * jj; + + auto c_mem = ptr[CO1 + ldc_mult + size * 16 * i]; + + if (beta_zero) + c_store(c_mem, c, unroll_m); + else { + c_load(c_old, c_mem, unroll_m); + vpaddd(c_old, c, c_old); + c_store(c_mem, c_old, unroll_m); + } + + vpxorq(c, c, c); + } + } + + lea(CO1, ptr[CO1 + LDC * (unroll_n - c_off_j)]); +} + +// Outer loop. 
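+// generate() below instantiates this loop once with unroll_x = 48 for the
+// main blocks of M and then once per power-of-two remainder (32, 16, ..., 1);
+// the label chain makes each size fall through to the next smaller one until
+// the whole M remainder is covered.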
+void jit_avx512_core_gemm_s8u8s32_kern::outerloop(int unroll_x, int unroll_y, + Label *&cur_outerloop_label) +{ + Label label_m_loop, label_n_loop, label_n_remainder_loops[6]; + + L(*cur_outerloop_label); + cur_outerloop_label++; + if (unroll_x >= IGEMM_UNROLL_M) { + mov(J, M); + cmp(J, unroll_x); + jl(*cur_outerloop_label, T_NEAR); // Jump to next outerloop label. + } else { + test(J, unroll_x); + jle(*cur_outerloop_label, T_NEAR); + } + + L_aligned(label_m_loop); { + mov(CO1, C); + add(C, unroll_x * size); + + mov(BO, B); + + mov(AA, K); + imul(AA, AA, unroll_x * isize); + lea(AA, ptr[A + AA + isize * prefetch_size_a]); + + if (enable_offset_c) { + mov(rax, coffset_cx); + mov(coffset_cy, rax); + add(rax, unroll_x * size); + mov(coffset_cx, rax); + } + + if (enable_offset_r) { + mov(rax, coffset_rx); + mov(coffset_ry, rax); + } + + mov(I, N); + cmp(I, unroll_y); + jl(label_n_remainder_loops[0], T_NEAR); + + L_aligned(label_n_loop); { + innerloop(unroll_x, unroll_y); + sub(I, unroll_y); + cmp(I, unroll_y); + jge(label_n_loop, T_NEAR); + } + + align(16); + + int label_idx = 0; + for (int uy = 16; uy > 0; uy >>= 1) { + L(label_n_remainder_loops[label_idx++]); + if (unroll_y > uy) { + test(I, uy); + jle(label_n_remainder_loops[label_idx], T_NEAR); + + innerloop(unroll_x, uy); + align(16); + } + } + L(label_n_remainder_loops[label_idx]); + + mov(A, AO); + if (unroll_x >= IGEMM_UNROLL_M) { + sub(J, unroll_x); + cmp(J, unroll_x); + jge(label_m_loop); + } + } + + align(16); +} + +void jit_avx512_core_gemm_s8u8s32_kern::generate() +{ + // Prologue + preamble(); + sub(rsp, stack_alloc_size); + + if (is_windows) { + mov(A, arg_a); + mov(B, arg_b); + } + + mov(C, arg_c); + mov(LDC, arg_ldc); + + sub(A, -offset_a * isize); + sub(B, -offset_b * isize); + + mov(M, qword[M]); + mov(N, qword[N]); + mov(K, qword[K]); + + lea(LDC, ptr[LDC * size]); + + if (enable_offset_c) { + mov(rax, arg_coffset_c); + mov(coffset_cx, rax); + } + if (enable_offset_r) { + mov(rax, arg_coffset_r); + mov(coffset_rx, rax); + } + + for (int i = 0; i < (max_unroll_m >> 4); i++) { + for (int j = 0; j < max_unroll_n; j++) { + auto &c = c_regs[i][j]; + vpxorq(c, c, c); + } + } + + if (!vnni) { + mov(rax, 1); + movq(make_xmm(ones), rax); + vpbroadcastw(ones, make_xmm(ones)); + } + + Label outerloop_labels[8]; + Label *cur_outerloop_label = &outerloop_labels[0]; + + // Main m loop. + outerloop(IGEMM_UNROLL_M, IGEMM_UNROLL_N, cur_outerloop_label); + + // m remainder loops. + for (int um = 32; um > 0; um >>= 1) + if (IGEMM_UNROLL_M > um) + outerloop(um, IGEMM_UNROLL_N, cur_outerloop_label); + + L(*cur_outerloop_label); + + // Epilogue. + add(rsp, stack_alloc_size); + postamble(); +} + + +jit_avx512_core_gemm_s8u8s32_kern::jit_avx512_core_gemm_s8u8s32_kern(bool + beta_zero_, bool enable_offset_c_, bool enable_offset_r_) : + jit_generator(nullptr, 100000), arg_a(0), arg_b(0), arg_c(0), arg_ldc(0), + arg_coffset_c(0), arg_coffset_r(0), coffset_cx(0), coffset_cy(0), + coffset_rx(0), coffset_ry(0) +{ + beta_zero = beta_zero_; + enable_offset_c = enable_offset_c_; + enable_offset_r = enable_offset_r_; + vnni = mayiuse(avx512_core_vnni); + + // Assign integer registers + M = is_windows ? rcx : rdi; + N = is_windows ? rdx : rsi; + K = is_windows ? r8 : rdx; + A = is_windows ? rsi : r8; + B = r9; + C = r10; + LDC = r11; + I = r12; + J = r13; + LoopCount = rax; + AO = r14; + BO = r15; + CO1 = rbx; + CO2 = rbp; + AA = is_windows ? 
rdi : rcx; + + // Assign vector registers + dp_scratch = zmm6; + ones = zmm7; + for (int i = 0; i < (max_unroll_m >> 4); i++) + a_regs[i] = Zmm(i); + b_regs[0] = zmm4; + b_regs[1] = zmm5; + + int rn = 0; + for (int i = 0; i < (max_unroll_m >> 4); i++) + for (int j = 0; j < max_unroll_n; j++) + c_regs[i][j] = Zmm(8 + rn++); + + // Assign stack variables. + stack_alloc_size = 32; + auto args_offset = stack_alloc_size + get_size_of_abi_save_regs() + + 8 + (is_windows ? 48 : 0); + + arg_a = ptr[rsp + (args_offset - 16)]; + arg_b = ptr[rsp + (args_offset - 8)]; + arg_c = ptr[rsp + (args_offset + 0)]; + arg_ldc = ptr[rsp + (args_offset + 8)]; + arg_coffset_c = ptr[rsp + (args_offset + 16)]; + arg_coffset_r = ptr[rsp + (args_offset + 24)]; + + coffset_cx = qword[rsp + 0]; + coffset_cy = qword[rsp + 8]; + coffset_rx = qword[rsp + 16]; + coffset_ry = qword[rsp + 24]; + + generate(); +} + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8u8s32_kern.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8u8s32_kern.hpp new file mode 100644 index 0000000..e8efcc1 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemm_s8u8s32_kern.hpp @@ -0,0 +1,101 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef IGEMM_KERNEL_GENERATOR_HPP +#define IGEMM_KERNEL_GENERATOR_HPP + +#include "jit_generator.hpp" + + +namespace mkldnn { +namespace impl { +namespace cpu { + +class jit_avx512_core_gemm_s8u8s32_kern : public jit_generator { +public: + jit_avx512_core_gemm_s8u8s32_kern(bool beta_zero_, bool enable_offset_c_, + bool enable_offset_r_); + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_gemm_s8u8s32_kern); + +protected: + bool beta_zero; + bool enable_offset_c, enable_offset_r; + bool vnni; + + void prefetch_a(const Xbyak::Address &src) { + prefetcht0(src); + } + void prefetch_b(const Xbyak::Address &src) { + prefetcht0(src); + } + void prefetch_c(const Xbyak::Address &src) { + prefetchw(src); + } + void prefetch_x(const Xbyak::Address &src) { + prefetcht0(src); + } + + void c_load(const Xbyak::Xmm &dst, const Xbyak::Address &src, int nelems); + void c_store(const Xbyak::Address &dst, const Xbyak::Xmm &src, int nelems); + + void dot_product(const Xbyak::Xmm &dst, const Xbyak::Xmm &src1, + const Xbyak::Xmm &src2); + void kernel_loop(int unroll_m, int unroll_n, bool cfetch); + void remainder_kernel(int unroll_m, int unroll_n, int unroll_k, int bwidth); + void innerloop(int unroll_m, int unroll_n); + void outerloop(int unroll_x, int unroll_y, Xbyak::Label *&outerloop_label); + + void generate(); + + +private: + static const int IGEMM_UNROLL_M = 48; + static const int IGEMM_UNROLL_N = 8; + + static const int isize = 2; + static const int size = 4; + + // Prefetch configuration + static const int prefetch_size_a = 32 * 5; + static const int prefetch_size_b = 32 * 4; + + static const int offset_a = 256, offset_b = 256; + static const int max_unroll_m = 48, max_unroll_n = 8; + + // Integer register assignments + Xbyak::Reg64 M, N, K, A, B, C, LDC, I, J, LoopCount; + Xbyak::Reg64 AO, BO, CO1, CO2, AA; + + // Vector register assignments + Xbyak::Zmm dp_scratch, ones, a_regs[max_unroll_m >> 4], b_regs[2]; + Xbyak::Zmm c_regs[max_unroll_m >> 4][max_unroll_n]; + + // Stack variable assignments + int stack_alloc_size; + Xbyak::Address arg_a, arg_b, arg_c, arg_ldc, arg_coffset_c, arg_coffset_r; + Xbyak::Address coffset_cx, coffset_cy, coffset_rx, coffset_ry; + + void L_aligned(Xbyak::Label &label, int alignment = 16) { + align(alignment); + L(label); + } +}; + +} +} +} + +#endif /* header guard */ diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemv_s8u8s32.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemv_s8u8s32.cpp new file mode 100644 index 0000000..4f0b10d --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_gemv_s8u8s32.cpp @@ -0,0 +1,290 @@ +/******************************************************************************* + * Copyright 2019 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *******************************************************************************/ + +#include "gemv.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +int gemm_s8u8s32_jump_to_gemv_s8u8s32(blas_t *arg) { + + blas_t arg_gemv = *arg; + + if ((arg -> offsetc == FIX_OFFSET) && // Fix offset + (arg -> ao == 0) && + (arg -> bo == 0) && + (arg -> co[0] == 0) && + (*(arg -> alpha) == 1.0f) && + ((*(arg -> beta) == 1.0f) || *(arg -> beta) == 0.0f)) { + + if (arg -> n == 1) { + + if (arg -> transa == 1) { // A transpose + arg_gemv.n = arg -> k; + arg_gemv.ldc = 1; + arg_gemv.swap = 0; + if (arg -> transb == 0) { // B non transpose + arg_gemv.ldb = 1; + } + // B transpose arg_gemv.ldb = arg -> ldb + gemv_threading_driver(&arg_gemv); + return 1; + } + } + + if (arg -> m == 1) { + + if (arg -> transb == 0) { // B non transpose + arg_gemv.transa = 1; + arg_gemv.m = arg -> n; + arg_gemv.n = arg -> k; + arg_gemv.a = (int8_t *) arg -> b; + arg_gemv.lda = arg -> ldb; + arg_gemv.b = (uint8_t *) arg -> a; + arg_gemv.swap = 1; + if (arg -> transa == 0) { // A non transpose + arg_gemv.ldb = arg -> lda; + } + else { // A transpose + arg_gemv.ldb = 1; + } + gemv_threading_driver(&arg_gemv); + return 1; + } + } + } + + return 0; +} + + +int gemv_kernel_driver(blas_t *arg) { + + dim_t m = arg -> m; + dim_t n = arg -> n; + uint8_t *a = (uint8_t *) arg -> a; + dim_t lda = arg -> lda; + int8_t *b = (int8_t *) arg -> b; + float beta = *(arg -> beta); + + if (arg -> swap) { + arg -> gemv_u8s8s32_kernel(m, n, 1.0f, a, lda, b, beta, arg -> c); + } + else { + arg -> gemv_s8u8s32_kernel(arg -> m, arg -> n, 1.0f, arg -> a, + arg -> lda, arg -> b, *(arg -> beta), arg -> c); + } + + return 0; +} + +int gemv_threading_driver(blas_t *arg) { + + dim_t nthr_m, nthr_n = 1; + dim_t MB, NB, UM = 16, UN = 64; + dim_t BLOCKM = 192, BLOCKN = 3072; + int status; + dim_t i; + + dim_t nthr = (mkldnn_in_parallel()) ? 1 : mkldnn_get_max_threads(); + + uint8_t *new_x = NULL; + int32_t *tmp_y = NULL, *new_y = NULL; + + dim_t m = arg -> m, n = arg -> n; + + blas_t arg_seq = *arg; + float zero = 0.0f; + + nthr_m = std::min(std::max(m / BLOCKM, (dim_t) 1), nthr); + MB = m / nthr_m; + MB = (((MB / UM) * UM) == MB) ? MB : (MB / UM) * UM + UM; + nthr_m = (((m / MB) * MB) == m) ? m / MB : m / MB + 1; + nthr_m = std::min(std::max(nthr_m, (dim_t) 1), nthr); + + while ((nthr_m * (nthr_n + 1) <= nthr) && ((n / (nthr_n + 1)) >= BLOCKN)) { + nthr_n++; + } + + NB = n / nthr_n; + NB = (((NB / UN) * UN) == NB) ? NB : (NB / UN) * UN + UN; + nthr_n = (((n / NB) * NB) == n) ? 
n / NB : n / NB + 1; + nthr_n = std::min(std::max(nthr_n, (dim_t) 1), nthr / nthr_m); + + nthr = nthr_m * nthr_n; + + if (arg -> ldb != 1) { + new_x = (uint8_t *)malloc(n, 64); + if (new_x == NULL) + return 1; + for (i = 0; i < n; i++) { + new_x[i] = (arg -> b)[i * arg -> ldb]; + } + arg_seq.b = new_x; + arg_seq.ldb = 1; + } + else new_x = (uint8_t *) arg -> b; + + if (arg -> ldc != 1) { + new_y = (int32_t *) malloc(nthr_m * PADD_BYTESIZE_ONPAGE(MB, sizeof(int32_t)), 64); + if (new_y == NULL) { + if (arg -> ldb != 1) { + free(new_x); + } + return 1; + } + } + + // GEMV computation + if (nthr == 1) { + + if (arg -> ldc != 1) { + if (*(arg -> beta) != 0.0f) { + for (i = 0; i < m; i++) { + new_y[i] = arg -> c[i * arg -> ldc]; + } + } + } + + status = gemv_kernel_driver(&arg_seq); + + if (arg -> ldc != 1) { + for (i = 0; i < m; i++) { + arg -> c[i * arg -> ldc] = new_y[i]; + } + } + + if (arg -> ldb != 1) { + free(new_x); + } + if (arg -> ldc != 1) { + free(new_y); + } + return status; + } + + if (nthr_n > 1) { + tmp_y = (int32_t *) malloc((nthr_n - 1) * PADD_BYTESIZE_ONPAGE(m, sizeof(int32_t)), PAGESIZE); + if (tmp_y == NULL) { + if (arg -> ldb != 1) { + free(new_x); + } + return 1; + } + } + + parallel_nd((int) nthr, [&](const dim_t ithr) { + + dim_t m_from, m_to, myM; + dim_t n_from, n_to, myN; + + dim_t n_id, m_id; + dim_t loc_incy = 1; + int32_t *loc_y; + + blas_t arg_loc = arg_seq; + int j; + + m_id = ithr / nthr_n; + n_id = ithr % nthr_n; + + m_from = MB * m_id; + m_to = MB * (m_id + 1); + if ((m_to > m) || (m_id == nthr_m - 1)) + m_to = m; + + myM = m_to - m_from; + + n_from = NB * n_id; + n_to = NB * (n_id + 1); + if ((n_to > n) || (n_id == nthr_n - 1)) + n_to = n; + + myN = n_to - n_from; + + if (n_id != 0) { + arg_loc.beta = &zero; + loc_y = tmp_y + (NEXT_THR_STRIDE(m, sizeof(int32_t))) * (n_id - 1) + m_from; + } + else { + if (arg -> ldc == 1) { + loc_y = arg_seq.c + m_from; + } + else { + // need to copy the block of c in new_y + loc_y = new_y + m_id * NEXT_THR_STRIDE(MB, sizeof(int32_t)); + if (*(arg -> beta) != 0.0f) { + for (j = 0; j < myM; j++) { + loc_y[j] = arg -> c[(m_from + j) * arg -> ldc]; + } + } + } + } + + arg_loc.m = myM; + arg_loc.n = myN; + arg_loc.a = arg_seq.a + m_from * arg_seq.lda + n_from; + arg_loc.b = arg_seq.b + n_from; + arg_loc.c = loc_y; + arg_loc.ldc = loc_incy; + + gemv_kernel_driver(&arg_loc); + + if ((n_id == 0) && (arg -> ldc != 1)) { + for (j = 0; j < myM; j++) { + arg -> c[(m_from + j) * arg -> ldc] = loc_y[j]; + } + } + + }); + + if (nthr_n > 1) { + parallel_nd((int) nthr_m, [&](const dim_t ithr) { + + dim_t j, j_from, j_to, ii; + int32_t acc; + + j_from = MB * ithr; + j_to = MB * (ithr + 1); + if ((j_to > m) || (ithr == nthr - 1)) + j_to = m; + + for (j = j_from; j < j_to; j++) { + acc = 0; + for (ii = 0; ii < nthr_n - 1; ii++) { + acc += tmp_y[ii * NEXT_THR_STRIDE(m, sizeof(int32_t)) + j]; + } + (arg -> c)[j * arg -> ldc] += acc; + } + }); + free(tmp_y); + } + + if (arg -> ldb != 1) { + free(new_x); + } + + if (arg -> ldc != 1) { + free(new_y); + } + + return 0; +} + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_kernel_gemv_s8u8s32_kern.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_kernel_gemv_s8u8s32_kern.cpp new file mode 100644 index 0000000..c57a8c1 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_kernel_gemv_s8u8s32_kern.cpp @@ -0,0 +1,411 @@ 
+/******************************************************************************* + * Copyright 2019 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *******************************************************************************/ + +#include "jit_avx512_core_kernel_gemv_s8u8s32_kern.hpp" + +#ifdef _WIN32 +#define is_windows 1 +#else +#define is_windows 0 +#endif + +namespace mkldnn { +namespace impl { +namespace cpu { + +void jit_avx512_core_gemv_s8u8s32_kern::vnni(Xbyak::Zmm acc, Xbyak::Zmm b, + Xbyak::Zmm a, Xbyak::Zmm tmp, + Xbyak::Zmm one, bool swap, + int use_vnni) { + + if (use_vnni) { + if (swap) + vpdpbusd(acc, a, b); + else + vpdpbusd(acc, b, a); + } + + else { + if (swap) + vpmaddubsw(tmp, a, b); + else + vpmaddubsw(tmp, b, a); + vpmaddwd(tmp, tmp, one); + vpaddd(acc, tmp, acc); + } + +} + +void jit_avx512_core_gemv_s8u8s32_kern::n_loop_body(int start_a_idx, int start_acc_idx, + int b_idx, int nreg_acc, + Xbyak::Reg64 A, Xbyak::Reg64 lda, + Xbyak::Reg64 X, Xbyak::Zmm tmp, + Xbyak::Zmm one, bool swap, int use_vnni, + int use_mask, Xbyak::Opmask mask_n) { + + int i; + int nreg_A = nreg_acc / 2 + (nreg_acc % 2); + + // load X + j + if (use_mask) + vmovdqu8(Xbyak::Zmm(b_idx) | mask_n | T_z, ptr[X]); + else + vmovdqu8(Xbyak::Zmm(b_idx), ptr[X]); + + xor_(r14, r14); + // load values of A + for (i = 0; i < nreg_A; i++) { + if (use_mask) + vmovdqu8(Xbyak::Zmm(start_a_idx + i) | mask_n | T_z, ptr[A + r14]); + else + vmovdqu8(Xbyak::Zmm(start_a_idx + i), ptr[A + r14]); + add(r14, lda); + } + + for (i = 0; i < nreg_A; i++) { + // vnni (acc, b, a, tmp, one, swap, use_vnni) + vnni(Xbyak::Zmm(start_acc_idx + i), Xbyak::Zmm(b_idx), + Xbyak::Zmm(start_a_idx + i), tmp, one, swap, use_vnni); + } + + for (i = 0; i < nreg_A - (nreg_acc % 2); i++) { + if (use_mask) + vmovdqu8(Xbyak::Zmm(start_a_idx + i) | mask_n | T_z, ptr[A + r14]); + else + vmovdqu8(Xbyak::Zmm(start_a_idx + i), ptr[A + r14]); + add(r14, lda); + } + + for (i = 0; i < nreg_A - (nreg_acc % 2); i++) { + vnni(Xbyak::Zmm(start_acc_idx + i + nreg_A), Xbyak::Zmm(b_idx), + Xbyak::Zmm(start_a_idx + i), tmp, one, swap, use_vnni); + } + +} + +void jit_avx512_core_gemv_s8u8s32_kern::shuffle_and_add(Xbyak::Zmm dest, Xbyak::Zmm A, + Xbyak::Zmm B, Xbyak::Zmm C, + Xbyak::Zmm D) { + + vshufi32x4(dest, A, C, 0x44); + vshufi32x4(A, A, C, 0xEE); + vpaddd(C, dest, A); // C = A0 + A2|A1 + A3|C0 + C2|C1 + C3 + + vshufi32x4(dest, B, D, 0x44); + vshufi32x4(B, B, D, 0xEE); + vpaddd(D, dest, B); // D = B0 + B2|B1 + B3|D0 + D2|D1 + D3 + + vshufi32x4(A, C, D, 0x88); + vshufi32x4(B, C, D, 0xDD); + vpaddd(dest, A, B); // dest = SAi|SBi|SCi|SDi + +} + +void jit_avx512_core_gemv_s8u8s32_kern::update_c(int nreg_acc, Xbyak::Reg64 Y, + int start_a_idx, int start_acc_idx, + Xbyak::Xmm beta, int use_mask, + Xbyak::Opmask mask_m) { + + int l, i, k, j, last_it; + Xbyak::Label store_label; + + l = 0; + for (k = 0; k < nreg_acc; k += 8) { + for (i = 0, j = k; i < 8; i += 4, j += 2) { + if (j < nreg_acc) { + // shuffle per block of 4 registers + 
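+                // (each accumulator Zmm holds 16 dword partial sums; the
+                // vshufi32x4/vpaddd sequence in shuffle_and_add folds the
+                // 128-bit lanes of four accumulators into one register, and
+                // the vphaddd pair that follows finishes the horizontal
+                // reduction into one dword sum per accumulator)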
shuffle_and_add(Xbyak::Zmm(start_a_idx + l), // dest + Xbyak::Zmm(start_acc_idx + j), // A = acc0 + Xbyak::Zmm(start_acc_idx + 1 + j), // B = acc1 + Xbyak::Zmm(start_acc_idx + 4 + j), // C = acc4 + Xbyak::Zmm(start_acc_idx + 5 + j)); // D = acc5 + + // extract low and high from dest and hadd + vextracti32x8(Xbyak::Ymm(start_a_idx + l + 1), Xbyak::Zmm(start_a_idx + l), 0); + vextracti32x8(Xbyak::Ymm(start_a_idx + l + 2), Xbyak::Zmm(start_a_idx + l), 1); + vphaddd(Xbyak::Ymm(start_a_idx + l), + Xbyak::Ymm(start_a_idx + l + 1), + Xbyak::Ymm(start_a_idx + l + 2)); + } + l++; + } + + vphaddd(Xbyak::Ymm(start_a_idx + l), + Xbyak::Ymm(start_a_idx + l - 2), + Xbyak::Ymm(start_a_idx + l - 1)); + + l++; + } + + // eventually add with C and store new value + vxorps(Xbyak::Ymm(start_a_idx), + Xbyak::Ymm(start_a_idx), + Xbyak::Ymm(start_a_idx)); + vucomiss(beta, Xbyak::Ymm(start_a_idx)); + je(store_label, T_NEAR); + + // beta = 1 + for (k = 0, l = 2; k < nreg_acc; k += 8, l += 3) { + // load Y and add + last_it = (k + 8) > nreg_acc; + if (use_mask && last_it) + vmovdqu32(Xbyak::Ymm(start_a_idx + k / 8) | mask_m | T_z, ptr[Y + (k / 8) * 32]); + else + vmovdqu32(Xbyak::Ymm(start_a_idx + k / 8), ptr[Y + (k / 8) * 32]); + + vpaddd(Xbyak::Ymm(start_a_idx + l), + Xbyak::Ymm(start_a_idx + l), + Xbyak::Ymm(start_a_idx + k / 8)); + } + + // store + aligned_label(store_label); + for (k = 0, l = 2; k < nreg_acc; k += 8, l += 3) { + last_it = (k + 8) > nreg_acc; + if (use_mask && last_it) + vmovdqu32(ptr[Y + (k / 8) * 32], Xbyak::Ymm(start_a_idx + l) | mask_m); + else + vmovdqu32(ptr[Y + (k / 8) * 32], Xbyak::Ymm(start_a_idx + l)); + } + +} + +template +T jit_avx512_core_gemv_s8u8s32_kern::generate(int use_vnni) { + + Xbyak::Opmask mask_n = k1, mask_m = k2; + Xbyak::Label one_label, m_tail_label, m_loop_label, n_loop_label; + Xbyak::Label n_tail_label, update_c_label, end_label; + constexpr unsigned int n_labels = (1 << unroll_m) - 1; + Xbyak::Label m_tail_label_case[n_labels]; + Xbyak::Label n_loop_label_case[n_labels]; + Xbyak::Label n_tail_label_case[n_labels]; + Xbyak::Label update_c_label_case[n_labels]; + + int i, ii; + + Xbyak::Zmm one, tmp; + Xbyak::Reg64 n = abi_param2, m = abi_param1; + Xbyak::Reg64 A = is_windows ? abi_param4 : abi_param3; + Xbyak::Reg64 lda = is_windows ? abi_param3 : abi_param4; + Xbyak::Reg64 X = is_windows ? rdi : r8; + Xbyak::Xmm beta = xmm1; + Xbyak::Reg64 Y = is_windows ? 
rsi : r9; + + bool swap = !std::is_same::value; + + // Windows: read on the stack lda, X, beta, Y + + int zmm_idx = 1; + int nreg_acc = 1 << unroll_m; + int nreg_A = 1 << (unroll_m - 1); + int nreg_A_acc = nreg_acc + nreg_A; + + if (!use_vnni) { + // set a zmm register to one + tmp = Xbyak::Zmm(0); + one = Xbyak::Zmm(zmm_idx + 1); + zmm_idx += 2; // one + tmp + } + else { + beta = xmm0; + } + + preamble(); + + if (is_windows) { + mov(lda, ptr[rsp + get_size_of_abi_save_regs() + 40]); + mov(X, ptr[rsp + get_size_of_abi_save_regs() + 48]); + movss(beta, ptr[rsp + get_size_of_abi_save_regs() + 56]); + mov(Y, ptr[rsp + get_size_of_abi_save_regs() + 64]); + } + + if (use_vnni && !is_windows) { + movaps(beta, xmm1); + } + + mov(rax, (1 << unroll_n) - 1); + kmovq(k3, rax); + + and_(rax, n); // rax contains n & ((1 << unroll_n) - 1) + mov(rbx, 1); + shlx(rbx, rbx, rax); + sub(rbx, 1); + kmovq(mask_n, rbx); + // mask_n set (AVX512 only), can use rax and rbx again + + // set mask_m for update of the C matrix + // load/store on the C matrix use Ymm so tail according to Ymm size + mov(rax, 7); // 8 * 32 = 256 Ymm size + and_(rax, m); // rax contains m & 7 + mov(rbx, 1); + shlx(rbx, rbx, rax); + sub(rbx, 1); + kmovq(mask_m, rbx); + // mask_m set (AVX512 only), can use rax and rbx again + + // setup register of ones when VNNI instructions not available + if (!use_vnni) { + vmovdqu16(one, ptr[rip + one_label]); + } + + // M loop + // base pointer for A rax contains a + i * lda + // Loop stop when rax >= a + (m & mask_um) * lda = rbx + // loop increment r10 = um * lda + // rbp = Y + i + mov(rax, A); // i = 0 + mov(rbx, m); + and_(rbx, mask_um); + imul(rbx, lda); + add(rbx, A); + mov(r10, lda); + sal(r10, unroll_m); + mov(rbp, Y); + + // N loop + // base pointer for X r11 contains x + j + // Loop stop when r11 >= x + n & mask_un = r12 + // loop increment un + // r13 = rax + j = A + i * lda + j + mov(r12, n); + and_(r12, mask_un); + add(r12, X); + + // M loop + aligned_label(m_loop_label); + cmp(rax, rbx); + jge(m_tail_label, T_NEAR); + + // enter M loop + for(i = 0; i < nreg_acc; i++) { + vpxorq(Xbyak::Zmm(i + zmm_idx + nreg_A), + Xbyak::Zmm(i + zmm_idx + nreg_A), + Xbyak::Zmm(i + zmm_idx + nreg_A)); + } + + // N loop + mov(r11, X); // j = 0 + mov(r13, rax); + aligned_label(n_loop_label); + cmp(r11, r12); + jge(n_tail_label, T_NEAR); + + // enter N loop + + n_loop_body(zmm_idx, zmm_idx + nreg_A, zmm_idx + nreg_A_acc, nreg_acc, + r13, lda, r11, tmp, one, swap, use_vnni, 0, mask_n); + + // increment rax with un + add(r11, 1 << unroll_n); + add(r13, 1 << unroll_n); + jmp(n_loop_label, T_NEAR); + // end N loop + + // N tail + aligned_label(n_tail_label); + + ktestq(mask_n, k3); + je(update_c_label, T_NEAR); + n_loop_body(zmm_idx, zmm_idx + nreg_A, zmm_idx + nreg_A_acc, nreg_acc, + r13, lda, r11, tmp, one, swap, use_vnni, 1, mask_n); + + // update C matrix + aligned_label(update_c_label); + + update_c(nreg_acc, rbp, zmm_idx, zmm_idx + nreg_A, beta, 0, mask_m); + + // increment rax with um * lda + add(rax, r10); + add(rbp, 1 << (unroll_m + 2)); + jmp(m_loop_label, T_NEAR); + // end M loop + + // M tail + aligned_label(m_tail_label); + + // r10 will contain m_tail = m % unroll_m = m & (1 << unroll_m) - 1 + mov(r10, m); + and_(r10, (1 << unroll_m) - 1); + for (ii = 1; ii < 1 << unroll_m; ii++) { + aligned_label(m_tail_label_case[ii-1]); + cmp(r10, ii); + if (ii == (1 << unroll_m) - 1) + jne(end_label, T_NEAR); + else + jne(m_tail_label_case[ii], T_NEAR); + + // m_tail = i, use i accumulators + + for(i = 0; i < ii; 
i++) {
+            vpxorq(Xbyak::Zmm(i + zmm_idx + nreg_A),
+                   Xbyak::Zmm(i + zmm_idx + nreg_A),
+                   Xbyak::Zmm(i + zmm_idx + nreg_A));
+        }
+
+        // N loop
+        mov(r11, X); // j = 0
+        mov(r13, rax);
+        aligned_label(n_loop_label_case[ii - 1]);
+        cmp(r11, r12);
+        jge(n_tail_label_case[ii - 1], T_NEAR);
+
+        n_loop_body(zmm_idx, zmm_idx + nreg_A, zmm_idx + nreg_A_acc, ii, r13,
+                    lda, r11, tmp, one, swap, use_vnni, 0, mask_n);
+
+        // increment rax with un
+        add(r11, 1 << unroll_n);
+        add(r13, 1 << unroll_n);
+        jmp(n_loop_label_case[ii - 1], T_NEAR);
+        // end N loop
+
+        // N tail
+        aligned_label(n_tail_label_case[ii - 1]);
+        ktestq(mask_n, k3);
+        je(update_c_label_case[ii - 1], T_NEAR);
+        n_loop_body(zmm_idx, zmm_idx + nreg_A, zmm_idx + nreg_A_acc, ii, r13,
+                    lda, r11, tmp, one, swap, use_vnni, 1, mask_n);
+
+        // update C matrix
+        aligned_label(update_c_label_case[ii - 1]);
+        update_c(ii, rbp, zmm_idx, zmm_idx + nreg_A, beta, 1, mask_m);
+
+        if (ii < ((1 << unroll_m) - 1))
+            jmp(end_label, T_NEAR);
+    }
+
+    aligned_label(end_label);
+
+    postamble();
+
+    if (!use_vnni) {
+        aligned_label(one_label);
+        for (i = 0; i < size_vec_reg/8; i++)
+            dq(0x0001000100010001);
+    }
+
+    return (T) getCode();
+}
+
+template jit_avx512_core_gemv_s8u8s32_kern::gemv_s8u8s32_kernel_t
+jit_avx512_core_gemv_s8u8s32_kern::generate<
+        jit_avx512_core_gemv_s8u8s32_kern::gemv_s8u8s32_kernel_t>(int);
+
+template jit_avx512_core_gemv_s8u8s32_kern::gemv_u8s8s32_kernel_t
+jit_avx512_core_gemv_s8u8s32_kern::generate<
+        jit_avx512_core_gemv_s8u8s32_kern::gemv_u8s8s32_kernel_t>(int);
+
+}
+}
+}
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_kernel_gemv_s8u8s32_kern.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_kernel_gemv_s8u8s32_kern.hpp
new file mode 100644
index 0000000..9ea23a5
--- /dev/null
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_kernel_gemv_s8u8s32_kern.hpp
@@ -0,0 +1,64 @@
+/*******************************************************************************
+ * Copyright 2019 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *******************************************************************************/
+
+#include "jit_generator.hpp"
+#include "common.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+class jit_avx512_core_gemv_s8u8s32_kern : jit_generator {
+
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_gemv_s8u8s32_kern);
+
+    // assumes unroll_{m,n} are a power of 2
+    static constexpr unsigned int unroll_m = 4; // real unrolling factor is 2^unroll_m
+    const int mask_um = 0xFFFFFFF0;
+    static constexpr unsigned int unroll_n = 6; // real unrolling factor is 2^unroll_n
+    const int mask_un = 0xFFFFFFC0;
+    const int size_vec_reg = 64; // bytes
+
+    void aligned_label(Xbyak::Label &label, int alignment = 16) {
+        align(alignment);
+        L(label);
+    }
+
+    void vnni(Xbyak::Zmm, Xbyak::Zmm, Xbyak::Zmm, Xbyak::Zmm, Xbyak::Zmm, bool, int);
+    void n_loop_body(int, int, int, int, Xbyak::Reg64, Xbyak::Reg64,
+                     Xbyak::Reg64, Xbyak::Zmm, Xbyak::Zmm, bool, int, int, Xbyak::Opmask);
+    void shuffle_and_add(Xbyak::Zmm, Xbyak::Zmm, Xbyak::Zmm, Xbyak::Zmm, Xbyak::Zmm);
+    void update_c(int, Xbyak::Reg64, int, int, Xbyak::Xmm, int, Xbyak::Opmask);
+
+public:
+    jit_avx512_core_gemv_s8u8s32_kern() : jit_generator(nullptr, GEMM_CODE_SIZE) {};
+
+    // m, n, alpha, a, lda, x, beta, y
+    typedef void (*gemv_s8u8s32_kernel_t)(const dim_t, const dim_t, const float,
+                                          const int8_t*, const dim_t, const uint8_t*,
+                                          const float, int32_t*);
+    typedef void (*gemv_u8s8s32_kernel_t)(const dim_t, const dim_t, const float,
+                                          const uint8_t*, const dim_t, const int8_t*,
+                                          const float, int32_t*);
+
+    template <typename T>
+    T generate(int use_vnni);
+
+};
+
+}
+}
+}
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_an_kern.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_an_kern.cpp
new file mode 100644
index 0000000..544cd2f
--- /dev/null
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_an_kern.cpp
@@ -0,0 +1,819 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/ + +#include "jit_generator.hpp" +#include "common.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +jit_avx512_core_u8_copy_an_kern::jit_avx512_core_u8_copy_an_kern(): jit_generator(nullptr, GEMM_CODE_SIZE) +{ + +#ifndef _WIN32 +#define M rdi +#define N rsi +#define A rdx +#define LDA rcx +#define ALPHA r8 +#define B r9 + +#define I rax +#define A1 r10 +#define A2 r8 +#define LDA3 r11 + +#else + +#define M rcx +#define N rdx +#define A r8 +#define LDA r9 +#define ALPHA rax +#define B rdi + +#define I rax +#define A1 rsi +#define A2 r10 +#define LDA3 r11 + +#define ARG_ALPHA 40+stacksize+rsp +#define ARG_B 48+stacksize+rsp + +#endif + +inLocalLabel(); +{ + +Xbyak::Label l170; +Xbyak::Label l1f0; +Xbyak::Label l20; +Xbyak::Label l224; +Xbyak::Label l234; +Xbyak::Label l240; +Xbyak::Label l254; +Xbyak::Label l32c; +Xbyak::Label l34; +Xbyak::Label l388; +Xbyak::Label l3b0; +Xbyak::Label l3c0; +Xbyak::Label l3cc; +Xbyak::Label l3dc; +Xbyak::Label l454; +Xbyak::Label l48c; +Xbyak::Label l4a8; +Xbyak::Label l4b8; +Xbyak::Label l4c4; +Xbyak::Label l4d8; +Xbyak::Label l570; +Xbyak::Label l5c4; +Xbyak::Label l5f0; +Xbyak::Label l60c; +Xbyak::Label l61c; +Xbyak::Label l628; +Xbyak::Label l638; +Xbyak::Label l6b0; +Xbyak::Label l6f4; +Xbyak::Label l720; +Xbyak::Label l73c; +Xbyak::Label l74c; +Xbyak::Label l758; +Xbyak::Label l76c; +Xbyak::Label l804; +Xbyak::Label l858; +Xbyak::Label l88c; +Xbyak::Label l8a4; +Xbyak::Label l8b2; +Xbyak::Label l8bc; +Xbyak::Label l8cc; +Xbyak::Label l944; +Xbyak::Label l98c; +Xbyak::Label l9b0; +Xbyak::Label l9c8; +Xbyak::Label l9d8; + + preamble(); +#ifdef _WIN32 + auto stacksize = get_size_of_abi_save_regs(); + mov(ALPHA, ptr[ARG_ALPHA]); + mov(B, ptr[ARG_B]); +#endif + + mov(M, qword[M]); + mov(N, qword[N]); + mov(LDA, qword[LDA]); + lea(LDA3, ptr[LDA+LDA*2]); + sub(A, -128); + sub(B, -128); + cmp(N, 0x30); + jl(l234, T_NEAR); + align(4); + +L(l20); + mov(A1, A); + add(A, 0x30); + mov(I, M); + sar(I, 0x2); + jle(l170, T_NEAR); + align(4); + +L(l34); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1+LDA*1-0x80]); + movdqu(xmm2, xword[A1+LDA*2-0x80]); + movdqu(xmm3, xword[A1+LDA3*1-0x80]); + movdqa(xmm4, xmm0); + punpcklbw(xmm0, xmm1); + punpckhbw(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpcklbw(xmm2, xmm3); + punpckhbw(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm1, xmm2); + movdqa(xmm2, xmm4); + punpcklwd(xmm4, xmm5); + punpckhwd(xmm2, xmm5); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm1); + movdqu(xword[B-0x60], xmm4); + movdqu(xword[B-0x50], xmm2); + movdqu(xmm0, xword[A1-0x70]); + movdqu(xmm1, xword[A1+LDA*1-0x70]); + movdqu(xmm2, xword[A1+LDA*2-0x70]); + movdqu(xmm3, xword[A1+LDA3*1-0x70]); + movdqa(xmm4, xmm0); + punpcklbw(xmm0, xmm1); + punpckhbw(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpcklbw(xmm2, xmm3); + punpckhbw(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm1, xmm2); + movdqa(xmm2, xmm4); + punpcklwd(xmm4, xmm5); + punpckhwd(xmm2, xmm5); + movdqu(xword[B-0x40], xmm0); + movdqu(xword[B-0x30], xmm1); + movdqu(xword[B-0x20], xmm4); + movdqu(xword[B-0x10], xmm2); + movdqu(xmm0, xword[A1-0x60]); + movdqu(xmm1, xword[A1+LDA*1-0x60]); + movdqu(xmm2, xword[A1+LDA*2-0x60]); + movdqu(xmm3, xword[A1+LDA3*1-0x60]); + lea(A1, ptr[A1+LDA*4]); + movdqa(xmm4, xmm0); + punpcklbw(xmm0, xmm1); + punpckhbw(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpcklbw(xmm2, xmm3); + punpckhbw(xmm5, xmm3); + 
movdqa(xmm1, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm1, xmm2); + movdqa(xmm2, xmm4); + punpcklwd(xmm4, xmm5); + punpckhwd(xmm2, xmm5); + movdqu(xword[B], xmm0); + movdqu(xword[B+0x10], xmm1); + movdqu(xword[B+0x20], xmm4); + movdqu(xword[B+0x30], xmm2); + sub(B, -192); + dec(I); + jg(l34, T_NEAR); + align(4); + +L(l170); + test(M, 0x2); + jle(l1f0, T_NEAR); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1-0x70]); + movdqu(xmm2, xword[A1-0x60]); + add(A1, LDA); + movdqu(xmm3, xword[A1-0x80]); + movdqu(xmm4, xword[A1-0x70]); + movdqu(xmm5, xword[A1-0x60]); + add(A1, LDA); + movdqa(xmm6, xmm0); + punpcklbw(xmm0, xmm3); + punpckhbw(xmm6, xmm3); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm6); + movdqa(xmm6, xmm1); + punpcklbw(xmm1, xmm4); + punpckhbw(xmm6, xmm4); + movdqu(xword[B-0x60], xmm1); + movdqu(xword[B-0x50], xmm6); + movdqa(xmm6, xmm2); + punpcklbw(xmm2, xmm5); + punpckhbw(xmm6, xmm5); + movdqu(xword[B-0x40], xmm2); + movdqu(xword[B-0x30], xmm6); + sub(B, -96); + align(4); + +L(l1f0); + test(M, 0x1); + jle(l224, T_NEAR); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1-0x70]); + movdqu(xmm2, xword[A1-0x60]); + add(A1, LDA); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm1); + movdqu(xword[B-0x60], xmm2); + sub(B, -48); + align(4); + +L(l224); + sub(N, 0x30); + cmp(N, 0x30); + jge(l20, T_NEAR); + align(4); + +L(l234); + cmp(N, 0x20); + jl(l3c0, T_NEAR); + align(4); + +L(l240); + mov(A1, A); + add(A, 0x20); + mov(I, M); + sar(I, 0x2); + jle(l32c, T_NEAR); + align(4); + +L(l254); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1+LDA*1-0x80]); + movdqu(xmm2, xword[A1+LDA*2-0x80]); + movdqu(xmm3, xword[A1+LDA3*1-0x80]); + movdqa(xmm4, xmm0); + punpcklbw(xmm0, xmm1); + punpckhbw(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpcklbw(xmm2, xmm3); + punpckhbw(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm1, xmm2); + movdqa(xmm2, xmm4); + punpcklwd(xmm4, xmm5); + punpckhwd(xmm2, xmm5); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm1); + movdqu(xword[B-0x60], xmm4); + movdqu(xword[B-0x50], xmm2); + movdqu(xmm0, xword[A1-0x70]); + movdqu(xmm1, xword[A1+LDA*1-0x70]); + movdqu(xmm2, xword[A1+LDA*2-0x70]); + movdqu(xmm3, xword[A1+LDA3*1-0x70]); + lea(A1, ptr[A1+LDA*4]); + movdqa(xmm4, xmm0); + punpcklbw(xmm0, xmm1); + punpckhbw(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpcklbw(xmm2, xmm3); + punpckhbw(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm1, xmm2); + movdqa(xmm2, xmm4); + punpcklwd(xmm4, xmm5); + punpckhwd(xmm2, xmm5); + movdqu(xword[B-0x40], xmm0); + movdqu(xword[B-0x30], xmm1); + movdqu(xword[B-0x20], xmm4); + movdqu(xword[B-0x10], xmm2); + sub(B, -128); + dec(I); + jg(l254, T_NEAR); + align(4); + +L(l32c); + test(M, 0x2); + jle(l388, T_NEAR); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1-0x70]); + add(A1, LDA); + movdqu(xmm2, xword[A1-0x80]); + movdqu(xmm3, xword[A1-0x70]); + add(A1, LDA); + movdqa(xmm4, xmm0); + punpcklbw(xmm0, xmm2); + punpckhbw(xmm4, xmm2); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm4); + movdqa(xmm4, xmm1); + punpcklbw(xmm1, xmm3); + punpckhbw(xmm4, xmm3); + movdqu(xword[B-0x60], xmm1); + movdqu(xword[B-0x50], xmm4); + sub(B, -64); + align(4); + +L(l388); + test(M, 0x1); + jle(l3b0, T_NEAR); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1-0x70]); + add(A1, LDA); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm1); + sub(B, -32); + align(4); + +L(l3b0); + sub(N, 0x20); + cmp(N, 0x20); + jge(l240, T_NEAR); + align(4); + 
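+ // N < 0x20 from here on: copy 16-column panels while N >= 0x10, then fall
+ // through to the narrower column tails.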
+L(l3c0); + cmp(N, 0x10); + jl(l4b8, T_NEAR); + align(4); + +L(l3cc); + mov(A1, A); + add(A, 0x10); + mov(I, M); + sar(I, 0x2); + jle(l454, T_NEAR); + align(4); + +L(l3dc); + movdqu(xmm0, xword[A1-0x80]); + add(A1, LDA); + movdqu(xmm1, xword[A1-0x80]); + add(A1, LDA); + movdqu(xmm2, xword[A1-0x80]); + add(A1, LDA); + movdqu(xmm3, xword[A1-0x80]); + add(A1, LDA); + movdqa(xmm4, xmm0); + punpcklbw(xmm0, xmm1); + punpckhbw(xmm4, xmm1); + movdqa(xmm1, xmm2); + punpcklbw(xmm2, xmm3); + punpckhbw(xmm1, xmm3); + movdqa(xmm3, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm3, xmm2); + movdqa(xmm2, xmm4); + punpcklwd(xmm4, xmm1); + punpckhwd(xmm2, xmm1); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm3); + movdqu(xword[B-0x60], xmm4); + movdqu(xword[B-0x50], xmm2); + sub(B, -64); + dec(I); + jg(l3dc, T_NEAR); + align(4); + +L(l454); + test(M, 0x2); + jle(l48c, T_NEAR); + movdqu(xmm0, xword[A1-0x80]); + add(A1, LDA); + movdqu(xmm1, xword[A1-0x80]); + add(A1, LDA); + movdqa(xmm2, xmm0); + punpcklbw(xmm0, xmm1); + punpckhbw(xmm2, xmm1); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm2); + sub(B, -32); + align(4); + +L(l48c); + test(M, 0x1); + jle(l4a8, T_NEAR); + movdqu(xmm0, xword[A1-0x80]); + add(A1, LDA); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l4a8); + sub(N, 0x10); + cmp(N, 0x10); + jge(l3cc, T_NEAR); + align(4); + +L(l4b8); + cmp(N, 0x8); + jl(l61c, T_NEAR); + align(4); + +L(l4c4); + mov(A1, A); + add(A, 0x8); + mov(I, M); + sar(I, 0x3); + jle(l570, T_NEAR); + align(4); + +L(l4d8); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + movq(xmm1, qword[A1-0x80]); + add(A1, LDA); + movq(xmm2, qword[A1-0x80]); + add(A1, LDA); + movq(xmm3, qword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm1, xmm2); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm1); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + movq(xmm1, qword[A1-0x80]); + add(A1, LDA); + movq(xmm2, qword[A1-0x80]); + add(A1, LDA); + movq(xmm3, qword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm1, xmm2); + movdqu(xword[B-0x60], xmm0); + movdqu(xword[B-0x50], xmm1); + sub(B, -64); + dec(I); + jg(l4d8, T_NEAR); + align(4); + +L(l570); + test(M, 0x4); + jle(l5c4, T_NEAR); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + movq(xmm1, qword[A1-0x80]); + add(A1, LDA); + movq(xmm2, qword[A1-0x80]); + add(A1, LDA); + movq(xmm3, qword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm1, xmm2); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm1); + sub(B, -32); + align(4); + +L(l5c4); + test(M, 0x2); + jle(l5f0, T_NEAR); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + movq(xmm1, qword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l5f0); + test(M, 0x1); + jle(l60c, T_NEAR); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l60c); + sub(N, 0x8); + cmp(N, 0x8); + jge(l4c4, T_NEAR); + align(4); + +L(l61c); + cmp(N, 0x4); + jl(l74c, T_NEAR); + align(4); + +L(l628); + mov(A1, A); + add(A, 0x4); + mov(I, M); + sar(I, 0x3); + jle(l6b0, T_NEAR); + align(4); + +L(l638); + movd(xmm0, dword[A1-0x80]); + add(A1, LDA); + movd(xmm1, dword[A1-0x80]); + add(A1, LDA); + movd(xmm2, dword[A1-0x80]); + add(A1, LDA); + movd(xmm3, 
dword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + movdqu(xword[B-0x80], xmm0); + movd(xmm0, dword[A1-0x80]); + add(A1, LDA); + movd(xmm1, dword[A1-0x80]); + add(A1, LDA); + movd(xmm2, dword[A1-0x80]); + add(A1, LDA); + movd(xmm3, dword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + movdqu(xword[B-0x70], xmm0); + sub(B, -32); + dec(I); + jg(l638, T_NEAR); + align(4); + +L(l6b0); + test(M, 0x4); + jle(l6f4, T_NEAR); + movd(xmm0, dword[A1-0x80]); + add(A1, LDA); + movd(xmm1, dword[A1-0x80]); + add(A1, LDA); + movd(xmm2, dword[A1-0x80]); + add(A1, LDA); + movd(xmm3, dword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l6f4); + test(M, 0x2); + jle(l720, T_NEAR); + movd(xmm0, dword[A1-0x80]); + add(A1, LDA); + movd(xmm1, dword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l720); + test(M, 0x1); + jle(l73c, T_NEAR); + movd(xmm0, dword[A1-0x80]); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l73c); + sub(N, 0x4); + cmp(N, 0x4); + jge(l628, T_NEAR); + align(4); + +L(l74c); + cmp(N, 0x2); + jl(l8b2, T_NEAR); + align(4); + +L(l758); + mov(A1, A); + add(A, 0x2); + mov(LDA3, M); + sar(LDA3, 0x3); + jle(l804, T_NEAR); + align(4); + +L(l76c); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm1, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm2, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm3, eax, 0x0); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm1, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm2, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm3, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm4, eax, 0x0); + punpcklbw(xmm1, xmm2); + punpcklbw(xmm3, xmm4); + punpcklwd(xmm1, xmm3); + punpcklqdq(xmm0, xmm1); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + dec(LDA3); + jg(l76c, T_NEAR); + align(4); + +L(l804); + test(M, 0x4); + jle(l858, T_NEAR); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm1, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm2, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm3, eax, 0x0); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l858); + test(M, 0x2); + jle(l88c, T_NEAR); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm1, eax, 0x0); + punpcklbw(xmm0, xmm1); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l88c); + test(M, 0x1); + jle(l8a4, T_NEAR); + mov(ax, word[A1-0x80]); + mov(word[B-0x80], ax); + sub(B, -2); + align(4); + +L(l8a4); + sub(N, 0x2); + cmp(N, 0x2); + jge(l758, T_NEAR); + align(4); + +L(l8b2); + cmp(N, 0x1); + jl(l9d8, T_NEAR); + align(4); + +L(l8bc); + mov(A1, A); + add(A, 0x1); + mov(LDA3, M); + sar(LDA3, 0x3); + jle(l944, T_NEAR); + align(4); + +L(l8cc); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A1-0x80]); + add(A1, LDA); + 
pinsrb(xmm0, eax, 0x2); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x3); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x4); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x5); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x6); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x7); + movq(qword[B-0x80], xmm0); + sub(B, -8); + dec(LDA3); + jg(l8cc, T_NEAR); + align(4); + +L(l944); + test(M, 0x4); + jle(l98c, T_NEAR); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x3); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l98c); + test(M, 0x2); + jle(l9b0, T_NEAR); + mov(al, byte[A1-0x80]); + add(A1, LDA); + mov(byte[B-0x80], al); + mov(al, byte[A1-0x80]); + add(A1, LDA); + mov(byte[B-0x7f], al); + sub(B, -2); + align(4); + +L(l9b0); + test(M, 0x1); + jle(l9c8, T_NEAR); + mov(al, byte[A1-0x80]); + mov(byte[B-0x80], al); + sub(B, -1); + align(4); + +L(l9c8); + sub(N, 0x1); + cmp(N, 0x1); + jge(l8bc, T_NEAR); + align(4); + +L(l9d8); + + postamble(); +} +outLocalLabel(); + +#undef M +#undef N +#undef A +#undef LDA +#undef ALPHA +#undef B +#undef I +#undef A1 +#undef A2 +#undef LDA3 +#ifdef _WIN32 +#undef ARG_ALPHA +#undef ARG_B +#endif +} + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_at_kern.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_at_kern.cpp new file mode 100644 index 0000000..1c11fc6 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_at_kern.cpp @@ -0,0 +1,2209 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "jit_generator.hpp" +#include "common.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +jit_avx512_core_u8_copy_at_kern::jit_avx512_core_u8_copy_at_kern(): jit_generator(nullptr, GEMM_CODE_SIZE) +{ + +#ifndef _WIN32 +#define M rdi +#define N rsi +#define A rdx +#define LDA rcx +#define ALPHA r8 +#define B r9 + +#define I rax +#define A1 r10 +#define A2 r8 +#define LDA3 r11 + +#else + +#define M rcx +#define N rdx +#define A r8 +#define LDA r9 +#define ALPHA rax +#define B rdi + +#define I rax +#define A1 rsi +#define A2 r10 +#define LDA3 r11 + +#define ARG_ALPHA 40+stacksize+rsp +#define ARG_B 48+stacksize+rsp + +#endif + +inLocalLabel(); +{ + +Xbyak::Label l1014; +Xbyak::Label l1390; +Xbyak::Label l159c; +Xbyak::Label l173c; +Xbyak::Label l18e4; +Xbyak::Label l1a7c; +Xbyak::Label l1a8c; +Xbyak::Label l1a98; +Xbyak::Label l1ab4; +Xbyak::Label l1c64; +Xbyak::Label l1d74; +Xbyak::Label l1e50; +Xbyak::Label l1f2c; +Xbyak::Label l1ffc; +Xbyak::Label l20; +Xbyak::Label l200c; +Xbyak::Label l2018; +Xbyak::Label l2034; +Xbyak::Label l2110; +Xbyak::Label l21a0; +Xbyak::Label l2210; +Xbyak::Label l2284; +Xbyak::Label l22f0; +Xbyak::Label l2300; +Xbyak::Label l230c; +Xbyak::Label l2324; +Xbyak::Label l2398; +Xbyak::Label l23e8; +Xbyak::Label l242c; +Xbyak::Label l2474; +Xbyak::Label l24b4; +Xbyak::Label l24c4; +Xbyak::Label l24d0; +Xbyak::Label l24e8; +Xbyak::Label l2520; +Xbyak::Label l254c; +Xbyak::Label l2578; +Xbyak::Label l25a8; +Xbyak::Label l25c8; +Xbyak::Label l25d6; +Xbyak::Label l25e0; +Xbyak::Label l25f0; +Xbyak::Label l260c; +Xbyak::Label l262c; +Xbyak::Label l264c; +Xbyak::Label l2668; +Xbyak::Label l2680; +Xbyak::Label l2690; +Xbyak::Label l44; +Xbyak::Label l58c; +Xbyak::Label l8b0; +Xbyak::Label lb14; +Xbyak::Label ld84; +Xbyak::Label lfdc; +Xbyak::Label lfec; +Xbyak::Label lff8; + + preamble(); +#ifdef _WIN32 + auto stacksize = get_size_of_abi_save_regs(); + mov(ALPHA, ptr[ARG_ALPHA]); + mov(B, ptr[ARG_B]); +#endif + + mov(N, qword[N]); + mov(M, qword[M]); + mov(LDA, qword[LDA]); + sub(A, -128); + sub(B, -128); + lea(LDA3, ptr[LDA+LDA*2]); + cmp(N, 0x30); + jl(lfec, T_NEAR); + align(4); + +L(l20); + mov(A1, A); + mov(I, LDA); + shl(I, 0x5); + lea(I, ptr[I+LDA*8]); + lea(I, ptr[I+LDA*8]); + add(A, I); + mov(I, M); + sar(I, 0x4); + jle(l58c, T_NEAR); + align(4); + +L(l44); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1+LDA*1-0x80]); + movdqu(xmm2, xword[A1+LDA*2-0x80]); + movdqu(xmm3, xword[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B+0x40], xmm1); + movdqu(xword[B+0x100], xmm4); + movdqu(xword[B+0x1c0], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x70], xmm0); + movdqu(xword[B+0x50], xmm1); + 
movdqu(xword[B+0x110], xmm4); + movdqu(xword[B+0x1d0], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x60], xmm0); + movdqu(xword[B+0x60], xmm1); + movdqu(xword[B+0x120], xmm4); + movdqu(xword[B+0x1e0], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x50], xmm0); + movdqu(xword[B+0x70], xmm1); + movdqu(xword[B+0x130], xmm4); + movdqu(xword[B+0x1f0], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x40], xmm0); + movdqu(xword[B+0x80], xmm1); + movdqu(xword[B+0x140], xmm4); + movdqu(xword[B+0x200], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x30], xmm0); + movdqu(xword[B+0x90], xmm1); + movdqu(xword[B+0x150], xmm4); + movdqu(xword[B+0x210], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x20], xmm0); + movdqu(xword[B+0xa0], xmm1); + movdqu(xword[B+0x160], xmm4); + movdqu(xword[B+0x220], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x10], xmm0); + movdqu(xword[B+0xb0], xmm1); + 
movdqu(xword[B+0x170], xmm4); + movdqu(xword[B+0x230], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B], xmm0); + movdqu(xword[B+0xc0], xmm1); + movdqu(xword[B+0x180], xmm4); + movdqu(xword[B+0x240], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B+0x10], xmm0); + movdqu(xword[B+0xd0], xmm1); + movdqu(xword[B+0x190], xmm4); + movdqu(xword[B+0x250], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B+0x20], xmm0); + movdqu(xword[B+0xe0], xmm1); + movdqu(xword[B+0x1a0], xmm4); + movdqu(xword[B+0x260], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B+0x30], xmm0); + movdqu(xword[B+0xf0], xmm1); + movdqu(xword[B+0x1b0], xmm4); + movdqu(xword[B+0x270], xmm3); + sub(A1, -16); + sub(B, -768); + dec(I); + jg(l44, T_NEAR); + align(4); + +L(l58c); + test(M, 0x8); + jle(l8b0, T_NEAR); + movq(xmm0, qword[A1-0x80]); + movq(xmm1, qword[A1+LDA*1-0x80]); + movq(xmm2, qword[A1+LDA*2-0x80]); + movq(xmm3, qword[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B+0x40], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x70], xmm0); + movdqu(xword[B+0x50], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + 
movdqu(xword[B-0x60], xmm0); + movdqu(xword[B+0x60], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x50], xmm0); + movdqu(xword[B+0x70], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x40], xmm0); + movdqu(xword[B+0x80], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x30], xmm0); + movdqu(xword[B+0x90], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x20], xmm0); + movdqu(xword[B+0xa0], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x10], xmm0); + movdqu(xword[B+0xb0], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B], xmm0); + movdqu(xword[B+0xc0], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B+0x10], xmm0); + movdqu(xword[B+0xd0], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B+0x20], xmm0); + movdqu(xword[B+0xe0], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B+0x30], xmm0); + movdqu(xword[B+0xf0], xmm1); + sub(A1, -8); + sub(B, -384); + align(4); + +L(l8b0); + test(M, 0x4); + jle(lb14, T_NEAR); + movd(xmm0, dword[A1-0x80]); + movd(xmm1, dword[A1+LDA*1-0x80]); + movd(xmm2, dword[A1+LDA*2-0x80]); + movd(xmm3, dword[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + 
movdqu(xword[B-0x80], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x70], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x60], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x50], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x40], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x30], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x20], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x10], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B+0x10], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B+0x20], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B+0x30], xmm0); + sub(A1, -4); + sub(B, -192); + align(4); + +L(lb14); + test(M, 0x2); + jle(ld84, T_NEAR); + mov(ax, word[A1-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A1+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, 
word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrw(xmm0, eax, 0x7); + movdqu(xword[B-0x80], xmm0); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + pinsrw(xmm0, eax, 0x7); + lea(A2, ptr[A2+LDA*4]); + movdqu(xword[B-0x70], xmm0); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + pinsrw(xmm0, eax, 0x7); + lea(A2, ptr[A2+LDA*4]); + movdqu(xword[B-0x60], xmm0); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + pinsrw(xmm0, eax, 0x7); + lea(A2, ptr[A2+LDA*4]); + movdqu(xword[B-0x50], xmm0); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + pinsrw(xmm0, eax, 0x7); + lea(A2, ptr[A2+LDA*4]); + movdqu(xword[B-0x40], xmm0); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + pinsrw(xmm0, eax, 0x7); + lea(A2, ptr[A2+LDA*4]); + movdqu(xword[B-0x30], xmm0); + sub(A1, -2); + sub(B, -96); + align(4); + +L(ld84); + test(M, 0x1); + jle(lfdc, T_NEAR); + mov(al, byte[A1-0x80]); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A1+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + pinsrb(xmm0, eax, 0x3); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x4); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x5); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x6); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0x7); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x8); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x9); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 
0xa); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0xb); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0xc); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0xd); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0xe); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0xf); + movdqu(xword[B-0x80], xmm0); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0x3); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x4); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x5); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x6); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0x7); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x8); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x9); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0xa); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0xb); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0xc); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0xd); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0xe); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0xf); + movdqu(xword[B-0x70], xmm0); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0x3); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x4); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x5); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x6); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0x7); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x8); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x9); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0xa); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0xb); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0xc); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0xd); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0xe); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0xf); + movdqu(xword[B-0x60], xmm0); + sub(B, -48); + align(4); + +L(lfdc); + sub(N, 0x30); + cmp(N, 0x30); + jge(l20, T_NEAR); + align(4); + +L(lfec); + cmp(N, 0x20); + jl(l1a8c, T_NEAR); + align(4); + +L(lff8); + mov(A1, A); + mov(I, LDA); + shl(I, 0x5); + add(A, I); + mov(I, M); + sar(I, 0x4); + jle(l1390, T_NEAR); + align(4); + +L(l1014); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1+LDA*1-0x80]); + movdqu(xmm2, xword[A1+LDA*2-0x80]); + movdqu(xmm3, xword[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B], xmm1); + movdqu(xword[B+0x80], xmm4); + movdqu(xword[B+0x100], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + 
punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x70], xmm0); + movdqu(xword[B+0x10], xmm1); + movdqu(xword[B+0x90], xmm4); + movdqu(xword[B+0x110], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x60], xmm0); + movdqu(xword[B+0x20], xmm1); + movdqu(xword[B+0xa0], xmm4); + movdqu(xword[B+0x120], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x50], xmm0); + movdqu(xword[B+0x30], xmm1); + movdqu(xword[B+0xb0], xmm4); + movdqu(xword[B+0x130], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x40], xmm0); + movdqu(xword[B+0x40], xmm1); + movdqu(xword[B+0xc0], xmm4); + movdqu(xword[B+0x140], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x30], xmm0); + movdqu(xword[B+0x50], xmm1); + movdqu(xword[B+0xd0], xmm4); + movdqu(xword[B+0x150], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x20], xmm0); + movdqu(xword[B+0x60], xmm1); + movdqu(xword[B+0xe0], xmm4); + movdqu(xword[B+0x160], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + 
punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x10], xmm0); + movdqu(xword[B+0x70], xmm1); + movdqu(xword[B+0xf0], xmm4); + movdqu(xword[B+0x170], xmm3); + sub(A1, -16); + sub(B, -512); + dec(I); + jg(l1014, T_NEAR); + align(4); + +L(l1390); + test(M, 0x8); + jle(l159c, T_NEAR); + movq(xmm0, qword[A1-0x80]); + movq(xmm1, qword[A1+LDA*1-0x80]); + movq(xmm2, qword[A1+LDA*2-0x80]); + movq(xmm3, qword[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x70], xmm0); + movdqu(xword[B+0x10], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x60], xmm0); + movdqu(xword[B+0x20], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x50], xmm0); + movdqu(xword[B+0x30], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x40], xmm0); + movdqu(xword[B+0x40], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x30], xmm0); + movdqu(xword[B+0x50], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x20], xmm0); + movdqu(xword[B+0x60], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x10], xmm0); + movdqu(xword[B+0x70], xmm1); + sub(A1, -8); + sub(B, -256); + align(4); + +L(l159c); + test(M, 0x4); + jle(l173c, T_NEAR); + movd(xmm0, dword[A1-0x80]); + movd(xmm1, dword[A1+LDA*1-0x80]); + movd(xmm2, dword[A1+LDA*2-0x80]); + movd(xmm3, dword[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); 
+ punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x80], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x70], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x60], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x50], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x40], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x30], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x20], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x10], xmm0); + sub(A1, -4); + sub(B, -128); + align(4); + +L(l173c); + test(M, 0x2); + jle(l18e4, T_NEAR); + mov(ax, word[A1-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A1+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrw(xmm0, eax, 0x7); + movdqu(xword[B-0x80], xmm0); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + pinsrw(xmm0, eax, 0x7); + lea(A2, ptr[A2+LDA*4]); + movdqu(xword[B-0x70], xmm0); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, 
word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + pinsrw(xmm0, eax, 0x7); + lea(A2, ptr[A2+LDA*4]); + movdqu(xword[B-0x60], xmm0); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + pinsrw(xmm0, eax, 0x7); + lea(A2, ptr[A2+LDA*4]); + movdqu(xword[B-0x50], xmm0); + sub(A1, -2); + sub(B, -64); + align(4); + +L(l18e4); + test(M, 0x1); + jle(l1a7c, T_NEAR); + mov(al, byte[A1-0x80]); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A1+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + pinsrb(xmm0, eax, 0x3); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x4); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x5); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x6); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0x7); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x8); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x9); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0xa); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0xb); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0xc); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0xd); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0xe); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0xf); + movdqu(xword[B-0x80], xmm0); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0x3); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x4); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x5); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x6); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0x7); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x8); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x9); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0xa); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0xb); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0xc); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0xd); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0xe); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0xf); + movdqu(xword[B-0x70], xmm0); + sub(B, -32); + align(4); + +L(l1a7c); + sub(N, 0x20); + cmp(N, 0x20); + jge(lff8, T_NEAR); + align(4); + +L(l1a8c); + cmp(N, 0x10); + jl(l200c, T_NEAR); + align(4); + +L(l1a98); + mov(A1, A); + mov(I, LDA); + shl(I, 0x4); + add(A, I); + mov(I, M); + sar(I, 0x4); + jle(l1c64, T_NEAR); + align(4); + +L(l1ab4); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1+LDA*1-0x80]); + movdqu(xmm2, xword[A1+LDA*2-0x80]); + movdqu(xmm3, xword[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, 
xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x40], xmm1); + movdqu(xword[B], xmm4); + movdqu(xword[B+0x40], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x70], xmm0); + movdqu(xword[B-0x30], xmm1); + movdqu(xword[B+0x10], xmm4); + movdqu(xword[B+0x50], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x60], xmm0); + movdqu(xword[B-0x20], xmm1); + movdqu(xword[B+0x20], xmm4); + movdqu(xword[B+0x60], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x50], xmm0); + movdqu(xword[B-0x10], xmm1); + movdqu(xword[B+0x30], xmm4); + movdqu(xword[B+0x70], xmm3); + sub(A1, -16); + sub(B, -256); + dec(I); + jg(l1ab4, T_NEAR); + align(4); + +L(l1c64); + test(M, 0x8); + jle(l1d74, T_NEAR); + movq(xmm0, qword[A1-0x80]); + movq(xmm1, qword[A1+LDA*1-0x80]); + movq(xmm2, qword[A1+LDA*2-0x80]); + movq(xmm3, qword[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x40], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x70], xmm0); + movdqu(xword[B-0x30], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x60], xmm0); + movdqu(xword[B-0x20], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x50], xmm0); + movdqu(xword[B-0x10], xmm1); + sub(A1, -8); + sub(B, -128); + align(4); + +L(l1d74); + 
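+ // M & 4 tail of the 16-column panel: gather one dword from each of the
+ // 16 columns and pack them into B.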
test(M, 0x4); + jle(l1e50, T_NEAR); + movd(xmm0, dword[A1-0x80]); + movd(xmm1, dword[A1+LDA*1-0x80]); + movd(xmm2, dword[A1+LDA*2-0x80]); + movd(xmm3, dword[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x80], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x70], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x60], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x50], xmm0); + sub(A1, -4); + sub(B, -64); + align(4); + +L(l1e50); + test(M, 0x2); + jle(l1f2c, T_NEAR); + mov(ax, word[A1-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A1+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrw(xmm0, eax, 0x7); + movdqu(xword[B-0x80], xmm0); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + pinsrw(xmm0, eax, 0x7); + movdqu(xword[B-0x70], xmm0); + sub(A1, -2); + sub(B, -32); + align(4); + +L(l1f2c); + test(M, 0x1); + jle(l1ffc, T_NEAR); + mov(al, byte[A1-0x80]); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A1+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + pinsrb(xmm0, eax, 0x3); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x4); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x5); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x6); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0x7); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x8); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x9); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0xa); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0xb); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0xc); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0xd); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0xe); + mov(al, byte[A2+LDA3*1-0x80]); + pinsrb(xmm0, eax, 0xf); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l1ffc); + sub(N, 0x10); + cmp(N, 0x10); + jge(l1a98, T_NEAR); + align(4); + +L(l200c); + cmp(N, 0x8); + jl(l2300, T_NEAR); + align(4); + +L(l2018); 
+ mov(A1, A); + lea(A2, ptr[A1+LDA*4]); + lea(I, ptr[A1+LDA*8]); + mov(A, I); + mov(I, M); + sar(I, 0x4); + jle(l2110, T_NEAR); + align(4); + +L(l2034); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1+LDA*1-0x80]); + movdqu(xmm2, xword[A1+LDA*2-0x80]); + movdqu(xmm3, xword[A1+LDA3*1-0x80]); + sub(A1, -16); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x60], xmm1); + movdqu(xword[B-0x40], xmm4); + movdqu(xword[B-0x20], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + sub(A2, -16); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x70], xmm0); + movdqu(xword[B-0x50], xmm1); + movdqu(xword[B-0x30], xmm4); + movdqu(xword[B-0x10], xmm3); + sub(B, -128); + dec(I); + jg(l2034, T_NEAR); + align(4); + +L(l2110); + test(M, 0x8); + jle(l21a0, T_NEAR); + movq(xmm0, qword[A1-0x80]); + movq(xmm1, qword[A1+LDA*1-0x80]); + movq(xmm2, qword[A1+LDA*2-0x80]); + movq(xmm3, qword[A1+LDA3*1-0x80]); + sub(A1, -8); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x60], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + sub(A2, -8); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x70], xmm0); + movdqu(xword[B-0x50], xmm1); + sub(B, -64); + align(4); + +L(l21a0); + test(M, 0x4); + jle(l2210, T_NEAR); + movd(xmm0, dword[A1-0x80]); + movd(xmm1, dword[A1+LDA*1-0x80]); + movd(xmm2, dword[A1+LDA*2-0x80]); + movd(xmm3, dword[A1+LDA3*1-0x80]); + sub(A1, -4); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x80], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + sub(A2, -4); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x70], xmm0); + sub(B, -32); + align(4); + +L(l2210); + test(M, 0x2); + jle(l2284, T_NEAR); + mov(ax, word[A1-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A1+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A1+LDA3*1-0x80]); + sub(A1, -2); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + sub(A2, -2); + pinsrw(xmm0, eax, 0x7); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l2284); + test(M, 0x1); + jle(l22f0, T_NEAR); + mov(al, byte[A1-0x80]); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A1+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x2); + mov(al, 
byte[A1+LDA3*1-0x80]); + pinsrb(xmm0, eax, 0x3); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x4); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x5); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x6); + mov(al, byte[A2+LDA3*1-0x80]); + pinsrb(xmm0, eax, 0x7); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l22f0); + sub(N, 0x8); + cmp(N, 0x8); + jge(l2018, T_NEAR); + align(4); + +L(l2300); + cmp(N, 0x4); + jl(l24c4, T_NEAR); + align(4); + +L(l230c); + mov(A1, A); + lea(A2, ptr[A1+LDA*2]); + lea(I, ptr[A1+LDA*4]); + mov(A, I); + mov(I, M); + sar(I, 0x4); + jle(l2398, T_NEAR); + align(4); + +L(l2324); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1+LDA*1-0x80]); + sub(A1, -16); + movdqu(xmm2, xword[A2-0x80]); + movdqu(xmm3, xword[A2+LDA*1-0x80]); + sub(A2, -16); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm1); + movdqu(xword[B-0x60], xmm4); + movdqu(xword[B-0x50], xmm3); + sub(B, -64); + dec(I); + jg(l2324, T_NEAR); + align(4); + +L(l2398); + test(M, 0x8); + jle(l23e8, T_NEAR); + movq(xmm0, qword[A1-0x80]); + movq(xmm1, qword[A1+LDA*1-0x80]); + sub(A1, -8); + movq(xmm2, qword[A2-0x80]); + movq(xmm3, qword[A2+LDA*1-0x80]); + sub(A2, -8); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm1); + sub(B, -32); + align(4); + +L(l23e8); + test(M, 0x4); + jle(l242c, T_NEAR); + movd(xmm0, dword[A1-0x80]); + movd(xmm1, dword[A1+LDA*1-0x80]); + sub(A1, -4); + movd(xmm2, dword[A2-0x80]); + movd(xmm3, dword[A2+LDA*1-0x80]); + sub(A2, -4); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l242c); + test(M, 0x2); + jle(l2474, T_NEAR); + mov(ax, word[A1-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1+LDA*1-0x80]); + sub(A1, -2); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A2+LDA*1-0x80]); + sub(A2, -2); + pinsrw(xmm0, eax, 0x3); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l2474); + test(M, 0x1); + jle(l24b4, T_NEAR); + mov(al, byte[A1-0x80]); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x3); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l24b4); + sub(N, 0x4); + cmp(N, 0x4); + jge(l230c, T_NEAR); + align(4); + +L(l24c4); + cmp(N, 0x2); + jl(l25d6, T_NEAR); + align(4); + +L(l24d0); + mov(A1, A); + lea(A2, ptr[A1+LDA*1]); + lea(I, ptr[A1+LDA*2]); + mov(A, I); + mov(I, M); + sar(I, 0x4); + jle(l2520, T_NEAR); + align(4); + +L(l24e8); + movdqu(xmm0, xword[A1-0x80]); + sub(A1, -16); + movdqu(xmm1, xword[A2-0x80]); + sub(A2, -16); + movdqa(xmm2, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm2, xmm1); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm2); + sub(B, -32); + dec(I); + jg(l24e8, T_NEAR); + align(4); + +L(l2520); + test(M, 0x8); + jle(l254c, T_NEAR); + movq(xmm0, qword[A1-0x80]); + sub(A1, -8); + movq(xmm1, qword[A2-0x80]); + sub(A2, -8); + punpckldq(xmm0, xmm1); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + 
+L(l254c); + test(M, 0x4); + jle(l2578, T_NEAR); + movd(xmm0, dword[A1-0x80]); + sub(A1, -4); + movd(xmm1, dword[A2-0x80]); + sub(A2, -4); + punpckldq(xmm0, xmm1); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l2578); + test(M, 0x2); + jle(l25a8, T_NEAR); + mov(ax, word[A1-0x80]); + sub(A1, -2); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A2-0x80]); + sub(A2, -2); + pinsrw(xmm0, eax, 0x1); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l25a8); + test(M, 0x1); + jle(l25c8, T_NEAR); + mov(al, byte[A1-0x80]); + mov(byte[B-0x80], al); + mov(al, byte[A2-0x80]); + mov(byte[B-0x7f], al); + sub(B, -2); + align(4); + +L(l25c8); + sub(N, 0x2); + cmp(N, 0x2); + jge(l24d0, T_NEAR); + align(4); + +L(l25d6); + cmp(N, 0x1); + jl(l2690, T_NEAR); + align(4); + +L(l25e0); + mov(A1, A); + add(A, LDA); + mov(I, M); + sar(I, 0x4); + jle(l260c, T_NEAR); + align(4); + +L(l25f0); + movdqu(xmm0, xword[A1-0x80]); + sub(A1, -16); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + dec(I); + jg(l25f0, T_NEAR); + align(4); + +L(l260c); + test(M, 0x8); + jle(l262c, T_NEAR); + movq(xmm0, qword[A1-0x80]); + sub(A1, -8); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l262c); + test(M, 0x4); + jle(l264c, T_NEAR); + movd(xmm0, dword[A1-0x80]); + sub(A1, -4); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l264c); + test(M, 0x2); + jle(l2668, T_NEAR); + mov(ax, word[A1-0x80]); + mov(word[B-0x80], ax); + sub(A1, -2); + sub(B, -2); + align(4); + +L(l2668); + test(M, 0x1); + jle(l2680, T_NEAR); + mov(al, byte[A1-0x80]); + mov(byte[B-0x80], al); + sub(B, -1); + align(4); + +L(l2680); + sub(N, 0x1); + cmp(N, 0x1); + jge(l25e0, T_NEAR); + align(4); + +L(l2690); + + postamble(); +} +outLocalLabel(); + +#undef M +#undef N +#undef A +#undef LDA +#undef ALPHA +#undef B +#undef I +#undef A1 +#undef A2 +#undef LDA3 +#ifdef _WIN32 +#undef ARG_ALPHA +#undef ARG_B +#endif +} + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_bn_kern.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_bn_kern.cpp new file mode 100644 index 0000000..56c36ee --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_bn_kern.cpp @@ -0,0 +1,564 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "jit_generator.hpp" +#include "common.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +jit_avx512_core_u8_copy_bn_kern::jit_avx512_core_u8_copy_bn_kern(): jit_generator(nullptr, GEMM_CODE_SIZE) +{ + +#ifndef _WIN32 +#define M rdi +#define N rsi +#define A rdx +#define LDA rcx +#define ALPHA r8 +#define B r9 + +#define I rax +#define A1 r10 +#define A2 r8 +#define LDA3 r11 + +#else + +#define M rcx +#define N rdx +#define A r8 +#define LDA r9 +#define ALPHA rax +#define B rdi + +#define I rax +#define A1 rsi +#define A2 r10 +#define LDA3 r11 + +#define ARG_ALPHA 40+stacksize+rsp +#define ARG_B 48+stacksize+rsp + +#endif + +inLocalLabel(); +{ + +Xbyak::Label l118; +Xbyak::Label l1a8; +Xbyak::Label l20; +Xbyak::Label l218; +Xbyak::Label l28c; +Xbyak::Label l2f8; +Xbyak::Label l308; +Xbyak::Label l314; +Xbyak::Label l32c; +Xbyak::Label l3a0; +Xbyak::Label l3c; +Xbyak::Label l3f0; +Xbyak::Label l434; +Xbyak::Label l47c; +Xbyak::Label l4bc; +Xbyak::Label l4cc; +Xbyak::Label l4d8; +Xbyak::Label l4f0; +Xbyak::Label l528; +Xbyak::Label l554; +Xbyak::Label l580; +Xbyak::Label l5b0; +Xbyak::Label l5d0; +Xbyak::Label l5de; +Xbyak::Label l5e8; +Xbyak::Label l5f8; +Xbyak::Label l614; +Xbyak::Label l634; +Xbyak::Label l654; +Xbyak::Label l670; +Xbyak::Label l688; +Xbyak::Label l698; + + preamble(); +#ifdef _WIN32 + auto stacksize = get_size_of_abi_save_regs(); + mov(ALPHA, ptr[ARG_ALPHA]); + mov(B, ptr[ARG_B]); +#endif + + mov(N, qword[N]); + mov(M, qword[M]); + mov(LDA, qword[LDA]); + sub(A, -128); + sub(B, -128); + lea(LDA3, ptr[LDA+LDA*2]); + cmp(N, 0x8); + jl(l308, T_NEAR); + align(4); + +L(l20); + mov(A1, A); + lea(A2, ptr[A1+LDA*4]); + lea(I, ptr[A1+LDA*8]); + mov(A, I); + mov(I, M); + sar(I, 0x4); + jle(l118, T_NEAR); + align(4); + +L(l3c); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1+LDA*1-0x80]); + movdqu(xmm2, xword[A1+LDA*2-0x80]); + movdqu(xmm3, xword[A1+LDA3*1-0x80]); + sub(A1, -16); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x60], xmm1); + movdqu(xword[B-0x40], xmm4); + movdqu(xword[B-0x20], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + sub(A2, -16); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x70], xmm0); + movdqu(xword[B-0x50], xmm1); + movdqu(xword[B-0x30], xmm4); + movdqu(xword[B-0x10], xmm3); + sub(B, -128); + dec(I); + jg(l3c, T_NEAR); + align(4); + +L(l118); + test(M, 0x8); + jle(l1a8, T_NEAR); + movq(xmm0, qword[A1-0x80]); + movq(xmm1, qword[A1+LDA*1-0x80]); + movq(xmm2, qword[A1+LDA*2-0x80]); + movq(xmm3, qword[A1+LDA3*1-0x80]); + sub(A1, -8); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x60], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + 
movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + sub(A2, -8); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x70], xmm0); + movdqu(xword[B-0x50], xmm1); + sub(B, -64); + align(4); + +L(l1a8); + test(M, 0x4); + jle(l218, T_NEAR); + movd(xmm0, dword[A1-0x80]); + movd(xmm1, dword[A1+LDA*1-0x80]); + movd(xmm2, dword[A1+LDA*2-0x80]); + movd(xmm3, dword[A1+LDA3*1-0x80]); + sub(A1, -4); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x80], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + sub(A2, -4); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x70], xmm0); + sub(B, -32); + align(4); + +L(l218); + test(M, 0x2); + jle(l28c, T_NEAR); + mov(ax, word[A1-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A1+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A1+LDA3*1-0x80]); + sub(A1, -2); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + sub(A2, -2); + pinsrw(xmm0, eax, 0x7); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l28c); + test(M, 0x1); + jle(l2f8, T_NEAR); + mov(al, byte[A1-0x80]); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A1+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A1+LDA3*1-0x80]); + pinsrb(xmm0, eax, 0x3); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x4); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x5); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x6); + mov(al, byte[A2+LDA3*1-0x80]); + pinsrb(xmm0, eax, 0x7); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l2f8); + sub(N, 0x8); + cmp(N, 0x8); + jge(l20, T_NEAR); + align(4); + +L(l308); + cmp(N, 0x4); + jl(l4cc, T_NEAR); + align(4); + +L(l314); + mov(A1, A); + lea(A2, ptr[A1+LDA*2]); + lea(I, ptr[A1+LDA*4]); + mov(A, I); + mov(I, M); + sar(I, 0x4); + jle(l3a0, T_NEAR); + align(4); + +L(l32c); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1+LDA*1-0x80]); + sub(A1, -16); + movdqu(xmm2, xword[A2-0x80]); + movdqu(xmm3, xword[A2+LDA*1-0x80]); + sub(A2, -16); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm1); + movdqu(xword[B-0x60], xmm4); + movdqu(xword[B-0x50], xmm3); + sub(B, -64); + dec(I); + jg(l32c, T_NEAR); + align(4); + +L(l3a0); + test(M, 0x8); + jle(l3f0, T_NEAR); + movq(xmm0, qword[A1-0x80]); + movq(xmm1, qword[A1+LDA*1-0x80]); + sub(A1, -8); + movq(xmm2, qword[A2-0x80]); + movq(xmm3, qword[A2+LDA*1-0x80]); + sub(A2, -8); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm1); + sub(B, -32); + align(4); + +L(l3f0); + test(M, 0x4); + jle(l434, T_NEAR); + movd(xmm0, dword[A1-0x80]); + movd(xmm1, dword[A1+LDA*1-0x80]); + sub(A1, -4); + movd(xmm2, 
dword[A2-0x80]); + movd(xmm3, dword[A2+LDA*1-0x80]); + sub(A2, -4); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l434); + test(M, 0x2); + jle(l47c, T_NEAR); + mov(ax, word[A1-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1+LDA*1-0x80]); + sub(A1, -2); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A2+LDA*1-0x80]); + sub(A2, -2); + pinsrw(xmm0, eax, 0x3); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l47c); + test(M, 0x1); + jle(l4bc, T_NEAR); + mov(al, byte[A1-0x80]); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x3); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l4bc); + sub(N, 0x4); + cmp(N, 0x4); + jge(l314, T_NEAR); + align(4); + +L(l4cc); + cmp(N, 0x2); + jl(l5de, T_NEAR); + align(4); + +L(l4d8); + mov(A1, A); + lea(A2, ptr[A1+LDA*1]); + lea(I, ptr[A1+LDA*2]); + mov(A, I); + mov(I, M); + sar(I, 0x4); + jle(l528, T_NEAR); + align(4); + +L(l4f0); + movdqu(xmm0, xword[A1-0x80]); + sub(A1, -16); + movdqu(xmm1, xword[A2-0x80]); + sub(A2, -16); + movdqa(xmm2, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm2, xmm1); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm2); + sub(B, -32); + dec(I); + jg(l4f0, T_NEAR); + align(4); + +L(l528); + test(M, 0x8); + jle(l554, T_NEAR); + movq(xmm0, qword[A1-0x80]); + sub(A1, -8); + movq(xmm1, qword[A2-0x80]); + sub(A2, -8); + punpckldq(xmm0, xmm1); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l554); + test(M, 0x4); + jle(l580, T_NEAR); + movd(xmm0, dword[A1-0x80]); + sub(A1, -4); + movd(xmm1, dword[A2-0x80]); + sub(A2, -4); + punpckldq(xmm0, xmm1); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l580); + test(M, 0x2); + jle(l5b0, T_NEAR); + mov(ax, word[A1-0x80]); + sub(A1, -2); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A2-0x80]); + sub(A2, -2); + pinsrw(xmm0, eax, 0x1); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l5b0); + test(M, 0x1); + jle(l5d0, T_NEAR); + mov(al, byte[A1-0x80]); + mov(byte[B-0x80], al); + mov(al, byte[A2-0x80]); + mov(byte[B-0x7f], al); + sub(B, -2); + align(4); + +L(l5d0); + sub(N, 0x2); + cmp(N, 0x2); + jge(l4d8, T_NEAR); + align(4); + +L(l5de); + cmp(N, 0x1); + jl(l698, T_NEAR); + align(4); + +L(l5e8); + mov(A1, A); + add(A, LDA); + mov(I, M); + sar(I, 0x4); + jle(l614, T_NEAR); + align(4); + +L(l5f8); + movdqu(xmm0, xword[A1-0x80]); + sub(A1, -16); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + dec(I); + jg(l5f8, T_NEAR); + align(4); + +L(l614); + test(M, 0x8); + jle(l634, T_NEAR); + movq(xmm0, qword[A1-0x80]); + sub(A1, -8); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l634); + test(M, 0x4); + jle(l654, T_NEAR); + movd(xmm0, dword[A1-0x80]); + sub(A1, -4); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l654); + test(M, 0x2); + jle(l670, T_NEAR); + mov(ax, word[A1-0x80]); + mov(word[B-0x80], ax); + sub(A1, -2); + sub(B, -2); + align(4); + +L(l670); + test(M, 0x1); + jle(l688, T_NEAR); + mov(al, byte[A1-0x80]); + mov(byte[B-0x80], al); + sub(B, -1); + align(4); + +L(l688); + sub(N, 0x1); + cmp(N, 0x1); + jge(l5e8, T_NEAR); + align(4); + +L(l698); + + postamble(); +} +outLocalLabel(); + +#undef M +#undef N +#undef A +#undef LDA +#undef ALPHA +#undef B +#undef I +#undef A1 +#undef A2 +#undef LDA3 +#ifdef _WIN32 +#undef ARG_ALPHA +#undef ARG_B 
+#endif +} + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_bt_kern.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_bt_kern.cpp new file mode 100644 index 0000000..53e99d9 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_bt_kern.cpp @@ -0,0 +1,501 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "jit_generator.hpp" +#include "common.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +jit_avx512_core_u8_copy_bt_kern::jit_avx512_core_u8_copy_bt_kern(): jit_generator(nullptr, GEMM_CODE_SIZE) +{ + +#ifndef _WIN32 +#define M rdi +#define N rsi +#define A rdx +#define LDA rcx +#define ALPHA r8 +#define B r9 + +#define I rax +#define A1 r10 +#define A2 r8 +#define LDA3 r11 + +#else + +#define M rcx +#define N rdx +#define A r8 +#define LDA r9 +#define ALPHA rax +#define B rdi + +#define I rax +#define A1 rsi +#define A2 r10 +#define LDA3 r11 + +#define ARG_ALPHA 40+stacksize+rsp +#define ARG_B 48+stacksize+rsp + +#endif + +inLocalLabel(); +{ + +Xbyak::Label l120; +Xbyak::Label l14c; +Xbyak::Label l168; +Xbyak::Label l178; +Xbyak::Label l184; +Xbyak::Label l194; +Xbyak::Label l20; +Xbyak::Label l20c; +Xbyak::Label l250; +Xbyak::Label l27c; +Xbyak::Label l298; +Xbyak::Label l2a8; +Xbyak::Label l2b4; +Xbyak::Label l2c8; +Xbyak::Label l34; +Xbyak::Label l360; +Xbyak::Label l3b4; +Xbyak::Label l3e8; +Xbyak::Label l400; +Xbyak::Label l40e; +Xbyak::Label l418; +Xbyak::Label l428; +Xbyak::Label l4a0; +Xbyak::Label l4e8; +Xbyak::Label l50c; +Xbyak::Label l524; +Xbyak::Label l534; +Xbyak::Label lcc; + + preamble(); +#ifdef _WIN32 + auto stacksize = get_size_of_abi_save_regs(); + mov(ALPHA, ptr[ARG_ALPHA]); + mov(B, ptr[ARG_B]); +#endif + + mov(M, qword[M]); + mov(N, qword[N]); + mov(LDA, qword[LDA]); + lea(LDA3, ptr[LDA+LDA*2]); + sub(A, -128); + sub(B, -128); + cmp(N, 0x8); + jl(l178, T_NEAR); + align(4); + +L(l20); + mov(A1, A); + add(A, 0x8); + mov(I, M); + sar(I, 0x3); + jle(lcc, T_NEAR); + align(4); + +L(l34); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + movq(xmm1, qword[A1-0x80]); + add(A1, LDA); + movq(xmm2, qword[A1-0x80]); + add(A1, LDA); + movq(xmm3, qword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm1, xmm2); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm1); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + movq(xmm1, qword[A1-0x80]); + add(A1, LDA); + movq(xmm2, qword[A1-0x80]); + add(A1, LDA); + movq(xmm3, qword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm1, xmm2); + movdqu(xword[B-0x60], xmm0); + movdqu(xword[B-0x50], xmm1); + 
sub(B, -64); + dec(I); + jg(l34, T_NEAR); + align(4); + +L(lcc); + test(M, 0x4); + jle(l120, T_NEAR); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + movq(xmm1, qword[A1-0x80]); + add(A1, LDA); + movq(xmm2, qword[A1-0x80]); + add(A1, LDA); + movq(xmm3, qword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm1, xmm2); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm1); + sub(B, -32); + align(4); + +L(l120); + test(M, 0x2); + jle(l14c, T_NEAR); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + movq(xmm1, qword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l14c); + test(M, 0x1); + jle(l168, T_NEAR); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l168); + sub(N, 0x8); + cmp(N, 0x8); + jge(l20, T_NEAR); + align(4); + +L(l178); + cmp(N, 0x4); + jl(l2a8, T_NEAR); + align(4); + +L(l184); + mov(A1, A); + add(A, 0x4); + mov(I, M); + sar(I, 0x3); + jle(l20c, T_NEAR); + align(4); + +L(l194); + movd(xmm0, dword[A1-0x80]); + add(A1, LDA); + movd(xmm1, dword[A1-0x80]); + add(A1, LDA); + movd(xmm2, dword[A1-0x80]); + add(A1, LDA); + movd(xmm3, dword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + movdqu(xword[B-0x80], xmm0); + movd(xmm0, dword[A1-0x80]); + add(A1, LDA); + movd(xmm1, dword[A1-0x80]); + add(A1, LDA); + movd(xmm2, dword[A1-0x80]); + add(A1, LDA); + movd(xmm3, dword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + movdqu(xword[B-0x70], xmm0); + sub(B, -32); + dec(I); + jg(l194, T_NEAR); + align(4); + +L(l20c); + test(M, 0x4); + jle(l250, T_NEAR); + movd(xmm0, dword[A1-0x80]); + add(A1, LDA); + movd(xmm1, dword[A1-0x80]); + add(A1, LDA); + movd(xmm2, dword[A1-0x80]); + add(A1, LDA); + movd(xmm3, dword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l250); + test(M, 0x2); + jle(l27c, T_NEAR); + movd(xmm0, dword[A1-0x80]); + add(A1, LDA); + movd(xmm1, dword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l27c); + test(M, 0x1); + jle(l298, T_NEAR); + movd(xmm0, dword[A1-0x80]); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l298); + sub(N, 0x4); + cmp(N, 0x4); + jge(l184, T_NEAR); + align(4); + +L(l2a8); + cmp(N, 0x2); + jl(l40e, T_NEAR); + align(4); + +L(l2b4); + mov(A1, A); + add(A, 0x2); + mov(LDA3, M); + sar(LDA3, 0x3); + jle(l360, T_NEAR); + align(4); + +L(l2c8); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm1, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm2, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm3, eax, 0x0); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm1, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm2, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm3, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm4, eax, 0x0); + punpcklbw(xmm1, xmm2); + punpcklbw(xmm3, xmm4); + punpcklwd(xmm1, xmm3); + punpcklqdq(xmm0, xmm1); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + dec(LDA3); + jg(l2c8, T_NEAR); + align(4); + +L(l360); + 
test(M, 0x4); + jle(l3b4, T_NEAR); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm1, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm2, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm3, eax, 0x0); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l3b4); + test(M, 0x2); + jle(l3e8, T_NEAR); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm1, eax, 0x0); + punpcklbw(xmm0, xmm1); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l3e8); + test(M, 0x1); + jle(l400, T_NEAR); + mov(ax, word[A1-0x80]); + mov(word[B-0x80], ax); + sub(B, -2); + align(4); + +L(l400); + sub(N, 0x2); + cmp(N, 0x2); + jge(l2b4, T_NEAR); + align(4); + +L(l40e); + cmp(N, 0x1); + jl(l534, T_NEAR); + align(4); + +L(l418); + mov(A1, A); + add(A, 0x1); + mov(LDA3, M); + sar(LDA3, 0x3); + jle(l4a0, T_NEAR); + align(4); + +L(l428); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x3); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x4); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x5); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x6); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x7); + movq(qword[B-0x80], xmm0); + sub(B, -8); + dec(LDA3); + jg(l428, T_NEAR); + align(4); + +L(l4a0); + test(M, 0x4); + jle(l4e8, T_NEAR); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x3); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l4e8); + test(M, 0x2); + jle(l50c, T_NEAR); + mov(al, byte[A1-0x80]); + add(A1, LDA); + mov(byte[B-0x80], al); + mov(al, byte[A1-0x80]); + add(A1, LDA); + mov(byte[B-0x7f], al); + sub(B, -2); + align(4); + +L(l50c); + test(M, 0x1); + jle(l524, T_NEAR); + mov(al, byte[A1-0x80]); + mov(byte[B-0x80], al); + sub(B, -1); + align(4); + +L(l524); + sub(N, 0x1); + cmp(N, 0x1); + jge(l418, T_NEAR); + align(4); + +L(l534); + + postamble(); +} +outLocalLabel(); + +#undef M +#undef N +#undef A +#undef LDA +#undef ALPHA +#undef B +#undef I +#undef A1 +#undef A2 +#undef LDA3 +#ifdef _WIN32 +#undef ARG_ALPHA +#undef ARG_B +#endif +} + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_sum_an_kern.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_sum_an_kern.cpp new file mode 100644 index 0000000..49a312f --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_sum_an_kern.cpp @@ -0,0 +1,1283 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "jit_generator.hpp" +#include "common.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +jit_avx512_core_u8_copy_sum_an_kern::jit_avx512_core_u8_copy_sum_an_kern(): jit_generator(nullptr, GEMM_CODE_SIZE) +{ + +#ifndef _WIN32 +#define M rdi +#define N rsi +#define A rdx +#define LDA rcx +#define ALPHA r8 +#define B r9 + +#define I rax +#define A1 r10 +#define A2 r8 +#define LDA3 r11 + +#define ARG_BIAS 24+stacksize+rsp + +#else + +#define M rcx +#define N rdx +#define A r8 +#define LDA r9 +#define ALPHA rax +#define B rdi + +#define I rax +#define A1 rsi +#define A2 r10 +#define LDA3 r11 + +#define ARG_ALPHA 40+stacksize+rsp +#define ARG_B 48+stacksize+rsp +#define ARG_BIAS 72+stacksize+rsp + +#endif + +inLocalLabel(); +{ + +Xbyak::Label l1024; +Xbyak::Label l1090; +Xbyak::Label l10d4; +Xbyak::Label l10fc; +Xbyak::Label l111a; +Xbyak::Label l1124; +Xbyak::Label l113c; +Xbyak::Label l11d4; +Xbyak::Label l1234; +Xbyak::Label l1278; +Xbyak::Label l129c; +Xbyak::Label l12bc; +Xbyak::Label l20; +Xbyak::Label l2a0; +Xbyak::Label l3c0; +Xbyak::Label l438; +Xbyak::Label l480; +Xbyak::Label l48c; +Xbyak::Label l4c8; +Xbyak::Label l5c; +Xbyak::Label l6a8; +Xbyak::Label l7b4; +Xbyak::Label l850; +Xbyak::Label l89c; +Xbyak::Label l8a8; +Xbyak::Label l8d0; +Xbyak::Label l9d0; +Xbyak::Label la64; +Xbyak::Label lab8; +Xbyak::Label lae8; +Xbyak::Label laf4; +Xbyak::Label lb14; +Xbyak::Label lc30; +Xbyak::Label lcc8; +Xbyak::Label ld1c; +Xbyak::Label ld54; +Xbyak::Label ld78; +Xbyak::Label ld84; +Xbyak::Label ld9c; +Xbyak::Label le58; +Xbyak::Label lebc; +Xbyak::Label lef8; +Xbyak::Label lf1c; +Xbyak::Label lf3c; +Xbyak::Label lf48; +Xbyak::Label lf60; + + preamble(); + auto stacksize = get_size_of_abi_save_regs(); +#ifdef _WIN32 + mov(ALPHA, ptr[ARG_ALPHA]); + mov(B, ptr[ARG_B]); +#endif + + mov(M, qword[M]); + mov(N, qword[N]); + mov(LDA, qword[LDA]); + lea(LDA3, ptr[LDA+LDA*2]); + sub(A, -128); + sub(B, -128); + cmp(N, 0x30); + jl(l480, T_NEAR); + align(4); + +L(l20); + mov(A1, A); + add(A, 0x30); + vxorps(ymm8, ymm8, ymm8); + vxorps(ymm9, ymm9, ymm9); + vxorps(ymm10, ymm10, ymm10); + vxorps(ymm11, ymm11, ymm11); + vxorps(ymm12, ymm12, ymm12); + vxorps(ymm13, ymm13, ymm13); + vxorps(ymm14, ymm14, ymm14); + vxorps(ymm15, ymm15, ymm15); + mov(I, M); + sar(I, 0x2); + jle(l2a0, T_NEAR); + align(4); + +L(l5c); + vmovdqu(xmm0, xword[A1-0x80]); + vmovdqu(xmm1, xword[A1+LDA*1-0x80]); + vmovdqu(xmm2, xword[A1+LDA*2-0x80]); + vmovdqu(xmm3, xword[A1+LDA3*1-0x80]); + vpunpcklbw(xmm4, xmm0, xmm1); + vpunpckhbw(xmm5, xmm0, xmm1); + vpunpcklbw(xmm6, xmm2, xmm3); + vpunpckhbw(xmm7, xmm2, xmm3); + vpunpcklwd(xmm0, xmm4, xmm6); + vpunpckhwd(xmm1, xmm4, xmm6); + vpunpcklwd(xmm2, xmm5, xmm7); + vpunpckhwd(xmm3, xmm5, xmm7); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm1); + vmovhlps(xmm7, xmm1, xmm1); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm8, 
ymm8, ymm5); + vmovdqu(xword[B-0x80], xmm0); + vmovdqu(xword[B-0x70], xmm1); + vpmovsxbw(ymm5, xmm2); + vmovhlps(xmm6, xmm2, xmm2); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm3); + vmovhlps(xmm7, xmm3, xmm3); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm9, ymm9, ymm5); + vmovdqu(xword[B-0x60], xmm2); + vmovdqu(xword[B-0x50], xmm3); + vmovdqu(xmm0, xword[A1-0x70]); + vmovdqu(xmm1, xword[A1+LDA*1-0x70]); + vmovdqu(xmm2, xword[A1+LDA*2-0x70]); + vmovdqu(xmm3, xword[A1+LDA3*1-0x70]); + vpunpcklbw(xmm4, xmm0, xmm1); + vpunpckhbw(xmm5, xmm0, xmm1); + vpunpcklbw(xmm6, xmm2, xmm3); + vpunpckhbw(xmm7, xmm2, xmm3); + vpunpcklwd(xmm0, xmm4, xmm6); + vpunpckhwd(xmm1, xmm4, xmm6); + vpunpcklwd(xmm2, xmm5, xmm7); + vpunpckhwd(xmm3, xmm5, xmm7); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm1); + vmovhlps(xmm7, xmm1, xmm1); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm10, ymm10, ymm5); + vmovdqu(xword[B-0x40], xmm0); + vmovdqu(xword[B-0x30], xmm1); + vpmovsxbw(ymm5, xmm2); + vmovhlps(xmm6, xmm2, xmm2); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm3); + vmovhlps(xmm7, xmm3, xmm3); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm11, ymm11, ymm5); + vmovdqu(xword[B-0x20], xmm2); + vmovdqu(xword[B-0x10], xmm3); + vmovdqu(xmm0, xword[A1-0x60]); + vmovdqu(xmm1, xword[A1+LDA*1-0x60]); + vmovdqu(xmm2, xword[A1+LDA*2-0x60]); + vmovdqu(xmm3, xword[A1+LDA3*1-0x60]); + lea(A1, ptr[A1+LDA*4]); + vpunpcklbw(xmm4, xmm0, xmm1); + vpunpckhbw(xmm5, xmm0, xmm1); + vpunpcklbw(xmm6, xmm2, xmm3); + vpunpckhbw(xmm7, xmm2, xmm3); + vpunpcklwd(xmm0, xmm4, xmm6); + vpunpckhwd(xmm1, xmm4, xmm6); + vpunpcklwd(xmm2, xmm5, xmm7); + vpunpckhwd(xmm3, xmm5, xmm7); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm1); + vmovhlps(xmm7, xmm1, xmm1); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm12, ymm12, ymm5); + vmovdqu(xword[B], xmm0); + vmovdqu(xword[B+0x10], xmm1); + vpmovsxbw(ymm5, xmm2); + vmovhlps(xmm6, xmm2, xmm2); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm3); + vmovhlps(xmm7, xmm3, xmm3); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm13, ymm13, ymm5); + vmovdqu(xword[B+0x20], xmm2); + vmovdqu(xword[B+0x30], xmm3); + sub(B, -192); + dec(I); + jg(l5c, T_NEAR); + align(4); + +L(l2a0); + test(M, 0x2); + jle(l3c0, T_NEAR); + vmovdqu(xmm0, xword[A1-0x80]); + vmovdqu(xmm1, xword[A1-0x70]); + vmovdqu(xmm2, xword[A1-0x60]); + add(A1, LDA); + vmovdqu(xmm6, xword[A1-0x80]); + vmovdqu(xmm4, xword[A1-0x70]); + vmovdqu(xmm5, xword[A1-0x60]); + add(A1, LDA); + vpunpcklbw(xmm3, xmm0, xmm6); + vpunpckhbw(xmm0, xmm0, xmm6); + vpmovsxbw(ymm7, xmm3); + vmovhlps(xmm6, xmm3, xmm3); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm7, ymm7, ymm6); + vpmovsxwd(ymm7, xmm7); + vpaddd(ymm8, ymm8, ymm7); + vmovdqu(xword[B-0x80], xmm3); + vpmovsxbw(ymm7, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm7, ymm7, ymm6); + vpmovsxwd(ymm7, xmm7); + vpaddd(ymm9, ymm9, ymm7); + vmovdqu(xword[B-0x70], xmm0); + vpunpcklbw(xmm3, xmm1, 
xmm4); + vpunpckhbw(xmm0, xmm1, xmm4); + vpmovsxbw(ymm7, xmm3); + vmovhlps(xmm6, xmm3, xmm3); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm7, ymm7, ymm6); + vpmovsxwd(ymm7, xmm7); + vpaddd(ymm10, ymm10, ymm7); + vmovdqu(xword[B-0x60], xmm3); + vpmovsxbw(ymm7, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm7, ymm7, ymm6); + vpmovsxwd(ymm7, xmm7); + vpaddd(ymm11, ymm11, ymm7); + vmovdqu(xword[B-0x50], xmm0); + vpunpcklbw(xmm3, xmm2, xmm5); + vpunpckhbw(xmm0, xmm2, xmm5); + vpmovsxbw(ymm7, xmm3); + vmovhlps(xmm6, xmm3, xmm3); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm7, ymm7, ymm6); + vpmovsxwd(ymm7, xmm7); + vpaddd(ymm12, ymm12, ymm7); + vmovdqu(xword[B-0x40], xmm3); + vpmovsxbw(ymm7, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm7, ymm7, ymm6); + vpmovsxwd(ymm7, xmm7); + vpaddd(ymm13, ymm13, ymm7); + vmovdqu(xword[B-0x30], xmm0); + sub(B, -96); + align(4); + +L(l3c0); + test(M, 0x1); + jle(l438, T_NEAR); + vmovdqu(xmm0, xword[A1-0x80]); + vmovdqu(xmm1, xword[A1-0x70]); + vmovdqu(xmm2, xword[A1-0x60]); + add(A1, LDA); + vpmovsxbd(ymm7, xmm0); + vpaddd(ymm8, ymm8, ymm7); + vmovhlps(xmm7, xmm0, xmm0); + vpmovsxbd(ymm7, xmm7); + vpaddd(ymm9, ymm9, ymm7); + vmovdqu(xword[B-0x80], xmm0); + vpmovsxbd(ymm7, xmm1); + vpaddd(ymm10, ymm10, ymm7); + vmovhlps(xmm7, xmm1, xmm1); + vpmovsxbd(ymm7, xmm7); + vpaddd(ymm11, ymm11, ymm7); + vmovdqu(xword[B-0x70], xmm1); + vpmovsxbd(ymm7, xmm2); + vpaddd(ymm12, ymm12, ymm7); + vmovhlps(xmm7, xmm2, xmm2); + vpmovsxbd(ymm7, xmm7); + vpaddd(ymm13, ymm13, ymm7); + vmovdqu(xword[B-0x60], xmm2); + sub(B, -48); + align(4); + +L(l438); + mov(A1, qword[ARG_BIAS]); + vmovdqu(yword[A1], ymm8); + vmovdqu(yword[A1+0x20], ymm9); + vmovdqu(yword[A1+0x40], ymm10); + vmovdqu(yword[A1+0x60], ymm11); + vmovdqu(yword[A1+0x80], ymm12); + vmovdqu(yword[A1+0xa0], ymm13); + add(qword[ARG_BIAS], 0xc0); + sub(N, 0x30); + cmp(N, 0x30); + jge(l20, T_NEAR); + vzeroupper(); + align(4); + +L(l480); + cmp(N, 0x20); + jl(l89c, T_NEAR); + align(4); + +L(l48c); + mov(A1, A); + add(A, 0x20); + pxor(xmm8, xmm8); + pxor(xmm9, xmm9); + pxor(xmm10, xmm10); + pxor(xmm11, xmm11); + pxor(xmm12, xmm12); + pxor(xmm13, xmm13); + pxor(xmm14, xmm14); + pxor(xmm15, xmm15); + mov(I, M); + sar(I, 0x2); + jle(l6a8, T_NEAR); + align(4); + +L(l4c8); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1+LDA*1-0x80]); + movdqu(xmm2, xword[A1+LDA*2-0x80]); + movdqu(xmm3, xword[A1+LDA3*1-0x80]); + movdqa(xmm4, xmm0); + punpcklbw(xmm0, xmm1); + punpckhbw(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpcklbw(xmm2, xmm3); + punpckhbw(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm1, xmm2); + movdqa(xmm2, xmm4); + punpcklwd(xmm4, xmm5); + punpckhwd(xmm2, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x80], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x70], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + movdqu(xword[B-0x60], xmm4); + pmovsxbw(xmm5, xmm2); + movhlps(xmm6, xmm2); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm11, xmm5); + movdqu(xword[B-0x50], xmm2); + movdqu(xmm0, xword[A1-0x70]); + 
movdqu(xmm1, xword[A1+LDA*1-0x70]); + movdqu(xmm2, xword[A1+LDA*2-0x70]); + movdqu(xmm3, xword[A1+LDA3*1-0x70]); + lea(A1, ptr[A1+LDA*4]); + movdqa(xmm4, xmm0); + punpcklbw(xmm0, xmm1); + punpckhbw(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpcklbw(xmm2, xmm3); + punpckhbw(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm1, xmm2); + movdqa(xmm2, xmm4); + punpcklwd(xmm4, xmm5); + punpckhwd(xmm2, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm12, xmm5); + movdqu(xword[B-0x40], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm13, xmm5); + movdqu(xword[B-0x30], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm14, xmm5); + movdqu(xword[B-0x20], xmm4); + pmovsxbw(xmm5, xmm2); + movhlps(xmm6, xmm2); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm15, xmm5); + movdqu(xword[B-0x10], xmm2); + sub(B, -128); + dec(I); + jg(l4c8, T_NEAR); + align(4); + +L(l6a8); + test(M, 0x2); + jle(l7b4, T_NEAR); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1-0x70]); + add(A1, LDA); + movdqu(xmm2, xword[A1-0x80]); + movdqu(xmm3, xword[A1-0x70]); + add(A1, LDA); + movdqa(xmm4, xmm0); + punpcklbw(xmm0, xmm2); + punpckhbw(xmm4, xmm2); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm6, xmm6); + pmovsxwd(xmm6, xmm6); + paddd(xmm9, xmm6); + movdqu(xword[B-0x80], xmm0); + pmovsxbw(xmm5, xmm4); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm6, xmm6); + pmovsxwd(xmm6, xmm6); + paddd(xmm11, xmm6); + movdqu(xword[B-0x70], xmm4); + movdqa(xmm4, xmm1); + punpcklbw(xmm1, xmm3); + punpckhbw(xmm4, xmm3); + pmovsxbw(xmm5, xmm1); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm12, xmm5); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm6, xmm6); + pmovsxwd(xmm6, xmm6); + paddd(xmm13, xmm6); + movdqu(xword[B-0x60], xmm1); + pmovsxbw(xmm5, xmm4); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm14, xmm5); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm6, xmm6); + pmovsxwd(xmm6, xmm6); + paddd(xmm15, xmm6); + movdqu(xword[B-0x50], xmm4); + sub(B, -64); + align(4); + +L(l7b4); + test(M, 0x1); + jle(l850, T_NEAR); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1-0x70]); + add(A1, LDA); + pmovsxbd(xmm5, xmm0); + paddd(xmm8, xmm5); + pshufd(xmm6, xmm0, 0x55); + pmovsxbd(xmm6, xmm6); + paddd(xmm9, xmm6); + pshufd(xmm5, xmm0, 0xaa); + pmovsxbd(xmm5, xmm5); + paddd(xmm10, xmm5); + pshufd(xmm6, xmm0, 0xff); + pmovsxbd(xmm6, xmm6); + paddd(xmm11, xmm6); + movdqu(xword[B-0x80], xmm0); + pmovsxbd(xmm5, xmm1); + paddd(xmm12, xmm5); + pshufd(xmm6, xmm1, 0x55); + pmovsxbd(xmm6, xmm6); + paddd(xmm13, xmm6); + pshufd(xmm5, xmm1, 0xaa); + pmovsxbd(xmm5, xmm5); + paddd(xmm14, xmm5); + pshufd(xmm6, xmm1, 0xff); + pmovsxbd(xmm6, xmm6); + paddd(xmm15, xmm6); + movdqu(xword[B-0x70], xmm1); + sub(B, -32); + align(4); + +L(l850); + mov(A1, qword[ARG_BIAS]); + movdqu(xword[A1], xmm8); + movdqu(xword[A1+0x10], xmm9); + movdqu(xword[A1+0x20], xmm10); + movdqu(xword[A1+0x30], xmm11); + movdqu(xword[A1+0x40], xmm12); + 
movdqu(xword[A1+0x50], xmm13); + movdqu(xword[A1+0x60], xmm14); + movdqu(xword[A1+0x70], xmm15); + add(qword[ARG_BIAS], 0x80); + sub(N, 0x20); + cmp(N, 0x20); + jge(l48c, T_NEAR); + align(4); + +L(l89c); + cmp(N, 0x10); + jl(lae8, T_NEAR); + align(4); + +L(l8a8); + mov(A1, A); + add(A, 0x10); + pxor(xmm8, xmm8); + pxor(xmm9, xmm9); + pxor(xmm10, xmm10); + pxor(xmm11, xmm11); + mov(I, M); + sar(I, 0x2); + jle(l9d0, T_NEAR); + align(4); + +L(l8d0); + movdqu(xmm0, xword[A1-0x80]); + add(A1, LDA); + movdqu(xmm1, xword[A1-0x80]); + add(A1, LDA); + movdqu(xmm2, xword[A1-0x80]); + add(A1, LDA); + movdqu(xmm3, xword[A1-0x80]); + add(A1, LDA); + movdqa(xmm4, xmm0); + punpcklbw(xmm0, xmm1); + punpckhbw(xmm4, xmm1); + movdqa(xmm1, xmm2); + punpcklbw(xmm2, xmm3); + punpckhbw(xmm1, xmm3); + movdqa(xmm3, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm3, xmm2); + movdqa(xmm2, xmm4); + punpcklwd(xmm4, xmm1); + punpckhwd(xmm2, xmm1); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + pmovsxbw(xmm5, xmm3); + movhlps(xmm6, xmm3); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm3); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + pmovsxbw(xmm5, xmm2); + movhlps(xmm6, xmm2); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm11, xmm5); + movdqu(xword[B-0x60], xmm4); + movdqu(xword[B-0x50], xmm2); + sub(B, -64); + dec(I); + jg(l8d0, T_NEAR); + align(4); + +L(l9d0); + test(M, 0x2); + jle(la64, T_NEAR); + movdqu(xmm0, xword[A1-0x80]); + add(A1, LDA); + movdqu(xmm1, xword[A1-0x80]); + add(A1, LDA); + movdqa(xmm2, xmm0); + punpcklbw(xmm0, xmm1); + punpckhbw(xmm2, xmm1); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm6, xmm6); + pmovsxwd(xmm6, xmm6); + paddd(xmm9, xmm6); + pmovsxbw(xmm5, xmm2); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + movhlps(xmm6, xmm2); + pmovsxbw(xmm6, xmm6); + phaddw(xmm6, xmm6); + pmovsxwd(xmm6, xmm6); + paddd(xmm11, xmm6); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm2); + sub(B, -32); + align(4); + +L(la64); + test(M, 0x1); + jle(lab8, T_NEAR); + movdqu(xmm0, xword[A1-0x80]); + add(A1, LDA); + pmovsxbd(xmm5, xmm0); + paddd(xmm8, xmm5); + pshufd(xmm6, xmm0, 0x55); + pmovsxbd(xmm6, xmm6); + paddd(xmm9, xmm6); + pshufd(xmm5, xmm0, 0xaa); + pmovsxbd(xmm5, xmm5); + paddd(xmm10, xmm5); + pshufd(xmm6, xmm0, 0xff); + pmovsxbd(xmm6, xmm6); + paddd(xmm11, xmm6); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(lab8); + mov(A1, qword[ARG_BIAS]); + movdqu(xword[A1], xmm8); + movdqu(xword[A1+0x10], xmm9); + movdqu(xword[A1+0x20], xmm10); + movdqu(xword[A1+0x30], xmm11); + add(qword[ARG_BIAS], 0x40); + sub(N, 0x10); + cmp(N, 0x10); + jge(l8a8, T_NEAR); + align(4); + +L(lae8); + cmp(N, 0x8); + jl(ld78, T_NEAR); + align(4); + +L(laf4); + mov(A1, A); + add(A, 0x8); + pxor(xmm8, xmm8); + pxor(xmm9, xmm9); + mov(I, M); + sar(I, 0x3); + jle(lc30, T_NEAR); + align(4); + +L(lb14); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + movq(xmm1, qword[A1-0x80]); + add(A1, LDA); + movq(xmm2, qword[A1-0x80]); + add(A1, LDA); + movq(xmm3, qword[A1-0x80]); + add(A1, LDA); + 
punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm1); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + movq(xmm1, qword[A1-0x80]); + add(A1, LDA); + movq(xmm2, qword[A1-0x80]); + add(A1, LDA); + movq(xmm3, qword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x60], xmm0); + movdqu(xword[B-0x50], xmm1); + sub(B, -64); + dec(I); + jg(lb14, T_NEAR); + align(4); + +L(lc30); + test(M, 0x4); + jle(lcc8, T_NEAR); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + movq(xmm1, qword[A1-0x80]); + add(A1, LDA); + movq(xmm2, qword[A1-0x80]); + add(A1, LDA); + movq(xmm3, qword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm1); + sub(B, -32); + align(4); + +L(lcc8); + test(M, 0x2); + jle(ld1c, T_NEAR); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + movq(xmm1, qword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm6, xmm6); + pmovsxwd(xmm6, xmm6); + paddd(xmm9, xmm6); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(ld1c); + test(M, 0x1); + jle(ld54, T_NEAR); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + pmovsxbd(xmm5, xmm0); + pshufd(xmm6, xmm0, 0x55); + pmovsxbd(xmm6, xmm6); + paddd(xmm8, xmm5); + paddd(xmm9, xmm6); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(ld54); + mov(A1, qword[ARG_BIAS]); + movdqu(xword[A1], xmm8); + movdqu(xword[A1+0x10], xmm9); + add(qword[ARG_BIAS], 0x20); + sub(N, 0x8); + cmp(N, 0x8); + jge(laf4, T_NEAR); + align(4); + +L(ld78); + cmp(N, 0x4); + jl(lf3c, T_NEAR); + align(4); + +L(ld84); + mov(A1, A); + add(A, 0x4); + pxor(xmm7, xmm7); + mov(I, M); + sar(I, 0x3); + jle(le58, T_NEAR); + align(4); + +L(ld9c); + movd(xmm0, dword[A1-0x80]); + add(A1, LDA); + movd(xmm1, dword[A1-0x80]); + add(A1, LDA); + movd(xmm2, dword[A1-0x80]); + add(A1, LDA); + movd(xmm3, dword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x80], xmm0); + 
movd(xmm0, dword[A1-0x80]); + add(A1, LDA); + movd(xmm1, dword[A1-0x80]); + add(A1, LDA); + movd(xmm2, dword[A1-0x80]); + add(A1, LDA); + movd(xmm3, dword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x70], xmm0); + sub(B, -32); + dec(I); + jg(ld9c, T_NEAR); + align(4); + +L(le58); + test(M, 0x4); + jle(lebc, T_NEAR); + movd(xmm0, dword[A1-0x80]); + add(A1, LDA); + movd(xmm1, dword[A1-0x80]); + add(A1, LDA); + movd(xmm2, dword[A1-0x80]); + add(A1, LDA); + movd(xmm3, dword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(lebc); + test(M, 0x2); + jle(lef8, T_NEAR); + movd(xmm0, dword[A1-0x80]); + add(A1, LDA); + movd(xmm1, dword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(lef8); + test(M, 0x1); + jle(lf1c, T_NEAR); + movd(xmm0, dword[A1-0x80]); + pmovsxbd(xmm5, xmm0); + paddd(xmm7, xmm5); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(lf1c); + mov(A1, qword[ARG_BIAS]); + movdqu(xword[A1], xmm7); + add(qword[ARG_BIAS], 0x10); + sub(N, 0x4); + cmp(N, 0x4); + jge(ld84, T_NEAR); + align(4); + +L(lf3c); + cmp(N, 0x2); + jl(l111a, T_NEAR); + align(4); + +L(lf48); + mov(A1, A); + add(A, 0x2); + pxor(xmm7, xmm7); + mov(LDA3, M); + sar(LDA3, 0x3); + jle(l1024, T_NEAR); + align(4); + +L(lf60); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm1, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm2, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm3, eax, 0x0); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm1, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm2, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm3, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm4, eax, 0x0); + punpcklbw(xmm1, xmm2); + punpcklbw(xmm3, xmm4); + punpcklwd(xmm1, xmm3); + punpcklqdq(xmm0, xmm1); + pshufd(xmm6, xmm0, 0xd8); + pmovsxbw(xmm5, xmm6); + movhlps(xmm6, xmm6); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + dec(LDA3); + jg(lf60, T_NEAR); + align(4); + +L(l1024); + test(M, 0x4); + jle(l1090, T_NEAR); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm1, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm2, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm3, eax, 0x0); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l1090); + test(M, 0x2); + jle(l10d4, T_NEAR); + mov(ax, word[A1-0x80]); + 
add(A1, LDA); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm1, eax, 0x0); + punpcklbw(xmm0, xmm1); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l10d4); + test(M, 0x1); + jle(l10fc, T_NEAR); + mov(ax, word[A1-0x80]); + pinsrw(xmm0, eax, 0x0); + pmovsxbd(xmm5, xmm0); + paddd(xmm7, xmm5); + mov(word[B-0x80], ax); + sub(B, -2); + align(4); + +L(l10fc); + mov(A1, qword[ARG_BIAS]); + movq(qword[A1], xmm7); + add(qword[ARG_BIAS], 0x8); + sub(N, 0x2); + cmp(N, 0x2); + jge(lf48, T_NEAR); + align(4); + +L(l111a); + cmp(N, 0x1); + jl(l12bc, T_NEAR); + align(4); + +L(l1124); + mov(A1, A); + add(A, 0x1); + pxor(xmm7, xmm7); + mov(LDA3, M); + sar(LDA3, 0x3); + jle(l11d4, T_NEAR); + align(4); + +L(l113c); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x3); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x4); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x5); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x6); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x7); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movq(qword[B-0x80], xmm0); + sub(B, -8); + dec(LDA3); + jg(l113c, T_NEAR); + align(4); + +L(l11d4); + test(M, 0x4); + jle(l1234, T_NEAR); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x3); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l1234); + test(M, 0x2); + jle(l1278, T_NEAR); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x0); + mov(byte[B-0x80], al); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x1); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + mov(byte[B-0x7f], al); + sub(B, -2); + align(4); + +L(l1278); + test(M, 0x1); + jle(l129c, T_NEAR); + mov(al, byte[A1-0x80]); + pinsrw(xmm0, eax, 0x0); + pmovsxbd(xmm5, xmm0); + paddd(xmm7, xmm5); + mov(byte[B-0x80], al); + sub(B, -1); + align(4); + +L(l129c); + mov(A1, qword[ARG_BIAS]); + movd(dword[A1], xmm7); + add(qword[ARG_BIAS], 0x4); + sub(N, 0x1); + cmp(N, 0x1); + jge(l1124, T_NEAR); + align(4); + +L(l12bc); + + postamble(); +} +outLocalLabel(); + +#undef M +#undef N +#undef A +#undef LDA +#undef ALPHA +#undef B +#undef I +#undef A1 +#undef A2 +#undef LDA3 +#ifdef _WIN32 +#undef ARG_ALPHA +#undef ARG_B +#endif +#undef ARG_BIAS +} + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_sum_at_kern.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_sum_at_kern.cpp new file mode 100644 index 0000000..a4f4ff0 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_sum_at_kern.cpp @@ -0,0 +1,3163 @@ +/******************************************************************************* +* Copyright 2018 
Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "jit_generator.hpp" +#include "common.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +jit_avx512_core_u8_copy_sum_at_kern::jit_avx512_core_u8_copy_sum_at_kern(): jit_generator(nullptr, GEMM_CODE_SIZE) +{ + +#ifndef _WIN32 +#define M rdi +#define N rsi +#define A rdx +#define LDA rcx +#define ALPHA r8 +#define B r9 + +#define I rax +#define A1 r10 +#define A2 r8 +#define LDA3 r11 + +#define ARG_BIAS 24+stacksize+rsp + +#else + +#define M rcx +#define N rdx +#define A r8 +#define LDA r9 +#define ALPHA rax +#define B rdi + +#define I rax +#define A1 rsi +#define A2 r10 +#define LDA3 r11 + +#define ARG_ALPHA 40+stacksize+rsp +#define ARG_B 48+stacksize+rsp +#define ARG_BIAS 72+stacksize+rsp + +#endif + +inLocalLabel(); +{ + +Xbyak::Label l1750; +Xbyak::Label l1b6c; +Xbyak::Label l1e14; +Xbyak::Label l20; +Xbyak::Label l2068; +Xbyak::Label l226c; +Xbyak::Label l22b8; +Xbyak::Label l22c4; +Xbyak::Label l22f4; +Xbyak::Label l26b4; +Xbyak::Label l28cc; +Xbyak::Label l2a2c; +Xbyak::Label l2b5c; +Xbyak::Label l2c64; +Xbyak::Label l2c94; +Xbyak::Label l2ca0; +Xbyak::Label l2cc8; +Xbyak::Label l2eac; +Xbyak::Label l2fc0; +Xbyak::Label l3078; +Xbyak::Label l3118; +Xbyak::Label l319c; +Xbyak::Label l31c0; +Xbyak::Label l31cc; +Xbyak::Label l31ec; +Xbyak::Label l32e4; +Xbyak::Label l3378; +Xbyak::Label l33dc; +Xbyak::Label l3434; +Xbyak::Label l347c; +Xbyak::Label l349c; +Xbyak::Label l34a8; +Xbyak::Label l34c8; +Xbyak::Label l3558; +Xbyak::Label l35b0; +Xbyak::Label l35f4; +Xbyak::Label l3638; +Xbyak::Label l366c; +Xbyak::Label l368a; +Xbyak::Label l3694; +Xbyak::Label l36a8; +Xbyak::Label l36ec; +Xbyak::Label l3728; +Xbyak::Label l3760; +Xbyak::Label l3794; +Xbyak::Label l37b8; +Xbyak::Label l37d8; +Xbyak::Label l5cc; +Xbyak::Label l6c; +Xbyak::Label l968; +Xbyak::Label lc80; +Xbyak::Label lf1c; +Xbyak::Label lf64; +Xbyak::Label lf70; +Xbyak::Label lfb4; + + preamble(); + auto stacksize = get_size_of_abi_save_regs(); +#ifdef _WIN32 + mov(ALPHA, ptr[ARG_ALPHA]); + mov(B, ptr[ARG_B]); +#endif + + mov(N, qword[N]); + mov(M, qword[M]); + mov(LDA, qword[LDA]); + sub(A, -128); + sub(B, -128); + lea(LDA3, ptr[LDA+LDA*2]); + cmp(N, 0x30); + jl(lf64, T_NEAR); + align(4); + +L(l20); + mov(A1, A); + mov(I, LDA); + shl(I, 0x5); + lea(I, ptr[I+LDA*8]); + lea(I, ptr[I+LDA*8]); + add(A, I); + vxorps(ymm8, ymm8, ymm8); + vxorps(ymm9, ymm9, ymm9); + vxorps(ymm10, ymm10, ymm10); + vxorps(ymm11, ymm11, ymm11); + vxorps(ymm12, ymm12, ymm12); + vxorps(ymm13, ymm13, ymm13); + vxorps(ymm14, ymm14, ymm14); + vxorps(ymm15, ymm15, ymm15); + mov(I, M); + sar(I, 0x3); + jle(l5cc, T_NEAR); + align(4); + +L(l6c); + vmovq(xmm0, qword[A1-0x80]); + vmovq(xmm1, qword[A1+LDA*1-0x80]); + vmovq(xmm2, qword[A1+LDA*2-0x80]); + vmovq(xmm3, qword[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + vpunpckldq(xmm1, xmm0, xmm1); + vpunpckldq(xmm3, xmm2, xmm3); + vpunpcklqdq(xmm0, 
xmm1, xmm3); + vpunpckhqdq(xmm1, xmm1, xmm3); + vmovdqu(xword[B-0x80], xmm0); + vmovdqu(xword[B+0x40], xmm1); + vmovq(xmm2, qword[A2-0x80]); + vmovq(xmm3, qword[A2+LDA*1-0x80]); + vmovq(xmm4, qword[A2+LDA*2-0x80]); + vmovq(xmm5, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm3, xmm2, xmm3); + vpunpckldq(xmm5, xmm4, xmm5); + vpunpcklqdq(xmm2, xmm3, xmm5); + vpunpckhqdq(xmm3, xmm3, xmm5); + vmovdqu(xword[B-0x70], xmm2); + vmovdqu(xword[B+0x50], xmm3); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm2); + vmovhlps(xmm7, xmm2, xmm2); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm8, ymm8, ymm5); + vpmovsxbw(ymm5, xmm1); + vmovhlps(xmm6, xmm1, xmm1); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm3); + vmovhlps(xmm7, xmm3, xmm3); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm8, ymm8, ymm5); + vmovq(xmm0, qword[A2-0x80]); + vmovq(xmm1, qword[A2+LDA*1-0x80]); + vmovq(xmm2, qword[A2+LDA*2-0x80]); + vmovq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm1, xmm0, xmm1); + vpunpckldq(xmm3, xmm2, xmm3); + vpunpcklqdq(xmm0, xmm1, xmm3); + vpunpckhqdq(xmm1, xmm1, xmm3); + vmovdqu(xword[B-0x60], xmm0); + vmovdqu(xword[B+0x60], xmm1); + vmovq(xmm2, qword[A2-0x80]); + vmovq(xmm3, qword[A2+LDA*1-0x80]); + vmovq(xmm4, qword[A2+LDA*2-0x80]); + vmovq(xmm5, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm3, xmm2, xmm3); + vpunpckldq(xmm5, xmm4, xmm5); + vpunpcklqdq(xmm2, xmm3, xmm5); + vpunpckhqdq(xmm3, xmm3, xmm5); + vmovdqu(xword[B-0x50], xmm2); + vmovdqu(xword[B+0x70], xmm3); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm2); + vmovhlps(xmm7, xmm2, xmm2); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm9, ymm9, ymm5); + vpmovsxbw(ymm5, xmm1); + vmovhlps(xmm6, xmm1, xmm1); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm3); + vmovhlps(xmm7, xmm3, xmm3); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm9, ymm9, ymm5); + vmovq(xmm0, qword[A2-0x80]); + vmovq(xmm1, qword[A2+LDA*1-0x80]); + vmovq(xmm2, qword[A2+LDA*2-0x80]); + vmovq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm1, xmm0, xmm1); + vpunpckldq(xmm3, xmm2, xmm3); + vpunpcklqdq(xmm0, xmm1, xmm3); + vpunpckhqdq(xmm1, xmm1, xmm3); + vmovdqu(xword[B-0x40], xmm0); + vmovdqu(xword[B+0x80], xmm1); + vmovq(xmm2, qword[A2-0x80]); + vmovq(xmm3, qword[A2+LDA*1-0x80]); + vmovq(xmm4, qword[A2+LDA*2-0x80]); + vmovq(xmm5, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm3, xmm2, xmm3); + vpunpckldq(xmm5, xmm4, xmm5); + vpunpcklqdq(xmm2, xmm3, xmm5); + vpunpckhqdq(xmm3, xmm3, xmm5); + vmovdqu(xword[B-0x30], xmm2); + vmovdqu(xword[B+0x90], xmm3); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm2); + vmovhlps(xmm7, xmm2, xmm2); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm10, ymm10, ymm5); + vpmovsxbw(ymm5, xmm1); + vmovhlps(xmm6, xmm1, xmm1); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, 
xmm3); + vmovhlps(xmm7, xmm3, xmm3); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm10, ymm10, ymm5); + vmovq(xmm0, qword[A2-0x80]); + vmovq(xmm1, qword[A2+LDA*1-0x80]); + vmovq(xmm2, qword[A2+LDA*2-0x80]); + vmovq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm1, xmm0, xmm1); + vpunpckldq(xmm3, xmm2, xmm3); + vpunpcklqdq(xmm0, xmm1, xmm3); + vpunpckhqdq(xmm1, xmm1, xmm3); + vmovdqu(xword[B-0x20], xmm0); + vmovdqu(xword[B+0xa0], xmm1); + vmovq(xmm2, qword[A2-0x80]); + vmovq(xmm3, qword[A2+LDA*1-0x80]); + vmovq(xmm4, qword[A2+LDA*2-0x80]); + vmovq(xmm5, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm3, xmm2, xmm3); + vpunpckldq(xmm5, xmm4, xmm5); + vpunpcklqdq(xmm2, xmm3, xmm5); + vpunpckhqdq(xmm3, xmm3, xmm5); + vmovdqu(xword[B-0x10], xmm2); + vmovdqu(xword[B+0xb0], xmm3); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm2); + vmovhlps(xmm7, xmm2, xmm2); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm11, ymm11, ymm5); + vpmovsxbw(ymm5, xmm1); + vmovhlps(xmm6, xmm1, xmm1); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm3); + vmovhlps(xmm7, xmm3, xmm3); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm11, ymm11, ymm5); + vmovq(xmm0, qword[A2-0x80]); + vmovq(xmm1, qword[A2+LDA*1-0x80]); + vmovq(xmm2, qword[A2+LDA*2-0x80]); + vmovq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm1, xmm0, xmm1); + vpunpckldq(xmm3, xmm2, xmm3); + vpunpcklqdq(xmm0, xmm1, xmm3); + vpunpckhqdq(xmm1, xmm1, xmm3); + vmovdqu(xword[B], xmm0); + vmovdqu(xword[B+0xc0], xmm1); + vmovq(xmm2, qword[A2-0x80]); + vmovq(xmm3, qword[A2+LDA*1-0x80]); + vmovq(xmm4, qword[A2+LDA*2-0x80]); + vmovq(xmm5, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm3, xmm2, xmm3); + vpunpckldq(xmm5, xmm4, xmm5); + vpunpcklqdq(xmm2, xmm3, xmm5); + vpunpckhqdq(xmm3, xmm3, xmm5); + vmovdqu(xword[B+0x10], xmm2); + vmovdqu(xword[B+0xd0], xmm3); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm2); + vmovhlps(xmm7, xmm2, xmm2); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm12, ymm12, ymm5); + vpmovsxbw(ymm5, xmm1); + vmovhlps(xmm6, xmm1, xmm1); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm3); + vmovhlps(xmm7, xmm3, xmm3); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm12, ymm12, ymm5); + vmovq(xmm0, qword[A2-0x80]); + vmovq(xmm1, qword[A2+LDA*1-0x80]); + vmovq(xmm2, qword[A2+LDA*2-0x80]); + vmovq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm1, xmm0, xmm1); + vpunpckldq(xmm3, xmm2, xmm3); + vpunpcklqdq(xmm0, xmm1, xmm3); + vpunpckhqdq(xmm1, xmm1, xmm3); + vmovdqu(xword[B+0x20], xmm0); + vmovdqu(xword[B+0xe0], xmm1); + vmovq(xmm2, qword[A2-0x80]); + vmovq(xmm3, qword[A2+LDA*1-0x80]); + vmovq(xmm4, qword[A2+LDA*2-0x80]); + vmovq(xmm5, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm3, xmm2, xmm3); + vpunpckldq(xmm5, xmm4, xmm5); + vpunpcklqdq(xmm2, xmm3, xmm5); + vpunpckhqdq(xmm3, xmm3, xmm5); + vmovdqu(xword[B+0x30], xmm2); + vmovdqu(xword[B+0xf0], 
xmm3); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm2); + vmovhlps(xmm7, xmm2, xmm2); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm13, ymm13, ymm5); + vpmovsxbw(ymm5, xmm1); + vmovhlps(xmm6, xmm1, xmm1); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm3); + vmovhlps(xmm7, xmm3, xmm3); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm13, ymm13, ymm5); + sub(A1, -8); + sub(B, -384); + dec(I); + jg(l6c, T_NEAR); + align(4); + +L(l5cc); + test(M, 0x4); + jle(l968, T_NEAR); + vmovd(xmm0, dword[A1-0x80]); + vmovd(xmm1, dword[A1+LDA*1-0x80]); + vmovd(xmm2, dword[A1+LDA*2-0x80]); + vmovd(xmm3, dword[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + vpunpckldq(xmm0, xmm0, xmm1); + vpunpckldq(xmm2, xmm2, xmm3); + vpunpcklqdq(xmm0, xmm0, xmm2); + vmovdqu(xword[B-0x80], xmm0); + vmovd(xmm1, dword[A2-0x80]); + vmovd(xmm2, dword[A2+LDA*1-0x80]); + vmovd(xmm3, dword[A2+LDA*2-0x80]); + vmovd(xmm4, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm1, xmm1, xmm2); + vpunpckldq(xmm3, xmm3, xmm4); + vpunpcklqdq(xmm1, xmm1, xmm3); + vmovdqu(xword[B-0x70], xmm1); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm1); + vmovhlps(xmm7, xmm1, xmm1); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm8, ymm8, ymm5); + vmovd(xmm0, dword[A2-0x80]); + vmovd(xmm1, dword[A2+LDA*1-0x80]); + vmovd(xmm2, dword[A2+LDA*2-0x80]); + vmovd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm0, xmm0, xmm1); + vpunpckldq(xmm2, xmm2, xmm3); + vpunpcklqdq(xmm0, xmm0, xmm2); + vmovdqu(xword[B-0x60], xmm0); + vmovd(xmm1, dword[A2-0x80]); + vmovd(xmm2, dword[A2+LDA*1-0x80]); + vmovd(xmm3, dword[A2+LDA*2-0x80]); + vmovd(xmm4, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm1, xmm1, xmm2); + vpunpckldq(xmm3, xmm3, xmm4); + vpunpcklqdq(xmm1, xmm1, xmm3); + vmovdqu(xword[B-0x50], xmm1); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm1); + vmovhlps(xmm7, xmm1, xmm1); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm9, ymm9, ymm5); + vmovd(xmm0, dword[A2-0x80]); + vmovd(xmm1, dword[A2+LDA*1-0x80]); + vmovd(xmm2, dword[A2+LDA*2-0x80]); + vmovd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm0, xmm0, xmm1); + vpunpckldq(xmm2, xmm2, xmm3); + vpunpcklqdq(xmm0, xmm0, xmm2); + vmovdqu(xword[B-0x40], xmm0); + vmovd(xmm1, dword[A2-0x80]); + vmovd(xmm2, dword[A2+LDA*1-0x80]); + vmovd(xmm3, dword[A2+LDA*2-0x80]); + vmovd(xmm4, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm1, xmm1, xmm2); + vpunpckldq(xmm3, xmm3, xmm4); + vpunpcklqdq(xmm1, xmm1, xmm3); + vmovdqu(xword[B-0x30], xmm1); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm1); + vmovhlps(xmm7, xmm1, xmm1); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm10, ymm10, ymm5); + vmovd(xmm0, dword[A2-0x80]); + vmovd(xmm1, dword[A2+LDA*1-0x80]); + vmovd(xmm2, dword[A2+LDA*2-0x80]); + vmovd(xmm3, 
dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm0, xmm0, xmm1); + vpunpckldq(xmm2, xmm2, xmm3); + vpunpcklqdq(xmm0, xmm0, xmm2); + vmovdqu(xword[B-0x20], xmm0); + vmovd(xmm1, dword[A2-0x80]); + vmovd(xmm2, dword[A2+LDA*1-0x80]); + vmovd(xmm3, dword[A2+LDA*2-0x80]); + vmovd(xmm4, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm1, xmm1, xmm2); + vpunpckldq(xmm3, xmm3, xmm4); + vpunpcklqdq(xmm1, xmm1, xmm3); + vmovdqu(xword[B-0x10], xmm1); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm1); + vmovhlps(xmm7, xmm1, xmm1); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm11, ymm11, ymm5); + vmovd(xmm0, dword[A2-0x80]); + vmovd(xmm1, dword[A2+LDA*1-0x80]); + vmovd(xmm2, dword[A2+LDA*2-0x80]); + vmovd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm0, xmm0, xmm1); + vpunpckldq(xmm2, xmm2, xmm3); + vpunpcklqdq(xmm0, xmm0, xmm2); + vmovdqu(xword[B], xmm0); + vmovd(xmm1, dword[A2-0x80]); + vmovd(xmm2, dword[A2+LDA*1-0x80]); + vmovd(xmm3, dword[A2+LDA*2-0x80]); + vmovd(xmm4, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm1, xmm1, xmm2); + vpunpckldq(xmm3, xmm3, xmm4); + vpunpcklqdq(xmm1, xmm1, xmm3); + vmovdqu(xword[B+0x10], xmm1); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm1); + vmovhlps(xmm7, xmm1, xmm1); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm12, ymm12, ymm5); + vmovd(xmm0, dword[A2-0x80]); + vmovd(xmm1, dword[A2+LDA*1-0x80]); + vmovd(xmm2, dword[A2+LDA*2-0x80]); + vmovd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm0, xmm0, xmm1); + vpunpckldq(xmm2, xmm2, xmm3); + vpunpcklqdq(xmm0, xmm0, xmm2); + vmovdqu(xword[B+0x20], xmm0); + vmovd(xmm1, dword[A2-0x80]); + vmovd(xmm2, dword[A2+LDA*1-0x80]); + vmovd(xmm3, dword[A2+LDA*2-0x80]); + vmovd(xmm4, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpunpckldq(xmm1, xmm1, xmm2); + vpunpckldq(xmm3, xmm3, xmm4); + vpunpcklqdq(xmm1, xmm1, xmm3); + vmovdqu(xword[B+0x30], xmm1); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxbw(ymm6, xmm1); + vmovhlps(xmm7, xmm1, xmm1); + vpmovsxbw(ymm7, xmm7); + vphaddw(ymm6, ymm6, ymm7); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm13, ymm13, ymm5); + sub(A1, -4); + sub(B, -192); + align(4); + +L(l968); + test(M, 0x2); + jle(lc80, T_NEAR); + mov(ax, word[A1-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x0); + mov(ax, word[A1+LDA*1-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x1); + mov(ax, word[A1+LDA*2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x2); + mov(ax, word[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + vpinsrw(xmm0, xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpinsrw(xmm0, xmm0, eax, 0x7); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm8, ymm8, ymm5); + vmovdqu(xword[B-0x80], xmm0); + mov(ax, word[A2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x0); + mov(ax, word[A2+LDA*1-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x1); + mov(ax, 
word[A2+LDA*2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x2); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpinsrw(xmm0, xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x7); + lea(A2, ptr[A2+LDA*4]); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm9, ymm9, ymm5); + vmovdqu(xword[B-0x70], xmm0); + mov(ax, word[A2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x0); + mov(ax, word[A2+LDA*1-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x1); + mov(ax, word[A2+LDA*2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x2); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpinsrw(xmm0, xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x7); + lea(A2, ptr[A2+LDA*4]); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm10, ymm10, ymm5); + vmovdqu(xword[B-0x60], xmm0); + mov(ax, word[A2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x0); + mov(ax, word[A2+LDA*1-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x1); + mov(ax, word[A2+LDA*2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x2); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpinsrw(xmm0, xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x7); + lea(A2, ptr[A2+LDA*4]); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm11, ymm11, ymm5); + vmovdqu(xword[B-0x50], xmm0); + mov(ax, word[A2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x0); + mov(ax, word[A2+LDA*1-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x1); + mov(ax, word[A2+LDA*2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x2); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpinsrw(xmm0, xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x7); + lea(A2, ptr[A2+LDA*4]); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm12, ymm12, ymm5); + vmovdqu(xword[B-0x40], xmm0); + mov(ax, word[A2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x0); + mov(ax, word[A2+LDA*1-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x1); + mov(ax, word[A2+LDA*2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x2); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpinsrw(xmm0, xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + vpinsrw(xmm0, xmm0, eax, 0x7); + lea(A2, ptr[A2+LDA*4]); + vpmovsxbw(ymm5, xmm0); + vmovhlps(xmm6, xmm0, xmm0); + vpmovsxbw(ymm6, xmm6); + vphaddw(ymm5, 
ymm5, ymm6); + vpmovsxwd(ymm5, xmm5); + vpaddd(ymm13, ymm13, ymm5); + vmovdqu(xword[B-0x30], xmm0); + sub(A1, -2); + sub(B, -96); + align(4); + +L(lc80); + test(M, 0x1); + jle(lf1c, T_NEAR); + mov(al, byte[A1-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x0); + mov(al, byte[A1+LDA*1-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x1); + mov(al, byte[A1+LDA*2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x2); + mov(al, byte[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + vpinsrb(xmm0, xmm0, eax, 0x3); + mov(al, byte[A2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x4); + mov(al, byte[A2+LDA*1-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x5); + mov(al, byte[A2+LDA*2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x6); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpinsrb(xmm0, xmm0, eax, 0x7); + mov(al, byte[A2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x8); + mov(al, byte[A2+LDA*1-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x9); + mov(al, byte[A2+LDA*2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0xa); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpinsrb(xmm0, xmm0, eax, 0xb); + mov(al, byte[A2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0xc); + mov(al, byte[A2+LDA*1-0x80]); + vpinsrb(xmm0, xmm0, eax, 0xd); + mov(al, byte[A2+LDA*2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0xe); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpinsrb(xmm0, xmm0, eax, 0xf); + vpmovsxbd(ymm7, xmm0); + vpaddd(ymm8, ymm8, ymm7); + vmovhlps(xmm7, xmm0, xmm0); + vpmovsxbd(ymm7, xmm7); + vpaddd(ymm9, ymm9, ymm7); + vmovdqu(xword[B-0x80], xmm0); + mov(al, byte[A2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x0); + mov(al, byte[A2+LDA*1-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x1); + mov(al, byte[A2+LDA*2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x2); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpinsrb(xmm0, xmm0, eax, 0x3); + mov(al, byte[A2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x4); + mov(al, byte[A2+LDA*1-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x5); + mov(al, byte[A2+LDA*2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x6); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpinsrb(xmm0, xmm0, eax, 0x7); + mov(al, byte[A2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x8); + mov(al, byte[A2+LDA*1-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x9); + mov(al, byte[A2+LDA*2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0xa); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpinsrb(xmm0, xmm0, eax, 0xb); + mov(al, byte[A2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0xc); + mov(al, byte[A2+LDA*1-0x80]); + vpinsrb(xmm0, xmm0, eax, 0xd); + mov(al, byte[A2+LDA*2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0xe); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpinsrb(xmm0, xmm0, eax, 0xf); + vpmovsxbd(ymm7, xmm0); + vpaddd(ymm10, ymm10, ymm7); + vmovhlps(xmm7, xmm0, xmm0); + vpmovsxbd(ymm7, xmm7); + vpaddd(ymm11, ymm11, ymm7); + vmovdqu(xword[B-0x70], xmm0); + mov(al, byte[A2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x0); + mov(al, byte[A2+LDA*1-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x1); + mov(al, byte[A2+LDA*2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x2); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpinsrb(xmm0, xmm0, eax, 0x3); + mov(al, byte[A2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x4); + mov(al, byte[A2+LDA*1-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x5); + mov(al, byte[A2+LDA*2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x6); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpinsrb(xmm0, xmm0, eax, 0x7); + mov(al, byte[A2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x8); + mov(al, byte[A2+LDA*1-0x80]); + vpinsrb(xmm0, xmm0, eax, 0x9); + mov(al, byte[A2+LDA*2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0xa); + mov(al, 
byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpinsrb(xmm0, xmm0, eax, 0xb); + mov(al, byte[A2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0xc); + mov(al, byte[A2+LDA*1-0x80]); + vpinsrb(xmm0, xmm0, eax, 0xd); + mov(al, byte[A2+LDA*2-0x80]); + vpinsrb(xmm0, xmm0, eax, 0xe); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + vpinsrb(xmm0, xmm0, eax, 0xf); + vpmovsxbd(ymm7, xmm0); + vpaddd(ymm12, ymm12, ymm7); + vmovhlps(xmm7, xmm0, xmm0); + vpmovsxbd(ymm7, xmm7); + vpaddd(ymm13, ymm13, ymm7); + vmovdqu(xword[B-0x60], xmm0); + sub(B, -48); + align(4); + +L(lf1c); + mov(A1, qword[ARG_BIAS]); + vmovdqu(yword[A1], ymm8); + vmovdqu(yword[A1+0x20], ymm9); + vmovdqu(yword[A1+0x40], ymm10); + vmovdqu(yword[A1+0x60], ymm11); + vmovdqu(yword[A1+0x80], ymm12); + vmovdqu(yword[A1+0xa0], ymm13); + add(qword[ARG_BIAS], 0xc0); + sub(N, 0x30); + cmp(N, 0x30); + jge(l20, T_NEAR); + vzeroupper(); + align(4); + +L(lf64); + cmp(N, 0x20); + jl(l22b8, T_NEAR); + align(4); + +L(lf70); + mov(A1, A); + mov(I, LDA); + shl(I, 0x5); + add(A, I); + pxor(xmm8, xmm8); + pxor(xmm9, xmm9); + pxor(xmm10, xmm10); + pxor(xmm11, xmm11); + pxor(xmm12, xmm12); + pxor(xmm13, xmm13); + pxor(xmm14, xmm14); + pxor(xmm15, xmm15); + mov(I, M); + sar(I, 0x4); + jle(l1750, T_NEAR); + align(4); + +L(lfb4); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1+LDA*1-0x80]); + movdqu(xmm2, xword[A1+LDA*2-0x80]); + movdqu(xmm3, xword[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x80], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B+0x80], xmm4); + pmovsxbw(xmm5, xmm3); + movhlps(xmm6, xmm3); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B+0x100], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x70], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B+0x10], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B+0x90], xmm4); + 
pmovsxbw(xmm5, xmm3); + movhlps(xmm6, xmm3); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B+0x110], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + movdqu(xword[B-0x60], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + movdqu(xword[B+0x20], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + movdqu(xword[B+0xa0], xmm4); + pmovsxbw(xmm5, xmm3); + movhlps(xmm6, xmm3); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + movdqu(xword[B+0x120], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm11, xmm5); + movdqu(xword[B-0x50], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm11, xmm5); + movdqu(xword[B+0x30], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm11, xmm5); + movdqu(xword[B+0xb0], xmm4); + pmovsxbw(xmm5, xmm3); + movhlps(xmm6, xmm3); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm11, xmm5); + movdqu(xword[B+0x130], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm12, xmm5); + movdqu(xword[B-0x40], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm12, xmm5); + movdqu(xword[B+0x40], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); 
+ pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm12, xmm5); + movdqu(xword[B+0xc0], xmm4); + pmovsxbw(xmm5, xmm3); + movhlps(xmm6, xmm3); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm12, xmm5); + movdqu(xword[B+0x140], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm13, xmm5); + movdqu(xword[B-0x30], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm13, xmm5); + movdqu(xword[B+0x50], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm13, xmm5); + movdqu(xword[B+0xd0], xmm4); + pmovsxbw(xmm5, xmm3); + movhlps(xmm6, xmm3); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm13, xmm5); + movdqu(xword[B+0x150], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm14, xmm5); + movdqu(xword[B-0x20], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm14, xmm5); + movdqu(xword[B+0x60], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm14, xmm5); + movdqu(xword[B+0xe0], xmm4); + pmovsxbw(xmm5, xmm3); + movhlps(xmm6, xmm3); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm14, xmm5); + movdqu(xword[B+0x160], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm15, xmm5); + movdqu(xword[B-0x10], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, 
xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm15, xmm5); + movdqu(xword[B+0x70], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm15, xmm5); + movdqu(xword[B+0xf0], xmm4); + pmovsxbw(xmm5, xmm3); + movhlps(xmm6, xmm3); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm15, xmm5); + movdqu(xword[B+0x170], xmm3); + sub(A1, -16); + sub(B, -512); + dec(I); + jg(lfb4, T_NEAR); + align(4); + +L(l1750); + test(M, 0x8); + jle(l1b6c, T_NEAR); + movq(xmm0, qword[A1-0x80]); + movq(xmm1, qword[A1+LDA*1-0x80]); + movq(xmm2, qword[A1+LDA*2-0x80]); + movq(xmm3, qword[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x80], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x70], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B+0x10], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + movdqu(xword[B-0x60], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + movdqu(xword[B+0x20], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm11, xmm5); + movdqu(xword[B-0x50], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm11, xmm5); + movdqu(xword[B+0x30], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, 
xmm2); + punpckhqdq(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm12, xmm5); + movdqu(xword[B-0x40], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm12, xmm5); + movdqu(xword[B+0x40], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm13, xmm5); + movdqu(xword[B-0x30], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm13, xmm5); + movdqu(xword[B+0x50], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm14, xmm5); + movdqu(xword[B-0x20], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm14, xmm5); + movdqu(xword[B+0x60], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm15, xmm5); + movdqu(xword[B-0x10], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm15, xmm5); + movdqu(xword[B+0x70], xmm1); + sub(A1, -8); + sub(B, -256); + align(4); + +L(l1b6c); + test(M, 0x4); + jle(l1e14, T_NEAR); + movd(xmm0, dword[A1-0x80]); + movd(xmm1, dword[A1+LDA*1-0x80]); + movd(xmm2, dword[A1+LDA*2-0x80]); + movd(xmm3, dword[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x80], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x70], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, 
xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + movdqu(xword[B-0x60], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm11, xmm5); + movdqu(xword[B-0x50], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm12, xmm5); + movdqu(xword[B-0x40], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm13, xmm5); + movdqu(xword[B-0x30], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm14, xmm5); + movdqu(xword[B-0x20], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm15, xmm5); + movdqu(xword[B-0x10], xmm0); + sub(A1, -4); + sub(B, -128); + align(4); + +L(l1e14); + test(M, 0x2); + jle(l2068, T_NEAR); + mov(ax, word[A1-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A1+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrw(xmm0, eax, 0x7); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm6, xmm6); + pmovsxwd(xmm6, xmm6); + paddd(xmm9, xmm6); + movdqu(xword[B-0x80], xmm0); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 
0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + pinsrw(xmm0, eax, 0x7); + lea(A2, ptr[A2+LDA*4]); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm6, xmm6); + pmovsxwd(xmm6, xmm6); + paddd(xmm11, xmm6); + movdqu(xword[B-0x70], xmm0); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + pinsrw(xmm0, eax, 0x7); + lea(A2, ptr[A2+LDA*4]); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm12, xmm5); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm6, xmm6); + pmovsxwd(xmm6, xmm6); + paddd(xmm13, xmm6); + movdqu(xword[B-0x60], xmm0); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + pinsrw(xmm0, eax, 0x7); + lea(A2, ptr[A2+LDA*4]); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm14, xmm5); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm6, xmm6); + pmovsxwd(xmm6, xmm6); + paddd(xmm15, xmm6); + movdqu(xword[B-0x50], xmm0); + sub(A1, -2); + sub(B, -64); + align(4); + +L(l2068); + test(M, 0x1); + jle(l226c, T_NEAR); + mov(al, byte[A1-0x80]); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A1+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + pinsrb(xmm0, eax, 0x3); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x4); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x5); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x6); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0x7); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x8); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x9); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0xa); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0xb); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0xc); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0xd); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0xe); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0xf); + pmovsxbd(xmm5, xmm0); + paddd(xmm8, xmm5); + pshufd(xmm6, xmm0, 0x55); + pmovsxbd(xmm6, xmm6); + paddd(xmm9, xmm6); + pshufd(xmm5, xmm0, 0xaa); + pmovsxbd(xmm5, xmm5); + paddd(xmm10, xmm5); + pshufd(xmm6, xmm0, 0xff); + pmovsxbd(xmm6, xmm6); + paddd(xmm11, xmm6); + movdqu(xword[B-0x80], xmm0); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, 
ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0x3); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x4); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x5); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x6); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0x7); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x8); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x9); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0xa); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0xb); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0xc); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0xd); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0xe); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0xf); + pmovsxbd(xmm5, xmm0); + paddd(xmm12, xmm5); + pshufd(xmm6, xmm0, 0x55); + pmovsxbd(xmm6, xmm6); + paddd(xmm13, xmm6); + pshufd(xmm5, xmm0, 0xaa); + pmovsxbd(xmm5, xmm5); + paddd(xmm14, xmm5); + pshufd(xmm6, xmm0, 0xff); + pmovsxbd(xmm6, xmm6); + paddd(xmm15, xmm6); + movdqu(xword[B-0x70], xmm0); + sub(B, -32); + align(4); + +L(l226c); + mov(A1, qword[ARG_BIAS]); + movdqu(xword[A1], xmm8); + movdqu(xword[A1+0x10], xmm9); + movdqu(xword[A1+0x20], xmm10); + movdqu(xword[A1+0x30], xmm11); + movdqu(xword[A1+0x40], xmm12); + movdqu(xword[A1+0x50], xmm13); + movdqu(xword[A1+0x60], xmm14); + movdqu(xword[A1+0x70], xmm15); + add(qword[ARG_BIAS], 0x80); + sub(N, 0x20); + cmp(N, 0x20); + jge(lf70, T_NEAR); + align(4); + +L(l22b8); + cmp(N, 0x10); + jl(l2c94, T_NEAR); + align(4); + +L(l22c4); + mov(A1, A); + mov(I, LDA); + shl(I, 0x4); + add(A, I); + pxor(xmm8, xmm8); + pxor(xmm9, xmm9); + pxor(xmm10, xmm10); + pxor(xmm11, xmm11); + mov(I, M); + sar(I, 0x4); + jle(l26b4, T_NEAR); + align(4); + +L(l22f4); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1+LDA*1-0x80]); + movdqu(xmm2, xword[A1+LDA*2-0x80]); + movdqu(xmm3, xword[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x80], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x40], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B], xmm4); + pmovsxbw(xmm5, xmm3); + movhlps(xmm6, xmm3); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B+0x40], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + 
pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x70], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x30], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B+0x10], xmm4); + pmovsxbw(xmm5, xmm3); + movhlps(xmm6, xmm3); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B+0x50], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + movdqu(xword[B-0x60], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + movdqu(xword[B-0x20], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + movdqu(xword[B+0x20], xmm4); + pmovsxbw(xmm5, xmm3); + movhlps(xmm6, xmm3); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + movdqu(xword[B+0x60], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm11, xmm5); + movdqu(xword[B-0x50], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm11, xmm5); + movdqu(xword[B-0x10], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm11, xmm5); + movdqu(xword[B+0x30], xmm4); + pmovsxbw(xmm5, xmm3); + movhlps(xmm6, xmm3); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm11, xmm5); + movdqu(xword[B+0x70], xmm3); + sub(A1, -16); + sub(B, -256); + dec(I); + jg(l22f4, T_NEAR); + align(4); + +L(l26b4); + test(M, 0x8); + jle(l28cc, T_NEAR); + movq(xmm0, qword[A1-0x80]); + movq(xmm1, qword[A1+LDA*1-0x80]); + movq(xmm2, qword[A1+LDA*2-0x80]); + movq(xmm3, qword[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + 
punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x80], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x40], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x70], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x30], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + movdqu(xword[B-0x60], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + movdqu(xword[B-0x20], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm11, xmm5); + movdqu(xword[B-0x50], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm11, xmm5); + movdqu(xword[B-0x10], xmm1); + sub(A1, -8); + sub(B, -128); + align(4); + +L(l28cc); + test(M, 0x4); + jle(l2a2c, T_NEAR); + movd(xmm0, dword[A1-0x80]); + movd(xmm1, dword[A1+LDA*1-0x80]); + movd(xmm2, dword[A1+LDA*2-0x80]); + movd(xmm3, dword[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x80], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x70], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + 
punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + movdqu(xword[B-0x60], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm11, xmm5); + movdqu(xword[B-0x50], xmm0); + sub(A1, -4); + sub(B, -64); + align(4); + +L(l2a2c); + test(M, 0x2); + jle(l2b5c, T_NEAR); + mov(ax, word[A1-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A1+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrw(xmm0, eax, 0x7); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm6, xmm6); + pmovsxwd(xmm6, xmm6); + paddd(xmm9, xmm6); + movdqu(xword[B-0x80], xmm0); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + pinsrw(xmm0, eax, 0x7); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm10, xmm5); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm6, xmm6); + pmovsxwd(xmm6, xmm6); + paddd(xmm11, xmm6); + movdqu(xword[B-0x70], xmm0); + sub(A1, -2); + sub(B, -32); + align(4); + +L(l2b5c); + test(M, 0x1); + jle(l2c64, T_NEAR); + mov(al, byte[A1-0x80]); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A1+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A1+LDA3*1-0x80]); + lea(A2, ptr[A1+LDA*4]); + pinsrb(xmm0, eax, 0x3); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x4); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x5); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x6); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0x7); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x8); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x9); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0xa); + mov(al, byte[A2+LDA3*1-0x80]); + lea(A2, ptr[A2+LDA*4]); + pinsrb(xmm0, eax, 0xb); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0xc); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0xd); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0xe); + mov(al, byte[A2+LDA3*1-0x80]); + pinsrb(xmm0, eax, 0xf); + pmovsxbd(xmm5, xmm0); + paddd(xmm8, xmm5); + pshufd(xmm6, xmm0, 0x55); + pmovsxbd(xmm6, xmm6); + paddd(xmm9, xmm6); + pshufd(xmm5, xmm0, 0xaa); + pmovsxbd(xmm5, xmm5); + paddd(xmm10, xmm5); + pshufd(xmm6, xmm0, 0xff); + 
pmovsxbd(xmm6, xmm6); + paddd(xmm11, xmm6); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l2c64); + mov(A1, qword[ARG_BIAS]); + movdqu(xword[A1], xmm8); + movdqu(xword[A1+0x10], xmm9); + movdqu(xword[A1+0x20], xmm10); + movdqu(xword[A1+0x30], xmm11); + add(qword[ARG_BIAS], 0x40); + sub(N, 0x10); + cmp(N, 0x10); + jge(l22c4, T_NEAR); + align(4); + +L(l2c94); + cmp(N, 0x8); + jl(l31c0, T_NEAR); + align(4); + +L(l2ca0); + mov(A1, A); + lea(A2, ptr[A1+LDA*4]); + lea(I, ptr[A1+LDA*8]); + mov(A, I); + pxor(xmm8, xmm8); + pxor(xmm9, xmm9); + mov(I, M); + sar(I, 0x4); + jle(l2eac, T_NEAR); + align(4); + +L(l2cc8); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1+LDA*1-0x80]); + movdqu(xmm2, xword[A1+LDA*2-0x80]); + movdqu(xmm3, xword[A1+LDA3*1-0x80]); + sub(A1, -16); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x80], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x60], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x40], xmm4); + pmovsxbw(xmm5, xmm3); + movhlps(xmm6, xmm3); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x20], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + sub(A2, -16); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x70], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x50], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x30], xmm4); + pmovsxbw(xmm5, xmm3); + movhlps(xmm6, xmm3); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x10], xmm3); + sub(B, -128); + dec(I); + jg(l2cc8, T_NEAR); + align(4); + +L(l2eac); + test(M, 0x8); + jle(l2fc0, T_NEAR); + movq(xmm0, qword[A1-0x80]); + movq(xmm1, qword[A1+LDA*1-0x80]); + movq(xmm2, qword[A1+LDA*2-0x80]); + movq(xmm3, qword[A1+LDA3*1-0x80]); + sub(A1, -8); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + 
phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x80], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x60], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + sub(A2, -8); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x70], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x50], xmm1); + sub(B, -64); + align(4); + +L(l2fc0); + test(M, 0x4); + jle(l3078, T_NEAR); + movd(xmm0, dword[A1-0x80]); + movd(xmm1, dword[A1+LDA*1-0x80]); + movd(xmm2, dword[A1+LDA*2-0x80]); + movd(xmm3, dword[A1+LDA3*1-0x80]); + sub(A1, -4); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x80], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + sub(A2, -4); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x70], xmm0); + sub(B, -32); + align(4); + +L(l3078); + test(M, 0x2); + jle(l3118, T_NEAR); + mov(ax, word[A1-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A1+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A1+LDA3*1-0x80]); + sub(A1, -2); + pinsrw(xmm0, eax, 0x3); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + sub(A2, -2); + pinsrw(xmm0, eax, 0x7); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm6, xmm6); + pmovsxwd(xmm6, xmm6); + paddd(xmm9, xmm6); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l3118); + test(M, 0x1); + jle(l319c, T_NEAR); + mov(al, byte[A1-0x80]); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A1+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A1+LDA3*1-0x80]); + pinsrb(xmm0, eax, 0x3); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x4); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x5); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x6); + mov(al, byte[A2+LDA3*1-0x80]); + pinsrb(xmm0, eax, 0x7); + pmovsxbd(xmm5, xmm0); + pshufd(xmm6, xmm0, 0x55); + pmovsxbd(xmm6, xmm6); + paddd(xmm8, xmm5); + paddd(xmm9, xmm6); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l319c); + mov(A1, qword[ARG_BIAS]); + movdqu(xword[A1], xmm8); + movdqu(xword[A1+0x10], xmm9); + add(qword[ARG_BIAS], 0x20); + sub(N, 0x8); + cmp(N, 0x8); + jge(l2ca0, T_NEAR); + 
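A note on the recurring idiom above: each pmovsxbw/phaddw/pmovsxwd/paddd run sign-extends a register of packed 8-bit values to 16 bits, folds neighbouring lanes together, widens to 32 bits, and accumulates into the panel-sum registers (xmm7..xmm15) that every block epilogue flushes to the buffer behind ARG_BIAS. A scalar C++ sketch of what one packed panel produces -- names are illustrative, not the library's API, and the real packed layout is interleaved rather than contiguous:

    // Pack a rows x cols stripe of the source (column stride lda) and
    // accumulate a 32-bit sum per packed column, as the JIT does with
    // pmovsxbw / phaddw / pmovsxwd / paddd.
    static void pack_panel_with_sums(const int8_t *a, int lda, int rows,
            int cols, int8_t *b, int32_t *sums) {
        for (int j = 0; j < cols; ++j) {
            int32_t s = 0;
            for (int i = 0; i < rows; ++i) {
                const int8_t v = a[i + j * lda];
                b[j * rows + i] = v; // simplified layout
                s += v;
            }
            sums[j] += s; // carried across row chunks, like xmm8..xmm15
        }
    }
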
align(4); + +L(l31c0); + cmp(N, 0x4); + jl(l349c, T_NEAR); + align(4); + +L(l31cc); + mov(A1, A); + lea(A2, ptr[A1+LDA*2]); + lea(I, ptr[A1+LDA*4]); + mov(A, I); + pxor(xmm7, xmm7); + mov(I, M); + sar(I, 0x4); + jle(l32e4, T_NEAR); + align(4); + +L(l31ec); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1+LDA*1-0x80]); + sub(A1, -16); + movdqu(xmm2, xword[A2-0x80]); + movdqu(xmm3, xword[A2+LDA*1-0x80]); + sub(A2, -16); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x80], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x70], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x60], xmm4); + pmovsxbw(xmm5, xmm3); + movhlps(xmm6, xmm3); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x50], xmm3); + sub(B, -64); + dec(I); + jg(l31ec, T_NEAR); + align(4); + +L(l32e4); + test(M, 0x8); + jle(l3378, T_NEAR); + movq(xmm0, qword[A1-0x80]); + movq(xmm1, qword[A1+LDA*1-0x80]); + sub(A1, -8); + movq(xmm2, qword[A2-0x80]); + movq(xmm3, qword[A2+LDA*1-0x80]); + sub(A2, -8); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x80], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x70], xmm1); + sub(B, -32); + align(4); + +L(l3378); + test(M, 0x4); + jle(l33dc, T_NEAR); + movd(xmm0, dword[A1-0x80]); + movd(xmm1, dword[A1+LDA*1-0x80]); + sub(A1, -4); + movd(xmm2, dword[A2-0x80]); + movd(xmm3, dword[A2+LDA*1-0x80]); + sub(A2, -4); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l33dc); + test(M, 0x2); + jle(l3434, T_NEAR); + mov(ax, word[A1-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1+LDA*1-0x80]); + sub(A1, -2); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A2+LDA*1-0x80]); + sub(A2, -2); + pinsrw(xmm0, eax, 0x3); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l3434); + test(M, 0x1); + jle(l347c, T_NEAR); + mov(al, byte[A1-0x80]); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x3); + pmovsxbd(xmm5, xmm0); + paddd(xmm7, 
xmm5); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l347c); + mov(A1, qword[ARG_BIAS]); + movdqu(xword[A1], xmm7); + add(qword[ARG_BIAS], 0x10); + sub(N, 0x4); + cmp(N, 0x4); + jge(l31cc, T_NEAR); + align(4); + +L(l349c); + cmp(N, 0x2); + jl(l368a, T_NEAR); + align(4); + +L(l34a8); + mov(A1, A); + lea(A2, ptr[A1+LDA*1]); + lea(I, ptr[A1+LDA*2]); + mov(A, I); + pxor(xmm7, xmm7); + mov(I, M); + sar(I, 0x4); + jle(l3558, T_NEAR); + align(4); + +L(l34c8); + movdqu(xmm0, xword[A1-0x80]); + sub(A1, -16); + movdqu(xmm1, xword[A2-0x80]); + sub(A2, -16); + movdqa(xmm2, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm2, xmm1); + pshufd(xmm6, xmm0, 0xd8); + pmovsxbw(xmm5, xmm6); + movhlps(xmm6, xmm6); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x80], xmm0); + pshufd(xmm6, xmm2, 0xd8); + pmovsxbw(xmm5, xmm6); + movhlps(xmm6, xmm6); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x70], xmm2); + sub(B, -32); + dec(I); + jg(l34c8, T_NEAR); + align(4); + +L(l3558); + test(M, 0x8); + jle(l35b0, T_NEAR); + movq(xmm0, qword[A1-0x80]); + sub(A1, -8); + movq(xmm1, qword[A2-0x80]); + sub(A2, -8); + punpckldq(xmm0, xmm1); + pshufd(xmm6, xmm0, 0xd8); + pmovsxbw(xmm5, xmm6); + movhlps(xmm6, xmm6); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l35b0); + test(M, 0x4); + jle(l35f4, T_NEAR); + movd(xmm0, dword[A1-0x80]); + sub(A1, -4); + movd(xmm1, dword[A2-0x80]); + sub(A2, -4); + punpckldq(xmm0, xmm1); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l35f4); + test(M, 0x2); + jle(l3638, T_NEAR); + mov(ax, word[A1-0x80]); + sub(A1, -2); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A2-0x80]); + sub(A2, -2); + pinsrw(xmm0, eax, 0x1); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l3638); + test(M, 0x1); + jle(l366c, T_NEAR); + mov(al, byte[A1-0x80]); + pinsrb(xmm0, eax, 0x0); + mov(byte[B-0x80], al); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x1); + mov(byte[B-0x7f], al); + sub(B, -2); + pmovsxbd(xmm5, xmm0); + paddd(xmm7, xmm5); + align(4); + +L(l366c); + mov(A1, qword[ARG_BIAS]); + movq(qword[A1], xmm7); + add(qword[ARG_BIAS], 0x8); + sub(N, 0x2); + cmp(N, 0x2); + jge(l34a8, T_NEAR); + align(4); + +L(l368a); + cmp(N, 0x1); + jl(l37d8, T_NEAR); + align(4); + +L(l3694); + mov(A1, A); + add(A, LDA); + pxor(xmm7, xmm7); + mov(I, M); + sar(I, 0x4); + jle(l36ec, T_NEAR); + align(4); + +L(l36a8); + movdqu(xmm0, xword[A1-0x80]); + sub(A1, -16); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + dec(I); + jg(l36a8, T_NEAR); + align(4); + +L(l36ec); + test(M, 0x8); + jle(l3728, T_NEAR); + movq(xmm0, qword[A1-0x80]); + sub(A1, -8); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l3728); + 
test(M, 0x4); + jle(l3760, T_NEAR); + movd(xmm0, dword[A1-0x80]); + sub(A1, -4); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l3760); + test(M, 0x2); + jle(l3794, T_NEAR); + mov(ax, word[A1-0x80]); + pinsrw(xmm0, eax, 0x0); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + mov(word[B-0x80], ax); + sub(A1, -2); + sub(B, -2); + align(4); + +L(l3794); + test(M, 0x1); + jle(l37b8, T_NEAR); + mov(al, byte[A1-0x80]); + pinsrb(xmm0, eax, 0x0); + pmovsxbd(xmm5, xmm0); + paddd(xmm7, xmm5); + mov(byte[B-0x80], al); + sub(B, -1); + align(4); + +L(l37b8); + mov(A1, qword[ARG_BIAS]); + movd(dword[A1], xmm7); + add(qword[ARG_BIAS], 0x4); + sub(N, 0x1); + cmp(N, 0x1); + jge(l3694, T_NEAR); + align(4); + +L(l37d8); + + postamble(); +} +outLocalLabel(); + +#undef M +#undef N +#undef A +#undef LDA +#undef ALPHA +#undef B +#undef I +#undef A1 +#undef A2 +#undef LDA3 +#ifdef _WIN32 +#undef ARG_ALPHA +#undef ARG_B +#endif +#undef ARG_BIAS +} + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_sum_bn_kern.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_sum_bn_kern.cpp new file mode 100644 index 0000000..c7f1393 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_sum_bn_kern.cpp @@ -0,0 +1,821 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "jit_generator.hpp" +#include "common.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +jit_avx512_core_u8_copy_sum_bn_kern::jit_avx512_core_u8_copy_sum_bn_kern(): jit_generator(nullptr, GEMM_CODE_SIZE) +{ + +#ifndef _WIN32 +#define M rdi +#define N rsi +#define A rdx +#define LDA rcx +#define ALPHA r8 +#define B r9 + +#define I rax +#define A1 r10 +#define A2 r8 +#define LDA3 r11 + +#define ARG_BIAS 24+stacksize+rsp + +#else + +#define M rcx +#define N rdx +#define A r8 +#define LDA r9 +#define ALPHA rax +#define B rdi + +#define I rax +#define A1 rsi +#define A2 r10 +#define LDA3 r11 + +#define ARG_ALPHA 40+stacksize+rsp +#define ARG_B 48+stacksize+rsp +#define ARG_BIAS 72+stacksize+rsp + +#endif + +inLocalLabel(); +{ + +Xbyak::Label l20; +Xbyak::Label l22c; +Xbyak::Label l340; +Xbyak::Label l3f8; +Xbyak::Label l48; +Xbyak::Label l498; +Xbyak::Label l51c; +Xbyak::Label l540; +Xbyak::Label l54c; +Xbyak::Label l56c; +Xbyak::Label l664; +Xbyak::Label l6f8; +Xbyak::Label l75c; +Xbyak::Label l7b4; +Xbyak::Label l7fc; +Xbyak::Label l81c; +Xbyak::Label l828; +Xbyak::Label l848; +Xbyak::Label l8d8; +Xbyak::Label l930; +Xbyak::Label l974; +Xbyak::Label l9b8; +Xbyak::Label l9ec; +Xbyak::Label la0a; +Xbyak::Label la14; +Xbyak::Label la28; +Xbyak::Label la6c; +Xbyak::Label laa8; +Xbyak::Label lae0; +Xbyak::Label lb14; +Xbyak::Label lb38; +Xbyak::Label lb58; + + preamble(); + auto stacksize = get_size_of_abi_save_regs(); +#ifdef _WIN32 + mov(ALPHA, ptr[ARG_ALPHA]); + mov(B, ptr[ARG_B]); +#endif + + mov(N, qword[N]); + mov(M, qword[M]); + mov(LDA, qword[LDA]); + sub(A, -128); + sub(B, -128); + lea(LDA3, ptr[LDA+LDA*2]); + cmp(N, 0x8); + jl(l540, T_NEAR); + align(4); + +L(l20); + mov(A1, A); + lea(A2, ptr[A1+LDA*4]); + lea(I, ptr[A1+LDA*8]); + mov(A, I); + pxor(xmm8, xmm8); + pxor(xmm9, xmm9); + mov(I, M); + sar(I, 0x4); + jle(l22c, T_NEAR); + align(4); + +L(l48); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1+LDA*1-0x80]); + movdqu(xmm2, xword[A1+LDA*2-0x80]); + movdqu(xmm3, xword[A1+LDA3*1-0x80]); + sub(A1, -16); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x80], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x60], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x40], xmm4); + pmovsxbw(xmm5, xmm3); + movhlps(xmm6, xmm3); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x20], xmm3); + movdqu(xmm0, xword[A2-0x80]); + movdqu(xmm1, xword[A2+LDA*1-0x80]); + movdqu(xmm2, xword[A2+LDA*2-0x80]); + movdqu(xmm3, xword[A2+LDA3*1-0x80]); + sub(A2, -16); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + 
punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x70], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x50], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x30], xmm4); + pmovsxbw(xmm5, xmm3); + movhlps(xmm6, xmm3); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x10], xmm3); + sub(B, -128); + dec(I); + jg(l48, T_NEAR); + align(4); + +L(l22c); + test(M, 0x8); + jle(l340, T_NEAR); + movq(xmm0, qword[A1-0x80]); + movq(xmm1, qword[A1+LDA*1-0x80]); + movq(xmm2, qword[A1+LDA*2-0x80]); + movq(xmm3, qword[A1+LDA3*1-0x80]); + sub(A1, -8); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x80], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x60], xmm1); + movq(xmm0, qword[A2-0x80]); + movq(xmm1, qword[A2+LDA*1-0x80]); + movq(xmm2, qword[A2+LDA*2-0x80]); + movq(xmm3, qword[A2+LDA3*1-0x80]); + sub(A2, -8); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x70], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x50], xmm1); + sub(B, -64); + align(4); + +L(l340); + test(M, 0x4); + jle(l3f8, T_NEAR); + movd(xmm0, dword[A1-0x80]); + movd(xmm1, dword[A1+LDA*1-0x80]); + movd(xmm2, dword[A1+LDA*2-0x80]); + movd(xmm3, dword[A1+LDA3*1-0x80]); + sub(A1, -4); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movdqu(xword[B-0x80], xmm0); + movd(xmm0, dword[A2-0x80]); + movd(xmm1, dword[A2+LDA*1-0x80]); + movd(xmm2, dword[A2+LDA*2-0x80]); + movd(xmm3, dword[A2+LDA3*1-0x80]); + sub(A2, -4); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x70], xmm0); + sub(B, -32); + align(4); + +L(l3f8); + test(M, 0x2); + jle(l498, T_NEAR); + mov(ax, word[A1-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A1+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A1+LDA3*1-0x80]); + sub(A1, -2); + pinsrw(xmm0, eax, 0x3); + 
mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x4); + mov(ax, word[A2+LDA*1-0x80]); + pinsrw(xmm0, eax, 0x5); + mov(ax, word[A2+LDA*2-0x80]); + pinsrw(xmm0, eax, 0x6); + mov(ax, word[A2+LDA3*1-0x80]); + sub(A2, -2); + pinsrw(xmm0, eax, 0x7); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm6, xmm6); + pmovsxwd(xmm6, xmm6); + paddd(xmm9, xmm6); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l498); + test(M, 0x1); + jle(l51c, T_NEAR); + mov(al, byte[A1-0x80]); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A1+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A1+LDA3*1-0x80]); + pinsrb(xmm0, eax, 0x3); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x4); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x5); + mov(al, byte[A2+LDA*2-0x80]); + pinsrb(xmm0, eax, 0x6); + mov(al, byte[A2+LDA3*1-0x80]); + pinsrb(xmm0, eax, 0x7); + pmovsxbd(xmm5, xmm0); + pshufd(xmm6, xmm0, 0x55); + pmovsxbd(xmm6, xmm6); + paddd(xmm8, xmm5); + paddd(xmm9, xmm6); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l51c); + mov(A1, qword[ARG_BIAS]); + movdqu(xword[A1], xmm8); + movdqu(xword[A1+0x10], xmm9); + add(qword[ARG_BIAS], 0x20); + sub(N, 0x8); + cmp(N, 0x8); + jge(l20, T_NEAR); + align(4); + +L(l540); + cmp(N, 0x4); + jl(l81c, T_NEAR); + align(4); + +L(l54c); + mov(A1, A); + lea(A2, ptr[A1+LDA*2]); + lea(I, ptr[A1+LDA*4]); + mov(A, I); + pxor(xmm7, xmm7); + mov(I, M); + sar(I, 0x4); + jle(l664, T_NEAR); + align(4); + +L(l56c); + movdqu(xmm0, xword[A1-0x80]); + movdqu(xmm1, xword[A1+LDA*1-0x80]); + sub(A1, -16); + movdqu(xmm2, xword[A2-0x80]); + movdqu(xmm3, xword[A2+LDA*1-0x80]); + sub(A2, -16); + movdqa(xmm4, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm4, xmm1); + movdqa(xmm5, xmm2); + punpckldq(xmm2, xmm3); + punpckhdq(xmm5, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + movdqa(xmm3, xmm4); + punpcklqdq(xmm4, xmm5); + punpckhqdq(xmm3, xmm5); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x80], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x70], xmm1); + pmovsxbw(xmm5, xmm4); + movhlps(xmm6, xmm4); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x60], xmm4); + pmovsxbw(xmm5, xmm3); + movhlps(xmm6, xmm3); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x50], xmm3); + sub(B, -64); + dec(I); + jg(l56c, T_NEAR); + align(4); + +L(l664); + test(M, 0x8); + jle(l6f8, T_NEAR); + movq(xmm0, qword[A1-0x80]); + movq(xmm1, qword[A1+LDA*1-0x80]); + sub(A1, -8); + movq(xmm2, qword[A2-0x80]); + movq(xmm3, qword[A2+LDA*1-0x80]); + sub(A2, -8); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklqdq(xmm0, xmm2); + punpckhqdq(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x80], xmm0); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, 
xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x70], xmm1); + sub(B, -32); + align(4); + +L(l6f8); + test(M, 0x4); + jle(l75c, T_NEAR); + movd(xmm0, dword[A1-0x80]); + movd(xmm1, dword[A1+LDA*1-0x80]); + sub(A1, -4); + movd(xmm2, dword[A2-0x80]); + movd(xmm3, dword[A2+LDA*1-0x80]); + sub(A2, -4); + punpckldq(xmm0, xmm1); + punpckldq(xmm2, xmm3); + punpcklqdq(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l75c); + test(M, 0x2); + jle(l7b4, T_NEAR); + mov(ax, word[A1-0x80]); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1+LDA*1-0x80]); + sub(A1, -2); + pinsrw(xmm0, eax, 0x1); + mov(ax, word[A2-0x80]); + pinsrw(xmm0, eax, 0x2); + mov(ax, word[A2+LDA*1-0x80]); + sub(A2, -2); + pinsrw(xmm0, eax, 0x3); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l7b4); + test(M, 0x1); + jle(l7fc, T_NEAR); + mov(al, byte[A1-0x80]); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A2+LDA*1-0x80]); + pinsrb(xmm0, eax, 0x3); + pmovsxbd(xmm5, xmm0); + paddd(xmm7, xmm5); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l7fc); + mov(A1, qword[ARG_BIAS]); + movdqu(xword[A1], xmm7); + add(qword[ARG_BIAS], 0x10); + sub(N, 0x4); + cmp(N, 0x4); + jge(l54c, T_NEAR); + align(4); + +L(l81c); + cmp(N, 0x2); + jl(la0a, T_NEAR); + align(4); + +L(l828); + mov(A1, A); + lea(A2, ptr[A1+LDA*1]); + lea(I, ptr[A1+LDA*2]); + mov(A, I); + pxor(xmm7, xmm7); + mov(I, M); + sar(I, 0x4); + jle(l8d8, T_NEAR); + align(4); + +L(l848); + movdqu(xmm0, xword[A1-0x80]); + sub(A1, -16); + movdqu(xmm1, xword[A2-0x80]); + sub(A2, -16); + movdqa(xmm2, xmm0); + punpckldq(xmm0, xmm1); + punpckhdq(xmm2, xmm1); + pshufd(xmm6, xmm0, 0xd8); + pmovsxbw(xmm5, xmm6); + movhlps(xmm6, xmm6); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x80], xmm0); + pshufd(xmm6, xmm2, 0xd8); + pmovsxbw(xmm5, xmm6); + movhlps(xmm6, xmm6); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x70], xmm2); + sub(B, -32); + dec(I); + jg(l848, T_NEAR); + align(4); + +L(l8d8); + test(M, 0x8); + jle(l930, T_NEAR); + movq(xmm0, qword[A1-0x80]); + sub(A1, -8); + movq(xmm1, qword[A2-0x80]); + sub(A2, -8); + punpckldq(xmm0, xmm1); + pshufd(xmm6, xmm0, 0xd8); + pmovsxbw(xmm5, xmm6); + movhlps(xmm6, xmm6); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l930); + test(M, 0x4); + jle(l974, T_NEAR); + movd(xmm0, dword[A1-0x80]); + sub(A1, -4); + movd(xmm1, dword[A2-0x80]); + sub(A2, -4); + punpckldq(xmm0, xmm1); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l974); + test(M, 0x2); + jle(l9b8, T_NEAR); + mov(ax, word[A1-0x80]); + sub(A1, -2); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A2-0x80]); + sub(A2, -2); + pinsrw(xmm0, eax, 0x1); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + 
movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l9b8); + test(M, 0x1); + jle(l9ec, T_NEAR); + mov(al, byte[A1-0x80]); + pinsrb(xmm0, eax, 0x0); + mov(byte[B-0x80], al); + mov(al, byte[A2-0x80]); + pinsrb(xmm0, eax, 0x1); + mov(byte[B-0x7f], al); + sub(B, -2); + pmovsxbd(xmm5, xmm0); + paddd(xmm7, xmm5); + align(4); + +L(l9ec); + mov(A1, qword[ARG_BIAS]); + movq(qword[A1], xmm7); + add(qword[ARG_BIAS], 0x8); + sub(N, 0x2); + cmp(N, 0x2); + jge(l828, T_NEAR); + align(4); + +L(la0a); + cmp(N, 0x1); + jl(lb58, T_NEAR); + align(4); + +L(la14); + mov(A1, A); + add(A, LDA); + pxor(xmm7, xmm7); + mov(I, M); + sar(I, 0x4); + jle(la6c, T_NEAR); + align(4); + +L(la28); + movdqu(xmm0, xword[A1-0x80]); + sub(A1, -16); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + dec(I); + jg(la28, T_NEAR); + align(4); + +L(la6c); + test(M, 0x8); + jle(laa8, T_NEAR); + movq(xmm0, qword[A1-0x80]); + sub(A1, -8); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(laa8); + test(M, 0x4); + jle(lae0, T_NEAR); + movd(xmm0, dword[A1-0x80]); + sub(A1, -4); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(lae0); + test(M, 0x2); + jle(lb14, T_NEAR); + mov(ax, word[A1-0x80]); + pinsrw(xmm0, eax, 0x0); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + mov(word[B-0x80], ax); + sub(A1, -2); + sub(B, -2); + align(4); + +L(lb14); + test(M, 0x1); + jle(lb38, T_NEAR); + mov(al, byte[A1-0x80]); + pinsrb(xmm0, eax, 0x0); + pmovsxbd(xmm5, xmm0); + paddd(xmm7, xmm5); + mov(byte[B-0x80], al); + sub(B, -1); + align(4); + +L(lb38); + mov(A1, qword[ARG_BIAS]); + movd(dword[A1], xmm7); + add(qword[ARG_BIAS], 0x4); + sub(N, 0x1); + cmp(N, 0x1); + jge(la14, T_NEAR); + align(4); + +L(lb58); + + postamble(); +} +outLocalLabel(); + +#undef M +#undef N +#undef A +#undef LDA +#undef ALPHA +#undef B +#undef I +#undef A1 +#undef A2 +#undef LDA3 +#ifdef _WIN32 +#undef ARG_ALPHA +#undef ARG_B +#endif +#undef ARG_BIAS +} + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_sum_bt_kern.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_sum_bt_kern.cpp new file mode 100644 index 0000000..afe4f17 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_sum_bt_kern.cpp @@ -0,0 +1,647 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "jit_generator.hpp" +#include "common.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +jit_avx512_core_u8_copy_sum_bt_kern::jit_avx512_core_u8_copy_sum_bt_kern(): jit_generator(nullptr, GEMM_CODE_SIZE) +{ + +#ifndef _WIN32 +#define M rdi +#define N rsi +#define A rdx +#define LDA rcx +#define ALPHA r8 +#define B r9 + +#define I rax +#define A1 r10 +#define A2 r8 +#define LDA3 r11 + +#define ARG_BIAS 24+stacksize+rsp + +#else + +#define M rcx +#define N rdx +#define A r8 +#define LDA r9 +#define ALPHA rax +#define B rdi + +#define I rax +#define A1 rsi +#define A2 r10 +#define LDA3 r11 + +#define ARG_ALPHA 40+stacksize+rsp +#define ARG_B 48+stacksize+rsp +#define ARG_BIAS 72+stacksize+rsp + +#endif + +inLocalLabel(); +{ + +Xbyak::Label l15c; +Xbyak::Label l1f4; +Xbyak::Label l20; +Xbyak::Label l248; +Xbyak::Label l280; +Xbyak::Label l2a4; +Xbyak::Label l2b0; +Xbyak::Label l2c8; +Xbyak::Label l384; +Xbyak::Label l3e8; +Xbyak::Label l40; +Xbyak::Label l424; +Xbyak::Label l448; +Xbyak::Label l468; +Xbyak::Label l474; +Xbyak::Label l48c; +Xbyak::Label l550; +Xbyak::Label l5bc; +Xbyak::Label l600; +Xbyak::Label l628; +Xbyak::Label l646; +Xbyak::Label l650; +Xbyak::Label l668; +Xbyak::Label l700; +Xbyak::Label l760; +Xbyak::Label l7a4; +Xbyak::Label l7c8; +Xbyak::Label l7e8; + + preamble(); + auto stacksize = get_size_of_abi_save_regs(); +#ifdef _WIN32 + mov(ALPHA, ptr[ARG_ALPHA]); + mov(B, ptr[ARG_B]); +#endif + + mov(M, qword[M]); + mov(N, qword[N]); + mov(LDA, qword[LDA]); + lea(LDA3, ptr[LDA+LDA*2]); + sub(A, -128); + sub(B, -128); + cmp(N, 0x8); + jl(l2a4, T_NEAR); + align(4); + +L(l20); + mov(A1, A); + add(A, 0x8); + pxor(xmm8, xmm8); + pxor(xmm9, xmm9); + mov(I, M); + sar(I, 0x3); + jle(l15c, T_NEAR); + align(4); + +L(l40); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + movq(xmm1, qword[A1-0x80]); + add(A1, LDA); + movq(xmm2, qword[A1-0x80]); + add(A1, LDA); + movq(xmm3, qword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm1); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + movq(xmm1, qword[A1-0x80]); + add(A1, LDA); + movq(xmm2, qword[A1-0x80]); + add(A1, LDA); + movq(xmm3, qword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x60], xmm0); + movdqu(xword[B-0x50], xmm1); + sub(B, -64); + dec(I); + jg(l40, T_NEAR); + align(4); + +L(l15c); + test(M, 0x4); + jle(l1f4, T_NEAR); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + movq(xmm1, qword[A1-0x80]); + add(A1, LDA); + movq(xmm2, qword[A1-0x80]); + add(A1, LDA); + movq(xmm3, qword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); 
+ punpcklbw(xmm2, xmm3); + movdqa(xmm1, xmm0); + punpcklwd(xmm0, xmm2); + punpckhwd(xmm1, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + pmovsxbw(xmm5, xmm1); + movhlps(xmm6, xmm1); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm9, xmm5); + movdqu(xword[B-0x80], xmm0); + movdqu(xword[B-0x70], xmm1); + sub(B, -32); + align(4); + +L(l1f4); + test(M, 0x2); + jle(l248, T_NEAR); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + movq(xmm1, qword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm8, xmm5); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm6, xmm6); + pmovsxwd(xmm6, xmm6); + paddd(xmm9, xmm6); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l248); + test(M, 0x1); + jle(l280, T_NEAR); + movq(xmm0, qword[A1-0x80]); + add(A1, LDA); + pmovsxbd(xmm5, xmm0); + pshufd(xmm6, xmm0, 0x55); + pmovsxbd(xmm6, xmm6); + paddd(xmm8, xmm5); + paddd(xmm9, xmm6); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l280); + mov(A1, qword[ARG_BIAS]); + movdqu(xword[A1], xmm8); + movdqu(xword[A1+0x10], xmm9); + add(qword[ARG_BIAS], 0x20); + sub(N, 0x8); + cmp(N, 0x8); + jge(l20, T_NEAR); + align(4); + +L(l2a4); + cmp(N, 0x4); + jl(l468, T_NEAR); + align(4); + +L(l2b0); + mov(A1, A); + add(A, 0x4); + pxor(xmm7, xmm7); + mov(I, M); + sar(I, 0x3); + jle(l384, T_NEAR); + align(4); + +L(l2c8); + movd(xmm0, dword[A1-0x80]); + add(A1, LDA); + movd(xmm1, dword[A1-0x80]); + add(A1, LDA); + movd(xmm2, dword[A1-0x80]); + add(A1, LDA); + movd(xmm3, dword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x80], xmm0); + movd(xmm0, dword[A1-0x80]); + add(A1, LDA); + movd(xmm1, dword[A1-0x80]); + add(A1, LDA); + movd(xmm2, dword[A1-0x80]); + add(A1, LDA); + movd(xmm3, dword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x70], xmm0); + sub(B, -32); + dec(I); + jg(l2c8, T_NEAR); + align(4); + +L(l384); + test(M, 0x4); + jle(l3e8, T_NEAR); + movd(xmm0, dword[A1-0x80]); + add(A1, LDA); + movd(xmm1, dword[A1-0x80]); + add(A1, LDA); + movd(xmm2, dword[A1-0x80]); + add(A1, LDA); + movd(xmm3, dword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + movhlps(xmm6, xmm0); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + align(4); + +L(l3e8); + test(M, 0x2); + jle(l424, T_NEAR); + movd(xmm0, dword[A1-0x80]); + add(A1, LDA); + movd(xmm1, dword[A1-0x80]); + add(A1, LDA); + punpcklbw(xmm0, xmm1); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l424); + test(M, 0x1); + jle(l448, T_NEAR); + movd(xmm0, dword[A1-0x80]); + pmovsxbd(xmm5, xmm0); + paddd(xmm7, xmm5); + movd(dword[B-0x80], xmm0); + sub(B, -4); + 
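Why these sums are worth the extra instructions: with scalar offsets $a_o$ and $b_o$ applied to op(A) ($m \times k$) and op(B) ($k \times n$), the product the s8x8s32 GEMM contract defines expands as

    \sum_{l=1}^{k} (a_{il} + a_o)(b_{lj} + b_o)
        = \sum_l a_{il} b_{lj} + a_o \sum_l b_{lj} + b_o \sum_l a_{il}
          + k \, a_o b_o,

so recording row and column sums once, while packing, lets the driver fold the whole offset contribution into C after a plain integer GEMM instead of widening every element. The reference implementation introduced below takes the opposite, brute-force route and widens everything to double.
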
align(4); + +L(l448); + mov(A1, qword[ARG_BIAS]); + movdqu(xword[A1], xmm7); + add(qword[ARG_BIAS], 0x10); + sub(N, 0x4); + cmp(N, 0x4); + jge(l2b0, T_NEAR); + align(4); + +L(l468); + cmp(N, 0x2); + jl(l646, T_NEAR); + align(4); + +L(l474); + mov(A1, A); + add(A, 0x2); + pxor(xmm7, xmm7); + mov(LDA3, M); + sar(LDA3, 0x3); + jle(l550, T_NEAR); + align(4); + +L(l48c); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm1, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm2, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm3, eax, 0x0); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm1, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm2, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm3, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm4, eax, 0x0); + punpcklbw(xmm1, xmm2); + punpcklbw(xmm3, xmm4); + punpcklwd(xmm1, xmm3); + punpcklqdq(xmm0, xmm1); + pshufd(xmm6, xmm0, 0xd8); + pmovsxbw(xmm5, xmm6); + movhlps(xmm6, xmm6); + pmovsxbw(xmm6, xmm6); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movdqu(xword[B-0x80], xmm0); + sub(B, -16); + dec(LDA3); + jg(l48c, T_NEAR); + align(4); + +L(l550); + test(M, 0x4); + jle(l5bc, T_NEAR); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm1, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm2, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm3, eax, 0x0); + punpcklbw(xmm0, xmm1); + punpcklbw(xmm2, xmm3); + punpcklwd(xmm0, xmm2); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movq(qword[B-0x80], xmm0); + sub(B, -8); + align(4); + +L(l5bc); + test(M, 0x2); + jle(l600, T_NEAR); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm0, eax, 0x0); + mov(ax, word[A1-0x80]); + add(A1, LDA); + pinsrw(xmm1, eax, 0x0); + punpcklbw(xmm0, xmm1); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + movd(dword[B-0x80], xmm0); + sub(B, -4); + align(4); + +L(l600); + test(M, 0x1); + jle(l628, T_NEAR); + mov(ax, word[A1-0x80]); + pinsrw(xmm0, eax, 0x0); + pmovsxbd(xmm5, xmm0); + paddd(xmm7, xmm5); + mov(word[B-0x80], ax); + sub(B, -2); + align(4); + +L(l628); + mov(A1, qword[ARG_BIAS]); + movq(qword[A1], xmm7); + add(qword[ARG_BIAS], 0x8); + sub(N, 0x2); + cmp(N, 0x2); + jge(l474, T_NEAR); + align(4); + +L(l646); + cmp(N, 0x1); + jl(l7e8, T_NEAR); + align(4); + +L(l650); + mov(A1, A); + add(A, 0x1); + pxor(xmm7, xmm7); + mov(LDA3, M); + sar(LDA3, 0x3); + jle(l700, T_NEAR); + align(4); + +L(l668); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x0); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x1); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x2); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x3); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x4); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x5); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x6); + mov(al, byte[A1-0x80]); + add(A1, LDA); + pinsrb(xmm0, eax, 0x7); + pmovsxbw(xmm5, xmm0); + phaddw(xmm5, xmm6); + phaddw(xmm5, xmm5); + phaddw(xmm5, xmm5); + pmovsxwd(xmm5, xmm5); + paddd(xmm7, xmm5); + 
movq(qword[B-0x80], xmm0);
+    sub(B, -8);
+    dec(LDA3);
+    jg(l668, T_NEAR);
+    align(4);
+
+L(l700);
+    test(M, 0x4);
+    jle(l760, T_NEAR);
+    mov(al, byte[A1-0x80]);
+    add(A1, LDA);
+    pinsrb(xmm0, eax, 0x0);
+    mov(al, byte[A1-0x80]);
+    add(A1, LDA);
+    pinsrb(xmm0, eax, 0x1);
+    mov(al, byte[A1-0x80]);
+    add(A1, LDA);
+    pinsrb(xmm0, eax, 0x2);
+    mov(al, byte[A1-0x80]);
+    add(A1, LDA);
+    pinsrb(xmm0, eax, 0x3);
+    pmovsxbw(xmm5, xmm0);
+    phaddw(xmm5, xmm5);
+    phaddw(xmm5, xmm5);
+    pmovsxwd(xmm5, xmm5);
+    paddd(xmm7, xmm5);
+    movd(dword[B-0x80], xmm0);
+    sub(B, -4);
+    align(4);
+
+L(l760);
+    test(M, 0x2);
+    jle(l7a4, T_NEAR);
+    mov(al, byte[A1-0x80]);
+    add(A1, LDA);
+    pinsrb(xmm0, eax, 0x0);
+    mov(byte[B-0x80], al);
+    mov(al, byte[A1-0x80]);
+    add(A1, LDA);
+    pinsrb(xmm0, eax, 0x1);
+    pmovsxbw(xmm5, xmm0);
+    phaddw(xmm5, xmm5);
+    pmovsxwd(xmm5, xmm5);
+    paddd(xmm7, xmm5);
+    mov(byte[B-0x7f], al);
+    sub(B, -2);
+    align(4);
+
+L(l7a4);
+    test(M, 0x1);
+    jle(l7c8, T_NEAR);
+    mov(al, byte[A1-0x80]);
+    pinsrw(xmm0, eax, 0x0);
+    pmovsxbd(xmm5, xmm0);
+    paddd(xmm7, xmm5);
+    mov(byte[B-0x80], al);
+    sub(B, -1);
+    align(4);
+
+L(l7c8);
+    mov(A1, qword[ARG_BIAS]);
+    movd(dword[A1], xmm7);
+    add(qword[ARG_BIAS], 0x4);
+    sub(N, 0x1);
+    cmp(N, 0x1);
+    jge(l650, T_NEAR);
+    align(4);
+
+L(l7e8);
+
+    postamble();
+}
+outLocalLabel();
+
+#undef M
+#undef N
+#undef A
+#undef LDA
+#undef ALPHA
+#undef B
+#undef I
+#undef A1
+#undef A2
+#undef LDA3
+#ifdef _WIN32
+#undef ARG_ALPHA
+#undef ARG_B
+#endif
+#undef ARG_BIAS
+}
+
+}
+}
+}
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/ref_gemm_s8x8s32.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/ref_gemm_s8x8s32.cpp
new file mode 100644
index 0000000..4fc11af
--- /dev/null
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/s8x8s32/ref_gemm_s8x8s32.cpp
@@ -0,0 +1,116 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <stdint.h>
+
+#include "math_utils.hpp"
+#include "mkldnn_thread.hpp"
+#include "utils.hpp"
+
+#include "../f32/ref_gemm_f32.hpp"
+#include "jit_generator.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+template <typename b_dt>
+mkldnn_status_t ref_gemm_s8x8s32(const char *transa, const char *transb,
+        const char *offsetc, const int *M, const int *N, const int *K,
+        const float *alpha, const int8_t *A, const int *LDA, const int8_t *ao,
+        const b_dt *B, const int *LDB, const int8_t *bo, const float *beta,
+        int32_t *C, const int *LDC, const int32_t *co) {
+
+    if (*M == 0 || *N == 0 || *K == 0)
+        return mkldnn_success;
+
+    bool OCisR = (*offsetc == 'R' || *offsetc == 'r');
+    bool OCisC = (*offsetc == 'C' || *offsetc == 'c');
+    bool AisN = (*transa == 'N' || *transa == 'n');
+    bool BisN = (*transb == 'N' || *transb == 'n');
+
+    int m = *M, n = *N, k = *K, lda = *LDA, ldb = *LDB, ldc = *LDC;
+    size_t sizeA = AisN ? lda * k : lda * m;
+    size_t sizeB = BisN ? ldb * n : ldb * k;
+    size_t sizeC = ldc * n;
+
+    double *dA = (double *)malloc(sizeA * sizeof(double), PAGE_4K);
+    double *dB = (double *)malloc(sizeB * sizeof(double), PAGE_4K);
+    double *dC = (double *)malloc(sizeC * sizeof(double), PAGE_4K);
+
+    if (utils::any_null(dA, dB, dC)) {
+        free(dA);
+        free(dB);
+        free(dC);
+        return mkldnn_out_of_memory;
+    }
+
+    auto da_setter = [=] (int i, int j, double v) { dA[j * lda + i] = v; };
+    auto db_setter = [=] (int i, int j, double v) { dB[j * ldb + i] = v; };
+
+    auto ia_accessor = [=] (int i, int j) { return A[j * lda + i]; };
+    auto ib_accessor = [=] (int i, int j) { return B[j * ldb + i]; };
+
+    const int a_rows = AisN ? m : k;
+    const int a_cols = AisN ? k : m;
+    mkldnn::impl::parallel_nd(a_cols, a_rows, [&](int j, int i) {
+        da_setter(i, j,
+            static_cast<double>(ia_accessor(i, j)) + static_cast<double>(ao[0]));
+    });
+
+    const int b_rows = BisN ? k : n;
+    const int b_cols = BisN ? n : k;
+    mkldnn::impl::parallel_nd(b_cols, b_rows, [&](int j, int i) {
+        db_setter(i, j,
+            static_cast<double>(ib_accessor(i, j)) + static_cast<double>(bo[0]));
+    });
+    double one = 1.0, zero = 0.0;
+    ref_gemm<double>(transa, transb, M, N, K, &one, dA, LDA, dB, LDB, &zero,
+            dC, LDC, nullptr);
+
+    auto i2d = [=] (int32_t v) { return static_cast<double>(v); };
+    auto f2d = [=] (float v) { return static_cast<double>(v); };
+
+    mkldnn::impl::parallel_nd(n, m, [&] (int j, int i) {
+        double coffset = OCisR ? i2d(co[j]) : OCisC ? i2d(co[i]) : i2d(co[0]);
+        double val = ((*beta == 0.0f) ? 0.0 : f2d(*beta) * i2d(C[i + j * ldc]))
+            + f2d(*alpha) * dC[i + j * ldc] + coffset;
+        C[i + j * ldc] = math::out_round<int32_t>(math::saturate<int32_t>(val));
+    });
+
+    free(dA);
+    free(dB);
+    free(dC);
+    return mkldnn_success;
+}
+
+template mkldnn_status_t ref_gemm_s8x8s32<uint8_t>(
+        const char *transa, const char *transb, const char *offsetc,
+        const int *M, const int *N, const int *K,
+        const float *alpha, const int8_t *A, const int *LDA, const int8_t *ao,
+        const uint8_t *B, const int *LDB, const int8_t *bo,
+        const float *beta, int32_t *C, const int *LDC, const int32_t *co);
+
+template mkldnn_status_t ref_gemm_s8x8s32<int8_t>(
+        const char *transa, const char *transb, const char *offsetc,
+        const int *M, const int *N, const int *K,
+        const float *alpha, const int8_t *A, const int *LDA, const int8_t *ao,
+        const int8_t *B, const int *LDB, const int8_t *bo,
+        const float *beta, int32_t *C, const int *LDC, const int32_t *co);
+
+}
+}
+}
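For orientation, ref_gemm_s8x8s32 is the exact-but-slow fallback: it widens both operands to double, folds ao/bo in during the copy, reuses the reference GEMM in double precision, and only then applies alpha, beta and the co offsets with round-and-saturate to int32. A minimal column-major call sketch, assuming the declaration above is in scope (the values and zero offsets are illustrative only):

    #include <cstdint>

    using namespace mkldnn::impl::cpu;

    void ref_igemm_example() {
        const char ta = 'N', tb = 'N', offc = 'F'; // 'F': one fixed co for all of C
        int M = 2, N = 2, K = 2, lda = 2, ldb = 2, ldc = 2;
        float alpha = 1.f, beta = 0.f;
        int8_t A[4] = {1, 2, 3, 4};  // column-major: A = [1 3; 2 4]
        uint8_t B[4] = {5, 6, 7, 8}; // column-major: B = [5 7; 6 8]
        int8_t ao = 0, bo = 0;
        int32_t C[4] = {0}, co = 0;
        ref_gemm_s8x8s32<uint8_t>(&ta, &tb, &offc, &M, &N, &K, &alpha,
                A, &lda, &ao, B, &ldb, &bo, &beta, C, &ldc, &co);
        // C == {23, 34, 31, 46}, i.e. [23 31; 34 46] = A * B
    }
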
+*******************************************************************************/
+
+#ifndef REF_GEMM_S8X8S32_HPP
+#define REF_GEMM_S8X8S32_HPP
+
+#include <stdint.h>
+
+#include "mkldnn_types.h"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+template <typename b_dt>
+mkldnn_status_t ref_gemm_s8x8s32(const char *transa, const char *transb,
+        const char *offsetc, const int *M, const int *N, const int *K,
+        const float *alpha, const int8_t *A, const int *LDA, const int8_t *ao,
+        const b_dt *B, const int *LDB, const int8_t *bo, const float *beta,
+        int32_t *C, const int *LDC, const int32_t *co);
+
+}
+}
+}
+#endif
+
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution.cpp
index c403e45..154b5c3 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution.cpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution.cpp
@@ -14,7 +14,6 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include 
 #include "mkldnn_types.h"
 
@@ -22,7 +21,6 @@
 #include "utils.hpp"
 #include "type_helpers.hpp"
 #include "mkldnn_thread.hpp"
-
 #include "ref_eltwise.hpp"
 
 namespace mkldnn {
@@ -31,20 +29,22 @@ namespace cpu {
 
 using namespace mkldnn::impl::status;
 using namespace mkldnn::impl::memory_format;
+using namespace mkldnn::impl::memory_tracking::names;
 using namespace mkldnn::impl::utils;
 
-template <bool with_relu>
-void _gemm_convolution_fwd_t<with_relu>::execute_forward() {
+void gemm_convolution_fwd_t::execute_forward() const {
     auto src = reinterpret_cast<const data_t *>(this->input_memory(0));
     auto weights = reinterpret_cast<const data_t *>(this->input_memory(1));
     auto bias = reinterpret_cast<const data_t *>(this->input_memory(2));
     auto dst = reinterpret_cast<data_t *>(this->memory());
 
-    jit_gemm_conv_conf_t &jcp = this->conf_.jcp_;
-    const int MB = conf_.MB();
+    auto col = scratchpad().get<data_t>(key_conv_gemm_col);
+
+    const auto &jcp = this->pd()->jcp_;
+    const int MB = pd()->MB();
 
-    const memory_desc_wrapper src_d(conf_.src_pd());
-    const memory_desc_wrapper dst_d(conf_.dst_pd());
+    const memory_desc_wrapper src_d(pd()->src_pd());
+    const memory_desc_wrapper dst_d(pd()->dst_pd());
 
     const int M = jcp.os * jcp.od;
     const size_t src_step = (src_d.blk_off(1) - src_d.off_l(0)) / jcp.ngroups;
@@ -53,60 +53,68 @@ void _gemm_convolution_fwd_t<with_relu>::execute_forward() {
     src += src_d.off_l(0);
     dst += dst_d.off_l(0);
 
+    assert(IMPLICATION(
+            jcp.id != 1, jcp.oh_block == jcp.oh && jcp.ow_block == jcp.ow));
+    assert(IMPLICATION(jcp.ow_block != jcp.ow, jcp.oh_block == 1));
+
     const int K = jcp.ic * jcp.ks;
     const int N = jcp.oc;
-    const int m = jcp.os;
-    const int LDA = jcp.im2col_sz ? m : M;
-
-    const data_t one = 1.0;
-
-    data_t *col = (jcp.im2col_sz)
-        ? 
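// col is the per-thread im2col scratch buffer (jcp.im2col_sz elements per
// thread); im2col_sz == 0 means a 1x1, unit-stride convolution whose GEMM
// can read src directly.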
(data_t *)this->scratchpad_->get() - : nullptr; - parallel_nd(jcp.im2col_sz * jcp.nthr, - [&](ptrdiff_t i) { col[i] = (data_t)0; }); + if (jcp.im2col_sz && jcp.id != 1) + parallel_nd(jcp.im2col_sz * jcp.nthr, + [&](ptrdiff_t i) { col[i] = (data_t)0; }); - const size_t work_amount = jcp.ngroups * MB * jcp.od; + const int nb_oh = div_up(jcp.oh, jcp.oh_block); + const int nb_ow = div_up(jcp.ow, jcp.ow_block); + const size_t work_amount = jcp.ngroups * MB * jcp.od * nb_oh * nb_ow; parallel(jcp.nthr, [&](const int ithr, const int nthr) { data_t *_col = col + (ptrdiff_t)ithr * jcp.im2col_sz; - int g{0}, n{0}, od{0}; + int g{ 0 }, n{ 0 }, od{ 0 }, ohb{ 0 }, owb{ 0 }; size_t start = 0, end = 0; balance211(work_amount, nthr, ithr, start, end); - nd_iterator_init(start, g, jcp.ngroups, n, MB, od, jcp.od); - + nd_iterator_init(start, g, jcp.ngroups, n, MB, od, jcp.od, ohb, + nb_oh, owb, nb_ow); for (size_t iwork = start; iwork < end; ++iwork) { + int oh = ohb * jcp.oh_block; + int ow = owb * jcp.ow_block; const data_t *_src = src + (n * jcp.ngroups + g) * src_step; const data_t *_weights = weights + g * weights_g_size; - data_t *_dst = dst + (n * jcp.ngroups + g) * dst_step; - + data_t *_dst_im = dst + (n * jcp.ngroups + g) * dst_step; + const int h_step = nstl::min(jcp.oh_block, jcp.oh - oh); + const int w_step = nstl::min(jcp.ow_block, jcp.ow - ow); if (jcp.im2col_sz) { if (jcp.id == 1) - jit_gemm_convolution_utils::im2col(jcp, _src, _col); + jit_gemm_convolution_utils::im2col( + jcp, _src, _col, oh, h_step, ow, w_step); else jit_gemm_convolution_utils::im2col_3d(jcp, _src, _col, od); } const data_t one = 1.0; + + const int m = h_step * w_step; + const int LDA = jcp.im2col_sz ? m : M; + data_t *_dst = _dst_im + od * jcp.os + oh * jcp.ow + ow; + extended_sgemm("N", "N", &m, &N, &K, &one, jcp.im2col_sz ? _col : _src + od * m, &LDA, _weights, &K, - &this->beta_, _dst + od * m, &M); + &this->beta_, _dst, &M); - const auto &p = conf_.attr()->post_ops_; + data_t *d = _dst; + const auto &p = pd()->attr()->post_ops_; bool need_bias = jcp.with_bias; if (use_fast_relu) { - data_t *d = _dst + od * m; - - for (int oc = 0; oc < jcp.oc; ++oc) { + parallel_nd(jcp.oc, [&](const int oc) { data_t b = need_bias ? bias[g * jcp.oc + oc] : 0; + data_t *d_ = d + oc * M; + PRAGMA_OMP_SIMD() for (int oS = 0; oS < m; ++oS) { - d[oS] += b; - if (d[oS] < 0) d[oS] *= fast_relu_ns; + d_[oS] += b; + if (d_[oS] < 0) d_[oS] *= fast_relu_ns; } - d += M; - } + }); need_bias = false; } else if (p.len_ > 0) { @@ -114,17 +122,17 @@ void _gemm_convolution_fwd_t::execute_forward() { int depthwise_inj_idx = 0; for (int i = 0; i < p.len_; i++) { - data_t *d = _dst + od * m; auto& post_op = p.entry_[i]; if (post_op.is_eltwise()) { - for (int oc = 0; oc < jcp.oc; ++oc) { + parallel_nd(jcp.oc, [&](const int oc) { data_t b = need_bias ? bias[g * jcp.oc + oc] : 0; + data_t *d_ = d + oc * M; + PRAGMA_OMP_SIMD() for (int oS = 0; oS < m; ++oS) { - d[oS] += b; - d[oS] = eltwise_injectors[eltwise_inj_idx]->compute_scalar(d[oS]); + d_[oS] += b; + d_[oS] = eltwise_injectors[eltwise_inj_idx]->compute_scalar(d_[oS]); } - d += M; - } + }); eltwise_inj_idx++; need_bias = false; @@ -132,16 +140,17 @@ void _gemm_convolution_fwd_t::execute_forward() { auto depthwise_weights = post_op.depthwise.weights_data; auto depthwise_bias = post_op.depthwise.biases_data; - for (int oc = 0; oc < jcp.oc; ++oc) { + parallel_nd(jcp.oc, [&](const int oc) { data_t b = need_bias ? 
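                // the bias is folded into the same pass as the fast ReLU;
                // need_bias is cleared afterwards so any later post-op pass
                // sees already-biased values.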
bias[g * jcp.oc + oc] : 0; + data_t *d_ = d + oc * M; + PRAGMA_OMP_SIMD() for (int oS = 0; oS < m; ++oS) { - d[oS] += b; - d[oS] = depthwise_injectors[depthwise_inj_idx]->compute_scalar(d[oS], + d_[oS] += b; + d_[oS] = depthwise_injectors[depthwise_inj_idx]->compute_scalar(d_[oS], depthwise_weights + g * jcp.oc + oc, depthwise_bias + g * jcp.oc + oc); } - d += M; - } + }); depthwise_inj_idx++; need_bias = false; @@ -150,46 +159,53 @@ void _gemm_convolution_fwd_t::execute_forward() { } if (need_bias) { - data_t *d = _dst + od * m; - - for (int oc = 0; oc < jcp.oc; ++oc) { + parallel_nd(jcp.oc, [&](const int oc) { data_t b = bias[g * jcp.oc + oc]; + data_t *d_ = d + oc * M; + PRAGMA_OMP_SIMD() for (int oS = 0; oS < m; ++oS) { - d[oS] += b; + d_[oS] += b; } - d += M; - } + }); } - nd_iterator_step(g, jcp.ngroups, n, MB, od, jcp.od); + nd_iterator_step(g, jcp.ngroups, n, MB, od, jcp.od, ohb, nb_oh, + owb, nb_ow); } }); } -void gemm_convolution_bwd_data_t::execute_backward_data() { +void gemm_convolution_bwd_data_t::execute_backward_data() const { auto diff_dst = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto diff_src = reinterpret_cast(this->memory()); - jit_gemm_conv_conf_t &jcp = this->conf_.jcp_; - const int MB = conf_.MB(); + auto col = scratchpad().get(key_conv_gemm_col); + + const auto &jcp = this->pd()->jcp_; + const int MB = pd()->MB(); const int M = jcp.os * jcp.od; - const size_t src_step = jcp.ic * jcp.ih * jcp.iw * jcp.id; - const size_t dst_step = jcp.oc * M; + const size_t src_step_to_clean = jcp.ic * jcp.ih * jcp.iw * jcp.id; + const memory_desc_wrapper diff_src_d(pd()->diff_src_pd()); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const size_t src_step = diff_src_d.blk_off(1) / jcp.ngroups; + const size_t dst_step = diff_dst_d.blk_off(1) / jcp.ngroups; const size_t weights_g_size = jcp.ic * jcp.oc * jcp.ks; const int m = jcp.os; const int K = jcp.oc; const int N = jcp.ic * jcp.ks; const int LDC = jcp.im2col_sz ? m : M; - data_t *col = jcp.im2col_sz ? 
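// Backward data computes diff_src = wei^T * diff_dst with one GEMM per
// (group, image) pair, scattering back through col2im when an im2col
// buffer is in use.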
(data_t *)this->scratchpad_->get() : nullptr; const size_t work_amount = (size_t)jcp.ngroups * MB; if (jcp.id > 1) { - const ptrdiff_t diff_src_sz = (ptrdiff_t)(work_amount * src_step); - parallel_nd(diff_src_sz, [&](ptrdiff_t i) { diff_src[i] = (data_t)0; }); + for (size_t j = 0; j < work_amount; j++) { + int j_step = src_step * j; + const ptrdiff_t diff_src_sz = (ptrdiff_t)(src_step_to_clean); + parallel_nd(diff_src_sz, [&](ptrdiff_t i) { diff_src[j_step + i] = (data_t)0; }); + } } parallel(jcp.nthr, [&](const int ithr, const int nthr) { @@ -201,7 +217,7 @@ void gemm_convolution_bwd_data_t::execute_backward_data() { nd_iterator_init(start, g, jcp.ngroups, n, MB); for (size_t iwork = start; iwork < end; ++iwork) { - data_t *_diff_src = diff_src + (n * jcp.ngroups + g)*src_step; + data_t *_diff_src = diff_src + (n * jcp.ngroups + g) * src_step; const data_t *_weights = weights + g * weights_g_size; for (int od = 0; od < jcp.od; ++od) { const data_t *_diff_dst = diff_dst + (n * jcp.ngroups + g) @@ -226,13 +242,17 @@ void gemm_convolution_bwd_data_t::execute_backward_data() { }); } -void gemm_convolution_bwd_weights_t::execute_backward_weights() { +void gemm_convolution_bwd_weights_t::execute_backward_weights() const { auto src = reinterpret_cast(this->input_memory(0)); auto diff_dst = reinterpret_cast(this->input_memory(1)); auto diff_weights = reinterpret_cast(this->memory(0)); auto diff_bias = reinterpret_cast(this->memory(1)); - jit_gemm_conv_conf_t &jcp = this->conf_.jcp_; + auto col = scratchpad().get(key_conv_gemm_col); + auto wei_reduction = scratchpad().get(key_conv_wei_reduction); + + const jit_gemm_conv_conf_t &jcp = this->pd()->jcp_; + const int K = jcp.os * jcp.od; const size_t src_step = jcp.ic * jcp.ih * jcp.iw * jcp.id; const size_t dst_step = jcp.oc * K; @@ -243,15 +263,6 @@ void gemm_convolution_bwd_weights_t::execute_backward_weights() { const int M = jcp.ic * jcp.ks; const int LDA = jcp.im2col_sz ? 
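    // LDA is the leading dimension of the GEMM input: the compact im2col
    // buffer covers a single depth slice (k), while a direct read of the
    // source spans all depths (K = jcp.os * jcp.od).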
k : K; - data_t *col = nullptr, *wei_reduction = nullptr; - ptrdiff_t wei_offset = 0; - if (jcp.im2col_sz) { - col = (data_t *)this->scratchpad_->get(); - wei_offset = jcp.im2col_sz * jcp.nthr; - } - if (jcp.need_wei_reduction) - wei_reduction = (data_t *)this->scratchpad_->get() + wei_offset; - parallel_nd(jcp.im2col_sz * jcp.nthr, [&](ptrdiff_t i) { col[i] = (data_t)0; }); @@ -289,7 +300,8 @@ void gemm_convolution_bwd_weights_t::execute_backward_weights() { if (jcp.im2col_sz) { if (jcp.id == 1) - jit_gemm_convolution_utils::im2col(jcp, _src, _col); + jit_gemm_convolution_utils::im2col( + jcp, _src, _col, 0, jcp.oh, 0, jcp.ow); else jit_gemm_convolution_utils::im2col_3d(jcp, _src, _col, od); @@ -331,13 +343,10 @@ void gemm_convolution_bwd_weights_t::execute_backward_weights() { } } diff_bias[g*jcp.oc+oc] = db; - nd_iterator_step(g, jcp.ngroups, oc, jcp.oc); }); } } -template struct _gemm_convolution_fwd_t; -template struct _gemm_convolution_fwd_t; } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution.hpp index d0d65c1..2a0da52 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution.hpp @@ -18,11 +18,12 @@ #define CPU_JIT_GEMM_CONVOLUTION_HPP #include "c_types_map.hpp" +#include "memory_tracking.hpp" + #include "cpu_convolution_pd.hpp" #include "cpu_engine.hpp" #include "gemm_convolution_utils.hpp" #include "gemm/gemm.hpp" -#include "scratchpad.hpp" #include "ref_eltwise.hpp" #include "ref_depthwise.hpp" @@ -30,34 +31,15 @@ namespace mkldnn { namespace impl { namespace cpu { -template -struct _gemm_convolution_fwd_t: public cpu_primitive_t { - struct pd_t: public _cpu_convolution_fwd_pd_t { +struct gemm_convolution_fwd_t: public cpu_primitive_t { + struct pd_t: public cpu_convolution_fwd_pd_t { pd_t(engine_t *engine, - const typename pd_t::base_desc_t *adesc, - const primitive_attr_t *attr, + const convolution_desc_t *adesc, const primitive_attr_t *attr, const typename pd_t::base_class *hint_fwd_pd) - : _cpu_convolution_fwd_pd_t(engine, adesc, attr, - hint_fwd_pd) + : cpu_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) , jcp_() {} - DECLARE_COMMON_PD_T(GEMM_IMPL_STR, _gemm_convolution_fwd_t); - - inline memory_format_t src_format() - { - using namespace memory_format; - return (utils::pick(this->cdesc_().src_desc.ndims - 3, - ncw, nchw, ncdhw)); - } - inline memory_format_t wei_format() - { - using namespace memory_format; - return (this->with_groups() - ? 
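// with_groups() prepends the groups dimension to the weights layout:
// (g)oiw / (g)oihw / (g)oidhw for 1D / 2D / 3D convolutions.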
utils::pick(this->cdesc_().src_desc.ndims - 3, - goiw, goihw, goidhw) - : utils::pick(this->cdesc_().src_desc.ndims - 3, - oiw, oihw, oidhw)); - } + DECLARE_COMMON_PD_T(GEMM_IMPL_STR, gemm_convolution_fwd_t); virtual status_t init() override { using namespace prop_kind; @@ -67,26 +49,47 @@ struct _gemm_convolution_fwd_t: public cpu_primitive_t { bool ok = true && this->set_default_params() == status::success - && utils::one_of(this->cdesc_().prop_kind, forward_training, + && utils::one_of(this->desc()->prop_kind, forward_training, forward_inference) - && this->cdesc_().alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() && utils::everyone_is(data_type::f32, - this->cdesc_().src_desc.data_type, - this->cdesc_().weights_desc.data_type, - this->cdesc_().dst_desc.data_type) + this->desc()->src_desc.data_type, + this->desc()->weights_desc.data_type, + this->desc()->dst_desc.data_type) && IMPLICATION(this->with_bias(), data_type::f32 - == this->cdesc_().bias_desc.data_type) + == this->desc()->bias_desc.data_type) && this->src_pd_.desc()->format == src_format() && this->dst_pd_.desc()->format == src_format() && this->weights_pd_.desc()->format == wei_format() && this->is_gemm_conv_format(); - return ok ? status::success : status::unimplemented; + if (!ok) return status::unimplemented; + + auto scratchpad = scratchpad_registry().registrar(); + return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad, + *desc(), src_pd(), weights_pd(0), dst_pd(), + mkldnn_get_max_threads()); } jit_gemm_conv_conf_t jcp_; protected: + memory_format_t src_format() const { + using namespace memory_format; + const int ndims_sp = this->desc()->src_desc.ndims - 2; + return (utils::pick(ndims_sp - 1, ncw, nchw, ncdhw)); + } + + memory_format_t wei_format() const { + using namespace memory_format; + const int ndims_sp = this->desc()->src_desc.ndims - 2; + return (this->with_groups() + ? 
utils::pick(ndims_sp - 1, goiw, goihw, goidhw) + : utils::pick(ndims_sp - 1, oiw, oihw, oidhw)); + } + virtual status_t set_default_params() override { using namespace memory_format; if (this->src_pd_.desc()->format == any) @@ -97,11 +100,12 @@ struct _gemm_convolution_fwd_t: public cpu_primitive_t { CHECK(this->weights_pd_.set_format(wei_format())); if (this->bias_pd_.desc()->format == any) CHECK(this->bias_pd_.set_format(x)); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } virtual bool is_gemm_conv_format() const { - bool ok = true; auto const &po = this->attr()->post_ops_; auto is_eltwise = [&](int idx) { return po.entry_[idx].is_eltwise(); }; @@ -110,48 +114,24 @@ struct _gemm_convolution_fwd_t: public cpu_primitive_t { auto is_simple = [&](int idx) { return (is_eltwise(idx) || is_depthwise(idx)); }; switch (po.len_) { - using namespace mkldnn::impl::primitive_kind; - case 0: // no post_ops - break; - case 1: - ok = ok && // sum OR eltwise/depthwise - (is_simple(0) || is_sum(0)); - break; - case 2: - ok = ok && // sum->eltwise/depthwise OR eltwise/depthwise->eltwise/depthwise - ((is_sum(0) && is_simple(1)) || (is_simple(0) && is_simple(1))); - break; - case 3: - ok = ok && // sum->eltwise/depthwise->eltwise/depthwise - (is_sum(0) && is_simple(1) && is_simple(2)); - break; - - default: ok = false; + case 0: return true; + case 1: return is_simple(0) || is_sum(0); + case 2: return (is_sum(0) && is_simple(1)) || (is_simple(0) && is_simple(1)); + case 3: return is_sum(0) && is_simple(1) && is_simple(2); + default: return false; } - return ok; + return false; } }; - _gemm_convolution_fwd_t(const pd_t *pd, const input_vector &inputs, + gemm_convolution_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - , scratchpad_(nullptr) + : cpu_primitive_t(apd, inputs, outputs, true) { - using namespace prop_kind; - - const auto &post_ops = conf_.attr()->post_ops_; + const auto &post_ops = pd()->attr()->post_ops_; const data_t one = 1.0, zero = 0.0; beta_ = post_ops.find(primitive_kind::sum) >= 0 ? 
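        // a sum post-op means the GEMM must accumulate into dst, so beta = 1
        // (C += A*B); without it beta = 0 and dst is overwritten.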
one : zero; - jit_gemm_convolution_utils::init_conf(conf_.jcp_, - *(conf_.cdesc()), conf_.src_pd(), conf_.weights_pd(0), - conf_.dst_pd(), mkldnn_get_max_threads(), with_relu, - conf_.negative_slope()); - - size_t size = (size_t)conf_.jcp_.im2col_sz * sizeof(data_t); - jit_gemm_convolution_utils::prepare_scratchpad(this->conf_.jcp_, - &this->scratchpad_, size, this->conf_.jcp_.nthr); - for (int i = 0; i < post_ops.len_; i++) { auto &post_op = post_ops.entry_[i]; if (post_op.is_eltwise()) { @@ -168,10 +148,7 @@ struct _gemm_convolution_fwd_t: public cpu_primitive_t { } use_fast_relu = false; - if (conf_.jcp_.with_relu && post_ops.len_ == 0) { - use_fast_relu = true; - fast_relu_ns = conf_.jcp_.relu_negative_slope; - } else if (post_ops.len_ == 1 && post_ops.entry_[0].is_relu(true, false)) { + if (post_ops.len_ == 1 && post_ops.entry_[0].is_relu(true, false)) { use_fast_relu = true; fast_relu_ns = post_ops.entry_[0].eltwise.alpha; } else if (post_ops.len_ == 2 && post_ops.entry_[0].is_sum() && post_ops.entry_[1].is_relu(true, false)) { @@ -180,9 +157,7 @@ struct _gemm_convolution_fwd_t: public cpu_primitive_t { } } - ~_gemm_convolution_fwd_t() { - delete this->scratchpad_; - + ~gemm_convolution_fwd_t() { for (auto inj : eltwise_injectors) delete inj; eltwise_injectors.clear(); @@ -190,19 +165,19 @@ struct _gemm_convolution_fwd_t: public cpu_primitive_t { for (auto inj : depthwise_injectors) delete inj; depthwise_injectors.clear(); - }; + } typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - pd_t conf_; - scratchpad_t *scratchpad_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + data_t beta_; nstl::vector eltwise_injectors; @@ -212,39 +187,16 @@ private: float fast_relu_ns; }; -using gemm_convolution_fwd_t = - _gemm_convolution_fwd_t; -using gemm_convolution_relu_t = - _gemm_convolution_fwd_t; - struct gemm_convolution_bwd_data_t: public cpu_primitive_t { struct pd_t: public cpu_convolution_bwd_data_pd_t { pd_t(engine_t *engine, - const convolution_desc_t *adesc, - const primitive_attr_t *attr, + const convolution_desc_t *adesc, const primitive_attr_t *attr, const convolution_fwd_pd_t *hint_fwd_pd) : cpu_convolution_bwd_data_pd_t(engine, adesc, attr, hint_fwd_pd) - , jcp_() - {} + , jcp_() {} DECLARE_COMMON_PD_T(GEMM_IMPL_STR, gemm_convolution_bwd_data_t); - inline memory_format_t src_format() - { - using namespace memory_format; - return (utils::pick(this->desc()->diff_src_desc.ndims - 3, - ncw, nchw, ncdhw)); - } - inline memory_format_t wei_format() - { - using namespace memory_format; - return (this->with_groups() - ? 
utils::pick(this->desc()->diff_src_desc.ndims - 3, - goiw, goihw, goidhw) - : utils::pick(this->desc()->diff_src_desc.ndims - 3, - oiw, oihw, oidhw)); - } - virtual status_t init() override { using namespace prop_kind; using namespace memory_format; @@ -254,7 +206,8 @@ struct gemm_convolution_bwd_data_t: public cpu_primitive_t { bool ok = true && this->set_default_params() == status::success && this->desc()->prop_kind == backward_data - && this->desc()->alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() && utils::everyone_is(data_type::f32, this->desc()->diff_src_desc.data_type, @@ -263,12 +216,31 @@ struct gemm_convolution_bwd_data_t: public cpu_primitive_t { && this->diff_src_pd_.desc()->format == src_format() && this->diff_dst_pd_.desc()->format == src_format() && this->weights_pd_.desc()->format == wei_format(); - return ok ? status::success : status::unimplemented; + if (!ok) return status::unimplemented; + + auto scratchpad = scratchpad_registry().registrar(); + return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad, + *desc(), diff_src_pd(), weights_pd(0), diff_dst_pd(), + mkldnn_get_max_threads()); } jit_gemm_conv_conf_t jcp_; protected: + memory_format_t src_format() const { + using namespace memory_format; + const int ndims_sp = this->desc()->diff_src_desc.ndims - 2; + return (utils::pick(ndims_sp - 1, ncw, nchw, ncdhw)); + } + + memory_format_t wei_format() const { + using namespace memory_format; + const int ndims_sp = this->desc()->diff_src_desc.ndims - 2; + return (this->with_groups() + ? utils::pick(ndims_sp - 1, goiw, goihw, goidhw) + : utils::pick(ndims_sp - 1, oiw, oihw, oidhw)); + } + virtual status_t set_default_params() override { using namespace memory_format; if (this->diff_src_pd_.desc()->format == any) @@ -277,34 +249,21 @@ struct gemm_convolution_bwd_data_t: public cpu_primitive_t { CHECK(this->diff_dst_pd_.set_format(src_format())); if (this->weights_pd_.desc()->format == any) CHECK(this->weights_pd_.set_format(wei_format())); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } }; - gemm_convolution_bwd_data_t(const pd_t *pd, const input_vector &inputs, + gemm_convolution_bwd_data_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - , scratchpad_(nullptr) - { - using namespace prop_kind; - - jit_gemm_convolution_utils::init_conf(conf_.jcp_, - *(conf_.desc()), conf_.diff_src_pd(), conf_.weights_pd(0), - conf_.diff_dst_pd(), mkldnn_get_max_threads()); - - size_t size = (size_t)conf_.jcp_.im2col_sz * sizeof(data_t); - jit_gemm_convolution_utils::prepare_scratchpad(this->conf_.jcp_, - &this->scratchpad_, size, this->conf_.jcp_.nthr); - } - - ~gemm_convolution_bwd_data_t() { - delete this->scratchpad_; - }; + : cpu_primitive_t(apd, inputs, outputs, true) {} + ~gemm_convolution_bwd_data_t() {} typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { - switch (conf_.desc()->prop_kind) { + virtual void execute(event_t *e) const { + switch (pd()->desc()->prop_kind) { case prop_kind::backward_data: execute_backward_data(); break; @@ -315,9 +274,8 @@ struct gemm_convolution_bwd_data_t: public cpu_primitive_t { } private: - void execute_backward_data(); - pd_t conf_; - scratchpad_t *scratchpad_; + void execute_backward_data() const; + const pd_t 
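+    // accessor for the typed primitive descriptor owned by the primitive_t
+    // base class (the primitive keeps no pd_t copy of its own)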
*pd() const { return (const pd_t *)primitive_t::pd(); } }; struct gemm_convolution_bwd_weights_t: public cpu_primitive_t { @@ -327,27 +285,10 @@ struct gemm_convolution_bwd_weights_t: public cpu_primitive_t { const primitive_attr_t *attr, const convolution_fwd_pd_t *hint_fwd_pd) : cpu_convolution_bwd_weights_pd_t(engine, adesc, attr, hint_fwd_pd) - , jcp_() - {} + , jcp_() {} DECLARE_COMMON_PD_T(GEMM_IMPL_STR, gemm_convolution_bwd_weights_t); - inline memory_format_t src_format() - { - using namespace memory_format; - return (utils::pick(this->desc()->src_desc.ndims - 3, - ncw, nchw, ncdhw)); - } - inline memory_format_t wei_format() - { - using namespace memory_format; - return (this->with_groups() - ? utils::pick(this->desc()->src_desc.ndims - 3, - goiw, goihw, goidhw) - : utils::pick(this->desc()->src_desc.ndims - 3, - oiw, oihw, oidhw)); - } - virtual status_t init() override { using namespace prop_kind; using namespace memory_format; @@ -357,7 +298,8 @@ struct gemm_convolution_bwd_weights_t: public cpu_primitive_t { bool ok = true && this->set_default_params() == status::success && this->desc()->prop_kind == backward_weights - && this->desc()->alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() && utils::everyone_is(data_type::f32, this->desc()->src_desc.data_type, @@ -368,12 +310,31 @@ struct gemm_convolution_bwd_weights_t: public cpu_primitive_t { && this->src_pd_.desc()->format == src_format() && this->diff_dst_pd_.desc()->format == src_format() && this->diff_weights_pd_.desc()->format == wei_format(); - return ok ? status::success : status::unimplemented; + if (!ok) return status::unimplemented; + + auto scratchpad = scratchpad_registry().registrar(); + return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad, + *desc(), src_pd(), diff_weights_pd(0), diff_dst_pd(), + mkldnn_get_max_threads()); } jit_gemm_conv_conf_t jcp_; protected: + memory_format_t src_format() const { + using namespace memory_format; + const int ndims_sp = this->desc()->src_desc.ndims - 2; + return (utils::pick(ndims_sp - 1, ncw, nchw, ncdhw)); + } + + memory_format_t wei_format() const { + using namespace memory_format; + const int ndims_sp = this->desc()->src_desc.ndims - 2; + return (this->with_groups() + ? 
utils::pick(ndims_sp - 1, goiw, goihw, goidhw) + : utils::pick(ndims_sp - 1, oiw, oihw, oidhw)); + } + virtual status_t set_default_params() override { using namespace memory_format; if (this->src_pd_.desc()->format == any) @@ -384,38 +345,21 @@ struct gemm_convolution_bwd_weights_t: public cpu_primitive_t { CHECK(this->diff_weights_pd_.set_format(wei_format())); if (this->diff_bias_pd_.desc()->format == any) CHECK(this->diff_bias_pd_.set_format(x)); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } }; - gemm_convolution_bwd_weights_t(const pd_t *pd, const input_vector &inputs, + gemm_convolution_bwd_weights_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - , scratchpad_(nullptr) - { - using namespace prop_kind; - - jit_gemm_convolution_utils::init_conf(conf_.jcp_, - *(conf_.desc()), conf_.src_pd(), conf_.diff_weights_pd(0), - conf_.diff_dst_pd(), mkldnn_get_max_threads()); - const memory_desc_wrapper weights_d(conf_.diff_weights_pd(0)); - - size_t size = (size_t)conf_.jcp_.im2col_sz * sizeof(data_t); - if (conf_.jcp_.need_wei_reduction) - size += (size_t)conf_.jcp_.ngroups * weights_d.size(); - - jit_gemm_convolution_utils::prepare_scratchpad(this->conf_.jcp_, - &this->scratchpad_, size, conf_.jcp_.nthr); - } - - ~gemm_convolution_bwd_weights_t() { - delete this->scratchpad_; - }; + : cpu_primitive_t(apd, inputs, outputs, true) {} + ~gemm_convolution_bwd_weights_t() {} typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { - switch (conf_.desc()->prop_kind) { + virtual void execute(event_t *e) const { + switch (pd()->desc()->prop_kind) { case prop_kind::backward_weights: execute_backward_weights(); break; @@ -426,9 +370,8 @@ struct gemm_convolution_bwd_weights_t: public cpu_primitive_t { } private: - void execute_backward_weights(); - pd_t conf_; - scratchpad_t *scratchpad_; + void execute_backward_weights() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution_utils.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution_utils.cpp index 80dfe9f..2b7cea2 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution_utils.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution_utils.cpp @@ -23,6 +23,7 @@ #include "cpu_isa_traits.hpp" #include "gemm_convolution_utils.hpp" +#include "jit_generator.hpp" namespace mkldnn { namespace impl { @@ -36,17 +37,19 @@ using namespace data_type; namespace jit_gemm_convolution_utils { -void im2col_3d(jit_gemm_conv_conf_t &jcp, const float *im, float *col, int od) { +void im2col_3d(const jit_gemm_conv_conf_t &jcp, const float *im, float *col, + int od) +{ const size_t OHW = jcp.oh * jcp.ow; const size_t im_step = jcp.ih * jcp.iw * jcp.id; const size_t col_step = jcp.ks * OHW; parallel_nd(jcp.ic, [&](int ic) { - const float *im_loc = im + ic * im_step; - float *col_loc = col + ic * col_step; + const float *__restrict im_loc = im + ic * im_step; + float *__restrict col_loc = col + ic * col_step; int id = od * jcp.stride_d - jcp.f_pad; for (int kd = 0; kd < jcp.kd; ++kd) { - float *col_ = col_loc + kd * jcp.kh * jcp.kw * OHW; + float *__restrict col_ = col_loc + kd * jcp.kh * jcp.kw * OHW; if (id < 0 || id >= jcp.id) { int ih_ = -jcp.t_pad; for (int kh = 0; kh < jcp.kh; ++kh) { @@ -79,7 +82,7 @@ void 
im2col_3d(jit_gemm_conv_conf_t &jcp, const float *im, float *col, int od) { col_ += jcp.kw * OHW; } } else { - const float *im_ = im_loc + id * jcp.ih * jcp.iw; + const float *__restrict im_ = im_loc + id * jcp.ih * jcp.iw; int ih_ = -jcp.t_pad; for (int kh = 0; kh < jcp.kh; ++kh) { int ih = ih_; @@ -117,88 +120,226 @@ void im2col_3d(jit_gemm_conv_conf_t &jcp, const float *im, float *col, int od) { }); } -void im2col(jit_gemm_conv_conf_t &jcp, const float *im, float *col) { - if (jcp.ic == 1) { - parallel_nd(jcp.kh, jcp.oh, [&](int kh, int oh) { - const int ih = oh * jcp.stride_h - jcp.t_pad + kh * (1 + jcp.dilate_h); - if (ih < 0 || ih >= jcp.ih) return; - - for (int kw = 0; kw < jcp.kw; ++kw) { - for (int ow = 0; ow < jcp.ow; ++ow) { - const int iw = ow * jcp.stride_w - jcp.l_pad + kw * (1 + jcp.dilate_w); - if (iw < 0 || iw >= jcp.iw) continue; - - const size_t col_idx = ((kh*jcp.kw + kw)*jcp.oh+oh)*jcp.ow+ow; - const size_t im_idx = ih*jcp.iw + iw; - col[col_idx] = im[im_idx]; - }} +/* col[ic][kh][kw][oh][ow] <-- im2col(im[ic][ih][iw]) */ +void im2col(const jit_gemm_conv_conf_t &jcp, const float *__restrict im, + float *__restrict col, int hs, int hb, int ws, int wb) { + const size_t im_step = jcp.is; + const size_t col_step = jcp.ks * hb * wb; + if (jcp.stride_w == 1) { + // Generated code is more optimized for stride_w == 1 + // because innermost loop is by width + auto ker = [&](int ic, int kh, int kw, int oh) { + const float *__restrict im_ = im + ic * im_step; + float *__restrict col_ + = col + ic * col_step + ((kh * jcp.kw + kw) * hb + oh) * wb; + + const int ih = (oh + hs) * jcp.stride_h - jcp.t_pad + + kh * (1 + jcp.dilate_h); + if (ih < 0 || ih >= jcp.ih) { + for (int ow = 0; ow < wb; ++ow) + col_[ow] = 0.f; + } else { + for (int ow = 0; ow < wb; ++ow) { + const int iw = ow + ws - jcp.l_pad + kw * (1 + jcp.dilate_w); + if (iw < 0 || iw >= jcp.iw) + col_[ow] = 0.f; + else { + const size_t im_idx = ih * jcp.iw + iw; + col_[ow] = im_[im_idx]; + } + } + } + }; + + if (jcp.outer_threading) { + for (int ic = 0; ic < jcp.ic; ic++) + for (int kh = 0; kh < jcp.kh; kh++) + for (int kw = 0; kw < jcp.kw; kw++) + for (int oh = 0; oh < hb; oh++) + ker(ic, kh, kw, oh); + } + else { + parallel_nd(jcp.ic, jcp.kh, jcp.kw, hb, ker); + } + } else if (jcp.ic == 1) { + parallel_nd(jcp.kh, hb, [&](int kh, int oh) { + const int ih = (oh + hs) * jcp.stride_h - jcp.t_pad + + kh * (1 + jcp.dilate_h); + if (ih < 0 || ih >= jcp.ih) + for (int kw = 0; kw < jcp.kw; ++kw) { + for (int ow = 0; ow < wb; ++ow) { + const size_t col_idx + = ((kh * jcp.kw + kw) * hb + oh) * wb + ow; + col[col_idx] = 0; + } + } + else + for (int kw = 0; kw < jcp.kw; ++kw) { + for (int ow = 0; ow < wb; ++ow) { + const int iw = (ow + ws) * jcp.stride_w - jcp.l_pad + + kw * (1 + jcp.dilate_w); + const size_t col_idx + = ((kh * jcp.kw + kw) * hb + oh) * wb + ow; + const size_t im_idx = ih * jcp.iw + iw; + if (iw < 0 || iw >= jcp.iw) + col[col_idx] = 0; + else + col[col_idx] = im[im_idx]; + } + } }); } else { - const size_t im_step = jcp.ih * jcp.iw; - const size_t col_step = jcp.ks * jcp.os; - - parallel_nd(jcp.ic, [&](int ic) { - const float *im_ = im + ic * im_step; - float *col_ = col + ic * col_step; - - for (int kh = 0; kh < jcp.kh; ++kh) { - for (int oh = 0; oh < jcp.oh; ++oh) { - const int ih = oh * jcp.stride_h - - jcp.t_pad + kh * (1 + jcp.dilate_h); - if (ih < 0 || ih >= jcp.ih) continue; - - for (int kw = 0; kw < jcp.kw; ++kw) { - for (int ow = 0; ow < jcp.ow; ++ow) { - const int iw = ow * jcp.stride_w - - jcp.l_pad + kw * 
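// input column index: iw = ow * stride_w - l_pad + kw * (1 + dilate_w);
// positions falling outside [0, jcp.iw) are skipped, i.e. implicitly
// zero-padded.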
(1 + jcp.dilate_w); - if (iw < 0 || iw >= jcp.iw) continue; - const size_t col_idx = ((kh * jcp.kw + kw) * jcp.oh+oh) - * jcp.ow + ow; - const size_t im_idx = ih*jcp.iw + iw; - col_[col_idx] = im_[im_idx]; - }} - }} + parallel_nd(jcp.ic, jcp.kh, jcp.kw, hb, + [&](int ic, int kh, int kw, int oh) { + const float *__restrict im_ = im + ic * im_step; + float *__restrict col_ = col + ic * col_step + + ((kh * jcp.kw + kw) * hb + oh) * wb; + + const int ih = (oh + hs) * jcp.stride_h - jcp.t_pad + + kh * (1 + jcp.dilate_h); + if (ih < 0 || ih >= jcp.ih) { + for (int ow = 0; ow < wb; ++ow) + col_[ow] = 0.f; + } else { + for (int ow = 0; ow < wb; ++ow) { + const int iw = (ow + ws) * jcp.stride_w - jcp.l_pad + + kw * (1 + jcp.dilate_w); + const size_t im_idx = ih * jcp.iw + iw; + if (iw < 0 || iw >= jcp.iw) + col_[ow] = 0.f; + else + col_[ow] = im_[im_idx]; + } + } }); } } /* col[oh][ow][kh][kw][ic] <-- im2col_u8(im[ih][iw][ic]) */ template -void im2col_u8(jit_gemm_conv_conf_t &jcp, const T *im, uint8_t *col) { - parallel_nd(jcp.oh, jcp.ow, [&](int oh, int ow) { - for (int kh = 0; kh < jcp.kh; ++kh) { - const int ih = oh * jcp.stride_h - - jcp.t_pad + kh * (1 + jcp.dilate_h); - if (ih < 0 || ih >= jcp.ih) continue; +void im2col_u8(const jit_gemm_conv_conf_t &jcp, const T *__restrict im, + uint8_t *__restrict col) { + uint8_t shift = jcp.signed_input ? 128 : 0; + const int dh = 1 + jcp.dilate_h; + const int dw = 1 + jcp.dilate_w; + const int sh = jcp.stride_h; + const int sw = jcp.stride_w; + if (sh == 1 && sw == 1 && jcp.oh > 2 * mkldnn_get_max_threads()) { + const int ihp = jcp.ih + jcp.t_pad; + const int iwp = jcp.iw + jcp.l_pad; + const int col_kw_step = jcp.ic; + const int col_kh_step = jcp.kw * col_kw_step; + const int col_ow_step = jcp.kh * col_kh_step; + const int col_oh_step = jcp.ow * col_ow_step; + const int im_iw_step = jcp.ngroups * jcp.ic; + const int im_ih_step = jcp.iw * im_iw_step; + + const int nb_ic = jcp.ic / 4; + const int ic_blocked = nb_ic * 4; + + parallel_nd(jcp.oh, [&](int oh) { + const int kh_start = nstl::max(div_up(jcp.t_pad - oh, dh), 0); + const int kh_end = nstl::min(div_up(ihp - oh, dh), jcp.kh); + const int ih_start = oh - jcp.t_pad + kh_start * dh; + const int col_oh_idx = oh * col_oh_step; + + for (int kh = kh_start, ih = ih_start; kh < kh_end; ++kh, ih += dh) + { + const int col_kh_idx = col_oh_idx + kh * col_kh_step; + const int im_kh_idx = ih * im_ih_step; for (int kw = 0; kw < jcp.kw; ++kw) { - const int iw = ow * jcp.stride_w - - jcp.l_pad + kw * (1 + jcp.dilate_w); - if (iw < 0 || iw >= jcp.iw) continue; - - const size_t col_idx = (((oh * jcp.ow + ow) * jcp.kh + kh) - * jcp.kw + kw) * jcp.ic; - const size_t im_idx - = (ih * jcp.iw + iw) * jcp.ngroups * jcp.ic; - PRAGMA_OMP_SIMD() - for (int ic = 0; ic < jcp.ic; ++ic) { - col[col_idx + ic] = jcp.signed_input - ? 
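// signed s8 input is shifted by +128 into the u8 range so the u8 x s8
// integer GEMM kernels can be used; the constant offset is compensated
// downstream.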
im[im_idx + ic] + 128 - : im[im_idx + ic]; + const int ow_start = nstl::max(jcp.l_pad - kw * dw, 0); + const int ow_end = nstl::min(iwp - kw * dw, jcp.ow); + const int iw_start = ow_start - jcp.l_pad + kw * dw; + const int col_kw_idx = col_kh_idx + kw * col_kw_step; + + const int col_idx_start + = col_kw_idx + ow_start * col_ow_step; + const int im_idx_start = im_kh_idx + iw_start * im_iw_step; + const int col_idx_end = col_kw_idx + ow_end * col_ow_step; + + // loop by iw and ow + if (nb_ic > 0) { + for (int col_idx = col_idx_start, im_idx = im_idx_start; + col_idx < col_idx_end; + col_idx += col_ow_step, im_idx += im_iw_step) { + for (int icb = 0; icb < 4 * nb_ic; icb += 4) { + PRAGMA_OMP_SIMD() + for (int ic = 0; ic < 4; ++ic) { + col[col_idx + icb + ic] + = im[im_idx + icb + ic] + shift; + } + } + } + } + if (ic_blocked != jcp.ic) { + for (int col_idx = col_idx_start, im_idx = im_idx_start; + col_idx < col_idx_end; + col_idx += col_ow_step, im_idx += im_iw_step) { + PRAGMA_OMP_SIMD() + for (int ic = ic_blocked; ic < jcp.ic; ++ic) { + col[col_idx + ic] = im[im_idx + ic] + shift; + } + } } } } - } - ); + }); + } + else { + const size_t col_kh_step = jcp.kw * jcp.ic; + const size_t col_ow_step = jcp.kh * col_kh_step; + const size_t col_oh_step = jcp.ow * col_ow_step; + const size_t im_ih_step = jcp.iw * jcp.ngroups * jcp.ic; + const size_t im_iw_step = jcp.ngroups * jcp.ic; + const int ih_pad = jcp.ih + jcp.t_pad; + const int iw_pad = jcp.iw + jcp.l_pad; + parallel_nd(jcp.oh, jcp.ow, [&](int oh, int ow) { + const int ihs = oh * sh; + const int ihsp = jcp.t_pad - ihs; + const int kh_start = nstl::max(div_up(ihsp, dh), 0); + const int kh_end = nstl::min(div_up(ih_pad - ihs, dh), jcp.kh); + const int ih_start = kh_start * dh - ihsp; + const int iws = ow * sw; + const int iwsp = jcp.l_pad - iws; + const int kw_start = nstl::max(div_up(iwsp, dw), 0); + const int kw_end = nstl::min(div_up(iw_pad - iws, dw), jcp.kw); + const int iw_start = kw_start * dw - iwsp; + + uint8_t *__restrict col_base + = col + oh * col_oh_step + ow * col_ow_step; + for (int kh = kh_start, ih = ih_start; kh < kh_end; + ++kh, ih += dh) { + uint8_t *__restrict col_ = col_base + kh * col_kh_step; + const T *__restrict im_ = im + ih * im_ih_step; + + for (int kw = kw_start, iw = iw_start; kw < kw_end; + ++kw, iw += dw) { + + const size_t col_idx = kw * jcp.ic; + const size_t im_idx = iw * im_iw_step; + PRAGMA_OMP_SIMD() + for (int ic = 0; ic < jcp.ic; ++ic) { + col_[col_idx + ic] = im_[im_idx + ic] + shift; + } + } + } + }); + } + } -template void im2col_u8( - jit_gemm_conv_conf_t &jcp, const int8_t *im, uint8_t *col); -template void im2col_u8( - jit_gemm_conv_conf_t &jcp, const uint8_t *im, uint8_t *col); + +template void im2col_u8(const jit_gemm_conv_conf_t &jcp, + const int8_t *__restrict im, uint8_t *__restrict col); +template void im2col_u8(const jit_gemm_conv_conf_t &jcp, + const uint8_t *__restrict im, uint8_t *__restrict col); /* im[ih][iw][ic] <-- col2im_s32(col[oh][ow][kh][kw][ic]) */ -void col2im_s32(jit_gemm_conv_conf_t &jcp, const int32_t *col, int32_t *im) { +void col2im_s32(const jit_gemm_conv_conf_t &jcp, const int32_t *__restrict col, + int32_t *__restrict im) +{ parallel(0, [&](const int ithr, const int nthr) { int h_nthr = nstl::min(jcp.ih, nthr); int w_nthr = nstl::min(jcp.iw, nthr / h_nthr); @@ -250,10 +391,12 @@ void col2im_s32(jit_gemm_conv_conf_t &jcp, const int32_t *col, int32_t *im) { }); } -void col2im_3d(jit_gemm_conv_conf_t &jcp, const float *col, float *im, int od) { +void col2im_3d(const 
jit_gemm_conv_conf_t &jcp, const float *col, float *im, + int od) +{ parallel_nd(jcp.ic, [&](int ic) { - const float *col_ = col + (size_t)ic * jcp.ks * jcp.os; - float *im_ic = im + (size_t)ic * jcp.ih * jcp.iw * jcp.id; + const float *__restrict col_ = col + (size_t)ic * jcp.ks * jcp.os; + float *__restrict im_ic = im + (size_t)ic * jcp.ih * jcp.iw * jcp.id; int id = od * jcp.stride_d - jcp.f_pad; for (int kd = 0; kd < jcp.kd; ++kd) { @@ -263,7 +406,7 @@ void col2im_3d(jit_gemm_conv_conf_t &jcp, const float *col, float *im, int od) { continue; } - float *im_ = im_ic + id * jcp.ih * jcp.iw; + float *__restrict im_ = im_ic + id * jcp.ih * jcp.iw; for (int oh = 0; oh < jcp.oh; ++oh) { for (int kh = 0; kh < jcp.kh; ++kh) { @@ -289,16 +432,14 @@ void col2im_3d(jit_gemm_conv_conf_t &jcp, const float *col, float *im, int od) { }); } -void col2im( - jit_gemm_conv_conf_t &jcp, const float *col, float *im) { - +void col2im(const jit_gemm_conv_conf_t &jcp, const float *col, float *im) { const size_t col_step = jcp.ks * jcp.os; const size_t im_step = jcp.ih * jcp.iw; const int iS = jcp.ih * jcp.iw; parallel_nd(jcp.ic, [&](int ic) { - float *im_ = im + ic * im_step; - const float *col_ = col + ic * col_step; + float *__restrict im_ = im + ic * im_step; + const float *__restrict col_ = col + ic * col_step; PRAGMA_OMP_SIMD() for (int is = 0; is < iS; ++is) im_[is] = 0.; @@ -322,18 +463,17 @@ void col2im( }); } -void init_conf( - jit_gemm_conv_conf_t &jcp, const convolution_desc_t &cd, - const memory_desc_wrapper &src_d, const memory_desc_wrapper &weights_d, - const memory_desc_wrapper &dst_d, int max_threads, - bool with_relu, float relu_negative_slope) { - +status_t init_conf(jit_gemm_conv_conf_t &jcp, + memory_tracking::registrar_t &scratchpad, const convolution_desc_t &cd, + const memory_desc_wrapper &src_d, const memory_desc_wrapper &weights_d, + const memory_desc_wrapper &dst_d, int max_threads) { const bool with_groups = weights_d.ndims() == src_d.ndims() + 1; - jcp.prop_kind = cd.prop_kind; const int ndims = src_d.ndims(); const int is_1d = ndims == 3; const int is_3d = ndims == 5; + jcp.prop_kind = cd.prop_kind; + jcp.ngroups = with_groups ? weights_d.dims()[0] : 1; jcp.mb = src_d.dims()[0]; @@ -363,59 +503,198 @@ void init_conf( jcp.dilate_w = cd.dilates[ndims - 3]; jcp.src_fmt = src_d.format(); - jcp.with_bias - = cd.bias_desc.format != memory_format::undef + jcp.with_bias = cd.bias_desc.format != memory_format::undef || cd.diff_bias_desc.format != memory_format::undef; - jcp.with_relu = with_relu; - jcp.relu_negative_slope = relu_negative_slope; jcp.is = jcp.ih * jcp.iw; jcp.os = jcp.oh * jcp.ow; jcp.ks = jcp.kh * jcp.kw * jcp.kd; - jcp.signed_input = (src_d.data_type() == data_type::s8); - jcp.wei_adj_scale = (!jcp.signed_input || mayiuse(avx512_core_vnni)) - ? 1.0f - : (1.0f / 2.0f); + jcp.signed_input = src_d.data_type() == data_type::s8; + jcp.wei_adj_scale = + !jcp.signed_input || mayiuse(avx512_core_vnni) ? 1.f : 0.5f; + jcp.im2col_sz = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih, jcp.od == jcp.id, jcp.stride_w == 1, jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1, !jcp.signed_input) - ? (ptrdiff_t)jcp.ic * jcp.ks * jcp.os - : 0; - - bool do_outer_threading = false; - bool is_int8_conv - = (utils::one_of(cd.src_desc.data_type == u8, cd.src_desc.data_type == s8) - && cd.weights_desc.data_type == s8); + ? 
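// im2col buffer element count per thread: ic * ks * (output block size);
// zero exactly when the convolution is 1x1 with unit strides and unsigned
// input, in which case the GEMM reads src in place.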
(ptrdiff_t)jcp.ic * jcp.ks * jcp.os : 0; + + jcp.outer_threading = false; + jcp.oh_block = jcp.oh; + jcp.ow_block = jcp.ow; + + bool is_int8_conv = utils::one_of(src_d.data_type(), s32, s8, u8) + && weights_d.data_type() == s8; + + const int vlen = mayiuse(avx512_common) + ? cpu_isa_traits::vlen + : mayiuse(avx) + ? cpu_isa_traits::vlen + : mayiuse(sse42) ? cpu_isa_traits::vlen : 4; + const int simd_w = vlen / (is_int8_conv ? 1 : 4); + + const bool is_bwd_d = jcp.prop_kind == backward_data; + const bool is_bwd_w = jcp.prop_kind == backward_weights; + const bool is_fwd = !is_bwd_d && !is_bwd_w; + + using namespace memory_tracking::names; + // For threading selection we do: + // 1. Rough estimation of efficiency for inner and outer threading. + // 2. Gemm size estimation in assumption that it does not work + // so effectively for small sizes. + // 64K - this is heuristic gemm size per thread threshold. + const int gemm_threshold = 64 * 1024; if (is_int8_conv) { - bool is_depthwise = - utils::everyone_is(1, jcp.ic, jcp.oc) && jcp.ngroups != 1; - do_outer_threading - = (is_depthwise || (jcp.os / max_threads < 64 && jcp.mb != 1)); + bool is_depthwise = jcp.ic == 1 && jcp.oc == 1 && jcp.ngroups != 1; + + const int bs = is_fwd ? jcp.os : jcp.is; + const int ls = is_fwd ? jcp.oc : jcp.ic; + const size_t outer_work_amount = jcp.ngroups * jcp.mb; + const float outer_thr_eff = (float)outer_work_amount + / rnd_up(outer_work_amount, max_threads); + const size_t inner_work_amount + = div_up(bs, simd_w) * div_up(ls, simd_w); + const float inner_thr_eff = (float)inner_work_amount + / rnd_up(inner_work_amount, max_threads); + jcp.outer_threading = (is_depthwise + || (bs / max_threads < 64 && jcp.mb != 1)) + && (outer_thr_eff / inner_thr_eff >= 1.f + || (bs * jcp.ic * jcp.oc) / max_threads < gemm_threshold); + jcp.nthr = jcp.outer_threading ? max_threads : 1; + + if (is_fwd) { + scratchpad.book(key_conv_gemm_col, + sizeof(int8_t) * jcp.nthr * jcp.im2col_sz); + scratchpad.book(key_conv_int_dat_in_acc_dt, + sizeof(int32_t) * jcp.nthr * jcp.os * jcp.oc); + } else if (is_bwd_d) { + scratchpad.book(key_conv_gemm_col, + sizeof(int32_t) * jcp.nthr * jcp.im2col_sz); + scratchpad.book(key_conv_int_dat_in_acc_dt, + sizeof(int32_t) * jcp.nthr * jcp.is * jcp.ic); + } else if (is_bwd_w) { + assert(!"unimplemented prop_kind"); + return status::unimplemented; + } } else { - if (utils::one_of(jcp.prop_kind, forward_training, forward_inference)) - do_outer_threading = jcp.os / max_threads < 512 - && IMPLICATION(jcp.od == 1, (jcp.mb != 1 || jcp.ngroups > 2)); - else if (jcp.prop_kind == backward_data) - do_outer_threading = (jcp.mb != 1 || jcp.ngroups > 2); - else //(jcp.prop_kind == backward_weights) - do_outer_threading = jcp.os / max_threads < 256 - && (jcp.mb != 1 || jcp.ngroups > 2); - } - jcp.nthr = do_outer_threading ? max_threads : 1; - jcp.need_wei_reduction = mkldnn_thr_syncable() - ? (jcp.mb != 1 && jcp.nthr != 1) : false; -} + if (is_fwd) { + const int L2 = get_cache_size(2, true) / sizeof(float); + const int wei_size = jcp.oc * jcp.ic * jcp.kh * jcp.kw; + + // It makes sense to try blocking for some special cases: + // when weights size is small and we have to do im2col + if (wei_size < L2/2 && jcp.im2col_sz && jcp.id == 1 && jcp.od == 1) { + // looking for oh and ow blocking + int h_block{ jcp.oh }, w_block{ jcp.ow }; + // 1. cache requirement + // !!! 
used memory (assuming strides = 1 and dilate = 0 etc): + const int row_size = jcp.ic * jcp.kh * jcp.kw * jcp.ow + + 2 * jcp.ic * jcp.iw + 2 * jcp.oc * jcp.ow; + h_block = nstl::max( + 1, nstl::min(jcp.oh, div_up(L2 - wei_size, row_size))); + if (h_block == 1) { + const int col_size = jcp.ic * jcp.kh * jcp.kw + 2 * jcp.ic + + 2 * jcp.oc; + w_block = nstl::max( + 1, nstl::min(jcp.ow, div_up(L2 - wei_size, col_size))); + } -status_t prepare_scratchpad(jit_gemm_conv_conf_t &jcp, - scratchpad_t **scratchpad_, size_t size, const int nthr) { - if (size > 0) { - *scratchpad_ = create_scratchpad(nthr * size); - if (*scratchpad_ == nullptr) return status::out_of_memory; - } else { - *scratchpad_ = nullptr; + // 2. threading requirement + if (h_block != jcp.oh) + h_block = nstl::max(1, rnd_dn(h_block, 4)); + if (w_block != jcp.ow) + w_block = nstl::max(1, rnd_dn(w_block, simd_w)); + + float thr_eff = 0.f; + float thr_eff_treshold = 0.9f; + if (w_block == jcp.ow) { + do { + int nb_oh = div_up(jcp.oh, h_block); + size_t work = jcp.ngroups * jcp.mb * jcp.od * nb_oh; + float disb = (float)jcp.oh / rnd_up(jcp.oh, h_block); + thr_eff = (float)work + / rnd_up(work, max_threads); + thr_eff = (thr_eff + disb) / 2.f; + if (thr_eff >= thr_eff_treshold) + break; + h_block = rnd_dn(h_block - 4, 4); + } while (h_block > 0); + } + if (thr_eff < thr_eff_treshold) // we didn't find suitable h_block + { + h_block = 1; + int nb_oh = jcp.oh; + do { + int nb_ow = div_up(jcp.ow, w_block); + size_t work_amount + = jcp.ngroups * jcp.mb * jcp.od * nb_oh * nb_ow; + float disb = (float)jcp.ow / rnd_up(jcp.ow, w_block); + thr_eff = (float)work_amount + / rnd_up(work_amount, max_threads); + thr_eff = (thr_eff + disb) / 2.f; + if (thr_eff > thr_eff_treshold) + break; + w_block = rnd_dn(w_block - simd_w, simd_w); + } while (w_block > 0); + } + const size_t inner_work_amount + = div_up(jcp.os, simd_w) * div_up(jcp.oc, simd_w); + const float inner_thr_eff = (float)inner_work_amount + / rnd_up(inner_work_amount, max_threads); + if (thr_eff >= inner_thr_eff / 2 && h_block > 0 && w_block > 0) { + jcp.oh_block = h_block; + jcp.ow_block = w_block; + jcp.outer_threading = true; + } + // updating jcp.im2col_sz + if (jcp.oh_block != 1) + jcp.ow_block = jcp.ow; + jcp.im2col_sz + = (ptrdiff_t)jcp.ic * jcp.ks * jcp.oh_block * jcp.ow_block; + } else { + const size_t outer_work_amount = jcp.ngroups * jcp.mb * jcp.od; + const float outer_thr_eff = (float)outer_work_amount + / rnd_up(outer_work_amount, max_threads); + const size_t inner_work_amount + = div_up(jcp.os, simd_w) * div_up(jcp.oc, simd_w); + const float inner_thr_eff = (float)inner_work_amount + / rnd_up(inner_work_amount, max_threads); + jcp.outer_threading = jcp.os / max_threads < 512 + && IMPLICATION(jcp.od == 1, jcp.mb != 1 || jcp.ngroups > 2) + && (outer_thr_eff / inner_thr_eff >= 1.f + || (jcp.os * jcp.ic * jcp.oc) / max_threads < gemm_threshold); + } + } else if (is_bwd_d) { + const size_t outer_work_amount = jcp.ngroups * jcp.mb; + const float outer_thr_eff = (float)outer_work_amount + / rnd_up(outer_work_amount, max_threads); + const size_t inner_work_amount + = div_up(jcp.is, simd_w) * div_up(jcp.ic, simd_w); + const float inner_thr_eff = (float)inner_work_amount + / rnd_up(inner_work_amount, max_threads); + jcp.outer_threading = (jcp.os / max_threads < 512 || jcp.ks < 64) + && (jcp.mb != 1 || jcp.ngroups > 2) + && (outer_thr_eff / inner_thr_eff >= 1.f + || (jcp.os * jcp.ic * jcp.oc) / max_threads < gemm_threshold); + } else if (is_bwd_w) + jcp.outer_threading = jcp.os / 
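+        // Outer vs. inner threading: outer gives each thread its own small
+        // GEMM over independent (group, image, ...) items, inner runs one
+        // multithreaded GEMM. Balance is scored as
+        //     thr_eff = work / rnd_up(work, nthr),
+        // e.g. work = 6, nthr = 4 -> 6 / 8 = 0.75 (two threads sit idle on
+        // the second pass).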
max_threads < 256 + && (jcp.mb != 1 || jcp.ngroups > 2); + + jcp.nthr = jcp.outer_threading ? max_threads : 1; + + scratchpad.book(key_conv_gemm_col, + sizeof(float) * jcp.nthr * jcp.im2col_sz); + + if (is_bwd_w) { + jcp.need_wei_reduction = mkldnn_thr_syncable() + ? jcp.mb != 1 && jcp.nthr != 1 : false; + + scratchpad.book(key_conv_wei_reduction, + sizeof(float) * jcp.nthr * jcp.ngroups * weights_d.size()); + } } + return status::success; } @@ -431,8 +710,9 @@ void bwd_weights_balance(int ithr, int nthr, int ngroups, int mb, int &ithr_g, } } -void bwd_weights_reduction_par(int ithr, int nthr, const jit_gemm_conv_conf_t &jcp, - const float *weights_reduce_ws, float *weights) { +void bwd_weights_reduction_par(int ithr, int nthr, + const jit_gemm_conv_conf_t &jcp, const float *weights_reduce_ws, + float *weights) { const size_t weights_g_size = jcp.ic * jcp.oc * jcp.ks; size_t weights_start{0}, weights_end{0}; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution_utils.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution_utils.hpp index c2ebc45..1bcfcc3 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution_utils.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution_utils.hpp @@ -18,11 +18,12 @@ #define CPU_JIT_GEMM_CONVOLUTION_UTILS_HPP #include "c_types_map.hpp" +#include "memory_tracking.hpp" +#include "mkldnn_thread.hpp" + #include "cpu_convolution_pd.hpp" #include "cpu_engine.hpp" #include "jit_primitive_conf.hpp" -#include "mkldnn_thread.hpp" -#include "scratchpad.hpp" namespace mkldnn { namespace impl { @@ -30,32 +31,32 @@ namespace cpu { namespace jit_gemm_convolution_utils { - void im2col_3d(jit_gemm_conv_conf_t &jcp, const float *im, float *col, +void im2col_3d(const jit_gemm_conv_conf_t &jcp, const float *im, float *col, int od); - void im2col(jit_gemm_conv_conf_t &jcp, const float *im, float *col); - template - void im2col_u8(jit_gemm_conv_conf_t &jcp, const T *im, uint8_t *col); +void im2col(const jit_gemm_conv_conf_t &jcp, const float *__restrict im, + float *__restrict col, int hs, int hb, int ws, int wb); +template +void im2col_u8(const jit_gemm_conv_conf_t &jcp, const T *__restrict im, + uint8_t *__restrict col); - void col2im_s32(jit_gemm_conv_conf_t &jcp, const int32_t *col, int32_t *im); - void col2im_3d(jit_gemm_conv_conf_t &jcp, const float *col, float *im, +void col2im_s32(const jit_gemm_conv_conf_t &jcp, const int32_t *__restrict col, + int32_t *__restrict im); +void col2im_3d(const jit_gemm_conv_conf_t &jcp, const float *col, float *im, int od); - void col2im(jit_gemm_conv_conf_t &jcp, const float *col, float *im); +void col2im(const jit_gemm_conv_conf_t &jcp, const float *col, float *im); - void init_conf(jit_gemm_conv_conf_t &jcp, - const convolution_desc_t &cd, const memory_desc_wrapper &src_d, - const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d, - int max_threads, bool with_relu = false, float relu_negative_slope = -1.0); +status_t init_conf(jit_gemm_conv_conf_t &jcp, + memory_tracking::registrar_t &scratchpad, const convolution_desc_t &cd, + const memory_desc_wrapper &src_d, const memory_desc_wrapper &weights_d, + const memory_desc_wrapper &dst_d, int max_threads); - status_t prepare_scratchpad(jit_gemm_conv_conf_t &jcp, - scratchpad_t **col_scratchpad_, size_t size, const int nthr); - - void bwd_weights_balance(int ithr, int nthr, - int ngroups, int mb, int &ithr_g, int &nthr_g, int &ithr_mb, - int &nthr_mb); - void bwd_weights_reduction_par(int ithr, int nthr, +void 
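+// splits nthr threads into an nthr_g x nthr_mb grid over groups and
+// minibatch for backward weights; per-thread partial diff_weights are
+// then combined by bwd_weights_reduction_par below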
bwd_weights_balance(int ithr, int nthr, int ngroups, int mb, + int &ithr_g, int &nthr_g, int &ithr_mb, int &nthr_mb); +void bwd_weights_reduction_par(int ithr, int nthr, const jit_gemm_conv_conf_t &jcp, const float *weights_reduce_ws, - float *weights); -}; + float *weights); + +} } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_inner_product.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_inner_product.cpp index d9a8fe5..7f62c6b 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_inner_product.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_inner_product.cpp @@ -31,20 +31,20 @@ using namespace mkldnn::impl::memory_format; using namespace mkldnn::impl::primitive_kind; template -void gemm_inner_product_fwd_t::execute_forward() { +void gemm_inner_product_fwd_t::execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const int MB = conf_.MB(); - const int OC = conf_.OC(); - const int IC = conf_.IC_total_padded(); + const int MB = pd()->MB(); + const int OC = pd()->OC(); + const int IC = pd()->IC_total_padded(); - bool wei_tr = !utils::one_of(conf_.weights_pd()->desc()->format, + bool wei_tr = !utils::one_of(pd()->weights_pd()->desc()->format, hwio, dhwio, io); - const auto &post_ops = conf_.attr()->post_ops_; + const auto &post_ops = pd()->attr()->post_ops_; const bool do_relu = post_ops.len_ == 1; float alpha = 1.0, beta = 0.0; @@ -62,16 +62,16 @@ void gemm_inner_product_fwd_t::execute_forward() { } template -void gemm_inner_product_bwd_data_t::execute_backward_data() { +void gemm_inner_product_bwd_data_t::execute_backward_data() const { auto diff_dst = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto diff_src = reinterpret_cast(this->memory()); - const int MB = conf_.MB(); - const int OC = conf_.OC(); - const int IC = conf_.IC_total_padded(); + const int MB = pd()->MB(); + const int OC = pd()->OC(); + const int IC = pd()->IC_total_padded(); - bool wei_tr = utils::one_of(conf_.weights_pd()->desc()->format, + bool wei_tr = utils::one_of(pd()->weights_pd()->desc()->format, hwio, dhwio, io); float alpha = 1.0, beta = 0.0; @@ -80,22 +80,22 @@ void gemm_inner_product_bwd_data_t::execute_backward_data() { } template -void gemm_inner_product_bwd_weights_t::execute_backward_weights() { +void gemm_inner_product_bwd_weights_t::execute_backward_weights() const { auto src = reinterpret_cast(this->input_memory(0)); auto diff_dst = reinterpret_cast(this->input_memory(1)); auto diff_weights = reinterpret_cast(this->memory(0)); auto diff_bias = reinterpret_cast(this->memory(1)); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper diff_bias_d(conf_.diff_weights_pd(1)); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper diff_bias_d(pd()->diff_weights_pd(1)); diff_dst += diff_dst_d.blocking_desc().offset_padding; - const int MB = conf_.MB(); - const int OC = conf_.OC(); - const int IC = conf_.IC_total_padded(); + const int MB = pd()->MB(); + const int OC = pd()->OC(); + const int IC = pd()->IC_total_padded(); - bool wei_tr = utils::one_of(conf_.diff_weights_pd()->desc()->format, + bool wei_tr = utils::one_of(pd()->diff_weights_pd()->desc()->format, hwio, dhwio, io); float alpha = 1.0, beta = 0.0; diff --git 
a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_inner_product.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_inner_product.hpp index 6e7806e..dcd9041 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_inner_product.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_inner_product.hpp @@ -64,19 +64,19 @@ struct gemm_inner_product_fwd_t: public cpu_primitive_t { } }; - gemm_inner_product_fwd_t(const pd_t *pd, const input_vector &inputs, + gemm_inner_product_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - pd_t conf_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; template @@ -108,19 +108,19 @@ struct gemm_inner_product_bwd_data_t: public cpu_primitive_t { } }; - gemm_inner_product_bwd_data_t(const pd_t *pd, const input_vector &inputs, + gemm_inner_product_bwd_data_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_backward_data(); e->set_state(event_t::ready); } private: - void execute_backward_data(); - pd_t conf_; + void execute_backward_data() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; template @@ -152,19 +152,19 @@ struct gemm_inner_product_bwd_weights_t: public cpu_primitive_t { } }; - gemm_inner_product_bwd_weights_t(const pd_t *pd, const input_vector &inputs, + gemm_inner_product_bwd_weights_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_backward_weights(); e->set_state(event_t::ready); } private: - void execute_backward_weights(); - pd_t conf_; + void execute_backward_weights() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_u8s8s32x_inner_product.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_u8s8s32x_inner_product.cpp deleted file mode 100644 index eb902a1..0000000 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_u8s8s32x_inner_product.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/******************************************************************************* -* Copyright 2018 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#include "mkldnn_types.h" -#include "mkldnn_thread.hpp" -#include "simple_q10n.hpp" -#include "gemm_u8s8s32x_inner_product.hpp" - -namespace mkldnn { -namespace impl { -namespace cpu { - -using namespace math; -using namespace memory_format; - -template -void gemm_u8s8s32x_inner_product_fwd_t::execute_forward() { -#if USE_MKL_IGEMM - auto src = reinterpret_cast(this->input_memory(0)); - auto weights = reinterpret_cast(this->input_memory(1)); - auto bias = reinterpret_cast(this->input_memory(2)); - auto dst = reinterpret_cast(this->memory()); - - const int MB = conf_.MB(); - const int OC = conf_.OC(); - - bool wei_tr = utils::one_of(conf_.weights_pd()->desc()->format, - oihw, oidhw, oi); - - const int M = OC; - const int N = MB; - const int K = conf_.IC_total_padded(); - const int8_t off_a = 0, off_b = 0; - const int32_t off_c = 0; - - const int scale_idx_mult = conf_.attr()->output_scales_.mask_ == (1 << 1); - const float *scales = conf_.attr()->output_scales_.scales_; - const auto rmode = conf_.attr()->round_mode_; - - const auto &post_ops = conf_.attr()->post_ops_; - const bool do_relu = post_ops.len_ == 1; - const float nslope = do_relu ? post_ops.entry_[0].eltwise.alpha : 0.f; - - acc_data_t *acc = this->dst_is_acc_ - ? (acc_data_t *)dst - : (acc_data_t *)this->scratchpad_->get(); - - auto get_bias = [=, &bias](size_t off) -> acc_data_t { -# define CASE(dt) case dt: return (acc_data_t)\ - (*((const prec_traits
::type *)bias + off)) - switch (conf_.desc()->bias_desc.data_type) { - CASE(data_type::s8); - CASE(data_type::u8); - CASE(data_type::s32); - CASE(data_type::f32); - default: assert(!"unimplemented"); - } -# undef CASE - return 0; - }; - - cblas_gemm_s8u8s32(CblasColMajor, wei_tr ? CblasTrans : CblasNoTrans, - CblasNoTrans, CblasFixOffset, M, N, K, 1., weights, - wei_tr ? K : M, off_a, src, K, off_b, 0., acc, M, &off_c); - - parallel_nd(MB, OC, [&](int mb, int oc) { - size_t dst_off = mb * OC + oc; - float d = (float)acc[dst_off]; - if (bias) - d += get_bias(oc); - d *= scales[oc * scale_idx_mult]; - if (do_relu && d < 0) - d *= nslope; - dst[dst_off] = qz_a1b0()(d, rmode); - }); -#endif -} - -using namespace data_type; - -template struct gemm_u8s8s32x_inner_product_fwd_t; -template struct gemm_u8s8s32x_inner_product_fwd_t; -template struct gemm_u8s8s32x_inner_product_fwd_t; -template struct gemm_u8s8s32x_inner_product_fwd_t; -} -} -} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_x8s8s32x_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_x8s8s32x_convolution.cpp index 5512626..d9b8205 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_x8s8s32x_convolution.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_x8s8s32x_convolution.cpp @@ -32,99 +32,547 @@ namespace cpu { using namespace mkldnn::impl::utils; using namespace mkldnn::impl::math; +using namespace mkldnn::impl::memory_tracking::names; -template -void _gemm_x8s8s32x_convolution_fwd_t::execute_forward() { +template +void _gemm_x8s8s32x_convolution_fwd_t:: +execute_forward() const { auto src_base = reinterpret_cast(this->input_memory(0)); auto wei_base = reinterpret_cast(this->input_memory(1)); auto bia_base = reinterpret_cast(this->input_memory(2)); auto dst_base = reinterpret_cast(this->memory()); - jit_gemm_conv_conf_t &jcp = this->conf_.jcp_; + auto scratchpad = this->scratchpad(); - char *scratchpad = (char *)this->scratchpad_->get(); - uint8_t *col = (uint8_t *)scratchpad; + const jit_gemm_conv_conf_t &jcp = this->pd()->jcp_; + + auto col = scratchpad.template get(key_conv_gemm_col); parallel_nd(jcp.im2col_sz * jcp.nthr, [&](ptrdiff_t i) { col[i] = jcp.signed_input ? (uint8_t)128 : (uint8_t)0; }); parallel(jcp.nthr, [&](const int ithr, const int nthr) { - execute_forward_thr(ithr, nthr, src_base, wei_base, bia_base, - dst_base, scratchpad); + execute_forward_thr(ithr, nthr, src_base, wei_base, bia_base, dst_base, + scratchpad); }); } -template -void _gemm_x8s8s32x_convolution_fwd_t::execute_forward_thr(const int ithr, const int nthr, - const src_data_t *src_base, const wei_data_t *wei_base, - const char *bia_base, dst_data_t *dst_base, char *scratchpad) { -#if USE_MKL_IGEMM - jit_gemm_conv_conf_t &jcp = this->conf_.jcp_; +template +_gemm_x8s8s32x_convolution_fwd_t::pp_ker_t::pp_ker_t( + const pd_t *pd) + : ker_(nullptr) + , jcp_(pd->jcp_) + , OC_(pd->jcp_.oc) + , OS_(pd->jcp_.os) + , bias_data_type_(data_type::undef) + , bias_data_type_size_(0) + , scale_idx_mult_(0) + , rmode_(round_mode::nearest) + , do_bias_(false) + , do_relu_(false) + , do_sum_(false) +{ + using namespace types; - const auto src_md = memory_desc_wrapper(conf_.src_pd()); - const size_t src_mb_stride = src_md.blk_off(1); - const size_t src_g_stride = src_md.blk_off(0, 1) * jcp.ic; + const auto dst_md = memory_desc_wrapper(pd->dst_pd()); + dst_os_stride_ = dst_md.blk_off(0, 0, 0, 1); - const auto wei_md = memory_desc_wrapper(conf_.weights_pd(0)); - const size_t wei_g_stride = conf_.with_groups() ? 
wei_md.blk_off(1) : 0; + scale_idx_mult_ = (pd->attr()->output_scales_.mask_ == (1 << 1)); + rmode_ = pd->attr()->round_mode_; - const auto dst_md = memory_desc_wrapper(conf_.dst_pd()); - const size_t dst_mb_stride = dst_md.blk_off(1); - const size_t dst_g_stride = dst_md.blk_off(0, 1) * jcp.oc; - const size_t dst_os_stride = dst_md.blk_off(0, 0, 0, 1); - - auto get_bias = [=, &bia_base](size_t off) -> acc_data_t { -# define CASE(dt) case dt: return (acc_data_t)\ - (*((const prec_traits
::type *)bia_base + off)) - switch (conf_.cdesc()->bias_desc.data_type) { - CASE(data_type::s8); - CASE(data_type::u8); - CASE(data_type::s32); - CASE(data_type::f32); + auto &post_ops = pd->attr()->post_ops_; + + int entry_idx = -1; + for (int idx = 0; idx < post_ops.len_; ++idx) { + const auto &e = post_ops.entry_[idx]; + if (e.is_relu(true, false)) { + entry_idx = idx; + break; + } + } + do_relu_ = entry_idx >= 0; + + do_signed_scaling_ = jcp_.signed_input; + + do_sum_ = post_ops.contain(primitive_kind::sum, 0); + do_bias_ = pd->with_bias(); + bias_data_type_ = pd->desc()->bias_desc.data_type; + if (do_bias_) { + assert(bias_data_type_ != data_type::undef); + bias_data_type_size_ = data_type_size(bias_data_type_); + } + const size_t vlen_start + = cpu_isa_traits::vlen / sizeof(float); + + for (size_t i = vlen_start; i > 0; i--) { + if (OC_ % i == 0) { + vlen_ = i; + break; + } + } + + if (!mayiuse(avx512_core)) + // use fallback code for older CPUs + return; + else + generate(); +} + +template +void _gemm_x8s8s32x_convolution_fwd_t::pp_ker_t::generate() +{ + using namespace Xbyak; + using namespace utils; + using namespace round_mode; + + // TODO: clean-up + Reg64 reg_param = abi_param1; + Reg64 reg_dst = rdx; + Reg64 reg_acc = rax; + Reg64 reg_bias = rbx; + Reg64 reg_scales = rsi; + + Reg64 reg_len = r8; + Reg64 reg_tmp = rcx; // intentional for shifting purposes + Reg64 reg_oc_offset = r9; + Reg64 reg_rem_mask_short = r10; + Reg64 reg_rem_mask_vlen = r11; + Opmask kreg_rem_mask_short = k1; + Opmask kreg_rem_mask_vlen = k3; + Opmask kreg_relu_cmp = k2; + + const size_t vlen = 4; + + Zmm vreg_zero = Zmm(0); + Zmm vreg_scale = Zmm(1); + Zmm vreg_nslope = Zmm(2); + Zmm vreg_sum_scale = Zmm(3); + Zmm vreg_signed_scale = Zmm(4); + + size_t def_unroll = 4; + size_t max_unroll = 12; + size_t zmm_step = 2; + if (do_sum_) { + max_unroll = 8; + zmm_step = 3; + } + + auto vreg_dst = [&](int idx) { + return Zmm(5 + idx * zmm_step + 0); + }; + auto vreg_bias = [&](int idx) { + return Zmm(5 + idx * zmm_step + 1); + }; + auto vreg_prev_dst = [&](int idx) { + return Zmm(5 + idx * zmm_step + 2); + }; + + preamble(); + +#define PARAM_OFF(x) offsetof(ker_args, x) + mov(reg_dst, ptr[reg_param + PARAM_OFF(dst)]); + mov(reg_acc, ptr[reg_param + PARAM_OFF(acc)]); + mov(reg_bias, ptr[reg_param + PARAM_OFF(bias)]); + mov(reg_scales, ptr[reg_param + PARAM_OFF(scales)]); + mov(reg_len, ptr[reg_param + PARAM_OFF(len)]); + mov(reg_oc_offset, ptr[reg_param + PARAM_OFF(oc_offset)]); + vbroadcastss(vreg_nslope, ptr[reg_param + PARAM_OFF(nslope)]); + vbroadcastss(vreg_sum_scale, ptr[reg_param + PARAM_OFF(sum_scale)]); + vbroadcastss(vreg_signed_scale, ptr[reg_param + PARAM_OFF(signed_scale)]); + if (scale_idx_mult_ == 0) + vbroadcastss(vreg_scale, dword[reg_scales]); + +#undef PARAM_OFF + + mov(reg_rem_mask_vlen, 1); + shl(reg_rem_mask_vlen, vlen); + sub(reg_rem_mask_vlen, 1); + kmovq(kreg_rem_mask_vlen, reg_rem_mask_vlen); + + if (do_relu_ || dst_type == data_type::u8) + vxorps(vreg_zero, vreg_zero, vreg_zero); + + // Load accumulated value, convert to float, apply sum (if any), + // bias (if any), scaling, and relu (if any); + // then convert to destination type and store + auto compute = [&](size_t offset, int idx, bool apply_mask) { + auto acc_addr = ptr[reg_acc + offset * sizeof(acc_data_t)]; + + if (scale_idx_mult_ > 0) { + assert(scale_idx_mult_ == 1); + auto scale_addr = ptr[reg_scales + offset * sizeof(float)]; + auto vreg_scale_ = vreg_scale; + if (apply_mask) + vreg_scale_ = vreg_scale_ | 
kreg_rem_mask_short; + else + vreg_scale_ = vreg_scale_ | kreg_rem_mask_vlen; + vmovups(vreg_scale_, scale_addr); + } + + auto vreg_dst_ = vreg_dst(idx); + if (apply_mask) + vreg_dst_ = vreg_dst_ | kreg_rem_mask_short; + else + vreg_dst_ = vreg_dst_ | kreg_rem_mask_vlen; + vcvtdq2ps(vreg_dst_, acc_addr); + + if (do_signed_scaling_) + vmulps(vreg_dst(idx), vreg_dst(idx), vreg_signed_scale); + + if (do_bias_) { + auto bias_addr = ptr[reg_bias + offset * bias_data_type_size_]; + auto vreg_bias_ = vreg_bias(idx); + if (apply_mask) + vreg_bias_ = vreg_bias_ | kreg_rem_mask_short; + else + vreg_bias_ = vreg_bias_ | kreg_rem_mask_vlen; + + switch (bias_data_type_) { + case data_type::s8: + vpmovsxbd(vreg_bias_, bias_addr); + break; + case data_type::u8: + vpmovzxbd(vreg_bias_, bias_addr); + break; + case data_type::s32: + vcvtdq2ps(vreg_bias_, bias_addr); + break; + case data_type::f32: + vmovups(vreg_bias_, bias_addr); + break; + default: assert(!"unimplemented"); + } + vaddps(vreg_dst(idx), vreg_dst(idx), vreg_bias(idx)); + } + + vmulps(vreg_dst(idx), vreg_dst(idx), vreg_scale); + + auto dst_addr = ptr[reg_dst + offset * sizeof(dst_data_t)]; + + if (do_sum_) + { + auto vreg_prev_dst_ = vreg_prev_dst(idx); + if (apply_mask) + vreg_prev_dst_ = vreg_prev_dst_ | kreg_rem_mask_short; + else + vreg_prev_dst_ = vreg_prev_dst_ | kreg_rem_mask_vlen; + + switch (dst_type) { + case data_type::f32: + case data_type::s32: vmovups(vreg_prev_dst_, dst_addr); break; + case data_type::s8: vpmovsxbd(vreg_prev_dst_, dst_addr); break; + case data_type::u8: vpmovzxbd(vreg_prev_dst_, dst_addr); break; + default: assert(!"unsupported data type"); + } + if (dst_type != data_type::f32) + vcvtdq2ps(vreg_prev_dst(idx), vreg_prev_dst(idx)); + + vfmadd231ps(vreg_dst(idx), vreg_prev_dst(idx), vreg_sum_scale); + } + + if (do_relu_) { + vcmpps(kreg_relu_cmp, vreg_dst(idx), vreg_zero, _cmp_lt_os); + vmulps(vreg_dst(idx) | kreg_relu_cmp, vreg_dst(idx), vreg_nslope); + } + + if (dst_type != data_type::f32) { + auto rmode_control = (rmode_ == nearest ? 
T_rn_sae : T_rd_sae); + vcvtps2dq(vreg_dst(idx) | rmode_control, vreg_dst(idx)); + } + + if (dst_type == data_type::u8) + vpmaxsd(vreg_dst(idx), vreg_dst(idx), vreg_zero); + + switch (dst_type) { + case data_type::s8: + vpmovsdb(dst_addr, vreg_dst_); + break; + case data_type::u8: + vpmovusdb(dst_addr, vreg_dst_); + break; + case data_type::f32: + case data_type::s32: + vmovups(dst_addr, vreg_dst_); + break; default: assert(!"unimplemented"); } -# undef CASE - return 0; }; - /* scale_idx_mult = 1 for per_oc scales and 0, otherwise */ - const int scale_idx_mult = conf_.attr()->output_scales_.mask_ == (1 << 1); - const float *scales = conf_.attr()->output_scales_.scales_; + // Advance all pointers by an immediate + auto advance_ptrs_imm = [&](size_t offset) { + add(reg_dst, offset * sizeof(dst_data_t)); + add(reg_acc, offset * sizeof(acc_data_t)); + if (scale_idx_mult_) { + assert(scale_idx_mult_ == 1); + add(reg_scales, offset * sizeof(float)); + } + if (do_bias_) + add(reg_bias, offset * bias_data_type_size_); + }; + + // Advance all pointers by a value stored in a register + auto advance_ptrs_reg = [&](Reg64 offset) { + lea(reg_dst, ptr[reg_dst + offset * sizeof(dst_data_t)]); + lea(reg_acc, ptr[reg_acc + offset * sizeof(acc_data_t)]); + if (scale_idx_mult_) { + assert(scale_idx_mult_ == 1); + lea(reg_scales, ptr[reg_scales + offset * sizeof(float)]); + } + if (do_bias_) + lea(reg_bias, ptr[reg_bias + offset * bias_data_type_size_]); + }; + + // Rewind pointers that point to data that is indexed by output channel + // (bias or per-oc scaling factors) + auto rewind_ptrs = [&]() { + if (do_bias_) + sub(reg_bias, OC_ * bias_data_type_size_); + if (scale_idx_mult_) { + assert(scale_idx_mult_ == 1); + sub(reg_scales, OC_ * sizeof(float)); + } + add(reg_dst, (dst_os_stride_ - OC_) * sizeof(dst_data_t)); + }; + + // <--------- OC ---------------> + // + // ^ ................+..............+-------------+....................... + // | . : not accessed |Prologue loop| . + // | . +--------------+-------------+ . + // . | | . + // O . | Main loop (unrolled) | . + // S . | | . + // . +--------------+-------------+ . + // | . | Epilogue loop|not accessed : . + // v ................+--------------+.............+....................... 
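// The diagram above shows how one call covers a flat range [start, end) of the OS x OC output: a partial first row (prologue), whole rows (unrolled main loop), then a partial last row (epilogue). A rough scalar model of the same traversal follows; it is an illustrative sketch only, not the kernel itself, and process() is a hypothetical stand-in for one compute() chunk:
//
//     void walk(size_t start, size_t end, size_t OC) {
//         size_t len = end - start;
//         size_t oc = start % OC;            // column where the range begins
//         if (oc) {                          // prologue: finish the first row
//             size_t n = std::min(OC - oc, len);
//             process(oc, n);
//             len -= n;
//         }
//         while (len >= OC) {                // main loop: full rows of OC
//             process(0, OC);
//             len -= OC;
//         }
//         if (len) process(0, len);          // epilogue: head of the last row
//     }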
+ + Label prologue_end; + cmp(reg_oc_offset, 0); + je(prologue_end, T_NEAR); + + // Prologue loop + { + mov(reg_tmp, OC_); + sub(reg_tmp, reg_oc_offset); + cmp(reg_tmp, reg_len); + cmovg(reg_tmp, reg_len); + sub(reg_len, reg_tmp); + + Label prologue_loop, prologue_loop_tail, prologue_loop_end; + cmp(reg_tmp, vlen); + jle(prologue_loop_tail, T_NEAR); + L(prologue_loop); { + compute(0, 0, false); + advance_ptrs_imm(vlen); + sub(reg_tmp, vlen); + cmp(reg_tmp, vlen); + jge(prologue_loop, T_NEAR); + } + + L(prologue_loop_tail); + mov(reg_rem_mask_short, 1); + // cl == reg_tmp because reg_tmp <= vlen here + shl(reg_rem_mask_short, cl); + sub(reg_rem_mask_short, 1); + jz(prologue_loop_end, T_NEAR); + + kmovq(kreg_rem_mask_short, reg_rem_mask_short); + compute(0, 0, true); + advance_ptrs_reg(reg_tmp); - const auto rmode = conf_.attr()->round_mode_; + L(prologue_loop_end); + rewind_ptrs(); + } + L(prologue_end); + + // Main loop + Label main_loop_end; + { + cmp(reg_len, OC_); + jle(main_loop_end, T_NEAR); + + Label main_loop; + L(main_loop); { + size_t OC_loop, OC_tail; + if (OC_ < max_unroll * vlen) { + // Fully unroll small loops + OC_loop = 0; + OC_tail = OC_; + } + else { + OC_loop = vlen * def_unroll; + OC_tail = OC_ % OC_loop; + } + + assert(!!OC_loop || !!OC_tail); + + if (OC_tail % vlen) { + int vlen_tail = OC_tail % vlen; + unsigned tail_mask = (1 << vlen_tail) - 1; + mov(reg_tmp, tail_mask); + kmovq(kreg_rem_mask_short, reg_tmp); + } + + if (OC_loop) { + mov(reg_tmp, rnd_dn(OC_, OC_loop)); + Label oc_loop; + L(oc_loop); { + for (size_t offset = 0; offset < OC_loop; offset += vlen) + compute(offset, offset / vlen, false); + advance_ptrs_imm(OC_loop); + sub(reg_tmp, OC_loop); + jnz(oc_loop); + } + } + + if (OC_tail) { + for (size_t offset = 0; offset < OC_tail; offset += vlen) { + bool use_mask = (offset + vlen) > OC_tail; + compute(offset, offset / vlen, use_mask); + } + advance_ptrs_imm(OC_tail); + } + + rewind_ptrs(); + sub(reg_len, OC_); + cmp(reg_len, OC_); + jge(main_loop, T_NEAR); + } + } + L(main_loop_end); + + // Epilogue loop + Label epilogue_end; + { + cmp(reg_len, 0); + je(epilogue_end, T_NEAR); + + Label epilogue_loop, epilogue_loop_tail; + cmp(reg_len, vlen); + jle(epilogue_loop_tail, T_NEAR); + L(epilogue_loop); { + compute(0, 0, false); + sub(reg_len, vlen); + advance_ptrs_imm(vlen); + cmp(reg_len, vlen); + jge(epilogue_loop, T_NEAR); + } + + L(epilogue_loop_tail); + mov(reg_tmp, reg_len); // reg_tmp is rcx, and we need cl for the shift + mov(reg_rem_mask_short, 1); + shl(reg_rem_mask_short, cl); // reg_tmp == rcx and reg_tail < vlen + sub(reg_rem_mask_short, 1); + jz(epilogue_end, T_NEAR); + kmovq(kreg_rem_mask_short, reg_rem_mask_short); + compute(0, 0, true); + } - const bool use_fast_path = true - && scale_idx_mult == 0 - && jcp.ngroups == 1 - && !jcp.with_bias; - const float fast_path_alpha = scales[0] / jcp.wei_adj_scale; + L(epilogue_end); - const auto &post_ops = conf_.attr()->post_ops_; + postamble(); + + ker_ = getCode(); +} + +template +void _gemm_x8s8s32x_convolution_fwd_t::pp_ker_t::operator () + (dst_data_t *dst, const acc_data_t *acc, const char *bias, + const float *scales, float nslope, float sum_scale, float signed_scale, + int g, size_t start, size_t end) +{ + using math::get_bias; + + if (end <= start) + return; + + if (ker_) { + // JIT + ker_args args; + size_t oc_offset = start % OC_; + size_t os_offset = start / OC_; + args.acc = acc + start; + args.dst = dst + os_offset * dst_os_stride_ + oc_offset; + args.bias = bias + (g * jcp_.oc + oc_offset) * 
bias_data_type_size_; + args.scales = scales + scale_idx_mult_ * (g * jcp_.oc + oc_offset); + args.nslope = nslope; + args.sum_scale = sum_scale; + args.signed_scale = signed_scale; + args.len = end - start; + args.oc_offset = oc_offset; + ker_(&args); + } + else { + // Fallback + const size_t first_oc = start % OC_; + const size_t last_oc = (end - 1) % OC_; + const size_t first_os = start / OC_; + const size_t last_os = (end - 1) / OC_; + for (size_t os = first_os; os <= last_os; os++) { + const size_t start_oc = (os == first_os) ? first_oc : 0; + const size_t end_oc = (os == last_os) ? last_oc : OC_ - 1; + for (size_t oc = start_oc; oc <= end_oc; oc++) { + const size_t acc_off = os * jcp_.oc + oc; + const size_t dst_off = os * dst_os_stride_ + oc; + + float d = (float)(acc[acc_off]); + if (jcp_.signed_input) + d *= signed_scale; + + if (do_bias_) + d += get_bias(bias, g * jcp_.oc + oc, + bias_data_type_); + + d *= scales[(g * jcp_.oc + oc) * scale_idx_mult_]; + if (do_sum_) + d += sum_scale * dst[dst_off]; + if (do_relu_ && d < 0) + d *= nslope; + dst[dst_off] = qz_a1b0()(d, rmode_); + } + } + } +}; + +template +void _gemm_x8s8s32x_convolution_fwd_t:: +execute_forward_thr(const int ithr, const int nthr, const src_data_t *src_base, + const wei_data_t *wei_base, const char *bia_base, dst_data_t *dst_base, + const memory_tracking::grantor_t &scratchpad) const { + const jit_gemm_conv_conf_t &jcp = this->pd()->jcp_; + + const auto src_md = memory_desc_wrapper(pd()->src_pd()); + const size_t src_mb_stride = src_md.blk_off(1); + const size_t src_g_stride = src_md.blk_off(0, 1) * jcp.ic; + + const auto wei_md = memory_desc_wrapper(pd()->weights_pd(0)); + const size_t wei_g_stride = pd()->with_groups() ? wei_md.blk_off(1) : 0; + + const auto dst_md = memory_desc_wrapper(pd()->dst_pd()); + const size_t dst_mb_stride = dst_md.blk_off(1); + const size_t dst_g_stride = dst_md.blk_off(0, 1) * jcp.oc; + + const float *scales = pd()->attr()->output_scales_.scales_; + + const auto &post_ops = pd()->attr()->post_ops_; const bool do_sum = post_ops.contain(primitive_kind::sum, 0); const float sum_scale = do_sum ? post_ops.entry_[0].sum.scale : 0; - float nslope = jcp.with_relu ? 
jcp.relu_negative_slope : 0; - int entry_idx = -1; + float nslope = 0; for (int idx = 0; idx < post_ops.len_; ++idx) { const auto &e = post_ops.entry_[idx]; if (e.is_relu(true, false)) { - entry_idx = idx; nslope = e.eltwise.alpha; break; } } - const bool do_relu = jcp.with_relu || (entry_idx >= 0); - - uint8_t *_col = (uint8_t *)scratchpad; - ptrdiff_t offset = (ptrdiff_t)jcp.im2col_sz * sizeof(uint8_t) * jcp.nthr; - acc_data_t *_acc = (acc_data_t *)(scratchpad + offset); - uint8_t *col = _col + (ptrdiff_t)ithr * jcp.im2col_sz; - acc_data_t *acc = _acc + (ptrdiff_t)ithr * jcp.os * jcp.oc; + auto col = scratchpad.get(key_conv_gemm_col) + + (ptrdiff_t)ithr * jcp.im2col_sz; + auto acc = scratchpad.get(key_conv_int_dat_in_acc_dt) + + (ptrdiff_t)ithr * jcp.os * jcp.oc; - offset = (ptrdiff_t)jcp.ngroups * jcp.ks * jcp.ic * jcp.oc; + const ptrdiff_t offset = (ptrdiff_t)jcp.ngroups * jcp.ks * jcp.ic * jcp.oc; const int32_t *_wei_comp = (const int32_t *)(wei_base + offset); int n{0}, g{0}; @@ -147,62 +595,40 @@ void _gemm_x8s8s32x_convolution_fwd_t()(d, rmode); - }; - -# if MKLDNN_THR == MKLDNN_THR_OMP && _OPENMP >= 201307 -# pragma omp parallel for simd - for (int o = 0; o < jcp.os * jcp.oc; ++o) body(o); -# else - parallel_nd(jcp.os * jcp.oc, body); -# endif - } else { - parallel_nd(jcp.os, jcp.oc, [&](const int os, const int oc) { - const size_t acc_off = os * jcp.oc + oc; - float d = (float)acc[acc_off]; - if (jcp.signed_input) - d /= jcp.wei_adj_scale; - - if (jcp.with_bias) - d += get_bias(g * jcp.oc + oc); - - d *= scales[(g * jcp.oc + oc) * scale_idx_mult]; - - const size_t dst_off = os * dst_os_stride + oc; - if (do_sum) d += sum_scale * dst[dst_off]; - if (do_relu && d < 0) d *= nslope; - dst[dst_off] = qz_a1b0()(d, rmode); - }); - } nd_iterator_step(n, jcp.mb, g, jcp.ngroups); } -#endif } template -void _gemm_u8s8s32x_convolution_bwd_data_t::execute_backward_data() { +void _gemm_u8s8s32x_convolution_bwd_data_t:: +execute_backward_data() const { auto diff_dst_base = reinterpret_cast (this->input_memory(0)); auto wei_base = reinterpret_cast(this->input_memory(1)); auto bia_base = reinterpret_cast(this->input_memory(2)); auto diff_src_base = reinterpret_cast(this->memory()); - jit_gemm_conv_conf_t &jcp = this->conf_.jcp_; - char *scratchpad = (char *)this->scratchpad_->get(); + auto scratchpad = this->scratchpad(); + + const jit_gemm_conv_conf_t &jcp = this->pd()->jcp_; parallel(jcp.nthr, [&](const int ithr, const int nthr) { execute_backward_data_thr(ithr, nthr, diff_dst_base, wei_base, @@ -211,53 +637,36 @@ void _gemm_u8s8s32x_convolution_bwd_data_t::execute_backward_data() { } template -void _gemm_u8s8s32x_convolution_bwd_data_t -::execute_backward_data_thr(const int ithr, const int nthr, +void _gemm_u8s8s32x_convolution_bwd_data_t:: +execute_backward_data_thr(const int ithr, const int nthr, const diff_dst_data_t *diff_dst_base, const wei_data_t *wei_base, - const char *bia_base, diff_src_data_t *diff_src_base, char *scratchpad) + const char *bia_base, diff_src_data_t *diff_src_base, + const memory_tracking::grantor_t &scratchpad) const { -#if USE_MKL_IGEMM - jit_gemm_conv_conf_t &jcp = this->conf_.jcp_; + const jit_gemm_conv_conf_t &jcp = this->pd()->jcp_; - const auto diff_dst_md = memory_desc_wrapper(conf_.diff_dst_pd()); + const auto diff_dst_md = memory_desc_wrapper(pd()->diff_dst_pd()); const size_t diff_dst_mb_stride = diff_dst_md.blk_off(1); const size_t diff_dst_g_stride = diff_dst_md.blk_off(0, 1) * jcp.oc; - const auto wei_md = memory_desc_wrapper(conf_.weights_pd(0)); - 
const size_t wei_g_stride = conf_.with_groups() ? wei_md.blk_off(1) : 0; + const auto wei_md = memory_desc_wrapper(pd()->weights_pd(0)); + const size_t wei_g_stride = pd()->with_groups() ? wei_md.blk_off(1) : 0; - const auto diff_src_md = memory_desc_wrapper(conf_.diff_src_pd()); + const auto diff_src_md = memory_desc_wrapper(pd()->diff_src_pd()); const size_t diff_src_mb_stride = diff_src_md.blk_off(1); const size_t diff_src_g_stride = diff_src_md.blk_off(0, 1) * jcp.ic; const size_t diff_src_os_stride = diff_src_md.blk_off(0, 0, 0, 1); - auto get_bias = [=, &bia_base](size_t off) -> acc_data_t { -# define CASE(dt) case dt: return (acc_data_t)\ - (*((const prec_traits
::type *)bia_base + off)) - switch (conf_.desc()->bias_desc.data_type) { - CASE(data_type::s8); - CASE(data_type::u8); - CASE(data_type::s32); - CASE(data_type::f32); - default: assert(!"unimplemented"); - } -# undef CASE - return 0; - }; - /* scale_idx_mult = 1 for per_oc scales and 0, otherwise */ - const int scale_idx_mult = conf_.attr()->output_scales_.mask_ == (1 << 1); - const float *scales = conf_.attr()->output_scales_.scales_; - const auto rmode = conf_.attr()->round_mode_; + const int scale_idx_mult = pd()->attr()->output_scales_.mask_ == (1 << 1); + const float *scales = pd()->attr()->output_scales_.scales_; + const auto rmode = pd()->attr()->round_mode_; const size_t work_amount = jcp.ngroups * jcp.mb; - acc_data_t *_col = (acc_data_t *)scratchpad; - ptrdiff_t offset = (ptrdiff_t)jcp.im2col_sz - * sizeof(acc_data_t) * jcp.nthr; - acc_data_t *_acc = (acc_data_t *)(scratchpad + offset); - - acc_data_t *col = _col + (ptrdiff_t)ithr * jcp.im2col_sz; - acc_data_t *acc = _acc + (ptrdiff_t)ithr * jcp.is * jcp.ic; + auto col = scratchpad.get(key_conv_gemm_col) + + (ptrdiff_t)ithr * jcp.im2col_sz; + auto acc = scratchpad.get(key_conv_int_dat_in_acc_dt) + + (ptrdiff_t)ithr * jcp.is * jcp.ic; int n{0}, g{0}; size_t start = 0, end = 0; @@ -277,11 +686,12 @@ void _gemm_u8s8s32x_convolution_bwd_data_t const int K = jcp.oc; const int8_t off_a = 0, off_b = 0; const int32_t off_c = 0; + const float onef = 1.0, zerof = 0.0; + const int LD = K * jcp.ngroups; - cblas_gemm_s8u8s32(CblasColMajor, CblasTrans, CblasNoTrans, - CblasFixOffset, M, N, K, 1., wei, K * jcp.ngroups, off_a, - diff_dst, K * jcp.ngroups, off_b, 0., jcp.im2col_sz ? col - : acc, M, &off_c); + mkldnn_gemm_s8u8s32("T", "N", "F", &M, &N, &K, &onef, + wei, &LD, &off_a, diff_dst, &LD, &off_b, + &zerof, jcp.im2col_sz ? 
col : acc, &M, &off_c); if (jcp.im2col_sz) jit_gemm_convolution_utils::col2im_s32(jcp, col, acc); @@ -289,7 +699,8 @@ void _gemm_u8s8s32x_convolution_bwd_data_t parallel_nd(jcp.is, jcp.ic, [&](int is, int ic) { float d = (float)acc[is * jcp.ic + ic]; if (jcp.with_bias) - d += get_bias(g * jcp.ic + ic); + d += get_bias(bia_base, g * jcp.ic + ic, + pd()->desc()->bias_desc.data_type); d *= scales[(g * jcp.ic + ic) * scale_idx_mult]; const size_t diff_src_off = is * diff_src_os_stride + ic; diff_src[diff_src_off] = @@ -297,28 +708,19 @@ void _gemm_u8s8s32x_convolution_bwd_data_t }); nd_iterator_step(n, jcp.mb, g, jcp.ngroups); } -#endif } using namespace data_type; -template struct _gemm_x8s8s32x_convolution_fwd_t; -template struct _gemm_x8s8s32x_convolution_fwd_t; -template struct _gemm_x8s8s32x_convolution_fwd_t; -template struct _gemm_x8s8s32x_convolution_fwd_t; -template struct _gemm_x8s8s32x_convolution_fwd_t; -template struct _gemm_x8s8s32x_convolution_fwd_t; -template struct _gemm_x8s8s32x_convolution_fwd_t; -template struct _gemm_x8s8s32x_convolution_fwd_t; - -template struct _gemm_x8s8s32x_convolution_fwd_t; -template struct _gemm_x8s8s32x_convolution_fwd_t; -template struct _gemm_x8s8s32x_convolution_fwd_t; -template struct _gemm_x8s8s32x_convolution_fwd_t; -template struct _gemm_x8s8s32x_convolution_fwd_t; -template struct _gemm_x8s8s32x_convolution_fwd_t; -template struct _gemm_x8s8s32x_convolution_fwd_t; -template struct _gemm_x8s8s32x_convolution_fwd_t; +template struct _gemm_x8s8s32x_convolution_fwd_t; +template struct _gemm_x8s8s32x_convolution_fwd_t; +template struct _gemm_x8s8s32x_convolution_fwd_t; +template struct _gemm_x8s8s32x_convolution_fwd_t; + +template struct _gemm_x8s8s32x_convolution_fwd_t; +template struct _gemm_x8s8s32x_convolution_fwd_t; +template struct _gemm_x8s8s32x_convolution_fwd_t; +template struct _gemm_x8s8s32x_convolution_fwd_t; template struct _gemm_u8s8s32x_convolution_bwd_data_t; template struct _gemm_u8s8s32x_convolution_bwd_data_t; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_x8s8s32x_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_x8s8s32x_convolution.hpp index 3bc0cc4..e7943ac 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_x8s8s32x_convolution.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_x8s8s32x_convolution.hpp @@ -18,28 +18,31 @@ #define GEMM_X8S8S32X_CONVOLUTION_HPP #include "c_types_map.hpp" +#include "memory_tracking.hpp" + #include "cpu_convolution_pd.hpp" #include "cpu_engine.hpp" #include "jit_primitive_conf.hpp" +#include "jit_generator.hpp" #include "gemm_convolution_utils.hpp" -#include "gemm/os_blas.hpp" +#include "gemm/gemm.hpp" namespace mkldnn { namespace impl { namespace cpu { -template +template struct _gemm_x8s8s32x_convolution_fwd_t: public cpu_primitive_t { - struct pd_t: public _cpu_convolution_fwd_pd_t { - pd_t(engine_t *engine, const typename pd_t::base_desc_t *adesc, + struct pd_t: public cpu_convolution_fwd_pd_t { + pd_t(engine_t *engine, const convolution_desc_t *adesc, const primitive_attr_t *attr, const typename pd_t::base_class *hint_fwd_pd) - : _cpu_convolution_fwd_pd_t(engine, adesc, attr, - hint_fwd_pd), jcp_() {} + : cpu_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) + , jcp_() {} - DECLARE_COMMON_PD_T("gemm:blas", - _gemm_x8s8s32x_convolution_fwd_t); + DECLARE_COMMON_PD_T(IGEMM_S8U8S32_IMPL_STR, + _gemm_x8s8s32x_convolution_fwd_t); virtual status_t init() override { using namespace data_type; @@ -48,30 +51,33 @@ struct 
_gemm_x8s8s32x_convolution_fwd_t: public cpu_primitive_t { assert(this->engine()->kind() == engine_kind::cpu); bool ok = true -#if !USE_MKL_IGEMM - && false -#endif && this->set_default_params() == status::success - && utils::one_of(this->cdesc_().prop_kind, + && utils::one_of(this->desc()->prop_kind, prop_kind::forward_training, prop_kind::forward_inference) - && this->cdesc_().alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() - && this->cdesc_().src_desc.data_type == src_type - && this->cdesc_().dst_desc.data_type == dst_type - && this->cdesc_().weights_desc.data_type == s8 + && this->desc()->src_desc.data_type == src_type + && this->desc()->dst_desc.data_type == dst_type + && this->desc()->weights_desc.data_type == s8 && IMPLICATION(this->with_bias(), utils::one_of( - this->cdesc_().bias_desc.data_type, f32, s32, s8, + this->desc()->bias_desc.data_type, f32, s32, s8, u8)) - && this->cdesc_().accum_data_type == data_type::s32 + && this->desc()->accum_data_type == data_type::s32 && utils::everyone_is(nhwc, this->src_pd_.desc()->format, this->dst_pd_.desc()->format) && this->weights_pd_.desc()->format == (this->with_groups() ? ((src_type == data_type::s8) ? hwigo_s8s8 : hwigo) : ((src_type == data_type::s8) ? hwio_s8s8 : hwio)) && this->is_gemm_conv_format(); + if (!ok) return status::unimplemented; - return ok ? status::success : status::unimplemented; + auto scratchpad = scratchpad_registry().registrar(); + return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad, + *this->desc(), this->src_pd(), this->weights_pd(0), + this->dst_pd(), mkldnn_get_max_threads()); } jit_gemm_conv_conf_t jcp_; @@ -79,94 +85,127 @@ struct _gemm_x8s8s32x_convolution_fwd_t: public cpu_primitive_t { protected: virtual status_t set_default_params() override { using namespace memory_format; - bool is_sign_input = - (this->cdesc_().src_desc.data_type == data_type::s8); + const bool is_sign_input = + this->desc()->src_desc.data_type == data_type::s8; + if (this->src_pd_.desc()->format == any) CHECK(this->src_pd_.set_format(nhwc)); if (this->dst_pd_.desc()->format == any) CHECK(this->dst_pd_.set_format(nhwc)); if (this->weights_pd_.desc()->format == any) CHECK(this->weights_pd_.set_format(this->with_groups() - ? ((is_sign_input) ? hwigo_s8s8 : hwigo) - : ((is_sign_input) ? hwio_s8s8 : hwio))); + ? (is_sign_input ? hwigo_s8s8 : hwigo) + : (is_sign_input ? 
hwio_s8s8 : hwio))); if (this->bias_pd_.desc()->format == any) CHECK(this->bias_pd_.set_format(x)); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } virtual bool is_gemm_conv_format() const { using namespace mkldnn::impl::primitive_kind; - bool ok = true; auto const &po = this->attr()->post_ops_; + auto is_relu = [&](int idx) { + return po.entry_[idx].is_relu(true, false); }; + switch (po.len_) { - case 0: break; - case 1: ok = ok - && (po.entry_[0].is_relu() || po.contain(sum, 0)); - break; - case 2: ok = ok - && (po.contain(sum, 0) && po.entry_[1].is_relu()); - break; - default: ok = false; + case 0: return true; + case 1: return is_relu(0) || po.contain(sum, 0); + case 2: return po.contain(sum, 0) && is_relu(1); + default: return false; } - return ok; + return false; } }; - _gemm_x8s8s32x_convolution_fwd_t(const pd_t *pd, const input_vector &inputs, + _gemm_x8s8s32x_convolution_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - , scratchpad_(nullptr) - { - jit_gemm_convolution_utils::init_conf(conf_.jcp_, - *conf_.cdesc(), conf_.src_pd(), conf_.weights_pd(0), - conf_.dst_pd(), mkldnn_get_max_threads(), with_relu, conf_.negative_slope()); - - size_t col_size = (size_t)conf_.jcp_.im2col_sz * sizeof(src_data_t); - size_t acc_size = (size_t)conf_.jcp_.os * conf_.jcp_.oc - * sizeof(acc_data_t); - size_t size = col_size + acc_size; - - jit_gemm_convolution_utils::prepare_scratchpad(this->conf_.jcp_, - &this->scratchpad_, size, this->conf_.jcp_.nthr); + : cpu_primitive_t(apd, inputs, outputs, true) { + pp_ker_ = new pp_ker_t(apd); } - ~_gemm_x8s8s32x_convolution_fwd_t() { - delete this->scratchpad_; - }; + delete pp_ker_; + } typedef typename prec_traits::type src_data_t; typedef typename prec_traits::type wei_data_t; typedef typename prec_traits::type dst_data_t; typedef typename prec_traits::type acc_data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + void execute_forward() const; + // XXX: this is throwaway code that will become unnecessary when we have a + // sufficiently advanced igemm jit generator that supports quantization, + // relu, and whatnot + class pp_ker_t : jit_generator { + public: + DECLARE_CPU_JIT_AUX_FUNCTIONS( + _gemm_x8s8s32x_convolution_fwd_t::pp_kernel); + pp_ker_t(const pd_t *pd); + + void operator()(dst_data_t *dst, const acc_data_t *acc, + const char *bias, const float *scales, + float nslope, float sum_scale, float signed_scale, + int g, size_t start, size_t end); + private: + void generate(); + + struct ker_args { + dst_data_t *dst; + const acc_data_t *acc; + const char *bias; + const float *scales; + float nslope; + float sum_scale; + float signed_scale; + size_t len; + size_t oc_offset; + }; + void(*ker_)(const ker_args *args); + + const jit_gemm_conv_conf_t jcp_; + size_t OC_; + size_t OS_; + data_type_t bias_data_type_; + size_t bias_data_type_size_; + size_t scale_idx_mult_; + round_mode_t rmode_; + bool do_bias_; + bool do_relu_; + bool do_sum_; + bool do_signed_scaling_; + size_t dst_os_stride_; + size_t vlen_; + }; + + void execute_forward_thr(const int ithr, const int nthr, const src_data_t *src_base, const wei_data_t *wei_base, const char *bia_base, dst_data_t *dst_base, - char 
*scratchpad); - pd_t conf_; - scratchpad_t *scratchpad_; + const memory_tracking::grantor_t &scratchpad) const; + int nthr_; + pp_ker_t *pp_ker_; + }; template struct _gemm_u8s8s32x_convolution_bwd_data_t: public cpu_primitive_t { struct pd_t: public cpu_convolution_bwd_data_pd_t{ pd_t(engine_t *engine, - const convolution_desc_t *adesc, - const primitive_attr_t *attr, + const convolution_desc_t *adesc, const primitive_attr_t *attr, const convolution_fwd_pd_t *hint_fwd_pd) : cpu_convolution_bwd_data_pd_t(engine, adesc, attr, hint_fwd_pd) - , jcp_() - {} + , jcp_() {} - DECLARE_COMMON_PD_T("gemm:blas", + DECLARE_COMMON_PD_T(IGEMM_S8U8S32_IMPL_STR, _gemm_u8s8s32x_convolution_bwd_data_t); virtual status_t init() override { @@ -176,12 +215,10 @@ struct _gemm_u8s8s32x_convolution_bwd_data_t: public cpu_primitive_t { assert(this->engine()->kind() == engine_kind::cpu); bool ok = true -#if !USE_MKL_IGEMM - && false -#endif && this->set_default_params() == status::success && this->desc()->prop_kind == prop_kind::backward_data - && this->desc()->alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() && this->desc()->diff_src_desc.data_type == dst_type && this->desc()->diff_dst_desc.data_type == u8 @@ -195,8 +232,12 @@ struct _gemm_u8s8s32x_convolution_bwd_data_t: public cpu_primitive_t { && this->weights_pd_.desc()->format == (this->with_groups() ? hwigo : hwio) && attr()->post_ops_.has_default_values(); + if (!ok) return status::unimplemented; - return ok ? status::success : status::unimplemented; + auto scratchpad = scratchpad_registry().registrar(); + return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad, + *this->desc(), this->diff_src_pd(), this->weights_pd(0), + this->diff_dst_pd(), mkldnn_get_max_threads()); } virtual bool support_bias() const override { return true; } @@ -206,59 +247,44 @@ struct _gemm_u8s8s32x_convolution_bwd_data_t: public cpu_primitive_t { protected: virtual status_t set_default_params() override { using namespace memory_format; + if (this->diff_src_pd_.desc()->format == any) CHECK(this->diff_src_pd_.set_format(nhwc)); if (this->diff_dst_pd_.desc()->format == any) CHECK(this->diff_dst_pd_.set_format(nhwc)); if (this->weights_pd_.desc()->format == any) - CHECK(this->weights_pd_.set_format(this->with_groups() - ? hwigo : hwio)); + CHECK(this->weights_pd_.set_format( + this->with_groups() ? 
hwigo : hwio)); if (bias_pd_.desc()->format == any) CHECK(bias_pd_.set_format(x)); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } }; - _gemm_u8s8s32x_convolution_bwd_data_t(const pd_t *pd, const input_vector &inputs, + _gemm_u8s8s32x_convolution_bwd_data_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - , scratchpad_(nullptr) - { - jit_gemm_convolution_utils::init_conf(conf_.jcp_, - *conf_.desc(), conf_.diff_src_pd(), conf_.weights_pd(0), - conf_.diff_dst_pd(), mkldnn_get_max_threads()); - - size_t col_size = (size_t)conf_.jcp_.im2col_sz * sizeof(acc_data_t); - size_t acc_size = (size_t)conf_.jcp_.is * conf_.jcp_.ic - * sizeof(acc_data_t); - size_t size = col_size + acc_size; - - jit_gemm_convolution_utils::prepare_scratchpad(this->conf_.jcp_, - &this->scratchpad_, size, this->conf_.jcp_.nthr); - } - - ~_gemm_u8s8s32x_convolution_bwd_data_t() { - delete this->scratchpad_; - }; + : cpu_primitive_t(apd, inputs, outputs, true) {} + ~_gemm_u8s8s32x_convolution_bwd_data_t() {} typedef typename prec_traits::type diff_dst_data_t; typedef typename prec_traits::type wei_data_t; typedef typename prec_traits::type diff_src_data_t; typedef typename prec_traits::type acc_data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_backward_data(); e->set_state(event_t::ready); } private: - void execute_backward_data(); + void execute_backward_data() const; void execute_backward_data_thr(const int ithr, const int nthr, const diff_dst_data_t *diff_dst_base, const wei_data_t *wei_base, const char *bia_base, diff_src_data_t *diff_src_base, - char *scratchpad); - pd_t conf_; - scratchpad_t *scratchpad_; + const memory_tracking::grantor_t &scratchpad) const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_x8s8s32x_inner_product.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_x8s8s32x_inner_product.cpp new file mode 100644 index 0000000..d49a781 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_x8s8s32x_inner_product.cpp @@ -0,0 +1,461 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "math_utils.hpp" +#include "mkldnn_thread.hpp" +#include "simple_q10n.hpp" +#include "gemm_x8s8s32x_inner_product.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +using namespace math; +using namespace memory_format; +using namespace memory_tracking::names; + +template +gemm_x8s8s32x_inner_product_fwd_t::pp_kernel_t::pp_kernel_t( + const pd_t *pd, bool dst_is_acc) + : ker_(nullptr), OC_(pd->OC()) + , bias_data_type_(data_type::undef), bias_data_type_size_(0) + , scale_idx_mult_(0), rmode_(round_mode::nearest) + , do_bias_(false), do_relu_(false) +{ + using namespace types; + + scale_idx_mult_ = (pd->attr()->output_scales_.mask_ == (1 << 1)); + rmode_ = pd->attr()->round_mode_; + + auto &post_ops = pd->attr()->post_ops_; + do_relu_ = post_ops.len_ == 1; + do_bias_ = pd->with_bias(); + bias_data_type_ = pd->desc()->bias_desc.data_type; + if (do_bias_) { + assert(bias_data_type_ != data_type::undef); + bias_data_type_size_ = data_type_size(bias_data_type_); + } + + if (!mayiuse(avx512_core)) + // use fallback code for older CPUs since they do not have optimized + // x8s8s32 GEMM anyways. The configuration variables above are used by + // the fallback code. + return; + else + generate(); +} + +template +void gemm_x8s8s32x_inner_product_fwd_t::pp_kernel_t::generate() +{ + using namespace Xbyak; + using namespace utils; + using namespace round_mode; + + // TODO: clean-up + Reg64 reg_param = abi_param1; + Reg64 reg_dst = rdx; + Reg64 reg_acc = rax; + Reg64 reg_bias = rbx; + Reg64 reg_scales = rsi; + + Reg64 reg_len = r8; + Reg64 reg_tmp = rcx; // intentional for shifting purposes + Reg64 reg_oc_offset = r9; + Reg64 reg_rem_mask = r10; + Opmask kreg_rem_mask = k1; + Opmask kreg_relu_cmp = k2; + + const size_t vlen = cpu_isa_traits::vlen / sizeof(float); + + Zmm vreg_zero = Zmm(0); + Zmm vreg_scale = Zmm(1); + Zmm vreg_nslope = Zmm(2); + + auto vreg_dst = [&](int idx) { return Zmm(3 + idx * 2 + 0); }; + auto vreg_bias = [&](int idx) { return Zmm(3 + idx * 2 + 1); }; + + preamble(); + +#define PARAM_OFF(x) offsetof(ker_args, x) + mov(reg_dst, ptr[reg_param + PARAM_OFF(dst)]); + mov(reg_acc, ptr[reg_param + PARAM_OFF(acc)]); + mov(reg_bias, ptr[reg_param + PARAM_OFF(bias)]); + mov(reg_scales, ptr[reg_param + PARAM_OFF(scales)]); + mov(reg_len, ptr[reg_param + PARAM_OFF(len)]); + mov(reg_oc_offset, ptr[reg_param + PARAM_OFF(oc_offset)]); + vbroadcastss(vreg_nslope, ptr[reg_param + PARAM_OFF(nslope)]); + if (scale_idx_mult_ == 0) + vbroadcastss(vreg_scale, dword[reg_scales]); +#undef PARAM_OFF + + if (do_relu_ || dst_type == data_type::u8) + vxorps(vreg_zero, vreg_zero, vreg_zero); + + // Load accumulated value, convert to float, apply bias (if any), scaling, + // and relu (if any); then convert to destination type and store + auto compute = [&](size_t offset, int idx, bool apply_mask) { + auto acc_addr = ptr[reg_acc + offset * sizeof(acc_data_t)]; + + if (scale_idx_mult_ > 0) { + assert(scale_idx_mult_ == 1); + auto scale_addr = ptr[reg_scales + offset * sizeof(float)]; + auto vreg_scale_ = vreg_scale; + if (apply_mask) + vreg_scale_ = vreg_scale_ | kreg_rem_mask; + vmovups(vreg_scale, scale_addr); + } + + auto vreg_dst_ = vreg_dst(idx); + if (apply_mask) + vreg_dst_ = vreg_dst_ | kreg_rem_mask; + vcvtdq2ps(vreg_dst_, acc_addr); + + if (do_bias_) { + auto bias_addr = ptr[reg_bias + offset * bias_data_type_size_]; + auto vreg_bias_ = vreg_bias(idx); + if (apply_mask) + vreg_bias_ = vreg_bias_ | 
kreg_rem_mask; + + switch (bias_data_type_) { + case data_type::s8: + vpmovsxbd(vreg_bias_, bias_addr); + break; + case data_type::u8: + vpmovzxbd(vreg_bias_, bias_addr); + break; + case data_type::s32: + case data_type::f32: + vmovups(vreg_bias_, bias_addr); + break; + default: assert(!"unimplemented"); + } + if (bias_data_type_ != data_type::f32) + vcvtdq2ps(vreg_bias(idx), vreg_bias(idx)); + vaddps(vreg_dst(idx), vreg_dst(idx), vreg_bias(idx)); + } + + vmulps(vreg_dst(idx), vreg_dst(idx), vreg_scale); + if (do_relu_) { + vcmpps(kreg_relu_cmp, vreg_dst(idx), vreg_zero, _cmp_lt_os); + vmulps(vreg_dst(idx) | kreg_relu_cmp, vreg_dst(idx), vreg_nslope); + } + + if (dst_type == data_type::u8) + vmaxps(vreg_dst(idx), vreg_dst(idx), vreg_zero); + + if (dst_type != data_type::f32) { + auto rmode_control = (rmode_ == nearest ? T_rn_sae : T_rd_sae); + vcvtps2dq(vreg_dst(idx) | rmode_control, vreg_dst(idx)); + } + + auto dst_addr = ptr[reg_dst + offset * sizeof(dst_data_t)]; + switch (dst_type) { + case data_type::s8: + vpmovsdb(dst_addr, vreg_dst_); + break; + case data_type::u8: + vpmovusdb(dst_addr, vreg_dst_); + break; + case data_type::f32: + case data_type::s32: + vmovups(dst_addr, vreg_dst_); + break; + default: assert(!"unimplemented"); + } + }; + + // Advance all pointers by an immediate + auto advance_ptrs_imm = [&](size_t offset) { + add(reg_dst, offset * sizeof(dst_data_t)); + add(reg_acc, offset * sizeof(acc_data_t)); + if (scale_idx_mult_) { + assert(scale_idx_mult_ == 1); + add(reg_scales, offset * sizeof(float)); + } + if (do_bias_) + add(reg_bias, offset * bias_data_type_size_); + }; + + // Advance all pointers by a value stored in a register + auto advance_ptrs_reg = [&](Reg64 offset) { + lea(reg_dst, ptr[reg_dst + offset * sizeof(dst_data_t)]); + lea(reg_acc, ptr[reg_acc + offset * sizeof(acc_data_t)]); + if (scale_idx_mult_) { + assert(scale_idx_mult_ == 1); + lea(reg_scales, ptr[reg_scales + offset * sizeof(float)]); + } + if (do_bias_) + lea(reg_bias, ptr[reg_bias + offset * bias_data_type_size_]); + }; + + // Rewind pointers that point to data that is indexed by output channel + // (bias or per-oc scaling factors) + auto rewind_ptrs = [&]() { + if (do_bias_) + sub(reg_bias, OC_ * bias_data_type_size_); + if (scale_idx_mult_) { + assert(scale_idx_mult_ == 1); + sub(reg_scales, OC_ * sizeof(float)); + } + }; + + // <-------------------- OC -------------------------------> + // + // ^ +....................+----------------------------------+ + // | : not accessed | Prologue loop | + // | +--------------------+----------------------------------+ + // | | + // M | Main loop (unrolled) | + // B | | + // +--------------------------------+----------------------+ + // | | Epilogue loop | not accessed : + // v +--------------------------------+......................+ + + Label prologue_end; + cmp(reg_oc_offset, 0); + je(prologue_end, T_NEAR); + + // Prologue loop + { + mov(reg_tmp, OC_); + sub(reg_tmp, reg_oc_offset); + cmp(reg_tmp, reg_len); + cmovg(reg_tmp, reg_len); + sub(reg_len, reg_tmp); + + Label prologue_loop, prologue_loop_tail, prologue_loop_end; + cmp(reg_tmp, vlen); + jle(prologue_loop_tail, T_NEAR); // Skips for reg_tmp == 16 too (?)
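// The cmovg sequence above clamps the prologue length to reg_tmp = min(OC_ - oc_offset, reg_len); the tail path below then builds an AVX-512 opmask as reg_rem_mask = (1 << reg_tmp) - 1 so that masked loads and stores touch only the remaining lanes. A rough scalar model of that mask, not the kernel itself (tail_mask() is a hypothetical helper):
//
//     #include <cstdint>
//     // n = elements left in the tail, 0 < n <= 16 (floats per zmm)
//     inline std::uint16_t tail_mask(unsigned n) {
//         return static_cast<std::uint16_t>((1u << n) - 1u);
//     }
//
// Lane i of a masked instruction executes iff bit i of the mask is set, so n == vlen yields an all-ones mask; the jle above can therefore send a full vector (reg_tmp == vlen) to the masked tail and still produce the right result, which is what the "(?)" remark is asking about.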
+ L(prologue_loop); { + compute(0, 0, false); + advance_ptrs_imm(vlen); + sub(reg_tmp, vlen); + cmp(reg_tmp, vlen); + jge(prologue_loop, T_NEAR); + } + + L(prologue_loop_tail); + mov(reg_rem_mask, 1); + shl(reg_rem_mask, cl); // cl == reg_tmp because reg_tmp <= vlen here + sub(reg_rem_mask, 1); + jz(prologue_loop_end, T_NEAR); + + kmovq(kreg_rem_mask, reg_rem_mask); + compute(0, 0, true); + advance_ptrs_reg(reg_tmp); + + L(prologue_loop_end); + rewind_ptrs(); + } + L(prologue_end); + + // Main loop + Label main_loop_end; + { + cmp(reg_len, OC_); + jle(main_loop_end, T_NEAR); + + Label main_loop; + L(main_loop); { + size_t def_unroll = 4; + size_t max_unroll = 13; + + size_t OC_loop, OC_tail; + if (OC_ < max_unroll * vlen) { + // Fully unroll small loops + OC_loop = 0; + OC_tail = OC_; + } else { + OC_loop = vlen * def_unroll; + OC_tail = OC_ % OC_loop; + } + + assert(!!OC_loop || !!OC_tail); + + if (OC_tail % vlen) { + int vlen_tail = OC_tail % vlen; + unsigned tail_mask = (1 << vlen_tail) - 1; + mov(reg_tmp, tail_mask); + kmovq(kreg_rem_mask, reg_tmp); + } + + if (OC_loop) { + mov(reg_tmp, rnd_dn(OC_, OC_loop)); + Label oc_loop; + L(oc_loop); { + for (size_t offset = 0; offset < OC_loop; offset += vlen) + compute(offset, offset / vlen, false); + advance_ptrs_imm(OC_loop); + sub(reg_tmp, OC_loop); + jnz(oc_loop); + } + } + + if (OC_tail) { + for (size_t offset = 0; offset < OC_tail; offset += vlen) { + bool use_mask = (offset + vlen) > OC_tail; + compute(offset, offset / vlen, use_mask); + } + advance_ptrs_imm(OC_tail); + } + + rewind_ptrs(); + sub(reg_len, OC_); + cmp(reg_len, OC_); + jge(main_loop, T_NEAR); + } + } + L(main_loop_end); + + // Epilogue loop + Label epilogue_end; + { + cmp(reg_len, 0); + je(epilogue_end, T_NEAR); + + Label epilogue_loop, epilogue_loop_tail; + cmp(reg_len, vlen); + jle(epilogue_loop_tail, T_NEAR); // Skips for reg_len == 16 (?) + L(epilogue_loop); { + compute(0, 0, false); + sub(reg_len, vlen); + advance_ptrs_imm(vlen); + cmp(reg_len, vlen); + jge(epilogue_loop, T_NEAR); + } + + L(epilogue_loop_tail); + mov(reg_tmp, reg_len); // reg_tmp is rcx, and we need cl for the shift + mov(reg_rem_mask, 1); + shl(reg_rem_mask, cl); // reg_tmp == rcx and reg_tail < vlen == 16 + sub(reg_rem_mask, 1); + jz(epilogue_end, T_NEAR); + kmovq(kreg_rem_mask, reg_rem_mask); + compute(0, 0, true); + } + + L(epilogue_end); + + postamble(); + + ker_ = getCode(); +} + +template +void gemm_x8s8s32x_inner_product_fwd_t::pp_kernel_t::operator ()( + dst_data_t *dst, const acc_data_t *acc, + const char *bias, const float *scales, float nslope, + size_t start, size_t end) +{ + using math::get_bias; + + if (end <= start) + return; + + if (ker_) { + // JIT + ker_args args; + size_t oc_offset = start % OC_; + args.dst = dst + start; + args.acc = acc + start; + args.bias = bias + oc_offset * bias_data_type_size_; + args.scales = scales + scale_idx_mult_ * oc_offset; + args.nslope = nslope; + args.len = end - start; + args.oc_offset = oc_offset; + ker_(&args); + } else { + // Fallback + size_t oc = start % OC_; + for (size_t i = start; i < end; i++) { + float d = (float)acc[i]; + float b = get_bias(bias, oc, bias_data_type_); + d = d + b; + d *= scales[oc * scale_idx_mult_]; + if (do_relu_ && d < 0) + d *= nslope; + dst[i] = qz_a1b0()(d, rmode_); + oc = (oc == OC_ - 1) ? 
0 : oc + 1; + } + } +}; + +template +void gemm_x8s8s32x_inner_product_fwd_t::execute_forward() const { + auto src = reinterpret_cast(this->input_memory(0)); + auto weights = reinterpret_cast(this->input_memory(1)); + auto bias = reinterpret_cast(this->input_memory(2)); + auto dst = reinterpret_cast(this->memory()); + + const int MB = pd()->MB(); + const int OC = pd()->OC(); + + bool wei_tr = utils::one_of(pd()->weights_pd()->desc()->format, + oihw, oidhw, oi); + + const int M = OC; + const int N = MB; + const int K = pd()->IC_total_padded(); + const int8_t off_a = 0, off_b = 0; + const int32_t off_c = 0; + + const float *scales = pd()->attr()->output_scales_.scales_; + + const auto &post_ops = pd()->attr()->post_ops_; + const bool do_relu = post_ops.len_ == 1; + const float nslope = do_relu ? post_ops.entry_[0].eltwise.alpha : 0.f; + + acc_data_t *acc = pd()->dst_is_acc_ + ? (acc_data_t *)dst + : scratchpad().template get(key_iprod_int_dat_in_acc_dt); + + const float onef = 1.0, zerof = 0.0; + + if (src_type == data_type::u8) { + mkldnn_gemm_s8u8s32(wei_tr ? "T" : "N", "N", "F", &M, &N, &K, &onef, + weights, wei_tr ? &K : &M, &off_a, (uint8_t *)src, &K, &off_b, &zerof, + acc, &M, &off_c); + } else if (src_type == data_type::s8) { + mkldnn_gemm_s8s8s32(wei_tr ? "T" : "N", "N", "F", &M, &N, &K, &onef, + weights, wei_tr ? &K : &M, &off_a, (int8_t *)src, &K, &off_b, &zerof, + acc, &M, &off_c); + } else { + assert(!"incorrect src type"); + } + + const bool force_sequential = MB * OC < 2000; + parallel(force_sequential ? 1 : 0, [&](int ithr, int nthr) { + size_t start, end; + balance211((size_t)OC * MB, nthr, ithr, start, end); + (*pp_kernel_)(dst, acc, bias, scales, nslope, start, end); + }); +} + +using namespace data_type; + +template struct gemm_x8s8s32x_inner_product_fwd_t; +template struct gemm_x8s8s32x_inner_product_fwd_t; +template struct gemm_x8s8s32x_inner_product_fwd_t; +template struct gemm_x8s8s32x_inner_product_fwd_t; +template struct gemm_x8s8s32x_inner_product_fwd_t; +template struct gemm_x8s8s32x_inner_product_fwd_t; +template struct gemm_x8s8s32x_inner_product_fwd_t; +template struct gemm_x8s8s32x_inner_product_fwd_t; +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_u8s8s32x_inner_product.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_x8s8s32x_inner_product.hpp similarity index 56% rename from inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_u8s8s32x_inner_product.hpp rename to inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_x8s8s32x_inner_product.hpp index a4163fe..0fadd174 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_u8s8s32x_inner_product.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_x8s8s32x_inner_product.hpp @@ -14,33 +14,37 @@ * limitations under the License. 
*******************************************************************************/ -#ifndef GEMM_U8S8S32X_INNER_PRODUCT_HPP -#define GEMM_U8S8S32X_INNER_PRODUCT_HPP +#ifndef GEMM_X8S8S32X_INNER_PRODUCT_HPP +#define GEMM_X8S8S32X_INNER_PRODUCT_HPP #include #include "c_types_map.hpp" -#include "cpu_inner_product_pd.hpp" -#include "cpu_engine.hpp" +#include "memory_tracking.hpp" #include "type_helpers.hpp" #include "utils.hpp" -#include "scratchpad.hpp" -#include "gemm/os_blas.hpp" +#include "gemm/gemm.hpp" +#include "jit_generator.hpp" + +#include "cpu_inner_product_pd.hpp" namespace mkldnn { namespace impl { namespace cpu { -template -struct gemm_u8s8s32x_inner_product_fwd_t: public cpu_primitive_t { +template +struct gemm_x8s8s32x_inner_product_fwd_t: public cpu_primitive_t { struct pd_t: public cpu_inner_product_fwd_pd_t { pd_t(engine_t *engine, const inner_product_desc_t *adesc, const primitive_attr_t *attr, const inner_product_fwd_pd_t *hint_fwd_pd) : cpu_inner_product_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) {} - DECLARE_COMMON_PD_T("gemm:blas", gemm_u8s8s32x_inner_product_fwd_t); + DECLARE_COMMON_PD_T(src_type == data_type::u8 + ? IGEMM_S8U8S32_IMPL_STR + : IGEMM_S8S8S32_IMPL_STR, + gemm_x8s8s32x_inner_product_fwd_t); virtual status_t init() override { using namespace utils; @@ -49,14 +53,11 @@ struct gemm_u8s8s32x_inner_product_fwd_t: public cpu_primitive_t { assert(engine()->kind() == engine_kind::cpu); bool ok = true -#if !USE_MKL_IGEMM - && false -#endif && this->set_default_params() == status::success && one_of(desc()->prop_kind, prop_kind::forward_training, prop_kind::forward_inference) && !has_zero_dim_memory() - && this->desc()->src_desc.data_type == u8 + && this->desc()->src_desc.data_type == src_type && this->desc()->dst_desc.data_type == dst_type && this->desc()->weights_desc.data_type == s8 && IMPLICATION(this->with_bias(), utils::one_of( @@ -67,63 +68,108 @@ struct gemm_u8s8s32x_inner_product_fwd_t: public cpu_primitive_t { attr()->post_ops_.entry_[0].is_relu(true, false)) && dense_gemm_consitency_check(src_pd(), weights_pd(), dst_pd()); - return ok ? 
status::success : status::unimplemented; + if (!ok) return status::unimplemented; + + dst_is_acc_ = one_of(dst_type, s32, f32); + + init_scratchpad(); + + return status::success; } + bool dst_is_acc_; + protected: virtual status_t set_default_params() override { using namespace memory_format; - if (this->src_pd_.desc()->format == any) - { + if (this->src_pd_.desc()->format == any) { if (ndims() == 4) CHECK(this->src_pd_.set_format(nhwc)); else if (ndims() == 5) CHECK(this->src_pd_.set_format(ndhwc)); else CHECK(this->src_pd_.set_format(nc)); } if (this->dst_pd_.desc()->format == any) CHECK(this->dst_pd_.set_format(nc)); - if (this->weights_pd_.desc()->format == any) - { + if (this->weights_pd_.desc()->format == any) { if (ndims() == 4) CHECK(this->weights_pd_.set_format(hwio)); else if (ndims() == 5) CHECK(this->weights_pd_.set_format(dhwio)); else CHECK(this->weights_pd_.set_format(io)); } if (this->bias_pd_.desc()->format == any) CHECK(this->bias_pd_.set_format(x)); + return status::success; } + + private: + void init_scratchpad() { + if (!dst_is_acc_) { + auto scratchpad = scratchpad_registry().registrar(); + scratchpad.book( + memory_tracking::names::key_iprod_int_dat_in_acc_dt, + sizeof(acc_data_t) * MB() * OC()); + } + } }; - gemm_u8s8s32x_inner_product_fwd_t(const pd_t *pd, const input_vector &inputs, + gemm_x8s8s32x_inner_product_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), dst_is_acc_(false), - scratchpad_(nullptr) - { - dst_is_acc_ = utils::one_of(dst_type, data_type::s32, data_type::f32); - if (!dst_is_acc_) { - size_t size = conf_.MB() * conf_.OC() * sizeof(acc_data_t); - scratchpad_ = create_scratchpad(size); - } - } - ~gemm_u8s8s32x_inner_product_fwd_t() { delete scratchpad_; }; + : cpu_primitive_t(apd, inputs, outputs, true) + { pp_kernel_ = new pp_kernel_t(apd, pd()->dst_is_acc_); } + ~gemm_x8s8s32x_inner_product_fwd_t() { delete pp_kernel_; } typedef typename prec_traits::type data_t; - typedef typename prec_traits::type src_data_t; + typedef typename prec_traits::type src_data_t; typedef typename prec_traits::type wei_data_t; typedef typename prec_traits::type dst_data_t; typedef typename prec_traits::type acc_data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - pd_t conf_; - bool dst_is_acc_; - scratchpad_t *scratchpad_; + // XXX: this is throwaway code that will become unnecessary when we have a + // sufficiently advanced igemm jit generator that supports quantization, + // relu, and whatnot + class pp_kernel_t: jit_generator { + public: + DECLARE_CPU_JIT_AUX_FUNCTIONS( + gemm_x8s8s32x_inner_product_fwd_t::pp_kernel); + pp_kernel_t(const pd_t *pd, bool dst_is_acc); + + void operator()(dst_data_t *dst, const acc_data_t *acc, + const char *bias, const float *scales, float nslope, + size_t start, size_t end); + private: + void generate(); + + struct ker_args { + dst_data_t *dst; + const acc_data_t *acc; + const char *bias; + const float *scales; + float nslope; + size_t len; + size_t oc_offset; + }; + void (*ker_)(const ker_args *args); + + size_t OC_; + data_type_t bias_data_type_; + size_t bias_data_type_size_; + size_t scale_idx_mult_; + round_mode_t rmode_; + bool do_bias_; + bool do_relu_; + }; + + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + + pp_kernel_t *pp_kernel_; }; } } diff --git 
a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_conv_kernel_f32.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_conv_kernel_f32.cpp index 9ef2558..73f01f5 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_conv_kernel_f32.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_conv_kernel_f32.cpp @@ -15,10 +15,14 @@ * limitations under the License. *******************************************************************************/ +#include + #include "c_types_map.hpp" +#include "memory_tracking.hpp" #include "nstl.hpp" #include "type_helpers.hpp" #include "utils.hpp" + #include "cpu_memory.hpp" #include "jit_avx2_1x1_conv_kernel_f32.hpp" @@ -140,7 +144,7 @@ void jit_avx2_1x1_conv_kernel_f32::generate_reduce_loop( default: if (jcp.with_dw_conv) { return ptr[aux_reg_output_data + - (i * jcp.dw_conv_ker_h * jcp.ow + j) * jcp.oc_block * sizeof(float)]; + (i * jcp_dw.kh * jcp.ow + j) * jcp.oc_block * sizeof(float)]; } else { return ptr[aux_reg_output_data + (i * jcp.os + j) * jcp.oc_block * sizeof(float)]; @@ -176,7 +180,7 @@ void jit_avx2_1x1_conv_kernel_f32::generate_reduce_loop( }; auto store = [=]() { - Label store_done, store_noadd; + Label store_noadd; if (!jcp.with_sum) { test(reg_reduce_pos_flag, FLAG_REDUCE_FIRST); @@ -198,9 +202,6 @@ void jit_avx2_1x1_conv_kernel_f32::generate_reduce_loop( int eltwise_inj_idx = 0; int depthwise_inj_idx = 0; const auto &p = attr_.post_ops_; - if (p.len_ == 0 && eltwise_injectors.size() == 1) { - eltwise_injectors[0]->compute_vector_range(0, ur * load_loop_blk); - } int end_idx = jcp.with_dw_conv ? p.find(primitive_kind::convolution) : p.len_; for (int i = 0; i < end_idx; i++) { @@ -236,8 +237,6 @@ void jit_avx2_1x1_conv_kernel_f32::generate_reduce_loop( for (int i = 0; i < load_loop_blk; ++i) { vmovups(output_ptr(i, j), vreg_accum(i, j)); } - - L(store_done); }; auto fma_block = [=](bool last_block) { @@ -247,9 +246,8 @@ void jit_avx2_1x1_conv_kernel_f32::generate_reduce_loop( if (mayiuse(avx2)) vfmadd231ps(vreg_accum(i, j), vreg_load(i), vreg_bcast); else { // Intel(R) Advanced Vector Extensions (Intel(R) AVX) support - auto tmp = vmask; - vmulps(tmp, vreg_bcast, vreg_load(i)); - vaddps(vreg_accum(i, j), vreg_accum(i, j), tmp); + vmulps(vtmp, vreg_bcast, vreg_load(i)); + vaddps(vreg_accum(i, j), vreg_accum(i, j), vtmp); } if (j == ur - 1 && !(last_block && u == jcp.reduce_loop_unroll - 1)) @@ -347,12 +345,6 @@ void jit_avx2_1x1_conv_kernel_f32::generate_diff_bias_loop(int load_loop_blk) void jit_avx2_1x1_conv_kernel_f32::generate() { - if (jcp.with_eltwise) { - eltwise_injectors.push_back(new jit_uni_eltwise_injector_f32( - this, jcp.eltwise_alg, jcp.eltwise_alpha, 0 - )); - } - const auto &p = attr_.post_ops_; int end_idx = jcp.with_dw_conv ? 
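In the fma_block above, the vmulps/vaddps pair is the plain Intel AVX fallback used when vfmadd231ps is unavailable. In intrinsics form the two paths look roughly like this (a sketch: the fused form rounds once, the fallback twice, which the kernel tolerates):

#include <immintrin.h>

// Fallback path: multiply into a temporary (vtmp), then add into the
// accumulator -- two instructions, two roundings.
static inline __m256 mul_add_avx(__m256 acc, __m256 a, __m256 b) {
    __m256 tmp = _mm256_mul_ps(a, b);   // vmulps
    return _mm256_add_ps(acc, tmp);     // vaddps
}

#ifdef __FMA__
// AVX2/FMA path: one fused instruction, one rounding.
static inline __m256 mul_add_fma(__m256 acc, __m256 a, __m256 b) {
    return _mm256_fmadd_ps(a, b, acc);  // vfmadd231ps
}
#endif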
p.find(primitive_kind::convolution) : p.len_; for (int i = 0; i < end_idx; i++) { @@ -485,24 +477,15 @@ bool jit_avx2_1x1_conv_kernel_f32::post_ops_ok( auto is_simple = [&](int idx) { return is_eltwise(idx) || is_depthwise(idx); }; switch (p.len_) { - case 0: return true; // no post_ops - case 1: - return true // sum OR eltwise OR dw_conv - && !jcp.with_eltwise && (is_simple(0) || is_sum(0) || is_dw_conv(0)); - case 2: - return true // sum->eltwise OR dw_conv->eltwise OR eltwise->dw_conv OR dw_conv->sum OR sum->depthwise OR - // eltwise->depthwise OR depthwise->depthwise - && !jcp.with_eltwise && ((is_sum(0) && is_simple(1)) || (is_dw_conv(0) && is_eltwise(1)) || - (is_eltwise(0) && is_dw_conv(1)) || (is_dw_conv(0) && is_sum(1)) || - (is_simple(0) && is_simple(1))); - case 3: - return true // eltwise->dw_conv->eltwise OR dw_conv->sum->eltwise OR sum->eltwise->depthwise OR - // sum->depthwise->eltwise OR sum->depthwise->depthwise - && !jcp.with_eltwise && ((is_eltwise(0) && is_dw_conv(1) && is_eltwise(2)) || - (is_dw_conv(0) && is_sum(1) && is_eltwise(2)) || - (is_sum(0) && is_simple(1) && is_simple(2))); - case 4: return true // eltwise->dw_conv->sum->eltwise - && !jcp.with_eltwise && (is_eltwise(0) && is_dw_conv(1) && is_sum(2) && is_eltwise(3)); + case 0: return true; + case 1: return is_simple(0) || is_sum(0) || is_dw_conv(0); + case 2: return (is_sum(0) && is_simple(1)) || (is_dw_conv(0) && is_eltwise(1)) || + (is_eltwise(0) && is_dw_conv(1)) || (is_dw_conv(0) && is_sum(1)) || + (is_simple(0) && is_simple(1)); + case 3: return (is_eltwise(0) && is_dw_conv(1) && is_eltwise(2)) || + (is_dw_conv(0) && is_sum(1) && is_eltwise(2)) || + (is_sum(0) && is_simple(1) && is_simple(2)); + case 4: return (is_eltwise(0) && is_dw_conv(1) && is_sum(2) && is_eltwise(3)); default: return false; } @@ -512,7 +495,7 @@ bool jit_avx2_1x1_conv_kernel_f32::post_ops_ok( status_t jit_avx2_1x1_conv_kernel_f32::init_conf(jit_1x1_conv_conf_t &jcp, const convolution_desc_t &cd, const memory_desc_wrapper &src_d, const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d, - const primitive_attr_t &attr, bool with_relu, float relu_negative_slope) + const primitive_attr_t &attr) { if (!mayiuse(avx)) return status::unimplemented; @@ -547,51 +530,41 @@ status_t jit_avx2_1x1_conv_kernel_f32::init_conf(jit_1x1_conv_conf_t &jcp, jcp.src_fmt = src_d.format(); jcp.with_bias = cd.bias_desc.format != memory_format::undef; - jcp.with_eltwise = with_relu; - jcp.eltwise_alg = mkldnn_eltwise_relu; - jcp.eltwise_alpha = relu_negative_slope; - - if (!post_ops_ok(jcp, attr)) { + if (!post_ops_ok(jcp, attr)) return status::unimplemented; - } const auto &p = attr.post_ops_; - jcp.with_dw_conv = false; + int dw_conv_ind = p.find(primitive_kind::convolution); - if (dw_conv_ind != -1) { - jcp.with_dw_conv = true; - jcp.dw_conv_in_h = p.entry_[dw_conv_ind].dw_conv.in_h; - jcp.dw_conv_in_w = p.entry_[dw_conv_ind].dw_conv.in_w; - jcp.dw_conv_ker_h = p.entry_[dw_conv_ind].dw_conv.ker_h; - jcp.dw_conv_ker_w = p.entry_[dw_conv_ind].dw_conv.ker_w; - jcp.dw_conv_str_h = p.entry_[dw_conv_ind].dw_conv.str_h; - jcp.dw_conv_str_w = p.entry_[dw_conv_ind].dw_conv.str_w; - jcp.dw_conv_weights = p.entry_[dw_conv_ind].dw_conv.weights_data; - jcp.dw_conv_biases = p.entry_[dw_conv_ind].dw_conv.biases_data; + jcp.with_dw_conv = dw_conv_ind != -1; + if (jcp.with_dw_conv) { + jcp.dw_conv_oh = jcp.oh; + jcp.dw_conv_ow = jcp.ow; + jcp.oh = p.entry_[dw_conv_ind].dw_conv.in_h; + jcp.ow =
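The rewritten post_ops_ok switch above drops the legacy with_relu flag and enumerates the accepted post-op chains directly. A stand-alone model of the same acceptance logic (the op_t enum and helpers are hypothetical; the real code walks attr.post_ops_.entry_):

#include <vector>

enum class op_t { sum, eltwise, depthwise, dw_conv };

static bool post_ops_ok_model(const std::vector<op_t> &p) {
    auto is = [&](size_t i, op_t k) { return p[i] == k; };
    auto is_simple = [&](size_t i)
        { return is(i, op_t::eltwise) || is(i, op_t::depthwise); };
    switch (p.size()) {
    case 0: return true;
    case 1: return is_simple(0) || is(0, op_t::sum) || is(0, op_t::dw_conv);
    case 2: return (is(0, op_t::sum) && is_simple(1))
            || (is(0, op_t::dw_conv) && is(1, op_t::eltwise))
            || (is(0, op_t::eltwise) && is(1, op_t::dw_conv))
            || (is(0, op_t::dw_conv) && is(1, op_t::sum))
            || (is_simple(0) && is_simple(1));
    case 3: return (is(0, op_t::eltwise) && is(1, op_t::dw_conv) && is(2, op_t::eltwise))
            || (is(0, op_t::dw_conv) && is(1, op_t::sum) && is(2, op_t::eltwise))
            || (is(0, op_t::sum) && is_simple(1) && is_simple(2));
    case 4: return is(0, op_t::eltwise) && is(1, op_t::dw_conv)
            && is(2, op_t::sum) && is(3, op_t::eltwise);
    default: return false;
    }
}

For example, {sum, eltwise} is accepted while {sum, sum} is rejected, matching the chains the switch enumerates.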
p.entry_[dw_conv_ind].dw_conv.in_w; } if (jcp.with_dw_conv && !mayiuse(avx2)) return status::unimplemented; - if (jcp.with_dw_conv) { - int dw_conv_eltwise_ind = p.find(primitive_kind::eltwise, dw_conv_ind); - if (dw_conv_eltwise_ind != -1) { - jcp.dw_conv_with_eltwise = true; - jcp.dw_conv_eltwise_alg = p.entry_[dw_conv_eltwise_ind].eltwise.alg; - jcp.dw_conv_eltwise_alpha = p.entry_[dw_conv_eltwise_ind].eltwise.alpha; - jcp.dw_conv_eltwise_beta = p.entry_[dw_conv_eltwise_ind].eltwise.beta; + if (!mayiuse(avx2)) { + for (int i = 0; i < p.len_; i++) { + auto &post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + if (post_op.eltwise.alg != alg_kind::eltwise_relu) + return status::unimplemented; + } else if (post_op.is_depthwise()) { + return status::unimplemented; + } } } jcp.with_sum = p.find(primitive_kind::sum, 0, dw_conv_ind) != -1; - if (jcp.with_dw_conv) { - jcp.dw_conv_with_sum = p.find(primitive_kind::sum, dw_conv_ind) != -1; - } - if (jcp.with_dw_conv) { - jcp.oh = jcp.dw_conv_in_h; - jcp.ow = jcp.dw_conv_in_w; - } + jcp.src_dt = cd.src_desc.data_type; + jcp.bia_dt = jcp.with_bias ? cd.bias_desc.data_type : data_type::undef; + jcp.dst_dt = cd.dst_desc.data_type; jcp.os = jcp.oh * jcp.ow; jcp.is = jcp.ih * jcp.iw; @@ -770,6 +743,24 @@ status_t jit_avx2_1x1_conv_kernel_f32::init_conf(jit_1x1_conv_conf_t &jcp, return status::success; } +void jit_avx2_1x1_conv_kernel_f32::init_scratchpad( + memory_tracking::registrar_t &scratchpad, + const jit_1x1_conv_conf_t &jcp, const jit_conv_conf_t &jcp_dw) { + using namespace mkldnn::impl::memory_tracking::names; + + if (jcp.prop_kind != backward_data && jcp.oc != jcp.oc_without_padding) + scratchpad.book(key_conv_padded_bias, sizeof(float) * jcp.oc); + + if (jcp.with_dw_conv) { + const int nthreads = mkldnn_get_max_threads(); + size_t dw_conv_buffer_size_ = (size_t)jcp_dw.kh * jcp_dw.iw * jcp_dw.ch_block * (jcp.oc / jcp.oc_block); + scratchpad.book(key_dw_conv_buffer, sizeof(float) * dw_conv_buffer_size_ * nthreads); + + if (jcp.oc != jcp.oc_without_padding) + scratchpad.book(key_dw_conv_padded_bias, sizeof(float) * jcp.oc); + } +} + } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_conv_kernel_f32.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_conv_kernel_f32.hpp index 2c10b85..e856140 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_conv_kernel_f32.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_conv_kernel_f32.hpp @@ -18,9 +18,11 @@ #define JIT_AVX2_1x1_CONV_KERNEL_F32_HPP #include "c_types_map.hpp" +#include "memory_tracking.hpp" + +#include "cpu_memory.hpp" #include "jit_generator.hpp" #include "jit_primitive_conf.hpp" -#include "cpu_memory.hpp" #include "jit_uni_eltwise.hpp" #include "jit_uni_depthwise.hpp" @@ -31,8 +33,9 @@ namespace cpu { struct jit_avx2_1x1_conv_kernel_f32: public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_1x1_conv_kernel_f32) - jit_avx2_1x1_conv_kernel_f32(jit_1x1_conv_conf_t ajcp, - const primitive_attr_t &attr): jcp(ajcp), attr_(attr) + jit_avx2_1x1_conv_kernel_f32(jit_1x1_conv_conf_t ajcp, jit_conv_conf_t ajcp_dw, + const primitive_attr_t &attr) + : jcp(ajcp), jcp_dw(ajcp_dw), attr_(attr) { this->generate(); jit_ker = (void (*)(jit_1x1_conv_call_s *))this->getCode(); @@ -56,20 +59,13 @@ struct jit_avx2_1x1_conv_kernel_f32: public jit_generator { const memory_desc_wrapper &src_d, const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d, - const primitive_attr_t &attr, - bool with_relu, float 
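The new init_scratchpad() above books keyed regions with a registrar instead of allocating in the primitive's constructor. Roughly, the pattern works like the following sketch (illustrative types, not the mkl-dnn API): each consumer books a keyed, aligned slot at pd-creation time, and at execution time a grantor resolves the same keys inside one shared allocation, removing malloc/free from the hot path.

#include <cstddef>
#include <map>
#include <utility>

struct registrar_model_t {
    std::map<int, std::pair<size_t, size_t>> slots; // key -> (offset, size)
    size_t total = 0;
    void book(int key, size_t size, size_t align = 64) {
        total = (total + align - 1) / align * align; // keep slots aligned
        slots[key] = {total, size};
        total += size;
    }
};

struct grantor_model_t {
    const registrar_model_t &r;
    char *base; // one allocation of r.total bytes, owned by the caller
    template <typename T> T *get(int key) const {
        return reinterpret_cast<T *>(base + r.slots.at(key).first);
    }
};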
relu_negative_slope); + const primitive_attr_t &attr); - static status_t init_conf(jit_1x1_conv_conf_t &jcp, - const convolution_desc_t &cd, - const memory_desc_wrapper &src_d, - const memory_desc_wrapper &weights_d, - const memory_desc_wrapper &dst_d, - const primitive_attr_t &attr) - { - return init_conf(jcp, cd, src_d, weights_d, dst_d, attr, false, 0.0); - } + static void init_scratchpad(memory_tracking::registrar_t &scratchpad, + const jit_1x1_conv_conf_t &jcp, const jit_conv_conf_t &jcp_dw = jit_conv_conf_t()); jit_1x1_conv_conf_t jcp; + jit_conv_conf_t jcp_dw; const primitive_attr_t &attr_; void (*jit_ker)(jit_1x1_conv_call_s *); @@ -104,7 +100,7 @@ private: int stack_space_needed = 8; ymm_t vreg_bcast = ymm_t(15); - Xbyak::Ymm vmask = Xbyak::Ymm(14); + ymm_t vtmp = ymm_t(14); void generate_bcast_loop(int load_loop_blk); void generate_reduce_loop(int load_loop_blk, int ur); diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_convolution.cpp index 7a6e17c..5f888a2 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_convolution.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_convolution.cpp @@ -14,25 +14,22 @@ * limitations under the License. *******************************************************************************/ -#include -#include -#include -#include "mkldnn_types.h" - #include "c_types_map.hpp" -#include "jit_avx2_1x1_convolution.hpp" -#include "utils.hpp" #include "mkldnn_thread.hpp" #include "type_helpers.hpp" - +#include "utils.hpp" +#include #include "jit_generator.hpp" +#include "jit_avx2_1x1_convolution.hpp" + namespace mkldnn { namespace impl { namespace cpu { using namespace mkldnn::impl::status; using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; using namespace mkldnn::impl::utils; #define data_blk_off(f, n, c, h, w) \ @@ -42,27 +39,28 @@ using namespace mkldnn::impl::utils; /* convolution forward */ -template -void _jit_avx2_1x1_convolution_fwd_t::execute_forward() { +void jit_avx2_1x1_convolution_fwd_t::execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + + auto rtus_space = scratchpad().get(key_conv_rtus_space); const auto &jcp = kernel_->jcp; - const int MB = conf_.MB(); + const int MB = pd()->MB(); const int work_amount = MB * jcp.ngroups * jcp.nb_bcast; const int ndims = dst_d.ndims(); - const int stride_h = (ndims == 3) ? 1 : conf_.cdesc()->strides[0]; - const int stride_w = conf_.cdesc()->strides[ndims - 3]; - const int pad_t = (ndims == 3) ? 0 : conf_.cdesc()->padding[0][0]; - const int pad_l = conf_.cdesc()->padding[0][ndims - 3]; + const int stride_h = (ndims == 3) ? 1 : pd()->desc()->strides[0]; + const int stride_w = pd()->desc()->strides[ndims - 3]; + const int pad_t = (ndims == 3) ? 
0 : pd()->desc()->padding[0][0]; + const int pad_l = pd()->desc()->padding[0][ndims - 3]; auto step = [](int default_step, int remaining, int tail_step) { assert(default_step <= tail_step); @@ -73,8 +71,8 @@ void _jit_avx2_1x1_convolution_fwd_t::execute_forward() { // TODO (Roma): remove this restriction assert(jcp.stride_w == 1 && jcp.stride_h == 1); - jit_1x1_conv_call_s p = {}; - rtus_driver_t::call_params_t rp = {}; + auto p = jit_1x1_conv_call_s(); + auto rp = rtus_driver_t::call_params_t(); const int nb_oc = jcp.nb_load; const int nb_ic = jcp.nb_reduce; @@ -129,13 +127,14 @@ void _jit_avx2_1x1_convolution_fwd_t::execute_forward() { nb_ic_blocking * jcp.ic_block); rp.icb = p.reduce_dim / jcp.reduce_block; - p.load_data = &weights[conf_.with_groups() + p.load_data = &weights[pd()->with_groups() ? weights_d.blk_off(g, ocb, icb) : weights_d.blk_off(ocb, icb)]; const int _icb = g * nb_ic + icb; - if (conf_.rtus_.reduce_src_) { - rp.ws = scratch_ + ithr * ws_per_thread_ + if (pd()->rtus_.reduce_src_) { + rp.ws = rtus_space + + ithr * pd()->rtus_.space_per_thread_ + _icb * jcp.is * jcp.ic_block; if (ocb == 0) { @@ -159,29 +158,37 @@ void _jit_avx2_1x1_convolution_fwd_t::execute_forward() { } }; - if (conf_.want_padded_bias()) { - for (int oc = 0; oc < jcp.oc_without_padding; ++oc) - padded_bias_[oc] = bias[oc]; - bias = padded_bias_; + if (pd()->wants_padded_bias()) { + auto padded_bias = scratchpad().get(key_conv_padded_bias); + utils::array_copy(padded_bias, bias, jcp.oc_without_padding); + utils::array_set(padded_bias + jcp.oc_without_padding, 0.f, + jcp.oc - jcp.oc_without_padding); + bias = padded_bias; } parallel(0, ker); + + if (pd()->wants_zero_pad_dst()) + output_memory_primitive(0)->zero_pad(); } -template -void _jit_avx2_1x1_convolution_fwd_t::execute_forward_fusing() { +void jit_avx2_1x1_convolution_fwd_t::execute_forward_with_dw_conv() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + + auto rtus_space = scratchpad().get(key_conv_rtus_space); const auto &jcp = kernel_->jcp; - const int MB = conf_.MB(); + const auto &jcp_dw = kernel_dw_->jcp; + const int MB = pd()->MB(); - auto dw_bias = jcp.dw_conv_biases; + auto dw_bias = jcp_dw.conv_biases; int ocb_work = jcp.with_dw_conv ? 
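The wants_padded_bias() handling above replaces the old per-element copy loop that lived in the constructor. Written out with plain loops, the array_copy/array_set combination amounts to this: when OC is padded up to a multiple of the block size, the user bias is copied into a scratchpad buffer and the padded tail is zeroed, so the kernel can always read jcp.oc entries without a tail case.

static void pad_bias(float *padded, const float *bias,
        int oc_without_padding, int oc_padded) {
    for (int oc = 0; oc < oc_without_padding; ++oc)
        padded[oc] = bias[oc];        // utils::array_copy
    for (int oc = oc_without_padding; oc < oc_padded; ++oc)
        padded[oc] = 0.f;             // utils::array_set
}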
utils::div_up(jcp.nb_load, jcp.nb_load_blocking) : 1; const int work_amount = MB * jcp.ngroups * ocb_work * jcp.nb_bcast; @@ -205,8 +212,8 @@ void _jit_avx2_1x1_convolution_fwd_t::execute_forward_fusing() { if ((oh + h) < 0 || (oh + h) >= jcp.ih) { for (int chb = ocb; chb < ocb + load_step; chb++) { - memset(ws_p + (((oh + h) + 1) % jcp.dw_conv_ker_h) * jcp.ow * jcp.oc_block + - (chb - ocb) * jcp.dw_conv_ker_h * jcp.ow * jcp.oc_block, 0, jcp.ow * jcp.oc_block * sizeof(float)); + memset(ws_p + (((oh + h) + 1) % jcp_dw.kh) * jcp.ow * jcp.oc_block + + (chb - ocb) * jcp_dw.kh * jcp.ow * jcp.oc_block, 0, jcp.ow * jcp.oc_block * sizeof(float)); } } else { const int _ocb = g * jcp.nb_load + ocb; @@ -217,7 +224,7 @@ void _jit_avx2_1x1_convolution_fwd_t::execute_forward_fusing() { rp.os = p.bcast_dim; p.load_dim = this_block_size(ocb * jcp.oc_block, jcp.oc, load_step * jcp.oc_block); - p.output_data = &ws_p[(((oh + h) + 1) % jcp.dw_conv_ker_h) * jcp.ow * jcp.oc_block]; + p.output_data = &ws_p[(((oh + h) + 1) % jcp_dw.kh) * jcp.ow * jcp.oc_block]; p.bias_data = &bias[_ocb * jcp.oc_block]; @@ -231,13 +238,14 @@ void _jit_avx2_1x1_convolution_fwd_t::execute_forward_fusing() { jcp.nb_reduce_blocking * jcp.ic_block); rp.icb = p.reduce_dim / jcp.reduce_block; - p.load_data = &weights[conf_.with_groups() + p.load_data = &weights[pd()->with_groups() ? weights_d.blk_off(g, ocb, icb) : weights_d.blk_off(ocb, icb)]; const int _icb = g * jcp.nb_reduce + icb; - if (conf_.rtus_.reduce_src_) { - rp.ws = scratch_ + ithr * ws_per_thread_ + if (pd()->rtus_.reduce_src_) { + rp.ws = rtus_space + + ithr * pd()->rtus_.space_per_thread_ + _icb * jcp.is * jcp.ic_block; if (ocb == 0) { @@ -259,7 +267,6 @@ void _jit_avx2_1x1_convolution_fwd_t::execute_forward_fusing() { }; auto compute_row_dw = [&](const float* ws_p, int n, int ocb, int load_step, int dst_idx) { - const auto &jcp_dw = kernel_dw_->jcp; for (int chb = ocb; chb < ocb + load_step; chb++) { auto par_conv_dw = jit_conv_call_s(); @@ -275,9 +282,11 @@ void _jit_avx2_1x1_convolution_fwd_t::execute_forward_fusing() { dst_idx/jcp_dw.stride_h*jcp_dw.ow*jcp_dw.ch_block]; par_conv_dw.kh_padding = jcp_dw.kh; - par_conv_dw.filt = &jcp.dw_conv_weights[chb * jcp_dw.kh * jcp_dw.kw * jcp_dw.ch_block]; + par_conv_dw.filt = &jcp_dw.conv_weights[chb * jcp_dw.kh * jcp_dw.kw * jcp_dw.ch_block]; par_conv_dw.bias = &dw_bias[chb * jcp_dw.ch_block]; par_conv_dw.ur_w = (size_t)(jcp_dw.ow); + par_conv_dw.oc_work = nstl::min((chb + 1) * jcp_dw.ch_block, (int)jcp_dw.oc) - chb*jcp_dw.ch_block; + par_conv_dw.oc_off = chb * jcp_dw.ch_block * sizeof(float); kernel_dw_->jit_ker(&par_conv_dw); } @@ -288,7 +297,9 @@ void _jit_avx2_1x1_convolution_fwd_t::execute_forward_fusing() { int start{0}, end{0}; balance211(work_amount, nthr, ithr, start, end); - auto pbuf = dw_conv_buffer_ + ithr * dw_conv_buffer_size_; + auto dw_conv_buffer = scratchpad().get(key_dw_conv_buffer); + size_t dw_conv_buffer_size_ = (size_t)jcp_dw.kh * jcp_dw.iw * jcp_dw.ch_block * (jcp.oc / jcp.oc_block); + auto pbuf = dw_conv_buffer + ithr * dw_conv_buffer_size_; const int os_block = jcp.iw; @@ -319,7 +330,7 @@ void _jit_avx2_1x1_convolution_fwd_t::execute_forward_fusing() { compute_block_1x1(pbuf, n, g, oh + 1, ow, ih, iw, os, os_block, bcast_step, ocb, load_step, bcast_step); } - if ((oh % jcp.dw_conv_str_h == 0)) { + if ((oh % jcp_dw.stride_h == 0)) { compute_row_dw(pbuf, n, ocb, load_step, oh); } @@ -327,44 +338,50 @@ void _jit_avx2_1x1_convolution_fwd_t::execute_forward_fusing() { } }; - if (conf_.want_padded_bias()) { - 
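The (((oh + h) + 1) % jcp_dw.kh) expressions above address a per-thread ring buffer of jcp_dw.kh output rows: the 1x1 convolution writes one row per step into the modular slot, and once a full window of rows is resident the fused depthwise kernel consumes it. Schematically (an assumed simplification that ignores the channel-block stride):

static float *row_slot(float *buf, int oh, int kh, int row_stride) {
    // each produced row lands in slot (oh + 1) % kh, overwriting the row
    // that the depthwise pass no longer needs
    return buf + ((oh + 1) % kh) * row_stride;
}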
for (int oc = 0; oc < jcp.oc_without_padding; ++oc) - padded_bias_[oc] = bias[oc]; - bias = padded_bias_; - - for (int oc = 0; oc < jcp.oc_without_padding; ++oc) - dw_padded_bias_[oc] = dw_bias[oc]; - dw_bias = dw_padded_bias_; + if (pd()->wants_padded_bias()) { + auto padded_bias = scratchpad().get(key_conv_padded_bias); + utils::array_copy(padded_bias, bias, jcp.oc_without_padding); + utils::array_set(padded_bias + jcp.oc_without_padding, 0.f, + jcp.oc - jcp.oc_without_padding); + bias = padded_bias; + + auto dw_padded_bias = scratchpad().get(key_dw_conv_padded_bias); + utils::array_copy(dw_padded_bias, dw_bias, jcp.oc_without_padding); + utils::array_set(dw_padded_bias + jcp.oc_without_padding, 0.f, + jcp.oc - jcp.oc_without_padding); + dw_bias = dw_padded_bias; } parallel(0, ker); -} -template struct _jit_avx2_1x1_convolution_fwd_t; -template struct _jit_avx2_1x1_convolution_fwd_t; + if (pd()->wants_zero_pad_dst()) + output_memory_primitive(0)->zero_pad(); +} /* convolution backward wtr data */ -void jit_avx2_1x1_convolution_bwd_data_t::execute_backward_data() { +void jit_avx2_1x1_convolution_bwd_data_t::execute_backward_data() const { auto diff_dst = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto diff_src = reinterpret_cast(this->memory()); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); - const memory_desc_wrapper diff_src_d(conf_.diff_src_pd()); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + const memory_desc_wrapper diff_src_d(pd()->diff_src_pd()); + + auto rtus_space = scratchpad().get(key_conv_rtus_space); const auto &jcp = kernel_->jcp; - const int MB = conf_.MB(); + const int MB = pd()->MB(); // TODO (Roma): remove this restriction assert(jcp.stride_w == 1 && jcp.stride_h == 1); const int ndims = diff_dst_d.ndims(); - const int stride_h = (ndims == 3) ? 1 : conf_.desc()->strides[0]; - const int stride_w = conf_.desc()->strides[ndims - 3]; - const int pad_t = (ndims == 3) ? 0 : conf_.desc()->padding[0][0]; - const int pad_l = conf_.desc()->padding[0][ndims - 3]; + const int stride_h = (ndims == 3) ? 1 : pd()->desc()->strides[0]; + const int stride_w = pd()->desc()->strides[ndims - 3]; + const int pad_t = (ndims == 3) ? 0 : pd()->desc()->padding[0][0]; + const int pad_l = pd()->desc()->padding[0][ndims - 3]; const int nb_ic = jcp.nb_load; const int nb_oc = jcp.nb_reduce; @@ -417,8 +434,9 @@ void jit_avx2_1x1_convolution_bwd_data_t::execute_backward_data() { const int _icb = g * nb_ic + icb; rp.src = diff_src + data_blk_off(diff_src_d, n, _icb, ih, iw); - if (conf_.rtus_.reduce_src_) { - rp.ws = scratch_ + ithr * ws_per_thread_; + if (pd()->rtus_.reduce_src_) { + rp.ws = rtus_space + + ithr * pd()->rtus_.space_per_thread_; p.output_data = rp.ws; } else p.output_data = rp.src; @@ -430,7 +448,7 @@ void jit_avx2_1x1_convolution_bwd_data_t::execute_backward_data() { ow); p.bcast_data = &diff_dst[diff_dst_off]; - p.load_data = &weights[conf_.with_groups() + p.load_data = &weights[pd()->with_groups() ? 
weights_d.blk_off(g, ocb, icb) : weights_d.blk_off(ocb, icb)]; @@ -442,7 +460,7 @@ void jit_avx2_1x1_convolution_bwd_data_t::execute_backward_data() { kernel_->jit_ker(&p); } - if (conf_.rtus_.reduce_src_) + if (pd()->rtus_.reduce_src_) rtus_driver_->ker_(&rp); } } @@ -454,64 +472,46 @@ void jit_avx2_1x1_convolution_bwd_data_t::execute_backward_data() { /* convolution backward wtr weights */ jit_avx2_1x1_convolution_bwd_weights_t::jit_avx2_1x1_convolution_bwd_weights_t( - const pd_t *pd, const input_vector &inputs, + const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), kernel_(nullptr) - , rtus_driver_(nullptr), ws_per_thread_(0), scratch_(nullptr) - , padded_bias_(nullptr) + : cpu_primitive_t(apd, inputs, outputs), kernel_(nullptr) + , rtus_driver_(nullptr) { - kernel_ = new jit_avx2_1x1_conv_kernel_f32(conf_.jcp_, *conf_.attr()); - - const auto &jcp = kernel_->jcp; - - const int ic_block = jcp.bcast_block; - const int nb_ic = jcp.nb_bcast; - const int nb_ic_blocking = jcp.nb_bcast_blocking; - const int bcast_work = utils::div_up(nb_ic, nb_ic_blocking); - - const int oc_block = jcp.load_block; - const int nb_oc = jcp.nb_load; - const int nb_oc_blocking = jcp.nb_load_blocking; - const int load_work = utils::div_up(nb_oc, nb_oc_blocking); - - const int job_size - = nb_oc_blocking * nb_ic_blocking * ic_block * oc_block; - const int njobs_x = bcast_work; - const int njobs_y = jcp.ngroups * load_work; - - const int max_threads = mkldnn_get_max_threads(); - const size_t max_buffer_size = max_threads * job_size * 8; - - reducer_weights_ = new cpu_reducer_2d_t( - reduce_balancer_t(max_threads, job_size, njobs_y * njobs_x, - jcp.mb * jcp.nb_reduce, max_buffer_size), - job_size / nb_oc_blocking, nb_oc_blocking, ic_block, - nb_ic * ic_block * oc_block, nb_oc, false); - - reducer_bias_ = !conf_.with_bias() ? nullptr - : new cpu_reducer_t(reduce_balancer_t(max_threads, - oc_block, jcp.ngroups * jcp.oc / oc_block, - jcp.mb, max_buffer_size)); - - if (conf_.want_padded_bias()) - padded_bias_ = (data_t *)malloc(sizeof(data_t) * jcp.oc, 64); - + kernel_ = new jit_avx2_1x1_conv_kernel_f32(pd()->jcp_, jit_conv_conf_t(), *pd()->attr()); + reducer_weights_ = + new cpu_reducer_2d_t(pd()->reducer_wei_conf_); + reducer_bias_ = new cpu_reducer_t(pd()->reducer_bia_conf_); init_rtus_driver(this); } -void jit_avx2_1x1_convolution_bwd_weights_t::execute_backward_weights() { +void jit_avx2_1x1_convolution_bwd_weights_t::execute_backward_weights() const { auto src = reinterpret_cast(this->input_memory(0)); auto diff_dst = reinterpret_cast(this->input_memory(1)); auto diff_weights = reinterpret_cast(this->memory(0)); auto diff_bias_in = reinterpret_cast(this->memory(1)); - data_t *diff_bias = conf_.want_padded_bias() ? padded_bias_ : diff_bias_in; - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper diff_weights_d(conf_.diff_weights_pd(0)); - const memory_desc_wrapper diff_bias_d(conf_.diff_weights_pd(1)); + auto scratchpad = this->scratchpad(); + + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper diff_weights_d(pd()->diff_weights_pd(0)); + const memory_desc_wrapper diff_bias_d(pd()->diff_weights_pd(1)); const auto &jcp = kernel_->jcp; + auto rtus_space = scratchpad.get(key_conv_rtus_space); + + data_t *diff_bias = pd()->wants_padded_bias() + ? 
scratchpad.get(key_conv_padded_bias) : diff_bias_in; + + auto reducer_bia_scratchpad = memory_tracking::grantor_t(scratchpad, + prefix_reducer_bia); + auto rb = this->reducer_bias_; + rb->init(reducer_bia_scratchpad); + + auto reducer_wei_scratchpad = memory_tracking::grantor_t(scratchpad, + prefix_reducer_wei); + auto rw = this->reducer_weights_; + rw->init(reducer_wei_scratchpad); const int ndims = diff_dst_d.ndims(); // TODO (Roma): remove this restriction @@ -528,10 +528,10 @@ void jit_avx2_1x1_convolution_bwd_weights_t::execute_backward_weights() { const int sp_dim = jcp.reduce_dim; const int mb_sp_work = jcp.mb * sp_dim; - const int stride_h = (ndims == 3) ? 1 : conf_.desc()->strides[0]; - const int stride_w = conf_.desc()->strides[ndims - 3]; - const int pad_t = (ndims == 3) ? 0 : conf_.desc()->padding[0][0]; - const int pad_l = conf_.desc()->padding[0][ndims - 3]; + const int stride_h = (ndims == 3) ? 1 : pd()->desc()->strides[0]; + const int stride_w = pd()->desc()->strides[ndims - 3]; + const int pad_t = (ndims == 3) ? 0 : pd()->desc()->padding[0][0]; + const int pad_l = pd()->desc()->padding[0][ndims - 3]; auto step = [](int default_step, int remaining, int tail_step) { assert(default_step <= tail_step); @@ -574,7 +574,7 @@ void jit_avx2_1x1_convolution_bwd_weights_t::execute_backward_weights() { p.load_data = diff_dst + (oc_b * jcp.reduce_dim + sp) * jcp.oc_block; - if (conf_.rtus_.reduce_src_) { + if (pd()->rtus_.reduce_src_) { const int oh = sp / jcp.ow; const int ow = sp % jcp.ow; @@ -582,7 +582,8 @@ void jit_avx2_1x1_convolution_bwd_weights_t::execute_backward_weights() { const int iw = nstl::max(ow * stride_w - pad_l, 0); rp.iw_start = iw; - rp.ws = scratch_ + ithr * ws_per_thread_ + rp.ws = rtus_space + + ithr * pd()->rtus_.space_per_thread_ + (ic_b * jcp.is + sp) * jcp.ic_block; if (ndims == 3) rp.src = src @@ -607,22 +608,21 @@ void jit_avx2_1x1_convolution_bwd_weights_t::execute_backward_weights() { }; auto ker = [&](const int ithr, const int nthr) { - auto rw = this->reducer_weights_; - assert(nthr == rw->balancer_.nthr_); + assert(nthr == rw->balancer().nthr_); - const int w_njobs = rw->balancer_.ithr_njobs(ithr); + const int w_njobs = rw->balancer().ithr_njobs(ithr); if (w_njobs == 0) return; /* setup: independent work (oc, ic) */ - const int w_job_start = rw->balancer_.ithr_job_off(ithr); + const int w_job_start = rw->balancer().ithr_job_off(ithr); int g{0}, load_i{0}, bcast_i{0}; nd_iterator_init(w_job_start, g, jcp.ngroups, load_i, load_work, bcast_i, bcast_work); /* setup: reduction work (mb, sp) */ int mb_sp_start{0}, mb_sp_end{0}; - balance211(mb_sp_work, rw->balancer_.nthr_per_group_, - rw->balancer_.id_in_group(ithr), mb_sp_start, mb_sp_end); + balance211(mb_sp_work, rw->balancer().nthr_per_group_, + rw->balancer().id_in_group(ithr), mb_sp_start, mb_sp_end); int img_start{0}, sp_start{0}; nd_iterator_init(mb_sp_start, img_start, jcp.mb, sp_start, sp_dim); @@ -637,16 +637,16 @@ void jit_avx2_1x1_convolution_bwd_weights_t::execute_backward_weights() { data_t *store_to; size_t store_to_ld; - if (rw->balancer_.nthr_per_group_ == 1 || - (rw->balancer_.master(ithr) && rw->master_uses_dst_)) { - const size_t off = conf_.with_groups() + if (rw->balancer().nthr_per_group_ == 1) { + const size_t off = pd()->with_groups() ? 
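balance211(), used above to split mb_sp_work across the threads of a reduction group, hands out contiguous chunks whose sizes differ by at most one. A reference re-implementation of that partitioning:

// distribute n items over nthr threads; thread ithr gets [start, end)
template <typename T>
static void balance211_model(T n, T nthr, T ithr, T &start, T &end) {
    T base = n / nthr, rem = n % nthr;     // first rem threads get one extra
    start = ithr * base + (ithr < rem ? ithr : rem);
    end = start + base + (ithr < rem ? T(1) : T(0));
}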
diff_weights_d.blk_off(g, oc_b, ic_b) : diff_weights_d.blk_off(oc_b, ic_b); store_to = &diff_weights[off]; store_to_ld = jcp.ic * jcp.oc_block; } else { - const size_t off = iwork * rw->balancer_.job_size_; - store_to = &rw->get_local_ptr(ithr, nullptr)[off]; + const size_t off = iwork * rw->balancer().job_size_; + store_to = + rw->get_local_ptr(ithr, reducer_wei_scratchpad) + off; store_to_ld = nb_ic_blocking * jcp.ic_block * jcp.oc_block; } @@ -670,22 +670,21 @@ void jit_avx2_1x1_convolution_bwd_weights_t::execute_backward_weights() { nd_iterator_step(g, jcp.ngroups, load_i, load_work, bcast_i, bcast_work); } - rw->reduce(ithr, diff_weights); + rw->reduce(ithr, diff_weights, reducer_wei_scratchpad); }; auto ker_bias = [&](int ithr, int nthr) { - auto rb = this->reducer_bias_; - assert(nthr == rb->balancer_.nthr_); + assert(nthr == rb->balancer().nthr_); - const int b_job_start = rb->balancer_.ithr_job_off(ithr); - const int b_njobs = rb->balancer_.ithr_njobs(ithr); + const int b_job_start = rb->balancer().ithr_job_off(ithr); + const int b_njobs = rb->balancer().ithr_njobs(ithr); if (b_njobs == 0) return; /* reduction dimension */ int img_start{0}, img_end{0}; - balance211(jcp.mb, rb->balancer_.nthr_per_group_, - rb->balancer_.id_in_group(ithr), img_start, img_end); + balance211(jcp.mb, rb->balancer().nthr_per_group_, + rb->balancer().id_in_group(ithr), img_start, img_end); /* jobs */ int g_start{0}, ocb_start{0}; @@ -697,8 +696,9 @@ void jit_avx2_1x1_convolution_bwd_weights_t::execute_backward_weights() { const size_t _oc = g * nb_oc + ocb; const data_t *d_dst = &diff_dst[diff_dst_d.blk_off(img, _oc)]; - data_t *d_bias = &rb->get_local_ptr(ithr, diff_bias)[ - b_job_loc * rb->balancer_.job_size_]; + data_t *d_bias = + rb->get_local_ptr(ithr, diff_bias, reducer_bia_scratchpad) + + b_job_loc * rb->balancer().job_size_; if (img == img_start) for (int o = 0; o < 8; ++o) d_bias[o] = 0.; @@ -713,17 +713,17 @@ void jit_avx2_1x1_convolution_bwd_weights_t::execute_backward_weights() { nd_iterator_step(g, jcp.ngroups, ocb, nb_oc); } } - rb->reduce(ithr, diff_bias); + rb->reduce(ithr, diff_bias, reducer_bia_scratchpad); }; parallel(0, [&](const int ithr, const int nthr) { ker(ithr, nthr); - if (conf_.with_bias()) + if (pd()->with_bias()) ker_bias(ithr, nthr); }); /* TODO: put this in ker_bias */ - if (conf_.want_padded_bias()) { + if (pd()->wants_padded_bias()) { assert(jcp.ngroups == 1); for (int oc = 0; oc < jcp.oc_without_padding; ++oc) diff_bias_in[oc] = diff_bias[oc]; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_convolution.hpp index 7846252..ede5978 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_convolution.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_convolution.hpp @@ -19,85 +19,81 @@ #include #include "c_types_map.hpp" +#include "memory_tracking.hpp" +#include "mkldnn_thread.hpp" +#include "utils.hpp" + #include "cpu_convolution_pd.hpp" #include "cpu_engine.hpp" #include "cpu_reducer.hpp" + #include "jit_avx2_1x1_conv_kernel_f32.hpp" #include "jit_uni_1x1_conv_utils.hpp" -#include "mkldnn_thread.hpp" -#include "utils.hpp" + #include "jit_uni_depthwise.hpp" namespace mkldnn { namespace impl { namespace cpu { -template -struct _jit_avx2_1x1_convolution_fwd_t: public cpu_primitive_t { +struct jit_avx2_1x1_convolution_fwd_t: public cpu_primitive_t { // TODO: (Roma) Code duplication duplication! Remove with templates // (maybe...)! 
- struct pd_t: public _cpu_convolution_fwd_pd_t { - pd_t(engine_t *engine, - const typename pd_t::base_desc_t *adesc, + struct pd_t: public cpu_convolution_fwd_pd_t { + pd_t(engine_t *engine, const convolution_desc_t *adesc, const primitive_attr_t *attr, const typename pd_t::base_class *hint_fwd_pd) - : _cpu_convolution_fwd_pd_t(engine, adesc, attr, - hint_fwd_pd) - , jcp_(), jcp_dw(), rtus_() {} + : cpu_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) + , jcp_(), jcp_dw_(), rtus_() {} DECLARE_COMMON_PD_T( JIT_IMPL_NAME_HELPER("jit_1x1:", avx2, ""), - _jit_avx2_1x1_convolution_fwd_t); + jit_avx2_1x1_convolution_fwd_t); virtual status_t init() override { using namespace prop_kind; assert(this->engine()->kind() == engine_kind::cpu); bool ok = true && this->set_default_params() == status::success - && utils::one_of(this->cdesc_().prop_kind, forward_training, + && utils::one_of(this->desc()->prop_kind, forward_training, forward_inference) - && this->cdesc_().alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() && utils::everyone_is(data_type::f32, - this->cdesc_().src_desc.data_type, - this->cdesc_().weights_desc.data_type, - this->cdesc_().dst_desc.data_type) + this->desc()->src_desc.data_type, + this->desc()->weights_desc.data_type, + this->desc()->dst_desc.data_type) && IMPLICATION(this->with_bias(), - data_type::f32 == this->cdesc_().bias_desc.data_type); + data_type::f32 == this->desc()->bias_desc.data_type); if (!ok) return status::unimplemented; - const convolution_desc_t *conv_d = &this->cdesc_(); + const convolution_desc_t *conv_d = this->desc(); const memory_desc_t *src_d = this->src_pd_.desc(); rtus_prepare(this, conv_d, src_d, this->dst_pd_.desc()); status_t sts_1x1 = jit_avx2_1x1_conv_kernel_f32::init_conf(jcp_, *conv_d, *src_d, *this->weights_pd_.desc(), - *this->dst_pd_.desc(), *this->attr(), - with_relu, this->negative_slope()); + *this->dst_pd_.desc(), *this->attr()); if (sts_1x1 != status::success) return sts_1x1; if (jcp_.with_dw_conv) { - int dw_conv_oh = (jcp_.oh - ((jcp_.dw_conv_ker_h - 1) + 1) + 2) / jcp_.dw_conv_str_h + 1; - int dw_conv_ow = (jcp_.ow - ((jcp_.dw_conv_ker_w - 1) + 1) + 2) / jcp_.dw_conv_str_w + 1; - - status_t sts_dw = jit_uni_dw_conv_row_f32::init_conf(jcp_dw, - jcp_.oc, jcp_.oh, jcp_.ow, dw_conv_oh, dw_conv_ow, - jcp_.dw_conv_ker_h, jcp_.dw_conv_ker_w, - jcp_.dw_conv_str_h, jcp_.dw_conv_str_w, - jcp_.dw_conv_eltwise_alg, jcp_.dw_conv_eltwise_alpha, - jcp_.dw_conv_eltwise_beta, jcp_.dw_conv_with_sum); + status_t sts_dw = jit_uni_dw_conv_row_f32::init_conf(jcp_, jcp_dw_, *this->attr()); if (sts_dw != status::success) return sts_dw; } + auto scratchpad = scratchpad_registry().registrar(); + jit_avx2_1x1_conv_kernel_f32::init_scratchpad(scratchpad, jcp_, jcp_dw_); + + rtus_prepare_space_info(this, scratchpad); + return status::success; } jit_1x1_conv_conf_t jcp_; - jit_conv_conf_t jcp_dw; - struct reduce_to_unit_stride_t { - convolution_desc_t conv_d_; - bool reduce_src_; - } rtus_; + jit_conv_conf_t jcp_dw_; + reduce_to_unit_stride_t rtus_; protected: virtual status_t set_default_params() override { @@ -114,6 +110,8 @@ struct _jit_avx2_1x1_convolution_fwd_t: public cpu_primitive_t { : utils::pick(this->ndims() - 3, OIw8i8o, OIhw8i8o))); if (this->bias_pd_.desc()->format == any) CHECK(this->bias_pd_.set_format(x)); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + 
CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } }; @@ -121,61 +119,33 @@ struct _jit_avx2_1x1_convolution_fwd_t: public cpu_primitive_t { template friend void init_rtus_driver(conv_t *self); - _jit_avx2_1x1_convolution_fwd_t(const pd_t *pd, const input_vector &inputs, + jit_avx2_1x1_convolution_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - , kernel_(nullptr), rtus_driver_(nullptr), ws_per_thread_(0) - , scratch_(nullptr), padded_bias_(nullptr), dw_conv_buffer_size_(0), dw_conv_buffer_(nullptr), dw_padded_bias_(nullptr) + : cpu_primitive_t(apd, inputs, outputs) + , kernel_(nullptr), rtus_driver_(nullptr) { - kernel_ = new jit_avx2_1x1_conv_kernel_f32(conf_.jcp_, *conf_.attr()); - if (conf_.jcp_.with_dw_conv) { - kernel_dw_ = new jit_uni_dw_conv_row_f32(conf_.jcp_dw); - } - + kernel_ = new jit_avx2_1x1_conv_kernel_f32(pd()->jcp_, pd()->jcp_dw_, *pd()->attr()); init_rtus_driver(this); - if (conf_.want_padded_bias()) { - const auto &j = conf_.jcp_; - assert(j.ngroups == 1); - padded_bias_ = (data_t *)malloc(sizeof(data_t) * j.oc, 64); - for (int oc = j.oc_without_padding; oc < j.oc; ++oc) - padded_bias_[oc] = 0; - } - - if (conf_.jcp_.with_dw_conv) { - const int nthreads = mkldnn_get_max_threads(); - - dw_conv_buffer_size_ = (size_t) conf_.jcp_dw.kh * conf_.jcp_dw.iw * conf_.jcp_dw.ch_block * - (conf_.jcp_.oc / conf_.jcp_.oc_block); - dw_conv_buffer_ = (data_t *) malloc(dw_conv_buffer_size_ * nthreads * sizeof(data_t), 64); - - if (conf_.want_padded_bias()) { - const auto &j = conf_.jcp_; - assert(j.ngroups == 1); - dw_padded_bias_ = (data_t *)malloc(sizeof(data_t) * j.oc, 64); - for (int oc = j.oc_without_padding; oc < j.oc; ++oc) - dw_padded_bias_[oc] = 0; - } + if (pd()->jcp_.with_dw_conv) { + kernel_dw_ = new jit_uni_dw_conv_row_f32(pd()->jcp_dw_, *pd()->attr(), pd()->jcp_dw_.ch_block); } } - ~_jit_avx2_1x1_convolution_fwd_t() { + + ~jit_avx2_1x1_convolution_fwd_t() { delete kernel_; delete rtus_driver_; - free(scratch_); - free(padded_bias_); - if (conf_.jcp_.with_dw_conv) { + if (pd()->jcp_.with_dw_conv) { delete kernel_dw_; - free(dw_conv_buffer_); - free(dw_padded_bias_); } } typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { - if (conf_.jcp_.with_dw_conv) - execute_forward_fusing(); + virtual void execute(event_t *e) const { + if (pd()->jcp_.with_dw_conv) + execute_forward_with_dw_conv(); else execute_forward(); @@ -183,28 +153,15 @@ struct _jit_avx2_1x1_convolution_fwd_t: public cpu_primitive_t { } private: - void execute_forward(); - void execute_forward_fusing(); + void execute_forward() const; + void execute_forward_with_dw_conv() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } - pd_t conf_; jit_avx2_1x1_conv_kernel_f32 *kernel_; jit_uni_dw_conv_row_f32 *kernel_dw_; - - /* reduction to unit stride */ rtus_driver_t *rtus_driver_; - size_t ws_per_thread_; - data_t *scratch_; - data_t *padded_bias_; - - /* fuse with dw conv */ - size_t dw_conv_buffer_size_; - data_t *dw_conv_buffer_; - data_t *dw_padded_bias_; }; -using jit_avx2_1x1_convolution_fwd_t = _jit_avx2_1x1_convolution_fwd_t; -using jit_avx2_1x1_convolution_relu_t = _jit_avx2_1x1_convolution_fwd_t; - struct jit_avx2_1x1_convolution_bwd_data_t: public cpu_primitive_t { struct pd_t: public cpu_convolution_bwd_data_pd_t { pd_t(engine_t *engine, @@ -224,7 +181,8 @@ struct jit_avx2_1x1_convolution_bwd_data_t: public cpu_primitive_t { bool ok = 
true && this->set_default_params() == status::success && this->desc()->prop_kind == backward_data - && this->desc()->alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() && utils::everyone_is(data_type::f32, this->desc()->diff_src_desc.data_type, @@ -236,17 +194,22 @@ struct jit_avx2_1x1_convolution_bwd_data_t: public cpu_primitive_t { const memory_desc_t *diff_src_d = this->diff_src_pd_.desc(); rtus_prepare(this, conv_d, diff_src_d, this->diff_dst_pd_.desc()); - return jit_avx2_1x1_conv_kernel_f32::init_conf(jcp_, *conv_d, - *diff_src_d, *this->weights_pd_.desc(), + status_t status = jit_avx2_1x1_conv_kernel_f32::init_conf(jcp_, + *conv_d, *diff_src_d, *this->weights_pd_.desc(), *this->diff_dst_pd_.desc(), *this->attr()); + if (status != status::success) return status; + + auto scratchpad = scratchpad_registry().registrar(); + jit_avx2_1x1_conv_kernel_f32::init_scratchpad(scratchpad, jcp_); + + rtus_prepare_space_info(this, scratchpad); + + return status::success; } // TODO (Roma): structs conf header cleanup jit_1x1_conv_conf_t jcp_; - struct reduce_to_unit_stride_t { - convolution_desc_t conv_d_; - bool reduce_src_; - } rtus_; + reduce_to_unit_stride_t rtus_; protected: virtual status_t set_default_params() override { @@ -262,6 +225,8 @@ struct jit_avx2_1x1_convolution_bwd_data_t: public cpu_primitive_t { CHECK(this->weights_pd_.set_format(this->with_groups() ? utils::pick(this->ndims() - 3, gOIw8o8i, gOIhw8o8i) : utils::pick(this->ndims() - 3, OIw8o8i, OIhw8o8i))); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } }; @@ -269,25 +234,24 @@ struct jit_avx2_1x1_convolution_bwd_data_t: public cpu_primitive_t { template friend void init_rtus_driver(conv_t *self); - jit_avx2_1x1_convolution_bwd_data_t(const pd_t *pd, + jit_avx2_1x1_convolution_bwd_data_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - , kernel_(nullptr), rtus_driver_(nullptr), ws_per_thread_(0) - , scratch_(nullptr) + : cpu_primitive_t(apd, inputs, outputs) + , kernel_(nullptr), rtus_driver_(nullptr) { - kernel_ = new jit_avx2_1x1_conv_kernel_f32(conf_.jcp_, *conf_.attr()); + kernel_ = new jit_avx2_1x1_conv_kernel_f32(pd()->jcp_, jit_conv_conf_t(), *pd()->attr()); init_rtus_driver(this); } + ~jit_avx2_1x1_convolution_bwd_data_t() { delete kernel_; delete rtus_driver_; - free(scratch_); } typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { - switch (conf_.desc()->prop_kind) { + virtual void execute(event_t *e) const { + switch (pd()->desc()->prop_kind) { case prop_kind::backward_data: execute_backward_data(); break; @@ -298,20 +262,16 @@ struct jit_avx2_1x1_convolution_bwd_data_t: public cpu_primitive_t { } private: - void execute_backward_data(); - pd_t conf_; - jit_avx2_1x1_conv_kernel_f32 *kernel_; + void execute_backward_data() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } - /* reduction to unit stride */ + jit_avx2_1x1_conv_kernel_f32 *kernel_; rtus_driver_t *rtus_driver_; - size_t ws_per_thread_; - data_t *scratch_; }; struct jit_avx2_1x1_convolution_bwd_weights_t: public cpu_primitive_t { struct pd_t: public cpu_convolution_bwd_weights_pd_t { - pd_t(engine_t *engine, - const convolution_desc_t *adesc, + pd_t(engine_t *engine, const convolution_desc_t *adesc, const 
primitive_attr_t *attr, const convolution_fwd_pd_t *hint_fwd_pd) : cpu_convolution_bwd_weights_pd_t(engine, adesc, attr, hint_fwd_pd) @@ -327,7 +287,8 @@ struct jit_avx2_1x1_convolution_bwd_weights_t: public cpu_primitive_t { bool ok = true && this->set_default_params() == status::success && this->desc()->prop_kind == backward_weights - && this->desc()->alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() && utils::everyone_is(data_type::f32, this->desc()->src_desc.data_type, @@ -341,18 +302,33 @@ struct jit_avx2_1x1_convolution_bwd_weights_t: public cpu_primitive_t { const memory_desc_t *src_d = this->src_pd_.desc(); rtus_prepare(this, conv_d, src_d, this->diff_dst_pd_.desc()); - return jit_avx2_1x1_conv_kernel_f32::init_conf(jcp_, *conv_d, - *src_d, *this->diff_weights_pd_.desc(), + status_t status = jit_avx2_1x1_conv_kernel_f32::init_conf(jcp_, + *conv_d, *src_d, *this->diff_weights_pd_.desc(), *this->diff_dst_pd_.desc(), *this->attr()); + if (status != status::success) return status; + + init_balancers(); + + auto scratchpad = scratchpad_registry().registrar(); + jit_avx2_1x1_conv_kernel_f32::init_scratchpad(scratchpad, jcp_); + + rtus_prepare_space_info(this, scratchpad); + + auto reducer_bia_scratchpad = memory_tracking::registrar_t( + scratchpad, memory_tracking::names::prefix_reducer_bia); + reducer_bia_conf_.init_scratchpad(reducer_bia_scratchpad); + + auto reducer_wei_scratchpad = memory_tracking::registrar_t( + scratchpad, memory_tracking::names::prefix_reducer_wei); + reducer_wei_conf_.init_scratchpad(reducer_wei_scratchpad); + + return status::success; } - // TODO (Roma): structs conf header cleanup jit_1x1_conv_conf_t jcp_; - - struct reduce_to_unit_stride_t { - convolution_desc_t conv_d_; - bool reduce_src_; - } rtus_; + cpu_reducer_t::conf_t reducer_bia_conf_; + cpu_reducer_2d_t::conf_t reducer_wei_conf_; + reduce_to_unit_stride_t rtus_; protected: virtual status_t set_default_params() override { @@ -370,28 +346,62 @@ struct jit_avx2_1x1_convolution_bwd_weights_t: public cpu_primitive_t { : utils::pick(this->ndims() - 3, OIw8i8o, OIhw8i8o))); if (this->diff_bias_pd_.desc()->format == any) CHECK(this->diff_bias_pd_.set_format(x)); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } + + private: + void init_balancers() { + const int ic_block = jcp_.bcast_block; + const int nb_ic = jcp_.nb_bcast; + const int nb_ic_blocking = jcp_.nb_bcast_blocking; + const int bcast_work = utils::div_up(nb_ic, nb_ic_blocking); + + const int oc_block = jcp_.load_block; + const int nb_oc = jcp_.nb_load; + const int nb_oc_blocking = jcp_.nb_load_blocking; + const int load_work = utils::div_up(nb_oc, nb_oc_blocking); + + const int job_size + = nb_oc_blocking * nb_ic_blocking * ic_block * oc_block; + const int njobs_x = bcast_work; + const int njobs_y = jcp_.ngroups * load_work; + + const int max_threads = mkldnn_get_max_threads(); + const size_t max_buffer_size = max_threads * job_size * 8; + + if (with_bias()) { + reducer_bia_conf_.init(reduce_balancer_t(max_threads, + oc_block, jcp_.ngroups * jcp_.oc / oc_block, + jcp_.mb, max_buffer_size)); + } + + reducer_wei_conf_.init( + reduce_balancer_t(max_threads, job_size, njobs_y * njobs_x, + jcp_.mb * jcp_.nb_reduce, max_buffer_size), + job_size / nb_oc_blocking, nb_oc_blocking, ic_block, + nb_ic * ic_block * oc_block, nb_oc); + } }; 
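The reducers configured in init_balancers() above implement a two-phase scheme: each thread in a reduction group first writes a private partial result, and the partials are then summed into the destination job by job. Phase two, schematically (contiguous per-thread partial buffers are an assumed layout here):

static void reduce_partials(float *dst, const float *partials,
        int nthr_per_group, size_t job_size) {
    // one partial buffer per extra thread in the group
    for (int t = 0; t < nthr_per_group - 1; ++t)
        for (size_t i = 0; i < job_size; ++i)
            dst[i] += partials[t * job_size + i];
}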
template friend void init_rtus_driver(conv_t *self); - jit_avx2_1x1_convolution_bwd_weights_t(const pd_t *pd, + jit_avx2_1x1_convolution_bwd_weights_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs); + ~jit_avx2_1x1_convolution_bwd_weights_t() { delete kernel_; delete rtus_driver_; delete reducer_weights_; delete reducer_bias_; - free(scratch_); - free(padded_bias_); } typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { - switch (conf_.desc()->prop_kind) { + virtual void execute(event_t *e) const { + switch (pd()->desc()->prop_kind) { case prop_kind::backward_weights: execute_backward_weights(); break; @@ -402,17 +412,13 @@ struct jit_avx2_1x1_convolution_bwd_weights_t: public cpu_primitive_t { } private: - void execute_backward_weights(); - pd_t conf_; + void execute_backward_weights() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + jit_avx2_1x1_conv_kernel_f32 *kernel_; cpu_reducer_2d_t *reducer_weights_; cpu_reducer_t *reducer_bias_; - - /* reduction to unit stride */ rtus_driver_t *rtus_driver_; - size_t ws_per_thread_; - data_t *scratch_; - data_t *padded_bias_; }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_conv_kernel_f32.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_conv_kernel_f32.cpp index 392622a..0caa4b4 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_conv_kernel_f32.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_conv_kernel_f32.cpp @@ -15,7 +15,6 @@ * limitations under the License. *******************************************************************************/ -#include #include "c_types_map.hpp" #include "nstl.hpp" #include "type_helpers.hpp" @@ -32,6 +31,7 @@ namespace cpu { using namespace mkldnn::impl::prop_kind; using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; using namespace mkldnn::impl::utils; using namespace Xbyak; @@ -77,9 +77,8 @@ void jit_avx2_conv_fwd_kernel_f32::oh_step_unroll_kw(int ur_w, vfmadd231ps(Ymm(ur_w * ii + jj), Ymm(oc_blocks * ur_w + jj), ymm15); else { // Intel(R) Advanced Vector Extensions (Intel(R) AVX) support - Ymm tmp = ymask; - vmulps(tmp, ymm15, Ymm(oc_blocks * ur_w + jj)); - vaddps(Ymm(ur_w * ii + jj), Ymm(ur_w * ii + jj), tmp); + vmulps(ytmp, ymm15, Ymm(oc_blocks * ur_w + jj)); + vaddps(Ymm(ur_w * ii + jj), Ymm(ur_w * ii + jj), ytmp); } } } @@ -131,9 +130,8 @@ void jit_avx2_conv_fwd_kernel_f32::oh_step_nopad(int ur_w, vfmadd231ps(Ymm(ur_w * ii + jj), Ymm(oc_blocks * ur_w + jj), ymm15); else { // Intel AVX support - Ymm tmp = ymask; - vmulps(tmp, ymm15, Ymm(oc_blocks * ur_w + jj)); - vaddps(Ymm(ur_w * ii + jj), Ymm(ur_w * ii + jj), tmp); + vmulps(ytmp, ymm15, Ymm(oc_blocks * ur_w + jj)); + vaddps(Ymm(ur_w * ii + jj), Ymm(ur_w * ii + jj), ytmp); } } } @@ -176,7 +174,7 @@ void jit_avx2_conv_fwd_kernel_f32::width_blk_step(int ur_w, for (int jj = 0; jj < ur_w; jj++) { size_t offt; if (jcp.with_dw_conv) - offt = sizeof(float) * ((size_t)ii * od * jcp.dw_conv_ker_h * ow + jj) * oc_blk; + offt = sizeof(float) * ((size_t)ii * od * jcp_dw.kh * ow + jj) * oc_blk; else offt = sizeof(float) * ((size_t)ii * od * oh * ow + jj) * oc_blk; vmovups(Ymm(ur_w * ii + jj), @@ -224,7 +222,8 @@ void jit_avx2_conv_fwd_kernel_f32::width_blk_step(int ur_w, mov(aux_reg_ker_d, ptr[param1 + GET_OFF(filt)]); mov(aux_reg_inp_d, reg_input); - if ((jcp.kd - 1) * (jcp.dilate_d + 1) < jcp.f_pad) { + if ((jcp.dilate_d >= jcp.id) + || (jcp.kd - 1) * (jcp.dilate_d + 1) < 
jcp.f_pad) { cmp(reg_ki, 0); je(skip_kd_loop, T_NEAR); } @@ -239,7 +238,8 @@ void jit_avx2_conv_fwd_kernel_f32::width_blk_step(int ur_w, mov(aux_reg_kernel, aux_reg_ker_d); } - if ((jcp.kh - 1) * (jcp.dilate_h + 1) < nstl::max(jcp.t_pad, jcp.b_pad)) { + if ((jcp.dilate_h >= jcp.ih) + || (jcp.kh - 1) * (jcp.dilate_h + 1) < nstl::max(jcp.t_pad, jcp.b_pad)) { cmp(kj, 0); je(skip_kh_loop, T_NEAR); } @@ -279,8 +279,7 @@ void jit_avx2_conv_fwd_kernel_f32::width_blk_step(int ur_w, pop(reg_output); } - - Label done, regular_store; + Label regular_store; test(reg_ci_flag, FLAG_IC_LAST); je(regular_store, T_NEAR); @@ -289,10 +288,6 @@ void jit_avx2_conv_fwd_kernel_f32::width_blk_step(int ur_w, int depthwise_inj_idx = 0; const auto &p = attr_.post_ops_; - if (p.len_ == 0 && eltwise_injectors.size() == 1) { - eltwise_injectors[0]->compute_vector_range(0, oc_blocks * ur_w); - } - int end_idx = jcp.with_dw_conv ? p.find(primitive_kind::convolution) : p.len_; for (int i = 0; i < end_idx; i++) { auto& post_op = p.entry_[i]; @@ -324,14 +319,13 @@ void jit_avx2_conv_fwd_kernel_f32::width_blk_step(int ur_w, for (int jj = 0; jj < ur_w; jj++) { size_t o_off; if (jcp.with_dw_conv) - o_off = sizeof(float) * ((size_t)ii * od * jcp.dw_conv_ker_h * ow + jj) * oc_blk; + o_off = sizeof(float) * ((size_t)ii * od * jcp_dw.kh * ow + jj) * oc_blk; else o_off = sizeof(float) * ((size_t)ii * od * oh * ow + jj) * oc_blk; Ymm reg_out = Ymm(ur_w * ii + jj); vmovups(make_safe_addr(reg_output, o_off, reg_long_offt), reg_out); } } - L(done); } inline void jit_avx2_conv_fwd_kernel_f32::solve_common( @@ -397,12 +391,6 @@ inline void jit_avx2_conv_fwd_kernel_f32::solve_common( void jit_avx2_conv_fwd_kernel_f32::generate() { - if (jcp.with_eltwise) { - eltwise_injectors.push_back(new jit_uni_eltwise_injector_f32( - this, jcp.eltwise_alg, jcp.eltwise_alpha, 0 - )); - } - const auto &p = attr_.post_ops_; int end_idx = jcp.with_dw_conv ? 
p.find(primitive_kind::convolution) : p.len_; for (int i = 0; i < end_idx; i++) { @@ -474,25 +462,16 @@ bool jit_avx2_conv_fwd_kernel_f32::post_ops_ok( auto is_simple = [&](int idx) { return is_eltwise(idx) || is_depthwise(idx); }; switch (p.len_) { - case 0: return true; // no post_ops - case 1: - return true // sum OR eltwise OR dw_conv - && !jcp.with_eltwise && (is_simple(0) || is_sum(0) || is_dw_conv(0)); - case 2: - return true // sum->eltwise OR dw_conv->eltwise OR eltwise->dw_conv OR dw_conv->sum OR sum->depthwise OR - // eltwise->depthwise OR depthwise->depthwise - && !jcp.with_eltwise && ((is_sum(0) && is_simple(1)) || (is_dw_conv(0) && is_eltwise(1)) || - (is_eltwise(0) && is_dw_conv(1)) || (is_dw_conv(0) && is_sum(1)) || - (is_simple(0) && is_simple(1))); - case 3: - return true // eltwise->dw_conv->eltwise OR dw_conv->sum->eltwise OR sum->eltwise->depthwise OR - // sum->depthwise->eltwise OR sum->depthwise->depthwise - && !jcp.with_eltwise && ((is_eltwise(0) && is_dw_conv(1) && is_eltwise(2)) || - (is_dw_conv(0) && is_sum(1) && is_eltwise(2)) || - (is_sum(0) && is_simple(1) && is_simple(2))); - case 4: return true // eltwise->dw_conv->sum->eltwise - && !jcp.with_eltwise && (is_eltwise(0) && is_dw_conv(1) && is_sum(2) && is_eltwise(3)); - default: return false; + case 0: return true; + case 1: return is_simple(0) || is_sum(0) || is_dw_conv(0); + case 2: return (is_sum(0) && is_simple(1)) || (is_dw_conv(0) && is_eltwise(1)) || + (is_eltwise(0) && is_dw_conv(1)) || (is_dw_conv(0) && is_sum(1)) || + (is_simple(0) && is_simple(1)); + case 3: return (is_eltwise(0) && is_dw_conv(1) && is_eltwise(2)) || + (is_dw_conv(0) && is_sum(1) && is_eltwise(2)) || + (is_sum(0) && is_simple(1) && is_simple(2)); + case 4: return (is_eltwise(0) && is_dw_conv(1) && is_sum(2) && is_eltwise(3)); + default: return false; } return false; @@ -501,7 +480,7 @@ bool jit_avx2_conv_fwd_kernel_f32::post_ops_ok( status_t jit_avx2_conv_fwd_kernel_f32::init_conf(jit_conv_conf_t &jcp, const convolution_desc_t &cd, const memory_desc_wrapper &src_d, const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d, - const primitive_attr_t &attr, bool with_relu, float relu_negative_slope) + const primitive_attr_t &attr) { if (!mayiuse(avx)) return status::unimplemented; @@ -539,63 +518,62 @@ status_t jit_avx2_conv_fwd_kernel_f32::init_conf(jit_conv_conf_t &jcp, jcp.dilate_h = (ndims == 3) ? 
0 : cd.dilates[ndims-4]; jcp.dilate_w = cd.dilates[ndims-3]; - jcp.b_pad = (jcp.oh - 1) * jcp.stride_h + (jcp.kh - 1) * (jcp.dilate_h + 1) - - (jcp.ih + jcp.t_pad - 1); - jcp.src_fmt = src_d.format(); jcp.with_bias = cd.bias_desc.format != memory_format::undef; - jcp.with_eltwise = with_relu; - jcp.eltwise_alg = mkldnn_eltwise_relu; - jcp.eltwise_alpha = relu_negative_slope; if (!post_ops_ok(jcp, attr)) return status::unimplemented; const auto &p = attr.post_ops_; - jcp.with_dw_conv = false; + int dw_conv_ind = p.find(primitive_kind::convolution); - if (dw_conv_ind != -1) { - jcp.with_dw_conv = true; - jcp.dw_conv_in_h = p.entry_[dw_conv_ind].dw_conv.in_h; - jcp.dw_conv_in_w = p.entry_[dw_conv_ind].dw_conv.in_w; - jcp.dw_conv_ker_h = p.entry_[dw_conv_ind].dw_conv.ker_h; - jcp.dw_conv_ker_w = p.entry_[dw_conv_ind].dw_conv.ker_w; - jcp.dw_conv_str_h = p.entry_[dw_conv_ind].dw_conv.str_h; - jcp.dw_conv_str_w = p.entry_[dw_conv_ind].dw_conv.str_w; - jcp.dw_conv_weights = p.entry_[dw_conv_ind].dw_conv.weights_data; - jcp.dw_conv_biases = p.entry_[dw_conv_ind].dw_conv.biases_data; + jcp.with_dw_conv = dw_conv_ind != -1; + if (jcp.with_dw_conv) { + jcp.dw_conv_oh = jcp.oh; + jcp.dw_conv_ow = jcp.ow; + jcp.oh = p.entry_[dw_conv_ind].dw_conv.in_h; + jcp.ow = p.entry_[dw_conv_ind].dw_conv.in_w; } + jcp.b_pad = (jcp.oh - 1) * jcp.stride_h + (jcp.kh - 1) * (jcp.dilate_h + 1) + - (jcp.ih + jcp.t_pad - 1); + if (jcp.with_dw_conv && !mayiuse(avx2)) return status::unimplemented; if (jcp.with_dw_conv && jcp.ndims == 5) return status::unimplemented; - if (jcp.with_dw_conv) { - int dw_conv_eltwise_ind = p.find(primitive_kind::eltwise, dw_conv_ind); - if (dw_conv_eltwise_ind != -1) { - jcp.dw_conv_with_eltwise = true; - jcp.dw_conv_eltwise_alg = p.entry_[dw_conv_eltwise_ind].eltwise.alg; - jcp.dw_conv_eltwise_alpha = p.entry_[dw_conv_eltwise_ind].eltwise.alpha; - jcp.dw_conv_eltwise_beta = p.entry_[dw_conv_eltwise_ind].eltwise.beta; + if (!mayiuse(avx2)) { + for (int i = 0; i < p.len_; i++) { + auto &post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + if (post_op.eltwise.alg != alg_kind::eltwise_relu) + return status::unimplemented; + } else if (post_op.is_depthwise()) { + return status::unimplemented; + } } } jcp.with_sum = p.find(primitive_kind::sum, 0, dw_conv_ind) != -1; - if (jcp.with_dw_conv) { - jcp.dw_conv_with_sum = p.find(primitive_kind::sum, dw_conv_ind) != -1; - } - if (jcp.with_dw_conv) { - jcp.oh = jcp.dw_conv_in_h; - jcp.ow = jcp.dw_conv_in_w; - } + jcp.src_dt = cd.src_desc.data_type; + jcp.bia_dt = jcp.with_bias ? cd.bias_desc.data_type : data_type::undef; + jcp.dst_dt = cd.dst_desc.data_type; const int simd_w = 8; const bool flat = jcp.ic < simd_w; const bool mimo = !flat; + + /* Grouped channel offset to support 'non-blocked data' format for + * convolution sizes with '(input_channel / ngroups) < simd' */ + jcp.nonblk_group_off + = (one_of(src_d.format(), ncw, nchw, ncdhw) && jcp.ngroups > 1) ? 
+ jcp.ic : + 1; + bool ok_to_pad_channels = true && jcp.ngroups == 1; @@ -686,8 +664,23 @@ status_t jit_avx2_conv_fwd_kernel_f32::init_conf(jit_conv_conf_t &jcp, return status::success; } -void jit_avx2_conv_bwd_data_kernel_f32::hsw_iter(int ur_w, int l_overflow, - int r_overflow, int start_off) +void jit_avx2_conv_fwd_kernel_f32::init_scratchpad( + memory_tracking::registrar_t &scratchpad, const jit_conv_conf_t &jcp, const jit_conv_conf_t &jcp_dw) { + if (jcp.with_bias && jcp.oc != jcp.oc_without_padding) + scratchpad.book(key_conv_padded_bias, sizeof(float) * jcp.oc); + + if (jcp.with_dw_conv) { + const int nthreads = mkldnn_get_max_threads(); + size_t dw_conv_buffer_size_ = (size_t)jcp_dw.kh * jcp_dw.iw * jcp_dw.ch_block * jcp.nb_oc_blocking; + scratchpad.book(key_dw_conv_buffer, sizeof(float) * dw_conv_buffer_size_ * nthreads); + + if (jcp.oc != jcp.oc_without_padding) + scratchpad.book(key_dw_conv_padded_bias, sizeof(float) * jcp.oc); + } +} + +void jit_avx2_conv_bwd_data_kernel_f32::compute_loop(int ur_w, int l_overflow, + int r_overflow) { int kw = jcp.kw; int kh = jcp.kh; @@ -696,29 +689,37 @@ void jit_avx2_conv_bwd_data_kernel_f32::hsw_iter(int ur_w, int l_overflow, int ih = jcp.ih; int id = jcp.id; int ow = jcp.ow; - int stride_w = jcp.stride_w; - int stride_h = jcp.stride_h; int ic_block = jcp.ic_block; int oc_block = jcp.oc_block; int nb_ic_block = jcp.nb_ic_blocking; + int stride_w = jcp.stride_w; + int stride_h = jcp.stride_h; Label kd_loop, skip_kd_loop; + Label oc_loop, skip_oc_loop; for (int ii = 0; ii < nb_ic_block; ii++) for (int jj = 0; jj < ur_w; jj++) { - size_t offt = sizeof(float) * ((size_t)ii * id * ih * iw + jj) - * ic_block; - vmovups(Ymm(ur_w * ii + jj), - make_safe_addr(reg_dsrc, offt, reg_long_offt)); + uni_vpxor(Ymm(ur_w * ii + jj), Ymm(ur_w * ii + jj), + Ymm(ur_w * ii + jj)); } if (one_of(jcp.ndims, 3, 4)) { - mov(aux_reg_ddst, reg_ddst); - mov(aux_reg_kernel, reg_kernel); + cmp(reg_channel_work, 0); + jle(skip_oc_loop, T_NEAR); + xor_(reg_channel, reg_channel); + + mov(aux_reg_ddst_oc_loop, reg_ddst); + mov(aux_reg_kernel_oc_loop, reg_kernel); + + L(oc_loop); + mov(aux_reg_ddst, aux_reg_ddst_oc_loop); + mov(aux_reg_kernel, aux_reg_kernel_oc_loop); } if (jcp.ndims == 5) { + assert(jcp.nb_oc_blocking == 1); push(oi_iter); mov(reg_ki, ptr[this->param1 + GET_OFF(kd_padding)]); @@ -736,42 +737,46 @@ void jit_avx2_conv_bwd_data_kernel_f32::hsw_iter(int ur_w, int l_overflow, mov(aux_reg_kernel, aux_reg_ker_d); } - mov(kj, reg_kh); - - Label kh_label; - - L(kh_label); { + Label kh_loop, skip_kh_loop; + cmp(kj, 0); + jle(skip_kh_loop, T_NEAR); + L(kh_loop); { for (int ki = 0; ki < kw; ki++) { - int jj_start = nstl::max(0, l_overflow - (kw - 1) + ki) ; // 0; - int jj_end = ur_w - nstl::max(0, r_overflow - ki); // ur_w; + int jj_start = get_iw_start(ki, l_overflow); // 0; + int jj_end = get_iw_end(ur_w, ki, r_overflow); // ur_w; for (int ofm2 = 0; ofm2 < jcp.oc_block; ofm2++) { - for (int jj = jj_start; jj < jj_end; jj++) { - if ((jj - ki + jcp.l_pad + start_off) % stride_w == 0) { - int aux_output_offset = ((jj - ki + jcp.l_pad + start_off) / stride_w) * jcp.oc_block + ofm2; - vbroadcastss(Ymm(nb_ic_block * ur_w + jj), ptr[aux_reg_ddst + sizeof(float) * aux_output_offset]); - } + for (int jj = jj_start ; jj < jj_end; jj += stride_w) { + int aux_output_offset + = (jj + jcp.l_pad - ki) / stride_w * jcp.oc_block + ofm2; + vbroadcastss(Ymm(nb_ic_block * ur_w + jj / stride_w), + ptr[aux_reg_ddst + + sizeof(float) * aux_output_offset]); } - for (int ii = 0; ii < 
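
// The new init_scratchpad() entry point replaces the constructor-time
// malloc'ed buffers: every temporary is booked up front under a named
// key, sized from the final jcp, and handed out by the scratchpad at
// execution time. A toy registrar showing the shape of that contract
// (the types and key names below are stand-ins, not the real
// memory_tracking API):
#include <cstddef>
#include <map>

enum key_t { key_conv_padded_bias_k, key_dw_conv_buffer_k };

struct registrar_t {
    std::map<key_t, std::size_t> books;
    void book(key_t key, std::size_t bytes) { books[key] = bytes; }
};

void init_scratchpad_sketch(registrar_t &scratchpad, int oc,
        int oc_without_padding, bool with_dw_conv,
        std::size_t dw_row_elems, int nthreads) {
    if (oc != oc_without_padding)
        scratchpad.book(key_conv_padded_bias_k, sizeof(float) * oc);
    if (with_dw_conv) // one row buffer per thread, as booked above
        scratchpad.book(key_dw_conv_buffer_k,
                sizeof(float) * dw_row_elems * nthreads);
}
// Centralizing the booking lets the library lay all temporaries out in
// one allocation and keeps the primitive itself stateless.
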
nb_ic_block; ii++) { - int aux_kernel_offset = ii * kd * kh * kw * jcp.ic_block * jcp.oc_block + ki * jcp.ic_block * jcp.oc_block + ofm2 * jcp.ic_block; - vmovups(ymm15, ptr[aux_reg_kernel + sizeof(float) * aux_kernel_offset]); - - for (int jj = jj_start; jj < jj_end; jj++) { - if ((jj - ki + jcp.l_pad + start_off) % stride_w == 0) { - vfmadd231ps(Ymm(ur_w * ii + jj), Ymm(nb_ic_block * ur_w + jj), ymm15); - } - } + for (int ii = 0; ii < nb_ic_block; ii++) { + int aux_kernel_offset + = ii * kd * kh * kw * jcp.ic_block * jcp.oc_block + + ki * jcp.ic_block * jcp.oc_block + + ofm2 * jcp.ic_block; + vmovups(ymm15, + ptr[aux_reg_kernel + + sizeof(float) * aux_kernel_offset]); + for (int jj = jj_start; jj < jj_end; jj += stride_w) + vfmadd231ps(Ymm(ur_w * ii + jj), + Ymm(nb_ic_block * ur_w + jj / stride_w), ymm15); } } } - add(aux_reg_kernel, sizeof(float) * kw * oc_block * ic_block * stride_h); + add(aux_reg_kernel, sizeof(float) * stride_h * kw * oc_block + * ic_block); sub(aux_reg_ddst, sizeof(float) * ow * oc_block); - sub(kj, stride_h); + dec(kj); cmp(kj, 0); - jg(kh_label, T_NEAR); + jg(kh_loop, T_NEAR); } + L(skip_kh_loop); if (jcp.ndims == 5) { sub(aux_reg_dst_d, @@ -787,6 +792,39 @@ void jit_avx2_conv_bwd_data_kernel_f32::hsw_iter(int ur_w, int l_overflow, pop(oi_iter); } + if (one_of(jcp.ndims, 3, 4)) { + int ddst_oc_shift = sizeof(float) * jcp.od * jcp.oh * jcp.ow + * jcp.oc_block; + int kernel_oc_shift = sizeof(float) * jcp.kd * jcp.kh * jcp.kw + * jcp.ic * jcp.oc_block; + + add(aux_reg_ddst_oc_loop, ddst_oc_shift); + add(aux_reg_kernel_oc_loop, kernel_oc_shift); + + inc(reg_channel); + cmp(reg_channel, reg_channel_work); + jl(oc_loop, T_NEAR); + + L(skip_oc_loop); + mov(reg_channel, ptr[param1 + GET_OFF(channel)]); + } + + Label no_update_label; + cmp(reg_channel, 0); + je(no_update_label, T_NEAR); + for (int ii = 0; ii < nb_ic_block; ii++) { + for (int jj = 0; jj < ur_w; jj++) { + size_t offt = + sizeof(float) * ((size_t)ii * id * ih * iw + jj) * ic_block; + vmovups(Ymm(15), + make_safe_addr(reg_dsrc, offt, reg_long_offt)); + vaddps(Ymm(ur_w * ii + jj), Ymm(ur_w * ii + jj), + Ymm(15)); + + } + } + L(no_update_label); + for (int ii = 0; ii < nb_ic_block; ii++) for (int jj = 0; jj < ur_w; jj++) { size_t offt = @@ -799,79 +837,63 @@ void jit_avx2_conv_bwd_data_kernel_f32::hsw_iter(int ur_w, int l_overflow, void jit_avx2_conv_bwd_data_kernel_f32::generate() { preamble(); - auto hsw_iter_body = [=] (int ur_w, int l_overflow, int r_overflow) { - if (jcp.stride_w == 1) { - hsw_iter(ur_w, l_overflow, r_overflow, 0); - add(reg_dsrc, sizeof(float) * jcp.ur_w * jcp.ic_block); - add(reg_ddst, sizeof(float) * jcp.ur_w * jcp.oc_block); - } else { - Label hsw_iter_off_0; - Label hsw_iter_off_1; - Label hsw_iter_exit; - - int dst_off = jcp.ur_w / jcp.stride_w; - - and_(start_off_reg, 1); - - L(hsw_iter_off_0); { - cmp(start_off_reg, 0); - jg(hsw_iter_off_1, T_NEAR); - - hsw_iter(ur_w, l_overflow, r_overflow, 0); - add(reg_dsrc, sizeof(float) * jcp.ur_w * jcp.ic_block); - add(reg_ddst, sizeof(float) * dst_off * jcp.oc_block); - - jmp(hsw_iter_exit, T_NEAR); - } - - L(hsw_iter_off_1); { - hsw_iter(ur_w, l_overflow, r_overflow, 1); - add(reg_dsrc, sizeof(float) * jcp.ur_w * jcp.ic_block); - add(reg_ddst, sizeof(float) * (dst_off + 1) * jcp.oc_block); - } - - L(hsw_iter_exit); - add(start_off_reg, std::abs(jcp.ur_w - jcp.stride_w)); - } - }; - mov(reg_dsrc, ptr[this->param1 + GET_OFF(src)]); mov(reg_ddst, ptr[this->param1 + GET_OFF(dst)]); mov(reg_kernel, ptr[this->param1 + GET_OFF(filt)]); mov(reg_kh, 
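
// The rewritten inner loop relies on the fact that, in backward data,
// input column jj receives a contribution from kernel tap ki only when
// jj + l_pad - ki is a non-negative multiple of stride_w; rather than
// testing that per iteration (as the removed start_off variant
// effectively did), it starts from an aligned column and steps jj by
// stride_w. A scalar reference model of the condition, for one row and
// one channel, with names local to this sketch:
void bwd_data_row_ref(float *dsrc, const float *ddst, const float *wei,
        int iw, int ow, int kw, int l_pad, int stride_w) {
    for (int ki = 0; ki < kw; ki++)
        for (int jj = 0; jj < iw; jj++) {
            int num = jj + l_pad - ki;
            if (num < 0 || num % stride_w != 0) continue;
            int oj = num / stride_w;
            if (oj >= ow) continue;
            // The JIT kernel hoists the divisibility test by picking an
            // aligned jj_start (see get_iw_start/get_iw_end below) and
            // striding the loop by stride_w.
            dsrc[jj] += ddst[oj] * wei[ki];
        }
}
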
ptr[this->param1 + GET_OFF(kh_padding)]); + mov(reg_channel, ptr[param1 + GET_OFF(channel)]); + mov(reg_channel_work, ptr[param1 + GET_OFF(ch_blocks)]); - int n_oi = jcp.iw / jcp.ur_w; - xor_(oi_iter, oi_iter); - xor_(start_off_reg, start_off_reg); + int ddst_shift = sizeof(float) * (jcp.ur_w / jcp.stride_w) * jcp.ic_block; + int dsrc_shift = sizeof(float) * jcp.ur_w * jcp.oc_block; - int l_overflow = nstl::max(0, jcp.kw - 1 - jcp.l_pad); - if (l_overflow > 0) { - hsw_iter_body(jcp.ur_w, l_overflow, 0); - inc(oi_iter); - } + int l_overflow = nstl::max(0, (jcp.kw - 1 - jcp.l_pad) / jcp.stride_w); + int r_overflow = nstl::max(0, (jcp.kw - 1 + - nstl::max(0, jcp.r_pad)) / jcp.stride_w); + int r_overflow1 = nstl::max(0, (jcp.kw - 1 + - nstl::max(0, jcp.r_pad) - jcp.ur_w_tail) / jcp.stride_w); - int r_pad = jcp.iwp - jcp.iw - jcp.l_pad; - int r_overflow1 - = nstl::max(0, jcp.kw - 1 - (jcp.iw - jcp.ur_w * n_oi) - r_pad); - int r_overflow = nstl::max(0, jcp.kw - 1 - r_pad); + int n_oi = jcp.iw / jcp.ur_w; if (r_overflow1 > 0) n_oi--; - if ((l_overflow <= 0 && n_oi > 0) || (l_overflow > 0 && n_oi > 1)) { - Label ow_loop; - L(ow_loop); { - hsw_iter_body(jcp.ur_w, 0, 0); + if (jcp.ur_w == jcp.iw) { + compute_loop(jcp.ur_w, l_overflow, r_overflow); + } else if (n_oi == 0) { + compute_loop(jcp.ur_w, l_overflow, r_overflow1); + add(reg_dsrc, dsrc_shift); + add(reg_ddst, ddst_shift); + if (jcp.ur_w_tail != 0) + compute_loop(jcp.ur_w_tail, 0, r_overflow); + } else { + xor_(oi_iter, oi_iter); + if (l_overflow > 0) { + compute_loop(jcp.ur_w, l_overflow, 0); + add(reg_dsrc, dsrc_shift); + add(reg_ddst, ddst_shift); inc(oi_iter); - cmp(oi_iter, n_oi); - jl(ow_loop, T_NEAR); } - } - if (r_overflow1 > 0 ) - hsw_iter_body(jcp.ur_w, 0, r_overflow1); + if ((l_overflow <= 0 && n_oi > 0) || (l_overflow > 0 && n_oi > 1)) { + Label ow_loop; + L(ow_loop); { + compute_loop(jcp.ur_w, 0, 0); + add(reg_dsrc, dsrc_shift); + add(reg_ddst, ddst_shift); + inc(oi_iter); + cmp(oi_iter, n_oi); jl(ow_loop, T_NEAR); + } + } - if (jcp.ur_w_tail != 0) - hsw_iter_body(jcp.ur_w_tail, 0, r_overflow); + if (r_overflow1 > 0 ) { + compute_loop(jcp.ur_w, 0, r_overflow1); + add(reg_dsrc, dsrc_shift); + add(reg_ddst, ddst_shift); + } + + if (jcp.ur_w_tail != 0) + compute_loop(jcp.ur_w_tail, 0, r_overflow); + } this->postamble(); } @@ -930,6 +952,10 @@ status_t jit_avx2_conv_bwd_data_kernel_f32::init_conf(jit_conv_conf_t &jcp, bool ok_to_pad_channels = true && jcp.ngroups == 1; + /* gemm-based convolution performs better in these cases */ + if (jcp.ic < simd_w && jcp.kw > 3 && jcp.stride_w > 1) + return status::unimplemented; + if (ok_to_pad_channels) { jcp.oc = rnd_up(jcp.oc, simd_w); jcp.ic = rnd_up(jcp.ic, simd_w); @@ -945,16 +971,19 @@ status_t jit_avx2_conv_bwd_data_kernel_f32::init_conf(jit_conv_conf_t &jcp, jcp.ur_h = 1; /* no code-unrolling by h so far */ jcp.nb_ic_blocking = 1; jcp.nb_oc_blocking = 1; + jcp.ur_w = 1; + + if(one_of(ndims, 3, 4) && jcp.ow < 40) + jcp.nb_oc_blocking = jcp.ow < 15 ? 
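
// generate() now emits one uniform block schedule over the iw axis in
// place of the old even/odd start_off specialization: up to n_oi full
// ur_w blocks, left overflow confined to the first block, right
// overflow to the last full block, plus an optional tail. A host-side
// sketch of the dispatch it encodes, with compute_block standing in for
// the jitted compute_loop and the pointer-advance adds elided:
void drive_iw_blocks(int iw, int ur_w, int ur_w_tail, int l_overflow,
        int r_overflow, int r_overflow1,
        void (*compute_block)(int ur_w, int l_ov, int r_ov)) {
    int n_oi = iw / ur_w;
    if (r_overflow1 > 0) n_oi--;

    if (ur_w == iw) { // a single block covers the whole row
        compute_block(ur_w, l_overflow, r_overflow);
    } else if (n_oi == 0) { // one block plus tail
        compute_block(ur_w, l_overflow, r_overflow1);
        if (ur_w_tail != 0) compute_block(ur_w_tail, 0, r_overflow);
    } else {
        int oi = 0;
        if (l_overflow > 0) { compute_block(ur_w, l_overflow, 0); oi++; }
        for (; oi < n_oi; oi++) compute_block(ur_w, 0, 0);
        if (r_overflow1 > 0) compute_block(ur_w, 0, r_overflow1);
        if (ur_w_tail != 0) compute_block(ur_w_tail, 0, r_overflow);
    }
}
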
4 : 2; jcp.src_fmt = diff_src_d.format(); - jcp.with_eltwise = false; bool args_ok = true && one_of(diff_src_d.format(), nCw8c, nChw8c, nCdhw8c) && one_of(weights_d.format(), gOIw8o8i, OIw8i8o, gOIhw8o8i, OIhw8o8i, gOIdhw8o8i, OIdhw8o8i) && one_of(diff_dst_d.format(), nCw8c, nChw8c, nCdhw8c) - && (jcp.stride_w == 1 || jcp.stride_w == 2) + && jcp.stride_w == jcp.stride_h && jcp.stride_d == 1 && jcp.dilate_d == 0 && jcp.dilate_h == 0 @@ -965,34 +994,69 @@ status_t jit_avx2_conv_bwd_data_kernel_f32::init_conf(jit_conv_conf_t &jcp, && jcp.oh == (jcp.ihp - jcp.kh) / jcp.stride_h + 1 && jcp.ow == (jcp.iwp - jcp.kw) / jcp.stride_w + 1; if (!args_ok) return status::unimplemented; + jcp.r_pad = (jcp.ow - 1) * jcp.stride_w + jcp.kw - jcp.iw - jcp.l_pad; + jcp.b_pad = (jcp.oh - 1) * jcp.stride_h + jcp.kh - jcp.ih - jcp.t_pad; + int l_overflow = nstl::max(0, (jcp.kw - 1 - jcp.l_pad) / jcp.stride_w); + + const int max_regs = 15; /* Maximum number of registers available for result accumulation and delta dst data. One additional register is reserved for weights data. */ + + /* Find the best blocking with maximum number of fma instructions per ur_w * nb_ic_blocking compute loops. Number of required registers is num_regs = ur_w * nb_ic_blocking + ur_w / stride_w <= max_regs. ur_w must be divisible by stride_w */ + if (jcp.stride_w + 1 > max_regs) /* Minimal possible registers distribution exceeds max_regs */ + return status::unimplemented; - jcp.ur_w = 3; - - for (int b = 4; b > 1; b--) + int best_nfmas = 0; + for (int b = 1; b <= 4; b++) { - if (jcp.nb_ic % b == 0) + if (jcp.nb_ic % b != 0) + continue; + + for (int u = jcp.stride_w; + u * b + u / jcp.stride_w <= max_regs && u < jcp.iw + jcp.stride_w; + u += jcp.stride_w) { - jcp.nb_ic_blocking = b; - break; + int ur_w = nstl::min(u, jcp.iw); + /* maximum 1 step with l_overflow so far */ + if (l_overflow * jcp.stride_w > ur_w && ur_w != jcp.iw) + continue; + int nfmas = utils::div_up(ur_w, jcp.stride_w) * b; + if (nfmas > best_nfmas + || (nfmas == best_nfmas && jcp.ur_w < ur_w)) { + jcp.ur_w = ur_w; + jcp.nb_ic_blocking = b; + best_nfmas = nfmas; + } } } + if (best_nfmas == 0) /* can't find appropriate blocking */ + return status::unimplemented; jcp.ur_w_tail = jcp.iw % jcp.ur_w; - int l_overflow = nstl::max(0, jcp.kw - 1 - jcp.l_pad); - if (l_overflow > jcp.ur_w) /* maximum 1 step with l_overflow so far */ - return status::unimplemented; - int r_pad = jcp.iwp - jcp.iw - jcp.l_pad; - int r_overflow_step0 = nstl::max(0, jcp.kw - 1 - (jcp.iw - jcp.ur_w) - r_pad); - if (l_overflow > 0 && r_overflow_step0 > 0) /* no steps with both left and - right overflow so far */ + + int r_overflow_no_tail = nstl::max(0, (jcp.kw - 1 - jcp.ur_w_tail + - nstl::max(0, jcp.r_pad) - jcp.ur_w_tail) / jcp.stride_w); + /* maximum 1 ur_w block with r_overflow so far */ + if (r_overflow_no_tail * jcp.stride_w > jcp.ur_w) return status::unimplemented; - int r_overflow_no_tail = nstl::max(0,jcp.kw - 1 - jcp.ur_w_tail - r_pad); - if (r_overflow_no_tail > jcp.ur_w) /* maximum 1 ur_w block with r_overflow so far */ + + if ((jcp.iw > jcp.ur_w) && (jcp.ur_w % jcp.stride_w != 0)) return status::unimplemented; + return status::success; } +void jit_avx2_conv_bwd_data_kernel_f32::init_scratchpad( + memory_tracking::registrar_t &scratchpad, const jit_conv_conf_t &jcp) { + UNUSED(scratchpad); + UNUSED(jcp); +} + void jit_avx2_conv_bwd_weights_kernel_f32::generate() { this->preamble(); @@ -1045,8 +1109,6 @@ status_t jit_avx2_conv_bwd_weights_kernel_f32::init_conf(jit_conv_conf_t
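
// The search above replaces the fixed ur_w = 3 with a scan over
// (nb_ic_blocking, ur_w) pairs that maximizes FMA work per compute loop
// under the register budget ur_w * nb_ic_blocking + ur_w / stride_w
// <= max_regs. The same logic as a free function, with the jcp fields
// passed explicitly:
#include <algorithm>

bool pick_blocking(int nb_ic, int iw, int stride_w, int l_overflow,
        int max_regs, int &ur_w, int &nb_ic_blocking) {
    ur_w = 1;
    nb_ic_blocking = 1;
    int best_nfmas = 0;
    for (int b = 1; b <= 4; b++) {
        if (nb_ic % b != 0) continue;
        for (int u = stride_w;
                u * b + u / stride_w <= max_regs && u < iw + stride_w;
                u += stride_w) {
            int cur = std::min(u, iw);
            // at most one block may carry left overflow
            if (l_overflow * stride_w > cur && cur != iw) continue;
            int nfmas = (cur + stride_w - 1) / stride_w * b;
            if (nfmas > best_nfmas
                    || (nfmas == best_nfmas && ur_w < cur)) {
                ur_w = cur;
                nb_ic_blocking = b;
                best_nfmas = nfmas;
            }
        }
    }
    return best_nfmas != 0; // false: no feasible blocking exists
}
// Ties are broken toward the wider ur_w, which amortizes each weights
// load over more output columns.
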
&jcp, jcp.src_fmt = src_d.format(); jcp.with_bias = cd.diff_bias_desc.format != memory_format::undef; - jcp.with_eltwise = false; - jcp.eltwise_alpha = 0; const bool flat = jcp.ic == 3; const bool mimo = !flat; @@ -1097,9 +1159,16 @@ status_t jit_avx2_conv_bwd_weights_kernel_f32::init_conf(jit_conv_conf_t &jcp, jcp.oc_block = simd_w; jcp.nb_oc = jcp.oc / jcp.oc_block; jcp.nb_ic_blocking = jcp.nb_oc_blocking = 1; + return status::success; } +void jit_avx2_conv_bwd_weights_kernel_f32::init_scratchpad( + memory_tracking::registrar_t &scratchpad, const jit_conv_conf_t &jcp) { + if (jcp.with_bias && jcp.oc != jcp.oc_without_padding) + scratchpad.book(key_conv_padded_bias, sizeof(float) * jcp.oc); +} + inline void jit_avx2_conv_bwd_weights_kernel_f32::od_step_comeback_pointers() { Label kd_comeback_loop; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_conv_kernel_f32.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_conv_kernel_f32.hpp index f370054..0c4eb31 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_conv_kernel_f32.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_conv_kernel_f32.hpp @@ -18,9 +18,11 @@ #define JIT_AVX2_CONV_KERNEL_F32_HPP #include "c_types_map.hpp" +#include "memory_tracking.hpp" + +#include "cpu_memory.hpp" #include "jit_generator.hpp" #include "jit_primitive_conf.hpp" -#include "cpu_memory.hpp" #include "jit_uni_eltwise.hpp" #include "jit_uni_depthwise.hpp" @@ -29,8 +31,9 @@ namespace impl { namespace cpu { struct jit_avx2_conv_fwd_kernel_f32: public jit_generator { - jit_avx2_conv_fwd_kernel_f32(jit_conv_conf_t ajcp, - const primitive_attr_t &attr): jcp(ajcp), attr_(attr) + jit_avx2_conv_fwd_kernel_f32(jit_conv_conf_t ajcp, jit_conv_conf_t ajcp_dw, + const primitive_attr_t &attr) + : jcp(ajcp), jcp_dw(ajcp_dw), attr_(attr) { this->generate(); jit_ker = (void (*)(jit_conv_call_s *))this->getCode(); @@ -54,11 +57,12 @@ struct jit_avx2_conv_fwd_kernel_f32: public jit_generator { const convolution_desc_t &cd, const memory_desc_wrapper &src_d, const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d, - const primitive_attr_t &attr, - bool with_relu = false, - float relu_negative_slope = 0.); + const primitive_attr_t &attr); + static void init_scratchpad(memory_tracking::registrar_t &scratchpad, + const jit_conv_conf_t &jcp, const jit_conv_conf_t &jcp_dw = jit_conv_conf_t()); jit_conv_conf_t jcp; + jit_conv_conf_t jcp_dw; const primitive_attr_t &attr_; void (*jit_ker)(jit_conv_call_s *); @@ -84,7 +88,7 @@ private: reg64_t reg_long_offt = r15; Xbyak::Reg32 reg_ci_flag = r13d; - Xbyak::Ymm ymask = Xbyak::Ymm(14); + Xbyak::Ymm ytmp = Xbyak::Ymm(14); reg64_t reg_d_weights = imm_addr64; reg64_t reg_d_bias = ki_iter; @@ -116,6 +120,8 @@ struct jit_avx2_conv_bwd_data_kernel_f32: public jit_generator { const convolution_desc_t &cd, const memory_desc_wrapper &diff_src_d, const memory_desc_wrapper &weights_d, const memory_desc_wrapper &diff_dst_d); + static void init_scratchpad(memory_tracking::registrar_t &scratchpad, + const jit_conv_conf_t &jcp); jit_conv_conf_t jcp; void (*jit_ker)(jit_conv_call_s *); @@ -123,33 +129,52 @@ struct jit_avx2_conv_bwd_data_kernel_f32: public jit_generator { private: using reg64_t = const Xbyak::Reg64; - reg64_t reg_input = rax; reg64_t reg_ddst = rax; - reg64_t aux_reg_input = r8; reg64_t aux_reg_ddst = r8; - reg64_t aux1_reg_input = r9; reg64_t reg_kernel = rdx; reg64_t aux_reg_kernel = r10; - reg64_t reg_output = rsi; reg64_t reg_dsrc = rsi; - reg64_t aux_reg_output = rbx; - 
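
// For orientation in the header changes above: every kernel struct JITs
// its machine code once in the constructor (generate() emits into the
// Xbyak buffer, getCode() is cast to a raw function pointer) and the
// driver then calls jit_ker per work item. The idiom in miniature, with
// the code buffer faked by a plain function so this sketch stands alone:
struct call_params_t { const float *src; float *dst; int len; };
using kernel_fn_t = void (*)(call_params_t *);

// Stand-in for what generate() would emit; the real kernel returns
// getCode() cast to kernel_fn_t instead.
static void emitted_body(call_params_t *p) {
    for (int i = 0; i < p->len; ++i) p->dst[i] = p->src[i];
}

struct kernel_t {
    kernel_fn_t jit_ker;
    kernel_t() { jit_ker = generate(); }
private:
    static kernel_fn_t generate() { return emitted_body; }
};
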
reg64_t aux_reg_dsrc = rbx; + reg64_t aux_reg_ddst_oc_loop = rbx; // used in ndims < 5 case only + reg64_t aux_reg_kernel_oc_loop = abi_not_param1; /* used in ndims < 5 + case only */ - reg64_t aux_reg_dst_d = r12; - reg64_t aux_reg_ker_d = r14; + reg64_t aux_reg_dst_d = r12; // used in ndims == 5 case only + reg64_t aux_reg_ker_d = r14; // used in ndims == 5 case only - reg64_t reg_ki = abi_not_param1; + reg64_t reg_ki = abi_not_param1; // used in ndims == 5 case only reg64_t kj = r11; reg64_t oi_iter = r12; reg64_t reg_kh = r14; - reg64_t ki_iter = r13; + reg64_t reg_channel = r13; // used in ndims < 5 case only + reg64_t reg_channel_work = r9; // used in ndims < 5 case only reg64_t reg_long_offt = r15; - reg64_t start_off_reg = aux1_reg_input; - inline void hsw_iter(int ur_w, int l_overflow, int r_overflow, - int start_off); + inline void compute_loop(int ur_w, int l_overflow, int r_overflow); void generate(); + + inline int get_iw_start(int ki, int l_overflow) + { + int res = (jcp.iw - 1 + jcp.r_pad) % jcp.stride_w + + l_overflow * jcp.stride_w + - (jcp.kw - 1 - ki) * (jcp.dilate_w + 1); + while (res < 0) + res += jcp.stride_w; + + return res; + } + + inline int get_iw_end(int ur_w, int ki, int r_overflow) + { + if (utils::one_of(ur_w, jcp.iw, jcp.ur_w_tail)) + ur_w += nstl::min(0, jcp.r_pad); // remove negative padding + int res = (ur_w - 1 + jcp.l_pad) % jcp.stride_w + + r_overflow * jcp.stride_w - ki * (jcp.dilate_w + 1); + while (res < 0) + res += jcp.stride_w; + + return ur_w - res; + } }; struct jit_avx2_conv_bwd_weights_kernel_f32: public jit_generator { @@ -165,6 +190,8 @@ struct jit_avx2_conv_bwd_weights_kernel_f32: public jit_generator { const convolution_desc_t &cd, const memory_desc_wrapper &src_d, const memory_desc_wrapper &diff_weights_d, const memory_desc_wrapper &diff_dst_d); + static void init_scratchpad(memory_tracking::registrar_t &scratchpad, + const jit_conv_conf_t &jcp); jit_conv_conf_t jcp; void (*jit_ker)(jit_conv_call_s *); diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_convolution.cpp index e9ccf6f..d7ea64b 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_convolution.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_convolution.cpp @@ -14,14 +14,13 @@ * limitations under the License. *******************************************************************************/ -#include -#include "mkldnn_types.h" - #include "c_types_map.hpp" -#include "jit_avx2_convolution.hpp" -#include "utils.hpp" #include "mkldnn_thread.hpp" #include "type_helpers.hpp" +#include "utils.hpp" +#include + +#include "jit_avx2_convolution.hpp" namespace mkldnn { namespace impl { @@ -29,39 +28,38 @@ namespace cpu { using namespace mkldnn::impl::status; using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; using namespace mkldnn::impl::utils; - #define src_blk_off(f, n, c, d, h, w) \ - (conf_.ndims() == 3) \ + (pd()->ndims() == 3) \ ? (f).blk_off(n, c, w) \ - : (conf_.ndims() == 4) \ + : (pd()->ndims() == 4) \ ? (f).blk_off(n, c, h, w) \ : (f).blk_off(n, c, d, h, w) #define wht_blk_off_(f, g, ...) \ - conf_.with_groups() ? (f).blk_off(g, __VA_ARGS__) : (f).blk_off(__VA_ARGS__) + pd()->with_groups() ? (f).blk_off(g, __VA_ARGS__) : (f).blk_off(__VA_ARGS__) #define wht_blk_off(f, g, oc, ic, kd, kh, kw) \ - (conf_.ndims() == 3) \ + (pd()->ndims() == 3) \ ? 
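
// The get_iw_start()/get_iw_end() helpers added above encode the stride
// alignment used by compute_loop: modulo stride_w, the returned start
// column satisfies jj = ki * (dilate_w + 1) - l_pad, so every column the
// loop visits maps onto a real output column. A free-function copy of
// get_iw_start() with one concrete spot check (the geometry is ad hoc
// but self-consistent: iw = 7, l_pad = r_pad = 1, 3-tap kernel, stride 2):
#include <cassert>

int iw_start(int ki, int l_overflow, int iw, int r_pad, int stride_w,
        int kw, int dilate_w) {
    int res = (iw - 1 + r_pad) % stride_w + l_overflow * stride_w
            - (kw - 1 - ki) * (dilate_w + 1);
    while (res < 0) res += stride_w;
    return res;
}

int main() {
    // For ki = 0 only odd columns align (jj + l_pad must be even), so
    // the loop must start at column 1.
    assert(iw_start(0, 0, 7, 1, 2, 3, 0) == 1);
    return 0;
}
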
wht_blk_off_(f, g, oc, ic, kw) \ - : (conf_.ndims() == 4) \ + : (pd()->ndims() == 4) \ ? wht_blk_off_(f, g, oc, ic, kh, kw) \ : wht_blk_off_(f, g, oc, ic, kd, kh, kw) -template -void _jit_avx2_convolution_fwd_t::execute_forward() { +void jit_avx2_convolution_fwd_t::execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); - const memory_desc_wrapper bias_d(conf_.weights_pd(1)); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + const memory_desc_wrapper bias_d(pd()->weights_pd(1)); const auto &jcp = kernel_->jcp; - const int MB = conf_.MB(); + const int MB = pd()->MB(); int ocb_work = div_up(jcp.nb_oc, jcp.nb_oc_blocking); const size_t work_amount = MB * jcp.ngroups * ocb_work * jcp.od @@ -86,7 +84,7 @@ void _jit_avx2_convolution_fwd_t::execute_forward() { int ocb_num = jcp.nb_oc_blocking; for (int icb = icbb; icb < icbb + icb_step; ++icb) { - jit_conv_call_s par_conv = {}; + auto par_conv = jit_conv_call_s(); const int ij = oh * jcp.stride_h; const int i_t_overflow = nstl::max(0, jcp.t_pad - ij); @@ -99,7 +97,7 @@ void _jit_avx2_convolution_fwd_t::execute_forward() { + (jcp.kd-1) * (jcp.dilate_d+1) - jcp.f_pad+1) - jcp.id; const size_t _oc = g * jcp.nb_oc + ocb; - const size_t _ic = g * jcp.nb_ic + icb; + const size_t _ic = g * jcp.nb_ic * jcp.nonblk_group_off + icb; const int ih = nstl::max(ij - jcp.t_pad + div_up(i_t_overflow, @@ -155,31 +153,35 @@ void _jit_avx2_convolution_fwd_t::execute_forward() { } }; - if (conf_.want_padded_bias()) { - for (int oc = 0; oc < jcp.oc_without_padding; ++oc) - padded_bias_[oc] = bias[oc]; - bias = padded_bias_; + if (pd()->wants_padded_bias()) { + auto padded_bias = scratchpad().get(key_conv_padded_bias); + utils::array_copy(padded_bias, bias, jcp.oc_without_padding); + utils::array_set(padded_bias + jcp.oc_without_padding, 0.f, + jcp.oc - jcp.oc_without_padding); + bias = padded_bias; } parallel(0, ker); + + if (pd()->wants_zero_pad_dst()) + output_memory_primitive(0)->zero_pad(); } -template -void _jit_avx2_convolution_fwd_t::execute_forward_fusing() { +void jit_avx2_convolution_fwd_t::execute_forward_with_dw_conv() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); - const memory_desc_wrapper bias_d(conf_.weights_pd(1)); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + const memory_desc_wrapper bias_d(pd()->weights_pd(1)); const auto &jcp = kernel_->jcp; const auto &jcp_dw = kernel_dw_->jcp; - const int MB = conf_.MB(); + const int MB = pd()->MB(); - auto dw_bias = jcp.dw_conv_biases; + auto dw_bias = jcp_dw.conv_biases; int ocb_work = div_up(jcp.nb_oc, jcp.nb_oc_blocking); const size_t work_amount = MB * jcp.ngroups * ocb_work * jcp.oh; @@ -189,8 +191,8 @@ void _jit_avx2_convolution_fwd_t::execute_forward_fusing() { for (int h = 0; h < num_rows; h++) { if ((oh + h) < 0 || (oh 
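
// The src_blk_off/wht_blk_off macros above simply switch the blk_off()
// arity on pd()->ndims() so one execute body serves 1D, 2D and 3D
// convolutions. The same dispatch as a template sketch, assuming a
// descriptor type that exposes ndims() and the matching blk_off()
// overloads:
#include <cstddef>

template <typename MD>
std::size_t src_off(const MD &md, int n, int c, int d, int h, int w) {
    switch (md.ndims()) {
    case 3: return md.blk_off(n, c, w);        // NCW
    case 4: return md.blk_off(n, c, h, w);     // NCHW
    default: return md.blk_off(n, c, d, h, w); // NCDHW
    }
}
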
+ h) >= jcp.oh) { for (int chb = ocb; chb < ocb + ocb_num; chb++) { - memset(ws_p + (((oh + h) + 1) % jcp.dw_conv_ker_h) * jcp.ow * jcp.oc_block + - (chb - ocb) * jcp.dw_conv_ker_h * jcp.ow * jcp.oc_block, 0, jcp.ow * jcp.oc_block * sizeof(float)); + memset(ws_p + (((oh + h) + 1) % jcp_dw.kh) * jcp.ow * jcp.oc_block + + (chb - ocb) * jcp_dw.kh * jcp.ow * jcp.oc_block, 0, jcp.ow * jcp.oc_block * sizeof(float)); } } else { for (int icb = 0; icb < jcp.nb_ic; ++icb) { @@ -211,11 +213,11 @@ void _jit_avx2_convolution_fwd_t::execute_forward_fusing() { par_conv.src = &src[src_d.blk_off(n, jcp.ic == 3 ? 0 : _ic, ih, 0)]; - par_conv.dst = &ws_p[(((oh + h) + 1) % jcp.dw_conv_ker_h) * jcp.ow * + par_conv.dst = &ws_p[(((oh + h) + 1) % jcp_dw.kh) * jcp.ow * jcp.oc_block]; const int wh = div_up(i_t_overflow, (jcp.dilate_h + 1)); - par_conv.filt = &weights[conf_.with_groups() + par_conv.filt = &weights[pd()->with_groups() ? weights_d.blk_off(g, ocb, jcp.ic == 3 ? 0 : icb, wh, 0) : weights_d.blk_off(ocb, @@ -264,9 +266,11 @@ void _jit_avx2_convolution_fwd_t::execute_forward_fusing() { dst_idx/jcp_dw.stride_h*jcp_dw.ow*jcp_dw.ch_block]; par_conv_dw.kh_padding = jcp_dw.kh; - par_conv_dw.filt = &jcp.dw_conv_weights[chb * jcp_dw.kh * jcp_dw.kw * jcp_dw.ch_block]; + par_conv_dw.filt = &jcp_dw.conv_weights[chb * jcp_dw.kh * jcp_dw.kw * jcp_dw.ch_block]; par_conv_dw.bias = &dw_bias[chb * jcp_dw.ch_block]; par_conv_dw.ur_w = (size_t)(jcp_dw.ow); + par_conv_dw.oc_work = nstl::min((chb + 1) * jcp_dw.ch_block, (int)jcp_dw.oc) - chb*jcp_dw.ch_block; + par_conv_dw.oc_off = chb * jcp_dw.ch_block * sizeof(float); kernel_dw_->jit_ker(&par_conv_dw); } @@ -275,7 +279,9 @@ void _jit_avx2_convolution_fwd_t::execute_forward_fusing() { size_t start{0}, end{0}; balance211(work_amount, nthr, ithr, start, end); - auto pbuf = dw_conv_buffer_ + ithr * dw_conv_buffer_size_; + auto dw_conv_buffer = scratchpad().get(key_dw_conv_buffer); + size_t dw_conv_buffer_size_ = (size_t)jcp_dw.kh * jcp_dw.iw * jcp_dw.ch_block * jcp.nb_oc_blocking; + auto pbuf = dw_conv_buffer + ithr * dw_conv_buffer_size_; size_t n{0}, g{0}, ocbb{0}, oh{0}; nd_iterator_init(start, n, MB, g, jcp.ngroups, ocbb, ocb_work, @@ -304,138 +310,156 @@ void _jit_avx2_convolution_fwd_t::execute_forward_fusing() { } }; - if (conf_.want_padded_bias()) { - for (int oc = 0; oc < jcp.oc_without_padding; ++oc) - padded_bias_[oc] = bias[oc]; - bias = padded_bias_; - - for (int oc = 0; oc < jcp.oc_without_padding; ++oc) - dw_padded_bias_[oc] = dw_bias[oc]; - dw_bias = dw_padded_bias_; + if (pd()->wants_padded_bias()) { + auto padded_bias = scratchpad().get(key_conv_padded_bias); + utils::array_copy(padded_bias, bias, jcp.oc_without_padding); + utils::array_set(padded_bias + jcp.oc_without_padding, 0.f, + jcp.oc - jcp.oc_without_padding); + bias = padded_bias; + + auto dw_padded_bias = scratchpad().get(key_dw_conv_padded_bias); + utils::array_copy(dw_padded_bias, dw_bias, jcp.oc_without_padding); + utils::array_set(dw_padded_bias + jcp.oc_without_padding, 0.f, + jcp.oc - jcp.oc_without_padding); + dw_bias = dw_padded_bias; } parallel(0, ker); -} -template void _jit_avx2_convolution_fwd_t::execute_forward(); -template void _jit_avx2_convolution_fwd_t::execute_forward(); -template void _jit_avx2_convolution_fwd_t::execute_forward_fusing(); -template void _jit_avx2_convolution_fwd_t::execute_forward_fusing(); + if (pd()->wants_zero_pad_dst()) + output_memory_primitive(0)->zero_pad(); +} -void jit_avx2_convolution_bwd_data_t::execute_backward_data() { +void 
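
// The wants_padded_bias() paths above now stage the user bias in
// scratchpad memory: the real channels are copied and the tail up to
// the padded channel count is zero-filled, so the vectorized kernel can
// always load a full simd word of bias. The idiom in plain C++
// (array_copy/array_set are small mkl-dnn helpers; memcpy plus a loop
// do the same here):
#include <cstring>

void pad_bias(float *padded, const float *bias, int oc_without_padding,
        int oc_padded) {
    std::memcpy(padded, bias, sizeof(float) * oc_without_padding);
    for (int oc = oc_without_padding; oc < oc_padded; ++oc)
        padded[oc] = 0.f; // tail lanes contribute nothing
}
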
jit_avx2_convolution_bwd_data_t::execute_backward_data() const { auto diff_dst = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto diff_src = reinterpret_cast(this->memory()); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper diff_src_d(conf_.diff_src_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper diff_src_d(pd()->diff_src_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); const auto &jcp = kernel_->jcp; - const int MB = conf_.MB(); + const int MB = pd()->MB(); int icb_work = jcp.nb_ic / jcp.nb_ic_blocking; - const size_t work_amount = MB * jcp.ngroups * icb_work * jcp.ih; + int ih_block_size = jcp.ih; + int num_ih_blocks = utils::div_up(jcp.ih, ih_block_size); + size_t work_amount = MB * jcp.ngroups * icb_work * num_ih_blocks; + if (work_amount < (size_t)2 * mkldnn_get_max_threads()) { + ih_block_size = 1; + num_ih_blocks = utils::div_up(jcp.ih, ih_block_size); + work_amount *= num_ih_blocks; + } auto ker = [&](const int ithr, const int nthr) { size_t start{0}, end{0}; balance211(work_amount, nthr, ithr, start, end); - size_t n{0}, g{0}, icbb{0}, ih{0}; - nd_iterator_init(start, n, MB, g, jcp.ngroups, icbb, icb_work, ih, jcp.ih); + size_t n{0}, g{0}, icbb{0}, ihb{0}; + nd_iterator_init(start, n, MB, g, jcp.ngroups, icbb, icb_work, + ihb, num_ih_blocks); + for (size_t iwork = start; iwork < end; ++iwork) { - for (int oc = 0; oc < jcp.nb_oc; ++oc) + for (int oc = 0; oc < jcp.nb_oc; oc += jcp.nb_oc_blocking) for (int id = 0; id < jcp.id; ++id) { auto par_conv = jit_conv_call_s(); const int idp = jcp.id + 2 * jcp.f_pad; const int d_t_overflow = nstl::max(0, - jcp.kd - 1 - id - jcp.f_pad); + jcp.kd - 1 - id - jcp.f_pad); const int back_pad = idp - jcp.id - jcp.f_pad; const int d_b_overflow = nstl::max(0, - jcp.kd - 1 - (jcp.id - 1 - id) - back_pad); + jcp.kd - 1 - (jcp.id - 1 - id) - back_pad); const int od = id + jcp.f_pad - d_b_overflow; - const int simd_w = 8; - - const int i_t_overflow = nstl::max(0, - jcp.kh - 1 - (int)ih - jcp.t_pad); - const int b_pad = jcp.ihp - jcp.ih - jcp.t_pad; - const int i_b_overflow = nstl::max(0, - jcp.kh - 1 - (jcp.ih - 1 - (int)ih) - b_pad); - int oh = ih + jcp.t_pad - i_b_overflow; - - int stride_off_h = oh % jcp.stride_h; - oh /= jcp.stride_h; - - par_conv.src = &diff_src[src_blk_off(diff_src_d, n, - /*jcp.ic == 3 ? 0 :*/ - g * jcp.nb_ic + jcp.nb_ic_blocking * icbb, id, ih, 0)]; - par_conv.dst = &diff_dst[src_blk_off(diff_dst_d, - n, g * jcp.nb_oc + oc, od, oh, 0)]; - par_conv.filt = &weights[wht_blk_off(weights_d, g, oc, - jcp.ic == 3 ? 0 : jcp.nb_ic_blocking * icbb, - d_b_overflow, i_b_overflow + stride_off_h, 0)]; - - par_conv.src_prf = nullptr; - par_conv.dst_prf = nullptr; - par_conv.filt_prf = nullptr; - // TODO: move initialization into the kernel - if (oc == 0) { - for (int iw = 0; iw < jcp.iw; iw++) { - for (int b = 0; b < jcp.nb_ic_blocking; b++) { - int current_ic = - (jcp.ic == 3 ? 
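
// The backward-data driver above first tries whole-image work items
// (one per (mb, group, icb) triple) and switches to per-row items only
// when the coarse count would leave threads idle. The planning step in
// isolation, as a hypothetical free function using the same
// two-items-per-thread heuristic:
#include <cstddef>

static std::size_t div_up_sz(std::size_t a, std::size_t b) {
    return (a + b - 1) / b;
}

std::size_t plan_bwd_data_work(int mb, int ngroups, int icb_work, int ih,
        int nthr, int &ih_block_size) {
    ih_block_size = ih; // start with whole-image granularity
    std::size_t work = (std::size_t)mb * ngroups * icb_work
            * div_up_sz(ih, ih_block_size);
    if (work < (std::size_t)2 * nthr) {
        ih_block_size = 1; // finer: one row per work item
        work = (std::size_t)mb * ngroups * icb_work
                * div_up_sz(ih, ih_block_size);
    }
    return work;
}
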
0 : g * jcp.nb_ic) - + jcp.nb_ic_blocking * icbb + b; - int current_idx = - src_blk_off(diff_src_d, n, current_ic, - id, ih, iw); - for (int v = 0; v < simd_w; v++) - diff_src[current_idx + v] = 0.0; - } - } - } + int ih_start = ihb * ih_block_size; + int ih_end = nstl::min(jcp.ih, ih_start + ih_block_size); + for (int ih = ih_start; ih < ih_end; ++ih) { + + const int i_t_overflow = nstl::max(0, (jcp.kh - 1 + - ih - jcp.t_pad) / jcp.stride_h); + const int i_b_overflow = nstl::max(0, (jcp.kh - jcp.ih + + ih - jcp.b_pad) / jcp.stride_h); + int overflow_kh_hi = jcp.kh - 1 - abs((jcp.ih - 1 + + jcp.b_pad - ih) % jcp.stride_h); + int overflow_kh_lo = (ih + jcp.t_pad) % jcp.stride_h; + + par_conv.kd_padding = jcp.kd - d_t_overflow - d_b_overflow; + par_conv.kh_padding = (overflow_kh_hi - overflow_kh_lo) + / jcp.stride_h + 1 - i_t_overflow - i_b_overflow; + par_conv.kw_padding = 0; - par_conv.kd_padding = jcp.kd - d_t_overflow - d_b_overflow; - par_conv.kh_padding = nstl::max(0, jcp.kh - i_t_overflow - i_b_overflow - stride_off_h); - par_conv.kw_padding = 0; + const int k_lo = overflow_kh_lo + + i_b_overflow * jcp.stride_h; + const int oh = (ih + jcp.t_pad - k_lo) / jcp.stride_h; + + par_conv.src = &diff_src[src_blk_off(diff_src_d, n, + /*jcp.ic == 3 ? 0 :*/ + g * jcp.nb_ic + jcp.nb_ic_blocking * icbb, id, ih, 0)]; + par_conv.dst = &diff_dst[src_blk_off(diff_dst_d, + n, g * jcp.nb_oc + oc, od, oh, 0)]; + par_conv.filt = &weights[wht_blk_off(weights_d, g, oc, + jcp.ic == 3 ? 0 : jcp.nb_ic_blocking * icbb, + d_b_overflow, k_lo, 0)]; + + par_conv.src_prf = nullptr; + par_conv.dst_prf = nullptr; + par_conv.filt_prf = nullptr; + par_conv.channel = oc; + par_conv.ch_blocks = nstl::min(jcp.nb_oc - oc, + jcp.nb_oc_blocking); - if (par_conv.kh_padding > 0) kernel_->jit_ker(&par_conv); + } } - nd_iterator_step(n, MB, g, jcp.ngroups, icbb, icb_work, ih, jcp.ih); + nd_iterator_step(n, MB, g, jcp.ngroups, icbb, icb_work, ihb, + num_ih_blocks); } }; parallel(0, ker); } -void jit_avx2_convolution_bwd_weights_t::execute_backward_weights() { +void jit_avx2_convolution_bwd_weights_t::execute_backward_weights() const { auto src = reinterpret_cast(this->input_memory(0)); auto diff_dst = reinterpret_cast(this->input_memory(1)); auto diff_weights = reinterpret_cast(this->memory(0)); auto diff_bias_in = reinterpret_cast(this->memory(1)); - data_t *diff_bias = conf_.want_padded_bias() ? padded_bias_ : diff_bias_in; - const memory_desc_wrapper src_d(conf_.src_pd(0)); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper diff_weights_d(conf_.diff_weights_pd(0)); + auto scratchpad = this->scratchpad(); + + data_t *diff_bias = pd()->wants_padded_bias() + ? 
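
// The strided kh_padding algebra above (overflow_kh_hi/overflow_kh_lo
// plus the two overflow terms) counts exactly the kernel taps whose
// output row exists. A brute-force cross-check of that closed form over
// every input row of a small, self-consistent geometry (3-tap kernel,
// stride 2, ih = 7, symmetric padding):
#include <algorithm>
#include <cassert>
#include <cstdlib>

int main() {
    const int kh = 3, stride_h = 2, t_pad = 1, ih_total = 7;
    const int oh_total = (ih_total + 2 * t_pad - kh) / stride_h + 1;
    const int b_pad = (oh_total - 1) * stride_h + kh - ih_total - t_pad;

    for (int ih = 0; ih < ih_total; ++ih) {
        int brute = 0; // taps k that land on a valid output row
        for (int k = 0; k < kh; ++k) {
            int num = ih + t_pad - k;
            if (num < 0 || num % stride_h != 0) continue;
            if (num / stride_h < oh_total) ++brute;
        }
        int i_t = std::max(0, (kh - 1 - ih - t_pad) / stride_h);
        int i_b = std::max(0, (kh - ih_total + ih - b_pad) / stride_h);
        int hi = kh - 1
                - std::abs((ih_total - 1 + b_pad - ih) % stride_h);
        int lo = (ih + t_pad) % stride_h;
        int kh_padding = (hi - lo) / stride_h + 1 - i_t - i_b;
        assert(kh_padding == brute);
    }
    return 0;
}
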
scratchpad.get(key_conv_padded_bias) : diff_bias_in; + + const memory_desc_wrapper src_d(pd()->src_pd(0)); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper diff_weights_d(pd()->diff_weights_pd(0)); const auto &jcp = kernel_->jcp; + auto reducer_bia_scratchpad = memory_tracking::grantor_t(scratchpad, + prefix_reducer_bia); + auto rb = this->reducer_bias_; + rb->init(reducer_bia_scratchpad); + + auto reducer_wei_scratchpad = memory_tracking::grantor_t(scratchpad, + prefix_reducer_wei); + auto rw = this->reducer_weights_; + rw->init(reducer_wei_scratchpad); + auto ker = [&](int ithr, int nthr) { - auto rw = this->reducer_weights_; - assert(nthr == rw->balancer_.nthr_); + assert(nthr == rw->balancer().nthr_); - const int w_job_start = rw->balancer_.ithr_job_off(ithr); - const int w_njobs = rw->balancer_.ithr_njobs(ithr); + const int w_job_start = rw->balancer().ithr_job_off(ithr); + const int w_njobs = rw->balancer().ithr_njobs(ithr); if (w_njobs == 0) return; /* reduction dimension */ int img_od_start{0}, img_od_end{0}, img{0}, od_s{0}; - balance211(jcp.mb * jcp.od, rw->balancer_.nthr_per_group_, - rw->balancer_.id_in_group(ithr), img_od_start, img_od_end); + balance211(jcp.mb * jcp.od, rw->balancer().nthr_per_group_, + rw->balancer().id_in_group(ithr), img_od_start, img_od_end); int img_start = img_od_start, img_end = img_od_end; nd_iterator_init(img_start, img, jcp.mb, od_s, jcp.od); @@ -461,9 +485,10 @@ void jit_avx2_convolution_bwd_weights_t::execute_backward_weights() { /* TODO: put dw <-- 0 in kernel */ if (img == img_first) - array_set((data_t *)&rw->get_local_ptr(ithr, diff_weights)[ - w_job_loc * rw->balancer_.job_size_], 0, - rw->balancer_.job_size_); + array_set(rw->get_local_ptr(ithr, diff_weights, + reducer_wei_scratchpad) + + w_job_loc * rw->balancer().job_size_, 0, + rw->balancer().job_size_); for (int od = od_s; od < od_e; ++od) { const int id = od * jcp.stride_d; @@ -473,8 +498,9 @@ void jit_avx2_convolution_bwd_weights_t::execute_backward_weights() { par_conv.src = &src[src_blk_off(src_d, img, _ic, id, 0, 0)]; par_conv.dst = &diff_dst[src_blk_off(diff_dst_d, img, _oc, od, 0, 0)]; - par_conv.filt = &rw->get_local_ptr(ithr, diff_weights)[ - w_job_loc * rw->balancer_.job_size_]; + par_conv.filt = rw->get_local_ptr(ithr, diff_weights, + reducer_wei_scratchpad) + + w_job_loc * rw->balancer().job_size_; kernel_->jit_ker(&par_conv); } @@ -483,22 +509,21 @@ void jit_avx2_convolution_bwd_weights_t::execute_backward_weights() { } nd_iterator_jump(img_start, img_end, img, jcp.mb, od_s, jcp.od); } - rw->reduce(ithr, diff_weights); + rw->reduce(ithr, diff_weights, reducer_wei_scratchpad); }; auto ker_bias = [&](int ithr, int nthr) { - auto rb = this->reducer_bias_; - assert(nthr == rb->balancer_.nthr_); + assert(nthr == rb->balancer().nthr_); - const int b_job_start = rb->balancer_.ithr_job_off(ithr); - const int b_njobs = rb->balancer_.ithr_njobs(ithr); + const int b_job_start = rb->balancer().ithr_job_off(ithr); + const int b_njobs = rb->balancer().ithr_njobs(ithr); if (b_njobs == 0) return; /* reduction dimension */ int img_start{0}, img_end{0}; - balance211(jcp.mb, rb->balancer_.nthr_per_group_, - rb->balancer_.id_in_group(ithr), img_start, img_end); + balance211(jcp.mb, rb->balancer().nthr_per_group_, + rb->balancer().id_in_group(ithr), img_start, img_end); /* jobs */ int g_start{0}, ocb_start{0}; @@ -511,8 +536,9 @@ void jit_avx2_convolution_bwd_weights_t::execute_backward_weights() { const size_t _oc = g * jcp.nb_oc + ocb; const data_t 
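
// balance211(), used above for both the reduction and job dimensions,
// splits a flat range over threads with chunk sizes that differ by at
// most one. A standalone copy matching the semantics the code relies on
// (the real helper lives in mkl-dnn's threading utilities):
#include <cstddef>

void balance211_sketch(std::size_t amount, std::size_t nthr,
        std::size_t ithr, std::size_t &start, std::size_t &end) {
    std::size_t base = amount / nthr;
    std::size_t tail = amount % nthr; // first `tail` workers get +1
    start = ithr * base + (ithr < tail ? ithr : tail);
    end = start + base + (ithr < tail ? 1 : 0);
}
// Example: 10 items over 4 threads yields [0,3), [3,6), [6,8), [8,10).
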
*d_dst = &diff_dst[diff_dst_d.blk_off(img, _oc)]; - data_t *d_bias = &rb->get_local_ptr(ithr, diff_bias)[ - b_job_loc * rb->balancer_.job_size_]; + data_t *d_bias = rb->get_local_ptr(ithr, diff_bias, + reducer_bia_scratchpad) + + b_job_loc * rb->balancer().job_size_; if (img == img_start) for (int o = 0; o < 8; ++o) @@ -528,18 +554,17 @@ void jit_avx2_convolution_bwd_weights_t::execute_backward_weights() { nd_iterator_step(g, jcp.ngroups, ocb, jcp.nb_oc); } } - rb->reduce(ithr, diff_bias); + rb->reduce(ithr, diff_bias, reducer_bia_scratchpad); }; - parallel(0, [&](const int ithr, const int nthr) { ker(ithr, nthr); - if (conf_.with_bias()) + if (pd()->with_bias()) ker_bias(ithr, nthr); }); /* TODO: put this in ker_bias */ - if (conf_.want_padded_bias()) { + if (pd()->wants_padded_bias()) { assert(jcp.ngroups == 1); for (int oc = 0; oc < jcp.oc_without_padding; ++oc) diff_bias_in[oc] = diff_bias[oc]; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_convolution.hpp index bd151dd..1dff656 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_convolution.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_convolution.hpp @@ -18,74 +18,73 @@ #define CPU_JIT_AVX2_CONVOLUTION_HPP #include "c_types_map.hpp" +#include "memory_tracking.hpp" +#include "mkldnn_thread.hpp" +#include "utils.hpp" + #include "cpu_convolution_pd.hpp" -#include "cpu_engine.hpp" #include "cpu_reducer.hpp" -#include "jit_primitive_conf.hpp" + #include "jit_avx2_conv_kernel_f32.hpp" -#include "mkldnn_thread.hpp" #include "jit_uni_depthwise.hpp" namespace mkldnn { namespace impl { namespace cpu { -template -struct _jit_avx2_convolution_fwd_t: public cpu_primitive_t { - struct pd_t: public _cpu_convolution_fwd_pd_t { +struct jit_avx2_convolution_fwd_t: public cpu_primitive_t { + struct pd_t: public cpu_convolution_fwd_pd_t { pd_t(engine_t *engine, - const typename pd_t::base_desc_t *adesc, + const convolution_desc_t *adesc, const primitive_attr_t *attr, const typename pd_t::base_class *hint_fwd_pd) - : _cpu_convolution_fwd_pd_t(engine, adesc, attr, - hint_fwd_pd) - , jcp_(), jcp_dw() {} + : cpu_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) + , jcp_(), jcp_dw_() {} DECLARE_COMMON_PD_T( JIT_IMPL_NAME_HELPER("jit:", avx2, ""), - _jit_avx2_convolution_fwd_t); + jit_avx2_convolution_fwd_t); virtual status_t init() override { using namespace prop_kind; assert(this->engine()->kind() == engine_kind::cpu); bool ok = true && this->set_default_params() == status::success - && utils::one_of(this->cdesc_().prop_kind, forward_training, + && utils::one_of(this->desc()->prop_kind, forward_training, forward_inference) - && this->cdesc_().alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() && utils::everyone_is(data_type::f32, - this->cdesc_().src_desc.data_type, - this->cdesc_().weights_desc.data_type, - this->cdesc_().dst_desc.data_type) + this->desc()->src_desc.data_type, + this->desc()->weights_desc.data_type, + this->desc()->dst_desc.data_type) && IMPLICATION(this->with_bias(), - data_type::f32 == this->cdesc_().bias_desc.data_type); + data_type::f32 == this->desc()->bias_desc.data_type); if (!ok) return status::unimplemented; - status_t sts = jit_avx2_conv_fwd_kernel_f32::init_conf(jcp_, this->cdesc_(), - *this->src_pd_.desc(), *this->weights_pd_.desc(), - *this->dst_pd_.desc(), *this->attr(), - 
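
// The reducer calls in this hunk follow one pattern: each thread
// accumulates into a private slice obtained from get_local_ptr(), and
// reduce() folds the slices into the destination once all threads are
// done. A minimal model of that structure, without the thread grouping
// and barriers of the real cpu_reducer_t:
#include <cstddef>
#include <vector>

struct toy_reducer_t {
    int nthr, job_size;
    std::vector<float> ws; // nthr private slices, zero-initialized
    toy_reducer_t(int nthr, int job_size)
        : nthr(nthr), job_size(job_size),
          ws((std::size_t)nthr * job_size, 0.f) {}
    float *local(int ithr) {
        return ws.data() + (std::size_t)ithr * job_size;
    }
    void reduce(float *dst) { // call after all threads have finished
        for (int t = 0; t < nthr; ++t)
            for (int i = 0; i < job_size; ++i)
                dst[i] += local(t)[i];
    }
};
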
with_relu, this->negative_slope()); + + + status_t sts = jit_avx2_conv_fwd_kernel_f32::init_conf(jcp_, + *this->desc(), *this->src_pd_.desc(), + *this->weights_pd_.desc(), *this->dst_pd_.desc(), + *this->attr()); if (sts != status::success) return sts; if (jcp_.with_dw_conv) { - int dw_conv_oh = (jcp_.oh - ((jcp_.dw_conv_ker_h - 1) + 1) + 2) / jcp_.dw_conv_str_h + 1; - int dw_conv_ow = (jcp_.ow - ((jcp_.dw_conv_ker_w - 1) + 1) + 2) / jcp_.dw_conv_str_w + 1; - - status_t sts_dw = jit_uni_dw_conv_row_f32::init_conf(jcp_dw, - jcp_.oc, jcp_.oh, jcp_.ow, dw_conv_oh, dw_conv_ow, - jcp_.dw_conv_ker_h, jcp_.dw_conv_ker_w, - jcp_.dw_conv_str_h, jcp_.dw_conv_str_w, - jcp_.dw_conv_eltwise_alg, jcp_.dw_conv_eltwise_alpha, - jcp_.dw_conv_eltwise_beta, jcp_.dw_conv_with_sum); + status_t sts_dw = jit_uni_dw_conv_row_f32::init_conf(jcp_, jcp_dw_, *this->attr()); if (sts_dw != status::success) return sts_dw; } + auto scratchpad = scratchpad_registry().registrar(); + jit_avx2_conv_fwd_kernel_f32::init_scratchpad(scratchpad, jcp_, jcp_dw_); + return status::success; } jit_conv_conf_t jcp_; - jit_conv_conf_t jcp_dw; + jit_conv_conf_t jcp_dw_; protected: virtual status_t set_default_params() override { @@ -109,62 +108,36 @@ struct _jit_avx2_convolution_fwd_t: public cpu_primitive_t { if (this->bias_pd_.desc()->format == any) CHECK(this->bias_pd_.set_format(x)); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } }; - _jit_avx2_convolution_fwd_t(const pd_t *pd, const input_vector &inputs, + jit_avx2_convolution_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), - padded_bias_(nullptr), - dw_conv_buffer_size_(0), dw_conv_buffer_(nullptr), dw_padded_bias_(nullptr) + : cpu_primitive_t(apd, inputs, outputs) { - kernel_ = new jit_avx2_conv_fwd_kernel_f32(conf_.jcp_, *conf_.attr()); - - if (conf_.want_padded_bias()) { - const auto &j = conf_.jcp_; - assert(j.ngroups == 1); - padded_bias_ = (data_t *)malloc(sizeof(data_t) * j.oc, 64); - for (int oc = j.oc_without_padding; oc < j.oc; ++oc) - padded_bias_[oc] = 0; - } + kernel_ = new jit_avx2_conv_fwd_kernel_f32(pd()->jcp_, pd()->jcp_dw_, *pd()->attr()); - if (conf_.jcp_.with_dw_conv) { - kernel_dw_ = new jit_uni_dw_conv_row_f32(conf_.jcp_dw); - } - - if (conf_.jcp_.with_dw_conv) { - const int nthreads = mkldnn_get_max_threads(); - dw_conv_buffer_size_ = (size_t)conf_.jcp_dw.kh * conf_.jcp_dw.iw * conf_.jcp_dw.ch_block * - conf_.jcp_.nb_oc_blocking; - dw_conv_buffer_ = (float *)malloc(nthreads * dw_conv_buffer_size_ * sizeof(float), 64); - - if (conf_.want_padded_bias()) { - const auto &j = conf_.jcp_; - assert(j.ngroups == 1); - dw_padded_bias_ = (data_t *)malloc(sizeof(data_t) * j.oc, 64); - for (int oc = j.oc_without_padding; oc < j.oc; ++oc) - dw_padded_bias_[oc] = 0; - } + if (pd()->jcp_.with_dw_conv) { + kernel_dw_ = new jit_uni_dw_conv_row_f32(pd()->jcp_dw_, *pd()->attr(), pd()->jcp_dw_.ch_block); } } - ~_jit_avx2_convolution_fwd_t() { + ~jit_avx2_convolution_fwd_t() { delete kernel_; - free(padded_bias_); - if (conf_.jcp_.with_dw_conv) { + if (pd()->jcp_.with_dw_conv) { delete kernel_dw_; - free(dw_conv_buffer_); - free(dw_padded_bias_); } }; typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { - if (conf_.jcp_.with_dw_conv) - execute_forward_fusing(); + virtual void execute(event_t *e) const { + if (pd()->jcp_.with_dw_conv) + execute_forward_with_dw_conv(); 
else execute_forward(); @@ -172,23 +145,14 @@ struct _jit_avx2_convolution_fwd_t: public cpu_primitive_t { } private: - void execute_forward(); - void execute_forward_fusing(); + void execute_forward() const; + void execute_forward_with_dw_conv() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } - pd_t conf_; jit_avx2_conv_fwd_kernel_f32 *kernel_; - data_t *padded_bias_; jit_uni_dw_conv_row_f32 *kernel_dw_; - - /* fuse with dw conv */ - size_t dw_conv_buffer_size_; - data_t *dw_conv_buffer_; - data_t *dw_padded_bias_; }; -using jit_avx2_convolution_fwd_t = _jit_avx2_convolution_fwd_t; -using jit_avx2_convolution_relu_t = _jit_avx2_convolution_fwd_t; - struct jit_avx2_convolution_bwd_data_t: public cpu_primitive_t { struct pd_t: public cpu_convolution_bwd_data_pd_t { pd_t(engine_t *engine, @@ -209,7 +173,8 @@ struct jit_avx2_convolution_bwd_data_t: public cpu_primitive_t { bool ok = true && this->set_default_params() == status::success && utils::one_of(this->desc()->prop_kind, backward_data) - && this->desc()->alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() && utils::everyone_is(data_type::f32, this->desc()->diff_src_desc.data_type, @@ -217,9 +182,16 @@ struct jit_avx2_convolution_bwd_data_t: public cpu_primitive_t { this->desc()->diff_dst_desc.data_type); if (!ok) return status::unimplemented; - return jit_avx2_conv_bwd_data_kernel_f32::init_conf(jcp_, - *this->desc(), *this->diff_src_pd_.desc(), + status_t status = jit_avx2_conv_bwd_data_kernel_f32::init_conf( + jcp_, *this->desc(), *this->diff_src_pd_.desc(), *this->weights_pd_.desc(), *this->diff_dst_pd_.desc()); + if (status != status::success) return status; + + auto scratchpad = scratchpad_registry().registrar(); + jit_avx2_conv_bwd_data_kernel_f32::init_scratchpad(scratchpad, + jcp_); + + return status::success; } jit_conv_conf_t jcp_; @@ -240,20 +212,22 @@ struct jit_avx2_convolution_bwd_data_t: public cpu_primitive_t { gOIdhw8o8i) : utils::pick(this->ndims() - 3, OIw8o8i, OIhw8o8i, OIdhw8o8i))); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } }; - jit_avx2_convolution_bwd_data_t(const pd_t *pd, const input_vector &inputs, + jit_avx2_convolution_bwd_data_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - { kernel_ = new jit_avx2_conv_bwd_data_kernel_f32(conf_.jcp_); } - ~jit_avx2_convolution_bwd_data_t() { delete kernel_; }; + : cpu_primitive_t(apd, inputs, outputs) + { kernel_ = new jit_avx2_conv_bwd_data_kernel_f32(pd()->jcp_); } + ~jit_avx2_convolution_bwd_data_t() { delete kernel_; } typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { - switch (conf_.desc()->prop_kind) { + virtual void execute(event_t *e) const { + switch (pd()->desc()->prop_kind) { case prop_kind::backward_data: execute_backward_data(); break; @@ -264,8 +238,9 @@ struct jit_avx2_convolution_bwd_data_t: public cpu_primitive_t { } private: - void execute_backward_data(); - pd_t conf_; + void execute_backward_data() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + jit_avx2_conv_bwd_data_kernel_f32 *kernel_; }; @@ -286,7 +261,8 @@ struct jit_avx2_convolution_bwd_weights_t: public cpu_primitive_t { bool ok = true && this->set_default_params() == status::success && 
this->desc()->prop_kind == prop_kind::backward_weights - && this->desc()->alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() && utils::everyone_is(data_type::f32, this->desc()->src_desc.data_type, @@ -294,13 +270,32 @@ struct jit_avx2_convolution_bwd_weights_t: public cpu_primitive_t { this->desc()->diff_weights_desc.data_type); if (!ok) return status::unimplemented; - return jit_avx2_conv_bwd_weights_kernel_f32::init_conf(jcp_, - *this->desc(), *this->src_pd_.desc(), + status_t status = jit_avx2_conv_bwd_weights_kernel_f32::init_conf( + jcp_, *this->desc(), *this->src_pd_.desc(), *this->diff_weights_pd_.desc(), *this->diff_dst_pd_.desc()); + if (status != status::success) return status; + + init_balancers(); + + auto scratchpad = scratchpad_registry().registrar(); + jit_avx2_conv_bwd_weights_kernel_f32::init_scratchpad(scratchpad, + jcp_); + + auto reducer_bia_scratchpad = memory_tracking::registrar_t( + scratchpad, memory_tracking::names::prefix_reducer_bia); + reducer_bia_conf_.init_scratchpad(reducer_bia_scratchpad); + + auto reducer_wei_scratchpad = memory_tracking::registrar_t( + scratchpad, memory_tracking::names::prefix_reducer_wei); + reducer_wei_conf_.init_scratchpad(reducer_wei_scratchpad); + + return status::success; } jit_conv_conf_t jcp_; + cpu_reducer_t::conf_t reducer_bia_conf_; + cpu_reducer_t::conf_t reducer_wei_conf_; protected: virtual status_t set_default_params() override { @@ -322,54 +317,61 @@ struct jit_avx2_convolution_bwd_weights_t: public cpu_primitive_t { OIhw8i8o, Ohwi8o, OIdhw8i8o, Odhwi8o))); if (this->diff_bias_pd_.desc()->format == any) CHECK(this->diff_bias_pd_.set_format(x)); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } + + private: + void init_balancers() { + const int max_threads = mkldnn_get_max_threads(); + const size_t max_buffer_size = 1<<21; /* just a heuristic */ + + if(with_bias()) { + reducer_bia_conf_.init(reduce_balancer_t(max_threads, + jcp_.oc_block, jcp_.ngroups * jcp_.nb_oc, jcp_.mb, + max_buffer_size)); + } + + reducer_wei_conf_.init(reduce_balancer_t(max_threads, + jcp_.kd * jcp_.kh * jcp_.kw + * jcp_.ic_block * jcp_.oc_block, + jcp_.ngroups * jcp_.nb_ic * jcp_.nb_oc, + jcp_.mb * jcp_.od, max_buffer_size)); + } }; - jit_avx2_convolution_bwd_weights_t(const pd_t *pd, + jit_avx2_convolution_bwd_weights_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) + : cpu_primitive_t(apd, inputs, outputs) , kernel_(nullptr), reducer_weights_(nullptr), reducer_bias_(nullptr) - , padded_bias_(nullptr) { - kernel_ = new jit_avx2_conv_bwd_weights_kernel_f32(conf_.jcp_); - - const int max_threads = mkldnn_get_max_threads(); - const size_t max_buffer_size = 1<<21; /* just a heuristic */ - const auto &j = conf_.jcp_; - reducer_weights_ = new cpu_reducer_t(reduce_balancer_t( - max_threads, j.kd * j.kh * j.kw * j.ic_block * j.oc_block, - j.ngroups * j.nb_ic * j.nb_oc, j.mb * j.od, max_buffer_size)); - if (conf_.with_bias()) { - reducer_bias_ = new cpu_reducer_t( - reduce_balancer_t(max_threads, j.oc_block, - j.ngroups * j.nb_oc, j.mb, max_buffer_size)); - - if (conf_.want_padded_bias()) - padded_bias_ = (data_t *) - malloc(sizeof(data_t) * j.oc, 64); - } + kernel_ = new jit_avx2_conv_bwd_weights_kernel_f32(pd()->jcp_); + reducer_bias_ = + new 
cpu_reducer_t(pd()->reducer_bia_conf_); + reducer_weights_ = + new cpu_reducer_t(pd()->reducer_wei_conf_); } + ~jit_avx2_convolution_bwd_weights_t() { delete kernel_; delete reducer_weights_; delete reducer_bias_; - free(padded_bias_); - }; + } typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_backward_weights(); e->set_state(event_t::ready); } private: - void execute_backward_weights(); - pd_t conf_; + void execute_backward_weights() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + jit_avx2_conv_bwd_weights_kernel_f32 *kernel_; cpu_reducer_t *reducer_weights_, *reducer_bias_; - data_t *padded_bias_; }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_1x1_conv_kernel.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_1x1_conv_kernel.cpp index 30f1823..bdfee81 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_1x1_conv_kernel.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_1x1_conv_kernel.cpp @@ -13,13 +13,19 @@ * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ + +#include #include + #include "c_types_map.hpp" +#include "memory_tracking.hpp" +#include "mkldnn_thread.hpp" #include "nstl.hpp" #include "type_helpers.hpp" -#include "mkldnn_thread.hpp" #include "utils.hpp" + #include "cpu_memory.hpp" +#include "cpu_barrier.hpp" #include "jit_uni_1x1_conv_utils.hpp" #include "jit_avx512_common_1x1_conv_kernel.hpp" @@ -257,14 +263,23 @@ void jit_avx512_common_1x1_conv_kernel::reduce_loop(int load_loop_blk, int depthwise_inj_idx = 0; const auto &p = attr_.post_ops_; - if (p.len_ == 0 && eltwise_injectors.size() == 1) { - eltwise_injectors[0]->compute_vector_range(0, ur * load_loop_blk); - } - for (int i = 0; i < p.len_; i++) { auto& post_op = p.entry_[i]; if (post_op.is_eltwise()) { - eltwise_injectors[eltwise_inj_idx]->compute_vector_range(0, ur * load_loop_blk); + if (jcp.ver == ver_4vnni) { + zmm_t zmm_zero = vreg_bcast; + vpxord(zmm_zero, zmm_zero, zmm_zero); + + for (int i_ur = 0; i_ur < ur; ++i_ur) { + for (int i_load = 0; i_load < load_loop_blk; ++i_load) { + Zmm zmm = vreg_accum(i_load, i_ur); + vpcmpd(k1, zmm, zmm_zero, _cmp_lt_os); + vpmulld(zmm | k1, zmm, zmm_zero); + } + } + } else { + eltwise_injectors[eltwise_inj_idx]->compute_vector_range(0, ur * load_loop_blk); + } eltwise_inj_idx++; } else if (post_op.is_depthwise()) { mov(reg_d_weights, reinterpret_cast(post_op.depthwise.weights_data)); @@ -502,12 +517,6 @@ void jit_avx512_common_1x1_conv_kernel::reduce_loop(int load_loop_blk, void jit_avx512_common_1x1_conv_kernel::generate() { - if (jcp.with_eltwise) { - eltwise_injectors.push_back(new jit_uni_eltwise_injector_f32( - this, jcp.eltwise_alg, jcp.eltwise_alpha, 0 - )); - } - const auto &p = attr_.post_ops_; for (int i = 0; i < p.len_; i++) { auto &post_op = p.entry_[i]; @@ -542,6 +551,8 @@ void jit_avx512_common_1x1_conv_kernel::generate() mov(EVEX_compress_addr(rsp, bcast_loop_work_offt), reg_bcast_loop_work); mov(reg_reduce_loop_work, ptr[param1 + GET_OFF(reduce_dim)]); mov(reg_reduce_pos_flag, ptr[param1 + GET_OFF(first_last_flag)]); + if (one_of(jcp.prop_kind, forward_training, forward_inference)) + mov(reg_relu_ns, reinterpret_cast(&jcp.eltwise.alpha)); if (jcp.prop_kind == backward_weights) mov(reg_output_stride, ptr[param1 + 
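
// One subtlety in the reduce_loop hunk above: for ver_4vnni the ReLU is
// done without an eltwise injector, by building a mask of negative
// lanes with vpcmpd and multiplying exactly those lanes by zero with a
// masked vpmulld. The scalar meaning of the two instructions:
#include <cstdint>

void relu_s32(std::int32_t *acc, int n) {
    for (int i = 0; i < n; ++i)
        if (acc[i] < 0) // vpcmpd k1, zmm, zmm_zero, _cmp_lt_os
            acc[i] = 0; // vpmulld zmm|k1, zmm, zmm_zero  (x * 0 == 0)
}
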
GET_OFF(output_stride)]); mov(reg_oc_off, ptr[param1 + GET_OFF(oc_off)]); @@ -653,30 +664,20 @@ bool jit_avx512_common_1x1_conv_kernel::post_ops_ok( auto is_simple = [&](int idx) { return is_eltwise(idx) || is_depthwise(idx); }; switch (p.len_) { - case 0: return true; // no post_ops - case 1: - return true // sum OR eltwise OR depthwise - && !jcp.with_eltwise && (is_simple(0) || is_sum(0)); - case 2: - return true // sum->relu - && !jcp.with_eltwise && ((is_sum(0) && is_simple(1)) || - (is_simple(0) && is_simple(1))); - case 3: - return true // sum->relu - && !jcp.with_eltwise && (is_sum(0) && is_simple(1) && is_simple(2)); + case 0: return true; + case 1: return is_simple(0) || is_sum(0); + case 2: return (is_sum(0) && is_simple(1)) || (is_simple(0) && is_simple(1)); + case 3: return is_sum(0) && is_simple(1) && is_simple(2); default: return false; } return false; } -status_t jit_avx512_common_1x1_conv_kernel::init_conf( - jit_1x1_conv_conf_t &jcp, const convolution_desc_t &cd, - const memory_desc_wrapper &src_d, const memory_desc_wrapper &weights_d, - const memory_desc_wrapper &dst_d, const primitive_attr_t &attr, - bool with_relu, float relu_negative_slope, - int nthreads, bool reduce_src) -{ +status_t jit_avx512_common_1x1_conv_kernel::init_conf(jit_1x1_conv_conf_t &jcp, + const convolution_desc_t &cd, const memory_desc_wrapper &src_d, + const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d, + const primitive_attr_t &attr, int nthreads, bool reduce_src) { if (!mayiuse(avx512_common)) return status::unimplemented; const bool with_groups = weights_d.ndims() == src_d.ndims() + 1; @@ -715,11 +716,9 @@ status_t jit_avx512_common_1x1_conv_kernel::init_conf( jcp.stride_w = cd.strides[ndims - 3]; jcp.src_fmt = src_d.format(); - jcp.with_bias = one_of(jcp.prop_kind, forward_training, forward_inference) - ? cd.bias_desc.format != memory_format::undef : false; - jcp.with_eltwise = with_relu; - jcp.eltwise_alg = mkldnn_eltwise_relu; - jcp.eltwise_alpha = relu_negative_slope; + jcp.with_bias = pick_by_prop_kind(jcp.prop_kind, cd.bias_desc.format, + memory_format::undef, cd.diff_bias_desc.format) + != memory_format::undef; jcp.os = jcp.oh * jcp.ow; jcp.is = jcp.ih * jcp.iw; @@ -730,6 +729,12 @@ status_t jit_avx512_common_1x1_conv_kernel::init_conf( const auto &p = attr.post_ops_; jcp.with_sum = p.find(primitive_kind::sum) != -1; + const int eltwise_ind = p.find(primitive_kind::eltwise); + jcp.with_eltwise = eltwise_ind != -1; + if (jcp.with_eltwise) { + jcp.eltwise = p.entry_[eltwise_ind].eltwise; + if (dst_d.data_type() == data_type::s32) return status::unimplemented; + } bool args_ok = true && jcp.ngroups == 1 @@ -894,9 +899,7 @@ status_t jit_avx512_common_1x1_conv_kernel::init_conf( } else { bool is4ops = (jcp.ver == ver_4fma || jcp.ver == ver_4vnni); -// max_regs = is4ops ? 28 : 30; - // FIXME (ichuraev): it is a fix for densnet-121 - max_regs = 28; + max_regs = is4ops ? 28 : 30; min_regs = 9; size_treshold = is4ops ? 28 : 14; ur_step = is4ops ? 
4 : 1; @@ -1062,6 +1065,48 @@ status_t jit_avx512_common_1x1_conv_kernel::init_conf( load_blocking = jcp.load_block; } + if (jcp.ver == ver_4fma && jcp.bcast_dim * jcp.mb < jcp.load_dim + && jcp.oh * jcp.ow > 64 + && IMPLICATION(reduce_src, jcp.load_dim < 1024)) { + /* Looking for best loading dimension blocking + * to get the best thread and data read/write efficiency + * by finding the optimal 'load_chunk' value + * Example: + * for 72 threads and convolution with mb=1, ih=iw=7, oc = 512 + * the 'best' load_chunk value should be 1 + * TODO: remove heuristic constants in above condition + * TODO: check this blocking for other ISA + */ + float best_eff = -1.f; + int best_lgc = 1; + + for (int load_chunk = 1; load_chunk <= nb_load; load_chunk++) { + int lgc = div_up(nb_load, load_chunk); + if (lgc > nthreads) + continue; + int thr_per_grp = div_up(nthreads, lgc); + int bcast_per_thr = div_up(jcp.mb * nb_bcast, thr_per_grp) + * jcp.bcast_block; + int load_per_thr = load_chunk * simd_w; + float data_norm = (bcast_per_thr + load_per_thr) / 2.f; + float data_eff = (bcast_per_thr * load_per_thr) + / (data_norm * data_norm); + float thr_eff_over_grp = (float)nstl::max(1, nthreads / lgc) + / div_up(nthreads, lgc); + float thr_eff_in_grp = ((float)jcp.mb * nb_bcast) + / rnd_up(jcp.mb * nb_bcast, thr_per_grp); + float thr_eff = thr_eff_over_grp * thr_eff_in_grp; + float load_eff = (float)nb_load / rnd_up(nb_load, lgc); + float overall_eff = data_eff + thr_eff + load_eff; + if (overall_eff > best_eff) { + best_eff = overall_eff; + best_lgc = lgc; + } + } + jcp.load_grp_count = best_lgc; + load_blocking + = div_up(nb_load, jcp.load_grp_count) * jcp.load_block; + } bcast_blocking = div_up(jcp.mb * jcp.ngroups * nb_bcast, div_up(nthreads, jcp.load_grp_count)) * jcp.bcast_block; @@ -1230,6 +1275,30 @@ status_t jit_avx512_common_1x1_conv_kernel::init_conf( return status::success; } +void jit_avx512_common_1x1_conv_kernel::init_scratchpad( + memory_tracking::registrar_t &scratchpad, + const jit_1x1_conv_conf_t &jcp) { + using namespace mkldnn::impl::memory_tracking::names; + + if (jcp.prop_kind != backward_data && jcp.with_bias + && jcp.oc != jcp.oc_without_padding) + scratchpad.book(key_conv_padded_bias, jcp.typesize_out * jcp.oc); + + if (jcp.prop_kind == backward_weights) { + const size_t wei_size = (size_t)jcp.ngroups * jcp.oc * jcp.ic; + scratchpad.book(key_conv_wei_reduction, + jcp.typesize_out * wei_size * (jcp.nthr_mb - 1)); + } + + if (jcp.transpose_src) { + const size_t tr_src_size = + (size_t)jcp.nthr_mb * jcp.ngroups * jcp.ic * jcp.tr_is; + scratchpad.book(key_conv_tr_src, jcp.typesize_out * tr_src_size); + scratchpad.book(key_conv_tr_src_bctx, + sizeof(simple_barrier::ctx_t) * jcp.nthr); + } +} + void jit_avx512_common_1x1_conv_kernel::balance(jit_1x1_conv_conf_t &jcp, int nthreads) { diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_1x1_conv_kernel.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_1x1_conv_kernel.hpp index 31d5b62..af7ca95 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_1x1_conv_kernel.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_1x1_conv_kernel.hpp @@ -18,6 +18,8 @@ #define JIT_AVX512_COMMON_1x1_CONV_KERNEL_HPP #include "c_types_map.hpp" +#include "memory_tracking.hpp" + #include "jit_generator.hpp" #include "jit_primitive_conf.hpp" #include "jit_uni_eltwise.hpp" @@ -29,7 +31,8 @@ namespace cpu { struct jit_avx512_common_1x1_conv_kernel : public jit_generator { 
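// Minimal standalone sketch of the load-group search added to init_conf()
// above. It scores every feasible load_chunk by data-split evenness per
// thread, thread-group utilization, and load balance across groups, and
// keeps the grouping with the best combined score. The shape values below
// are illustrative assumptions, not values taken from this patch; div_up
// and rnd_up are local stand-ins for the mkl-dnn utilities of the same name.
#include <algorithm>
#include <cstdio>

static int div_up(int a, int b) { return (a + b - 1) / b; }
static int rnd_up(int a, int b) { return div_up(a, b) * b; }

int main() {
    // Assumed configuration: 72 threads, mb = 1, oc = 512 -> nb_load = 32,
    // a small spatial bcast dimension, 16-wide SIMD.
    const int nthreads = 72, mb = 1, nb_load = 32, nb_bcast = 4;
    const int bcast_block = 16, simd_w = 16;

    float best_eff = -1.f;
    int best_lgc = 1;
    for (int load_chunk = 1; load_chunk <= nb_load; ++load_chunk) {
        int lgc = div_up(nb_load, load_chunk);
        if (lgc > nthreads) continue;
        int thr_per_grp = div_up(nthreads, lgc);
        int bcast_per_thr = div_up(mb * nb_bcast, thr_per_grp) * bcast_block;
        int load_per_thr = load_chunk * simd_w;
        // data_eff peaks at 1.0 when both per-thread data streams are equal.
        float data_norm = (bcast_per_thr + load_per_thr) / 2.f;
        float data_eff = (bcast_per_thr * load_per_thr)
                / (data_norm * data_norm);
        float thr_eff_over_grp = (float)std::max(1, nthreads / lgc)
                / div_up(nthreads, lgc);
        float thr_eff_in_grp = ((float)mb * nb_bcast)
                / rnd_up(mb * nb_bcast, thr_per_grp);
        float thr_eff = thr_eff_over_grp * thr_eff_in_grp;
        float load_eff = (float)nb_load / rnd_up(nb_load, lgc);
        float overall_eff = data_eff + thr_eff + load_eff;
        if (overall_eff > best_eff) {
            best_eff = overall_eff;
            best_lgc = lgc;
        }
    }
    printf("load_grp_count = %d (score %.3f)\n", best_lgc, best_eff);
    return 0;
}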
jit_avx512_common_1x1_conv_kernel(jit_1x1_conv_conf_t ajcp, - const primitive_attr_t &attr) : jcp(ajcp), attr_(attr) + const primitive_attr_t &attr) + : jcp(ajcp), attr_(attr) { this->generate(); jit_ker = (void (*)(jit_1x1_conv_call_s *)) this->getCode(); @@ -51,25 +54,15 @@ struct jit_avx512_common_1x1_conv_kernel : public jit_generator { const primitive_attr_t &attr); static status_t init_conf(jit_1x1_conv_conf_t &jcp, - const convolution_desc_t &cd, - const memory_desc_wrapper &src_d, - const memory_desc_wrapper &weights_d, - const memory_desc_wrapper &dst_d, - const primitive_attr_t &attr, - bool with_relu, float relu_negative_slope, - int nthreads, bool reduce_src); + const convolution_desc_t &cd, + const memory_desc_wrapper &src_d, + const memory_desc_wrapper &weights_d, + const memory_desc_wrapper &dst_d, + const primitive_attr_t &attr, + int nthreads, bool reduce_src); - static status_t init_conf(jit_1x1_conv_conf_t &jcp, - const convolution_desc_t &cd, - const memory_desc_wrapper &src_d, - const memory_desc_wrapper &weights_d, - const memory_desc_wrapper &dst_d, - const primitive_attr_t &attr, - int nthreads, bool reduce_src) - { - return init_conf(jcp, cd, src_d, weights_d, dst_d, attr, false, 0.0, - nthreads, reduce_src); - } + static void init_scratchpad(memory_tracking::registrar_t &scratchpad, + const jit_1x1_conv_conf_t &jcp); jit_1x1_conv_conf_t jcp; const primitive_attr_t &attr_; @@ -78,7 +71,6 @@ struct jit_avx512_common_1x1_conv_kernel : public jit_generator { private: using reg64_t = const Xbyak::Reg64; using zmm_t = const Xbyak::Zmm; - using mask_t = const Xbyak::Opmask; reg64_t reg_bcast_data = r8; reg64_t reg_load_data = r10; @@ -95,6 +87,7 @@ struct jit_avx512_common_1x1_conv_kernel : public jit_generator { reg64_t reg_reduce_pos_flag = rax; reg64_t reg_output_stride = r13; reg64_t reg_bias_data = r12; + reg64_t reg_relu_ns = r13; reg64_t reg_bcast_loop_work = aux1_reg_bcast_data; Xbyak::Zmm vreg_bcast = Xbyak::Zmm(31); @@ -115,6 +108,7 @@ struct jit_avx512_common_1x1_conv_kernel : public jit_generator { void generate(); static void balance(jit_1x1_conv_conf_t &jcp, int nthreads); }; + } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_1x1_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_1x1_convolution.cpp index da38121..099f1bd 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_1x1_convolution.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_1x1_convolution.cpp @@ -14,22 +14,22 @@ * limitations under the License. *******************************************************************************/ -#include "mkldnn_types.h" - #include "c_types_map.hpp" -#include "jit_avx512_common_1x1_convolution.hpp" -#include "utils.hpp" #include "mkldnn_thread.hpp" #include "type_helpers.hpp" +#include "utils.hpp" #include "jit_generator.hpp" +#include "jit_avx512_common_1x1_convolution.hpp" + namespace mkldnn { namespace impl { namespace cpu { using namespace mkldnn::impl::status; using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; using namespace mkldnn::impl::utils; #define data_blk_off(f, n, c, h, w) \ @@ -37,74 +37,84 @@ using namespace mkldnn::impl::utils; ? 
(f).blk_off(n, c, w) \ : (f).blk_off(n, c, h, w)) + namespace { template void balance2D(U nthr, U ithr, T ny, T &ny_start, T &ny_end, T nx, T &nx_start, T &nx_end, T nx_divider) { - const T grp_size = utils::div_up(nthr, nx_divider); - const T grp_count = utils::div_up(nthr, grp_size); - - T grp = ithr / grp_size; - T grp_ithr = ithr % grp_size; - T grp_nthr = grp_size; - T first_grps = nthr % grp_count; - if (first_grps > 0 && grp >= first_grps) { - ithr -= first_grps * grp_size; - grp_nthr--; - grp = ithr / grp_nthr + first_grps; - grp_ithr = ithr % grp_nthr; + const int grp_count = nstl::min(nx_divider, nthr); + const int grp_size_big = nthr / grp_count + 1; + const int grp_size_small = nthr / grp_count; + const int n_grp_big = nthr % grp_count; + const int threads_in_big_groups = n_grp_big * grp_size_big; + + const int ithr_bound_distance = ithr - threads_in_big_groups; + T grp, grp_ithr, grp_nthr; + if (ithr_bound_distance < 0) { // ithr in first groups + grp = ithr / grp_size_big; + grp_ithr = ithr % grp_size_big; + grp_nthr = grp_size_big; + } else { // ithr in last groups + grp = n_grp_big + ithr_bound_distance / grp_size_small; + grp_ithr = ithr_bound_distance % grp_size_small; + grp_nthr = grp_size_small; } + balance211(nx, grp_count, grp, nx_start, nx_end); balance211(ny, grp_nthr, grp_ithr, ny_start, ny_end); } } /* convolution forward */ -template -void _jit_avx512_common_1x1_convolution_fwd_t - ::execute_forward() -{ +template +void jit_avx512_common_1x1_convolution_fwd_t:: +execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); + auto scratchpad = this->scratchpad(); + auto &jcp = kernel_->jcp; - if (conf_.want_padded_bias()) { - assert(jcp.ngroups == 1); - for (int oc = 0; oc < jcp.oc_without_padding; ++oc) - padded_bias_[oc] = bias[oc]; - bias = padded_bias_; + if (pd()->wants_padded_bias()) { + auto padded_bias = scratchpad.template get( + key_conv_padded_bias); + utils::array_copy(padded_bias, bias, jcp.oc_without_padding); + utils::array_set(padded_bias + jcp.oc_without_padding, 0.f, + jcp.oc - jcp.oc_without_padding); + bias = padded_bias; } parallel(0, [&](const int ithr, const int nthr) { - execute_forward_thr(ithr, nthr, src, weights, bias, dst); + execute_forward_thr(ithr, nthr, src, weights, bias, dst, scratchpad); }); + + if (pd()->wants_zero_pad_dst()) + output_memory_primitive(0)->zero_pad(); } -template -void _jit_avx512_common_1x1_convolution_fwd_t - ::execute_forward_thr( - const int ithr, const int nthr, - const src_data_t *src, const wei_data_t *weights, - const dst_data_t *bias, dst_data_t *dst) -{ - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); +template +void jit_avx512_common_1x1_convolution_fwd_t:: +execute_forward_thr(const int ithr, const int nthr, const src_data_t *src, + const wei_data_t *weights, const dst_data_t *bias, dst_data_t *dst, + const memory_tracking::grantor_t &scratchpad) const { + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + + auto rtus_space = scratchpad.get(key_conv_rtus_space); const int ndims = src_d.ndims(); - const int stride_h = (ndims == 3) ? 
1 : conf_.cdesc()->strides[0]; - const int stride_w = conf_.cdesc()->strides[ndims - 3]; - const int pad_t = (ndims == 3) ? 0 : conf_.cdesc()->padding[0][0]; - const int pad_l = conf_.cdesc()->padding[0][ndims - 3]; + const int stride_h = (ndims == 3) ? 1 : pd()->desc()->strides[0]; + const int stride_w = pd()->desc()->strides[ndims - 3]; + const int pad_t = (ndims == 3) ? 0 : pd()->desc()->padding[0][0]; + const int pad_l = pd()->desc()->padding[0][ndims - 3]; - auto &jcp = kernel_->jcp; - const int MB = conf_.MB(); + const auto &jcp = kernel_->jcp; + const int MB = pd()->MB(); const int work_amount = MB * jcp.ngroups * jcp.nb_bcast; auto step = [](int default_step, int remaining, int tail_step) { @@ -179,13 +189,13 @@ void _jit_avx512_common_1x1_convolution_fwd_t p.output_data = &dst[dst_off]; p.bias_data = &bias[_ocb * jcp.oc_block]; - p.load_data = &weights[conf_.with_groups() + p.load_data = &weights[pd()->with_groups() ? weights_d.blk_off(g, ocb, icb) : weights_d.blk_off(ocb, icb)]; const int _icb = g * nb_ic + icb; - if (conf_.rtus_.reduce_src_) { - rp.ws = scratch_ + ithr * ws_per_thread_ + if (pd()->rtus_.reduce_src_) { + rp.ws = rtus_space + ithr * pd()->rtus_.space_per_thread_ + _icb * jcp.is * jcp.ic_block; if (ocb == ocb_start) { rp.src = src + data_blk_off(src_d, n, _icb, ih, iw); @@ -274,40 +284,39 @@ void _jit_avx512_common_1x1_convolution_fwd_t } -template struct _jit_avx512_common_1x1_convolution_fwd_t; -template struct _jit_avx512_common_1x1_convolution_fwd_t; -template struct _jit_avx512_common_1x1_convolution_fwd_t; -template struct _jit_avx512_common_1x1_convolution_fwd_t; +template struct jit_avx512_common_1x1_convolution_fwd_t; /* convolution backward wtr data */ template -void _jit_avx512_common_1x1_convolution_bwd_data_t - ::execute_backward_data() -{ + data_type_t diff_src_type> +void jit_avx512_common_1x1_convolution_bwd_data_t::execute_backward_data() const { auto diff_dst = reinterpret_cast (this->input_memory(0)); auto weights = reinterpret_cast (this->input_memory(1)); auto diff_src = reinterpret_cast(this->memory()); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); - const memory_desc_wrapper diff_src_d(conf_.diff_src_pd()); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + const memory_desc_wrapper diff_src_d(pd()->diff_src_pd()); + + auto rtus_space = scratchpad().template get( + key_conv_rtus_space); const int ndims = diff_src_d.ndims(); const auto &jcp = kernel_->jcp; - const int MB = conf_.MB(); + const int MB = pd()->MB(); // TODO (Roma): remove this restriction assert(jcp.stride_w == 1 && jcp.stride_h == 1); - const int stride_h = (ndims == 3) ? 1 : conf_.desc()->strides[0]; - const int stride_w = conf_.desc()->strides[ndims - 3]; - const int pad_t = (ndims == 3) ? 0 : conf_.desc()->padding[0][0]; - const int pad_l = conf_.desc()->padding[0][ndims - 3]; + const int stride_h = (ndims == 3) ? 1 : pd()->desc()->strides[0]; + const int stride_w = pd()->desc()->strides[ndims - 3]; + const int pad_t = (ndims == 3) ? 
0 : pd()->desc()->padding[0][0]; + const int pad_l = pd()->desc()->padding[0][ndims - 3]; const int nb_ic = jcp.nb_load; const int nb_oc = jcp.nb_reduce; @@ -376,8 +385,9 @@ void _jit_avx512_common_1x1_convolution_bwd_data_t const int _icb = g * nb_ic + icb; rp.src = diff_src + data_blk_off(diff_src_d, n, _icb, ih, iw); - if (conf_.rtus_.reduce_src_) { - rp.ws = scratch_ + ithr * ws_per_thread_; + if (pd()->rtus_.reduce_src_) { + rp.ws = rtus_space + + ithr * pd()->rtus_.space_per_thread_; p.output_data = rp.ws; } else p.output_data = rp.src; @@ -395,7 +405,7 @@ void _jit_avx512_common_1x1_convolution_bwd_data_t size_t diff_dst_off = data_blk_off(diff_dst_d, n, _ocb, oh, ow); p.bcast_data = &diff_dst[diff_dst_off]; - p.load_data = &weights[conf_.with_groups() + p.load_data = &weights[pd()->with_groups() ? weights_d.blk_off(g, ocb, icb) : weights_d.blk_off(ocb, icb)]; @@ -406,7 +416,7 @@ void _jit_avx512_common_1x1_convolution_bwd_data_t kernel_->jit_ker(&p); } - if (conf_.rtus_.reduce_src_) + if (pd()->rtus_.reduce_src_) rtus_driver_->ker_(&rp); } } @@ -414,87 +424,81 @@ void _jit_avx512_common_1x1_convolution_bwd_data_t }); } -template struct _jit_avx512_common_1x1_convolution_bwd_data_t; -template struct _jit_avx512_common_1x1_convolution_bwd_data_t; +template struct jit_avx512_common_1x1_convolution_bwd_data_t; /* convolution backward wtr weights */ #define wht_blk_off(d, g, ...) \ - (conf_.with_groups() \ + (pd()->with_groups() \ ? (d).blk_off((g), __VA_ARGS__) \ : (d).blk_off(__VA_ARGS__)) jit_avx512_common_1x1_convolution_bwd_weights_t :: - jit_avx512_common_1x1_convolution_bwd_weights_t(const pd_t *pd, + jit_avx512_common_1x1_convolution_bwd_weights_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs) - , conf_(*pd), kernel_(nullptr), acc_ker_(nullptr), reducer_bias_(nullptr) - , trans_kernel_(nullptr), rtus_driver_(nullptr), ws_per_thread_(0) - , scratch_(nullptr), padded_bias_(nullptr), bctx_(nullptr) - , tr_src_(nullptr), ws_reduction_(nullptr) + : cpu_primitive_t(apd, inputs, outputs) + , kernel_(nullptr), acc_ker_(nullptr), reducer_bias_(nullptr) + , trans_kernel_(nullptr), rtus_driver_(nullptr) { - kernel_ = new jit_avx512_common_1x1_conv_kernel(conf_.jcp_, *conf_.attr()); - - const auto &jcp = kernel_->jcp; - - const int wei_size = jcp.ngroups * jcp.oc * jcp.ic; - ws_reduction_ = - (data_t *)malloc((jcp.nthr_mb - 1) * wei_size * sizeof(data_t), 64); + kernel_ = new jit_avx512_common_1x1_conv_kernel(pd()->jcp_, *pd()->attr()); acc_ker_ = new cpu_accumulator_1d_t(); + reducer_bias_ = new cpu_reducer_t(pd()->reducer_bia_conf_); + init_rtus_driver(this); - if (conf_.with_bias()) { - const size_t max_buffer_size = jcp.nthr * 3 * 5 * 5 * 16 * 16; - reducer_bias_ = new cpu_reducer_t( - reduce_balancer_t(jcp.nthr, jcp.oc_block, - jcp.ngroups * jcp.nb_load, jcp.mb, max_buffer_size)); - - if (conf_.want_padded_bias()) { - assert(jcp.ngroups == 1); - padded_bias_ = (data_t *)malloc(sizeof(data_t) * jcp.oc, 64); - } - } + const auto &jcp = kernel_->jcp; if (jcp.transpose_src) { - const ptrdiff_t tr_src_size = (ptrdiff_t)jcp.nthr_mb - * (ptrdiff_t)jcp.ngroups * (ptrdiff_t)jcp.ic * jcp.tr_is; - tr_src_ = (data_t *)malloc(tr_src_size * sizeof(data_t), 64); - parallel_nd(tr_src_size, [&](ptrdiff_t i) { tr_src_[i] = 0; }); auto tp = jit_transpose4x16_src_t(); tp.src_pf0_distance = 4; tp.tr_src_pf0_distance = 0; tp.src_pf1 = true; tp.tr_src_pf1 = false; trans_kernel_ = new jit_transpose4x16_src(&jcp, &tp); - - bctx_ = 
(simple_barrier::ctx_t *)malloc( - jcp.nthr * sizeof(simple_barrier::ctx_t), 64); - for (int i = 0; i < jcp.nthr; ++i) - simple_barrier::ctx_init(&bctx_[i]); } - - init_rtus_driver(this); } -void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights() +void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights() const { auto src = reinterpret_cast(this->input_memory(0)); auto diff_dst = reinterpret_cast(this->input_memory(1)); auto diff_weights = reinterpret_cast(this->memory(0)); auto diff_bias_in = reinterpret_cast(this->memory(1)); - data_t *diff_bias = conf_.want_padded_bias() ? padded_bias_ : diff_bias_in; - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper diff_weights_d(conf_.diff_weights_pd(0)); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper diff_weights_d(pd()->diff_weights_pd(0)); const auto &jcp = kernel_->jcp; + + const auto scratchpad = this->scratchpad(); + + auto rtus_space = scratchpad.get(key_conv_rtus_space); + data_t *diff_bias = pd()->wants_padded_bias() + ? scratchpad.get(key_conv_padded_bias) : diff_bias_in; + auto wei_reduction = scratchpad.get(key_conv_wei_reduction); + + /* prepare src transposition barriers */ + auto tr_src = scratchpad.get(key_conv_tr_src); + auto tr_src_bctx = scratchpad.get( + key_conv_tr_src_bctx); + if (jcp.transpose_src) { + for (int i = 0; i < jcp.nthr; ++i) + simple_barrier::ctx_init(&tr_src_bctx[i]); + } + const int ndims = src_d.ndims(); const int wei_size = jcp.ngroups * jcp.oc * jcp.ic; simple_barrier::ctx_t reduction_barrier; simple_barrier::ctx_init(&reduction_barrier); + const auto reducer_bia_scratchpad = memory_tracking::grantor_t(scratchpad, + prefix_reducer_bia); + auto rb = this->reducer_bias_; + rb->init(reducer_bia_scratchpad); + // TODO (Roma): remove this restriction assert(jcp.stride_w == 1 && jcp.stride_h == 1); @@ -507,10 +511,10 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights() const int sp_nb = jcp.nb_reduce; const int mb_sp_work = jcp.mb * sp_nb; - const int stride_h = (ndims == 3) ? 1 : conf_.desc()->strides[0]; - const int stride_w = conf_.desc()->strides[ndims - 3]; - const int pad_t = (ndims == 3) ? 0 : conf_.desc()->padding[0][0]; - const int pad_l = conf_.desc()->padding[0][ndims - 3]; + const int stride_h = (ndims == 3) ? 1 : pd()->desc()->strides[0]; + const int stride_w = pd()->desc()->strides[ndims - 3]; + const int pad_t = (ndims == 3) ? 0 : pd()->desc()->padding[0][0]; + const int pad_l = pd()->desc()->padding[0][ndims - 3]; auto step = [](int default_step, int remaining, int tail_step) { assert(default_step <= tail_step); @@ -548,7 +552,7 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights() const int src1_off = data_blk_off(src_d, img, _ic, ih, iw); data_t *src1 = (data_t *)&src[src1_off]; - data_t *tr_src1 = &tr_src_[tr_src_off(ithr_mb, ic_b_tr, is)]; + data_t *tr_src1 = &tr_src[tr_src_off(ithr_mb, ic_b_tr, is)]; assert(jcp.ic_block == 16); const int src_stride = jcp.is * jcp.ic_block; @@ -611,9 +615,8 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights() const int oc_b_work = oc_b_end - oc_b_start; const int ic_b_work = ic_b_end - ic_b_start; - data_t *diff_wei = ithr_mb == 0 ? - diff_weights : - ws_reduction_ + (ithr_mb - 1) * wei_size; + data_t *diff_wei = ithr_mb == 0 + ? 
diff_weights : wei_reduction + (ithr_mb - 1) * wei_size; int sp_b_step = 0; for (int mb_sp_b = mb_sp_b_start; mb_sp_b < mb_sp_b_end; @@ -634,7 +637,7 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights() if (jcp.transpose_src) { if (jcp.nthr_oc_b > 1) simple_barrier::barrier( - &bctx_[ithr_but_oc], jcp.nthr_oc_b); + &tr_src_bctx[ithr_but_oc], jcp.nthr_oc_b); const int sp_size = nstl::min(sp_b_step * jcp.reduce_block, jcp.is - sp_b * jcp.reduce_block); @@ -642,7 +645,7 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights() bcast_step, ithr_oc_b, jcp.nthr_oc_b, ic_b_start); if (jcp.nthr_oc_b > 1) simple_barrier::barrier( - &bctx_[ithr_but_oc], jcp.nthr_oc_b); + &tr_src_bctx[ithr_but_oc], jcp.nthr_oc_b); } for (int oc_b = oc_b_start; oc_b < oc_b_end; @@ -660,7 +663,7 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights() store_to = diff_wei + off; const data_t *diff_src = jcp.transpose_src ? - &tr_src_[tr_src_off(ithr_mb, _ic_b_tr, 0)] : + &tr_src[tr_src_off(ithr_mb, _ic_b_tr, 0)] : &src[src_d.blk_off(img, _ic_b)]; int sp_b_end = sp_b + sp_b_step; @@ -690,7 +693,7 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights() int sp = sp_b * jcp.reduce_block; p.load_data = pdiff_dst + sp * jcp.oc_block; - if (conf_.rtus_.reduce_src_) { + if (pd()->rtus_.reduce_src_) { const int oh = sp / jcp.ow; const int ow = sp % jcp.ow; @@ -698,8 +701,9 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights() const int iw = nstl::max(ow * stride_w - pad_l, 0); rp.iw_start = iw; - rp.ws = scratch_ + ithr * ws_per_thread_ - + sp * jcp.ic_block; + rp.ws = rtus_space + + ithr * pd()->rtus_.space_per_thread_ + + sp * jcp.ic_block; if (ndims == 3) rp.src = local_src + iw @@ -720,7 +724,7 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights() } } - /* diff_weights[:] += sum(ws_reduction_[thr_mb][:]) */ + /* diff_weights[:] += sum(wei_reduction[thr_mb][:]) */ if (jcp.nthr_mb > 1) { simple_barrier::barrier(&reduction_barrier, jcp.nthr); const int work = g_work * oc_b_work * ic_b_work; @@ -747,7 +751,7 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights() const size_t off = wht_blk_off(diff_weights_d, g, oc_b, ic_b); data_t *d = diff_weights + off; - data_t *s = ws_reduction_ + (thr_mb - 1) * wei_size + off; + data_t *s = wei_reduction + (thr_mb - 1) * wei_size + off; acc_ker_->accumulate(d, s, acc_size); @@ -760,11 +764,10 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights() }; auto ker_bias = [&](int ithr, int nthr) { - auto rb = this->reducer_bias_; - assert(nthr == rb->balancer_.nthr_); + assert(nthr == rb->balancer().nthr_); - const int b_job_start = rb->balancer_.ithr_job_off(ithr); - const int b_njobs = rb->balancer_.ithr_njobs(ithr); + const int b_job_start = rb->balancer().ithr_job_off(ithr); + const int b_njobs = rb->balancer().ithr_njobs(ithr); if (b_njobs == 0) return; @@ -772,8 +775,8 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights() /* reduction dimension */ int img_start{ 0 }, img_end{ 0 }; - balance211(jcp.mb, rb->balancer_.nthr_per_group_, - rb->balancer_.id_in_group(ithr), img_start, img_end); + balance211(jcp.mb, rb->balancer().nthr_per_group_, + rb->balancer().id_in_group(ithr), img_start, img_end); /* jobs */ int g_start{ 0 }, ocb_start{ 0 }; @@ -786,8 +789,9 @@ void 
jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights() const size_t _oc = g * jcp.nb_load + ocb; const data_t *d_dst = &diff_dst[diff_dst_d.blk_off(img, _oc)]; - data_t *d_bias = &rb->get_local_ptr( - ithr, diff_bias)[b_job_loc * rb->balancer_.job_size_]; + data_t *d_bias = rb->get_local_ptr(ithr, diff_bias, + reducer_bia_scratchpad) + + b_job_loc * rb->balancer().job_size_; if (img == img_start) for (int o = 0; o < 16; ++o) @@ -803,20 +807,19 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights() nd_iterator_step(g, jcp.ngroups, ocb, jcp.nb_load); } } - rb->reduce(ithr, diff_bias); + rb->reduce(ithr, diff_bias, reducer_bia_scratchpad); }; parallel(jcp.nthr, [&](const int ithr, const int nthr) { ker(ithr, jcp.nthr); - if (conf_.with_bias()) + if (pd()->with_bias()) ker_bias(ithr, jcp.nthr); }); /* TODO: put this in ker_bias */ - if (conf_.want_padded_bias()) { + if (pd()->wants_padded_bias()) { assert(jcp.ngroups == 1); - for (int oc = 0; oc < jcp.oc_without_padding; ++oc) - diff_bias_in[oc] = diff_bias[oc]; + utils::array_copy(diff_bias_in, diff_bias, jcp.oc_without_padding); } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_1x1_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_1x1_convolution.hpp index 7878697..67e8dab 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_1x1_convolution.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_1x1_convolution.hpp @@ -18,37 +18,38 @@ #define CPU_JIT_AVX512_COMMON_1x1_CONVOLUTION_HPP #include "c_types_map.hpp" +#include "memory_tracking.hpp" +#include "mkldnn_thread.hpp" +#include "utils.hpp" + #include "cpu_convolution_pd.hpp" #include "cpu_engine.hpp" #include "cpu_reducer.hpp" + #include "jit_avx512_common_1x1_conv_kernel.hpp" #include "jit_uni_1x1_conv_utils.hpp" #include "jit_transpose_src_utils.hpp" -#include "mkldnn_thread.hpp" -#include "utils.hpp" namespace mkldnn { namespace impl { namespace cpu { -template -struct _jit_avx512_common_1x1_convolution_fwd_t : public cpu_primitive_t { +struct jit_avx512_common_1x1_convolution_fwd_t : public cpu_primitive_t { // TODO: (Roma) Code duplication duplication! Remove with templates // (maybe...)! 
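// The recurring change in this file is the move from per-primitive
// malloc/free members (padded_bias_, ws_reduction_, tr_src_, bctx_) to the
// memory_tracking scratchpad: sizes are booked once against named keys when
// the primitive descriptor is created, and the const execute() path only
// fetches preallocated regions. The sketch below mocks that contract with a
// made-up scratchpad_registry class; it illustrates the pattern under
// assumed key names and sizes and is not the mkl-dnn memory_tracking API.
#include <cassert>
#include <cstddef>
#include <map>
#include <string>
#include <vector>

class scratchpad_registry {
    // key -> (offset, bytes) inside one flat allocation
    std::map<std::string, std::pair<size_t, size_t>> slots_;
    size_t total_ = 0;

public:
    void book(const std::string &key, size_t bytes) {
        slots_[key] = {total_, bytes};
        total_ += bytes;
    }
    size_t size() const { return total_; }
    template <typename T>
    T *get(std::vector<char> &storage, const std::string &key) const {
        auto it = slots_.find(key);
        assert(it != slots_.end() && storage.size() >= total_);
        return reinterpret_cast<T *>(storage.data() + it->second.first);
    }
};

int main() {
    scratchpad_registry registry;
    // "pd time": declare everything the kernels may need (sizes made up).
    registry.book("conv_padded_bias", 64 * sizeof(float));
    registry.book("conv_wei_reduction", 3 * 512 * 512 * sizeof(float));

    // "execute time": one allocation backs every booked region, so the
    // primitive itself holds no mutable buffers.
    std::vector<char> storage(registry.size());
    float *padded_bias = registry.get<float>(storage, "conv_padded_bias");
    float *wei_reduction = registry.get<float>(storage, "conv_wei_reduction");
    padded_bias[0] = 0.f;
    wei_reduction[0] = 0.f;
    return 0;
}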
- struct pd_t: public _cpu_convolution_fwd_pd_t { - pd_t(engine_t *engine, - const typename pd_t::base_desc_t *adesc, + struct pd_t: public cpu_convolution_fwd_pd_t { + pd_t(engine_t *engine, const convolution_desc_t *adesc, const primitive_attr_t *attr, const typename pd_t::base_class *hint_fwd_pd) - : _cpu_convolution_fwd_pd_t(engine, adesc, attr, - hint_fwd_pd) + : cpu_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) , jcp_(), rtus_() {} DECLARE_COMMON_PD_T( JIT_IMPL_NAME_HELPER("jit_1x1:", avx512_common, ""), - _jit_avx512_common_1x1_convolution_fwd_t); + jit_avx512_common_1x1_convolution_fwd_t); virtual status_t init() override { using namespace prop_kind; @@ -56,37 +57,42 @@ struct _jit_avx512_common_1x1_convolution_fwd_t : public cpu_primitive_t { assert(this->engine()->kind() == engine_kind::cpu); bool ok = true && this->set_default_params() == status::success - && utils::one_of(this->cdesc_().prop_kind, forward_training, + && utils::one_of(this->desc()->prop_kind, forward_training, forward_inference) - && this->cdesc_().alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() - && this->cdesc_().src_desc.data_type == src_type - && this->cdesc_().weights_desc.data_type == wei_type - && this->cdesc_().dst_desc.data_type == dst_type + && this->desc()->src_desc.data_type == src_type + && this->desc()->weights_desc.data_type == wei_type + && this->desc()->dst_desc.data_type == dst_type && IMPLICATION(this->with_bias(), - dst_type == this->cdesc_().bias_desc.data_type) - && IMPLICATION(with_relu && dst_type == data_type::s32 - && everyone_is(data_type::s16, src_type, wei_type), - this->negative_slope() == 0.); + dst_type == this->desc()->bias_desc.data_type); if (!ok) return status::unimplemented; - const convolution_desc_t *conv_d = &this->cdesc_(); + const convolution_desc_t *conv_d = this->desc(); const memory_desc_t *src_d = this->src_pd_.desc(); rtus_prepare(this, conv_d, src_d, this->dst_pd_.desc()); - return jit_avx512_common_1x1_conv_kernel::init_conf(jcp_, - *conv_d, *src_d, *this->weights_pd_.desc(), + + status_t status = jit_avx512_common_1x1_conv_kernel::init_conf( + jcp_, *conv_d, *src_d, *this->weights_pd_.desc(), *this->dst_pd_.desc(), *this->attr(), - with_relu, this->negative_slope(), mkldnn_get_max_threads(), rtus_.reduce_src_); + if (status != status::success) return status; + + auto scratchpad = scratchpad_registry().registrar(); + jit_avx512_common_1x1_conv_kernel::init_scratchpad(scratchpad, + jcp_); + + rtus_prepare_space_info(this, scratchpad); + + return status::success; } jit_1x1_conv_conf_t jcp_; - struct reduce_to_unit_stride_t { - convolution_desc_t conv_d_; - bool reduce_src_; - } rtus_; + reduce_to_unit_stride_t rtus_; - protected: + protected: virtual status_t set_default_params() override { using namespace memory_format; if (this->src_pd_.desc()->format == any) @@ -110,78 +116,61 @@ struct _jit_avx512_common_1x1_convolution_fwd_t : public cpu_primitive_t { } if (this->bias_pd_.desc()->format == any) CHECK(this->bias_pd_.set_format(x)); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } }; template friend void init_rtus_driver(conv_t *self); - _jit_avx512_common_1x1_convolution_fwd_t(const pd_t *pd, - const input_vector &inputs, - const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - , kernel_(nullptr), 
rtus_driver_(nullptr), ws_per_thread_(0) - , scratch_(nullptr), padded_bias_(nullptr) - { - kernel_ = new jit_avx512_common_1x1_conv_kernel(conf_.jcp_, - *conf_.attr()); + jit_avx512_common_1x1_convolution_fwd_t(const pd_t *apd, + const input_vector &inputs, const output_vector &outputs) + : cpu_primitive_t(apd, inputs, outputs) + , kernel_(nullptr), rtus_driver_(nullptr) + { + kernel_ = + new jit_avx512_common_1x1_conv_kernel(pd()->jcp_, *pd()->attr()); init_rtus_driver(this); - - if (conf_.want_padded_bias()) { - const auto &j = conf_.jcp_; - assert(j.ngroups == 1); - padded_bias_ = (dst_data_t *)malloc(sizeof(dst_data_t) * j.oc, 64); - for (int oc = j.oc_without_padding; oc < j.oc; ++oc) - padded_bias_[oc] = 0; - } } - ~_jit_avx512_common_1x1_convolution_fwd_t() { + ~jit_avx512_common_1x1_convolution_fwd_t() { delete kernel_; delete rtus_driver_; - free(scratch_); - free(padded_bias_); } typedef typename prec_traits::type src_data_t; typedef typename prec_traits::type wei_data_t; typedef typename prec_traits::type dst_data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); + void execute_forward() const; void execute_forward_thr(const int ithr, const int nthr, const src_data_t *src, const wei_data_t *weights, - const dst_data_t *bias, dst_data_t *dst); - pd_t conf_; + const dst_data_t *bias, dst_data_t *dst, + const memory_tracking::grantor_t &scratchpad) const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + jit_avx512_common_1x1_conv_kernel *kernel_; - /* reduction to unit stride */ rtus_driver_t *rtus_driver_; - size_t ws_per_thread_; - src_data_t *scratch_; - dst_data_t *padded_bias_; }; using jit_avx512_common_1x1_convolution_fwd_f32_t - = _jit_avx512_common_1x1_convolution_fwd_t; -using jit_avx512_common_1x1_convolution_relu_f32_t - = _jit_avx512_common_1x1_convolution_fwd_t; + = jit_avx512_common_1x1_convolution_fwd_t; using jit_avx512_common_1x1_convolution_fwd_s16s16s32_t - = _jit_avx512_common_1x1_convolution_fwd_t; -using jit_avx512_common_1x1_convolution_relu_s16s16s32_t - = _jit_avx512_common_1x1_convolution_fwd_t; template -struct _jit_avx512_common_1x1_convolution_bwd_data_t : public cpu_primitive_t { +struct jit_avx512_common_1x1_convolution_bwd_data_t : public cpu_primitive_t { struct pd_t : public cpu_convolution_bwd_data_pd_t { pd_t(engine_t *engine, const convolution_desc_t *adesc, @@ -192,7 +181,7 @@ struct _jit_avx512_common_1x1_convolution_bwd_data_t : public cpu_primitive_t { DECLARE_COMMON_PD_T( JIT_IMPL_NAME_HELPER("jit_1x1:", avx512_common, ""), - _jit_avx512_common_1x1_convolution_bwd_data_t); + jit_avx512_common_1x1_convolution_bwd_data_t); virtual status_t init() override { using namespace prop_kind; @@ -200,7 +189,8 @@ struct _jit_avx512_common_1x1_convolution_bwd_data_t : public cpu_primitive_t { bool ok = true && this->set_default_params() == status::success && this->desc()->prop_kind == backward_data - && this->desc()->alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() && this->desc()->diff_dst_desc.data_type == diff_dst_type && this->desc()->weights_desc.data_type == wei_type @@ -210,18 +200,25 @@ struct _jit_avx512_common_1x1_convolution_bwd_data_t : public cpu_primitive_t { const convolution_desc_t *conv_d = this->desc(); const memory_desc_t *diff_src_d = this->diff_src_pd_.desc(); 
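// Companion sketch for the other half of the refactor visible above: the
// primitive no longer stores a pd_t copy in a conf_ member. It keeps only a
// const pointer reachable through pd(), every per-run buffer comes from
// caller-provided scratchpad memory, and execute() becomes const. The types
// below are simplified stand-ins, not the mkl-dnn classes.
#include <algorithm>
#include <vector>

struct pd_t {
    int oc = 60;            // illustrative: real output-channel count
    int oc_padded = 64;     // illustrative: oc rounded up to the block size
    bool wants_padded_bias() const { return oc != oc_padded; }
};

struct primitive_t {
    explicit primitive_t(const pd_t *apd) : pd_(apd) {}

    // const execute(): no member is written, so concurrent calls are safe.
    void execute(const float *bias, std::vector<float> &scratchpad) const {
        const float *effective_bias = bias;
        if (pd()->wants_padded_bias()) {
            scratchpad.assign(pd()->oc_padded, 0.f);
            std::copy(bias, bias + pd()->oc, scratchpad.begin());
            effective_bias = scratchpad.data();
        }
        (void)effective_bias;   // a real kernel would consume this pointer
    }

    const pd_t *pd() const { return pd_; }

private:
    const pd_t *pd_;            // reference to the descriptor, not a copy
};

int main() {
    pd_t pd;
    primitive_t prim(&pd);
    std::vector<float> bias(pd.oc, 1.f), scratchpad;
    prim.execute(bias.data(), scratchpad);
    return 0;
}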
rtus_prepare(this, conv_d, diff_src_d, this->diff_dst_pd_.desc()); - return jit_avx512_common_1x1_conv_kernel::init_conf(jcp_, - *conv_d, *diff_src_d, *this->weights_pd_.desc(), - *this->diff_dst_pd_.desc(), *this->attr(), - mkldnn_get_max_threads(), rtus_.reduce_src_); + + status_t status = jit_avx512_common_1x1_conv_kernel::init_conf( + jcp_, *conv_d, *diff_src_d, *this->weights_pd_.desc(), + *this->diff_dst_pd_.desc(), *this->attr(), + mkldnn_get_max_threads(), rtus_.reduce_src_); + if (status != status::success) return status; + + auto scratchpad = scratchpad_registry().registrar(); + jit_avx512_common_1x1_conv_kernel::init_scratchpad(scratchpad, + jcp_); + + rtus_prepare_space_info(this, scratchpad); + + return status::success; } // TODO (Roma): structs conf header cleanup jit_1x1_conv_conf_t jcp_; - struct reduce_to_unit_stride_t { - convolution_desc_t conv_d_; - bool reduce_src_; - } rtus_; + reduce_to_unit_stride_t rtus_; protected: virtual status_t set_default_params() override { @@ -248,6 +245,8 @@ struct _jit_avx512_common_1x1_convolution_bwd_data_t : public cpu_primitive_t { ? pick(this->ndims() - 3, gOIw8o16i2o, gOIhw8o16i2o) : pick(this->ndims() - 3, OIw8o16i2o, OIhw8o16i2o))); } + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } @@ -255,30 +254,28 @@ struct _jit_avx512_common_1x1_convolution_bwd_data_t : public cpu_primitive_t { template friend void init_rtus_driver(conv_t *self); - _jit_avx512_common_1x1_convolution_bwd_data_t(const pd_t *pd, - const input_vector &inputs, - const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - , kernel_(nullptr), rtus_driver_(nullptr), ws_per_thread_(0) - , scratch_(nullptr) + + jit_avx512_common_1x1_convolution_bwd_data_t(const pd_t *apd, + const input_vector &inputs, const output_vector &outputs) + : cpu_primitive_t(apd, inputs, outputs) + , kernel_(nullptr), rtus_driver_(nullptr) { - kernel_ = new jit_avx512_common_1x1_conv_kernel(conf_.jcp_, - *conf_.attr()); + kernel_ = new jit_avx512_common_1x1_conv_kernel(pd()->jcp_, + *pd()->attr()); init_rtus_driver(this); } - ~_jit_avx512_common_1x1_convolution_bwd_data_t() - { + + ~jit_avx512_common_1x1_convolution_bwd_data_t() { delete kernel_; delete rtus_driver_; - free(scratch_); } typedef typename prec_traits::type diff_dst_data_t; typedef typename prec_traits::type wei_data_t; typedef typename prec_traits::type diff_src_data_t; - virtual void execute(event_t *e) { - switch (conf_.desc()->prop_kind) { + virtual void execute(event_t *e) const { + switch (pd()->desc()->prop_kind) { case prop_kind::backward_data: execute_backward_data(); break; @@ -289,19 +286,17 @@ struct _jit_avx512_common_1x1_convolution_bwd_data_t : public cpu_primitive_t { } private: - void execute_backward_data(); - pd_t conf_; + void execute_backward_data() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + jit_avx512_common_1x1_conv_kernel *kernel_; - /* reduction to unit stride */ rtus_driver_t *rtus_driver_; - size_t ws_per_thread_; - diff_src_data_t *scratch_; }; using jit_avx512_common_1x1_convolution_bwd_data_f32_t - = _jit_avx512_common_1x1_convolution_bwd_data_t; + = jit_avx512_common_1x1_convolution_bwd_data_t; using jit_avx512_common_1x1_convolution_bwd_data_s16s16s32_t - = _jit_avx512_common_1x1_convolution_bwd_data_t; struct jit_avx512_common_1x1_convolution_bwd_weights_t : public cpu_primitive_t @@ -324,7 +319,9 @@ struct 
jit_avx512_common_1x1_convolution_bwd_weights_t : public cpu_primitive_t bool ok = true && this->set_default_params() == status::success && this->desc()->prop_kind == backward_weights - && this->desc()->alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() && utils::everyone_is(data_type::f32, this->desc()->src_desc.data_type, @@ -337,19 +334,32 @@ struct jit_avx512_common_1x1_convolution_bwd_weights_t : public cpu_primitive_t const convolution_desc_t *conv_d = this->desc(); const memory_desc_t *src_d = this->src_pd_.desc(); rtus_prepare(this, conv_d, src_d, this->diff_dst_pd_.desc()); - return jit_avx512_common_1x1_conv_kernel::init_conf(jcp_, - *conv_d, *src_d, *this->diff_weights_pd_.desc(), - *this->diff_dst_pd_.desc(), *this->attr(), - mkldnn_get_max_threads(), rtus_.reduce_src_); + + status_t status = jit_avx512_common_1x1_conv_kernel::init_conf( + jcp_, *conv_d, *src_d, *this->diff_weights_pd_.desc(), + *this->diff_dst_pd_.desc(), *this->attr(), + mkldnn_get_max_threads(), rtus_.reduce_src_); + if (status != status::success) return status; + + init_balancers(); + + auto scratchpad = scratchpad_registry().registrar(); + jit_avx512_common_1x1_conv_kernel::init_scratchpad(scratchpad, + jcp_); + + auto reducer_bia_scratchpad = memory_tracking::registrar_t( + scratchpad, memory_tracking::names::prefix_reducer_bia); + reducer_bia_conf_.init_scratchpad(reducer_bia_scratchpad); + + rtus_prepare_space_info(this, scratchpad); + + return status::success; } // TODO (Roma): structs conf header cleanup jit_1x1_conv_conf_t jcp_; - - struct reduce_to_unit_stride_t { - convolution_desc_t conv_d_; - bool reduce_src_; - } rtus_; + cpu_reducer_t::conf_t reducer_bia_conf_; + reduce_to_unit_stride_t rtus_; protected: virtual status_t set_default_params() override { @@ -367,32 +377,40 @@ struct jit_avx512_common_1x1_convolution_bwd_weights_t : public cpu_primitive_t : pick(this->ndims() - 3, OIw16i16o, OIhw16i16o))); if (this->diff_bias_pd_.desc()->format == any) CHECK(this->diff_bias_pd_.set_format(x)); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } + + private: + void init_balancers() { + const size_t max_buffer_size = jcp_.nthr * 3 * 5 * 5 * 16 * 16; + if (with_bias()) { + reducer_bia_conf_.init(reduce_balancer_t(jcp_.nthr, + jcp_.oc_block, jcp_.ngroups * jcp_.nb_load, + jcp_.mb, max_buffer_size)); + } + } }; template friend void init_rtus_driver(conv_t *self); - jit_avx512_common_1x1_convolution_bwd_weights_t(const pd_t *pd, - const input_vector &inputs, - const output_vector &outputs); + + jit_avx512_common_1x1_convolution_bwd_weights_t(const pd_t *apd, + const input_vector &inputs, const output_vector &outputs); + ~jit_avx512_common_1x1_convolution_bwd_weights_t() { delete kernel_; delete acc_ker_; delete reducer_bias_; delete rtus_driver_; delete trans_kernel_; - free(bctx_); - free(ws_reduction_); - free(scratch_); - free(tr_src_); - free(padded_bias_); } typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { - switch (conf_.desc()->prop_kind) { + virtual void execute(event_t *e) const { + switch (pd()->desc()->prop_kind) { case prop_kind::backward_weights: execute_backward_weights(); break; @@ -403,23 +421,14 @@ struct jit_avx512_common_1x1_convolution_bwd_weights_t : public cpu_primitive_t } private: - void execute_backward_weights(); + void 
execute_backward_weights() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } - pd_t conf_; jit_avx512_common_1x1_conv_kernel *kernel_; cpu_accumulator_1d_t *acc_ker_; cpu_reducer_t *reducer_bias_; jit_transpose4x16_src *trans_kernel_; - - /* reduction to unit stride */ rtus_driver_t *rtus_driver_; - size_t ws_per_thread_; - data_t *scratch_; - data_t *padded_bias_; - - simple_barrier::ctx_t *bctx_; - data_t *tr_src_; - data_t *ws_reduction_; }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_conv_kernel.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_conv_kernel.cpp index 7f00356..3206270 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_conv_kernel.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_conv_kernel.cpp @@ -18,6 +18,8 @@ #include "nstl.hpp" #include "type_helpers.hpp" #include "utils.hpp" + +#include "cpu_barrier.hpp" #include "cpu_memory.hpp" #include "jit_avx512_common_conv_kernel.hpp" @@ -30,6 +32,7 @@ namespace impl { namespace cpu { using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; using namespace mkldnn::impl::utils; using namespace Xbyak; @@ -59,32 +62,29 @@ inline void pick_loop_order(jit_conv_conf_t &jcp) { inline bool is_1stconv(const jit_conv_conf_t &jcp) { if (mayiuse(avx512_core) && !mayiuse(avx512_core_vnni)) - return jcp.ic < 16; + return (jcp.ic < 16 && jcp.ngroups == 1); else return one_of(jcp.ic, 1, 3); } -inline bool is_1D_conv(const jit_conv_conf_t &jcp) { - return (jcp.ih == 1 && jcp.kh == 1); -} -inline bool is_ow_threading_available(const jit_conv_conf_t &jcp) { - return (is_1D_conv(jcp) && one_of(jcp.ndims, 3, 4) - && !(jcp.ver == ver_fma && mayiuse(avx512_mic))); -} + inline bool is_ow_threading_on(const jit_conv_conf_t &jcp) { return (jcp.nb_ow > 1); } -inline bool is_1D_prefetching(const jit_conv_conf_t &jcp) { - return (jcp.ver == ver_4fma && is_1D_conv(jcp) && is_ow_threading_on(jcp)); + +inline bool is_owb_prefetching(const jit_conv_conf_t &jcp) { + return (jcp.ver == ver_4fma && is_ow_threading_on(jcp)); } + } -void jit_avx512_common_conv_fwd_kernel::prepare_output(int ur_w) +template +void _jit_avx512_common_conv_fwd_kernel::prepare_output(int ur_w) { for (int k = 0; k < jcp.nb_oc_blocking; k++) for (int j = 0; j < ur_w; j++) { - Zmm zmm = zmm_out(j, k); - vpxord(zmm, zmm, zmm); - if (!is_1D_prefetching(jcp)) { + Vmm vmm = vmm_out(j, k); + vpxord(vmm, vmm, vmm); + if (!is_owb_prefetching(jcp)) { size_t aux_output_offset = get_output_offset(j, k); mic_prefetcht1(EVEX_compress_addr_safe(reg_out_prf, aux_output_offset, reg_out_long_offt)); @@ -92,7 +92,8 @@ void jit_avx512_common_conv_fwd_kernel::prepare_output(int ur_w) } } -void jit_avx512_common_conv_fwd_kernel::store_output(int ur_w) +template +void _jit_avx512_common_conv_fwd_kernel::store_output(int ur_w) { Label no_update_label, store_label, postproc_label; @@ -108,9 +109,9 @@ void jit_avx512_common_conv_fwd_kernel::store_output(int ur_w) for (int k = 0; k < jcp.nb_oc_blocking; k++) for (int j = 0; j < ur_w; j++) { - Zmm zmm = zmm_out(j, k); + Vmm vmm = vmm_out(j, k); size_t aux_output_offset = get_output_offset(j, k); - vadd(zmm, + vadd(vmm, make_safe_addr(reg_out, aux_output_offset, reg_out_long_offt)); } @@ -126,8 +127,8 @@ void jit_avx512_common_conv_fwd_kernel::store_output(int ur_w) for (int k = 0; k < jcp.nb_oc_blocking; k++) { int bias_offset = jcp.typesize_out * k * jcp.oc_block; for (int j = 0; j < ur_w; j++) { - 
Zmm zmm = zmm_out(j, k); - vadd(zmm, EVEX_compress_addr(reg_bias, bias_offset)); + Vmm vmm = vmm_out(j, k); + vadd(vmm, EVEX_compress_addr(reg_bias, bias_offset)); } mic_prefetcht1(EVEX_compress_addr(reg_bias, bias_offset + 64)); } @@ -142,18 +143,29 @@ void jit_avx512_common_conv_fwd_kernel::store_output(int ur_w) int depthwise_inj_idx = 0; const auto &p = attr_.post_ops_; - if (p.len_ == 0 && eltwise_injectors.size() == 1) { - for (int k = 0; k < jcp.nb_oc_blocking; k++) - eltwise_injectors[0]->compute_vector_range( - k*jcp.ur_w, k*jcp.ur_w + ur_w); - } - for (int i = 0; i < p.len_; i++) { auto& post_op = p.entry_[i]; if (post_op.is_eltwise()) { - for (int k = 0; k < jcp.nb_oc_blocking; k++) - eltwise_injectors[eltwise_inj_idx]->compute_vector_range( - k*jcp.ur_w, k*jcp.ur_w + ur_w); + if (jcp.ver == ver_4vnni || jcp.ver == ver_vnni) { + Vmm vmm_zero = vmm_wei; + vpxord(vmm_zero, vmm_zero, vmm_zero); + + for (int k = 0; k < jcp.nb_oc_blocking; k++) + for (int j = 0; j < ur_w; j++) { + Vmm vmm = vmm_out(j, k); + vpcmpd(k1, vmm, vmm_zero, _cmp_lt_os); + vpmulld(vmm | k1, vmm, vmm_zero); + } + } else { + if (ur_w == jcp.ur_w) { + eltwise_injectors[eltwise_inj_idx]->compute_vector_range(0, + jcp.nb_oc_blocking * jcp.ur_w); + } else { + for (int k = 0; k < jcp.nb_oc_blocking; k++) + eltwise_injectors[eltwise_inj_idx]->compute_vector_range(k * jcp.ur_w, + k * jcp.ur_w + ur_w); + } + } eltwise_inj_idx++; } else if (post_op.is_depthwise()) { @@ -178,18 +190,25 @@ void jit_avx512_common_conv_fwd_kernel::store_output(int ur_w) L(store_label); for (int k = 0; k < jcp.nb_oc_blocking; k++) for (int j = 0; j < ur_w; j++) { - Zmm zmm = zmm_out(j, k); + Vmm vmm = vmm_out(j, k); size_t aux_output_offset = (size_t)typesize * ((size_t)k * jcp.od * jcp.oh * jcp.ow + j) * jcp.oc_block; vmovups(EVEX_compress_addr_safe(reg_out, aux_output_offset, - reg_out_long_offt), zmm); - if (!is_1D_prefetching(jcp)) + reg_out_long_offt), vmm); + if (!is_owb_prefetching(jcp)) mic_prefetcht0(EVEX_compress_addr_safe(reg_out_prf, aux_output_offset, reg_out_long_offt)); } } -void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma_1st(int ur_w, +template +void _jit_avx512_common_conv_fwd_kernel::compute_loop_4fma_1st(int ur_w, + int pad_l, int pad_r) +{ +} + +template<> +void _jit_avx512_common_conv_fwd_kernel::compute_loop_4fma_1st(int ur_w, int pad_l, int pad_r) { assert(jcp.dilate_d == 0 && jcp.dilate_h == 0 && jcp.dilate_w == 0); @@ -201,9 +220,7 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma_1st(int ur_w, int ic_block = jcp.ic_block; int oc_block = jcp.oc_block; - Label kh_label, kd_label, skip_kd_loop; - - prepare_output(ur_w); + Label kh_label, kd_label; if (one_of(jcp.ndims, 3, 4)) { mov(aux_reg_inp, reg_inp); @@ -226,18 +243,9 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma_1st(int ur_w, mov(aux_reg_inp_d, reg_inp); mov(aux_reg_inp_d_prf, reg_inp_prf); - if ((jcp.kd - 1) < nstl::max(jcp.f_pad, jcp.back_pad)) { - cmp(reg_ki, 0); - je(skip_kd_loop, T_NEAR); - } L(kd_label); } mov(reg_kj, reg_kh); - Label skip_kh_loop; - if ((jcp.kh - 1) < nstl::max(jcp.t_pad, jcp.b_pad)) { - cmp(reg_kj, 0); - je(skip_kh_loop, T_NEAR); - } if (jcp.ndims == 5) { mov(aux_reg_inp, aux_reg_inp_d); mov(aux_reg_ker, aux_reg_ker_d); @@ -253,10 +261,10 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma_1st(int ur_w, * ((ki + i) * oc_block + ic * kw * jcp.kh * jcp.kd * oc_block); if (ki + i < kw) - vmovups(zmm_ker(i), + vmovups(vmm_ker(i), EVEX_compress_addr(aux_reg_ker, aux_ker_offset)); else - 
vpxord(zmm_ker(i), zmm_ker(i), zmm_ker(i)); + vpxord(vmm_ker(i), vmm_ker(i), vmm_ker(i)); } int j_start = get_ow_start(ki, pad_l); @@ -266,7 +274,7 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma_1st(int ur_w, size_t aux_input_offset = (size_t)jcp.typesize_in * ((size_t)(ki + j * stride_w - pad_l) + (size_t)ic * iw * ih * jcp.id); - v4fmaddps(zmm_out(j, 0), zmm_ker(0), + v4fmaddps(vmm_out(j, 0), vmm_ker(0), EVEX_compress_addr_safe(aux_reg_inp, aux_input_offset, reg_long_offt)); if (ki + prf_count < kw && prf_count < 4 @@ -299,8 +307,6 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma_1st(int ur_w, cmp(reg_kj, 0); jg(kh_label, T_NEAR); - L(skip_kh_loop); - if (jcp.ndims == 5) { add(aux_reg_inp_d, typesize * jcp.ih * jcp.iw); add(aux_reg_ker_d, typesize * jcp.kw * jcp.kh * oc_block); @@ -309,23 +315,28 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma_1st(int ur_w, dec(reg_ki); cmp(reg_ki, 0); jg(kd_label, T_NEAR); - L(skip_kd_loop); pop(reg_out); pop(reg_out_prf); } - store_output(ur_w); if (max_input_offset > INT_MAX) pop(reg_inp_prf); } -void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma(int ur_w, +template +void _jit_avx512_common_conv_fwd_kernel::compute_loop_4fma(int ur_w, + int pad_l, int pad_r) +{ +} + +template<> +void _jit_avx512_common_conv_fwd_kernel::compute_loop_4fma(int ur_w, int pad_l, int pad_r) { int stride_w = jcp.stride_w; int ic_block = jcp.ic_block; int oc_block = jcp.oc_block; - Label kh_label, last_iter_label, loop_end_label, kd_label, skip_kd_loop; + Label kh_label, last_iter_label, loop_end_label, kd_label; int ker_load_number = 4; int shift_kernel_ptr = typesize * jcp.kw * jcp.oc_block * jcp.ic_block; int shift_input_ptr = typesize * (jcp.dilate_h + 1) * jcp.iw * jcp.ic_block; @@ -347,7 +358,7 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma(int ur_w, auto kernel_loads = [=](int ki, int ic, int kk) { for (int ii = 0; ii < ker_load_number; ii++) { int aux_kernel_offset = kernel_offset(kk, ic + ii, ki); - vmovups(zmm_ker(ii), + vmovups(vmm_ker(ii), EVEX_compress_addr(aux_reg_ker, aux_kernel_offset)); } }; @@ -364,8 +375,6 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma(int ur_w, } }; - prepare_output(ur_w); - if (one_of(jcp.ndims, 3, 4)) { mov(aux_reg_inp, reg_inp); mov(aux_reg_ker, reg_ker); @@ -382,21 +391,11 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma(int ur_w, mov(aux_reg_inp_d, reg_inp); mov(aux_reg_inp_d_prf, reg_inp_prf); mov(aux_reg_ker_d_prf, reg_ker_prf); - - if ((jcp.kd - 1) * (jcp.dilate_d + 1) < nstl::max(jcp.f_pad, jcp.back_pad)) { - cmp(reg_ki, 0); - je(skip_kd_loop, T_NEAR); - } L(kd_label); mov(reg_kj, ptr[param1 + GET_OFF(kh_padding)]); } else { mov(reg_kj, reg_kh); } - Label skip_kh_loop; - if ((jcp.kh - 1) * (jcp.dilate_h + 1) < nstl::max(jcp.t_pad, jcp.b_pad)) { - cmp(reg_kj, 0); - je(skip_kh_loop, T_NEAR); - } if (jcp.ndims == 5) { mov(aux_reg_inp, aux_reg_inp_d); mov(aux_reg_ker, aux_reg_ker_d); @@ -427,7 +426,7 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma(int ur_w, * ((ki * (jcp.dilate_w + 1) + oi * stride_w - pad_l) * ic_block + ic); - v4fmaddps(zmm_out(oi, kk), zmm_ker(0), + v4fmaddps(vmm_out(oi, kk), vmm_ker(0), EVEX_compress_addr(aux_reg_inp, aux_input_offset)); if (oi % 2) { @@ -468,7 +467,7 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma(int ur_w, * ((ki * (jcp.dilate_w + 1) + oi * stride_w - pad_l) * ic_block + ic); - v4fmaddps(zmm_out(oi, kk), zmm_ker(0), + v4fmaddps(vmm_out(oi, kk), vmm_ker(0), 
EVEX_compress_addr(aux_reg_inp, aux_input_offset)); if (oi % 2) { @@ -499,11 +498,11 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma(int ur_w, int aux_input_offset = typesize * ((ki * (jcp.dilate_w + 1) + oi * stride_w - pad_l) * ic_block + ic); - v4fmaddps(zmm_out(oi, kk), zmm_ker(0), + v4fmaddps(vmm_out(oi, kk), vmm_ker(0), EVEX_compress_addr(aux_reg_inp, aux_input_offset)); - if (!is_1D_prefetching(jcp)) { + if (!is_owb_prefetching(jcp)) { if ((oi % 2) && (prf_count_t1 < 4)) { mic_prefetcht1(EVEX_compress_addr( aux_reg_ker_prf, kernel_offset(kk, @@ -521,7 +520,7 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma(int ur_w, prf_count_t0++; } } - if (!is_1D_prefetching(jcp)) { + if (!is_owb_prefetching(jcp)) { if (pref_current_inp) { if (ki == 0 && ic == 0 && kk == 0) mic_prefetcht0(EVEX_compress_addr( @@ -560,8 +559,6 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma(int ur_w, cmp(reg_kj, 0); jg(kh_label, T_NEAR); - L(skip_kh_loop); - if (jcp.ndims == 5) { add(aux_reg_inp_d, typesize * (jcp.dilate_d + 1) * jcp.ih * jcp.iw * jcp.ic_block); @@ -575,16 +572,14 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma(int ur_w, dec(reg_ki); cmp(reg_ki, 0); jg(kd_label, T_NEAR); - L(skip_kd_loop); pop(reg_out); pop(reg_out_prf); } - - store_output(ur_w); } -void jit_avx512_common_conv_fwd_kernel::compute_loop_fma(int ur_w, +template +void _jit_avx512_common_conv_fwd_kernel::compute_loop_fma(int ur_w, int pad_l, int pad_r) { bool prf_ker = true; @@ -597,20 +592,19 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_fma(int ur_w, int ic_block = jcp.ic_block; int oc_block = jcp.oc_block; int nb_oc_block = jcp.nb_oc_blocking; - Label kh_label, kd_label, skip_kd_loop; + Label kh_label, kd_label; int ker_pipeline_depth = 4; assert(ker_reg_base_idx + ker_pipeline_depth <= 32); assert(oc_block >= ker_pipeline_depth); int num_ker_loads = ic_block * nb_oc_block * kw; - const int simd_w = 16; int num_ker_prfs = prf_ker ? num_ker_loads : 0; int num_inp_prfs = prf_inp ? ur_w * nstl::min(kw, stride_w) + nstl::max(0, kw - stride_w) : 0; if (jcp.is_1stconv && prf_inp) { - num_inp_prfs = div_up(num_inp_prfs, simd_w) * ic_block; + num_inp_prfs = div_up(num_inp_prfs, jcp.simd_w) * ic_block; } int num_prfs = num_ker_prfs + num_inp_prfs; int num_fmas = num_ker_loads * ur_w; @@ -619,8 +613,6 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_fma(int ur_w, int prf_inst_trigger = (num_fmas % prf_inst_spacing) / 2; int inp_mul = !jcp.is_1stconv ? 
ic_block : 1; - prepare_output(ur_w); - if (one_of(jcp.ndims, 3, 4)) { mov(aux_reg_inp, reg_inp); mov(aux_reg_ker, reg_ker); @@ -643,20 +635,11 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_fma(int ur_w, mov(aux_reg_inp_d_prf, reg_inp_prf); mov(aux_reg_ker_d_prf, reg_ker_prf); - if ((jcp.kd - 1) * (jcp.dilate_d + 1) < nstl::max(jcp.f_pad, jcp.back_pad)) { - cmp(reg_ki, 0); - je(skip_kd_loop, T_NEAR); - } L(kd_label); mov(reg_kj, ptr[param1 + GET_OFF(kh_padding)]); } else { mov(reg_kj, reg_kh); } - Label skip_kh_loop; - if ((jcp.kh - 1) * (jcp.dilate_h + 1) < nstl::max(jcp.t_pad, jcp.b_pad)) { - cmp(reg_kj, 0); - je(skip_kh_loop, T_NEAR); - } if (jcp.ndims == 5) { mov(aux_reg_inp, aux_reg_inp_d); @@ -676,7 +659,7 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_fma(int ur_w, if (step == 0) { for (int i = 0; i < ker_pipeline_depth; i++) { aux_kernel_offset = get_kernel_offset(ki, ic, 0, i); - vmovups(zmm_ker(i), EVEX_compress_addr( + vmovups(vmm_ker(i), EVEX_compress_addr( aux_reg_ker, aux_kernel_offset)); } } else if (step < num_ker_loads - ker_pipeline_depth + 1) { @@ -685,19 +668,19 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_fma(int ur_w, = (step + load_offset) % ker_pipeline_depth; aux_kernel_offset = get_kernel_offset(ki, ic, 0, load_offset); - vmovups(zmm_ker(ker_load_reg_idx), + vmovups(vmm_ker(ker_load_reg_idx), EVEX_compress_addr(aux_reg_ker, aux_kernel_offset)); } bool ker_prf_inserted = false; - Zmm zmm_kernel = zmm_ker(step % ker_pipeline_depth); + Vmm vmm_kernel = vmm_ker(step % ker_pipeline_depth); int j_start = get_ow_start(ki, pad_l); int j_end = get_ow_end(ur_w, ki, pad_r); for (int j = j_start; j < j_end; j++) { size_t aux_input_offset = get_input_offset(ki, ic, j, pad_l); auto addr = EVEX_compress_addr_safe(aux_reg_inp, aux_input_offset, reg_long_offt, true); - vfmadd231ps(zmm_out(j, 0), zmm_kernel, addr); + vfmadd231ps(vmm_out(j, 0), vmm_kernel, addr); int fma_idx = step * ur_w + j; int prf_slot_idx = fma_idx / prf_inst_spacing; if (fma_idx % prf_inst_spacing == prf_inst_trigger) { @@ -724,7 +707,7 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_fma(int ur_w, size_t ic_prf_stride = (size_t)jcp.typesize_in * iw * ih * id; size_t iw_prf_stride - = jcp.typesize_in * simd_w; + = jcp.typesize_in * jcp.simd_w; inp_prf_offset = ((inp_prf_idx / ic_block) * iw_prf_stride + (inp_prf_idx % ic_block) @@ -752,7 +735,6 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_fma(int ur_w, jg(kh_label, T_NEAR); } - L(skip_kh_loop); if (jcp.ndims == 5) { add(aux_reg_inp_d, @@ -767,16 +749,15 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_fma(int ur_w, dec(reg_ki); cmp(reg_ki, 0); jg(kd_label, T_NEAR); - L(skip_kd_loop); pop(reg_out); pop(reg_out_prf); } if (max_input_offset > INT_MAX) pop(reg_inp_prf); - store_output(ur_w); } -void jit_avx512_common_conv_fwd_kernel::compute_loop_fma_core(int ur_w, +template +void _jit_avx512_common_conv_fwd_kernel::compute_loop_fma_core(int ur_w, int pad_l, int pad_r) { int kw = jcp.kw; @@ -784,7 +765,7 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_fma_core(int ur_w, int ic_block = jcp.ic_block; int oc_block = jcp.oc_block; int nb_oc_block = jcp.nb_oc_blocking; - Label kh_label, skip_kh_loop, kd_label, skip_kd_loop; + Label kh_label, kd_label; int shift_kernel_ptr = jcp.typesize_in * jcp.kw * jcp.oc_block * jcp.ic_block; int inp_mul = !jcp.is_1stconv ? ic_block : 1; @@ -799,8 +780,6 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_fma_core(int ur_w, * (!jcp.is_1stconv ? 
1 : (size_t)jcp.iw * jcp.ih * jcp.id)); }; - prepare_output(ur_w); - if (one_of(jcp.ndims, 3, 4)) { mov(aux_reg_inp, reg_inp); mov(aux_reg_ker, reg_ker); @@ -813,19 +792,11 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_fma_core(int ur_w, mov(aux_reg_ker_d, ptr[param1 + GET_OFF(filt)]); mov(aux_reg_inp_d, reg_inp); - if ((jcp.kd - 1) * (jcp.dilate_d + 1) < nstl::max(jcp.f_pad, jcp.back_pad)) { - cmp(reg_ki, 0); - je(skip_kd_loop, T_NEAR); - } L(kd_label); mov(reg_kj, ptr[param1 + GET_OFF(kh_padding)]); } else { mov(reg_kj, reg_kh); } - if ((jcp.kh - 1) * (jcp.dilate_h + 1) < nstl::max(jcp.t_pad, jcp.b_pad)) { - cmp(reg_kj, 0); - je(skip_kh_loop, T_NEAR); - } if (jcp.ndims == 5) { mov(aux_reg_inp, aux_reg_inp_d); @@ -841,7 +812,7 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_fma_core(int ur_w, if (jcp.kernel_kind == expl_bcast) { for (int jj = jj_start; jj < jj_end; jj++) { size_t aux_input_offset = input_offset(jj, ic, ki); - vbroadcastss(zmm_inp(jj, nb_oc_block), + vbroadcastss(vmm_inp(jj, nb_oc_block), EVEX_compress_addr_safe(aux_reg_inp, aux_input_offset, reg_long_offt)); } @@ -851,15 +822,15 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_fma_core(int ur_w, * (ii * jcp.nb_ic * jcp.kh * jcp.kw * jcp.kd * ic_block * oc_block + ki * ic_block * oc_block + ic * oc_block); if (jj_end - jj_start > 0) - vmovups(zmm_wei, EVEX_compress_addr(aux_reg_ker, + vmovups(vmm_wei, EVEX_compress_addr(aux_reg_ker, aux_kernel_offset)); for (int jj = jj_start; jj < jj_end; jj++) if (jcp.kernel_kind == expl_bcast) - vfmadd231ps(zmm_out(jj, ii), - zmm_inp(jj, nb_oc_block), zmm_wei); + vfmadd231ps(vmm_out(jj, ii), + vmm_inp(jj, nb_oc_block), vmm_wei); else { size_t aux_input_offset = input_offset(jj, ic, ki); - vfmadd231ps(zmm_out(jj, ii), zmm_wei, + vfmadd231ps(vmm_out(jj, ii), vmm_wei, EVEX_compress_addr_safe(aux_reg_inp, aux_input_offset, reg_long_offt, true)); } @@ -872,7 +843,6 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_fma_core(int ur_w, cmp(reg_kj, 0); jg(kh_label, T_NEAR); } - L(skip_kh_loop); if (jcp.ndims == 5) { add(aux_reg_inp_d, @@ -883,15 +853,19 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_fma_core(int ur_w, dec(reg_ki); cmp(reg_ki, 0); jg(kd_label, T_NEAR); - L(skip_kd_loop); pop(reg_out); } +} - store_output(ur_w); +template +void _jit_avx512_common_conv_fwd_kernel::compute_loop_vnni( + int ur_w, int pad_l, int pad_r) +{ } -void jit_avx512_common_conv_fwd_kernel::compute_loop_vnni( +template<> +void _jit_avx512_common_conv_fwd_kernel::compute_loop_vnni( int ur_w, int pad_l, int pad_r) { Label kh_label, kd_label; @@ -908,7 +882,6 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_vnni( assert(reg_inp_prf == reg_long_offt); if (max_input_offset > INT_MAX) push(reg_inp_prf); - prepare_output(ur_w); if (one_of(jcp.ndims, 3, 4)) { mov(aux_reg_inp, reg_inp); @@ -917,8 +890,6 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_vnni( mov(aux_reg_inp_prf, reg_inp_prf); } - Label skip_kh_loop, skip_kd_loop; - if (jcp.ndims == 5) { push(reg_out_prf); push(reg_out); @@ -929,19 +900,11 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_vnni( mov(aux_reg_inp_d_prf, reg_inp_prf); mov(aux_reg_ker_d_prf, reg_ker_prf); - if ((jcp.kd - 1) * (jcp.dilate_d + 1) < nstl::max(jcp.f_pad, jcp.back_pad)) { - cmp(reg_ki, 0); - je(skip_kd_loop, T_NEAR); - } L(kd_label); mov(reg_kj, ptr[param1 + GET_OFF(kh_padding)]); } else { mov(reg_kj, reg_kh); } - if ((jcp.kh - 1) * (jcp.dilate_h + 1) < nstl::max(jcp.t_pad, jcp.b_pad)) { - cmp(reg_kj, 0); - 
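For reference, the arithmetic carried out by the fma_core block above, collapsed to a single (ic, ki) tap and written as plain loops. In the expl_bcast flavor one input scalar is broadcast against nb_oc_block weight vectors; names and the output layout here are illustrative, not the kernel's:

    // Scalar reference for one expl_bcast fma_core tap:
    // vbroadcastss(inp[jj]) feeding vfmadd231ps against each weight vector.
    void fma_core_ref(const float *inp, const float *wei, float *out,
                      int ur_w, int nb_oc_block, int oc_block) {
        for (int jj = 0; jj < ur_w; ++jj)            // broadcast source
            for (int ii = 0; ii < nb_oc_block; ++ii)
                for (int oc = 0; oc < oc_block; ++oc) // one vector lane each
                    out[(jj * nb_oc_block + ii) * oc_block + oc] +=
                            inp[jj] * wei[ii * oc_block + oc];
    }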
je(skip_kh_loop, T_NEAR); - } if (jcp.ndims == 5) { mov(aux_reg_inp, aux_reg_inp_d); mov(aux_reg_ker, aux_reg_ker_d); @@ -957,7 +920,7 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_vnni( if (jcp.kernel_kind == expl_bcast) { for (int oi = ow_start; oi < ow_end; oi++) { size_t input_offset = get_input_offset(ki, ic, oi, pad_l); - vpbroadcastd(zmm_inp(oi, jcp.nb_oc_blocking), + vpbroadcastd(vmm_inp(oi, jcp.nb_oc_blocking), EVEX_compress_addr_safe(aux_reg_inp, input_offset, reg_long_offt)); } @@ -965,7 +928,7 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_vnni( for (int kk = 0; kk < jcp.nb_oc_blocking; kk++) { if (jcp.kernel_kind == expl_bcast) { int kernel_offset = get_kernel_offset(ki, ic, kk, 0); - vmovups(zmm_wei, + vmovups(vmm_wei, EVEX_compress_addr(aux_reg_ker, kernel_offset)); } else { for (int ii = 0; ii < ker_load_number; ii++) { @@ -979,12 +942,17 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_vnni( for (int oi = ow_start, prf_count = 0; oi < ow_end; oi++) { size_t input_offset = get_input_offset(ki, ic, oi, pad_l); if (jcp.kernel_kind == expl_bcast) { - vpdpwssd(zmm_out(oi, kk), zmm_wei, - zmm_inp(oi, jcp.nb_oc_blocking)); + vpdpwssd(vmm_out(oi, kk), vmm_wei, + vmm_inp(oi, jcp.nb_oc_blocking)); } else { - vpXdpwssd(zmm_out(oi, kk), Zmm(ker_reg_base_idx), - EVEX_compress_addr_safe(aux_reg_inp, input_offset, - reg_long_offt, jcp.ver != ver_4vnni)); + if (jcp.ver == ver_4vnni) + vp4dpwssd(vmm_out(oi, kk), Zmm(ker_reg_base_idx), + EVEX_compress_addr_safe(aux_reg_inp, + input_offset, reg_long_offt, false)); + else + vpdpwssd(vmm_out(oi, kk), Zmm(ker_reg_base_idx), + EVEX_compress_addr_safe(aux_reg_inp, + input_offset, reg_long_offt, true)); } if ((oi % 2) && (prf_count < ker_load_number)) { int kernel_offset = get_kernel_offset( @@ -1014,8 +982,6 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_vnni( jg(kh_label, T_NEAR); } - L(skip_kh_loop); - if (jcp.ndims == 5) { add(aux_reg_inp_d, jcp.typesize_in * jcp.ih * jcp.iw * jcp.ic_block); add(aux_reg_ker_d, jcp.typesize_in * jcp.kw * jcp.kh * jcp.oc_block @@ -1027,19 +993,37 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_vnni( dec(reg_ki); cmp(reg_ki, 0); jg(kd_label, T_NEAR); - L(skip_kd_loop); pop(reg_out); pop(reg_out_prf); } if (max_input_offset > INT_MAX) pop(reg_inp_prf); - store_output(ur_w); } -void jit_avx512_common_conv_fwd_kernel::compute_loop(int ur_w, +template +void _jit_avx512_common_conv_fwd_kernel::compute_loop(int ur_w, int pad_l, int pad_r) { if (jcp.ndims == 5) push(reg_oi); + + prepare_output(ur_w); + + Label skip_compute_loop; + if (jcp.ndims == 5) { + if ((jcp.dilate_d >= jcp.id) + || (jcp.kd - 1) * (jcp.dilate_d + 1) < nstl::max(jcp.f_pad, jcp.back_pad)) { + mov(reg_kj, ptr[param1 + GET_OFF(kd_padding)]); + cmp(reg_kj, 0); + je(skip_compute_loop, T_NEAR); + } + } + if ((jcp.dilate_h >= jcp.ih) + || (jcp.kh - 1) * (jcp.dilate_h + 1) < nstl::max(jcp.t_pad, jcp.b_pad)) { + mov(reg_kj, ptr[param1 + GET_OFF(kh_padding)]); + cmp(reg_kj, 0); + je(skip_compute_loop, T_NEAR); + } + if (jcp.ver == ver_4vnni || jcp.ver == ver_vnni) compute_loop_vnni(ur_w, pad_l, pad_r); else if (jcp.ver == ver_4fma) @@ -1058,17 +1042,15 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop(int ur_w, compute_loop_fma_core(ur_w, pad_l, pad_r); else assert(!"unknown convolution version"); + + L(skip_compute_loop); + store_output(ur_w); if (jcp.ndims == 5) pop(reg_oi); } -void jit_avx512_common_conv_fwd_kernel::generate() +template +void _jit_avx512_common_conv_fwd_kernel::generate() { - if 
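The vnni path above accumulates with vpdpwssd / vp4dpwssd. Per 32-bit lane, vpdpwssd performs a two-way signed 16-bit dot product accumulated into int32 (without saturation), which a scalar model makes explicit:

    #include <cstdint>

    // Scalar semantics of one vpdpwssd lane: two adjacent int16 products
    // are summed and added to the int32 accumulator.
    int32_t vpdpwssd_lane(int32_t acc, int16_t s0, int16_t s1,
                          int16_t w0, int16_t w1) {
        return acc + (int32_t)s0 * w0 + (int32_t)s1 * w1;
    }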
(jcp.with_eltwise) { - eltwise_injectors.push_back(new jit_uni_eltwise_injector_f32( - this, jcp.eltwise_alg, jcp.eltwise_alpha, 0 - )); - } - const auto &p = attr_.post_ops_; for (int i = 0; i < p.len_; i++) { auto &post_op = p.entry_[i]; @@ -1318,17 +1300,10 @@ bool jit_avx512_common_conv_fwd_kernel::post_ops_ok( auto is_simple = [&](int idx) { return is_eltwise(idx) || is_depthwise(idx); }; switch (p.len_) { - case 0: return true; // no post_ops - case 1: - return true // sum OR eltwise OR depthwise - && !jcp.with_eltwise && (is_simple(0) || is_sum(0)); - case 2: - return true // sum->relu - && !jcp.with_eltwise && ((is_sum(0) && is_simple(1)) || - (is_simple(0) && is_simple(1))); - case 3: - return true // sum->relu - && !jcp.with_eltwise && (is_sum(0) && is_simple(1) && is_simple(2)); + case 0: return true; + case 1: return is_simple(0) || is_sum(0); + case 2: return (is_sum(0) && is_simple(1)) || (is_simple(0) && is_simple(1)); + case 3: return is_sum(0) && is_simple(1) && is_simple(2); default: return false; } @@ -1336,25 +1311,22 @@ bool jit_avx512_common_conv_fwd_kernel::post_ops_ok( } status_t jit_avx512_common_conv_fwd_kernel::init_conf( - jit_conv_conf_t &jcp, - const convolution_desc_t &cd, cpu_memory_t::pd_t &src_pd, - cpu_memory_t::pd_t &weights_pd, cpu_memory_t::pd_t &dst_pd, - cpu_memory_t::pd_t &bias_pd, const primitive_attr_t &attr, - int nthreads, bool with_relu, float relu_negative_slope) + jit_conv_conf_t &jcp, const convolution_desc_t &cd, + cpu_memory_t::pd_t &src_pd, cpu_memory_t::pd_t &weights_pd, + cpu_memory_t::pd_t &dst_pd, cpu_memory_t::pd_t &bias_pd, + const primitive_attr_t &attr, int nthreads) { using namespace prop_kind; if (!mayiuse(avx512_common)) return status::unimplemented; - const int simd_w = cpu_isa_traits::vlen / sizeof(float); - const memory_desc_wrapper src_d(&src_pd); const memory_desc_wrapper weights_d(&weights_pd); const memory_desc_wrapper dst_d(&dst_pd); const memory_desc_wrapper bias_d(&bias_pd); - int regs = 28; + const int regs = 28; const bool with_groups = weights_d.ndims() == src_d.ndims() + 1; int ndims = src_d.ndims(); @@ -1382,9 +1354,6 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf( jcp.stride_h = (ndims == 3) ? 1 : cd.strides[ndims-4]; jcp.stride_w = cd.strides[ndims-3]; jcp.src_fmt = src_d.format(); - jcp.with_eltwise = with_relu; - jcp.eltwise_alg = mkldnn_eltwise_relu; - jcp.eltwise_alpha = relu_negative_slope; jcp.dilate_d = (ndims == 5) ? cd.dilates[0] : 0; jcp.dilate_h = (ndims == 3) ? 0 : cd.dilates[ndims-4]; @@ -1397,14 +1366,26 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf( jcp.is_1stconv = is_1stconv(jcp); - jcp.oc_block = simd_w; - jcp.ic_block = jcp.is_1stconv ? jcp.ic : simd_w; - jcp.aligned_threads = 0; - bool ok_to_pad_channels = true && jcp.ngroups == 1 && src_d.data_type() == data_type::f32; + const int full_simd_w = cpu_isa_traits::vlen / sizeof(float); + jcp.simd_w = full_simd_w; + bool ok_to_try_xmm = true + && mayiuse(avx512_core) + && src_d.data_type() == data_type::f32 + && !jcp.is_1stconv + && !ok_to_pad_channels + && (jcp.ic % jcp.simd_w != 0 || jcp.oc % jcp.simd_w != 0) + && (jcp.ic % 8 != 0 || jcp.oc % 8 != 0); + if (ok_to_try_xmm) + jcp.simd_w = 4; + + jcp.oc_block = jcp.simd_w; + jcp.ic_block = jcp.is_1stconv ? 
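The simplified post_ops_ok above reduces to: at most three entries, a sum only in the leading position, and every other entry a "simple" (eltwise or depthwise) op. As a standalone predicate over an encoded chain ('s' = sum, 'e' = simple; the encoding is illustrative):

    #include <vector>

    bool post_ops_ok_sketch(const std::vector<char> &p) {
        switch (p.size()) {
        case 0: return true;
        case 1: return p[0] == 'e' || p[0] == 's';
        case 2: return (p[0] == 's' && p[1] == 'e')
                    || (p[0] == 'e' && p[1] == 'e');
        case 3: return p[0] == 's' && p[1] == 'e' && p[2] == 'e';
        default: return false;
        }
    }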
jcp.ic : jcp.simd_w; + jcp.aligned_threads = 0; + if (ok_to_pad_channels) { jcp.oc = rnd_up(jcp.oc, jcp.oc_block); jcp.ic = rnd_up(jcp.ic, jcp.ic_block); @@ -1420,14 +1401,28 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf( const auto &p = attr.post_ops_; jcp.with_sum = p.find(primitive_kind::sum) != -1; + const int eltwise_ind = p.find(primitive_kind::eltwise); + jcp.with_eltwise = eltwise_ind != -1; + if (jcp.with_eltwise) { + jcp.eltwise = p.entry_[eltwise_ind].eltwise; + if (dst_d.data_type() == data_type::s32) return status::unimplemented; + } auto src_format = jcp.is_1stconv ? pick(ndims - 3, ncw, nchw, ncdhw) + : ((jcp.simd_w == 4) + ? pick(ndims - 3, nCw4c, nChw4c, nCdhw4c) + : pick(ndims - 3, nCw16c, nChw16c, nCdhw16c)); + auto dst_format = (jcp.simd_w == 4) + ? pick(ndims - 3, nCw4c, nChw4c, nCdhw4c) : pick(ndims - 3, nCw16c, nChw16c, nCdhw16c); - auto dst_format = pick(ndims - 3, nCw16c, nChw16c, nCdhw16c); auto wei_format = with_groups - ? pick(ndims - 3, gOIw16i16o, gOIhw16i16o, gOIdhw16i16o) - : pick(ndims - 3, OIw16i16o, OIhw16i16o, OIdhw16i16o); + ? ((jcp.simd_w == 4) + ? pick(ndims - 3, gOIw4i4o, gOIhw4i4o, gOIdhw4i4o) + : pick(ndims - 3, gOIw16i16o, gOIhw16i16o, gOIdhw16i16o)) + : ((jcp.simd_w == 4) + ? pick(ndims - 3, OIw4i4o, OIhw4i4o, OIdhw4i4o) + : pick(ndims - 3, OIw16i16o, OIhw16i16o, OIdhw16i16o)); if (src_d.format() == any) CHECK(src_pd.set_format(src_format)); @@ -1491,16 +1486,24 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf( jcp.ver = ver_fma; if (jcp.ver == ver_4fma) { const auto w_format = with_groups - ? pick(ndims - 3, gOiw16o, gOihw16o, gOidhw16o) - : pick(ndims - 3, Oiw16o, Oihw16o, Oidhw16o); + ? ((jcp.simd_w == 4) + ? pick(ndims - 3, gOiw4o, gOihw4o, gOidhw4o) + : pick(ndims - 3, gOiw16o, gOihw16o, gOidhw16o)) + : ((jcp.simd_w == 4) + ? pick(ndims - 3, Oiw4o, Oihw4o, Oidhw4o) + : pick(ndims - 3, Oiw16o, Oihw16o, Oidhw16o)); if (weights_d.format() == any) CHECK(weights_pd.set_format(w_format)); if (weights_d.format() != w_format) return status::unimplemented; } else { const auto w_format = with_groups - ? pick(ndims - 3, gOwi16o, gOhwi16o, gOdhwi16o) - : pick(ndims - 3, Owi16o, Ohwi16o, Odhwi16o); + ? ((jcp.simd_w == 4) + ? pick(ndims - 3, gOwi4o, gOhwi4o, gOdhwi4o) + : pick(ndims - 3, gOwi16o, gOhwi16o, gOdhwi16o)) + : ((jcp.simd_w == 4) + ? pick(ndims - 3, Owi4o, Ohwi4o, Odhwi4o) + : pick(ndims - 3, Owi16o, Ohwi16o, Odhwi16o)); if (weights_d.format() == any) CHECK(weights_pd.set_format(w_format)); if (weights_d.format() != w_format) @@ -1561,10 +1564,25 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf( } } + /* Grouped channel offset to support 'non-blocked data' format for + * convolution sizes with '(input_channel / ngroups) < simd' */ + jcp.nonblk_group_off + = (jcp.ngroups > 1 && one_of(src_d.format(), ncw, nchw, ncdhw)) ? 
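The SIMD-width fallback introduced above, which drives the nCw4c/nChw4c/nCdhw4c format picks, isolated into a helper. The flags mirror the init_conf conditions (f32, avx512_core, non-first convolution, channels that can be neither padded nor split into 16- or 8-lane blocks):

    // Returns 4 when only the Xmm kernel can serve the channel counts,
    // otherwise the full 16-lane width.
    int pick_simd_w(bool has_avx512_core, bool is_f32, bool is_1stconv,
                    bool ok_to_pad_channels, int ic, int oc) {
        const int full_simd_w = 16;
        bool try_xmm = has_avx512_core && is_f32 && !is_1stconv
                && !ok_to_pad_channels
                && (ic % full_simd_w != 0 || oc % full_simd_w != 0)
                && (ic % 8 != 0 || oc % 8 != 0);
        return try_xmm ? 4 : full_simd_w;
    }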
+ jcp.ic : + 1; + jcp.nb_ic = jcp.ic / jcp.ic_block; jcp.nb_oc = jcp.oc / jcp.oc_block; jcp.nb_ic_blocking = jcp.nb_oc_blocking = 1; + auto is_ow_threading_applicable = [=]() { + return (true && !jcp.is_1stconv && one_of(jcp.ndims, 3, 4) + && IMPLICATION(mayiuse(avx512_mic), + jcp.ver == ver_4fma + && IMPLICATION(jcp.mb != 1, + jcp.ih == 1 && jcp.kh == 1))); + }; + if (jcp.ver == ver_4vnni) { jcp.kernel_kind = embd_bcast; } @@ -1593,9 +1611,13 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf( } if (one_of(jcp.ver, ver_4vnni, ver_4fma) && !jcp.is_1stconv) { - if (jcp.kw == 3 && jcp.kh == 3 && jcp.ow == 7 && jcp.oh == 7) { - if (jcp.nb_oc % 2 == 0) + if ((jcp.kw <= 5 && jcp.kh <= 5 && jcp.kw == jcp.kh && jcp.ow <= 8 + && jcp.oh <= 8 && jcp.ow == jcp.oh) + || (jcp.stride_h != 1 && jcp.ur_w < jcp.ow)) { + if (jcp.nb_oc % 2 == 0) { jcp.nb_oc_blocking = 2; + jcp.ur_w = nstl::min(jcp.ow, regs / jcp.nb_oc_blocking); + } } else { for (int i = jcp.nb_oc; i > 0; i--) if (i * jcp.ur_w <= regs && jcp.nb_oc % i == 0) { @@ -1603,15 +1625,74 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf( break; } } - if (jcp.ver == ver_4fma - && is_1D_conv(jcp) && one_of(jcp.ndims, 3, 4)) { - if (jcp.nb_oc % 2 == 0) { + if (jcp.ver == ver_4fma && is_ow_threading_applicable()) { + if (jcp.nb_oc % 2 == 0 && jcp.ur_w < jcp.ow + && jcp.ow != 2 * jcp.ur_w) { jcp.nb_oc_blocking = 2; jcp.ur_w = nstl::min(jcp.ow, regs / jcp.nb_oc_blocking); } } } + jcp.ow_block = jcp.ow; + + auto get_thr_eff = [=](int nb_oc_blocking, int ow_block) { + int nb_ow = div_up(jcp.ow, ow_block); + int nb_oc_chunks = div_up(jcp.nb_oc, nb_oc_blocking); + int work_amount = jcp.mb * jcp.oh * nb_oc_chunks * nb_ow; + float disbalance = (float)jcp.ow / rnd_up(jcp.ow, ow_block); + float thr_eff = disbalance * (float)work_amount + / rnd_up(work_amount, nthreads); + return thr_eff; + }; + + auto get_ow_block = [=](int nb_oc_blocking, int ur_w, float &eff) { + int res_ow_block = jcp.ow; + eff = get_thr_eff(nb_oc_blocking, res_ow_block); + if (!is_ow_threading_applicable()) + return res_ow_block; + + int L2_part = (get_cache_size(2) * 7 / 8) / typesize; + if (jcp.ver == ver_4fma) + L2_part /= 2; + int size_src_chunk = jcp.ic_block * ur_w * jcp.kh; + int size_dst_chunk = jcp.oc_block * nb_oc_blocking * ur_w; + int size_wei_chunk = jcp.oc_block * nb_oc_blocking * jcp.ic_block + * jcp.kw * jcp.kh; + int nurw_cache = (L2_part - 2 * size_wei_chunk) + / (2 * size_dst_chunk + 2 * size_src_chunk); + // current design of generate() requires ow_block >= 2 * ur_w + int ow_block_cache = ur_w * nstl::max(2, nurw_cache); + + int ow_block_thr = ow_block_cache; + eff = get_thr_eff(nb_oc_blocking, ow_block_thr); + + int max_nb_ow = div_up(jcp.ow, 2 * ur_w); + int start_nb_ow = div_up(jcp.ow, ow_block_thr); + for (int nb_ow = start_nb_ow; nb_ow <= max_nb_ow; nb_ow++) { + int ow_block + = nstl::min(rnd_up(div_up(jcp.ow, nb_ow), ur_w), jcp.ow); + float eff_threshold = (jcp.ver == ver_4fma) ? 0.8f : 0.9f; + if (ow_block < nb_oc_blocking * jcp.oc_block && eff > eff_threshold) + break; + if (div_up(jcp.ow, ow_block) != nb_ow) + continue; + float thr_eff = get_thr_eff(nb_oc_blocking, ow_block); + float eff_step = (jcp.ver == ver_4fma) ? 1.1f : 1.f; + if (ow_block >= 2 * ur_w && thr_eff > eff_step * eff) { + ow_block_thr = ow_block; + eff = thr_eff; + } + eff_threshold = (jcp.ver == ver_4fma) ? 
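get_thr_eff above scores a candidate ow_block by two ratios: the tail imbalance inside a row block and the load balance of the resulting work items across threads. A standalone version with div_up/rnd_up spelled out:

    static int div_up(int a, int b) { return (a + b - 1) / b; }
    static int rnd_up(int a, int b) { return div_up(a, b) * b; }

    // Efficiency in (0, 1]: 1.0 means no ow tail and an exact multiple of
    // nthreads work items.
    float thr_eff(int mb, int oh, int ow, int nb_oc, int nb_oc_blocking,
                  int ow_block, int nthreads) {
        int nb_ow = div_up(ow, ow_block);
        int nb_oc_chunks = div_up(nb_oc, nb_oc_blocking);
        int work_amount = mb * oh * nb_oc_chunks * nb_ow;
        float disbalance = (float)ow / rnd_up(ow, ow_block);
        return disbalance * (float)work_amount
                / rnd_up(work_amount, nthreads);
    }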
0.9f : 0.98f; + if (eff > eff_threshold) + break; + } + res_ow_block = nstl::min(jcp.ow, nstl::max(2 * ur_w, ow_block_thr)); + eff = get_thr_eff(nb_oc_blocking, res_ow_block); + return res_ow_block; + }; + + if (jcp.ver == ver_fma && mayiuse(avx512_core)) { int try_nb_oc_blocking = 2; unsigned int ker_inp_size = typesize * div_up(jcp.iw, jcp.stride_w) @@ -1629,7 +1710,6 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf( && !(jcp.kw == 3 && jcp.ow == 28 && jcp.ic >= 512); if (jcp.mb == 1) { - jcp.kernel_kind = embd_bcast; unsigned int inp_size = jcp.mb * div_up(jcp.ih, jcp.stride_h) * div_up(jcp.iw, jcp.stride_w) * jcp.ic; unsigned int wei_size = jcp.ic * jcp.oc * jcp.kh * jcp.kw; @@ -1662,59 +1742,52 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf( } } } - } else if (jcp.kw > 3 - || (jcp.stride_w == 1 && jcp.stride_h == 1 - && embd_bcast_condition) - || ((jcp.stride_w != 1 || jcp.stride_h != 1) - && ((jcp.mb <= 16 && (jcp.oc <= 192 || jcp.oh <= 10) - && embd_bcast_condition))) - ) { + } + + if (jcp.kw > 3 + || (jcp.stride_w == 1 && jcp.stride_h == 1 + && embd_bcast_condition) + || ((jcp.stride_w != 1 || jcp.stride_h != 1) + && ((jcp.mb <= 16 && (jcp.oc <= 192 || jcp.oh <= 10) + && embd_bcast_condition))) + || (jcp.mb == 1 + && (jcp.ur_w >= jcp.ow || jcp.is_1stconv + || (jcp.ow <= 147 && jcp.oc <= 96)))) { jcp.kernel_kind = embd_bcast; jcp.ur_w = nstl::min(jcp.ow, regs); jcp.nb_ic_blocking = jcp.nb_oc_blocking = 1; if (ker_total_size < L1_cache_size && jcp.ow <= 8 && jcp.kh <= 3 - && jcp.kw <= 3) { - if (jcp.nb_oc % try_nb_oc_blocking == 0 && !jcp.is_1stconv) { - jcp.nb_oc_blocking = try_nb_oc_blocking; - jcp.ur_w = 31 / (jcp.nb_oc_blocking + 1); - if (jcp.ow < jcp.ur_w) - jcp.ur_w = jcp.ow; - } + && jcp.kw <= 3 && jcp.nb_oc % try_nb_oc_blocking == 0 + && IMPLICATION(jcp.is_1stconv, jcp.mb == 1) + && IMPLICATION(jcp.mb == 1, jcp.ur_w < jcp.ow)) { + jcp.nb_oc_blocking = try_nb_oc_blocking; + jcp.ur_w = nstl::min(jcp.ow, 31 / (jcp.nb_oc_blocking + 1)); } } else { jcp.kernel_kind = expl_bcast; jcp.nb_ic_blocking = 1; - jcp.nb_oc_blocking = 4; - if (jcp.nb_oc < jcp.nb_oc_blocking) jcp.nb_oc_blocking = jcp.nb_oc; - if (jcp.nb_oc % jcp.nb_oc_blocking != 0) - for (int i = jcp.nb_oc_blocking; i > 0; i--) + if (IMPLICATION(jcp.is_1stconv, jcp.mb > 1)) { + float best_thr_eff = 0.f; + int best_nb_oc_blocking = 1; + for (int i = nstl::min(jcp.nb_oc, 5); i > 0; i--) { if (jcp.nb_oc % i == 0) { - jcp.nb_oc_blocking = i; - break; + float thr_eff; + int ur_w = nstl::min(jcp.ow, 31 / (i + 1)); + get_ow_block(i, ur_w, thr_eff); + if (thr_eff > 1.05f * best_thr_eff) { + best_nb_oc_blocking = i; + best_thr_eff = thr_eff; + } } - jcp.ur_w = 31 / (jcp.nb_oc_blocking + 1); - if (jcp.ow < jcp.ur_w) - jcp.ur_w = jcp.ow; + } + jcp.nb_oc_blocking = best_nb_oc_blocking; + jcp.ur_w = nstl::min(jcp.ow, 31 / (jcp.nb_oc_blocking + 1)); + } } } jcp.ur_w_tail = jcp.ow % jcp.ur_w; - jcp.ow_block = jcp.ow; - if (is_ow_threading_available(jcp)) { - const int L1_part = get_cache_size(1) * 5 / 8; - int size_src_chunk = typesize * jcp.ic_block * jcp.ur_w; - int size_dst_chunk = typesize - * jcp.oc_block * jcp.nb_oc_blocking * jcp.ur_w; - int size_wei_chunk = typesize - * jcp.oc_block * jcp.ic_block * jcp.nb_oc_blocking * jcp.kw; - int nurw = (L1_part - size_wei_chunk) - / (size_dst_chunk + size_src_chunk); - // current design of generate() requires ow_block >= 2 * ur_w - jcp.ow_block = jcp.ur_w * nstl::max(2, nurw); - } - jcp.nb_ow = div_up(jcp.ow, jcp.ow_block); - args_ok = true && jcp.l_pad <= jcp.ur_w && jcp.ic 
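The expl_bcast branch above searches nb_oc_blocking from min(nb_oc, 5) downward and replaces the incumbent only on a >5% efficiency gain, which biases the choice toward larger blockings. A sketch of that selection, with eff standing in for the get_ow_block/get_thr_eff scorer:

    #include <algorithm>

    int pick_nb_oc_blocking(int nb_oc, int ow,
            float (*eff)(int nb_oc_blocking, int ur_w)) {
        int best = 1;
        float best_eff = 0.f;
        for (int i = std::min(nb_oc, 5); i > 0; i--) {
            if (nb_oc % i != 0) continue;           // need an even split
            int ur_w = std::min(ow, 31 / (i + 1));  // registers left per row
            float e = eff(i, ur_w);
            if (e > 1.05f * best_eff) { best = i; best_eff = e; }
        }
        return best;
    }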
<= src_d.blocking_desc().padding_dims[1] @@ -1734,10 +1807,14 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf( jcp.nb_ic_L2 = jcp.nb_ic; + float thr_eff; + jcp.ow_block = get_ow_block(jcp.nb_oc_blocking, jcp.ur_w, thr_eff); + jcp.nb_ow = div_up(jcp.ow, jcp.ow_block); + const int L2_size = get_cache_size(2, true) / sizeof(float); // Source and output data needs to fit in L2, // leaving some space for weights and prefetching. - int h_L2 = int(((0.6f * L2_size) / simd_w + int h_L2 = int(((0.6f * L2_size) / jcp.simd_w - nstl::min(0, jcp.kh - jcp.stride_h) * jcp.iw) / (jcp.stride_h * jcp.iw + jcp.ow)); jcp.h_blocking = nstl::max(1, nstl::min(jcp.oh, h_L2)); @@ -1765,7 +1842,7 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf( break; } } - } else { + } else if (jcp.ic > 64) { jcp.nb_ic_L2 = 2; /* according to performance data*/ } } @@ -1773,6 +1850,12 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf( return status::success; } +void jit_avx512_common_conv_fwd_kernel::init_scratchpad( + memory_tracking::registrar_t &scratchpad, const jit_conv_conf_t &jcp) { + if (jcp.with_bias && jcp.oc != jcp.oc_without_padding) + scratchpad.book(key_conv_padded_bias, jcp.typesize_out * jcp.oc); +} + void jit_avx512_common_conv_bwd_data_kernel_f32::prepare_output(int ur_w) { for (int k = 0; k < jcp.nb_ic_blocking; k++) { @@ -1826,7 +1909,7 @@ void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_4fma( int kw = jcp.kw; int ic_block = jcp.ic_block; int oc_block = jcp.oc_block; - Label kh_label, last_iter_label, loop_end_label, kd_label, skip_kd_loop; + Label kh_label, last_iter_label, loop_end_label, kd_label; int ker_load_number = 4; int shift_ker_ptr = typesize * kw * oc_block * ic_block; int shift_dst_ptr = typesize * ow * oc_block; @@ -1857,8 +1940,6 @@ void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_4fma( } }; - prepare_output(ur_w); - if (one_of(jcp.ndims, 3, 4)) { mov(aux_reg_dst, reg_dst); mov(aux_reg_ker, reg_ker); @@ -2004,13 +2085,10 @@ void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_4fma( dec(reg_ki); cmp(reg_ki, 0); jg(kd_label, T_NEAR); - L(skip_kd_loop); pop(reg_src); pop(reg_src_prf); } - - store_output(ur_w); } void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_vnni( @@ -2031,8 +2109,6 @@ void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_vnni( return jcp.typesize_in * (blk_offset + oc_offset); }; - prepare_output(ur_w); - mov(aux_reg_dst, reg_dst); mov(aux_reg_ker, reg_ker); mov(aux_reg_dst_prf, reg_dst_prf); @@ -2108,15 +2184,12 @@ void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_vnni( cmp(reg_kj, 0); jg(kh_label, T_NEAR); } - - store_output(ur_w); } void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_fma( int ur_w, int l_overflow, int r_overflow) { - Label kh_label, kd_label, skip_kd_loop; - Label store_output_label; + Label kh_label, kd_label; int kw = jcp.kw; int ow = jcp.ow; @@ -2139,8 +2212,6 @@ void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_fma( int prf_inst_spacing = nstl::max(1, num_fmas / num_prfs); int prf_inst_trigger = (num_fmas % prf_inst_spacing) / 2; - prepare_output(ur_w); - if (one_of(jcp.ndims, 3, 4)) { mov(aux_reg_dst, reg_dst); mov(aux_reg_ker, reg_ker); @@ -2154,9 +2225,6 @@ void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_fma( push(reg_src); mov(reg_ki, ptr[param + GET_OFF(kd_padding)]); - cmp(reg_ki, 0); - je(store_output_label, T_NEAR); - mov(aux_reg_dst_d, reg_dst); mov(aux_reg_ker_d, ptr[param + GET_OFF(filt)]); mov(aux_reg_dst_d_prf, 
reg_dst_prf); @@ -2167,8 +2235,6 @@ void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_fma( } else { mov(reg_kj, reg_kh); } - cmp(reg_kj, 0); - je(store_output_label, T_NEAR); if (jcp.ndims == 5) { mov(aux_reg_dst, aux_reg_dst_d); @@ -2268,16 +2334,12 @@ void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_fma( dec(reg_ki); cmp(reg_ki, 0); jg(kd_label, T_NEAR); - L(skip_kd_loop); } - L(store_output_label); { - if (jcp.ndims == 5) - { - pop(reg_src); - pop(reg_src_prf); - } - store_output(ur_w); + if (jcp.ndims == 5) + { + pop(reg_src); + pop(reg_src_prf); } } @@ -2291,7 +2353,7 @@ void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_fma_core( int ic_block = jcp.ic_block; int oc_block = jcp.oc_block; int nb_ic_block = jcp.nb_ic_blocking; - Label kh_label, skip_kh_loop, kd_label, skip_kd_loop; + Label kh_label, kd_label; int shift_ker_ptr = typesize * kw * oc_block * ic_block; int shift_dst_ptr = typesize * (jcp.dilate_h + 1) * ow * oc_block; @@ -2307,8 +2369,6 @@ void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_fma_core( return typesize * (blk_offset + oc_offset); }; - prepare_output(ur_w); - if (one_of(jcp.ndims, 3, 4)) { mov(aux_reg_dst, reg_dst); mov(aux_reg_ker, reg_ker); @@ -2327,8 +2387,6 @@ void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_fma_core( } else { mov(reg_kj, reg_kh); } - cmp(reg_kj, 0); - je(skip_kh_loop, T_NEAR); if (jcp.ndims == 5) { mov(aux_reg_dst, aux_reg_dst_d); @@ -2370,7 +2428,6 @@ void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_fma_core( cmp(reg_kj, 0); jg(kh_label, T_NEAR); } - L(skip_kh_loop); if (jcp.ndims == 5) { sub(aux_reg_dst_d, @@ -2380,19 +2437,29 @@ void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_fma_core( dec(reg_ki); cmp(reg_ki, 0); jg(kd_label, T_NEAR); - L(skip_kd_loop); pop(reg_src); pop(reg_src_prf); } - - store_output(ur_w); } inline void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop( int ur_w, int l_overflow, int r_overflow) { if (jcp.ndims == 5) push(reg_oi); + + prepare_output(ur_w); + + Label skip_compute_loop; + if (jcp.ndims == 5) { + mov(reg_kj, ptr[param + GET_OFF(kd_padding)]); + cmp(reg_kj, 0); + je(skip_compute_loop, T_NEAR); + } + mov(reg_kj, ptr[param + GET_OFF(kh_padding)]); + cmp(reg_kj, 0); + je(skip_compute_loop, T_NEAR); + if (jcp.ver == ver_4vnni || jcp.ver == ver_vnni) compute_loop_vnni(ur_w, l_overflow, r_overflow); else if (jcp.ver == ver_4fma) @@ -2407,6 +2474,9 @@ inline void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop( compute_loop_fma_core(ur_w, l_overflow, r_overflow); else assert("!unknown convolution version"); + + L(skip_compute_loop); + store_output(ur_w); if (jcp.ndims == 5) pop(reg_oi); } @@ -2504,7 +2574,9 @@ status_t jit_avx512_common_conv_bwd_data_kernel_f32::init_conf( { if (!mayiuse(avx512_common)) return status::unimplemented; - const int simd_w = cpu_isa_traits::vlen / sizeof(float); + jcp = zero(); + + jcp.simd_w = cpu_isa_traits::vlen / sizeof(float); const bool with_groups = weights_d.ndims() == diff_src_d.ndims() + 1; int ndims = diff_src_d.ndims(); @@ -2556,8 +2628,8 @@ status_t jit_avx512_common_conv_bwd_data_kernel_f32::init_conf( jcp.is_1stconv = false; - jcp.oc_block = simd_w; - jcp.ic_block = jcp.is_1stconv ? jcp.ic : simd_w; + jcp.oc_block = jcp.simd_w; + jcp.ic_block = jcp.is_1stconv ? 
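The backward-data kernel gets the same restructuring as the forward one: the zero-trip-count checks on kh/kd padding move out of the individual compute_loop_* variants into a single guard, while prepare_output and store_output still run, so fully padded positions are written out (zeroed) rather than skipped. The runtime effect, modeled in plain C++ with callbacks standing in for the generated code:

    #include <functional>

    // kd_padding/kh_padding mirror the runtime args read from param1; in
    // the kernel the guard is only emitted when the padding geometry can
    // actually produce a zero trip count.
    void compute_loop_shape(int kd_padding, int kh_padding, bool is_3d,
            const std::function<void()> &prepare,
            const std::function<void()> &body,
            const std::function<void()> &store) {
        prepare();
        const bool skip = (is_3d && kd_padding == 0) || kh_padding == 0;
        if (!skip) body();  // the vnni/4fma/fma/fma_core variant
        store();            // zeroed accumulators still reach memory
    }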
jcp.ic : jcp.simd_w; bool ok_to_pad_channels = true && jcp.ngroups == 1 @@ -2777,8 +2849,15 @@ status_t jit_avx512_common_conv_bwd_data_kernel_f32::init_conf( && jcp.oc <= diff_dst_d.blocking_desc().padding_dims[1] && jcp.ic <= weights_d.blocking_desc().padding_dims[with_groups + 1] && jcp.oc <= weights_d.blocking_desc().padding_dims[with_groups + 0]; + if (!args_ok) return status::unimplemented; - return args_ok ? status::success : status::unimplemented; + return status::success; +} + +void jit_avx512_common_conv_bwd_data_kernel_f32::init_scratchpad( + memory_tracking::registrar_t &scratchpad, const jit_conv_conf_t &jcp) { + UNUSED(scratchpad); + UNUSED(jcp); } const int jit_avx512_common_conv_bwd_weights_kernel_f32::max_ur_w = 28; @@ -4464,13 +4543,10 @@ void jit_avx512_common_conv_bwd_weights_kernel_f32::generate() status_t jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf( jit_conv_conf_t &jcp, const convolution_desc_t &cd, cpu_memory_t::pd_t &src_pd, cpu_memory_t::pd_t &diff_weights_pd, - cpu_memory_t::pd_t &diff_bias_pd, cpu_memory_t::pd_t &diff_dst_pd) -{ + cpu_memory_t::pd_t &diff_bias_pd, cpu_memory_t::pd_t &diff_dst_pd) { if (!mayiuse(avx512_common)) return status::unimplemented; - const int simd_w = cpu_isa_traits::vlen / sizeof(float); - const memory_desc_wrapper src_d(&src_pd); const memory_desc_wrapper diff_weights_d(&diff_weights_pd); const memory_desc_wrapper diff_bias_d(&diff_bias_pd); @@ -4480,6 +4556,8 @@ status_t jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf( int ndims = src_d.ndims(); jcp = zero(); + + jcp.simd_w = cpu_isa_traits::vlen / sizeof(float); jcp.ndims = ndims; jcp.prop_kind = cd.prop_kind; @@ -4545,14 +4623,14 @@ status_t jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf( /* check for the 1st convolution */ jcp.is_1stconv = is_1stconv(jcp); - jcp.oc_block = simd_w; + jcp.oc_block = jcp.simd_w; bool ok_to_pad_channels = true && jcp.ngroups == 1 && src_d.data_type() == data_type::f32; if (ok_to_pad_channels) - jcp.oc = rnd_up(jcp.oc, simd_w); + jcp.oc = rnd_up(jcp.oc, jcp.simd_w); if (jcp.oc % jcp.oc_block) return status::unimplemented; @@ -4628,7 +4706,7 @@ status_t jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf( && everyone_is(0, jcp.l_pad, jcp.r_pad, jcp.t_pad, jcp.b_pad) && jcp.kw <= 28 - jcp.with_bias && jcp.stride_w == 4 - && tr_ld / simd_w <= 4 /* [bwd_w:tr_src:r1] */ + && tr_ld / jcp.simd_w <= 4 /* [bwd_w:tr_src:r1] */ && IMPLICATION(jcp.with_bias, kh_step_rem == 1) /* [bwd_w:b:r1] */ && IMPLICATION(diff_weights_d.format() != any, diff_weights_d.format() == want_4fma_wfmt); @@ -4667,7 +4745,7 @@ status_t jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf( if (!ok) return status::unimplemented; - jcp.ic_block = simd_w; + jcp.ic_block = jcp.simd_w; if (ok_to_pad_channels) jcp.ic = rnd_up(jcp.ic, jcp.ic_block); jcp.nb_ic = jcp.ic / jcp.ic_block; @@ -4735,10 +4813,209 @@ status_t jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf( && jcp.oc <= diff_dst_d.blocking_desc().padding_dims[1] && jcp.ic <= diff_weights_d.blocking_desc().padding_dims[with_groups + 1] && jcp.oc <= diff_weights_d.blocking_desc().padding_dims[with_groups + 0]; + if (!args_ok) return status::unimplemented; + + { // balancing + int nthr, nthr_mb, nthr_g, nthr_oc_b, nthr_ic_b; + balance(jcp, nthr, nthr_mb, nthr_g, nthr_oc_b, nthr_ic_b); + jcp.nthr = nthr; + jcp.nthr_mb = nthr_mb; + jcp.nthr_g = nthr_g; + jcp.nthr_oc_b = nthr_oc_b; + jcp.nthr_ic_b = nthr_ic_b; + } + + return status::success; +} + +void 
jit_avx512_common_conv_bwd_weights_kernel_f32::init_scratchpad( + memory_tracking::registrar_t &scratchpad, const jit_conv_conf_t &jcp) { + if (utils::one_of(jcp.ver, ver_4fma, ver_4vnni, ver_vnni)) { + if (jcp.is_1stconv) { + const size_t tr_src_size = + jcp.nthr / jcp.nthr_oc_b * jcp.ih * jcp.stride_w * jcp.tr_ld; + scratchpad.book(key_conv_tr_src, jcp.typesize_in * tr_src_size); + } else { + // XXX: See the comment about tr_iw and guarding elements in + // jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf() + const size_t max_nthr = jcp.nthr_mb * jcp.ngroups * jcp.nb_ic; + const size_t min_tr_src_size_per_thr + = jcp.ih * jcp.ic_block * jcp.tr_iw; + const size_t tr_src_size = max_nthr * min_tr_src_size_per_thr + + jcp.tr_src_num_guard_elems; + scratchpad.book(key_conv_tr_src, jcp.typesize_in * tr_src_size); + } + + /* prepare synchronization contexts */ + if (jcp.nthr_oc_b > 1) { + const int tr_src_bctx_size = jcp.nthr / jcp.nthr_oc_b; + scratchpad.book(key_conv_tr_src_bctx, + sizeof(simple_barrier::ctx_t) * tr_src_bctx_size); + } + + if (utils::one_of(jcp.ver, ver_4vnni, ver_vnni)) { + const size_t tr_diff_dst_size = jcp.nthr_mb * jcp.ngroups + * jcp.nb_oc * jcp.oc_block * jcp.tr_ow * jcp.oh; + scratchpad.book(key_conv_tr_diff_dst, + jcp.typesize_in * tr_diff_dst_size); + + /* prepare synchronization contexts */ + if (jcp.nthr_ic_b > 1) { + const size_t tr_diff_dst_bctx_size = jcp.nthr / jcp.nthr_ic_b; + scratchpad.book(key_conv_tr_diff_dst_bctx, + sizeof(simple_barrier::ctx_t) * tr_diff_dst_bctx_size); + } + } + } + + if (jcp.nthr_mb > 1) { + const int wei_size = jcp.ngroups * jcp.oc * jcp.ic + * jcp.kh * jcp.kw * jcp.kd; + const int bia_size = jcp.ngroups * jcp.oc; + const size_t wei_bia_reduction_size = wei_size + bia_size; + + scratchpad.book(key_conv_wei_bia_reduction, + jcp.typesize_out * wei_bia_reduction_size * (jcp.nthr_mb - 1)); + scratchpad.book(key_conv_wei_bia_reduction_bctx, + sizeof(simple_barrier::ctx_t)); + } + + if (jcp.with_bias && jcp.oc != jcp.oc_without_padding) + scratchpad.book(key_conv_padded_bias, jcp.typesize_out * jcp.oc); +} - return args_ok ? status::success : status::unimplemented; +void jit_avx512_common_conv_bwd_weights_kernel_f32::balance( + const jit_conv_conf_t &j, int &nthr_, int &nthr_mb_, int &nthr_g_, + int &nthr_oc_b_, int &nthr_ic_b_) +{ + nthr_ = nthr_mb_ = nthr_g_ = nthr_oc_b_ = nthr_ic_b_ = 1; + + const int max_threads = mkldnn_get_max_threads(); + + if (max_threads < j.ngroups) { + /* simplification... fortunately it doesn't hurt much */ + return; + } + + if (!mkldnn_thr_syncable() + && utils::one_of(j.ver, ver_4fma, ver_4vnni, ver_vnni)) { + // should not happen -- the driver is not ready + // for TBB-like non-synchronous threading yet + return; + } + + if (j.ver == ver_4fma && j.is_1stconv) { + nthr_g_ = 1; + nthr_oc_b_ = 1; + nthr_ic_b_ = nstl::min(j.nb_ic, max_threads); + nthr_mb_ = nstl::min(max_threads / nthr_ic_b_, j.mb); + nthr_ = nthr_mb_ * nthr_oc_b_ * nthr_ic_b_ * nthr_g_; + return; + } + + nthr_g_ = j.ngroups; + const int nthr = max_threads / nthr_g_; + + auto calc_mem_cost = [=](int nthr_mb, int nthr_oc_b, int nthr_ic_b) { + /* calculate per thread memory cost (read/write). high level optimizer + * tries to minimize memory consumption. few notes: + * (n1) unclear why, but that essentially helps first convolution... 
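The reduction buffer booked above sizes one private weights+bias copy per extra minibatch-parallel thread; those copies are reduced into the primary result after the main loop. The sizing, as a standalone function:

    #include <cstddef>

    size_t wei_bia_reduction_bytes(int ngroups, int oc, int ic, int kh,
                                   int kw, int kd, int nthr_mb,
                                   size_t typesize_out) {
        size_t wei_size = (size_t)ngroups * oc * ic * kh * kw * kd;
        size_t bia_size = (size_t)ngroups * oc;
        // nthr_mb - 1 extra copies; thread 0 accumulates in place.
        return typesize_out * (wei_size + bia_size) * (nthr_mb - 1);
    }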
+ * (n2) assuming the reduction over minibatch is always there: + * - instead of 8 it should be 5 here (write ~= 2 read): + * kernel: temporal workspace 1 write + * reduction: 1 read from workspace and 1 write to the diff_wei + * - but experiments showed 8 works better than 5 or 6... */ + + const int src_coef = j.ver == ver_4fma || j.ver == ver_vnni ? 4 : 1; + const int dst_coef = 1; + const int wei_coef = j.ver == ver_vnni ? 4 : 8; + + return 0 + + src_coef + * div_up(j.mb, nthr_mb) * div_up(j.ngroups, nthr_g_) + * div_up(j.nb_ic, nthr_ic_b) * j.ic_block * j.ih * j.iw * j.id + / j.stride_d / j.stride_h / j.stride_w /* (n1) */ + + dst_coef + * div_up(j.mb, nthr_mb) * div_up(j.ngroups, nthr_g_) + * div_up(j.nb_oc, nthr_oc_b) * j.oc_block * j.oh * j.ow * j.od + + wei_coef /* (n2) */ + * div_up(j.ngroups, nthr_g_) + * div_up(j.nb_oc, nthr_oc_b) * div_up(j.nb_ic, nthr_ic_b) + * j.kh * j.kw * j.kd * j.ic_block * j.oc_block; + }; + + int best_mem_cost = calc_mem_cost(nthr_mb_, nthr_oc_b_, nthr_ic_b_); + + /* step 1: find the best thread distribution with lowest memory cost */ + const int nthr_mb_max = nstl::min(nthr, j.mb * j.od); + for (int nthr_mb = 1; nthr_mb <= nthr_mb_max; ++nthr_mb) { + const int nthr_par = nthr / nthr_mb; + const int nthr_oc_b_max = nstl::min(nthr_par, j.nb_oc); + for (int nthr_oc_b = 1; nthr_oc_b <= nthr_oc_b_max; ++nthr_oc_b) { + int nthr_ic_b = nstl::min(nthr_par / nthr_oc_b, j.nb_ic); + + int mem_cost = calc_mem_cost(nthr_mb, nthr_oc_b, nthr_ic_b); + if (mem_cost <= best_mem_cost) { + best_mem_cost = mem_cost; + nthr_mb_ = nthr_mb; + nthr_oc_b_ = nthr_oc_b; + nthr_ic_b_ = nthr_ic_b; + } + } + + if (!mkldnn_thr_syncable()) { assert(nthr_mb == 1); break; } + } + + if (j.ver != ver_vnni && !mayiuse(avx512_mic)) { + auto calc_comp_cost = [=](int nthr_mb, int nthr_oc_b, int nthr_ic_b) { + return 1 + * div_up(j.mb, nthr_mb) + * div_up(j.ngroups, nthr_g_) + * div_up(j.nb_oc, nthr_oc_b) + * div_up(j.nb_ic, nthr_ic_b); + }; + + /* step 2: search for a thread distribution with lower compute cost. 
+ * the constrains: + * - memory cost cannot exceed 110% of the best found in the step 1 + * - unless compute cost is 133% lower than the current best case + * note: both constants were found empirically */ + int best_comp_cost = calc_comp_cost(nthr_mb_, nthr_oc_b_, nthr_ic_b_); + for (int nthr_mb = 1; nthr_mb <= nthr_mb_max; ++nthr_mb) { + const int nthr_par = nthr / nthr_mb; + const int nthr_oc_b_max = nstl::min(nthr_par, j.nb_oc); + for (int nthr_oc_b = 1; nthr_oc_b <= nthr_oc_b_max; ++nthr_oc_b) { + int nthr_ic_b = nstl::min(nthr_par / nthr_oc_b, j.nb_ic); + int mem_cost = calc_mem_cost(nthr_mb, nthr_oc_b, nthr_ic_b); + int comp_cost = calc_comp_cost(nthr_mb, nthr_oc_b, nthr_ic_b); + + const bool opt1 = comp_cost <= best_comp_cost + && mem_cost < 1.1 * best_mem_cost; + const bool opt2 = 4 * comp_cost <= 3 * best_comp_cost; + + if (opt1 || opt2) { + best_comp_cost = comp_cost; + nthr_mb_ = nthr_mb; + nthr_oc_b_ = nthr_oc_b; + nthr_ic_b_ = nthr_ic_b; + } + } + + if (!mkldnn_thr_syncable()) { assert(nthr_mb == 1); break; } + } + } + + if (nthr_mb_ > max_threads/2 && nthr_mb_ < max_threads) + nthr_mb_ = nstl::min(j.mb * j.od, max_threads); + nthr_ = nthr_mb_ * nthr_g_ * nthr_oc_b_ * nthr_ic_b_; + + assert(nthr_ <= max_threads); + assert(IMPLICATION(!mkldnn_thr_syncable(), nthr_mb_ == 1)); } +template struct _jit_avx512_common_conv_fwd_kernel; +template struct _jit_avx512_common_conv_fwd_kernel; + } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_conv_kernel.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_conv_kernel.hpp index ec6e185..4641292 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_conv_kernel.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_conv_kernel.hpp @@ -18,8 +18,9 @@ #define JIT_AVX512_COMMON_CONV_KERNEL_F32_HPP #include "c_types_map.hpp" -#include "cpu_memory.hpp" +#include "memory_tracking.hpp" +#include "cpu_memory.hpp" #include "jit_generator.hpp" #include "jit_primitive_conf.hpp" #include "jit_uni_eltwise.hpp" @@ -29,16 +30,18 @@ namespace mkldnn { namespace impl { namespace cpu { -struct jit_avx512_common_conv_fwd_kernel : public jit_generator { +template +struct _jit_avx512_common_conv_fwd_kernel : public jit_generator { - jit_avx512_common_conv_fwd_kernel(jit_conv_conf_t ajcp, - const primitive_attr_t &attr) : jcp(ajcp), attr_(attr) + _jit_avx512_common_conv_fwd_kernel(jit_conv_conf_t ajcp, + const primitive_attr_t &attr) + : jcp(ajcp), attr_(attr) { generate(); - jit_ker = (void (*)(jit_conv_call_s *))getCode(); + jit_ker_ = (void (*)(jit_conv_call_s *))getCode(); } - ~jit_avx512_common_conv_fwd_kernel() { + ~_jit_avx512_common_conv_fwd_kernel() { for (auto inj : eltwise_injectors) delete inj; eltwise_injectors.clear(); @@ -48,24 +51,11 @@ struct jit_avx512_common_conv_fwd_kernel : public jit_generator { depthwise_injectors.clear(); } - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_common_conv_fwd_kernel) - - static bool post_ops_ok(jit_conv_conf_t &jcp, - const primitive_attr_t &attr); - static status_t init_conf(jit_conv_conf_t &jcp, - const convolution_desc_t &cd, - cpu_memory_t::pd_t &src_pd, - cpu_memory_t::pd_t &weights_pd, - cpu_memory_t::pd_t &dst_pd, - cpu_memory_t::pd_t &bias_pd, - const primitive_attr_t &attr, - int nthreads, - bool with_relu, - float relu_negative_slope); + DECLARE_CPU_JIT_AUX_FUNCTIONS(_jit_avx512_common_conv_fwd_kernel) jit_conv_conf_t jcp; const primitive_attr_t &attr_; - void (*jit_ker)(jit_conv_call_s *); + void 
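balance() above is a two-phase search: phase 1 minimizes the per-thread memory cost; phase 2 (skipped on vnni and avx512_mic) accepts a split with a lower compute cost when its memory cost stays within 110% of the phase-1 best, or unconditionally when the compute cost falls to 3/4 of the best so far. A skeleton of the search with the cost models passed in (mb_max/oc_max/ic_max mirror j.mb * j.od, j.nb_oc, j.nb_ic; names are illustrative):

    #include <algorithm>

    template <typename MemCost, typename CompCost>
    void balance_sketch(int nthr, int mb_max, int oc_max, int ic_max,
            MemCost mem_cost, CompCost comp_cost,
            int &nthr_mb, int &nthr_oc, int &nthr_ic) {
        nthr_mb = nthr_oc = nthr_ic = 1;
        auto best_mem = mem_cost(1, 1, 1);
        // phase 1: lowest memory cost wins (ties go to the later split)
        for (int mb = 1; mb <= std::min(nthr, mb_max); ++mb)
            for (int oc = 1; oc <= std::min(nthr / mb, oc_max); ++oc) {
                int ic = std::min(nthr / mb / oc, ic_max);
                if (mem_cost(mb, oc, ic) <= best_mem) {
                    best_mem = mem_cost(mb, oc, ic);
                    nthr_mb = mb; nthr_oc = oc; nthr_ic = ic;
                }
            }
        // phase 2: trade a little memory for less compute
        auto best_comp = comp_cost(nthr_mb, nthr_oc, nthr_ic);
        for (int mb = 1; mb <= std::min(nthr, mb_max); ++mb)
            for (int oc = 1; oc <= std::min(nthr / mb, oc_max); ++oc) {
                int ic = std::min(nthr / mb / oc, ic_max);
                auto c = comp_cost(mb, oc, ic);
                auto m = mem_cost(mb, oc, ic);
                bool opt1 = c <= best_comp && m < 1.1 * best_mem;
                bool opt2 = 4 * c <= 3 * best_comp;
                if (opt1 || opt2) {
                    best_comp = c;
                    nthr_mb = mb; nthr_oc = oc; nthr_ic = ic;
                }
            }
    }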
(*jit_ker_)(jit_conv_call_s *); private: using reg64_t = const Xbyak::Reg64; @@ -121,25 +111,25 @@ private: reg64_t reg_long_offt = r11; reg64_t reg_out_long_offt = r14; - inline Xbyak::Zmm zmm_ker(int i_ic) { + inline Vmm vmm_ker(int i_ic) { assert(i_ic < 4); - return Xbyak::Zmm(ker_reg_base_idx + i_ic); + return Vmm(ker_reg_base_idx + i_ic); } - inline Xbyak::Zmm zmm_out(int i_ur, int i_oc) { + inline Vmm vmm_out(int i_ur, int i_oc) { int idx = i_ur + i_oc * jcp.ur_w; assert(idx < ker_reg_base_idx); - return Xbyak::Zmm(idx); + return Vmm(idx); } - inline Xbyak::Zmm zmm_inp(int i_ic, int nb_x_blocking) { + inline Vmm vmm_inp(int i_ic, int nb_x_blocking) { int idx = i_ic + nb_x_blocking * jcp.ur_w; assert(idx < 31); - return Xbyak::Zmm(idx); + return Vmm(idx); } Xbyak::Reg64 imm_addr64 = r15; - Xbyak::Zmm zmm_wei = Xbyak::Zmm(31); + Vmm vmm_wei = Vmm(31); reg64_t reg_d_weights = imm_addr64; reg64_t reg_d_bias = reg_kj; @@ -158,35 +148,11 @@ private: void generate(); - inline void vpXdpwssd(Xbyak::Zmm zmm1, Xbyak::Zmm zmm2, - const Xbyak::Address& op) { - if (jcp.ver == ver_4vnni) - vp4dpwssd(zmm1, zmm2, op); - else - vpdpwssd(zmm1, zmm2, op); - } - - inline void vadd(Xbyak::Zmm zmm, const Xbyak::Operand& op) { + inline void vadd(Vmm vmm, const Xbyak::Operand& op) { if (jcp.ver == ver_4vnni || jcp.ver == ver_vnni) - vpaddd(zmm, zmm, op); + vpaddd(vmm, vmm, op); else - vaddps(zmm, zmm, op); - } - - inline void vcmp(Xbyak::Opmask kmask, - Xbyak::Zmm zmm_src1, Xbyak::Zmm zmm_src2, const unsigned char cmp) { - if (jcp.ver == ver_4vnni || jcp.ver == ver_vnni) - vpcmpd(kmask, zmm_src1, zmm_src2, cmp); - else - vcmpps(kmask, zmm_src1, zmm_src2, cmp); - } - - inline void vmul(Xbyak::Zmm zmm_dst, Xbyak::Opmask kmask, - Xbyak::Zmm zmm_src1, Xbyak::Zmm zmm_src2) { - if (jcp.ver == ver_4vnni || jcp.ver == ver_vnni) - vpmulld(zmm_dst | kmask, zmm_src1, zmm_src2); - else - vmulps(zmm_dst | kmask, zmm_src1, zmm_src2); + vaddps(vmm, vmm, op); } inline size_t get_output_offset(int oi, int n_oc_block) { @@ -224,6 +190,59 @@ private: } }; +struct jit_avx512_common_conv_fwd_kernel { + + jit_avx512_common_conv_fwd_kernel(jit_conv_conf_t ajcp, + const primitive_attr_t &attr) : + jit_ker(nullptr), + zmm_kernel_(nullptr), + xmm_kernel_(nullptr) { + int ch_block = ajcp.is_depthwise ? 
ajcp.ch_block : ajcp.oc_block; + switch (ch_block) { + case 16: + zmm_kernel_ = + new _jit_avx512_common_conv_fwd_kernel( + ajcp, attr); + jit_ker = zmm_kernel_->jit_ker_; + return; + case 4: + xmm_kernel_ = + new _jit_avx512_common_conv_fwd_kernel( + ajcp, attr); + jit_ker = xmm_kernel_->jit_ker_; + return; + default: + assert(!"invalid channel blocking"); + } + } + + ~jit_avx512_common_conv_fwd_kernel() { + delete xmm_kernel_; + delete zmm_kernel_; + } + + enum { + typesize = sizeof(float) + }; + + static bool post_ops_ok(jit_conv_conf_t &jcp, + const primitive_attr_t &attr); + static status_t init_conf(jit_conv_conf_t &jcp, + const convolution_desc_t &cd, + cpu_memory_t::pd_t &src_pd, + cpu_memory_t::pd_t &weights_pd, + cpu_memory_t::pd_t &dst_pd, + cpu_memory_t::pd_t &bias_pd, + const primitive_attr_t &attr, + int nthreads); + static void init_scratchpad(memory_tracking::registrar_t &scratchpad, + const jit_conv_conf_t &jcp); + + void(*jit_ker)(jit_conv_call_s *); + _jit_avx512_common_conv_fwd_kernel *zmm_kernel_; + _jit_avx512_common_conv_fwd_kernel *xmm_kernel_; +}; + struct jit_avx512_common_conv_bwd_data_kernel_f32: public jit_generator { jit_avx512_common_conv_bwd_data_kernel_f32(jit_conv_conf_t ajcp): jcp(ajcp) @@ -239,6 +258,8 @@ struct jit_avx512_common_conv_bwd_data_kernel_f32: public jit_generator { const memory_desc_wrapper &diff_src_d, const memory_desc_wrapper &weights_d, const memory_desc_wrapper &diff_dst_d); + static void init_scratchpad(memory_tracking::registrar_t &scratchpad, + const jit_conv_conf_t &jcp); jit_conv_conf_t jcp; void (*jit_ker)(jit_conv_call_s *); @@ -358,6 +379,8 @@ struct jit_avx512_common_conv_bwd_weights_kernel_f32 : public jit_generator { const convolution_desc_t &cd, cpu_memory_t::pd_t &src_pd, cpu_memory_t::pd_t &diff_weights_pd, cpu_memory_t::pd_t &diff_bias_pd, cpu_memory_t::pd_t &diff_dst_pd); + static void init_scratchpad(memory_tracking::registrar_t &scratchpad, + const jit_conv_conf_t &jcp); jit_conv_conf_t jcp; void (*jit_ker)(jit_conv_call_s *); @@ -423,6 +446,9 @@ private: inline void compute_loop(); void generate(); + + static void balance(const jit_conv_conf_t &j, int &nthr, int &nthr_mb, + int &nthr_g, int &nthr_oc_b, int &nthr_ic_b); }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_conv_winograd_kernel_f32.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_conv_winograd_kernel_f32.cpp index 0405eee..63cd074 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_conv_winograd_kernel_f32.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_conv_winograd_kernel_f32.cpp @@ -66,6 +66,15 @@ int get_divisor_satisfying_cond(jit_conv_winograd_conf_t &jcp, int number, return best_divisor; } +namespace { +bool is_winograd_faster_than_direct(const jit_conv_winograd_conf_t &jcp) { + if (jcp.ver == ver_4fma) + return jcp.mb >= 32; + else + return jcp.mb >= 16; +} +} + /* assumes 512 bits registers */ /* TODO: add support for strides */ /* TODO: handle the prefetch distance automatically */ @@ -137,29 +146,6 @@ private: }; // utilities to support kernel parameter selection -bool check_L2_block_per_thread(jit_conv_winograd_conf_t &jcp, - int dimN_block, float C2_min, float C2_max) { - /* V_L2_block + M_L2_block + W */ - float block_size = (alpha * alpha * (jcp.oc + jcp.ic) - * dimN_block * jcp.dimN_reg_block - + jcp.ic * jcp.oc) * (float)sizeof(float); - float L2_lb = C2_min * L2_cache_size; - float L2_ub = C2_max * L2_cache_size; - return (block_size > L2_lb && 
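The new front-end class above hides the Zmm/Xmm split behind one type: it instantiates the 16-lane or 4-lane generated kernel from the channel block and re-exports its jit_ker_ entry point, so callers never see the template parameter. The ownership pattern, reduced to a minimal model (kernel_iface and the two stubs are illustrative stand-ins):

    #include <cassert>
    #include <memory>

    struct kernel_iface { virtual void run() = 0; virtual ~kernel_iface() {} };
    struct zmm_kernel : kernel_iface { void run() override {} }; // 16 lanes
    struct xmm_kernel : kernel_iface { void run() override {} }; // 4 lanes

    struct kernel_front {
        std::unique_ptr<kernel_iface> impl;
        explicit kernel_front(int ch_block) {
            if (ch_block == 16)     impl.reset(new zmm_kernel);
            else if (ch_block == 4) impl.reset(new xmm_kernel);
            else assert(!"invalid channel blocking");
        }
    };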
block_size < L2_ub); -} - -bool check_L1_block_gemm(jit_conv_winograd_conf_t &jcp, int dimK_block, - int dimM_block, float C1_min, float C1_max) { - float gemm_block_size = (dimM_block * jcp.dimM_simd_block * dimK_block - * jcp.dimK_reg_block - + dimK_block * jcp.dimK_reg_block * jcp.dimN_reg_block - + dimM_block * jcp.dimM_simd_block * jcp.dimN_reg_block) - * (float)sizeof(float); - float L1_lb = C1_min * L1_cache_size; - float L1_ub = C1_max * L1_cache_size; - return (gemm_block_size > L1_lb && gemm_block_size < L1_ub); -} - bool check_cond1(int dimN_reg_block, int dimK_block, int dimK_reg_block, int dimM_block, int dimM_simd_block, float C) { @@ -311,10 +297,8 @@ void _jit_avx512_common_conv_winograd_data_kernel_f32::gemm_loop_generate( auto store_output = [=](bool output_is_aligned) { for (int tile = 0; tile < jcp.dimN_reg_block; tile++) { Zmm zmm(jcp.zmm_start + tile); - // In W_SGD, output will be reused. if (output_is_aligned && jcp.dimK_nb_block == 1 - && jcp.sched_policy == WSCHED_DATA_W_S_G_D && (jcp.dimN * jcp.dimM * alpha * alpha * sizeof(float) > 2 * LLC_data_size)) vmovntps(zword[reg_dstC + 64 * tile], zmm); @@ -359,15 +343,17 @@ status_t _jit_avx512_common_conv_winograd_data_kernel_f32::init_conf_common( const memory_desc_wrapper &dst_d) { - if (!mayiuse(avx512_common)) + if (mayiuse(avx512_core)) + return status::unimplemented; + else if (!mayiuse(avx512_common)) return status::unimplemented; - else if (mayiuse(avx512_core)) - jcp.ver = ver_avx512_core; else if (mayiuse(avx512_mic_4ops)) jcp.ver = ver_4fma; else jcp.ver = ver_fma; + jcp.nthr = mkldnn_get_max_threads(); + const bool with_groups = weights_d.ndims() == src_d.ndims() + 1; jcp.ngroups = with_groups ? weights_d.dims()[0] : 1; @@ -402,6 +388,10 @@ status_t _jit_avx512_common_conv_winograd_data_kernel_f32::init_conf_common( jcp.ic = rnd_up(jcp.ic, simd_w); } + if (!IMPLICATION(cd.alg_kind == alg_kind::convolution_auto, + is_winograd_faster_than_direct(jcp))) + return status::unimplemented; + // Checking conditions not supported by these kernels if (jcp.ngroups != 1) return status::unimplemented; @@ -431,83 +421,6 @@ status_t _jit_avx512_common_conv_winograd_data_kernel_f32::init_conf_common( return status::success; } -status_t set_wsched_DATA_W_SGD_avx512_common(jit_conv_winograd_conf_t &jcp) { - - if (jcp.ver != ver_avx512_core) - return status::unimplemented; - - /* ----------- dimN reg block ---------------------*/ - auto test_cond_dimN_reg_block = [](jit_conv_winograd_conf_t &jcp, - int dimN_reg_block, int current_best) { - return (dimN_reg_block >= MIN_REQUIRED_DIMN_REG_BLOCK) - && (dimN_reg_block <= jcp.nb_reg) - && (dimN_reg_block < current_best); - }; - - jcp.dimN_reg_block = get_divisor_satisfying_cond( - jcp, jcp.dimN, jcp.dimN, test_cond_dimN_reg_block); - - if (jcp.dimN_reg_block >= jcp.nb_reg) { - auto test_cond_dimN_reg_block = [](jit_conv_winograd_conf_t &jcp, - int dimN_reg_block, int current_best) { - return (dimN_reg_block < jcp.nb_reg) - && (dimN_reg_block > current_best); - }; - - jcp.dimN_reg_block = get_divisor_satisfying_cond( - jcp, jcp.dimN, 1, test_cond_dimN_reg_block); - } - - /*-------------- L2 blocking for dimN block ---------*/ - - auto test_cond_dimN_block = [](jit_conv_winograd_conf_t &jcp, - int dimN_block, int current_best) { - return check_L2_block_per_thread(jcp, dimN_block, 0.1, 1.3) - && (dimN_block > current_best) - && ((jcp.dimN / dimN_block / jcp.dimN_reg_block) > 2 * mkldnn_get_max_threads()); - }; - - jcp.dimN_block = get_divisor_satisfying_cond( - jcp, jcp.dimN / 
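The convolution_auto gate used in init_conf_common above, as a standalone predicate: Winograd is only selected over the direct algorithm when the minibatch is large enough to amortize the input/output transforms, with a higher bar on 4fma hardware.

    bool winograd_faster_than_direct(int mb, bool is_4fma) {
        return is_4fma ? mb >= 32 : mb >= 16;
    }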
jcp.dimN_reg_block, 1, test_cond_dimN_block); - - if (check_L2_block_per_thread(jcp, jcp.dimN_block, 0.1, 1.3) - && jcp.dimN/ jcp.dimN_block/ jcp.dimN_reg_block > 2 * mkldnn_get_max_threads()) { - jcp.dimN_nb_block = jcp.dimN / jcp.dimN_block / jcp.dimN_reg_block; - - /* ------------------- L1 blocking for GEMM --------------*/ - /* -------------------- Choose dimK block ----------------*/ - auto test_cond_dimK_block = [](jit_conv_winograd_conf_t &jcp, - int dimK_block, int current_best) { - return check_L1_block_gemm(jcp, dimK_block, 1, 0.1, 0.6) - && (dimK_block > current_best); - }; - - jcp.dimK_block = get_divisor_satisfying_cond( - jcp, jcp.dimK / jcp.dimK_reg_block, 1, test_cond_dimK_block); - - if (check_L1_block_gemm(jcp, jcp.dimK_block, 1, 0.1, 0.6)) { - jcp.dimK_nb_block = jcp.dimK / jcp.dimK_block / jcp.dimK_reg_block; - - /* -------------- Choose dimM block -------------------*/ - auto test_cond_dimM_block = [](jit_conv_winograd_conf_t &jcp, - int dimM_block, int current_best) { - return check_L1_block_gemm(jcp, jcp.dimK_block, dimM_block, 0.1, 0.7) - && (dimM_block > current_best); - }; - - jcp.dimM_block = get_divisor_satisfying_cond( - jcp, jcp.dimM / jcp.dimM_simd_block, 1, test_cond_dimM_block); - jcp.dimM_nb_block = jcp.dimM / jcp.dimM_block / jcp.dimM_simd_block; - - jcp.sched_policy = WSCHED_DATA_W_SGD; - return status::success; - } - - } - return status::unimplemented; - -} - status_t set_wsched_DATA_W_S_G_D_avx512_common(jit_conv_winograd_conf_t &jcp) { @@ -593,7 +506,6 @@ status_t set_wsched_DATA_W_S_G_D_avx512_common(jit_conv_winograd_conf_t &jcp) { jcp.dimN_nb_block = jcp.dimN / (jcp.dimN_reg_block * jcp.dimN_block); jcp.sched_policy = WSCHED_DATA_W_S_G_D; return status::success; - //return status::unimplemented; } status_t _jit_avx512_common_conv_winograd_data_kernel_f32::init_conf_kernel( @@ -618,10 +530,9 @@ status_t _jit_avx512_common_conv_winograd_data_kernel_f32::init_conf_kernel( jcp.dimM = dimM; jcp.sched_policy = WSCHED_INVALID; - if (!(set_wsched_DATA_W_SGD_avx512_common(jcp) == status::success)) - set_wsched_DATA_W_S_G_D_avx512_common(jcp); + set_wsched_DATA_W_S_G_D_avx512_common(jcp); - assert(jcp.sched_policy != WSCHED_INVALID); + assert(jcp.sched_policy == WSCHED_DATA_W_S_G_D); return status::success; } @@ -629,28 +540,16 @@ bool jit_avx512_common_conv_winograd_fwd_kernel_f32::post_ops_ok( jit_conv_conf_t &jcp, const primitive_attr_t &attr) { const auto &p = attr.post_ops_; - auto is_eltwise = [&](int idx) { return p.entry_[idx].is_eltwise(); }; + auto is_relu = [&](int idx) { return p.entry_[idx].is_relu(); }; auto is_sum = [&](int idx) { return p.entry_[idx].is_sum(); }; switch (p.len_) { - case 0: - return true; // no post_ops - case 1: - return true // relu or sum - && IMPLICATION(jcp.with_eltwise, is_sum(0)) - && IMPLICATION(!jcp.with_eltwise, is_eltwise(0) || is_sum(0)); - case 2: - return true // sum->relu or relu->sum - && IMPLICATION(jcp.with_eltwise, is_sum(0) && is_eltwise(1)) - && IMPLICATION(!jcp.with_eltwise, false - || (is_sum(0) && is_eltwise(1)) - || (is_eltwise(0) && is_sum(1))); - case 3: - return true // relu->sum->relu - && jcp.with_eltwise == false - && (is_eltwise(0) && is_sum(1) && is_eltwise(2)); - default: - return false; + case 0: return true; // no post_ops + case 1: return is_relu(0) || is_sum(0); // relu or sum + case 2: return (is_sum(0) && is_relu(1)) || + (is_relu(0) && is_sum(1)); // sum->relu or relu->sum + case 3: return is_relu(0) && is_sum(1) && is_relu(2); // relu->sum->relu + default: return false; } return 
false; @@ -659,8 +558,7 @@ bool jit_avx512_common_conv_winograd_fwd_kernel_f32::post_ops_ok( status_t jit_avx512_common_conv_winograd_fwd_kernel_f32::init_conf( jit_conv_winograd_conf_t &jcp, const convolution_desc_t &cd, const memory_desc_wrapper &src_d, const memory_desc_wrapper &weights_d, - const memory_desc_wrapper &dst_d, const primitive_attr_t &attr, - bool with_relu, float relu_negative_slope) { + const memory_desc_wrapper &dst_d, const primitive_attr_t &attr) { status_t st = init_conf_common(jcp, cd, src_d, weights_d, dst_d); if (st != status::success) @@ -672,18 +570,14 @@ status_t jit_avx512_common_conv_winograd_fwd_kernel_f32::init_conf( jcp.ntiles = jcp.mb * jcp.itiles * jcp.jtiles; jcp.with_bias = cd.bias_desc.format != memory_format::undef; - jcp.with_eltwise = with_relu; - jcp.eltwise_alpha = relu_negative_slope; if (!post_ops_ok(jcp, attr)) return status::unimplemented; const auto &p = attr.post_ops_; - if (!jcp.with_eltwise) { - /* PostOps ReLU before SUM is handled the same as ReLU primitive */ - jcp.with_eltwise = p.find(primitive_kind::eltwise, 0, 1) != -1; - jcp.eltwise_alpha = 0.f; - } + const int eltwise_ind = p.find(primitive_kind::eltwise, 0, 1); + jcp.with_eltwise = eltwise_ind != -1; + if (jcp.with_eltwise) jcp.eltwise = p.entry_[eltwise_ind].eltwise; jcp.with_sum = p.find(primitive_kind::sum, 0) != -1; status_t res = init_conf_kernel(jcp, jcp.oc, jcp.ntiles, jcp.ic); @@ -1014,7 +908,7 @@ bool check_cond2_wu(int dimM_block, int dimM_simdw, int dimK_block, } } // namespace -bool set_wsched_WEI_S_D_G_W_avx512_common(jit_conv_winograd_conf_t &jcp) +status_t set_wsched_WEI_S_D_G_W_avx512_common(jit_conv_winograd_conf_t &jcp) { /*************** Choose dimN_reg_block (ic_simd_block) * *******************************/ @@ -1113,245 +1007,7 @@ bool set_wsched_WEI_S_D_G_W_avx512_common(jit_conv_winograd_conf_t &jcp) jcp.dimM_nb_block = (jcp.dimM / jcp.dimM_simd_block) / jcp.dimM_block; jcp.sched_policy = WSCHED_WEI_S_D_G_W; - return true; -} - -namespace { -bool is_in_L1_range(int v, float C1, float C2) -{ - return ((v > C1 * L1_cache_size) && (v < C2 * L1_cache_size)); -} - -bool is_in_L2_range(int v, float C1, float C2) -{ - return ((v > C1 * L2_cache_size) && (v < C2 * L2_cache_size)); -} - -void set_jcp_WEI_params(jit_conv_winograd_conf_t &jcp, int tile_block_ur, - int tile_block, int nb_ic, int nb_oc) -{ - jcp.tile_block_ur = tile_block_ur; - jcp.tile_block = tile_block; - jcp.nb_ic = nb_ic; - jcp.nb_oc = nb_oc; - - jcp.nb_tile_block_ur = jcp.ntiles / jcp.tile_block / jcp.tile_block_ur; - jcp.ic_block = jcp.ic / jcp.ic_simd_block / jcp.nb_ic; - jcp.oc_block = jcp.oc / jcp.oc_simd_block / jcp.nb_oc; - - jcp.dimK_reg_block = jcp.tile_block_ur; - jcp.dimK_block = jcp.nb_tile_block_ur; - jcp.dimK_nb_block = jcp.tile_block; - jcp.dimN_reg_block = jcp.ic_simd_block; - jcp.dimN_block = jcp.ic_block; - jcp.dimN_nb_block = jcp.nb_ic; - jcp.dimM_simd_block = jcp.oc_simd_block; - jcp.dimM_block = jcp.oc_block; - jcp.dimM_nb_block = jcp.nb_oc; -} -} - -bool set_wsched_WEI_SDGt_W_avx512_common(jit_conv_winograd_conf_t &jcp) -{ - jcp.ic_simd_block = jcp.oc_simd_block = 16; - int nb_ic_simd_block = jcp.ic / jcp.ic_simd_block; - int nb_oc_simd_block = jcp.oc / jcp.oc_simd_block; - - int min_tile_block_ur = 8; - int max_tile_block_ur = 64; - int max_tile_block = jcp.ntiles / min_tile_block_ur; - - // Consider L2 + L3 together on SKX - const float C1_min = .1, C1_0 = .4, C1_max = .5; - const float C2_0 = .4, C2_max = .5; - const float TC2_0 = .7, TC2_max = 1.2; - const int T_min = 2, 
T0 = 20; - float C1, C2, TC2; - int T, tile_block, tile_block_ur, nb_oc, nb_ic; - - auto blocking_ok = [&]() -> bool { - // V:tile_block + M:tile_block + U - int thread_size = alpha * alpha * jcp.oc - * (jcp.ntiles / tile_block) * sizeof(float) - + alpha * alpha * jcp.ic * (jcp.ntiles / tile_block) - * sizeof(float) - + alpha * alpha * jcp.ic * jcp.oc * sizeof(float); - // V:tile_block + M:tile_block - int L2_reuse = alpha * alpha * jcp.oc - * (jcp.ntiles / tile_block) * sizeof(float) - + alpha * alpha * jcp.ic * (jcp.ntiles / tile_block) - * sizeof(float); - // V:nb_ic + M:nb_tile_block_ur - // Use M:nb_oc + V:nb_ic as an superset estimation - int L1_reuse - = (jcp.ic / nb_ic) * (jcp.ntiles / tile_block) * sizeof(float) - + (jcp.oc / nb_oc) * (jcp.ntiles / tile_block) * sizeof(float); - - return jcp.ntiles % tile_block == 0 - && (jcp.ntiles / tile_block) % tile_block_ur == 0 - && is_in_L2_range(thread_size, TC2, TC2_max) - && is_in_L2_range(L2_reuse, C2, C2_max) - && tile_block > T * mkldnn_get_max_threads() - && nb_oc_simd_block % nb_oc == 0 - && nb_ic_simd_block % nb_ic == 0 - && is_in_L1_range(L1_reuse, C1, C1_max); - }; - - for (C1 = C1_0, C2 = C2_0, TC2 = TC2_0; C1 > C1_min; - C1 -= .02, C2 -= .02, TC2 -= .04) { - for (T = T0; T >= T_min; --T) { - for (tile_block = 1; tile_block <= max_tile_block; ++tile_block) { - for (tile_block_ur = max_tile_block_ur; - tile_block_ur >= min_tile_block_ur; --tile_block_ur) { - for (nb_oc = 1; nb_oc <= nb_oc_simd_block; ++nb_oc) { - for (nb_ic = nb_ic_simd_block; nb_ic >= 1; --nb_ic) { - if (blocking_ok()) { - set_jcp_WEI_params(jcp, tile_block_ur, - tile_block, nb_ic, nb_oc); - jcp.sched_policy = WSCHED_WEI_SDGt_W; - return true; - } - } - } - } - } - } - } - - return false; -} - -bool set_wsched_WEI_SDGtWo_avx512_common(jit_conv_winograd_conf_t &jcp) -{ - jcp.ic_simd_block = jcp.oc_simd_block = 16; - int nb_ic_simd_block = jcp.ic / jcp.ic_simd_block; - int nb_oc_simd_block = jcp.oc / jcp.oc_simd_block; - - int min_tile_block_ur = 12; - int max_tile_block_ur = 64; - int max_tile_block = jcp.ntiles / min_tile_block_ur; - - const float C1_min = .1, C1_0 = .4, C1_max = .5; - const float C2_0 = .4, C2_max = .6; - const float TC2_0 = .7, TC2_max = 1.6; - - const int max_nb_oc = 2; // Limit the # of sequential execution - const int T0 = 12, T_min = 8; - float C1, C2, TC2; - int T, tile_block, tile_block_ur, nb_oc, nb_ic; - - auto blocking_ok = [&]() -> bool { - // M:tile_block:nb_oc + V:tile_block + U:nb_oc - int thread_size = alpha * alpha * (jcp.oc / nb_oc) - * (jcp.ntiles / tile_block) * sizeof(float) - + alpha * alpha * jcp.ic * (jcp.ntiles / tile_block) - * sizeof(float) - + alpha * alpha * jcp.ic * (jcp.oc / nb_oc) - * sizeof(float); - // M:tile_block:nb_oc + V:tile_block - int L2_reuse = alpha * alpha * (jcp.oc / nb_oc) - * (jcp.ntiles / tile_block) * sizeof(float) - + alpha * alpha * jcp.ic * (jcp.ntiles / tile_block) - * sizeof(float); - // V:nb_ic + M:nb_tile_block_ur - // Use M:nb_oc + V:nb_ic as an superset estimation - int L1_reuse - = (jcp.ic / nb_ic) * (jcp.ntiles / tile_block) * sizeof(float) - + (jcp.oc / nb_oc) * (jcp.ntiles / tile_block) * sizeof(float); - - return jcp.ntiles % tile_block == 0 - && (jcp.ntiles / tile_block) % tile_block_ur == 0 - && is_in_L2_range(thread_size, TC2, TC2_max) - && is_in_L2_range(L2_reuse, C2, C2_max) - && tile_block > T * mkldnn_get_max_threads() - && nb_oc_simd_block % nb_oc == 0 - && nb_ic_simd_block % nb_ic == 0 - && is_in_L1_range(L1_reuse, C1, C1_max); - }; - - for (T = T0; T >= T_min; --T) { - for 
(C1 = C1_0, C2 = C2_0, TC2 = TC2_0; C1 > C1_min; - C1 -= .02, C2 -= .02, TC2 -= .04) { - for (nb_oc = 1; nb_oc <= max_nb_oc; ++nb_oc) { - for (tile_block = max_tile_block; tile_block >= 1; - --tile_block) { - for (tile_block_ur = min_tile_block_ur; - tile_block_ur <= max_tile_block_ur; - ++tile_block_ur) { - for (nb_ic = 1; nb_ic <= nb_ic_simd_block; ++nb_ic) { - if (blocking_ok()) { - set_jcp_WEI_params(jcp, tile_block_ur, - tile_block, nb_ic, nb_oc); - jcp.sched_policy = WSCHED_WEI_SDGtWo; - return true; - } - } - } - } - } - } - } - - return false; -} - -bool set_wsched_WEI_S_D_Giot_W_avx512_common(jit_conv_winograd_conf_t &jcp) -{ - jcp.ic_simd_block = jcp.oc_simd_block = 16; - int nb_ic_simd_block = jcp.ic / jcp.ic_simd_block; - - int min_tile_block_ur = 8; - int max_tile_block_ur = 64; - const float C1_min = .2, C1_0 = .4, C1_max = .9; - const float C2_min = .1, C2_0 = .4, C2_max = .5; - const int T0 = 16, T_min = 12; - float C1, C2; - int T, tile_block, tile_block_ur, nb_ic; - int nb_oc = 1; // Keep nb_oc small to increase - // oc_block, for better reuse of V in - // L2 - - auto blocking_ok = [&]() -> bool { - // V[:ic_block][][][] - int L2_reuse - = (jcp.ic / nb_ic) * (jcp.ntiles / tile_block) * sizeof(float); - // M[:nb_tile_block_ur][][] + V[:nb_tile_block_ur][][] - int L1_reuse - = (jcp.ntiles / tile_block) * jcp.oc_simd_block * sizeof(float); - - int work_amount = tile_block * nb_ic * nb_oc * alpha * alpha; - - return (jcp.ntiles / tile_block_ur) % tile_block == 0 - && jcp.ntiles % tile_block_ur == 0 - && nb_ic_simd_block % nb_ic == 0 - && is_in_L2_range(L2_reuse, C2, C2_max) - && is_in_L1_range(L1_reuse, C1, C1_max) - && work_amount > T * mkldnn_get_max_threads(); - }; - - for (T = T0; T >= T_min; --T) { - for (C1 = C1_0; C1 > C1_min; C1 -= .02) { - for (C2 = C2_0; C2 > C2_min; C2 -= .02) { - for (nb_ic = 1; nb_ic <= nb_ic_simd_block; ++nb_ic) { - for (tile_block_ur = min_tile_block_ur; - tile_block_ur <= max_tile_block_ur; - ++tile_block_ur) { - for (tile_block = 1; - tile_block <= jcp.ntiles / min_tile_block_ur; - ++tile_block) { - if (blocking_ok()) { - set_jcp_WEI_params(jcp, tile_block_ur, - tile_block, nb_ic, nb_oc); - jcp.sched_policy = WSCHED_WEI_S_D_Giot_W; - return true; - } - } - } - } - } - } - } - return false; + return status::success; } status_t jit_avx512_common_conv_winograd_bwd_weights_kernel_f32::init_conf( @@ -1359,8 +1015,7 @@ status_t jit_avx512_common_conv_winograd_bwd_weights_kernel_f32::init_conf( const memory_desc_wrapper &src_d, const memory_desc_wrapper &diff_dst_d, const memory_desc_wrapper &diff_weights_d) { - if (!mayiuse(avx512_common)) - return status::unimplemented; + jcp.nthr = mkldnn_get_max_threads(); const bool with_groups = diff_weights_d.ndims() == src_d.ndims() + 1; @@ -1397,15 +1052,18 @@ status_t jit_avx512_common_conv_winograd_bwd_weights_kernel_f32::init_conf( jcp.ic = rnd_up(jcp.ic, simd_w); } + if (mayiuse(avx512_core)) + return status::unimplemented; if (!mayiuse(avx512_common)) return status::unimplemented; - else if (mayiuse(avx512_core)) - jcp.ver = ver_avx512_core; else if (mayiuse(avx512_mic_4ops)) jcp.ver = ver_4fma; else jcp.ver = ver_fma; + if (!IMPLICATION(cd.alg_kind == alg_kind::convolution_auto, + is_winograd_faster_than_direct(jcp))) + return status::unimplemented; // Winograd specific initialization jcp.itiles = (jcp.ow + tile_size - 1) / tile_size; jcp.jtiles = (jcp.oh + tile_size - 1) / tile_size; @@ -1474,16 +1132,9 @@ status_t jit_avx512_common_conv_winograd_bwd_weights_kernel_f32::init_conf( jcp.zmm_start = 
jcp.ver == ver_4fma ? 4 : 1; jcp.nb_reg = 32 - jcp.zmm_start; - status_t res; jcp.sched_policy = WSCHED_INVALID; - if ((jcp.ver == ver_avx512_core && - (set_wsched_WEI_SDGt_W_avx512_common(jcp) - || set_wsched_WEI_SDGtWo_avx512_common(jcp) - || set_wsched_WEI_S_D_Giot_W_avx512_common(jcp))) - || set_wsched_WEI_S_D_G_W_avx512_common(jcp)) - res = status::success; - else - return status::unimplemented; + status_t res = set_wsched_WEI_S_D_G_W_avx512_common(jcp); + assert(jcp.sched_policy == WSCHED_WEI_S_D_G_W); jcp.tile_block_ur = jcp.dimK_reg_block; jcp.nb_tile_block_ur = jcp.dimK_block; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_conv_winograd_kernel_f32.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_conv_winograd_kernel_f32.hpp index f6fb2da..6c11714 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_conv_winograd_kernel_f32.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_conv_winograd_kernel_f32.hpp @@ -91,8 +91,7 @@ struct jit_avx512_common_conv_winograd_fwd_kernel_f32 static status_t init_conf(jit_conv_winograd_conf_t &jcp, const convolution_desc_t &cd, const memory_desc_wrapper &src_d, const memory_desc_wrapper &weights_d, - const memory_desc_wrapper &dst_d, const primitive_attr_t &attr, - bool with_relu = false, float relu_negative_slope = 0.); + const memory_desc_wrapper &dst_d, const primitive_attr_t &attr); }; struct jit_avx512_common_conv_winograd_bwd_data_kernel_f32 diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_convolution.cpp index 8767207..da07a52 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_convolution.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_convolution.cpp @@ -14,19 +14,20 @@ * limitations under the License. *******************************************************************************/ -#include "mkldnn_types.h" #include "c_types_map.hpp" -#include "jit_avx512_common_convolution.hpp" #include "mkldnn_thread.hpp" #include "type_helpers.hpp" #include "utils.hpp" +#include "jit_avx512_common_convolution.hpp" + namespace mkldnn { namespace impl { namespace cpu { using namespace mkldnn::impl::status; using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; using namespace mkldnn::impl::utils; using namespace nstl; @@ -127,25 +128,40 @@ void jit_conv_3d_ker_bwd_w_pipeline(jit_conv_ker_t ker, jit_conv_call_s &p, ker(&p); } #define wht_blk_off(d, g, ...) \ - (conf_.with_groups() \ + (pd()->with_groups() \ ? 
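// [Example] The convolution_auto gate added above reads as a logical
// implication: IMPLICATION(cause, effect) is mkl-dnn's conventional
// (!(cause) || (effect)) helper, so Winograd is rejected only when automatic
// algorithm selection was requested *and* the heuristic prefers direct:
#include <cassert>
namespace impl_sketch {
inline bool implication(bool cause, bool effect) { return !cause || effect; }
inline void demo() {
    assert(implication(false, false)); // alg != convolution_auto: no constraint
    assert(implication(true, true));   // auto requested, Winograd wins: proceed
    assert(!implication(true, false)); // auto requested, direct wins: bail out
}
} // namespace impl_sketch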
(d).blk_off((g), __VA_ARGS__) \ : (d).blk_off(__VA_ARGS__)) -template +void jit_avx512_common_convolution_fwd_t:: +prepare_padded_bias(const dst_data_t *&bias) const { + if (!pd()->wants_padded_bias()) return; + + auto padded_bias = scratchpad().template get( + key_conv_padded_bias); + utils::array_copy(padded_bias, bias, pd()->jcp_.oc_without_padding); + utils::array_set(padded_bias + pd()->jcp_.oc_without_padding, + (dst_data_t)0, pd()->jcp_.oc - pd()->jcp_.oc_without_padding); + bias = padded_bias; +} + +template -void _jit_avx512_common_convolution_fwd_t - ::execute_forward_1d() +void jit_avx512_common_convolution_fwd_t + ::execute_forward_1d() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); + prepare_padded_bias(bias); - const auto &jcp = kernel_->jcp; + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + + const auto &jcp = pd()->jcp_; assert(jcp.nb_oc % jcp.nb_oc_blocking == 0); int oc_chunks = jcp.nb_oc / jcp.nb_oc_blocking; @@ -157,11 +173,6 @@ void _jit_avx512_common_convolution_fwd_t else nthr = mkldnn_get_max_threads(); - if (conf_.want_padded_bias()) { - for (int oc = 0; oc < jcp.oc_without_padding; ++oc) - padded_bias_[oc] = bias[oc]; - bias = padded_bias_; - } parallel(nthr, [&](const int ithr, const int nthr) { int start{0}, end{0}, start_copy; balance211(work_amount, nthr, ithr, start, end); @@ -191,7 +202,7 @@ void _jit_avx512_common_convolution_fwd_t int ocb = occ * jcp.nb_oc_blocking; int g_ocb = g * jcp.nb_oc + ocb; int g_oc = g_ocb * jcp.oc_block; - int g_icb = g * jcp.nb_ic; + int g_icb = g * jcp.nb_ic * jcp.nonblk_group_off; int ow_s = owb * jcp.ow_block; int iw_s = ow_s * jcp.stride_w; @@ -228,22 +239,24 @@ void _jit_avx512_common_convolution_fwd_t }); } -template -void _jit_avx512_common_convolution_fwd_t - ::execute_forward_2d() +void jit_avx512_common_convolution_fwd_t + ::execute_forward_2d() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); + prepare_padded_bias(bias); - const auto &jcp = kernel_->jcp; - const int MB = conf_.MB(); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + + const auto &jcp = pd()->jcp_; + const int MB = pd()->MB(); assert(jcp.nb_oc % jcp.nb_oc_blocking == 0); int oc_chunks = jcp.nb_oc / jcp.nb_oc_blocking; @@ -255,12 +268,6 @@ void _jit_avx512_common_convolution_fwd_t else nthr = mkldnn_get_max_threads(); - if (conf_.want_padded_bias()) { - for (int oc = 0; oc < jcp.oc_without_padding; ++oc) - padded_bias_[oc] = bias[oc]; - bias = padded_bias_; - } - parallel(nthr, [&](const int ithr, const int nthr) { int start{0}, end{0}, start_copy; balance211(work_amount, nthr, ithr, start, end); @@ -290,7 +297,7 @@ void _jit_avx512_common_convolution_fwd_t int ocb = occ * 
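// [Example] prepare_padded_bias() above copies the real bias channels into a
// scratchpad buffer and zero-fills the padded tail, so vectorized kernels can
// always read whole oc_block lanes. The same idea in a standalone form
// (illustrative only; the primitive uses the scratchpad, not a vector):
#include <algorithm>
#include <vector>
namespace bias_sketch {
std::vector<float> pad_bias(const float *bias, int oc_real, int oc_padded) {
    std::vector<float> padded(oc_padded, 0.f);       // padded tail stays zero
    std::copy(bias, bias + oc_real, padded.begin()); // real channels up front
    return padded;
}
} // namespace bias_sketch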
jcp.nb_oc_blocking; int g_ocb = g * jcp.nb_oc + ocb; int g_oc = g_ocb * jcp.oc_block; - int g_icb = g * jcp.nb_ic; + int g_icb = g * jcp.nb_ic * jcp.nonblk_group_off; int work_rem = end - start; @@ -357,30 +364,26 @@ void _jit_avx512_common_convolution_fwd_t }); } -template -void _jit_avx512_common_convolution_fwd_t - ::execute_forward_3d() +void jit_avx512_common_convolution_fwd_t + ::execute_forward_3d() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); - const memory_desc_wrapper bias_d(conf_.weights_pd(1)); + prepare_padded_bias(bias); - const auto &jcp = kernel_->jcp; - const int MB = conf_.MB(); - assert(jcp.nb_oc % jcp.nb_oc_blocking == 0); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + const memory_desc_wrapper bias_d(pd()->weights_pd(1)); - if (conf_.want_padded_bias()) { - for (int oc = 0; oc < jcp.oc_without_padding; ++oc) - padded_bias_[oc] = bias[oc]; - bias = padded_bias_; - } + const auto &jcp = pd()->jcp_; + const int MB = pd()->MB(); + assert(jcp.nb_oc % jcp.nb_oc_blocking == 0); parallel(0, [&](const int ithr, const int nthr) { int oc_chunks = jcp.nb_oc / jcp.nb_oc_blocking; @@ -418,7 +421,7 @@ void _jit_avx512_common_convolution_fwd_t int ocb = occ * jcp.nb_oc_blocking; int g_ocb = g * jcp.nb_oc + ocb; int g_oc = g_ocb * jcp.oc_block; - int g_icb = g * jcp.nb_ic; + int g_icb = g * jcp.nb_ic * jcp.nonblk_group_off; int work_rem = end - start; int ih_s = -jcp.t_pad + oh_s * jcp.stride_h; @@ -491,25 +494,22 @@ void _jit_avx512_common_convolution_fwd_t }); } -template struct _jit_avx512_common_convolution_fwd_t; -template struct _jit_avx512_common_convolution_fwd_t; -template struct _jit_avx512_common_convolution_fwd_t; -template struct _jit_avx512_common_convolution_fwd_t; +template struct jit_avx512_common_convolution_fwd_t; template void jit_avx512_common_convolution_bwd_data_t::execute_backward_data_1d() { + diff_src_type>::execute_backward_data_1d() const { auto diff_dst = reinterpret_cast (this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto diff_src = reinterpret_cast(this->memory()); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper diff_src_d(conf_.diff_src_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper diff_src_d(pd()->diff_src_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); const auto &jcp = kernel_->jcp; @@ -579,18 +579,18 @@ void jit_avx512_common_convolution_bwd_data_t void jit_avx512_common_convolution_bwd_data_t::execute_backward_data_2d() { + diff_src_type>::execute_backward_data_2d() const { auto diff_dst = reinterpret_cast (this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto diff_src = reinterpret_cast(this->memory()); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper diff_src_d(conf_.diff_src_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper 
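// [Example] The parallel loops above carve the flattened work amount with
// balance211(work, nthr, ithr, start, end): contiguous, near-equal chunks
// where the first (work % nthr) threads take one extra item. Reimplemented
// here for illustration; this is not the library source:
#include <algorithm>
namespace balance_sketch {
void balance211(int n, int nthr, int ithr, int &start, int &end) {
    const int base = n / nthr, rem = n % nthr;
    start = ithr * base + std::min(ithr, rem); // earlier threads absorb remainder
    end = start + base + (ithr < rem ? 1 : 0);
}
// e.g. n = 10, nthr = 4 yields chunks [0,3) [3,6) [6,8) [8,10)
} // namespace balance_sketch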
diff_src_d(pd()->diff_src_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); const auto &jcp = kernel_->jcp; - const int MB = conf_.MB(); + const int MB = pd()->MB(); parallel(0, [&](const int ithr, const int nthr) { int start{0}, end{0}, start_copy; @@ -704,18 +704,18 @@ void jit_avx512_common_convolution_bwd_data_t void jit_avx512_common_convolution_bwd_data_t::execute_backward_data_3d() { + diff_src_type>::execute_backward_data_3d() const { auto diff_dst = reinterpret_cast (this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto diff_src = reinterpret_cast(this->memory()); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper diff_src_d(conf_.diff_src_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper diff_src_d(pd()->diff_src_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); const auto &jcp = kernel_->jcp; - const int MB = conf_.MB(); + const int MB = pd()->MB(); parallel(0, [&](const int ithr, const int nthr) { int start{0}, end{0}, start_copy; @@ -881,89 +881,33 @@ template jit_avx512_common_convolution_bwd_weights_t:: -jit_avx512_common_convolution_bwd_weights_t(const pd_t *pd, +jit_avx512_common_convolution_bwd_weights_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), kernel_(nullptr) + : cpu_primitive_t(apd, inputs, outputs), kernel_(nullptr) , trans_kernel_(nullptr), trans_dst_kernel_(nullptr), acc_ker_(nullptr) - , reducer_bias_(nullptr), padded_bias_(nullptr), tr_src_(nullptr) - , tr_diff_dst_(nullptr), ws_reduction_(nullptr), tr_src_bctx_(nullptr) - , tr_diff_dst_bctx_(nullptr) + , reducer_bias_(nullptr) { - const auto &j = conf_.jcp_; - kernel_ = new jit_avx512_common_conv_bwd_weights_kernel_f32(j); + const auto &j = pd()->jcp_; - balance(); + nthr_ = j.nthr; + nthr_mb_ = j.nthr_mb; + nthr_g_ = j.nthr_g; + nthr_oc_b_ = j.nthr_oc_b; + nthr_ic_b_ = j.nthr_ic_b; + + kernel_ = new jit_avx512_common_conv_bwd_weights_kernel_f32(j); if (utils::one_of(j.ver, ver_4fma, ver_4vnni, ver_vnni)) { trans_kernel_ = create_trans_src(&j); if (utils::one_of(j.ver, ver_4vnni, ver_vnni)) trans_dst_kernel_ = create_trans_dst(&j); - if (j.is_1stconv) { - const int tr_src_size = - nthr_ / nthr_oc_b_ * j.ih * j.stride_w * j.tr_ld; - tr_src_ = (src_data_t *)malloc(tr_src_size * sizeof(src_data_t), 64); - } else { - // XXX: See the comment about tr_iw and guarding elements in - // jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf() - const int max_nthr = nthr_mb_ * j.ngroups * j.nb_ic; - const int min_tr_src_size_per_thr = j.ih * j.ic_block * j.tr_iw; - const int tr_src_size = max_nthr * min_tr_src_size_per_thr - + j.tr_src_num_guard_elems; - tr_src_ = (src_data_t *)malloc(tr_src_size * sizeof(src_data_t), 64); - /* to avoid NaNs in computations we zero tail num_guard_elems for - * each possible thread group */ - for (int ithr = 1; ithr <= max_nthr; ++ithr) { - src_data_t *ts = &tr_src_[ithr * min_tr_src_size_per_thr]; - for (int i = 0; i < j.tr_src_num_guard_elems; ++i) - ts[i] = 0; - } - } - - /* prepare synchronization contexts */ - if (nthr_oc_b_ > 1) { - const int tr_src_bctx_size = nthr_ / nthr_oc_b_; - tr_src_bctx_ = (simple_barrier::ctx_t *)malloc( - tr_src_bctx_size * sizeof(simple_barrier::ctx_t), 64); - for (int i = 0; i < tr_src_bctx_size; ++i) - simple_barrier::ctx_init(&tr_src_bctx_[i]); - } - - if 
(utils::one_of(j.ver, ver_4vnni, ver_vnni)) { - const size_t tr_diff_dst_size = - nthr_mb_ * j.ngroups * j.nb_oc * j.oc_block * j.tr_ow * j.oh; - tr_diff_dst_ = (diff_dst_data_t *)malloc( - tr_diff_dst_size * sizeof(diff_dst_data_t), 64); - - /* prepare synchronization contexts */ - if (nthr_ic_b_ > 1) { - const size_t tr_diff_dst_bctx_size = nthr_ / nthr_ic_b_; - tr_diff_dst_bctx_ = (simple_barrier::ctx_t *)malloc( - tr_diff_dst_bctx_size * sizeof(simple_barrier::ctx_t), - 64); - for (size_t i = 0; i < tr_diff_dst_bctx_size; ++i) - simple_barrier::ctx_init(&tr_diff_dst_bctx_[i]); - } - } } - if (nthr_mb_ > 1) { - const int wei_size = j.ngroups * j.oc * j.ic * j.kh * j.kw * j.kd; - const int bia_size = j.ngroups * j.oc; - ws_reduction_ = (diff_weights_data_t *)malloc((nthr_mb_ - 1) - * (wei_size + bia_size) * sizeof(diff_weights_data_t), 64); + if (nthr_mb_ > 1) acc_ker_ = new cpu_accumulator_1d_t(); - simple_barrier::ctx_init(&reduction_bctx_); - } - if (conf_.with_bias()) { - const size_t max_buffer_size = nthr_ * 3 * 5 * 5 * 16 * 16; - reducer_bias_ = new cpu_reducer_t(reduce_balancer_t( - nthr_, j.oc_block, j.ngroups * j.nb_oc, j.mb, - max_buffer_size)); - if (conf_.want_padded_bias()) - padded_bias_ = (diff_weights_data_t *) - malloc(sizeof(diff_weights_data_t) * j.oc, 64); - } + reducer_bias_ = + new cpu_reducer_t(pd()->reducer_bia_conf_); } template scratchpad()), ithr(ithr) { src = reinterpret_cast(self->input_memory(0)); diff_dst = reinterpret_cast( self->input_memory(1)); diff_weights = reinterpret_cast(self->memory(0)); - diff_bias = self->conf_.want_padded_bias() - ? self->padded_bias_ + diff_bias = self->pd()->wants_padded_bias() + ? scratchpad.template get( + key_conv_padded_bias) : reinterpret_cast(self->memory(1)); + tr_src = scratchpad.template get(key_conv_tr_src); + tr_src_bctx = scratchpad.template get( + key_conv_tr_src_bctx); + + tr_diff_dst = scratchpad.template get( + key_conv_tr_diff_dst); + tr_diff_dst_bctx = scratchpad.template get( + key_conv_tr_diff_dst_bctx); + + wei_bia_reduction = scratchpad.template get( + key_conv_wei_bia_reduction); + wei_bia_reduction_bctx = scratchpad.template get( + key_conv_wei_bia_reduction_bctx); + ithr_ic_b = ithr % self->nthr_ic_b_; ithr_oc_b = ithr / self->nthr_ic_b_ % self->nthr_oc_b_; ithr_g = ithr / self->nthr_ic_b_ / self->nthr_oc_b_ % self->nthr_g_; @@ -1030,20 +999,20 @@ struct jit_avx512_common_convolution_bwd_weights_t void jit_avx512_common_convolution_bwd_weights_t::compute_diff_weights(const thread_info_t *ti) { - const memory_desc_wrapper src_d(conf_.src_pd(0)); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper diff_weights_d(conf_.diff_weights_pd(0)); + diff_weights_type>::compute_diff_weights(const thread_info_t *ti) const { + const memory_desc_wrapper src_d(pd()->src_pd(0)); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper diff_weights_d(pd()->diff_weights_pd(0)); const auto &jcp = kernel_->jcp; const int wei_size = jcp.ngroups * jcp.oc * jcp.ic * jcp.kh*jcp.kw*jcp.kd; diff_weights_data_t *diff_wei = ti->ithr_mb == 0 ? (diff_weights_data_t*)ti->diff_weights - : (diff_weights_data_t*)ws_reduction_ + (ti->ithr_mb - 1) * wei_size; + : ti->wei_bia_reduction + (ti->ithr_mb - 1) * wei_size; diff_weights_data_t *diff_bia = ti->ithr_mb == 0 ? 
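// [Example] The rewritten constructor and thread_info_t above no longer own
// tr_src_/tr_diff_dst_/ws_reduction_; they borrow buffers from a scratchpad
// keyed by names such as key_conv_tr_src. The pattern is two-phase: sizes are
// booked once at primitive-descriptor creation, typed pointers are handed out
// at execution. A hypothetical stand-in for the memory_tracking
// registrar/grantor pair (the real API also handles alignment and prefixes):
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>
namespace scratch_sketch {
struct scratchpad_t {
    void book(int key, std::size_t bytes) {          // registration phase
        offset_[key] = storage_.size();
        storage_.resize(storage_.size() + bytes);
    }
    template <typename T> T *get(int key) {          // execution phase
        return reinterpret_cast<T *>(storage_.data() + offset_.at(key));
    }
private:
    std::map<int, std::size_t> offset_;
    std::vector<std::uint8_t> storage_;
};
} // namespace scratch_sketch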
(diff_weights_data_t*)ti->diff_bias - : (diff_weights_data_t*)ws_reduction_ + (nthr_mb_ - 1) * wei_size + : ti->wei_bia_reduction + (nthr_mb_ - 1) * wei_size + (ti->ithr_mb - 1) * jcp.ngroups * jcp.oc; // TODO: use memory descriptor with the same fmt as src (or use a macro :)) @@ -1069,7 +1038,7 @@ void jit_avx512_common_convolution_bwd_weights_tsrc[src_d.blk_off(img, _ic, j)]; - src_data_t *tr_src1 = &tr_src_[tr_src_off(ti->ithr_mb, _ic, j)]; + src_data_t *tr_src1 = &ti->tr_src[tr_src_off(ti->ithr_mb, _ic, j)]; assert(jcp.ic_block == 16); const int src_stride = jcp.iw * jcp.ic_block; @@ -1147,7 +1116,7 @@ void jit_avx512_common_convolution_bwd_weights_tdiff_dst[diff_dst_d.blk_off(img, oc, j)]; diff_dst_data_t *tr_diff_dst1 - = &tr_diff_dst_[tr_diff_dst_off(img, oc, j)]; + = &ti->tr_diff_dst[tr_diff_dst_off(img, oc, j)]; assert(jcp.ic_block == 16); @@ -1206,7 +1175,7 @@ void jit_avx512_common_convolution_bwd_weights_ttr_src + ti->ithr_but_oc * jcp.ih * jcp.stride_w * jcp.tr_ld; assert(IMPLICATION(!mkldnn_thr_syncable(), nthr_oc_b_ == 1)); @@ -1215,7 +1184,7 @@ void jit_avx512_common_convolution_bwd_weights_tithr_oc_b, ih_start, ih_end); tr_ctx.tr_src_ih_start = ih_start; tr_ctx.tr_src_ih_end = ih_end; - tr_ctx.tr_src_bctx = tr_src_bctx_ + ti->ithr_but_oc; + tr_ctx.tr_src_bctx = ti->tr_src_bctx + ti->ithr_but_oc; auto p = jit_conv_call_s(); p.src = tr_ctx.tr_src; @@ -1267,20 +1236,20 @@ void jit_avx512_common_convolution_bwd_weights_t 1) - barrier(&tr_src_bctx_[ti->ithr_but_oc], nthr_oc_b_); + barrier(&ti->tr_src_bctx[ti->ithr_but_oc], nthr_oc_b_); uker_trans(img); if (nthr_oc_b_ > 1) - barrier(&tr_src_bctx_[ti->ithr_but_oc], nthr_oc_b_); + barrier(&ti->tr_src_bctx[ti->ithr_but_oc], nthr_oc_b_); } if (utils::one_of(jcp.ver, ver_4vnni, ver_vnni)) { /* tr_diff_dst[nb_oc][OW][oh][16c][2ow] * <- diff_dst[nb_oc][oh][ow][16c] */ if (nthr_ic_b_ > 1) - barrier(&tr_diff_dst_bctx_[ti->ithr_but_ic], nthr_ic_b_); + barrier(&ti->tr_diff_dst_bctx[ti->ithr_but_ic], nthr_ic_b_); diff_dst_trans(img); if (nthr_ic_b_ > 1) - barrier(&tr_diff_dst_bctx_[ti->ithr_but_ic], nthr_ic_b_); + barrier(&ti->tr_diff_dst_bctx[ti->ithr_but_ic], nthr_ic_b_); } for (int g = ti->g_start; g < ti->g_end; ++g) { @@ -1291,10 +1260,10 @@ void jit_avx512_common_convolution_bwd_weights_tjit_ker, p, (utils::one_of(jcp.ver, ver_4fma, ver_4vnni, ver_vnni) - ? &tr_src_[tr_src_off(ti->ithr_mb, _ic, 0)] + ? &ti->tr_src[tr_src_off(ti->ithr_mb, _ic, 0)] : &ti->src[src_d.blk_off(img, _ic)]), utils::one_of(jcp.ver, ver_4vnni, ver_vnni) - ? &tr_diff_dst_[tr_diff_dst_off(ti->ithr_mb, _oc, 0)] + ? &ti->tr_diff_dst[tr_diff_dst_off(ti->ithr_mb, _oc, 0)] : &ti->diff_dst[diff_dst_d.blk_off(img, _oc)], diff_wei + wht_blk_off(diff_weights_d, g, oc_b, ic_b), 0, (img == ti->img_start), 0, 0); @@ -1307,10 +1276,10 @@ void jit_avx512_common_convolution_bwd_weights_tg_start * jcp.nb_ic + ti->ic_b_start; jit_conv_ker_pipeline(kernel_->jit_ker, p, (utils::one_of(jcp.ver, ver_4fma, ver_4vnni, ver_vnni) - ? &tr_src_[tr_src_off(ti->ithr_mb, _ic, 0)] + ? &ti->tr_src[tr_src_off(ti->ithr_mb, _ic, 0)] : &ti->src[src_d.blk_off(img + 1, _ic)]), utils::one_of(jcp.ver, ver_4vnni, ver_vnni) - ? &tr_diff_dst_[tr_diff_dst_off(ti->ithr_mb, _oc, 0)] + ? 
&ti->tr_diff_dst[tr_diff_dst_off(ti->ithr_mb, _oc, 0)] : &ti->diff_dst[diff_dst_d.blk_off(img + 1, _oc)], diff_wei + wht_blk_off( diff_weights_d, ti->g_start, @@ -1323,10 +1292,11 @@ void jit_avx512_common_convolution_bwd_weights_t void jit_avx512_common_convolution_bwd_weights_t::compute_diff_weights_3d(const thread_info_t *ti) { - const memory_desc_wrapper src_d(conf_.src_pd(0)); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper diff_weights_d(conf_.diff_weights_pd(0)); + diff_weights_type>::compute_diff_weights_3d(const thread_info_t *ti) const +{ + const memory_desc_wrapper src_d(pd()->src_pd(0)); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper diff_weights_d(pd()->diff_weights_pd(0)); const auto &jcp = kernel_->jcp; const int wei_size @@ -1334,10 +1304,10 @@ void jit_avx512_common_convolution_bwd_weights_tithr_mb == 0 ? (diff_weights_data_t*)ti->diff_weights - : (diff_weights_data_t*)ws_reduction_ + (ti->ithr_mb - 1) * wei_size; + : ti->wei_bia_reduction + (ti->ithr_mb - 1) * wei_size; diff_weights_data_t *diff_bia = ti->ithr_mb == 0 ? (diff_weights_data_t*)ti->diff_bias - : (diff_weights_data_t*)ws_reduction_ + (nthr_mb_ - 1) * wei_size + : ti->wei_bia_reduction + (nthr_mb_ - 1) * wei_size + (ti->ithr_mb - 1) * jcp.ngroups * jcp.oc; const int inp_mult = jcp.is_1stconv ? 1 : jcp.ic_block; @@ -1397,17 +1367,17 @@ void jit_avx512_common_convolution_bwd_weights_t void jit_avx512_common_convolution_bwd_weights_t::reduce_diff_weights(const thread_info_t *ti) { - const memory_desc_wrapper diff_weights_d(conf_.diff_weights_pd(0)); + diff_weights_type>::reduce_diff_weights(const thread_info_t *ti) const { + const memory_desc_wrapper diff_weights_d(pd()->diff_weights_pd(0)); const auto &jcp = kernel_->jcp; const int wei_size = jcp.ngroups * jcp.oc * jcp.ic * jcp.kh * jcp.kw; const int bia_size = jcp.ngroups * jcp.oc; const diff_weights_data_t *diff_bias_ws - = ws_reduction_ + (nthr_mb_ - 1) * wei_size; + = ti->wei_bia_reduction + (nthr_mb_ - 1) * wei_size; - /* diff_weights[:] += sum(ws_reduction_[thr_mb][:]) */ - simple_barrier::barrier(&reduction_bctx_, nthr_); + /* diff_weights[:] += sum(wei_reduction_[thr_mb][:]) */ + simple_barrier::barrier(ti->wei_bia_reduction_bctx, nthr_); const int ic_b_kh_work = ti->ic_b_work * jcp.kh; const int work = ti->g_work * ti->oc_b_work * ic_b_kh_work; @@ -1437,7 +1407,7 @@ void jit_avx512_common_convolution_bwd_weights_tdiff_weights + off; diff_weights_data_t *s - = ws_reduction_ + (thr_mb - 1) * wei_size + off; + = ti->wei_bia_reduction + (thr_mb - 1) * wei_size + off; acc_ker_->accumulate(d, s, acc_size); @@ -1457,15 +1427,15 @@ void jit_avx512_common_convolution_bwd_weights_t void jit_avx512_common_convolution_bwd_weights_t::reduce_diff_weights_3d(const thread_info_t *ti) { - const memory_desc_wrapper diff_weights_d(conf_.diff_weights_pd(0)); + diff_weights_type>::reduce_diff_weights_3d(const thread_info_t *ti) const { + const memory_desc_wrapper diff_weights_d(pd()->diff_weights_pd(0)); const auto &jcp = kernel_->jcp; const int wei_size = jcp.ngroups * jcp.oc * jcp.ic * jcp.kh * jcp.kw * jcp.kd; - /* diff_weights[:] += sum(ws_reduction_[thr_mb][:]) */ - simple_barrier::barrier(&reduction_bctx_, nthr_); + /* diff_weights[:] += sum(wei_reduction_[thr_mb][:]) */ + simple_barrier::barrier(ti->wei_bia_reduction_bctx, nthr_); const int ic_b_kh_work = ti->ic_b_work * jcp.kd; const int work = ti->g_work * ti->oc_b_work * ic_b_kh_work; @@ -1494,7 +1464,7 @@ void 
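// [Example] Core of reduce_diff_weights() above with the blocking stripped
// away: the thread with ithr_mb == 0 wrote straight into diff_weights, while
// threads 1..nthr_mb-1 wrote slice (t - 1) of the wei_bia_reduction
// workspace; after the barrier those slices are summed into the result:
namespace reduce_sketch {
void accumulate(float *diff_weights, const float *workspace, int nthr_mb,
        int wei_size) {
    for (int t = 1; t < nthr_mb; ++t)
        for (int i = 0; i < wei_size; ++i)
            diff_weights[i] += workspace[(t - 1) * wei_size + i];
}
} // namespace reduce_sketch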
jit_avx512_common_convolution_bwd_weights_tdiff_weights + off; diff_weights_data_t *s - = ws_reduction_ + (thr_mb - 1) * wei_size + off; + = ti->wei_bia_reduction + (thr_mb - 1) * wei_size + off; acc_ker_->accumulate(d, s, acc_size); nd_iterator_jump(w, end, sub_g_start, ti->g_work, sub_oc_b_start, @@ -1506,25 +1476,28 @@ void jit_avx512_common_convolution_bwd_weights_t void jit_avx512_common_convolution_bwd_weights_t::compute_diff_bias(const thread_info_t *ti) { - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); + diff_weights_type>::compute_diff_bias(const thread_info_t *ti) const { + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); auto rb = this->reducer_bias_; - assert(nthr_ == rb->balancer_.nthr_); + assert(nthr_ == rb->balancer().nthr_); + + const auto reducer_bia_scratchpad = memory_tracking::grantor_t( + ti->scratchpad, prefix_reducer_bia); const auto &jcp = kernel_->jcp; if (jcp.with_bias && jcp.is_1stconv && jcp.ver == ver_4fma) return; - const int b_job_start = rb->balancer_.ithr_job_off(ti->ithr); - const int b_njobs = rb->balancer_.ithr_njobs(ti->ithr); + const int b_job_start = rb->balancer().ithr_job_off(ti->ithr); + const int b_njobs = rb->balancer().ithr_njobs(ti->ithr); if (b_njobs == 0) return; /* reduction dimension */ int img_start{0}, img_end{0}; - balance211(jcp.mb, rb->balancer_.nthr_per_group_, - rb->balancer_.id_in_group(ti->ithr), img_start, img_end); + balance211(jcp.mb, rb->balancer().nthr_per_group_, + rb->balancer().id_in_group(ti->ithr), img_start, img_end); /* jobs */ int g_start{0}, ocb_start{0}; @@ -1536,9 +1509,9 @@ void jit_avx512_common_convolution_bwd_weights_tdiff_dst[diff_dst_d.blk_off(img, _oc)]; - diff_weights_data_t *d_bias = &rb->get_local_ptr(ti->ithr, - (diff_weights_data_t *)ti->diff_bias)[ - b_job_loc * rb->balancer_.job_size_]; + diff_weights_data_t *d_bias = rb->get_local_ptr(ti->ithr, + ti->diff_bias, reducer_bia_scratchpad) + + b_job_loc * rb->balancer().job_size_; if (img == img_start) for (int o = 0; o < 16; ++o) @@ -1554,13 +1527,13 @@ void jit_avx512_common_convolution_bwd_weights_treduce(ti->ithr, ti->diff_bias); + rb->reduce(ti->ithr, ti->diff_bias, reducer_bia_scratchpad); } template void jit_avx512_common_convolution_bwd_weights_t::compute_diff_bias_3d(const thread_info_t *ti) { + diff_weights_type>::compute_diff_bias_3d(const thread_info_t *ti) const { const auto &jcp = kernel_->jcp; @@ -1568,7 +1541,7 @@ void jit_avx512_common_convolution_bwd_weights_twei_bia_reduction + (size_t)(nthr_mb_ - 1) * wei_size; if (nthr_mb_ > 1) mkldnn_thr_barrier(); @@ -1584,161 +1557,91 @@ void jit_avx512_common_convolution_bwd_weights_t void jit_avx512_common_convolution_bwd_weights_t::execute_backward_weights() { + diff_weights_type>::prepare_scratchpad_data() const +{ + const auto &j = pd()->jcp_; + auto scratchpad = this->scratchpad(); + + if (utils::one_of(j.ver, ver_4fma, ver_4vnni, ver_vnni)) { + if (!j.is_1stconv) { + // XXX: See the comment about tr_iw and guarding elements in + // jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf() + const int max_nthr = j.nthr_mb * j.ngroups * j.nb_ic; + const int min_tr_src_size_per_thr = j.ih * j.ic_block * j.tr_iw; + + auto tr_src = scratchpad.template get(key_conv_tr_src); + /* to avoid NaNs in computations we zero tail num_guard_elems for + * each possible thread group */ + + for (int ithr = 1; ithr <= max_nthr; ++ithr) { + src_data_t *ts = &tr_src[ithr * min_tr_src_size_per_thr]; + for (int i = 0; i < j.tr_src_num_guard_elems; ++i) + ts[i] = 0; + } + } + + if 
(j.nthr_oc_b > 1) { + const int tr_src_bctx_size = j.nthr / j.nthr_oc_b; + auto tr_src_bctx = scratchpad.template get( + key_conv_tr_src_bctx); + for (int i = 0; i < tr_src_bctx_size; ++i) + simple_barrier::ctx_init(&tr_src_bctx[i]); + } + + if (utils::one_of(j.ver, ver_4vnni, ver_vnni) && j.nthr_ic_b > 1) { + const int tr_diff_dst_bctx_size = j.nthr / j.nthr_ic_b; + auto tr_diff_dst_bctx = + scratchpad.template get( + key_conv_tr_diff_dst_bctx); + for (int i = 0; i < tr_diff_dst_bctx_size; ++i) + simple_barrier::ctx_init(&tr_diff_dst_bctx[i]); + } + } + + if (nthr_mb_ > 1) { + simple_barrier::ctx_init(scratchpad.template get( + key_conv_wei_bia_reduction_bctx)); + } + + const auto reducer_bia_scratchpad = memory_tracking::grantor_t(scratchpad, + prefix_reducer_bia); + auto rb = this->reducer_bias_; + rb->init(reducer_bia_scratchpad); +} + +template +void jit_avx512_common_convolution_bwd_weights_t::execute_backward_weights() const { + prepare_scratchpad_data(); + parallel(nthr_, [&](const int ithr, const int nthr) { assert(nthr_ == nthr); thread_info_t thread_info(this, ithr); - if (utils::one_of(conf_.ndims(), 3, 4)) { + if (utils::one_of(pd()->ndims(), 3, 4)) { compute_diff_weights(&thread_info); if (nthr_mb_ > 1) reduce_diff_weights(&thread_info); - if (conf_.with_bias()) compute_diff_bias(&thread_info); - } else if (conf_.ndims() == 5) { + if (pd()->with_bias()) compute_diff_bias(&thread_info); + } else if (pd()->ndims() == 5) { compute_diff_weights_3d(&thread_info); if (nthr_mb_ > 1) reduce_diff_weights_3d(&thread_info); - if (conf_.with_bias()) compute_diff_bias_3d(&thread_info); + if (pd()->with_bias()) compute_diff_bias_3d(&thread_info); } else { assert(false); } }); /* TODO: put that into compute_diff_bias() */ - if (conf_.want_padded_bias()) { + if (pd()->wants_padded_bias()) { + auto diff_bias = scratchpad().template get( + key_conv_padded_bias); auto diff_bias_in = reinterpret_cast(this->memory(1)); - for (int oc = 0; oc < conf_.jcp_.oc_without_padding; ++oc) - diff_bias_in[oc] = this->padded_bias_[oc]; - } -} - -template -void jit_avx512_common_convolution_bwd_weights_t::balance() { - const int max_threads = mkldnn_get_max_threads(); - const auto &j = conf_.jcp_; - - nthr_ = nthr_mb_ = nthr_g_ = nthr_oc_b_ = nthr_ic_b_ = 1; - - if (max_threads < j.ngroups) { - /* simplification... fortunately it doesn't hurt much */ - return; - } - - if (!mkldnn_thr_syncable() - && utils::one_of(j.ver, ver_4fma, ver_4vnni, ver_vnni)) { - // should not happen -- the driver is not ready - // for TBB-like non-synchronous threading yet - return; + for (int oc = 0; oc < pd()->jcp_.oc_without_padding; ++oc) + diff_bias_in[oc] = diff_bias[oc]; } - - if (j.ver == ver_4fma && j.is_1stconv) { - nthr_g_ = 1; - nthr_oc_b_ = 1; - nthr_ic_b_ = nstl::min(j.nb_ic, max_threads); - nthr_mb_ = nstl::min(max_threads / nthr_ic_b_, j.mb); - nthr_ = nthr_mb_ * nthr_oc_b_ * nthr_ic_b_ * nthr_g_; - return; - } - - nthr_g_ = j.ngroups; - const int nthr = max_threads / nthr_g_; - - auto calc_mem_cost = [=](int nthr_mb, int nthr_oc_b, int nthr_ic_b) { - /* calculate per thread memory cost (read/write). high level optimizer - * tries to minimize memory consumption. few notes: - * (n1) unclear why, but that essentially helps first convolution... 
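// [Example] Per the comment above, the tr_src guard elements are zeroed
// because the transform path can read a fixed tail past each thread's
// region, and stale bytes there would inject NaNs into valid lanes. The same
// defensive pattern in isolation (names and sizes are illustrative):
#include <cstring>
namespace guard_sketch {
void zero_guard_tail(float *buf, std::size_t payload_elems,
        std::size_t guard_elems) {
    std::memset(buf + payload_elems, 0, guard_elems * sizeof(float));
}
} // namespace guard_sketch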
- * (n2) assuming the reduction over minibatch is always there: - * - instead of 8 it should be 5 here (write ~= 2 read): - * kernel: temporal workspace 1 write - * reduction: 1 read from workspace and 1 write to the diff_wei - * - but experiments showed 8 works better than 5 or 6... */ - - const int src_coef = j.ver == ver_4fma || j.ver == ver_vnni ? 4 : 1; - const int dst_coef = 1; - const int wei_coef = j.ver == ver_vnni ? 4 : 8; - - return 0 - + src_coef - * div_up(j.mb, nthr_mb) * div_up(j.ngroups, nthr_g_) - * div_up(j.nb_ic, nthr_ic_b) * j.ic_block * j.ih * j.iw * j.id - / j.stride_d / j.stride_h / j.stride_w /* (n1) */ - + dst_coef - * div_up(j.mb, nthr_mb) * div_up(j.ngroups, nthr_g_) - * div_up(j.nb_oc, nthr_oc_b) * j.oc_block * j.oh * j.ow * j.od - + wei_coef /* (n2) */ - * div_up(j.ngroups, nthr_g_) - * div_up(j.nb_oc, nthr_oc_b) * div_up(j.nb_ic, nthr_ic_b) - * j.kh * j.kw * j.kd * j.ic_block * j.oc_block; - }; - - int best_mem_cost = calc_mem_cost(nthr_mb_, nthr_oc_b_, nthr_ic_b_); - - /* step 1: find the best thread distribution with lowest memory cost */ - const int nthr_mb_max = nstl::min(nthr, j.mb * j.od); - for (int nthr_mb = 1; nthr_mb <= nthr_mb_max; ++nthr_mb) { - const int nthr_par = nthr / nthr_mb; - const int nthr_oc_b_max = nstl::min(nthr_par, j.nb_oc); - for (int nthr_oc_b = 1; nthr_oc_b <= nthr_oc_b_max; ++nthr_oc_b) { - int nthr_ic_b = nstl::min(nthr_par / nthr_oc_b, j.nb_ic); - - int mem_cost = calc_mem_cost(nthr_mb, nthr_oc_b, nthr_ic_b); - if (mem_cost <= best_mem_cost) { - best_mem_cost = mem_cost; - nthr_mb_ = nthr_mb; - nthr_oc_b_ = nthr_oc_b; - nthr_ic_b_ = nthr_ic_b; - } - } - - if (!mkldnn_thr_syncable()) { assert(nthr_mb == 1); break; } - } - - if (j.ver != ver_vnni && !mayiuse(avx512_mic)) { - auto calc_comp_cost = [=](int nthr_mb, int nthr_oc_b, int nthr_ic_b) { - return 1 - * div_up(j.mb, nthr_mb) - * div_up(j.ngroups, nthr_g_) - * div_up(j.nb_oc, nthr_oc_b) - * div_up(j.nb_ic, nthr_ic_b); - }; - - /* step 2: search for a thread distribution with lower compute cost. 
- * the constrains: - * - memory cost cannot exceed 110% of the best found in the step 1 - * - unless compute cost is 133% lower than the current best case - * note: both constants were found empirically */ - int best_comp_cost = calc_comp_cost(nthr_mb_, nthr_oc_b_, nthr_ic_b_); - for (int nthr_mb = 1; nthr_mb <= nthr_mb_max; ++nthr_mb) { - const int nthr_par = nthr / nthr_mb; - const int nthr_oc_b_max = nstl::min(nthr_par, j.nb_oc); - for (int nthr_oc_b = 1; nthr_oc_b <= nthr_oc_b_max; ++nthr_oc_b) { - int nthr_ic_b = nstl::min(nthr_par / nthr_oc_b, j.nb_ic); - int mem_cost = calc_mem_cost(nthr_mb, nthr_oc_b, nthr_ic_b); - int comp_cost = calc_comp_cost(nthr_mb, nthr_oc_b, nthr_ic_b); - - const bool opt1 = comp_cost <= best_comp_cost - && mem_cost < 1.1 * best_mem_cost; - const bool opt2 = 4 * comp_cost <= 3 * best_comp_cost; - - if (opt1 || opt2) { - best_comp_cost = comp_cost; - nthr_mb_ = nthr_mb; - nthr_oc_b_ = nthr_oc_b; - nthr_ic_b_ = nthr_ic_b; - } - } - - if (!mkldnn_thr_syncable()) { assert(nthr_mb == 1); break; } - } - } - - if (nthr_mb_ > max_threads/2 && nthr_mb_ < max_threads) - nthr_mb_ = min(j.mb * j.od, max_threads); - nthr_ = nthr_mb_ * nthr_g_ * nthr_oc_b_ * nthr_ic_b_; - assert(nthr_ <= max_threads); - assert(IMPLICATION(!mkldnn_thr_syncable(), nthr_mb_ == 1)); } template struct jit_avx512_common_convolution_bwd_weights_t; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_convolution.hpp index 42080cc..e500218 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_convolution.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_convolution.hpp @@ -18,124 +18,116 @@ #define CPU_JIT_AVX512_COMMON_CONVOLUTION_HPP #include "c_types_map.hpp" +#include "memory_tracking.hpp" +#include "mkldnn_thread.hpp" +#include "utils.hpp" + +#include "cpu_barrier.hpp" #include "cpu_convolution_pd.hpp" -#include "cpu_engine.hpp" -#include "jit_avx512_common_conv_kernel.hpp" -#include "jit_transpose_src_utils.hpp" #include "cpu_reducer.hpp" -#include "cpu_barrier.hpp" + +#include "jit_transpose_src_utils.hpp" +#include "jit_avx512_common_conv_kernel.hpp" namespace mkldnn { namespace impl { namespace cpu { -template -struct _jit_avx512_common_convolution_fwd_t : public cpu_primitive_t { - struct pd_t : public _cpu_convolution_fwd_pd_t { - pd_t(engine_t *engine, const typename pd_t::base_desc_t *adesc, +struct jit_avx512_common_convolution_fwd_t : public cpu_primitive_t { + struct pd_t : public cpu_convolution_fwd_pd_t { + pd_t(engine_t *engine, const convolution_desc_t *adesc, const primitive_attr_t *attr, const typename pd_t::base_class *hint_fwd_pd) - : _cpu_convolution_fwd_pd_t(engine, adesc, attr, - hint_fwd_pd) + : cpu_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) , jcp_() { } DECLARE_COMMON_PD_T( JIT_IMPL_NAME_HELPER("jit:", avx512_common, ""), - _jit_avx512_common_convolution_fwd_t); + jit_avx512_common_convolution_fwd_t); virtual status_t init() override { using namespace prop_kind; assert(this->engine()->kind() == engine_kind::cpu); bool ok = true - && utils::one_of(this->cdesc_().prop_kind, forward_training, + && utils::one_of(this->desc()->prop_kind, forward_training, forward_inference) - && this->cdesc_().alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() - && this->cdesc_().src_desc.data_type == src_type 
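// [Example] The retired balance() above charges each candidate
// (nthr_mb, nthr_oc_b, nthr_ic_b) with the bytes one thread touches; div_up
// rounds every per-thread share up, so uneven splits are costed in full.
// Conventional helper with its contract checked at compile time:
namespace cost_sketch {
constexpr int div_up(int a, int b) { return (a + b - 1) / b; }
static_assert(div_up(10, 4) == 3, "ceil(10 / 4)");
static_assert(div_up(12, 4) == 3, "an exact split is not over-charged");
} // namespace cost_sketch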
- && this->cdesc_().weights_desc.data_type == wei_type - && this->cdesc_().dst_desc.data_type == dst_type + && this->desc()->src_desc.data_type == src_type + && this->desc()->weights_desc.data_type == wei_type + && this->desc()->dst_desc.data_type == dst_type && IMPLICATION(this->with_bias(), dst_type - == this->cdesc_().bias_desc.data_type) - && !(with_relu && this->negative_slope()!= 0. - && dst_type == data_type::s32 - && src_type == data_type::s16 - && wei_type == data_type::s16); + == this->desc()->bias_desc.data_type); if (!ok) return status::unimplemented; - return jit_avx512_common_conv_fwd_kernel::init_conf( - jcp_, this->cdesc_(), this->src_pd_, this->weights_pd_, + status_t status = jit_avx512_common_conv_fwd_kernel::init_conf( + jcp_, *this->desc(), this->src_pd_, this->weights_pd_, this->dst_pd_,this->bias_pd_, *this->attr(), - mkldnn_get_max_threads(), with_relu, this->negative_slope()); - } + mkldnn_get_max_threads()); + if (status != status::success) return status; - inline int ndims() { return this->cdesc_().src_desc.ndims; } + auto scratchpad = scratchpad_registry().registrar(); + jit_avx512_common_conv_fwd_kernel::init_scratchpad(scratchpad, + jcp_); + + if (status == status::success + && this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); + return status; + } jit_conv_conf_t jcp_; }; - _jit_avx512_common_convolution_fwd_t(const pd_t *pd, + jit_avx512_common_convolution_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - , padded_bias_(nullptr) + : cpu_primitive_t(apd, inputs, outputs) { - kernel_ = new jit_avx512_common_conv_fwd_kernel(conf_.jcp_, - *conf_.attr()); - - if (conf_.want_padded_bias()) { - const auto &j = conf_.jcp_; - assert(j.ngroups == 1); - padded_bias_ = (dst_data_t *)malloc(sizeof(dst_data_t) * j.oc, 64); - for (int oc = j.oc_without_padding; oc < j.oc; ++oc) - padded_bias_[oc] = 0; - } + kernel_ = new jit_avx512_common_conv_fwd_kernel(pd()->jcp_, + *pd()->attr()); } - ~_jit_avx512_common_convolution_fwd_t() { - delete kernel_; - free(padded_bias_); - }; + ~jit_avx512_common_convolution_fwd_t() { delete kernel_; } typedef typename prec_traits::type src_data_t; typedef typename prec_traits::type wei_data_t; typedef typename prec_traits::type dst_data_t; - virtual void execute(event_t *e) + virtual void execute(event_t *e) const { - if (conf_.ndims() == 3) + if (pd()->ndims() == 3) execute_forward_1d(); - else if (conf_.ndims() == 4) + else if (pd()->ndims() == 4) execute_forward_2d(); - else if (conf_.ndims() == 5) + else if (pd()->ndims() == 5) execute_forward_3d(); else assert(false); + + if (pd()->wants_zero_pad_dst()) + output_memory_primitive(0)->zero_pad(); + e->set_state(event_t::ready); } private: - void execute_forward_1d(); - void execute_forward_2d(); - void execute_forward_3d(); - pd_t conf_; + void prepare_padded_bias(const dst_data_t *&bias) const; + void execute_forward_1d() const; + void execute_forward_2d() const; + void execute_forward_3d() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + jit_avx512_common_conv_fwd_kernel *kernel_; - dst_data_t *padded_bias_; }; -template -using jit_avx512_common_convolution_fwd_t = - _jit_avx512_common_convolution_fwd_t; - -template -using jit_avx512_common_convolution_relu_t = - _jit_avx512_common_convolution_fwd_t; - template @@ -159,19 +151,27 @@ struct jit_avx512_common_convolution_bwd_data_t: public cpu_primitive_t { 
bool ok = true && this->set_default_params() == status::success && utils::one_of(this->desc()->prop_kind, backward_data) // XXX (this->!) + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() - && this->desc()->alg_kind == alg_kind::convolution_direct && this->desc()->diff_dst_desc.data_type == diff_dst_type && this->desc()->weights_desc.data_type == wei_type && this->desc()->diff_src_desc.data_type == diff_src_type; if (!ok) return status::unimplemented; - return jit_avx512_common_conv_bwd_data_kernel_f32::init_conf( - jcp_,*this->desc(), *this->diff_src_pd_.desc(), - *this->weights_pd_.desc(), *this->diff_dst_pd_.desc()); - } + status_t status = + jit_avx512_common_conv_bwd_data_kernel_f32::init_conf(jcp_, + *this->desc(), *this->diff_src_pd_.desc(), + *this->weights_pd_.desc(), *this->diff_dst_pd_.desc()); + if (status != status::success) return status; - inline int ndims() { return this->desc()->diff_src_desc.ndims; } + auto scratchpad = scratchpad_registry().registrar(); + jit_avx512_common_conv_bwd_data_kernel_f32::init_scratchpad( + scratchpad, jcp_); + + return status::success; + } inline memory_format_t src_format() { @@ -206,30 +206,30 @@ struct jit_avx512_common_convolution_bwd_data_t: public cpu_primitive_t { CHECK(this->diff_dst_pd_.set_format(src_format())); if (this->weights_pd_.desc()->format == any) CHECK(this->weights_pd_.set_format(wei_format())); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } }; - jit_avx512_common_convolution_bwd_data_t(const pd_t *pd, + jit_avx512_common_convolution_bwd_data_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - { - kernel_ = new jit_avx512_common_conv_bwd_data_kernel_f32(conf_.jcp_); - } + : cpu_primitive_t(apd, inputs, outputs) + { kernel_ = new jit_avx512_common_conv_bwd_data_kernel_f32(pd()->jcp_); } ~jit_avx512_common_convolution_bwd_data_t() { delete kernel_; }; typedef typename prec_traits::type diff_dst_data_t; typedef typename prec_traits::type wei_data_t; typedef typename prec_traits::type diff_src_data_t; - virtual void execute(event_t *e) { - switch (conf_.desc()->prop_kind) { + virtual void execute(event_t *e) const { + switch (pd()->desc()->prop_kind) { case prop_kind::backward_data: - if (conf_.ndims() == 3) + if (pd()->ndims() == 3) execute_backward_data_1d(); - else if (conf_.ndims() == 4) + else if (pd()->ndims() == 4) execute_backward_data_2d(); - else if (conf_.ndims() == 5) + else if (pd()->ndims() == 5) execute_backward_data_3d(); else assert(false); @@ -241,10 +241,11 @@ struct jit_avx512_common_convolution_bwd_data_t: public cpu_primitive_t { } private: - void execute_backward_data_1d(); - void execute_backward_data_2d(); - void execute_backward_data_3d(); - pd_t conf_; + void execute_backward_data_1d() const; + void execute_backward_data_2d() const; + void execute_backward_data_3d() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + jit_avx512_common_conv_bwd_data_kernel_f32 *kernel_; }; @@ -267,7 +268,9 @@ struct jit_avx512_common_convolution_bwd_weights_t: public cpu_primitive_t { assert(this->engine()->kind() == engine_kind::cpu); bool ok = true && this->desc()->prop_kind == prop_kind::backward_weights - && this->desc()->alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, + 
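// [Example] The init() paths above thread status through CHECK(...), which in
// mkl-dnn convention early-returns any non-success status; that is why
// set_alg_kind(convolution_direct) can safely sit at the tail of init().
// A self-contained rendition of that flow (the real macro lives in utils.hpp):
namespace check_sketch {
enum status_t { success = 0, unimplemented = 1 };
#define CHECK_SKETCH(f) \
    do { status_t s_ = (f); if (s_ != success) return s_; } while (0)
inline status_t init(status_t configure, status_t set_alg) {
    CHECK_SKETCH(configure); // bail out before touching alg_kind on failure
    CHECK_SKETCH(set_alg);
    return success;
}
#undef CHECK_SKETCH
} // namespace check_sketch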
alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() && this->desc()->src_desc.data_type == src_type && this->desc()->diff_dst_desc.data_type == diff_dst_type @@ -275,12 +278,27 @@ struct jit_avx512_common_convolution_bwd_weights_t: public cpu_primitive_t { == diff_weights_type; if (!ok) return status::unimplemented; - return jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf( - jcp_, *this->desc(), this->src_pd_, this->diff_weights_pd_, - this->diff_bias_pd_, this->diff_dst_pd_); - } + status_t status = + jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf(jcp_, + *this->desc(), this->src_pd_, this->diff_weights_pd_, + this->diff_bias_pd_, this->diff_dst_pd_); + if (status != status::success) return status; + + init_balancers(); - inline int ndims() { return this->desc()->src_desc.ndims; } + auto scratchpad = scratchpad_registry().registrar(); + jit_avx512_common_conv_bwd_weights_kernel_f32::init_scratchpad( + scratchpad, jcp_); + + auto reducer_bia_scratchpad = memory_tracking::registrar_t( + scratchpad, memory_tracking::names::prefix_reducer_bia); + reducer_bia_conf_.init_scratchpad(reducer_bia_scratchpad); + + if (status == status::success && + this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); + return status; + } inline memory_format_t src_format() { @@ -297,29 +315,37 @@ struct jit_avx512_common_convolution_bwd_weights_t: public cpu_primitive_t { OIdhw16o16i); } - jit_conv_conf_t jcp_; + typename cpu_reducer_t::conf_t reducer_bia_conf_; - protected: - virtual status_t set_default_params() override { - using namespace memory_format; + protected: + virtual status_t set_default_params() override { + using namespace memory_format; - if (this->src_pd_.desc()->format == any) - CHECK(this->src_pd_.set_format(src_format())); - if (this->diff_weights_pd_.desc()->format == any) - CHECK(this->diff_weights_pd_.set_format(wei_format())); - if (this->diff_dst_pd_.desc()->format == any) - CHECK(this->diff_dst_pd_.set_format(src_format())); + if (this->src_pd_.desc()->format == any) + CHECK(this->src_pd_.set_format(src_format())); + if (this->diff_weights_pd_.desc()->format == any) + CHECK(this->diff_weights_pd_.set_format(wei_format())); + if (this->diff_dst_pd_.desc()->format == any) + CHECK(this->diff_dst_pd_.set_format(src_format())); - return status::success; - } + return status::success; + } + private: + void init_balancers() { + const size_t max_buffer_size = jcp_.nthr * 3 * 5 * 5 * 16 * 16; + if (with_bias()) { + reducer_bia_conf_.init(reduce_balancer_t(jcp_.nthr, + jcp_.oc_block, jcp_.ngroups * jcp_.nb_oc, jcp_.mb, + max_buffer_size)); + } + } }; - jit_avx512_common_convolution_bwd_weights_t(const pd_t *pd, + jit_avx512_common_convolution_bwd_weights_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs); ~jit_avx512_common_convolution_bwd_weights_t() { - delete kernel_; if (trans_kernel_) delete trans_kernel_; @@ -328,53 +354,37 @@ struct jit_avx512_common_convolution_bwd_weights_t: public cpu_primitive_t { if (acc_ker_) delete acc_ker_; delete reducer_bias_; - free(padded_bias_); - - free(tr_src_); - free(ws_reduction_); - - free(tr_src_bctx_); - free(tr_diff_dst_bctx_); - - free(tr_diff_dst_); } typedef typename prec_traits::type src_data_t; typedef typename prec_traits::type diff_dst_data_t; typedef typename prec_traits::type diff_weights_data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_backward_weights(); 
e->set_state(event_t::ready); } private: - void execute_backward_weights(); - void balance(); - + void execute_backward_weights() const; + void prepare_scratchpad_data() const; struct thread_info_t; - void compute_diff_weights(const thread_info_t *); - void compute_diff_weights_3d(const thread_info_t *); - void reduce_diff_weights(const thread_info_t *); - void reduce_diff_weights_3d(const thread_info_t *); - void compute_diff_bias(const thread_info_t *); - void compute_diff_bias_3d(const thread_info_t *); + void compute_diff_weights(const thread_info_t *) const; + void compute_diff_weights_3d(const thread_info_t *) const; + void reduce_diff_weights(const thread_info_t *) const; + void reduce_diff_weights_3d(const thread_info_t *) const; + void compute_diff_bias(const thread_info_t *) const; + void compute_diff_bias_3d(const thread_info_t *) const; - pd_t conf_; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + + int nthr_, nthr_mb_, nthr_g_, nthr_oc_b_, nthr_ic_b_; jit_avx512_common_conv_bwd_weights_kernel_f32 *kernel_; jit_trans_src_t *trans_kernel_; jit_trans_dst_t *trans_dst_kernel_; cpu_accumulator_1d_t *acc_ker_; cpu_reducer_t *reducer_bias_; - diff_weights_data_t *padded_bias_; - - src_data_t *tr_src_; - diff_dst_data_t *tr_diff_dst_; - diff_weights_data_t *ws_reduction_; - - int nthr_, nthr_mb_, nthr_g_, nthr_oc_b_, nthr_ic_b_; - simple_barrier::ctx_t *tr_src_bctx_, *tr_diff_dst_bctx_, reduction_bctx_; }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_convolution_winograd.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_convolution_winograd.cpp index 93db55e..eb45ba9 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_convolution_winograd.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_convolution_winograd.cpp @@ -37,6 +37,8 @@ namespace mkldnn { namespace impl { namespace cpu { +using namespace memory_tracking::names; + namespace { unsigned int LLC_cache_size = get_cache_size(3, false); @@ -511,80 +513,6 @@ void input_transform_data(int image, const jit_conv_winograd_conf_t &jcp, } template -void input_transform_tileblock_data(int tile_block, - const jit_conv_winograd_conf_t &jcp, - float *inp, float *tinp) -{ - const int inph = is_fwd ? jcp.ih : jcp.oh; - const int inpw = is_fwd ? jcp.iw : jcp.ow; - const int t_pad = is_fwd ? jcp.t_pad : jcp.ih + jcp.t_pad - jcp.oh; - const int l_pad = is_fwd ? 
jcp.l_pad : jcp.iw + jcp.r_pad - jcp.ow; - const int wp_max = inpw + l_pad; - const int hp_max = inph + t_pad; - float Iw[alpha][alpha][simd_w]; - float I[alpha][alpha][simd_w]; - - array_offset_calculator input(inp, - jcp.mb, jcp.dimK/simd_w, inph, inpw, simd_w); - array_offset_calculator output(tinp, - alpha, alpha, - jcp.dimN_block, jcp.dimK_nb_block, jcp.dimK_block, - jcp.dimN_reg_block, jcp.dimK_reg_block); - - int tile_index = tile_block * jcp.nb_tile_block_ur * jcp.tile_block_ur; - - for (int nb_tile_block_ur = 0; - nb_tile_block_ur < jcp.nb_tile_block_ur; - nb_tile_block_ur++) { - for (int tile_block_ur = 0; tile_block_ur < jcp.tile_block_ur; - tile_block_ur++) { - - int img = tile_index / (jcp.jtiles * jcp.itiles); - int ti = tile_index % jcp.itiles; - int tj = (tile_index / jcp.itiles) % jcp.jtiles; - float *pinp_b = &(input(img, 0, 0, 0, 0)); - - for (int j = 0; j < alpha; j++) { - int ydim = tj * tile_size + j; - if ((t_pad <= ydim) && (ydim < hp_max)) { - float *pinp_j = pinp_b + (ydim - t_pad) * inpw * simd_w; - for (int i = 0; i < alpha; i++) { - int xdim = ti * tile_size + i; - if ((l_pad <= xdim) && (xdim < wp_max)) { - float *pinp_i = pinp_j + (xdim - l_pad) * simd_w; - load_ps(I[j][i], pinp_i); - } else { - PRAGMA_OMP_SIMD() - for (int v = 0; v < simd_w; v++) { - I[j][i][v] = 0.0f; - } - } - } - } else { - for (int i = 0; i < alpha; i++) { - PRAGMA_OMP_SIMD() - for (int v = 0; v < simd_w; v++) { - I[j][i][v] = 0.0f; - } - } - } - } - - trans_I_4x4_3x3(Iw, I); - for (int j = 0; j < alpha; j++) { - for (int i = 0; i < alpha; i++) { - store_output(&(output(j, i, - nb_tile_block_ur, 0, 0, - tile_block_ur, 0)), - Iw[j][i], false); - } - } - tile_index++; - } - } -} - -template void weight_transform_data(const jit_conv_winograd_conf_t &jcp, float *wp, float *twp) { @@ -691,7 +619,7 @@ void output_transform_data(int image, const jit_conv_winograd_conf_t &jcp, O[j][i][v] = true && with_relu_presum && O[j][i][v] < 0.f ? O[j][i][v] - * jcp.eltwise_alpha + * jcp.eltwise.alpha : O[j][i][v]; } } @@ -717,83 +645,6 @@ void output_transform_data(int image, const jit_conv_winograd_conf_t &jcp, } } -template -void output_transform_tileblock_data(int tile_block, - const jit_conv_winograd_conf_t &jcp, const post_ops_t &p_ops, - float *toutp, float *outp, float *bias, bool streamout) { - float Ow[alpha][alpha][simd_w]; - float O[tile_size][tile_size][simd_w]; - int outw = is_fwd ? jcp.ow : jcp.iw; - int outh = is_fwd ? 
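// [Example] Scalar equivalent of the with_relu_presum branch above: a leaky
// ReLU with slope jcp.eltwise.alpha, applied before the optional sum post-op:
namespace eltwise_sketch {
inline float leaky_relu(float x, float alpha) {
    return x < 0.f ? x * alpha : x; // scale negatives, pass positives through
}
} // namespace eltwise_sketch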
jcp.oh : jcp.ih; - - /* Prepare for PostOps */ - bool with_relu_postsum = p_ops.find(primitive_kind::eltwise, 1) != -1; - - array_offset_calculator input(toutp, - alpha, alpha, - jcp.dimN_block, jcp.dimM_block, - jcp.dimN_reg_block, jcp.dimM_simd_block); - array_offset_calculator output(outp, - jcp.mb, jcp.dimM/jcp.dimM_simd_block, outh, outw, - jcp.dimM_simd_block); - - int tile_index = tile_block * jcp.nb_tile_block_ur * jcp.tile_block_ur; - - for (int nb_tile_block_ur = 0; - nb_tile_block_ur < jcp.nb_tile_block_ur; - nb_tile_block_ur++) { - - for (int tile_block_ur = 0; tile_block_ur < jcp.tile_block_ur; - tile_block_ur++) { - int img = tile_index / (jcp.jtiles * jcp.itiles); - int ti = tile_index % jcp.itiles; - int tj = (tile_index / jcp.itiles) % jcp.jtiles; - - for (int j = 0; j < alpha; j++) { - for (int i = 0; i < alpha; i++) { - float *pinp_tile = &(input(j, i, nb_tile_block_ur, 0, - tile_block_ur, 0)); - load_ps(Ow[j][i], pinp_tile); - } - } - - trans_O_4x4_3x3(Ow, O); - - float *pout_b = &(output(img, 0, 0, 0, 0)); - for (int j = 0; j < tile_size; j++) { - int ydim = tj * tile_size + j; - if (ydim < outh) { - float *pout_j = pout_b + ydim * outw * simd_w; - for (int i = 0; i < tile_size; i++) { - int xdim = ti * tile_size + i; - if (xdim < outw) { - float *pout_i = pout_j + xdim * simd_w; - if (is_fwd) { - PRAGMA_OMP_SIMD() - for (int v = 0; v < simd_w; v++) { - O[j][i][v] += with_bias ? bias[v] : 0.f; - O[j][i][v] = true - && with_relu_presum && O[j][i][v] < 0.f - ? O[j][i][v] - * jcp.eltwise_alpha - : O[j][i][v]; - - } - } - if (with_sum) - accum_output(pout_i, O[j][i], streamout, - with_relu_postsum); - else - store_output(pout_i, O[j][i], streamout); - } - } - } - } - tile_index++; - } - } -} - template void diff_src_transform_bwd_weights(int image, jit_conv_winograd_conf_t conv, float *inp, float *tinp, float *Iw_temp, @@ -1049,7 +900,8 @@ void diff_weights_transform_bwd_weights(jit_conv_winograd_conf_t conv, template void _jit_avx512_common_convolution_winograd_t::_execute_data_W_S_G_D( - const int MB, float *inp_ptr, float *out_ptr, float *wei_ptr, float *bias_ptr) { + const int MB, float *inp_ptr, float *out_ptr, float *wei_ptr, float *bias_ptr, + const memory_tracking::grantor_t &scratchpad) const{ const auto &jcp = kernel_->jcp; const auto &p_ops = attr_->post_ops_; @@ -1058,7 +910,7 @@ void _jit_avx512_common_convolution_winograd_t::_execute_data_W_S_G_D( const int outh = is_fwd ? jcp.oh : jcp.ih; const int outw = is_fwd ? jcp.ow : jcp.iw; - /* Note that jcp.with_relu is true for both fused conv+relu primitive + /* Note that jcp.with_eltwise is true for both fused conv+relu primitive * and conv primitive with PostOps with relu before sum * (PostOps relu after sum is handled later) */ auto output_transform = jcp.with_bias @@ -1094,24 +946,23 @@ void _jit_avx512_common_convolution_winograd_t::_execute_data_W_S_G_D( array_offset_calculator bias(bias_ptr, jcp.dimM/jcp.dimM_simd_block, jcp.dimM_simd_block); - array_offset_calculator M( - (float *)((is_fwd - ? (this->scratchpad_)->M_ptr() - : (this->scratchpad_)->V_ptr())), + array_offset_calculator M(is_fwd + ? 
scratchpad.template get(key_wino_M) + : scratchpad.template get(key_wino_V), jcp.dimN_nb_block, jcp.dimM_nb_block, alpha, alpha, jcp.dimN_block, jcp.dimM_block, jcp.dimN_reg_block, jcp.dimM_simd_block); - array_offset_calculator U((float *)((this->scratchpad_)->U_ptr()), + array_offset_calculator U( + scratchpad.template get(key_wino_U), jcp.dimM_nb_block, alpha, alpha, jcp.dimK_nb_block, jcp.dimM_block, jcp.dimK_block, jcp.dimK_reg_block, jcp.dimM_simd_block); - array_offset_calculator V( - (float *)((is_fwd - ? (this->scratchpad_)->V_ptr() - : (this->scratchpad_)->M_ptr())), + array_offset_calculator V(is_fwd + ? scratchpad.template get(key_wino_V) + : scratchpad.template get(key_wino_M), jcp.dimN_nb_block, alpha, alpha, jcp.dimN_block, jcp.dimK_nb_block, jcp.dimK_block, jcp.dimN_reg_block, jcp.dimK_reg_block); @@ -1121,15 +972,15 @@ void _jit_avx512_common_convolution_winograd_t::_execute_data_W_S_G_D( const bool output_is_aligned = ((size_t)out_ptr & (64 - 1)) == 0; - const bool want_padded_bias = jcp.with_bias + const bool wants_padded_bias = jcp.with_bias && jcp.oc_without_padding != jcp.oc; float last_slice_bias[simd_w] = {0}; - if (want_padded_bias) { + if (wants_padded_bias) { for (int oc = 0; oc < jcp.oc_without_padding % jcp.oc_simd_block; ++oc) last_slice_bias[oc] = bias(jcp.dimM / jcp.dimM_simd_block - 1, oc); } -#pragma omp parallel +PRAGMA_OMP(parallel) { parallel_nd_in_omp(MB, jcp.dimK_nb_block, jcp.dimK_block, [&](int img, int K_blk1, int K_blk2) { @@ -1148,7 +999,7 @@ void _jit_avx512_common_convolution_winograd_t::_execute_data_W_S_G_D( ifm1 * jcp.ic_block + ifm2, 0, 0, 0, 0)), U_base_ptr); }); -#pragma omp barrier +PRAGMA_OMP(barrier) parallel_nd_in_omp(jcp.dimN_nb_block, alpha, alpha, jcp.dimM_nb_block, jcp.dimN_block, [&](int N_blk1, int oj, int oi, int M_blk1, int N_blk2) { @@ -1174,14 +1025,14 @@ void _jit_avx512_common_convolution_winograd_t::_execute_data_W_S_G_D( }); -#pragma omp barrier +PRAGMA_OMP(barrier) parallel_nd_in_omp(MB, jcp.dimM_nb_block, jcp.dimM_block, [&](int img, int M_blk1, int M_blk2) { const int M_blk = M_blk1 * jcp.dimM_block + M_blk2; - float *bias_ptr = want_padded_bias + float *bias_ptr = wants_padded_bias && M_blk == jcp.dimM / jcp.dimM_simd_block - 1 ? last_slice_bias : &bias(M_blk, 0); @@ -1194,180 +1045,25 @@ void _jit_avx512_common_convolution_winograd_t::_execute_data_W_S_G_D( } } -template void -_jit_avx512_common_convolution_winograd_t::_execute_data_W_S_G_D( - const int, float *, float *, float *, float *); -template void -_jit_avx512_common_convolution_winograd_t::_execute_data_W_S_G_D( - const int, float *, float *, float *, float *); - -template -void _jit_avx512_common_convolution_winograd_t::_execute_data_W_SGD( - const int MB, float *inp_ptr, float *out_ptr, float *wei_ptr, float *bias_ptr) { - const auto &jcp = kernel_->jcp; - const auto &p_ops = attr_->post_ops_; - - const int inph = is_fwd ? jcp.ih : jcp.oh; - const int inpw = is_fwd ? jcp.iw : jcp.ow; - const int outh = is_fwd ? jcp.oh : jcp.ih; - const int outw = is_fwd ? jcp.ow : jcp.iw; - - /* Note that jcp.with_relu is true for both fused conv+relu primitive - * and conv primitive with PostOps with relu before sum - * (PostOps relu after sum is handled later) */ - auto output_transform_tileblock = jcp.with_bias - ? (jcp.with_eltwise - ? (jcp.with_sum - ? output_transform_tileblock_data - : output_transform_tileblock_data) - : (jcp.with_sum - ? output_transform_tileblock_data - : output_transform_tileblock_data)) - : (jcp.with_eltwise - ? (jcp.with_sum - ? 
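The grantor passed into the now-const execute path can be pictured as a read-only view over one pre-sized allocation, where each registered key resolves to a fixed offset. A minimal sketch of that lookup pattern, with hypothetical types that only model (and do not reproduce) the mkl-dnn memory_tracking implementation:

    #include <cstddef>
    #include <map>
    #include <utility>

    enum key_t { key_wino_U, key_wino_V, key_wino_M };

    struct grantor_t {
        // key -> {byte offset, byte size}, booked before execution starts
        std::map<key_t, std::pair<std::size_t, std::size_t>> offsets_;
        char *base_;                 // single pre-sized allocation

        template <typename T>
        T *get(key_t key) const {    // const: execution only reads the bookings
            auto it = offsets_.find(key);
            return it == offsets_.end()
                    ? nullptr
                    : reinterpret_cast<T *>(base_ + it->second.first);
        }
    };

The point of the indirection is that the primitive no longer owns mutable scratch pointers; everything it needs at run time was sized when the primitive descriptor was created.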
 
-template void
-_jit_avx512_common_convolution_winograd_t<true>::_execute_data_W_S_G_D(
-        const int, float *, float *, float *, float *);
-template void
-_jit_avx512_common_convolution_winograd_t<false>::_execute_data_W_S_G_D(
-        const int, float *, float *, float *, float *);
-
-template <bool is_fwd>
-void _jit_avx512_common_convolution_winograd_t<is_fwd>::_execute_data_W_SGD(
-        const int MB, float *inp_ptr, float *out_ptr, float *wei_ptr, float *bias_ptr) {
-    const auto &jcp = kernel_->jcp;
-    const auto &p_ops = attr_->post_ops_;
-
-    const int inph = is_fwd ? jcp.ih : jcp.oh;
-    const int inpw = is_fwd ? jcp.iw : jcp.ow;
-    const int outh = is_fwd ? jcp.oh : jcp.ih;
-    const int outw = is_fwd ? jcp.ow : jcp.iw;
-
-    /* Note that jcp.with_relu is true for both fused conv+relu primitive
-     * and conv primitive with PostOps with relu before sum
-     * (PostOps relu after sum is handled later) */
-    auto output_transform_tileblock = jcp.with_bias
-        ? (jcp.with_eltwise
-            ? (jcp.with_sum
-                ? output_transform_tileblock_data<is_fwd, true, true, true>
-                : output_transform_tileblock_data<is_fwd, true, true, false>)
-            : (jcp.with_sum
-                ? output_transform_tileblock_data<is_fwd, true, false, true>
-                : output_transform_tileblock_data<is_fwd, true, false, false>))
-        : (jcp.with_eltwise
-            ? (jcp.with_sum
-                ? output_transform_tileblock_data<is_fwd, false, true, true>
-                : output_transform_tileblock_data<is_fwd, false, true, false>)
-            : (jcp.with_sum
-                ? output_transform_tileblock_data<is_fwd, false, false, true>
-                : output_transform_tileblock_data<is_fwd, false, false, false>));
-
-    array_offset_calculator<float, 5> input(inp_ptr,
-            MB, jcp.dimK/jcp.dimK_reg_block, inph, inpw, jcp.dimK_reg_block);
-    array_offset_calculator<float, 5> output(out_ptr,
-            MB, jcp.dimM/jcp.dimM_simd_block, outh, outw, jcp.dimM_simd_block);
-    array_offset_calculator<float, 6> weights(wei_ptr,
-            jcp.oc/jcp.oc_simd_block, jcp.ic/jcp.ic_simd_block, jcp.kh, jcp.kw,
-            jcp.ic_simd_block, jcp.oc_simd_block);
-    array_offset_calculator<float, 2> bias(bias_ptr,
-            jcp.oc/jcp.oc_simd_block, jcp.oc_simd_block);
-
-    array_offset_calculator<float, 8> U((float *)((this->scratchpad_)->U_ptr()),
-            jcp.dimM_nb_block,
-            alpha, alpha,
-            jcp.dimK_nb_block,
-            jcp.dimM_block, jcp.dimK_block,
-            jcp.dimK_reg_block, jcp.dimM_simd_block);
-
-    array_offset_calculator<float, 8> M(
-            (float *)((is_fwd
-                    ? (this->scratchpad_)->M_ptr()
-                    : (this->scratchpad_)->V_ptr())),
-            0, jcp.dimM_nb_block, alpha, alpha,
-            jcp.dimN_block, jcp.dimM_block,
-            jcp.dimN_reg_block, jcp.dimM_simd_block);
-
-    array_offset_calculator<float, 8> V(
-            (float *)((is_fwd
-                    ? (this->scratchpad_)->V_ptr()
-                    : (this->scratchpad_)->M_ptr())),
-            0, alpha, alpha, jcp.dimN_block,
-            jcp.dimK_nb_block, jcp.dimK_block,
-            jcp.dimN_reg_block, jcp.dimK_reg_block);
-
-    const bool output_is_aligned = ((size_t)out_ptr & (64 - 1)) == 0;
-
-    const bool want_padded_bias = jcp.with_bias
-        && jcp.oc_without_padding != jcp.oc;
-    float last_slice_bias[simd_w] = {0};
-    if (want_padded_bias) {
-        for (int oc = 0; oc < jcp.oc_without_padding % jcp.oc_simd_block; ++oc)
-            last_slice_bias[oc] = bias(jcp.dimM / jcp.dimM_simd_block - 1, oc);
-    }
-
-#pragma omp parallel
-    {
-        parallel_nd_in_omp(jcp.nb_oc, jcp.nb_ic, jcp.oc_block, jcp.ic_block,
-            [&](int ofm1, int ifm1, int ofm2, int ifm2) {
-
-                float *U_base_ptr = is_fwd
-                    ? &(U(ofm1, 0, 0, ifm1, ofm2, ifm2, 0, 0))
-                    : &(U(ifm1, 0, 0, ofm1, ifm2, ofm2, 0, 0));
-                weight_transform_data<is_fwd>(jcp,
-                        &(weights(ofm1 * jcp.oc_block + ofm2,
-                            ifm1 * jcp.ic_block + ifm2,
-                            0, 0, 0, 0)),
-                        U_base_ptr);
-        });
-
-#pragma omp barrier
-
-        int ithr = mkldnn_get_thread_num();
-
-#pragma omp for schedule(static)
-        for (int tile_block = 0; tile_block < jcp.tile_block; tile_block++) {
-            for (int K_blk1 = 0; K_blk1 < jcp.dimK_nb_block; K_blk1++) {
-                for (int K_blk2 = 0; K_blk2 < jcp.dimK_block; K_blk2++) {
-                    input_transform_tileblock_data<is_fwd>(
-                            tile_block, jcp,
-                            &(input(0, K_blk1 * jcp.dimK_block + K_blk2, 0, 0, 0)),
-                            &(V(ithr, 0, 0, 0, K_blk1, K_blk2, 0, 0)));
-                }
-            }
-
-            for (int oj = 0; oj < alpha; oj++) {
-                for (int oi = 0; oi < alpha; oi++) {
-                    for (int M_blk1 = 0; M_blk1 < jcp.dimM_nb_block; M_blk1++) {
-                        for (int N_blk = 0; N_blk < jcp.dimN_block; N_blk++) {
-                            kernel_->gemm_loop_ker_first_iter(
-                                    (float *)&(M(ithr, M_blk1, oj, oi,
-                                            N_blk, 0, 0, 0)),
-                                    (const float *)&(U(M_blk1, oj, oi, 0,
-                                            0, 0, 0, 0)),
-                                    (const float *)&(V(ithr, oj, oi,
-                                            N_blk, 0, 0, 0, 0)));
-                            for (int K_blk1 = 1; K_blk1 < jcp.dimK_nb_block; K_blk1++) {
-                                kernel_->gemm_loop_ker(
-                                        (float *)&(M(ithr, M_blk1, oj, oi,
-                                                N_blk, 0, 0, 0)),
-                                        (const float *)&(U(M_blk1, oj, oi, K_blk1,
-                                                0, 0, 0, 0)),
-                                        (const float *)&(V(ithr, oj, oi,
-                                                N_blk, K_blk1, 0, 0, 0)));
-                            }
-                        }
-                    }
-                }
-            }
-
-            for (int M_blk1 = 0; M_blk1 < jcp.dimM_nb_block; M_blk1++) {
-                for (int M_blk2 = 0; M_blk2 < jcp.dimM_block; M_blk2++) {
-                    const int M_blk = M_blk1 * jcp.dimM_block + M_blk2;
-
-                    float *bias_ptr = want_padded_bias
-                        && M_blk == jcp.dimM / jcp.dimM_simd_block - 1
-                        ? last_slice_bias : &bias(M_blk, 0);
-
-                    output_transform_tileblock(tile_block, jcp, p_ops,
-                            &(M(ithr, M_blk1, 0, 0, 0, M_blk2, 0, 0)),
-                            &(output(0, M_blk, 0, 0, 0)),
-                            bias_ptr, output_is_aligned);
-                }
-            }
-        }
-    }
-}
-
-template void
-_jit_avx512_common_convolution_winograd_t<true>::_execute_data_W_SGD(
-        const int, float *, float *, float *, float *);
-template void
-_jit_avx512_common_convolution_winograd_t<false>::_execute_data_W_SGD(
-        const int, float *, float *, float *, float *);
+template struct _jit_avx512_common_convolution_winograd_t<true>;
+template struct _jit_avx512_common_convolution_winograd_t<false>;
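The hunk above also swaps per-method explicit instantiations for whole-class `template struct` instantiations, which emit every member of both specializations at once. A self-contained sketch of the two styles, using a hypothetical conv_t rather than the real class:

    template <bool is_fwd>
    struct conv_t {
        void execute() const { /* direction-specific body elided */ }
    };

    // Old style: one explicit instantiation per member function.
    // template void conv_t<true>::execute() const;
    // template void conv_t<false>::execute() const;

    // New style: instantiate the whole class; all members are emitted.
    template struct conv_t<true>;
    template struct conv_t<false>;

The whole-class form is less fragile: adding a member later cannot silently leave one specialization without an emitted definition.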
 
 void jit_avx512_common_convolution_winograd_bwd_weights_t::
-_maybe_execute_diff_bias_copy() {
-    if (conf_.want_padded_bias()) {
+_maybe_execute_diff_bias_copy(
+        const memory_tracking::grantor_t &scratchpad) const {
+    if (pd()->wants_padded_bias()) {
+        auto padded_bias = scratchpad.get<float>(key_conv_padded_bias);
         float *diff_bias = (float *)this->memory(1);
-        for (int oc = 0; oc < conf_.jcp_.oc_without_padding; ++oc)
-            diff_bias[oc] = this->padded_bias_[oc];
+        for (int oc = 0; oc < pd()->jcp_.oc_without_padding; ++oc)
+            diff_bias[oc] = padded_bias[oc];
     }
 }
 
 void jit_avx512_common_convolution_winograd_bwd_weights_t::
-_execute_backward_weights_S_D_G_W()
-{
+_execute_backward_weights_S_D_G_W(
+        const memory_tracking::grantor_t &scratchpad) const {
     const auto &jcp = kernel_->jcp;
-    const int nthreads = scratchpad_->num_threads();
+    const int nthreads = jcp.nthr;
 
     auto diff_src_transform_bwd_weights_ver = jcp.ver == ver_4fma ?
             diff_src_transform_bwd_weights<true> :
             diff_src_transform_bwd_weights<false>;
     auto diff_dst_transform_bwd_weights_ver = jcp.with_bias
         ? diff_dst_transform_bwd_weights<true>
         : diff_dst_transform_bwd_weights<false>;
@@ -1382,25 +1078,25 @@ _execute_backward_weights_S_D_G_W()
             jcp.mb, jcp.oc/simd_w, jcp.oh, jcp.ow, simd_w);
     array_offset_calculator<float, 6> diff_weights((float *)this->memory(0),
             jcp.oc/simd_w, jcp.ic/simd_w, jcp.kh, jcp.kw, simd_w, simd_w);
-    array_offset_calculator<float, 2> diff_bias(
-            conf_.want_padded_bias() ? padded_bias_ : (float *)this->memory(1),
-            jcp.oc/simd_w, simd_w);
+    array_offset_calculator<float, 2> diff_bias(pd()->wants_padded_bias()
+            ? scratchpad.get<float>(key_conv_padded_bias)
+            : (float *)this->memory(1), jcp.oc/simd_w, simd_w);
 
     array_offset_calculator<float, 8> U(
-            (float *)(scratchpad_->U_ptr()),
+            scratchpad.get<float>(key_wino_U),
             jcp.nb_ic, jcp.nb_oc,
             alpha, alpha,
             jcp.oc_block, jcp.ic_block,
             jcp.ic_simd_block, jcp.oc_simd_block);
 
     array_offset_calculator<float, 8> M(
-            (float *)(scratchpad_->M_ptr()),
+            scratchpad.get<float>(key_wino_M),
             jcp.nb_oc, alpha, alpha,
             jcp.tile_block, jcp.oc_block,
             jcp.nb_tile_block_ur, jcp.tile_block_ur * jcp.tile_4fma,
             jcp.oc_simd_block);
 
     array_offset_calculator<float, 8> V(
-            (float *)(scratchpad_->V_ptr()),
+            scratchpad.get<float>(key_wino_V),
             jcp.nb_ic, alpha, alpha,
             jcp.tile_block, jcp.ic_block,
             jcp.nb_tile_block_ur, jcp.tile_block_ur,
@@ -1409,23 +1105,23 @@ _execute_backward_weights_S_D_G_W()
     const int trans_buffer_size = alpha * alpha * jcp.tile_4fma
             * jcp.ic_simd_block;
     array_offset_calculator<float, 2> trans_buffer(
-            (float *)(scratchpad_->src_transpose_ptr()),
+            scratchpad.get<float>(key_conv_tr_src),
             nthreads,
             trans_buffer_size);
 
     array_offset_calculator<float, 2> diff_bias_prv(
-            (float *)(scratchpad_->bias_ptr()),
-            mkldnn_get_max_threads(),
+            scratchpad.get<float>(key_conv_bia_reduction),
+            nthreads,
             jcp.oc);
 
-#pragma omp parallel num_threads(nthreads)
+PRAGMA_OMP(parallel num_threads(nthreads))
     {
         if (jcp.with_bias) {
             parallel_nd_in_omp(nthreads, jcp.oc, [&](int ithr, int ofm) {
                 diff_bias_prv(ithr, ofm) = 0.0f;
             });
 
-#pragma omp for nowait
+PRAGMA_OMP(for nowait)
             for (int bofm = 0; bofm < jcp.oc / simd_w; bofm++) {
                 PRAGMA_OMP_SIMD()
                 for (int v = 0; v < simd_w; v++)
@@ -1461,7 +1157,7 @@ _execute_backward_weights_S_D_G_W()
                         dbias);
             });
 
-#pragma omp barrier
+PRAGMA_OMP(barrier)
 
         for (int ifm1 = 0; ifm1 < jcp.nb_ic; ifm1++) {
             parallel_nd_in_omp(alpha, alpha, jcp.nb_oc,
@@ -1486,7 +1182,7 @@ _execute_backward_weights_S_D_G_W()
             });
         }
 
-#pragma omp barrier
+PRAGMA_OMP(barrier)
 
         parallel_nd_in_omp(jcp.nb_ic, jcp.nb_oc, jcp.oc_block, jcp.ic_block,
             [&](int ifm1, int ofm1, int ofm2, int ifm2) {
@@ -1497,7 +1193,7 @@ _execute_backward_weights_S_D_G_W()
         });
 
         if (jcp.with_bias) {
-#pragma omp for
+PRAGMA_OMP(for)
             for (int ofm1 = 0; ofm1 < jcp.oc / simd_w; ofm1++) {
                 for (int ithr = 0; ithr < nthreads; ithr++) {
                     float* base_bias_ptr = &(diff_bias(ofm1, 0));
@@ -1512,806 +1208,9 @@ _execute_backward_weights_S_D_G_W()
        }
    }
 
-    _maybe_execute_diff_bias_copy();
+    _maybe_execute_diff_bias_copy(scratchpad);
 }
 
-namespace {
-
-const int max_threads_number = 1024;
-
-template <bool ver_4fma>
-void diff_src_transform_bwd_weights_tile(int tile_block,
-        jit_conv_winograd_conf_t conv, float *inp, float *tinp,
-        void(*transpose_4fma_ker)(float *, float *))
-{
-    const int ifwp = conv.iw + conv.l_pad;
-    const int ifhp = conv.ih + conv.t_pad;
-    float I[alpha][alpha][simd_w];
-    float Iw[alpha][alpha][simd_w];
-
-    float *Iw_buffer = nullptr;
-    if (ver_4fma) {
-        Iw_buffer = (float *)malloc(alpha * alpha * conv.tile_4fma
-                * simd_w * sizeof(float), 64);
-    }
-    array_offset_calculator<float, 4> Iw_scratchpad(Iw_buffer,
-            alpha, alpha, conv.tile_4fma, simd_w);
-    array_offset_calculator<float, 5> input(inp,
-            conv.mb, conv.ic / simd_w, conv.ih, conv.iw, simd_w);
-    array_offset_calculator<float, 7> output(tinp,
-            0, alpha, alpha,
-            conv.ic_block,
-            conv.nb_tile_block_ur, conv.tile_block_ur,
-            conv.ic_simd_block * conv.tile_4fma);
-
-    int tile_4fma = 0;
-
-    int n_tiles = tile_block * conv.nb_tile_block_ur * conv.tile_block_ur;
-    for (int nb_tile_block_ur = 0; nb_tile_block_ur < conv.nb_tile_block_ur;
-            nb_tile_block_ur++) {
-        for (int tile_block_ur = 0; tile_block_ur < conv.tile_block_ur;
-                tile_block_ur++) {
-
-            int img = n_tiles / (conv.jtiles * conv.itiles);
-            int no_tile = n_tiles % (conv.jtiles * conv.itiles);
-            int ti = no_tile % conv.itiles;
-            int tj = no_tile / conv.itiles;
-
-            for (int j = 0; j < alpha; j++) {
-                int ydim = tj * tile_size + j;
-                if ((conv.t_pad <= ydim) && ydim < ifhp) {
-                    for (int i = 0; i < alpha; i++) {
-                        int xdim = ti * tile_size + i;
-                        if ((conv.l_pad <= xdim) && xdim < ifwp) {
-                            PRAGMA_OMP_SIMD()
-                            for (int v = 0; v < simd_w; v++) {
-                                I[j][i][v] = input(img, 0,
-                                        ydim - conv.t_pad,
-                                        xdim - conv.l_pad, v);
-                            }
-                        }
-                        else {
-                            PRAGMA_OMP_SIMD()
-                            for (int v = 0; v < simd_w; v++) {
-                                I[j][i][v] = 0.0f;
-                            }
-                        }
-                    }
-                }
-                else {
-                    for (int i = 0; i < alpha; i++) {
-                        PRAGMA_OMP_SIMD()
-                        for (int v = 0; v < simd_w; v++) {
-                            I[j][i][v] = 0.0f;
-                        }
-                    }
-                }
-            }
-
-            trans_I_4x4_3x3(Iw, I);
-
-            if (ver_4fma) {
-                for (int j = 0; j < alpha; j++) {
-                    for (int i = 0; i < alpha; i++) {
-                        PRAGMA_OMP_SIMD()
-                        for (int v = 0; v < simd_w; v++) {
-                            Iw_scratchpad(j, i, tile_4fma, v) = Iw[j][i][v];
-                        }
-                    }
-                }
-                tile_4fma++;
-                if (tile_4fma == conv.tile_4fma) {
-                    float *outp = &(output(0, 0, 0, 0,
-                            nb_tile_block_ur, tile_block_ur, 0));
-                    transpose_4fma_ker(outp, (float *)Iw_buffer);
-                    tile_4fma = 0;
-                }
-            }
-            else {
-                for (int j = 0; j < alpha; j++) {
-                    for (int i = 0; i < alpha; i++) {
-                        store_output(
-                                &(output(0, j, i, 0,
-                                    nb_tile_block_ur, tile_block_ur, 0)),
-                                Iw[j][i], false);
-
-                    }
-                }
-            }
-            n_tiles++;
-        }
-    }
-}
-
-template <bool with_bias>
-void diff_dst_transform_bwd_weights_tile(int tile_block,
-        jit_conv_winograd_conf_t conv, float *inp, float *tinp, float *dbias)
-{
-    float I[alpha][alpha][simd_w];
-    float Iw[alpha][alpha][simd_w];
-
-    array_offset_calculator<float, 5> input(inp,
-            conv.mb, conv.oc / simd_w, conv.oh, conv.ow, conv.oc_simd_block);
-    array_offset_calculator<float, 7> output(tinp,
-            conv.nb_oc, alpha, alpha,
-            conv.oc_block,
-            conv.nb_tile_block_ur,
-            conv.tile_block_ur * conv.tile_4fma, conv.oc_simd_block);
-
-    int n_tiles = tile_block * conv.nb_tile_block_ur * conv.tile_block_ur;
-    for (int nb_tile_block_ur = 0; nb_tile_block_ur < conv.nb_tile_block_ur;
-            nb_tile_block_ur++) {
-        for (int tile_block_ur = 0; tile_block_ur < conv.tile_block_ur;
-                tile_block_ur++) {
-
-            int img = n_tiles / (conv.jtiles * conv.itiles);
-            int no_tile = n_tiles % (conv.jtiles * conv.itiles);
-            int ti = no_tile % conv.itiles;
-            int tj = no_tile / conv.itiles;
-
-            for (int j = 0; j < alpha; j++) {
-                int ydim = tj * tile_size + j;
-                if (ydim < conv.oh) {
-                    for (int i = 0; i < alpha; i++) {
-                        int xdim = ti * tile_size + i;
-                        if (xdim < conv.ow) {
-                            float *input_base = &input(img, 0, ydim, xdim, 0);
-
-                            PRAGMA_OMP_SIMD()
-                            for (int v = 0; v < simd_w; v++) {
-                                I[j][i][v] = input_base[v];
-                            }
-                            if (with_bias && j < tile_size && i < tile_size) {
-                                PRAGMA_OMP_SIMD()
-                                for (int v = 0; v < simd_w; v++) {
-                                    dbias[v] += input_base[v];
-                                }
-                            }
-                        }
-                        else {
-                            PRAGMA_OMP_SIMD()
-                            for (int v = 0; v < simd_w; v++) {
-                                I[j][i][v] = 0.0f;
-                            }
-                        }
-                    }
-                }
-                else {
-                    for (int i = 0; i < alpha; i++) {
-                        PRAGMA_OMP_SIMD()
-                        for (int v = 0; v < simd_w; v++) {
-                            I[j][i][v] = 0.0f;
-                        }
-                    }
-                }
-            }
-
-            trans_W_3x3_4x4_wu(Iw, I);
-
-            for (int j = 0; j < alpha; j++) {
-                for (int i = 0; i < alpha; i++) {
-                    /*TODO: Try instrinsic for casting into __m512*/
-                    store_output(&(output(0, j, i, 0,
-                            nb_tile_block_ur, tile_block_ur, 0)),
-                            Iw[j][i], false);
-                }
-            }
-            n_tiles++;
-        }
-    }
-}
-
-// Sum to the first buffer array
-void array_sum(int num_arrs, float *output,
-        size_t nelems, float *input_ptrs[], bool reduce_to_first = true)
-{
-    const size_t block_size = 16 * 1024 / sizeof(float);
-    const size_t blocks_number = nelems / block_size;
-    const size_t tail = nelems % block_size;
-
-#pragma omp parallel
-    {
-        const int ithr = mkldnn_get_thread_num();
-        const int nthr = mkldnn_get_num_threads();
-        size_t start{ 0 }, end{ 0 };
-        balance211(blocks_number, nthr, ithr, start, end);
-
-        for (size_t nb = start; nb < end; ++nb) {
-            size_t start_e = nb * block_size;
-            size_t end_e = start_e + block_size;
-            if (!reduce_to_first) {
-                PRAGMA_OMP_SIMD()
-                for (size_t e = start_e; e < end_e; e++) {
-                    output[e] = input_ptrs[0][e];
-                }
-            }
-            for (int a = 1; a < num_arrs; a++) {
-                PRAGMA_OMP_SIMD()
-                for (size_t e = start_e; e < end_e; e++) {
-                    output[e] += input_ptrs[a][e];
-                }
-            }
-        }
-
-        if (tail != 0 && ithr == nthr - 1) {
-            size_t start_e = nelems - tail;
-            size_t end_e = nelems;
-            if (!reduce_to_first) {
-                PRAGMA_OMP_SIMD()
-                for (size_t e = start_e; e < end_e; e++) {
-                    output[e] = input_ptrs[0][e];
-                }
-            }
-            for (int a = 1; a < num_arrs; a++) {
-                PRAGMA_OMP_SIMD()
-                for (size_t e = start_e; e < end_e; e++) {
-                    output[e] += input_ptrs[a][e];
-                }
-            }
-        }
-    }
-}
-
-void subarray_sum(int num_arrs, float *output, size_t nelems,
-        float *input_ptrs[], size_t input_starts[], size_t input_ends[])
-{
-    using namespace nstl;
-    const size_t block_size = 16 * 1024 / sizeof(float);
-    const size_t blocks_number = nelems / block_size;
-    const size_t tail = nelems % block_size;
-
-#pragma omp parallel
-    {
-        const int ithr = mkldnn_get_thread_num();
-        const int nthr = mkldnn_get_num_threads();
-        size_t start{ 0 }, end{ 0 };
-        balance211(blocks_number, nthr, ithr, start, end);
-
-        for (size_t nb = start; nb < end; ++nb) {
-            size_t start_e = nb * block_size;
-            size_t end_e = start_e + block_size;
-            size_t input_start = max(start_e, min(input_starts[0], end_e));
-            size_t input_end = max(start_e, min(input_ends[0], end_e));
-
-            PRAGMA_OMP_SIMD()
-            for (size_t e = start_e; e < input_start; e++) {
-                output[e] = 0.f;
-            }
-
-            PRAGMA_OMP_SIMD()
-            for (size_t e = input_start; e < input_end; e++) {
-                output[e] = input_ptrs[0][e];
-            }
-
-            PRAGMA_OMP_SIMD()
-            for (size_t e = input_end; e < end_e; e++) {
-                output[e] = 0.f;
-            }
-            for (int a = 1; a < num_arrs; a++) {
-                input_start = max(start_e, input_starts[a]);
-                input_end = min(input_ends[a], end_e);
-
-                PRAGMA_OMP_SIMD()
-                for (size_t e = input_start; e < input_end; e++) {
-                    output[e] += input_ptrs[a][e];
-                }
-            }
-        }
-
-        if (tail != 0 && ithr == nthr - 1) {
-            size_t start_e = nelems - tail;
-            size_t end_e = nelems;
-            size_t input_start = max(start_e, min(input_starts[0], end_e));
-            size_t input_end = max(start_e, min(input_ends[0], end_e));
-
-            PRAGMA_OMP_SIMD()
-            for (size_t e = start_e; e < input_start; e++) {
-                output[e] = 0.f;
-            }
-
-            PRAGMA_OMP_SIMD()
-            for (size_t e = input_start; e < input_end; e++) {
-                output[e] = input_ptrs[0][e];
-            }
-
-            PRAGMA_OMP_SIMD()
-            for (size_t e = input_end; e < end_e; e++) {
-                output[e] = 0.f;
-            }
-            for (int a = 1; a < num_arrs; a++) {
-                input_start = max(start_e, input_starts[a]);
-                input_end = min(input_ends[a], end_e);
-
-                PRAGMA_OMP_SIMD()
-                for (size_t e = start_e; e < end_e; e++) {
-                    output[e] += input_ptrs[a][e];
-                }
-            }
-        }
-    }
-}
-} // namespace
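The deleted array_sum/subarray_sum helpers implement a cache-blocked reduction: the output is walked in roughly 16 KB blocks so each block stays cache-resident while every input array is accumulated into it. A minimal single-threaded sketch of the same idea (an assumed simplification; the originals additionally split the blocks across OpenMP threads with balance211 and handle the tail block separately):

    #include <cstddef>

    // Reduces "to the first" array: out is also in[0]'s storage.
    static void blocked_sum(int num_arrs, float *out, std::size_t nelems,
            float *const *in) {
        const std::size_t block = 16 * 1024 / sizeof(float); // ~16 KB of floats
        for (std::size_t b = 0; b < nelems; b += block) {
            const std::size_t end = b + block < nelems ? b + block : nelems;
            for (int a = 1; a < num_arrs; a++)   // in[0] already lives in out
                for (std::size_t e = b; e < end; e++)
                    out[e] += in[a][e];
        }
    }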
-
-void jit_avx512_common_convolution_winograd_bwd_weights_t::
-_execute_backward_weights_S_D_Giot_W()
-{
-    const auto &jcp = kernel_->jcp;
-    const int nthreads = scratchpad_->num_threads();
-    int U_size = jcp.oc * jcp.ic * alpha * alpha * sizeof(float);
-
-    auto diff_src_transform_bwd_weights_ver = jcp.ver == ver_4fma ?
-            diff_src_transform_bwd_weights<true> :
-            diff_src_transform_bwd_weights<false>;
-    auto diff_dst_transform_bwd_weights_ver = jcp.with_bias
-        ? diff_dst_transform_bwd_weights<true>
-        : diff_dst_transform_bwd_weights<false>;
-
-    array_offset_calculator<float, 5> diff_src((float *)this->input_memory(0),
-            jcp.mb, jcp.ic / simd_w, jcp.ih, jcp.iw, simd_w);
-    array_offset_calculator<float, 5> diff_dst((float *)this->input_memory(1),
-            jcp.mb, jcp.oc / simd_w, jcp.oh, jcp.ow, simd_w);
-    array_offset_calculator<float, 6> diff_weights((float *)this->memory(0),
-            jcp.oc / simd_w, jcp.ic / simd_w, jcp.kh, jcp.kw, simd_w, simd_w);
-    array_offset_calculator<float, 2> diff_bias(
-            conf_.want_padded_bias() ? padded_bias_ : (float *)this->memory(1),
-            jcp.oc / simd_w, simd_w);
-
-    array_offset_calculator<float, 8> U((float *)(scratchpad_->U_ptr()),
-            jcp.nb_ic, jcp.nb_oc,
-            alpha, alpha,
-            jcp.oc_block, jcp.ic_block,
-            jcp.ic_simd_block, jcp.oc_simd_block);
-
-    array_offset_calculator<float, 9> Us(
-            (float *)(scratchpad_->U_ptr() + U_size),
-            0, jcp.nb_ic, jcp.nb_oc,
-            alpha, alpha,
-            jcp.oc_block, jcp.ic_block,
-            jcp.ic_simd_block, jcp.oc_simd_block);
-
-    array_offset_calculator<float, 8> M((float *)(scratchpad_->M_ptr()),
-            jcp.nb_oc, alpha, alpha,
-            jcp.tile_block, jcp.oc_block,
-            jcp.nb_tile_block_ur, jcp.tile_block_ur * jcp.tile_4fma,
-            jcp.oc_simd_block);
-
-    array_offset_calculator<float, 8> V((float *)(scratchpad_->V_ptr()),
-            jcp.nb_ic, alpha, alpha,
-            jcp.tile_block, jcp.ic_block,
-            jcp.nb_tile_block_ur, jcp.tile_block_ur,
-            jcp.ic_simd_block * jcp.tile_4fma);
-
-    const int trans_buffer_size = alpha * alpha * jcp.tile_4fma
-            * jcp.ic_simd_block;
-    array_offset_calculator<float, 2> trans_buffer(
-            (float *)(scratchpad_->src_transpose_ptr()),
-            nthreads,
-            trans_buffer_size);
-
-    array_offset_calculator<float, 2> diff_bias_prv(
-            (float *)(scratchpad_->bias_ptr()), nthreads, jcp.oc);
-
-#pragma omp parallel
-    {
-        if (jcp.with_bias) {
-            parallel_nd_in_omp(nthreads, jcp.oc, [&](int ithr, int ofm) {
-                diff_bias_prv(ithr, ofm) = 0.0f;
-            });
-#pragma omp for nowait
-            for (int bofm = 0; bofm < jcp.oc / simd_w; bofm++) {
-                PRAGMA_OMP_SIMD()
-                for (int v = 0; v < simd_w; v++)
-                    diff_bias(bofm, v) = 0.0f;
-            }
-        }
-    }
-
-#pragma omp parallel
-    {
-        const int ithread = mkldnn_get_thread_num();
-        parallel_nd_in_omp(jcp.mb, jcp.nb_ic, jcp.ic_block,
-            [&](int img, int ifm1, int ifm2) {
-                float *transb = jcp.ver == ver_4fma
-                    ? &(trans_buffer(ithread, 0))
-                    : NULL;
-                diff_src_transform_bwd_weights_ver(img, jcp,
-                        &(diff_src(img, ifm1 * jcp.ic_block + ifm2,
-                            0, 0, 0)),
-                        &(V(ifm1, 0, 0, 0, ifm2, 0, 0, 0)),
-                        transb,
-                        kernel_->transpose_4fma_ker);
-        });
-    }
-
-#pragma omp parallel num_threads(nthreads)
-    {
-        parallel_nd_in_omp(jcp.mb, jcp.nb_oc, jcp.oc_block,
-            [&](int img, int ofm1, int ofm2) {
-                const int ithread = mkldnn_get_thread_num();
-                float *dbias = jcp.with_bias
-                    ? &(diff_bias_prv(ithread,
-                                simd_w * (ofm1 * jcp.oc_block + ofm2)))
-                    : NULL;
-                diff_dst_transform_bwd_weights_ver(img, jcp,
-                        &(diff_dst(img, ofm1 * jcp.oc_block + ofm2, 0, 0, 0)),
-                        &(M(ofm1, 0, 0, 0, ofm2, 0, 0, 0)), dbias);
-        });
-    }
-
-    size_t input_starts[max_threads_number];
-    size_t input_ends[max_threads_number];
-    int th_counter = 0;
-#pragma omp parallel firstprivate(th_counter) num_threads(nthreads)
-    {
-        parallel_nd_in_omp(jcp.nb_ic, jcp.nb_oc, alpha, alpha, jcp.tile_block,
-            [&](int ifm1, int ofm1, int oj, int oi, int tile_block) {
-                int ithr = mkldnn_get_thread_num();
-                if (th_counter == 0) {
-                    input_starts[ithr] = (float *)&(Us(ithr, ifm1, ofm1,
-                                oj, oi, 0, 0, 0, 0)) - (float *)&(Us(ithr, 0, 0,
-                                0, 0, 0, 0, 0, 0));
-                    input_ends[ithr] = input_starts[ithr]
-                            + jcp.oc_block * jcp.ic_block
-                            * jcp.ic_simd_block * jcp.oc_simd_block;
-                }
-                else if (tile_block == 0) {
-                    input_ends[ithr] += jcp.oc_block * jcp.ic_block
-                            * jcp.ic_simd_block * jcp.oc_simd_block;
-                }
-
-                if (th_counter == 0 || tile_block == 0) {
-                    kernel_->gemm_loop_ker_first_iter(
-                            &(Us(ithr, ifm1, ofm1, oj, oi, 0, 0, 0, 0)),
-                            &(M(ofm1, oj, oi, tile_block, 0, 0, 0, 0)),
-                            &(V(ifm1, oj, oi, tile_block, 0, 0, 0, 0)));
-                } else {
-                    kernel_->gemm_loop_ker(
-                            &(Us(ithr, ifm1, ofm1, oj, oi, 0, 0, 0, 0)),
-                            &(M(ofm1, oj, oi, tile_block, 0, 0, 0, 0)),
-                            &(V(ifm1, oj, oi, tile_block, 0, 0, 0, 0)));
-                }
-                th_counter++;
-        });
-    }
-
-
-    // Reduce diff-weights
-    {
-        float *output = &(U(0, 0, 0, 0, 0, 0, 0, 0));
-        size_t nelems = jcp.ic * jcp.oc * alpha * alpha;
-        float *input_ptrs[max_threads_number];
-        for (int i = 0; i < nthreads; i++)
-            input_ptrs[i] = output + nelems * (i + 1);
-        subarray_sum(
-                nthreads, output, nelems, input_ptrs, input_starts, input_ends);
-    }
-
-    parallel_nd(jcp.nb_ic, jcp.nb_oc, jcp.oc_block, jcp.ic_block,
-        [&](int ifm1, int ofm1, int ofm2, int ifm2) {
-            diff_weights_transform_bwd_weights(jcp,
-                    &(diff_weights(ofm1 * jcp.oc_block + ofm2,
-                        ifm1 * jcp.ic_block + ifm2,
-                        0, 0, 0, 0)),
-                    &(U(ifm1, ofm1, 0, 0, ofm2, ifm2, 0, 0)));
-    });
-
-#pragma omp parallel
-    if (jcp.with_bias) {
-#pragma omp for
-        for (int ofm1 = 0; ofm1 < jcp.oc / simd_w; ofm1++) {
-            for (int ithr = 0; ithr < nthreads; ithr++) {
-                float* base_bias_ptr = &(diff_bias(ofm1, 0));
-                float* base_bias_prv_ptr = &(diff_bias_prv(
-                        ithr * jcp.oc + ofm1 * simd_w));
-                PRAGMA_OMP_SIMD()
-                for (int ofm2 = 0; ofm2 < simd_w; ofm2++) {
-                    base_bias_ptr[ofm2] += base_bias_prv_ptr[ofm2];
-                }
-            }
-        }
-    }
-
-    _maybe_execute_diff_bias_copy();
-}
-
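The diff-bias handling in these schedulers follows one recurring pattern: each thread accumulates into its private row of an (nthreads x oc) strip, and the strips are folded into the final tensor after the parallel region, keeping atomics out of the transform loops. A hedged sketch of just the fold step:

    #include <vector>

    // diff_bias_prv holds nthreads rows of oc partial sums each.
    static void reduce_bias(int nthreads, int oc,
            const std::vector<float> &diff_bias_prv, float *diff_bias) {
        for (int ithr = 0; ithr < nthreads; ++ithr)
            for (int o = 0; o < oc; ++o)
                diff_bias[o] += diff_bias_prv[ithr * oc + o];
    }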
-void jit_avx512_common_convolution_winograd_bwd_weights_t::
-_execute_backward_weights_SDGtWo()
-{
-    const auto &jcp = kernel_->jcp;
-    const int nthreads = scratchpad_->num_threads();
-
-    auto diff_src_transform_bwd_weights_ver_tile = jcp.ver == ver_4fma ?
-            diff_src_transform_bwd_weights_tile<true> :
-            diff_src_transform_bwd_weights_tile<false>;
-    auto diff_dst_transform_bwd_weights_ver = jcp.with_bias
-        ? diff_dst_transform_bwd_weights_tile<true>
-        : diff_dst_transform_bwd_weights_tile<false>;
-
-    array_offset_calculator<float, 5> diff_src((float *)this->input_memory(0),
-            jcp.mb, jcp.ic / simd_w, jcp.ih, jcp.iw, simd_w);
-    array_offset_calculator<float, 5> diff_dst((float *)this->input_memory(1),
-            jcp.mb, jcp.oc / simd_w, jcp.oh, jcp.ow, simd_w);
-    array_offset_calculator<float, 6> diff_weights((float *)this->memory(0),
-            jcp.oc / simd_w, jcp.ic / simd_w, jcp.kh, jcp.kw, simd_w, simd_w);
-    array_offset_calculator<float, 3> diff_bias(
-            conf_.want_padded_bias() ? padded_bias_ : (float *)this->memory(1),
-            jcp.nb_oc, jcp.oc_block, simd_w);
-
-    array_offset_calculator<float, 8> Us((float *)(scratchpad_->U_ptr()),
-            0, jcp.nb_ic, alpha, alpha,
-            jcp.oc_block, jcp.ic_block,
-            jcp.ic_simd_block, jcp.oc_simd_block);
-
-    array_offset_calculator<float, 7> M((float *)(scratchpad_->M_ptr()),
-            0, alpha, alpha,
-            jcp.oc_block,
-            jcp.nb_tile_block_ur, jcp.tile_block_ur * jcp.tile_4fma,
-            jcp.oc_simd_block);
-
-    array_offset_calculator<float, 8> V((float *)(scratchpad_->V_ptr()),
-            0, jcp.nb_ic, alpha, alpha,
-            jcp.ic_block,
-            jcp.nb_tile_block_ur, jcp.tile_block_ur,
-            jcp.ic_simd_block * jcp.tile_4fma);
-
-    array_offset_calculator<float, 2> diff_bias_prv(
-            (float *)(scratchpad_->bias_ptr()),
-            nthreads, jcp.oc / jcp.nb_oc);
-
-    for (int ofm1 = 0; ofm1 < jcp.nb_oc; ++ofm1) {
-        int th_counter = 0;
-
-#pragma omp parallel
-        {
-            if (jcp.with_bias) {
-                parallel_nd_in_omp(nthreads, jcp.oc / jcp.nb_oc,
-                    [&](int ithr, int ofm) {
-                        diff_bias_prv(ithr, ofm) = 0.0f;
-                });
-#pragma omp for nowait
-                for (int bofm = 0; bofm < jcp.oc_block; bofm++) {
-                    PRAGMA_OMP_SIMD()
-                    for (int v = 0; v < simd_w; v++)
-                        diff_bias(ofm1, bofm, v) = 0.0f;
-                }
-            }
-        }
-
-#pragma omp parallel firstprivate(th_counter) num_threads(nthreads)
-#pragma omp for nowait
-        for (int tile_block = 0; tile_block < jcp.tile_block; tile_block++) {
-            int ithr = mkldnn_get_thread_num();
-            for (int ifm1 = 0; ifm1 < jcp.nb_ic; ++ifm1) {
-                for (int ifm2 = 0; ifm2 < jcp.ic_block; ++ifm2) {
-                    diff_src_transform_bwd_weights_ver_tile(tile_block, jcp,
-                            &(diff_src(0, ifm1 * jcp.ic_block + ifm2, 0, 0, 0)),
-                            &(V(ithr, ifm1, 0, 0, ifm2, 0, 0, 0)),
-                            kernel_->transpose_4fma_ker);
-                }
-            }
-
-            for (int ofm2 = 0; ofm2 < jcp.oc_block; ofm2++) {
-                float *dbias = jcp.with_bias
-                    ? &(diff_bias_prv(ithr, simd_w * ofm2))
-                    : NULL;
-                diff_dst_transform_bwd_weights_ver(tile_block, jcp,
-                        &(diff_dst(0, ofm1 * jcp.oc_block + ofm2, 0, 0, 0)),
-                        &(M(ithr, 0, 0, ofm2, 0, 0, 0)),
-                        dbias);
-            }
-
-            for (int ifm1 = 0; ifm1 < jcp.nb_ic; ifm1++) {
-                for (int oj = 0; oj < alpha; oj++) {
-                    for (int oi = 0; oi < alpha; oi++) {
-                        if (th_counter == 0)
-                            kernel_->gemm_loop_ker_first_iter(
-                                    &(Us(ithr, ifm1, oj, oi, 0, 0, 0, 0)),
-                                    &(M(ithr, oj, oi, 0, 0, 0, 0)),
-                                    &(V(ithr, ifm1, oj, oi, 0, 0, 0, 0)));
-                        else
-                            kernel_->gemm_loop_ker(
-                                    &(Us(ithr, ifm1, oj, oi, 0, 0, 0, 0)),
-                                    &(M(ithr, oj, oi, 0, 0, 0, 0)),
-                                    &(V(ithr, ifm1, oj, oi, 0, 0, 0, 0)));
-                    }
-                }
-            }
-            th_counter++;
-        }
-
-        // Reduce diff-weights
-        {
-            float *output = (float *)(scratchpad_->U_ptr());
-            size_t nelems
-                = jcp.ic * (jcp.oc / jcp.nb_oc) * alpha * alpha;
-            float *input_ptrs[max_threads_number];
-            for (int i = 0; i < nthreads; i++) {
-                input_ptrs[i] = output + nelems * i;
-            }
-            array_sum(nthreads, output, nelems, input_ptrs);
-        }
-
-        parallel_nd(jcp.nb_ic, jcp.oc_block, jcp.ic_block,
-            [&](int ifm1, int ofm2, int ifm2) {
-                diff_weights_transform_bwd_weights(jcp,
-                        &(diff_weights(ofm1 * jcp.oc_block + ofm2,
-                            ifm1 * jcp.ic_block + ifm2,
-                            0, 0, 0, 0)),
-                        &(Us(0, ifm1, 0, 0, ofm2, ifm2, 0, 0)));
-        });
-
-#pragma omp parallel
-        if (jcp.with_bias) {
-#pragma omp for
-            for (int ofm2 = 0; ofm2 < jcp.oc_block; ofm2++) {
-                for (int ithr = 0; ithr < nthreads; ithr++) {
-                    float* base_bias_ptr = &(diff_bias(ofm1, ofm2, 0));
-                    float* base_bias_prv_ptr = &(diff_bias_prv(
-                            ithr * jcp.oc_block * simd_w + ofm2 * simd_w));
-                    PRAGMA_OMP_SIMD()
-                    for (int ofm3 = 0; ofm3 < simd_w; ofm3++) {
-                        base_bias_ptr[ofm3] += base_bias_prv_ptr[ofm3];
-                    }
-                }
-            }
-        }
-    }
-
-    _maybe_execute_diff_bias_copy();
-}
-
-void jit_avx512_common_convolution_winograd_bwd_weights_t::
-_execute_backward_weights_SDGt_W()
-{
-    const auto &jcp = kernel_->jcp;
-    const int nthreads = scratchpad_->num_threads();
-
-    auto diff_src_transform_bwd_weights_ver_tile = jcp.ver == ver_4fma ?
-            diff_src_transform_bwd_weights_tile<true> :
-            diff_src_transform_bwd_weights_tile<false>;
-    auto diff_dst_transform_bwd_weights_ver = jcp.with_bias
-        ? diff_dst_transform_bwd_weights_tile<true>
-        : diff_dst_transform_bwd_weights_tile<false>;
-
-    array_offset_calculator<float, 5> diff_src((float *)this->input_memory(0),
-            jcp.mb, jcp.ic / simd_w, jcp.ih, jcp.iw, simd_w);
-    array_offset_calculator<float, 5> diff_dst((float *)this->input_memory(1),
-            jcp.mb, jcp.oc / simd_w, jcp.oh, jcp.ow, simd_w);
-    array_offset_calculator<float, 6> diff_weights((float *)this->memory(0),
-            jcp.oc / simd_w, jcp.ic / simd_w, jcp.kh, jcp.kw, simd_w, simd_w);
-    array_offset_calculator<float, 2> diff_bias(
-            conf_.want_padded_bias() ? padded_bias_ : (float *)this->memory(1),
-            jcp.oc / simd_w, simd_w);
-
-    array_offset_calculator<float, 8> U((float *)(scratchpad_->U_ptr()),
-            jcp.nb_oc, jcp.nb_ic,
-            alpha, alpha,
-            jcp.oc_block, jcp.ic_block,
-            jcp.ic_simd_block, jcp.oc_simd_block);
-
-    array_offset_calculator<float, 9> Us((float *)(scratchpad_->U_ptr()),
-            0, jcp.nb_oc, jcp.nb_ic,
-            alpha, alpha,
-            jcp.oc_block, jcp.ic_block,
-            jcp.ic_simd_block, jcp.oc_simd_block);
-
-    array_offset_calculator<float, 8> M((float *)(scratchpad_->M_ptr()),
-            0, jcp.nb_oc, alpha, alpha, jcp.oc_block,
-            jcp.nb_tile_block_ur, jcp.tile_block_ur * jcp.tile_4fma,
-            jcp.oc_simd_block);
-
-    array_offset_calculator<float, 8> V((float *)(scratchpad_->V_ptr()),
-            0, jcp.nb_ic, alpha, alpha, jcp.ic_block,
-            jcp.nb_tile_block_ur, jcp.tile_block_ur,
-            jcp.ic_simd_block * jcp.tile_4fma);
-
-    array_offset_calculator<float, 2> diff_bias_prv(
-            (float *)(scratchpad_->bias_ptr()),
-            nthreads, jcp.oc);
-
-#pragma omp parallel
-    {
-        if (jcp.with_bias) {
-            parallel_nd_in_omp(nthreads, jcp.oc,
-                [&](int ithr, int ofm) {
-                    diff_bias_prv(ithr, ofm) = 0.0f;
-            });
-#pragma omp for nowait
-            for (int bofm = 0; bofm < jcp.oc / simd_w; bofm++) {
-                PRAGMA_OMP_SIMD()
-                for (int v = 0; v < simd_w; v++)
-                    diff_bias(bofm, v) = 0.0f;
-            }
-        }
-    }
-
-    int th_counter = 0;
-#pragma omp parallel firstprivate(th_counter) num_threads(nthreads)
-#pragma omp for nowait
-    for (int tile_block = 0; tile_block < jcp.tile_block; tile_block++) {
-        int ithr = mkldnn_get_thread_num();
-
-        for (int ifm1 = 0; ifm1 < jcp.nb_ic; ++ifm1) {
-            for (int ifm2 = 0; ifm2 < jcp.ic_block; ++ifm2) {
-                diff_src_transform_bwd_weights_ver_tile(tile_block, jcp,
-                        &(diff_src(0, ifm1 * jcp.ic_block + ifm2,
-                            0, 0, 0)),
-                        &(V(ithr, ifm1, 0, 0, ifm2, 0, 0, 0)),
-                        kernel_->transpose_4fma_ker);
-            }
-        }
-
-        for (int ofm1 = 0; ofm1 < jcp.nb_oc; ofm1++) {
-            for (int ofm2 = 0; ofm2 < jcp.oc_block; ofm2++) {
-                float *dbias = jcp.with_bias
-                    ? &(diff_bias_prv(ithr,
-                                simd_w * (ofm1 * jcp.oc_block + ofm2)))
-                    : NULL;
-                diff_dst_transform_bwd_weights_ver(tile_block, jcp,
-                        &(diff_dst(0, ofm1 * jcp.oc_block + ofm2,
-                            0, 0, 0)),
-                        &(M(ithr, ofm1, 0, 0, ofm2, 0, 0, 0)),
-                        dbias);
-            }
-        }
-
-        for (int ofm1 = 0; ofm1 < jcp.nb_oc; ofm1++) {
-            for (int oj = 0; oj < alpha; oj++) {
-                for (int oi = 0; oi < alpha; oi++) {
-                    for (int ifm1 = 0; ifm1 < jcp.nb_ic; ifm1++) {
-                        if (th_counter == 0)
-                            kernel_->gemm_loop_ker_first_iter(
-                                    &(Us(ithr, ofm1, ifm1, oj, oi, 0, 0, 0, 0)),
-                                    &(M(ithr, ofm1, oj, oi, 0, 0, 0, 0)),
-                                    &(V(ithr, ifm1, oj, oi, 0, 0, 0, 0)));
-                        else
-                            kernel_->gemm_loop_ker(
-                                    &(Us(ithr, ofm1, ifm1, oj, oi, 0, 0, 0, 0)),
-                                    &(M(ithr, ofm1, oj, oi, 0, 0, 0, 0)),
-                                    &(V(ithr, ifm1, oj, oi, 0, 0, 0, 0)));
-                    }
-                }
-            }
-        }
-        th_counter++;
-    }
-
-    // Reduce diff-weights
-    {
-        float *output = (float *)(scratchpad_->U_ptr());
-        size_t nelems = jcp.ic * jcp.oc * alpha * alpha;
-        float *input_ptrs[max_threads_number];
-        for (int i = 0; i < nthreads; i++) {
-            input_ptrs[i] = output + nelems * i;
-        }
-        array_sum(nthreads, output, nelems, input_ptrs);
-    }
-
-    parallel_nd(jcp.nb_oc, jcp.nb_ic, jcp.oc_block, jcp.ic_block,
-        [&](int ofm1, int ifm1, int ofm2, int ifm2) {
-            diff_weights_transform_bwd_weights(jcp,
-                    &(diff_weights(ofm1 * jcp.oc_block + ofm2,
-                        ifm1 * jcp.ic_block + ifm2, 0, 0, 0, 0)),
-                    &(U(ofm1, ifm1, 0, 0, ofm2, ifm2, 0, 0)));
-    });
-
-#pragma omp parallel
-    if (jcp.with_bias) {
-#pragma omp for
-        for (int ofm1 = 0; ofm1 < jcp.oc / simd_w; ofm1++) {
-            for (int ithr = 0; ithr < nthreads; ithr++) {
-                float* base_bias_ptr = &(diff_bias(ofm1, 0));
-                float* base_bias_prv_ptr = &(diff_bias_prv(
-                        ithr * jcp.oc + ofm1 * simd_w));
-                PRAGMA_OMP_SIMD()
-                for (int ofm2 = 0; ofm2 < simd_w; ofm2++) {
-                    base_bias_ptr[ofm2] += base_bias_prv_ptr[ofm2];
-                }
-            }
-        }
-    }
-
-    _maybe_execute_diff_bias_copy();
-}
 }
 }
 }
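The header diff below retires the hand-rolled winograd_scratchpad_t in favour of the registrar/grantor pair: sizes are booked per key when the primitive descriptor is created, and a single allocation is granted back at execution time. An illustrative model of the booking side only, with hypothetical types that stand in for the mkl-dnn registrar:

    #include <cstddef>
    #include <utility>
    #include <vector>

    struct registrar_t {
        std::vector<std::pair<int, std::size_t>> books_;  // (key, padded size)
        void book(int key, std::size_t bytes, std::size_t align = 64) {
            // pad each booking so every granted sub-buffer stays aligned
            books_.emplace_back(key, (bytes + align - 1) / align * align);
        }
        std::size_t total() const {  // size of the one allocation to grant
            std::size_t sz = 0;
            for (const auto &b : books_) sz += b.second;
            return sz;
        }
    };

Centralizing the bookings also removes the manual offset arithmetic (U_offset_, V_offset_, ...) that the deleted class had to keep consistent by hand.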
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_convolution_winograd.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_convolution_winograd.hpp
index fbdf9eb..6f6bb0f 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_convolution_winograd.hpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_convolution_winograd.hpp
@@ -18,9 +18,9 @@
 #define CPU_JIT_AVX512_COMMON_CONVOLUTION_WINOGRAD_HPP
 
 #include "c_types_map.hpp"
+#include "memory_tracking.hpp"
 #include "cpu_convolution_pd.hpp"
 #include "cpu_engine.hpp"
-#include "scratchpad.hpp"
 #include "mkldnn_thread.hpp"
 
 #include "jit_avx512_common_conv_winograd_kernel_f32.hpp"
@@ -29,152 +29,36 @@ namespace mkldnn {
 namespace impl {
 namespace cpu {
 
-namespace winograd {
+namespace winograd_avx512_common {
+inline void init_scratchpad(memory_tracking::registrar_t &scratchpad,
+        const jit_conv_winograd_conf_t &jcp) {
+    using namespace memory_tracking::names;
 
-struct winograd_scratchpad_t {
-    public:
-        winograd_scratchpad_t(const jit_conv_winograd_conf_t &jcp)
-        {
-            get_scratchpad_size_(jcp);
-            allocate_scratchpad_(jcp);
-        }
-
-        ~winograd_scratchpad_t() {
-            if (scratchpad_ != nullptr)
-                delete scratchpad_;
-        }
-
-        char *U_ptr() {
-            /* buffer for wei transform U*/
-            return scratchpad_->get() + U_offset_;
-        }
+    size_t U_sz = (size_t)alpha * alpha * jcp.ic * jcp.oc;
+    size_t V_sz = (size_t)alpha * alpha * jcp.mb * jcp.ic
+        * (jcp.itiles * jcp.jtiles + jcp.tile_4fma_padding);
+    size_t M_sz = (size_t)alpha * alpha * jcp.mb * jcp.oc
+        * (jcp.itiles * jcp.jtiles + jcp.tile_4fma_padding);
 
-        char *V_ptr() {
-            /* buffer for src transform V*/
-            return scratchpad_->get() + V_offset_;
-        }
-
-        char *M_ptr() {
-            /* buffer for dst transform M*/
-            return scratchpad_->get() + M_offset_;
-        }
+    scratchpad.book(key_wino_U, sizeof(float) * U_sz, PAGE_2M);
+    scratchpad.book(key_wino_V, sizeof(float) * V_sz, PAGE_2M);
+    scratchpad.book(key_wino_M, sizeof(float) * M_sz, PAGE_2M);
 
-        char *bias_ptr() {
-            /* buffer for bias update in bwdw*/
-            return scratchpad_->get() + bias_offset_;
-        }
+    if (jcp.sched_policy == WSCHED_WEI_S_D_G_W) {
+        const int nthr = mkldnn_get_max_threads();
 
-        char *src_transpose_ptr() {
-            /* buffer for src transpose in bwdw using qfma*/
-            return scratchpad_->get() + src_transpose_offset_;
-        }
+        size_t tr_src_sz = jcp.ver != ver_4fma ? 0 : (size_t)nthr
+            * alpha * alpha * jcp.tile_4fma * jcp.ic_simd_block;
+        scratchpad.book(key_conv_tr_src, sizeof(float) * tr_src_sz, PAGE_2M);
 
-        int num_threads(){
-            return nthreads_;
-        }
+        size_t br_sz = jcp.with_bias ? nthr * jcp.oc : 0;
+        scratchpad.book(key_conv_bia_reduction, sizeof(float) * br_sz, PAGE_2M);
 
-    private:
-        inline void get_scratchpad_size_(const jit_conv_winograd_conf_t &jcp) {
-            nthreads_ = mkldnn_get_max_threads();
-
-            U_sz_ = (size_t)alpha * alpha * jcp.ic * jcp.oc * sizeof(float);
-            V_sz_ = (size_t)alpha * alpha * jcp.mb * jcp.ic
-                        * (jcp.itiles * jcp.jtiles + jcp.tile_4fma_padding)
-                        * sizeof(float);
-            M_sz_ = (size_t)alpha * alpha * jcp.mb * jcp.oc
-                        * (jcp.itiles * jcp.jtiles + jcp.tile_4fma_padding)
-                        * sizeof(float);
-
-            switch (jcp.sched_policy) {
-                case WSCHED_DATA_W_SGD:
-                    V_sz_ = (size_t)nthreads_ * alpha * alpha
-                            * jcp.nb_tile_block_ur * jcp.tile_block_ur
-                            * jcp.ic * sizeof(float);
-                    M_sz_ = (size_t)nthreads_* alpha * alpha
-                            * jcp.nb_tile_block_ur * jcp.tile_block_ur
-                            * jcp.oc * sizeof(float);
-                    break;
-                case WSCHED_WEI_SDGt_W:
-                    U_sz_ = (size_t)nthreads_ * U_sz_;
-                    V_sz_ = (size_t)nthreads_ * alpha * alpha
-                            * (jcp.nb_tile_block_ur * jcp.tile_block_ur
-                                + jcp.tile_4fma_padding)
-                            * jcp.ic * sizeof(float);
-                    M_sz_ = (size_t)nthreads_ * alpha * alpha
-                            * (jcp.nb_tile_block_ur * jcp.tile_block_ur
-                                + jcp.tile_4fma_padding)
-                            * jcp.oc * sizeof(float);
-                    bias_sz_ = nthreads_ * jcp.oc * sizeof(float);
-                    break;
-                case WSCHED_WEI_SDGtWo:
-                    U_sz_ = (size_t)nthreads_ * alpha * alpha
-                            * jcp.oc_block * jcp.oc_simd_block * jcp.ic * sizeof(float);
-                    M_sz_ = (size_t)nthreads_ * alpha * alpha
-                            * (jcp.nb_tile_block_ur * jcp.tile_block_ur
-                                + jcp.tile_4fma_padding)
-                            * jcp.oc_simd_block * jcp.oc_block * sizeof(float);
-                    bias_sz_ = nthreads_ * jcp.oc * sizeof(float);
-                    break;
-                case WSCHED_WEI_S_D_Giot_W:
-                    U_sz_ = (size_t)(nthreads_ + 1) * alpha * alpha
-                            * jcp.ic * jcp.oc * sizeof(float);
-                    V_sz_ = (size_t)alpha * alpha
-                            * (jcp.itiles * jcp.jtiles + jcp.tile_4fma_padding)
-                            * jcp.ic * jcp.mb * sizeof(float);
-                    M_sz_ = (size_t)alpha * alpha
-                            * (jcp.itiles * jcp.jtiles + jcp.tile_4fma_padding)
-                            * jcp.oc * jcp.mb * sizeof(float);
-                    bias_sz_ = nthreads_ * jcp.oc * sizeof(float);
-                    src_transpose_sz_ = jcp.ver == ver_4fma
-                                      ? ((size_t)nthreads_ * alpha * alpha
-                                         * jcp.tile_4fma
-                                         * jcp.ic_simd_block * sizeof(float))
-                                      : 0;
-                    break;
-                case WSCHED_WEI_S_D_G_W:
-                    src_transpose_sz_ = jcp.ver == ver_4fma
-                                      ? ((size_t)nthreads_ * alpha * alpha
-                                         * jcp.tile_4fma
-                                         * jcp.ic_simd_block * sizeof(float))
-                                      : 0;
-                    bias_sz_ = jcp.with_bias ? nthreads_ * jcp.oc * sizeof(float) : 0;
-                    break;
-                default:
-                    break;
-            }
-        }
-
-        inline void allocate_scratchpad_(const jit_conv_winograd_conf_t &jcp) {
-            const size_t page_size = PAGE_2M;
-            U_offset_ = 0;
-            V_offset_ = utils::rnd_up(U_sz_, page_size);
-            M_offset_ = V_offset_ + utils::rnd_up(V_sz_, page_size);
-            scratchpad_sz_ = M_offset_ + M_sz_;
-            if (src_transpose_sz_) {
-                src_transpose_offset_ = M_offset_
-                    + utils::rnd_up(M_sz_, page_size);
-                scratchpad_sz_ = src_transpose_offset_ + src_transpose_sz_;
-            }
-            if (bias_sz_) {
-                bias_offset_ = src_transpose_sz_
-                    ? src_transpose_offset_
-                        + utils::rnd_up(src_transpose_sz_, page_size)
-                    : M_offset_ + utils::rnd_up(M_sz_, page_size);
-                scratchpad_sz_ = bias_offset_ + bias_sz_;
-            }
-            scratchpad_ = create_scratchpad(scratchpad_sz_);
-        }
-
-        scratchpad_t *scratchpad_;
-        int nthreads_;
-        size_t scratchpad_sz_ = 0, U_sz_ = 0, V_sz_ = 0, M_sz_ = 0,
-               bias_sz_ = 0, src_transpose_sz_ = 0;
-        size_t U_offset_ = 0;
-        size_t V_offset_ = 0;
-        size_t M_offset_ = 0;
-        size_t bias_offset_ = 0;
-        size_t src_transpose_offset_ = 0; // only relevant for bwdw using qfma
-};
+        size_t padded_bias_sz =
+            jcp.with_bias && jcp.oc_without_padding != jcp.oc ? jcp.oc : 0;
+        scratchpad.book(key_conv_padded_bias, sizeof(float) * padded_bias_sz);
+    }
+}
 }
 
 template <bool is_fwd>
@@ -182,67 +66,72 @@ struct _jit_avx512_common_convolution_winograd_t {
     _jit_avx512_common_convolution_winograd_t(
             const jit_conv_winograd_conf_t &jcp, const primitive_attr_t *attr)
-        : kernel_(nullptr), scratchpad_(nullptr), attr_(attr) {
+        : kernel_(nullptr), attr_(attr) {
         kernel_ = new _jit_avx512_common_conv_winograd_data_kernel_f32(jcp);
-        scratchpad_ = new winograd::winograd_scratchpad_t(jcp);
     }
 
-    ~_jit_avx512_common_convolution_winograd_t() {
-        delete kernel_;
-        delete scratchpad_;
-    };
+    ~_jit_avx512_common_convolution_winograd_t() { delete kernel_; }
 
 protected:
     void _execute_data_W_S_G_D(const int MB, float *inp_ptr, float *out_ptr,
-            float *wei_ptr, float *bias_ptr = NULL);
-    void _execute_data_W_SGD(const int MB, float *inp_ptr, float *out_ptr,
-            float *wei_ptr, float *bias_ptr = NULL);
+            float *wei_ptr, float *bias_ptr,
+            const memory_tracking::grantor_t &scratchpad) const;
 
     _jit_avx512_common_conv_winograd_data_kernel_f32 *kernel_;
-    // Buffer required to store transforms in the frequency domain
-    winograd::winograd_scratchpad_t *scratchpad_;
     const primitive_attr_t *attr_;
 };
 
-template <bool with_relu>
-struct _jit_avx512_common_convolution_winograd_fwd_t
+struct jit_avx512_common_convolution_winograd_fwd_t
     : _jit_avx512_common_convolution_winograd_t<true>
     , public cpu_primitive_t {
-    struct pd_t : public _cpu_convolution_fwd_pd_t<with_relu> {
-        pd_t(engine_t *engine, const typename pd_t::base_desc_t *adesc,
+    struct pd_t : public cpu_convolution_fwd_pd_t {
+        pd_t(engine_t *engine, const convolution_desc_t *adesc,
                 const primitive_attr_t *attr,
                 const typename pd_t::base_class *hint_fwd_pd)
-            : _cpu_convolution_fwd_pd_t<with_relu>(engine, adesc, attr,
-                    hint_fwd_pd)
+            : cpu_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd)
            , jcp_() {}
 
         DECLARE_COMMON_PD_T(
                 JIT_IMPL_NAME_HELPER("jit_wino:", avx512_common, ""),
-                _jit_avx512_common_convolution_winograd_fwd_t);
+                jit_avx512_common_convolution_winograd_fwd_t);
 
         virtual status_t init() override {
             using namespace prop_kind;
            assert(this->engine()->kind() == engine_kind::cpu);
            bool ok = true && this->set_default_params() == status::success
-                && utils::one_of(this->cdesc_().prop_kind, forward_training,
+                && utils::one_of(this->desc()->prop_kind, forward_training,
                        forward_inference)
-                && this->cdesc_().alg_kind == alg_kind::convolution_winograd
+                && utils::one_of(this->desc()->alg_kind,
+                        alg_kind::convolution_auto,
+                        alg_kind::convolution_winograd)
                 && !this->has_zero_dim_memory()
                 && utils::everyone_is(data_type::f32,
-                        this->cdesc_().src_desc.data_type,
-                        this->cdesc_().weights_desc.data_type,
-                        this->cdesc_().dst_desc.data_type)
+                        this->desc()->src_desc.data_type,
+                        this->desc()->weights_desc.data_type,
+                        this->desc()->dst_desc.data_type)
                 && IMPLICATION(this->with_bias(), data_type::f32
-                        == this->cdesc_().bias_desc.data_type)
+                        == this->desc()->bias_desc.data_type)
                 && mkldnn_thr_syncable();
+
             if (!ok) return status::unimplemented;
 
-            return jit_avx512_common_conv_winograd_fwd_kernel_f32::init_conf(
-                    jcp_, this->cdesc_(), *this->src_pd_.desc(),
-                    *this->weights_pd_.desc(), *this->dst_pd_.desc(),
-                    *this->attr(), with_relu, this->negative_slope());
+            status_t status =
+                jit_avx512_common_conv_winograd_fwd_kernel_f32::init_conf(
+                        jcp_, *this->desc(), *this->src_pd_.desc(),
+                        *this->weights_pd_.desc(), *this->dst_pd_.desc(),
+                        *this->attr());
+            if (status != status::success) return status;
+
+            auto scratchpad = this->scratchpad_registry().registrar();
+            winograd_avx512_common::init_scratchpad(scratchpad, jcp_);
+
+            if (status == status::success
+                    && this->desc()->alg_kind == alg_kind::convolution_auto)
+                CHECK(this->set_alg_kind(alg_kind::convolution_winograd));
+
+            return status;
         }
 
         jit_conv_winograd_conf_t jcp_;
@@ -264,45 +153,32 @@ struct _jit_avx512_common_convolution_winograd_fwd_t
         }
     };
 
-    _jit_avx512_common_convolution_winograd_fwd_t(const pd_t *pd,
+    jit_avx512_common_convolution_winograd_fwd_t(const pd_t *apd,
             const input_vector &inputs, const output_vector &outputs)
-        : _jit_avx512_common_convolution_winograd_t<true>(pd->jcp_, pd->attr())
-        , cpu_primitive_t(&conf_, inputs, outputs)
-        , conf_(*pd) {}
+        : _jit_avx512_common_convolution_winograd_t<true>(apd->jcp_, apd->attr())
+        , cpu_primitive_t(apd, inputs, outputs, true) {}
 
-    ~_jit_avx512_common_convolution_winograd_fwd_t(){};
+    ~jit_avx512_common_convolution_winograd_fwd_t(){};
 
     typedef typename prec_traits<data_type::f32>::type data_t;
 
-    virtual void execute(event_t *e)
+    virtual void execute(event_t *e) const
     {
         float *src = (float *)this->input_memory(0);
         float *dst = (float *)this->memory();
         float *weights = (float *)this->input_memory(1);
         float *bias = (float *)this->input_memory(2);
 
-        switch ((conf_.jcp_).sched_policy) {
-        case WSCHED_DATA_W_S_G_D:
-            this->_execute_data_W_S_G_D(conf_.MB(), src, dst, weights, bias);
-            break;
-        case WSCHED_DATA_W_SGD:
-            this->_execute_data_W_SGD(conf_.MB(), src, dst, weights, bias);
-            break;
-        default:
-            break;
-        }
+        this->_execute_data_W_S_G_D(pd()->MB(), src, dst, weights, bias,
+                this->scratchpad());
+
         e->set_state(event_t::ready);
     }
 
private:
-    pd_t conf_;
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); }
 };
 
-using jit_avx512_common_convolution_winograd_fwd_t
-        = _jit_avx512_common_convolution_winograd_fwd_t<false>;
-using jit_avx512_common_convolution_winograd_relu_t
-        = _jit_avx512_common_convolution_winograd_fwd_t<true>;
-
 struct jit_avx512_common_convolution_winograd_bwd_data_t
         : _jit_avx512_common_convolution_winograd_t<false>,
        public cpu_primitive_t {
@@ -323,20 +199,33 @@ struct jit_avx512_common_convolution_winograd_bwd_data_t
            assert(this->engine()->kind() == engine_kind::cpu);
            bool ok = true && this->set_default_params() == status::success
                && utils::one_of(this->desc()->prop_kind, backward_data)
-                && this->desc()->alg_kind == alg_kind::convolution_winograd
+                && utils::one_of(this->desc()->alg_kind,
+                        alg_kind::convolution_auto,
+                        alg_kind::convolution_winograd)
                 && !this->has_zero_dim_memory()
                 && utils::everyone_is(data_type::f32,
                        this->desc()->diff_src_desc.data_type,
                        this->desc()->weights_desc.data_type,
                        this->desc()->diff_dst_desc.data_type)
                && mkldnn_thr_syncable();
+
             if (!ok) return status::unimplemented;
 
-            return jit_avx512_common_conv_winograd_bwd_data_kernel_f32::
-                init_conf(jcp_, *this->desc(), *this->diff_src_pd_.desc(),
-                        *this->weights_pd_.desc(),
-                        *this->diff_dst_pd_.desc());
+            status_t status =
+                jit_avx512_common_conv_winograd_bwd_data_kernel_f32::init_conf(
+                        jcp_, *this->desc(), *this->diff_src_pd_.desc(),
+                        *this->weights_pd_.desc(), *this->diff_dst_pd_.desc());
+            if (status != status::success) return status;
+
+            auto scratchpad = this->scratchpad_registry().registrar();
+            winograd_avx512_common::init_scratchpad(scratchpad, jcp_);
+
+            if (status == status::success
+                    && this->desc()->alg_kind == alg_kind::convolution_auto)
+                CHECK(this->set_alg_kind(alg_kind::convolution_winograd));
+
+            return status;
         }
 
         jit_conv_winograd_conf_t jcp_;
@@ -357,44 +246,32 @@ struct jit_avx512_common_convolution_winograd_bwd_data_t
         }
     };
 
-    jit_avx512_common_convolution_winograd_bwd_data_t(const pd_t *pd,
+    jit_avx512_common_convolution_winograd_bwd_data_t(const pd_t *apd,
            const input_vector &inputs, const output_vector &outputs)
-        : _jit_avx512_common_convolution_winograd_t<false>(pd->jcp_, pd->attr())
-        , cpu_primitive_t(&conf_, inputs, outputs)
-        , conf_(*pd) {}
+        : _jit_avx512_common_convolution_winograd_t<false>(apd->jcp_, apd->attr())
+        , cpu_primitive_t(apd, inputs, outputs, true) {}
 
     ~jit_avx512_common_convolution_winograd_bwd_data_t(){};
 
     typedef typename prec_traits<data_type::f32>::type data_t;
 
-    virtual void execute(event_t *e)
+    virtual void execute(event_t *e) const
     {
+        assert(pd()->desc()->prop_kind == prop_kind::backward_data
+                && "invalid prop_kind");
+
         float *diff_dst = (float *)this->input_memory(0);
         float *diff_src = (float *)this->memory();
         float *weights = (float *)this->input_memory(1);
 
-        if (conf_.desc()->prop_kind == prop_kind::backward_data) {
-            switch ((conf_.jcp_).sched_policy) {
-            case WSCHED_DATA_W_S_G_D:
-                this->_execute_data_W_S_G_D(conf_.MB(), diff_dst, diff_src, weights, NULL);
-                break;
-
-            case WSCHED_DATA_W_SGD:
-                this->_execute_data_W_SGD(conf_.MB(), diff_dst, diff_src, weights, NULL);
-                break;
-
-            default:
-                break;
-            }
-        } else {
-            assert(!"invalid prop_kind");
-        }
+        this->_execute_data_W_S_G_D(pd()->MB(), diff_dst, diff_src, weights, nullptr,
+                this->scratchpad());
 
         e->set_state(event_t::ready);
     }
 
private:
-    pd_t conf_;
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); }
 };
 
 struct jit_avx512_common_convolution_winograd_bwd_weights_t
@@ -417,7 +294,9 @@ struct jit_avx512_common_convolution_winograd_bwd_weights_t
            assert(this->engine()->kind() == engine_kind::cpu);
            bool ok = true && this->set_default_params() == status::success
                && utils::one_of(this->desc()->prop_kind, backward_weights)
-                && this->desc()->alg_kind == alg_kind::convolution_winograd
+                && utils::one_of(this->desc()->alg_kind,
+                        alg_kind::convolution_auto,
+                        alg_kind::convolution_winograd)
                 && !this->has_zero_dim_memory()
                 && utils::everyone_is(data_type::f32,
                        this->desc()->src_desc.data_type,
@@ -427,10 +306,21 @@ struct jit_avx512_common_convolution_winograd_bwd_weights_t
 
             if (!ok) return status::unimplemented;
 
-            return jit_avx512_common_conv_winograd_bwd_weights_kernel_f32::
-                init_conf(jcp_, *this->desc(), *this->src_pd_.desc(),
-                        *this->diff_dst_pd_.desc(),
-                        *this->diff_weights_pd_.desc());
+            status_t status =
+                jit_avx512_common_conv_winograd_bwd_weights_kernel_f32::
+                    init_conf(jcp_, *this->desc(), *this->src_pd_.desc(),
+                            *this->diff_dst_pd_.desc(),
+                            *this->diff_weights_pd_.desc());
+            if (status != status::success) return status;
+
+            auto scratchpad = this->scratchpad_registry().registrar();
+            winograd_avx512_common::init_scratchpad(scratchpad, jcp_);
+
+            if (status == status::success
+                    && this->desc()->alg_kind == alg_kind::convolution_auto)
+                CHECK(this->set_alg_kind(alg_kind::convolution_winograd));
+
+            return status;
         }
 
         jit_conv_winograd_conf_t jcp_;
@@ -453,72 +343,35 @@ struct jit_avx512_common_convolution_winograd_bwd_weights_t
         }
     };
 
-    jit_avx512_common_convolution_winograd_bwd_weights_t(const pd_t *pd,
+    jit_avx512_common_convolution_winograd_bwd_weights_t(const pd_t *apd,
            const input_vector &inputs, const output_vector &outputs)
-        : cpu_primitive_t(&conf_, inputs, outputs)
-        , conf_(*pd)
-        , kernel_(nullptr)
-        , scratchpad_(nullptr)
-        , padded_bias_(nullptr)
+        : cpu_primitive_t(apd, inputs, outputs, true), kernel_(nullptr)
     {
-        auto jcp = conf_.jcp_;
         kernel_ = new jit_avx512_common_conv_winograd_bwd_weights_kernel_f32(
-                jcp);
-        scratchpad_ = new winograd::winograd_scratchpad_t(jcp);
-        if (conf_.want_padded_bias())
-            padded_bias_ = (float *)malloc(sizeof(float) * jcp.oc, 64);
+                pd()->jcp_);
     }
 
     ~jit_avx512_common_convolution_winograd_bwd_weights_t()
-    {
-        delete kernel_;
-        delete scratchpad_;
-        free(padded_bias_);
-    };
+    { delete kernel_; }
 
     typedef typename prec_traits<data_type::f32>::type data_t;
 
-    virtual void execute(event_t *e)
+    virtual void execute(event_t *e) const
     {
-        if (conf_.desc()->prop_kind == prop_kind::backward_weights) {
-            const auto &jcp = kernel_->jcp;
-            switch (jcp.sched_policy) {
-            case WSCHED_WEI_S_D_G_W:
-                _execute_backward_weights_S_D_G_W();
-                break;
-            case WSCHED_WEI_S_D_Giot_W:
-                _execute_backward_weights_S_D_Giot_W();
-                break;
-            case WSCHED_WEI_SDGtWo:
-                _execute_backward_weights_SDGtWo();
-                break;
-            case WSCHED_WEI_SDGt_W:
-                _execute_backward_weights_SDGt_W();
-                break;
-            default:
-                assert(!"Unknown Winograd schedule policy!");
-                break;
-            }
-        }
-        else
-            assert(!"invalid prop_kind");
+        assert(pd()->desc()->prop_kind == prop_kind::backward_weights
+                && "invalid prop_kind");
+
+        _execute_backward_weights_S_D_G_W(scratchpad());
 
         e->set_state(event_t::ready);
     }
 
private:
-    void _execute_backward_weights_S_D_G_W();
-    void _execute_backward_weights_S_D_Giot_W();
-    void _execute_backward_weights_SDGtWo();
-    void _execute_backward_weights_SDGt_W();
-    void _maybe_execute_diff_bias_copy();
+    void _execute_backward_weights_S_D_G_W(
            const memory_tracking::grantor_t &scratchpad) const;
+    void _maybe_execute_diff_bias_copy(
            const memory_tracking::grantor_t &scratchpad) const;
 
-    pd_t conf_;
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); }
     jit_avx512_common_conv_winograd_bwd_weights_kernel_f32 *kernel_;
-
-    // Buffer required to store transforms in the frequency domain
-    winograd::winograd_scratchpad_t *scratchpad_;
-
-    float *padded_bias_;
 };
 
 void trans_W_4x4_3x3(float Fw_[6][6][16][16], float F[3][3][16][16]);
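Throughout these headers the stored pd_t copy (conf_) gives way to a pd() accessor over the pointer the base class already holds, which is what allows execute() to become const. A minimal sketch of the idiom with simplified, hypothetical names:

    struct pd_t {
        int MB() const { return mb_; }
        int mb_ = 1;
    };

    struct primitive_t {
        explicit primitive_t(const pd_t *apd) : pd_(apd) {}
        void execute() const {          // const works: no mutable conf_ copy
            int mb = pd()->MB();
            (void)mb;
        }
    private:
        const pd_t *pd() const { return pd_; }
        const pd_t *pd_;
    };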
return args_ok_across ? success : unimplemented; } -jit_avx512_common_lrn_fwd_t::jit_avx512_common_lrn_fwd_t(const pd_t *pd, +jit_avx512_common_lrn_fwd_t::jit_avx512_common_lrn_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) + : cpu_primitive_t(apd, inputs, outputs) , use_h_parallelism(0), ker_(nullptr), ker_first_(nullptr) , ker_last_(nullptr) { using namespace alg_kind; - const int C = conf_.C(); - const int H = conf_.H(); - const int W = conf_.W(); - const int ls = conf_.desc()->local_size; - const float alpha = conf_.desc()->lrn_alpha / ls; - const float k = conf_.desc()->lrn_k; + const int C = pd()->C(); + const int H = pd()->H(); + const int W = pd()->W(); + const int ls = pd()->desc()->local_size; + const float alpha = pd()->desc()->lrn_alpha / ls; + const float k = pd()->desc()->lrn_k; - auto pk = conf_.desc()->prop_kind; + auto pk = pd()->desc()->prop_kind; use_h_parallelism = H > 28 ? 1 : 0; @@ -382,15 +382,15 @@ jit_avx512_common_lrn_fwd_t::jit_avx512_common_lrn_fwd_t(const pd_t *pd, jit_avx512_common_lrn_fwd_t::~jit_avx512_common_lrn_fwd_t() { delete ker_; delete ker_first_; delete ker_last_; } -void jit_avx512_common_lrn_fwd_t::execute_forward() { +void jit_avx512_common_lrn_fwd_t::execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); auto dst = reinterpret_cast(this->memory(0)); auto ws = reinterpret_cast(this->memory(1)); - const int N = conf_.MB(); - const int C = conf_.C(); - const int H = conf_.H(); - const int W = conf_.W(); + const int N = pd()->MB(); + const int C = pd()->C(); + const int H = pd()->H(); + const int W = pd()->W(); parallel(0, [&](const int ithr, const int nthr) { size_t start{0}, end{0}; @@ -761,17 +761,17 @@ status_t jit_avx512_common_lrn_bwd_t::pd_t::init() { return args_ok_across ? success : unimplemented; } -jit_avx512_common_lrn_bwd_t::jit_avx512_common_lrn_bwd_t(const pd_t *pd, +jit_avx512_common_lrn_bwd_t::jit_avx512_common_lrn_bwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) + : cpu_primitive_t(apd, inputs, outputs) , use_h_parallelism(0), ker_(nullptr), ker_first_(nullptr) , ker_last_(nullptr) { - const int C = conf_.C(); - const int H = conf_.H(); - const int W = conf_.W(); - const int ls = conf_.desc()->local_size; - const float alpha = conf_.desc()->lrn_alpha / ls; - const float beta = conf_.desc()->lrn_beta; + const int C = pd()->C(); + const int H = pd()->H(); + const int W = pd()->W(); + const int ls = pd()->desc()->local_size; + const float alpha = pd()->desc()->lrn_alpha / ls; + const float beta = pd()->desc()->lrn_beta; use_h_parallelism = H > 28 ? 
1 : 0; @@ -791,16 +791,16 @@ jit_avx512_common_lrn_bwd_t::jit_avx512_common_lrn_bwd_t(const pd_t *pd, jit_avx512_common_lrn_bwd_t::~jit_avx512_common_lrn_bwd_t() { delete ker_; delete ker_first_; delete ker_last_; } -void jit_avx512_common_lrn_bwd_t::execute_backward() { +void jit_avx512_common_lrn_bwd_t::execute_backward() const { auto src = reinterpret_cast(this->input_memory(0)); auto diff_dst = reinterpret_cast(this->input_memory(1)); auto ws = reinterpret_cast(this->input_memory(2)); auto diff_src = reinterpret_cast(this->memory(0)); - const int N = conf_.MB(); - const int C = conf_.C(); - const int H = conf_.H(); - const int W = conf_.W(); + const int N = pd()->MB(); + const int C = pd()->C(); + const int H = pd()->H(); + const int W = pd()->W(); parallel(0, [&](const int ithr, const int nthr) { size_t start{0}, end{0}; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_lrn.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_lrn.hpp index 10b5bb8..8ec624a 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_lrn.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_lrn.hpp @@ -39,20 +39,20 @@ struct jit_avx512_common_lrn_fwd_t: public cpu_primitive_t { virtual status_t init() override; }; - jit_avx512_common_lrn_fwd_t(const pd_t *pd, const input_vector &inputs, + jit_avx512_common_lrn_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs); ~jit_avx512_common_lrn_fwd_t(); typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - pd_t conf_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } int use_h_parallelism; struct jit_avx512_common_lrn_kernel_f32; @@ -73,20 +73,20 @@ struct jit_avx512_common_lrn_bwd_t: public cpu_primitive_t { virtual status_t init() override; }; - jit_avx512_common_lrn_bwd_t(const pd_t *pd, const input_vector &inputs, + jit_avx512_common_lrn_bwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs); ~jit_avx512_common_lrn_bwd_t(); typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_backward(); e->set_state(event_t::ready); } private: - void execute_backward(); - pd_t conf_; + void execute_backward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } int use_h_parallelism; struct jit_avx512_common_lrn_kernel_f32; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_2x3.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_2x3.cpp index 1239186..82a18b6 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_2x3.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_2x3.cpp @@ -32,6 +32,7 @@ namespace impl { namespace cpu { using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; using namespace mkldnn::impl::utils; using namespace Xbyak; @@ -247,7 +248,6 @@ bool jit_avx512_core_fp32_wino_conv_2x3_dst_trans_t::maybe_relu(int position) { if (position == 0) { /* relu before sum */ return false - || jcp.with_relu || p.contain(eltwise, 0); } else if (position == 1) { /* relu after sum */ @@ -411,7 +411,6 @@ struct jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t: public 
jit_generator { cpu_memory_t::pd_t &src_pd, cpu_memory_t::pd_t &weights_pd, cpu_memory_t::pd_t &dst_pd, cpu_memory_t::pd_t &bias_pd, const primitive_attr_t &attr, - bool with_relu, float relu_negative_slope, memory_desc_t& expect_wei_md); Zmm vreg_out(int n, int m) { @@ -448,26 +447,14 @@ bool jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t::post_ops_ok( using namespace primitive_kind; const auto &p = attr.post_ops_; - auto is_relu = [&](int idx) { - return p.entry_[idx].kind == eltwise - && p.entry_[idx].eltwise.scale == 1. - && p.entry_[idx].eltwise.alg == alg_kind::eltwise_relu - && p.entry_[idx].eltwise.alpha == 0.; - }; + auto is_relu = [&](int idx) { return p.entry_[idx].is_relu(); }; - switch (p.len_) { + switch (p.len_) { case 0: return true; - case 1: return true - && IMPLICATION(jcp.with_relu, p.contain(sum, 0)) - && IMPLICATION(!jcp.with_relu, is_relu(0) || p.contain(sum, 0)); - case 2: return true - && IMPLICATION(jcp.with_relu, p.contain(sum, 0) && is_relu(1)) - && IMPLICATION(!jcp.with_relu, false - || (p.contain(sum, 0) && is_relu(1)) - || (p.contain(sum, 1) && is_relu(0))); - case 3: return true - && jcp.with_relu == false - && (is_relu(0) && p.contain(sum, 1) && is_relu(2)); + case 1: return is_relu(0) || p.contain(sum, 0); + case 2: return (p.contain(sum, 0) && is_relu(1)) || + (p.contain(sum, 1) && is_relu(0)); + case 3: return is_relu(0) && p.contain(sum, 1) && is_relu(2); default: return false; } @@ -577,12 +564,17 @@ void jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t::generate() { postamble(); } +namespace { +bool is_winograd_faster_than_direct(const jit_conv_conf_2x3_wino_t &jcp) { + return jcp.mb >= 4; +} +} + status_t jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t ::init_conf( jit_conv_conf_2x3_wino_t &jcp, const convolution_desc_t &cd, cpu_memory_t::pd_t &src_pd, cpu_memory_t::pd_t &wei_pd, cpu_memory_t::pd_t &dst_pd, cpu_memory_t::pd_t &bias_pd, - const primitive_attr_t &attr, bool with_relu, float relu_negative_slope, - memory_desc_t &expect_wei_md) { + const primitive_attr_t &attr, memory_desc_t &expect_wei_md) { const memory_desc_wrapper src_d(&src_pd); const memory_desc_wrapper wei_d(&wei_pd); const memory_desc_wrapper dst_d(&dst_pd); @@ -590,6 +582,8 @@ status_t jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t ::init_conf( const bool with_groups = wei_d.ndims() == src_d.ndims() + 1; + jcp.nthr = mkldnn_get_max_threads(); + jcp.ngroups = with_groups ? 
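
/* The simplified post_ops_ok() above accepts exactly six chains: empty,
   relu, sum, sum->relu, relu->sum and relu->sum->relu. The same predicate
   over a plain vector, as a self-contained sketch of the logic: */

#include <vector>

enum class post_op { sum, relu };

bool post_ops_ok(const std::vector<post_op> &p) {
    auto is_relu = [&](size_t i) { return p[i] == post_op::relu; };
    auto is_sum = [&](size_t i) { return p[i] == post_op::sum; };
    switch (p.size()) {
    case 0: return true;
    case 1: return is_relu(0) || is_sum(0);
    case 2: return (is_sum(0) && is_relu(1)) || (is_relu(0) && is_sum(1));
    case 3: return is_relu(0) && is_sum(1) && is_relu(2);
    default: return false;
    }
}
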
wei_d.dims()[0] : 1;
 jcp.mb = src_d.dims()[0];
 jcp.oc = dst_d.dims()[1] / jcp.ngroups;
@@ -616,10 +610,7 @@ status_t jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t ::init_conf(
     int simdw = 16;
     jcp.src_fmt = src_d.format();
     jcp.with_bias = cd.bias_desc.format != memory_format::undef;
-    jcp.with_relu = with_relu;
-    jcp.relu_negative_slope = relu_negative_slope;
-    if (!IMPLICATION(with_relu, relu_negative_slope == 0.))
-        return status::unimplemented;
+
     if (!post_ops_ok(jcp, attr))
         return status::unimplemented;
@@ -639,6 +630,10 @@ status_t jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t ::init_conf(
     if (!(mayiuse(avx512_core)))
         return status::unimplemented;

+    if (!IMPLICATION(cd.alg_kind == alg_kind::convolution_auto,
+            is_winograd_faster_than_direct(jcp)))
+        return status::unimplemented;
+
     if (src_d.data_type() != data_type::f32)
         return status::unimplemented;
     if (wei_d.data_type() != data_type::f32)
@@ -673,7 +668,6 @@ status_t jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t ::init_conf(
     auto wei_sz = (float)aa * ic * oc;
     auto inp_sz = (float)mb * ih * iw * ic;
     auto sp_sz = (float)mb * ih * iw;
-    const int nthr = mkldnn_get_max_threads();

     /* Heuristics here. Numbers '28','196' is an observation from data. */
     if (wei_sz / inp_sz > 5)
@@ -681,10 +675,10 @@ status_t jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t ::init_conf(
     else
         jcp.small_mb = false;

-    if (mb > nstl::min(nthr, 28)
+    if (mb > nstl::min(jcp.nthr, 28)
             || (!jcp.small_mb
                     && (wei_sz >= 0.9f * L2_cap
-                            || inp_sz > L2_cap * nthr + L3_capacity))
+                            || inp_sz > L2_cap * jcp.nthr + L3_capacity))
             || (jcp.small_mb && sp_sz > 196))
         return unimplemented;
@@ -749,7 +743,7 @@ status_t jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t ::init_conf(
         /* outer parallelization */
         int nblocks = mb * div_up(ih, iy) * div_up(iw, ix);
-        thr_eff = (float)nblocks / rnd_up(nblocks, nthr);
+        thr_eff = (float)nblocks / rnd_up(nblocks, jcp.nthr);
         mem_eff = 1.f;
         req_mem = (((float)ix + 2) * (iy + 2) + aa * M) * Z + aa * Y;
@@ -765,14 +759,15 @@ status_t jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t ::init_conf(
         /* inner parallelization */
         int bsz = iy * ix / a;
         int gemmw = aa * (nb_oc / n2_b);
-        int bsz_r = rnd_up(bsz, nthr);
-        int gemmw_r = rnd_up(gemmw, nthr);
+        int bsz_r = rnd_up(bsz, jcp.nthr);
+        int gemmw_r = rnd_up(gemmw, jcp.nthr);
         thr_eff = ((float)Z * bsz / bsz_r + Y * gemmw / gemmw_r) / (Z + Y);

         req_mem = (float)ix * iy * (ic + simdw * n2_b) + simdw * n2_b * ic;
         mem_eff = nstl::min(1.f, L2_cap / req_mem);
-        int M_per_thr = nstl::max(2, div_up(aa, nthr));
-        int oc_per_thr = nstl::min(oc, div_up(aa * (nb_oc / n2_b), nthr));
+        int M_per_thr = nstl::max(2, div_up(aa, jcp.nthr));
+        int oc_per_thr =
+                nstl::min(oc, div_up(aa * (nb_oc / n2_b), jcp.nthr));
         req_mem = (float)aa * oc_per_thr * ic + M_per_thr * M * Z;
         if (req_mem > L2_cap)
             mem_eff = 0.1f;
@@ -839,63 +834,34 @@ status_t jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t ::init_conf(
 }
 ////////////////////////////////////////////////////////////////////////////////
-template <bool with_relu>
-status_t _jit_avx512_core_fp32_wino_conv_2x3_fwd_t<with_relu>
+status_t jit_avx512_core_fp32_wino_conv_2x3_fwd_t
     ::pd_t::jit_conf(memory_desc_t& expect_wei_md) {
     return jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t::init_conf(
-            jcp_, this->cdesc_(), this->src_pd_, this->weights_pd_,
-            this->dst_pd_,this->bias_pd_, *this->attr(),
-            with_relu, this->negative_slope(), expect_wei_md);
+            jcp_, *this->desc(), this->src_pd_, this->weights_pd_,
+            this->dst_pd_,this->bias_pd_, *this->attr(), expect_wei_md);
 }

-template <bool with_relu>
-_jit_avx512_core_fp32_wino_conv_2x3_fwd_t<with_relu>::
-    _jit_avx512_core_fp32_wino_conv_2x3_fwd_t(const pd_t *pd,
+jit_avx512_core_fp32_wino_conv_2x3_fwd_t::
+    jit_avx512_core_fp32_wino_conv_2x3_fwd_t(const pd_t *apd,
         const input_vector &inputs, const output_vector &outputs)
-    : cpu_primitive_t(&conf_, inputs, outputs)
-    , conf_(*pd), padded_bias_(nullptr) {
-    const int nthreads = mkldnn_get_max_threads();
+    : cpu_primitive_t(apd, inputs, outputs)
+{
     kernel_ = new jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t(
-            conf_.jcp_, *conf_.attr());
+            pd()->jcp_, *pd()->attr());
     src_trans_ = new jit_avx512_core_fp32_wino_conv_2x3_src_trans_t(
-            conf_.jcp_, *conf_.attr());
+            pd()->jcp_, *pd()->attr());
     dst_trans_ = new jit_avx512_core_fp32_wino_conv_2x3_dst_trans_t(
-            conf_.jcp_, *conf_.attr());
-
-    int wino_size_offset
-            = (conf_.jcp_.yb / 2) * (conf_.jcp_.xb / 2) + (conf_.jcp_.xb);
-
-    size_wino_src = (conf_.jcp_.ic * 16) * (wino_size_offset);
-    size_wino_dst = (conf_.jcp_.oc * 16) * (wino_size_offset);
-
-    wino_src_ = (float *)malloc(sizeof(float) * nthreads * size_wino_src, 4096);
-    wino_dst_ = (float *)malloc(sizeof(float) * nthreads * size_wino_dst, 4096);
-    if (conf_.want_padded_bias()) {
-        const auto &j = conf_.jcp_;
-        assert(j.ngroups == 1);
-        padded_bias_ = (float *)malloc(sizeof(float) * j.oc, 64);
-        for (int oc = j.oc_without_padding; oc < j.oc; ++oc)
-            padded_bias_[oc] = 0;
-    }
-
-
+            pd()->jcp_, *pd()->attr());
 }

-template <bool with_relu>
-_jit_avx512_core_fp32_wino_conv_2x3_fwd_t<with_relu>
-    ::~_jit_avx512_core_fp32_wino_conv_2x3_fwd_t() {
+jit_avx512_core_fp32_wino_conv_2x3_fwd_t
+    ::~jit_avx512_core_fp32_wino_conv_2x3_fwd_t() {
     delete kernel_;
     delete src_trans_;
     delete dst_trans_;
-
-    free(wino_src_);
-    free(wino_dst_);
-    free(padded_bias_);
 }

-template <bool with_relu>
-void _jit_avx512_core_fp32_wino_conv_2x3_fwd_t<
-        with_relu>::execute_forward() {
+void jit_avx512_core_fp32_wino_conv_2x3_fwd_t::execute_forward() const {
     const auto &jcp = kernel_->jcp;

     if (jcp.small_mb)
@@ -904,33 +870,41 @@ void _jit_avx512_core_fp32_wino_conv_2x3_fwd_t<
         execute_forward_mbN();
 }

-template <bool with_relu>
-void _jit_avx512_core_fp32_wino_conv_2x3_fwd_t<with_relu>
-::execute_forward_mbN() {
+void jit_avx512_core_fp32_wino_conv_2x3_fwd_t::execute_forward_mbN() const {
     auto src = reinterpret_cast<const float *>(input_memory(0));
     auto wei = reinterpret_cast<const float *>(input_memory(1));
     auto bia = reinterpret_cast<const float *>(input_memory(2));
     auto dst = reinterpret_cast<float *>(memory(0));
-    const auto &jcp = kernel_->jcp;
-    const auto &oscales = conf_.attr()->output_scales_;
-
-    wino_wei_ = wei;
+    auto scratchpad = this->scratchpad();

-    if (conf_.want_padded_bias()) {
-        for (int oc = 0; oc < jcp.oc_without_padding; ++oc)
-            padded_bias_[oc] = bia[oc];
-        bia = padded_bias_;
+    const auto &jcp = kernel_->jcp;
+    const auto &oscales = pd()->attr()->output_scales_;
+
+    const size_t wino_size_offset =
+            (size_t)(pd()->jcp_.yb / 2) * (pd()->jcp_.xb / 2) + (pd()->jcp_.xb);
+    const size_t size_wino_src = wino_size_offset * pd()->jcp_.ic * 16;
+    const size_t size_wino_dst = wino_size_offset * pd()->jcp_.oc * 16;
+
+    if (pd()->wants_padded_bias()) {
+        auto padded_bias = scratchpad.get<float>(key_conv_padded_bias);
+        utils::array_copy(padded_bias, bia, jcp.oc_without_padding);
+        utils::array_set(padded_bias + jcp.oc_without_padding, 0.f,
+                jcp.oc - jcp.oc_without_padding);
+        bia = padded_bias;
     }

+    auto ptr_V = scratchpad.get<float>(key_wino_V);
+    auto ptr_M = scratchpad.get<float>(key_wino_M);
+
     parallel_nd(jcp.mb, div_up(jcp.oh,jcp.yb), div_up(jcp.ow, jcp.xb),
             [&](int mb, int tile_y_b, int tile_x_b) {

         int tile_y = tile_y_b * jcp.yb;
         int tile_x = tile_x_b * jcp.xb;

         int ithr = mkldnn_get_thread_num();
-        auto wino_src = wino_src_ + size_wino_src * ithr;
-        auto wino_dst = wino_dst_ + size_wino_dst * ithr;
+        auto wino_src = ptr_V + size_wino_src * ithr;
+        auto wino_dst = ptr_M + size_wino_dst * ithr;

         auto src_trans_p =
                 jit_avx512_core_fp32_wino_conv_2x3_src_trans_t
@@ -985,7 +959,7 @@ void _jit_avx512_core_fp32_wino_conv_2x3_fwd_t
             int offset = (tile_ij + ithr) % 16;
             gemm_p.src = wino_src + jcp.inp_stride * offset;
             gemm_p.dst = wino_dst + jcp.out_stride * offset;
-            gemm_p.wei = wino_wei_ + jcp.wei_stride * offset;
+            gemm_p.wei = wei + jcp.wei_stride * offset;

             kernel_->ker_(&gemm_p);
         }
@@ -1027,25 +1001,29 @@ void _jit_avx512_core_fp32_wino_conv_2x3_fwd_t
     });
 }

-template <bool with_relu>
-void _jit_avx512_core_fp32_wino_conv_2x3_fwd_t<with_relu>
-    ::execute_forward_small_mb() {
+void jit_avx512_core_fp32_wino_conv_2x3_fwd_t::execute_forward_small_mb() const
+{
     auto src = reinterpret_cast<const float *>(input_memory(0));
     auto wei = reinterpret_cast<const float *>(input_memory(1));
     auto bia = reinterpret_cast<const float *>(input_memory(2));
     auto dst = reinterpret_cast<float *>(memory(0));
-    const auto &jcp = kernel_->jcp;
-    const auto &oscales = conf_.attr()->output_scales_;
-
-    wino_wei_ = wei;
+    auto scratchpad = this->scratchpad();

-    if (conf_.want_padded_bias()) {
-        for (int oc = 0; oc < jcp.oc_without_padding; ++oc)
-            padded_bias_[oc] = bia[oc];
-        bia = padded_bias_;
+    const auto &jcp = kernel_->jcp;
+    const auto &oscales = pd()->attr()->output_scales_;
+
+    if (pd()->wants_padded_bias()) {
+        auto padded_bias = scratchpad.get<float>(key_conv_padded_bias);
+        utils::array_copy(padded_bias, bia, jcp.oc_without_padding);
+        utils::array_set(padded_bias + jcp.oc_without_padding, 0.f,
+                jcp.oc - jcp.oc_without_padding);
+        bia = padded_bias;
     }

+    auto ptr_V = scratchpad.get<float>(key_wino_V);
+    auto ptr_M = scratchpad.get<float>(key_wino_M);
+
     for (int mb = 0; mb < jcp.mb; mb++) {
     for (int tile_y = 0; tile_y < jcp.oh; tile_y += jcp.yb) {
     for (int tile_x = 0; tile_x < jcp.ow; tile_x += jcp.xb) {
@@ -1080,7 +1058,7 @@ void _jit_avx512_core_fp32_wino_conv_2x3_fwd_t
             auto local_s = src
                     + mb * jcp.nb_ic * jcp.ih * jcp.iw * jcp.ic_block
                     + y * jcp.iw * jcp.ic_block + x * jcp.ic_block;
-            auto local_w = wino_src_ + m * jcp.ic;
+            auto local_w = ptr_V + m * jcp.ic;

             src_trans_p.src = local_s;
             src_trans_p.wino_src = local_w;
@@ -1095,10 +1073,10 @@ void _jit_avx512_core_fp32_wino_conv_2x3_fwd_t
             auto gemm_p = jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t ::
                     call_params_t();
-            gemm_p.src = wino_src_ + jcp.inp_stride * tile_ij;
-            gemm_p.dst = wino_dst_ + jcp.out_stride * tile_ij
+            gemm_p.src = ptr_V + jcp.inp_stride * tile_ij;
+            gemm_p.dst = ptr_M + jcp.out_stride * tile_ij
                     + nnb * jcp.n2_block * jcp.n_block;
-            gemm_p.wei = wino_wei_ + jcp.wei_stride * tile_ij
+            gemm_p.wei = wei + jcp.wei_stride * tile_ij
                     + nnb * jcp.n2_block * jcp.n_block * jcp.K;

             kernel_->ker_(&gemm_p);
@@ -1128,7 +1106,7 @@ void _jit_avx512_core_fp32_wino_conv_2x3_fwd_t
             auto local_d = dst
                     + mb * jcp.nb_oc * jcp.oh * jcp.ow * jcp.oc_block
                     + y * jcp.ow * jcp.oc_block + x * jcp.oc_block;
-            auto local_w = wino_dst_ + m * jcp.oc;
+            auto local_w = ptr_M + m * jcp.oc;

             auto scales = oscales.scales_;
             dst_trans_p.dst = local_d;
@@ -1144,9 +1122,6 @@ void _jit_avx512_core_fp32_wino_conv_2x3_fwd_t
     }}}
 }

-template struct _jit_avx512_core_fp32_wino_conv_2x3_fwd_t<true>;
-template struct _jit_avx512_core_fp32_wino_conv_2x3_fwd_t<false>;
-
 } // namespace cpu
 } // namespace impl
 } // namespace mkldnn
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_2x3.hpp
b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_2x3.hpp index cd4d5da..ec7d05b 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_2x3.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_2x3.hpp @@ -37,46 +37,52 @@ struct jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t; struct jit_avx512_core_fp32_wino_conv_2x3_src_trans_t; struct jit_avx512_core_fp32_wino_conv_2x3_dst_trans_t; -template -struct _jit_avx512_core_fp32_wino_conv_2x3_fwd_t : public cpu_primitive_t { - struct pd_t : public _cpu_convolution_fwd_pd_t { - pd_t(engine_t *engine, const typename pd_t::base_desc_t *adesc, +struct jit_avx512_core_fp32_wino_conv_2x3_fwd_t : public cpu_primitive_t { + struct pd_t : public cpu_convolution_fwd_pd_t { + pd_t(engine_t *engine, const convolution_desc_t *adesc, const primitive_attr_t *attr, const typename pd_t::base_class *hint_fwd_pd) - : _cpu_convolution_fwd_pd_t(engine, adesc, attr, - hint_fwd_pd) + : cpu_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) , jcp_() {} DECLARE_COMMON_PD_T( JIT_IMPL_NAME_HELPER("jit_fp32_wino_2x3:", avx512_core, ""), - _jit_avx512_core_fp32_wino_conv_2x3_fwd_t); + jit_avx512_core_fp32_wino_conv_2x3_fwd_t); virtual status_t init() override { using namespace prop_kind; using namespace memory_format; assert(this->engine()->kind() == engine_kind::cpu); bool ok = true && this->set_default_params() == status::success - && utils::one_of(this->cdesc_().prop_kind, forward_inference) - && this->cdesc_().alg_kind == alg_kind::convolution_winograd - && this->cdesc_().src_desc.data_type == data_type::f32 - && this->cdesc_().dst_desc.data_type == data_type::f32 - && this->cdesc_().weights_desc.data_type == data_type::f32 + && utils::one_of(this->desc()->prop_kind, forward_inference) + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_winograd) + && this->desc()->src_desc.data_type == data_type::f32 + && this->desc()->dst_desc.data_type == data_type::f32 + && this->desc()->weights_desc.data_type == data_type::f32 && IMPLICATION(this->with_bias(), - utils::one_of(this->cdesc_().bias_desc.data_type, + utils::one_of(this->desc()->bias_desc.data_type, data_type::f32)); if (!ok) return status::unimplemented; memory_desc_t expect_wei_md = *(this->weights_pd_.desc()); status_t jit_conf_result = jit_conf(expect_wei_md); - if (jit_conf_result == success) { - cpu_memory_t::pd_t new_weights_pd(this->engine_, &expect_wei_md); - if (this->weights_pd_.desc()->format == any) - this->weights_pd_ = new_weights_pd; - if (!this->weights_pd_.is_equal(&new_weights_pd)) - return status::unimplemented; - } - return jit_conf_result; + if (jit_conf_result != success) return jit_conf_result; + + cpu_memory_t::pd_t new_weights_pd(this->engine_, &expect_wei_md); + if (this->weights_pd_.desc()->format == any) + this->weights_pd_ = new_weights_pd; + if (!this->weights_pd_.is_equal(&new_weights_pd)) + return unimplemented; + + init_scratchpad(); + + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_winograd)); + + return success; } jit_conv_conf_2x3_wino_t jcp_; @@ -84,6 +90,25 @@ struct _jit_avx512_core_fp32_wino_conv_2x3_fwd_t : public cpu_primitive_t { protected: status_t jit_conf(memory_desc_t& expect_wei_md); + void init_scratchpad() { + using namespace memory_tracking::names; + + auto scratchpad = this->scratchpad_registry().registrar(); + + int wino_size_offset = (jcp_.yb / 2) * (jcp_.xb / 2) + 
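
/* The convolution_auto handling added above follows one recipe in every
   pd_t::init() touched by this patch: reject the problem when the empirical
   heuristic says winograd would lose, otherwise pin the final algorithm.
   A condensed sketch with stand-in enums (not the real mkl-dnn types): */

enum class alg_t { convolution_auto, convolution_winograd };
enum class status_t { success, unimplemented };

struct conf_t { int mb; };

// the fwd-inference branch of the heuristic introduced in this patch
bool is_winograd_faster_than_direct(const conf_t &c) { return c.mb >= 4; }

status_t resolve_auto(alg_t &alg, const conf_t &jcp) {
    if (alg == alg_t::convolution_auto) {
        if (!is_winograd_faster_than_direct(jcp))
            return status_t::unimplemented; // let the direct impl claim it
        alg = alg_t::convolution_winograd;  // what set_alg_kind() records
    }
    return status_t::success;
}
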
jcp_.xb; + + size_t V_sz = (size_t)jcp_.ic * 16 * wino_size_offset * jcp_.nthr; + scratchpad.book(key_wino_V, sizeof(float) * V_sz, PAGE_4K); + + size_t M_sz = (size_t)jcp_.oc * 16 * wino_size_offset * jcp_.nthr; + scratchpad.book(key_wino_M, sizeof(float) * M_sz, PAGE_4K); + + if (wants_padded_bias()) { + assert(jcp_.ngroups == 1); + scratchpad.book(key_conv_padded_bias, sizeof(float) * jcp_.oc); + } + } + virtual status_t set_default_params() override { using namespace memory_format; if (this->src_pd_.desc()->format == any) @@ -96,43 +121,27 @@ struct _jit_avx512_core_fp32_wino_conv_2x3_fwd_t : public cpu_primitive_t { } }; - _jit_avx512_core_fp32_wino_conv_2x3_fwd_t(const pd_t *pd, + jit_avx512_core_fp32_wino_conv_2x3_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs); - - ~_jit_avx512_core_fp32_wino_conv_2x3_fwd_t(); - virtual void execute(event_t *e) { + ~jit_avx512_core_fp32_wino_conv_2x3_fwd_t(); + + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - void execute_forward_small_mb(); - void execute_forward_mbN(); - pd_t conf_; + void execute_forward() const; + void execute_forward_small_mb() const; + void execute_forward_mbN() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t *kernel_; jit_avx512_core_fp32_wino_conv_2x3_src_trans_t *src_trans_; jit_avx512_core_fp32_wino_conv_2x3_dst_trans_t *dst_trans_; - - size_t size_wino_wei; - size_t size_wino_src; - size_t size_wino_dst; - - const float *wino_wei_; - const float *dst_bias_; - - float *wino_src_; - float *wino_dst_; - float *padded_bias_; }; -using jit_avx512_core_fp32_wino_conv_2x3_fwd_t = - _jit_avx512_core_fp32_wino_conv_2x3_fwd_t; - -using jit_avx512_core_fp32_wino_convolution_relu_t = - _jit_avx512_core_fp32_wino_conv_2x3_fwd_t; } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3.cpp index 4b9fbd6..60e2a69 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3.cpp @@ -25,7 +25,6 @@ #include "type_helpers.hpp" #include "utils.hpp" -#include "jit_avx512_common_convolution_winograd.hpp" #include "jit_avx512_core_fp32_wino_conv_4x3.hpp" #ifndef _MSC_VER @@ -41,12 +40,13 @@ namespace cpu { using namespace mkldnn::impl::status; using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; using namespace mkldnn::impl::utils; template void _jit_avx512_core_fp32_wino_conv_4x3_t ::weight_transform_data(const jit_conv_winograd_conf_t &jcp, - float *wp, float *twp) + float *wp, float *twp) const { float G[] = {0.26890756302521f, 0.688403361344538f, 0.119514472455649f, 1.13777777777778f, 0.430252100840336f, 0.179271708683473f}; @@ -70,7 +70,7 @@ void _jit_avx512_core_fp32_wino_conv_4x3_t template void _jit_avx512_core_fp32_wino_conv_4x3_t::output_transform_data (int image, const jit_conv_winograd_conf_t &jcp, - const post_ops_t &p_ops, float *toutp, float *pout_b, float *bias) { + const post_ops_t &p_ops, float *toutp, float *pout_b, float *bias) const { float G[] = {0.625f, 1.5f, 0.390625f, 2.25f, 0.244140625f, 3.375f}; float Ow[alpha][alpha][simd_w]; @@ -121,7 +121,7 @@ template void _jit_avx512_core_fp32_wino_conv_4x3_t ::output_transform_tileblock_data(int tile_block, 
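
/* book() and get() above are two halves of one contract: pd_t::init()
   declares every temporary buffer once, and the const execute() path only
   looks up offsets into a single backing allocation. A toy version of that
   registry (the real memory_tracking API also handles alignment policies
   and nested key prefixes): */

#include <cstddef>
#include <map>

struct registrar_t {            // pd side: runs once, before any execution
    void book(int key, size_t bytes, size_t align = 64) {
        size_ += (align - size_ % align) % align; // pad to requested align
        offset_[key] = size_;
        size_ += bytes;
    }
    std::map<int, size_t> offset_;
    size_t size_ = 0;
};

struct grantor_t {              // execute side: const, thread-safe lookups
    grantor_t(const registrar_t &r, char *base) : r_(r), base_(base) {}
    template <typename T> T *get(int key) const {
        return reinterpret_cast<T *>(base_ + r_.offset_.at(key));
    }
    const registrar_t &r_;
    char *base_;
};
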
const jit_conv_winograd_conf_t &jcp, const post_ops_t &p_ops, - float *toutp, float *outp, float *bias) { + float *toutp, float *outp, float *bias) const { float G[] = {0.625f, 1.5f, 0.390625f, 2.25f, 0.244140625f, 3.375f}; float Ow[alpha][alpha][simd_w]; @@ -171,7 +171,7 @@ void _jit_avx512_core_fp32_wino_conv_4x3_t template void _jit_avx512_core_fp32_wino_conv_4x3_t ::input_transform_data(int image, const jit_conv_winograd_conf_t &jcp, - float *inp, float *tinp) + float *inp, float *tinp) const { float G[] = {-2.25f, -0.390625f, 0.87890625f, -2.640625f, 0.625f, -0.625f, 1.5f, -1.5f, -2.640625f}; @@ -224,7 +224,7 @@ template void _jit_avx512_core_fp32_wino_conv_4x3_t ::input_transform_tileblock_data(int tile_block, const jit_conv_winograd_conf_t &jcp, - float *inp, float *tinp) + float *inp, float *tinp) const { float G[] = {-2.25f, -0.390625f, 0.87890625f, -2.640625f, 0.625f, -0.625f, 1.5f, -1.5f, -2.640625f}; @@ -280,7 +280,8 @@ void _jit_avx512_core_fp32_wino_conv_4x3_t template void _jit_avx512_core_fp32_wino_conv_4x3_t::_execute_data_W_S_G_D( - const int MB, float *inp_ptr, float *out_ptr, float *wei_ptr, float *bias_ptr) { + const int MB, float *inp_ptr, float *out_ptr, float *wei_ptr, float *bias_ptr, + const memory_tracking::grantor_t &scratchpad) const { const auto &jcp = kernel_->jcp; const auto &p_ops = attr_->post_ops_; @@ -306,10 +307,9 @@ void _jit_avx512_core_fp32_wino_conv_4x3_t::_execute_data_W_S_G_D( array_offset_calculator bias(bias_ptr, jcp.dimM/jcp.dimM_simd_block, jcp.dimM_simd_block); - array_offset_calculator M( - (float *)((is_fwd - ? (this->scratchpad_)->M_ptr() - : (this->scratchpad_)->V_ptr())), + array_offset_calculator M(is_fwd + ? scratchpad.template get(key_wino_M) + : scratchpad.template get(key_wino_V), jcp.dimN_nb_block, jcp.dimM_nb_block, alpha, alpha, jcp.dimN_block, jcp.dimM_block * jcp.dimM_reg_block, @@ -317,7 +317,7 @@ void _jit_avx512_core_fp32_wino_conv_4x3_t::_execute_data_W_S_G_D( auto wino_wei = (jcp.prop_kind == prop_kind::forward_inference) ? wei_ptr - : (float *)(this->scratchpad_)->U_ptr(); + : scratchpad.template get(key_wino_U); array_offset_calculator U(wino_wei, jcp.dimM_nb_block, @@ -325,23 +325,22 @@ void _jit_avx512_core_fp32_wino_conv_4x3_t::_execute_data_W_S_G_D( jcp.dimK_nb_block, jcp.dimM_block * jcp.dimM_reg_block, jcp.dimK_block, jcp.dimK_reg_block, jcp.dimM_simd_block); - array_offset_calculator V( - (float *)((is_fwd - ? (this->scratchpad_)->V_ptr() - : (this->scratchpad_)->M_ptr())), + array_offset_calculator V(is_fwd + ? 
scratchpad.template get(key_wino_V) + : scratchpad.template get(key_wino_M), jcp.dimN_nb_block, alpha, alpha, jcp.dimN_block, jcp.dimK_nb_block, jcp.dimK_block, jcp.dimN_reg_block, jcp.dimK_reg_block); - const bool want_padded_bias = jcp.with_bias + const bool wants_padded_bias = jcp.with_bias && jcp.oc_without_padding != jcp.oc; float last_slice_bias[simd_w] = {0}; - if (want_padded_bias) { + if (wants_padded_bias) { for (int oc = 0; oc < jcp.oc_without_padding % jcp.oc_simd_block; ++oc) last_slice_bias[oc] = bias(jcp.dimM / jcp.dimM_simd_block - 1, oc); } -#pragma omp parallel +PRAGMA_OMP(parallel) { parallel_nd_in_omp(MB, jcp.dimK_nb_block, jcp.dimK_block, [&](int img, int K_blk1, int K_blk2) { @@ -367,7 +366,7 @@ void _jit_avx512_core_fp32_wino_conv_4x3_t::_execute_data_W_S_G_D( }); } -#pragma omp barrier +PRAGMA_OMP(barrier) parallel_nd_in_omp(jcp.dimN_nb_block, alpha, alpha, jcp.dimM_nb_block, [&](int N_blk1, int oj, int oi, int M_blk1) { @@ -383,14 +382,14 @@ void _jit_avx512_core_fp32_wino_conv_4x3_t::_execute_data_W_S_G_D( N_blk2, K_blk1, 0, 0, 0)), K_blk1); }); -#pragma omp barrier +PRAGMA_OMP(barrier) parallel_nd_in_omp(MB, jcp.dimM_nb_block, (jcp.dimM_block * jcp.dimM_reg_block), [&](int img, int M_blk1, int M_blk2) { const int M_blk = M_blk1 * jcp.dimM_block * jcp.dimM_reg_block + M_blk2; - float *bias_ptr = want_padded_bias + float *bias_ptr = wants_padded_bias && M_blk == jcp.dimM / jcp.dimM_simd_block - 1 ? last_slice_bias : &bias(M_blk, 0); output_transform_data(img, jcp, p_ops, @@ -400,16 +399,11 @@ void _jit_avx512_core_fp32_wino_conv_4x3_t::_execute_data_W_S_G_D( } } -template void -_jit_avx512_core_fp32_wino_conv_4x3_t::_execute_data_W_S_G_D( - const int, float *, float *, float *, float *); -template void -_jit_avx512_core_fp32_wino_conv_4x3_t::_execute_data_W_S_G_D( - const int, float *, float *, float *, float *); - template -void _jit_avx512_core_fp32_wino_conv_4x3_t::_execute_data_W_SGD( - const int MB, float *inp_ptr, float *out_ptr, float *wei_ptr, float *bias_ptr) { +void _jit_avx512_core_fp32_wino_conv_4x3_t::_execute_data_W_SGD(const int MB, + float *inp_ptr, float *out_ptr, float *wei_ptr, float *bias_ptr, + const memory_tracking::grantor_t &scratchpad) const { + const auto &jcp = kernel_->jcp; const auto &p_ops = attr_->post_ops_; @@ -430,7 +424,7 @@ void _jit_avx512_core_fp32_wino_conv_4x3_t::_execute_data_W_SGD( auto wino_wei = (jcp.prop_kind == prop_kind::forward_inference) ? wei_ptr - : (float *)(this->scratchpad_)->U_ptr(); + : scratchpad.template get(key_wino_U); array_offset_calculator U(wino_wei, jcp.dimM_nb_block, @@ -439,25 +433,23 @@ void _jit_avx512_core_fp32_wino_conv_4x3_t::_execute_data_W_SGD( jcp.dimM_block * jcp.dimM_reg_block, jcp.dimK_block, jcp.dimK_reg_block, jcp.dimM_simd_block); - array_offset_calculator M( - (float *)((is_fwd - ? (this->scratchpad_)->M_ptr() - : (this->scratchpad_)->V_ptr())), + array_offset_calculator M(is_fwd + ? scratchpad.template get(key_wino_M) + : scratchpad.template get(key_wino_V), 0, jcp.dimM_nb_block, alpha, alpha, jcp.dimN_block, jcp.dimM_block * jcp.dimM_reg_block, jcp.dimN_reg_block, jcp.dimM_simd_block); - array_offset_calculator V( - (float *)((is_fwd - ? (this->scratchpad_)->V_ptr() - : (this->scratchpad_)->M_ptr())), + array_offset_calculator V(is_fwd + ? 
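
/* PRAGMA_OMP above lifts the raw "#pragma omp" lines out of the kernels.
   A sketch of the usual construction (the exact definition lives in
   mkl-dnn's threading headers; the fallback arm is what lets the same
   sources build against non-OpenMP runtimes such as TBB): */

#if defined(_OPENMP)
#define PRAGMA_MACRO(x) _Pragma(#x)
#define PRAGMA_OMP(...) PRAGMA_MACRO(omp __VA_ARGS__)
#else
/* expands to nothing when OpenMP is not the threading runtime */
#define PRAGMA_OMP(...)
#endif

// PRAGMA_OMP(parallel firstprivate(trans_ker_p, I, T)) then expands to
// #pragma omp parallel firstprivate(trans_ker_p, I, T) under OpenMP.
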
scratchpad.template get(key_wino_V) + : scratchpad.template get(key_wino_M), 0, alpha, alpha, jcp.dimN_block, jcp.dimK_nb_block, jcp.dimK_block, jcp.dimN_reg_block, jcp.dimK_reg_block); - const bool want_padded_bias = jcp.with_bias + const bool wants_padded_bias = jcp.with_bias && jcp.oc_without_padding != jcp.oc; float last_slice_bias[simd_w] = {0}; - if (want_padded_bias) { + if (wants_padded_bias) { for (int oc = 0; oc < jcp.oc_without_padding % jcp.oc_simd_block; ++oc) last_slice_bias[oc] = bias(jcp.dimM / jcp.dimM_simd_block - 1, oc); } @@ -478,12 +470,12 @@ void _jit_avx512_core_fp32_wino_conv_4x3_t::_execute_data_W_SGD( }); } -#pragma omp parallel +PRAGMA_OMP(parallel) { int ithr = mkldnn_get_thread_num(); -#pragma omp for schedule(static) +PRAGMA_OMP(for schedule(static)) for (int tile_block = 0; tile_block < jcp.tile_block; tile_block++) { for (int K_blk1 = 0; K_blk1 < jcp.dimK_nb_block; K_blk1++) { for (int K_blk2 = 0; K_blk2 < jcp.dimK_block; K_blk2++) { @@ -516,7 +508,7 @@ void _jit_avx512_core_fp32_wino_conv_4x3_t::_execute_data_W_SGD( const int M_blk = M_blk1 * jcp.dimM_block * jcp.dimM_reg_block + M_blk2; - float *bias_ptr = want_padded_bias + float *bias_ptr = wants_padded_bias && M_blk == jcp.dimM / jcp.dimM_simd_block - 1 ? last_slice_bias : &bias(M_blk, 0); @@ -529,12 +521,8 @@ void _jit_avx512_core_fp32_wino_conv_4x3_t::_execute_data_W_SGD( } } -template void -_jit_avx512_core_fp32_wino_conv_4x3_t::_execute_data_W_SGD( - const int, float *, float *, float *, float *); -template void -_jit_avx512_core_fp32_wino_conv_4x3_t::_execute_data_W_SGD( - const int, float *, float *, float *, float *); +template struct _jit_avx512_core_fp32_wino_conv_4x3_t; +template struct _jit_avx512_core_fp32_wino_conv_4x3_t; namespace { @@ -545,7 +533,7 @@ void subarray_sum(size_t num_arrs, float *output, size_t nelems, const size_t blocks_number = nelems / block_size; const size_t tail = nelems % block_size; -#pragma omp parallel +PRAGMA_OMP(parallel) { const int ithr = mkldnn_get_thread_num(); const int nthr = mkldnn_get_num_threads(); @@ -627,7 +615,7 @@ void array_sum(size_t num_arrs, float *output, const size_t blocks_number = nelems / block_size; const size_t tail = nelems % block_size; -#pragma omp parallel +PRAGMA_OMP(parallel) { const size_t ithr = mkldnn_get_thread_num(); const size_t nthr = mkldnn_get_num_threads(); @@ -672,9 +660,10 @@ void array_sum(size_t num_arrs, float *output, } //bwdw namespace void jit_avx512_core_fp32_wino_conv_4x3_bwd_weights_t:: -_execute_backward_weights_SDGtWo() { +_execute_backward_weights_SDGtWo( + const memory_tracking::grantor_t &scratchpad) const { const auto &jcp = kernel_->jcp; - const int nthreads = scratchpad_->num_threads(); + const int nthreads = jcp.nthr; array_offset_calculator src((float *)this->input_memory(0), jcp.mb, jcp.ic / simd_w, jcp.ih, jcp.iw, simd_w); @@ -683,20 +672,20 @@ _execute_backward_weights_SDGtWo() { array_offset_calculator diff_weights((float *)this->memory(0), jcp.oc / simd_w, jcp.ic / simd_w, jcp.kh, jcp.kw, simd_w, simd_w); - array_offset_calculator Us((float *)(scratchpad_->U_ptr()), + array_offset_calculator Us(scratchpad.get(key_wino_U), 0, alpha, alpha, jcp.oc_block, jcp.ic_block, jcp.ic_simd_block, jcp.oc_reg_block, jcp.oc_simd_block); - int U_sz = nthreads * alpha * alpha * jcp.oc / jcp.nb_oc - * jcp.ic / jcp.nb_ic * sizeof(float); + const int U_sz = nthreads * alpha * alpha * jcp.oc / jcp.nb_oc + * jcp.ic / jcp.nb_ic; array_offset_calculatordiff_weights_prv( - (float *)(scratchpad_->U_ptr() + U_sz), + 
scratchpad.get(key_wino_U) + U_sz, 0, jcp.oc / simd_w, jcp.ic / simd_w, jcp.kh, jcp.kw, simd_w, simd_w); - array_offset_calculator M((float *)(scratchpad_->M_ptr()), + array_offset_calculator M(scratchpad.get(key_wino_M), 0, alpha, alpha, jcp.oc_block, jcp.nb_tile_block_ur, @@ -704,7 +693,7 @@ _execute_backward_weights_SDGtWo() { jcp.oc_reg_block, jcp.oc_simd_block); - array_offset_calculator V((float *)(scratchpad_->V_ptr()), + array_offset_calculator V(scratchpad.get(key_wino_V), 0, alpha, alpha, jcp.ic_block, jcp.nb_tile_block_ur, @@ -712,7 +701,7 @@ _execute_backward_weights_SDGtWo() { jcp.ic_simd_block); array_offset_calculator diff_bias_prv( - (float *)(scratchpad_->bias_ptr()), nthreads, jcp.oc); + scratchpad.get(key_conv_bia_reduction), nthreads, jcp.oc); auto trans_ker_p = jit_wino_transform_call_s(); float I[alpha][alpha][simd_w]; @@ -724,7 +713,7 @@ _execute_backward_weights_SDGtWo() { 1.13777777777778f}; float G_O_3x3_4x4[4] = {2.25f, 0.625f, 1.5f, 0.390625f}; -#pragma omp parallel num_threads(nthreads) firstprivate(trans_ker_p, I, T) +PRAGMA_OMP(parallel num_threads(nthreads) firstprivate(trans_ker_p, I, T)) { if (jcp.with_bias) { parallel_nd_in_omp(nthreads, jcp.oc / simd_w, @@ -740,7 +729,7 @@ _execute_backward_weights_SDGtWo() { int ithr = mkldnn_get_thread_num(); for (int ifm1 = 0; ifm1 < jcp.nb_ic; ++ifm1) { int first_tblk = 0; -#pragma omp for +PRAGMA_OMP(for) for (int tblk1 = 0; tblk1 < jcp.tile_block; ++tblk1) { int tile_index = tblk1 * jcp.nb_tile_block_ur * jcp.tile_block_ur; int img = tile_index / (jcp.itiles * jcp.jtiles); @@ -806,7 +795,7 @@ _execute_backward_weights_SDGtWo() { // Reduce diff-weights { float *output = (float *)(this->memory(0)); - float *input_base = (float *)(scratchpad_->U_ptr() + U_sz); + float *input_base = scratchpad.get(key_wino_U) + U_sz; int nelems = jcp.oc * jcp.ic * jcp.kh * jcp.kw; float *input_ptrs[max_threads_number]; for (int i = 0; i < nthreads; ++i) { @@ -816,7 +805,7 @@ _execute_backward_weights_SDGtWo() { if (jcp.with_bias) { output = (float *)(this->memory(1)); - input_base = (float *)(scratchpad_->bias_ptr()); + input_base = scratchpad.get(key_conv_bia_reduction); for (int i = 0; i < nthreads; ++i) { input_ptrs[i] = input_base + jcp.oc * i; } @@ -827,9 +816,10 @@ _execute_backward_weights_SDGtWo() { } void jit_avx512_core_fp32_wino_conv_4x3_bwd_weights_t:: -_execute_backward_weights_S_D_Giot_W() { +_execute_backward_weights_S_D_Giot_W( + const memory_tracking::grantor_t &scratchpad) const { const auto &jcp = kernel_->jcp; - const int nthreads = scratchpad_->num_threads(); + const int nthreads = jcp.nthr; array_offset_calculator src((float *)this->input_memory(0), jcp.mb, jcp.ic / simd_w, jcp.ih, jcp.iw, simd_w); @@ -839,7 +829,7 @@ _execute_backward_weights_S_D_Giot_W() { jcp.oc / simd_w, jcp.ic / simd_w, jcp.kh, jcp.kw, simd_w, simd_w); array_offset_calculator diff_bias((float *)this->memory(1), jcp.oc); - array_offset_calculator U((float *)(scratchpad_->U_ptr()), + array_offset_calculator U(scratchpad.get(key_wino_U), jcp.nb_ic, jcp.nb_oc, alpha, alpha, jcp.oc_block, jcp.ic_block, @@ -847,9 +837,9 @@ _execute_backward_weights_S_D_Giot_W() { jcp.oc_reg_block, jcp.oc_simd_block); - int U_size = jcp.oc * jcp.ic * alpha * alpha * sizeof(float); + const int U_size = jcp.oc * jcp.ic * alpha * alpha; array_offset_calculator Us( - (float *)(scratchpad_->U_ptr() + U_size), + scratchpad.get(key_wino_U) + U_size, 0, jcp.nb_ic, jcp.nb_oc, alpha, alpha, jcp.oc_block, jcp.ic_block, @@ -857,7 +847,7 @@ _execute_backward_weights_S_D_Giot_W() 
{ jcp.oc_reg_block, jcp.oc_simd_block); - array_offset_calculator M((float *)(scratchpad_->M_ptr()), + array_offset_calculator M(scratchpad.get(key_wino_M), jcp.nb_oc, jcp.tile_block, alpha, alpha, @@ -867,7 +857,7 @@ _execute_backward_weights_S_D_Giot_W() { jcp.oc_reg_block, jcp.oc_simd_block); - array_offset_calculator V((float *)(scratchpad_->V_ptr()), + array_offset_calculator V(scratchpad.get(key_wino_V), jcp.nb_ic, jcp.tile_block, alpha, alpha, @@ -876,7 +866,7 @@ _execute_backward_weights_S_D_Giot_W() { jcp.ic_simd_block); array_offset_calculator diff_bias_prv( - (float *)(scratchpad_->bias_ptr()), nthreads, jcp.oc); + scratchpad.get(key_conv_bia_reduction), nthreads, jcp.oc); size_t input_starts[max_threads_number] = {0}; size_t input_ends[max_threads_number] = {0}; @@ -892,7 +882,7 @@ _execute_backward_weights_S_D_Giot_W() { float I[alpha][alpha][simd_w]; float T[alpha][alpha][simd_w]; -#pragma omp parallel firstprivate(first_tblk, trans_ker_p, I, T) +PRAGMA_OMP(parallel firstprivate(first_tblk, trans_ker_p, I, T)) { if (jcp.with_bias) { parallel_nd_in_omp(nthreads, jcp.oc, [&](int ithr, int ofm) { @@ -941,7 +931,7 @@ _execute_backward_weights_S_D_Giot_W() { } }); - #pragma omp barrier + PRAGMA_OMP(barrier) parallel_nd_in_omp(jcp.nb_ic, jcp.nb_oc, alpha, alpha, jcp.tile_block, [&](int ifm1, int ofm1, int oj, int oi, int tblk1){ @@ -991,7 +981,7 @@ _execute_backward_weights_S_D_Giot_W() { } trans_ker_p.G = G_O_3x3_4x4; -#pragma omp parallel firstprivate(trans_ker_p) +PRAGMA_OMP(parallel firstprivate(trans_ker_p)) { parallel_nd_in_omp(jcp.nb_ic, jcp.nb_oc, jcp.oc_block, jcp.ic_block, jcp.oc_reg_block, [&](int ifm1, int ofm1, int ofm2, int ifm2, int ofm3){ diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3.hpp index e4ef286..8f4f7a5 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3.hpp @@ -18,9 +18,9 @@ #define CPU_JIT_AVX512_CORE_FP32_WINO_CONV_4x3_HPP #include "c_types_map.hpp" +#include "memory_tracking.hpp" #include "cpu_convolution_pd.hpp" #include "cpu_engine.hpp" -#include "scratchpad.hpp" #include "jit_avx512_core_fp32_wino_conv_4x3_kernel.hpp" @@ -28,116 +28,50 @@ namespace mkldnn { namespace impl { namespace cpu { -namespace winograd { - -struct winograd_scratchpad_avx512_core_t { - public: - winograd_scratchpad_avx512_core_t(const jit_conv_winograd_conf_t &jcp) - { - get_scratchpad_size_(jcp); - allocate_scratchpad_(jcp); - } - - ~winograd_scratchpad_avx512_core_t() { - if (scratchpad_ != nullptr) - delete scratchpad_; - } - - char *U_ptr() { - /* buffer for wei transform U*/ - return scratchpad_->get() + U_offset_; - } - - char *V_ptr() { - /* buffer for src transform V*/ - return scratchpad_->get() + V_offset_; - } - - char *M_ptr() { - /* buffer for dst transform M*/ - return scratchpad_->get() + M_offset_; - } - - char *bias_ptr() { - /* buffer for bias update in bwdw*/ - return scratchpad_->get() + bias_offset_; - } - - int num_threads(){ - return nthreads_; - } - - private: - inline void get_scratchpad_size_(const jit_conv_winograd_conf_t &jcp) { - nthreads_ = mkldnn_get_max_threads(); - - U_sz_ = size_t(alpha) * alpha * jcp.ic * jcp.oc * sizeof(float); - V_sz_ = size_t(alpha) * alpha * jcp.mb * jcp.ic - * jcp.itiles * jcp.jtiles - * sizeof(float); - M_sz_ = size_t(alpha) * alpha * jcp.mb * jcp.oc - * jcp.itiles * 
jcp.jtiles - * sizeof(float); - - switch (jcp.sched_policy) { - case WSCHED_DATA_W_SGD: - V_sz_ = nthreads_ * alpha * alpha - * jcp.nb_tile_block_ur * jcp.tile_block_ur - * jcp.ic * sizeof(float); - M_sz_ = nthreads_* alpha * alpha - * jcp.nb_tile_block_ur * jcp.tile_block_ur - * jcp.oc * sizeof(float); - break; - case WSCHED_WEI_SDGtWo: - nthreads_ = nstl::min(mkldnn_get_max_threads(), jcp.tile_block); - - U_sz_ = nthreads_ - * (alpha * alpha * jcp.oc * (jcp.ic / jcp.nb_ic) - + jcp.ic * jcp.oc * jcp.kh * jcp.kw) - * sizeof(float); - M_sz_ = nthreads_ * alpha * alpha - * (jcp.ntiles / jcp.tile_block) - * (jcp.oc / jcp.nb_oc) * sizeof(float); - V_sz_ = nthreads_ * alpha * alpha - * (jcp.ntiles / jcp.tile_block) - * (jcp.ic / jcp.nb_ic) - * sizeof(float); - bias_sz_ = nthreads_ * jcp.oc * sizeof(float); - break; - case WSCHED_WEI_S_D_Giot_W: - U_sz_ = (nthreads_ + 1) * alpha * alpha * jcp.ic * jcp.oc - * sizeof(float); - M_sz_ = size_t(alpha) * alpha * jcp.oc * jcp.ntiles * sizeof(float); - V_sz_ = size_t(alpha) * alpha * jcp.ic * jcp.ntiles * sizeof(float); - bias_sz_ = nthreads_ * jcp.oc * sizeof(float); - break; - default: - break; - } - } +namespace winograd_avx512_core { +inline void init_scratchpad(memory_tracking::registrar_t &scratchpad, + const jit_conv_winograd_conf_t &jcp) { + using namespace utils; + using namespace memory_tracking::names; + + size_t U_sz = (size_t)alpha * alpha * jcp.ic * jcp.oc; + size_t V_sz = (size_t)alpha * alpha * jcp.mb * jcp.ic * jcp.itiles + * jcp.jtiles; + size_t M_sz = (size_t)alpha * alpha * jcp.mb * jcp.oc * jcp.itiles + * jcp.jtiles; + + switch (jcp.sched_policy) { + case WSCHED_DATA_W_SGD: + V_sz = (size_t)jcp.nthr * alpha * alpha * jcp.nb_tile_block_ur + * jcp.tile_block_ur * jcp.ic; + M_sz = (size_t)jcp.nthr * alpha * alpha * jcp.nb_tile_block_ur + * jcp.tile_block_ur * jcp.oc; + break; + case WSCHED_WEI_SDGtWo: + U_sz = (size_t)jcp.nthr * (alpha * alpha * jcp.oc + * (jcp.ic / jcp.nb_ic) + jcp.ic * jcp.oc * jcp.kh * jcp.kw); + M_sz = (size_t)jcp.nthr * alpha * alpha * (jcp.ntiles / jcp.tile_block) + * (jcp.oc / jcp.nb_oc); + V_sz = (size_t)jcp.nthr * alpha * alpha * (jcp.ntiles / jcp.tile_block) + * (jcp.ic / jcp.nb_ic); + break; + case WSCHED_WEI_S_D_Giot_W: + U_sz = (size_t)(jcp.nthr + 1) * alpha * alpha * jcp.ic * jcp.oc; + M_sz = (size_t)alpha * alpha * jcp.oc * jcp.ntiles; + V_sz = (size_t)alpha * alpha * jcp.ic * jcp.ntiles; + break; + default: break; + } - inline void allocate_scratchpad_(const jit_conv_winograd_conf_t &jcp) { - const size_t page_size = PAGE_2M; - U_offset_ = 0; - V_offset_ = utils::rnd_up(U_sz_, page_size); - M_offset_ = V_offset_ + utils::rnd_up(V_sz_, page_size); - scratchpad_sz_ = M_offset_ + M_sz_; - if (bias_sz_) { - bias_offset_ = M_offset_ + utils::rnd_up(M_sz_, page_size); - scratchpad_sz_ = bias_offset_ + bias_sz_; - } - scratchpad_ = create_scratchpad(scratchpad_sz_); - } + scratchpad.book(key_wino_U, sizeof(float) * U_sz, PAGE_2M); + scratchpad.book(key_wino_V, sizeof(float) * V_sz, PAGE_2M); + scratchpad.book(key_wino_M, sizeof(float) * M_sz, PAGE_2M); - scratchpad_t *scratchpad_; - size_t nthreads_; - size_t scratchpad_sz_ = 0, U_sz_ = 0, V_sz_ = 0, M_sz_ = 0, - bias_sz_ = 0; - size_t U_offset_ = 0; - size_t V_offset_ = 0; - size_t M_offset_ = 0; - size_t bias_offset_ = 0; -}; + if (one_of(jcp.sched_policy, WSCHED_WEI_SDGtWo, WSCHED_WEI_S_D_Giot_W)) { + size_t br_sz = (size_t)jcp.nthr * jcp.oc; + scratchpad.book(key_conv_bia_reduction, sizeof(float) * br_sz, PAGE_2M); + } +} } template @@ -145,80 +79,86 @@ 
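
/* The sizes booked above grow quickly. A rough feel for WSCHED_DATA_W_SGD
   with illustrative values (nthr = 28, alpha = 6, nb_tile_block_ur *
   tile_block_ur = 64, ic = oc = 256): V and M each come to
   28 * 36 * 64 * 256 floats, about 63 MiB, which is why they are booked
   once up front, 2M-page aligned, rather than malloc'ed per primitive. */

#include <cstddef>

// per-schedule Winograd buffer size in elements, mirroring V_sz/M_sz above
size_t wino_sgd_buf_elems(size_t nthr, size_t alpha, size_t tiles,
        size_t channels) {
    return nthr * alpha * alpha * tiles * channels;
}
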
struct _jit_avx512_core_fp32_wino_conv_4x3_t { _jit_avx512_core_fp32_wino_conv_4x3_t( const jit_conv_winograd_conf_t &jcp, const primitive_attr_t *attr) - : kernel_(nullptr), scratchpad_(nullptr), attr_(attr) { + : kernel_(nullptr), attr_(attr) { kernel_ = new _jit_avx512_core_fp32_wino_conv_4x3_data_kernel(jcp); - scratchpad_ = new winograd::winograd_scratchpad_avx512_core_t(jcp); } - ~_jit_avx512_core_fp32_wino_conv_4x3_t() { - delete kernel_; - delete scratchpad_; - }; + ~_jit_avx512_core_fp32_wino_conv_4x3_t() { delete kernel_; } protected: void weight_transform_data(const jit_conv_winograd_conf_t &jcp, - float *wp, float *twp); + float *wp, float *twp) const; void input_transform_data(int image, const jit_conv_winograd_conf_t &jcp, - float *inp, float *tinp); + float *inp, float *tinp) const; void input_transform_tileblock_data(int tile_block, const jit_conv_winograd_conf_t &jcp, - float *inp, float *tinp); + float *inp, float *tinp) const; void output_transform_data(int image, const jit_conv_winograd_conf_t &jcp, - const post_ops_t &p_ops, float *toutp, float *pout_b, float *bias); + const post_ops_t &p_ops, float *toutp, float *pout_b, + float *bias) const; void output_transform_tileblock_data(int tile_block, const jit_conv_winograd_conf_t &jcp, const post_ops_t &p_ops, - float *toutp, float *outp, float *bias); + float *toutp, float *outp, float *bias) const; void _execute_data_W_S_G_D(const int MB, float *inp_ptr, float *out_ptr, - float *wei_ptr, float *bias_ptr = NULL); + float *wei_ptr, float *bias_ptr, + const memory_tracking::grantor_t &scratchpad) const; void _execute_data_W_SGD(const int MB, float *inp_ptr, float *out_ptr, - float *wei_ptr, float *bias_ptr = NULL); + float *wei_ptr, float *bias_ptr, + const memory_tracking::grantor_t &scratchpad) const; _jit_avx512_core_fp32_wino_conv_4x3_data_kernel *kernel_; - // Buffer required to store transforms in the frequency domain - winograd::winograd_scratchpad_avx512_core_t *scratchpad_; const primitive_attr_t *attr_; }; -template -struct _jit_avx512_core_fp32_wino_conv_4x3_fwd_t +struct jit_avx512_core_fp32_wino_conv_4x3_fwd_t : _jit_avx512_core_fp32_wino_conv_4x3_t , public cpu_primitive_t { - struct pd_t : public _cpu_convolution_fwd_pd_t { - pd_t(engine_t *engine, const typename pd_t::base_desc_t *adesc, + struct pd_t : public cpu_convolution_fwd_pd_t { + pd_t(engine_t *engine, const convolution_desc_t *adesc, const primitive_attr_t *attr, const typename pd_t::base_class *hint_fwd_pd) - : _cpu_convolution_fwd_pd_t(engine, adesc, attr, - hint_fwd_pd) + : cpu_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) , jcp_() {} DECLARE_COMMON_PD_T( JIT_IMPL_NAME_HELPER("jit_wino_4x3:", avx512_core, ""), - _jit_avx512_core_fp32_wino_conv_4x3_fwd_t); + jit_avx512_core_fp32_wino_conv_4x3_fwd_t); virtual status_t init() override { using namespace prop_kind; assert(this->engine()->kind() == engine_kind::cpu); bool ok = true && this->set_default_params() == status::success - && utils::one_of(this->cdesc_().prop_kind, forward_training, + && utils::one_of(this->desc()->prop_kind, forward_training, forward_inference) - && this->cdesc_().alg_kind == alg_kind::convolution_winograd + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_winograd) && utils::everyone_is(data_type::f32, - this->cdesc_().src_desc.data_type, - this->cdesc_().weights_desc.data_type, - this->cdesc_().dst_desc.data_type) + this->desc()->src_desc.data_type, + this->desc()->weights_desc.data_type, + 
this->desc()->dst_desc.data_type)
                && IMPLICATION(this->with_bias(), data_type::f32
-                        == this->cdesc_().bias_desc.data_type)
+                        == this->desc()->bias_desc.data_type)
                && mkldnn_thr_syncable();
            if (!ok)
                return status::unimplemented;

-            return jit_avx512_core_fp32_wino_conv_4x3_fwd_kernel::init_conf(jcp_,
-                    this->cdesc_(), this->src_pd_,
-                    this->weights_pd_, this->dst_pd_,
-                    *this->attr(), with_relu, this->negative_slope());
+            status_t status =
+                    jit_avx512_core_fp32_wino_conv_4x3_fwd_kernel::init_conf(jcp_,
+                            *this->desc(), this->src_pd_, this->weights_pd_,
+                            this->dst_pd_, *this->attr());
+            if (status != status::success) return status;
+
+            auto scratchpad = this->scratchpad_registry().registrar();
+            winograd_avx512_core::init_scratchpad(scratchpad, jcp_);
+            if (status == status::success
+                    && this->desc()->alg_kind == alg_kind::convolution_auto)
+                CHECK(this->set_alg_kind(alg_kind::convolution_winograd));
+
+            return status;
        }

        jit_conv_winograd_conf_t jcp_;
@@ -232,7 +172,7 @@ struct _jit_avx512_core_fp32_wino_conv_4x3_fwd_t
            if (this->dst_pd_.desc()->format == any)
                CHECK(this->dst_pd_.set_format(nChw16c));
            if (this->weights_pd_.desc()->format == any
-                    && (this->cdesc_().prop_kind != mkldnn_forward_inference))
+                    && (this->desc()->prop_kind != mkldnn_forward_inference))
                CHECK(this->weights_pd_.set_format(
                        this->with_groups() ? gOIhw16i16o : OIhw16i16o));
            if (this->bias_pd_.desc()->format == any
@@ -241,29 +181,30 @@
        }
    };

-    _jit_avx512_core_fp32_wino_conv_4x3_fwd_t(const pd_t *pd,
+    jit_avx512_core_fp32_wino_conv_4x3_fwd_t(const pd_t *apd,
            const input_vector &inputs, const output_vector &outputs)
-        : _jit_avx512_core_fp32_wino_conv_4x3_t<true>(pd->jcp_, pd->attr())
-        , cpu_primitive_t(&conf_, inputs, outputs)
-        , conf_(*pd) {}
+        : _jit_avx512_core_fp32_wino_conv_4x3_t<true>(apd->jcp_, apd->attr())
+        , cpu_primitive_t(apd, inputs, outputs, true)
+        {}

-    ~_jit_avx512_core_fp32_wino_conv_4x3_fwd_t(){};
+    ~jit_avx512_core_fp32_wino_conv_4x3_fwd_t(){};

    typedef typename prec_traits<data_type::f32>::type data_t;

-    virtual void execute(event_t *e)
+    virtual void execute(event_t *e) const
    {
        float *src = (float *)this->input_memory(0);
        float *dst = (float *)this->memory();
        float *weights = (float *)this->input_memory(1);
        float *bias = (float *)this->input_memory(2);
+        auto scratchpad = this->scratchpad();

-        switch ((conf_.jcp_).sched_policy) {
+        switch ((pd()->jcp_).sched_policy) {
        case WSCHED_DATA_W_S_G_D:
-            this->_execute_data_W_S_G_D(conf_.MB(), src, dst, weights, bias);
+            this->_execute_data_W_S_G_D(pd()->MB(), src, dst, weights, bias, scratchpad);
            break;

        case WSCHED_DATA_W_SGD:
-            this->_execute_data_W_SGD(conf_.MB(), src, dst, weights, bias);
+            this->_execute_data_W_SGD(pd()->MB(), src, dst, weights, bias, scratchpad);
            break;

        default:
            break;
@@ -272,14 +213,9 @@
    }

private:
-    pd_t conf_;
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); }
};

-using jit_avx512_core_fp32_wino_conv_4x3_fwd_t
-        = _jit_avx512_core_fp32_wino_conv_4x3_fwd_t<false>;
-using jit_avx512_core_fp32_wino_conv_4x3_relu_t
-        = _jit_avx512_core_fp32_wino_conv_4x3_fwd_t<true>;
-
struct jit_avx512_core_fp32_wino_conv_4x3_bwd_data_t
        : _jit_avx512_core_fp32_wino_conv_4x3_t<false>,
        public cpu_primitive_t {
@@ -300,7 +236,9 @@ struct jit_avx512_core_fp32_wino_conv_4x3_bwd_data_t
        assert(this->engine()->kind() == engine_kind::cpu);
        bool ok = true && this->set_default_params() == status::success
                && utils::one_of(this->desc()->prop_kind, backward_data)
-                && this->desc()->alg_kind ==
alg_kind::convolution_winograd + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_winograd) && utils::everyone_is(data_type::f32, this->desc()->diff_src_desc.data_type, this->desc()->weights_desc.data_type, @@ -309,10 +247,20 @@ struct jit_avx512_core_fp32_wino_conv_4x3_bwd_data_t if (!ok) return status::unimplemented; - return jit_avx512_core_fp32_wino_conv_4x3_bwd_data_kernel:: - init_conf(jcp_, *this->desc(), *this->diff_src_pd_.desc(), - *this->weights_pd_.desc(), - *this->diff_dst_pd_.desc()); + status_t status = + jit_avx512_core_fp32_wino_conv_4x3_bwd_data_kernel::init_conf( + jcp_, *this->desc(), *this->diff_src_pd_.desc(), + *this->weights_pd_.desc(), *this->diff_dst_pd_.desc()); + if (status != status::success) return status; + + auto scratchpad = this->scratchpad_registry().registrar(); + winograd_avx512_core::init_scratchpad(scratchpad, jcp_); + + if (status == status::success + && this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_winograd)); + + return status; } jit_conv_winograd_conf_t jcp_; @@ -333,30 +281,33 @@ struct jit_avx512_core_fp32_wino_conv_4x3_bwd_data_t } }; - jit_avx512_core_fp32_wino_conv_4x3_bwd_data_t(const pd_t *pd, + jit_avx512_core_fp32_wino_conv_4x3_bwd_data_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : _jit_avx512_core_fp32_wino_conv_4x3_t(pd->jcp_, pd->attr()) - , cpu_primitive_t(&conf_, inputs, outputs) - , conf_(*pd) {} + : _jit_avx512_core_fp32_wino_conv_4x3_t(apd->jcp_, apd->attr()) + , cpu_primitive_t(apd, inputs, outputs, true) + {} ~jit_avx512_core_fp32_wino_conv_4x3_bwd_data_t(){}; typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) + virtual void execute(event_t *e) const { float *diff_dst = (float *)this->input_memory(0); float *diff_src = (float *)this->memory(); float *weights = (float *)this->input_memory(1); + auto scratchpad = this->scratchpad(); - if (conf_.desc()->prop_kind == prop_kind::backward_data) { - switch ((conf_.jcp_).sched_policy) { + if (pd()->desc()->prop_kind == prop_kind::backward_data) { + switch ((pd()->jcp_).sched_policy) { case WSCHED_DATA_W_S_G_D: - this->_execute_data_W_S_G_D(conf_.MB(), diff_dst, diff_src, weights, NULL); + this->_execute_data_W_S_G_D(pd()->MB(), diff_dst, diff_src, weights, NULL, + scratchpad); break; case WSCHED_DATA_W_SGD: - this->_execute_data_W_SGD(conf_.MB(), diff_dst, diff_src, weights, NULL); + this->_execute_data_W_SGD(pd()->MB(), diff_dst, diff_src, weights, NULL, + scratchpad); break; default: @@ -370,7 +321,7 @@ struct jit_avx512_core_fp32_wino_conv_4x3_bwd_data_t } private: - pd_t conf_; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; struct jit_avx512_core_fp32_wino_conv_4x3_bwd_weights_t @@ -393,7 +344,9 @@ struct jit_avx512_core_fp32_wino_conv_4x3_bwd_weights_t assert(this->engine()->kind() == engine_kind::cpu); bool ok = true && this->set_default_params() == status::success && utils::one_of(this->desc()->prop_kind, backward_weights) - && this->desc()->alg_kind == alg_kind::convolution_winograd + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_winograd) && utils::everyone_is(data_type::f32, this->desc()->src_desc.data_type, this->desc()->diff_dst_desc.data_type, @@ -402,10 +355,21 @@ struct jit_avx512_core_fp32_wino_conv_4x3_bwd_weights_t if (!ok) return status::unimplemented; - return jit_avx512_core_fp32_wino_conv_4x3_bwd_weights_kernel:: - init_conf(jcp_, 
*this->desc(), *this->src_pd_.desc(), - *this->diff_dst_pd_.desc(), - *this->diff_weights_pd_.desc()); + status_t status = + jit_avx512_core_fp32_wino_conv_4x3_bwd_weights_kernel:: + init_conf(jcp_, *this->desc(), *this->src_pd_.desc(), + *this->diff_dst_pd_.desc(), + *this->diff_weights_pd_.desc()); + if (status != status::success) return status; + + auto scratchpad = this->scratchpad_registry().registrar(); + winograd_avx512_core::init_scratchpad(scratchpad, jcp_); + + if (status == status::success + && this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_winograd)); + + return status; } jit_conv_winograd_conf_t jcp_; @@ -428,37 +392,32 @@ struct jit_avx512_core_fp32_wino_conv_4x3_bwd_weights_t } }; - jit_avx512_core_fp32_wino_conv_4x3_bwd_weights_t(const pd_t *pd, + jit_avx512_core_fp32_wino_conv_4x3_bwd_weights_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs) - , conf_(*pd) + : cpu_primitive_t(apd, inputs, outputs, true) , kernel_(nullptr) - , scratchpad_(nullptr) { - auto jcp = conf_.jcp_; kernel_ = new jit_avx512_core_fp32_wino_conv_4x3_bwd_weights_kernel( - jcp); - scratchpad_ = new winograd::winograd_scratchpad_avx512_core_t(jcp); + pd()->jcp_); } ~jit_avx512_core_fp32_wino_conv_4x3_bwd_weights_t() { delete kernel_; - delete scratchpad_; }; typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) + virtual void execute(event_t *e) const { - if (conf_.desc()->prop_kind == prop_kind::backward_weights) { + if (pd()->desc()->prop_kind == prop_kind::backward_weights) { const auto &jcp = kernel_->jcp; switch (jcp.sched_policy) { case WSCHED_WEI_SDGtWo: - _execute_backward_weights_SDGtWo(); + _execute_backward_weights_SDGtWo(scratchpad()); break; case WSCHED_WEI_S_D_Giot_W: - _execute_backward_weights_S_D_Giot_W(); + _execute_backward_weights_S_D_Giot_W(scratchpad()); break; default: assert(jcp.sched_policy != WSCHED_INVALID); @@ -471,14 +430,13 @@ struct jit_avx512_core_fp32_wino_conv_4x3_bwd_weights_t } private: - void _execute_backward_weights_SDGtWo(); - void _execute_backward_weights_S_D_Giot_W(); + void _execute_backward_weights_SDGtWo( + const memory_tracking::grantor_t &scratchpad) const; + void _execute_backward_weights_S_D_Giot_W( + const memory_tracking::grantor_t &scratchpad) const; - pd_t conf_; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } jit_avx512_core_fp32_wino_conv_4x3_bwd_weights_kernel *kernel_; - - // Buffer required to store transforms in the frequency domain - winograd::winograd_scratchpad_avx512_core_t *scratchpad_; }; } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3_kernel.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3_kernel.cpp index 831f182..164bbe0 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3_kernel.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3_kernel.cpp @@ -62,6 +62,41 @@ int get_divisor_satisfying_cond(jit_conv_winograd_conf_t &jcp, int number, return best_divisor; } +namespace { +bool is_winograd_faster_than_direct(const jit_conv_winograd_conf_t &jcp) { + /* Determines if current winograd implementation is faster than direct. 
+ Following conditions are empirical and based on performance data */ + unsigned int ncores_per_socket = + cpu.getNumCores(Xbyak::util::IntelCpuTopologyLevel::CoreLevel); + unsigned int nthreads = mkldnn_get_max_threads(); + + if (jcp.prop_kind == prop_kind::forward_inference) { + return jcp.mb >= 4; + } else if (nthreads > ncores_per_socket) { + double src_dst_transforms_per_core = alpha * alpha + * (jcp.ic + jcp.oc) + * jcp.mb * ((jcp.oh + tile_size - 1) / tile_size) + * ((jcp.ow + tile_size - 1) / tile_size) + * sizeof(float) / 1024. / 1024. / nthreads; + double wei_transform = alpha * alpha + * jcp.ic * jcp.oc * sizeof(float) /1024. / 1024.; + + if (jcp.prop_kind == prop_kind::backward_weights) { + if (src_dst_transforms_per_core < 0.3 + || (src_dst_transforms_per_core <= 28 && wei_transform < 4)) + return false; + else + return true; + } else { + if (src_dst_transforms_per_core < 2.0 || wei_transform < 0.02) + return false; + } + } + + return jcp.mb > 8; +} +} + /* assumes 512 bits registers */ /* TODO: add support for strides */ /* TODO: handle the prefetch distance automatically */ @@ -730,16 +765,16 @@ void _jit_avx512_core_fp32_wino_conv_4x3_data_kernel vaddps(zmm_O, zmm_O, ptr[oreg_bias]); } if (with_relu) { - Opmask kmask = Opmask(7); - if (jcp.eltwise_alpha == 0) { - zmm_relu_ns = zmm_zero; + if (jcp.eltwise.alpha == 0) { + vmaxps(zmm_O, zmm_O, zmm_zero); } else { - mov(imm_addr64, float2int(jcp.eltwise_alpha)); + Opmask kmask = Opmask(7); + mov(imm_addr64, float2int(jcp.eltwise.alpha)); vmovq(xmm_relu_ns, imm_addr64); vbroadcastss(zmm_relu_ns, xmm_relu_ns); + vcmpps(kmask, zmm_O, zmm_zero, _cmp_lt_os); + vmulps(zmm_O | kmask, zmm_O, zmm_relu_ns); } - vcmpps(kmask, zmm_O, zmm_zero, _cmp_lt_os); - vmulps(zmm_O | kmask, zmm_O, zmm_relu_ns); } } if (with_sum) { @@ -1095,6 +1130,9 @@ status_t _jit_avx512_core_fp32_wino_conv_4x3_data_kernel::init_conf_common( if (!mayiuse(avx512_core)) { return status::unimplemented; } + + jcp.nthr = mkldnn_get_max_threads(); + jcp.ver = ver_avx512_core; jcp.prop_kind = cd.prop_kind; @@ -1133,6 +1171,10 @@ status_t _jit_avx512_core_fp32_wino_conv_4x3_data_kernel::init_conf_common( } // Checking conditions not supported by these kernels + if (!IMPLICATION(cd.alg_kind == alg_kind::convolution_auto, + is_winograd_faster_than_direct(jcp))) + return status::unimplemented; + if (jcp.ngroups != 1) return status::unimplemented; if ((jcp.kh != 3) || (jcp.kw != 3)) @@ -1366,28 +1408,16 @@ bool jit_avx512_core_fp32_wino_conv_4x3_fwd_kernel::post_ops_ok( jit_conv_conf_t &jcp, const primitive_attr_t &attr) { const auto &p = attr.post_ops_; - auto is_eltwise = [&](int idx) { return p.entry_[idx].is_eltwise(); }; + auto is_relu = [&](int idx) { return p.entry_[idx].is_relu(); }; auto is_sum = [&](int idx) { return p.entry_[idx].is_sum(); }; switch (p.len_) { - case 0: - return true; // no post_ops - case 1: - return true // relu or sum - && IMPLICATION(jcp.with_eltwise, is_sum(0)) - && IMPLICATION(!jcp.with_eltwise, is_eltwise(0) || is_sum(0)); - case 2: - return true // sum->relu or relu->sum - && IMPLICATION(jcp.with_eltwise, is_sum(0) && is_eltwise(1)) - && IMPLICATION(!jcp.with_eltwise, false - || (is_sum(0) && is_eltwise(1)) - || (is_eltwise(0) && is_sum(1))); - case 3: - return true // relu->sum->relu - && jcp.with_eltwise == false - && (is_eltwise(0) && is_sum(1) && is_eltwise(2)); - default: - return false; + case 0: return true; // no post_ops + case 1: return is_relu(0) || is_sum(0); // relu or sum + case 2: return (is_sum(0) && is_relu(1)) + || 
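
/* The mask-register sequence emitted above computes a leaky ReLU: lanes
   stay unchanged where non-negative and are scaled by alpha where negative,
   with a plain vmaxps shortcut when alpha == 0. The same logic in AVX-512
   intrinsics, as an illustration of what the JIT generates via Xbyak: */

#include <immintrin.h>

static inline __m512 leaky_relu(__m512 v, float alpha) {
    const __m512 zero = _mm512_setzero_ps();
    if (alpha == 0.f)
        return _mm512_max_ps(v, zero);                   // vmaxps fast path
    const __mmask16 neg = _mm512_cmp_ps_mask(v, zero, _CMP_LT_OS); // vcmpps
    return _mm512_mask_mul_ps(v, neg, v, _mm512_set1_ps(alpha));   // vmulps|k
}
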
(is_relu(0) && is_sum(1)); // sum->relu or relu->sum + case 3: return is_relu(0) && is_sum(1) && is_relu(2); // relu->sum->relu + default: return false; } return false; @@ -1396,8 +1426,7 @@ bool jit_avx512_core_fp32_wino_conv_4x3_fwd_kernel::post_ops_ok( status_t jit_avx512_core_fp32_wino_conv_4x3_fwd_kernel::init_conf( jit_conv_winograd_conf_t &jcp, const convolution_desc_t &cd, const cpu_memory_t::pd_t &src_pd, cpu_memory_t::pd_t &weights_pd, - const cpu_memory_t::pd_t &dst_pd, const primitive_attr_t &attr, - bool with_relu, float relu_negative_slope) { + const cpu_memory_t::pd_t &dst_pd, const primitive_attr_t &attr) { status_t st = init_conf_common(jcp, cd, *src_pd.desc(), *weights_pd.desc(), *dst_pd.desc()); @@ -1411,18 +1440,16 @@ status_t jit_avx512_core_fp32_wino_conv_4x3_fwd_kernel::init_conf( jcp.ntiles = jcp.mb * jcp.itiles * jcp.jtiles; jcp.with_bias = cd.bias_desc.format != memory_format::undef; - jcp.with_eltwise = with_relu; - jcp.eltwise_alpha = relu_negative_slope; if (!post_ops_ok(jcp, attr)) return status::unimplemented; const auto &p = attr.post_ops_; - if (!jcp.with_eltwise) { - /* PostOps ReLU before SUM is handled the same as ReLU primitive */ - jcp.with_eltwise = p.find(primitive_kind::eltwise, 0, 1) != -1; - jcp.eltwise_alpha = 0.f; - } + const int eltwise_ind = p.find(primitive_kind::eltwise, 0, 1); + jcp.with_eltwise = eltwise_ind != -1; + if (jcp.with_eltwise) + jcp.eltwise = p.entry_[eltwise_ind].eltwise; + jcp.with_sum = p.find(primitive_kind::sum, 0) != -1; jcp.with_relu_postsum = p.find(primitive_kind::eltwise, 1) != -1; @@ -2376,6 +2403,8 @@ status_t set_wsched_WEI_SDGtWo(jit_conv_winograd_conf_t &jcp) { jcp.dimM_block = M_blk; jcp.sched_policy = WSCHED_WEI_SDGtWo; set_jcp_WEI_params(jcp); + jcp.nthr = nstl::min(mkldnn_get_max_threads(), + jcp.tile_block); return status::success; } } @@ -2467,6 +2496,9 @@ status_t jit_avx512_core_fp32_wino_conv_4x3_bwd_weights_kernel::init_conf( else jcp.ver = ver_avx512_core; + jcp.nthr = mkldnn_get_max_threads(); + + jcp.prop_kind = cd.prop_kind; const bool with_groups = diff_weights_d.ndims() == src_d.ndims() + 1; jcp.mb = src_d.dims()[0]; jcp.ngroups = with_groups ? 
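
// The rewritten post_ops_ok() above no longer special-cases a fused-relu
// descriptor (jcp.with_eltwise is now derived from the post-op chain
// itself, as the init_conf hunk above shows), so the accepted chains
// reduce to a fixed table. A sketch of the same acceptance test over a
// plain list of kinds; the enum is illustrative, the real code inspects
// primitive_attr_t::post_ops_.
#include <vector>

enum class post_op { relu, sum };

static bool post_ops_chain_ok(const std::vector<post_op> &p) {
    using P = post_op;
    switch (p.size()) {
    case 0: return true;                              // no post ops
    case 1: return p[0] == P::relu || p[0] == P::sum; // relu or sum
    case 2: return (p[0] == P::sum && p[1] == P::relu)
                || (p[0] == P::relu && p[1] == P::sum);
    case 3: return p[0] == P::relu && p[1] == P::sum && p[2] == P::relu;
    default: return false;
    }
}
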
diff_weights_d.dims()[0] : 1; @@ -2507,6 +2539,10 @@ status_t jit_avx512_core_fp32_wino_conv_4x3_bwd_weights_kernel::init_conf( jcp.ntiles = jcp.mb * jcp.itiles * jcp.jtiles; // Winograd kernel works only for 3x3 convolution with stride 1 + if (!IMPLICATION(cd.alg_kind == alg_kind::convolution_auto, + is_winograd_faster_than_direct(jcp))) + return status::unimplemented; + if (jcp.ngroups != 1) return status::unimplemented; if ((jcp.kh != 3) || (jcp.kw != 3)) diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3_kernel.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3_kernel.hpp index eb9d7fd..c9f1559 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3_kernel.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3_kernel.hpp @@ -161,8 +161,7 @@ struct jit_avx512_core_fp32_wino_conv_4x3_fwd_kernel static status_t init_conf(jit_conv_winograd_conf_t &jcp, const convolution_desc_t &cd, const cpu_memory_t::pd_t &src_pd, cpu_memory_t::pd_t &weights_pd, const cpu_memory_t::pd_t &dst_pd, - const primitive_attr_t &attr, bool with_relu, - float relu_negative_slope); + const primitive_attr_t &attr); }; struct jit_avx512_core_fp32_wino_conv_4x3_bwd_data_kernel @@ -188,7 +187,7 @@ struct jit_avx512_core_fp32_wino_conv_4x3_bwd_weights_kernel //******************* First iter kernel ********************// this->gemm_loop_generate(true); gemm_loop_ker_first_iter = (decltype(gemm_loop_ker_first_iter))this->getCode(); - + align(); const Xbyak::uint8 *addr = getCurr(); this->src_transform_generate(); diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_i8i8_pooling.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_i8i8_pooling.cpp deleted file mode 100644 index f51c956..0000000 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_i8i8_pooling.cpp +++ /dev/null @@ -1,582 +0,0 @@ -/******************************************************************************* -* Copyright 2017-2018 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#include - -#include "mkldnn_types.h" - -#include "mkldnn_thread.hpp" -#include "utils.hpp" - -#include "jit_generator.hpp" - -#include "jit_avx512_core_i8i8_pooling.hpp" - -namespace mkldnn { -namespace impl { -namespace cpu { - -using namespace Xbyak; - -using namespace mkldnn::impl::utils; -using namespace mkldnn::impl::memory_format; -using namespace mkldnn::impl::utils; -using namespace mkldnn::impl::types; -using namespace alg_kind; - -struct jit_avx512_core_i8i8_pool_fwd_ker_t: public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_i8i8_pool_fwd_ker_t) - - struct call_params_t { - const char *src_i8; - const char *dst_i8; - size_t kw_range; - size_t kh_range; - float idivider; - }; - - Reg64 reg_ptr_src_i8 = r8; - Reg64 reg_ptr_dst_i8 = r9; - - Reg64 ki = r10; - Reg64 kj = r11; - Reg64 reg_kw = r12; - Reg64 reg_kh = r13; - Reg64 c_iter = r14; - - Reg64 aux_reg_src_h = rax; - Reg64 aux_reg_src_w = rbx; - - Reg64 reg_tmp = rdx; - - Reg64 reg_mask = r15; - - Opmask k_cmp_mask = Opmask(7); - - Opmask mask(int idx) { - return Opmask(6 - idx); - } - - Xmm xmm_tmp = Xmm(0); - Zmm vreg_tmp = Zmm(30); - Zmm vreg_zeros = Zmm(31); - - size_t sizeof_src_dt() const { return data_type_size(jpp.src_dt); } - size_t sizeof_dst_dt() const { return data_type_size(jpp.dst_dt); } - - /* max pooling */ - Zmm vreg_src(int idx) { - return Zmm(idx); - } - - Zmm vreg_dst(int idx) { - return Zmm(jpp.ur_c + idx); - } - - /* avg pooling */ - Zmm vreg_src_s32(int jj, int ll) { - return Zmm(12*jj + ll); - } - - Zmm vreg_dst_s32(int jj, int ll) { - return Zmm(12*jj + ll + 4); - } - - Zmm vreg_dst_f32(int jj, int ll) { - return Zmm(12*jj + ll + 8); - } - - void (*ker_)(const call_params_t *); - jit_pool_conf_t jpp; - - void init_tmp_reg(); - void init_mask(); - - void load_src(int jj, int ll, int c_tail); - void store_dst(int jj, int ll, int c_tail); - - void compute_avg_step(int ur_c, int c_tail); - void compute_max_step(int ur_c, int c_tail); - void compute_step(int ur_c, int c_tail); - - void compute_c_block(); - void generate(); - - static status_t init_conf(jit_pool_conf_t &jpp, - const pooling_desc_t &pd, const memory_desc_wrapper &src_d, - const memory_desc_wrapper &dst_d); - - jit_avx512_core_i8i8_pool_fwd_ker_t(const jit_pool_conf_t &jpp_) - : jpp(jpp_) { - generate(); - ker_ = reinterpret_cast(const_cast( - getCode())); - } -}; - -void jit_avx512_core_i8i8_pool_fwd_ker_t::load_src(int jj, int ll, int c_tail) { - using namespace data_type; - - int c_block = jpp.c_block; - int ur_c = jpp.ur_c; - - switch (jpp.alg) { - case pooling_max: { - auto offset = jj*c_block*sizeof_src_dt(); - if (jj == ur_c - 1 && c_tail) { - if (jpp.src_dt == data_type::s32) { - vmovups(vreg_src(jj) | mask(0), - ptr[aux_reg_src_w + offset]); - } else { - vmovdqu8(vreg_src(jj) | mask(0), - ptr[aux_reg_src_w + offset]); - } - } else { - vmovups(vreg_src(jj), ptr[aux_reg_src_w + offset]); - } - break; - } - case pooling_avg_include_padding: - case pooling_avg_exclude_padding: { - auto offset = (ll*(c_block/4) + jj*c_block)*sizeof_src_dt(); - if (jj == jpp.ur_c - 1 && c_tail) { - if (jpp.tail[ll]) { - switch (jpp.src_dt) { - case s32: - vmovups(vreg_src_s32(jj, ll) | mask(ll), - ptr[aux_reg_src_w + offset]); - break; - case s8: - vpmovsxbd(vreg_src_s32(jj, ll) | mask(ll), - ptr[aux_reg_src_w + offset]); - break; - case u8: - vpmovzxbd(vreg_src_s32(jj, ll) | mask(ll), - ptr[aux_reg_src_w + offset]); - break; - default: 
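
// The deleted i8i8 pooling kernel follows the usual mkl-dnn JIT calling
// convention: the driver fills a plain-old-data call_params_t per output
// pixel and the generated code pulls the fields out via offsetof() (the
// READ_PARAM macro further down). A minimal sketch of that contract; the
// struct and names are illustrative copies, not the library API.
#include <cstddef>

struct pool_call_params_sketch {
    const char *src_i8, *dst_i8;  // already offset to the current window
    size_t kw_range, kh_range;    // kernel extents clipped to the input
    float idivider;               // 1 / divider for average pooling
};

// Host side: fill the struct, then jump through the generated entry point.
static void invoke(void (*ker)(const pool_call_params_sketch *),
        const pool_call_params_sketch &p) {
    ker(&p);
}
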
assert(!"unsupported src data type"); - } - } - } else { - switch (jpp.src_dt) { - case s32: - vmovups(vreg_src_s32(jj, ll), - ptr[aux_reg_src_w + offset]); - break; - case s8: - vpmovsxbd(vreg_src_s32(jj, ll), - ptr[aux_reg_src_w + offset]); - break; - case u8: - vpmovzxbd(vreg_src_s32(jj, ll), - ptr[aux_reg_src_w + offset]); - break; - default: assert(!"unsupported src data type"); - } - } - break; - } - default: assert(!"unsupported algorithm"); - } -} - -void jit_avx512_core_i8i8_pool_fwd_ker_t::store_dst(int jj, int ll, - int c_tail) { - using namespace data_type; - - int c_block = jpp.c_block; - int ur_c = jpp.ur_c; - - switch(jpp.alg) { - case pooling_max: { - auto offset = jj*c_block*sizeof_dst_dt(); - if (jj == ur_c - 1 && c_tail) { - if (jpp.src_dt == data_type::s32) { - vmovups(ptr[reg_ptr_dst_i8 + offset], - vreg_dst(jj) | mask(0)); - } else { - vmovdqu8(ptr[reg_ptr_dst_i8 + offset], - vreg_dst(jj) | mask(0)); - } - } else { - vmovups(ptr[reg_ptr_dst_i8 + offset], vreg_dst(jj)); - } - break; - } - case pooling_avg_include_padding: - case pooling_avg_exclude_padding: { - auto offset = (ll*(c_block/4) + jj*c_block)*sizeof_dst_dt(); - if (jj == ur_c - 1 && c_tail) { - if (jpp.tail[ll]) { - switch (jpp.dst_dt) { - case s32: - vmovups(ptr[reg_ptr_dst_i8 + offset], - vreg_dst_s32(jj, ll) | mask(ll)); - break; - case s8: - vpmovdb(ptr[reg_ptr_dst_i8 + offset], - vreg_dst_s32(jj, ll) | mask(ll)); - break; - case u8: - vpmovusdb(ptr[reg_ptr_dst_i8 + offset], - vreg_dst_s32(jj, ll) | mask(ll)); - break; - default: assert(!"unsupported dst data_type"); - } - } - } else { - switch (jpp.dst_dt) { - case s32: - vmovups(ptr[reg_ptr_dst_i8 + offset], - vreg_dst_s32(jj, ll)); - break; - case s8: - vpmovdb(ptr[reg_ptr_dst_i8 + offset], - vreg_dst_s32(jj, ll)); - break; - case u8: - vpmovusdb(ptr[reg_ptr_dst_i8 + offset], - vreg_dst_s32(jj, ll)); - break; - default: assert(!"unsuppotred dst data_type"); - } - } - break; - } - default: assert(!"unsupported pooling algorithm"); - } -} - -void jit_avx512_core_i8i8_pool_fwd_ker_t::compute_max_step(int ur_c, int c_tail) -{ - Label l_kw, l_kh; - - int iw = jpp.iw; - int c = jpp.c; - - for (int jj = 0; jj < ur_c; jj++) - vmovups(vreg_dst(jj), vreg_tmp); - - mov(aux_reg_src_h, reg_ptr_src_i8); - - xor_(kj, kj); - L(l_kh); - { - mov(aux_reg_src_w, aux_reg_src_h); - xor_(ki, ki); - L(l_kw); - { - for (int jj = 0; jj < ur_c; jj++) { - load_src(jj, 0, c_tail); - if (jpp.src_dt == data_type::s32) { - vpcmpd(k_cmp_mask, vreg_dst(jj), vreg_src(jj), _cmp_lt_os); - vpblendmd(vreg_dst(jj) | k_cmp_mask, vreg_dst(jj), - vreg_src(jj)); - } else { - if (jpp.src_dt == data_type::s8) - vpcmpb(k_cmp_mask, vreg_dst(jj), vreg_src(jj), - _cmp_lt_os); - else - vpcmpub(k_cmp_mask, vreg_dst(jj), vreg_src(jj), - _cmp_lt_os); - vpblendmb(vreg_dst(jj) | k_cmp_mask, vreg_dst(jj), - vreg_src(jj)); - } - } - add(aux_reg_src_w, c * sizeof_src_dt()); - inc(ki); - cmp(ki, reg_kw); - jl(l_kw, T_NEAR); - } - add(aux_reg_src_h, iw * c * sizeof_src_dt()); - inc(kj); - cmp(kj, reg_kh); - jl(l_kh, T_NEAR); - } - - for (int jj = 0; jj < ur_c; jj++) - store_dst(jj, 0, c_tail); -} - -void jit_avx512_core_i8i8_pool_fwd_ker_t::compute_avg_step(int ur_c, int c_tail) -{ - using namespace data_type; - - Label l_kw, l_kh; - - int iw = jpp.iw; - int c = jpp.c; - - int num_ll = jpp.src_dt == data_type::s32 ? 
1 : 4; - - for (int jj = 0; jj < ur_c; jj++) { - for (int ll = 0; ll < 4; ll++) { - uni_vpxor(vreg_src_s32(jj, ll), - vreg_src_s32(jj, ll), vreg_src_s32(jj, ll)); - uni_vpxor(vreg_dst_s32(jj, ll), - vreg_dst_s32(jj, ll), vreg_dst_s32(jj, ll)); - } - } - - mov(aux_reg_src_h, reg_ptr_src_i8); - - xor_(kj, kj); - L(l_kh); - { - mov(aux_reg_src_w, aux_reg_src_h); - xor_(ki, ki); - L(l_kw); - { - for (int jj = 0; jj < ur_c; jj++) { - for (int ll = 0; ll < num_ll; ll++) { - load_src(jj, ll, c_tail); - vpaddd(vreg_dst_s32(jj, ll), - vreg_dst_s32(jj, ll), vreg_src_s32(jj, ll)); - } - } - add(aux_reg_src_w, c * sizeof_src_dt()); - inc(ki); - cmp(ki, reg_kw); - jl(l_kw, T_NEAR); - } - add(aux_reg_src_h, iw * c * sizeof_src_dt()); - inc(kj); - cmp(kj, reg_kh); - jl(l_kh, T_NEAR); - } - - for (int jj = 0; jj < ur_c; jj++) { - for (int ll = 0; ll < num_ll; ll++) { - vcvtdq2ps(vreg_dst_f32(jj, ll), vreg_dst_s32(jj, ll)); - vfmadd132ps(vreg_dst_f32(jj, ll), vreg_zeros, vreg_tmp); - vcvtps2dq(vreg_dst_s32(jj, ll) | T_rn_sae, vreg_dst_f32(jj, ll)); - - store_dst(jj, ll, c_tail); - } - } -} - -void jit_avx512_core_i8i8_pool_fwd_ker_t::compute_step(int ur_c, int c_tail) { - switch (jpp.alg) { - case pooling_max: - compute_max_step(ur_c, c_tail); break; - case pooling_avg_include_padding: - case pooling_avg_exclude_padding: - compute_avg_step(ur_c, c_tail); break; - default: assert(!"unsupported pooling algorithm"); - } -} - -void jit_avx512_core_i8i8_pool_fwd_ker_t::compute_c_block(){ - Label l_main_loop; - - int nb_c = jpp.nb_c; - int c_block = jpp.c_block; - int ur_c = jpp.ur_c; - int ur_c_tail = jpp.ur_c_tail; - int c_steps = nb_c / ur_c; - int c_tail = jpp.c_tail; - - xor_(c_iter, c_iter); - if (c_steps > 0) { - L(l_main_loop); { - compute_step(ur_c, 0); - add(reg_ptr_src_i8, ur_c*c_block*sizeof_src_dt()); - add(reg_ptr_dst_i8, ur_c*c_block*sizeof_dst_dt()); - inc(c_iter); - cmp(c_iter, c_steps); - jl(l_main_loop, T_NEAR); - } - } - - if (ur_c_tail != 0) { - compute_step(ur_c_tail, c_tail); - } -} - -void jit_avx512_core_i8i8_pool_fwd_ker_t::init_mask() { - for (int i = 0; i < 4; i++) { - mov(reg_mask, jpp.tail[i]); - kmovq(mask(i), reg_mask); - } -} - -void jit_avx512_core_i8i8_pool_fwd_ker_t::init_tmp_reg() { - using namespace data_type; - - switch (jpp.alg) { - case pooling_avg_include_padding: - case pooling_avg_exclude_padding: - mov(reg_tmp, ptr[abi_param1 + offsetof(call_params_t, idivider)]); - movq(xmm_tmp, reg_tmp); - vpbroadcastd(vreg_tmp, xmm_tmp); - break; - case pooling_max: - switch (jpp.src_dt) { - case s32: - mov(reg_tmp, nstl::numeric_limits::lowest()); - break; - case s8: - mov(reg_tmp, nstl::numeric_limits::lowest()); - break; - case u8: - mov(reg_tmp, nstl::numeric_limits::lowest()); - break; - default: assert(!"unsupported src data_type"); - } - - movq(xmm_tmp, reg_tmp); - if (jpp.src_dt == s32) - vpbroadcastd(vreg_tmp, xmm_tmp); - else - vpbroadcastb(vreg_tmp, xmm_tmp); - break; - default: assert(!"unsupported pooling algorithm"); - } - -} - -void jit_avx512_core_i8i8_pool_fwd_ker_t::generate() { - preamble(); - -# define READ_PARAM(reg, field) \ - mov(reg, ptr[abi_param1 + offsetof(call_params_t, field)]) - READ_PARAM(reg_ptr_src_i8, src_i8); - READ_PARAM(reg_ptr_dst_i8, dst_i8); - READ_PARAM(reg_kw, kw_range); - READ_PARAM(reg_kh, kh_range); - -# undef READ_PARAM - - init_tmp_reg(); - init_mask(); - - uni_vpxor(vreg_zeros, vreg_zeros, vreg_zeros); - - compute_c_block(); - - postamble(); -} - -status_t jit_avx512_core_i8i8_pool_fwd_ker_t::init_conf(jit_pool_conf_t &jpp, - const 
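
// Scalar reference for compute_avg_step() above: sums are kept in int32,
// scaled by the precomputed idivider (the vfmadd132ps against a zero addend
// is just a multiply), rounded to nearest via the T_rn_sae override on
// vcvtps2dq, and saturated on the down-converting store. Illustrative
// helper; lrintf models nearest rounding under the default FP mode.
#include <algorithm>
#include <cmath>
#include <cstdint>

static uint8_t avg_pool_u8_ref(const uint8_t *src, int kh, int kw,
        int row_stride, float idivider) {
    int32_t acc = 0;
    for (int y = 0; y < kh; ++y)
        for (int x = 0; x < kw; ++x)
            acc += src[y * row_stride + x];
    long v = std::lrintf(float(acc) * idivider);
    return (uint8_t)std::min(255L, std::max(0L, v)); // vpmovusdb saturates
}
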
pooling_desc_t &pd, const memory_desc_wrapper &src_d, - const memory_desc_wrapper &dst_d) { - if (!mayiuse(avx512_core)) { - return status::unimplemented; - } - - jpp.mb = src_d.dims()[0]; - jpp.c = src_d.dims()[1]; - jpp.ih = src_d.dims()[2]; - jpp.iw = src_d.dims()[3]; - jpp.oh = dst_d.dims()[2]; - jpp.ow = dst_d.dims()[3]; - - jpp.stride_h = pd.strides[0]; - jpp.stride_w = pd.strides[1]; - jpp.kh = pd.kernel[0]; - jpp.kw = pd.kernel[1]; - - jpp.t_pad = pd.padding[0][0]; - jpp.l_pad = pd.padding[0][1]; - - jpp.alg = pd.alg_kind; - - jpp.src_dt = pd.src_desc.data_type; - jpp.dst_dt = pd.dst_desc.data_type; - - jpp.c_block = 64 / (jpp.src_dt == data_type::s32 ? 4 : 1); - jpp.c_tail = jpp.c % jpp.c_block; - jpp.nb_c = jpp.c / jpp.c_block; - jpp.ur_c = 1; - jpp.ur_c_tail = jpp.nb_c - (jpp.nb_c / jpp.ur_c)*jpp.ur_c + - (jpp.c_tail != 0); - - size_t tail_mask = (1ULL << jpp.c_tail) - 1; - - switch(jpp.alg) { - case pooling_max: - jpp.tail[0] = tail_mask; - jpp.tail[1] = 0; - jpp.tail[2] = 0; - jpp.tail[3] = 0; - break; - case pooling_avg_include_padding: - case pooling_avg_exclude_padding: - jpp.tail[0] = tail_mask & 0xffff; - for (size_t i = 1, m = tail_mask; i < 4; i++) { - m = m >> 16; - jpp.tail[i] = m & 0xffff; - } - break; - default: return status::unimplemented; - } - - return status::success; -} - -status_t jit_avx512_core_i8i8_pooling_fwd_t::pd_t::jit_conf() { - return jit_avx512_core_i8i8_pool_fwd_ker_t::init_conf(jpp_, - desc_, src_pd_.desc(), dst_pd_.desc()); -} - -jit_avx512_core_i8i8_pooling_fwd_t:: -jit_avx512_core_i8i8_pooling_fwd_t(const pd_t *pd, - const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), ker_(nullptr) -{ ker_ = new jit_avx512_core_i8i8_pool_fwd_ker_t(conf_.jpp_); } - -jit_avx512_core_i8i8_pooling_fwd_t:: -~jit_avx512_core_i8i8_pooling_fwd_t() { delete ker_; } - -void jit_avx512_core_i8i8_pooling_fwd_t::execute_forward() { - auto src_i8 = reinterpret_cast(input_memory(0)); - auto dst_i8 = reinterpret_cast(memory()); - - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); - - const auto &jpp = conf_.jpp_; - - parallel_nd(jpp.mb, jpp.oh, jpp.ow, - [&](int n, int oh, int ow) { - const int ih = nstl::max(oh*jpp.stride_h - jpp.t_pad, 0); - const int iw = nstl::max(ow*jpp.stride_w - jpp.l_pad, 0); - - const int kh_start = nstl::max(0, jpp.t_pad - oh * jpp.stride_h); - const int kh_end = nstl::min(jpp.kh, - jpp.ih + jpp.t_pad - oh * jpp.stride_h); - const int kw_start = nstl::max(0, jpp.l_pad - ow * jpp.stride_w); - const int kw_end = nstl::min(jpp.kw, - jpp.iw + jpp.l_pad - ow * jpp.stride_w); - - auto p = jit_avx512_core_i8i8_pool_fwd_ker_t::call_params_t(); - p.src_i8 = &src_i8[ - src_d.blk_off(n, 0, ih, iw) * src_d.data_type_size()]; - p.dst_i8 = &dst_i8[ - dst_d.blk_off(n, 0, oh, ow) * dst_d.data_type_size()]; - p.kw_range = (size_t)(kw_end - kw_start); - p.kh_range = (size_t)(kh_end - kh_start); - p.idivider = 1.0f / ((jpp.alg == pooling_avg_exclude_padding) ? 
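
// On the tail handling in init_conf() above: for 8-bit data a zmm holds 64
// channels, but the average path widens bytes to dwords and processes four
// groups of 16 lanes, so the 64-bit channel-tail mask is split into four
// 16-bit kmask words (the max path stays at byte width and uses the full
// mask in tail[0]). Sketch of the split, assuming c_tail = c % 64 < 64:
#include <cstdint>

static void split_tail_mask(int c_tail, uint16_t tail[4]) {
    uint64_t m = (1ull << c_tail) - 1; // zero mask when there is no tail
    for (int i = 0; i < 4; ++i, m >>= 16)
        tail[i] = uint16_t(m & 0xffff);
}
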
- p.kh_range*p.kw_range : jpp.kw*jpp.kh); - - ker_->ker_(&p); - }); -} - -} -} -} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_deconvolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_deconvolution.cpp deleted file mode 100644 index 6ea1542..0000000 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_deconvolution.cpp +++ /dev/null @@ -1,602 +0,0 @@ -/******************************************************************************* -* Copyright 2018 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "jit_avx512_core_u8s8s32x_deconvolution.hpp" - -#define GET_OFF(field) offsetof(jit_deconv_call_s, field) - -namespace mkldnn { -namespace impl { -namespace cpu { - -using namespace mkldnn::impl::status; -using namespace mkldnn::impl::memory_format; -using namespace mkldnn::impl::utils; - -using namespace nstl; - -#define wht_blk_off(d, g, ...) \ - (conf_.with_groups() \ - ? (d).blk_off((g), __VA_ARGS__) \ - : (d).blk_off(__VA_ARGS__)) - -status_t jit_avx512_core_u8s8s32x_deconv_fwd_kernel::init_conf(jit_conv_conf_t &jcp, - const deconvolution_desc_t &cd, cpu_memory_t::pd_t &src_pd, - cpu_memory_t::pd_t &weights_pd, cpu_memory_t::pd_t &dst_pd, - const bool with_bias, cpu_memory_t::pd_t &bias_pd, - const primitive_attr_t &attr) { - const memory_desc_wrapper src_d(&src_pd); - const memory_desc_wrapper dst_d(&dst_pd); - const memory_desc_wrapper weights_d(&weights_pd); - const memory_desc_wrapper bias_d(&bias_pd); - - if (!(mayiuse(avx512_core) && - src_d.data_type() == data_type::u8 - && weights_d.data_type() == data_type::s8 - && one_of(dst_d.data_type(), data_type::f32, data_type::s32, - data_type::s8, data_type::u8))) - return status::unimplemented; - - jcp = zero(); - - const bool with_groups = weights_d.ndims() == src_d.ndims() + 1; - - jcp.ngroups = with_groups ? weights_d.dims()[0] : 1; - jcp.oc = dst_d.dims()[1] / jcp.ngroups; - jcp.ic = src_d.dims()[1] / jcp.ngroups; - jcp.oc_without_padding = dst_d.dims()[1] / jcp.ngroups; - jcp.ic_without_padding = src_d.dims()[1] / jcp.ngroups; - jcp.is_depthwise = true && with_groups && utils::everyone_is(1, - jcp.ic_without_padding, jcp.oc_without_padding); - - const auto w_format = with_groups - ? (jcp.is_depthwise ? 
Goihw16g : gOIhw4i16o4i) - : OIhw4i16o4i; - - if (dst_d.format() == any) - CHECK(dst_pd.set_format(nhwc)); - if (dst_d.format() != nhwc) - return status::unimplemented; - if (src_d.format() == any) - CHECK(src_pd.set_format(nhwc)); - if (src_d.format() != nhwc) - return status::unimplemented; - if (weights_d.format() == any) - CHECK(weights_pd.set_format(w_format)); - if (weights_d.format() != w_format) - return status::unimplemented; - - jcp.with_bias = with_bias; - if (jcp.with_bias) { - if (bias_d.format() == any) - CHECK(bias_pd.set_format(x)); - if (bias_d.format() != x) - return status::unimplemented; - } - - jcp.ndims = dst_d.ndims(); - jcp.prop_kind = cd.prop_kind; - jcp.mb = src_d.dims()[0]; - jcp.ih = src_d.dims()[2]; - jcp.iw = src_d.dims()[3]; - jcp.oh = dst_d.dims()[2]; - jcp.ow = dst_d.dims()[3]; - jcp.kh = weights_d.dims()[with_groups + 2]; - jcp.kw = weights_d.dims()[with_groups + 3]; - jcp.t_pad = cd.padding[0][0]; - jcp.l_pad = cd.padding[0][1]; - jcp.stride_h = cd.strides[0]; - jcp.stride_w = cd.strides[1]; - jcp.src_fmt = src_d.format(); - jcp.with_eltwise = false;/*TODO: support post-ops*/ - - if (jcp.is_depthwise) { - jcp.ch_block = 16; - jcp.oc_block = 1; - jcp.ic_block = 1; - } else { - jcp.ch_block = 1; - jcp.oc_block = 16; - jcp.ic_block = 16; - - if (jcp.ngroups == 1) { - jcp.oc = utils::rnd_up(jcp.oc_without_padding, jcp.oc_block); - jcp.ic = utils::rnd_up(jcp.ic_without_padding, jcp.ic_block); - } - if (jcp.ic % jcp.ic_block != 0) - return status::unimplemented; - } - - jcp.dilate_h = cd.dilates[0]; - jcp.dilate_w = cd.dilates[1]; - - if (!IMPLICATION(jcp.dilate_h, jcp.stride_h == 1) - || !IMPLICATION(jcp.dilate_w, jcp.stride_w == 1)) - return status::unimplemented; - - /*bottom and right :padding*/ - jcp.b_pad = (jcp.ih - 1) * jcp.stride_h + (jcp.kh - 1) * (jcp.dilate_h + 1) - - (jcp.oh + jcp.t_pad - 1); - jcp.r_pad = (jcp.iw - 1) * jcp.stride_w + (jcp.kw - 1) * (jcp.dilate_w + 1) - - (jcp.ow + jcp.l_pad - 1); - - if (!attr.post_ops_.has_default_values()) - return status::unimplemented; - - jcp.ver = ver_avx512_core; - if (mayiuse(avx512_core_vnni)) - jcp.ver = ver_vnni; - const auto &oscales = attr.output_scales_; - jcp.is_oc_scale = oscales.mask_ == 1 << 1; - - jcp.dst_dt = dst_d.data_type(); - jcp.bia_dt = jcp.with_bias ? bias_d.data_type() : data_type::undef; - jcp.typesize_bia = jcp.with_bias ? types::data_type_size(bias_d.data_type()) : 0; - jcp.typesize_in = types::data_type_size(src_d.data_type()); - jcp.typesize_out = types::data_type_size(dst_d.data_type()); - - jcp.nb_ch = div_up(jcp.ngroups, jcp.ch_block); - jcp.nb_oc = jcp.oc / jcp.oc_block; - jcp.nb_ic = jcp.ic / jcp.ic_block; - - /*kernel blocking params*/ - const int regs = jcp.ver == ver_vnni ? 31 : 29; - jcp.nb_oc_blocking = nstl::min(4, jcp.nb_oc); - for (; jcp.nb_oc_blocking > 1; jcp.nb_oc_blocking--) - if (jcp.nb_oc % jcp.nb_oc_blocking == 0 - && jcp.l_pad <= regs / (jcp.nb_oc_blocking + 1)) - break; - - jcp.ur_w = regs / (jcp.nb_oc_blocking + 1); - int l_overflow = max(0, ((jcp.kw - 1) * (jcp.dilate_w + 1) - jcp.l_pad) / jcp.stride_w); - int r_overflow = max(0, ((jcp.kw - 1) * (jcp.dilate_w + 1) - - max(0, jcp.r_pad)) / jcp.stride_w); - if (jcp.ow < jcp.ur_w) - jcp.ur_w = jcp.ow; - for (; jcp.ur_w > 1; jcp.ur_w--) - if (jcp.ur_w % jcp.stride_w == 0 - && max(l_overflow, - r_overflow - (jcp.ow % jcp.ur_w) / jcp.stride_w) * jcp.stride_w <= jcp.ur_w) - break; - jcp.ur_w_tail = jcp.ow % jcp.ur_w; - - jcp.loop_order = jcp.ngroups > 1 ? 
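
// The implicit bottom/right padding computed above comes from the
// deconvolution shape identity: the transposed convolution generates
// (ih - 1) * stride + (kh - 1) * (dilate + 1) + 1 rows before cropping, so
// whatever oh and t_pad do not account for must be b_pad (and likewise
// r_pad along the width). Sketch with names mirroring the jcp fields:
static int deconv_bottom_pad(int ih, int oh, int kh, int stride_h,
        int dilate_h, int t_pad) {
    return (ih - 1) * stride_h + (kh - 1) * (dilate_h + 1)
            - (oh + t_pad - 1);
}
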
loop_ngc : loop_cgn; - return status::success; -} - -void jit_avx512_core_u8s8s32x_deconv_fwd_kernel::compute_ker( - int ur_w, int l_overflow, int r_overflow, ker_block_t last_block) { - - int ch_block_all = jcp.ch_block * jcp.ic_block * jcp.oc_block; - int shift_src_ih = jcp.typesize_in * (jcp.dilate_h + 1) - * jcp.iw * jcp.ngroups * jcp.ic_without_padding; - int shift_filt_kh = jcp.typesize_in * jcp.kw * jcp.stride_h * ch_block_all; - - auto src_offset = [=] (int oj, int icb, int ki) { - return jcp.typesize_in * - (((oj + jcp.l_pad - ki * (jcp.dilate_w + 1)) / jcp.stride_w) * jcp.ngroups * jcp.ic_without_padding + icb * 4); - }; - - auto kernel_offset = [=] (int ocb, int icb, int ki) { - return jcp.typesize_in * - (ocb * jcp.nb_ic * jcp.kh * jcp.kw * ch_block_all + icb * jcp.oc_block * jcp.ic_block/4 - + ki * ch_block_all); - }; - - auto compute = [=](zmm_t vreg_acc, zmm_t vreg_wei, zmm_t vreg_src) { - if (jcp.ver == ver_vnni) { - vpdpbusd(vreg_acc, vreg_src, vreg_wei); - } else if (jcp.is_depthwise) { - vpmulld(zmm_tmp, vreg_src, vreg_wei); - vpaddd(vreg_acc, vreg_acc, zmm_tmp); - } else { - vpmaddubsw(zmm_tmp, vreg_src, vreg_wei); - vpmaddwd(zmm_tmp, zmm_tmp, zmm_one); - vpaddd(vreg_acc, vreg_acc, zmm_tmp); - } - }; - - mov(aux_reg_src, reg_src); - mov(aux_reg_filt, reg_filt); - mov(reg_kj, reg_kh); - Xbyak::Label kh_loop_label; - L(kh_loop_label); { - for (int ki = 0; ki < jcp.kw; ki++) { - int jj_start = get_ow_start(ki, l_overflow); - int jj_end = get_ow_end(ur_w, ki, r_overflow); - int tail_size = jcp.ic_without_padding % 4; - int n_ic_blocks = jcp.is_depthwise - ? 1 - : (last_block & ~no_last_block - ? div_up(jcp.ic_without_padding % jcp.ic_block, 4) - : jcp.ic_block / 4); - for (int icb1 = 0; icb1 < n_ic_blocks; icb1++) { - for (int jj = jj_start; jj < jj_end; jj += jcp.stride_w) { - assert((jj + jcp.l_pad - ki) % jcp.stride_w == 0); - - int aux_src_off = src_offset(jj, icb1, ki); - if (jcp.is_depthwise) { - vpmovzxbd(zmm_inp(jj, jcp.nb_oc_blocking), - EVEX_compress_addr(aux_reg_src, aux_src_off)); - } else if ((last_block & last_sp_block) - && tail_size != 0 && icb1 == n_ic_blocks - 1) { - xmm_t xmm_tmp = xmm_t(zmm_inp(jj, jcp.nb_oc_blocking).getIdx()); - for (int r = 0; r < tail_size; ++r) - vpinsrb(xmm_tmp, xmm_tmp, - ptr[aux_reg_src + aux_src_off + r], r); - vpbroadcastd(zmm_inp(jj, jcp.nb_oc_blocking), xmm_tmp); - } else { - vpbroadcastd(zmm_inp(jj, jcp.nb_oc_blocking), - EVEX_compress_addr(aux_reg_src, aux_src_off)); - } - } - - for (int ocb = 0; ocb < jcp.nb_oc_blocking; ocb++) { - int aux_filt_off = kernel_offset(ocb, icb1, ki); - if (jj_end - jj_start > 0) { - if (jcp.is_depthwise) - vpmovsxbd(zmm_wei, - EVEX_compress_addr(aux_reg_filt, aux_filt_off)); - else - vmovups(zmm_wei, - EVEX_compress_addr(aux_reg_filt, aux_filt_off)); - } - for (int jj = jj_start; jj < jj_end; jj += jcp.stride_w) { - compute(zmm_out(jj, ocb), - zmm_wei, zmm_inp(jj, jcp.nb_oc_blocking)); - } - } - } - } - sub(aux_reg_src, shift_src_ih); - add(aux_reg_filt, shift_filt_kh); - dec(reg_kj); - cmp(reg_kj, 0); - jg(kh_loop_label, T_NEAR); - } -} - -void jit_avx512_core_u8s8s32x_deconv_fwd_kernel::prepare_output(int ur_w) { - for (int ocb = 0; ocb < jcp.nb_oc_blocking; ocb++) { - for (int ur = 0; ur < ur_w; ur++) { - zmm_t zmm = zmm_out(ur, ocb); - vpxord(zmm, zmm, zmm); - } - } -} - -void jit_avx512_core_u8s8s32x_deconv_fwd_kernel::cvt2ps(data_type_t type_in, - zmm_t zmm_in, const Xbyak::Operand &op, bool mask_flag) { - zmm_t zmm = mask_flag ? 
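
// Scalar meaning of the compute() lambda above: a 4-deep u8*s8 dot product
// accumulated into int32. On VNNI this is a single vpdpbusd; the emulated
// path (vpmaddubsw, then vpmaddwd against a vector of 1s, then vpaddd) is
// the same computation except that vpmaddubsw saturates the intermediate
// int16 pair sums. Reference sketch:
#include <cstdint>

static int32_t dot4_u8s8_ref(const uint8_t a[4], const int8_t b[4],
        int32_t acc) {
    for (int i = 0; i < 4; ++i)
        acc += int32_t(a[i]) * int32_t(b[i]);
    return acc;
}
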
zmm_in | ktail_mask | T_z : zmm_in; - switch (type_in) { - case data_type::f32: - case data_type::s32: vmovups(zmm, op); break; - case data_type::s8: vpmovsxbd(zmm, op); break; - case data_type::u8: vpmovzxbd(zmm, op); break; - default: assert(!"unsupported data type"); - } - if (type_in != data_type::f32) - vcvtdq2ps(zmm_in, zmm_in); -} - -void jit_avx512_core_u8s8s32x_deconv_fwd_kernel::store_output(int ur_w, bool last_oc_block) { - mov(reg_bias, ptr[param1 + GET_OFF(bias)]); - mov(reg_ptr_scales, ptr[param1 + GET_OFF(scales)]); - - vpxord(zmm_zero, zmm_zero, zmm_zero); - for (int ocb = 0; ocb < jcp.nb_oc_blocking; ocb++) { - const bool mask_flag = last_oc_block && ocb == jcp.nb_oc_blocking - 1; - int scale_offset = jcp.is_oc_scale * (sizeof(float) * ocb * jcp.oc_block); - - auto zmm_bias = zmm_tmp; - if (jcp.with_bias) { - int bias_offset = jcp.typesize_bia * ocb * jcp.oc_block; - auto bias_addr = EVEX_compress_addr(reg_bias, bias_offset); - cvt2ps(jcp.bia_dt, zmm_bias, bias_addr, mask_flag); - } - - for (int ur = 0; ur < ur_w; ur++) { - zmm_t zmm = zmm_out(ur, ocb); - vcvtdq2ps(zmm, zmm); - if (jcp.with_bias) vaddps(zmm, zmm, zmm_bias); - zmm_t mask_zmm = mask_flag - ? zmm | ktail_mask | T_z - : zmm; - vmulps(mask_zmm, zmm, - EVEX_compress_addr(reg_ptr_scales, scale_offset)); - - if (jcp.dst_dt == data_type::u8) vmaxps(zmm, zmm_zero, zmm); - - if (jcp.dst_dt != data_type::f32) { - if (attr_.round_mode_ == round_mode::nearest) - vcvtps2dq(zmm | T_rn_sae, zmm); - else if (attr_.round_mode_ == round_mode::down) - vcvtps2dq(zmm | T_rd_sae, zmm); - else - assert(!"unimplemented"); - } - } - for (int ur = 0; ur < ur_w; ur++) { - int aux_dst_off = jcp.typesize_out - * (ur * jcp.ngroups * jcp.oc_without_padding + ocb * jcp.oc_block); - auto addr = EVEX_compress_addr(reg_dst, aux_dst_off); - - zmm_t zmm = zmm_out(ur, ocb); - zmm_t r_zmm = mask_flag - ? zmm | ktail_mask - : zmm; - switch (jcp.dst_dt) { - case data_type::f32: - case data_type::s32: vmovups(addr, r_zmm); break; - case data_type::s8: vpmovsdb(addr, r_zmm); break; - case data_type::u8: vpmovusdb(addr, r_zmm); break; - default: assert(!"unknown dst_dt"); - } - } - } -} - -void jit_avx512_core_u8s8s32x_deconv_fwd_kernel::compute_loop( - int ur_w, int l_overflow, int r_overflow, bool is_last_sp_block) { - - int shift_src_icb = jcp.typesize_in * jcp.ic_block; - int shift_filt_icb = jcp.typesize_in * jcp.kh * jcp.kw * jcp.ic_block * jcp.oc_block; - - prepare_output(ur_w); - - Xbyak::Label icb_loop_label; - mov(reg_icb, jcp.nb_ic); - L(icb_loop_label); { - - if (jcp.ic_without_padding != jcp.ic) { - Xbyak::Label common_ker, end_ker; - cmp(reg_icb, 1); - jg(common_ker, T_NEAR); - - compute_ker(ur_w, l_overflow, r_overflow, - is_last_sp_block ? 
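
// The store_output() pipeline above in scalar form: int32 accumulator to
// float, add the (converted) bias, multiply by the per-channel output
// scale, clamp at zero when the destination is u8, round per the attr's
// round_mode (nearest shown here; round_mode::down maps to the T_rd_sae
// override), and saturate on the down-converting store. Illustrative only.
#include <algorithm>
#include <cmath>
#include <cstdint>

static uint8_t requantize_u8_ref(int32_t acc, float bias, float scale) {
    float v = (float(acc) + bias) * scale;
    v = std::max(v, 0.f); // u8 destination: clamp negatives first
    long r = std::lrintf(v);
    return (uint8_t)std::min(255L, std::max(0L, r));
}
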
last_sp_block : last_ic_block); - jmp(end_ker, T_NEAR); - - L(common_ker); - compute_ker(ur_w, l_overflow, r_overflow, no_last_block); - - L(end_ker); - } else { - compute_ker(ur_w, l_overflow, r_overflow, no_last_block); - } - - add(reg_src, shift_src_icb); - add(reg_filt, shift_filt_icb); - dec(reg_icb); - cmp(reg_icb, 0); - jg(icb_loop_label, T_NEAR); - } - sub(reg_src, jcp.nb_ic * shift_src_icb); - sub(reg_filt, jcp.nb_ic * shift_filt_icb); - - if (jcp.ngroups % jcp.ch_block != 0 || jcp.oc_without_padding != jcp.oc) { - Xbyak::Label common_store, end_store; - mov(reg_oc_blocks, ptr[param1 + GET_OFF(oc_blocks)]); - if (jcp.is_depthwise) - cmp(reg_oc_blocks, jcp.nb_ch - 1); - else - cmp(reg_oc_blocks, jcp.nb_oc - jcp.nb_oc_blocking); - jne(common_store, T_NEAR); - - store_output(ur_w, true); - jmp(end_store, T_NEAR); - - L(common_store); - store_output(ur_w, false); - - L(end_store); - - } else { - store_output(ur_w, false); - } -} - -void jit_avx512_core_u8s8s32x_deconv_fwd_kernel::generate() { - preamble(); - - Xbyak::Reg16 _t = reg_scratch.cvt16(); - mov(_t, 0x1); - vpbroadcastw(zmm_one, _t); - - if (jcp.ngroups % jcp.ch_block != 0 || jcp.oc_without_padding != jcp.oc) { - int tail_size = jcp.is_depthwise - ? jcp.ngroups % jcp.ch_block - : jcp.oc_without_padding % jcp.oc_block; - int mask = (1 << tail_size) - 1; - Xbyak::Reg32 regw_tmp = reg_nur_w.cvt32(); - mov(regw_tmp, mask); - kmovw(ktail_mask, regw_tmp); - } - - mov(reg_src, ptr[param1 + GET_OFF(src)]); - mov(reg_filt, ptr[param1 + GET_OFF(filt)]); - mov(reg_dst, ptr[param1 + GET_OFF(dst)]); - mov(reg_kh, ptr[param1 + GET_OFF(kh_padding)]); - - int dst_shift = jcp.typesize_out * jcp.ur_w * jcp.ngroups * jcp.oc_without_padding; - int src_shift = jcp.typesize_in * (jcp.ur_w / jcp.stride_w) * jcp.ngroups * jcp.ic_without_padding; - - int l_overflow = max(0, ((jcp.kw - 1) * (jcp.dilate_w + 1) - jcp.l_pad) / jcp.stride_w); - int r_overflow = max(0, ((jcp.kw - 1) * (jcp.dilate_w + 1) - - max(0, jcp.r_pad)) / jcp.stride_w); - - int r_overflow1 = nstl::max(0, ((jcp.kw -1) * (jcp.dilate_w + 1) - - nstl::max(0, jcp.r_pad) - jcp.ur_w_tail) / jcp.stride_w); - int nur_w = jcp.ow / jcp.ur_w; - if (r_overflow1 > 0) nur_w--; - - if (jcp.ur_w == jcp.ow) { - compute_loop(jcp.ur_w, l_overflow, r_overflow, true); - } else if (nur_w == 0) { - compute_loop(jcp.ur_w, l_overflow, r_overflow1, jcp.ur_w_tail == 0); - add(reg_src, src_shift); - add(reg_dst, dst_shift); - if (jcp.ur_w_tail != 0) - compute_loop(jcp.ur_w_tail, 0, r_overflow, true); - } else { - xor_(reg_nur_w, reg_nur_w); - if (l_overflow > 0) { - compute_loop(jcp.ur_w, l_overflow, 0, false); - add(reg_src, src_shift); - add(reg_dst, dst_shift); - inc(reg_nur_w); - } - if ((l_overflow <= 0 && nur_w > 0) - || (l_overflow > 0 && nur_w > 1)) { - Xbyak::Label ow_loop_label; - L(ow_loop_label); { - compute_loop(jcp.ur_w, 0, 0, false); - add(reg_src, src_shift); - add(reg_dst, dst_shift); - inc(reg_nur_w); - cmp(reg_nur_w, nur_w); - jl(ow_loop_label, T_NEAR); - } - } - if (r_overflow1 > 0) { - compute_loop(jcp.ur_w, 0, r_overflow1, jcp.ur_w_tail == 0); - add(reg_src, src_shift); - add(reg_dst, dst_shift); - } - if (jcp.ur_w_tail != 0) { - compute_loop(jcp.ur_w_tail, 0, r_overflow, true); - } - } - postamble(); -} - -template -void _jit_avx512_core_u8s8s32x_deconvolution_fwd_t:: -execute_forward() -{ - auto src = reinterpret_cast(this->input_memory(0)); - auto weights = reinterpret_cast(this->input_memory(1)); - auto bias = reinterpret_cast(this->input_memory(2)); - auto dst = 
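
// How generate() above tiles the output width in the general ow > ur_w
// case: peel one ur_w block on the left when l_overflow > 0, one on the
// right when r_overflow1 > 0, run the remaining full blocks in the ow
// loop, then emit the ur_w_tail remainder. A host-side sketch of that plan
// (illustrative; the ow <= ur_w and nur_w == 0 cases are handled by the
// separate branches above):
struct ow_plan { int left_peel, full_blocks, right_peel, tail; };

static ow_plan plan_ow(int ow, int ur_w, int ur_w_tail,
        int l_overflow, int r_overflow1) {
    int nur_w = ow / ur_w - (r_overflow1 > 0 ? 1 : 0);
    ow_plan p;
    p.left_peel = l_overflow > 0 ? 1 : 0;
    p.right_peel = r_overflow1 > 0 ? 1 : 0;
    p.full_blocks = nur_w - p.left_peel;
    p.tail = ur_w_tail;
    return p;
}
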
reinterpret_cast(this->memory()); - - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); - const memory_desc_wrapper bias_d(conf_.weights_pd(1)); - - auto &jcp = kernel_->jcp; - - int oc_chunks = jcp.nb_oc / jcp.nb_oc_blocking; - int nb_groups = jcp.nb_ch; - - size_t src_h_stride = src_d.blk_off(0, 0, 1); - size_t dst_h_stride = dst_d.blk_off(0, 0, 1); - size_t wht_kh_stride = wht_blk_off(weights_d, 0, 0, 0, 1); - - const auto &oscales = conf_.attr()->output_scales_; - - parallel(0, - [&](const int ithr, const int nthr) { - int start{0}, end{0}; - int work_amount = jcp.mb * nb_groups * oc_chunks * jcp.oh; - balance211(work_amount, nthr, ithr, start, end); - - auto p = jit_deconv_call_s(); - - /*loop order = cgn*/ - int n{0}, g{0}, occ{0}, oh_s{0}; - if (jcp.loop_order == loop_ngc) - nd_iterator_init(start, n, jcp.mb, g, nb_groups, occ, oc_chunks, - oh_s, jcp.oh); - else if (jcp.loop_order == loop_cgn) - nd_iterator_init(start, occ, oc_chunks, g, nb_groups, n, jcp.mb, - oh_s, jcp.oh); - else - assert(!"unsupported loop order"); - while (start < end) { - - int ocb = occ * jcp.nb_oc_blocking; - int g_oc = (g * jcp.ch_block * jcp.nb_oc + ocb) * jcp.oc_block; - int g_ic = g * jcp.ch_block * jcp.ic; - int work_rem = end - start; - int oh_e = oh_s + work_rem > jcp.oh ? jcp.oh : oh_s + work_rem; - - auto dst_w = dst + dst_d.blk_off(n, g_oc); - auto src_w = src + src_d.blk_off(n, g_ic); - auto wht_w = weights + wht_blk_off(weights_d, g, ocb, 0); - auto bias_w = jcp.with_bias - ? bias + (bias_d.blk_off(g_oc) * jcp.typesize_bia) - : 0; - - auto scales = &oscales.scales_[jcp.is_oc_scale * g_oc]; - for (int oj = oh_s; oj < oh_e; oj++) { - int ih_max, kh_lo, kh_len; - if (jcp.dilate_h != 0 && jcp.stride_h == 1) { - int dilate_h = jcp.dilate_h + 1; - // Note: use div_up to account for "holes" in filter - int o_t_overflow - = div_up(max(0, (jcp.kh - 1) * dilate_h - - oj - jcp.t_pad), dilate_h); - int o_b_overflow - = div_up(max(0, (jcp.kh - 1) * dilate_h + 1 - - jcp.ih + oj - jcp.b_pad), dilate_h); - kh_len = jcp.kh - o_t_overflow - o_b_overflow; - kh_lo = o_b_overflow; - ih_max = oj + jcp.t_pad - o_b_overflow * dilate_h; - } else { - int o_t_overflow = max(0, - (jcp.kh - (oj + 1 + jcp.t_pad)) / jcp.stride_h); - int o_b_overflow = max(0, - ((oj + 1 + jcp.kh - 1) - - (jcp.oh + jcp.b_pad)) / jcp.stride_h); - int overflow_kh_hi = jcp.kh - 1 - - abs(jcp.oh + jcp.b_pad - (oj + 1)) % jcp.stride_h; - int overflow_kh_lo = ((oj + 1 + jcp.t_pad) - 1) % jcp.stride_h; - - kh_len = (overflow_kh_hi - overflow_kh_lo) / jcp.stride_h - + 1 - o_t_overflow - o_b_overflow; - kh_lo = overflow_kh_lo + o_b_overflow * jcp.stride_h; - ih_max = (oj + jcp.t_pad - kh_lo) / jcp.stride_h; - } - - p.src = src_w + ih_max * src_h_stride; - p.dst = dst_w + oj * dst_h_stride; - p.filt = wht_w + kh_lo * wht_kh_stride; - p.bias = bias_w; - p.kh_padding = kh_len; - p.scales = scales; - p.oc_blocks = jcp.is_depthwise ? 
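
// balance211() above hands each of nthr threads an almost-equal slice of
// the flattened (mb x groups x oc-chunks x oh) iteration space; the first
// work % nthr threads get one extra item. nd_iterator_init() then maps the
// flat [start, end) range back onto the loop nest in the chosen loop
// order. Sketch of the split (the real helper lives in the mkl-dnn utils):
static void balance_sketch(int work, int nthr, int ithr,
        int &start, int &end) {
    int base = work / nthr, rem = work % nthr;
    start = ithr * base + (ithr < rem ? ithr : rem);
    end = start + base + (ithr < rem ? 1 : 0);
}
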
g : ocb; - kernel_->jit_ker(&p); - } - if (jcp.loop_order == loop_ngc) - nd_iterator_jump(start, end, - n, jcp.mb, g, nb_groups, occ, oc_chunks, oh_s, jcp.oh); - else if (jcp.loop_order == loop_cgn) - nd_iterator_jump(start, end, - occ, oc_chunks, g, nb_groups, n, jcp.mb, oh_s, jcp.oh); - else - assert(!"unsupported loop order"); - } - }); -} - -template struct _jit_avx512_core_u8s8s32x_deconvolution_fwd_t; -template struct _jit_avx512_core_u8s8s32x_deconvolution_fwd_t; -template struct _jit_avx512_core_u8s8s32x_deconvolution_fwd_t; -template struct _jit_avx512_core_u8s8s32x_deconvolution_fwd_t; -} -} -} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_wino_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_wino_convolution.cpp index 45f516c..1377290 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_wino_convolution.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_wino_convolution.cpp @@ -17,6 +17,7 @@ #include #include "c_types_map.hpp" +#include "memory_tracking.hpp" #include "cpu_convolution_pd.hpp" #include "cpu_engine.hpp" #include "mkldnn_thread.hpp" @@ -33,6 +34,7 @@ namespace impl { namespace cpu { using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; using namespace mkldnn::impl::utils; using namespace Xbyak; @@ -100,7 +102,6 @@ struct jit_avx512_core_u8s8s32x_wino_conv_src_trans_t: public jit_generator { return Opmask(3 + id); } - Reg64 reg_ptr_offset = r15; Reg64 reg_ptr_src = r14; Reg64 reg_ptr_dst = r13; @@ -117,12 +118,49 @@ struct jit_avx512_core_u8s8s32x_wino_conv_src_trans_t: public jit_generator { Reg64 reg_scratch_src_alpha = rdx; Xmm xmm_src_alpha = Xmm(0); Zmm zmm_src_alpha = Zmm(0); + + Reg64 reg_shift = rax; + Xmm xmm_shift = Xmm(1); + Xmm xmm_zero = Xmm(0); + + Reg64 reg_maskx = rbx; + Reg64 reg_masky = rsi; + Reg64 reg_nomask = reg_maskx; }; void jit_avx512_core_u8s8s32x_wino_conv_src_trans_t::generate() { Label ic_block_label; + Label end_label; + Label mask_label; + Label nomask_label; + + auto load_src = [=](bool mask) { + for (int y = 0; y < jcp.alpha; y++) { + if (mask) + kmovw(y_mask, ptr[reg_ptr_v_y_masks + sizeof(uint16_t) * y]); + for (int x = 0; x < jcp.alpha; x++) { + Zmm zmm_i = zmm_inp(y * jcp.alpha + x); + Xmm vreg_i = vreg_inp(y * jcp.alpha + x); + int inp_offset = sizeof(uint8_t) + * ((-jcp.t_pad + y) * jcp.iw * jcp.ic + + (-jcp.l_pad + x) * jcp.ic); + if (mask) { + kandw(r_mask, y_mask, x_mask(x)); + vmovdqu8(vreg_i | r_mask | T_z, + EVEX_compress_addr(reg_aux_ptr_src, inp_offset)); + } else { + vmovdqu8(vreg_i, + EVEX_compress_addr(reg_aux_ptr_src, inp_offset)); + } + vpmovzxbd(zmm_i, vreg_i); // to int32 + vcvtdq2ps(zmm_i, zmm_i); // to fp32 + vmulps(zmm_i, zmm_i, zmm_src_alpha); // *alpha + vcvtps2dq(zmm_i | T_rn_sae, zmm_i); // to int32 + vpmovusdb(vreg_i, zmm_i); // to u8 + } + } + }; - int out_offset = 0, inp_offset = 0; preamble(); # define READ_PARAM(reg, field) \ @@ -133,14 +171,24 @@ void jit_avx512_core_u8s8s32x_wino_conv_src_trans_t::generate() { READ_PARAM(reg_ptr_v_x_masks, v_x_masks); # undef READ_PARAM - xor_(eax, eax); - mov(ax, (int8_t)-128); + mov(reg_maskx, ptr[reg_ptr_v_x_masks]); + mov(reg_masky, ptr[reg_ptr_v_y_masks]); + test(reg_maskx, reg_maskx); + jz(end_label, T_NEAR); // skip kernel if x mask is all 0's + test(reg_masky, reg_masky); + jz(end_label, T_NEAR); // skip kernel if y mask is all 0's + and_(reg_maskx, reg_masky); + mov(reg_nomask, 
reg_maskx); + not_(reg_nomask); // zero if x and y masks are all 1's + + xor_(reg_shift, reg_shift); + mov(reg_shift.cvt8(), (int8_t)-128); mov(reg_aux_ptr_src, reg_ptr_src); mov(reg_aux_ptr_dst, reg_ptr_dst); for (int i = 0; i < jcp.alpha; i++) { - kmovw(x_mask(i), ptr[reg_ptr_v_x_masks + sizeof(int16_t) * i]); + kmovw(x_mask(i), ptr[reg_ptr_v_x_masks + sizeof(uint16_t) * i]); } mov(reg_scratch_src_alpha, float2int(adj_src_scale)); @@ -151,24 +199,14 @@ void jit_avx512_core_u8s8s32x_wino_conv_src_trans_t::generate() { vmovq(xmm_src_alpha, reg_scratch_src_alpha); vbroadcastss(zmm_src_alpha, xmm_src_alpha); - for(int y = 0; y < jcp.alpha; y++) { - kmovw(y_mask, ptr[reg_ptr_v_y_masks + sizeof(int16_t) * y]); - for(int x = 0; x < jcp.alpha; x++) { - Zmm zmm_i = zmm_inp(y*jcp.alpha + x); - Xmm vreg_i = vreg_inp(y*jcp.alpha + x); - vpxord(vreg_i, vreg_i, vreg_i); - kandw(r_mask, y_mask, x_mask(x)); - inp_offset = sizeof(uint8_t) * - ((-jcp.t_pad + y) * jcp.iw * jcp.ic - + (-jcp.l_pad + x) * jcp.ic); - vmovdqu8(vreg_i | r_mask, EVEX_compress_addr(reg_aux_ptr_src, inp_offset)); - vpmovzxbd(zmm_i, vreg_i); // to int32 - vcvtdq2ps(zmm_i, zmm_i); // to fp32 - vmulps(zmm_i, zmm_i, zmm_src_alpha); // *alpha - vcvtps2dq(zmm_i | T_rn_sae, zmm_i); // to int32 - vpmovusdb(vreg_i, zmm_i); // to u8 - } - } + test(reg_nomask, reg_nomask); + jz(nomask_label, T_NEAR); + load_src(true); + jmp(mask_label, T_NEAR); + L(nomask_label); + load_src(false); + L(mask_label); + for(int y = 0; y < 4; y++) { vpsubb(vreg_tmp(y*4+0), vreg_inp(y*4+0), vreg_inp(y*4+2)); vpaddb(vreg_tmp(y*4+1), vreg_inp(y*4+1), vreg_inp(y*4+2)); @@ -182,12 +220,12 @@ void jit_avx512_core_u8s8s32x_wino_conv_src_trans_t::generate() { vpsubb(vreg_out(x+3*4), vreg_tmp(x+4*1), vreg_tmp(x+4*3)); } - movd(Xmm(1), eax); - pxor(Xmm(0), Xmm(0)); - pshufb(Xmm(1), Xmm(0)); + vmovd(xmm_shift, reg_shift.cvt32()); + vpxor(xmm_zero, xmm_zero, xmm_zero); + vpshufb(xmm_shift, xmm_shift, xmm_zero); for (int i = 0; i < 16; i++) { - out_offset = sizeof(uint8_t) * (jcp.inp_stride * i); + int out_offset = sizeof(uint8_t) * (jcp.inp_stride * i); if (i != unsign_val_in_wino_domain) vpsubb(vreg_out(i), vreg_out(i), Xmm(1)); vmovups(EVEX_compress_addr(reg_aux_ptr_dst, out_offset), vreg_out(i)); @@ -199,6 +237,7 @@ void jit_avx512_core_u8s8s32x_wino_conv_src_trans_t::generate() { dec(reg_ic_block); jnz(ic_block_label, T_NEAR); + L(end_label); postamble(); } @@ -294,7 +333,6 @@ bool jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t::maybe_relu(int position) { if (position == 0) { /* relu before sum */ return false - || jcp.with_relu || p.contain(eltwise, 0) || (jcp.dst_dt == data_type::u8 && !p.contain(sum, 0)); } else if (position == 1) { @@ -362,7 +400,7 @@ void jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t::generate() { vmulps(vreg_bias, vreg_bias, zmm_bias_alpha); // *alpha } for(int y = 0; y < jcp.m; y++) { - kmovw(y_mask, ptr[ reg_ptr_v_y_masks + sizeof(int16_t) * y ]); + kmovw(y_mask, ptr[ reg_ptr_v_y_masks + sizeof(uint16_t) * y ]); for(int x = 0; x < jcp.m; x++) { kandw(r_mask, y_mask, x_mask(x)); @@ -442,11 +480,9 @@ void jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t::generate() { mov(reg_aux_ptr_dst, reg_ptr_dst); vpxord(vreg_zero, vreg_zero, vreg_zero); - for (int i = 0; i < jcp.alpha * jcp.alpha; i++) - vpxord(vreg_inp(i), vreg_inp(i), vreg_inp(i)); - for (int i = 0; i < jcp.alpha; i++) - kmovw(x_mask(i), ptr[reg_ptr_v_x_masks + sizeof(int16_t) * i]); + for (int i = 0; i < jcp.m; i++) + kmovw(x_mask(i), ptr[reg_ptr_v_x_masks + sizeof(uint16_t) * i]); int 
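
// The new mask preamble above loads all four 16-bit row/column kmask words
// with one 64-bit mov each and classifies the tile before any transform
// work: fully padded tiles skip the kernel entirely, fully interior tiles
// take the unmasked load path, and only border tiles pay for masked loads.
// Scalar sketch of the classification:
#include <cstdint>

enum class tile_path { skip, unmasked, masked };

static tile_path classify_tile(uint64_t x_masks, uint64_t y_masks) {
    if (x_masks == 0 || y_masks == 0) return tile_path::skip;
    if (~(x_masks & y_masks) == 0) return tile_path::unmasked; // all ones
    return tile_path::masked;
}
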
oc_blocks = jcp.oc / load_block; mov(reg_oc_block, oc_blocks); @@ -461,9 +497,6 @@ void jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t::generate() { dec(reg_oc_block); jnz(oc_block_label, T_NEAR); - sub(reg_ptr_scales, jcp.is_oc_scale * sizeof(float) * load_block); - sub(reg_ptr_bias, oc_blocks * sizeof(jcp.typesize_bia) * load_block); - postamble(); } @@ -498,8 +531,7 @@ struct jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t: public jit_generator { jit_conv_conf_2x3_wino_t &jcp, const convolution_desc_t &cd, cpu_memory_t::pd_t &src_pd, cpu_memory_t::pd_t &weights_pd, cpu_memory_t::pd_t &dst_pd, cpu_memory_t::pd_t &bias_pd, - const primitive_attr_t &attr, - bool with_relu, float relu_negative_slope); + const primitive_attr_t &attr); Zmm vreg_out(int n, int m) { const int id_reg_out = n * jcp.m_block + m; @@ -536,26 +568,14 @@ bool jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t::post_ops_ok( using namespace primitive_kind; const auto &p = attr.post_ops_; - auto is_relu = [&](int idx) { - return p.entry_[idx].kind == eltwise - && p.entry_[idx].eltwise.scale == 1. - && p.entry_[idx].eltwise.alg == alg_kind::eltwise_relu - && p.entry_[idx].eltwise.alpha == 0.; - }; + auto is_relu = [&](int idx) { return p.entry_[idx].is_relu(); }; - switch (p.len_) { + switch (p.len_) { case 0: return true; - case 1: return true - && IMPLICATION(jcp.with_relu, p.contain(sum, 0)) - && IMPLICATION(!jcp.with_relu, is_relu(0) || p.contain(sum, 0)); - case 2: return true - && IMPLICATION(jcp.with_relu, p.contain(sum, 0) && is_relu(1)) - && IMPLICATION(!jcp.with_relu, false - || (p.contain(sum, 0) && is_relu(1)) - || (p.contain(sum, 1) && is_relu(0))); - case 3: return true - && jcp.with_relu == false - && (is_relu(0) && p.contain(sum, 1) && is_relu(2)); + case 1: return is_relu(0) || p.contain(sum, 0); + case 2: return (p.contain(sum, 0) && is_relu(1)) || + (p.contain(sum, 1) && is_relu(0)); + case 3: return is_relu(0) && p.contain(sum, 1) && is_relu(2); default: return false; } @@ -657,13 +677,24 @@ void jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t::generate() { postamble(); } +namespace { +bool is_winograd_faster_than_direct(const jit_conv_conf_2x3_wino_t &jcp) { + if (jcp.ver == ver_vnni) { + return (jcp.mb <= mkldnn_get_max_threads() + && (jcp.mb > 4 + && jcp.ic > 64 + && !(jcp.oc > 128 && jcp.ih < 14))) + || jcp.mb > mkldnn_get_max_threads(); + } + return true; +} +} status_t jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t ::init_conf(jit_conv_conf_2x3_wino_t &jcp, const convolution_desc_t &cd, cpu_memory_t::pd_t &src_pd, cpu_memory_t::pd_t &wei_pd, cpu_memory_t::pd_t &dst_pd, - cpu_memory_t::pd_t &bias_pd, const primitive_attr_t &attr, - bool with_relu, float relu_negative_slope) { + cpu_memory_t::pd_t &bias_pd, const primitive_attr_t &attr) { const memory_desc_wrapper src_d(&src_pd); const memory_desc_wrapper wei_d(&wei_pd); const memory_desc_wrapper dst_d(&dst_pd); @@ -671,6 +702,8 @@ status_t jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t const bool with_groups = wei_d.ndims() == src_d.ndims() + 1; + jcp.nthr = mkldnn_get_max_threads(); + jcp.ngroups = with_groups ? 
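
// The int8 Winograd profitability rule above, restated as a standalone
// predicate (names illustrative): on non-VNNI hardware the kernel always
// volunteers, while on VNNI it requires either enough batch to parallelize
// over images or a shape where the transforms amortize. Under
// convolution_auto a false here makes init_conf() return
// status::unimplemented so dispatch falls through to the direct kernel.
static bool int8_wino_profitable_vnni(int mb, int ic, int oc, int ih,
        int nthreads) {
    if (mb > nthreads) return true; // batch-parallel regime
    return mb > 4 && ic > 64 && !(oc > 128 && ih < 14);
}
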
wei_d.dims()[0] : 1; jcp.mb = src_d.dims()[0]; jcp.oc = dst_d.dims()[1] / jcp.ngroups; @@ -700,6 +733,10 @@ status_t jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t if (mayiuse(avx512_core_vnni)) jcp.ver = ver_vnni; + if (!IMPLICATION(cd.alg_kind == alg_kind::convolution_auto, + is_winograd_faster_than_direct(jcp))) + return status::unimplemented; + // block sizes needed for GEMM kernel jcp.ic_block = 4; jcp.oc_block = 16; @@ -718,10 +755,7 @@ status_t jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t jcp.src_fmt = src_d.format(); jcp.with_bias = cd.bias_desc.format != memory_format::undef; - jcp.with_relu = with_relu; - jcp.relu_negative_slope = relu_negative_slope; - if (!IMPLICATION(with_relu, relu_negative_slope == 0.)) - return status::unimplemented; + if (!post_ops_ok(jcp, attr)) return status::unimplemented; @@ -743,7 +777,6 @@ status_t jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t jcp.alpha = jcp.m + jcp.r - 1; int aa = jcp.alpha * jcp.alpha; - int nthr = mkldnn_get_max_threads(); int L1_cap = get_cache_size(1, true); int L2_cap = get_cache_size(2, true); // need 1 extra reg for bcast, and 2 tmp regs for non-vnni @@ -755,12 +788,12 @@ status_t jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t float Y = (float)jcp.ic * jcp.oc; if (small_mb == 0) { // outer par int nblocks = jcp.mb * div_up(jcp.oh, iy) * div_up(jcp.ow, ix); - thr_eff = (float)nblocks / rnd_up(nblocks, nthr); + thr_eff = (float)nblocks / rnd_up(nblocks, jcp.nthr); } else { // inner par int tranw = iy * ix / jcp.alpha; int gemmw = aa * (jcp.nb_oc / n2_b); - int tranw_r = rnd_up(tranw, nthr); - int gemmw_r = rnd_up(gemmw, nthr); + int tranw_r = rnd_up(tranw, jcp.nthr); + int gemmw_r = rnd_up(gemmw, jcp.nthr); thr_eff = (Z * tranw / tranw_r + Y * gemmw / gemmw_r) / (Z + Y); } return thr_eff; @@ -779,7 +812,7 @@ status_t jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t req_mem = (float)jcp.ic * (M + N) + jcp.typesize_acc * M * N; mem_eff = nstl::min(1.f, L2_cap / req_mem); // memory used during wino transforms - int M_per_thr = div_up(M, nthr); + int M_per_thr = div_up(M, jcp.nthr); req_mem = (float)aa * M_per_thr * (jcp.ic + jcp.typesize_acc * jcp.oc); if (req_mem > L2_cap) @@ -868,15 +901,34 @@ status_t jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t assert((jcp.m_block + 1) * jcp.n2_block <= free_regs); assert(jcp.xb % 2 == 0 && jcp.yb % 2 == 0); - jcp.inp_stride = jcp.yb * jcp.xb / 4 * jcp.ic; - jcp.out_stride = jcp.yb * jcp.xb / 4 * jcp.oc; - jcp.wei_stride = jcp.ic * jcp.oc; - jcp.bia_stride = jcp.oc; + jcp.mb_block = 1; + if (jcp.small_mb) { + // For small mb harness, set mb_block as large as possible subject to + // the constraint that winograd activations fit into available L3 cache + int L3_cap = get_cache_size(3, true); + int M = jcp.xb * jcp.yb / 4; + int wino_src_size = 16 * M * jcp.ic * jcp.typesize_in; + int wino_dst_size = 16 * M * jcp.oc * jcp.typesize_acc; + int max_mb_block = nstl::min( + jcp.mb, jcp.nthr * L3_cap / (wino_src_size + wino_dst_size)); + for (int i = max_mb_block; i > 1; i--) { + if (jcp.mb % i == 0) { + jcp.mb_block = i; + break; + } + } + } + jcp.nb_mb = jcp.mb / jcp.mb_block; - jcp.M = jcp.xb * jcp.yb / 4; + jcp.M = jcp.mb_block * jcp.xb * jcp.yb / 4; jcp.N = jcp.oc; jcp.K = jcp.ic; + jcp.inp_stride = jcp.M * jcp.ic; + jcp.out_stride = jcp.M * jcp.oc; + jcp.wei_stride = jcp.ic * jcp.oc; + jcp.bia_stride = jcp.oc; + jcp.n_block = jcp.oc_block; jcp.k_block = jcp.ic_block; @@ -922,69 +974,82 @@ status_t jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t if (!wei_pd.is_equal(&new_weights_pd)) return 
status::unimplemented; + const int tilesize = jcp.alpha * jcp.alpha; + const int numtiles = jcp.M; + const int alltiles = numtiles * tilesize; + + jcp.size_wino_src + = utils::rnd_up(jcp.typesize_in * alltiles * jcp.ic, PAGE_4K) + / jcp.typesize_in; + jcp.size_wino_wei = tilesize * jcp.oc * jcp.ic; + jcp.size_wino_dst = alltiles * jcp.oc; + return status::success; } //////////////////////////////////////////////////////////////////////////////// -template -status_t _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t::pd_t::jit_conf() { +template +status_t jit_avx512_core_u8s8s32x_wino_convolution_fwd_t:: + pd_t::jit_conf() { return jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t::init_conf( - jcp_, this->cdesc_(), this->src_pd_, this->weights_pd_, - this->dst_pd_,this->bias_pd_, *this->attr(), - with_relu, this->negative_slope()); + jcp_, *this->desc(), this->src_pd_, this->weights_pd_, + this->dst_pd_,this->bias_pd_, *this->attr()); } -template -_jit_avx512_core_u8s8s32x_wino_convolution_fwd_t:: - _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t(const pd_t *pd, +template +void jit_avx512_core_u8s8s32x_wino_convolution_fwd_t::pd_t:: +init_scratchpad() { + auto scratchpad = this->scratchpad_registry().registrar(); + + int nthr_multiplier = jcp_.small_mb ? 1 : jcp_.nthr; + scratchpad.book(key_wino_V, + sizeof(src_data_t) * jcp_.size_wino_src * nthr_multiplier, PAGE_4K); + scratchpad.book(key_wino_M, + sizeof(acc_data_t) * jcp_.size_wino_dst * nthr_multiplier, PAGE_4K); + + scratchpad.book(key_conv_adjusted_scales, + sizeof(float) * nstl::max(attr()->output_scales_.count_, 16)); +} + +template +jit_avx512_core_u8s8s32x_wino_convolution_fwd_t:: + jit_avx512_core_u8s8s32x_wino_convolution_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs) - , conf_(*pd) - , scratchpad_(nullptr) { - const int nthreads = mkldnn_get_max_threads(); + : cpu_primitive_t(apd, inputs, outputs, true) +{ kernel_ = new jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t( - conf_.jcp_, *conf_.attr()); + pd()->jcp_, *pd()->attr()); src_trans_ = new jit_avx512_core_u8s8s32x_wino_conv_src_trans_t( - conf_.jcp_, *conf_.attr()); + pd()->jcp_, *pd()->attr()); dst_trans_ = new jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t( - conf_.jcp_, *conf_.attr()); - - const int tilesize = conf_.jcp_.alpha * conf_.jcp_.alpha; - const int numtiles = (conf_.jcp_.yb / 2) * (conf_.jcp_.xb / 2); - const int alltiles = tilesize * numtiles; - size_wino_wei_ = tilesize * conf_.jcp_.oc * conf_.jcp_.ic; - size_wino_src_ = sizeof(src_data_t) * alltiles * conf_.jcp_.ic; - size_wino_src_ = rnd_up(size_wino_src_, PAGE_4K); - size_wino_src_ /= sizeof(src_data_t); - size_wino_dst_ = alltiles * conf_.jcp_.oc; - - size_t workspace_size = (conf_.jcp_.small_mb ? 1 : nthreads) - * (sizeof(src_data_t) * size_wino_src_ - + sizeof(acc_data_t) * size_wino_dst_); - - scratchpad_ = create_scratchpad(workspace_size); - assert(scratchpad_); // TODO: add proper check and raise exception? - - wino_shift_ = (conf_.jcp_.small_mb ? 
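
// The init_scratchpad() hunk above is part of the patch-wide move from
// per-primitive malloc'd buffers to booked scratchpad: sizes are
// registered once at pd-creation time under keys (key_wino_V, key_wino_M,
// ...), one copy per thread unless small_mb, and execute() later fetches
// typed pointers through a grantor. A much-simplified sketch of the idea;
// the type and method names here are hypothetical, not the real
// memory_tracking API.
#include <cstddef>
#include <map>
#include <vector>

struct registry_sketch {
    std::map<int, size_t> offset_;
    size_t size_ = 0;
    void book(int key, size_t bytes, size_t align) {
        size_ = (size_ + align - 1) / align * align; // pad to alignment
        offset_[key] = size_;
        size_ += bytes;
    }
    void *grant(char *arena, int key) const { // arena holds size_ bytes
        return arena + offset_.at(key);
    }
};
// Usage: book everything, then allocate once:
//   std::vector<char> arena(r.size_); auto *v = r.grant(arena.data(), key);
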
1 : nthreads) * sizeof(src_data_t) - * size_wino_src_; - - updated_output_scales_ = conf_.attr()->output_scales_; - updated_output_scales_.scale(1.f / (adj_src_scale * adj_wei_scale)); + pd()->jcp_, *pd()->attr()); } -template -_jit_avx512_core_u8s8s32x_wino_convolution_fwd_t::~_jit_avx512_core_u8s8s32x_wino_convolution_fwd_t() { +template +jit_avx512_core_u8s8s32x_wino_convolution_fwd_t:: + ~jit_avx512_core_u8s8s32x_wino_convolution_fwd_t() { delete kernel_; delete src_trans_; delete dst_trans_; - delete scratchpad_; } -template -void _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t::execute_forward() { +template +const float *jit_avx512_core_u8s8s32x_wino_convolution_fwd_t:: +adjust_oscales(const memory_tracking::grantor_t &scratchpad) const { + const float *oscales = pd()->attr()->output_scales_.scales_; + auto loc_scales = scratchpad.template get(key_conv_adjusted_scales); + size_t count = pd()->attr()->output_scales_.count_; + float factor = 1.f / (adj_src_scale * adj_wei_scale); + if (count == 1) + utils::array_set(loc_scales, oscales[0] * factor, 16); + else + for (size_t c = 0; c < count; c++) loc_scales[c] = oscales[c] * factor; + return loc_scales; +} + +template +void jit_avx512_core_u8s8s32x_wino_convolution_fwd_t:: +execute_forward() const { const auto &jcp = kernel_->jcp; if (jcp.small_mb) execute_forward_small_mb(); @@ -992,21 +1057,22 @@ void _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t -void _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t::execute_forward_mbN() { +template +void jit_avx512_core_u8s8s32x_wino_convolution_fwd_t:: +execute_forward_mbN() const { auto src = reinterpret_cast(input_memory(0)); auto wei = reinterpret_cast(input_memory(1)); auto bia = reinterpret_cast(input_memory(2)); auto dst = reinterpret_cast(memory(0)); + auto scratchpad = this->scratchpad(); + const auto &jcp = kernel_->jcp; - const auto &oscales = updated_output_scales_; + const float *oscales = adjust_oscales(scratchpad); - auto wino_wei = wei; - auto dst_bias = (const acc_data_t *)(wei + size_wino_wei_); - auto wino_src_base = (src_data_t *)scratchpad_->get(); - auto wino_dst_base = (acc_data_t *)(scratchpad_->get() + wino_shift_); + auto dst_bias = (const acc_data_t *)(wei + jcp.size_wino_wei); + auto wino_src_base = scratchpad.template get(key_wino_V); + auto wino_dst_base = scratchpad.template get(key_wino_M); parallel_nd(jcp.mb, div_up(jcp.oh, jcp.yb), div_up(jcp.ow, jcp.xb), [&](int mb, int tile_y_b, int tile_x_b) { @@ -1015,8 +1081,8 @@ void _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t= v_ye) ? 0 : 0xffff; - v_x_masks[i] = (i < v_xs || i >= v_xe) ? 0 : 0xffff; + v_y_masks[i] = uint16_t(i < v_ys || i >= v_ye ? 0 : 0xffff); + v_x_masks[i] = uint16_t(i < v_xs || i >= v_xe ? 
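
// adjust_oscales() above folds the internal quantization factors
// (adj_src_scale * adj_wei_scale) into the user's output scales at execute
// time, writing into booked scratchpad rather than a cached member, since
// execute_forward() is now const. Scalar sketch; out must hold
// max(count, 16) floats, matching the booking above:
#include <cstddef>

static void fold_scales(const float *user, size_t count, float *out,
        float adj_src, float adj_wei) {
    float factor = 1.f / (adj_src * adj_wei);
    if (count == 1)
        for (size_t i = 0; i < 16; ++i) out[i] = user[0] * factor;
    else
        for (size_t c = 0; c < count; ++c) out[c] = user[c] * factor;
}
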
0 : 0xffff); } auto local_s = src + mb * jcp.ih * jcp.iw * jcp.ic @@ -1066,7 +1132,7 @@ void _jit_avx512_core_u8s8s32x_wino_convolution_fwd_tker_(&gemm_p); @@ -1075,7 +1141,7 @@ void _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t -void _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t::execute_forward_small_mb() { +template +void jit_avx512_core_u8s8s32x_wino_convolution_fwd_t:: +execute_forward_small_mb() const { auto src = reinterpret_cast(input_memory(0)); auto wei = reinterpret_cast(input_memory(1)); auto bia = reinterpret_cast(input_memory(2)); auto dst = reinterpret_cast(memory(0)); + auto scratchpad = this->scratchpad(); + const auto &jcp = kernel_->jcp; - const auto &oscales = updated_output_scales_; + const float *oscales = adjust_oscales(scratchpad); - auto wino_wei = wei; - auto dst_bias = (const acc_data_t *)(wei + size_wino_wei_); - auto wino_src = (src_data_t *)scratchpad_->get(); - auto wino_dst = (acc_data_t *)(scratchpad_->get() + wino_shift_); + auto dst_bias = (const acc_data_t *)(wei + jcp.size_wino_wei); + auto wino_src = scratchpad.template get(key_wino_V); + auto wino_dst = scratchpad.template get(key_wino_M); - for (int mb = 0; mb < jcp.mb; mb++) { + for (int mbb = 0; mbb < jcp.nb_mb; mbb++) { for (int tile_y = 0; tile_y < jcp.oh; tile_y += jcp.yb) { for (int tile_x = 0; tile_x < jcp.ow; tile_x += jcp.xb) { /* transformation of input tensor to winograd domain */ - parallel_nd(div_up(jcp.yb, 2), div_up(jcp.xb, 2), - [&](int y_in_block_b, int x_in_block_b) { + parallel_nd(div_up(jcp.yb, 2), div_up(jcp.xb, 2), jcp.mb_block, + [&](int y_in_block_b, int x_in_block_b, int mb) { int y_in_block = y_in_block_b * 2; int x_in_block = x_in_block_b * 2; auto src_trans_p = jit_avx512_core_u8s8s32x_wino_conv_src_trans_t::call_params_t(); - unsigned short v_y_masks[4], v_x_masks[4]; + uint16_t v_y_masks[4], v_x_masks[4]; int y = y_in_block + tile_y; int x = x_in_block + tile_x; - int m = (y_in_block / 2) * (jcp.xb / 2) + (x_in_block / 2); + int m = (mb * (jcp.yb / 2) + (y_in_block / 2)) * (jcp.xb / 2) + + (x_in_block / 2); int v_ys = nstl::max(0, jcp.t_pad - y); int v_ye = nstl::min( @@ -1150,11 +1218,11 @@ void _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t= v_ye) ? 0 : 0xffff; - v_x_masks[i] = (i < v_xs || i >= v_xe) ? 0 : 0xffff; + v_y_masks[i] = uint16_t(i < v_ys || i >= v_ye ? 0 : 0xffff); + v_x_masks[i] = uint16_t(i < v_xs || i >= v_xe ? 
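
Each call of the source transform covers a 2×2 block of output tiles, i.e. a 4×4 input patch, and `v_y_masks`/`v_x_masks` switch padded rows and columns off lane by lane: a lane stays live only inside `[v_ys, v_ye) × [v_xs, v_xe)`. A plain C++ rendering of the row-mask computation — the exact clamp for `v_ye` is cut off in the hunk above, so the `min(4, ih + t_pad - y)` used here is a plausible reading, not a quote:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    // Illustrative geometry: top padding 1, input height 5, tile row y = 0.
    const int t_pad = 1, ih = 5, y = 0;
    const int v_ys = std::max(0, t_pad - y);      // first in-bounds lane
    const int v_ye = std::min(4, ih + t_pad - y); // one past the last (assumed)
    uint16_t v_y_masks[4];
    for (int i = 0; i < 4; ++i)
        v_y_masks[i] = uint16_t(i < v_ys || i >= v_ye ? 0 : 0xffff);
    for (int i = 0; i < 4; ++i)
        std::printf("input row %d: mask 0x%04x\n", i, v_y_masks[i]);
    return 0;
}
```
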
0 : 0xffff); } auto local_s = src - + mb * jcp.ih * jcp.iw * jcp.ic + + (mbb * jcp.mb_block + mb) * jcp.ih * jcp.iw * jcp.ic + y * jcp.iw * jcp.ic + x * jcp.ic; auto local_w = wino_src + m * jcp.ic; @@ -1174,7 +1242,7 @@ void _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t; -template struct _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t; -template struct _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t; -template struct _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t; -template struct _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t; -template struct _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t; -template struct _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t; -template struct _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t; +template struct jit_avx512_core_u8s8s32x_wino_convolution_fwd_t; +template struct jit_avx512_core_u8s8s32x_wino_convolution_fwd_t; +template struct jit_avx512_core_u8s8s32x_wino_convolution_fwd_t; +template struct jit_avx512_core_u8s8s32x_wino_convolution_fwd_t; } // namespace cpu } // namespace impl diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_wino_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_wino_convolution.hpp index 83392ab..5c1c8cb 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_wino_convolution.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_wino_convolution.hpp @@ -23,7 +23,6 @@ #include "cpu_convolution_pd.hpp" #include "cpu_engine.hpp" #include "mkldnn_thread.hpp" -#include "scratchpad.hpp" #include "type_helpers.hpp" #include "utils.hpp" @@ -39,20 +38,18 @@ struct jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t; struct jit_avx512_core_u8s8s32x_wino_conv_src_trans_t; struct jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t; -template -struct _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t : public cpu_primitive_t { - struct pd_t : public _cpu_convolution_fwd_pd_t { - pd_t(engine_t *engine, const typename pd_t::base_desc_t *adesc, +template +struct jit_avx512_core_u8s8s32x_wino_convolution_fwd_t : public cpu_primitive_t { + struct pd_t : public cpu_convolution_fwd_pd_t { + pd_t(engine_t *engine, const convolution_desc_t *adesc, const primitive_attr_t *attr, const typename pd_t::base_class *hint_fwd_pd) - : _cpu_convolution_fwd_pd_t(engine, adesc, attr, - hint_fwd_pd) + : cpu_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) , jcp_() {} DECLARE_COMMON_PD_T( JIT_IMPL_NAME_HELPER("jit_int8_wino:", avx512_core, ""), - _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t); + jit_avx512_core_u8s8s32x_wino_convolution_fwd_t); virtual status_t init() override { using namespace prop_kind; @@ -60,28 +57,39 @@ struct _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t : public cpu_primitive_t assert(this->engine()->kind() == engine_kind::cpu); bool ok = true && this->set_default_params() == status::success - && utils::one_of(this->cdesc_().prop_kind, + && utils::one_of(this->desc()->prop_kind, forward_training, forward_inference) - && this->cdesc_().alg_kind == alg_kind::convolution_winograd + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_winograd) && !this->has_zero_dim_memory() - && this->cdesc_().src_desc.data_type == data_type::u8 - && this->cdesc_().dst_desc.data_type == dst_data_type - && this->cdesc_().weights_desc.data_type == data_type::s8 + && this->desc()->src_desc.data_type == data_type::u8 + && this->desc()->dst_desc.data_type == dst_data_type + && 
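
With the batch blocking introduced above, the small-minibatch path packs `mb_block` images into one Winograd buffer, so the flat tile index becomes `m = (mb * (yb/2) + y_blk) * (xb/2) + x_blk` in place of the old per-image `y_blk * (xb/2) + x_blk`. A quick standalone check that the new formula enumerates every (image, tile) slot exactly once:

```cpp
#include <cassert>
#include <vector>

int main() {
    // Illustrative block sizes: yb x xb output block, mb_block images.
    const int mb_block = 4, yb = 8, xb = 8;
    const int tiles_y = yb / 2, tiles_x = xb / 2;
    std::vector<int> hits(mb_block * tiles_y * tiles_x, 0);
    for (int mb = 0; mb < mb_block; ++mb)
        for (int yb_i = 0; yb_i < tiles_y; ++yb_i)
            for (int xb_i = 0; xb_i < tiles_x; ++xb_i) {
                int m = (mb * tiles_y + yb_i) * tiles_x + xb_i;
                ++hits[m]; // index into the shared wino_src buffer
            }
    for (int h : hits)
        assert(h == 1); // bijective: no clashes, no gaps
}
```
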
this->desc()->weights_desc.data_type == data_type::s8 && IMPLICATION(this->with_bias(), - utils::one_of(this->cdesc_().bias_desc.data_type, + utils::one_of(this->desc()->bias_desc.data_type, data_type::f32, data_type::s32, data_type::s8, data_type::u8)) - && this->cdesc_().accum_data_type == data_type::s32; + && this->desc()->accum_data_type == data_type::s32; if (!ok) return status::unimplemented; - return jit_conf(); + status_t status = jit_conf(); + if (status != status::success) return status; + + init_scratchpad(); + + if (status == status::success + && this->desc()->alg_kind == alg_kind::convolution_auto) + this->set_alg_kind(alg_kind::convolution_winograd); + return status; } jit_conv_conf_2x3_wino_t jcp_; protected: status_t jit_conf(); + void init_scratchpad(); virtual status_t set_default_params() override { using namespace memory_format; @@ -100,42 +108,28 @@ struct _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t : public cpu_primitive_t typedef typename prec_traits::type acc_data_t; typedef typename prec_traits::type dst_data_t; - _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t(const pd_t *pd, + jit_avx512_core_u8s8s32x_wino_convolution_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs); - ~_jit_avx512_core_u8s8s32x_wino_convolution_fwd_t(); + ~jit_avx512_core_u8s8s32x_wino_convolution_fwd_t(); - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - void execute_forward_small_mb(); - void execute_forward_mbN(); - pd_t conf_; + const float *adjust_oscales(const memory_tracking::grantor_t &scratchpad) + const; + void execute_forward() const; + void execute_forward_small_mb() const; + void execute_forward_mbN() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t *kernel_; jit_avx512_core_u8s8s32x_wino_conv_src_trans_t *src_trans_; jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t *dst_trans_; - - size_t size_wino_wei_; - size_t size_wino_src_; - size_t size_wino_dst_; - size_t wino_shift_; - - scratchpad_t *scratchpad_; - - mkldnn::impl::scales_t updated_output_scales_; }; -template -using jit_avx512_core_u8s8s32x_wino_convolution_fwd_t = - _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t; - -template -using jit_avx512_core_u8s8s32x_wino_convolution_relu_t = - _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t; } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_conv_kernel.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_conv_kernel.cpp index 40ca5f0..011db24 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_conv_kernel.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_conv_kernel.cpp @@ -13,12 +13,15 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*******************************************************************************/ -#include + +#include + #include "c_types_map.hpp" +#include "memory_tracking.hpp" #include "nstl.hpp" #include "type_helpers.hpp" -#include "mkldnn_thread.hpp" #include "utils.hpp" + #include "cpu_memory.hpp" #include "jit_uni_1x1_conv_utils.hpp" @@ -35,32 +38,6 @@ using namespace mkldnn::impl::utils; using namespace Xbyak; -bool jit_avx512_core_x8s8s32x_1x1_conv_kernel::maybe_relu(int position) -{ - using namespace primitive_kind; - const auto &p = attr_.post_ops_; - - if (position == 0) { - /* relu before sum */ - return false - || jcp.with_eltwise - || p.contain(eltwise, 0) - || (jcp.dst_dt == data_type::u8 && !p.contain(sum, 0)); - } else if (position == 1) { - /* relu after sum */ - const int sum_idx = p.contain(sum, 0) - ? 0 : (p.contain(sum, 1) ? 1 : -1); - if (sum_idx == -1) - return false; - - return false - || p.contain(eltwise, sum_idx + 1) - || jcp.dst_dt == data_type::u8; - } - - return false; -} - void jit_avx512_core_x8s8s32x_1x1_conv_kernel::bcast_loop(int load_loop_blk) { mov(aux1_reg_bcast_data, reg_bcast_data); @@ -131,7 +108,7 @@ void jit_avx512_core_x8s8s32x_1x1_conv_kernel::reduce_loop(int load_loop_blk, }; auto vreg_accum = [=](int i_load, int i_ur) { - return Zmm(i_ur * load_loop_blk + i_load); + return Zmm(i_ur + i_load * ur); }; auto zmm_bias_alpha = [=]() { @@ -242,23 +219,60 @@ void jit_avx512_core_x8s8s32x_1x1_conv_kernel::reduce_loop(int load_loop_blk, zmm_t mask_zmm = mask_flag ? r | ktail_mask | T_z : r; vmulps(mask_zmm, r, scale_ptr(i_load)); - if (maybe_relu(0)) { - vpxord(zmm_zero, zmm_zero, zmm_zero); - vmaxps(r, zmm_zero, r); - } - if (p_sum_scale) { // post_op: sum - vpxord(zmm_zero, zmm_zero, zmm_zero); - auto zmm_prev_dst = zmm_zero; + } + } + + int eltwise_inj_idx = 0; + int depthwise_inj_idx = 0; + for (int i = 0; i < p.len_; i++) { + auto& post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + eltwise_injectors[eltwise_inj_idx]->compute_vector_range(0, ur * load_loop_blk); + + eltwise_inj_idx++; + } else if (post_op.is_depthwise()) { + mov(reg_d_weights, reinterpret_cast(post_op.depthwise.weights_data)); + mov(reg_d_bias, reinterpret_cast(post_op.depthwise.biases_data)); + + add(reg_d_weights, reg_oc_off); + add(reg_d_bias, reg_oc_off); - cvt2ps(jcp.dst_dt, zmm_prev_dst, output_ptr(i_load, i_ur), - mask_flag); + for (int k = 0; k < load_loop_blk; k++) { + depthwise_injectors[depthwise_inj_idx]->compute_vector_range( + k * ur, k * ur + ur, reg_d_weights, reg_d_bias); - if (*p_sum_scale == 1.f) - vaddps(r, zmm_prev_dst); - else - vfmadd231ps(r, zmm_prev_dst, zword_b[reg_ptr_sum_scale]); + add(reg_d_weights, jcp.oc_block * sizeof(float)); + add(reg_d_bias, jcp.oc_block * sizeof(float)); } - if (maybe_relu(1)) { + + depthwise_inj_idx++; + } else if (post_op.is_sum(false)) { + for (int i_load = 0; i_load < load_loop_blk; ++i_load) { + const bool mask_flag = mask_flag_in && + i_load == load_loop_blk - 1; + for (int i_ur = 0; i_ur < ur; ++i_ur) { + vpxord(zmm_zero, zmm_zero, zmm_zero); + auto zmm_prev_dst = zmm_zero; + + auto r = vreg_accum(i_load, i_ur); + cvt2ps(jcp.dst_dt, zmm_prev_dst, output_ptr(i_load, i_ur), + mask_flag); + + if (*p_sum_scale == 1.f) + vaddps(r, zmm_prev_dst); + else + vfmadd231ps(r, zmm_prev_dst, zword_b[reg_ptr_sum_scale]); + } + } + } + } + + for (int i_load = 0; i_load < load_loop_blk; ++i_load) { + const bool mask_flag = mask_flag_in && + i_load == load_loop_blk - 1; + for (int i_ur = 0; i_ur < ur; ++i_ur) { + auto r = vreg_accum(i_load, i_ur); 
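
Note the `vreg_accum` change from `Zmm(i_ur * load_loop_blk + i_load)` to `Zmm(i_ur + i_load * ur)`: accumulators are now grouped per load block instead of interleaved per unroll step, which is exactly what lets the injectors introduced below treat one output-channel block as the contiguous register range `[k * ur, k * ur + ur)`. A tiny demonstration of the two numbering schemes:

```cpp
#include <cstdio>

int main() {
    const int load_loop_blk = 3, ur = 4; // illustrative blocking
    for (int i_load = 0; i_load < load_loop_blk; ++i_load)
        for (int i_ur = 0; i_ur < ur; ++i_ur) {
            int old_idx = i_ur * load_loop_blk + i_load; // interleaved
            int new_idx = i_ur + i_load * ur;            // grouped per block
            std::printf("load %d, ur %d: old zmm%d, new zmm%d\n",
                    i_load, i_ur, old_idx, new_idx);
        }
    // With the new scheme, block k owns zmm[k*ur .. k*ur+ur-1], matching
    // compute_vector_range(k * ur, k * ur + ur, ...) in the injector calls.
}
```
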
+ if (jcp.dst_dt == data_type::u8) { vpxord(zmm_zero, zmm_zero, zmm_zero); vmaxps(r, zmm_zero, r); } @@ -274,6 +288,7 @@ void jit_avx512_core_x8s8s32x_1x1_conv_kernel::reduce_loop(int load_loop_blk, for (int i_ur = 0; i_ur < ur; ++i_ur) { auto r = vreg_accum(i_load, i_ur); zmm_t r_zmm = mask_flag ? r | ktail_mask : r; + switch (jcp.dst_dt) { case data_type::f32: case data_type::s32: @@ -335,6 +350,8 @@ void jit_avx512_core_x8s8s32x_1x1_conv_kernel::reduce_loop(int load_loop_blk, Label reduce_loop; Label reduce_loop_tail; + push(reg_oc_off); + mov(aux_reg_load_data, reg_load_data); mov(aux_reg_bcast_data, aux1_reg_bcast_data); @@ -359,6 +376,8 @@ void jit_avx512_core_x8s8s32x_1x1_conv_kernel::reduce_loop(int load_loop_blk, fma_block(false); } + pop(reg_oc_off); + if (jcp.oc_without_padding != jcp.oc) { Label end_store, common_store; mov(EVEX_compress_addr(rsp, reg_bcast_data_off), reg_bcast_data); @@ -388,6 +407,24 @@ void jit_avx512_core_x8s8s32x_1x1_conv_kernel::reduce_loop(int load_loop_blk, void jit_avx512_core_x8s8s32x_1x1_conv_kernel::generate() { + const auto &p = attr_.post_ops_; + for (int i = 0; i < p.len_; i++) { + auto &post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + eltwise_injectors.push_back(new jit_uni_eltwise_injector_f32( + this, + post_op.eltwise.alg, + post_op.eltwise.alpha, + post_op.eltwise.beta + )); + } else if (post_op.is_depthwise()) { + depthwise_injectors.push_back(new jit_uni_depthwise_injector_f32( + this, + post_op.depthwise.alg + )); + } + } + preamble(); xor_(reg_scratch, reg_scratch); @@ -423,7 +460,7 @@ void jit_avx512_core_x8s8s32x_1x1_conv_kernel::generate() mov(EVEX_compress_addr(rsp, bcast_loop_work_off), reg_bcast_loop_work); mov(reg_reduce_loop_work, ptr[param1 + GET_OFF(reduce_dim)]); mov(reg_reduce_pos_flag, ptr[param1 + GET_OFF(first_last_flag)]); - + mov(reg_oc_off, ptr[param1 + GET_OFF(oc_off)]); auto load_loop_body = [=](int load_loop_blk) { bcast_loop(load_loop_blk); @@ -451,6 +488,7 @@ void jit_avx512_core_x8s8s32x_1x1_conv_kernel::generate() add(reg_output_data, load_loop_blk * jcp.load_block * jcp.typesize_out); sub(reg_load_loop_work, load_loop_blk * jcp.load_loop_iter_step); + add(reg_oc_off, load_loop_blk * jcp.oc_block * sizeof(float)); }; const int simd_w = 16; @@ -480,6 +518,12 @@ void jit_avx512_core_x8s8s32x_1x1_conv_kernel::generate() cmp(reg_load_loop_work, 0); je(load_loop_blk[num_ur_cases], T_NEAR); } + + for (int _i = 1; _i <= label_idx + 1; _i++) { + prefetcht0(ptr [ reg_load_data + _i * jcp.ic * jcp.oc_block ]); + prefetcht1(ptr [ reg_output_data + _i * jcp.oc_block ]); + } + load_loop_body(label_idx + 1); if (label_idx - 1 > 0) { cmp(reg_load_loop_work, 2 * label_idx * simd_w); @@ -503,6 +547,9 @@ void jit_avx512_core_x8s8s32x_1x1_conv_kernel::generate() add(rsp, stack_space_needed); postamble(); + + for (auto& inj : eltwise_injectors) + inj->prepare_table(); } bool jit_avx512_core_x8s8s32x_1x1_conv_kernel::post_ops_ok( @@ -510,27 +557,18 @@ bool jit_avx512_core_x8s8s32x_1x1_conv_kernel::post_ops_ok( using namespace primitive_kind; const auto &p = attr.post_ops_; - auto is_relu = [&](int idx) { - return p.entry_[idx].kind == eltwise - && p.entry_[idx].eltwise.scale == 1. 
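
`generate()` now walks the post-op chain once up front and builds one JIT injector per eltwise/depthwise entry; the store path then replays the chain in order, advancing `eltwise_inj_idx`/`depthwise_inj_idx` so entry *i* is always handled by injector *i*. The shape of that scan, with the injector classes reduced to placeholders (a sketch, not the real class layout):

```cpp
#include <memory>
#include <vector>

// Placeholder stand-ins for the real post-op entries and injector classes.
enum class kind_t { eltwise, depthwise, sum };
struct post_op_t { kind_t kind; };
struct eltwise_injector_t { /* jit_uni_eltwise_injector_f32 stand-in */ };
struct depthwise_injector_t { /* jit_uni_depthwise_injector_f32 stand-in */ };

struct kernel_t {
    std::vector<std::unique_ptr<eltwise_injector_t>> eltwise_injectors;
    std::vector<std::unique_ptr<depthwise_injector_t>> depthwise_injectors;

    void scan_post_ops(const std::vector<post_op_t> &p) {
        for (const auto &op : p) { // one injector per entry, in chain order
            if (op.kind == kind_t::eltwise)
                eltwise_injectors.emplace_back(new eltwise_injector_t);
            else if (op.kind == kind_t::depthwise)
                depthwise_injectors.emplace_back(new depthwise_injector_t);
            // sum needs no injector: it is fused directly into the store loop.
        }
    }
};

int main() {
    kernel_t k;
    k.scan_post_ops({{kind_t::eltwise}, {kind_t::sum}});
    return k.eltwise_injectors.size() == 1 ? 0 : 1;
}
```
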
-                && p.entry_[idx].eltwise.alg == alg_kind::eltwise_relu
-                && p.entry_[idx].eltwise.alpha == 0.;
-    };
+    auto is_eltwise = [&](int idx) { return p.entry_[idx].is_eltwise(); };
+    auto is_depthwise = [&](int idx) { return p.entry_[idx].is_depthwise(); };
+    auto is_sum = [&](int idx) { return p.entry_[idx].is_sum(false); };
+    auto is_simple = [&](int idx) { return is_eltwise(idx) || is_depthwise(idx); };

     switch (p.len_) {
-    case 0: return true;
-    case 1: return true
-                && IMPLICATION(jcp.with_eltwise, p.contain(sum, 0))
-                && IMPLICATION(!jcp.with_eltwise, is_relu(0) || p.contain(sum, 0));
-    case 2: return true
-                && IMPLICATION(jcp.with_eltwise, p.contain(sum, 0) && is_relu(1))
-                && IMPLICATION(!jcp.with_eltwise, false
-                        || (p.contain(sum, 0) && is_relu(1))
-                        || (p.contain(sum, 1) && is_relu(0)));
-    case 3: return true
-                && jcp.with_eltwise == false
-                && (is_relu(0) && p.contain(sum, 1) && is_relu(2));
-    default: return false;
+    case 0: return true;
+    case 1: return is_simple(0) || is_sum(0);
+    case 2: return (is_sum(0) && is_simple(1)) || (is_simple(0) && is_sum(1)) ||
+                   (is_simple(0) && is_simple(1));
+    case 3: return (is_simple(0) && is_sum(1) && is_simple(2));
+    default: return false;
     }

     return false;
@@ -540,9 +578,7 @@ status_t jit_avx512_core_x8s8s32x_1x1_conv_kernel::init_conf(
         jit_1x1_conv_conf_t &jcp, const convolution_desc_t &cd,
         const memory_desc_wrapper &src_d, const memory_desc_wrapper &weights_d,
         const memory_desc_wrapper &dst_d, const memory_desc_wrapper &bias_d,
-        const primitive_attr_t &attr, bool with_relu, float relu_negative_slope,
-        int nthreads, bool reduce_src)
-{
+        const primitive_attr_t &attr, int nthreads, bool reduce_src) {
     if (!mayiuse(avx512_core))
         return status::unimplemented;

     const bool with_groups = weights_d.ndims() == src_d.ndims() + 1;
@@ -577,10 +613,6 @@ status_t jit_avx512_core_x8s8s32x_1x1_conv_kernel::init_conf(
     jcp.stride_w = cd.strides[1];
     jcp.src_fmt = src_d.format();
     jcp.with_bias = cd.bias_desc.format != memory_format::undef;
-    jcp.with_eltwise = with_relu;
-    jcp.eltwise_alpha = relu_negative_slope;
-    if (!IMPLICATION(with_relu, relu_negative_slope == 0.))
-        return status::unimplemented;
     jcp.signed_input = (src_d.data_type() == data_type::s8) ?
true : false; @@ -646,25 +678,30 @@ status_t jit_avx512_core_x8s8s32x_1x1_conv_kernel::init_conf( max_regs = 8; jcp.expl_bcast = true; - const int spatial = jcp.oh; - jcp.ur = 1; - for (int ur_w = max_regs; ur_w >= min_regs; ur_w--) { - if ((spatial >= size_treshold && spatial % ur_w == 0) - || (spatial < size_treshold && jcp.os % ur_w == 0)) { - jcp.ur = ur_w; - break; - } - } - if (jcp.ur == 1) { + if (jcp.mb == 1 && jcp.ic > 128 + && (jcp.oh <= size_treshold && jcp.ow <= size_treshold)) { jcp.ur = nstl::min(max_regs, jcp.os); - int os_tail = jcp.os % max_regs; - for (int i = max_regs; i >= min_regs; i--) { - int i_tail = jcp.os % i; - if (i_tail > os_tail || i_tail == 0) { - jcp.ur = i; - os_tail = i_tail; - if (i_tail == 0) - break; + } else { + const int spatial = jcp.oh; + jcp.ur = 1; + for (int ur_w = max_regs; ur_w >= min_regs; ur_w--) { + if ((spatial >= size_treshold && spatial % ur_w == 0) + || (spatial < size_treshold && jcp.os % ur_w == 0)) { + jcp.ur = ur_w; + break; + } + } + if (jcp.ur == 1) { + jcp.ur = nstl::min(max_regs, jcp.os); + int os_tail = jcp.os % max_regs; + for (int i = max_regs; i >= min_regs; i--) { + int i_tail = jcp.os % i; + if (i_tail > os_tail || i_tail == 0) { + jcp.ur = i; + os_tail = i_tail; + if (i_tail == 0) + break; + } } } } @@ -786,6 +823,17 @@ status_t jit_avx512_core_x8s8s32x_1x1_conv_kernel::init_conf( return status::success; } +void jit_avx512_core_x8s8s32x_1x1_conv_kernel::init_scratchpad( + memory_tracking::registrar_t &scratchpad, + const jit_1x1_conv_conf_t &jcp, const primitive_attr_t &attr) { + using namespace mkldnn::impl::memory_tracking::names; + + if (jcp.signed_input && jcp.ver != ver_vnni) { + size_t count = nstl::max(attr.output_scales_.count_, 16); + scratchpad.book(key_conv_adjusted_scales, sizeof(float) * count); + } +} + } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_conv_kernel.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_conv_kernel.hpp index 9765de9..4e3ff51 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_conv_kernel.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_conv_kernel.hpp @@ -18,8 +18,12 @@ #define JIT_AVX512_CORE_X8S8S32X_1X1_CONV_KERNEL_HPP #include "c_types_map.hpp" +#include "memory_tracking.hpp" + #include "jit_generator.hpp" #include "jit_primitive_conf.hpp" +#include "jit_uni_eltwise.hpp" +#include "jit_uni_depthwise.hpp" namespace mkldnn { namespace impl { @@ -34,38 +38,39 @@ struct jit_avx512_core_x8s8s32x_1x1_conv_kernel: public jit_generator { jit_ker = (void (*)(jit_1x1_conv_call_s *)) this->getCode(); } + ~jit_avx512_core_x8s8s32x_1x1_conv_kernel() { + for (auto inj : eltwise_injectors) + delete inj; + eltwise_injectors.clear(); + + for (auto inj : depthwise_injectors) + delete inj; + depthwise_injectors.clear(); + } + static bool post_ops_ok(jit_1x1_conv_conf_t &jcp, const primitive_attr_t &attr); static status_t init_conf(jit_1x1_conv_conf_t &jcp, - const convolution_desc_t &cd, - const memory_desc_wrapper &src_d, - const memory_desc_wrapper &weights_d, - const memory_desc_wrapper &dst_d, - const memory_desc_wrapper &bias_d, - const primitive_attr_t &attr, - bool with_relu, float relu_negative_slope, - int nthreads, bool reduce_src); + const convolution_desc_t &cd, + const memory_desc_wrapper &src_d, + const memory_desc_wrapper &weights_d, + const memory_desc_wrapper &dst_d, + const memory_desc_wrapper &bias_d, + const primitive_attr_t &attr, + int 
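
The reworked unroll selection above first special-cases small spatial shapes (`mb == 1 && ic > 128` with `oh, ow <= size_treshold`), and otherwise keeps the old search: walk down from `max_regs` looking for an unroll width that divides the spatial extent evenly, falling back to the candidate with the largest (ideally empty) tail. The fallback in plain C++ (the register bounds are illustrative):

```cpp
#include <algorithm>
#include <cstdio>

// Fallback unroll-width search from init_conf(): prefer an exact divisor of
// os, otherwise keep the candidate whose tail iteration is largest.
int pick_ur(int os, int min_regs, int max_regs) {
    int ur = std::min(max_regs, os);
    int os_tail = os % max_regs;
    for (int i = max_regs; i >= min_regs; --i) {
        int i_tail = os % i;
        if (i_tail > os_tail || i_tail == 0) {
            ur = i;
            os_tail = i_tail;
            if (i_tail == 0) break; // exact divisor: best possible
        }
    }
    return ur;
}

int main() {
    // e.g. a 14x14 output plane: os = 196, candidate widths 6..15.
    std::printf("ur = %d\n", pick_ur(196, 6, 15)); // 14 divides 196 exactly
}
```
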
nthreads, bool reduce_src); - static status_t init_conf(jit_1x1_conv_conf_t &jcp, - const convolution_desc_t &cd, - const memory_desc_wrapper &src_d, - const memory_desc_wrapper &weights_d, - const memory_desc_wrapper &dst_d, - const memory_desc_wrapper &bias_d, - const primitive_attr_t &attr, - int nthreads, bool reduce_src) - { - return init_conf(jcp, cd, src_d, weights_d, dst_d, bias_d, attr, false, - 0.0, nthreads, reduce_src); - } - bool maybe_relu(int position); + static void init_scratchpad(memory_tracking::registrar_t &scratchpad, + const jit_1x1_conv_conf_t &jcp, const primitive_attr_t &attr); jit_1x1_conv_conf_t jcp; const primitive_attr_t &attr_; void (*jit_ker)(jit_1x1_conv_call_s *); private: + nstl::vector*> eltwise_injectors; + nstl::vector*> depthwise_injectors; + using reg64_t = const Xbyak::Reg64; using zmm_t = const Xbyak::Zmm; using mask_t = const Xbyak::Opmask; @@ -90,6 +95,10 @@ struct jit_avx512_core_x8s8s32x_1x1_conv_kernel: public jit_generator { reg64_t aux_reg_output_data = abi_not_param1; reg64_t reduce_loop_iter = abi_param1; + const Xbyak::Reg64 reg_d_weights = aux_reg_bcast_data; + const Xbyak::Reg64 reg_d_bias = reduce_loop_iter; + const Xbyak::Reg64 reg_oc_off = aux_reg_load_data; + reg64_t reg_last_load = r8; mask_t ktail_mask = k6; @@ -109,18 +118,17 @@ struct jit_avx512_core_x8s8s32x_1x1_conv_kernel: public jit_generator { int reg_bcast_data_off = 16; int reg_load_data_off = 24; int reg_ptr_sum_scale_off = 32; - int reg_last_load_off = 40; - int reg_comp_data_off = 48; - int stack_space_needed = 56; + int reg_comp_data_off = 40; + int stack_space_needed = 48; void bcast_loop(int load_loop_blk); void reduce_loop(int load_loop_blk, int ur, int substep, bool wraparound); void generate(); - static void balance(jit_1x1_conv_conf_t &jcp, int nthreads); void cvt2ps(data_type_t type_in, zmm_t zmm_in, const Xbyak::Operand &op, bool mask_flag); }; + } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp index a71f285..1bab22e 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp @@ -14,12 +14,11 @@ * limitations under the License. 
*******************************************************************************/ -#include "mkldnn_types.h" - #include "c_types_map.hpp" -#include "utils.hpp" #include "mkldnn_thread.hpp" #include "type_helpers.hpp" +#include "utils.hpp" + #include "jit_generator.hpp" #include "jit_avx512_core_x8s8s32x_1x1_convolution.hpp" @@ -30,6 +29,7 @@ namespace cpu { using namespace mkldnn::impl::status; using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; using namespace mkldnn::impl::utils; namespace { @@ -56,41 +56,61 @@ void balance2D(U nthr, U ithr, T ny, T &ny_start, T &ny_end, } /* convolution forward */ -template -void _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t - ::execute_forward() +template +void jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t + ::execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); + + auto scratchpad = this->scratchpad(); + + if (pd()->jcp_.signed_input && pd()->jcp_.ver != ver_vnni) { + auto local_scales = scratchpad.template get( + key_conv_adjusted_scales); + auto scales = pd()->attr()->output_scales_.scales_; + size_t count = pd()->attr()->output_scales_.count_; + float factor = 1.f / pd()->jcp_.wei_adj_scale; + if (count == 1) { + utils::array_set(local_scales, scales[0] * factor, 16); + } else { + for (size_t c = 0; c < count; c++) + local_scales[c] = scales[c] * factor; + } + } + parallel(kernel_->jcp.nthr, [&](const int ithr, const int nthr) { - execute_forward_thr(ithr, nthr, src, weights, bias, dst); + execute_forward_thr(ithr, nthr, src, weights, bias, dst, scratchpad); }); } -template -void _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t +template +void jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t ::execute_forward_thr(const int ithr, const int nthr, const src_data_t *src, - const wei_data_t *weights, const char *bias, dst_data_t *dst) { - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); + const wei_data_t *weights, const char *bias, dst_data_t *dst, + const memory_tracking::grantor_t &scratchpad) const { + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); - const size_t bia_dt_size = conf_.with_bias() - ? types::data_type_size(conf_.cdesc()->bias_desc.data_type) : 0; + const size_t bia_dt_size = pd()->with_bias() + ? 
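
The `malloc`-ed `local_scales_` member is gone; the adjusted copy of the output scales now lives in the scratchpad and is refilled inside `execute_forward()`. The adjustment itself: the s8 weights were pre-scaled by `wei_adj_scale` (0.5, per the "bias *= 0.5" comment later in this patch) to keep the non-VNNI `vpmaddubsw` path out of saturation, so each output scale picks up a factor of `1 / wei_adj_scale`:

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    // Weights were halved at reorder time (wei_adj_scale = 0.5) to keep the
    // non-VNNI vpmaddubsw path out of s16 saturation; undo it in the output
    // scales so the final result is unchanged.
    const float wei_adj_scale = 0.5f;
    std::vector<float> oscales = {0.02f, 0.04f}; // illustrative per-oc scales
    const float factor = 1.f / wei_adj_scale;

    std::vector<float> local_scales(oscales.size());
    for (std::size_t c = 0; c < oscales.size(); ++c)
        local_scales[c] = oscales[c] * factor;

    // acc was computed with w/2, so acc * (oscale * 2) == (2 * acc) * oscale.
    std::printf("%g -> %g\n", oscales[0], local_scales[0]);
}
```
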
types::data_type_size(pd()->desc()->bias_desc.data_type) : 0; const auto &jcp = kernel_->jcp; + auto rtus_space = scratchpad.get(key_conv_rtus_space); + auto local_scales = scratchpad.get(key_conv_adjusted_scales); const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast; - const int stride_h = conf_.cdesc()->strides[0]; - const int stride_w = conf_.cdesc()->strides[1]; - const int pad_t = conf_.cdesc()->padding[0][0]; - const int pad_l = conf_.cdesc()->padding[0][1]; + const int stride_h = pd()->desc()->strides[0]; + const int stride_w = pd()->desc()->strides[1]; + const int pad_t = pd()->desc()->padding[0][0]; + const int pad_l = pd()->desc()->padding[0][1]; - const auto &oscales = conf_.attr()->output_scales_; + const auto &oscales = pd()->attr()->output_scales_; int offset = jcp.ngroups * (jcp.oc / jcp.oc_block) * (jcp.ic / jcp.ic_block) * jcp.oc_block * jcp.ic_block; @@ -167,17 +187,17 @@ void _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_twith_groups() ? weights_d.blk_off(g, ocb, icb) : weights_d.blk_off(ocb, icb)]; p.bias_data = &bias[_ocb * jcp.oc_block * bia_dt_size]; p.compensation = (jcp.signed_input) ? &compensation[_ocb * jcp.oc_block] : 0; p.scales = (jcp.signed_input && jcp.ver != ver_vnni) - ? &local_scales_[jcp.is_oc_scale * _ocb * jcp.oc_block] + ? &local_scales[jcp.is_oc_scale * _ocb * jcp.oc_block] : &oscales.scales_[jcp.is_oc_scale * _ocb * jcp.oc_block]; - if (conf_.rtus_.reduce_src_) { - rp.ws = scratch_ + ithr * ws_per_thread_ + if (pd()->rtus_.reduce_src_) { + rp.ws = rtus_space + ithr * pd()->rtus_.space_per_thread_ + _icb * jcp.is * jcp.ic_block; if (ocb == ocb_start) { rp.src = src + src_d.blk_off(n, _icb * jcp.ic_block, ih, iw); @@ -187,6 +207,8 @@ void _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_tjit_ker(&p); }; @@ -255,38 +277,16 @@ void _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; +using namespace data_type; +template struct jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; +template struct jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; +template struct jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; +template struct jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; +template struct jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; +template struct jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; +template struct jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; +template struct jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; + } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.hpp 
b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.hpp index 23e0aab..850cb97 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.hpp @@ -18,33 +18,32 @@ #define CPU_JIT_AVX512_CORE_X8S8S32X_1X1_CONVOLUTION_HPP #include "c_types_map.hpp" -#include "cpu_convolution_pd.hpp" -#include "cpu_engine.hpp" -#include "cpu_reducer.hpp" +#include "memory_tracking.hpp" #include "mkldnn_thread.hpp" #include "utils.hpp" -#include "jit_uni_1x1_conv_utils.hpp" +#include "cpu_convolution_pd.hpp" +#include "cpu_engine.hpp" + #include "jit_avx512_core_x8s8s32x_1x1_conv_kernel.hpp" +#include "jit_uni_1x1_conv_utils.hpp" namespace mkldnn { namespace impl { namespace cpu { -template -struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t : public cpu_primitive_t { - struct pd_t: public _cpu_convolution_fwd_pd_t { - pd_t(engine_t *engine, - const typename pd_t::base_desc_t *adesc, +template +struct jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t : public cpu_primitive_t { + struct pd_t: public cpu_convolution_fwd_pd_t { + pd_t(engine_t *engine, const convolution_desc_t *adesc, const primitive_attr_t *attr, const typename pd_t::base_class *hint_fwd_pd) - : _cpu_convolution_fwd_pd_t(engine, adesc, attr, - hint_fwd_pd) + : cpu_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) , jcp_(), rtus_() {} DECLARE_COMMON_PD_T( JIT_IMPL_NAME_HELPER("jit_int8_1x1:", avx512_core, ""), - _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t); virtual status_t init() override { @@ -53,84 +52,84 @@ struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t : public cpu_primitive_t assert(this->engine()->kind() == engine_kind::cpu); bool ok = true && this->set_default_params() == status::success - && utils::one_of(this->cdesc_().prop_kind, forward_training, + && utils::one_of(this->desc()->prop_kind, forward_training, forward_inference) - && this->cdesc_().alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() - && this->cdesc_().src_desc.data_type == src_type - && this->cdesc_().dst_desc.data_type == dst_type - && this->cdesc_().weights_desc.data_type == data_type::s8 + && this->desc()->src_desc.data_type == src_type + && this->desc()->dst_desc.data_type == dst_type + && this->desc()->weights_desc.data_type == data_type::s8 && IMPLICATION(this->with_bias(), utils::one_of( - this->cdesc_().bias_desc.data_type, data_type::f32, + this->desc()->bias_desc.data_type, data_type::f32, data_type::s32, data_type::s8, data_type::u8)) - && this->cdesc_().accum_data_type == data_type::s32; - + && this->desc()->accum_data_type == data_type::s32; if (!ok) return status::unimplemented; - const convolution_desc_t *conv_d = &this->cdesc_(); + const convolution_desc_t *conv_d = this->desc(); const memory_desc_t *src_d = this->src_pd_.desc(); rtus_prepare(this, conv_d, src_d, this->dst_pd_.desc()); - return jit_avx512_core_x8s8s32x_1x1_conv_kernel::init_conf(jcp_, - *conv_d, *src_d, *this->weights_pd_.desc(), - *this->dst_pd_.desc(), *this->bias_pd_.desc(), *this->attr(), - with_relu, this->negative_slope(), - mkldnn_get_max_threads(), rtus_.reduce_src_); + + status_t status = + jit_avx512_core_x8s8s32x_1x1_conv_kernel::init_conf(jcp_, + *conv_d, *src_d, *this->weights_pd_.desc(), + *this->dst_pd_.desc(), *this->bias_pd_.desc(), + *this->attr(), 
mkldnn_get_max_threads(), + rtus_.reduce_src_); + if (status != status::success) return status; + + auto scratchpad = scratchpad_registry().registrar(); + jit_avx512_core_x8s8s32x_1x1_conv_kernel::init_scratchpad( + scratchpad, jcp_, *this->attr()); + + rtus_prepare_space_info(this, scratchpad); + + return status::success; } jit_1x1_conv_conf_t jcp_; - struct reduce_to_unit_stride_t { - convolution_desc_t conv_d_; - bool reduce_src_; - } rtus_; - - protected: - virtual status_t set_default_params() override { - using namespace memory_format; - bool is_sign_input = - (this->cdesc_().src_desc.data_type == data_type::s8) - ? true : false; - if (this->src_pd_.desc()->format == any) - CHECK(this->src_pd_.set_format(nhwc)); - if (this->dst_pd_.desc()->format == any) - CHECK(this->dst_pd_.set_format(nhwc)); - if (this->weights_pd_.desc()->format == any) - CHECK(this->weights_pd_.set_format(this->with_groups() - ? ((is_sign_input) ? gOIhw4i16o4i_s8s8 : gOIhw4i16o4i) - : ((is_sign_input) ? OIhw4i16o4i_s8s8 : OIhw4i16o4i))); - if (this->bias_pd_.desc()->format == any) - CHECK(this->bias_pd_.set_format(x)); - return status::success; - } + reduce_to_unit_stride_t rtus_; + + protected: + virtual status_t set_default_params() override { + using namespace memory_format; + bool is_sign_input = + this->desc()->src_desc.data_type == data_type::s8; + + if (this->src_pd_.desc()->format == any) + CHECK(this->src_pd_.set_format(nhwc)); + if (this->dst_pd_.desc()->format == any) + CHECK(this->dst_pd_.set_format(nhwc)); + if (this->weights_pd_.desc()->format == any) + CHECK(this->weights_pd_.set_format(this->with_groups() + ? (is_sign_input ? gOIhw4i16o4i_s8s8 : gOIhw4i16o4i) + : (is_sign_input ? OIhw4i16o4i_s8s8 : OIhw4i16o4i))); + if (this->bias_pd_.desc()->format == any) + CHECK(this->bias_pd_.set_format(x)); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); + + return status::success; + } }; template friend void init_rtus_driver(conv_t *self); - _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t(const pd_t *pd, - const input_vector &inputs, - const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - , kernel_(nullptr), rtus_driver_(nullptr), ws_per_thread_(0) - , scratch_(nullptr), local_scales_(nullptr) + + jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t(const pd_t *apd, + const input_vector &inputs, const output_vector &outputs) + : cpu_primitive_t(apd, inputs, outputs) + , kernel_(nullptr), rtus_driver_(nullptr) { - kernel_ = new jit_avx512_core_x8s8s32x_1x1_conv_kernel(conf_.jcp_, - *conf_.attr()); + kernel_ = new jit_avx512_core_x8s8s32x_1x1_conv_kernel(pd()->jcp_, + *pd()->attr()); init_rtus_driver(this); - if (conf_.jcp_.signed_input && conf_.jcp_.ver != ver_vnni) { - size_t scales_size = ((conf_.attr()->output_scales_.count_ == 1) - ? 
16 - : conf_.attr()->output_scales_.count_); - local_scales_ = (float *)malloc(sizeof(float) * scales_size, 64); - for (size_t i = 0; i < scales_size; i++) { - local_scales_[i] = conf_.attr()->output_scales_.scales_[i] * - (1.f / conf_.jcp_.wei_adj_scale); - } - } } - ~_jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t() { + + ~jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t() { delete kernel_; delete rtus_driver_; - free(scratch_); - if (local_scales_) free(local_scales_); } typedef typename prec_traits::type src_data_t; @@ -138,32 +137,23 @@ struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t : public cpu_primitive_t typedef typename prec_traits::type dst_data_t; typedef typename prec_traits::type acc_data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); + void execute_forward() const; void execute_forward_thr(const int ithr, const int nthr, const src_data_t *src, const wei_data_t *weights, - const char *bias, dst_data_t *dst); - pd_t conf_; - jit_avx512_core_x8s8s32x_1x1_conv_kernel *kernel_; + const char *bias, dst_data_t *dst, + const memory_tracking::grantor_t &scratchpad) const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + jit_avx512_core_x8s8s32x_1x1_conv_kernel *kernel_; rtus_driver_t *rtus_driver_; - size_t ws_per_thread_; - src_data_t *scratch_; - float* local_scales_; }; -template -using jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t = - _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; - -template -using jit_avx512_core_x8s8s32x_1x1_convolution_relu_t = - _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t; } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_deconvolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_deconvolution.hpp new file mode 100644 index 0000000..426c13f --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_deconvolution.hpp @@ -0,0 +1,162 @@ + +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef CPU_JIT_AVX512_CORE_X8S8S32X_1X1_DECONVOLUTION_HPP +#define CPU_JIT_AVX512_CORE_X8S8S32X_1X1_DECONVOLUTION_HPP + +#include "c_types_map.hpp" +#include "cpu_deconvolution_pd.hpp" +#include "cpu_engine.hpp" +#include "cpu_reducer.hpp" +#include "mkldnn_thread.hpp" +#include "utils.hpp" +#include "cpu_convolution_pd.hpp" +#include "type_helpers.hpp" +#include "primitive_iterator.hpp" + +#include "jit_uni_1x1_conv_utils.hpp" +#include "jit_avx512_core_x8s8s32x_1x1_convolution.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +template +struct jit_avx512_core_x8s8s32x_1x1_deconvolution_fwd_t + : public cpu_primitive_t { + struct pd_t : public cpu_deconvolution_fwd_pd_t { + pd_t(engine_t *engine, const deconvolution_desc_t *adesc, + const primitive_attr_t *attr, + const deconvolution_fwd_pd_t *hint_fwd_pd) + : cpu_deconvolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) + , conv_pd_(nullptr) {} + + pd_t(const pd_t &other) + : cpu_deconvolution_fwd_pd_t(other) + , conv_pd_(other.conv_pd_->clone()) + , conv_supports_bias_(other.conv_supports_bias_) {} + + ~pd_t() { delete conv_pd_; } + + DECLARE_DECONVOLUTION_PD_T( + jit_avx512_core_x8s8s32x_1x1_deconvolution_fwd_t); + + status_t init_convolution() { + + convolution_desc_t cd; + status_t status; + + auto dd = this->desc(); + status = conv_desc_init(&cd, prop_kind::forward_training, + alg_kind::convolution_direct, &(dd->src_desc), + &(dd->weights_desc), &(dd->bias_desc), &(dd->dst_desc), + dd->strides, dd->dilates, dd->padding[0], dd->padding[1], + dd->padding_kind); + + if (status == status::success) { + status = mkldnn_primitive_desc::create< + typename mkldnn::impl::cpu:: + jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t::pd_t>(&conv_pd_, + (op_desc_t *)&cd, &(this->attr_), this->engine_, + nullptr); + } + + if (status == status::success) { + status = set_default_params(); + } + + return status; + }; + + virtual status_t init() override { + using namespace prop_kind; + status_t status; + + assert(this->engine()->kind() == engine_kind::cpu); + bool ok = true && utils::one_of(this->desc()->prop_kind, + prop_kind::forward_training, + prop_kind::forward_inference) + && this->desc()->alg_kind == alg_kind::deconvolution_direct + && !this->has_zero_dim_memory() + && this->desc()->src_desc.data_type == src_type + && this->desc()->dst_desc.data_type == dst_type + && this->desc()->weights_desc.data_type == data_type::s8 + && IMPLICATION(this->with_bias(), + utils::one_of(this->desc()->bias_desc.data_type, + data_type::f32, data_type::s32, + data_type::s8, data_type::u8)) + && this->desc()->accum_data_type == data_type::s32; + + if (ok) + status = init_convolution(); + else + status = status::unimplemented; + + return status; + } + + protected: + virtual status_t set_default_params() { + using namespace memory_format; + auto conv_1x1_pd_ = static_cast::pd_t *>(conv_pd_); + CHECK(this->src_pd_.set_format( + conv_1x1_pd_->src_pd()->desc()->format)); + CHECK(this->dst_pd_.set_format( + conv_1x1_pd_->dst_pd()->desc()->format)); + CHECK(this->weights_pd_.set_format( + conv_1x1_pd_->weights_pd()->desc()->format)); + if (this->with_bias()) + CHECK(this->bias_pd_.set_format( + conv_1x1_pd_->weights_pd(1)->desc()->format)); + return status::success; + } + + primitive_desc_t *conv_pd_; + bool conv_supports_bias_; + }; + + jit_avx512_core_x8s8s32x_1x1_deconvolution_fwd_t(const pd_t *apd, + const input_vector &inputs, const output_vector &outputs) + : 
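
`init_convolution()` below is the whole trick behind the new 1×1 int8 deconvolution: it rebuilds a `convolution_desc_t` from the deconvolution descriptor's memory descriptors, strides, dilations and padding, instantiates the existing 1×1 convolution pd, and `execute()` forwards to the inner primitive wholesale. The delegation pattern in miniature (all names illustrative, not the mkl-dnn API):

```cpp
#include <memory>

// The point is the descriptor translation plus execute() forwarding.
struct conv_desc_t { /* strides, padding, memory descriptors ... */ };
struct deconv_desc_t {
    conv_desc_t as_conv() const { return conv_desc_t{}; } // cf. conv_desc_init
};

struct conv_primitive_t {
    explicit conv_primitive_t(const conv_desc_t &) {}
    void execute() const { /* jit 1x1 int8 convolution */ }
};

struct deconv_primitive_t {
    explicit deconv_primitive_t(const deconv_desc_t &d)
        : conv_(new conv_primitive_t(d.as_conv())) {}
    void execute() const { conv_->execute(); } // wholesale forwarding
    std::unique_ptr<conv_primitive_t> conv_;
};

int main() {
    deconv_primitive_t p{deconv_desc_t{}};
    p.execute();
}
```

`set_default_params()` then copies the chosen memory formats back from the inner convolution pd, so the deconvolution advertises exactly the layouts the 1×1 kernel wants.
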
cpu_primitive_t(apd, inputs, outputs), conv_p_(nullptr) {} + + ~jit_avx512_core_x8s8s32x_1x1_deconvolution_fwd_t() { + delete this->conv_p_; + } + + virtual void execute(event_t *e) const { + switch (pd()->desc()->prop_kind) { + case prop_kind::forward_training: + case prop_kind::forward_inference: (conv_p_)->execute(e); break; + default: assert(!"invalid prop_kind"); + } + e->set_state(event_t::ready); + } + +private: + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + primitive_t *conv_p_; +}; + +} +} +} + +#endif /* CPU_JIT_AVX512_CORE_X8S8S32X_1X1_DECONVOLUTION_HPP */ diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_conv_kernel.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_conv_kernel.cpp index 9acad2e..054fe4e 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_conv_kernel.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_conv_kernel.cpp @@ -15,9 +15,11 @@ *******************************************************************************/ #include "c_types_map.hpp" +#include "memory_tracking.hpp" #include "nstl.hpp" #include "type_helpers.hpp" #include "utils.hpp" + #include "cpu_memory.hpp" #include "jit_avx512_core_x8s8s32x_conv_kernel.hpp" @@ -29,77 +31,85 @@ namespace impl { namespace cpu { using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; using namespace mkldnn::impl::utils; using namespace Xbyak; namespace { -void pick_loop_order(jit_conv_conf_t &jcp) +void pick_loop_order(jit_conv_conf_t &jcp, int nthr) { jcp.loop_order = loop_cwgn; - if (jcp.ngroups > 1) + if (jcp.ngroups > 1) { jcp.loop_order = loop_ngcw; -} -} - -bool jit_avx512_core_x8s8s32x_fwd_kernel::maybe_relu(int position) -{ - using namespace primitive_kind; - const auto &p = attr_.post_ops_; - - if (position == 0) { - /* relu before sum */ - return false - || jcp.with_eltwise - || p.contain(eltwise, 0) - || (jcp.dst_dt == data_type::u8 && !p.contain(sum, 0)); - } else if (position == 1) { - /* relu after sum */ - const int sum_idx = p.contain(sum, 0) - ? 0 : (p.contain(sum, 1) ? 1 : -1); - if (sum_idx == -1) - return false; - - return false - || p.contain(eltwise, sum_idx + 1) - || jcp.dst_dt == data_type::u8; + if (jcp.mb < nthr) + jcp.loop_order = loop_nhwcg; } - - return false; +} } -void jit_avx512_core_x8s8s32x_fwd_kernel::prepare_output(int ur_w) +template +void _jit_avx512_core_x8s8s32x_fwd_kernel::prepare_output(int ur_w) { - for (int k = 0; k < jcp.nb_oc_blocking; k++) + int nb_oc_block + = jcp.is_depthwise ? 
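
`pick_loop_order` now sees the thread count: grouped convolutions still default to `loop_ngcw`, but when the minibatch alone cannot occupy every thread (`mb < nthr`) it switches to `loop_nhwcg` so the spatial and channel dimensions contribute parallelism too. Mirrored directly:

```cpp
#include <cstdio>

enum loop_order_t { loop_cwgn, loop_ngcw, loop_nhwcg };

// Mirror of pick_loop_order(jcp, nthr) as changed in this hunk.
loop_order_t pick_loop_order(int ngroups, int mb, int nthr) {
    loop_order_t order = loop_cwgn;
    if (ngroups > 1) {
        order = loop_ngcw;
        if (mb < nthr)
            order = loop_nhwcg; // batch too small: parallelize over h/w too
    }
    return order;
}

int main() {
    // A depthwise layer with batch 1 on a 28-thread machine picks nhwcg.
    std::printf("%d\n", pick_loop_order(/*ngroups=*/32, /*mb=*/1, /*nthr=*/28));
}
```
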
jcp.nb_ch_blocking : jcp.nb_oc_blocking; + for (int k = 0; k < nb_oc_block; k++) for (int j = 0; j < ur_w; j++) { - Zmm zmm = zmm_out(j, k); - vpxord(zmm, zmm, zmm); + Vmm vmm = vmm_out(j, k); + vpxord(vmm, vmm, vmm); } if (jcp.signed_input) { xor_(reg_scratch, reg_scratch); - Reg8 _t8 = reg_scratch.cvt8(); - mov(_t8, (int8_t)-128); - vpbroadcastb(zmm_shift, _t8); + if (jcp.is_depthwise && !jcp.is_fast_depthwise) { + Reg32 _t32 = reg_scratch.cvt32(); + mov(_t32, (uint32_t)128); + vpbroadcastd(vmm_shift, _t32); + } else { + Reg8 _t8 = reg_scratch.cvt8(); + mov(_t8, (int8_t)128); + vpbroadcastb(vmm_shift, _t8); + } } + if (jcp.is_fast_depthwise) { + vpxord(zmm_zero_blend, zmm_zero_blend, zmm_zero_blend); + } +} + +template +const Vmm _jit_avx512_core_x8s8s32x_fwd_kernel:: + vmm_mask(const Vmm vmm_in, bool mask_flag, bool store) { + return vmm_in; } -void jit_avx512_core_x8s8s32x_fwd_kernel::cvt2ps(data_type_t type_in, - zmm_t zmm_in, const Xbyak::Operand &op, bool mask_flag) { - zmm_t zmm = mask_flag ? zmm_in | ktail_mask | T_z : zmm_in; +template<> +const Zmm _jit_avx512_core_x8s8s32x_fwd_kernel:: + vmm_mask(const Zmm zmm_in, bool mask_flag, bool store) { + return mask_flag ? (store ? zmm_in | ktail_mask : zmm_in | ktail_mask | T_z) + : zmm_in; +} + + +template +void _jit_avx512_core_x8s8s32x_fwd_kernel::cvt2ps(data_type_t type_in, + const Vmm vmm_in, const Operand &op, bool mask_flag) { + //const Vmm vmm = mask_flag ? vmm_in | ktail_mask | T_z : vmm_in; + const Vmm vmm = vmm_mask(vmm_in, mask_flag); switch (type_in) { case data_type::f32: - case data_type::s32: vmovups(zmm, op); break; - case data_type::s8: vpmovsxbd(zmm, op); break; - case data_type::u8: vpmovzxbd(zmm, op); break; + case data_type::s32: vmovups(vmm, op); break; + case data_type::s8: vpmovsxbd(vmm, op); break; + case data_type::u8: vpmovzxbd(vmm, op); break; default: assert(!"unsupported data type"); } if (type_in != data_type::f32) - vcvtdq2ps(zmm_in, zmm_in); + vcvtdq2ps(vmm_in, vmm_in); } -void jit_avx512_core_x8s8s32x_fwd_kernel::store_output(int ur_w, - int last_oc_block_flag) -{ - int nb_oc_block = jcp.nb_oc_blocking; +template +void _jit_avx512_core_x8s8s32x_fwd_kernel::store_output( + int ur_w, bool last_oc_block_flag) { + int nb_oc_block + = jcp.is_depthwise ? jcp.nb_ch_blocking : jcp.nb_oc_blocking; + int oc_block = jcp.is_depthwise ? jcp.ch_block : jcp.oc_block; mov(reg_bias, ptr[param1 + GET_OFF(bias)]); mov(reg_ptr_scales, ptr[param1 + GET_OFF(scales)]); @@ -108,71 +118,122 @@ void jit_avx512_core_x8s8s32x_fwd_kernel::store_output(int ur_w, const auto &p = attr_.post_ops_; const int sum_idx = p.find(primitive_kind::sum); - const float *p_sum_scale = (sum_idx != -1) - ? &p.entry_[sum_idx].sum.scale - : nullptr; + const float *p_sum_scale = nullptr; + if (sum_idx != -1) { + const auto &p_entry = p.entry_[sum_idx]; + p_sum_scale = &p_entry.sum.scale; + } + if (p_sum_scale && *p_sum_scale != 1.f) mov(reg_ptr_sum_scale, (size_t)p_sum_scale); - if (jcp. 
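
The sign of the input shift flipped here from `vpsubb` of −128 to `vpaddb` of +128, which is the same byte operation in two's complement. The point of the shift: with `jcp.signed_input` the s8 source is moved into u8 range so the u8×s8 `vpmaddubsw`/`vpdpbusd` forms apply, and since Σ((x+128)·w) = Σ(x·w) + 128·Σw, a per-channel compensation term (precomputed at weight reorder and added back via `vmm_comp` at store time) restores the true sum. A scalar model:

```cpp
#include <cassert>
#include <cstdint>

int main() {
    // Toy 4-element dot product with s8 source, modeling the signed_input
    // path: shift src into u8 range, accumulate, then add the compensation.
    const int8_t src[4] = {-5, 100, -128, 7};
    const int8_t wei[4] = {3, -2, 1, 4};

    int32_t direct = 0, shifted = 0, wei_sum = 0;
    for (int i = 0; i < 4; ++i) {
        direct += src[i] * wei[i];
        uint8_t u = (uint8_t)(src[i] + 128); // vpaddb with vmm_shift = 128
        shifted += u * wei[i];               // what the u8 x s8 path sums
        wei_sum += wei[i];
    }
    int32_t compensation = -128 * wei_sum;   // precomputed per oc block
    assert(shifted + compensation == direct);
}
```
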
signed_input && jcp.ver != ver_vnni) { + if (jcp.signed_input && jcp.ver != ver_vnni) { + /* put 'wei_adj_scale = 0.5' for bias calculation */ mov(reg_bias_alpha, float2int(jcp.wei_adj_scale)); vmovq(xmm_bias_alpha(), reg_bias_alpha); - vbroadcastss(zmm_bias_alpha(), xmm_bias_alpha()); + vbroadcastss(vmm_bias_alpha(), xmm_bias_alpha()); } for (int k = 0; k < nb_oc_block; k++) { - const bool mask_flag = last_oc_block_flag == 1 && k == nb_oc_block - 1; - int scale_offset = jcp.is_oc_scale * (sizeof(float) * k * jcp.oc_block); - auto zmm_bias = zmm_tmp; - auto zmm_comp = zmm_shift; + const bool mask_flag = last_oc_block_flag && k == nb_oc_block - 1; + int scale_offset = jcp.is_oc_scale * (sizeof(float) * k * oc_block); if (jcp.with_bias) { - int bias_offset = jcp.typesize_bia * k * jcp.oc_block; + int bias_offset = jcp.typesize_bia * k * oc_block; auto bias_addr = EVEX_compress_addr(reg_bias, bias_offset); - cvt2ps(jcp.bia_dt, zmm_bias, bias_addr, mask_flag); - if (jcp. signed_input && jcp.ver != ver_vnni) - vmulps(zmm_bias, zmm_bias, zmm_bias_alpha()); + cvt2ps(jcp.bia_dt, vmm_bias, bias_addr, mask_flag); + if (jcp.signed_input && jcp.ver != ver_vnni) + /* bias *= 0.5 */ + vmulps(vmm_bias, vmm_bias, vmm_bias_alpha()); } if (jcp.signed_input) { - int comp_offset = sizeof(int32_t) * k * jcp.oc_block; + int comp_offset = sizeof(int32_t) * k * oc_block; auto comp_addr = EVEX_compress_addr(reg_compensation, comp_offset); - cvt2ps(data_type::s32, zmm_comp, comp_addr, mask_flag); + cvt2ps(data_type::s32, vmm_comp, comp_addr, mask_flag); } + /* add to zmm_accum: compensation, bias and permute */ for (int j = 0; j < ur_w; j++) { - int aux_output_offset - = jcp.typesize_out * (k * jcp.oc_block - + j * jcp.oc_without_padding * jcp.ngroups); - auto addr = EVEX_compress_addr(reg_out, aux_output_offset); - - Zmm zmm = zmm_out(j, k); - vcvtdq2ps(zmm, zmm); + Vmm vmm = vmm_out(j, k); + if (jcp.is_fast_depthwise) + vpermd(zmm_out(j, k), zmm_permute, zmm_out(j, k)); + vcvtdq2ps(vmm, vmm); if (jcp.signed_input) - vaddps(zmm, zmm, zmm_comp); + vaddps(vmm, vmm, vmm_comp); if (jcp.with_bias) - vaddps(zmm, zmm, zmm_bias); + vaddps(vmm, vmm, vmm_bias); - zmm_t mask_zmm = mask_flag ? 
zmm | ktail_mask | T_z : zmm; - vmulps(mask_zmm, zmm, + const Vmm vmm_k = vmm_mask(vmm, mask_flag); + vmulps(vmm_k, vmm, EVEX_compress_addr(reg_ptr_scales, scale_offset)); - if (maybe_relu(0)) { - vpxord(zmm_zero, zmm_zero, zmm_zero); - vmaxps(zmm, zmm_zero, zmm); + } + } + + int eltwise_inj_idx = 0; + int depthwise_inj_idx = 0; + for (int i = 0; i < p.len_; i++) { + auto& post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + if (ur_w == jcp.ur_w) + eltwise_injectors[eltwise_inj_idx]->compute_vector_range(0, nb_oc_block * jcp.ur_w); + else + for (int k = 0; k < nb_oc_block; k++) + eltwise_injectors[eltwise_inj_idx]->compute_vector_range(k * jcp.ur_w, k * jcp.ur_w + ur_w); + + eltwise_inj_idx++; + } else if (post_op.is_depthwise()) { + mov(reg_d_weights, reinterpret_cast(post_op.depthwise.weights_data)); + mov(reg_d_bias, reinterpret_cast(post_op.depthwise.biases_data)); + + add(reg_d_weights, ptr[param1 + GET_OFF(oc_off)]); + add(reg_d_bias, ptr[param1 + GET_OFF(oc_off)]); + + for (int k = 0; k < nb_oc_block; k++) { + depthwise_injectors[depthwise_inj_idx]->compute_vector_range( + k * jcp.ur_w, k * jcp.ur_w + ur_w, reg_d_weights, reg_d_bias); + + add(reg_d_weights, oc_block * sizeof(float)); + add(reg_d_bias, oc_block * sizeof(float)); } - if (p_sum_scale) { // post_op: sum - vpxord(zmm_zero, zmm_zero, zmm_zero); - auto zmm_prev_dst = zmm_zero; - cvt2ps(jcp.dst_dt, zmm_prev_dst, addr, mask_flag); - if (*p_sum_scale == 1.f) - vaddps(zmm, zmm_prev_dst); - else - vfmadd231ps(zmm, zmm_prev_dst, zword_b[reg_ptr_sum_scale]); + + depthwise_inj_idx++; + } else if (post_op.is_sum(false)) { + for (int k = 0; k < nb_oc_block; k++) { + const bool mask_flag = last_oc_block_flag && k == nb_oc_block - 1; + for (int j = 0; j < ur_w; j++) { + int aux_output_offset + = jcp.typesize_out + * (k * oc_block + + j * jcp.oc_without_padding * jcp.ngroups); + auto addr = EVEX_compress_addr(reg_out, aux_output_offset); + Zmm zmm = zmm_out(j, k); + cvt2ps(jcp.dst_dt, vmm_prev_dst, addr, mask_flag); + if (*p_sum_scale == 1.f) + vaddps(zmm, vmm_prev_dst); + else + vfmadd231ps(zmm, vmm_prev_dst, zword_b[reg_ptr_sum_scale]); + } } - if (maybe_relu(1)) { - vpxord(zmm_zero, zmm_zero, zmm_zero); - vmaxps(zmm, zmm_zero, zmm); + } + } + + /* write out register to output_addr */ + for (int k = 0; k < nb_oc_block; k++) { + const bool mask_flag = last_oc_block_flag && k == nb_oc_block - 1; + for (int j = 0; j < ur_w; j++) { + Vmm vmm = vmm_out(j, k); + if (jcp.dst_dt == data_type::u8) { + vpxord(vmm_zero, vmm_zero, vmm_zero); + vmaxps(vmm, vmm_zero, vmm); } + if (jcp.dst_dt != data_type::f32) { + /* Note: using Zmm for rounding in Xmm/Ymm kernel + because there is no instruction to do rounding + from Xmm/Ymm -> Xmm/Ymm. + Embedded rounding is not supported for Xmm. + TODO: maybe avoid Zmm if it helps performance.*/ + Zmm zmm = zmm_out(j, k); if (attr_.round_mode_ == round_mode::nearest) vcvtps2dq(zmm | T_rn_sae, zmm); else if (attr_.round_mode_ == round_mode::down) @@ -183,26 +244,120 @@ void jit_avx512_core_x8s8s32x_fwd_kernel::store_output(int ur_w, } for (int j = 0; j < ur_w; j++) { - int aux_output_offset = jcp.typesize_out * (k * jcp.oc_block - + j * jcp.oc_without_padding * jcp.ngroups); + int aux_output_offset = jcp.typesize_out + * (k * oc_block + j * jcp.oc_without_padding * jcp.ngroups); auto addr = EVEX_compress_addr(reg_out, aux_output_offset); - Zmm zmm = zmm_out(j, k); - zmm_t r_zmm = mask_flag ? 
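
The f32→int conversion at store time carries an embedded rounding override: `T_rn_sae` is round-to-nearest-even, and the `round_mode::down` branch presumably uses the round-toward-−∞ variant (the instruction itself is cut off in the hunk). The observable difference, modeled with the C runtime's rounding modes:

```cpp
#include <cfenv>
#include <cmath>
#include <cstdio>

int main() {
    const float v[] = {2.5f, 3.5f, -2.5f, -0.7f};
    std::fesetround(FE_TONEAREST); // like vcvtps2dq(zmm | T_rn_sae, zmm)
    for (float x : v)
        std::printf("nearest-even(%4.1f) = %2.0f\n", x, std::nearbyint(x));
    std::fesetround(FE_DOWNWARD);  // the round_mode::down branch
    for (float x : v)
        std::printf("downward(%4.1f)     = %2.0f\n", x, std::nearbyint(x));
}
```

Note that 2.5 rounds to 2 but 3.5 rounds to 4 under nearest-even, which is why int8 inference results can differ by one ULP from a naive round-half-up reference.
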
zmm | ktail_mask : zmm; + Vmm vmm = vmm_out(j, k); + const Vmm r_vmm = vmm_mask(vmm, mask_flag, true); + switch (jcp.dst_dt) { case data_type::f32: - case data_type::s32: vmovups(addr, r_zmm); break; - case data_type::s8: vpmovsdb(addr, r_zmm); break; - case data_type::u8: vpmovusdb(addr, r_zmm); break; + case data_type::s32: vmovups(addr, r_vmm); break; + case data_type::s8: vpmovsdb(addr, r_vmm); break; + case data_type::u8: vpmovusdb(addr, r_vmm); break; default: assert(!"unknown dst_dt"); } } } + } -void jit_avx512_core_x8s8s32x_fwd_kernel::compute_ker(int ur_w, - int pad_l, int pad_r, int last_ic_block_flag, bool h_padded) -{ +template +void _jit_avx512_core_x8s8s32x_fwd_kernel::compute_ker_dw( + int ur_w, int pad_l, int pad_r, ic_block_t last_ic_block_flag, bool h_padded) { + assert(!"invalid group blocking for depthwise convolution"); +} + +template <> +void _jit_avx512_core_x8s8s32x_fwd_kernel::compute_ker_dw( + int ur_w, int pad_l, int pad_r, ic_block_t last_ic_block_flag, bool h_padded) { + auto input_offset = [=](int oi, int ii, int ki) { + return jcp.typesize_in + * ((ki * (jcp.dilate_w + 1) + oi * jcp.stride_w - pad_l) + * jcp.ngroups + + ii * jcp.ch_block); + }; + + auto kernel_offset = [=](int ii, int ki) { + return jcp.typesize_in * ((ii * jcp.kh * jcp.kw + ki) * jcp.ch_block); + }; + + auto compute = [=](Zmm vreg_acc, Zmm vreg_wei, + Zmm vreg_src) { + // okay for depthwise since src is zero-extended + if (jcp.ver == ver_vnni) { + vpdpbusd(vreg_acc, vreg_src, vreg_wei); + } else { + // zmm_src is a tmp register that can be safely overwritten here + vpmaddwd(vreg_src, vreg_src, vreg_wei); + vpaddd(vreg_acc, vreg_acc, vreg_src); + } + }; + + for (int ki = 0; ki < jcp.kw; ki++) { + for (int ii = 0; ii < jcp.nb_ch_blocking; ii++) { + int aux_kernel_offset = kernel_offset(ii, ki); + if (jcp.is_fast_depthwise) { + vbroadcasti32x4(zmm_wei, + EVEX_compress_addr(aux_reg_ker, aux_kernel_offset)); + vpblendmb(zmm_wei | kblend_mask, zmm_zero_blend, zmm_wei); + } else { + vpmovsxbd(zmm_wei, + EVEX_compress_addr(aux_reg_ker, aux_kernel_offset)); + } + if (h_padded) { + if (jcp.ver == ver_vnni) { + vpxord(zmm_src, zmm_src, zmm_src); + vpaddb(zmm_src, zmm_src, vmm_shift); + } + for (int jj = 0; jj < ur_w; jj++) { + if (jcp.ver != ver_vnni) { + vpxord(zmm_src, zmm_src, zmm_src); + vpaddb(zmm_src, zmm_src, vmm_shift); + } + compute(zmm_out(jj, ii), zmm_wei, zmm_src); + } + } else { + const bool mask_flag = last_ic_block_flag != no_last_block + && ii == jcp.nb_ch_blocking - 1; + const Zmm r_zmm_src = mask_flag ? zmm_src | ktail_mask : zmm_src; + int jj_start = get_ow_start(ki, pad_l); + int jj_end = get_ow_end(ur_w, ki, pad_r); + int start_ = jcp.signed_input ? 0 : jj_start; + int end_ = jcp.signed_input ? 
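
The `compute` lambda above is the per-lane core of the depthwise kernel: with VNNI a single `vpdpbusd` multiplies four u8×s8 pairs per dword lane and accumulates into s32; without it, `vpmaddwd` plus `vpaddd` reproduce the result (safe here, as the comment says, because the depthwise source was zero-extended to words, so the 16-bit stage cannot saturate). One dword lane in scalar form:

```cpp
#include <cassert>
#include <cstdint>

// One 32-bit lane of vpdpbusd: acc += sum over i<4 of u8(src[i]) * s8(wei[i]).
int32_t dpbusd_lane(int32_t acc, const uint8_t src[4], const int8_t wei[4]) {
    for (int i = 0; i < 4; ++i)
        acc += (int32_t)src[i] * wei[i];
    return acc;
}

int main() {
    const uint8_t src[4] = {200, 13, 0, 255};
    const int8_t wei[4] = {-7, 42, 5, -1};
    int32_t acc = dpbusd_lane(1000, src, wei);
    // -1400 + 546 + 0 - 255 = -1109; 1000 - 1109 = -109
    assert(acc == -109);
}
```
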
ur_w : jj_end; + for (int jj = start_; jj < end_; jj++) { + if (jj >= jj_start && jj < jj_end) { + int aux_input_offset = input_offset(jj, ii, ki); + if (jcp.is_fast_depthwise) { + vbroadcasti32x4(zmm_src, + EVEX_compress_addr(aux_reg_inp, aux_input_offset)); + } else { + vpmovzxbd(r_zmm_src, + EVEX_compress_addr(aux_reg_inp, aux_input_offset)); + } + if (jcp.signed_input) { + vpaddb(zmm_src, zmm_src, vmm_shift); + } + } else { + if (jcp.signed_input) { + vpxord(zmm_src, zmm_src, zmm_src); + vpaddb(zmm_src, zmm_src, vmm_shift); + } + } + compute(zmm_out(jj, ii), zmm_wei, zmm_src); + } + } + } + } +} + +template +void _jit_avx512_core_x8s8s32x_fwd_kernel::compute_ker(int ur_w, int pad_l, + int pad_r, ic_block_t last_ic_block_flag, bool h_padded) { + if (jcp.is_depthwise) + return compute_ker_dw(ur_w, pad_l, pad_r, last_ic_block_flag, h_padded); + int kw = jcp.kw; int stride_w = jcp.stride_w; int ic_block = jcp.ic_block; @@ -221,17 +376,13 @@ void jit_avx512_core_x8s8s32x_fwd_kernel::compute_ker(int ur_w, * ((ii * jcp.nb_ic * jcp.kh * jcp.kw + ki) * ch_block_all + 4 * ic * oc_block); }; - auto compute = [=](Zmm vreg_acc, Zmm vreg_wei, Zmm vreg_src) { + auto compute = [=](Vmm vreg_acc, Vmm vreg_wei, Vmm vreg_src) { if (jcp.ver == ver_vnni) { - // also okay for depthwise since src is zero-extended vpdpbusd(vreg_acc, vreg_src, vreg_wei); - } else if (jcp.is_depthwise) { - vpmulld(zmm_tmp, vreg_src, vreg_wei); - vpaddd(vreg_acc, vreg_acc, zmm_tmp); } else { - vpmaddubsw(zmm_tmp, vreg_src, vreg_wei); - vpmaddwd(zmm_tmp, zmm_tmp, zmm_one); - vpaddd(vreg_acc, vreg_acc, zmm_tmp); + vpmaddubsw(vmm_tmp, vreg_src, vreg_wei); + vpmaddwd(vmm_tmp, vmm_tmp, vmm_one); + vpaddd(vreg_acc, vreg_acc, vmm_tmp); } }; @@ -242,69 +393,61 @@ void jit_avx512_core_x8s8s32x_fwd_kernel::compute_ker(int ur_w, int _start = (jcp.signed_input) ? 0 : jj_start; int _end = (jcp.signed_input) ? ur_w : jj_end; /* Skip the last loads of input if (ic%16)/4 < ic_block/4 */ - int icb = jcp.is_depthwise - ? 1 - : (last_ic_block_flag != no_last_block) - ? div_up((jcp.ic_without_padding % ic_block), 4) - : ic_block / 4; + int icb = (last_ic_block_flag != no_last_block) + ? 
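
The non-depthwise `compute` path (`vpmaddubsw`, then `vpmaddwd` against a vector of ones, then `vpaddd`) is where the 0.5 weight pre-scaling earns its keep: `vpmaddubsw` produces saturating s16 pairwise sums, and full-scale u8·s8 products overflow s16 (255·127 + 255·127 = 64770 > 32767). Halving the weights keeps every pairwise sum exact; the bias and output-scale adjustments elsewhere in this patch undo the factor. A scalar check:

```cpp
#include <cassert>
#include <cstdint>

// One s16 lane of vpmaddubsw: pairwise u8*s8 multiply-add with saturation.
int16_t maddubsw_lane(uint8_t a0, uint8_t a1, int8_t b0, int8_t b1) {
    int32_t s = (int32_t)a0 * b0 + (int32_t)a1 * b1;
    if (s > 32767) s = 32767;
    if (s < -32768) s = -32768;
    return (int16_t)s;
}

int main() {
    // Full-scale inputs overflow s16 and silently saturate:
    assert(maddubsw_lane(255, 255, 127, 127) == 32767); // exact sum is 64770
    // With weights pre-scaled by 0.5 (127 -> 63) the sum stays exact:
    assert(maddubsw_lane(255, 255, 63, 63) == 32130);
}
```
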
div_up((jcp.ic_without_padding % ic_block), 4) + : ic_block / 4; for (int ic = 0; ic < icb; ic++) { if (h_padded == true) { - Zmm inp = zmm_inp(0,nb_oc_block); + /* fill padded area with shifted values */ + Vmm inp = vmm_inp(0,nb_oc_block); vpxord(inp, inp, inp); - vpsubb(inp, inp, zmm_shift); + vpaddb(inp, inp, vmm_shift); } else { for (int jj = _start; jj < _end; jj++) { int aux_input_offset = input_offset(jj, ic, ki); if (jj >= jj_start && jj < jj_end) { - if (jcp.is_depthwise) { - vpmovzxbd(zmm_inp(jj, nb_oc_block), - EVEX_compress_addr( - aux_reg_inp, aux_input_offset)); - } else if (last_ic_block_flag == last_sp_block + if (last_ic_block_flag == last_sp_block && tail_size != 0 && ic == icb - 1) { - Xmm xmm_tmp = Xmm(zmm_inp(jj, nb_oc_block).getIdx()); + Xmm xmm_tmp = Xmm(vmm_inp(jj, nb_oc_block).getIdx()); for (int r = 0; r < tail_size; ++r) vpinsrb(xmm_tmp, xmm_tmp, ptr[aux_reg_inp + aux_input_offset + r], r); - vpbroadcastd(zmm_inp(jj, nb_oc_block), xmm_tmp); + vpbroadcastd(vmm_inp(jj, nb_oc_block), xmm_tmp); } else { - vpbroadcastd(zmm_inp(jj, nb_oc_block), + vpbroadcastd(vmm_inp(jj, nb_oc_block), EVEX_compress_addr( aux_reg_inp, aux_input_offset)); } if (jcp.signed_input) - vpsubb(zmm_inp(jj, nb_oc_block), - zmm_inp(jj, nb_oc_block), zmm_shift); + vpaddb(vmm_inp(jj, nb_oc_block), + vmm_inp(jj, nb_oc_block), vmm_shift); } else { + /* fill padded area with shifted values */ if (jcp.signed_input) { - Zmm inp = zmm_inp(jj, nb_oc_block); + Vmm inp = vmm_inp(jj, nb_oc_block); vpxord(inp, inp, inp); - vpsubb(inp, inp, zmm_shift); + vpaddb(inp, inp, vmm_shift); } } } } for (int ii = 0; ii < nb_oc_block; ii++) { int aux_kernel_offset = kernel_offset(ii, ic, ki); - if (jcp.is_depthwise) - vpmovsxbd( - zmm_wei, EVEX_compress_addr(aux_reg_ker, - aux_kernel_offset)); - else - vmovups(zmm_wei, EVEX_compress_addr(aux_reg_ker, - aux_kernel_offset)); + vmovups(vmm_wei, + EVEX_compress_addr(aux_reg_ker, aux_kernel_offset)); for (int jj = _start; jj < _end; jj++) { - Zmm inp = (h_padded == true) - ? zmm_inp(0,nb_oc_block) : zmm_inp(jj, nb_oc_block); - compute(zmm_out(jj, ii), zmm_wei, inp); + Vmm inp = (h_padded == true) + ? 
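// Note: the vpaddb(..., vmm_shift) calls above are the usual zero-point trick
// for signed sources. A sketch of the identity (not part of the patch): the
// constant term is the per-output-channel compensation that the *_s8s8 weight
// formats append after the weights and that store_output adds back.
#include <cstdint>

//   sum_i s_i * w_i == sum_i (s_i + 128) * w_i - 128 * sum_i w_i
// (s_i + 128) fits in u8, so the u8*s8 hardware path applies; the
// 128 * sum_i w_i term is precomputed at weight-reorder time.
static int32_t dot_shifted(const int8_t *s, const int8_t *w, int n,
                           int32_t comp /* = -128 * sum(w) */) {
    int32_t acc = 0;
    for (int i = 0; i < n; ++i)
        acc += uint8_t(s[i] + 128) * w[i];
    return acc + comp;
}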
vmm_inp(0,nb_oc_block) : vmm_inp(jj, nb_oc_block); + compute(vmm_out(jj, ii), vmm_wei, inp); } } } } } -void jit_avx512_core_x8s8s32x_fwd_kernel::kh_loop(int ur_w, - int pad_l, int pad_r, int last_ic_block_flag) -{ + +template +void _jit_avx512_core_x8s8s32x_fwd_kernel::kh_loop( + int ur_w, int pad_l, int pad_r, ic_block_t last_ic_block_flag) { Label kh_label, skip_kh_loop; Label t_overflow_label, no_t_overflow_label, b_overflow_label, no_b_overflow_label; @@ -318,7 +461,7 @@ void jit_avx512_core_x8s8s32x_fwd_kernel::kh_loop(int ur_w, mov(aux_reg_ker, reg_ker); if (jcp.signed_input) { - mov(reg_overflow, ptr[param1 + GET_OFF(t_overflow)]); + mov(reg_overflow, ptr[param1 + GET_OFF(t_overflow)]); cmp(reg_overflow, 0); je(no_t_overflow_label, T_NEAR); L(t_overflow_label); { @@ -348,7 +491,7 @@ void jit_avx512_core_x8s8s32x_fwd_kernel::kh_loop(int ur_w, } L(skip_kh_loop); if (jcp.signed_input) { - mov(reg_overflow, ptr[param1 + GET_OFF(b_overflow)]); + mov(reg_overflow, ptr[param1 + GET_OFF(b_overflow)]); cmp(reg_overflow, 0); je(no_b_overflow_label, T_NEAR); L(b_overflow_label); { @@ -363,7 +506,8 @@ void jit_avx512_core_x8s8s32x_fwd_kernel::kh_loop(int ur_w, } } -void jit_avx512_core_x8s8s32x_fwd_kernel::icb_loop( +template +void _jit_avx512_core_x8s8s32x_fwd_kernel::icb_loop( int ur_w, int pad_l, int pad_r, bool is_last_sp_block) { prepare_output(ur_w); @@ -372,7 +516,7 @@ void jit_avx512_core_x8s8s32x_fwd_kernel::icb_loop( Label icb_label; mov(reg_icb, jcp.nb_ic); L(icb_label); - if (jcp.ic_without_padding != jcp.ic) { + if (jcp.ngroups % jcp.ch_block != 0 || jcp.ic_without_padding != jcp.ic) { Label common_ker, end_ker; cmp(reg_icb, 1); // The last IC block @@ -406,26 +550,46 @@ void jit_avx512_core_x8s8s32x_fwd_kernel::icb_loop( Label common_store, end_store; if (jcp.is_depthwise) - cmp(reg_oc_blocks, jcp.nb_ch - 1); + cmp(reg_oc_blocks, jcp.nb_ch - jcp.nb_ch_blocking); else cmp(reg_oc_blocks, jcp.nb_oc - jcp.nb_oc_blocking); jne(common_store, T_NEAR); - store_output(ur_w, 1); + store_output(ur_w, true); // last oc block jmp(end_store, T_NEAR); L(common_store); - store_output(ur_w, 0); + store_output(ur_w, false); L(end_store); } else { - store_output(ur_w, 0); + store_output(ur_w, false); } } -void jit_avx512_core_x8s8s32x_fwd_kernel::generate() +template +void _jit_avx512_core_x8s8s32x_fwd_kernel::generate() { + const auto &p = attr_.post_ops_; + for (int i = 0; i < p.len_; i++) { + auto &post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + eltwise_injectors.push_back(new jit_uni_eltwise_injector_f32( + this, + post_op.eltwise.alg, + post_op.eltwise.alpha, + post_op.eltwise.beta + )); + } else if (post_op.is_depthwise()) { + depthwise_injectors.push_back(new jit_uni_depthwise_injector_f32( + this, + post_op.depthwise.alg + )); + } + } + + Label permute_index_table; int inp_shift_pad = jcp.typesize_in * (jcp.ur_w * jcp.stride_w - jcp.l_pad) * jcp.ic_without_padding * jcp.ngroups; int inp_shift_pad_second_block = -1 * jcp.typesize_in * jcp.l_pad @@ -437,10 +601,20 @@ void jit_avx512_core_x8s8s32x_fwd_kernel::generate() (jcp.ur_w * jcp.oc_without_padding * jcp.ngroups); preamble(); - xor_(reg_scratch, reg_scratch); - Reg16 _t16 = reg_scratch.cvt16(); - mov(_t16, 0x1); - vpbroadcastw(zmm_one, _t16); + if (jcp.is_depthwise) { + zmm_src = Zmm(jcp.max_regs_ur); + if (jcp.is_fast_depthwise) { + zmm_zero_blend = Zmm(jcp.max_regs_ur + 1); + zmm_permute = Zmm(jcp.max_regs_ur + 2); + } + } + + if (!jcp.is_depthwise && jcp.ver != ver_vnni) { + xor_(reg_scratch, reg_scratch); + Reg16 _t16 = 
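// Note: for context, this is a sketch of the user-side attributes that reach
// the new injector path built in generate() above (assuming the mkl-dnn 0.x
// C++ API; names unverified against this exact revision). The rewritten
// post_ops_ok() below accepts eltwise/depthwise ops chained with an optional
// sum, and generate() lowers each entry to one injector.
#include "mkldnn.hpp"

mkldnn::primitive_attr make_conv_attr() {
    mkldnn::post_ops ops;
    ops.append_eltwise(/*scale=*/1.f, mkldnn::algorithm::eltwise_relu,
                       /*alpha=*/0.f, /*beta=*/0.f);
    ops.append_sum(/*scale=*/1.f);  // accumulate into the existing dst
    mkldnn::primitive_attr attr;
    attr.set_post_ops(ops);
    return attr;
}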
reg_scratch.cvt16(); + mov(_t16, 0x1); + vpbroadcastw(vmm_one, _t16); + } mov(reg_inp, ptr[param1 + GET_OFF(src)]); mov(reg_out, ptr[param1 + GET_OFF(dst)]); @@ -456,6 +630,14 @@ void jit_avx512_core_x8s8s32x_fwd_kernel::generate() mov(regw_tmp, mask); kmovw(ktail_mask, regw_tmp); } + if (jcp.is_fast_depthwise) { + // prepare mask register for blending weights + mov(reg_scratch, 0x8888444422221111); + kmovq(kblend_mask, reg_scratch); + // load permute indices from data section + mov(reg_scratch, permute_index_table); + vmovdqu32(zmm_permute, ptr[reg_scratch]); + } int r_pad = nstl::max(0, (jcp.ow - 1) * jcp.stride_w + (jcp.kw - 1) * (jcp.dilate_w + 1) @@ -626,6 +808,18 @@ void jit_avx512_core_x8s8s32x_fwd_kernel::generate() L(end_label); } postamble(); + + for (auto& inj : eltwise_injectors) + inj->prepare_table(); + + if (jcp.is_fast_depthwise) { + align(64); + L(permute_index_table); + const uint32_t _idx[] + = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + for (size_t i = 0; i < sizeof(_idx) / sizeof(_idx[0]); ++i) + dd(_idx[i]); + } } bool jit_avx512_core_x8s8s32x_fwd_kernel::post_ops_ok( @@ -634,27 +828,18 @@ bool jit_avx512_core_x8s8s32x_fwd_kernel::post_ops_ok( using namespace primitive_kind; const auto &p = attr.post_ops_; - auto is_relu = [&](int idx) { - return p.entry_[idx].kind == eltwise - && p.entry_[idx].eltwise.scale == 1. - && p.entry_[idx].eltwise.alg == alg_kind::eltwise_relu - && p.entry_[idx].eltwise.alpha == 0.; - }; + auto is_eltwise = [&](int idx) { return p.entry_[idx].is_eltwise(); }; + auto is_depthwise = [&](int idx) { return p.entry_[idx].is_depthwise(); }; + auto is_sum = [&](int idx) { return p.entry_[idx].is_sum(false); }; + auto is_simple = [&](int idx) { return is_eltwise(idx) || is_depthwise(idx); }; switch (p.len_) { - case 0: return true; - case 1: return true - && IMPLICATION(jcp.with_eltwise, p.contain(sum, 0)) - && IMPLICATION(!jcp.with_eltwise, is_relu(0) || p.contain(sum, 0)); - case 2: return true - && IMPLICATION(jcp.with_eltwise, p.contain(sum, 0) && is_relu(1)) - && IMPLICATION(!jcp.with_eltwise, false - || (p.contain(sum, 0) && is_relu(1)) - || (p.contain(sum, 1) && is_relu(0))); - case 3: return true - && jcp.with_eltwise == false - && (is_relu(0) && p.contain(sum, 1) && is_relu(2)); - default: return false; + case 0: return true; + case 1: return is_simple(0) || is_sum(0); + case 2: return (is_sum(0) && is_simple(1)) || (is_simple(0) && is_sum(1)) || + (is_simple(0) && is_simple(1)); + case 3: return (is_simple(0) && is_sum(1) && is_simple(2)); + default: return false; } return false; @@ -664,7 +849,7 @@ status_t jit_avx512_core_x8s8s32x_fwd_kernel::init_conf(jit_conv_conf_t &jcp, const convolution_desc_t &cd, cpu_memory_t::pd_t &src_pd, cpu_memory_t::pd_t &weights_pd, cpu_memory_t::pd_t &dst_pd, cpu_memory_t::pd_t &bias_pd, const primitive_attr_t &attr, - int nthreads, bool with_relu, float relu_negative_slope) + int nthreads) { using namespace prop_kind; @@ -702,22 +887,15 @@ status_t jit_avx512_core_x8s8s32x_fwd_kernel::init_conf(jit_conv_conf_t &jcp, jcp.stride_w = cd.strides[1]; jcp.src_fmt = src_d.format(); jcp.with_bias = cd.bias_desc.format != memory_format::undef; - jcp.with_eltwise = with_relu; - jcp.eltwise_alpha = relu_negative_slope; + jcp.ur_h = 1; jcp.dilate_h = cd.dilates[0]; jcp.dilate_w = cd.dilates[1]; - if (!IMPLICATION(with_relu, relu_negative_slope == 0.)) - return status::unimplemented; - jcp.signed_input = (src_d.data_type() == data_type::s8) ? 
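// Note: the fast-depthwise constants above are easy to misread, so here is
// one reading of where they come from (a sketch, not part of the patch): the
// permute table is a 4x4 transpose of the 16 dword indices, and the 64-bit
// blend mask keeps exactly one weight byte per dword, at byte position L
// within 128-bit lane L, zeroing the rest so a single vpdpbusd multiplies
// each channel's source byte only with its own weight.
#include <cstdint>

static uint64_t kblend_mask_value() {
    uint64_t m = 0;
    for (int lane = 0; lane < 4; ++lane)       // 128-bit lanes of the zmm
        for (int byte = 0; byte < 16; ++byte)  // bytes within a lane
            if (byte % 4 == lane)              // keep byte L of each dword
                m |= 1ull << (lane * 16 + byte);
    return m;  // == 0x8888444422221111, the constant loaded above
}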
true : false; jcp.is_depthwise = true && with_groups && everyone_is(1, jcp.ic, jcp.oc); - if (jcp.is_depthwise && jcp.signed_input) - return status::unimplemented; - if (jcp.is_depthwise) { jcp.ch_block = 16; jcp.ic_block = 1; @@ -728,11 +906,17 @@ status_t jit_avx512_core_x8s8s32x_fwd_kernel::init_conf(jit_conv_conf_t &jcp, jcp.oc_block = 16; if (jcp.ngroups == 1) { + /* For non grouped convolutions, pad channels by 16 if needed */ jcp.oc = rnd_up(jcp.oc, jcp.oc_block); jcp.ic = rnd_up(jcp.ic, jcp.ic_block); + } else if (jcp.ngroups != 1 && jcp.ic % jcp.ic_block != 0) { + /* For grouped convolutions, MKL-DNN doesn't support padding. + Use Ymm when channels per group is multiple of 8, + Xmm when channels per group is multiple of 4 */ + jcp.ic_block = jcp.ic % 8 == 0 ? 8 : 4; + jcp.oc_block = jcp.ic_block; } - - if (jcp.ic % jcp.ic_block != 0) + if (jcp.ic % jcp.ic_block !=0 || jcp.oc % jcp.oc_block != 0) return status::unimplemented; } @@ -742,16 +926,30 @@ status_t jit_avx512_core_x8s8s32x_fwd_kernel::init_conf(jit_conv_conf_t &jcp, if (!post_ops_ok(jcp, attr)) return status::unimplemented; - jcp.ver = ver_avx512_core; - if (mayiuse(avx512_core_vnni)) - jcp.ver = ver_vnni; + jcp.ver = mayiuse(avx512_core_vnni) ? ver_vnni : ver_avx512_core; + jcp.is_fast_depthwise = true && jcp.is_depthwise && jcp.ver == ver_vnni + && jcp.ngroups % jcp.ch_block == 0; // for groups not multiple of 16 would require byte masking for load from src + if (jcp.is_depthwise) { + jcp.max_regs_ur = jcp.is_fast_depthwise + ? (jcp.signed_input ? 27 : 28) + : (jcp.signed_input ? 29 : 30); + } else { + jcp.max_regs_ur = jcp.ver == ver_vnni ? 31 : 28; + } - const int regs = (jcp.ver == ver_vnni && !jcp.is_depthwise) ? 31 : 28; + memory_format_t w_format; + if (jcp.ic_block == 16 || jcp.ch_block == 16) { + w_format = with_groups + ? (jcp.is_depthwise ? (jcp.signed_input ? Goihw16g_s8s8 : Goihw16g) + : (jcp.signed_input) ? gOIhw4i16o4i_s8s8 : gOIhw4i16o4i) + : (jcp.signed_input) ? OIhw4i16o4i_s8s8 : OIhw4i16o4i; + /* Non-grouped conv will always be padded by 16*/ + } else if (with_groups && jcp.ic_block == 8) { + w_format = jcp.signed_input ? gOIhw2i8o4i_s8s8 : gOIhw2i8o4i; + } else { + w_format = jcp.signed_input ? gOIhw4o4i_s8s8 : gOIhw4o4i; + } - const auto w_format = with_groups - ? (jcp.is_depthwise ? Goihw16g - : (jcp.signed_input) ? gOIhw4i16o4i_s8s8 : gOIhw4i16o4i) - : (jcp.signed_input) ? OIhw4i16o4i_s8s8 : OIhw4i16o4i; if (weights_d.format() == any) CHECK(weights_pd.set_format(w_format)); if (weights_d.format() != w_format) @@ -785,20 +983,26 @@ status_t jit_avx512_core_x8s8s32x_fwd_kernel::init_conf(jit_conv_conf_t &jcp, jcp.nb_ic = jcp.ic / jcp.ic_block; jcp.nb_oc = jcp.oc / jcp.oc_block; + // Try to use 4 channel-groups at a time to avoid false sharing (depthwise) + jcp.nb_ch_blocking = jcp.is_depthwise + ? (jcp.nb_ch % 4 == 0 ? 4 : jcp.nb_ch % 2 == 0 ? 2 : 1) + : 1; + // If OC blocking is incommensurate with the number of OC blocks (general // requirement for all convolutions), or if it results in an unrolling // factor smaller than the left padding (special requirement for SSD:fc6), // then search for a smaller OC blocking that satisfies both constraints. 
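// Note: the grouped-convolution fallback above is the heart of this change.
// A plain restatement of the selection in init_conf (a mirror for clarity,
// not a new API): 16 -> Zmm kernel, 8 -> Ymm, 4 -> Xmm, else unimplemented.
// Non-grouped convolutions may pad channels up to 16; grouped ones may not,
// so they fall back to narrower register blocks when the group size allows.
static int pick_channel_block(int ngroups, int ic_per_group) {
    if (ngroups == 1 || ic_per_group % 16 == 0) return 16;
    if (ic_per_group % 8 == 0) return 8;
    if (ic_per_group % 4 == 0) return 4;
    return -1;  // status::unimplemented
}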
jcp.nb_oc_blocking = nstl::min(4, jcp.nb_oc); for (; jcp.nb_oc_blocking > 1; jcp.nb_oc_blocking--) { - int ur_w = regs / (jcp.nb_oc_blocking + 1); + int ur_w = jcp.max_regs_ur / (jcp.nb_oc_blocking + 1); if (jcp.nb_oc % jcp.nb_oc_blocking == 0 && (jcp.l_pad <= ur_w && IMPLICATION(jcp.ow != 1, jcp.ow % ur_w != 1))) break; } - jcp.ur_w = regs / (jcp.nb_oc_blocking + 1); + jcp.ur_w = jcp.max_regs_ur + / (jcp.is_depthwise ? jcp.nb_ch_blocking : jcp.nb_oc_blocking + 1); if (jcp.ow < jcp.ur_w) jcp.ur_w = jcp.ow; jcp.ur_w_tail = jcp.ow % jcp.ur_w; @@ -840,7 +1044,7 @@ status_t jit_avx512_core_x8s8s32x_fwd_kernel::init_conf(jit_conv_conf_t &jcp, if (r_pad_no_tail > jcp.ur_w) return status::unimplemented; - pick_loop_order(jcp); + pick_loop_order(jcp, nthreads); jcp.nb_ic_L2 = jcp.nb_ic; @@ -854,6 +1058,18 @@ status_t jit_avx512_core_x8s8s32x_fwd_kernel::init_conf(jit_conv_conf_t &jcp, return status::success; } +void jit_avx512_core_x8s8s32x_fwd_kernel::init_scratchpad( + memory_tracking::registrar_t &scratchpad, const jit_conv_conf_t &jcp, + const primitive_attr_t &attr) { + if (jcp.signed_input && jcp.ver != ver_vnni) { + size_t count = nstl::max(attr.output_scales_.count_, jcp.ic_block); + scratchpad.book(key_conv_adjusted_scales, sizeof(float) * count); + } +} + +template struct _jit_avx512_core_x8s8s32x_fwd_kernel; +template struct _jit_avx512_core_x8s8s32x_fwd_kernel; +template struct _jit_avx512_core_x8s8s32x_fwd_kernel; } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_conv_kernel.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_conv_kernel.hpp index d243004..0e8e7ca 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_conv_kernel.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_conv_kernel.hpp @@ -18,109 +18,134 @@ #define CPU_JIT_AVX512_CORE_X8S8S32X_CONV_KERNEL_HPP #include "c_types_map.hpp" +#include "memory_tracking.hpp" + #include "cpu_memory.hpp" #include "jit_generator.hpp" #include "jit_primitive_conf.hpp" +#include "jit_uni_eltwise.hpp" +#include "jit_uni_depthwise.hpp" namespace mkldnn { namespace impl { namespace cpu { -struct jit_avx512_core_x8s8s32x_fwd_kernel : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_x8s8s32x_conv_fwd_ker_t) +template +struct _jit_avx512_core_x8s8s32x_fwd_kernel : public jit_generator { + DECLARE_CPU_JIT_AUX_FUNCTIONS(_jit_avx512_core_x8s8s32x_conv_fwd_ker_t) enum { STATE_FIRST_DST_LOAD = 0x1U }; - jit_avx512_core_x8s8s32x_fwd_kernel(jit_conv_conf_t ajcp, + _jit_avx512_core_x8s8s32x_fwd_kernel(jit_conv_conf_t ajcp, const primitive_attr_t &attr) : jcp(ajcp), attr_(attr) { generate(); - jit_ker = (void (*)(jit_conv_call_s *))getCode(); + jit_ker_ = (void (*)(jit_conv_call_s *))getCode(); + } + + ~_jit_avx512_core_x8s8s32x_fwd_kernel() { + for (auto inj : eltwise_injectors) + delete inj; + eltwise_injectors.clear(); + + for (auto inj : depthwise_injectors) + delete inj; + depthwise_injectors.clear(); } - static bool post_ops_ok(jit_conv_conf_t &jcp, - const primitive_attr_t &attr); - static status_t init_conf(jit_conv_conf_t &jcp, - const convolution_desc_t &cd, - cpu_memory_t::pd_t &src_pd, - cpu_memory_t::pd_t &weights_pd, - cpu_memory_t::pd_t &dst_pd, - cpu_memory_t::pd_t &bias_pd, - const primitive_attr_t &attr, - int nthreads, - bool with_relu = false, - float relu_negative_slope = 0.); jit_conv_conf_t jcp; const primitive_attr_t &attr_; - void (*jit_ker)(jit_conv_call_s *); + void 
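// Note: the max_regs_ur bookkeeping above reduces to simple arithmetic: each
// unrolled output pixel needs one accumulator per oc block plus one broadcast
// input register, so ur_w * (nb_oc_blocking + 1) <= max_regs_ur; depthwise
// drops the "+ 1" because its source register (zmm_src) is fixed. A worked
// example (not part of the patch):
//
//   VNNI, non-depthwise: max_regs_ur = 31, nb_oc_blocking = 4
//     ur_w = 31 / (4 + 1) = 6 output pixels per kernel invocation
//     -> 6 * 4 = 24 accumulators + 6 inputs = 30 registers; vmm31 = weights
//   pre-VNNI: max_regs_ur = 28 keeps vmm28/vmm29 free for vmm_tmp / vmm_one.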
(*jit_ker_)(jit_conv_call_s *); private: - using reg64_t = const Xbyak::Reg64; - using zmm_t = const Xbyak::Zmm; - using xmm_t = const Xbyak::Xmm; + nstl::vector*> eltwise_injectors; + nstl::vector*> depthwise_injectors; + enum { typesize = sizeof(float), ker_reg_base_idx = 28, + ker_dw_reg_base_idx = 30, }; - enum { + typedef enum { no_last_block, last_ic_block, last_sp_block, - }; - - reg64_t reg_inp = r8; - reg64_t reg_ker = r9; - reg64_t reg_out = r10; - reg64_t aux_reg_inp = r11; - reg64_t reg_ptr_sum_scale = r11; - reg64_t aux_reg_ker = r12; - reg64_t reg_owb = r12; - - reg64_t reg_scratch = r14; - reg64_t reg_kj = rax; - reg64_t reg_overflow = rax; - reg64_t reg_ptr_scales = rax; - reg64_t reg_oi = rbx; - reg64_t reg_bias = rdx; - reg64_t reg_compensation = reg_scratch; - reg64_t reg_kh = abi_not_param1; - reg64_t param = abi_param1; - reg64_t reg_tmp = rbp; - reg64_t imm_addr64 = r15; - reg64_t reg_oc_blocks = rsi; - reg64_t reg_icb = reg_bias; - reg64_t reg_bias_alpha = reg_kh; - - Xbyak::Opmask ktail_mask = Xbyak::Opmask(2); - - zmm_t zmm_tmp = zmm_t(28); - zmm_t zmm_one = zmm_t(29); - zmm_t zmm_scales = zmm_t(30); - zmm_t zmm_shift = zmm_t(30); - zmm_t zmm_zero = zmm_t(31); - zmm_t zmm_wei = zmm_t(31); - - zmm_t zmm_out(int i_ur, int i_oc) { + } ic_block_t; + + /* data regs */ + const Xbyak::Reg64 reg_ptr_scales = rax; + const Xbyak::Reg64 reg_inp = r8; + const Xbyak::Reg64 reg_ker = r9; + const Xbyak::Reg64 reg_out = r10; + const Xbyak::Reg64 aux_reg_inp = r11; + const Xbyak::Reg64 reg_ptr_sum_scale = r11; + const Xbyak::Reg64 aux_reg_ker = r12; + const Xbyak::Reg64 reg_compensation = r14; + /* counter regs */ + const Xbyak::Reg64 reg_bias_alpha = abi_not_param1; + const Xbyak::Reg64 reg_oi = rbx; + const Xbyak::Reg64 reg_bias = rdx; + const Xbyak::Reg64 reg_oc_blocks = rsi; + const Xbyak::Reg64 reg_owb = aux_reg_ker; + const Xbyak::Reg64 reg_scratch = reg_compensation; + const Xbyak::Reg64 reg_kj = reg_ptr_scales; + const Xbyak::Reg64 reg_overflow = reg_ptr_scales; + const Xbyak::Reg64 reg_icb = reg_bias; + + const Xbyak::Reg64 reg_d_weights = r15; + const Xbyak::Reg64 reg_d_bias = r13; + + const Xbyak::Opmask ktail_mask = Xbyak::Opmask(2); + const Xbyak::Opmask kblend_mask = Xbyak::Opmask(3); + + const Vmm vmm_wei = Vmm(31); + /* used during bias section of store_output */ + const Vmm vmm_comp = Vmm(30); // only for signed input + const Vmm vmm_bias = Vmm(31); + /* used during post_op sum section of store_output */ + const Vmm vmm_prev_dst = Vmm(31); + /* used during write-out section of store_output */ + const Vmm vmm_zero = Vmm(31); + + /* used in compute_ker (but set during prepare_output) */ + const Vmm vmm_shift = vmm_comp; // only for signed input + /* used in compute_ker (but only for pre-VNNI machines) */ + const Vmm vmm_tmp = Vmm(28); // not used for depthwise + const Vmm vmm_one = Vmm(29); // set at start of kernel, not used for depthwise. + + /* registers use only for depthwise + groups are always blocked by 16(padded if needed), + hence use only Zmm registers */ + const Xbyak::Zmm zmm_wei = Xbyak::Zmm(31); + Xbyak::Zmm zmm_src; + Xbyak::Zmm zmm_permute; + Xbyak::Zmm zmm_zero_blend; // used only for fast depthwise + + Vmm vmm_out(int i_ur, int i_oc) { int idx = i_ur + i_oc * jcp.ur_w; - assert(idx < ker_reg_base_idx); - return zmm_t(idx); + assert(idx < (jcp.is_depthwise + ? 
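// Note: a sketch of the accumulator numbering used by vmm_out()/vmm_inp()
// above (illustrative helper, not in the patch). The grid is row-major in
// the oc block: with ur_w = 6 and nb_oc_blocking = 4, accumulators occupy
// vmm0..vmm23 and the broadcast inputs vmm24..vmm29.
#include <cassert>

static int vmm_out_idx(int i_ur, int i_oc, int ur_w, int reg_base) {
    // reg_base is ker_reg_base_idx (28) or ker_dw_reg_base_idx (30)
    int idx = i_ur + i_oc * ur_w;
    assert(idx < reg_base);
    return idx;
}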
ker_dw_reg_base_idx : ker_reg_base_idx)); + return Vmm(idx); } - xmm_t xmm_out(int i_ur, int i_oc) { + Xbyak::Zmm zmm_out(int i_ur, int i_oc) { int idx = i_ur + i_oc * jcp.ur_w; - assert(idx < ker_reg_base_idx); - return xmm_t(idx); + assert(idx < (jcp.is_depthwise + ? ker_dw_reg_base_idx : ker_reg_base_idx)); + return Xbyak::Zmm(idx); } - zmm_t zmm_inp(int i_ic, int nb_x_blocking) { + Vmm vmm_inp(int i_ic, int nb_x_blocking) { int idx = i_ic + nb_x_blocking * jcp.ur_w; assert(idx < 31); - return zmm_t(idx); + return Vmm(idx); } - zmm_t zmm_bias_alpha() { - return zmm_t(jcp.nb_oc_blocking * jcp.ur_w); + Vmm vmm_bias_alpha() { + int nb_c_block = jcp.is_depthwise ? jcp.nb_ch_blocking : jcp.nb_oc_blocking; + return Vmm(nb_c_block * jcp.ur_w); } - xmm_t xmm_bias_alpha() { - return xmm_t(jcp.nb_oc_blocking * jcp.ur_w); + Xbyak::Xmm xmm_bias_alpha() { + int nb_c_block = jcp.is_depthwise ? jcp.nb_ch_blocking : jcp.nb_oc_blocking; + return Xbyak::Xmm(nb_c_block * jcp.ur_w); } int get_ow_start(int ki, int pad_l) { return nstl::max(0, @@ -132,17 +157,79 @@ private: * (jcp.dilate_w + 1), jcp.stride_w)); } - bool maybe_relu(int position); + void prepare_output(int ur_w); - void store_output(int ur_w, int last_oc_block_flag); - void compute_ker(int ur_w, int pad_l, int pad_r, int last_ic_block_flag, - bool h_padded = false); - void kh_loop(int ur_w, int pad_l, int pad_r, int last_ic_block_flag); + void store_output(int ur_w, bool last_oc_block_flag); + void compute_ker_dw( + int ur_w, int pad_l, int pad_r, ic_block_t last_ic_block_flag, bool h_padded); + void compute_ker(int ur_w, int pad_l, int pad_r, + ic_block_t last_ic_block_flag, bool h_padded = false); + void kh_loop(int ur_w, int pad_l, int pad_r, ic_block_t last_ic_block_flag); void icb_loop( int ur_w, int pad_l, int pad_r, bool is_last_spatial_block); void generate(); - void cvt2ps(data_type_t type_in, zmm_t zmm_in, const Xbyak::Operand &op, + void cvt2ps(data_type_t type_in, Vmm ymm_in, const Xbyak::Operand &op, bool mask_flag); + const Vmm vmm_mask(const Vmm vmm_in, bool mask_flag, bool store = false); +}; + +struct jit_avx512_core_x8s8s32x_fwd_kernel { + + jit_avx512_core_x8s8s32x_fwd_kernel(jit_conv_conf_t ajcp, + const primitive_attr_t &attr) : + jit_ker(nullptr), + zmm_kernel_(nullptr), + ymm_kernel_(nullptr), + xmm_kernel_(nullptr) { + int ch_block = ajcp.is_depthwise ? 
ajcp.ch_block : ajcp.ic_block; + switch (ch_block) { + case 16: + zmm_kernel_ = + new _jit_avx512_core_x8s8s32x_fwd_kernel( + ajcp, attr); + jit_ker = zmm_kernel_->jit_ker_; + return; + case 8: + ymm_kernel_ = + new _jit_avx512_core_x8s8s32x_fwd_kernel( + ajcp, attr); + jit_ker = ymm_kernel_->jit_ker_; + return; + case 4: + xmm_kernel_ = + new _jit_avx512_core_x8s8s32x_fwd_kernel( + ajcp, attr); + jit_ker = xmm_kernel_->jit_ker_; + return; + default: + assert(!"invalid channel blocking"); + } + } + + ~jit_avx512_core_x8s8s32x_fwd_kernel() { + delete xmm_kernel_; + delete ymm_kernel_; + delete zmm_kernel_; + } + + static bool post_ops_ok(jit_conv_conf_t &jcp, + const primitive_attr_t &attr); + + static status_t init_conf(jit_conv_conf_t &jcp, + const convolution_desc_t &cd, + cpu_memory_t::pd_t &src_pd, + cpu_memory_t::pd_t &weights_pd, + cpu_memory_t::pd_t &dst_pd, + cpu_memory_t::pd_t &bias_pd, + const primitive_attr_t &attr, + int nthreads); + static void init_scratchpad(memory_tracking::registrar_t &scratchpad, + const jit_conv_conf_t &jcp, const primitive_attr_t &attr); + + void (*jit_ker)(jit_conv_call_s *); + _jit_avx512_core_x8s8s32x_fwd_kernel *zmm_kernel_; + _jit_avx512_core_x8s8s32x_fwd_kernel *ymm_kernel_; + _jit_avx512_core_x8s8s32x_fwd_kernel *xmm_kernel_; }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_convolution.cpp index 8d1297f..e5cdcb1 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_convolution.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_convolution.cpp @@ -14,7 +14,6 @@ * limitations under the License. *******************************************************************************/ -#include "mkldnn_types.h" #include "c_types_map.hpp" #include "mkldnn_thread.hpp" #include "type_helpers.hpp" @@ -28,6 +27,7 @@ namespace cpu { using namespace mkldnn::impl::status; using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; using namespace mkldnn::impl::utils; using namespace nstl; @@ -35,37 +35,52 @@ using namespace nstl; using jit_conv_ker_t = void (*)(jit_conv_call_s *); #define wht_blk_off(d, g, ...) \ - (conf_.with_groups() \ + (pd()->with_groups() \ ? (d).blk_off((g), __VA_ARGS__) \ : (d).blk_off(__VA_ARGS__)) -template -void _jit_avx512_core_x8s8s32x_convolution_fwd_t:: -execute_forward() +template +void jit_avx512_core_x8s8s32x_convolution_fwd_t:: +execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); - const memory_desc_wrapper bias_d(conf_.weights_pd(1)); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + const memory_desc_wrapper bias_d(pd()->weights_pd(1)); - const size_t bia_dt_size = conf_.with_bias() - ? types::data_type_size(conf_.cdesc()->bias_desc.data_type) : 0; + const size_t bia_dt_size = pd()->with_bias() + ? 
types::data_type_size(pd()->desc()->bias_desc.data_type) : 0; - const auto &jcp = kernel_->jcp; + const auto &jcp = pd()->jcp_; assert(jcp.nb_oc % jcp.nb_oc_blocking == 0); + assert(jcp.nb_ch % jcp.nb_ch_blocking == 0); + + const float *oscales = pd()->attr()->output_scales_.scales_; + if (jcp.signed_input && jcp.ver != ver_vnni) { + auto local_scales = scratchpad().template get( + key_conv_adjusted_scales); + size_t count = pd()->attr()->output_scales_.count_; + float factor = 1.f / pd()->jcp_.wei_adj_scale; + if (count == 1) { + utils::array_set(local_scales, oscales[0] * factor, 16); + } else { + for (size_t c = 0; c < count; c++) + local_scales[c] = oscales[c] * factor; + } + oscales = local_scales; + } - size_t offset = (size_t)jcp.ngroups * jcp.oc * jcp.ic * jcp.kh * jcp.kw; + size_t offset = weights_d.size() - weights_d.additional_buffer_size(); auto w = const_cast(weights); int32_t* compensation = (jcp.signed_input) ? reinterpret_cast(&w[offset]) : 0; - const auto &oscales = conf_.attr()->output_scales_; int oc_chunks = jcp.nb_oc / jcp.nb_oc_blocking; - int nb_groups = jcp.nb_ch; + int nb_groups = jcp.nb_ch / jcp.nb_ch_blocking; int group_block = jcp.ch_block; int work_amount = jcp.mb * nb_groups * oc_chunks * jcp.oh * jcp.nb_ow; @@ -80,20 +95,24 @@ execute_forward() size_t dst_h_stride = dst_d.blk_off(0, 0, 1); size_t wht_h_stride = wht_blk_off(weights_d, 0, 0, 0, 1); - int n{ 0 }, gb{ 0 }, occ{ 0 }, oh_s{ 0 }, owb{ 0 }; + int n{ 0 }, gg{ 0 }, occ{ 0 }, oh_s{ 0 }, owb{ 0 }; if (jcp.loop_order == loop_cwgn) - nd_iterator_init(start, occ, oc_chunks, owb, jcp.nb_ow, gb, + nd_iterator_init(start, occ, oc_chunks, owb, jcp.nb_ow, gg, nb_groups, n, jcp.mb, oh_s, jcp.oh); else if (jcp.loop_order == loop_gncw) - nd_iterator_init(start, gb, nb_groups, n, jcp.mb, occ, oc_chunks, + nd_iterator_init(start, gg, nb_groups, n, jcp.mb, occ, oc_chunks, owb, jcp.nb_ow, oh_s, jcp.oh); else if (jcp.loop_order == loop_ngcw) - nd_iterator_init(start, n, jcp.mb, gb, nb_groups, occ, oc_chunks, + nd_iterator_init(start, n, jcp.mb, gg, nb_groups, occ, oc_chunks, owb, jcp.nb_ow, oh_s, jcp.oh); + else if (jcp.loop_order == loop_nhwcg) + nd_iterator_init(start, n, jcp.mb, oh_s, jcp.oh, owb, jcp.nb_ow, + occ, oc_chunks, gg, nb_groups); else assert(!"unsupported loop order"); while (start < end) { int ocb = occ * jcp.nb_oc_blocking; + int gb = gg * jcp.nb_ch_blocking; int g = gb * group_block; int g_oc = (g * jcp.nb_oc + ocb) * jcp.oc_block; @@ -102,6 +121,7 @@ execute_forward() int work_rem = end - start; int ih_s = -jcp.t_pad + oh_s * jcp.stride_h; int oh_e = oh_s + work_rem > jcp.oh ? jcp.oh : oh_s + work_rem; + if (jcp.loop_order == loop_nhwcg) oh_e = oh_s + 1; // step instead int ow_s = owb * jcp.ow_block; int iw_s = ow_s * jcp.stride_w; @@ -115,9 +135,7 @@ execute_forward() auto src_w = src + src_d.blk_off(n, g_ic, ih_s, iw_s); auto wht_w = weights + wht_blk_off(weights_d, gb, ocb, 0); - auto scales = (jcp.signed_input && jcp.ver != ver_vnni) - ? 
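// Note on why execute_forward rescales above: on pre-VNNI hardware the s8
// weights were halved at reorder time (wei_adj_scale = 0.5, set in init_conf)
// to avoid the vpmaddubsw saturation described earlier, so the convolution
// result is 2x too small. Multiplying every output scale by
// factor = 1 / wei_adj_scale restores it:
//
//   conv(src, w * wei_adj_scale) == wei_adj_scale * conv(src, w)
//   dst = (oscale / wei_adj_scale) * conv(src, w * wei_adj_scale)
//       == oscale * conv(src, w)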
&local_scales_[jcp.is_oc_scale * g_oc] - : &oscales.scales_[jcp.is_oc_scale * g_oc]; + auto scales = &oscales[jcp.is_oc_scale * g_oc]; for (int oj = oh_s, ij = ih_s; oj < oh_e; ++oj, ij += jcp.stride_h) { @@ -144,57 +162,48 @@ execute_forward() p.b_overflow = i_b_overflow; p.owb = owb; + p.oc_off = g_oc * sizeof(float); + kernel_->jit_ker(&p); src_w += src_h_stride * jcp.stride_h; dst_w += dst_h_stride; } if (jcp.loop_order == loop_cwgn) - nd_iterator_jump(start, end, occ, oc_chunks, owb, jcp.nb_ow, gb, + nd_iterator_jump(start, end, occ, oc_chunks, owb, jcp.nb_ow, gg, nb_groups, n, jcp.mb, oh_s, jcp.oh); else if (jcp.loop_order == loop_gncw) - nd_iterator_jump(start, end, gb, nb_groups, n, jcp.mb, occ, + nd_iterator_jump(start, end, gg, nb_groups, n, jcp.mb, occ, oc_chunks, owb, jcp.nb_ow, oh_s, jcp.oh); else if (jcp.loop_order == loop_ngcw) - nd_iterator_jump(start, end, n, jcp.mb, gb, nb_groups, occ, + nd_iterator_jump(start, end, n, jcp.mb, gg, nb_groups, occ, oc_chunks, owb, jcp.nb_ow, oh_s, jcp.oh); + else if (jcp.loop_order == loop_nhwcg) { + ++start; + nd_iterator_step(n, jcp.mb, oh_s, jcp.oh, owb, jcp.nb_ow, occ, + oc_chunks, gg, nb_groups); + } else assert(!"unsupported loop order"); } }); } -template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t; -template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t; } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_convolution.hpp index 6ac59f9..1afcda6 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_convolution.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_convolution.hpp @@ -18,11 +18,11 @@ #define CPU_JIT_AVX512_CORE_X8S8S32X_CONVOLUTION_HPP #include "c_types_map.hpp" +#include "memory_tracking.hpp" +#include "mkldnn_thread.hpp" +#include "utils.hpp" + #include "cpu_convolution_pd.hpp" -#include "cpu_engine.hpp" -#include "jit_transpose_src_utils.hpp" -#include "cpu_reducer.hpp" -#include "cpu_barrier.hpp" #include "jit_avx512_core_x8s8s32x_conv_kernel.hpp" @@ -30,99 +30,85 @@ namespace mkldnn { namespace impl { namespace cpu { -template -struct _jit_avx512_core_x8s8s32x_convolution_fwd_t : public cpu_primitive_t { - struct pd_t : public _cpu_convolution_fwd_pd_t { - pd_t(engine_t *engine, const typename pd_t::base_desc_t *adesc, +template +struct jit_avx512_core_x8s8s32x_convolution_fwd_t : public cpu_primitive_t { + struct pd_t : public cpu_convolution_fwd_pd_t { + pd_t(engine_t *engine, const convolution_desc_t *adesc, const primitive_attr_t *attr, const typename pd_t::base_class 
*hint_fwd_pd) - : _cpu_convolution_fwd_pd_t(engine, adesc, attr, - hint_fwd_pd) + : cpu_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) , jcp_() - { - } + {} + DECLARE_COMMON_PD_T( JIT_IMPL_NAME_HELPER("jit_int8:", avx512_core, ""), - _jit_avx512_core_x8s8s32x_convolution_fwd_t); + jit_avx512_core_x8s8s32x_convolution_fwd_t); - virtual status_t init() override - { + virtual status_t init() override { using namespace prop_kind; assert(this->engine()->kind() == engine_kind::cpu); + bool ok = true - && utils::one_of(this->cdesc_().prop_kind, forward_training, + && utils::one_of(this->desc()->prop_kind, forward_training, forward_inference) - && this->cdesc_().alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() - && this->cdesc_().src_desc.data_type == src_type - && this->cdesc_().dst_desc.data_type == dst_type + && this->desc()->src_desc.data_type == src_type + && this->desc()->dst_desc.data_type == dst_type && IMPLICATION(this->with_bias(), utils::one_of( - this->cdesc_().bias_desc.data_type, data_type::f32, + this->desc()->bias_desc.data_type, data_type::f32, data_type::s32, data_type::s8, data_type::u8)) - && this->cdesc_().accum_data_type == data_type::s32; - if (!ok) - return status::unimplemented; + && this->desc()->accum_data_type == data_type::s32; + if (!ok) return status::unimplemented; - return jit_avx512_core_x8s8s32x_fwd_kernel::init_conf( - jcp_, this->cdesc_(), this->src_pd_, this->weights_pd_, + status_t status = jit_avx512_core_x8s8s32x_fwd_kernel::init_conf( + jcp_, *this->desc(), this->src_pd_, this->weights_pd_, this->dst_pd_,this->bias_pd_, *this->attr(), - mkldnn_get_max_threads(), - with_relu, this->negative_slope()); + mkldnn_get_max_threads()); + if (status != status::success) return status; + + auto scratchpad = scratchpad_registry().registrar(); + jit_avx512_core_x8s8s32x_fwd_kernel::init_scratchpad(scratchpad, + jcp_, *this->attr()); + + if (status == status::success + && this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); + return status; } jit_conv_conf_t jcp_; }; - _jit_avx512_core_x8s8s32x_convolution_fwd_t(const pd_t *pd, + jit_avx512_core_x8s8s32x_convolution_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - , local_scales_(nullptr) + : cpu_primitive_t(apd, inputs, outputs) { - kernel_ = new jit_avx512_core_x8s8s32x_fwd_kernel(conf_.jcp_, - *conf_.attr()); - if (conf_.jcp_.signed_input && conf_.jcp_.ver != ver_vnni) { - size_t scales_size = (conf_.attr()->output_scales_.count_ == 1) - ? 
16 - : conf_.attr()->output_scales_.count_; - local_scales_ = (float *)malloc(sizeof(float) * scales_size, 64); - for (size_t i = 0; i < scales_size; i++) { - local_scales_[i] = conf_.attr()->output_scales_.scales_[i] * - (1.f / conf_.jcp_.wei_adj_scale); - } - } + kernel_ = new jit_avx512_core_x8s8s32x_fwd_kernel(pd()->jcp_, + *pd()->attr()); } - ~_jit_avx512_core_x8s8s32x_convolution_fwd_t() { - delete kernel_; - if (local_scales_) free(local_scales_); - }; + ~jit_avx512_core_x8s8s32x_convolution_fwd_t() { delete kernel_; } typedef typename prec_traits::type src_data_t; typedef typename prec_traits::type wei_data_t; typedef typename prec_traits::type dst_data_t; - virtual void execute(event_t *e) + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - pd_t conf_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + jit_avx512_core_x8s8s32x_fwd_kernel *kernel_; - float *local_scales_; }; -template -using jit_avx512_core_x8s8s32x_convolution_fwd_t = - _jit_avx512_core_x8s8s32x_convolution_fwd_t; - -template -using jit_avx512_core_x8s8s32x_convolution_relu_t = - _jit_avx512_core_x8s8s32x_convolution_fwd_t; - } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_deconvolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_deconvolution.cpp new file mode 100644 index 0000000..5c69879 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_deconvolution.cpp @@ -0,0 +1,928 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "jit_avx512_core_x8s8s32x_deconvolution.hpp" + +#define GET_OFF(field) offsetof(jit_deconv_call_s, field) + +namespace mkldnn { +namespace impl { +namespace cpu { + +using namespace mkldnn::impl::status; +using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; +using namespace mkldnn::impl::utils; +using namespace Xbyak; + +using namespace nstl; + +#define wht_blk_off(d, g, ...) \ + (pd()->with_groups() ? 
(d).blk_off((g), __VA_ARGS__) : \ + (d).blk_off(__VA_ARGS__)) + +status_t jit_avx512_core_x8s8s32x_deconv_fwd_kernel::init_conf( + jit_conv_conf_t &jcp, const deconvolution_desc_t &cd, + cpu_memory_t::pd_t &src_pd, cpu_memory_t::pd_t &weights_pd, + cpu_memory_t::pd_t &dst_pd, const bool with_bias, + cpu_memory_t::pd_t &bias_pd, const primitive_attr_t &attr) { + const memory_desc_wrapper src_d(&src_pd); + const memory_desc_wrapper dst_d(&dst_pd); + const memory_desc_wrapper weights_d(&weights_pd); + const memory_desc_wrapper bias_d(&bias_pd); + + if (!(mayiuse(avx512_core) + && one_of(src_d.data_type(), data_type::u8, data_type::s8) + && weights_d.data_type() == data_type::s8 + && one_of(dst_d.data_type(), data_type::f32, data_type::s32, + data_type::s8, data_type::u8))) + return status::unimplemented; + + jcp = zero(); + + const bool with_groups = weights_d.ndims() == src_d.ndims() + 1; + jcp.signed_input = src_d.data_type() == data_type::s8; + + jcp.ngroups = with_groups ? weights_d.dims()[0] : 1; + jcp.oc = dst_d.dims()[1] / jcp.ngroups; + jcp.ic = src_d.dims()[1] / jcp.ngroups; + jcp.oc_without_padding = dst_d.dims()[1] / jcp.ngroups; + jcp.ic_without_padding = src_d.dims()[1] / jcp.ngroups; + jcp.is_depthwise = true && with_groups + && utils::everyone_is(1, jcp.ic_without_padding, + jcp.oc_without_padding); + + /* TODO: future work, on hold until depthwise specialized kernel is + * implemented. */ + if (jcp.is_depthwise && jcp.signed_input) + return status::unimplemented; + + const auto w_format = jcp.is_depthwise ? Goihw16g : with_groups ? + (jcp.signed_input ? gOIhw4i16o4i_s8s8 : gOIhw4i16o4i) : + (jcp.signed_input ? OIhw4i16o4i_s8s8 : OIhw4i16o4i); + + if (dst_d.format() == any) + CHECK(dst_pd.set_format(nhwc)); + if (dst_d.format() != nhwc) + return status::unimplemented; + if (src_d.format() == any) + CHECK(src_pd.set_format(nhwc)); + if (src_d.format() != nhwc) + return status::unimplemented; + if (weights_d.format() == any) + CHECK(weights_pd.set_format(w_format)); + if (weights_d.format() != w_format) + return status::unimplemented; + + jcp.with_bias = with_bias; + if (jcp.with_bias) { + if (bias_d.format() == any) + CHECK(bias_pd.set_format(x)); + if (bias_d.format() != x) + return status::unimplemented; + } + + jcp.ndims = dst_d.ndims(); + jcp.prop_kind = cd.prop_kind; + jcp.mb = src_d.dims()[0]; + jcp.ih = src_d.dims()[2]; + jcp.iw = src_d.dims()[3]; + jcp.oh = dst_d.dims()[2]; + jcp.ow = dst_d.dims()[3]; + jcp.kh = weights_d.dims()[with_groups + 2]; + jcp.kw = weights_d.dims()[with_groups + 3]; + jcp.t_pad = cd.padding[0][0]; + jcp.l_pad = cd.padding[0][1]; + jcp.stride_h = cd.strides[0]; + jcp.stride_w = cd.strides[1]; + jcp.src_fmt = src_d.format(); + + if (jcp.is_depthwise) { + jcp.ch_block = 16; + jcp.oc_block = 1; + jcp.ic_block = 1; + } else { + jcp.ch_block = 1; + jcp.oc_block = 16; + jcp.ic_block = 16; + + if (jcp.ngroups == 1) { + jcp.oc = utils::rnd_up(jcp.oc_without_padding, jcp.oc_block); + jcp.ic = utils::rnd_up(jcp.ic_without_padding, jcp.ic_block); + } + if (jcp.ic % jcp.ic_block != 0) + return status::unimplemented; + } + + jcp.dilate_h = cd.dilates[0]; + jcp.dilate_w = cd.dilates[1]; + + if (!IMPLICATION(jcp.dilate_h, jcp.stride_h == 1) + || !IMPLICATION(jcp.dilate_w, jcp.stride_w == 1)) + return status::unimplemented; + + /* padding: bottom and right */ + jcp.b_pad = (jcp.ih - 1) * jcp.stride_h + (jcp.kh - 1) * (jcp.dilate_h + 1) + - (jcp.oh + jcp.t_pad - 1); + jcp.r_pad = (jcp.iw - 1) * jcp.stride_w + (jcp.kw - 1) * (jcp.dilate_w + 1) + - (jcp.ow + 
jcp.l_pad - 1); + + if (!post_ops_ok(jcp, attr)) + return status::unimplemented; + + const auto &p = attr.post_ops_; + const int eltwise_ind = p.find(primitive_kind::eltwise); + jcp.with_eltwise = eltwise_ind != -1; + if (jcp.with_eltwise) + jcp.eltwise = p.entry_[eltwise_ind].eltwise; + + jcp.ver = ver_avx512_core; + if (mayiuse(avx512_core_vnni)) + jcp.ver = ver_vnni; + const auto &oscales = attr.output_scales_; + jcp.is_oc_scale = oscales.mask_ == 1 << 1; + + assert(IMPLICATION(!jcp.is_oc_scale, oscales.mask_ == 0)); + + jcp.dst_dt = dst_d.data_type(); + jcp.bia_dt = jcp.with_bias ? bias_d.data_type() : data_type::undef; + jcp.typesize_bia + = jcp.with_bias ? types::data_type_size(bias_d.data_type()) : 0; + jcp.typesize_in = types::data_type_size(src_d.data_type()); + jcp.typesize_out = types::data_type_size(dst_d.data_type()); + + jcp.nb_ch = div_up(jcp.ngroups, jcp.ch_block); + jcp.nb_oc = jcp.oc / jcp.oc_block; + jcp.nb_ic = jcp.ic / jcp.ic_block; + + /* kernel blocking params */ + const int regs = jcp.ver == ver_vnni ? 30 : 28; + jcp.nb_oc_blocking = nstl::min(4, jcp.nb_oc); + for (; jcp.nb_oc_blocking > 1; jcp.nb_oc_blocking--) + if (jcp.nb_oc % jcp.nb_oc_blocking == 0 + && jcp.l_pad <= regs / (jcp.nb_oc_blocking + 1)) + break; + + jcp.ur_w = regs / (jcp.nb_oc_blocking + 1); + int l_overflow = max( + 0, ((jcp.kw - 1) * (jcp.dilate_w + 1) - jcp.l_pad) / jcp.stride_w); + + if (jcp.ow < jcp.ur_w) { + jcp.ur_w = jcp.ow; + jcp.ur_w_tail = 0; + } else { + for (; jcp.ur_w >= 1; jcp.ur_w--) { + /* ur_w should be multiple of stride_w in order + to simplify logic for get_ow_start and get_ow_end */ + bool is_multiple_of_stride = jcp.ur_w % jcp.stride_w == 0; + + /* boundary conditions: + These conditions ensure all elements close to boundary + are computed in a single call of compute loop */ + bool left_boundary_covered = jcp.ur_w >= l_overflow * jcp.stride_w; + jcp.ur_w_tail = jcp.ow % jcp.ur_w; + int r_overflow_no_tail + = max(0, ((jcp.kw - 1) * (jcp.dilate_w + 1) + - max(0, jcp.r_pad) - jcp.ur_w_tail) + / jcp.stride_w); + bool right_boundary_covered + = jcp.ur_w >= r_overflow_no_tail * jcp.stride_w; + + if (is_multiple_of_stride && left_boundary_covered + && right_boundary_covered) + break; + else if (jcp.ur_w == 1) + /* The boundary conditions above are also important + to maintain simplicity of calls to icb_loop, + if those conditions are not satisfied, + then special cases will need to be added + to use correct l_overflow/r_overflow values + when different iterations of compute loop + work on the locations close to boundary. + So to keep code simple, return unimplemented + for extreme case when a good ur_w cannot be found. + */ + return status::unimplemented; + } + } + + jcp.wei_adj_scale + = (jcp.signed_input && (jcp.ver != ver_vnni)) ? (1.f / 2.f) : 1.f; + + jcp.loop_order = jcp.ngroups > 1 ? loop_ngc : loop_cgn; + return status::success; +} + +bool jit_avx512_core_x8s8s32x_deconv_fwd_kernel::maybe_eltwise(int position) { + using namespace primitive_kind; + const auto &p = attr_.post_ops_; + + if (position == 0) { + /* eltwise before sum */ + return p.contain(eltwise, 0); + } else if (position == 1) { + /* eltwise after sum */ + return p.contain(sum, 0) && p.contain(eltwise, 1); + } + return false; +} + +void jit_avx512_core_x8s8s32x_deconv_fwd_kernel::compute_eltwise(int ur_w) { + int nb_oc_block + = jcp.is_depthwise ? 
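// Note: the derived paddings above are the transposed-convolution output-size
// identity rearranged; a worked check (numbers are illustrative only):
//
//   oh == (ih - 1) * stride_h + (kh - 1) * (dilate_h + 1) + 1 - t_pad - b_pad
//   => b_pad = (ih - 1) * stride_h + (kh - 1) * (dilate_h + 1)
//              - (oh + t_pad - 1)
//
//   e.g. ih = 4, stride_h = 2, kh = 3, dilate_h = 0, t_pad = 1, oh = 8:
//     b_pad = 3 * 2 + 2 * 1 - (8 + 1 - 1) = 0, and likewise for r_pad.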
jcp.nb_ch_blocking : jcp.nb_oc_blocking; + if (ur_w == jcp.ur_w) + eltwise_injector_->compute_vector_range(0, nb_oc_block * jcp.ur_w); + else + for (int k = 0; k < nb_oc_block; k++) + eltwise_injector_->compute_vector_range( + k * jcp.ur_w, k * jcp.ur_w + ur_w); +} + +bool jit_avx512_core_x8s8s32x_deconv_fwd_kernel::post_ops_ok( + jit_conv_conf_t &jcp, const primitive_attr_t &attr) { + using namespace primitive_kind; + const auto &p = attr.post_ops_; + + auto is_eltwise = [&](int idx) { return p.entry_[idx].is_eltwise(); }; + + switch (p.len_) { + case 0: return true; + case 1: return is_eltwise(0) || p.contain(sum, 0); + case 2: + return (p.contain(sum, 0) && is_eltwise(1)) + || (p.contain(sum, 1) && is_eltwise(0)); + default: return false; + } + + return false; +} + +void jit_avx512_core_x8s8s32x_deconv_fwd_kernel::init_scratchpad( + memory_tracking::registrar_t &scratchpad, const jit_conv_conf_t &jcp, + const primitive_attr_t &attr) { + if (jcp.signed_input && jcp.ver != ver_vnni) { + size_t count = nstl::max(attr.output_scales_.count_, 16); + scratchpad.book(key_conv_adjusted_scales, sizeof(float) * count); + } +} + +void jit_avx512_core_x8s8s32x_deconv_fwd_kernel::compute_ker(int ur_w, + int l_overflow, int r_overflow, ker_block_t last_ic_block_flag, + bool h_padded) { + + const int ch_block_all = jcp.ch_block * jcp.ic_block * jcp.oc_block; + const int ur_w_stride = jcp.signed_input ? 1 : jcp.stride_w; + + auto src_offset = [=](int oj, int icb, int ki) { + return jcp.typesize_in + * (((oj + jcp.l_pad - ki * (jcp.dilate_w + 1)) / jcp.stride_w) + * jcp.ngroups * jcp.ic_without_padding + + icb * 4); + }; + + auto kernel_offset = [=](int ocb, int icb, int ki) { + return jcp.typesize_in + * (ocb * jcp.nb_ic * jcp.kh * jcp.kw * ch_block_all + + icb * jcp.oc_block * jcp.ic_block / 4 + + ki * ch_block_all); + }; + + auto compute = [=](zmm_t vreg_acc, zmm_t vreg_wei, zmm_t vreg_src) { + if (jcp.ver == ver_vnni) { + vpdpbusd(vreg_acc, vreg_src, vreg_wei); + } else if (jcp.is_depthwise) { + vpmulld(zmm_tmp, vreg_src, vreg_wei); + vpaddd(vreg_acc, vreg_acc, zmm_tmp); + } else { + vpmaddubsw(zmm_tmp, vreg_src, vreg_wei); + vpmaddwd(zmm_tmp, zmm_tmp, zmm_one); + vpaddd(vreg_acc, vreg_acc, zmm_tmp); + } + }; + + for (int ki = 0; ki < jcp.kw; ki++) { + + int jj_start = get_ow_start(ki, l_overflow); + int jj_end = get_ow_end(ur_w, ki, r_overflow); + + int _start = (jcp.signed_input) ? 0 : jj_start; + int _end = (jcp.signed_input) ? ur_w : jj_end; + + int tail_size = jcp.ic_without_padding % 4; + int n_ic_blocks = jcp.is_depthwise ? + 1 : + (last_ic_block_flag & ~no_last_block ? 
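// Note: the src_offset lambda above and the "% jcp.stride_w == 0" guard in
// the loads below encode which source element feeds a given output position
// in a transposed convolution. A scalar sketch (not part of the patch):
// src[i] contributes to dst[oj] through tap ki only when
//   oj + l_pad - ki * (dilate_w + 1) == i * stride_w,
// i.e. the numerator is non-negative and divisible by stride_w.
static bool contributes(int oj, int ki, int l_pad, int stride_w,
                        int dilate_w, int *i) {
    int num = oj + l_pad - ki * (dilate_w + 1);
    if (num < 0 || num % stride_w != 0) return false;
    *i = num / stride_w;
    return true;
}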
+ div_up(jcp.ic_without_padding % jcp.ic_block, + 4) : + jcp.ic_block / 4); + + for (int icb1 = 0; icb1 < n_ic_blocks; icb1++) { + if (h_padded == true) { + /* fill padded area with shifted values */ + Zmm inp = zmm_inp(0, jcp.nb_oc_blocking); + vpxord(inp, inp, inp); + vpsubb(inp, inp, zmm_shift); + } else { + + for (int jj = _start; jj < _end; jj += ur_w_stride) { + + int aux_src_off = src_offset(jj, icb1, ki); + + if (jj >= jj_start && jj < jj_end + && ((jj + jcp.l_pad - ki) % jcp.stride_w == 0)) { + if (jcp.is_depthwise) { + vpmovzxbd(zmm_inp(jj, jcp.nb_oc_blocking), + EVEX_compress_addr( + aux_reg_src, aux_src_off)); + } else if ((last_ic_block_flag & last_sp_block) + && tail_size != 0 && icb1 == n_ic_blocks - 1) { + xmm_t xmm_tmp = xmm_t( + zmm_inp(jj, jcp.nb_oc_blocking).getIdx()); + for (int r = 0; r < tail_size; ++r) + vpinsrb(xmm_tmp, xmm_tmp, + ptr[aux_reg_src + aux_src_off + r], r); + vpbroadcastd( + zmm_inp(jj, jcp.nb_oc_blocking), xmm_tmp); + } else { + vpbroadcastd(zmm_inp(jj, jcp.nb_oc_blocking), + EVEX_compress_addr( + aux_reg_src, aux_src_off)); + } + if (jcp.signed_input) + vpsubb(zmm_inp(jj, jcp.nb_oc_blocking), + zmm_inp(jj, jcp.nb_oc_blocking), zmm_shift); + } else { + /* fill padded area with shifted values */ + if (jcp.signed_input) { + Zmm inp = zmm_inp(jj, jcp.nb_oc_blocking); + vpxord(inp, inp, inp); + vpsubb(inp, inp, zmm_shift); + } + } + } + } + for (int ocb = 0; ocb < jcp.nb_oc_blocking; ocb++) { + int aux_filt_off = kernel_offset(ocb, icb1, ki); + + if (_end - _start > 0) { + if (jcp.is_depthwise) + vpmovsxbd(zmm_wei, + EVEX_compress_addr(aux_reg_filt, aux_filt_off)); + else + vmovups(zmm_wei, + EVEX_compress_addr(aux_reg_filt, aux_filt_off)); + } + for (int jj = _start; jj < _end; jj += ur_w_stride) { + Zmm inp = (h_padded == true) ? + zmm_inp(0, jcp.nb_oc_blocking) : + zmm_inp(jj, jcp.nb_oc_blocking); + compute(zmm_out(jj, ocb), zmm_wei, inp); + } + } + } + } +} + +void jit_avx512_core_x8s8s32x_deconv_fwd_kernel::kh_loop(int ur_w, + int l_overflow, int r_overflow, ker_block_t last_ic_block_flag) { + + int ch_block_all = jcp.ch_block * jcp.ic_block * jcp.oc_block; + int shift_src_ih = jcp.typesize_in * (jcp.dilate_h + 1) * jcp.iw + * jcp.ngroups * jcp.ic_without_padding; + const int stride_h = jcp.signed_input ? 1 : jcp.stride_h; + int shift_filt_kh = jcp.typesize_in * jcp.kw * ch_block_all * stride_h; + + Label kh_loop_label, skip_kh_loop; + Label t_overflow_label, no_t_overflow_label, b_overflow_label, + no_b_overflow_label; + + mov(aux_reg_src, reg_src); + mov(aux_reg_filt, reg_filt); + + if (jcp.signed_input) { + /* Weights are transposed, so first compute 'bottom' padding. 
*/ + mov(reg_overflow, ptr[param1 + GET_OFF(b_overflow)]); + cmp(reg_overflow, 0); + je(no_b_overflow_label, T_NEAR); + L(b_overflow_label); { + compute_ker(ur_w, 0, 0, last_ic_block_flag, true); + + add(aux_reg_filt, shift_filt_kh); + dec(reg_overflow); + cmp(reg_overflow, 0); + jg(b_overflow_label, T_NEAR); + } + L(no_b_overflow_label); + } + + mov(reg_kh, ptr[param1 + GET_OFF(kh_padding)]); + + if (jcp.signed_input || ((!jcp.signed_input) + && ((min(jcp.t_pad, jcp.b_pad) < 0) + || ((jcp.kh - 1) * (jcp.dilate_h + 1) + < nstl::max(jcp.t_pad, jcp.b_pad))))) { + cmp(reg_kh, 0); + je(skip_kh_loop, T_NEAR); + } + + L(kh_loop_label); { + compute_ker(ur_w, l_overflow, r_overflow, last_ic_block_flag, false); + sub(aux_reg_src, shift_src_ih); + add(aux_reg_filt, shift_filt_kh); + dec(reg_kh); + + /* Insert weight compensation in stride 'holes' */ + if (jcp.signed_input && jcp.stride_h > 1) { + Label kh_comp_loop; + + cmp(reg_kh, 0); + je(skip_kh_loop, T_NEAR); + mov(reg_comp_strides, jcp.stride_h - 1); + L(kh_comp_loop); + { + compute_ker( + ur_w, 0, 0, last_ic_block_flag, true); + add(aux_reg_filt, shift_filt_kh); + dec(reg_comp_strides); + cmp(reg_comp_strides, 0); + jg(kh_comp_loop, T_NEAR); + } + } + cmp(reg_kh, 0); + jg(kh_loop_label, T_NEAR); + } + L(skip_kh_loop); + if (jcp.signed_input) { + mov(reg_overflow, ptr[param1 + GET_OFF(t_overflow)]); + cmp(reg_overflow, 0); + je(no_t_overflow_label, T_NEAR); + L(t_overflow_label); { + compute_ker(ur_w, 0, 0, last_ic_block_flag, true); + + add(aux_reg_filt, shift_filt_kh); + dec(reg_overflow); + cmp(reg_overflow, 0); + jg(t_overflow_label, T_NEAR); + } + L(no_t_overflow_label); + } +} + +void jit_avx512_core_x8s8s32x_deconv_fwd_kernel::prepare_output(int ur_w) { + for (int ocb = 0; ocb < jcp.nb_oc_blocking; ocb++) { + for (int ur = 0; ur < ur_w; ur++) { + zmm_t zmm = zmm_out(ur, ocb); + vpxord(zmm, zmm, zmm); + } + } + if (jcp.signed_input) { + xor_(reg_scratch, reg_scratch); + Reg8 _t8 = reg_scratch.cvt8(); + mov(_t8, (int8_t)-128); + vpbroadcastb(zmm_shift, _t8); + } +} + +void jit_avx512_core_x8s8s32x_deconv_fwd_kernel::cvt2ps( + data_type_t type_in, zmm_t zmm_in, const Operand &op, bool mask_flag) { + zmm_t zmm = mask_flag ? zmm_in | ktail_mask | T_z : zmm_in; + switch (type_in) { + case data_type::f32: + case data_type::s32: vmovups(zmm, op); break; + case data_type::s8: vpmovsxbd(zmm, op); break; + case data_type::u8: vpmovzxbd(zmm, op); break; + default: assert(!"unsupported data type"); + } + if (type_in != data_type::f32) + vcvtdq2ps(zmm_in, zmm_in); +} + +void jit_avx512_core_x8s8s32x_deconv_fwd_kernel::store_output( + int ur_w, bool last_oc_block) { + mov(reg_bias, ptr[param1 + GET_OFF(bias)]); + mov(reg_ptr_scales, ptr[param1 + GET_OFF(scales)]); + + if (jcp.signed_input) + mov(reg_compensation, ptr[param1 + GET_OFF(compensation)]); + + const auto &p = attr_.post_ops_; + const int sum_idx = p.find(primitive_kind::sum); + const float *p_sum_scale + = (sum_idx != -1) ? 
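// Note: one reading of the kh_comp_loop above (a sketch of the rationale, not
// authoritative): the precomputed compensation assumes every filter row
// contributes its 128 * sum(w_row) term, but with stride_h > 1 some rows fall
// into stride "holes" for a given output row. Running compute_ker in h_padded
// mode feeds those rows the shifted zero (0 + 128), adding back exactly the
// term the compensation expects:
//
//   acc        = sum_{rows hit} (s + 128) * w + sum_{rows in holes} 128 * w
//   acc + comp = sum_{rows hit} s * w          (comp = -128 * sum_all w)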
&p.entry_[sum_idx].sum.scale : nullptr; + if (p_sum_scale && *p_sum_scale != 1.f) + mov(reg_ptr_sum_scale, (size_t)p_sum_scale); + + if (jcp.with_bias && jcp.signed_input && jcp.ver != ver_vnni) { + mov(reg_bias_alpha, float2int(jcp.wei_adj_scale)); + vmovq(xmm_bias_alpha(), reg_bias_alpha); + vbroadcastss(zmm_bias_alpha(), xmm_bias_alpha()); + } + + for (int ocb = 0; ocb < jcp.nb_oc_blocking; ocb++) { + const bool mask_flag = last_oc_block && ocb == jcp.nb_oc_blocking - 1; + int scale_offset + = jcp.is_oc_scale * (sizeof(float) * ocb * jcp.oc_block); + + auto zmm_bias = zmm_tmp; + if (jcp.with_bias) { + int bias_offset = jcp.typesize_bia * ocb * jcp.oc_block; + auto bias_addr = EVEX_compress_addr(reg_bias, bias_offset); + cvt2ps(jcp.bia_dt, zmm_bias, bias_addr, mask_flag); + if (jcp.signed_input && jcp.ver != ver_vnni) + vmulps(zmm_bias, zmm_bias, zmm_bias_alpha()); + } + if (jcp.signed_input) { + int comp_offset = sizeof(int32_t) * ocb * jcp.oc_block; + auto comp_addr = EVEX_compress_addr(reg_compensation, comp_offset); + cvt2ps(data_type::s32, zmm_comp, comp_addr, mask_flag); + } + + for (int ur = 0; ur < ur_w; ur++) { + zmm_t zmm = zmm_out(ur, ocb); + vcvtdq2ps(zmm, zmm); + if (jcp.signed_input) + vaddps(zmm, zmm, zmm_comp); + if (jcp.with_bias) + vaddps(zmm, zmm, zmm_bias); + zmm_t mask_zmm = mask_flag ? zmm | ktail_mask | T_z : zmm; + vmulps(mask_zmm, zmm, + EVEX_compress_addr(reg_ptr_scales, scale_offset)); + } + } + if (maybe_eltwise(0)) + compute_eltwise(ur_w); + if (p_sum_scale) { // post_op: sum + for (int k = 0; k < jcp.nb_oc_blocking; k++) { + const bool mask_flag + = last_oc_block == 1 && k == jcp.nb_oc_blocking - 1; + for (int j = 0; j < ur_w; j++) { + int aux_output_offset + = jcp.typesize_out + * (k * jcp.oc_block + + j * jcp.oc_without_padding * jcp.ngroups); + auto addr = EVEX_compress_addr(reg_dst, aux_output_offset); + Zmm zmm = zmm_out(j, k); + cvt2ps(jcp.dst_dt, zmm_prev_dst, addr, mask_flag); + if (*p_sum_scale == 1.f) + vaddps(zmm, zmm_prev_dst); + else + vfmadd231ps(zmm, zmm_prev_dst, zword_b[reg_ptr_sum_scale]); + } + } + } + if (maybe_eltwise(1)) + compute_eltwise(ur_w); + + for (int ocb = 0; ocb < jcp.nb_oc_blocking; ocb++) { + const bool mask_flag = last_oc_block && ocb == jcp.nb_oc_blocking - 1; + for (int ur = 0; ur < ur_w; ur++) { + zmm_t zmm = zmm_out(ur, ocb); + if (jcp.dst_dt == data_type::u8) { + vpxord(zmm_zero, zmm_zero, zmm_zero); + vmaxps(zmm, zmm_zero, zmm); + } + if (jcp.dst_dt != data_type::f32) { + if (attr_.round_mode_ == round_mode::nearest) + vcvtps2dq(zmm | T_rn_sae, zmm); + else if (attr_.round_mode_ == round_mode::down) + vcvtps2dq(zmm | T_rd_sae, zmm); + else + assert(!"unimplemented"); + } + } + for (int ur = 0; ur < ur_w; ur++) { + int aux_dst_off = jcp.typesize_out + * (ur * jcp.ngroups * jcp.oc_without_padding + + ocb * jcp.oc_block); + auto addr = EVEX_compress_addr(reg_dst, aux_dst_off); + + zmm_t zmm = zmm_out(ur, ocb); + zmm_t r_zmm = mask_flag ? 
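// Note: a scalar model of the store_output pipeline above (a sketch; assumes
// an f32 bias, round_mode::nearest, and a u8 destination -- the vector code
// handles the other dst types analogously):
#include <algorithm>
#include <cmath>
#include <cstdint>

static uint8_t store_one(int32_t acc, float comp, float bias, float scale,
                         bool with_sum, float sum_scale, float prev_dst) {
    float v = float(acc) + comp + bias;       // compensation, then bias
    v *= scale;                               // per-channel output scale
    if (with_sum) v += sum_scale * prev_dst;  // post-op: sum
    v = std::max(v, 0.f);                     // u8 lower clamp (vmaxps)
    long r = std::lround(v);                  // vcvtps2dq, T_rn_sae
    return uint8_t(std::min(255L, r));        // vpmovusdb saturation
}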
zmm | ktail_mask : zmm; + switch (jcp.dst_dt) { + case data_type::f32: + case data_type::s32: vmovups(addr, r_zmm); break; + case data_type::s8: vpmovsdb(addr, r_zmm); break; + case data_type::u8: vpmovusdb(addr, r_zmm); break; + default: assert(!"unknown dst_dt"); + } + } + } +} + +void jit_avx512_core_x8s8s32x_deconv_fwd_kernel::icb_loop( + int ur_w, int l_overflow, int r_overflow, bool is_last_sp_block) { + + int shift_src_icb = jcp.typesize_in * jcp.ic_block; + int shift_filt_icb + = jcp.typesize_in * jcp.kh * jcp.kw * jcp.ic_block * jcp.oc_block; + + prepare_output(ur_w); + + Label skip_icb_loop, icb_loop_label; + + mov(reg_icb, jcp.nb_ic); + L(icb_loop_label); { + + if (jcp.ic_without_padding != jcp.ic) { + Label common_ker, end_ker; + cmp(reg_icb, 1); + jg(common_ker, T_NEAR); + + kh_loop(ur_w, l_overflow, r_overflow, + is_last_sp_block ? last_sp_block : last_ic_block); + jmp(end_ker, T_NEAR); + + L(common_ker); + kh_loop(ur_w, l_overflow, r_overflow, no_last_block); + + L(end_ker); + } else { + kh_loop(ur_w, l_overflow, r_overflow, no_last_block); + } + + add(reg_src, shift_src_icb); + add(reg_filt, shift_filt_icb); + dec(reg_icb); + cmp(reg_icb, 0); + jg(icb_loop_label, T_NEAR); + } + + /* come-back pointers */ + sub(reg_src, jcp.nb_ic * shift_src_icb); + sub(reg_filt, jcp.nb_ic * shift_filt_icb); + L(skip_icb_loop); + + if (jcp.ngroups % jcp.ch_block != 0 || jcp.oc_without_padding != jcp.oc) { + Label common_store, end_store; + mov(reg_oc_blocks, ptr[param1 + GET_OFF(oc_blocks)]); + if (jcp.is_depthwise) + cmp(reg_oc_blocks, jcp.nb_ch - 1); + else + cmp(reg_oc_blocks, jcp.nb_oc - jcp.nb_oc_blocking); + jne(common_store, T_NEAR); + + store_output(ur_w, true); + jmp(end_store, T_NEAR); + + L(common_store); + store_output(ur_w, false); + + L(end_store); + + } else { + store_output(ur_w, false); + } +} + +void jit_avx512_core_x8s8s32x_deconv_fwd_kernel::generate() { + preamble(); + + xor_(reg_scratch, reg_scratch); + Reg16 _t = reg_scratch.cvt16(); + mov(_t, 0x1); + vpbroadcastw(zmm_one, _t); + + if (jcp.ngroups % jcp.ch_block != 0 || jcp.oc_without_padding != jcp.oc) { + int tail_size = jcp.is_depthwise ? 
+            jcp.ngroups % jcp.ch_block :
+            jcp.oc_without_padding % jcp.oc_block;
+        int mask = (1 << tail_size) - 1;
+        Reg32 regw_tmp = reg_nur_w.cvt32();
+        mov(regw_tmp, mask);
+        kmovw(ktail_mask, regw_tmp);
+    }
+
+    mov(reg_src, ptr[param1 + GET_OFF(src)]);
+    mov(reg_filt, ptr[param1 + GET_OFF(filt)]);
+    mov(reg_dst, ptr[param1 + GET_OFF(dst)]);
+
+    int dst_shift = jcp.typesize_out * jcp.ur_w * jcp.ngroups
+            * jcp.oc_without_padding;
+    int src_shift = jcp.typesize_in * (jcp.ur_w / jcp.stride_w) * jcp.ngroups
+            * jcp.ic_without_padding;
+
+    int l_overflow = max(
+            0, ((jcp.kw - 1) * (jcp.dilate_w + 1) - jcp.l_pad) / jcp.stride_w);
+    int r_overflow
+            = max(0, ((jcp.kw - 1) * (jcp.dilate_w + 1) - max(0, jcp.r_pad))
+                            / jcp.stride_w);
+
+    int r_overflow1
+            = nstl::max(0, ((jcp.kw - 1) * (jcp.dilate_w + 1)
+                                   - nstl::max(0, jcp.r_pad) - jcp.ur_w_tail)
+                            / jcp.stride_w);
+    int nur_w = jcp.ow / jcp.ur_w;
+    if (r_overflow1 > 0)
+        nur_w--;
+
+    if (jcp.ur_w == jcp.ow) {
+        icb_loop(jcp.ur_w, l_overflow, r_overflow, true);
+    } else if (nur_w == 0) {
+        icb_loop(jcp.ur_w, l_overflow, r_overflow1, jcp.ur_w_tail == 0);
+        add(reg_src, src_shift);
+        add(reg_dst, dst_shift);
+        if (jcp.ur_w_tail != 0)
+            icb_loop(jcp.ur_w_tail, 0, r_overflow, true);
+    } else {
+        xor_(reg_nur_w, reg_nur_w);
+        if (l_overflow > 0) {
+            icb_loop(jcp.ur_w, l_overflow, 0, false);
+            add(reg_src, src_shift);
+            add(reg_dst, dst_shift);
+            inc(reg_nur_w);
+        }
+        if ((l_overflow <= 0 && nur_w > 0) || (l_overflow > 0 && nur_w > 1)) {
+            Label ow_loop_label;
+            L(ow_loop_label);
+            {
+                icb_loop(jcp.ur_w, 0, 0, false);
+                add(reg_src, src_shift);
+                add(reg_dst, dst_shift);
+                inc(reg_nur_w);
+                cmp(reg_nur_w, nur_w);
+                jl(ow_loop_label, T_NEAR);
+            }
+        }
+        if (r_overflow1 > 0) {
+            icb_loop(jcp.ur_w, 0, r_overflow1, jcp.ur_w_tail == 0);
+            add(reg_src, src_shift);
+            add(reg_dst, dst_shift);
+        }
+        if (jcp.ur_w_tail != 0) {
+            icb_loop(jcp.ur_w_tail, 0, r_overflow, true);
+        }
+    }
+    postamble();
+
+    if (jcp.with_eltwise)
+        eltwise_injector_->prepare_table();
+}
+
+template <data_type_t src_type, data_type_t dst_type>
+void _jit_avx512_core_x8s8s32x_deconvolution_fwd_t<src_type,
+        dst_type>::execute_forward() const {
+    auto src = reinterpret_cast<const src_data_t *>(this->input_memory(0));
+    auto weights = reinterpret_cast<const wei_data_t *>(this->input_memory(1));
+    auto bias = reinterpret_cast<const char *>(this->input_memory(2));
+    auto dst = reinterpret_cast<dst_data_t *>(this->memory());
+
+    const memory_desc_wrapper src_d(pd()->src_pd());
+    const memory_desc_wrapper dst_d(pd()->dst_pd());
+    const memory_desc_wrapper weights_d(pd()->weights_pd(0));
+    const memory_desc_wrapper bias_d(pd()->weights_pd(1));
+
+    auto &jcp = kernel_->jcp;
+
+    int oc_chunks = jcp.nb_oc / jcp.nb_oc_blocking;
+    int nb_groups = jcp.nb_ch;
+
+    size_t src_h_stride = src_d.blk_off(0, 0, 1);
+    size_t dst_h_stride = dst_d.blk_off(0, 0, 1);
+    size_t wht_kh_stride = wht_blk_off(weights_d, 0, 0, 0, 1);
+
+    const float *oscales = pd()->attr()->output_scales_.scales_;
+    if (jcp.signed_input && jcp.ver != ver_vnni) {
+        auto local_scales
+                = scratchpad().template get<float>(key_conv_adjusted_scales);
+        size_t count = pd()->attr()->output_scales_.count_;
+        float factor = 1.f / pd()->jcp_.wei_adj_scale;
+        if (count == 1) {
+            utils::array_set(local_scales, oscales[0] * factor, 16);
+        } else {
+            for (size_t c = 0; c < count; c++)
+                local_scales[c] = oscales[c] * factor;
+        }
+        oscales = local_scales;
+    }
+    size_t offset = (size_t)jcp.ngroups * jcp.oc * jcp.ic * jcp.kh * jcp.kw;
+    auto w = const_cast<wei_data_t *>(weights);
+    int32_t *compensation = (jcp.signed_input) ?
+            reinterpret_cast<int32_t *>(&w[offset]) : 0;
+
+    parallel(0, [&](const int ithr, const int nthr) {
+        int start{ 0 }, end{ 0 };
+        int work_amount = jcp.mb * nb_groups * oc_chunks * jcp.oh;
+        balance211(work_amount, nthr, ithr, start, end);
+
+        auto p = jit_deconv_call_s();
+
+        /*loop order = cgn*/
+        int n{ 0 }, g{ 0 }, occ{ 0 }, oh_s{ 0 };
+        if (jcp.loop_order == loop_ngc)
+            nd_iterator_init(start, n, jcp.mb, g, nb_groups, occ, oc_chunks,
+                    oh_s, jcp.oh);
+        else if (jcp.loop_order == loop_cgn)
+            nd_iterator_init(start, occ, oc_chunks, g, nb_groups, n, jcp.mb,
+                    oh_s, jcp.oh);
+        else
+            assert(!"unsupported loop order");
+        while (start < end) {
+
+            int ocb = occ * jcp.nb_oc_blocking;
+            int g_oc = (g * jcp.ch_block * jcp.nb_oc + ocb) * jcp.oc_block;
+            int g_ic = g * jcp.ch_block * jcp.ic;
+            int work_rem = end - start;
+            int oh_e = oh_s + work_rem > jcp.oh ? jcp.oh : oh_s + work_rem;
+
+            auto dst_w = dst + dst_d.blk_off(n, g_oc);
+            auto src_w = src + src_d.blk_off(n, g_ic);
+            auto wht_w = weights + wht_blk_off(weights_d, g, ocb, 0);
+            auto bias_w = jcp.with_bias ?
+                    bias + (bias_d.blk_off(g_oc) * jcp.typesize_bia) :
+                    0;
+            int32_t *compensation_w
+                    = (jcp.signed_input) ? compensation + g_oc : 0;
+
+            auto scales = &oscales[jcp.is_oc_scale * g_oc];
+            for (int oj = oh_s; oj < oh_e; oj++) {
+                int ih_max = 0, kh_lo = 0, kh_len = 0;
+                if (jcp.dilate_h != 0 && jcp.stride_h == 1) {
+                    /* dilation */
+                    int dilate_h = jcp.dilate_h + 1;
+                    // Note: use div_up to account for "holes" in filter
+                    int o_t_overflow = div_up(
+                            max(0, (jcp.kh - 1) * dilate_h - oj - jcp.t_pad),
+                            dilate_h);
+                    int o_b_overflow
+                            = div_up(max(0, (jcp.kh - 1) * dilate_h + 1
+                                            - jcp.oh + oj - jcp.b_pad),
+                                    dilate_h);
+                    kh_len = jcp.kh - o_t_overflow - o_b_overflow;
+                    kh_lo = o_b_overflow;
+                    ih_max = oj + jcp.t_pad - o_b_overflow * dilate_h;
+                } else {
+                    int o_t_overflow = max(
+                            0, (jcp.kh - (oj + 1 + jcp.t_pad)) / jcp.stride_h);
+                    int o_b_overflow
+                            = max(0, ((oj + jcp.kh) - (jcp.oh + jcp.b_pad))
+                                            / jcp.stride_h);
+                    int overflow_kh_hi = jcp.kh - 1
+                            - abs(jcp.oh + jcp.b_pad - (oj + 1)) % jcp.stride_h;
+                    int overflow_kh_lo = (oj + jcp.t_pad) % jcp.stride_h;
+
+                    kh_len = (overflow_kh_hi - overflow_kh_lo) / jcp.stride_h
+                            + 1 - o_t_overflow - o_b_overflow;
+                    kh_lo = overflow_kh_lo + o_b_overflow * jcp.stride_h;
+                    ih_max = (oj + jcp.t_pad - kh_lo) / jcp.stride_h;
+                }
+
+                int wei_stride
+                        = (!jcp.signed_input) ? kh_lo * wht_kh_stride : 0;
+                p.src = src_w + ih_max * src_h_stride;
+                p.dst = dst_w + oj * dst_h_stride;
+                p.filt = wht_w + wei_stride;
+                p.bias = bias_w;
+                p.compensation = compensation_w;
+                p.t_overflow = max(
+                        0, jcp.kh - (kh_lo + max(0, kh_len - 1) * jcp.stride_h
+                                            + 1));
+                p.b_overflow = kh_lo;
+                p.kh_padding = kh_len;
+                p.scales = scales;
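The per-row filter clipping just above is the trickiest arithmetic in execute_forward(). The same formulas as a standalone scalar helper, a sketch with the jcp fields passed explicitly and names kept from the source:

    #include <algorithm>
    #include <cstdlib>

    // For output row oj of a strided deconvolution: how many filter rows
    // contribute (kh_len), the first contributing filter row (kh_lo), and
    // the topmost source row read (ih_max). Mirrors the stride_h branch.
    static void clip_filter_rows(int oj, int kh, int oh, int stride_h,
            int t_pad, int b_pad, int &kh_len, int &kh_lo, int &ih_max) {
        int o_t_overflow = std::max(0, (kh - (oj + 1 + t_pad)) / stride_h);
        int o_b_overflow = std::max(0, ((oj + kh) - (oh + b_pad)) / stride_h);
        int hi = kh - 1 - std::abs(oh + b_pad - (oj + 1)) % stride_h;
        int lo = (oj + t_pad) % stride_h;
        kh_len = (hi - lo) / stride_h + 1 - o_t_overflow - o_b_overflow;
        kh_lo = lo + o_b_overflow * stride_h;
        ih_max = (oj + t_pad - kh_lo) / stride_h;
    }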
+                p.oc_blocks = jcp.is_depthwise ? g : ocb;
+                kernel_->jit_ker(&p);
+            }
+            if (jcp.loop_order == loop_ngc)
+                nd_iterator_jump(start, end, n, jcp.mb, g, nb_groups, occ,
+                        oc_chunks, oh_s, jcp.oh);
+            else if (jcp.loop_order == loop_cgn)
+                nd_iterator_jump(start, end, occ, oc_chunks, g, nb_groups, n,
+                        jcp.mb, oh_s, jcp.oh);
+            else
+                assert(!"unsupported loop order");
+        }
+    });
+}
+
+template struct _jit_avx512_core_x8s8s32x_deconvolution_fwd_t<data_type::u8, data_type::u8>;
+template struct _jit_avx512_core_x8s8s32x_deconvolution_fwd_t<data_type::u8, data_type::s8>;
+template struct _jit_avx512_core_x8s8s32x_deconvolution_fwd_t<data_type::u8, data_type::f32>;
+template struct _jit_avx512_core_x8s8s32x_deconvolution_fwd_t<data_type::u8, data_type::s32>;
+template struct _jit_avx512_core_x8s8s32x_deconvolution_fwd_t<data_type::s8, data_type::u8>;
+template struct _jit_avx512_core_x8s8s32x_deconvolution_fwd_t<data_type::s8, data_type::s8>;
+template struct _jit_avx512_core_x8s8s32x_deconvolution_fwd_t<data_type::s8, data_type::f32>;
+template struct _jit_avx512_core_x8s8s32x_deconvolution_fwd_t<data_type::s8, data_type::s32>;
+}
+}
+}
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_deconvolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_deconvolution.hpp
similarity index 63%
rename from inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_deconvolution.hpp
rename to inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_deconvolution.hpp
index 17f3a52..8053db8 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_deconvolution.hpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_deconvolution.hpp
@@ -29,6 +29,7 @@
 #include "cpu_deconvolution_pd.hpp"
 #include "jit_generator.hpp"
 #include "jit_primitive_conf.hpp"
+#include "jit_uni_eltwise.hpp"
 
 namespace mkldnn {
 namespace impl {
@@ -38,18 +39,28 @@
 typedef enum {
     no_last_block = 0x1U,
     last_ic_block = 0x2U,
     last_sp_block = 0x4U,
-    last_ic
 } ker_block_t;
 
-struct jit_avx512_core_u8s8s32x_deconv_fwd_kernel : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8s8s32x_deconv_fwd_ker_t);
+struct jit_avx512_core_x8s8s32x_deconv_fwd_kernel : public jit_generator {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_x8s8s32x_deconv_fwd_ker_t);
 
-    jit_avx512_core_u8s8s32x_deconv_fwd_kernel(jit_conv_conf_t ajcp,
-            const primitive_attr_t &attr) : jcp(ajcp), attr_(attr) {
+    jit_avx512_core_x8s8s32x_deconv_fwd_kernel(
+            jit_conv_conf_t ajcp, const primitive_attr_t &attr)
+        : jcp(ajcp), attr_(attr), eltwise_injector_(nullptr) {
+        if (jcp.with_eltwise)
+            eltwise_injector_ = new jit_uni_eltwise_injector_f32<avx512_common>(
+                    this, jcp.eltwise);
         generate();
         jit_ker = (void (*)(jit_deconv_call_s *))getCode();
     }
+
+    ~jit_avx512_core_x8s8s32x_deconv_fwd_kernel() {
+        delete eltwise_injector_;
+    }
+
+    static bool post_ops_ok(jit_conv_conf_t &jcp,
+            const primitive_attr_t &attr);
+
     static status_t init_conf(jit_conv_conf_t &jcp,
             const deconvolution_desc_t &cd,
             cpu_memory_t::pd_t &src_pd,
@@ -59,10 +70,14 @@ struct jit_avx512_core_u8s8s32x_deconv_fwd_kernel : public jit_generator {
             cpu_memory_t::pd_t &bias_pd,
             const primitive_attr_t &attr);
 
+    static void init_scratchpad(memory_tracking::registrar_t &scratchpad,
+            const jit_conv_conf_t &jcp, const primitive_attr_t &attr);
+
     jit_conv_conf_t jcp;
     const primitive_attr_t &attr_;
     void (*jit_ker)(jit_deconv_call_s *);
 
 private:
+    jit_uni_eltwise_injector_f32<avx512_common> *eltwise_injector_;
     using reg64_t = const Xbyak::Reg64;
     using zmm_t = const Xbyak::Zmm;
     using xmm_t = const Xbyak::Xmm;
@@ -78,17 +93,29 @@ private:
     reg64_t reg_ptr_scales = rax;
     reg64_t reg_oc_blocks = rsi;
 
-    reg64_t reg_scratch = r14;
     reg64_t aux_reg_src = r11;
     reg64_t aux_reg_filt = r12;
 
-    reg64_t reg_kj = rax;
+
+    reg64_t reg_compensation = r14;
+    reg64_t reg_scratch = r14;
+    reg64_t reg_ptr_sum_scale = r11;
+    reg64_t reg_bias_alpha = abi_not_param1;
+    reg64_t reg_overflow = rax;
+    reg64_t reg_comp_strides = reg_overflow;
 
     Xbyak::Opmask ktail_mask = Xbyak::Opmask(2);
-    zmm_t zmm_tmp = zmm_t(29);
-    zmm_t zmm_one = zmm_t(30);
+    zmm_t zmm_tmp = zmm_t(28);
+    zmm_t zmm_one = zmm_t(29);
+    /* used during write-out section of store_output */
     zmm_t zmm_zero = zmm_t(31);
     zmm_t zmm_wei = zmm_t(31);
+    /* signed input */
+    zmm_t zmm_shift = zmm_t(30);
+    zmm_t zmm_comp = zmm_t(30);
+    zmm_t zmm_bias = zmm_t(31);
+    zmm_t zmm_prev_dst = zmm_t(31);
+
     zmm_t zmm_out(int i_ur, int i_oc) {
         int idx = i_ur * jcp.nb_oc_blocking + i_oc;
         assert(idx < 31);
@@ -99,6 +126,12 @@ private:
         assert(idx < 31);
         return zmm_t(idx);
     }
+    zmm_t zmm_bias_alpha() {
+        return zmm_t(jcp.nb_oc_blocking * jcp.ur_w);
+    }
+    xmm_t xmm_bias_alpha() {
+        return xmm_t(jcp.nb_oc_blocking * jcp.ur_w);
+    }
 
     int get_ow_start(int ki, int l_overflow) {
         int res = (jcp.ow - 1 + jcp.r_pad) % jcp.stride_w
@@ -111,25 +144,28 @@ private:
     int get_ow_end(int ur_w, int ki, int r_overflow) {
         if (utils::one_of(ur_w, jcp.ow, jcp.ur_w_tail))
-            ur_w += nstl::min(0, jcp.r_pad);
+            ur_w += nstl::min(0, jcp.r_pad); // remove negative padding
         int res = (ur_w - 1 + jcp.l_pad) % jcp.stride_w
                 + r_overflow * jcp.stride_w - ki * (jcp.dilate_w + 1);
         while (res < 0)
            res += jcp.stride_w;
         return ur_w - res;
     }
-
+    bool maybe_eltwise(int position);
+    void compute_eltwise(int ur_w);
     void prepare_output(int ur_w);
     void store_output(int ur_w, bool last_oc_block);
-    void compute_ker(int ur_w, int pad_l, int pad_r, ker_block_t last_ker_block);
-    void compute_loop(int ur_w, int pad_l, int pad_r, bool last_block);
+    void compute_ker(int ur_w, int l_overflow, int r_overflow,
+            ker_block_t last_ic_block_flag, bool h_padded = false);
+    void kh_loop(int ur_w, int pad_l, int pad_r, ker_block_t last_ker_block);
+    void icb_loop(int ur_w, int pad_l, int pad_r, bool last_block);
     void generate();
     void cvt2ps(data_type_t type_in, zmm_t zmm_in, const Xbyak::Operand &op,
            bool mask_flag);
 };
 
-template <impl::data_type_t dst_type>
-struct _jit_avx512_core_u8s8s32x_deconvolution_fwd_t : public cpu_primitive_t {
+template <impl::data_type_t src_type, impl::data_type_t dst_type>
+struct _jit_avx512_core_x8s8s32x_deconvolution_fwd_t : public cpu_primitive_t {
     struct pd_t : public cpu_deconvolution_fwd_pd_t {
         pd_t(engine_t *engine,
                 const deconvolution_desc_t *adesc,
@@ -138,7 +174,7 @@ struct _jit_avx512_core_u8s8s32x_deconvolution_fwd_t : public cpu_primitive_t {
            : cpu_deconvolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) {}
 
         DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("jit_deconvolution:",
                                     avx512_core, ""),
-                _jit_avx512_core_u8s8s32x_deconvolution_fwd_t);
+                _jit_avx512_core_x8s8s32x_deconvolution_fwd_t);
 
         virtual status_t init() override {
             assert(this->engine()->kind() == engine_kind::cpu);
@@ -147,6 +183,7 @@ struct _jit_avx512_core_u8s8s32x_deconvolution_fwd_t : public cpu_primitive_t {
                 && utils::one_of(this->desc()->prop_kind,
                            prop_kind::forward_training,
                            prop_kind::forward_inference)
                 && this->desc()->alg_kind & alg_kind::deconvolution_direct
+                && this->desc()->src_desc.data_type == src_type
                 && this->desc()->dst_desc.data_type == dst_type
                 && IMPLICATION(this->with_bias(), utils::one_of(
                            this->desc()->bias_desc.data_type, data_type::f32,
                            data_type::s32, data_type::s8, data_type::u8))
                 && this->desc()->accum_data_type == data_type::s32;
             if (!ok) return status::unimplemented;
 
-            /*TODO: support signed input and postops */
-            return
jit_avx512_core_u8s8s32x_deconv_fwd_kernel::init_conf( + status_t status = jit_avx512_core_x8s8s32x_deconv_fwd_kernel::init_conf( jcp_, *this->desc(), this->src_pd_, this->weights_pd_, this->dst_pd_, this->with_bias(), this->bias_pd_, *this->attr()); + + if (status != status::success) return status; + + auto scratchpad = scratchpad_registry().registrar(); + jit_avx512_core_x8s8s32x_deconv_fwd_kernel::init_scratchpad(scratchpad, + jcp_, *this->attr()); + + return status::success; } jit_conv_conf_t jcp_; }; - _jit_avx512_core_u8s8s32x_deconvolution_fwd_t(const pd_t *pd, + _jit_avx512_core_x8s8s32x_deconvolution_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) { - kernel_ = new jit_avx512_core_u8s8s32x_deconv_fwd_kernel(conf_.jcp_, - *conf_.attr()); + : cpu_primitive_t(apd, inputs, outputs) { + kernel_ = new jit_avx512_core_x8s8s32x_deconv_fwd_kernel(pd()->jcp_, + *pd()->attr()); } - ~_jit_avx512_core_u8s8s32x_deconvolution_fwd_t() { + ~_jit_avx512_core_x8s8s32x_deconvolution_fwd_t() { delete kernel_; } - typedef typename prec_traits::type src_data_t; + typedef typename prec_traits::type src_data_t; typedef typename prec_traits::type wei_data_t; typedef typename prec_traits::type dst_data_t; - virtual void execute(event_t *e) + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - pd_t conf_; - jit_avx512_core_u8s8s32x_deconv_fwd_kernel *kernel_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + jit_avx512_core_x8s8s32x_deconv_fwd_kernel *kernel_; }; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_generator.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_generator.hpp index b72ed2d..b247724 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_generator.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_generator.hpp @@ -102,6 +102,8 @@ static const Xbyak::Reg64 abi_param1(Xbyak::Operand::RDI), abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX), abi_param4(Xbyak::Operand::RCX), + abi_param5(Xbyak::Operand::R8), + abi_param6(Xbyak::Operand::R9), abi_not_param1(Xbyak::Operand::RCX); #endif #endif @@ -110,7 +112,7 @@ inline unsigned int get_cache_size(int level, bool per_core = true){ unsigned int l = level - 1; // Currently, if XByak is not able to fetch the cache topology // we default to 32KB of L1, 512KB of L2 and 1MB of L3 per core. - if (cpu.data_cache_levels == 0){ + if (cpu.getDataCacheLevels() == 0){ const int L1_cache_per_core = 32000; const int L2_cache_per_core = 512000; const int L3_cache_per_core = 1024000; @@ -122,31 +124,15 @@ inline unsigned int get_cache_size(int level, bool per_core = true){ default: return 0; } } - if (l < cpu.data_cache_levels) { - return cpu.data_cache_size[l] - / (per_core ? cpu.cores_sharing_data_cache[l] : 1); + if (l < cpu.getDataCacheLevels()) { + return cpu.getDataCacheSize(l) + / (per_core ? 
cpu.getCoresSharingDataCache(l) : 1); } else return 0; } } -// TODO (Roma): move all_same to a more appropriate location - -template -struct all_same : std::false_type {}; - -template -struct all_same : all_same { }; - -template -struct all_same : std::true_type {}; - -struct jit_code_injection { - const Xbyak::uint8* code; - size_t size; -}; - class jit_generator : public Xbyak::CodeGenerator { private: @@ -174,6 +160,8 @@ public: _cmp_neq_uq = 4u, _cmp_nlt_us = 5u, _cmp_nle_us = 6u, + + _op_floor = 1u, }; Xbyak::Reg64 param1 = abi_param1; @@ -302,7 +290,7 @@ public: // Disallow char-based labels completely void L(const char *label) = delete; - void L(const Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); } + void L(Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); } void uni_vpxor(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op) { @@ -322,6 +310,32 @@ public: vpxord(x1, x2, op); } + void uni_vmovss(const Xbyak::Address& addr, const Xbyak::Xmm &x) { + movss(addr, x); + } + void uni_vmovss(const Xbyak::Address& addr, const Xbyak::Ymm &x) { + vmovss(addr, x); + } + void uni_vmovss(const Xbyak::Xmm &x, const Xbyak::Address& addr) { + movss(x, addr); + } + void uni_vmovss(const Xbyak::Ymm &x, const Xbyak::Address& addr) { + vmovss(x, addr); + } + + void uni_vmovsd(const Xbyak::Address& addr, const Xbyak::Xmm &x) { + movsd(addr, x); + } + void uni_vmovsd(const Xbyak::Address& addr, const Xbyak::Ymm &x) { + vmovsd(addr, x); + } + void uni_vmovsd(const Xbyak::Xmm &x, const Xbyak::Address& addr) { + movsd(x, addr); + } + void uni_vmovsd(const Xbyak::Ymm &x, const Xbyak::Address& addr) { + vmovsd(x, addr); + } + void uni_vmovdqu(const Xbyak::Address &addr, const Xbyak::Xmm &x) { movdqu(addr, x); } @@ -393,6 +407,29 @@ public: } } + void uni_vrcpss(const Xbyak::Xmm &x, const Xbyak::Operand &op) { + rcpss(x, op); + } + void uni_vrcpss(const Xbyak::Ymm &x1, const Xbyak::Xmm &x2) { + Xbyak::Xmm x1_(x1.getIdx()); + Xbyak::Xmm x2_(x2.getIdx()); + vrcpss(x1_, x1_, x2_); + } + void uni_vrcpss(const Xbyak::Ymm &x, const Xbyak::Address &op) { + Xbyak::Xmm x_(x.getIdx()); + vrcpss(x_, x_, op); + } + + void uni_vrcpps(const Xbyak::Xmm &x, const Xbyak::Operand &op) { + rcpps(x, op); + } + void uni_vrcpps(const Xbyak::Ymm &x, const Xbyak::Operand &op) { + vrcpps(x, op); + } + void uni_vrcpps(const Xbyak::Zmm &x, const Xbyak::Operand &op) { + vrcp14ps(x, op); + } + void uni_vdivps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, const Xbyak::Operand &op2 = Xbyak::Operand()) { assert(x.getIdx() == op1.getIdx()); @@ -519,24 +556,30 @@ public: vpaddd(x1, x2, op); } - void uni_vandps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2 = Xbyak::Operand()) { - assert(x.getIdx() == op1.getIdx()); - andps(x, op2); + void uni_vandps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, + const Xbyak::Operand &op = Xbyak::Operand()) { + assert(x1.getIdx() == x2.getIdx()); + andps(x1, op); } - void uni_vandps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2 = Xbyak::Operand()) { - vandps(x, op1, op2); + void uni_vandps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, + const Xbyak::Operand &op = Xbyak::Operand()) { + if (!mayiuse(avx512_common) || x1.getBit() < 512) + vandps(x1, x2, op); + else + vpandd(x1, x2, op); } - void uni_vorps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2 = Xbyak::Operand()) { - assert(x.getIdx() == op1.getIdx()); - orps(x, op2); + void uni_vorps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, + const 
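All the uni_v* helpers in these hunks follow one convention: the Xmm overload emits the destructive legacy-SSE instruction (so it requires dst == src1 and asserts it), while the Ymm overload emits the non-destructive three-operand VEX form. A minimal sketch of the pattern on a toy generator; the include path and the helper name are illustrative, not part of this patch:

    #include <cassert>
    #include "xbyak/xbyak.h"

    struct toy_generator : Xbyak::CodeGenerator {
        void uni_vaddps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2,
                const Xbyak::Operand &op) {
            assert(x1.getIdx() == x2.getIdx()); // SSE form overwrites src1
            addps(x1, op);                      // x1 = x1 + op
        }
        void uni_vaddps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2,
                const Xbyak::Operand &op) {
            vaddps(x1, x2, op);                 // x1 = x2 + op
        }
    };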
Xbyak::Operand &op = Xbyak::Operand()) {
+        assert(x1.getIdx() == x2.getIdx());
+        orps(x1, op);
     }
-    void uni_vorps(const Xbyak::Ymm &x, const Xbyak::Operand &op1,
-            const Xbyak::Operand &op2 = Xbyak::Operand()) {
-        vorps(x, op1, op2);
+    void uni_vorps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2,
+            const Xbyak::Operand &op = Xbyak::Operand()) {
+        if (!mayiuse(avx512_common) || x1.getBit() < 512)
+            vorps(x1, x2, op);
+        else
+            vpord(x1, x2, op);
     }
 
     void uni_vpslld(const Xbyak::Xmm &x, const Xbyak::Operand &op,
@@ -582,16 +625,38 @@ public:
     void uni_vcmpgtps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2,
             const Xbyak::Operand &op) {
         assert(x1.getIdx() == x2.getIdx());
-        cmpps(x1, op, 0x6);
+        cmpps(x1, op, _cmp_nle_us);
     }
+
     void uni_vcmpgtps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2,
             const Xbyak::Operand &op) {
         vcmpgtps(x1, x2, op);
     }
 
+    void uni_vcmpgeps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2,
+            const Xbyak::Operand &op) {
+        assert(x1.getIdx() == x2.getIdx());
+        cmpps(x1, op, _cmp_nlt_us);
+    }
+
+    void uni_vcmpgeps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2,
+            const Xbyak::Operand &op) {
+        vcmpps(x1, x2, op, _cmp_nlt_us);
+    }
+
+    void uni_vtestps(const Xbyak::Xmm &x1, const Xbyak::Operand &op) {
+        ptest(x1, op);
+    }
+
+    void uni_vtestps(const Xbyak::Ymm &x1, const Xbyak::Operand &op) {
+        assert(!(x1.isZMM() || op.isZMM()));
+        vtestps(x1, op);
+    }
+
     void uni_vblendvps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2,
             const Xbyak::Operand &op, const Xbyak::Xmm &msk) {
         assert(x1.getIdx() == x2.getIdx());
+        assert(msk.getIdx() == 0);
         blendvps(x1, op);
     }
     void uni_vblendvps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2,
@@ -629,6 +694,22 @@ public:
         vmovmskps(x1, x2);
     }
 
+    void uni_vpackssdw(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op){
+        assert(x1.getIdx() == x2.getIdx());
+        packssdw(x1, op);
+    }
+    void uni_vpackssdw(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op){
+        vpackssdw(x1, x2, op);
+    }
+
+    void uni_vpackuswb(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op){
+        assert(x1.getIdx() == x2.getIdx());
+        packuswb(x1, op);
+    }
+    void uni_vpackuswb(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op){
+        vpackuswb(x1, x2, op);
+    }
+
     void uni_vpmovsxbd(const Xbyak::Xmm &x, const Xbyak::Operand &op) {
         pmovsxbd(x, op);
     }
@@ -643,14 +724,6 @@ public:
         vpmovzxbd(x, op);
     }
 
-    void uni_vpackssdw(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op) {
-        assert(x1.getIdx() == x2.getIdx());
-        packssdw(x1, op);
-    }
-    void uni_vpackssdw(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op) {
-        vpackssdw(x1, x2, op);
-    }
-
     void uni_vpackusdw(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op) {
         assert(x1.getIdx() == x2.getIdx());
         packusdw(x1, op);
@@ -667,14 +740,6 @@ public:
         vpacksswb(x1, x2, op);
     }
 
-    void uni_vpackuswb(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op) {
-        assert(x1.getIdx() == x2.getIdx());
-        packuswb(x1, op);
-    }
-    void uni_vpackuswb(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op) {
-        vpackuswb(x1, x2, op);
-    }
-
     void uni_vpmaxsd(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op) {
         assert(x1.getIdx() == x2.getIdx());
         pmaxsd(x1, op);
@@ -731,6 +796,45 @@ public:
         vpsubb(x1, x2, op);
     }
 
+    void uni_vpslldq(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::uint8 &op) {
+        assert(x1.getIdx() == x2.getIdx());
+        pslldq(x1, op);
+    }
+    void uni_vpslldq(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::uint8 &op) {
+        vpslldq(x1, x2, op);
+    }
+
+    void uni_vpand(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2,
+            const Xbyak::Operand &op = Xbyak::Operand()) {
+        assert(x1.getIdx() == x2.getIdx());
+        pand(x1, op);
+    }
+    void uni_vpand(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2,
+            const Xbyak::Operand &op = Xbyak::Operand()) {
+        vpand(x1, x2, op);
+    }
+
+    void uni_vpaddb(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2,
+            const Xbyak::Operand &op) {
+        assert(x1.getIdx() == x2.getIdx());
+        paddb(x1, op);
+    }
+    void uni_vpaddb(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2,
+            const Xbyak::Operand &op) {
+        vpaddb(x1, x2, op);
+    }
+
+    void uni_vpshufb(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2,
+            const Xbyak::Operand &op) {
+        assert(x1.getIdx() == x2.getIdx());
+        pshufb(x1, op);
+    }
+
+    void uni_vpshufb(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2,
+            const Xbyak::Operand &op) {
+        vpshufb(x1, x2, op);
+    }
+
     void mul_by_const(const Xbyak::Reg &out, const Xbyak::Reg64 &tmp,
             int value) {
         // Generates a shift + add sequence for multiplying contents of the
@@ -764,10 +868,6 @@ public:
         mov(out, tmp);
     }
 
-    void inject(jit_code_injection&& in) {
-        db(in.code, in.size);
-    }
-
     void dump_code(const Xbyak::uint8 *code) const {
         if (code) {
             static int counter = 0;
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_primitive_conf.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_primitive_conf.hpp
index 47c9799..9de97fe 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_primitive_conf.hpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_primitive_conf.hpp
@@ -19,6 +19,8 @@
 #include <stdint.h>
 
+#include "common/primitive_attr.hpp"
+
 namespace mkldnn {
 namespace impl {
 namespace cpu {
@@ -27,7 +29,7 @@ namespace cpu {
 enum conv_version_t {ver_unused, ver_fma, ver_avx512_core, ver_4fma,
        ver_4vnni, ver_vnni};
 enum conv_loop_order_t {loop_cgn, loop_gnc, loop_ngc, loop_gncw, loop_cwgn,
-    loop_ngcw};
+    loop_ngcw, loop_nhwcg};
 enum conv_1x1_loop_order_t {loop_rbl, loop_rlb, loop_lbr, loop_lrb, loop_blr,
     loop_brl};
 enum conv_kernel_kind_t {embd_bcast, expl_bcast};
@@ -53,6 +55,7 @@ struct jit_conv_conf_t {
     conv_version_t ver;
     conv_loop_order_t loop_order;
 
+    int simd_w;
     int ndims;
     int mb;
     int ngroups, ic, oc, oc_without_padding, ic_without_padding;
@@ -64,32 +67,22 @@
     int stride_d, stride_h, stride_w;
     int dilate_d, dilate_h, dilate_w;
     memory_format_t src_fmt;
+    memory_format_t dst_fmt;
     bool with_bias;
     bool with_sum;
     bool with_eltwise;
     bool with_dw_conv;
+    bool with_binarization;
+
+    post_ops_t::entry_t::eltwise_t eltwise;
 
-    alg_kind_t eltwise_alg;
-    float eltwise_alpha;
-    float eltwise_beta;
-    float eltwise_scale;
+    int nthr, nthr_mb, nthr_g, nthr_oc_b, nthr_ic_b;
 
     int idp, ihp, iwp, ohp, owp;
 
-    int dw_conv_in_h;
-    int dw_conv_in_w;
-    int dw_conv_ker_h;
-    int dw_conv_ker_w;
-    int dw_conv_str_h;
-    int dw_conv_str_w;
-    const float* dw_conv_weights;
-    const float* dw_conv_biases;
-
-    bool dw_conv_with_sum;
-    bool dw_conv_with_eltwise;
-    alg_kind_t dw_conv_eltwise_alg;
-    float dw_conv_eltwise_alpha;
-    float dw_conv_eltwise_beta;
+    const float* conv_weights;
+    const float* conv_biases;
+    int dw_conv_oh, dw_conv_ow;
 
     int nb_ic, ic_block;
     int nb_oc, oc_block;
@@ -102,6 +95,7 @@
     int ur_h, ur_w;
     int ur_w_tail;
     bool is_1stconv;
+    int nonblk_group_off;
     /* fma avx512_core */
     conv_kernel_kind_t kernel_kind;
     /* 4fma */
@@ -121,6 +115,7 @@
     int oc_nb1;
     int ur_ow_max, ur_ow, ur_ow_tail;
     int ur_ow_nsteps;
+    data_type_t src_dt;
     data_type_t bia_dt;
     data_type_t dst_dt;
     /* avx512: max possible value is nregs(32) - aux_regs(4) */
@@ -129,16 +124,22 @@
     bool expl_bcast;
     bool large_spatial;
     int is_oc_scale;
+    int max_regs_ur; // maximum accumulation registers
     // dw conv
     int nb_ch, ch_block, nb_ch_blocking;
-    bool is_depthwise;
+    bool is_depthwise, is_fast_depthwise;
     int aligned_threads;
     // large spatial
     int oh_blk_size;
-    int ow_blk_size;
     // s8s8 convolution
     bool signed_input;
     float wei_adj_scale;
+    // planar conv
+    int nb_ow_blocking;
+
+    int oh_block;
+    int nb_oh_blocking;
+    int oh_block_step;
 };
 
 struct jit_conv_conf_2x3_wino_t {
@@ -173,9 +174,7 @@
     int typesize_acc;
 
     memory_format_t src_fmt;
-    bool with_bias, with_relu;
-    float relu_negative_slope;
-    bool with_sum;
+    bool with_bias;
     bool small_mb;
 
     int xb, yb;
@@ -188,6 +187,12 @@
     int m_block, n_block, k_block;
     int n2_block, n_chunks;
     int k2_block, k_chunks;
+
+    int mb_block, nb_mb;
+
+    size_t size_wino_src, size_wino_wei, size_wino_dst;
+
+    int nthr;
 };
 
 /*
@@ -267,6 +272,47 @@ struct jit_conv_winograd_conf_t : public jit_conv_conf_t {
     winograd_sched_t sched_policy;
 };
 
+struct jit_bin_conv_conf_t {
+    prop_kind_t prop_kind;
+    conv_version_t ver;
+    conv_loop_order_t loop_order;
+
+    int ndims;
+    int mb;
+    int ngroups, ic, oc, oc_padded, ic_padded;
+    int id, ih, iw, od, oh, ow;
+    int f_pad, l_pad, t_pad;
+    int back_pad, r_pad, b_pad;
+    int kd, kh, kw;
+    int stride_d, stride_h, stride_w;
+    int dilate_d, dilate_h, dilate_w;
+    memory_format_t src_fmt;
+    bool with_bias;
+    bool with_sum;
+    bool with_eltwise;
+    bool with_dw_conv;
+    bool with_binarization;
+
+    float pad_value;
+    bool exclude_pad;
+
+    int dw_conv_oh;
+    int dw_conv_ow;
+
+    int nb_ic, ic_block;
+    int nb_oc, oc_block;
+    int nb_ic_blocking, nb_oc_blocking; // blocking of nb_ic and nb_oc
+    int ur_h, ur_w;
+    int ur_w_tail;
+    int typesize_in;
+    int typesize_out;
+    int typesize_bia;
+    int typesize_acc;
+    data_type_t src_dt;
+    data_type_t bia_dt;
+    data_type_t dst_dt;
+};
+
 struct jit_conv_call_s {
     const void *src; /* hack, non-const for backward_data */
     const void *dst; /* hack, non-const for forward */
@@ -302,6 +348,7 @@
     size_t ch_work;
     size_t t_overflow;
     size_t b_overflow;
+    size_t oh_blocks;
     int flags;
 
     const void *src_row0; /* hack, non-const for backward_data */
@@ -318,6 +365,9 @@ struct jit_deconv_call_s {
     const void *filt; /* hack, non-const for backward_weights */
     const void *bias; /* hack, non-const for backward_bias */
     const void *scales;
+    const void *compensation;
+    size_t t_overflow;
+    size_t b_overflow;
     size_t kh_padding;
     size_t oc_blocks;
 };
@@ -327,19 +377,12 @@ struct jit_dw_conv_call_s {
     const void *input;
     const void *output;
     const void *filter;
     const void *bias;
-    union {
-        size_t table_flags; /* This allows both bytes to be read simultaneously
-                               */
-        struct {
-            unsigned char
-                    table_idx; /* Indicates the table entry for the
-                                  JIT-generated values that control the
-                                  inner loop execution. The entry is
-                                  determined by the oh_block exectuion.
*/ - unsigned char - exec_flag; /* Flags passed by driver execution to inner kernel */ - }; - }; + size_t kh_count; + size_t oh_count; + size_t oh_index; + size_t filter_pad_off; + unsigned char + exec_flags; /* Flags passed by driver execution to inner kernel */ }; struct jit_wino_transform_call_s { @@ -370,30 +413,13 @@ struct jit_1x1_conv_conf_t { int kh, kw; int stride_h, stride_w; memory_format_t src_fmt; + memory_format_t dst_fmt; bool with_bias; bool with_sum; bool with_eltwise; bool with_dw_conv; - alg_kind_t eltwise_alg; - float eltwise_alpha; - float eltwise_beta; - float eltwise_scale; - - int dw_conv_in_h; - int dw_conv_in_w; - int dw_conv_ker_h; - int dw_conv_ker_w; - int dw_conv_str_h; - int dw_conv_str_w; - const float* dw_conv_weights; - const float* dw_conv_biases; - - bool dw_conv_with_sum; - bool dw_conv_with_eltwise; - alg_kind_t dw_conv_eltwise_alg; - float dw_conv_eltwise_alpha; - float dw_conv_eltwise_beta; + post_ops_t::entry_t::eltwise_t eltwise; int is, os; int ic_block, oc_block; @@ -427,10 +453,12 @@ struct jit_1x1_conv_conf_t { int tr_is; int nthr, nthr_mb, nthr_g, nthr_oc_b, nthr_ic_b; int is_oc_scale; + data_type_t src_dt; data_type_t bia_dt; data_type_t dst_dt; bool signed_input; float wei_adj_scale; + int dw_conv_oh, dw_conv_ow; /* u8s8s32x */ int ic_dim, nb_ic, nb_ic_blocking, nb_ic_blocking_max; @@ -454,8 +482,7 @@ struct jit_gemm_conv_conf_t { int stride_h, stride_w, stride_d; int dilate_h, dilate_w, dilate_d; memory_format_t src_fmt; - bool with_bias, with_relu; - float relu_negative_slope; + bool with_bias; int is, os, ks; int ic_block, oc_block; @@ -465,6 +492,9 @@ struct jit_gemm_conv_conf_t { bool need_wei_reduction; bool signed_input; float wei_adj_scale; + int oh_block; + int ow_block; + bool outer_threading; }; struct jit_1x1_conv_call_s { diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_conv_kernel_f32.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_conv_kernel_f32.cpp index cbce262..3ba4715 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_conv_kernel_f32.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_conv_kernel_f32.cpp @@ -139,7 +139,7 @@ void jit_sse42_1x1_conv_kernel_f32::generate_reduce_loop( default: if (jcp.with_dw_conv) return ptr[aux_reg_output_data + - (i * jcp.dw_conv_ker_h * jcp.ow + j) * jcp.oc_block * sizeof(float) + n*4*sizeof(float)]; + (i * jcp_dw.kh * jcp.ow + j) * jcp.oc_block * sizeof(float) + n*4*sizeof(float)]; else return ptr[aux_reg_output_data + (i * jcp.os + j) * jcp.oc_block * sizeof(float) + n*4*sizeof(float)]; @@ -185,7 +185,6 @@ void jit_sse42_1x1_conv_kernel_f32::generate_reduce_loop( }; // init() auto store = [=]() { - Label store_done; Label store_noadd; if (!jcp.with_sum) { @@ -203,16 +202,13 @@ void jit_sse42_1x1_conv_kernel_f32::generate_reduce_loop( L(store_noadd); - Label store_norelu; + Label store_no_postops; test(reg_reduce_pos_flag, FLAG_REDUCE_LAST); - jz(store_norelu, T_NEAR); + jz(store_no_postops, T_NEAR); int eltwise_inj_idx = 0; int depthwise_inj_idx = 0; const auto &p = attr_.post_ops_; - if (p.len_ == 0 && eltwise_injectors.size() == 1) { - eltwise_injectors[0]->compute_vector_range(1, 2 * ur * load_loop_blk + 1); - } int end_idx = jcp.with_dw_conv ? 
p.find(primitive_kind::convolution) : p.len_; for (int i = 0; i < end_idx; i++) { @@ -244,15 +240,13 @@ void jit_sse42_1x1_conv_kernel_f32::generate_reduce_loop( } } - L(store_norelu); + L(store_no_postops); for (int j = 0; j < ur; ++j) for (int i = 0; i < load_loop_blk; ++i) { movups(output_ptr(i, j, 0), reg_accum(i, j, 0)); movups(output_ptr(i, j, 1), reg_accum(i, j, 1)); } - - L(store_done); }; auto fma_block = [=](bool last_block) { @@ -375,12 +369,6 @@ void jit_sse42_1x1_conv_kernel_f32::generate_diff_bias_loop(int load_loop_blk) void jit_sse42_1x1_conv_kernel_f32::generate() { - if (jcp.with_eltwise) { - eltwise_injectors.push_back(new jit_uni_eltwise_injector_f32( - this, jcp.eltwise_alg, jcp.eltwise_alpha, 0 - )); - } - const auto &p = attr_.post_ops_; int end_idx = jcp.with_dw_conv ? p.find(primitive_kind::convolution) : p.len_; for (int i = 0; i < end_idx; i++) { @@ -513,24 +501,15 @@ bool jit_sse42_1x1_conv_kernel_f32::post_ops_ok( auto is_simple = [&](int idx) { return is_eltwise(idx) || is_depthwise(idx); }; switch (p.len_) { - case 0: return true; // no post_ops - case 1: - return true // sum OR eltwise OR dw_conv - && !jcp.with_eltwise && (is_simple(0) || is_sum(0) || is_dw_conv(0)); - case 2: - return true // sum->eltwise OR dw_conv->eltwise OR eltwise->dw_conv OR dw_conv->sum OR sum->depthwise OR - // eltwise->depthwise OR depthwise->depthwise - && !jcp.with_eltwise && ((is_sum(0) && is_simple(1)) || (is_dw_conv(0) && is_eltwise(1)) || - (is_eltwise(0) && is_dw_conv(1)) || (is_dw_conv(0) && is_sum(1)) || - (is_simple(0) && is_simple(1))); - case 3: - return true // eltwise->dw_conv->eltwise OR dw_conv->sum->eltwise OR sum->eltwise->depthwise OR - // sum->depthwise->eltwise OR sum->depthwise->depthwise - && !jcp.with_eltwise && ((is_eltwise(0) && is_dw_conv(1) && is_eltwise(2)) || - (is_dw_conv(0) && is_sum(1) && is_eltwise(2)) || - (is_sum(0) && is_simple(1) && is_simple(2))); - case 4: return true // eltwise->dw_conv->sum->eltwise - && !jcp.with_eltwise && (is_eltwise(0) && is_dw_conv(1) && is_sum(2) && is_eltwise(3)); + case 0: return true; + case 1: return is_simple(0) || is_sum(0) || is_dw_conv(0); + case 2: return (is_sum(0) && is_simple(1)) || (is_dw_conv(0) && is_eltwise(1)) || + (is_eltwise(0) && is_dw_conv(1)) || (is_dw_conv(0) && is_sum(1)) || + (is_simple(0) && is_simple(1)); + case 3: return (is_eltwise(0) && is_dw_conv(1) && is_eltwise(2)) || + (is_dw_conv(0) && is_sum(1) && is_eltwise(2)) || + (is_sum(0) && is_simple(1) && is_simple(2)); + case 4: return (is_eltwise(0) && is_dw_conv(1) && is_sum(2) && is_eltwise(3)); default: return false; } @@ -540,7 +519,7 @@ bool jit_sse42_1x1_conv_kernel_f32::post_ops_ok( status_t jit_sse42_1x1_conv_kernel_f32::init_conf(jit_1x1_conv_conf_t &jcp, const convolution_desc_t &cd, const memory_desc_wrapper &src_d, const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d, - const primitive_attr_t &attr, bool with_relu, float relu_negative_slope) + const primitive_attr_t &attr) { if (!mayiuse(sse42)) return status::unimplemented; @@ -576,47 +555,25 @@ status_t jit_sse42_1x1_conv_kernel_f32::init_conf(jit_1x1_conv_conf_t &jcp, jcp.src_fmt = src_d.format(); jcp.with_bias = cd.bias_desc.format != memory_format::undef; - jcp.with_eltwise = with_relu; - jcp.eltwise_alg = mkldnn_eltwise_relu; - jcp.eltwise_alpha = relu_negative_slope; - if (!post_ops_ok(jcp, attr)) return status::unimplemented; const auto &p = attr.post_ops_; - jcp.with_dw_conv = false; - int dw_conv_ind = p.find(primitive_kind::convolution); - if 
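post_ops_ok() now whitelists post-op chains structurally instead of consulting a separate with_eltwise flag. For orientation, this is roughly how a caller assembles one accepted chain (sum followed by ReLU, i.e. case 2's is_sum(0) && is_simple(1)) through the mkl-dnn v0.x C API. A hedged sketch: the entry points are assumed from that API and error checks are elided.

    #include "mkldnn.h"

    // Build a primitive_attr carrying post-ops "sum -> relu".
    static mkldnn_primitive_attr_t make_sum_relu_attr(void) {
        mkldnn_post_ops_t po;
        mkldnn_post_ops_create(&po);
        mkldnn_post_ops_append_sum(po, 1.f /*scale*/);
        mkldnn_post_ops_append_eltwise(po, 1.f /*scale*/,
                mkldnn_eltwise_relu, 0.f /*alpha*/, 0.f /*beta*/);
        mkldnn_primitive_attr_t attr;
        mkldnn_primitive_attr_create(&attr);
        mkldnn_primitive_attr_set_post_ops(attr, po);
        mkldnn_post_ops_destroy(po); // the attr keeps its own copy
        return attr;
    }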
(dw_conv_ind != -1) { - jcp.with_dw_conv = true; - jcp.dw_conv_in_h = p.entry_[dw_conv_ind].dw_conv.in_h; - jcp.dw_conv_in_w = p.entry_[dw_conv_ind].dw_conv.in_w; - jcp.dw_conv_ker_h = p.entry_[dw_conv_ind].dw_conv.ker_h; - jcp.dw_conv_ker_w = p.entry_[dw_conv_ind].dw_conv.ker_w; - jcp.dw_conv_str_h = p.entry_[dw_conv_ind].dw_conv.str_h; - jcp.dw_conv_str_w = p.entry_[dw_conv_ind].dw_conv.str_w; - jcp.dw_conv_weights = p.entry_[dw_conv_ind].dw_conv.weights_data; - jcp.dw_conv_biases = p.entry_[dw_conv_ind].dw_conv.biases_data; - } + int dw_conv_ind = p.find(primitive_kind::convolution); + jcp.with_dw_conv = dw_conv_ind != -1; if (jcp.with_dw_conv) { - int dw_conv_eltwise_ind = p.find(primitive_kind::eltwise, dw_conv_ind); - if (dw_conv_eltwise_ind != -1) { - jcp.dw_conv_with_eltwise = true; - jcp.dw_conv_eltwise_alg = p.entry_[dw_conv_eltwise_ind].eltwise.alg; - jcp.dw_conv_eltwise_alpha = p.entry_[dw_conv_eltwise_ind].eltwise.alpha; - jcp.dw_conv_eltwise_beta = p.entry_[dw_conv_eltwise_ind].eltwise.beta; - } + jcp.dw_conv_oh = jcp.oh; + jcp.dw_conv_ow = jcp.ow; + jcp.oh = p.entry_[dw_conv_ind].dw_conv.in_h; + jcp.ow = p.entry_[dw_conv_ind].dw_conv.in_w; } jcp.with_sum = p.find(primitive_kind::sum, 0, dw_conv_ind) != -1; - if (jcp.with_dw_conv) { - jcp.dw_conv_with_sum = p.find(primitive_kind::sum, dw_conv_ind) != -1; - } - if (jcp.with_dw_conv) { - jcp.oh = jcp.dw_conv_in_h; - jcp.ow = jcp.dw_conv_in_w; - } + jcp.src_dt = cd.src_desc.data_type; + jcp.bia_dt = jcp.with_bias ? cd.bias_desc.data_type : data_type::undef; + jcp.dst_dt = cd.dst_desc.data_type; jcp.os = jcp.oh * jcp.ow; jcp.is = jcp.ih * jcp.iw; @@ -791,6 +748,24 @@ status_t jit_sse42_1x1_conv_kernel_f32::init_conf(jit_1x1_conv_conf_t &jcp, return status::success; } +void jit_sse42_1x1_conv_kernel_f32::init_scratchpad( + memory_tracking::registrar_t &scratchpad, + const jit_1x1_conv_conf_t &jcp, const jit_conv_conf_t &jcp_dw) { + using namespace mkldnn::impl::memory_tracking::names; + + if (jcp.prop_kind != backward_data && jcp.oc != jcp.oc_without_padding) + scratchpad.book(key_conv_padded_bias, sizeof(float) * jcp.oc); + + if (jcp.with_dw_conv) { + const int nthreads = mkldnn_get_max_threads(); + size_t dw_conv_buffer_size_ = (size_t)jcp_dw.kh * jcp_dw.iw * jcp_dw.ch_block * (jcp.oc / jcp.oc_block); + scratchpad.book(key_dw_conv_buffer, sizeof(float) * dw_conv_buffer_size_ * nthreads); + + if (jcp.oc != jcp.oc_without_padding) + scratchpad.book(key_dw_conv_padded_bias, sizeof(float) * jcp.oc); + } +} + } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_conv_kernel_f32.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_conv_kernel_f32.hpp index f2b7edd..f41daf1 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_conv_kernel_f32.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_conv_kernel_f32.hpp @@ -18,9 +18,9 @@ #define JIT_SSE42_1x1_CONV_KERNEL_F32_HPP #include "c_types_map.hpp" +#include "cpu_memory.hpp" #include "jit_generator.hpp" #include "jit_primitive_conf.hpp" -#include "cpu_memory.hpp" #include "jit_uni_eltwise.hpp" #include "jit_uni_depthwise.hpp" @@ -29,8 +29,10 @@ namespace impl { namespace cpu { struct jit_sse42_1x1_conv_kernel_f32: public jit_generator { - jit_sse42_1x1_conv_kernel_f32(jit_1x1_conv_conf_t ajcp, - const primitive_attr_t &attr): jcp(ajcp), attr_(attr) { + jit_sse42_1x1_conv_kernel_f32(jit_1x1_conv_conf_t ajcp, jit_conv_conf_t ajcp_dw, + const primitive_attr_t &attr) + : jcp(ajcp), jcp_dw(ajcp_dw), attr_(attr) + { 
this->generate(); jit_ker = (void (*)(jit_1x1_conv_call_s *))this->getCode(); } @@ -53,22 +55,15 @@ struct jit_sse42_1x1_conv_kernel_f32: public jit_generator { const memory_desc_wrapper &src_d, const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d, - const primitive_attr_t &attr, - bool with_relu, float relu_negative_slope); + const primitive_attr_t &attr); - static status_t init_conf(jit_1x1_conv_conf_t &jcp, - const convolution_desc_t &cd, - const memory_desc_wrapper &src_d, - const memory_desc_wrapper &weights_d, - const memory_desc_wrapper &dst_d, - const primitive_attr_t &attr) - { - return init_conf(jcp, cd, src_d, weights_d, dst_d, attr, false, 0.0); - } + static void init_scratchpad(memory_tracking::registrar_t &scratchpad, + const jit_1x1_conv_conf_t &jcp, const jit_conv_conf_t &jcp_dw = jit_conv_conf_t()); DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse42_1x1_conv_kernel_f32) jit_1x1_conv_conf_t jcp; + jit_conv_conf_t jcp_dw; const primitive_attr_t &attr_; void (*jit_ker)(jit_1x1_conv_call_s *); diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_convolution.cpp index 3b95a10..2fe6e8f 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_convolution.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_convolution.cpp @@ -34,36 +34,38 @@ namespace cpu { using namespace mkldnn::impl::status; using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; using namespace mkldnn::impl::utils; -template -void _jit_sse42_1x1_convolution_fwd_t::execute_forward() { +void jit_sse42_1x1_convolution_fwd_t::execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); const int ndims = src_d.ndims(); const auto &jcp = kernel_->jcp; - int MB = conf_.MB(); + int MB = pd()->MB(); const int work_amount = MB * jcp.ngroups * jcp.nb_bcast; - if (conf_.want_padded_bias()) { - for (int oc = 0; oc < jcp.oc_without_padding; ++oc) - padded_bias_[oc] = bias[oc]; - bias = padded_bias_; + if (pd()->wants_padded_bias()) { + auto padded_bias = scratchpad().get(key_conv_padded_bias); + utils::array_copy(padded_bias, bias, jcp.oc_without_padding); + utils::array_set(padded_bias + jcp.oc_without_padding, 0.f, + jcp.oc - jcp.oc_without_padding); + bias = padded_bias; } parallel(0, [&](const int ithr, const int nthr) { // TODO (Roma): remove this restriction assert(jcp.stride_w == 1 && jcp.stride_h == 1); - jit_1x1_conv_call_s par_conv = {}; + auto par_conv = jit_1x1_conv_call_s(); const int nb_oc = jcp.nb_load; const int nb_ic = jcp.nb_reduce; @@ -120,7 +122,7 @@ void _jit_sse42_1x1_convolution_fwd_t::execute_forward() { const size_t src_off = data_blk_off(src_d, n, _icb, ih, iw); par_conv.bcast_data = &src[src_off]; - par_conv.load_data = &weights[conf_.with_groups() + par_conv.load_data = &weights[pd()->with_groups() ? 
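The wants_padded_bias() blocks above replace the old malloc'ed padded_bias_ members with scratchpad storage. The reason: the kernel loads bias in whole oc_block-wide vectors, so the array must be valid (and zero) past oc_without_padding up to the padded oc. The operation as a tiny illustrative helper, not the patch's own code:

    #include <algorithm>

    // Copy the real bias and zero the padded tail; callers then use the
    // returned scratchpad copy in place of the user's bias pointer.
    static const float *pad_bias(float *scratch, const float *bias,
            int oc_without_padding, int oc_padded) {
        std::copy(bias, bias + oc_without_padding, scratch);
        std::fill(scratch + oc_without_padding, scratch + oc_padded, 0.f);
        return scratch;
    }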
weights_d.blk_off(g, ocb, icb) : weights_d.blk_off(ocb, icb)]; @@ -135,22 +137,25 @@ void _jit_sse42_1x1_convolution_fwd_t::execute_forward() { iwork += bcast_step; } }); + + if (pd()->wants_zero_pad_dst()) + output_memory_primitive(0)->zero_pad(); } -template -void _jit_sse42_1x1_convolution_fwd_t::execute_forward_fusing() { +void jit_sse42_1x1_convolution_fwd_t::execute_forward_with_dw_conv() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); - auto &jcp = kernel_->jcp; - int MB = conf_.MB(); + const auto &jcp = kernel_->jcp; + const auto &jcp_dw = kernel_dw_->jcp; + int MB = pd()->MB(); - auto dw_bias = jcp.dw_conv_biases; + auto dw_bias = jcp_dw.conv_biases; int ocb_work = jcp.with_dw_conv ? utils::div_up(jcp.nb_load, jcp.nb_load_blocking) : 1; const int work_amount = MB * jcp.ngroups * ocb_work * jcp.nb_bcast; @@ -173,8 +178,8 @@ void _jit_sse42_1x1_convolution_fwd_t::execute_forward_fusing() { if ((oh + h) < 0 || (oh + h) >= jcp.ih) { for (int chb = ocb; chb < ocb + load_step; chb++) { - memset(ws_p + (((oh + h) + 1) % jcp.dw_conv_ker_h) * jcp.ow * jcp.oc_block + - (chb - ocb) * jcp.dw_conv_ker_h * jcp.ow * jcp.oc_block, 0, jcp.ow * jcp.oc_block * sizeof(float)); + memset(ws_p + (((oh + h) + 1) % jcp_dw.kh) * jcp.ow * jcp.oc_block + + (chb - ocb) * jcp_dw.kh * jcp.ow * jcp.oc_block, 0, jcp.ow * jcp.oc_block * sizeof(float)); } } else { const int _ocb = g * jcp.nb_load + ocb; @@ -182,7 +187,7 @@ void _jit_sse42_1x1_convolution_fwd_t::execute_forward_fusing() { p.bcast_dim = this_block_size(os, jcp.os, bcast_step * os_block); p.load_dim = this_block_size(ocb * jcp.oc_block, jcp.oc, load_step * jcp.oc_block); - p.output_data = &ws_p[(((oh + h) + 1) % jcp.dw_conv_ker_h) * jcp.ow * jcp.oc_block]; + p.output_data = &ws_p[(((oh + h) + 1) % jcp_dw.kh) * jcp.ow * jcp.oc_block]; p.bias_data = &bias[_ocb * jcp.oc_block]; @@ -194,7 +199,7 @@ void _jit_sse42_1x1_convolution_fwd_t::execute_forward_fusing() { p.reduce_dim = this_block_size(icb * jcp.ic_block, jcp.ic, jcp.nb_reduce_blocking * jcp.ic_block); - p.load_data = &weights[conf_.with_groups() + p.load_data = &weights[pd()->with_groups() ? 
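The fused 1x1-plus-depthwise path above never materializes the intermediate tensor: compute_block_1x1() drops each produced row into a jcp_dw.kh-row ring buffer (ws_p / pbuf) and compute_row_dw() consumes kh consecutive rows per depthwise output row. The slot arithmetic, extracted as a sketch where row_stride stands for ow * oc_block:

    // Ring-buffer slot for intermediate row 'row'; matches the
    // (((oh + h) + 1) % jcp_dw.kh) indexing used for ws_p above.
    static float *row_slot(float *buf, int row, int kh, int row_stride) {
        return buf + ((row + 1) % kh) * row_stride;
    }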
weights_d.blk_off(g, ocb, icb) : weights_d.blk_off(ocb, icb)]; @@ -210,8 +215,6 @@ void _jit_sse42_1x1_convolution_fwd_t::execute_forward_fusing() { }; auto compute_row_dw = [&](const float* ws_p, int n, int ocb, int load_step, int dst_idx) { - const auto &jcp_dw = kernel_dw_->jcp; - for (int chb = ocb; chb < ocb + load_step; chb++) { auto par_conv_dw = jit_conv_call_s(); @@ -226,9 +229,11 @@ void _jit_sse42_1x1_convolution_fwd_t::execute_forward_fusing() { dst_idx/jcp_dw.stride_h*jcp_dw.ow*jcp_dw.ch_block]; par_conv_dw.kh_padding = jcp_dw.kh; - par_conv_dw.filt = &jcp.dw_conv_weights[chb * jcp_dw.kh * jcp_dw.kw * jcp_dw.ch_block]; + par_conv_dw.filt = &jcp_dw.conv_weights[chb * jcp_dw.kh * jcp_dw.kw * jcp_dw.ch_block]; par_conv_dw.bias = &dw_bias[chb * jcp_dw.ch_block]; par_conv_dw.ur_w = (size_t)(jcp_dw.ow); + par_conv_dw.oc_work = nstl::min((chb + 1) * jcp_dw.ch_block, (int)jcp_dw.oc) - chb*jcp_dw.ch_block; + par_conv_dw.oc_off = chb * jcp_dw.ch_block * sizeof(float); kernel_dw_->jit_ker(&par_conv_dw); } @@ -239,11 +244,12 @@ void _jit_sse42_1x1_convolution_fwd_t::execute_forward_fusing() { int start{0}, end{0}; balance211(work_amount, nthr, ithr, start, end); - auto pbuf = dw_conv_buffer_ + ithr * dw_conv_buffer_size_; + auto dw_conv_buffer = scratchpad().get(key_dw_conv_buffer); + size_t dw_conv_buffer_size_ = (size_t)jcp_dw.kh * jcp_dw.iw * jcp_dw.ch_block * (jcp.oc / jcp.oc_block); + auto pbuf = dw_conv_buffer + ithr * dw_conv_buffer_size_; const int os_block = jcp.iw; - int iwork = start; while (iwork < end) { int n{0}, g{0}, ocbb{0}, osb{0}; @@ -272,7 +278,7 @@ void _jit_sse42_1x1_convolution_fwd_t::execute_forward_fusing() { compute_block_1x1(pbuf, n, g, oh + 1, ow, ih, iw, os, os_block, bcast_step, ocb, load_step, bcast_step); } - if ((oh % jcp.dw_conv_str_h == 0)) { + if ((oh % jcp_dw.stride_h == 0)) { compute_row_dw(pbuf, n, ocb, load_step, oh); } @@ -280,23 +286,25 @@ void _jit_sse42_1x1_convolution_fwd_t::execute_forward_fusing() { } }; - if (conf_.want_padded_bias()) { - for (int oc = 0; oc < jcp.oc_without_padding; ++oc) - padded_bias_[oc] = bias[oc]; - bias = padded_bias_; - - for (int oc = 0; oc < jcp.oc_without_padding; ++oc) - dw_padded_bias_[oc] = dw_bias[oc]; - dw_bias = dw_padded_bias_; + if (pd()->wants_padded_bias()) { + auto padded_bias = scratchpad().get(key_conv_padded_bias); + utils::array_copy(padded_bias, bias, jcp.oc_without_padding); + utils::array_set(padded_bias + jcp.oc_without_padding, 0.f, + jcp.oc - jcp.oc_without_padding); + bias = padded_bias; + + auto dw_padded_bias = scratchpad().get(key_dw_conv_padded_bias); + utils::array_copy(dw_padded_bias, dw_bias, jcp.oc_without_padding); + utils::array_set(dw_padded_bias + jcp.oc_without_padding, 0.f, + jcp.oc - jcp.oc_without_padding); + dw_bias = dw_padded_bias; } parallel(0, ker); -} -template void _jit_sse42_1x1_convolution_fwd_t::execute_forward(); -template void _jit_sse42_1x1_convolution_fwd_t::execute_forward(); -template void _jit_sse42_1x1_convolution_fwd_t::execute_forward_fusing(); -template void _jit_sse42_1x1_convolution_fwd_t::execute_forward_fusing(); + if (pd()->wants_zero_pad_dst()) + output_memory_primitive(0)->zero_pad(); +} } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_convolution.hpp index a98619d..5931102 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_convolution.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_convolution.hpp @@ -20,7 
+20,6 @@ #include "c_types_map.hpp" #include "cpu_convolution_pd.hpp" #include "cpu_engine.hpp" -#include "cpu_reducer.hpp" #include "jit_sse42_1x1_conv_kernel_f32.hpp" #include "mkldnn_thread.hpp" #include "utils.hpp" @@ -30,65 +29,59 @@ namespace mkldnn { namespace impl { namespace cpu { -template -struct _jit_sse42_1x1_convolution_fwd_t: public cpu_primitive_t { +struct jit_sse42_1x1_convolution_fwd_t: public cpu_primitive_t { // TODO: (Roma) Code duplication duplication! Remove with templates // (maybe...)! - struct pd_t: public _cpu_convolution_fwd_pd_t { + struct pd_t: public cpu_convolution_fwd_pd_t { pd_t(engine_t *engine, - const typename pd_t::base_desc_t *adesc, + const convolution_desc_t *adesc, const primitive_attr_t *attr, const typename pd_t::base_class *hint_fwd_pd) - : _cpu_convolution_fwd_pd_t(engine, adesc, attr, - hint_fwd_pd) - , jcp_(), jcp_dw() {} + : cpu_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) + , jcp_(), jcp_dw_() {} DECLARE_COMMON_PD_T( JIT_IMPL_NAME_HELPER("jit_1x1:", sse42, ""), - _jit_sse42_1x1_convolution_fwd_t); + jit_sse42_1x1_convolution_fwd_t); virtual status_t init() override { using namespace prop_kind; assert(this->engine()->kind() == engine_kind::cpu); bool ok = true && this->set_default_params() == status::success - && utils::one_of(this->cdesc_().prop_kind, forward_training, + && utils::one_of(this->desc()->prop_kind, forward_training, forward_inference) - && this->cdesc_().alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() && utils::everyone_is(data_type::f32, - this->cdesc_().src_desc.data_type, - this->cdesc_().weights_desc.data_type, - this->cdesc_().dst_desc.data_type) + this->desc()->src_desc.data_type, + this->desc()->weights_desc.data_type, + this->desc()->dst_desc.data_type) && IMPLICATION(this->with_bias(), - data_type::f32 == this->cdesc_().bias_desc.data_type); + data_type::f32 == this->desc()->bias_desc.data_type); if (!ok) return status::unimplemented; status_t sts_1x1 = jit_sse42_1x1_conv_kernel_f32::init_conf(jcp_, - this->cdesc_(), + *this->desc(), *this->src_pd_.desc(), *this->weights_pd_.desc(), - *this->dst_pd_.desc(), *this->attr(), with_relu, - this->negative_slope()); + *this->dst_pd_.desc(), *this->attr()); if (sts_1x1 != status::success) return sts_1x1; if (jcp_.with_dw_conv) { - int dw_conv_oh = (jcp_.oh - ((jcp_.dw_conv_ker_h - 1) + 1) + 2) / jcp_.dw_conv_str_h + 1; - int dw_conv_ow = (jcp_.ow - ((jcp_.dw_conv_ker_w - 1) + 1) + 2) / jcp_.dw_conv_str_w + 1; - - status_t sts_dw = jit_uni_dw_conv_row_f32::init_conf(jcp_dw, - jcp_.oc, jcp_.oh, jcp_.ow, dw_conv_oh, dw_conv_ow, - jcp_.dw_conv_ker_h, jcp_.dw_conv_ker_w, - jcp_.dw_conv_str_h, jcp_.dw_conv_str_w, - jcp_.dw_conv_eltwise_alg, jcp_.dw_conv_eltwise_alpha, - jcp_.dw_conv_eltwise_beta, jcp_.dw_conv_with_sum); + status_t sts_dw = jit_uni_dw_conv_row_f32::init_conf(jcp_, jcp_dw_, *this->attr()); if (sts_dw != status::success) return sts_dw; } + auto scratchpad = scratchpad_registry().registrar(); + jit_sse42_1x1_conv_kernel_f32::init_scratchpad(scratchpad, jcp_, jcp_dw_); + return status::success; } jit_1x1_conv_conf_t jcp_; - jit_conv_conf_t jcp_dw; + jit_conv_conf_t jcp_dw_; protected: virtual status_t set_default_params() override { @@ -105,56 +98,36 @@ struct _jit_sse42_1x1_convolution_fwd_t: public cpu_primitive_t { : utils::pick(this->ndims() - 3, OIw8i8o, OIhw8i8o))); if (this->bias_pd_.desc()->format == any) 
CHECK(this->bias_pd_.set_format(x)); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } }; - _jit_sse42_1x1_convolution_fwd_t(const pd_t *pd, + jit_sse42_1x1_convolution_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), - dw_conv_buffer_size_(0), dw_conv_buffer_(nullptr), padded_bias_(nullptr), dw_padded_bias_(nullptr) + : cpu_primitive_t(apd, inputs, outputs) { - kernel_ = new jit_sse42_1x1_conv_kernel_f32(conf_.jcp_, *conf_.attr()); - if (conf_.jcp_.with_dw_conv) { - kernel_dw_ = new jit_uni_dw_conv_row_f32(conf_.jcp_dw); - - const int nthreads = mkldnn_get_max_threads(); - dw_conv_buffer_size_ = (size_t) conf_.jcp_dw.kh * conf_.jcp_dw.iw * conf_.jcp_dw.ch_block * - (conf_.jcp_.oc / conf_.jcp_.oc_block); - dw_conv_buffer_ = (data_t *) malloc(dw_conv_buffer_size_ * nthreads * sizeof(data_t), 64); - - } - - if (conf_.want_padded_bias()) { - const auto &j = conf_.jcp_; - assert(j.ngroups == 1); - padded_bias_ = (data_t *)malloc(sizeof(data_t) * j.oc, 64); - for (int oc = j.oc_without_padding; oc < j.oc; ++oc) - padded_bias_[oc] = 0; + kernel_ = new jit_sse42_1x1_conv_kernel_f32(pd()->jcp_, pd()->jcp_dw_, *pd()->attr()); - dw_padded_bias_ = (data_t *)malloc(sizeof(data_t) * j.oc, 64); - for (int oc = j.oc_without_padding; oc < j.oc; ++oc) - dw_padded_bias_[oc] = 0; + if (pd()->jcp_.with_dw_conv) { + kernel_dw_ = new jit_uni_dw_conv_row_f32(pd()->jcp_dw_, *pd()->attr(), pd()->jcp_dw_.ch_block); } } - ~_jit_sse42_1x1_convolution_fwd_t() { + ~jit_sse42_1x1_convolution_fwd_t() { delete kernel_; - if (conf_.jcp_.with_dw_conv) { + if (pd()->jcp_.with_dw_conv) { delete kernel_dw_; - free(dw_conv_buffer_); - free(dw_padded_bias_); } - - free(padded_bias_); }; typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { - if (conf_.jcp_.with_dw_conv) - execute_forward_fusing(); + virtual void execute(event_t *e) const { + if (pd()->jcp_.with_dw_conv) + execute_forward_with_dw_conv(); else execute_forward(); @@ -162,24 +135,14 @@ struct _jit_sse42_1x1_convolution_fwd_t: public cpu_primitive_t { } private: - void execute_forward(); - void execute_forward_fusing(); + void execute_forward() const; + void execute_forward_with_dw_conv() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } - pd_t conf_; jit_sse42_1x1_conv_kernel_f32 *kernel_; jit_uni_dw_conv_row_f32 *kernel_dw_; - - /* fuse with dw conv */ - size_t dw_conv_buffer_size_; - data_t *dw_conv_buffer_; - - data_t *padded_bias_; - data_t *dw_padded_bias_; }; -using jit_sse42_1x1_convolution_fwd_t = _jit_sse42_1x1_convolution_fwd_t; -using jit_sse42_1x1_convolution_relu_t = _jit_sse42_1x1_convolution_fwd_t; - } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_conv_kernel_f32.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_conv_kernel_f32.cpp index 32f1903..c192504 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_conv_kernel_f32.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_conv_kernel_f32.cpp @@ -29,6 +29,7 @@ namespace cpu { using namespace mkldnn::impl::prop_kind; using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; using namespace mkldnn::impl::utils; using namespace Xbyak; @@ -170,7 +171,7 @@ void jit_sse42_conv_fwd_kernel_f32::width_blk_step(int ur_w, for (int jj = 0; jj < ur_w; jj++) { int o_off; if 
(jcp.with_dw_conv) - o_off = (ii * jcp.dw_conv_ker_h * ow + jj) * oc_blk; + o_off = (ii * jcp_dw.kh * ow + jj) * oc_blk; else o_off = (ii * oh * ow + jj) * oc_blk; @@ -206,7 +207,8 @@ void jit_sse42_conv_fwd_kernel_f32::width_blk_step(int ur_w, Label skip_kh_loop; mov(kj, reg_kh); - if ((jcp.kh - 1) * (jcp.dilate_h + 1) < nstl::max(jcp.t_pad, jcp.b_pad)) { + if ((jcp.dilate_h >= jcp.ih) + || (jcp.kh - 1) * (jcp.dilate_h + 1) < nstl::max(jcp.t_pad, jcp.b_pad)) { cmp(kj, 0); je(skip_kh_loop, T_NEAR); } @@ -240,10 +242,6 @@ void jit_sse42_conv_fwd_kernel_f32::width_blk_step(int ur_w, int depthwise_inj_idx = 0; const auto &p = attr_.post_ops_; - if (p.len_ == 0 && eltwise_injectors.size() == 1) { - eltwise_injectors[0]->compute_vector_range(1, oc_blocks * ur_w + 1); - } - int end_idx = jcp.with_dw_conv ? p.find(primitive_kind::convolution) : p.len_; for (int i = 0; i < end_idx; i++) { auto& post_op = p.entry_[i]; @@ -275,7 +273,7 @@ void jit_sse42_conv_fwd_kernel_f32::width_blk_step(int ur_w, for (int jj = 0; jj < ur_w; jj++) { int o_off; if (jcp.with_dw_conv) - o_off = (ii * jcp.dw_conv_ker_h * ow + jj) * oc_blk; + o_off = (ii * jcp_dw.kh * ow + jj) * oc_blk; else o_off = (ii * oh * ow + jj) * oc_blk; @@ -284,8 +282,6 @@ void jit_sse42_conv_fwd_kernel_f32::width_blk_step(int ur_w, } } - L(done); - mov(aux_reg_kernel, reg_kernel); mov(aux_reg_input, reg_input); add(aux_reg_kernel, sizeof(float) * 4); @@ -359,12 +355,6 @@ inline void jit_sse42_conv_fwd_kernel_f32::solve_common(int oc_blocks) void jit_sse42_conv_fwd_kernel_f32::generate() { - if (jcp.with_eltwise) { - eltwise_injectors.push_back(new jit_uni_eltwise_injector_f32( - this, jcp.eltwise_alg, jcp.eltwise_alpha, 0 - )); - } - const auto &p = attr_.post_ops_; int end_idx = jcp.with_dw_conv ? p.find(primitive_kind::convolution) : p.len_; for (int i = 0; i < end_idx; i++) { @@ -431,24 +421,15 @@ bool jit_sse42_conv_fwd_kernel_f32::post_ops_ok( auto is_simple = [&](int idx) { return is_eltwise(idx) || is_depthwise(idx); }; switch (p.len_) { - case 0: return true; // no post_ops - case 1: - return true // sum OR eltwise OR dw_conv - && !jcp.with_eltwise && (is_simple(0) || is_sum(0) || is_dw_conv(0)); - case 2: - return true // sum->eltwise OR dw_conv->eltwise OR eltwise->dw_conv OR dw_conv->sum OR sum->depthwise OR - // eltwise->depthwise OR depthwise->depthwise - && !jcp.with_eltwise && ((is_sum(0) && is_simple(1)) || (is_dw_conv(0) && is_eltwise(1)) || - (is_eltwise(0) && is_dw_conv(1)) || (is_dw_conv(0) && is_sum(1)) || - (is_simple(0) && is_simple(1))); - case 3: - return true // eltwise->dw_conv->eltwise OR dw_conv->sum->eltwise OR sum->eltwise->depthwise OR - // sum->depthwise->eltwise OR sum->depthwise->depthwise - && !jcp.with_eltwise && ((is_eltwise(0) && is_dw_conv(1) && is_eltwise(2)) || - (is_dw_conv(0) && is_sum(1) && is_eltwise(2)) || - (is_sum(0) && is_simple(1) && is_simple(2))); - case 4: return true // eltwise->dw_conv->sum->eltwise - && !jcp.with_eltwise && (is_eltwise(0) && is_dw_conv(1) && is_sum(2) && is_eltwise(3)); + case 0: return true; + case 1: return is_simple(0) || is_sum(0) || is_dw_conv(0); + case 2: return (is_sum(0) && is_simple(1)) || (is_dw_conv(0) && is_eltwise(1)) || + (is_eltwise(0) && is_dw_conv(1)) || (is_dw_conv(0) && is_sum(1)) || + (is_simple(0) && is_simple(1)); + case 3: return (is_eltwise(0) && is_dw_conv(1) && is_eltwise(2)) || + (is_dw_conv(0) && is_sum(1) && is_eltwise(2)) || + (is_sum(0) && is_simple(1) && is_simple(2)); + case 4: return (is_eltwise(0) && is_dw_conv(1) && is_sum(2) && 
is_eltwise(3)); default: return false; } @@ -458,7 +439,7 @@ bool jit_sse42_conv_fwd_kernel_f32::post_ops_ok( status_t jit_sse42_conv_fwd_kernel_f32::init_conf(jit_conv_conf_t &jcp, const convolution_desc_t &cd, const memory_desc_wrapper &src_d, const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d, - const primitive_attr_t &attr, bool with_relu, float relu_negative_slope) + const primitive_attr_t &attr) { if (!mayiuse(sse42)) return status::unimplemented; @@ -496,47 +477,26 @@ status_t jit_sse42_conv_fwd_kernel_f32::init_conf(jit_conv_conf_t &jcp, jcp.src_fmt = src_d.format(); jcp.with_bias = cd.bias_desc.format != memory_format::undef; - jcp.with_eltwise = with_relu; - jcp.eltwise_alg = mkldnn_eltwise_relu; - jcp.eltwise_alpha = relu_negative_slope; if (!post_ops_ok(jcp, attr)) return status::unimplemented; const auto &p = attr.post_ops_; - jcp.with_dw_conv = false; - int dw_conv_ind = p.find(primitive_kind::convolution); - if (dw_conv_ind != -1) { - jcp.with_dw_conv = true; - jcp.dw_conv_in_h = p.entry_[dw_conv_ind].dw_conv.in_h; - jcp.dw_conv_in_w = p.entry_[dw_conv_ind].dw_conv.in_w; - jcp.dw_conv_ker_h = p.entry_[dw_conv_ind].dw_conv.ker_h; - jcp.dw_conv_ker_w = p.entry_[dw_conv_ind].dw_conv.ker_w; - jcp.dw_conv_str_h = p.entry_[dw_conv_ind].dw_conv.str_h; - jcp.dw_conv_str_w = p.entry_[dw_conv_ind].dw_conv.str_w; - jcp.dw_conv_weights = p.entry_[dw_conv_ind].dw_conv.weights_data; - jcp.dw_conv_biases = p.entry_[dw_conv_ind].dw_conv.biases_data; - } + int dw_conv_ind = p.find(primitive_kind::convolution); + jcp.with_dw_conv = dw_conv_ind != -1; if (jcp.with_dw_conv) { - int dw_conv_eltwise_ind = p.find(primitive_kind::eltwise, dw_conv_ind); - if (dw_conv_eltwise_ind != -1) { - jcp.dw_conv_with_eltwise = true; - jcp.dw_conv_eltwise_alg = p.entry_[dw_conv_eltwise_ind].eltwise.alg; - jcp.dw_conv_eltwise_alpha = p.entry_[dw_conv_eltwise_ind].eltwise.alpha; - jcp.dw_conv_eltwise_beta = p.entry_[dw_conv_eltwise_ind].eltwise.beta; - } + jcp.dw_conv_oh = jcp.oh; + jcp.dw_conv_ow = jcp.ow; + jcp.oh = p.entry_[dw_conv_ind].dw_conv.in_h; + jcp.ow = p.entry_[dw_conv_ind].dw_conv.in_w; } jcp.with_sum = p.find(primitive_kind::sum, 0, dw_conv_ind) != -1; - if (jcp.with_dw_conv) { - jcp.dw_conv_with_sum = p.find(primitive_kind::sum, dw_conv_ind) != -1; - } - if (jcp.with_dw_conv) { - jcp.oh = jcp.dw_conv_in_h; - jcp.ow = jcp.dw_conv_in_w; - } + jcp.src_dt = cd.src_desc.data_type; + jcp.bia_dt = jcp.with_bias ? 
cd.bias_desc.data_type : data_type::undef; + jcp.dst_dt = cd.dst_desc.data_type; const bool flat = jcp.ic == 3 || jcp.ic == 1; const bool mimo = !flat; @@ -613,6 +573,21 @@ status_t jit_sse42_conv_fwd_kernel_f32::init_conf(jit_conv_conf_t &jcp, return status::success; } +void jit_sse42_conv_fwd_kernel_f32::init_scratchpad( + memory_tracking::registrar_t &scratchpad, const jit_conv_conf_t &jcp, const jit_conv_conf_t &jcp_dw) { + if (jcp.with_bias && jcp.oc != jcp.oc_without_padding) + scratchpad.book(key_conv_padded_bias, sizeof(float) * jcp.oc); + + if (jcp.with_dw_conv) { + const int nthreads = mkldnn_get_max_threads(); + size_t dw_conv_buffer_size_ = (size_t)jcp_dw.kh * jcp_dw.iw * jcp_dw.ch_block * jcp.nb_oc_blocking; + scratchpad.book(key_dw_conv_buffer, sizeof(float) * dw_conv_buffer_size_ * nthreads); + + if (jcp.oc != jcp.oc_without_padding) + scratchpad.book(key_dw_conv_padded_bias, sizeof(float) * jcp.oc); + } +} + } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_conv_kernel_f32.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_conv_kernel_f32.hpp index ea30028..f30952f 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_conv_kernel_f32.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_conv_kernel_f32.hpp @@ -18,9 +18,9 @@ #define JIT_SSE42_CONV_KERNEL_F32_HPP #include "c_types_map.hpp" +#include "cpu_memory.hpp" #include "jit_generator.hpp" #include "jit_primitive_conf.hpp" -#include "cpu_memory.hpp" #include "jit_uni_eltwise.hpp" #include "jit_uni_depthwise.hpp" @@ -29,8 +29,9 @@ namespace impl { namespace cpu { struct jit_sse42_conv_fwd_kernel_f32: public jit_generator { - jit_sse42_conv_fwd_kernel_f32(jit_conv_conf_t ajcp, - const primitive_attr_t &attr): jcp(ajcp), attr_(attr) + jit_sse42_conv_fwd_kernel_f32(jit_conv_conf_t ajcp, jit_conv_conf_t ajcp_dw, + const primitive_attr_t &attr) + : jcp(ajcp), jcp_dw(ajcp_dw), attr_(attr) { this->generate(); jit_ker = (void (*)(jit_conv_call_s *))this->getCode(); @@ -52,11 +53,13 @@ struct jit_sse42_conv_fwd_kernel_f32: public jit_generator { static status_t init_conf(jit_conv_conf_t &jcp, const convolution_desc_t &cd, const memory_desc_wrapper &src_d, const memory_desc_wrapper &weights_d, - const memory_desc_wrapper &dst_d, const primitive_attr_t &attr, - bool with_relu = false, float relu_negative_slope = 0.); + const memory_desc_wrapper &dst_d, const primitive_attr_t &attr); + static void init_scratchpad(memory_tracking::registrar_t &scratchpad, + const jit_conv_conf_t &jcp, const jit_conv_conf_t &jcp_dw = jit_conv_conf_t()); DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse42_conv_fwd_kernel_f32) jit_conv_conf_t jcp; + jit_conv_conf_t jcp_dw; const primitive_attr_t &attr_; void (*jit_ker)(jit_conv_call_s *); diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_convolution.cpp index a37c317..e025af7 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_convolution.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_convolution.cpp @@ -27,44 +27,46 @@ namespace cpu { using namespace mkldnn::impl::status; using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; using namespace mkldnn::impl::utils; #define src_blk_off(f, n, c, h, w) \ - (conf_.ndims() == 3) \ + (pd()->ndims() == 3) \ ? (f).blk_off(n, c, w) \ : (f).blk_off(n, c, h, w) #define wht_blk_off_(f, g, ...) \ - conf_.with_groups() \ + pd()->with_groups() \ ? 
(f).blk_off(g, __VA_ARGS__) \ : (f).blk_off(__VA_ARGS__) #define wht_blk_off(f, g, oc, ic, kh, kw) \ - conf_.ndims() == 3 \ + pd()->ndims() == 3 \ ? wht_blk_off_(f, g, oc, ic, kw) \ : wht_blk_off_(f, g, oc, ic, kh, kw) -template -void _jit_sse42_convolution_fwd_t::execute_forward() { +void jit_sse42_convolution_fwd_t::execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); - const memory_desc_wrapper bias_d(conf_.weights_pd(1)); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + const memory_desc_wrapper bias_d(pd()->weights_pd(1)); const auto &jcp = kernel_->jcp; - int MB = conf_.MB(); + int MB = pd()->MB(); int ocb_work = div_up(jcp.nb_oc, jcp.nb_oc_blocking); const size_t work_amount = MB * jcp.ngroups * ocb_work * jcp.oh; - if (conf_.want_padded_bias()) { - for (int oc = 0; oc < jcp.oc_without_padding; ++oc) - padded_bias_[oc] = bias[oc]; - bias = padded_bias_; + if (pd()->wants_padded_bias()) { + auto padded_bias = scratchpad().get(key_conv_padded_bias); + utils::array_copy(padded_bias, bias, jcp.oc_without_padding); + utils::array_set(padded_bias + jcp.oc_without_padding, 0.f, + jcp.oc - jcp.oc_without_padding); + bias = padded_bias; } parallel(0, [&](const int ithr, const int nthr) { @@ -86,7 +88,7 @@ void _jit_sse42_convolution_fwd_t::execute_forward() { int ocb_num = jcp.nb_oc_blocking; for (int icb = icbb; icb < icbb + icb_step; ++icb) { - jit_conv_call_s par_conv = {}; + auto par_conv = jit_conv_call_s(); const int ij = oh * jcp.stride_h; const int i_t_overflow = nstl::max(0, jcp.t_pad - ij); @@ -138,24 +140,26 @@ void _jit_sse42_convolution_fwd_t::execute_forward() { icbb += icb_step; } }); + + if (pd()->wants_zero_pad_dst()) + output_memory_primitive(0)->zero_pad(); } -template -void _jit_sse42_convolution_fwd_t::execute_forward_fusing() { +void jit_sse42_convolution_fwd_t::execute_forward_with_dw_conv() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); - const memory_desc_wrapper bias_d(conf_.weights_pd(1)); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + const memory_desc_wrapper bias_d(pd()->weights_pd(1)); const auto &jcp = kernel_->jcp; const auto &jcp_dw = kernel_dw_->jcp; - int MB = conf_.MB(); + int MB = pd()->MB(); - auto dw_bias = jcp.dw_conv_biases; + auto dw_bias = jcp_dw.conv_biases; int ocb_work = div_up(jcp.nb_oc, jcp.nb_oc_blocking); const size_t work_amount = MB * jcp.ngroups * ocb_work * jcp.oh; @@ -165,8 +169,8 @@ void _jit_sse42_convolution_fwd_t::execute_forward_fusing() { for (int h = 0; h < num_rows; h++) { if ((oh + h) < 0 || (oh + h) >= jcp.oh) { for (int chb = ocb; chb < ocb + ocb_num; chb++) { - memset(ws_p + (((oh + h) + 1) % jcp.dw_conv_ker_h) * jcp.ow * jcp.oc_block + - (chb - ocb) * jcp.dw_conv_ker_h * jcp.ow * jcp.oc_block, 0, jcp.ow * jcp.oc_block * sizeof(float)); + 
memset(ws_p + (((oh + h) + 1) % jcp_dw.kh) * jcp.ow * jcp.oc_block + + (chb - ocb) * jcp_dw.kh * jcp.ow * jcp.oc_block, 0, jcp.ow * jcp.oc_block * sizeof(float)); } } else { for (int icb = 0; icb < jcp.nb_ic; ++icb) { @@ -187,11 +191,11 @@ void _jit_sse42_convolution_fwd_t::execute_forward_fusing() { par_conv.src = &src[src_d.blk_off(n, jcp.ic == 3 ? 0 : _ic, ih, 0)]; - par_conv.dst = &ws_p[(((oh + h) + 1) % jcp.dw_conv_ker_h) * jcp.ow * + par_conv.dst = &ws_p[(((oh + h) + 1) % jcp_dw.kh) * jcp.ow * jcp.oc_block]; const int wh = div_up(i_t_overflow, (jcp.dilate_h + 1)); - par_conv.filt = &weights[conf_.with_groups() + par_conv.filt = &weights[pd()->with_groups() ? weights_d.blk_off(g, ocb, jcp.ic == 3 ? 0 : icb, wh, 0) : weights_d.blk_off(ocb, @@ -241,9 +245,11 @@ void _jit_sse42_convolution_fwd_t::execute_forward_fusing() { dst_idx/jcp_dw.stride_h*jcp_dw.ow*jcp_dw.ch_block]; par_conv_dw.kh_padding = jcp_dw.kh; - par_conv_dw.filt = &jcp.dw_conv_weights[chb * jcp_dw.kh * jcp_dw.kw * jcp_dw.ch_block]; + par_conv_dw.filt = &jcp_dw.conv_weights[chb * jcp_dw.kh * jcp_dw.kw * jcp_dw.ch_block]; par_conv_dw.bias = &dw_bias[chb * jcp_dw.ch_block]; par_conv_dw.ur_w = (size_t)(jcp_dw.ow); + par_conv_dw.oc_work = nstl::min((chb + 1) * jcp_dw.ch_block, (int)jcp_dw.oc) - chb*jcp_dw.ch_block; + par_conv_dw.oc_off = chb * jcp_dw.ch_block * sizeof(float); kernel_dw_->jit_ker(&par_conv_dw); } @@ -252,7 +258,9 @@ void _jit_sse42_convolution_fwd_t::execute_forward_fusing() { size_t start{0}, end{0}; balance211(work_amount, nthr, ithr, start, end); - auto pbuf = dw_conv_buffer_ + ithr * dw_conv_buffer_size_; + auto dw_conv_buffer = scratchpad().get(key_dw_conv_buffer); + size_t dw_conv_buffer_size_ = (size_t)jcp_dw.kh * jcp_dw.iw * jcp_dw.ch_block * jcp.nb_oc_blocking; + auto pbuf = dw_conv_buffer + ithr * dw_conv_buffer_size_; size_t n{0}, g{0}, ocbb{0}, oh{0}; nd_iterator_init(start, n, MB, g, jcp.ngroups, ocbb, ocb_work, @@ -281,23 +289,25 @@ void _jit_sse42_convolution_fwd_t::execute_forward_fusing() { } }; - if (conf_.want_padded_bias()) { - for (int oc = 0; oc < jcp.oc_without_padding; ++oc) - padded_bias_[oc] = bias[oc]; - bias = padded_bias_; - - for (int oc = 0; oc < jcp.oc_without_padding; ++oc) - dw_padded_bias_[oc] = dw_bias[oc]; - dw_bias = dw_padded_bias_; + if (pd()->wants_padded_bias()) { + auto padded_bias = scratchpad().get(key_conv_padded_bias); + utils::array_copy(padded_bias, bias, jcp.oc_without_padding); + utils::array_set(padded_bias + jcp.oc_without_padding, 0.f, + jcp.oc - jcp.oc_without_padding); + bias = padded_bias; + + auto dw_padded_bias = scratchpad().get(key_dw_conv_padded_bias); + utils::array_copy(dw_padded_bias, dw_bias, jcp.oc_without_padding); + utils::array_set(dw_padded_bias + jcp.oc_without_padding, 0.f, + jcp.oc - jcp.oc_without_padding); + dw_bias = dw_padded_bias; } parallel(0, ker); -} -template void _jit_sse42_convolution_fwd_t::execute_forward(); -template void _jit_sse42_convolution_fwd_t::execute_forward(); -template void _jit_sse42_convolution_fwd_t::execute_forward_fusing(); -template void _jit_sse42_convolution_fwd_t::execute_forward_fusing(); + if (pd()->wants_zero_pad_dst()) + output_memory_primitive(0)->zero_pad(); +} } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_convolution.hpp index 1923495..5eb720c 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_convolution.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_convolution.hpp @@ 
-28,62 +28,56 @@ namespace mkldnn { namespace impl { namespace cpu { -template -struct _jit_sse42_convolution_fwd_t: public cpu_primitive_t { - struct pd_t: public _cpu_convolution_fwd_pd_t { +struct jit_sse42_convolution_fwd_t: public cpu_primitive_t { + struct pd_t: public cpu_convolution_fwd_pd_t { pd_t(engine_t *engine, - const typename pd_t::base_desc_t *adesc, + const convolution_desc_t *adesc, const primitive_attr_t *attr, const typename pd_t::base_class *hint_fwd_pd) - : _cpu_convolution_fwd_pd_t(engine, adesc, attr, - hint_fwd_pd) - , jcp_(), jcp_dw() {} + : cpu_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) + , jcp_(), jcp_dw_() {} DECLARE_COMMON_PD_T( JIT_IMPL_NAME_HELPER("jit:", sse42, ""), - _jit_sse42_convolution_fwd_t); + jit_sse42_convolution_fwd_t); virtual status_t init() override { using namespace prop_kind; assert(this->engine()->kind() == engine_kind::cpu); bool ok = true && this->set_default_params() == status::success - && utils::one_of(this->cdesc_().prop_kind, forward_training, + && utils::one_of(this->desc()->prop_kind, forward_training, forward_inference) - && this->cdesc_().alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() && utils::everyone_is(data_type::f32, - this->cdesc_().src_desc.data_type, - this->cdesc_().weights_desc.data_type, - this->cdesc_().dst_desc.data_type) + this->desc()->src_desc.data_type, + this->desc()->weights_desc.data_type, + this->desc()->dst_desc.data_type) && IMPLICATION(this->with_bias(), - data_type::f32 == this->cdesc_().bias_desc.data_type); + data_type::f32 == this->desc()->bias_desc.data_type); if (!ok) return status::unimplemented; - status_t sts = jit_sse42_conv_fwd_kernel_f32::init_conf(jcp_, this->cdesc_(), + status_t status = jit_sse42_conv_fwd_kernel_f32::init_conf(jcp_, *this->desc(), *this->src_pd_.desc(), *this->weights_pd_.desc(), - *this->dst_pd_.desc(), *this->attr(), with_relu, - this->negative_slope()); - if (sts != status::success) return sts; + *this->dst_pd_.desc(), *this->attr()); + if (status != status::success) return status; if (jcp_.with_dw_conv) { - int dw_conv_oh = (jcp_.oh - ((jcp_.dw_conv_ker_h - 1) + 1) + 2) / jcp_.dw_conv_str_h + 1; - int dw_conv_ow = (jcp_.ow - ((jcp_.dw_conv_ker_w - 1) + 1) + 2) / jcp_.dw_conv_str_w + 1; - - status_t sts_dw = jit_uni_dw_conv_row_f32::init_conf(jcp_dw, - jcp_.oc, jcp_.oh, jcp_.ow, dw_conv_oh, dw_conv_ow, - jcp_.dw_conv_ker_h, jcp_.dw_conv_ker_w, - jcp_.dw_conv_str_h, jcp_.dw_conv_str_w, - jcp_.dw_conv_eltwise_alg, jcp_.dw_conv_eltwise_alpha, - jcp_.dw_conv_eltwise_beta, jcp_.dw_conv_with_sum); + status_t sts_dw = jit_uni_dw_conv_row_f32::init_conf(jcp_, jcp_dw_, *this->attr()); if (sts_dw != status::success) return sts_dw; } + auto scratchpad = scratchpad_registry().registrar(); + jit_sse42_conv_fwd_kernel_f32::init_scratchpad(scratchpad, jcp_, jcp_dw_); + return status::success; } jit_conv_conf_t jcp_; - jit_conv_conf_t jcp_dw; + jit_conv_conf_t jcp_dw_; protected: virtual status_t set_default_params() override { @@ -105,57 +99,36 @@ struct _jit_sse42_convolution_fwd_t: public cpu_primitive_t { OIhw8i8o, Ohwi8o))); if (this->bias_pd_.desc()->format == any) CHECK(this->bias_pd_.set_format(x)); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } }; - _jit_sse42_convolution_fwd_t(const pd_t *pd, const input_vector &inputs, + 
jit_sse42_convolution_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), - dw_conv_buffer_size_(0), dw_conv_buffer_(nullptr), padded_bias_(nullptr), dw_padded_bias_(nullptr) + : cpu_primitive_t(apd, inputs, outputs) { - kernel_ = new jit_sse42_conv_fwd_kernel_f32(conf_.jcp_, *conf_.attr()); - if (conf_.jcp_.with_dw_conv) { - kernel_dw_ = new jit_uni_dw_conv_row_f32(conf_.jcp_dw); - } - - if (conf_.jcp_.with_dw_conv) { - const int nthreads = mkldnn_get_max_threads(); - dw_conv_buffer_size_ = (size_t)conf_.jcp_dw.kh * conf_.jcp_dw.iw * conf_.jcp_dw.ch_block * - conf_.jcp_.nb_oc_blocking; - dw_conv_buffer_ = (float *)malloc(nthreads * dw_conv_buffer_size_ * sizeof(float), 64); - } - - if (conf_.want_padded_bias()) { - const auto &j = conf_.jcp_; - assert(j.ngroups == 1); - padded_bias_ = (data_t *)malloc(sizeof(data_t) * j.oc, 64); - for (int oc = j.oc_without_padding; oc < j.oc; ++oc) - padded_bias_[oc] = 0; + kernel_ = new jit_sse42_conv_fwd_kernel_f32(pd()->jcp_, pd()->jcp_dw_, *pd()->attr()); - dw_padded_bias_ = (data_t *)malloc(sizeof(data_t) * j.oc, 64); - for (int oc = j.oc_without_padding; oc < j.oc; ++oc) - dw_padded_bias_[oc] = 0; + if (pd()->jcp_.with_dw_conv) { + kernel_dw_ = new jit_uni_dw_conv_row_f32(pd()->jcp_dw_, *pd()->attr(), pd()->jcp_dw_.ch_block); } } - ~_jit_sse42_convolution_fwd_t() { + ~jit_sse42_convolution_fwd_t() { delete kernel_; - if (conf_.jcp_.with_dw_conv) { + if (pd()->jcp_.with_dw_conv) { delete kernel_dw_; - free(dw_conv_buffer_); - free(dw_padded_bias_); } - - free(padded_bias_); }; typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { - if (conf_.jcp_.with_dw_conv) - execute_forward_fusing(); + virtual void execute(event_t *e) const { + if (pd()->jcp_.with_dw_conv) + execute_forward_with_dw_conv(); else execute_forward(); @@ -163,24 +136,14 @@ struct _jit_sse42_convolution_fwd_t: public cpu_primitive_t { } private: - void execute_forward(); - void execute_forward_fusing(); + void execute_forward() const; + void execute_forward_with_dw_conv() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } - pd_t conf_; jit_sse42_conv_fwd_kernel_f32 *kernel_; jit_uni_dw_conv_row_f32 *kernel_dw_; - - /* fuse with dw conv */ - size_t dw_conv_buffer_size_; - data_t *dw_conv_buffer_; - - data_t *padded_bias_; - data_t *dw_padded_bias_; }; -using jit_sse42_convolution_fwd_t = _jit_sse42_convolution_fwd_t; -using jit_sse42_convolution_relu_t = _jit_sse42_convolution_fwd_t; - } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_i8i8_pooling.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_i8i8_pooling.cpp new file mode 100644 index 0000000..cefecbd --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_i8i8_pooling.cpp @@ -0,0 +1,586 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include <math.h> + +#include "mkldnn_types.h" + +#include "mkldnn_thread.hpp" +#include "utils.hpp" + +#include "jit_generator.hpp" + +#include "jit_sse42_i8i8_pooling.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +using namespace Xbyak; + +using namespace mkldnn::impl::utils; +using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::utils; +using namespace mkldnn::impl::types; +using namespace alg_kind; + +struct call_params_t { + const char *src_i8; + const char *dst_i8; + size_t kw_range; + size_t kh_range; + float idivider; +}; + +struct jit_sse42_i8i8_pool_fwd_ker_t : public jit_generator { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse42_i8i8_pool_fwd_ker_t) + + Reg64 reg_ptr_src_i8 = r8; + Reg64 reg_ptr_dst_i8 = r9; + + Reg64 ki = r10; + Reg64 kj = r11; + Reg64 reg_kw = r12; + Reg64 reg_kh = r13; + Reg64 c_iter = r14; + + Reg64 aux_reg_src_h = rax; + Reg64 aux_reg_src_w = rbx; + + Reg64 reg_tmp = rdx; + Reg64 reg_src_64 = r15; + Reg32 reg_src_32 = r15d; + Reg8 reg_src_8 = r15b; + + size_t sizeof_src_dt() const { return data_type_size(jpp.src_dt); } + size_t sizeof_dst_dt() const { return data_type_size(jpp.dst_dt); } + + Xmm xmm_tmp = Xmm(0); + Xmm vreg_tmp = Xmm(14); + Xmm vreg_zeros = Xmm(15); + + /* max pooling */ + Xmm vmm_src(int jj, int ii) { + return Xmm(2*jj + ii); + } + + Xmm xmm_src(int jj) { + return Xmm(2*jj); + } + + Xmm vmm_dst(int jj, int ii) { + return Xmm(2*jj + ii + 2 * jpp.ur_c); + } + + Xmm xmm_dst(int jj) { + return Xmm(2*jj + 2 * jpp.ur_c); + } + + /* avg pooling */ + Xmm vmm_src_s32(int jj, int ii) { + return Xmm(2*jj + ii); + } + + Xmm xmm_src_s32(int jj, int ii) { + return Xmm(2*jj + ii); + } + + Xmm vmm_dst_s32(int jj, int ii) { + return Xmm(2*jj + ii + 2 * jpp.ur_c); + } + + Ymm ymm_dst_s32(int jj, int ii) { + return Ymm(2*jj + ii + 2 * jpp.ur_c); + } + + Xmm xmm_dst_s32(int jj, int ii) { + return Xmm(2*jj + ii + 2 * jpp.ur_c); + } + + Xmm vmm_dst_f32(int jj, int ii) { + return Xmm(2*jj + ii + 4 * jpp.ur_c); + } + + void (*ker_)(const call_params_t *); + jit_pool_conf_t jpp; + + void init_tmp_reg(); + + void load_src(int jj, int c_step); + void store_dst(int jj, int c_step); + + void compute_avg_step(int ur_c, int c_step); + void compute_max_step(int ur_c, int c_step); + void compute_step(int ur_c, int c_step); + + void compute_c_block(); + void generate(); + + static status_t init_conf(jit_pool_conf_t &jpp, + const pooling_desc_t &pd, const memory_desc_wrapper &src_d, + const memory_desc_wrapper &dst_d); + + jit_sse42_i8i8_pool_fwd_ker_t(const jit_pool_conf_t &jpp_) + : jpp(jpp_) { + generate(); + ker_ = reinterpret_cast<decltype(ker_)>(const_cast<uint8_t *>( + getCode())); + } +}; + +void jit_sse42_i8i8_pool_fwd_ker_t::load_src(int jj, int c_step) { + using namespace data_type; + + int repeats = c_step != 1 ?
2 : 1; + switch (jpp.alg) { + case pooling_max: { + auto offset = jj*c_step*sizeof_src_dt(); + if (c_step == jpp.c_block) { + for (int ii = 0; ii < repeats; ii++) + uni_vmovups(vmm_src(jj, ii), ptr[aux_reg_src_w + offset + (jpp.c_block / 2) * ii * sizeof_src_dt()]); + } else if (c_step == 1) { + if (jpp.src_dt == s32) { + movsd(xmm_src(jj), ptr[aux_reg_src_w + offset]); + } else { + mov(reg_src_8, ptr[aux_reg_src_w + offset]); + movq(xmm_src(jj), reg_src_64); + } + } + break; + } + case pooling_avg_include_padding: + case pooling_avg_exclude_padding: { + auto offset = jj*c_step*sizeof_src_dt(); + switch (jpp.src_dt) { + case s32: + if (c_step == jpp.c_block) { + for (int ii = 0; ii < repeats; ii++) + uni_vmovups(vmm_src_s32(jj, ii), ptr[aux_reg_src_w + offset + (jpp.c_block / 2) * ii * sizeof_src_dt()]); + } else if (c_step == 1) { + movsd(xmm_src_s32(jj, 0), ptr[aux_reg_src_w + offset]); + } + break; + case s8: + if (c_step == jpp.c_block) { + for (int ii = 0; ii < repeats; ii++) { + movd(xmm_src_s32(jj, ii), ptr[aux_reg_src_w + offset + (jpp.c_block / 2) * ii * sizeof_src_dt()]); + + uni_vpmovsxbd(vmm_src_s32(jj, ii), xmm_src_s32(jj, ii)); + } + } else if (c_step == 1) { + movsx(reg_src_32, ptr[aux_reg_src_w + offset]); + movq(xmm_src_s32(jj, 0), reg_src_64); + } + break; + case u8: + if (c_step == jpp.c_block) { + for (int ii = 0; ii < repeats; ii++) { + movd(xmm_src_s32(jj, ii), ptr[aux_reg_src_w + offset + (jpp.c_block / 2) * ii * sizeof_src_dt()]); + + uni_vpmovzxbd(vmm_src_s32(jj, ii), xmm_src_s32(jj, ii)); + } + } else if (c_step == 1) { + movzx(reg_src_32, ptr[aux_reg_src_w + offset]); + movq(xmm_src_s32(jj, 0), reg_src_64); + } + break; + default: assert(!"unsupported src data type"); + } + break; + } + default: assert(!"unsupported algorithm"); + } +} + +void jit_sse42_i8i8_pool_fwd_ker_t::store_dst(int jj, int c_step) { + using namespace data_type; + + int repeats = c_step != 1 ? 
2 : 1; + switch(jpp.alg) { + case pooling_max: { + auto offset = jj*c_step*sizeof_dst_dt(); + if (c_step == jpp.c_block) { + for (int ii = 0; ii < repeats; ii++) + uni_vmovups(ptr[reg_ptr_dst_i8 + offset + (jpp.c_block / 2) * ii * sizeof_dst_dt()], vmm_dst(jj, ii)); + } else if (c_step == 1) { + if (jpp.src_dt == s32) { + movq(reg_src_64, xmm_dst(jj)); + mov(ptr[reg_ptr_dst_i8 + offset], reg_src_32); + } else { + movq(reg_src_64, xmm_dst(jj)); + mov(ptr[reg_ptr_dst_i8 + offset], reg_src_8); + } + } + break; + } + case pooling_avg_include_padding: + case pooling_avg_exclude_padding: { + auto offset = jj*c_step*sizeof_dst_dt(); + switch (jpp.dst_dt) { + case s32: + if (c_step == jpp.c_block) { + for (int ii = 0; ii < repeats; ii++) + uni_vmovups(ptr[reg_ptr_dst_i8 + offset + (jpp.c_block / 2) * ii * sizeof_dst_dt()], vmm_dst_s32(jj, ii)); + } else if (c_step == 1) { + movq(reg_src_64, xmm_dst_s32(jj, 0)); + mov(ptr[reg_ptr_dst_i8 + offset], reg_src_32); + } + break; + case s8: + if (c_step == jpp.c_block) { + for (int ii = 0; ii < repeats; ii++) { + uni_vpackssdw(vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii)); + uni_vpacksswb(xmm_dst_s32(jj, ii), xmm_dst_s32(jj, ii), xmm_dst_s32(jj, ii)); + + movd(ptr[reg_ptr_dst_i8 + offset + (jpp.c_block / 2) * ii * sizeof_dst_dt()], xmm_dst_s32(jj, ii)); + } + } else if (c_step == 1) { + vpackssdw(vmm_dst_s32(jj, 0), vmm_dst_s32(jj, 0), vmm_dst_s32(jj, 0)); + vpacksswb(xmm_dst_s32(jj, 0), xmm_dst_s32(jj, 0), xmm_dst_s32(jj, 0)); + movq(reg_src_64, xmm_dst_s32(jj, 0)); + mov(ptr[reg_ptr_dst_i8 + offset], reg_src_8); + } + break; + case u8: + if (c_step == jpp.c_block) { + for (int ii = 0; ii < repeats; ii++) { + uni_vpackusdw(vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii)); + uni_vpackuswb(xmm_dst_s32(jj, ii), xmm_dst_s32(jj, ii), xmm_dst_s32(jj, ii)); + + movd(ptr[reg_ptr_dst_i8 + offset + (jpp.c_block / 2) * ii * sizeof_dst_dt()], xmm_dst_s32(jj, ii)); + } + } else if (c_step == 1) { + vpackusdw(vmm_dst_s32(jj, 0), vmm_dst_s32(jj, 0), vmm_dst_s32(jj, 0)); + vpackuswb(xmm_dst_s32(jj, 0), xmm_dst_s32(jj, 0), xmm_dst_s32(jj, 0)); + movq(reg_src_64, xmm_dst_s32(jj, 0)); + mov(ptr[reg_ptr_dst_i8 + offset], reg_src_8); + } + break; + default: assert(!"unsupported dst data_type"); + } + break; + } + default: assert(!"unsupported pooling algorithm"); + } +} + +void jit_sse42_i8i8_pool_fwd_ker_t::compute_max_step(int ur_c, int c_step) +{ + Label l_kw, l_kh; + + int iw = jpp.iw; + int c = jpp.c; + + int repeats = c_step != 1 ?
2 : 1; + + for (int jj = 0; jj < ur_c; jj++) { + for (int ii = 0; ii < repeats; ii++) { + uni_vmovups(vmm_dst(jj, ii), vreg_tmp); + } + } + + mov(aux_reg_src_h, reg_ptr_src_i8); + + xor_(kj, kj); + L(l_kh); + { + mov(aux_reg_src_w, aux_reg_src_h); + xor_(ki, ki); + L(l_kw); + { + for (int jj = 0; jj < ur_c; jj++) { + load_src(jj, c_step); + + for (int ii = 0; ii < repeats; ii++) { + if (jpp.src_dt == data_type::s32) { + uni_vpmaxsd(vmm_dst(jj, ii), vmm_dst(jj, ii), vmm_src(jj, ii)); + } else { + if (jpp.src_dt == data_type::s8) + uni_vpmaxsb(vmm_dst(jj, ii), vmm_dst(jj, ii), vmm_src(jj, ii)); + else + uni_vpmaxub(vmm_dst(jj, ii), vmm_dst(jj, ii), vmm_src(jj, ii)); + } + } + } + add(aux_reg_src_w, c * sizeof_src_dt()); + inc(ki); + cmp(ki, reg_kw); + jl(l_kw, T_NEAR); + } + add(aux_reg_src_h, iw * c * sizeof_src_dt()); + inc(kj); + cmp(kj, reg_kh); + jl(l_kh, T_NEAR); + } + + for (int jj = 0; jj < ur_c; jj++) + store_dst(jj, c_step); +} + +void jit_sse42_i8i8_pool_fwd_ker_t::compute_avg_step(int ur_c, int c_step) +{ + using namespace data_type; + + Label l_kw, l_kh; + + int iw = jpp.iw; + int c = jpp.c; + + int repeats = c_step != 1 ? 2 : 1; + + for (int jj = 0; jj < ur_c; jj++) { + for (int ii = 0; ii < repeats; ii++) { + uni_vpxor(vmm_src_s32(jj, ii), vmm_src_s32(jj, ii), vmm_src_s32(jj, ii)); + uni_vpxor(vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii)); + } + } + + mov(aux_reg_src_h, reg_ptr_src_i8); + + xor_(kj, kj); + L(l_kh); + { + mov(aux_reg_src_w, aux_reg_src_h); + xor_(ki, ki); + L(l_kw); + { + for (int jj = 0; jj < ur_c; jj++) { + load_src(jj, c_step); + + for (int ii = 0; ii < repeats; ii++) { + uni_vpaddd(vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii), vmm_src_s32(jj, ii)); + } + } + add(aux_reg_src_w, c * sizeof_src_dt()); + inc(ki); + cmp(ki, reg_kw); + jl(l_kw, T_NEAR); + } + add(aux_reg_src_h, iw * c * sizeof_src_dt()); + inc(kj); + cmp(kj, reg_kh); + jl(l_kh, T_NEAR); + } + + for (int jj = 0; jj < ur_c; jj++) { + for (int ii = 0; ii < repeats; ii++) { + uni_vcvtdq2ps(vmm_dst_f32(jj, ii), vmm_dst_s32(jj, ii)); + + mulps(vmm_dst_f32(jj, ii), vreg_tmp); + + uni_vcvtps2dq(vmm_dst_s32(jj, ii), vmm_dst_f32(jj, ii)); + } + + store_dst(jj, c_step); + } +} + +void jit_sse42_i8i8_pool_fwd_ker_t::compute_step(int ur_c, int c_step) { + switch (jpp.alg) { + case pooling_max: + compute_max_step(ur_c, c_step); break; + case pooling_avg_include_padding: + case pooling_avg_exclude_padding: + compute_avg_step(ur_c, c_step); break; + default: assert(!"unsupported pooling algorithm"); + } +} + +void jit_sse42_i8i8_pool_fwd_ker_t::compute_c_block() { + Label l_main_loop; + Label l_tail_loop; + Label exit; + + int ur_c = jpp.ur_c; + + xor_(c_iter, c_iter); + + L(l_main_loop); + { + cmp(c_iter, jpp.c - ur_c * jpp.c_block); + jg(l_tail_loop, T_NEAR); + + compute_step(ur_c, jpp.c_block); + + add(reg_ptr_src_i8, ur_c * jpp.c_block * sizeof_src_dt()); + add(reg_ptr_dst_i8, ur_c * jpp.c_block * sizeof_dst_dt()); + add(c_iter, ur_c * jpp.c_block); + jmp(l_main_loop); + } + + L(l_tail_loop); + { + cmp(c_iter, jpp.c - ur_c); + jg(exit, T_NEAR); + + compute_step(ur_c, 1); + + add(reg_ptr_src_i8, ur_c * sizeof_src_dt()); + add(reg_ptr_dst_i8, ur_c * sizeof_dst_dt()); + add(c_iter, ur_c); + jmp(l_tail_loop); + } + + L(exit); +} + +void jit_sse42_i8i8_pool_fwd_ker_t::init_tmp_reg() { + using namespace data_type; + + switch (jpp.alg) { + case pooling_avg_include_padding: + case pooling_avg_exclude_padding: + mov(reg_tmp, ptr[abi_param1 + offsetof(call_params_t, idivider)]); + movq(xmm_tmp, reg_tmp); 
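+ // idivider is the precomputed f32 averaging scale (1 / pooling window size); its bits were just moved into xmm_tmp, and the broadcast below replicates it across vreg_tmp for the per-lane mulps in compute_avg_step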
+ uni_vpbroadcastd(vreg_tmp, xmm_tmp); + break; + case pooling_max: + switch (jpp.src_dt) { + case s32: + mov(reg_tmp, nstl::numeric_limits<int32_t>::lowest()); + break; + case s8: + mov(reg_tmp, nstl::numeric_limits<int8_t>::lowest()); + break; + case u8: + mov(reg_tmp, nstl::numeric_limits<uint8_t>::lowest()); + break; + default: assert(!"unsupported src data_type"); + } + + movq(xmm_tmp, reg_tmp); + if (jpp.src_dt == s32) { + uni_vpbroadcastd(vreg_tmp, xmm_tmp); + } else { + movups(vreg_tmp, xmm_tmp); + uni_vpxor(xmm_tmp, xmm_tmp, xmm_tmp); + pshufb(vreg_tmp, xmm_tmp); + } + break; + default: assert(!"unsupported pooling algorithm"); + } + +} + +void jit_sse42_i8i8_pool_fwd_ker_t::generate() { + preamble(); + +# define READ_PARAM(reg, field) \ + mov(reg, ptr[abi_param1 + offsetof(call_params_t, field)]) + READ_PARAM(reg_ptr_src_i8, src_i8); + READ_PARAM(reg_ptr_dst_i8, dst_i8); + READ_PARAM(reg_kw, kw_range); + READ_PARAM(reg_kh, kh_range); + +# undef READ_PARAM + + init_tmp_reg(); + + uni_vpxor(vreg_zeros, vreg_zeros, vreg_zeros); + + compute_c_block(); + + postamble(); +} + +status_t jit_sse42_i8i8_pool_fwd_ker_t::init_conf(jit_pool_conf_t &jpp, + const pooling_desc_t &pd, const memory_desc_wrapper &src_d, + const memory_desc_wrapper &dst_d) { + if (!mayiuse(sse42)) { + return status::unimplemented; + } + + jpp.mb = src_d.dims()[0]; + jpp.c = src_d.dims()[1]; + jpp.ih = src_d.dims()[2]; + jpp.iw = src_d.dims()[3]; + jpp.oh = dst_d.dims()[2]; + jpp.ow = dst_d.dims()[3]; + + jpp.stride_h = pd.strides[0]; + jpp.stride_w = pd.strides[1]; + jpp.kh = pd.kernel[0]; + jpp.kw = pd.kernel[1]; + + jpp.t_pad = pd.padding[0][0]; + jpp.l_pad = pd.padding[0][1]; + + jpp.alg = pd.alg_kind; + + jpp.src_dt = pd.src_desc.data_type; + jpp.dst_dt = pd.dst_desc.data_type; + + jpp.c_block = jpp.alg == pooling_max ? 32 / (jpp.src_dt == data_type::s32 ?
4 : 1) : 8; + jpp.c_tail = jpp.c % jpp.c_block; + jpp.nb_c = jpp.c / jpp.c_block; + jpp.ur_c = 1; + jpp.ur_c_tail = jpp.nb_c - (jpp.nb_c / jpp.ur_c)*jpp.ur_c + (jpp.c_tail != 0); + + return status::success; +} + +status_t jit_sse42_i8i8_pooling_fwd_t::pd_t::jit_conf() { + return jit_sse42_i8i8_pool_fwd_ker_t::init_conf(jpp_, + desc_, src_pd_.desc(), dst_pd_.desc()); +} + +jit_sse42_i8i8_pooling_fwd_t::jit_sse42_i8i8_pooling_fwd_t(const pd_t *apd, + const input_vector &inputs, const output_vector &outputs) + : cpu_primitive_t(apd, inputs, outputs), ker_(nullptr) +{ ker_ = new jit_sse42_i8i8_pool_fwd_ker_t(pd()->jpp_); } + +jit_sse42_i8i8_pooling_fwd_t::~jit_sse42_i8i8_pooling_fwd_t() { + delete ker_; +} + +void jit_sse42_i8i8_pooling_fwd_t::execute_forward() const { + auto src_i8 = reinterpret_cast(input_memory(0)); + auto dst_i8 = reinterpret_cast(memory()); + + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + + const auto &jpp = pd()->jpp_; + + parallel_nd(jpp.mb, jpp.oh, jpp.ow, + [&](int n, int oh, int ow) { + const int ih = nstl::max(oh * jpp.stride_h - jpp.t_pad, 0); + const int iw = nstl::max(ow * jpp.stride_w - jpp.l_pad, 0); + + const int kh_start = nstl::max(0, jpp.t_pad - oh * jpp.stride_h); + const int kh_end = nstl::min(jpp.kh, + jpp.ih + jpp.t_pad - oh * jpp.stride_h); + const int kw_start = nstl::max(0, jpp.l_pad - ow * jpp.stride_w); + const int kw_end = nstl::min(jpp.kw, + jpp.iw + jpp.l_pad - ow * jpp.stride_w); + + auto p = call_params_t(); + p.src_i8 = &src_i8[ + src_d.blk_off(n, 0, ih, iw) * src_d.data_type_size()]; + p.dst_i8 = &dst_i8[ + dst_d.blk_off(n, 0, oh, ow) * dst_d.data_type_size()]; + p.kw_range = (size_t) (kw_end - kw_start); + p.kh_range = (size_t) (kh_end - kh_start); + p.idivider = 1.0f / ((jpp.alg == pooling_avg_exclude_padding) ? + p.kh_range * p.kw_range : jpp.kw * jpp.kh); + + ker_->ker_(&p); + }); +} + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_i8i8_pooling.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_i8i8_pooling.hpp similarity index 81% rename from inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_i8i8_pooling.hpp rename to inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_i8i8_pooling.hpp index a63984e..bd4192b 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_i8i8_pooling.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_i8i8_pooling.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2017-2018 Intel Corporation +* Copyright 2018 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,22 +14,22 @@ * limitations under the License. 
*******************************************************************************/ -#ifndef CPU_JIT_AVX512_CORE_I8I8_POOLING_HPP -#define CPU_JIT_AVX512_CORE_I8I8_POOLING_HPP +#ifndef CPU_JIT_uni_I8I8_POOLING_HPP +#define CPU_JIT_uni_I8I8_POOLING_HPP #include "c_types_map.hpp" #include "cpu_pooling_pd.hpp" #include "cpu_engine.hpp" - +#include "jit_generator.hpp" #include "jit_primitive_conf.hpp" namespace mkldnn { namespace impl { namespace cpu { -struct jit_avx512_core_i8i8_pool_fwd_ker_t; +struct jit_sse42_i8i8_pool_fwd_ker_t; -struct jit_avx512_core_i8i8_pooling_fwd_t : public cpu_primitive_t { +struct jit_sse42_i8i8_pooling_fwd_t : public cpu_primitive_t { struct pd_t : public cpu_pooling_fwd_pd_t { pd_t(engine_t *engine, const pooling_desc_t *adesc, const primitive_attr_t *attr, @@ -37,8 +37,8 @@ struct jit_avx512_core_i8i8_pooling_fwd_t : public cpu_primitive_t { : cpu_pooling_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) {} DECLARE_COMMON_PD_T( - JIT_IMPL_NAME_HELPER("jit:", avx512_core, ""), - jit_avx512_core_i8i8_pooling_fwd_t); + JIT_IMPL_NAME_HELPER("jit:", sse42, ""), + jit_sse42_i8i8_pooling_fwd_t); virtual status_t init() override { assert(this->engine()->kind() == engine_kind::cpu); @@ -73,20 +73,20 @@ struct jit_avx512_core_i8i8_pooling_fwd_t : public cpu_primitive_t { } }; - jit_avx512_core_i8i8_pooling_fwd_t(const pd_t *pd, + jit_sse42_i8i8_pooling_fwd_t(const pd_t *pd, const input_vector &inputs, const output_vector &outputs); - ~jit_avx512_core_i8i8_pooling_fwd_t(); + ~jit_sse42_i8i8_pooling_fwd_t(); - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - pd_t conf_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } - jit_avx512_core_i8i8_pool_fwd_ker_t *ker_; + jit_sse42_i8i8_pool_fwd_ker_t *ker_; }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_1x1_conv_utils.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_1x1_conv_utils.hpp index d360a14..a3ed769 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_1x1_conv_utils.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_1x1_conv_utils.hpp @@ -17,9 +17,11 @@ #ifndef JIT_UNI_1x1_CONV_UTILS_HPP #define JIT_UNI_1x1_CONV_UTILS_HPP +#include "memory_tracking.hpp" #include "mkldnn_thread.hpp" -#include "utils.hpp" #include "nstl.hpp" +#include "type_helpers.hpp" +#include "utils.hpp" #include "jit_generator.hpp" @@ -29,6 +31,12 @@ namespace cpu { using namespace mkldnn::impl::utils; +struct reduce_to_unit_stride_t { + convolution_desc_t conv_d_; + bool reduce_src_; + size_t space_per_thread_; +}; + /* 1x1-kernel does not support non-unit strides so far, so the idea is: * - for fwd or bwd_weights: to copy src to a scratch memory (with strides * equal to 1) and then call the kernel @@ -38,7 +46,7 @@ using namespace mkldnn::impl::utils; template inline void rtus_prepare(conv_pd_t *self, const convolution_desc_t *&conv_d, const memory_desc_t *&src_d, const memory_desc_t *dst_d) { - const bool is_bwd_data = self->cdesc()->prop_kind + const bool is_bwd_data = self->desc()->prop_kind == prop_kind::backward_data; const int ndims = src_d->ndims; @@ -83,6 +91,22 @@ inline void rtus_prepare(conv_pd_t *self, const convolution_desc_t *&conv_d, } } +template +inline void rtus_prepare_space_info(conv_pd_t *self, + memory_tracking::registrar_t &scratchpad) { + const auto &jcp = self->jcp_; + + const int max_threads = 
mkldnn_get_max_threads(); + const size_t factor = utils::pick_by_prop_kind(self->desc()->prop_kind, + jcp.nb_reduce, jcp.nb_load_blocking_max, jcp.nb_bcast_blocking); + size_t typesize = types::data_type_size( + conv_prop_agnostic_src_d(self->desc())->data_type); + + self->rtus_.space_per_thread_ = factor * jcp.is * jcp.ic_block; + scratchpad.book(memory_tracking::names::key_conv_rtus_space, + typesize * max_threads * self->rtus_.space_per_thread_); +} + template struct rtus_driver_t: public jit_generator { @@ -246,62 +270,44 @@ struct rtus_driver_t: public jit_generator { template inline void init_rtus_driver(conv_t *self) { - const auto &conf = self->conf_; - const auto &cd = *conf.cdesc(); - const bool is_bwd_data = cd.prop_kind == prop_kind::backward_data; - const int ndims = conf.ndims(); - + const auto &conf = *self->pd(); if (!conf.rtus_.reduce_src_) return; - const int max_threads = mkldnn_get_max_threads(); - size_t factor = 0; - switch (cd.prop_kind) { - case prop_kind::forward_training: case prop_kind::forward_inference: - factor = conf.jcp_.nb_reduce; break; - case prop_kind::backward_data: - factor = conf.jcp_.nb_load_blocking_max; break; - case prop_kind::backward_weights: - factor = conf.jcp_.nb_bcast_blocking; break; - default: assert(!"unsupported prop_kind"); - } - - size_t typesize = sizeof(decltype(*self->scratch_)); - - self->ws_per_thread_ = factor * conf.jcp_.is * conf.jcp_.ic_block; - self->scratch_ = (decltype(self->scratch_))malloc( - max_threads * self->ws_per_thread_ * typesize, 64); - + const auto &cd = *conf.desc(); + const int ndims = conf.ndims(); const int stride_h = (conf.ndims() == 3) ? 1 : cd.strides[0]; const int stride_w = cd.strides[ndims - 3]; + const bool is_bwd_data = cd.prop_kind == prop_kind::backward_data; const auto &src_d = is_bwd_data ? *conf.diff_src_pd()->desc() : *conf.src_pd()->desc(); assert((isa == avx2 && utils::one_of(src_d.format, memory_format::nCw8c, memory_format::nChw8c)) || (isa == avx512_common && utils::one_of( src_d.format, memory_format::nCw16c, memory_format::nChw16c))); - const int ih = (ndims == 3) ? 1 : src_d.dims[2]; + const int ih = ndims == 3 ? 
1 : src_d.dims[2]; const int iw = src_d.dims[ndims - 1]; const int src_step_h = stride_h * iw; const int src_step_icb = ih * iw; const int ws_step_icb = conf.jcp_.is; const bool src_to_ws = !is_bwd_data; + const size_t typesize = types::data_type_size( + conv_prop_agnostic_src_d(self->pd()->desc())->data_type); + self->rtus_driver_ = new rtus_driver_t(iw, stride_w, src_step_h, src_step_icb, ws_step_icb, src_to_ws, typesize); } -inline float loss_ratio(int amount, int divider) -{ - return float(rnd_up(amount, divider) - amount) / rnd_up(amount, divider); -} - inline int best_divider(int value, int min_divider, int max_divider, - bool find_max, int step = 1) + bool find_max, int step = 1) { max_divider = nstl::max(1, nstl::min(max_divider, value)); min_divider = nstl::max(1, nstl::min(min_divider, max_divider)); + auto loss_ratio = [](int total, int chunk) + { return float(rnd_up(total, chunk) - total) / rnd_up(total, chunk); }; + float min_loss = FLT_MAX; int x_divider = max_divider; for (int divider = max_divider; divider >= min_divider; divider -= step) { diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_batch_normalization.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_batch_normalization.cpp index 3a667ac..38e4f48 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_batch_normalization.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_batch_normalization.cpp @@ -17,17 +17,18 @@ #include #include "c_types_map.hpp" +#include "math_utils.hpp" +#include "memory_tracking.hpp" +#include "mkldnn_thread.hpp" #include "nstl.hpp" #include "type_helpers.hpp" -#include "mkldnn_thread.hpp" -#include "math_utils.hpp" #include "utils.hpp" -#include "jit_generator.hpp" #include "cpu_barrier.hpp" +#include "cpu_batch_normalization_utils.hpp" +#include "jit_generator.hpp" #include "jit_uni_batch_normalization.hpp" -#include "cpu_batch_normalization_utils.hpp" namespace mkldnn { namespace impl { @@ -35,6 +36,8 @@ namespace cpu { namespace { +using namespace memory_tracking::names; + using namespace Xbyak; namespace barrier = simple_barrier; @@ -71,7 +74,7 @@ struct jit_bnorm_t: public jit_generator { const int vlen = isa == sse42 ? 32 : cpu_isa_traits::vlen; const batch_normalization_pd_t *bdesc_; - int is_spatial_thr_; + bool is_spatial_thr_; void (*ker)(const call_params_t *); void operator()(const call_params_t *p) { (*ker)(p); } @@ -846,7 +849,7 @@ struct jit_bnorm_t: public jit_generator { else assert(false); } - if (!bdesc_->omit_stats()) { + if (!bdesc_->use_global_stats()) { uni_vsubps(v, v, vdiff_beta); uni_vmovups(t, vmmword[reg_src + reg_soff + offt]); @@ -1006,11 +1009,15 @@ struct jit_bnorm_t: public jit_generator { } } - jit_bnorm_t(const batch_normalization_pd_t *bdesc, int is_spatial_thr): - bdesc_(bdesc), is_spatial_thr_(is_spatial_thr) { + jit_bnorm_t(const batch_normalization_pd_t *bdesc): bdesc_(bdesc) { static_assert(isa == sse42 || isa == avx2 || isa == avx512_common || isa == avx512_mic, "unsupported isa"); + const int simd_w = isa == sse42 ? 8 : + cpu_isa_traits::vlen / sizeof(data_t); + is_spatial_thr_ = + bnorm_utils::is_spatial_thr(bdesc_, simd_w, sizeof(data_t)); + unroll_blocks = isa == avx512_common && !is_spatial_thr_ ? 4 : 1; unroll_regs = isa == avx512_common && !is_spatial_thr_ ? 
4 : 1; @@ -1044,52 +1051,51 @@ struct jit_bnorm_t: public jit_generator { template struct uni_bnorm_driver_t: public c_compatible { - uni_bnorm_driver_t(const batch_normalization_pd_t *bdesc, - int is_spatial_thr) : bdesc_(bdesc), ker_(bdesc_,is_spatial_thr), - buf_(nullptr), barriers_(nullptr) + uni_bnorm_driver_t(const batch_normalization_pd_t *bdesc) + : bdesc_(bdesc), ker_(bdesc_) { - use_tmp_stats_ = !bdesc_->stats_is_src() - && bdesc_->desc()->prop_kind == prop_kind::forward_inference; - use_tmp_diff_scale_shift_ = false - || (bdesc_->is_bwd() && !bdesc_->use_scaleshift()) - || bdesc_->desc()->prop_kind == prop_kind::backward_data; - int num_sbufs = 2 * use_tmp_stats_; - int num_pbufs = 2 * use_tmp_diff_scale_shift_; - int num_rbufs = bdesc_->is_fwd() ? 1 : 2; + const int nthrs = mkldnn_get_max_threads(); + const int C_PADDED = get_c_padded(bdesc_); + + size_t data_size = sizeof(data_t) * bdesc_->MB() * C_PADDED + * bdesc_->D() * bdesc_->H() * bdesc_->W(); + l3_size_ = get_cache_size(3, true) * nthrs / 2; + do_blocking_ = (data_size >= l3_size_ / 2 && l3_size_ > 0); + } + + ~uni_bnorm_driver_t() {} + + static void init_scratchpad(memory_tracking::registrar_t &scratchpad, + const batch_normalization_pd_t *bdesc) { int nthrs = mkldnn_get_max_threads(); - int C_PADDED = memory_desc_wrapper(bdesc_->src_pd()).blocking_desc() - .padding_dims[1]; + int C_PADDED = get_c_padded(bdesc); - int buf_size = (num_sbufs + num_pbufs + num_rbufs * nthrs) * C_PADDED; - buf_ = (data_t *)malloc(buf_size * sizeof(data_t), 64); + int sbuf_sz = use_tmp_stats(bdesc) * 2 * C_PADDED; + int pbuf_sz = use_tmp_diff_scale_shift(bdesc) * 2 * C_PADDED; + int rbuf_sz = (bdesc->is_fwd() ? 1 : 2) * C_PADDED * nthrs; - sbuf_ = buf_; - pbuf_ = sbuf_ + num_sbufs * C_PADDED; - rbuf_ = pbuf_ + num_pbufs * C_PADDED; + scratchpad.book(key_bnorm_tmp_stats, sizeof(data_t) * sbuf_sz); + scratchpad.book(key_bnorm_tmp_diff_ss, sizeof(data_t) * pbuf_sz); + scratchpad.book(key_bnorm_reduction, sizeof(data_t) * rbuf_sz); - int num_barriers = C_PADDED / simd_w; if (mkldnn_thr_syncable()) { - barriers_ = (barrier::ctx_t *)malloc( - num_barriers * sizeof(barrier::ctx_t), 64); - for (int i = 0; i < num_barriers; ++i) - barrier::ctx_init(&barriers_[i]); + int n_barriers = C_PADDED / simd_w; + scratchpad.book(key_barrier, sizeof(barrier::ctx_t) * n_barriers); } - - size_t data_size = bdesc_->MB() * C_PADDED * bdesc_->H() - * bdesc_->W() * bdesc_->D() * sizeof(data_t); - l3_size_ = get_cache_size(3, true) * nthrs / 2; - do_blocking_ = (data_size >= l3_size_ / 2 && l3_size_ > 0); } - ~uni_bnorm_driver_t() { free(buf_); free(barriers_); } void exec(int ithr, int nthr, const data_t *src, data_t *diff_src, data_t *dst, const data_t *diff_dst, const data_t *scale_shift, data_t *diff_scale_shift, const data_t *mean, const data_t *var, - const uint8_t *ws) { + const uint8_t *ws, const memory_tracking::grantor_t &scratchpad) { + auto sbuf = scratchpad.get(key_bnorm_tmp_stats); + auto pbuf = scratchpad.get(key_bnorm_tmp_diff_ss); + auto rbuf = scratchpad.get(key_bnorm_reduction); + auto barriers = scratchpad.get(key_barrier); + size_t N = bdesc_->MB(); size_t C = bdesc_->C(); - size_t C_PADDED = memory_desc_wrapper(bdesc_->src_pd()).blocking_desc() - .padding_dims[1]; + size_t C_PADDED = get_c_padded(bdesc_); size_t D = bdesc_->D(); size_t H = bdesc_->H(); size_t W = bdesc_->W(); @@ -1162,12 +1168,11 @@ struct uni_bnorm_driver_t: public c_compatible { p.S_s = S_s * vlen; p.S_tail = (p.spat_size - S_e) * vlen; p.coff_max = C_blks_thr * simd_w; - 
p.mean = (use_tmp_stats_ ? sbuf_ : mean) + coff_base; - p.var = (use_tmp_stats_ ? sbuf_ + C_PADDED : var) + coff_base; + p.mean = (use_tmp_stats(bdesc_) ? sbuf : mean) + coff_base; + p.var = (use_tmp_stats(bdesc_) ? sbuf + C_PADDED : var) + coff_base; p.scale_shift = scale_shift + coff_base; - p.diff_scale_shift - = (use_tmp_diff_scale_shift_ ? pbuf_ : diff_scale_shift) - + coff_base; + p.diff_scale_shift = (use_tmp_diff_scale_shift(bdesc_) + ? pbuf : diff_scale_shift) + coff_base; p.soff_max = N_thr * img_size; p.src = src + soff_base; @@ -1180,10 +1185,8 @@ struct uni_bnorm_driver_t: public c_compatible { // use SP_N_nthr which is the same as p.N_nthr except maybe for // the last iteration. - p.rbuf1 = rbuf_ - + ((it * C_blks_per_iter) * SP_N_nthr + C_blk_s * p.N_nthr - + p.N_ithr * C_blks_thr) - * simd_w; + p.rbuf1 = rbuf + ((it * C_blks_per_iter) * SP_N_nthr + + C_blk_s * p.N_nthr + p.N_ithr * C_blks_thr) * simd_w; // rbuf1 and rbuf2 have to be disjoint p.rbuf2 = p.rbuf1 + C_PADDED * nthr; p.is_cblk_tail = @@ -1191,89 +1194,193 @@ struct uni_bnorm_driver_t: public c_compatible { size_t iter_bariers = do_blocking_ ? it * global_barriers_per_iter : 0; - p.barrier = barriers_ + C_ithr + iter_bariers; + p.barrier = barriers + C_ithr + iter_bariers; if (p.soff_max != 0 && p.coff_max != 0) ker_(&p); } } + void init_barriers(const memory_tracking::grantor_t &scratchpad) { + auto barriers = scratchpad.get(key_barrier); + if (barriers) { + const int n_barriers = get_c_padded(bdesc_) / simd_w; + for (int i = 0; i < n_barriers; ++i) + barrier::ctx_init(&barriers[i]); + } + } + private: - const int simd_w = isa == sse42 ? 8 : - cpu_isa_traits::vlen / sizeof(data_t); + enum { + simd_w = isa == sse42 ? 8 : cpu_isa_traits::vlen / sizeof(data_t) + }; + + static bool use_tmp_stats(const batch_normalization_pd_t *bdesc) { + return true + && !bdesc->stats_is_src() + && bdesc->desc()->prop_kind == prop_kind::forward_inference; + } + + static bool use_tmp_diff_scale_shift(const batch_normalization_pd_t *bdesc) + { + return false + || (bdesc->is_bwd() && !bdesc->use_scaleshift()) + || bdesc->desc()->prop_kind == prop_kind::backward_data; + } + + static int get_c_padded(const batch_normalization_pd_t *bdesc) + { return bdesc->src_pd()->desc()->layout_desc.blocking.padding_dims[1]; } const batch_normalization_pd_t *bdesc_; - jit_bnorm_t ker_; - bool use_tmp_stats_, use_tmp_diff_scale_shift_; bool do_blocking_; size_t l3_size_; - data_t *buf_, *sbuf_, *rbuf_, *pbuf_; - - barrier::ctx_t *barriers_; + jit_bnorm_t ker_; }; } +using namespace data_type; +using namespace memory_format; +using namespace utils; + +/* fwd */ + template -jit_uni_batch_normalization_fwd_t::jit_uni_batch_normalization_fwd_t( - const pd_t *pd, const input_vector &inputs, - const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) -{ - int is_spatial_thr = 0; - const int simd_w = isa == sse42 ? 8 : - cpu_isa_traits::vlen / sizeof(data_t); +status_t jit_uni_batch_normalization_fwd_t::pd_t::init() { + assert(engine()->kind() == engine_kind::cpu); + auto desired_fmt = (ndims() == 4) + ? isa == avx512_common ? nChw16c : nChw8c + : isa == avx512_common ? 
nCdhw16c : nCdhw8c; + + bool ok = true + && mayiuse(isa) + && is_fwd() + && !has_zero_dim_memory() + && one_of(ndims(), 4, 5) + && desc()->data_desc.data_type == f32 + && IMPLICATION(use_scaleshift(), + desc()->data_scaleshift_desc.data_type == f32) + && desc()->data_desc.format == desired_fmt + && (attr()->has_default_values() || this->with_relu_post_op()); + if (!ok) return status::unimplemented; + + if (is_training() && fuse_bn_relu()) { + if (isa < avx2) return status::unimplemented; + bn_init_default_ws(this, this->workspace_pd_, 1); + } - bnorm_utils::set_spatial_thr(&conf_,simd_w,sizeof(data_t),is_spatial_thr); + if (memory_desc_wrapper(&data_pd_).blocking_desc().padding_dims[1] + != this->C() && isa < avx2) + return status::unimplemented; - bnorm_driver_ = new uni_bnorm_driver_t(&conf_,is_spatial_thr); + if (stats_is_src() || is_training()) { + memory_desc_t stats_d; + dims_t stats_dims = { C() }; + mkldnn_memory_desc_init(&stats_d, 1, stats_dims, f32, x); + mean_pd_ = cpu_memory_t::pd_t(engine_, &stats_d); + variance_pd_ = cpu_memory_t::pd_t(engine_, &stats_d); + } + + auto scratchpad = scratchpad_registry().registrar(); + uni_bnorm_driver_t::init_scratchpad(scratchpad, this); + + return status::success; } template -void jit_uni_batch_normalization_fwd_t::execute(event_t *e) { +jit_uni_batch_normalization_fwd_t::jit_uni_batch_normalization_fwd_t( + const pd_t *apd, const input_vector &inputs, + const output_vector &outputs) + : cpu_primitive_t(apd, inputs, outputs) +{ bnorm_driver_ = new uni_bnorm_driver_t(pd()); } + +template +void jit_uni_batch_normalization_fwd_t::execute(event_t *e) const { auto src = reinterpret_cast(this->input_memory(0)); auto dst = reinterpret_cast(this->memory(0)); - auto mean = reinterpret_cast(conf_.stats_is_src() + auto mean = reinterpret_cast(pd()->stats_is_src() ? const_cast(this->input_memory(1)) : this->memory(1)); - auto var = reinterpret_cast(conf_.stats_is_src() + auto var = reinterpret_cast(pd()->stats_is_src() ? const_cast(this->input_memory(2)) : this->memory(2)); - auto idx_scale_shift = 1 + 2*conf_.stats_is_src(); + auto idx_scale_shift = 1 + 2*pd()->stats_is_src(); auto scale_shift = reinterpret_cast(this->input_memory(idx_scale_shift)); - auto ws = reinterpret_cast(this->memory(conf_.ws_idx())); + auto ws = reinterpret_cast(this->memory(pd()->ws_idx())); + + auto scratchpad = this->scratchpad(); + + bnorm_driver_->init_barriers(scratchpad); parallel(0, [&](const int ithr, const int nthr) { - bnorm_driver_->exec(ithr, nthr, src, - nullptr, dst, nullptr, scale_shift, nullptr, mean, var, ws); + bnorm_driver_->exec(ithr, nthr, src, nullptr, dst, nullptr, + scale_shift, nullptr, mean, var, ws, scratchpad); }); e->set_state(event_t::ready); } template -jit_uni_batch_normalization_fwd_t::~jit_uni_batch_normalization_fwd_t() { - delete bnorm_driver_; -} +jit_uni_batch_normalization_fwd_t::~jit_uni_batch_normalization_fwd_t() +{ delete bnorm_driver_; } + +/* bwd */ template -jit_uni_batch_normalization_bwd_t::jit_uni_batch_normalization_bwd_t( - const pd_t *pd, const input_vector &inputs, - const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) -{ - int is_spatial_thr = 0; - const int simd_w = isa == sse42 ? 8 : - cpu_isa_traits::vlen / sizeof(data_t); +status_t jit_uni_batch_normalization_bwd_t::pd_t::init() { + assert(engine()->kind() == engine_kind::cpu); + auto desired_fmt = (ndims() == 4) + ? one_of(isa, sse42, avx2) ? nChw8c : nChw16c + : one_of(isa, sse42, avx2) ? 
nCdhw8c : nCdhw16c; + + bool ok = true + && mayiuse(isa) + && is_bwd() + && !has_zero_dim_memory() + && one_of(ndims(), 4, 5) + && everyone_is(f32, desc()->data_desc.data_type, + desc()->diff_data_desc.data_type) + && IMPLICATION(use_scaleshift(), + desc()->data_scaleshift_desc.data_type == f32) + && everyone_is(desired_fmt, desc()->diff_data_desc.format, + desc()->data_desc.format) + && attr()->has_default_values(); + if (!ok) return status::unimplemented; + + if (memory_desc_wrapper(&data_pd_).blocking_desc() + .padding_dims[1] != this->C() && isa < avx2) + return status::unimplemented; + + if (fuse_bn_relu()) { + if (isa < avx2) return status::unimplemented; + bn_init_default_ws(this, this->workspace_pd_, 1); + size_t this_ws_sz = memory_desc_wrapper(this->workspace_pd()).size(); + + bool ws_ok = true + && hint_fwd_pd_->workspace_pd() + && memory_desc_wrapper(hint_fwd_pd_->workspace_pd()).size() + == this_ws_sz; + if (!ws_ok) return status::unimplemented; + } + + /* TODO: extra checks required */ - bnorm_utils::set_spatial_thr(&conf_,simd_w,sizeof(data_t),is_spatial_thr); + auto scratchpad = scratchpad_registry().registrar(); + uni_bnorm_driver_t::init_scratchpad(scratchpad, this); - bnorm_driver_ = new uni_bnorm_driver_t(&conf_,is_spatial_thr); + return status::success; } template -void jit_uni_batch_normalization_bwd_t::execute(event_t *e) { +jit_uni_batch_normalization_bwd_t::jit_uni_batch_normalization_bwd_t( + const pd_t *apd, const input_vector &inputs, + const output_vector &outputs) + : cpu_primitive_t(apd, inputs, outputs) +{ bnorm_driver_ = new uni_bnorm_driver_t(pd()); } + +template +void jit_uni_batch_normalization_bwd_t::execute(event_t *e) const { auto src = reinterpret_cast(this->input_memory(0)); auto mean = reinterpret_cast(this->input_memory(1)); auto var = reinterpret_cast(this->input_memory(2)); @@ -1282,20 +1389,22 @@ void jit_uni_batch_normalization_bwd_t::execute(event_t *e) { auto diff_src = reinterpret_cast(this->memory(0)); auto diff_scale_shift = reinterpret_cast(this->memory(1)); auto ws = reinterpret_cast( - this->input_memory(conf_.ws_idx())); + this->input_memory(pd()->ws_idx())); + + auto scratchpad = this->scratchpad(); + + bnorm_driver_->init_barriers(scratchpad); parallel(0, [&](const int ithr, const int nthr) { - bnorm_driver_->exec(ithr, nthr, src, - diff_src, nullptr, diff_dst, scale_shift, diff_scale_shift, - mean, var, ws); + bnorm_driver_->exec(ithr, nthr, src, diff_src, nullptr, diff_dst, + scale_shift, diff_scale_shift, mean, var, ws, scratchpad); }); e->set_state(event_t::ready); } template -jit_uni_batch_normalization_bwd_t::~jit_uni_batch_normalization_bwd_t() { - delete bnorm_driver_; -} +jit_uni_batch_normalization_bwd_t::~jit_uni_batch_normalization_bwd_t() +{ delete bnorm_driver_; } /* struct instantiation */ template struct jit_uni_batch_normalization_fwd_t; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_batch_normalization.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_batch_normalization.hpp index 7dbc47a..857e3a0 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_batch_normalization.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_batch_normalization.hpp @@ -20,11 +20,10 @@ #include #include "c_types_map.hpp" -#include "cpu_batch_normalization_pd.hpp" -#include "cpu_engine.hpp" #include "type_helpers.hpp" #include "utils.hpp" +#include "cpu_batch_normalization_pd.hpp" #include "jit_generator.hpp" namespace mkldnn { @@ -46,58 +45,21 @@ struct jit_uni_batch_normalization_fwd_t: 
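Aside, not part of the patch: both pd_t::init() routines pick the blocked layout from the SIMD register width in floats, 16-channel blocks for AVX-512 and 8-channel blocks otherwise (SSE4.2 processes each 8-wide block in two halves, hence the `repeats = 2` pattern elsewhere in the file). A self-contained sketch of that rule; the local enum stands in for mkldnn::impl::memory_format:

```cpp
enum fmt_t { nChw8c, nChw16c, nCdhw8c, nCdhw16c };
// 16-float blocks fill one zmm; 8-float blocks fill one ymm.
fmt_t desired_blocked_fmt(int ndims, bool is_avx512) {
    return ndims == 4 ? (is_avx512 ? nChw16c : nChw8c)
                      : (is_avx512 ? nCdhw16c : nCdhw8c);
}
```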
public cpu_primitive_t { JIT_IMPL_NAME_HELPER("jit:", isa, ""), jit_uni_batch_normalization_fwd_t); - virtual status_t init() override { - using namespace prop_kind; - using namespace data_type; - using namespace memory_format; - assert(engine()->kind() == engine_kind::cpu); - auto desired_fmt = (ndims() == 4) - ? isa == avx512_common ? nChw16c : nChw8c - : isa == avx512_common ? nCdhw16c : nCdhw8c; - bool ok = true - && mayiuse(isa) - && is_fwd() - && !has_zero_dim_memory() - && utils::one_of(ndims(), 4, 5) - && desc()->data_desc.data_type == f32 - && IMPLICATION(use_scaleshift(), - desc()->data_scaleshift_desc.data_type == f32) - && desc()->data_desc.format == desired_fmt - && (attr()->has_default_values() || this->with_relu_post_op()); - if (!ok) return status::unimplemented; - - if (is_training() && fuse_bn_relu()) { - if (isa < avx2) return status::unimplemented; - bn_init_default_ws(this, this->workspace_pd_, 1); - } - if (memory_desc_wrapper(&data_pd_).blocking_desc() - .padding_dims[1] != this->C() && isa < avx2) - return status::unimplemented; - - if (stats_is_src() || is_training()) { - memory_desc_t stats_d; - dims_t stats_dims = { C() }; - mkldnn_memory_desc_init(&stats_d, 1, stats_dims, - data_type::f32, x); - mean_pd_ = cpu_memory_t::pd_t(engine_, &stats_d); - variance_pd_ = cpu_memory_t::pd_t(engine_, &stats_d); - } - - return success; - } + virtual status_t init() override; }; typedef typename prec_traits::type data_t; - jit_uni_batch_normalization_fwd_t(const pd_t *pd, + jit_uni_batch_normalization_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs); ~jit_uni_batch_normalization_fwd_t(); - virtual void execute(event_t *e); + + virtual void execute(event_t *e) const; private: - uni_bnorm_driver_t *bnorm_driver_; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } - pd_t conf_; + uni_bnorm_driver_t *bnorm_driver_; }; template @@ -113,63 +75,21 @@ struct jit_uni_batch_normalization_bwd_t: public cpu_primitive_t { JIT_IMPL_NAME_HELPER("jit:", isa, ""), jit_uni_batch_normalization_bwd_t); - virtual status_t init() override { - using namespace prop_kind; - using namespace data_type; - using namespace utils; - using namespace memory_format; - assert(engine()->kind() == engine_kind::cpu); - auto desired_fmt = (ndims() == 4) - ? utils::one_of(isa, sse42, avx2) ? nChw8c : nChw16c - : utils::one_of(isa, sse42, avx2) ? 
nCdhw8c : nCdhw16c; - bool ok = true - && mayiuse(isa) - && is_bwd() - && !has_zero_dim_memory() - && utils::one_of(ndims(), 4, 5) - && everyone_is(f32, desc()->data_desc.data_type, - desc()->diff_data_desc.data_type) - && IMPLICATION(use_scaleshift(), - desc()->data_scaleshift_desc.data_type == f32) - && everyone_is(desired_fmt, desc()->diff_data_desc.format, - desc()->data_desc.format) - && attr()->has_default_values(); - if (!ok) return status::unimplemented; - if (memory_desc_wrapper(&data_pd_).blocking_desc() - .padding_dims[1] != this->C() && isa < avx2) - return status::unimplemented; - - if (fuse_bn_relu()) { - if (isa < avx2) return status::unimplemented; - bn_init_default_ws(this, this->workspace_pd_, 1); - const size_t this_ws_sz - = memory_desc_wrapper(this->workspace_pd()).size(); - - bool ws_ok = true - && hint_fwd_pd_->workspace_pd() - && memory_desc_wrapper(hint_fwd_pd_->workspace_pd()).size() - == this_ws_sz; - if (!ws_ok) - return status::unimplemented; - } - - /* TODO: extra checks required */ - - return success; - } + virtual status_t init() override; }; typedef typename prec_traits::type data_t; - jit_uni_batch_normalization_bwd_t(const pd_t *pd, + jit_uni_batch_normalization_bwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs); ~jit_uni_batch_normalization_bwd_t(); - virtual void execute(event_t *e); + + virtual void execute(event_t *e) const; private: - uni_bnorm_driver_t *bnorm_driver_; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } - pd_t conf_; + uni_bnorm_driver_t *bnorm_driver_; }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_bin_conv_kernel.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_bin_conv_kernel.cpp new file mode 100644 index 0000000..447a017 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_bin_conv_kernel.cpp @@ -0,0 +1,925 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include +#include "c_types_map.hpp" +#include "nstl.hpp" +#include "type_helpers.hpp" +#include "utils.hpp" +#include "cpu_memory.hpp" + +#include "jit_uni_bin_conv_kernel.hpp" + +#define GET_OFF(field) offsetof(jit_conv_call_s, field) + +namespace mkldnn { +namespace impl { +namespace cpu { + +using namespace mkldnn::impl::prop_kind; +using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; +using namespace mkldnn::impl::utils; + +using namespace Xbyak; + +template +void jit_uni_bin_conv_fwd_kernel::cvt2ps(data_type_t type_in, Vmm vmm_in, const Operand &op, bool scalar_load) { + Xmm xmm_in = Xmm(vmm_in.getIdx()); + + switch (type_in) { + case data_type::f32: + case data_type::s32: + if (scalar_load) { + mov(reg_tmp_32, op); + movq(xmm_in, reg_tmp_64); + } else { + uni_vmovups(vmm_in, op); + } + break; + case data_type::s8: + if (scalar_load) { + movsx(reg_tmp_32, op); + movq(xmm_in, reg_tmp_64); + } else { + uni_vpmovsxbd(vmm_in, op); + } + break; + case data_type::u8: + if (scalar_load) { + movzx(reg_tmp_32, op); + movq(xmm_in, reg_tmp_64); + } else { + uni_vpmovzxbd(vmm_in, op); + } + break; + default: assert(!"unsupported data type"); + } + + if (type_in != data_type::f32) + uni_vcvtdq2ps(vmm_in, vmm_in); +} + +template +void jit_uni_bin_conv_fwd_kernel::store_dst(const Xbyak::Address &op, Vmm vmm_dst, bool scalar_store) { + Ymm ymm_dst = Ymm(vmm_dst.getIdx()); + Xmm xmm_dst = Xmm(vmm_dst.getIdx()); + + switch (jcp.dst_dt) { + case data_type::f32: + case data_type::s32: + if (scalar_store) { + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_32); + } else { + uni_vmovups(op, vmm_dst); + } + break; + case data_type::s8: + uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst); + + if (isa != sse42 && !scalar_store) + vpermq(ymm_dst, ymm_dst, 0x08); + + uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); + + if (scalar_store) { + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_8); + } else { + if (isa != sse42) + vmovq(op, xmm_dst); + else + movd(op, xmm_dst); + } + break; + case data_type::u8: + case data_type::bin: + uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst); + + if (isa != sse42 && !scalar_store) + vpermq(ymm_dst, ymm_dst, 0x08); + + uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); + + if (scalar_store) { + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_8); + } else { + if (isa != sse42) + vmovq(op, xmm_dst); + else + movd(op, xmm_dst); + } + + break; + default: + assert(!"unknown dst_dt"); + } +} + +template +void jit_uni_bin_conv_fwd_kernel::apply_filter(int ur_w, int pad_l, int pad_r, int oc_blocks, int oc_step, + int ic_blocks, bool last_icb, bool h_padded) +{ + int kw = jcp.kw; + int kh = jcp.kh; + int stride_w = jcp.stride_w; + int dilate_w = jcp.dilate_w + 1; + int ic_blk = jcp.ic_block; + int oc_blk = jcp.oc_block; + + int repeats = isa == sse42 && oc_step > (oc_blk / 2) ? 2 : 1; + int nbits = 8; + + for (int ki = 0; ki < kw; ki++) { + int jj_start = nstl::max(0, div_up(pad_l - ki * dilate_w, stride_w)); + int jj_end = ur_w - nstl::max(0, div_up(ki*dilate_w+pad_r-(kw-1)*dilate_w, stride_w)); + + int _start = (!jcp.exclude_pad) ? 0 : jj_start; + int _end = (!jcp.exclude_pad) ? 
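Aside, not part of the patch: store_dst() above narrows s32 accumulators with saturating packs (uni_vpackssdw then uni_vpacksswb on the s8 path; the usdw/uswb pair on the u8 path). A scalar model of the signed path:

```cpp
#include <algorithm>
#include <cstdint>
// vpackssdw/vpacksswb clamp at each narrowing step; end to end this is a
// saturate of the 32-bit value into [-128, 127].
int8_t saturate_s32_to_s8(int32_t v) {
    return static_cast<int8_t>(std::min(127, std::max(-128, v)));
}
```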
ur_w : jj_end; + + for (int ifm2 = 0; ifm2 < ic_blocks; ifm2++) { + for (int jj = _start; jj < _end; jj++) { + int inp_off = ((ki*dilate_w + jj*stride_w - pad_l)*div_up(jcp.ic, nbits) + ifm2 * div_up(ic_blk, nbits)) * jcp.typesize_in; + + if (h_padded || jj < jj_start || jj >= jj_end) { + uni_vmovups(vmm_src, ptr[reg_table + 256]); + } else { + uni_vpbroadcastd(vmm_src, ptr[aux1_reg_input + inp_off]); + } + + for (int r = 0; r < repeats; r++) { + for (int ii = 0; ii < oc_blocks; ii++) { + int ker_off = (ifm2 * kw * div_up(ic_blk, nbits) * oc_blk + + ii * jcp.nb_ic * div_up(ic_blk, nbits) * kh * kw * oc_blk + + ki * div_up(ic_blk, nbits) * oc_blk + r * div_up(ic_blk, nbits) * (oc_blk / 2)) * jcp.typesize_in; + + uni_vmovups(vmm_tmp, ptr[aux1_reg_kernel + ker_off]); + + uni_vpxor(vmm_tmp, vmm_tmp, vmm_src); + if (jcp.ic_padded != jcp.ic && last_icb && ifm2 == (ic_blocks - 1)) + uni_vandps(vmm_tmp, vmm_tmp, ptr[reg_table + 224]); + + if (isa == sse42) { + movups(vmm_tmp1, vmm_tmp); + pand(vmm_tmp1, vmm_mask); + } else { + uni_vandps(vmm_tmp1, vmm_mask, vmm_tmp); + } + + uni_vpsrld(vmm_tmp, vmm_tmp, 4); + uni_vandps(vmm_tmp, vmm_tmp, vmm_mask); + + if (isa == sse42) { + movups(vmm_tmp2, vmm_lookup); + pshufb(vmm_tmp2, vmm_tmp); + movups(vmm_tmp, vmm_lookup); + pshufb(vmm_tmp, vmm_tmp1); + paddb(vmm_tmp, vmm_tmp2); + } else { + uni_vpshufb(vmm_tmp, vmm_lookup, vmm_tmp); + uni_vpshufb(vmm_tmp1, vmm_lookup, vmm_tmp1); + uni_vpaddb(vmm_tmp, vmm_tmp, vmm_tmp1); + } + + uni_vpmaddubsw(vmm_tmp, vmm_tmp, vmm_one_u8); + uni_vpmaddwd(vmm_tmp, vmm_tmp, vmm_one_s16); + uni_vpaddd(Vmm(1 + r*jcp.ur_w*jcp.nb_oc_blocking + ur_w * ii + jj), + Vmm(1 + r*jcp.ur_w*jcp.nb_oc_blocking + ur_w * ii + jj), vmm_tmp); + } + } + } + } + } +} + +template +void jit_uni_bin_conv_fwd_kernel::oh_step_unroll_kw(int ur_w, int pad_l, int pad_r, int oc_blocks, int oc_step, bool h_padded) { + int kw = jcp.kw; + + int nbits = 8; + int inp_mult = div_up(jcp.ic_block, nbits); + int out_mult = jcp.oc_block; + + Label icb_main_loop; + Label icb_tail; + + mov(aux1_reg_input, aux_reg_input); + mov(aux1_reg_kernel, aux_reg_kernel); + + mov(reg_icb_iter, jcp.nb_ic); + L(icb_main_loop); + { + cmp(reg_icb_iter, 1); + jle(icb_tail, T_NEAR); + + apply_filter(ur_w, pad_l, pad_r, oc_blocks, oc_step, 1, false, h_padded); + + add(aux1_reg_input, inp_mult * jcp.typesize_in); + add(aux1_reg_kernel, kw * inp_mult * out_mult * jcp.typesize_in); + sub(reg_icb_iter, 1); + jmp(icb_main_loop, T_NEAR); + } + + L(icb_tail); + + apply_filter(ur_w, pad_l, pad_r, oc_blocks, oc_step, 1, true, h_padded); +} + +template +void jit_uni_bin_conv_fwd_kernel::kh_loop(int ur_w, int pad_l, int pad_r, int oc_blocks, int oc_step) { + int iw = jcp.iw; + int kw = jcp.kw; + int dilate_h = jcp.dilate_h + 1; + + int nbits = 8; + const int inp_mult = dilate_h * div_up(jcp.ic, nbits); + + Label t_overflow_label, no_t_overflow_label, + b_overflow_label, no_b_overflow_label; + + mov(aux_reg_input, reg_input); + mov(aux_reg_kernel, reg_kernel_base); + + uni_vmovups(vmm_lookup, ptr[reg_table]); + uni_vmovups(vmm_mask, ptr[reg_table + 32]); + uni_vmovups(vmm_one_u8, ptr[reg_table + 160]); + uni_vmovups(vmm_one_s16, ptr[reg_table + 192]); + + if (!jcp.exclude_pad) { + mov(reg_overflow, ptr[param1 + GET_OFF(t_overflow)]); + cmp(reg_overflow, 0); + je(no_t_overflow_label, T_NEAR); + L(t_overflow_label); { + oh_step_unroll_kw(ur_w, pad_l, pad_r, oc_blocks, oc_step, true); + + add(aux_reg_kernel, jcp.typesize_in * kw * jcp.oc_block * jcp.nb_ic * div_up(jcp.ic_block, nbits)); + 
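Aside, not part of the patch: the apply_filter() body above is the core of the binary convolution. Source and weight bits are XOR-ed (uni_vpxor), mismatching bits are counted with a 4-bit popcount lookup (uni_vpshufb against vmm_lookup), then byte counts are reduced into s32 lanes via uni_vpmaddubsw with 0x01 bytes and uni_vpmaddwd with 0x0001 words. A scalar model using the same LUT that prepare_table() emits at offset 0:

```cpp
#include <cstdint>
// lut[i] == popcount(i) for i in 0..15; this is exactly the
// 0x02010100 / 0x03020201 / 0x04030302 dword pattern in prepare_table().
static const uint8_t lut[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
static uint8_t popcount8(uint8_t x) { return lut[x & 0x0f] + lut[x >> 4]; }
// One packed byte's contribution: the number of src/weight bit mismatches.
int mismatches(uint8_t src_bits, uint8_t wei_bits) {
    return popcount8(src_bits ^ wei_bits);
}
```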
dec(reg_overflow); + cmp(reg_overflow, 0); + jg(t_overflow_label, T_NEAR); + } + L(no_t_overflow_label); + } + + Label skip_kh_loop; + mov(reg_kh, ptr[this->param1 + GET_OFF(kh_padding)]); + if (!jcp.exclude_pad || (jcp.exclude_pad && + (jcp.kh - 1) * (jcp.dilate_h + 1) < nstl::max(jcp.t_pad, jcp.b_pad))) { + cmp(reg_kh, 0); + je(skip_kh_loop, T_NEAR); + } + + Label kh_label; + L(kh_label); + { + oh_step_unroll_kw(ur_w, pad_l, pad_r, oc_blocks, oc_step, false); + + add(aux_reg_kernel, jcp.typesize_in * kw * jcp.oc_block * jcp.nb_ic * div_up(jcp.ic_block, nbits)); + add(aux_reg_input, jcp.typesize_in * iw * inp_mult); + + dec(reg_kh); + cmp(reg_kh, 0); + jg(kh_label, T_NEAR); + } + + L(skip_kh_loop); + + if (!jcp.exclude_pad) { + mov(reg_overflow, ptr[param1 + GET_OFF(b_overflow)]); + cmp(reg_overflow, 0); + je(no_b_overflow_label, T_NEAR); + L(b_overflow_label); { + oh_step_unroll_kw(ur_w, pad_l, pad_r, oc_blocks, oc_step, true); + + add(aux_reg_kernel, jcp.typesize_in * kw * jcp.oc_block * jcp.nb_ic * div_up(jcp.ic_block, nbits)); + dec(reg_overflow); + cmp(reg_overflow, 0); + jg(b_overflow_label, T_NEAR); + } + L(no_b_overflow_label); + } +} + +template +void jit_uni_bin_conv_fwd_kernel::width_blk_step(int ur_w, int pad_l, int pad_r, int oc_blocks, int oc_step) +{ + int nbits = 8; + int repeats = isa == sse42 && oc_step > (jcp.oc_block / 2) ? 2 : 1; + + for (int r = 0; r < repeats; r++) + for (int ii = 0; ii < oc_blocks; ii++) + for (int jj = 0; jj < ur_w; jj++) + uni_vpxor(Vmm(1 + r * jcp.ur_w * jcp.nb_oc_blocking + ur_w * ii + jj), + Vmm(1 + r * jcp.ur_w * jcp.nb_oc_blocking + ur_w * ii + jj), + Vmm(1 + r * jcp.ur_w * jcp.nb_oc_blocking + ur_w * ii + jj)); + + kh_loop(ur_w, pad_l, pad_r, oc_blocks, oc_step); + + const auto &p = attr_.post_ops_; + for (int r = 0; r < repeats; r++) { + int tail_size = isa == sse42 ? nstl::min(jcp.oc_block / 2, oc_step - r * jcp.oc_block / 2) : oc_step; + bool is_scalar_store = isa == sse42 ? tail_size < jcp.oc_block / 2 : tail_size < jcp.oc_block; + + int kw_padding[ur_w]; + if (jcp.exclude_pad) { + mov(reg_tmp_32, jcp.ic); + imul(reg_tmp_32, ptr[param1 + GET_OFF(kh_padding)]); + + for (int jj = 0; jj < ur_w; jj++) + kw_padding[jj] = 0; + + for (int ki = 0; ki < jcp.kw; ki++) { + int jj_start = nstl::max(0, div_up(pad_l - ki * (jcp.dilate_w + 1), jcp.stride_w)); + int jj_end = ur_w - nstl::max(0, div_up(ki * (jcp.dilate_w + 1) + pad_r - + (jcp.kw - 1) * (jcp.dilate_w + 1), jcp.stride_w)); + for (int jj = jj_start; jj < jj_end; jj++) { + kw_padding[jj]++; + } + } + } else { + uni_vmovups(vmm_shift, ptr[reg_table + 128]); + } + uni_vmovups(vmm_scale, ptr[reg_table + 96]); + + for (int jj = 0; jj < ur_w; jj++) { + if (jcp.exclude_pad) { + mov(reg_shift, kw_padding[jj]); + imul(reg_shift, reg_tmp_32); + movq(Xmm(vmm_shift.getIdx()), reg_shift); + uni_vbroadcastss(vmm_shift, Xmm(vmm_shift.getIdx())); + uni_vcvtdq2ps(vmm_shift, vmm_shift); + } + + for (int ii = 0; ii < oc_blocks; ii++) { + uni_vcvtdq2ps(Vmm(1 + r * jcp.ur_w * jcp.nb_oc_blocking + ur_w * ii + jj), Vmm(1 + r * jcp.ur_w * jcp.nb_oc_blocking + ur_w * ii + jj)); + uni_vfmadd213ps(Vmm(1 + r * jcp.ur_w * jcp.nb_oc_blocking + ur_w * ii + jj), vmm_scale, vmm_shift); + } + } + + int eltwise_inj_idx = 0; + int depthwise_inj_idx = 0; + int end_idx = jcp.with_dw_conv ? 
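Aside, not part of the patch: the uni_vfmadd213ps above turns the mismatch count into a plus/minus-one dot product. With K contributing bits per output (K = ic*kh*kw, the float at table offset 128, or the per-column ic * kh_padding * kw_padding product when exclude_pad recomputes vmm_shift), matches + mismatches = K, so dot = matches - mismatches = K - 2*mismatches; the -2.0f factor is the 0xc0000000 constant at table offset 96. In scalar form:

```cpp
// acc holds the mismatch count after accumulation; the fmadd computes
// acc * (-2.0f) + K, i.e. the +/-1 correlation.
float decode_dot(int mismatches, int K) {
    return static_cast<float>(K) - 2.0f * static_cast<float>(mismatches);
}
// e.g. K = 32: 0 mismatches -> +32 (all bits agree); 32 mismatches -> -32.
```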
p.find(primitive_kind::convolution) : p.len_; + for (int i = 0; i < end_idx; i++) { + int start_idx = 1 + r * jcp.ur_w * jcp.nb_oc_blocking; + + auto& post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + eltwise_injectors[eltwise_inj_idx]->compute_vector_range(start_idx, start_idx + oc_blocks * ur_w); + eltwise_inj_idx++; + } else if (post_op.is_depthwise()) { + pop(reg_oc_off); + + mov(reg_d_weights, reinterpret_cast(post_op.depthwise.weights_data)); + mov(reg_d_bias, reinterpret_cast(post_op.depthwise.biases_data)); + + add(reg_d_weights, reg_oc_off); + add(reg_d_bias, reg_oc_off); + + if (r == 1) { + add(reg_d_weights, (jcp.oc_block / 2) * sizeof(float)); + add(reg_d_bias, (jcp.oc_block / 2) * sizeof(float)); + } + + for (int ii = 0; ii < oc_blocks; ii++) { + depthwise_injectors[depthwise_inj_idx]->compute_vector_range(start_idx + ur_w * ii, + start_idx + ur_w * ii + ur_w, reg_d_weights, reg_d_bias); + + add(reg_d_weights, jcp.oc_block * sizeof(float)); + add(reg_d_bias, jcp.oc_block * sizeof(float)); + } + + depthwise_inj_idx++; + + push(reg_oc_off); + } else if (post_op.is_sum(false)) { + for (int ii = 0; ii < oc_blocks; ii++) { + for (int jj = 0; jj < ur_w; jj++) { + Vmm vmm_dst = Vmm(1 + r * jcp.ur_w * jcp.nb_oc_blocking + ur_w * ii + jj); + + if (is_scalar_store) { + for (int oc = 0; oc < tail_size; oc++) { + int o_off = jj * jcp.oc * jcp.ngroups + r * (jcp.oc_block / 2) + oc; + + uni_vpxor(vmm_sum, vmm_sum, vmm_sum); + cvt2ps(jcp.dst_dt, vmm_sum, ptr[reg_output + o_off * jcp.typesize_out], true); + + if (oc < jcp.oc_block / 2) { + uni_vpslldq(vmm_sum, vmm_sum, oc * sizeof(float)); + } else { + Ymm ymm_prev_dst = Ymm(vmm_sum.getIdx()); + vperm2i128(ymm_prev_dst, ymm_prev_dst, ymm_prev_dst, 0x01); + vpslldq(vmm_sum, vmm_sum, (oc - jcp.oc_block / 2) * sizeof(float)); + } + + uni_vaddps(vmm_dst, vmm_dst, vmm_sum); + } + } else { + size_t o_off = ii * jcp.oc_block + jj * jcp.oc * jcp.ngroups + r * (jcp.oc_block / 2); + + cvt2ps(jcp.dst_dt, vmm_sum, ptr[reg_output + o_off * jcp.typesize_out], false); + uni_vaddps(vmm_dst, vmm_dst, vmm_sum); + } + } + } + } + } + } + + if (jcp.with_binarization) { + int binarization_idx = p.find(primitive_kind::binarization); + + pop(reg_oc_off); + + mov(reg_b_weights, reinterpret_cast(p.entry_[binarization_idx].binarization.weights_data)); + add(reg_b_weights, reg_oc_off); + + push(reg_oc_off); + + for (int ii = 0; ii < oc_blocks; ii++) { + for (int jj = 0; jj < ur_w; jj++) { + for (int r = 0; r < repeats; r++) { + int tail_size = isa == sse42 ? nstl::min(jcp.oc_block / 2, oc_step - r * jcp.oc_block / 2) : oc_step; + mov(reg_b_mask, (1 << tail_size) - 1); + uni_vmovups(vmm_thr, ptr[reg_b_weights + (ii * jcp.oc_block + r * (jcp.oc_block / 2)) * sizeof(float)]); + + Vmm vmm_dst = Vmm(1 + r * jcp.ur_w * jcp.nb_oc_blocking + ur_w * ii + jj); + + uni_vcmpgtps(vmm_dst, vmm_dst, vmm_thr); + + if (r == 0) { + uni_vmovmskps(reg_tmp_32, vmm_dst); + and_(reg_tmp_64, reg_b_mask); + } else { + uni_vmovmskps(reg_tmp2_32, vmm_dst); + and_(reg_tmp2_64, reg_b_mask); + shl(reg_tmp2_32, 4); + or_(reg_tmp_32, reg_tmp2_32); + } + + if (r == repeats - 1) { + const size_t o_off = (ii + jj * div_up(jcp.oc, nbits)); + mov(ptr[reg_output + o_off * jcp.typesize_out], reg_tmp_8); + } + } + } + } + } else { + for (int r = 0; r < repeats; r++) { + int tail_size = isa == sse42 ? nstl::min(jcp.oc_block / 2, oc_step - r * jcp.oc_block / 2) : oc_step; + bool is_scalar_store = isa == sse42 ? 
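Aside, not part of the patch: the with_binarization branch above packs post-threshold sign bits straight into the destination: uni_vcmpgtps produces a lane mask, uni_vmovmskps moves it to a GPR, and the shl/or pair merges the two SSE4.2 halves before a single byte store. Scalar equivalent:

```cpp
#include <cstdint>
// Pack eight per-channel comparisons against per-channel thresholds
// (vmm_thr, loaded from the binarization post-op weights) into one byte.
uint8_t binarize8(const float *x, const float *thr) {
    uint8_t bits = 0;
    for (int c = 0; c < 8; ++c)
        bits |= static_cast<uint8_t>(x[c] > thr[c]) << c;
    return bits;
}
```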
tail_size < jcp.oc_block / 2 : tail_size < jcp.oc_block; + if (is_scalar_store) { + for (int jj = 0; jj < ur_w; jj++) { + Vmm vmm_dst = Vmm(1 + r * jcp.ur_w * jcp.nb_oc_blocking + jj); + Ymm ymm_dst = Ymm(1 + r * jcp.ur_w * jcp.nb_oc_blocking + jj); + + for (int oc = 0; oc < tail_size; oc++) { + size_t o_off; + if (jcp.with_dw_conv) + o_off = jj * jcp.oc_block + oc + r * (jcp.oc_block / 2); + else + o_off = jj * jcp.oc * jcp.ngroups + r * (jcp.oc_block / 2) + oc; + + store_dst(ptr[reg_output + o_off * jcp.typesize_out], vmm_dst, true); + + if (isa == sse42) { + psrldq(vmm_dst, jcp.typesize_out); + } else { + vperm2i128(ymm_tmp, ymm_dst, ymm_dst, 0x01); + vpalignr(ymm_dst, vmm_tmp, ymm_dst, jcp.typesize_out); + } + } + } + } else { + for (int ii = 0; ii < oc_blocks; ii++) { + for (int jj = 0; jj < ur_w; jj++) { + Vmm vmm_dst = Vmm(1 + r * jcp.ur_w * jcp.nb_oc_blocking + ur_w * ii + jj); + + size_t o_off; + if (jcp.with_dw_conv) + o_off = ((size_t) ii * jcp_dw_conv.kh * jcp.ow + jj) * jcp.oc_block + + r * (jcp.oc_block / 2); + else + o_off = ii * jcp.oc_block + jj * jcp.oc * jcp.ngroups + r * (jcp.oc_block / 2); + + store_dst(ptr[reg_output + o_off * jcp.typesize_out], vmm_dst, false); + } + } + } + } + } +} + +template +inline void jit_uni_bin_conv_fwd_kernel::solve_common(int oc_blocks, int oc_step) +{ + int ur_w = jcp.ur_w; + int ur_w_tail = jcp.ur_w_tail; + int n_oi = jcp.ow / ur_w; + int iw = jcp.iw; + int kw = jcp.kw; + int dilate_w = jcp.dilate_w + 1; + int str_w = jcp.stride_w; + + int nbits = 8; + const int inp_mult = div_up(jcp.ic, nbits); + const int out_mult = jcp.with_dw_conv ? jcp.oc_block : jcp.with_binarization ? div_up(jcp.oc, nbits) : jcp.oc; + + int l_pad = jcp.l_pad; + int r_pad = nstl::max(0, (jcp.ow - 1) * str_w + (kw - 1) * dilate_w + - (iw + l_pad - 1)); + int r_pad1 = (ur_w * n_oi - 1) * str_w + (kw - 1) * dilate_w + - (iw + l_pad - 1); + if (r_pad1 > 0) n_oi--; + + mov(reg_input, reg_input_base); + mov(reg_output, reg_output_base); + + push(reg_input_base); + push(reg_output_base); + push(reg_oc_work); + push(reg_oc_off); + + if (l_pad > 0) { + n_oi--; + if (n_oi < 0 && r_pad1 > 0) + width_blk_step(ur_w, l_pad, r_pad1, oc_blocks, oc_step); // "lrpad" + else + width_blk_step(ur_w, l_pad, 0, oc_blocks, oc_step); // "lpad" + add(reg_input, jcp.typesize_in * (ur_w * str_w - l_pad) * inp_mult); + add(reg_output, jcp.typesize_out * ur_w * out_mult); + } + + Label ow_loop_label; + xor_(oi_iter, oi_iter); + + if (n_oi > 0) { + L(ow_loop_label); + + width_blk_step(ur_w, 0, 0, oc_blocks, oc_step); // "middle" + add(reg_input, jcp.typesize_in * ur_w * str_w * inp_mult); + add(reg_output, jcp.typesize_out * ur_w * out_mult); + + inc(oi_iter); + cmp(oi_iter, n_oi); + jl(ow_loop_label, T_NEAR); + } + + if (r_pad1 > 0 && n_oi >=0) { + width_blk_step(ur_w, 0, r_pad1, oc_blocks, oc_step); // "rpad" + add(reg_input, jcp.typesize_in * ur_w * str_w * inp_mult); + add(reg_output, jcp.typesize_out * ur_w * out_mult); + } + + if (ur_w_tail != 0) + width_blk_step(ur_w_tail, 0, r_pad, oc_blocks, oc_step); // "tail" + + pop(reg_oc_off); + pop(reg_oc_work); + pop(reg_output_base); + pop(reg_input_base); +} + +template +void jit_uni_bin_conv_fwd_kernel::generate() +{ + const auto &p = attr_.post_ops_; + int end_idx = jcp.with_dw_conv ? 
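Aside, not part of the patch: solve_common() above tiles the output width as [lpad | middle * n_oi | rpad | tail]. A dedicated variant handles the first tile when l_pad > 0, r_pad1 peels the last full tile off the middle loop when it sees right padding, and ur_w_tail covers the remainder. The padding arithmetic, extracted into a standalone helper:

```cpp
#include <algorithm>
// Right padding implied by the geometry: how far the last kernel application
// reaches past the input edge (matches the r_pad formula in solve_common).
int right_pad(int ow, int str_w, int kw, int dilate_w, int iw, int l_pad) {
    return std::max(0, (ow - 1) * str_w + (kw - 1) * dilate_w
            - (iw + l_pad - 1));
}
```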
p.find(primitive_kind::convolution) : p.len_; + for (int i = 0; i < end_idx; i++) { + auto &post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + eltwise_injectors.push_back(new jit_uni_eltwise_injector_f32( + this, + post_op.eltwise.alg, + post_op.eltwise.alpha, + post_op.eltwise.beta + )); + } else if (post_op.is_depthwise()) { + depthwise_injectors.push_back(new jit_uni_depthwise_injector_f32( + this, + post_op.depthwise.alg + )); + } + } + + this->preamble(); + + mov(reg_input_base, ptr[this->param1 + GET_OFF(src)]); + mov(reg_output_base, ptr[this->param1 + GET_OFF(dst)]); + mov(reg_kernel_base, ptr[this->param1 + GET_OFF(filt)]); + + mov(reg_kh, ptr[this->param1 + GET_OFF(kh_padding)]); + mov(reg_oc_work, ptr[this->param1 + GET_OFF(oc_work)]); + + mov(reg_oc_off, ptr[param1 + GET_OFF(oc_off)]); + mov(reg_table, l_table); + + Label main_loop_label; + Label tail_label; + Label exit_label; + + cmp(reg_oc_work, jcp.nb_oc_blocking * jcp.oc_block); + jne(main_loop_label, T_NEAR); + + solve_common(jcp.nb_oc_blocking, jcp.oc_block); + + sub(reg_oc_work, jcp.nb_oc_blocking * jcp.oc_block); + + jmp(exit_label, T_NEAR); + + int nbits = 8; + + L(main_loop_label); { + cmp(reg_oc_work, jcp.oc_block); + jl(tail_label, T_NEAR); + + solve_common(1, jcp.oc_block); + + sub(reg_oc_work, jcp.oc_block); + add(reg_kernel_base, jcp.oc_block * jcp.nb_ic * jcp.kh * jcp.kw * div_up(jcp.ic_block, nbits) * jcp.typesize_in); + + if (jcp.with_dw_conv) { + add(reg_output_base, jcp.oc_block * jcp_dw_conv.kh * jcp.ow * jcp.typesize_out); + } else { + if (jcp.with_binarization) + add(reg_output_base, jcp.typesize_out); + else + add(reg_output_base, jcp.oc_block * jcp.typesize_out); + } + + add(reg_oc_off, jcp.oc_block * sizeof(float)); + + jmp(main_loop_label, T_NEAR); + } + + L(tail_label); + + if (jcp.oc % jcp.oc_block != 0) + solve_common(1, jcp.oc % jcp.oc_block); + + L(exit_label); + + this->postamble(); + + prepare_table(); + + for (auto& inj : eltwise_injectors) + inj->prepare_table(); +} + +template +void jit_uni_bin_conv_fwd_kernel::prepare_table() { + const unsigned int cvals[] = { + 0x02010100, // 0 1 1 2 + 0x03020201, // 1 2 2 3 + 0x03020201, // 1 2 2 3 + 0x04030302, // 2 3 3 4 + 0x02010100, // 0 1 1 2 + 0x03020201, // 1 2 2 3 + 0x03020201, // 1 2 2 3 + 0x04030302, // 2 3 3 4 + 0x0f0f0f0f, + 0x000000ff, + 0xc0000000, // -2.0f + 0x01010101, + 0x00010001 + }; + + align(64); + L(l_table); + // offset = 0 + for (size_t d = 0; d < 8; ++d) { + dd(cvals[d % 8]); + } + // offset = 32 + for (size_t d = 0; d < 8; ++d) { + dd(cvals[8]); + } + // offset = 64 + for (size_t d = 0; d < 8; ++d) { + dd(cvals[9]); + } + // offset = 96 + for (size_t d = 0; d < 8; ++d) { + dd(cvals[10]); + } + + // offset = 128 + for (size_t d = 0; d < 8; ++d) { + dd(float2int(jcp.ic * jcp.kw * jcp.kh)); + } + + // offset = 160 + for (size_t d = 0; d < 8; ++d) { + dd(cvals[11]); + } + // offset = 192 + for (size_t d = 0; d < 8; ++d) { + dd(cvals[12]); + } + // offset = 224 + for (size_t d = 0; d < 8; ++d) { + uint32_t mask = 0xffffffff >> (jcp.ic_padded - jcp.ic); + dd(mask); + } + // offset = 256 + for (size_t d = 0; d < 8; ++d) { + uint32_t val = jcp.pad_value == 1.0f ? 
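Aside, not part of the patch: for reference, the layout of the constant table that prepare_table() emits above, eight dwords (32 bytes) per row:

```cpp
// +0    nibble-popcount LUT, repeated across both 128-bit lanes
// +32   0x0f0f0f0f  low-nibble mask for the pshufb lookups
// +64   0x000000ff  byte mask
// +96   0xc0000000  (-2.0f) scale      +128  float(ic*kw*kh) shift
// +160  0x01010101  vpmaddubsw ones    +192  0x00010001  vpmaddwd ones
// +224  0xffffffff >> (ic_padded-ic)   tail mask for the last ic block
// +256  pad-value bit pattern: all ones when pad_value == 1.0f, else zeros
```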
0xffffffff : 0x00000000; + dd(val); + } +} + +template +bool jit_uni_bin_conv_fwd_kernel::post_ops_ok(jit_bin_conv_conf_t &jcp, const primitive_attr_t &attr) { + const auto &p = attr.post_ops_; + + auto is_eltwise = [&](int idx) { return p.entry_[idx].is_eltwise(); }; + auto is_depthwise = [&](int idx) { return p.entry_[idx].is_depthwise(); }; + auto is_sum = [&](int idx) { return p.entry_[idx].is_sum(); }; + auto is_dw_conv = [&](int idx) { return p.entry_[idx].is_dw_conv(); }; + auto is_simple = [&](int idx) { return is_eltwise(idx) || is_depthwise(idx); }; + auto is_binarization = [&](int idx) { return p.entry_[idx].is_binarization(); }; + + switch (p.len_) { + case 0: return true; // no post_ops + case 1: + return (is_simple(0) || is_sum(0) || is_dw_conv(0) || is_binarization(0)); + case 2: + return ((is_sum(0) && is_simple(1)) || (is_dw_conv(0) && is_simple(1)) || + (is_simple(0) && is_dw_conv(1)) || (is_dw_conv(0) && is_sum(1)) || + (is_simple(0) && is_simple(1)) || (is_simple(0) && is_binarization(1)) || + (is_dw_conv(0) && is_binarization(1)) || (is_simple(0) && is_sum(1))); + case 3: + return ((is_simple(0) && is_dw_conv(1) && is_simple(2)) || + (is_dw_conv(0) && is_sum(1) && is_simple(2)) || + (is_sum(0) && is_simple(1) && is_simple(2)) || + (is_simple(0) && is_sum(1) && is_simple(2)) || + (is_simple(0) && is_dw_conv(1) && is_binarization(2)) || + (is_simple(0) && is_simple(1) && is_dw_conv(2))); + case 4: return ((is_simple(0) && is_dw_conv(1) && is_sum(2) && is_simple(3)) || + (is_simple(0) && is_dw_conv(1) && is_simple(2) && is_binarization(3)) || + (is_simple(0) && is_simple(1) && is_dw_conv(2) && is_binarization(3)) || + (is_simple(0) && is_simple(1) && is_simple(2) && is_binarization(3)) || + (is_simple(0) && is_simple(1) && is_dw_conv(2) && is_simple(3))); + default: return false; + } + + return false; +} + +template +status_t jit_uni_bin_conv_fwd_kernel::init_conf(jit_bin_conv_conf_t &jcp, + const binary_convolution_desc_t &cd, const memory_desc_wrapper &src_d, + const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d, const primitive_attr_t &attr) +{ + if (!mayiuse(isa)) return status::unimplemented; + + jcp.prop_kind = cd.prop_kind; + + jcp.dst_dt = cd.dst_desc.data_type; + + const bool with_groups = weights_d.ndims() == src_d.ndims() + 1; + + jcp.ngroups = with_groups ? weights_d.dims()[0] : 1; + + if (jcp.ngroups != 1) + return status::unimplemented; + + jcp.mb = src_d.dims()[0]; + + int simd_w = isa == avx512_common ? 
16 : 8; + + jcp.ic = src_d.dims()[1] / jcp.ngroups; + jcp.oc = dst_d.dims()[1] / jcp.ngroups; + + jcp.oc_padded = rnd_up(jcp.oc, simd_w); + + jcp.ih = src_d.dims()[2]; + jcp.iw = src_d.dims()[3]; + jcp.oh = dst_d.dims()[2]; + jcp.ow = dst_d.dims()[3]; + + jcp.kh = weights_d.dims()[with_groups + 2]; + jcp.kw = weights_d.dims()[with_groups + 3]; + + jcp.t_pad = cd.padding[0][0]; + jcp.l_pad = cd.padding[0][1]; + + jcp.stride_h = cd.strides[0]; + jcp.stride_w = cd.strides[1]; + + jcp.dilate_h = cd.dilates[0]; + jcp.dilate_w = cd.dilates[1]; + + jcp.src_fmt = src_d.format(); + + if (!post_ops_ok(jcp, attr)) + return status::unimplemented; + + jcp.pad_value = cd.pad_value; + jcp.exclude_pad = jcp.pad_value == 0.0f; + + const auto &p = attr.post_ops_; + int dw_conv_ind = p.find(primitive_kind::convolution); + jcp.with_dw_conv = dw_conv_ind != -1; + if (jcp.with_dw_conv) { + jcp.dw_conv_oh = jcp.oh; + jcp.dw_conv_ow = jcp.ow; + jcp.oh = p.entry_[dw_conv_ind].dw_conv.in_h; + jcp.ow = p.entry_[dw_conv_ind].dw_conv.in_w; + } + jcp.with_sum = p.find(primitive_kind::sum, 0, dw_conv_ind) != -1; + jcp.with_binarization = p.find(primitive_kind::binarization, 0, dw_conv_ind) != -1; + + if (with_groups) + return status::unimplemented; + + auto desired_weights_format = isa == avx512_common ? OhIw16o32i : OhIw8o32i; + bool args_ok = true + && src_d.format() == nhwc + && weights_d.format() == desired_weights_format + && dst_d.format() == nhwc; + if (!args_ok) return status::unimplemented; + + jcp.ur_h = 1; /* no code-unrolling by h so far */ + jcp.ur_w = 2; + if (jcp.ow < jcp.ur_w) jcp.ur_w = jcp.ow; + jcp.ur_w_tail = jcp.ow % jcp.ur_w; + + jcp.nb_oc_blocking = isa == sse42 ? 2 : 4; /* the optimal value for the kernel */ + + args_ok = true + && jcp.l_pad <= jcp.ur_w + && IMPLICATION(jcp.kw > 7, (jcp.t_pad == 0 && jcp.l_pad == 0) + || (jcp.stride_w == 1 && jcp.stride_h == 1)); + if (!args_ok) return status::unimplemented; + + int r_pad_no_tail = nstl::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w + + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1)); + + if (r_pad_no_tail > jcp.ur_w) { + /* recalculate ur_w, nb_oc_blocking and ur_w_tail */ + jcp.ur_w = r_pad_no_tail + 1; + jcp.nb_oc_blocking = ((16 - 1)-jcp.ur_w)/jcp.ur_w; + jcp.ur_w_tail = jcp.ow % jcp.ur_w; + /* check again ... */ + r_pad_no_tail = nstl::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w + + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1)); + if ((r_pad_no_tail > jcp.ur_w) || (jcp.ow < jcp.ur_w)) + return status::unimplemented; + } + if (jcp.l_pad > jcp.ur_w) return status::unimplemented; + + jcp.ic_block = 32; + jcp.nb_ic = div_up(jcp.ic, jcp.ic_block); + jcp.ic_padded = rnd_up(jcp.ic, jcp.ic_block); + + jcp.oc_block = simd_w; + jcp.nb_oc = div_up(jcp.oc, jcp.oc_block); + + jcp.nb_ic_blocking = 1; + + jcp.src_dt = cd.src_desc.data_type; + jcp.bia_dt = mkldnn_f32; + jcp.dst_dt = jcp.with_binarization ? 
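Aside, not part of the patch: the blocking constants above follow from the bit packing. ic_block = 32 means one 32-bit word of packed input-channel bits per block, and the OhIw8o32i / OhIw16o32i weight layouts interleave simd_w output channels with those 32 packed input bits. The ubiquitous div_up handles the ragged bit and block counts:

```cpp
// Ceiling division, as used for nb_ic = div_up(ic, 32) and for byte counts
// such as div_up(ic, 8) when addressing packed rows.
int div_up(int a, int b) { return (a + b - 1) / b; }
```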
mkldnn_bin : mkldnn_f32; + + jcp.typesize_in = types::data_type_size(jcp.src_dt); + jcp.typesize_out = types::data_type_size(jcp.dst_dt); + jcp.typesize_acc = sizeof(int32_t); + + return status::success; +} + +template +void jit_uni_bin_conv_fwd_kernel::init_scratchpad( + memory_tracking::registrar_t &scratchpad, const jit_bin_conv_conf_t &jcp, const jit_conv_conf_t &jcp_dw_conv) { + if (jcp.with_dw_conv) { + const int nthreads = mkldnn_get_max_threads(); + size_t dw_conv_buffer_size_ = (size_t)jcp_dw_conv.kh * jcp_dw_conv.iw * jcp_dw_conv.ch_block * jcp.nb_oc_blocking; + scratchpad.book(key_dw_conv_buffer, sizeof(float) * dw_conv_buffer_size_ * nthreads); + + if (jcp.oc != jcp.oc_padded) + scratchpad.book(key_dw_conv_padded_bias, sizeof(float) * jcp.oc_padded); + } +} + +template struct jit_uni_bin_conv_fwd_kernel; +template struct jit_uni_bin_conv_fwd_kernel; +template struct jit_uni_bin_conv_fwd_kernel; + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_bin_conv_kernel.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_bin_conv_kernel.hpp new file mode 100644 index 0000000..83f6f6a --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_bin_conv_kernel.hpp @@ -0,0 +1,140 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef JIT_UNI_BIN_CONV_KERNEL_HPP +#define JIT_UNI_BIN_CONV_KERNEL_HPP + +#include "c_types_map.hpp" +#include "jit_generator.hpp" +#include "jit_primitive_conf.hpp" +#include "cpu_memory.hpp" +#include "jit_uni_eltwise.hpp" +#include "jit_uni_depthwise.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +template +struct jit_uni_bin_conv_fwd_kernel: public jit_generator { + jit_uni_bin_conv_fwd_kernel(jit_bin_conv_conf_t ajcp, jit_conv_conf_t ajcp_dw_conv, + const primitive_attr_t &attr): jcp(ajcp), jcp_dw_conv(ajcp_dw_conv), attr_(attr) + { + this->generate(); + jit_ker = (void (*)(jit_conv_call_s *))this->getCode(); + } + + ~jit_uni_bin_conv_fwd_kernel() { + for (auto inj : eltwise_injectors) + delete inj; + eltwise_injectors.clear(); + + for (auto inj : depthwise_injectors) + delete inj; + depthwise_injectors.clear(); + } + + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_bin_conv_fwd_kernel) + + static bool post_ops_ok(jit_bin_conv_conf_t &jcp, const primitive_attr_t &attr); + static status_t init_conf(jit_bin_conv_conf_t &jcp, + const binary_convolution_desc_t &cd, const memory_desc_wrapper &src_d, + const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d, const primitive_attr_t &attr); + static void init_scratchpad( + memory_tracking::registrar_t &scratchpad, const jit_bin_conv_conf_t &jcp, const jit_conv_conf_t &jcp_dw_conv); + + jit_bin_conv_conf_t jcp; + jit_conv_conf_t jcp_dw_conv; + const primitive_attr_t &attr_; + void (*jit_ker)(jit_conv_call_s *); + +private: + using Vmm = typename utils::conditional3::type; + using Ymm = const Xbyak::Ymm; + using reg64_t = const Xbyak::Reg64; + using reg32_t = const Xbyak::Reg32; + using reg8_t = const Xbyak::Reg8; + + reg64_t reg_input = r13; + reg64_t reg_output = rbp; + reg64_t reg_input_base = rax; + reg64_t aux_reg_input = r8; + reg64_t reg_kernel_base = rdx; + reg64_t aux_reg_kernel = r9; + reg64_t reg_output_base = rsi; + reg64_t aux1_reg_input = reg_input_base; + reg64_t aux1_reg_kernel = reg_output_base; + + reg64_t kj = r10; + reg64_t oi_iter = r11; + reg64_t reg_kh = abi_not_param1; + reg64_t reg_overflow = reg_kh; + reg64_t reg_oc_work = r14; + reg64_t reg_table = r15; + reg64_t reg_icb_iter = reg_oc_work; + + reg32_t reg_tmp_32 = r12d; + reg64_t reg_tmp_64 = r12; + reg8_t reg_tmp_8 = r12b; + + reg64_t reg_d_weights = aux_reg_input; + reg64_t reg_d_bias = aux_reg_kernel; + reg64_t reg_oc_off = kj; + reg64_t reg_tmp2_64 = reg_oc_off; + reg32_t reg_tmp2_32 = reg_oc_off.cvt32(); + + reg64_t reg_b_weights = aux_reg_input; + reg64_t reg_b_mask = aux_reg_kernel; + + reg64_t reg_shift = aux_reg_input; + + Vmm vmm_scale = Vmm(14); + Vmm vmm_shift = Vmm(15); + Vmm vmm_sum = Vmm(10); + Vmm vmm_lookup = Vmm(12); + Vmm vmm_mask = Vmm(13); + Vmm vmm_one_u8 = Vmm(14); + Vmm vmm_one_s16 = Vmm(15); + Ymm ymm_tmp = Ymm(10); + Vmm vmm_tmp = Vmm(10); + Vmm vmm_tmp1 = Vmm(11); + Vmm vmm_src = Vmm(0); + Vmm vmm_tmp2 = Vmm(9); + Vmm vmm_thr = Vmm(10); + + Xbyak::Label l_table; + + nstl::vector*> eltwise_injectors; + nstl::vector*> depthwise_injectors; + + inline void cvt2ps(data_type_t type_in, Vmm vmm_in, const Xbyak::Operand &op, bool scalar_load); + inline void store_dst(const Xbyak::Address &op, Vmm vmm_dst, bool scalar_store); + inline void apply_filter(int ur_w, int pad_l, int pad_r, int oc_blocks, int oc_step, int ic_blocks, bool last_icb, bool h_padded); + inline void oh_step_unroll_kw(int ur_w, int pad_l, int pad_r, int oc_blocks, int 
oc_step, bool h_padded); + inline void kh_loop(int ur_w, int pad_l, int pad_r, int oc_blocks, int oc_step); + inline void width_blk_step(int ur_w, int pad_l, int pad_r, int oc_blocks, int oc_step); + inline void solve_common(int oc_blocks, int oc_step); + inline void prepare_table(); + + void generate(); +}; + +} +} +} + +#endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_binarization.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_binarization.cpp new file mode 100644 index 0000000..be3b284 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_binarization.cpp @@ -0,0 +1,276 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "mkldnn_types.h" +#include "mkldnn_thread.hpp" +#include "nstl.hpp" +#include "utils.hpp" +#include "jit_uni_binarization.hpp" + +#define GET_OFF(field) offsetof(jit_args, field) + +namespace mkldnn { +namespace impl { +namespace cpu { + +using namespace Xbyak; +using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::utils; + +struct jit_args { + const float* from; + const uint8_t* to; + const float* weights; + size_t work_amount; +}; + +struct jit_uni_binarization_kernel_f32 : public c_compatible { + const binarization_desc_t &desc_; + void (*ker_)(const jit_args *); + + void operator()(const jit_args *args) { assert(ker_); ker_(args); } + + jit_uni_binarization_kernel_f32(const binarization_desc_t &desc) + : desc_(desc), ker_(nullptr) {} + virtual ~jit_uni_binarization_kernel_f32() {} +}; + +/* jit kernels */ +namespace { + +template +struct jit_uni_bin_depthwise_kernel_f32 : public jit_uni_binarization_kernel_f32, + public jit_generator +{ + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_bin_depthwise_kernel_f32) + jit_uni_bin_depthwise_kernel_f32(const binarization_desc_t &desc) + : jit_uni_binarization_kernel_f32(desc), jit_generator() { + assert(desc.alg_kind == alg_kind::binarization_depthwise); + assert(isa == sse42 || isa == avx2 || isa == avx512_common); + + this->preamble(); + + mov(reg_from, ptr[param + GET_OFF(from)]); + mov(reg_to, ptr[param + GET_OFF(to)]); + mov(reg_weights, ptr[param + GET_OFF(weights)]); + mov(reg_work_amount, ptr[param + GET_OFF(work_amount)]); + + const int nbits = 8; + int simd_w = isa == avx512_common ? 16 : 8; + const int C = desc.src_desc.dims[1]; + const int tail_size = C % simd_w; + + Label unrolled_loop_label; + Label main_loop_label; + Label tail_label; + Label exit_label; + + L(unrolled_loop_label); { + int step = isa == sse42 ? nbits / 2 : isa == avx2 ? nbits : 2 * nbits; + const int ur_ch = isa == sse42 ? nbits : isa == avx2 ? 
nbits / 2 : nbits / 4; + const int unrolled_loop_step = ur_ch * step; + + cmp(reg_work_amount, unrolled_loop_step); + jl(main_loop_label, T_NEAR); + + xor_(reg_bin_32, reg_bin_32); + for (int ch = 0; ch < ur_ch; ch++) { + uni_vmovups(vmm_src(0), ptr[reg_from + ch*step*sizeof(float)]); + uni_vmovups(vmm_wei(0), ptr[reg_weights + ch*step*sizeof(float)]); + if (isa == avx512_common) { + vcmpps(k_mask, vmm_src(0), vmm_wei(0), _cmp_gt_os); + kmovw(reg_src_32, k_mask); + } else { + uni_vcmpgtps(vmm_src(0), vmm_src(0), vmm_wei(0)); + uni_vmovmskps(reg_src_32, vmm_src(0)); + } + shl(reg_src_32, ch * step); + or_(reg_bin_32, reg_src_32); + } + mov(ptr[reg_to], reg_bin_32); + + add(reg_from, unrolled_loop_step*sizeof(float)); + add(reg_weights, unrolled_loop_step*sizeof(float)); + add(reg_to, sizeof(uint32_t)); + sub(reg_work_amount, unrolled_loop_step); + + jmp(unrolled_loop_label, T_NEAR); + } + + L(main_loop_label); { + int repeats = isa == sse42 ? 2 : 1; + int step = isa == sse42 ? nbits / 2 : isa == avx2 ? nbits : nbits * 2; + const int main_loop_step = step * repeats; + + cmp(reg_work_amount, main_loop_step); + jl(tail_label, T_NEAR); + + xor_(reg_bin_32, reg_bin_32); + for (int i = 0; i < repeats; i++) { + uni_vmovups(vmm_src(0), ptr[reg_from + i*step*sizeof(float)]); + uni_vmovups(vmm_wei(0), ptr[reg_weights + i*step*sizeof(float)]); + if (isa == avx512_common) { + vcmpps(k_mask, vmm_src(0), vmm_wei(0), _cmp_gt_os); + kmovw(reg_src_32, k_mask); + } else { + uni_vcmpgtps(vmm_src(0), vmm_src(0), vmm_wei(0)); + uni_vmovmskps(reg_src_32, vmm_src(0)); + } + shl(reg_src_32, i * step); + or_(reg_bin_32, reg_src_32); + } + if (isa == avx512_common) + mov(ptr[reg_to], reg_bin_16); + else + mov(ptr[reg_to], reg_bin_8); + + add(reg_from, main_loop_step*sizeof(float)); + add(reg_weights, main_loop_step*sizeof(float)); + add(reg_to, isa == avx512_common ? 
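Aside, not part of the patch: the standalone binarization kernel above walks each pixel's channel run in three tiers, all producing packed bits:

```cpp
// unrolled: ur_ch vector compares, masks shifted/OR-ed into one 32-bit store
//           (ur_ch * step equals 32 channels per iteration on every ISA);
// main:     one compare per repeat -> 8 bits (sse42/avx2 via movmskps) or
//           16 bits (avx512 via kmovw), stored as a byte or a word;
// tail:     C % simd_w scalar compares, OR-ed into the result bit by bit.
```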
sizeof(uint16_t) : sizeof(uint8_t)); + sub(reg_work_amount, main_loop_step); + + jmp(main_loop_label, T_NEAR); + } + + L(tail_label); { + if (tail_size != 0) { + xor_(reg_bin_32, reg_bin_32); + for (int c = 0; c < tail_size; c++) { + uni_vpxor(xmm_src(0), xmm_src(0), xmm_src(0)); + uni_vpxor(xmm_wei(0), xmm_wei(0), xmm_wei(0)); + + movss(xmm_src(0), ptr[reg_from + c * sizeof(float)]); + movss(xmm_wei(0), ptr[reg_weights + c * sizeof(float)]); + uni_vcmpgtps(xmm_src(0), xmm_src(0), xmm_wei(0)); + uni_vmovmskps(reg_src_32, xmm_src(0)); + + shl(reg_src_32, c); + or_(reg_bin_32, reg_src_32); + } + if (isa == avx512_common && tail_size > nbits) + mov(ptr[reg_to], reg_bin_16); + else + mov(ptr[reg_to], reg_bin_8); + } + } + + L(exit_label); + + this->postamble(); + + ker_ = (decltype(ker_))this->getCode(); + } + +private: + using Vmm = typename utils::conditional3::type; + + inline Vmm vmm_src(int idx) { return Vmm(idx); } + inline Xmm xmm_src(int idx) { return Xmm(idx); } + inline Vmm vmm_wei(int idx) { return Vmm(idx + 4); } + inline Xmm xmm_wei(int idx) { return Xmm(idx + 4); } + + Reg64 param = abi_param1; + Reg64 reg_from = r8; + Reg64 reg_to = r9; + Reg64 reg_work_amount = r10; + Reg64 reg_weights = r11; + Reg16 reg_bin_16 = r12w; + Reg32 reg_bin_32 = r12d; + Reg8 reg_bin_8 = r12b; + Reg32 reg_src_32 = r13d; + Reg64 reg_src_64 = r13; + + const unsigned char _cmp_gt_os = 6; + Xbyak::Opmask k_mask = Xbyak::Opmask(1); +}; + +} /* namespace */ + +template +status_t jit_uni_binarization_fwd_t::pd_t::init() { + using namespace alg_kind; + + auto desired_fmt = nhwc; + + assert(engine()->kind() == engine_kind::cpu); + bool ok = true && mayiuse(isa) + && utils::one_of(desc()->prop_kind, prop_kind::forward_training, prop_kind::forward_inference) + && utils::everyone_is(data_type::f32, desc()->src_desc.data_type, desc()->weights_desc.data_type) + && utils::everyone_is(data_type::bin, desc()->dst_desc.data_type) + && desc()->src_desc.format == desc()->dst_desc.format + && utils::one_of(desc()->src_desc.format, desired_fmt) + && utils::one_of(desc()->dst_desc.format, desired_fmt) + && utils::one_of(desc()->weights_desc.format, x) + && attr()->has_default_values(); + + return ok ? 
status::success : status::unimplemented; +} + +template +jit_uni_binarization_fwd_t::jit_uni_binarization_fwd_t(const pd_t *apd, + const input_vector &inputs, const output_vector &outputs) + : cpu_primitive_t(apd, inputs, outputs), kernel_(nullptr) { + const auto &desc = *pd()->desc(); + switch (desc.alg_kind) { + case alg_kind::binarization_depthwise: + kernel_ = new jit_uni_bin_depthwise_kernel_f32(desc); break; + default: assert(!"unknown binarization alg_kind"); + } +} + +template +jit_uni_binarization_fwd_t::~jit_uni_binarization_fwd_t() { + delete kernel_; +} + +template +void jit_uni_binarization_fwd_t::execute_forward() const { + auto src = reinterpret_cast(this->input_memory(0)); + auto weights = reinterpret_cast(this->input_memory(1)); + auto dst = reinterpret_cast(this->memory()); + + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + + const int N = src_d.dims()[0]; + const int C = src_d.dims()[1]; + const int H = src_d.dims()[2]; + const int W = src_d.dims()[3]; + + int nbits = 8; + + parallel_nd(N, H, W, + [&](int n, int h, int w) { + auto arg = jit_args(); + + arg.from = &src[src_d.blk_off(n, 0, h, w)]; + arg.to = &dst[dst_d.blk_off(n, 0, h, w) / nbits]; + arg.weights = &weights[weights_d.blk_off(0)]; + arg.work_amount = (size_t)C; + + (*kernel_)(&arg); + }); +} + +template struct jit_uni_binarization_fwd_t; +template struct jit_uni_binarization_fwd_t; +template struct jit_uni_binarization_fwd_t; + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_binarization.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_binarization.hpp new file mode 100644 index 0000000..1c29a3e --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_binarization.hpp @@ -0,0 +1,73 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
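Aside, not part of the patch: because the required layout is nhwc, each (n, h, w) pixel owns a contiguous run of C floats, so execute_forward() above can hand whole pixels to the kernel independently and divide the packed destination offset by nbits. A sketch of the offsets involved; the helper names are hypothetical:

```cpp
#include <cstddef>
// One nhwc pixel: C contiguous floats in, div_up(C, 8) packed bytes out.
size_t src_pixel_off(size_t n, size_t h, size_t w,
        size_t C, size_t H, size_t W) {
    return ((n * H + h) * W + w) * C;
}
size_t dst_pixel_byte(size_t elem_off) { return elem_off / 8; } // bit-packed
```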
+*******************************************************************************/ + +#ifndef CPU_JIT_UNI_BINARIZATION_HPP +#define CPU_JIT_UNI_BINARIZATION_HPP + +#include + +#include "c_types_map.hpp" +#include "cpu_binarization_pd.hpp" +#include "cpu_engine.hpp" +#include "type_helpers.hpp" +#include "utils.hpp" +#include "jit_primitive_conf.hpp" +#include "jit_generator.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +struct jit_uni_binarization_kernel_f32; + +template +struct jit_uni_binarization_fwd_t : public cpu_primitive_t { + struct pd_t : public cpu_binarization_fwd_pd_t { + pd_t(engine_t *engine, const binarization_desc_t *adesc, + const primitive_attr_t *attr, + const binarization_fwd_pd_t *hint_fwd_pd) + : cpu_binarization_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) {} + + DECLARE_COMMON_PD_T( + JIT_IMPL_NAME_HELPER("jit:", isa, ""), + jit_uni_binarization_fwd_t); + + virtual status_t init() override; + }; + + jit_uni_binarization_fwd_t(const pd_t *apd, const input_vector &inputs, + const output_vector &outputs); + ~jit_uni_binarization_fwd_t(); + + typedef typename prec_traits::type src_data_t; + + virtual void execute(event_t *e) const + { + execute_forward(); + e->set_state(event_t::ready); + } + +private: + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + jit_uni_binarization_kernel_f32 *kernel_; +}; + +} +} +} + +#endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_binary_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_binary_convolution.cpp new file mode 100644 index 0000000..fa9f0d9 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_binary_convolution.cpp @@ -0,0 +1,251 @@ + /******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include +#include "mkldnn_types.h" + +#include "c_types_map.hpp" +#include "jit_uni_binary_convolution.hpp" +#include "utils.hpp" +#include "mkldnn_thread.hpp" +#include "type_helpers.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +using namespace mkldnn::impl::status; +using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; +using namespace mkldnn::impl::utils; + +template +void jit_uni_binary_convolution_fwd_t::execute_forward() const { + auto src = reinterpret_cast(this->input_memory(0)); + auto weights = reinterpret_cast(this->input_memory(1)); + auto dst_u8 = reinterpret_cast(this->memory()); + auto dst_f32 = reinterpret_cast(this->memory()); + + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + + const auto &jcp = kernel_->jcp; + const int MB = pd()->MB(); + + int ocb_work = div_up(jcp.nb_oc, jcp.nb_oc_blocking); + const size_t work_amount = MB * jcp.ngroups * ocb_work * jcp.oh; + + int nbits = 8; + + auto ker = [&](const int ithr, const int nthr) { + size_t start{0}, end{0}; + balance211(work_amount, nthr, ithr, start, end); + + size_t n{0}, g{0}, ocbb{0}, oh{0}; + nd_iterator_init(start, n, MB, g, jcp.ngroups, ocbb, ocb_work, oh, jcp.oh); + for (size_t iwork = start; iwork < end; ++iwork) { + int ocb = ocbb * jcp.nb_oc_blocking; + int ocb_num = jcp.nb_oc_blocking; + + auto par_conv = jit_conv_call_s(); + + const int ij = oh * jcp.stride_h; + const int i_t_overflow = nstl::min(jcp.kh, div_up(nstl::max(0, jcp.t_pad - ij), (jcp.dilate_h+1))); + const int i_b_overflow = nstl::min(jcp.kh, div_up(nstl::max(jcp.ih, ij + (jcp.kh-1) * (jcp.dilate_h+1) - + jcp.t_pad+1) - jcp.ih, (jcp.dilate_h + 1))); + + const size_t _oc = g * jcp.nb_oc + ocb; + const size_t _ic = g * jcp.nb_ic; + + const int ih = nstl::max(ij - jcp.t_pad + i_t_overflow * (jcp.dilate_h + 1), 0); + par_conv.src = &src[src_d.blk_off(n, _ic*jcp.ic_block, ih, 0) / nbits]; + + if (jcp.with_binarization) { + par_conv.dst = &dst_u8[dst_d.blk_off(n, _oc*jcp.oc_block, oh, 0) / nbits]; + } else { + par_conv.dst = &dst_f32[dst_d.blk_off(n, _oc*jcp.oc_block, oh, 0)]; + } + + const int wh = jcp.exclude_pad ? 
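Aside, not part of the patch: the i_t_overflow / i_b_overflow values computed above count the kernel taps that fall off the top and bottom of the input for a given output row; the kernel then receives kh_padding = kh - both, and when exclude_pad is set the weights pointer is advanced past the clipped taps (the wh offset). The top-side formula, extracted:

```cpp
#include <algorithm>
int div_up(int a, int b) { return (a + b - 1) / b; }
// Taps clipped at the top of the image for input-row origin ij = oh*stride_h.
int top_overflow(int kh, int ij, int t_pad, int dilate_h) {
    return std::min(kh, div_up(std::max(0, t_pad - ij), dilate_h + 1));
}
```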
i_t_overflow : 0; + int widx = weights_d.blk_off(ocb, 0, wh, 0); + par_conv.filt = &weights[widx / nbits]; + + par_conv.oc_work = nstl::min((ocb + ocb_num) * jcp.oc_block, jcp.oc) - ocb*jcp.oc_block; + + par_conv.kw_padding = 0; + const int kh_padding = jcp.kh - i_t_overflow - i_b_overflow; + par_conv.kh_padding = nstl::max(0, kh_padding); + par_conv.t_overflow = i_t_overflow; + par_conv.b_overflow = i_b_overflow; + + par_conv.oc_off = _oc * jcp.oc_block * sizeof(float); + + kernel_->jit_ker(&par_conv); + + nd_iterator_step(n, MB, g, jcp.ngroups, ocbb, ocb_work, oh, jcp.oh); + } + }; + + parallel(0, ker); +} + +template +void jit_uni_binary_convolution_fwd_t::execute_forward_with_dw_conv() const { + auto src = reinterpret_cast(this->input_memory(0)); + auto weights = reinterpret_cast(this->input_memory(1)); + auto dst_u8 = reinterpret_cast(this->memory()); + auto dst_f32 = reinterpret_cast(this->memory()); + + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + + const auto &jcp = kernel_->jcp; + const auto &jcp_dw_conv = dw_conv_kernel_->jcp; + const int MB = pd()->MB(); + + auto dw_conv_bias = jcp_dw_conv.conv_biases; + auto dw_conv_weights = reinterpret_cast(jcp_dw_conv.conv_weights); + + int ocb_work = div_up(jcp.nb_oc, jcp.nb_oc_blocking); + const size_t work_amount = MB * jcp.ngroups * ocb_work * jcp.oh; + + int nbits = 8; + + auto ker = [&](const int ithr, const int nthr) { + auto compute_row_generic_conv = [&](float* ws_p, int n, int g, int ocb, int ocb_num, int oh, int num_rows) { + for (int h = 0; h < num_rows; h++) { + if ((oh + h) < 0 || (oh + h) >= jcp.oh) { + for (int chb = ocb; chb < ocb + ocb_num; chb++) { + memset(ws_p + (((oh + h) + 1) % jcp_dw_conv.kh) * jcp.ow * jcp.oc_block + + (chb - ocb) * jcp_dw_conv.kh * jcp.ow * jcp.oc_block, 0, jcp.ow * jcp.oc_block * sizeof(float)); + } + } else { + auto par_conv = jit_conv_call_s(); + + const int ij = (oh + h) * jcp.stride_h; + const int i_t_overflow = nstl::min(jcp.kh, div_up(nstl::max(0, jcp.t_pad - ij), (jcp.dilate_h+1))); + const int i_b_overflow = nstl::min(jcp.kh, div_up(nstl::max(jcp.ih, ij + (jcp.kh-1) * (jcp.dilate_h+1) - + jcp.t_pad+1) - jcp.ih, (jcp.dilate_h + 1))); + + const size_t _oc = g * jcp.nb_oc + ocb; + const size_t _ic = g * jcp.nb_ic; + + const int ih = nstl::max(ij - jcp.t_pad + i_t_overflow * (jcp.dilate_h + 1), 0); + par_conv.src = &src[src_d.blk_off(n, _ic*jcp.ic_block, ih, 0) / nbits]; + + par_conv.dst = &ws_p[(((oh + h) + 1) % jcp_dw_conv.kh) * jcp.ow * jcp.oc_block]; + + const int wh = jcp.exclude_pad ? 
i_t_overflow : 0; + int widx = weights_d.blk_off(ocb, 0, wh, 0); + par_conv.filt = &weights[widx / nbits]; + + par_conv.oc_work = nstl::min((ocb + ocb_num) * jcp.oc_block, jcp.oc) - ocb*jcp.oc_block; + + par_conv.kw_padding = 0; + const int kh_padding = jcp.kh - i_t_overflow - i_b_overflow; + par_conv.kh_padding = nstl::max(0, kh_padding); + par_conv.t_overflow = i_t_overflow; + par_conv.b_overflow = i_b_overflow; + + par_conv.oc_off = _oc * jcp.oc_block * sizeof(float); + + kernel_->jit_ker(&par_conv); + } + } + }; + + auto compute_row_dw_conv = [&](const float* ws_p, int n, int ocb, int ocb_num, int dst_idx) { + for (int chb = ocb; chb < nstl::min(ocb + ocb_num, jcp.nb_oc); chb++) { + auto par_conv_dw = jit_conv_call_s(); + + par_conv_dw.src_row0 = &ws_p[(((dst_idx+1) - 1) % jcp_dw_conv.kh) * jcp_dw_conv.iw * jcp_dw_conv.ch_block + + (chb - ocb) * jcp_dw_conv.kh * jcp_dw_conv.iw * jcp_dw_conv.ch_block]; + par_conv_dw.src_row1 = &ws_p[(((dst_idx+1) - 0) % jcp_dw_conv.kh) * jcp_dw_conv.iw * jcp_dw_conv.ch_block + + (chb - ocb) * jcp_dw_conv.kh * jcp_dw_conv.iw * jcp_dw_conv.ch_block]; + par_conv_dw.src_row2 = &ws_p[(((dst_idx+1) + 1) % jcp_dw_conv.kh) * jcp_dw_conv.iw * jcp_dw_conv.ch_block + + (chb - ocb) * jcp_dw_conv.kh * jcp_dw_conv.iw * jcp_dw_conv.ch_block]; + + if (jcp_dw_conv.with_binarization) { + int nbits = 8; + + int didx = n*jcp_dw_conv.oc*jcp_dw_conv.oh*jcp_dw_conv.ow + + dst_idx/jcp_dw_conv.stride_h*jcp_dw_conv.ow*jcp_dw_conv.oc + chb*jcp_dw_conv.ch_block; + par_conv_dw.dst = &dst_u8[didx / nbits]; + } else { + par_conv_dw.dst = &dst_f32[n*jcp_dw_conv.oc*jcp_dw_conv.oh*jcp_dw_conv.ow + + dst_idx/jcp_dw_conv.stride_h*jcp_dw_conv.ow*jcp_dw_conv.oc + chb*jcp_dw_conv.ch_block]; + } + + par_conv_dw.kh_padding = jcp_dw_conv.kh; + par_conv_dw.filt = &dw_conv_weights[chb * jcp_dw_conv.kh * jcp_dw_conv.kw * jcp_dw_conv.ch_block]; + par_conv_dw.bias = &dw_conv_bias[chb * jcp_dw_conv.ch_block]; + par_conv_dw.ur_w = (size_t)(jcp_dw_conv.ow); + par_conv_dw.oc_work = nstl::min((chb + 1) * jcp_dw_conv.ch_block, jcp_dw_conv.oc) - chb*jcp_dw_conv.ch_block; + par_conv_dw.oc_off = chb * jcp_dw_conv.ch_block * sizeof(float); + + dw_conv_kernel_->jit_ker(&par_conv_dw); + } + }; + + size_t start{0}, end{0}; + balance211(work_amount, nthr, ithr, start, end); + auto dw_conv_buffer_ = scratchpad().template get(key_dw_conv_buffer); + size_t dw_conv_buffer_size_ = (size_t)jcp_dw_conv.kh * jcp_dw_conv.iw * jcp_dw_conv.ch_block * jcp.nb_oc_blocking; + auto pbuf = dw_conv_buffer_ + ithr * dw_conv_buffer_size_; + + size_t n{0}, g{0}, ocbb{0}, oh{0}; + nd_iterator_init(start, n, MB, g, jcp.ngroups, ocbb, ocb_work, oh, jcp.oh); + for (size_t iwork = start; iwork < end; ++iwork) { + int ocb = ocbb * jcp.nb_oc_blocking; + int ocb_num = jcp.nb_oc_blocking; + + if (iwork == start || oh == 0) { + compute_row_generic_conv(pbuf, n, g, ocb, ocb_num, oh - 1, 2); + } else { + compute_row_generic_conv(pbuf, n, g, ocb, ocb_num, oh, 1); + } + + if (iwork > start && ((oh - 1) % jcp_dw_conv.stride_h == 0) && oh > 0) { + compute_row_dw_conv(pbuf, n, ocb, ocb_num, oh - 1); + } + + if ((iwork == end - 1 || (int) oh == jcp.oh - 1) && ((oh) % jcp_dw_conv.stride_h == 0)) { + compute_row_generic_conv(pbuf, n, g, ocb, ocb_num, oh + 1, 1); + compute_row_dw_conv(pbuf, n, ocb, ocb_num, oh); + } + + nd_iterator_step(n, MB, g, jcp.ngroups, ocbb, ocb_work, oh, jcp.oh); + } + }; + + if (jcp.oc != jcp.oc_padded) { + auto dw_conv_padded_bias = scratchpad().template get(key_dw_conv_padded_bias); + utils::array_copy(dw_conv_padded_bias, 
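Aside, not part of the patch: the fused path above never materializes the intermediate feature map. Each thread keeps only jcp_dw_conv.kh rows of it in the key_dw_conv_buffer scratchpad, indexed modulo kh, and the depthwise kernel consumes a row as soon as its three-row window is complete. A sketch of the ring addressing, assuming the same (row + 1) % kh convention as compute_row_generic_conv:

```cpp
// Row `row` of the virtual intermediate tensor lives at this slot; the +1
// bias keeps row - 1 (the padded row above the image) non-negative.
float *ring_row(float *buf, int row, int kh_dw, int row_stride) {
    return buf + ((row + 1) % kh_dw) * row_stride;
}
```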
dw_conv_bias, jcp.oc); + utils::array_set(dw_conv_padded_bias + jcp.oc, 0.f, jcp.oc_padded - jcp.oc); + dw_conv_bias = dw_conv_padded_bias; + } + + parallel(0, ker); +} + +template struct jit_uni_binary_convolution_fwd_t; +template struct jit_uni_binary_convolution_fwd_t; +template struct jit_uni_binary_convolution_fwd_t; + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_binary_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_binary_convolution.hpp new file mode 100644 index 0000000..c5a188e --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_binary_convolution.hpp @@ -0,0 +1,138 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef CPU_JIT_UNI_BINARY_CONVOLUTION_HPP +#define CPU_JIT_UNI_BINARY_CONVOLUTION_HPP + +#include "c_types_map.hpp" +#include "cpu_binary_convolution_pd.hpp" +#include "cpu_engine.hpp" +#include "cpu_reducer.hpp" +#include "jit_primitive_conf.hpp" +#include "jit_uni_bin_conv_kernel.hpp" +#include "mkldnn_thread.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +template +struct jit_uni_binary_convolution_fwd_t: public cpu_primitive_t { + struct pd_t: public _cpu_binary_convolution_fwd_pd_t { + pd_t(engine_t *engine, + const binary_convolution_desc_t *adesc, + const primitive_attr_t *attr, + const typename pd_t::base_class *hint_fwd_pd) + : _cpu_binary_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) + , jcp_(), jcp_dw_conv() {} + + DECLARE_COMMON_PD_T( + JIT_IMPL_NAME_HELPER("jit:", isa, ""), + jit_uni_binary_convolution_fwd_t); + + virtual status_t init() override { + using namespace prop_kind; + assert(this->engine()->kind() == engine_kind::cpu); + bool ok = true + && this->set_default_params() == status::success + && utils::one_of(this->cdesc_().prop_kind, forward_training, forward_inference) + && this->cdesc_().alg_kind == alg_kind::binary_convolution_direct + && utils::everyone_is(data_type::bin, + this->cdesc_().src_desc.data_type, + this->cdesc_().weights_desc.data_type) + && utils::one_of(this->cdesc_().dst_desc.data_type, + memory::data_type::f32, + memory::data_type::bin); + if (!ok) return status::unimplemented; + + status_t sts = jit_uni_bin_conv_fwd_kernel::init_conf(jcp_, *this->desc(), + *this->src_pd_.desc(), *this->weights_pd_.desc(), + *this->dst_pd_.desc(), *this->attr()); + if (sts != status::success) return sts; + + if (jcp_.with_dw_conv) { + status_t sts_dw = jit_uni_dw_conv_row_f32::init_conf(jcp_, jcp_dw_conv, *this->attr()); + if (sts_dw != status::success) return sts_dw; + } + + auto scratchpad = scratchpad_registry().registrar(); + jit_uni_bin_conv_fwd_kernel::init_scratchpad(scratchpad, jcp_, jcp_dw_conv); + + return status::success; + } + + jit_bin_conv_conf_t jcp_; + jit_conv_conf_t jcp_dw_conv; + + protected: + virtual status_t set_default_params() 
override { + using namespace memory_format; + + auto desired_weights_format = isa == avx512_common ? OhIw16o32i : OhIw8o32i; + + if (this->src_pd_.desc()->format == any) + CHECK(this->src_pd_.set_format(nhwc)); + if (this->dst_pd_.desc()->format == any) + CHECK(this->dst_pd_.set_format(nhwc)); + if (this->weights_pd_.desc()->format == any) + CHECK(this->weights_pd_.set_format(desired_weights_format)); + return status::success; + } + }; + + jit_uni_binary_convolution_fwd_t(const pd_t *apd, const input_vector &inputs, + const output_vector &outputs) + : cpu_primitive_t(apd, inputs, outputs) { + kernel_ = new jit_uni_bin_conv_fwd_kernel(pd()->jcp_, pd()->jcp_dw_conv, *pd()->attr()); + + if (pd()->jcp_.with_dw_conv) { + dw_conv_kernel_ = new jit_uni_dw_conv_row_f32(pd()->jcp_dw_conv, *pd()->attr(), pd()->jcp_dw_conv.oc); + } + } + + ~jit_uni_binary_convolution_fwd_t() { + delete kernel_; + + if (pd()->jcp_.with_dw_conv) { + delete dw_conv_kernel_; + } + }; + + virtual void execute(event_t *e) const { + if (pd()->jcp_.with_dw_conv) + execute_forward_with_dw_conv(); + else + execute_forward(); + + e->set_state(event_t::ready); + } + +private: + void execute_forward() const; + void execute_forward_with_dw_conv() const; + + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + + jit_uni_bin_conv_fwd_kernel *kernel_; + /* fuse with dw conv */ + jit_uni_dw_conv_row_f32 *dw_conv_kernel_; +}; + +} +} +} + +#endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_depthwise.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_depthwise.cpp index 634e9f9..9aad4f1 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_depthwise.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_depthwise.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018 Intel Corporation +* Copyright 2018-2019 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,7 +56,7 @@ struct jit_uni_depthwise_kernel_f32 : public c_compatible { template int jit_uni_depthwise_injector_f32::aux_vecs_count(alg_kind_t depthwise_alg) { switch (depthwise_alg) { - case alg_kind::depthwise_scale_shift: return 0; + case alg_kind::depthwise_scale_shift: return isa == sse42 ? 
1 : 0; case alg_kind::depthwise_prelu: return 2; default: assert(!"unsupported depthwise algorithm"); } @@ -132,8 +132,15 @@ void jit_uni_depthwise_injector_f32::assign_regs() { template void jit_uni_depthwise_injector_f32::scale_shift_compute_vector(const Vmm &vmm_src, const Xbyak::Reg64& p_weights, const Xbyak::Reg64& p_bias) { - h->uni_vmulps(vmm_src, vmm_src, h->ptr[p_weights]); - h->uni_vaddps(vmm_src, vmm_src, h->ptr[p_bias]); + if (isa == sse42) { + h->movups(vmm_mask, h->ptr[p_weights]); + h->mulps(vmm_src, vmm_mask); + h->movups(vmm_mask, h->ptr[p_bias]); + h->addps(vmm_src, vmm_mask); + } else { + h->uni_vmulps(vmm_src, vmm_src, h->ptr[p_weights]); + h->uni_vaddps(vmm_src, vmm_src, h->ptr[p_bias]); + }; } template @@ -145,8 +152,8 @@ void jit_uni_depthwise_injector_f32::prelu_compute_vector(const Vmm &vmm_sr if (isa == sse42) { h->pxor(vmm_mask, vmm_mask); h->cmpps(vmm_mask, vmm_src, _cmp_gt_os); - h->movups(vmm_aux0, vmm_src); - h->mulps(vmm_aux0, h->ptr[p_weights]); + h->movups(vmm_aux0, h->ptr[p_weights]); + h->mulps(vmm_aux0, vmm_src); h->blendvps(vmm_src, vmm_aux0); } else if (isa == avx2) { h->vxorps(vmm_mask, vmm_mask, vmm_mask); @@ -202,7 +209,7 @@ struct jit_uni_scale_shift_kernel_f32 : public jit_uni_depthwise_kernel_f32, assert(desc.alg_kind == alg_kind::depthwise_scale_shift); assert(isa == sse42 || isa == avx2 || isa == avx512_common); - bool isFlat = desc.src_desc.format == nchw && desc.dst_desc.format == nchw ; + bool isFlat = desc.src_desc.format == nchw && desc.dst_desc.format == nchw; Reg64 param = abi_param1; @@ -465,30 +472,30 @@ status_t jit_uni_depthwise_fwd_t::pd_t::init() { } template -jit_uni_depthwise_fwd_t::jit_uni_depthwise_fwd_t(const pd_t *pd, +jit_uni_depthwise_fwd_t::jit_uni_depthwise_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), kernel_(nullptr), + : cpu_primitive_t(apd, inputs, outputs), kernel_(nullptr), padded_weights_(nullptr), padded_bias_(nullptr) { - const auto &desc = *conf_.desc(); + const auto &desc = *pd()->desc(); switch (desc.alg_kind) { case alg_kind::depthwise_scale_shift: - kernel_ = new jit_uni_scale_shift_kernel_f32(desc, pd->with_bias()); break; + kernel_ = new jit_uni_scale_shift_kernel_f32(desc, pd()->with_bias()); break; case alg_kind::depthwise_prelu: - kernel_ = new jit_uni_prelu_kernel_f32(desc, pd->with_bias()); break; + kernel_ = new jit_uni_prelu_kernel_f32(desc, pd()->with_bias()); break; default: assert(!"unknown depthwise alg_kind"); } const int simd_w = isa == avx512_common ? 
16 : 8; - const memory_desc_wrapper data_d(conf_.src_pd()); + const memory_desc_wrapper data_d(pd()->src_pd()); const int c_without_padding = data_d.dims()[1]; const int c_padded = rnd_up(c_without_padding, simd_w); - if (conf_.want_padded_weights()) { + if (pd()->want_padded_weights()) { padded_weights_ = (data_t *)malloc(sizeof(data_t) * c_padded, 64); for (int oc = c_without_padding; oc < c_padded; ++oc) padded_weights_[oc] = 0; - if (conf_.with_bias()) { + if (pd()->with_bias()) { padded_bias_ = (data_t *)malloc(sizeof(data_t) * c_padded, 64); for (int oc = c_without_padding; oc < c_padded; ++oc) padded_bias_[oc] = 0; @@ -504,15 +511,15 @@ jit_uni_depthwise_fwd_t::~jit_uni_depthwise_fwd_t() { } template -void jit_uni_depthwise_fwd_t::execute_forward() { +void jit_uni_depthwise_fwd_t::execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper data_d(conf_.src_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); - const memory_desc_wrapper bias_d(conf_.weights_pd(1)); + const memory_desc_wrapper data_d(pd()->src_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + const memory_desc_wrapper bias_d(pd()->weights_pd(1)); const int N = data_d.dims()[0]; const int C = data_d.dims()[1]; @@ -523,12 +530,12 @@ void jit_uni_depthwise_fwd_t::execute_forward() { const int ch_block_size = data_d.format() == nchw ? 1 : simd_w; const int CB = div_up(C, ch_block_size); - if (conf_.want_padded_weights()) { + if (pd()->want_padded_weights()) { for (int oc = 0; oc < C; ++oc) padded_weights_[oc] = weights[oc]; weights = padded_weights_; - if (conf_.with_bias()) { + if (pd()->with_bias()) { for (int oc = 0; oc < C; ++oc) padded_bias_[oc] = bias[oc]; bias = padded_bias_; @@ -537,7 +544,7 @@ void jit_uni_depthwise_fwd_t::execute_forward() { parallel_nd(N, CB, H, [&](int n, int cb, int h) { - jit_args arg = {}; + auto arg = jit_args(); arg.from = &src[data_d.blk_off(n, cb, h)]; arg.to = &dst[data_d.blk_off(n, cb, h)]; @@ -564,21 +571,38 @@ void jit_uni_dw_conv_row_f32::load_src(int ur_w) { for (int ow = 0; ow < ur_w; ow++) { Vmm vmm_acc = get_acc_reg(i*ur_w + ow); - if (this->jcp.with_bias) - uni_vmovups(vmm_acc, vmmword[reg_bias + i*4*sizeof(float)]); - else - uni_vpxor(vmm_acc, vmm_acc, vmm_acc); - - int o_off = ow*jcp.ch_block + i*4; - if (this->jcp.with_sum) - uni_vaddps(vmm_acc, vmm_acc, - vmmword[reg_output + o_off*sizeof(float)]); + uni_vpxor(vmm_acc, vmm_acc, vmm_acc); } } } template void jit_uni_dw_conv_row_f32::apply_filter(int ur_w, int kw_size) { + auto load_src = [=](Vmm vmm_src, const Xbyak::Address &op) { + if (jcp.src_dt == data_type::u8) { + uni_vpmovzxbd(vmm_src, op); + } else { + uni_vmovups(vmm_src, op); + } + }; + + auto load_ker = [=](Vmm vmm_ker, const Xbyak::Address &op) { + if (jcp.src_dt == data_type::u8) { + uni_vpmovsxbd(vmm_ker, op); + } else { + uni_vmovups(vmm_ker, op); + } + }; + + auto compute = [=](Vmm vmm_acc, Vmm vmm_src, Vmm vmm_ker) { + if (jcp.src_dt == data_type::u8) { + uni_vpmulld(vmm_src, vmm_src, vmm_ker); + uni_vpaddd(vmm_acc, vmm_acc, vmm_src); + } else { + uni_vfmadd231ps(vmm_acc, vmm_src, vmm_ker); + } + }; + int ch_blk = jcp.ch_block; int stride_w = jcp.stride_w; @@ -590,69 +614,63 @@ void jit_uni_dw_conv_row_f32::apply_filter(int ur_w, int kw_size) { jl(exit_label, T_NEAR); for (int i = 0; i < repeats; i++) { for (int kw = 0; kw < 
kw_size; kw++) { - int ker_off = kw * ch_blk + i*4; + int ker_off = kw * ch_blk + i*(jcp.ch_block / 2); Vmm vmm_ker = get_ker_reg(0); - uni_vmovups(vmm_ker, ptr[aux_reg_kernel - + ker_off * sizeof(float)]); + load_ker(vmm_ker, ptr[aux_reg_kernel + ker_off * jcp.typesize_in]); for (int ow = 0; ow < ur_w; ow++) { - int inp_off = ow * stride_w * ch_blk + kw * ch_blk + i*4; + int inp_off = ow * stride_w * ch_blk + kw * ch_blk + i*(jcp.ch_block / 2); Vmm vmm_src = get_src_reg(0); - uni_vmovups(vmm_src, ptr[aux_reg_input0 - + inp_off * sizeof(float)]); + load_src(vmm_src, ptr[aux_reg_input0 + inp_off * jcp.typesize_in]); Vmm vmm_acc = get_acc_reg(i*ur_w + ow); - uni_vfmadd231ps(vmm_acc, vmm_src, vmm_ker); + compute(vmm_acc, vmm_src, vmm_ker); } } } - add(aux_reg_kernel, jcp.kw*ch_blk*sizeof(float)); + add(aux_reg_kernel, jcp.kw*ch_blk*jcp.typesize_in); cmp(reg_kh, 2); jl(exit_label, T_NEAR); for (int i = 0; i < repeats; i++) { for (int kw = 0; kw < kw_size; kw++) { - int ker_off = kw * ch_blk + i*4; + int ker_off = kw * ch_blk + i*(jcp.ch_block / 2); Vmm vmm_ker = get_ker_reg(0); - uni_vmovups(vmm_ker, ptr[aux_reg_kernel - + ker_off * sizeof(float)]); + load_ker(vmm_ker, ptr[aux_reg_kernel + ker_off * jcp.typesize_in]); for (int ow = 0; ow < ur_w; ow++) { - int inp_off = ow * stride_w * ch_blk + kw * ch_blk + i*4; + int inp_off = ow * stride_w * ch_blk + kw * ch_blk + i*(jcp.ch_block / 2); Vmm vmm_src = get_src_reg(0); - uni_vmovups(vmm_src, ptr[aux_reg_input1 - + inp_off * sizeof(float)]); + load_src(vmm_src, ptr[aux_reg_input1 + inp_off * jcp.typesize_in]); Vmm vmm_acc = get_acc_reg(i*ur_w + ow); - uni_vfmadd231ps(vmm_acc, vmm_src, vmm_ker); + compute(vmm_acc, vmm_src, vmm_ker); } } } - add(aux_reg_kernel, jcp.kw*ch_blk*sizeof(float)); + add(aux_reg_kernel, jcp.kw*ch_blk*jcp.typesize_in); cmp(reg_kh, 3); jl(exit_label, T_NEAR); for (int i = 0; i < repeats; i++) { for (int kw = 0; kw < kw_size; kw++) { - int ker_off = kw * ch_blk + i*4; + int ker_off = kw * ch_blk + i*(jcp.ch_block / 2); Vmm vmm_ker = get_ker_reg(0); - uni_vmovups(vmm_ker, ptr[aux_reg_kernel - + ker_off * sizeof(float)]); + load_ker(vmm_ker, ptr[aux_reg_kernel + ker_off * jcp.typesize_in]); for (int ow = 0; ow < ur_w; ow++) { - int inp_off = ow * stride_w * ch_blk + kw * ch_blk + i*4; + int inp_off = ow * stride_w * ch_blk + kw * ch_blk + i*(jcp.ch_block / 2); Vmm vmm_src = get_src_reg(0); - uni_vmovups(vmm_src, ptr[aux_reg_input2 - + inp_off * sizeof(float)]); + load_src(vmm_src, ptr[aux_reg_input2 + inp_off * jcp.typesize_in]); Vmm vmm_acc = get_acc_reg(i*ur_w + ow); - uni_vfmadd231ps(vmm_acc, vmm_src, vmm_ker); + compute(vmm_acc, vmm_src, vmm_ker); } } } @@ -661,34 +679,276 @@ void jit_uni_dw_conv_row_f32::apply_filter(int ur_w, int kw_size) { } template -void jit_uni_dw_conv_row_f32::apply_activation(int ur_w) { - if (this->jcp.with_eltwise) { - int repeats = isa == sse42 ? 
2 : 1; - eltwise_injector->compute_vector_range(4, repeats * ur_w + 4); +void jit_uni_dw_conv_row_f32::cvt2ps(data_type_t type_in, Vmm vmm_in, const Operand &op, bool scalar_load) { + Xmm xmm_in = Xmm(vmm_in.getIdx()); + + switch (type_in) { + case data_type::f32: + case data_type::s32: + if (scalar_load) { + mov(reg_tmp_32, op); + movq(xmm_in, reg_tmp_64); + } else { + uni_vmovups(vmm_in, op); + } + break; + case data_type::s8: + if (scalar_load) { + movsx(reg_tmp_32, op); + movq(xmm_in, reg_tmp_64); + } else { + uni_vpmovsxbd(vmm_in, op); + } + break; + case data_type::u8: + if (scalar_load) { + movzx(reg_tmp_32, op); + movq(xmm_in, reg_tmp_64); + } else { + uni_vpmovzxbd(vmm_in, op); + } + break; + default: assert(!"unsupported data type"); } + + if (type_in != data_type::f32) + uni_vcvtdq2ps(vmm_in, vmm_in); } template -void jit_uni_dw_conv_row_f32::store_dst(int ur_w) { +void jit_uni_dw_conv_row_f32::apply_postprocessing(int ur_w, int oc_step) { int repeats = isa == sse42 ? 2 : 1; + + for (int r = 0; r < repeats; r++) { + for (int ow = 0; ow < ur_w; ow++) { + if (jcp.src_dt == data_type::u8) { + uni_vcvtdq2ps(get_acc_reg(r * ur_w + ow), get_acc_reg(r * ur_w + ow)); + } + + if (jcp.with_bias) { + int b_off = r * (jcp.ch_block / 2); + cvt2ps(jcp.bia_dt, vmm_bias, ptr[reg_bias + b_off * jcp.typesize_bia], false); + uni_vaddps(get_acc_reg(r * ur_w + ow), get_acc_reg(r * ur_w + ow), vmm_bias); + } + } + } + + if (jcp.with_sum) { + for (int r = 0; r < repeats; r++) { + int tail_size = isa == sse42 ? nstl::min(jcp.ch_block / 2, oc_step - r * jcp.ch_block / 2) : oc_step; + bool is_scalar_store = isa == sse42 ? tail_size < jcp.ch_block / 2 : tail_size < jcp.ch_block; + + for (int ow = 0; ow < ur_w; ow++) { + if (is_scalar_store) { + for (int oc = 0; oc < tail_size; oc++) { + int o_off = ow * ow_stride_ + r * (jcp.ch_block / 2) + oc; + + uni_vpxor(vmm_sum, vmm_sum, vmm_sum); + cvt2ps(jcp.dst_dt, vmm_sum, ptr[reg_output + o_off * jcp.typesize_out], true); + + if (oc >= jcp.ch_block / 2) { + vperm2i128(Ymm(vmm_sum.getIdx()), Ymm(vmm_sum.getIdx()), Ymm(vmm_sum.getIdx()), 0x01); + } + uni_vpslldq(vmm_sum, vmm_sum, jcp.typesize_out * (oc % (jcp.ch_block / 2))); + + uni_vaddps(get_acc_reg(r * ur_w + ow), get_acc_reg(r * ur_w + ow), vmm_sum); + } + } else { + int o_off = ow * ow_stride_ + r * (jcp.ch_block / 2); + + uni_vpxor(vmm_sum, vmm_sum, vmm_sum); + cvt2ps(jcp.dst_dt, vmm_sum, ptr[reg_output + o_off * jcp.typesize_out], false); + + uni_vaddps(get_acc_reg(r * ur_w + ow), get_acc_reg(r * ur_w + ow), vmm_sum); + } + } + } + } + + const auto &p = attr_.post_ops_; + int eltwise_inj_idx = 0; + int depthwise_inj_idx = 0; + int start_idx = p.find(primitive_kind::convolution) + 1; + for (int i = start_idx; i < p.len_; i++) { + auto& post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + eltwise_injectors[eltwise_inj_idx]->compute_vector_range(4, 4 + repeats * ur_w); + eltwise_inj_idx++; + } else if (post_op.is_depthwise()) { + mov(reg_d_weights, reinterpret_cast(post_op.depthwise.weights_data)); + mov(reg_d_bias, reinterpret_cast(post_op.depthwise.biases_data)); + + add(reg_d_weights, reg_oc_off); + add(reg_d_bias, reg_oc_off); + + depthwise_injectors[depthwise_inj_idx]->compute_vector_range(4, 4 + ur_w, reg_d_weights, reg_d_bias); + + if (repeats == 2) { + add(reg_d_weights, (jcp.ch_block / 2) * sizeof(float)); + add(reg_d_bias, (jcp.ch_block / 2) * sizeof(float)); + + depthwise_injectors[depthwise_inj_idx]->compute_vector_range(4 + ur_w, 4 + 2 * ur_w, reg_d_weights, reg_d_bias); + } + + 
depthwise_inj_idx++; + } + } +} + +template +void jit_uni_dw_conv_row_f32::store_dst_typed(const Xbyak::Address &op, Vmm vmm_dst, bool scalar_store) { + Ymm ymm_dst = Ymm(vmm_dst.getIdx()); + Xmm xmm_dst = Xmm(vmm_dst.getIdx()); + + switch (jcp.dst_dt) { + case data_type::f32: + case data_type::s32: + if (scalar_store) { + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_32); + } else { + uni_vmovups(op, vmm_dst); + } + break; + case data_type::s8: + uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst); + + if (isa != sse42 && !scalar_store) + vpermq(ymm_dst, ymm_dst, 0x08); + + uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst); + + if (scalar_store) { + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_8); + } else { + if (isa != sse42) + vmovq(op, xmm_dst); + else + movd(op, xmm_dst); + } + break; + case data_type::u8: + case data_type::bin: + uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst); + + if (isa != sse42 && !scalar_store) + vpermq(ymm_dst, ymm_dst, 0x08); + + uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst); + + if (scalar_store) { + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_8); + } else { + if (isa != sse42) + vmovq(op, xmm_dst); + else + movd(op, xmm_dst); + } + break; + default: + assert(!"unknown dst_dt"); + } +} + +template +void jit_uni_dw_conv_row_f32::store_dst(int ur_w, int oc_step) { + int repeats = isa == sse42 && oc_step > (jcp.ch_block / 2) ? 2 : 1; + for (int i = 0; i < repeats; i++) { for (int ow = 0; ow < ur_w; ow++) { - int o_off = ow*jcp.ch_block + i*4; - Vmm vmm_dst = get_acc_reg(i*ur_w + ow); + Vmm vmm_dst = get_acc_reg(i * ur_w + ow); + if (jcp.dst_dt != data_type::f32 && jcp.dst_dt != data_type::bin) { + if (attr_.round_mode_ == round_mode::nearest) + uni_vcvtps2dq(vmm_dst, vmm_dst); + else if (attr_.round_mode_ == round_mode::down) { + uni_vroundps(vmm_dst, vmm_dst, 1); + uni_vcvtps2dq(vmm_dst, vmm_dst); + } else + assert(!"unimplemented"); + } + } + } + + if (jcp.with_binarization) { + int output_step = div_up(ow_stride_, 8); + + const auto &p = attr_.post_ops_; + int binarization_idx = p.find(primitive_kind::binarization); + + mov(reg_b_weights, reinterpret_cast(p.entry_[binarization_idx].binarization.weights_data)); + add(reg_b_weights, reg_oc_off); + + for (int ow = 0; ow < ur_w; ow++) { + for (int i = 0; i < repeats; i++) { + int tail_size = isa == sse42 ? nstl::min(jcp.ch_block / 2, oc_step - i * jcp.ch_block / 2) : oc_step; + mov(reg_b_mask, (1 << tail_size) - 1); + uni_vmovups(vmm_thr, ptr[reg_b_weights + i * (jcp.ch_block / 2) * sizeof(float)]); + + Vmm vmm_dst = get_acc_reg(i * ur_w + ow); + + uni_vcmpgtps(vmm_dst, vmm_dst, vmm_thr); + + if (i == 0) { + uni_vmovmskps(reg_tmp_32, vmm_dst); + and_(reg_tmp_64, reg_b_mask); + } else { + uni_vmovmskps(reg_tmp2_32, vmm_dst); + and_(reg_tmp2_64, reg_b_mask); + shl(reg_tmp2_32, 4); + or_(reg_tmp_32, reg_tmp2_32); + } + + if (i == repeats - 1) { + const size_t o_off = ow * output_step; + mov(ptr[reg_output + o_off * jcp.typesize_out], reg_tmp_8); + } + } + } + } else { + for (int i = 0; i < repeats; i++) { + int tail_size = isa == sse42 ? nstl::min(jcp.ch_block / 2, oc_step - i * jcp.ch_block / 2) : oc_step; + bool is_scalar_store = isa == sse42 ? 
tail_size < jcp.ch_block / 2 : tail_size < jcp.ch_block; + if (is_scalar_store) { + for (int ow = 0; ow < ur_w; ow++) { + Vmm vmm_dst = get_acc_reg(i * ur_w + ow); + Ymm ymm_dst = Ymm(vmm_dst.getIdx()); + + for (int oc = 0; oc < tail_size; oc++) { + int o_off = ow * ow_stride_ + i * (jcp.ch_block / 2) + oc; + store_dst_typed(ptr[reg_output + o_off * jcp.typesize_out], vmm_dst, true); + + if (isa == sse42) { + psrldq(vmm_dst, jcp.typesize_out); + } else { + vperm2i128(ymm_tmp, ymm_dst, ymm_dst, 0x01); + vpalignr(ymm_dst, vmm_tmp, ymm_dst, jcp.typesize_out); + } + } + } + } else { + for (int ow = 0; ow < ur_w; ow++) { + int o_off = ow * ow_stride_ + i * (jcp.ch_block / 2); + Vmm vmm_dst = get_acc_reg(i * ur_w + ow); - uni_vmovups(vmmword[reg_output + o_off*sizeof(float)], vmm_dst); + store_dst_typed(ptr[reg_output + o_off * jcp.typesize_out], vmm_dst, false); + } + } } } } template -void jit_uni_dw_conv_row_f32::loop_body() { +void jit_uni_dw_conv_row_f32::loop_body(int oc_step) { Label left_pad_label; Label right_pad_label; Label unrolled_w_label; Label tail_w_label; Label exit_label; + int output_step = jcp.with_binarization ? div_up(ow_stride_, 8) : ow_stride_; + L(left_pad_label); { int ur_w = 1; int kw = jcp.iw == 1 ? jcp.kw - 2 : jcp.kw - 1; @@ -697,18 +957,17 @@ void jit_uni_dw_conv_row_f32::loop_body() { mov(aux_reg_input1, reg_input1); mov(aux_reg_input2, reg_input2); mov(aux_reg_kernel, reg_kernel); - add(aux_reg_kernel, jcp.ch_block*sizeof(float)); + add(aux_reg_kernel, jcp.ch_block*jcp.typesize_in); load_src(ur_w); apply_filter(ur_w, kw); - apply_activation(ur_w); - store_dst(ur_w); + apply_postprocessing(ur_w, oc_step); + store_dst(ur_w, oc_step); - add(reg_input0, sizeof(float) * ur_w * jcp.ch_block * (jcp.stride_w-1)); - add(reg_input1, sizeof(float) * ur_w * jcp.ch_block * (jcp.stride_w-1)); - add(reg_input2, sizeof(float) * ur_w * jcp.ch_block * (jcp.stride_w-1)); - - add(reg_output, sizeof(float) * ur_w * jcp.ch_block); + add(reg_input0, jcp.typesize_in * ur_w * jcp.ch_block * (jcp.stride_w-1)); + add(reg_input1, jcp.typesize_in * ur_w * jcp.ch_block * (jcp.stride_w-1)); + add(reg_input2, jcp.typesize_in * ur_w * jcp.ch_block * (jcp.stride_w-1)); + add(reg_output, jcp.typesize_out * ur_w * output_step); sub(reg_ur_w, ur_w); } @@ -727,13 +986,13 @@ void jit_uni_dw_conv_row_f32::loop_body() { load_src(ur_w); apply_filter(ur_w, kw); - apply_activation(ur_w); - store_dst(ur_w); + apply_postprocessing(ur_w, oc_step); + store_dst(ur_w, oc_step); - add(reg_input0, sizeof(float) * ur_w * jcp.ch_block * jcp.stride_w); - add(reg_input1, sizeof(float) * ur_w * jcp.ch_block * jcp.stride_w); - add(reg_input2, sizeof(float) * ur_w * jcp.ch_block * jcp.stride_w); - add(reg_output, sizeof(float) * ur_w * jcp.ch_block); + add(reg_input0, jcp.typesize_in * ur_w * jcp.ch_block * jcp.stride_w); + add(reg_input1, jcp.typesize_in * ur_w * jcp.ch_block * jcp.stride_w); + add(reg_input2, jcp.typesize_in * ur_w * jcp.ch_block * jcp.stride_w); + add(reg_output, jcp.typesize_out * ur_w * output_step); sub(reg_ur_w, ur_w); jmp(unrolled_w_label, T_NEAR); @@ -756,13 +1015,13 @@ void jit_uni_dw_conv_row_f32::loop_body() { load_src(ur_w); apply_filter(ur_w, kw); - apply_activation(ur_w); - store_dst(ur_w); + apply_postprocessing(ur_w, oc_step); + store_dst(ur_w, oc_step); - add(reg_input0, sizeof(float) * ur_w * jcp.ch_block * jcp.stride_w); - add(reg_input1, sizeof(float) * ur_w * jcp.ch_block * jcp.stride_w); - add(reg_input2, sizeof(float) * ur_w * jcp.ch_block * jcp.stride_w); - add(reg_output, 
sizeof(float) * ur_w * jcp.ch_block); + add(reg_input0, jcp.typesize_in * ur_w * jcp.ch_block * jcp.stride_w); + add(reg_input1, jcp.typesize_in * ur_w * jcp.ch_block * jcp.stride_w); + add(reg_input2, jcp.typesize_in * ur_w * jcp.ch_block * jcp.stride_w); + add(reg_output, jcp.typesize_out * ur_w * output_step); sub(reg_ur_w, ur_w); jmp(tail_w_label, T_NEAR); @@ -780,8 +1039,8 @@ void jit_uni_dw_conv_row_f32::loop_body() { load_src(ur_w); apply_filter(ur_w, kw); - apply_activation(ur_w); - store_dst(ur_w); + apply_postprocessing(ur_w, oc_step); + store_dst(ur_w, oc_step); sub(reg_ur_w, ur_w); } @@ -791,8 +1050,26 @@ void jit_uni_dw_conv_row_f32::loop_body() { } template -void jit_uni_dw_conv_row_f32::generate() -{ +void jit_uni_dw_conv_row_f32::generate() { + const auto &p = attr_.post_ops_; + int start_idx = p.find(primitive_kind::convolution) + 1; + for (int i = start_idx; i < p.len_; i++) { + auto &post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + eltwise_injectors.push_back(new jit_uni_eltwise_injector_f32( + this, + post_op.eltwise.alg, + post_op.eltwise.alpha, + post_op.eltwise.beta + )); + } else if (post_op.is_depthwise()) { + depthwise_injectors.push_back(new jit_uni_depthwise_injector_f32( + this, + post_op.depthwise.alg + )); + } + } + this->preamble(); mov(reg_input0, ptr[this->param1 + GET_OFF_DW(src_row0)]); @@ -804,45 +1081,196 @@ void jit_uni_dw_conv_row_f32::generate() mov(reg_bias, ptr[this->param1 + GET_OFF_DW(bias)]); mov(reg_kh, ptr[this->param1 + GET_OFF_DW(kh_padding)]); mov(reg_ur_w, ptr[this->param1 + GET_OFF_DW(ur_w)]); + mov(reg_oc_work, ptr[this->param1 + GET_OFF_DW(oc_work)]); + mov(reg_oc_off, ptr[this->param1 + GET_OFF_DW(oc_off)]); + + Label(tail_label); + Label(exit_label); - loop_body(); + cmp(reg_oc_work, jcp.ch_block); + jl(tail_label, T_NEAR); + + loop_body(jcp.ch_block); + jmp(exit_label, T_NEAR); + + L(tail_label); + + if (jcp.oc % jcp.ch_block != 0) + loop_body(jcp.oc % jcp.ch_block); + + L(exit_label); this->postamble(); - if (jcp.with_eltwise) - eltwise_injector->prepare_table(); + for (auto& inj : eltwise_injectors) + inj->prepare_table(); +} + +template +bool jit_uni_dw_conv_row_f32::post_ops_ok(jit_conv_conf_t &jcp, const primitive_attr_t &attr) { + const auto &p = attr.post_ops_; + + auto is_eltwise = [&](int idx) { return p.entry_[idx].is_eltwise(); }; + auto is_depthwise = [&](int idx) { return p.entry_[idx].is_depthwise(); }; + auto is_sum = [&](int idx) { return p.entry_[idx].is_sum(); }; + auto is_simple = [&](int idx) { return is_eltwise(idx) || is_depthwise(idx); }; + auto is_binarization = [&](int idx) { return p.entry_[idx].is_binarization(); }; + + int start_idx = p.find(primitive_kind::convolution) + 1; + + switch (p.len_ - start_idx) { + case 0: return true; // no post_ops + case 1: return is_simple(start_idx) || is_sum(start_idx) || is_binarization(start_idx); + case 2: return (is_sum(start_idx) && is_simple(start_idx+1)) || (is_simple(start_idx) && is_simple(start_idx+1)) || + (is_simple(start_idx) && is_binarization(start_idx+1)); + case 3: return (is_sum(start_idx) && is_simple(start_idx+1) && is_simple(start_idx+2)); + default: return false; + } + + return false; +} + +template +status_t jit_uni_dw_conv_row_f32::init_conf(jit_1x1_conv_conf_t &jcp, jit_conv_conf_t &jcp_dw, + const primitive_attr_t &attr) { + if (!mayiuse(isa)) return status::unimplemented; + const int simd_w = isa == avx512_common ? 
16 : 8; + + const auto &p = attr.post_ops_; + + int dw_conv_ind = p.find(primitive_kind::convolution); + jcp_dw.with_sum = p.find(primitive_kind::sum, dw_conv_ind) != -1; + + jcp_dw.ch_block = simd_w; + jcp_dw.with_bias = true; + + jcp_dw.kh = p.entry_[dw_conv_ind].dw_conv.ker_h; + jcp_dw.kw = p.entry_[dw_conv_ind].dw_conv.ker_w; + jcp_dw.ic = jcp.oc; + jcp_dw.oc = jcp.oc; + jcp_dw.ih = p.entry_[dw_conv_ind].dw_conv.in_h; + jcp_dw.iw = p.entry_[dw_conv_ind].dw_conv.in_w; + jcp_dw.oh = jcp.dw_conv_oh; + jcp_dw.ow = jcp.dw_conv_ow; + jcp_dw.stride_h = p.entry_[dw_conv_ind].dw_conv.str_h; + jcp_dw.stride_w = p.entry_[dw_conv_ind].dw_conv.str_w; + jcp_dw.conv_weights = p.entry_[dw_conv_ind].dw_conv.weights_data; + jcp_dw.conv_biases = p.entry_[dw_conv_ind].dw_conv.biases_data; + + if (jcp_dw.kh != 3 || jcp_dw.kw != 3) + return status::unimplemented; + + if (!post_ops_ok(jcp_dw, attr)) + return status::unimplemented; + + jcp_dw.ur_w = 4; + + jcp_dw.src_dt = jcp.src_dt; + jcp_dw.dst_dt = jcp.dst_dt; + jcp_dw.bia_dt = jcp.bia_dt; + jcp_dw.typesize_in = (int)types::data_type_size(jcp.src_dt); + jcp_dw.typesize_bia = (int)types::data_type_size(jcp.bia_dt); + jcp_dw.typesize_out = (int)types::data_type_size(jcp.dst_dt); + + if (jcp_dw.src_dt != mkldnn_f32 && jcp_dw.src_dt != mkldnn_u8) + return status::unimplemented; + + return status::success; +} + +template +status_t jit_uni_dw_conv_row_f32::init_conf(jit_conv_conf_t &jcp, jit_conv_conf_t &jcp_dw, + const primitive_attr_t &attr) { + if (!mayiuse(isa)) return status::unimplemented; + const int simd_w = isa == avx512_common ? 16 : 8; + + const auto &p = attr.post_ops_; + + int dw_conv_ind = p.find(primitive_kind::convolution); + jcp_dw.with_sum = p.find(primitive_kind::sum, dw_conv_ind) != -1; + + jcp_dw.ch_block = simd_w; + jcp_dw.with_bias = true; + + jcp_dw.kh = p.entry_[dw_conv_ind].dw_conv.ker_h; + jcp_dw.kw = p.entry_[dw_conv_ind].dw_conv.ker_w; + jcp_dw.ic = jcp.oc; + jcp_dw.oc = jcp.oc; + jcp_dw.ih = p.entry_[dw_conv_ind].dw_conv.in_h; + jcp_dw.iw = p.entry_[dw_conv_ind].dw_conv.in_w; + jcp_dw.oh = jcp.dw_conv_oh; + jcp_dw.ow = jcp.dw_conv_ow; + jcp_dw.stride_h = p.entry_[dw_conv_ind].dw_conv.str_h; + jcp_dw.stride_w = p.entry_[dw_conv_ind].dw_conv.str_w; + jcp_dw.conv_weights = p.entry_[dw_conv_ind].dw_conv.weights_data; + jcp_dw.conv_biases = p.entry_[dw_conv_ind].dw_conv.biases_data; + + if (jcp_dw.kh != 3 || jcp_dw.kw != 3) + return status::unimplemented; + + if (!post_ops_ok(jcp_dw, attr)) + return status::unimplemented; + + jcp_dw.ur_w = 4; + + jcp_dw.src_dt = jcp.dst_dt; + jcp_dw.dst_dt = jcp.dst_dt; + jcp_dw.bia_dt = jcp.bia_dt; + jcp_dw.typesize_in = (int)types::data_type_size(jcp.src_dt); + jcp_dw.typesize_bia = (int)types::data_type_size(jcp.bia_dt); + jcp_dw.typesize_out = (int)types::data_type_size(jcp.dst_dt); + + if (jcp_dw.src_dt != mkldnn_f32 && jcp_dw.src_dt != mkldnn_u8) + return status::unimplemented; + + return status::success; } template -status_t jit_uni_dw_conv_row_f32::init_conf(jit_conv_conf_t &jcp, - int ic, int ih, int iw, int oh, int ow, int ker_h, int ker_w, int str_h, int str_w, alg_kind_t eltwise_alg, - float eltwise_alpha, float eltwise_beta, bool with_sum) { +status_t jit_uni_dw_conv_row_f32::init_conf(jit_bin_conv_conf_t &jcp, jit_conv_conf_t &jcp_dw, + const primitive_attr_t &attr) { if (!mayiuse(isa)) return status::unimplemented; const int simd_w = isa == avx512_common ? 
16 : 8; - jcp.kh = ker_h; - jcp.kw = ker_w; - jcp.ch_block = simd_w; - jcp.with_bias = true; - jcp.ic = ic; - jcp.oc = ic; - jcp.ih = ih; - jcp.iw = iw; - jcp.oh = oh; - jcp.ow = ow; - jcp.stride_h = str_h; - jcp.stride_w = str_w; - - if (jcp.kh != 3 || jcp.kw != 3) - return status::unimplemented; - - jcp.ur_w = 4; - - jcp.with_eltwise = eltwise_alg != mkldnn_alg_kind_undef; - jcp.eltwise_alg = eltwise_alg; - jcp.eltwise_alpha = eltwise_alpha; - jcp.eltwise_beta = eltwise_beta; - jcp.with_sum = with_sum; + const auto &p = attr.post_ops_; + + int dw_conv_ind = p.find(primitive_kind::convolution); + jcp_dw.with_sum = p.find(primitive_kind::sum, dw_conv_ind) != -1; + jcp_dw.with_binarization = p.find(primitive_kind::binarization, dw_conv_ind) != -1; + + jcp_dw.ch_block = simd_w; + jcp_dw.with_bias = true; + + jcp_dw.kh = p.entry_[dw_conv_ind].dw_conv.ker_h; + jcp_dw.kw = p.entry_[dw_conv_ind].dw_conv.ker_w; + jcp_dw.ic = jcp.oc; + jcp_dw.oc = jcp.oc; + jcp_dw.ih = p.entry_[dw_conv_ind].dw_conv.in_h; + jcp_dw.iw = p.entry_[dw_conv_ind].dw_conv.in_w; + jcp_dw.oh = jcp.dw_conv_oh; + jcp_dw.ow = jcp.dw_conv_ow; + jcp_dw.stride_h = p.entry_[dw_conv_ind].dw_conv.str_h; + jcp_dw.stride_w = p.entry_[dw_conv_ind].dw_conv.str_w; + jcp_dw.conv_weights = p.entry_[dw_conv_ind].dw_conv.weights_data; + jcp_dw.conv_biases = p.entry_[dw_conv_ind].dw_conv.biases_data; + + if (jcp_dw.kh != 3 || jcp_dw.kw != 3) + return status::unimplemented; + + if (!post_ops_ok(jcp_dw, attr)) + return status::unimplemented; + + jcp_dw.ur_w = 4; + + jcp_dw.src_dt = mkldnn_f32; + jcp_dw.dst_dt = jcp_dw.with_binarization ? mkldnn_bin : mkldnn_f32; + jcp_dw.bia_dt = mkldnn_f32; + jcp_dw.typesize_in = (int)types::data_type_size(jcp_dw.src_dt); + jcp_dw.typesize_bia = (int)types::data_type_size(jcp_dw.bia_dt); + jcp_dw.typesize_out = (int)types::data_type_size(jcp_dw.dst_dt); + + if (jcp_dw.src_dt != mkldnn_f32 && jcp_dw.src_dt != mkldnn_u8) + return status::unimplemented; return status::success; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_depthwise.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_depthwise.hpp index 1119992..47d93c8 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_depthwise.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_depthwise.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018 Intel Corporation +* Copyright 2018-2019 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
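A note on the fused execution scheme implemented by execute_forward_with_dw_conv() above together with the row-wise depthwise kernel (jit_uni_dw_conv_row_f32) whose header changes follow: the fused path avoids materializing the full intermediate feature map. Each thread gets a small scratchpad ring buffer of jcp_dw_conv.kh rows (key_dw_conv_buffer, kh * iw * ch_block * nb_oc_blocking floats); the binary-convolution kernel writes output row oh into ring slot (oh + 1) % kh, zero-filling rows that fall outside the image to stand in for vertical padding, and the depthwise kernel consumes the three slots holding rows oh - 1, oh and oh + 1 (src_row0/1/2) once they are present. The following is a minimal scalar sketch of that indexing, not the jitted code: it assumes a single channel, stride 1 and the kh == kw == 3 case that init_conf() enforces, and produce_row, dw_row, OH, OW and KH are illustrative names only.

#include <algorithm>
#include <cstdio>
#include <vector>

static const int OH = 6, OW = 8, KH = 3;   // one channel, stride 1, pad 1 (illustrative)

// Stand-in for the jitted binary-conv kernel: emit one output row into the ring.
static void produce_row(std::vector<float> &ring, int oh) {
    float *row = &ring[((oh + 1) % KH) * OW];      // same (oh + 1) % kh slot rule as the source
    if (oh < 0 || oh >= OH)
        std::fill(row, row + OW, 0.f);             // padded row: the memset in the source
    else
        for (int ow = 0; ow < OW; ow++)
            row[ow] = float(oh * OW + ow);         // dummy convolution result
}

// Stand-in for the jitted depthwise row kernel: consume rows oh-1, oh, oh+1.
static void dw_row(const std::vector<float> &ring, int oh, std::vector<float> &dst) {
    const float *r0 = &ring[(((oh + 1) - 1) % KH) * OW];   // src_row0
    const float *r1 = &ring[(((oh + 1) + 0) % KH) * OW];   // src_row1
    const float *r2 = &ring[(((oh + 1) + 1) % KH) * OW];   // src_row2
    for (int ow = 0; ow < OW; ow++)                // dummy 3x1 column-sum "filter"
        dst[oh * OW + ow] = r0[ow] + r1[ow] + r2[ow];
}

int main() {
    std::vector<float> ring(KH * OW), dst(OH * OW);
    produce_row(ring, -1);                 // prologue: zero row above the image
    produce_row(ring, 0);                  // first real row
    for (int oh = 0; oh < OH; oh++) {
        produce_row(ring, oh + 1);         // overwrites the slot that is no longer needed
        dw_row(ring, oh, dst);             // row oh now has all three source rows
    }
    std::printf("dst[0] = %g, dst[last] = %g\n", dst[0], dst[OH * OW - 1]);
    return 0;
}

Bounding the buffer at kh rows per thread is what keeps the fusion cheap in memory, and the + 1 in the slot index lets the zero row for oh = -1 occupy a valid slot during the two-row prologue (compute_row_generic_conv(..., oh - 1, 2)) that primes the ring at the start of each output block.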
@@ -93,21 +93,21 @@ struct jit_uni_depthwise_fwd_t : public cpu_primitive_t { virtual status_t init() override; }; - jit_uni_depthwise_fwd_t(const pd_t *pd, const input_vector &inputs, + jit_uni_depthwise_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs); ~jit_uni_depthwise_fwd_t(); typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - pd_t conf_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } jit_uni_depthwise_kernel_f32 *kernel_; data_t *padded_weights_; data_t *padded_bias_; @@ -118,37 +118,39 @@ template struct jit_uni_dw_conv_row_f32: public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_ds_dw_conv_kernel_f32) - jit_uni_dw_conv_row_f32(jit_conv_conf_t ajcp): jcp(ajcp) { - if (jcp.with_eltwise) { - eltwise_injector = new jit_uni_eltwise_injector_f32(this, - jcp.eltwise_alg, jcp.eltwise_alpha, jcp.eltwise_beta); - } - + jit_uni_dw_conv_row_f32(jit_conv_conf_t ajcp, const primitive_attr_t &attr, int ow_stride) + : jcp(ajcp), attr_(attr), ow_stride_(ow_stride) { this->generate(); jit_ker = (void (*)(jit_conv_call_s *))this->getCode(); } ~jit_uni_dw_conv_row_f32() { - if (jcp.with_eltwise) { - delete eltwise_injector; - } + for (auto inj : eltwise_injectors) + delete inj; + eltwise_injectors.clear(); + + for (auto inj : depthwise_injectors) + delete inj; + depthwise_injectors.clear(); } static bool post_ops_ok(jit_conv_conf_t &jcp, const primitive_attr_t &attr); - static status_t init_conf(jit_conv_conf_t &jcp, - int ic, int ih, int iw, int oh, int ow, - int ker_h, int ker_w, int str_h, int str_w, - alg_kind_t eltwise_alg, - float eltwise_alpha, float eltwise_beta, bool with_sum); + static status_t init_conf(jit_1x1_conv_conf_t &jcp, jit_conv_conf_t &jcp_dw, const primitive_attr_t &attr); + static status_t init_conf(jit_conv_conf_t &jcp, jit_conv_conf_t &jcp_dw, const primitive_attr_t &attr); + static status_t init_conf(jit_bin_conv_conf_t &jcp, jit_conv_conf_t &jcp_dw, const primitive_attr_t &attr); jit_conv_conf_t jcp; + const primitive_attr_t &attr_; void (*jit_ker)(jit_conv_call_s *); + int ow_stride_; private: using Vmm = typename utils::conditional3::type; using reg64_t = const Xbyak::Reg64; + using reg32_t = const Xbyak::Reg32; + using reg8_t = const Xbyak::Reg8; const Xbyak::AddressFrame &vmmword = (isa == sse42) ? xword : (isa == avx2) ? 
yword : zword; const int vlen = cpu_isa_traits::vlen; @@ -161,29 +163,50 @@ private: reg64_t aux_reg_input1 = r12; reg64_t aux_reg_input2 = r13; - reg64_t reg_kernel = r14; reg64_t aux_reg_kernel = r15; reg64_t reg_output = rdx; reg64_t reg_bias = rbx; reg64_t reg_kh = rax; reg64_t reg_ur_w = rbp; + reg64_t reg_oc_work = abi_not_param1; + + reg64_t reg_oc_off = rsi; + reg64_t reg_d_weights = aux_reg_input0; + reg64_t reg_d_bias = aux_reg_input1; - reg64_t imm_addr64 = aux_reg_input0; + reg64_t reg_b_weights = r15; + reg64_t reg_b_mask = reg_d_bias; + + reg32_t reg_tmp_32 = r11d; + reg64_t reg_tmp_64 = r11; + reg8_t reg_tmp_8 = r11b; + + reg32_t reg_tmp2_32 = r13d; + reg64_t reg_tmp2_64 = r13; inline Vmm get_ker_reg(int idx) { return Vmm(idx + 0); } inline Vmm get_src_reg(int idx) { return Vmm(idx + 1); } inline Vmm get_acc_reg(int idx) { return Vmm(idx + 4); } + Xbyak::Ymm ymm_tmp = Xbyak::Ymm(0); + Vmm vmm_tmp = Vmm(0); + Vmm vmm_sum = Vmm(0); + Vmm vmm_bias = Vmm(0); + Vmm vmm_thr = Vmm(0); + inline void load_src(int ur_w); inline void apply_filter(int ur_w, int kw_size); - inline void apply_activation(int ur_w); - inline void store_dst(int ur_w); - inline void loop_body(); + inline void cvt2ps(data_type_t type_in, Vmm vmm_in, const Xbyak::Operand &op, bool scalar_load); + inline void apply_postprocessing(int ur_w, int oc_step); + inline void store_dst_typed(const Xbyak::Address &op, Vmm vmm_dst, bool scalar_store); + inline void store_dst(int ur_w, int oc_step); + inline void loop_body(int oc_step); void generate(); - jit_uni_eltwise_injector_f32* eltwise_injector; + nstl::vector*> eltwise_injectors; + nstl::vector*> depthwise_injectors; }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_conv_kernel_f32.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_conv_kernel_f32.cpp index 0d97cce..db6454c 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_conv_kernel_f32.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_conv_kernel_f32.cpp @@ -30,6 +30,7 @@ namespace cpu { using namespace mkldnn::impl::prop_kind; using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; using namespace mkldnn::impl::utils; using namespace Xbyak; @@ -183,13 +184,6 @@ void jit_uni_dw_conv_fwd_kernel_f32::apply_postprocess(int ur_ch_blocks, in int depthwise_inj_idx = 0; const auto &p = attr_.post_ops_; - if (p.len_ == 0 && eltwise_injectors.size() == 1) { - int start_idx = get_acc_reg(0).getIdx(); - int end_idx = get_acc_reg(repeats * ur_w * ur_ch_blocks).getIdx(); - - eltwise_injectors[0]->compute_vector_range(start_idx, end_idx); - } - for (int i = 0; i < p.len_; i++) { auto& post_op = p.entry_[i]; if (post_op.is_eltwise()) { @@ -293,14 +287,7 @@ void jit_uni_dw_conv_fwd_kernel_f32::loop_body(int ur_ch_blocks) { } template -void jit_uni_dw_conv_fwd_kernel_f32::generate() -{ - if (jcp.with_eltwise) { - eltwise_injectors.push_back(new jit_uni_eltwise_injector_f32( - this, jcp.eltwise_alg, jcp.eltwise_alpha, 0 - )); - } - +void jit_uni_dw_conv_fwd_kernel_f32::generate() { const auto &p = attr_.post_ops_; for (int i = 0; i < p.len_; i++) { auto &post_op = p.entry_[i]; @@ -369,14 +356,10 @@ bool jit_uni_dw_conv_fwd_kernel_f32::post_ops_ok( auto is_simple = [&](int idx) { return is_eltwise(idx) || is_depthwise(idx); }; switch (p.len_) { - case 0: return true; // no post_ops - case 1: return true // sum OR eltwise OR deptwise - && !jcp.with_eltwise && (is_simple(0) || is_sum(0)); - case 2: return true // sum->relu OR 
sum->depthwise OR eltwise->depthwise OR depthwise->depthwise - && !jcp.with_eltwise && ((is_sum(0) && is_simple(1)) || - (is_simple(0) && is_simple(1))); - case 3: return true // sum->eltwise->depthwise OR sum->depthwise->eltwise OR sum->depthwise->depthwise - && !jcp.with_eltwise && ((is_sum(0) && is_simple(1) && is_simple(2))); + case 0: return true; + case 1: return is_simple(0) || is_sum(0); + case 2: return (is_sum(0) && is_simple(1)) || (is_simple(0) && is_simple(1)); + case 3: return is_sum(0) && is_simple(1) && is_simple(2); default: return false; } @@ -387,7 +370,7 @@ template status_t jit_uni_dw_conv_fwd_kernel_f32::init_conf(jit_conv_conf_t &jcp, const convolution_desc_t &cd, const memory_desc_wrapper &src_d, const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d, - const primitive_attr_t &attr, bool with_relu, float relu_negative_slope) + const primitive_attr_t &attr) { if (!mayiuse(isa)) return status::unimplemented; @@ -426,9 +409,6 @@ status_t jit_uni_dw_conv_fwd_kernel_f32::init_conf(jit_conv_conf_t &jcp, jcp.src_fmt = src_d.format(); jcp.with_bias = cd.bias_desc.format != memory_format::undef; - jcp.with_eltwise = with_relu; - jcp.eltwise_alg = mkldnn_eltwise_relu; - jcp.eltwise_alpha = relu_negative_slope; if (!post_ops_ok(jcp, attr)) return status::unimplemented; @@ -473,6 +453,13 @@ status_t jit_uni_dw_conv_fwd_kernel_f32::init_conf(jit_conv_conf_t &jcp, return status::success; } +template +void jit_uni_dw_conv_fwd_kernel_f32::init_scratchpad( + memory_tracking::registrar_t &scratchpad, const jit_conv_conf_t &jcp) { + if (jcp.with_bias && jcp.oc_without_padding != jcp.oc) + scratchpad.book(key_conv_padded_bias, sizeof(float) * jcp.oc); +} + template struct jit_uni_dw_conv_fwd_kernel_f32; template struct jit_uni_dw_conv_fwd_kernel_f32; template struct jit_uni_dw_conv_fwd_kernel_f32; @@ -754,6 +741,13 @@ status_t jit_uni_dw_conv_bwd_data_kernel_f32::init_conf( return status::success; } +template +void jit_uni_dw_conv_bwd_data_kernel_f32::init_scratchpad( + memory_tracking::registrar_t &scratchpad, const jit_conv_conf_t &jcp) { + UNUSED(scratchpad); + UNUSED(jcp); +} + template struct jit_uni_dw_conv_bwd_data_kernel_f32; template struct jit_uni_dw_conv_bwd_data_kernel_f32; template struct jit_uni_dw_conv_bwd_data_kernel_f32; @@ -776,7 +770,7 @@ inline void jit_uni_dw_conv_bwd_weights_kernel_f32::load_filter() { int off_filter = (reg_set + i) * simd_w; Vmm vmm_acc = get_acc_reg(reg_set + i); uni_vmovups(vmm_acc, - vmmword[tmp_reg_filter + off_filter * sizeof(float)]); + vmmword[reg_tmp_filter + off_filter * sizeof(float)]); } } } @@ -800,58 +794,59 @@ inline void jit_uni_dw_conv_bwd_weights_kernel_f32::load_bias() { template inline void jit_uni_dw_conv_bwd_weights_kernel_f32::compute_ow_step_unroll( - int l_pad, int r_pad, int pad_offset, int ow_block) { - const int pad = nstl::max(jcp.l_pad, jcp.r_pad); - const int iw_overlap = jcp.iw + jcp.kw - 1 - jcp.l_pad - jcp.r_pad; - const int unroll_w = nstl::min(jcp.ur_w, iw_overlap); - const int right_border = iw_overlap - ow_block; + int unroll_w, int l_pad, int pad_offset, int ow_block) { + + const int iw_block = ow_block * jcp.stride_w; + const int right_border = jcp.iw - iw_block; + + const int cascade_input = nstl::min(jcp.stride_w, jcp.kw); /* preamble count for number of cascaded LOAD + FMA operation */ - const int input_preamble_count - = nstl::max(jcp.kw - jcp.stride_w - l_pad, 0); + const int input_overlap = nstl::max(jcp.kw - l_pad, 0); /* LOAD initial input registers, then cascade LOADs and FMAs*/ for 
(int r = 0; r < reg_repeats; ++r) { - for (int i = 0; i < input_preamble_count; i++) { - int off_input = ((i - pad_offset) * reg_repeats + r) * simd_w; - Vmm vmm_input = get_input_reg((i + l_pad) * reg_repeats + r); - uni_vmovups(vmm_input, - ptr[tmp_reg_idx_input + off_input * sizeof(float)]); - } - - for (int i = 0; i < unroll_w; ++i) { - int off_output = (i * reg_repeats + r) * simd_w; + for (int i_ur = 0; i_ur < unroll_w; ++i_ur) { + int off_output = (i_ur * reg_repeats + r) * simd_w; Vmm vmm_output = get_output_reg(r); uni_vmovups(vmm_output, - ptr[tmp_reg_idx_output + off_output * sizeof(float)]); - - int input_load_overlap = i * jcp.stride_w + input_preamble_count; - - /* Cascade 'input' loads for the corresponding FMAs */ - const int cascade_input = nstl::min(jcp.stride_w, jcp.kw); - for (int c = 0; c < cascade_input; ++c) { - int off_input - = ((c + input_load_overlap - pad_offset) * reg_repeats - + r) - * simd_w; - Vmm vmm_input = get_input_reg( - ((c + input_load_overlap + l_pad) % jcp.kw) - * reg_repeats - + r); - uni_vmovups(vmm_input, - ptr[tmp_reg_idx_input + off_input * sizeof(float)]); + ptr[reg_tmp_output + off_output * sizeof(float)]); + if (i_ur == 0) { + for (int c = 0; c < input_overlap; ++c) { + int off_input + = ((c - pad_offset) * reg_repeats + r) * simd_w; + Vmm vmm_input + = get_input_reg((c % jcp.kw) * reg_repeats + r); + uni_vmovups(vmm_input, + ptr[reg_tmp_input + off_input * sizeof(float)]); + } + } else { + for (int c = 0; c < cascade_input; ++c) { + int overlap = (i_ur - 1) * jcp.stride_w + input_overlap; + int off_input + = ((overlap + c - pad_offset) * reg_repeats + r) + * simd_w; + Vmm vmm_input = get_input_reg( + ((overlap + c) % jcp.kw) * reg_repeats + r); + uni_vmovups(vmm_input, + ptr[reg_tmp_input + off_input * sizeof(float)]); + } } - for (int j = 0; j < jcp.kw; ++j) { + for (int i_kw = 0; i_kw < jcp.kw; ++i_kw) { + int io_overlap = i_kw + (i_ur * jcp.stride_w); /* Don't apply FMAs that fall into the padded region */ - if (i + j < l_pad || i + j - pad >= right_border) + if (io_overlap - l_pad < 0 + || io_overlap - jcp.l_pad >= right_border) continue; + Vmm vmm_input = get_input_reg( - ((i * jcp.stride_w + j) % jcp.kw) * reg_repeats + r); - Vmm vmm_acc = get_acc_reg(j * reg_repeats + r); + ((io_overlap - l_pad) % jcp.kw) * reg_repeats + r); + Vmm vmm_acc = get_acc_reg(i_kw * reg_repeats + r); Vmm vmm_aux = isa == sse42 ? 
get_aux_reg() : vmm_input; - if( isa == sse42 ) uni_vmovups(vmm_aux, vmm_input); + if (isa == sse42) + uni_vmovups(vmm_aux, vmm_input); uni_vfmadd231ps(vmm_acc, vmm_aux, vmm_output); } } @@ -866,8 +861,16 @@ jit_uni_dw_conv_bwd_weights_kernel_f32::compute_bias_step_unroll( for (int i = 0; i < unroll_w; ++i) { Vmm vmm_bias = get_bias_reg(r); int off_output = (i * reg_repeats + r) * simd_w; - uni_vaddps(vmm_bias, vmm_bias, - vmmword[tmp_reg_idx_output + off_output * sizeof(float)]); + if (isa == sse42) { + /* Need to support unaligned address loads for SSE42*/ + Vmm vmm_output = get_output_reg(1 + r); + uni_vmovups(vmm_output, + ptr[reg_tmp_output + off_output * sizeof(float)]); + uni_vaddps(vmm_bias, vmm_bias, vmm_output); + } else { + uni_vaddps(vmm_bias, vmm_bias, + vmmword[reg_tmp_output + off_output * sizeof(float)]); + } } } } @@ -879,7 +882,7 @@ inline void jit_uni_dw_conv_bwd_weights_kernel_f32::store_filter() { for (int i = 0; i < jcp.kw; ++i) { int off_filter = (i + reg_set) * simd_w; Vmm vmm_acc = get_acc_reg(i + reg_set); - uni_vmovups(vmmword[tmp_reg_filter + off_filter * sizeof(float)], + uni_vmovups(vmmword[reg_tmp_filter + off_filter * sizeof(float)], vmm_acc); } } @@ -895,343 +898,304 @@ inline void jit_uni_dw_conv_bwd_weights_kernel_f32::store_bias() { } template -inline void jit_uni_dw_conv_bwd_weights_kernel_f32::create_h_bounds_table() { - /* Bounds are stored on an 8-bit sized element. - * XXX: potential issues if bounds exceed 255. - */ - const bool handle_padding = (jcp.t_pad > 0) || (jcp.b_pad > 0); - if (handle_padding) { - - /* Calculate how many 'h_start' bounds are needed */ - const int h_bounds_count = get_loop_bounds_count( - nstl::max(jcp.t_pad, jcp.b_pad), jcp.oh, jcp.oh_blk_size); - - align(64); - L(bound_start_table); - /* Generate starting bounds for 'oh' loop. This value also determines - * the overlap (computed as an address offset) between the output over - * the input for that loop iteration. */ - for (int oh_block = 0; oh_block < h_bounds_count; ++oh_block) { - for (int kh = 0; kh < jcp.kh; ++kh) { - te_size start_bound = nstl::max( - jcp.t_pad - oh_block * jcp.oh_blk_size - kh, 0); - write_table(start_bound); - } - } - /* Write offset count for 'input' address calculation. The offset for - * the input address is conditioned by the 'h' padding intersection over - * the output rows. */ - for (int kh = 1; kh < jcp.kh; ++kh) { - te_size kh_accum_value = nstl::max(nstl::min(kh - jcp.t_pad, 1), 0); - write_table(kh_accum_value); - } - /* Last value is not used for offset calculation, write 'nop' - * equivalent*/ - write_table(0); - - /* Non-padded blocks always increment 'kh' dimension */ - for (int oh_block = 0; oh_block < h_bounds_count - 1; oh_block++) { - for (int kh = 0; kh < jcp.kh; ++kh) { - te_size kh_accum_value = 1; - write_table(kh_accum_value); - } - } - - /* number of input elements that overlap over output */ - int ih_overlap = jcp.oh_blk_size + jcp.kh - 1 - jcp.t_pad - jcp.b_pad; - - /* End Bounds for 'oh' default to 'OH' or OH_BLOCK_SIZE, unless - * the 'oh_block' is within the 'bottom_padding' region. 
*/ - int oh_end_blk = 0; - for (; oh_end_blk < h_bounds_count - 1; ++oh_end_blk) { - for (int kh = 0; kh < jcp.kh; ++kh) { - te_size end_bound = nstl::min((jcp.ih / jcp.stride_h) - - jcp.oh_blk_size - oh_end_blk * jcp.oh_blk_size - + ih_overlap + 1 - kh, - jcp.oh_blk_size); - write_table(end_bound); - } - } - /* Write bounds for the special case of when 'oh_block' falls within the - * 'bottom_paddin' region - this always executes since at least 1 row of - * bounds should exist. */ - const int pad = nstl::max(jcp.b_pad, jcp.t_pad); - ih_overlap - = (jcp.ih / jcp.stride_h + jcp.kh - 1 - jcp.t_pad - jcp.b_pad); - oh_end_blk = jcp.oh - jcp.oh_blk_size; - for (int kh = 0; kh < jcp.kh; ++kh) { - te_size end_bound = nstl::min( - jcp.oh_blk_size, ih_overlap - oh_end_blk + pad - kh); - write_table(end_bound); - } - } -} - -template -inline void jit_uni_dw_conv_bwd_weights_kernel_f32::compute_bias_loop() { - +inline void jit_uni_dw_conv_bwd_weights_kernel_f32::compute_bias_loop( + const int block_size) { Label oh_label; Label ow_blk_label; - const int oh_block_size = jcp.oh_blk_size; - const int ow_unroll = jcp.ur_w; - const int ow_block_count = jcp.ow / ow_unroll; + const int unroll_w = nstl::min(block_size, jcp.ow); + const int unroll_w_trips = jcp.ow / unroll_w; + const int tail_w = jcp.ow > block_size ? jcp.ow % block_size : 0; + const int ch_offset = jcp.ch_block; - mov(tmp_reg_idx_output, reg_output_baddr); + mov(reg_oh, ptr[this->param1 + offsetof(jit_dw_conv_call_s, oh_index)]); + mov(reg_oh_worksize, + ptr[this->param1 + offsetof(jit_dw_conv_call_s, oh_count)]); - xor_(iter_oh, iter_oh); + mov(reg_tmp_output, reg_output_baddr); L(oh_label); { - xor_(iter_ow_blk, iter_ow_blk); + mov(iter_ow_blk, unroll_w_trips); L(ow_blk_label); { - compute_bias_step_unroll(ow_unroll); + compute_bias_step_unroll(unroll_w); + add(reg_tmp_output, unroll_w * ch_offset * sizeof(float)); - add(tmp_reg_idx_output, ow_unroll * ch_offset * sizeof(float)); + dec(iter_ow_blk); + cmp(iter_ow_blk, 0); + jg(ow_blk_label, T_NEAR); + } - inc(iter_ow_blk); - cmp(iter_ow_blk, ow_block_count); - jl(ow_blk_label, T_NEAR); + if (tail_w > 0) { + compute_bias_step_unroll(tail_w); + add(reg_tmp_output, tail_w * ch_offset * sizeof(float)); } - inc(iter_oh); - cmp(iter_oh, oh_block_size); + inc(reg_oh); + cmp(reg_oh, reg_oh_worksize); jl(oh_label, T_NEAR); } } template -inline void jit_uni_dw_conv_bwd_weights_kernel_f32::compute_kh_loop( - int l_pad, int r_pad, int pad_offset, bool first_iteration, - int ow_block) { +inline void jit_uni_dw_conv_bwd_weights_kernel_f32::compute_zero_filter() { - Label kh_label; - Label oh_label; - Label exit_innerloop_label; - Label skip_load_acc; + const int ch_offset = jcp.ch_block; - const int table_row_count = get_loop_bounds_count( - nstl::max(jcp.t_pad, jcp.b_pad), jcp.oh, jcp.oh_blk_size); - const int ih_table_off = 1 * table_row_count * jcp.kh * sizeof(te_size); - const int end_bound_table_off - = 2 * table_row_count * jcp.kh * sizeof(te_size); + Label kh_loop_label, skip_zeroing_label; + + mov(reg_exec_flags, + ptr[this->param1 + offsetof(jit_dw_conv_call_s, exec_flags)]); + and_(reg_exec_flags, FLAG_ZERO_FILTER); + test(reg_exec_flags, reg_exec_flags); + je(skip_zeroing_label); + + zero_filter(); + + mov(reg_tmp_filter, reg_filter_baddr); + mov(reg_kh, jcp.kh); + L(kh_loop_label); + { + store_filter(); + + add(reg_tmp_filter, jcp.kw * ch_offset * sizeof(float)); + dec(reg_kh); + cmp(reg_kh, 0); + jg(kh_loop_label); + } + + /* Comeback pointers */ + sub(reg_tmp_filter, jcp.kh * jcp.kw * 
ch_offset * sizeof(float)); + + L(skip_zeroing_label); +} + +template +inline void jit_uni_dw_conv_bwd_weights_kernel_f32::compute_h_step( + int unroll_w, int l_pad, int pad_offset, int ow_block) { const int ch_offset = jcp.ch_block; - const bool handle_padding = (jcp.t_pad > 0) || (jcp.b_pad > 0); + Label kh_loop_label, skip_loop_label; - mov(tmp_reg_filter, reg_filter_baddr); - mov(tmp_reg_kh_input, reg_input_baddr); - xor_(reg_tmp_off, reg_tmp_off); + cmp(reg_kh_count, 0); + je(skip_loop_label, T_NEAR); - if (handle_padding) { - mov(reg_bound_table_addr, bound_start_table); + mov(reg_kh, reg_kh_count); + L(kh_loop_label); + { + load_filter(); + compute_ow_step_unroll(unroll_w, l_pad, pad_offset, ow_block); + store_filter(); - /* move to the row containing the indices for the current 'h' block */ - mov(reg_tmp_off, reg_table_idx); - imul(reg_tmp_off, reg_tmp_off, jcp.kh * sizeof(unsigned char)); - add(reg_bound_table_addr, reg_tmp_off); + add(reg_tmp_filter, jcp.kw * ch_offset * sizeof(float)); + add(reg_tmp_input, jcp.iw * ch_offset * sizeof(float)); + dec(reg_kh); + cmp(reg_kh, 0); + jg(kh_loop_label); } - xor_(iter_kh, iter_kh); - L(kh_label); + /* Comeback pointers */ + Label kh_comeback_label; + mov(reg_kh, reg_kh_count); + L(kh_comeback_label); { + sub(reg_tmp_input, jcp.iw * ch_offset * sizeof(float)); + sub(reg_tmp_filter, jcp.kw * ch_offset * sizeof(float)); + dec(reg_kh); + cmp(reg_kh, 0); + jg(kh_comeback_label, T_NEAR); + } - mov(tmp_reg_idx_output, reg_output_baddr); - mov(tmp_reg_idx_input, tmp_reg_kh_input); + L(skip_loop_label); +} - if (first_iteration) { +template +inline void jit_uni_dw_conv_bwd_weights_kernel_f32::compute_h_loop( + int unroll_w, int l_pad, int pad_offset, int ow_block) { - /* apply zero filter */ - zero_filter(); + const size_t io_overlap = jcp.ih / jcp.stride_h < jcp.oh ? + jcp.ih / jcp.stride_h - 1 : + jcp.oh - jcp.b_pad - 1; + const int ch_offset = jcp.ch_block; + const int t_overlap_off = jcp.t_pad % jcp.stride_h == 0 ? jcp.stride_h : 1; + const int b_overlap_off = jcp.b_pad % jcp.stride_h == 0 ? 
jcp.stride_h : 1; - /* if zero_filter_flag is set to '1', load filter memory into - * reg_accum */ - if (jcp.with_bias) { - mov(reg_tmp_al, reg_exec_flag); - and_(reg_tmp_al, FLAG_ZERO_FILTER); - cmp(reg_tmp_al, 0); - } else { - /* none of the other flags are active, so we can use the - * register directly */ - cmp(reg_exec_flag, 0); - } - je(skip_load_acc); - load_filter(); - L(skip_load_acc); + Label tpad_loop_label, h_loop_label, skip_tpad_label, skip_bpad_label, + end_h_loop_label; - } else { - load_filter(); - } + mov(reg_oh, ptr[this->param1 + offsetof(jit_dw_conv_call_s, oh_index)]); + mov(reg_oh_worksize, + ptr[this->param1 + offsetof(jit_dw_conv_call_s, oh_count)]); + mov(reg_kh_count, + ptr[this->param1 + offsetof(jit_dw_conv_call_s, kh_count)]); - xor_(iter_oh, iter_oh); + mov(reg_tmp_output, reg_output_baddr); + mov(reg_tmp_input, reg_input_baddr); + mov(reg_tmp_filter, reg_filter_baddr); - if (handle_padding) { + L(h_loop_label); + { - /* 'oh loop' initial bounds are stored in bound_table */ - mov(iter_oh_lb, byte[reg_bound_table_addr]); + compute_h_step(unroll_w, l_pad, pad_offset, ow_block); - /* skip 'oh' row that intersects with top padding */ - xor_(reg_tmp_off, reg_tmp_off); - mov(reg_tmp_off, iter_oh); - imul(reg_tmp_off, reg_tmp_off, jcp.ow * ch_offset * sizeof(float)); - add(tmp_reg_idx_output, reg_tmp_off); + add(reg_tmp_output, jcp.ow * ch_offset * sizeof(float)); - /* forward the input address by 'stride_h' */ - if (jcp.stride_h > 1) { - xor_(reg_tmp_off, reg_tmp_off); - mov(reg_tmp_off, iter_oh); - imul(reg_tmp_off, reg_tmp_off, - (jcp.stride_h - 1) * jcp.iw * ch_offset * sizeof(float)); - add(tmp_reg_idx_input, reg_tmp_off); - } - } - - L(oh_label); - { + /* If within the top_pad region */ + if (jcp.t_pad > 0) { + /* Skip t_pad area if no longer in initial h_block */ + cmp(reg_oh, jcp.t_pad); + jg(skip_tpad_label, T_NEAR); - compute_ow_step_unroll(l_pad, r_pad, pad_offset, ow_block); + cmp(reg_kh_count, jcp.kh); + jge(skip_tpad_label, T_NEAR); - add(tmp_reg_idx_input, - jcp.stride_h * jcp.iw * ch_offset * sizeof(float)); - add(tmp_reg_idx_output, jcp.ow * ch_offset * sizeof(float)); + add(reg_kh_count, t_overlap_off); + sub(reg_tmp_filter, + t_overlap_off * jcp.kw * ch_offset * sizeof(float)); - inc(iter_oh); - if (handle_padding) { - /* 'oh loop' end bounds are stored in bound_table (precomputed - * during JIT generation) */ - cmp(iter_oh_lb, - byte[reg_bound_table_addr + end_bound_table_off]); - } else { - cmp(iter_oh, jcp.oh_blk_size); + /* kernel has moved beyond padding (adjust for stride effects) */ + if (jcp.t_pad % jcp.stride_h != 0) { + int inp_corr = jcp.stride_h - jcp.t_pad % jcp.stride_h; + add(reg_tmp_input, + inp_corr * jcp.iw * ch_offset * sizeof(float)); } - jl(oh_label, T_NEAR); + jmp(tpad_loop_label, T_NEAR); } - store_filter(); + L(skip_tpad_label); - add(tmp_reg_filter, jcp.kw * ch_offset * sizeof(float)); + cmp(reg_oh, io_overlap); + jl(skip_bpad_label, T_NEAR); + sub(reg_kh_count, b_overlap_off); - if (handle_padding) { - xor_(kh_offset, kh_offset); - mov(kh_offset_lb, byte[reg_bound_table_addr + ih_table_off]); - /* increase 'ih' row in regards to 'kh'. 
*/
-        imul(kh_offset, kh_offset, jcp.iw * ch_offset * sizeof(float));
-        add(tmp_reg_kh_input, kh_offset);
+    L(skip_bpad_label);
+    add(reg_tmp_input, jcp.stride_h * jcp.iw * ch_offset * sizeof(float));
 
-        /* increase bound_table idx for the next 'kh' value in table*/
-        add(reg_bound_table_addr, sizeof(te_size));
-    } else {
-        add(tmp_reg_kh_input, jcp.iw * ch_offset * sizeof(float));
-    }
+    L(tpad_loop_label);
+
+    cmp(reg_oh, jcp.ih / jcp.stride_h);
+    jge(end_h_loop_label, T_NEAR);
 
-    inc(iter_kh);
-    cmp(iter_kh, jcp.kh);
-    jl(kh_label, T_NEAR);
+    inc(reg_oh);
+
+    cmp(reg_oh, reg_oh_worksize);
+    jl(h_loop_label, T_NEAR);
     }
+    L(end_h_loop_label);
 }
 
 template
 inline void jit_uni_dw_conv_bwd_weights_kernel_f32::compute_ow_block_unroll() {
 
-    Label skip_load_bias;
-
-    /* Only apply zero_filter (xor'ing accum_reg) on the left edge */
-    bool zero_filter_1st_iter = true;
-
     const int ch_offset = jcp.ch_block;
 
-    const int ow_block_size = jcp.ow_blk_size;
-    const int iw_block_size = jcp.ow_blk_size * jcp.stride_w;
-
-    int w_unrolled_loop_count = jcp.ow / ow_block_size;
-
-    const bool handle_padding = (jcp.l_pad > 0) || (jcp.r_pad > 0);
-
-    int pad_offset = jcp.l_pad;
-
-    int ow_block = 0;
-
+    int ow = jcp.ow;
+    int pad_offset = 0;
+    int l_pad = jcp.l_pad;
+
+    /* Calculate effective padding */
+    int r_pad = nstl::max(0, (ow - 1) * jcp.stride_w
+            + (jcp.kw - 1) * (jcp.dilate_w + 1)
+            - (jcp.iw + jcp.l_pad - 1));
+
+    /* Is this strictly defined by:
+     * -code-size (?)
+     * -address size (?) */
+    const int max_unroll_w = 30;
+    const int block_size = 15;
+
+    int unroll_w_tail = 0;
+    int unroll_w = 0;
+    int unroll_w_trips = 0;
+
+    if (jcp.ow > max_unroll_w) {
+        unroll_w = nstl::min(block_size, jcp.ow);
+        unroll_w_trips = ow / unroll_w;
+        /* calculate tail */
+        unroll_w_tail = ow % unroll_w;
+        /* Perform some rebalancing if the tail is too small */
+        if ((unroll_w_tail == 0 && r_pad != 0)
+                || (r_pad > 0 && r_pad >= unroll_w_tail)) {
+            if (unroll_w_trips > 1) {
+                unroll_w_tail += unroll_w;
+                unroll_w_trips--;
+            } else {
+                /* Ideally, this case shouldn't happen */
+                unroll_w_tail += (unroll_w - unroll_w / 2);
+                unroll_w = unroll_w / 2;
+            }
+        }
+    } else {
+        unroll_w = jcp.ow;
+        unroll_w_trips = nstl::max(1, ow / unroll_w);
+    }
 
     if (jcp.with_bias) {
+        Label skip_load_bias;
+        mov(reg_bias_baddr,
+                ptr[this->param1 + offsetof(jit_dw_conv_call_s, bias)]);
 
         zero_bias();
 
-    /* if zero_bias is '1', load bias accumulator from memory. This happens
-     * after the first iteration is executed */
-        mov(reg_tmp_al, reg_exec_flag);
-        and_(reg_tmp_al, FLAG_ZERO_BIAS);
-        cmp(reg_tmp_al, 0);
-        je(skip_load_bias);
+        mov(reg_exec_flags,
+                ptr[this->param1 + offsetof(jit_dw_conv_call_s, exec_flags)]);
+        and_(reg_exec_flags, FLAG_ZERO_BIAS);
+        test(reg_exec_flags, reg_exec_flags);
+        jne(skip_load_bias);
+
         load_bias();
 
-        L(skip_load_bias);
-        compute_bias_loop();
+        L(skip_load_bias);
+        compute_bias_loop(block_size);
 
         store_bias();
     }
 
-    /* compute left padded block */
-    if (handle_padding) {
-
-        const int r_pad = jcp.iw - ow_block_size > 0 ? 0 : jcp.r_pad;
-
-        compute_kh_loop(jcp.l_pad, r_pad, 0, zero_filter_1st_iter, ow_block);
-        zero_filter_1st_iter = false;
+    /* Pass filter address, then offset for h_padding.
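+     * A worked example (values assumed purely for illustration): with
+     * t_pad = 1, kh = 3, kw = 3 and ch_block = 8, an h-block whose first
+     * output row still lies in the top padding skips one kernel row, so
+     * the driver passes filter_pad_off = 1 * 3 * 8 * sizeof(float) = 96
+     * bytes and accumulation starts at the second filter row.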
*/ + compute_zero_filter(); + mov(reg_kh_offset, + ptr[this->param1 + offsetof(jit_dw_conv_call_s, filter_pad_off)]); + add(reg_filter_baddr, reg_kh_offset); - w_unrolled_loop_count--; - - if (w_unrolled_loop_count >= 1) { - add(reg_output_baddr, ow_block_size * ch_offset * sizeof(float)); - add(reg_input_baddr, iw_block_size * ch_offset * sizeof(float)); - } + /* compute left padded block */ + if (l_pad) { + compute_h_loop(unroll_w, l_pad, 0, 0); + add(reg_output_baddr, unroll_w * ch_offset * sizeof(float)); + add(reg_input_baddr, + unroll_w * jcp.stride_w * ch_offset * sizeof(float)); + unroll_w_trips--; + pad_offset = l_pad; + l_pad = 0; } - /* This block may execute under 2 different scenarios: - * 1) When padding is present, this executes the middle loop (if any). - * 2) With no padding, it writes the full loop of the micro-kernel. */ - int middle_loop_count = handle_padding ? w_unrolled_loop_count - 1 : - w_unrolled_loop_count; - if (middle_loop_count >= 1) { - Label ow_blk_label; - - /* Insert loop for 'ow' block when middle block needs to execute more - * than once */ - bool do_ow_blk_loop = middle_loop_count > 1; - if (do_ow_blk_loop) { - mov(iter_ow_blk, middle_loop_count); - L(ow_blk_label); - } - - compute_kh_loop(0, 0, pad_offset, zero_filter_1st_iter); - /* disable zero_filter for the rest of the iterations i.e. from now on - * load contents of 'filter' from memory */ - mov(reg_exec_flag, FLAG_ZERO_FILTER); - - if (do_ow_blk_loop || handle_padding) { - add(reg_output_baddr, ow_block_size * ch_offset * sizeof(float)); - add(reg_input_baddr, iw_block_size * ch_offset * sizeof(float)); - } - - if (do_ow_blk_loop) { - dec(iter_ow_blk); - cmp(iter_ow_blk, 0); - jg(ow_blk_label, T_NEAR); - } + /* compute middle block */ + Label ow_blk_label; - w_unrolled_loop_count -= middle_loop_count; + /* Insert loop for 'ow' block when middle block needs to execute more + * than once */ + bool do_ow_blk_loop = unroll_w_trips > 1; + if (do_ow_blk_loop) { + mov(iter_ow_blk, unroll_w_trips); + L(ow_blk_label); + } + if (unroll_w_trips > 0) { + compute_h_loop(unroll_w, l_pad, pad_offset, 0); + add(reg_output_baddr, unroll_w * ch_offset * sizeof(float)); + add(reg_input_baddr, + unroll_w * jcp.stride_w * ch_offset * sizeof(float)); + } + if (do_ow_blk_loop) { + dec(iter_ow_blk); + cmp(iter_ow_blk, 0); + jg(ow_blk_label, T_NEAR); } - /* compute right padded block: ow_blk = LAST */ - if (handle_padding && w_unrolled_loop_count >= 1) { - ow_block = jcp.ow - ow_block_size; - compute_kh_loop( - 0, jcp.r_pad, pad_offset, zero_filter_1st_iter, ow_block); - - w_unrolled_loop_count--; + /* compute right padded block */ + if (unroll_w_tail) { + compute_h_loop(unroll_w_tail, 0, pad_offset, jcp.ow - unroll_w_tail); } } @@ -1245,17 +1209,10 @@ void jit_uni_dw_conv_bwd_weights_kernel_f32::generate() { ptr[this->param1 + offsetof(jit_dw_conv_call_s, output)]); mov(reg_filter_baddr, ptr[this->param1 + offsetof(jit_dw_conv_call_s, filter)]); - if (jcp.with_bias) - mov(reg_bias_baddr, - ptr[this->param1 + offsetof(jit_dw_conv_call_s, bias)]); - mov(reg_table_flags, - ptr[this->param1 + offsetof(jit_dw_conv_call_s, table_flags)]); compute_ow_block_unroll(); this->postamble(); - - create_h_bounds_table(); } template @@ -1263,8 +1220,7 @@ status_t jit_uni_dw_conv_bwd_weights_kernel_f32::init_conf( jit_conv_conf_t &jcp, const convolution_desc_t &cd, const memory_desc_wrapper &src_d, const memory_desc_wrapper &diff_weights_d, - const memory_desc_wrapper &diff_dst_d) { - + const memory_desc_wrapper &diff_dst_d, int 
nthreads) { if (!mayiuse(isa)) return status::unimplemented; @@ -1295,8 +1251,6 @@ status_t jit_uni_dw_conv_bwd_weights_kernel_f32::init_conf( jcp.stride_w = cd.strides[1]; jcp.t_pad = cd.padding[0][0]; - /* bottom padding should equal top padding to generate the proper 'h' loop - * bounds. */ jcp.b_pad = cd.padding[1][0]; jcp.l_pad = cd.padding[0][1]; @@ -1315,53 +1269,71 @@ status_t jit_uni_dw_conv_bwd_weights_kernel_f32::init_conf( auto desired_act_fmt = isa == avx512_common ? nChw16c : nChw8c; auto desired_wei_fmt = isa == avx512_common ? Goihw16g : Goihw8g; - bool args_ok = true - && src_d.format() == desired_act_fmt - && diff_weights_d.format() == desired_wei_fmt - && diff_dst_d.format() == desired_act_fmt - && one_of(cd.bias_desc.format, memory_format::undef, any, x) - //&& jcp.ngroups % simd_w == 0 - && jcp.ngroups % jcp.ch_block == 0 - && jcp.dilate_h == 0 - && jcp.dilate_w == 0 - && jcp.kw <= 3 - && jcp.oh == (jcp.ihp - jcp.kh) / jcp.stride_h + 1 - && jcp.ow == (jcp.iwp - jcp.kw) / jcp.stride_w + 1; - if (!args_ok) return status::unimplemented; - - /* Note: this IMPLICATION-check does not allow 'negative padding' execution - */ - bool ok = true && IMPLICATION(jcp.r_pad > 0, jcp.r_pad == jcp.l_pad) - && IMPLICATION(jcp.b_pad > 0, jcp.b_pad == jcp.t_pad); - if (!ok) + bool args_ok = true && src_d.format() == desired_act_fmt + && diff_weights_d.format() == desired_wei_fmt + && diff_dst_d.format() == desired_act_fmt + && one_of(cd.bias_desc.format, memory_format::undef, any, x) + && jcp.ngroups % jcp.ch_block == 0 && jcp.dilate_h == 0 + && jcp.dilate_w == 0 && jcp.kw <= 3 + && jcp.oh == (jcp.ihp - jcp.kh) / jcp.stride_h + 1 + && jcp.ow == (jcp.iwp - jcp.kw) / jcp.stride_w + 1; + if (!args_ok) return status::unimplemented; jcp.nb_ch = jcp.ngroups / jcp.ch_block; - /* Values for block size to try; order gives priority */ - constexpr int BLOCK_SIZE[] = { 14, 16, 7, 8 }; - - int block_size_h = 1; - int block_size_w = 1; + /* kernel applicability check wrt boundaries + * the conditions are quite general across the kernels we have, + * but ideally the check should belong to a specific kernel... */ + const int max_hpad = (jcp.kh - 1 + 1) / 2; + const int max_wpad = (jcp.kw - 1 + 1) / 2; + const bool boundaries_ok = true && jcp.t_pad <= max_hpad + && jcp.b_pad <= max_hpad && jcp.l_pad <= max_wpad + && jcp.r_pad <= max_wpad; + if (!boundaries_ok) + return status::unimplemented; - /* *Try different block sizes for convolution */ - for (int block : BLOCK_SIZE) { + balance(jcp, nthreads); - block_size_h = block / jcp.stride_h; - block_size_w = block / jcp.stride_w; + return status::success; +} - if ((jcp.oh % block_size_h == 0) && (jcp.ow % block_size_w == 0)) - break; +template +void jit_uni_dw_conv_bwd_weights_kernel_f32::init_scratchpad( + memory_tracking::registrar_t &scratchpad, const jit_conv_conf_t &jcp) { + /* Notes: if splitting thread work on 'mb', then a reduction has to take + * place. 
Hence, book a per-thread, local weights-buffer for the
+     * reduction */
+    if (jcp.nthr_mb > 1) {
+        const size_t wei_size = jcp.ngroups * jcp.kh * jcp.kw;
+        scratchpad.book(key_conv_wei_reduction,
+                sizeof(float) * wei_size * (jcp.nthr_mb - 1));
+
+        if (jcp.with_bias)
+            scratchpad.book(key_conv_bia_reduction,
+                    sizeof(float) * jcp.ngroups * (jcp.nthr_mb - 1));
     }
+
+template
+void jit_uni_dw_conv_bwd_weights_kernel_f32::balance(jit_conv_conf_t &jcp,
+        int nthreads) {
+    jcp.nthr = nthreads;
+    jcp.nthr_g = jcp.nthr_mb = 1;
+
+    /* Basic heuristics for the parallel strategy:
+     * 1) Tries to parallelize over the number of groups (g), where tasks
+     *    are independent. Otherwise,
+     * 2) Tries to split the work across g and the MiniBatch (mb).
+     *    Parallelizing on mb requires computing a reduction for weights.
+     *
+     * NOTE: because of the 'task partitioning' scheme, there will be an
+     * unbalanced per-thread load when the number of threads is high
+     * (e.g. > 16). */
+    jcp.nthr_g = nstl::min(jcp.nb_ch, jcp.nthr);
+    jcp.nthr_mb = nstl::min(nstl::max(1, jcp.nthr / jcp.nthr_g), jcp.mb);
-    return status::success;
+    jcp.nthr = jcp.nthr_g * jcp.nthr_mb;
 }
 
 template struct jit_uni_dw_conv_bwd_weights_kernel_f32;
 
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_conv_kernel_f32.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_conv_kernel_f32.hpp
index 103687b..6a6aa27 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_conv_kernel_f32.hpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_conv_kernel_f32.hpp
@@ -18,6 +18,8 @@
 #define JIT_UNI_DW_CONV_KERNEL_F32_HPP
 
 #include "c_types_map.hpp"
+#include "memory_tracking.hpp"
+
 #include "jit_generator.hpp"
 #include "jit_primitive_conf.hpp"
 #include "jit_uni_eltwise.hpp"
@@ -52,8 +54,10 @@ struct jit_uni_dw_conv_fwd_kernel_f32: public jit_generator {
     static status_t init_conf(jit_conv_conf_t &jcp,
             const convolution_desc_t &cd, const memory_desc_wrapper &src_d,
             const memory_desc_wrapper &weights_d,
-            const memory_desc_wrapper &dst_d, const primitive_attr_t &attr,
-            bool with_relu = false, float relu_negative_slope = 0.f);
+            const memory_desc_wrapper &dst_d, const primitive_attr_t &attr);
+
+    static void init_scratchpad(memory_tracking::registrar_t &scratchpad,
+            const jit_conv_conf_t &jcp);
 
     jit_conv_conf_t jcp;
     const primitive_attr_t &attr_;
@@ -114,10 +118,14 @@ struct jit_uni_dw_conv_bwd_data_kernel_f32: public jit_generator {
     }
 
     static status_t init_conf(jit_conv_conf_t &jcp,
-            const convolution_desc_t &cd, const memory_desc_wrapper &diff_src_d,
+            const convolution_desc_t &cd,
+            const memory_desc_wrapper &diff_src_d,
             const memory_desc_wrapper &weights_d,
             const memory_desc_wrapper &diff_dst_d);
 
+    static void init_scratchpad(memory_tracking::registrar_t &scratchpad,
+            const jit_conv_conf_t &jcp);
+
     jit_conv_conf_t jcp;
 
     void (*jit_ker)(jit_conv_call_s *);
@@ -167,23 +175,23 @@ struct jit_uni_dw_conv_bwd_weights_kernel_f32 : public jit_generator {
     static status_t init_conf(jit_conv_conf_t &jcp,
            const convolution_desc_t &cd, const memory_desc_wrapper &src_d,
            const memory_desc_wrapper &diff_weights_d,
-           const memory_desc_wrapper &diff_dst_d);
+           const memory_desc_wrapper &diff_dst_d, int nthreads);
+
+    static void init_scratchpad(memory_tracking::registrar_t &scratchpad,
+            const jit_conv_conf_t &jcp);
+
+    static void balance(jit_conv_conf_t &jcp, int nthreads);
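+
+    /* A worked example of the balance() heuristic above (thread and shape
+     * counts assumed purely for illustration): with nthreads = 8, nb_ch = 3
+     * and mb = 4, it picks nthr_g = min(3, 8) = 3 and
+     * nthr_mb = min(max(1, 8 / 3), 4) = 2, so jcp.nthr = 3 * 2 = 6 -- two
+     * of the eight threads stay idle, which is the unbalanced case the
+     * NOTE in balance() warns about. */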
jit_conv_conf_t jcp; void (*jit_ker)(jit_dw_conv_call_s *); private: - //using Vmm = Xbyak::Zmm; using Vmm = typename utils::conditional3::type; using reg64_t = const Xbyak::Reg64; - using te_size - = unsigned char; /* set the 'table_entry' data size. For this - implementation, only values > 255 are needed. */ const int simd_w = cpu_isa_traits::vlen / sizeof(float); const int reg_repeats = (isa == sse42) ? 2 : 1; - inline void write_table(te_size data) { db(data); } - //const Xbyak::AddressFrame &vmmword = zword; + const Xbyak::AddressFrame &vmmword = (isa == sse42) ? xword : (isa == avx2) ? yword : zword; @@ -195,116 +203,51 @@ private: inline Vmm get_acc_reg(int idx) { return Vmm(idx + 1 * reg_repeats + 1); } inline Vmm get_aux_reg() { return Vmm(0); } - reg64_t tmp_reg_idx_input = r8; - reg64_t tmp_reg_kh_input = r9; - reg64_t tmp_reg_idx_output = r10; - reg64_t tmp_reg_filter = r11; + reg64_t reg_tmp_input = r9; + reg64_t reg_tmp_output = r10; + reg64_t reg_tmp_filter = r13; + reg64_t reg_kh_offset = rax; /* parameter passed by driver into kernel */ - reg64_t reg_table_flags = rbx; - Xbyak::Reg8 reg_table_idx = bl; - Xbyak::Reg8 reg_exec_flag = bh; - - /* holds the address for the 'bounds table' that is generated during JIT */ - reg64_t reg_bound_table_addr = r13; + Xbyak::Reg8 reg_exec_flags = bl; - reg64_t reg_tmp_off = rax; - Xbyak::Reg8 reg_tmp_al = al; + reg64_t reg_oh_worksize = r14; + reg64_t reg_oh = rax; - reg64_t iter_oh = rdx; - Xbyak::Reg8 iter_oh_lb = dl; - reg64_t kh_offset = rdx; - Xbyak::Reg8 kh_offset_lb = dl; + reg64_t iter_ow_blk = r11; - reg64_t iter_ow_blk = rbp; - reg64_t iter_kh = rsi; + reg64_t reg_kh = rsi; + reg64_t reg_kh_count = rdx; /* Base addresses for convolution parameters. */ reg64_t reg_input_baddr = r15; reg64_t reg_output_baddr = r12; reg64_t reg_filter_baddr = abi_not_param1; - reg64_t reg_bias_baddr = r14; - - Xbyak::Label bound_start_table; - - /* Return the amount of blocks to execute depending on the convolution - * dimensions and block_size e.g. - * {ow = 112, ow_block_size = 14} -> requires: - * 1 left block, - * 1 middle block, - * 1 right block; - * {ow = 28, ow_block_size = * 14} -> requires: - * 1 left block, - * 1 right block. */ - inline int get_loop_bounds_count( - const int padding, const int h_dimension, const int block_size) { - const int num_top_padded_blk = utils::div_up(padding, block_size); - const int num_tail_blk - = (h_dimension - num_top_padded_blk * block_size > 0) ? 1 : 0; - const int num_middle_blk - = (h_dimension - - (num_top_padded_blk + num_tail_blk) * block_size - > 0) ? 1 : 0; - return num_top_padded_blk + num_middle_blk + num_tail_blk; - } - - /* Create a table containing the values that define the kernel's loop - * behavior. The purpose of using this table is to eliminate the - * implementation complexities and performance impact of in-execution - * computation of loop bounds in regards to stride and padding. The table - * consists of 3 sections: - * 1) Initial Bounds for 'oh' loop. - * 2) Input address offset flag: '1' indicates an input address increment, - * '0' results in no increment. - * 3) End-bounds for 'oh' loop. 
- * - * The table is written into memory as the following format: - * Filter_size: |--- kh ---| - * Table: __________ - * 1st section: | | - * |- - - - - | - * 2nd section: | | - * |- - - - - | - * 3rd section: |__________| - * - * Example for convolution: ih=112, oh=112, kh=3, ph=1 - * __________ - * | 1, 0, 0| -> upper 'oh' loop initial bounds - * | 0, 0, 0| -> middle 'oh' loop initial bounds - * | 0, 0, 0| -> bottom loop initial bounds - * |----------| - * | 0, 1, 0| -> *There is no input offset for kh = 0, i.e. the - * | 1, 1, 1| offset_flag is '0' becase of padding. - * | 1, 1, 1| - * |----------| - * |14, 14, 14| -> lower 'oh' loop end bounds - * |14, 14, 14| -> (etc) - * |14, 14, 13| -> *The last 'kh' loop has an upper bound of 13 - * |__________| because of padding. - * 0, 1, 2 -> kh values - * */ - inline void create_h_bounds_table(); + reg64_t reg_bias_baddr = r13; /* Micro-kernel JIT'ing, fusing 'kw' and 'ow_block' loops into unrolled FMAs */ inline void compute_ow_step_unroll( - int l_pad, int r_pad, int pad_offset, int ow_block); + int unroll_w, int l_pad, int pad_offset, int ow_block); /* JIT'ing the outer loops for the micro-kernel -> {kh, oh_block} */ - inline void compute_kh_loop(int l_pad, int r_pad, int pad_offset, - bool first_iteration, int ow_block = 0); + inline void compute_h_step( + int unroll_w, int l_pad, int pad_offset, int ow_block); + inline void compute_h_loop( + int unroll_w, int l_pad, int pad_offset, int ow_block); /* Write 'width' micro-kernel JITs; depending on the padding and convolution * size, write a micro-kernel for the left ow-block, middle ow-block(s), and * right ow-block.*/ inline void compute_ow_block_unroll(); + inline void compute_zero_filter(); inline void load_filter(); inline void zero_filter(); inline void load_bias(); inline void zero_bias(); inline void compute_bias_step_unroll(const int unroll_w); - inline void compute_bias_loop(); + inline void compute_bias_loop(const int block_size); inline void store_filter(); inline void store_bias(); diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_convolution.cpp index 48c1961..82a7a9d 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_convolution.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_convolution.cpp @@ -14,38 +14,42 @@ * limitations under the License. 
*******************************************************************************/ -#include "mkldnn_types.h" - #include "c_types_map.hpp" -#include "jit_uni_dw_convolution.hpp" +#include "memory_tracking.hpp" #include "mkldnn_thread.hpp" +#include "jit_uni_dw_convolution.hpp" + namespace mkldnn { namespace impl { namespace cpu { using namespace mkldnn::impl::status; using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; using namespace mkldnn::impl::utils; -template -void _jit_uni_dw_convolution_fwd_t::execute_forward() { +template +void _jit_uni_dw_convolution_fwd_t::execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); - const memory_desc_wrapper bias_d(conf_.weights_pd(1)); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + const memory_desc_wrapper bias_d(pd()->weights_pd(1)); const auto &jcp = kernel_->jcp; - if (conf_.want_padded_bias()) { - for (int oc = 0; oc < jcp.oc_without_padding; ++oc) - padded_bias_[oc] = bias[oc]; - bias = padded_bias_; + if (pd()->wants_padded_bias()) { + auto padded_bias = this->scratchpad().template get( + key_conv_padded_bias); + utils::array_copy(padded_bias, bias, jcp.oc_without_padding); + utils::array_set(padded_bias + jcp.oc_without_padding, 0.f, + jcp.oc - jcp.oc_without_padding); + bias = padded_bias; } int dil_h = jcp.dilate_h + 1; @@ -85,7 +89,7 @@ void _jit_uni_dw_convolution_fwd_t::execute_forward() { return par_conv; }; - int MB = conf_.MB(); + int MB = pd()->MB(); const int chb_work = utils::div_up(jcp.nb_ch, jcp.nb_ch_blocking); parallel_nd(MB, chb_work, jcp.oh, [&](int n, int chb, int oh) { @@ -134,31 +138,24 @@ void _jit_uni_dw_convolution_fwd_t::execute_forward() { kernel_->jit_ker(&par_conv); } }); -} -template void _jit_uni_dw_convolution_fwd_t - ::execute_forward(); -template void _jit_uni_dw_convolution_fwd_t - ::execute_forward(); -template void _jit_uni_dw_convolution_fwd_t - ::execute_forward(); + if (pd()->wants_zero_pad_dst()) + output_memory_primitive(0)->zero_pad(); +} -template void _jit_uni_dw_convolution_fwd_t - ::execute_forward(); -template void _jit_uni_dw_convolution_fwd_t - ::execute_forward(); -template void _jit_uni_dw_convolution_fwd_t - ::execute_forward(); +template struct _jit_uni_dw_convolution_fwd_t; +template struct _jit_uni_dw_convolution_fwd_t; +template struct _jit_uni_dw_convolution_fwd_t; template -void _jit_uni_dw_convolution_bwd_data_t::execute_backward_data() { +void _jit_uni_dw_convolution_bwd_data_t::execute_backward_data() const { auto diff_dst = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto diff_src = reinterpret_cast(this->memory()); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper diff_src_d(conf_.diff_src_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper diff_src_d(pd()->diff_src_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); const auto &jcp = kernel_->jcp; @@ -192,7 +189,7 @@ void 
_jit_uni_dw_convolution_bwd_data_t::execute_backward_data() { return par_conv; }; - int MB = conf_.MB(); + int MB = pd()->MB(); const int chb_work = utils::div_up(jcp.nb_ch, jcp.nb_ch_blocking); parallel_nd(MB, chb_work, jcp.ih, [&](int n, int chb, int ih) { @@ -247,264 +244,185 @@ void _jit_uni_dw_convolution_bwd_data_t::execute_backward_data() { }); } -template void _jit_uni_dw_convolution_bwd_data_t - ::execute_backward_data(); -template void _jit_uni_dw_convolution_bwd_data_t - ::execute_backward_data(); -template void _jit_uni_dw_convolution_bwd_data_t - ::execute_backward_data(); +template struct _jit_uni_dw_convolution_bwd_data_t; +template struct _jit_uni_dw_convolution_bwd_data_t; +template struct _jit_uni_dw_convolution_bwd_data_t; template _jit_uni_dw_convolution_bwd_weights_t:: - _jit_uni_dw_convolution_bwd_weights_t(const pd_t *pd, - const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) { - - const auto &jcp = conf_.jcp_; - - kernel_ = new jit_uni_dw_conv_bwd_weights_kernel_f32(jcp); - - const int max_threads - = (mkldnn_in_parallel()) ? 1 : mkldnn_get_max_threads(); - nthr_ = max_threads; - - nthr_g_ = nthr_mb_ = 1; - - /* Basic-Heuristics for parallel strategy: - * 1) Tries to parallel on the number of Groups (g) where tasks are - * independent. Otherwise, - * 2) Tries to split the work across g and MiniBatch (mb). - * Parallelizing on mb requires computing a reduction for weights. - * - * NOTE: because of 'task partitioning' scheme, there will be unbalanced - * per-thread load when the number of threads is high (e.g. > 16). - */ - nthr_g_ = nstl::min(jcp.nb_ch, nthr_); - nthr_mb_ = nstl::min(nstl::max(1, nthr_ / nthr_g_), jcp.mb); - - nthr_ = nthr_g_ * nthr_mb_; - - /* Notes: if splitting thread work on 'mb', then a reduction has to take - * place. 
Hence, allocate a per-thread, local weights-buffer for the - * reduction */ - if (nthr_mb_ > 1) { - const size_t wei_size = jcp.ngroups * jcp.kh * jcp.kw; - ws_reduction_ = (data_t *)malloc( - (nthr_mb_ - 1) * wei_size * sizeof(data_t), 64); - - if (jcp.with_bias) { - const size_t bias_size = jcp.ngroups; - bias_reduction_ = (data_t *)malloc( - (nthr_mb_ - 1) * bias_size * sizeof(data_t), 64); - } - - /* Used when executing a parallel reduction */ - if(do_parallel_reduction()){ - acc_ker_ = new cpu_accumulator_1d_t(); - simple_barrier::ctx_init(&reduction_bctx_); - } - } +_jit_uni_dw_convolution_bwd_weights_t(const pd_t *apd, + const input_vector &inputs, const output_vector &outputs) + : cpu_primitive_t(apd, inputs, outputs) + , kernel_(nullptr), acc_ker_(nullptr) +{ + kernel_ = new jit_uni_dw_conv_bwd_weights_kernel_f32(pd()->jcp_); + if (pd()->jcp_.nthr_mb > 1 && do_parallel_reduction()) + acc_ker_ = new cpu_accumulator_1d_t(); } + template -void _jit_uni_dw_convolution_bwd_weights_t::execute_backward_weights() { +void _jit_uni_dw_convolution_bwd_weights_t::execute_backward_weights() const { + auto src = reinterpret_cast(this->input_memory(0)); + auto diff_dst = reinterpret_cast(this->input_memory(1)); + auto diff_weights = reinterpret_cast(this->memory(0)); + auto diff_bias = reinterpret_cast(this->memory(1)); + + auto diff_wei_reduction_buf = + scratchpad().template get(key_conv_wei_reduction); + auto diff_bia_reduction_buf = + scratchpad().template get(key_conv_bia_reduction); - auto src - = (data_t *)reinterpret_cast(this->input_memory(0)); - auto diff_dst - = (data_t *)reinterpret_cast(this->input_memory(1)); const auto &jcp = kernel_->jcp; - /* JIT-code skips the unnecessary computations within the padded region. */ - const int SKIP_TOP_PADDING = 0; + /* Used when executing a parallel reduction */ + simple_barrier::ctx_t reduction_bctx; + simple_barrier::ctx_init(&reduction_bctx); const size_t wei_size = jcp.ngroups * jcp.kh * jcp.kw; const size_t bias_size = jcp.with_bias ? 
jcp.ngroups : 0; - const int oh_blk_size = jcp.oh_blk_size; - - //const int simd_w = jcp.ch_block; const int ch_block = jcp.ch_block; auto set_kernel_params = [&](jit_dw_conv_call_s *conv_params, - const int batch, const int group, const int oh_block, - const unsigned char table_idx, const int negative_padding_offset, - const unsigned char exec_flag) { + const int batch, const int group, const int oh_start, + const int work_size, const unsigned char exec_flag, + const size_t kh_padding, const size_t filter_off) { + const int tpad_underflow_off = jcp.t_pad - filter_off; + + conv_params->exec_flags = exec_flag; + conv_params->kh_count = jcp.kh - kh_padding; - const int ih_block = oh_block * jcp.stride_h; + const int oh_s = oh_start; + const int oh_e = oh_start + work_size; + const int ih_s = oh_s * jcp.stride_h; - conv_params->table_idx = table_idx; - conv_params->exec_flag = exec_flag; + conv_params->filter_pad_off + = filter_off * jcp.kw * ch_block * sizeof(float); + conv_params->oh_index = oh_s; + conv_params->oh_count = oh_e; size_t diff_dst_off - = ((batch * (jcp.ngroups / ch_block) + group) * jcp.oh + oh_block) + = ((batch * (jcp.ngroups / ch_block) + group) * jcp.oh + + oh_start) * jcp.ow; size_t src_off = ((batch * (jcp.ngroups / ch_block) + group) * jcp.ih - + ih_block - negative_padding_offset) - * jcp.iw; + + ih_s - tpad_underflow_off) * jcp.iw; conv_params->output = &diff_dst[diff_dst_off * ch_block]; conv_params->input = &src[src_off * ch_block]; }; - parallel(nthr_, [&](const int ithr, const int nthr_) { + parallel(jcp.nthr, [&](const int ithr, const int nthr) { + assert(nthr == jcp.nthr); + auto conv_params = jit_dw_conv_call_s(); + const int h_block_size = 15; /* assign iteration space to thread */ - const int ithr_g = ithr % nthr_g_; - const int ithr_mb = (ithr / nthr_g_) % nthr_mb_; + const int ithr_g = ithr % jcp.nthr_g; + const int ithr_mb = (ithr / jcp.nthr_g) % jcp.nthr_mb; /* split dimensions */ int g_start{ 0 }, g_end{ 0 }; - balance211(jcp.nb_ch, nthr_g_, ithr_g, g_start, g_end); + balance211(jcp.nb_ch, jcp.nthr_g, ithr_g, g_start, g_end); int mb_start{ 0 }, mb_end{ 0 }; - balance211(jcp.mb, nthr_mb_, ithr_mb, mb_start, mb_end); - - auto diff_wei = ithr_mb == 0 ? - (data_t *)reinterpret_cast(this->memory(0)) : - (data_t *)ws_reduction_ + (ithr_mb - 1) * wei_size; + balance211(jcp.mb, jcp.nthr_mb, ithr_mb, mb_start, mb_end); - auto diff_bias = ithr_mb == 0 ? - (data_t *)reinterpret_cast(this->memory(1)) : - (data_t *)bias_reduction_ + (ithr_mb - 1) * bias_size; + auto diff_wei = ithr_mb == 0 + ? diff_weights : diff_wei_reduction_buf + (ithr_mb - 1) * wei_size; + auto diff_bia = ithr_mb == 0 + ? diff_bias : diff_bia_reduction_buf + (ithr_mb - 1) * bias_size; for (int g = g_start; g < g_end; ++g) { - - /* This flag controls whether the kernel loads weights from memory - * or initializes the 'weight accummulator' registers to '0'. The - * latter happens at the beginning of each group/16 computation. */ - unsigned char zero_filter_flag = ~FLAG_ZERO_FILTER; - unsigned char zero_bias_flag = jcp.with_bias ? ~FLAG_ZERO_BIAS : 0; + unsigned char zero_filter_flag = FLAG_ZERO_FILTER; + unsigned char zero_bias_flag = jcp.with_bias ? 
FLAG_ZERO_BIAS : 0; size_t diff_wei_off = g * jcp.kh * jcp.kw; conv_params.filter = &diff_wei[diff_wei_off * ch_block]; if (jcp.with_bias) - conv_params.bias = &diff_bias[g * ch_block]; + conv_params.bias = &diff_bia[g * ch_block]; for (int mb = mb_start; mb < mb_end; ++mb) { - - /* The 'table index' parameter controls the table entry for the - * inner kernel execution. For more details see - * jit_uni_dw_conv_kernel_f32. */ - int table_idx = 0; - - /* OH_BLOCK is unrolled to separate the computations according - * to numerous condition-setting 'h' parameter. */ - int oh_blk = 0; - - /* Top-padding case - this case always executes. */ - set_kernel_params(&conv_params, mb, g, oh_blk, table_idx, - SKIP_TOP_PADDING, zero_filter_flag & zero_bias_flag); - kernel_->jit_ker(&conv_params); - - zero_bias_flag |= FLAG_ZERO_BIAS; - zero_filter_flag |= FLAG_ZERO_FILTER; - oh_blk += oh_blk_size; - - /* Middle OH_BLOCK cases. */ - for (; oh_blk < (jcp.oh - oh_blk_size); oh_blk += oh_blk_size) { - table_idx = 1; - set_kernel_params(&conv_params, mb, g, oh_blk, table_idx, - jcp.t_pad, zero_filter_flag & zero_bias_flag); + int oh = 0; + while (oh < jcp.oh) { + const int h_work = nstl::min(h_block_size, jcp.oh - oh); + auto kh_t_padding = nstl::max(0, jcp.t_pad - oh); + auto kh_b_padding + = (oh * jcp.stride_h + jcp.kh - 1 > jcp.ih) ? + jcp.b_pad - (h_work - 1) : + 0; + + set_kernel_params(&conv_params, mb, g, oh, h_work, + zero_filter_flag | zero_bias_flag, + kh_t_padding + kh_b_padding, kh_t_padding); kernel_->jit_ker(&conv_params); - } - table_idx++; - /* Bottom block */ - if (oh_blk < jcp.oh) { - set_kernel_params(&conv_params, mb, g, oh_blk, table_idx, - jcp.t_pad, zero_filter_flag & zero_bias_flag); - kernel_->jit_ker(&conv_params); + zero_bias_flag &= ~FLAG_ZERO_BIAS; + zero_filter_flag &= ~FLAG_ZERO_FILTER; + oh += h_work; } } } - if (do_parallel_reduction() && nthr_mb_ > 1) { + if (do_parallel_reduction() && jcp.nthr_mb > 1) { size_t reduct_start{ 0 }, reduct_end{ 0 }; - balance211(wei_size, nthr_, ithr, reduct_start, reduct_end); - - const size_t reduct_off = reduct_start; - - auto *acc_data - = (data_t *)reinterpret_cast(this->memory(0)) - + reduct_off; + balance211(wei_size, nthr, ithr, reduct_start, reduct_end); const int acc_size = reduct_end - reduct_start; + const size_t reduct_off = reduct_start; + auto *acc_data = diff_weights + reduct_off; - simple_barrier::barrier(&reduction_bctx_, nthr_); - - for (int thr_mb = 1; thr_mb < nthr_mb_; ++thr_mb) { + simple_barrier::barrier(&reduction_bctx, nthr); - auto *src_data = (data_t *)ws_reduction_ + for (int thr_mb = 1; thr_mb < jcp.nthr_mb; ++thr_mb) { + auto *src_data = diff_wei_reduction_buf + (thr_mb - 1) * wei_size + reduct_off; - acc_ker_->accumulate(acc_data, src_data, acc_size); } } }); - /* Apply single-threaded 'mb' reduction */ - if (nthr_mb_ > 1) { - - auto diff_weights - = (data_t *)reinterpret_cast(this->memory(0)); - auto diff_bias - = (data_t *)reinterpret_cast(this->memory(1)); - - for (int thr_mb = 1; thr_mb < nthr_mb_; ++thr_mb) { - - size_t mb_accum_offset = (thr_mb - 1) * wei_size; - size_t b_accum_offset = (thr_mb - 1) * bias_size; + if (jcp.nthr_mb <= 1) return; - for (int g = 0; g < jcp.nb_ch; ++g) { - - /* Reduction on Bias */ - if (jcp.with_bias) { - PRAGMA_OMP_SIMD() - for (int g_block = 0; g_block < ch_block; ++g_block) { - size_t bias_offset = g * ch_block + g_block; - diff_bias[bias_offset] += bias_reduction_[b_accum_offset - + bias_offset]; - } + /* Apply single-threaded 'mb' reduction */ + for (int thr_mb = 1; thr_mb 
< jcp.nthr_mb; ++thr_mb) { + size_t mb_accum_offset = (thr_mb - 1) * wei_size; + size_t b_accum_offset = (thr_mb - 1) * bias_size; + + for (int g = 0; g < jcp.nb_ch; ++g) { + /* Reduction on Bias */ + if (jcp.with_bias) { + PRAGMA_OMP_SIMD() + for (int g_block = 0; g_block < ch_block; ++g_block) { + size_t bias_offset = g * ch_block + g_block; + diff_bias[bias_offset] += diff_bia_reduction_buf[ + b_accum_offset + bias_offset]; } - if (!do_parallel_reduction()) { - for (int kh = 0; kh < jcp.kh; ++kh) { - for (int kw = 0; kw < jcp.kw; ++kw) { - - size_t wei_offset = (g * jcp.kh + kh) * jcp.kw + kw; - PRAGMA_OMP_SIMD() - for (int g_block = 0; g_block < ch_block; ++g_block) { - diff_weights[wei_offset * ch_block + g_block] - += ws_reduction_[mb_accum_offset - + wei_offset * ch_block - + g_block]; - } - } - } + } + + if (do_parallel_reduction()) continue; + + for (int kh = 0; kh < jcp.kh; ++kh) + for (int kw = 0; kw < jcp.kw; ++kw) + { + size_t wei_offset = (g * jcp.kh + kh) * jcp.kw + kw; + PRAGMA_OMP_SIMD() + for (int g_block = 0; g_block < ch_block; ++g_block) { + const size_t off = wei_offset * ch_block + g_block; + diff_weights[off] += + diff_wei_reduction_buf[mb_accum_offset + off]; } } } } } -template _jit_uni_dw_convolution_bwd_weights_t:: - _jit_uni_dw_convolution_bwd_weights_t(const pd_t *pd, - const input_vector &inputs, const output_vector &outputs); -template _jit_uni_dw_convolution_bwd_weights_t:: - _jit_uni_dw_convolution_bwd_weights_t(const pd_t *pd, - const input_vector &inputs, const output_vector &outputs); -template _jit_uni_dw_convolution_bwd_weights_t:: - _jit_uni_dw_convolution_bwd_weights_t(const pd_t *pd, - const input_vector &inputs, const output_vector &outputs); - -template void _jit_uni_dw_convolution_bwd_weights_t:: - execute_backward_weights(); -template void _jit_uni_dw_convolution_bwd_weights_t:: - execute_backward_weights(); -template void _jit_uni_dw_convolution_bwd_weights_t:: - execute_backward_weights(); +template struct _jit_uni_dw_convolution_bwd_weights_t; +template struct _jit_uni_dw_convolution_bwd_weights_t; +template struct _jit_uni_dw_convolution_bwd_weights_t; } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_convolution.hpp index b723c1c..2f2cc7a 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_convolution.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_convolution.hpp @@ -18,54 +18,62 @@ #define CPU_JIT_UNI_DW_CONVOLUTION_HPP #include "c_types_map.hpp" +#include "memory_tracking.hpp" + +#include "cpu_barrier.hpp" #include "cpu_convolution_pd.hpp" -#include "cpu_engine.hpp" -#include "jit_primitive_conf.hpp" -#include "jit_uni_dw_conv_kernel_f32.hpp" #include "cpu_reducer.hpp" -#include "cpu_barrier.hpp" + +#include "jit_uni_dw_conv_kernel_f32.hpp" namespace mkldnn { namespace impl { namespace cpu { -template +template struct _jit_uni_dw_convolution_fwd_t: public cpu_primitive_t { - struct pd_t: public _cpu_convolution_fwd_pd_t { - pd_t(engine_t *engine, const typename pd_t::base_desc_t *adesc, + struct pd_t: public cpu_convolution_fwd_pd_t { + pd_t(engine_t *engine, const convolution_desc_t *adesc, const primitive_attr_t *attr, const typename pd_t::base_class *hint_fwd_pd) - : _cpu_convolution_fwd_pd_t(engine, adesc, attr, - hint_fwd_pd) + : cpu_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) , jcp_() {} DECLARE_COMMON_PD_T( JIT_IMPL_NAME_HELPER("jit_dw:", isa, ""), - _jit_uni_dw_convolution_fwd_t); + 
_jit_uni_dw_convolution_fwd_t); virtual status_t init() override { using namespace prop_kind; assert(this->engine()->kind() == engine_kind::cpu); bool ok = true && this->set_default_params() == status::success - && utils::one_of(this->cdesc_().prop_kind, forward_training, + && utils::one_of(this->desc()->prop_kind, forward_training, forward_inference) - && this->cdesc_().alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() && utils::everyone_is(data_type::f32, - this->cdesc_().src_desc.data_type, - this->cdesc_().weights_desc.data_type, - this->cdesc_().dst_desc.data_type) + this->desc()->src_desc.data_type, + this->desc()->weights_desc.data_type, + this->desc()->dst_desc.data_type) && IMPLICATION(this->with_bias(), - data_type::f32 == this->cdesc_().bias_desc.data_type); + data_type::f32 == this->desc()->bias_desc.data_type); if (!ok) return status::unimplemented; - return jit_uni_dw_conv_fwd_kernel_f32::init_conf(jcp_, - this->cdesc_(), - this->src_pd_.desc(), *this->weights_pd_.desc(), - *this->dst_pd_.desc(), *this->attr(), - with_relu, this->negative_slope()); + status_t status = jit_uni_dw_conv_fwd_kernel_f32::init_conf( + jcp_, *this->desc(), this->src_pd_.desc(), + *this->weights_pd_.desc(), *this->dst_pd_.desc(), + *this->attr()); + if (status != status::success) return status; + + auto scratchpad = scratchpad_registry().registrar(); + jit_uni_dw_conv_fwd_kernel_f32::init_scratchpad(scratchpad, + jcp_); + + return status::success; } jit_conv_conf_t jcp_; @@ -84,54 +92,37 @@ struct _jit_uni_dw_convolution_fwd_t: public cpu_primitive_t { CHECK(this->weights_pd_.set_format(desired_wei_fmt)); if (this->bias_pd_.desc()->format == any) CHECK(this->bias_pd_.set_format(x)); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } }; - _jit_uni_dw_convolution_fwd_t(const pd_t *pd, const input_vector &inputs, + _jit_uni_dw_convolution_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - , padded_bias_(nullptr) { - kernel_ = new jit_uni_dw_conv_fwd_kernel_f32(conf_.jcp_, *conf_.attr()); - if (conf_.want_padded_bias()) { - padded_bias_ = (float *)malloc(sizeof(float) * conf_.jcp_.oc, 64); - for (int c = conf_.jcp_.oc_without_padding; c < conf_.jcp_.oc; ++c) - padded_bias_[c] = 0; - } - } + : cpu_primitive_t(apd, inputs, outputs), kernel_(nullptr) + { kernel_ = new jit_uni_dw_conv_fwd_kernel_f32(pd()->jcp_, *pd()->attr()); } - ~_jit_uni_dw_convolution_fwd_t() { - delete kernel_; - free(padded_bias_); - } + ~_jit_uni_dw_convolution_fwd_t() { delete kernel_; } typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - pd_t conf_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + jit_uni_dw_conv_fwd_kernel_f32 *kernel_; - float *padded_bias_; }; using jit_avx512_common_dw_convolution_fwd_t = - _jit_uni_dw_convolution_fwd_t; -using jit_avx2_dw_convolution_fwd_t = - _jit_uni_dw_convolution_fwd_t; -using jit_sse42_dw_convolution_fwd_t = - _jit_uni_dw_convolution_fwd_t; - -using jit_avx512_common_dw_convolution_relu_t = - _jit_uni_dw_convolution_fwd_t; -using jit_avx2_dw_convolution_relu_t = - 
_jit_uni_dw_convolution_fwd_t; -using jit_sse42_dw_convolution_relu_t = - _jit_uni_dw_convolution_fwd_t; + _jit_uni_dw_convolution_fwd_t; +using jit_avx2_dw_convolution_fwd_t = _jit_uni_dw_convolution_fwd_t; +using jit_sse42_dw_convolution_fwd_t = _jit_uni_dw_convolution_fwd_t; template struct _jit_uni_dw_convolution_bwd_data_t: public cpu_primitive_t { @@ -156,7 +147,9 @@ struct _jit_uni_dw_convolution_bwd_data_t: public cpu_primitive_t { && this->set_default_params() == status::success && utils::one_of(this->desc()->prop_kind, backward, backward_data) - && this->desc()->alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_direct) && !this->has_zero_dim_memory() && utils::everyone_is(data_type::f32, this->desc()->diff_src_desc.data_type, @@ -165,16 +158,23 @@ struct _jit_uni_dw_convolution_bwd_data_t: public cpu_primitive_t { if (!ok) return status::unimplemented; - return jit_uni_dw_conv_bwd_data_kernel_f32::init_conf(jcp_, + status_t status = + jit_uni_dw_conv_bwd_data_kernel_f32::init_conf(jcp_, *this->desc(), *this->diff_src_pd_.desc(), *this->weights_pd_.desc(), *this->diff_dst_pd_.desc()); + if (status != status::success) return status; + + auto scratchpad = scratchpad_registry().registrar(); + jit_uni_dw_conv_bwd_data_kernel_f32::init_scratchpad( + scratchpad, jcp_); + + return status::success; } jit_conv_conf_t jcp_; protected: virtual status_t set_default_params() override { - using namespace memory_format; auto desired_act_fmt = isa == avx512_common ? nChw16c : nChw8c; auto desired_wei_fmt = isa == avx512_common ? Goihw16g : Goihw8g; @@ -185,21 +185,23 @@ struct _jit_uni_dw_convolution_bwd_data_t: public cpu_primitive_t { CHECK(this->diff_dst_pd_.set_format(desired_act_fmt)); if (this->weights_pd_.desc()->format == any) CHECK(this->weights_pd_.set_format(desired_wei_fmt)); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } }; - _jit_uni_dw_convolution_bwd_data_t(const pd_t *pd, + _jit_uni_dw_convolution_bwd_data_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - { kernel_ = new jit_uni_dw_conv_bwd_data_kernel_f32(conf_.jcp_); } + : cpu_primitive_t(apd, inputs, outputs) + { kernel_ = new jit_uni_dw_conv_bwd_data_kernel_f32(pd()->jcp_); } ~_jit_uni_dw_convolution_bwd_data_t() { delete kernel_; }; typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { - switch (conf_.desc()->prop_kind) { + virtual void execute(event_t *e) const { + switch (pd()->desc()->prop_kind) { case prop_kind::backward_data: execute_backward_data(); break; @@ -210,8 +212,9 @@ struct _jit_uni_dw_convolution_bwd_data_t: public cpu_primitive_t { } private: - void execute_backward_data(); - pd_t conf_; + void execute_backward_data() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + jit_uni_dw_conv_bwd_data_kernel_f32 *kernel_; }; @@ -243,7 +246,9 @@ struct _jit_uni_dw_convolution_bwd_weights_t: public cpu_primitive_t { bool ok = true && this->set_default_params() == status::success && this->desc()->prop_kind == prop_kind::backward_weights - && this->desc()->alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_direct) && utils::everyone_is(data_type::f32, this->desc()->src_desc.data_type, 
this->desc()->diff_weights_desc.data_type, @@ -251,16 +256,27 @@ struct _jit_uni_dw_convolution_bwd_weights_t: public cpu_primitive_t { if (!ok) return status::unimplemented; - return jit_uni_dw_conv_bwd_weights_kernel_f32::init_conf(jcp_, + const int max_threads = mkldnn_in_parallel() + ? 1 : mkldnn_get_max_threads(); + + status_t status = + jit_uni_dw_conv_bwd_weights_kernel_f32::init_conf(jcp_, *this->desc(), *this->src_pd_.desc(), - *this->diff_weights_pd_.desc(), *this->diff_dst_pd_.desc()); + *this->diff_weights_pd_.desc(), + *this->diff_dst_pd_.desc(), max_threads); + if (status != status::success) return status; + + auto scratchpad = scratchpad_registry().registrar(); + jit_uni_dw_conv_bwd_weights_kernel_f32::init_scratchpad( + scratchpad, jcp_); + + return status::success; } jit_conv_conf_t jcp_; protected: virtual status_t set_default_params() override { - using namespace memory_format; auto desired_act_fmt = isa == avx512_common ? nChw16c : nChw8c; auto desired_wei_fmt = isa == avx512_common ? Goihw16g : Goihw8g; @@ -273,49 +289,35 @@ struct _jit_uni_dw_convolution_bwd_weights_t: public cpu_primitive_t { CHECK(this->diff_weights_pd_.set_format(desired_wei_fmt)); if (this->diff_bias_pd_.desc()->format == any) CHECK(this->diff_bias_pd_.set_format(x)); + if (this->desc()->alg_kind == alg_kind::convolution_auto) + CHECK(this->set_alg_kind(alg_kind::convolution_direct)); return status::success; } }; - _jit_uni_dw_convolution_bwd_weights_t(const pd_t *pd, + _jit_uni_dw_convolution_bwd_weights_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs); + ~_jit_uni_dw_convolution_bwd_weights_t() { delete kernel_; - if (acc_ker_) - delete acc_ker_; - - free(ws_reduction_); - free(bias_reduction_); + delete acc_ker_; }; typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_backward_weights(); e->set_state(event_t::ready); } private: - void execute_backward_weights(); + void execute_backward_weights() const; + bool do_parallel_reduction() const { return false; } + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } - pd_t conf_; jit_uni_dw_conv_bwd_weights_kernel_f32 *kernel_; - - data_t *ws_reduction_ = nullptr; - data_t *bias_reduction_ = nullptr; - - /* Used when executing a parallel reduction */ - cpu_accumulator_1d_t *acc_ker_ = nullptr; - simple_barrier::ctx_t reduction_bctx_; - - /* For parallel implementation details see '.cpp' file in the - * backwards-by-wights section. 
*/ - int nthr_, nthr_g_, nthr_mb_; - - inline bool do_parallel_reduction(){ - return false; - } + cpu_accumulator_1d_t *acc_ker_; }; using jit_avx512_common_dw_convolution_bwd_weights_t = diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_eltwise.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_eltwise.cpp index 2896b1b..f659fdc 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_eltwise.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_eltwise.cpp @@ -32,21 +32,10 @@ namespace cpu { using namespace Xbyak; template -bool jit_uni_eltwise_injector_f32::is_free_vec(size_t idx) { - for (size_t i = 0; i < preserved_vecs_count; i++) { - if (preserved_vec_idxs[i] == idx) { - return false; - } - } - return true; -} - -template void jit_uni_eltwise_injector_f32::injector_preamble(size_t start_idx, size_t end_idx) { preserved_vecs_count = 0; - vecs_to_preserve = (size_t)jit_uni_eltwise_injector_f32:: - aux_vecs_count(elt_alg); + vecs_to_preserve = (size_t)aux_vecs_count(alg_); start_idx_tail = start_idx; // For sse42 mask register has to be Xmm(0) @@ -56,78 +45,80 @@ void jit_uni_eltwise_injector_f32::injector_preamble(size_t start_idx, preserved_vec_idxs[preserved_vecs_count++] = idx; } - for (size_t i = 0; i < vecs_count; i++) { - if (preserved_vecs_count >= vecs_to_preserve) - break; + for (size_t idx = preserved_vecs_count; idx < vecs_count; idx++) { + if (preserved_vecs_count >= vecs_to_preserve) break; + if (start_idx <= idx && idx < end_idx) continue; - size_t idx = i; - if (is_free_vec(idx) && (idx < start_idx || idx >= end_idx)) { - preserved_vec_idxs[preserved_vecs_count++] = idx; - } + preserved_vec_idxs[preserved_vecs_count++] = idx; } size_t preserved_vecs_count_tail = vecs_to_preserve - preserved_vecs_count; for (size_t i = 0; i < preserved_vecs_count_tail; i++) { - size_t idx = start_idx_tail; - if (is_free_vec(idx)) { - preserved_vec_idxs[preserved_vecs_count++] = idx; - start_idx_tail++; - } + preserved_vec_idxs[preserved_vecs_count++] = start_idx_tail++; } assert(preserved_vecs_count == vecs_to_preserve); - if (save_vecs_state) { + if (save_state_) { h->push(p_table); - h->sub(h->rsp, preserved_vecs_count * vlen); + if (preserved_vecs_count) + h->sub(h->rsp, preserved_vecs_count * vlen); + for (size_t i = 0; i < preserved_vecs_count; ++i) h->uni_vmovups(h->ptr[h->rsp + i * vlen], Vmm(preserved_vec_idxs[i])); + + load_table_addr(); } assign_regs(); } template -void jit_uni_eltwise_injector_f32::injector_preamble_tail( - size_t start_idx) { +void jit_uni_eltwise_injector_f32::injector_preamble_tail(size_t start_idx) +{ size_t tail_vecs_to_preserve = start_idx_tail - start_idx; - int idx_off = (vecs_to_preserve - tail_vecs_to_preserve); + if (tail_vecs_to_preserve == 0) return; + + const int idx_off = vecs_to_preserve - tail_vecs_to_preserve; - if (tail_vecs_to_preserve > 0) { - if (save_vecs_state) { + if (save_state_) { + if (idx_off) h->add(h->rsp, idx_off * vlen); - for (size_t i = 0; i < tail_vecs_to_preserve; ++i) - h->uni_vmovups(Vmm(preserved_vec_idxs[idx_off + i]), - h->ptr[h->rsp + i * vlen]); - } - for (size_t i = 0; i < tail_vecs_to_preserve; ++i) { - preserved_vec_idxs[idx_off + i] += tail_vecs_to_preserve; - } + for (size_t i = 0; i < tail_vecs_to_preserve; ++i) + h->uni_vmovups(Vmm(preserved_vec_idxs[idx_off + i]), + h->ptr[h->rsp + i * vlen]); + } - if (save_vecs_state) { - for (size_t i = 0; i < tail_vecs_to_preserve; ++i) - h->uni_vmovups(h->ptr[h->rsp + i * vlen], - Vmm(preserved_vec_idxs[idx_off + i])); - 
h->sub(h->rsp, idx_off * vlen); - } + for (size_t i = 0; i < tail_vecs_to_preserve; ++i) + preserved_vec_idxs[idx_off + i] += tail_vecs_to_preserve; + + if (save_state_) { + for (size_t i = 0; i < tail_vecs_to_preserve; ++i) + h->uni_vmovups(h->ptr[h->rsp + i * vlen], + Vmm(preserved_vec_idxs[idx_off + i])); - assign_regs(); + if (idx_off) + h->sub(h->rsp, idx_off * vlen); } + + assign_regs(); } template void jit_uni_eltwise_injector_f32::injector_postamble() { - if (save_vecs_state) { - for (size_t i = 0; i < preserved_vecs_count; ++i) - h->uni_vmovups(Vmm(preserved_vec_idxs[i]), - h->ptr[h->rsp + i * vlen]); + if (!save_state_) return; + + for (size_t i = 0; i < preserved_vecs_count; ++i) + h->uni_vmovups(Vmm(preserved_vec_idxs[i]), + h->ptr[h->rsp + i * vlen]); + + if (preserved_vecs_count) h->add(h->rsp, preserved_vecs_count * vlen); - h->pop(p_table); - } + h->pop(p_table); } template @@ -137,33 +128,26 @@ void jit_uni_eltwise_injector_f32::assign_regs() { vmm_aux1 = Vmm(preserved_vec_idxs[1]); vmm_aux2 = Vmm(preserved_vec_idxs[2]); vmm_aux3 = Vmm(preserved_vec_idxs[3]); - - p_table = Xbyak::Reg64(table_reg_idx); - k_mask = Xbyak::Opmask(opmask_idx); + vmm_aux4 = Vmm(preserved_vec_idxs[4]); } template void jit_uni_eltwise_injector_f32::exp_compute_vector(const Vmm &vmm_src) { - const unsigned char _op_floor = 1; - - h->uni_vminps(vmm_src, vmm_src, h->ptr[p_table + 10 * vlen]); - h->uni_vmaxps(vmm_src, vmm_src, h->ptr[p_table + 11 * vlen]); + h->uni_vminps(vmm_src, vmm_src, table_val(10)); + h->uni_vmaxps(vmm_src, vmm_src, table_val(11)); h->uni_vmovups(vmm_aux0, vmm_src); //calculate exp(x) // fx = x * log2ef + 0.5 - h->uni_vmulps(vmm_src, vmm_src, h->ptr[p_table + 2 * vlen]); - h->uni_vaddps(vmm_src, vmm_src, h->ptr[p_table + 1 * vlen]); + h->uni_vmulps(vmm_src, vmm_src, table_val(2)); + h->uni_vaddps(vmm_src, vmm_src, table_val(1)); // tmp = floorf(fx) if (isa == avx512_common) { h->vcvtps2dq(vmm_aux1 | h->T_rd_sae, vmm_src); h->vcvtdq2ps(vmm_aux1, vmm_aux1); - unsigned char _cmp_gt_os = 14; - Xbyak::Opmask k_mask_tmp = Xbyak::Opmask(2); - h->vcmpps(k_mask_tmp, vmm_aux1, vmm_src, _cmp_gt_os); - h->vmovups(vmm_aux3 | k_mask_tmp | h->T_z, - h->zword[p_table + 0 * vlen]); + h->vcmpps(k_mask, vmm_aux1, vmm_src, _cmp_nle_us); + h->vmovups(vmm_aux3 | k_mask | h->T_z, table_val(0)); h->uni_vsubps(vmm_aux1, vmm_aux1, vmm_aux3); } else { @@ -174,105 +158,213 @@ void jit_uni_eltwise_injector_f32::exp_compute_vector(const Vmm &vmm_src) { h->uni_vmovups(vmm_src, vmm_aux1); //vmm_src = fx //x = x - fx * ln2 - h->uni_vfnmadd231ps(vmm_aux0, vmm_aux1, h->ptr[p_table + 3 * vlen]); + h->uni_vfnmadd231ps(vmm_aux0, vmm_aux1, table_val(3)); // compute 2^n h->uni_vcvtps2dq(vmm_aux1, vmm_src); - h->uni_vpaddd(vmm_aux1, vmm_aux1, h->ptr[p_table + 4 * vlen]); + h->uni_vpaddd(vmm_aux1, vmm_aux1, table_val(4)); h->uni_vpslld(vmm_aux1, vmm_aux1, 23); //Vmm(6) = 2^-fx // y = p5 - h->uni_vmovups(vmm_src, h->ptr[p_table + 9 * vlen]); + h->uni_vmovups(vmm_src, table_val(9)); // y = y * x + p4 - h->uni_vfmadd213ps(vmm_src, vmm_aux0, h->ptr[p_table + 8 * vlen]); + h->uni_vfmadd213ps(vmm_src, vmm_aux0, table_val(8)); // y = y * x + p3 - h->uni_vfmadd213ps(vmm_src, vmm_aux0, h->ptr[p_table + 7 * vlen]); + h->uni_vfmadd213ps(vmm_src, vmm_aux0, table_val(7)); // y = y * x + p2 - h->uni_vfmadd213ps(vmm_src, vmm_aux0, h->ptr[p_table + 6 * vlen]); + h->uni_vfmadd213ps(vmm_src, vmm_aux0, table_val(6)); // y = y * x + p1 - h->uni_vfmadd213ps(vmm_src, vmm_aux0, h->ptr[p_table + 0 * vlen]); + h->uni_vfmadd213ps(vmm_src, 
vmm_aux0, table_val(0)); // y = y * x + p0 - h->uni_vfmadd213ps(vmm_src, vmm_aux0, h->ptr[p_table + 5 * vlen]); //exp(q) + h->uni_vfmadd213ps(vmm_src, vmm_aux0, table_val(5)); //exp(q) // y = y * 2^n h->uni_vmulps(vmm_src, vmm_src, vmm_aux1); } template -void jit_uni_eltwise_injector_f32::relu_compute_vector( - const Vmm &vmm_src) { - unsigned char _cmp_gt_os = isa == avx512_common ? 14 : 6; - - int alpha_off = 0 * vlen; - int zero_off = 1 * vlen; +void jit_uni_eltwise_injector_f32::relu_compute_vector(const Vmm &vmm_src) +{ + const int alpha_off = 0, zero_off = 1; h->uni_vmovups(vmm_aux1, vmm_src); if (isa == sse42) { h->movups(vmm_mask, vmm_src); - h->mulps(vmm_src, h->ptr[p_table + alpha_off]); - h->cmpps(vmm_mask, h->ptr[p_table + zero_off], _cmp_gt_os); + h->mulps(vmm_src, table_val(alpha_off)); + h->cmpps(vmm_mask, table_val(zero_off), _cmp_nle_us); h->blendvps(vmm_src, vmm_aux1); } else if (isa == avx2) { - h->vmulps(vmm_src, vmm_src, h->ptr[p_table + alpha_off]); - h->vcmpgtps(vmm_mask, vmm_aux1, h->ptr[p_table + zero_off]); + h->vmulps(vmm_src, vmm_src, table_val(alpha_off)); + h->vcmpgtps(vmm_mask, vmm_aux1, table_val(zero_off)); h->vblendvps(vmm_src, vmm_src, vmm_aux1, vmm_mask); } else if (isa == avx512_common) { - h->vmulps(vmm_src, vmm_src, h->ptr[p_table + alpha_off]); - h->vcmpps(k_mask, vmm_aux1, h->ptr[p_table + zero_off], _cmp_gt_os); - h->vblendmps(vmm_src | k_mask, vmm_src, - vmm_aux1); + h->vmulps(vmm_src, vmm_src, table_val(alpha_off)); + h->vcmpps(k_mask, vmm_aux1, table_val(zero_off), _cmp_nle_us); + h->vblendmps(vmm_src | k_mask, vmm_src, vmm_aux1); } } template void jit_uni_eltwise_injector_f32::relu_zero_ns_compute_vector( const Vmm &vmm_src) { - int zero_off = 1 * vlen; - h->uni_vmaxps(vmm_src, vmm_src, h->ptr[p_table + zero_off]); + const int zero_off = 1; + h->uni_vmaxps(vmm_src, vmm_src, table_val(zero_off)); } template void jit_uni_eltwise_injector_f32::elu_compute_vector(const Vmm &vmm_src) { - const unsigned char _cmp_gt_os = 6; - const unsigned char _cmp_let_os = 2; - int alpha_off = 12 * vlen; - int zero_off = 13 * vlen; + const int alpha_off = 23, zero_off = 24; // compute exponent h->uni_vmovups(vmm_aux2, vmm_src); exp_compute_vector(vmm_src); // alpha * (exp(x) - 1) - h->uni_vsubps(vmm_src, vmm_src, h->ptr[p_table + 0 * 32]); - h->uni_vmulps(vmm_src, vmm_src, h->ptr[p_table + alpha_off]); + h->uni_vsubps(vmm_src, vmm_src, table_val(0)); + h->uni_vmulps(vmm_src, vmm_src, table_val(alpha_off)); // combine with mask if (isa == sse42) { h->pxor(vmm_mask, vmm_mask); - h->cmpps(vmm_mask, vmm_aux2, _cmp_let_os); + h->cmpps(vmm_mask, vmm_aux2, _cmp_le_os); h->blendvps(vmm_src, vmm_aux2); } else if (isa == avx2) { - h->uni_vcmpgtps(vmm_mask, vmm_aux2, h->ptr[p_table + zero_off]); + h->uni_vcmpgtps(vmm_mask, vmm_aux2, table_val(zero_off)); h->uni_vblendvps(vmm_src, vmm_src, vmm_aux2, vmm_mask); } else if (isa == avx512_common) { - h->vcmpps(k_mask, vmm_aux2, h->ptr[p_table + zero_off], _cmp_gt_os); + h->vcmpps(k_mask, vmm_aux2, table_val(zero_off), _cmp_nle_us); h->vblendmps(vmm_src | k_mask, vmm_src, vmm_aux2); } } template -void jit_uni_eltwise_injector_f32::tanh_compute_vector( - const Vmm &vmm_src) { - // compute exp(2x) - h->uni_vaddps(vmm_src, vmm_src, vmm_src); - exp_compute_vector(vmm_src); - // dup exp(2x) - h->uni_vmovups(vmm_aux0, vmm_src); - // (exp(2x) - 1) - h->uni_vsubps(vmm_src, vmm_src, h->ptr[p_table + 0 * vlen]); - // (exp(2x) + 1) - h->uni_vaddps(vmm_aux0, vmm_aux0, h->ptr[p_table + 0 * vlen]); - // y = (exp(2x) - 1) / (exp(2x) + 1) - 
h->uni_vdivps(vmm_src, vmm_src, vmm_aux0);
+void jit_uni_eltwise_injector_f32::tanh_compute_vector(const Vmm &vmm_src)
+{
+    // # comes from Taylor expansion error bound
+    //  > linear_sat_point = single(sqrt(3) * 1b-12);
+    // # comes from the exp formula cancellation
+    //  > exp_bound_point = (single(log(3)/2));
+    // # comes from rounding accuracy in float
+    //  > one_sat_point = round(atanh(1 - 1b-25), single, RU);
+    //  > P = fpminimax(f, [|1, 3, 5, 7, 9|], [|24... |],
+    //            [linear_sat_point, exp_bound_point], relative, floating);
+    //  > err_bound = D(sup(supnorm(P, tanh(x),
+    //          [linear_sat_point, exp_bound_point], relative, theta)));
+    //    0x1.fffd6f00b9539p-25
+    //  > P;
+    //    x * (0x1.fffffep-1 + x^0x1p1 * (-0x1.55539ep-2 + x^0x1p1 *
+    //        (0x1.10be3ep-3 + x^0x1p1 * (-0x1.ae57b4p-5
+    //            + x^0x1p1 * 0x1.09fa1p-6))))
+
+    // register mapping
+    // vmm_src contains input
+    // vmm_aux0 contains mask of currently valid results:
+    //     1 means computation is still needed, 0 means already computed
+    // vmm_aux1 contains current output
+    // vmm_aux2, vmm_aux3 contain auxiliary values
+    // vmm_aux4 contains the original sign of inputs
+
+    Label end_tanh_label;
+
+    auto test_exit = [&](Xbyak::Address threshold) {
+        // the mov is not necessary for >AVX, but should not matter for
+        // performance
+        h->uni_vmovups(vmm_aux0, vmm_src);
+        if (isa == avx512_common) {
+            h->vcmpps(k_mask, vmm_aux0, threshold, 0x5);
+            h->kortestw(k_mask, k_mask);
+        } else {
+            h->uni_vcmpgeps(vmm_aux0, vmm_aux0, threshold);
+            h->uni_vtestps(vmm_aux0, vmm_aux0);
+        }
+        h->jz(end_tanh_label, Xbyak::CodeGenerator::T_NEAR);
+    };
+
+    auto blend_results = [&](Vmm vmm_partial_res) {
+        if (isa == avx512_common)
+            h->vblendmps(vmm_aux1 | k_mask, vmm_aux1, vmm_partial_res);
+        else
+            h->uni_vblendvps(vmm_aux1, vmm_aux1, vmm_partial_res, vmm_aux0);
+    };
+
+    // because tanh(x) = -tanh(-x), we extract the sign to make x positive
+    // and reapply the sign at the end
+    // the mov is not necessary for >AVX, but should not matter for performance
+    h->uni_vmovups(vmm_aux4, vmm_src);
+    h->uni_vandps(vmm_aux4, vmm_aux4, table_val(12));
+    h->uni_vandps(vmm_src, vmm_src, table_val(17));
+
+    // if x < linear_sat_point for all inputs, we just return the input
+    h->uni_vmovups(vmm_aux1, vmm_src);
+    test_exit(table_val(13));
+
+    // if any mask bit is set, we have to compute a better approximation
+    h->uni_vmovups(vmm_aux2, vmm_src);
+    h->uni_vmulps(vmm_aux2, vmm_aux2, vmm_aux2);
+    h->uni_vmovups(vmm_aux3, table_val(22));
+    h->uni_vfmadd213ps(vmm_aux3, vmm_aux2, table_val(21));
+    h->uni_vfmadd213ps(vmm_aux3, vmm_aux2, table_val(20));
+    h->uni_vfmadd213ps(vmm_aux3, vmm_aux2, table_val(19));
+    h->uni_vfmadd213ps(vmm_aux3, vmm_aux2, table_val(18));
+    h->uni_vmulps(vmm_aux3, vmm_aux3, vmm_src);
+
+    // we blend only the results that need updating
+    blend_results(vmm_aux3);
+
+    // if x < exp_bound_point, we go to the return point
+    test_exit(table_val(14));
+
+    // otherwise we use a better approximation: 1 - 2 / (1 + exp(2x))
+    // compute 2x
+    h->uni_vmovups(vmm_aux3, vmm_src);
+    h->uni_vaddps(vmm_aux3, vmm_aux3, vmm_aux3);
+
+    // Compute exp(2x)
+    // We need to save kmask, vmm_aux0, vmm_aux1 and vmm_src as exp can use them
+    // vmm_src is no longer read afterwards, so we do not have to save it
+    auto stack_size = 3 * vlen + (isa == avx512_common) * 4;
+    h->sub(h->rsp, stack_size);
+    h->uni_vmovups(h->ptr[h->rsp + 0 * vlen], vmm_aux0);
+    h->uni_vmovups(h->ptr[h->rsp + 1 * vlen], vmm_aux1);
+    h->uni_vmovups(h->ptr[h->rsp + 2 * vlen], vmm_src);
+    if (isa == avx512_common)
+        h->kmovw(h->ptr[h->rsp + 3 * vlen], k_mask);
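+
+    // For reference, a scalar model of the polynomial branch above, using
+    // the minimax coefficients quoted in the header comment; the mapping
+    // of coefficients to table indices is inferred from the fma sequence
+    // (an illustrative sketch, not kernel code):
+    //     float tanh_poly(float x) {
+    //         float x2 = x * x;
+    //         float p = 0x1.09fa1p-6f;       // table_val(22)
+    //         p = p * x2 + -0x1.ae57b4p-5f;  // table_val(21)
+    //         p = p * x2 + 0x1.10be3ep-3f;   // table_val(20)
+    //         p = p * x2 + -0x1.55539ep-2f;  // table_val(19)
+    //         p = p * x2 + 0x1.fffffep-1f;   // table_val(18)
+    //         return x * p;
+    //     }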
+ exp_compute_vector(vmm_aux3);
+
+ h->uni_vmovups(vmm_aux0, h->ptr[h->rsp + 0 * vlen]);
+ h->uni_vmovups(vmm_aux1, h->ptr[h->rsp + 1 * vlen]);
+ h->uni_vmovups(vmm_src, h->ptr[h->rsp + 2 * vlen]);
+ if (isa == avx512_common)
+ h->kmovw(k_mask, h->ptr[h->rsp + 3 * vlen]);
+ h->add(h->rsp, stack_size);
+
+ // 1 + exp(2x)
+ h->uni_vaddps(vmm_aux3, vmm_aux3, table_val(0));
+
+ // 1 - 2 / (1 + exp(2x))
+ h->uni_vmovups(vmm_aux2, table_val(16));
+ h->uni_vdivps(vmm_aux2, vmm_aux2, vmm_aux3);
+ h->uni_vaddps(vmm_aux2, vmm_aux2, table_val(0));
+
+ // we blend only the results that need updating
+ blend_results(vmm_aux2);
+
+ // finally, we saturate to 1 if needed
+ // TODO: maybe move that up if most inputs saturate in practice
+ if (isa == avx512_common)
+ h->vcmpps(k_mask, vmm_aux0, table_val(15), 0x5);
+ else {
+ h->uni_vmovups(vmm_aux0, vmm_src);
+ h->uni_vcmpgeps(vmm_aux0, vmm_aux0, table_val(15));
+ }
+ h->uni_vmovups(vmm_aux2, table_val(0));
+ blend_results(vmm_aux2);
+
+ h->L(end_tanh_label);
+ {
+ // we apply the sign of x to the result and we are done
+ h->uni_vmovups(vmm_src, vmm_aux1);
+ h->uni_vpxor(vmm_src, vmm_src, vmm_aux4);
+ }
 }
 
 template
@@ -284,24 +376,22 @@ void jit_uni_eltwise_injector_f32::square_compute_vector(
 template
 void jit_uni_eltwise_injector_f32::abs_compute_vector(const Vmm &vmm_src) {
 // compute abs(x) = _mm_and_ps(x, 01111..111)
- h->uni_vandps(vmm_src, vmm_src, h->ptr[p_table + 0*vlen]);
+ h->uni_vandps(vmm_src, vmm_src, table_val(0));
 }
 
 template
-void jit_uni_eltwise_injector_f32::sqrt_compute_vector(
- const Vmm &vmm_src) {
+void jit_uni_eltwise_injector_f32::sqrt_compute_vector(const Vmm &vmm_src)
+{
 if (isa == avx512_common) {
- unsigned char _cmp_gt_os = 6;
-
- h->vcmpps(k_mask, vmm_src, h->ptr[p_table + 0 * vlen], _cmp_gt_os);
+ h->vcmpps(k_mask, vmm_src, table_val(0), _cmp_nle_us);
 h->uni_vsqrtps(vmm_aux1, vmm_src);
- h->uni_vmovups(vmm_src, h->ptr[p_table + 0*vlen]);
+ h->uni_vmovups(vmm_src, table_val(0));
 h->vblendmps(vmm_src | k_mask, vmm_src, vmm_aux1);
 } else {
 h->uni_vmovups(vmm_mask, vmm_src);
- h->uni_vcmpgtps(vmm_mask, vmm_mask, h->ptr[p_table + 0*vlen]);
+ h->uni_vcmpgtps(vmm_mask, vmm_mask, table_val(0));
 h->uni_vsqrtps(vmm_aux1, vmm_src);
- h->uni_vmovups(vmm_src, h->ptr[p_table + 0*vlen]);
+ h->uni_vmovups(vmm_src, table_val(0));
 h->uni_vblendvps(vmm_src, vmm_src, vmm_aux1, vmm_mask);
 }
 }
@@ -310,48 +400,39 @@ template
 void jit_uni_eltwise_injector_f32::linear_compute_vector(
 const Vmm &vmm_src) {
 // compute x = alpha * x + beta;
- h->uni_vmovups(vmm_aux0, h->ptr[p_table + 0*vlen]);
- h->uni_vfmadd213ps(vmm_src, vmm_aux0, h->ptr[p_table + 1*vlen]);
+ h->uni_vmovups(vmm_aux0, table_val(0));
+ h->uni_vfmadd213ps(vmm_src, vmm_aux0, table_val(1));
 }
 
 template
 void jit_uni_eltwise_injector_f32::bounded_relu_compute_vector(
 const Vmm &vmm_src) {
 // compute bounded relu
- h->uni_vmaxps(vmm_src, vmm_src, h->ptr[p_table + 1*vlen]);
- h->uni_vminps(vmm_src, vmm_src, h->ptr[p_table + 0*vlen]);
-}
-
-template
-void jit_uni_eltwise_injector_f32::clamp_compute_vector(
- const Vmm &vmm_src) {
- h->uni_vmaxps(vmm_src, vmm_src, h->ptr[p_table + 1*vlen]);
- h->uni_vminps(vmm_src, vmm_src, h->ptr[p_table + 0*vlen]);
+ h->uni_vmaxps(vmm_src, vmm_src, table_val(1));
+ h->uni_vminps(vmm_src, vmm_src, table_val(0));
 }
 
 template
 void jit_uni_eltwise_injector_f32::soft_relu_compute_vector(
 const Vmm &vmm_src) {
- const unsigned char _op_floor = 1;
 // duplicate src
 h->uni_vmovups(vmm_aux2, vmm_src);
- h->uni_vminps(vmm_src, vmm_src, h->ptr[p_table + 24 * vlen]);
- 
h->uni_vmaxps(vmm_src, vmm_src, h->ptr[p_table + 25 * vlen]); + h->uni_vminps(vmm_src, vmm_src, table_val(24)); + h->uni_vmaxps(vmm_src, vmm_src, table_val(25)); h->uni_vmovups(vmm_aux1, vmm_src); // calculate exp(x) // fx = x * log2ef + 0.5 - h->uni_vmulps(vmm_src, vmm_src, h->ptr[p_table + 2 * vlen]); - h->uni_vaddps(vmm_src, vmm_src, h->ptr[p_table + 1 * vlen]); + h->uni_vmulps(vmm_src, vmm_src, table_val(2)); + h->uni_vaddps(vmm_src, vmm_src, table_val(1)); // tmp = floorf(fx) if (isa == avx512_common) { h->vcvtps2dq(vmm_aux0 | h->T_rd_sae, vmm_src); h->vcvtdq2ps(vmm_aux0, vmm_aux0); - unsigned char _cmp_gt_os = 14; - h->vcmpps(k_mask, vmm_aux0, vmm_src, _cmp_gt_os); - h->vmovups(vmm_aux3 | k_mask | h->T_z, h->ptr[p_table + 0 * vlen]); + h->vcmpps(k_mask, vmm_aux0, vmm_src, _cmp_nle_us); + h->vmovups(vmm_aux3 | k_mask | h->T_z, table_val(0)); h->vsubps(vmm_aux0, vmm_aux0, vmm_aux3); } else { @@ -361,32 +442,32 @@ void jit_uni_eltwise_injector_f32::soft_relu_compute_vector( // keep fx for further computations h->uni_vmovups(vmm_src, vmm_aux0); //vmm_src = fx // calculation fx * ln2 - h->uni_vmulps(vmm_aux0, vmm_aux0, h->ptr[p_table + 3 * vlen]); + h->uni_vmulps(vmm_aux0, vmm_aux0, table_val(3)); // x = x - fx * ln2 h->uni_vsubps(vmm_aux1, vmm_aux1, vmm_aux0); // y = p5 - h->uni_vmovups(vmm_aux3, h->ptr[p_table + 22 * vlen]); + h->uni_vmovups(vmm_aux3, table_val(22)); // y = y * x + p4 - h->uni_vfmadd213ps(vmm_aux3, vmm_aux1, h->ptr[p_table + 21 * vlen]); + h->uni_vfmadd213ps(vmm_aux3, vmm_aux1, table_val(21)); // y = y * x + p3 - h->uni_vfmadd213ps(vmm_aux3, vmm_aux1, h->ptr[p_table + 20 * vlen]); + h->uni_vfmadd213ps(vmm_aux3, vmm_aux1, table_val(20)); // y = y * x + p2 - h->uni_vfmadd213ps(vmm_aux3, vmm_aux1, h->ptr[p_table + 19 * vlen]); + h->uni_vfmadd213ps(vmm_aux3, vmm_aux1, table_val(19)); // y = y * x + p1 - h->uni_vfmadd213ps(vmm_aux3, vmm_aux1, h->ptr[p_table + 0 * vlen]); + h->uni_vfmadd213ps(vmm_aux3, vmm_aux1, table_val(0)); // y = y * x + p0 - h->uni_vfmadd213ps(vmm_aux3, vmm_aux1, h->ptr[p_table + 17 * vlen]); + h->uni_vfmadd213ps(vmm_aux3, vmm_aux1, table_val(17)); // compute 2^(-n) if (isa == avx512_common) { - h->vmulps(vmm_aux1, vmm_src, h->ptr[p_table + 23 * vlen]); + h->vmulps(vmm_aux1, vmm_src, table_val(23)); h->vcvtps2dq(vmm_aux1, vmm_aux1); } else { h->uni_vcvtps2dq(vmm_aux1, vmm_src); - h->uni_vpsignd(vmm_aux1, vmm_aux1, h->ptr[p_table + 23 * vlen]); + h->uni_vpsignd(vmm_aux1, vmm_aux1, table_val(23)); } - h->uni_vpaddd(vmm_aux1, vmm_aux1, h->ptr[p_table + 4 * vlen]); + h->uni_vpaddd(vmm_aux1, vmm_aux1, table_val(4)); h->uni_vpslld(vmm_aux1, vmm_aux1, 23); //vmm_aux1 = 2^-fx // calculate ln(1 + y) h->uni_vaddps(vmm_aux3, vmm_aux3, vmm_aux1); @@ -396,46 +477,45 @@ void jit_uni_eltwise_injector_f32::soft_relu_compute_vector( h->uni_vpsrld(vmm_src, vmm_src, 23); h->uni_vcvtdq2ps(vmm_src, vmm_src); // got n. where n is x = 2^n * y. y = 0.5 .. 1 - h->uni_vsubps(vmm_src, vmm_src, h->ptr[p_table + 5 * vlen]); + h->uni_vsubps(vmm_src, vmm_src, table_val(5)); - h->uni_vandps(vmm_aux3, vmm_aux3, h->ptr[p_table + 6 * vlen]); + h->uni_vandps(vmm_aux3, vmm_aux3, table_val(6)); // got y. 
(mantissa) 0.5 < y < 1
- h->uni_vorps(vmm_aux3, vmm_aux3, h->ptr[p_table + 7 * vlen]);
+ h->uni_vorps(vmm_aux3, vmm_aux3, table_val(7));
 // y = y - 1
- h->uni_vsubps(vmm_aux3, vmm_aux3, h->ptr[p_table + 0 * vlen]);
+ h->uni_vsubps(vmm_aux3, vmm_aux3, table_val(0));
 // y = p8
- h->uni_vmovups(vmm_aux1, h->ptr[p_table + 16 * vlen]);
+ h->uni_vmovups(vmm_aux1, table_val(16));
 // y = y * x + p7
- h->uni_vfmadd213ps(vmm_aux1, vmm_aux3, h->ptr[p_table + 15 * vlen]);
+ h->uni_vfmadd213ps(vmm_aux1, vmm_aux3, table_val(15));
 // y = y * x + p6
- h->uni_vfmadd213ps(vmm_aux1, vmm_aux3, h->ptr[p_table + 14 * vlen]);
+ h->uni_vfmadd213ps(vmm_aux1, vmm_aux3, table_val(14));
 // y = y * x + p5
- h->uni_vfmadd213ps(vmm_aux1, vmm_aux3, h->ptr[p_table + 13 * vlen]);
+ h->uni_vfmadd213ps(vmm_aux1, vmm_aux3, table_val(13));
 // y = y * x + p4
- h->uni_vfmadd213ps(vmm_aux1, vmm_aux3, h->ptr[p_table + 12 * vlen]);
+ h->uni_vfmadd213ps(vmm_aux1, vmm_aux3, table_val(12));
 // y = y * x + p3
- h->uni_vfmadd213ps(vmm_aux1, vmm_aux3, h->ptr[p_table + 11 * vlen]);
+ h->uni_vfmadd213ps(vmm_aux1, vmm_aux3, table_val(11));
 // y = y * x + p2
- h->uni_vfmadd213ps(vmm_aux1, vmm_aux3, h->ptr[p_table + 10 * vlen]);
+ h->uni_vfmadd213ps(vmm_aux1, vmm_aux3, table_val(10));
 // y = y * x + p1
- h->uni_vfmadd213ps(vmm_aux1, vmm_aux3, h->ptr[p_table + 9 * vlen]);
+ h->uni_vfmadd213ps(vmm_aux1, vmm_aux3, table_val(9));
 // y = y * x + p0 ; p0 = 0
- h->uni_vfmadd213ps(vmm_aux1, vmm_aux3, h->ptr[p_table + 8 * vlen]);
+ h->uni_vfmadd213ps(vmm_aux1, vmm_aux3, table_val(8));
 // calculate ln(2) * n
- h->uni_vmulps(vmm_src, vmm_src, h->ptr[p_table + 3 * vlen]);
+ h->uni_vmulps(vmm_src, vmm_src, table_val(3));
 h->uni_vaddps(vmm_aux1, vmm_aux1, vmm_src);
 h->uni_vaddps(vmm_aux1, vmm_aux1, vmm_aux0);
 
 // get vmm_mask = src > max logf
 h->uni_vmovups(vmm_mask, vmm_aux2);
 if (isa == avx512_common) {
- unsigned char _cmp_gt_os = 6;
 // y = (x < max log f) ? soft_relu(x) : x
- h->vcmpps(k_mask, vmm_mask, h->ptr[p_table + 24 * vlen], _cmp_gt_os);
+ h->vcmpps(k_mask, vmm_mask, table_val(24), _cmp_nle_us);
 h->vblendmps(vmm_aux1 | k_mask, vmm_aux1, vmm_aux2);
 } else {
 // y = (x < max log f) ? soft_relu(x) : x
- h->uni_vcmpgtps(vmm_mask, vmm_mask, h->ptr[p_table + 24 * vlen]);
+ h->uni_vcmpgtps(vmm_mask, vmm_mask, table_val(24));
 h->uni_vblendvps(vmm_aux1, vmm_aux1, vmm_aux2, vmm_mask);
 }
@@ -445,23 +525,46 @@ void jit_uni_eltwise_injector_f32::soft_relu_compute_vector(
 template
 void jit_uni_eltwise_injector_f32::logistic_compute_vector(
 const Vmm &vmm_src) {
+ // we store the original sign and make x negative
+ // IMPORTANT: we assume vmm_aux0 to be xmm0, as for sse4.2 path it is required
+ // IMPORTANT: we use vmm_aux2 for the mask as exp_compute does not use it.
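The symmetry trick used below, in scalar form (an illustration, not part of the patch): exp() is only ever evaluated on a non-positive argument, which avoids overflow, and sigmoid(x) for x > 0 is recovered as 1 - sigmoid(-x):

    #include <cmath>

    static float logistic_sketch(float x) {
        const bool was_negative = std::signbit(x);
        const float e = std::exp(-std::fabs(x));  // exp of a non-positive value
        const float s = e / (e + 1.0f);           // sigmoid(-|x|)
        return was_negative ? s : 1.0f - s;
    }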
+ h->uni_vmovups(vmm_aux2, vmm_src);
+ h->uni_vandps(vmm_aux2, vmm_aux2, table_val(12));
+ h->uni_vorps(vmm_src, vmm_src, table_val(12));
+
 exp_compute_vector(vmm_src);
 // dup exp(x)
- h->uni_vmovups(vmm_aux0, vmm_src);
+ h->uni_vmovups(vmm_aux1, vmm_src);
 // (exp(x) + 1)
- h->uni_vaddps(vmm_aux0, vmm_aux0, h->ptr[p_table + 0 * vlen]);
+ h->uni_vaddps(vmm_aux1, vmm_aux1, table_val(0));
 // y = exp(x) / (exp(x) + 1)
- h->uni_vdivps(vmm_src, vmm_src, vmm_aux0);
+ h->uni_vdivps(vmm_src, vmm_src, vmm_aux1);
+
+ // Now we have to apply the "symmetry" based on original sign
+ h->uni_vmovups(vmm_aux3, table_val(0));
+ h->uni_vsubps(vmm_aux3, vmm_aux3, vmm_src);
+ if (isa == avx512_common) {
+ h->vptestmd(k_mask, vmm_aux2, vmm_aux2);
+ h->vblendmps(vmm_aux3 | k_mask, vmm_aux3, vmm_src);
+ } else {
+ h->uni_vmovups(vmm_aux0, vmm_aux2); // the mask should be xmm0 for sse4.2
+ h->uni_vblendvps(vmm_aux3, vmm_aux3, vmm_src, vmm_aux0);
+ }
+ h->uni_vmovups(vmm_src, vmm_aux3);
+}
+
+template
+void jit_uni_eltwise_injector_f32::clamp_compute_vector(
+ const Vmm &vmm_src) {
+ // compute clamp
+ h->uni_vmaxps(vmm_src, vmm_src, table_val(1));
+ h->uni_vminps(vmm_src, vmm_src, table_val(0));
 }
 
 template
 void jit_uni_eltwise_injector_f32::relu_prepare_table() {
- for (size_t d = 0; d < vlen / sizeof(float); ++d) {
- h->dd(float2int(alpha));
- }
- for (size_t d = 0; d < vlen / sizeof(float); ++d) {
- h->dd(0);
- }
+ for (size_t d = 0; d < vlen / sizeof(float); ++d) h->dd(float2int(alpha_));
+ for (size_t d = 0; d < vlen / sizeof(float); ++d) h->dd(0);
 }
 
 template
@@ -479,20 +582,28 @@ void jit_uni_eltwise_injector_f32::elu_prepare_table() {
 0x3d2bb1b1, // [8] p4 = 0.041917507f
 0x3c091ec1, // [9] p5 = 0.008369149f
 0x42b0c0a5, //[10] max logf = 88.3762589f
- 0xc1766666 //[11] min logf = -14.5f
+ 0xc1766666, //[11] min logf = -14.5f
+ // tanh(x) constants
+ 0x80000000, //[12] mask to extract sign
+ 0x39ddb3d7, //[13] arg below which tanh(x) = x
+ 0x3f0c9f54, //[14] arg below which pol approx is valid
+ 0x41102cb4, //[15] arg after which tanh(x) = 1
+ 0xc0000000, //[16] -2.0f
+ 0x7fffffff, //[17] mask to make positive
+ // tanh pol approx
+ 0x3f7fffff, //[18] p0
+ 0xbeaaa9cf, //[19] p1
+ 0x3e085f1f, //[20] p2
+ 0xbd572bda, //[21] p3
+ 0x3c84fd08, //[22] p4
 };
 for (size_t i = 0; i < sizeof(cvals) / sizeof(cvals[0]); ++i) {
- for (size_t d = 0; d < vlen / sizeof(float); ++d) {
- h->dd(cvals[i]);
- }
- }
- for (size_t d = 0; d < vlen / sizeof(float); ++d) {
- h->dd(float2int(alpha));
- }
- for (size_t d = 0; d < vlen / sizeof(float); ++d) {
- h->dd(0);
+ for (size_t d = 0; d < vlen / sizeof(float); ++d) h->dd(cvals[i]);
 }
+
+ for (size_t d = 0; d < vlen / sizeof(float); ++d) h->dd(float2int(alpha_));
+ for (size_t d = 0; d < vlen / sizeof(float); ++d) h->dd(0);
 }
 
 template
@@ -537,63 +648,48 @@ void jit_uni_eltwise_injector_f32::soft_relu_prepare_table() {
 template
 void jit_uni_eltwise_injector_f32::abs_prepare_table() {
- for (size_t d = 0; d < vlen / sizeof(float); ++d) {
- h->dd(0x7fffffff);
- }
+ for (size_t d = 0; d < vlen / sizeof(float); ++d) h->dd(0x7fffffff);
 }
 
 template
 void jit_uni_eltwise_injector_f32::sqrt_prepare_table() {
- for (size_t d = 0; d < vlen / sizeof(float); ++d) {
- h->dd(0);
- }
+ for (size_t d = 0; d < vlen / sizeof(float); ++d) h->dd(0);
 }
 
 template
 void jit_uni_eltwise_injector_f32::linear_prepare_table() {
- for (size_t d = 0; d < vlen / sizeof(float); ++d) {
- h->dd(float2int(alpha));
- }
- for (size_t d = 0; d < vlen / sizeof(float); ++d) {
- h->dd(float2int(beta));
- }
+ for (size_t d = 
0; d < vlen / sizeof(float); ++d) h->dd(float2int(alpha_)); + for (size_t d = 0; d < vlen / sizeof(float); ++d) h->dd(float2int(beta_)); } template void jit_uni_eltwise_injector_f32::bounded_relu_prepare_table() { - for (size_t d = 0; d < vlen / sizeof(float); ++d) { - h->dd(float2int(alpha)); - } - for (size_t d = 0; d < vlen / sizeof(float); ++d) { - h->dd(0); - } + for (size_t d = 0; d < vlen / sizeof(float); ++d) h->dd(float2int(alpha_)); + for (size_t d = 0; d < vlen / sizeof(float); ++d) h->dd(0); } template void jit_uni_eltwise_injector_f32::clamp_prepare_table() { - for (size_t d = 0; d < vlen / sizeof(float); ++d) { - h->dd(float2int(alpha)); - } - for (size_t d = 0; d < vlen / sizeof(float); ++d) { - h->dd(float2int(beta)); - } + for (size_t d = 0; d < vlen / sizeof(float); ++d) h->dd(float2int(alpha_)); + for (size_t d = 0; d < vlen / sizeof(float); ++d) h->dd(float2int(beta_)); } template -int jit_uni_eltwise_injector_f32::aux_vecs_count(alg_kind_t elt_alg) { - switch (elt_alg) { - case alg_kind::eltwise_relu: return (alpha == 0.f) ? 0 : 2; - case alg_kind::eltwise_elu: return 4; - case alg_kind::eltwise_tanh: return 4; - case alg_kind::eltwise_square: return 0; - case alg_kind::eltwise_abs: return 0; - case alg_kind::eltwise_sqrt: return 2; - case alg_kind::eltwise_linear: return 1; - case alg_kind::eltwise_bounded_relu: return 0; - case alg_kind::eltwise_soft_relu: return 4; - case alg_kind::eltwise_logistic: return 4; - case alg_kind::eltwise_clamp: return 0; - default: assert(!"unsupported eltwise algorithm"); +int jit_uni_eltwise_injector_f32::aux_vecs_count(alg_kind_t alg_) { + switch (alg_) { + case alg_kind::eltwise_relu: return (alpha_ == 0.f) ? 0 : 2; + case alg_kind::eltwise_elu: return 4; + case alg_kind::eltwise_tanh: return 5; + case alg_kind::eltwise_square: return 0; + case alg_kind::eltwise_abs: return 0; + case alg_kind::eltwise_sqrt: return 2; + case alg_kind::eltwise_linear: return 1; + case alg_kind::eltwise_bounded_relu: return 0; + case alg_kind::eltwise_soft_relu: return 4; + case alg_kind::eltwise_logistic: return 4; + case alg_kind::eltwise_clamp: return 0; + case alg_kind::eltwise_exp: return 4; + default: assert(!"unsupported eltwise algorithm"); } return 0; @@ -602,37 +698,25 @@ int jit_uni_eltwise_injector_f32::aux_vecs_count(alg_kind_t elt_alg) { template void jit_uni_eltwise_injector_f32::compute_body(size_t start_idx, size_t end_idx) { - h->mov(p_table, l_table); - + using namespace alg_kind; for (size_t idx = start_idx; idx < end_idx; idx++) { - switch (elt_alg) { - case alg_kind::eltwise_relu: - if (alpha == 0.f) - relu_zero_ns_compute_vector(Vmm(idx)); - else - relu_compute_vector(Vmm(idx)); - break; - case alg_kind::eltwise_elu: - elu_compute_vector(Vmm(idx)); break; - case alg_kind::eltwise_tanh: - tanh_compute_vector(Vmm(idx)); break; - case alg_kind::eltwise_square: - square_compute_vector(Vmm(idx)); break; - case alg_kind::eltwise_abs: - abs_compute_vector(Vmm(idx)); break; - case alg_kind::eltwise_sqrt: - sqrt_compute_vector(Vmm(idx)); break; - case alg_kind::eltwise_linear: - linear_compute_vector(Vmm(idx)); break; - case alg_kind::eltwise_bounded_relu: - bounded_relu_compute_vector(Vmm(idx)); break; - case alg_kind::eltwise_soft_relu: - soft_relu_compute_vector(Vmm(idx)); break; - case alg_kind::eltwise_logistic: - logistic_compute_vector(Vmm(idx)); break; - case alg_kind::eltwise_clamp: - clamp_compute_vector(Vmm(idx)); break; - default: assert(!"unsupported eltwise algorithm"); + switch (alg_) { + case eltwise_relu: + if (alpha_ == 
0.f) relu_zero_ns_compute_vector(Vmm(idx)); + else relu_compute_vector(Vmm(idx)); + break; + case eltwise_elu: elu_compute_vector(Vmm(idx)); break; + case eltwise_tanh: tanh_compute_vector(Vmm(idx)); break; + case eltwise_square: square_compute_vector(Vmm(idx)); break; + case eltwise_abs: abs_compute_vector(Vmm(idx)); break; + case eltwise_sqrt: sqrt_compute_vector(Vmm(idx)); break; + case eltwise_linear: linear_compute_vector(Vmm(idx)); break; + case eltwise_bounded_relu: bounded_relu_compute_vector(Vmm(idx)); break; + case eltwise_soft_relu: soft_relu_compute_vector(Vmm(idx)); break; + case eltwise_logistic: logistic_compute_vector(Vmm(idx)); break; + case eltwise_clamp: clamp_compute_vector(Vmm(idx)); break; + case eltwise_exp: exp_compute_vector(Vmm(idx)); break; + default: assert(!"unsupported eltwise algorithm"); } } } @@ -640,9 +724,7 @@ void jit_uni_eltwise_injector_f32::compute_body(size_t start_idx, template void jit_uni_eltwise_injector_f32::compute_vector_range(size_t start_idx, size_t end_idx) { - assert(start_idx < vecs_count); - assert(end_idx <= vecs_count); - assert(start_idx < end_idx); + assert(start_idx < end_idx && end_idx <= vecs_count); injector_preamble(start_idx, end_idx); compute_body(start_idx_tail, end_idx); @@ -652,38 +734,30 @@ void jit_uni_eltwise_injector_f32::compute_vector_range(size_t start_idx, } template -void jit_uni_eltwise_injector_f32::compute_vector(size_t idx) { - compute_vector_range(idx, idx + 1); -} +void jit_uni_eltwise_injector_f32::prepare_table(bool gen_table) { + using namespace alg_kind; -template -void jit_uni_eltwise_injector_f32::prepare_table() { h->align(64); h->L(l_table); - switch (elt_alg) { - case alg_kind::eltwise_relu: - relu_prepare_table(); break; - case alg_kind::eltwise_elu: - case alg_kind::eltwise_tanh: - case alg_kind::eltwise_logistic: + if (gen_table) { + switch (alg_) { + case eltwise_relu: relu_prepare_table(); break; + case eltwise_elu: + case eltwise_tanh: + case eltwise_logistic: + case eltwise_exp: elu_prepare_table(); break; - case alg_kind::eltwise_soft_relu: - soft_relu_prepare_table(); break; - case alg_kind::eltwise_abs: - abs_prepare_table(); break; - case alg_kind::eltwise_sqrt: - sqrt_prepare_table(); break; - case alg_kind::eltwise_linear: - linear_prepare_table(); break; - case alg_kind::eltwise_bounded_relu: - bounded_relu_prepare_table(); break; - case alg_kind::eltwise_square: - break; - case alg_kind::eltwise_clamp: - clamp_prepare_table(); break; + case eltwise_soft_relu: soft_relu_prepare_table(); break; + case eltwise_abs: abs_prepare_table(); break; + case eltwise_sqrt: sqrt_prepare_table(); break; + case eltwise_linear: linear_prepare_table(); break; + case eltwise_bounded_relu: bounded_relu_prepare_table(); break; + case eltwise_square: break; + case eltwise_clamp: clamp_prepare_table(); break; default: assert(!"unsupported eltwise algorithm"); } + } } template struct jit_uni_eltwise_injector_f32; @@ -861,27 +935,27 @@ struct jit_uni_kernel_fwd_f32: public jit_uni_eltwise_kernel_f32, jit_uni_kernel_fwd_f32(const eltwise_desc_t &desc) : jit_uni_eltwise_kernel_f32(desc), jit_generator() { - eltwise_injector = new jit_uni_eltwise_injector_f32(this, - desc.alg_kind, desc.alpha, desc.beta, false, 9, 1); + eltwise_injector_ = new jit_uni_eltwise_injector_f32(this, + desc.alg_kind, desc.alpha, desc.beta, false, r9, Opmask(1)); using namespace alg_kind; assert(is_bwd() == false); assert(utils::one_of(desc.alg_kind, eltwise_tanh, eltwise_elu, eltwise_square, eltwise_abs, eltwise_sqrt, eltwise_linear, - 
eltwise_bounded_relu, eltwise_soft_relu, eltwise_logistic)); + eltwise_bounded_relu, eltwise_soft_relu, eltwise_logistic, + eltwise_clamp, eltwise_exp)); preamble(); - Label vectorized_loop_start; - Label reminder_loop_start; - Label vectorized_loop_end; - Label reminder_loop_end; - Reg64 param = abi_param1; mov(reg_from, ptr[param + GET_OFF(from)]); mov(reg_to, ptr[param + GET_OFF(to)]); mov(reg_work_amount, ptr[param + GET_OFF(work_amount)]); + eltwise_injector_->load_table_addr(); + + Label reminder_loop_start, reminder_loop_end; + Label vectorized_loop_start, vectorized_loop_end; cmp(reg_work_amount, simd_w); jl(reminder_loop_start, T_NEAR); @@ -889,7 +963,7 @@ struct jit_uni_kernel_fwd_f32: public jit_uni_eltwise_kernel_f32, L(vectorized_loop_start); uni_vmovups(vmm_src, ptr[reg_from]); - eltwise_injector->compute_vector(vmm_src.getIdx()); + eltwise_injector_->compute_vector(vmm_src.getIdx()); uni_vmovups(ptr[reg_to], vmm_src); add(reg_from, vlen); @@ -907,7 +981,7 @@ struct jit_uni_kernel_fwd_f32: public jit_uni_eltwise_kernel_f32, jle(reminder_loop_end, T_NEAR); movss(xmm_src, ptr[reg_from]); - eltwise_injector->compute_vector(xmm_src.getIdx()); + eltwise_injector_->compute_vector(xmm_src.getIdx()); movss(ptr[reg_to], xmm_src); add(reg_from, sizeof(float)); @@ -920,14 +994,12 @@ struct jit_uni_kernel_fwd_f32: public jit_uni_eltwise_kernel_f32, postamble(); - eltwise_injector->prepare_table(); + eltwise_injector_->prepare_table(); ker_ = (decltype(ker_))this->getCode(); } - ~jit_uni_kernel_fwd_f32() { - delete eltwise_injector; - } + ~jit_uni_kernel_fwd_f32() { delete eltwise_injector_; } private: using Vmm = typename utils::conditional3* eltwise_injector; + jit_uni_eltwise_injector_f32 *eltwise_injector_; }; } /* namespace */ @@ -959,23 +1031,23 @@ status_t jit_uni_eltwise_fwd_t::pd_t::init() { prop_kind::forward_inference) && utils::everyone_is(data_type::f32, desc()->data_desc.data_type) && !has_zero_dim_memory() - && IMPLICATION(isa > avx2, utils::one_of(desc()->alg_kind, - eltwise_relu, eltwise_elu)) - && IMPLICATION(isa == sse42 || isa == avx2, utils::one_of( - desc()->alg_kind, eltwise_relu, eltwise_tanh, eltwise_elu, - eltwise_square, eltwise_abs, eltwise_sqrt, eltwise_linear, - eltwise_bounded_relu, eltwise_soft_relu, eltwise_logistic)) - && memory_desc_wrapper(src_pd()).is_dense() + && utils::one_of(desc()->alg_kind, eltwise_relu, eltwise_tanh, + eltwise_elu, eltwise_square, eltwise_abs, eltwise_sqrt, + eltwise_linear, eltwise_bounded_relu, eltwise_soft_relu, + eltwise_logistic, eltwise_clamp, eltwise_exp) + && memory_desc_wrapper(src_pd()).is_dense(true) + && IMPLICATION(!memory_desc_wrapper(src_pd()).is_dense(false), + math::eltwise_fwd_preserves_zero(desc()->alg_kind, true)) && attr()->has_default_values(); return ok ? 
status::success : status::unimplemented; } template -jit_uni_eltwise_fwd_t::jit_uni_eltwise_fwd_t(const pd_t *pd, +jit_uni_eltwise_fwd_t::jit_uni_eltwise_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), kernel_(nullptr) { - const auto &desc = *conf_.desc(); + : cpu_primitive_t(apd, inputs, outputs), kernel_(nullptr) { + const auto &desc = *pd()->desc(); switch (desc.alg_kind) { case alg_kind::eltwise_relu: kernel_ = new jit_uni_relu_kernel_f32(desc); break; @@ -989,13 +1061,13 @@ jit_uni_eltwise_fwd_t::~jit_uni_eltwise_fwd_t() { delete kernel_; } template -void jit_uni_eltwise_fwd_t::execute_forward() { +void jit_uni_eltwise_fwd_t::execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); auto dst = reinterpret_cast(this->memory(0)); - const memory_desc_wrapper data_d(conf_.src_pd()); + const memory_desc_wrapper data_d(pd()->src_pd()); - const size_t nelems = data_d.nelems(); + const size_t nelems = data_d.nelems(true); src += data_d.blocking_desc().offset_padding; dst += data_d.blocking_desc().offset_padding; @@ -1037,10 +1109,10 @@ status_t jit_uni_eltwise_bwd_t::pd_t::init() { } template -jit_uni_eltwise_bwd_t::jit_uni_eltwise_bwd_t(const pd_t *pd, +jit_uni_eltwise_bwd_t::jit_uni_eltwise_bwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), kernel_(nullptr) { - const auto &desc = *conf_.desc(); + : cpu_primitive_t(apd, inputs, outputs), kernel_(nullptr) { + const auto &desc = *pd()->desc(); switch (desc.alg_kind) { case alg_kind::eltwise_relu: kernel_ = new jit_uni_relu_kernel_f32(desc); break; @@ -1053,13 +1125,13 @@ jit_uni_eltwise_bwd_t::~jit_uni_eltwise_bwd_t() { delete kernel_; } template -void jit_uni_eltwise_bwd_t::execute_backward() { +void jit_uni_eltwise_bwd_t::execute_backward() const { auto src = reinterpret_cast(this->input_memory(0)); auto diff_dst = reinterpret_cast(this->input_memory(1)); auto diff_src = reinterpret_cast(this->memory(0)); - const memory_desc_wrapper data_d(conf_.src_pd()); - const memory_desc_wrapper diff_data_d(conf_.diff_src_pd()); + const memory_desc_wrapper data_d(pd()->src_pd()); + const memory_desc_wrapper diff_data_d(pd()->diff_src_pd()); const size_t nelems = data_d.nelems(); diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_eltwise.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_eltwise.hpp index 063556d..1acc239 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_eltwise.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_eltwise.hpp @@ -18,7 +18,6 @@ #define CPU_JIT_UNI_ELTWISE_HPP #include -#include #include "c_types_map.hpp" #include "cpu_eltwise_pd.hpp" @@ -33,45 +32,57 @@ namespace cpu { template struct jit_uni_eltwise_injector_f32 { - jit_uni_eltwise_injector_f32(jit_generator* host, alg_kind_t elt_alg_, - float alpha_, float beta_, bool save_vecs_state_ = true, - int table_reg_idx_ = 0, int opmask_idx_ = 1) { + using Vmm = typename utils::conditional3::type; + + jit_uni_eltwise_injector_f32(jit_generator *host, alg_kind_t alg, + float alpha, float beta, bool save_state = true, + Xbyak::Reg64 p_table = Xbyak::util::rax, + Xbyak::Opmask k_mask = Xbyak::Opmask(1)) + : alg_(alg), alpha_(alpha), beta_(beta), h(host) + , save_state_(save_state), p_table(p_table), k_mask(k_mask) + { + using namespace alg_kind; assert(utils::one_of(isa, sse42, avx2, avx512_common)); - assert(utils::one_of(elt_alg_, 
alg_kind::eltwise_relu, - alg_kind::eltwise_tanh, alg_kind::eltwise_elu, - alg_kind::eltwise_square, alg_kind::eltwise_abs, - alg_kind::eltwise_sqrt, alg_kind::eltwise_linear, - alg_kind::eltwise_bounded_relu, alg_kind::eltwise_soft_relu, - alg_kind::eltwise_logistic, alg_kind::eltwise_clamp)); - - h = host; - elt_alg = elt_alg_; - alpha = alpha_; - beta = beta_; - save_vecs_state = save_vecs_state_; - table_reg_idx = table_reg_idx_; - opmask_idx = opmask_idx_; + assert(utils::one_of(alg_, eltwise_relu, eltwise_tanh, eltwise_elu, + eltwise_square, eltwise_abs, eltwise_sqrt, eltwise_linear, + eltwise_bounded_relu, eltwise_soft_relu, eltwise_logistic, + eltwise_clamp, eltwise_exp)); } + // note that eltwise.scale is ignored + jit_uni_eltwise_injector_f32(jit_generator *host, + const post_ops_t::entry_t::eltwise_t &eltwise, + bool save_state = true, Xbyak::Reg64 p_table = Xbyak::util::rax, + Xbyak::Opmask k_mask = Xbyak::Opmask(1)) + : jit_uni_eltwise_injector_f32(host, eltwise.alg, eltwise.alpha, + eltwise.beta, save_state, p_table, k_mask) {} + void compute_vector_range(size_t start_idx, size_t end_idx); - void compute_vector(size_t idx); - void prepare_table(); + void compute_vector(size_t idx) { compute_vector_range(idx, idx + 1); } + void prepare_table(bool gen_table=true); + void load_table_addr() { h->mov(p_table, l_table); } -private: - jit_generator* h; + const alg_kind_t alg_; + const float alpha_; + const float beta_; - using Vmm = typename utils::conditional3::type; + jit_generator * const h; - size_t vlen = cpu_isa_traits::vlen; + const bool save_state_; + const Xbyak::Reg64 p_table; + const Xbyak::Opmask k_mask; + Xbyak::Label l_table; - alg_kind_t elt_alg; - float alpha; - float beta; +private: + // if only the injector was inherited from jit_generator... 
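Since it is not, a host kernel drives the injector explicitly. A minimal usage sketch (the kernel name is hypothetical; it mirrors the call sequence of jit_uni_kernel_fwd_f32 in the .cpp above and assumes the avx2 instantiation):

    struct my_eltwise_kernel : public jit_generator {   // hypothetical host
        jit_uni_eltwise_injector_f32<avx2> inj;
        my_eltwise_kernel()
            : inj(this, alg_kind::eltwise_relu, /*alpha=*/0.f, /*beta=*/0.f) {
            preamble();
            inj.load_table_addr();   // p_table <- l_table, once per kernel
            // ... load a vector of inputs into Vmm(0) ...
            inj.compute_vector(0);   // apply the eltwise op to Vmm(0) in place
            // ... store Vmm(0) to memory ...
            postamble();
            inj.prepare_table();     // emit the constant table after the code
        }
    };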
+ enum { + _cmp_le_os = jit_generator::_cmp_le_os, + _cmp_nle_us = jit_generator::_cmp_nle_us, + _op_floor = jit_generator::_op_floor, + }; - bool save_vecs_state; - int table_reg_idx; - int opmask_idx; + size_t vlen = cpu_isa_traits::vlen; const static size_t preserved_vecs_max = 5; @@ -81,20 +92,17 @@ private: size_t preserved_vec_idxs[preserved_vecs_max] = {0}; size_t start_idx_tail = 0; - Vmm vmm_mask, vmm_aux0, vmm_aux1, vmm_aux2, vmm_aux3; - - Xbyak::Reg64 p_table; - Xbyak::Opmask k_mask; - Xbyak::Label l_table; + Vmm vmm_mask, vmm_aux0, vmm_aux1, vmm_aux2, vmm_aux3, vmm_aux4; - int aux_vecs_count(alg_kind_t elt_alg); + Xbyak::Address table_val(int index) + { return h->ptr[p_table + index * vlen]; } + int aux_vecs_count(alg_kind_t alg); void compute_body(size_t start_idx, size_t end_idx); void injector_preamble(size_t start_idx, size_t end_idx); void injector_preamble_tail(size_t start_idx); void injector_postamble(); void assign_regs(); - bool is_free_vec(size_t idx); void exp_compute_vector(const Vmm &vmm_src); void relu_compute_vector(const Vmm &vmm_src); @@ -137,21 +145,21 @@ struct jit_uni_eltwise_fwd_t : public cpu_primitive_t { virtual status_t init() override; }; - jit_uni_eltwise_fwd_t(const pd_t *pd, const input_vector &inputs, + jit_uni_eltwise_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs); ~jit_uni_eltwise_fwd_t(); typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - pd_t conf_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } jit_uni_eltwise_kernel_f32 *kernel_; }; @@ -170,21 +178,21 @@ struct jit_uni_eltwise_bwd_t : public cpu_primitive_t { virtual status_t init() override; }; - jit_uni_eltwise_bwd_t(const pd_t *pd, const input_vector &inputs, + jit_uni_eltwise_bwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs); ~jit_uni_eltwise_bwd_t(); typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) + virtual void execute(event_t *e) const { execute_backward(); e->set_state(event_t::ready); } private: - void execute_backward(); - pd_t conf_; + void execute_backward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } jit_uni_eltwise_kernel_f32 *kernel_; }; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_i8i8_pooling.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_i8i8_pooling.cpp index ccc1c34..8f93163 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_i8i8_pooling.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_i8i8_pooling.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018 Intel Corporation +* Copyright 2017-2018 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. 
*******************************************************************************/
+#include "jit_uni_i8i8_pooling.hpp"
+
+#include
+
 #include "mkldnn_types.h"
@@ -23,7 +25,6 @@
 #include "jit_generator.hpp"
 
-#include "jit_uni_i8i8_pooling.hpp"
 
 namespace mkldnn {
 namespace impl {
@@ -37,20 +38,34 @@ using namespace mkldnn::impl::utils;
 using namespace mkldnn::impl::types;
 using namespace alg_kind;
 
-struct call_params_t {
- const char *src_i8;
- const char *dst_i8;
- size_t kw_range;
- size_t kh_range;
- float idivider;
-};
-
 template
-struct jit_uni_i8i8_pool_fwd_ker_t : public jit_generator {
- DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_i8i8_pool_fwd_ker_t)
-
+struct jit_uni_i8i8_pooling_fwd_ker_t: public jit_generator {
+ DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_i8i8_pooling_fwd_ker_t)
+
+ struct call_params_t {
+ const char *src_i8;
+ const char *dst_i8;
+ size_t kw_range;
+ size_t kh_range;
+ float idivider;
+ };
+
+ using Vmm = typename cpu_isa_traits::Vmm;
+ Xmm xreg(int idx) const { return Xmm(idx); }
+ Ymm yreg(int idx) const { return Ymm(xreg(idx).getIdx()); }
+ Vmm vreg(int idx) const { return Vmm(xreg(idx).getIdx()); }
+
+ // Rounding modes for avx2
+ enum:uint8_t { rnd_op_nearest = 0x0 };
+
+ // In case of avx2 with data type i8 we need to use
+ // maskmovdqu instruction which has its destination hardcoded in rdi.
+ // Windows ABI: abi_param1 is rcx - nothing else to do
+ // Unix ABI: abi_param1 is rdi - copy it to rcx and use it as abi_param1
+ Reg64 reg_param = rcx; // Our "unified abi_param1"
 Reg64 reg_ptr_src_i8 = r8;
 Reg64 reg_ptr_dst_i8 = r9;
+ Reg64 reg_ptr_maskmovdqu_dst = rdi; // store destination - must be rdi
 
 Reg64 ki = r10;
 Reg64 kj = r11;
@@ -62,73 +77,70 @@ struct jit_uni_i8i8_pool_fwd_ker_t : public jit_generator {
 Reg64 aux_reg_src_w = rbx;
 
 Reg64 reg_tmp = rdx;
- Reg64 reg_src_64 = r15;
- Reg32 reg_src_32 = r15d;
- Reg8 reg_src_8 = r15b;
 
- size_t sizeof_src_dt() const { return data_type_size(jpp.src_dt); }
- size_t sizeof_dst_dt() const { return data_type_size(jpp.dst_dt); }
+ Reg64 reg_mask = r15;
 
- using Vmm = typename utils::conditional3::type;
+ Opmask k_cmp_mask = Opmask(7);
 
- Xmm xmm_tmp = Xmm(0);
- Vmm vreg_tmp = Vmm(14);
- Vmm vreg_zeros = Vmm(15);
-
- /* max pooling */
- Vmm vmm_src(int jj, int ii) {
- return Vmm(2*jj + ii);
+ Opmask mask(int idx) {
+ return Opmask(6 - idx);
 }
 
- Xmm xmm_src(int jj) {
- return Xmm(2*jj);
- }
+ // ref to any of XYZ-regs via xreg/yreg/vreg functions
+ Xmm xmm_tmp = xreg(0); // temp to init vreg_tmp
+ Vmm vreg_tmp = vreg(0); // max pooling : holds minimum values for data_type
+ Vmm vreg_zeros = vreg(1);
 
- Vmm vmm_dst(int jj, int ii) {
- return Vmm(2*jj + ii + 2 * jpp.ur_c);
- }
+ // only in case of isa == avx2
+ Vmm vreg_mask = vreg(2); // full byte-mask
+ Xmm xreg_mask_lo = xreg(2); // low 128-bits part of byte-mask (alias for xmm part of vreg_mask)
+ Xmm xreg_mask_hi = xreg(3); // "max" - high 128-bits part of byte-mask (stored separately)
+ Xmm xreg_mask_q = xreg(3); // "avg" - 1/4 part of the mask for s8/u8 operations
+ Vmm vreg_mask_q = vreg(3); // "avg" - 1/4 part for non-zero tails
 
- Xmm xmm_dst(int jj) {
- return Xmm(2*jj + 2 * jpp.ur_c);
- }
+ enum:int {vidx_base = isa == avx2 ? 
4 : 2}; + Vmm base_vr(int idx) const { return vreg(vidx_base + idx); } - /* avg pooling */ - Vmm vmm_src_s32(int jj, int ii) { - return Vmm(2*jj + ii); - } - - Xmm xmm_src_s32(int jj, int ii) { - return Xmm(2*jj + ii); - } - - Vmm vmm_dst_s32(int jj, int ii) { - return Vmm(2*jj + ii + 2 * jpp.ur_c); - } - - Ymm ymm_dst_s32(int jj, int ii) { - return Ymm(2*jj + ii + 2 * jpp.ur_c); - } + size_t sizeof_src_dt() const { return data_type_size(jpp.src_dt); } + size_t sizeof_dst_dt() const { return data_type_size(jpp.dst_dt); } - Xmm xmm_dst_s32(int jj, int ii) { - return Xmm(2*jj + ii + 2 * jpp.ur_c); - } + /* max pooling */ + Vmm vreg_src(int idx) const { return base_vr(idx); } // [0 .. ur_c-1] + Vmm vreg_dst(int idx) const { return base_vr(jpp.ur_c + idx); } // [ur_c .. 2*ur_c-1] - Vmm vmm_dst_f32(int jj, int ii) { - return Vmm(2*jj + ii + 4 * jpp.ur_c); - } + /* avg pooling */ + // s32 used for processing of s8/u8 data + // thus we need to take into account ratio of sizes s32/i8 = 4 + static constexpr data_type_t avg_proc_dt = data_type::s32; + enum:int { + s32_to_i8_ratio = sizeof(typename prec_traits::type) + / sizeof(typename prec_traits::type), + max_num_ll = s32_to_i8_ratio + }; + Vmm vreg_src_s32(int jj, int ll) { return base_vr(3*max_num_ll*jj + ll + 0*max_num_ll); } // ll: 0..4 [0..3] + Vmm vreg_dst_s32(int jj, int ll) { return base_vr(3*max_num_ll*jj + ll + 1*max_num_ll); } // ll: 0..4 [4..7] + Vmm vreg_dst_f32(int jj, int ll) { return base_vr(3*max_num_ll*jj + ll + 2*max_num_ll); } // ll: 0..4 [8..11] void (*ker_)(const call_params_t *); jit_pool_conf_t jpp; void init_tmp_reg(); + void init_mask(); + + void load_vreg_mask_q(int ll) {}; + + void load_src_max_op(int jj, int ll, size_t offset, bool masked, uint64_t msk); + void load_src_avg_op(int jj, int ll, size_t offset, bool masked, uint64_t msk); + void load_src(int jj, int ll, int c_tail); - void load_src(int jj, int c_step); - void store_dst(int jj, int c_step); + void store_dst_max_op(int jj, int ll, size_t offset, bool masked, uint64_t msk); + void store_dst_avg_op(int jj, int ll, size_t offset, bool masked, uint64_t msk); + void store_dst(int jj, int ll, int c_tail); - void compute_avg_step(int ur_c, int c_step); - void compute_max_step(int ur_c, int c_step); - void compute_step(int ur_c, int c_step); + void compute_avg_step(int ur_c, int c_tail); + void compute_max_op(const int jj); + void compute_max_step(int ur_c, int c_tail); + void compute_step(int ur_c, int c_tail); void compute_c_block(); void generate(); @@ -137,7 +149,7 @@ struct jit_uni_i8i8_pool_fwd_ker_t : public jit_generator { const pooling_desc_t &pd, const memory_desc_wrapper &src_d, const memory_desc_wrapper &dst_d); - jit_uni_i8i8_pool_fwd_ker_t(const jit_pool_conf_t &jpp_) + jit_uni_i8i8_pooling_fwd_ker_t(const jit_pool_conf_t &jpp_) : jpp(jpp_) { generate(); ker_ = reinterpret_cast(const_cast( @@ -145,179 +157,376 @@ struct jit_uni_i8i8_pool_fwd_ker_t : public jit_generator { } }; +template <> +void jit_uni_i8i8_pooling_fwd_ker_t::load_vreg_mask_q(int ll) { + + // extract ll-th part of mask (ll-th QWORD) + vpblendd(vreg_mask_q, vreg_zeros, vreg_mask, 0x3 << ll); // 0x3 - mask for 2 x DWORD + + // Move mask from ll-th pos to 0-th pos + if (ll>0) + vpermq(vreg_mask_q, vreg_mask_q, ll); +}; + +template <> +void jit_uni_i8i8_pooling_fwd_ker_t::load_src_max_op(int jj, int ll, + size_t offset, bool masked, uint64_t msk) { + using namespace data_type; + + if (masked) { + if (jpp.src_dt == s32) { + vpblendd(vreg_src(jj), vreg_tmp, ptr[aux_reg_src_w + offset], 
static_cast(msk)); + } else { + vpblendvb(vreg_src(jj), vreg_tmp, ptr[aux_reg_src_w + offset], vreg_mask); + } + } else + vmovups(vreg_src(jj), ptr[aux_reg_src_w + offset]); +}; + +template <> +void jit_uni_i8i8_pooling_fwd_ker_t::load_src_max_op(int jj, int ll, + size_t offset, bool masked, uint64_t msk) { + using namespace data_type; + + if (masked) { + if (jpp.src_dt == s32) + vmovups(vreg_src(jj) | mask(0), ptr[aux_reg_src_w + offset]); + else + vmovdqu8(vreg_src(jj) | mask(0), ptr[aux_reg_src_w + offset]); + } else + vmovups(vreg_src(jj), ptr[aux_reg_src_w + offset]); +}; + +template <> +void jit_uni_i8i8_pooling_fwd_ker_t::load_src_avg_op(int jj, int ll, + size_t offset, bool masked, uint64_t msk) { + using namespace data_type; + + // Don't generate useless code + if (masked && !msk) + return; + + auto load_i8 = [&](bool is_signed, const Vmm& vr_src) { + + // Need to use mask of tail? + if (masked) { + + // load ll-th part of mask into vreg_mask_q + load_vreg_mask_q(ll); + + // Load by mask from mem into register vr_src + vpblendvb(vr_src, vreg_zeros, ptr[aux_reg_src_w + offset], vreg_mask_q); + + // Conversion s8/u8 -> s32 + if (is_signed) + vpmovsxbd(vr_src, vr_src); + else + vpmovzxbd(vr_src, vr_src); + } else { + + // Load from mem into vr_src with conversion + if (is_signed) + vpmovsxbd(vr_src, ptr[aux_reg_src_w + offset]); + else + vpmovzxbd(vr_src, ptr[aux_reg_src_w + offset]); + } + }; + + switch (jpp.src_dt) { + case s32: + if (masked) + vpblendd(vreg_src_s32(jj, ll), vreg_zeros, ptr[aux_reg_src_w + offset], + static_cast(msk)); + else + vmovups(vreg_src_s32(jj, ll), ptr[aux_reg_src_w + offset]); + break; + case s8: + load_i8(true, vreg_src_s32(jj, ll)); + break; + case u8: + load_i8(false, vreg_src_s32(jj, ll)); + break; + default: assert(!"unsupported src data type"); + } +}; + +template <> +void jit_uni_i8i8_pooling_fwd_ker_t::load_src_avg_op(int jj, int ll, + size_t offset, bool masked, uint64_t msk) { + using namespace data_type; + + // Don't generate useless code + if (masked && !msk) + return; + + const Vmm& vr_src = masked ? + vreg_src_s32(jj, ll) | mask(ll) : + vreg_src_s32(jj, ll); + + switch (jpp.src_dt) { + case s32: + vmovups(vr_src, ptr[aux_reg_src_w + offset]); + break; + case s8: + vpmovsxbd(vr_src, ptr[aux_reg_src_w + offset]); + break; + case u8: + vpmovzxbd(vr_src, ptr[aux_reg_src_w + offset]); + break; + default: assert(!"unsupported src data type"); + } +}; + template -void jit_uni_i8i8_pool_fwd_ker_t::load_src(int jj, int c_step) { +void jit_uni_i8i8_pooling_fwd_ker_t::load_src(int jj, int ll, int c_tail) { using namespace data_type; - int repeats = isa == sse42 && c_step != 1 ? 
2 : 1; + int c_block = jpp.c_block; + int ur_c = jpp.ur_c; + switch (jpp.alg) { case pooling_max: { - auto offset = jj*c_step*sizeof_src_dt(); - if (c_step == jpp.c_block) { - for (int ii = 0; ii < repeats; ii++) - uni_vmovups(vmm_src(jj, ii), ptr[aux_reg_src_w + offset + (jpp.c_block / 2) * ii * sizeof_src_dt()]); - } else if (c_step == 1) { - if (jpp.src_dt == s32) { - movsd(xmm_src(jj), ptr[aux_reg_src_w + offset]); - } else { - mov(reg_src_8, ptr[aux_reg_src_w + offset]); - movq(xmm_src(jj), reg_src_64); - } - } + auto offset = jj*c_block*sizeof_src_dt(); + bool masked = jj == ur_c - 1 && c_tail; + load_src_max_op(jj, ll, offset, masked, jpp.tail[0]); break; } case pooling_avg_include_padding: case pooling_avg_exclude_padding: { - auto offset = jj*c_step*sizeof_src_dt(); - switch (jpp.src_dt) { - case s32: - if (c_step == jpp.c_block) { - for (int ii = 0; ii < repeats; ii++) - uni_vmovups(vmm_src_s32(jj, ii), ptr[aux_reg_src_w + offset + (jpp.c_block / 2) * ii * sizeof_src_dt()]); - } else if (c_step == 1) { - movsd(xmm_src_s32(jj, 0), ptr[aux_reg_src_w + offset]); - } - break; - case s8: - if (c_step == jpp.c_block) { - for (int ii = 0; ii < repeats; ii++) { - if (isa == sse42) - movd(xmm_src_s32(jj, ii), ptr[aux_reg_src_w + offset + (jpp.c_block / 2) * ii * sizeof_src_dt()]); - else - movq(xmm_src_s32(jj, ii), ptr[aux_reg_src_w + offset + (jpp.c_block / 2) * ii * sizeof_src_dt()]); - - uni_vpmovsxbd(vmm_src_s32(jj, ii), xmm_src_s32(jj, ii)); - } - } else if (c_step == 1) { - movsx(reg_src_32, ptr[aux_reg_src_w + offset]); - movq(xmm_src_s32(jj, 0), reg_src_64); - } - break; - case u8: - if (c_step == jpp.c_block) { - for (int ii = 0; ii < repeats; ii++) { - if (isa == sse42) - movd(xmm_src_s32(jj, ii), ptr[aux_reg_src_w + offset + (jpp.c_block / 2) * ii * sizeof_src_dt()]); - else - movq(xmm_src_s32(jj, ii), ptr[aux_reg_src_w + offset + (jpp.c_block / 2) * ii * sizeof_src_dt()]); - - uni_vpmovzxbd(vmm_src_s32(jj, ii), xmm_src_s32(jj, ii)); - } - } else if (c_step == 1) { - movzx(reg_src_32, ptr[aux_reg_src_w + offset]); - movq(xmm_src_s32(jj, 0), reg_src_64); - } - break; - default: assert(!"unsupported src data type"); - } + auto offset = (ll*(c_block/max_num_ll) + jj*c_block)*sizeof_src_dt(); + bool masked = jj == ur_c - 1 && c_tail; + load_src_avg_op(jj, ll, offset, masked, jpp.tail[ll]); break; } default: assert(!"unsupported algorithm"); } } +template <> +void jit_uni_i8i8_pooling_fwd_ker_t::store_dst_max_op(int jj, int ll, + size_t offset, bool masked, uint64_t msk) { + using namespace data_type; + + int c_block = jpp.c_block; + + if (masked) { + switch (jpp.src_dt) { + case s32: + vpmaskmovd(ptr[reg_ptr_dst_i8 + offset], vreg_mask, vreg_dst(jj)); + break; + case s8: + case u8: { + // Store low half by mask (bytes 0...15) + lea(reg_ptr_maskmovdqu_dst, ptr[reg_ptr_dst_i8 + offset]); + maskmovdqu(vreg_dst(jj), xreg_mask_lo); + + // Do we need to store high half (bytes 16...31) ? 
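A worked example of the check that follows (illustrative values, not part of the kernel):

    #include <cstdint>

    int main() {
        const int c_block = 32;                  // avx2: 32 s8/u8 lanes per Ymm
        const uint64_t msk = (1ULL << 20) - 1;   // tail of 20 valid channels
        const uint64_t low_mask = (1ULL << (c_block / 2)) - 1;  // bytes 0..15
        // channels 16..19 live in the high xmm half, so a second
        // maskmovdqu with xreg_mask_hi is required
        const bool store_high = (msk & ~low_mask) != 0;  // true here
        return store_high ? 0 : 1;
    }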
+ const uint64_t low_mask = (1ULL << (c_block/2))-1;
+ if (msk & ~low_mask) {
+ vextracti128(Xmm(vreg_dst(jj).getIdx()), vreg_dst(jj), 1);
+ add(reg_ptr_maskmovdqu_dst, c_block / 2);
+ maskmovdqu(vreg_dst(jj), xreg_mask_hi);
+ }
+ } break;
+ default: assert(!"unsupported src data type");
+ }
+ } else
+ vmovups(ptr[reg_ptr_dst_i8 + offset], vreg_dst(jj));
+}
+
+template <>
+void jit_uni_i8i8_pooling_fwd_ker_t::store_dst_max_op(int jj, int ll,
+ size_t offset, bool masked, uint64_t msk) {
+ using namespace data_type;
+
+ if (masked) {
+ switch (jpp.src_dt) {
+ case s32:
+ vmovups(ptr[reg_ptr_dst_i8 + offset], vreg_dst(jj) | mask(0));
+ break;
+ case s8:
+ case u8:
+ vmovdqu8(ptr[reg_ptr_dst_i8 + offset], vreg_dst(jj) | mask(0));
+ break;
+ default: assert(!"unsupported src data type");
+ }
+ } else
+ vmovups(ptr[reg_ptr_dst_i8 + offset], vreg_dst(jj));
+}
+
+template <>
+void jit_uni_i8i8_pooling_fwd_ker_t::store_dst_avg_op(int jj, int ll,
+ size_t offset, bool masked, uint64_t msk){
+ using namespace data_type;
+
+ // Don't generate useless code
+ if (masked && !msk)
+ return;
+
+ auto s32_to_i8 = [&](bool is_signed, const Vmm& vr_dst) {
+
+ // conversion: s32 -> s16/u16 : {8 x s32}{8 x 0} -> {16 x s16/u16}
+ // Result QWORDs (qw0, qw1) permuted: {qw0, 0, qw1, 0}
+ if (is_signed)
+ vpackssdw(vr_dst, vr_dst, vreg_zeros);
+ else
+ vpackusdw(vr_dst, vr_dst, vreg_zeros);
+
+ // Permute qwords to restore original order
+ // {qw0, 0, qw1, 0} -> {qw0, qw1, 0, 0}
+ vpermq(vr_dst, vr_dst, 0x58);
+
+ // conversion: s16/u16 -> s8/u8 : {16 x s16/u16}{16 x 0} -> {32 x s8/u8}
+ // Target QWORD qw = {8 x s8/u8} has proper position: {qw, xx, xx, xx}
+ if (is_signed)
+ vpacksswb(vr_dst, vr_dst, vreg_zeros);
+ else
+ vpackuswb(vr_dst, vr_dst, vreg_zeros);
+
+ };
+
+ auto store_i8 = [&](bool is_signed, bool is_masked, const Vmm& vr_dst) {
+
+ // Conversion s32 -> s8/u8
+ s32_to_i8(is_signed, vr_dst);
+
+ // Need to use mask of tail?
+ if (is_masked) {
+ // load ll-th part of mask into vreg_mask_q
+ load_vreg_mask_q(ll);
+ }
+
+ // store 8 bytes
+ lea(reg_ptr_maskmovdqu_dst, ptr[reg_ptr_dst_i8 + offset]);
+ maskmovdqu(vr_dst, xreg_mask_q);
+ };
+
+ switch (jpp.dst_dt) {
+ case s32:
+ if (masked) {
+ vpmaskmovd(ptr[reg_ptr_dst_i8 + offset], vreg_mask, vreg_dst_s32(jj, ll));
+ } else
+ vmovups(ptr[reg_ptr_dst_i8 + offset], vreg_dst_s32(jj, ll));
+ break;
+ case s8:
+ store_i8(true, masked, vreg_dst_s32(jj, ll));
+ break;
+ case u8:
+ store_i8(false, masked, vreg_dst_s32(jj, ll));
+ break;
+ default: assert(!"unsupported dst data_type");
+ }
+}
+
+template <>
+void jit_uni_i8i8_pooling_fwd_ker_t::store_dst_avg_op(int jj, int ll,
+ size_t offset, bool masked, uint64_t msk) {
+ using namespace data_type;
+
+ // Don't generate useless code
+ if (masked && !msk)
+ return;
+
+ const Vmm& vr_dst = masked ?
+ vreg_dst_s32(jj, ll) | mask(ll) :
+ vreg_dst_s32(jj, ll);
+
+ switch (jpp.dst_dt) {
+ case s32:
+ vmovups(ptr[reg_ptr_dst_i8 + offset], vr_dst);
+ break;
+ case s8:
+ vpmovdb(ptr[reg_ptr_dst_i8 + offset], vr_dst);
+ break;
+ case u8:
+ vpmovusdb(ptr[reg_ptr_dst_i8 + offset], vr_dst);
+ break;
+ default: assert(!"unsupported dst data_type");
+ }
+}
+
+
 template
-void jit_uni_i8i8_pool_fwd_ker_t::store_dst(int jj, int c_step) {
+void jit_uni_i8i8_pooling_fwd_ker_t::store_dst(int jj, int ll,
+ int c_tail) {
 using namespace data_type;
 
- int repeats = isa == sse42 && c_step != 1 ? 
2 : 1; + int c_block = jpp.c_block; + int ur_c = jpp.ur_c; + switch(jpp.alg) { case pooling_max: { - auto offset = jj*c_step*sizeof_dst_dt(); - if (c_step == jpp.c_block) { - for (int ii = 0; ii < repeats; ii++) - uni_vmovups(ptr[reg_ptr_dst_i8 + offset + (jpp.c_block / 2) * ii * sizeof_dst_dt()], vmm_dst(jj, ii)); - } else if (c_step == 1) { - if (jpp.src_dt == s32) { - movq(reg_src_64, xmm_dst(jj)); - mov(ptr[reg_ptr_dst_i8 + offset], reg_src_32); - } else { - movq(reg_src_64, xmm_dst(jj)); - mov(ptr[reg_ptr_dst_i8 + offset], reg_src_8); - } - } + auto offset = jj*c_block*sizeof_dst_dt(); + bool masked = jj == ur_c - 1 && c_tail; + store_dst_max_op(jj, ll, offset, masked, jpp.tail[ll]); break; } case pooling_avg_include_padding: case pooling_avg_exclude_padding: { - auto offset = jj*c_step*sizeof_dst_dt(); - switch (jpp.dst_dt) { - case s32: - if (c_step == jpp.c_block) { - for (int ii = 0; ii < repeats; ii++) - uni_vmovups(ptr[reg_ptr_dst_i8 + offset + (jpp.c_block / 2) * ii * sizeof_dst_dt()], vmm_dst_s32(jj, ii)); - } else if (c_step == 1) { - movq(reg_src_64, xmm_dst_s32(jj, 0)); - mov(ptr[reg_ptr_dst_i8 + offset], reg_src_32); - } - break; - case s8: - if (c_step == jpp.c_block) { - for (int ii = 0; ii < repeats; ii++) { - uni_vpackssdw(vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii)); - - if (isa != sse42) - vpermq(ymm_dst_s32(jj, ii), ymm_dst_s32(jj, ii), 0x08); - - uni_vpacksswb(xmm_dst_s32(jj, ii), xmm_dst_s32(jj, ii), xmm_dst_s32(jj, ii)); - - if (isa != sse42) - movq(ptr[reg_ptr_dst_i8 + offset + (jpp.c_block / 2) * ii * sizeof_dst_dt()], xmm_dst_s32(jj, ii)); - else - movd(ptr[reg_ptr_dst_i8 + offset + (jpp.c_block / 2) * ii * sizeof_dst_dt()], xmm_dst_s32(jj, ii)); - } - } else if (c_step == 1) { - vpackssdw(vmm_dst_s32(jj, 0), vmm_dst_s32(jj, 0), vmm_dst_s32(jj, 0)); - vpacksswb(xmm_dst_s32(jj, 0), xmm_dst_s32(jj, 0), xmm_dst_s32(jj, 0)); - movq(reg_src_64, xmm_dst_s32(jj, 0)); - mov(ptr[reg_ptr_dst_i8 + offset], reg_src_8); - } - break; - case u8: - if (c_step == jpp.c_block) { - for (int ii = 0; ii < repeats; ii++) { - uni_vpackusdw(vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii)); - - if (isa != sse42) - vpermq(ymm_dst_s32(jj, ii), ymm_dst_s32(jj, ii), 0x08); - - uni_vpackuswb(xmm_dst_s32(jj, ii), xmm_dst_s32(jj, ii), xmm_dst_s32(jj, ii)); - - if (isa != sse42) - movq(ptr[reg_ptr_dst_i8 + offset + (jpp.c_block / 2) * ii * sizeof_dst_dt()], xmm_dst_s32(jj, ii)); - else - movd(ptr[reg_ptr_dst_i8 + offset + (jpp.c_block / 2) * ii * sizeof_dst_dt()], xmm_dst_s32(jj, ii)); - } - } else if (c_step == 1) { - vpackusdw(vmm_dst_s32(jj, 0), vmm_dst_s32(jj, 0), vmm_dst_s32(jj, 0)); - vpackuswb(xmm_dst_s32(jj, 0), xmm_dst_s32(jj, 0), xmm_dst_s32(jj, 0)); - movq(reg_src_64, xmm_dst_s32(jj, 0)); - mov(ptr[reg_ptr_dst_i8 + offset], reg_src_8); - } - break; - default: assert(!"unsuppotred dst data_type"); - } + auto offset = (ll*(c_block/max_num_ll) + jj*c_block)*sizeof_dst_dt(); + bool masked = jj == ur_c - 1 && c_tail; + store_dst_avg_op(jj, ll, offset, masked, jpp.tail[ll]); break; } default: assert(!"unsupported pooling algorithm"); } } +template <> +void jit_uni_i8i8_pooling_fwd_ker_t::compute_max_op(const int jj) +{ + using namespace data_type; + switch (jpp.src_dt) { + case s32: + vpmaxsd(vreg_dst(jj), vreg_dst(jj), vreg_src(jj)); + break; + case s8: + vpmaxsb(vreg_dst(jj), vreg_dst(jj), vreg_src(jj)); + break; + case u8: + vpmaxub(vreg_dst(jj), vreg_dst(jj), vreg_src(jj)); + break; + default: assert(!"unsupported src data type"); + } +} + +template <> 
+void jit_uni_i8i8_pooling_fwd_ker_t::compute_max_op(const int jj) +{ + using namespace data_type; + + // Compare + switch (jpp.src_dt) { + case s32: + vpcmpd(k_cmp_mask, vreg_dst(jj), vreg_src(jj), _cmp_lt_os); + break; + case s8: + vpcmpb(k_cmp_mask, vreg_dst(jj), vreg_src(jj), _cmp_lt_os); + break; + case u8: + vpcmpub(k_cmp_mask, vreg_dst(jj), vreg_src(jj), _cmp_lt_os); + break; + default: assert(!"unsupported src data type"); + } + + // move max values into vreg_dst + if (jpp.src_dt == s32) + vpblendmd(vreg_dst(jj) | k_cmp_mask, vreg_dst(jj), vreg_src(jj)); + else + vpblendmb(vreg_dst(jj) | k_cmp_mask, vreg_dst(jj), vreg_src(jj)); +} + + template -void jit_uni_i8i8_pool_fwd_ker_t::compute_max_step(int ur_c, int c_step) +void jit_uni_i8i8_pooling_fwd_ker_t::compute_max_step(int ur_c, int c_tail) { Label l_kw, l_kh; int iw = jpp.iw; int c = jpp.c; - int repeats = isa == sse42 && c_step != 1 ? 2 : 1; - - for (int jj = 0; jj < ur_c; jj++) { - for (int ii = 0; ii < repeats; ii++) { - uni_vmovups(vmm_dst(jj, ii), vreg_tmp); - } - } + for (int jj = 0; jj < ur_c; jj++) + vmovups(vreg_dst(jj), vreg_tmp); mov(aux_reg_src_h, reg_ptr_src_i8); @@ -329,18 +538,8 @@ void jit_uni_i8i8_pool_fwd_ker_t::compute_max_step(int ur_c, int c_step) L(l_kw); { for (int jj = 0; jj < ur_c; jj++) { - load_src(jj, c_step); - - for (int ii = 0; ii < repeats; ii++) { - if (jpp.src_dt == data_type::s32) { - uni_vpmaxsd(vmm_dst(jj, ii), vmm_dst(jj, ii), vmm_src(jj, ii)); - } else { - if (jpp.src_dt == data_type::s8) - uni_vpmaxsb(vmm_dst(jj, ii), vmm_dst(jj, ii), vmm_src(jj, ii)); - else - uni_vpmaxub(vmm_dst(jj, ii), vmm_dst(jj, ii), vmm_src(jj, ii)); - } - } + load_src(jj, 0, c_tail); + compute_max_op(jj); } add(aux_reg_src_w, c * sizeof_src_dt()); inc(ki); @@ -354,11 +553,11 @@ void jit_uni_i8i8_pool_fwd_ker_t::compute_max_step(int ur_c, int c_step) } for (int jj = 0; jj < ur_c; jj++) - store_dst(jj, c_step); + store_dst(jj, 0, c_tail); } template -void jit_uni_i8i8_pool_fwd_ker_t::compute_avg_step(int ur_c, int c_step) +void jit_uni_i8i8_pooling_fwd_ker_t::compute_avg_step(int ur_c, int c_tail) { using namespace data_type; @@ -367,12 +566,16 @@ void jit_uni_i8i8_pool_fwd_ker_t::compute_avg_step(int ur_c, int c_step) int iw = jpp.iw; int c = jpp.c; - int repeats = isa == sse42 && c_step != 1 ? 
2 : 1; + const int num_ll = data_type_size(avg_proc_dt)/data_type_size(jpp.src_dt); for (int jj = 0; jj < ur_c; jj++) { - for (int ii = 0; ii < repeats; ii++) { - uni_vpxor(vmm_src_s32(jj, ii), vmm_src_s32(jj, ii), vmm_src_s32(jj, ii)); - uni_vpxor(vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii)); + for (int ll = 0; ll < num_ll; ll++) { + bool masked = jj == ur_c - 1 && c_tail; + size_t msk = jpp.tail[ll]; + if (!(masked && !msk)) { + uni_vpxor(vreg_src_s32(jj, ll), vreg_src_s32(jj, ll), vreg_src_s32(jj, ll)); + uni_vpxor(vreg_dst_s32(jj, ll), vreg_dst_s32(jj, ll), vreg_dst_s32(jj, ll)); + } } } @@ -386,10 +589,14 @@ void jit_uni_i8i8_pool_fwd_ker_t::compute_avg_step(int ur_c, int c_step) L(l_kw); { for (int jj = 0; jj < ur_c; jj++) { - load_src(jj, c_step); - - for (int ii = 0; ii < repeats; ii++) { - uni_vpaddd(vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii), vmm_src_s32(jj, ii)); + for (int ll = 0; ll < num_ll; ll++) { + bool masked = jj == ur_c - 1 && c_tail; + size_t msk = jpp.tail[ll]; + if (!(masked && !msk)) { + load_src(jj, ll, c_tail); + vpaddd(vreg_dst_s32(jj, ll), vreg_dst_s32(jj, ll), + vreg_src_s32(jj, ll)); + } } } add(aux_reg_src_w, c * sizeof_src_dt()); @@ -404,82 +611,171 @@ void jit_uni_i8i8_pool_fwd_ker_t::compute_avg_step(int ur_c, int c_step) } for (int jj = 0; jj < ur_c; jj++) { - for (int ii = 0; ii < repeats; ii++) { - uni_vcvtdq2ps(vmm_dst_f32(jj, ii), vmm_dst_s32(jj, ii)); + for (int ll = 0; ll < num_ll; ll++) { + bool masked = jj == ur_c - 1 && c_tail; + size_t msk = jpp.tail[ll]; + if (!(masked && !msk)) { - if (isa == sse42) - mulps(vmm_dst_f32(jj, ii), vreg_tmp); - else - vfmadd132ps(vmm_dst_f32(jj, ii), vreg_zeros, vreg_tmp); + vcvtdq2ps(vreg_dst_f32(jj, ll), vreg_dst_s32(jj, ll)); + vfmadd132ps(vreg_dst_f32(jj, ll), vreg_zeros, vreg_tmp); - uni_vcvtps2dq(vmm_dst_s32(jj, ii), vmm_dst_f32(jj, ii)); - } + if (isa == avx2) { + uni_vroundps(vreg_dst_f32(jj, ll), vreg_dst_f32(jj, ll), rnd_op_nearest); + vcvtps2dq(vreg_dst_s32(jj, ll), vreg_dst_f32(jj, ll)); + } else if (isa >= avx512_common) { + // AVX512: use of EVEX-embedded static rounding override + vcvtps2dq(vreg_dst_s32(jj, ll) | T_rn_sae, vreg_dst_f32(jj, ll)); + } - store_dst(jj, c_step); + store_dst(jj, ll, c_tail); + } + } } } template -void jit_uni_i8i8_pool_fwd_ker_t::compute_step(int ur_c, int c_step) { +void jit_uni_i8i8_pooling_fwd_ker_t::compute_step(int ur_c, int c_tail) { switch (jpp.alg) { case pooling_max: - compute_max_step(ur_c, c_step); break; + compute_max_step(ur_c, c_tail); break; case pooling_avg_include_padding: case pooling_avg_exclude_padding: - compute_avg_step(ur_c, c_step); break; + compute_avg_step(ur_c, c_tail); break; default: assert(!"unsupported pooling algorithm"); } } template -void jit_uni_i8i8_pool_fwd_ker_t::compute_c_block() { +void jit_uni_i8i8_pooling_fwd_ker_t::compute_c_block(){ Label l_main_loop; - Label l_tail_loop; - Label exit; + int nb_c = jpp.nb_c; + int c_block = jpp.c_block; int ur_c = jpp.ur_c; + int ur_c_tail = jpp.ur_c_tail; + int c_steps = nb_c / ur_c; + int c_tail = jpp.c_tail; xor_(c_iter, c_iter); + if (c_steps > 0) { + L(l_main_loop); { + compute_step(ur_c, 0); + add(reg_ptr_src_i8, ur_c*c_block*sizeof_src_dt()); + add(reg_ptr_dst_i8, ur_c*c_block*sizeof_dst_dt()); + inc(c_iter); + cmp(c_iter, c_steps); + jl(l_main_loop, T_NEAR); + } + } - L(l_main_loop); - { - cmp(c_iter, jpp.c - ur_c * jpp.c_block); - jg(l_tail_loop, T_NEAR); + if (ur_c_tail != 0) { + compute_step(ur_c_tail, c_tail); + } +} - compute_step(ur_c, jpp.c_block); +template<> 
+void jit_uni_i8i8_pooling_fwd_ker_t::init_mask() { + using namespace data_type; + using cpu_isa = cpu_isa_traits; + + // AVX2 mask initialization: mask stored in Ymm-regs + auto init = [&](uint64_t bit_mask, bool init_mask_q) { + const size_t QW_PER_VREG = cpu_isa::vlen / sizeof(uint64_t); + + uint64_t vmask[QW_PER_VREG]; + for (size_t i = 0; i < QW_PER_VREG; i++){ + + uint64_t qw_vmask=0ULL; + const size_t DBITS = 8*sizeof_src_dt(); + const uint64_t VMSK = 1ULL << (DBITS-1); + const size_t D_PER_QW = (8*sizeof(qw_vmask))/DBITS; + for (size_t j = 0; j < D_PER_QW; j++) { + if (bit_mask & 1) + qw_vmask |= VMSK << DBITS * j; + bit_mask >>= 1; + } + vmask[i] = qw_vmask; + } - add(reg_ptr_src_i8, ur_c * jpp.c_block * sizeof_src_dt()); - add(reg_ptr_dst_i8, ur_c * jpp.c_block * sizeof_dst_dt()); - add(c_iter, ur_c * jpp.c_block); - jmp(l_main_loop); - } + // Put QWORDS with target mask into xmm regs + const int xdst_i[QW_PER_VREG] = { + xreg_mask_lo.getIdx(), + xreg_mask_lo.getIdx(), + xreg_mask_hi.getIdx(), + xreg_mask_hi.getIdx() + }; + const int xsrc_i[QW_PER_VREG] = { + vreg_zeros.getIdx(), // 0-th qword insert in zeros -> {qw0, 0} + xreg_mask_lo.getIdx(), // 1-st and 0-th merge -> {qw0,qw1} + vreg_zeros.getIdx(), + xreg_mask_hi.getIdx() + }; + const uint8 qw_dst_idx[QW_PER_VREG] = {0, 1, 0, 1}; // qword index in 128-bit xreg + + for (size_t i = 0; i < QW_PER_VREG; i++) { + mov(reg_mask, vmask[i]); + vpinsrq(Xmm(xdst_i[i]), Xmm(xsrc_i[i]), reg_mask, qw_dst_idx[i]); + } - L(l_tail_loop); - { - cmp(c_iter, jpp.c - ur_c); - jg(exit, T_NEAR); + // Merge Low (xreg_mask_lo alias for vreg_mask.xreg) + // and High (xreg_mask_hi) into full vreg_mask + // vreg_mask -> {xreg_mask_hi, vreg_mask.xreg} + vinserti128(vreg_mask, vreg_mask, xreg_mask_hi, 1); - compute_step(ur_c, 1); + // Keep only low qword of mask in xreg_mask_q + if (init_mask_q) { + mov(reg_mask, vmask[0]); + vpinsrq(xreg_mask_q, Xmm(vreg_zeros.getIdx()), reg_mask, 0); + } + }; - add(reg_ptr_src_i8, ur_c * sizeof_src_dt()); - add(reg_ptr_dst_i8, ur_c * sizeof_dst_dt()); - add(c_iter, ur_c); - jmp(l_tail_loop); + uint64_t tail_mask = (1ULL << jpp.c_tail) - 1; + switch (jpp.alg) { + case pooling_max: + // For "max" we need mask only in case of non-zero tail + if (tail_mask) + init(tail_mask, false); + break; + case pooling_avg_include_padding: + case pooling_avg_exclude_padding: + // For "avg" we need mask: + // - s32 - in case of the non-zero tail + // - s8/u8 - irrespective of the tail + switch (jpp.src_dt) { + case s32: + if (tail_mask) + init(tail_mask, false); + break; + case s8: + case u8: + init(tail_mask ? 
tail_mask : ~0ULL, tail_mask == 0); + break; + default: assert(!"unsupported src data type"); + } + break; + default: assert(!"unsupported pooling algorithm"); } +} + +template<> +void jit_uni_i8i8_pooling_fwd_ker_t::init_mask() { - L(exit); + for (int ll = 0; ll < max_num_ll; ll++) { + mov(reg_mask, jpp.tail[ll]); + kmovq(mask(ll), reg_mask); + } } template -void jit_uni_i8i8_pool_fwd_ker_t::init_tmp_reg() { +void jit_uni_i8i8_pooling_fwd_ker_t::init_tmp_reg() { using namespace data_type; switch (jpp.alg) { case pooling_avg_include_padding: case pooling_avg_exclude_padding: - mov(reg_tmp, ptr[abi_param1 + offsetof(call_params_t, idivider)]); + mov(reg_tmp, ptr[reg_param + offsetof(call_params_t, idivider)]); movq(xmm_tmp, reg_tmp); - uni_vpbroadcastd(vreg_tmp, xmm_tmp); + vpbroadcastd(vreg_tmp, xmm_tmp); break; case pooling_max: switch (jpp.src_dt) { @@ -496,17 +792,10 @@ void jit_uni_i8i8_pool_fwd_ker_t::init_tmp_reg() { } movq(xmm_tmp, reg_tmp); - if (jpp.src_dt == s32) { - uni_vpbroadcastd(vreg_tmp, xmm_tmp); - } else { - if (isa == avx2) { - vpbroadcastb(vreg_tmp, xmm_tmp); - } else { - movups(vreg_tmp, xmm_tmp); - uni_vpxor(xmm_tmp, xmm_tmp, xmm_tmp); - pshufb(vreg_tmp, xmm_tmp); - } - } + if (jpp.src_dt == s32) + vpbroadcastd(vreg_tmp, xmm_tmp); + else + vpbroadcastb(vreg_tmp, xmm_tmp); break; default: assert(!"unsupported pooling algorithm"); } @@ -514,11 +803,17 @@ void jit_uni_i8i8_pool_fwd_ker_t::init_tmp_reg() { } template -void jit_uni_i8i8_pool_fwd_ker_t::generate() { +void jit_uni_i8i8_pooling_fwd_ker_t::generate() { preamble(); +#if !defined(_WIN32) + // Always use rcx as abi_param1 - + // see the note about maskmovdqu near reg_param. + mov(rcx, rdi); +#endif + # define READ_PARAM(reg, field) \ - mov(reg, ptr[abi_param1 + offsetof(call_params_t, field)]) + mov(reg, ptr[reg_param + offsetof(call_params_t, field)]) READ_PARAM(reg_ptr_src_i8, src_i8); READ_PARAM(reg_ptr_dst_i8, dst_i8); READ_PARAM(reg_kw, kw_range); @@ -526,22 +821,23 @@ void jit_uni_i8i8_pool_fwd_ker_t::generate() { # undef READ_PARAM - init_tmp_reg(); - uni_vpxor(vreg_zeros, vreg_zeros, vreg_zeros); + init_mask(); + + init_tmp_reg(); + compute_c_block(); postamble(); } template -status_t jit_uni_i8i8_pool_fwd_ker_t::init_conf(jit_pool_conf_t &jpp, +status_t jit_uni_i8i8_pooling_fwd_ker_t::init_conf(jit_pool_conf_t &jpp, const pooling_desc_t &pd, const memory_desc_wrapper &src_d, const memory_desc_wrapper &dst_d) { - if (!mayiuse(isa)) { + if (!mayiuse(isa)) return status::unimplemented; - } jpp.mb = src_d.dims()[0]; jpp.c = src_d.dims()[1]; @@ -563,71 +859,106 @@ status_t jit_uni_i8i8_pool_fwd_ker_t::init_conf(jit_pool_conf_t &jpp, jpp.src_dt = pd.src_desc.data_type; jpp.dst_dt = pd.dst_desc.data_type; - jpp.c_block = jpp.alg == pooling_max ? 32 / (jpp.src_dt == data_type::s32 ? 
4 : 1) : 8; + // data_type items per one vreg on the + // isa == avx2 : 32 bytes -> 32 for s8/u8, 8 for s32 + // isa == avx512* : 64 bytes -> 64 for s8/u8, 16 for s32 + int simd_w = cpu_isa_traits::vlen / data_type_size(jpp.src_dt); + + jpp.c_block = simd_w; jpp.c_tail = jpp.c % jpp.c_block; jpp.nb_c = jpp.c / jpp.c_block; jpp.ur_c = 1; - jpp.ur_c_tail = jpp.nb_c - (jpp.nb_c / jpp.ur_c)*jpp.ur_c + (jpp.c_tail != 0); + jpp.ur_c_tail = jpp.nb_c - (jpp.nb_c / jpp.ur_c)*jpp.ur_c + + (jpp.c_tail != 0); + + size_t tail_mask = (1ULL << jpp.c_tail) - 1; + + switch (jpp.alg) { + case pooling_max: + jpp.tail[0] = tail_mask; + jpp.tail[1] = 0; + jpp.tail[2] = 0; + jpp.tail[3] = 0; + break; + case pooling_avg_include_padding: + case pooling_avg_exclude_padding: { + // avg_proc_dt (s32) defines granularity (because u8/s8 processed as s32) + // avx2 : 8, avx512 : 16 + const size_t msk_gran = cpu_isa_traits::vlen / data_type_size(avg_proc_dt); + const size_t msk_msk = (1ULL << msk_gran) - 1; + size_t m = tail_mask; + for (size_t ll = 0; ll < max_num_ll; ll++) { + jpp.tail[ll] = m & msk_msk; + m = m >> msk_gran; + } + break; + } + default: return status::unimplemented; + } return status::success; } template status_t jit_uni_i8i8_pooling_fwd_t::pd_t::jit_conf() { - return jit_uni_i8i8_pool_fwd_ker_t::init_conf(jpp_, + return jit_uni_i8i8_pooling_fwd_ker_t::init_conf(jpp_, desc_, src_pd_.desc(), dst_pd_.desc()); } template -jit_uni_i8i8_pooling_fwd_t::jit_uni_i8i8_pooling_fwd_t(const pd_t *pd, +jit_uni_i8i8_pooling_fwd_t:: +jit_uni_i8i8_pooling_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), ker_(nullptr) { - ker_ = new jit_uni_i8i8_pool_fwd_ker_t(conf_.jpp_); -} + : cpu_primitive_t(apd, inputs, outputs), ker_(nullptr) +{ ker_ = new jit_uni_i8i8_pooling_fwd_ker_t(pd()->jpp_); } template -jit_uni_i8i8_pooling_fwd_t::~jit_uni_i8i8_pooling_fwd_t() { - delete ker_; -} +jit_uni_i8i8_pooling_fwd_t:: +~jit_uni_i8i8_pooling_fwd_t() { delete ker_; } template -void jit_uni_i8i8_pooling_fwd_t::execute_forward() { +void jit_uni_i8i8_pooling_fwd_t::execute_forward() const { auto src_i8 = reinterpret_cast(input_memory(0)); auto dst_i8 = reinterpret_cast(memory()); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); - const auto &jpp = conf_.jpp_; + const auto &jpp = pd()->jpp_; parallel_nd(jpp.mb, jpp.oh, jpp.ow, - [&](int n, int oh, int ow) { - const int ih = nstl::max(oh * jpp.stride_h - jpp.t_pad, 0); - const int iw = nstl::max(ow * jpp.stride_w - jpp.l_pad, 0); + [&](int n, int oh, int ow) { + const int ih = nstl::max(oh*jpp.stride_h - jpp.t_pad, 0); + const int iw = nstl::max(ow*jpp.stride_w - jpp.l_pad, 0); const int kh_start = nstl::max(0, jpp.t_pad - oh * jpp.stride_h); const int kh_end = nstl::min(jpp.kh, - jpp.ih + jpp.t_pad - oh * jpp.stride_h); + jpp.ih + jpp.t_pad - oh * jpp.stride_h); const int kw_start = nstl::max(0, jpp.l_pad - ow * jpp.stride_w); const int kw_end = nstl::min(jpp.kw, - jpp.iw + jpp.l_pad - ow * jpp.stride_w); + jpp.iw + jpp.l_pad - ow * jpp.stride_w); - auto p = call_params_t(); + auto p = typename jit_uni_i8i8_pooling_fwd_ker_t::call_params_t(); p.src_i8 = &src_i8[ - src_d.blk_off(n, 0, ih, iw) * src_d.data_type_size()]; + src_d.blk_off(n, 0, ih, iw) * src_d.data_type_size()]; p.dst_i8 = &dst_i8[ - dst_d.blk_off(n, 0, oh, ow) * 
dst_d.data_type_size()]; - p.kw_range = (size_t) (kw_end - kw_start); - p.kh_range = (size_t) (kh_end - kh_start); + dst_d.blk_off(n, 0, oh, ow) * dst_d.data_type_size()]; + p.kw_range = (size_t)(kw_end - kw_start); + p.kh_range = (size_t)(kh_end - kh_start); p.idivider = 1.0f / ((jpp.alg == pooling_avg_exclude_padding) ? - p.kh_range * p.kw_range : jpp.kw * jpp.kh); + p.kh_range*p.kw_range : jpp.kw*jpp.kh); ker_->ker_(&p); }); } +// Explicit instantiation only for supported values. +// +template struct jit_uni_i8i8_pooling_fwd_ker_t; +template struct jit_uni_i8i8_pooling_fwd_t; + +template struct jit_uni_i8i8_pooling_fwd_ker_t; template struct jit_uni_i8i8_pooling_fwd_t; -template struct jit_uni_i8i8_pooling_fwd_t; } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_i8i8_pooling.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_i8i8_pooling.hpp index 2e274ed..fe44d5a 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_i8i8_pooling.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_i8i8_pooling.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018 Intel Corporation +* Copyright 2017-2018 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,14 @@ * limitations under the License. *******************************************************************************/ -#ifndef CPU_JIT_uni_I8I8_POOLING_HPP -#define CPU_JIT_uni_I8I8_POOLING_HPP +#ifndef CPU_JIT_UNI_I8I8_POOLING_HPP +#define CPU_JIT_UNI_I8I8_POOLING_HPP #include "c_types_map.hpp" +#include "cpu_isa_traits.hpp" #include "cpu_pooling_pd.hpp" #include "cpu_engine.hpp" -#include "jit_generator.hpp" + #include "jit_primitive_conf.hpp" namespace mkldnn { @@ -28,7 +29,7 @@ namespace impl { namespace cpu { template -struct jit_uni_i8i8_pool_fwd_ker_t; +struct jit_uni_i8i8_pooling_fwd_ker_t; template struct jit_uni_i8i8_pooling_fwd_t : public cpu_primitive_t { @@ -40,11 +41,12 @@ struct jit_uni_i8i8_pooling_fwd_t : public cpu_primitive_t { DECLARE_COMMON_PD_T( JIT_IMPL_NAME_HELPER("jit:", isa, ""), - jit_uni_i8i8_pooling_fwd_t); + jit_uni_i8i8_pooling_fwd_t); virtual status_t init() override { assert(this->engine()->kind() == engine_kind::cpu); bool ok = true + && mayiuse(isa) && desc()->src_desc.ndims == 4 && set_default_params() == status::success && desc()->prop_kind == prop_kind::forward_inference @@ -75,20 +77,20 @@ struct jit_uni_i8i8_pooling_fwd_t : public cpu_primitive_t { } }; - jit_uni_i8i8_pooling_fwd_t(const pd_t *pd, + jit_uni_i8i8_pooling_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs); ~jit_uni_i8i8_pooling_fwd_t(); - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - pd_t conf_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } - jit_uni_i8i8_pool_fwd_ker_t *ker_; + jit_uni_i8i8_pooling_fwd_ker_t *ker_; }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_lrn.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_lrn.cpp index f774d44..00bea07 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_lrn.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_lrn.cpp @@ -26,23 +26,23 @@ namespace cpu { template jit_uni_lrn_fwd_t::jit_uni_lrn_fwd_t( - const pd_t *pd, + const pd_t 
*apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), ker_(nullptr) + : cpu_primitive_t(apd, inputs, outputs), ker_(nullptr) , ker_first_(nullptr), ker_last_(nullptr) { using namespace alg_kind; - const int C = conf_.C(); - const int H = conf_.H(); - const int W = conf_.W(); - const int ls = conf_.desc()->local_size; - float A = conf_.desc()->lrn_alpha / ls; - float K = conf_.desc()->lrn_k; + const int C = pd()->C(); + const int H = pd()->H(); + const int W = pd()->W(); + const int ls = pd()->desc()->local_size; + float A = pd()->desc()->lrn_alpha / ls; + float K = pd()->desc()->lrn_k; - auto pk = conf_.desc()->prop_kind; - auto ak = conf_.desc()->alg_kind; - auto dfmt = conf_.src_pd()->desc()->format; + auto pk = pd()->desc()->prop_kind; + auto ak = pd()->desc()->alg_kind; + auto dfmt = pd()->src_pd()->desc()->format; if (dfmt == nChw8c && ls == 5 && ak == lrn_across_channels) { ker_ = new jit_uni_lrn_fwd_kernel_f32( @@ -74,20 +74,20 @@ jit_uni_lrn_fwd_t::~jit_uni_lrn_fwd_t() { delete ker_; delete ker_first_; delete ker_last_; } template -void jit_uni_lrn_fwd_t::execute_forward() { +void jit_uni_lrn_fwd_t::execute_forward() const { using namespace alg_kind; auto src = reinterpret_cast(this->input_memory(0)); auto dst = reinterpret_cast(this->memory(0)); auto ws = reinterpret_cast(this->memory(1)); - const int N = conf_.MB(); - const int C = conf_.C(); - const int HW = conf_.H() * conf_.W(); - const int ls = conf_.desc()->local_size; + const int N = pd()->MB(); + const int C = pd()->C(); + const int HW = pd()->H() * pd()->W(); + const int ls = pd()->desc()->local_size; - auto ak = conf_.desc()->alg_kind; - auto dfmt = conf_.src_pd()->desc()->format; + auto ak = pd()->desc()->alg_kind; + auto dfmt = pd()->src_pd()->desc()->format; if (dfmt == nChw8c && ls == 5 && ak == lrn_across_channels) { parallel_nd(N, C / VECTOR_LENGTH, [&](int n, int c8) { @@ -177,18 +177,18 @@ status_t jit_uni_lrn_fwd_t::pd_t::init() { } template -jit_uni_lrn_bwd_t::jit_uni_lrn_bwd_t(const pd_t *pd, +jit_uni_lrn_bwd_t::jit_uni_lrn_bwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) + : cpu_primitive_t(apd, inputs, outputs) , ker_(nullptr), ker_first_(nullptr), ker_last_(nullptr) { using namespace alg_kind; - const int C = conf_.C(); - const int H = conf_.H(); - const int W = conf_.W(); - const int ls = conf_.desc()->local_size; - float A = conf_.desc()->lrn_alpha / ls; - float B = conf_.desc()->lrn_beta; + const int C = pd()->C(); + const int H = pd()->H(); + const int W = pd()->W(); + const int ls = pd()->desc()->local_size; + float A = pd()->desc()->lrn_alpha / ls; + float B = pd()->desc()->lrn_beta; int use_h_parallelizm = 0;// XXX if (C / VECTOR_LENGTH == 1) { @@ -212,16 +212,16 @@ jit_uni_lrn_bwd_t::~jit_uni_lrn_bwd_t() } template -void jit_uni_lrn_bwd_t::execute_backward() { +void jit_uni_lrn_bwd_t::execute_backward() const { auto src = reinterpret_cast(this->input_memory(0)); auto diff_dst = reinterpret_cast(this->input_memory(1)); auto ws = reinterpret_cast(this->input_memory(2)); auto diff_src = reinterpret_cast(this->memory(0)); - const int N = conf_.MB(); - const int C = conf_.C(); - const int H = conf_.H(); - const int W = conf_.W(); + const int N = pd()->MB(); + const int C = pd()->C(); + const int H = pd()->H(); + const int W = pd()->W(); int use_h_parallelizm = 0; // XXX if (use_h_parallelizm) { diff --git 
a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_lrn.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_lrn.hpp index c88e7af..f10fb52 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_lrn.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_lrn.hpp @@ -47,20 +47,20 @@ struct jit_uni_lrn_fwd_t: public cpu_primitive_t { virtual status_t init() override; }; - jit_uni_lrn_fwd_t(const pd_t *pd, const input_vector &inputs, + jit_uni_lrn_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs); ~jit_uni_lrn_fwd_t(); typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - pd_t conf_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } jit_uni_lrn_fwd_kernel_f32 *ker_, *ker_first_, *ker_last_; }; @@ -79,20 +79,20 @@ struct jit_uni_lrn_bwd_t: public cpu_primitive_t { virtual status_t init() override; }; - jit_uni_lrn_bwd_t(const pd_t *pd, const input_vector &inputs, + jit_uni_lrn_bwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs); ~jit_uni_lrn_bwd_t(); typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_backward(); e->set_state(event_t::ready); } private: - void execute_backward(); - pd_t conf_; + void execute_backward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } jit_uni_lrn_bwd_kernel_f32 *ker_, *ker_first_, *ker_last_; }; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_planar_conv_kernel_f32.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_planar_conv_kernel_f32.cpp new file mode 100644 index 0000000..bdba891 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_planar_conv_kernel_f32.cpp @@ -0,0 +1,760 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include +#include "c_types_map.hpp" +#include "nstl.hpp" +#include "type_helpers.hpp" +#include "utils.hpp" +#include "cpu_memory.hpp" + +#include "jit_uni_planar_conv_kernel_f32.hpp" +#include "cpu_isa_traits.hpp" + +#define GET_OFF(field) offsetof(jit_conv_call_s, field) + +namespace mkldnn { +namespace impl { +namespace cpu { + +using namespace mkldnn::impl::prop_kind; +using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::utils; + +using namespace Xbyak; + +template +void jit_uni_planar_conv_fwd_kernel_f32::load_src_scalar(int ur_h) { + Label init_done_label; + Label init_first_label; + + mov(reg_ci_flag, ptr[this->param1 + GET_OFF(flags)]); + if (jcp.with_bias) + mov(reg_bias, ptr[this->param1 + GET_OFF(bias)]); + + if (!jcp.with_sum) { + test(reg_ci_flag, FLAG_IC_FIRST); + jne(init_first_label, T_NEAR); + } + + for (int kk = 0; kk < ur_h; kk++) { + size_t offt = sizeof(float) * (kk * jcp.ow * jcp.oh_block_step); + movss(Xmm(kk), make_safe_addr(reg_output, offt, reg_long_offt)); + } + + if (jcp.with_sum && jcp.with_bias) { + test(reg_ci_flag, FLAG_IC_FIRST); + je(init_done_label, T_NEAR); + + movss(xmm_tmp, make_safe_addr(reg_bias, 0, reg_long_offt)); + for (int kk = 0; kk < ur_h; kk++) { + uni_vaddps(Vmm(kk), Vmm(kk), vmm_tmp); + } + } + + jmp(init_done_label, T_NEAR); + + L(init_first_label); + if (this->jcp.with_bias) { + movss(xmm_tmp, make_safe_addr(reg_bias, 0, reg_long_offt)); + for (int kk = 0; kk < ur_h; kk++) { + uni_vmovups(Vmm(kk), vmm_tmp); + } + } else { + for (int kk = 0; kk < ur_h; kk++) { + uni_vpxor(Vmm(kk), Vmm(kk), Vmm(kk)); + } + } + + L(init_done_label); +} + +template +void jit_uni_planar_conv_fwd_kernel_f32::filter_scalar(int ur_h) { + Label iter_exit_label; + + int iw = jcp.iw; + int ih = jcp.ih; + int id = jcp.id; + int dilate_w = jcp.dilate_w + 1; + int ic_blk = jcp.ic_block; + int kw = jcp.kw; + int kh = jcp.kh; + int kd = jcp.kd; + + cmp(reg_kw, 0); + je(iter_exit_label, T_NEAR); + + mov(aux_reg_input_w, aux_reg_input_h); + mov(aux_reg_kernel_w, aux_reg_kernel_h); + mov(kw_iter, reg_kw); + + Label kw_label; + L(kw_label); + { + for (size_t ifm2 = 0; ifm2 < (size_t)ic_blk; ifm2++) { + for (int kk = 0; kk < ur_h; kk++) { + size_t inp_off = sizeof(float) * (ifm2 * id * ih * iw + kk * jcp.iw * jcp.oh_block_step); + movss(xmm_src, make_safe_addr(aux_reg_input_w, inp_off, reg_long_offt)); + + size_t ker_off = sizeof(float) * (ifm2 * kd * kh * kw); + movss(xmm_ker, ptr[aux_reg_kernel_w + ker_off]); + + uni_vfmadd231ps(Vmm(kk), vmm_src, vmm_ker); + } + } + + add(aux_reg_kernel_w, sizeof(float)); + add(aux_reg_input_w, dilate_w * sizeof(float)); + + dec(kw_iter); + cmp(kw_iter, 0); + jg(kw_label, T_NEAR); + } + + L(iter_exit_label); +} + +template +void jit_uni_planar_conv_fwd_kernel_f32::apply_filter_scalar(int ur_h) { + int iw = jcp.iw; + int kw = jcp.kw; + int dilate_h = jcp.dilate_h + 1; + int dilate_d = jcp.dilate_h + 1; + const int inp_mult_h = dilate_h; + const int inp_mult_d = dilate_d; + + Label skip_kh_loop, skip_kd_loop, kd_label; + if (jcp.ndims == 5) { + push(reg_kernel); + push(reg_output); + + mov(reg_kd, ptr[param1 + GET_OFF(kd_padding)]); + mov(aux_reg_ker_d, aux_reg_kernel_h); + mov(aux_reg_inp_d, aux_reg_input_h); + + cmp(reg_kd, 0); + je(skip_kd_loop, T_NEAR); + + L(kd_label); + mov(kh_iter, ptr[param1 + GET_OFF(kh_padding)]); + } else { + mov(kh_iter, reg_kh); + } + + if (jcp.ndims == 5) { + mov(aux_reg_input_h, aux_reg_inp_d); + 
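For reference, the kd/kh/kw walk that these JIT loops implement reduces, for a single output point, to the following scalar model (a plain C++ sketch with illustrative names, not part of the patch); the (dilate + 1)-element steps match the add(aux_reg_input_w, dilate_w * sizeof(float)) pointer updates in filter_scalar()/filter():

// Accumulate one output pixel over a dilated kh x kw window.
static float conv_point(const float *in, const float *wei,
        int kh, int kw, int iw, int dil_h, int dil_w) {
    float acc = 0.f;
    for (int r = 0; r < kh; r++)
        for (int c = 0; c < kw; c++)
            acc += in[r * (dil_h + 1) * iw + c * (dil_w + 1)]
                    * wei[r * kw + c];
    return acc;
}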
mov(aux_reg_kernel_h, aux_reg_ker_d); + } + + cmp(kh_iter, 0); + je(skip_kh_loop, T_NEAR); + + Label kh_label; + L(kh_label); + { + filter_scalar(ur_h); + + add(aux_reg_kernel_h, sizeof(float) * kw); + add(aux_reg_input_h, sizeof(float) * iw * inp_mult_h); + + dec(kh_iter); + cmp(kh_iter, 0); + jg(kh_label, T_NEAR); + } + + L(skip_kh_loop); + + if (jcp.ndims == 5) { + add(aux_reg_ker_d, sizeof(float) * jcp.kw * jcp.kh); + add(aux_reg_inp_d, sizeof(float) * jcp.ih * jcp.iw * inp_mult_d); + + dec(reg_kd); + cmp(reg_kd, 0); + jg(kd_label, T_NEAR); + L(skip_kd_loop); + + pop(reg_output); + pop(reg_kernel); + } +} + +template +void jit_uni_planar_conv_fwd_kernel_f32::apply_postprocess_scalar(int ur_h) { + Label regular_store_label; + + mov(reg_ci_flag, ptr[this->param1 + GET_OFF(flags)]); + test(reg_ci_flag, FLAG_IC_LAST); + je(regular_store_label, T_NEAR); + + int eltwise_inj_idx = 0; + const auto &p = attr_.post_ops_; + + + for (int i = 0; i < p.len_; i++) { + auto& post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + eltwise_injectors[eltwise_inj_idx]->compute_vector_range(0, ur_h); + eltwise_inj_idx++; + } + } + + L(regular_store_label); +} + +template +void jit_uni_planar_conv_fwd_kernel_f32::store_dst_scalar(int ur_h) { + for (int kk = 0; kk < ur_h; kk++) { + size_t o_off = sizeof(float) * (kk * jcp.ow * jcp.oh_block_step); + movss(make_safe_addr(reg_output, o_off, reg_long_offt), Xmm(kk)); + } +} + +template +void jit_uni_planar_conv_fwd_kernel_f32::load_src(int ur_h, int ur_w) { + Label init_done_label; + Label init_first_label; + + mov(reg_ci_flag, ptr[this->param1 + GET_OFF(flags)]); + if (jcp.with_bias) + mov(reg_bias, ptr[this->param1 + GET_OFF(bias)]); + + if (!jcp.with_sum) { + test(reg_ci_flag, FLAG_IC_FIRST); + jne(init_first_label, T_NEAR); + } + + for (int kk = 0; kk < ur_h; kk++) { + for (int jj = 0; jj < ur_w; jj++) { + size_t offt = sizeof(float) * (jj * jcp.ow_block + kk * jcp.ow * jcp.oh_block_step); + uni_vmovups(Vmm(kk * ur_w + jj), make_safe_addr(reg_output, offt, reg_long_offt)); + } + } + + if (jcp.with_sum && jcp.with_bias) { + test(reg_ci_flag, FLAG_IC_FIRST); + je(init_done_label, T_NEAR); + + uni_vbroadcastss(vmm_tmp, make_safe_addr(reg_bias, 0, reg_long_offt)); + for (int kk = 0; kk < ur_h; kk++) { + for (int jj = 0; jj < ur_w; jj++) { + uni_vaddps(Vmm(kk * ur_w + jj), Vmm(kk * ur_w + jj), vmm_tmp); + } + } + } + + jmp(init_done_label, T_NEAR); + + L(init_first_label); + if (this->jcp.with_bias) { + uni_vbroadcastss(vmm_tmp, make_safe_addr(reg_bias, 0, reg_long_offt)); + for (int kk = 0; kk < ur_h; kk++) { + for (int jj = 0; jj < ur_w; jj++) { + uni_vmovups(Vmm(kk * ur_w + jj), vmm_tmp); + } + } + } else { + for (int kk = 0; kk < ur_h; kk++) { + for (int jj = 0; jj < ur_w; jj++) { + uni_vpxor(Vmm(kk * ur_w + jj), Vmm(kk * ur_w + jj), Vmm(kk * ur_w + jj)); + } + } + } + + L(init_done_label); +} + +template +void jit_uni_planar_conv_fwd_kernel_f32::filter_unrolled(int ur_h, int ur_w) { + int iw = jcp.iw; + int ih = jcp.ih; + int id = jcp.id; + int stride_w = jcp.stride_w; + int dilate_w = jcp.dilate_w + 1; + int ic_blk = jcp.ic_block; + int kw = jcp.kw; + int kh = jcp.kh; + int kd = jcp.kd; + int ow_blk = jcp.ow_block; + + for (int ki = 0; ki < kw; ki++) { + for (int ifm2 = 0; ifm2 < ic_blk; ifm2++) { + for (int kk = 0; kk < ur_h; kk++) { + for (int jj = 0; jj < ur_w; jj++) { + size_t inp_off = sizeof(float) * ((size_t) ifm2 * id * ih * iw + ki * dilate_w + + jj * stride_w * ow_blk + kk * jcp.ow * jcp.oh_block_step); + uni_vmovups(vmm_src, 
make_safe_addr(aux_reg_input_h, inp_off, reg_long_offt)); + + int ker_off = sizeof(float) * ((size_t) ifm2 * kd * kh * kw + ki); + uni_vbroadcastss(vmm_ker, ptr[aux_reg_kernel_h + ker_off]); + + uni_vfmadd231ps(Vmm(kk * ur_w + jj), vmm_src, vmm_ker); + } + } + } + } +} + +template +void jit_uni_planar_conv_fwd_kernel_f32::filter(int ur_h) { + Label iter_exit_label; + + int iw = jcp.iw; + int ih = jcp.ih; + int id = jcp.id; + int dilate_w = jcp.dilate_w + 1; + int ic_blk = jcp.ic_block; + int kw = jcp.kw; + int kh = jcp.kh; + int kd = jcp.kd; + + cmp(reg_kw, 0); + je(iter_exit_label, T_NEAR); + + mov(aux_reg_input_w, aux_reg_input_h); + mov(aux_reg_kernel_w, aux_reg_kernel_h); + mov(kw_iter, reg_kw); + + Label kw_label; + L(kw_label); + { + for (int ifm2 = 0; ifm2 < ic_blk; ifm2++) { + for (int kk = 0; kk < ur_h; kk++) { + size_t inp_off = sizeof(float) * ((size_t) ifm2 * id * ih * iw + kk * jcp.ow * jcp.oh_block_step); + uni_vmovups(vmm_src, make_safe_addr(aux_reg_input_w, inp_off, reg_long_offt)); + + size_t ker_off = sizeof(float) * ((size_t) ifm2 * kd * kh * kw); + uni_vbroadcastss(vmm_ker, ptr[aux_reg_kernel_w + ker_off]); + + uni_vfmadd231ps(Vmm(kk), vmm_src, vmm_ker); + } + } + + add(aux_reg_kernel_w, sizeof(float)); + add(aux_reg_input_w, dilate_w * sizeof(float)); + + dec(kw_iter); + cmp(kw_iter, 0); + jg(kw_label, T_NEAR); + } + + L(iter_exit_label); +} + +template +void jit_uni_planar_conv_fwd_kernel_f32::apply_filter(int ur_h, int ur_w) { + int iw = jcp.iw; + int kw = jcp.kw; + int dilate_h = jcp.dilate_h + 1; + int dilate_d = jcp.dilate_h + 1; + const int inp_mult_h = dilate_h; + const int inp_mult_d = dilate_d; + + Label skip_kh_loop, skip_kd_loop, kd_label; + if (jcp.ndims == 5) { + push(reg_kernel); + push(reg_output); + + mov(reg_kd, ptr[param1 + GET_OFF(kd_padding)]); + mov(aux_reg_ker_d, aux_reg_kernel_h); + mov(aux_reg_inp_d, aux_reg_input_h); + + cmp(reg_kd, 0); + je(skip_kd_loop, T_NEAR); + + L(kd_label); + mov(kh_iter, ptr[param1 + GET_OFF(kh_padding)]); + } else { + mov(kh_iter, reg_kh); + } + + if (jcp.ndims == 5) { + mov(aux_reg_input_h, aux_reg_inp_d); + mov(aux_reg_kernel_h, aux_reg_ker_d); + } + + cmp(kh_iter, 0); + je(skip_kh_loop, T_NEAR); + + Label kh_label; + L(kh_label); + { + if (ur_w == jcp.nb_ow_blocking) + filter_unrolled(ur_h, ur_w); + else + filter(ur_h); + + add(aux_reg_kernel_h, sizeof(float) * kw); + add(aux_reg_input_h, sizeof(float) * iw * inp_mult_h); + + dec(kh_iter); + cmp(kh_iter, 0); + jg(kh_label, T_NEAR); + } + + L(skip_kh_loop); + + if (jcp.ndims == 5) { + add(aux_reg_ker_d, sizeof(float) * jcp.kw * jcp.kh); + add(aux_reg_inp_d, sizeof(float) * jcp.ih * jcp.iw * inp_mult_d); + + dec(reg_kd); + cmp(reg_kd, 0); + jg(kd_label, T_NEAR); + L(skip_kd_loop); + + pop(reg_output); + pop(reg_kernel); + } +} + +template +void jit_uni_planar_conv_fwd_kernel_f32::apply_postprocess(int ur_h, int ur_w) { + Label regular_store_label; + + mov(reg_ci_flag, ptr[this->param1 + GET_OFF(flags)]); + test(reg_ci_flag, FLAG_IC_LAST); + je(regular_store_label, T_NEAR); + + int eltwise_inj_idx = 0; + const auto &p = attr_.post_ops_; + + for (int i = 0; i < p.len_; i++) { + auto& post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + eltwise_injectors[eltwise_inj_idx]->compute_vector_range(0, ur_w * ur_h); + eltwise_inj_idx++; + } + } + + L(regular_store_label); +} + +template +void jit_uni_planar_conv_fwd_kernel_f32::store_dst(int ur_h, int ur_w) { + for (int kk = 0; kk < ur_h; kk++) { + for (int jj = 0; jj < ur_w; jj++) { + size_t o_off = sizeof(float) * (jj * 
jcp.ow_block + kk * jcp.ow * jcp.oh_block_step); + uni_vmovups(make_safe_addr(reg_output, o_off, reg_long_offt), Vmm(kk * ur_w + jj)); + } + } +} + +template +void jit_uni_planar_conv_fwd_kernel_f32::solve_common(int ur_h) { + auto solve_loop = [&](int ur_w, int step_w) { + Label loop_label; + Label exit_label; + + L(loop_label); + { + if (step_w == 1) { + load_src_scalar(ur_h); + apply_filter_scalar(ur_h); + apply_postprocess_scalar(ur_h); + store_dst_scalar(ur_h); + } else { + load_src(ur_h, ur_w); + apply_filter(ur_h, ur_w); + apply_postprocess(ur_h, ur_w); + store_dst(ur_h, ur_w); + } + + add(reg_input, sizeof(float) * step_w * jcp.stride_w); + add(reg_output, sizeof(float) * step_w); + } + + L(exit_label); + }; + + Label left_border_label; + Label main_loop_unrolled_label; + Label main_loop_label; + Label right_border_label; + Label exit_label; + + xor_(reg_ow, reg_ow); + sub(reg_input, sizeof(float) * jcp.l_pad); + + auto adjust_indexes_left = [&]() { + Label border_indexes_label; + Label border_indexes_exit_label; + + mov(reg_wj, jcp.l_pad); + sub(reg_wj, reg_ow); + L(border_indexes_label); + { + cmp(reg_wj, 0); + jle(border_indexes_exit_label, T_NEAR); + + add(aux_reg_kernel_h, sizeof(float)); + add(aux_reg_input_h, sizeof(float) * (jcp.dilate_w + 1)); + dec(reg_kw); + sub(reg_wj, jcp.dilate_w + 1); + + jmp(border_indexes_label); + + L(border_indexes_exit_label); + } + }; + + auto adjust_indexes_right = [&]() { + Label border_indexes_right_label; + Label border_indexes_right_exit_label; + + imul(reg_wj, reg_ow, jcp.stride_w); + add(reg_wj, (jcp.kw-1) * (jcp.dilate_w+1) - jcp.l_pad+1 - jcp.iw); + + L(border_indexes_right_label); + { + cmp(reg_wj, 0); + jle(border_indexes_right_exit_label, T_NEAR); + + dec(reg_kw); + sub(reg_wj, jcp.dilate_w + 1); + + jmp(border_indexes_right_label); + + L(border_indexes_right_exit_label); + } + }; + + int left_border_end = nstl::min(div_up(jcp.l_pad, jcp.stride_w), jcp.ow); + L(left_border_label); { + cmp(reg_ow, left_border_end); + jge(main_loop_unrolled_label, T_NEAR); + + mov(aux_reg_input_h, reg_input); + mov(aux_reg_kernel_h, reg_kernel); + mov(reg_kw, jcp.kw); + + adjust_indexes_left(); + adjust_indexes_right(); + + solve_loop(1, 1); // scalar + + inc(reg_ow); + jmp(left_border_label, T_NEAR); + } + + int main_loop_end = (jcp.iw - (jcp.kw - 1)*(jcp.dilate_w + 1) + jcp.l_pad - 1) / jcp.stride_w + 1; + L(main_loop_unrolled_label); { + cmp(reg_ow, main_loop_end - jcp.nb_ow_blocking * jcp.ow_block); + jg(main_loop_label, T_NEAR); + + mov(aux_reg_input_h, reg_input); + mov(aux_reg_kernel_h, reg_kernel); + mov(reg_kw, jcp.kw); + + solve_loop(jcp.nb_ow_blocking, jcp.nb_ow_blocking * jcp.ow_block); + + add(reg_ow, jcp.nb_ow_blocking * jcp.ow_block); + jmp(main_loop_unrolled_label, T_NEAR); + } + + L(main_loop_label); { + cmp(reg_ow, main_loop_end - jcp.ow_block); + jg(right_border_label, T_NEAR); + + mov(aux_reg_input_h, reg_input); + mov(aux_reg_kernel_h, reg_kernel); + mov(reg_kw, jcp.kw); + + solve_loop(1, jcp.ow_block); // vectorized + + add(reg_ow, jcp.ow_block); + jmp(main_loop_label, T_NEAR); + } + + int right_border_end = jcp.ow; + L(right_border_label); { + cmp(reg_ow, right_border_end); + jge(exit_label, T_NEAR); + + mov(aux_reg_input_h, reg_input); + mov(aux_reg_kernel_h, reg_kernel); + mov(reg_kw, jcp.kw); + + adjust_indexes_left(); + adjust_indexes_right(); + + solve_loop(1, 1); // scalar + + inc(reg_ow); + jmp(right_border_label, T_NEAR); + } + + L(exit_label); +} + +template +void jit_uni_planar_conv_fwd_kernel_f32::generate() { + const 
auto &p = attr_.post_ops_; + for (int i = 0; i < p.len_; i++) { + auto &post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + eltwise_injectors.push_back(new jit_uni_eltwise_injector_f32( + this, + post_op.eltwise.alg, + post_op.eltwise.alpha, + post_op.eltwise.beta + )); + } + } + + this->preamble(); + + mov(reg_input, ptr[this->param1 + GET_OFF(src)]); + mov(reg_output, ptr[this->param1 + GET_OFF(dst)]); + mov(reg_kernel, ptr[this->param1 + GET_OFF(filt)]); + mov(reg_kh, ptr[this->param1 + GET_OFF(kh_padding)]); + mov(reg_oh_blocks, ptr[this->param1 + GET_OFF(oh_blocks)]); + + Label tail_label; + Label exit_label; + + solve_common(1); + + this->postamble(); + + for (auto& inj : eltwise_injectors) + inj->prepare_table(); +} + +template +bool jit_uni_planar_conv_fwd_kernel_f32::post_ops_ok( + jit_conv_conf_t &jcp, const primitive_attr_t &attr) { + const auto &p = attr.post_ops_; + + auto is_eltwise = [&](int idx) { return p.entry_[idx].is_eltwise(); }; + auto is_sum = [&](int idx) { return p.entry_[idx].is_sum(); }; + auto is_simple = [&](int idx) { return is_eltwise(idx); }; + + switch (p.len_) { + case 0: return true; // no post_ops + case 1: + return true // sum OR eltwise OR depthwise + && !jcp.with_eltwise && (is_simple(0) || is_sum(0)); + case 2: + return true // sum->relu + && !jcp.with_eltwise && ((is_sum(0) && is_simple(1)) || + (is_simple(0) && is_simple(1))); + case 3: + return true // sum->relu + && !jcp.with_eltwise && (is_sum(0) && is_simple(1) && is_simple(2)); + default: return false; + } + + return false; +} + +template +status_t jit_uni_planar_conv_fwd_kernel_f32::init_conf(jit_conv_conf_t &jcp, + const convolution_desc_t &cd, const memory_desc_wrapper &src_d, + const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d, + const primitive_attr_t &attr) { + if (!mayiuse(isa)) return status::unimplemented; + + jcp.prop_kind = cd.prop_kind; + + const bool with_groups = weights_d.ndims() == src_d.ndims() + 1; + int ndims = src_d.ndims(); + jcp.ndims = ndims; + + jcp.ngroups = with_groups ? weights_d.dims()[0] : 1; + jcp.mb = src_d.dims()[0]; + + jcp.oc = dst_d.dims()[1] / jcp.ngroups; + jcp.oc_without_padding = jcp.oc; + jcp.ic = src_d.dims()[1] / jcp.ngroups; + + jcp.id = (ndims == 5) ? src_d.dims()[2] : 1; + jcp.ih = src_d.dims()[ndims-2]; + jcp.iw = src_d.dims()[ndims-1]; + jcp.od = (ndims == 5) ? dst_d.dims()[2] : 1; + jcp.oh = dst_d.dims()[ndims-2]; + jcp.ow = dst_d.dims()[ndims-1]; + jcp.kd = (ndims == 5) ? weights_d.dims()[with_groups + 2] : 1; + jcp.kh = weights_d.dims()[with_groups + ndims-2]; + jcp.kw = weights_d.dims()[with_groups + ndims-1]; + + jcp.f_pad = (ndims == 5) ? cd.padding[0][0] : 0; + jcp.t_pad = cd.padding[0][ndims-4]; + jcp.l_pad = cd.padding[0][ndims-3]; + jcp.stride_d = (ndims == 5) ? cd.strides[0] : 1; + jcp.stride_h = cd.strides[ndims-4]; + jcp.stride_w = cd.strides[ndims-3]; + + jcp.dilate_d = (ndims == 5) ? cd.dilates[0] : 0; + jcp.dilate_h = cd.dilates[ndims-4]; + jcp.dilate_w = cd.dilates[ndims-3]; + + jcp.b_pad = (jcp.oh - 1) * jcp.stride_h + (jcp.kh - 1) * (jcp.dilate_h + 1) + - (jcp.ih + jcp.t_pad - 1); + + jcp.src_fmt = src_d.format(); + jcp.with_bias = cd.bias_desc.format != memory_format::undef; + jcp.with_eltwise = false; + + if (!post_ops_ok(jcp, attr)) + return status::unimplemented; + + const auto &p = attr.post_ops_; + jcp.with_sum = p.find(primitive_kind::sum) != -1; + + const int simd_w = isa == avx512_common ? 
16 : 8; + + bool args_ok = true + && one_of(src_d.format(), nchw, ncdhw) + && one_of(weights_d.format(), oihw, oidhw) + && one_of(cd.bias_desc.format, memory_format::undef, any, x) + && one_of(dst_d.format(), nchw, ncdhw); + if (!args_ok) return status::unimplemented; + + // This convolution implementation was introduced as workaround to provide competitive performance on MSD topology. + // The conditions below are needed to bound applicability scope. + args_ok = jcp.ngroups == 1 && + jcp.oc == 1 && + jcp.stride_d == 1 && jcp.stride_h == 1 && jcp.stride_w == 1; + + if (!args_ok) return status::unimplemented; + + jcp.ur_w = 1; + + jcp.ow_block = simd_w; + jcp.nb_ow_blocking = isa == avx512_common ? 3 : 3; + + jcp.oh_block = 1; + jcp.nb_oh_blocking = 1; + jcp.oh_block_step = 1; // (jcp.dilate_h + 1); + + jcp.oc_block = 1; + jcp.nb_oc = jcp.oc / jcp.oc_block; + jcp.nb_oc_blocking = 1; + + jcp.ic_block = 1; + jcp.nb_ic = jcp.ic / jcp.ic_block; + jcp.nb_ic_blocking = 1; + + return status::success; +} + +template struct jit_uni_planar_conv_fwd_kernel_f32; +template struct jit_uni_planar_conv_fwd_kernel_f32; + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_planar_conv_kernel_f32.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_planar_conv_kernel_f32.hpp new file mode 100644 index 0000000..f5104ec --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_planar_conv_kernel_f32.hpp @@ -0,0 +1,135 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef JIT_UNI_PLANAR_CONV_KERNEL_F32_HPP +#define JIT_UNI_PLANAR_CONV_KERNEL_F32_HPP + +#include "c_types_map.hpp" +#include "jit_generator.hpp" +#include "jit_primitive_conf.hpp" +#include "jit_uni_eltwise.hpp" +#include "jit_uni_depthwise.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +template +struct jit_uni_planar_conv_fwd_kernel_f32: public jit_generator { + jit_uni_planar_conv_fwd_kernel_f32(jit_conv_conf_t ajcp, + const primitive_attr_t &attr): jcp(ajcp), attr_(attr) + { + this->generate(); + jit_ker = (void (*)(jit_conv_call_s *))this->getCode(); + } + + ~jit_uni_planar_conv_fwd_kernel_f32() { + for (auto inj : eltwise_injectors) + delete inj; + eltwise_injectors.clear(); + + for (auto inj : depthwise_injectors) + delete inj; + depthwise_injectors.clear(); + } + + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_planar_conv_fwd_kernel_f32) + + static bool post_ops_ok(jit_conv_conf_t &jcp, + const primitive_attr_t &attr); + static status_t init_conf(jit_conv_conf_t &jcp, + const convolution_desc_t &cd, const memory_desc_wrapper &src_d, + const memory_desc_wrapper &weights_d, + const memory_desc_wrapper &dst_d, + const primitive_attr_t &attr); + + jit_conv_conf_t jcp; + const primitive_attr_t &attr_; + void (*jit_ker)(jit_conv_call_s *); + +private: + using Vmm = typename utils::conditional3::type; + using reg64_t = const Xbyak::Reg64; + using reg32_t = const Xbyak::Reg32; + const Xbyak::AddressFrame &vmmword = (isa == sse42) + ? xword : (isa == avx2) ? yword : zword; + + reg64_t reg_input = r8; + reg64_t reg_kernel = r9; + reg64_t reg_output = r10; + + reg64_t aux_reg_input_h = r11; + reg64_t aux_reg_kernel_h = r12; + + reg64_t aux_reg_input_w = r13; + reg64_t aux_reg_kernel_w = r14; + + reg64_t aux_reg_inp_d = r9; + reg64_t aux_reg_ker_d = r10; + + reg64_t reg_kd = rbx; + reg64_t reg_kh = rdx; + reg64_t reg_kw = rsi; + + reg64_t kh_iter = rax; + reg64_t kw_iter = abi_not_param1; + + reg64_t reg_bias = r13; + reg64_t reg_long_offt = r15; + reg32_t reg_ci_flag = r15d; + + reg64_t reg_d_weights = r15; + reg64_t reg_d_bias = kh_iter; + + reg64_t reg_ow = rbp; + + reg64_t reg_oh_blocks = aux_reg_kernel_w; + + reg64_t reg_wj = aux_reg_input_w; + + Vmm vmm_ker = Vmm(15); + Vmm vmm_tmp = Vmm(15); + Vmm vmm_src = Vmm(14); + Xbyak::Xmm xmm_ker = Xbyak::Xmm(15); + Xbyak::Xmm xmm_tmp = Xbyak::Xmm(15); + Xbyak::Xmm xmm_src = Xbyak::Xmm(14); + + nstl::vector*> eltwise_injectors; + nstl::vector*> depthwise_injectors; + + inline void load_src(int ur_h, int ur_w); + inline void filter(int ur_h); + inline void filter_unrolled(int ur_h, int ur_w); + inline void apply_filter(int ur_h, int ur_w); + inline void apply_postprocess(int ur_h, int ur_w); + inline void store_dst(int ur_h, int ur_w); + inline void solve_common(int ur_h); + + inline void filter_scalar(int ur_h); + inline void load_src_scalar(int ur_h); + inline void apply_filter_scalar(int ur_h); + inline void apply_postprocess_scalar(int ur_h); + inline void store_dst_scalar(int ur_h); + + void generate(); +}; + +} +} +} + +#endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_planar_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_planar_convolution.cpp new file mode 100644 index 0000000..5a8f302 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_planar_convolution.cpp @@ -0,0 +1,172 @@ +/******************************************************************************* +* Copyright 2019 Intel 
Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include +#include "mkldnn_types.h" + +#include "c_types_map.hpp" +#include "jit_uni_planar_convolution.hpp" +#include "utils.hpp" +#include "mkldnn_thread.hpp" +#include "type_helpers.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +using namespace mkldnn::impl::status; +using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::utils; + +#define src_blk_off(f, n, c, d, h, w) \ + pd()->ndims() == 5 \ + ? (f).blk_off(n, c, d, h, w) \ + : (f).blk_off(n, c, h, w) + +#define wht_blk_off(f, g, oc, ic, kd, kh, kw) \ + pd()->ndims() == 5 \ + ? pd()->with_groups() \ + ? (f).blk_off(g, oc, ic, kd, kh, kw) \ + : (f).blk_off(oc, ic, kd, kh, kw) \ + : pd()->with_groups() \ + ? (f).blk_off(g, oc, ic, kh, kw) \ + : (f).blk_off(oc, ic, kh, kw) + +template +void _jit_uni_planar_convolution_fwd_t::execute_forward() const { + auto src = reinterpret_cast(this->input_memory(0)); + auto weights = reinterpret_cast(this->input_memory(1)); + auto bias = reinterpret_cast(this->input_memory(2)); + auto dst = reinterpret_cast(this->memory()); + + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + const memory_desc_wrapper bias_d(pd()->weights_pd(1)); + + const auto &jcp = kernel_->jcp; + const int MB = pd()->MB(); + + int od_indexes[jcp.od]; + + int idx = 0; + for (int i = 0; i < (jcp.dilate_d + 1); i++) { + for (int ib = 0; ib < jcp.od; ib += (jcp.dilate_d + 1)) { + if (ib + i >= jcp.od) + continue; + + od_indexes[idx++] = ib + i; + if (idx >= jcp.od) + break; + } + if (idx >= jcp.od) + break; + } + + int threads_count = mkldnn_get_max_threads(); + int odb_size = div_up(jcp.od, threads_count); + + auto kernel_params = [&](int n, int g, int icb, int oc, int od, int oh, int oh_blocks, int id, int wd, int kd_padding) { + auto par_conv = jit_conv_call_s(); + + const int hj = oh * jcp.stride_h; + const int i_t_overflow = nstl::max(0, jcp.t_pad - hj); + const int i_b_overflow = nstl::max(jcp.ih, hj + (jcp.kh - 1) * (jcp.dilate_h + 1) - jcp.t_pad + 1) - jcp.ih; + const int ih = nstl::max(hj - jcp.t_pad + div_up(i_t_overflow, (jcp.dilate_h + 1)) * (jcp.dilate_h + 1), 0); + const int wh = div_up(i_t_overflow, (jcp.dilate_h + 1)); + const int kh_padding = jcp.kh - div_up(i_t_overflow, (jcp.dilate_h + 1)) - div_up(i_b_overflow, (jcp.dilate_h + 1)); + + const size_t _oc = oc; + const size_t _ic = g * jcp.nb_ic + icb; + + par_conv.src = &src[src_blk_off(src_d, n, _ic, id, ih, 0)]; + par_conv.dst = &dst[src_blk_off(dst_d, n, _oc, od, oh, 0)]; + par_conv.filt = &weights[wht_blk_off(weights_d, g, _oc, _ic, wd, wh, 0)]; + + if (icb == 0) { + if (bias) + par_conv.bias = &bias[bias_d.blk_off(_oc)]; + par_conv.flags |= FLAG_IC_FIRST; + } + + if (icb + 1 == jcp.nb_ic) { + par_conv.flags |= FLAG_IC_LAST; + } + + par_conv.oc_off = _oc * sizeof(float); + 
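The kh_padding computed above clips the kernel to the rows that actually overlap the input once top/bottom padding and dilation are taken into account. The same arithmetic as a standalone sketch (hypothetical helper, handy for unit-checking the border cases):

#include <algorithm>

// Mirrors i_t_overflow / i_b_overflow above; d = dilate_h + 1.
static int clipped_kh(int oh, int stride_h, int t_pad, int kh, int dil_h, int ih) {
    auto div_up = [](int a, int b) { return (a + b - 1) / b; };
    const int d = dil_h + 1;
    const int hj = oh * stride_h;
    const int top = std::max(0, t_pad - hj);
    const int bottom = std::max(ih, hj + (kh - 1) * d - t_pad + 1) - ih;
    return kh - div_up(top, d) - div_up(bottom, d);
}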
par_conv.oh_blocks = (size_t)oh_blocks; + + par_conv.kh_padding = (size_t)nstl::max(0, kh_padding); + par_conv.kd_padding = (size_t)nstl::max(0, kd_padding); + + return par_conv; + }; + + auto ker = [&](const int ithr, const int nthr) { + int g = 0; + int oc = 0; + + for (int n = 0; n < MB; n++) { + int icbb = 0; + while (icbb < jcp.nb_ic) { + int icb_step = jcp.nb_ic_blocking; + int icb_step_rem = jcp.nb_ic - icbb; + if (icb_step_rem < jcp.nb_ic_blocking_max) + icb_step = icb_step_rem; + + for (int icb = icbb; icb < icbb + icb_step; ++icb) { + for (int ohb = 0; ohb < (jcp.dilate_h + 1); ohb++) { + for (int oh = ohb; oh < jcp.oh; oh += (jcp.dilate_h + 1)) { + int od_idx_off = ithr * odb_size; + for (int od_idx = 0; od_idx < odb_size; od_idx++) { + if ((od_idx_off + od_idx) >= jcp.od || od_indexes[od_idx_off + od_idx] >= jcp.od) + continue; + int od = od_indexes[od_idx_off + od_idx]; + + const int dj = od * jcp.stride_d; + const int d_t_overflow = nstl::max(0, jcp.f_pad - dj); + const int d_b_overflow = + nstl::max(jcp.id, dj + (jcp.kd - 1) * (jcp.dilate_d + 1) - jcp.f_pad + 1) - + jcp.id; + const int id = nstl::max(dj - jcp.f_pad + + div_up(d_t_overflow, (jcp.dilate_d + 1)) * (jcp.dilate_d + 1), + 0); + const int wd = div_up(d_t_overflow, (jcp.dilate_d + 1)); + const int kd_padding = jcp.kd - div_up(d_t_overflow, (jcp.dilate_d + 1)) - + div_up(d_b_overflow, (jcp.dilate_d + 1)); + + jit_conv_call_s par_conv = kernel_params(n, g, icb, oc, od, oh, 1, id, wd, kd_padding); + + kernel_->jit_ker(&par_conv); + } + } + } + } + icbb += icb_step; + } + } + }; + + parallel(0, ker); +} + + +template struct _jit_uni_planar_convolution_fwd_t; +template struct _jit_uni_planar_convolution_fwd_t; + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_planar_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_planar_convolution.hpp new file mode 100644 index 0000000..007ebb8 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_planar_convolution.hpp @@ -0,0 +1,119 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef CPU_JIT_UNI_PLANAR_CONVOLUTION_HPP +#define CPU_JIT_UNI_PLANAR_CONVOLUTION_HPP + +#include "c_types_map.hpp" +#include "cpu_convolution_pd.hpp" +#include "cpu_engine.hpp" +#include "cpu_reducer.hpp" +#include "jit_primitive_conf.hpp" +#include "jit_uni_planar_conv_kernel_f32.hpp" +#include "mkldnn_thread.hpp" +#include "jit_uni_depthwise.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +template +struct _jit_uni_planar_convolution_fwd_t: public cpu_primitive_t { + struct pd_t: public cpu_convolution_fwd_pd_t { + pd_t(engine_t *engine, const convolution_desc_t *adesc, + const primitive_attr_t *attr, + const typename pd_t::base_class *hint_fwd_pd) + : cpu_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) + , jcp_() {} + + DECLARE_COMMON_PD_T( + JIT_IMPL_NAME_HELPER("jit_planar:", isa, ""), + _jit_uni_planar_convolution_fwd_t); + + virtual status_t init() override { + using namespace prop_kind; + assert(this->engine()->kind() == engine_kind::cpu); + bool ok = true + && this->set_default_params() == status::success + && utils::one_of(this->desc()->prop_kind, forward_training, + forward_inference) + && this->desc()->alg_kind == alg_kind::convolution_direct + && !this->has_zero_dim_memory() + && utils::everyone_is(data_type::f32, + this->desc()->src_desc.data_type, + this->desc()->weights_desc.data_type, + this->desc()->dst_desc.data_type) + && IMPLICATION(this->with_bias(), + data_type::f32 == this->desc()->bias_desc.data_type); + if (!ok) return status::unimplemented; + + status_t sts = jit_uni_planar_conv_fwd_kernel_f32::init_conf(jcp_, *this->desc(), + *this->src_pd_.desc(), *this->weights_pd_.desc(), + *this->dst_pd_.desc(), *this->attr()); + + return sts; + } + + jit_conv_conf_t jcp_; + + protected: + virtual status_t set_default_params() override { + using namespace memory_format; + + if (this->src_pd_.desc()->format == any) + CHECK(this->src_pd_.set_format(this->ndims() == 4 ? nchw : ncdhw)); + if (this->dst_pd_.desc()->format == any) + CHECK(this->dst_pd_.set_format(this->ndims() == 4 ? nchw : ncdhw)); + if (this->weights_pd_.desc()->format == any) + CHECK(this->weights_pd_.set_format(this->ndims() == 4 ? 
oihw : oidhw)); + if (this->bias_pd_.desc()->format == any) + CHECK(this->bias_pd_.set_format(x)); + return status::success; + } + }; + + _jit_uni_planar_convolution_fwd_t(const pd_t *apd, + const input_vector &inputs, const output_vector &outputs) + : cpu_primitive_t(apd, inputs, outputs) { + kernel_ = new jit_uni_planar_conv_fwd_kernel_f32(pd()->jcp_, *pd()->attr()); + } + + ~_jit_uni_planar_convolution_fwd_t() { + delete kernel_; + }; + + typedef typename prec_traits::type data_t; + + virtual void execute(event_t *e) const { + execute_forward(); + e->set_state(event_t::ready); + } + +private: + void execute_forward() const; + + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + jit_uni_planar_conv_fwd_kernel_f32 *kernel_; +}; + +using jit_avx512_common_planar_convolution_fwd_t = _jit_uni_planar_convolution_fwd_t; +using jit_avx2_planar_convolution_fwd_t = _jit_uni_planar_convolution_fwd_t; + +} +} +} + +#endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_pooling.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_pooling.cpp index 8e2a03e..d85f338 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_pooling.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_pooling.cpp @@ -26,20 +26,20 @@ namespace impl { namespace cpu { template -void jit_uni_pooling_fwd_t::execute_forward() { +void jit_uni_pooling_fwd_t::execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); auto dst = reinterpret_cast(this->memory(0)); - auto indices = conf_.desc()->alg_kind == alg_kind::pooling_max ? + auto indices = pd()->desc()->alg_kind == alg_kind::pooling_max ? reinterpret_cast(this->memory(1)) : nullptr; - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); - const memory_desc_wrapper indices_d(conf_.workspace_pd()); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper indices_d(pd()->workspace_pd()); const size_t ind_dt_size = indices ? types::data_type_size(indices_d.data_type()) : 0; - const auto &jpp = conf_.jpp_; - int mb = conf_.MB(); + const auto &jpp = pd()->jpp_; + int mb = pd()->MB(); auto ker = [&](int n, int b_c, int oh) { auto arg = jit_pool_call_s(); @@ -59,7 +59,7 @@ void jit_uni_pooling_fwd_t::execute_forward() { arg.kh_padding = jpp.kh - i_t_overflow - i_b_overflow; arg.kh_padding_shift = i_t_overflow*jpp.kw; arg.kw_padding = 0; - arg.ker_area_h = conf_.desc()->alg_kind == alg_kind::pooling_avg_exclude_padding + arg.ker_area_h = pd()->desc()->alg_kind == alg_kind::pooling_avg_exclude_padding ? (float)(jpp.kh - nstl::max(0, oh*jpp.stride_h - jpp.t_pad + jpp.kh - jpp.ih) - nstl::max(0, jpp.t_pad - oh*jpp.stride_h)) : (float)(jpp.kh - nstl::max(0, oh*jpp.stride_h - jpp.t_pad + jpp.kh - jpp.ih - jpp.b_pad)); @@ -74,20 +74,20 @@ void jit_uni_pooling_fwd_t::execute_forward() { } template -void jit_uni_pooling_fwd_t::execute_forward_3d() { +void jit_uni_pooling_fwd_t::execute_forward_3d() const { auto src = reinterpret_cast(this->input_memory(0)); auto dst = reinterpret_cast(this->memory(0)); - auto indices = conf_.desc()->alg_kind == alg_kind::pooling_max ? + auto indices = pd()->desc()->alg_kind == alg_kind::pooling_max ? 
reinterpret_cast(this->memory(1)) : nullptr; - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); - const memory_desc_wrapper indices_d(conf_.workspace_pd()); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper indices_d(pd()->workspace_pd()); const size_t ind_dt_size = indices ? types::data_type_size(indices_d.data_type()) : 0; - const auto &jpp = conf_.jpp_; - int mb = conf_.MB(); + const auto &jpp = pd()->jpp_; + int mb = pd()->MB(); auto ker = [&](int n, int b_c, int od, int oh, int id, int d_t_overflow, int d_b_overflow) { @@ -135,20 +135,20 @@ void jit_uni_pooling_fwd_t::execute_forward_3d() { template -void jit_uni_pooling_bwd_t::execute_backward() { +void jit_uni_pooling_bwd_t::execute_backward() const { auto diff_dst = reinterpret_cast(this->input_memory(0)); auto diff_src = reinterpret_cast(this->memory(0)); - auto indices = conf_.desc()->alg_kind == alg_kind::pooling_max ? + auto indices = pd()->desc()->alg_kind == alg_kind::pooling_max ? reinterpret_cast(this->input_memory(1)) : nullptr; - const memory_desc_wrapper diff_src_d(conf_.diff_src_pd()); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper indices_d(conf_.workspace_pd()); + const memory_desc_wrapper diff_src_d(pd()->diff_src_pd()); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper indices_d(pd()->workspace_pd()); const size_t ind_dt_size = indices ? types::data_type_size(indices_d.data_type()) : 0; - const auto &jpp = conf_.jpp_; - int mb = conf_.MB(); + const auto &jpp = pd()->jpp_; + int mb = pd()->MB(); auto ker = [&](int n, int b_c, int oh) { auto arg = jit_pool_call_s(); @@ -183,20 +183,20 @@ void jit_uni_pooling_bwd_t::execute_backward() { } template -void jit_uni_pooling_bwd_t::execute_backward_3d() { +void jit_uni_pooling_bwd_t::execute_backward_3d() const { auto diff_dst = reinterpret_cast(this->input_memory(0)); auto diff_src = reinterpret_cast(this->memory(0)); - auto indices = conf_.desc()->alg_kind == alg_kind::pooling_max ? + auto indices = pd()->desc()->alg_kind == alg_kind::pooling_max ? reinterpret_cast(this->input_memory(1)) : nullptr; - const memory_desc_wrapper diff_src_d(conf_.diff_src_pd()); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper indices_d(conf_.workspace_pd()); + const memory_desc_wrapper diff_src_d(pd()->diff_src_pd()); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper indices_d(pd()->workspace_pd()); const size_t ind_dt_size = indices ? 
types::data_type_size(indices_d.data_type()) : 0; - const auto &jpp = conf_.jpp_; - int mb = conf_.MB(); + const auto &jpp = pd()->jpp_; + int mb = pd()->MB(); auto ker = [&](int n, int b_c, int od, int oh, int id, int d_t_overflow, int d_b_overflow, int zero_size, int kd) { diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_pooling.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_pooling.hpp index 520ab12..25d3d79 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_pooling.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_pooling.hpp @@ -91,25 +91,25 @@ struct jit_uni_pooling_fwd_t: public cpu_primitive_t { } }; - jit_uni_pooling_fwd_t(const pd_t *pd, const input_vector &inputs, + jit_uni_pooling_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - { kernel_ = new jit_uni_pool_kernel_f32(conf_.jpp_); } + : cpu_primitive_t(apd, inputs, outputs) + { kernel_ = new jit_uni_pool_kernel_f32(pd()->jpp_); } ~jit_uni_pooling_fwd_t() { delete kernel_; } typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { - if (conf_.jpp_.ndims == 5) execute_forward_3d(); + virtual void execute(event_t *e) const { + if (pd()->jpp_.ndims == 5) execute_forward_3d(); else execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - void execute_forward_3d(); - pd_t conf_; + void execute_forward() const; + void execute_forward_3d() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } jit_uni_pool_kernel_f32 *kernel_; }; @@ -175,25 +175,25 @@ struct jit_uni_pooling_bwd_t: public cpu_primitive_t { } }; - jit_uni_pooling_bwd_t(const pd_t *pd, const input_vector &inputs, + jit_uni_pooling_bwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - { kernel_ = new jit_uni_pool_kernel_f32(conf_.jpp_); } + : cpu_primitive_t(apd, inputs, outputs) + { kernel_ = new jit_uni_pool_kernel_f32(pd()->jpp_); } ~jit_uni_pooling_bwd_t() { delete kernel_; } typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { - if (conf_.jpp_.ndims == 5) execute_backward_3d(); + virtual void execute(event_t *e) const { + if (pd()->jpp_.ndims == 5) execute_backward_3d(); else execute_backward(); e->set_state(event_t::ready); } private: - void execute_backward(); - void execute_backward_3d(); - pd_t conf_; + void execute_backward() const; + void execute_backward_3d() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } jit_uni_pool_kernel_f32 *kernel_; }; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_reorder.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_reorder.cpp index 81677ba..7afc3fb 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_reorder.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_reorder.cpp @@ -116,7 +116,7 @@ struct jit_uni_reorder_kernel_f32: public kernel_t, public jit_generator { && simple_impl_desc_init(p, nullptr) && mayiuse(sse42) && IMPLICATION(!utils::everyone_is(f32, p.itype, p.otype), - mayiuse(avx512_core)); + mayiuse(avx)); if (!ok) return false; const ptrdiff_t max_stride = (1LL<<31) - 1; @@ -306,14 +306,26 @@ struct jit_uni_reorder_kernel_f32: public kernel_t, public jit_generator { break; case s8: if (idt == f32) vcvtps2dq(xmm, xmm); - if (idt == f32 || idt == s32) vpmovsdb(xmm, xmm); - if (idt == u8) vpminub(xmm, xmm, xmm_127b); + 
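The hunk below replaces the AVX512-only narrowing converts (vpmovsdb/vpmovusdb) with a pack-instruction fallback that also runs on plain AVX. Two saturating packs compose into a single clamp; as a scalar model of the s32 -> s8 case (illustrative only):

#include <algorithm>
#include <cstdint>

// vpackssdw saturates 32 -> 16 bits, vpacksswb 16 -> 8 bits; the composition
// clamps into [-128, 127], which is what this helper does per element.
static int8_t saturate_s32_to_s8(int32_t v) {
    return static_cast<int8_t>(std::min(127, std::max(-128, v)));
}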
if (idt == f32 || idt == s32) { + if (mayiuse(avx512_core)) { + vpmovsdb(xmm, xmm); + } else { + vpackssdw(xmm, xmm, xmm_zero); + vpacksswb(xmm, xmm, xmm_zero); + } + } + if (idt == u8) vpminub(xmm, xmm, xmm_4x127b); break; case u8: if (idt == f32) vcvtps2dq(xmm, xmm); if (idt == f32 || idt == s32) { - vpmaxsd(xmm, xmm, xmm_zero); - vpmovusdb(xmm, xmm); + if (mayiuse(avx512_core)) { + vpmaxsd(xmm, xmm, xmm_zero); + vpmovusdb(xmm, xmm); + } else { + vpackssdw(xmm, xmm, xmm_zero); + vpackuswb(xmm, xmm, xmm_zero); + } } if (idt == s8) vpmaxsb(xmm, xmm, xmm_zero); break; @@ -495,7 +507,13 @@ struct jit_uni_reorder_kernel_f32: public kernel_t, public jit_generator { if (prb_.otype == f32) { addss(Xmm(ur), o_addr(o_off[ur])); } else { - vmovss(xmm_tmp, o_addr(o_off[ur])); + if (prb_.otype == s32) { + vmovss(xmm_tmp, o_addr(o_off[ur])); + } else if (utils::one_of(prb_.otype, s8, u8)) { + pinsrb(xmm_tmp, o_addr(o_off[ur]), 0x0); + } else { + assert(!"unsupported o_type"); + } cvt2ps(xmm_tmp, xmm_tmp, prb_.otype); addps(Xmm(ur), xmm_tmp); } @@ -631,13 +649,12 @@ struct jit_uni_reorder_kernel_f32: public kernel_t, public jit_generator { mov(reg_ptr_out, PARAM(out)); # undef PARAM - if (mayiuse(avx512_core)) { + if (mayiuse(avx)) { vxorps(xmm_zero, xmm_zero, xmm_zero); if (prb_.itype == data_type::u8 && prb_.otype == data_type::s8) { mov(reg_tmp.cvt32(), 0x7f7f7f7f); - movd(xmm_127b, reg_tmp.cvt32()); - vbroadcastss(xmm_127b, xmm_127b); + movd(xmm_4x127b, reg_tmp.cvt32()); } } @@ -663,7 +680,7 @@ private: Xmm xmm_scale = xmm15; Xmm xmm_zero = xmm14; - Xmm xmm_127b = xmm13; // TODO: unite with xmm_zero + Xmm xmm_4x127b = xmm13; // TODO: unite with xmm_zero Xmm xmm_tmp = xmm12; }; @@ -825,6 +842,12 @@ struct jit_uni_reorder_t : public cpu_primitive_t { auto prb = tr::prb_t(); + if (imd->format == mkldnn_OhIw8o4i || imd->format == mkldnn_gOhIw8o4i || + imd->format == mkldnn_OhIw8o4i_s8s8 || imd->format == mkldnn_gOhIw8o4i_s8s8 || + omd->format == mkldnn_OhIw8o4i || omd->format == mkldnn_gOhIw8o4i || + omd->format == mkldnn_OhIw8o4i_s8s8 || omd->format == mkldnn_gOhIw8o4i_s8s8) + return status::unimplemented; + status_t prb_init_status = prb_init(prb, *imd, *omd, attr); if (prb_init_status != success) return prb_init_status; @@ -863,97 +886,98 @@ struct jit_uni_reorder_t : public cpu_primitive_t { tr::kernel_t::desc_t ker_desc_; }; - jit_uni_reorder_t(const pd_t *pd, const input_vector &inputs, + jit_uni_reorder_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) { - kernel_ = tr::kernel_t::create(conf_.ker_desc_); + : cpu_primitive_t(apd, inputs, outputs) { + kernel_ = tr::kernel_t::create(pd()->ker_desc_); assert(kernel_); } ~jit_uni_reorder_t() { delete kernel_; } - void omp_driver_0d(int off, const char *in, char *out, const float *scale) { + void omp_driver_0d(int off, const char *in, char *out, + const float *scale) const { tr::call_param_t c{in, out, scale}; (*kernel_)(&c); } void omp_driver_1d(int ithr, int nthr, int off, const char *in, char *out, - const float *scale) { - tr::node_t *ns = conf_.prb_.nodes + off; + const float *scale) const { + const tr::node_t *ns = pd()->prb_.nodes + off; for_nd(ithr, nthr, (ptrdiff_t)ns[0].n, [&](ptrdiff_t d0) { auto c = tr::call_param_t(); - c.in = in + d0 * ns[0].is * data_type_size(conf_.prb_.itype); - c.out = out + d0 * ns[0].os * data_type_size(conf_.prb_.otype); + c.in = in + d0 * ns[0].is * data_type_size(pd()->prb_.itype); + c.out = out + d0 * ns[0].os * 
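// Note on the s8/u8 stores above: vpmovsdb / vpmovusdb are AVX-512-only
// down-converts, so enabling this kernel on plain AVX (the mayiuse(avx)
// relaxation earlier in this file) needs the classic two-step saturating
// packs instead. Intrinsics model of the fallback; illustrative only, the
// patch emits the same sequence through Xbyak:
#include <emmintrin.h>  // SSE2 is enough for the pack fallback

// s32 -> s8 with signed saturation (the vpmovsdb replacement)
static inline __m128i pack_s32_to_s8(__m128i v) {
    const __m128i z = _mm_setzero_si128();
    __m128i w = _mm_packs_epi32(v, z);  // 4x s32 -> s16, signed saturation
    return _mm_packs_epi16(w, z);       // s16 -> s8; low 4 bytes are valid
}

// s32 -> u8: _mm_packus_epi16 already clamps negatives to 0, which is why
// the fallback branch can drop the explicit vpmaxsd of the AVX-512 path
static inline __m128i pack_s32_to_u8(__m128i v) {
    const __m128i z = _mm_setzero_si128();
    __m128i w = _mm_packs_epi32(v, z);
    return _mm_packus_epi16(w, z);
}
// The xmm_127b -> xmm_4x127b rename is the same story: without the
// vbroadcastss, only the four bytes loaded by movd hold 0x7f, and four
// lanes are all this path needs.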
data_type_size(pd()->prb_.otype); c.scale = scale + d0 * ns[0].ss; (*kernel_)(&c); }); } void omp_driver_2d(int ithr, int nthr, int off, const char *in, char *out, - const float *scale) { - tr::node_t *ns = conf_.prb_.nodes + off; + const float *scale) const { + const tr::node_t *ns = pd()->prb_.nodes + off; for_nd(ithr, nthr, (ptrdiff_t)ns[1].n, (ptrdiff_t)ns[0].n, [&](ptrdiff_t d1, ptrdiff_t d0) { auto c = tr::call_param_t(); c.in = in + (d0 * ns[0].is + d1 * ns[1].is) - * data_type_size(conf_.prb_.itype); + * data_type_size(pd()->prb_.itype); c.out = out + (d0 * ns[0].os + d1 * ns[1].os) - * data_type_size(conf_.prb_.otype); + * data_type_size(pd()->prb_.otype); c.scale = scale + d0 * ns[0].ss + d1 * ns[1].ss; (*kernel_)(&c); }); } void omp_driver_3d(int ithr, int nthr, int off, const char *in, char *out, - const float *scale) { - tr::node_t *ns = conf_.prb_.nodes + off; + const float *scale) const { + const tr::node_t *ns = pd()->prb_.nodes + off; for_nd(ithr, nthr, (ptrdiff_t)ns[2].n, (ptrdiff_t)ns[1].n, (ptrdiff_t)ns[0].n, [&](ptrdiff_t d2, ptrdiff_t d1, ptrdiff_t d0) { auto c = tr::call_param_t(); c.in = in + (d0 * ns[0].is + d1 * ns[1].is + d2 * ns[2].is) - * data_type_size(conf_.prb_.itype); + * data_type_size(pd()->prb_.itype); c.out = out + (d0 * ns[0].os + d1 * ns[1].os + d2 * ns[2].os) - * data_type_size(conf_.prb_.otype); + * data_type_size(pd()->prb_.otype); c.scale = scale + d0 * ns[0].ss + d1 * ns[1].ss + d2 * ns[2].ss; (*kernel_)(&c); }); } void omp_driver_4d(int ithr, int nthr, int off, const char *in, char *out, - const float *scale) { - tr::node_t *ns = conf_.prb_.nodes + off; + const float *scale) const { + const tr::node_t *ns = pd()->prb_.nodes + off; for_nd(ithr, nthr, (ptrdiff_t)ns[3].n, (ptrdiff_t)ns[2].n, (ptrdiff_t)ns[1].n, (ptrdiff_t)ns[0].n, [&](ptrdiff_t d3, ptrdiff_t d2, ptrdiff_t d1, ptrdiff_t d0) { auto c = tr::call_param_t(); c.in = in + (d0 * ns[0].is + d1 * ns[1].is + d2 * ns[2].is - + d3 * ns[3].is) * data_type_size(conf_.prb_.itype); + + d3 * ns[3].is) * data_type_size(pd()->prb_.itype); c.out = out + (d0 * ns[0].os + d1 * ns[1].os + d2 * ns[2].os - + d3 * ns[3].os) * data_type_size(conf_.prb_.otype); + + d3 * ns[3].os) * data_type_size(pd()->prb_.otype); c.scale = scale + d0 * ns[0].ss + d1 * ns[1].ss + d2 * ns[2].ss + d3 * ns[3].ss; (*kernel_)(&c); }); } - void omp_driver(const char *in, char *out, const float *scale) { - in += conf_.prb_.ioff * data_type_size(conf_.prb_.itype); - out += conf_.prb_.ooff * data_type_size(conf_.prb_.otype); + void omp_driver(const char *in, char *out, const float *scale) const { + in += pd()->prb_.ioff * data_type_size(pd()->prb_.itype); + out += pd()->prb_.ooff * data_type_size(pd()->prb_.otype); - DEBUG({ printf("prb : "); tr::prb_dump(conf_.prb_); }); - DEBUG({ printf("ker : "); tr::prb_dump(conf_.ker_desc_.prb); }); + DEBUG({ printf("prb : "); tr::prb_dump(pd()->prb_); }); + DEBUG({ printf("ker : "); tr::prb_dump(pd()->ker_desc_.prb); }); - int ndims = conf_.prb_.ndims; - int ndims_ker = conf_.ker_desc_.prb.ndims; + int ndims = pd()->prb_.ndims; + int ndims_ker = pd()->ker_desc_.prb.ndims; assert(ndims - ndims_ker <= ndims_driver_max); if (ndims - ndims_ker == 0) { - set_rnd_mode(conf_.attr()->round_mode_); + set_rnd_mode(pd()->attr()->round_mode_); omp_driver_0d(ndims_ker, in, out, scale); restore_rnd_mode(); } else { parallel(0, [&](const int ithr, const int nthr) { - set_rnd_mode(conf_.attr()->round_mode_); + set_rnd_mode(pd()->attr()->round_mode_); switch (ndims - ndims_ker) { case 1: omp_driver_1d(ithr, 
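// Note on the omp_driver_*d family: pd()->prb_ describes the reorder as a
// list of loop nodes, each carrying an extent n and three strides -- is
// (input), os (output), ss (scale). The drivers parallelize the outermost
// ndims - ndims_ker nodes and leave the rest to the JIT kernel. A toy
// description of a 2x3 transpose in those terms (values illustrative):
#include <cstddef>

struct node_sketch_t { ptrdiff_t n, is, os, ss; };

// out[j][i] = in[i][j] for a 2(rows) x 3(cols) row-major input:
static const node_sketch_t ns[2] = {
    {3, 1, 2, 0},  // innermost: walk input columns, stride 2 in the output
    {2, 3, 1, 0},  // outer: walk input rows, stride 1 in the output
};
// offset math as used above: in_off = d0*ns[0].is + d1*ns[1].is (elements,
// scaled by data_type_size), likewise for out; the scale stride ss is 0
// when a single scale covers the whole tensor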
nthr, ndims_ker, in, out, scale); break; case 2: omp_driver_2d(ithr, nthr, ndims_ker, in, out, scale); break; @@ -966,11 +990,11 @@ struct jit_uni_reorder_t : public cpu_primitive_t { } } - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { auto in = reinterpret_cast(input_memory(0)); auto out = reinterpret_cast(memory()); - omp_driver(in, out, conf_.attr()->output_scales_.scales_); + omp_driver(in, out, pd()->attr()->output_scales_.scales_); e->set_state(event_t::ready); } @@ -978,7 +1002,7 @@ struct jit_uni_reorder_t : public cpu_primitive_t { enum { ndims_driver_max = 4 }; private: - pd_t conf_; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } tr::kernel_t *kernel_; }; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_reorder_utils.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_reorder_utils.cpp index cb9c1d1..cf193c8 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_reorder_utils.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_reorder_utils.cpp @@ -69,8 +69,11 @@ status_t cvt_mem_desc_to_layout_desc(const memory_desc_t &md_, case memory_format::any: case hwio_s8s8: case hwigo_s8s8: + case gOIhw4o4i_s8s8: + case gOIhw2i8o4i_s8s8: case gOIhw4i16o4i_s8s8: case OIhw4i16o4i_s8s8: + case Goihw16g_s8s8: case wino_fmt: return invalid_arguments; case OIhw4i16o4i: @@ -107,6 +110,16 @@ status_t cvt_mem_desc_to_layout_desc(const memory_desc_t &md_, if (md.format() == OIhw8o16i2o) P(3, bd.padding_dims[3], bd.strides[0][3]); return success; + case gOIhw2i8o4i: + P(0, bd.padding_dims[0], bd.strides[0][0]); + P(1, bd.padding_dims[1] / 8, bd.strides[0][1]); + P(1, 8, 4); + P(2, bd.padding_dims[2] / 8, bd.strides[0][2]); + P(2, 2, 8*4); + P(2, 4, 1); + P(3, bd.padding_dims[3], bd.strides[0][3]); + P(4, bd.padding_dims[4], bd.strides[0][4]); + return success; case gOIhw4i16o4i: P(0, bd.padding_dims[0], bd.strides[0][0]); P(1, bd.padding_dims[1] / 16, bd.strides[0][1]); diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_roi_pooling.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_roi_pooling.cpp index 08a129a..8ac889b 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_roi_pooling.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_roi_pooling.cpp @@ -28,16 +28,16 @@ namespace impl { namespace cpu { template -void jit_uni_roi_pooling_fwd_t::execute_forward() { +void jit_uni_roi_pooling_fwd_t::execute_forward() const { auto src_data = reinterpret_cast(this->input_memory(0)); auto src_roi = reinterpret_cast(this->input_memory(1)); auto dst = reinterpret_cast(this->memory(0)); - const memory_desc_wrapper src_d(conf_.src_pd(0)); - const memory_desc_wrapper src_roi_d(conf_.src_pd(1)); - const memory_desc_wrapper dst_d(conf_.dst_pd()); + const memory_desc_wrapper src_d(pd()->src_pd(0)); + const memory_desc_wrapper src_roi_d(pd()->src_pd(1)); + const memory_desc_wrapper dst_d(pd()->dst_pd()); - const auto &jpp = conf_.jpp_; + const auto &jpp = pd()->jpp_; int cb_work = utils::div_up(jpp.nb_c, jpp.nb_c_blocking); int MB = jpp.mb; @@ -68,7 +68,7 @@ void jit_uni_roi_pooling_fwd_t::execute_forward() { utils::nd_iterator_init(start, n, MB, cbb, cb_work, oh, jpp.oh, ow, jpp.ow); for (int iwork = start; iwork < end; iwork++) { - jit_roi_pool_call_s arg = {}; + auto arg = jit_roi_pool_call_s(); int cb = cbb * jpp.nb_c_blocking; int cb_num = jpp.nb_c_blocking; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_roi_pooling.hpp 
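// Note on the gOIhw2i8o4i entries above: the P(dim, extent, stride) calls
// decompose each logical dimension into blocked sub-dimensions. For one
// 8(oc) x 8(ic) weight block the name reads inside-out as
// [2 x ic-chunk, stride 8*4][8 x oc, stride 4][4 x ic, stride 1]:
#include <cstddef>

static inline size_t off_2i8o4i(size_t o /* 0..7 */, size_t i /* 0..7 */) {
    return (i / 4) * (8 * 4)  // which 4-wide ic chunk (the "2i")
         + o * 4              // output channel (the "8o")
         + (i % 4);           // position inside the chunk (the "4i")
}
// e.g. (o=0,i=0) -> 0, (o=1,i=0) -> 4, (o=0,i=4) -> 32; a block holds
// 8 * 8 = 64 weights, matching the padding_dims[1]/8 and padding_dims[2]/8
// outer extents in the P() calls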
b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_roi_pooling.hpp index ca7dd2e..e0325d3 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_roi_pooling.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_roi_pooling.hpp @@ -82,23 +82,23 @@ struct jit_uni_roi_pooling_fwd_t: public cpu_primitive_t { } }; - jit_uni_roi_pooling_fwd_t(const pd_t *pd, const input_vector &inputs, + jit_uni_roi_pooling_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - { kernel_ = new jit_uni_roi_pool_kernel_f32(conf_.jpp_); } + : cpu_primitive_t(apd, inputs, outputs) + { kernel_ = new jit_uni_roi_pool_kernel_f32(pd()->jpp_); } ~jit_uni_roi_pooling_fwd_t() { delete kernel_; } typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - pd_t conf_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } jit_uni_roi_pool_kernel_f32 *kernel_; }; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_softmax.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_softmax.cpp index 8d40269..32d2139 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_softmax.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_softmax.cpp @@ -33,11 +33,11 @@ using namespace mkldnn::impl::memory_format; using namespace mkldnn::impl::utils; template -jit_uni_softmax_fwd_t::jit_uni_softmax_fwd_t(const pd_t *pd, +jit_uni_softmax_fwd_t::jit_uni_softmax_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) + : cpu_primitive_t(apd, inputs, outputs) { - kernel_ = new jit_uni_softmax_kernel_f32(conf_.jpp_); + kernel_ = new jit_uni_softmax_kernel_f32(pd()->jpp_); } template @@ -46,16 +46,16 @@ jit_uni_softmax_fwd_t::~jit_uni_softmax_fwd_t() { } template -void jit_uni_softmax_fwd_t::execute_forward() +void jit_uni_softmax_fwd_t::execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); auto dst = reinterpret_cast(this->memory(0)); - const memory_desc_wrapper data_d(conf_.src_pd()); + const memory_desc_wrapper data_d(pd()->src_pd()); - const auto &jpp = conf_.jpp_; + const auto &jpp = pd()->jpp_; - size_t outer_size = utils::array_product(conf_.src_pd()->desc()->dims, conf_.desc()->softmax_axis); + size_t outer_size = utils::array_product(pd()->src_pd()->desc()->dims, pd()->desc()->softmax_axis); size_t dim = jpp.channels * jpp.inner_size; @@ -70,7 +70,7 @@ void jit_uni_softmax_fwd_t::execute_forward() nd_iterator_init(start, ou, outer_size); for (size_t iwork = start; iwork < end; ++iwork) { - jit_softmax_call_s args{}; + auto args = jit_softmax_call_s(); args.channels = jpp.channels; args.work = jpp.inner_size; size_t off = data_d.off_l(ou * dim); @@ -99,7 +99,7 @@ void jit_uni_softmax_fwd_t::execute_forward() for (size_t iwork = start; iwork < end; ++iwork) { size_t work = nstl::min(jpp.outer_block, outer_size - oub * jpp.outer_block); - jit_softmax_call_s args{}; + auto args = jit_softmax_call_s(); args.channels = jpp.channels; args.work = work; size_t off = data_d.off_l(oub * jpp.outer_block * dim); diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_softmax.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_softmax.hpp index 24f4f48..19d61eb 100644 --- 
a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_softmax.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_softmax.hpp @@ -76,20 +76,20 @@ struct jit_uni_softmax_fwd_t : public cpu_primitive_t { jit_softmax_conf_t jpp_; }; - jit_uni_softmax_fwd_t(const pd_t *pd, const input_vector &inputs, + jit_uni_softmax_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs); ~jit_uni_softmax_fwd_t(); using data_t = prec_traits::type; - virtual void execute(event_t *e) override { + virtual void execute(event_t *e) const override { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - pd_t conf_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } jit_uni_softmax_kernel_f32 *kernel_; }; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_conv_kernel.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_conv_kernel.cpp deleted file mode 100644 index b3917d5..0000000 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_conv_kernel.cpp +++ /dev/null @@ -1,507 +0,0 @@ -/******************************************************************************* -* Copyright 2018 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#include "c_types_map.hpp" -#include "nstl.hpp" -#include "type_helpers.hpp" -#include "utils.hpp" -#include "cpu_memory.hpp" - -#include "jit_uni_x8s8s32x_1x1_conv_kernel.hpp" - -#define GET_OFF(field) offsetof(jit_1x1_conv_call_s, field) - -#include - -namespace mkldnn { -namespace impl { -namespace cpu { - -using namespace mkldnn::impl::prop_kind; -using namespace mkldnn::impl::memory_format; -using namespace mkldnn::impl::utils; -using namespace mkldnn::impl::types; - -using namespace Xbyak; - -template -void jit_uni_x8s8s32x_1x1_conv_fwd_kernel::cvt2ps(data_type_t type_in, - Vmm vmm_in, const Xbyak::Operand &op) { - switch (type_in) { - case data_type::f32: - case data_type::s32: vmovups(vmm_in, op); break; - case data_type::s8: vpmovsxbd(vmm_in, op); break; - case data_type::u8: vpmovzxbd(vmm_in, op); break; - default: assert(!"unsupported data type"); - } - if (type_in != data_type::f32) - vcvtdq2ps(vmm_in, vmm_in); -} - -template -void jit_uni_x8s8s32x_1x1_conv_fwd_kernel::loop_os(int oc_loop_blk) -{ - mov(aux_reg_dst_data, reg_dst_data); - - Label loop_os; - Label loop_ow_tail; - - mov(reg_ow_loop_work, jcp.ow); - - L(loop_os); { - assert(jcp.os_block == jcp.ur); - cmp(reg_ow_loop_work, jcp.ow_tail); - je(loop_ow_tail, T_NEAR); - - ic_loop(oc_loop_blk, jcp.ur); - - sub(reg_ow_loop_work, jcp.ur); - - add(reg_src_data, jcp.os_loop_src_step); - add(aux_reg_dst_data, jcp.os_loop_dst_step); - - sub(reg_loop_os_iter, jcp.os_block); - cmp(reg_loop_os_iter, jcp.os_block); - jge(loop_os, T_NEAR); - - L(loop_ow_tail); { - if (jcp.ow_tail > 0) { - ic_loop(oc_loop_blk, jcp.ow_tail); - } - - add(reg_src_data, jcp.os_loop_src_tail_step); - add(aux_reg_dst_data, jcp.os_loop_dst_tail_step); - - mov(reg_ow_loop_work, jcp.ow); - - sub(reg_loop_os_iter, jcp.ow_tail); - cmp(reg_loop_os_iter, 0); - jg(loop_os, T_NEAR); - } - } -} - -template -void jit_uni_x8s8s32x_1x1_conv_fwd_kernel::ic_loop(int oc_loop_blk, int ur) -{ - auto vreg_wei = [=](int i) { - return Vmm(ur * oc_loop_blk + i); - }; - - auto vreg_accum_vmm = [=](int i, int j) { - return Vmm(j * oc_loop_blk + i); - }; - - auto vreg_accum_xmm = [=](int i, int j) { - return Xmm(j * oc_loop_blk + i); - }; - - auto src_ptr = [=](int u, int j) { - size_t offt = j * jcp.ic * jcp.stride_w + u*jcp.ic_block; - return ptr[aux_reg_src_data + jcp.typesize_in * offt]; - }; - - auto wei_ptr = [=](int u, int i) { - size_t offt = i*jcp.nb_ic*jcp.oc_block*jcp.ic_block + u*jcp.ic_block * jcp.oc_block; - return ptr[aux_reg_weight_data + offt * jcp.typesize_in]; - }; - - auto output_ptr = [=](int i, int j) { - return ptr[aux_reg_dst_data + (i * jcp.oc_block + j * jcp.oc) * - jcp.typesize_out]; - }; - - auto init = [&]() { - for (int i = 0; i < oc_loop_blk; ++i) { - for (int j = 0; j < ur; ++j) { - auto vmm_acc = vreg_accum_vmm(i, j); - uni_vpxor(vmm_acc, vmm_acc, vmm_acc); - } - } - - for (int i = 0; i < oc_loop_blk; ++i) - uni_vmovdqu(vreg_wei(i), wei_ptr(0, i)); - - uni_vpbroadcastd(vreg_src, src_ptr(0, 0)); - }; - - auto store = [=]() { - mov(reg_scales, ptr[this->param1 + GET_OFF(scales)]); - uni_vpxor(vmm_zero, vmm_zero, vmm_zero); - - for (int j = 0; j < ur; ++j) - for (int i = 0; i < oc_loop_blk; ++i) { - int b_off = i*jcp.oc_block; - - if (jcp.with_bias) { - switch (jcp.bia_dt) { - case data_type::f32: - case data_type::s32: vmovups(vmm_bias, ptr[reg_bias_data + b_off*jcp.typesize_bia]); break; - case data_type::s8: vpmovsxbd(vmm_bias, ptr[reg_bias_data + 
b_off*jcp.typesize_bia]); break; - case data_type::u8: vpmovzxbd(vmm_bias, ptr[reg_bias_data + b_off*jcp.typesize_bia]); break; - default: assert(!"unsupported dst data type"); - } - } - if (jcp.bia_dt != data_type::f32) - vcvtdq2ps(vmm_bias, vmm_bias); - - Vmm vmm_dst = vreg_accum_vmm(i, j); - Xmm xmm_dst = vreg_accum_xmm(i, j); - - vcvtdq2ps(vmm_dst, vmm_dst); - - if (jcp.with_bias) - vaddps(vmm_dst, vmm_dst, vmm_bias); - - int s_off = jcp.is_oc_scale * (sizeof(float) * (i*jcp.oc_block)); - vmulps(vmm_dst, vmm_dst, ptr[reg_scales + s_off]); - - if (jcp.with_sum) { - Ymm vmm_prev_dst = Ymm(12); - cvt2ps(jcp.dst_dt, vmm_prev_dst, output_ptr(i, j)); - vaddps(vmm_dst, vmm_prev_dst); - } - - if (maybe_relu(0)) - vmaxps(vmm_dst, vmm_zero, vmm_dst); - - if (maybe_relu(1)) - vmaxps(vmm_dst, vmm_zero, vmm_dst); - - if (jcp.dst_dt != data_type::f32) { - if (attr_.round_mode_ == round_mode::nearest) - if (isa == avx512_common) { - vcvtps2dq(vmm_dst | T_rn_sae, vmm_dst); - } else { - vcvtps2dq(vmm_dst, vmm_dst); - } - else if (attr_.round_mode_ == round_mode::down) { - if (isa == avx512_common) { - vcvtps2dq(vmm_dst | T_rd_sae, vmm_dst); - } else { - vroundps(vmm_dst, vmm_dst, 1); - vcvtps2dq(vmm_dst, vmm_dst); - } - } else - assert(!"unimplemented"); - } - - switch (jcp.dst_dt) { - case data_type::f32: - case data_type::s32: vmovups(output_ptr(i, j), vmm_dst); break; - case data_type::s8: - if (isa == avx512_common) { - vpmovsdb(xmm_dst, vmm_dst); - vmovups(output_ptr(i, j), xmm_dst); - } else if (isa == avx2) { - Ymm ymm_dst = Ymm(vmm_dst.getIdx()); - - vpackssdw(ymm_dst, ymm_dst, ymm_dst); - vpermq(ymm_dst, ymm_dst, 0x08); - vpacksswb(xmm_dst, xmm_dst, xmm_dst); - vmovq(output_ptr(i, j), xmm_dst); - } - break; - case data_type::u8: - if (isa == avx512_common) { - vpmovusdb(xmm_dst, vmm_dst); - vmovups(output_ptr(i, j), xmm_dst); - } else if (isa == avx2) { - Ymm ymm_dst = Ymm(vmm_dst.getIdx()); - - vpackusdw(ymm_dst, ymm_dst, ymm_dst); - vpermq(ymm_dst, ymm_dst, 0x08); - vpackuswb(xmm_dst, xmm_dst, xmm_dst); - vmovq(output_ptr(i, j), xmm_dst); - } - break; - default: assert(!"unknown dst_dt"); - } - } - }; - - auto fma_block = [=]() { - for (int j = 0; j < ur; ++j) { - for (int i = 0; i < oc_loop_blk; i++) { - vpmaddubsw(vreg_sum_0, vreg_src, vreg_wei(i)); - vpmaddwd(vreg_sum_0, vreg_sum_0, vmm_one); - vpaddd(vreg_accum_vmm(i, j), vreg_accum_vmm(i, j), vreg_sum_0); - - if (j == ur - 1) { - uni_vmovdqu(vreg_wei(i), wei_ptr(1, i)); - } - } - - if (j < ur - 1) - uni_vpbroadcastd(vreg_src, src_ptr(0, j + 1)); - } - - uni_vpbroadcastd(vreg_src, src_ptr(1, 0)); - }; - - mov(aux_reg_weight_data, reg_weight_data); - mov(aux_reg_src_data, reg_src_data); - - init(); - - Label ic_loop; - Label exit; - - xor_(reg_loop_ic_iter, reg_loop_ic_iter); - L(ic_loop); { - cmp(reg_loop_ic_iter, jcp.nb_ic); - jge(exit, T_NEAR); - - fma_block(); - - add(aux_reg_src_data, jcp.ic_block * jcp.typesize_in); - add(aux_reg_weight_data, jcp.ic_block * jcp.oc_block * jcp.typesize_in); - inc(reg_loop_ic_iter); - jmp(ic_loop, T_NEAR); - } - - L(exit); - - store(); -} - -template -void jit_uni_x8s8s32x_1x1_conv_fwd_kernel::generate() -{ - preamble(); - - mov(reg_scratch, 0x1); - movq(xmm_one, reg_scratch); - vpbroadcastw(vmm_one, xmm_one); - - mov(reg_weight_data, ptr[param1 + GET_OFF(oc_data)]); - mov(reg_dst_data, ptr[param1 + GET_OFF(output_data)]); - if (jcp.with_bias) { - mov(reg_bias_data, ptr[param1 + GET_OFF(bias_data)]); - } - - mov(reg_oc_loop_work, ptr[param1 + GET_OFF(oc_dim)]); - mov(reg_src_data, ptr[param1 + 
GET_OFF(is_data)]); - mov(reg_loop_os_iter, ptr[param1 + GET_OFF(os_dim)]); - - Label oc_blocks_tail_label; - Label exit_label; - - int oc_blocks_tail = jcp.nb_oc % jcp.nb_oc_blocking; - - cmp(reg_oc_loop_work, jcp.nb_oc_blocking); - jne(oc_blocks_tail ? oc_blocks_tail_label : exit_label, T_NEAR); - - loop_os(jcp.nb_oc_blocking); // channel main loop - jmp(exit_label, T_NEAR); - - if (oc_blocks_tail) { - L(oc_blocks_tail_label); - - cmp(reg_oc_loop_work, oc_blocks_tail); - jne(exit_label, T_NEAR); - - loop_os(oc_blocks_tail); // channel tail loop - } - - L(exit_label); - - postamble(); -} - -template -bool jit_uni_x8s8s32x_1x1_conv_fwd_kernel::post_ops_ok( - jit_1x1_conv_conf_t &jcp, const primitive_attr_t &attr) { - const auto &p = attr.post_ops_; - - auto is_relu = [&](int idx) { return p.entry_[idx].is_relu(); }; - auto is_sum = [&](int idx) { return p.entry_[idx].is_sum(); }; - - switch (p.len_) { - case 0: return true; // no post_ops - case 1: return !jcp.with_eltwise && (is_relu(0) || is_sum(0)); // sum OR relu - case 2: return !jcp.with_eltwise && (is_sum(0) && is_relu(1)); // sum->relu - default: return false; - } - - return false; -} - -template -bool jit_uni_x8s8s32x_1x1_conv_fwd_kernel::maybe_relu(int position) { - using namespace primitive_kind; - const auto &p = attr_.post_ops_; - - if (position == 0) { - /* relu before sum */ - return false - || jcp.with_eltwise - || p.contain(eltwise, 0) - || (jcp.dst_dt == data_type::u8 && !p.contain(sum, 0)); - } else if (position == 1) { - /* relu after sum */ - const int sum_idx = p.contain(sum, 0) - ? 0 : (p.contain(sum, 1) ? 1 : -1); - if (sum_idx == -1) - return false; - - return false - || p.contain(eltwise, sum_idx + 1) - || jcp.dst_dt == data_type::u8; - } - - return false; -} - -template -status_t jit_uni_x8s8s32x_1x1_conv_fwd_kernel::init_conf(jit_1x1_conv_conf_t &jcp, - const convolution_desc_t &cd, const memory_desc_wrapper &src_d, - const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d, - const memory_desc_wrapper &bias_pd, const primitive_attr_t &attr, - bool with_relu, float relu_negative_slope) -{ - if (!mayiuse(isa)) return status::unimplemented; - - const bool with_groups = weights_d.ndims() == src_d.ndims() + 1; - - jcp.prop_kind = cd.prop_kind; - - jcp.ngroups = with_groups ? weights_d.dims()[0] : 1; - jcp.mb = src_d.dims()[0]; - - jcp.oc = dst_d.dims()[1] / jcp.ngroups; - jcp.ic = src_d.dims()[1] / jcp.ngroups; - - jcp.ih = src_d.dims()[2]; - jcp.iw = src_d.dims()[3]; - jcp.oh = dst_d.dims()[2]; - jcp.ow = dst_d.dims()[3]; - - jcp.kh = weights_d.dims()[with_groups + 2]; - jcp.kw = weights_d.dims()[with_groups + 3]; - - jcp.t_pad = cd.padding[0][0]; - jcp.l_pad = cd.padding[0][1]; - - jcp.stride_h = cd.strides[0]; - jcp.stride_w = cd.strides[1]; - - jcp.with_bias = cd.bias_desc.format != memory_format::undef; - jcp.bia_dt = jcp.with_bias ? cd.bias_desc.data_type : data_type::undef; - jcp.dst_dt = cd.dst_desc.data_type; - - jcp.src_fmt = src_d.format(); - jcp.with_eltwise = with_relu; - jcp.eltwise_alpha = relu_negative_slope; - - jcp.os = jcp.oh * jcp.ow; - jcp.is = jcp.ih * jcp.iw; - - auto desired_wei_fmt = OhIw8o4i; - auto desired_gr_wei_fmt = gOhIw8o4i; - - int simd_w = isa == avx512_common ? 
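// Note: the deleted 1x1 kernel accepted only four post-op chains; its
// post_ops_ok() restated as a table, with a minimal stand-in for
// post_ops_t (illustrative types, not the library's):
enum pk_sketch_t { pk_sum, pk_relu };
struct post_ops_sketch_t { int len_; pk_sketch_t entry_[2]; };

static bool post_ops_ok_sketch(const post_ops_sketch_t &p) {
    switch (p.len_) {
    case 0: return true;                                            // none
    case 1: return p.entry_[0] == pk_sum || p.entry_[0] == pk_relu; // one of
    case 2: return p.entry_[0] == pk_sum && p.entry_[1] == pk_relu; // sum->relu
    default: return false;                                          // anything else
    }
}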
16 : 8; - - bool args_ok = true - && jcp.ngroups == 1 - && src_d.format() == nhwc - && one_of(weights_d.format(), desired_wei_fmt, desired_gr_wei_fmt) - && one_of(cd.bias_desc.format, memory_format::undef, any, x) - && dst_d.format() == nhwc - && jcp.oc % simd_w == 0 && jcp.ic % simd_w == 0 - && jcp.t_pad == 0 && jcp.l_pad == 0 - && jcp.kh == 1 && jcp.kw == 1 - && jcp.stride_h == 1 && jcp.stride_w == 1; - - if (!args_ok) return status::unimplemented; - - jcp.ic_block = 4; - jcp.oc_block = simd_w; - - jcp.ur = 2; - jcp.ow_tail = jcp.ow % jcp.ur; - - int oc_blocking{ 0 }; - int oc_blocking_max{ 0 }; - int os_blocking{ 0 }; - int os_blocking_max{ 0 }; - int ic_blocking{ 0 }; - - jcp.ic_dim = jcp.ic; - jcp.oc_dim = jcp.oc; - jcp.is_dim = jcp.is; - jcp.os_block = jcp.ur; - - jcp.typesize_in = types::data_type_size(src_d.data_type()); - jcp.typesize_out = types::data_type_size(dst_d.data_type()); - jcp.typesize_acc = sizeof(int32_t); - jcp.typesize_bia = jcp.with_bias - ? types::data_type_size(bias_pd.data_type()) - : 0; - - const auto &oscales = attr.output_scales_; - jcp.is_oc_scale = oscales.mask_ == 1 << 1; - - const auto &p = attr.post_ops_; - jcp.with_sum = p.find(primitive_kind::sum) != -1; - - assert(IMPLICATION(!jcp.is_oc_scale, oscales.mask_ == 0)); - - jcp.ic_loop_src_step = jcp.ic_block * jcp.ic_loop_unroll * jcp.typesize_in; - jcp.ic_loop_wei_step = jcp.ic_block * jcp.ic_loop_unroll * jcp.oc_block * jcp.typesize_in; - - jcp.os_loop_dst_step = jcp.ur * jcp.oc * jcp.typesize_out; - jcp.os_loop_acc_step = jcp.ur * jcp.oc_block * jcp.typesize_acc; - jcp.os_loop_src_step = jcp.stride_w * jcp.ur * jcp.ic * jcp.typesize_in; - jcp.os_loop_dst_tail_step = jcp.ow_tail * jcp.oc * jcp.typesize_out; - jcp.os_loop_acc_tail_step = jcp.ow_tail * jcp.oc_block * jcp.typesize_acc; - jcp.os_loop_src_tail_step = jcp.stride_w * jcp.ow_tail * jcp.ic * jcp.typesize_in - + ((jcp.stride_h-1)*jcp.iw*jcp.ic*jcp.typesize_in); - - oc_blocking = 4 * jcp.oc_block; - oc_blocking_max = 4 * jcp.oc_block; - os_blocking = 48; // affects oc balancing across threads - os_blocking_max = 320; - ic_blocking = 4*128; // affects L1$ utilization - - assert(oc_blocking); - assert(oc_blocking_max); - assert(os_blocking); - assert(os_blocking_max); - assert(ic_blocking); - - assert(jcp.os_block % jcp.ur == 0); - jcp.ur_tail = jcp.is_dim % jcp.ur; - - jcp.nb_oh_blocking = nstl::max(1, os_blocking / jcp.ow); - jcp.nb_oh_blocking_max = nstl::max(1, os_blocking_max / jcp.ow); - jcp.nb_oc_blocking = oc_blocking / jcp.oc_block; - jcp.nb_oc_blocking_max = oc_blocking_max / jcp.oc_block; - jcp.nb_ic_blocking = ic_blocking / jcp.ic_block; - - jcp.nb_oc = div_up(jcp.oc_dim, jcp.oc_block); - - jcp.nb_ic = jcp.ic / jcp.ic_block; - - return status::success; -} - -template struct jit_uni_x8s8s32x_1x1_conv_fwd_kernel; -template struct jit_uni_x8s8s32x_1x1_conv_fwd_kernel; - -} -} -} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_conv_kernel.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_conv_kernel.hpp deleted file mode 100644 index d082231..0000000 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_conv_kernel.hpp +++ /dev/null @@ -1,98 +0,0 @@ -/******************************************************************************* -* Copyright 2018 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef JIT_UNI_X8S8S32X_1x1_CONV_KERNEL_HPP -#define JIT_UNI_X8S8S32X_1x1_CONV_KERNEL_HPP - -#include "c_types_map.hpp" -#include "type_helpers.hpp" -#include "jit_generator.hpp" -#include "jit_primitive_conf.hpp" - -namespace mkldnn { -namespace impl { -namespace cpu { - -using Xbyak::Reg64; -using Xbyak::Ymm; -using Xbyak::Xmm; - -template -struct jit_uni_x8s8s32x_1x1_conv_fwd_kernel: public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_x8s8s32x_1x1_conv_fwd_kernel) - - jit_uni_x8s8s32x_1x1_conv_fwd_kernel(jit_1x1_conv_conf_t ajcp, - const primitive_attr_t &attr): jcp(ajcp), attr_(attr) - { - this->generate(); - jit_ker = (void (*)(jit_1x1_conv_call_s *))this->getCode(); - } - - static bool post_ops_ok(jit_1x1_conv_conf_t &jcp, - const primitive_attr_t &attr); - static status_t init_conf(jit_1x1_conv_conf_t &jcp, - const convolution_desc_t &cd, const memory_desc_wrapper &src_d, - const memory_desc_wrapper &weights_d, - const memory_desc_wrapper &dst_d, - const memory_desc_wrapper &bias_pd, - const primitive_attr_t &attr, - bool with_relu = false, float relu_negative_slope = 0.f); - - jit_1x1_conv_conf_t jcp; - const primitive_attr_t &attr_; - void (*jit_ker)(jit_1x1_conv_call_s *); - -private: - using Vmm = typename utils::conditional3::type; - - Reg64 reg_weight_data = rsi; - Reg64 reg_src_data = abi_not_param1; - Reg64 reg_dst_data = rbx; - Reg64 reg_bias_data = r12; - - Reg64 reg_scales = rdx; - Reg64 aux_reg_src_data = rdx; - Reg64 aux_reg_weight_data = rax; - Reg64 aux_reg_dst_data = rbp; - Reg64 reg_oc_loop_work = r9; - Reg64 reg_ow_loop_work = r10; - Reg64 reg_loop_os_iter = r14; - Reg64 reg_loop_ic_iter = r15; - - Reg64 reg_scratch = r14; - - Vmm vreg_sum_0 = Vmm(15); - Vmm vreg_src = Vmm(14); - Vmm vmm_bias = Vmm(15); - Vmm vmm_zero = Vmm(14); - Vmm vmm_one = Vmm(13); - Xmm xmm_one = Xmm(13); - - void loop_os(int oc_loop_blk); - void ic_loop(int oc_loop_blk, int ur); - - void generate(); - - bool maybe_relu(int position); - void cvt2ps(data_type_t type_in, Vmm vmm_in, const Xbyak::Operand &op); -}; - -} -} -} - -#endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_convolution.cpp deleted file mode 100644 index 1eddc79..0000000 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_convolution.cpp +++ /dev/null @@ -1,147 +0,0 @@ -/******************************************************************************* -* Copyright 2018 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "mkldnn_types.h" -#include "c_types_map.hpp" -#include "jit_uni_x8s8s32x_1x1_convolution.hpp" - -namespace mkldnn { -namespace impl { -namespace cpu { - -using namespace mkldnn::impl::status; -using namespace mkldnn::impl::memory_format; -using namespace mkldnn::impl::utils; - -template -void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward() { - auto src = reinterpret_cast(this->input_memory(0)); - auto weights = reinterpret_cast(this->input_memory(1)); - auto bias = reinterpret_cast(this->input_memory(2)); - auto dst = reinterpret_cast(this->memory()); - - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); - const memory_desc_wrapper dst_d(conf_.dst_pd()); - const memory_desc_wrapper bias_d(conf_.weights_pd(1)); - - const auto &jcp = kernel_->jcp; - - int ocb_work = utils::div_up(jcp.nb_oc, jcp.nb_oc_blocking); - int ohb_work = utils::div_up(jcp.oh, jcp.nb_oh_blocking); - const int work_amount = jcp.mb * jcp.ngroups * ocb_work * ohb_work; - - const int stride_h = conf_.cdesc()->strides[0]; - const int stride_w = conf_.cdesc()->strides[1]; - const int pad_t = conf_.cdesc()->padding[0][0]; - const int pad_l = conf_.cdesc()->padding[0][1]; - - const size_t bia_dt_size = conf_.with_bias() - ? types::data_type_size(conf_.cdesc()->bias_desc.data_type) : 0; - - const auto &oscales = conf_.attr()->output_scales_; - - auto ker = [&](const int ithr, const int nthr) { - jit_1x1_conv_call_s p = {}; - p.acc_s32 = ws_ + ithr * ws_per_thread_; - - const int oh_block = jcp.ow; - - int start{0}, end{0}; - balance211(work_amount, nthr, ithr, start, end); - - int n{0}, g{0}, ocb{0}, ohb{0}; - nd_iterator_init(start, n, jcp.mb, g, jcp.ngroups, ohb, - ohb_work, ocb, ocb_work); - - for (int iwork = start; iwork < end; ++iwork) { - int oc_ = ocb * jcp.nb_oc_blocking; - int oc_num = jcp.nb_oc_blocking; - - int oh_ = ohb * jcp.nb_oh_blocking; - int oh_num = jcp.nb_oh_blocking; - - int oh_step = nstl::min(oh_ + oh_num, jcp.oh) - oh_; - - const int os = oh_ * oh_block; - const int oh = os / jcp.ow; - const int ow = os % jcp.ow; - - const int ih = nstl::max(oh * stride_h - pad_t, 0); - const int iw = nstl::max(ow * stride_w - pad_l, 0); - - p.os_dim = this_block_size(os, jcp.os, oh_step * oh_block); - p.oc_dim = nstl::min(oc_ + oc_num, jcp.nb_oc) - oc_; - - const size_t dst_off = dst_d.blk_off(n, oc_*jcp.oc_block, oh, ow); - p.output_data = &dst[dst_off]; - - if (bias) - p.bias_data = &bias[bias_d.blk_off(oc_ * jcp.oc_block * bia_dt_size)]; - - p.scales = &oscales.scales_[jcp.is_oc_scale * oc_ * jcp.oc_block]; - p.oc_data = &weights[conf_.with_groups() ? 
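// Note on the work decomposition in the removed driver above: balance211
// hands each thread a near-equal slice of the flattened (mb, groups,
// oh-blocks, oc-blocks) space, and nd_iterator_init/step unflatten it.
// The partition has this well-known shape (sketch; the library helper may
// break ties slightly differently):
static void balance_sketch(int work, int nthr, int ithr,
        int &start, int &end) {
    const int base = work / nthr, extra = work % nthr;
    start = ithr * base + (ithr < extra ? ithr : extra);
    end = start + base + (ithr < extra ? 1 : 0);
}
// e.g. work = 10, nthr = 4 gives the slices [0,3) [3,6) [6,8) [8,10)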
weights_d.blk_off(g, oc_, 0) : weights_d.blk_off(oc_, 0)]; - p.is_data = src + src_d.blk_off(n, 0, ih, iw); - - kernel_->jit_ker(&p); - - nd_iterator_step(n, jcp.mb, g, jcp.ngroups, ohb, - ohb_work, ocb, ocb_work); - } - }; - - parallel(0, ker); -} - -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); - -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); - -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); - -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); -template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t::execute_forward(); - -} -} -} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_convolution.hpp deleted file mode 100644 index 5ae3b8f..0000000 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_convolution.hpp +++ /dev/null @@ -1,140 +0,0 @@ -/******************************************************************************* -* Copyright 2018 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef CPU_JIT_UNI_X8S8S32X_1x1_CONVOLUTION_HPP -#define CPU_JIT_UNI_X8S8S32X_1x1_CONVOLUTION_HPP - -#include "c_types_map.hpp" -#include "cpu_convolution_pd.hpp" -#include "cpu_engine.hpp" -#include "cpu_reducer.hpp" -#include "jit_uni_x8s8s32x_1x1_conv_kernel.hpp" -#include "mkldnn_thread.hpp" -#include "utils.hpp" - -namespace mkldnn { -namespace impl { -namespace cpu { - -template -struct _jit_uni_x8s8s32x_1x1_convolution_fwd_t: public cpu_primitive_t { - struct pd_t: public _cpu_convolution_fwd_pd_t { - pd_t(engine_t *engine, const typename pd_t::base_desc_t *adesc, - const primitive_attr_t *attr, - const typename pd_t::base_class *hint_fwd_pd) - : _cpu_convolution_fwd_pd_t(engine, adesc, attr, - hint_fwd_pd) - , jcp_({}) {} - - DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("jit_1x1:", isa, ""), - _jit_uni_x8s8s32x_1x1_convolution_fwd_t); - - virtual status_t init() override { - using namespace prop_kind; - assert(this->engine()->kind() == engine_kind::cpu); - bool ok = true - && this->set_default_params() == status::success - && utils::one_of(this->cdesc_().prop_kind, forward_training, - forward_inference) - && this->cdesc_().alg_kind == alg_kind::convolution_direct - && this->cdesc_().src_desc.data_type == data_type::u8 - && this->cdesc_().dst_desc.data_type == dst_type - && this->cdesc_().weights_desc.data_type == data_type::s8 - && IMPLICATION(this->with_bias(), utils::one_of( - this->cdesc_().bias_desc.data_type, data_type::f32, - data_type::s32, data_type::s8, data_type::u8)) - && this->cdesc_().accum_data_type == data_type::s32; - if (!ok) return status::unimplemented; - - return jit_uni_x8s8s32x_1x1_conv_fwd_kernel::init_conf(jcp_, - this->cdesc_(), - this->src_pd_.desc(), *this->weights_pd_.desc(), - *this->dst_pd_.desc(), *this->bias_pd_.desc(), - *this->attr(), with_relu, this->negative_slope()); - } - - jit_1x1_conv_conf_t jcp_; - - protected: - virtual status_t set_default_params() override { - using namespace memory_format; - auto desired_act_fmt = nhwc; - - auto desired_wei_fmt = OhIw8o4i; - auto desired_gr_wei_fmt = gOhIw8o4i; - - if (this->src_pd_.desc()->format == any) - CHECK(this->src_pd_.set_format(desired_act_fmt)); - if (this->dst_pd_.desc()->format == any) - CHECK(this->dst_pd_.set_format(desired_act_fmt)); - if (this->weights_pd_.desc()->format == any) - CHECK(this->weights_pd_.set_format(this->with_groups() ? 
desired_gr_wei_fmt : desired_wei_fmt)); - if (this->bias_pd_.desc()->format == any) - CHECK(this->bias_pd_.set_format(x)); - return status::success; - } - }; - - _jit_uni_x8s8s32x_1x1_convolution_fwd_t(const pd_t *pd, const - input_vector &inputs, - const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - , kernel_(nullptr), ws_(nullptr) - { - kernel_ = new jit_uni_x8s8s32x_1x1_conv_fwd_kernel(conf_.jcp_, *conf_.attr()); - const int nthreads = mkldnn_get_max_threads(); - ws_per_thread_ = conf_.jcp_.ow * conf_.jcp_.nb_oh_blocking_max * conf_.jcp_.oc_block; - ws_ = (acc_data_t*)malloc(nthreads * ws_per_thread_ * sizeof(acc_data_t), 64); - } - ~_jit_uni_x8s8s32x_1x1_convolution_fwd_t() { - delete kernel_; - free(ws_); - } - - typedef typename prec_traits::type src_data_t; - typedef typename prec_traits::type wei_data_t; - typedef typename prec_traits::type dst_data_t; - typedef typename prec_traits::type acc_data_t; - - virtual void execute(event_t *e) { - execute_forward(); - e->set_state(event_t::ready); - } - -private: - void execute_forward(); - pd_t conf_; - jit_uni_x8s8s32x_1x1_conv_fwd_kernel *kernel_; - - /* reduction to unit stride */ - size_t ws_per_thread_; - acc_data_t *ws_; -}; - -template -using jit_avx2_x8s8s32x_1x1_convolution_fwd_t = _jit_uni_x8s8s32x_1x1_convolution_fwd_t; -template -using jit_sse42_x8s8s32x_1x1_convolution_fwd_t = _jit_uni_x8s8s32x_1x1_convolution_fwd_t; -template -using jit_avx2_x8s8s32x_1x1_convolution_relu_t = _jit_uni_x8s8s32x_1x1_convolution_fwd_t; -template -using jit_sse42_x8s8s32x_1x1_convolution_relu_t = _jit_uni_x8s8s32x_1x1_convolution_fwd_t; - -} -} -} - -#endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_conv_kernel.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_conv_kernel.cpp index b94295b..09c60dc 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_conv_kernel.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_conv_kernel.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018 Intel Corporation +* Copyright 2018-2019 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. *******************************************************************************/ +#include #include "c_types_map.hpp" #include "nstl.hpp" #include "type_helpers.hpp" @@ -30,37 +31,12 @@ namespace cpu { using namespace mkldnn::impl::prop_kind; using namespace mkldnn::impl::memory_format; +using namespace mkldnn::impl::memory_tracking::names; using namespace mkldnn::impl::utils; using namespace Xbyak; template -bool jit_uni_x8s8s32x_conv_fwd_kernel::maybe_relu(int position) { - using namespace primitive_kind; - const auto &p = attr_.post_ops_; - - if (position == 0) { - /* relu before sum */ - return false - || jcp.with_eltwise - || p.contain(eltwise, 0) - || (jcp.dst_dt == data_type::u8 && !p.contain(sum, 0)); - } else if (position == 1) { - /* relu after sum */ - const int sum_idx = p.contain(sum, 0) - ? 0 : (p.contain(sum, 1) ? 
1 : -1); - if (sum_idx == -1) - return false; - - return false - || p.contain(eltwise, sum_idx + 1) - || jcp.dst_dt == data_type::u8; - } - - return false; -} - -template void jit_uni_x8s8s32x_conv_fwd_kernel::cvt2ps(data_type_t type_in, Vmm vmm_in, const Xbyak::Operand &op, bool scalar_load) { Xmm xmm_in = Xmm(vmm_in.getIdx()); @@ -118,7 +94,7 @@ void jit_uni_x8s8s32x_conv_fwd_kernel::store_dst(const Xbyak::Address &op, if (isa != sse42 && !scalar_store) vpermq(ymm_dst, ymm_dst, 0x08); - uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); + uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst); if (scalar_store) { movq(reg_tmp_64, xmm_dst); @@ -136,7 +112,7 @@ void jit_uni_x8s8s32x_conv_fwd_kernel::store_dst(const Xbyak::Address &op, if (isa != sse42 && !scalar_store) vpermq(ymm_dst, ymm_dst, 0x08); - uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); + uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst); if (scalar_store) { movq(reg_tmp_64, xmm_dst); @@ -177,32 +153,27 @@ void jit_uni_x8s8s32x_conv_fwd_kernel::apply_filter(int ur_w, int pad_l, in for (int r = 0; r < repeats; r++) { for (int jj = _start; jj < _end; jj++) { int inp_off = (ki * dilate_w + jj * stride_w - pad_l) * jcp.ic * jcp.ngroups; - if (tail_size > 0) { - if (h_padded || jj < jj_start || jj >= jj_end) { - uni_vpxor(get_src_reg(jj), get_src_reg(jj), get_src_reg(jj)); - uni_vpsubb(get_src_reg(jj), get_src_reg(jj), vmm_shift); - uni_vandps(get_src_reg(jj), get_src_reg(jj), vmm_mask); - uni_vpbroadcastd(get_src_reg(jj), Xmm(get_src_reg(jj).getIdx())); - } else { - uni_vpbroadcastd(get_src_reg(jj), ptr[aux1_reg_input + jcp.typesize_in * inp_off]); - - if (jcp.signed_input) { - uni_vpsubb(get_src_reg(jj), get_src_reg(jj), vmm_shift); - } - - uni_vandps(get_src_reg(jj), get_src_reg(jj), vmm_mask); - uni_vpbroadcastd(get_src_reg(jj), Xmm(get_src_reg(jj).getIdx())); - } + if (tail_size > 0) { + if (h_padded || jj < jj_start || jj >= jj_end) { + uni_vpxor(get_src_reg(jj), get_src_reg(jj), get_src_reg(jj)); + uni_vpsubb(get_src_reg(jj), get_src_reg(jj), vmm_shift); } else { - if (h_padded || jj < jj_start || jj >= jj_end) { - uni_vpxor(get_src_reg(jj), get_src_reg(jj), get_src_reg(jj)); - } else { - uni_vpbroadcastd(get_src_reg(jj), ptr[aux1_reg_input + jcp.typesize_in * inp_off]); - } + uni_vpbroadcastd(get_src_reg(jj), ptr[aux1_reg_input + jcp.typesize_in * inp_off]); - if (jcp.signed_input) + if (jcp.signed_input) { uni_vpsubb(get_src_reg(jj), get_src_reg(jj), vmm_shift); + } + } + } else { + if (h_padded || jj < jj_start || jj >= jj_end) { + uni_vpxor(get_src_reg(jj), get_src_reg(jj), get_src_reg(jj)); + } else { + uni_vpbroadcastd(get_src_reg(jj), ptr[aux1_reg_input + jcp.typesize_in * inp_off]); } + + if (jcp.signed_input) + uni_vpsubb(get_src_reg(jj), get_src_reg(jj), vmm_shift); + } } for (int ii = 0; ii < oc_blocks; ii++) { @@ -279,7 +250,6 @@ void jit_uni_x8s8s32x_conv_fwd_kernel::kh_loop(int ur_w, int pad_l, int pad mov(imm_addr64, l_table); uni_vmovups(vmm_one, ptr[imm_addr64 + 0 * vlen]); uni_vmovups(vmm_shift, ptr[imm_addr64 + 1 * vlen]); - uni_vmovups(vmm_mask, ptr[imm_addr64 + 4 * vlen]); if (jcp.signed_input) { mov(reg_overflow, ptr[param1 + GET_OFF(t_overflow)]); @@ -349,6 +319,7 @@ void jit_uni_x8s8s32x_conv_fwd_kernel::width_blk_step(int ur_w, int pad_l, kh_loop(ur_w, pad_l, pad_r, oc_blocks, oc_step); + pop(reg_oc_off); pop(reg_scales_base); mov(imm_addr64, l_table); @@ -359,140 +330,143 @@ void jit_uni_x8s8s32x_conv_fwd_kernel::width_blk_step(int ur_w, int pad_l, const float p_sum_scale = (sum_idx != -1) ? 
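// Note on the store_dst fix above (uni_vpacksswb/uni_vpackuswb now take
// the full vmm): AVX2 packs operate per 128-bit lane, so after the 32->16
// pack the useful words sit in qwords 0 and 2 -- that is what the
// preceding vpermq with immediate 0x08 (source qwords 0,2 -> dest 0,1)
// repairs. Intrinsics model of the emitted ymm sequence (illustrative):
#include <immintrin.h>

static inline __m128i pack8_s32_to_s8(__m256i v) {
    __m256i w = _mm256_packs_epi32(v, v);   // per-lane s32 -> s16
    w = _mm256_permute4x64_epi64(w, 0x08);  // gather qwords (0,2) into (0,1)
    __m128i lo = _mm256_castsi256_si128(w); // 8x s16
    return _mm_packs_epi16(lo, lo);         // 8x s8 in the low qword
}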
p.entry_[sum_idx].sum.scale : 1.f; for (int r = 0; r < repeats; r++) { + auto get_dst_off = [=](int ii, int jj) { + if (jcp.with_dw_conv) + return (ii * jcp_dw.kh * jcp.ow + jj) * jcp.oc_block + r * (jcp.oc_block / 2); + else + return ii * jcp.oc_block + jj * jcp.oc * jcp.ngroups + r * (jcp.oc_block / 2); + }; + int tail_size = isa == avx2 ? oc_step : nstl::min(jcp.oc_block / 2, oc_step - r * jcp.oc_block / 2); bool is_scalar_store = isa == avx2 ? tail_size < jcp.oc_block : tail_size < jcp.oc_block / 2; - if (is_scalar_store) { + for (int ii = 0; ii < oc_blocks; ii++) { + if (jcp.with_bias) { + int b_off = ii * jcp.oc_block + r * (jcp.oc_block / 2); + cvt2ps(jcp.bia_dt, vmm_bias, ptr[reg_bias_base + b_off * jcp.typesize_bia], false); + + if (jcp.signed_input) + uni_vmulps(vmm_bias, vmm_bias, vmm_bias_alpha); + } + for (int jj = 0; jj < ur_w; jj++) { - Vmm vmm_dst = get_acc_reg(r * jcp.ur_w * jcp.nb_oc_blocking + jj); + Vmm vmm_dst = get_acc_reg(r * jcp.ur_w * jcp.nb_oc_blocking + ur_w * ii + jj); uni_vcvtdq2ps(vmm_dst, vmm_dst); - uni_vmovups(vmm_reminder_dst, vmm_dst); - for (int oc = 0; oc < tail_size; oc++) { - uni_vmovups(vmm_dst, vmm_reminder_dst); + if (jcp.signed_input) { + int c_off = ii * jcp.oc_block + r * (jcp.oc_block / 2); + cvt2ps(data_type::s32, vmm_comp, ptr[reg_compensation_base + c_off * sizeof(int32_t)], false); + } - if (jcp.with_bias) { - int b_off = r * (jcp.oc_block / 2) + oc; - cvt2ps(jcp.bia_dt, vmm_bias, ptr[reg_bias_base + b_off * jcp.typesize_bia], true); + if (jcp.signed_input) + uni_vaddps(vmm_dst, vmm_dst, vmm_comp); + if (jcp.with_bias) + uni_vaddps(vmm_dst, vmm_dst, vmm_bias); - if (jcp.signed_input) - uni_vmulps(vmm_bias, vmm_bias, vmm_bias_alpha); - } - if (jcp.signed_input) { - int c_off = r * (jcp.oc_block / 2) + oc; - cvt2ps(data_type::s32, vmm_comp, ptr[reg_compensation_base + c_off * sizeof(int32_t)], true); - } + int s_off = jcp.is_oc_scale * (ii * jcp.oc_block + r * (jcp.oc_block / 2)); + cvt2ps(mkldnn_f32, vmm_scale, ptr[reg_scales_base + s_off * sizeof(float)], false); + uni_vmulps(vmm_dst, vmm_dst, vmm_scale); + } + } - if (jcp.signed_input) - uni_vaddps(vmm_dst, vmm_dst, vmm_comp); - if (jcp.with_bias) - uni_vaddps(vmm_dst, vmm_dst, vmm_bias); + int eltwise_inj_idx = 0; + int depthwise_inj_idx = 0; + int end_idx = jcp.with_dw_conv ? 
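// Note: per output-channel block, the restructured store sequence above
// computes f32_out = ((float)acc + comp + bias) * scale, where comp is the
// zero-point compensation applied when the source is treated as signed
// (and bias was pre-multiplied by vmm_bias_alpha in that case). Scalar
// model, one channel (illustrative):
#include <cstdint>

static inline float dequantize(int32_t acc, int32_t comp, float bias,
        float scale, bool signed_input) {
    float v = (float)acc;                // uni_vcvtdq2ps
    if (signed_input) v += (float)comp;  // compensation term
    v += bias;
    return v * scale;                    // per-channel output scale
}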
p.find(primitive_kind::convolution) : p.len_; + for (int i = 0; i < end_idx; i++) { + int start_idx = 1 + r * jcp.ur_w * jcp.nb_oc_blocking; + + auto& post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + eltwise_injectors[eltwise_inj_idx]->compute_vector_range(start_idx, start_idx + oc_blocks * ur_w); + eltwise_inj_idx++; + } else if (post_op.is_depthwise()) { + mov(reg_d_weights, reinterpret_cast(post_op.depthwise.weights_data)); + mov(reg_d_bias, reinterpret_cast(post_op.depthwise.biases_data)); + + add(reg_d_weights, reg_oc_off); + add(reg_d_bias, reg_oc_off); + + if (r == 1) { + add(reg_d_weights, (jcp.oc_block / 2) * sizeof(float)); + add(reg_d_bias, (jcp.oc_block / 2) * sizeof(float)); + } - int s_off = jcp.is_oc_scale * (r * (jcp.oc_block / 2) + oc); - cvt2ps(mkldnn_f32, vmm_scale, ptr[reg_scales_base + s_off * sizeof(float)], true); - uni_vmulps(vmm_dst, vmm_dst, vmm_scale); + for (int ii = 0; ii < oc_blocks; ii++) { + depthwise_injectors[depthwise_inj_idx]->compute_vector_range(start_idx + ur_w * ii, + start_idx + ur_w * ii + ur_w, reg_d_weights, reg_d_bias); - int o_off = jj * jcp.oc * jcp.ngroups + r * (jcp.oc_block / 2) + oc; - if (jcp.with_sum) { - uni_vpxor(vmm_prev_dst, vmm_prev_dst, vmm_prev_dst); - cvt2ps(jcp.dst_dt, vmm_prev_dst, ptr[reg_output + o_off * jcp.typesize_out], true); + add(reg_d_weights, jcp.oc_block * sizeof(float)); + add(reg_d_bias, jcp.oc_block * sizeof(float)); + } - if (p_sum_scale == 1.f) { - uni_vaddps(vmm_dst, vmm_dst, vmm_prev_dst); + depthwise_inj_idx++; + } else if (post_op.is_sum(false)) { + for (int ii = 0; ii < oc_blocks; ii++) { + for (int jj = 0; jj < ur_w; jj++) { + Vmm vmm_dst = get_acc_reg(r * jcp.ur_w * jcp.nb_oc_blocking + ur_w * ii + jj); + int o_off = get_dst_off(ii, jj); + + if (is_scalar_store) { + for (int oc = 0; oc < tail_size; oc++) { + uni_vpxor(vmm_prev_dst, vmm_prev_dst, vmm_prev_dst); + cvt2ps(jcp.dst_dt, vmm_prev_dst, ptr[reg_output + (o_off + oc) * jcp.typesize_out], true); + + if (oc < jcp.oc_block / 2) { + uni_vpslldq(vmm_prev_dst, vmm_prev_dst, oc * sizeof(float)); + } else { + Ymm ymm_prev_dst = Ymm(vmm_prev_dst.getIdx()); + vperm2i128(ymm_prev_dst, ymm_prev_dst, ymm_prev_dst, 0x01); + vpslldq(vmm_prev_dst, vmm_prev_dst, (oc - jcp.oc_block / 2) * sizeof(float)); + } + + if (p_sum_scale == 1.f) { + uni_vaddps(vmm_dst, vmm_dst, vmm_prev_dst); + } else { + uni_vfmadd231ps(vmm_dst, vmm_prev_dst, ptr[imm_addr64 + 3 * vlen]); + } + } } else { - uni_vfmadd231ps(vmm_dst, vmm_prev_dst, ptr[imm_addr64 + 3 * vlen]); - } - } - - if (maybe_relu(0)) { - uni_vpxor(vmm_zero, vmm_zero, vmm_zero); - uni_vmaxps(vmm_dst, vmm_dst, vmm_zero); - } - - if (maybe_relu(1)) { - uni_vpxor(vmm_zero, vmm_zero, vmm_zero); - uni_vmaxps(vmm_dst, vmm_dst, vmm_zero); - } - - if (jcp.dst_dt != data_type::f32) { - if (attr_.round_mode_ == round_mode::nearest) - uni_vcvtps2dq(vmm_dst, vmm_dst); - else if (attr_.round_mode_ == round_mode::down) { - uni_vroundps(vmm_dst, vmm_dst, 1); - uni_vcvtps2dq(vmm_dst, vmm_dst); - } else - assert(!"unimplemented"); - } - - store_dst(ptr[reg_output + o_off * jcp.typesize_out], vmm_dst, true); + cvt2ps(jcp.dst_dt, vmm_prev_dst, ptr[reg_output + o_off * jcp.typesize_out], false); - if (isa == avx2) { - vperm2i128(ymm_tmp, ymm_reminder_dst, ymm_reminder_dst, 0x01); - vpalignr(ymm_reminder_dst, ymm_tmp, ymm_reminder_dst, jcp.typesize_out); - } else { - psrldq(vmm_reminder_dst, jcp.typesize_out); + if (p_sum_scale == 1.f) { + uni_vaddps(vmm_dst, vmm_dst, vmm_prev_dst); + } else { + uni_vfmadd231ps(vmm_dst, vmm_prev_dst, 
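// Note: a sum post-op with a non-unit scale collapses into one fma per
// accumulator register, acc := acc + p_sum_scale * prev_dst, with the
// scale broadcast from the kernel's constant table (imm_addr64 + 3*vlen).
// Vector model (illustrative; requires FMA support):
#include <immintrin.h>

static inline __m256 apply_sum(__m256 acc, __m256 prev, float p_sum_scale) {
    return p_sum_scale == 1.f
            ? _mm256_add_ps(acc, prev)
            : _mm256_fmadd_ps(prev, _mm256_set1_ps(p_sum_scale), acc);
}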
ptr[imm_addr64 + 3 * vlen]); + } + } } } } - } else { - for (int ii = 0; ii < oc_blocks; ii++) { - if (jcp.with_bias) { - int b_off = ii * jcp.oc_block + r * (jcp.oc_block / 2); - cvt2ps(jcp.bia_dt, vmm_bias, ptr[reg_bias_base + b_off * jcp.typesize_bia], false); + } - if (jcp.signed_input) - uni_vmulps(vmm_bias, vmm_bias, vmm_bias_alpha); + for (int ii = 0; ii < oc_blocks; ii++) { + for (int jj = 0; jj < ur_w; jj++) { + Vmm vmm_dst = get_acc_reg(r * jcp.ur_w * jcp.nb_oc_blocking + ur_w * ii + jj); + int o_off = get_dst_off(ii, jj); + + if (jcp.dst_dt != data_type::f32) { + if (attr_.round_mode_ == round_mode::nearest) + uni_vcvtps2dq(vmm_dst, vmm_dst); + else if (attr_.round_mode_ == round_mode::down) { + uni_vroundps(vmm_dst, vmm_dst, 1); + uni_vcvtps2dq(vmm_dst, vmm_dst); + } else + assert(!"unimplemented"); } - for (int jj = 0; jj < ur_w; jj++) { - Vmm vmm_dst = get_acc_reg(r * jcp.ur_w * jcp.nb_oc_blocking + ur_w * ii + jj); - uni_vcvtdq2ps(vmm_dst, vmm_dst); - - if (jcp.signed_input) { - int c_off = ii * jcp.oc_block + r * (jcp.oc_block / 2); - cvt2ps(data_type::s32, vmm_comp, ptr[reg_compensation_base + c_off * sizeof(int32_t)], false); - } - - if (jcp.signed_input) - uni_vaddps(vmm_dst, vmm_dst, vmm_comp); - if (jcp.with_bias) - uni_vaddps(vmm_dst, vmm_dst, vmm_bias); - - int s_off = jcp.is_oc_scale * (ii * jcp.oc_block + r * (jcp.oc_block / 2)); - cvt2ps(mkldnn_f32, vmm_scale, ptr[reg_scales_base + s_off * sizeof(float)], false); - uni_vmulps(vmm_dst, vmm_dst, vmm_scale); - - int o_off = ii * jcp.oc_block + jj * jcp.oc * jcp.ngroups + r * (jcp.oc_block / 2); - if (jcp.with_sum) { - cvt2ps(jcp.dst_dt, vmm_prev_dst, ptr[reg_output + o_off * jcp.typesize_out], false); + if (is_scalar_store) { + for (int oc = 0; oc < tail_size; oc++) { + store_dst(ptr[reg_output + (o_off + oc) * jcp.typesize_out], vmm_dst, true); - if (p_sum_scale == 1.f) { - uni_vaddps(vmm_dst, vmm_dst, vmm_prev_dst); + if (isa == avx2) { + Ymm ymm_dst = Ymm(vmm_dst.getIdx()); + vperm2i128(ymm_tmp, ymm_dst, ymm_dst, 0x01); + vpalignr(ymm_dst, ymm_tmp, ymm_dst, jcp.typesize_out); } else { - uni_vfmadd231ps(vmm_dst, vmm_prev_dst, ptr[imm_addr64 + 3 * vlen]); + psrldq(vmm_dst, jcp.typesize_out); } } - - if (maybe_relu(0)) { - uni_vpxor(vmm_zero, vmm_zero, vmm_zero); - uni_vmaxps(vmm_dst, vmm_dst, vmm_zero); - } - - if (maybe_relu(1)) { - uni_vpxor(vmm_zero, vmm_zero, vmm_zero); - uni_vmaxps(vmm_dst, vmm_dst, vmm_zero); - } - - if (jcp.dst_dt != data_type::f32) { - if (attr_.round_mode_ == round_mode::nearest) - uni_vcvtps2dq(vmm_dst, vmm_dst); - else if (attr_.round_mode_ == round_mode::down) { - uni_vroundps(vmm_dst, vmm_dst, 1); - uni_vcvtps2dq(vmm_dst, vmm_dst); - } else - assert(!"unimplemented"); - } - + } else { store_dst(ptr[reg_output + o_off * jcp.typesize_out], vmm_dst, false); } } @@ -500,6 +474,7 @@ void jit_uni_x8s8s32x_conv_fwd_kernel::width_blk_step(int ur_w, int pad_l, } push(reg_scales_base); + push(reg_oc_off); } template @@ -513,6 +488,7 @@ inline void jit_uni_x8s8s32x_conv_fwd_kernel::solve_common(int oc_blocks, i int dilate_w = jcp.dilate_w + 1; int str_w = jcp.stride_w; const int inp_mult = jcp.ic * jcp.ngroups; + const int out_mult = jcp.with_dw_conv ? 
 
     int l_pad = jcp.l_pad;
     int r_pad = nstl::max(0, (int(jcp.ow) - 1) * str_w + (kw - 1) * dilate_w
@@ -529,6 +505,7 @@ inline void jit_uni_x8s8s32x_conv_fwd_kernel::solve_common(int oc_blocks, i
     push(reg_output_base);
     push(reg_kernel_base);
     push(reg_scales_base);
+    push(reg_oc_off);
 
     if (l_pad > 0) {
         n_oi--;
@@ -537,7 +514,7 @@ inline void jit_uni_x8s8s32x_conv_fwd_kernel::solve_common(int oc_blocks, i
         else
             width_blk_step(ur_w, l_pad, 0, oc_blocks, oc_step); // "lpad"
         add(reg_input, jcp.typesize_in * (ur_w * str_w - l_pad) * inp_mult);
-        add(reg_output, jcp.typesize_out * ur_w * jcp.oc * jcp.ngroups);
+        add(reg_output, jcp.typesize_out * ur_w * out_mult);
     }
 
     Label ow_loop_label;
@@ -548,7 +525,7 @@ inline void jit_uni_x8s8s32x_conv_fwd_kernel::solve_common(int oc_blocks, i
         width_blk_step(ur_w, 0, 0, oc_blocks, oc_step); // "middle"
         add(reg_input, jcp.typesize_in * ur_w * str_w * inp_mult);
-        add(reg_output, jcp.typesize_out * ur_w * jcp.oc * jcp.ngroups);
+        add(reg_output, jcp.typesize_out * ur_w * out_mult);
 
         inc(reg_oi_iter);
         cmp(reg_oi_iter, n_oi);
@@ -558,12 +535,13 @@ inline void jit_uni_x8s8s32x_conv_fwd_kernel::solve_common(int oc_blocks, i
     if (r_pad1 > 0 && n_oi >=0) {
         width_blk_step(ur_w, 0, r_pad1, oc_blocks, oc_step); // "rpad"
         add(reg_input, jcp.typesize_in * ur_w * str_w * inp_mult);
-        add(reg_output, jcp.typesize_out * ur_w * jcp.oc * jcp.ngroups);
+        add(reg_output, jcp.typesize_out * ur_w * out_mult);
     }
 
     if (ur_w_tail != 0)
         width_blk_step(ur_w_tail, 0, r_pad, oc_blocks, oc_step); // "tail"
 
+    pop(reg_oc_off);
     pop(reg_scales_base);
     pop(reg_kernel_base);
     pop(reg_output_base);
@@ -573,56 +551,84 @@ inline void jit_uni_x8s8s32x_conv_fwd_kernel::solve_common(int oc_blocks, i
 
 template <cpu_isa_t isa>
 void jit_uni_x8s8s32x_conv_fwd_kernel<isa>::generate() {
+    const auto &p = attr_.post_ops_;
+    int end_idx = jcp.with_dw_conv ? p.find(primitive_kind::convolution) : p.len_;
+    for (int i = 0; i < end_idx; i++) {
+        auto &post_op = p.entry_[i];
+        if (post_op.is_eltwise()) {
+            eltwise_injectors.push_back(new jit_uni_eltwise_injector_f32<isa>(
+                    this,
+                    post_op.eltwise.alg,
+                    post_op.eltwise.alpha,
+                    post_op.eltwise.beta
+            ));
+        } else if (post_op.is_depthwise()) {
+            depthwise_injectors.push_back(new jit_uni_depthwise_injector_f32<isa>(
+                    this,
+                    post_op.depthwise.alg
+            ));
+        }
+    }
+
     this->preamble();
 
     mov(reg_kernel_base, ptr[this->param1 + GET_OFF(filt)]);
     mov(reg_input_base, ptr[this->param1 + GET_OFF(src)]);
     mov(reg_output_base, ptr[this->param1 + GET_OFF(dst)]);
-    mov(reg_oc, ptr[this->param1 + GET_OFF(oc_work)]);
+    mov(reg_oc_work, ptr[this->param1 + GET_OFF(oc_work)]);
     if (jcp.with_bias)
         mov(reg_bias_base, ptr[this->param1 + GET_OFF(bias)]);
     mov(reg_scales_base, ptr[this->param1 + GET_OFF(scales)]);
     if (jcp.signed_input)
         mov(reg_compensation_base, ptr[param1 + GET_OFF(compensation)]);
+    mov(reg_oc_off, ptr[param1 + GET_OFF(oc_off)]);
 
     Label main_loop_label;
     Label tail_label;
     Label exit_label;
 
-    cmp(reg_oc, jcp.nb_oc_blocking * jcp.oc_block);
+    cmp(reg_oc_work, jcp.nb_oc_blocking * jcp.oc_block);
     jne(main_loop_label, T_NEAR);
 
     solve_common(jcp.nb_oc_blocking, jcp.oc_block);
 
-    sub(reg_oc, jcp.nb_oc_blocking * jcp.oc_block);
+    sub(reg_oc_work, jcp.nb_oc_blocking * jcp.oc_block);
 
     jmp(exit_label, T_NEAR);
 
     L(main_loop_label); {
-        cmp(reg_oc, jcp.oc_block);
+        cmp(reg_oc_work, jcp.oc_block);
         jl(tail_label, T_NEAR);
 
         solve_common(1, jcp.oc_block);
 
-        sub(reg_oc, jcp.oc_block);
+        sub(reg_oc_work, jcp.oc_block);
         add(reg_kernel_base, jcp.oc_block * jcp.nb_ic * jcp.kh * jcp.kw * jcp.ic_block * jcp.typesize_in);
-        add(reg_output_base, jcp.oc_block * jcp.typesize_out);
+        if (jcp.with_dw_conv)
+            add(reg_output_base, jcp.oc_block * jcp_dw.kh * jcp.ow * jcp.typesize_out);
+        else
+            add(reg_output_base, jcp.oc_block * jcp.typesize_out);
         add(reg_bias_base, jcp.oc_block * jcp.typesize_bia);
         add(reg_scales_base, jcp.is_oc_scale * jcp.oc_block * sizeof(float));
         add(reg_compensation_base, jcp.oc_block * sizeof(int32_t));
+        add(reg_oc_off, jcp.oc_block * sizeof(float));
 
         jmp(main_loop_label, T_NEAR);
     }
 
     L(tail_label);
 
-    solve_common(1, jcp.oc % jcp.oc_block);
+    if (jcp.oc % jcp.oc_block != 0)
+        solve_common(1, jcp.oc % jcp.oc_block);
 
     L(exit_label);
 
     this->postamble();
 
     prepare_table();
+
+    for (auto& inj : eltwise_injectors)
+        inj->prepare_table();
 }
 
 template <cpu_isa_t isa>
@@ -672,43 +678,29 @@ void jit_uni_x8s8s32x_conv_fwd_kernel::prepare_table() {
             dd(cvals_sum_scale[i]);
         }
     }
-
-    for (size_t i = 0; i < sizeof(cvals_shift) / sizeof(cvals_shift[0]); ++i) {
-        for (size_t d = 0; d < vlen / sizeof(int8_t); ++d) {
-            if ((int)d < jcp.ic % jcp.ic_block)
-                db(255);
-            else
-                db(0);
-        }
-    }
 }
 
 template <cpu_isa_t isa>
 bool jit_uni_x8s8s32x_conv_fwd_kernel<isa>::post_ops_ok(
         jit_conv_conf_t &jcp, const primitive_attr_t &attr) {
-    using namespace primitive_kind;
     const auto &p = attr.post_ops_;
 
-    auto is_relu = [&](int idx) {
-        return p.entry_[idx].kind == eltwise
-            && p.entry_[idx].eltwise.scale == 1.
-            && p.entry_[idx].eltwise.alg == alg_kind::eltwise_relu
-            && p.entry_[idx].eltwise.alpha == 0.;
-    };
+    auto is_eltwise = [&](int idx) { return p.entry_[idx].is_eltwise(); };
+    auto is_depthwise = [&](int idx) { return p.entry_[idx].is_depthwise(); };
+    auto is_sum = [&](int idx) { return p.entry_[idx].is_sum(false); };
+    auto is_dw_conv = [&](int idx) { return p.entry_[idx].is_dw_conv(); };
+    auto is_simple = [&](int idx) { return is_eltwise(idx) || is_depthwise(idx); };
 
     switch (p.len_) {
     case 0: return true;
-    case 1: return true
-                && IMPLICATION(jcp.with_eltwise, p.contain(sum, 0))
-                && IMPLICATION(!jcp.with_eltwise, is_relu(0) || p.contain(sum, 0));
-    case 2: return true
-                && IMPLICATION(jcp.with_eltwise, p.contain(sum, 0) && is_relu(1))
-                && IMPLICATION(!jcp.with_eltwise, false
-                        || (p.contain(sum, 0) && is_relu(1))
-                        || (p.contain(sum, 1) && is_relu(0)));
-    case 3: return true
-                && jcp.with_eltwise == false
-                && (is_relu(0) && p.contain(sum, 1) && is_relu(2));
+    case 1: return is_simple(0) || is_sum(0) || is_dw_conv(0);
+    case 2: return (is_sum(0) && is_simple(1)) || (is_simple(0) && is_sum(1)) ||
+                   (is_dw_conv(0) && is_simple(1)) || (is_simple(0) && is_dw_conv(1)) ||
+                   (is_simple(0) && is_simple(1));
+    case 3: return (is_simple(0) && is_sum(1) && is_simple(2)) ||
+                   (is_simple(0) && is_dw_conv(1) && is_simple(2)) ||
+                   (is_dw_conv(0) && is_simple(1) && is_simple(2));
+    case 4: return (is_simple(0) && is_dw_conv(1) && is_simple(2) && is_simple(3));
     default: return false;
     }
 
@@ -720,7 +712,7 @@ status_t jit_uni_x8s8s32x_conv_fwd_kernel::init_conf(jit_conv_conf_t &jcp,
         const convolution_desc_t &cd, cpu_memory_t::pd_t &src_pd,
         cpu_memory_t::pd_t &weights_pd, cpu_memory_t::pd_t &dst_pd,
         cpu_memory_t::pd_t &bias_pd,
-        const primitive_attr_t &attr, bool with_relu, float relu_negative_slope)
+        const primitive_attr_t &attr)
 {
     if (!mayiuse(isa)) return status::unimplemented;
 
@@ -758,8 +750,6 @@ status_t jit_uni_x8s8s32x_conv_fwd_kernel::init_conf(jit_conv_conf_t &jcp,
     jcp.src_fmt = src_d.format();
     jcp.with_bias = cd.bias_desc.format != memory_format::undef;
-    jcp.with_eltwise = with_relu;
-    jcp.eltwise_alpha = relu_negative_slope;
 
     jcp.signed_input = src_d.data_type() == data_type::s8;
 
@@ -772,14 +762,23 @@ status_t jit_uni_x8s8s32x_conv_fwd_kernel::init_conf(jit_conv_conf_t &jcp,
     jcp.oc_padded = rnd_up(jcp.oc, jcp.oc_block);
     jcp.nb_oc = div_up(jcp.oc, jcp.oc_block);
 
+    if (jcp.ngroups != 1) {
+        if (jcp.ic % jcp.ic_block != 0 || jcp.oc % jcp.oc_block != 0)
+            return status::unimplemented;
+    }
+
     if (!post_ops_ok(jcp, attr))
         return status::unimplemented;
 
     const auto &p = attr.post_ops_;
-    jcp.with_sum = p.find(primitive_kind::sum) != -1;
-    if (!jcp.with_eltwise) {
-        jcp.with_eltwise = p.find(primitive_kind::eltwise) != -1;
-        jcp.eltwise_alpha = 0.f;
+
+    int dw_conv_ind = p.find(primitive_kind::convolution);
+    jcp.with_dw_conv = dw_conv_ind != -1;
+    if (jcp.with_dw_conv) {
+        jcp.dw_conv_oh = jcp.oh;
+        jcp.dw_conv_ow = jcp.ow;
+        jcp.oh = p.entry_[dw_conv_ind].dw_conv.in_h;
+        jcp.ow = p.entry_[dw_conv_ind].dw_conv.in_w;
     }
 
     auto desired_act_fmt = nhwc;
@@ -808,6 +807,7 @@ status_t jit_uni_x8s8s32x_conv_fwd_kernel::init_conf(jit_conv_conf_t &jcp,
         return status::unimplemented;
     }
 
+    jcp.src_dt = cd.src_desc.data_type;
     jcp.bia_dt = jcp.with_bias ? cd.bias_desc.data_type : data_type::undef;
     jcp.dst_dt = cd.dst_desc.data_type;
 
@@ -824,9 +824,15 @@ status_t jit_uni_x8s8s32x_conv_fwd_kernel::init_conf(jit_conv_conf_t &jcp,
     assert(IMPLICATION(!jcp.is_oc_scale, oscales.mask_ == 0));
 
     jcp.ur_h = 1; /* no code-unrolling by h so far */
-    jcp.ur_w = isa == avx2 ? 3 : 2;
-    jcp.nb_oc_blocking = 2;
-    if (jcp.nb_oc % jcp.nb_oc_blocking != 0) jcp.nb_oc_blocking = 1;
+    jcp.ur_w = isa == avx2 ? 4 : 2;
+    jcp.nb_oc_blocking = nstl::min(2, jcp.nb_oc);
+    jcp.max_regs_ur = 12;
+
+    // WA to prevent fallback on gemm implementation
+    if (isa == sse42 && jcp.ic == 3) {
+        jcp.ur_w = 4;
+        jcp.nb_oc_blocking = 1;
+    }
 
     if (jcp.ow < jcp.ur_w) jcp.ur_w = jcp.ow;
     jcp.ur_w_tail = jcp.ow % jcp.ur_w;
@@ -839,24 +845,42 @@ status_t jit_uni_x8s8s32x_conv_fwd_kernel::init_conf(jit_conv_conf_t &jcp,
     int r_pad_no_tail = nstl::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w
         + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1));
+    if (r_pad_no_tail > jcp.ur_w)
+        return status::unimplemented;
 
-    if (r_pad_no_tail > jcp.ur_w) {
-        /* recalculate ur_w, nb_oc_blocking and ur_w_tail */
-        jcp.ur_w = r_pad_no_tail + 1;
-        jcp.ur_w_tail = jcp.ow % jcp.ur_w;
-        /* check again ... */
-        r_pad_no_tail = nstl::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w
-            + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1));
-        if ((r_pad_no_tail > jcp.ur_w) || (jcp.ow < jcp.ur_w))
-            return status::unimplemented;
-    }
-    if (jcp.l_pad > jcp.ur_w) return status::unimplemented;
+    if (jcp.l_pad > jcp.ur_w)
+        return status::unimplemented;
 
     jcp.wei_adj_scale = (jcp.signed_input) ? (1.0f / 2.0f) : 1.0f;
 
     return status::success;
 }
 
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_conv_fwd_kernel<isa>::init_scratchpad(
+        memory_tracking::registrar_t &scratchpad, const jit_conv_conf_t &jcp, const jit_conv_conf_t &jcp_dw,
+        const primitive_attr_t &attr) {
+    if (jcp.oc != jcp.oc_padded)
+        scratchpad.book(key_conv_padded_bias, (size_t)jcp.typesize_bia * jcp.oc_padded);
+
+    if (jcp.signed_input) {
+        size_t count = nstl::max(attr.output_scales_.count_, 8);
+        scratchpad.book(key_conv_adjusted_scales, sizeof(float) * count);
+
+        if (jcp.oc != jcp.oc_padded)
+            scratchpad.book(key_conv_padded_compensation, sizeof(int32_t) * jcp.oc_padded);
+    }
+
+    if (jcp.with_dw_conv) {
+        const int nthreads = mkldnn_get_max_threads();
+        size_t dw_conv_buffer_size_ = (size_t)jcp_dw.kh * jcp_dw.iw * jcp_dw.ch_block * jcp.nb_oc_blocking;
+        scratchpad.book(key_dw_conv_buffer, jcp_dw.typesize_in * dw_conv_buffer_size_ * nthreads);
+
+        if (jcp.oc != jcp.oc_padded)
+            scratchpad.book(key_dw_conv_padded_bias, (size_t)jcp_dw.typesize_bia * jcp.oc_padded);
+    }
+}
+
 template struct jit_uni_x8s8s32x_conv_fwd_kernel<avx2>;
 template struct jit_uni_x8s8s32x_conv_fwd_kernel<sse42>;
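A note for reviewers on the reworked post_ops_ok() above: the accepted fusion chains are easiest to see restated as a standalone predicate. The sketch below is illustrative only (the enum and function names do not exist in mkl-dnn); it mirrors the case 0-4 logic, where an eltwise or depthwise op counts as "simple" and sum / fused depthwise convolution may each appear at most once.

#include <cassert>
#include <vector>

enum class Op { eltwise, depthwise, sum, dw_conv };

static bool is_simple(Op op) { return op == Op::eltwise || op == Op::depthwise; }

// Same chain acceptance rules as post_ops_ok() in the patch above.
static bool chain_ok(const std::vector<Op> &p) {
    switch (p.size()) {
    case 0: return true;
    case 1: return is_simple(p[0]) || p[0] == Op::sum || p[0] == Op::dw_conv;
    case 2: return (p[0] == Op::sum && is_simple(p[1]))
                || (is_simple(p[0]) && p[1] == Op::sum)
                || (p[0] == Op::dw_conv && is_simple(p[1]))
                || (is_simple(p[0]) && p[1] == Op::dw_conv)
                || (is_simple(p[0]) && is_simple(p[1]));
    case 3: return (is_simple(p[0]) && p[1] == Op::sum && is_simple(p[2]))
                || (is_simple(p[0]) && p[1] == Op::dw_conv && is_simple(p[2]))
                || (p[0] == Op::dw_conv && is_simple(p[1]) && is_simple(p[2]));
    case 4: return is_simple(p[0]) && p[1] == Op::dw_conv
                && is_simple(p[2]) && is_simple(p[3]);
    default: return false;
    }
}

int main() {
    // e.g. 1x1 conv -> relu -> fused 3x3 dw conv -> per-channel scale -> relu
    assert(chain_ok({Op::eltwise, Op::dw_conv, Op::depthwise, Op::eltwise}));
    assert(!chain_ok({Op::sum, Op::sum}));
    return 0;
}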
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_conv_kernel.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_conv_kernel.hpp
index 110fa3a..a7af3d3 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_conv_kernel.hpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_conv_kernel.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2018 Intel Corporation
+* Copyright 2018-2019 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -21,6 +21,8 @@
 #include "jit_generator.hpp"
 #include "jit_primitive_conf.hpp"
 #include "cpu_memory.hpp"
+#include "jit_uni_eltwise.hpp"
+#include "jit_uni_depthwise.hpp"
 
 namespace mkldnn {
 namespace impl {
@@ -28,13 +30,23 @@ namespace cpu {
 
 template <cpu_isa_t isa>
 struct jit_uni_x8s8s32x_conv_fwd_kernel: public jit_generator {
-    jit_uni_x8s8s32x_conv_fwd_kernel(jit_conv_conf_t ajcp,
-            const primitive_attr_t &attr): jcp(ajcp), attr_(attr)
+    jit_uni_x8s8s32x_conv_fwd_kernel(jit_conv_conf_t ajcp, jit_conv_conf_t ajcp_dw,
+            const primitive_attr_t &attr): jcp(ajcp), jcp_dw(ajcp_dw), attr_(attr)
     {
         this->generate();
         jit_ker = (void (*)(jit_conv_call_s *))this->getCode();
     }
 
+    ~jit_uni_x8s8s32x_conv_fwd_kernel() {
+        for (auto inj : eltwise_injectors)
+            delete inj;
+        eltwise_injectors.clear();
+
+        for (auto inj : depthwise_injectors)
+            delete inj;
+        depthwise_injectors.clear();
+    }
+
     DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_x8s8s32x_conv_fwd_kernel)
 
     static bool post_ops_ok(jit_conv_conf_t &jcp,
@@ -45,11 +57,12 @@ struct jit_uni_x8s8s32x_conv_fwd_kernel: public jit_generator {
             cpu_memory_t::pd_t &weights_pd,
             cpu_memory_t::pd_t &dst_pd,
             cpu_memory_t::pd_t &bias_pd,
-            const primitive_attr_t &attr,
-            bool with_relu = false,
-            float relu_negative_slope = 0.);
+            const primitive_attr_t &attr);
+    static void init_scratchpad(memory_tracking::registrar_t &scratchpad,
+            const jit_conv_conf_t &jcp, const jit_conv_conf_t &jcp_dw, const primitive_attr_t &attr);
 
     jit_conv_conf_t jcp;
+    jit_conv_conf_t jcp_dw;
     const primitive_attr_t &attr_;
     void (*jit_ker)(jit_conv_call_s *);
 
@@ -81,30 +94,30 @@ private:
     reg64_t reg_oi_iter = r11;
     reg64_t reg_ic_iter = r15;
     reg64_t reg_compensation_base = abi_not_param1;
-    reg64_t reg_oc = r12;
+    reg64_t reg_oc_work = r12;
     reg64_t imm_addr64 = rbx;
 
     reg8_t reg_tmp_8 = r14b;
     reg32_t reg_tmp_32 = r14d;
     reg64_t reg_tmp_64 = r14;
 
-    Vmm vmm_zero = Vmm(14);
+    reg64_t reg_oc_off = r10;
+    reg64_t reg_d_weights = aux_reg_kernel;
+    reg64_t reg_d_bias = aux_reg_input;
+
+    Vmm vmm_one = Vmm(15);
     Vmm vmm_bias_alpha = Vmm(13);
     Vmm vmm_shift = Vmm(14);
-    Vmm vmm_mask = Vmm(13);
     Vmm vmm_bias = Vmm(15);
-    Vmm vmm_reminder_dst = Vmm(11);
-    Ymm ymm_reminder_dst = Ymm(11);
     Ymm ymm_tmp = Ymm(10);
     Vmm vmm_scale = Vmm(12);
     Vmm vmm_comp = Vmm(12);
     Vmm vmm_prev_dst = Vmm(12);
 
-    inline Vmm get_src_reg(int idx) { return Vmm(idx + 8); }
-    inline Vmm get_ker_reg(int idx) { return Vmm(idx + 11); }
-    inline Vmm get_tmp_reg(int idx) { return Vmm(idx + 12); }
-    inline Vmm get_acc_reg(int idx) { return Vmm(idx + 0); }
+    inline Vmm get_src_reg(int idx) { return Vmm(idx + 9); }
+    inline Vmm get_ker_reg(int idx) { return Vmm(idx + 0); }
+    inline Vmm get_tmp_reg(int idx) { return Vmm(idx + 13); }
+    inline Vmm get_acc_reg(int idx) { return Vmm(idx + 1); }
 
     inline void cvt2ps(data_type_t type_in, Vmm ymm_in, const Xbyak::Operand &op, bool scalar_load);
     inline void store_dst(const Xbyak::Address &op, Vmm vmm_dst, bool scalar_store);
@@ -116,12 +129,13 @@ private:
     inline void width_blk_step(int ur_w, int pad_l, int pad_r, int oc_blocks, int oc_step);
     inline void solve_common(int oc_blocks, int oc_step);
 
-    bool maybe_relu(int position);
-
     void generate();
     void prepare_table();
 
+    nstl::vector<jit_uni_eltwise_injector_f32<isa>*> eltwise_injectors;
+    nstl::vector<jit_uni_depthwise_injector_f32<isa>*> depthwise_injectors;
+
     Xbyak::Label l_table;
 };
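The init_scratchpad() declared above books, among other things, one depthwise row buffer per thread for the fused path. A minimal sketch of the sizing arithmetic, with purely illustrative shapes and names (none of them exist in mkl-dnn):

#include <cstddef>
#include <cstdio>

// A fused kh x kw depthwise stage consumes kh rows of 1x1 output per
// depthwise output row, so each thread needs kh * iw * ch_block *
// nb_oc_blocking elements resident at once.
static size_t dw_conv_buffer_elems(int kh, int iw, int ch_block,
        int nb_oc_blocking) {
    return (size_t)kh * iw * ch_block * nb_oc_blocking;
}

int main() {
    // Hypothetical case: 3x3 depthwise, 56-wide rows, 8-channel blocks.
    size_t elems = dw_conv_buffer_elems(3, 56, 8, 2);
    int nthreads = 4; // stand-in for mkldnn_get_max_threads()
    size_t bytes = elems * sizeof(float) * nthreads;
    printf("per-thread elems: %zu, total bytes: %zu\n", elems, bytes);
    return 0;
}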
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_convolution.cpp
index d574361..83ca9ce 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_convolution.cpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_convolution.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2018 Intel Corporation
+* Copyright 2018-2019 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include "utils.hpp"
 #include "mkldnn_thread.hpp"
 #include "type_helpers.hpp"
+#include <cstring>
 
 namespace mkldnn {
 namespace impl {
@@ -27,19 +28,20 @@ namespace cpu {
 
 using namespace mkldnn::impl::status;
 using namespace mkldnn::impl::memory_format;
+using namespace mkldnn::impl::memory_tracking::names;
 using namespace mkldnn::impl::utils;
 
-template <cpu_isa_t isa, bool with_relu, data_type_t src_type, data_type_t dst_type>
-void _jit_uni_x8s8s32x_convolution_fwd_t<isa, with_relu, src_type, dst_type>::execute_forward() {
+template <cpu_isa_t isa, data_type_t src_type, data_type_t dst_type>
+void _jit_uni_x8s8s32x_convolution_fwd_t<isa, src_type, dst_type>::execute_forward() const {
     auto src = reinterpret_cast<const src_data_t *>(this->input_memory(0));
     auto weights = reinterpret_cast<const wei_data_t *>(this->input_memory(1));
     auto bias = reinterpret_cast<const char *>(this->input_memory(2));
     auto dst = reinterpret_cast<dst_data_t *>(this->memory());
 
-    const memory_desc_wrapper src_d(conf_.src_pd());
-    const memory_desc_wrapper dst_d(conf_.dst_pd());
-    const memory_desc_wrapper weights_d(conf_.weights_pd(0));
-    const memory_desc_wrapper bias_d(conf_.weights_pd(1));
+    const memory_desc_wrapper src_d(pd()->src_pd());
+    const memory_desc_wrapper dst_d(pd()->dst_pd());
+    const memory_desc_wrapper weights_d(pd()->weights_pd(0));
+    const memory_desc_wrapper bias_d(pd()->weights_pd(1));
 
     const auto &jcp = kernel_->jcp;
 
@@ -47,8 +49,33 @@ void _jit_uni_x8s8s32x_convolution_fwd_t::ex
     auto w = const_cast<wei_data_t *>(weights);
     int32_t* compensation = (jcp.signed_input) ? reinterpret_cast<int32_t *>(&w[offset]) : 0;
 
-    const size_t bia_dt_size = conf_.with_bias()
-        ? types::data_type_size(conf_.cdesc()->bias_desc.data_type) : 0;
-    float* scales = conf_.attr()->output_scales_.scales_;
+    if (bias && jcp.oc != jcp.oc_padded) {
+        auto padded_bias = this->scratchpad().template get<bia_data_t>(key_conv_padded_bias);
+        utils::array_copy(padded_bias, (bia_data_t*)bias, jcp.oc);
+        utils::array_set(padded_bias + jcp.oc, 0, jcp.oc_padded - jcp.oc);
+        bias = (char *)padded_bias;
+    }
+
+    const float *oscales = pd()->attr()->output_scales_.scales_;
+    if (jcp.signed_input) {
+        auto local_scales = scratchpad().template get<float>(key_conv_adjusted_scales);
+        size_t count = pd()->attr()->output_scales_.count_;
+        float factor = 1.f / jcp.wei_adj_scale;
+        if (count == 1) {
+            utils::array_set(local_scales, oscales[0] * factor, 8);
+        } else {
+            for (size_t c = 0; c < count; c++)
+                local_scales[c] = oscales[c] * factor;
+        }
+        oscales = local_scales;
+
+        if (jcp.oc != jcp.oc_padded) {
+            auto padded_compensation = this->scratchpad().template get<int32_t>(key_conv_padded_compensation);
+            utils::array_copy(padded_compensation, compensation, jcp.oc);
+            utils::array_set(padded_compensation + jcp.oc, 0, jcp.oc_padded - jcp.oc);
+            compensation = padded_compensation;
+        }
+    }
 
     int ocb_work = div_up(jcp.nb_oc, jcp.nb_oc_blocking);
     const size_t work_amount = jcp.mb * jcp.ngroups * ocb_work * jcp.oh;
@@ -64,7 +91,7 @@ void _jit_uni_x8s8s32x_convolution_fwd_t::ex
             int ocb = ocbb * jcp.nb_oc_blocking;
             int ocb_num = jcp.nb_oc_blocking;
 
-            jit_conv_call_s par_conv = {};
+            auto par_conv = jit_conv_call_s();
 
             const int ij = oh * jcp.stride_h;
             const int i_t_overflow = nstl::min(jcp.kh, div_up(nstl::max(0, jcp.t_pad - ij), (jcp.dilate_h+1)));
@@ -81,12 +108,12 @@ void _jit_uni_x8s8s32x_convolution_fwd_t::ex
             par_conv.dst = &dst[dst_off];
 
             const int wh = (!jcp.signed_input) ? i_t_overflow : 0;
-            par_conv.filt = &weights[conf_.with_groups()
+            par_conv.filt = &weights[pd()->with_groups()
                                      ? weights_d.blk_off(g, ocb, 0, wh, 0)
                                      : weights_d.blk_off(ocb, 0, wh, 0)];
 
             if (bias)
-                par_conv.bias = &bias[bias_d.blk_off(_oc * jcp.oc_block*bia_dt_size)];
+                par_conv.bias = &bias[bias_d.blk_off(_oc * jcp.oc_block*jcp.typesize_bia)];
 
             par_conv.oc_work =
                     nstl::min((ocb + ocb_num) * jcp.oc_block, jcp.oc) - ocb*jcp.oc_block;
@@ -95,13 +122,14 @@ void _jit_uni_x8s8s32x_convolution_fwd_t::ex
             const int kh_padding = jcp.kh - i_t_overflow - i_b_overflow;
             par_conv.kh_padding = nstl::max(0, kh_padding);
 
-            par_conv.scales = (jcp.signed_input) ? &local_scales_[jcp.is_oc_scale * _oc * jcp.oc_block]
-                                                 : &scales[jcp.is_oc_scale * _oc * jcp.oc_block];
+            par_conv.scales = &oscales[jcp.is_oc_scale * _oc * jcp.oc_block];
             par_conv.compensation = (jcp.signed_input) ? compensation + _oc * jcp.oc_block : 0;
             par_conv.t_overflow = i_t_overflow;
             par_conv.b_overflow = i_b_overflow;
 
+            par_conv.oc_off = _oc * jcp.oc_block * sizeof(float);
+
             kernel_->jit_ker(&par_conv);
             nd_iterator_step(n, jcp.mb, g, jcp.ngroups, ocbb, ocb_work, oh, jcp.oh);
         }
@@ -110,41 +138,193 @@ void _jit_uni_x8s8s32x_convolution_fwd_t::ex
     parallel(0, ker);
 }
 
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_convolution_fwd_t::execute_forward();
+template <cpu_isa_t isa, data_type_t src_type, data_type_t dst_type>
+void _jit_uni_x8s8s32x_convolution_fwd_t<isa, src_type, dst_type>::execute_forward_with_dw_conv() const {
+    auto src = reinterpret_cast<const src_data_t *>(this->input_memory(0));
+    auto weights = reinterpret_cast<const wei_data_t *>(this->input_memory(1));
+    auto bias = reinterpret_cast<const char *>(this->input_memory(2));
+    auto dst = reinterpret_cast<dst_data_t *>(this->memory());
+
+    const memory_desc_wrapper src_d(pd()->src_pd());
+    const memory_desc_wrapper weights_d(pd()->weights_pd(0));
+    const memory_desc_wrapper bias_d(pd()->weights_pd(1));
+
+    const auto &jcp = kernel_->jcp;
+    const auto &jcp_dw = kernel_dw_->jcp;
+    const int MB = pd()->MB();
+
+    size_t offset = (size_t)jcp.ngroups * rnd_up(jcp.oc, jcp.oc_block) * rnd_up(jcp.ic, jcp.ic_block) * jcp.kh * jcp.kw;
+    auto w = const_cast<wei_data_t *>(weights);
+    int32_t* compensation = (jcp.signed_input) ? reinterpret_cast<int32_t *>(&w[offset]) : 0;
+
+    auto dw_bias = jcp_dw.conv_biases;
+    auto dw_weights = reinterpret_cast<const float *>(jcp_dw.conv_weights);
+
+    if (jcp.oc != jcp.oc_padded) {
+        auto padded_bias = this->scratchpad().template get<bia_data_t>(key_conv_padded_bias);
+        utils::array_copy(padded_bias, (bia_data_t*)bias, jcp.oc);
+        utils::array_set(padded_bias + jcp.oc, 0, jcp.oc_padded - jcp.oc);
+        bias = (char *)padded_bias;
+
+        auto dw_padded_bias = this->scratchpad().template get<float>(key_dw_conv_padded_bias);
+        utils::array_copy(dw_padded_bias, dw_bias, jcp.oc);
+        utils::array_set(dw_padded_bias + jcp.oc, 0.f, jcp.oc_padded - jcp.oc);
+        dw_bias = dw_padded_bias;
+    }
+
+    const float *oscales = pd()->attr()->output_scales_.scales_;
+    if (jcp.signed_input) {
+        auto local_scales = scratchpad().template get<float>(key_conv_adjusted_scales);
+        size_t count = pd()->attr()->output_scales_.count_;
+        float factor = 1.f / jcp.wei_adj_scale;
+        if (count == 1) {
+            utils::array_set(local_scales, oscales[0] * factor, 8);
+        } else {
+            for (size_t c = 0; c < count; c++)
+                local_scales[c] = oscales[c] * factor;
+        }
+        oscales = local_scales;
+
+        if (jcp.oc != jcp.oc_padded) {
+            auto padded_compensation = this->scratchpad().template get<int32_t>(key_conv_padded_compensation);
+            utils::array_copy(padded_compensation, compensation, jcp.oc);
+            utils::array_set(padded_compensation + jcp.oc, 0, jcp.oc_padded - jcp.oc);
+            compensation = padded_compensation;
+        }
+    }
+
+    int ocb_work = div_up(jcp.nb_oc, jcp.nb_oc_blocking);
+    const size_t work_amount = MB * jcp.ngroups * ocb_work * jcp.oh;
+
+    auto ker = [&](const int ithr, const int nthr) {
+        auto compute_row_gen = [&](dst_data_t* ws_p, int n, int g, int ocb, int ocb_num, int oh, int num_rows) {
+            for (int h = 0; h < num_rows; h++) {
+                if ((oh + h) < 0 || (oh + h) >= jcp.oh) {
+                    for (int chb = ocb; chb < ocb + ocb_num; chb++) {
+                        memset(ws_p + (((oh + h) + 1) % jcp_dw.kh) * jcp.ow * jcp.oc_block +
+                               (chb - ocb) * jcp_dw.kh * jcp.ow * jcp.oc_block, 0, jcp.ow * jcp.oc_block * sizeof(dst_data_t));
                    }
+                } else {
+                    auto par_conv = jit_conv_call_s();
+
+                    const int ij = (oh + h) * jcp.stride_h;
+                    const int i_t_overflow = nstl::min(jcp.kh, div_up(nstl::max(0, jcp.t_pad - ij), (jcp.dilate_h+1)));
+                    const int i_b_overflow = nstl::min(jcp.kh, div_up(nstl::max(jcp.ih, ij + (jcp.kh-1) * (jcp.dilate_h+1) -
+                                                       jcp.t_pad+1) - jcp.ih, (jcp.dilate_h + 1)));
+
+                    const size_t _oc = g * jcp.nb_oc + ocb;
+                    const size_t _ic = g * jcp.nb_ic;
+
+                    const int ih = nstl::max(ij - jcp.t_pad + i_t_overflow * (jcp.dilate_h + 1), 0);
+                    par_conv.src = &src[src_d.blk_off(n, _ic*jcp.ic_block, ih, 0)];
+
+                    par_conv.dst = &ws_p[(((oh + h) + 1) % jcp_dw.kh) * jcp.ow * jcp.oc_block];
+
+                    const int wh = (!jcp.signed_input) ? i_t_overflow : 0;
+                    par_conv.filt = &weights[pd()->with_groups()
+                                             ? weights_d.blk_off(g, ocb, 0, wh, 0)
+                                             : weights_d.blk_off(ocb, 0, wh, 0)];
+
+                    if (bias)
+                        par_conv.bias = &bias[bias_d.blk_off(_oc * jcp.oc_block*jcp.typesize_bia)];
+
+                    par_conv.oc_work =
+                            nstl::min((ocb + ocb_num) * jcp.oc_block, jcp.oc) - ocb*jcp.oc_block;
+
+                    par_conv.kw_padding = 0;
+                    const int kh_padding = jcp.kh - i_t_overflow - i_b_overflow;
+                    par_conv.kh_padding = nstl::max(0, kh_padding);
+
+                    par_conv.scales = &oscales[jcp.is_oc_scale * _oc * jcp.oc_block];
+                    par_conv.compensation = (jcp.signed_input) ? compensation + _oc * jcp.oc_block : 0;
+                    par_conv.t_overflow = i_t_overflow;
+                    par_conv.b_overflow = i_b_overflow;
+
+                    par_conv.oc_off = _oc * jcp.oc_block * sizeof(float);
+
+                    kernel_->jit_ker(&par_conv);
+                }
+            }
+        };
+
+        auto compute_row_dw = [&](const dst_data_t* ws_p, int n, int ocb, int ocb_num, int dst_idx) {
+            for (int chb = ocb; chb < nstl::min(ocb + ocb_num, jcp.nb_oc); chb++) {
+                auto par_conv_dw = jit_conv_call_s();
+
+                par_conv_dw.src_row0 = &ws_p[(((dst_idx+1) - 1) % jcp_dw.kh) * jcp_dw.iw * jcp_dw.ch_block +
+                                             (chb - ocb) * jcp_dw.kh * jcp_dw.iw * jcp_dw.ch_block];
+                par_conv_dw.src_row1 = &ws_p[(((dst_idx+1) - 0) % jcp_dw.kh) * jcp_dw.iw * jcp_dw.ch_block +
+                                             (chb - ocb) * jcp_dw.kh * jcp_dw.iw * jcp_dw.ch_block];
+                par_conv_dw.src_row2 = &ws_p[(((dst_idx+1) + 1) % jcp_dw.kh) * jcp_dw.iw * jcp_dw.ch_block +
+                                             (chb - ocb) * jcp_dw.kh * jcp_dw.iw * jcp_dw.ch_block];
+
+                par_conv_dw.dst = &dst[n*jcp_dw.oc*jcp_dw.oh*jcp_dw.ow + dst_idx/jcp_dw.stride_h*jcp_dw.ow*jcp_dw.oc + chb*jcp_dw.ch_block];
+
+                par_conv_dw.kh_padding = jcp_dw.kh;
+                par_conv_dw.filt = &dw_weights[chb * jcp_dw.kh * jcp_dw.kw * jcp_dw.ch_block];
+                par_conv_dw.bias = &dw_bias[chb * jcp_dw.ch_block];
+                par_conv_dw.ur_w = (size_t)(jcp_dw.ow);
+                par_conv_dw.oc_work = nstl::min((chb + 1) * jcp_dw.ch_block, (int)jcp_dw.oc) - chb*jcp_dw.ch_block;
+                par_conv_dw.oc_off = chb * jcp_dw.ch_block * sizeof(float);
+
+                kernel_dw_->jit_ker(&par_conv_dw);
+            }
+        };
+
+        size_t start{0}, end{0};
+        balance211(work_amount, nthr, ithr, start, end);
+
+        auto dw_conv_buffer = scratchpad().template get<dst_data_t>(key_dw_conv_buffer);
+        size_t dw_conv_buffer_size_ = (size_t)jcp_dw.kh * jcp_dw.iw * jcp_dw.ch_block * jcp.nb_oc_blocking;
+        auto pbuf = dw_conv_buffer + ithr * dw_conv_buffer_size_;
+
+        size_t n{0}, g{0}, ocbb{0}, oh{0};
+        nd_iterator_init(start, n, MB, g, jcp.ngroups, ocbb, ocb_work, oh, jcp.oh);
+        for (size_t iwork = start; iwork < end; ++iwork) {
+            int ocb = ocbb * jcp.nb_oc_blocking;
+            int ocb_num = jcp.nb_oc_blocking;
+
+            if (iwork == start || oh == 0) {
+                compute_row_gen(pbuf, n, g, ocb, ocb_num, oh - 1, 2);
+            } else {
+                compute_row_gen(pbuf, n, g, ocb, ocb_num, oh, 1);
+            }
+
+            if (iwork > start && ((oh - 1) % jcp_dw.stride_h == 0) && oh > 0) {
+                compute_row_dw(pbuf, n, ocb, ocb_num, oh - 1);
+            }
+
+            if ((iwork == end - 1 || (int) oh == jcp.oh - 1) && ((oh) % jcp_dw.stride_h == 0)) {
+                compute_row_gen(pbuf, n, g, ocb, ocb_num, oh + 1, 1);
+                compute_row_dw(pbuf, n, ocb, ocb_num, oh);
+            }
+
+            nd_iterator_step(n, MB, g, jcp.ngroups, ocbb, ocb_work, oh, jcp.oh);
+        }
+    };
+
+    parallel(0, ker);
+}
+
+template struct _jit_uni_x8s8s32x_convolution_fwd_t<avx2, data_type::u8, data_type::u8>;
+template struct _jit_uni_x8s8s32x_convolution_fwd_t<avx2, data_type::u8, data_type::s8>;
+template struct _jit_uni_x8s8s32x_convolution_fwd_t<avx2, data_type::u8, data_type::s32>;
+template struct _jit_uni_x8s8s32x_convolution_fwd_t<avx2, data_type::u8, data_type::f32>;
+
+template struct _jit_uni_x8s8s32x_convolution_fwd_t<avx2, data_type::s8, data_type::u8>;
+template struct _jit_uni_x8s8s32x_convolution_fwd_t<avx2, data_type::s8, data_type::s8>;
+template struct _jit_uni_x8s8s32x_convolution_fwd_t<avx2, data_type::s8, data_type::s32>;
+template struct _jit_uni_x8s8s32x_convolution_fwd_t<avx2, data_type::s8, data_type::f32>;
+
+template struct _jit_uni_x8s8s32x_convolution_fwd_t<sse42, data_type::u8, data_type::u8>;
+template struct _jit_uni_x8s8s32x_convolution_fwd_t<sse42, data_type::u8, data_type::s8>;
+template struct _jit_uni_x8s8s32x_convolution_fwd_t<sse42, data_type::u8, data_type::s32>;
+template struct _jit_uni_x8s8s32x_convolution_fwd_t<sse42, data_type::u8, data_type::f32>;
+
+template struct _jit_uni_x8s8s32x_convolution_fwd_t<sse42, data_type::s8, data_type::u8>;
+template struct _jit_uni_x8s8s32x_convolution_fwd_t<sse42, data_type::s8, data_type::s8>;
+template struct _jit_uni_x8s8s32x_convolution_fwd_t<sse42, data_type::s8, data_type::s32>;
+template struct _jit_uni_x8s8s32x_convolution_fwd_t<sse42, data_type::s8, data_type::f32>;
 
 }
 }
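The fused path above keeps 1x1 outputs in a kh-row ring buffer indexed with ((row) + 1) % kh, matching the src_row0/1/2 computation in compute_row_dw(). A self-contained worked example for kh = 3 (values illustrative):

#include <cstdio>

int main() {
    const int kh = 3; // depthwise kernel height, as in the fused 3x3 path
    for (int oh = 0; oh < 5; ++oh) {
        // For depthwise output row `oh`, the three 1x1 rows oh-1, oh, oh+1
        // must be resident; they live at these ring-buffer slots:
        int row0 = ((oh + 1) - 1) % kh; // src_row0: 1x1 output row oh-1
        int row1 = ((oh + 1) - 0) % kh; // src_row1: 1x1 output row oh
        int row2 = ((oh + 1) + 1) % kh; // src_row2: 1x1 output row oh+1
        printf("oh=%d -> buffer rows %d %d %d\n", oh, row0, row1, row2);
    }
    return 0;
}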
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_convolution.hpp
index efd1185..7b5d61c 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_convolution.hpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_convolution.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2018 Intel Corporation
+* Copyright 2018-2019 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -25,100 +25,105 @@
 #include "jit_uni_x8s8s32x_conv_kernel.hpp"
 #include "jit_generator.hpp"
 #include "mkldnn_thread.hpp"
-
+#include "jit_uni_depthwise.hpp"
 namespace mkldnn {
 namespace impl {
 namespace cpu {
 
-template <cpu_isa_t isa, bool with_relu, data_type_t src_type, data_type_t dst_type>
+template <cpu_isa_t isa, data_type_t src_type, data_type_t dst_type>
 struct _jit_uni_x8s8s32x_convolution_fwd_t: public cpu_primitive_t {
-    struct pd_t: public _cpu_convolution_fwd_pd_t<with_relu> {
-        pd_t(engine_t *engine,
-                const typename pd_t::base_desc_t *adesc,
+    struct pd_t: public cpu_convolution_fwd_pd_t {
+        pd_t(engine_t *engine, const convolution_desc_t *adesc,
                 const primitive_attr_t *attr,
                 const typename pd_t::base_class *hint_fwd_pd)
-            : _cpu_convolution_fwd_pd_t<with_relu>(engine, adesc, attr,
-                    hint_fwd_pd)
-            , jcp_({}) {}
+            : cpu_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd)
+            , jcp_(), jcp_dw_() {}
 
         DECLARE_COMMON_PD_T(
                 JIT_IMPL_NAME_HELPER("jit:", isa, ""),
                 _jit_uni_x8s8s32x_convolution_fwd_t);
 
         virtual status_t init() override {
            using namespace prop_kind;
            assert(this->engine()->kind() == engine_kind::cpu);
            bool ok = true
-                && utils::one_of(this->cdesc_().prop_kind, forward_training,
+                && utils::one_of(this->desc()->prop_kind, forward_training,
                         forward_inference)
-                && this->cdesc_().alg_kind == alg_kind::convolution_direct
+                && this->desc()->alg_kind == alg_kind::convolution_direct
                 && IMPLICATION(this->with_bias(), utils::one_of(
-                    this->cdesc_().bias_desc.data_type, data_type::f32,
+                    this->desc()->bias_desc.data_type, data_type::f32,
                     data_type::s32, data_type::s8, data_type::u8))
-                && this->cdesc_().accum_data_type == data_type::s32
-                && this->cdesc_().src_desc.data_type == src_type
-                && this->cdesc_().dst_desc.data_type == dst_type;
+                && this->desc()->accum_data_type == data_type::s32
+                && this->desc()->src_desc.data_type == src_type
+                && this->desc()->dst_desc.data_type == dst_type;
            if (!ok) return status::unimplemented;
 
-           return jit_uni_x8s8s32x_conv_fwd_kernel<isa>::init_conf(jcp_, this->cdesc_(),
+           status_t sts = jit_uni_x8s8s32x_conv_fwd_kernel<isa>::init_conf(jcp_, *this->desc(),
                        this->src_pd_, this->weights_pd_,
-                       this->dst_pd_, this->bias_pd_, *this->attr(),
-                       with_relu, this->negative_slope());
+                       this->dst_pd_, this->bias_pd_, *this->attr());
+           if (sts != status::success) return sts;
+
+           if (jcp_.with_dw_conv) {
+               status_t sts_dw = jit_uni_dw_conv_row_f32<isa>::init_conf(jcp_, jcp_dw_, *this->attr());
+               if (sts_dw != status::success) return sts_dw;
+           }
+
+           auto scratchpad = scratchpad_registry().registrar();
+           jit_uni_x8s8s32x_conv_fwd_kernel<isa>::init_scratchpad(scratchpad, jcp_, jcp_dw_, *this->attr());
+
+           return status::success;
        }
 
         jit_conv_conf_t jcp_;
+        jit_conv_conf_t jcp_dw_;
     };
 
-    _jit_uni_x8s8s32x_convolution_fwd_t(const pd_t *pd, const input_vector &inputs,
-            const output_vector &outputs)
-        : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), local_scales_(nullptr) {
-        kernel_ = new jit_uni_x8s8s32x_conv_fwd_kernel<isa>(conf_.jcp_, *conf_.attr());
-
-        if (conf_.jcp_.signed_input) {
-            size_t scales_size = (conf_.attr()->output_scales_.count_ == 1)
-                ? 8
-                : conf_.attr()->output_scales_.count_;
-            local_scales_ = (float *)malloc(sizeof(float) * scales_size, 64);
-            for (size_t i = 0; i < scales_size; i++) {
-                local_scales_[i] = conf_.attr()->output_scales_.scales_[i] *
-                        (1.0 / conf_.jcp_.wei_adj_scale);
-            }
+    _jit_uni_x8s8s32x_convolution_fwd_t(const pd_t *apd,
+            const input_vector &inputs, const output_vector &outputs)
+        : cpu_primitive_t(apd, inputs, outputs) {
+        kernel_ = new jit_uni_x8s8s32x_conv_fwd_kernel<isa>(pd()->jcp_, pd()->jcp_dw_, *pd()->attr());
+
+        if (pd()->jcp_.with_dw_conv) {
+            kernel_dw_ = new jit_uni_dw_conv_row_f32<isa>(pd()->jcp_dw_, *pd()->attr(), pd()->jcp_dw_.oc);
         }
     }
 
     ~_jit_uni_x8s8s32x_convolution_fwd_t() {
         delete kernel_;
-        if (local_scales_) free(local_scales_);
+
+        if (pd()->jcp_.with_dw_conv) {
+            delete kernel_dw_;
+        }
     };
 
     typedef typename prec_traits<src_type>::type src_data_t;
     typedef typename prec_traits<data_type::s8>::type wei_data_t;
+    typedef typename prec_traits<data_type::f32>::type bia_data_t;
     typedef typename prec_traits<dst_type>::type dst_data_t;
 
-    virtual void execute(event_t *e) {
-        execute_forward();
+    virtual void execute(event_t *e) const {
+        if (pd()->jcp_.with_dw_conv)
+            execute_forward_with_dw_conv();
+        else
+            execute_forward();
+
         e->set_state(event_t::ready);
     }
 
 private:
-    void execute_forward();
-    pd_t conf_;
+    void execute_forward() const;
+    void execute_forward_with_dw_conv() const;
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); }
     jit_uni_x8s8s32x_conv_fwd_kernel<isa> *kernel_;
-    float *local_scales_;
+    jit_uni_dw_conv_row_f32<isa> *kernel_dw_;
 };
 
 template <impl::data_type_t src_type, impl::data_type_t dst_type>
-using jit_avx2_x8s8s32x_convolution_fwd_t = _jit_uni_x8s8s32x_convolution_fwd_t<avx2, false, src_type, dst_type>;
-
-template <impl::data_type_t src_type, impl::data_type_t dst_type>
-using jit_avx2_x8s8s32x_convolution_relu_t = _jit_uni_x8s8s32x_convolution_fwd_t<avx2, true, src_type, dst_type>;
-
-template <impl::data_type_t src_type, impl::data_type_t dst_type>
-using jit_sse42_x8s8s32x_convolution_fwd_t = _jit_uni_x8s8s32x_convolution_fwd_t<sse42, false, src_type, dst_type>;
+using jit_avx2_x8s8s32x_convolution_fwd_t = _jit_uni_x8s8s32x_convolution_fwd_t<avx2, src_type, dst_type>;
 
 template <impl::data_type_t src_type, impl::data_type_t dst_type>
-using jit_sse42_x8s8s32x_convolution_relu_t = _jit_uni_x8s8s32x_convolution_fwd_t<sse42, true, src_type, dst_type>;
+using jit_sse42_x8s8s32x_convolution_fwd_t = _jit_uni_x8s8s32x_convolution_fwd_t<sse42, src_type, dst_type>;
 
 }
 }
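Scale handling moved out of the constructor here: the primitive no longer mallocs local_scales_ up front, and execute_forward() rebuilds the adjusted scales in scratchpad memory instead. A standalone sketch of that adjustment, using std::vector in place of the scratchpad (names and container choice are illustrative):

#include <vector>

// When the source is s8, weights were pre-scaled by wei_adj_scale (0.5) at
// preparation time to avoid s8*s8 accumulation overflow, so every output
// scale is multiplied back by 1 / wei_adj_scale before reaching the kernel.
std::vector<float> adjust_scales(const std::vector<float> &oscales,
        float wei_adj_scale /* 0.5f when signed input */) {
    const float factor = 1.f / wei_adj_scale;
    if (oscales.size() == 1) {
        // Broadcast the single scale to one vector register's worth.
        return std::vector<float>(8, oscales[0] * factor);
    }
    std::vector<float> local(oscales.size());
    for (size_t c = 0; c < oscales.size(); ++c)
        local[c] = oscales[c] * factor;
    return local;
}

int main() {
    auto s = adjust_scales({0.02f}, 0.5f); // -> eight entries of 0.04f
    return s.size() == 8 ? 0 : 1;
}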
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_conv_kernel.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_conv_kernel.cpp
index c02bd80..d7b3994 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_conv_kernel.cpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_conv_kernel.cpp
@@ -183,32 +183,6 @@ void jit_uni_x8s8s32x_dw_conv_fwd_kernel::apply_filter_unrolled(int ur_ch_b
 }
 
 template <cpu_isa_t isa>
-bool jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>::maybe_relu(int position) {
-    using namespace primitive_kind;
-    const auto &p = attr_.post_ops_;
-
-    if (position == 0) {
-        /* relu before sum */
-        return false
-            || jcp.with_eltwise
-            || p.contain(eltwise, 0)
-            || (jcp.dst_dt == data_type::u8 && !p.contain(sum, 0));
-    } else if (position == 1) {
-        /* relu after sum */
-        const int sum_idx = p.contain(sum, 0)
-            ? 0 : (p.contain(sum, 1) ? 1 : -1);
-        if (sum_idx == -1)
-            return false;
-
-        return false
-            || p.contain(eltwise, sum_idx + 1)
-            || jcp.dst_dt == data_type::u8;
-    }
-
-    return false;
-}
-
-template <cpu_isa_t isa>
 void jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>::store_dst(const Xbyak::Address &op, Vmm vmm_dst, bool scalar_store) {
     Ymm ymm_dst = Ymm(vmm_dst.getIdx());
     Xmm xmm_dst = Xmm(vmm_dst.getIdx());
@@ -229,7 +203,7 @@ void jit_uni_x8s8s32x_dw_conv_fwd_kernel::store_dst(const Xbyak::Address &o
             if (isa != sse42 && !scalar_store)
                 vpermq(ymm_dst, ymm_dst, 0x08);
 
-            uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
+            uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst);
 
             if (scalar_store) {
                 movq(reg_tmp_64, xmm_dst);
@@ -247,7 +221,7 @@ void jit_uni_x8s8s32x_dw_conv_fwd_kernel::store_dst(const Xbyak::Address &o
             if (isa != sse42 && !scalar_store)
                 vpermq(ymm_dst, ymm_dst, 0x08);
 
-            uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
+            uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst);
 
             if (scalar_store) {
                 movq(reg_tmp_64, xmm_dst);
@@ -306,37 +280,89 @@ template
 void jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>::store_dst(int ur_ch_blocks, int ch_step, int ur_w) {
     int repeats = isa == sse42 && ch_step > (jcp.ch_block / 2) ? 2 : 1;
 
+    pop(reg_oc_off);
     pop(reg_scales_base);
 
-    uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
+    mov(imm_addr64, l_table);
+
+    const auto &p = attr_.post_ops_;
+    const int sum_idx = p.find(primitive_kind::sum);
+    const float p_sum_scale = (sum_idx != -1) ? p.entry_[sum_idx].sum.scale : 1.f;
+
+    bool is_scalar_store = ch_step < jcp.ch_block;
+
     for (int r = 0; r < repeats; r++) {
-        if (ch_step < jcp.ch_block) {
+        for (int ii = 0; ii < ur_ch_blocks; ii++) {
+            if (jcp.with_bias) {
+                int b_off = ii * jcp.ch_block + r * (jcp.ch_block / 2);
+                cvt2ps(jcp.bia_dt, vmm_bias, ptr[reg_bias_base + b_off * jcp.typesize_bia], is_scalar_store);
+            }
+
             for (int jj = 0; jj < ur_w; jj++) {
-                Vmm vmm_dst = get_acc_reg(r * ur_w * ur_ch_blocks + jj);
+                Vmm vmm_dst = get_acc_reg(r * ur_ch_blocks * ur_w + ur_w * ii + jj);
                 uni_vcvtdq2ps(vmm_dst, vmm_dst);
 
-                if (jcp.with_bias) {
-                    int b_off = r * (jcp.ch_block / 2);
-                    cvt2ps(jcp.bia_dt, vmm_bias, ptr[reg_bias_base + b_off * jcp.typesize_bia], true);
+                if (jcp.with_bias)
                     uni_vaddps(vmm_dst, vmm_dst, vmm_bias);
-                }
 
-                int s_off = jcp.is_oc_scale * (r * (jcp.ch_block / 2));
-                cvt2ps(mkldnn_f32, vmm_scale, ptr[reg_scales_base + s_off * sizeof(float)], true);
+                int s_off = jcp.is_oc_scale * (ii * jcp.ch_block + r * (jcp.ch_block / 2));
+                cvt2ps(mkldnn_f32, vmm_scale, ptr[reg_scales_base + s_off * sizeof(float)], is_scalar_store);
                 uni_vmulps(vmm_dst, vmm_dst, vmm_scale);
+            }
+        }
 
-                int o_off = jj * jcp.oc + r * (jcp.ch_block / 2);
-                if (jcp.with_sum) {
-                    uni_vpxor(vmm_prev_dst, vmm_prev_dst, vmm_prev_dst);
-                    cvt2ps(jcp.dst_dt, vmm_prev_dst, ptr[reg_output + o_off * jcp.typesize_out], true);
-                    uni_vaddps(vmm_dst, vmm_dst, vmm_prev_dst);
+        int eltwise_inj_idx = 0;
+        int depthwise_inj_idx = 0;
+        for (int i = 0; i < p.len_; i++) {
+            int start_idx = 4 + r * ur_ch_blocks*ur_w;
+
+            auto& post_op = p.entry_[i];
+            if (post_op.is_eltwise()) {
+                eltwise_injectors[eltwise_inj_idx]->compute_vector_range(start_idx, start_idx + ur_ch_blocks * ur_w);
+                eltwise_inj_idx++;
+            } else if (post_op.is_depthwise()) {
+                mov(reg_d_weights, reinterpret_cast<size_t>(post_op.depthwise.weights_data));
+                mov(reg_d_bias, reinterpret_cast<size_t>(post_op.depthwise.biases_data));
+
+                add(reg_d_weights, reg_oc_off);
+                add(reg_d_bias, reg_oc_off);
+
+                if (r == 1) {
+                    add(reg_d_weights, (jcp.ch_block / 2) * sizeof(float));
+                    add(reg_d_bias, (jcp.ch_block / 2) * sizeof(float));
                 }
 
-                if (maybe_relu(0))
-                    uni_vmaxps(vmm_dst, vmm_dst, vmm_zero);
+                for (int ii = 0; ii < ur_ch_blocks; ii++) {
+                    depthwise_injectors[depthwise_inj_idx]->compute_vector_range(
+                            start_idx + ur_w * ii, start_idx + ur_w * ii + ur_w, reg_d_weights, reg_d_bias);
+
+                    add(reg_d_weights, jcp.ch_block * sizeof(float));
+                    add(reg_d_bias, jcp.ch_block * sizeof(float));
+                }
+
+                depthwise_inj_idx++;
+            } else if (post_op.is_sum(false)) {
+                for (int ii = 0; ii < ur_ch_blocks; ii++) {
+                    for (int jj = 0; jj < ur_w; jj++) {
+                        Vmm vmm_dst = get_acc_reg(r * ur_ch_blocks*ur_w + ur_w * ii + jj);
+                        int o_off = ii * jcp.ch_block + jj * jcp.oc + r * (jcp.ch_block / 2);
+
+                        cvt2ps(jcp.dst_dt, vmm_prev_dst, ptr[reg_output + o_off * jcp.typesize_out], is_scalar_store);
+
+                        if (p_sum_scale == 1.f) {
+                            uni_vaddps(vmm_dst, vmm_dst, vmm_prev_dst);
+                        } else {
+                            uni_vfmadd231ps(vmm_dst, vmm_prev_dst, ptr[imm_addr64 + 0 * vlen]);
+                        }
+                    }
+                }
+            }
+        }
 
-                if (maybe_relu(1))
-                    uni_vmaxps(vmm_dst, vmm_dst, vmm_zero);
+        for (int ii = 0; ii < ur_ch_blocks; ii++) {
+            for (int jj = 0; jj < ur_w; jj++) {
+                Vmm vmm_dst = get_acc_reg(r * ur_ch_blocks * ur_w + ur_w * ii + jj);
+                int o_off = ii * jcp.ch_block + jj * jcp.oc + r * (jcp.ch_block / 2);
 
                 if (jcp.dst_dt != data_type::f32) {
                     if (attr_.round_mode_ == round_mode::nearest)
@@ -348,55 +374,13 @@ void jit_uni_x8s8s32x_dw_conv_fwd_kernel::store_dst(int ur_ch_blocks, int c
                         assert(!"unimplemented");
                 }
 
-                store_dst(ptr[reg_output + o_off * jcp.typesize_out], vmm_dst, true);
-            }
-        } else {
-            for (int ii = 0; ii < ur_ch_blocks; ii++) {
-                if (jcp.with_bias) {
-                    int b_off = ii * jcp.ch_block + r * (jcp.ch_block / 2);
-                    cvt2ps(jcp.bia_dt, vmm_bias, ptr[reg_bias_base + b_off * jcp.typesize_bia], false);
-                }
-
-                for (int jj = 0; jj < ur_w; jj++) {
-                    Vmm vmm_dst = get_acc_reg(r * ur_ch_blocks*ur_w + ur_w * ii + jj);
-                    uni_vcvtdq2ps(vmm_dst, vmm_dst);
-
-                    if (jcp.with_bias)
-                        uni_vaddps(vmm_dst, vmm_dst, vmm_bias);
-
-                    int s_off = jcp.is_oc_scale * (ii * jcp.ch_block + r * (jcp.ch_block / 2));
-                    cvt2ps(mkldnn_f32, vmm_scale, ptr[reg_scales_base + s_off * sizeof(float)], false);
-                    uni_vmulps(vmm_dst, vmm_dst, vmm_scale);
-
-                    int o_off = ii * jcp.ch_block + jj * jcp.oc + r * (jcp.ch_block / 2);
-                    if (jcp.with_sum) {
-                        cvt2ps(jcp.dst_dt, vmm_prev_dst, ptr[reg_output + o_off * jcp.typesize_out], false);
-                        uni_vaddps(vmm_dst, vmm_dst, vmm_prev_dst);
-                    }
-
-                    if (maybe_relu(0))
-                        uni_vmaxps(vmm_dst, vmm_dst, vmm_zero);
-
-                    if (maybe_relu(1))
-                        uni_vmaxps(vmm_dst, vmm_dst, vmm_zero);
-
-                    if (jcp.dst_dt != data_type::f32) {
-                        if (attr_.round_mode_ == round_mode::nearest)
-                            uni_vcvtps2dq(vmm_dst, vmm_dst);
-                        else if (attr_.round_mode_ == round_mode::down) {
-                            uni_vroundps(vmm_dst, vmm_dst, 1);
-                            uni_vcvtps2dq(vmm_dst, vmm_dst);
-                        } else
-                            assert(!"unimplemented");
-                    }
-
-                    store_dst(ptr[reg_output + o_off * jcp.typesize_out], vmm_dst, false);
-                }
+                store_dst(ptr[reg_output + o_off * jcp.typesize_out], vmm_dst, is_scalar_store);
             }
         }
     }
 
     push(reg_scales_base);
+    push(reg_oc_off);
 }
 
 template <cpu_isa_t isa>
@@ -415,6 +399,7 @@ void jit_uni_x8s8s32x_dw_conv_fwd_kernel::loop_body(int ur_ch_blocks, int c
     push(reg_kernel_base);
     push(reg_ch_work);
     push(reg_scales_base);
+    push(reg_oc_off);
 
     L(unrolled_w_label); {
         int ur_w = jcp.ur_w;
@@ -458,6 +443,7 @@ void jit_uni_x8s8s32x_dw_conv_fwd_kernel::loop_body(int ur_ch_blocks, int c
 
     L(exit_label);
 
+    pop(reg_oc_off);
    pop(reg_scales_base);
    pop(reg_ch_work);
    pop(reg_kernel_base);
@@ -467,6 +453,24 @@
 template <cpu_isa_t isa>
 void jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>::generate() {
+    const auto &p = attr_.post_ops_;
+    for (int i = 0; i < p.len_; i++) {
+        auto &post_op = p.entry_[i];
+        if (post_op.is_eltwise()) {
+            eltwise_injectors.push_back(new jit_uni_eltwise_injector_f32<isa>(
+                    this,
+                    post_op.eltwise.alg,
+                    post_op.eltwise.alpha,
+                    post_op.eltwise.beta
+            ));
+        } else if (post_op.is_depthwise()) {
+            depthwise_injectors.push_back(new jit_uni_depthwise_injector_f32<isa>(
+                    this,
+                    post_op.depthwise.alg
+            ));
+        }
+    }
+
     this->preamble();
 
     mov(reg_input_base, ptr[this->param1 + GET_OFF(src)]);
@@ -478,6 +482,7 @@ void jit_uni_x8s8s32x_dw_conv_fwd_kernel::generate() {
     mov(reg_kh, ptr[this->param1 + GET_OFF(kh_padding)]);
     mov(reg_kw, ptr[this->param1 + GET_OFF(kw_padding)]);
     mov(reg_ch_work, ptr[this->param1 + GET_OFF(ch_work)]);
+    mov(reg_oc_off, ptr[this->param1 + GET_OFF(oc_off)]);
 
     Label main_loop_label;
     Label tail_loop_label;
@@ -504,6 +509,7 @@ void jit_uni_x8s8s32x_dw_conv_fwd_kernel::generate() {
         add(reg_kernel_base, jcp.ch_block * jcp.kh * jcp.kw * jcp.typesize_in);
         add(reg_bias_base, jcp.ch_block * jcp.typesize_bia);
         add(reg_scales_base, jcp.is_oc_scale * jcp.ch_block * sizeof(float));
+        add(reg_oc_off, jcp.ch_block * sizeof(float));
 
         jmp(main_loop_label, T_NEAR);
     }
@@ -520,6 +526,7 @@ void jit_uni_x8s8s32x_dw_conv_fwd_kernel::generate() {
         add(reg_kernel_base, 1 * jcp.typesize_in);
         add(reg_bias_base, 1 * jcp.typesize_bia);
         add(reg_scales_base, jcp.is_oc_scale * 1 * sizeof(float));
+        add(reg_oc_off, 1 * sizeof(float));
 
         jmp(tail_loop_label, T_NEAR);
     }
@@ -527,6 +534,30 @@ void jit_uni_x8s8s32x_dw_conv_fwd_kernel::generate() {
     L(exit_label);
 
     this->postamble();
+
+    prepare_table();
+
+    for (auto& inj : eltwise_injectors)
+        inj->prepare_table();
+}
+
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>::prepare_table() {
+    const auto &p = attr_.post_ops_;
+    const int sum_idx = p.find(primitive_kind::sum);
+    const float p_sum_scale = (sum_idx != -1) ? p.entry_[sum_idx].sum.scale : 1.f;
+
+    const int32_t cvals_sum_scale[] = {
+        float2int(p_sum_scale)
+    };
+
+    align(64);
+    L(l_table);
+    for (size_t i = 0; i < sizeof(cvals_sum_scale) / sizeof(cvals_sum_scale[0]); ++i) {
+        for (size_t d = 0; d < vlen / sizeof(int32_t); ++d) {
+            dd(cvals_sum_scale[i]);
+        }
+    }
 }
 
 template <cpu_isa_t isa>
@@ -534,14 +565,18 @@ bool jit_uni_x8s8s32x_dw_conv_fwd_kernel::post_ops_ok(
         jit_conv_conf_t &jcp, const primitive_attr_t &attr) {
     const auto &p = attr.post_ops_;
 
-    auto is_relu = [&](int idx) { return p.entry_[idx].is_relu(); };
-    auto is_sum = [&](int idx) { return p.entry_[idx].is_sum(); };
+    auto is_eltwise = [&](int idx) { return p.entry_[idx].is_eltwise(); };
+    auto is_depthwise = [&](int idx) { return p.entry_[idx].is_depthwise(); };
+    auto is_sum = [&](int idx) { return p.entry_[idx].is_sum(false); };
+    auto is_simple = [&](int idx) { return is_eltwise(idx) || is_depthwise(idx); };
 
     switch (p.len_) {
-    case 0: return true; // no post_ops
-    case 1: return !jcp.with_eltwise && (is_relu(0) || is_sum(0)); // sum OR relu
-    case 2: return !jcp.with_eltwise && (is_sum(0) && is_relu(1)); // sum->relu
-    default: return false;
+    case 0: return true;
+    case 1: return is_simple(0) || is_sum(0);
+    case 2: return (is_sum(0) && is_simple(1)) || (is_simple(0) && is_sum(1)) ||
+                   (is_simple(0) && is_simple(1));
+    case 3: return (is_simple(0) && is_sum(1) && is_simple(2));
+    default: return false;
     }
 
     return false;
@@ -551,8 +586,7 @@ template
 status_t jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>::init_conf(jit_conv_conf_t &jcp,
         const convolution_desc_t &cd, const memory_desc_wrapper &src_d,
         const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d,
-        const memory_desc_wrapper &bias_pd, const primitive_attr_t &attr,
-        bool with_relu, float relu_negative_slope)
+        const memory_desc_wrapper &bias_pd, const primitive_attr_t &attr)
 {
     if (!mayiuse(isa)) return status::unimplemented;
 
@@ -593,8 +627,6 @@ status_t jit_uni_x8s8s32x_dw_conv_fwd_kernel::init_conf(jit_conv_conf_t &jc
     jcp.src_fmt = src_d.format();
     jcp.with_bias = cd.bias_desc.format != memory_format::undef;
-    jcp.with_eltwise = with_relu;
-    jcp.eltwise_alpha = relu_negative_slope;
 
     jcp.signed_input = (src_d.data_type() == data_type::s8) ? true : false;
 
@@ -610,13 +642,10 @@ status_t jit_uni_x8s8s32x_dw_conv_fwd_kernel::init_conf(jit_conv_conf_t &jc
     const auto &p = attr.post_ops_;
     jcp.with_sum = p.find(primitive_kind::sum) != -1;
-    if (!jcp.with_eltwise) {
-        int eltwise_ind = p.find(primitive_kind::eltwise);
-        if (eltwise_ind != -1) {
-            jcp.with_eltwise = true;
-            jcp.eltwise_alpha = p.entry_[eltwise_ind].eltwise.alpha;
-        }
-    }
+    const int eltwise_ind = p.find(primitive_kind::eltwise);
+    jcp.with_eltwise = eltwise_ind != -1;
+    if (jcp.with_eltwise)
+        jcp.eltwise = p.entry_[eltwise_ind].eltwise;
 
     auto desired_act_fmt = nhwc;
     auto desired_wei_fmt = isa == avx512_common ? Goihw16g : Goihw8g;
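The prepare_table() added above stores the sum scale as a raw IEEE-754 bit pattern replicated across one vector register width, so the JIT code can feed it straight to uni_vfmadd231ps as a memory operand. A standalone illustration (the vlen value is an assumption for AVX2; SSE4.2 would use 16):

#include <cstdint>
#include <cstring>
#include <cstdio>

// Bit copy of a float into an int32_t, as the kernel's float2int helper does;
// this is a reinterpretation, not a value conversion.
static int32_t float2int(float f) {
    int32_t i;
    std::memcpy(&i, &f, sizeof(f));
    return i;
}

int main() {
    const float p_sum_scale = 0.5f;
    const size_t vlen = 32; // AVX2 register width in bytes (assumed)
    int32_t table[vlen / sizeof(int32_t)];
    for (size_t d = 0; d < vlen / sizeof(int32_t); ++d)
        table[d] = float2int(p_sum_scale); // one dd(...) per lane
    printf("lane 0 bits: 0x%08x\n", (unsigned)table[0]);
    return 0;
}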
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_conv_kernel.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_conv_kernel.hpp
index 9c9b41f..8bb7811 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_conv_kernel.hpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_conv_kernel.hpp
@@ -21,6 +21,8 @@
 #include "jit_generator.hpp"
 #include "jit_primitive_conf.hpp"
 #include "type_helpers.hpp"
+#include "jit_uni_eltwise.hpp"
+#include "jit_uni_depthwise.hpp"
 
 namespace mkldnn {
 namespace impl {
@@ -36,6 +38,16 @@ struct jit_uni_x8s8s32x_dw_conv_fwd_kernel: public jit_generator {
         jit_ker = (void (*)(jit_conv_call_s *))this->getCode();
     }
 
+    ~jit_uni_x8s8s32x_dw_conv_fwd_kernel() {
+        for (auto inj : eltwise_injectors)
+            delete inj;
+        eltwise_injectors.clear();
+
+        for (auto inj : depthwise_injectors)
+            delete inj;
+        depthwise_injectors.clear();
+    }
+
     static bool post_ops_ok(jit_conv_conf_t &jcp,
             const primitive_attr_t &attr);
     static status_t init_conf(jit_conv_conf_t &jcp,
@@ -43,8 +55,7 @@ struct jit_uni_x8s8s32x_dw_conv_fwd_kernel: public jit_generator {
             const memory_desc_wrapper &weights_d,
             const memory_desc_wrapper &dst_d,
             const memory_desc_wrapper &bias_pd,
-            const primitive_attr_t &attr,
-            bool with_relu = false, float relu_negative_slope = 0.f);
+            const primitive_attr_t &attr);
 
     jit_conv_conf_t jcp;
     const primitive_attr_t &attr_;
@@ -84,6 +95,12 @@ private:
     reg64_t reg_tmp_64 = r15;
     reg8_t reg_tmp_8 = r15b;
 
+    reg64_t imm_addr64 = r10;
+
+    reg64_t reg_oc_off = iter_kw;
+    reg64_t reg_d_weights = aux1_reg_kernel;
+    reg64_t reg_d_bias = aux_reg_input;
+
     Vmm vmm_zero = Vmm(0);
     Vmm vmm_bias = Vmm(3);
     Vmm vmm_scale = Vmm(2);
@@ -99,11 +116,16 @@ private:
     inline void load_src(int ur_ch_blocks, int ch_step, int ur_w);
     inline void apply_filter(int ur_ch_blocks, int ch_step, int ur_w);
     inline void apply_filter_unrolled(int ur_ch_blocks, int ch_step, int ur_w);
-    inline bool maybe_relu(int position);
     inline void store_dst(int ur_ch_blocks, int ch_step, int ur_w);
     inline void loop_body(int ur_ch_blocks, int ch_step);
+    inline void prepare_table();
 
     void generate();
+
+    nstl::vector<jit_uni_eltwise_injector_f32<isa>*> eltwise_injectors;
+    nstl::vector<jit_uni_depthwise_injector_f32<isa>*> depthwise_injectors;
+
+    Xbyak::Label l_table;
 };
 
 }
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_convolution.cpp
index bc31a38..b102c53 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_convolution.cpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_convolution.cpp
@@ -26,17 +26,17 @@
 using namespace mkldnn::impl::status;
 using namespace mkldnn::impl::memory_format;
 using namespace mkldnn::impl::utils;
 
-template <cpu_isa_t isa, bool with_relu, data_type_t src_type, data_type_t dst_type>
-void _jit_uni_x8s8s32x_dw_convolution_fwd_t<isa, with_relu, src_type, dst_type>::execute_forward() {
+template <cpu_isa_t isa, data_type_t src_type, data_type_t dst_type>
+void _jit_uni_x8s8s32x_dw_convolution_fwd_t<isa, src_type, dst_type>::execute_forward() const {
     auto src = reinterpret_cast<const src_data_t *>(this->input_memory(0));
     auto weights = reinterpret_cast<const wei_data_t *>(this->input_memory(1));
     auto bias = reinterpret_cast<const char *>(this->input_memory(2));
     auto dst = reinterpret_cast<dst_data_t *>(this->memory());
 
-    const memory_desc_wrapper src_d(conf_.src_pd());
-    const memory_desc_wrapper dst_d(conf_.dst_pd());
-    const memory_desc_wrapper weights_d(conf_.weights_pd(0));
-    const memory_desc_wrapper bias_d(conf_.weights_pd(1));
+    const memory_desc_wrapper src_d(pd()->src_pd());
+    const memory_desc_wrapper dst_d(pd()->dst_pd());
+    const memory_desc_wrapper weights_d(pd()->weights_pd(0));
+    const memory_desc_wrapper bias_d(pd()->weights_pd(1));
 
     const auto &jcp = kernel_->jcp;
 
@@ -45,10 +45,10 @@ void _jit_uni_x8s8s32x_dw_convolution_fwd_t:
     int str_h = jcp.stride_h;
     int str_w = jcp.stride_w;
 
-    const size_t bia_dt_size = conf_.with_bias()
-        ? types::data_type_size(conf_.cdesc()->bias_desc.data_type) : 0;
+    const size_t bia_dt_size = pd()->with_bias()
+        ? types::data_type_size(pd()->desc()->bias_desc.data_type) : 0;
 
-    const auto &oscales = conf_.attr()->output_scales_;
+    const auto &oscales = pd()->attr()->output_scales_;
 
     int MB = jcp.mb;
     int chb_work = utils::div_up(jcp.nb_ch, jcp.nb_ch_blocking);
@@ -56,7 +56,7 @@ void _jit_uni_x8s8s32x_dw_convolution_fwd_t:
 
     auto kernel_params = [&](int ur_w_step, int ow, int oh, int ih, int kh,
             int kh_padding, int ch, int ch_num, int n) {
-        jit_conv_call_s par_conv = {};
+        auto par_conv = jit_conv_call_s();
 
         const int i_l_overflow = nstl::max(0, (jcp.l_pad - ow * str_w));
         const int i_r_overflow = nstl::max(jcp.iw, (ow * str_w
@@ -86,6 +86,7 @@ void _jit_uni_x8s8s32x_dw_convolution_fwd_t:
         par_conv.ch_work = nstl::min((ch + ch_num) * jcp.ch_block, jcp.oc) - ch*jcp.ch_block;
 
         par_conv.scales = &oscales.scales_[jcp.is_oc_scale * ch * jcp.ch_block];
+        par_conv.oc_off = ch * jcp.ch_block * sizeof(float);
 
         return par_conv;
     };
@@ -149,23 +150,25 @@ void _jit_uni_x8s8s32x_dw_convolution_fwd_t:
     parallel(0, ker);
 }
 
-template void _jit_uni_x8s8s32x_dw_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_dw_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_dw_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_dw_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_dw_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_dw_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_dw_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_dw_convolution_fwd_t::execute_forward();
-
-template void _jit_uni_x8s8s32x_dw_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_dw_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_dw_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_dw_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_dw_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_dw_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_dw_convolution_fwd_t::execute_forward();
-template void _jit_uni_x8s8s32x_dw_convolution_fwd_t::execute_forward();
+template struct _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, data_type::u8, data_type::u8>;
+template struct _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, data_type::u8, data_type::s8>;
+template struct _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, data_type::u8, data_type::s32>;
+template struct _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, data_type::u8, data_type::f32>;
+
+template struct _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, data_type::s8, data_type::u8>;
+template struct _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, data_type::s8, data_type::s8>;
+template struct _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, data_type::s8, data_type::s32>;
+template struct _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, data_type::s8, data_type::f32>;
+
+template struct _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, data_type::u8, data_type::u8>;
+template struct _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, data_type::u8, data_type::s8>;
+template struct _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, data_type::u8, data_type::s32>;
+template struct _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, data_type::u8, data_type::f32>;
+
+template struct _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, data_type::s8, data_type::u8>;
+template struct _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, data_type::s8, data_type::s8>;
+template struct _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, data_type::s8, data_type::s32>;
+template struct _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, data_type::s8, data_type::f32>;
 
 }
 }
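Both drivers clip kernel rows (or columns, in the depthwise case) that fall outside the input and pass the surviving tap count to the JIT kernel. A worked example of the height-case arithmetic from execute_forward(), with illustrative shapes:

#include <algorithm>
#include <cstdio>

static int div_up(int a, int b) { return (a + b - 1) / b; }

int main() {
    const int kh = 3, ih = 10, t_pad = 1, stride_h = 1, dilate_h = 0;
    for (int oh : {0, 5, 9}) {
        const int ij = oh * stride_h;
        const int i_t_overflow = std::min(kh,
                div_up(std::max(0, t_pad - ij), dilate_h + 1));
        const int i_b_overflow = std::min(kh,
                div_up(std::max(ih, ij + (kh - 1) * (dilate_h + 1) - t_pad + 1) - ih,
                        dilate_h + 1));
        // Rows of the kernel reading above or below the input are skipped.
        const int kh_padding = std::max(0, kh - i_t_overflow - i_b_overflow);
        printf("oh=%d: clip top=%d bottom=%d -> apply %d of %d taps\n",
                oh, i_t_overflow, i_b_overflow, kh_padding, kh);
    }
    return 0;
}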
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_convolution.hpp
index 17d70c1..a6c3cf6 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_convolution.hpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_convolution.hpp
@@ -28,40 +28,40 @@
 namespace mkldnn {
 namespace impl {
 namespace cpu {
 
-template <cpu_isa_t isa, bool with_relu, data_type_t src_type, data_type_t dst_type>
+template <cpu_isa_t isa, data_type_t src_type, data_type_t dst_type>
 struct _jit_uni_x8s8s32x_dw_convolution_fwd_t: public cpu_primitive_t {
-    struct pd_t: public _cpu_convolution_fwd_pd_t<with_relu> {
-        pd_t(engine_t *engine, const typename pd_t::base_desc_t *adesc,
+    struct pd_t: public cpu_convolution_fwd_pd_t {
+        pd_t(engine_t *engine, const convolution_desc_t *adesc,
                 const primitive_attr_t *attr,
                 const typename pd_t::base_class *hint_fwd_pd)
-            : _cpu_convolution_fwd_pd_t<with_relu>(engine, adesc, attr,
+            : cpu_convolution_fwd_pd_t(engine, adesc, attr,
                     hint_fwd_pd)
-            , jcp_({}) {}
+            , jcp_() {}
 
         DECLARE_COMMON_PD_T(
                 JIT_IMPL_NAME_HELPER("jit_dw:", isa, ""),
                 _jit_uni_x8s8s32x_dw_convolution_fwd_t);
 
         virtual status_t init() override {
             using namespace prop_kind;
             assert(this->engine()->kind() == engine_kind::cpu);
             bool ok = true
                 && this->set_default_params() == status::success
-                && utils::one_of(this->cdesc_().prop_kind, forward_training,
+                && utils::one_of(this->desc()->prop_kind, forward_training,
                         forward_inference)
-                && this->cdesc_().alg_kind == alg_kind::convolution_direct
-                && this->cdesc_().dst_desc.data_type == dst_type
+                && this->desc()->alg_kind == alg_kind::convolution_direct
+                && this->desc()->dst_desc.data_type == dst_type
                 && IMPLICATION(this->with_bias(), utils::one_of(
-                    this->cdesc_().bias_desc.data_type, data_type::f32,
+                    this->desc()->bias_desc.data_type, data_type::f32,
                     data_type::s32, data_type::s8, data_type::u8))
-                && this->cdesc_().accum_data_type == data_type::s32;
+                && this->desc()->accum_data_type == data_type::s32;
             if (!ok) return status::unimplemented;
 
             return jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>::init_conf(jcp_,
-                    this->cdesc_(),
-                    this->src_pd_.desc(), *this->weights_pd_.desc(),
+                    *this->desc(),
+                    *this->src_pd_.desc(), *this->weights_pd_.desc(),
                     *this->dst_pd_.desc(), *this->bias_pd_.desc(),
-                    *this->attr(), with_relu, this->negative_slope());
+                    *this->attr());
         }
 
         jit_conv_conf_t jcp_;
@@ -84,35 +84,34 @@ struct _jit_uni_x8s8s32x_dw_convolution_fwd_t: public cpu_primitive_t {
        }
    };
 
-    _jit_uni_x8s8s32x_dw_convolution_fwd_t(const pd_t *pd, const input_vector &inputs,
-            const output_vector &outputs)
-        : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd)
-    { kernel_ = new jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>(conf_.jcp_, *conf_.attr()); }
+    _jit_uni_x8s8s32x_dw_convolution_fwd_t(const pd_t *apd,
+            const input_vector &inputs, const output_vector &outputs)
+        : cpu_primitive_t(apd, inputs, outputs)
+    {
+        kernel_ = new jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>(pd()->jcp_, *pd()->attr());
+    }
+
     ~_jit_uni_x8s8s32x_dw_convolution_fwd_t() { delete kernel_; };
 
     typedef typename prec_traits<src_type>::type src_data_t;
     typedef typename prec_traits<data_type::s8>::type wei_data_t;
     typedef typename prec_traits<dst_type>::type dst_data_t;
 
-    virtual void execute(event_t *e) {
+    virtual void execute(event_t *e) const {
         execute_forward();
         e->set_state(event_t::ready);
     }
 
 private:
-    void execute_forward();
-    pd_t conf_;
+    void execute_forward() const;
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); }
     jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa> *kernel_;
 };
 
 template <impl::data_type_t src_type, impl::data_type_t dst_type>
-using jit_avx2_x8s8s32x_dw_convolution_fwd_t = _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, false, src_type, dst_type>;
-template <impl::data_type_t src_type, impl::data_type_t dst_type>
-using jit_sse42_x8s8s32x_dw_convolution_fwd_t = _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, false, src_type, dst_type>;
-template <impl::data_type_t src_type, impl::data_type_t dst_type>
-using jit_avx2_x8s8s32x_dw_convolution_relu_t = _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, true, src_type, dst_type>;
+using jit_avx2_x8s8s32x_dw_convolution_fwd_t = _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, src_type, dst_type>;
 
 template <impl::data_type_t src_type, impl::data_type_t dst_type>
-using jit_sse42_x8s8s32x_dw_convolution_relu_t = _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, true, src_type, dst_type>;
+using jit_sse42_x8s8s32x_dw_convolution_fwd_t = _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, src_type, dst_type>;
 
 }
 }
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/nchw_pooling.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/nchw_pooling.cpp
index e9da692..fa3c514 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/cpu/nchw_pooling.cpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/nchw_pooling.cpp
@@ -30,44 +30,44 @@ namespace impl {
 namespace cpu {
 
 template <data_type_t d_type>
-void nchw_pooling_fwd_t<d_type>::execute_forward() {
+void nchw_pooling_fwd_t<d_type>::execute_forward() const {
     using namespace alg_kind;
 
     auto src = reinterpret_cast<const data_t *>(this->input_memory(0));
     auto dst = reinterpret_cast<data_t *>(this->memory(0));
-    auto ws = conf_.desc()->alg_kind == alg_kind::pooling_max ?
-        reinterpret_cast<unsigned char *>(this->memory(1)) : nullptr;
_jit_uni_x8s8s32x_dw_convolution_fwd_t; -template -using jit_sse42_x8s8s32x_dw_convolution_fwd_t = _jit_uni_x8s8s32x_dw_convolution_fwd_t; -template -using jit_avx2_x8s8s32x_dw_convolution_relu_t = _jit_uni_x8s8s32x_dw_convolution_fwd_t; +using jit_avx2_x8s8s32x_dw_convolution_fwd_t = _jit_uni_x8s8s32x_dw_convolution_fwd_t; template -using jit_sse42_x8s8s32x_dw_convolution_relu_t = _jit_uni_x8s8s32x_dw_convolution_fwd_t; +using jit_sse42_x8s8s32x_dw_convolution_fwd_t = _jit_uni_x8s8s32x_dw_convolution_fwd_t; } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/nchw_pooling.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/nchw_pooling.cpp index e9da692..fa3c514 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/nchw_pooling.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/nchw_pooling.cpp @@ -30,44 +30,44 @@ namespace impl { namespace cpu { template -void nchw_pooling_fwd_t::execute_forward() { +void nchw_pooling_fwd_t::execute_forward() const { using namespace alg_kind; auto src = reinterpret_cast(this->input_memory(0)); auto dst = reinterpret_cast(this->memory(0)); - auto ws = conf_.desc()->alg_kind == alg_kind::pooling_max ? + auto ws = pd()->desc()->alg_kind == alg_kind::pooling_max ? reinterpret_cast(this->memory(1)) : nullptr; - const memory_desc_wrapper ws_d(conf_.workspace_pd()); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); + const memory_desc_wrapper ws_d(pd()->workspace_pd()); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); const data_type_t ws_dt = ws ? ws_d.data_type() : data_type::undef; src += src_d.off_l(0); dst += dst_d.off_l(0); - const int MB = conf_.MB(); - const int C = conf_.C(); - const int OD = conf_.OD(); - const int OH = conf_.OH(); - const int OW = conf_.OW(); - const int ID = conf_.ID(); - const int IH = conf_.IH(); - const int IW = conf_.IW(); - const int KD = conf_.KD(); - const int KH = conf_.KH(); - const int KW = conf_.KW(); - const int SD = conf_.KSD(); - const int SH = conf_.KSH(); - const int SW = conf_.KSW(); - const int padF = conf_.padFront(); - const int padT = conf_.padT(); - const int padL = conf_.padL(); - const int padBack = conf_.padBack(); - const int padB = conf_.padB(); - const int padR = conf_.padR(); - - auto alg = conf_.desc()->alg_kind; + const int MB = pd()->MB(); + const int C = pd()->C(); + const int OD = pd()->OD(); + const int OH = pd()->OH(); + const int OW = pd()->OW(); + const int ID = pd()->ID(); + const int IH = pd()->IH(); + const int IW = pd()->IW(); + const int KD = pd()->KD(); + const int KH = pd()->KH(); + const int KW = pd()->KW(); + const int SD = pd()->KSD(); + const int SH = pd()->KSH(); + const int SW = pd()->KSW(); + const int padF = pd()->padFront(); + const int padT = pd()->padT(); + const int padL = pd()->padL(); + const int padBack = pd()->padBack(); + const int padB = pd()->padB(); + const int padR = pd()->padR(); + + auto alg = pd()->desc()->alg_kind; auto set_ws = [=](int mb, int c, int od, int oh, int ow, int value) { if (ws) { @@ -160,7 +160,7 @@ void nchw_pooling_fwd_t::execute_forward() { }; - if (conf_.desc()->alg_kind == pooling_max) { + if (pd()->desc()->alg_kind == pooling_max) { parallel_nd(MB, C, OD, OH, OW, [&](int mb, int c, int od, int oh, int ow) { size_t dst_offset @@ -191,37 +191,37 @@ void nchw_pooling_fwd_t::execute_forward() { } template -void nchw_pooling_bwd_t::execute_backward() { +void nchw_pooling_bwd_t::execute_backward() const { using namespace alg_kind; auto 
diff_dst = reinterpret_cast(this->input_memory(0)); - auto ws = conf_.desc()->alg_kind != alg_kind::pooling_max ? nullptr : + auto ws = pd()->desc()->alg_kind != alg_kind::pooling_max ? nullptr : reinterpret_cast(this->input_memory(1)); auto diff_src = reinterpret_cast(this->memory(0)); - const memory_desc_wrapper ws_d(conf_.workspace_pd()); - - const int MB = conf_.MB(); - const int C = conf_.C(); - const int OD = conf_.OD(); - const int OH = conf_.OH(); - const int OW = conf_.OW(); - const int ID = conf_.ID(); - const int IH = conf_.IH(); - const int IW = conf_.IW(); - const int KD = conf_.KD(); - const int KH = conf_.KH(); - const int KW = conf_.KW(); - const int SD = conf_.KSD(); - const int SH = conf_.KSH(); - const int SW = conf_.KSW(); - const int padF = conf_.padFront(); - const int padT = conf_.padT(); - const int padL = conf_.padL(); - - const bool is_3d = conf_.desc()->diff_src_desc.ndims == 5; - - auto alg = conf_.desc()->alg_kind; + const memory_desc_wrapper ws_d(pd()->workspace_pd()); + + const int MB = pd()->MB(); + const int C = pd()->C(); + const int OD = pd()->OD(); + const int OH = pd()->OH(); + const int OW = pd()->OW(); + const int ID = pd()->ID(); + const int IH = pd()->IH(); + const int IW = pd()->IW(); + const int KD = pd()->KD(); + const int KH = pd()->KH(); + const int KW = pd()->KW(); + const int SD = pd()->KSD(); + const int SH = pd()->KSH(); + const int SW = pd()->KSW(); + const int padF = pd()->padFront(); + const int padT = pd()->padT(); + const int padL = pd()->padL(); + + const bool is_3d = pd()->desc()->diff_src_desc.ndims == 5; + + auto alg = pd()->desc()->alg_kind; auto apply_offset = [=](int index, int offset) { return (index > offset) ? index - offset : 0; @@ -296,7 +296,7 @@ void nchw_pooling_bwd_t::execute_backward() { } }; - if (conf_.desc()->alg_kind == pooling_max) { + if (pd()->desc()->alg_kind == pooling_max) { parallel_nd(MB, C, [&](int mb, int c) { size_t diff_dst_offset = (size_t)mb*C*OD*OH*OW + (size_t)c*OD*OH*OW; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/nchw_pooling.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/nchw_pooling.hpp index 951ef50..0e57565 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/nchw_pooling.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/nchw_pooling.hpp @@ -72,19 +72,19 @@ struct nchw_pooling_fwd_t: public cpu_primitive_t { } }; - nchw_pooling_fwd_t(const pd_t *pd, const input_vector &inputs, + nchw_pooling_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - pd_t conf_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; template @@ -133,19 +133,19 @@ struct nchw_pooling_bwd_t: public cpu_primitive_t { } }; - nchw_pooling_bwd_t(const pd_t *pd, const input_vector &inputs, + nchw_pooling_bwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_backward(); e->set_state(event_t::ready); } private: - void execute_backward(); - pd_t conf_; + void 
execute_backward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ncsp_batch_normalization.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ncsp_batch_normalization.cpp index d755538..66523a6 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ncsp_batch_normalization.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ncsp_batch_normalization.cpp @@ -17,11 +17,13 @@ #include #include -#include "cpu_batch_normalization_utils.hpp" #include "c_types_map.hpp" +#include "type_helpers.hpp" + +#include "cpu_batch_normalization_utils.hpp" #include "jit_generator.hpp" + #include "ncsp_batch_normalization.hpp" -#include "type_helpers.hpp" // clang 6 and 7 generate incorrect code with OMP_SIMD in some particular cases #if (defined __clang_major__) && (__clang_major__ >= 6) @@ -34,38 +36,17 @@ namespace mkldnn { namespace impl { namespace cpu { -typedef float data_t; -ncsp_batch_normalization_fwd_t::ncsp_batch_normalization_fwd_t(const pd_t *pd, - const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), stats_reduction_(nullptr), - tmp_mean_(nullptr), tmp_variance_(nullptr), conf_(*pd) { - if (!conf_.stats_is_src()) { - this->stats_reduction_ = (data_t *)malloc( - conf_.C() * mkldnn_get_max_threads() * sizeof(data_t), 64); - if (!conf_.is_training()) { - this->tmp_mean_ = (data_t *)malloc(conf_.C() * sizeof(data_t), 64); - this->tmp_variance_ - = (data_t *)malloc(conf_.C() * sizeof(data_t), 64); - } - } -} -ncsp_batch_normalization_fwd_t::~ncsp_batch_normalization_fwd_t() { - if (!conf_.stats_is_src()) { - free(this->stats_reduction_); - if (!conf_.is_training()) { - free(this->tmp_mean_); - free(this->tmp_variance_); - } - } -} +using namespace memory_tracking::names; -void ncsp_batch_normalization_fwd_t::execute_forward() { +void ncsp_batch_normalization_fwd_t::execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); auto dst = reinterpret_cast(this->memory(0)); - const bool calculate_stats = !conf_.stats_is_src(); - const bool save_stats = conf_.is_training(); - const bool is_training = conf_.is_training(); - const bool fuse_bn_relu = conf_.fuse_bn_relu(); + auto scratchpad = this->scratchpad(); + + const bool calculate_stats = !pd()->stats_is_src(); + const bool save_stats = pd()->is_training(); + const bool is_training = pd()->is_training(); + const bool fuse_bn_relu = pd()->fuse_bn_relu(); data_t *mean, *variance; if (!calculate_stats) { @@ -78,25 +59,25 @@ void ncsp_batch_normalization_fwd_t::execute_forward() { mean = reinterpret_cast(this->memory(1)); variance = reinterpret_cast(this->memory(2)); } else { - mean = this->tmp_mean_; - variance = this->tmp_variance_; + mean = scratchpad.get(key_bnorm_tmp_mean); + variance = scratchpad.get(key_bnorm_tmp_var); } } - auto idx_scale_shift = 1 + 2 * conf_.stats_is_src(); + auto idx_scale_shift = 1 + 2 * pd()->stats_is_src(); auto scaleshift = reinterpret_cast( this->input_memory(idx_scale_shift)); - auto ws = reinterpret_cast(this->memory(conf_.ws_idx())); - data_t *ws_reduce = this->stats_reduction_; + auto ws = reinterpret_cast(this->memory(pd()->ws_idx())); + auto *ws_reduce = scratchpad.get(key_bnorm_reduction); - const float eps = conf_.desc()->batch_norm_epsilon; - const bool use_scaleshift = conf_.use_scaleshift(); - const bool with_relu = conf_.with_relu_post_op(); + const float eps = pd()->desc()->batch_norm_epsilon; + const bool use_scaleshift = pd()->use_scaleshift(); + 
const bool with_relu = pd()->with_relu_post_op(); auto maybe_post_op = [&](data_t res) { return (with_relu && res < 0) ? 0 : res; }; - const bool has_spatial = utils::one_of(conf_.ndims(), 4, 5); - int SP = (has_spatial) ? conf_.H() * conf_.W() * conf_.D() : 1; - size_t N = conf_.MB(); - size_t C = conf_.C(); + const bool has_spatial = utils::one_of(pd()->ndims(), 4, 5); + int SP = (has_spatial) ? pd()->H() * pd()->W() * pd()->D() : 1; + size_t N = pd()->MB(); + size_t C = pd()->C(); int nthr = mkldnn_get_max_threads(); size_t l3_size_ = get_cache_size(3, true) * nthr / 2; @@ -232,44 +213,30 @@ void ncsp_batch_normalization_fwd_t::execute_forward() { }); } -ncsp_batch_normalization_bwd_t::ncsp_batch_normalization_bwd_t(const pd_t *pd, - const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) - , stats_reduction_(nullptr), tmp_diff_scaleshift_(nullptr) { - this->stats_reduction_ = (data_t *)malloc( - conf_.C() * 2 * mkldnn_get_max_threads() * sizeof(data_t), 64); - if (!(conf_.use_scaleshift() - && conf_.desc()->prop_kind == prop_kind::backward)) - this->tmp_diff_scaleshift_ - = (data_t *)malloc(conf_.C() * 2 * sizeof(data_t), 64); -} - -ncsp_batch_normalization_bwd_t::~ncsp_batch_normalization_bwd_t() { - free(this->stats_reduction_); - free(this->tmp_diff_scaleshift_); -} - -void ncsp_batch_normalization_bwd_t::execute_backward() { +void ncsp_batch_normalization_bwd_t::execute_backward() const { auto src = reinterpret_cast(this->input_memory(0)); auto mean = reinterpret_cast(this->input_memory(1)); auto variance = reinterpret_cast(this->input_memory(2)); auto diff_dst = reinterpret_cast(this->input_memory(3)); auto scaleshift = reinterpret_cast(this->input_memory(4)); auto diff_src = reinterpret_cast(this->memory(0)); - auto diff_scaleshift = (this->memory(1)) ? - reinterpret_cast(this->memory(1)) : - this->tmp_diff_scaleshift_; + + auto scratchpad = this->scratchpad(); + + auto diff_scaleshift = this->memory(1) + ? reinterpret_cast(this->memory(1)) + : scratchpad.get(key_bnorm_tmp_diff_ss); auto ws = reinterpret_cast( - this->input_memory(conf_.ws_idx())); - data_t *ws_reduce = this->stats_reduction_; - - const bool has_spatial = utils::one_of(conf_.ndims(), 4, 5); - int SP = (has_spatial) ? conf_.H() * conf_.W() * conf_.D() : 1; - size_t C = conf_.C(), N = conf_.MB(); - const bool use_scaleshift = conf_.use_scaleshift(); - const float eps = conf_.desc()->batch_norm_epsilon; - const bool calculate_diff_stats = !conf_.omit_stats(); - const bool fuse_bn_relu = conf_.fuse_bn_relu(); + this->input_memory(pd()->ws_idx())); + auto *ws_reduce = scratchpad.get(key_bnorm_reduction); + + const bool has_spatial = utils::one_of(pd()->ndims(), 4, 5); + int SP = (has_spatial) ? 
pd()->H() * pd()->W() * pd()->D() : 1; + size_t C = pd()->C(), N = pd()->MB(); + const bool use_scaleshift = pd()->use_scaleshift(); + const float eps = pd()->desc()->batch_norm_epsilon; + const bool calculate_diff_stats = !pd()->use_global_stats(); + const bool fuse_bn_relu = pd()->fuse_bn_relu(); int nthr = mkldnn_get_max_threads(); size_t l3_size_ = get_cache_size(3, true) * nthr / 2; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ncsp_batch_normalization.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ncsp_batch_normalization.hpp index ddf6df6..a723e9a 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ncsp_batch_normalization.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ncsp_batch_normalization.hpp @@ -20,11 +20,12 @@ #include #include "c_types_map.hpp" -#include "cpu_batch_normalization_pd.hpp" -#include "cpu_engine.hpp" +#include "memory_tracking.hpp" #include "type_helpers.hpp" #include "utils.hpp" +#include "cpu_batch_normalization_pd.hpp" + namespace mkldnn { namespace impl { namespace cpu { @@ -40,9 +41,11 @@ struct ncsp_batch_normalization_fwd_t : public cpu_primitive_t { DECLARE_COMMON_PD_T("ncsp_bnorm:any", ncsp_batch_normalization_fwd_t); virtual status_t init() override { - using namespace prop_kind; using namespace data_type; + using namespace prop_kind; + assert(engine()->kind() == engine_kind::cpu); + bool ok = true && is_fwd() && !has_zero_dim_memory() @@ -52,41 +55,56 @@ struct ncsp_batch_normalization_fwd_t : public cpu_primitive_t { && utils::one_of(data_pd_.desc()->format, memory_format::nchw, memory_format::ncdhw, memory_format::nc) && (attr()->has_default_values() || this->with_relu_post_op()); - if (!ok) - return status::unimplemented; + if (!ok) return status::unimplemented; - if (is_training() && fuse_bn_relu()) { + if (is_training() && fuse_bn_relu()) bn_init_default_ws(this, this->workspace_pd_, 8); - } if (stats_is_src() || is_training()) { memory_desc_t stats_d; dims_t stats_dims = { C() }; - mkldnn_memory_desc_init(&stats_d, 1, stats_dims, data_type::f32, - memory_format::x); + mkldnn_memory_desc_init(&stats_d, 1, stats_dims, + data_type::f32, memory_format::x); mean_pd_ = cpu_memory_t::pd_t(engine_, &stats_d); variance_pd_ = cpu_memory_t::pd_t(engine_, &stats_d); } + init_scratchpad(); + return success; } + + private: + void init_scratchpad() { + using namespace memory_tracking::names; + auto scratchpad = scratchpad_registry().registrar(); + if (!stats_is_src()) { + scratchpad.book(key_bnorm_reduction, + sizeof(data_t) * C() * mkldnn_get_max_threads()); + + if (!is_training()) { + scratchpad.book(key_bnorm_tmp_mean, sizeof(data_t) * C()); + scratchpad.book(key_bnorm_tmp_var, sizeof(data_t) * C()); + } + } + } }; typedef typename prec_traits::type data_t; - ncsp_batch_normalization_fwd_t(const pd_t *pd, const input_vector &inputs, - const output_vector &outputs); - ~ncsp_batch_normalization_fwd_t(); + ncsp_batch_normalization_fwd_t(const pd_t *apd, const input_vector &inputs, + const output_vector &outputs) + : cpu_primitive_t(apd, inputs, outputs) {} + ~ncsp_batch_normalization_fwd_t() {} - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - data_t *stats_reduction_, *tmp_mean_, *tmp_variance_; - void execute_forward(); - pd_t conf_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; struct ncsp_batch_normalization_bwd_t : public cpu_primitive_t { @@ -95,14 +113,14 @@ struct 
ncsp_batch_normalization_bwd_t : public cpu_primitive_t { const primitive_attr_t *attr, const batch_normalization_fwd_pd_t *hint_fwd_pd) : cpu_batch_normalization_bwd_pd_t( - engine, adesc, attr, hint_fwd_pd) {} + engine, adesc, attr, hint_fwd_pd) {} DECLARE_COMMON_PD_T("ncsp_bnorm:any", ncsp_batch_normalization_bwd_t); virtual status_t init() override { - using namespace prop_kind; using namespace data_type; assert(engine()->kind() == engine_kind::cpu); + bool ok = true && is_bwd() && !has_zero_dim_memory() @@ -112,42 +130,54 @@ struct ncsp_batch_normalization_bwd_t : public cpu_primitive_t { && utils::one_of(data_pd_.desc()->format, memory_format::nchw, memory_format::ncdhw, memory_format::nc) && attr()->has_default_values(); - if (!ok) - return status::unimplemented; + if (!ok) return status::unimplemented; if (fuse_bn_relu()) { bn_init_default_ws(this, this->workspace_pd_, 8); const size_t this_ws_sz - = memory_desc_wrapper(this->workspace_pd()).size(); - - bool ws_ok = true && hint_fwd_pd_->workspace_pd() - && memory_desc_wrapper(hint_fwd_pd_->workspace_pd()) - .size() - == this_ws_sz; - if (!ws_ok) - return status::unimplemented; + = memory_desc_wrapper(this->workspace_pd()).size(); + + bool ws_ok = true + && hint_fwd_pd_->workspace_pd() + && memory_desc_wrapper(hint_fwd_pd_->workspace_pd()).size() + == this_ws_sz; + if (!ws_ok) return status::unimplemented; } + init_scratchpad(); + return success; } + + private: + void init_scratchpad() { + using namespace memory_tracking::names; + auto scratchpad = scratchpad_registry().registrar(); + scratchpad.book(key_bnorm_reduction, + sizeof(data_t) * 2 * C() * mkldnn_get_max_threads()); + if (!(use_scaleshift() && desc()->prop_kind == prop_kind::backward)) + scratchpad.book(key_bnorm_tmp_diff_ss, + sizeof(data_t) * 2 * C()); + } }; typedef typename prec_traits::type data_t; - ncsp_batch_normalization_bwd_t(const pd_t *pd, const input_vector &inputs, - const output_vector &outputs); - ~ncsp_batch_normalization_bwd_t(); - virtual void execute(event_t *e) { + ncsp_batch_normalization_bwd_t(const pd_t *apd, const input_vector &inputs, + const output_vector &outputs) + : cpu_primitive_t(apd, inputs, outputs) {} + ~ncsp_batch_normalization_bwd_t() {} + + virtual void execute(event_t *e) const { execute_backward(); e->set_state(event_t::ready); } private: - void execute_backward(); - pd_t conf_; - - data_t *stats_reduction_, *tmp_diff_scaleshift_; + void execute_backward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; + } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/nhwc_pooling.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/nhwc_pooling.cpp index 1fc4788..553fddc 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/nhwc_pooling.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/nhwc_pooling.cpp @@ -58,7 +58,7 @@ namespace nhwc_pooling { template void nhwc_pooling_fwd_t::array_div_by_const(const int n, - const data_t *src, const size_t num, data_t *dst) + const data_t *src, const size_t num, data_t *dst) const { for (int i = 0; i < n; ++i) { @@ -69,8 +69,8 @@ void nhwc_pooling_fwd_t::array_div_by_const(const int n, } template -void nhwc_pooling_fwd_t::array_add(const int n, - const data_t *src, data_t *dst) +void nhwc_pooling_fwd_t::array_add(const int n, const data_t *src, + data_t *dst) const { for (int i = 0; i < n; ++i) { @@ -79,44 +79,44 @@ void nhwc_pooling_fwd_t::array_add(const int n, } template -void nhwc_pooling_fwd_t::execute_forward() { +void nhwc_pooling_fwd_t::execute_forward() 
const { using namespace alg_kind; using namespace prop_kind; using namespace nhwc_pooling; - auto alg = conf_.desc()->alg_kind; + auto alg = pd()->desc()->alg_kind; auto src = reinterpret_cast(this->input_memory(0)); auto dst = reinterpret_cast(this->memory(0)); unsigned char * ws = reinterpret_cast( alg == pooling_max - && conf_.desc()->prop_kind == forward_training ? + && pd()->desc()->prop_kind == forward_training ? this->memory(1) : nullptr ); - const memory_desc_wrapper MEM_D(dst)(conf_.dst_pd()); - const memory_desc_wrapper MEM_D(ws)(conf_.workspace_pd()); - const memory_desc_wrapper MEM_D(src)(conf_.src_pd()); - - const int ID = conf_.ID(); - const int IH = conf_.IH(); - const int IW = conf_.IW(); - const int KD = conf_.KD(); - const int KH = conf_.KH(); - const int KW = conf_.KW(); - const int SD = conf_.KSD(); - const int SH = conf_.KSH(); - const int SW = conf_.KSW(); - const int padF = conf_.padFront(); - const int padT = conf_.padT(); - const int padL = conf_.padL(); - const int MB = conf_.MB(); - const int OC = conf_.C(); - const int OD = conf_.OD(); - const int OH = conf_.OH(); - const int OW = conf_.OW(); - - const bool is_3d = conf_.desc()->src_desc.ndims == 5; + const memory_desc_wrapper MEM_D(dst)(pd()->dst_pd()); + const memory_desc_wrapper MEM_D(ws)(pd()->workspace_pd()); + const memory_desc_wrapper MEM_D(src)(pd()->src_pd()); + + const int ID = pd()->ID(); + const int IH = pd()->IH(); + const int IW = pd()->IW(); + const int KD = pd()->KD(); + const int KH = pd()->KH(); + const int KW = pd()->KW(); + const int SD = pd()->KSD(); + const int SH = pd()->KSH(); + const int SW = pd()->KSW(); + const int padF = pd()->padFront(); + const int padT = pd()->padT(); + const int padL = pd()->padL(); + const int MB = pd()->MB(); + const int OC = pd()->C(); + const int OD = pd()->OD(); + const int OH = pd()->OH(); + const int OW = pd()->OW(); + + const bool is_3d = pd()->desc()->src_desc.ndims == 5; const data_type_t ws_dt = ws ? ws_d.data_type() : data_type::undef; DECLARE_READ_STRIDES(src); @@ -234,38 +234,38 @@ void nhwc_pooling_fwd_t::execute_forward() { } template -void nhwc_pooling_bwd_t::execute_backward() { +void nhwc_pooling_bwd_t::execute_backward() const { using namespace alg_kind; using namespace nhwc_pooling; auto diff_dst = reinterpret_cast(this->input_memory(0)); - auto ws = conf_.desc()->alg_kind != alg_kind::pooling_max ? nullptr + auto ws = pd()->desc()->alg_kind != alg_kind::pooling_max ? 
nullptr : reinterpret_cast(this->input_memory(1)); auto diff_src = reinterpret_cast(this->memory(0)); - const memory_desc_wrapper MEM_D(diff_dst)(conf_.diff_dst_pd()); - const memory_desc_wrapper MEM_D(ws)(conf_.workspace_pd()); - const memory_desc_wrapper MEM_D(diff_src)(conf_.diff_src_pd()); - - const int ID = conf_.ID(); - const int IH = conf_.IH(); - const int IW = conf_.IW(); - const int KD = conf_.KD(); - const int KH = conf_.KH(); - const int KW = conf_.KW(); - const int SD = conf_.KSD(); - const int SH = conf_.KSH(); - const int SW = conf_.KSW(); - const int OC = conf_.C(); - const int padF = conf_.padFront(); - const int padT = conf_.padT(); - const int padL = conf_.padL(); - const int OD = conf_.OD(); - const int OH = conf_.OH(); - const int OW = conf_.OW(); - - const bool is_3d = conf_.desc()->diff_src_desc.ndims == 5; - auto alg = conf_.desc()->alg_kind; + const memory_desc_wrapper MEM_D(diff_dst)(pd()->diff_dst_pd()); + const memory_desc_wrapper MEM_D(ws)(pd()->workspace_pd()); + const memory_desc_wrapper MEM_D(diff_src)(pd()->diff_src_pd()); + + const int ID = pd()->ID(); + const int IH = pd()->IH(); + const int IW = pd()->IW(); + const int KD = pd()->KD(); + const int KH = pd()->KH(); + const int KW = pd()->KW(); + const int SD = pd()->KSD(); + const int SH = pd()->KSH(); + const int SW = pd()->KSW(); + const int OC = pd()->C(); + const int padF = pd()->padFront(); + const int padT = pd()->padT(); + const int padL = pd()->padL(); + const int OD = pd()->OD(); + const int OH = pd()->OH(); + const int OW = pd()->OW(); + + const bool is_3d = pd()->desc()->diff_src_desc.ndims == 5; + auto alg = pd()->desc()->alg_kind; DECLARE_READ_STRIDES(diff_src); DECLARE_READ_STRIDES(diff_dst); @@ -274,7 +274,7 @@ void nhwc_pooling_bwd_t::execute_backward() { return (index > offset) ? 
index - offset : 0; }; - const int MB = conf_.MB(); + const int MB = pd()->MB(); parallel_nd(MB, ID, IH, IW, [&](int mb, int id, int ih, int iw) { diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/nhwc_pooling.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/nhwc_pooling.hpp index 91cb2ab..c510b77 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/nhwc_pooling.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/nhwc_pooling.hpp @@ -90,27 +90,27 @@ struct nhwc_pooling_fwd_t: public cpu_primitive_t { } }; - nhwc_pooling_fwd_t(const pd_t *pd, const input_vector &inputs, + nhwc_pooling_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); + void execute_forward() const; void array_div_by_const(const int n, const data_t *src, const size_t num, - data_t *dst); - void array_add(const int n, const data_t *src, data_t *dst); + data_t *dst) const; + void array_add(const int n, const data_t *src, data_t *dst) const; template void array_nhwc_max(const int n, data_t *dst, const data_t *src, unsigned char *ws, const size_t ws_offset, const data_type_t ws_dt, - const int index) { + const int index) const { assert(!((use_workspace == false) ^ (!ws))); // ensure ws pointer exists PRAGMA_OMP_SIMD() for (int oc = 0; oc < n; ++oc) { @@ -158,7 +158,7 @@ private: template void array_nhwc_initialize(const int n, data_t *dst, unsigned char *ws, - const size_t ws_offset, const data_type_t ws_dt) { + const size_t ws_offset, const data_type_t ws_dt) const { assert(!((use_workspace == false) ^ (!ws))); // ensure ws pointer exists for (int oc = 0; oc < n; ++oc) { if (use_workspace) { @@ -172,7 +172,7 @@ private: } } - pd_t conf_; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; template @@ -224,19 +224,19 @@ struct nhwc_pooling_bwd_t: public cpu_primitive_t { } }; - nhwc_pooling_bwd_t(const pd_t *pd, const input_vector &inputs, + nhwc_pooling_bwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_backward(); e->set_state(event_t::ready); } private: - void execute_backward(); - pd_t conf_; + void execute_backward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; }// namespace cpu diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/nspc_batch_normalization.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/nspc_batch_normalization.cpp index 96eb50b..f7162ff 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/nspc_batch_normalization.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/nspc_batch_normalization.cpp @@ -18,9 +18,12 @@ #include #include "c_types_map.hpp" +#include "type_helpers.hpp" + +#include "cpu_batch_normalization_utils.hpp" #include "jit_generator.hpp" + #include "nspc_batch_normalization.hpp" -#include "type_helpers.hpp" // clang 6 and 7 generate incorrect code with OMP_SIMD in some particular cases #if (defined __clang_major__) && (__clang_major__ >= 6) @@ -33,36 +36,21 @@ namespace mkldnn { namespace impl { namespace cpu { 
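// Editorial note (not part of the patch): the pooling and batch-norm hunks
// above all apply the same two refactorings. First, primitives stop copying
// their descriptor into a `pd_t conf_` member and instead downcast the
// base-class pointer on demand, which lets execute() and its helpers become
// const. Second, constructor-time malloc/free of temporaries (reduction
// buffers, tmp mean/variance) is replaced by booking sizes in a scratchpad
// registry at descriptor-init time and fetching pointers inside the const
// execute(). A minimal compilable sketch of both ideas; `registry_t`,
// `my_bnorm_fwd_t` and the key enum are hypothetical stand-ins for the
// mkl-dnn classes:
#include <cstddef>
#include <unordered_map>
#include <vector>

enum key_t { key_bnorm_reduction, key_bnorm_tmp_mean };

struct registry_t {                 // stands in for memory_tracking
    void book(key_t k, std::size_t bytes) { size_[k] = bytes; }
    template <typename T> T *get(key_t k) {
        auto &buf = buf_[k];
        buf.resize(size_[k]);       // grabbed per execution, never owned
        return reinterpret_cast<T *>(buf.data());
    }
private:
    std::unordered_map<int, std::size_t> size_;
    std::unordered_map<int, std::vector<char>> buf_;
};

struct my_bnorm_fwd_t {
    struct pd_t {
        int C = 0;
        void init_scratchpad(registry_t &r) const {
            // sizes are decided once, when the descriptor is created
            r.book(key_bnorm_reduction, sizeof(float) * C);
            r.book(key_bnorm_tmp_mean, sizeof(float) * C);
        }
    };

    my_bnorm_fwd_t(const pd_t *apd, registry_t *r) : pd_(apd), scratch_(r) {}

    // const: the primitive holds no mutable buffers, so it needs no
    // destructor and one instance is safe to share between callers
    void execute_forward() const {
        float *ws_reduce = scratch_->get<float>(key_bnorm_reduction);
        float *mean = scratch_->get<float>(key_bnorm_tmp_mean);
        for (int c = 0; c < pd()->C; ++c) mean[c] = ws_reduce[c] = 0.f;
    }

    // the downcasting accessor the patch adds to every implementation
    const pd_t *pd() const { return pd_; }

private:
    const pd_t *pd_;
    registry_t *scratch_;
};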
-typedef float data_t; -nspc_batch_normalization_fwd_t::nspc_batch_normalization_fwd_t(const pd_t *pd, - const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), stats_reduction_(nullptr), - tmp_mean_(nullptr), tmp_variance_(nullptr), conf_(*pd) { - if (!conf_.stats_is_src()) { - this->stats_reduction_ = (data_t *)malloc( - nstl::max(conf_.C(), 16) * mkldnn_get_max_threads() * sizeof(data_t), 64); - this->tmp_mean_ = (data_t *)malloc(mkldnn_get_max_threads() * - nstl::max(conf_.C(), 16) * sizeof(data_t), 64); - this->tmp_variance_ - = (data_t *)malloc(mkldnn_get_max_threads() * - nstl::max(conf_.C(), 16) * sizeof(data_t), 64); - } -} -nspc_batch_normalization_fwd_t::~nspc_batch_normalization_fwd_t() { - if (!conf_.stats_is_src()) { - free(this->stats_reduction_); - free(this->tmp_mean_); - free(this->tmp_variance_); - } -} +using namespace memory_tracking::names; -void nspc_batch_normalization_fwd_t::execute_forward() { +void nspc_batch_normalization_fwd_t::execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); - const bool save_stats = conf_.is_training(); - const bool is_training = conf_.is_training(); - const bool fuse_bn_relu = conf_.fuse_bn_relu(); - const bool calculate_stats = !conf_.stats_is_src(); - const bool with_relu = conf_.with_relu_post_op(); + + const bool save_stats = pd()->is_training(); + const bool is_training = pd()->is_training(); + const bool fuse_bn_relu = pd()->fuse_bn_relu(); + const bool calculate_stats = !pd()->stats_is_src(); + const bool with_relu = pd()->with_relu_post_op(); + + auto scratchpad = this->scratchpad(); + auto tmp_mean = scratchpad.get(key_bnorm_tmp_mean); + auto tmp_var = scratchpad.get(key_bnorm_tmp_var); + data_t *mean, *variance; if (!calculate_stats) { mean = reinterpret_cast( @@ -74,24 +62,24 @@ void nspc_batch_normalization_fwd_t::execute_forward() { mean = reinterpret_cast(this->memory(1)); variance = reinterpret_cast(this->memory(2)); } else { - mean = this->tmp_mean_; - variance = this->tmp_variance_; + mean = tmp_mean; + variance = tmp_var; } } - auto idx_scaleshift = 1 + 2 * conf_.stats_is_src(); + auto idx_scaleshift = 1 + 2 * pd()->stats_is_src(); auto scaleshift = reinterpret_cast( this->input_memory(idx_scaleshift)); auto dst = reinterpret_cast(this->memory(0)); - auto ws = reinterpret_cast(this->memory(conf_.ws_idx())); - auto ws_reduce = this->stats_reduction_; + auto ws = reinterpret_cast(this->memory(pd()->ws_idx())); + auto *ws_reduce = scratchpad.get(key_bnorm_reduction); - const int N = conf_.MB(); - const int C = conf_.C(); - const int SP = conf_.H() * conf_.W() * conf_.D(); + const int N = pd()->MB(); + const int C = pd()->C(); + const int SP = pd()->H() * pd()->W() * pd()->D(); - const float eps = conf_.desc()->batch_norm_epsilon; - const bool use_scaleshift = conf_.use_scaleshift(); + const float eps = pd()->desc()->batch_norm_epsilon; + const bool use_scaleshift = pd()->use_scaleshift(); auto maybe_post_op = [&](data_t res) { return (with_relu && res < 0) ? 
0 : res; }; @@ -100,8 +88,8 @@ void nspc_batch_normalization_fwd_t::execute_forward() { int N_s = 0, N_e = 0, C_s = 0, C_e = 0; balance211(N, nthr, ithr, N_s, N_e); balance211(C, nthr, ithr, C_s, C_e); - data_t *mean_loc = this->tmp_mean_ + nstl::max(C, 16)*ithr; - data_t *variance_loc = this->tmp_variance_ + nstl::max(C,16)*ithr; + data_t *mean_loc = tmp_mean + nstl::max(C, 16) * ithr; + data_t *variance_loc = tmp_var + nstl::max(C, 16) * ithr; if (calculate_stats) { for (int c = 0; c < C; c++) @@ -187,45 +175,32 @@ void nspc_batch_normalization_fwd_t::execute_forward() { }); } -nspc_batch_normalization_bwd_t::nspc_batch_normalization_bwd_t(const pd_t *pd, - const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) { - this->stats_reduction_ = (data_t *)malloc( - conf_.C() * 2 * mkldnn_get_max_threads() * sizeof(data_t), 64); - this->tmp_diff_scaleshift_ - = (data_t *)malloc((mkldnn_get_max_threads() + 1) * conf_.C() * 2 * - sizeof(data_t), 64); -} -nspc_batch_normalization_bwd_t::~nspc_batch_normalization_bwd_t() { - free(this->stats_reduction_); - free(this->tmp_diff_scaleshift_); -} - - -void nspc_batch_normalization_bwd_t::execute_backward() { +void nspc_batch_normalization_bwd_t::execute_backward() const { auto src = reinterpret_cast(this->input_memory(0)); auto mean = reinterpret_cast(this->input_memory(1)); auto variance = reinterpret_cast(this->input_memory(2)); auto diff_dst = reinterpret_cast(this->input_memory(3)); auto scaleshift = reinterpret_cast(this->input_memory(4)); auto ws = reinterpret_cast( - this->input_memory(conf_.ws_idx())); + this->input_memory(pd()->ws_idx())); + + auto scratchpad = this->scratchpad(); + auto tmp_diff_ss = scratchpad.get(key_bnorm_tmp_diff_ss); auto diff_src = reinterpret_cast(this->memory(0)); - auto diff_scaleshift = (this->memory(1)) ? - reinterpret_cast(this->memory(1)) : - this->tmp_diff_scaleshift_; + auto diff_scaleshift = this->memory(1) + ? 
reinterpret_cast(this->memory(1)) : tmp_diff_ss; - const int N = conf_.MB(); - const int C = conf_.C(); - const int SP = conf_.D() * conf_.H() * conf_.W(); + const int N = pd()->MB(); + const int C = pd()->C(); + const int SP = pd()->D() * pd()->H() * pd()->W(); data_t *diff_gamma = diff_scaleshift, *diff_beta = diff_scaleshift + C; - data_t *ws_reduce = this->stats_reduction_; + auto *ws_reduce = scratchpad.get(key_bnorm_reduction); - const float eps = conf_.desc()->batch_norm_epsilon; - const bool use_scaleshift = conf_.use_scaleshift(); - const bool calculate_diff_stats = !conf_.omit_stats(); - const bool fuse_bn_relu = conf_.fuse_bn_relu(); + const float eps = pd()->desc()->batch_norm_epsilon; + const bool use_scaleshift = pd()->use_scaleshift(); + const bool calculate_diff_stats = !pd()->use_global_stats(); + const bool fuse_bn_relu = pd()->fuse_bn_relu(); assert(mkldnn_thr_syncable()); parallel(0, [&](const int ithr, const int nthr) { @@ -233,9 +208,8 @@ void nspc_batch_normalization_bwd_t::execute_backward() { balance211(N, nthr, ithr, N_s, N_e); balance211(C, nthr, ithr, C_s, C_e); - data_t *diff_gamma_loc = this->tmp_diff_scaleshift_ + 2*C + C*ithr; - data_t *diff_beta_loc = this->tmp_diff_scaleshift_ + 2*C + C*nthr - + C*ithr; + data_t *diff_gamma_loc = tmp_diff_ss + 2 * C + C * ithr; + data_t *diff_beta_loc = tmp_diff_ss + 2 * C + C * (nthr + ithr); for (int c = 0; c < C; c++) { ws_reduce[C * ithr + c] = 0.; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/nspc_batch_normalization.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/nspc_batch_normalization.hpp index 168caf9..6c1ec25 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/nspc_batch_normalization.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/nspc_batch_normalization.hpp @@ -20,11 +20,12 @@ #include #include "c_types_map.hpp" -#include "cpu_batch_normalization_pd.hpp" -#include "cpu_engine.hpp" +#include "memory_tracking.hpp" #include "type_helpers.hpp" #include "utils.hpp" +#include "cpu_batch_normalization_pd.hpp" + namespace mkldnn { namespace impl { namespace cpu { @@ -40,9 +41,11 @@ struct nspc_batch_normalization_fwd_t : public cpu_primitive_t { DECLARE_COMMON_PD_T("nspc_bnorm:any", nspc_batch_normalization_fwd_t); virtual status_t init() override { - using namespace prop_kind; using namespace data_type; + using namespace prop_kind; + assert(engine()->kind() == engine_kind::cpu); + bool ok = true /* the algorithm requires barriers while switching * between parallelization over N and C dimensions */ @@ -54,8 +57,7 @@ struct nspc_batch_normalization_fwd_t : public cpu_primitive_t { desc()->data_scaleshift_desc.data_type == f32) && utils::one_of(data_pd_.desc()->format, memory_format::nhwc) && (attr()->has_default_values() || this->with_relu_post_op()); - if (!ok) - return status::unimplemented; + if (!ok) return status::unimplemented; if (is_training() && fuse_bn_relu()) bn_init_default_ws(this, this->workspace_pd_, 8); @@ -63,31 +65,45 @@ struct nspc_batch_normalization_fwd_t : public cpu_primitive_t { if (stats_is_src() || is_training()) { memory_desc_t stats_d; dims_t stats_dims = { C() }; - mkldnn_memory_desc_init(&stats_d, 1, stats_dims, data_type::f32, - memory_format::x); + mkldnn_memory_desc_init(&stats_d, 1, stats_dims, + data_type::f32, memory_format::x); mean_pd_ = cpu_memory_t::pd_t(engine_, &stats_d); variance_pd_ = cpu_memory_t::pd_t(engine_, &stats_d); } + init_scratchpad(); + return status::success; } + + private: + void init_scratchpad() { + using namespace 
memory_tracking::names; + auto scratchpad = scratchpad_registry().registrar(); + if (!stats_is_src()) { + int sz = nstl::max(C(), 16) * mkldnn_get_max_threads(); + scratchpad.book(key_bnorm_reduction, sizeof(data_t) * sz); + scratchpad.book(key_bnorm_tmp_mean, sizeof(data_t) * sz); + scratchpad.book(key_bnorm_tmp_var, sizeof(data_t) * sz); + } + } }; typedef typename prec_traits::type data_t; - nspc_batch_normalization_fwd_t(const pd_t *pd, const input_vector &inputs, - const output_vector &outputs); - ~nspc_batch_normalization_fwd_t(); - virtual void execute(event_t *e) { + nspc_batch_normalization_fwd_t(const pd_t *apd, const input_vector &inputs, + const output_vector &outputs) + : cpu_primitive_t(apd, inputs, outputs) {} + ~nspc_batch_normalization_fwd_t() {} + + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - data_t *stats_reduction_; - data_t *tmp_mean_, *tmp_variance_; - void execute_forward(); - pd_t conf_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; struct nspc_batch_normalization_bwd_t : public cpu_primitive_t { @@ -101,9 +117,11 @@ struct nspc_batch_normalization_bwd_t : public cpu_primitive_t { DECLARE_COMMON_PD_T("nspc_bnorm:any", nspc_batch_normalization_bwd_t); virtual status_t init() override { - using namespace prop_kind; using namespace data_type; + using namespace prop_kind; + assert(engine()->kind() == engine_kind::cpu); + bool ok = true /* the algorithm requires barriers while switching * between parallelization over N and C dimensions */ @@ -115,42 +133,53 @@ struct nspc_batch_normalization_bwd_t : public cpu_primitive_t { desc()->data_scaleshift_desc.data_type == f32) && utils::one_of(data_pd_.desc()->format, memory_format::nhwc) && (attr()->has_default_values() || this->with_relu_post_op()); - if (!ok) - return status::unimplemented; + if (!ok) return status::unimplemented; if (fuse_bn_relu()) { bn_init_default_ws(this, this->workspace_pd_, 8); const size_t this_ws_sz - = memory_desc_wrapper(this->workspace_pd()).size(); - - bool ws_ok = true && hint_fwd_pd_->workspace_pd() - && memory_desc_wrapper(hint_fwd_pd_->workspace_pd()) - .size() - == this_ws_sz; - if (!ws_ok) - return status::unimplemented; + = memory_desc_wrapper(this->workspace_pd()).size(); + + bool ws_ok = true + && hint_fwd_pd_->workspace_pd() + && memory_desc_wrapper(hint_fwd_pd_->workspace_pd()).size() + == this_ws_sz; + if (!ws_ok) return status::unimplemented; } + init_scratchpad(); + return status::success; } + + private: + void init_scratchpad() { + using namespace memory_tracking::names; + auto scratchpad = scratchpad_registry().registrar(); + scratchpad.book(key_bnorm_reduction, + sizeof(data_t) * 2 * C() * mkldnn_get_max_threads()); + scratchpad.book(key_bnorm_tmp_diff_ss, sizeof(data_t) * 2 * C() + * (mkldnn_get_max_threads() + 1)); + } }; typedef typename prec_traits::type data_t; - nspc_batch_normalization_bwd_t(const pd_t *pd, const input_vector &inputs, - const output_vector &outputs); - ~nspc_batch_normalization_bwd_t(); - virtual void execute(event_t *e) { + nspc_batch_normalization_bwd_t(const pd_t *apd, const input_vector &inputs, + const output_vector &outputs) + : cpu_primitive_t(apd, inputs, outputs) {} + ~nspc_batch_normalization_bwd_t() {} + + virtual void execute(event_t *e) const { execute_backward(); e->set_state(event_t::ready); } private: - data_t *stats_reduction_; - data_t *tmp_diff_scaleshift_; - void execute_backward(); - pd_t conf_; + void 
execute_backward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; + } } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_batch_normalization.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_batch_normalization.cpp index 65570f1..f009d85 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_batch_normalization.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_batch_normalization.cpp @@ -28,51 +28,51 @@ namespace impl { namespace cpu { template -void ref_batch_normalization_fwd_t::execute_forward() { +void ref_batch_normalization_fwd_t::execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); /* FIXME: check this */ - data_t* mean = conf_.stats_is_src() ? + data_t* mean = pd()->stats_is_src() ? const_cast(reinterpret_cast( this->input_memory(1))) : reinterpret_cast(this->memory(1)); - data_t* variance = conf_.stats_is_src() ? + data_t* variance = pd()->stats_is_src() ? const_cast(reinterpret_cast( this->input_memory(2))) : reinterpret_cast(this->memory(2)); - auto idx_scaleshift = 1 + 2*conf_.stats_is_src(); + auto idx_scaleshift = 1 + 2*pd()->stats_is_src(); auto scaleshift = reinterpret_cast(this->input_memory(idx_scaleshift)); auto dst = reinterpret_cast(this->memory(0)); - auto ws = reinterpret_cast(this->memory(conf_.ws_idx())); + auto ws = reinterpret_cast(this->memory(pd()->ws_idx())); /* fast return */ - if (this->conf_.has_zero_dim_memory()) return; + if (this->pd()->has_zero_dim_memory()) return; - const memory_desc_wrapper data_d(conf_.src_pd()); - const memory_desc_wrapper scaleshift_d(conf_.weights_pd()); + const memory_desc_wrapper data_d(pd()->src_pd()); + const memory_desc_wrapper scaleshift_d(pd()->weights_pd()); - const int N = conf_.MB(); - const int C = conf_.C(); + const int N = pd()->MB(); + const int C = pd()->C(); int H = 1, W = 1, D = 1; const bool has_spatial = utils::one_of(data_d.ndims(), 4 ,5); if (has_spatial) { - D = conf_.D(); - H = conf_.H(); - W = conf_.W(); + D = pd()->D(); + H = pd()->H(); + W = pd()->W(); } - const float eps = conf_.desc()->batch_norm_epsilon; - const bool use_scaleshift = conf_.use_scaleshift();; - const bool save_stats = conf_.is_training(); - const bool is_training = conf_.is_training(); - const bool fuse_bn_relu = conf_.fuse_bn_relu(); - const bool calculate_stats = !conf_.stats_is_src(); + const float eps = pd()->desc()->batch_norm_epsilon; + const bool use_scaleshift = pd()->use_scaleshift();; + const bool save_stats = pd()->is_training(); + const bool is_training = pd()->is_training(); + const bool fuse_bn_relu = pd()->fuse_bn_relu(); + const bool calculate_stats = !pd()->stats_is_src(); - const bool with_relu = conf_.with_relu_post_op(); + const bool with_relu = pd()->with_relu_post_op(); auto maybe_post_op = [&](data_t res) { return (with_relu && res < 0) ? 
0 : res; }; @@ -146,29 +146,29 @@ void ref_batch_normalization_fwd_t::execute_forward() { template struct ref_batch_normalization_fwd_t; template -void ref_batch_normalization_bwd_t::execute_backward() { +void ref_batch_normalization_bwd_t::execute_backward() const { auto src = reinterpret_cast(this->input_memory(0)); auto mean = reinterpret_cast(this->input_memory(1)); auto variance = reinterpret_cast(this->input_memory(2)); auto diff_dst = reinterpret_cast(this->input_memory(3)); auto scaleshift = reinterpret_cast(this->input_memory(4)); auto ws = reinterpret_cast( - this->input_memory(conf_.ws_idx())); + this->input_memory(pd()->ws_idx())); auto diff_src = reinterpret_cast(this->memory(0)); auto diff_scaleshift = reinterpret_cast(this->memory(1)); - const memory_desc_wrapper data_d(conf_.src_pd()); - const memory_desc_wrapper diff_data_d(conf_.diff_src_pd()); - const memory_desc_wrapper scaleshift_d(conf_.weights_pd()); - const memory_desc_wrapper diff_scaleshift_d(conf_.diff_weights_pd()); - const memory_desc_wrapper mean_d(conf_.mean_pd()); - const memory_desc_wrapper variance_d(conf_.variance_pd()); + const memory_desc_wrapper data_d(pd()->src_pd()); + const memory_desc_wrapper diff_data_d(pd()->diff_src_pd()); + const memory_desc_wrapper scaleshift_d(pd()->weights_pd()); + const memory_desc_wrapper diff_scaleshift_d(pd()->diff_weights_pd()); + const memory_desc_wrapper mean_d(pd()->mean_pd()); + const memory_desc_wrapper variance_d(pd()->variance_pd()); - const int C = conf_.C(); + const int C = pd()->C(); /* fast return */ - if (this->conf_.has_zero_dim_memory()) { + if (this->pd()->has_zero_dim_memory()) { if (diff_scaleshift) { for (int c = 0; c < C; ++c) { diff_scaleshift[diff_scaleshift_d.off(0, c)] = 0; @@ -178,20 +178,20 @@ void ref_batch_normalization_bwd_t::execute_backward() { return; } - const int N = conf_.MB(); + const int N = pd()->MB(); int H = 1, W = 1, D = 1; const bool has_spatial = utils::one_of(data_d.ndims(), 4 ,5); if (has_spatial) { - D = conf_.D(); - H = conf_.H(); - W = conf_.W(); + D = pd()->D(); + H = pd()->H(); + W = pd()->W(); } - const float eps = conf_.desc()->batch_norm_epsilon; - const bool use_scaleshift = conf_.use_scaleshift(); - const bool calculate_diff_stats = !conf_.omit_stats(); - const bool fuse_bn_relu = conf_.fuse_bn_relu(); + const float eps = pd()->desc()->batch_norm_epsilon; + const bool use_scaleshift = pd()->use_scaleshift(); + const bool calculate_diff_stats = !pd()->use_global_stats(); + const bool fuse_bn_relu = pd()->fuse_bn_relu(); const bool is_3d = data_d.ndims() == 5; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_batch_normalization.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_batch_normalization.hpp index 95bf343..a3e53a0 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_batch_normalization.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_batch_normalization.hpp @@ -67,19 +67,19 @@ struct ref_batch_normalization_fwd_t: public cpu_primitive_t { } }; - ref_batch_normalization_fwd_t(const pd_t *pd, const input_vector &inputs, + ref_batch_normalization_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - pd_t conf_; + void execute_forward() const; + 
const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; template @@ -132,19 +132,19 @@ struct ref_batch_normalization_bwd_t: public cpu_primitive_t { } }; - ref_batch_normalization_bwd_t(const pd_t *pd, const input_vector &inputs, + ref_batch_normalization_bwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_backward(); e->set_state(event_t::ready); } private: - void execute_backward(); - pd_t conf_; + void execute_backward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_binarization.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_binarization.cpp new file mode 100644 index 0000000..4fa9372 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_binarization.cpp @@ -0,0 +1,86 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include +#include +#include + +#include "c_types_map.hpp" +#include "type_helpers.hpp" +#include "mkldnn_thread.hpp" + +#include "ref_binarization.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +using namespace alg_kind; + +template +void ref_binarization_fwd_t::execute_forward() const { + auto src = reinterpret_cast(this->input_memory(0)); + auto weights = reinterpret_cast(this->input_memory(1)); + auto dst = reinterpret_cast(this->memory()); + + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + + int nbits = 8; + + const int MB = pd()->MB(); + const int C = pd()->C(); + const int CB = utils::div_up(C, nbits); + const int D = pd()->D(); + const int H = pd()->H(); + const int W = pd()->W(); + + parallel_nd(MB, CB, D, H, W, + [&](int n, int cb, int d, int h, int w) { + + uint8_t bin_val = 0x00; + for (int c = cb * nbits, shift = 0; c < std::min(C, (cb + 1) * nbits); c++, shift++) { + size_t src_off = src_d.ndims() == 4 + ? src_d.off(n, c, h, w) + : src_d.ndims() == 5 + ? src_d.off(n, c, d, h, w) + : src_d.off(n, c); + + size_t wei_off = weights_d.off(c); + + float val = src[src_off]; + float thr = weights[wei_off]; + + auto bit = uint8_t((val > thr) ? 0x01 : 0x00); + bin_val |= (bit << shift); + } + + size_t dst_off = dst_d.ndims() == 4 + ? dst_d.off(n, cb*nbits, h, w) + : dst_d.ndims() == 5 + ? 
dst_d.off(n, cb, d, h, w) + : dst_d.off(n, cb); + + dst[dst_off / nbits] = bin_val; + }); +} + +template struct ref_binarization_fwd_t; + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_binarization.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_binarization.hpp new file mode 100644 index 0000000..726d700 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_binarization.hpp @@ -0,0 +1,78 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef CPU_REF_BINARIZATION_HPP +#define CPU_REF_BINARIZATION_HPP + +#include + +#include "cpu_binarization_pd.hpp" +#include "cpu_engine.hpp" +#include "type_helpers.hpp" +#include "utils.hpp" +#include "c_types_map.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +template +struct ref_binarization_fwd_t: public cpu_primitive_t { + struct pd_t: public cpu_binarization_fwd_pd_t { + pd_t(engine_t *engine, const binarization_desc_t *adesc, + const primitive_attr_t *attr, + const binarization_fwd_pd_t *hint_fwd_pd) + : cpu_binarization_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) {} + + DECLARE_COMMON_PD_T("ref:any", ref_binarization_fwd_t); + + virtual status_t init() override { + using namespace prop_kind; + assert(engine()->kind() == engine_kind::cpu); + + bool ok = true + && utils::one_of(desc()->prop_kind, forward_training, forward_inference) + && utils::everyone_is(src_type, desc()->src_desc.data_type, desc()->weights_desc.data_type) + && utils::everyone_is(data_type::bin, desc()->dst_desc.data_type) + && utils::one_of(desc()->alg_kind, mkldnn_binarization_depthwise) + && attr()->has_default_values(); + if (!ok) return status::unimplemented; + + return status::success; + } + }; + + ref_binarization_fwd_t(const pd_t *apd, const input_vector &inputs, + const output_vector &outputs) + : cpu_primitive_t(apd, inputs, outputs) {} + + typedef typename prec_traits::type src_data_t; + + virtual void execute(event_t *e) const { + execute_forward(); + e->set_state(event_t::ready); + } + +private: + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } +}; + +} +} +} + +#endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_binary_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_binary_convolution.cpp new file mode 100644 index 0000000..2c9cbde --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_binary_convolution.cpp @@ -0,0 +1,284 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
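// Editorial note (not part of the patch): the new ref_binarization kernel
// just above thresholds each channel against a per-channel weight and packs
// nbits = 8 channel bits into one byte of the "bin" destination, addressing
// the output byte as element_offset / nbits. A standalone sketch of the
// packing loop; the flat channel-contiguous layout here is an assumption
// replacing the kernel's memory_desc_wrapper offsets:
#include <algorithm>
#include <cstdint>

// Packs channels [cb*8, min(C, cb*8 + 8)) of one spatial point into a byte.
inline uint8_t pack_bin_block(const float *src, const float *thr,
        int C, int cb) {
    const int nbits = 8;
    uint8_t bin_val = 0x00;
    for (int c = cb * nbits, shift = 0;
            c < std::min(C, (cb + 1) * nbits); ++c, ++shift) {
        const uint8_t bit = src[c] > thr[c] ? 0x01 : 0x00;
        bin_val |= uint8_t(bit << shift);  // LSB-first, as in the kernel
    }
    return bin_val;
}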
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include +#include +#include "c_types_map.hpp" +#include "type_helpers.hpp" +#include "mkldnn_thread.hpp" +#include "mkldnn_traits.hpp" +#include "math_utils.hpp" + +#include "ref_binary_convolution.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +using math::saturate; + +void _ref_binary_convolution_fwd_t::execute_forward() const { + auto src = reinterpret_cast(this->input_memory(0)); + auto weights = reinterpret_cast(this->input_memory(1)); + + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + + const bool with_groups = pd()->with_groups(); + + const int G = pd()->G(); + const int MB = pd()->MB(); + const int OD = pd()->OD(); + const int OH = pd()->OH(); + const int OW = pd()->OW(); + const int ID = pd()->ID(); + const int IH = pd()->IH(); + const int IW = pd()->IW(); + + const int OC = pd()->OC() / G; + const int IC = pd()->IC() / G; + const int KD = pd()->KD(); + const int KH = pd()->KH(); + const int KW = pd()->KW(); + + const int KSD = pd()->KSD(); + const int KSH = pd()->KSH(); + const int KSW = pd()->KSW(); + + const int KDD = pd()->KDD(); + const int KDH = pd()->KDH(); + const int KDW = pd()->KDW(); + + const int padFront = pd()->padFront(); + const int padT = pd()->padT(); + const int padL = pd()->padL(); + + const float pad_value = pd()->pad_value(); + + const int ndims = pd()->cdesc()->src_desc.ndims; + + const int nbits = 8; + + const auto &p = pd()->attr()->post_ops_; + bool with_sum = p.find(primitive_kind::sum) != -1; + bool with_binarization = p.find(primitive_kind::binarization) != -1; + + auto extract_bit = [](uint8_t val, uint8_t bit) -> uint8_t { + return (uint8_t)((val >> bit) & 0x0001); + }; + + auto ker = [=](int32_t &d, int g, int mb, int oc, int od, int oh, int ow) { + for (int ic = 0; ic < IC; ++ic) + for (int kd = 0; kd < KD; ++kd) + for (int kh = 0; kh < KH; ++kh) + for (int kw = 0; kw < KW; ++kw) { + const int id = od * KSD - padFront + kd * (1 + KDD); + const int ih = oh * KSH - padT + kh * (1 + KDH); + const int iw = ow * KSW - padL + kw * (1 + KDW); + + size_t iidx = 0; + size_t widx = 0; + if (ndims == 5) { + iidx = src_d.off(mb, g * IC + ic, id, ih, iw); + widx = with_groups ? weights_d.off(g, oc, ic, kd, kh, kw) + : weights_d.off(oc, ic, kd, kh, kw); + } else if (ndims == 4) { + iidx = src_d.off(mb, g * IC + ic, ih, iw); + widx = with_groups ? weights_d.off(g, oc, ic, kh, kw) + : weights_d.off(oc, ic, kh, kw); + } else if (ndims == 3) { + iidx = src_d.off(mb, g * IC + ic, iw); + widx = with_groups ? weights_d.off(g, oc, ic, kw) + : weights_d.off(oc, ic, kw); + } else { + assert(false); + } + + + uint8_t s; + if (id < 0 || id >= ID || ih < 0 || ih >= IH || iw < 0 || iw >= IW) { + if (pad_value == 0) + continue; + else { + s = pad_value == 1.0f ? 
(uint8_t)1 : (uint8_t)0; + } + } else { + s = extract_bit(src[iidx/nbits], (uint8_t)(iidx % nbits)); + } + + uint8_t w = extract_bit(weights[widx/nbits], (uint8_t)(widx % nbits)); + + d += (int32_t)(s ^ w); + } + }; + + if (with_binarization) { + auto dst = reinterpret_cast(this->memory()); + + int binarization_idx = p.find(primitive_kind::binarization); + const float* binarization_weights = p.entry_[binarization_idx].binarization.weights_data; + + parallel_nd(G, MB, utils::div_up(OC, nbits), OD, OH, OW, + [&](int g, int mb, int ocb, int od, int oh, int ow) { + + uint8_t bin_val = 0x00; + for (int oc = ocb * nbits, shift = 0; oc < std::min(OC, (ocb + 1) * nbits); oc++, shift++) { + int32_t a = 0; + ker(a, g, mb, oc, od, oh, ow); + + float base_value; + if (pad_value == 0.0f) { + const int i_left_overflow = nstl::max(0, (padL - ow * KSW)); + const int i_right_overflow = nstl::max(IW, (ow * KSW + (KW - 1) * (KDW + 1) - padL + 1)) - IW; + const int kw_padding = + KW - utils::div_up(i_left_overflow, (KDW + 1)) - utils::div_up(i_right_overflow, (KDW + 1)); + + const int i_top_overflow = nstl::max(0, (padT - oh * KSH)); + const int i_bottom_overflow = nstl::max(IH, (oh * KSH + (KH - 1) * (KDH + 1) - padT + 1)) - IH; + const int kh_padding = + KH - utils::div_up(i_top_overflow, (KDH + 1)) - utils::div_up(i_bottom_overflow, (KDH + 1)); + + const int i_front_overflow = nstl::max(0, (padFront - od * KSD)); + const int i_back_overflow = nstl::max(ID, (od * KSD + (KD - 1) * (KDD + 1) - padFront + 1)) - ID; + const int kd_padding = + KD - utils::div_up(i_front_overflow, (KDD + 1)) - utils::div_up(i_back_overflow, (KDD + 1)); + + base_value = IC * kd_padding * kh_padding * kw_padding; + } else { + base_value = IC * KD * KH * KW; + } + + float a_fp = base_value - (float)(2 * a); + + if (with_sum) { + if (ndims == 5) + a_fp += dst[dst_d.off(mb, g * OC + oc, od, oh, ow)]; + else if (ndims == 4) + a_fp += dst[dst_d.off(mb, g * OC + oc, oh, ow)]; + else if (ndims == 3) + a_fp += dst[dst_d.off(mb, g * OC + oc, ow)]; + else + assert(false); + } + + int eltwise_inj_idx = 0; + int depthwise_inj_idx = 0; + for (int i = 0; i < p.len_; i++) { + auto &post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + a_fp = eltwise_injectors[eltwise_inj_idx]->compute_scalar(a_fp); + eltwise_inj_idx++; + } else if (post_op.is_depthwise()) { + auto depthwise_weights = post_op.depthwise.weights_data; + auto depthwise_bias = post_op.depthwise.biases_data; + + a_fp = depthwise_injectors[depthwise_inj_idx]->compute_scalar(a_fp, + depthwise_weights + g * OC + oc, + depthwise_bias + g * OC + oc); + depthwise_inj_idx++; + } + } + + float thr = binarization_weights[g * OC + oc]; + auto bit = uint8_t((a_fp > thr) ? 
0x01 : 0x00); + bin_val |= (bit << shift); + } + + if (ndims == 5) + dst[dst_d.off(mb, g*OC + ocb*nbits, od, oh, ow) / nbits] = bin_val; + else if (ndims == 4) + dst[dst_d.off(mb, g*OC + ocb*nbits, oh, ow) / nbits] = bin_val; + else if (ndims == 3) + dst[dst_d.off(mb, g*OC + ocb*nbits, ow) / nbits] = bin_val; + else + assert(false); + }); + } else { + auto dst = reinterpret_cast(this->memory()); + + parallel_nd(G, MB, OC, OD, OH, OW, + [&](int g, int mb, int oc, int od, int oh, int ow) { + int32_t a = 0; + ker(a, g, mb, oc, od, oh, ow); + + float base_value; + if (pad_value == 0.0f) { + const int i_left_overflow = nstl::max(0, (padL - ow * KSW)); + const int i_right_overflow = nstl::max(IW, (ow * KSW + (KW - 1) * (KDW + 1) - padL + 1)) - IW; + const int kw_padding = + KW - utils::div_up(i_left_overflow, (KDW + 1)) - utils::div_up(i_right_overflow, (KDW + 1)); + + const int i_top_overflow = nstl::max(0, (padT - oh * KSH)); + const int i_bottom_overflow = nstl::max(IH, (oh * KSH + (KH - 1) * (KDH + 1) - padT + 1)) - IH; + const int kh_padding = + KH - utils::div_up(i_top_overflow, (KDH + 1)) - utils::div_up(i_bottom_overflow, (KDH + 1)); + + const int i_front_overflow = nstl::max(0, (padFront - od * KSD)); + const int i_back_overflow = nstl::max(ID, (od * KSD + (KD - 1) * (KDD + 1) - padFront + 1)) - ID; + const int kd_padding = + KD - utils::div_up(i_front_overflow, (KDD + 1)) - utils::div_up(i_back_overflow, (KDD + 1)); + + base_value = IC * kd_padding * kh_padding * kw_padding; + } else { + base_value = IC * KD * KH * KW; + } + + float a_fp = base_value - (float)(2 * a); + + if (with_sum) { + if (ndims == 5) + a_fp += dst[dst_d.off(mb, g*OC + oc, od, oh, ow)]; + else if (ndims == 4) + a_fp += dst[dst_d.off(mb, g*OC + oc, oh, ow)]; + else if (ndims == 3) + a_fp += dst[dst_d.off(mb, g*OC + oc, ow)]; + else + assert(false); + } + + int eltwise_inj_idx = 0; + int depthwise_inj_idx = 0; + for (int i = 0; i < p.len_; i++) { + auto& post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + a_fp = eltwise_injectors[eltwise_inj_idx]->compute_scalar(a_fp); + eltwise_inj_idx++; + } else if (post_op.is_depthwise()) { + auto depthwise_weights = post_op.depthwise.weights_data; + auto depthwise_bias = post_op.depthwise.biases_data; + + a_fp = depthwise_injectors[depthwise_inj_idx]->compute_scalar(a_fp, depthwise_weights + g * OC + oc, + depthwise_bias + g * OC + oc); + depthwise_inj_idx++; + } + } + + if (ndims == 5) + dst[dst_d.off(mb, g*OC + oc, od, oh, ow)] = a_fp; + else if (ndims == 4) + dst[dst_d.off(mb, g*OC + oc, oh, ow)] = a_fp; + else if (ndims == 3) + dst[dst_d.off(mb, g*OC + oc, ow)] = a_fp; + else + assert(false); + }); + } +} + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_binary_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_binary_convolution.hpp new file mode 100644 index 0000000..2160d9b --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_binary_convolution.hpp @@ -0,0 +1,151 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef CPU_REF_BINARY_CONVOLUTION_HPP +#define CPU_REF_BINARY_CONVOLUTION_HPP + +#include <assert.h> + +#include "c_types_map.hpp" +#include "cpu_binary_convolution_pd.hpp" +#include "cpu_engine.hpp" +#include "type_helpers.hpp" +#include "utils.hpp" +#include "ref_eltwise.hpp" +#include "ref_depthwise.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +struct _ref_binary_convolution_fwd_t: public cpu_primitive_t { + struct pd_t: public _cpu_binary_convolution_fwd_pd_t { + pd_t(engine_t *engine, + const typename pd_t::base_desc_t *adesc, + const primitive_attr_t *attr, + const typename pd_t::base_class *hint_fwd_pd) + : _cpu_binary_convolution_fwd_pd_t(engine, adesc, attr, + hint_fwd_pd) + {} + + DECLARE_COMMON_PD_T("ref:any", _ref_binary_convolution_fwd_t); + + virtual status_t init() override { + using namespace prop_kind; + using namespace data_type; + assert(this->engine()->kind() == engine_kind::cpu); + bool ok = true + && this->set_default_params() == status::success + && utils::one_of(this->cdesc_().prop_kind, forward_training, + forward_inference) + && this->cdesc_().alg_kind == alg_kind::binary_convolution_direct + && this->cdesc_().src_desc.data_type == bin + && this->cdesc_().weights_desc.data_type == bin + && this->cdesc_().accum_data_type == s32 + && utils::one_of(this->cdesc_().dst_desc.data_type, f32, bin) + && is_supported_post_ops(); + return ok ?
status::success : status::unimplemented; + } + + virtual bool is_supported_post_ops() const { + bool ok = true; + auto const &po = this->attr()->post_ops_; + + auto is_eltwise = [&](int idx) { return po.entry_[idx].is_eltwise(); }; + auto is_depthwise = [&](int idx) { return po.entry_[idx].is_depthwise(); }; + auto is_sum = [&](int idx) { return po.entry_[idx].is_sum(); }; + auto is_simple = [&](int idx) { return (is_eltwise(idx) || is_depthwise(idx)); }; + auto is_binarization = [&](int idx) { return po.entry_[idx].is_binarization(); }; + + switch (po.len_) { + case 0: // no post_ops + break; + case 1: + ok = ok && (is_simple(0) || is_sum(0) || is_binarization(0)); + break; + case 2: + ok = ok && ((is_sum(0) && is_simple(1)) || (is_simple(0) && is_simple(1)) || + (is_simple(0) && is_binarization(1))); + break; + case 3: + ok = ok && ((is_sum(0) && is_simple(1) && is_simple(2)) || + (is_simple(0) && is_simple(1) && is_binarization(2))); + break; + + default: ok = false; + } + return ok; + } + }; + + _ref_binary_convolution_fwd_t(const pd_t *apd, const input_vector &inputs, + const output_vector &outputs) + : cpu_primitive_t(apd, inputs, outputs) { + const auto &post_ops = pd()->attr()->post_ops_; + + for (int i = 0; i < post_ops.len_; i++) { + auto &post_op = post_ops.entry_[i]; + if (post_op.is_eltwise()) { + eltwise_injectors.push_back(new ref_eltwise_scalar_fwd_t( + post_op.eltwise.alg, + post_op.eltwise.alpha, + post_op.eltwise.beta + )); + } else if (post_op.is_depthwise()) { + depthwise_injectors.push_back(new ref_depthwise_scalar_fwd_t( + post_op.depthwise.alg + )); + } + } + } + + ~_ref_binary_convolution_fwd_t() { + for (auto inj : eltwise_injectors) + delete inj; + eltwise_injectors.clear(); + + for (auto inj : depthwise_injectors) + delete inj; + depthwise_injectors.clear(); + } + + virtual void execute(event_t *e) const { + switch (pd()->cdesc()->prop_kind) { + case prop_kind::forward_training: + case prop_kind::forward_inference: + execute_forward(); + break; + default: + assert(!"invalid prop_kind"); + } + e->set_state(event_t::ready); + } + +private: + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } + + nstl::vector eltwise_injectors; + nstl::vector depthwise_injectors; +}; + +using ref_binary_convolution_fwd_t = _ref_binary_convolution_fwd_t; + +} +} +} + +#endif diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_concat.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_concat.hpp index 923bb61..5d346df 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_concat.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_concat.hpp @@ -77,7 +77,7 @@ struct ref_concat_t: public cpu_primitive_t { } return ret; } - virtual pd_t *clone() const override { return nullptr; } + virtual pd_t *clone() const override { return new pd_t(*this); } virtual const char *name() const override { return "ref:any"; } virtual status_t init() override { @@ -99,15 +99,15 @@ struct ref_concat_t: public cpu_primitive_t { } } } - return success; + return (size_t)n_ == reorder_pds_.size() ? 
success : unimplemented; } nstl::vector reorder_pds_; }; - ref_concat_t(const pd_t *conf, const input_vector &inputs, + ref_concat_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs, nstl::vector reorders) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*conf), + : cpu_primitive_t(apd, inputs, outputs), reorders_(reorders) {} ~ref_concat_t() { @@ -116,7 +116,7 @@ struct ref_concat_t: public cpu_primitive_t { delete reorders_[i]; } - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { for (size_t i = 0; i < reorders_.size(); ++i) { event_t ei; reorders_[i]->execute(&ei); @@ -125,7 +125,7 @@ struct ref_concat_t: public cpu_primitive_t { } private: - pd_t conf_; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } nstl::vector reorders_; }; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_convolution.cpp index 33b5fe0..d3e6483 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_convolution.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_convolution.cpp @@ -27,56 +27,59 @@ namespace impl { namespace cpu { using math::saturate; +using math::get_bias; -template -void _ref_convolution_fwd_t - ::execute_forward() { +void ref_convolution_fwd_t + ::execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); - const memory_desc_wrapper bias_d(conf_.weights_pd(1)); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + const memory_desc_wrapper bias_d(pd()->weights_pd(1)); - const bool with_groups = conf_.with_groups(); + const bool with_groups = pd()->with_groups(); - const int G = conf_.G(); - const int MB = conf_.MB(); - const int OD = conf_.OD(); - const int OH = conf_.OH(); - const int OW = conf_.OW(); - const int ID = conf_.ID(); - const int IH = conf_.IH(); - const int IW = conf_.IW(); + const int G = pd()->G(); + const int MB = pd()->MB(); + const int OD = pd()->OD(); + const int OH = pd()->OH(); + const int OW = pd()->OW(); + const int ID = pd()->ID(); + const int IH = pd()->IH(); + const int IW = pd()->IW(); - const int OC = conf_.OC() / G; - const int IC = conf_.IC() / G; - const int KD = conf_.KD(); - const int KH = conf_.KH(); - const int KW = conf_.KW(); + const int OC = pd()->OC() / G; + const int IC = pd()->IC() / G; + const int KD = pd()->KD(); + const int KH = pd()->KH(); + const int KW = pd()->KW(); - const int KSD = conf_.KSD(); - const int KSH = conf_.KSH(); - const int KSW = conf_.KSW(); + const int KSD = pd()->KSD(); + const int KSH = pd()->KSH(); + const int KSW = pd()->KSW(); - const int KDD = conf_.KDD(); - const int KDH = conf_.KDH(); - const int KDW = conf_.KDW(); + const int KDD = pd()->KDD(); + const int KDH = pd()->KDH(); + const int KDW = pd()->KDW(); - const int padFront = conf_.padFront(); - const int padT = conf_.padT(); - const int padL = conf_.padL(); + const int padFront = pd()->padFront(); + const int padT = pd()->padT(); + const int padL = pd()->padL(); - const float nslope = conf_.negative_slope(); + const bool with_relu = 0; // TODO: change if support post_ops 
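A note on the pattern this patch repeats file after file: the old primitives copied their primitive descriptor by value into a pd_t conf_ member, which duplicated state and kept execute() non-const; the new code hands the descriptor pointer (apd) to the cpu_primitive_t base and reads it back through a small pd() accessor cast to the derived pd_t. A minimal sketch of the idea under those assumptions — only the primitive_t/pd_t names echo the library, everything else is illustrative:

// Hypothetical, stripped-down stand-ins for the mkl-dnn classes.
struct pd_t { int OC; };

struct primitive_t {
    explicit primitive_t(const pd_t *apd) : pd_(apd) {}
    virtual ~primitive_t() {}
protected:
    const pd_t *pd_;  // borrowed, owned by the framework, never copied
};

// Before: `pd_t conf_;` member plus non-const execute().
// After: a const accessor recovers the typed descriptor, so execute() can be const.
struct ref_something_t : public primitive_t {
    explicit ref_something_t(const pd_t *apd) : primitive_t(apd) {}
    void execute() const {
        const int OC = pd()->OC;  // read-only query, no mutable copy needed
        (void)OC;
    }
private:
    const pd_t *pd() const { return pd_; }
};

Because pd() is const and returns a const pointer, every execute_forward()/execute_backward_*() in the hunks above and below can be marked const, which is exactly the mechanical change the diff applies.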
+ const float nslope = 0.f; - const int ndims = conf_.cdesc()->src_desc.ndims; + const int ndims = pd()->desc()->src_desc.ndims; - auto ker = [=](acc_data_t &d, int g, int mb, int oc, int od, int oh, + auto ker = [=](int g, int mb, int oc, int od, int oh, int ow) { + acc_data_t d = 0; for (int ic = 0; ic < IC; ++ic) for (int kd = 0; kd < KD; ++kd) for (int kh = 0; kh < KH; ++kh) @@ -107,36 +110,23 @@ void _ref_convolution_fwd_t else assert(false); - } - }; - auto get_bias = [=, &bias](size_t off) -> float { -# define CASE(dt) case dt: \ - return (float)(*((const prec_traits
::type *)bias + off)) - switch (conf_.cdesc()->bias_desc.data_type) { - CASE(data_type::s8); - CASE(data_type::u8); - CASE(data_type::s32); - CASE(data_type::f32); - default: assert(!"unimplemented"); } -# undef CASE - return 0; + return d; }; + parallel_nd(G, MB, OC, OD, OH, OW, [&](int g, int mb, int oc, int od, int oh, int ow) { - acc_data_t a = 0; - ker(a, g, mb, oc, od, oh, ow); - - float a_fp = (float)a; + float a_fp = ker(g, mb, oc, od, oh, ow); if (bias) - a_fp += get_bias(bias_d.off(g*OC + oc)); + a_fp += get_bias(bias, bias_d.off(g * OC + oc), + pd()->desc()->bias_desc.data_type); if (with_relu && a_fp < 0) a_fp *= nslope; if (data_traits::data_type != data_type::f32) { - switch (conf_.attr()->round_mode_) { + switch (pd()->attr()->round_mode_) { case round_mode::down: a_fp = floorf(a_fp); break; case round_mode::nearest: a_fp = nearbyintf(a_fp); break; } @@ -156,51 +146,52 @@ void _ref_convolution_fwd_t template void ref_convolution_bwd_data_t::execute_backward_data() { + acc_type>::execute_backward_data() const { auto diff_dst = reinterpret_cast( this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto diff_src = reinterpret_cast(this->memory()); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper diff_src_d(conf_.diff_src_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); - const memory_desc_wrapper bias_d(conf_.weights_pd(1)); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper diff_src_d(pd()->diff_src_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + const memory_desc_wrapper bias_d(pd()->weights_pd(1)); - const bool with_groups = conf_.with_groups(); + const bool with_groups = pd()->with_groups(); - const int G = conf_.G(); - const int MB = conf_.MB(); - const int OD = conf_.OD(); - const int OH = conf_.OH(); - const int OW = conf_.OW(); - const int ID = conf_.ID(); - const int IH = conf_.IH(); - const int IW = conf_.IW(); + const int G = pd()->G(); + const int MB = pd()->MB(); + const int OD = pd()->OD(); + const int OH = pd()->OH(); + const int OW = pd()->OW(); + const int ID = pd()->ID(); + const int IH = pd()->IH(); + const int IW = pd()->IW(); - const int OC = conf_.OC() / G; - const int IC = conf_.IC() / G; - const int KD = conf_.KD(); - const int KH = conf_.KH(); - const int KW = conf_.KW(); + const int OC = pd()->OC() / G; + const int IC = pd()->IC() / G; + const int KD = pd()->KD(); + const int KH = pd()->KH(); + const int KW = pd()->KW(); - const int KSD = conf_.KSD(); - const int KSH = conf_.KSH(); - const int KSW = conf_.KSW(); + const int KSD = pd()->KSD(); + const int KSH = pd()->KSH(); + const int KSW = pd()->KSW(); - const int KDD = conf_.KDD(); - const int KDH = conf_.KDH(); - const int KDW = conf_.KDW(); + const int KDD = pd()->KDD(); + const int KDH = pd()->KDH(); + const int KDW = pd()->KDW(); - const int padFront = conf_.padFront(); - const int padT = conf_.padT(); - const int padL = conf_.padL(); + const int padFront = pd()->padFront(); + const int padT = pd()->padT(); + const int padL = pd()->padL(); - const int ndims = conf_.cdesc()->diff_src_desc.ndims; + const int ndims = pd()->desc()->diff_src_desc.ndims; - auto ker = [=](acc_data_t &d, int g, int mb, int ic, int id, int ih, + auto ker = [=](int g, int mb, int ic, int id, int ih, int iw) { + acc_data_t d = 0; for (int oc = 0; oc < OC; ++oc) for (int kd = 0; kd < KD; ++kd) for (int kh = 0; kh < KH; ++kh) @@ 
-239,20 +230,9 @@ void ref_convolution_bwd_data_t acc_data_t { -# define CASE(dt) case dt: \ - return (acc_data_t)(*((const prec_traits
::type *)bias + off)) - switch (conf_.desc()->bias_desc.data_type) { - CASE(data_type::s8); - CASE(data_type::u8); - CASE(data_type::s32); - CASE(data_type::f32); - default: assert(!"unimplemented"); - } -# undef CASE - return 0; - }; + parallel_nd(G, MB, IC, ID, IH, IW, [&](int g, int mb, int ic, int id, int ih, int iw) { auto ds_idx = (ndims == 5) @@ -260,10 +240,11 @@ void ref_convolution_bwd_data_tdesc()->bias_desc.data_type) + : 0; + a += ker(g, mb, ic, id, ih, iw); diff_src[ds_idx] = saturate(a); }); } @@ -271,48 +252,48 @@ void ref_convolution_bwd_data_t void ref_convolution_bwd_weights_t::execute_backward_weights() { + acc_type>::execute_backward_weights() const { auto src = reinterpret_cast(this->input_memory(0)); auto diff_dst = reinterpret_cast( this->input_memory(1)); auto diff_weights = reinterpret_cast(this->memory(0)); auto diff_bias = reinterpret_cast(this->memory(1)); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper diff_weights_d(conf_.diff_weights_pd(0)); - const memory_desc_wrapper diff_bias_d(conf_.diff_weights_pd(1)); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper diff_weights_d(pd()->diff_weights_pd(0)); + const memory_desc_wrapper diff_bias_d(pd()->diff_weights_pd(1)); - const bool with_groups = conf_.with_groups(); + const bool with_groups = pd()->with_groups(); - const int G = conf_.G(); - const int MB = conf_.MB(); - const int OD = conf_.OD(); - const int OH = conf_.OH(); - const int OW = conf_.OW(); - const int ID = conf_.ID(); - const int IH = conf_.IH(); - const int IW = conf_.IW(); + const int G = pd()->G(); + const int MB = pd()->MB(); + const int OD = pd()->OD(); + const int OH = pd()->OH(); + const int OW = pd()->OW(); + const int ID = pd()->ID(); + const int IH = pd()->IH(); + const int IW = pd()->IW(); - const int OC = conf_.OC() / G; - const int IC = conf_.IC() / G; - const int KD = conf_.KD(); - const int KH = conf_.KH(); - const int KW = conf_.KW(); + const int OC = pd()->OC() / G; + const int IC = pd()->IC() / G; + const int KD = pd()->KD(); + const int KH = pd()->KH(); + const int KW = pd()->KW(); - const int KSD = conf_.KSD(); - const int KSH = conf_.KSH(); - const int KSW = conf_.KSW(); + const int KSD = pd()->KSD(); + const int KSH = pd()->KSH(); + const int KSW = pd()->KSW(); - const int KDD = conf_.KDD(); - const int KDH = conf_.KDH(); - const int KDW = conf_.KDW(); + const int KDD = pd()->KDD(); + const int KDH = pd()->KDH(); + const int KDW = pd()->KDW(); - const int padFront = conf_.padFront(); - const int padT = conf_.padT(); - const int padL = conf_.padL(); + const int padFront = pd()->padFront(); + const int padT = pd()->padT(); + const int padL = pd()->padL(); - const int ndims = conf_.cdesc()->src_desc.ndims; + const int ndims = pd()->desc()->src_desc.ndims; auto ker = [=](acc_data_t &d, int g, int oc, int ic, int kd, int kh, int kw) { for (int mb = 0; mb < MB; ++mb) @@ -364,6 +345,7 @@ auto ker = [=](acc_data_t &d, int g, int oc, int ic, int kd, int kh, int kw) { parallel_nd(G, OC, [&](int g, int oc) { if (diff_bias) { + // XXX: loss of precision when bias is a float... 
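For context on the get_bias change: each reference file used to define its own local lambda with a CASE(dt) macro, as in the hunks deleted here, and the patch replaces them with a shared math::get_bias(bias, off, data_type) helper (note the new `using math::get_bias;` at the top of ref_convolution.cpp). A sketch consistent with those call sites, assuming the helper simply reinterprets the raw bias bytes at the requested offset — the library's actual implementation lives in math_utils.hpp and may differ in detail:

#include <cassert>
#include <cstddef>
#include <cstdint>

// Hypothetical stand-in for mkl-dnn's data_type enum; only the helper's
// shape matters here, not the exact enum values.
enum class data_type { s8, u8, s32, f32 };

// The bias blob arrives as raw bytes (const char *), so each case
// reinterprets it at the width named by the descriptor's data type.
static float get_bias(const char *bias, size_t off, data_type dt) {
    switch (dt) {
    case data_type::s8:  return (float)((const int8_t *)bias)[off];
    case data_type::u8:  return (float)((const uint8_t *)bias)[off];
    case data_type::s32: return (float)((const int32_t *)bias)[off];
    case data_type::f32: return ((const float *)bias)[off];
    }
    assert(!"unimplemented bias data type");
    return 0.f;
}

Centralizing the switch removes one copy of the CASE macro per primitive and makes the s32-accumulator precision caveat flagged in the XXX comment above a single place to fix.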
acc_data_t db = 0; ker_bias(db, g, oc); diff_bias[diff_bias_d.off(g*OC+oc)] @@ -401,19 +383,13 @@ auto ker = [=](acc_data_t &d, int g, int oc, int ic, int kd, int kh, int kw) { using namespace data_type; -template struct _ref_convolution_fwd_t; -template struct _ref_convolution_fwd_t; -template struct _ref_convolution_fwd_t; -template struct _ref_convolution_fwd_t; - -template struct _ref_convolution_fwd_t; -template struct _ref_convolution_fwd_t; -template struct _ref_convolution_fwd_t; -template struct _ref_convolution_fwd_t; -template struct _ref_convolution_fwd_t; -template struct _ref_convolution_fwd_t; -template struct _ref_convolution_fwd_t; -template struct _ref_convolution_fwd_t; +template struct ref_convolution_fwd_t; +template struct ref_convolution_fwd_t; + +template struct ref_convolution_fwd_t; +template struct ref_convolution_fwd_t; +template struct ref_convolution_fwd_t; +template struct ref_convolution_fwd_t; template struct ref_convolution_bwd_data_t; template struct ref_convolution_bwd_data_t; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_convolution.hpp index 3153e4d..9cb8dc2 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_convolution.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_convolution.hpp @@ -29,21 +29,20 @@ namespace mkldnn { namespace impl { namespace cpu { -template -struct _ref_convolution_fwd_t: public cpu_primitive_t { - struct pd_t: public _cpu_convolution_fwd_pd_t { +struct ref_convolution_fwd_t: public cpu_primitive_t { + struct pd_t: public cpu_convolution_fwd_pd_t { pd_t(engine_t *engine, - const typename pd_t::base_desc_t *adesc, + const convolution_desc_t *adesc, const primitive_attr_t *attr, const typename pd_t::base_class *hint_fwd_pd) - : _cpu_convolution_fwd_pd_t(engine, adesc, attr, - hint_fwd_pd) + : cpu_convolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) {} - DECLARE_COMMON_PD_T("ref:any", _ref_convolution_fwd_t); + DECLARE_COMMON_PD_T("ref:any", ref_convolution_fwd_t); virtual status_t init() override { using namespace prop_kind; @@ -51,35 +50,37 @@ struct _ref_convolution_fwd_t: public cpu_primitive_t { assert(this->engine()->kind() == engine_kind::cpu); bool ok = true && this->set_default_params() == status::success - && utils::one_of(this->cdesc_().prop_kind, forward_training, + && utils::one_of(this->desc()->prop_kind, forward_training, forward_inference) - && this->cdesc_().alg_kind == alg_kind::convolution_direct - && this->cdesc_().src_desc.data_type == src_type - && this->cdesc_().weights_desc.data_type == wei_type - && this->cdesc_().accum_data_type == acc_type - && this->cdesc_().dst_desc.data_type == dst_type + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_direct) + && this->desc()->src_desc.data_type == src_type + && this->desc()->weights_desc.data_type == wei_type + && this->desc()->accum_data_type == acc_type + && this->desc()->dst_desc.data_type == dst_type && IMPLICATION(this->with_bias(), true && IMPLICATION(src_type == u8, - utils::one_of(this->cdesc_().bias_desc.data_type, + utils::one_of(this->desc()->bias_desc.data_type, f32, s32, s8, u8)) && IMPLICATION(src_type == f32, - this->cdesc_().bias_desc.data_type == f32)) + this->desc()->bias_desc.data_type == f32)) && this->attr()->has_default_values(); return ok ? 
status::success : status::unimplemented; } }; - _ref_convolution_fwd_t(const pd_t *pd, const input_vector &inputs, + ref_convolution_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type src_data_t; typedef typename prec_traits::type wei_data_t; typedef typename prec_traits::type dst_data_t; typedef typename prec_traits::type acc_data_t; - virtual void execute(event_t *e) { - switch (conf_.cdesc()->prop_kind) { + virtual void execute(event_t *e) const { + switch (pd()->desc()->prop_kind) { case prop_kind::forward_training: case prop_kind::forward_inference: execute_forward(); @@ -91,22 +92,10 @@ struct _ref_convolution_fwd_t: public cpu_primitive_t { } private: - void execute_forward(); - pd_t conf_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; -template -using ref_convolution_fwd_t = _ref_convolution_fwd_t; - -template -using ref_convolution_relu_t = _ref_convolution_fwd_t; - template @@ -127,7 +116,9 @@ struct ref_convolution_bwd_data_t: public cpu_primitive_t { bool ok = true && this->set_default_params() == status::success && this->desc()->prop_kind == backward_data - && this->desc()->alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_direct) && this->desc()->diff_dst_desc.data_type == diff_dst_type && this->desc()->weights_desc.data_type == wei_type && this->desc()->accum_data_type == acc_type @@ -139,17 +130,17 @@ struct ref_convolution_bwd_data_t: public cpu_primitive_t { virtual bool support_bias() const override { return true; } }; - ref_convolution_bwd_data_t(const pd_t *pd, const input_vector &inputs, + ref_convolution_bwd_data_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type diff_src_data_t; typedef typename prec_traits::type wei_data_t; typedef typename prec_traits::type diff_dst_data_t; typedef typename prec_traits::type acc_data_t; - virtual void execute(event_t *e) { - switch (conf_.desc()->prop_kind) { + virtual void execute(event_t *e) const { + switch (pd()->desc()->prop_kind) { case prop_kind::backward_data: execute_backward_data(); break; @@ -160,8 +151,8 @@ struct ref_convolution_bwd_data_t: public cpu_primitive_t { } private: - void execute_backward_data(); - pd_t conf_; + void execute_backward_data() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; template set_default_params() == status::success && this->desc()->prop_kind == backward_weights - && this->desc()->alg_kind == alg_kind::convolution_direct + && utils::one_of(this->desc()->alg_kind, + alg_kind::convolution_auto, + alg_kind::convolution_direct) && this->desc()->src_desc.data_type == src_type && this->desc()->diff_weights_desc.data_type == diff_wei_type && this->desc()->diff_dst_desc.data_type == diff_dst_type @@ -197,17 +190,17 @@ struct ref_convolution_bwd_weights_t: public cpu_primitive_t { } }; - ref_convolution_bwd_weights_t(const pd_t *pd, const input_vector &inputs, + ref_convolution_bwd_weights_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename 
prec_traits::type src_data_t; typedef typename prec_traits::type diff_wei_data_t; typedef typename prec_traits::type diff_dst_data_t; typedef typename prec_traits::type acc_data_t; - virtual void execute(event_t *e) { - switch (conf_.desc()->prop_kind) { + virtual void execute(event_t *e) const { + switch (pd()->desc()->prop_kind) { case prop_kind::backward_weights: execute_backward_weights(); break; @@ -218,8 +211,8 @@ struct ref_convolution_bwd_weights_t: public cpu_primitive_t { } private: - void execute_backward_weights(); - pd_t conf_; + void execute_backward_weights() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_deconvolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_deconvolution.cpp index 0100367..d97f3b4 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_deconvolution.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_deconvolution.cpp @@ -28,18 +28,18 @@ namespace cpu { typedef float data_t; -void ref_deconvolution_fwd_t::compute_fwd_bias() { +void ref_deconvolution_fwd_t::compute_fwd_bias() const { auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); - const int G = conf_.G(); - const int MB = conf_.MB(); - const int OH = conf_.OH(); - const int OW = conf_.OW(); - const int OD = conf_.OD(); - const int OC = conf_.OC() / G; - const int ndims = conf_.desc()->src_desc.ndims; + const int G = pd()->G(); + const int MB = pd()->MB(); + const int OH = pd()->OH(); + const int OW = pd()->OW(); + const int OD = pd()->OD(); + const int OC = pd()->OC() / G; + const int ndims = pd()->desc()->src_desc.ndims; parallel_nd(MB, G, OC, OD, OH, OW, [&](int mb, int g, int oc, int od, int oh, int ow) { @@ -51,15 +51,15 @@ void ref_deconvolution_fwd_t::compute_fwd_bias() { }); } -void ref_deconvolution_fwd_t::compute_fwd_bias_ncdhw() { +void ref_deconvolution_fwd_t::compute_fwd_bias_ncdhw() const { auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); - const int MB = conf_.MB(); - const int OC = conf_.OC(); - const int SP = conf_.OW()*conf_.OH()*conf_.OD(); + const int MB = pd()->MB(); + const int OC = pd()->OC(); + const int SP = pd()->OW()*pd()->OH()*pd()->OD(); parallel_nd(MB, OC, [&](int mb, int oc) { PRAGMA_OMP_SIMD() @@ -71,15 +71,15 @@ void ref_deconvolution_fwd_t::compute_fwd_bias_ncdhw() { } template -void ref_deconvolution_fwd_t::compute_fwd_bias_nCdhwXc() { +void ref_deconvolution_fwd_t::compute_fwd_bias_nCdhwXc() const { auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); - const int MB = conf_.MB(); - const int OC = conf_.OC(); - const int SP = conf_.OW() * conf_.OH() * conf_.OD(); + const int MB = pd()->MB(); + const int OC = pd()->OC(); + const int SP = pd()->OW() * pd()->OH() * pd()->OD(); const ptrdiff_t stride_mb = dst_d.blocking_desc().strides[0][0]; @@ -95,18 +95,18 @@ void ref_deconvolution_fwd_t::compute_fwd_bias_nCdhwXc() { }); } -void ref_deconvolution_bwd_weights_t::compute_bwd_bias() { +void ref_deconvolution_bwd_weights_t::compute_bwd_bias() const { auto diff_dst = reinterpret_cast(this->input_memory(1)); auto 
diff_bias = reinterpret_cast(this->memory(1)); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); - const int G = conf_.G(); - const int MB = conf_.MB(); - const int OH = conf_.OH(); - const int OW = conf_.OW(); - const int OC = conf_.OC() / G; - const int OD = conf_.OD(); - const int ndims = conf_.desc()->src_desc.ndims; + const int G = pd()->G(); + const int MB = pd()->MB(); + const int OH = pd()->OH(); + const int OW = pd()->OW(); + const int OC = pd()->OC() / G; + const int OD = pd()->OD(); + const int ndims = pd()->desc()->src_desc.ndims; parallel_nd(G, OC, [&](int g, int oc) { data_t db = 0; @@ -128,15 +128,15 @@ void ref_deconvolution_bwd_weights_t::compute_bwd_bias() { }); } -void ref_deconvolution_bwd_weights_t::compute_bwd_bias_ncdhw() { +void ref_deconvolution_bwd_weights_t::compute_bwd_bias_ncdhw() const { auto diff_dst = reinterpret_cast(this->input_memory(1)); auto diff_bias = reinterpret_cast(this->memory(1)); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); - const int OC = conf_.OC(); - const int MB = conf_.MB(); - const int SP = conf_.OH()*conf_.OW()*conf_.OD(); + const int OC = pd()->OC(); + const int MB = pd()->MB(); + const int SP = pd()->OH()*pd()->OW()*pd()->OD(); parallel_nd(OC, [&](int oc) { data_t db = 0; @@ -152,15 +152,15 @@ void ref_deconvolution_bwd_weights_t::compute_bwd_bias_ncdhw() { } template -void ref_deconvolution_bwd_weights_t::compute_bwd_bias_nCdhwXc() { +void ref_deconvolution_bwd_weights_t::compute_bwd_bias_nCdhwXc() const { auto diff_dst = reinterpret_cast(this->input_memory(1)); auto diff_bias = reinterpret_cast(this->memory(1)); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); - const int OC = conf_.OC(); - const int MB = conf_.MB(); - const int SP = conf_.OH() * conf_.OW() * conf_.OD(); + const int OC = pd()->OC(); + const int MB = pd()->MB(); + const int SP = pd()->OH() * pd()->OW() * pd()->OD(); const ptrdiff_t stride_mb = diff_dst_d.blocking_desc().strides[0][0]; @@ -185,10 +185,10 @@ void ref_deconvolution_bwd_weights_t::compute_bwd_bias_nCdhwXc() { }); } -template void ref_deconvolution_fwd_t::compute_fwd_bias_nCdhwXc<8>(); -template void ref_deconvolution_fwd_t::compute_fwd_bias_nCdhwXc<16>(); -template void ref_deconvolution_bwd_weights_t::compute_bwd_bias_nCdhwXc<8>(); -template void ref_deconvolution_bwd_weights_t::compute_bwd_bias_nCdhwXc<16>(); +template void ref_deconvolution_fwd_t::compute_fwd_bias_nCdhwXc<8>() const; +template void ref_deconvolution_fwd_t::compute_fwd_bias_nCdhwXc<16>() const; +template void ref_deconvolution_bwd_weights_t::compute_bwd_bias_nCdhwXc<8>() const; +template void ref_deconvolution_bwd_weights_t::compute_bwd_bias_nCdhwXc<16>() const; } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_deconvolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_deconvolution.hpp index 6890c1c..e185172 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_deconvolution.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_deconvolution.hpp @@ -28,39 +28,6 @@ #include "utils.hpp" #include "primitive_iterator.hpp" -#define DECLARE_DECONVOLUTION_PD_t(impl_name, ...) 
\ - virtual pd_t *clone() const override { return new pd_t(*this); } \ - virtual status_t create_primitive(primitive_t **primitive, \ - const primitive_at_t *inputs, \ - const primitive_t **outputs) const override { \ - double ms = get_msec(); \ - using namespace prop_kind;\ - primitive_t::input_vector ins(inputs, inputs + this->n_inputs()); \ - primitive_t::output_vector outs(outputs, outputs + this->n_outputs()); \ - auto ret = safe_ptr_assign(*primitive, \ - new (__VA_ARGS__)(this, ins, outs)); \ - primitive_t *conv_primitive; \ - if (this->desc()->prop_kind == backward_weights) {\ - primitive_at_t conv_inputs[2];\ - conv_inputs[0] = inputs[1];\ - conv_inputs[1] = inputs[0];\ - conv_pd_->create_primitive((&conv_primitive), conv_inputs, outputs);\ - } \ - else conv_pd_->create_primitive((&conv_primitive), inputs, outputs);\ - ((__VA_ARGS__ *)(*primitive))->conv_p_ = conv_primitive;\ - ms = get_msec() - ms; \ - if (mkldnn_verbose()->level >= 2) { \ - printf("mkldnn_verbose,create,%s,%g\n", this->info(), ms); \ - fflush(0); \ - } \ - return ret; \ - } \ -virtual const char *name() const override { return impl_name; } - -#define DECLARE_DECONVOLUTION_PD_T(impl_name, ...) \ - DECLARE_DECONVOLUTION_PD_t(impl_name, __VA_ARGS__) - - namespace mkldnn { namespace impl { namespace cpu { @@ -146,7 +113,7 @@ struct ref_deconvolution_fwd_t: public cpu_primitive_t { ~pd_t() { delete conv_pd_; } - DECLARE_DECONVOLUTION_PD_T("ref:any", ref_deconvolution_fwd_t); + DECLARE_DECONVOLUTION_PD_T(ref_deconvolution_fwd_t); status_t init_convolution(){ using namespace memory_format; @@ -154,7 +121,7 @@ struct ref_deconvolution_fwd_t: public cpu_primitive_t { convolution_desc_t cd; status_t status; - status = conv_descr_create(this->cdesc(), &cd); + status = conv_descr_create(this->desc(), &cd); if (status != status::success) return status; mkldnn_primitive_desc_iterator it(this->engine_, (op_desc_t *)&cd, @@ -216,19 +183,19 @@ struct ref_deconvolution_fwd_t: public cpu_primitive_t { bool conv_supports_bias_; }; - ref_deconvolution_fwd_t(const pd_t *pd, const input_vector &inputs, + ref_deconvolution_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), conv_p_(nullptr) {} + : cpu_primitive_t(apd, inputs, outputs), conv_p_(nullptr) {} ~ref_deconvolution_fwd_t() { delete this->conv_p_; } - virtual void execute(event_t *e) { - switch (conf_.desc()->prop_kind) { + virtual void execute(event_t *e) const { + switch (pd()->desc()->prop_kind) { case prop_kind::forward_training: case prop_kind::forward_inference: (conv_p_)->execute(e); - if (conf_.with_bias() && !conf_.conv_supports_bias_) { - switch (conf_.dst_pd()->desc()->format) { + if (pd()->with_bias() && !pd()->conv_supports_bias_) { + switch (pd()->dst_pd()->desc()->format) { case memory_format::nchw : case memory_format::ncdhw : compute_fwd_bias_ncdhw(); @@ -254,10 +221,10 @@ struct ref_deconvolution_fwd_t: public cpu_primitive_t { } private: - void compute_fwd_bias(); - void compute_fwd_bias_ncdhw(); - template void compute_fwd_bias_nCdhwXc(); - pd_t conf_; + void compute_fwd_bias() const; + void compute_fwd_bias_ncdhw() const; + template void compute_fwd_bias_nCdhwXc() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } primitive_t *conv_p_; }; @@ -277,7 +244,7 @@ struct ref_deconvolution_bwd_data_t: public cpu_primitive_t { ~pd_t() { delete conv_pd_; } - DECLARE_DECONVOLUTION_PD_T("ref:any", ref_deconvolution_bwd_data_t); + 
DECLARE_DECONVOLUTION_PD_T(ref_deconvolution_bwd_data_t); status_t init_convolution(){ using namespace memory_format; @@ -285,7 +252,7 @@ struct ref_deconvolution_bwd_data_t: public cpu_primitive_t { convolution_desc_t cd; status_t status; - status = conv_descr_create(this->cdesc(), &cd); + status = conv_descr_create(this->desc(), &cd); if (status != status::success) return status; mkldnn_primitive_desc_iterator it(this->engine_, (op_desc_t *)&cd, @@ -336,13 +303,13 @@ struct ref_deconvolution_bwd_data_t: public cpu_primitive_t { } primitive_desc_t *conv_pd_; }; - ref_deconvolution_bwd_data_t(const pd_t *pd, const input_vector &inputs, + ref_deconvolution_bwd_data_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), conv_p_(nullptr) {} + : cpu_primitive_t(apd, inputs, outputs), conv_p_(nullptr) {} ~ref_deconvolution_bwd_data_t() { delete this->conv_p_; } - virtual void execute(event_t *e) { - switch (conf_.desc()->prop_kind) { + virtual void execute(event_t *e) const { + switch (pd()->desc()->prop_kind) { case prop_kind::backward_data: (conv_p_)->execute(e); break; @@ -353,7 +320,7 @@ struct ref_deconvolution_bwd_data_t: public cpu_primitive_t { } private: - pd_t conf_; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } primitive_t *conv_p_; }; @@ -373,7 +340,7 @@ struct ref_deconvolution_bwd_weights_t: public cpu_primitive_t { ~pd_t() { delete conv_pd_; } - DECLARE_DECONVOLUTION_PD_T("ref:any", ref_deconvolution_bwd_weights_t); + DECLARE_DECONVOLUTION_PD_T(ref_deconvolution_bwd_weights_t); status_t init_convolution(){ using namespace memory_format; @@ -381,7 +348,7 @@ struct ref_deconvolution_bwd_weights_t: public cpu_primitive_t { convolution_desc_t cd; status_t status; - status = conv_descr_create(this->cdesc(), &cd); + status = conv_descr_create(this->desc(), &cd); if (status != status::success) return status; mkldnn_primitive_desc_iterator it(this->engine_, (op_desc_t *)&cd, @@ -434,20 +401,20 @@ struct ref_deconvolution_bwd_weights_t: public cpu_primitive_t { primitive_desc_t *conv_pd_; }; - ref_deconvolution_bwd_weights_t(const pd_t *pd, const input_vector &inputs, + ref_deconvolution_bwd_weights_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), conv_p_(nullptr) {} + : cpu_primitive_t(apd, inputs, outputs), conv_p_(nullptr) {} ~ref_deconvolution_bwd_weights_t() { delete this->conv_p_; } typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { - switch (conf_.desc()->prop_kind) { + virtual void execute(event_t *e) const { + switch (pd()->desc()->prop_kind) { case prop_kind::backward_weights: (conv_p_)->execute(e); - if (conf_.with_bias()) { - switch (conf_.diff_dst_pd()->desc()->format) { + if (pd()->with_bias()) { + switch (pd()->diff_dst_pd()->desc()->format) { case memory_format::nchw : case memory_format::ncdhw : compute_bwd_bias_ncdhw(); @@ -472,11 +439,11 @@ struct ref_deconvolution_bwd_weights_t: public cpu_primitive_t { } private: - pd_t conf_; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } primitive_t *conv_p_; - void compute_bwd_bias(); - void compute_bwd_bias_ncdhw(); - template void compute_bwd_bias_nCdhwXc(); + void compute_bwd_bias() const; + void compute_bwd_bias_ncdhw() const; + template void compute_bwd_bias_nCdhwXc() const; }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_depthwise.cpp 
b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_depthwise.cpp index b5d334a..4e95474 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_depthwise.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_depthwise.cpp @@ -55,22 +55,22 @@ float ref_depthwise_scalar_fwd_t::compute_scalar(float s, const float* weights, } template -void ref_depthwise_fwd_t::execute_forward() { +void ref_depthwise_fwd_t::execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper data_d(conf_.src_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); - const memory_desc_wrapper bias_d(conf_.weights_pd(1)); + const memory_desc_wrapper data_d(pd()->src_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + const memory_desc_wrapper bias_d(pd()->weights_pd(1)); - const int MB = conf_.MB(); - const int C = conf_.C(); - const int D = conf_.D(); - const int H = conf_.H(); - const int W = conf_.W(); - const auto alg_kind = conf_.desc()->alg_kind; + const int MB = pd()->MB(); + const int C = pd()->C(); + const int D = pd()->D(); + const int H = pd()->H(); + const int W = pd()->W(); + const auto alg_kind = pd()->desc()->alg_kind; parallel_nd(MB, C, D, H, W, [&](int n, int c, int d, int h, int w) { diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_depthwise.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_depthwise.hpp index 28c08be..4ac116c 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_depthwise.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_depthwise.hpp @@ -63,19 +63,19 @@ struct ref_depthwise_fwd_t: public cpu_primitive_t { } }; - ref_depthwise_fwd_t(const pd_t *pd, const input_vector &inputs, + ref_depthwise_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - pd_t conf_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_eltwise.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_eltwise.cpp index 0d0122b..e3e703d 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_eltwise.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_eltwise.cpp @@ -30,51 +30,56 @@ namespace cpu { using namespace alg_kind; using namespace math; -ref_eltwise_scalar_fwd_t::ref_eltwise_scalar_fwd_t(const alg_kind_t alg_, const float alpha_, const float beta_) - : alg(alg_), alpha(alpha_), beta(beta_) { - using namespace alg_kind; - - assert(utils::one_of(alg, eltwise_relu, eltwise_tanh, eltwise_elu, - eltwise_square, eltwise_abs, eltwise_sqrt, eltwise_linear, - eltwise_bounded_relu, eltwise_soft_relu, eltwise_logistic, eltwise_clamp)); +ref_eltwise_scalar_fwd_t::ref_eltwise_scalar_fwd_t(alg_kind_t alg, float alpha, + float beta): alg_(alg), alpha_(alpha), beta_(beta) { + assert(utils::one_of(alg_, eltwise_relu, eltwise_tanh, eltwise_elu, + eltwise_square, eltwise_abs, eltwise_sqrt, eltwise_linear, + eltwise_bounded_relu, eltwise_soft_relu, eltwise_logistic, + eltwise_clamp, eltwise_exp, eltwise_not)); 
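Two algorithm kinds, eltwise_exp and eltwise_not, join the scalar injector's dispatch in this hunk: compute_scalar() maps each alg kind to a small forward function. For the two newcomers the math is plausibly as follows — exp_fwd is the plain exponential, and not_fwd is assumed to be a logical negation (zero input maps to 1.f, anything else to 0.f), which matches how a "not" behaves in the binary-network post-ops above; the exact definitions live in math_utils.hpp:

#include <cmath>

// Sketch of the forward ops behind the new eltwise_exp / eltwise_not cases.
static inline float exp_fwd(float s) { return std::exp(s); }
// Assumed semantics: logical NOT over a {0, non-zero} encoding.
static inline float not_fwd(float s) { return s == 0.f ? 1.f : 0.f; }

Note also that the backward path below adds only eltwise_exp (exp_bwd), and ref_eltwise.hpp explicitly returns unimplemented for mkldnn_eltwise_not in the backward pd: a logical negation has no useful gradient, so it is forward-only.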
} +ref_eltwise_scalar_fwd_t::ref_eltwise_scalar_fwd_t( + const post_ops_t::entry_t::eltwise_t &eltwise) + : ref_eltwise_scalar_fwd_t(eltwise.alg, eltwise.alpha, eltwise.beta) {} + float ref_eltwise_scalar_fwd_t::compute_scalar(float s) { - switch (alg) { - case eltwise_relu: return relu_fwd(s, alpha); - case eltwise_tanh: return tanh_fwd(s); - case eltwise_elu: return elu_fwd(s, alpha); + switch (alg_) { + case eltwise_relu: return relu_fwd(s, alpha_); + case eltwise_tanh: return tanh_fwd(s); + case eltwise_elu: return elu_fwd(s, alpha_); case eltwise_square: return square_fwd(s); - case eltwise_abs: return abs_fwd(s); - case eltwise_sqrt: return sqrt_fwd(s); - case eltwise_linear: return linear_fwd(s, alpha, beta); - case eltwise_bounded_relu: return bounded_relu_fwd(s, alpha); + case eltwise_abs: return abs_fwd(s); + case eltwise_sqrt: return sqrt_fwd(s); + case eltwise_linear: return linear_fwd(s, alpha_, beta_); + case eltwise_bounded_relu: return bounded_relu_fwd(s, alpha_); case eltwise_soft_relu: return soft_relu_fwd(s); case eltwise_logistic: return logistic_fwd(s); - case eltwise_clamp: return clamp_fwd(s, alpha, beta); + case eltwise_clamp: return clamp_fwd(s, alpha_, beta_); + case eltwise_exp: return exp_fwd(s); + case eltwise_not: return not_fwd(s); default: assert(!"unknown eltwise alg_kind"); } - return 0.0f; + return 0.f; } template -void ref_eltwise_fwd_t::execute_forward_nCspBc_padded() { +void ref_eltwise_fwd_t::execute_forward_nCspBc_padded() const { auto src = reinterpret_cast(this->input_memory(0)); auto dst = reinterpret_cast(this->memory(0)); - const memory_desc_wrapper data_d(conf_.src_pd()); + const memory_desc_wrapper data_d(pd()->src_pd()); const blocking_desc_t &blk = data_d.blocking_desc(); const int block = blk.block_dims[1]; - const int MB = conf_.MB(); - const int C = conf_.C() / block; + const int MB = pd()->MB(); + const int C = pd()->C() / block; const int C_PADDED = blk.padding_dims[1] / block; - const int tail = conf_.C() % block; - const int SP = conf_.D() * conf_.H() * conf_.W(); - const auto alg_kind = conf_.desc()->alg_kind; - const float alpha = conf_.desc()->alpha; - const float beta = conf_.desc()->beta; + const int tail = pd()->C() % block; + const int SP = pd()->D() * pd()->H() * pd()->W(); + const auto alg_kind = pd()->desc()->alg_kind; + const float alpha = pd()->desc()->alpha; + const float beta = pd()->desc()->beta; auto ker = [=] (data_t &d, data_t s) { switch (alg_kind) { @@ -84,6 +89,8 @@ void ref_eltwise_fwd_t::execute_forward_nCspBc_padded() { case eltwise_soft_relu: d = soft_relu_fwd(s); break; case eltwise_logistic: d = logistic_fwd(s); break; case eltwise_clamp: d = clamp_fwd(s, alpha, beta); break; + case eltwise_exp: d = exp_fwd(s); break; + case eltwise_not: d = not_fwd(s); break; default: assert(!"unknown eltwise alg_kind"); } }; @@ -104,24 +111,24 @@ void ref_eltwise_fwd_t::execute_forward_nCspBc_padded() { } template -void ref_eltwise_fwd_t::execute_forward_generic() { +void ref_eltwise_fwd_t::execute_forward_generic() const { auto src = reinterpret_cast(this->input_memory(0)); auto dst = reinterpret_cast(this->memory(0)); /* fast return */ - if (conf_.has_zero_dim_memory()) return; + if (pd()->has_zero_dim_memory()) return; - const memory_desc_wrapper data_d(conf_.src_pd()); + const memory_desc_wrapper data_d(pd()->src_pd()); - const int MB = conf_.MB(); - const int C = conf_.C(); - const int D = conf_.D(); - const int H = conf_.H(); - const int W = conf_.W(); - const auto alg_kind = conf_.desc()->alg_kind; - const float 
alpha = conf_.desc()->alpha; - const float beta = conf_.desc()->beta; - const bool is_3d = conf_.desc()->data_desc.ndims == 5; + const int MB = pd()->MB(); + const int C = pd()->C(); + const int D = pd()->D(); + const int H = pd()->H(); + const int W = pd()->W(); + const auto alg_kind = pd()->desc()->alg_kind; + const float alpha = pd()->desc()->alpha; + const float beta = pd()->desc()->beta; + const bool is_3d = pd()->desc()->data_desc.ndims == 5; parallel_nd(MB, C, D, H, W, [&](int n, int c, int id, int h, int w) { @@ -142,22 +149,24 @@ void ref_eltwise_fwd_t::execute_forward_generic() { case eltwise_soft_relu: d = soft_relu_fwd(s); break; case eltwise_logistic: d = logistic_fwd(s); break; case eltwise_clamp: d = clamp_fwd(s, alpha, beta); break; + case eltwise_exp: d = exp_fwd(s); break; + case eltwise_not: d = not_fwd(s); break; default: assert(!"unknown eltwise alg_kind"); } }); } template -void ref_eltwise_fwd_t::execute_forward_dense() { +void ref_eltwise_fwd_t::execute_forward_dense() const { auto src = reinterpret_cast(this->input_memory(0)); auto dst = reinterpret_cast(this->memory(0)); - const memory_desc_wrapper data_d(conf_.src_pd()); + const memory_desc_wrapper data_d(pd()->src_pd()); const ptrdiff_t nelems = static_cast(data_d.nelems(true)); - const auto alg_kind = conf_.desc()->alg_kind; - const float alpha = conf_.desc()->alpha; - const float beta = conf_.desc()->beta; + const auto alg_kind = pd()->desc()->alg_kind; + const float alpha = pd()->desc()->alpha; + const float beta = pd()->desc()->beta; src += data_d.blocking_desc().offset_padding; dst += data_d.blocking_desc().offset_padding; @@ -185,32 +194,34 @@ void ref_eltwise_fwd_t::execute_forward_dense() { case eltwise_soft_relu: d = soft_relu_fwd(s); break; case eltwise_logistic: d = logistic_fwd(s); break; case eltwise_clamp: d = clamp_fwd(s, alpha, beta); break; + case eltwise_exp: d = exp_fwd(s); break; + case eltwise_not: d = not_fwd(s); break; default: assert(!"unknown eltwise alg_kind"); } }); } template -void ref_eltwise_bwd_t::execute_backward_generic() { +void ref_eltwise_bwd_t::execute_backward_generic() const { auto src = reinterpret_cast(this->input_memory(0)); auto diff_dst = reinterpret_cast(this->input_memory(1)); auto diff_src = reinterpret_cast(this->memory(0)); /* fast return */ - if (conf_.has_zero_dim_memory()) return; + if (pd()->has_zero_dim_memory()) return; - const memory_desc_wrapper data_d(conf_.src_pd()); - const memory_desc_wrapper diff_data_d(conf_.diff_src_pd()); + const memory_desc_wrapper data_d(pd()->src_pd()); + const memory_desc_wrapper diff_data_d(pd()->diff_src_pd()); - const int MB = conf_.MB(); - const int C = conf_.C(); - const int D = conf_.D(); - const int H = conf_.H(); - const int W = conf_.W(); - const auto alg_kind = conf_.desc()->alg_kind; - const float alpha = conf_.desc()->alpha; - const float beta = conf_.desc()->beta; - const bool is_3d = conf_.desc()->data_desc.ndims == 5; + const int MB = pd()->MB(); + const int C = pd()->C(); + const int D = pd()->D(); + const int H = pd()->H(); + const int W = pd()->W(); + const auto alg_kind = pd()->desc()->alg_kind; + const float alpha = pd()->desc()->alpha; + const float beta = pd()->desc()->beta; + const bool is_3d = pd()->desc()->data_desc.ndims == 5; parallel_nd(MB, C, D, H, W, [&](int n, int c, int d, int h, int w) { @@ -236,24 +247,25 @@ void ref_eltwise_bwd_t::execute_backward_generic() { case eltwise_soft_relu: ds = soft_relu_bwd(dd, s); break; case eltwise_logistic: ds = logistic_bwd(dd, s); break; case eltwise_clamp: ds 
= clamp_bwd(dd, s, alpha, beta); break; + case eltwise_exp: ds = exp_bwd(dd, s); break; default: assert(!"unknown eltwise alg_kind"); } }); } template -void ref_eltwise_bwd_t::execute_backward_dense() { +void ref_eltwise_bwd_t::execute_backward_dense() const { auto src = reinterpret_cast(this->input_memory(0)); auto diff_dst = reinterpret_cast(this->input_memory(1)); auto diff_src = reinterpret_cast(this->memory(0)); - const memory_desc_wrapper data_d(conf_.src_pd()); - const memory_desc_wrapper diff_data_d(conf_.diff_src_pd()); + const memory_desc_wrapper data_d(pd()->src_pd()); + const memory_desc_wrapper diff_data_d(pd()->diff_src_pd()); const ptrdiff_t nelems = static_cast(data_d.nelems(true)); - const auto alg_kind = conf_.desc()->alg_kind; - const float alpha = conf_.desc()->alpha; - const float beta = conf_.desc()->beta; + const auto alg_kind = pd()->desc()->alg_kind; + const float alpha = pd()->desc()->alpha; + const float beta = pd()->desc()->beta; src += data_d.blocking_desc().offset_padding; diff_dst += diff_data_d.blocking_desc().offset_padding; @@ -276,6 +288,7 @@ void ref_eltwise_bwd_t::execute_backward_dense() { case eltwise_soft_relu: ds = soft_relu_bwd(dd, s); break; case eltwise_logistic: ds = logistic_bwd(dd, s); break; case eltwise_clamp: ds = clamp_bwd(dd, s, alpha, beta); break; + case eltwise_exp: ds = exp_bwd(dd, s); break; default: assert(!"unknown eltwise alg_kind"); } }); diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_eltwise.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_eltwise.hpp index bd90dc1..718844b 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_eltwise.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_eltwise.hpp @@ -31,13 +31,16 @@ namespace cpu { struct ref_eltwise_scalar_fwd_t { public: - ref_eltwise_scalar_fwd_t(const alg_kind_t alg, float alpha, float beta); + ref_eltwise_scalar_fwd_t(alg_kind_t alg, float alpha, float beta); + + // note that eltwise.scale is ignored + ref_eltwise_scalar_fwd_t(const post_ops_t::entry_t::eltwise_t &eltwise); + float compute_scalar(float s); -private: - alg_kind_t alg; - float alpha; - float beta; + const alg_kind_t alg_; + const float alpha_; + const float beta_; }; template @@ -87,15 +90,15 @@ struct ref_eltwise_fwd_t: public cpu_primitive_t { bool use_dense_, use_nCspBc_padded_; }; - ref_eltwise_fwd_t(const pd_t *pd, const input_vector &inputs, + ref_eltwise_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { - if (conf_.use_dense_) + virtual void execute(event_t *e) const { + if (pd()->use_dense_) execute_forward_dense(); - else if (conf_.use_nCspBc_padded_) + else if (pd()->use_nCspBc_padded_) execute_forward_nCspBc_padded(); else execute_forward_generic(); @@ -103,10 +106,10 @@ struct ref_eltwise_fwd_t: public cpu_primitive_t { } private: - void execute_forward_nCspBc_padded(); - void execute_forward_dense(); - void execute_forward_generic(); - pd_t conf_; + void execute_forward_nCspBc_padded() const; + void execute_forward_dense() const; + void execute_forward_generic() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; template @@ -142,27 +145,30 @@ struct ref_eltwise_bwd_t: public cpu_primitive_t { if (use_generic && !one_of(diff_dst_d.ndims(), 4, 5)) return status::unimplemented; + if (desc()->alg_kind == mkldnn_eltwise_not) + 
return status::unimplemented; + return status::success; } bool use_dense_; }; - ref_eltwise_bwd_t(const pd_t *pd, const input_vector &inputs, + ref_eltwise_bwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { - if (conf_.use_dense_) execute_backward_dense(); + virtual void execute(event_t *e) const { + if (pd()->use_dense_) execute_backward_dense(); else execute_backward_generic(); e->set_state(event_t::ready); } private: - void execute_backward_dense(); - void execute_backward_generic(); - pd_t conf_; + void execute_backward_dense() const; + void execute_backward_generic() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_inner_product.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_inner_product.cpp index 6d3edfa..9261665 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_inner_product.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_inner_product.cpp @@ -27,37 +27,39 @@ namespace impl { namespace cpu { using math::saturate; +using math::get_bias; template void ref_inner_product_fwd_t - ::execute_forward() { + ::execute_forward() const { auto src = reinterpret_cast(this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto bias = reinterpret_cast(this->input_memory(2)); auto dst = reinterpret_cast(this->memory()); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); - const memory_desc_wrapper bias_d(conf_.weights_pd(1)); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + const memory_desc_wrapper bias_d(pd()->weights_pd(1)); - const int MB = conf_.MB(); - const int OC = conf_.OC(); - const int IC = conf_.IC(); + const int MB = pd()->MB(); + const int OC = pd()->OC(); + const int IC = pd()->IC(); const bool src_has_spatial = utils::one_of(src_d.ndims(), 4, 5); const bool is_3d = src_d.ndims() == 5; - const auto &post_ops = conf_.attr()->post_ops_; + const auto &post_ops = pd()->attr()->post_ops_; const bool do_relu = post_ops.len_ == 1; const float nslope = do_relu ? post_ops.entry_[0].eltwise.alpha : 0.f; - auto ker_has_spatial = [=](acc_data_t &d, int mb, int oc) { - const int KD = conf_.KD(); - const int KH = conf_.KH(); - const int KW = conf_.KW(); + auto ker_has_spatial = [=](int mb, int oc) { + acc_data_t d = 0; + const int KD = pd()->KD(); + const int KH = pd()->KH(); + const int KW = pd()->KW(); for (int ic = 0; ic < IC; ++ic) { for (int kd = 0; kd < KD; ++kd) { for (int kh = 0; kh < KH; ++kh) { @@ -72,42 +74,29 @@ void ref_inner_product_fwd_t } } } + return d; }; - auto ker_no_spatial = [=](acc_data_t &d, int mb, int oc) { + auto ker_no_spatial = [=](int mb, int oc) { + acc_data_t d = 0; for (int ic = 0; ic < IC; ++ic) { d += (acc_data_t)src[src_d.off(mb, ic)] * weights[weights_d.off(oc, ic)]; } - }; - - auto get_bias = [=, &bias](size_t off) -> acc_data_t { -# define CASE(dt) case dt: \ - return (acc_data_t)(*((const prec_traits
::type *)bias + off)) - switch (conf_.desc()->bias_desc.data_type) { - CASE(data_type::s8); - CASE(data_type::u8); - CASE(data_type::s32); - CASE(data_type::f32); - default: assert(!"unimplemented"); - } -# undef CASE - return 0; + return d; }; parallel_nd(MB, OC, [&](int mb, int oc) { - acc_data_t a = bias ? get_bias(bias_d.off(oc)) : (acc_data_t)0; - if (src_has_spatial) { - ker_has_spatial(a, mb, oc); - } else { - ker_no_spatial(a, mb, oc); - } - if (do_relu && a < (acc_data_t)0) { - float ds = (float)a * nslope; - dst[dst_d.off(mb, oc)] = saturate(ds); - } else { - dst[dst_d.off(mb, oc)] = saturate(a); - } + float a = bias + ? get_bias(bias, bias_d.off(oc), pd()->desc()->bias_desc.data_type) + : 0; + if (src_has_spatial) + a += ker_has_spatial(mb, oc); + else + a += ker_no_spatial(mb, oc); + if (do_relu && a < (acc_data_t)0) + a *= nslope; + dst[dst_d.off(mb, oc)] = saturate(a); }); } using namespace data_type; @@ -121,19 +110,19 @@ template struct ref_inner_product_fwd_t; template void ref_inner_product_bwd_data_t::execute_backward_data() { + acc_type>::execute_backward_data() const { auto diff_dst = reinterpret_cast( this->input_memory(0)); auto weights = reinterpret_cast(this->input_memory(1)); auto diff_src = reinterpret_cast(this->memory()); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper weights_d(conf_.weights_pd(0)); - const memory_desc_wrapper diff_src_d(conf_.diff_src_pd()); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + const memory_desc_wrapper diff_src_d(pd()->diff_src_pd()); - const int MB = conf_.MB(); - const int OC = conf_.OC(); - const int IC = conf_.IC(); + const int MB = pd()->MB(); + const int OC = pd()->OC(); + const int IC = pd()->IC(); const bool diff_src_has_spatial = utils::one_of(diff_src_d.ndims(), 4, 5); @@ -141,9 +130,9 @@ void ref_inner_product_bwd_data_tKD(); + const int KH = pd()->KH(); + const int KW = pd()->KW(); for (int kd = 0; kd < KD; ++kd) for (int kh = 0; kh < KH; ++kh) for (int kw = 0; kw < KW; ++kw) { @@ -176,20 +165,20 @@ template struct ref_inner_product_bwd_data_t; template struct ref_inner_product_bwd_data_t; template -void ref_inner_product_bwd_weights_t::execute_backward_weights() { +void ref_inner_product_bwd_weights_t::execute_backward_weights() const { auto src = reinterpret_cast(this->input_memory(0)); auto diff_dst = reinterpret_cast(this->input_memory(1)); auto diff_weights = reinterpret_cast(this->memory(0)); auto diff_bias = reinterpret_cast(this->memory(1)); - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper diff_weights_d(conf_.diff_weights_pd(0)); - const memory_desc_wrapper diff_bias_d(conf_.diff_weights_pd(1)); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper diff_weights_d(pd()->diff_weights_pd(0)); + const memory_desc_wrapper diff_bias_d(pd()->diff_weights_pd(1)); - const int MB = conf_.MB(); - const int OC = conf_.OC(); - const int IC = conf_.IC(); + const int MB = pd()->MB(); + const int OC = pd()->OC(); + const int IC = pd()->IC(); const bool src_has_spatial = utils::one_of(src_d.ndims(), 4 ,5); @@ -197,9 +186,9 @@ void ref_inner_product_bwd_weights_t::execute_backward_weights() { parallel_nd(OC, IC, [&](int oc, int ic) { if (src_has_spatial) { - const int KD = conf_.KD(); - const int KH = conf_.KH(); - const int KW = 
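The per-call get_bias lambda deleted above is replaced by a shared math::get_bias helper. Reconstructed from the deleted CASE dispatch, it plausibly has the following shape (an assumption about the helper, not the verbatim implementation):

    #include <cassert>
    #include <cstdint>

    // load bias[off] and widen it to float, dispatching on the runtime type
    inline float get_bias(const char *bias, size_t off, data_type_t dt) {
        switch (dt) {
        case data_type::f32: return ((const float *)bias)[off];
        case data_type::s32: return (float)((const int32_t *)bias)[off];
        case data_type::s8: return (float)((const int8_t *)bias)[off];
        case data_type::u8: return (float)((const uint8_t *)bias)[off];
        default: assert(!"unimplemented bias data type"); return 0.f;
        }
    }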
conf_.KW(); + const int KD = pd()->KD(); + const int KH = pd()->KH(); + const int KW = pd()->KW(); for (int kd = 0; kd < KD; ++kd) { for (int kh = 0; kh < KH; ++kh) { for (int kw = 0; kw < KW; ++kw) { diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_inner_product.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_inner_product.hpp index afb21a1..e777c6d 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_inner_product.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_inner_product.hpp @@ -64,17 +64,17 @@ struct ref_inner_product_fwd_t: public cpu_primitive_t { } }; - ref_inner_product_fwd_t(const pd_t *pd, const input_vector &inputs, + ref_inner_product_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type src_data_t; typedef typename prec_traits::type wei_data_t; typedef typename prec_traits::type dst_data_t; typedef typename prec_traits::type acc_data_t; - virtual void execute(event_t *e) { - switch (conf_.desc()->prop_kind) { + virtual void execute(event_t *e) const { + switch (pd()->desc()->prop_kind) { case prop_kind::forward_training: case prop_kind::forward_inference: execute_forward(); @@ -86,8 +86,8 @@ struct ref_inner_product_fwd_t: public cpu_primitive_t { } private: - void execute_forward(); - pd_t conf_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; template ::type diff_src_data_t; typedef typename prec_traits::type wei_data_t; typedef typename prec_traits::type diff_dst_data_t; typedef typename prec_traits::type acc_data_t; - virtual void execute(event_t *e) { - switch (conf_.desc()->prop_kind) { + virtual void execute(event_t *e) const { + switch (pd()->desc()->prop_kind) { case prop_kind::backward: case prop_kind::backward_data: execute_backward_data(); @@ -141,8 +141,8 @@ struct ref_inner_product_bwd_data_t: public cpu_primitive_t { } private: - void execute_backward_data(); - pd_t conf_; + void execute_backward_data() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; template @@ -174,13 +174,13 @@ struct ref_inner_product_bwd_weights_t: public cpu_primitive_t { } }; - ref_inner_product_bwd_weights_t(const pd_t *pd, const input_vector &inputs, + ref_inner_product_bwd_weights_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { - switch (conf_.desc()->prop_kind) { + virtual void execute(event_t *e) const { + switch (pd()->desc()->prop_kind) { case prop_kind::backward: case prop_kind::backward_weights: execute_backward_weights(); @@ -192,8 +192,8 @@ struct ref_inner_product_bwd_weights_t: public cpu_primitive_t { } private: - void execute_backward_weights(); - pd_t conf_; + void execute_backward_weights() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_lrn.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_lrn.cpp index 38b81dd..de9a1d9 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_lrn.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_lrn.cpp @@ -47,7 +47,7 @@ static inline float fast_negative_powf(float omega, float beta) { template template -void 
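The same mechanical refactor runs through all of these headers: the by-value pd_t conf_ member is dropped, the primitive descriptor pointer is held once in the base class, and each implementation re-exposes it through a narrowing accessor so that execute() can become const. Schematically (a simplified, assumed base-class shape, not the actual mkl-dnn class layout):

    struct primitive_t {
        explicit primitive_t(const primitive_desc_t *apd) : pd_(apd) {}
        const primitive_desc_t *pd() const { return pd_; }
    private:
        const primitive_desc_t *pd_;
    };

    // each implementation then shadows pd() with a downcast to its own pd_t:
    //     const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); }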
ref_lrn_fwd_t::execute_forward() { +void ref_lrn_fwd_t::execute_forward() const { using namespace alg_kind; using namespace memory_format; @@ -55,15 +55,15 @@ void ref_lrn_fwd_t::execute_forward() { auto dst = reinterpret_cast(this->memory(0)); auto ws = reinterpret_cast(this->memory(1)); - const memory_desc_wrapper data_d(conf_.src_pd()); - const memory_desc_wrapper ws_d(conf_.workspace_pd()); + const memory_desc_wrapper data_d(pd()->src_pd()); + const memory_desc_wrapper ws_d(pd()->workspace_pd()); MAYBE_UNUSED(ws_d); - const int C = conf_.C(); - const int H = conf_.H(); - const int W = conf_.W(); + const int C = pd()->C(); + const int H = pd()->H(); + const int W = pd()->W(); const size_t stride_mb = data_d.blocking_desc().strides[0][0]; - const bool across_channels = conf_.desc()->alg_kind == lrn_across_channels; + const bool across_channels = pd()->desc()->alg_kind == lrn_across_channels; constexpr int blksize = fmt == nChw16c ? 16 : 8; auto data_off = [&](int mb, int c, int h, int w) -> size_t { @@ -78,11 +78,11 @@ void ref_lrn_fwd_t::execute_forward() { }; auto ker = [=](data_t *d, int mb, int oc, int oh, int ow) { - const float alpha = static_cast(conf_.desc()->lrn_alpha); - const float beta = static_cast(conf_.desc()->lrn_beta); - const float k = static_cast(conf_.desc()->lrn_k); + const float alpha = static_cast(pd()->desc()->lrn_alpha); + const float beta = static_cast(pd()->desc()->lrn_beta); + const float k = static_cast(pd()->desc()->lrn_k); - const int size = conf_.desc()->local_size; + const int size = pd()->desc()->local_size; const int half_size = (size - 1) / 2; float sum = 0; @@ -114,7 +114,7 @@ void ref_lrn_fwd_t::execute_forward() { d[0] = static_cast(src[off] * fast_negative_powf(sum, beta)); }; - const int MB = conf_.MB(); + const int MB = pd()->MB(); if (fmt == nChw16c || fmt == nChw8c) { parallel_nd(MB, utils::div_up(C, blksize), H, W, [&](int mb, int c_blk, int h, int w) { @@ -142,7 +142,7 @@ void ref_lrn_fwd_t::execute_forward() { template template -void ref_lrn_bwd_t::execute_backward() { +void ref_lrn_bwd_t::execute_backward() const { using namespace alg_kind; using namespace memory_format; @@ -150,21 +150,21 @@ void ref_lrn_bwd_t::execute_backward() { auto diff_dst = reinterpret_cast(this->input_memory(1)); auto diff_src = reinterpret_cast(this->memory(0)); - const memory_desc_wrapper data_d(conf_.src_pd()); - const memory_desc_wrapper diff_data_d(conf_.diff_dst_pd()); + const memory_desc_wrapper data_d(pd()->src_pd()); + const memory_desc_wrapper diff_data_d(pd()->diff_dst_pd()); MAYBE_UNUSED(diff_data_d); - const int MB = conf_.MB(); - const int C = conf_.C(); - const int H = conf_.H(); - const int W = conf_.W(); + const int MB = pd()->MB(); + const int C = pd()->C(); + const int H = pd()->H(); + const int W = pd()->W(); const size_t stride_mb = data_d.blocking_desc().strides[0][0]; constexpr int blksize = fmt == nChw16c ? 
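For reference, the quantity the LRN kernel above evaluates per element is the standard local response normalization, with the window W taken either across channels or within a channel and, in mkl-dnn's convention, the alpha term normalized by the number of summands:

    // dst(n, c, h, w) = src(n, c, h, w)
    //     * (k + alpha * sum_{W} src^2 / num_summands) ^ (-beta)
    //
    // fast_negative_powf() is a cheap specialization of powf(x, -beta)
    // for the beta values commonly used by LRN (e.g. 0.75)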
16 : 8; - const float alpha = static_cast(conf_.desc()->lrn_alpha); - const float beta = static_cast(conf_.desc()->lrn_beta); - const float k = static_cast(conf_.desc()->lrn_k); - const int kernel_size = conf_.desc()->local_size; + const float alpha = static_cast(pd()->desc()->lrn_alpha); + const float beta = static_cast(pd()->desc()->lrn_beta); + const float k = static_cast(pd()->desc()->lrn_k); + const int kernel_size = pd()->desc()->local_size; const int half_ksize = (kernel_size - 1) / 2; auto data_off = [&](int mb, int c, int h, int w) -> size_t { @@ -231,16 +231,16 @@ void ref_lrn_bwd_t::execute_backward() { } } -template void ref_lrn_fwd_t::execute_forward(); -template void ref_lrn_fwd_t::execute_forward(); -template void ref_lrn_fwd_t::execute_forward(); -template void ref_lrn_fwd_t::execute_forward(); -template void ref_lrn_fwd_t::execute_forward(); -template void ref_lrn_bwd_t::execute_backward(); -template void ref_lrn_bwd_t::execute_backward(); -template void ref_lrn_bwd_t::execute_backward(); -template void ref_lrn_bwd_t::execute_backward(); -template void ref_lrn_bwd_t::execute_backward(); +template void ref_lrn_fwd_t::execute_forward() const; +template void ref_lrn_fwd_t::execute_forward() const; +template void ref_lrn_fwd_t::execute_forward() const; +template void ref_lrn_fwd_t::execute_forward() const; +template void ref_lrn_fwd_t::execute_forward() const; +template void ref_lrn_bwd_t::execute_backward() const; +template void ref_lrn_bwd_t::execute_backward() const; +template void ref_lrn_bwd_t::execute_backward() const; +template void ref_lrn_bwd_t::execute_backward() const; +template void ref_lrn_bwd_t::execute_backward() const; } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_lrn.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_lrn.hpp index ad89ed7..e2750f9 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_lrn.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_lrn.hpp @@ -57,14 +57,14 @@ struct ref_lrn_fwd_t: public cpu_primitive_t { } }; - ref_lrn_fwd_t(const pd_t *pd, const input_vector &inputs, + ref_lrn_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { using namespace memory_format; - switch (conf_.src_pd()->desc()->format) { + switch (pd()->src_pd()->desc()->format) { case nChw16c: execute_forward(); break; case nChw8c: execute_forward(); break; case nchw: execute_forward(); break; @@ -77,8 +77,8 @@ struct ref_lrn_fwd_t: public cpu_primitive_t { } private: - templatevoid execute_forward(); - pd_t conf_; + templatevoid execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; template @@ -106,14 +106,14 @@ struct ref_lrn_bwd_t: public cpu_primitive_t { } }; - ref_lrn_bwd_t(const pd_t *pd, const input_vector &inputs, + ref_lrn_bwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { using namespace memory_format; - switch (conf_.src_pd()->desc()->format) { + switch (pd()->src_pd()->desc()->format) { case nChw16c: execute_backward(); break; case nChw8c: execute_backward(); break; case 
nchw: execute_backward(); break; @@ -126,8 +126,8 @@ struct ref_lrn_bwd_t: public cpu_primitive_t { } private: - templatevoid execute_backward(); - pd_t conf_; + templatevoid execute_backward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_pooling.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_pooling.cpp index 4ee010d..d7ae208 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_pooling.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_pooling.cpp @@ -30,43 +30,39 @@ namespace impl { namespace cpu { template -void ref_pooling_fwd_t::execute_forward() { +void ref_pooling_fwd_t::execute_forward() const { using namespace alg_kind; using namespace prop_kind; - auto alg = conf_.desc()->alg_kind; + auto alg = pd()->desc()->alg_kind; auto src = reinterpret_cast(this->input_memory(0)); auto dst = reinterpret_cast(this->memory(0)); - auto ws = alg == pooling_max && conf_.desc()->prop_kind == forward_training + auto ws = alg == pooling_max && pd()->desc()->prop_kind == forward_training ? reinterpret_cast(this->memory(1)) : nullptr; - const memory_desc_wrapper src_d(conf_.src_pd()); - const memory_desc_wrapper dst_d(conf_.dst_pd()); - const memory_desc_wrapper ws_d(conf_.workspace_pd()); + const memory_desc_wrapper src_d(pd()->src_pd()); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + const memory_desc_wrapper ws_d(pd()->workspace_pd()); const data_type_t ws_dt = ws ? ws_d.data_type() : data_type::undef; - const int ID = conf_.ID(); - const int IH = conf_.IH(); - const int IW = conf_.IW(); - const int KD = conf_.KD(); - const int KH = conf_.KH(); - const int KW = conf_.KW(); - const int SD = conf_.KSD(); - const int SH = conf_.KSH(); - const int SW = conf_.KSW(); - const int padF = conf_.padFront(); - const int padT = conf_.padT(); - const int padL = conf_.padL(); - const int padBack = conf_.padBack(); - const int padB = conf_.padB(); - const int padR = conf_.padR(); - - const bool is_3d = conf_.desc()->src_desc.ndims == 5; - -// auto apply_offset = [=](int index, int offset) { -// return (index > offset) ? 
index - offset : 0; -// }; + const int ID = pd()->ID(); + const int IH = pd()->IH(); + const int IW = pd()->IW(); + const int KD = pd()->KD(); + const int KH = pd()->KH(); + const int KW = pd()->KW(); + const int SD = pd()->KSD(); + const int SH = pd()->KSH(); + const int SW = pd()->KSW(); + const int padF = pd()->padFront(); + const int padT = pd()->padT(); + const int padL = pd()->padL(); + const int padBack = pd()->padBack(); + const int padB = pd()->padB(); + const int padR = pd()->padR(); + + const bool is_3d = pd()->desc()->src_desc.ndims == 5; auto set_ws = [=](int mb, int oc, int od, int oh, int ow, int value) { if (ws) { @@ -195,11 +191,11 @@ void ref_pooling_fwd_t::execute_forward() { d[0] = math::out_round((float)dst / num_summands); }; - const int MB = conf_.MB(); - const int OC = conf_.C(); - const int OD = conf_.OD(); - const int OH = conf_.OH(); - const int OW = conf_.OW(); + const int MB = pd()->MB(); + const int OC = pd()->C(); + const int OD = pd()->OD(); + const int OH = pd()->OH(); + const int OW = pd()->OW(); if (alg == pooling_max) { parallel_nd(MB, OC, OD, OH, OW, @@ -226,34 +222,34 @@ void ref_pooling_fwd_t::execute_forward() { } template -void ref_pooling_bwd_t::execute_backward() { +void ref_pooling_bwd_t::execute_backward() const { using namespace alg_kind; auto diff_dst = reinterpret_cast(this->input_memory(0)); - auto ws = conf_.desc()->alg_kind != alg_kind::pooling_max ? nullptr + auto ws = pd()->desc()->alg_kind != alg_kind::pooling_max ? nullptr : reinterpret_cast(this->input_memory(1)); auto diff_src = reinterpret_cast(this->memory(0)); - const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd()); - const memory_desc_wrapper ws_d(conf_.workspace_pd()); - const memory_desc_wrapper diff_src_d(conf_.diff_src_pd()); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_pd()); + const memory_desc_wrapper ws_d(pd()->workspace_pd()); + const memory_desc_wrapper diff_src_d(pd()->diff_src_pd()); - const int ID = conf_.ID(); - const int IH = conf_.IH(); - const int IW = conf_.IW(); - const int KD = conf_.KD(); - const int KH = conf_.KH(); - const int KW = conf_.KW(); - const int SD = conf_.KSD(); - const int SH = conf_.KSH(); - const int SW = conf_.KSW(); - const int padF = conf_.padFront(); - const int padT = conf_.padT(); - const int padL = conf_.padL(); + const int ID = pd()->ID(); + const int IH = pd()->IH(); + const int IW = pd()->IW(); + const int KD = pd()->KD(); + const int KH = pd()->KH(); + const int KW = pd()->KW(); + const int SD = pd()->KSD(); + const int SH = pd()->KSH(); + const int SW = pd()->KSW(); + const int padF = pd()->padFront(); + const int padT = pd()->padT(); + const int padL = pd()->padL(); - const bool is_3d = conf_.desc()->diff_src_desc.ndims == 5; + const bool is_3d = pd()->desc()->diff_src_desc.ndims == 5; - auto alg = conf_.desc()->alg_kind; + auto alg = pd()->desc()->alg_kind; auto apply_offset = [=](int index, int offset) { return (index > offset) ? 
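In the average-pooling path above, num_summands controls whether padded positions count toward the mean. Schematically, assuming the usual two mkl-dnn averaging flavors (the bound names below are illustrative, not taken from this file):

    // include_padding: divide by the full window volume;
    // exclude_padding: divide by the number of in-bounds source elements
    const int num_summands = (alg == pooling_avg_include_padding)
        ? KD * KH * KW
        : (id_end - id_start) * (ih_end - ih_start) * (iw_end - iw_start);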
index - offset : 0; @@ -360,13 +356,13 @@ void ref_pooling_bwd_t::execute_backward() { } }; - const int MB = conf_.MB(); - const int OC = conf_.C(); - const int OD = conf_.OD(); - const int OH = conf_.OH(); - const int OW = conf_.OW(); + const int MB = pd()->MB(); + const int OC = pd()->C(); + const int OD = pd()->OD(); + const int OH = pd()->OH(); + const int OW = pd()->OW(); - if (conf_.desc()->alg_kind == alg_kind::pooling_max) { + if (pd()->desc()->alg_kind == alg_kind::pooling_max) { parallel_nd(MB, OC, [&](int mb, int oc) { if (is_3d) ker_zero_3d(mb, oc); else ker_zero(mb, oc); diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_pooling.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_pooling.hpp index b2be03b..ef01167 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_pooling.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_pooling.hpp @@ -67,21 +67,21 @@ struct ref_pooling_fwd_t: public cpu_primitive_t { } }; - ref_pooling_fwd_t(const pd_t *pd, const input_vector &inputs, + ref_pooling_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type data_t; typedef typename prec_traits::type acc_data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward(); e->set_state(event_t::ready); } private: - void execute_forward(); - pd_t conf_; + void execute_forward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; template @@ -120,20 +120,20 @@ struct ref_pooling_bwd_t: public cpu_primitive_t { } }; - ref_pooling_bwd_t(const pd_t *pd, const input_vector &inputs, + ref_pooling_bwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} typedef typename prec_traits::type data_t; typedef typename prec_traits::type acc_data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_backward(); e->set_state(event_t::ready); } private: - void execute_backward(); - pd_t conf_; + void execute_backward() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_rnn.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_rnn.cpp deleted file mode 100644 index 122b424..0000000 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_rnn.cpp +++ /dev/null @@ -1,1192 +0,0 @@ -/******************************************************************************* -* Copyright 2018 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* - General architecture - - for diff states, we have n_states + 1 as we have n_states diff - to propagate to the previous iteration and 1 states to propagate - to the previous layer - index 0 is dh for cell(t-1, l) to consume - index 1 is dc for cell(t-1, l) to consume - index 2 is dh for cell(t, l-1) to consume - this indexing enables to have the same indexing for states in elemwise - function - only the cell execution function should be impacted - - */ - -#include "c_types_map.hpp" -#include "math_utils.hpp" -#include "mkldnn_thread.hpp" -#include "mkldnn_traits.hpp" -#include "type_helpers.hpp" -#include "gemm/gemm.hpp" - -#include "ref_rnn.hpp" - -namespace mkldnn { -namespace impl { -namespace cpu { - -using namespace mkldnn::impl::utils; -using namespace mkldnn::impl::math; -using namespace prop_kind; -using namespace alg_kind; - -#define AOC array_offset_calculator - -inline float one_m_square(float x) { - return (1.0f - x) * (1.0f + x); -} -inline float x_m_square(float x) { - return (1.0f - x) * x; -} - -template <> -float activation( - float dd, float s, float alpha, float cliping) { - return relu_fwd(s, alpha); -} - -template <> -float activation( - float dd, float s, float alpha, float cliping) { - return relu_bwd(dd, s, alpha); -} - -template <> -float activation( - float dd, float s, float alpha, float cliping) { - return tanh_fwd(s); -} - -template <> -float activation( - float dd, float s, float alpha, float cliping) { - return dd * one_m_square(s); -} - -template <> -float activation( - float dd, float s, float alpha, float cliping) { - return logistic_fwd(s); -} - -template <> -float activation( - float dd, float s, float alpha, float cliping) { - return dd * x_m_square(s); -} - -//************************* Cell execution *************************// -/// @todo shall this be templated on activation function to enable svml calls -/// particularly? -template <> -elemwise_sig(_ref_rnn_common_t::rnn_elemwise) { - AOC ws_gates(ws_gates_, batch, conf_.GC()); - AOC bias(bias_, n_gates, dic); - AOC states_t_l(states_t_l_, n_states, iter_stride, batch, wic); - parallel_nd(batch, [&](int i) { - for (int j = 0; j < dic; j++) { - const float h = - activation_func(0, ws_gates(i, j) + bias(0, j), 0, 0); - ws_gates(i, j) = states_t_l(0, 0, i, j) = h; - } - }); -} - -template <> -elemwise_sig(_ref_rnn_common_t::rnn_elemwise) { - AOC ws_gates(ws_gates_, batch, conf_.GC()); - AOC diff_states_tp1_l( - diff_states_tp1_l_, n_states + 1, iter_stride, batch, wic); - AOC diff_states_t_lp1( - diff_states_t_lp1_, n_states + 1, iter_stride, batch, wic); - parallel_nd(batch, [&](int i) { - for (int j = 0; j < dic; ++j) { - const float dH = diff_states_t_lp1(n_states, 0, i, j) - + diff_states_tp1_l(0, 0, i, j); - auto g = ws_gates(i, j); - ws_gates(i, j) = activation_func(dH, g, 0, 0); - } - }); -} - -template <> -elemwise_sig(_ref_rnn_common_t::lstm_elemwise) { - AOC ws_gates(ws_gates_, batch, conf_.GC()); - AOC bias(bias_, n_gates, dic); - AOC states_t_l(states_t_l_, n_states, iter_stride, batch, wic); - AOC states_tm1_l(states_tm1_l_, n_states, iter_stride, batch, wic); - - parallel_nd(batch, [&](int i) { -// WA. 
Loss of correctnes in case of simd loop unrolling with icc 18 -#if !defined(__INTEL_COMPILER) - PRAGMA_OMP_SIMD() -#endif - for (int j = 0; j < dic; j++) { - ws_gates(i, 0 * dic + j) = logistic_fwd(ws_gates(i, 0 * dic + j) + bias(0, j)); - ws_gates(i, 1 * dic + j) = logistic_fwd(ws_gates(i, 1 * dic + j) + bias(1, j)); - ws_gates(i, 2 * dic + j) = tanh_fwd(ws_gates(i, 2 * dic + j) + bias(2, j)); - ws_gates(i, 3 * dic + j) = logistic_fwd(ws_gates(i, 3 * dic + j) + bias(3, j)); - - float tmp = ws_gates(i, 1 * dic + j) * states_tm1_l(1, 0, i, j) - + ws_gates(i, 0 * dic + j) * ws_gates(i, 2 * dic + j); - states_t_l(0, 0, i, j) = ws_gates(i, 3 * dic + j) * tanh_fwd(tmp); - states_t_l(1, 0, i, j) = tmp; - } - }); -} - -template <> -elemwise_sig(_ref_rnn_common_t::lstm_elemwise) { - AOC ws_gates(ws_gates_, batch, conf_.GC()); - AOC bias(bias_, n_gates, dic); - AOC states_t_l(states_t_l_, n_states, iter_stride, batch, wic); - AOC states_tm1_l(states_tm1_l_, n_states, iter_stride, batch, wic); - AOC diff_states_t_l(diff_states_t_l_, n_states + 1, iter_stride, batch, wic); - AOC diff_states_tp1_l( - diff_states_tp1_l_, n_states + 1, iter_stride, batch, wic); - AOC diff_states_t_lp1( - diff_states_t_lp1_, n_states + 1, iter_stride, batch, wic); - - parallel_nd(batch, [&](int i) { - PRAGMA_OMP_SIMD() - for (int j = 0; j < dic; j++) { - float Ct = states_t_l(1, 0, i, j); - /// @todo save it in the workspace in fwd pass or recompute it to - /// save bw - float tanhCt = tanh_fwd(Ct); - // we have 2 incoming diffs on Ht - float dHt = diff_states_tp1_l(0, 0, i, j) - + diff_states_t_lp1(n_states, 0, i, j); - float dCt = diff_states_tp1_l(1, 0, i, j) - + one_m_square(tanhCt) * ws_gates(i, 3 * dic + j) * dHt; - - float dG1 = states_tm1_l(1, 0, i, j) * dCt - * x_m_square(ws_gates(i, 1 * dic + j)); - float dG0 = ws_gates(i, 2 * dic + j) * dCt - * x_m_square(ws_gates(i, 0 * dic + j)); - float dG3 = tanhCt * dHt * x_m_square(ws_gates(i, 3 * dic + j)); - float dG2 = ws_gates(i, 0 * dic + j) * dCt - * one_m_square(ws_gates(i, 2 * dic + j)); - - diff_states_t_l(1, 0, i, j) = dCt * ws_gates(i, 1 * dic + j); - - ws_gates(i, 0 * dic + j) = dG0; - ws_gates(i, 1 * dic + j) = dG1; - ws_gates(i, 2 * dic + j) = dG2; - ws_gates(i, 3 * dic + j) = dG3; - } - }); -} - -template -gemm_sig(_ref_rnn_common_t::packed_gemm) { -#if (USE_MKL_PACKED_GEMM) - cblas_sgemm_compute(CblasColMajor, CblasPacked, - is_B_trans ? CblasTrans : CblasNoTrans, m, n, k, a_, strideA_m, b_, - is_B_trans ? strideB_n : strideB_k, beta, c_, strideC_m); -#else - UNUSED(m); - UNUSED(n); - UNUSED(k); - UNUSED(a_); - UNUSED(b_); - UNUSED(c_); - UNUSED(is_B_trans); - UNUSED(beta); - assert(!"packed gemm is disabled"); -#endif -} - -template -gemm_sig(_ref_rnn_common_t::gemm) { - float alpha = 1.f; - extended_sgemm("N", is_B_trans ? "T" : "N", &m, &n, &k, &alpha, - a_, &strideA_m, b_, is_B_trans ? 
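The deleted packed_gemm path above relies on MKL's pack-once / multiply-many cblas API, compiled only under USE_MKL_PACKED_GEMM. For readers unfamiliar with it, the general shape of that pattern (A, B, C and the leading dimensions here are illustrative):

    // pack A once, reuse it across many sgemm calls, then free it
    float *Ap = cblas_sgemm_alloc(CblasAMatrix, m, n, k);
    cblas_sgemm_pack(CblasColMajor, CblasAMatrix, CblasNoTrans,
            m, n, k, 1.0f, A, lda, Ap);
    cblas_sgemm_compute(CblasColMajor, CblasPacked, CblasNoTrans,
            m, n, k, Ap, lda, B, ldb, beta, C, ldc);
    cblas_sgemm_free(Ap);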
&strideB_n : &strideB_k, &beta, - c_, &strideC_m, nullptr, use_jit_sgemm_); -} - -template -void _ref_rnn_common_t::gates_reduction(int n_gates, int dic, int wic, int batch, - const float *ws_gates_, float *diff_bias_) { - auto body = [&](int i, int k) { - for (int j = 0; j < batch; j++) - diff_bias_[i * dic + k] - += ws_gates_[j * conf_.GC() + i * dic + k]; - }; - - // @todo block k on simd-width -#if MKLDNN_THR == MKLDNN_THR_OMP && _OPENMP >= 201307 \ - /* icc 17.0 has a problem with simd collapse */ \ - && !((defined __INTEL_COMPILER) && (__INTEL_COMPILER == 1700)) -#pragma omp parallel for simd collapse(2) - for (int i = 0; i < n_gates; i++) - for (int k = 0; k < dic; k++) - body(i, k); -#else - parallel_nd(n_gates, dic, body); -#endif -} -/// @todo template this function on fwd or bwd, if the overhead -/// to pass argument for empty function is too big -template <> -cell_execution_sig(_ref_rnn_common_t::cell_execution) { - if (!merge_gemm_layer) { - (this->*gemm_input_func)(n_gates * dic, batch, slc, conf_.WL_GLD(), slc, - batch, wic, conf_.GC(), batch, w_input_[0], states_t_lm1_, - ws_gates_, false, 0.0f); - } - (this->*gemm_state_func)(n_gates * dic, batch, sic, conf_.WI_GLD(), sic, - batch, wic, conf_.GC(), batch, w_state_[0], states_tm1_l_, - ws_gates_, false, 1.0f); - (this->*elemwise_func)(dic, wic, batch, n_states, iter_stride, n_gates, ws_gates_, - states_t_l_, states_t_lm1_, states_tm1_l_, diff_states_t_l_, - diff_states_t_lp1_, diff_states_tp1_l_, bias_, ws_grid_, ws_cell_); -} - -template <> -cell_execution_sig(_ref_rnn_common_t::cell_execution) { - (this->*elemwise_func)(dic, wic, batch, n_states, iter_stride, n_gates, ws_gates_, - states_t_l_, states_t_lm1_, states_tm1_l_, diff_states_t_l_, - diff_states_t_lp1_, diff_states_tp1_l_, bias_, ws_grid_, ws_cell_); - - /// bwd by data on the cell - (this->*gemm_state_func)(sic, batch, n_gates * dic, conf_.WI_GLD(), - n_gates * dic, batch, conf_.GC(), wic, batch, w_state_[0], - ws_gates_, diff_states_t_l_, false, 0.0f); - - if (!merge_gemm_layer) { - (this->*gemm_input_func)(slc, batch, n_gates * dic, conf_.WL_GLD(), - n_gates * dic, batch, conf_.GC(), wic, batch, w_input_[0], - ws_gates_, - diff_states_t_l_ + n_states * iter_stride * (batch * wic), - false, 0.0f); - - /// bwd by weights on the cell - gemm(n_gates * dic, slc, batch, conf_.GC(), batch, wic, batch, - conf_.DWL_GLD(), slc, ws_gates_, states_t_lm1_, diff_w_input_, - true, 1.0f); - } - - if (!merge_gemm_iter) - gemm(n_gates * dic, sic, batch, conf_.GC(), batch, wic, batch, - conf_.DWI_GLD(), sic, ws_gates_, states_tm1_l_, diff_w_state_, - true, 1.0f); - /// bwd by bias we just accumulate diffs from the gates - gates_reduction(n_gates, dic, wic, batch, ws_gates_, diff_bias_); -} - -template <> -cell_execution_sig(_ref_rnn_common_t::cell_execution_gru) { - AOC ws_gates(ws_gates_, batch, conf_.GC()); - AOC bias(bias_, n_gates, dic); - AOC states_t_l(states_t_l_, batch, wic); - AOC states_tm1_l(states_tm1_l_, batch, wic); - - // 1. gemm Wx[0-2],x - if (!merge_gemm_layer) { - (this->*gemm_input_func)(n_gates * dic, batch, slc, conf_.WL_GLD(), slc, - batch, wic, conf_.GC(), batch, w_input_[0], states_t_lm1_, - ws_gates_, false, 0.0f); - } - - // 2. gemm Wh[0-1],h - (this->*gemm_state_func)((n_gates - 1) * dic, batch, sic, conf_.WI_GLD(), - sic, batch, wic, conf_.GC(), batch, w_state_[0], states_tm1_l_, - ws_gates_, false, 1.0f); - - // 3. 
activation zt and rt + elemwise multiplication rt,ht-1 - parallel_nd(batch, [&](int i) { - PRAGMA_OMP_SIMD() - for (int j = 0; j < dic; j++) { - ws_gates(i, 0 * dic + j) = logistic_fwd(ws_gates(i, 0 * dic + j) + bias(0, j)); - ws_gates(i, 1 * dic + j) = logistic_fwd(ws_gates(i, 1 * dic + j) + bias(1, j)); - states_t_l(i, j) = states_tm1_l(i, j) * ws_gates(i, 1 * dic + j); - } - }); - - // 4. gemm Wh[2],h~t - (this->*gemm_state_func)(dic, batch, sic, conf_.WI_GLD(), sic, batch, wic, - conf_.GC(), batch, w_state_[1], states_t_l_, - &(ws_gates(0, 2 * dic)), false, 1.0f); - - // 5. activation h~t + calculate ht - parallel_nd(batch, [&](int i) { - PRAGMA_OMP_SIMD() - for (int j = 0; j < dic; j++) { - ws_gates(i, 2 * dic + j) = tanh_fwd(ws_gates(i, 2 * dic + j) + bias(2, j)); - states_t_l(i, j) = states_tm1_l(i, j) * ws_gates(i, 0 * dic + j) + - (1.0f - ws_gates(i, 0 * dic + j)) * ws_gates(i, 2 * dic + j); - } - }); -} - -template <> -elemwise_sig(_ref_rnn_common_t::gru_lbr_elemwise) { - bool is_training = conf_.is_training(); - AOC ws_gates(ws_gates_, batch, conf_.GC()); - AOC ws_Wh_b(ws_grid_, batch, dic); - AOC bias(bias_, n_gates + 1, dic); - AOC states_t_l(states_t_l_, batch, wic); - AOC states_tm1_l(states_tm1_l_, batch, wic); - AOC ws_gemm_state(ws_cell_, batch, conf_.GC()); - parallel_nd(batch, [&](int i) { - PRAGMA_OMP_SIMD() - for (int j = 0; j < dic; j++) { - float Wh_b = ws_gemm_state(i, 2 * dic + j) + bias(3, j); - ws_gates(i, 0 * dic + j) = logistic_fwd(ws_gates(i, 0 * dic + j) + - ws_gemm_state(i, j) + bias(0, j)); - ws_gates(i, 1 * dic + j) = logistic_fwd(ws_gates(i, 1 * dic + j) + - ws_gemm_state(i, dic + j) + bias(1, j)); - ws_gates(i, 2 * dic + j) = tanh_fwd(ws_gates(i, 2 * dic + j) + - ws_gates(i, 1 * dic + j) * Wh_b + bias(2, j)); - states_t_l(i, j) = states_tm1_l(i, j) * ws_gates(i, 0 * dic + j) + - (1.0f - ws_gates(i, 0 * dic + j)) * ws_gates(i, 2 * dic + j); - if (is_training) ws_Wh_b(i, j) = Wh_b; - } - }); -} - -template <> -cell_execution_sig(_ref_rnn_common_t::cell_execution_gru_lbr) { - if (!merge_gemm_layer) { - (this->*gemm_input_func)(n_gates * dic, batch, slc, conf_.WL_GLD(), slc, - batch, wic, conf_.GC(), batch, w_input_[0], states_t_lm1_, - ws_gates_, false, 0.0f); - } - (this->*gemm_state_func)(n_gates * dic, batch, sic, conf_.WI_GLD(), sic, - batch, wic, conf_.GC(), batch, w_state_[0], states_tm1_l_, ws_cell_, - false, 0.0f); - (this->*elemwise_func)(dic, wic, batch, n_states, iter_stride, n_gates, ws_gates_, - states_t_l_, states_t_lm1_, states_tm1_l_, diff_states_t_l_, - diff_states_t_lp1_, diff_states_tp1_l_, bias_, ws_grid_, ws_cell_); -} - -template <> -elemwise_sig(_ref_rnn_common_t::gru_lbr_elemwise) { - AOC ws_gates(ws_gates_, batch, conf_.GC()); - AOC states_tm1_l(states_tm1_l_, batch, wic); - AOC diff_states_t_l(diff_states_t_l_, n_states + 1, iter_stride, batch, wic);//dht-1 dxt - AOC diff_states_tp1_l( - diff_states_tp1_l_, n_states + 1, iter_stride, batch, wic); - AOC diff_states_t_lp1( - diff_states_t_lp1_, n_states + 1, iter_stride, batch, wic); - AOC ws_gates_r(ws_cell_, batch, conf_.GC()); - AOC ws_Wh_b(ws_grid_, batch, dic); - - // 1. 
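Summarizing the vanilla-GRU forward cell above in gate terms (u = update, r = reset, c = candidate; '.' denoting the elementwise product), the five numbered steps compute, per element:

    // u = logistic(Wu x + Uu h_prev + bu)      -> ws_gates slot 0
    // r = logistic(Wr x + Ur h_prev + br)      -> ws_gates slot 1
    // c = tanh(Wc x + Uc (r . h_prev) + bc)    -> ws_gates slot 2
    // h = u . h_prev + (1 - u) . c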
calculate dG1 dG2 dG3 - // dG0 = (dht - G2) * dht * (1 - G0) * G0 - // dG1 = (W*h + b) * dG2 * (1 - G1) * G1 - // dG2 = (1 - G0) * dht * (1 - G2*G2) - parallel_nd(batch, [&](int i) { - PRAGMA_OMP_SIMD() - for (int j = 0; j < dic; j++) { - float h = states_tm1_l(i, j); - float dHt = diff_states_tp1_l(0, 0, i, j) - + diff_states_t_lp1(n_states, 0, i, j); - float dG0 = (h - ws_gates(i, 2 * dic + j)) * dHt - * x_m_square(ws_gates(i, 0 * dic + j)); - float dG2 = (1.0f - ws_gates(i, 0 * dic + j)) - * one_m_square(ws_gates(i, 2 * dic + j)) * dHt; - float dG1 = ws_Wh_b(i, j) * dG2 - * x_m_square(ws_gates(i, 1 * dic + j)); - - diff_states_t_l(0, 0, i, j) = dHt * ws_gates(i, 0 * dic + j); - ws_gates(i, 2 * dic + j) = dG2; - ws_gates_r(i, 2 * dic + j) = dG2 * ws_gates(i, 1 * dic + j); - ws_gates(i, 0 * dic + j) = ws_gates_r(i, 0 * dic + j) = dG0; - ws_gates(i, 1 * dic + j) = ws_gates_r(i, 1 * dic + j) = dG1; - } - }); -} - -template <> -cell_execution_sig(_ref_rnn_common_t::cell_execution_gru_lbr) { - AOC diff_bias(diff_bias_, n_gates + 1, dic); - AOC ws_gates_r(ws_cell_, batch, conf_.GC()); - - (this->*elemwise_func)(dic, wic, batch, n_states, iter_stride, n_gates, ws_gates_, - states_t_l_, states_t_lm1_, states_tm1_l_, diff_states_t_l_, - diff_states_t_lp1_, diff_states_tp1_l_, bias_, ws_grid_, ws_cell_); - - if (!merge_gemm_layer) { - // dx = dG * Wx^t - (this->*gemm_input_func)(slc, batch, n_gates * dic, conf_.WL_GLD(), - n_gates * dic, batch, conf_.GC(), wic, batch, w_input_[0], - ws_gates_, - diff_states_t_l_ + n_states * iter_stride * (batch * wic), - false, 0.0f); - // dWx += dG^t * x - gemm(n_gates * dic, slc, batch, conf_.GC(), batch, wic, batch, - conf_.DWL_GLD(), slc, ws_gates_, states_t_lm1_, diff_w_input_, - true, 1.0f); - } - // dh += dGr * Wh^t - (this->*gemm_state_func)(sic, batch, n_gates * dic, conf_.WI_GLD(), - n_gates * dic, batch, conf_.GC(), wic, batch, w_state_[0], ws_cell_, - diff_states_t_l_, false, 1.0f); - - // dWh += dGr^t * h - gemm(n_gates * dic, sic, batch, conf_.GC(), batch, wic, batch, - conf_.DWL_GLD(), sic, ws_cell_, states_tm1_l_, diff_w_state_, true, - 1.0f); - - // db1-3 += e * dG - // db4 += e * (r * dG2) - gates_reduction(n_gates, dic, wic, batch, ws_gates_, diff_bias_); - - parallel_nd(dic, [&](int j) { - for (int i = 0; i < batch; i++) { - diff_bias_[3 * dic + j] += ws_gates_r(i, 2 *dic + j); - } - }); -} - -template <> -cell_execution_sig(_ref_rnn_common_t::cell_execution_gru) { - AOC ws_gates(ws_gates_, batch, conf_.GC()); - AOC states_tm1_l(states_tm1_l_, batch, wic); - AOC diff_states_t_l(diff_states_t_l_, n_states + 1, iter_stride, batch, wic);//dht-1 dxt - AOC diff_w_state(diff_w_state_, sic, conf_.GC()); - AOC diff_states_tp1_l( - diff_states_tp1_l_, n_states + 1, iter_stride, batch, wic); - AOC diff_states_t_lp1( - diff_states_t_lp1_, n_states + 1, iter_stride, batch, wic); - //use state memory for intermediate computations - float *dhG1_ = &(diff_states_t_l(n_states, 0, 0, 0)); - float *hG1_ = dhG1_; - AOC dhG1(dhG1_, batch, wic); - AOC hG1(hG1_, batch, wic); - - // 1. 
calculate dG2, dG1, and part of dht-1 - // dG2^ = dh * (1 - G0) * (1 - G2^2) - // dG0^ = dh * (ht-1 - G2) * u * (1 - G0) - // dht-1 (part) = dh * G0 - parallel_nd(batch, [&](int i) { - PRAGMA_OMP_SIMD() - for (int j = 0; j < dic; j++) { - float h = states_tm1_l(i, j); - float dHt = diff_states_tp1_l(0, 0, i, j) - + diff_states_t_lp1(n_states, 0, i, j); - float dG2 = (1.0f - ws_gates(i, 0 * dic + j)) * dHt - * one_m_square(ws_gates(i, 2 * dic + j)); - float dG0 = (h - ws_gates(i, 2 * dic + j)) * dHt - * x_m_square(ws_gates(i, 0 * dic + j)); - - diff_states_t_l(0, 0, i, j) = dHt * ws_gates(i, 0 * dic + j); - ws_gates(i, 0 * dic + j) = dG0; - ws_gates(i, 2 * dic + j) = dG2; - } - }); - - //2. calculate intermediate d(hG1) - //d(hG1) = dG2 * W2h^t - (this->*gemm_state_func)(sic, batch, dic, conf_.WI_GLD(), n_gates * dic, - batch, conf_.GC(), wic, batch, w_state_[1], &(ws_gates(0, 2 * dic)), - dhG1_, false, 0.0f); - - //3. calculate dG1^ and part of dht-1 - //dG1^ = d(hG1) * h * G1 * (1 - G1) - //dht-1 (part) += d(hG1) * G1 - //h * G1 (required for dWh) - parallel_nd(batch, [&](int i) { - PRAGMA_OMP_SIMD() - for (int j = 0; j < dic; j++) { - float h = states_tm1_l(i, j); - float G1 = ws_gates(i, 1 * dic + j); - diff_states_t_l(0, 0, i, j) += dhG1(i, j) * G1; - ws_gates(i, 1 * dic + j) = dhG1(i, j) * h * x_m_square(G1); - hG1(i, j) = G1 * h; - } - }); - - //4. calculate diff weights - //dWh1 += dG1 * h, dWh2 += dG2 * h, dWh3 += dG3 * (G1(*)h) - gemm((n_gates - 1) * dic, sic, batch, conf_.GC(), batch, wic, batch, - conf_.DWI_GLD(), sic, ws_gates_, states_tm1_l_, diff_w_state_, true, - 1.0f); - gemm(dic, sic, batch, conf_.GC(), batch, wic, batch, conf_.DWI_GLD(), sic, - &(ws_gates(0, 2 * dic)), hG1_, &(diff_w_state(0, 2 * dic)), true, - 1.0f); - - //5. calculate diff states - //dht-1 += dG1 * W1h + dG0 * W0h - (this->*gemm_state_func)(sic, batch, (n_gates - 1) * dic, conf_.WI_GLD(), - n_gates * dic, batch, conf_.GC(), wic, batch, w_state_[0], - ws_gates_, diff_states_t_l_, false, 1.0f); - - if (!merge_gemm_layer) { - //dWx += [dG0 dG1 dG2] * [x] - gemm(n_gates * dic, slc, batch, conf_.GC(), batch, wic, batch, - conf_.DWL_GLD(), slc, ws_gates_, states_t_lm1_, diff_w_input_, - true, 1.0f); - //dx = dG2 * W2x + dG1 * W1x + dG0 * W0x - (this->*gemm_input_func)(slc, batch, n_gates * dic, conf_.WL_GLD(), - n_gates * dic, batch, conf_.GC(), wic, batch, w_input_[0], - ws_gates_, &(diff_states_t_l(n_states, 0, 0, 0)), false, 0.0f); - } - - //6. 
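A quick chain-rule check of the dG0 expression in the step-1 comment above: with h_t = G0 . h_prev + (1 - G0) . G2 and G0 = logistic(a0),

    // dh_t/dG0 = h_prev - G2
    // dG0/da0  = G0 * (1 - G0) = x_m_square(G0)
    // => da0   = dHt * (h_prev - G2) * x_m_square(G0)

which is exactly the value the parallel_nd body stores back into ws_gates slot 0.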
calculate diff bias - gates_reduction(n_gates, dic, wic, batch, ws_gates_, diff_bias_); -} - -//*************** Grid computations strategy: linear ***************// -template -grid_execution_sig(_ref_rnn_common_t::linear_execution) { - AOC ws_states(ws_states_, n_layer + 1, n_direction, n_states, n_iter + 1, - batch * wic); - AOC ws_diff_states(ws_diff_states_, n_layer + 1, n_direction, (n_states + 1), - n_iter + 1, batch * wic); - AOC ws_gates( - ws_gates_, n_layer, n_direction, n_iter, batch * conf_.GC()); - AOC weights_input(weights_input_, n_layer, n_direction, - n_parts_wei_i); - AOC weights_states(weights_states_, n_layer, n_direction, - n_parts_wei_st); - AOC bias(bias_, n_layer, n_direction, n_bias * dic); - AOC diff_weights_layer( - diff_weights_layer_, n_layer, n_direction, slc * conf_.DWL_GLD()); - AOC diff_weights_iter( - diff_weights_iter_, n_layer, n_direction, sic * conf_.DWI_GLD()); - AOC diff_bias(diff_bias_, n_layer, n_direction, n_bias * dic); - AOC ws_grid(ws_grid_, n_layer, n_direction, n_iter, ws_per_cell); - - // We run the grid of computation - for (int dir = 0; dir < n_direction; dir++) { - for (int j = 0; j < n_layer; j++) { - int lay = (aprop == prop_kind::forward) ? j : n_layer - j - 1; - if ((aprop == prop_kind::forward) && merge_gemm_layer) { - /* Assumption: merge_gemm_layer happens only on forward */ - (this->*gemm_input_func)(n_gates * dic, batch * n_iter, slc, - conf_.WL_GLD(), slc, batch * n_iter, wic, conf_.GC(), - batch * n_iter, weights_input(lay, dir, 0), - &(ws_states(lay, dir, 0, 1, 0)), - &(ws_gates(lay, dir, 0, 0)), false, 0.0f); - } - for (int i = 0; i < n_iter; i++) { - int iter = (aprop == prop_kind::forward) ? i : n_iter - i - 1; - (this->*cell_func)(dic, slc, sic, wic, batch, n_gates, n_states, n_iter + 1, - &(ws_states(lay + 1, dir, 0, iter + 1, 0)), - &(ws_diff_states(lay, dir, 0, iter, 0)), - &(weights_input(lay, dir, 0)), - &(weights_states(lay, dir, 0)), - &(bias(lay, dir, 0)), - &(ws_states(lay, dir, 0, iter + 1, 0)), - &(ws_states(lay + 1, dir, 0, iter, 0)), - &(ws_diff_states(lay + 1, dir, 0, iter, 0)), - &(ws_diff_states(lay, dir, 0, iter + 1, 0)), - &(diff_weights_layer(lay, dir, 0)), - &(diff_weights_iter(lay, dir, 0)), - &(diff_bias(lay, dir, 0)), - &(ws_gates(lay, dir, iter, 0)), - &(ws_grid(lay, dir, iter, 0)), - ws_cell_); - } - if ((aprop == prop_kind::backward) && merge_gemm_layer) { - (this->*gemm_input_func)(slc, batch * n_iter, n_gates * dic, - conf_.WL_GLD(), n_gates * dic, batch * n_iter, - conf_.GC(), wic, batch * n_iter, - weights_input(lay, dir, 0), &(ws_gates(lay, dir, 0, 0)), - &(ws_diff_states(lay, dir, n_states, 0, 0)), false, - 0.0f); - gemm(n_gates * dic, slc, batch * n_iter, conf_.GC(), - batch * n_iter, wic, batch * n_iter, conf_.DWL_GLD(), - slc, &(ws_gates(lay, dir, 0, 0)), - &(ws_states(lay, dir, 0, 1, 0)), - &(diff_weights_layer(lay, dir, 0)), true, 1.0f); - } - if ((aprop == prop_kind::backward) && merge_gemm_iter) { - gemm(n_gates * dic, sic, batch * n_iter, conf_.GC(), - batch * n_iter, wic, batch * n_iter, conf_.DWI_GLD(), - sic, &(ws_gates(lay, dir, 0, 0)), - &(ws_states(lay + 1, dir, 0, 0, 0)), - &(diff_weights_iter(lay, dir, 0)), true, 1.0f); - } - } - } -} - -//********* GRID computations strategy: utility functions **********// - -template <> -void _ref_rnn_common_t::copy_init_layer(bool lr, bool rl, - int n_layer, int n_direction, int n_iter, int batch, int slc, int dic, - int dlc, int wic, int n_states, float *ws_states_, - float *ws_diff_states_, const float *xt_, - const float 
*diff_dst_layer_) { - AOC ws_states( - ws_states_, n_direction, n_states, n_iter + 1, batch, wic); - auto xt_d = memory_desc_wrapper(conf_.src_pd(0)); - - parallel_nd(n_iter, [&](int it) { - auto xxt = xt_ + xt_d.blk_off(it); - if (lr) - for (int b = 0; b < batch; b++) - for (int c = 0; c < slc; c++) - ws_states(0, 0, it + 1, b, c) = *(xxt + b * slc + c); - if (rl) - for (int b = 0; b < batch; b++) - for (int c = 0; c < slc; c++) - ws_states(n_direction - 1, 0, n_iter - it, b, c) - = *(xxt + b * slc + c); - }); -} - -template <> -void _ref_rnn_common_t::copy_init_layer(bool lr, bool rl, - int n_layer, int n_direction, int n_iter, int batch, int slc, int dic, - int dlc, int wic, int n_states, float *ws_states_, - float *ws_diff_states_, const float *xt_, - const float *diff_dst_layer_) { - AOC ws_diff_states(ws_diff_states_, n_layer + 1, n_direction, - (n_states + 1), n_iter + 1, batch, wic); - auto diff_dst_layer_d = memory_desc_wrapper(conf_.diff_dst_pd(0)); - - switch (conf_.direction()) { - case mkldnn_bidirectional_concat: - parallel_nd(n_iter, batch, [&](int it, int b) { - auto diff_dst_layer_x - = diff_dst_layer_ + diff_dst_layer_d.blk_off(it, b); - for (int s = 0; s < dic; s++) { - ws_diff_states(n_layer, 0, n_states, it, b, s) - = diff_dst_layer_x[s]; - ws_diff_states(n_layer, 1, n_states, n_iter - it - 1, b, s) - = diff_dst_layer_x[dic + s]; - } - }); - break; - case mkldnn_bidirectional_sum: - parallel_nd(n_iter, batch, [&](int it, int b) { - auto diff_dst_layer_x - = diff_dst_layer_ + diff_dst_layer_d.blk_off(it, b); - for (int s = 0; s < dic; s++) { - ws_diff_states(n_layer, 0, n_states, it, b, s) - = diff_dst_layer_x[s]; - ws_diff_states(n_layer, 1, n_states, n_iter - it - 1, b, s) - = diff_dst_layer_x[s]; - } - }); - break; - case mkldnn_unidirectional_left2right: - parallel_nd(n_iter, batch, [&](int it, int b) { - auto diff_dst_layer_x - = diff_dst_layer_ + diff_dst_layer_d.blk_off(it, b); - for (int s = 0; s < dic; s++) { - ws_diff_states(n_layer, 0, n_states, it, b, s) - = diff_dst_layer_x[s]; - } - }); - break; - case mkldnn_unidirectional_right2left: - parallel_nd(n_iter, batch, [&](int it, int b) { - auto diff_dst_layer_x - = diff_dst_layer_ + diff_dst_layer_d.blk_off(n_iter - it - 1, b); - for (int s = 0; s < dic; s++) { - ws_diff_states(n_layer, 0, n_states, it, b, s) - = diff_dst_layer_x[s]; - } - }); - break; - default: - assert(!"Unsupported direction"); - break; - } -} - -template <> -void _ref_rnn_common_t::copy_init_iter(int n_layer, - int n_direction, int n_states, int batch, int sic, int dic, int wic, - int n_iter, float *ws_states_, float *ws_diff_states_, - const float *firstit_states_, const float *diff_dst_iter_) { - AOC ws_states(ws_states_, n_layer + 1, n_direction, n_states, - n_iter + 1, batch, wic); - auto firstit_states_d = memory_desc_wrapper(conf_.src_pd(1)); - if (firstit_states_) { - parallel_nd(n_layer, n_direction, [&](int lay, int dir) { - for (int state = 0; state < n_states; state++) - for (int b = 0; b < batch; ++b) { - array_copy(&(ws_states(lay + 1, dir, state, 0, b, 0)), - firstit_states_ + firstit_states_d.blk_off( - lay, dir, state, b), sic); - } - }); - } else { - parallel_nd(n_layer, n_direction, [&](int lay, int dir) { - for (int state = 0; state < n_states; state++) - for (int i = 0; i < batch; i++) - for (int j = 0; j < sic; j++) - ws_states(lay + 1, dir, state, 0, i, j) = 0.0f; - }); - } -} - -template <> -void _ref_rnn_common_t::copy_init_iter(int n_layer, - int n_direction, int n_states, int batch, int sic, int dic, int wic, - 
int n_iter, float *ws_states_, float *ws_diff_states_, - const float *firstit_states_, const float *diff_dst_iter_) { - AOC ws_diff_states(ws_diff_states_, n_layer + 1, n_direction, - n_states + 1, n_iter + 1, batch, wic); - auto diff_dst_iter_d = memory_desc_wrapper(conf_.diff_dst_pd(1)); - if (diff_dst_iter_) { - parallel_nd(n_layer, n_direction, n_states, batch, - [&](int lay, int dir, int state, int b) { - array_copy(&(ws_diff_states(lay, dir, state, n_iter, b, 0)), - diff_dst_iter_ + diff_dst_iter_d.blk_off(lay, dir, state, b), - dic); - }); - } else { - parallel_nd(n_layer, n_direction, n_states, batch, - [&](int lay, int dir, int state, int i) { - for (int j = 0; j < dic; j++) - ws_diff_states(lay, dir, state, n_iter, i, j) = 0.0f; - }); - } -} - -template <> -void _ref_rnn_common_t::copy_res_layer(bool lr, bool rl, - int n_layer, int n_direction, int n_iter, int batch, - int n_output_features, int slc, int dic, int wic, int n_states, - mkldnn_rnn_direction_t direction, float *dst_layer_, - float *diff_src_layer, const float *ws_states_, - const float *ws_diff_states_) { - auto dst_layer_d = memory_desc_wrapper(conf_.dst_pd(0)); - AOC ws_states(ws_states_, n_layer + 1, n_direction, - n_states, n_iter + 1, batch, wic); - - parallel_nd(n_iter, batch, [&](int it, int b) { - int dir = 0; - if (lr) { - for (int s = 0; s < dic; s++) - dst_layer_[dst_layer_d.blk_off(it, b, dir * dic + s)] - = ws_states(n_layer, dir, 0, it + 1, b, s); - dir = 1; - } - if (rl) { - for (int s = 0; s < dic; s++) - switch (direction) { - case mkldnn_bidirectional_sum: - dst_layer_[dst_layer_d.blk_off(it, b, s)] += ws_states( - n_layer, dir, 0, n_iter - it, b, s); - break; - default: - dst_layer_[dst_layer_d.blk_off(it, b, dir * dic + s)] - = ws_states(n_layer, dir, 0, n_iter - it, b, s); - } - } - }); -} - -template <> -void _ref_rnn_common_t::copy_res_layer(bool lr, bool rl, - int n_layer, int n_direction, int n_iter, int batch, - int n_output_features, int slc, int dic, int wic, int n_states, - mkldnn_rnn_direction_t direction, float *dst_layer_, - float *diff_src_layer_, const float *ws_states_, - const float *ws_diff_states_) { - auto diff_src_layer_d = memory_desc_wrapper(conf_.diff_src_pd(0)); - AOC ws_diff_states(ws_diff_states_, n_layer + 1, - n_direction, n_states + 1, n_iter + 1, batch, wic); - - parallel_nd(n_iter, batch, [&](int it, int b) { - int dir = 0; - for (int s = 0; s < slc; s++) { - float *dst_addr = diff_src_layer_ - + diff_src_layer_d.blk_off( - (direction - == mkldnn_unidirectional_right2left) ? 
- n_iter - 1 - it : - it, - b, dir * slc + s); - float res = ws_diff_states(0, 0, n_states, it, b, s); - if (n_direction - 1) - res += ws_diff_states( - 0, 1, n_states, n_iter - 1 - it, b, s); - dst_addr[0] = res; - } - }); -} - -template <> -void _ref_rnn_common_t::copy_res_iter(int n_layer, - int n_direction, int n_states, int batch, int sic, int dic, int wic, - int n_iter, float *dst_iter_, float *diff_src_iter_, - const float *ws_states_, const float *ws_diff_states_) { - auto dst_iter_d = memory_desc_wrapper(conf_.dst_pd(1)); - AOC ws_states(ws_states_, n_layer + 1, n_direction, - n_states, n_iter + 1, batch, wic); - if (dst_iter_) { - parallel_nd(n_layer, n_direction, n_states, batch, - [&](int lay, int dir, int state, int b) { - for (int s = 0; s < dic; s++) { - dst_iter_[dst_iter_d.blk_off(lay, dir, state, b, s)] - = ws_states(lay + 1, dir, state, n_iter, b, s); - } - }); - } -} - -template <> -void _ref_rnn_common_t::copy_res_iter(int n_layer, - int n_direction, int n_states, int batch, int sic, int dic, int wic, - int n_iter, float *dst_iter_, float *diff_src_iter_, - const float *ws_states_, const float *ws_diff_states_) { - auto diff_src_iter_d = memory_desc_wrapper(conf_.diff_src_pd(1)); - AOC ws_diff_states(ws_diff_states_, n_layer + 1, - n_direction, n_states + 1, n_iter + 1, batch, wic); - if (diff_src_iter_) { - parallel_nd(n_layer, n_direction, n_states, batch, - [&](int lay, int dir, int state, int b) { - for (int s = 0; s < sic; s++) { - diff_src_iter_[diff_src_iter_d.blk_off( - lay, dir, state, b, s)] - = ws_diff_states(lay, dir, state, 0, b, s); - } - }); - } -} - -template -packing_sig(_ref_rnn_common_t::pack_weights) { -#if (USE_MKL_PACKED_GEMM) - AOC w( - w_, n_layer, n_direction, IC_size, n_gates, OC_size); - AOC weights(weights_, n_layer, n_direction, n_parts); - int m = 0, n = 0, k = 0; - auto transA = CblasNoTrans; - bool is_fwd = aprop == prop_kind::forward; - if (is_fwd) { - m = n_gates * OC_size; - n = batch; - k = IC_size; - //todo: do a transposition if ldgoi - transA = CblasNoTrans; - } else { - m = IC_size; - n = batch; - k = n_gates * OC_size; - //TODO: do a transposition if ldigo - transA = CblasNoTrans; - } - for (int i = 0; i < n_layer; i++) { - for (int d = 0; d < n_direction; d++) { - for (int p = 0; p < n_parts; p++) { - int m_p = is_fwd ? (gates_per_part[p] * OC_size) : m; - int k_p = is_fwd ? k : (gates_per_part[p] * OC_size); - int g = (p > 0) ? gates_per_part[p - 1] : 0; - weights(i, d, p) = cblas_sgemm_alloc(CblasAMatrix, m_p, n, k_p); - cblas_sgemm_pack(CblasColMajor, CblasAMatrix, transA, m_p, n, - k_p, 1.0f, &(w(i, d, 0, g, 0)), m, weights(i, d, p)); - } - } - } -#else - UNUSED(n_layer); - UNUSED(n_direction); - UNUSED(n_weights); - UNUSED(n_gates); - UNUSED(n_parts); - UNUSED(gates_per_part); - UNUSED(batch); - UNUSED(OC_size); - UNUSED(IC_size); - UNUSED(weights_); - UNUSED(w_); - assert(!"packed gemm is disabled"); -#endif -} - -template -packing_sig(_ref_rnn_common_t::no_pack_weights) { - AOC w( - w_, n_layer, n_direction, IC_size * n_gates * OC_size); - AOC weights(weights_, n_layer, n_direction, n_parts); - int m = 0, n = 0, ldA = 0; - - bool is_fwd = aprop == prop_kind::forward; - if (is_fwd) { - m = n_gates * OC_size; - n = IC_size; - ldA = conf_.GC(); - } else { - m = IC_size; - n = n_gates * OC_size; - ldA = conf_.WIC(); - } - - if (!do_copy) { - for (int i=0; i < n_layer; i++) - for (int d = 0; d < n_direction; d++) { - weights(i, d, 0) = (float *) &(w(i, d, 0)); - for (int p = 1; p < n_parts; p++) { - size_t offset = is_fwd - ? 
gates_per_part[p - 1] * OC_size - : gates_per_part[p - 1] * OC_size * IC_size; - weights(i, d, p) = (float *) &w(i, d, offset); - } - } - return; - } - - /* We always assume - - column major - - alpha = 1.0f - */ - auto copy_matrix = [](char trans, int nrows, int ncols, - const float *src, const int ld_src, float *dst, const int ld_dst){ - for (int i = 0; i < ncols; i++) - for (int j = 0; j < nrows; j++) - dst[i * ld_dst + j] = src[i * ld_src + j]; - }; - - AOC tmp(scratch_mem, n_layer, n_direction, ldA * n); - mkldnn::impl::parallel_nd(n_layer, n_direction, [&](int i, int d) { - auto src_mat = &(w(i, d, 0)); - auto dst_mat = &(tmp(i, d, 0)); - copy_matrix('N', m, n, src_mat, m, dst_mat, ldA); - weights(i, d, 0) = &tmp(i, d, 0); - for (int p = 1; p < n_parts; p++) { - size_t offset = is_fwd - ? gates_per_part[p - 1] * OC_size - : gates_per_part[p - 1] * OC_size * conf_.WIC(); - weights(i, d, p) = &tmp(i, d, offset); - } - }); -} - - -template -free_packed_sig(_ref_rnn_common_t::free_packed_weights) { -#if (USE_MKL_PACKED_GEMM) - AOC weights(weights_, n_layer, n_direction, n_parts); - for (int i = 0; i < n_layer; i++) - for (int j = 0; j < n_direction; j++) - for (int k = 0; k < n_parts; k++) - cblas_sgemm_free(weights(i, j, k)); -#else - UNUSED(n_layer); - UNUSED(n_direction); - UNUSED(n_parts); - UNUSED(weights_); - assert(!"packed gemm is disabled"); -#endif -} - -template -free_packed_sig(_ref_rnn_common_t::free_no_packed_weights) { - // IN this case, only scratchpad is used, so no free necessary -} - -//********************* Execution function *********************// -template -void _ref_rnn_common_t::execute_() { - int n_layer = conf_.L(); - int n_direction = conf_.D(); - int n_iter = conf_.T(); - int n_gates = conf_.G(); - int n_bias = n_gates + conf_.is_lbr(); - int n_states = conf_.S(); - int n_weights_input = conf_.SLC(); - int n_weights_state = conf_.SIC(); - int batch = conf_.MB(); - int slc = conf_.SLC(); - int sic = conf_.SIC(); - int dic = conf_.DIC(); - int dlc = conf_.DLC(); - int wic = conf_.WIC(); - - bool is_orig_gru = conf_.cell_kind() - == alg_kind::vanilla_gru; - int n_parts_wei_st = is_orig_gru ? 2 : 1, n_parts_wei_i = 1; - int parts_wei_st = n_gates, parts_wei_i = n_gates, - parts_wei_st_gru[2] = {2, 1}; - bool is_fwd = aprop == prop_kind::forward; - int ws_per_cell = conf_.ws_per_cell(); - - int input_idx = 0; - int output_idx = 0; - auto input - = reinterpret_cast(this->input_memory(input_idx++)); - auto states = conf_.with_src_iter() ? - reinterpret_cast(this->input_memory(input_idx++)) : - nullptr; - auto w_input - = reinterpret_cast(this->input_memory(input_idx++)); - auto w_state - = reinterpret_cast(this->input_memory(input_idx++)); - auto bias = conf_.with_bias() ? - reinterpret_cast(this->input_memory(input_idx++)) : - nullptr; - - auto dst_last_layer = is_fwd ? - reinterpret_cast(this->memory(output_idx++)) : - const_cast(reinterpret_cast( - this->input_memory(input_idx++))); - auto dst_last_iter = conf_.with_dst_iter() ? - (is_fwd ? reinterpret_cast(this->memory(output_idx++)) : - const_cast(reinterpret_cast( - this->input_memory(input_idx++)))) : - nullptr; - - auto diff_dst_layer = is_fwd ? - nullptr : - reinterpret_cast(this->input_memory(input_idx++)); - auto diff_dst_iter = is_fwd || !conf_.with_dst_iter() ? 
- nullptr : - reinterpret_cast(this->input_memory(input_idx++)); - - // fetchihg buffers from the workspace - // if no workspace was provided we use the scratchpad - float *scratch_ptr = ((float *)scratchpad_->get()); - float *ws_ptr = nullptr; - if (use_workspace_) - ws_ptr = is_fwd ? - reinterpret_cast(this->memory(output_idx++)) : - const_cast(reinterpret_cast( - this->input_memory(input_idx++))); - float *base_ptr = use_workspace_ ? ws_ptr : scratch_ptr; - ws_gates_ = base_ptr + ws_gates_offset_; - ws_states_ = base_ptr + ws_states_offset_; - ws_diff_states_ = base_ptr + ws_diff_states_offset_; - ws_grid_ = base_ptr + ws_grid_comp_offset_; - ws_cell_ = base_ptr + ws_cell_comp_offset_; - - auto diff_src_layer = is_fwd ? - nullptr : - reinterpret_cast(this->memory(output_idx++)); - auto diff_src_iter = is_fwd || !conf_.with_src_iter() ? - nullptr : - reinterpret_cast(this->memory(output_idx++)); - auto diff_weights_layer = is_fwd ? - nullptr : - reinterpret_cast(this->memory(output_idx++)); - auto diff_weights_iter = is_fwd ? - nullptr : - reinterpret_cast(this->memory(output_idx++)); - auto diff_bias = is_fwd || !conf_.with_bias() ? - nullptr : - reinterpret_cast(this->memory(output_idx++)); - - // Fetching extra buffers from scratchpad - ws_weights_layer_ = scratch_ptr + ws_weights_layer_offset_; - ws_weights_iter_ = scratch_ptr + ws_weights_iter_offset_; - ws_diff_weights_layer_ = scratch_ptr + ws_diff_weights_layer_offset_; - ws_diff_weights_iter_ = scratch_ptr + ws_diff_weights_iter_offset_; - - -// initialize diff_states to 0 - if (aprop == prop_kind::backward) { - array_set(ws_diff_states_, 0.0f, conf_.ws_diff_states_size()); - // TODO: add a variable to check if good_ld_copy is necessary - if (copy_diff_weights_layer_) { - parallel_nd(conf_.ws_diff_weights_layer_size(), [&](size_t i) { - ws_diff_weights_layer_[i] = 0.; - }); - } else - ws_diff_weights_layer_ = diff_weights_layer; - if (copy_diff_weights_iter_) { - parallel_nd(conf_.ws_diff_weights_iter_size(), [&](size_t i) { - ws_diff_weights_iter_[i] = 0.; - }); - } else - ws_diff_weights_iter_ = diff_weights_iter; - } - - // TODO: implement without copies - bool is_lr = !one_of(exec_dir, b2t_r2l, t2b_r2l); - bool is_rl = !one_of(exec_dir, b2t_l2r, t2b_l2r); - // we pack the weights if we are using the packed API - (this->*weights_state_pack_func)(n_layer, n_direction, n_weights_state, - n_gates, batch, dic, sic, ptr_wei_state_, n_parts_wei_st, - (is_orig_gru ? 
parts_wei_st_gru : &parts_wei_st), w_state, - ws_weights_iter_, copy_weights_iter_); - (this->*weights_input_pack_func)(n_layer, n_direction, n_weights_input, - n_gates, batch, dic, slc, ptr_wei_input_, n_parts_wei_i, - &parts_wei_i, w_input, - ws_weights_layer_, copy_weights_layer_); - - // we first need to copy the initial states and input into ws - copy_init_layer(is_lr, is_rl, n_layer, n_direction, n_iter, batch, slc, dic, - dlc, wic, n_states, ws_states_, ws_diff_states_, input, - diff_dst_layer); - copy_init_iter(n_layer, n_direction, n_states, batch, sic, dic, wic, n_iter, - ws_states_, ws_diff_states_, states, diff_dst_iter); - - // run the execution on the grid - (this->*grid_computation)(dic, slc, sic, wic, batch, n_layer, n_direction, - n_iter, n_gates, n_states, n_bias, ptr_wei_input_, n_parts_wei_i, - ptr_wei_state_, n_parts_wei_st, (float *)bias, ws_states_, - ws_diff_states_, ws_gates_, ws_cell_, ws_grid_, ws_per_cell, - ws_diff_weights_layer_, ws_diff_weights_iter_, diff_bias); - - // Finally we copy the results to the result buffers - copy_res_layer(is_lr, is_rl, n_layer, n_direction, n_iter, batch, - n_output_features, slc, dic, wic, n_states, conf_.direction(), - dst_last_layer, diff_src_layer, ws_states_, ws_diff_states_); - copy_res_iter(n_layer, n_direction, n_states, batch, sic, dic, wic, n_iter, - dst_last_iter, diff_src_iter, ws_states_, ws_diff_states_); - - // copy of the diff weights if bwd - if (aprop == prop_kind::backward){ - // TODO: write an impl of matcopy in MKL-DNN - // TODO: support ldgoi using the trans parameters - AOC diff_weights_layer_aoc(diff_weights_layer, n_layer, n_direction, slc * n_gates * dic); - AOC diff_weights_iter_aoc(diff_weights_iter, n_layer, n_direction, sic * n_gates * dic); - AOC ws_diff_weights_layer_aoc(ws_diff_weights_layer_, n_layer, n_direction, slc * conf_.GC()); - AOC ws_diff_weights_iter_aoc(ws_diff_weights_iter_, n_layer, n_direction, sic * conf_.GC()); - - /* - - assumes column major and non transposed matrices - - computes B = A + B - */ - auto inplace_matadd = [=](const int nrows, const int ncols, - const float *A, const int ldA, float *B, const int ldB){ - for(int i = 0; i < ncols; i++) - for(int j = 0; j < nrows; j++) - B[i * ldB + j] += A[i * ldA + j]; - }; - mkldnn::impl::parallel_nd(n_layer, n_direction, [&](int i, int d) { - auto wei_lay = &(diff_weights_layer_aoc(i, d, 0)); - auto wei_it = &(diff_weights_iter_aoc(i, d, 0)); - auto ws_wei_lay = &(ws_diff_weights_layer_aoc(i, d, 0)); - auto ws_wei_it = &(ws_diff_weights_iter_aoc(i, d, 0)); - if (copy_diff_weights_layer_) - inplace_matadd(n_gates*dic, slc, ws_wei_lay, conf_.GC(), - wei_lay, n_gates*dic); - if (copy_diff_weights_iter_) - inplace_matadd(n_gates*dic, sic, ws_wei_it, conf_.GC(), - wei_it, n_gates*dic); - }); - } - - // We free the packed weights if they were packed internally - (this->*weights_state_free_packed_func)(n_layer, n_direction, - n_parts_wei_st, ptr_wei_state_); - (this->*weights_input_free_packed_func)(n_layer, n_direction, - n_parts_wei_i, ptr_wei_input_); -}; - -template struct _ref_rnn_common_t; -template struct _ref_rnn_common_t; - -#undef AOC -} -} -} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_rnn.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_rnn.hpp deleted file mode 100644 index 703aa18..0000000 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_rnn.hpp +++ /dev/null @@ -1,440 +0,0 @@ -/******************************************************************************* -* Copyright 2018 Intel Corporation -* -* 
Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef CPU_REF_RNN_HPP -#define CPU_REF_RNN_HPP - -#include - -#include "c_types_map.hpp" -#include "cpu_engine.hpp" -#include "cpu_rnn_pd.hpp" -#include "cpu_isa_traits.hpp" -#include "scratchpad.hpp" -#include "type_helpers.hpp" -#include "utils.hpp" - -#include "gemm/os_blas.hpp" - -namespace mkldnn { -namespace impl { -namespace cpu { - -#define elemwise_sig(f) \ - void f(int dic, int wic, int batch, int n_states, int iter_stride, int n_gates, \ - float *ws_gates_, float *states_t_l_, float *states_t_lm1_, \ - float *states_tm1_l_, float *diff_states_t_l_, \ - float *diff_states_t_lp1_, float *diff_states_tp1_l_, \ - const float *bias_, float *ws_grid_, float *ws_cell_) - -#define cell_execution_sig(f) \ - void f(int dic, int slc, int sic, int wic, int batch, int n_gates, \ - int n_states, int iter_stride, float *states_t_l_, float *diff_states_t_l_, \ - float **w_input_, float **w_state_, const float *bias_, \ - float *states_t_lm1_, float *states_tm1_l_, \ - float *diff_states_t_lp1_, float *diff_states_tp1_l_, \ - float *diff_w_input_, float *diff_w_state_, float *diff_bias_, \ - float *ws_gates_, float *ws_grid_, float *ws_cell_) - -#define grid_execution_sig(f) \ - void f(int dic, int slc, int sic, int wic, int batch, int n_layer, \ - int n_direction, int n_iter, int n_gates, int n_states, \ - int n_bias, float **weights_input_, int n_parts_wei_i, \ - float **weights_states_, int n_parts_wei_st, \ - const float *bias_, float *ws_states_, float *ws_diff_states_, \ - float *ws_gates_, float *ws_cell_, float *ws_grid_, \ - int ws_per_cell, float *diff_weights_layer_, \ - float *diff_weights_iter_, float *diff_bias_) - -#define gemm_sig(f) \ - void f(int m, int n, int k, int strideA_m, int strideA_k, int strideB_n, \ - int strideB_k, int strideC_m, int strideC_n, const float *a_, \ - float *b_, float *c_, bool is_B_trans, float beta) - -#define packing_sig(f) \ - void f(int n_layer, int n_direction, int n_weights, int n_gates, \ - int batch, int OC_size, int IC_size, float **weights_, \ - int n_parts, int *gates_per_part, const float *w_, \ - float * scratch_mem, bool do_copy) - -#define free_packed_sig(f) void f(int n_layer, int n_direction, int n_parts, \ - float **weights_) - -template -float activation(float s, float alpha, float cliping, float dd); - -template -struct _ref_rnn_common_t : public cpu_primitive_t { - using class_name = _ref_rnn_common_t; - typedef enum execution_direction_ { - b2t_l2r, - b2t_r2l, - b2t_bi_concat, - b2t_bi_sum, - t2b_l2r, - t2b_r2l, - t2b_bi_concat, - t2b_bi_sum - } execution_direction; - typedef elemwise_sig((class_name::*elemwise_f)); - typedef cell_execution_sig((class_name::*cell_execution_f)); - typedef grid_execution_sig((class_name::*grid_execution_f)); - - typedef gemm_sig((class_name::*gemm_t)); - typedef packing_sig((class_name::*packing_t)); - typedef free_packed_sig((class_name::*free_packed_t)); - - using base_pd_t 
= - typename utils::conditional::type; - - struct pd_t : public base_pd_t { - pd_t(engine_t *engine, const rnn_desc_t *adesc, - const primitive_attr_t *attr, - const typename pd_t::base_class *hint_pd) - : base_pd_t(engine, adesc, attr, hint_pd) {} - - DECLARE_COMMON_PD_T("ref:any", class_name); - - status_t init() { - using namespace prop_kind; - using namespace utils; - using namespace memory_format; - assert(this->engine()->kind() == engine_kind::cpu); - const alg_kind_t cell_kind = this->desc()->cell_desc.cell_kind; - - bool ok = true - && one_of(cell_kind, alg_kind::vanilla_rnn, - alg_kind::vanilla_lstm, alg_kind::vanilla_gru, - alg_kind::gru_linear_before_reset) - && IMPLICATION(aprop == prop_kind::forward, - one_of(this->desc()->prop_kind, forward_training, - forward_inference)) - && IMPLICATION(aprop == backward, - one_of(this->desc()->prop_kind, backward)) - && this->set_default_params() == status::success; - if (!ok) - return status::unimplemented; - - ok = ok && utils::one_of(cell_kind, alg_kind::vanilla_rnn, - alg_kind::vanilla_lstm, alg_kind::vanilla_gru, - alg_kind::gru_linear_before_reset); - - /// @todo check data layouts for all input tensors - ok = ok && this->desc()->src_layer_desc.format == tnc - && this->desc()->dst_layer_desc.format == tnc; - - ok = ok && this->with_bias(); - switch (aprop) { - case (prop_kind::forward): - ok = ok && utils::one_of(this->desc()->prop_kind, - forward_training, forward_inference); - ok = ok && utils::one_of( - this->desc()->weights_layer_desc.format, any, - ldigo, ldigo_p) - && utils::one_of(this->desc()->weights_iter_desc.format, - any, ldigo, ldigo_p); - break; - case (prop_kind::backward): - ok = ok && utils::one_of(this->desc()->prop_kind, backward); - ok = ok && utils::one_of( - this->desc()->weights_layer_desc.format, any, - ldgoi, ldgoi_p) - && utils::one_of(this->desc()->weights_iter_desc.format, - any, ldgoi, ldgoi_p); - break; - default: ok = false; - } - - // Check dimensions consistency - int ls_multiplier - = (this->direction() == mkldnn_bidirectional_concat) ? 2 : - 1; - - ok = ok && (ls_multiplier * this->DIC() == this->DLC()) - && ((ls_multiplier * this->SLC()) == this->DLC() - || (this->L() == 1)) - && (this->SIC() == this->DIC() || (this->T() == 1)); - - // initialize the workspace_pd if needed - if (this->desc()->prop_kind != forward_inference){ - dims_t ws_dims = { (dim_t)this->get_ws_size() }; - memory_desc_t ws_d; - mkldnn_memory_desc_init( - &ws_d, 1, ws_dims, impl::data_type::f32, memory_format::x); - this->ws_pd_ = cpu_memory_t::pd_t(this->engine(), &ws_d); - } - - return ok ? 
status::success : status::unimplemented; - } - }; - - _ref_rnn_common_t(const pd_t *pd, const input_vector &inputs, - const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) { - /// @todo set max_feature_size assuming that we limit the number of - /// iterations and layer to one if slc != dic and sic != dic - /// respectively - - memory_format_t packed_format; - switch (aprop) { - case prop_kind::forward_inference: - case prop_kind::forward_training: - packed_format = memory_format::ldigo_p; - break; - case prop_kind::backward: packed_format = memory_format::ldgoi_p; break; - default: assert(false); - } - - merge_gemm_layer = ((aprop == prop_kind::forward) && (conf_.MB() < 128)) - || (aprop == prop_kind::backward); - merge_gemm_iter = (aprop == prop_kind::backward) - && (!utils::one_of(conf_.cell_kind(), alg_kind::vanilla_gru, - alg_kind::gru_linear_before_reset)); - auto set_pack_funcs = [](bool packed_gemm, gemm_t &g, bool pack_w, - packing_t &p, free_packed_t &f) { - g = packed_gemm ? &class_name::packed_gemm : &class_name::gemm; - p = pack_w ? &class_name::pack_weights : - &class_name::no_pack_weights; - f = pack_w ? &class_name::free_packed_weights : - &class_name::free_no_packed_weights; - }; -#ifdef USE_MKL_PACKED_GEMM - const bool weights_pack_cond = - (conf_.T() > 1) && (conf_.MB() == 32) && - (conf_.SIC() == 512) &&(conf_.SLC() == 512) && (conf_.DIC() == 512); -#else - const bool weights_pack_cond = false; -#endif - - const bool is_weights_state_packed = conf_.desc()->weights_iter_desc.format == packed_format; - set_pack_funcs(weights_pack_cond || is_weights_state_packed, - gemm_state_func, weights_pack_cond && !is_weights_state_packed, - weights_state_pack_func, weights_state_free_packed_func); - - const bool is_weights_input_packed = conf_.desc()->weights_layer_desc.format == packed_format; - set_pack_funcs(weights_pack_cond || is_weights_input_packed, - gemm_input_func, weights_pack_cond && !is_weights_input_packed, - weights_input_pack_func, weights_input_free_packed_func); - - switch (conf_.cell_kind()) { - case alg_kind::vanilla_lstm: - cell_func = &class_name::cell_execution; - elemwise_func = &class_name::lstm_elemwise; - break; - case alg_kind::vanilla_rnn: // @todo switch on cell kind - cell_func = &class_name::cell_execution; - elemwise_func = &class_name::rnn_elemwise; - switch (conf_.activation_kind()) { - case alg_kind::eltwise_relu: - activation_func = &activation; - break; - case alg_kind::eltwise_tanh: - activation_func = &activation; - break; - case alg_kind::eltwise_logistic: - activation_func = &activation; - break; - default: break; - } - break; - case alg_kind::vanilla_gru: - cell_func = &class_name::cell_execution_gru; - break; - case alg_kind::gru_linear_before_reset: - cell_func = &class_name::cell_execution_gru_lbr; - elemwise_func = &class_name::gru_lbr_elemwise; - break; - default: break; - } - - n_output_features - = (conf_.direction() == mkldnn_bidirectional_concat) ? 2 : 1; - switch (conf_.direction()) { - case mkldnn_unidirectional_left2right: exec_dir = b2t_l2r; break; - case mkldnn_unidirectional_right2left: exec_dir = b2t_r2l; break; - case mkldnn_bidirectional_concat: exec_dir = b2t_bi_concat; break; - case mkldnn_bidirectional_sum: exec_dir = b2t_bi_sum; break; - default: break; - } - - /// @todo put a heuristic to choose between linear execution and - /// wavefront - grid_computation = &class_name::linear_execution; - - // we need to allocate memory for: - // - the states to compute a pass. 
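The constructor above binds gemm_state_func, the weights_*_pack_func pair, cell_func and elemwise_func exactly once, so the per-time-step loop dispatches through member-function pointers instead of re-testing configuration flags. A reduced sketch of that pattern (type and member names are hypothetical, not the library's):

    #include <cstdio>

    struct rnn_exec {
        typedef void (rnn_exec::*gemm_fn)(int m, int n, int k);

        void plain_gemm(int m, int n, int k)  { std::printf("plain %dx%dx%d\n", m, n, k); }
        void packed_gemm(int m, int n, int k) { std::printf("packed %dx%dx%d\n", m, n, k); }

        gemm_fn gemm_func;

        explicit rnn_exec(bool use_packed)
            : gemm_func(use_packed ? &rnn_exec::packed_gemm : &rnn_exec::plain_gemm) {}

        void step(int m, int n, int k) {
            (this->*gemm_func)(m, n, k); // no branch inside the time-step loop
        }
    };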
- // - the intermediate results from the gates. - // - the diff_states to compute the backward pass (training only) - // These should be allocated on scratchpad if fwd inference - // or on a workspace provided by the user for training. - /// @todo shall we require the workspace for training or make it - /// optional? - - // if no worskpace is provided on forward, we use a scratchpad - // NOTE: here we use a large worskpace for simplicity: - // - for states: - // - TODO: allocate only n_iter * dic + dic for linear execution - // (inference) - // - TODO: allocate only n_layer_wav * (2*dic) for wavefront - // execution (inference) - // - for gates: - // - TODO: allocate only batch * n_gates * dic for linear execution - // (inference) - // = TODO: allocate only n_layer_wav * batch * n_gates * dic for - // wavefront execution (inference) - - use_jit_sgemm_ = ((aprop == prop_kind::forward_inference) - || (conf_.is_training() && conf_.DIC() < 500)) - && !mayiuse(avx512_mic); - - copy_weights_layer_ = (conf_.WL_LD() != conf_.WL_GLD()); - copy_weights_iter_ = (conf_.WI_LD() != conf_.WI_GLD()); - - copy_diff_weights_layer_ = (aprop == prop_kind::backward) - && (conf_.DWL_LD() != conf_.DWL_GLD()); - copy_diff_weights_iter_ = (aprop == prop_kind::backward) - && (conf_.DWI_LD() != conf_.DWI_GLD()); - - use_workspace_ = (conf_.desc()->prop_kind != prop_kind::forward_inference); - - size_t scratchpad_size = conf_.set_offsets(use_workspace_, - ws_gates_offset_, ws_states_offset_, ws_diff_states_offset_, - ws_grid_comp_offset_, - conf_.is_lbr(), ws_cell_comp_offset_, - copy_weights_layer_, ws_weights_layer_offset_, - copy_weights_iter_, ws_weights_iter_offset_, - copy_diff_weights_layer_, ws_diff_weights_layer_offset_, - copy_diff_weights_iter_, ws_diff_weights_iter_offset_); - - scratchpad_ = - create_scratchpad(scratchpad_size * sizeof(float)); - - int max_nparts = (conf_.cell_kind() == alg_kind::vanilla_gru) ? 
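conf_.set_offsets(...) below computes one offset per workspace region and returns the total element count, so a single allocation (user workspace or scratchpad) backs ws_gates_, ws_states_, ws_diff_states_ and the rest; execute_() then rebuilds the pointers as base_ptr + offset. The idiom, reduced to three regions with hypothetical sizes:

    #include <cstddef>
    #include <vector>

    // Lay out consecutive regions in one flat float buffer, keeping each
    // region cache-line aligned, and return the total element count.
    static size_t set_offsets(size_t gates_sz, size_t states_sz, size_t grid_sz,
            size_t &gates_off, size_t &states_off, size_t &grid_off) {
        const size_t align = 64 / sizeof(float);
        auto aligned = [&](size_t x) { return (x + align - 1) / align * align; };
        size_t cur = 0;
        gates_off  = cur; cur += aligned(gates_sz);
        states_off = cur; cur += aligned(states_sz);
        grid_off   = cur; cur += aligned(grid_sz);
        return cur; // floats to allocate once
    }

    int main() {
        size_t go, so, ro;
        std::vector<float> scratch(set_offsets(128, 256, 64, go, so, ro));
        float *base = scratch.data();
        float *ws_gates = base + go, *ws_states = base + so, *ws_grid = base + ro;
        (void)ws_gates; (void)ws_states; (void)ws_grid;
    }

Booking everything through one buffer keeps allocation out of the execution path and makes the training workspace layout identical to the inference scratchpad layout.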
2 : 1; - int ptr_wei_sz = conf_.L() * conf_.D() * max_nparts; - ptr_wei_input_ = (float **)malloc(sizeof(float *) * ptr_wei_sz, 64); - ptr_wei_state_ = (float **)malloc(sizeof(float *) * ptr_wei_sz, 64); - } - ~_ref_rnn_common_t() { - delete scratchpad_; - free(ptr_wei_input_); - free(ptr_wei_state_); - } - - // typedef typename prec_traits::type data_t; - - virtual void execute(event_t *e) { - execute_(); - e->set_state(event_t::ready); - } - -private: - void execute_(); - grid_execution_sig(linear_execution); - // grid_execution_sig(wavefront_execution); - cell_execution_sig(cell_execution); - cell_execution_sig(cell_execution_gru); - cell_execution_sig(cell_execution_gru_lbr); - elemwise_sig(rnn_elemwise); - elemwise_sig(lstm_elemwise); - elemwise_sig(gru_lbr_elemwise); - gemm_sig(gemm); - gemm_sig(packed_gemm); - packing_sig(pack_weights); - packing_sig(no_pack_weights); - free_packed_sig(free_packed_weights); - free_packed_sig(free_no_packed_weights); - - float (*activation_func)(float dd, float s, float alpha, float cliping); - - void copy_init_layer(bool lr, bool rl, int n_direction, int n_layer, - int n_iter, int batch, int slc, int dic, int dlc, int wic, - int n_states, float *ws_states_, float *ws_diff_states_, - const float *xt_, const float *diff_dst_layer); - void copy_init_iter(int n_layer, int n_direction, int n_states, int batch, - int sic, int dic, int wic, int n_iter, float *ws_states_, - float *ws_diff_states_, const float *firstit_states_, - const float *diff_dst_iter); - void copy_res_layer(bool lr, bool rl, int n_layer, int n_direction, - int n_iter, int batch, int n_output_features, int slc, int dlc, - int wic, int n_states, mkldnn_rnn_direction_t direction, - float *dst_layer_, float *diff_src_layer, const float *ws_states_, - const float *ws_diff_states_); - void copy_res_iter(int n_layer, int n_direction, int n_states, int batch, - int sic, int dic, int wic, int n_iter, float *dst_iter_, - float *diff_src_iter, const float *ws_states_, - const float *ws_diff_states_); - void gates_reduction(int n_gates, int dic, int wic, int batch, - const float *ws_gates_, float *diff_bias_); - pd_t conf_; - bool use_workspace_; - scratchpad_t *scratchpad_; - - size_t ws_gates_offset_; - size_t ws_states_offset_; - size_t ws_weights_layer_offset_; - size_t ws_weights_iter_offset_; - size_t ws_diff_states_offset_; - size_t ws_diff_weights_layer_offset_; - size_t ws_diff_weights_iter_offset_; - size_t ws_grid_comp_offset_; - size_t ws_cell_comp_offset_; - - float *ws_gates_; - float *ws_states_; - float *ws_diff_states_; - float *ws_cell_; - float *ws_grid_; - float *ws_weights_layer_; - float *ws_weights_iter_; - float *ws_diff_weights_layer_; - float *ws_diff_weights_iter_; - int n_output_features; - - float **ptr_wei_input_; - float **ptr_wei_state_; - - execution_direction exec_dir; - grid_execution_f grid_computation; - cell_execution_f cell_func; - - bool copy_weights_layer_; - bool copy_weights_iter_; - bool copy_diff_weights_layer_; - bool copy_diff_weights_iter_; - bool merge_gemm_layer; - bool merge_gemm_iter; - bool use_jit_sgemm_; - - packing_t weights_input_pack_func; - packing_t weights_state_pack_func; - - gemm_t gemm_input_func; - gemm_t gemm_state_func; - elemwise_f elemwise_func; - - free_packed_t weights_input_free_packed_func; - free_packed_t weights_state_free_packed_func; -}; - -using ref_rnn_fwd_t = _ref_rnn_common_t; -using ref_rnn_bwd_t = _ref_rnn_common_t; -} -} -} -#endif - -// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git 
a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_roi_pooling.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_roi_pooling.cpp index 2d8188d..e8806cb 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_roi_pooling.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_roi_pooling.cpp @@ -31,20 +31,20 @@ namespace impl { namespace cpu { template -void ref_roi_pooling_fwd_t::execute_forward_generic() { +void ref_roi_pooling_fwd_t::execute_forward_generic() const { int roi_idx = 1; int data_idx = 0; - const memory_desc_wrapper dst_d(conf_.dst_pd()); - memory_desc_wrapper src_data_d = conf_.src_pd(data_idx); - memory_desc_wrapper src_roi_d = conf_.src_pd(roi_idx); + const memory_desc_wrapper dst_d(pd()->dst_pd()); + memory_desc_wrapper src_data_d = pd()->src_pd(data_idx); + memory_desc_wrapper src_roi_d = pd()->src_pd(roi_idx); if (src_roi_d.dims()[0] < src_data_d.dims()[0]) { roi_idx = 0; data_idx = 1; - src_data_d = conf_.src_pd(data_idx); - src_roi_d = conf_.src_pd(roi_idx); + src_data_d = pd()->src_pd(data_idx); + src_roi_d = pd()->src_pd(roi_idx); } auto dst = reinterpret_cast(this->memory(0)); @@ -57,9 +57,9 @@ void ref_roi_pooling_fwd_t::execute_forward_generic() { int ROIS = src_roi_d.dims()[0]; - double spatial_scale = conf_.spatialScale(); - int pooled_h = conf_.pooledH(); - int pooled_w = conf_.pooledW(); + double spatial_scale = pd()->spatialScale(); + int pooled_h = pd()->pooledH(); + int pooled_w = pd()->pooledW(); for (size_t i = 0; i < dst_d.size() / sizeof(data_t); i++) { dst[i] = -FLT_MAX; @@ -94,7 +94,7 @@ void ref_roi_pooling_fwd_t::execute_forward_generic() { const data_t* src_roi_ptr = &src_roi[roi_off]; int roi_batch_ind = src_roi_ptr[0]; - if (conf_.desc()->alg_kind == mkldnn_roi_pooling_max) { + if (pd()->desc()->alg_kind == mkldnn_roi_pooling_max) { int roi_start_w = round(src_roi_ptr[1] * spatial_scale); int roi_start_h = round(src_roi_ptr[2] * spatial_scale); int roi_end_w = round(src_roi_ptr[3] * spatial_scale); @@ -152,7 +152,7 @@ void ref_roi_pooling_fwd_t::execute_forward_generic() { } } } - } else if (conf_.desc()->alg_kind == mkldnn_roi_pooling_bilinear) { + } else if (pd()->desc()->alg_kind == mkldnn_roi_pooling_bilinear) { float roi_start_w_ = src_roi_ptr[1]; float roi_start_h_ = src_roi_ptr[2]; float roi_end_w_ = src_roi_ptr[3]; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_roi_pooling.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_roi_pooling.hpp index 5bcc56a..afb6661 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_roi_pooling.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_roi_pooling.hpp @@ -52,22 +52,22 @@ struct ref_roi_pooling_fwd_t: public cpu_primitive_t { } }; - ref_roi_pooling_fwd_t(const pd_t *pd, const input_vector &inputs, - const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) { } + ref_roi_pooling_fwd_t(const pd_t *apd, const input_vector &inputs, + const output_vector &outputs) + : cpu_primitive_t(apd, inputs, outputs) { } typedef typename prec_traits::type data_t; ~ref_roi_pooling_fwd_t() { } - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute_forward_generic(); e->set_state(event_t::ready); } private: - void execute_forward_generic(); - pd_t conf_; + void execute_forward_generic() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_shuffle.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_shuffle.cpp index 
42234e9..89eb24e 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_shuffle.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_shuffle.cpp @@ -31,27 +31,27 @@ using namespace memory_format; template template -void ref_shuffle_t::execute_() { +void ref_shuffle_t::execute_() const { using namespace prop_kind; using namespace utils; - const memory_desc_wrapper data_d(conf_.data_pd()); + const memory_desc_wrapper data_d(pd()->data_pd()); auto input = reinterpret_cast(this->input_memory(0)); auto output = reinterpret_cast(this->memory(0)); - const int axis = conf_.axis(); - const int axis_size = conf_.axis_size(); + const int axis = pd()->axis(); + const int axis_size = pd()->axis_size(); - const int MB = conf_.MB(); - const int C = conf_.C(); + const int MB = pd()->MB(); + const int C = pd()->C(); int H = 1, W = 1, D = 1, HW = 1, SP = 1; const bool has_spatial = utils::one_of(data_d.ndims(), 3, 4 ,5); if (has_spatial) { - D = conf_.D(); - H = conf_.H(); - W = conf_.W(); + D = pd()->D(); + H = pd()->H(); + W = pd()->W(); HW = H * W; SP = D * HW; } @@ -107,8 +107,8 @@ void ref_shuffle_t::execute_() { } }); } else { - auto dims = conf_.desc()->data_desc.dims; - auto ndims = conf_.desc()->data_desc.ndims; + auto dims = pd()->desc()->data_desc.dims; + auto ndims = pd()->desc()->data_desc.ndims; const size_t outer_size = utils::array_product(dims, axis); const size_t inner_size = utils::array_product(dims + axis + 1, ndims - axis - 1); @@ -124,25 +124,25 @@ void ref_shuffle_t::execute_() { } } -template void ref_shuffle_t<4>::execute_(); -template void ref_shuffle_t<4>::execute_(); -template void ref_shuffle_t<4>::execute_(); -template void ref_shuffle_t<4>::execute_(); -template void ref_shuffle_t<4>::execute_(); -template void ref_shuffle_t<4>::execute_(); -template void ref_shuffle_t<4>::execute_(); -template void ref_shuffle_t<4>::execute_(); -template void ref_shuffle_t<4>::execute_(); - -template void ref_shuffle_t<1>::execute_(); -template void ref_shuffle_t<1>::execute_(); -template void ref_shuffle_t<1>::execute_(); -template void ref_shuffle_t<1>::execute_(); -template void ref_shuffle_t<1>::execute_(); -template void ref_shuffle_t<1>::execute_(); -template void ref_shuffle_t<1>::execute_(); -template void ref_shuffle_t<1>::execute_(); -template void ref_shuffle_t<1>::execute_(); +template void ref_shuffle_t<4>::execute_() const; +template void ref_shuffle_t<4>::execute_() const; +template void ref_shuffle_t<4>::execute_() const; +template void ref_shuffle_t<4>::execute_() const; +template void ref_shuffle_t<4>::execute_() const; +template void ref_shuffle_t<4>::execute_() const; +template void ref_shuffle_t<4>::execute_() const; +template void ref_shuffle_t<4>::execute_() const; +template void ref_shuffle_t<4>::execute_() const; + +template void ref_shuffle_t<1>::execute_() const; +template void ref_shuffle_t<1>::execute_() const; +template void ref_shuffle_t<1>::execute_() const; +template void ref_shuffle_t<1>::execute_() const; +template void ref_shuffle_t<1>::execute_() const; +template void ref_shuffle_t<1>::execute_() const; +template void ref_shuffle_t<1>::execute_() const; +template void ref_shuffle_t<1>::execute_() const; +template void ref_shuffle_t<1>::execute_() const; } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_shuffle.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_shuffle.hpp index 763bbaa..cd653dc 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_shuffle.hpp +++ 
b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_shuffle.hpp @@ -53,15 +53,15 @@ struct ref_shuffle_t : public cpu_primitive_t { } }; - ref_shuffle_t(const pd_t *pd, const input_vector &inputs, + ref_shuffle_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) + : cpu_primitive_t(apd, inputs, outputs) { - const int axis_size = conf_.axis_size(); - const int group_size = conf_.group_size(); - const int transpose_row = conf_.is_fwd() ? group_size + const int axis_size = pd()->axis_size(); + const int group_size = pd()->group_size(); + const int transpose_row = pd()->is_fwd() ? group_size : axis_size / group_size; - const int transpose_col = conf_.is_fwd() ? axis_size / group_size + const int transpose_col = pd()->is_fwd() ? axis_size / group_size : group_size; rev_transposed_ = (int *)malloc(axis_size * sizeof(int), 64); parallel_nd(transpose_col, transpose_row, [&](int i, int j) { @@ -73,9 +73,9 @@ struct ref_shuffle_t : public cpu_primitive_t { typedef typename typesize_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { using namespace memory_format; - switch (conf_.data_pd()->desc()->format) { + switch (pd()->data_pd()->desc()->format) { case nCdhw16c: execute_(); break; case nChw16c: execute_(); break; case nCdhw8c: execute_(); break; @@ -91,8 +91,8 @@ struct ref_shuffle_t : public cpu_primitive_t { } private: - templatevoid execute_(); - pd_t conf_; + templatevoid execute_() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } int *rev_transposed_; }; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_softmax.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_softmax.cpp index a65632f..30b3299 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_softmax.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_softmax.cpp @@ -23,10 +23,10 @@ #include "mkldnn_thread.hpp" #include "ref_softmax.hpp" +#include "gemm/os_blas.hpp" #ifdef USE_MKL #include "mkl_vml_functions.h" -#include "mkl_cblas.h" #endif namespace mkldnn { @@ -34,11 +34,11 @@ namespace impl { namespace cpu { template -void ref_softmax_fwd_t::execute_forward_dense() { +void ref_softmax_fwd_t::execute_forward_dense() const { auto src = reinterpret_cast(this->input_memory(0)); auto dst = reinterpret_cast(this->memory(0)); - outer_size_ = utils::array_product(conf_.src_pd()->desc()->dims, conf_.desc()->softmax_axis); + int outer_size_ = utils::array_product(pd()->src_pd()->desc()->dims, pd()->desc()->softmax_axis); if (outer_size_ == 1) { for (int ou = 0; ou < outer_size_; ou++) { @@ -68,60 +68,112 @@ void ref_softmax_fwd_t::execute_forward_dense() { } template -void ref_softmax_fwd_t::execute_forward_generic() { +void ref_softmax_fwd_t::execute_forward_generic() const { auto src = reinterpret_cast(this->input_memory(0)); auto dst = reinterpret_cast(this->memory(0)); - const memory_desc_wrapper data_d(conf_.src_pd()); + data_t space_max_val = 0, space_denom_val = 0; + data_t *space_max = &space_max_val, *space_denom = &space_denom_val; + if (inner_size_ > 1) { + using namespace memory_tracking::names; + space_max = scratchpad().template get(key_softmax_reduction); + space_denom = space_max + inner_size_; + } + + const memory_desc_wrapper data_d(pd()->src_pd()); const size_t dim = channels_ * inner_size_; - outer_size_ = utils::array_product(conf_.src_pd()->desc()->dims, conf_.desc()->softmax_axis); + int outer_size_ = 
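The recurring change in ref_roi_pooling, ref_shuffle and (below) ref_softmax replaces the by-value pd_t conf_ copy with a downcasting pd() accessor over the descriptor the base primitive already holds, which removes a copy and lets execute() and its helpers become const. The shape of that refactor in isolation (class names hypothetical):

    struct primitive_t {
        explicit primitive_t(const void *apd) : pd_(apd) {}
        const void *pd() const { return pd_; }
        virtual void execute() const = 0;
        virtual ~primitive_t() {}
    private:
        const void *pd_; // descriptor owned by the framework, not copied
    };

    struct ref_op_t : public primitive_t {
        struct pd_t { int axis; };

        explicit ref_op_t(const pd_t *apd) : primitive_t(apd) {}

        // Before: a pd_t conf_ member copied in the constructor.
        // After: a const accessor that downcasts the shared descriptor.
        const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); }

        virtual void execute() const { (void)pd()->axis; }
    };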
utils::array_product(pd()->src_pd()->desc()->dims, pd()->desc()->softmax_axis); for (int ou = 0; ou < outer_size_; ou++) { - utils::array_set(max_, -FLT_MAX, inner_size_); - utils::array_set(denom_, 0, inner_size_); + utils::array_set(space_max, -FLT_MAX, inner_size_); + utils::array_set(space_denom, 0, inner_size_); for (int c = 0; c < channels_; c++) { for(int in = 0; in < inner_size_; in++) { size_t off = data_d.off_l(ou * dim + c * inner_size_ + in); - max_[in] = nstl::max(max_[in], src[off]); + space_max[in] = nstl::max(space_max[in], src[off]); } } for (int c = 0; c < channels_; c++) { for(int in = 0; in < inner_size_; in++) { size_t off = data_d.off_l(ou * dim + c * inner_size_ + in); - denom_[in] += dst[off] = exp(src[off] - max_[in]); + space_denom[in] += dst[off] = exp(src[off] - space_max[in]); } } for (int c = 0; c < channels_; c++) { for (int in = 0; in < inner_size_; in++) { size_t off = data_d.off_l(ou * dim + c * inner_size_ + in); - dst[off] /= denom_[in]; + dst[off] /= space_denom[in]; } } } } - template void ref_softmax_fwd_t::_max(int n, const data_t *x, - data_t *max_data) { + data_t *max_data) const { +// Intel(R) C++ Compiler generates the maxps + shuffle pattern +// for the max search which works faster +#if !defined(__INTEL_COMPILER) + // The code below makes a compiler to generate maxps instruction + // rather than maxss, which is generated for the 'else' code path + auto max_wrapper = [](data_t a, data_t b) { return nstl::max(a, b); }; + auto min_wrapper = [](int a, int b) { return nstl::min(a, b); }; + + constexpr int unroll_factor = 32; + data_t max_values[unroll_factor]; + + if (n < unroll_factor) { + data_t max_val = x[0]; + for (int i = 1; i < n; i++) { + max_val = max_wrapper(max_val, x[i]); + } + max_data[0] = max_val; + return; + } + for (int i = 0; i < unroll_factor; i++) { + max_values[i] = x[i]; + } + for (int i = unroll_factor; i < n; i += unroll_factor) { + int offset = min_wrapper(i, n - unroll_factor); + for (int j = 0; j < unroll_factor; j++) { + max_values[j] = max_wrapper(max_values[j], x[offset + j]); + } + } + data_t max_val = max_values[0]; + for (int i = 1; i < unroll_factor; i++) { + max_val = max_wrapper(max_val, max_values[i]); + } + max_data[0] = max_val; +#else max_data[0] = x[0]; for (int c = 1; c < n; ++c) max_data[0] = nstl::max(max_data[0], x[c]); +#endif } template void ref_softmax_fwd_t::_sub(int n, data_t alpha, const data_t *x, - data_t *y) { - for (int c = 0; c < n; ++c) - y[c] = x[c] - alpha; + data_t *y) const { + constexpr int unroll_factor = 32; + int tail = n % unroll_factor; + for (int i = 0; i < n - tail; i += unroll_factor) { + PRAGMA_OMP_SIMD() + for (int j = 0; j < unroll_factor; j++) { + y[i + j] = x[i + j] - alpha; + } + } + PRAGMA_OMP_SIMD() + for (int i = n - tail; i < n; i++) { + y[i] = x[i] - alpha; + } } template -void ref_softmax_fwd_t::_exp_parallel(int n, const data_t *a, data_t *r) { +void ref_softmax_fwd_t::_exp_parallel(int n, const data_t *a, data_t *r) const { #ifdef USE_MKL if (data_type == data_type::f32) { vsExp(n, a, r); @@ -132,22 +184,32 @@ void ref_softmax_fwd_t::_exp_parallel(int n, const data_t *a, data_t } template -void ref_softmax_fwd_t::_exp(int n, const data_t *a, data_t *r) { +void ref_softmax_fwd_t::_exp(int n, const data_t *a, data_t *r) const { for (int c = 0; c < n; c++) r[c] = expf(a[c]); } template void ref_softmax_fwd_t::_sum(int n, const data_t *x, - data_t *sum_data) { - sum_data[0] = 0; + data_t *sum_data) const { +#ifdef USE_CBLAS + // Here we are summing x's eg. 
e^z, which are positive,
+    // so we can use BLAS ASUM
+    if (data_type == data_type::f32) {
+        sum_data[0] = cblas_sasum(n, x, 1);
+        return;
+    }
+#endif
+    data_t tsum = static_cast<data_t>(0);
+    PRAGMA_OMP_SIMD(reduction(+ : tsum))
     for (int c = 0; c < n; ++c)
-        sum_data[0] += x[c];
+        tsum += x[c];
+    sum_data[0] = tsum;
 }
 
 template <impl::data_type_t data_type>
-void ref_softmax_fwd_t<data_type>::_scal_parallel(int n, data_t alpha, data_t *x) {
-#ifdef USE_MKL
+void ref_softmax_fwd_t<data_type>::_scal_parallel(int n, data_t alpha, data_t *x) const {
+#ifdef USE_CBLAS
     if (data_type == data_type::f32) {
         cblas_sscal(n, alpha, x, 1);
         return;
@@ -157,7 +219,7 @@ void ref_softmax_fwd_t<data_type>::_scal_parallel(int n, data_t alpha, data_t *x
 }
 
 template <impl::data_type_t data_type>
-void ref_softmax_fwd_t<data_type>::_scal(int n, data_t alpha, data_t *x) {
+void ref_softmax_fwd_t<data_type>::_scal(int n, data_t alpha, data_t *x) const {
     for (int c = 0; c < n; c++)
         x[c] *= alpha;
 }
@@ -167,7 +229,7 @@ template struct ref_softmax_fwd_t<data_type::f32>;
 
 // NC/NCHW softmax along the final axis (1 for NC, 3 for NCHW)
 template <impl::data_type_t data_type>
-void ref_softmax_bwd_t<data_type>::execute_backward_dense() {
+void ref_softmax_bwd_t<data_type>::execute_backward_dense() const {
     auto data = reinterpret_cast<const data_t *>(this->input_memory(0));
     auto diff_dst = reinterpret_cast<const data_t *>(this->input_memory(1));
     auto diff_src = reinterpret_cast<data_t *>(this->memory(0));
@@ -190,13 +252,13 @@ void ref_softmax_bwd_t<data_type>::execute_backward_dense() {
 }
 
 template <impl::data_type_t data_type>
-void ref_softmax_bwd_t<data_type>::execute_backward_generic() {
+void ref_softmax_bwd_t<data_type>::execute_backward_generic() const {
     const size_t dim = channels_ * inner_size_;
     auto data = reinterpret_cast<const data_t *>(this->input_memory(0));
     auto diff_dst = reinterpret_cast<const data_t *>(this->input_memory(1));
     auto diff_src = reinterpret_cast<data_t *>(this->memory(0));
-    const memory_desc_wrapper diff_d(conf_.diff_src_pd());
-    const memory_desc_wrapper data_d(conf_.dst_pd());
+    const memory_desc_wrapper diff_d(pd()->diff_src_pd());
+    const memory_desc_wrapper data_d(pd()->dst_pd());
 
     parallel_nd(outer_size_, [&](int ou) {
         for (int in = 0; in < inner_size_; in++) {
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_softmax.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_softmax.hpp
index c82f5b2..8023785 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_softmax.hpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_softmax.hpp
@@ -14,17 +14,18 @@
  * limitations under the License.
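The unroll_factor = 32 trick in _max above keeps 32 independent partial maxima so that compilers other than icc emit packed maxps instructions instead of a serial dependence chain of maxss; the same structure applies to any associative reduction. A standalone sketch of the idea applied to a sum (handles n smaller than the unroll width via the scalar tail):

    #include <cstddef>

    // Sum with a fixed unroll so the compiler can vectorize the
    // accumulator array instead of chaining one scalar accumulator.
    float unrolled_sum(const float *x, size_t n) {
        const size_t UNROLL = 32;
        float acc[UNROLL];
        for (size_t j = 0; j < UNROLL; j++) acc[j] = 0.0f;
        size_t i = 0;
        for (; i + UNROLL <= n; i += UNROLL)
            for (size_t j = 0; j < UNROLL; j++) acc[j] += x[i + j];
        float s = 0.0f;
        for (size_t j = 0; j < UNROLL; j++) s += acc[j];
        for (; i < n; i++) s += x[i]; // scalar tail
        return s;
    }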
*******************************************************************************/ -#ifndef CPU_REF_SOFTMAX_FWD_HPP -#define CPU_REF_SOFTMAX_FWD_HPP +#ifndef CPU_REF_SOFTMAX_HPP +#define CPU_REF_SOFTMAX_HPP #include #include "c_types_map.hpp" -#include "cpu_softmax_pd.hpp" -#include "cpu_engine.hpp" +#include "memory_tracking.hpp" #include "type_helpers.hpp" #include "utils.hpp" +#include "cpu_softmax_pd.hpp" + namespace mkldnn { namespace impl { namespace cpu { @@ -49,63 +50,68 @@ struct ref_softmax_fwd_t: public cpu_primitive_t { && attr()->has_default_values(); if (!ok) return status::unimplemented; + init_scratchpad(); + return status::success; } + + private: + void init_scratchpad() { + const int inner_size = utils::array_product( + desc()->data_desc.dims + desc()->softmax_axis + 1, + desc()->data_desc.ndims - desc()->softmax_axis - 1); + + if (inner_size > 1) { + auto scratchpad = scratchpad_registry().registrar(); + scratchpad.book(memory_tracking::names::key_softmax_reduction, + sizeof(data_t) * 2 * inner_size); + } + } }; - ref_softmax_fwd_t(const pd_t *pd, const input_vector &inputs, + ref_softmax_fwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), ws_(nullptr) { - auto ndims = conf_.desc()->data_desc.ndims; - auto dims = conf_.desc()->data_desc.dims; - auto axis = conf_.desc()->softmax_axis; + : cpu_primitive_t(apd, inputs, outputs) + { + auto ndims = pd()->desc()->data_desc.ndims; + auto dims = pd()->desc()->data_desc.dims; + auto axis = pd()->desc()->softmax_axis; outer_size_ = utils::array_product(dims, axis); channels_ = dims[axis]; inner_size_ = utils::array_product(dims + axis + 1, ndims - axis - 1); - val_max_ = val_denom_ = 0; - - if (inner_size_ > 1) { - ws_ = new data_t[2*inner_size_]; - max_ = &ws_[0]; - denom_ = &ws_[inner_size_]; - } else { - max_ = &val_max_; - denom_ = &val_denom_; - } - const memory_desc_wrapper data_d(conf_.src_pd()); + const memory_desc_wrapper data_d(pd()->src_pd()); use_dense_ = inner_size_ == 1 && data_d.is_dense() && data_d.blocking_desc().block_dims[axis] == 1 && data_d.blocking_desc().strides[0][axis] == 1; } - ~ref_softmax_fwd_t() { if (ws_) delete [] ws_; } + ~ref_softmax_fwd_t() {} + typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { if (use_dense_) execute_forward_dense(); else execute_forward_generic(); e->set_state(event_t::ready); } private: - void execute_forward_dense(); - void execute_forward_generic(); + void execute_forward_dense() const; + void execute_forward_generic() const; - void _max(int n, const data_t *x, data_t *max_data); - void _sub(int n, data_t alpha, const data_t *x, data_t *y); - void _exp(int n, const data_t *a, data_t *r); - void _exp_parallel(int n, const data_t *a, data_t *r); - void _sum(int n, const data_t *x, data_t *sum_data); - void _scal(int n, data_t alpha, data_t *x); - void _scal_parallel(int n, data_t alpha, data_t *x); + void _max(int n, const data_t *x, data_t *max_data) const; + void _sub(int n, data_t alpha, const data_t *x, data_t *y) const; + void _exp(int n, const data_t *a, data_t *r) const; + void _exp_parallel(int n, const data_t *a, data_t *r) const; + void _sum(int n, const data_t *x, data_t *sum_data) const; + void _scal(int n, data_t alpha, data_t *x) const; + void _scal_parallel(int n, data_t alpha, data_t *x) const; - pd_t conf_; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } bool use_dense_; int 
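init_scratchpad above is the new-style replacement for the deleted new data_t[2*inner_size_] member buffer: the pd books a keyed region at init time and execute() fetches it from a registry-backed allocation, so the primitive itself stays stateless and const. A reduced model of that book/get protocol (the registry API is simplified here; it is not mkl-dnn's real memory_tracking interface):

    #include <cstddef>
    #include <map>
    #include <vector>

    // Minimal stand-in for a scratchpad registrar/grabber pair.
    struct registry_t {
        std::map<int, std::pair<size_t, size_t> > books; // key -> (offset, bytes)
        size_t size = 0;
        void book(int key, size_t bytes) {
            books[key] = std::make_pair(size, bytes);
            size += (bytes + 63) / 64 * 64; // keep regions cache-line aligned
        }
    };

    enum { key_softmax_reduction = 1 };

    int main() {
        const size_t inner_size = 8;
        registry_t reg;
        // pd::init time: declare what execute() will need.
        reg.book(key_softmax_reduction, sizeof(float) * 2 * inner_size);

        // execute time: one allocation serves every booked region.
        std::vector<char> scratch(reg.size);
        float *space_max = (float *)(scratch.data()
                + reg.books[key_softmax_reduction].first);
        float *space_denom = space_max + inner_size;
        (void)space_denom;
    }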
outer_size_, channels_, inner_size_; - data_t val_max_, val_denom_; - data_t *ws_, *max_, *denom_; }; template @@ -132,20 +138,20 @@ struct ref_softmax_bwd_t: public cpu_primitive_t { } }; - ref_softmax_bwd_t(const pd_t *pd, const input_vector &inputs, + ref_softmax_bwd_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) { - auto dims = conf_.desc()->diff_desc.dims; - auto axis = conf_.desc()->softmax_axis; - auto ndims = conf_.desc()->diff_desc.ndims; + : cpu_primitive_t(apd, inputs, outputs) { + auto dims = pd()->desc()->diff_desc.dims; + auto axis = pd()->desc()->softmax_axis; + auto ndims = pd()->desc()->diff_desc.ndims; outer_size_ = utils::array_product(dims, axis); channels_ = dims[axis]; inner_size_ = utils::array_product(dims + axis + 1, ndims - axis - 1); // Diff desc as well as data desc whould be checked - const memory_desc_wrapper data_d(conf_.dst_pd()); - const memory_desc_wrapper diff_d(conf_.diff_dst_pd()); + const memory_desc_wrapper data_d(pd()->dst_pd()); + const memory_desc_wrapper diff_d(pd()->diff_dst_pd()); use_dense_ = true && inner_size_ == 1 && diff_d == data_d @@ -154,23 +160,22 @@ struct ref_softmax_bwd_t: public cpu_primitive_t { && diff_d.blocking_desc().strides[0][axis] == 1; } ~ref_softmax_bwd_t() {} + typedef typename prec_traits::type data_t; - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { if (use_dense_) execute_backward_dense(); else execute_backward_generic(); e->set_state(event_t::ready); } private: - void execute_backward_dense(); - void execute_backward_generic(); + void execute_backward_dense() const; + void execute_backward_generic() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } - pd_t conf_; bool use_dense_; int outer_size_, channels_, inner_size_; - data_t val_max_, val_denom_; - data_t *max_, *denom_; }; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_sum.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_sum.hpp index 4fd9bad..17e0bde 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_sum.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_sum.hpp @@ -78,8 +78,8 @@ struct ref_sum_t: public cpu_primitive_t { } return ret; } - virtual pd_t *clone() const override { return nullptr; /* FIXME */ } - virtual const char *name() const override { return "ref:any"; } + virtual pd_t *clone() const override { return new pd_t(*this); } + virtual const char *name() const override { return "ref:any"; } virtual status_t init() override { bool ok = cpu_sum_pd_t::init() == success; @@ -109,9 +109,9 @@ struct ref_sum_t: public cpu_primitive_t { nstl::vector reorder_pds_; }; - ref_sum_t(const pd_t *conf, const input_vector &inputs, + ref_sum_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs, nstl::vector reorders) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*conf), + : cpu_primitive_t(apd, inputs, outputs), reorders_(reorders) {} ~ref_sum_t() { @@ -120,7 +120,7 @@ struct ref_sum_t: public cpu_primitive_t { delete reorders_[i]; } - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { const auto n = reorders_.size(); for (size_t i = 0; i < n; ++i) { event_t ei; @@ -130,7 +130,7 @@ struct ref_sum_t: public cpu_primitive_t { } private: - pd_t conf_; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } nstl::vector reorders_; }; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_common.cpp 
b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_common.cpp new file mode 100644 index 0000000..537084d --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_common.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* + * Common for RNN and LSTM cell execution + */ +#include "ref_rnn.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { +using namespace rnn_utils; + +template +rnn_cell_execution_sig( + (_ref_rnn_common_t::cell_execution)) { + if (!rnn.merge_gemm_layer) { + (this->*gemm_layer_func)('N', 'N', rnn.n_gates * rnn.dic, rnn.mb, + rnn.slc, 1.0, w_layer_[0], rnn.weights_layer_ld, + states_t_lm1_, rnn.states_ws_ld, 0.0, ws_gates_, + rnn.gates_ws_ld); + } + (this->*gemm_iter_func)('N', 'N', rnn.n_gates * rnn.dic, rnn.mb, rnn.sic, + 1.0, w_iter_[0], rnn.weights_iter_ld, states_tm1_l_, + rnn.states_ws_ld, 1.0, ws_gates_, rnn.gates_ws_ld); + + if (rnn_postgemm_ != nullptr) + rnn_postgemm_->execute(rnn, ws_gates_, states_t_l_, c_states_t_l_, + states_tm1_l_, c_states_tm1_l_, diff_states_t_l_, + diff_states_t_lp1_, diff_states_tp1_l_, bias_[0], ws_grid_, + ws_cell_); + else + (this->*elemwise_func)(rnn, ws_gates_, states_t_l_, c_states_t_l_, + states_tm1_l_, c_states_tm1_l_, diff_states_t_l_, + diff_states_t_lp1_, diff_states_tp1_l_, bias_[0], ws_grid_, + ws_cell_); +} +template rnn_cell_execution_sig(ref_rnn_fwd_f32_t::cell_execution); +template rnn_cell_execution_sig(ref_rnn_fwd_u8s8_t::cell_execution); + +template <> +rnn_cell_execution_sig(ref_rnn_bwd_f32_t::cell_execution) { + ws_diff_states_aoc_t diff_states_t_l(rnn, diff_states_t_l_); + (this->*elemwise_func)(rnn, ws_gates_, states_t_l_, c_states_t_l_, + states_tm1_l_, c_states_tm1_l_, diff_states_t_l_, + diff_states_t_lp1_, diff_states_tp1_l_, bias_[0], ws_grid_, + ws_cell_); + + /// bwd by data on the cell + (this->*gemm_iter_func)('N', 'N', rnn.sic, rnn.mb, rnn.n_gates * rnn.dic, + 1.0, w_iter_[0], rnn.weights_iter_ld, ws_gates_, rnn.gates_ws_ld, + 0.0, diff_states_t_l_, rnn.states_ws_ld); + + if (!rnn.merge_gemm_layer) { + (this->*gemm_layer_func)('N', 'N', rnn.slc, rnn.mb, + rnn.n_gates * rnn.dic, 1.0, w_layer_[0], + rnn.weights_layer_ld, ws_gates_, rnn.gates_ws_ld, 0.0, + &diff_states_t_l(rnn.n_states, 0, 0), rnn.states_ws_ld); + + /// bwd by weights on the cell + gemm('N', 'T', rnn.n_gates * rnn.dic, rnn.slc, rnn.mb, 1.0, ws_gates_, + rnn.gates_ws_ld, states_t_lm1_, rnn.states_ws_ld, 1.0, + diff_w_layer_, rnn.diff_weights_layer_ld); + } + + if (!rnn.merge_gemm_iter) + gemm('N', 'T', rnn.n_gates * rnn.dic, rnn.sic, rnn.mb, 1.0, ws_gates_, + rnn.gates_ws_ld, states_tm1_l_, rnn.states_ws_ld, 1.0, + diff_w_iter_, rnn.diff_weights_iter_ld); + + /// bwd by bias we just accumulate diffs from the gates + gates_reduction(rnn, ws_gates_, diff_bias_); +} + +} +} +} diff --git 
a/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_gru.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_gru.cpp new file mode 100644 index 0000000..e1a61d4 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_gru.cpp @@ -0,0 +1,180 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* + * Cell execution GRU + */ + +#include "math_utils.hpp" +#include "mkldnn_thread.hpp" + +#include "ref_rnn.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +using namespace mkldnn::impl::utils; +using namespace mkldnn::impl::math; +using namespace rnn_utils; + +#define AOC array_offset_calculator +template <> +rnn_cell_execution_sig(ref_rnn_fwd_f32_t::cell_execution_gru) { + ws_gates_aoc_t ws_gates(rnn, ws_gates_); + bias_aoc_t bias(rnn, bias_[0]); + ws_states_aoc_t states_t_l(rnn, states_t_l_); + ws_states_aoc_t states_tm1_l(rnn, states_tm1_l_); + + // 1. gemm Wx[0-2],x + if (!rnn.merge_gemm_layer) { + (this->*gemm_layer_func)('N', 'N', rnn.n_gates * rnn.dic, rnn.mb, + rnn.slc, 1.0, w_layer_[0], rnn.weights_layer_ld, + states_t_lm1_, rnn.states_ws_ld, 0.0, ws_gates_, + rnn.gates_ws_ld); + } + + // 2. gemm Wh[0-1],h + (this->*gemm_iter_func)('N', 'N', (rnn.n_gates - 1) * rnn.dic, rnn.mb, + rnn.sic, 1.0, w_iter_[0], rnn.weights_iter_ld, states_tm1_l_, + rnn.states_ws_ld, 1.0, ws_gates_, rnn.gates_ws_ld); + + // 3. activation zt and rt + elemwise multiplication rt,ht-1 + parallel_nd(rnn.mb, [&](int i) { + PRAGMA_OMP_SIMD() + for (int j = 0; j < rnn.dic; j++) { + ws_gates(i, 0, j) = logistic_fwd(ws_gates(i, 0, j) + bias(0, j)); + ws_gates(i, 1, j) = logistic_fwd(ws_gates(i, 1, j) + bias(1, j)); + states_t_l(i, j) = states_tm1_l(i, j) * ws_gates(i, 1, j); + } + }); + + // 4. gemm Wh[2],h~t + (this->*gemm_iter_func)('N', 'N', rnn.dic, rnn.mb, rnn.sic, 1.0, w_iter_[1], + rnn.weights_iter_ld, states_t_l_, rnn.states_ws_ld, 1.0, + &(ws_gates(0, 2, 0)), rnn.gates_ws_ld); + + // 5. 
activation h~t + calculate ht + parallel_nd(rnn.mb, [&](int i) { + PRAGMA_OMP_SIMD() + for (int j = 0; j < rnn.dic; j++) { + ws_gates(i, 2, j) = tanh_fwd(ws_gates(i, 2, j) + bias(2, j)); + states_t_l(i, j) = states_tm1_l(i, j) * ws_gates(i, 0, j) + + (1.0f - ws_gates(i, 0, j)) * ws_gates(i, 2, j); + } + }); +} + +template <> +rnn_cell_execution_sig(ref_rnn_fwd_u8s8_t::cell_execution_gru) { + assert(!"GRU int8 is not supported"); +} + +template <> +rnn_cell_execution_sig(ref_rnn_bwd_f32_t::cell_execution_gru) { + ws_gates_aoc_t ws_gates(rnn, ws_gates_); + ws_states_aoc_t states_t_l(rnn, states_t_l_); + ws_states_aoc_t states_tm1_l(rnn, states_tm1_l_); + ws_diff_w_iter_aoc_t diff_w_iter(rnn, diff_w_iter_); + ws_diff_states_aoc_t diff_states_t_l(rnn, diff_states_t_l_); + ws_diff_states_aoc_t diff_states_tp1_l(rnn, diff_states_tp1_l_); + ws_diff_states_aoc_t diff_states_t_lp1(rnn, diff_states_t_lp1_); + + // use state memory for intermediate computations + // TODO: use cell ws for that + float *dhG1_ = &(diff_states_t_l(rnn.n_states, 0, 0)); + float *hG1_ = dhG1_; + AOC dhG1(dhG1_, rnn.states_nld, rnn.states_ws_ld); + AOC hG1(hG1_, rnn.states_nld, rnn.states_ws_ld); + + // 1. calculate dG2, dG1, and part of dht-1 + // dG2^ = dh * (1 - G0) * (1 - G2^2) + // dG0^ = dh * (ht-1 - G2) * u * (1 - G0) + // dht-1 (part) = dh * G0 + parallel_nd(rnn.mb, [&](int i) { + PRAGMA_OMP_SIMD() + for (int j = 0; j < rnn.dic; j++) { + float h = states_tm1_l(i, j); + float dHt = diff_states_tp1_l(0, i, j) + + diff_states_t_lp1(rnn.n_states, i, j); + float dG2 = (1.0f - ws_gates(i, 0, j)) * dHt + * one_m_square(ws_gates(i, 2, j)); + float dG0 = (h - ws_gates(i, 2, j)) * dHt + * x_m_square(ws_gates(i, 0, j)); + + diff_states_t_l(0, i, j) = dHt * ws_gates(i, 0, j); + ws_gates(i, 0, j) = dG0; + ws_gates(i, 2, j) = dG2; + } + }); + + // 2. calculate intermediate d(hG1) + // d(hG1) = dG2 * W2h^t + (this->*gemm_iter_func)('N', 'N', rnn.sic, rnn.mb, rnn.dic, 1.0, w_iter_[1], + rnn.weights_iter_ld, &(ws_gates(0, 2, 0)), rnn.gates_ws_ld, 0.0, + dhG1_, rnn.states_ws_ld); + + // 3. calculate dG1^ and part of dht-1 + // dG1^ = d(hG1) * h * G1 * (1 - G1) + // dht-1 (part) += d(hG1) * G1 + // h * G1 (required for dWh) + parallel_nd(rnn.mb, [&](int i) { + PRAGMA_OMP_SIMD() + for (int j = 0; j < rnn.dic; j++) { + float h = states_tm1_l(i, j); + float G1 = ws_gates(i, 1, j); + diff_states_t_l(0, i, j) += dhG1(i, j) * G1; + ws_gates(i, 1, j) = dhG1(i, j) * h * x_m_square(G1); + hG1(i, j) = G1 * h; + } + }); + + // 4. calculate diff weights + // dWh1 += dG1 * h, dWh2 += dG2 * h, dWh3 += dG3 * (G1(*)h) + gemm('N', 'T', (rnn.n_gates - 1) * rnn.dic, rnn.sic, rnn.mb, 1.0, ws_gates_, + rnn.gates_ws_ld, states_tm1_l_, rnn.states_ws_ld, 1.0, diff_w_iter_, + rnn.diff_weights_iter_ld); + gemm('N', 'T', rnn.dic, rnn.sic, rnn.mb, 1.0, &(ws_gates(0, 2, 0)), + rnn.gates_ws_ld, hG1_, rnn.states_ws_ld, 1.0, + &(diff_w_iter(0, 2, 0)), rnn.diff_weights_iter_ld); + + // 5. 
calculate diff states + // dht-1 += dG1 * W1h + dG0 * W0h + (this->*gemm_iter_func)('N', 'N', rnn.sic, rnn.mb, + (rnn.n_gates - 1) * rnn.dic, 1.0, w_iter_[0], + rnn.weights_iter_ld, ws_gates_, rnn.gates_ws_ld, 1.0, + diff_states_t_l_, rnn.states_ws_ld); + + if (!rnn.merge_gemm_layer) { + // dWx += [dG0 dG1 dG2] * [x] + gemm('N', 'T', rnn.n_gates * rnn.dic, rnn.slc, rnn.mb, 1.0, ws_gates_, + rnn.gates_ws_ld, states_t_lm1_, rnn.states_ws_ld, 1.0, + diff_w_layer_, rnn.diff_weights_layer_ld); + // dx = dG2 * W2x + dG1 * W1x + dG0 * W0x + (this->*gemm_layer_func)('N', 'N', rnn.slc, rnn.mb, + rnn.n_gates * rnn.dic, 1.0, w_layer_[0], + rnn.weights_layer_ld, ws_gates_, rnn.gates_ws_ld, 0.0, + &(diff_states_t_l(rnn.n_states, 0, 0)), rnn.states_ws_ld); + } + + // 6. calculate diff bias + gates_reduction(rnn, ws_gates_, diff_bias_); +} +#undef AOC + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_gru_lbr.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_gru_lbr.cpp new file mode 100644 index 0000000..8dea8c9 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_gru_lbr.cpp @@ -0,0 +1,170 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
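Numerically, steps 1-5 of cell_execution_gru above implement the standard update/reset GRU, with the step-4 GEMM applied to the already-masked r (*) ht-1 product. As scalar arithmetic for one channel (scalar weights stand in for the W/U GEMMs; purely illustrative):

    #include <cmath>

    // One-channel GRU step matching steps 1-5 above.
    float gru_step(float x, float h_prev,
                   float wu, float uu, float bu,   // update gate params
                   float wr, float ur, float br,   // reset gate params
                   float wc, float uc, float bc) { // candidate params
        auto logistic = [](float v) { return 1.0f / (1.0f + std::exp(-v)); };
        float u = logistic(wu * x + uu * h_prev + bu);        // steps 1-3
        float r = logistic(wr * x + ur * h_prev + br);
        float c = std::tanh(wc * x + uc * (r * h_prev) + bc); // step 4 runs on r (*) h
        return h_prev * u + (1.0f - u) * c;                   // step 5
    }

In the workspace indexing above, G0 plays the role of u, G1 of r, and G2 of the candidate c.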
+*******************************************************************************/ + +/* + * Cell execution GRU with linear before reset + */ + +#include "math_utils.hpp" +#include "mkldnn_thread.hpp" + +#include "ref_rnn.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +using namespace mkldnn::impl::utils; +using namespace mkldnn::impl::math; +using namespace rnn_utils; +#define AOC array_offset_calculator + +template <> +rnn_elemwise_sig(ref_rnn_fwd_f32_t::gru_lbr_elemwise) { + ws_gates_aoc_t ws_gates(rnn, ws_gates_); + bias_aoc_t bias(rnn, bias_); + ws_states_aoc_t states_t_l(rnn, states_t_l_); + ws_states_aoc_t states_tm1_l(rnn, states_tm1_l_); + ws_gates_aoc_t ws_gemm_state(rnn, ws_cell_); + AOC ws_Wh_b(ws_grid_, rnn.mb, rnn.dic); + + parallel_nd(rnn.mb, [&](int i) { + PRAGMA_OMP_SIMD() + for (int j = 0; j < rnn.dic; j++) { + float Wh_b = ws_gemm_state(i, 2, j) + bias(3, j); + ws_gates(i, 0, j) = logistic_fwd( + ws_gates(i, 0, j) + ws_gemm_state(i, 0, j) + bias(0, j)); + ws_gates(i, 1, j) = logistic_fwd( + ws_gates(i, 1, j) + ws_gemm_state(i, 1, j) + bias(1, j)); + ws_gates(i, 2, j) = tanh_fwd( + ws_gates(i, 2, j) + ws_gates(i, 1, j) * Wh_b + bias(2, j)); + states_t_l(i, j) = states_tm1_l(i, j) * ws_gates(i, 0, j) + + (1.0f - ws_gates(i, 0, j)) * ws_gates(i, 2, j); + if (rnn.is_training) + ws_Wh_b(i, j) = Wh_b; + } + }); +} + +template <> +rnn_elemwise_sig(ref_rnn_fwd_u8s8_t::gru_lbr_elemwise) { + assert(!"GRU LBR int8 is not supported"); +} + +template <> +rnn_cell_execution_sig(ref_rnn_fwd_f32_t::cell_execution_gru_lbr) { + if (!rnn.merge_gemm_layer) { + (this->*gemm_layer_func)('N', 'N', rnn.n_gates * rnn.dic, rnn.mb, + rnn.slc, 1.0, w_layer_[0], rnn.weights_layer_ld, + states_t_lm1_, rnn.states_ws_ld, 0.0, ws_gates_, + rnn.gates_ws_ld); + } + (this->*gemm_iter_func)('N', 'N', rnn.n_gates * rnn.dic, rnn.mb, rnn.sic, + 1.0, w_iter_[0], rnn.weights_iter_ld, states_tm1_l_, + rnn.states_ws_ld, 0.0, ws_cell_, rnn.gates_ws_ld); + (this->*elemwise_func)(rnn, ws_gates_, states_t_l_, c_states_t_l_, + states_tm1_l_, c_states_tm1_l_, diff_states_t_l_, + diff_states_t_lp1_, diff_states_tp1_l_, bias_[0], ws_grid_, + ws_cell_); +} + +template <> +rnn_cell_execution_sig(ref_rnn_fwd_u8s8_t::cell_execution_gru_lbr) { + assert(!"GRU LBR int8 is not supported"); +} + +template <> +rnn_elemwise_sig(ref_rnn_bwd_f32_t::gru_lbr_elemwise) { + ws_gates_aoc_t ws_gates(rnn, ws_gates_); + ws_states_aoc_t states_tm1_l(rnn, states_tm1_l_); + ws_diff_states_aoc_t diff_states_t_l(rnn, diff_states_t_l_); + ws_diff_states_aoc_t diff_states_tp1_l(rnn, diff_states_tp1_l_); + ws_diff_states_aoc_t diff_states_t_lp1(rnn, diff_states_t_lp1_); + ws_gates_aoc_t ws_gates_r(rnn, ws_cell_); + AOC ws_Wh_b(ws_grid_, rnn.mb, rnn.dic); + + // 1. 
calculate dG0 dG1 dG2
+    // dG0 = (h - G2) * dHt * (1 - G0) * G0
+    // dG1 = (U*h + b3) * dG2 * (1 - G1) * G1
+    // dG2 = (1 - G0) * dHt * (1 - G2*G2)
+    parallel_nd(rnn.mb, [&](int i) {
+        PRAGMA_OMP_SIMD()
+        for (int j = 0; j < rnn.dic; j++) {
+            float h = states_tm1_l(i, j);
+            float dHt = diff_states_tp1_l(0, i, j)
+                    + diff_states_t_lp1(rnn.n_states, i, j);
+            float dG0 = (h - ws_gates(i, 2, j)) * dHt
+                    * x_m_square(ws_gates(i, 0, j));
+            float dG2 = (1.0f - ws_gates(i, 0, j))
+                    * one_m_square(ws_gates(i, 2, j)) * dHt;
+            float dG1 = ws_Wh_b(i, j) * dG2 * x_m_square(ws_gates(i, 1, j));
+
+            diff_states_t_l(0, i, j) = dHt * ws_gates(i, 0, j);
+            ws_gates(i, 2, j) = dG2;
+            ws_gates_r(i, 2, j) = dG2 * ws_gates(i, 1, j);
+            ws_gates(i, 0, j) = ws_gates_r(i, 0, j) = dG0;
+            ws_gates(i, 1, j) = ws_gates_r(i, 1, j) = dG1;
+        }
+    });
+}
+
+template <>
+rnn_cell_execution_sig(ref_rnn_bwd_f32_t::cell_execution_gru_lbr) {
+    ws_gates_aoc_t ws_gates_r(rnn, ws_cell_);
+    ws_diff_states_aoc_t diff_states_t_l(rnn, diff_states_t_l_);
+
+    (this->*elemwise_func)(rnn, ws_gates_, states_t_l_, c_states_t_l_,
+            states_tm1_l_, c_states_tm1_l_, diff_states_t_l_,
+            diff_states_t_lp1_, diff_states_tp1_l_, bias_[0], ws_grid_,
+            ws_cell_);
+
+    if (!rnn.merge_gemm_layer) {
+        // dx = dG * Wx^t
+        (this->*gemm_layer_func)('N', 'N', rnn.slc, rnn.mb,
+                rnn.n_gates * rnn.dic, 1.0, w_layer_[0],
+                rnn.weights_layer_ld, ws_gates_, rnn.gates_ws_ld, 0.0,
+                &diff_states_t_l(rnn.n_states, 0, 0), rnn.states_ws_ld);
+        // dWx += dG^t * x
+        gemm('N', 'T', rnn.n_gates * rnn.dic, rnn.slc, rnn.mb, 1.0, ws_gates_,
+                rnn.gates_ws_ld, states_t_lm1_, rnn.states_ws_ld, 1.0,
+                diff_w_layer_, rnn.diff_weights_layer_ld);
+    }
+    // dh += dGr * Wh^t
+    (this->*gemm_iter_func)('N', 'N', rnn.sic, rnn.mb, rnn.n_gates * rnn.dic,
+            1.0, w_iter_[0], rnn.weights_iter_ld, ws_cell_, rnn.gates_ws_ld,
+            1.0, diff_states_t_l_, rnn.states_ws_ld);
+
+    // dWh += dGr^t * h
+    gemm('N', 'T', rnn.n_gates * rnn.dic, rnn.sic, rnn.mb, 1.0, ws_cell_,
+            rnn.gates_ws_ld, states_tm1_l_, rnn.states_ws_ld, 1.0,
+            diff_w_iter_, rnn.diff_weights_iter_ld);
+
+    // db1-3 += e * dG
+    // db4 += e * (r * dG2)
+    gates_reduction(rnn, ws_gates_, diff_bias_);
+
+    parallel_nd(rnn.dic, [&](int j) {
+        for (int i = 0; i < rnn.mb; i++) {
+            diff_bias_[3 * rnn.dic + j] += ws_gates_r(i, 2, j);
+        }
+    });
+}
+
+#undef AOC
+
+}
+}
+}
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_lstm.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_lstm.cpp
new file mode 100644
index 0000000..3345521
--- /dev/null
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_lstm.cpp
@@ -0,0 +1,147 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
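+//
+// One consistent reading of the forward elemwise below (gate order
+// i, f, c~, o in gate rows 0..3):
+//   i   = logistic(G0 + b0)     f = logistic(G1 + b1)
+//   c~  = tanh(G2 + b2)         o = logistic(G3 + b3)
+//   c_t = f . c_prev + i . c~
+//   h_t = o . tanh(c_t)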
+*******************************************************************************/ + +/* + * Cell execution LSTM + */ + +#include "math_utils.hpp" +#include "mkldnn_thread.hpp" + +#include "../simple_q10n.hpp" +#include "ref_rnn.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +using namespace mkldnn::impl::utils; +using namespace mkldnn::impl::math; +using namespace rnn_utils; + +template <> +rnn_elemwise_sig(ref_rnn_fwd_f32_t::lstm_elemwise) { + ws_gates_aoc_t ws_gates(rnn, ws_gates_); + bias_aoc_t bias(rnn, bias_); + ws_states_aoc_t states_t_l(rnn, states_t_l_); + ws_states_aoc_t c_states_t_l(rnn, c_states_t_l_); + ws_states_aoc_t c_states_tm1_l(rnn, c_states_tm1_l_); + + parallel_nd(rnn.mb, [&](int i) { +// WA. Loss of correctnes in case of simd loop unrolling with icc 18 +#if !defined(__INTEL_COMPILER) + PRAGMA_OMP_SIMD() +#endif + for (int j = 0; j < rnn.dic; j++) { + ws_gates(i, 0, j) = logistic_fwd(ws_gates(i, 0, j) + bias(0, j)); + ws_gates(i, 1, j) = logistic_fwd(ws_gates(i, 1, j) + bias(1, j)); + ws_gates(i, 2, j) = tanh_fwd(ws_gates(i, 2, j) + bias(2, j)); + ws_gates(i, 3, j) = logistic_fwd(ws_gates(i, 3, j) + bias(3, j)); + + float tmp = ws_gates(i, 1, j) * c_states_tm1_l(i, j) + + ws_gates(i, 0, j) * ws_gates(i, 2, j); + states_t_l(i, j) = ws_gates(i, 3, j) * tanh_fwd(tmp); + c_states_t_l(i, j) = tmp; + } + }); +} + +template <> +rnn_elemwise_sig(ref_rnn_fwd_u8s8_t::lstm_elemwise) { + ws_gates_aoc_s32_t ws_gates_s32(rnn, ws_gates_); + bias_aoc_t bias(rnn, bias_); + ws_states_aoc_u8_t states_t_l(rnn, states_t_l_); + ws_states_aoc_t c_states_t_l(rnn, c_states_t_l_); + ws_states_aoc_t c_states_tm1_l(rnn, c_states_tm1_l_); + + float *weights_scales = pd()->attr()->rnn_weights_qparams_.scales_; + float data_shift = pd()->attr()->rnn_data_qparams_.shift_; + float data_scale = pd()->attr()->rnn_data_qparams_.scale_; + round_mode_t rmode = pd()->attr()->round_mode_; + + auto q_d = [&](float f) { + float qf = f * data_scale + data_shift; + return qz_a1b0()(qf, rmode); + }; + + auto deq_w = [&](acc_data_t s, int gate, int j) { + return pd()->attr()->rnn_weights_qparams_.mask_ == 0 ? 
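+            // mask_ == 0 selects a single weights scale for all gates and
+            // channels, otherwise one per (gate, channel). Worked example:
+            // with data_scale = 2 and a weights scale of 4, an s32
+            // accumulator of 800 dequantizes to 800 / (4 * 2) = 100.f.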
+ saturate(s) * (1.f / (weights_scales[0] * data_scale)) : + saturate(s) * (1.f / (weights_scales[gate * rnn.dic + j] + * data_scale)); + }; + + parallel_nd(rnn.mb, [&](int i) { + PRAGMA_OMP_SIMD() + for (int j = 0; j < rnn.dic; j++) { + float G0 = logistic_fwd( + deq_w(ws_gates_s32(i, 0, j), 0, j) + bias(0, j)); + float G1 = logistic_fwd( + deq_w(ws_gates_s32(i, 1, j), 1, j) + bias(1, j)); + float G2 = tanh_fwd( + deq_w(ws_gates_s32(i, 2, j), 2, j) + bias(2, j)); + float G3 = logistic_fwd( + deq_w(ws_gates_s32(i, 3, j), 3, j) + bias(3, j)); + float tmp = G1 * c_states_tm1_l(i, j) + G0 * G2; + states_t_l(i, j) = q_d(G3 * tanh_fwd(tmp)); + c_states_t_l(i, j) = tmp; + } + }); +} + +template <> +rnn_elemwise_sig(ref_rnn_bwd_f32_t::lstm_elemwise) { + ws_gates_aoc_t ws_gates(rnn, ws_gates_); + bias_aoc_t bias(rnn, bias_); + ws_states_aoc_t c_states_t_l(rnn, c_states_t_l_); + ws_states_aoc_t c_states_tm1_l(rnn, c_states_tm1_l_); + ws_diff_states_aoc_t diff_states_t_l(rnn, diff_states_t_l_); + ws_diff_states_aoc_t diff_states_tp1_l(rnn, diff_states_tp1_l_); + ws_diff_states_aoc_t diff_states_t_lp1(rnn, diff_states_t_lp1_); + + parallel_nd(rnn.mb, [&](int i) { + PRAGMA_OMP_SIMD() + for (int j = 0; j < rnn.dic; j++) { + float Ct = c_states_t_l(i, j); + /// @todo save it in the workspace in fwd pass or recompute it to + /// save bw + float tanhCt = tanh_fwd(Ct); + // we have 2 incoming diffs on Ht + float dHt = diff_states_tp1_l(0, i, j) + + diff_states_t_lp1(rnn.n_states, i, j); + float dCt = diff_states_tp1_l(1, i, j) + + one_m_square(tanhCt) * ws_gates(i, 3, j) * dHt; + + float dG1 = c_states_tm1_l(i, j) * dCt + * x_m_square(ws_gates(i, 1, j)); + float dG0 = ws_gates(i, 2, j) * dCt * x_m_square(ws_gates(i, 0, j)); + float dG3 = tanhCt * dHt * x_m_square(ws_gates(i, 3, j)); + float dG2 + = ws_gates(i, 0, j) * dCt * one_m_square(ws_gates(i, 2, j)); + + diff_states_t_l(1, i, j) = dCt * ws_gates(i, 1, j); + + ws_gates(i, 0, j) = dG0; + ws_gates(i, 1, j) = dG1; + ws_gates(i, 2, j) = dG2; + ws_gates(i, 3, j) = dG3; + } + }); +} + +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_rnn.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_rnn.cpp new file mode 100644 index 0000000..4536e8d --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cell_rnn.cpp @@ -0,0 +1,113 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
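+//
+// One consistent reading of the cell below: a single-gate vanilla cell
+//   h_t = f(W*x + U*h_prev + b),   f in {relu, tanh, logistic},
+// whose backward pass scales the incoming dh by the activation
+// derivative via the specializations that follow.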
+*******************************************************************************/
+
+/*
+ * Cell execution of Vanilla RNN
+ */
+
+#include "math_utils.hpp"
+#include "mkldnn_thread.hpp"
+
+#include "ref_rnn.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+using namespace mkldnn::impl::utils;
+using namespace mkldnn::impl::math;
+using namespace rnn_utils;
+
+template <>
+float activation<alg_kind::eltwise_relu, prop_kind::forward>(
+        float dd, float s, float alpha, float clipping) {
+    return relu_fwd(s, alpha);
+}
+
+template <>
+float activation<alg_kind::eltwise_relu, prop_kind::backward>(
+        float dd, float s, float alpha, float clipping) {
+    return relu_bwd(dd, s, alpha);
+}
+
+template <>
+float activation<alg_kind::eltwise_tanh, prop_kind::forward>(
+        float dd, float s, float alpha, float clipping) {
+    return tanh_fwd(s);
+}
+
+template <>
+float activation<alg_kind::eltwise_tanh, prop_kind::backward>(
+        float dd, float s, float alpha, float clipping) {
+    return dd * one_m_square(s);
+}
+
+template <>
+float activation<alg_kind::eltwise_logistic, prop_kind::forward>(
+        float dd, float s, float alpha, float clipping) {
+    return logistic_fwd(s);
+}
+
+template <>
+float activation<alg_kind::eltwise_logistic, prop_kind::backward>(
+        float dd, float s, float alpha, float clipping) {
+    return dd * x_m_square(s);
+}
+
+template <>
+rnn_elemwise_sig(ref_rnn_fwd_f32_t::rnn_elemwise) {
+    ws_gates_aoc_t ws_gates(rnn, ws_gates_);
+    bias_aoc_t bias(rnn, bias_);
+    ws_states_aoc_t states_t_l(rnn, states_t_l_);
+    ws_states_aoc_t states_tm1_l(rnn, states_tm1_l_);
+
+    parallel_nd(rnn.mb, [&](int i) {
+        for (int j = 0; j < rnn.dic; j++) {
+            const float h
+                    = activation_func(0, ws_gates(i, 0, j) + bias(0, j), 0, 0);
+            ws_gates(i, 0, j) = states_t_l(i, j) = h;
+        }
+    });
+}
+
+template <>
+rnn_elemwise_sig(ref_rnn_fwd_u8s8_t::rnn_elemwise) {
+    assert(!"VANILLA RNN int8 is not supported");
+}
+
+template <>
+rnn_elemwise_sig(ref_rnn_bwd_f32_t::rnn_elemwise) {
+    ws_gates_aoc_t ws_gates(rnn, ws_gates_);
+    bias_aoc_t bias(rnn, bias_);
+    ws_states_aoc_t states_t_l(rnn, states_t_l_);
+    ws_states_aoc_t states_tm1_l(rnn, states_tm1_l_);
+    ws_diff_states_aoc_t diff_states_t_l(rnn, diff_states_t_l_);
+    ws_diff_states_aoc_t diff_states_tp1_l(rnn, diff_states_tp1_l_);
+    ws_diff_states_aoc_t diff_states_t_lp1(rnn, diff_states_t_lp1_);
+
+    parallel_nd(rnn.mb, [&](int i) {
+        for (int j = 0; j < rnn.dic; ++j) {
+            const float dH = diff_states_t_lp1(rnn.n_states, i, j)
+                    + diff_states_tp1_l(0, i, j);
+            auto g = ws_gates(i, 0, j);
+            ws_gates(i, 0, j) = activation_func(dH, g, 0, 0);
+        }
+    });
+}
+
+}
+}
+}
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_rnn_pd.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cpu_rnn_pd.hpp
similarity index 63%
rename from inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_rnn_pd.hpp
rename to inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cpu_rnn_pd.hpp
index 3b9317a..12b95c8 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_rnn_pd.hpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/cpu_rnn_pd.hpp
@@ -18,13 +18,14 @@
 #define CPU_RNN_PD_HPP
 
 #include "c_types_map.hpp"
-#include "cpu_engine.hpp"
-#include "cpu_memory.hpp"
-#include "cpu_primitive.hpp"
+#include "../cpu_engine.hpp"
+#include "../cpu_memory.hpp"
+#include "../cpu_primitive.hpp"
 #include "nstl.hpp"
 #include "rnn_pd.hpp"
 #include "type_helpers.hpp"
 #include "utils.hpp"
+#include "rnn_utils.hpp"
 
 namespace mkldnn {
 namespace impl {
@@ -87,10 +88,6 @@ protected:
         using namespace memory_format;
         if (src_layer_pd_.desc()->format == any)
             CHECK(src_layer_pd_.set_format(tnc));
-        if (weights_layer_pd_.desc()->format == any)
-            CHECK(weights_layer_pd_.set_format(ldigo));
-        if (weights_iter_pd_.desc()->format == any)
-            CHECK(weights_iter_pd_.set_format(ldigo));
         if
(dst_layer_pd_.desc()->format == any) CHECK(dst_layer_pd_.set_format(tnc)); @@ -104,14 +101,51 @@ protected: return status::success; } + + status_t check_layout_consistency() { + using namespace memory_format; + using namespace utils; + using namespace data_type; + bool ok = true; + ok = ok && src_layer_pd_.desc()->format == tnc + && dst_layer_pd_.desc()->format == tnc; + ok = ok && IMPLICATION(!src_iter_pd_.is_zero(), + src_iter_pd_.desc()->format == ldsnc) + && IMPLICATION(!dst_iter_pd_.is_zero(), + dst_iter_pd_.desc()->format == ldsnc); + + ok = ok && one_of(weights_layer_pd_.desc()->format, ldigo, rnn_packed) + && one_of(weights_iter_pd_.desc()->format, ldigo, rnn_packed); + ok = ok && IMPLICATION(weights_iter_pd_.desc()->format == rnn_packed, + weights_iter_pd_.desc() + ->layout_desc.rnn_packed_desc.format + == mkldnn_ldigo_p); + ok = ok && IMPLICATION(weights_layer_pd_.desc()->format == rnn_packed, + weights_layer_pd_.desc() + ->layout_desc.rnn_packed_desc.format + == mkldnn_ldigo_p); + + ok = ok && IMPLICATION(!bias_pd_.is_zero(), + bias_pd_.desc()->format == ldgo); + + /* Int8 is supported only for packed weights */ + data_type_t weights_iter_dt = weights_iter_pd_.desc()->data_type; + data_type_t weights_layer_dt = weights_layer_pd_.desc()->data_type; + ok = ok && IMPLICATION(weights_iter_dt == s8, + weights_iter_pd_.desc()->format == rnn_packed); + ok = ok && IMPLICATION(weights_layer_dt == s8, + weights_layer_pd_.desc()->format == rnn_packed); + + return ok ? status::success : status::unimplemented; + } }; struct cpu_rnn_bwd_pd_t : public rnn_bwd_pd_t { using cpu_memory_pd_t = cpu_memory_t::pd_t; cpu_rnn_bwd_pd_t(engine_t *engine, const rnn_desc_t *adesc, - const primitive_attr_t *attr, const rnn_bwd_pd_t *hint_bwd_pd) - : rnn_bwd_pd_t(engine, adesc, attr, hint_bwd_pd) + const primitive_attr_t *attr, const rnn_fwd_pd_t *hint_fwd_pd) + : rnn_bwd_pd_t(engine, adesc, attr, hint_fwd_pd) , src_layer_pd_(engine, &desc_.src_layer_desc) , src_iter_pd_(engine, &desc_.src_iter_desc) , weights_layer_pd_(engine, &desc_.weights_layer_desc) @@ -203,14 +237,22 @@ protected: CHECK(src_layer_pd_.set_format(tnc)); if (diff_src_layer_pd_.desc()->format == any) CHECK(diff_src_layer_pd_.set_format(tnc)); - if (weights_layer_pd_.desc()->format == any) - CHECK(weights_layer_pd_.set_format(ldgoi)); - if (diff_weights_layer_pd_.desc()->format == any) - CHECK(diff_weights_layer_pd_.set_format(ldigo)); - if (weights_iter_pd_.desc()->format == any) - CHECK(weights_iter_pd_.set_format(ldgoi)); - if (diff_weights_iter_pd_.desc()->format == any) - CHECK(diff_weights_iter_pd_.set_format(ldigo)); + if (diff_weights_layer_pd_.desc()->format == any) { + memory_desc_t md = *(diff_weights_layer_pd_.desc()); + md.format = ldigo; + CHECK(memory_desc_wrapper::compute_blocking(md)); + CHECK(rnn_utils::set_good_strides(md)); + cpu_memory_t::pd_t new_pd(engine_, &md); + diff_weights_layer_pd_ = new_pd; + } + if (diff_weights_iter_pd_.desc()->format == any) { + memory_desc_t md = *(diff_weights_iter_pd_.desc()); + md.format = ldigo; + CHECK(memory_desc_wrapper::compute_blocking(md)); + CHECK(rnn_utils::set_good_strides(md)); + cpu_memory_t::pd_t new_pd(engine_, &md); + diff_weights_iter_pd_ = new_pd; + } if (dst_layer_pd_.desc()->format == any) CHECK(dst_layer_pd_.set_format(tnc)); if (diff_dst_layer_pd_.desc()->format == any) @@ -234,6 +276,45 @@ protected: return status::success; } + + status_t check_layout_consistency() { + using namespace memory_format; + using namespace utils; + bool ok = true; + ok = ok && 
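+                // format-tag letters, for reference: t = time, n = batch,
+                // c = channels; l = layers, d = directions, s = states,
+                // g = gates, i = input channels, o = output channels
+                // (ldgoi is the ldigo weights layout with i and o swapped,
+                // i.e. transposed for the backward GEMMs)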
src_layer_pd_.desc()->format == tnc + && dst_layer_pd_.desc()->format == tnc; + ok = ok && IMPLICATION(!src_iter_pd_.is_zero(), + src_iter_pd_.desc()->format == ldsnc) + && IMPLICATION(!dst_iter_pd_.is_zero(), + dst_iter_pd_.desc()->format == ldsnc); + + ok = ok && one_of(weights_layer_pd_.desc()->format, ldgoi, rnn_packed) + && one_of(weights_iter_pd_.desc()->format, ldgoi, rnn_packed); + ok = ok && IMPLICATION(weights_iter_pd_.desc()->format == rnn_packed, + weights_iter_pd_.desc() + ->layout_desc.rnn_packed_desc.format + == mkldnn_ldgoi_p); + ok = ok && IMPLICATION(weights_layer_pd_.desc()->format == rnn_packed, + weights_layer_pd_.desc() + ->layout_desc.rnn_packed_desc.format + == mkldnn_ldgoi_p); + + ok = ok && IMPLICATION(!bias_pd_.is_zero(), + bias_pd_.desc()->format == ldgo); + + ok = ok && diff_src_layer_pd_.desc()->format == tnc + && diff_dst_layer_pd_.desc()->format == tnc; + ok = ok && IMPLICATION(!diff_states_pd_.is_zero(), + diff_states_pd_.desc()->format == ldsnc) + && IMPLICATION(!diff_dst_iter_pd_.is_zero(), + diff_dst_iter_pd_.desc()->format == ldsnc); + ok = ok && diff_weights_layer_pd_.desc()->format == ldigo + && diff_weights_iter_pd_.desc()->format == ldigo; + ok = ok && IMPLICATION(!diff_bias_pd_.is_zero(), + diff_bias_pd_.desc()->format == ldgo); + + return ok ? status::success : status::unimplemented; + } }; } } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/jit_uni_rnn_postgemm.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/jit_uni_rnn_postgemm.hpp new file mode 100644 index 0000000..048264c --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/jit_uni_rnn_postgemm.hpp @@ -0,0 +1,424 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
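+//
+// This header JITs the LSTM "postgemm" stage: the work left after the two
+// GEMMs, i.e. bias addition, the four gate activations and the c/h state
+// update, fused into a single pass over dic elements. It mirrors the
+// scalar lstm_elemwise in cell_lstm.cpp, plus the int8 (de)quantization.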
+*******************************************************************************/ + +/* + * Cell execution LSTM + */ + +#include "rnn_utils.hpp" +#include "../jit_generator.hpp" +#include "../jit_uni_eltwise.hpp" +#include "c_types_map.hpp" +#include "utils.hpp" + +#include "mkldnn_thread.hpp" + + +namespace mkldnn { +namespace impl { +namespace cpu { + +struct jit_uni_rnn_postgemm_kernel : public jit_generator { + + typedef void (*kernel_t)(void *gates_, const void *bias, void *states_t_l_, + void *c_states_t_l_, void *c_states_tm1_l_); + + jit_uni_rnn_postgemm_kernel(const rnn_utils::rnn_conf_t &rnn, const primitive_attr_t *attr): rnn_(rnn), attr_(attr){} + + virtual void init() = 0; + +template + rnn_elemwise_sig(execute) { + rnn_utils::ws_gates_aoc ws_gates(rnn, ws_gates_); + rnn_utils::bias_aoc_t bias(rnn, bias_); + rnn_utils::ws_states_aoc states_t_l(rnn, states_t_l_); + rnn_utils::ws_states_aoc_t c_states_t_l(rnn, c_states_t_l_); + rnn_utils::ws_states_aoc_t c_states_tm1_l(rnn, c_states_tm1_l_); + + // Todo: add parallelization on dic for the batch 1 case + // Assumption: the kernel runs a loop on dic elements + parallel_nd(rnn.mb, [&](int i) { + auto b_ = &bias(0, 0); + auto g_ = &ws_gates(i, 0, 0); + auto s_tl_ = &states_t_l(i, 0); + auto c_tl_ = &c_states_t_l(i, 0); + auto c_tm1l_ = &c_states_tm1_l(i, 0); + kernel_(g_, b_, s_tl_, c_tm1l_, c_tl_); + }); + } + +protected: + kernel_t kernel_; + const rnn_utils::rnn_conf_t &rnn_; + const primitive_attr_t *attr_; +}; + +template +struct jit_uni_lstm_postgemm_kernel_fwd: public jit_uni_rnn_postgemm_kernel +{ + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_lstm_postgemm_kernel_fwd) + + typedef typename utils::conditional::type acc_data_t; + typedef typename utils::conditional, + jit_uni_eltwise_injector_f32>::type injector_t; + + jit_uni_lstm_postgemm_kernel_fwd(const rnn_utils::rnn_conf_t &rnn, const primitive_attr_t *attr) + : jit_uni_rnn_postgemm_kernel(rnn, attr){} + + void init() override { + // we use rax for both constant tables as they use the same table + sigmoid_injector_ = new injector_t(this, + alg_kind::eltwise_logistic, 0.0f, 0.0f, true, rax); + tanh_injector_ = new injector_t(this, + alg_kind::eltwise_tanh, 0.0f, 0.0f, true, rax); + generate(); + kernel_ = (kernel_t) this->getCode(); + } + +protected: + injector_t *sigmoid_injector_; + injector_t *tanh_injector_; + + // register size in bytes + using Vmm = typename jit_uni_eltwise_injector_f32::Vmm; + size_t vlen = cpu_isa_traits::vlen; + size_t vlen_dst = (src_data_t == data_type::u8) ? vlen/4 : vlen; + size_t cstate_dt_size = sizeof(float); + size_t hstate_dt_size = (src_data_t == data_type::u8) ? sizeof(uint8_t) : sizeof(float); + size_t gate_dt_size = (src_data_t == data_type::u8) ? 
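+            // e.g. on avx2 (vlen = 32) the u8 path reads 8 s32 gate values
+            // (32 bytes) per step but stores 8 u8 states, so vlen_dst =
+            // vlen / 4 = 8 bytes; in f32 mode loads and stores match: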
sizeof(uint32_t) : sizeof(float); + size_t qscale_dt_size = sizeof(float); + size_t bias_dt_size = sizeof(float); + + void generate() { + using namespace Xbyak; + + int mask = attr_->rnn_weights_qparams_.mask_; + float *weights_scales = attr_->rnn_weights_qparams_.scales_; + float data_scale = attr_->rnn_data_qparams_.scale_; + float data_shift = attr_->rnn_data_qparams_.shift_; + round_mode_t rmode = attr_->round_mode_; + + // Labels declaration + Label vector_loop_start_label, vector_loop_end_label; + Label rem_loop_start_label, rem_loop_end_label; + Label table_label; + + // Register map + Reg64 loop_cnt(r11); // loop counter + Reg64 table_reg(rbx); // table is used for data scale and shifts + Reg64 tmp_reg(r12); // used as temporary to customize mxcsr + Reg64 weights_scales_reg(r13); + // We skip vmm0 as it can be used by the injector for masks on sse4.2 + Vmm G0(1), G1(2), G2(3), G3(4), tmp1_vmm(5), tmp2_vmm(6), zero_vmm(7); + + // stack map + Address saved_csr_addr = ptr[rsp]; + Address modified_csr_addr = ptr[rsp + sizeof(int64_t)]; + size_t stack_size = 2 * sizeof(int64_t); + + // constant table map + Address dscale_off_addr = ptr[table_reg]; + Address dshift_off_addr = ptr[table_reg + vlen]; + Address ymm_perm_mask_addr = ptr[table_reg + 2*vlen]; + Address zmm_perm_mask_addr = ptr[table_reg + 2*vlen + cpu_isa_traits::vlen]; + + // quantize from float to u8 + auto q_d = [&](Vmm f, Vmm tmp_vmm, Reg64 tmp_reg) { + sub(rsp, stack_size); + stmxcsr(saved_csr_addr); // save the mxcsr + + // set the rounding mode appropriatly + mov(tmp_reg, saved_csr_addr); + and_(tmp_reg, 0xffff9fff); // clear rc bits (rc = RNE) + if (rmode == round_mode::down) + or_(tmp_reg, 0x00002000); // set rc=01 if RD + mov(modified_csr_addr, tmp_reg); + ldmxcsr(modified_csr_addr); + + uni_vpxor(tmp_vmm, tmp_vmm, tmp_vmm); + uni_vmulps(f, f, dscale_off_addr); // apply scale + uni_vaddps(f, f, dshift_off_addr); // apply shift + uni_vcvtps2dq(f, f); // convert to int32 with mxcsr rounding + uni_vpackssdw(f, f, tmp_vmm); // convert from s32 to s16 + uni_vpackuswb(f, f, tmp_vmm); // convert from s16 to u8 with saturation + // Note that the results are interleaved by 128 bit chunks, so we need to merge them together + switch (vlen) { + case 64: { //avx512 + Zmm fz(f.getIdx()), tmpz(tmp_vmm.getIdx()); + uni_vmovups(tmpz, zmm_perm_mask_addr); + vpermd(fz, tmpz, fz); + break; } + case 32: { //avx + Ymm fy(f.getIdx()), tmpy(tmp_vmm.getIdx()); + uni_vmovups(tmpy, ymm_perm_mask_addr); + vpermd(fy, tmpy, fy); + break; } + case 16: // sse: nothing to do + break; + default: assert(!"Unsupported case"); + }; + + ldmxcsr(saved_csr_addr); // restore the original mxcsr + add(rsp, stack_size); + }; + + auto fast_recip =[&](Vmm s, Vmm tmp, bool packed) { + if (packed) + uni_vrcpps(tmp, s); + else + uni_vrcpss(tmp, s); // prevent divide by zero + // we add one Newton iteration + uni_vmulps(s, s, tmp); + uni_vmulps(s, s, tmp); // s <- s * tmp^2 + uni_vaddps(tmp, tmp, tmp); + uni_vsubps(tmp, tmp, s); + uni_vmovups(s, tmp); // s <- 2 * tmp - s * tmp^2 + }; + + // dequantize from s32 to float + auto deq_w = [&](Vmm s, Vmm tmp1, Vmm tmp2, int gate, bool packed) { + // TODO: if mask is 0 precompute mul and inverse + if (mask == 0) + uni_vbroadcastss(tmp1, ptr[weights_scales_reg]); + else + uni_vmovups(tmp1, ptr[weights_scales_reg + gate * rnn_.dic * qscale_dt_size]); + uni_vcvtdq2ps(s, s); + uni_vmulps(tmp1, tmp1, dscale_off_addr); + fast_recip(tmp1, tmp2, packed); + uni_vmulps(s, s, tmp1); + }; + + // We start code generations here + 
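+        // note on fast_recip above: uni_vrcpps gives an ~12-bit estimate
+        // of 1/s; the extra multiplies are one Newton-Raphson step,
+        // x1 = x0 * (2 - s * x0), which roughly doubles the correct bits.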
preamble(); + + // extract addresses passed as parameter +#ifdef _WIN32 + auto addr_ws_gates_reg = abi_param1; + auto addr_bias_reg = abi_param2; + auto addr_states_t_l_reg = abi_param3; + auto addr_c_states_tm1_l_reg = abi_param4; + auto addr_c_states_t_l_reg = r10; + // Here we cannot use rbp to have initial stack pointer so we + // use rsp and offset it with the size of pushed registers in + // preamble + mov(addr_c_states_t_l_reg, ptr[rsp + get_size_of_abi_save_regs() + 40]); +#else + auto addr_ws_gates_reg = abi_param1; + auto addr_bias_reg = abi_param2; + auto addr_states_t_l_reg = abi_param3; + auto addr_c_states_tm1_l_reg = abi_param4; + auto addr_c_states_t_l_reg = abi_param5; +#endif + + // initialize registers with addresses and constants + mov(table_reg, table_label); + mov(weights_scales_reg, size_t(weights_scales)); + // both sigmoid and tanh use the same table so load address just once in rax + sigmoid_injector_->load_table_addr(); + + mov(loop_cnt, rnn_.dic * gate_dt_size); + cmp(loop_cnt, vlen); + jl(vector_loop_end_label, Xbyak::CodeGenerator::T_NEAR); + + L(vector_loop_start_label); + { + // load G0 G1 G2 G3 + uni_vmovups(G0, ptr[addr_ws_gates_reg + 0 * rnn_.dic * gate_dt_size]); + uni_vmovups(G1, ptr[addr_ws_gates_reg + 1 * rnn_.dic * gate_dt_size]); + uni_vmovups(G2, ptr[addr_ws_gates_reg + 2 * rnn_.dic * gate_dt_size]); + uni_vmovups(G3, ptr[addr_ws_gates_reg + 3 * rnn_.dic * gate_dt_size]); + + // dequantize the gates from s32 to f32 if needed + if (src_data_t == data_type::u8){ + deq_w(G0, tmp1_vmm, tmp2_vmm, 0, true); + deq_w(G1, tmp1_vmm, tmp2_vmm, 1, true); + deq_w(G2, tmp1_vmm, tmp2_vmm, 2, true); + deq_w(G3, tmp1_vmm, tmp2_vmm, 3, true); + } + + // add biases + uni_vaddps(G0, G0, ptr[addr_bias_reg + 0 * rnn_.dic * bias_dt_size]); + uni_vaddps(G1, G1, ptr[addr_bias_reg + 1 * rnn_.dic * bias_dt_size]); + uni_vaddps(G2, G2, ptr[addr_bias_reg + 2 * rnn_.dic * bias_dt_size]); + uni_vaddps(G3, G3, ptr[addr_bias_reg + 3 * rnn_.dic * bias_dt_size]); + + // inject eltwise code + sigmoid_injector_->compute_vector(G0.getIdx()); + sigmoid_injector_->compute_vector(G1.getIdx()); + tanh_injector_->compute_vector(G2.getIdx()); + sigmoid_injector_->compute_vector(G3.getIdx()); + + // compute c_states_t_l = G1 * c_tm1_l + G0 * G2 + uni_vmovups(tmp1_vmm, ptr[addr_c_states_tm1_l_reg]); + uni_vmulps(tmp1_vmm, tmp1_vmm, G1); + uni_vfmadd231ps(tmp1_vmm, G0, G2); + uni_vmovups(ptr[addr_c_states_t_l_reg], tmp1_vmm); + + // states_t_l = G3 * tanh(c_states_t_l) + tanh_injector_->compute_vector(tmp1_vmm.getIdx()); + uni_vmulps(tmp1_vmm, tmp1_vmm, G3); + + // if int8, we quantize the resulting state + if (src_data_t == data_type::u8) { + q_d(tmp1_vmm, tmp2_vmm, tmp_reg); + } + + // write back the result + if(vlen_dst == vlen) + uni_vmovups(ptr[addr_states_t_l_reg], tmp1_vmm); + else + // we write only 1/4 of the register + switch(vlen_dst){ + case 16: uni_vmovups(ptr[addr_states_t_l_reg], Xmm(tmp1_vmm.getIdx())); break; + case 8: uni_vmovsd(ptr[addr_states_t_l_reg], Xmm(tmp1_vmm.getIdx())); break; + case 4: uni_vmovss(ptr[addr_states_t_l_reg], Xmm(tmp1_vmm.getIdx())); break; + default: + assert(!"Unsuported vector length for quantization"); + } + + // increment address pointers + add(addr_ws_gates_reg, vlen); + add(addr_bias_reg, vlen); + add(addr_states_t_l_reg, vlen_dst); + add(addr_c_states_tm1_l_reg, vlen); + add(addr_c_states_t_l_reg, vlen); + if (mask != 0) + add(weights_scales_reg, vlen); + + // increment loop counter + sub(loop_cnt, vlen); + cmp(loop_cnt, vlen); + 
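+            // loop_cnt counts remaining gate bytes (rnn_.dic *
+            // gate_dt_size): stay in the vector loop while a full vector
+            // remains, then fall through to the scalar remainder loop below.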
jge(vector_loop_start_label); + } + L(vector_loop_end_label); + + cmp(loop_cnt, 0); + je(rem_loop_end_label, Xbyak::CodeGenerator::T_NEAR); + // Same code as above, we just use movuss for accessing inputs + // TODO: smarter handling of tails with Zmm -> Ymm -> Xmm -> scalar + L(rem_loop_start_label); + { + // remaping registers to Xmms + Xmm G0s(G0.getIdx()), G1s(G1.getIdx()), G2s(G2.getIdx()), G3s(G3.getIdx()); + Xmm tmp1s_vmm(tmp1_vmm.getIdx()); + + // load G0 G1 G2 G3 + uni_vmovss(G0s, ptr[addr_ws_gates_reg + 0 * rnn_.dic * gate_dt_size]); + uni_vmovss(G1s, ptr[addr_ws_gates_reg + 1 * rnn_.dic * gate_dt_size]); + uni_vmovss(G2s, ptr[addr_ws_gates_reg + 2 * rnn_.dic * gate_dt_size]); + uni_vmovss(G3s, ptr[addr_ws_gates_reg + 3 * rnn_.dic * gate_dt_size]); + + // dequantize the gates from s32 to f32 if needed + if (src_data_t == data_type::u8){ + deq_w(G0, tmp1_vmm, tmp2_vmm, 0, false); + deq_w(G1, tmp1_vmm, tmp2_vmm, 1, false); + deq_w(G2, tmp1_vmm, tmp2_vmm, 2, false); + deq_w(G3, tmp1_vmm, tmp2_vmm, 3, false); + } + + // add biases + uni_vmovss(tmp1s_vmm, ptr[addr_bias_reg + 0 * rnn_.dic * bias_dt_size]); + uni_vaddps(G0s, G0s, tmp1s_vmm); + uni_vmovss(tmp1s_vmm, ptr[addr_bias_reg + 1 * rnn_.dic * bias_dt_size]); + uni_vaddps(G1s, G1s, tmp1s_vmm); + uni_vmovss(tmp1s_vmm, ptr[addr_bias_reg + 2 * rnn_.dic * bias_dt_size]); + uni_vaddps(G2s, G2s, tmp1s_vmm); + uni_vmovss(tmp1s_vmm, ptr[addr_bias_reg + 3 * rnn_.dic * bias_dt_size]); + uni_vaddps(G3s, G3s, tmp1s_vmm); + + // inject eltwise code + sigmoid_injector_->compute_vector(G0s.getIdx()); + sigmoid_injector_->compute_vector(G1s.getIdx()); + tanh_injector_->compute_vector(G2s.getIdx()); + sigmoid_injector_->compute_vector(G3s.getIdx()); + + // compute c_states_t_l = G1 * c_tm1_l + G0s * G2 + uni_vmovups(tmp1s_vmm, ptr[addr_c_states_tm1_l_reg]); + uni_vmulps(tmp1s_vmm, tmp1s_vmm, G1s); + uni_vfmadd231ps(tmp1s_vmm, G0s, G2s); + uni_vmovss(ptr[addr_c_states_t_l_reg], tmp1s_vmm); + + // states_t_l = G3 * tanh(c_states_t_l) + tanh_injector_->compute_vector(tmp1s_vmm.getIdx()); + uni_vmulps(tmp1s_vmm, tmp1s_vmm, G3s); + + // if int8, we quantize the resulting state + if (src_data_t == data_type::u8) { + q_d(tmp1_vmm, tmp2_vmm, tmp_reg); + } + + // write back the result + if(vlen_dst == vlen) + uni_vmovups(ptr[addr_states_t_l_reg], tmp1s_vmm); + else + // we write only 1/4 of the register + switch(vlen_dst){ + case 16: uni_vmovups(ptr[addr_states_t_l_reg], Xmm(tmp1s_vmm.getIdx())); break; + case 8: uni_vmovsd(ptr[addr_states_t_l_reg], Xmm(tmp1s_vmm.getIdx())); break; + case 4: uni_vmovss(ptr[addr_states_t_l_reg], Xmm(tmp1s_vmm.getIdx())); break; + default: + assert(!"Unsuported vector length for quantization"); + } + + // increment address pointers + add(addr_ws_gates_reg, gate_dt_size); + add(addr_bias_reg, bias_dt_size); + add(addr_states_t_l_reg, hstate_dt_size); + add(addr_c_states_tm1_l_reg, cstate_dt_size); + add(addr_c_states_t_l_reg, cstate_dt_size); + if (mask != 0) + add(weights_scales_reg, qscale_dt_size); + + // increment loop counter + sub(loop_cnt, gate_dt_size); + cmp(loop_cnt, 0); + jg(rem_loop_start_label); + + } + L(rem_loop_end_label); + + postamble(); + + // Again, only one table is needed and shared between sigmoid and tanh + sigmoid_injector_->prepare_table(false); + tanh_injector_->prepare_table(true); + + L(table_label); + { + for (size_t i = 0; i < vlen / sizeof(float); i++) dd(float2int(data_scale)); + for (size_t i = 0; i < vlen / sizeof(float); i++) dd(float2int(data_shift)); + // perm mask for ymm + dd(0); dd(4); 
dd(2); dd(3); dd(1); dd(5); dd(6); dd(7); + // perm mask for zmm + dd(0); dd(4); dd(8); dd(12); dd(1); dd(5); dd(6); dd(7); + dd(2); dd(9); dd(10); dd(11); dd(3); dd(12); dd(13); dd(14); + } + } + +}; + +template struct jit_uni_lstm_postgemm_kernel_fwd; +template struct jit_uni_lstm_postgemm_kernel_fwd; +template struct jit_uni_lstm_postgemm_kernel_fwd; + +template struct jit_uni_lstm_postgemm_kernel_fwd; +template struct jit_uni_lstm_postgemm_kernel_fwd; +template struct jit_uni_lstm_postgemm_kernel_fwd; +} +} +} diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/ref_rnn.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/ref_rnn.cpp new file mode 100644 index 0000000..1e88713 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/ref_rnn.cpp @@ -0,0 +1,807 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* + General architecture + + for diff states, we have n_states + 1 as we have n_states diff + to propagate to the previous iteration and 1 states to propagate + to the previous layer + index 0 is dh for cell(t-1, l) to consume + index 1 is dc for cell(t-1, l) to consume + index 2 is dh for cell(t, l-1) to consume + this indexing enables to have the same indexing for states in elemwise + function + only the cell execution function should be impacted + + */ + +#include "math_utils.hpp" +#include "mkldnn_thread.hpp" + +#include "ref_rnn.hpp" +#include "../gemm/gemm.hpp" +#include "../simple_q10n.hpp" + +namespace mkldnn { +namespace impl { +namespace cpu { + +using namespace mkldnn::impl::utils; +using namespace mkldnn::impl::memory_tracking::names; +using namespace rnn_utils; +#define AOC array_offset_calculator + +template +void _ref_rnn_common_t::gates_reduction( + const rnn_conf_t &rnn, const acc_data_t *ws_gates_, + float *diff_bias_) const { + auto body = [&](int i, int k) { + for (int j = 0; j < rnn.mb; j++) + diff_bias_[i * rnn.dic + k] + += ws_gates_[j * rnn.gates_ws_ld + i * rnn.dic + k]; + }; + + // @todo block k on simd-width +#if MKLDNN_THR == MKLDNN_THR_OMP && _OPENMP >= 201307 \ + /* icc 17.0 has a problem with simd collapse */ \ + && !((defined __INTEL_COMPILER) && (__INTEL_COMPILER == 1700)) +#pragma omp parallel for simd collapse(2) + for (int i = 0; i < rnn.n_gates; i++) + for (int k = 0; k < rnn.dic; k++) + body(i, k); +#else + parallel_nd(rnn.n_gates, rnn.dic, body); +#endif +} + +template +rnn_gemm_sig((_ref_rnn_common_t::gemm)) { + assert(ldA * ldB * ldC != 0); + extended_sgemm(&transA, &transB, &m, &n, &k, &alpha, a_, &ldA, b_, &ldB, + &beta, c_, &ldC, nullptr, pd()->rnn_.use_jit_gemm); +} + +template <> +rnn_gemm_sig((ref_rnn_fwd_u8s8_t::gemm)) { + assert(!"non packed gemm is disabled for int8"); +} + +template +rnn_gemm_sig((_ref_rnn_common_t::packed_gemm)) { +#if (USE_MKL_PACKED_GEMM) + assert(transA == 'N'); + 
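+    // assuming the usual MKL packed-GEMM flow: a_ points at a weights
+    // matrix packed once up front (cblas_sgemm_pack), so each call here
+    // only streams b_ and c_ against the prepacked buffer.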
cblas_sgemm_compute(CblasColMajor, CblasPacked, + (transB == 'T') ? CblasTrans : CblasNoTrans, m, n, k, a_, ldA, b_, + ldB, beta, c_, ldC); +#else + UNUSED(transA); + UNUSED(transB); + UNUSED(m); + UNUSED(n); + UNUSED(k); + UNUSED(alpha); + UNUSED(ldA); + UNUSED(b_); + UNUSED(ldB); + UNUSED(beta); + UNUSED(c_); + UNUSED(ldC); + assert(!"packed gemm is disabled"); +#endif +} + +template <> +rnn_gemm_sig((ref_rnn_fwd_u8s8_t::packed_gemm)) { +#if (USE_MKL_PACKED_GEMM) + int8_t offseta = 0, offsetb = 0; + int32_t offsetc = 0; + cblas_gemm_s8u8s32_compute(CblasColMajor, (CBLAS_TRANSPOSE)CblasPacked, + CblasNoTrans, CblasFixOffset, m, n, k, alpha, a_, ldA, offseta, b_, + ldB, offsetb, beta, c_, ldC, &offsetc); +#else + UNUSED(transA); + UNUSED(transB); + UNUSED(m); + UNUSED(n); + UNUSED(k); + UNUSED(alpha); + UNUSED(ldA); + UNUSED(b_); + UNUSED(ldB); + UNUSED(beta); + UNUSED(c_); + UNUSED(ldC); + assert(!"packed gemm is disabled"); +#endif +} + +//*************** Grid computations strategy: linear ***************// +template +rnn_grid_execution_sig( + (_ref_rnn_common_t::linear_execution)) { + AOC ws_states(ws_states_, rnn.n_layer + 1, rnn.n_dir, + rnn.n_iter + 1, rnn.states_nld * rnn.states_ws_ld); + AOC ws_c_states(ws_c_states_, rnn.n_layer + 1, rnn.n_dir, + rnn.n_iter + 1, rnn.states_nld * rnn.states_ws_ld); + AOC ws_diff_states(ws_diff_states_, rnn.n_layer + 1, rnn.n_dir, + (rnn.n_states + 1), rnn.n_iter + 1, + rnn.states_nld * rnn.states_ws_ld); + AOC ws_gates(ws_gates_, rnn.n_layer, rnn.n_dir, rnn.n_iter, + rnn.gates_nld * rnn.gates_ws_ld); + AOC weights_input( + weights_layer_, rnn.n_layer, rnn.n_dir, rnn.n_parts_weights_layer); + AOC weights_states( + weights_states_, rnn.n_layer, rnn.n_dir, rnn.n_parts_weights_iter); + AOC bias( + bias_, rnn.n_layer, rnn.n_dir, rnn.n_parts_bias); + AOC diff_weights_layer(diff_weights_layer_, rnn.n_layer, + rnn.n_dir, + rnn.diff_weights_layer_nld * rnn.diff_weights_layer_ld); + AOC diff_weights_iter(diff_weights_iter_, rnn.n_layer, rnn.n_dir, + rnn.diff_weights_iter_nld * rnn.diff_weights_iter_ld); + AOC diff_bias( + diff_bias_, rnn.n_layer, rnn.n_dir, rnn.n_bias * rnn.dic); + AOC ws_grid( + ws_grid_, rnn.n_layer, rnn.n_dir, rnn.n_iter, (int)rnn.ws_per_cell); + + // We run the grid of computation + for (int dir = 0; dir < rnn.n_dir; dir++) { + for (int j = 0; j < rnn.n_layer; j++) { + int lay = (aprop == prop_kind::forward) ? j : rnn.n_layer - j - 1; + + if ((aprop == prop_kind::forward) && rnn.merge_gemm_layer) { + (this->*gemm_layer_func)('N', 'N', rnn.n_gates * rnn.dic, + rnn.mb * rnn.n_iter, rnn.slc, 1.0, + weights_input(lay, dir, 0), rnn.weights_iter_ld, + &(ws_states(lay, dir, 1, 0)), rnn.states_ws_ld, 0.0, + &(ws_gates(lay, dir, 0, 0)), rnn.gates_ws_ld); + } + + for (int i = 0; i < rnn.n_iter; i++) { + int iter = (aprop == prop_kind::forward) ? 
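+                        // backward visits layers and time steps in reverse;
+                        // the merged layer GEMM above runs once per layer on
+                        // forward, batching all time steps, since the layer
+                        // input does not depend on the recurrent state --
+                        // backward merges it after the time loop below.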
i : rnn.n_iter - i - 1; + (this->*cell_func)(rnn, + &(ws_states(lay + 1, dir, iter + 1, 0)), + &(ws_c_states(lay + 1, dir, iter + 1, 0)), + &(ws_diff_states(lay, dir, 0, iter, 0)), + &(weights_input(lay, dir, 0)), + &(weights_states(lay, dir, 0)), + &(bias(lay, dir, 0)), + &(ws_states(lay, dir, iter + 1, 0)), + &(ws_states(lay + 1, dir, iter, 0)), + &(ws_c_states(lay + 1, dir, iter, 0)), + &(ws_diff_states(lay + 1, dir, 0, iter, 0)), + &(ws_diff_states(lay, dir, 0, iter + 1, 0)), + &(diff_weights_layer(lay, dir, 0)), + &(diff_weights_iter(lay, dir, 0)), + &(diff_bias(lay, dir, 0)), + &(ws_gates(lay, dir, iter, 0)), + &(ws_grid(lay, dir, iter, 0)), + ws_cell_); + } + + if ((aprop == prop_kind::backward) && rnn.merge_gemm_layer) { + (this->*gemm_layer_func)('N', 'N', rnn.slc, rnn.mb * rnn.n_iter, + rnn.n_gates * rnn.dic, 1.0, weights_input(lay, dir, 0), + rnn.weights_layer_ld, + (src_data_t *)(&(ws_gates(lay, dir, 0, 0))), + rnn.gates_ws_ld, 0.0, + (acc_data_t *)(&(ws_diff_states( + lay, dir, rnn.n_states, 0, 0))), + rnn.states_ws_ld); + gemm('N', 'T', rnn.n_gates * rnn.dic, rnn.slc, + rnn.mb * rnn.n_iter, 1.0, + (weights_data_t *)(&(ws_gates(lay, dir, 0, 0))), + rnn.gates_ws_ld, + (src_data_t *)(&(ws_states(lay, dir, 1, 0))), + rnn.states_ws_ld, 1.0, + (acc_data_t *)(&(diff_weights_layer(lay, dir, 0))), + rnn.diff_weights_layer_ld); + } + if ((aprop == prop_kind::backward) && rnn.merge_gemm_iter) { + gemm('N', 'T', rnn.n_gates * rnn.dic, rnn.sic, + rnn.mb * rnn.n_iter, 1.0, + (weights_data_t *)(&(ws_gates(lay, dir, 0, 0))), + rnn.gates_ws_ld, + (src_data_t *)(&(ws_states(lay + 1, dir, 0, 0))), + rnn.states_ws_ld, 1.0, + (acc_data_t *)(&(diff_weights_iter(lay, dir, 0))), + rnn.diff_weights_iter_ld); + } + } + } +} + +//********* GRID computations strategy: utility functions **********// + +template +void _ref_rnn_common_t::copy_init_layer( + const rnn_conf_t &rnn, src_data_t *__restrict ws_states_, + float *__restrict ws_diff_states_, const src_data_t *__restrict xt_, + const float *__restrict diff_dst_layer_) const { + + AOC ws_states( + ws_states_, rnn.n_dir, rnn.n_iter + 1, rnn.mb, rnn.states_ws_ld); + auto xt_d = memory_desc_wrapper(pd()->src_pd(0)); + + parallel_nd(rnn.n_iter, rnn.mb, [&](int it, int b) { + auto xxt = xt_ + xt_d.blk_off(it, b); + src_data_t *ws_l2r_ptr = &(ws_states(0, it + 1, b, 0)); + src_data_t *ws_r2l_ptr = &(ws_states(rnn.n_dir - 1, rnn.n_iter - it, b, 0)); + if (rnn.exec_dir != r2l) + for (int c = 0; c < rnn.slc; c++) + ws_l2r_ptr[c] = xxt[c]; + if (rnn.exec_dir != l2r) + for (int c = 0; c < rnn.slc; c++) + ws_r2l_ptr[c] = xxt[c]; + }); +} + +template <> +void ref_rnn_bwd_f32_t::copy_init_layer(const rnn_conf_t &rnn, + src_data_t *ws_states_, float *ws_diff_states_, const src_data_t *xt_, + const float *diff_dst_layer_) const { + AOC ws_diff_states(ws_diff_states_, rnn.n_layer + 1, rnn.n_dir, + (rnn.n_states + 1), rnn.n_iter + 1, rnn.mb, rnn.states_ws_ld); + auto diff_dst_layer_d = memory_desc_wrapper(pd()->diff_dst_pd(0)); + + switch (rnn.exec_dir) { + case bi_concat: + parallel_nd(rnn.n_iter, rnn.mb, [&](int it, int b) { + auto diff_dst_layer_x + = diff_dst_layer_ + diff_dst_layer_d.blk_off(it, b); + for (int s = 0; s < rnn.dic; s++) { + ws_diff_states(rnn.n_layer, 0, rnn.n_states, it, b, s) + = diff_dst_layer_x[s]; + ws_diff_states( + rnn.n_layer, 1, rnn.n_states, rnn.n_iter - it - 1, b, s) + = diff_dst_layer_x[rnn.dic + s]; + } + }); + break; + case bi_sum: + parallel_nd(rnn.n_iter, rnn.mb, [&](int it, int b) { + auto diff_dst_layer_x + = diff_dst_layer_ + 
diff_dst_layer_d.blk_off(it, b); + for (int s = 0; s < rnn.dic; s++) { + ws_diff_states(rnn.n_layer, 0, rnn.n_states, it, b, s) + = diff_dst_layer_x[s]; + ws_diff_states( + rnn.n_layer, 1, rnn.n_states, rnn.n_iter - it - 1, b, s) + = diff_dst_layer_x[s]; + } + }); + break; + case l2r: + parallel_nd(rnn.n_iter, rnn.mb, [&](int it, int b) { + auto diff_dst_layer_x + = diff_dst_layer_ + diff_dst_layer_d.blk_off(it, b); + for (int s = 0; s < rnn.dic; s++) { + ws_diff_states(rnn.n_layer, 0, rnn.n_states, it, b, s) + = diff_dst_layer_x[s]; + } + }); + break; + case r2l: + parallel_nd(rnn.n_iter, rnn.mb, [&](int it, int b) { + auto diff_dst_layer_x = diff_dst_layer_ + + diff_dst_layer_d.blk_off(rnn.n_iter - it - 1, b); + for (int s = 0; s < rnn.dic; s++) { + ws_diff_states(rnn.n_layer, 0, rnn.n_states, it, b, s) + = diff_dst_layer_x[s]; + } + }); + break; + default: assert(!"Unsupported direction"); break; + } +} + +/* For int8 configuration, input iteration states may be of types f32 or u8 + * Internally h_state is always stored in u8 and c_state is always stored in f32 + * If input states are of type u8 then h state is copied and c state is dequantized + * If input states are of type f32 then h state is quantized and c_state is copied + * */ +template +template +void _ref_rnn_common_t::copy_init_iter( + const rnn_conf_t &rnn, src_data_t *__restrict ws_states_, + float *__restrict ws_c_states_, float *__restrict ws_diff_states_, + const input_data_t *__restrict firstit_states_, + const float *__restrict diff_dst_iter_) const { + AOC ws_states(ws_states_, rnn.n_layer + 1, rnn.n_dir, + rnn.n_iter + 1, rnn.mb, rnn.states_ws_ld); + AOC ws_c_states(ws_c_states_, rnn.n_layer + 1, rnn.n_dir, + rnn.n_iter + 1, rnn.mb, rnn.states_ws_ld); + float data_shift = pd()->attr()->rnn_data_qparams_.shift_; + float data_scale = pd()->attr()->rnn_data_qparams_.scale_; + round_mode_t rmode = pd()->attr()->round_mode_; + + const bool quantize + = pd()->desc()->src_iter_desc.data_type == data_type::f32 + && rnn.dt_conf != all_f32; + auto maybe_q = [&](input_data_t f) { + if (quantize) { + float qf = f * data_scale + data_shift; + return qz_a1b0()(qf, rmode); + } else + return (src_data_t)f; + }; + + const bool dequantize + = pd()->desc()->src_iter_desc.data_type == data_type::u8; + auto maybe_deq = [&](input_data_t s) { + if (dequantize) + return (((float)s - data_shift) / data_scale); + else + return (float)s; + }; + auto firstit_states_d = memory_desc_wrapper(pd()->src_pd(1)); + if (firstit_states_) { + parallel_nd( + rnn.n_layer, rnn.n_dir, rnn.mb, [&](int lay, int dir, int b) { + for (int s = 0; s < rnn.sic; s++) + ws_states(lay + 1, dir, 0, b, s) = maybe_q( + firstit_states_[firstit_states_d.blk_off( + lay, dir, 0, b, s)]); + if (pd()->cell_kind() == alg_kind::vanilla_lstm) + for (int s = 0; s < rnn.sic; s++) + ws_c_states(lay + 1, dir, 0, b, s) = maybe_deq( + firstit_states_[firstit_states_d.blk_off( + lay, dir, 1, b, s)]); + }); + } else { + parallel_nd( + rnn.n_layer, rnn.n_dir, rnn.mb, [&](int lay, int dir, int b) { + for (int j = 0; j < rnn.sic; j++) { + ws_states(lay + 1, dir, 0, b, j) = (src_data_t)0; + ws_c_states(lay + 1, dir, 0, b, j) = 0.0f; + } + }); + } +} + +template <> +template +void ref_rnn_bwd_f32_t::copy_init_iter(const rnn_conf_t &rnn, + src_data_t *ws_states_, float *ws_c_states_, float *ws_diff_states_, + const input_data_t *firstit_states_, + const float *diff_dst_iter_) const { + AOC ws_diff_states(ws_diff_states_, rnn.n_layer + 1, rnn.n_dir, + rnn.n_states + 1, rnn.n_iter + 1, rnn.mb, 
rnn.states_ws_ld); + auto diff_dst_iter_d = memory_desc_wrapper(pd()->diff_dst_pd(1)); + if (diff_dst_iter_) { + parallel_nd(rnn.n_layer, rnn.n_dir, rnn.n_states, rnn.mb, + [&](int lay, int dir, int state, int b) { + array_copy(&(ws_diff_states( + lay, dir, state, rnn.n_iter, b, 0)), + diff_dst_iter_ + + diff_dst_iter_d.blk_off( + lay, dir, state, b), + rnn.dic); + }); + } else { + parallel_nd(rnn.n_layer, rnn.n_dir, rnn.n_states, rnn.mb, + [&](int lay, int dir, int state, int i) { + for (int j = 0; j < rnn.dic; j++) + ws_diff_states(lay, dir, state, rnn.n_iter, i, j) + = 0.0f; + }); + } +} + +template +template +void _ref_rnn_common_t::copy_res_layer( + const rnn_conf_t &rnn, dst_data_t *dst_layer_, float *diff_src_layer, + const src_data_t *ws_states_, const float *ws_diff_states_) const { + + auto dst_layer_d = memory_desc_wrapper(pd()->dst_pd(0)); + AOC ws_states(ws_states_, rnn.n_layer + 1, rnn.n_dir, + rnn.n_iter + 1, rnn.mb, rnn.states_ws_ld); + float shift = (pd()->attr()->rnn_data_qparams_.shift_); + float scale = (pd()->attr()->rnn_data_qparams_.scale_); + + const bool dequantize + = pd()->desc()->dst_layer_desc.data_type == data_type::f32 + && rnn.dt_conf != all_f32; + auto maybe_deq = [&](src_data_t s) { + if (dequantize) + return (dst_data_t)(((float)s - shift) / scale); + else + return (dst_data_t)s; + }; + parallel_nd(rnn.n_iter, rnn.mb, [&](int it, int b) { + int dir = 0; + if (rnn.exec_dir != r2l) { + for (int s = 0; s < rnn.dic; s++) { + dst_layer_[dst_layer_d.blk_off(it, b, dir * rnn.dic + s)] + = maybe_deq(ws_states(rnn.n_layer, dir, it + 1, b, s)); + } + dir = 1; + } + if (rnn.exec_dir != l2r) { + for (int s = 0; s < rnn.dic; s++) + switch (rnn.exec_dir) { + case bi_sum: + dst_layer_[dst_layer_d.blk_off(it, b, s)] + += maybe_deq(ws_states( + rnn.n_layer, dir, rnn.n_iter - it, b, s)); + break; + default: + dst_layer_[dst_layer_d.blk_off(it, b, dir * rnn.dic + s)] + = maybe_deq(ws_states( + rnn.n_layer, dir, rnn.n_iter - it, b, s)); + } + } + }); +} + +template <> +template +void ref_rnn_bwd_f32_t::copy_res_layer( + const rnn_conf_t &rnn, dst_data_t *dst_layer_, float *diff_src_layer_, + const src_data_t *ws_states_, const float *ws_diff_states_) const { + auto diff_src_layer_d = memory_desc_wrapper(pd()->diff_src_pd(0)); + AOC ws_diff_states(ws_diff_states_, rnn.n_layer + 1, + rnn.n_dir, rnn.n_states + 1, rnn.n_iter + 1, rnn.mb, + rnn.states_ws_ld); + + parallel_nd(rnn.n_iter, rnn.mb, [&](int it, int b) { + int dir = 0; + for (int s = 0; s < rnn.slc; s++) { + float *dst_addr = diff_src_layer_ + + diff_src_layer_d.blk_off( + (rnn.exec_dir == r2l) ? 
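+            // both directions contribute to dx in bidirectional runs: the
+            // second direction's diff is accumulated below with its time
+            // axis reversed (res += ...).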
rnn.n_iter - 1 - it : it, + b, dir * rnn.slc + s); + float res = ws_diff_states(0, 0, rnn.n_states, it, b, s); + if (rnn.n_dir - 1) + res += ws_diff_states( + 0, 1, rnn.n_states, rnn.n_iter - 1 - it, b, s); + dst_addr[0] = res; + } + }); +} + +template +template +void _ref_rnn_common_t::copy_res_iter( + const rnn_conf_t &rnn, output_data_t *dst_iter_, float *diff_src_iter_, + const src_data_t *ws_states_, float *ws_c_states_, + const float *ws_diff_states_) const { + auto dst_iter_d = memory_desc_wrapper(pd()->dst_pd(1)); + AOC ws_states(ws_states_, rnn.n_layer + 1, rnn.n_dir, + rnn.n_iter + 1, rnn.mb, rnn.states_ws_ld); + AOC ws_c_states(ws_c_states_, rnn.n_layer + 1, rnn.n_dir, + rnn.n_iter + 1, rnn.mb, rnn.states_ws_ld); + float data_shift = pd()->attr()->rnn_data_qparams_.shift_; + float data_scale = pd()->attr()->rnn_data_qparams_.scale_; + round_mode_t rmode = pd()->attr()->round_mode_; + + const bool quantize = pd()->desc()->dst_iter_desc.data_type == data_type::u8 + && rnn.dt_conf != all_f32; + auto maybe_q = [&](float f) { + if (quantize) { + float qf = f * data_scale + data_shift; + return qz_a1b0()(qf, rmode); + } else + return (output_data_t)f; + }; + + const bool dequantize + = pd()->desc()->dst_iter_desc.data_type == data_type::f32 + && rnn.dt_conf != all_f32; + auto maybe_deq = [&](src_data_t s) { + if (dequantize) + return (output_data_t)(((float)s - data_shift) / data_scale); + else + return (output_data_t)s; + }; + if (dst_iter_) { + parallel_nd(rnn.n_layer, rnn.n_dir, rnn.mb, + [&](int lay, int dir, int b) { + for (int s = 0; s < rnn.dic; s++) { + dst_iter_[dst_iter_d.blk_off(lay, dir, 0, b, s)] + = maybe_deq(ws_states(lay + 1, dir, rnn.n_iter, b, s)); + } + if (pd()->cell_kind() == alg_kind::vanilla_lstm) + for (int s = 0; s < rnn.dic; s++) { + dst_iter_[dst_iter_d.blk_off(lay, dir, 1, b, s)] + = maybe_q(ws_c_states( + lay + 1, dir, rnn.n_iter, b, s)); + } + }); + } +} + +template <> +template +void ref_rnn_bwd_f32_t::copy_res_iter( + const rnn_conf_t &rnn, output_data_t *dst_iter_, float *diff_src_iter_, + const src_data_t *ws_states_, float *ws_c_states_, + const float *ws_diff_states_) const { + auto diff_src_iter_d = memory_desc_wrapper(pd()->diff_src_pd(1)); + AOC ws_diff_states(ws_diff_states_, rnn.n_layer + 1, + rnn.n_dir, rnn.n_states + 1, rnn.n_iter + 1, rnn.mb, + rnn.states_ws_ld); + if (diff_src_iter_) { + parallel_nd(rnn.n_layer, rnn.n_dir, rnn.n_states, rnn.mb, + [&](int lay, int dir, int state, int b) { + for (int s = 0; s < rnn.sic; s++) { + diff_src_iter_[diff_src_iter_d.blk_off( + lay, dir, state, b, s)] + = ws_diff_states(lay, dir, state, 0, b, s); + } + }); + } +} + +template +rnn_bias_prepare_sig((_ref_rnn_common_t::bias_prepare)) { + /* Original set of bias provided by the user */ + AOC b( + b_, rnn.n_layer, rnn.n_dir, rnn.n_bias * rnn.dic); + /* Array of pointers initialized in packing */ + AOC bias(bias_, rnn.n_layer, rnn.n_dir, rnn.n_parts_bias); + AOC scratch_bias( + scratch_bias_, rnn.n_layer, rnn.n_dir, rnn.n_bias * rnn.dic); + + if (rnn.copy_bias) { + parallel_nd(rnn.n_layer * rnn.n_dir * rnn.n_bias * rnn.dic, + [&](size_t i) { scratch_bias_[i] = b_[i]; }); + } + + for (int i = 0; i < rnn.n_layer; i++) { + for (int d = 0; d < rnn.n_dir; d++) { + int offset_bias = 0; + for (int p = 0; p < rnn.n_parts_bias; p++) { + bias(i, d, p) = rnn.copy_bias + ? 
(float *) &scratch_bias(i, d, offset_bias) + : (float *) &b(i, d, offset_bias); + offset_bias += rnn.parts_bias[p] * rnn.dic; + } + } + } + +} + +template +rnn_bias_finalize_sig( + (_ref_rnn_common_t::bias_finalize)) { + if (rnn.dt_conf != all_f32) { + float data_shift = pd()->attr()->rnn_data_qparams_.shift_; + float data_scale = pd()->attr()->rnn_data_qparams_.scale_; + float *weights_scales = pd()->attr()->rnn_weights_qparams_.scales_; + bool scale_per_oc = pd()->attr()->rnn_weights_qparams_.mask_ != 0; + for (int i = 0; i < rnn.n_layer * rnn.n_dir; i++) + for (int j = 0; j < rnn.n_bias * rnn.dic; j++) { + size_t off = i * rnn.n_bias * rnn.dic + j; + float weights_scale + = scale_per_oc ? weights_scales[j] : weights_scales[0]; + scratch_bias_[off] -= (w_iter_comp[off] + w_layer_comp[off]) + * data_shift / (weights_scale * data_scale); + } + } +} + +template +rnn_weights_assign_sig((_ref_rnn_common_t::assign_packed_weights)) { + AOC weights(weights_, rnn.n_layer, rnn.n_dir, n_parts); + + size_t offset_packed = 0; + for (int l = 0; l < rnn.n_layer; l++) + for (int d = 0; d < rnn.n_dir; d++) { + for (int p = 0; p < n_parts; p++) { + weights(l, d, p) = (weights_data_t *)&w_[offset_packed]; + offset_packed + += part_weights_pack_size[p] / sizeof(weights_data_t); + } + } +} + +template +rnn_weights_assign_sig( + (_ref_rnn_common_t::assign_weights)) { + assert(nld * ld != 0); + /* Original set of weights provided by the user */ + AOC w(w_, rnn.n_layer, rnn.n_dir, nld * ld); + /* Array of pointers for each part of weights */ + AOC weights(weights_, rnn.n_layer, rnn.n_dir, n_parts); + + for (int i = 0; i < rnn.n_layer; i++) + for (int d = 0; d < rnn.n_dir; d++) { + size_t offset_weights = 0; + for (int p = 0; p < n_parts; p++) { + weights(i, d, p) = (weights_data_t *)&w(i, d, offset_weights); + offset_weights += fmt == memory_format::ldigo ? + gates_per_part[p] * OC_size : + gates_per_part[p] * OC_size * ld; + } + } +} + +//********************* Execution function *********************// +template +void _ref_rnn_common_t::execute_() const { + const rnn_conf_t &rnn = this->pd()->rnn_; + int input_idx = 0; + int output_idx = 0; + auto input = reinterpret_cast( + this->input_memory(input_idx++)); + auto states = pd()->with_src_iter() ? (this->input_memory(input_idx++)) : + nullptr; + + const char *layer_weights_n_comp = this->input_memory(input_idx++); + auto w_layer + = reinterpret_cast(layer_weights_n_comp); + auto w_layer_comp = reinterpret_cast(layer_weights_n_comp + + rnn.weights_layer_comp_offset); + const char *iter_weights_n_comp = this->input_memory(input_idx++); + auto w_iter + = reinterpret_cast(iter_weights_n_comp); + auto w_iter_comp = reinterpret_cast(iter_weights_n_comp + + rnn.weights_iter_comp_offset); + auto bias = pd()->with_bias() ? + reinterpret_cast(this->input_memory(input_idx++)) : + nullptr; + + auto dst_last_layer = rnn.is_fwd ? this->memory(output_idx++) : + this->input_memory(input_idx++); + auto dst_last_iter = pd()->with_dst_iter() + ? (rnn.is_fwd + ? this->memory(output_idx++) + : this->input_memory(input_idx++)) + : nullptr; + + auto diff_dst_layer = rnn.is_fwd ? + nullptr : + reinterpret_cast(this->input_memory(input_idx++)); + auto diff_dst_iter = rnn.is_fwd || !pd()->with_dst_iter() ? 
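+    // (rnn.use_workspace below picks the buffer source: presumably training
+    // runs keep these tensors in the user-visible workspace so backward can
+    // replay forward intermediates, otherwise the scratchpad is used)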
+            nullptr :
+            reinterpret_cast<const float *>(this->input_memory(input_idx++));
+
+    auto scratchpad = this->scratchpad();
+
+    auto ptr_wei_layer
+            = scratchpad.template get<weights_data_t *>(key_rnn_ptrs_wei_layer);
+    auto ptr_wei_iter
+            = scratchpad.template get<weights_data_t *>(key_rnn_ptrs_wei_iter);
+    auto ptr_bias =
+            scratchpad.template get<float *>(key_rnn_ptrs_bia);
+
+    // fetching buffers from the workspace
+    // if no workspace was provided we use the scratchpad
+    char *scratch_ptr = scratchpad.template get<char>(key_rnn_space);
+    char *ws_ptr = nullptr;
+    if (rnn.use_workspace)
+        ws_ptr = rnn.is_fwd
+                ? this->memory(output_idx++)
+                : const_cast<char *>(this->input_memory(input_idx++));
+    char *base_ptr = rnn.use_workspace ? ws_ptr : scratch_ptr;
+    acc_data_t *ws_gates = (acc_data_t *)(base_ptr + ws_gates_offset_);
+    src_data_t *ws_states = (src_data_t *)(base_ptr + ws_states_offset_);
+    float *ws_c_states = (float *)(base_ptr + ws_c_states_offset_);
+    float *ws_diff_states = (float *)(base_ptr + ws_diff_states_offset_);
+    float *ws_grid = (float *)(base_ptr + ws_grid_comp_offset_);
+    float *ws_cell = (float *)(base_ptr + ws_cell_comp_offset_);
+
+    auto diff_src_layer = rnn.is_fwd ?
+            nullptr :
+            reinterpret_cast<float *>(this->memory(output_idx++));
+    auto diff_src_iter = rnn.is_fwd || !pd()->with_src_iter() ?
+            nullptr :
+            reinterpret_cast<float *>(this->memory(output_idx++));
+    auto diff_weights_layer = rnn.is_fwd ?
+            nullptr :
+            reinterpret_cast<float *>(this->memory(output_idx++));
+    auto diff_weights_iter = rnn.is_fwd ?
+            nullptr :
+            reinterpret_cast<float *>(this->memory(output_idx++));
+    auto diff_bias = rnn.is_fwd || !pd()->with_bias() ?
+            nullptr :
+            reinterpret_cast<float *>(this->memory(output_idx++));
+
+    // Fetching extra buffers from scratchpad
+    float *ws_bias = (float *)(scratch_ptr + ws_bias_offset_);
+
+    // initialize diff_states to 0
+    if (aprop == prop_kind::backward)
+        array_set(ws_diff_states, 0.0f,
+                rnn.ws_diff_states_size / sizeof(float));
+
+    /* Pack (if using packed gemm API) or copy (if input arrays have bad
+     * leading dimension) */
+    (this->*bias_preparation_func)(rnn, ptr_bias, bias, ws_bias);
+
+    (this->*weights_iter_assign_func)(rnn, rnn.weights_iter_fmt,
+            rnn.weights_iter_nld, rnn.weights_iter_ld, rnn.dic,
+            rnn.sic, rnn.n_parts_weights_iter, rnn.parts_weights_iter,
+            rnn.part_weights_iter_pack_size, ptr_wei_iter, w_iter,
+            ptr_bias, bias, ws_bias);
+    (this->*weights_layer_assign_func)(rnn, rnn.weights_layer_fmt,
+            rnn.weights_layer_nld, rnn.weights_layer_ld, rnn.dic, rnn.slc,
+            rnn.n_parts_weights_layer, rnn.parts_weights_layer,
+            rnn.part_weights_layer_pack_size, ptr_wei_layer, w_layer, ptr_bias,
+            bias, ws_bias);
+
+    (this->*bias_finalization_func)(rnn, ws_bias, w_iter_comp, w_layer_comp);
+
+    // we first need to copy the initial states and input into ws
+    copy_init_layer(rnn, ws_states, ws_diff_states, input, diff_dst_layer);
+    if (rnn.dt_conf == f32u8f32u8 || rnn.dt_conf == f32u8f32f32
+            || rnn.dt_conf == all_f32)
+        copy_init_iter(rnn, ws_states, ws_c_states, ws_diff_states,
+                (const float *)states, diff_dst_iter);
+    else if (rnn.dt_conf == u8u8u8u8 || rnn.dt_conf == u8u8u8f32)
+        copy_init_iter(rnn, ws_states, ws_c_states, ws_diff_states,
+                (const uint8_t *)states, diff_dst_iter);
+    else
+        assert(!"unimplemented");
+
+    // run the execution on the grid
+    (this->*grid_computation)(rnn, ptr_wei_layer, ptr_wei_iter, ptr_bias,
+            ws_states, ws_c_states, ws_diff_states, ws_gates, ws_cell, ws_grid,
+            diff_weights_layer, diff_weights_iter, diff_bias);
+
+    // Finally we copy the results to the result buffers
+    if (rnn.dt_conf == u8u8u8f32 || rnn.dt_conf == f32u8f32f32
+            || rnn.dt_conf == all_f32)
+        copy_res_layer(rnn, (float *)dst_last_layer, diff_src_layer, ws_states,
+                ws_diff_states);
+    else if (rnn.dt_conf == u8u8u8u8 || rnn.dt_conf == f32u8f32u8)
+        copy_res_layer(rnn, (uint8_t *)dst_last_layer, diff_src_layer,
+                ws_states, ws_diff_states);
+    else
+        assert(!"unimplemented");
+
+    if (rnn.dt_conf == f32u8f32u8 || rnn.dt_conf == f32u8f32f32
+            || rnn.dt_conf == all_f32)
+        copy_res_iter(rnn, (float *)dst_last_iter, diff_src_iter, ws_states,
+                ws_c_states, ws_diff_states);
+    else if (rnn.dt_conf == u8u8u8u8 || rnn.dt_conf == u8u8u8f32)
+        copy_res_iter(rnn, (uint8_t *)dst_last_iter, diff_src_iter, ws_states,
+                ws_c_states, ws_diff_states);
+    else
+        assert(!"unimplemented");
+};
+
+/* Fix for MSVS warning C4661 */
+template<> rnn_cell_execution_sig(ref_rnn_fwd_f32_t::cell_execution);
+template<> rnn_cell_execution_sig(ref_rnn_fwd_u8s8_t::cell_execution);
+template<> rnn_cell_execution_sig(ref_rnn_bwd_f32_t::cell_execution);
+template<> rnn_cell_execution_sig(ref_rnn_fwd_f32_t::cell_execution_gru);
+template<> rnn_cell_execution_sig(ref_rnn_fwd_u8s8_t::cell_execution_gru);
+template<> rnn_cell_execution_sig(ref_rnn_bwd_f32_t::cell_execution_gru);
+template<> rnn_cell_execution_sig(ref_rnn_fwd_f32_t::cell_execution_gru_lbr);
+template<> rnn_cell_execution_sig(ref_rnn_fwd_u8s8_t::cell_execution_gru_lbr);
+template<> rnn_cell_execution_sig(ref_rnn_bwd_f32_t::cell_execution_gru_lbr);
+template<> rnn_elemwise_sig(ref_rnn_fwd_f32_t::rnn_elemwise);
+template<> rnn_elemwise_sig(ref_rnn_fwd_u8s8_t::rnn_elemwise);
+template<> rnn_elemwise_sig(ref_rnn_bwd_f32_t::rnn_elemwise);
+template<> rnn_elemwise_sig(ref_rnn_fwd_f32_t::lstm_elemwise);
+template<> rnn_elemwise_sig(ref_rnn_fwd_u8s8_t::lstm_elemwise);
+template<> rnn_elemwise_sig(ref_rnn_bwd_f32_t::lstm_elemwise);
+template<> rnn_elemwise_sig(ref_rnn_fwd_f32_t::gru_lbr_elemwise);
+template<> rnn_elemwise_sig(ref_rnn_fwd_u8s8_t::gru_lbr_elemwise);
+template<> rnn_elemwise_sig(ref_rnn_bwd_f32_t::gru_lbr_elemwise);
+
+template struct _ref_rnn_common_t<prop_kind::forward, data_type::f32,
+        data_type::f32>;
+template struct _ref_rnn_common_t<prop_kind::forward, data_type::u8,
+        data_type::s8>;
+template struct _ref_rnn_common_t<prop_kind::backward, data_type::f32,
+        data_type::f32>;
+
+#undef AOC
+}
+}
+}
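The buffer setup in execute_() above performs no per-tensor allocation: rnn_utils::set_offsets() lays every workspace tensor out in a single page-aligned arena, and ws_gates/ws_states/... are just typed views at precomputed byte offsets from one base pointer (the user-provided workspace when training, the scratchpad otherwise). A minimal standalone sketch of that pattern follows; the names and sizes are illustrative, not from the patch:

#include <cstddef>
#include <vector>

constexpr std::size_t kPage = 4096;
constexpr std::size_t round_up(std::size_t x, std::size_t a) {
    return (x + a - 1) / a * a;
}

int main() {
    // Two regions carved from one arena; each starts on a page boundary,
    // mirroring ws_gates_offset_ / ws_states_offset_ above.
    std::size_t gates_bytes = 1000, states_bytes = 2000;
    std::size_t gates_off = 0;
    std::size_t states_off = round_up(gates_off + gates_bytes, kPage);
    std::vector<char> arena(states_off + states_bytes);

    char *base = arena.data();
    float *ws_gates = reinterpret_cast<float *>(base + gates_off);
    float *ws_states = reinterpret_cast<float *>(base + states_off);
    ws_gates[0] = 0.f;
    ws_states[0] = 0.f; // both views share the single allocation
    return 0;
}

The payoff of this layout is a single allocation (or none at all, when the caller passes in a workspace) and a page-aligned start for every tensor.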
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/ref_rnn.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/ref_rnn.hpp
new file mode 100644
index 0000000..c213b41
--- /dev/null
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/ref_rnn.hpp
@@ -0,0 +1,335 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_REF_RNN_HPP
+#define CPU_REF_RNN_HPP
+
+#include <assert.h>
+
+#include "c_types_map.hpp"
+#include "memory_tracking.hpp"
+#include "type_helpers.hpp"
+#include "utils.hpp"
+
+#include "../cpu_isa_traits.hpp"
+#include "../gemm/os_blas.hpp"
+
+#include "cpu_rnn_pd.hpp"
+#include "rnn_utils.hpp"
+#include "jit_uni_rnn_postgemm.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+template <alg_kind_t alg_kind, prop_kind_t prop_kind>
+float activation(float s, float alpha, float cliping, float dd);
+
+template <prop_kind_t aprop, impl::data_type_t src_type,
+        impl::data_type_t weights_type>
+struct _ref_rnn_common_t : public cpu_primitive_t {
+    typedef typename prec_traits<src_type>::type src_data_t;
+    typedef typename prec_traits<weights_type>::type weights_data_t;
+    typedef typename utils::conditional<src_type == data_type::u8, int32_t,
+            float>::type acc_data_t;
+
+    using class_name = _ref_rnn_common_t<aprop, src_type, weights_type>;
+
+    typedef rnn_elemwise_sig((class_name::*elemwise_f));
+    typedef rnn_cell_execution_sig((class_name::*cell_execution_f));
+    typedef rnn_grid_execution_sig((class_name::*grid_execution_f));
+
+    typedef rnn_gemm_sig((class_name::*gemm_t));
+    typedef rnn_bias_prepare_sig((class_name::*bias_prepare_t));
+    typedef rnn_bias_finalize_sig((class_name::*bias_finalize_t));
+    typedef rnn_weights_assign_sig((class_name::*weights_assign_t));
+
+    using base_pd_t =
+            typename utils::conditional<aprop == prop_kind::forward,
+                    cpu_rnn_fwd_pd_t, cpu_rnn_bwd_pd_t>::type;
+
+    struct pd_t : public base_pd_t {
+        pd_t(engine_t *engine, const rnn_desc_t *adesc,
+                const primitive_attr_t *attr,
+                const typename pd_t::hint_class *hint_pd)
+            : base_pd_t(engine, adesc, attr, hint_pd) {}
+
+        DECLARE_COMMON_PD_T("ref:any", class_name);
+
+        status_t init() {
+            using namespace prop_kind;
+            using namespace utils;
+            using namespace memory_format;
+            using namespace rnn_utils;
+            assert(this->engine()->kind() == engine_kind::cpu);
+            const alg_kind_t cell_kind = this->desc()->cell_desc.cell_kind;
+
+            data_type_t src_layer_dt = this->desc()->src_layer_desc.data_type;
+            data_type_t weights_iter_dt
+                    = this->desc()->weights_iter_desc.data_type;
+            data_type_t weights_layer_dt
+                    = this->desc()->weights_layer_desc.data_type;
+
+            bool ok = true
+                    && one_of(cell_kind, alg_kind::vanilla_rnn,
+                               alg_kind::vanilla_lstm, alg_kind::vanilla_gru,
+                               alg_kind::gru_linear_before_reset)
+                    && IMPLICATION(aprop == prop_kind::forward,
+                               one_of(this->desc()->prop_kind, forward_training,
+                                       forward_inference))
+                    && IMPLICATION(aprop == backward,
+                               one_of(this->desc()->prop_kind, backward))
+                    && src_layer_dt == src_type
+                    && everyone_is(
+                               weights_type, weights_iter_dt, weights_layer_dt)
+                    && this->set_default_params() == status::success
+                    && this->with_bias();
+            if (!ok)
+                return status::unimplemented;
+
+            init_conf(rnn_, *this->desc(), this->src_pd(0), this->src_pd(1),
+                    this->weights_pd(0), this->weights_pd(1), this->dst_pd(0));
+
+            if (rnn_.dt_conf == all_f32)
+                ok = ok && this->attr()->has_default_values();
+
+            // Set weights descriptors to desired format
+            memory_desc_t weights_layer_md = *(this->weights_layer_pd_.desc());
+            CHECK(set_expected_desc(rnn_, weights_layer_md, false));
+            cpu_memory_t::pd_t new_weights_layer_pd(
+                    this->engine_, &weights_layer_md);
+            if (this->weights_layer_pd_.desc()->format == any) {
+                this->weights_layer_pd_ = new_weights_layer_pd;
+            } else if (this->weights_layer_pd_.desc()->format == rnn_packed) {
+                if (!this->weights_layer_pd_.is_equal(&new_weights_layer_pd))
+                    return status::unimplemented;
+            }
+
+            memory_desc_t weights_iter_md = *(this->weights_iter_pd_.desc());
+            CHECK(set_expected_desc(rnn_, weights_iter_md, true));
+            cpu_memory_t::pd_t new_weights_iter_pd(
+                    this->engine_, &weights_iter_md);
+            if (this->weights_iter_pd_.desc()->format == any) {
+                this->weights_iter_pd_ = new_weights_iter_pd;
+            } else if (this->weights_iter_pd_.desc()->format == rnn_packed) {
+                if (!this->weights_iter_pd_.is_equal(&new_weights_iter_pd))
+                    return status::unimplemented;
+            }
+
+            CHECK(this->check_layout_consistency());
+
+            set_conf(rnn_, *this->desc(), this->weights_pd(0),
+                    this->weights_pd(1), this->diff_weights_pd(0),
+                    this->diff_weights_pd(1));
+
+            size_t scratchpad_sz{0}, ws_sz{0};
+            get_scratchpad_and_workspace_sizes(rnn_, scratchpad_sz, ws_sz);
+
+            // initialize the workspace_pd if needed
+            if (rnn_.is_training) {
+                dims_t ws_dims = {(int)ws_sz};
+                memory_desc_t ws_d;
+                mkldnn_memory_desc_init(&ws_d, 1, ws_dims, data_type::u8, x);
+                this->ws_pd_ = cpu_memory_t::pd_t(this->engine(), &ws_d);
+            }
+
+            init_scratchpad(scratchpad_sz);
+
+            return status::success;
+        }
+
+        rnn_utils::rnn_conf_t rnn_;
+
+    private:
+        void init_scratchpad(size_t scratchpad_sz) {
+            using namespace memory_tracking::names;
+            auto scratchpad = this->scratchpad_registry().registrar();
+            scratchpad.book(key_rnn_space, sizeof(float) * scratchpad_sz, 4096);
+
+            int max_nparts = this->cell_kind() == alg_kind::vanilla_gru ? 2 : 1;
+            int ptr_wei_sz = rnn_.n_layer * rnn_.n_dir * max_nparts;
+            scratchpad.book(key_rnn_ptrs_wei_layer,
+                    sizeof(float *) * ptr_wei_sz);
+            scratchpad.book(key_rnn_ptrs_wei_iter,
+                    sizeof(float *) * ptr_wei_sz);
+            scratchpad.book(key_rnn_ptrs_bia,
+                    sizeof(float *) * ptr_wei_sz);
+        }
+    };
+
+    _ref_rnn_common_t(const pd_t *apd, const input_vector &inputs,
+            const output_vector &outputs)
+        : cpu_primitive_t(apd, inputs, outputs, true), rnn_postgemm_(nullptr) {
+        /// @todo set max_feature_size assuming that we limit the number of
+        /// iterations and layer to one if slc != dic and sic != dic
+        /// respectively
+
+        bias_preparation_func = &class_name::bias_prepare;
+        bias_finalization_func = &class_name::bias_finalize;
+
+        auto set_gemm_funcs
+                = [](bool packed_gemm, gemm_t &g, weights_assign_t &a) {
+                      if (packed_gemm) {
+                          g = &class_name::packed_gemm;
+                          a = &class_name::assign_packed_weights;
+                      } else {
+                          g = &class_name::gemm;
+                          a = &class_name::assign_weights;
+                      }
+                  };
+        set_gemm_funcs(pd()->rnn_.use_iter_packed_gemm, gemm_iter_func,
+                weights_iter_assign_func);
+
+        set_gemm_funcs(pd()->rnn_.use_layer_packed_gemm, gemm_layer_func,
+                weights_layer_assign_func);
+
+        switch (pd()->cell_kind()) {
+        case alg_kind::vanilla_lstm:
+            cell_func = &class_name::cell_execution;
+            if (aprop == prop_kind::forward) {
+                if (mayiuse(avx512_core))
+                    rnn_postgemm_ = new jit_uni_lstm_postgemm_kernel_fwd<
+                            avx512_core, src_type>(pd()->rnn_, pd()->attr());
+                else if (mayiuse(avx2))
+                    rnn_postgemm_ = new jit_uni_lstm_postgemm_kernel_fwd<
+                            avx2, src_type>(pd()->rnn_, pd()->attr());
+                else if (mayiuse(sse42))
+                    rnn_postgemm_ = new jit_uni_lstm_postgemm_kernel_fwd<
+                            sse42, src_type>(pd()->rnn_, pd()->attr());
+                assert(rnn_postgemm_ != nullptr);
+                rnn_postgemm_->init();
+            }
+            elemwise_func = &class_name::lstm_elemwise;
+            break;
+        case alg_kind::vanilla_rnn: // @todo switch on cell kind
+            cell_func = &class_name::cell_execution;
+            elemwise_func = &class_name::rnn_elemwise;
+            switch (pd()->activation_kind()) {
+            case alg_kind::eltwise_relu:
+                activation_func = &activation<alg_kind::eltwise_relu, aprop>;
+                break;
+            case alg_kind::eltwise_tanh:
+                activation_func = &activation<alg_kind::eltwise_tanh, aprop>;
+                break;
+            case alg_kind::eltwise_logistic:
+                activation_func
+                        = &activation<alg_kind::eltwise_logistic, aprop>;
+                break;
+            default: break;
+            }
+            break;
+        case alg_kind::vanilla_gru:
+            cell_func = &class_name::cell_execution_gru;
+            break;
+        case alg_kind::gru_linear_before_reset:
+            cell_func = &class_name::cell_execution_gru_lbr;
+            elemwise_func = &class_name::gru_lbr_elemwise;
+            break;
+        default: break;
+        }
+
+        grid_computation = &class_name::linear_execution;
+
+        size_t scratchpad_size, workspace_size;
+        rnn_utils::set_offsets(pd()->rnn_, ws_gates_offset_, ws_states_offset_,
+                ws_c_states_offset_, ws_diff_states_offset_,
+                ws_grid_comp_offset_, ws_cell_comp_offset_,
+                ws_bias_offset_, scratchpad_size, workspace_size);
+    }
+
+    ~_ref_rnn_common_t() {}
+
+    // typedef typename prec_traits<src_type>::type data_t;
+
+    virtual void execute(event_t *e) const {
+        execute_();
+        e->set_state(event_t::ready);
+    }
+
+private:
+    void execute_() const;
+    rnn_grid_execution_sig(linear_execution);
+    rnn_cell_execution_sig(cell_execution);
+    rnn_cell_execution_sig(cell_execution_gru);
+    rnn_cell_execution_sig(cell_execution_gru_lbr);
+    rnn_elemwise_sig(rnn_elemwise);
+    rnn_elemwise_sig(lstm_elemwise);
+    rnn_elemwise_sig(gru_lbr_elemwise);
+    rnn_gemm_sig(gemm);
+    rnn_gemm_sig(packed_gemm);
+    rnn_bias_prepare_sig(bias_prepare);
+    rnn_bias_finalize_sig(bias_finalize);
+    rnn_weights_assign_sig(assign_weights);
+    rnn_weights_assign_sig(assign_packed_weights);
+
+    float (*activation_func)(float dd, float s, float alpha, float cliping);
+
+    void copy_init_layer(const rnn_utils::rnn_conf_t &rnn,
+            src_data_t *ws_states_, float *ws_diff_states_,
+            const src_data_t *xt_, const float *diff_dst_layer) const;
+
+    template <typename input_data_t>
+    void copy_init_iter(const rnn_utils::rnn_conf_t &rnn,
+            src_data_t *ws_states_, float *ws_c_states, float *ws_diff_states_,
+            const input_data_t *firstit_states_,
+            const float *diff_dst_iter) const;
+
+    template <typename dst_data_t>
+    void copy_res_layer(const rnn_utils::rnn_conf_t &rnn,
+            dst_data_t *dst_layer_, float *diff_src_layer,
+            const src_data_t *ws_states_, const float *ws_diff_states_) const;
+
+    template <typename output_data_t>
+    void copy_res_iter(const rnn_utils::rnn_conf_t &rnn,
+            output_data_t *dst_iter_, float *diff_src_iter,
+            const src_data_t *ws_states_, float *ws_c_states,
+            const float *ws_diff_states_) const;
+
+    void gates_reduction(const rnn_utils::rnn_conf_t &rnn,
+            const acc_data_t *ws_gates_, float *diff_bias_) const;
+
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); }
+
+    size_t ws_gates_offset_;
+    size_t ws_states_offset_;
+    size_t ws_c_states_offset_;
+    size_t ws_bias_offset_;
+    size_t ws_diff_states_offset_;
+    size_t ws_grid_comp_offset_;
+    size_t ws_cell_comp_offset_;
+    jit_uni_rnn_postgemm_kernel *rnn_postgemm_;
+
+    grid_execution_f grid_computation;
+    cell_execution_f cell_func;
+
+    bias_prepare_t bias_preparation_func;
+    bias_finalize_t bias_finalization_func;
+    weights_assign_t weights_layer_assign_func;
+    weights_assign_t weights_iter_assign_func;
+
+    gemm_t gemm_layer_func;
+    gemm_t gemm_iter_func;
+    elemwise_f elemwise_func;
+};
+
+using ref_rnn_fwd_f32_t = _ref_rnn_common_t<prop_kind::forward,
+        data_type::f32, data_type::f32>;
+using ref_rnn_bwd_f32_t = _ref_rnn_common_t<prop_kind::backward,
+        data_type::f32, data_type::f32>;
+using ref_rnn_fwd_u8s8_t = _ref_rnn_common_t<prop_kind::forward,
+        data_type::u8, data_type::s8>;
+}
+}
+}
+#endif
+
+// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
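ref_rnn.hpp picks its strategies once, in the constructor: cell_func, elemwise_func, gemm_layer_func and friends are pointer-to-member typedefs (cell_execution_f, gemm_t, ...) that execute_() later invokes as (this->*func)(...). A reduced sketch of this dispatch style, with made-up names:

#include <cstdio>

struct rnn_kernel {
    // One member-pointer type per strategy family, as with gemm_t above.
    typedef void (rnn_kernel::*gemm_fn)(int m, int n, int k) const;

    explicit rnn_kernel(bool use_packed)
        : gemm_func_(use_packed ? &rnn_kernel::packed_gemm
                                : &rnn_kernel::plain_gemm) {}

    // The hot path has a single indirect call and no format/ISA branching.
    void execute() const { (this->*gemm_func_)(64, 64, 64); }

private:
    void plain_gemm(int m, int n, int k) const {
        std::printf("plain gemm %dx%dx%d\n", m, n, k);
    }
    void packed_gemm(int m, int n, int k) const {
        std::printf("packed gemm %dx%dx%d\n", m, n, k);
    }
    gemm_fn gemm_func_;
};

int main() { rnn_kernel(true).execute(); return 0; }

Resolving the strategy at primitive-construction time keeps the per-iteration code free of repeated format and ISA checks.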
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/rnn_reorders.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/rnn_reorders.hpp
new file mode 100644
index 0000000..91dd85a
--- /dev/null
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/rnn_reorders.hpp
@@ -0,0 +1,396 @@
+/*******************************************************************************
+ * Copyright 2018 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *******************************************************************************/
+
+#ifndef CPU_RNN_REORDERS_HPP
+#define CPU_RNN_REORDERS_HPP
+
+#include <assert.h>
+
+#include "type_helpers.hpp"
+#include "mkldnn_thread.hpp"
+#include "utils.hpp"
+#include "simple_q10n.hpp"
+#include "cpu_reorder_pd.hpp"
+#include "../gemm/os_blas.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+template <data_type_t type_i, data_type_t type_o>
+struct rnn_data_reorder_t : public cpu_primitive_t {
+    struct pd_t : public cpu_reorder_pd_t {
+        pd_t(const cpu_memory_pd_t *input_pd, const cpu_memory_pd_t *output_pd,
+                const primitive_attr_t *attr)
+            : cpu_reorder_pd_t(input_pd, output_pd, attr) {}
+
+        DECLARE_COMMON_PD_T("rnn_data_reorder", rnn_data_reorder_t);
+
+        static status_t create(reorder_pd_t **reorder_pd,
+                const memory_pd_t *input_pd, const memory_pd_t *output_pd,
+                const primitive_attr_t *attr) {
+            using namespace memory_format;
+            using namespace data_type;
+            assert(input_pd->engine()->kind() == engine_kind::cpu);
+            assert(output_pd->engine()->kind() == engine_kind::cpu);
+
+            const memory_desc_wrapper id(input_pd), od(output_pd);
+            bool args_ok = true
+                    && id.data_type() == type_i
+                    && od.data_type() == type_o
+                    && utils::one_of(id.format(), tnc, ldsnc)
+                    && od.format() == id.format();
+            if (!args_ok) return status::invalid_arguments;
+
+            auto _pd = new pd_t((const cpu_memory_pd_t *)input_pd,
+                    (const cpu_memory_pd_t *)output_pd, attr);
+            if (_pd == nullptr) return out_of_memory;
+            if (_pd->init() != success) { delete _pd; return unimplemented; }
+            return safe_ptr_assign<reorder_pd_t>(*reorder_pd, _pd);
+        }
+    };
+
+private:
+    typedef typename prec_traits<type_i>::type in_data_t;
+    typedef typename prec_traits<type_o>::type out_data_t;
+
+    rnn_data_reorder_t(const pd_t *apd, const input_vector &inputs,
+            const output_vector &outputs)
+        : cpu_primitive_t(apd, inputs, outputs) {}
+
+    virtual void execute(event_t *e) const {
+        auto input = reinterpret_cast<const in_data_t *>(input_memory(0));
+        auto output = reinterpret_cast<out_data_t *>(memory());
+        const memory_desc_wrapper &input_d = pd()->input_pd();
+        const memory_desc_wrapper &output_d = pd()->output_pd();
+        const round_mode_t rmode = pd()->attr()->round_mode_;
+        const size_t nelems = input_d.nelems();
+        const float scale = pd()->attr()->rnn_data_qparams_.scale_;
+        const float shift = pd()->attr()->rnn_data_qparams_.shift_;
+
+        parallel_nd(nelems, [&](size_t i) {
+            float in = (float)input[input_d.off_l(i)] * scale + shift;
+            output[output_d.off_l(i)] = qz_a1b0<float, out_data_t>()(in, rmode);
+        });
+
+        e->set_state(event_t::ready);
+    }
+
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); }
+};
+
+template <data_type_t type_i, data_type_t type_o>
+struct rnn_weights_reorder_t : public cpu_primitive_t {
+    struct pd_t : public cpu_reorder_pd_t {
+        pd_t(const cpu_memory_pd_t *input_pd, const cpu_memory_pd_t *output_pd,
+                const primitive_attr_t *attr)
+            : cpu_reorder_pd_t(input_pd, output_pd, attr) {}
+
+        DECLARE_COMMON_PD_T("rnn_weights_reorder", rnn_weights_reorder_t);
+
+        static status_t create(reorder_pd_t **reorder_pd,
+                const memory_pd_t *input_pd, const memory_pd_t *output_pd,
+                const primitive_attr_t *attr) {
+#if !USE_MKL_PACKED_GEMM
+            return status::unimplemented;
+#endif
+            using namespace memory_format;
+            assert(input_pd->engine()->kind() == engine_kind::cpu);
+            assert(output_pd->engine()->kind() == engine_kind::cpu);
+            const memory_desc_wrapper output_d(output_pd);
+
+            const memory_desc_wrapper id(input_pd), od(output_pd);
+            bool args_ok = true
+                    && id.data_type() == type_i
+                    && od.data_type() == type_o
+                    && utils::one_of(id.format(), ldigo, ldgoi)
+                    && od.format() == rnn_packed
+                    && od.rnn_packed_desc().format == mkldnn_ldigo_p
+                    && od.rnn_packed_desc().n_parts == 1
+                    && attr != nullptr;
+            if (!args_ok) return status::invalid_arguments;
+
+            const int mask = attr->rnn_weights_qparams_.mask_;
+            if (!utils::one_of(mask, 0, 3)) return status::unimplemented;
+
+            auto _pd = new pd_t((const cpu_memory_pd_t *)input_pd,
+                    (const cpu_memory_pd_t *)output_pd, attr);
+            if (_pd == nullptr) return out_of_memory;
+            if (_pd->init() != success) { delete _pd; return unimplemented; }
+            return safe_ptr_assign<reorder_pd_t>(*reorder_pd, _pd);
+        }
+
+        virtual status_t init() override {
+            status_t status = cpu_reorder_pd_t::init();
+            if (status != status::success) return status;
+
+            init_scratchpad();
+
+            return status::success;
+        }
+
+    private:
+        void init_scratchpad() {
+            const memory_desc_wrapper id(input_pd());
+            const size_t nelems = id.nelems();
+            const auto &dims = id.dims();
+
+            using namespace memory_tracking::names;
+            auto scratchpad = scratchpad_registry().registrar();
+            size_t quantization_size = sizeof(int8_t) * nelems;
+            size_t reduction_size = id.format() == ldigo
+                    ? sizeof(int32_t) * mkldnn_get_max_threads() * dims[0]
+                            * dims[1] * dims[3] * dims[4]
+                    : 0;
+            scratchpad.book(
+                    key_reorder_rnn_weights_quantization, quantization_size);
+            scratchpad.book(key_reorder_rnn_weights_reduction, reduction_size);
+        }
+    };
+
+private:
+    typedef typename prec_traits<type_i>::type in_data_t;
+    typedef typename prec_traits<type_o>::type out_data_t;
+
+    rnn_weights_reorder_t(const pd_t *apd, const input_vector &inputs,
+            const output_vector &outputs)
+        : cpu_primitive_t(apd, inputs, outputs) {}
+
+    virtual void execute(event_t *e) const {
+#if USE_MKL_PACKED_GEMM
+        auto input = reinterpret_cast<const in_data_t *>(input_memory(0));
+        auto output = reinterpret_cast<char *>(memory());
+        const memory_desc_wrapper &input_d = pd()->input_pd();
+        const memory_desc_wrapper &output_d = pd()->output_pd();
+        const auto &dims = input_d.dims();
+
+        const int L = dims[0];
+        const int D = dims[1];
+        const int I = dims[2];
+        const int G = dims[3];
+        const int O = dims[4];
+
+        const bool is_igo = input_d.format() == memory_format::ldigo;
+
+        /* Quantize input & compute compensation */
+        auto quantized = (int8_t * __restrict)scratchpad().template get<void>(
+                memory_tracking::names::key_reorder_rnn_weights_quantization);
+        auto reduction = (int32_t * __restrict)scratchpad().template get<void>(
+                memory_tracking::names::key_reorder_rnn_weights_reduction);
+        float *comp = reinterpret_cast<float *>(
+                output + output_d.rnn_packed_desc().offset_compensation);
+        const round_mode_t rmode = pd()->attr()->round_mode_;
+        const float *scales = pd()->attr()->rnn_weights_qparams_.scales_;
+        const int mask = pd()->attr()->rnn_weights_qparams_.mask_;
+
+        if (is_igo) {
+            int nthr = mkldnn_get_max_threads();
+            int LD_nthr = nstl::min(L * D, nthr);
+            int I_nthr = nstl::min(I, nthr / LD_nthr);
+            parallel(nthr, [&](const int ithr, const int nthr) {
+                int LD_ithr = -1, LD_s = -1, LD_e = -1;
+                int I_ithr = -1, I_s = -1, I_e = -1;
+                if (ithr < LD_nthr * I_nthr) {
+                    LD_ithr = ithr % LD_nthr;
+                    I_ithr = ithr / LD_nthr;
+                    balance211(L * D, LD_nthr, LD_ithr, LD_s, LD_e);
+                    balance211(I, I_nthr, I_ithr, I_s, I_e);
+                }
+                int32_t *comp_ithr = reduction + I_ithr * L * D * G * O;
+                for (int ld = LD_s; ld < LD_e; ld++) {
+                    for (int go = 0; go < G * O; go++)
+                        comp_ithr[ld * G * O + go] = 0;
+                    for (int i = I_s; i < I_e; i++) {
+                        PRAGMA_OMP_SIMD()
+                        for (int go = 0; go < G * O; go++) {
+                            const float s = scales[(mask == 0) ? 0 : go];
+                            int8_t q = qz_b0<in_data_t, int8_t>()(
+                                    input[ld * I * G * O + i * G * O + go], s,
+                                    rmode);
+                            quantized[ld * I * G * O + i * G * O + go]
+                                    = (int32_t)q;
+                            comp_ithr[ld * G * O + go] += (int32_t)q;
+                        }
+                    }
+                }
+            });
+            parallel_nd(L * D * G * O,
+                    [&](int s) { comp[s] = saturate<float>(reduction[s]); });
+            for (int i = 1; i < I_nthr; i++) {
+                parallel_nd(L * D * G * O, [&](int s) {
+                    comp[s] += saturate<float>(
+                            reduction[i * L * D * G * O + s]);
+                });
+            }
+        } else {
+            parallel_nd(L * D, G * O, [&](int ld, int go) {
+                int32_t compensation = 0;
+                const float s = scales[(mask == 0) ? 0 : go];
+                PRAGMA_OMP_SIMD()
+                for (int i = 0; i < I; i++) {
+                    int8_t q = qz_b0<in_data_t, int8_t>()(
+                            input[ld * G * O * I + go * I + i], s, rmode);
+                    compensation += (int32_t)q;
+                    quantized[ld * G * O * I + go * I + i] = q;
+                }
+                comp[ld * G * O + go] = saturate<float>(compensation);
+            });
+        }
+
+        /* Pack */
+        auto off_igo = [&](int l, int d, int i, int g, int o) {
+            return l * D * I * G * O + d * I * G * O + i * G * O + g * O + o;
+        };
+        auto off_goi = [&](int l, int d, int i, int g, int o) {
+            return l * D * G * O * I + d * G * O * I + g * O * I + o * I + i;
+        };
+        int n_parts = output_d.rnn_packed_desc().n_parts;
+        const size_t *size_packed_cell
+                = output_d.rnn_packed_desc().part_pack_size;
+        const int *parts = output_d.rnn_packed_desc().parts;
+        const int n = output_d.rnn_packed_desc().n;
+        char *to_pack = output;
+        for (int l = 0; l < L; l++) {
+            for (int d = 0; d < D; d++) {
+                for (int p = 0; p < n_parts; p++) {
+                    int g = (p > 0) ? parts[p - 1] : 0;
+                    int m_p = parts[p] * O;
+                    int k_p = I;
+                    cblas_gemm_s8u8s32_pack(CblasColMajor, CblasAMatrix,
+                            is_igo ? CblasNoTrans : CblasTrans, m_p, n, k_p,
+                            &quantized[is_igo ? off_igo(l, d, 0, g, 0) :
+                                                off_goi(l, d, g, 0, 0)],
+                            is_igo ? G * O : I, to_pack);
+                    to_pack += size_packed_cell[p];
+                }
+            }
+        }
+#endif
+        e->set_state(event_t::ready);
+    }
+
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); }
+};
+
+template <>
+struct rnn_weights_reorder_t<data_type::f32, data_type::f32>
+        : public cpu_primitive_t {
+    struct pd_t : public cpu_reorder_pd_t {
+        pd_t(const cpu_memory_pd_t *input_pd, const cpu_memory_pd_t *output_pd,
+                const primitive_attr_t *attr)
+            : cpu_reorder_pd_t(input_pd, output_pd, attr) {}
+
+        DECLARE_COMMON_PD_T("rnn_weights_reorder", rnn_weights_reorder_t);
+
+        static status_t create(reorder_pd_t **reorder_pd,
+                const memory_pd_t *input_pd, const memory_pd_t *output_pd,
+                const primitive_attr_t *attr) {
+#if !USE_MKL_PACKED_GEMM
+            return status::unimplemented;
+#endif
+            using namespace memory_format;
+            using namespace data_type;
+            assert(input_pd->engine()->kind() == engine_kind::cpu);
+            assert(output_pd->engine()->kind() == engine_kind::cpu);
+            const memory_desc_wrapper output_d(output_pd);
+
+            const memory_desc_wrapper id(input_pd), od(output_pd);
+            bool args_ok = true
+                    && id.data_type() == f32
+                    && od.data_type() == f32
+                    && utils::one_of(id.format(), ldigo, ldgoi)
+                    && od.format() == rnn_packed
+                    && utils::one_of(od.rnn_packed_desc().format,
+                               mkldnn_ldigo_p, mkldnn_ldgoi_p)
+                    && attr->has_default_values();
+            if (!args_ok) return status::invalid_arguments;
+
+            const int mask = attr->rnn_weights_qparams_.mask_;
+            if (!utils::one_of(mask, 0, 3)) return status::unimplemented;
+
+            auto _pd = new pd_t((const cpu_memory_pd_t *)input_pd,
+                    (const cpu_memory_pd_t *)output_pd, attr);
+            if (_pd == nullptr) return out_of_memory;
+            if (_pd->init() != success) { delete _pd; return unimplemented; }
+            return safe_ptr_assign<reorder_pd_t>(*reorder_pd, _pd);
+        }
+    };
+
+private:
+    rnn_weights_reorder_t(const pd_t *apd, const input_vector &inputs,
+            const output_vector &outputs)
+        : cpu_primitive_t(apd, inputs, outputs) {}
+
+    virtual void execute(event_t *e) const {
+#if USE_MKL_PACKED_GEMM
+        auto input = reinterpret_cast<const float *>(input_memory(0));
+        auto output = reinterpret_cast<float *>(memory());
+        const memory_desc_wrapper &input_d = pd()->input_pd();
+        const memory_desc_wrapper &output_d = pd()->output_pd();
+        const auto &dims = input_d.dims();
+        const rnn_packed_data_t &rnn_pdata = output_d.rnn_packed_desc();
+        const int L = dims[0];
+        const int D = dims[1];
+        const int I = dims[2];
+        const int G = dims[3];
+        const int O = dims[4];
+
+        /* Pack */
+        bool cross_case = (input_d.format() == memory_format::ldigo
+                && rnn_pdata.format == mkldnn_ldgoi_p)
+                || (input_d.format() == memory_format::ldgoi
+                        && rnn_pdata.format == mkldnn_ldigo_p);
+        auto trans = cross_case ? CblasTrans : CblasNoTrans;
+        int n_parts = rnn_pdata.n_parts;
+        const size_t *size_packed_cell = rnn_pdata.part_pack_size;
+        const int *parts = rnn_pdata.parts;
+        const int n = rnn_pdata.n;
+
+        const bool is_igo = input_d.format() == memory_format::ldigo;
+        auto off_igo = [&](int l, int d, int i, int g, int o) {
+            return l * D * I * G * O + d * I * G * O + i * G * O + g * O + o;
+        };
+        auto off_goi = [&](int l, int d, int i, int g, int o) {
+            return l * D * G * O * I + d * G * O * I + g * O * I + o * I + i;
+        };
+        for (int l = 0; l < L; l++) {
+            for (int d = 0; d < D; d++) {
+                for (int p = 0; p < n_parts; p++) {
+                    int g = (p > 0) ? parts[p - 1] : 0;
+                    int m_p = is_igo ? parts[p] * O : I;
+                    int k_p = is_igo ? I : parts[p] * O;
+                    int ld = is_igo ? G * O : I;
+                    cblas_sgemm_pack(CblasColMajor, CblasAMatrix, trans, m_p, n,
+                            k_p, 1.0f, &input[is_igo ? off_igo(l, d, 0, g, 0) :
+                                                       off_goi(l, d, 0, g, 0)],
+                            ld, output);
+                    output += size_packed_cell[p] / sizeof(float);
+                }
+            }
+        }
+        e->set_state(event_t::ready);
+#endif
+    }
+
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); }
+};
+
+} // namespace cpu
+} // namespace impl
+} // namespace mkldnn
+
+#endif
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/rnn_utils.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/rnn_utils.cpp
new file mode 100644
index 0000000..7a073b8
--- /dev/null
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/rnn_utils.cpp
@@ -0,0 +1,400 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "c_types_map.hpp"
+#include "math_utils.hpp"
+#include "mkldnn_thread.hpp"
+
+#include "ref_rnn.hpp"
+#include "rnn_utils.hpp"
+#include "type_helpers.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+using namespace mkldnn::impl::utils;
+using namespace rnn_utils;
+using namespace memory_format;
+using namespace rnn_packed_format;
+using namespace data_type;
+
+void rnn_utils::init_conf(rnn_conf_t &rnn, const rnn_desc_t &rd,
+        const memory_desc_wrapper &src_layer_d,
+        const memory_desc_wrapper &src_iter_d,
+        const memory_desc_wrapper &weights_layer_d,
+        const memory_desc_wrapper &weights_iter_d,
+        const memory_desc_wrapper &dst_layer_d) {
+    rnn.is_fwd = utils::one_of(rd.prop_kind, prop_kind::forward_training,
+            prop_kind::forward_inference);
+    rnn.is_training = utils::one_of(
+            rd.prop_kind, prop_kind::forward_training, prop_kind::backward);
+    rnn.is_lbr = rd.cell_desc.cell_kind == mkldnn_gru_linear_before_reset;
+
+    switch (rd.direction) {
+    case mkldnn_unidirectional_left2right: rnn.exec_dir = l2r; break;
+    case mkldnn_unidirectional_right2left: rnn.exec_dir = r2l; break;
+    case mkldnn_bidirectional_concat: rnn.exec_dir = bi_concat; break;
+    case mkldnn_bidirectional_sum: rnn.exec_dir = bi_sum; break;
+    default: break;
+    }
+
+    if (everyone_is(f32, src_layer_d.data_type(), dst_layer_d.data_type(),
+                weights_layer_d.data_type()))
+        rnn.dt_conf = all_f32;
+    else if (dst_layer_d.data_type() == u8) {
+        if (IMPLICATION(src_iter_d._md, src_iter_d.data_type() == u8))
+            rnn.dt_conf = u8u8u8u8;
+        else
+            rnn.dt_conf = f32u8f32u8;
+    } else {
+        if (IMPLICATION(src_iter_d._md, src_iter_d.data_type() == u8))
+            rnn.dt_conf = u8u8u8f32;
+        else
+            rnn.dt_conf = f32u8f32f32;
+    }
+
+    rnn.n_layer = weights_layer_d.dims()[0];
+    rnn.n_iter = src_layer_d.dims()[0];
+    rnn.n_dir = weights_layer_d.dims()[1];
+    rnn.n_gates = weights_layer_d.dims()[3];
+    rnn.n_states = mkldnn_rnn_cell_get_states_count(&rd.cell_desc);
+    rnn.n_bias = rnn.n_gates + rnn.is_lbr;
+    rnn.mb = src_layer_d.dims()[1];
+    rnn.sic = weights_iter_d.dims()[2];
+    rnn.slc = weights_layer_d.dims()[2];
+    rnn.dic = weights_layer_d.dims()[4];
+    rnn.dlc = dst_layer_d.dims()[2];
+
+    rnn.gates_ld = rnn.dic * rnn.n_gates;
+    rnn.gates_nld = rnn.mb;
+    rnn.states_nld = rnn.mb;
+
+    /* Set the correct number of weights parts */
+    bool is_orig_gru = rd.cell_desc.cell_kind == alg_kind::vanilla_gru;
+    rnn.n_parts_weights_layer = 1;
+    rnn.parts_weights_layer[0] = rnn.n_gates;
+    rnn.parts_weights_layer[1] = 0;
+
+    rnn.n_parts_weights_iter = is_orig_gru ? 2 : 1;
+    rnn.parts_weights_iter[0] = is_orig_gru ? 2 : rnn.n_gates;
+    rnn.parts_weights_iter[1] = is_orig_gru ? 1 : 0;
+
+    rnn.n_parts_bias = 1;
+    rnn.parts_bias[0] = rnn.n_bias;
+    rnn.parts_bias[1] = 0;
+
+    /* Decide which gemm implementation to use: packed/nonpacked jit/cblas,
+     * and whether to merge gemm across iterations */
+    bool is_int8 = rnn.dt_conf != all_f32;
+    rnn.merge_gemm_layer = ((rnn.is_fwd && rnn.mb < 128) || !rnn.is_fwd)
+            || is_int8;
+    bool is_gru = utils::one_of(rd.cell_desc.cell_kind, alg_kind::vanilla_gru,
+            alg_kind::gru_linear_before_reset);
+    rnn.merge_gemm_iter = !(rnn.is_fwd || is_gru) || is_int8;
+    bool is_inference = !rnn.is_training;
+
+    rnn.use_jit_gemm = !mayiuse(avx512_mic)
+            && ((is_inference && (rnn.n_layer > 1 || rnn.mb < 100))
+                    || (rnn.is_training && rnn.dic < 500));
+
+    /* Decide to copy bias */
+    rnn.copy_bias = rnn.dt_conf != all_f32;
+
+#if USE_MKL_PACKED_GEMM
+    rnn.use_layer_packed_gemm
+            = (weights_layer_d.format() == any && rnn.slc > 760 && rnn.dic > 760
+                      && is_inference)
+            || is_int8; // packed gemm is the only supported option for int8
+    rnn.use_iter_packed_gemm = (weights_iter_d.format() == any && rnn.sic > 760
+                                       && rnn.dic > 760 && is_inference)
+            || is_int8;
+#else
+    rnn.use_layer_packed_gemm = false;
+    rnn.use_iter_packed_gemm = false;
+#endif
+
+    /* Set packed gemm sizes */
+    if (rnn.use_layer_packed_gemm) {
+        rnn.weights_layer_pack_size = 0;
+        for (int p = 0; p < rnn.n_parts_weights_layer; p++) {
+            int m_p = rnn.is_fwd
+                    ? (rnn.parts_weights_layer[p] * rnn.dic)
+                    : rnn.slc;
+            int k_p = rnn.is_fwd
+                    ? rnn.slc
+                    : (rnn.parts_weights_layer[p] * rnn.dic);
+            int n_p = rnn.merge_gemm_layer ? rnn.mb * rnn.n_iter : rnn.mb;
+
+#if USE_MKL_PACKED_GEMM
+            if (rnn.dt_conf == all_f32)
+                rnn.part_weights_layer_pack_size[p] = cblas_sgemm_pack_get_size(
+                        CblasAMatrix, m_p, n_p, k_p);
+            else
+                rnn.part_weights_layer_pack_size[p]
+                        = cblas_gemm_s8u8s32_pack_get_size(
+                                CblasAMatrix, m_p, n_p, k_p);
+#else
+            UNUSED(m_p);
+            UNUSED(k_p);
+            UNUSED(n_p);
+            rnn.part_weights_layer_pack_size[p] = 0;
+#endif
+            rnn.weights_layer_pack_size += rnn.n_layer * rnn.n_dir
+                    * rnn.part_weights_layer_pack_size[p];
+        }
+        rnn.weights_layer_comp_offset = rnn.weights_layer_pack_size;
+        rnn.weights_layer_pack_size += rnn.dt_conf == all_f32 ? 0 : rnn.n_layer
+                        * rnn.n_dir * rnn.n_gates * rnn.dlc * sizeof(float);
+    }
+
+    if (rnn.use_iter_packed_gemm) {
+        rnn.weights_iter_pack_size = 0;
+        for (int p = 0; p < rnn.n_parts_weights_iter; p++) {
+            int m_p = rnn.is_fwd ? (rnn.parts_weights_iter[p] * rnn.dic) :
+                                   rnn.sic;
+            int k_p = rnn.is_fwd ? rnn.sic :
+                                   (rnn.parts_weights_iter[p] * rnn.dic);
+            int n_p = rnn.merge_gemm_iter ? rnn.mb * rnn.n_iter : rnn.mb;
+
+#if USE_MKL_PACKED_GEMM
+            if (rnn.dt_conf == all_f32)
+                rnn.part_weights_iter_pack_size[p] = cblas_sgemm_pack_get_size(
+                        CblasAMatrix, m_p, n_p, k_p);
+            else
+                rnn.part_weights_iter_pack_size[p]
+                        = cblas_gemm_s8u8s32_pack_get_size(
+                                CblasAMatrix, m_p, n_p, k_p);
+#else
+            UNUSED(m_p);
+            UNUSED(k_p);
+            UNUSED(n_p);
+            rnn.part_weights_iter_pack_size[p] = 0;
+#endif
+            rnn.weights_iter_pack_size += rnn.n_layer * rnn.n_dir
+                    * rnn.part_weights_iter_pack_size[p];
+        }
+        rnn.weights_iter_comp_offset = rnn.weights_iter_pack_size;
+        rnn.weights_iter_pack_size += rnn.dt_conf == all_f32 ? 0 : rnn.n_layer
+                        * rnn.n_dir * rnn.n_gates * rnn.dic * sizeof(float);
+    }
+
+}
+
+void rnn_utils::set_conf(rnn_conf_t &rnn, const rnn_desc_t &rd,
+        const memory_desc_wrapper &weights_layer_d,
+        const memory_desc_wrapper &weights_iter_d,
+        const memory_desc_wrapper &diff_weights_layer_d,
+        const memory_desc_wrapper &diff_weights_iter_d) {
+
+    /* Set leading dimensions for input weights arrays depending on input
+     * format */
+    rnn.weights_layer_fmt = weights_layer_d.format();
+    rnn.weights_iter_fmt = weights_iter_d.format();
+    rnn.weights_layer_is_packed = rnn.weights_layer_fmt == rnn_packed;
+    rnn.weights_iter_is_packed = rnn.weights_iter_fmt == rnn_packed;
+
+    auto set_dims = [&](const memory_desc_wrapper &md, int &ld, int &nld) {
+        switch (md.format()) {
+        case ldigo:
+            ld = (int)md.blocking_desc().strides[0][2];
+            nld = md.dims()[2];
+            return;
+        case ldgoi:
+            ld = (int)md.blocking_desc().strides[0][4];
+            nld = md.dims()[3] * md.dims()[4];
+            return;
+        default: ld = 0; nld = 0;
+        }
+    };
+    set_dims(weights_layer_d, rnn.weights_layer_ld, rnn.weights_layer_nld);
+    set_dims(weights_iter_d, rnn.weights_iter_ld, rnn.weights_iter_nld);
+    if (!rnn.is_fwd) {
+        set_dims(diff_weights_layer_d, rnn.diff_weights_layer_ld,
+                rnn.diff_weights_layer_nld);
+        set_dims(diff_weights_iter_d, rnn.diff_weights_iter_ld,
+                rnn.diff_weights_iter_nld);
+    }
+
+    int sizeof_states_dt
+            = rnn.dt_conf == all_f32 ? sizeof(float) : sizeof(uint8_t);
+    rnn.states_ws_ld
+            = get_good_ld(nstl::max(rnn.slc, nstl::max(rnn.sic, rnn.dic)),
+                    sizeof_states_dt);
+    rnn.gates_ws_ld = get_good_ld(rnn.gates_ld, sizeof(float));
+
+    /* Set workspace sizes to store:
+     * states to compute a pass
+     * diff states to compute bwd pass (training only)
+     * intermediate results from the gates
+     */
+    rnn.use_workspace = rnn.is_training;
+    rnn.ws_states_size = (size_t)(rnn.n_layer + 1) * rnn.n_dir
+            * (rnn.n_iter + 1) * rnn.mb * rnn.states_ws_ld * sizeof_states_dt;
+    bool is_lstm = rd.cell_desc.cell_kind == mkldnn_vanilla_lstm;
+    rnn.ws_c_states_size = is_lstm
+            ? (size_t)(rnn.n_layer + 1) * rnn.n_dir * (rnn.n_iter + 1) * rnn.mb
+                    * rnn.states_ws_ld * sizeof(float)
+            : 0;
+    rnn.ws_diff_states_size = rnn.is_training
+            ? (size_t)(rnn.n_layer + 1) * rnn.n_dir * (rnn.n_iter + 1)
+                    * (rnn.n_states + 1) * rnn.mb * rnn.states_ws_ld
+                    * sizeof(float)
+            : (size_t)0;
+    rnn.ws_gates_size = (size_t)rnn.n_layer * rnn.n_dir * rnn.n_iter * rnn.mb
+            * rnn.gates_ws_ld * sizeof(float);
+
+    /* set other sizes */
+    rnn.ws_per_cell = (size_t)rnn.is_lbr * rnn.mb * rnn.dic * sizeof(float);
+    rnn.ws_cell_comp_size
+            = rnn.is_lbr || rnn.dt_conf != all_f32
+            ? (size_t)rnn.gates_nld * rnn.gates_ws_ld * sizeof(float)
+            : 0;
+    rnn.ws_grid_comp_size = (size_t)rnn.is_lbr * rnn.is_training * rnn.n_layer
+            * rnn.n_dir * rnn.n_iter * rnn.ws_per_cell * sizeof(float);
+    rnn.ws_bias_size = (size_t)rnn.n_layer * rnn.n_dir * rnn.n_bias * rnn.dic
+            * sizeof(float);
+}
+
+int rnn_utils::get_good_ld(int dim, int sizeof_dt) {
+    // we want matrices leading dimensions to be 64-byte aligned,
+    // and not divisible by 256 to avoid 4K aliasing effects
+    int ld = rnd_up(dim, 64 / sizeof_dt);
+    return (ld % 256 == 0) ? ld + 64 / sizeof_dt : ld;
+}
+
+void rnn_utils::set_offsets(const rnn_conf_t &rnn, size_t &ws_gates_offset,
+        size_t &ws_states_offset, size_t &ws_c_states_offset,
+        size_t &ws_diff_states_offset, size_t &ws_grid_comp_offset,
+        size_t &ws_cell_comp_offset, size_t &ws_bias_offset,
+        size_t &scratchpad_size, size_t &workspace_size) {
+
+    const size_t page_size = 4096; // 2097152;
+    size_t current_offset;
+    /* Mandatory workspaces: go to workspace if use_workspace, scratchpad
+     * otherwise */
+    current_offset = 0; // assumes the workspace base pointer is page aligned
+    ws_gates_offset = current_offset;
+    current_offset += rnn.ws_gates_size;
+
+    current_offset = utils::rnd_up(current_offset, page_size);
+    ws_states_offset = current_offset;
+    current_offset += rnn.ws_states_size;
+
+    current_offset = utils::rnd_up(current_offset, page_size);
+    ws_c_states_offset = current_offset;
+    current_offset += rnn.ws_c_states_size;
+
+    current_offset = utils::rnd_up(current_offset, page_size);
+    ws_diff_states_offset = current_offset;
+    current_offset += rnn.ws_diff_states_size;
+
+    current_offset = utils::rnd_up(current_offset, page_size);
+    ws_grid_comp_offset = current_offset;
+    current_offset += rnn.ws_grid_comp_size;
+
+    current_offset = utils::rnd_up(current_offset, page_size);
+    ws_cell_comp_offset = current_offset;
+    current_offset += rnn.ws_cell_comp_size;
+
+    workspace_size = rnn.use_workspace ? current_offset : 0;
+
+    /* Optional scratchpads */
+    // Assumes the scratchpad base pointer is page aligned.
+    // If use_workspace, the following goes to scratchpad alone,
+    // otherwise, all goes to scratchpad and continue incrementing offset
+    current_offset = rnn.use_workspace ? 0 : current_offset;
+
+    if (rnn.copy_bias) {
+        current_offset = utils::rnd_up(current_offset, page_size);
+        ws_bias_offset = current_offset;
+        current_offset += rnn.ws_bias_size;
+    }
+
+    scratchpad_size = current_offset;
+}
+
+void rnn_utils::get_scratchpad_and_workspace_sizes(const rnn_conf_t &rnn,
+        size_t &scratchpad_size, size_t &workspace_size) {
+    size_t ws_gates_offset, ws_states_offset, ws_c_states_offset,
+            ws_diff_states_offset, ws_grid_comp_offset, ws_cell_comp_offset,
+            ws_bias_offset;
+    set_offsets(rnn, ws_gates_offset, ws_states_offset, ws_c_states_offset,
+            ws_diff_states_offset, ws_grid_comp_offset, ws_cell_comp_offset,
+            ws_bias_offset, scratchpad_size, workspace_size);
+}
+
+status_t rnn_utils::set_good_strides(memory_desc_t &weights_md) {
+    auto &strides = weights_md.layout_desc.blocking.strides[0];
+    auto dims = weights_md.dims;
+
+    if (weights_md.format == ldigo) {
+        strides[2] = rnn_utils::get_good_ld((int)strides[2],
+                (int)types::data_type_size(weights_md.data_type));
+        strides[1] = dims[2] * strides[2];
+        strides[0] = dims[1] * strides[1];
+    } else if (weights_md.format == ldgoi) {
+        strides[4] = rnn_utils::get_good_ld((int)strides[4],
+                (int)types::data_type_size(weights_md.data_type));
+        strides[3] = dims[4] * strides[4];
+        strides[1] = dims[3] * strides[3];
+        strides[0] = dims[1] * strides[1];
+    } else
+        return unimplemented;
+
+    return success;
+}
+
+status_t rnn_utils::set_expected_desc(rnn_conf_t &rnn,
+        memory_desc_t &weights_md, bool is_iter) {
+    bool use_packed_gemm = is_iter
+            ? rnn.use_iter_packed_gemm
+            : rnn.use_layer_packed_gemm;
+    if (use_packed_gemm) {
+        weights_md.format = rnn_packed;
+        rnn_packed_data_t &rnn_pdata = weights_md.layout_desc.rnn_packed_desc;
+        rnn_pdata.format = rnn.is_fwd ? mkldnn_ldigo_p : mkldnn_ldgoi_p;
+        if (is_iter) {
+            rnn_pdata.n = rnn.mb;
+            rnn_pdata.n_parts = rnn.n_parts_weights_iter;
+            array_copy(rnn_pdata.parts, rnn.parts_weights_iter,
+                    MKLDNN_RNN_MAX_N_PARTS);
+            array_copy(rnn_pdata.part_pack_size,
+                    rnn.part_weights_iter_pack_size, MKLDNN_RNN_MAX_N_PARTS);
+            rnn_pdata.offset_compensation = rnn.weights_iter_comp_offset;
+            rnn_pdata.size = rnn.weights_iter_pack_size;
+        } else {
+            rnn_pdata.n = rnn.merge_gemm_layer ? rnn.n_iter * rnn.mb : rnn.mb;
+            rnn_pdata.n_parts = rnn.n_parts_weights_layer;
+            array_copy(rnn_pdata.parts, rnn.parts_weights_layer,
+                    MKLDNN_RNN_MAX_N_PARTS);
+            array_copy(rnn_pdata.part_pack_size,
+                    rnn.part_weights_layer_pack_size, MKLDNN_RNN_MAX_N_PARTS);
+            rnn_pdata.offset_compensation = rnn.weights_layer_comp_offset;
+            rnn_pdata.size = rnn.weights_layer_pack_size;
+        }
+    } else {
+        weights_md.format = rnn.is_fwd ? ldigo : ldgoi;
+        CHECK(memory_desc_wrapper::compute_blocking(weights_md));
+        // Adjust strides for good leading dimension in GEMM
+        CHECK(set_good_strides(weights_md));
+    }
+    return success;
+}
+
+}
+}
+}
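get_good_ld() above packs two cache heuristics into one expression: round the leading dimension up to a 64-byte multiple, then, if the result is a multiple of 256 elements, add one more 64-byte step so that consecutive rows do not map onto the same cache sets (4K aliasing). A quick standalone check of the arithmetic (restating the function rather than importing it):

#include <cassert>

int good_ld(int dim, int sizeof_dt) {
    int step = 64 / sizeof_dt;                // elements per 64 bytes
    int ld = (dim + step - 1) / step * step;  // round up to 64-byte multiple
    return (ld % 256 == 0) ? ld + step : ld;  // dodge 4K aliasing
}

int main() {
    assert(good_ld(300, 4) == 304); // floats: rounded up to 16 elements
    assert(good_ld(500, 4) == 528); // 512 is 256-divisible, so 512 + 16
    return 0;
}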
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/rnn_utils.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/rnn_utils.hpp
new file mode 100644
index 0000000..88f0b44
--- /dev/null
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/rnn/rnn_utils.hpp
@@ -0,0 +1,224 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef RNN_UTILS_HPP
+#define RNN_UTILS_HPP
+
+#include "mkldnn.h"
+
+#include "cpu_rnn_pd.hpp"
+
+
+#define rnn_elemwise_sig(f) \
+    void f(const rnn_utils::rnn_conf_t &rnn, acc_data_t *ws_gates_, \
+            src_data_t *states_t_l_, float *c_states_t_l_, \
+            src_data_t *states_tm1_l_, float *c_states_tm1_l_, \
+            float *diff_states_t_l_, float *diff_states_t_lp1_, \
+            float *diff_states_tp1_l_, float *bias_, float *ws_grid_, \
+            float *ws_cell_) const
+
+#define rnn_cell_execution_sig(f) \
+    void f(const rnn_utils::rnn_conf_t &rnn, src_data_t *states_t_l_, \
+            float *c_states_t_l_, float *diff_states_t_l_, \
+            weights_data_t **w_layer_, weights_data_t **w_iter_, \
+            float **bias_, src_data_t *states_t_lm1_, \
+            src_data_t *states_tm1_l_, float *c_states_tm1_l_, \
+            float *diff_states_t_lp1_, float *diff_states_tp1_l_, \
+            float *diff_w_layer_, float *diff_w_iter_, float *diff_bias_, \
+            acc_data_t *ws_gates_, float *ws_grid_, float *ws_cell_) const
+
+#define rnn_grid_execution_sig(f) \
+    void f(const rnn_utils::rnn_conf_t &rnn, weights_data_t **weights_layer_, \
+            weights_data_t **weights_states_, float **bias_, \
+            src_data_t *ws_states_, float *ws_c_states_, \
+            float *ws_diff_states_, acc_data_t *ws_gates_, float *ws_cell_, \
+            float *ws_grid_, float *diff_weights_layer_, \
+            float *diff_weights_iter_, float *diff_bias_) const
+
+#define rnn_gemm_sig(f) \
+    void f(const char transA, const char transB, int m, int n, int k, \
+            const float alpha, const weights_data_t *a_, const int ldA, \
+            const src_data_t *b_, const int ldB, const float beta, \
+            acc_data_t *c_, const int ldC) const
+
+#define rnn_bias_prepare_sig(f) \
+    void f(const rnn_utils::rnn_conf_t &rnn, float **bias_, const float *b_, \
+            float *scratch_bias_) const
+
+#define rnn_bias_finalize_sig(f) \
+    void f(const rnn_utils::rnn_conf_t &rnn, float *scratch_bias_, \
+            const float *w_iter_comp, const float *w_layer_comp) const
+
+#define rnn_weights_assign_sig(f) \
+    void f(const rnn_utils::rnn_conf_t &rnn, memory_format_t fmt, int nld, \
+            int ld, int OC_size, int IC_size, const int n_parts, \
+            const int *gates_per_part, const size_t *part_weights_pack_size, \
+            weights_data_t **weights_, const weights_data_t *w_, \
+            float **bias_, const float *b_, float *scratch_bias_) const
+
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+namespace rnn_utils {
+
+using namespace mkldnn::impl::utils;
+
+enum execution_direction_t {
+    l2r,
+    r2l,
+    bi_concat,
+    bi_sum,
+};
+
+enum data_type_conf_t {
+    all_f32,
+    u8u8u8f32,
+    f32u8f32f32,
+    u8u8u8u8,
+    f32u8f32u8
+};
+
+struct rnn_conf_t {
+    execution_direction_t exec_dir;
+    data_type_conf_t dt_conf;
+    int n_layer, n_iter, n_dir, n_gates, n_states;
+    int mb;
+    int slc, sic, dic, dlc;
+    int gates_ld, gates_nld, gates_ws_ld;
+    int n_parts_weights_layer, parts_weights_layer[MKLDNN_RNN_MAX_N_PARTS];
+    int n_parts_weights_iter, parts_weights_iter[MKLDNN_RNN_MAX_N_PARTS];
+    int n_bias, n_parts_bias, parts_bias[MKLDNN_RNN_MAX_N_PARTS];
+    size_t part_weights_iter_pack_size[MKLDNN_RNN_MAX_N_PARTS],
+            part_weights_layer_pack_size[MKLDNN_RNN_MAX_N_PARTS];
+    bool weights_layer_is_packed, weights_iter_is_packed;
+    /* Size of packed data in bytes */
+    size_t weights_layer_comp_offset, weights_layer_pack_size,
+            weights_iter_comp_offset, weights_iter_pack_size;
+
+    bool copy_bias;
+    int weights_layer_ld, weights_layer_nld;
+    int diff_weights_layer_ld, diff_weights_layer_nld;
+    int weights_iter_ld, weights_iter_nld;
+    int diff_weights_iter_ld, diff_weights_iter_nld;
+    int states_nld, states_ws_ld;
+    int weights_iter_compensation_size, weights_layer_compensation_size;
+    bool is_fwd, is_training, is_lbr;
+    bool use_workspace;
+
+    /* Size of workspace for each tensor in bytes */
+    size_t ws_gates_size, ws_states_size, ws_c_states_size,
+            ws_diff_states_size, ws_cell_comp_size, ws_grid_comp_size,
+            ws_per_cell, ws_bias_size;
+    bool merge_gemm_iter, merge_gemm_layer, use_jit_gemm,
+            use_layer_packed_gemm, use_iter_packed_gemm;
+    memory_format_t weights_layer_fmt, weights_iter_fmt,
+            diff_weights_layer_fmt, diff_weights_iter_fmt;
+};
+
+int get_good_ld(int dim, int sizeof_dt);
+
+void init_conf(rnn_conf_t &rnn, const rnn_desc_t &rd,
+        const memory_desc_wrapper &src_layer_d,
+        const memory_desc_wrapper &src_iter_d,
+        const memory_desc_wrapper &weights_layer_d,
+        const memory_desc_wrapper &weights_iter_d,
+        const memory_desc_wrapper &dst_layer_d);
+
+void set_conf(rnn_conf_t &rnn, const rnn_desc_t &rd,
+        const memory_desc_wrapper &weights_layer_d,
+        const memory_desc_wrapper &weights_iter_d,
+        const memory_desc_wrapper &diff_weights_layer_d,
+        const memory_desc_wrapper &diff_weights_iter_d);
+
+void set_offsets(const rnn_conf_t &rnn, size_t &ws_gates_offset,
+        size_t &ws_h_state_offset, size_t &ws_c_state_offset,
+        size_t &ws_diff_states_offset, size_t &ws_grid_comp_offset,
+        size_t &ws_cell_comp_offset, size_t &ws_bias_offset,
+        size_t &scratchpad_size, size_t &workspace_size);
+
+void get_scratchpad_and_workspace_sizes(const rnn_conf_t &rnn,
+        size_t &scratchpad_size, size_t &workspace_size);
+status_t set_expected_desc(
+        rnn_conf_t &rnn, memory_desc_t &weights_md, bool is_iter);
+status_t set_good_strides(memory_desc_t &weights_md);
+
+template <typename T>
+struct ws_gates_aoc {
+    ws_gates_aoc(const rnn_conf_t &rnn, T *data)
+        : gates_(data, rnn.gates_nld, rnn.gates_ws_ld), DIC_(rnn.dic) {}
+    T &operator()(int batch, int gate, int dic) {
+        return gates_(batch, gate * DIC_ + dic);
+    }
+
+private:
+    mkldnn::impl::utils::array_offset_calculator<T, 2> gates_;
+    int DIC_;
+};
+using ws_gates_aoc_t = ws_gates_aoc<float>;
+using ws_gates_aoc_s32_t = ws_gates_aoc<int32_t>;
+
+struct bias_aoc_t {
+    bias_aoc_t(const rnn_conf_t &rnn, const float *data)
+        : bias_(data, rnn.n_bias, rnn.dic) {}
+    const float &operator()(int bias_n, int dic) { return bias_(bias_n, dic); }
+
+private:
+    mkldnn::impl::utils::array_offset_calculator<const float, 2> bias_;
+};
+
+template <typename T>
+struct ws_states_aoc {
+    ws_states_aoc(const rnn_conf_t &rnn, T *data)
+        : state_(data, rnn.states_nld, rnn.states_ws_ld) {}
+    T &operator()(int batch, int dic) { return state_(batch, dic); }
+
+private:
+    mkldnn::impl::utils::array_offset_calculator<T, 2> state_;
+};
+using ws_states_aoc_t = ws_states_aoc<float>;
+using ws_states_aoc_u8_t = ws_states_aoc<uint8_t>;
+
+struct ws_diff_states_aoc_t {
+    ws_diff_states_aoc_t(const rnn_conf_t &rnn, float *data)
+        : diff_states_(data, rnn.n_states + 1, rnn.n_iter + 1, rnn.states_nld,
+                  rnn.states_ws_ld) {}
+    float &operator()(int state_n, int batch, int dic) {
+        return diff_states_(state_n, 0, batch, dic);
+    }
+
+private:
+    mkldnn::impl::utils::array_offset_calculator<float, 4> diff_states_;
+};
+
+struct ws_diff_w_iter_aoc_t {
+    ws_diff_w_iter_aoc_t(const rnn_conf_t &rnn, float *data)
+        : diff_weights_iter_(
+                  data, rnn.diff_weights_iter_nld, rnn.diff_weights_iter_ld)
+        , DIC_(rnn.dic) {}
+    float &operator()(int sic, int gate, int dic) {
+        return diff_weights_iter_(sic, gate * DIC_ + dic);
+    }
+
+private:
+    mkldnn::impl::utils::array_offset_calculator<float, 2> diff_weights_iter_;
+    int DIC_;
+};
+}
+}
+}
+}
+#endif
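The aoc helpers above (ws_gates_aoc, ws_states_aoc, ...) are thin index calculators that turn a raw workspace pointer plus the leading dimensions from rnn_conf_t into multi-dimensional accessors. A minimal two-dimensional equivalent shows what ws_states_aoc_t(rnn, ws_states)(batch, dic) reduces to; this is illustrative only, not the library's array_offset_calculator:

#include <cstddef>

template <typename T>
struct view2d {
    view2d(T *p, std::ptrdiff_t rows, std::ptrdiff_t ld)
        : p_(p), rows_(rows), ld_(ld) {}
    // Row-major indexing: (i, j) maps to p[i * ld + j].
    T &operator()(std::ptrdiff_t i, std::ptrdiff_t j) const {
        return p_[i * ld_ + j];
    }

private:
    T *p_;
    std::ptrdiff_t rows_; // kept only to mirror the aoc constructor shape
    std::ptrdiff_t ld_;
};

int main() {
    float buf[6] = {0};
    view2d<float> states(buf, 2, 3);
    states(1, 2) = 1.f; // writes buf[1 * 3 + 2]
    return 0;
}

Because ld comes from get_good_ld(), rows are padded, so these wrappers must index with the leading dimension rather than the logical row width.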
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/simple_concat.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/simple_concat.cpp
index eb5723f..c642489 100644
--- a/inference-engine/thirdparty/mkl-dnn/src/cpu/simple_concat.cpp
+++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/simple_concat.cpp
@@ -22,68 +22,95 @@
 namespace mkldnn {
 namespace impl {
 namespace cpu {
 
+using namespace memory_tracking::names;
+
 template <data_type_t data_type>
-void simple_concat_t<data_type>::execute() {
-    const int num_arrs = conf_.n_inputs();
-    int *perm = conf_.perm_, *iperm = conf_.iperm_;
-    int concat_dim = conf_.concat_dim();
+void simple_concat_t<data_type>::execute() const {
+    auto scratchpad = this->scratchpad();
+    auto iptrs = scratchpad.template get<const data_t *>(key_concat_iptrs);
+    auto optrs = scratchpad.template get<data_t *>(key_concat_optrs);
+    auto nelems_to_copy = scratchpad.template get<size_t>(key_concat_nelems);
+    auto is = scratchpad.template get<strides_t>(key_concat_istrides);
+
+    const int num_arrs = pd()->n_inputs();
+    const ptrdiff_t *perm = pd()->perm_, *iperm = pd()->iperm_;
+    const int concat_dim = pd()->concat_dim();
     auto o_base_ptr = reinterpret_cast<data_t *>(this->memory());
+
     for (int a = 0; a < num_arrs; ++a) {
-        const memory_desc_wrapper i_d(conf_.src_pd(a));
-        const memory_desc_wrapper o_d(conf_.src_image_pd(a));
+        const memory_desc_wrapper i_d(pd()->src_pd(a));
+        const memory_desc_wrapper o_d(pd()->src_image_pd(a));
 
-        input_ptrs_[a] = reinterpret_cast<const data_t *>(
+        iptrs[a] = reinterpret_cast<const data_t *>(
                 this->input_memory(a)) + i_d.blk_off(0);
-        output_ptrs_[a] = o_base_ptr + o_d.blk_off(0);
-        nelems_to_copy_[a] = nelems_to_concat(concat_dim, perm, iperm, i_d);
+        optrs[a] = o_base_ptr + o_d.blk_off(0);
+        nelems_to_copy[a] = pd()->nelems_to_concat(i_d);
         for (int i = 0; i < TENSOR_MAX_DIMS; i++) {
             if (i < perm[concat_dim])
-                is_[a][i] = size_t(i_d.blocking_desc().strides[0][iperm[i]]);
+                is[a][i] = size_t(i_d.blocking_desc().strides[0][iperm[i]]);
             else
-                is_[a][i] = 0;
+                is[a][i] = 0;
         }
     }
 
-    const memory_desc_wrapper o_d(conf_.src_image_pd());
+    const memory_desc_wrapper o_d(pd()->src_image_pd());
+
     auto &blk = o_d.blocking_desc();
+
     strides_t os = { 0 };
     for (int i = 0; i < perm[concat_dim]; i++)
         os[i] = o_d.blocking_desc().strides[0][iperm[i]];
+
     dims_t phys_dims;
     for (size_t i = 0; i < sizeof(phys_dims)/sizeof(phys_dims[0]); i++)
-        phys_dims[i] = (i < (size_t)perm[concat_dim]) ?
-                o_d.dims()[iperm[i]] / blk.block_dims[iperm[i]] :
-                1;
+        phys_dims[i] = (i < (size_t)perm[concat_dim])
+                ? o_d.dims()[iperm[i]] / blk.block_dims[iperm[i]] : 1;
 
-    switch (perm[concat_dim]) {
-    case (0): {
+    if (perm[concat_dim] == 0) {
         for (int a = 0; a < num_arrs; ++a) {
-            const data_t *i = &input_ptrs_[a][0];
-            data_t *o = &output_ptrs_[a][0];
-            parallel_nd((ptrdiff_t)nelems_to_copy_[a],
+            const data_t *i = &iptrs[a][0];
+            data_t *o = &optrs[a][0];
+            parallel_nd((ptrdiff_t)nelems_to_copy[a],
                     [&](ptrdiff_t e) { o[e] = i[e]; });
         }
-        break;
-    }
-    default:
+    } else {
         parallel_nd(phys_dims[0], phys_dims[1], phys_dims[2], phys_dims[3],
                 phys_dims[4], num_arrs,
                 [&](int n0, int n1, int n2, int n3, int n4, int a) {
-            // XXX: this code may access unitialized values in is_[*][0-4] --
+            // XXX: this code may access uninitialized values in is[*][0-4] --
             // that's why we have to set them to zero although this is
             // probably benign
-            size_t in_off = is_[a][0] * n0 + is_[a][1] * n1
-                    + is_[a][2] * n2 + is_[a][3] * n3
-                    + is_[a][4] * n4;
-            size_t out_off = os[0] * n0 + os[1] * n1
-                    + os[2] * n2 + os[3] * n3 + os[4] * n4;
-            const data_t *i = &input_ptrs_[a][in_off];
-            data_t *o = &output_ptrs_[a][out_off];
+            size_t in_off = is[a][0] * n0 + is[a][1] * n1 + is[a][2] * n2
+                    + is[a][3] * n3 + is[a][4] * n4;
+            size_t out_off = os[0] * n0 + os[1] * n1 + os[2] * n2
+                    + os[3] * n3 + os[4] * n4;
+            const data_t *i = &iptrs[a][in_off];
+            data_t *o = &optrs[a][out_off];
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
+            // The code below performs data copying: o[e] = i[e]
+            // and uses a workaround to make GNU compilers optimize it
+            uint8_t *ptro = reinterpret_cast<uint8_t *>(o);
+            const uint8_t *ptri = reinterpret_cast<const uint8_t *>(i);
+            const size_t main_part =
+                nelems_to_copy[a] * sizeof(data_t) / sizeof(uint32_t);
+            const size_t tail_part =
+                nelems_to_copy[a] * sizeof(data_t) % sizeof(uint32_t);
             PRAGMA_OMP_SIMD()
-            for (size_t e = 0; e < nelems_to_copy_[a]; ++e)
-                o[e] = i[e];
+            for (size_t e = 0; e < main_part; ++e) {
+                *(reinterpret_cast<uint32_t *>(ptro))
+                        = *(reinterpret_cast<const uint32_t *>(ptri));
+                ptro += sizeof(uint32_t);
+                ptri += sizeof(uint32_t);
+            }
+            for (size_t e = 0; e < tail_part; ++e) {
+                *ptro = *ptri;
+                ++ptro;
+                ++ptri;
+            }
+#else
+            PRAGMA_OMP_SIMD()
+            for (size_t e = 0; e < nelems_to_copy[a]; ++e) o[e] = i[e];
+#endif
        });
    }
 }
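The #if defined(__GNUC__) branch in the hunk above replaces the plain element loop with a 4-byte-word copy plus a byte tail, purely as a workaround so GNU compilers emit vectorized code for the copy. The same idea in isolation; this sketch uses memcpy for the word moves, which keeps it strict-aliasing clean, whereas the patch relies on uint32_t pointer casts:

#include <cstddef>
#include <cstdint>
#include <cstring>

void copy_as_words(std::uint8_t *dst, const std::uint8_t *src,
        std::size_t nbytes) {
    std::size_t words = nbytes / sizeof(std::uint32_t);
    std::size_t tail = nbytes % sizeof(std::uint32_t);
    for (std::size_t w = 0; w < words; ++w) {
        std::uint32_t tmp;
        std::memcpy(&tmp, src, sizeof tmp); // one 4-byte load
        std::memcpy(dst, &tmp, sizeof tmp); // one 4-byte store
        src += sizeof tmp;
        dst += sizeof tmp;
    }
    for (std::size_t t = 0; t < tail; ++t) dst[t] = src[t]; // leftover bytes
}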
data_d) - == _size_to_concat(concat_dim_, perm_, iperm_, data_d); - }; const memory_desc_wrapper dst_d(&dst_pd_); bool ok = true && cpu_concat_pd_t::init() == success && dst_d.ndims() <= 6; - if (!ok) return unimplemented; for (size_t i = 0; i < src_pds_.size(); ++i) { @@ -61,118 +59,110 @@ struct simple_concat_t: public cpu_primitive_t { o_d.data_type()) && i_d.format() == o_d.format() && !utils::one_of(i_d.format(), memory_format::blocked, - memory_format::wino_fmt) + memory_format::wino_fmt) && !i_d.is_additional_buffer(); + if (!ok) return unimplemented; } - if (!ok) - return unimplemented; - - format_perm(dst_d.ndims(), dst_d.blocking_desc().strides[0], perm_, - iperm_); + format_perm(); + // density check for (size_t i = 0; i < src_pds_.size(); ++i) { const memory_desc_wrapper i_d(&src_pds_[i]); const memory_desc_wrapper o_d(&src_image_pds_[i]); - ok = ok && is_dense(i_d) && is_dense(o_d); + ok = ok + && nelems_to_concat(i_d) == size_to_concat(i_d) + && nelems_to_concat(o_d) == size_to_concat(o_d); + if (!ok) return unimplemented; } - return ok ? success : unimplemented; + init_scratchpad(); + + return success; } + dims_t perm_; dims_t iperm_; - }; - simple_concat_t(const pd_t *conf, const input_vector &inputs, - const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*conf) - { - const int n = conf_.n_inputs(); - input_ptrs_ = (decltype(input_ptrs_))malloc( - sizeof(*input_ptrs_) * n, 64); - output_ptrs_ = (decltype(output_ptrs_))malloc( - sizeof(*output_ptrs_) * n, 64); - nelems_to_copy_ = (decltype(nelems_to_copy_))malloc( - sizeof(*nelems_to_copy_) * n, 64); - is_ = (decltype(is_))malloc(sizeof(*is_) * n, 64); - } + size_t nelems_to_concat(const memory_desc_wrapper &data_d) const { + const int ndims = data_d.ndims(); + auto &blk = data_d.blocking_desc(); - ~simple_concat_t() { - free(input_ptrs_); - free(output_ptrs_); - free(nelems_to_copy_); - free(is_); - } + size_t nelems = 1; + for (int i = perm_[concat_dim()]; i < ndims; i++) + nelems *= data_d.dims()[iperm_[i]] / blk.block_dims[iperm_[i]]; + for (int i = 0; i < ndims; i++) + nelems *= blk.block_dims[i]; - virtual void execute(event_t *e) { - execute(); - e->set_state(event_t::ready); - } + return nelems; + } - typedef typename prec_traits::type data_t; + private: + void format_perm() { + const memory_desc_wrapper dst_d(&dst_pd_); + const int ndims = dst_d.ndims(); -private: - static void format_perm( - const int ndims, const stride_t *strides, int *perm, int *iperm) { - assert(ndims >= 0); - bool swapped; - strides_t strides_tmp; - utils::array_copy(strides_tmp, strides, ndims); - for (int i = 0; i < ndims; i++) - iperm[i] = i; - for (int i = 0; i < ndims - 1; i++) { - swapped = false; - for (int j = 0; j < ndims - i - 1; j++) { - if (strides_tmp[j] < strides_tmp[j + 1]) { - nstl::swap(strides_tmp[j], strides_tmp[j + 1]); - nstl::swap(iperm[j], iperm[j + 1]); - swapped = true; + strides_t strides; + utils::array_copy(strides, dst_d.blocking_desc().strides[0], ndims); + + for (int i = 0; i < ndims; i++) iperm_[i] = i; + + for (int i = 0; i < ndims - 1; i++) { + bool swapped = false; + for (int j = 0; j < ndims - i - 1; j++) { + if (strides[j] < strides[j + 1]) { + nstl::swap(strides[j], strides[j + 1]); + nstl::swap(iperm_[j], iperm_[j + 1]); + swapped = true; + } } + if (swapped == false) + break; } - if (swapped == false) - break; - } - for (int i = 0; i < ndims; i++) - perm[iperm[i]] = i; - } - static size_t nelems_to_concat(const int concat_dim, int *perm, int *iperm, - const 
memory_desc_wrapper &data_d) { - const int ndims = data_d.ndims(); - auto &blk = data_d.blocking_desc(); - int nelems = 1; - for (int i = perm[concat_dim]; i < ndims; i++) { - nelems *= data_d.dims()[iperm[i]] / blk.block_dims[iperm[i]]; + for (int i = 0; i < ndims; i++) perm_[iperm_[i]] = i; } - for (int i = 0; i < ndims; i++) { - nelems *= blk.block_dims[i]; - } - return nelems; - } - static size_t _size_to_concat(const int concat_dim, int *perm, int *iperm, - const memory_desc_wrapper &data_d) { - size_t max_size = 0; - auto &blk = data_d.blocking_desc(); - for (int d = perm[concat_dim]; d < data_d.ndims(); ++d) { - auto block = blk.block_dims[iperm[d]]; - max_size = nstl::max(max_size, - size_t(blk.padding_dims[iperm[d]] / block) - * blk.strides[0][iperm[d]]); - if (block > 1) + size_t size_to_concat(const memory_desc_wrapper &data_d) const { + size_t max_size = 0; + auto &blk = data_d.blocking_desc(); + for (int d = perm_[concat_dim()]; d < data_d.ndims(); ++d) { + auto block = blk.block_dims[iperm_[d]]; max_size = nstl::max(max_size, - size_t(block * blk.strides[1][iperm[d]])); + size_t(blk.padding_dims[iperm_[d]] / block) + * blk.strides[0][iperm_[d]]); + if (block > 1) max_size = nstl::max(max_size, + size_t(block * blk.strides[1][iperm_[d]])); + } + return max_size; + } + + void init_scratchpad() { + using namespace memory_tracking::names; + auto scratchpad = scratchpad_registry().registrar(); + scratchpad.book(key_concat_iptrs, sizeof(data_t *) * n_inputs()); + scratchpad.book(key_concat_optrs, sizeof(data_t *) * n_inputs()); + scratchpad.book(key_concat_nelems, sizeof(size_t) * n_inputs()); + scratchpad.book(key_concat_istrides, + sizeof(strides_t) * n_inputs()); } - return max_size; + }; + + simple_concat_t(const pd_t *apd, const input_vector &inputs, + const output_vector &outputs) + : cpu_primitive_t(apd, inputs, outputs) {} + ~simple_concat_t() {} + + virtual void execute(event_t *e) const { + execute(); + e->set_state(event_t::ready); } - void execute(); - pd_t conf_; + typedef typename prec_traits::type data_t; - const data_t **input_ptrs_ = nullptr; - data_t **output_ptrs_ = nullptr; - size_t *nelems_to_copy_ = nullptr; - strides_t *is_ = nullptr; +private: + void execute() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/simple_reorder.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/simple_reorder.hpp index e78d6ad..4e4a7da 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/simple_reorder.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/simple_reorder.hpp @@ -101,75 +101,6 @@ bool simple_attr_check(const primitive_attr_t *attr, bool many_scales_support) { /* specific reorders: implementation */ template struct simple_reorder_impl::type> -{ - static bool is_applicable(const memory_desc_wrapper &input_d, - const memory_desc_wrapper &output_d, const primitive_attr_t *attr) - { - return simple_fmt_check(order_keep, fmt_i, fmt_o, input_d, output_d) - && simple_attr_check(attr, false); - } - - - static status_t execute(const cpu_reorder_pd_t *pd, - const data_t *input, data_t *output) { - DECLARE_COMMON_PARAMS(); - - const auto &dims = input_d.dims(); - - constexpr int blksize_16c = 16; - constexpr int blksize_8c = 8; - constexpr int ic_mult = order_keep ? 2 : 1; - constexpr int oc_mult = order_keep ? 1 : 2; - - const auto stride_8c = order_keep ? 
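// The bookings in init_scratchpad() above replace the four aligned mallocs
// deleted from the constructor. Below is a self-contained mock of the
// two-phase protocol (book sizes at init time, fetch typed pointers into one
// arena at execute time); registry_t and the key enum are illustrative, not
// the mkl-dnn memory_tracking API.
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

struct registry_t {
    void book(int key, size_t bytes) {
        offset_[key] = total_;
        total_ += (bytes + 63) & ~size_t(63);  // keep 64-byte alignment
    }
    template <typename T> T *get(uint8_t *base, int key) const {
        return reinterpret_cast<T *>(base + offset_.at(key));
    }
    size_t total_ = 0;
    std::map<int, size_t> offset_;
};

enum { key_iptrs, key_optrs, key_nelems };  // cf. key_concat_* above

int main() {
    registry_t r;
    const int n = 4;  // n_inputs()
    r.book(key_iptrs, sizeof(const float *) * n);
    r.book(key_optrs, sizeof(float *) * n);
    r.book(key_nelems, sizeof(size_t) * n);
    std::vector<uint8_t> arena(r.total_);  // one allocation, owned outside
    size_t *nelems = r.get<size_t>(arena.data(), key_nelems);
    nelems[0] = 128;  // used where the old nelems_to_copy_ member was
    return 0;
}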
input_d.blocking_desc().strides[0] - : output_d.blocking_desc().strides[0]; - - auto ker = [&](const data_t *i, data_t *o, int blk_proc) { - if (alpha == 1.0 && beta == 0.0) { - for (int blk = 0; blk < blk_proc; ++blk){ - const int i_blk = order_keep ? blk * (int)stride_8c[1] - : blk * blksize_8c; - const int o_blk = order_keep ? blk * blksize_8c - : blk * (int)stride_8c[1]; - for (int c = 0; c < blksize_8c; ++c) { - o[o_blk + c] = i[i_blk + c]; - } - } - } else { - for (int blk = 0; blk < 2; ++blk) { - const int i_blk = order_keep ? blk * (int)stride_8c[1] - : blk * blksize_8c; - const int o_blk = order_keep ? blk * blksize_8c - : blk * (int)stride_8c[1]; - for (int c = 0; c < blk_proc; ++c) { - o[o_blk + c] = data_t( - alpha * i[i_blk + c] - + (beta ? beta * o[o_blk + c] : 0)); - } - } - } - }; - - const int CB = (dims[1] - 1) / blksize_16c + 1; - const int blktile_16 = ((dims[1] - 1) % blksize_16c + 1); - int blktile = ((blktile_16 - 1) / blksize_8c + 1); - - parallel_nd(dims[0], CB, dims[2], dims[3], - [&](int n, int C, int h, int w) { - auto i = &input[input_d.blk_off(n, C * ic_mult, h, w)]; - auto o = &output[output_d.blk_off(n, C * oc_mult, h, w)]; - ker(i,o, C < CB-1 ? 2 : blktile ); - - }); - - return success; - } -}; - - -template -struct simple_reorder_impl::type> @@ -234,8 +165,10 @@ typename utils::enable_if struct simple_reorder_impl::blk_fmt == bf::_4i16o4i_s8s8 + || format_traits::blk_fmt == bf::_2i8o4i_s8s8 + || format_traits::blk_fmt == bf::_4o4i_s8s8)) >::type> { static bool is_applicable(const memory_desc_wrapper &input_d, @@ -258,7 +191,7 @@ struct simple_reorder_impl::blk_size; const int sblk = 4; const auto &_g_oihw_d = order_keep ? input_d : output_d; @@ -333,6 +266,85 @@ struct simple_reorder_impl struct simple_reorder_impl::type> +{ + static bool is_applicable(const memory_desc_wrapper &input_d, + const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { + const size_t D_mask = utils::array_product(input_d.dims(), + math::ilog2q(attr->output_scales_.mask_ + 1)); + const int oc = input_d.dims()[1]; + const int g = input_d.dims()[0]; + + return true + && order_keep + && input_d.format() == fmt_i + && output_d.format() == fmt_o + && (input_d.data_type() == f32 || input_d.data_type() == s8) + && output_d.data_type() == s8 + && (D_mask == 1 || D_mask == (size_t)g * oc); + } + + static status_t execute(const cpu_reorder_pd_t *pd, + const data_t *input, data_t *output) { + DECLARE_COMMON_PARAMS(); + + const int blksize = 16; + + const auto &dims = input_d.dims(); + const auto &pdims = output_d.blocking_desc().padding_dims; + const int G = dims[0]; + const int Gp = pdims[0]; + const int OC = dims[1]; + const int IC = dims[2]; + const int H = dims[3]; + const int W = dims[4]; + + const size_t D_mask = utils::array_product(input_d.dims(), + math::ilog2q(pd->attr()->output_scales_.mask_ + 1)); + const float *scales = pd->attr()->output_scales_.scales_; + float adj_scale = (mayiuse(avx512_core_vnni)) ? 
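// A scalar model of the s8 weight quantization in the ker above, with the
// two side effects it carries: adj_scale = 0.5 when avx512_core_vnni is
// absent (the u8*s8 multiply-add path may otherwise saturate its int16
// intermediate), and a per-output-channel compensation cp -= 128*w so inputs
// shifted from s8 to u8 by +128 can be corrected in the accumulator. The
// values below are illustrative only.
#include <algorithm>
#include <cmath>
#include <cstdint>

int main() {
    const float w[4] = {0.53f, -1.21f, 0.07f, -0.88f};  // f32 weights
    const float scale = 64.f;
    const float adj_scale = 0.5f;  // pretend VNNI is not available
    int32_t cp = 0;                // compensation for one output channel
    int8_t q[4];
    for (int i = 0; i < 4; ++i) {
        float v = std::nearbyint(w[i] * scale * adj_scale);  // qz_b0-like
        q[i] = (int8_t)std::max(-128.f, std::min(127.f, v));
        cp -= 128 * (int32_t)q[i];
    }
    (void)cp;
    return 0;
}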
1.f : (1.f / 2.f); + + auto ker = [&](const data_t<type_i> *inp, data_t<type_o> *out, + int32_t *cp, const float *s, const int g_block) { + PRAGMA_OMP_SIMD() + for (int g = 0; g < g_block; g++) { + const auto i_off = g * input_d.blocking_desc().strides[0][0]; + out[g] = qz_b0<data_t<type_i>, data_t<type_o>>()( + inp[i_off], s[g * OC] * adj_scale, rmode); + cp[g * OC] -= 128 * (int32_t)(out[g]); + } + }; + + size_t cp_offset = output_d.size() - output_d.additional_buffer_size(); + int32_t *cp = reinterpret_cast<int32_t *>(output + cp_offset); + parallel_nd((Gp/blksize) * OC, [&](int ib) { + PRAGMA_OMP_SIMD() + for (int i = 0; i < blksize; i++) + cp[ib * blksize + i] = 0; + }); + + parallel_nd(Gp/blksize, OC, [&](int gb, int O) { + for (int I = 0; I < IC; I++) { + for (int h = 0; h < H; h++) { + for (int w = 0; w < W; w++) { + const int g_block = nstl::min(G - gb * blksize, blksize); + const auto inp = &input[input_d.blk_off(gb * blksize, O, I, h, w)]; + const auto out = &output[output_d.blk_off(gb, O, I, h, w)]; + int offset = gb * blksize + O; + ker(inp, out, &cp[offset], + &scales[(D_mask == 1) ? 0 : offset], g_block); + } + } + } + }); + return success; + } +}; + +template <SIMPLE_REORDER_TEMPL_DECL> +struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL, typename utils::enable_if<format_traits<fmt_i>::blk_fmt == bf::_8i16o2i && format_traits<fmt_o>::blk_fmt == bf::_8o16i2o>::type> { @@ -530,7 +542,7 @@ typename utils::enable_if::type> template struct simple_reorder_impl::type> +typename utils::enable_if::type> { static bool is_applicable(const memory_desc_wrapper &input_d, const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { @@ -570,7 +582,7 @@ typename utils::enable_if::type> template struct simple_reorder_impl::type> +typename utils::enable_if::type> { static bool is_applicable(const memory_desc_wrapper &input_d, const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { @@ -621,6 +633,56 @@ typename utils::enable_if::type> template struct simple_reorder_impl::type> +{ + static bool is_applicable(const memory_desc_wrapper &input_d, + const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { + int smask = attr ? attr->output_scales_.mask_ : 0; + return smask == 0 && order_keep && (input_d._md->format == nchw || input_d._md->format == nhwc) && output_d._md->format == nhwc; + } + + static status_t execute(const cpu_reorder_pd_t *pd, + const data_t<type_i> *input, data_t<type_o> *output) { + DECLARE_COMMON_PARAMS(); + + const auto &dims = input_d.dims(); + const int C = dims[1]; + const int H = dims[2]; + const int W = dims[3]; + + int nbits = 8; + const int CB = div_up(C, nbits); + + auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o) { + for (int cb = 0; cb < CB; ++cb) { + uint8_t bin_val = 0x00; + for (int c = cb * nbits, shift = 0; c < std::min(C, (cb + 1) * nbits); c++, shift++) { + const ptrdiff_t flat_off = c * input_d.blocking_desc().strides[0][1]; + + auto bit = uint8_t((i[flat_off] > 0) ?
0x01 : 0x00); + bin_val |= (bit << shift); + } + + o[cb] = bin_val; + } + }; + + parallel_nd(dims[0], H, W, + [&](int n, int h, int w) { + auto iidx = input_d.blk_off(n, 0, h, w); + auto oidx = output_d.blk_off(n, 0, h, w); + + auto i = &input[iidx]; + auto o = &output[oidx / nbits]; + ker(i, o); + }); + + return success; + } +}; + +template +struct simple_reorder_impl::type> { static bool is_applicable(const memory_desc_wrapper &input_d, @@ -670,6 +732,90 @@ typename utils::enable_if::type> } }; +template <SIMPLE_REORDER_TEMPL_DECL> +struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL, typename utils::enable_if<format_traits<fmt_i>::blk_fmt == bf::_8c + && format_traits<fmt_o>::blk_fmt == bf::_16c>::type> +{ + static bool is_applicable(const memory_desc_wrapper &input_d, + const memory_desc_wrapper &output_d, const primitive_attr_t *attr) + { + return simple_fmt_check(order_keep, fmt_i, fmt_o, input_d, output_d) + && simple_attr_check(attr, false); + } + + static status_t execute(const cpu_reorder_pd_t *pd, + const data_t<type_i> *input, data_t<type_o> *output) { + DECLARE_COMMON_PARAMS(); + + constexpr int is_1d = format_traits<fmt_o>::ndims_sp == 1; + constexpr int is_3d = format_traits<fmt_o>::ndims_sp == 3; + constexpr int blksize_16 = format_traits<fmt_o>::blk_size; + constexpr int blksize_8 = format_traits<fmt_i>::blk_size; + constexpr int ic_mult = order_keep ? 2 : 1; + constexpr int oc_mult = order_keep ? 1 : 2; + + const auto &nchw8c_d = order_keep ? input_d : output_d; + const auto &dims = input_d.dims(); + const auto &pdims = order_keep ? output_d.blocking_desc().padding_dims + : input_d.blocking_desc().padding_dims; + const auto stride_8c = nchw8c_d.blocking_desc().strides[0]; + + const int C = dims[1]; + const int D = is_3d ? dims[2] : 1; + const int H = is_1d ? 1 : dims[2 + is_3d]; + const int W = dims[3 + is_3d - is_1d]; + + auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o, + const int block_16) { + const int nb = (block_16 - 1) / blksize_8 + 1; + if (alpha == 1.0 && beta == 0.0) { + for (int b = 0; b < nb; ++b) { + const ptrdiff_t i_off = order_keep ? b * stride_8c[1] + : b * blksize_8; + const ptrdiff_t o_off = order_keep ? b * blksize_8 + : b * stride_8c[1]; + const int block_8 = nstl::min(blksize_8, + block_16 - b * blksize_8); + for (int c = 0; c < block_8; ++c) { + o[o_off + c] = _qz_a1b0<type_i, type_o>()( + i[i_off + c], rmode); + } + } + } else { + for (int b = 0; b < nb; ++b) { + const ptrdiff_t i_off = order_keep ? b * stride_8c[1] + : b * blksize_8; + const ptrdiff_t o_off = order_keep ? b * blksize_8 + : b * stride_8c[1]; + const int block_8 = nstl::min(blksize_8, + block_16 - b * blksize_8); + for (int c = 0; c < block_8; ++c) { + o[o_off + c] = _qz<type_i, type_o>()(i[i_off + c], + o[o_off + c], alpha, beta, rmode); + } + } + } + }; + +# define data_blk_off(md, n, c, d, h, w) \ + ( is_1d ? (md).blk_off(n, c, w) \ + : is_3d ?
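// Runnable toy of the binarization ker above: pack the signs of up to 8
// channel values into one output byte, LSB first, leaving tail bits zero.
// The sample data is illustrative.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const int C = 11, nbits = 8;
    const float x[C] = {1, -2, 3, 4, -5, 6, -7, 8, 9, -10, 11};
    const int CB = (C + nbits - 1) / nbits;  // div_up(C, nbits)
    uint8_t out[CB];
    for (int cb = 0; cb < CB; ++cb) {
        uint8_t bin_val = 0x00;
        for (int c = cb * nbits, shift = 0;
                c < std::min(C, (cb + 1) * nbits); c++, shift++)
            bin_val |= uint8_t(x[c] > 0 ? 0x01 : 0x00) << shift;
        out[cb] = bin_val;
    }
    printf("%02x %02x\n", out[0], out[1]);  // ad 05
    return 0;
}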
(md).blk_off(n, c, d, h, w) : (md).blk_off(n, c, h, w)) + + parallel_nd(dims[0], pdims[1] / blksize_16, D, H, W, + [&](int n, int nb_c, int d, int h, int w) { + auto i = &input[data_blk_off(input_d, n, ic_mult * nb_c, d, h, w)]; + auto o = &output[data_blk_off(output_d, n, oc_mult * nb_c, d, h, w)]; + const int block_16 = nstl::min(blksize_16, C - nb_c * blksize_16); + ker(i, o, block_16); + }); + +# undef data_blk_off + + return success; + } +}; + #define PLAIN_TO_BLOCKED_IS_APPLICABLE() \ static bool is_applicable(const memory_desc_wrapper &input_d, \ const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { \ @@ -681,6 +827,7 @@ typename utils::enable_if::type> template struct simple_reorder_impl::blk_fmt == bf::_4c || format_traits::blk_fmt == bf::_8c || format_traits::blk_fmt == bf::_16c)>::type> { @@ -956,8 +1103,77 @@ typename utils::enable_if struct simple_reorder_impl::type> +{ + PLAIN_TO_BLOCKED_IS_APPLICABLE(); + + static status_t execute(const cpu_reorder_pd_t *pd, + const data_t *input, data_t *output) { + DECLARE_COMMON_PARAMS(); + + static constexpr bool w_groups + = format_traits::data_kind == dk::gwei; + constexpr int is_1d = format_traits::ndims_sp == 1; + constexpr int is_3d = format_traits::ndims_sp == 3; + constexpr int blksize_o = fmt_o == OhIw8o32i ? 8 : 16; + constexpr int blksize_i = 32; + + const auto &dims = input_d.dims(); + const auto &pdims = order_keep + ? output_d.blocking_desc().padding_dims + : input_d.blocking_desc().padding_dims; + + const int G = w_groups ? dims[0] : 1; + const int OC = dims[w_groups + 0]; + const int NB_OC = pdims[w_groups + 0] / blksize_o; + const int IC = dims[w_groups + 1]; + const int NB_IC = pdims[w_groups + 1] / blksize_i; + const int H = is_1d ? 1 : dims[w_groups + 2 + is_3d]; + const int W = dims[w_groups + 3 + is_3d - is_1d]; + + constexpr int i_mult_o = blksize_o; + constexpr int i_mult_i = blksize_i; + constexpr int nbits = 8; + + auto extract_bit = [](uint8_t val, uint8_t bit) -> uint8_t { + return (uint8_t) ((val >> bit) & 0x0001); + }; + + parallel_nd(G, NB_OC, NB_IC, H, W, + [&](int g, int nb_oc, int nb_ic, int h, int w) { + const int oc_block = nstl::min(blksize_o, OC - nb_oc * blksize_o); + const int ic_block = nstl::min(blksize_i, IC - nb_ic * blksize_i); + + for (int oc = 0; oc < oc_block; ++oc) { + for (int icb = 0; icb < div_up(ic_block, nbits); ++icb) { + + uint8_t bin_val = 0x00; + for (int ic = icb*nbits, shift = 0; ic < std::min(IC, (icb + 1)*nbits); ic++, shift++) { + size_t iidx = (i_mult_o * nb_oc + oc) * input_d.blocking_desc().strides[0][0] + + (i_mult_i * nb_ic + ic) *input_d.blocking_desc().strides[0][1] + + h * input_d.blocking_desc().strides[0][2] + + w; + + uint8_t bit = extract_bit(input[iidx / nbits], (uint8_t)(iidx % nbits)); + bin_val |= (bit << shift); + } + + size_t oidx = wei_blk_off_like_gwei3D(output_d, g, nb_oc, nb_ic, 0, h, w) + oc * blksize_i + icb * blksize_o; + output[oidx / nbits] = bin_val; + + } + } + }); + + return success; + } +}; + +template +struct simple_reorder_impl::blk_fmt>::blk_ndims == 2 && fmt_o != OhIw8o4i && fmt_o != gOhIw8o4i>::type> +&& block_format_traits::blk_fmt>::blk_ndims == 2 +&& fmt_o != OhIw8o4i && fmt_o != gOhIw8o4i && fmt_o != OhIw8o32i && fmt_o != OhIw16o32i>::type> { PLAIN_TO_BLOCKED_IS_APPLICABLE(); @@ -1045,6 +1261,7 @@ typename utils::enable_if struct simple_reorder_impl::blk_fmt == bf::_4o || format_traits::blk_fmt == bf::_8o || format_traits::blk_fmt == bf::_16o)>::type> { @@ -1392,21 +1609,21 @@ struct simple_reorder_t: public 
cpu_primitive_t { } }; - simple_reorder_t(const pd_t *pd, const input_vector &inputs, + simple_reorder_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {} + : cpu_primitive_t(apd, inputs, outputs) {} - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { auto input = reinterpret_cast<const data_t<type_i> *>( this->input_memory(0)); auto output = reinterpret_cast<data_t<type_o> *>(this->memory()); simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL, spec>::execute( - &conf_, input, output); + pd(), input, output); e->set_state(event_t::ready); } private: - pd_t conf_; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; #undef SIMPLE_REORDER_TEMPL_DECL diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/simple_sum.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/simple_sum.cpp index 4a49061..fc7f94b 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/simple_sum.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/simple_sum.cpp @@ -22,16 +22,16 @@ namespace impl { namespace cpu { template <data_type_t data_type> -void simple_sum_t<data_type>::execute() { +void simple_sum_t<data_type>::execute() const { auto output = reinterpret_cast<data_t *>(this->memory()); - const int num_arrs = conf_.n_inputs(); - const memory_desc_wrapper o_d(conf_.dst_pd()); + const int num_arrs = pd()->n_inputs(); + const memory_desc_wrapper o_d(pd()->dst_pd()); output += o_d.blk_off(0); const size_t nelems = o_d.nelems(); const data_t *input_ptrs[max_num_arrs]; for (int a = 0; a < num_arrs; ++a) { - const memory_desc_wrapper i_d(conf_.src_pd(a)); + const memory_desc_wrapper i_d(pd()->src_pd(a)); input_ptrs[a] = reinterpret_cast<const data_t *>( this->input_memory(a)) + i_d.blk_off(0); @@ -41,7 +41,7 @@ void simple_sum_t<data_type>::execute() { const size_t blocks_number = nelems / block_size; const size_t tail = nelems % block_size; - const auto &scales = conf_.scales_; + const auto &scales = pd()->scales_; parallel(0, [&](const int ithr, const int nthr) { size_t start{0}, end{0}; balance211(blocks_number, nthr, ithr, start, end); diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/simple_sum.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/simple_sum.hpp index 8704be5..133b251 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/simple_sum.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/simple_sum.hpp @@ -58,11 +58,11 @@ struct simple_sum_t: public cpu_primitive_t { } }; - simple_sum_t(const pd_t *conf, const input_vector &inputs, + simple_sum_t(const pd_t *apd, const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*conf) {} + : cpu_primitive_t(apd, inputs, outputs) {} - virtual void execute(event_t *e) { + virtual void execute(event_t *e) const { execute(); e->set_state(event_t::ready); } @@ -71,8 +71,8 @@ struct simple_sum_t: public cpu_primitive_t { typedef typename prec_traits<data_type>::type data_t; private: - void execute(); - pd_t conf_; + void execute() const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } }; } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/wino_reorder.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/wino_reorder.hpp index 78d005e..0e24746 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/wino_reorder.hpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/wino_reorder.hpp @@ -35,30 +35,47 @@ struct wino_reorder_t : public cpu_primitive_t { const primitive_attr_t *attr) { assert(input_pd->engine()->kind() == engine_kind::cpu); assert(output_pd->engine()->kind() == engine_kind::cpu); - const
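// The same conf_ -> pd() refactor recurs in every primitive in this patch.
// A minimal stand-alone mock of the pattern (names illustrative): the
// primitive stops holding a mutable pd_t copy, keeps the const pd_t* owned
// by the base class, and downcasts on access, which is what lets execute()
// become const.
struct primitive_t {
    explicit primitive_t(const void *apd) : apd_(apd) {}
    virtual ~primitive_t() {}
    const void *pd() const { return apd_; }
private:
    const void *apd_;
};

struct sum_like_t : public primitive_t {
    struct pd_t { int n_inputs; };
    explicit sum_like_t(const pd_t *apd) : primitive_t(apd) {}
    void execute() const { (void)pd()->n_inputs; }  // no conf_ copy involved
private:
    const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); }
};

int main() {
    sum_like_t::pd_t pd{3};
    sum_like_t(&pd).execute();
    return 0;
}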
memory_desc_wrapper output_d(output_pd); - bool args_ok = true && input_pd->desc()->data_type == type_i - && output_pd->desc()->data_type == type_o - && one_of(input_pd->desc()->format, goihw, oihw) - && output_pd->desc()->format == wino_fmt - && one_of(output_d.wino_desc().wino_format, - mkldnn_wino_wei_aaOIoi, mkldnn_wino_wei_aaOio, - mkldnn_wino_wei_aaOBiOo, - mkldnn_wino_wei_OBaaIBOIio); - - if (!args_ok) - return status::invalid_arguments; + const memory_desc_wrapper id(input_pd), od(output_pd); + bool args_ok = true + && id.data_type() == type_i + && od.data_type() == type_o + && utils::one_of(id.format(), goihw, oihw) + && od.format() == wino_fmt + && one_of(od.wino_desc().wino_format, + mkldnn_wino_wei_aaOIoi, mkldnn_wino_wei_aaOio, + mkldnn_wino_wei_aaOBiOo, mkldnn_wino_wei_OBaaIBOIio); + if (!args_ok) return status::invalid_arguments; auto _pd = new pd_t((const cpu_memory_pd_t *)input_pd, (const cpu_memory_pd_t *)output_pd, attr); - if (_pd == nullptr) - return out_of_memory; - if (_pd->init() != success) { - delete _pd; - return unimplemented; - } + if (_pd == nullptr) return out_of_memory; + if (_pd->init() != success) { delete _pd; return unimplemented; } return safe_ptr_assign(*reorder_pd, _pd); } + + virtual status_t init() override { + status_t status = cpu_reorder_pd_t::init(); + if (status != status::success) return status; + + init_scratchpad(); + + return status::success; + } + + private: + void init_scratchpad() { + auto &o = memory_desc_wrapper(output_pd()).wino_desc(); + size_t transform_space_size = (size_t)o.r * o.alpha * o.oc_block; + size_t plain_size = (size_t)o.alpha * o.alpha * o.oc * o.ic; + + using namespace memory_tracking::names; + auto scratchpad = scratchpad_registry().registrar(); + scratchpad.book(key_reorder_wino_transform_space, + sizeof(in_data_t) * transform_space_size); + scratchpad.book(key_reorder_wino_plain, + sizeof(out_data_t) * plain_size); + } }; private: @@ -66,11 +83,12 @@ private: typedef typename prec_traits::type out_data_t; const int unsign_val_in_wino_domain_ = 5; - wino_reorder_t(const pd_t *pd, - const input_vector &inputs, const output_vector &outputs) - : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) { - const memory_desc_wrapper input_d(conf_.input_pd()); - const memory_desc_wrapper output_d(conf_.output_pd()); + wino_reorder_t(const pd_t *apd, const input_vector &inputs, + const output_vector &outputs) + : cpu_primitive_t(apd, inputs, outputs) + { + const memory_desc_wrapper input_d(pd()->input_pd()); + const memory_desc_wrapper output_d(pd()->output_pd()); r_ = output_d.wino_desc().r; w_alpha_ = output_d.wino_desc().alpha; @@ -111,25 +129,18 @@ private: size_wino_wei_ = w_alpha_ * w_alpha_ * oc_ * ic_; size_wspace_ = r_ * w_alpha_ * oc_block_; - - wspace_ = (in_data_t *)malloc(sizeof(in_data_t) * size_wspace_, 64); - tmp_wei_ = - (out_data_t *)malloc(sizeof(out_data_t) * size_wino_wei_, 64); } - ~wino_reorder_t() { - free(wspace_); - free(tmp_wei_); - } - - void transform(const in_data_t *__restrict input) { - const memory_desc_wrapper input_d(conf_.input_pd()->desc()); + void transform(out_data_t *__restrict tmp_wei, + const in_data_t *__restrict input, + in_data_t *__restrict wspace) const { + const memory_desc_wrapper input_d(pd()->input_pd()->desc()); - round_mode_t rmode = conf_.attr()->round_mode_; - const int smask = conf_.attr()->output_scales_.mask_; + round_mode_t rmode = pd()->attr()->round_mode_; + const int smask = pd()->attr()->output_scales_.mask_; const int ndims_mask = math::ilog2q(smask + 1); const 
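// The accumulation loops in transform() below compute the standard Winograd
// weight transform U = G g G^T per output-channel block. A scalar sketch for
// r = 3, alpha = 4 (i.e. F(2x2, 3x3)) with the canonical G matrix; the patch
// additionally quantizes and scales the result for s8 outputs.
#include <cstdio>

int main() {
    const int r = 3, alpha = 4;
    const float G[4][3] = {   // canonical G for F(2x2, 3x3)
        {1.f, 0.f, 0.f},
        {0.5f, 0.5f, 0.5f},
        {0.5f, -0.5f, 0.5f},
        {0.f, 0.f, 1.f}};
    const float g[3][3] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};  // one 3x3 tap
    float tmp[4][3] = {}, U[4][4] = {};
    for (int i = 0; i < alpha; ++i)       // tmp = G * g
        for (int j = 0; j < r; ++j)
            for (int k = 0; k < r; ++k) tmp[i][j] += G[i][k] * g[k][j];
    for (int i = 0; i < alpha; ++i)       // U = tmp * G^T
        for (int j = 0; j < alpha; ++j)
            for (int k = 0; k < r; ++k) U[i][j] += tmp[i][k] * G[j][k];
    printf("U[0][0] = %g\n", U[0][0]);    // 1: row 0 of G picks g[0][0]
    return 0;
}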
size_t D_mask = utils::array_product(input_d.dims(), ndims_mask); - const float *__restrict scales = conf_.attr()->output_scales_.scales_; + const float *__restrict scales = pd()->attr()->output_scales_.scales_; assert(D_mask == 1 || D_mask == (size_t)oc_); /* transform weights to winograd domain */ @@ -162,9 +173,9 @@ private: const in_data_t *__restrict _inp = input + (ob * oc_block_ * or_ic_ + iic) * kh_ * kw_; out_data_t *__restrict _out - = tmp_wei_ + (iic * nb_oc_ + ob) * oc_block_; + = tmp_wei + (iic * nb_oc_ + ob) * oc_block_; - parallel_nd(size_wspace_, [&](int i) { wspace_[i] = 0.f; }); + parallel_nd(size_wspace_, [&](int i) { wspace[i] = 0.f; }); parallel_nd(r_, w_alpha_, oc_block_, [&](int ih, int j, int ioc) { @@ -174,7 +185,7 @@ private: in_data_t inp_v = (inp_ic < or_ic_ && inp_oc < or_oc_) ? _inp[ioc * or_ic_ * kh_ * kw_ + ih * kw_ + iw] : 0.f; - wspace_[(ih * w_alpha_ + j) * oc_block_ + ioc] + wspace[(ih * w_alpha_ + j) * oc_block_ + ioc] += inp_v * g[j * r_ + iw]; } }); @@ -184,7 +195,7 @@ private: float t = 0; for (int k = 0; k < r_; ++k) t += g[i * r_ + k] - * wspace_[(k * w_alpha_ + j) * oc_block_ + ioc]; + * wspace[(k * w_alpha_ + j) * oc_block_ + ioc]; if (type_o == s8) { const float scale = (D_mask == 1) ? scales[0] @@ -199,7 +210,8 @@ private: }} } - void reorder_to_aaOIoi(out_data_t *__restrict output) { + void reorder_to_aaOIoi(out_data_t *__restrict output, + const out_data_t *__restrict tmp_wei) const { int32_t *__restrict dst_bias = nullptr; if (type_o == s8) { const auto bias_shift = sizeof(out_data_t) * size_wino_wei_; @@ -229,7 +241,7 @@ private: int dst_offset = u_h_shift + u_w_shift + oc_block_shift + ic_block_shift; - output[dst_offset] = tmp_wei_[src_offset]; + output[dst_offset] = tmp_wei[src_offset]; if (type_o == s8) { int bias_offset = u_h_shift_b + u_w_shift_b + oc_shift; if (index != unsign_val_in_wino_domain_) @@ -244,7 +256,8 @@ private: }} } - void reorder_to_aaOio(out_data_t *__restrict output) { + void reorder_to_aaOio(out_data_t *__restrict output, + const out_data_t *__restrict tmp_wei) const { parallel_nd(w_alpha_, w_alpha_, nb_oc_, [&](int u_h, int u_w, int ob) { for (int ib = 0; ib < nb_ic_; ib++) { @@ -258,12 +271,13 @@ private: + u_w * nb_oc_ * nb_ic_ * ic_block_ * oc_block_ + ob * nb_ic_ * ic_block_ * oc_block_ + ib * ic_block_ * oc_block_ + i * oc_block_ + o; - output[dst_offset] = tmp_wei_[src_offset]; + output[dst_offset] = tmp_wei[src_offset]; }}} }); } - void reorder_to_aaOBiOo(out_data_t *__restrict output) { + void reorder_to_aaOBiOo(out_data_t *__restrict output, + const out_data_t *__restrict tmp_wei) const { int oc_chunks = nb_oc_ / oc2_block_; parallel_nd(w_alpha_, w_alpha_, oc_chunks, @@ -282,7 +296,7 @@ private: int src_offset = u_h * w_alpha_ * ic_ * oc_ + u_w * ic_ * oc_ + icp * oc_ + ocp; - wei_ptr[wei_offset + o] = tmp_wei_[src_offset]; + wei_ptr[wei_offset + o] = tmp_wei[src_offset]; } wei_offset += oc_block_; }} @@ -290,7 +304,8 @@ private: }); } - void reorder_to_OBaaIBOIio(out_data_t *__restrict output) { + void reorder_to_OBaaIBOIio(out_data_t *__restrict output, + const out_data_t *__restrict tmp_wei) const { int ic_chunks = nb_ic_ / ic2_block_; int oc_chunks = nb_oc_ / oc2_block_; @@ -310,39 +325,46 @@ private: * ic_chunks + icc) * oc2_block_ + ob) * ic2_block_ + ib) * ic_block_ + i) * oc_block_; for (int o = 0; o < oc_block_; o++) - output[wei_offset + o] = tmp_wei_[src_offset + o]; + output[wei_offset + o] = tmp_wei[src_offset + o]; }} }} }); } - virtual void execute(event_t *e) { + virtual void 
execute(event_t *e) const { auto input = reinterpret_cast<const in_data_t *>(input_memory(0)); auto output = reinterpret_cast<out_data_t *>(memory()); - transform(input); + auto wspace = (in_data_t *__restrict)scratchpad().template get<in_data_t>( + memory_tracking::names::key_reorder_wino_transform_space); + auto tmp_wei = (out_data_t *__restrict)scratchpad().template get<out_data_t>( + memory_tracking::names::key_reorder_wino_plain); + + transform(tmp_wei, input, wspace); /* reorder to winograd domain */ switch (wino_format_) { - case mkldnn_wino_wei_aaOIoi: reorder_to_aaOIoi(output); break; - case mkldnn_wino_wei_aaOio: reorder_to_aaOio(output); break; - case mkldnn_wino_wei_aaOBiOo: reorder_to_aaOBiOo(output); break; - case mkldnn_wino_wei_OBaaIBOIio: reorder_to_OBaaIBOIio(output); break; + case mkldnn_wino_wei_aaOIoi: + reorder_to_aaOIoi(output, tmp_wei); break; + case mkldnn_wino_wei_aaOio: + reorder_to_aaOio(output, tmp_wei); break; + case mkldnn_wino_wei_aaOBiOo: + reorder_to_aaOBiOo(output, tmp_wei); break; + case mkldnn_wino_wei_OBaaIBOIio: + reorder_to_OBaaIBOIio(output, tmp_wei); break; default: assert("Unknown wino format"); break; } e->set_state(event_t::ready); } - pd_t conf_; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); } int r_, w_alpha_; int ic_, oc_, or_ic_, or_oc_, kh_, kw_; int oc_block_, ic_block_, oc2_block_, ic2_block_; float adj_scale_; int nb_oc_, nb_ic_; mkldnn_wino_memory_format_t wino_format_; - in_data_t *__restrict wspace_; - out_data_t *__restrict tmp_wei_; int size_wino_wei_; int size_wspace_; }; diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/xbyak/xbyak.h b/inference-engine/thirdparty/mkl-dnn/src/cpu/xbyak/xbyak.h index 74d91d4..5c202f4 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/xbyak/xbyak.h +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/xbyak/xbyak.h @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2018 Intel Corporation +* Copyright 2016-2019 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -85,6 +85,8 @@ // This covers -std=(gnu|c)++(0x|11|1y), -stdlib=libc++, and modern Microsoft. #if ((defined(_MSC_VER) && (_MSC_VER >= 1600)) || defined(_LIBCPP_VERSION) ||\ ((__cplusplus >= 201103) || defined(__GXX_EXPERIMENTAL_CXX0X__))) + #include <unordered_set> + #define XBYAK_STD_UNORDERED_SET std::unordered_set #include <unordered_map> #define XBYAK_STD_UNORDERED_MAP std::unordered_map #define XBYAK_STD_UNORDERED_MULTIMAP std::unordered_multimap @@ -94,16 +96,22 @@ libstdcxx 20070719 (from GCC 4.2.1, the last GPL 2 version).
*/ #elif XBYAK_GNUC_PREREQ(4, 5) || (XBYAK_GNUC_PREREQ(4, 2) && __GLIBCXX__ >= 20070719) || defined(__INTEL_COMPILER) || defined(__llvm__) + #include + #define XBYAK_STD_UNORDERED_SET std::tr1::unordered_set #include #define XBYAK_STD_UNORDERED_MAP std::tr1::unordered_map #define XBYAK_STD_UNORDERED_MULTIMAP std::tr1::unordered_multimap #elif defined(_MSC_VER) && (_MSC_VER >= 1500) && (_MSC_VER < 1600) + #include + #define XBYAK_STD_UNORDERED_SET std::tr1::unordered_set #include #define XBYAK_STD_UNORDERED_MAP std::tr1::unordered_map #define XBYAK_STD_UNORDERED_MULTIMAP std::tr1::unordered_multimap #else + #include + #define XBYAK_STD_UNORDERED_SET std::set #include #define XBYAK_STD_UNORDERED_MAP std::map #define XBYAK_STD_UNORDERED_MULTIMAP std::multimap @@ -150,7 +158,7 @@ namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x5631 /* 0xABCD = A.BC(D) */ + VERSION = 0x5760 /* 0xABCD = A.BC(D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED @@ -223,7 +231,8 @@ enum { ERR_INVALID_ZERO, ERR_INVALID_RIP_IN_AUTO_GROW, ERR_INVALID_MIB_ADDRESS, - ERR_INTERNAL + ERR_INTERNAL, + ERR_X2APIC_IS_NOT_SUPPORTED }; class Error : public std::exception { @@ -285,6 +294,7 @@ public: "invalid rip in AutoGrow", "invalid mib address", "internal error", + "x2APIC is not supported" }; assert((size_t)err_ < sizeof(errTbl) / sizeof(*errTbl)); return errTbl[err_]; @@ -662,6 +672,12 @@ struct RegRip { const Label* label_; bool isAddr_; explicit RegRip(sint64 disp = 0, const Label* label = 0, bool isAddr = false) : disp_(disp), label_(label), isAddr_(isAddr) {} + friend const RegRip operator+(const RegRip& r, int disp) { + return RegRip(r.disp_ + disp, r.label_, r.isAddr_); + } + friend const RegRip operator-(const RegRip& r, int disp) { + return RegRip(r.disp_ - disp, r.label_, r.isAddr_); + } friend const RegRip operator+(const RegRip& r, sint64 disp) { return RegRip(r.disp_ + disp, r.label_, r.isAddr_); } @@ -831,6 +847,7 @@ inline RegExp operator-(const RegExp& e, size_t disp) // 2nd parameter for constructor of CodeArray(maxSize, userPtr, alloc) void *const AutoGrow = (void*)1; //-V566 +void *const DontSetProtectRWE = (void*)2; //-V566 class CodeArray { enum Type { @@ -870,6 +887,7 @@ protected: size_t size_; bool isCalledCalcJmpAddress_; + bool useProtect() const { return alloc_->useProtect(); } /* allocate new memory and copy old data to the new area */ @@ -893,12 +911,16 @@ protected: uint64 disp = i->getVal(top_); rewrite(i->codeOffset, disp, i->jmpSize); } - if (alloc_->useProtect() && !protect(top_, size_, true)) throw Error(ERR_CANT_PROTECT); isCalledCalcJmpAddress_ = true; } public: + enum ProtectMode { + PROTECT_RW = 0, // read/write + PROTECT_RWE = 1, // read/write/exec + PROTECT_RE = 2 // read/exec + }; explicit CodeArray(size_t maxSize, void *userPtr = 0, Allocator *allocator = 0) - : type_(userPtr == AutoGrow ? AUTO_GROW : userPtr ? USER_BUF : ALLOC_BUF) + : type_(userPtr == AutoGrow ? AUTO_GROW : (userPtr == 0 || userPtr == DontSetProtectRWE) ? ALLOC_BUF : USER_BUF) , alloc_(allocator ? allocator : (Allocator*)&defaultAllocator_) , maxSize_(maxSize) , top_(type_ == USER_BUF ? 
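// Hedged usage sketch of the W^X-friendly mode introduced above: with
// DontSetProtectRWE the JIT buffer stays read/write while code is emitted
// and is flipped to read/exec once, before the first call. The generated
// body assumes the x86-64 SysV calling convention; this is an illustration,
// not code from this patch.
#include "xbyak/xbyak.h"

struct AddGen : Xbyak::CodeGenerator {
    AddGen() : Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE) {
        mov(eax, edi);  // 1st int argument
        add(eax, esi);  // 2nd int argument
        ret();
    }
};

int jit_add(int a, int b) {
    AddGen gen;
    gen.setProtectModeRE();  // pages RW -> read/exec (readyRE() with AutoGrow)
    auto f = gen.getCode<int (*)(int, int)>();
    return f(a, b);
}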
reinterpret_cast(userPtr) : alloc_->alloc((std::max)(maxSize, 1))) @@ -906,7 +928,7 @@ public: , isCalledCalcJmpAddress_(false) { if (maxSize_ > 0 && top_ == 0) throw Error(ERR_CANT_ALLOC); - if ((type_ == ALLOC_BUF && alloc_->useProtect()) && !protect(top_, maxSize, true)) { + if ((type_ == ALLOC_BUF && userPtr != DontSetProtectRWE && useProtect()) && !setProtectMode(PROTECT_RWE, false)) { alloc_->free(top_); throw Error(ERR_CANT_PROTECT); } @@ -914,10 +936,19 @@ public: virtual ~CodeArray() { if (isAllocType()) { - if (alloc_->useProtect()) protect(top_, maxSize_, false); + if (useProtect()) setProtectModeRW(false); alloc_->free(top_); } } + bool setProtectMode(ProtectMode mode, bool throwException = true) + { + bool isOK = protect(top_, maxSize_, mode); + if (isOK) return true; + if (throwException) throw Error(ERR_CANT_PROTECT); + return false; + } + bool setProtectModeRE(bool throwException = true) { return setProtectMode(PROTECT_RE, throwException); } + bool setProtectModeRW(bool throwException = true) { return setProtectMode(PROTECT_RW, throwException); } void resetSize() { size_ = 0; @@ -949,10 +980,10 @@ public: void dq(uint64 code) { db(code, 8); } const uint8 *getCode() const { return top_; } template - const F getCode() const { return CastTo(top_); } + const F getCode() const { return reinterpret_cast(top_); } const uint8 *getCurr() const { return &top_[size_]; } template - const F getCurr() const { return CastTo(&top_[size_]); } + const F getCurr() const { return reinterpret_cast(&top_[size_]); } size_t getSize() const { return size_; } void setSize(size_t size) { @@ -1005,19 +1036,39 @@ public: change exec permission of memory @param addr [in] buffer address @param size [in] buffer size - @param canExec [in] true(enable to exec), false(disable to exec) + @param protectMode [in] mode(RW/RWE/RE) @return true(success), false(failure) */ - static inline bool protect(const void *addr, size_t size, bool canExec) + static inline bool protect(const void *addr, size_t size, int protectMode) { #if defined(_WIN32) + const DWORD c_rw = PAGE_READWRITE; + const DWORD c_rwe = PAGE_EXECUTE_READWRITE; + const DWORD c_re = PAGE_EXECUTE_READ; + DWORD mode; +#else + const int c_rw = PROT_READ | PROT_WRITE; + const int c_rwe = PROT_READ | PROT_WRITE | PROT_EXEC; + const int c_re = PROT_READ | PROT_EXEC; + int mode; +#endif + switch (protectMode) { + case PROTECT_RW: mode = c_rw; break; + case PROTECT_RWE: mode = c_rwe; break; + case PROTECT_RE: mode = c_re; break; + default: + return false; + } +#if defined(_WIN32) DWORD oldProtect; - return VirtualProtect(const_cast(addr), size, canExec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE, &oldProtect) != 0; + return VirtualProtect(const_cast(addr), size, mode, &oldProtect) != 0; #elif defined(__GNUC__) size_t pageSize = sysconf(_SC_PAGESIZE); size_t iaddr = reinterpret_cast(addr); size_t roundAddr = iaddr & ~(pageSize - static_cast(1)); - int mode = PROT_READ | PROT_WRITE | (canExec ? PROT_EXEC : 0); +#ifndef NDEBUG + if (pageSize != 4096) fprintf(stderr, "large page(%zd) is used. 
not tested enough.\n", pageSize); +#endif return mprotect(reinterpret_cast(roundAddr), size + (iaddr - roundAddr), mode) == 0; #else return true; @@ -1044,46 +1095,43 @@ public: M_ripAddr }; Address(uint32 sizeBit, bool broadcast, const RegExp& e) - : Operand(0, MEM, sizeBit), e_(e), label_(0), mode_(M_ModRM), permitVsib_(false), broadcast_(broadcast) + : Operand(0, MEM, sizeBit), e_(e), label_(0), mode_(M_ModRM), broadcast_(broadcast) { e_.verify(); } #ifdef XBYAK64 explicit Address(size_t disp) - : Operand(0, MEM, 64), e_(disp), label_(0), mode_(M_64bitDisp), permitVsib_(false), broadcast_(false){ } + : Operand(0, MEM, 64), e_(disp), label_(0), mode_(M_64bitDisp), broadcast_(false){ } Address(uint32 sizeBit, bool broadcast, const RegRip& addr) - : Operand(0, MEM, sizeBit), e_(addr.disp_), label_(addr.label_), mode_(addr.isAddr_ ? M_ripAddr : M_rip), permitVsib_(false), broadcast_(broadcast) { } + : Operand(0, MEM, sizeBit), e_(addr.disp_), label_(addr.label_), mode_(addr.isAddr_ ? M_ripAddr : M_rip), broadcast_(broadcast) { } #endif - void permitVsib() const { permitVsib_ = true; } RegExp getRegExp(bool optimize = true) const { return optimize ? e_.optimize() : e_; } Mode getMode() const { return mode_; } - bool is32bit() const { verify(); return e_.getBase().getBit() == 32 || e_.getIndex().getBit() == 32; } - bool isOnlyDisp() const { verify(); return !e_.getBase().getBit() && !e_.getIndex().getBit(); } // for mov eax - size_t getDisp() const { verify(); return e_.getDisp(); } + bool is32bit() const { return e_.getBase().getBit() == 32 || e_.getIndex().getBit() == 32; } + bool isOnlyDisp() const { return !e_.getBase().getBit() && !e_.getIndex().getBit(); } // for mov eax + size_t getDisp() const { return e_.getDisp(); } uint8 getRex() const { - verify(); if (mode_ != M_ModRM) return 0; return getRegExp().getRex(); } - bool is64bitDisp() const { verify(); return mode_ == M_64bitDisp; } // for moffset + bool is64bitDisp() const { return mode_ == M_64bitDisp; } // for moffset bool isBroadcast() const { return broadcast_; } const Label* getLabel() const { return label_; } bool operator==(const Address& rhs) const { - return getBit() == rhs.getBit() && e_ == rhs.e_ && label_ == rhs.label_ && mode_ == rhs.mode_ && permitVsib_ == rhs.permitVsib_ && broadcast_ == rhs.broadcast_; + return getBit() == rhs.getBit() && e_ == rhs.e_ && label_ == rhs.label_ && mode_ == rhs.mode_ && broadcast_ == rhs.broadcast_; } bool operator!=(const Address& rhs) const { return !operator==(rhs); } + bool isVsib() const { return e_.isVsib(); } private: RegExp e_; const Label* label_; Mode mode_; - mutable bool permitVsib_; bool broadcast_; - void verify() const { if (e_.isVsib() && !permitVsib_) throw Error(ERR_BAD_VSIB_ADDRESSING); } }; inline const Address& Operand::getAddress() const @@ -1141,6 +1189,7 @@ public: Label(const Label& rhs); Label& operator=(const Label& rhs); ~Label(); + void clear() { mgr = 0; id = 0; } int getId() const { return id; } const uint8 *getAddress() const; @@ -1179,6 +1228,7 @@ class LabelManager { }; typedef XBYAK_STD_UNORDERED_MAP ClabelDefList; typedef XBYAK_STD_UNORDERED_MULTIMAP ClabelUndefList; + typedef XBYAK_STD_UNORDERED_SET LabelPtrList; CodeArray *base_; // global : stateList_.front(), local : stateList_.back() @@ -1186,6 +1236,7 @@ class LabelManager { mutable int labelId_; ClabelDefList clabelDefList_; ClabelUndefList clabelUndefList_; + LabelPtrList labelPtrList_; int getId(const Label& label) const { @@ -1234,9 +1285,14 @@ class LabelManager { return true; } friend class 
Label; - void incRefCount(int id) { clabelDefList_[id].refCount++; } - void decRefCount(int id) + void incRefCount(int id, Label *label) { + clabelDefList_[id].refCount++; + labelPtrList_.insert(label); + } + void decRefCount(int id, Label *label) + { + labelPtrList_.erase(label); ClabelDefList::iterator i = clabelDefList_.find(id); if (i == clabelDefList_.end()) return; if (i->second.refCount == 1) { @@ -1255,11 +1311,23 @@ class LabelManager { #endif return !list.empty(); } + // detach all labels linked to LabelManager + void resetLabelPtrList() + { + for (LabelPtrList::iterator i = labelPtrList_.begin(), ie = labelPtrList_.end(); i != ie; ++i) { + (*i)->clear(); + } + labelPtrList_.clear(); + } public: LabelManager() { reset(); } + ~LabelManager() + { + resetLabelPtrList(); + } void reset() { base_ = 0; @@ -1269,6 +1337,7 @@ public: stateList_.push_back(SlabelState()); clabelDefList_.clear(); clabelUndefList_.clear(); + resetLabelPtrList(); } void enterLocal() { @@ -1301,10 +1370,11 @@ public: SlabelState& st = *label.c_str() == '.' ? stateList_.back() : stateList_.front(); define_inner(st.defList, st.undefList, label, base_->getSize()); } - void defineClabel(const Label& label) + void defineClabel(Label& label) { define_inner(clabelDefList_, clabelUndefList_, getId(label), base_->getSize()); label.mgr = this; + labelPtrList_.insert(&label); } void assign(Label& dst, const Label& src) { @@ -1312,6 +1382,7 @@ public: if (i == clabelDefList_.end()) throw Error(ERR_LABEL_ISNOT_SET_BY_L); define_inner(clabelDefList_, clabelUndefList_, dst.id, i->second.offset); dst.mgr = this; + labelPtrList_.insert(&dst); } bool getOffset(size_t *offset, std::string& label) const { @@ -1359,19 +1430,19 @@ inline Label::Label(const Label& rhs) { id = rhs.id; mgr = rhs.mgr; - if (mgr) mgr->incRefCount(id); + if (mgr) mgr->incRefCount(id, this); } inline Label& Label::operator=(const Label& rhs) { if (id) throw Error(ERR_LABEL_IS_ALREADY_SET_BY_L); id = rhs.id; mgr = rhs.mgr; - if (mgr) mgr->incRefCount(id); + if (mgr) mgr->incRefCount(id, this); return *this; } inline Label::~Label() { - if (id && mgr) mgr->decRefCount(id); + if (id && mgr) mgr->decRefCount(id, this); } inline const uint8* Label::getAddress() const { @@ -1488,6 +1559,8 @@ private: T_B32 = 1 << 26, // m32bcst T_B64 = 1 << 27, // m64bcst T_M_K = 1 << 28, // mem{k} + T_VSIB = 1 << 29, + T_MEM_EVEX = 1 << 30, // use evex if mem T_XXX }; void vex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false) @@ -1525,7 +1598,7 @@ private: if ((a > 0 && a != v) + (b > 0 && b != v) + (c > 0 && c != v) > 0) return Error(err); return v; } - int evex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false, bool b = false, int aaa = 0, uint32 VL = 0) + int evex(const Reg& reg, const Reg& base, const Operand *v, int type, int code, bool x = false, bool b = false, int aaa = 0, uint32 VL = 0, bool Hi16Vidx = false) { if (!(type & (T_EVEX | T_MUST_EVEX))) throw Error(ERR_EVEX_IS_INVALID); int w = (type & T_EW1) ? 1 : 0; @@ -1568,7 +1641,7 @@ private: } } } - bool Vp = !(v ? v->isExtIdx2() : 0); + bool Vp = !((v ? v->isExtIdx2() : 0) | Hi16Vidx); bool z = reg.hasZero() || base.hasZero() || (v ? v->hasZero() : false); if (aaa == 0) aaa = verifyDuplicate(base.getOpmaskIdx(), reg.getOpmaskIdx(), (v ? 
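// Why the new labelPtrList_ above matters, sketched from these hunks: a
// Xbyak::Label may outlive the CodeGenerator whose LabelManager its mgr
// pointer references. resetLabelPtrList() now clear()s every attached label
// on reset()/destruction, so ~Label() no longer touches a dead manager.
#include "xbyak/xbyak.h"

void lifetime_sketch() {
    Xbyak::Label lingering;
    {
        Xbyak::CodeGenerator gen;
        gen.L(lingering);  // lingering.mgr now points into gen's manager
        gen.ret();
    }                      // gen dies; its manager detaches lingering
}                          // ~Label() sees mgr == 0 and safely does nothing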
v->getOpmaskIdx() : 0), ERR_OPMASK_IS_ALREADY_SET); db(0x62); @@ -1714,8 +1787,9 @@ private: // reg is reg field of ModRM // immSize is the size for immediate value // disp8N = 0(normal), disp8N = 1(force disp32), disp8N = {2, 4, 8} ; compressed displacement - void opAddr(const Address &addr, int reg, int immSize = 0, int disp8N = 0) + void opAddr(const Address &addr, int reg, int immSize = 0, int disp8N = 0, bool permitVisb = false) { + if (!permitVisb && addr.isVsib()) throw Error(ERR_BAD_VSIB_ADDRESSING); if (addr.getMode() == Address::M_ModRM) { setSIB(addr.getRegExp(), reg, disp8N); } else if (addr.getMode() == Address::M_rip || addr.getMode() == Address::M_ripAddr) { @@ -1857,15 +1931,20 @@ private: } void opPushPop(const Operand& op, int code, int ext, int alt) { - if (op.isREG()) { - if (op.isBit(16)) db(0x66); - if (op.getReg().getIdx() >= 8) db(0x41); - db(alt | (op.getIdx() & 7)); - } else if (op.isMEM()) { - opModM(op.getAddress(), Reg(ext, Operand::REG, op.getBit()), code); - } else { - throw Error(ERR_BAD_COMBINATION); + int bit = op.getBit(); + if (bit == 16 || bit == BIT) { + if (bit == 16) db(0x66); + if (op.isREG()) { + if (op.getReg().getIdx() >= 8) db(0x41); + db(alt | (op.getIdx() & 7)); + return; + } + if (op.isMEM()) { + opModM(op.getAddress(), Reg(ext, Operand::REG, 32), code); + return; + } } + throw Error(ERR_BAD_COMBINATION); } void verifyMemHasSize(const Operand& op) const { @@ -1954,10 +2033,11 @@ private: const Address& addr = op2.getAddress(); const RegExp& regExp = addr.getRegExp(); const Reg& base = regExp.getBase(); + const Reg& index = regExp.getIndex(); if (BIT == 64 && addr.is32bit()) db(0x67); int disp8N = 0; - bool x = regExp.getIndex().isExtIdx(); - if ((type & T_MUST_EVEX) || r.hasEvex() || (p1 && p1->hasEvex()) || addr.isBroadcast() || addr.getOpmaskIdx()) { + bool x = index.isExtIdx(); + if ((type & (T_MUST_EVEX|T_MEM_EVEX)) || r.hasEvex() || (p1 && p1->hasEvex()) || addr.isBroadcast() || addr.getOpmaskIdx()) { int aaa = addr.getOpmaskIdx(); if (aaa && !(type & T_M_K)) throw Error(ERR_INVALID_OPMASK_WITH_MEMORY); bool b = false; @@ -1965,12 +2045,12 @@ private: if (!(type & (T_B32 | T_B64))) throw Error(ERR_INVALID_BROADCAST); b = true; } - int VL = regExp.isVsib() ? regExp.getIndex().getBit() : 0; - disp8N = evex(r, base, p1, type, code, x, b, aaa, VL); + int VL = regExp.isVsib() ? index.getBit() : 0; + disp8N = evex(r, base, p1, type, code, x, b, aaa, VL, index.isExtIdx2()); } else { vex(r, base, p1, type, code, x); } - opAddr(addr, r.getIdx(), (imm8 != NONE) ? 1 : 0, disp8N); + opAddr(addr, r.getIdx(), (imm8 != NONE) ? 1 : 0, disp8N, (type & T_VSIB) != 0); } else { const Reg& base = op2.getReg(); if ((type & T_MUST_EVEX) || r.hasEvex() || (p1 && p1->hasEvex()) || base.hasEvex()) { @@ -2071,8 +2151,7 @@ private: } if (!isOK) throw Error(ERR_BAD_VSIB_ADDRESSING); } - addr.permitVsib(); - opAVX_X_X_XM(isAddrYMM ? Ymm(x1.getIdx()) : x1, isAddrYMM ? Ymm(x2.getIdx()) : x2, addr, type | T_YMM, code); + opAVX_X_X_XM(isAddrYMM ? Ymm(x1.getIdx()) : x1, isAddrYMM ? 
Ymm(x2.getIdx()) : x2, addr, type, code); } enum { xx_yy_zz = 0, @@ -2096,7 +2175,6 @@ private: { if (x.hasZero()) throw Error(ERR_INVALID_ZERO); checkGather2(x, addr.getRegExp().getIndex(), mode); - addr.permitVsib(); opVex(x, 0, addr, type, code); } /* @@ -2116,7 +2194,6 @@ private: { if (addr.hasZero()) throw Error(ERR_INVALID_ZERO); if (addr.getRegExp().getIndex().getKind() != kind) throw Error(ERR_BAD_VSIB_ADDRESSING); - addr.permitVsib(); opVex(x, 0, addr, type, code); } public: @@ -2169,7 +2246,8 @@ public: const Segment es, cs, ss, ds, fs, gs; #endif void L(const std::string& label) { labelMgr_.defineSlabel(label); } - void L(const Label& label) { labelMgr_.defineClabel(label); } + void L(Label& label) { labelMgr_.defineClabel(label); } + Label L() { Label label; L(label); return label; } void inLocalLabel() { labelMgr_.enterLocal(); } void outLocalLabel() { labelMgr_.leaveLocal(); } /* @@ -2200,7 +2278,7 @@ public: // call(function pointer) #ifdef XBYAK_VARIADIC_TEMPLATE template - void call(Ret(*func)(Params...)) { call(CastTo(func)); } + void call(Ret(*func)(Params...)) { call(reinterpret_cast(func)); } #endif void call(const void *addr) { opJmpAbs(addr, T_NEAR, 0, 0xE8); } @@ -2458,11 +2536,16 @@ public: MUST call ready() to complete generating code if you use AutoGrow mode. It is not necessary for the other mode if hasUndefinedLabel() is true. */ - void ready() + void ready(ProtectMode mode = PROTECT_RWE) { if (hasUndefinedLabel()) throw Error(ERR_LABEL_IS_NOT_FOUND); - if (isAutoGrow()) calcJmpAddress(); + if (isAutoGrow()) { + calcJmpAddress(); + if (useProtect()) setProtectMode(mode); + } } + // set read/exec + void readyRE() { return ready(PROTECT_RE); } #ifdef XBYAK_TEST void dump(bool doClear = true) { diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_bin2hex.h b/inference-engine/thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_bin2hex.h index 5b812bd..a22e522 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_bin2hex.h +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_bin2hex.h @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2018 Intel Corporation +* Copyright 2016-2019 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_mnemonic.h b/inference-engine/thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_mnemonic.h index 9e3c535..28d2d22 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_mnemonic.h +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_mnemonic.h @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2018 Intel Corporation +* Copyright 2016-2019 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,7 +43,7 @@ * THE POSSIBILITY OF SUCH DAMAGE. 
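// Small usage sketch of the Label API touched above: the now non-const
// L(Label&) binds a label at the current position, and the new zero-arg L()
// overload defines and returns one in a single expression. x86-64
// illustration only, not code from this patch.
#include "xbyak/xbyak.h"

struct CountDown : Xbyak::CodeGenerator {
    CountDown() {
        mov(ecx, 10);
        Xbyak::Label top = L();  // define-and-return in one expression
        dec(ecx);
        jnz(top);                // backward jump to the bound position
        ret();
    }
};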
*******************************************************************************/ -const char *getVersionString() const { return "5.631"; } +const char *getVersionString() const { return "5.76"; } void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); } void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); } void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); } @@ -167,8 +167,11 @@ void cmpordsd(const Xmm& x, const Operand& op) { cmpsd(x, op, 7); } void cmpordss(const Xmm& x, const Operand& op) { cmpss(x, op, 7); } void cmppd(const Xmm& xmm, const Operand& op, uint8 imm8) { opGen(xmm, op, 0xC2, 0x66, isXMM_XMMorMEM, imm8); } void cmpps(const Xmm& xmm, const Operand& op, uint8 imm8) { opGen(xmm, op, 0xC2, 0x100, isXMM_XMMorMEM, imm8); } +void cmpsb() { db(0xA6); } +void cmpsd() { db(0xA7); } void cmpsd(const Xmm& xmm, const Operand& op, uint8 imm8) { opGen(xmm, op, 0xC2, 0xF2, isXMM_XMMorMEM, imm8); } void cmpss(const Xmm& xmm, const Operand& op, uint8 imm8) { opGen(xmm, op, 0xC2, 0xF3, isXMM_XMMorMEM, imm8); } +void cmpsw() { db(0x66); db(0xA7); } void cmpunordpd(const Xmm& x, const Operand& op) { cmppd(x, op, 3); } void cmpunordps(const Xmm& x, const Operand& op) { cmpps(x, op, 3); } void cmpunordsd(const Xmm& x, const Operand& op) { cmpsd(x, op, 3); } @@ -728,6 +731,9 @@ void sar(const Operand& op, int imm) { opShift(op, imm, 7); } void sarx(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, T_F3 | T_0F38, 0xf7, false); } void sbb(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x18, 3); } void sbb(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x18); } +void scasb() { db(0xAE); } +void scasd() { db(0xAF); } +void scasw() { db(0x66); db(0xAF); } void seta(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 7); }//-V524 void setae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 3); }//-V524 void setb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 2); }//-V524 @@ -787,6 +793,9 @@ void stc() { db(0xF9); } void std() { db(0xFD); } void sti() { db(0xFB); } void stmxcsr(const Address& addr) { opModM(addr, Reg32(3), 0x0F, 0xAE); } +void stosb() { db(0xAA); } +void stosd() { db(0xAB); } +void stosw() { db(0x66); db(0xAB); } void sub(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x28, 5); } void sub(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x28); } void subpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5C, 0x66, isXMM_XMMorMEM); } @@ -1046,10 +1055,10 @@ void vfnmsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_X void vfnmsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0xBE); } void vfnmsub231sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_W1 | T_EW1 | T_EVEX | T_ER_X, 0xBF); } void vfnmsub231ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_W0 | T_EW0 | T_EVEX | T_ER_X, 0xBF); } -void vgatherdpd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W1, 0x92, 0); } -void vgatherdps(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W0, 0x92, 1); } -void vgatherqpd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W1, 0x93, 1); } -void vgatherqps(const Xmm& x1, const Address& addr, 
const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W0, 0x93, 2); } +void vgatherdpd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W1, 0x92, 0); } +void vgatherdps(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W0, 0x92, 1); } +void vgatherqpd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W1, 0x93, 1); } +void vgatherqps(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W0, 0x93, 2); } void vgf2p8affineinvqb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W1 | T_EW1 | T_YMM | T_EVEX | T_SAE_Z | T_B64, 0xCF, imm); } void vgf2p8affineqb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W1 | T_EW1 | T_YMM | T_EVEX | T_SAE_Z | T_B64, 0xCE, imm); } void vgf2p8mulb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_SAE_Z, 0xCF); } @@ -1059,7 +1068,7 @@ void vhsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) void vhsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_F2 | T_0F | T_YMM, 0x7D); } void vinsertf128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { if (!(y1.isYMM() && y2.isYMM() && op.isXMEM())) throw Error(ERR_BAD_COMBINATION); opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x18, imm); } void vinserti128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { if (!(y1.isYMM() && y2.isYMM() && op.isXMEM())) throw Error(ERR_BAD_COMBINATION); opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x38, imm); } -void vinsertps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_EW0 | T_EVEX, 0x21, imm); } +void vinsertps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F3A | T_W0 | T_EW0 | T_EVEX, 0x21, imm); } void vlddqu(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, cvtIdx0(x), addr, T_0F | T_F2 | T_W0 | T_YMM, 0xF0); } void vldmxcsr(const Address& addr) { opAVX_X_X_XM(xm2, xm0, addr, T_0F, 0xAE); } void vmaskmovdqu(const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x1, xm0, x2, T_0F | T_66, 0xF7); } @@ -1180,10 +1189,10 @@ void vpextrb(const Operand& op, const Xmm& x, uint8 imm) { if (!((op.isREG(8|16| void vpextrd(const Operand& op, const Xmm& x, uint8 imm) { if (!((op.isREG(32) || op.isMEM()) && x.isXMM())) throw Error(ERR_BAD_COMBINATION); opVex(x, 0, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, 0x16, imm); } void vpextrq(const Operand& op, const Xmm& x, uint8 imm) { if (!((op.isREG(64) || op.isMEM()) && x.isXMM())) throw Error(ERR_BAD_COMBINATION); opVex(x, 0, op, T_0F3A | T_66 | T_W1 | T_EVEX | T_EW1 | T_N8, 0x16, imm); } void vpextrw(const Operand& op, const Xmm& x, uint8 imm) { if (!((op.isREG(16|i32e) || op.isMEM()) && x.isXMM())) throw Error(ERR_BAD_COMBINATION); if (op.isREG() && x.getIdx() < 16) { opAVX_X_X_XM(Xmm(op.getIdx()), xm0, x, T_0F | T_66, 0xC5, imm); } else { opVex(x, 0, op, T_0F3A | T_66 | T_EVEX | T_N2, 0x15, imm); } } -void vpgatherdd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W0, 0x90, 1); } -void vpgatherdq(const Xmm& x1, const Address& addr, const Xmm& x2) { 
opGather(x1, addr, x2, T_0F38 | T_66 | T_W1, 0x90, 0); } -void vpgatherqd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W0, 0x91, 2); } -void vpgatherqq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_W1, 0x91, 1); } +void vpgatherdd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W0, 0x90, 1); } +void vpgatherdq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W1, 0x90, 0); } +void vpgatherqd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W0, 0x91, 2); } +void vpgatherqq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W1, 0x91, 1); } void vphaddd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x02); } void vphaddsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x03); } void vphaddw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x01); } @@ -1242,28 +1251,28 @@ void vpshuflw(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, void vpsignb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x08); } void vpsignd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x0A); } void vpsignw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM, 0x09); } -void vpslld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x72, imm); } +void vpslld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32 | T_MEM_EVEX, 0x72, imm); } void vpslld(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xF2); } -void vpslldq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 7), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x73, imm); } -void vpsllq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0x73, imm); } +void vpslldq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 7), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x73, imm); } +void vpsllq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64 | T_MEM_EVEX, 0x73, imm); } void vpsllq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0xF3); } void vpsllvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x47); } void vpsllvq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x47); } -void vpsllw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x71, imm); } +void vpsllw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 6), x, op, T_66 | T_0F | T_YMM | T_EVEX | 
T_MEM_EVEX, 0x71, imm); } void vpsllw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xF1); } -void vpsrad(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x72, imm); } +void vpsrad(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32 | T_MEM_EVEX, 0x72, imm); } void vpsrad(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xE2); } void vpsravd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x46); } -void vpsraw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x71, imm); } +void vpsraw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x71, imm); } void vpsraw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xE1); } -void vpsrld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x72, imm); } +void vpsrld(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32 | T_MEM_EVEX, 0x72, imm); } void vpsrld(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW0 | T_YMM | T_EVEX, 0xD2); } -void vpsrldq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 3), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x73, imm); } -void vpsrlq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0x73, imm); } +void vpsrldq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 3), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x73, imm); } +void vpsrlq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64 | T_MEM_EVEX, 0x73, imm); } void vpsrlq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_EW1 | T_YMM | T_EVEX, 0xD3); } void vpsrlvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x45); } void vpsrlvq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W1 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x45); } -void vpsrlw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_YMM | T_EVEX, 0x71, imm); } +void vpsrlw(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 2), x, op, T_66 | T_0F | T_YMM | T_EVEX | T_MEM_EVEX, 0x71, imm); } void vpsrlw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N16 | T_66 | T_0F | T_YMM | T_EVEX, 0xD1); } void vpsubb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF8); } void vpsubd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0xFA); } @@ -1589,7 +1598,10 @@ void jrcxz(std::string label) { 
opJmp(label, T_SHORT, 0xe3, 0, 0); } void jrcxz(const Label& label) { opJmp(label, T_SHORT, 0xe3, 0, 0); } void cdqe() { db(0x48); db(0x98); } void cqo() { db(0x48); db(0x99); } +void cmpsq() { db(0x48); db(0xA7); } void movsq() { db(0x48); db(0xA5); } +void scasq() { db(0x48); db(0xAF); } +void stosq() { db(0x48); db(0xAB); } void cmpxchg16b(const Address& addr) { opModM(addr, Reg64(1), 0x0F, 0xC7); } void movq(const Reg64& reg, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opModR(mmx, reg, 0x0F, 0x7E); } void movq(const Mmx& mmx, const Reg64& reg) { if (mmx.isXMM()) db(0x66); opModR(mmx, reg, 0x0F, 0x6E); } @@ -1762,18 +1774,18 @@ void vfpclasspd(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isBit(1 void vfpclassps(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isBit(128|256|512)) throw Error(ERR_BAD_MEM_SIZE); Reg x = k; x.setBit(op.getBit()); opVex(x, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, 0x66, imm); } void vfpclasssd(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isXMEM()) throw Error(ERR_BAD_MEM_SIZE); opVex(k, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_EW1 | T_N8, 0x67, imm); } void vfpclassss(const Opmask& k, const Operand& op, uint8 imm) { if (!op.isXMEM()) throw Error(ERR_BAD_MEM_SIZE); opVex(k, 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_EW0 | T_N4, 0x67, imm); } -void vgatherdpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x92, 1); } -void vgatherdps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x92, 0); } -void vgatherpf0dpd(const Address& addr) { opGatherFetch(addr, zm1, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC6, Operand::YMM); } -void vgatherpf0dps(const Address& addr) { opGatherFetch(addr, zm1, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC6, Operand::ZMM); } -void vgatherpf0qpd(const Address& addr) { opGatherFetch(addr, zm1, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC7, Operand::ZMM); } -void vgatherpf0qps(const Address& addr) { opGatherFetch(addr, zm1, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC7, Operand::ZMM); } -void vgatherpf1dpd(const Address& addr) { opGatherFetch(addr, zm2, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC6, Operand::YMM); } -void vgatherpf1dps(const Address& addr) { opGatherFetch(addr, zm2, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC6, Operand::ZMM); } -void vgatherpf1qpd(const Address& addr) { opGatherFetch(addr, zm2, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC7, Operand::ZMM); } -void vgatherpf1qps(const Address& addr) { opGatherFetch(addr, zm2, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC7, Operand::ZMM); } -void vgatherqpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x93, 0); } -void vgatherqps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x93, 2); } +void vgatherdpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x92, 1); } +void vgatherdps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_VSIB, 0x92, 0); } +void vgatherpf0dpd(const Address& addr) { opGatherFetch(addr, zm1, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::YMM); } +void vgatherpf0dps(const Address& addr) { opGatherFetch(addr, zm1, T_N4 | T_66 
| T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::ZMM); } +void vgatherpf0qpd(const Address& addr) { opGatherFetch(addr, zm1, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM); } +void vgatherpf0qps(const Address& addr) { opGatherFetch(addr, zm1, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM); } +void vgatherpf1dpd(const Address& addr) { opGatherFetch(addr, zm2, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::YMM); } +void vgatherpf1dps(const Address& addr) { opGatherFetch(addr, zm2, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::ZMM); } +void vgatherpf1qpd(const Address& addr) { opGatherFetch(addr, zm2, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM); } +void vgatherpf1qps(const Address& addr) { opGatherFetch(addr, zm2, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM); } +void vgatherqpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x93, 0); } +void vgatherqps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_VSIB, 0x93, 2); } void vgetexppd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x42); } void vgetexpps(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x42); } void vgetexpsd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_SAE_X | T_MUST_EVEX, 0x43); } @@ -1860,10 +1872,10 @@ void vpexpandb(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N1 | T void vpexpandd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x89); } void vpexpandq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x89); } void vpexpandw(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x62); } -void vpgatherdd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x90, 0); } -void vpgatherdq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x90, 1); } -void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x91, 2); } -void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x91, 0); } +void vpgatherdd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_VSIB, 0x90, 0); } +void vpgatherdq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x90, 1); } +void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_VSIB, 0x91, 2); } +void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x91, 0); } void vplzcntd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x44); } void vplzcntq(const Xmm& x, const Operand& op) { 
opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x44); } void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB5); } @@ -1914,10 +1926,10 @@ void vprord(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.get void vprorq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 0), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x72, imm); } void vprorvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x14); } void vprorvq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x14); } -void vpscatterdd(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0xA0, 0); } -void vpscatterdq(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K, 0xA0, 1); } -void vpscatterqd(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0xA1, 2); } -void vpscatterqq(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K, 0xA1, 0); } +void vpscatterdd(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA0, 0); } +void vpscatterdq(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA0, 1); } +void vpscatterqd(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA1, 2); } +void vpscatterqq(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA1, 0); } void vpshldd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x71, imm); } void vpshldq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B64, 0x71, imm); } void vpshldvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x71); } @@ -1981,18 +1993,18 @@ void vscalefpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x void vscalefps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x2C); } void vscalefsd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8 | T_66 | T_0F38 | T_EW1 | T_ER_X | T_MUST_EVEX, 0x2D); } void vscalefss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4 | T_66 | T_0F38 | T_EW0 | T_ER_X | T_MUST_EVEX, 0x2D); } -void vscatterdpd(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K, 0xA2, 1); } -void vscatterdps(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0xA2, 0); } -void vscatterpf0dpd(const Address& addr) { opGatherFetch(addr, zm5, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC6, Operand::YMM); } -void 
vscatterpf0dps(const Address& addr) { opGatherFetch(addr, zm5, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC6, Operand::ZMM); } -void vscatterpf0qpd(const Address& addr) { opGatherFetch(addr, zm5, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC7, Operand::ZMM); } -void vscatterpf0qps(const Address& addr) { opGatherFetch(addr, zm5, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC7, Operand::ZMM); } -void vscatterpf1dpd(const Address& addr) { opGatherFetch(addr, zm6, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC6, Operand::YMM); } -void vscatterpf1dps(const Address& addr) { opGatherFetch(addr, zm6, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC6, Operand::ZMM); } -void vscatterpf1qpd(const Address& addr) { opGatherFetch(addr, zm6, T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_N8 | T_M_K, 0xC7, Operand::ZMM); } -void vscatterpf1qps(const Address& addr) { opGatherFetch(addr, zm6, T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_N4 | T_M_K, 0xC7, Operand::ZMM); } -void vscatterqpd(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K, 0xA3, 0); } -void vscatterqps(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0xA3, 2); } +void vscatterdpd(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA2, 1); } +void vscatterdps(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA2, 0); } +void vscatterpf0dpd(const Address& addr) { opGatherFetch(addr, zm5, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::YMM); } +void vscatterpf0dps(const Address& addr) { opGatherFetch(addr, zm5, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::ZMM); } +void vscatterpf0qpd(const Address& addr) { opGatherFetch(addr, zm5, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM); } +void vscatterpf0qps(const Address& addr) { opGatherFetch(addr, zm5, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM); } +void vscatterpf1dpd(const Address& addr) { opGatherFetch(addr, zm6, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::YMM); } +void vscatterpf1dps(const Address& addr) { opGatherFetch(addr, zm6, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC6, Operand::ZMM); } +void vscatterpf1qpd(const Address& addr) { opGatherFetch(addr, zm6, T_N8 | T_66 | T_0F38 | T_EW1 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM); } +void vscatterpf1qps(const Address& addr) { opGatherFetch(addr, zm6, T_N4 | T_66 | T_0F38 | T_EW0 | T_MUST_EVEX | T_M_K | T_VSIB, 0xC7, Operand::ZMM); } +void vscatterqpd(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA3, 0); } +void vscatterqps(const Address& addr, const Xmm& x) { opGather2(x, addr, T_N4 | T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K | T_VSIB, 0xA3, 2); } void vshuff32x4(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, 0x23, imm); } void vshuff64x2(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, 0x23, imm); } void vshufi32x4(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) 
{ opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, 0x43, imm); } diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_util.h b/inference-engine/thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_util.h index 713c68d..08f0a30 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_util.h +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_util.h @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2018 Intel Corporation +* Copyright 2016-2019 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,6 +54,11 @@ */ #include "xbyak.h" +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + #define XBYAK_INTEL_CPU_SPECIFIC +#endif + +#ifdef XBYAK_INTEL_CPU_SPECIFIC #ifdef _MSC_VER #if (_MSC_VER < 1400) && defined(XBYAK32) static inline __declspec(naked) void __cpuid(int[4], int) @@ -92,14 +97,30 @@ #endif #endif #endif +#endif namespace Xbyak { namespace util { +typedef enum { + SmtLevel = 1, + CoreLevel = 2 +} IntelCpuTopologyLevel; + /** CPU detection class */ class Cpu { uint64 type_; + // system topology + bool x2APIC_supported_; + static const size_t maxTopologyLevels = 2; + unsigned int numCores_[maxTopologyLevels]; + + static const unsigned int maxNumberCacheLevels = 10; + unsigned int dataCacheSize_[maxNumberCacheLevels]; + unsigned int coresSharingDataCache_[maxNumberCacheLevels]; + unsigned int dataCacheLevels_; + unsigned int get32bitAsBE(const char *x) const { return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24); @@ -110,7 +131,7 @@ class Cpu { } void setFamily() { - unsigned int data[4]; + unsigned int data[4] = {}; getCpuid(1, data); stepping = data[0] & mask(4); model = (data[0] >> 4) & mask(4); @@ -133,6 +154,42 @@ class Cpu { { return (val >> base) & ((1u << (end - base)) - 1); } + void setNumCores() + { + if ((type_ & tINTEL) == 0) return; + + unsigned int data[4] = {}; + + /* CAUTION: These numbers reflect the configuration as shipped by Intel. */ + getCpuidEx(0x0, 0, data); + if (data[0] >= 0xB) { + /* + if leaf 11 exists (x2APIC is supported), + we use it to get the number of smt cores and cores on socket + + leaf 0xB can be zeroed-out by a hypervisor + */ + x2APIC_supported_ = true; + for (unsigned int i = 0; i < maxTopologyLevels; i++) { + getCpuidEx(0xB, i, data); + IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15); + if (level == SmtLevel || level == CoreLevel) { + numCores_[level - 1] = extractBit(data[1], 0, 15); + } + } + if (numCores_[SmtLevel - 1] != 0) { + numCores_[CoreLevel - 1] /= numCores_[SmtLevel - 1]; + } + } else { + /* + Failed to determine the number of cores without x2APIC support. + TODO: Use the initial APIC ID to determine ncores. + */ + numCores_[SmtLevel - 1] = 0; + numCores_[CoreLevel - 1] = 0; + } + + } void setCacheHierarchy() { if ((type_ & tINTEL) == 0) return; @@ -141,21 +198,12 @@ class Cpu { // const unsigned int INSTRUCTION_CACHE = 2; const unsigned int UNIFIED_CACHE = 3; unsigned int smt_width = 0; - unsigned int n_cores = 0; - unsigned int data[4]; + unsigned int logical_cores = 0; + unsigned int data[4] = {}; - /* - if leaf 11 exists, we use it to get the number of smt cores and cores on socket - If x2APIC is supported, these are the only correct numbers.
- - leaf 0xB can be zeroed-out by a hypervisor - */ - getCpuidEx(0x0, 0, data); - if (data[0] >= 0xB) { - getCpuidEx(0xB, 0, data); // CPUID for SMT Level - smt_width = data[1] & 0x7FFF; - getCpuidEx(0xB, 1, data); // CPUID for CORE Level - n_cores = data[1] & 0x7FFF; + if (x2APIC_supported_) { + smt_width = numCores_[0]; + logical_cores = numCores_[1]; } /* @@ -163,28 +211,29 @@ class Cpu { the first level of data cache is not shared (which is the case for every existing architecture) and use this to determine the SMT width for arch not supporting leaf 11. - when leaf 4 reports a number of core less than n_cores + when leaf 4 reports a number of cores less than numCores_ on socket reported by leaf 11, then it is a correct number of cores not an upper bound. */ - for (int i = 0; data_cache_levels < maxNumberCacheLevels; i++) { + for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) { getCpuidEx(0x4, i, data); unsigned int cacheType = extractBit(data[0], 0, 4); if (cacheType == NO_CACHE) break; if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) { - unsigned int nb_logical_cores = extractBit(data[0], 14, 25) + 1; - if (n_cores != 0) // true only if leaf 0xB is supported and valid - nb_logical_cores = (std::min)(nb_logical_cores, n_cores); - assert(nb_logical_cores != 0); - data_cache_size[data_cache_levels] = + unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1; + if (logical_cores != 0) { // true only if leaf 0xB is supported and valid + actual_logical_cores = (std::min)(actual_logical_cores, logical_cores); + } + assert(actual_logical_cores != 0); + dataCacheSize_[dataCacheLevels_] = (extractBit(data[1], 22, 31) + 1) * (extractBit(data[1], 12, 21) + 1) * (extractBit(data[1], 0, 11) + 1) * (data[2] + 1); - if (cacheType == DATA_CACHE && smt_width == 0) smt_width = nb_logical_cores; + if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores; assert(smt_width != 0); - cores_sharing_data_cache[data_cache_levels] = nb_logical_cores / smt_width; - data_cache_levels++; + coresSharingDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u); + dataCacheLevels_++; } } } @@ -198,22 +247,22 @@ public: int displayFamily; // family + extFamily int displayModel; // model + extModel - // may I move these members into private?
- static const unsigned int maxNumberCacheLevels = 10; - unsigned int data_cache_size[maxNumberCacheLevels]; - unsigned int cores_sharing_data_cache[maxNumberCacheLevels]; - unsigned int data_cache_levels; + unsigned int getNumCores(IntelCpuTopologyLevel level) { + if (level != SmtLevel && level != CoreLevel) throw Error(ERR_BAD_PARAMETER); + if (!x2APIC_supported_) throw Error(ERR_X2APIC_IS_NOT_SUPPORTED); + return numCores_[level - 1]; + } - unsigned int getDataCacheLevels() const { return data_cache_levels; } + unsigned int getDataCacheLevels() const { return dataCacheLevels_; } unsigned int getCoresSharingDataCache(unsigned int i) const { - if (i >= data_cache_levels) throw Error(ERR_BAD_PARAMETER); - return cores_sharing_data_cache[i]; + if (i >= dataCacheLevels_) throw Error(ERR_BAD_PARAMETER); + return coresSharingDataCache_[i]; } unsigned int getDataCacheSize(unsigned int i) const { - if (i >= data_cache_levels) throw Error(ERR_BAD_PARAMETER); - return data_cache_size[i]; + if (i >= dataCacheLevels_) throw Error(ERR_BAD_PARAMETER); + return dataCacheSize_[i]; } /* @@ -221,30 +270,45 @@ public: */ static inline void getCpuid(unsigned int eaxIn, unsigned int data[4]) { -#ifdef _MSC_VER +#ifdef XBYAK_INTEL_CPU_SPECIFIC + #ifdef _MSC_VER __cpuid(reinterpret_cast<int*>(data), eaxIn); -#else + #else __cpuid(eaxIn, data[0], data[1], data[2], data[3]); + #endif +#else + (void)eaxIn; + (void)data; #endif } static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4]) { -#ifdef _MSC_VER +#ifdef XBYAK_INTEL_CPU_SPECIFIC + #ifdef _MSC_VER __cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn); -#else + #else __cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]); + #endif +#else + (void)eaxIn; + (void)ecxIn; + (void)data; #endif } static inline uint64 getXfeature() { -#ifdef _MSC_VER +#ifdef XBYAK_INTEL_CPU_SPECIFIC + #ifdef _MSC_VER return _xgetbv(0); -#else + #else unsigned int eax, edx; // xgetbv is not supported on gcc 4.2 // __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0)); __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0)); return ((uint64)edx << 32) | eax; + #endif +#else + return 0; #endif } typedef uint64 Type; @@ -315,9 +379,13 @@ public: Cpu() : type_(NONE) - , data_cache_levels(0) + , x2APIC_supported_(false) + , numCores_() + , dataCacheSize_() + , coresSharingDataCache_() + , dataCacheLevels_(0) { - unsigned int data[4]; + unsigned int data[4] = {}; const unsigned int& EAX = data[0]; const unsigned int& EBX = data[1]; const unsigned int& ECX = data[2]; @@ -407,6 +475,7 @@ public: if (ECX & (1U << 0)) type_ |= tPREFETCHWT1; } setFamily(); + setNumCores(); setCacheHierarchy(); } void putFamily() const @@ -425,12 +494,17 @@ class Clock { public: static inline uint64 getRdtsc() { -#ifdef _MSC_VER +#ifdef XBYAK_INTEL_CPU_SPECIFIC + #ifdef _MSC_VER return __rdtsc(); -#else + #else unsigned int eax, edx; __asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx)); return ((uint64)edx << 32) | eax; + #endif +#else + // TODO: Need another implementation of Clock or an rdtsc equivalent for non-x86 cpu + return 0; #endif } Clock() @@ -460,7 +534,7 @@ const int UseRCX = 1 << 6; const int UseRDX = 1 << 7; class Pack { - static const size_t maxTblNum = 10; + static const size_t maxTblNum = 15; const Xbyak::Reg64 *tbl_[maxTblNum]; size_t n_; public: @@ -520,7 +594,7 @@ public: const Xbyak::Reg64& operator[](size_t n) const { if (n >= n_) { - fprintf(stderr, "ERR Pack bad n=%d\n", (int)n); + fprintf(stderr, "ERR Pack bad n=%d(%d)\n", (int)n, (int)n_); throw
Error(ERR_BAD_PARAMETER); } return *tbl_[n]; @@ -562,6 +636,7 @@ class StackFrame { static const int rcxPos = 3; static const int rdxPos = 2; #endif + static const int maxRegNum = 14; // maxRegNum = 16 - rsp - rax Xbyak::CodeGenerator *code_; int pNum_; int tNum_; @@ -571,7 +646,7 @@ class StackFrame { int P_; bool makeEpilog_; Xbyak::Reg64 pTbl_[4]; - Xbyak::Reg64 tTbl_[10]; + Xbyak::Reg64 tTbl_[maxRegNum]; Pack p_; Pack t_; StackFrame(const StackFrame&); @@ -583,7 +658,7 @@ public: make stack frame @param sf [in] this @param pNum [in] num of function parameter(0 <= pNum <= 4) - @param tNum [in] num of temporary register(0 <= tNum <= 10, with UseRCX, UseRDX) + @param tNum [in] num of temporary register(0 <= tNum, with UseRCX, UseRDX) #{pNum + tNum [+rcx] + [rdx]} <= 14 @param stackSizeByte [in] local stack size @param makeEpilog [in] automatically call close() if true @@ -610,27 +685,17 @@ public: using namespace Xbyak; if (pNum < 0 || pNum > 4) throw Error(ERR_BAD_PNUM); const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0); - if (allRegNum < pNum || allRegNum > 14) throw Error(ERR_BAD_TNUM); + if (tNum_ < 0 || allRegNum > maxRegNum) throw Error(ERR_BAD_TNUM); const Reg64& _rsp = code->rsp; - const AddressFrame& _ptr = code->ptr; saveNum_ = (std::max)(0, allRegNum - noSaveNum); const int *tbl = getOrderTbl() + noSaveNum; - P_ = saveNum_ + (stackSizeByte + 7) / 8; - if (P_ > 0 && (P_ & 1) == 0) P_++; // here (rsp % 16) == 8, then increment P_ for 16 byte alignment - P_ *= 8; - if (P_ > 0) code->sub(_rsp, P_); -#ifdef XBYAK64_WIN - for (int i = 0; i < (std::min)(saveNum_, 4); i++) { - code->mov(_ptr [_rsp + P_ + (i + 1) * 8], Reg64(tbl[i])); - } - for (int i = 4; i < saveNum_; i++) { - code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(tbl[i])); - } -#else for (int i = 0; i < saveNum_; i++) { - code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(tbl[i])); + code->push(Reg64(tbl[i])); } -#endif + P_ = (stackSizeByte + 7) / 8; + if (P_ > 0 && (P_ & 1) == (saveNum_ & 1)) P_++; // (rsp % 16) == 8, then increment P_ for 16 byte alignment + P_ *= 8; + if (P_ > 0) code->sub(_rsp, P_); int pos = 0; for (int i = 0; i < pNum; i++) { pTbl_[i] = Xbyak::Reg64(getRegIdx(pos)); @@ -651,21 +716,11 @@ public: { using namespace Xbyak; const Reg64& _rsp = code_->rsp; - const AddressFrame& _ptr = code_->ptr; const int *tbl = getOrderTbl() + noSaveNum; -#ifdef XBYAK64_WIN - for (int i = 0; i < (std::min)(saveNum_, 4); i++) { - code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ + (i + 1) * 8]); - } - for (int i = 4; i < saveNum_; i++) { - code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]); - } -#else + if (P_ > 0) code_->add(_rsp, P_); for (int i = 0; i < saveNum_; i++) { - code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]); + code_->pop(Reg64(tbl[saveNum_ - 1 - i])); } -#endif - if (P_ > 0) code_->add(_rsp, P_); if (callRet) code_->ret(); } @@ -677,9 +732,6 @@ public: } catch (std::exception& e) { printf("ERR:StackFrame %s\n", e.what()); exit(1); - } catch (...) 
{ - printf("ERR:StackFrame otherwise\n"); - exit(1); } } private: @@ -698,7 +750,7 @@ private: } int getRegIdx(int& pos) const { - assert(pos < 14); + assert(pos < maxRegNum); using namespace Xbyak; const int *tbl = getOrderTbl(); int r = tbl[pos++]; diff --git a/inference-engine/thirdparty/mkl-dnn/tests/CMakeLists.txt b/inference-engine/thirdparty/mkl-dnn/tests/CMakeLists.txt index 6e9caa6..a4816ee 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/CMakeLists.txt +++ b/inference-engine/thirdparty/mkl-dnn/tests/CMakeLists.txt @@ -37,7 +37,7 @@ append(CMAKE_CXX_FLAGS "${CMAKE_CCXX_SANITIZER_FLAGS}") # allow tests to include internal header files with, e.g. # include "src/common/mkldnn_thread.hpp" -include_directories(${CMAKE_SOURCE_DIR}) +include_directories(${PROJECT_SOURCE_DIR}) if(UNIX OR MINGW) # workaround for Intel Compiler 16.0 that doesn't suppress warning on @@ -68,7 +68,7 @@ if(UNIX OR MINGW) add_custom_command( OUTPUT ${test_c_symbols} COMMAND /bin/bash ${CMAKE_CURRENT_SOURCE_DIR}/generate_c_symbols_refs.sh - ${CMAKE_CURRENT_SOURCE_DIR}/.. ${test_c_symbols} + ${CMAKE_CURRENT_SOURCE_DIR}/.. ${PROJECT_BINARY_DIR}/include ${test_c_symbols} ) register_exe(test_c_symbols-c ${test_c_symbols} "test") # elseif(WIN32) diff --git a/inference-engine/thirdparty/mkl-dnn/tests/api.c b/inference-engine/thirdparty/mkl-dnn/tests/api.c index da91859..55581d2 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/api.c +++ b/inference-engine/thirdparty/mkl-dnn/tests/api.c @@ -37,7 +37,7 @@ } \ } while(0) -static size_t product(int *arr, size_t size) { +static size_t product(ptrdiff_t *arr, size_t size) { size_t prod = 1; for (size_t i = 0; i < size; ++i) prod *= arr[i]; return prod; @@ -92,12 +92,12 @@ void test2() { const int mb = 2; const int groups = 2; - int c3_src_sizes[4] = {mb, 256, 13, 13}; - int c3_weights_sizes[] = {groups, 384/groups, 256/groups, 3, 3}; - int c3_bias_sizes[1] = {384}; - int strides[] = {1, 1}; - int32_t padding[] = {0, 0}; // set proper values - int c3_dst_sizes[4] = {mb, 384, + ptrdiff_t c3_src_sizes[4] = {mb, 256, 13, 13}; + ptrdiff_t c3_weights_sizes[] = {groups, 384/groups, 256/groups, 3, 3}; + ptrdiff_t c3_bias_sizes[1] = {384}; + ptrdiff_t strides[] = {1, 1}; + ptrdiff_t padding[] = {0, 0}; // set proper values + ptrdiff_t c3_dst_sizes[4] = {mb, 384, (c3_src_sizes[2] + 2*padding[0] - c3_weights_sizes[3])/strides[0] + 1, (c3_src_sizes[3] + 2*padding[1] - c3_weights_sizes[4])/strides[1] + 1 }; @@ -249,7 +249,7 @@ void test2() { void test3() { const int mb = 2; - int l2_data_sizes[4] = {mb, 256, 13, 13}; + ptrdiff_t l2_data_sizes[4] = {mb, 256, 13, 13}; real_t *src = (real_t*)calloc(product(l2_data_sizes, 4), sizeof(real_t)); real_t *dst = (real_t*)calloc(product(l2_data_sizes, 4), sizeof(real_t)); diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/CMakeLists.txt b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/CMakeLists.txt index aaaf7f8..ee32b4f 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/CMakeLists.txt +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/CMakeLists.txt @@ -55,16 +55,13 @@ function(register_benchdnn_test name cmd) DEPENDS benchdnn WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) - if(WIN32) - set_property(TARGET ${name} PROPERTY ENVIRONMENT "PATH=${CTESTCONFIG_PATH}") - configure_file(${CMAKE_SOURCE_DIR}/config_template.vcxproj.user ${name}.vcxproj.user @ONLY) - endif() + maybe_configure_windows_test(${name} TARGET) endfunction() register_benchdnn_test(test_conv "benchdnn -v1 --conv 
--batch=inputs/test_conv_all") register_benchdnn_test(test_benchdnn_conv "benchdnn -v1 --conv --batch=inputs/test_conv_all") -register_benchdnn_test(test_benchdnn_deconv "benchdnn -v1 --deconv --batch=inputs/test_deconv_all") -register_benchdnn_test(test_benchdnn_rnn "benchdnn -v1 --rnn") +register_benchdnn_test(test_benchdnn_deconv "benchdnn -v1 --deconv --batch=inputs/deconv/test_deconv_all") +register_benchdnn_test(test_benchdnn_rnn "benchdnn -v1 --rnn --batch=inputs/rnn/test_rnn_small") register_benchdnn_test(test_benchdnn_reorder "benchdnn --reorder --batch=inputs/reorder/test_default") register_benchdnn_test(test_benchdnn_bnorm "benchdnn --bnorm --batch=inputs/bnorm/test_bnorm_topo") register_benchdnn_test(test_benchdnn_ip "benchdnn --ip --batch=inputs/ip/test_ip_all") diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/README.md b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/README.md index 9d5ba2f..95253e7 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/README.md +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/README.md @@ -1,10 +1,10 @@ # benchdnn **benchdnn** is a standalone correctness and performance benchmark for -[Intel(R) Math Kernel Library for Deep Neural Networks (Intel(R) MKL-DNN)](/intel/mkl-dnn) library. +[Intel(R) Math Kernel Library for Deep Neural Networks (Intel(R) MKL-DNN)](/intel/mkl-dnn). The purpose of the benchmark is extended and robust correctness verification of -the primitives provided by MKL-DNN. So far **benchdnn** supports convolutions -and inner products of different data types. It also implicitly tests reorders. +the primitives provided by Intel MKL-DNN. Currently, **benchdnn** supports convolutions +, inner products, reorder, batch normalization, deconvolution, recurrent neural network, and shuffle of different data types. ## License @@ -14,40 +14,62 @@ and inner products of different data types. It also implicitly tests reorders. ## Usage (main driver) -**benchdnn** itself is a driver for different implementation specific -harnesses. So far it has harness for Intel MKL-DNN convolution, inner product, -reorder, batch normalization, and harness for testing itself. -The usage: +**benchdnn** itself is a driver for different implementation-specific +harnesses. So far it uses a harness for Intel MKL-DNN [convolution](/tests/benchdnn/README.md#usage-convolution-harness), [inner product](/tests/benchdnn/README.md#usage-ip-harness), +[reorder](/tests/benchdnn/README.md#usage-reorder-harness), [batch normalization](/tests/benchdnn/README.md#usage-batch-normalization-harness), [deconvolution](/tests/benchdnn/README.md#usage-deconvolution-harness), [shuffle](/tests/benchdnn/README.md#usage-shuffle-harness), and [recurrent neural network](/tests/benchdnn/README.md#usage-rnn-harness) as well as a +harness for testing [itself](/tests/benchdnn/README.md#usage-self-harness). + +Usage: ``` - $ ./benchdnn: [--HARNESS] [--mode=MODE] [-vN|--verbose=N] HARNESS-OPTS + $ ./benchdnn: [--HARNESS] [--mode=MODE] [--max-ms-per-prb=MAX-MS-PER-PRB] [-vN|--verbose=N] HARNESS-OPTS ``` where: - - `HARNESS` is either `conv` [default], `ip`, `shuffle`, `reorder`, `bnorm`, `rnn` or `self` + - `HARNESS` is either `conv` [default], `ip`, `shuffle`, `reorder`, `bnorm`, `rnn`, or `self` - `MODE` -- string that contains flags for benchmark mode. Use `C` or `c` for correctness (used by default), and `P` or `p` for performance - - `N` -- verbose level (integer from 0 [default] to ...) 
+ - `MAX-MS-PER-PRB` sets the maximum time spent per problem in milliseconds, default `3e3` + - `-vN|--verbose=N` -- verbose level, default `0` + + - `HARNESS-OPTS` are passed to the chosen harness - - `HARNESS-OPTS` are passed to the chosen harness +Returns `0` on success (all tests passed) or non-zero in case of any error. -Returns `0` on success (all tests passed), and non-zero in case of any error -happened. +## Notations / Glossary / Abbreviations + +|Abbreviation | Description +|:--- |:--- +| src | Source image (input image for forward convolution) +| wei | Weights (aka filter) +| bia | Bias +| dst | Destination image (output image for forward convolution) +| acc | Accumulation (typically in terms of data type) +| ic, oc | Input/Output channels (aka feature maps) +| ih, iw | Input height and width +| oh, ow | Output height and width +| kh, kw | Kernel (filter, weights) height and width +| sh, sw | Convolution stride over height and width +| ph, pw | Convolution top and left padding +| mb | Minibatch (amount of images processed at once) +| g | Groups (a way to reduce the amount of computations, see Alexnet topology) +| FWD_{D,B} | forward w/o and w/ bias +| BWD_{D,W,WB} | backward wrt data, weights, and weights and bias +| DIRECT, WINO | convolution algorithm: direct or Winograd based +| AUTO | convolution algorithm is chosen by MKL-DNN for best performance ## Usage (convolution harness) -The usage: ``` [harness-knobs] [conv-desc] ... ``` where *harness-knobs* are: - - `--cfg={f32, u8s8u8s32, ...}` configuration (see below), default `f32` + - `--cfg={f32, u8s8u8s32, ...}` configuration (see below [convolution configuration](/tests/benchdnn/README.md#convolution-configurations-also-known-as-precision-specification)), default `f32` - `--dir={FWD_D (forward data), FWD_B (forward data + bias),FWD_I (forward data inference), BWD_D (backward data), BWD_W (backward weights), BWD_WB (backward weights + bias)}` direction, default `FWD_B` - - `--alg={DIRECT, WINO}` convolution algorithm, default DIRECT - - `--merge={NONE, RELU}` merged primitive, default NONE (nothing merged) + - `--alg={DIRECT, WINO, AUTO}` convolution algorithm, default DIRECT - `--attr="attr_str"` convolution attributes (see in the section below), default `""` (no attributes set) - `--mb=N` override minibatch that is specified in convolution description, default `0` (use mb specified in conv desc) - `--match=regex` check only convolutions that match with regex, default is `".*"`. Notice: Windows may only interpret string arguments surrounded by double quotation marks. @@ -57,20 +79,21 @@ where *harness-knobs* are: - `--reset` reset all the parameters set before to default one - `-vN|--verbose=N` verbose level, default `0` - `--batch=file` use options from the given file (see in subdirectory) + - `--mode=` string that contains flags for benchmark mode. Use `C` or `c` for correctness (used by default), and `P` or `p` for performance -and *conv-desc* is convolution description. The canonical form is: +and *conv-desc* is the convolution description. The canonical form is: ``` gXmbXicXihXiwXocXohXowXkhXkwXshXswXphXpwXdhXdwXnS ``` -Here X is a number and S is string (n stands for name). Some of the parameters -might be omitted if there is either default one (e.g. if g is not specified -**benchdnn** uses 1) or if the can be computed automatically (e.g. output shape -can be derived from the input one and kernel). Also if either width or height -is not specified than it is assumed height == width.
Special symbol `_` is -ignored, hence maybe used as delimiter. See `str2desc()` in conv/conv_aux.cpp -for more details and implicit rules :^) +Here X is a number and S is a string (n stands for name). Some of the parameters +may be omitted if a default exists (for example, if g is not specified +**benchdnn** uses 1) or if it can be computed automatically (for example, the output shape +can be derived from the input one and the kernel). Also, if either width or height +is not specified, it is assumed that height == width. The special symbol `_` is +ignored, so it may be used as a delimiter. See `str2desc()` in conv/conv_aux.cpp +for more details and implicit rules. -The attribute string *attr_str* is defined as (new lines for readability): +The attribute string *attr_str* is defined as follows (line breaks are for readability): ``` [irmode={nearest,down};] [oscale={none,common,per_oc}[:scale];] @@ -81,8 +104,8 @@ Here `irmode` defines the rounding mode for integer output (default is nearest). Next, `oscale` stands for output_scales. The first parameter is the policy that is defined below. The second optional parameter is a scale that specifies -either the one common output scale (for `none` and `common` polices) or a -starting point for `per_oc` policy, which uses many scales. The default scale +either the one common output scale (for the `none` and `common` polices) or a +starting point for the `per_oc` policy, which uses many scales. The default scale is 1.0. Known policies are: - `none` (default) means no output scales set (i.e. scale = 1.) @@ -90,19 +113,19 @@ is 1.0. Known policies are: - `per_oc` corresponds to `mask=1<<1` (i.e. output channels) with different scale factors Next, `post_ops` stands for post operation sequence. Currently supported post -ops are: +operations are: - `relu` with no parameters (i.e. corresponding scale is 1., alg = eltwise_relu, alpha = beta = 0.) - `sum` with optional parameter scale (default 1.) -### convolution configurations (aka precision specification) +### Convolution configurations (also known as precision specification) `--cfg` option specifies what convolution would be used in terms of data type. -Also it defines all the magic with data filling inside. For integer type +Also it defines all the magic with data filling inside. For the integer type, saturation is implicitly implied. -Finally configuration defines threshold for computation errors (ideally we -want keep it 0 and it seems to work for now). +Finally configuration defines the threshold for computation errors (ideally we +want to keep it at 0, and it seems to work for now). The table below shows cases supported by Intel MKL-DNN and corresponding configurations for **benchdnn**: @@ -123,18 +146,18 @@ configurations for **benchdnn**: | s8 | s8 | u8 | s32 | s8s8u8s32 | same notes as for u8s8f32s32 -## Performance measurements +### Performance measurements (convolution harness) -**benchdnn** supports custom performance report. Template is passed via +**benchdnn** supports a custom performance report. A template is passed via the command line and consists of terminal and nonterminal symbols. Nonterminal -symbols are printed as is. Description of terminal symbols is given below. -There is also a notion of modifiers (marked as @) that change meaning of -terminal symbols, e.g. sign '-' means minimum of (in terms of time). See -table of modifiers below. +symbols are printed as-is. A description of terminal symbols is given below. 
+There is also a notion of modifiers (marked with @) that change the meaning of +terminal symbols; for example, the sign '-' means minimum of (in terms of time). +See the table of modifiers below. -> **caution:** threads have to be pinned in order to get consistent frequency +> **Caution:** Threads must be pinned in order to get consistent frequency. -| abbreviation | description +| Abbreviation | Description |:------------ |:----------- | %d | problem descriptor | %D | expanded problem descriptor (conv parameters in csv format) @@ -146,7 +169,7 @@ table of modifiers below. | %@c | time in clocks | %@p | ops per second -| modifier | description +| Modifier | Description |:-------- |:----------- | | default | - | min (time) -- default @@ -160,7 +183,7 @@ table of modifiers below. The definition of expanded problem descriptor is: `g,mb,ic,ih,iw,oc,oh,ow,kh,kw,sh,sw,ph,pw`. -The default template can be found in conv/bench_conv.cpp that is defined as +The default template can be found in conv/bench_conv.cpp and is defined as `perf,%n,%d,%GO,%GF,%-t,%-Gp,%0t,%0Gp`. That will produce the following output in CSV format: ``` @@ -174,8 +197,13 @@ best gigaops (since it corresponds to mimimum time) average time spent in ms average gigaops (since it corresponds to average time) ``` +Here is an example of the performance output: +``` + perf,"yolov2:conv1",mb16ic3ih610oc32oh608kh3n"yolov2:conv1",10.2205,0,43.9827,232.375,58.0146,176.171 +``` +full convolution descriptor is `mb16ic3ih610oc32oh608kh3n"yolov2:conv1"` in the above example. -## Examples +### Examples (convolution harness) Run the set of f32 forward convolutions from inputs/conv_all file w/ bias and default minibatch: ``` @@ -183,19 +211,19 @@ Run the set of f32 forward convolutions from inputs/conv_all file w/ bias and de --cfg=f32 --dir=FWD_B --batch=inputs/conv_all ``` -Run the same but with merged ReLU: +Run the same but with post_ops ReLU: ``` $ ./benchdnn --conv \ - --cfg=f32 --dir=FWD_B --merge=RELU --batch=inputs/conv_all + --cfg=f32 --dir=FWD_B --attr="post_ops='relu'" --batch=inputs/conv_all ``` Run the same as previous but also measure performance: ``` - $ ./benchdnn --conv --mode=CORRnPERF \ - --cfg=f32 --dir=FWD_B --merge=RELU --batch=inputs/conv_all + $ ./benchdnn --conv --mode=CORRnPERF \ + --cfg=f32 --dir=FWD_B --attr="post_ops='relu'" --batch=inputs/conv_all ``` -> **note**: instead of `CORRnPERF` one can use `CP`, `PC`, `cp`, or `pc` +> **Note**: Instead of `CORRnPERF`, one can use `CP`, `PC`, `cp`, or `pc` Run a set of f32 backward convolutions wrt weights with kh=3 and verbose level set to 2: @@ -221,18 +249,19 @@ configurations (`u8s8u8s32` and `f32`): --cfg=f32 ic3ih227iw227_oc96oh55ow55_kh11kw11_sh4sw4ph0pw0_n"alexnet:conv1" ``` -Run batch file for different algorithms (assuming the file only specifies -convolutions and does not include harness options that would override ones -passed in the command line). Also ignore mkldnn_unimplemented errors in case of +Run batch file for different algorithms (assuming the file specifies only +convolutions and does not include harness options that would override any +passed on the command line). 
Also ignore mkldnn_unimplemented errors in case of Winograd: ``` $ ./benchdnn --conv \ --alg=DIRECT --batch=convs.in \ --allow-unimpl=true \ - --alg=WINO --batch=convs.in + --alg=WINO --batch=convs.in \ + --alg=AUTO --batch=convs.in ``` -Run a set of u8s8u8s32 forward convolutions w/o bias, skipping +Run a set of u8s8u8s32 forward convolutions without bias, skipping reference implementations and not triggering unimplemented as an error, with one common output scale set to 0.5 with rounding mode set to down (via attributes): @@ -242,42 +271,10 @@ one common output scale set to 0.5 with rounding mode set to down --attr="irmode=down;oscale=common:.5" --batch=inputs/conv_all ``` -Almost the same as above (with minor changes), but also add post operation -sequence **(relu, then sum with scale .3, then relu)** using -attributes/mkldnn_post_ops_t: -``` - $ ./benchdnn --conv \ - --cfg=u8s8s32s32 --dir=FWD_B \ - --attr="oscale=common:.5;post_ops='relu;sum:.3;relu'" --batch=inputs/conv_all -``` - - -## Notations / Glossary / Abbreviations - -|Abbreviation | Description -|:--- |:--- -| src | Source image (input image for forward convolution) -| wei | Weights (aka filter) -| bia | Bias -| dst | Destination image (output image for forward convolution) -| acc | Accumulation (typically in terms of data type) -| ic, oc | Input/Output channels (aka feature maps) -| ih, iw | Input height and width -| oh, ow | Output height and width -| kh, kw | Kernel (filter, weights) height and width -| sh, sw | Convolution stride over height and width -| ph, pw | Convolution top and left padding -| mb | Minibatch (amount of images processed at once) -| g | Groups (a way to reduce the amount of computations, see Alexnet topology) -| FWD_{D,B} | forward w/o and w/ bias -| BWD_{D,W,WB} | backward wrt data, weights, and weights and bias -| DIRECT, WINO | convolution algorithm: direct or Winograd based -| NONE, RELU | merged primitives: nothing or ReLU ## Usage (batch normalization harness) -The usage: ``` ./benchdnn --bnorm [harness-knobs] bnorm-desc ... ``` @@ -290,7 +287,7 @@ where *harness-knobs* are: - `--fmt={nchw, nChw16c, ...}` data layout, default `nchw` - `--flags=[|G|S|R]` batch normalization flags, default `none` (G -- global stats, S -- use scale shift, R -- fuse with ReLU) - `--attr="attr_str"` attributes (see in the convolution section above), default `""` (no attributes set) - - `--match=regex` check only convolutions that match with regex, default is `".*"`. Notice: Windows may only interpret string arguments surrounded by double quotation marks. + - `--match=regex` check only bnorm that match with regex, default is `".*"`. Notice: Windows may only interpret string arguments surrounded by double quotation marks. - `--skip-impl="str1[:str2]..."` skip implementation (see mkldnn_query_impl_info_str), default `""` - `--perf-template=template-str` set template for performance report (very similar to the convolution one) - `--reset` reset all the parameters set before to default one @@ -299,10 +296,10 @@ where *harness-knobs* are: and *bnorm-desc* is a batch normalization description. The canonical form is: ``` - mbXicXihXiwXepsYnS + mbXicXidXihXiwXepsYnS ``` -Here X is an integer number, Y is a real number, and S is string (n stands for -name). Special symbol `_` is ignored, hence maybe used as delimiter. There are +Here X is an integer number, Y is a real number, and S is a string (n stands for +name). The special symbol `_` is ignored, so it may be used as delimiter. 
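To make the bnorm descriptor grammar concrete, here is a minimal, hypothetical C++ sketch of the fields behind the canonical form `mbXicXidXihXiwXepsYnS`, with defaults taken from the implicit rules listed right after this note; the `BnormDesc` type is illustrative only and not an mkl-dnn type:

```
#include <string>

// Hypothetical sketch of what a bnorm descriptor string carries; the
// defaults encode the implicit rules quoted below (mb -> 2, eps -> 1/16).
struct BnormDesc {
    int mb = 2;                  // minibatch, defaults to 2 when omitted
    int ic = 0;                  // input channels (must be given)
    int id = 1, ih = 0, iw = 0;  // spatial dims; id = 1 for 2D problems
    double eps = 1. / 16;        // defaults to 1/16 when omitted
    std::string name;            // the trailing nS part, e.g. "bn_conv1"
};

// e.g. the perf example later in this section, 50,64,1,112,112,0.0625,
// corresponds to {mb=50, ic=64, id=1, ih=112, iw=112, eps=0.0625}.
```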
There are some implicit rules: - if mb is omitted set mb to 2 @@ -310,39 +307,516 @@ some implicit rules: - if eps is omitted set eps to 1./16 +### Performance measurements (batch normalization harness) + +**benchdnn** supports a custom performance report. A template is passed via the +command line and consists of terminal and nonterminal symbols. Nonterminal +symbols are printed as-is. A description of terminal symbols is given below. +There is also a notion of modifiers (marked with @) that change the meaning of +terminal symbols; for example, the sign '-' means minimum of (in terms of time). See the +table of modifiers below. + +> **Caution:** Threads must be pinned in order to get consistent frequency. + +| abbreviation | description +|:------------ |:----------- +| %d | problem descriptor +| %D | expanded problem descriptor (parameters in csv format) +| %n | problem name +| %z | direction +| %f | flags +| %q | data type (precision) +| %f | data format (layout) +| %@t | time in ms + +The definition of expanded problem descriptor is: `mb,ic,id,ih,iw,eps`. + +The default template can be found in bnorm/bench_bnorm.cpp and is defined as +`perf,%n,%z,%f,%q,%f,%D,%-t,%0t`. That will produce the following output +in CSV format: +``` +string: perf +bnorm name +direction +batch normalization flags +base data type +batch normalization flags +expanded bnorm problem descriptor +minimum time spent in ms +average time spent in ms +``` +Here is an example of performance output: +``` +perf,"resnet_50:bn_conv1",FWD_D,,f32,,50,64,1,112,112,0.0625,10.7729,77.1917 +``` +expanded bnorm problem descriptor is `50,64,1,112,112,0.0625` in the above example. + +### Examples (batch normalization harness) + +Run the set of bnorms from inputs/bnorm/bnorm_resnet_50 file with default minibatch: +``` + $ ./benchdnn --bnorm \ + --batch=inputs/bnorm/bnorm_resnet_50 +``` + +Run the same as previous but also measure performance: +``` + $ ./benchdnn --bnorm --mode=CORRnPERF \ + --batch=inputs/bnorm/bnorm_resnet_50 +``` + + +## Usage (rnn harness) + +``` + ./benchdnn --rnn [harness-knobs] [rnn-desc] ... +``` + +where *harness-knobs* are: + + - `--prop={FWD_D (forward data), BWD_DW (backward data + weights)}` propagation kind, default `FWD_D` + - `--alg={VANILLA_RNN, VANILLA_LSTM, VANILLA_GRU, LBR_GRU}` algorithm, default `VANILLA_RNN` + - `--direction={left2right, right2left, concat, sum}` direction, default `left2right` + - `--activation={RELU, LOGISTIC, TANH}` activation, default `RELU` + - `--reset` reset all the parameters set before to default one + - `--batch=file` use options from the given file (see in subdirectory) + +and *rnn-desc* is the rnn description. The canonical form is: +``` + lXtXmbXsicXslcXdicXdlc +``` +Here X is a number and S is a string.
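A minimal, hypothetical C++ sketch of the rnn descriptor defaults, directly encoding the implicit rules listed next (the `RnnDesc` type and `applyDefaults` helper are invented for illustration, not benchdnn code):

```
// Hypothetical sketch: rnn descriptor fields for lXtXmbXsicXslcXdicXdlc,
// with the implicit rules below (l = 1, t = 1, mb = 2; undefined
// slc/dlc/dic fall back to sic).
struct RnnDesc {
    int l = 1, t = 1, mb = 2;
    int sic = 0;                    // must be given
    int slc = 0, dic = 0, dlc = 0;  // 0 means "undefined" here
};

inline RnnDesc applyDefaults(RnnDesc d) {
    if (d.slc == 0) d.slc = d.sic;
    if (d.dic == 0) d.dic = d.sic;
    if (d.dlc == 0) d.dlc = d.sic;
    return d;
}

// e.g. "l1t1mb128sic512" expands to slc = dic = dlc = 512, matching the
// l1t1mb128sic512slc512dic512dlc512 descriptor in the perf example below.
```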
Some implicit rules: + - default values: l = 1, t = 1, mb = 2, S="wip" + + - if slc/dlc/dic is undefined => slc/dlc/dic = sic + +See `str2desc()` in rnn/rnn_aux.cpp +for more details and implicit rules. + +### Performance measurements (rnn harness) + + +Running rnn with performance measurement mode will produce the following output +in CSV format: +``` +string: perf +algorithm +activation function +direction +expanded rnn problem descriptor +name +time spent in ms +minimum time spent in ms +maximum time spent in ms +average time spent in ms +``` +Here is an example of performance output: +``` +perf,VANILLA_RNN,RELU,left2right,l1t1mb128sic512slc512dic512dlc512n""GNMT_enc-training"",time(ms):min=68.0007,max=176.006,avg=91.2686 +``` +expanded rnn problem descriptor is `l1t1mb128sic512slc512dic512dlc512n` in the above example. + +### Examples (rnn harness) + +Run the set of rnn training problems from the inputs/rnn/rnn_training file with default minibatch: +``` + $ ./benchdnn --rnn \ + --batch=inputs/rnn/rnn_training +``` + +Run the same as previous but also measure performance: +``` + $ ./benchdnn --rnn --mode=CORRnPERF \ + --batch=inputs/rnn/rnn_training +``` + + +## Usage (deconvolution harness) + +``` + ./benchdnn --deconv [harness-knobs] [deconv-desc] ... +``` + +where *harness-knobs* are: + + - `--cfg={f32, u8s8u8s32, ...}` configuration (see the [convolution configuration](/tests/benchdnn/README.md#convolution-configurations-also-known-as-precision-specification) section above), default `f32` + - `--match=regex` check only deconvolutions that match with regex, default is `".*"`. Notice: Windows may only interpret string arguments surrounded by double quotation marks. + - `--mb=N` override minibatch that is specified in deconvolution description, default `0` (use mb specified in deconv desc) + - `--dir={FWD_D (forward data), FWD_B (forward data + bias),FWD_I (forward data inference), BWD_D (backward data), BWD_W (backward weights), BWD_WB (backward weights + bias)}` direction, default `FWD_B` + - `--alg={DIRECT, WINO, AUTO}` deconvolution algorithm, default DIRECT + - `--attr="attr_str"` deconvolution attributes (see in the convolution section above), default `""` (no attributes set) + - `--skip-impl="str1[:str2]..."` skip implementation (see mkldnn_query_impl_info_str), default `""` + - `--allow-unimpl=true|false` do not treat unimplemented configuration as an error, default `false` + - `--perf-template=template-str` set template for performance report (see section *Performance measurements*) + - `--mode=` string that contains flags for benchmark mode. Use `C` or `c` for correctness (used by default), and `P` or `p` for performance + - `--reset` reset all the parameters set before to default one + - `-vN|--verbose=N` verbose level, default `0` + - `--batch=file` use options from the given file (see in subdirectory) + +and *deconv-desc* is the deconvolution description. The canonical form is: +``` + gXmbXicXihXiwXocXohXowXkhXkwXshXswXphXpwXdhXdwXnS +``` +Here X is a number and S is a string (n stands for name). Some of the parameters +might be omitted if a default exists (e.g. if g is not specified +**benchdnn** uses 1) or if they can be computed automatically (e.g. the output shape +can be derived from the input one and the kernel). Also, if either width or height +is not specified, it is assumed that height == width. The special symbol `_` is +ignored, so it may be used as a delimiter.
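The "computed automatically" rule above can be made concrete: for the forward convolution orientation, an omitted output dimension follows from the input size, kernel, stride, and padding via the same relation used for `c3_dst_sizes` in the tests/api.c hunk of this patch; for deconvolution, input and output swap roles. A small sketch (the helper name is hypothetical):

```
// Hypothetical helper: derive an output spatial dimension, mirroring
// (i + 2 * p - k) / s + 1 as used in tests/api.c of this patch.
inline int derivedOutputDim(int i, int k, int s, int p) {
    return (i + 2 * p - k) / s + 1;
}

// e.g. ih = 227, kh = 11, sh = 4, ph = 0 gives oh = (227 - 11) / 4 + 1 = 55,
// matching the alexnet:conv1 descriptor shown in the convolution examples.
```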
+
+### Performance measurements (rnn harness)
+
+Running rnn in performance measurement mode will produce the following output
+in CSV format:
+```
+string: perf
+algorithm
+activation function
+direction
+expanded rnn problem descriptor (including the name)
+minimum time spent in ms
+maximum time spent in ms
+average time spent in ms
+```
+Here is an example of performance output:
+```
+perf,VANILLA_RNN,RELU,left2right,l1t1mb128sic512slc512dic512dlc512n""GNMT_enc-training"",time(ms):min=68.0007,max=176.006,avg=91.2686
+```
+The expanded rnn problem descriptor is `l1t1mb128sic512slc512dic512dlc512n""GNMT_enc-training""` in the above example.
+
+### Examples (rnn harness)
+
+Run the set of rnn training problems from the inputs/rnn/rnn_training file with the default minibatch:
+```
+    $ ./benchdnn --rnn \
+        --batch=inputs/rnn/rnn_training
+```
+
+Run the same as previous but also measure performance:
+```
+    $ ./benchdnn --rnn --mode=CORRnPERF \
+        --batch=inputs/rnn/rnn_training
+```
+
+
+## Usage (deconvolution harness)
+
+```
+    ./benchdnn --deconv [harness-knobs] [deconv-desc] ...
+```
+
+where *harness-knobs* are:
+
+ - `--cfg={f32, u8s8u8s32, ...}` configuration (see the [convolution configurations](/tests/benchdnn/README.md#convolution-configurations-also-known-as-precision-specification) section above), default `f32`
+ - `--match=regex` check only deconvolutions that match the regex, default `".*"`. Note that Windows may only interpret string arguments surrounded by double quotation marks.
+ - `--mb=N` override the minibatch that is specified in the deconvolution description, default `0` (use the mb specified in the deconv desc)
+ - `--dir={FWD_D (forward data), FWD_B (forward data + bias), FWD_I (forward data inference), BWD_D (backward data), BWD_W (backward weights), BWD_WB (backward weights + bias)}` direction, default `FWD_B`
+ - `--alg={DIRECT, WINO, AUTO}` deconvolution algorithm, default `DIRECT`
+ - `--attr="attr_str"` deconvolution attributes (see the convolution section above), default `""` (no attributes set)
+ - `--skip-impl="str1[:str2]..."` skip implementation (see mkldnn_query_impl_info_str), default `""`
+ - `--allow-unimpl=true|false` do not treat unimplemented configuration as an error, default `false`
+ - `--perf-template=template-str` set template for performance report (see section *Performance measurements*)
+ - `--mode=` string that contains flags for benchmark mode. Use `C` or `c` for correctness (used by default), and `P` or `p` for performance
+ - `--reset` reset all the parameters set so far to their defaults
+ - `-vN|--verbose=N` verbose level, default `0`
+ - `--batch=file` use options from the given file (see in subdirectory)
+
+and *deconv-desc* is the deconvolution description. The canonical form is:
+```
+    gXmbXicXihXiwXocXohXowXkhXkwXshXswXphXpwXdhXdwXnS
+```
+Here X is a number and S is a string (n stands for name). Some of the parameters
+may be omitted if a default exists (e.g. if g is not specified,
+**benchdnn** uses 1) or if they can be computed automatically (e.g. the output
+shape can be derived from the input one and the kernel). Also, if either width
+or height is not specified, then it is assumed that height == width. The special
+symbol `_` is ignored, so it may be used as a delimiter.
+See `str2desc()` in conv/conv_aux.cpp
+for more details and implicit rules :^)
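+
+To illustrate the implicit shape rules, below is a sketch of how the output
+size and the padding can be derived from each other. This shows the idea for
+the forward-convolution case only; the real `str2desc()` also handles the
+deconvolution case, where the roles of input and output are swapped:
+
+```
+#include <cstdio>
+
+// Effective kernel extent with dilation d (0 means no dilation).
+static int ext_kernel(int k, int d) { return (k - 1) * (d + 1) + 1; }
+
+// Output size for input i, kernel k, stride s, padding p, dilation d.
+static int compute_out(int i, int k, int s, int p, int d) {
+    return (i - ext_kernel(k, d) + 2 * p) / s + 1;
+}
+
+// Padding implied by a requested output size o.
+static int compute_pad(int o, int i, int k, int s, int d) {
+    return ((o - 1) * s - i + ext_kernel(k, d)) / 2;
+}
+
+int main() {
+    // From ih300kh3oh300ph1 ("ssd_300_voc0712:conv1_1" in inputs/conv_auto):
+    // padding 1 makes a 3x3, stride-1 kernel preserve the 300-pixel input.
+    std::printf("oh = %d\n", compute_out(300, 3, 1, 1, 0)); // 300
+    // Conversely, requesting oh = 300 implies ph = 1:
+    std::printf("ph = %d\n", compute_pad(300, 300, 3, 1, 0)); // 1
+    return 0;
+}
+```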
+
+
+### Performance measurements (deconvolution harness)
+
+**benchdnn** supports a custom performance report. Please refer to the
+[Performance measurements (convolution harness)](/tests/benchdnn/README.md#performance-measurements-convolution-harness) section above for details.
+
+The default template can be found in conv/bench_deconv.cpp and is defined as
+`perf,%n,%d,%GO,%GF,%-t,%-Gp,%0t,%0Gp`. That will produce the following output
+in CSV format:
+```
+string: perf
+deconvolution name
+full deconv-desc
+number of giga ops calculated
+effective cpu frequency in GHz (amb clocks[min] / time[min])
+minimum time spent in ms
+best gigaops (since it corresponds to minimum time)
+average time spent in ms
+average gigaops (since it corresponds to average time)
+```
+Here is an example of performance output:
+```
+ perf,"alexnet:deconv1",mb256ic96ih55oc3oh227kh11sh4n"alexnet:deconv1",2.9733,0,249.474,11.9183,307.702,9.66291
+```
+The full deconvolution descriptor is `mb256ic96ih55oc3oh227kh11sh4n"alexnet:deconv1"` in the above example.
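+
+The `%-Gp`/`%0Gp` fields are throughput figures: the total work divided by the
+time it took. A quick sanity check against the example above (back-of-the-
+envelope arithmetic only, not benchdnn code):
+
+```
+#include <cstdio>
+
+// Throughput in Gops/s given total giga-ops of work and time in ms.
+static double gigaops(double giga_ops, double time_ms) {
+    return giga_ops / (time_ms / 1e3);
+}
+
+int main() {
+    // Numbers from the example output above: 2.9733 Gops of work,
+    // minimum time 249.474 ms, average time 307.702 ms.
+    std::printf("best: %.4f\n", gigaops(2.9733, 249.474)); // ~11.9183
+    std::printf("avg:  %.5f\n", gigaops(2.9733, 307.702)); // ~9.66291
+    return 0;
+}
+```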
+
+### Examples (deconvolution harness)
+
+Run the set of f32 forward deconvolutions from the inputs/deconv_all file with bias and the default minibatch:
+```
+    $ ./benchdnn --deconv \
+        --cfg=f32 --dir=FWD_B --batch=inputs/deconv_all
+```
+
+Run the same as previous but also measure performance:
+```
+    $ ./benchdnn --deconv --mode=CORRnPERF \
+        --cfg=f32 --dir=FWD_B --batch=inputs/deconv_all
+```
+
+## Usage (ip harness)
+
+```
+    ./benchdnn --ip [harness-knobs] [ip-desc] ...
+```
+
+where *harness-knobs* are:
+
+ - `--cfg={f32, u8s8u8s32, ...}` configuration (see the [convolution configurations](/tests/benchdnn/README.md#convolution-configurations-also-known-as-precision-specification) section above), default `f32`
+ - `--mb=N` override the minibatch that is specified in the ip description, default `0` (use the mb specified in the ip desc)
+ - `--dir={FWD_D (forward data), FWD_B (forward data + bias), FWD_I (forward data inference), BWD_D (backward data), BWD_W (backward weights), BWD_WB (backward weights + bias)}` direction, default `FWD_B`
+ - `--attr="attr_str"` ip attributes (see the convolution section above), default `""` (no attributes set)
+ - `--allow-unimpl=true|false` do not treat unimplemented configuration as an error, default `false`
+ - `--perf-template=template-str` set template for performance report (see section *Performance measurements*)
+ - `--mode=` string that contains flags for benchmark mode. Use `C` or `c` for correctness (used by default), and `P` or `p` for performance
+ - `--reset` reset all the parameters set so far to their defaults
+ - `-vN|--verbose=N` verbose level, default `0`
+ - `--batch=file` use options from the given file (see in subdirectory)
+
+and *ip-desc* is the ip description. The canonical form is:
+```
+    mbXicXidXihXiwXocXnS
+```
+Here X is a number and S is a string (n stands for name).
+The special symbol `_` is ignored, so it may be used as a delimiter.
+Some implicit rules:
+ - default values: mb = 2, id = 1, S = "wip"
+
+ - if H is undefined => H = W
+
+ - if W is undefined => W = H
+
+See `str2desc()` in ip/ip_aux.cpp
+for more details and implicit rules :^)
+
+### Performance measurements (ip harness)
+
+**benchdnn** supports a custom performance report. A template is passed via the
+command line and consists of terminal and nonterminal symbols. Nonterminal
+symbols are printed as-is. A description of terminal symbols is given below.
+There is also a notion of modifiers (marked with @) that change the meaning of
+terminal symbols; for example, the sign '-' means minimum (in terms of time).
+See the table of modifiers below.
+
+> **Caution:** Threads must be pinned in order to get consistent frequency.
+
+| abbreviation | description
+|:------------ |:-----------
+| %d | problem descriptor
+| %D | expanded problem descriptor (parameters in csv format)
+| %n | problem name
+| %z | direction
+| %F | flags
+| %q | data type (precision)
+| %f | data format (layout)
+| %@t | time in ms
+
+The definition of the expanded problem descriptor is: `mb,oc,ic,id,ih,iw`.
+
+The default template can be found in ip/bench_ip.cpp and is defined as
+`perf,%D,%n,%z,%q,%-t,%-Gp,%0t,%0Gp`. That will produce the following output
+in CSV format:
+```
+string: perf
+expanded ip problem descriptor
+name
+direction
+data type
+minimum time spent in ms
+best gigaops (since it corresponds to minimum time)
+average time spent in ms
+average gigaops (since it corresponds to average time)
+```
+
+Here is an example of performance output:
+```
+perf,112,1000,2048,1,1,1,"resnet:ip1",FWD_B,f32,3.99976,114.695,19.0323,24.1039
+```
+The expanded ip problem descriptor is `112,1000,2048,1,1,1` in the above example.
+
+### Examples (ip harness)
+
+Run the set of ips from the inputs/ip/ip_all file with the default minibatch:
+```
+    $ ./benchdnn --ip \
+        --batch=inputs/ip/ip_all
+```
+
+Run the same as previous but also measure performance:
+```
+    $ ./benchdnn --ip --mode=CORRnPERF \
+        --batch=inputs/ip/ip_all
+```
+
+## Usage (shuffle harness)
+
+```
+    ./benchdnn --shuffle [harness-knobs] [dim]...
+```
+
+where *harness-knobs* are:
+
+ - `--match=regex` check only shuffles that match the regex, default `".*"`. Note that Windows may only interpret string arguments surrounded by double quotation marks.
+ - `--dir={FWD_D (forward data), FWD_B (forward data + bias), FWD_I (forward data inference), BWD_D (backward data), BWD_W (backward weights), BWD_WB (backward weights + bias)}` direction, default `FWD_B`
+ - `--dt={f32, s32, ...}` base data type, default `f32`
+ - `--fmt={nchw, nChw16c, ...}` data layout, default `nchw`
+ - `--axis=N` shuffle axis, default `1`
+ - `--group=N` group size, default `1`
+ - `--mode=` string that contains flags for benchmark mode. Use `C` or `c` for correctness (used by default), and `P` or `p` for performance
+ - `-vN|--verbose=N` verbose level, default `0`
+ - `--batch=file` use options from the given file (see in subdirectory)
+
+and *dim* is the problem dimensions. The canonical form is:
+```
+    dxdxdxdxd
+```
+Here d is a number.
+
+See `str2dims()` in shuffle/shuffle_aux.cpp for more details.
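+
+For reference, `axis` and `group` describe a channel-shuffle-style
+permutation along the chosen axis. Below is a minimal sketch of the commonly
+used definition (an illustration of the idea only, not the actual benchdnn or
+Intel MKL-DNN implementation):
+
+```
+#include <cstdio>
+#include <vector>
+
+// Shuffle a 1-D view of C channels split into g groups:
+// reshape (g, C/g) -> transpose -> flatten, expressed as a gather.
+static std::vector<int> shuffle(const std::vector<int> &src, int g) {
+    const int C = (int)src.size();
+    std::vector<int> dst(C);
+    for (int o = 0; o < C; ++o)
+        dst[o] = src[(o % g) * (C / g) + o / g];
+    return dst;
+}
+
+int main() {
+    std::vector<int> channels = {0, 1, 2, 3, 4, 5};
+    for (int c : shuffle(channels, 2)) std::printf("%d ", c); // 0 3 1 4 2 5
+    std::printf("\n");
+    return 0;
+}
+```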
+### Performance measurements (shuffle harness)
+
+**benchdnn** supports a custom performance report. A template is passed via the
+command line and consists of terminal and nonterminal symbols. Nonterminal
+symbols are printed as-is. A description of terminal symbols is given below.
+There is also a notion of modifiers (marked with @) that change the meaning of
+terminal symbols; for example, the sign '-' means minimum (in terms of time).
+See the table of modifiers below.
+
+> **Caution:** Threads must be pinned in order to get consistent frequency.
+
+| abbreviation | description
+|:------------ |:-----------
+| %d | problem descriptor
+| %D | expanded problem descriptor (parameters in csv format)
+| %z | direction
+| %q | data type (precision)
+| %f | data format (layout)
+| %a | axis
+| %g | group size
+| %@t | time in ms
+
+The definition of the expanded problem descriptor is: `dxdxdxdxd`.
+
+The default template can be found in shuffle/bench_shuffle.cpp and is defined as
+`perf,%z,%q,%f,%D,%a,%g,%-t,%0t`. That will produce the following output
+in CSV format:
+```
+string: perf
+direction
+data type
+data format
+expanded shuffle problem descriptor
+axis
+group size
+minimum time spent in ms
+average time spent in ms
+```
+Here is an example of performance output:
+```
+perf,FWD_D,u8,nCdhw16c,1x272x2x56x56,4,4,11.6177,16.509
+```
+The expanded shuffle problem descriptor is `1x272x2x56x56` in the above example.
+
+### Examples (shuffle harness)
+
+Run the set of shuffles from the inputs/shuffle/test_shuffle_axis file:
+```
+    $ ./benchdnn --shuffle \
+        --batch=inputs/shuffle/test_shuffle_axis
+```
+
+Run the same as previous but also measure performance:
+```
+    $ ./benchdnn --shuffle --mode=CORRnPERF \
+        --batch=inputs/shuffle/test_shuffle_axis
+```
+
+## Usage (reorder harness)
+
+```
+    ./benchdnn --reorder [harness-knobs] ...
+```
+
+where *harness-knobs* are:
+
+ - `--idt={f32, s32, ...}` base input data type, default `f32`
+ - `--odt={f32, s32, ...}` base output data type, default `f32`
+ - `--dt={f32, s32, ...}` base data type, default `f32`
+ - `--ifmt={nchw, nChw16c, ...}` input data layout, default `nchw`
+ - `--ofmt={nchw, nChw16c, ...}` output data layout, default `nchw`
+ - `--fmt={nchw, nChw16c, ...}` data layout, default `nchw`
+ - `--def-scales={,,}` comma-separated list of input scales to use, e.g. `0.125,0.25,0.5,1,2,4,8`
+ - `--attr="attr_str"` reorder attributes (see the convolution section above), default `""` (no attributes set)
+ - `--both-dir-dt=true|false`, default `false`
+ - `--both-dir-fmt=true|false`, default `false`
+ - `--allow-unimpl=true|false` do not treat unimplemented configuration as an error, default `false`
+ - `--run` run reorder bench
+ - `--perf-template=template-str` set template for performance report (see section *Performance measurements*)
+ - `--reset` reset all the parameters set so far to their defaults
+ - `--mode=` string that contains flags for benchmark mode. Use `C` or `c` for correctness (used by default), and `P` or `p` for performance
+ - `-vN|--verbose=N` verbose level, default `0`
+ - `--batch=file` use options from the given file (see in subdirectory)
+
+### Performance measurements (reorder harness)
+
+**benchdnn** supports a custom performance report. A template is passed via the
+command line and consists of terminal and nonterminal symbols. Nonterminal
+symbols are printed as-is. A description of terminal symbols is given below.
+There is also a notion of modifiers (marked with @) that change the meaning of
+terminal symbols; for example, the sign '-' means minimum (in terms of time).
+See the table of modifiers below.
+
+> **Caution:** Threads must be pinned in order to get consistent frequency.
+
+| abbreviation | description
+|:------------ |:-----------
+| %d | problem descriptor
+| %D | expanded problem descriptor (reorder parameters in csv format)
+| %n | dimensionality of the problem
+| %@O | number of elements being reordered
+| %@t | time in ms
+| %@p | elements per second
+
+| modifier | description
+|:-------- |:-----------
+|          | default
+| -        | min (time) -- default
+| 0        | avg (time)
+| +        | max (time)
+|          |
+| K        | Kilo (1e3)
+| M        | Mega (1e6)
+| G        | Giga (1e9)
+
+The definition of the expanded problem descriptor is:
+`idt,odt,ifmt,ofmt,attrs,dims`.
+
+The default template can be found in reorder/bench_reorder.cpp and is defined as
+`perf,%n,%D,%O,%-t,%-Gp,%0t,%0Gp`. That will produce the following output
+in CSV format:
+```
+string: perf
+dimensionality of the problem
+expanded reorder problem descriptor
+number of elements being reordered
+minimum time spent in ms
+best giga-elements per second (since it corresponds to minimum time)
+average time spent in ms
+average giga-elements per second (since it corresponds to average time)
+```
+Here is an example of performance output:
+```
+ perf,4,f32,f32,nchw,nchw,irmode=nearest;oscale=per_oc:0.125;post_ops='',2x64x3x3,1152,4.00244,0.000287824,24.0279,4.79442e-05
+```
+The expanded reorder problem descriptor is `f32,f32,nchw,nchw,irmode=nearest;oscale=per_oc:0.125;post_ops='',2x64x3x3` in the above example.
+
+### Examples (reorder harness)
+
+Run the set of reorders from the inputs/reorder/test_default file:
+```
+    $ ./benchdnn --reorder \
+        --batch=inputs/reorder/test_default
+```
+
+Run the same as previous but also measure performance:
+```
+    $ ./benchdnn --reorder --mode=CORRnPERF \
+        --batch=inputs/reorder/test_default
+```
+
+## Usage (self harness)
+
+```
+    ./benchdnn --self ...
+```
+
+Checks enumeration types, attributes, flags, and descriptions.
+
 
 ## Installation
 
-**benchdnn** is automatically built with Intel MKL-DNN. For the convenience one
-may build **benchdnn** using cmake or make.
+**benchdnn** is automatically built with Intel MKL-DNN. For convenience, you can
+build **benchdnn** using cmake or make.
 
 ## Essence of convolution testing
 
-Intel MKL-DNN supports different data types, such as single precision floating
-point (`mkldnn_f32`), signed/unsigned integer of different length
-(`mkldnn_{s,u}{8,16,32}`). We need to cover all those cases by tests. It is
-essential to test real convolution sizes, since Intel MKL-DNN provides
-different optimizations depending on convolution parameters, so there is no
-one unified approach inside, which means it would not be enough to test only
-few convolutions (aka unit tests).
-
-But even for given convolution the correctness convolution test is not as
-simple as it might seem to be at first sight. One of the biggest problem we
-encountered is numerical instability. For every output point a lot of
-operations may happen. For instance on backward propagation with respect to
-filter each filter point requires `mb * oh * ow` operations (see *Notation*
-section below). That big amount of compute operations may lead to either
+Intel MKL-DNN supports different data types, such as single-precision floating
+point (`mkldnn_f32`) and signed/unsigned integer of different length
+(`mkldnn_{s,u}{8,16,32}`). We need to cover all those cases with tests. It is
+essential to test real convolution sizes, because Intel MKL-DNN provides
+different optimizations depending on convolution parameters. There is no
+single unified approach inside, so it would not be enough to test only a few
+convolutions (also known as unit tests).
+
+But even for a given convolution, the correctness convolution test is not as
+simple as it might seem at first sight. One of the biggest problems we
+encountered is numerical instability. For every output point, a lot of
+operations may occur. For instance, on backward propagation with respect to
+filter, each filter point requires `mb * oh * ow` operations (see the *Notation*
+section below). That large amount of compute operations may lead to either
 integer overflow or accuracy loss if initial data was chosen inadequately.
 
-These two main things complicate testing. **benchdnn** tries to address these
-issues by using integers for initialization with uniform distribution in a
+These two main issues complicate testing. **benchdnn** tries to address these
+by using integers for initialization with uniform distribution in a
 range `[cfg->f_min .. cfg->f_max]`, with the step `cfg->f_step` (see
 `struct dt_conf_t` in conv/conv.hpp). `f_min` and `f_max` are chosen so
-that most of the result would belong `[cfg->min .. cfg->max]` range. Also
-for floating point all integers in both ranges have exact representation (i.e.
+that most of the results would belong in the `[cfg->min .. cfg->max]` range. Also
+for floating point all integers in both ranges have exact representation (that is,
 the absolute numbers are less than `2^size_of_mantissa`). Uniform distribution
-leads to have result uniformly distributed and quite small `f_min/f_max` keep
+leads to results that are uniformly distributed and quite small. `f_min/f_max` keep
 the result in a reasonable range. Yet another trick: not all the points are
 initialized with non-zero values: see `fill_{src,wei,bia,dst}` in
 conv/conv.cpp.
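+
+As a toy illustration of this initialization scheme (a sketch of the idea
+only, not the actual `fill_*` code; the parameter values below are made up):
+
+```
+#include <cstdio>
+#include <cstdlib>
+
+// Fill an f32 buffer with integers drawn uniformly from [f_min .. f_max]
+// on a grid with step f_step, leaving some points zero for sparsity.
+static void fill(float *buf, int n, int f_min, int f_max, int f_step,
+        double density) {
+    for (int i = 0; i < n; ++i) {
+        bool nonzero = (double)std::rand() / RAND_MAX < density;
+        int grid = (f_max - f_min) / f_step + 1;
+        int v = f_min + f_step * (std::rand() % grid);
+        // every value is an integer far below 2^24, hence exactly
+        // representable in f32, so small dot products stay exact
+        buf[i] = nonzero ? (float)v : 0.f;
+    }
+}
+
+int main() {
+    float src[8];
+    fill(src, 8, -32, 32, 1, 0.25);
+    for (float v : src) std::printf("%g ", v);
+    std::printf("\n");
+    return 0;
+}
+```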
@@ -350,14 +824,14 @@ conv/conv.cpp.
 
 ## Further plans
 
-Please see TODO.md in **benchdnn** root directory for development plans.
+Please see TODO.md in the **benchdnn** root directory for development plans.
 
 ## Issues and contributions
 
-We welcome community contributions to **benchdnn** as well as Intel MKL-DNN.
+We welcome community contributions to **benchdnn** as well as to Intel MKL-DNN.
 If you have any ideas or issues please submit an issue or pull request. For
-clarity please include ''benchdnn: '' in the title.
+clarity, please include ''benchdnn: '' in the title.
## Inspiration diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/bnorm/bench_bnorm.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/bnorm/bench_bnorm.cpp index 3675176..4c0a835 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/bnorm/bench_bnorm.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/bnorm/bench_bnorm.cpp @@ -41,7 +41,7 @@ attr_t attr; const char *pattern = NULL; const char *skip_impl = ""; bool allow_unimpl = false; -const char *perf_template = "perf,%n,%z,%f,%q,%f,%D,%-t,%0t"; +const char *perf_template = "perf,%n,%z,%F,%q,%f,%D,%-t,%0t"; void reset_parameters() { check_alg = ALG_AUTO; diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/bnorm/bnorm.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/bnorm/bnorm.cpp index 7a6c81c..0d47b9e 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/bnorm/bnorm.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/bnorm/bnorm.cpp @@ -371,15 +371,15 @@ static int compare(const prb_t *p, data_kind_t kind, const dnn_mem_t &fp_mem, int check_fwd_ws(const dnn_mem_t &data_dt, const dnn_mem_t &ws_dt, res_t *r) { /* so far we know ws is just bit-mask of whether value was negative or * positive */ - const size_t nelems = data_dt.nelems(); + const size_t nelems = data_dt.nelems(true); const float *d = (const float *)data_dt; const uint8_t *ws = (const uint8_t *)ws_dt; /* some internal knowledge: flags in ws are either stored as bytes (e.g. * for the ref implementation) or as bits (e.g. for the jitted one); in - * the first case the ws memory has fewer elements than the data memory */ + * the latter case the ws memory has fewer elements than the data memory */ enum { ws_byte, ws_bit } ws_type; - ws_type = ws_dt.nelems() < nelems ? ws_bit : ws_byte; + ws_type = ws_dt.nelems(true) < nelems ? ws_bit : ws_byte; /* more internal knowledge: data_dt and ws_dt are expected to have exactly * the same data layout, and data_dt padded regions are expected to be @@ -488,8 +488,9 @@ static int cvt_mask_to_ws(const prb_t *p, const dnn_mem_t &mask_fp, is_bnorm_3d(p) ? data_dims_3d : data_dims, mkldnn_f32, p->fmt); SAFE(data.reorder(mask_fp), WARN); - dnn_mem_t mean(1, &p->ic, mkldnn_f32, mkldnn_x); - dnn_mem_t var(1, &p->ic, mkldnn_f32, mkldnn_x); + ptrdiff_t ic = p->ic; + dnn_mem_t mean(1, &ic, mkldnn_f32, mkldnn_x); + dnn_mem_t var(1, &ic, mkldnn_f32, mkldnn_x); for (int c = 0; c < p->ic; ++c) ((float *)mean)[c] = 0.5; for (int c = 0; c < p->ic; ++c) ((float *)var)[c] = 1; @@ -603,8 +604,7 @@ int doit(const prb_t *p, res_t *r) { SAFE(compare(p, MEAN, mean_fp, mean_dt, r), WARN); SAFE(compare(p, VAR, var_fp, var_dt, r), WARN); } - dnn_mem_t data(data_dt.md_, fp, src_format); - SAFE(data.reorder(data_dt), WARN); + dnn_mem_t data(data_dt, fp, src_format); SAFE(compare(p, DATA, data_fp, data, r), WARN); if ((p->flags & FUSE_BN_RELU) && !(p->dir & FLAG_INF)) SAFE(check_fwd_ws(data_dt, ws_dt, r), WARN); @@ -652,9 +652,8 @@ int doit(const prb_t *p, res_t *r) { ws_fp, d_data_fp, d_ss_fp); if ((p->flags & USE_SCALESHIFT) && (p->dir & FLAG_WEI)) SAFE(compare(p, SS, d_ss_fp, d_ss_dt, r), WARN); - dnn_mem_t d_data(d_data_dt.md_, fp, + dnn_mem_t d_data(d_data_dt, fp, is_bnorm_3d(p) ? 
mkldnn_ncdhw : mkldnn_nchw); - SAFE(d_data.reorder(d_data_dt), WARN); SAFE(compare(p, DATA, d_data_fp, d_data, r), WARN); } } diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/bnorm/perf_report.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/bnorm/perf_report.cpp index 97399fb..8373d44 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/bnorm/perf_report.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/bnorm/perf_report.cpp @@ -37,7 +37,7 @@ See modifiers at the same place. | %D | expanded problem descriptor (parameters in csv format) | %n | problem name | %z | direction -| %f | flags +| %F | flags | %q | data type (precision) | %f | data format (layout) | %@t | time in ms @@ -100,7 +100,7 @@ void perf_report(const prb_t *p, const res_t *r, const char *pstr) { DPRINT("%s", p->name); else if (c == 'z') DPRINT("%s", dir2str(p->dir)); - else if (c == 'f') + else if (c == 'F') DPRINT("%s", flags2str(p->flags)); else if (c == 'q') DPRINT("%s", dt2str(p->dt)); diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/bench_conv.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/bench_conv.cpp index d3de6ed..1c3db17 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/bench_conv.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/bench_conv.cpp @@ -35,7 +35,6 @@ const char *pattern = NULL; dir_t dir = FWD_B; int mb = 0; alg_t alg = DIRECT; -merge_t merge = NONE; attr_t attr; const char *skip_impl = ""; bool allow_unimpl = false; @@ -47,14 +46,13 @@ void reset_parameters() { dir = FWD_B; mb = 0; alg = DIRECT; - merge = NONE; attr = attr_t(); skip_impl = ""; allow_unimpl = false; } void check_correctness(const desc_t *c) { - const prb_t p(*c, dir, cfg, alg, merge, attr, mb); + const prb_t p(*c, dir, cfg, alg, attr, mb); char pstr[max_prb_len]; prb2str(&p, pstr); @@ -90,8 +88,6 @@ int bench(int argc, char **argv, bool main_bench) { dir = str2dir(argv[arg] + 6); else if (!strncmp("--alg=", argv[arg], 6)) alg = str2alg(argv[arg] + 6); - else if (!strncmp("--merge=", argv[arg], 8)) - merge = str2merge(argv[arg] + 8); else if (!strncmp("--attr=", argv[arg], 7)) SAFE(str2attr(&attr, argv[arg] + 7), CRIT); else if (!strncmp("--skip-impl=", argv[arg], 12)) diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/bench_deconv.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/bench_deconv.cpp index 18792c1..937d50e 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/bench_deconv.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/bench_deconv.cpp @@ -36,7 +36,6 @@ const char *pattern = NULL; dir_t dir = FWD_B; int mb = 0; alg_t alg = DIRECT; -merge_t merge = NONE; attr_t attr; const char *skip_impl = ""; bool allow_unimpl = false; @@ -48,14 +47,13 @@ void reset_parameters() { dir = FWD_B; mb = 0; alg = DIRECT; - merge = NONE; attr = attr_t(); skip_impl = ""; allow_unimpl = false; } void check_correctness(const desc_t *c) { - const prb_t p(*c, dir, cfg, alg, merge, attr, mb); + const prb_t p(*c, dir, cfg, alg, attr, mb, true); char pstr[max_prb_len]; prb2str(&p, pstr); diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/cfg.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/cfg.cpp index a08e1d1..28093fa 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/cfg.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/cfg.cpp @@ -42,6 +42,14 @@ const _dt_conf_t conf_f32 = { {mkldnn_f32,}, }; +const 
_dt_conf_t conf_f32_no_limits = { + {mkldnn_f32, -FLT_MAX, FLT_MAX, -32, 32, 0, 1, .25, 0.}, + {mkldnn_f32, -FLT_MAX, FLT_MAX, -32, 32, 0, 1, 1.0, 0.}, + {mkldnn_f32, -FLT_MAX, FLT_MAX, -512, 512, 0, 1, 1.0, 0.}, + {mkldnn_f32, -FLT_MAX, FLT_MAX, -32, 32, 0, 1, .25, 0.}, + {mkldnn_f32,}, +}; + const _dt_conf_t conf_f32_full = { {mkldnn_f32, -int_max_exact, int_max_exact, -64, 64, 0, 1, 1.0, 0.}, {mkldnn_f32, -int_max_exact, int_max_exact, -32, 32, 0, 1, 1.0, 0.}, @@ -182,6 +190,7 @@ const dt_conf_t *str2cfg(const char *str) { #define CASE(cfg) \ if (!strcasecmp(STRINGIFY(cfg), str)) return CONCAT2(conf_,cfg) CASE(f32); + CASE(f32_no_limits); CASE(f32_full); CASE(f32_wino); CASE(s16s16s32s32); @@ -207,6 +216,7 @@ const dt_conf_t *str2cfg(const char *str) { const char *cfg2str(const dt_conf_t *cfg) { #define CASE(_cfg) if (cfg == CONCAT2(conf_,_cfg)) return STRINGIFY(_cfg) CASE(f32); + CASE(f32_no_limits); CASE(f32_full); CASE(f32_wino); CASE(s16s16s32s32); @@ -229,4 +239,17 @@ const char *cfg2str(const dt_conf_t *cfg) { return NULL; } +const dt_conf_t *auto_cfg(const alg_t alg, const dt_conf_t *cfg) { + const char *cfg_s = cfg2str(cfg); +#define CASE(_cfg_) \ + if (alg == WINO && !strcmp(cfg_s, STRINGIFY(_cfg_))) return CONCAT2(conf_, CONCAT2(_cfg_, _wino)) + CASE(f32); + CASE(u8s8f32s32); + CASE(u8s8s32s32); + CASE(u8s8s8s32); + CASE(u8s8u8s32); +#undef CASE + return cfg; +} + } diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/conv.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/conv.cpp index eb1e4ca..7248c92 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/conv.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/conv.cpp @@ -25,38 +25,37 @@ #include "mkldnn_common.hpp" #include "mkldnn_memory.hpp" - #include "norm.hpp" #include "conv/conv_common.hpp" namespace conv { -inline bool is_conv_3d(const prb_t *p) -{ - return (p->id > 1) ? 1 : 0; +inline bool is_conv_3d(const prb_t *p) { + return p->id > 1; } -inline bool is_conv_1d(const prb_t *p) -{ - return (!is_conv_3d(p) && p->ih == 1 && p->kh == 1 +inline bool is_conv_1d(const prb_t *p) { + return !is_conv_3d(p) && p->ih == 1 && p->kh == 1 && p->cfg[SRC].dt != mkldnn_s8 // temporary workaround until - && p->cfg[SRC].dt != mkldnn_u8) // int8 jit supports 1d - ? 1 : 0; + && p->cfg[SRC].dt != mkldnn_u8; // int8 jit supports 1d } -double get_trust_nz_level(const prb_t *p, data_kind_t kind, bool final_compare) -{ +double get_trust_nz_level(const prb_t *p, data_kind_t kind, + bool final_compare) { if (!final_compare) return p->cfg[kind].f_sparsity; - auto count_relu = [&]() { + auto negative_to_zero = [&]() { + using pk = attr_t::post_ops_t::kind_t; const auto &po = p->attr.post_ops; int count = 0; - for (int i = 0; i < po.len; ++i) - count += po.entry[i].kind == attr_t::post_ops_t::kind_t::RELU; - count = MAX2(count, p->merge == RELU ? 1 : 0); - return count; + for (int i = 0; i < po.len; ++i) { + auto k = po.entry[i].kind; + count += + k == pk::RELU || k == pk::ELU || k == pk::SQRT || k == pk::BRELU; + } + return !!count; }; double trust = 0.3; /* why? */ @@ -73,36 +72,70 @@ double get_trust_nz_level(const prb_t *p, data_kind_t kind, bool final_compare) trust = 0.8 * p->cfg[DST].f_sparsity; /* why? */ break; case DST: - trust /= count_relu() == 0 ? 1 : 2; + trust /= negative_to_zero() == 0 ? 
1 : 2; break; } return trust; } +inline bool post_ops_require_integral_check(const prb_t *p) { + if (p->attr.post_ops.len == 0) return false; + + using pk = attr_t::post_ops_t::kind_t; + const auto &ops = p->attr.post_ops; + + // assumptions: at most 1 eltwise, scale = 1. + for (int idx = 0; idx < ops.len; ++idx) { + const auto &e = ops.entry[idx]; + if (e.kind == pk::SUM || e.kind == pk::ABS) continue; + if (e.kind == pk::RELU && e.eltwise.alpha == 0.f) continue; + return true; + } + + return false; +} + inline double get_eps(const prb_t *p, const data_kind_t kind) { + // Winograd specifics if (p->alg & WINO && p->dir & FLAG_WEI) { /*This is an empirical equation derived by observing growth error with increasing 'k' dimension in gemm of winograd*/ return p->cfg[kind].eps * (MAX2(1, pow(10, 0.4 * log10(0.125 * p->mb * p->oh * p->ow)))); } + + // post-ops specifics + if (post_ops_require_integral_check(p)) + return MAX2(1e-5, p->cfg[kind].eps); + return p->cfg[kind].eps; } inline void get_result(const prb_t *p, const data_kind_t kind, res_t *r, const diff_norm_t diff_norm) { - bool wino_test = (p->alg & WINO) - && (diff_norm.rel_diff(norm_t::L2) <= get_eps(p, kind)); - /* Ignoring elementwise errors for winograd, - since large relative error in few elements(which are anyways close to zero) - results in false positive failures*/ + const float eps = get_eps(p, kind); + + /* Ignoring element-wise errors for Winograd and in some cases of post-ops, + * since large relative error in few elements (which are anyways close + * to zero) results in false positive failures */ + + bool wino_test = (p->alg & WINO) && diff_norm.rel_diff(norm_t::L2) <= eps; if (wino_test) r->errors = 0; - r->state = r->errors ? FAILED : r->state; + + bool post_ops_test = post_ops_require_integral_check(p) + && diff_norm.rel_diff(norm_t::L2) <= eps; + if (post_ops_test) r->errors = 0; + + if (r->errors) r->state = FAILED; } inline int compare_dat(const prb_t *p, data_kind_t kind, dnn_mem_t &mem_dt, dnn_mem_t &mem_fp, res_t *r, bool final_compare = false) { + const bool dont_complain = false + || (p->alg & WINO) + || post_ops_require_integral_check(p); + size_t nelems = mem_dt.nelems(); const char *skind = data_kind2str(kind); @@ -153,7 +186,7 @@ inline int compare_dat(const prb_t *p, data_kind_t kind, dnn_mem_t &mem_dt, } if (!ok) { r->errors++; - if ((!(p->alg & WINO) && r->errors < 10) || verbose >=10) { + if ((!dont_complain && r->errors < 10) || verbose >=10) { int mb_or_g = 0, g_or_oc = 0, c = 0, d = 0, h = 0, w = 0; switch (kind) { case SRC: inv_src_off_f(p, i, mb_or_g, g_or_oc, c, d, h, w); break; @@ -189,14 +222,15 @@ inline int compare_dat(const prb_t *p, data_kind_t kind, dnn_mem_t &mem_dt, } diff_norm.done(); + get_result(p, kind, r, diff_norm); if (final_compare || r->errors) { const int vl = r->errors ? 0 : 2; - print(vl, "@@@ [%s] %sdiff: l0(``%g``) " + print(vl, "@@@ [%s] %sdiff: err:%d, l0(``%g``) " "l1:(%g,%g,%g,``%g``) " "l2:(%g,%g,%g,``%g``) " "l8:(%g,%g,%g,``%g``)\n", - skind, final_compare ? "final: " : "", + skind, final_compare ? 
"final: " : "", (int)r->errors, diff_norm.rel_diff(norm_t::L0), diff_norm.a_[norm_t::L1], diff_norm.b_[norm_t::L1], diff_norm.diff_[norm_t::L1], diff_norm.rel_diff(norm_t::L1), @@ -236,8 +270,6 @@ inline int compare_dat(const prb_t *p, data_kind_t kind, dnn_mem_t &mem_dt, non_zero, (unsigned long)r->total); } - get_result(p, kind, r, diff_norm); - if (final_compare && r->state == UNTESTED) r->state = PASSED; /* optimism */ @@ -298,7 +330,7 @@ int fill_wei(const prb_t *p, dnn_mem_t &mem_dt, dnn_mem_t &mem_fp, dnn_mem_t *p_mem_00 = check_reorder ? new dnn_mem_t(mem_dt.md_, mkldnn_f32, - get_default_format(mem_dt.md_.ndims, GWEI)) + get_default_format(mem_dt.md_.ndims, p->has_groups ? GWEI : WEI)) : &mem_fp; dnn_mem_t &mem_00 = *p_mem_00; @@ -394,47 +426,59 @@ inline int init_pd(const prb_t *p, mkldnn_convolution_desc_t &cd, mkldnn_memory_desc_t src_d, wei_d, bia_d, dst_d; int ndims = is_conv_3d(p) ? 5 : is_conv_1d(p) ? 3 : 4; - mkldnn_dims_t src_dims = {p->mb, p->ic, p->ih, p->iw}; mkldnn_dims_t src_1d_dims = {p->mb, p->ic, p->iw}; + mkldnn_dims_t src_2d_dims = {p->mb, p->ic, p->ih, p->iw}; mkldnn_dims_t src_3d_dims = {p->mb, p->ic, p->id, p->ih, p->iw}; - mkldnn_dims_t wei_dims = {p->g, p->oc / p->g, p->ic / p->g, p->kh, p->kw}; + mkldnn_dims_t wei_1d_dims = {p->g, p->oc / p->g, p->ic / p->g, p->kw}; + mkldnn_dims_t wei_2d_dims = {p->g, p->oc / p->g, p->ic / p->g, p->kh, p->kw}; mkldnn_dims_t wei_3d_dims = {p->g, p->oc / p->g, p->ic / p->g, p->kd, p->kh, p->kw}; + mkldnn_dims_t bia_dims = {p->oc}; - mkldnn_dims_t dst_dims = {p->mb, p->oc, p->oh, p->ow}; + mkldnn_dims_t dst_1d_dims = {p->mb, p->oc, p->ow}; + mkldnn_dims_t dst_2d_dims = {p->mb, p->oc, p->oh, p->ow}; mkldnn_dims_t dst_3d_dims = {p->mb, p->oc, p->od, p->oh, p->ow}; DNN_SAFE(mkldnn_memory_desc_init(&src_d, ndims, - is_conv_3d(p) ? src_3d_dims : is_conv_1d(p) ? src_1d_dims : src_dims, + is_conv_3d(p) ? src_3d_dims : is_conv_1d(p) ? src_1d_dims : src_2d_dims, p->cfg[SRC].dt, mkldnn_any), WARN); - DNN_SAFE(mkldnn_memory_desc_init(&wei_d, ndims + 1, - is_conv_3d(p) ? wei_3d_dims : is_conv_1d(p) ? wei_1d_dims : wei_dims, + + DNN_SAFE(mkldnn_memory_desc_init(&wei_d, ndims + p->has_groups, + is_conv_3d(p) + ? &wei_3d_dims[!p->has_groups] + : is_conv_1d(p) + ? &wei_1d_dims[!p->has_groups] + : &wei_2d_dims[!p->has_groups], p->cfg[WEI].dt, mkldnn_any), WARN); + DNN_SAFE(mkldnn_memory_desc_init(&bia_d, 1, bia_dims, p->cfg[BIA].dt, mkldnn_any), WARN); + DNN_SAFE(mkldnn_memory_desc_init(&dst_d, ndims, - is_conv_3d(p) ? dst_3d_dims : is_conv_1d(p) ? dst_1d_dims : dst_dims, + is_conv_3d(p) ? dst_3d_dims : is_conv_1d(p) ? 
dst_1d_dims : dst_2d_dims, p->cfg[DST].dt, mkldnn_any), WARN); - int strides_nd[] = {p->sd, p->sh, p->sw}; - int dilates_nd[] = {p->dd, p->dh, p->dw}; - int padding_nd[] = {p->pd, p->ph, p->pw}; + + ptrdiff_t strides_nd[] = {p->sd, p->sh, p->sw}; + ptrdiff_t dilates_nd[] = {p->dd, p->dh, p->dw}; + ptrdiff_t padding_nd[] = {p->pd, p->ph, p->pw}; auto bph = [&](int ih, int oh, int kh, int sh, int ph, int dh) { return (oh - 1) * sh - ih + ((kh - 1) * (dh + 1) + 1) - ph; }; - int padding_r_nd[] = { + ptrdiff_t padding_r_nd[] = { bph(p->id, p->od, p->kd, p->sd, p->pd, p->dd), bph(p->ih, p->oh, p->kh, p->sh, p->ph, p->dh), bph(p->iw, p->ow, p->kw, p->sw, p->pw, p->dw)}; - int *strides = strides_nd + (5 - ndims); - int *dilates = dilates_nd + (5 - ndims); - int *padding = padding_nd + (5 - ndims); - int *padding_r = padding_r_nd + (5 - ndims); + ptrdiff_t *strides = strides_nd + (5 - ndims); + ptrdiff_t *dilates = dilates_nd + (5 - ndims); + ptrdiff_t *padding = padding_nd + (5 - ndims); + ptrdiff_t *padding_r = padding_r_nd + (5 - ndims); mkldnn_alg_kind_t alg = mkldnn_convolution_direct; if (p->alg == WINO) alg = mkldnn_convolution_winograd; + if (p->alg == AUTO) alg = mkldnn_convolution_auto; switch (p->dir) { case FWD_D: case FWD_B: case FWD_I: @@ -467,15 +511,8 @@ inline int init_pd(const prb_t *p, mkldnn_convolution_desc_t &cd, auto mkldnn_attr = create_mkldnn_attr(p->attr, p->oc, p->scales); mkldnn_status_t init_status = mkldnn_success; - if (p->merge == RELU) { - mkldnn_convolution_relu_desc_t crd; - DNN_SAFE(mkldnn_convolution_relu_desc_init(&crd, &cd, 0), WARN); - init_status = mkldnn_primitive_desc_create_v2(&cpd, &crd, mkldnn_attr, + init_status = mkldnn_primitive_desc_create_v2(&cpd, &cd, mkldnn_attr, engine, NULL); - } else { - init_status = mkldnn_primitive_desc_create_v2(&cpd, &cd, mkldnn_attr, - engine, NULL); - } mkldnn_primitive_attr_destroy(mkldnn_attr); @@ -498,6 +535,13 @@ inline int init_pd(const prb_t *p, mkldnn_convolution_desc_t &cd, mkldnn_primitive_desc_query_pd(cpd, query, index)); }; + if (p->alg == AUTO) { + mkldnn_convolution_desc_t *temp_conv_desc = {0}; + DNN_SAFE(mkldnn_primitive_desc_query(cpd, + mkldnn_query_convolution_d, 0, &temp_conv_desc), CRIT); + cd.alg_kind = temp_conv_desc->alg_kind; + } + if (p->dir == BWD_D) cd.diff_src_desc = q(mkldnn_query_diff_src_pd); else @@ -532,6 +576,17 @@ int doit(const prb_t *p, res_t *r) { mkldnn_primitive_t c{}; SAFE(init_pd(p, cd, cpd, r), WARN); + + prb_t *p_temp = nullptr; + if (p->alg == AUTO || p->alg == WINO) { + p_temp = new prb_t((desc_t)*p, p->dir, p->cfg, + p->alg, p->attr, p->mb); + if (p->alg == AUTO) p_temp->alg = alg_kind2alg(cd.alg_kind); + p_temp->cfg = auto_cfg(p_temp->alg, p->cfg); + p = p_temp; + } + + if (r->state == SKIPPED || r->state == UNIMPLEMENTED) return OK; @@ -548,7 +603,8 @@ int doit(const prb_t *p, res_t *r) { dnn_mem_t &bia_dt = *p_bia_dt; auto src_format = get_default_format(src_dt.md_.ndims, DATA); - auto wei_format = get_default_format(wei_dt.md_.ndims, GWEI); + auto wei_format = get_default_format(wei_dt.md_.ndims, + p->has_groups ? 
GWEI : WEI); const auto fp = mkldnn_f32; dnn_mem_t src_fp(src_dt_d, fp, src_format); @@ -574,7 +630,6 @@ int doit(const prb_t *p, res_t *r) { if (bench_mode & CORR) { compute_ref_fwd(p, src_fp, wei_fp, bia_fp, dst_fp); dnn_mem_t dst(dst_dt, fp, src_format); - SAFE(dst.reorder(dst_dt), WARN); SAFE(compare_dst(p, dst, dst_fp, r, true), WARN); } } else if (p->dir == BWD_D) { @@ -585,7 +640,6 @@ int doit(const prb_t *p, res_t *r) { if (bench_mode & CORR) { compute_ref_bwd_d(p, src_fp, wei_fp, bia_fp, dst_fp); dnn_mem_t src(src_dt, fp, src_format); - SAFE(src.reorder(src_dt), WARN); SAFE(compare_src(p, src, src_fp, r, true), WARN); } } else if (p->dir & FLAG_BWD && p->dir & FLAG_WEI) { @@ -598,11 +652,9 @@ int doit(const prb_t *p, res_t *r) { if (bench_mode & CORR) { compute_ref_bwd_w(p, src_fp, wei_fp, bia_fp, dst_fp); dnn_mem_t wei(wei_dt, fp, wei_format); - SAFE(wei.reorder(wei_dt), WARN); SAFE(compare_wei(p, wei, wei_fp, r, true), WARN); if (p->dir & FLAG_BIA) { dnn_mem_t bia(bia_dt, fp, mkldnn_x); - SAFE(bia.reorder(bia_dt), WARN); SAFE(compare_bia(p, bia, bia_fp, r, true), WARN); } } @@ -632,6 +684,7 @@ int doit(const prb_t *p, res_t *r) { delete p_bia_dt; delete p_bia_fp; + delete p_temp; return OK; } diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/conv_aux.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/conv_aux.cpp index 8301e87..44a504d 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/conv_aux.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/conv_aux.cpp @@ -30,6 +30,7 @@ namespace conv { alg_t str2alg(const char *str) { #define CASE(_alg) if (!strcasecmp(STRINGIFY(_alg), str)) return _alg + CASE(AUTO); CASE(DIRECT); CASE(WINO); #undef CASE @@ -38,26 +39,19 @@ alg_t str2alg(const char *str) { } const char *alg2str(alg_t alg) { + if (alg == AUTO) return "auto"; if (alg == DIRECT) return "direct"; if (alg == WINO) return "wino"; assert(!"unknown algorithm"); return "unknown algorithm"; } -merge_t str2merge(const char *str) { -#define CASE(_mrg) if (!strcasecmp(STRINGIFY(_mrg), str)) return _mrg - CASE(NONE); - CASE(RELU); -#undef CASE - assert(!"unknown merge"); - return NONE; -} - -const char *merge2str(merge_t merge) { - if (merge == NONE) return "none"; - if (merge == RELU) return "relu"; - assert(!"unknown merge"); - return "unknown merge"; +alg_t alg_kind2alg(mkldnn_alg_kind_t alg) { + if (alg == mkldnn_convolution_auto) return AUTO; + if (alg == mkldnn_convolution_direct) return DIRECT; + if (alg == mkldnn_convolution_winograd) return WINO; + assert(!"unknown algorithm"); + return DIRECT; } int str2desc(desc_t *desc, const char *str, bool is_deconv) { @@ -78,7 +72,9 @@ int str2desc(desc_t *desc, const char *str, bool is_deconv) { * - if padding is undefined => compute trivial padding */ - d.g = 1; d.mb = 2; d.sd = d.sh = d.sw = 1; d.dd = d.dh = d.dw = 0; d.name = "\"wip\""; + d.g = 1; d.mb = 2; d.sd = d.sh = d.sw = 1; d.dd = d.dh = d.dw = 0; + d.has_groups = false, d.name = "\"wip\""; + d.pw = -1; d.ph = -1; d.pd = -1; const char *s = str; assert(s); @@ -87,6 +83,7 @@ int str2desc(desc_t *desc, const char *str, bool is_deconv) { if (!strncmp(p, s, strlen(p))) { \ ok = 1; s += strlen(p); \ char *end_s; d. c = strtol(s, &end_s, 10); s += (end_s - s); \ + if (!strncmp(p, "g", 1)) d.has_groups = true; \ /* printf("@@@debug: %s: %d\n", p, d. 
c); */ \ } \ } while (0) @@ -123,34 +120,35 @@ int str2desc(desc_t *desc, const char *str, bool is_deconv) { return ((o - 1) * s - i + ((k - 1) * (d + 1) + 1)) / 2; }; - const bool no_d = (d.id | d.kd | d.od | d.pd | d.dd) == 0 && d.sd == 1; - const bool no_h = (d.ih | d.kh | d.oh | d.ph | d.dh) == 0 && d.sh == 1; - const bool no_w = (d.iw | d.kw | d.ow | d.pw | d.dw) == 0 && d.sw == 1; - + const bool no_d = (d.id | d.kd | d.od | d.dd) == 0 && d.sd == 1 && d.pd < 1; + const bool no_h = (d.ih | d.kh | d.oh | d.dh) == 0 && d.sh == 1 && d.ph < 1; + const bool no_w = (d.iw | d.kw | d.ow | d.dw) == 0 && d.sw == 1 && d.pw < 1; if (!no_h) { if (!d.ih || !d.kh) return FAIL; - - if (!d.oh) d.oh = compute_out(is_deconv, d.ih, d.kh, d.sh, d.ph, d.dh); - else if (!d.ph && d.oh != compute_out(is_deconv, d.ih, d.kh, d.sh, d.ph, d.dh)) + if (!d.oh) { + d.ph = 0; + d.oh = compute_out(is_deconv, d.ih, d.kh, d.sh, d.ph, d.dh); + } else if (d.ph < 0) d.ph = compute_pad(is_deconv, d.oh, d.ih, d.kh, d.sh, d.dh); } if (!no_w) { if (!d.iw || !d.kw) return FAIL; - - if (!d.ow) d.ow = compute_out(is_deconv, d.iw, d.kw, d.sw, d.pw, d.dw); - else if (!d.pw && d.ow != compute_out(is_deconv, d.iw, d.kw, d.sw, d.pw, d.dw)) + if (!d.ow) { + d.pw = 0; + d.ow = compute_out(is_deconv, d.iw, d.kw, d.sw, d.pw, d.dw); + } else if (d.pw < 0) d.pw = compute_pad(is_deconv, d.ow, d.iw, d.kw, d.sw, d.dw); } if (!no_d && d.id) { if (!d.id || !d.kd) return FAIL; - - if (!d.od) d.od = compute_out(is_deconv, d.id, d.kd, d.sd, d.pd, d.dd); - else if (!d.pd && d.od != compute_out(is_deconv, d.id, d.kd, d.sd, d.pd, d.dd)) + if (!d.od) { + d.pd = 0; + d.od = compute_out(is_deconv, d.id, d.kd, d.sd, d.pd, d.dd); + } else if (d.pd < 0) d.pd = compute_pad(is_deconv, d.od, d.id, d.kd, d.sd, d.dd); } - if (no_w && no_h && d.id) { d.iw = d.ih = d.id; d.kw = d.kh = d.kd; @@ -187,7 +185,7 @@ void desc2str(const desc_t *d, char *buffer, bool canonical) { buffer += l; rem_len -= l; \ } while(0) - if (canonical || d->g != 1) DPRINT("g%d", d->g); + if (canonical || d->has_groups) DPRINT("g%d", d->g); if (canonical || d->mb != 2) DPRINT("mb%d", d->mb); const bool half_form = (d->ih == d->iw && d->kh == d->kw && d->oh == d->ow @@ -230,19 +228,25 @@ void desc2str(const desc_t *d, char *buffer, bool canonical) { void prb_t::count_ops() { if (ops > 0) return; + int od_t = is_deconv ? this->id : this->od; + int oh_t = is_deconv ? this->ih : this->oh; + int ow_t = is_deconv ? this->iw : this->ow; + int id_t = is_deconv ? this->od : this->id; + int ih_t = is_deconv ? this->oh : this->ih; + int iw_t = is_deconv ? 
this->ow : this->iw; double sp_ops = 0; - for (int od = 0; od < this->od; ++od) { - for (int oh = 0; oh < this->oh; ++oh) { - for (int ow = 0; ow < this->ow; ++ow) { + for (int od = 0; od < od_t; ++od) { + for (int oh = 0; oh < oh_t; ++oh) { + for (int ow = 0; ow < ow_t; ++ow) { for (int kd = 0; kd < this->kd; ++kd) { const int id = od * this->sd - this->pd + kd * (this->dd + 1); - if (id < 0 || id >= this->id) continue; + if (id < 0 || id >= id_t) continue; for (int kh = 0; kh < this->kh; ++kh) { const int ih = oh * this->sh - this->ph + kh * (this->dh + 1); - if (ih < 0 || ih >= this->ih) continue; + if (ih < 0 || ih >= ih_t) continue; for (int kw = 0; kw < this->kw; ++kw) { const int iw = ow * this->sw - this->pw + kw * (this->dw + 1); - if (iw < 0 || iw >= this->iw) continue; + if (iw < 0 || iw >= iw_t) continue; sp_ops += 1; } } @@ -278,13 +282,11 @@ void prb_t::generate_oscales() { void prb2str(const prb_t *p, char *buffer, bool canonical) { char desc_buf[max_desc_len], attr_buf[max_attr_len]; - char dir_str[32] = {0}, cfg_str[32] = {0}, alg_str[32] = {0}, - merge_str[32] = {0}; + char dir_str[32] = {0}, cfg_str[32] = {0}, alg_str[32] = {0}; desc2str(p, desc_buf, canonical); snprintf(dir_str, sizeof(dir_str), "--dir=%s ", dir2str(p->dir)); snprintf(cfg_str, sizeof(cfg_str), "--cfg=%s ", cfg2str(p->cfg)); snprintf(alg_str, sizeof(alg_str), "--alg=%s ", alg2str(p->alg)); - snprintf(merge_str, sizeof(merge_str), "--merge=%s ", merge2str(p->merge)); bool is_attr_def = p->attr.is_def(); if (!is_attr_def) { int len = snprintf(attr_buf, max_attr_len, "--attr=\""); @@ -293,11 +295,10 @@ void prb2str(const prb_t *p, char *buffer, bool canonical) { len = (int)strnlen(attr_buf, max_attr_len); snprintf(attr_buf + len, max_attr_len - len, "\" "); } - snprintf(buffer, max_prb_len, "%s%s%s%s%s%s", + snprintf(buffer, max_prb_len, "%s%s%s%s%s", p->dir == FWD_B ? "" : dir_str, p->cfg == conf_f32 ? "" : cfg_str, p->alg == DIRECT ? "" : alg_str, - p->merge == NONE ? "" : merge_str, is_attr_def ? 
"" : attr_buf, desc_buf); } diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/conv_common.hpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/conv_common.hpp index d3969ec..624338e 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/conv_common.hpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/conv_common.hpp @@ -26,15 +26,19 @@ #include "mkldnn_common.hpp" #include "mkldnn_memory.hpp" +namespace deconv { +/* some extra control parameters which shouldn't be placed in prb_t */ +extern const char *skip_impl; /* NULL or "" means do not skip anything */ +extern bool allow_unimpl; /* true means do not treat unimplemented as error */ +extern const char *perf_template; /* performance output template */ +} + namespace conv { -enum alg_t { DIRECT, WINO }; +enum alg_t { DIRECT, WINO, AUTO }; alg_t str2alg(const char *str); const char *alg2str(alg_t alg); - -enum merge_t { NONE, RELU, }; -merge_t str2merge(const char *str); -const char *merge2str(merge_t merge); +alg_t alg_kind2alg(mkldnn_alg_kind_t alg); struct desc_t { int g, mb; @@ -44,6 +48,7 @@ struct desc_t { int sd, sh, sw; int pd, ph, pw; int dd, dh, dw; + bool has_groups; const char *name; }; @@ -95,12 +100,13 @@ extern const _dt_conf_t conf_u8s8u8s32_wino; const dt_conf_t *str2cfg(const char *str); const char *cfg2str(const dt_conf_t *cfg); +const dt_conf_t *auto_cfg(const alg_t alg, const dt_conf_t *cfg); struct prb_t: public desc_t { prb_t(const desc_t &desc, dir_t dir, const dt_conf_t *cfg, alg_t alg, - merge_t merge, const attr_t &attr, int mb = 0) - : desc_t(desc), dir(dir), cfg(cfg), alg(alg), merge(merge), attr(attr) - , ops(0), scales(NULL) { + const attr_t &attr, int mb = 0, bool is_deconv = false) + : desc_t(desc), dir(dir), cfg(cfg), alg(alg), attr(attr) + , ops(0), scales(NULL), is_deconv(is_deconv) { if (mb) this->mb = mb; count_ops(); generate_oscales(); @@ -110,11 +116,11 @@ struct prb_t: public desc_t { dir_t dir; const dt_conf_t *cfg; alg_t alg; - merge_t merge; attr_t attr; double ops; float *scales; + bool is_deconv; void count_ops(); void generate_oscales(); diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/deconv.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/deconv.cpp index ec0e0d0..034acfe 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/deconv.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/deconv.cpp @@ -33,15 +33,15 @@ using namespace conv; namespace deconv { -inline static void swap(int &a, int &b) -{ - int temp = a; +template +inline static void swap(T &a, T &b) { + T temp = a; a = b; b = temp; } -inline bool is_deconv_3d(const prb_t *p) -{ - return (p->id > 1 || p->od > 1) ? 1 : 0; + +inline bool is_deconv_3d(const prb_t *p) { + return p->id > 1; } inline int transpose_data_wei(const prb_t *p, dnn_mem_t &wei, dnn_mem_t &wei_tr) { @@ -61,43 +61,42 @@ inline int init_pd(const prb_t *p, mkldnn_deconvolution_desc_t &cd, int ndims = is_deconv_3d(p) ? 
5 : 4; mkldnn_memory_desc_t src_d, wei_d, bia_d, dst_d; - mkldnn_dims_t src_dims = {p->mb, p->ic, p->ih, p->iw}; + mkldnn_dims_t src_2d_dims = {p->mb, p->ic, p->ih, p->iw}; mkldnn_dims_t src_3d_dims = {p->mb, p->ic, p->id, p->ih, p->iw}; - mkldnn_dims_t wei_dims = {p->g, p->oc / p->g, p->ic / p->g, p->kh, p->kw}; + mkldnn_dims_t wei_2d_dims = {p->g, p->oc / p->g, p->ic / p->g, p->kh, p->kw}; mkldnn_dims_t wei_3d_dims = {p->g, p->oc / p->g, p->ic / p->g, p->kd, p->kh, p->kw}; mkldnn_dims_t bia_dims = {p->oc}; - mkldnn_dims_t dst_dims = {p->mb, p->oc, p->oh, p->ow}; + mkldnn_dims_t dst_2d_dims = {p->mb, p->oc, p->oh, p->ow}; mkldnn_dims_t dst_3d_dims = {p->mb, p->oc, p->od, p->oh, p->ow}; - DNN_SAFE(mkldnn_memory_desc_init(&src_d, ndims, - is_deconv_3d(p) ? src_3d_dims : src_dims, p->cfg[SRC].dt, mkldnn_any), WARN); - DNN_SAFE(mkldnn_memory_desc_init(&wei_d, ndims + 1, - is_deconv_3d(p) ? wei_3d_dims : wei_dims, p->cfg[WEI].dt, mkldnn_any), WARN); + is_deconv_3d(p) ? src_3d_dims : src_2d_dims, p->cfg[SRC].dt, mkldnn_any), WARN); + DNN_SAFE(mkldnn_memory_desc_init(&wei_d, ndims + p->has_groups, + is_deconv_3d(p) + ? &wei_3d_dims[!p->has_groups] + : &wei_2d_dims[!p->has_groups], + p->cfg[WEI].dt, mkldnn_any), WARN); DNN_SAFE(mkldnn_memory_desc_init(&bia_d, 1, bia_dims, p->cfg[BIA].dt, mkldnn_any), WARN); DNN_SAFE(mkldnn_memory_desc_init(&dst_d, ndims, - is_deconv_3d(p) ? dst_3d_dims : dst_dims, p->cfg[DST].dt, mkldnn_any), WARN); - int strides_2d[] = {p->sh, p->sw}; - int dilates_2d[] = {p->dh, p->dw}; - int padding_2d[] = {p->ph, p->pw}; - int strides_3d[] = {p->sd, p->sh, p->sw}; - int dilates_3d[] = {p->dd, p->dh, p->dw}; - int padding_3d[] = {p->pd, p->ph, p->pw}; + is_deconv_3d(p) ? dst_3d_dims : dst_2d_dims, p->cfg[DST].dt, mkldnn_any), WARN); + + ptrdiff_t strides_nd[] = {p->sd, p->sh, p->sw}; + ptrdiff_t dilates_nd[] = {p->dd, p->dh, p->dw}; + ptrdiff_t padding_nd[] = {p->pd, p->ph, p->pw}; auto bph = [&](int ih, int oh, int kh, int sh, int ph, int dh) { return (oh - 1) * sh - ih + ((kh - 1) * (dh + 1) + 1) - ph; }; - int padding_r_3d[] = { + + ptrdiff_t padding_r_nd[] = { bph(p->od, p->id, p->kd, p->sd, p->pd, p->dd), bph(p->oh, p->ih, p->kh, p->sh, p->ph, p->dh), bph(p->ow, p->iw, p->kw, p->sw, p->pw, p->dw)}; - int padding_r_2d[] = { - bph(p->oh, p->ih, p->kh, p->sh, p->ph, p->dh), - bph(p->ow, p->iw, p->kw, p->sw, p->pw, p->dw)}; - int *strides = is_deconv_3d(p) ? strides_3d : strides_2d; - int *dilates = is_deconv_3d(p) ? dilates_3d : dilates_2d; - int *padding = is_deconv_3d(p) ? padding_3d : padding_2d; - int *padding_r = is_deconv_3d(p) ? padding_r_3d : padding_r_2d; + ptrdiff_t *strides = strides_nd + (5 - ndims); + ptrdiff_t *dilates = dilates_nd + (5 - ndims); + ptrdiff_t *padding = padding_nd + (5 - ndims); + ptrdiff_t *padding_r = padding_r_nd + (5 - ndims); + mkldnn_alg_kind_t alg = mkldnn_deconvolution_direct; if (p->alg == WINO) alg = mkldnn_deconvolution_winograd; @@ -182,7 +181,7 @@ int doit(const prb_t *p, res_t *r) { *r = res_zero; bool with_groups = 1; - prb_t p_tr((desc_t)*p, p->dir, p->cfg, p->alg, p->merge, p->attr, p->mb); + prb_t p_tr((desc_t)*p, p->dir, p->cfg, p->alg, p->attr, p->mb, true); swap(p_tr.ic, p_tr.oc); swap(p_tr.ih, p_tr.oh); swap(p_tr.id, p_tr.od); @@ -210,8 +209,9 @@ int doit(const prb_t *p, res_t *r) { ? new dnn_mem_t(bia_dt_d, p->cfg[BIA].dt) : new dnn_mem_t(); dnn_mem_t &bia_dt = *p_bia_dt; - auto src_format = is_deconv_3d(p) ? mkldnn_ncdhw : mkldnn_nchw; - auto wei_format = is_deconv_3d(p) ? 
mkldnn_goidhw : mkldnn_goihw; + auto src_format = get_default_format(src_dt.md_.ndims, DATA); + auto wei_format = get_default_format(wei_dt.md_.ndims, + p->has_groups ? GWEI : WEI); const auto fp = mkldnn_f32; @@ -243,7 +243,6 @@ int doit(const prb_t *p, res_t *r) { if (bench_mode & CORR) { compute_ref_bwd_d(&p_tr, dst_fp, wei_tr_fp, bia_fp, src_fp); dnn_mem_t dst(dst_dt, fp, src_format); - SAFE(dst.reorder(dst_dt), WARN); SAFE(compare_dst(p, dst, dst_fp, r, true), WARN); } } else if (p->dir == BWD_D) { @@ -254,7 +253,6 @@ int doit(const prb_t *p, res_t *r) { if (bench_mode & CORR) { compute_ref_fwd(&p_tr, dst_fp, wei_tr_fp, zero_fp, src_fp); dnn_mem_t src(src_dt, fp, src_format); - SAFE(src.reorder(src_dt), WARN); SAFE(compare_src(p, src, src_fp, r, true), WARN); } } else if (p->dir & FLAG_BWD && p->dir & FLAG_WEI) { @@ -268,12 +266,10 @@ int doit(const prb_t *p, res_t *r) { compute_ref_bwd_weights(&p_tr, dst_fp, wei_tr_fp, src_fp); transpose_data_wei(&p_tr, wei_tr_fp, wei_fp); dnn_mem_t wei(wei_dt, fp, wei_format); - SAFE(wei.reorder(wei_dt), WARN); SAFE(compare_wei(&p_tr, wei, wei_fp, r, true), WARN); if (p->dir & FLAG_BIA) { compute_ref_bwd_bias(p, bia_fp, dst_fp); dnn_mem_t bia(bia_dt, fp, mkldnn_x); - SAFE(bia.reorder(bia_dt), WARN); SAFE(compare_bia(p, bia, bia_fp, r, true), WARN); } } diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/ref_conv.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/ref_conv.cpp index a471d21..60b7912 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/ref_conv.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/ref_conv.cpp @@ -15,6 +15,7 @@ *******************************************************************************/ #include "src/common/mkldnn_thread.hpp" +#include "src/common/math_utils.hpp" #include "conv/conv_common.hpp" @@ -85,17 +86,29 @@ void compute_ref_direct_fwd(const prb_t *p, dnn_mem_t &src_m, }; auto maybe_post_ops = [&](float &conv_res, float dst) { + using namespace mkldnn::impl::math; + const auto &ops = p->attr.post_ops; for (int idx = 0; idx < ops.len; ++idx) { using pk = attr_t::post_ops_t::kind_t; const auto &e = ops.entry[idx]; + + const auto &s = e.eltwise.scale; + const auto &a = e.eltwise.alpha; + const auto &b = e.eltwise.beta; + switch (e.kind) { - case pk::SUM: - conv_res += e.sum.scale * dst; - break; - case pk::RELU: - conv_res = e.eltwise.scale * (conv_res < 0 ? 
0 : conv_res);
-                break;
+            case pk::SUM: conv_res += e.sum.scale * dst; break;
+            case pk::RELU: conv_res = s*relu_fwd(conv_res, a); break;
+            case pk::TANH: conv_res = s*tanh_fwd(conv_res); break;
+            case pk::ELU: conv_res = s*elu_fwd(conv_res, a); break;
+            case pk::SQUARE: conv_res = s*square_fwd(conv_res); break;
+            case pk::ABS: conv_res = s*abs_fwd(conv_res); break;
+            case pk::SQRT: conv_res = s*sqrt_fwd(conv_res); break;
+            case pk::LINEAR: conv_res = s*linear_fwd(conv_res, a, b); break;
+            case pk::BRELU: conv_res = s*bounded_relu_fwd(conv_res, a); break;
+            case pk::SRELU: conv_res = s*soft_relu_fwd(conv_res); break;
+            case pk::LOGISTIC: conv_res = s*logistic_fwd(conv_res); break;
             default:
                 assert(!"unknown attr::post_ops::kind");
             }
@@ -115,9 +128,6 @@ void compute_ref_direct_fwd(const prb_t *p, dnn_mem_t &src_m,
             conv_res += ((float*)bia_m)[bia_off];
         }

-        if (p->merge == RELU && conv_res < 0)
-            conv_res = 0;
-
         maybe_scale(conv_res, g * p->oc / p->g + oc);
         maybe_post_ops(conv_res, dst);

@@ -211,21 +221,55 @@ void compute_ref_direct_bwd_d(const prb_t *p, dnn_mem_t &diff_src_m,
         }
     };

+    /* Used for Deconv FWD */
+    auto maybe_post_ops = [&](float &conv_res, float dst) {
+        using namespace mkldnn::impl::math;
+
+        const auto &ops = p->attr.post_ops;
+        for (int idx = 0; idx < ops.len; ++idx) {
+            using pk = attr_t::post_ops_t::kind_t;
+            const auto &e = ops.entry[idx];
+
+            const auto &s = e.eltwise.scale;
+            const auto &a = e.eltwise.alpha;
+            const auto &b = e.eltwise.beta;
+
+            switch (e.kind) {
+            case pk::SUM: conv_res += e.sum.scale * dst; break;
+            case pk::RELU: conv_res = s*relu_fwd(conv_res, a); break;
+            case pk::TANH: conv_res = s*tanh_fwd(conv_res); break;
+            case pk::ELU: conv_res = s*elu_fwd(conv_res, a); break;
+            case pk::SQUARE: conv_res = s*square_fwd(conv_res); break;
+            case pk::ABS: conv_res = s*abs_fwd(conv_res); break;
+            case pk::SQRT: conv_res = s*sqrt_fwd(conv_res); break;
+            case pk::LINEAR: conv_res = s*linear_fwd(conv_res, a, b); break;
+            case pk::BRELU: conv_res = s*bounded_relu_fwd(conv_res, a); break;
+            case pk::SRELU: conv_res = s*soft_relu_fwd(conv_res); break;
+            case pk::LOGISTIC: conv_res = s*logistic_fwd(conv_res); break;
+            default:
+                assert(!"unknown attr::post_ops::kind");
+            }
+        }
+    };
+
     mkldnn::impl::parallel_nd(p->g, p->mb, p->ic / p->g, p->id, p->ih, p->iw,
         [&](int g, int mb, int ic, int id, int ih, int iw) {
             size_t src_off = src_off_f(p, mb, g, ic, id, ih, iw);
             float &ds = ((float*)diff_src_m)[src_off];
-            ds = 0;
+            float conv_res = 0;
             if (fast)
-                ker_fast(ds, g, mb, ic, id, ih, iw);
+                ker_fast(conv_res, g, mb, ic, id, ih, iw);
             else
-                ker(ds, g, mb, ic, id, ih, iw);
+                ker(conv_res, g, mb, ic, id, ih, iw);

             if (p->dir & FLAG_BIA) {
                 const size_t bia_off = (size_t)g * p->ic / p->g + ic;
-                ds += ((float*)bia_m)[bia_off];
+                conv_res += ((float*)bia_m)[bia_off];
             }
-            maybe_scale(ds, g * p->ic / p->g + ic);
+
+            maybe_scale(conv_res, g * p->ic / p->g + ic);
+            maybe_post_ops(conv_res, ds);
+
+            ds = conv_res;
         }
     );
 }
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/ref_wino.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/ref_wino.cpp
index a5c56a3..ac31f1f 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/ref_wino.cpp
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/ref_wino.cpp
@@ -422,10 +422,6 @@ void compute_wino_ref_fwd(const prb_t *p, dnn_mem_t &src_m, dnn_mem_t &wei_m,
             ((float *)bia_m)[bia_off] : 0.f;

-        if (p->merge == RELU && conv_res < 0) {
-            conv_res = 0.f;
-        }
-
         const auto &ops = p->attr.post_ops;
         for (int idx = 0; idx < ops.len; ++idx) {
             using pk = attr_t::post_ops_t::kind_t;
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/dnn_types.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/dnn_types.cpp
index 2bb3429..ca200ae 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/dnn_types.cpp
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/dnn_types.cpp
@@ -102,6 +102,7 @@ data_kind_t fmt2data_kind(mkldnn_memory_format_t fmt) {
     case mkldnn_gOIw8i16o2i:
     case mkldnn_goihw:
     case mkldnn_hwigo:
+    case mkldnn_giohw:
     case mkldnn_hwigo_s8s8:
     case mkldnn_gOIhw8i8o:
     case mkldnn_gOIhw16i16o:
@@ -119,6 +120,7 @@ data_kind_t fmt2data_kind(mkldnn_memory_format_t fmt) {
     case mkldnn_gOhwi16o:
     case mkldnn_Goihw8g:
     case mkldnn_Goihw16g:
+    case mkldnn_Goihw16g_s8s8:
     case mkldnn_gOhIw16o4i:
     case mkldnn_goidhw:
     case mkldnn_gOIdhw16i16o:
@@ -192,18 +194,56 @@ attr_t::post_ops_t::kind_t attr_t::post_ops_t::str2kind(const char *str) {
 #define CASE(_knd) if (!strcasecmp(STRINGIFY(_knd), str)) return _knd
     CASE(SUM);
     CASE(RELU);
+    CASE(TANH);
+    CASE(ELU);
+    CASE(SQUARE);
+    CASE(ABS);
+    CASE(SQRT);
+    CASE(LINEAR);
+    CASE(BRELU);
+    CASE(SRELU);
+    CASE(LOGISTIC);
 #undef CASE
     assert(!"unknown attr::post_ops::kind");
     return KIND_TOTAL;
 }

 const char *attr_t::post_ops_t::kind2str(attr_t::post_ops_t::kind_t kind) {
-    if (kind == SUM) return "sum";
-    if (kind == RELU) return "relu";
+#define CASE(_knd, str) if (kind == _knd) return str
+    CASE(SUM, "sum");
+    CASE(RELU, "relu");
+    CASE(TANH, "tanh");
+    CASE(ELU, "elu");
+    CASE(SQUARE, "square");
+    CASE(ABS, "abs");
+    CASE(SQRT, "sqrt");
+    CASE(LINEAR, "linear");
+    CASE(BRELU, "brelu");
+    CASE(SRELU, "srelu");
+    CASE(LOGISTIC, "logistic");
+#undef CASE
     assert(!"unknown attr::post_ops::kind");
     return "unknown attr::post_ops::kind";
 }

+mkldnn_alg_kind_t attr_t::post_ops_t::kind2mkldnn_kind(
+        attr_t::post_ops_t::kind_t kind) {
+#define CASE(_knd, _mknd) if (kind == _knd) return _mknd
+    CASE(RELU, mkldnn_eltwise_relu);
+    CASE(TANH, mkldnn_eltwise_tanh);
+    CASE(ELU, mkldnn_eltwise_elu);
+    CASE(SQUARE, mkldnn_eltwise_square);
+    CASE(ABS, mkldnn_eltwise_abs);
+    CASE(SQRT, mkldnn_eltwise_sqrt);
+    CASE(LINEAR, mkldnn_eltwise_linear);
+    CASE(BRELU, mkldnn_eltwise_bounded_relu);
+    CASE(SRELU, mkldnn_eltwise_soft_relu);
+    CASE(LOGISTIC, mkldnn_eltwise_logistic);
+#undef CASE
+    assert(!"unknown attr::post_ops::kind");
+    return mkldnn_alg_kind_undef;
+}
+
 int attr_t::post_ops_t::from_str(const char *str, const char **end_s) {
     *this = post_ops_t();
@@ -236,9 +276,26 @@ int attr_t::post_ops_t::from_str(const char *str, const char **end_s) {
             } else {
                 e.sum.scale = 1.f;
             }
-        } else if (k == RELU) {
+        } else {
+            e.eltwise.alg = kind2mkldnn_kind(k);
             e.eltwise.scale = 1.f;
             e.eltwise.alpha = e.eltwise.beta = 0.f;
+
+            for (int i = 0; i < 3; ++i) {
+                // :alpha:beta:scale
+                float &val = i == 0 ? e.eltwise.alpha
+                        : i == 1 ? e.eltwise.beta : e.eltwise.scale;
+                if (*s == ':') {
+                    char *end;
+                    val = strtof(++s, &end);
+                    if (end == s) return FAIL;
+                    s = end;
+                } else {
+                    break;
+                }
+            }
+
+            if (e.eltwise.scale <= 0) return FAIL;
         }

         break;
@@ -265,7 +322,18 @@ void attr_t::post_ops_t::to_str(char *buffer, char **end_b) const {
             buffer += sprintf(buffer, "%s:%g", kind2str(e.kind), e.sum.scale);
             break;
         case RELU:
-            buffer += sprintf(buffer, "%s", kind2str(e.kind));
+        case TANH:
+        case ELU:
+        case SQUARE:
+        case ABS:
+        case SQRT:
+        case LINEAR:
+        case BRELU:
+        case SRELU:
+        case LOGISTIC:
+            buffer += sprintf(buffer, "%s:%g", kind2str(e.kind), e.eltwise.alpha);
+            if (e.eltwise.beta != 0.f || e.eltwise.scale != 1.f)
+                buffer += sprintf(buffer, ":%g:%g", e.eltwise.beta, e.eltwise.scale);
             break;
         default:
             assert(!"unknown kind");
@@ -372,9 +440,17 @@ mkldnn_primitive_attr_t create_mkldnn_attr(const attr_t &attr, int scale_cnt,
             DNN_SAFE_V(mkldnn_post_ops_append_sum(ops, e.sum.scale));
             break;
         case attr_t::post_ops_t::RELU:
+        case attr_t::post_ops_t::TANH:
+        case attr_t::post_ops_t::ELU:
+        case attr_t::post_ops_t::SQUARE:
+        case attr_t::post_ops_t::ABS:
+        case attr_t::post_ops_t::SQRT:
+        case attr_t::post_ops_t::LINEAR:
+        case attr_t::post_ops_t::BRELU:
+        case attr_t::post_ops_t::SRELU:
+        case attr_t::post_ops_t::LOGISTIC:
             DNN_SAFE_V(mkldnn_post_ops_append_eltwise(ops, e.eltwise.scale,
-                        mkldnn_eltwise_relu, e.eltwise.alpha,
-                        e.eltwise.beta));
+                        e.eltwise.alg, e.eltwise.alpha, e.eltwise.beta));
             break;
         default:
             assert(!"unknown attr::post_ops::kind");
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/dnn_types.hpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/dnn_types.hpp
index 7010c98..594ac41 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/dnn_types.hpp
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/dnn_types.hpp
@@ -70,17 +70,19 @@ struct attr_t {
     };

     struct post_ops_t {
-        enum kind_t { SUM, RELU, KIND_TOTAL };
+        enum kind_t { SUM, RELU, TANH, ELU, SQUARE, ABS, SQRT, LINEAR, BRELU,
+            SRELU, LOGISTIC, KIND_TOTAL };

         static kind_t str2kind(const char *str);
         static const char *kind2str(kind_t kind);
+        static mkldnn_alg_kind_t kind2mkldnn_kind(kind_t kind);

         struct entry_t {
             kind_t kind;
             union {
                 struct { float scale; } sum;
                 struct {
-                    // eltwise algorithm in future
-                    float scale, alpha, beta; // unused now
+                    mkldnn_alg_kind_t alg;
+                    float scale, alpha, beta;
                 } eltwise;
             };
         };
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_auto b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_auto
new file mode 100644
index 0000000..aafdbc6
--- /dev/null
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_auto
@@ -0,0 +1,2 @@
+mb2_ic3oc64_ih300kh3oh300ph1n"ssd_300_voc0712:conv1_1"
+mb32_ic3oc64_ih300kh3oh300ph1n"ssd_300_voc0712:conv1_1"
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_fastrcnn_p1 b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_fastrcnn_p1
index 44b9bf7..efbf159 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_fastrcnn_p1
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_fastrcnn_p1
@@ -3,10 +3,10 @@
 mb1_g1ic3oc64_ih606oh300kh7sh2dh0ph0_iw756ow375kw7sw2dw0pw0_n"fastrcnn_p1:conv1"
 mb1_g1ic64oc64_ih150oh150kh1sh1dh0ph0_iw188ow188kw1sw1dw0pw0_n"fastrcnn_p1:conv2"
-mb1_g1ic64oc64_ih150oh150kh3sh1dh0ph1_iw188ow188kw3sw1dw0pw1_n"fastrcnn_p1:conv3"
-mb1_g1ic64oc256_ih150oh150kh1sh1dh0ph0_iw188ow188kw1sw1dw0pw0_n"fastrcnn_p1:conv4"
+mb1_g1ic64oc64_ih150oh150kh3sh1dh0ph1_iw188ow188kw3sw1dw0pw1_n"fastrcnn_p1:conv3*2"
+mb1_g1ic64oc256_ih150oh150kh1sh1dh0ph0_iw188ow188kw1sw1dw0pw0_n"fastrcnn_p1:conv4*3"
 # mb1_g1ic64oc256_ih150oh150kh1sh1dh0ph0_iw188ow188kw1sw1dw0pw0_n"fastrcnn_p1:conv4"
-mb1_g1ic256oc64_ih150oh150kh1sh1dh0ph0_iw188ow188kw1sw1dw0pw0_n"fastrcnn_p1:conv5"
+mb1_g1ic256oc64_ih150oh150kh1sh1dh0ph0_iw188ow188kw1sw1dw0pw0_n"fastrcnn_p1:conv5*2"
 # mb1_g1ic64oc64_ih150oh150kh3sh1dh0ph1_iw188ow188kw3sw1dw0pw1_n"fastrcnn_p1:conv3"
 # mb1_g1ic64oc256_ih150oh150kh1sh1dh0ph0_iw188ow188kw1sw1dw0pw0_n"fastrcnn_p1:conv4"
 # mb1_g1ic256oc64_ih150oh150kh1sh1dh0ph0_iw188ow188kw1sw1dw0pw0_n"fastrcnn_p1:conv5"
@@ -14,9 +14,9 @@ mb1_g1ic64oc64_ih152oh75kh3sh2dh0ph0_iw190ow94kw3sw2dw0pw0_n"fastrcnn_p1:conv6"
 mb1_g1ic64oc256_ih75oh75kh1sh1dh0ph0_iw94ow94kw1sw1dw0pw0_n"fastrcnn_p1:conv7"
 mb1_g1ic256oc128_ih75oh75kh1sh1dh0ph0_iw94ow94kw1sw1dw0pw0_n"fastrcnn_p1:conv8"
 mb1_g1ic256oc512_ih75oh75kh1sh1dh0ph0_iw94ow94kw1sw1dw0pw0_n"fastrcnn_p1:conv9"
-mb1_g1ic128oc128_ih75oh75kh3sh1dh0ph1_iw94ow94kw3sw1dw0pw1_n"fastrcnn_p1:conv10"
-mb1_g1ic128oc512_ih75oh75kh1sh1dh0ph0_iw94ow94kw1sw1dw0pw0_n"fastrcnn_p1:conv11"
-mb1_g1ic512oc128_ih75oh75kh1sh1dh0ph0_iw94ow94kw1sw1dw0pw0_n"fastrcnn_p1:conv12"
+mb1_g1ic128oc128_ih75oh75kh3sh1dh0ph1_iw94ow94kw3sw1dw0pw1_n"fastrcnn_p1:conv10*3"
+mb1_g1ic128oc512_ih75oh75kh1sh1dh0ph0_iw94ow94kw1sw1dw0pw0_n"fastrcnn_p1:conv11*3"
+mb1_g1ic512oc128_ih75oh75kh1sh1dh0ph0_iw94ow94kw1sw1dw0pw0_n"fastrcnn_p1:conv12*3"
 # mb1_g1ic128oc128_ih75oh75kh3sh1dh0ph1_iw94ow94kw3sw1dw0pw1_n"fastrcnn_p1:conv10"
 # mb1_g1ic128oc512_ih75oh75kh1sh1dh0ph0_iw94ow94kw1sw1dw0pw0_n"fastrcnn_p1:conv11"
 # mb1_g1ic512oc128_ih75oh75kh1sh1dh0ph0_iw94ow94kw1sw1dw0pw0_n"fastrcnn_p1:conv12"
@@ -26,10 +26,10 @@ mb1_g1ic512oc128_ih75oh75kh1sh1dh0ph0_iw94ow94kw1sw1dw0pw0_n"fastrcnn_p1:conv12"
 mb1_g1ic128oc128_ih77oh38kh3sh2dh0ph0_iw96ow47kw3sw2dw0pw0_n"fastrcnn_p1:conv13"
 mb1_g1ic128oc512_ih38oh38kh1sh1dh0ph0_iw47ow47kw1sw1dw0pw0_n"fastrcnn_p1:conv14"
 mb1_g1ic512oc256_ih38oh38kh1sh1dh0ph0_iw47ow47kw1sw1dw0pw0_n"fastrcnn_p1:conv15"
-mb1_g1ic256oc256_ih38oh38kh3sh1dh0ph1_iw47ow47kw3sw1dw0pw1_n"fastrcnn_p1:conv16"
-mb1_g1ic256oc1024_ih38oh38kh1sh1dh0ph0_iw47ow47kw1sw1dw0pw0_n"fastrcnn_p1:conv17"
+mb1_g1ic256oc256_ih38oh38kh3sh1dh0ph1_iw47ow47kw3sw1dw0pw1_n"fastrcnn_p1:conv16*6"
+mb1_g1ic256oc1024_ih38oh38kh1sh1dh0ph0_iw47ow47kw1sw1dw0pw0_n"fastrcnn_p1:conv17*6"
 mb1_g1ic512oc1024_ih38oh38kh1sh1dh0ph0_iw47ow47kw1sw1dw0pw0_n"fastrcnn_p1:conv18"
-mb1_g1ic1024oc256_ih38oh38kh1sh1dh0ph0_iw47ow47kw1sw1dw0pw0_n"fastrcnn_p1:conv19"
+mb1_g1ic1024oc256_ih38oh38kh1sh1dh0ph0_iw47ow47kw1sw1dw0pw0_n"fastrcnn_p1:conv19*5"
 # mb1_g1ic256oc256_ih38oh38kh3sh1dh0ph1_iw47ow47kw3sw1dw0pw1_n"fastrcnn_p1:conv16"
 # mb1_g1ic256oc1024_ih38oh38kh1sh1dh0ph0_iw47ow47kw1sw1dw0pw0_n"fastrcnn_p1:conv17"
 # mb1_g1ic1024oc256_ih38oh38kh1sh1dh0ph0_iw47ow47kw1sw1dw0pw0_n"fastrcnn_p1:conv19"
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_fastrcnn_p2 b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_fastrcnn_p2
index 758d70e..d2cc2eb 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_fastrcnn_p2
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_fastrcnn_p2
@@ -1,10 +1,10 @@
 # FastRCNN part 2
-mb64_g1ic1024oc512_ih7oh7kh1sh1dh0ph0_iw7ow7kw1sw1dw0pw0_n"fastrcnn_p2:conv1"
-mb64_g1ic512oc512_ih7oh7kh3sh1dh0ph1_iw7ow7kw3sw1dw0pw1_n"fastrcnn_p2:conv2"
-mb64_g1ic512oc2048_ih7oh7kh1sh1dh0ph0_iw7ow7kw1sw1dw0pw0_n"fastrcnn_p2:conv3"
-mb64_g1ic1024oc2048_ih7oh7kh1sh1dh0ph0_iw7ow7kw1sw1dw0pw0_n"fastrcnn_p2:conv4"
-mb64_g1ic2048oc512_ih7oh7kh1sh1dh0ph0_iw7ow7kw1sw1dw0pw0_n"fastrcnn_p2:conv5"
+mb64_g1ic1024oc512_ih7oh7kh1sh1dh0ph0_iw7ow7kw1sw1dw0pw0_n"fastrcnn_p2:conv1*3"
+mb64_g1ic512oc512_ih7oh7kh3sh1dh0ph1_iw7ow7kw3sw1dw0pw1_n"fastrcnn_p2:conv2*9"
+mb64_g1ic512oc2048_ih7oh7kh1sh1dh0ph0_iw7ow7kw1sw1dw0pw0_n"fastrcnn_p2:conv3*9"
+mb64_g1ic1024oc2048_ih7oh7kh1sh1dh0ph0_iw7ow7kw1sw1dw0pw0_n"fastrcnn_p2:conv4*3"
+mb64_g1ic2048oc512_ih7oh7kh1sh1dh0ph0_iw7ow7kw1sw1dw0pw0_n"fastrcnn_p2:conv5*6"
 # mb64_g1ic512oc512_ih7oh7kh3sh1dh0ph1_iw7ow7kw3sw1dw0pw1_n"fastrcnn_p2:conv2"
 # mb64_g1ic512oc2048_ih7oh7kh1sh1dh0ph0_iw7ow7kw1sw1dw0pw0_n"fastrcnn_p2:conv3"
 # mb64_g1ic2048oc512_ih7oh7kh1sh1dh0ph0_iw7ow7kw1sw1dw0pw0_n"fastrcnn_p2:conv5"
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_googlenet_v1 b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_googlenet_v1
index ec08dcc..248148d 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_googlenet_v1
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_googlenet_v1
@@ -9,7 +9,7 @@ mb96ic96ih28oc128oh28kh3ph1n"googlenet_v1:inception_3a/3x3"
 mb96ic192ih28oc16oh28kh1ph0n"googlenet_v1:inception_3a/5x5_reduce"
 mb96ic16ih28oc32oh28kh5ph2n"googlenet_v1:inception_3a/5x5"
 mb96ic192ih28oc32oh28kh1ph0n"googlenet_v1:inception_3a/pool_proj"
-mb96ic256ih28oc128oh28kh1ph0n"googlenet_v1:inception_3b/1x1"
+mb96ic256ih28oc128oh28kh1ph0n"googlenet_v1:inception_3b/1x1*2"
 # mb96ic256ih28oc128oh28kh1ph0n"googlenet_v1:inception_3b/3x3_reduce" # inception_3b/1x1
 mb96ic128ih28oc192oh28kh3ph1n"googlenet_v1:inception_3b/3x3"
 mb96ic256ih28oc32oh28kh1ph0n"googlenet_v1:inception_3b/5x5_reduce"
@@ -23,12 +23,12 @@ mb96ic16ih14oc48oh14kh5ph2n"googlenet_v1:inception_4a/5x5"
 mb96ic480ih14oc64oh14kh1ph0n"googlenet_v1:inception_4a/pool_proj"
 mb96ic512ih4oc128oh4kh1ph0n"googlenet_v1:loss1/conv"
 mb96ic512ih14oc160oh14kh1ph0n"googlenet_v1:inception_4b/1x1"
-mb96ic512ih14oc112oh14kh1ph0n"googlenet_v1:inception_4b/3x3_reduce"
+mb96ic512ih14oc112oh14kh1ph0n"googlenet_v1:inception_4b/3x3_reduce*2"
 mb96ic112ih14oc224oh14kh3ph1n"googlenet_v1:inception_4b/3x3"
-mb96ic512ih14oc24oh14kh1ph0n"googlenet_v1:inception_4b/5x5_reduce"
-mb96ic24ih14oc64oh14kh5ph2n"googlenet_v1:inception_4b/5x5"
-mb96ic512ih14oc64oh14kh1ph0n"googlenet_v1:inception_4b/pool_proj"
-mb96ic512ih14oc128oh14kh1ph0n"googlenet_v1:inception_4c/1x1"
+mb96ic512ih14oc24oh14kh1ph0n"googlenet_v1:inception_4b/5x5_reduce*2"
+mb96ic24ih14oc64oh14kh5ph2n"googlenet_v1:inception_4b/5x5*2"
+mb96ic512ih14oc64oh14kh1ph0n"googlenet_v1:inception_4b/pool_proj*3"
+mb96ic512ih14oc128oh14kh1ph0n"googlenet_v1:inception_4c/1x1*2"
 # mb96ic512ih14oc128oh14kh1ph0n"googlenet_v1:inception_4c/3x3_reduce" # inception_4c/1x1
 mb96ic128ih14oc256oh14kh3ph1n"googlenet_v1:inception_4c/3x3"
 # mb96ic512ih14oc24oh14kh1ph0n"googlenet_v1:inception_4c/5x5_reduce" # inception_4b/5x5_reduce
@@ -52,7 +52,7 @@ mb96ic832ih7oc160oh7kh1ph0n"googlenet_v1:inception_5a/3x3_reduce"
 mb96ic160ih7oc320oh7kh3ph1n"googlenet_v1:inception_5a/3x3"
 mb96ic832ih7oc32oh7kh1ph0n"googlenet_v1:inception_5a/5x5_reduce"
 mb96ic32ih7oc128oh7kh5ph2n"googlenet_v1:inception_5a/5x5"
-mb96ic832ih7oc128oh7kh1ph0n"googlenet_v1:inception_5a/pool_proj"
+mb96ic832ih7oc128oh7kh1ph0n"googlenet_v1:inception_5a/pool_proj*2"
 mb96ic832ih7oc384oh7kh1ph0n"googlenet_v1:inception_5b/1x1"
 mb96ic832ih7oc192oh7kh1ph0n"googlenet_v1:inception_5b/3x3_reduce"
 mb96ic192ih7oc384oh7kh3ph1n"googlenet_v1:inception_5b/3x3"
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_googlenet_v2 b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_googlenet_v2
index 835970e..caf100a 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_googlenet_v2
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_googlenet_v2
@@ -3,14 +3,14 @@ g1mb96ic3ih224iw224oc64oh112ow112kh7kw7sh2sw2ph3pw3n"googlenet_v2:conv1/7x7_s2"
 mb96ic64ih56oc64oh56kh1ph0n"googlenet_v2:conv2/3x3_reduce"
 mb96ic64ih56oc192oh56kh3ph1n"googlenet_v2:conv2/3x3"
-mb96ic192ih28oc64oh28kh1ph0n"googlenet_v2:inception_3a/1x1"
+mb96ic192ih28oc64oh28kh1ph0n"googlenet_v2:inception_3a/1x1*3"
 # mb96ic192ih28oc64oh28kh1ph0n"googlenet_v2:inception_3a/3x3_reduce" # inception_3a/1x1
 mb96ic64ih28oc64oh28kh3ph1n"googlenet_v2:inception_3a/3x3"
 # mb96ic192ih28oc64oh28kh1ph0n"googlenet_v2:inception_3a/double3x3_reduce" # inception_3a/1x1
-mb96ic64ih28oc96oh28kh3ph1n"googlenet_v2:inception_3a/double3x3a"
-mb96ic96ih28oc96oh28kh3ph1n"googlenet_v2:inception_3a/double3x3b"
+mb96ic64ih28oc96oh28kh3ph1n"googlenet_v2:inception_3a/double3x3a*4"
+mb96ic96ih28oc96oh28kh3ph1n"googlenet_v2:inception_3a/double3x3b*2"
 mb96ic192ih28oc32oh28kh1ph0n"googlenet_v2:inception_3a/pool_proj"
-mb96ic256ih28oc64oh28kh1ph0n"googlenet_v2:inception_3b/1x1"
+mb96ic256ih28oc64oh28kh1ph0n"googlenet_v2:inception_3b/1x1*4"
 # mb96ic256ih28oc64oh28kh1ph0n"googlenet_v2:inception_3b/3x3_reduce" # inception_3b/1x1
 # mb96ic64ih28oc96oh28kh3ph1n"googlenet_v2:inception_3b/3x3" # inception_3a/double3x3a
 # mb96ic256ih28oc64oh28kh1ph0n"googlenet_v2:inception_3b/double3x3_reduce" # inception_3b/1x1
@@ -26,20 +26,20 @@ mb96ic576ih4oc128oh4kh1ph0n"googlenet_v2:loss1/conv"
 mb96ic576ih14oc224oh14kh1ph0n"googlenet_v2:inception_4a/1x1"
 mb96ic576ih14oc64oh14kh1ph0n"googlenet_v2:inception_4a/3x3_reduce"
 mb96ic64ih14oc96oh14kh3ph1n"googlenet_v2:inception_4a/3x3"
-mb96ic576ih14oc96oh14kh1ph0n"googlenet_v2:inception_4a/double3x3_reduce"
-mb96ic96ih14oc128oh14kh3ph1n"googlenet_v2:inception_4a/double3x3a"
-mb96ic128ih14oc128oh14kh3ph1n"googlenet_v2:inception_4a/double3x3b"
-mb96ic576ih14oc128oh14kh1ph0n"googlenet_v2:inception_4a/pool_proj"
-mb96ic576ih14oc192oh14kh1ph0n"googlenet_v2:inception_4b/1x1"
+mb96ic576ih14oc96oh14kh1ph0n"googlenet_v2:inception_4a/double3x3_reduce*6"
+mb96ic96ih14oc128oh14kh3ph1n"googlenet_v2:inception_4a/double3x3a*3"
+mb96ic128ih14oc128oh14kh3ph1n"googlenet_v2:inception_4a/double3x3b*2"
+mb96ic576ih14oc128oh14kh1ph0n"googlenet_v2:inception_4a/pool_proj*6"
+mb96ic576ih14oc192oh14kh1ph0n"googlenet_v2:inception_4b/1x1*2"
 # mb96ic576ih14oc96oh14kh1ph0n"googlenet_v2:inception_4b/3x3_reduce" # inception_4a/double3x3_reduce
 # mb96ic96ih14oc128oh14kh3ph1n"googlenet_v2:inception_4b/3x3" # inception_4a/double3x3a
 # mb96ic576ih14oc96oh14kh1ph0n"googlenet_v2:inception_4b/double3x3_reduce" # inception_4a/double3x3_reduce
 # mb96ic96ih14oc128oh14kh3ph1n"googlenet_v2:inception_4b/double3x3a" # inception_4a/double3x3a
 # mb96ic128ih14oc128oh14kh3ph1n"googlenet_v2:inception_4b/double3x3b" # inception_4a/double3x3b
 # mb96ic576ih14oc128oh14kh1ph0n"googlenet_v2:inception_4b/pool_proj" # inception_4a/pool_proj
-mb96ic576ih14oc160oh14kh1ph0n"googlenet_v2:inception_4c/1x1"
+mb96ic576ih14oc160oh14kh1ph0n"googlenet_v2:inception_4c/1x1*2"
 # mb96ic576ih14oc128oh14kh1ph0n"googlenet_v2:inception_4c/3x3_reduce" # inception_4a/pool_proj
-mb96ic128ih14oc160oh14kh3ph1n"googlenet_v2:inception_4c/3x3"
+mb96ic128ih14oc160oh14kh3ph1n"googlenet_v2:inception_4c/3x3*2"
 # mb96ic576ih14oc128oh14kh1ph0n"googlenet_v2:inception_4c/double3x3_reduce" # inception_4a/pool_proj
 # mb96ic128ih14oc160oh14kh3ph1n"googlenet_v2:inception_4c/double3x3a" # inception_4c/3x3
 mb96ic160ih14oc160oh14kh3ph1n"googlenet_v2:inception_4c/double3x3b"
@@ -57,13 +57,13 @@ g1mb96ic128ih14iw14oc192oh7ow7kh3kw3sh2sw2ph1pw1n"googlenet_v2:inception_4e/3x3"
 mb96ic192ih14oc256oh14kh3ph1n"googlenet_v2:inception_4e/double3x3a"
 g1mb96ic256ih14iw14oc256oh7ow7kh3kw3sh2sw2ph1pw1n"googlenet_v2:inception_4e/double3x3b"
 mb96ic1024ih2oc128oh2kh1ph0n"googlenet_v2:loss2/conv"
-mb96ic1024ih7oc352oh7kh1ph0n"googlenet_v2:inception_5a/1x1"
-mb96ic1024ih7oc192oh7kh1ph0n"googlenet_v2:inception_5a/3x3_reduce"
-mb96ic192ih7oc320oh7kh3ph1n"googlenet_v2:inception_5a/3x3"
+mb96ic1024ih7oc352oh7kh1ph0n"googlenet_v2:inception_5a/1x1*2"
+mb96ic1024ih7oc192oh7kh1ph0n"googlenet_v2:inception_5a/3x3_reduce*3"
+mb96ic192ih7oc320oh7kh3ph1n"googlenet_v2:inception_5a/3x3*2"
 mb96ic1024ih7oc160oh7kh1ph0n"googlenet_v2:inception_5a/double3x3_reduce"
 mb96ic160ih7oc224oh7kh3ph1n"googlenet_v2:inception_5a/double3x3a"
-mb96ic224ih7oc224oh7kh3ph1n"googlenet_v2:inception_5a/double3x3b"
-mb96ic1024ih7oc128oh7kh1ph0n"googlenet_v2:inception_5a/pool_proj"
+mb96ic224ih7oc224oh7kh3ph1n"googlenet_v2:inception_5a/double3x3b*2"
+mb96ic1024ih7oc128oh7kh1ph0n"googlenet_v2:inception_5a/pool_proj*2"
 # mb96ic1024ih7oc352oh7kh1ph0n"googlenet_v2:inception_5b/1x1" # inception_5a/1x1
 # mb96ic1024ih7oc192oh7kh1ph0n"googlenet_v2:inception_5b/3x3_reduce" # inception_5a/3x3_reduce
 # mb96ic192ih7oc320oh7kh3ph1n"googlenet_v2:inception_5b/3x3" # inception_5a/3x3
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_googlenet_v3 b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_googlenet_v3
index f300f77..f71086d 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_googlenet_v3
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_googlenet_v3
@@ -4,21 +4,21 @@ g1mb22ic32ih149iw149oc32oh147ow147kh3kw3ph0pw0sh1sw1n"googlenet_v3:conv_1_1_conv2d"
 g1mb22ic32ih147iw147oc64oh147ow147kh3kw3ph1pw1sh1sw1n"googlenet_v3:conv_2_2_conv2d"
 g1mb22ic64ih73iw73oc80oh73ow73kh1kw1ph0pw0sh1sw1n"googlenet_v3:conv_3_3_conv2d"
 g1mb22ic80ih73iw73oc192oh71ow71kh3kw3ph0pw0sh1sw1n"googlenet_v3:conv_4_4_conv2d"
-g1mb22ic192ih35iw35oc64oh35ow35kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_conv_conv2d"
+g1mb22ic192ih35iw35oc64oh35ow35kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_conv_conv2d*2"
 g1mb22ic192ih35iw35oc48oh35ow35kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_tower_conv_conv2d"
-g1mb22ic48ih35iw35oc64oh35ow35kh5kw5ph2pw2sh1sw1n"googlenet_v3:mixed_tower_conv_1_conv2d"
+g1mb22ic48ih35iw35oc64oh35ow35kh5kw5ph2pw2sh1sw1n"googlenet_v3:mixed_tower_conv_1_conv2d*3"
 # g1mb22ic192ih35iw35oc64oh35ow35kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_tower_1_conv_conv2d"
-g1mb22ic64ih35iw35oc96oh35ow35kh3kw3ph1pw1sh1sw1n"googlenet_v3:mixed_tower_1_conv_1_conv2d"
-g1mb22ic96ih35iw35oc96oh35ow35kh3kw3ph1pw1sh1sw1n"googlenet_v3:mixed_tower_1_conv_2_conv2d"
+g1mb22ic64ih35iw35oc96oh35ow35kh3kw3ph1pw1sh1sw1n"googlenet_v3:mixed_tower_1_conv_1_conv2d*4"
+g1mb22ic96ih35iw35oc96oh35ow35kh3kw3ph1pw1sh1sw1n"googlenet_v3:mixed_tower_1_conv_2_conv2d*3"
 g1mb22ic192ih35iw35oc32oh35ow35kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_tower_2_conv_conv2d"
-g1mb22ic256ih35iw35oc64oh35ow35kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_1_conv_conv2d"
+g1mb22ic256ih35iw35oc64oh35ow35kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_1_conv_conv2d*3"
 g1mb22ic256ih35iw35oc48oh35ow35kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_1_tower_conv_conv2d"
 # g1mb22ic48ih35iw35oc64oh35ow35kh5kw5ph2pw2sh1sw1n"googlenet_v3:mixed_1_tower_conv_1_conv2d"
 # g1mb22ic256ih35iw35oc64oh35ow35kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_1_tower_1_conv_conv2d"
 # g1mb22ic64ih35iw35oc96oh35ow35kh3kw3ph1pw1sh1sw1n"googlenet_v3:mixed_1_tower_1_conv_1_conv2d"
 # g1mb22ic96ih35iw35oc96oh35ow35kh3kw3ph1pw1sh1sw1n"googlenet_v3:mixed_1_tower_1_conv_2_conv2d"
 # g1mb22ic256ih35iw35oc64oh35ow35kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_1_tower_2_conv_conv2d"
-g1mb22ic288ih35iw35oc64oh35ow35kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_2_conv_conv2d"
+g1mb22ic288ih35iw35oc64oh35ow35kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_2_conv_conv2d*4"
 g1mb22ic288ih35iw35oc48oh35ow35kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_2_tower_conv_conv2d"
 # g1mb22ic48ih35iw35oc64oh35ow35kh5kw5ph2pw2sh1sw1n"googlenet_v3:mixed_2_tower_conv_1_conv2d"
 # g1mb22ic288ih35iw35oc64oh35ow35kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_2_tower_1_conv_conv2d"
@@ -29,25 +29,25 @@ g1mb22ic288ih35iw35oc384oh17ow17kh3kw3ph0pw0sh2sw2n"googlenet_v3:mixed_3_conv_co
 # g1mb22ic288ih35iw35oc64oh35ow35kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_3_tower_conv_conv2d"
 # g1mb22ic64ih35iw35oc96oh35ow35kh3kw3ph1pw1sh1sw1n"googlenet_v3:mixed_3_tower_conv_1_conv2d"
 g1mb22ic96ih35iw35oc96oh17ow17kh3kw3ph0pw0sh2sw2n"googlenet_v3:mixed_3_tower_conv_2_conv2d"
-g1mb22ic768ih17iw17oc192oh17ow17kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_4_conv_conv2d"
-g1mb22ic768ih17iw17oc128oh17ow17kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_4_tower_conv_conv2d"
-g1mb22ic128ih17iw17oc128oh17ow17kh1kw7ph0pw3sh1sw1n"googlenet_v3:mixed_4_tower_conv_1_conv2d"
+g1mb22ic768ih17iw17oc192oh17ow17kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_4_conv_conv2d*12"
+g1mb22ic768ih17iw17oc128oh17ow17kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_4_tower_conv_conv2d*2"
+g1mb22ic128ih17iw17oc128oh17ow17kh1kw7ph0pw3sh1sw1n"googlenet_v3:mixed_4_tower_conv_1_conv2d*2"
 g1mb22ic128ih17iw17oc192oh17ow17kh7kw1ph3pw0sh1sw1n"googlenet_v3:mixed_4_tower_conv_2_conv2d"
 # g1mb22ic768ih17iw17oc128oh17ow17kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_4_tower_1_conv_conv2d"
-g1mb22ic128ih17iw17oc128oh17ow17kh7kw1ph3pw0sh1sw1n"googlenet_v3:mixed_4_tower_1_conv_1_conv2d"
+g1mb22ic128ih17iw17oc128oh17ow17kh7kw1ph3pw0sh1sw1n"googlenet_v3:mixed_4_tower_1_conv_1_conv2d*2"
 # g1mb22ic128ih17iw17oc128oh17ow17kh1kw7ph0pw3sh1sw1n"googlenet_v3:mixed_4_tower_1_conv_2_conv2d"
 # g1mb22ic128ih17iw17oc128oh17ow17kh7kw1ph3pw0sh1sw1n"googlenet_v3:mixed_4_tower_1_conv_3_conv2d"
 g1mb22ic128ih17iw17oc192oh17ow17kh1kw7ph0pw3sh1sw1n"googlenet_v3:mixed_4_tower_1_conv_4_conv2d"
 # g1mb22ic768ih17iw17oc192oh17ow17kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_4_tower_2_conv_conv2d"
 # g1mb22ic768ih17iw17oc192oh17ow17kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_5_conv_conv2d"
-g1mb22ic768ih17iw17oc160oh17ow17kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_5_tower_conv_conv2d"
-g1mb22ic160ih17iw17oc160oh17ow17kh1kw7ph0pw3sh1sw1n"googlenet_v3:mixed_5_tower_conv_1_conv2d"
-g1mb22ic160ih17iw17oc192oh17ow17kh7kw1ph3pw0sh1sw1n"googlenet_v3:mixed_5_tower_conv_2_conv2d"
+g1mb22ic768ih17iw17oc160oh17ow17kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_5_tower_conv_conv2d*4"
+g1mb22ic160ih17iw17oc160oh17ow17kh1kw7ph0pw3sh1sw1n"googlenet_v3:mixed_5_tower_conv_1_conv2d*4"
+g1mb22ic160ih17iw17oc192oh17ow17kh7kw1ph3pw0sh1sw1n"googlenet_v3:mixed_5_tower_conv_2_conv2d*2"
 # g1mb22ic768ih17iw17oc160oh17ow17kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_5_tower_1_conv_conv2d"
-g1mb22ic160ih17iw17oc160oh17ow17kh7kw1ph3pw0sh1sw1n"googlenet_v3:mixed_5_tower_1_conv_1_conv2d"
+g1mb22ic160ih17iw17oc160oh17ow17kh7kw1ph3pw0sh1sw1n"googlenet_v3:mixed_5_tower_1_conv_1_conv2d*4"
 # g1mb22ic160ih17iw17oc160oh17ow17kh1kw7ph0pw3sh1sw1n"googlenet_v3:mixed_5_tower_1_conv_2_conv2d"
 # g1mb22ic160ih17iw17oc160oh17ow17kh7kw1ph3pw0sh1sw1n"googlenet_v3:mixed_5_tower_1_conv_3_conv2d"
-g1mb22ic160ih17iw17oc192oh17ow17kh1kw7ph0pw3sh1sw1n"googlenet_v3:mixed_5_tower_1_conv_4_conv2d"
+g1mb22ic160ih17iw17oc192oh17ow17kh1kw7ph0pw3sh1sw1n"googlenet_v3:mixed_5_tower_1_conv_4_conv2d*2"
 # g1mb22ic768ih17iw17oc192oh17ow17kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_5_tower_2_conv_conv2d"
 # g1mb22ic768ih17iw17oc192oh17ow17kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_6_conv_conv2d"
 # g1mb22ic768ih17iw17oc160oh17ow17kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_6_tower_conv_conv2d"
@@ -61,8 +61,8 @@ g1mb22ic160ih17iw17oc192oh17ow17kh1kw7ph0pw3sh1sw1n"googlenet_v3:mixed_5_tower_1
 # g1mb22ic768ih17iw17oc192oh17ow17kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_6_tower_2_conv_conv2d"
 # g1mb22ic768ih17iw17oc192oh17ow17kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_7_conv_conv2d"
 # g1mb22ic768ih17iw17oc192oh17ow17kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_7_tower_conv_conv2d"
-g1mb22ic192ih17iw17oc192oh17ow17kh1kw7ph0pw3sh1sw1n"googlenet_v3:mixed_7_tower_conv_1_conv2d"
-g1mb22ic192ih17iw17oc192oh17ow17kh7kw1ph3pw0sh1sw1n"googlenet_v3:mixed_7_tower_conv_2_conv2d"
+g1mb22ic192ih17iw17oc192oh17ow17kh1kw7ph0pw3sh1sw1n"googlenet_v3:mixed_7_tower_conv_1_conv2d*4"
+g1mb22ic192ih17iw17oc192oh17ow17kh7kw1ph3pw0sh1sw1n"googlenet_v3:mixed_7_tower_conv_2_conv2d*4"
 # g1mb22ic768ih17iw17oc192oh17ow17kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_7_tower_1_conv_conv2d"
 # g1mb22ic192ih17iw17oc192oh17ow17kh7kw1ph3pw0sh1sw1n"googlenet_v3:mixed_7_tower_1_conv_1_conv2d"
 # g1mb22ic192ih17iw17oc192oh17ow17kh1kw7ph0pw3sh1sw1n"googlenet_v3:mixed_7_tower_1_conv_2_conv2d"
@@ -77,10 +77,10 @@ g1mb22ic192ih17iw17oc320oh8ow8kh3kw3ph0pw0sh2sw2n"googlenet_v3:mixed_8_tower_con
 g1mb22ic192ih17iw17oc192oh8ow8kh3kw3ph0pw0sh2sw2n"googlenet_v3:mixed_8_tower_1_conv_3_conv2d"
 g1mb22ic1280ih8iw8oc320oh8ow8kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_9_conv_conv2d"
 g1mb22ic1280ih8iw8oc384oh8ow8kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_9_tower_conv_conv2d"
-g1mb22ic384ih8iw8oc384oh8ow8kh1kw3ph0pw1sh1sw1n"googlenet_v3:mixed_9_tower_mixed_conv_conv2d"
-g1mb22ic384ih8iw8oc384oh8ow8kh3kw1ph1pw0sh1sw1n"googlenet_v3:mixed_9_tower_mixed_conv_1_conv2d"
+g1mb22ic384ih8iw8oc384oh8ow8kh1kw3ph0pw1sh1sw1n"googlenet_v3:mixed_9_tower_mixed_conv_conv2d*4"
+g1mb22ic384ih8iw8oc384oh8ow8kh3kw1ph1pw0sh1sw1n"googlenet_v3:mixed_9_tower_mixed_conv_1_conv2d*4"
 g1mb22ic1280ih8iw8oc448oh8ow8kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_9_tower_1_conv_conv2d"
-g1mb22ic448ih8iw8oc384oh8ow8kh3kw3ph1pw1sh1sw1n"googlenet_v3:mixed_9_tower_1_conv_1_conv2d"
+g1mb22ic448ih8iw8oc384oh8ow8kh3kw3ph1pw1sh1sw1n"googlenet_v3:mixed_9_tower_1_conv_1_conv2d*2"
 # g1mb22ic384ih8iw8oc384oh8ow8kh1kw3ph0pw1sh1sw1n"googlenet_v3:mixed_9_tower_1_mixed_conv_conv2d"
 # g1mb22ic384ih8iw8oc384oh8ow8kh3kw1ph1pw0sh1sw1n"googlenet_v3:mixed_9_tower_1_mixed_conv_1_conv2d"
 g1mb22ic1280ih8iw8oc192oh8ow8kh1kw1ph0pw0sh1sw1n"googlenet_v3:mixed_9_tower_2_conv_conv2d"
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_maskrcnn_p1 b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_maskrcnn_p1
index f7e81f7..5c89307 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_maskrcnn_p1
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_maskrcnn_p1
@@ -1,11 +1,11 @@
 # MaskRCNN part 1
 mb1_g1ic3oc64_ih1030oh512kh7sh2dh0ph0_iw1030ow512kw7sw2dw0pw0_n"masknet_p1:conv1"
-mb1_g1ic64oc256_ih256oh256kh1sh1dh0ph0_iw256ow256kw1sw1dw0pw0_n"masknet_p1:conv2"
+mb1_g1ic64oc256_ih256oh256kh1sh1dh0ph0_iw256ow256kw1sw1dw0pw0_n"masknet_p1:conv2*4"
 mb1_g1ic64oc64_ih256oh256kh1sh1dh0ph0_iw256ow256kw1sw1dw0pw0_n"masknet_p1:conv3"
-mb1_g1ic64oc64_ih256oh256kh3sh1dh0ph1_iw256ow256kw3sw1dw0pw1_n"masknet_p1:conv4"
+mb1_g1ic64oc64_ih256oh256kh3sh1dh0ph1_iw256ow256kw3sw1dw0pw1_n"masknet_p1:conv4*3"
 # mb1_g1ic64oc256_ih256oh256kh1sh1dh0ph0_iw256ow256kw1sw1dw0pw0_n"masknet_p1:conv2"
-mb1_g1ic256oc64_ih256oh256kh1sh1dh0ph0_iw256ow256kw1sw1dw0pw0_n"masknet_p1:conv5"
+mb1_g1ic256oc64_ih256oh256kh1sh1dh0ph0_iw256ow256kw1sw1dw0pw0_n"masknet_p1:conv5*2"
 # mb1_g1ic64oc64_ih256oh256kh3sh1dh0ph1_iw256ow256kw3sw1dw0pw1_n"masknet_p1:conv4"
 # mb1_g1ic64oc256_ih256oh256kh1sh1dh0ph0_iw256ow256kw1sw1dw0pw0_n"masknet_p1:conv2"
 # mb1_g1ic256oc64_ih256oh256kh1sh1dh0ph0_iw256ow256kw1sw1dw0pw0_n"masknet_p1:conv5"
@@ -14,9 +14,9 @@ mb1_g1ic256oc64_ih256oh256kh1sh1dh0ph0_iw256ow256kw1sw1dw0pw0_n"masknet_p1:conv5
 mb1_g1ic256oc128_ih256oh128kh1sh2dh0ph0_iw256ow128kw1sw2dw0pw0_n"masknet_p1:conv6"
 mb1_g1ic256oc256_ih256oh256kh1sh1dh0ph0_iw256ow256kw1sw1dw0pw0_n"masknet_p1:conv7"
 mb1_g1ic256oc512_ih256oh128kh1sh2dh0ph0_iw256ow128kw1sw2dw0pw0_n"masknet_p1:conv8"
-mb1_g1ic128oc128_ih128oh128kh3sh1dh0ph1_iw128ow128kw3sw1dw0pw1_n"masknet_p1:conv9"
-mb1_g1ic128oc512_ih128oh128kh1sh1dh0ph0_iw128ow128kw1sw1dw0pw0_n"masknet_p1:conv10"
-mb1_g1ic512oc128_ih128oh128kh1sh1dh0ph0_iw128ow128kw1sw1dw0pw0_n"masknet_p1:conv11"
+mb1_g1ic128oc128_ih128oh128kh3sh1dh0ph1_iw128ow128kw3sw1dw0pw1_n"masknet_p1:conv9*4"
+mb1_g1ic128oc512_ih128oh128kh1sh1dh0ph0_iw128ow128kw1sw1dw0pw0_n"masknet_p1:conv10*4"
+mb1_g1ic512oc128_ih128oh128kh1sh1dh0ph0_iw128ow128kw1sw1dw0pw0_n"masknet_p1:conv11*3"
 # mb1_g1ic128oc128_ih128oh128kh3sh1dh0ph1_iw128ow128kw3sw1dw0pw1_n"masknet_p1:conv9"
 # mb1_g1ic128oc512_ih128oh128kh1sh1dh0ph0_iw128ow128kw1sw1dw0pw0_n"masknet_p1:conv10"
 # mb1_g1ic512oc128_ih128oh128kh1sh1dh0ph0_iw128ow128kw1sw1dw0pw0_n"masknet_p1:conv11"
@@ -26,11 +26,11 @@ mb1_g1ic512oc128_ih128oh128kh1sh1dh0ph0_iw128ow128kw1sw1dw0pw0_n"masknet_p1:conv
 # mb1_g1ic128oc128_ih128oh128kh3sh1dh0ph1_iw128ow128kw3sw1dw0pw1_n"masknet_p1:conv9"
 # mb1_g1ic128oc512_ih128oh128kh1sh1dh0ph0_iw128ow128kw1sw1dw0pw0_n"masknet_p1:conv10"
 mb1_g1ic512oc256_ih128oh64kh1sh2dh0ph0_iw128ow64kw1sw2dw0pw0_n"masknet_p1:conv12"
-mb1_g1ic256oc256_ih64oh64kh3sh1dh0ph1_iw64ow64kw3sw1dw0pw1_n"masknet_p1:conv13"
-mb1_g1ic256oc1024_ih64oh64kh1sh1dh0ph0_iw64ow64kw1sw1dw0pw0_n"masknet_p1:conv14"
+mb1_g1ic256oc256_ih64oh64kh3sh1dh0ph1_iw64ow64kw3sw1dw0pw1_n"masknet_p1:conv13*24"
+mb1_g1ic256oc1024_ih64oh64kh1sh1dh0ph0_iw64ow64kw1sw1dw0pw0_n"masknet_p1:conv14*23"
 mb1_g1ic512oc256_ih128oh128kh1sh1dh0ph0_iw128ow128kw1sw1dw0pw0_n"masknet_p1:conv15"
 mb1_g1ic512oc1024_ih128oh64kh1sh2dh0ph0_iw128ow64kw1sw2dw0pw0_n"masknet_p1:conv16"
-mb1_g1ic1024oc256_ih64oh64kh1sh1dh0ph0_iw64ow64kw1sw1dw0pw0_n"masknet_p1:conv17"
+mb1_g1ic1024oc256_ih64oh64kh1sh1dh0ph0_iw64ow64kw1sw1dw0pw0_n"masknet_p1:conv17*23"
 # mb1_g1ic256oc256_ih64oh64kh3sh1dh0ph1_iw64ow64kw3sw1dw0pw1_n"masknet_p1:conv13"
 # mb1_g1ic256oc1024_ih64oh64kh1sh1dh0ph0_iw64ow64kw1sw1dw0pw0_n"masknet_p1:conv14"
 # mb1_g1ic1024oc256_ih64oh64kh1sh1dh0ph0_iw64ow64kw1sw1dw0pw0_n"masknet_p1:conv17"
@@ -98,9 +98,9 @@ mb1_g1ic1024oc256_ih64oh64kh1sh1dh0ph0_iw64ow64kw1sw1dw0pw0_n"masknet_p1:conv17"
 # mb1_g1ic256oc1024_ih64oh64kh1sh1dh0ph0_iw64ow64kw1sw1dw0pw0_n"masknet_p1:conv14"
 mb1_g1ic1024oc2048_ih64oh32kh1sh2dh0ph0_iw64ow32kw1sw2dw0pw0_n"masknet_p1:conv18"
 mb1_g1ic1024oc512_ih64oh32kh1sh2dh0ph0_iw64ow32kw1sw2dw0pw0_n"masknet_p1:conv19"
-mb1_g1ic512oc512_ih32oh32kh3sh1dh0ph1_iw32ow32kw3sw1dw0pw1_n"masknet_p1:conv20"
-mb1_g1ic512oc2048_ih32oh32kh1sh1dh0ph0_iw32ow32kw1sw1dw0pw0_n"masknet_p1:conv21"
-mb1_g1ic2048oc512_ih32oh32kh1sh1dh0ph0_iw32ow32kw1sw1dw0pw0_n"masknet_p1:conv22"
+mb1_g1ic512oc512_ih32oh32kh3sh1dh0ph1_iw32ow32kw3sw1dw0pw1_n"masknet_p1:conv20*3"
+mb1_g1ic512oc2048_ih32oh32kh1sh1dh0ph0_iw32ow32kw1sw1dw0pw0_n"masknet_p1:conv21*3"
+mb1_g1ic2048oc512_ih32oh32kh1sh1dh0ph0_iw32ow32kw1sw1dw0pw0_n"masknet_p1:conv22*2"
 # mb1_g1ic512oc512_ih32oh32kh3sh1dh0ph1_iw32ow32kw3sw1dw0pw1_n"masknet_p1:conv20"
 # mb1_g1ic512oc2048_ih32oh32kh1sh1dh0ph0_iw32ow32kw1sw1dw0pw0_n"masknet_p1:conv21"
 # mb1_g1ic2048oc512_ih32oh32kh1sh1dh0ph0_iw32ow32kw1sw1dw0pw0_n"masknet_p1:conv22"
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_maskrcnn_p2 b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_maskrcnn_p2
index 8998eb6..914b181 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_maskrcnn_p2
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_maskrcnn_p2
@@ -2,7 +2,7 @@
 mb1000_g1ic256oc1024_ih7oh1kh7sh1dh0ph0_iw7ow1kw7sw1dw0pw0_n"masknet_p2:conv1"
 mb1000_g1ic1024oc1024_ih1oh1kh1sh1dh0ph0_iw1ow1kw1sw1dw0pw0_n"masknet_p2:conv2"
-mb100_g1ic256oc256_ih14oh14kh3sh1dh0ph1_iw14ow14kw3sw1dw0pw1_n"masknet_p2:conv3"
+mb100_g1ic256oc256_ih14oh14kh3sh1dh0ph1_iw14ow14kw3sw1dw0pw1_n"masknet_p2:conv3*4"
 # mb100_g1ic256oc256_ih14oh14kh3sh1dh0ph1_iw14ow14kw3sw1dw0pw1_n"masknet_p2:conv3"
 # mb100_g1ic256oc256_ih14oh14kh3sh1dh0ph1_iw14ow14kw3sw1dw0pw1_n"masknet_p2:conv3"
 # mb100_g1ic256oc256_ih14oh14kh3sh1dh0ph1_iw14ow14kw3sw1dw0pw1_n"masknet_p2:conv3"
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_mobilenet b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_mobilenet
index 0b425eb..f67143f 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_mobilenet
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_mobilenet
@@ -1,21 +1,21 @@
 # MobileNet
 # according to TF log
-mb32_g1ic3oc32_ih224oh112kh3sh2dh0ph1_iw224ow112kw3sw2dw0pw1_n"mobilenet:conv1"
-mb32_g1ic32oc64_ih112oh112kh1sh1dh0ph0_iw112ow112kw1sw1dw0pw0_n"mobilenet:conv2"
-mb32_g1ic64oc128_ih56oh56kh1sh1dh0ph0_iw56ow56kw1sw1dw0pw0_n"mobilenet:conv3"
-mb32_g1ic128oc128_ih56oh56kh1sh1dh0ph0_iw56ow56kw1sw1dw0pw0_n"mobilenet:conv4"
-mb32_g1ic128oc256_ih28oh28kh1sh1dh0ph0_iw28ow28kw1sw1dw0pw0_n"mobilenet:conv5"
-mb32_g1ic256oc256_ih28oh28kh1sh1dh0ph0_iw28ow28kw1sw1dw0pw0_n"mobilenet:conv6"
-mb32_g1ic256oc512_ih14oh14kh1sh1dh0ph0_iw14ow14kw1sw1dw0pw0_n"mobilenet:conv7"
-mb32_g1ic512oc512_ih14oh14kh1sh1dh0ph0_iw14ow14kw1sw1dw0pw0_n"mobilenet:conv8"
+mb32_g1ic3oc32_ih224oh112kh3sh2dh0ph1_iw224ow112kw3sw2dw0pw1_n"mobilenet:conv1*3"
+mb32_g1ic32oc64_ih112oh112kh1sh1dh0ph0_iw112ow112kw1sw1dw0pw0_n"mobilenet:conv2*4"
+mb32_g1ic64oc128_ih56oh56kh1sh1dh0ph0_iw56ow56kw1sw1dw0pw0_n"mobilenet:conv3*4"
+mb32_g1ic128oc128_ih56oh56kh1sh1dh0ph0_iw56ow56kw1sw1dw0pw0_n"mobilenet:conv4*4"
+mb32_g1ic128oc256_ih28oh28kh1sh1dh0ph0_iw28ow28kw1sw1dw0pw0_n"mobilenet:conv5*4"
+mb32_g1ic256oc256_ih28oh28kh1sh1dh0ph0_iw28ow28kw1sw1dw0pw0_n"mobilenet:conv6*4"
+mb32_g1ic256oc512_ih14oh14kh1sh1dh0ph0_iw14ow14kw1sw1dw0pw0_n"mobilenet:conv7*4"
+mb32_g1ic512oc512_ih14oh14kh1sh1dh0ph0_iw14ow14kw1sw1dw0pw0_n"mobilenet:conv8*20"
 # mb32_g1ic512oc512_ih14oh14kh1sh1dh0ph0_iw14ow14kw1sw1dw0pw0_n"mobilenet:conv8"
 # mb32_g1ic512oc512_ih14oh14kh1sh1dh0ph0_iw14ow14kw1sw1dw0pw0_n"mobilenet:conv8"
 # mb32_g1ic512oc512_ih14oh14kh1sh1dh0ph0_iw14ow14kw1sw1dw0pw0_n"mobilenet:conv8"
 # mb32_g1ic512oc512_ih14oh14kh1sh1dh0ph0_iw14ow14kw1sw1dw0pw0_n"mobilenet:conv8"
-mb32_g1ic512oc1024_ih7oh7kh1sh1dh0ph0_iw7ow7kw1sw1dw0pw0_n"mobilenet:conv9"
-mb32_g1ic1024oc1024_ih7oh7kh1sh1dh0ph0_iw7ow7kw1sw1dw0pw0_n"mobilenet:conv10"
-mb32_g1ic1024oc5_ih1oh1kh1sh1dh0ph0_iw1ow1kw1sw1dw0pw0_n"mobilenet:conv11"
+mb32_g1ic512oc1024_ih7oh7kh1sh1dh0ph0_iw7ow7kw1sw1dw0pw0_n"mobilenet:conv9*4"
+mb32_g1ic1024oc1024_ih7oh7kh1sh1dh0ph0_iw7ow7kw1sw1dw0pw0_n"mobilenet:conv10*4"
+mb32_g1ic1024oc5_ih1oh1kh1sh1dh0ph0_iw1ow1kw1sw1dw0pw0_n"mobilenet:conv11*4"
 # mb32_g1ic3oc32_ih224oh112kh3sh2dh0ph1_iw224ow112kw3sw2dw0pw1_n"mobilenet:conv1"
 # mb32_g1ic32oc64_ih112oh112kh1sh1dh0ph0_iw112ow112kw1sw1dw0pw0_n"mobilenet:conv2"
 # mb32_g1ic64oc128_ih56oh56kh1sh1dh0ph0_iw56ow56kw1sw1dw0pw0_n"mobilenet:conv3"
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_mobilenet_dw b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_mobilenet_dw
index 418496a..433896a 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_mobilenet_dw
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_mobilenet_dw
@@ -9,4 +9,3 @@ g256mb1ic256ih28iw28oc256oh14ow14kh3kw3sh2sw2ph1pw1n"mobilenet:conv4_2/dw"
 g512mb1ic512ih14iw14oc512oh14ow14kh3kw3sh1sw1ph1pw1n"mobilenet:conv5_1/dw"
 g512mb1ic512ih14iw14oc512oh7ow7kh3kw3sh2sw2ph1pw1n"mobilenet:conv5_6/dw"
 g1024mb1ic1024ih7iw7oc1024oh7ow7kh3kw3sh1sw1ph1pw1n"mobilenet:conv6/dw"
-
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_regression_gemm b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_regression_gemm
new file mode 100644
index 0000000..8863cf3
--- /dev/null
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_regression_gemm
@@ -0,0 +1,6 @@
+# ResNext50
+mb2_g32ic128oc128_ih56oh56kh3sh1dh0ph1_iw56ow56kw3sw1dw0pw1
+
+# Faster RCNN
+mb1_g64ic256oc256_ih240oh240kh3sh1dh0ph1_iw352ow352kw3sw1dw0pw1
+
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_resnet_50 b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_resnet_50
index 0432eda..f946de3 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_resnet_50
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_resnet_50
@@ -1,11 +1,11 @@
 # resnet_50
 g1mb50ic3ih224iw224oc64oh112ow112kh7kw7sh2sw2ph3pw3n"resnet_50:conv1"
-mb50ic64ih56oc256oh56kh1ph0n"resnet_50:res2a_branch1"
+mb50ic64ih56oc256oh56kh1ph0n"resnet_50:res2a_branch1*4"
 mb50ic64ih56oc64oh56kh1ph0n"resnet_50:res2a_branch2a"
-mb50ic64ih56oc64oh56kh3ph1n"resnet_50:res2a_branch2b"
+mb50ic64ih56oc64oh56kh3ph1n"resnet_50:res2a_branch2b*3"
 # mb50ic64ih56oc256oh56kh1ph0n"resnet_50:res2a_branch2c" # conv1
-mb50ic256ih56oc64oh56kh1ph0n"resnet_50:res2b_branch2a"
+mb50ic256ih56oc64oh56kh1ph0n"resnet_50:res2b_branch2a*2"
 # mb50ic64ih56oc64oh56kh3ph1n"resnet_50:res2b_branch2b" # res2a_branch2b
 # mb50ic64ih56oc256oh56kh1ph0n"resnet_50:res2b_branch2c" # conv1
 # mb50ic256ih56oc64oh56kh1ph0n"resnet_50:res2c_branch2a" # res2b_branch2a
@@ -13,9 +13,9 @@ mb50ic256ih56oc64oh56kh1ph0n"resnet_50:res2b_branch2a"
 # mb50ic64ih56oc256oh56kh1ph0n"resnet_50:res2c_branch2c" # conv1
 g1mb50ic256ih56iw56oc512oh28ow28kh1kw1sh2sw2ph0pw0n"resnet_50:res3a_branch1"
 g1mb50ic256ih56iw56oc128oh28ow28kh1kw1sh2sw2ph0pw0n"resnet_50:res3a_branch2a"
-mb50ic128ih28oc128oh28kh3ph1n"resnet_50:res3a_branch2b"
-mb50ic128ih28oc512oh28kh1ph0n"resnet_50:res3a_branch2c"
-mb50ic512ih28oc128oh28kh1ph0n"resnet_50:res3b_branch2a"
+mb50ic128ih28oc128oh28kh3ph1n"resnet_50:res3a_branch2b*4"
+mb50ic128ih28oc512oh28kh1ph0n"resnet_50:res3a_branch2c*4"
+mb50ic512ih28oc128oh28kh1ph0n"resnet_50:res3b_branch2a*3"
 # mb50ic128ih28oc128oh28kh3ph1n"resnet_50:res3b_branch2b" # res3a_branch2b
 # mb50ic128ih28oc512oh28kh1ph0n"resnet_50:res3b_branch2c" # res3a_branch2c
 # mb50ic512ih28oc128oh28kh1ph0n"resnet_50:res3c_branch2a" # res3b_branch2a
@@ -26,9 +26,9 @@ mb50ic512ih28oc128oh28kh1ph0n"resnet_50:res3b_branch2a"
 # mb50ic128ih28oc512oh28kh1ph0n"resnet_50:res3d_branch2c" # res3a_branch2c
 g1mb50ic512ih28iw28oc1024oh14ow14kh1kw1sh2sw2ph0pw0n"resnet_50:res4a_branch1"
 g1mb50ic512ih28iw28oc256oh14ow14kh1kw1sh2sw2ph0pw0n"resnet_50:res4a_branch2a"
-mb50ic256ih14oc256oh14kh3ph1n"resnet_50:res4a_branch2b"
-mb50ic256ih14oc1024oh14kh1ph0n"resnet_50:res4a_branch2c"
-mb50ic1024ih14oc256oh14kh1ph0n"resnet_50:res4b_branch2a"
+mb50ic256ih14oc256oh14kh3ph1n"resnet_50:res4a_branch2b*6"
+mb50ic256ih14oc1024oh14kh1ph0n"resnet_50:res4a_branch2c*6"
+mb50ic1024ih14oc256oh14kh1ph0n"resnet_50:res4b_branch2a*5"
 # mb50ic256ih14oc256oh14kh3ph1n"resnet_50:res4b_branch2b" # res4a_branch2b
 # mb50ic256ih14oc1024oh14kh1ph0n"resnet_50:res4b_branch2c" # res4a_branch2c
 # mb50ic1024ih14oc256oh14kh1ph0n"resnet_50:res4c_branch2a" # res4b_branch2a
@@ -45,9 +45,9 @@ mb50ic1024ih14oc256oh14kh1ph0n"resnet_50:res4b_branch2a"
 # mb50ic256ih14oc1024oh14kh1ph0n"resnet_50:res4f_branch2c" # res4a_branch2c
 g1mb50ic1024ih14iw14oc2048oh7ow7kh1kw1sh2sw2ph0pw0n"resnet_50:res5a_branch1"
 g1mb50ic1024ih14iw14oc512oh7ow7kh1kw1sh2sw2ph0pw0n"resnet_50:res5a_branch2a"
-mb50ic512ih7oc512oh7kh3ph1n"resnet_50:res5a_branch2b"
-mb50ic512ih7oc2048oh7kh1ph0n"resnet_50:res5a_branch2c"
-mb50ic2048ih7oc512oh7kh1ph0n"resnet_50:res5b_branch2a"
+mb50ic512ih7oc512oh7kh3ph1n"resnet_50:res5a_branch2b*3"
+mb50ic512ih7oc2048oh7kh1ph0n"resnet_50:res5a_branch2c*3"
+mb50ic2048ih7oc512oh7kh1ph0n"resnet_50:res5b_branch2a*2"
 # mb50ic512ih7oc512oh7kh3ph1n"resnet_50:res5b_branch2b" # res5a_branch2b
 # mb50ic512ih7oc2048oh7kh1ph0n"resnet_50:res5b_branch2c" # res5a_branch2c
 # mb50ic2048ih7oc512oh7kh1ph0n"resnet_50:res5c_branch2a" # res5b_branch2a
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_ssd_mobilenet b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_ssd_mobilenet
new file mode 100644
index 0000000..134cf3a
--- /dev/null
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_ssd_mobilenet
@@ -0,0 +1,11 @@
+# ssd_mobilenet
+
+mb12_g1024ic1024oc1024_ih10oh10kh3sh1dh0ph1_iw10ow10kw3sw1dw0pw1n"conv_1:ssd_mobilenet_dw"
+mb12_g512ic512oc512_ih19oh10kh3sh2dh0ph1_iw19ow10kw3sw2dw0pw1n"conv_2:ssd_mobilenet_dw"
+mb12_g512ic512oc512_ih19oh19kh3sh1dh0ph1_iw19ow19kw3sw1dw0pw1n"conv_3:ssd_mobilenet_dw"
+mb12_g256ic256oc256_ih38oh19kh3sh2dh0ph0_iw38ow19kw3sw2dw0pw0n"conv_4:ssd_mobilenet_dw"
+mb12_g256ic256oc256_ih38oh38kh3sh1dh0ph1_iw38ow38kw3sw1dw0pw1n"conv_5:ssd_mobilenet_dw"
+mb12_g128ic128oc128_ih75oh38kh3sh2dh0ph1_iw75ow38kw3sw2dw0pw1n"conv_6:ssd_mobilenet_dw"
+mb12_g128ic128oc128_ih75oh75kh3sh1dh0ph1_iw75ow75kw3sw1dw0pw1n"conv_7:ssd_mobilenet_dw"
+mb12_g64ic64oc64_ih150oh75kh3sh2dh0ph0_iw150ow75kw3sw2dw0pw0n"conv_8:ssd_mobilenet_dw"
+mb12_g32ic32oc32_ih150oh150kh3sh1dh0ph1_iw150ow150kw3sw1dw0pw1n"conv_9:ssd_mobilenet_dw"
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_tails b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_tails
index 7d8b0fd..d338562 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_tails
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_tails
@@ -24,6 +24,8 @@ ic25oc24_ih13oh12kh3ph0_n"tails_conv:17"
 ic27oc30_ih13oh13kh3ph1_n"tails_conv:18"
 ic28oc20_ih13oh12kh3ph0_n"tails_conv:19"
 ic29oc65_ih13oh13kh3ph1_n"tails_conv:20"
+g64ic512oc512_ih240oh120kh3sh2dh0ph1_n"tails_conv:21"
+g128ic512oc512_ih240oh120kh3sh2dh0ph1_n"tails_conv:22"

 # conv 1x1
 ic32oc13_ih13oh13kh1ph0_n"tails_conv_1x1:1"
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_vgg_19 b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_vgg_19
index 738f7c5..e65ae89 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_vgg_19
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_vgg_19
@@ -5,14 +5,14 @@ mb64ic64ih224oc64oh224kh3ph1n"vgg_19:conv1_2"
 mb64ic64ih112oc128oh112kh3ph1n"vgg_19:conv2_1"
 mb64ic128ih112oc128oh112kh3ph1n"vgg_19:conv2_2"
 mb64ic128ih56oc256oh56kh3ph1n"vgg_19:conv3_1"
-mb64ic256ih56oc256oh56kh3ph1n"vgg_19:conv3_2"
+mb64ic256ih56oc256oh56kh3ph1n"vgg_19:conv3_2*3"
 # mb64ic256ih56oc256oh56kh3ph1n"vgg_19:conv3_3" # conv3_2
 # mb64ic256ih56oc256oh56kh3ph1n"vgg_19:conv3_4" # conv3_2
 mb64ic256ih28oc512oh28kh3ph1n"vgg_19:conv4_1"
-mb64ic512ih28oc512oh28kh3ph1n"vgg_19:conv4_2"
+mb64ic512ih28oc512oh28kh3ph1n"vgg_19:conv4_2*3"
 # mb64ic512ih28oc512oh28kh3ph1n"vgg_19:conv4_3" # conv4_2
 # mb64ic512ih28oc512oh28kh3ph1n"vgg_19:conv4_4" # conv4_2
-mb64ic512ih14oc512oh14kh3ph1n"vgg_19:conv5_1"
+mb64ic512ih14oc512oh14kh3ph1n"vgg_19:conv5_1*4"
 # mb64ic512ih14oc512oh14kh3ph1n"vgg_19:conv5_2" # conv5_2
 # mb64ic512ih14oc512oh14kh3ph1n"vgg_19:conv5_3" # conv5_2
 # mb64ic512ih14oc512oh14kh3ph1n"vgg_19:conv5_4" # conv5_2
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_yolov2 b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_yolov2
index 90b027c..8174d76 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_yolov2
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_yolov2
@@ -1,28 +1,28 @@
 # Yolo v2
-mb16_g1ic3oc32_ih610oh608kh3sh1dh0ph0_iw610ow608kw3sw1dw0pw0_n"yolov2:conv1"
-mb16_g1ic32oc64_ih306oh304kh3sh1dh0ph0_iw306ow304kw3sw1dw0pw0_n"yolov2:conv2"
-mb16_g1ic64oc128_ih154oh152kh3sh1dh0ph0_iw154ow152kw3sw1dw0pw0_n"yolov2:conv3"
-mb16_g1ic128oc64_ih152oh152kh1sh1dh0ph0_iw152ow152kw1sw1dw0pw0_n"yolov2:conv4"
+mb16_g1ic3oc32_ih610oh608kh3sh1dh0ph0_iw610ow608kw3sw1dw0pw0_n"yolov2:conv1*6"
+mb16_g1ic32oc64_ih306oh304kh3sh1dh0ph0_iw306ow304kw3sw1dw0pw0_n"yolov2:conv2*9"
+mb16_g1ic64oc128_ih154oh152kh3sh1dh0ph0_iw154ow152kw3sw1dw0pw0_n"yolov2:conv3*18"
+mb16_g1ic128oc64_ih152oh152kh1sh1dh0ph0_iw152ow152kw1sw1dw0pw0_n"yolov2:conv4*9"
 # mb16_g1ic64oc128_ih154oh152kh3sh1dh0ph0_iw154ow152kw3sw1dw0pw0_n"yolov2:conv3"
-mb16_g1ic128oc256_ih78oh76kh3sh1dh0ph0_iw78ow76kw3sw1dw0pw0_n"yolov2:conv5"
-mb16_g1ic256oc128_ih76oh76kh1sh1dh0ph0_iw76ow76kw1sw1dw0pw0_n"yolov2:conv6"
+mb16_g1ic128oc256_ih78oh76kh3sh1dh0ph0_iw78ow76kw3sw1dw0pw0_n"yolov2:conv5*18"
+mb16_g1ic256oc128_ih76oh76kh1sh1dh0ph0_iw76ow76kw1sw1dw0pw0_n"yolov2:conv6*9"
 # mb16_g1ic128oc256_ih78oh76kh3sh1dh0ph0_iw78ow76kw3sw1dw0pw0_n"yolov2:conv5"
-mb16_g1ic256oc512_ih40oh38kh3sh1dh0ph0_iw40ow38kw3sw1dw0pw0_n"yolov2:conv7"
-mb16_g1ic512oc256_ih38oh38kh1sh1dh0ph0_iw38ow38kw1sw1dw0pw0_n"yolov2:conv8"
+mb16_g1ic256oc512_ih40oh38kh3sh1dh0ph0_iw40ow38kw3sw1dw0pw0_n"yolov2:conv7*27"
+mb16_g1ic512oc256_ih38oh38kh1sh1dh0ph0_iw38ow38kw1sw1dw0pw0_n"yolov2:conv8*18"
 # mb16_g1ic256oc512_ih40oh38kh3sh1dh0ph0_iw40ow38kw3sw1dw0pw0_n"yolov2:conv7"
 # mb16_g1ic512oc256_ih38oh38kh1sh1dh0ph0_iw38ow38kw1sw1dw0pw0_n"yolov2:conv8"
 # mb16_g1ic256oc512_ih40oh38kh3sh1dh0ph0_iw40ow38kw3sw1dw0pw0_n"yolov2:conv7"
-mb16_g1ic512oc1024_ih21oh19kh3sh1dh0ph0_iw21ow19kw3sw1dw0pw0_n"yolov2:conv9"
-mb16_g1ic1024oc512_ih19oh19kh1sh1dh0ph0_iw19ow19kw1sw1dw0pw0_n"yolov2:conv10"
+mb16_g1ic512oc1024_ih21oh19kh3sh1dh0ph0_iw21ow19kw3sw1dw0pw0_n"yolov2:conv9*27"
+mb16_g1ic1024oc512_ih19oh19kh1sh1dh0ph0_iw19ow19kw1sw1dw0pw0_n"yolov2:conv10*18"
 # mb16_g1ic512oc1024_ih21oh19kh3sh1dh0ph0_iw21ow19kw3sw1dw0pw0_n"yolov2:conv9"
 # mb16_g1ic1024oc512_ih19oh19kh1sh1dh0ph0_iw19ow19kw1sw1dw0pw0_n"yolov2:conv10"
 # mb16_g1ic512oc1024_ih21oh19kh3sh1dh0ph0_iw21ow19kw3sw1dw0pw0_n"yolov2:conv9"
-mb16_g1ic1024oc1024_ih21oh19kh3sh1dh0ph0_iw21ow19kw3sw1dw0pw0_n"yolov2:conv11"
+mb16_g1ic1024oc1024_ih21oh19kh3sh1dh0ph0_iw21ow19kw3sw1dw0pw0_n"yolov2:conv11*18"
 # mb16_g1ic1024oc1024_ih21oh19kh3sh1dh0ph0_iw21ow19kw3sw1dw0pw0_n"yolov2:conv11"
-mb16_g1ic512oc64_ih38oh38kh1sh1dh0ph0_iw38ow38kw1sw1dw0pw0_n"yolov2:conv12"
-mb16_g1ic1280oc1024_ih21oh19kh3sh1dh0ph0_iw21ow19kw3sw1dw0pw0_n"yolov2:conv13"
-mb16_g1ic1024oc425_ih19oh19kh1sh1dh0ph0_iw19ow19kw1sw1dw0pw0_n"yolov2:conv14"
+mb16_g1ic512oc64_ih38oh38kh1sh1dh0ph0_iw38ow38kw1sw1dw0pw0_n"yolov2:conv12*9"
+mb16_g1ic1280oc1024_ih21oh19kh3sh1dh0ph0_iw21ow19kw3sw1dw0pw0_n"yolov2:conv13*9"
+mb16_g1ic1024oc425_ih19oh19kh1sh1dh0ph0_iw19ow19kw1sw1dw0pw0_n"yolov2:conv14*9"
 # mb16_g1ic1024oc425_ih19oh19kh1sh1dh0ph0_iw19ow19kw1sw1dw0pw0_n"yolov2:conv14"
 # mb16_g1ic1024oc425_ih19oh19kh1sh1dh0ph0_iw19ow19kw1sw1dw0pw0_n"yolov2:conv14"
 # mb16_g1ic1280oc1024_ih21oh19kh3sh1dh0ph0_iw21ow19kw3sw1dw0pw0_n"yolov2:conv13"
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/deconv_1x1 b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/deconv_1x1
new file mode 100644
index 0000000..c07c179
--- /dev/null
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/deconv_1x1
@@ -0,0 +1,33 @@
+# 1x1 2d deconv
+mb96ic64ih56oc64oh56kh1ph0n"googlenet_v1:conv2/3x3_reduce"
+mb96ic64ih28oc192oh28kh1ph0n"googlenet_v1:inception_3a/1x1"
+mb96ic96ih28oc192oh28kh1ph0n"googlenet_v1:inception_3a/3x3_reduce"
+mb96ic16ih28oc192oh28kh1ph0n"googlenet_v1:inception_3a/5x5_reduce"
+mb96ic32ih28oc192oh28kh1ph0n"googlenet_v1:inception_3a/pool_proj"
+mb96ic128ih28oc256oh28kh1ph0n"googlenet_v1:inception_3b/1x1"
+mb96ic32ih28oc256oh28kh1ph0n"googlenet_v1:inception_3b/5x5_reduce"
+mb96ic64ih28oc256oh28kh1ph0n"googlenet_v1:inception_3b/pool_proj"
+mb96ic192ih14oc480oh14kh1ph0n"googlenet_v1:inception_4a/1x1"
+mb96ic96ih14oc480oh14kh1ph0n"googlenet_v1:inception_4a/3x3_reduce"
+mb96ic16ih14oc480oh14kh1ph0n"googlenet_v1:inception_4a/5x5_reduce"
+mb96ic64ih14oc480oh14kh1ph0n"googlenet_v1:inception_4a/pool_proj"
+mb96ic128ih4oc512oh4kh1ph0n"googlenet_v1:loss1/conv"
+mb96ic160ih14oc512oh14kh1ph0n"googlenet_v1:inception_4b/1x1"
+mb96ic112ih14oc512oh14kh1ph0n"googlenet_v1:inception_4b/3x3_reduce"
+mb96ic24ih14oc512oh14kh1ph0n"googlenet_v1:inception_4b/5x5_reduce"
+mb96ic64ih14oc512oh14kh1ph0n"googlenet_v1:inception_4b/pool_proj"
+mb96ic128ih14oc512oh14kh1ph0n"googlenet_v1:inception_4c/1x1"
+mb96ic144ih14oc512oh14kh1ph0n"googlenet_v1:inception_4d/3x3_reduce"
+mb96ic32ih14oc512oh14kh1ph0n"googlenet_v1:inception_4d/5x5_reduce"
+mb96ic128ih4oc528oh4kh1ph0n"googlenet_v1:loss2/conv"
+mb96ic256ih14oc528oh14kh1ph0n"googlenet_v1:inception_4e/1x1"
+mb96ic160ih14oc528oh14kh1ph0n"googlenet_v1:inception_4e/3x3_reduce"
+mb96ic32ih14oc528oh14kh1ph0n"googlenet_v1:inception_4e/5x5_reduce"
+mb96ic128ih14oc528oh14kh1ph0n"googlenet_v1:inception_4e/pool_proj"
+mb96ic256ih7oc832oh7kh1ph0n"googlenet_v1:inception_5a/1x1"
+mb96ic160ih7oc832oh7kh1ph0n"googlenet_v1:inception_5a/3x3_reduce"
+mb96ic32ih7oc832oh7kh1ph0n"googlenet_v1:inception_5a/5x5_reduce"
+mb96ic128ih7oc832oh7kh1ph0n"googlenet_v1:inception_5a/pool_proj"
+mb96ic384ih7oc832oh7kh1ph0n"googlenet_v1:inception_5b/1x1"
+mb96ic192ih7oc832oh7kh1ph0n"googlenet_v1:inception_5b/3x3_reduce"
+mb96ic48ih7oc832oh7kh1ph0n"googlenet_v1:inception_5b/5x5_reduce"
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv_2d b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/deconv_2d
similarity index 92%
rename from inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv_2d
rename to inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/deconv_2d
index 5c04359..bee5de1 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv_2d
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/deconv_2d
@@ -55,3 +55,9 @@ mb96ic192ih7oc832oh7kh1ph0n"googlenet_v1:inception_5b/3x3_reduce"
 mb96ic384ih7oc192oh7kh3ph1n"googlenet_v1:inception_5b/3x3"
 mb96ic48ih7oc832oh7kh1ph0n"googlenet_v1:inception_5b/5x5_reduce"
 mb96ic128ih7oc48oh7kh5ph2n"googlenet_v1:inception_5b/5x5"
+
+mb1_g1oc3ic64_oh1030ih512kh7sh2dh0ph0_ow1030iw512kw7sw2dw0pw0_n"masknet_p1:deconv1"
+g1mb50_oc512oh56ow56_ic256ih28iw28_kh1kw1sh2sw2ph0pw0n"resnet_50:res3a_branch1"
+
+ic8ih1iw5oc8oh1ow2kh1kw3ph0pw3dh0dw2n"deconv1d:1"
+ic8ih5iw1oc8oh2ow1kh3kw1ph3pw0dh2dw0n"deconv1d:2"
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv_3d b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/deconv_3d
similarity index 100%
rename from inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv_3d
rename to inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/deconv_3d
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv_all b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/deconv_all
similarity index 61%
rename from inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv_all
rename to inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/deconv_all
index f2d02a4..e198bde 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv_all
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/deconv_all
@@ -1,3 +1,3 @@
 --batch=deconv_3d
 --batch=deconv_2d
---batch=dilated_deconv
\ No newline at end of file
+--batch=deconv_dilated
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/dilated_deconv b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/deconv_dilated
similarity index 100%
rename from inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/dilated_deconv
rename to inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/deconv_dilated
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/test_deconv_1x1 b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/test_deconv_1x1
new file mode 100644
index 0000000..e7687c4
--- /dev/null
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/test_deconv_1x1
@@ -0,0 +1,24 @@
+--cfg=u8s8u8s32
+--batch=deconv_1x1
+
+--cfg=s8s8u8s32
+--batch=deconv_1x1
+
+--cfg=u8s8s8s32
+--batch=deconv_1x1
+
+--cfg=s8s8s8s32
+--batch=deconv_1x1
+
+--cfg=u8s8s32s32
+--batch=deconv_1x1
+
+--cfg=s8s8s32s32
+--batch=deconv_1x1
+
+--cfg=u8s8f32s32
+--batch=deconv_1x1
+
+--cfg=s8s8f32s32
+--batch=deconv_1x1
+
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/test_deconv_all b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/test_deconv_all
new file mode 100644
index 0000000..b120295
--- /dev/null
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deconv/test_deconv_all
@@ -0,0 +1,30 @@
+# f32
+--reset --skip-impl=ref
+--mb=2 --cfg=f32
+
+--dir=FWD_B --batch=deconv_all
+--dir=BWD_D --batch=deconv_all
+--dir=BWD_W --batch=deconv_all
+--dir=BWD_WB --batch=deconv_all
+
+# int8
+--reset --skip-impl=ref --allow-unimpl=true
+--mb=2 --dir=FWD_B
+
+--attr=irmode=down;oscale=per_oc:2.25;
+--cfg=u8s8u8s32 --batch=deconv_2d --batch=deconv_dilated
+--cfg=s8s8u8s32 --batch=deconv_2d --batch=deconv_dilated
+
+--attr=irmode=nearest;oscale=common:2.25;
+--cfg=u8s8s8s32 --batch=deconv_2d
+--cfg=u8s8s32s32 --batch=deconv_2d
+--cfg=s8s8u8s32 --batch=deconv_2d
+
+--attr=irmode=nearest;oscale=none;
+--cfg=s8s8s8s32 --batch=deconv_2d
+--cfg=s8s8s32s32 --batch=deconv_2d
+
+# 1x1 int8
+--reset --mb=2 --dir=FWD_B --allow-unimpl=true
+--attr=irmode=down;oscale=per_oc:2.25;post_ops='sum:1.5;relu' --batch=test_deconv_1x1
+--attr=irmode=nearest;oscale=common:2.25;post_ops='sum:1.5' --batch=test_deconv_1x1
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deepbench b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deepbench
index 5256c75..afb663c 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deepbench
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/deepbench
@@ -12,7 +12,7 @@
 --dir=BWD_W --batch=deepbench_inference_server
 --dir=BWD_W --batch=deepbench_training

---merge=RELU
+--attr=post_ops='relu'

 --dir=FWD_B --batch=deepbench_inference_device
 --dir=FWD_B --batch=deepbench_inference_server
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/ip/ip_all b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/ip/ip_all
index 085fdfe..19c48ec 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/ip/ip_all
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/ip/ip_all
@@ -2,11 +2,14 @@ mb112ic2048ih1iw1oc1000n"resnet:ip1"
 mb128ic128ih4oc1024n"googlenet_v1:ip1"
 mb128ic1024ih1oc1000n"googlenet_v1:ip2"
 mb224ic2048ih1oc1000n"inceptionv3:ip1"
-mb64ic2048ih1oc1000n"resnet_sparse:ip2"
+mb64ic2048ih1oc1000n"resnet_sparse:ip1"
 mb64ic512ih7iw7oc4096n"VGG16:ip1"
 mb64ic4096ih1iw1oc4096n"VGG16:ip2"
 mb64ic4096ih1iw1oc81n"VGG16:ip3"
 mb64ic4096ih1iw1oc324n"VGG16:ip4"
-mb32ic64id2ih3iw3oc1000n"wip_3d:1"
-mb32ic512id5ih5iw5oc1000n"wip_3d:2"
-mb256ic128id5ih5iw5oc128n"wip_3d:3"
+mb32ic64id2ih3iw3oc1000n"wip_3d:ip1"
+mb32ic512id5ih5iw5oc1000n"wip_3d:ip2"
+mb256ic128id5ih5iw5oc128n"wip_3d:ip3"
+mb1024ic845iw1ih1oc1024n"WD:ip1"
+mb1024ic1024iw1ih1oc1024n"WD:ip2"
+mb1024ic512iw1ih1oc256n"WD:ip3"
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/ip/test_ip_all b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/ip/test_ip_all
index e59a669..a99582d 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/ip/test_ip_all
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/ip/test_ip_all
@@ -9,8 +9,12 @@
 --cfg=u8s8u8s32 --batch=ip_all
 --cfg=u8s8s8s32 --batch=ip_all
 --cfg=u8s8s32s32 --batch=ip_all
+--cfg=s8s8u8s32 --batch=ip_all
+--cfg=s8s8s8s32 --batch=ip_all
+--cfg=s8s8s32s32 --batch=ip_all

 # relu
 --reset --dir=FWD_B --mb=2 --attr=post_ops='relu' --batch=ip_all
 --cfg=u8s8s32s32 --batch=ip_all
+--cfg=s8s8s32s32 --batch=ip_all
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/reorder/test_default b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/reorder/test_default
index f7b5fef..339ae1a 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/reorder/test_default
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/reorder/test_default
@@ -8,6 +8,9 @@
 --fmt=oihw,hwio 2x64x3x3
 --fmt=goihw,gOIhw16i16o 3x32x32x2x2

+--both-dir-fmt=true
+--ifmt=nChw8c --ofmt=nChw16c 2x40x3x3 # blocked with tail
+
 --attr=irmode=down;oscale=common:0.
 --fmt=nchw,nhwc,nChw8c,nChw16c 2x64x3x3
 --fmt=oihw,hwio 2x64x3x3
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_gru b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_gru
index 7496c8b..986193d 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_gru
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_gru
@@ -1,4 +1,4 @@
 l2t2mb128sic512n"exp-gru-0"
-l7t1mb128sic512slc1024dic512dlc512n"exp-gru-1"
+l1t7mb128sic512slc1024dic512dlc512n"exp-gru-1"
 l1t10mb32sic128slc512dic128dlc128n"exp-gru-2"

diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_inference b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_inference
index be35247..3d5ddd7 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_inference
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_inference
@@ -1,6 +1,7 @@
 l1t30mb1sic512n"GNMT_enc-inference"
 l7t30mb1sic1024n"GNMT_enc-inference"
 l8t1mb1sic2048slc1024dic1024dlc1024n"GNMT_dec-inference"
+l1t1mb1sic2048slc1024dic1024dlc1024n"GNMT_dec-inference"
 l1t50mb1sic1760n"deepspeech2-inference"
 l1t100mb1sic760n"deepspeech2-inference"
 l1t200mb1sic1760n"deepspeech2-inference"
@@ -8,3 +9,5 @@ l1t50mb1sic500n"pytorch_testcase-inference"
 l1t629mb1sic128n"paddlepaddle_testcase-inference"
 l1t10mb1sic128slc512dic128dlc128n"exp-0"
 l10t1mb1sic512slc128dic128dlc128n"exp-1"
+
+l1t1mb640sic2048slc1024dic1024dlc1024n"GNMT_dec-inference"
\ No newline at end of file
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_training b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_training
index 5d9a0dd..d60107d 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_training
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_training
@@ -1,6 +1,7 @@
 l1t1mb128sic512n"GNMT_enc-training"
 l2t2mb128sic1024n"GNMT_enc-training"
 l8t1mb128sic2048slc1024dic1024dlc1024n"GNMT_dec-training"
+l1t1mb128sic2048slc1024dic1024dlc1024n"GNMT_dec-training"
 l1t50mb32sic1760n"deepspeech2-training"
 l1t100mb32sic1760n"deepspeech2-training"
 l1t200mb32sic1760n"deepspeech2-training"
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/test_rnn_small b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/test_rnn_small
index 6ca0cb3..9f1ac2e 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/test_rnn_small
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/test_rnn_small
@@ -48,7 +48,40 @@
 --direction=left2right
 --activation=TANH
 --prop=FWD_D --batch=rnn_small
---prop=BWD_DW --batch=rnn_small
+# --prop=BWD_DW --batch=rnn_small
+
+# LSTM int8
+--reset --alg=VANILLA_LSTM
+--direction=left2right
+--activation=TANH
+--cfg=u8u8u8u8
+--allow-unimpl=true
+--attr=irmode=nearest --scaling=common
+--prop=FWD_D --batch=rnn_small
+
+--reset --alg=VANILLA_LSTM
+--direction=left2right
+--activation=TANH
+--allow-unimpl=true
+--cfg=u8u8u8f32
+--attr=irmode=down --scaling=common
+--prop=FWD_D --batch=rnn_small
+
+--reset --alg=VANILLA_LSTM
+--direction=left2right
+--activation=TANH
+--allow-unimpl=true
+--cfg=f32u8f32u8
+--attr=irmode=down --scaling=per_oc
+--prop=FWD_D --batch=rnn_small
+
+--reset --alg=VANILLA_LSTM
+--direction=left2right
+--activation=TANH
+--allow-unimpl=true
+--cfg=f32u8f32f32
+--attr=irmode=nearest --scaling=per_oc
+--prop=FWD_D --batch=rnn_small

 # GRU
 --reset --alg=VANILLA_GRU
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_all b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_all
index 6f725f9..6f705c1 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_all
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_all
@@ -8,7 +8,7 @@
 --dir=BWD_WB --batch=conv_resnet_50

 --mb=2
---merge=RELU # +relu
+--attr=post_ops='relu' # +relu
 --dir=FWD_B --batch=conv_alexnet

 # depthwise
@@ -21,12 +21,12 @@
 --cfg=u8s8u8s32 --batch=conv_all
 --cfg=u8s8s8s32 --batch=conv_resnet_50
 --cfg=u8s8s32s32 --batch=conv_googlenet_v3
---merge=RELU
+--attr=post_ops='relu'
 --cfg=u8s8s32s32 --batch=conv_vgg_19

 --cfg=s8s8u8s32 --batch=conv_all
 --cfg=s8s8s8s32 --batch=conv_resnet_50
 --cfg=s8s8s32s32 --batch=conv_googlenet_v3
---merge=RELU
+--attr=post_ops='relu'
 --cfg=s8s8s32s32 --batch=conv_vgg_19

 # s16 (knm)
@@ -36,7 +36,8 @@
 --cfg=s32s16s16s32 --dir=BWD_D --batch=conv_all
 --cfg=s16s32s16s32 --dir=BWD_WB --batch=conv_all

---merge=RELU # +relu
+--attr=post_ops='relu' # +relu
+--allow-unimpl=true # TODO: remove if ref_convolution accepts post_ops
 --cfg=s16s16s32s32 --dir=FWD_B --batch=conv_googlenet_v1

 # f32 wino
@@ -58,7 +59,7 @@
 --cfg=u8s8u8s32_wino --batch=conv_all
 --cfg=u8s8s8s32_wino --batch=conv_resnet_50
 --cfg=u8s8s32s32_wino --batch=conv_googlenet_v3
---merge=RELU
+--attr=post_ops='relu'
 --cfg=u8s8s32s32_wino --batch=conv_googlenet_v2

 # dilated
@@ -72,3 +73,12 @@

 # 3D conv
 --batch=test_conv_3d
+
+# auto algo
+--reset --cfg=f32 --alg=auto
+--dir=FWD_B --batch=conv_auto
+--dir=BWD_D --batch=conv_auto
+--dir=BWD_WB --batch=conv_auto
+--cfg=u8s8s8s32
+--dir=FWD_B --batch=conv_auto
+
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_attrs b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_attrs
index 00d4cff..a54a038 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_attrs
+++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_attrs
@@ -7,23 +7,38 @@
--attr=irmode=down;oscale=per_oc:2.25;post_ops='sum:1.5;relu' --cfg=u8s8u8s32 --batch=conv_vgg_19 --cfg=u8s8f32s32 --batch=conv_googlenet_v2 +--cfg=u8s8s32s32 --batch=conv_tails --cfg=s8s8u8s32 --batch=conv_vgg_19 --cfg=s8s8f32s32 --batch=conv_googlenet_v2 +--cfg=s8s8s32s32 --batch=conv_tails --dir=FWD_D --attr=irmode=nearest;oscale=common:2.25;post_ops='sum:1.5' --cfg=u8s8s8s32 --batch=conv_googlenet_v3 --cfg=u8s8s32s32 --batch=conv_alexnet +--cfg=u8s8s32s32 --batch=conv_tails --cfg=s8s8s8s32 --batch=conv_googlenet_v3 --cfg=s8s8s32s32 --batch=conv_alexnet +--cfg=s8s8s32s32 --batch=conv_tails # f32 --reset --cfg=f32 --mb=2 --skip-impl="ref:gemm" # ! test jit version only --allow-unimpl=true ---dir=FWD_B --attr=post_ops='sum;relu' --batch=conv_resnet_50 ---dir=FWD_B --attr=post_ops='sum;relu' --batch=conv_3d ---dir=FWD_B --attr=post_ops='sum;relu' --batch=conv_1d + +--dir=FWD_B +--attr=post_ops='sum;relu' --batch=conv_resnet_50 +--attr=post_ops='sum;relu:0.5' --batch=conv_tails +--attr=post_ops='sum;tanh' --batch=conv_tails +--attr=post_ops='sum;elu:0.5' --batch=conv_tails +--attr=post_ops='sum;abs' --batch=conv_tails +--attr=post_ops='sum;sqrt' --batch=conv_tails +--attr=post_ops='sum;linear:0.5:1.5' --batch=conv_tails +--attr=post_ops='sum;brelu:0.5' --batch=conv_tails +--attr=post_ops='sum;logistic' --batch=conv_tails +--cfg=f32_no_limits # square and srelu might overrun int_max_exact +--attr=post_ops='sum;square' --batch=conv_tails +--attr=post_ops='sum;srelu' --batch=conv_tails # f32_wino --reset --alg=wino --cfg=f32_wino @@ -44,3 +59,17 @@ --attr=irmode=nearest;oscale=common:2.25;post_ops='sum:1.5' --cfg=u8s8s8s32_wino --batch=conv_googlenet_v3 --cfg=u8s8s32s32_wino --batch=conv_resnet_50 + +# i8 conv + f32 leaky relu +--reset --dir=FWD_B --mb=2 +--skip-impl="ref:gemm" # ! 
test jit version only +--allow-unimpl=true +--attr=post_ops='relu:0.5' +--cfg=s8s8f32s32 --batch=conv_yolov2 +--cfg=u8s8f32s32 --batch=conv_yolov2 +--attr=post_ops='relu:0.5;sum' +--cfg=s8s8f32s32 --batch=conv_yolov2 +--cfg=u8s8f32s32 --batch=conv_yolov2 +--attr=post_ops='sum;relu:0.5' +--cfg=s8s8f32s32 --batch=conv_yolov2 +--cfg=u8s8f32s32 --batch=conv_yolov2 diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_depthwise b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_depthwise index 0fa5973..577b2f9 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_depthwise +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_depthwise @@ -3,7 +3,7 @@ --mb=2 --dir=FWD_D --batch=conv_mobilenet_dw --dir=BWD_D --batch=conv_mobilenet_dw ---merge=RELU +--attr=post_ops='relu' --dir=FWD_D --batch=conv_mobilenet_dw # +relu #post-ops @@ -30,8 +30,11 @@ --dir=FWD_B --attr=irmode=down;oscale=per_oc:2.25;post_ops='sum:1.5;relu' --cfg=u8s8u8s32 --batch=conv_mobilenet_dw ---cfg=u8s8f32s32 --batch=conv_mobilenet_dw +--cfg=s8s8f32s32 --batch=conv_mobilenet_dw --dir=FWD_D --attr=irmode=nearest;oscale=common:2.25;post_ops='sum:1.5' --cfg=u8s8s8s32 --batch=conv_mobilenet_dw ---cfg=u8s8s32s32 --batch=conv_mobilenet_dw +--cfg=s8s8s32s32 --batch=conv_mobilenet_dw + +--cfg=u8s8s8s32 g8mb1ic8ih112iw112oc8oh112ow112kh3kw3sh1sw1ph1pw1n"depthwise:conv1" +--cfg=s8s8u8s32 g8mb1ic8ih112iw112oc8oh112ow112kh3kw3sh1sw1ph1pw1n"depthwise:conv1" diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_regression b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_regression index 27c2a9e..82c81af 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_regression +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_regression @@ -6,7 +6,10 @@ --dir=FWD_B --batch=conv_regression_padding --dir=BWD_D --batch=conv_regression_padding --dir=BWD_WB --batch=conv_regression_padding ---merge=RELU +--dir=FWD_B --batch=conv_regression_gemm +--dir=BWD_D --batch=conv_regression_gemm +--dir=BWD_WB --batch=conv_regression_gemm +--attr=post_ops='relu' --dir=FWD_B --batch=conv_regression_small_spatial --dir=FWD_B --batch=conv_regression_padding diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_regression_general b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_regression_general index e2cebb9..d50f74c 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_regression_general +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_regression_general @@ -69,3 +69,13 @@ # MKLDNN-1074: FPE for mb1 with ih < sh or iw < sw --reset --dir=FWD_D mb1_g1ic128oc256_ih1oh1kh3sh2dh0ph1_iw1ow1kw3sw2dw0pw1 +#MKLDNN-1184 grouped convolutions with small input-channel and +# non-blocked src format +--reset --dir=FWD_D +#AVX2 +mb1_g2ic4oc16_ih8oh6kh3sh1dh0ph0_iw8ow6kw3sw1dw0pw0 +#AVX512 +mb1_g2ic16oc32_ih8oh8kh3sh1dh0ph1_iw8ow8kw3sw1dw0pw1 +mb1_g2ic8oc32_ih8oh8kh3sh1dh0ph1_iw8ow8kw3sw1dw0pw1 +mb1_g2ic4oc32_ih8oh8kh3sh1dh0ph1_iw8ow8kw3sw1dw0pw1 +mb1_g2ic22oc32_ih8oh6kh3sh1dh0ph0_iw8ow6kw3sw1dw0pw0 diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_deconv_all b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_deconv_all deleted file mode 100644 index 2b71b50..0000000 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_deconv_all +++ /dev/null 
@@ -1,26 +0,0 @@ -# f32 ---reset --cfg=f32 ---mb=2 ---dir=FWD_B --batch=deconv_all ---dir=BWD_D --batch=deconv_all ---dir=BWD_W --batch=deconv_all ---dir=BWD_WB --batch=deconv_all - -#int8 ---skip-impl=ref ---reset --allow-unimpl=true --dir=FWD_B --mb=2 ---attr=irmode=down;oscale=per_oc:2.25; ---cfg=u8s8u8s32 --batch=deconv_2d ---cfg=u8s8s8s32 --batch=deconv_2d ---cfg=u8s8s32s32 --batch=deconv_2d ---cfg=s8s8u8s32 --batch=deconv_2d ---cfg=s8s8s8s32 --batch=deconv_2d ---cfg=s8s8s32s32 --batch=deconv_2d ---attr=irmode=nearest;oscale=common:2.25; ---attr=irmode=down;oscale=per_oc:2.25; ---cfg=u8s8u8s32 --batch=deconv_2d ---cfg=u8s8s8s32 --batch=deconv_2d ---cfg=u8s8s32s32 --batch=deconv_2d ---cfg=s8s8u8s32 --batch=deconv_2d ---cfg=s8s8s8s32 --batch=deconv_2d ---cfg=s8s8s32s32 --batch=deconv_2d diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/ip/cfg.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/ip/cfg.cpp index 2f4d1bc..15f1540 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/ip/cfg.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/ip/cfg.cpp @@ -80,6 +80,38 @@ const _dt_conf_t conf_u8s8u8s32 = { {mkldnn_s32,}, }; +const _dt_conf_t conf_s8s8f32s32 = { + {mkldnn_s8, INT8_MIN, INT8_MAX, -5, 5, 0, 1, .35, 0.}, + {mkldnn_s8, INT8_MIN, INT8_MAX, -5, 5, 0, 1, .35, 0.}, + {mkldnn_f32, -int_max_exact, int_max_exact, -8, 32, 0, 1, .35, 0.}, + {mkldnn_f32, -int_max_exact, int_max_exact, -255, 255, 0, 1, .35, 0.}, + {mkldnn_s32,}, +}; + +const _dt_conf_t conf_s8s8s32s32 = { + {mkldnn_s8, INT8_MIN, INT8_MAX, -5, 5, 0, 1, .35, 0.}, + {mkldnn_s8, INT8_MIN, INT8_MAX, -5, 5, 0, 1, .35, 0.}, + {mkldnn_f32, -int_max_exact, int_max_exact, -8, 32, 0, 1, .35, 0.}, + {mkldnn_s32, INT32_MIN, INT32_MAX, -255, 255, 0, 1, .35, 0.}, + {mkldnn_s32,}, +}; + +const _dt_conf_t conf_s8s8s8s32 = { + {mkldnn_s8, INT8_MIN, INT8_MAX, -5, 5, 0, 1, .35, 0.}, + {mkldnn_s8, INT8_MIN, INT8_MAX, -5, 5, 0, 1, .35, 0.}, + {mkldnn_f32, -int_max_exact, int_max_exact, -8, 32, 0, 1, .35, 0.}, + {mkldnn_s8, INT8_MIN, INT8_MAX, -127, 127, 0, 1, .35, 0.}, + {mkldnn_s32,}, +}; + +const _dt_conf_t conf_s8s8u8s32 = { + {mkldnn_s8, INT8_MIN, INT8_MAX, -5, 5, 0, 1, .35, 0.}, + {mkldnn_s8, INT8_MIN, INT8_MAX, -5, 5, 0, 1, .35, 0.}, + {mkldnn_f32, -int_max_exact, int_max_exact, -8, 32, 0, 1, .35, 0.}, + {mkldnn_u8, 0, UINT8_MAX, 0, 255, 0, 1, .35, 0.}, + {mkldnn_s32,}, +}; + const dt_conf_t *str2cfg(const char *str) { #define CASE(cfg) \ if (!strcasecmp(STRINGIFY(cfg), str)) return CONCAT2(conf_,cfg) @@ -89,6 +121,10 @@ const dt_conf_t *str2cfg(const char *str) { CASE(u8s8s32s32); CASE(u8s8s8s32); CASE(u8s8u8s32); + CASE(s8s8f32s32); + CASE(s8s8s32s32); + CASE(s8s8s8s32); + CASE(s8s8u8s32); #undef CASE []() { SAFE(FAIL, CRIT); return 0; }(); return (const dt_conf_t *)1; @@ -102,6 +138,10 @@ const char *cfg2str(const dt_conf_t *cfg) { CASE(u8s8s32s32); CASE(u8s8s8s32); CASE(u8s8u8s32); + CASE(s8s8f32s32); + CASE(s8s8s32s32); + CASE(s8s8s8s32); + CASE(s8s8u8s32); #undef CASE []() { SAFE(FAIL, CRIT); return 0; }(); return NULL; diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/ip/ip.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/ip/ip.cpp index eba082c..4166161 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/ip/ip.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/ip/ip.cpp @@ -263,9 +263,6 @@ int fill_dst(const prb_t *p, dnn_mem_t &mem_dt, dnn_mem_t &mem_fp, res_t *r) { ((float *)mem_00)[dst_off_f(p, mb, oc)] = value; }); - mem_dt.reorder(mem_00); - 
mem_fp.reorder(mem_dt); - SAFE(mem_dt.reorder(mem_00), WARN); SAFE(mem_fp.reorder(mem_dt), WARN); @@ -317,7 +314,6 @@ int doit(const prb_t *p, res_t *r) { if (bench_mode & CORR) { compute_ref_fwd(p, src_fp, wei_fp, bia_fp, dst_fp); dnn_mem_t dst(dst_dt, fp, mkldnn_nc); - SAFE(dst.reorder(dst_dt), WARN); SAFE(compare_dat(p, DST, dst, dst_fp, r), WARN); } } else if (p->dir == BWD_D) { @@ -328,7 +324,6 @@ int doit(const prb_t *p, res_t *r) { if (bench_mode & CORR) { compute_ref_bwd_d(p, src_fp, wei_fp, dst_fp); dnn_mem_t src(src_dt, fp, src_format); - SAFE(src.reorder(src_dt), WARN); SAFE(compare_dat(p, SRC, src, src_fp, r), WARN); } } else if (p->dir & FLAG_BWD && p->dir & FLAG_WEI) { @@ -341,11 +336,9 @@ int doit(const prb_t *p, res_t *r) { if (bench_mode & CORR) { compute_ref_bwd_w(p, src_fp, wei_fp, bia_fp, dst_fp); dnn_mem_t wei(wei_dt, fp, wei_format); - SAFE(wei.reorder(wei_dt), WARN); if (compare_dat(p, WEI, wei, wei_fp, r) != OK) return FAIL; if (p->dir & FLAG_BIA) { dnn_mem_t bia(bia_dt, fp, mkldnn_x); - SAFE(bia.reorder(bia_dt), WARN); SAFE(compare_dat(p, BIA, bia, bia_fp, r), WARN); } } diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/mkldnn_debug.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/mkldnn_debug.cpp index decf41b..12a1ffa 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/mkldnn_debug.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/mkldnn_debug.cpp @@ -78,6 +78,7 @@ mkldnn_memory_format_t str2fmt(const char *str) { CASE(nc); CASE(ncw); CASE(nwc); + CASE(nCw8c); CASE(nCw16c); CASE(nchw); CASE(nhwc); @@ -96,6 +97,7 @@ mkldnn_memory_format_t str2fmt(const char *str) { CASE(oihw); CASE(ihwo); CASE(hwio); + CASE(iohw); CASE(hwio_s8s8); CASE(dhwio); CASE(OIhw8i8o); @@ -114,6 +116,7 @@ mkldnn_memory_format_t str2fmt(const char *str) { CASE(goiw); CASE(goihw); CASE(hwigo); + CASE(giohw); CASE(hwigo_s8s8); CASE(goiw); CASE(gOIw16i16o); @@ -136,6 +139,7 @@ mkldnn_memory_format_t str2fmt(const char *str) { CASE(gOhwi16o); CASE(Goihw8g); CASE(Goihw16g); + CASE(Goihw16g_s8s8); CASE(oIhw8i); CASE(oIhw16i); CASE(ncdhw); diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/mkldnn_memory.hpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/mkldnn_memory.hpp index 8c6a4c1..6a1441c 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/mkldnn_memory.hpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/mkldnn_memory.hpp @@ -77,10 +77,13 @@ struct dnn_mem_t { size_t size() const { return mkldnn_memory_primitive_desc_get_size(mpd_); } - size_t nelems() const { + size_t nelems(bool with_padding_dims = false) const { + auto dims = with_padding_dims + ? 
md_.layout_desc.blocking.padding_dims + : md_.dims; size_t n = 1; for (int i = 0; i < md_.ndims; ++i) - n *= md_.dims[i]; + n *= dims[i]; return n; } diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/reorder/reorder.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/reorder/reorder.cpp index a19917b..235e1af 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/reorder/reorder.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/reorder/reorder.cpp @@ -258,7 +258,7 @@ int check_reorder(const prb_t *p, res_t *res) { const reorder_conf_t &r = p->reorder; const int ndims = (int)r.dims.size(); - const int *dims = &r.dims[0]; + const ptrdiff_t *dims = &r.dims[0]; mkldnn_memory_format_t fmt_ref; const bool is_data = fmt2data_kind(r.fmt_in) == DATA; @@ -313,18 +313,21 @@ int check_reorder(const prb_t *p, res_t *res) { SAFE(init_status, WARN); SAFE(mem_dt_out_fmt_out.reorder(mem_dt_in_fmt_in, mkldnn_attr), WARN); - SAFE(mem_dt_out_fmt_ref.reorder(mem_dt_out_fmt_out), WARN); - /* Step 5: execute benchdnn reorder */ - SAFE(reorder(p, mem_test_dt_out_fmt_ref, mem_dt_in_fmt_ref, scales), WARN); - - /* Step 6: compare results */ + /* Step 5: check correctness */ if (bench_mode & CORR) { + /* Step 5a: reorder output from mkldnn to ref format using mkldnn */ + SAFE(mem_dt_out_fmt_ref.reorder(mem_dt_out_fmt_out), WARN); + + /* Step 5b: execute benchdnn reorder */ + SAFE(reorder(p, mem_test_dt_out_fmt_ref, mem_dt_in_fmt_ref, scales), WARN); + + /* Step 5c: compare benchdnn and mkldnn output */ SAFE(compare(p, mem_test_dt_out_fmt_ref, mem_dt_out_fmt_ref, scales, count, res), WARN); } - /* Step 7: performance measurement */ + /* Step 6: performance measurement */ if (bench_mode & PERF) { mkldnn_primitive_desc_t perf_r_pd; mkldnn_primitive_t perf_r; @@ -353,7 +356,7 @@ int check_reorder(const prb_t *p, res_t *res) { DNN_SAFE_V(mkldnn_primitive_destroy(perf_r)); } - /* Step 8: clean up */ + /* Step 7: clean up */ cleanup: mkldnn_primitive_attr_destroy(mkldnn_attr); zfree(scales); diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/reorder/reorder.hpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/reorder/reorder.hpp index 3564205..d509f4b 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/reorder/reorder.hpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/reorder/reorder.hpp @@ -28,7 +28,7 @@ namespace reorder { -using dims_t = std::vector<int>; +using dims_t = std::vector<ptrdiff_t>; struct dt_conf_s { mkldnn_data_type_t dt; diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/reorder/reorder_aux.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/reorder/reorder_aux.cpp index a4137a8..51df6eb 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/reorder/reorder_aux.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/reorder/reorder_aux.cpp @@ -40,8 +40,8 @@ dims_t str2dims(const char *str) { void dims2str(const dims_t &dims, char *buffer) { int rem_len = max_dims_len; for (size_t d = 0; d < dims.size() - 1; ++d) - DPRINT("%dx", dims[d]); - DPRINT("%d", dims[dims.size() - 1]); + DPRINT("%tdx", dims[d]); + DPRINT("%td", dims[dims.size() - 1]); } void prb2str(const prb_t *p, const res_t *res, char *buffer) { diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/bench_rnn.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/bench_rnn.cpp index 3d43c77..875db4c 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/bench_rnn.cpp +++
b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/bench_rnn.cpp @@ -35,12 +35,23 @@ mkldnn_prop_kind_t prop = mkldnn_forward; alg_t alg = VANILLA_RNN; mkldnn_rnn_direction_t direction = mkldnn_unidirectional_left2right; activation_t activation = RELU; +const char *perf_template = "perf,%n,%d,,,%-t,,%0t,"; +const dt_conf_t *cfg = conf_f32; +policy_t scale_policy = NONE; +attr_t attr; +bool allow_unimpl = false; +int mb = 0; void reset_parameters() { + cfg = conf_f32; + attr = attr_t(); prop = mkldnn_forward; alg = VANILLA_RNN; direction = mkldnn_unidirectional_left2right; activation = RELU; + scale_policy = NONE; + allow_unimpl = false; + mb = 0; } int bench(int argc, char **argv, bool main_bench) { @@ -57,12 +68,28 @@ int bench(int argc, char **argv, bool main_bench) { assert("unknown dir"); } else if (!strncmp("--alg=", argv[arg], 6)) alg = str2alg(argv[arg] + 6); + else if (!strncmp("--cfg=", argv[arg], 6)) + cfg = str2cfg(argv[arg] + 6); + else if (!strncmp("--attr=", argv[arg], 7)) + SAFE(str2attr(&attr, argv[arg] + 7), CRIT); else if (!strncmp("--direction=", argv[arg], 12)) direction = str2direction(argv[arg] + 12); else if (!strncmp("--activation=", argv[arg], 13)) activation = str2activation(argv[arg] + 13); + else if (!strncmp("--allow-unimpl=", argv[arg], 15)) + allow_unimpl = str2bool(argv[arg] + 15); + else if (!strncmp("--scaling=", argv[arg], 10)) + scale_policy = str2policy(argv[arg] + 10); else if (!strncmp("--reset", argv[arg], 7)) reset_parameters(); + else if (!strncmp("--perf-template=", argv[arg], 16)) + perf_template = argv[arg] + 16; + else if (!strncmp("--mb=", argv[arg], 5)) + mb = atoi(argv[arg] + 5); + else if (!strncmp("-v", argv[arg], 2)) + verbose = atoi(argv[arg] + 2); + else if (!strncmp("--verbose=", argv[arg], 10)) + verbose = atoi(argv[arg] + 10); else { rnn_desc_t d; if (str2desc(&d, argv[arg]) == FAIL) { @@ -70,6 +97,20 @@ int bench(int argc, char **argv, bool main_bench) { argv[arg]); exit(2); } + if (cfg != conf_f32 && alg != VANILLA_LSTM) { + fprintf(stderr, + "driver: configuration `%s` is supported for LSTM " + "cell only, exiting...\n", + cfg2str(cfg)); + exit(2); + } + if (cfg != conf_f32 && scale_policy == NONE) { + fprintf(stderr, + "driver: configuration `%s` requires scale policy to " + "be COMMON or PER_OC, exiting...\n", + cfg2str(cfg)); + exit(2); + } check(&d); } } @@ -77,17 +118,17 @@ int bench(int argc, char **argv, bool main_bench) { } void check(rnn_desc_t *d) { - const rnn_prb_t p(*d, conf_f32, prop, alg, direction, activation); + const rnn_prb_t p(*d, cfg, prop, alg, direction, activation, attr, + scale_policy, mb); res_t res{}; char pstr[max_prb_len]; - prb2str(&p, &res, pstr); int status = rnn::doit(&p, &res); prb2str(&p, &res, pstr); bool want_perf_report = false; - parse_result(res, want_perf_report, false, status, pstr); + parse_result(res, want_perf_report, allow_unimpl, status, pstr); if (bench_mode & PERF) perf_report(&p, &res, pstr); diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/cfg.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/cfg.cpp index 4680572..7d93f06 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/cfg.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/cfg.cpp @@ -26,66 +26,70 @@ states, weights_input, weights_states, bias, -dst_last_layer, dst_last_iteration, +dst_last_layer, dst_diff_input, dst_diff_states, dst_diff_weights_input, dst_diff_weights_states, dst_diff_bias, -diff_last_layer, diff_last_iteration, -params: {data_type, min,
max, f_min,* f_max, f_base, f_step, f_sparsity, eps} +diff_last_layer, +params: {data_type, min, max, f_min, f_max, f_mean, f_var, eps} */ const int int_max_exact = 1 << 24; const _dt_conf_t conf_f32 = { -#if 0 - { mkldnn_f32, -int_max_exact, int_max_exact, 1, 1, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, 1, 1, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, 1, 1, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, 1, 1, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, 1, 1, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, 777, 777, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, 777, 777, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, 777, 777, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, 777, 777, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, 777, 777, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, 777, 777, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, 777, 777, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, 2, 2, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, 2, 2, 0, 1, .25, 1e-5 }, -#elif 0 - { mkldnn_f32, -int_max_exact, int_max_exact, -4, 4, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -4, 4, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -4, 4, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -4, 4, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -4, 4, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -4, 4, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -4, 4, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -4, 4, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -4, 4, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -4, 4, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -4, 4, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -4, 4, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -4, 4, 0, 1, .25, 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -4, 4, 0, 1, .25, 1e-5 }, -#else - { mkldnn_f32, -int_max_exact, int_max_exact, -64, 64, 0, 1, 1., 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -64, 64, 0, 1, 1., 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -64, 64, 0, 1, 1., 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -64, 64, 0, 1, 1., 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -64, 64, 0, 1, 1., 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -64, 64, 0, 1, 1., 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -64, 64, 0, 1, 1., 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -64, 64, 0, 1, 1., 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -64, 64, 0, 1, 1., 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -64, 64, 0, 1, 1., 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -64, 64, 0, 1, 1., 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -64, 64, 0, 1, 1., 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -64, 64, 0, 1, 1., 1e-5 }, - { mkldnn_f32, -int_max_exact, int_max_exact, -64, 64, 0, 1, 1., 1e-5 }, -#endif + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.001f, 1e-5 }, //input + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.001f, 1e-5 }, //states + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 
0.001f, 1e-5 }, //weights_input + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.001f, 1e-5 }, //weights_states + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.001f, 1e-5 }, //bias + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.001f, 1e-5 }, //dst_last_iteration + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.001f, 1e-5 }, //dst_last_layer + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.001f, 1e-5 }, //dst_diff_input + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.001f, 1e-5 }, //dst_diff_states + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.001f, 1e-5 }, //dst_diff_weights_input + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.001f, 1e-5 }, //dst_diff_weights_states + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.001f, 1e-5 }, //dst_diff_bias + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.001f, 1e-5 }, //diff_last_iteration + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.001f, 1e-5 }, //diff_last_layer +}; +const _dt_conf_t conf_u8u8u8u8 = { + { mkldnn_u8, 0, UINT8_MAX, 0, 127, 64.f, 5.f, 0. }, //input + { mkldnn_u8, 0, UINT8_MAX, 0, 127, 64.f, 5.f, 0. }, //states + { mkldnn_s8, INT8_MIN, INT8_MAX, -63, 63, 0.f, 10.f, 0. }, //weights_input + { mkldnn_s8, INT8_MIN, INT8_MAX, -63, 63, 0.f, 10.f, 0. }, //weights_states + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.01f, 0. }, //bias + { mkldnn_u8, 0, UINT8_MAX, 0, 127, 64.f, 10.f, 0. }, //dst_iter + { mkldnn_u8, 0, UINT8_MAX, 0, 127, 64.f, 10.f, 0. }, //dst_layer +}; +const _dt_conf_t conf_u8u8u8f32 = { + { mkldnn_u8, 0, UINT8_MAX, 0, 127, 64.f, 5.f, 0. }, //input + { mkldnn_u8, 0, UINT8_MAX, 0, 127, 64.f, 5.f, 0. }, //states + { mkldnn_s8, INT8_MIN, INT8_MAX, -63, 63, 0.f, 10.f, 0. }, //weights_input + { mkldnn_s8, INT8_MIN, INT8_MAX, -63, 63, 0.f, 10.f, 0. }, //weights_states + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.01f, 0. }, //bias + { mkldnn_u8, 0, UINT8_MAX, 0, 127, 64.f, 10.f, 0. }, //dst_iter + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.001f, 1e-5 }, //dst_last_layer +}; +const _dt_conf_t conf_f32u8f32u8 = { + { mkldnn_u8, 0, UINT8_MAX, 0, 127, 64.f, 5.f, 0. }, //input + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.05f, 1e-5 }, //states + { mkldnn_s8, INT8_MIN, INT8_MAX, -63, 63, 0.f, 10.f, 0. }, //weights_input + { mkldnn_s8, INT8_MIN, INT8_MAX, -63, 63, 0.f, 10.f, 0. }, //weights_states + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.01f, 0. }, //bias + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.01f, 1e-5 }, //dst_iter + { mkldnn_u8, 0, UINT8_MAX, 0, 127, 64.f, 10.f, 0. }, //dst_layer +}; +const _dt_conf_t conf_f32u8f32f32 = { + { mkldnn_u8, 0, UINT8_MAX, 0, 127, 64.f, 5.f, 0. }, //input + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.05f, 1e-5 }, //states + { mkldnn_s8, INT8_MIN, INT8_MAX, -63, 63, 0.f, 10.f, 0. }, //weights_input + { mkldnn_s8, INT8_MIN, INT8_MAX, -63, 63, 0.f, 10.f, 0. }, //weights_states + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.01f, 0. 
}, //bias + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.01f, 1e-5 }, //dst_iter + { mkldnn_f32, -int_max_exact, int_max_exact, -1, 1, 0.f, 0.01f, 1e-5 }, //dst_last_layer }; const dt_conf_t *str2cfg(const char *str) { @@ -93,6 +97,10 @@ const dt_conf_t *str2cfg(const char *str) { if (!strcasecmp(STRINGIFY(cfg), str)) \ return CONCAT2(conf_, cfg) CASE(f32); + CASE(u8u8u8u8); + CASE(u8u8u8f32); + CASE(f32u8f32u8); + CASE(f32u8f32f32); #undef CASE []() { SAFE(FAIL, CRIT); @@ -106,6 +114,10 @@ const char *cfg2str(const dt_conf_t *cfg) { if (cfg == CONCAT2(conf_, _cfg)) \ return STRINGIFY(_cfg) CASE(f32); + CASE(u8u8u8u8); + CASE(u8u8u8f32); + CASE(f32u8f32u8); + CASE(f32u8f32f32); #undef CASE []() { SAFE(FAIL, CRIT); diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/perf_report.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/perf_report.cpp index ddecb23..334568e 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/perf_report.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/perf_report.cpp @@ -30,20 +30,70 @@ namespace rnn { void perf_report(const rnn_prb_t *p, const res_t *r, const char *pstr) { const auto &t = r->timer; const int max_len = 400; - char buffer[max_len], *buf = buffer; int rem_len = max_len - 1; + char buffer[max_len], *buf = buffer; - # define DPRINT(...) do { \ +# define DPRINT(...) do { \ int l = snprintf(buf, rem_len, __VA_ARGS__); \ buf += l; rem_len -= l; \ } while(0) - DPRINT("perf,"); - DPRINT("%s,", pstr); - DPRINT("time(ms):"); - DPRINT("min=%g,", t.ms(benchdnn_timer_t::min)); - DPRINT("max=%g,", t.ms(benchdnn_timer_t::max)); - DPRINT("avg=%g", t.ms(benchdnn_timer_t::avg)); + auto modifier2mode = [](char c) { + if (c == '-') return benchdnn_timer_t::min; + if (c == '0') return benchdnn_timer_t::avg; + if (c == '+') return benchdnn_timer_t::max; + return benchdnn_timer_t::min; + }; + + auto modifier2unit = [](char c) { + if (c == 'K') return 1e3; + if (c == 'M') return 1e6; + if (c == 'G') return 1e9; + return 1e0; + }; + + const char *pt = perf_template; + char c; + + while ((c = *pt++) != '\0') { + if (c != '%') { *buf++ = c; rem_len--; continue; } + + c = *pt++; + + benchdnn_timer_t::mode_t mode = benchdnn_timer_t::min; + double unit = 1e0; + + if (c == '-' || c == '0' || c == '+') { + mode = modifier2mode(c); + c = *pt++; + } + + if (c == 'K' || c == 'M' || c == 'G') { + unit = modifier2unit(c); + c = *pt++; + } + // cellkind:activation:direction:l d mb + if (c == 'd') DPRINT("%s_%s_%s_l%dd%dt%dmb%d_slc%dsic%ddic%d", + alg2str(p->alg), activation2str(p->activation), direction2str(p->direction), + p->n_layer, p->n_directions(), p->n_iter, p->mb, p->slc, p->sic, p->dic); + else if (c == 'D') + DPRINT("%s", pstr); + else if (c == 'n') + DPRINT("%s", p->name); + else if (c == 'z') + DPRINT("%s", prop2str(p->prop)); + else if (c == 'F') + DPRINT("%g", t.ticks(mode) / t.ms(mode) / unit * 1e3); + else if (c == 't') + DPRINT("%g", t.ms(mode) / unit); + else if (c == 'c') + DPRINT("%g", t.ticks(mode) / unit); + else + []() { SAFE(FAIL, CRIT); return 0; }(); + } + + *buf = '\0'; + assert(rem_len >= 0); # undef DPRINT print(0, "%s\n", buffer); diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/ref_rnn.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/ref_rnn.cpp index ed668c1..9bb9a1f 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/ref_rnn.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/ref_rnn.cpp @@ -52,8 +52,8 @@ float 
activation(activation_t f, float x, bool is_fwd = true) { float result = 0; switch (f) { case RELU: result = is_fwd ? relu(x) : drelu(x); break; - case LOGISTIC: result = is_fwd ? logistic(x) : dlogistic(x); break; - case TANH: result = is_fwd ? tanhf(x) : dtanhf(x); break; + case LOGISTIC: result = is_fwd ? logistic(x) : x_m_square(x); break; + case TANH: result = is_fwd ? tanhf(x) : one_m_square(x); break; default: assert(!"unknown activation"); } return result; @@ -164,8 +164,8 @@ void gru_lbr_fwd(int sic, int slc, int dic, int wc, int batch, int n_gates, } // w = [weights_layer | weights_iter] : with order f, i, o, \bar(c) -void lstm_fwd(int sic, int slc, int dic, int wc, int batch, int n_gates, - float *dst_iter_h_, float *c_dst_, float *gates_, +void lstm_fwd(const rnn_prb_t *p, int sic, int slc, int dic, int wc, int batch, + int n_gates, float *dst_iter_h_, float *c_dst_, float *gates_, const float *weights_layer_, const float *weights_iter_h_, const float *bias_, const float *src_layer_, const float *src_iter_h_, const float *src_iter_c_) { @@ -182,34 +182,64 @@ void lstm_fwd(int sic, int slc, int dic, int wc, int batch, int n_gates, gemm("C", "N", "N", batch, n_gates * dic, slc, 1.0, src_layer_, wc, weights_layer_, n_gates * dic, 0.0, gates_, n_gates * dic); - gemm("C", "N", "N", batch, n_gates * dic, sic,1.0, src_iter_h_, wc, + gemm("C", "N", "N", batch, n_gates * dic, sic, 1.0, src_iter_h_, wc, weights_iter_h_, n_gates * dic, 1.0, gates_, n_gates * dic); + auto maybe_deq_w = [&](float g, int oc) { + if (p->cfg == conf_f32) + return g; + float scale = 1.; + if (p->scale_policy == PER_OC) + scale = p->wei_oc_scales[oc]; + else if (p->scale_policy == COMMON) + scale = p->wei_scale; + scale *= p->data_scale; + return g / scale; + }; + // add bias for (int i = 0; i < batch; i++) for (int j = 0; j < n_gates; j++) for (int k = 0; k < dic; k++) { - gates(i, j, k) += bias(j, k); + gates(i, j, k) + = maybe_deq_w(gates(i, j, k), j * dic + k) + bias(j, k); } // run the eltwise lstm_activation(dic, n_gates, batch, gates_); + auto maybe_q_d = [&](float h) { + if (p->cfg == conf_f32) + return h; + float fp = p->data_scale * h; + using R = attr_t::round_mode_t; + switch (p->attr.irmode) { + case R::DOWN: fp = floorf(fp); break; + case R::NEAREST: fp = nearbyintf(fp); break; + default: assert(!"unknown round mode"); + } + if (fp + p->data_shift > p->cfg[input].max) + fp = p->cfg[input].max - p->data_shift; + if (fp + p->data_shift < p->cfg[input].min) + fp = p->cfg[input].min - p->data_shift; + return fp; + }; + // compute C_t_l and H_t_l for (int i = 0; i < batch; i++) for (int j = 0; j < dic; j++) { float tmp = gates(i, ohf, j) * src_iter_c(i, j) + gates(i, ohi, j) * gates(i, ohc, j); c_dst(i, j) = tmp; - h_dst(i, j) = gates(i, oho, j) * tanhf(tmp); + h_dst(i, j) = maybe_q_d(gates(i, oho, j) * tanhf(tmp)); } } -void rnn_cell_fwd(alg_t alg, activation_t f, int sic, int slc, int dic, int wc, - int batch, int n_gates, float *dst_iter_h, float *dst_iter_c, - float *gates, const float *weights_layer, const float *weights_iter, - const float *bias, const float *src_layer, const float *src_iter_h, - const float *src_iter_c, float *ws_local_) { +void rnn_cell_fwd(const rnn_prb_t *p, alg_t alg, activation_t f, int sic, + int slc, int dic, int wc, int batch, int n_gates, float *dst_iter_h, + float *dst_iter_c, float *gates, const float *weights_layer, + const float *weights_iter, const float *bias, const float *src_layer, + const float *src_iter_h, const float *src_iter_c, float *ws_local_) { switch
(alg) { case VANILLA_GRU: gru_fwd(sic, slc, dic, wc, batch, n_gates, dst_iter_h, gates, @@ -221,7 +251,7 @@ void rnn_cell_fwd(alg_t alg, activation_t f, int sic, int slc, int dic, int wc, ws_local_); break; case VANILLA_LSTM: - lstm_fwd(sic, slc, dic, wc, batch, n_gates, dst_iter_h, dst_iter_c, + lstm_fwd(p, sic, slc, dic, wc, batch, n_gates, dst_iter_h, dst_iter_c, gates, weights_layer, weights_iter, bias, src_layer, src_iter_h, src_iter_c); break; @@ -232,6 +262,7 @@ void rnn_cell_fwd(alg_t alg, activation_t f, int sic, int slc, int dic, int wc, default: break; } } + void copy(int dimc, int dimr, int ld_src, int ld_dst, const float *src_, float *dst_, rnn_action_t action = action_copy) { AOC src(src_, dimc, ld_src); @@ -245,86 +276,212 @@ void copy(int dimc, int dimr, int ld_src, int ld_dst, const float *src_, }); } -/* FIXME: separate copy_init ??? - * fwd: ws_states = n_states - * bwd: ws_states = n_states + 1 - * - * lstm example: +void shift(int dimc, int dimr, int ld_src, float *src_, float shift, + bool round = false, const rnn_prb_t *p = nullptr) { + AOC src(src_, dimc, ld_src); + mkldnn::impl::parallel_nd(dimc, [&](int i) { + for (int j = 0; j < dimr; j++) { + float fp = src(i, j) + shift; + if (round) { + using R = attr_t::round_mode_t; + switch (p->attr.irmode) { + case R::DOWN: fp = floorf(fp); break; + case R::NEAREST: fp = nearbyintf(fp); break; + default: assert(!"unknown round mode"); + } + if (fp > UINT8_MAX) + fp = UINT8_MAX; + if (fp < 0) + fp = 0; + } + src(i, j) = fp; + } + }); } + +void scale(int dimc, int dimr, int ld_src, float *src_, float scale, + bool round = false, const rnn_prb_t *p = nullptr) { + AOC src(src_, dimc, ld_src); + mkldnn::impl::parallel_nd(dimc, [&](int i) { + for (int j = 0; j < dimr; j++) { + float fp = src(i, j) * scale; + if (round) { + using R = attr_t::round_mode_t; + switch (p->attr.irmode) { + case R::DOWN: fp = floorf(fp); break; + case R::NEAREST: fp = nearbyintf(fp); break; + default: assert(!"unknown round mode"); + } + } + src(i, j) = fp; + } + }); +} + +/* lstm example: * fwd: ws keeps {h, c} for every cell - * bwd: wsb keeps {dh, dc, dx} for every cell */ -void copy_init(alg_t alg, int sic, int slc, int dic, int dlc, int wc, int batch, - int n_layer, int n_iter, int n_states, float *ws_, +void copy_init_fwd(const rnn_prb_t *p, alg_t alg, int sic, int slc, int dic, + int dlc, int wc, int batch, int n_layer, int n_iter, int n_dir, + int n_states, float *ws_, const float *src_layer_, + const float *firstit_states_, rnn_iter_direction_t iter_dir, + rnn_layer_direction_t lay_dir, int dir_val) { + AOC ws(ws_, n_layer + 2, n_dir, n_iter + 2, n_states, batch * wc); + AOC src_layer(src_layer_, n_iter, batch * slc); + AOC firstit_states( + firstit_states_, n_layer, n_dir, n_states, batch * sic); + + int lay_dest = (lay_dir == bottom2top) ? 0 : n_layer + 1; + int it_dest = (iter_dir == left2right) ? 0 : n_iter + 1; + bool is_int8 = p->cfg[input].dt == mkldnn_u8; + + // Copy input + for (int it = 0; it < n_iter; it++) { + copy(batch, slc, slc, wc, &src_layer(it, 0), + &ws(lay_dest, dir_val, it + 1, H, 0)); + if (p->cfg[input].dt == mkldnn_u8) + // shift u8 input to s8 to avoid compensation in gemm + shift(batch, slc, wc, &ws(lay_dest, dir_val, it + 1, H, 0), + -1.
* p->data_shift); + } + + // Copy states + for (int lay = 0; lay < n_layer; lay++) { + copy(batch, sic, sic, wc, &firstit_states(lay, dir_val, H, 0), + &ws(lay + 1, dir_val, it_dest, H, 0)); + if (p->cfg[states].dt == mkldnn_u8) + shift(batch, sic, wc, &ws(lay + 1, dir_val, it_dest, H, 0), + -1. * p->data_shift); + else if (p->cfg[states].dt == mkldnn_f32 && is_int8) { + // quantize to s8 + scale(batch, sic, wc, &ws(lay + 1, dir_val, it_dest, H, 0), + p->data_scale, true, p); + } + + if (alg == VANILLA_LSTM) { + copy(batch, sic, sic, wc, &firstit_states(lay, dir_val, C, 0), + &ws(lay + 1, dir_val, it_dest, C, 0)); + if (p->cfg[states].dt == mkldnn_u8) { + // dequantize to f32 + shift(batch, sic, wc, &ws(lay + 1, dir_val, it_dest, C, 0), + -1. * p->data_shift); + scale(batch, sic, wc, &ws(lay + 1, dir_val, it_dest, C, 0), + 1. / p->data_scale); + } + } + } +} + +/* lstm example: + * bwd: wsb keeps {dh, dc, dx} for every cell +*/ +void copy_init_bwd(alg_t alg, int sic, int slc, int dic, int dlc, int wc, + int batch, int n_layer, int n_iter, int n_dir, int n_states, float *ws_, const float *src_layer_, const float *firstit_states_, rnn_iter_direction_t iter_dir, rnn_layer_direction_t lay_dir, - int dir_val, int n_dir, bool is_bwd = false, bool is_concat = false) { + int dir_val, bool is_concat = false) { AOC ws( - ws_, n_layer + 2, n_dir, n_iter + 2, n_states + is_bwd, batch, wc); - auto c_stride = is_bwd ? (is_concat ? 2 * dlc : dlc) : slc; + ws_, n_layer + 2, n_dir, n_iter + 2, n_states + 1, batch * wc); + auto c_stride = is_concat ? 2 * dlc : dlc; AOC src_layer(src_layer_, n_iter, batch * c_stride); - AOC firstit_states(firstit_states_, n_layer, n_dir, n_states, - batch, is_bwd ? dic : sic); + AOC firstit_states( + firstit_states_, n_layer, n_dir, n_states, batch * dic); int lay_dest = (lay_dir == bottom2top) ? 0 : n_layer + 1; int it_dest = (iter_dir == left2right) ? 0 : n_iter + 1; - if (!is_bwd) { - for (int it = 0; it < n_iter; it++) - copy(batch, slc, slc, wc, &src_layer(it, 0), - &ws(lay_dest, dir_val, it + 1, H, 0, 0)); - - for (int lay = 0; lay < n_layer; lay++) { - copy(batch, sic, sic, wc, &firstit_states(lay, dir_val, H, 0, 0), - &ws(lay + 1, dir_val, it_dest, H, 0, 0)); - if (alg == VANILLA_LSTM) { - copy(batch, sic, sic, wc, - &firstit_states(lay, dir_val, C, 0, 0), - &ws(lay + 1, dir_val, it_dest, C, 0, 0)); + for (int it = 0; it < n_iter; it++) + copy(batch, dic, c_stride, wc, + &src_layer(it, dir_val * is_concat * dlc), + &ws(lay_dest, dir_val, it + 1, n_states, 0)); + + for (int lay = 0; lay < n_layer; lay++) { + copy(batch, dic, dic, wc, &firstit_states(lay, dir_val, H, 0), + &ws(lay + 1, dir_val, it_dest, H, 0)); + if (alg == VANILLA_LSTM) { + copy(batch, dic, dic, wc, &firstit_states(lay, dir_val, C, 0), + &ws(lay + 1, dir_val, it_dest, C, 0)); + } + } +} + +void copy_res_fwd(const rnn_prb_t *p, alg_t alg, int sic, int slc, int dic, + int dlc, int wc, int batch, int n_layer, int n_iter, int n_dir, + int n_states, float *lastit_states_, float *lastlay_states_, + const float *ws_, rnn_iter_direction_t iter_dir, + rnn_layer_direction_t lay_dir, int dir_val, rnn_action_t action, + bool is_concat = false) { + int lastlay_c = is_concat ? 
2 * dlc : dlc; + AOC lastit_states( + lastit_states_, n_layer, n_dir, n_states, batch, dic); + AOC lastlay_states(lastlay_states_, n_iter, batch, lastlay_c); + AOC ws( + ws_, n_layer + 2, n_dir, n_iter + 2, n_states, batch, wc); + + // Copy states layer + for (int it = 0; it < n_iter; it++) { + for (int nb = 0; nb < batch; nb++) { + auto from = &ws(n_layer, dir_val, it + 1, H, nb, 0); + auto to = &lastlay_states( + it, nb, action == action_concat ? dlc : 0); + copy(1, dlc, wc, lastlay_c, from, to, action); + + if (p->cfg[dst_last_layer].dt == mkldnn_u8) { + // shift s8 internal ws to u8 + shift(1, dlc, lastlay_c, to, p->data_shift); + } else { + // dequantize to f32 + scale(1, dlc, lastlay_c, to, 1. / p->data_scale); } } - } else { - for (int it = 0; it < n_iter; it++) - copy(batch, dic, c_stride, wc, - &src_layer(it, dir_val * is_concat * dlc), - &ws(lay_dest, dir_val, it + 1, n_states, 0, 0)); - - for (int lay = 0; lay < n_layer; lay++) { - copy(batch, dic, dic, wc, &firstit_states(lay, dir_val, H, 0, 0), - &ws(lay + 1, dir_val, it_dest, H, 0, 0)); - if (alg == VANILLA_LSTM) { - copy(batch, dic, dic, wc, - &firstit_states(lay, dir_val, C, 0, 0), - &ws(lay + 1, dir_val, it_dest, C, 0, 0)); + } + + int it_source = (iter_dir == left2right) ? n_iter : 1; + + // Copy states iteration + for (int lay = 0; lay < n_layer; lay++) { + if (alg == VANILLA_LSTM) { + copy(batch, dic, wc, dic, &ws(lay + 1, dir_val, it_source, C, 0, 0), + &lastit_states(lay, dir_val, C, 0, 0)); + if (p->cfg[dst_last_iteration].dt == mkldnn_u8) { + // quantize internal f32 ws to u8 + scale(batch, dic, dic, &lastit_states(lay, dir_val, C, 0, 0), + p->data_scale); + shift(batch, dic, dic, &lastit_states(lay, dir_val, C, 0, 0), + p->data_shift, true, p); } } + copy(batch, dic, wc, dic, &ws(lay + 1, dir_val, it_source, H, 0, 0), + &lastit_states(lay, dir_val, H, 0, 0)); + if (p->cfg[dst_last_iteration].dt == mkldnn_u8) { + // shift s8 internal ws to u8 + shift(batch, dic, dic, &lastit_states(lay, dir_val, H, 0, 0), + p->data_shift); + } else { + // dequantize to f32 + scale(batch, dic, dic, &lastit_states(lay, dir_val, H, 0, 0), + 1. / p->data_scale); + } } } -void copy_res(alg_t alg, int sic, int slc, int dic, int dlc, int wc, int batch, - int n_layer, int n_iter, int n_states, float *lastit_states_, - float *lastlay_states_, const float *ws_, - mkldnn_rnn_direction_t direction, rnn_iter_direction_t iter_dir, - rnn_layer_direction_t lay_dir, int dir_val, int n_dir, - rnn_action_t action, bool is_bwd = false) { - int lastlay_c = is_bwd ? - slc : - (direction == mkldnn_bidirectional_concat) * dlc + dlc; - int lastiter_c = is_bwd ? sic : dic; +void copy_res_bwd(alg_t alg, int sic, int slc, int dic, int dlc, int wc, + int batch, int n_layer, int n_iter, int n_dir, int n_states, + float *lastit_states_, float *lastlay_states_, const float *ws_, + rnn_iter_direction_t iter_dir, rnn_layer_direction_t lay_dir, + int dir_val, rnn_action_t action) { AOC lastit_states( - lastit_states_, n_layer, n_dir, n_states, batch, lastiter_c); - AOC lastlay_states(lastlay_states_, n_iter, batch, lastlay_c); + lastit_states_, n_layer, n_dir, n_states, batch, sic); + AOC lastlay_states(lastlay_states_, n_iter, batch, slc); AOC ws( - ws_, n_layer + 2, n_dir, n_iter + 2, n_states + is_bwd, batch, wc); + ws_, n_layer + 2, n_dir, n_iter + 2, n_states + 1, batch, wc); for (int it = 0; it < n_iter; it++) { for (int nb = 0; nb < batch; nb++) { // copy H to last layer states - int lay = is_bwd ? 1 : n_layer; - int state = is_bwd ? 
n_states : H; - auto from = &ws(lay, dir_val, it + 1, state, nb, 0); - auto to = &lastlay_states( - it, nb, (action == action_concat) && (!is_bwd) ? dlc : 0); + auto from = &ws(1, dir_val, it + 1, n_states, nb, 0); + auto to = &lastlay_states(it, nb, 0); - copy(1, is_bwd ? slc : dlc, wc, lastlay_c, from, to, action); + copy(1, slc, wc, slc, from, to, action); } } @@ -332,12 +489,10 @@ void copy_res(alg_t alg, int sic, int slc, int dic, int dlc, int wc, int batch, for (int lay = 0; lay < n_layer; lay++) { if (alg == VANILLA_LSTM) { - copy(batch, lastiter_c, wc, lastiter_c, - &ws(lay + 1, dir_val, it_source, C, 0, 0), + copy(batch, sic, wc, sic, &ws(lay + 1, dir_val, it_source, C, 0, 0), &lastit_states(lay, dir_val, C, 0, 0)); } - copy(batch, lastiter_c, wc, lastiter_c, - &ws(lay + 1, dir_val, it_source, H, 0, 0), + copy(batch, sic, wc, sic, &ws(lay + 1, dir_val, it_source, H, 0, 0), &lastit_states(lay, dir_val, H, 0, 0)); } } @@ -355,6 +510,7 @@ void rnn_linear_fwd(const rnn_prb_t *p, mkldnn_rnn_direction_t direction, const int dlc = p->dlc; const int wc = max(sic, max(slc, dic)); bool is_lbr = p->alg == LBR_GRU; + bool is_concat = direction == mkldnn_bidirectional_concat; const int batch = p->mb; const int n_gates = p->n_gates(); @@ -380,8 +536,9 @@ void rnn_linear_fwd(const rnn_prb_t *p, mkldnn_rnn_direction_t direction, // we first need to copy the initial states and input into ws // it simplifies the logic in the following code print(80, "rnn_linear_fwd: call copy_init dir_val = %d\n", dir_val); - copy_init(alg, sic, slc, dic, dlc, wc, batch, n_layer, n_iter, n_states, ws_, - src_layer_, src_iter_, iter_dir, lay_dir, dir_val, n_dir); + copy_init_fwd(p, alg, sic, slc, dic, dlc, wc, batch, n_layer, n_iter, + n_dir, n_states, ws_, src_layer_, src_iter_, iter_dir, lay_dir, + dir_val); // We run the grid of computation for (int il = 0; il < n_layer; il++) { @@ -390,7 +547,7 @@ void rnn_linear_fwd(const rnn_prb_t *p, mkldnn_rnn_direction_t direction, int iter = (iter_dir == left2right) ? it + 1 : n_iter - it; int prev_iter = (iter_dir == left2right) ? 
iter - 1 : iter + 1; int lay = il + 1; - rnn_cell_fwd(alg, f, sic, slc, dic, wc, batch, n_gates, + rnn_cell_fwd(p, alg, f, sic, slc, dic, wc, batch, n_gates, &ws(lay, dir_val, iter, H, 0, 0), &ws(lay, dir_val, iter, C, 0, 0), &gates(lay - 1, dir_val, iter - 1, 0, 0, 0), @@ -399,15 +556,14 @@ void rnn_linear_fwd(const rnn_prb_t *p, mkldnn_rnn_direction_t direction, &bias(lay - 1, dir_val, 0), &ws(lay - 1, dir_val, iter, H, 0, 0), &ws(lay, dir_val, prev_iter, H, 0, 0), - &ws(lay, dir_val, prev_iter, C, 0, 0), - ws_local_); + &ws(lay, dir_val, prev_iter, C, 0, 0), ws_local_); } } // Finally we copy the results to the result buffers - copy_res(alg, sic, slc, dic, dlc, wc, batch, n_layer, n_iter, n_states, - dst_iter_, dst_layer_, ws_, direction, iter_dir, lay_dir, - dir_val, n_dir, action); + copy_res_fwd(p, alg, sic, slc, dic, dlc, wc, batch, n_layer, n_iter, + n_dir, n_states, dst_iter_, dst_layer_, ws_, iter_dir, lay_dir, + dir_val, action, is_concat); }; switch (direction) { @@ -533,7 +689,7 @@ void lstm_bwd(alg_t alg, int sic, int slc, int dic, int wc, int batch, float dh = diff_dst_layer(ib, ih) + diff_dst_iter_h(ib, ih); float c = dst_iter_c(ib, ih); float dho = tanhf(c) * dh; - b_gates(ib, oho, ih) = dlogistic(ho) * dho; + b_gates(ib, oho, ih) = x_m_square(ho) * dho; float dc_next = diff_dst_iter_c(ib, ih); float dc = ho * dh * dtanhf(c) + dc_next; @@ -541,13 +697,13 @@ void lstm_bwd(alg_t alg, int sic, int slc, int dic, int wc, int batch, float c_old = src_iter_c(ib, ih); float dhf = c_old * dc; - b_gates(ib, ohf, ih) = dlogistic(hf) * dhf; + b_gates(ib, ohf, ih) = x_m_square(hf) * dhf; float dhi = hc * dc; - b_gates(ib, ohi, ih) = dlogistic(hi) * dhi; + b_gates(ib, ohi, ih) = x_m_square(hi) * dhi; float dhc = hi * dc; - b_gates(ib, ohc, ih) = dtanhf(hc) * dhc; + b_gates(ib, ohc, ih) = one_m_square(hc) * dhc; } gemm("C", "T", "N", sic, n_gates * dic, batch, 1.0, src_iter_h_, wc, b_gates_, @@ -592,10 +748,10 @@ void gru_bwd(alg_t alg, activation_t f, int sic, int slc, int dic, int wc, AOC dhr(dhr_, batch, wc); AOC hr(hr_, batch, wc); -// dc = (1 - u) * dh; dc^ = dtanhf(c) * dc; -// du = (h - u) * dh; du^ = dlogistic(u) * du; +// dc = (1 - u) * dh; dc^ = one_m_square(c) * dc; +// du = (h - u) * dh; du^ = x_m_square(u) * du; // dhr = Wc dc^; -// dr = h * dhr; dr^ = dlogistic(r) * dr; +// dr = h * dhr; dr^ = x_m_square(r) * dr; const int ohu = 0; const int ohr = 1; const int ohc = 2; @@ -607,12 +763,12 @@ void gru_bwd(alg_t alg, activation_t f, int sic, int slc, int dic, int wc, float dh = diff_dst_layer(ib, ih) + diff_dst_iter_h(ib, ih); float du = (h - c) * dh; float dc = (1.0f - u) * dh; - b_gates(ib, ohu, ih) = dlogistic(u) * du; - b_gates(ib, ohc, ih) = dtanhf(c) * dc; + b_gates(ib, ohu, ih) = x_m_square(u) * du; + b_gates(ib, ohc, ih) = one_m_square(c) * dc; diff_src_iter(ib, ih) = dh * u; } - gemm("C", "N", "T", batch, slc, dic, 1.0, &(b_gates(0, 2, 0)), n_gates * dic, - &(weights_layer(0, 2, 0)), n_gates * dic, 0.0, dhr_, wc); + gemm("C", "N", "T", batch, sic, dic, 1.0, &(b_gates(0, 2, 0)), n_gates * dic, + &(weights_iter_h(0, 2, 0)), n_gates * dic, 0.0, dhr_, wc); for (int ib = 0; ib < batch; ib++) for (int ih = 0; ih < dic; ih++) { @@ -621,7 +777,7 @@ void gru_bwd(alg_t alg, activation_t f, int sic, int slc, int dic, int wc, float dr = h * dhr(ib, ih); hr(ib, ih) = h * r; diff_src_iter(ib, ih) += dhr(ib, ih) * r; - b_gates(ib, ohr, ih) = dlogistic(r) * dr; + b_gates(ib, ohr, ih) = x_m_square(r) * dr; } // dWx += xdu^ | xdr^ | xdc^ @@ -682,9 +838,9 @@ void gru_lbr_bwd(alg_t alg, 
activation_t f, int sic, int slc, int dic, int wc, &weights_iter_h(0, 2, 0), n_gates * dic, 1.0, Wh_b_, dic); -// dc = (1 - u) * dh; dc^ = dtanhf(c) * dc; -// du = (h - u) * dh; du^ = dlogistic(u) * du; -// dr = (Wh + b) * dc^; dr^ = dlogistic(r) * dr; +// dc = (1 - u) * dh; dc^ = one_m_square(c) * dc; +// du = (h - c) * dh; du^ = x_m_square(u) * du; +// dr = (Wh + b) * dc^; dr^ = x_m_square(r) * dr; const int ohu = 0; const int ohr = 1; const int ohc = 2; @@ -698,11 +854,11 @@ void gru_lbr_bwd(alg_t alg, activation_t f, int sic, int slc, int dic, int wc, float du = (h - c) * dh; float dc = (1.0f - u) * dh; - b_gates(ib, ohu, ih) = dlogistic(u) * du; - b_gates(ib, ohc, ih) = dtanhf(c) * dc; + b_gates(ib, ohu, ih) = x_m_square(u) * du; + b_gates(ib, ohc, ih) = one_m_square(c) * dc; float dr = Wh_b(ib, ih) * b_gates(ib, ohc, ih); - b_gates(ib, ohr, ih) = dlogistic(r) * dr; + b_gates(ib, ohr, ih) = x_m_square(r) * dr; b_gates_r(ib, ohu, ih) = b_gates(ib, ohu, ih); b_gates_r(ib, ohr, ih) = b_gates(ib, ohr, ih); @@ -841,9 +997,10 @@ void rnn_linear_bwd(const rnn_prb_t *p, mkldnn_rnn_direction_t direction, rnn_layer_direction_t lay_dir, int dir_val, rnn_action_t action) { // we first need to copy the initial states and input into ws // it simplifies the logic in the following code - copy_init(alg, sic, slc, dic, dlc, wc, batch, n_layer, n_iter, n_states, - wsb_, diff_dst_layer_, diff_dst_iter_, iter_dir, lay_dir, - dir_val, n_dir, true, direction == mkldnn_bidirectional_concat); + copy_init_bwd(alg, sic, slc, dic, dlc, wc, batch, n_layer, n_iter, + n_dir, n_states, wsb_, diff_dst_layer_, diff_dst_iter_, + iter_dir, lay_dir, dir_val, + direction == mkldnn_bidirectional_concat); // We run the grid of computation for (int j = n_layer - 1; j >= 0; j--) { @@ -881,9 +1038,9 @@ void rnn_linear_bwd(const rnn_prb_t *p, mkldnn_rnn_direction_t direction, } // Finally we copy the results to the result buffers - copy_res(alg, sic, slc, dic, dlc, wc, batch, n_layer, n_iter, n_states, - diff_src_iter_, diff_src_layer_, wsb_, direction, iter_dir, - lay_dir, dir_val, n_dir, action, true); + copy_res_bwd(alg, sic, slc, dic, dlc, wc, batch, n_layer, n_iter, n_dir, + n_states, diff_src_iter_, diff_src_layer_, wsb_, iter_dir, + lay_dir, dir_val, action); }; switch (direction) { diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn.cpp index d940831..526b2da 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn.cpp @@ -35,6 +35,30 @@ namespace rnn { #define CALL_MKLDNN_RNN 1 +mkldnn_primitive_attr_t create_mkldnn_rnn_attr(const rnn_prb_t *p) { + mkldnn_primitive_attr_t mkldnn_attr = NULL; + + DNN_SAFE_V(mkldnn_primitive_attr_create(&mkldnn_attr)); + if (p->attr.irmode != attr_t::round_mode_t::NEAREST) + DNN_SAFE_V(mkldnn_primitive_attr_set_int_output_round_mode( + mkldnn_attr, (mkldnn_round_mode_t)p->attr.irmode)); + + if (p->scale_policy == PER_OC) { + DNN_SAFE_V(mkldnn_primitive_attr_set_rnn_weights_qparams( + mkldnn_attr, p->dic * p->n_gates(), 0x3, p->wei_oc_scales)); + } else if (p->scale_policy == COMMON && p->wei_scale != 1.) 
{ + DNN_SAFE_V(mkldnn_primitive_attr_set_rnn_weights_qparams( + mkldnn_attr, 1, 0, &p->wei_scale)); + } + + if (p->data_scale != 1.0 || p->data_shift != 0.0) { + DNN_SAFE_V(mkldnn_primitive_attr_set_rnn_data_qparams( + mkldnn_attr, p->data_scale, p->data_shift)); + } + + return mkldnn_attr; +} + int fill_memory(const rnn_prb_t *p, rnn_data_kind_t kind, dnn_mem_t &mem1, dnn_mem_t &mem2) { #ifdef CALL_MKLDNN_RNN @@ -43,20 +67,20 @@ int fill_memory(const rnn_prb_t *p, rnn_data_kind_t kind, dnn_mem_t &mem1, #else const size_t nelems = mem2.nelems(); #endif - size_t nchunks = mkldnn_get_max_threads(); - size_t chunk_size = (nelems + nchunks - 1) / nchunks; + dt_conf_t c = p->cfg[kind]; + float mean = c.f_mean, var = c.f_var, min = c.f_min, max = c.f_max; mkldnn::impl::parallel(0, [&](int ithr, int nthr) { + size_t chunk_size = (nelems + nthr - 1) / nthr; size_t idx_start = ithr * chunk_size; size_t idx_end = MIN2(idx_start + chunk_size, nelems); - std::minstd_rand msr; - std::normal_distribution gen(.0f, .001f); + msr.seed((unsigned long int)kind); + std::normal_distribution gen(mean, var); msr.discard(idx_start); - - for (size_t idx = idx_start; idx < idx_end; ++idx){ - auto val = gen(msr); - mem2.set_elem(idx, MAX2(MIN2(val, 1.0f), -1.0f)); + for (size_t idx = idx_start; idx < idx_end; ++idx) { + auto val = (c.dt == mkldnn_f32) ? gen(msr) : round(gen(msr)); + mem2.set_elem(idx, MAX2(MIN2(val, max), min)); } }); @@ -88,23 +112,20 @@ inline int init_pd(const rnn_prb_t *p, mkldnn_rnn_desc_t rd[2], mkldnn_dims_t bias_dims = { p->n_layer, p->n_directions(), p->n_gates() + is_gru_lbr, p->dic }; // mkldnn_tnc - int lastlay_dlc = (p->direction == mkldnn_bidirectional_concat) ? - 2 * p->dlc : - p->dlc; + int lastlay_dlc = (p->direction == mkldnn_bidirectional_concat) + ? 
2 * p->dlc + : p->dlc; mkldnn_dims_t dst_last_layer_dims = { p->n_iter, p->mb, lastlay_dlc }; DNN_SAFE(mkldnn_memory_desc_init( - &input_d, 3, input_dims, p->cfg[SRC].dt, mkldnn_tnc), + &input_d, 3, input_dims, p->cfg[input].dt, mkldnn_tnc), WARN); input_d.layout_desc.blocking.strides[0][0] += the_stride; - DNN_SAFE(mkldnn_memory_desc_init( - &diff_input_d, 3, input_dims, p->cfg[SRC].dt, mkldnn_any), - WARN); mkldnn_dims_t states_dims = { p->n_layer, p->n_directions(), p->n_states(), p->mb, p->sic }; - DNN_SAFE(mkldnn_memory_desc_init( - &states_d, 5, states_dims, p->cfg[SRC].dt, mkldnn_ldsnc), + DNN_SAFE(mkldnn_memory_desc_init(&states_d, 5, states_dims, + p->cfg[states].dt, mkldnn_ldsnc), WARN); states_d.layout_desc.blocking.strides[0][3] = p->sic + the_stride; @@ -116,43 +137,28 @@ inline int init_pd(const rnn_prb_t *p, mkldnn_rnn_desc_t rd[2], = states_d.layout_desc.blocking.strides[0][d + 1] * states_d.dims[d + 1]; - DNN_SAFE(mkldnn_memory_desc_init(&diff_states_d, 5, states_dims, - p->cfg[SRC].dt, mkldnn_any), - WARN); - DNN_SAFE(mkldnn_memory_desc_init(&weights_input_d, 5, weights_input_dims, - p->cfg[SRC].dt, mkldnn_any), - WARN); - DNN_SAFE(mkldnn_memory_desc_init(&diff_weights_input_d, 5, - weights_input_dims, p->cfg[SRC].dt, mkldnn_any), + p->cfg[weights_input].dt, mkldnn_any), WARN); DNN_SAFE(mkldnn_memory_desc_init(&weights_states_d, 5, weights_states_dims, - p->cfg[SRC].dt, mkldnn_any), - WARN); - DNN_SAFE(mkldnn_memory_desc_init(&diff_weights_states_d, 5, - weights_states_dims, p->cfg[SRC].dt, mkldnn_any), + p->cfg[weights_states].dt, mkldnn_any), WARN); DNN_SAFE(mkldnn_memory_desc_init( - &bias_d, 4, bias_dims, p->cfg[SRC].dt, mkldnn_any), - WARN); - DNN_SAFE(mkldnn_memory_desc_init( - &diff_bias_d, 4, bias_dims, p->cfg[SRC].dt, mkldnn_any), + &bias_d, 4, bias_dims, p->cfg[bias].dt, mkldnn_any), WARN); DNN_SAFE(mkldnn_memory_desc_init(&dst_last_layer_d, 3, dst_last_layer_dims, - p->cfg[SRC].dt, mkldnn_tnc), + p->cfg[dst_last_layer].dt, mkldnn_tnc), WARN); dst_last_layer_d.layout_desc.blocking.strides[0][0] += the_stride; - DNN_SAFE(mkldnn_memory_desc_init(&diff_last_layer_d, 3, dst_last_layer_dims, - p->cfg[SRC].dt, mkldnn_any), - WARN); mkldnn_dims_t dst_last_iteration_dims = { p->n_layer, p->n_directions(), p->n_states(), p->mb, p->dic }; DNN_SAFE(mkldnn_memory_desc_init(&dst_last_iteration_d, 5, - dst_last_iteration_dims, p->cfg[SRC].dt, mkldnn_ldsnc), + dst_last_iteration_dims, p->cfg[dst_last_iteration].dt, + mkldnn_ldsnc), WARN); dst_last_iteration_d.layout_desc.blocking.strides[0][3] @@ -166,10 +172,6 @@ inline int init_pd(const rnn_prb_t *p, mkldnn_rnn_desc_t rd[2], = dst_last_iteration_d.layout_desc.blocking.strides[0][d + 1] * dst_last_iteration_d.dims[d + 1]; - DNN_SAFE(mkldnn_memory_desc_init(&diff_last_iteration_d, 5, - dst_last_iteration_dims, p->cfg[SRC].dt, mkldnn_any), - WARN); - mkldnn_alg_kind_t kind = alg2kind(p->alg); mkldnn_alg_kind_t f = activation2kind(p->activation); @@ -179,14 +181,43 @@ inline int init_pd(const rnn_prb_t *p, mkldnn_rnn_desc_t rd[2], // When inference, we use forward_inference // When training, we use forward_training { - DNN_SAFE(mkldnn_rnn_forward_desc_init(&rd[0], fwd_prop, &rcd, + mkldnn_status_t init_status = mkldnn_success; + init_status = mkldnn_rnn_forward_desc_init(&rd[0], fwd_prop, &rcd, p->direction, &input_d, &states_d, &weights_input_d, &weights_states_d, &bias_d, &dst_last_layer_d, - &dst_last_iteration_d), - WARN); + &dst_last_iteration_d); + if (init_status == mkldnn_unimplemented) + return r->state = UNIMPLEMENTED, 
OK; + else + SAFE(init_status, WARN); } if (is_bwd) { + DNN_SAFE(mkldnn_memory_desc_init(&diff_input_d, 3, input_dims, + p->cfg[dst_diff_input].dt, mkldnn_any), + WARN); + DNN_SAFE(mkldnn_memory_desc_init(&diff_states_d, 5, states_dims, + p->cfg[dst_diff_states].dt, mkldnn_any), + WARN); + DNN_SAFE(mkldnn_memory_desc_init(&diff_weights_input_d, 5, + weights_input_dims, p->cfg[dst_diff_weights_input].dt, + mkldnn_any), + WARN); + DNN_SAFE(mkldnn_memory_desc_init(&diff_weights_states_d, 5, + weights_states_dims, + p->cfg[dst_diff_weights_states].dt, mkldnn_any), + WARN); + DNN_SAFE(mkldnn_memory_desc_init(&diff_bias_d, 4, bias_dims, + p->cfg[dst_diff_bias].dt, mkldnn_any), + WARN); + DNN_SAFE(mkldnn_memory_desc_init(&diff_last_layer_d, 3, + dst_last_layer_dims, p->cfg[diff_last_layer].dt, + mkldnn_any), + WARN); + DNN_SAFE(mkldnn_memory_desc_init(&diff_last_iteration_d, 5, + dst_last_iteration_dims, + p->cfg[diff_last_iteration].dt, mkldnn_any), + WARN); DNN_SAFE(mkldnn_rnn_backward_desc_init(&rd[1], p->prop, &rcd, p->direction, &input_d, &states_d, &weights_input_d, &weights_states_d, &bias_d, &dst_last_layer_d, @@ -196,17 +227,17 @@ inline int init_pd(const rnn_prb_t *p, mkldnn_rnn_desc_t rd[2], &diff_last_iteration_d), WARN); } + auto mkldnn_attr = create_mkldnn_rnn_attr(p); mkldnn_status_t init_status = mkldnn_success; for (int i = 0; i < 1 + (int)is_bwd; i++) { - init_status = mkldnn_primitive_desc_create( - &(rpd[i]), &(rd[i]), engine, NULL); + init_status = mkldnn_primitive_desc_create_v2( + &(rpd[i]), &(rd[i]), mkldnn_attr, engine, NULL); if (init_status == mkldnn_unimplemented) return r->state = UNIMPLEMENTED, OK; else SAFE(init_status, WARN); } - - // const char *impl_str = query_impl_info(rpd); + mkldnn_primitive_attr_destroy(mkldnn_attr); auto q = [=](mkldnn_query_t query, int rpd_idx, int index = 0) { return *mkldnn_primitive_desc_query_memory_d( @@ -311,13 +342,17 @@ int doit(const rnn_prb_t *p, res_t *r) { auto &diff_dst_layer_dt_d = rd[1].diff_dst_layer_desc; auto &diff_dst_iter_dt_d = rd[1].diff_dst_iter_desc; - input_dt = new dnn_mem_t(input_dt_d, fp); - states_dt = new dnn_mem_t(states_dt_d, fp); - weights_input_dt = new dnn_mem_t(weights_input_dt_d, fp); - weights_states_dt = new dnn_mem_t(weights_states_dt_d, fp); - bias_dt = new dnn_mem_t(bias_dt_d, fp); - dst_last_layer_dt = new dnn_mem_t(dst_last_layer_dt_d, fp); - dst_last_iteration_dt = new dnn_mem_t(dst_last_iteration_dt_d, fp); + input_dt = new dnn_mem_t(input_dt_d, p->cfg[input].dt); + states_dt = new dnn_mem_t(states_dt_d, p->cfg[states].dt); + weights_input_dt + = new dnn_mem_t(weights_input_dt_d, p->cfg[weights_input].dt); + weights_states_dt + = new dnn_mem_t(weights_states_dt_d, p->cfg[weights_states].dt); + bias_dt = new dnn_mem_t(bias_dt_d, p->cfg[bias].dt); + dst_last_layer_dt + = new dnn_mem_t(dst_last_layer_dt_d, p->cfg[dst_last_layer].dt); + dst_last_iteration_dt = new dnn_mem_t( + dst_last_iteration_dt_d, p->cfg[dst_last_iteration].dt); if (is_bwd) { bwd_weights_input_dt = new dnn_mem_t(bwd_weights_input_dt_d, fp); @@ -417,8 +452,6 @@ int doit(const rnn_prb_t *p, res_t *r) { dnn_mem_t dst_last_layer(*dst_last_layer_dt, fp, mkldnn_tnc); dnn_mem_t dst_last_iteration( *dst_last_iteration_dt, fp, mkldnn_ldsnc); - SAFE(dst_last_layer.reorder(*dst_last_layer_dt), WARN); - SAFE(dst_last_iteration.reorder(*dst_last_iteration_dt), WARN); SAFE(compare_dst_last_layer( p, dst_last_layer, *dst_last_layer_fp, r, true), WARN); @@ -457,8 +490,6 @@ int doit(const rnn_prb_t *p, res_t *r) { dnn_mem_t 
dst_last_layer(*dst_last_layer_dt, fp, mkldnn_tnc); dnn_mem_t dst_last_iteration( *dst_last_iteration_dt, fp, mkldnn_ldsnc); - SAFE(dst_last_layer.reorder(*dst_last_layer_dt), WARN); - SAFE(dst_last_iteration.reorder(*dst_last_iteration_dt), WARN); SAFE(compare_dst_last_layer( p, dst_last_layer, *dst_last_layer_fp, r, true), WARN); @@ -468,8 +499,6 @@ int doit(const rnn_prb_t *p, res_t *r) { dnn_mem_t diff_input(*dst_diff_input_dt, fp, mkldnn_tnc); dnn_mem_t diff_states(*dst_diff_states_dt, fp, mkldnn_ldsnc); - SAFE(diff_input.reorder(*dst_diff_input_dt), WARN); - SAFE(diff_states.reorder(*dst_diff_states_dt), WARN); SAFE(compare_input(p, diff_input, *dst_diff_input_fp, r, true), WARN); SAFE(compare_states(p, diff_states, *dst_diff_states_fp, r, true), @@ -479,9 +508,6 @@ int doit(const rnn_prb_t *p, res_t *r) { *dst_diff_weights_input_dt, fp, mkldnn_ldigo); dnn_mem_t diff_weights_states( *dst_diff_weights_states_dt, fp, mkldnn_ldigo); - SAFE(diff_weights_input.reorder(*dst_diff_weights_input_dt), WARN); - SAFE(diff_weights_states.reorder(*dst_diff_weights_states_dt), - WARN); SAFE(compare_weights_input(p, diff_weights_input, *dst_diff_weights_input_fp, r, true), WARN); @@ -490,7 +516,6 @@ int doit(const rnn_prb_t *p, res_t *r) { WARN); dnn_mem_t diff_bias(*dst_diff_bias_dt, fp, mkldnn_ldgo); - SAFE(diff_bias.reorder(*dst_diff_bias_dt), WARN); SAFE(compare_bias(p, diff_bias, *dst_diff_bias_fp, r, true), WARN); } } diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn.hpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn.hpp index 36d6a56..45ab7fb 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn.hpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn.hpp @@ -29,6 +29,8 @@ namespace rnn { +extern const char *perf_template; + enum alg_t { VANILLA_RNN, VANILLA_LSTM, VANILLA_GRU, LBR_GRU }; alg_t str2alg(const char *str); const char *alg2str(alg_t alg); @@ -39,6 +41,9 @@ activation_t str2activation(const char *str); const char *activation2str(activation_t alg); mkldnn_alg_kind_t activation2kind(activation_t alg); +mkldnn_prop_kind_t str2prop(const char *str); +const char *prop2str(mkldnn_prop_kind_t prop); + mkldnn_rnn_direction_t str2direction(const char *str); const char *direction2str(mkldnn_rnn_direction_t direction); @@ -104,15 +109,15 @@ enum rnn_data_kind_t { weights_input, weights_states, bias, - dst_last_layer, dst_last_iteration, + dst_last_layer, dst_diff_input, dst_diff_states, dst_diff_weights_input, dst_diff_weights_states, dst_diff_bias, - diff_last_layer, diff_last_iteration, + diff_last_layer, data_kind_total // should be last to provide the total number of data kinds }; @@ -149,20 +154,46 @@ typedef struct dt_conf_t { mkldnn_data_type_t dt; int min, max; /* representative */ int f_min, f_max; /* fill range */ - int f_base; /* fill base, use 0 */ - int f_step; /* fill step, use 1 */ - double f_sparsity; /* amount of non-zeros, default 0.25 */ + float f_mean, f_var; /* mean and variance of normally distributed data */ double eps; /* acceptable error */ } _dt_conf_t[data_kind_total]; extern const _dt_conf_t conf_f32; +extern const _dt_conf_t conf_u8u8u8u8; +extern const _dt_conf_t conf_u8u8u8f32; +extern const _dt_conf_t conf_f32u8f32f32; +extern const _dt_conf_t conf_f32u8f32u8; + +const dt_conf_t *str2cfg(const char *str); +const char *cfg2str(const dt_conf_t *cfg); + +enum policy_t { NONE = 0, COMMON, PER_OC }; +policy_t str2policy(const char *str); +const char *policy2str(policy_t policy);
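/* dt_conf_t now describes fill data by mean/variance rather than
 * base/step/sparsity, and a _dt_conf_t table carries one row per
 * rnn_data_kind_t, so a mixed config such as conf_f32u8f32u8 can give every
 * tensor its own type and range. Illustrative read side (field values
 * hypothetical, not the real tables):
 *
 *     const dt_conf_t &in = p->cfg[input];
 *     // e.g. in.dt == mkldnn_u8, in.f_min == 0, in.f_max == 255,
 *     //      in.f_mean == 128.f, in.f_var == 32.f, in.eps == 1e-5
 *
 * The SAFE(x.reorder(y)) calls removed earlier in this hunk were redundant:
 * the dnn_mem_t converting constructor evidently reorders into the requested
 * type/format already, which the shuffle.cpp change further down relies on
 * as well.
 */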
struct rnn_prb_t : public rnn_desc_t { rnn_prb_t(const rnn_desc_t desc, const dt_conf_t *cfg, mkldnn_prop_kind_t prop, alg_t alg, - mkldnn_rnn_direction_t direction, activation_t activation) - : rnn_desc_t(desc), cfg(cfg), prop(prop), alg(alg), - direction(direction), activation(activation){ + mkldnn_rnn_direction_t direction, activation_t activation, + const attr_t &attr, policy_t scale_policy, int mb = 0) + : rnn_desc_t(desc) + , cfg(cfg) + , prop(prop) + , alg(alg) + , direction(direction) + , activation(activation) + , attr(attr) + , scale_policy(scale_policy) { + if (mb) this->mb = mb; + wei_oc_scales = NULL; + if (scale_policy == PER_OC) + wei_oc_scales + = (float *)zmalloc(sizeof(float) * dic * n_gates(), 64); + set_qparams(-1., 1.); + } + ~rnn_prb_t() { + if (wei_oc_scales) + zfree(wei_oc_scales); } int n_directions() const { @@ -178,14 +209,24 @@ struct rnn_prb_t : public rnn_desc_t { 4 : (alg == VANILLA_GRU || alg == LBR_GRU ? 3 : 1); } + int n_bias() const { + return alg == LBR_GRU ? n_gates() + 1 : n_gates(); + } const dt_conf_t *cfg; mkldnn_prop_kind_t prop; alg_t alg; mkldnn_rnn_direction_t direction; activation_t activation; + attr_t attr; + policy_t scale_policy; + + float data_scale, data_shift; + float wei_scale; + float *wei_oc_scales; private: + void set_qparams(float fp_min, float fp_max); rnn_prb_t(const rnn_prb_t &) = delete; rnn_prb_t &operator=(const rnn_prb_t &) = delete; }; @@ -301,7 +342,7 @@ inline void inv_ldwOcIc_off_f(const rnn_prb_t *p, size_t off, int &l, int &d, // bias: mkldnn_ldgo inline size_t ldgo_off_f(const rnn_prb_t *p, int l, int d, int b, int c) { - return (((size_t)l * p->n_directions() + d) * p->n_gates() + b) * p->sic + return (((size_t)l * p->n_directions() + d) * p->n_bias() + b) * p->sic + c; } @@ -309,8 +350,8 @@ inline void inv_ldgo_off_f( const rnn_prb_t *p, size_t off, int &l, int &d, int &b, int &c) { c = off % p->sic; off /= p->sic; - b = off % p->n_gates(); - off /= p->n_gates(); + b = off % p->n_bias(); + off /= p->n_bias(); d = off % p->n_directions(); off /= p->n_directions(); l = off % p->n_layer; diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn_aux.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn_aux.cpp index 124cbec..c6068da 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn_aux.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn_aux.cpp @@ -39,6 +39,24 @@ alg_t str2alg(const char *str) { return VANILLA_RNN; } +policy_t str2policy(const char *str) { +#define CASE(_plc) if (!strcasecmp(STRINGIFY(_plc), str)) return _plc + CASE(NONE); + CASE(COMMON); + CASE(PER_OC); +#undef CASE + assert(!"unknown policy"); + return NONE; +} + +const char *policy2str(policy_t policy) { + if (policy == NONE) return "none"; + if (policy == COMMON) return "common"; + if (policy == PER_OC) return "per_oc"; + assert(!"unknown policy"); + return "unknown policy"; +} + const char *alg2str(alg_t alg) { if (alg == VANILLA_RNN) return "VANILLA_RNN"; @@ -99,6 +117,25 @@ mkldnn_alg_kind_t activation2kind(activation_t act) { return alg_kind; } +mkldnn_prop_kind_t str2prop(const char *str) { + if (!strcasecmp("FWD_D", str)) + return mkldnn_forward; + if (!strcasecmp("BWD_D", str) || !strcasecmp("BWD_DW", str)) + return mkldnn_backward; + assert(!"unknown propagation"); + return mkldnn_forward; +} + +const char *prop2str(mkldnn_prop_kind_t prop) { + if (prop == mkldnn_forward) + return "FWD_D"; + if (prop == mkldnn_backward) + return "BWD_DW"; + assert(!"unknown propagation"); + return "unknown propagation"; + +}
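/* Gate and bias counts implied by n_gates() and the new n_bias():
 *
 *     alg            n_gates()   n_bias()
 *     VANILLA_RNN        1           1
 *     VANILLA_LSTM       4           4
 *     VANILLA_GRU        3           3
 *     LBR_GRU            3           4   (extra linear-before-reset bias)
 *
 * This is why ldgo_off_f()/inv_ldgo_off_f() now stride by n_bias() instead
 * of n_gates(): an LBR-GRU bias tensor holds one more vector per
 * layer/direction than the weights have gate blocks, and indexing it with
 * n_gates() would silently alias entries.
 */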
+ mkldnn_rnn_direction_t str2direction(const char *str) { if (!strcasecmp("left2right", str)) return mkldnn_unidirectional_left2right; @@ -185,8 +222,11 @@ int str2desc(rnn_desc_t *desc, const char *str) { void prb2str(const rnn_prb_t *p, const res_t *res, char *buffer) { int rem_len = max_prb_len; - DPRINT("%s,%s,%s,", alg2str(p->alg), activation2str(p->activation), - direction2str(p->direction)); + DPRINT("--prop=%s --alg=%s --activation=%s --direction=%s --cfg=%s " + "--scaling=%s ", + prop2str(p->prop), alg2str(p->alg), activation2str(p->activation), + direction2str(p->direction), cfg2str(p->cfg), + policy2str(p->scale_policy)); DPRINT("l%d", p->n_layer); DPRINT("t%d", p->n_iter); DPRINT("mb%d", p->mb); @@ -203,10 +243,20 @@ void init_buffer(float *buf, int size, float value) { } float logistic(float x) { - return 1.0f / (1.0f + expf(-x)); + if (x < 0) + return (expf(x) / (1 + expf(x))); + else + return 1.0f - (expf(-x) / (1 + expf(-x))); } float dlogistic(float x) { - return x * (1 - x); + float tmp = logistic(x); + return tmp * (1 - tmp); +} +float dtanhf(float x) { + return (1 - tanhf(x)) * (1 + tanhf(x)); +} +float x_m_square(float x) { + return x - x * x; } float relu(float x) { return x > 0 ? x : 0; @@ -214,8 +264,8 @@ float relu(float x) { float drelu(float x) { return float(x > 0); } -float dtanhf(float x) { - return (1 - x) * (1 + x); +float one_m_square(float x) { + return 1 - x * x; } int compare_dat(const rnn_prb_t *p, rnn_data_kind_t kind, dnn_mem_t &mem_dt, @@ -414,4 +464,32 @@ int compare_dst_last_iteration(const rnn_prb_t *p, dnn_mem_t &mem_dt, return compare_dat(p, dst_last_iteration, mem_dt, mem_fp, r, final_compare); } +void rnn_prb_t::set_qparams(float fp_min, float fp_max) { + if (cfg == conf_f32) { + data_shift = 0.; + data_scale = 1.; + wei_scale = 1.; + return; + } + + /* Set parameters for quantization of src and weights from fp32 data + * in [-1, 1] to int8 data in a range specified in cfg */ + float fp_range = fp_max - fp_min; + float int8_src_range = cfg[input].f_max - cfg[input].f_min, + int8_wei_range = cfg[weights_input].f_max - cfg[weights_input].f_min; + + data_shift = cfg[input].f_mean; + data_scale = int8_src_range / fp_range; + + if (scale_policy == COMMON) { + wei_scale = int8_wei_range / fp_range; + } else if (scale_policy == PER_OC) { + float K = int8_wei_range / fp_range; + int nelems = dic * n_gates(); + for (int i = 0; i < nelems; i++) { + wei_oc_scales[i] = K * (1. 
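/* The activation rewrites above are the numerically stable forms: for very
 * negative x, 1/(1+expf(-x)) overflows expf(-x), so logistic() always
 * exponentiates a non-positive argument; both branches equal 1/(1+e^-x).
 * The derivative helpers are now split by what they take as input:
 *
 *     dlogistic(x)    = s(x) * (1 - s(x))            // x is the pre-activation
 *     x_m_square(s)   = s - s*s                      // s = logistic(x) already
 *     dtanhf(x)       = (1 - tanh x) * (1 + tanh x)  // x is the pre-activation
 *     one_m_square(t) = 1 - t*t                      // t = tanh(x) already
 *
 * The old dtanhf(x) = (1 - x)*(1 + x) silently assumed it was handed
 * tanh(x); the split removes that ambiguity.
 */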
+ (float)i / nelems); + } + } +} + } // namespace rnn diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn_aux.hpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn_aux.hpp index 3ac8598..71d0400 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn_aux.hpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn_aux.hpp @@ -44,6 +44,8 @@ float dlogistic(float x); float relu(float x); float drelu(float x); float dtanhf(float x); +float one_m_square(float x); +float x_m_square(float x); int compare_dat(const rnn_prb_t *p, rnn_data_kind_t kind, dnn_mem_t &mem_dt, dnn_mem_t &mem_fp, res_t *r, bool final_compare); diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/self/conv.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/self/conv.cpp index 46662d9..b449cbe 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/self/conv.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/self/conv.cpp @@ -26,31 +26,24 @@ namespace self { static int check_simple_enums() { /* alg */ + CHECK_CASE_STR_EQ(alg2str(alg_t::AUTO), "auto"); + CHECK_CASE_STR_NE(alg2str(alg_t::AUTO), "autox"); + CHECK_CASE_STR_EQ(alg2str(alg_t::DIRECT), "direct"); CHECK_CASE_STR_NE(alg2str(alg_t::DIRECT), "directx"); CHECK_CASE_STR_EQ(alg2str(alg_t::WINO), "wino"); CHECK_CASE_STR_NE(alg2str(alg_t::WINO), "winox"); + CHECK_EQ(str2alg("auto"), alg_t::AUTO); + CHECK_EQ(str2alg("AUTO"), alg_t::AUTO); + CHECK_EQ(str2alg("direct"), alg_t::DIRECT); CHECK_EQ(str2alg("DIRECT"), alg_t::DIRECT); CHECK_EQ(str2alg("wino"), alg_t::WINO); CHECK_EQ(str2alg("WINO"), alg_t::WINO); - /* merge */ - CHECK_CASE_STR_EQ(merge2str(merge_t::NONE), "none"); - CHECK_CASE_STR_NE(merge2str(merge_t::NONE), "nonex"); - - CHECK_CASE_STR_EQ(merge2str(merge_t::RELU), "relu"); - CHECK_CASE_STR_NE(merge2str(merge_t::RELU), "relux"); - - CHECK_EQ(str2merge("none"), merge_t::NONE); - CHECK_EQ(str2merge("NONE"), merge_t::NONE); - - CHECK_EQ(str2merge("relu"), merge_t::RELU); - CHECK_EQ(str2merge("RELU"), merge_t::RELU); - return OK; } diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/shuffle.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/shuffle.cpp index f2db808..cddbb49 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/shuffle.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/shuffle.cpp @@ -144,26 +144,23 @@ int doit(const prb_t *p, res_t *r) { ? 
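/* Worked example for set_qparams() above, assuming a cfg whose input entry
 * is u8 with f_min = 0, f_max = 255, f_mean = 128 (illustrative numbers)
 * and fp reference data drawn from [-1, 1]:
 *
 *     float fp_range   = 1.f - (-1.f);      // 2
 *     float data_scale = 255.f / fp_range;  // 127.5, fp -> u8 multiplier
 *     float data_shift = 128.f;             // recenter on the u8 midpoint
 *     // quantize: q(x) = data_scale * x + data_shift; q(-1) ~ 0, q(+1) ~ 255
 *
 * PER_OC spreads the weights scales as K * (1 + i/nelems) purely so the
 * per-channel factors differ from one another; it models no real
 * calibration scheme.
 */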
mkldnn_nc : get_default_format(ndims, fmt2data_kind(p->fmt)); - dnn_mem_t data_fp(src_dt_d, fp, src_format), - data_dt(src_dt_d); - dnn_mem_t d_data_fp(src_dt_d, fp, src_format), - d_data_dt(src_dt_d); + dnn_mem_t src_fp(src_dt_d, fp, src_format), src_dt(src_dt_d); + dnn_mem_t dst_fp(src_dt_d, fp, src_format), dst_dt(src_dt_d); - SAFE(fill_memory(p, data_fp), WARN); + SAFE(fill_memory(p, src_fp), WARN); mkldnn_primitive_at_t inputs[1]; const_mkldnn_primitive_t outputs[1]; - SAFE(data_dt.reorder(data_fp), WARN); - inputs[0] = {data_dt.p_, 0}; - outputs[0] = d_data_dt.p_; + SAFE(src_dt.reorder(src_fp), WARN); + inputs[0] = {src_dt.p_, 0}; + outputs[0] = dst_dt.p_; DNN_SAFE(mkldnn_primitive_create(&s, spd, inputs, outputs), WARN); DNN_SAFE_V(mkldnn_primitive_desc_destroy(spd)); SAFE(execute(s), WARN); if (bench_mode & CORR) { - compute_shuffle(p, data_fp, d_data_fp); - dnn_mem_t data(d_data_dt.md_, fp, src_format); - SAFE(data.reorder(d_data_dt), WARN); - SAFE(compare(p, d_data_fp, data, r), WARN); + compute_shuffle(p, src_fp, dst_fp); + dnn_mem_t data(dst_dt, fp, src_format); + SAFE(compare(p, dst_fp, data, r), WARN); } if (bench_mode & PERF) { diff --git a/inference-engine/thirdparty/mkl-dnn/tests/generate_c_symbols_refs.sh b/inference-engine/thirdparty/mkl-dnn/tests/generate_c_symbols_refs.sh index 690dc6a..45040b7 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/generate_c_symbols_refs.sh +++ b/inference-engine/thirdparty/mkl-dnn/tests/generate_c_symbols_refs.sh @@ -16,10 +16,13 @@ #=============================================================================== mkldnn_root="$1" -output="$2" +extra_include_dir="$2" +output="$3" echo -e '#include "mkldnn.h"' > "$output" echo -e "const void *c_functions[] = {" >> "$output" -cpp "${mkldnn_root}/include/mkldnn.h" | grep -o 'mkldnn_\w\+(' \ - | sed 's/\(.*\)(/(void*)\1,/g' | sort -u >> "$output" +cpp -I"${extra_include_dir}" "${mkldnn_root}/include/mkldnn.h" \ + | grep -o 'mkldnn_\w\+(' \ + | sed 's/\(.*\)(/(void*)\1,/g' \ + | sort -u >> "$output" echo -e "NULL};\nint main() { return 0; }" >> "$output" diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/CMakeLists.txt b/inference-engine/thirdparty/mkl-dnn/tests/gtests/CMakeLists.txt index 9439423..48829a5 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/CMakeLists.txt +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/CMakeLists.txt @@ -45,7 +45,6 @@ file(GLOB PRIM_TEST_CASES_SRC test_softmax_backward.cpp test_eltwise.cpp test_depthwise.cpp - test_relu.cpp test_lrn_forward.cpp test_lrn_backward.cpp test_pooling_forward.cpp @@ -61,9 +60,8 @@ file(GLOB PRIM_TEST_CASES_SRC test_convolution_forward_s16s16s32.cpp test_convolution_forward_u8s8s32.cpp test_convolution_forward_u8s8fp.cpp - test_convolution_relu_forward_f32.cpp - test_convolution_relu_forward_neg_slope_f32.cpp - test_convolution_relu_forward_s16s16s32.cpp + test_convolution_eltwise_forward_f32.cpp + test_convolution_eltwise_forward_x8s8f32s32.cpp test_convolution_backward_data_f32.cpp test_convolution_backward_data_s16s16s32.cpp test_convolution_backward_weights_f32.cpp @@ -72,10 +70,23 @@ file(GLOB PRIM_TEST_CASES_SRC test_gemm_f32.cpp test_gemm_s8u8s32.cpp test_gemm_s8s8s32.cpp + test_rnn_forward.cpp test_roi_pooling_forward.cpp - test_convolution_eltwise_forward_f32.cpp test_convolution_depthwise_forward_f32.cpp + test_convolution_depthwise_forward_x8s8f32s32.cpp test_convolution_dw_conv_f32.cpp + test_convolution_dw_conv_u8s8s32.cpp + test_binary_convolution_forward.cpp + 
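# With the extra include directory argument added to the script above, a
# typical invocation becomes (paths hypothetical):
#
#     sh generate_c_symbols_refs.sh ./mkl-dnn ./build/include c_symbols_refs.c
#
# the second argument letting cpp resolve headers that live in the build
# tree rather than under the source include/ directory.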
test_binary_convolution_eltwise_forward.cpp + test_binary_convolution_depthwise_forward.cpp + test_binary_convolution_sum_forward.cpp + test_binary_convolution_binarization_forward.cpp + test_binarization.cpp + test_binary_convolution_dw_conv_forward.cpp + test_binary_convolution_dw_conv_eltwise_forward.cpp + test_binary_convolution_dw_conv_depthwise_forward.cpp + test_binary_convolution_dw_conv_sum_forward.cpp + test_binary_convolution_dw_conv_binarization_forward.cpp ) #temporary foreach(TEST_FILE ${PRIM_TEST_CASES_SRC}) diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/convolution_common.h b/inference-engine/thirdparty/mkl-dnn/tests/gtests/convolution_common.h index 6fc6d85..8306a72 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/convolution_common.h +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/convolution_common.h @@ -18,12 +18,24 @@ #include "mkldnn.hpp" +#if defined(WITH_DW_CONV) +#define EXPAND_FORMATS(src, conv1_weights, conv1_bias, conv2_weights, conv2_bias, dst) \ + { mkldnn::memory::format::src, mkldnn::memory::format::conv1_weights, mkldnn::memory::format::conv1_bias, \ + mkldnn::memory::format::conv2_weights, mkldnn::memory::format::conv2_bias, mkldnn::memory::format::dst } +#else #define EXPAND_FORMATS(src, weights, bias, dst) \ { mkldnn::memory::format::src, mkldnn::memory::format::weights, \ mkldnn::memory::format::bias, mkldnn::memory::format::dst } +#endif + +#define EXPAND_ARGS(args) args #define ENGINE mkldnn::engine::kind::cpu +#if defined(BIN) +#define ALGORITHM mkldnn::binary_convolution_direct +#else #define ALGORITHM mkldnn::convolution_direct +#endif #ifdef DIRECTION_FORWARD #if defined(FP32) @@ -47,6 +59,15 @@ #define FMT_WEIGHTS_BLOCKED_G gOhIw8o4i #define FMT_WEIGHTS_BLOCKED16 OIhw4i16o4i #define FMT_WEIGHTS_BLOCKED16_G gOIhw4i16o4i +#elif defined(BIN) +#define FMT_DATA_BLOCKED nhwc +#define FMT_DATA_BLOCKED16 nhwc +#define FMT_WEIGHTS_BLOCKED OhIw8o32i +#define FMT_WEIGHTS_BLOCKED_G OhIw8o32i +#define FMT_WEIGHTS_BLOCKED16 OhIw16o32i +#define FMT_WEIGHTS_BLOCKED16_G OhIw16o32i +#define FMT_WEIGHTS_DW_BLOCKED Goihw8g +#define FMT_WEIGHTS_DW_BLOCKED16 Goihw16g #endif #define FMT_WEIGHTS_BLOCKED16_IOhw16o16i FMT_WEIGHTS_BLOCKED16 #define TEST_CASE_NAME_PREFIX Forward @@ -85,42 +106,104 @@ #define CONCAT_WITH_UNDERSCORE_(a,b) a ## _ ## b #define CONCAT_WITH_UNDERSCORE(a,b) CONCAT_WITH_UNDERSCORE_(a,b) +#if defined(BIN) +#define INST_TEST_CASE_(str, ...) INSTANTIATE_TEST_CASE_P( \ + str, binary_convolution_test, ::testing::Values(__VA_ARGS__)) +#define INST_TEST_CASE(str, ...) INST_TEST_CASE_( \ + CONCAT_WITH_UNDERSCORE(TEST_CASE_NAME_PREFIX, str), __VA_ARGS__) +#else #define INST_TEST_CASE_(str, ...) INSTANTIATE_TEST_CASE_P( \ str, convolution_test, ::testing::Values(__VA_ARGS__)) #define INST_TEST_CASE(str, ...) INST_TEST_CASE_( \ CONCAT_WITH_UNDERSCORE(TEST_CASE_NAME_PREFIX, str), __VA_ARGS__) +#endif #define INST_TEST_CASE_3D_(str, ...) INSTANTIATE_TEST_CASE_P( \ str, convolution_test_3d, ::testing::Values(__VA_ARGS__)) #define INST_TEST_CASE_3D(str, ...) 
INST_TEST_CASE_3D_( \ CONCAT_WITH_UNDERSCORE(TEST_CASE_NAME_PREFIX, str), __VA_ARGS__) -#ifndef NEGATIVE_SLOPE -#define NEGATIVE_SLOPE 0.0f +#if defined(BIN) +#define PAD_VALUE -1.0f +#define ELTWISE_ALGORITHM mkldnn::algorithm_undef +#define DEPTHWISE_ALGORITHM mkldnn::algorithm_undef +#define BINARIZATION_ALGORITHM mkldnn::algorithm_undef +#define ELTWISE_ALPHA 0.5f +#define ELTWISE_BETA 0.1f + +#if defined(WITH_SUM) +#define WITH_SUM_BOOL true #else -#undef INST_TEST_CASE -#define INST_TEST_CASE(str, ...) INST_TEST_CASE_( \ - CONCAT_WITH_UNDERSCORE(CONCAT_WITH_UNDERSCORE(TEST_CASE_NAME_PREFIX, \ - str), neg_slope), __VA_ARGS__) +#define WITH_SUM_BOOL false #endif +#if defined(WITH_ELTWISE) +#if defined(WITH_DW_CONV) +#define PARAMS(elt_alg, src, conv1_weights, conv1_bias, conv2_weights, conv2_bias, dst, ...) \ + test_binary_convolution_dw_conv_params_t { ENGINE, ALGORITHM, elt_alg, ELTWISE_ALPHA, ELTWISE_BETA, DEPTHWISE_ALGORITHM, WITH_SUM_BOOL, BINARIZATION_ALGORITHM, \ + EXPAND_FORMATS(src, conv1_weights, conv1_bias, conv2_weights, conv2_bias, dst), \ + {__VA_ARGS__} } +#else +#define PARAMS(elt_alg, src, weights, bias, dst, ...) \ + test_binary_convolution_params_t { ENGINE, ALGORITHM, PAD_VALUE, elt_alg, ELTWISE_ALPHA, ELTWISE_BETA, DEPTHWISE_ALGORITHM, WITH_SUM_BOOL, BINARIZATION_ALGORITHM, \ + EXPAND_FORMATS(src, weights, bias, dst), \ + {__VA_ARGS__} } +#endif +#elif defined(WITH_DEPTHWISE) +#if defined(WITH_DW_CONV) +#define PARAMS(dep_alg, src, conv1_weights, conv1_bias, conv2_weights, conv2_bias, dst, ...) \ + test_binary_convolution_dw_conv_params_t { ENGINE, ALGORITHM, ELTWISE_ALGORITHM, ELTWISE_ALPHA, ELTWISE_BETA, dep_alg, WITH_SUM_BOOL, BINARIZATION_ALGORITHM, \ + EXPAND_FORMATS(src, conv1_weights, conv1_bias, conv2_weights, conv2_bias, dst), \ + {__VA_ARGS__} } +#else +#define PARAMS(dep_alg, src, weights, bias, dst, ...) \ + test_binary_convolution_params_t { ENGINE, ALGORITHM, PAD_VALUE, ELTWISE_ALGORITHM, ELTWISE_ALPHA, ELTWISE_BETA, dep_alg, WITH_SUM_BOOL, BINARIZATION_ALGORITHM, \ + EXPAND_FORMATS(src, weights, bias, dst), \ + {__VA_ARGS__} } +#endif +#elif defined(WITH_BINARIZATION) +#if defined(WITH_DW_CONV) +#define PARAMS(bin_alg, src, conv1_weights, conv1_bias, conv2_weights, conv2_bias, dst, ...) \ + test_binary_convolution_dw_conv_params_t { ENGINE, ALGORITHM, ELTWISE_ALGORITHM, ELTWISE_ALPHA, ELTWISE_BETA, DEPTHWISE_ALGORITHM, WITH_SUM_BOOL, bin_alg, \ + EXPAND_FORMATS(src, conv1_weights, conv1_bias, conv2_weights, conv2_bias, dst), \ + {__VA_ARGS__} } +#else +#define PARAMS(bin_alg, src, weights, bias, dst, ...) \ + test_binary_convolution_params_t { ENGINE, ALGORITHM, PAD_VALUE, ELTWISE_ALGORITHM, ELTWISE_ALPHA, ELTWISE_BETA, DEPTHWISE_ALGORITHM, WITH_SUM_BOOL, bin_alg, \ + EXPAND_FORMATS(src, weights, bias, dst), \ + {__VA_ARGS__} } +#endif +#else +#if defined(WITH_DW_CONV) +#define PARAMS(src, conv1_weights, conv1_bias, conv2_weights, conv2_bias, dst, ...) \ + test_binary_convolution_dw_conv_params_t { ENGINE, ALGORITHM, ELTWISE_ALGORITHM, ELTWISE_ALPHA, ELTWISE_BETA, DEPTHWISE_ALGORITHM, WITH_SUM_BOOL, BINARIZATION_ALGORITHM, \ + EXPAND_FORMATS(src, conv1_weights, conv1_bias, conv2_weights, conv2_bias, dst), \ + {__VA_ARGS__} } +#else #define PARAMS(src, weights, bias, dst, ...) 
\ - test_convolution_params_t { ENGINE, ALGORITHM, NEGATIVE_SLOPE, \ + test_binary_convolution_params_t { ENGINE, ALGORITHM, PAD_VALUE, ELTWISE_ALGORITHM, ELTWISE_ALPHA, ELTWISE_BETA, DEPTHWISE_ALGORITHM, WITH_SUM_BOOL, BINARIZATION_ALGORITHM, \ + EXPAND_FORMATS(src, weights, bias, dst), \ + {__VA_ARGS__} } +#endif +#endif +#else +#define PARAMS(src, weights, bias, dst, ...) \ + test_convolution_params_t { ENGINE, ALGORITHM, \ EXPAND_FORMATS(src, weights, bias, dst), /* empty attributes */ {}, \ {__VA_ARGS__} } +#endif #define PARAMS_3D(src, weights, bias, dst, ...) \ - test_convolution_params_t_3d { ENGINE, ALGORITHM, NEGATIVE_SLOPE, \ + test_convolution_params_t_3d { ENGINE, ALGORITHM, \ EXPAND_FORMATS(src, weights, bias, dst), /* empty attributes */ {}, \ {__VA_ARGS__} } - #define PARAMS_EXPECT_FAIL(src, weights, bias, dst, code, ...) \ - test_convolution_params_t { ENGINE, ALGORITHM, NEGATIVE_SLOPE, \ + test_convolution_params_t { ENGINE, ALGORITHM, \ EXPAND_FORMATS(src, weights, bias, dst), /* empty attributes */ {}, \ {__VA_ARGS__}, true, code } #define PARAMS_ATTR(src, weights, bias, dst, round_mode, scale, policy, ...) \ - test_convolution_params_t { ENGINE, ALGORITHM, NEGATIVE_SLOPE, \ + test_convolution_params_t { ENGINE, ALGORITHM, \ EXPAND_FORMATS(src, weights, bias, dst), \ {mkldnn::round_mode, scale, test_convolution_attr_t::scale_t::policy}, \ {__VA_ARGS__} } @@ -128,8 +211,12 @@ #ifdef TEST_PARAM_ATTR #include "convolution_attr.h" #else + +#if !defined(BIN) #include "convolution_simple_small.h" #endif + +#endif //#include "convolution_alexnet.h" //#include "convolution_googlenet_v1.h" //#include "convolution_googlenet_v2.h" diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/in/convolution_simple_small.h b/inference-engine/thirdparty/mkl-dnn/tests/gtests/in/convolution_simple_small.h index c9bf46a..f901e9b 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/in/convolution_simple_small.h +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/in/convolution_simple_small.h @@ -119,7 +119,7 @@ INST_TEST_CASE(SimpleSmall_Blocked_1x1, PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED, 2, 4, 16, 10, 10, 32, 10, 10, 1, 1, 0, 0, 1, 1), PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED, - 2, 8, 32, 10, 10, 256, 10, 10, 1, 1, 0, 0, 1, 1) + 1, 8, 32, 1, 1, 128, 1, 1, 1, 1, 0, 0, 1, 1) ); INST_TEST_CASE(SimpleSmall_Blocked16, @@ -164,6 +164,12 @@ INST_TEST_CASE(SimpleSmall_Blocked16, ); INST_TEST_CASE(SimpleSmall_Regression, + /* grouped small input-channel avx512 */ + PARAMS(nchw, gOhwi16o, FMT_BIAS, FMT_DATA_BLOCKED16, + 2, 2, 16, 8, 8, 32, 8, 8, 3, 3, 1, 1, 1, 1), + /* grouped small input-channel avx2 */ + PARAMS(nchw, gOhwi8o, FMT_BIAS, nChw8c, + 2, 2, 4, 2, 2, 16, 8, 8, 3, 3, 1, 1, 1, 1), PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED16, 2, 1, 32, 16, 16, 32, 16, 16, 3, 3, 0, 0, 1, 1), PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED16, diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/in/gemm_in.h b/inference-engine/thirdparty/mkl-dnn/tests/gtests/in/gemm_in.h index 5b3c34a..7c8b692 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/in/gemm_in.h +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/in/gemm_in.h @@ -1,136 +1,307 @@ -constexpr char unused = 'x'; - #if defined(FP32) INST_TEST_CASE(TestGEMM, - test_params{unused, 'n', 'n', 3, 2, 1, 1.0, 0.0, 2, 5, 8, true, mkldnn_invalid_arguments}, - test_params{unused, 't', 'n', 3, 
2, 2, 1.0, 0.0, 1, 5, 8, true, mkldnn_invalid_arguments}, - test_params{unused, 'n', 't', 3, 2, 1, 1.0, 0.0, 3, 1, 8, true, mkldnn_invalid_arguments}, - test_params{unused, 'n', 'd', 3, 2, 1, 1.0, 0.0, 3, 3, 3, true, mkldnn_invalid_arguments}, - - test_params{unused, 'N', 'n', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false}, - test_params{unused, 'n', 'T', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false}, - test_params{unused, 'T', 'N', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false}, - test_params{unused, 't', 't', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false}, - test_params{unused, 'n', 'n', 100, 100, 2, 1.0, 2.0, 100, 100, 100, false}, - test_params{unused, 'n', 't', 100, 2, 100, 1.0, 2.0, 100, 100, 100, false}, - test_params{unused, 't', 'n', 2, 100, 100, 1.0, 2.0, 100, 100, 100, false}, - test_params{unused, 't', 't', 2, 100, 100, 1.0, 2.0, 100, 100, 100, false}, - test_params{unused, 'n', 'n', 2, 2, 10000, 1.0, 2.0, 2, 10000, 2, false}, - - test_params{unused, 'n', 'n', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false}, - test_params{unused, 'n', 'n', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, false}, - test_params{unused, 't', 'n', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false}, - test_params{unused, 't', 'n', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, false}, - test_params{unused, 'n', 't', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false}, - test_params{unused, 'n', 't', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, false}, - test_params{unused, 't', 't', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false}, - test_params{unused, 't', 't', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, false} + test_params{'n', 'n', 3, 2, 1, 1.0, 0.0, 2, 5, 8, {}, true, mkldnn_invalid_arguments}, + test_params{'t', 'n', 3, 2, 2, 1.0, 0.0, 1, 5, 8, {}, true, mkldnn_invalid_arguments}, + test_params{'n', 't', 3, 2, 1, 1.0, 0.0, 3, 1, 8, {}, true, mkldnn_invalid_arguments}, + test_params{'n', 'd', 3, 2, 1, 1.0, 0.0, 3, 3, 3, {}, true, mkldnn_invalid_arguments}, + + test_params{'N', 'n', 30, 20, 10, 2.0, 1.0, 60, 50, 80, {}, false}, + test_params{'n', 'T', 30, 20, 10, 2.0, 1.0, 60, 50, 80, {}, false}, + test_params{'T', 'N', 30, 20, 10, 2.0, 1.0, 60, 50, 80, {}, false}, + test_params{'t', 't', 30, 20, 10, 2.0, 1.0, 60, 50, 80, {}, false}, + test_params{'n', 'n', 100, 100, 2, 1.0, 2.0, 100, 100, 100, {}, false}, + test_params{'n', 't', 100, 2, 100, 1.0, 2.0, 100, 100, 100, {}, false}, + test_params{'t', 'n', 2, 100, 100, 1.0, 2.0, 100, 100, 100, {}, false}, + test_params{'t', 't', 2, 100, 100, 1.0, 2.0, 100, 100, 100, {}, false}, + test_params{'n', 'n', 2, 2, 10000, 1.0, 2.0, 2, 10000, 2, {}, false}, + + test_params{'n', 'n', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, {}, false}, + test_params{'n', 'n', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, {}, false}, + test_params{'t', 'n', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, {}, false}, + test_params{'t', 'n', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, {}, false}, + test_params{'n', 't', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, {}, false}, + test_params{'n', 't', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, {}, false}, + test_params{'t', 't', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, {}, false}, + test_params{'t', 't', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, {}, false} ); #else +constexpr test_igemm_params fix_use_oc = {'F', true, true, false}; +constexpr test_igemm_params col_use_oc = {'C', true, true, false}; +constexpr test_igemm_params row_use_oc = {'R', true, true, false}; + +constexpr test_igemm_params fix_use_all_offsets = {'F', 
false, false, false}; +constexpr test_igemm_params col_use_all_offsets = {'C', false, false, false}; +constexpr test_igemm_params row_use_all_offsets = {'R', false, false, false}; + +constexpr test_igemm_params fix_no_offsets = {'F', true, true, true}; +constexpr test_igemm_params col_no_offsets = {'C', true, true, true}; +constexpr test_igemm_params row_no_offsets = {'R', true, true, true}; INST_TEST_CASE(TestGEMM_expected_failures, - test_params{'f', 'n', 'n', 3, 2, 1, 1.0, 0.0, 2, 5, 8, true, mkldnn_invalid_arguments}, - test_params{'f', 't', 'n', 3, 2, 2, 1.0, 0.0, 1, 5, 8, true, mkldnn_invalid_arguments}, - test_params{'f', 'n', 't', 3, 2, 1, 1.0, 0.0, 3, 1, 8, true, mkldnn_invalid_arguments}, - test_params{'f', 'n', 'd', 3, 2, 1, 1.0, 0.0, 3, 3, 3, true, mkldnn_invalid_arguments}, - - test_params{'r', 'n', 'n', 3, 2, 1, 1.0, 0.0, 2, 5, 8, true, mkldnn_invalid_arguments}, - test_params{'R', 't', 'n', 3, 2, 2, 1.0, 0.0, 1, 5, 8, true, mkldnn_invalid_arguments}, - test_params{'r', 'n', 't', 3, 2, 1, 1.0, 0.0, 3, 1, 8, true, mkldnn_invalid_arguments}, - test_params{'R', 'n', 'd', 3, 2, 1, 1.0, 0.0, 3, 3, 3, true, mkldnn_invalid_arguments}, - - test_params{'c', 'n', 'n', 3, 2, 1, 1.0, 0.0, 2, 5, 8, true, mkldnn_invalid_arguments}, - test_params{'C', 't', 'n', 3, 2, 2, 1.0, 0.0, 1, 5, 8, true, mkldnn_invalid_arguments}, - test_params{'c', 'n', 't', 3, 2, 1, 1.0, 0.0, 3, 1, 8, true, mkldnn_invalid_arguments}, - test_params{'C', 'n', 'd', 3, 2, 1, 1.0, 0.0, 3, 3, 3, true, mkldnn_invalid_arguments} + test_params{'n', 'n', 3, 2, 1, 1.0, 0.0, 2, 5, 8, {}, true, mkldnn_invalid_arguments}, + test_params{'t', 'n', 3, 2, 2, 1.0, 0.0, 1, 5, 8, {}, true, mkldnn_invalid_arguments}, + test_params{'n', 't', 3, 2, 1, 1.0, 0.0, 3, 1, 8, {}, true, mkldnn_invalid_arguments}, + test_params{'n', 'd', 3, 2, 1, 1.0, 0.0, 3, 3, 3, {}, true, mkldnn_invalid_arguments}, + + test_params{'n', 'n', 3, 2, 1, 1.0, 0.0, 2, 5, 8, {}, true, mkldnn_invalid_arguments}, + test_params{'t', 'n', 3, 2, 2, 1.0, 0.0, 1, 5, 8, {}, true, mkldnn_invalid_arguments}, + test_params{'n', 't', 3, 2, 1, 1.0, 0.0, 3, 1, 8, {}, true, mkldnn_invalid_arguments}, + test_params{'n', 'd', 3, 2, 1, 1.0, 0.0, 3, 3, 3, {}, true, mkldnn_invalid_arguments}, + + test_params{'n', 'n', 3, 2, 1, 1.0, 0.0, 2, 5, 8, {}, true, mkldnn_invalid_arguments}, + test_params{'t', 'n', 3, 2, 2, 1.0, 0.0, 1, 5, 8, {}, true, mkldnn_invalid_arguments}, + test_params{'n', 't', 3, 2, 1, 1.0, 0.0, 3, 1, 8, {}, true, mkldnn_invalid_arguments}, + test_params{'n', 'd', 3, 2, 1, 1.0, 0.0, 3, 3, 3, {}, true, mkldnn_invalid_arguments} +); + +INST_TEST_CASE(TestGEMM_general_cases_fix_offset, + test_params{'N', 'n', 30, 20, 10, 2.0, 1.0, 60, 50, 80, fix_use_oc, false}, + test_params{'n', 'T', 30, 20, 10, 2.0, 1.0, 60, 50, 80, fix_use_oc, false}, + test_params{'T', 'N', 30, 20, 10, 2.0, 1.0, 60, 50, 80, fix_use_oc, false}, + test_params{'t', 't', 30, 20, 10, 2.0, 1.0, 60, 50, 80, fix_use_oc, false}, + test_params{'n', 'n', 100, 100, 2, 1.0, 2.0, 100, 100, 100, fix_use_oc, false}, + test_params{'n', 't', 100, 2, 100, 1.0, 2.0, 100, 100, 100, fix_use_oc, false}, + test_params{'t', 'n', 2, 100, 100, 1.0, 2.0, 100, 100, 100, fix_use_oc, false}, + test_params{'t', 't', 2, 100, 100, 1.0, 2.0, 100, 100, 100, fix_use_oc, false}, + test_params{'n', 'n', 2, 2, 10000, 1.0, 2.0, 2, 10000, 2, fix_use_oc, false}, + + test_params{'N', 'n', 30, 20, 10, 2.0, 1.0, 60, 50, 80, fix_use_all_offsets, false}, + test_params{'n', 'T', 30, 20, 10, 2.0, 1.0, 60, 50, 80, fix_use_all_offsets,false}, + 
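/* Decoding test_igemm_params from the three constant families above (field
 * order inferred from the names): { offsetc, zero_oa, zero_ob, zero_oc }.
 *
 *     constexpr test_igemm_params fix_use_oc          = {'F', true,  true,  false};  // C offsets only
 *     constexpr test_igemm_params fix_use_all_offsets = {'F', false, false, false};  // A, B and C offsets
 *     constexpr test_igemm_params fix_no_offsets      = {'F', true,  true,  true };  // no offsets at all
 *
 * 'F'/'C'/'R' keep the fixed / per-column / per-row offsetc meaning that the
 * old test_params carried as its leading character.
 */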
test_params{'T', 'N', 30, 20, 10, 2.0, 1.0, 60, 50, 80, fix_use_all_offsets, false}, + test_params{'t', 't', 30, 20, 10, 2.0, 1.0, 60, 50, 80, fix_use_all_offsets, false}, + test_params{'n', 'n', 100, 100, 2, 1.0, 2.0, 100, 100, 100, fix_use_all_offsets, false}, + test_params{'n', 't', 100, 2, 100, 1.0, 2.0, 100, 100, 100, fix_use_all_offsets, false}, + test_params{'t', 'n', 2, 100, 100, 1.0, 2.0, 100, 100, 100, fix_use_all_offsets, false}, + test_params{'t', 't', 2, 100, 100, 1.0, 2.0, 100, 100, 100, fix_use_all_offsets, false}, + test_params{'n', 'n', 2, 2, 10000, 1.0, 2.0, 2, 10000, 2, fix_use_all_offsets, false}, + + test_params{'N', 'n', 30, 20, 10, 2.0, 1.0, 60, 50, 80, fix_no_offsets, false}, + test_params{'n', 'T', 30, 20, 10, 2.0, 1.0, 60, 50, 80, fix_no_offsets,false}, + test_params{'T', 'N', 30, 20, 10, 2.0, 1.0, 60, 50, 80, fix_no_offsets, false}, + test_params{'t', 't', 30, 20, 10, 2.0, 1.0, 60, 50, 80, fix_no_offsets, false}, + test_params{'n', 'n', 100, 100, 2, 1.0, 2.0, 100, 100, 100, fix_no_offsets, false}, + test_params{'n', 't', 100, 2, 100, 1.0, 2.0, 100, 100, 100, fix_no_offsets, false}, + test_params{'t', 'n', 2, 100, 100, 1.0, 2.0, 100, 100, 100, fix_no_offsets, false}, + test_params{'t', 't', 2, 100, 100, 1.0, 2.0, 100, 100, 100, fix_no_offsets, false}, + test_params{'n', 'n', 2, 2, 10000, 1.0, 2.0, 2, 10000, 2, fix_no_offsets, false} +); + +INST_TEST_CASE(TestGEMM_general_cases_col_offset, + test_params{'N', 'n', 30, 20, 10, 2.0, 1.0, 60, 50, 80, col_use_oc, false}, + test_params{'n', 'T', 30, 20, 10, 2.0, 1.0, 60, 50, 80, col_use_oc, false}, + test_params{'T', 'N', 30, 20, 10, 2.0, 1.0, 60, 50, 80, col_use_oc, false}, + test_params{'t', 't', 30, 20, 10, 2.0, 1.0, 60, 50, 80, col_use_oc, false}, + test_params{'n', 'n', 100, 100, 2, 1.0, 2.0, 100, 100, 100, col_use_oc, false}, + test_params{'n', 't', 100, 2, 100, 1.0, 2.0, 100, 100, 100, col_use_oc, false}, + test_params{'t', 'n', 2, 100, 100, 1.0, 2.0, 100, 100, 100, col_use_oc, false}, + test_params{'t', 't', 2, 100, 100, 1.0, 2.0, 100, 100, 100, col_use_oc, false}, + test_params{'n', 'n', 2, 2, 10000, 1.0, 2.0, 2, 10000, 2, col_use_oc, false}, + + test_params{'N', 'n', 30, 20, 10, 2.0, 1.0, 60, 50, 80, col_use_all_offsets, false}, + test_params{'n', 'T', 30, 20, 10, 2.0, 1.0, 60, 50, 80, col_use_all_offsets,false}, + test_params{'T', 'N', 30, 20, 10, 2.0, 1.0, 60, 50, 80, col_use_all_offsets, false}, + test_params{'t', 't', 30, 20, 10, 2.0, 1.0, 60, 50, 80, col_use_all_offsets, false}, + test_params{'n', 'n', 100, 100, 2, 1.0, 2.0, 100, 100, 100, col_use_all_offsets, false}, + test_params{'n', 't', 100, 2, 100, 1.0, 2.0, 100, 100, 100, col_use_all_offsets, false}, + test_params{'t', 'n', 2, 100, 100, 1.0, 2.0, 100, 100, 100, col_use_all_offsets, false}, + test_params{'t', 't', 2, 100, 100, 1.0, 2.0, 100, 100, 100, col_use_all_offsets, false}, + test_params{'n', 'n', 2, 2, 10000, 1.0, 2.0, 2, 10000, 2, col_use_all_offsets, false}, + + test_params{'N', 'n', 30, 20, 10, 2.0, 1.0, 60, 50, 80, col_no_offsets, false}, + test_params{'n', 'T', 30, 20, 10, 2.0, 1.0, 60, 50, 80, col_no_offsets,false}, + test_params{'T', 'N', 30, 20, 10, 2.0, 1.0, 60, 50, 80, col_no_offsets, false}, + test_params{'t', 't', 30, 20, 10, 2.0, 1.0, 60, 50, 80, col_no_offsets, false}, + test_params{'n', 'n', 100, 100, 2, 1.0, 2.0, 100, 100, 100, col_no_offsets, false}, + test_params{'n', 't', 100, 2, 100, 1.0, 2.0, 100, 100, 100, col_no_offsets, false}, + test_params{'t', 'n', 2, 100, 100, 1.0, 2.0, 100, 100, 100, col_no_offsets, false}, + 
test_params{'t', 't', 2, 100, 100, 1.0, 2.0, 100, 100, 100, col_no_offsets, false}, + test_params{'n', 'n', 2, 2, 10000, 1.0, 2.0, 2, 10000, 2, col_no_offsets, false} ); -INST_TEST_CASE(TestGEMM_general_cases, - /* offsetc is fixed */ - test_params{'f', 'N', 'n', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false}, - test_params{'f', 'n', 'T', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false}, - test_params{'f', 'T', 'N', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false}, - test_params{'f', 't', 't', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false}, - test_params{'f', 'n', 'n', 100, 100, 2, 1.0, 2.0, 100, 100, 100, false}, - test_params{'f', 'n', 't', 100, 2, 100, 1.0, 2.0, 100, 100, 100, false}, - test_params{'f', 't', 'n', 2, 100, 100, 1.0, 2.0, 100, 100, 100, false}, - test_params{'f', 't', 't', 2, 100, 100, 1.0, 2.0, 100, 100, 100, false}, - test_params{'f', 'n', 'n', 2, 2, 10000, 1.0, 2.0, 2, 10000, 2, false}, - - /* offsetc is row */ - test_params{'r', 'n', 'T', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false}, - test_params{'R', 'n', 'T', 30, 20, 10, 2.0, 1.0, 120, 120, 120, false}, - test_params{'r', 'T', 'N', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false}, - test_params{'R', 't', 't', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false}, - test_params{'r', 'n', 'n', 100, 100, 2, 1.0, 2.0, 100, 100, 100, false}, - test_params{'r', 'n', 't', 100, 2, 100, 1.0, 2.0, 100, 100, 100, false}, - test_params{'R', 't', 'n', 2, 100, 100, 1.0, 2.0, 100, 100, 100, false}, - test_params{'R', 't', 't', 2, 100, 100, 1.0, 2.0, 100, 100, 100, false}, - test_params{'R', 'n', 'n', 2, 2, 10000, 1.0, 2.0, 2, 10000, 2, false}, - - /* offsetc is column */ - test_params{'C', 'n', 'T', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false}, - test_params{'c', 'n', 'T', 30, 20, 10, 2.0, 1.0, 120, 120, 120, false}, - test_params{'c', 'T', 'N', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false}, - test_params{'c', 't', 't', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false}, - test_params{'C', 'n', 'n', 100, 100, 2, 1.0, 2.0, 100, 100, 100, false}, - test_params{'C', 'n', 't', 100, 2, 100, 1.0, 2.0, 100, 100, 100, false}, - test_params{'C', 't', 'n', 2, 100, 100, 1.0, 2.0, 100, 100, 100, false}, - test_params{'c', 't', 't', 2, 100, 100, 1.0, 2.0, 100, 100, 100, false}, - test_params{'c', 'n', 'n', 2, 2, 10000, 1.0, 2.0, 2, 10000, 2, false} +INST_TEST_CASE(TestGEMM_general_cases_row_offset, + test_params{'N', 'n', 30, 20, 10, 2.0, 1.0, 60, 50, 80, row_use_oc, false}, + test_params{'n', 'T', 30, 20, 10, 2.0, 1.0, 60, 50, 80, row_use_oc, false}, + test_params{'T', 'N', 30, 20, 10, 2.0, 1.0, 60, 50, 80, row_use_oc, false}, + test_params{'t', 't', 30, 20, 10, 2.0, 1.0, 60, 50, 80, row_use_oc, false}, + test_params{'n', 'n', 100, 100, 2, 1.0, 2.0, 100, 100, 100, row_use_oc, false}, + test_params{'n', 't', 100, 2, 100, 1.0, 2.0, 100, 100, 100, row_use_oc, false}, + test_params{'t', 'n', 2, 100, 100, 1.0, 2.0, 100, 100, 100, row_use_oc, false}, + test_params{'t', 't', 2, 100, 100, 1.0, 2.0, 100, 100, 100, row_use_oc, false}, + test_params{'n', 'n', 2, 2, 10000, 1.0, 2.0, 2, 10000, 2, row_use_oc, false}, + + test_params{'N', 'n', 30, 20, 10, 2.0, 1.0, 60, 50, 80, row_use_all_offsets, false}, + test_params{'n', 'T', 30, 20, 10, 2.0, 1.0, 60, 50, 80, row_use_all_offsets,false}, + test_params{'T', 'N', 30, 20, 10, 2.0, 1.0, 60, 50, 80, row_use_all_offsets, false}, + test_params{'t', 't', 30, 20, 10, 2.0, 1.0, 60, 50, 80, row_use_all_offsets, false}, + test_params{'n', 'n', 100, 100, 2, 1.0, 2.0, 100, 100, 100, row_use_all_offsets, false}, + test_params{'n', 't', 100, 2, 100, 1.0, 2.0, 100, 100, 100, 
row_use_all_offsets, false}, + test_params{'t', 'n', 2, 100, 100, 1.0, 2.0, 100, 100, 100, row_use_all_offsets, false}, + test_params{'t', 't', 2, 100, 100, 1.0, 2.0, 100, 100, 100, row_use_all_offsets, false}, + test_params{'n', 'n', 2, 2, 10000, 1.0, 2.0, 2, 10000, 2, row_use_all_offsets, false}, + + test_params{'N', 'n', 30, 20, 10, 2.0, 1.0, 60, 50, 80, row_no_offsets, false}, + test_params{'n', 'T', 30, 20, 10, 2.0, 1.0, 60, 50, 80, row_no_offsets,false}, + test_params{'T', 'N', 30, 20, 10, 2.0, 1.0, 60, 50, 80, row_no_offsets, false}, + test_params{'t', 't', 30, 20, 10, 2.0, 1.0, 60, 50, 80, row_no_offsets, false}, + test_params{'n', 'n', 100, 100, 2, 1.0, 2.0, 100, 100, 100, row_no_offsets, false}, + test_params{'n', 't', 100, 2, 100, 1.0, 2.0, 100, 100, 100, row_no_offsets, false}, + test_params{'t', 'n', 2, 100, 100, 1.0, 2.0, 100, 100, 100, row_no_offsets, false}, + test_params{'t', 't', 2, 100, 100, 1.0, 2.0, 100, 100, 100, row_no_offsets, false}, + test_params{'n', 'n', 2, 2, 10000, 1.0, 2.0, 2, 10000, 2, row_no_offsets, false} ); -INST_TEST_CASE(TestGEMM_fractional_scales, +INST_TEST_CASE(TestGEMM_fractional_scales_fix_offset, /* alpha and beta have non-zero fractional part */ - test_params{'f', 'n', 'T', 30, 20, 10, 2.33f, 1.66f, 60, 50, 80, false}, - test_params{'F', 'n', 'T', 30, 20, 10, 2.19f, 1.99f, 120, 120, 120, false}, - test_params{'f', 'T', 'N', 30, 20, 10, 2.01f, 1.01f, 60, 50, 80, false}, - test_params{'F', 't', 't', 30, 20, 10, 2.99f, 1.19f, 60, 50, 80, false}, - test_params{'f', 'n', 'n', 100, 100, 2, 1.33f, 2.33f, 100, 100, 100, false}, - test_params{'f', 'n', 't', 100, 2, 100, 1.19f, 2.99f, 100, 100, 100, false}, - test_params{'F', 't', 'n', 2, 100, 100, 1.01f, 2.01f, 100, 100, 100, false}, - test_params{'F', 't', 't', 2, 100, 100, 1.99f, 2.19f, 100, 100, 100, false}, - test_params{'f', 'n', 'n', 2, 2, 10000, 1.66f, 2.33f, 2, 10000, 2, false}, - - test_params{'r', 'n', 'T', 30, 20, 10, 2.33f, 1.66f, 60, 50, 80, false}, - test_params{'R', 'n', 'T', 30, 20, 10, 2.19f, 1.99f, 120, 120, 120, false}, - test_params{'r', 'T', 'N', 30, 20, 10, 2.01f, 1.01f, 60, 50, 80, false}, - test_params{'R', 't', 't', 30, 20, 10, 2.99f, 1.19f, 60, 50, 80, false}, - test_params{'r', 'n', 'n', 100, 100, 2, 1.33f, 2.33f, 100, 100, 100, false}, - test_params{'r', 'n', 't', 100, 2, 100, 1.19f, 2.99f, 100, 100, 100, false}, - test_params{'R', 't', 'n', 2, 100, 100, 1.01f, 2.01f, 100, 100, 100, false}, - test_params{'R', 't', 't', 2, 100, 100, 1.99f, 2.19f, 100, 100, 100, false}, - test_params{'r', 'n', 'n', 2, 2, 10000, 1.66f, 2.33f, 2, 10000, 2, false}, - - test_params{'C', 'n', 'T', 30, 20, 10, 2.33f, 1.66f, 60, 50, 80, false}, - test_params{'c', 'n', 'T', 30, 20, 10, 2.19f, 1.99f, 120, 120, 120, false}, - test_params{'c', 'T', 'N', 30, 20, 10, 2.01f, 1.01f, 60, 50, 80, false}, - test_params{'c', 't', 't', 30, 20, 10, 2.99f, 1.19f, 60, 50, 80, false}, - test_params{'C', 'n', 'n', 100, 100, 2, 1.33f, 2.33f, 100, 100, 100, false}, - test_params{'C', 'n', 't', 100, 2, 100, 1.19f, 2.99f, 100, 100, 100, false}, - test_params{'C', 't', 'n', 2, 100, 100, 1.01f, 2.01f, 100, 100, 100, false}, - test_params{'c', 't', 't', 2, 100, 100, 1.99f, 2.19f, 100, 100, 100, false}, - test_params{'c', 'n', 'n', 2, 2, 10000, 1.66f, 2.33f, 2, 10000, 2, false} + test_params{'n', 'T', 30, 20, 10, 2.33f, 1.66f, 60, 50, 80, fix_use_oc, false}, + test_params{'n', 'T', 30, 20, 10, 2.19f, 1.99f, 120, 120, 120, fix_use_oc, false}, + test_params{'T', 'N', 30, 20, 10, 2.01f, 1.01f, 60, 50, 80, fix_use_oc, false}, + 
test_params{'t', 't', 30, 20, 10, 2.99f, 1.19f, 60, 50, 80, fix_use_oc, false}, + test_params{'n', 'n', 100, 100, 2, 1.33f, 2.33f, 100, 100, 100, fix_use_oc, false}, + test_params{'n', 't', 100, 2, 100, 1.19f, 2.99f, 100, 100, 100, fix_use_oc, false}, + test_params{'t', 'n', 2, 100, 100, 1.01f, 2.01f, 100, 100, 100, fix_use_oc, false}, + test_params{'t', 't', 2, 100, 100, 1.99f, 2.19f, 100, 100, 100, fix_use_oc, false}, + test_params{'n', 'n', 2, 2, 10000, 1.66f, 2.33f, 2, 10000, 2, fix_use_oc, false}, + + test_params{'n', 'T', 30, 20, 10, 2.33f, 1.66f, 60, 50, 80, fix_use_all_offsets, false}, + test_params{'n', 'T', 30, 20, 10, 2.19f, 1.99f, 120, 120, 120, fix_use_all_offsets, false}, + test_params{'T', 'N', 30, 20, 10, 2.01f, 1.01f, 60, 50, 80, fix_use_all_offsets, false}, + test_params{'t', 't', 30, 20, 10, 2.99f, 1.19f, 60, 50, 80, fix_use_all_offsets, false}, + test_params{'n', 'n', 100, 100, 2, 1.33f, 2.33f, 100, 100, 100, fix_use_all_offsets, false}, + test_params{'n', 't', 100, 2, 100, 1.19f, 2.99f, 100, 100, 100, fix_use_all_offsets, false}, + test_params{'t', 'n', 2, 100, 100, 1.01f, 2.01f, 100, 100, 100, fix_use_all_offsets, false}, + test_params{'t', 't', 2, 100, 100, 1.99f, 2.19f, 100, 100, 100, fix_use_all_offsets, false}, + test_params{'n', 'n', 2, 2, 10000, 1.66f, 2.33f, 2, 10000, 2, fix_use_all_offsets, false}, + + test_params{'n', 'T', 30, 20, 10, 2.33f, 1.66f, 60, 50, 80, fix_no_offsets, false}, + test_params{'n', 'T', 30, 20, 10, 2.19f, 1.99f, 120, 120, 120, fix_no_offsets, false}, + test_params{'T', 'N', 30, 20, 10, 2.01f, 1.01f, 60, 50, 80, fix_no_offsets, false}, + test_params{'t', 't', 30, 20, 10, 2.99f, 1.19f, 60, 50, 80, fix_no_offsets, false}, + test_params{'n', 'n', 100, 100, 2, 1.33f, 2.33f, 100, 100, 100, fix_no_offsets, false}, + test_params{'n', 't', 100, 2, 100, 1.19f, 2.99f, 100, 100, 100, fix_no_offsets, false}, + test_params{'t', 'n', 2, 100, 100, 1.01f, 2.01f, 100, 100, 100, fix_no_offsets, false}, + test_params{'t', 't', 2, 100, 100, 1.99f, 2.19f, 100, 100, 100, fix_no_offsets, false}, + test_params{'n', 'n', 2, 2, 10000, 1.66f, 2.33f, 2, 10000, 2, fix_no_offsets, false} ); +INST_TEST_CASE(TestGEMM_fractional_scales_col_offset, + /* alpha and beta have non-zero fractional part */ + test_params{'n', 'T', 30, 20, 10, 2.33f, 1.66f, 60, 50, 80, col_use_oc, false}, + test_params{'n', 'T', 30, 20, 10, 2.19f, 1.99f, 120, 120, 120, col_use_oc, false}, + test_params{'T', 'N', 30, 20, 10, 2.01f, 1.01f, 60, 50, 80, col_use_oc, false}, + test_params{'t', 't', 30, 20, 10, 2.99f, 1.19f, 60, 50, 80, col_use_oc, false}, + test_params{'n', 'n', 100, 100, 2, 1.33f, 2.33f, 100, 100, 100, col_use_oc, false}, + test_params{'n', 't', 100, 2, 100, 1.19f, 2.99f, 100, 100, 100, col_use_oc, false}, + test_params{'t', 'n', 2, 100, 100, 1.01f, 2.01f, 100, 100, 100, col_use_oc, false}, + test_params{'t', 't', 2, 100, 100, 1.99f, 2.19f, 100, 100, 100, col_use_oc, false}, + test_params{'n', 'n', 2, 2, 10000, 1.66f, 2.33f, 2, 10000, 2, col_use_oc, false}, + + test_params{'n', 'T', 30, 20, 10, 2.33f, 1.66f, 60, 50, 80, col_use_all_offsets, false}, + test_params{'n', 'T', 30, 20, 10, 2.19f, 1.99f, 120, 120, 120, col_use_all_offsets, false}, + test_params{'T', 'N', 30, 20, 10, 2.01f, 1.01f, 60, 50, 80, col_use_all_offsets, false}, + test_params{'t', 't', 30, 20, 10, 2.99f, 1.19f, 60, 50, 80, col_use_all_offsets, false}, + test_params{'n', 'n', 100, 100, 2, 1.33f, 2.33f, 100, 100, 100, col_use_all_offsets, false}, + test_params{'n', 't', 100, 2, 100, 1.19f, 2.99f, 100, 100, 100, 
col_use_all_offsets, false}, + test_params{'t', 'n', 2, 100, 100, 1.01f, 2.01f, 100, 100, 100, col_use_all_offsets, false}, + test_params{'t', 't', 2, 100, 100, 1.99f, 2.19f, 100, 100, 100, col_use_all_offsets, false}, + test_params{'n', 'n', 2, 2, 10000, 1.66f, 2.33f, 2, 10000, 2, col_use_all_offsets, false}, + + test_params{'n', 'T', 30, 20, 10, 2.33f, 1.66f, 60, 50, 80, col_no_offsets, false}, + test_params{'n', 'T', 30, 20, 10, 2.19f, 1.99f, 120, 120, 120, col_no_offsets, false}, + test_params{'T', 'N', 30, 20, 10, 2.01f, 1.01f, 60, 50, 80, col_no_offsets, false}, + test_params{'t', 't', 30, 20, 10, 2.99f, 1.19f, 60, 50, 80, col_no_offsets, false}, + test_params{'n', 'n', 100, 100, 2, 1.33f, 2.33f, 100, 100, 100, col_no_offsets, false}, + test_params{'n', 't', 100, 2, 100, 1.19f, 2.99f, 100, 100, 100, col_no_offsets, false}, + test_params{'t', 'n', 2, 100, 100, 1.01f, 2.01f, 100, 100, 100, col_no_offsets, false}, + test_params{'t', 't', 2, 100, 100, 1.99f, 2.19f, 100, 100, 100, col_no_offsets, false}, + test_params{'n', 'n', 2, 2, 10000, 1.66f, 2.33f, 2, 10000, 2, col_no_offsets, false} +); + +INST_TEST_CASE(TestGEMM_fractional_scales_row_offset, + /* alpha and beta have non-zero fractional part */ + test_params{'n', 'T', 30, 20, 10, 2.33f, 1.66f, 60, 50, 80, row_use_oc, false}, + test_params{'n', 'T', 30, 20, 10, 2.19f, 1.99f, 120, 120, 120, row_use_oc, false}, + test_params{'T', 'N', 30, 20, 10, 2.01f, 1.01f, 60, 50, 80, row_use_oc, false}, + test_params{'t', 't', 30, 20, 10, 2.99f, 1.19f, 60, 50, 80, row_use_oc, false}, + test_params{'n', 'n', 100, 100, 2, 1.33f, 2.33f, 100, 100, 100, row_use_oc, false}, + test_params{'n', 't', 100, 2, 100, 1.19f, 2.99f, 100, 100, 100, row_use_oc, false}, + test_params{'t', 'n', 2, 100, 100, 1.01f, 2.01f, 100, 100, 100, row_use_oc, false}, + test_params{'t', 't', 2, 100, 100, 1.99f, 2.19f, 100, 100, 100, row_use_oc, false}, + test_params{'n', 'n', 2, 2, 10000, 1.66f, 2.33f, 2, 10000, 2, row_use_oc, false}, + + test_params{'n', 'T', 30, 20, 10, 2.33f, 1.66f, 60, 50, 80, row_use_all_offsets, false}, + test_params{'n', 'T', 30, 20, 10, 2.19f, 1.99f, 120, 120, 120, row_use_all_offsets, false}, + test_params{'T', 'N', 30, 20, 10, 2.01f, 1.01f, 60, 50, 80, row_use_all_offsets, false}, + test_params{'t', 't', 30, 20, 10, 2.99f, 1.19f, 60, 50, 80, row_use_all_offsets, false}, + test_params{'n', 'n', 100, 100, 2, 1.33f, 2.33f, 100, 100, 100, row_use_all_offsets, false}, + test_params{'n', 't', 100, 2, 100, 1.19f, 2.99f, 100, 100, 100, row_use_all_offsets, false}, + test_params{'t', 'n', 2, 100, 100, 1.01f, 2.01f, 100, 100, 100, row_use_all_offsets, false}, + test_params{'t', 't', 2, 100, 100, 1.99f, 2.19f, 100, 100, 100, row_use_all_offsets, false}, + test_params{'n', 'n', 2, 2, 10000, 1.66f, 2.33f, 2, 10000, 2, row_use_all_offsets, false}, + + test_params{'n', 'T', 30, 20, 10, 2.33f, 1.66f, 60, 50, 80, row_no_offsets, false}, + test_params{'n', 'T', 30, 20, 10, 2.19f, 1.99f, 120, 120, 120, row_no_offsets, false}, + test_params{'T', 'N', 30, 20, 10, 2.01f, 1.01f, 60, 50, 80, row_no_offsets, false}, + test_params{'t', 't', 30, 20, 10, 2.99f, 1.19f, 60, 50, 80, row_no_offsets, false}, + test_params{'n', 'n', 100, 100, 2, 1.33f, 2.33f, 100, 100, 100, row_no_offsets, false}, + test_params{'n', 't', 100, 2, 100, 1.19f, 2.99f, 100, 100, 100, row_no_offsets, false}, + test_params{'t', 'n', 2, 100, 100, 1.01f, 2.01f, 100, 100, 100, row_no_offsets, false}, + test_params{'t', 't', 2, 100, 100, 1.99f, 2.19f, 100, 100, 100, row_no_offsets, false}, + test_params{'n', 
'n', 2, 2, 10000, 1.66f, 2.33f, 2, 10000, 2, row_no_offsets, false} +); + + +INST_TEST_CASE(TestGEMV, + test_params{'n', 'n', 2000, 1, 1000, 1.0f, 0.0f, 2000, 1000, 2000, fix_no_offsets, false}, + test_params{'n', 'n', 1, 3000, 2000, 1.0f, 0.0f, 1, 2000, 1, fix_no_offsets, false}, + test_params{'t', 'n', 2000, 1, 1000, 1.0f, 0.0f, 2000, 1000, 2000, fix_no_offsets, false}, + test_params{'t', 'n', 1, 3000, 2000, 1.0f, 0.0f, 2000, 2000, 1, fix_no_offsets, false}, + test_params{'n', 't', 2000, 1, 1000, 1.0f, 0.0f, 2000, 1, 2000, fix_no_offsets, false}, + test_params{'n', 't', 1, 3000, 2000, 1.0f, 0.0f, 1, 3000, 1, fix_no_offsets, false}, + test_params{'t', 't', 2000, 1, 1000, 1.0f, 0.0f, 1000, 1, 2000, fix_no_offsets, false}, + test_params{'t', 't', 1, 3000, 2000, 1.0f, 0.0f, 2000, 3000, 1, fix_no_offsets, false}, + + test_params{'n', 'n', 2000, 1, 1000, 1.0f, 1.0f, 2000, 1000, 2000, fix_no_offsets, false}, + test_params{'n', 'n', 1, 3000, 2000, 1.0f, 1.0f, 1, 2000, 1, fix_no_offsets, false}, + test_params{'t', 'n', 2000, 1, 1000, 1.0f, 1.0f, 2000, 1000, 2000, fix_no_offsets, false}, + test_params{'t', 'n', 1, 3000, 2000, 1.0f, 1.0f, 2000, 2000, 1, fix_no_offsets, false}, + test_params{'n', 't', 2000, 1, 1000, 1.0f, 1.0f, 2000, 1, 2000, fix_no_offsets, false}, + test_params{'n', 't', 1, 3000, 2000, 1.0f, 1.0f, 1, 3000, 1, fix_no_offsets, false}, + test_params{'t', 't', 2000, 1, 1000, 1.0f, 1.0f, 1000, 1, 2000, fix_no_offsets, false}, + test_params{'t', 't', 1, 3000, 2000, 1.0f, 1.0f, 2000, 3000, 1, fix_no_offsets, false} +); + +INST_TEST_CASE(TestGEMV_kblocking, + test_params{'t', 'n', 20, 1, 7000, 1.0f, 0.0f, 7000, 7000, 7000, fix_no_offsets, false}, + test_params{'t', 't', 50, 1, 7000, 1.0f, 0.0f, 7000, 7000, 7000, fix_no_offsets, false}, + test_params{'t', 'n', 400, 1, 7000, 1.0f, 0.0f, 7000, 7000, 7000, fix_no_offsets, false}, + test_params{'t', 't', 500, 1, 7000, 1.0f, 0.0f, 7000, 1, 7000, fix_no_offsets, false}, + test_params{'t', 'n', 20, 1, 7000, 1.0f, 1.0f, 7000, 7000, 7000, fix_no_offsets, false}, + test_params{'t', 't', 50, 1, 7000, 1.0f, 1.0f, 7000, 1, 7000, fix_no_offsets, false}, + test_params{'t', 'n', 500, 1, 7000, 1.0f, 1.0f, 7000, 7000, 7000, fix_no_offsets, false}, + test_params{'t', 't', 500, 1, 7000, 1.0f, 1.0f, 7000, 7000, 7000, fix_no_offsets, false}, + + test_params{'n', 'n', 1, 40, 7000, 1.0f, 0.0f, 1, 7000, 7000, fix_no_offsets, false}, + test_params{'t', 'n', 1, 10, 7000, 1.0f, 0.0f, 7000, 7000, 1, fix_no_offsets, false}, + test_params{'n', 'n', 1, 400, 7000, 1.0f, 0.0f, 7000, 7000, 7000, fix_no_offsets, false}, + test_params{'t', 'n', 1, 100, 7000, 1.0f, 0.0f, 7000, 7000, 7000, fix_no_offsets, false}, + test_params{'n', 'n', 1, 40, 7000, 1.0f, 1.0f, 7000, 7000, 7000, fix_no_offsets, false}, + test_params{'t', 'n', 1, 10, 7000, 1.0f, 1.0f, 7000, 7000, 7000, fix_no_offsets, false}, + test_params{'n', 'n', 1, 400, 7000, 1.0f, 1.0f, 1, 7000, 7000, fix_no_offsets, false}, + test_params{'t', 'n', 1, 550, 7000, 1.0f, 1.0f, 7000, 7000, 1, fix_no_offsets, false} +); + + INST_TEST_CASE(TestGEMM_heavy, - test_params{'f', 'n', 'n', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false}, - test_params{'f', 'n', 'n', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, false}, - test_params{'f', 't', 'n', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false}, - test_params{'f', 't', 'n', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, false}, - test_params{'f', 'n', 't', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false}, - test_params{'f', 'n', 't', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 
3000, false}, - test_params{'f', 't', 't', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false}, - test_params{'f', 't', 't', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, false}, - - test_params{'f', 'n', 'n', 2000, 2000, 2000, 2.33f, 1.66f, 2000, 2000, 2000, false}, - test_params{'f', 'n', 'n', 3000, 3000, 3000, 2.19f, 1.99f, 3000, 3000, 3000, false}, - test_params{'f', 't', 'n', 2000, 2000, 2000, 2.01f, 1.01f, 2000, 2000, 2000, false}, - test_params{'f', 't', 'n', 3000, 3000, 3000, 2.99f, 1.19f, 3000, 3000, 3000, false}, - test_params{'f', 'n', 't', 2000, 2000, 2000, 1.33f, 2.33f, 2000, 2000, 2000, false}, - test_params{'f', 'n', 't', 3000, 3000, 3000, 1.19f, 2.99f, 3000, 3000, 3000, false}, - test_params{'f', 't', 't', 2000, 2000, 2000, 1.01f, 2.01f, 2000, 2000, 2000, false}, - test_params{'f', 't', 't', 3000, 3000, 3000, 1.99f, 2.19f, 3000, 3000, 3000, false} + test_params{'n', 'n', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, fix_use_oc, false}, + test_params{'t', 'n', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, fix_use_oc, false}, + test_params{'n', 't', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, fix_use_oc, false}, + test_params{'t', 't', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, fix_use_oc, false}, + + test_params{'n', 'n', 3000, 3000, 3000, 2.19f, 1.99f, 3000, 3000, 3000, fix_use_oc, false}, + test_params{'t', 'n', 3000, 3000, 3000, 2.99f, 1.19f, 3000, 3000, 3000, fix_use_oc, false}, + test_params{'n', 't', 3000, 3000, 3000, 1.19f, 2.99f, 3000, 3000, 3000, fix_use_oc, false}, + test_params{'t', 't', 3000, 3000, 3000, 1.99f, 2.19f, 3000, 3000, 3000, fix_use_oc, false} ); + #endif diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/mkldnn_test_common.hpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/mkldnn_test_common.hpp index 317c086..61efe71 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/mkldnn_test_common.hpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/mkldnn_test_common.hpp @@ -112,9 +112,9 @@ inline size_t map_index(const mkldnn::memory::desc &md, size_t index, || (md.data.format == bwd_weights_qvnni); const int ndims = md.data.ndims; - const int *dims = md.data.dims; - const int *pdims = md.data.layout_desc.blocking.padding_dims; - const int *optd = md.data.layout_desc.blocking.offset_padding_to_data; + const ptrdiff_t *dims = md.data.dims; + const ptrdiff_t *pdims = md.data.layout_desc.blocking.padding_dims; + const ptrdiff_t *optd = md.data.layout_desc.blocking.offset_padding_to_data; auto *strides_block = md.data.layout_desc.blocking.strides[0]; auto *strides_within_block = md.data.layout_desc.blocking.strides[1]; @@ -179,8 +179,8 @@ void check_zero_tail(int set_zero_flag, mkldnn::memory &src) { const mkldnn::memory::desc src_d = src.get_primitive_desc().desc(); const int ndims = src_d.data.ndims; - const int *dims = src_d.data.dims; - const int *pdims = src_d.data.layout_desc.blocking.padding_dims; + const ptrdiff_t *dims = src_d.data.dims; + const ptrdiff_t *pdims = src_d.data.layout_desc.blocking.padding_dims; size_t idx[MAX_NDIMS] = {}, str[MAX_NDIMS] = {}; size_t nelems = 1; @@ -237,6 +237,7 @@ inline mkldnn::memory::desc create_md(mkldnn::memory::dims dims, case f::nChw16c: case f::oihw: case f::hwio: + case f::iohw: case f::oIhw8i: case f::oIhw16i: case f::OIhw8i8o: @@ -250,6 +251,10 @@ inline mkldnn::memory::desc create_md(mkldnn::memory::dims dims, case f::Ohwi8o: case f::Ohwi16o: case f::OhIw8o4i: + case f::OIhw4i16o4i_s8s8: + case f::OhIw8o4i_s8s8: + case f::OhIw8o32i: + case f::OhIw16o32i: ndims = 4; break; 
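/* map_index() and check_zero_tail() above now read dims, padding_dims and
 * offset_padding_to_data as ptrdiff_t rather than int, evidently tracking
 * the library's move to wider dimension types. The practical effect
 * (sketch):
 *
 *     int bad = 70000 * 70000;                  // ~4.9e9 overflows 32-bit int
 *     ptrdiff_t ok = (ptrdiff_t)70000 * 70000;  // fine on LP64
 *
 * so products of padded dimension sizes no longer wrap for large shapes.
 */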
case f::ncdhw: case f::ndhwc: @@ -259,6 +264,7 @@ inline mkldnn::memory::desc create_md(mkldnn::memory::dims dims, case f::oidhw: case f::goihw: case f::hwigo: + case f::giohw: case f::oIdhw8i: case f::oIdhw16i: case f::OIdhw8i8o: @@ -268,6 +274,7 @@ inline mkldnn::memory::desc create_md(mkldnn::memory::dims dims, case f::gOhwi8o: case f::Goihw8g: case f::Goihw16g: + case f::gOhwi16o: case f::gOIhw8i8o: case f::gOIhw16i16o: case f::gOIhw8i16o2i: @@ -277,6 +284,7 @@ inline mkldnn::memory::desc create_md(mkldnn::memory::dims dims, case f::gOIhw16o16i: case f::gIOhw16o16i: case f::gOhIw8o4i: + case f::Goihw16g_s8s8: ndims = 5; break; case f::gOIdhw8i8o: case f::gOIdhw16i16o: @@ -340,14 +348,19 @@ static void fill_data(const size_t size, data_t *data, double sparsity = 1., }); } +int div_up(const int a, const int b) { + return (a + b - 1) / b; +} + template static void compare_data(mkldnn::memory& ref, mkldnn::memory& dst, - data_t threshold = (data_t)1e-4) + data_t threshold = (data_t)1e-4, bool isBinary = false) { using data_type = mkldnn::memory::data_type; ASSERT_TRUE(data_traits::data_type == data_type::f32 || - data_traits::data_type == data_type::s32); + data_traits::data_type == data_type::s32 || + data_traits::data_type == data_type::u8); /* Note: size_t incompatible with MSVC++ */ auto ref_desc = ref.get_primitive_desc().desc(); @@ -365,21 +378,27 @@ static void compare_data(mkldnn::memory& ref, mkldnn::memory& dst, ptrdiff_t num = 1; for (auto d = 0; d < ndims; ++d) { - num *= dims[d]; + if (isBinary && d == 1) { + num *= div_up(dims[d], 8); + } else { + num *= dims[d]; + } } data_t *ref_data = (data_t *)ref.get_data_handle(); data_t *dst_data = (data_t *)dst.get_data_handle(); mkldnn::impl::parallel_nd(num, [&](ptrdiff_t i) { - data_t ref = ref_data[map_index(ref_desc, i)]; - data_t got = dst_data[map_index(dst_desc, i)]; + int divider = isBinary ? 8 : 1; + + data_t ref = ref_data[map_index(ref_desc, i) / divider]; + data_t got = dst_data[map_index(dst_desc, i) / divider]; if (data_traits::data_type == data_type::f32) { data_t diff = got - ref; data_t e = (std::abs(ref) > threshold) ? 
diff / ref : diff; - EXPECT_NEAR(e, (data_t)0.0, threshold) - << "Index: " << i << " Total: " << num; + EXPECT_NEAR(e, (data_t) 0.0, threshold) + << "Index: " << i << " Total: " << num; } else { EXPECT_EQ(ref, got) << "Index: " << i << " Total: " << num; } @@ -505,7 +524,6 @@ struct test_convolution_formats_t { struct test_convolution_params_t { const mkldnn::engine::kind engine_kind; mkldnn::algorithm aalgorithm; - const float relu_negative_slope; test_convolution_formats_t formats; test_convolution_attr_t attr; test_convolution_sizes_t sizes; @@ -516,7 +534,6 @@ struct test_convolution_params_t { struct test_convolution_params_t_3d { const mkldnn::engine::kind engine_kind; mkldnn::algorithm aalgorithm; - const float relu_negative_slope; test_convolution_formats_t formats; test_convolution_attr_t attr; test_convolution_sizes_t_3d sizes; @@ -621,6 +638,33 @@ struct roi_pool_test_params { test_roi_pool_desc_t test_pd; }; +struct test_binary_convolution_params_t { + const mkldnn::engine::kind engine_kind; + mkldnn::algorithm aalgorithm; + float pad_value; + mkldnn::algorithm eltwise_algorithm; + const float eltwise_alpha; + const float eltwise_beta; + mkldnn::algorithm depthwise_algorithm; + bool with_sum; + mkldnn::algorithm binarization_algorithm; + test_convolution_formats_t formats; + test_convolution_sizes_t sizes; +}; + +struct test_binary_convolution_dw_conv_params_t { + const mkldnn::engine::kind engine_kind; + mkldnn::algorithm aalgorithm; + mkldnn::algorithm eltwise_algorithm; + const float eltwise_alpha; + const float eltwise_beta; + mkldnn::algorithm depthwise_algorithm; + bool with_sum; + mkldnn::algorithm binarization_algorithm; + test_convolution_dw_conv_formats_t formats; + test_convolution_dw_conv_sizes_t sizes; +}; + std::ostream &operator<<(std::ostream &stream, const roi_pool_test_params &tp) { @@ -634,7 +678,7 @@ std::ostream &operator<<(std::ostream &stream, } template bool catch_expected_failures(const F &f, - bool expect_to_fail, mkldnn_status_t expected_status) + bool expect_to_fail, mkldnn_status_t expected_status, bool ignore_unimplemented = true) { try { f(); @@ -643,7 +687,7 @@ template bool catch_expected_failures(const F &f, // not match. if (!(expect_to_fail) || e.status != (expected_status)) { // Ignore unimplemented - if (e.status == mkldnn_unimplemented) + if (ignore_unimplemented && (e.status == mkldnn_unimplemented)) return true; else throw e; diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_batch_normalization.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_batch_normalization.cpp index 48d5bfc..8b82ddb 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_batch_normalization.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_batch_normalization.cpp @@ -152,7 +152,7 @@ void check_bnrm_bwd(const test_bnrm_params_t &p, { const test_bnrm_sizes_t &bp = p.sizes; const bool use_weights = flags & use_scale_shift; - const bool calculate_diff_stats = !(flags & use_global_stats); + const bool calculate_diff_stats = !(flags & use_global_stats); const data_t *src_data = (const data_t *)src.get_data_handle(); const data_t *weights_data = use_weights ?
(const data_t *)weights.get_data_handle() : nullptr; @@ -316,11 +316,11 @@ protected: Forward(use_scale_shift | use_global_stats, training); Backward(0u, backward_data); - Backward(omit_stats, backward_data); + Backward(use_global_stats, backward_data); Backward(use_scale_shift, backward); Backward(use_scale_shift, backward_data); - Backward(use_scale_shift | omit_stats, backward); - Backward(use_scale_shift | omit_stats, backward_data); + Backward(use_scale_shift | use_global_stats, backward); + Backward(use_scale_shift | use_global_stats, backward_data); } diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binarization.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binarization.cpp new file mode 100644 index 0000000..e720faf --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binarization.cpp @@ -0,0 +1,160 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include +#include "gtest/gtest.h" +#include "mkldnn_test_common.hpp" +#include "mkldnn.hpp" + +namespace mkldnn { + +template +struct binarization_test_params { + engine::kind engine_kind; + algorithm alg_kind; + memory::format data_format; + memory::dims dims; +}; + +template +void check_binarization_fwd(const binarization_test_params &p, + const memory::desc &src_md, const memory &src, const memory &weights, const memory &dst) { + auto src_data = (src_data_t*)src.get_data_handle(); + auto weights_data = (src_data_t*)weights.get_data_handle(); + auto dst_data = (uint8_t*)dst.get_data_handle(); + + const memory::desc src_d = src.get_primitive_desc().desc(); + const memory::desc weights_d = weights.get_primitive_desc().desc(); + const memory::desc dst_d = dst.get_primitive_desc().desc(); + + int N = src_md.data.ndims > 0 ? src_md.data.dims[0] : 1; + int C = src_md.data.ndims > 1 ? src_md.data.dims[1] : 1; + int H = src_md.data.ndims > 2 ? src_md.data.dims[2] : 1; + int W = src_md.data.ndims > 3 ? src_md.data.dims[3] : 1; + + int nbits = 8; + int CB = div_up(C, nbits); + + int padded_ic = src_d.data.layout_desc.blocking.padding_dims[1]; + int padded_oc = dst_d.data.layout_desc.blocking.padding_dims[1]; + + for (int n = 0; n < N; ++n) { + for (int cb = 0; cb < CB; ++cb) { + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + + uint8_t bin_val = 0x00; + for (int c = cb * nbits, shift = 0; c < std::min(C, (cb + 1) * nbits); c++, shift++) { + int src_idx = n*padded_ic*H*W + c*H*W + h*W + w; + int wei_idx = c; + + src_data_t s_val = src_data[map_index(src_d, src_idx)]; + src_data_t w_val = weights_data[map_index(weights_d, wei_idx)]; + + auto bit = uint8_t((s_val > w_val) ? 
0x01 : 0x00); + bin_val |= (bit << shift); + } + + int dst_idx = n*padded_oc*H*W + cb*nbits*H*W + h*W + w; + dst_idx = map_index(dst_d, dst_idx); + + EXPECT_EQ(dst_data[dst_idx / nbits], bin_val); + } + } + } + } +} + +template +class binarization_test : public ::testing::TestWithParam> { +private: + +protected: + virtual void SetUp() { + auto p = ::testing::TestWithParam>::GetParam(); + + auto eng = engine(p.engine_kind, 0); + auto src_data_type = data_traits::data_type; + + memory::dims src_dims = memory::dims({p.dims[0], p.dims[1], p.dims[2], p.dims[3]}); + memory::dims wei_dims = memory::dims({src_dims[1]}); + memory::dims dst_dims = memory::dims({p.dims[0], p.dims[1], p.dims[2], p.dims[3]}); + + auto src_desc = create_md(src_dims, src_data_type, p.data_format); + auto weights_desc = create_md(wei_dims, src_data_type, memory::format::x); + auto dst_desc = create_md(dst_dims, memory::data_type::bin, p.data_format); + + auto src = test_memory(src_desc, eng); + auto weights = test_memory(weights_desc, eng); + auto dst = test_memory(dst_desc, eng); + + fill_data(src.get_size() / sizeof(src_data_t), (src_data_t *)src.get().get_data_handle(), + src_data_t(0), src_data_t(1)); + fill_data(weights.get_size() / sizeof(src_data_t), (src_data_t *)weights.get().get_data_handle(), + src_data_t(0), src_data_t(1)); + fill_data(dst.get_size() / sizeof(uint8_t), (uint8_t*)dst.get().get_data_handle()); + + std::vector pipeline; + auto binarization_desc = binarization_forward::desc(prop_kind::forward_training, p.alg_kind, src_desc, weights_desc, dst_desc); + auto binarization_prim_desc = binarization_forward::primitive_desc(binarization_desc, eng); + auto binarization = binarization_forward(binarization_prim_desc, src.get(), weights.get(), dst.get()); + + pipeline.push_back(binarization); + auto s = stream(stream::kind::lazy); + s.submit(pipeline).wait(); + + check_binarization_fwd(p, src_desc, src.get(), weights.get(), dst.get()); + } +}; + +using binarization_test_float = binarization_test; +using binarization_test_params_float = binarization_test_params; + +TEST_P(binarization_test_float, TestsBinarization) +{ +} + +#define EXPAND(args) args + +#define EXPAND_FORMATS(data) memory::format::data + +#define ENGINE engine::kind::cpu + +#define PARAMS(alg, data, mb, c, h, w) \ + binarization_test_params_float { ENGINE, algorithm::alg, \ + EXPAND_FORMATS(data), {mb, c, h, w} } + +#define PARAMS_ALL_ALG(...) \ + EXPAND(PARAMS(binarization_depthwise, __VA_ARGS__)) + +#define INST_TEST_CASE(str, ...) 
INSTANTIATE_TEST_CASE_P( \ + str, binarization_test_float, ::testing::Values(__VA_ARGS__)) + +INST_TEST_CASE(Simple_NHWC, + PARAMS_ALL_ALG(nhwc, 2, 8, 4, 4), + PARAMS_ALL_ALG(nhwc, 2, 16, 4, 4), + PARAMS_ALL_ALG(nhwc, 2, 16, 8, 8), + PARAMS_ALL_ALG(nhwc, 2, 16, 16, 8), + PARAMS_ALL_ALG(nhwc, 2, 16, 10, 8), + PARAMS_ALL_ALG(nhwc, 10, 10, 10, 10), + PARAMS_ALL_ALG(nhwc, 256, 64, 8, 16), + PARAMS_ALL_ALG(nhwc, 1, 1, 1, 1), + PARAMS_ALL_ALG(nhwc, 3, 5, 7, 11), + PARAMS_ALL_ALG(nhwc, 2, 129, 7, 4), + PARAMS_ALL_ALG(nhwc, 2, 333, 8, 3) +); + +} diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_binarization_forward.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_binarization_forward.cpp new file mode 100644 index 0000000..acdd555 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_binarization_forward.cpp @@ -0,0 +1,74 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "mkldnn_test_common.hpp" +#include "gtest/gtest.h" + +#include "mkldnn.hpp" +#include "test_binary_convolution_forward_common.hpp" + +namespace mkldnn { + +using binary_convolution_test = binary_convolution_forward_test; + +TEST_P(binary_convolution_test, TestBinaryConvolutionBinarization) +{ +} + +#define BIN +#define WITH_BINARIZATION +#define DIRECTION_FORWARD +#include "convolution_common.h" + +#define PARAMS_WITH_BINARIZATION(...) 
\ + EXPAND_ARGS(PARAMS(binarization_depthwise, __VA_ARGS__)) + +INST_TEST_CASE(SimpleSmall_Blocked_Padded_Channels, + PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 3, 10, 10, 3, 10, 10, 3, 3, 1, 1, 1, 1), + PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 3, 10, 10, 32, 10, 10, 3, 3, 1, 1, 1, 1), + PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 3, 10, 10, 41, 10, 10, 3, 3, 1, 1, 1, 1), + PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 47, 10, 10, 137, 10, 10, 3, 3, 1, 1, 1, 1), + PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 256, 10, 10, 256, 10, 10, 3, 3, 1, 1, 1, 1) +); + +INST_TEST_CASE(SimpleSmall_Blocked_1x1_Padded_Channels, + PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 3, 10, 10, 3, 10, 10, 1, 1, 0, 0, 1, 1), + PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 13, 3, 3, 32, 3, 3, 1, 1, 0, 0, 1, 1), + PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 13, 3, 3, 41, 3, 3, 1, 1, 0, 0, 1, 1), + PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 47, 3, 3, 137, 3, 3, 1, 1, 0, 0, 1, 1), + PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 256, 3, 3, 256, 3, 3, 1, 1, 0, 0, 1, 1) +); + +//INST_TEST_CASE(SimpleSmall_Depthwise_Blocked_Padded_Channels, +// PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED, +// 2, 32, 32, 10, 10, 32, 10, 10, 3, 3, 1, 1, 1, 1), +// PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED, +// 2, 43, 43, 10, 10, 43, 10, 10, 3, 3, 1, 1, 1, 1), +// PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED, +// 2, 256, 256, 10, 10, 256, 10, 10, 3, 3, 1, 1, 1, 1) +//); + +} diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_depthwise_forward.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_depthwise_forward.cpp new file mode 100644 index 0000000..3293371 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_depthwise_forward.cpp @@ -0,0 +1,75 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "mkldnn_test_common.hpp" +#include "gtest/gtest.h" + +#include "mkldnn.hpp" +#include "test_binary_convolution_forward_common.hpp" + +namespace mkldnn { + +using binary_convolution_test = binary_convolution_forward_test; + +TEST_P(binary_convolution_test, TestBinaryConvolutionDepthwise) +{ +} + +#define BIN +#define WITH_DEPTHWISE +#define DIRECTION_FORWARD +#include "convolution_common.h" + +#define PARAMS_WITH_DEPTHWISE(...) \ + EXPAND_ARGS(PARAMS(depthwise_scale_shift, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(depthwise_prelu, __VA_ARGS__)) + +INST_TEST_CASE(SimpleSmall_Blocked_Padded_Channels, + PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 3, 10, 10, 3, 10, 10, 3, 3, 1, 1, 1, 1), + PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 3, 10, 10, 32, 10, 10, 3, 3, 1, 1, 1, 1), + PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 3, 10, 10, 41, 10, 10, 3, 3, 1, 1, 1, 1), + PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 47, 10, 10, 137, 10, 10, 3, 3, 1, 1, 1, 1), + PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 256, 10, 10, 256, 10, 10, 3, 3, 1, 1, 1, 1) +); + +INST_TEST_CASE(SimpleSmall_Blocked_1x1_Padded_Channels, + PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 3, 10, 10, 3, 10, 10, 1, 1, 0, 0, 1, 1), + PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 13, 3, 3, 32, 3, 3, 1, 1, 0, 0, 1, 1), + PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 13, 3, 3, 41, 3, 3, 1, 1, 0, 0, 1, 1), + PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 47, 3, 3, 137, 3, 3, 1, 1, 0, 0, 1, 1), + PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 256, 3, 3, 256, 3, 3, 1, 1, 0, 0, 1, 1) +); + +//INST_TEST_CASE(SimpleSmall_Depthwise_Blocked_Padded_Channels, +// PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED, +// 2, 32, 32, 10, 10, 32, 10, 10, 3, 3, 1, 1, 1, 1), +// PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED, +// 2, 43, 43, 10, 10, 43, 10, 10, 3, 3, 1, 1, 1, 1), +// PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED, +// 2, 256, 256, 10, 10, 256, 10, 10, 3, 3, 1, 1, 1, 1) +//); + +} diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_binarization_forward.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_binarization_forward.cpp new file mode 100644 index 0000000..8d0019a --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_binarization_forward.cpp @@ -0,0 +1,56 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "mkldnn_test_common.hpp" +#include "gtest/gtest.h" + +#include "mkldnn.hpp" +#include "test_binary_convolution_dw_conv_forward_common.hpp" + +namespace mkldnn { + +using binary_convolution_test = binary_convolution_forward_test; + +TEST_P(binary_convolution_test, TestBinaryConvolutionDwConvBinarization) +{ +} + +#define BIN +#define WITH_DW_CONV +#define WITH_BINARIZATION +#define DIRECTION_FORWARD +#include "convolution_common.h" + +#define PARAMS_WITH_BINARIZATION(...) \ + EXPAND_ARGS(PARAMS(binarization_depthwise, __VA_ARGS__)) + +INST_TEST_CASE(SimpleSmall_Blocked, + PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 1, 19, 5, 5, 77, 1, 1, 0, 0, 1, 1, 77, 3, 3, 1, 1, 1, 1) +); + +INST_TEST_CASE(Mobilenet_Blocked, + PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 8, 19, 33, 56, 3, 3, 1, 1, 2, 2, 56, 3, 3, 1, 1, 1, 1), // 1_1 + PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 56, 9, 16, 112, 1, 1, 0, 0, 1, 1, 112, 3, 3, 1, 1, 1, 1), // 2_2 + PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 112, 9, 16, 112, 1, 1, 0, 0, 1, 1, 112, 3, 3, 1, 1, 2, 2), // 3_1 + PARAMS_WITH_BINARIZATION(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 288, 2, 4, 240, 1, 1, 0, 0, 1, 1, 240, 3, 3, 1, 1, 1, 1) // 5_3 +); + +} diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_depthwise_forward.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_depthwise_forward.cpp new file mode 100644 index 0000000..23c7ab1 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_depthwise_forward.cpp @@ -0,0 +1,46 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "mkldnn_test_common.hpp" +#include "gtest/gtest.h" + +#include "mkldnn.hpp" +#include "test_binary_convolution_dw_conv_forward_common.hpp" + +namespace mkldnn { + +using binary_convolution_test = binary_convolution_forward_test; + +TEST_P(binary_convolution_test, TestBinaryConvolutionDwConvDepthwise) +{ +} + +#define BIN +#define WITH_DW_CONV +#define WITH_DEPTHWISE +#define DIRECTION_FORWARD +#include "convolution_common.h" + +#define PARAMS_WITH_DEPTHWISE(...) \ + EXPAND_ARGS(PARAMS(depthwise_scale_shift, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(depthwise_prelu, __VA_ARGS__)) + +INST_TEST_CASE(SimpleSmall_Blocked, + PARAMS_WITH_DEPTHWISE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 1, 7, 10, 10, 37, 1, 1, 0, 0, 1, 1, 37, 3, 3, 1, 1, 1, 1) +); + +} diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_eltwise_forward.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_eltwise_forward.cpp new file mode 100644 index 0000000..acbdb23 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_eltwise_forward.cpp @@ -0,0 +1,55 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "mkldnn_test_common.hpp" +#include "gtest/gtest.h" + +#include "mkldnn.hpp" +#include "test_binary_convolution_dw_conv_forward_common.hpp" + +namespace mkldnn { + +using binary_convolution_test = binary_convolution_forward_test; + +TEST_P(binary_convolution_test, TestBinaryConvolutionDwConvEltwise) +{ +} + +#define BIN +#define WITH_DW_CONV +#define WITH_ELTWISE +#define DIRECTION_FORWARD +#include "convolution_common.h" + +#define PARAMS_WITH_ELTIWSE(...) 
\ + EXPAND_ARGS(PARAMS(eltwise_relu, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_elu, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_tanh, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_square, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_abs, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_sqrt, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_linear, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_bounded_relu, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_soft_relu, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_logistic, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_clamp, __VA_ARGS__)) + +INST_TEST_CASE(Mobilenet_Blocked, + PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 1, 7, 10, 10, 37, 1, 1, 0, 0, 1, 1, 37, 3, 3, 1, 1, 2, 2) +); + +} diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_forward.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_forward.cpp new file mode 100644 index 0000000..c813834 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_forward.cpp @@ -0,0 +1,61 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "mkldnn_test_common.hpp" +#include "gtest/gtest.h" + +#include "mkldnn.hpp" +#include "test_binary_convolution_dw_conv_forward_common.hpp" + +namespace mkldnn { + +using binary_convolution_test = binary_convolution_forward_test; + +TEST_P(binary_convolution_test, TestBinaryConvolutionDwConv) +{ +} + +#define BIN +#define WITH_DW_CONV +#define DIRECTION_FORWARD +#include "convolution_common.h" + +INST_TEST_CASE(Mobilenet_Blocked, + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 8, 19, 33, 56, 3, 3, 1, 1, 2, 2, 56, 3, 3, 1, 1, 1, 1), // 1_1 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 32, 19, 33, 56, 1, 1, 0, 0, 1, 1, 56, 3, 3, 1, 1, 2, 2), // 2_1 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 56, 9, 16, 112, 1, 1, 0, 0, 1, 1, 112, 3, 3, 1, 1, 1, 1), // 2_2 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 112, 9, 16, 112, 1, 1, 0, 0, 1, 1, 112, 3, 3, 1, 1, 2, 2), // 3_1 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 112, 4, 8, 208, 1, 1, 0, 0, 1, 1, 208, 3, 3, 1, 1, 1, 1), // 3_2 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 208, 4, 8, 216, 1, 1, 0, 0, 1, 1, 216, 3, 3, 1, 1, 2, 2), // 4_1 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 216, 2, 4, 328, 1, 1, 0, 0, 1, 1, 328, 3, 3, 1, 1, 1, 1), // 4_2 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 328, 2, 4, 288, 1, 1, 0, 0, 1, 1, 288, 3, 3, 1, 1, 1, 1), // 5_1 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 288, 2, 4, 288, 1, 1, 0, 0, 1, 1, 288, 3, 3, 1, 1, 1, 1), // 5_2 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 288, 2, 4, 240, 1, 1, 0, 0, 1, 1, 240, 3, 3, 1, 1, 1, 1), // 5_3 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 240, 2, 4, 264, 1, 1, 0, 0, 1, 1, 264, 3, 3, 1, 1, 1, 1) // 5_4 +); + +} diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_forward_common.hpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_forward_common.hpp new file mode 100644 index 0000000..b84f715 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_forward_common.hpp @@ -0,0 +1,528 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef TEST_BINARY_CONVOLUTION_DW_CONV_FORWARD_COMMON_HPP +#define TEST_BINARY_CONVOLUTION_DW_CONV_FORWARD_COMMON_HPP + +#include "mkldnn_test_common.hpp" +#include "gtest/gtest.h" +#include "math_utils.hpp" +#include "mkldnn.hpp" + +using namespace mkldnn::impl::math; + +namespace mkldnn { + +void compute_ref_bin_conv_fwd(const test_binary_convolution_dw_conv_params_t &p, + const memory::desc &src_d, + const memory::desc &weights_d, + const memory::desc &dst_d, + const memory &src, + const memory &weights, + const memory &dst, + const memory &depthwise_weights, + const memory &depthwise_bias) +{ + auto src_dims = src_d.data.dims; + auto dst_dims = dst_d.data.dims; + auto sizes = p.sizes; + test_convolution_sizes_t c = {(int)src_dims[0], 1, sizes.ic, (int)src_dims[2], (int)src_dims[3], + (int)dst_dims[1], (int)dst_dims[2], (int)dst_dims[3], + sizes.conv1_kh, sizes.conv1_kw, sizes.conv1_padh, sizes.conv1_padw, sizes.conv1_strh, sizes.conv1_strw}; + + float pad_value = -1.f; + + uint8_t* src_data = (uint8_t*)src.get_data_handle(); + uint8_t* weights_data = (uint8_t*)weights.get_data_handle(); + float* dst_data = (float*)dst.get_data_handle(); + + float *d_weights_data = (float *)depthwise_weights.get_data_handle(); + float *d_bias_data = (float *)depthwise_bias.get_data_handle(); + + int nbits = 8; + + size_t padded_ic = src_d.data.layout_desc.blocking.padding_dims[1]; + size_t padded_ic_w = weights_d.data.layout_desc.blocking.padding_dims[1]; + size_t padded_oc_w = weights_d.data.layout_desc.blocking.padding_dims[0]; + + auto extract_bit = [](uint8_t val, uint8_t bit) -> uint8_t { + return (uint8_t) ((val >> bit) & 0x0001); + }; + + mkldnn::impl::parallel_nd(c.mb, c.ng, c.oc / c.ng, c.oh, c.ow, + [&](int n, int g, int oc, int oh, int ow) { + int32_t a = 0; + int roi = 0; + for (int ic = 0; ic < c.ic; ic++) { + for (int kh = 0; kh < c.kh; kh++) { + for (int kw = 0; kw < c.kw; kw++) { + int ih = oh * c.strh - c.padh + kh * (1 + c.dilh); + int iw = ow * c.strw - c.padw + kw * (1 + c.dilw); + + size_t iidx = n * padded_ic * c.ih * c.iw + + g * padded_ic / c.ng * c.ih * c.iw + + ic * c.ih * c.iw + ih * c.iw + iw; + iidx = map_index(src_d, iidx); + + uint8_t s; + if (ih < 0 || ih >= c.ih || iw < 0 || iw >= c.iw) { + if (pad_value == 0.0f) { + continue; + } else { + s = pad_value == 1.0f ? 
(uint8_t)1 : (uint8_t)0; + } + } else { + s = extract_bit(src_data[iidx/nbits], (uint8_t)(iidx % nbits)); + } + + size_t widx = g * padded_oc_w / c.ng * padded_ic_w + / c.ng * c.kh * c.kw + + oc * padded_ic_w / c.ng * c.kh * c.kw + + ic * c.kh * c.kw + kh * c.kw + kw; + widx = map_index(weights_d, widx); + + uint8_t w = extract_bit(weights_data[widx/nbits], (uint8_t)(widx % nbits)); + + a += (int32_t)(s ^ w); + + roi++; + } + } + } + + float a_fp = (float)(roi - 2*a); + + size_t oidx = n * c.oc * c.oh * c.ow + + g * c.oc / c.ng * c.oh * c.ow + + oc * c.oh * c.ow + + oh * c.ow + + ow; + + switch (p.eltwise_algorithm) { + case algorithm_undef: + break; + case eltwise_relu: + a_fp = relu_fwd(a_fp, p.eltwise_alpha); + break; + case eltwise_tanh: + a_fp = tanh_fwd(a_fp); + break; + case eltwise_elu: + a_fp = elu_fwd(a_fp, p.eltwise_alpha); + break; + case eltwise_square: + a_fp = square_fwd(a_fp); + break; + case eltwise_abs: + a_fp = abs_fwd(a_fp); + break; + case eltwise_sqrt: + a_fp = sqrt_fwd(a_fp); + break; + case eltwise_linear: + a_fp = linear_fwd(a_fp, p.eltwise_alpha, p.eltwise_beta); + break; + case eltwise_bounded_relu: + a_fp = bounded_relu_fwd(a_fp, p.eltwise_alpha); + break; + case eltwise_soft_relu: + a_fp = soft_relu_fwd(a_fp); + break; + case eltwise_logistic: + a_fp = logistic_fwd(a_fp); + break; + case eltwise_clamp: + a_fp = clamp_fwd(a_fp, p.eltwise_alpha, p.eltwise_beta); + break; + default: + assert(!"unknown alg_kind"); + } + + switch (p.depthwise_algorithm) { + case algorithm_undef: + break; + case depthwise_scale_shift: + a_fp = scale_shift_fwd(a_fp, d_weights_data[g * c.oc / c.ng + oc], d_bias_data[g * c.oc / c.ng + oc]); + break; + case depthwise_prelu: + a_fp = prelu_fwd(a_fp, d_weights_data[g * c.oc / c.ng + oc]); + break; + default: assert(!"unknown alg_kind"); + } + + dst_data[map_index(dst_d, oidx)] = a_fp; + } + ); +} + +void compute_ref_dw_conv_fwd(const test_binary_convolution_dw_conv_params_t &p, + const memory &src, const memory &weights, const memory &bias, const memory &dst, + const memory &depthwise_weights, const memory &depthwise_bias) +{ + const memory::desc src_d = src.get_primitive_desc().desc(); + const memory::desc weights_d = weights.get_primitive_desc().desc(); + const memory::desc dst_d = dst.get_primitive_desc().desc(); + + auto src_dims = src_d.data.dims; + auto dst_dims = dst_d.data.dims; + + int MB = src_dims[0]; + int G = src_dims[1]; + int IC = src_dims[1]; + int IH = src_dims[2]; + int IW = src_dims[3]; + int OC = dst_dims[1]; + int OH = dst_dims[2]; + int OW = dst_dims[3]; + + int KH = p.sizes.conv2_kh; + int KW = p.sizes.conv2_kw; + int SH = p.sizes.conv2_strh; + int SW = p.sizes.conv2_strw; + int PH = p.sizes.conv2_padh; + int PW = p.sizes.conv2_padw; + int DH = 0; + int DW = 0; + + float *src_data = (float *)src.get_data_handle(); + float *weights_data = (float *)weights.get_data_handle(); + float *bias_data = (float *)bias.get_data_handle(); + float *dst_data = (float *)dst.get_data_handle(); + + float *d_weights_data = (float *)depthwise_weights.get_data_handle(); + float *d_bias_data = (float *)depthwise_bias.get_data_handle(); + + mkldnn::impl::parallel_nd(MB, G, OC / G, OH, OW, + [&](int n, int g, int oc, int oh, int ow) { + int oidx = n * OC * OH * OW + + g * OC / G * OH * OW + + oc * OH * OW + oh * OW + ow; + + float a = (float)0; + + for (int ic = 0; ic < IC / G; ic++) { + for (int kh = 0; kh < KH; kh++) { + for (int kw = 0; kw < KW; kw++) { + int iw = ow * SW + - PW + kw * (1 + DW); + int ih = oh * SH + - PH + kh * (1 + DH); 
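+ // taps that fall outside the input plane are implicit zeros (zero padding), so they are skipped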
+ if (iw < 0 || iw >= IW) continue; + if (ih < 0 || ih >= IH) continue; + int iidx = n * IC * IH * IW + + g * IC / G * IH * IW + + ic * IH * IW + ih * IW + iw; + int widx = g * OC / G * IC + / G * KH * KW + + oc * IC / G * KH * KW + + ic * KH * KW + kh * KW + kw; + + iidx = map_index(src_d, iidx); + + float s = src_data[iidx]; + float w = weights_data[map_index(weights_d, widx)]; + + a += s * w; + + } + } + } + + float a_fp = (float)a; + + a_fp += bias_data[G > 1 ? g : oc]; + + if (p.with_sum) + a_fp += dst_data[map_index(dst_d, oidx)]; + + switch (p.eltwise_algorithm) { + case algorithm_undef: + break; + case eltwise_relu: + a_fp = relu_fwd(a_fp, p.eltwise_alpha); + break; + case eltwise_tanh: + a_fp = tanh_fwd(a_fp); + break; + case eltwise_elu: + a_fp = elu_fwd(a_fp, p.eltwise_alpha); + break; + case eltwise_square: + a_fp = square_fwd(a_fp); + break; + case eltwise_abs: + a_fp = abs_fwd(a_fp); + break; + case eltwise_sqrt: + a_fp = sqrt_fwd(a_fp); + break; + case eltwise_linear: + a_fp = linear_fwd(a_fp, p.eltwise_alpha, p.eltwise_beta); + break; + case eltwise_bounded_relu: + a_fp = bounded_relu_fwd(a_fp, p.eltwise_alpha); + break; + case eltwise_soft_relu: + a_fp = soft_relu_fwd(a_fp); + break; + case eltwise_logistic: + a_fp = logistic_fwd(a_fp); + break; + case eltwise_clamp: + a_fp = clamp_fwd(a_fp, p.eltwise_alpha, p.eltwise_beta); + break; + default: + assert(!"unknown alg_kind"); + } + + switch (p.depthwise_algorithm) { + case algorithm_undef: + break; + case depthwise_scale_shift: + a_fp = scale_shift_fwd(a_fp, d_weights_data[g * OC / G + oc], d_bias_data[g * OC / G + oc]); + break; + case depthwise_prelu: + a_fp = prelu_fwd(a_fp, d_weights_data[g * OC / G + oc]); + break; + default: assert(!"unknown alg_kind"); + } + + dst_data[map_index(dst_d, oidx)] = (float)a_fp; + } + ); +} + +void compute_ref_binarization_fwd(const test_binary_convolution_dw_conv_params_t &p, + const memory::desc &src_md, const memory &src, const memory &weights, const memory &dst) { + auto src_data = (float*)src.get_data_handle(); + auto weights_data = (float*)weights.get_data_handle(); + auto dst_data = (uint8_t*)dst.get_data_handle(); + + const memory::desc src_d = src.get_primitive_desc().desc(); + const memory::desc weights_d = weights.get_primitive_desc().desc(); + const memory::desc dst_d = dst.get_primitive_desc().desc(); + + int N = src_md.data.ndims > 0 ? src_md.data.dims[0] : 1; + int C = src_md.data.ndims > 1 ? src_md.data.dims[1] : 1; + int H = src_md.data.ndims > 2 ? src_md.data.dims[2] : 1; + int W = src_md.data.ndims > 3 ? src_md.data.dims[3] : 1; + + int nbits = 8; + int CB = div_up(C, nbits); + + int padded_ic = src_d.data.layout_desc.blocking.padding_dims[1]; + int padded_oc = dst_d.data.layout_desc.blocking.padding_dims[1]; + + for (int n = 0; n < N; ++n) { + for (int cb = 0; cb < CB; ++cb) { + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + + uint8_t bin_val = 0x00; + for (int c = cb * nbits, shift = 0; c < std::min(C, (cb + 1) * nbits); c++, shift++) { + int src_idx = n*padded_ic*H*W + c*H*W + h*W + w; + int wei_idx = c; + + float s_val = src_data[map_index(src_d, src_idx)]; + float w_val = weights_data[map_index(weights_d, wei_idx)]; + + auto bit = uint8_t((s_val > w_val) ? 
0x01 : 0x00); + bin_val |= (bit << shift); + } + + int dst_idx = n*padded_oc*H*W + cb*nbits*H*W + h*W + w; + dst_idx = map_index(dst_d, dst_idx); + dst_data[dst_idx / nbits] = bin_val; + } + } + } + } +} + +class binary_convolution_forward_test : public ::testing::TestWithParam +{ +protected: + virtual void SetUp() + { + test_binary_convolution_dw_conv_params_t p = ::testing::TestWithParam::GetParam(); + + ASSERT_TRUE(p.engine_kind == engine::kind::cpu); + ASSERT_EQ(p.aalgorithm, algorithm::binary_convolution_direct); + + test_convolution_dw_conv_sizes_t cd = p.sizes; + + auto eng = engine(p.engine_kind, 0); + auto aprop_kind = prop_kind::forward; + bool with_binarization = p.binarization_algorithm != algorithm_undef; +// int nbits = 8; + + memory::data_type data_type_bin_conv_src = memory::data_type::bin; + memory::data_type data_type_bin_conv_wei = memory::data_type::bin; + memory::data_type data_type_bin_conv_bia = data_traits::data_type; + memory::data_type data_type_bin_conv_dst = data_traits::data_type; + + memory::data_type data_type_dw_conv_wei = data_traits::data_type; + memory::data_type data_type_dw_conv_bia = data_traits::data_type; + memory::data_type data_type_dw_conv_dst = with_binarization ? memory::data_type::bin + : data_traits::data_type; + + int bin_conv_oh = (cd.ih - ((cd.conv1_kh - 1) + 1) + 2 * cd.conv1_padh) / cd.conv1_strh + 1; + int bin_conv_ow = (cd.iw - ((cd.conv1_kw - 1) + 1) + 2 * cd.conv1_padw) / cd.conv1_strw + 1; + + int dw_conv_oh = (bin_conv_oh - ((cd.conv2_kh - 1) + 1) + 2 * cd.conv2_padh) / cd.conv2_strh + 1; + int dw_conv_ow = (bin_conv_ow - ((cd.conv2_kw - 1) + 1) + 2 * cd.conv2_padw) / cd.conv2_strw + 1; + + std::vector bin_conv_padR = { cd.conv1_padh, cd.conv1_padw }; + bin_conv_padR[0] += dw_conv_oh - bin_conv_oh; + bin_conv_padR[1] += dw_conv_ow - bin_conv_ow; + + auto bin_conv_src_desc = create_md({ cd.mb, cd.ic, cd.ih, cd.iw }, data_type_bin_conv_src, p.formats.src_format); + auto bin_conv_weights_desc = create_md({ cd.conv1_oc, cd.ic, cd.conv1_kh, cd.conv1_kw }, data_type_bin_conv_wei, p.formats.conv1_weights_format); + auto bin_conv_dst_desc = create_md({ cd.mb, cd.conv1_oc, dw_conv_oh, dw_conv_ow }, data_type_bin_conv_dst, p.formats.dst_format); + + auto bin_conv_src = test_memory(bin_conv_src_desc, eng); + auto bin_conv_weights = test_memory(bin_conv_weights_desc, eng); + + fill_data(bin_conv_src.get_size() / sizeof(uint8_t), (uint8_t*)bin_conv_src.get().get_data_handle()); + fill_data(bin_conv_weights.get_size() / sizeof(uint8_t), (uint8_t*)bin_conv_weights.get().get_data_handle()); + + auto dw_conv_weights_desc = create_md({ cd.conv2_oc, 1, 1, cd.conv2_kh, cd.conv2_kw }, data_type_dw_conv_wei, p.formats.conv2_weights_format); + auto dw_conv_dst_desc = create_md({ cd.mb, cd.conv2_oc, dw_conv_oh, dw_conv_ow }, data_type_dw_conv_dst, p.formats.dst_format); + auto dw_conv_bias_desc = create_md({ cd.conv2_oc }, data_type_dw_conv_bia, p.formats.conv2_bias_format); + + auto dw_conv_weights = test_memory(dw_conv_weights_desc, eng); + auto dw_conv_bias = test_memory(dw_conv_bias_desc, eng); + auto dw_conv_dst = test_memory(dw_conv_dst_desc, eng); + + if (with_binarization) + fill_data(dw_conv_dst.get_size() / sizeof(uint8_t), (uint8_t*)dw_conv_dst.get().get_data_handle()); + else + fill_data(dw_conv_dst.get_size() / sizeof(float), (float*)dw_conv_dst.get().get_data_handle()); + + fill_data(dw_conv_weights.get_size() / sizeof(float), (float*)dw_conv_weights.get().get_data_handle()); + fill_data(dw_conv_bias.get_size() / sizeof(float), 
(float*)dw_conv_bias.get().get_data_handle()); + + auto bin_conv_desc = binary_convolution_forward::desc(aprop_kind, p.aalgorithm, + bin_conv_src_desc, bin_conv_weights_desc, bin_conv_dst_desc, + { cd.conv1_strh, cd.conv1_strw }, { 0, 0 }, + { cd.conv1_padh, cd.conv1_padw }, bin_conv_padR, -1.f); + + mkldnn::post_ops bin_conv_post_ops; + if (p.eltwise_algorithm != algorithm_undef) + bin_conv_post_ops.append_eltwise(1.0, p.eltwise_algorithm, p.eltwise_alpha, p.eltwise_beta); + + auto bin_conv_depthwise_weights_desc = create_md({ cd.conv1_oc }, data_type_bin_conv_bia, memory::x); + auto bin_conv_depthwise_bias_desc = create_md({ cd.conv1_oc }, data_type_bin_conv_bia, memory::x); + auto bin_conv_depthwise_weights = memory({bin_conv_depthwise_weights_desc, eng}); + auto bin_conv_depthwise_bias = memory({bin_conv_depthwise_bias_desc, eng}); + + if (p.depthwise_algorithm != algorithm_undef) { + fill_data(bin_conv_depthwise_weights.get_primitive_desc().get_size() / sizeof(float), + (float *)bin_conv_depthwise_weights.get_data_handle(), 1., true); + fill_data(bin_conv_depthwise_bias.get_primitive_desc().get_size() / sizeof(float), + (float *)bin_conv_depthwise_bias.get_data_handle(), 1., true); + + bin_conv_post_ops.append_depthwise(p.depthwise_algorithm, static_cast(bin_conv_depthwise_weights.get_data_handle()), + static_cast(bin_conv_depthwise_bias.get_data_handle())); + } + + bin_conv_post_ops.append_dw_conv(bin_conv_oh, bin_conv_ow, cd.conv2_kh, cd.conv2_kw, cd.conv2_strh, cd.conv2_strw, + static_cast(dw_conv_weights.get().get_data_handle()), + static_cast(dw_conv_bias.get().get_data_handle())); + + if (p.with_sum) + bin_conv_post_ops.append_sum(); + + if (p.eltwise_algorithm != algorithm_undef) + bin_conv_post_ops.append_eltwise(1.0, p.eltwise_algorithm, p.eltwise_alpha, p.eltwise_beta); + + auto dw_conv_depthwise_weights_desc = create_md({ cd.conv2_oc }, data_type_bin_conv_bia, memory::x); + auto dw_conv_depthwise_bias_desc = create_md({ cd.conv2_oc }, data_type_bin_conv_bia, memory::x); + auto dw_conv_depthwise_weights = memory({dw_conv_depthwise_weights_desc, eng}); + auto dw_conv_depthwise_bias = memory({dw_conv_depthwise_bias_desc, eng}); + + if (p.depthwise_algorithm != algorithm_undef) { + fill_data(dw_conv_depthwise_weights.get_primitive_desc().get_size() / sizeof(float), + (float *)dw_conv_depthwise_weights.get_data_handle(), 1., true); + fill_data(dw_conv_depthwise_bias.get_primitive_desc().get_size() / sizeof(float), + (float *)dw_conv_depthwise_bias.get_data_handle(), 1., true); + + bin_conv_post_ops.append_depthwise(p.depthwise_algorithm, static_cast(dw_conv_depthwise_weights.get_data_handle()), + static_cast(dw_conv_depthwise_bias.get_data_handle())); + } + + auto dw_conv_binarization_weights_desc = create_md({ cd.conv2_oc }, memory::data_type::f32, memory::x); + auto dw_conv_binarization_weights = memory({dw_conv_binarization_weights_desc, eng}); + + if (p.binarization_algorithm != algorithm_undef) { + fill_data(dw_conv_binarization_weights.get_primitive_desc().get_size() / sizeof(float), + (float *)dw_conv_binarization_weights.get_data_handle(), 0.f, p.sizes.conv2_oc * p.sizes.conv2_kh * p.sizes.conv2_kw); + + bin_conv_post_ops.append_binarization(p.binarization_algorithm, static_cast(dw_conv_binarization_weights.get_data_handle())); + } + + mkldnn::primitive_attr bin_conv_attr; + bin_conv_attr.set_post_ops(bin_conv_post_ops); + + auto bin_conv_primitive_desc = binary_convolution_forward::primitive_desc(bin_conv_desc, bin_conv_attr, eng); + + auto bin_conv = 
binary_convolution_forward(bin_conv_primitive_desc, bin_conv_src.get(), bin_conv_weights.get(), dw_conv_dst.get()); + + auto bin_conv_dst_desc_ref = create_md({ cd.mb, cd.conv1_oc, bin_conv_oh, bin_conv_ow }, data_type_bin_conv_dst, p.formats.dst_format); + auto ref_bin_conv_dst = test_memory(bin_conv_dst_desc_ref, eng); + compute_ref_bin_conv_fwd(p, bin_conv_src_desc, bin_conv_weights_desc, bin_conv_dst_desc_ref, + bin_conv_src.get(), bin_conv_weights.get(), ref_bin_conv_dst.get(), + bin_conv_depthwise_weights, bin_conv_depthwise_bias); + + if (with_binarization) { + auto ref_dw_conv_dst_desc = create_md({ cd.mb, cd.conv2_oc, dw_conv_oh, dw_conv_ow }, memory::data_type::f32, p.formats.dst_format); + auto ref_dw_conv_dst = test_memory(ref_dw_conv_dst_desc, eng); + + compute_ref_dw_conv_fwd(p, ref_bin_conv_dst.get(), dw_conv_weights.get(), dw_conv_bias.get(), + ref_dw_conv_dst.get(), + dw_conv_depthwise_weights, dw_conv_depthwise_bias); + + auto ref_binarization_dst = test_memory(dw_conv_dst_desc, eng); + + compute_ref_binarization_fwd(p, ref_dw_conv_dst_desc, ref_dw_conv_dst.get(), dw_conv_binarization_weights, ref_binarization_dst.get()); + + std::vector pipeline; + pipeline.push_back(bin_conv); + auto s = stream(stream::kind::lazy); + s.submit(pipeline).wait(); + + compare_data(ref_binarization_dst.get(), dw_conv_dst.get(), 0, true); + } else { + auto ref_dw_conv_dst = test_memory(dw_conv_dst_desc, eng); + memcpy((float *) ref_dw_conv_dst.get().get_data_handle(), (float *) dw_conv_dst.get().get_data_handle(), + ref_dw_conv_dst.get_size()); + compute_ref_dw_conv_fwd(p, ref_bin_conv_dst.get(), dw_conv_weights.get(), dw_conv_bias.get(), + ref_dw_conv_dst.get(), + dw_conv_depthwise_weights, dw_conv_depthwise_bias); + + std::vector pipeline; + pipeline.push_back(bin_conv); + auto s = stream(stream::kind::lazy); + s.submit(pipeline).wait(); + + compare_data(ref_dw_conv_dst.get(), dw_conv_dst.get(), 1e-3); + } + } +}; + +} + +#endif diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_sum_forward.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_sum_forward.cpp new file mode 100644 index 0000000..7e0bcae --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_dw_conv_sum_forward.cpp @@ -0,0 +1,67 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "mkldnn_test_common.hpp" +#include "gtest/gtest.h" + +#include "mkldnn.hpp" +#include "test_binary_convolution_dw_conv_forward_common.hpp" + +namespace mkldnn { + +using binary_convolution_test = binary_convolution_forward_test; + +TEST_P(binary_convolution_test, TestBinaryConvolutionDwConvSum) +{ +} + +#define BIN +#define WITH_DW_CONV +#define WITH_SUM +#define DIRECTION_FORWARD +#include "convolution_common.h" + +INST_TEST_CASE(SimpleSmall_Blocked, + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 1, 7, 10, 10, 37, 1, 1, 0, 0, 1, 1, 37, 3, 3, 1, 1, 1, 1) +); + +INST_TEST_CASE(Mobilenet_Blocked, + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 8, 19, 33, 56, 3, 3, 1, 1, 2, 2, 56, 3, 3, 1, 1, 1, 1), // 1_1 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 32, 19, 33, 56, 1, 1, 0, 0, 1, 1, 56, 3, 3, 1, 1, 2, 2), // 2_1 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 56, 9, 16, 112, 1, 1, 0, 0, 1, 1, 112, 3, 3, 1, 1, 1, 1), // 2_2 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 112, 9, 16, 112, 1, 1, 0, 0, 1, 1, 112, 3, 3, 1, 1, 2, 2), // 3_1 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 112, 4, 8, 208, 1, 1, 0, 0, 1, 1, 208, 3, 3, 1, 1, 1, 1), // 3_2 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 208, 4, 8, 216, 1, 1, 0, 0, 1, 1, 216, 3, 3, 1, 1, 2, 2), // 4_1 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 216, 2, 4, 328, 1, 1, 0, 0, 1, 1, 328, 3, 3, 1, 1, 1, 1), // 4_2 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 328, 2, 4, 288, 1, 1, 0, 0, 1, 1, 288, 3, 3, 1, 1, 1, 1), // 5_1 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 288, 2, 4, 288, 1, 1, 0, 0, 1, 1, 288, 3, 3, 1, 1, 1, 1), // 5_2 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 288, 2, 4, 240, 1, 1, 0, 0, 1, 1, 240, 3, 3, 1, 1, 1, 1), // 5_3 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 240, 2, 4, 264, 1, 1, 0, 0, 1, 1, 264, 3, 3, 1, 1, 1, 1) // 5_4 +); + +} diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_eltwise_forward.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_eltwise_forward.cpp new file mode 100644 index 0000000..74dcc03 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_eltwise_forward.cpp @@ -0,0 +1,80 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "mkldnn_test_common.hpp" +#include "gtest/gtest.h" + +#include "mkldnn.hpp" +#include "test_binary_convolution_forward_common.hpp" + +namespace mkldnn { + +using binary_convolution_test = binary_convolution_forward_test; + +TEST_P(binary_convolution_test, TestBinaryConvolutionEltwise) +{ +} + +#define BIN +#define WITH_ELTWISE +#define DIRECTION_FORWARD +#include "convolution_common.h" + +#define PARAMS_WITH_ELTIWSE(...) \ + EXPAND_ARGS(PARAMS(eltwise_relu, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_elu, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_tanh, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_square, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_abs, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_sqrt, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_linear, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_bounded_relu, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_soft_relu, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_logistic, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS(eltwise_clamp, __VA_ARGS__)) + +INST_TEST_CASE(SimpleSmall_Blocked_Padded_Channels, + PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 3, 10, 10, 3, 10, 10, 3, 3, 1, 1, 1, 1), + PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 3, 10, 10, 32, 10, 10, 3, 3, 1, 1, 1, 1), + PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 5, 10, 10, 137, 10, 10, 3, 3, 1, 1, 1, 1), + PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 4, 10, 10, 256, 10, 10, 3, 3, 1, 1, 1, 1) +); + +INST_TEST_CASE(SimpleSmall_Blocked_1x1_Padded_Channels, + PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 3, 10, 10, 3, 10, 10, 1, 1, 0, 0, 1, 1), + PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 13, 3, 3, 41, 3, 3, 1, 1, 0, 0, 1, 1), + PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 15, 3, 3, 137, 3, 3, 1, 1, 0, 0, 1, 1), + PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 14, 3, 3, 256, 3, 3, 1, 1, 0, 0, 1, 1) +); + +//INST_TEST_CASE(SimpleSmall_Depthwise_Blocked_Padded_Channels, +// PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED, +// 2, 32, 32, 10, 10, 32, 10, 10, 3, 3, 1, 1, 1, 1), +// PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED, +// 2, 43, 43, 10, 10, 43, 10, 10, 3, 3, 1, 1, 1, 1), +// PARAMS_WITH_ELTIWSE(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED, +// 2, 256, 256, 10, 10, 256, 10, 10, 3, 3, 1, 1, 1, 1) +//); + +} diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_forward.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_forward.cpp new file mode 100644 index 0000000..0dcc326 --- /dev/null +++ 
b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_forward.cpp @@ -0,0 +1,92 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "mkldnn_test_common.hpp" +#include "gtest/gtest.h" + +#include "mkldnn.hpp" +#include "test_binary_convolution_forward_common.hpp" + +namespace mkldnn { + +using binary_convolution_test = binary_convolution_forward_test; + +TEST_P(binary_convolution_test, TestBinaryConvolution) +{ +} + +#define BIN +#define DIRECTION_FORWARD +#include "convolution_common.h" + +INST_TEST_CASE(SimpleSmall_Blocked_Padded_Channels, + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 7, 3, 3, 5, 3, 3, 1, 1, 0, 0, 1, 1, 0), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 15, 3, 3, 37, 4, 4, 3, 3, 1, 1, 1, 1), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 14, 4, 4, 1, 4, 4, 3, 3, 0, 0, 1, 1), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 7, 3, 3, 33, 3, 3, 3, 3, 1, 1, 1, 1), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 19, 2, 2, 22, 2, 2, 3, 3, 1, 1, 1, 1), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 126, 13, 13, 126, 13, 13, 3, 3, 1, 1, 1, 1), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 77, 13, 13, 99, 11, 11, 3, 3, 0, 0, 1, 1) +); + +INST_TEST_CASE(SimpleSmall_Blocked_1x1_Padded_Channels, + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 3, 13, 13, 35, 13, 13, 1, 1, 0, 0, 1, 1), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 7, 3, 3, 11, 3, 3, 1, 1, 0, 0, 1, 1), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 1, 4, 4, 58, 4, 4, 1, 1, 0, 0, 1, 1), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 27, 3, 3, 33, 3, 3, 1, 1, 0, 0, 1, 1), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 81, 2, 2, 81, 2, 2, 1, 1, 0, 0, 1, 1), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 126, 13, 13, 13, 13, 13, 1, 1, 0, 0, 1, 1), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 111, 13, 13, 71, 13, 13, 1, 1, 0, 0, 1, 1) +); + +//INST_TEST_CASE(SimpleSmall_Depthwise_Blocked_Padded_Channels, +// PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, +// 2, 126, 126, 10, 10, 126, 10, 10, 3, 3, 1, 1, 1, 1), +// PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, +// 2, 77, 77, 9, 9, 77, 2, 2, 5, 5, 0, 0, 3, 3), +// PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, +// 2, 68, 68, 26, 26, 68, 13, 13, 4, 4, 1, 
1, 2, 2), +// PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, +// 2, 33, 33, 111, 111, 33, 112, 112, 1, 1, 0, 0, 1, 1), +// PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, +// 1, 111, 111, 1, 2, 111, 1, 1, 3, 3, 1, 1, 1, 2), +// PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, +// 1, 29, 29, 16, 32, 29, 16, 18, 3, 3, 1, 2, 1, 2), +// PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, +// 1, 53, 53, 32, 16, 53, 16, 14, 3, 3, 1, 0, 2, 1), +// PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, +// 1, 13, 13, 32, 16, 13, 18, 16, 3, 3, 2, 1, 2, 1), +// PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, +// 1, 9, 9, 500, 500, 9, 698, 698, 3, 3, 100, 100, 1, 1), +// PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, +// 1, 2, 2, 500, 500, 2, 698, 698, 3, 3, 100, 100, 1, 1) +//); + +} diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_forward_common.hpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_forward_common.hpp new file mode 100644 index 0000000..bef6e15 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_forward_common.hpp @@ -0,0 +1,352 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
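The header that follows implements the scalar reference for binary (1-bit) convolution: activations and weights arrive bit-packed eight per byte, each 1-bit product is an XOR, and the mismatch count a is mapped back to a {-1,+1}-valued dot product as roi - 2*a, where roi is the number of taps actually applied (padded positions are skipped entirely when pad_value == 0). A minimal sketch of that identity, illustrative only and not part of the patch:

    #include <cstdint>

    // src and wei hold `len` bits packed LSB-first, eight per byte, mirroring
    // the (val >> bit) & 1 extraction in compute_ref_bin_conv_fwd below.
    // With bits {0,1} encoding values {-1,+1}, a pair of taps multiplies to +1
    // exactly when the bits match, so the dot product is len - 2 * mismatches.
    static float bin_dot(const uint8_t *src, const uint8_t *wei, int len) {
        int mismatches = 0;
        for (int i = 0; i < len; ++i) {
            uint8_t s = (src[i / 8] >> (i % 8)) & 1;
            uint8_t w = (wei[i / 8] >> (i % 8)) & 1;
            mismatches += s ^ w;   // 1 exactly when the signs differ
        }
        return (float)(len - 2 * mismatches);
    }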
+*******************************************************************************/ + +#ifndef TEST_BINARY_CONVOLUTION_FORWARD_COMMON_HPP +#define TEST_BINARY_CONVOLUTION_FORWARD_COMMON_HPP + +#include "mkldnn_test_common.hpp" +#include "gtest/gtest.h" +#include "math_utils.hpp" +#include "mkldnn.hpp" + +using namespace mkldnn::impl::math; + +namespace { + +} + +namespace mkldnn { + +void compute_ref_bin_conv_fwd(const test_binary_convolution_params_t &p, + const memory::desc &src_d, + const memory::desc &weights_d, + const memory::desc &dst_d, + const memory &src, + const memory &weights, + const memory &dst, + const memory &depthwise_weights, + const memory &depthwise_bias) +{ + auto c = p.sizes; + + uint8_t* src_data = (uint8_t*)src.get_data_handle(); + uint8_t* weights_data = (uint8_t*)weights.get_data_handle(); + float* dst_data = (float*)dst.get_data_handle(); + + float *d_weights_data = (float *)depthwise_weights.get_data_handle(); + float *d_bias_data = (float *)depthwise_bias.get_data_handle(); + + int nbits = 8; + + size_t padded_ic = src_d.data.layout_desc.blocking.padding_dims[1]; + size_t padded_ic_w = weights_d.data.layout_desc.blocking.padding_dims[1]; + size_t padded_oc_w = weights_d.data.layout_desc.blocking.padding_dims[0]; + + auto extract_bit = [](uint8_t val, uint8_t bit) -> uint8_t { + return (uint8_t) ((val >> bit) & 0x0001); + }; + + mkldnn::impl::parallel_nd(c.mb, c.ng, c.oc / c.ng, c.oh, c.ow, + [&](int n, int g, int oc, int oh, int ow) { + int32_t a = 0; + int roi = 0; + for (int ic = 0; ic < c.ic; ic++) { + for (int kh = 0; kh < c.kh; kh++) { + for (int kw = 0; kw < c.kw; kw++) { + int ih = oh * c.strh - c.padh + kh * (1 + c.dilh); + int iw = ow * c.strw - c.padw + kw * (1 + c.dilw); + + size_t iidx = n * padded_ic * c.ih * c.iw + + g * padded_ic / c.ng * c.ih * c.iw + + ic * c.ih * c.iw + ih * c.iw + iw; + iidx = map_index(src_d, iidx); + + uint8_t s; + if (ih < 0 || ih >= c.ih || iw < 0 || iw >= c.iw) { + if (p.pad_value == 0.0f) { + continue; + } else { + s = p.pad_value == 1.0f ? 
(uint8_t)1 : (uint8_t)0; + } + } else { + s = extract_bit(src_data[iidx/nbits], (uint8_t)(iidx % nbits)); + } + + size_t widx = g * padded_oc_w / c.ng * padded_ic_w + / c.ng * c.kh * c.kw + + oc * padded_ic_w / c.ng * c.kh * c.kw + + ic * c.kh * c.kw + kh * c.kw + kw; + widx = map_index(weights_d, widx); + + uint8_t w = extract_bit(weights_data[widx/nbits], (uint8_t)(widx % nbits)); + + a += (int32_t)(s ^ w); + + roi++; + } + } + } + + float a_fp = (float)(roi - 2*a); + + size_t oidx = n * c.oc * c.oh * c.ow + + g * c.oc / c.ng * c.oh * c.ow + + oc * c.oh * c.ow + + oh * c.ow + + ow; + + if (p.with_sum) + a_fp += dst_data[map_index(dst_d, oidx)]; + + switch (p.eltwise_algorithm) { + case algorithm_undef: + break; + case eltwise_relu: + a_fp = relu_fwd(a_fp, p.eltwise_alpha); + break; + case eltwise_tanh: + a_fp = tanh_fwd(a_fp); + break; + case eltwise_elu: + a_fp = elu_fwd(a_fp, p.eltwise_alpha); + break; + case eltwise_square: + a_fp = square_fwd(a_fp); + break; + case eltwise_abs: + a_fp = abs_fwd(a_fp); + break; + case eltwise_sqrt: + a_fp = sqrt_fwd(a_fp); + break; + case eltwise_linear: + a_fp = linear_fwd(a_fp, p.eltwise_alpha, p.eltwise_beta); + break; + case eltwise_bounded_relu: + a_fp = bounded_relu_fwd(a_fp, p.eltwise_alpha); + break; + case eltwise_soft_relu: + a_fp = soft_relu_fwd(a_fp); + break; + case eltwise_logistic: + a_fp = logistic_fwd(a_fp); + break; + case eltwise_clamp: + a_fp = clamp_fwd(a_fp, p.eltwise_alpha, p.eltwise_beta); + break; + default: + assert(!"unknown alg_kind"); + } + + switch (p.depthwise_algorithm) { + case algorithm_undef: + break; + case depthwise_scale_shift: + a_fp = scale_shift_fwd(a_fp, d_weights_data[g * c.oc / c.ng + oc], d_bias_data[g * c.oc / c.ng + oc]); + break; + case depthwise_prelu: + a_fp = prelu_fwd(a_fp, d_weights_data[g * c.oc / c.ng + oc]); + break; + default: assert(!"unknown alg_kind"); + } + + dst_data[map_index(dst_d, oidx)] = a_fp; + } + ); +} + +void compute_ref_binarization_fwd(const test_binary_convolution_params_t &p, + const memory::desc &src_md, const memory &src, const memory &weights, const memory &dst) { + auto src_data = (float*)src.get_data_handle(); + auto weights_data = (float*)weights.get_data_handle(); + auto dst_data = (uint8_t*)dst.get_data_handle(); + + const memory::desc src_d = src.get_primitive_desc().desc(); + const memory::desc weights_d = weights.get_primitive_desc().desc(); + const memory::desc dst_d = dst.get_primitive_desc().desc(); + + int N = src_md.data.ndims > 0 ? src_md.data.dims[0] : 1; + int C = src_md.data.ndims > 1 ? src_md.data.dims[1] : 1; + int H = src_md.data.ndims > 2 ? src_md.data.dims[2] : 1; + int W = src_md.data.ndims > 3 ? src_md.data.dims[3] : 1; + + int nbits = 8; + int CB = div_up(C, nbits); + + int padded_ic = src_d.data.layout_desc.blocking.padding_dims[1]; + int padded_oc = dst_d.data.layout_desc.blocking.padding_dims[1]; + + for (int n = 0; n < N; ++n) { + for (int cb = 0; cb < CB; ++cb) { + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + + uint8_t bin_val = 0x00; + for (int c = cb * nbits, shift = 0; c < std::min(C, (cb + 1) * nbits); c++, shift++) { + int src_idx = n*padded_ic*H*W + c*H*W + h*W + w; + int wei_idx = c; + + float s_val = src_data[map_index(src_d, src_idx)]; + float w_val = weights_data[map_index(weights_d, wei_idx)]; + + auto bit = uint8_t((s_val > w_val) ? 
0x01 : 0x00);
+                    bin_val |= (bit << shift);
+                }
+
+                int dst_idx = n*padded_oc*H*W + cb*nbits*H*W + h*W + w;
+                dst_idx = map_index(dst_d, dst_idx);
+                dst_data[dst_idx / nbits] = bin_val;
+            }
+        }
+    }
+}
+
+class binary_convolution_forward_test : public ::testing::TestWithParam<test_binary_convolution_params_t>
+{
+protected:
+    virtual void SetUp()
+    {
+        test_binary_convolution_params_t p = ::testing::TestWithParam<test_binary_convolution_params_t>::GetParam();
+
+        ASSERT_TRUE(p.engine_kind == engine::kind::cpu);
+        ASSERT_EQ(p.aalgorithm, algorithm::binary_convolution_direct);
+
+        test_convolution_sizes_t cd = p.sizes;
+
+        auto eng = engine(p.engine_kind, 0);
+        auto aprop_kind = prop_kind::forward;
+        bool with_binarization = p.binarization_algorithm != algorithm_undef;
+
+        memory::data_type data_type_src = memory::data_type::bin;
+        memory::data_type data_type_wei = memory::data_type::bin;
+        memory::data_type data_type_bia = memory::data_type::f32;
+        memory::data_type data_type_dst = with_binarization ? memory::data_type::bin
+                                                            : data_traits<float>::data_type;
+
+        auto c_src_desc = create_md({ cd.mb, cd.ic, cd.ih, cd.iw }, data_type_src, p.formats.src_format);
+        auto c_weights_desc = cd.ng > 1
+                ? create_md({ cd.ng, cd.oc / cd.ng, cd.ic / cd.ng, cd.kh, cd.kw }, data_type_wei, p.formats.weights_format)
+                : create_md({ cd.oc, cd.ic, cd.kh, cd.kw }, data_type_wei, p.formats.weights_format);
+        auto c_dst_desc = create_md({ cd.mb, cd.oc, cd.oh, cd.ow }, data_type_dst, p.formats.dst_format);
+
+        auto c_src = test_memory(c_src_desc, eng);
+        auto c_weights = test_memory(c_weights_desc, eng);
+        auto c_dst = test_memory(c_dst_desc, eng);
+
+        // Only true for dense format
+        if (with_binarization)
+            fill_data<uint8_t>(c_dst.get_size() / sizeof(uint8_t), (uint8_t*)c_dst.get().get_data_handle());
+        else
+            fill_data<float>(c_dst.get_size() / sizeof(float), (float*)c_dst.get().get_data_handle());
+        fill_data<uint8_t>(c_src.get_size() / sizeof(uint8_t), (uint8_t*)c_src.get().get_data_handle());
+        fill_data<uint8_t>(c_weights.get_size() / sizeof(uint8_t), (uint8_t*)c_weights.get().get_data_handle());
+
+        std::vector<ptrdiff_t> padR = {
+            right_padding(cd.ih, cd.oh, cd.kh, cd.padh, cd.strh, cd.dilh),
+            right_padding(cd.iw, cd.ow, cd.kw, cd.padw, cd.strw, cd.dilw)
+        };
+
+        auto bin_conv_desc = binary_convolution_forward::desc(aprop_kind, p.aalgorithm,
+                c_src_desc, c_weights_desc, c_dst_desc,
+                { cd.strh, cd.strw }, { cd.dilh, cd.dilw },
+                { cd.padh, cd.padw }, padR, p.pad_value);
+
+        mkldnn::post_ops ops;
+
+        if (p.with_sum)
+            ops.append_sum();
+
+        if (p.eltwise_algorithm != algorithm_undef)
+            ops.append_eltwise(1.0, p.eltwise_algorithm, p.eltwise_alpha, p.eltwise_beta);
+
+        auto c_depthwise_weights_desc = create_md({ cd.oc }, data_type_bia, memory::x);
+        auto c_depthwise_bias_desc = create_md({ cd.oc }, data_type_bia, memory::x);
+
+        auto c_depthwise_weights = memory({c_depthwise_weights_desc, eng});
+        auto c_depthwise_bias = memory({c_depthwise_bias_desc, eng});
+
+        if (p.depthwise_algorithm != algorithm_undef) {
+            fill_data<float>(c_depthwise_weights.get_primitive_desc().get_size() / sizeof(float),
+                    (float *)c_depthwise_weights.get_data_handle(), 1., true);
+            fill_data<float>(c_depthwise_bias.get_primitive_desc().get_size() / sizeof(float),
+                    (float *)c_depthwise_bias.get_data_handle(), 1., true);
+
+            ops.append_depthwise(p.depthwise_algorithm, static_cast<const float*>(c_depthwise_weights.get_data_handle()),
+                    static_cast<const float*>(c_depthwise_bias.get_data_handle()));
+        }
+
+        auto c_binarization_weights_desc = create_md({ cd.oc }, memory::data_type::f32, memory::x);
+        auto c_binarization_weights = memory({c_binarization_weights_desc, eng});
+
+        if
(p.binarization_algorithm != algorithm_undef) { + fill_data(c_binarization_weights.get_primitive_desc().get_size() / sizeof(float), + (float *)c_binarization_weights.get_data_handle(), 1., true); + + ops.append_binarization(p.binarization_algorithm, static_cast(c_binarization_weights.get_data_handle())); + } + + mkldnn::primitive_attr attr; + attr.set_post_ops(ops); + + auto bin_conv_primitive_desc = binary_convolution_forward::primitive_desc(bin_conv_desc, attr, eng); + + auto bin_conv = binary_convolution_forward(bin_conv_primitive_desc, c_src.get(), c_weights.get(), c_dst.get()); + + if (with_binarization) { + auto c_dst_desc_ref = create_md({ cd.mb, cd.oc, cd.oh, cd.ow }, memory::data_type::f32, p.formats.dst_format); + auto c_dst_ref = test_memory(c_dst_desc_ref, eng); + + std::vector ref_dst_conv_data(c_dst_ref.get_size() / sizeof(float)); + auto ref_conv_memory = memory(memory::primitive_desc(c_dst_desc_ref, eng), &ref_dst_conv_data[0]); + + std::vector ref_dst_data(c_dst.get_size() / sizeof(uint8_t)); + auto ref_memory = memory(memory::primitive_desc(c_dst_desc, eng), &ref_dst_data[0]); + + compute_ref_bin_conv_fwd(p, c_src_desc, c_weights_desc, c_dst_desc_ref, + c_src.get(), c_weights.get(), ref_conv_memory, + c_depthwise_weights, c_depthwise_bias); + + compute_ref_binarization_fwd(p, c_dst_desc_ref, ref_conv_memory, c_binarization_weights, ref_memory); + + std::vector pipeline; + pipeline.push_back(bin_conv); + auto s = stream(stream::kind::lazy); + s.submit(pipeline).wait(); + + compare_data(ref_memory, c_dst.get(), 0, true); + } else { + std::vector ref_dst_data(c_dst.get_size() / sizeof(float)); + memcpy(&ref_dst_data[0], (float*)c_dst.get().get_data_handle(), ref_dst_data.size() * sizeof(float)); + auto ref_memory = memory(memory::primitive_desc(c_dst_desc, eng), &ref_dst_data[0]); + + compute_ref_bin_conv_fwd(p, c_src_desc, c_weights_desc, c_dst_desc, + c_src.get(), c_weights.get(), ref_memory, + c_depthwise_weights, c_depthwise_bias); + + std::vector pipeline; + pipeline.push_back(bin_conv); + auto s = stream(stream::kind::lazy); + s.submit(pipeline).wait(); + + compare_data(ref_memory, c_dst.get(), 1e-3); + } + } +}; + +} + +#endif diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_sum_forward.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_sum_forward.cpp new file mode 100644 index 0000000..1a9a548 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_binary_convolution_sum_forward.cpp @@ -0,0 +1,71 @@ +/******************************************************************************* +* Copyright 2019 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
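The file that follows exercises the sum post-op, where the destination is both an input and an output: the reference accumulates the prior contents of dst before applying any eltwise post-op (if (p.with_sum) a_fp += dst_data[...]), which is why SetUp() fills c_dst with data before the primitive runs. A minimal sketch of composing such an attribute, illustrative only and not part of the patch:

    #include "mkldnn.hpp"

    // Post-op chain for an accumulating convolution:
    // dst = conv(src, wei) + dst_prev, optionally followed by ReLU.
    mkldnn::primitive_attr make_sum_attr(bool with_relu) {
        mkldnn::post_ops ops;
        ops.append_sum();   // read-modify-write on the dst memory
        if (with_relu)
            ops.append_eltwise(1.0, mkldnn::algorithm::eltwise_relu, 0.0f, 0.0f);
        mkldnn::primitive_attr attr;
        attr.set_post_ops(ops);   // consumed when the primitive_desc is created
        return attr;
    }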
+*******************************************************************************/ + +#include "mkldnn_test_common.hpp" +#include "gtest/gtest.h" + +#include "mkldnn.hpp" +#include "test_binary_convolution_forward_common.hpp" + +namespace mkldnn { + +using binary_convolution_test = binary_convolution_forward_test; + +TEST_P(binary_convolution_test, TestBinaryConvolutionSum) +{ +} + +#define BIN +#define WITH_SUM +#define DIRECTION_FORWARD +#include "convolution_common.h" + +INST_TEST_CASE(SimpleSmall_Blocked_Padded_Channels, + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 3, 10, 10, 3, 10, 10, 3, 3, 1, 1, 1, 1), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 3, 10, 10, 32, 10, 10, 3, 3, 1, 1, 1, 1), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 3, 10, 10, 41, 10, 10, 3, 3, 1, 1, 1, 1), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 47, 10, 10, 137, 10, 10, 3, 3, 1, 1, 1, 1), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 256, 10, 10, 256, 10, 10, 3, 3, 1, 1, 1, 1) +); + +INST_TEST_CASE(SimpleSmall_Blocked_1x1_Padded_Channels, + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 3, 10, 10, 3, 10, 10, 1, 1, 0, 0, 1, 1), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 13, 3, 3, 32, 3, 3, 1, 1, 0, 0, 1, 1), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 13, 3, 3, 41, 3, 3, 1, 1, 0, 0, 1, 1), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 47, 3, 3, 137, 3, 3, 1, 1, 0, 0, 1, 1), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 1, 256, 3, 3, 256, 3, 3, 1, 1, 0, 0, 1, 1) +); + +//INST_TEST_CASE(SimpleSmall_Depthwise_Blocked_Padded_Channels, +// PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED, +// 2, 32, 32, 10, 10, 32, 10, 10, 3, 3, 1, 1, 1, 1), +// PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED, +// 2, 43, 43, 10, 10, 43, 10, 10, 3, 3, 1, 1, 1, 1), +// PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED, +// 2, 256, 256, 10, 10, 256, 10, 10, 3, 3, 1, 1, 1, 1) +//); + +} diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_concat.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_concat.cpp index b479779..48e2f4f 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_concat.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_concat.cpp @@ -39,7 +39,7 @@ class concat_test: public ::testing::TestWithParam { const data_t *dst_data = (const data_t *)dst.get_data_handle(); const auto &dst_d = dst.get_primitive_desc().desc(); const auto dst_dims = dst_d.data.dims; - const int* dst_pdims = dst_d.data.layout_desc.blocking.padding_dims; + const ptrdiff_t* dst_pdims = dst_d.data.layout_desc.blocking.padding_dims; int acc_concat_dim = 0; const auto ndims = dst_d.data.ndims; @@ -47,8 +47,8 @@ class concat_test: public ::testing::TestWithParam { for (size_t num = 0; num < srcs.size(); num++) { const data_t *src_data = (const data_t *)srcs[num].get_data_handle(); const auto &src_d = srcs[num].get_primitive_desc().desc(); - const int* src_dims = src_d.data.dims; - const int* src_pdims = src_d.data.layout_desc.blocking.padding_dims; + const ptrdiff_t* src_dims = src_d.data.dims; + const ptrdiff_t* src_pdims = 
src_d.data.layout_desc.blocking.padding_dims;
 
         auto N = src_dims[0];
         auto C = src_dims[1];
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_backward_data_common.hpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_backward_data_common.hpp
index b523c50..1df9df7 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_backward_data_common.hpp
+++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_backward_data_common.hpp
@@ -119,7 +119,7 @@ protected:
         auto c_weights = test_memory(c_weights_desc, eng);
         auto c_diff_dst = test_memory(c_dst_desc, eng);
 
-        std::vector<int> padR = {
+        std::vector<ptrdiff_t> padR = {
             right_padding(cd.ih, cd.oh, cd.kh, cd.padh, cd.strh, cd.dilh),
             right_padding(cd.iw, cd.ow, cd.kw, cd.padw, cd.strw, cd.dilw)
         };
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_backward_weights_common.hpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_backward_weights_common.hpp
index 8331c18..00b8966 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_backward_weights_common.hpp
+++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_backward_weights_common.hpp
@@ -172,7 +172,7 @@ protected:
         check_zero_tail(1, c_src.get());
         check_zero_tail(1, c_diff_weights.get());
 
-        std::vector<int> padR = {
+        std::vector<ptrdiff_t> padR = {
             right_padding(cd.ih, cd.oh, cd.kh, cd.padh, cd.strh, cd.dilh),
             right_padding(cd.iw, cd.ow, cd.kw, cd.padw, cd.strw, cd.dilw)
        };
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_depthwise_forward_common.hpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_depthwise_forward_common.hpp
new file mode 100644
index 0000000..730be03
--- /dev/null
+++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_depthwise_forward_common.hpp
@@ -0,0 +1,237 @@
+/*******************************************************************************
+* Copyright 2018-2019 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "mkldnn_test_common.hpp"
+#include "gtest/gtest.h"
+#include "math_utils.hpp"
+#include "mkldnn.hpp"
+
+using namespace mkldnn::impl::math;
+
+namespace mkldnn {
+
+template <typename T, typename U>
+inline typename std::remove_reference<T>::type div_up(const T a, const U b) {
+    assert(b);
+    return (a + b - 1) / b;
+}
+
+template <typename T, typename U>
+inline typename std::remove_reference<T>::type rnd_up(const T a, const U b) {
+    return div_up(a, b) * b;
+}
+
+template <typename data_t_src, typename data_t_wei, typename data_t_dst>
+void compute_ref_conv_depthwise_fwd(const test_convolution_sizes_t &c,
+        const memory &src, const memory &weights, const memory &bias,
+        const memory &dst, bool w_bias, algorithm depthwise_alg,
+        const memory &depthwise_weights, const memory &depthwise_bias)
+{
+    data_t_src *src_data = (data_t_src *)src.get_data_handle();
+    data_t_wei *weights_data = (data_t_wei *)weights.get_data_handle();
+    data_t_dst *bias_data
+            = (data_t_dst *)(w_bias ?
bias.get_data_handle() : nullptr); + data_t_dst *dst_data = (data_t_dst *)dst.get_data_handle(); + + float *d_weights_data = (float *)depthwise_weights.get_data_handle(); + float *d_bias_data = (float *)depthwise_bias.get_data_handle(); + + const memory::desc src_d = src.get_primitive_desc().desc(); + const memory::desc weights_d = weights.get_primitive_desc().desc(); + const memory::desc dst_d = dst.get_primitive_desc().desc(); + + size_t padded_ic = src_d.data.layout_desc.blocking.padding_dims[1]; + size_t padded_oc = dst_d.data.layout_desc.blocking.padding_dims[1]; + + size_t padded_ic_w = weights_d.data.format == mkldnn_OhIw8o4i ? weights_d.data.layout_desc.blocking.padding_dims[1] : + src_d.data.layout_desc.blocking.padding_dims[1]; + size_t padded_oc_w = weights_d.data.format == mkldnn_OhIw8o4i ? weights_d.data.layout_desc.blocking.padding_dims[0] : + dst_d.data.layout_desc.blocking.padding_dims[1]; + + mkldnn::impl::parallel_nd(c.mb, c.ng, c.oc / c.ng, c.oh, c.ow, + [&](int n, int g, int oc, int oh, int ow) { + size_t oidx = n * padded_oc * c.oh * c.ow + + g * padded_oc / c.ng * c.oh * c.ow + + oc * c.oh * c.ow + oh * c.ow + ow; + + size_t didx = map_index(dst_d, oidx); + size_t bidx = g * c.oc / c.ng + oc; + dst_data[didx] = bias_data + ? bias_data[bidx] : data_t_dst{0}; + + for (int ic = 0; ic < c.ic / c.ng; ic++) + for (int kh = 0; kh < c.kh; kh++) + for (int kw = 0; kw < c.kw; kw++) + { + int ih = oh * c.strh - c.padh + kh * (1 + c.dilh); + if (ih < 0 || ih >= c.ih) continue; + int iw = ow * c.strw - c.padw + kw * (1 + c.dilw); + if (iw < 0 || iw >= c.iw) continue; + + size_t iidx = n * padded_ic * c.ih * c.iw + + g * padded_ic / c.ng * c.ih * c.iw + + ic * c.ih * c.iw + ih * c.iw + iw; + size_t widx = g * padded_oc_w / c.ng * padded_ic_w + / c.ng * c.kh * c.kw + + oc * padded_ic_w / c.ng * c.kh * c.kw + + ic * c.kh * c.kw + kh * c.kw + kw; + + dst_data[didx] += src_data[map_index(src_d, iidx)] + * weights_data[map_index(weights_d, widx)]; + } + + switch (depthwise_alg) { + case depthwise_scale_shift: + dst_data[didx] = scale_shift_fwd(dst_data[didx], d_weights_data[bidx], d_bias_data[bidx]); + break; + case depthwise_prelu: + dst_data[didx] = prelu_fwd(dst_data[didx], d_weights_data[bidx]); + break; + default: assert(!"unknown alg_kind"); + } + } + ); +} + +template +class convolution_depthwise_test + : public ::testing::TestWithParam { +protected: + virtual void SetUp() { + test_convolution_depthwise_params_t p + = ::testing::TestWithParam< + test_convolution_depthwise_params_t>::GetParam(); + + ASSERT_TRUE(p.engine_kind == engine::kind::cpu); + ASSERT_EQ(p.aalgorithm, convolution_direct); + auto eng = engine(p.engine_kind, 0); + + memory::data_type data_type_src = data_traits::data_type; + memory::data_type data_type_dst = data_traits::data_type; + memory::data_type data_type_wei = data_traits::data_type; + + test_convolution_sizes_t cd = p.sizes; + + auto c_src_desc = create_md({ cd.mb, cd.ic, cd.ih, cd.iw }, + data_type_src, p.formats.src_format); + auto c_weights_desc = cd.ng > 1 ? 
+ create_md({ cd.ng, cd.oc / cd.ng, cd.ic / cd.ng, cd.kh, cd.kw }, + data_type_wei, p.formats.weights_format) : + create_md({ cd.oc, cd.ic, cd.kh, cd.kw }, + data_type_wei, p.formats.weights_format); + auto c_dst_desc = create_md({ cd.mb, cd.oc, cd.oh, cd.ow }, + data_type_dst, p.formats.dst_format); + + auto c_src = memory({c_src_desc, eng}); + auto c_weights = memory({c_weights_desc, eng}); + auto c_dst = memory({c_dst_desc, eng}); + + auto dst_ref = memory({c_dst_desc, eng}); + + fill_data(c_src.get_primitive_desc().get_size() + / sizeof(data_t_src), (data_t_src *)c_src.get_data_handle(), + data_t_src(0), data_t_src(1)); + check_zero_tail(1, c_src); + + fill_data( + c_weights.get_primitive_desc().get_size() + / sizeof(data_t_wei),(data_t_wei *)c_weights.get_data_handle(), + data_t_wei(0), data_t_wei(1)); + check_zero_tail(1, c_weights); + + bool with_bias = p.formats.bias_format != memory::format::format_undef; + auto c_bias_desc = with_bias ? + create_md({ cd.oc }, data_type_dst, p.formats.bias_format) : + create_md({}, data_type_dst, p.formats.bias_format); + auto c_bias = memory({c_bias_desc, eng}); + if (with_bias) { + fill_data( + c_bias.get_primitive_desc().get_size() / sizeof(data_t_dst), + (data_t_dst *)c_bias.get_data_handle(), 1., true); + } + + std::vector padR = { cd.padh, cd.padw }; + for (int i = 0; i < 2; ++i) { + if ((cd.ih - ((cd.kh - 1) * (cd.dilh + 1) + 1) + cd.padh + padR[0]) + / cd.strh + 1 != cd.oh) + ++padR[0]; + if ((cd.iw - ((cd.kw - 1) * (cd.dilw + 1) + 1) + cd.padw + padR[1]) + / cd.strw + 1 != cd.ow) + ++padR[1]; + } + + auto c_depthwise_weights_desc = create_md({ rnd_up(cd.oc, 16) }, data_type_dst, memory::x); + auto c_depthwise_bias_desc = create_md({ rnd_up(cd.oc, 16) }, data_type_dst, memory::x); + + auto c_depthwise_weights = memory({c_depthwise_weights_desc, eng}); + auto c_depthwise_bias = memory({c_depthwise_bias_desc, eng}); + + fill_data( + c_depthwise_weights.get_primitive_desc().get_size() / sizeof(data_t_dst), + (data_t_dst *)c_depthwise_weights.get_data_handle(), 1., true); + fill_data( + c_depthwise_bias.get_primitive_desc().get_size() / sizeof(data_t_dst), + (data_t_dst *)c_depthwise_bias.get_data_handle(), 1., true); + + + auto test = [&]() { + mkldnn::post_ops ops; + ops.append_depthwise(p.alg, static_cast(c_depthwise_weights.get_data_handle()), + static_cast(c_depthwise_bias.get_data_handle())); + + mkldnn::primitive_attr attr; + attr.set_post_ops(ops); + + auto conv_desc = with_bias + ? convolution_forward::desc(prop_kind::forward_scoring, + p.aalgorithm, c_src_desc, c_weights_desc, c_bias_desc, + c_dst_desc, { cd.strh, cd.strw }, { cd.dilh, cd.dilw }, + { cd.padh, cd.padw }, padR, padding_kind::zero) + : convolution_forward::desc(prop_kind::forward_scoring, + p.aalgorithm, c_src_desc, c_weights_desc, c_dst_desc, + { cd.strh, cd.strw }, { cd.dilh, cd.dilw }, + { cd.padh, cd.padw }, padR, padding_kind::zero); + + auto conv_primitive_desc = + convolution_forward::primitive_desc(conv_desc, attr, eng); + + auto conv = with_bias + ? 
convolution_forward(conv_primitive_desc, + c_src, c_weights, c_bias, c_dst) + : convolution_forward(conv_primitive_desc, + c_src, c_weights, c_dst); + std::vector pipeline; + pipeline.push_back(conv); + + stream(stream::kind::lazy).submit(pipeline).wait(); + }; + + if (catch_expected_failures(test, p.expect_to_fail, p.expected_status)) + return; + + compute_ref_conv_depthwise_fwd(cd, c_src, c_weights, c_bias, dst_ref, with_bias, + p.alg, c_depthwise_weights, c_depthwise_bias); + check_zero_tail(1, dst_ref); + + compare_data(dst_ref, c_dst, 1e-2); + check_zero_tail(0, c_dst); + } +}; + +} diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_depthwise_forward_f32.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_depthwise_forward_f32.cpp index 3789f8f..9008310 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_depthwise_forward_f32.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_depthwise_forward_f32.cpp @@ -16,217 +16,11 @@ #include "mkldnn_test_common.hpp" #include "gtest/gtest.h" - #include "mkldnn.hpp" +#include "test_convolution_depthwise_forward_common.hpp" namespace mkldnn { -template inline T scale_shift_fwd(T s_val, T w_val, T b_val) { - return s_val*w_val + b_val; -} - -template inline T prelu_fwd(T s_val, T w_val) { - return s_val >= 0 ? s_val : w_val*s_val; -} - -template -void compute_ref_conv_depthwise_fwd(const test_convolution_sizes_t &c, - const memory &src, const memory &weights, const memory &bias, - const memory &dst, bool w_bias, algorithm depthwise_alg, - const memory &depthwise_weights, const memory &depthwise_bias) -{ - data_t_src *src_data = (data_t_src *)src.get_data_handle(); - data_t_wei *weights_data = (data_t_wei *)weights.get_data_handle(); - data_t_dst *bias_data - = (data_t_dst *)(w_bias ? bias.get_data_handle() : nullptr); - data_t_dst *dst_data = (data_t_dst *)dst.get_data_handle(); - - data_t_dst *d_weights_data = (data_t_dst *)depthwise_weights.get_data_handle(); - data_t_dst *d_bias_data = (data_t_dst *)depthwise_bias.get_data_handle(); - - const memory::desc src_d = src.get_primitive_desc().desc(); - const memory::desc weights_d = weights.get_primitive_desc().desc(); - const memory::desc dst_d = dst.get_primitive_desc().desc(); - -#pragma omp parallel for collapse(5) schedule(static) - for (int n = 0; n < c.mb; n++) { - for (int g = 0; g < c.ng; g++) { - for (int oc = 0; oc < c.oc / c.ng; oc++) { - for (int oh = 0; oh < c.oh; oh++) { - for (int ow = 0; ow < c.ow; ow++) { - int oidx = n * c.oc * c.oh * c.ow - + g * c.oc / c.ng * c.oh * c.ow - + oc * c.oh * c.ow + oh * c.ow + ow; - - int didx = map_index(dst_d, oidx); - int bidx = g * c.oc / c.ng + oc; - dst_data[didx] = bias_data ? 
- bias_data[map_index( - bias.get_primitive_desc().desc(), - bidx)] : - data_t_dst{0}; - for (int ic = 0; ic < c.ic / c.ng; ic++) { - for (int kh = 0; kh < c.kh; kh++) { - for (int kw = 0; kw < c.kw; kw++) { - int iw = ow * c.strw - - c.padw + kw * (1 + c.dilw); - int ih = oh * c.strh - - c.padh + kh * (1 + c.dilh); - if (iw < 0 || iw >= c.iw) continue; - if (ih < 0 || ih >= c.ih) continue; - int iidx = n * c.ic * c.ih * c.iw - + g * c.ic / c.ng * c.ih * c.iw - + ic * c.ih * c.iw + ih * c.iw + iw; - int widx = g * c.oc / c.ng * c.ic - / c.ng * c.kh * c.kw - + oc * c.ic / c.ng * c.kh * c.kw - + ic * c.kh * c.kw + kh * c.kw + kw; - - dst_data[didx] - += src_data[map_index(src_d, iidx)] - * weights_data[map_index( - weights_d, widx)]; - } - } - } - - switch (depthwise_alg) { - case depthwise_scale_shift: - dst_data[didx] = scale_shift_fwd(dst_data[didx], d_weights_data[bidx], d_bias_data[bidx]); - break; - case depthwise_prelu: - dst_data[didx] = prelu_fwd(dst_data[didx], d_weights_data[bidx]); - break; - default: assert(!"unknown alg_kind"); - } - } - } - } - } - } -} - -template -class convolution_depthwise_test - : public ::testing::TestWithParam { -protected: - virtual void SetUp() - { - test_convolution_depthwise_params_t p - = ::testing::TestWithParam< - test_convolution_depthwise_params_t>::GetParam(); - - ASSERT_TRUE(p.engine_kind == engine::kind::cpu); - ASSERT_EQ(p.aalgorithm, convolution_direct); - auto eng = engine(p.engine_kind, 0); - - memory::data_type data_type_src = data_traits::data_type; - memory::data_type data_type_dst = data_traits::data_type; - memory::data_type data_type_wei = data_traits::data_type; - - test_convolution_sizes_t cd = p.sizes; - - auto c_src_desc = create_md({ cd.mb, cd.ic, cd.ih, cd.iw }, - data_type_src, p.formats.src_format); - auto c_weights_desc = cd.ng > 1 ? - create_md({ cd.ng, cd.oc / cd.ng, cd.ic / cd.ng, cd.kh, cd.kw }, - data_type_wei, p.formats.weights_format) : - create_md({ cd.oc, cd.ic, cd.kh, cd.kw }, - data_type_wei, p.formats.weights_format); - auto c_dst_desc = create_md({ cd.mb, cd.oc, cd.oh, cd.ow }, - data_type_dst, p.formats.dst_format); - - auto c_src = memory({c_src_desc, eng}); - auto c_weights = memory({c_weights_desc, eng}); - auto c_dst = memory({c_dst_desc, eng}); - - auto dst_ref = memory({c_dst_desc, eng}); - - fill_data(c_src.get_primitive_desc().get_size() - / sizeof(data_t_src), (data_t_src *)c_src.get_data_handle(), data_t_src(0), data_t_src(1)); - - fill_data( - c_weights.get_primitive_desc().get_size() - / sizeof(data_t_wei),(data_t_wei *)c_weights.get_data_handle(), data_t_wei(0), data_t_wei(1)); - - bool with_bias = p.formats.bias_format != memory::format::format_undef; - auto c_bias_desc = with_bias ? 
- create_md({ cd.oc }, data_type_dst, p.formats.bias_format) : - create_md({}, data_type_dst, p.formats.bias_format); - auto c_bias = memory({c_bias_desc, eng}); - if (with_bias) { - fill_data( - c_bias.get_primitive_desc().get_size() / sizeof(data_t_dst), - (data_t_dst *)c_bias.get_data_handle(), 1., true); - } - - std::vector padR = { cd.padh, cd.padw }; - for (int i = 0; i < 2; ++i) { - if ((cd.ih - ((cd.kh - 1) * (cd.dilh + 1) + 1) + cd.padh + padR[0]) - / cd.strh + 1 != cd.oh) - ++padR[0]; - if ((cd.iw - ((cd.kw - 1) * (cd.dilw + 1) + 1) + cd.padw + padR[1]) - / cd.strw + 1 != cd.ow) - ++padR[1]; - } - - auto c_depthwise_weights_desc = create_md({ cd.oc }, data_type_dst, memory::x); - auto c_depthwise_bias_desc = create_md({ cd.oc }, data_type_dst, memory::x); - - auto c_depthwise_weights = memory({c_depthwise_weights_desc, eng}); - auto c_depthwise_bias = memory({c_depthwise_bias_desc, eng}); - - fill_data( - c_depthwise_weights.get_primitive_desc().get_size() / sizeof(data_t_dst), - (data_t_dst *)c_depthwise_weights.get_data_handle(), 1., true); - fill_data( - c_depthwise_bias.get_primitive_desc().get_size() / sizeof(data_t_dst), - (data_t_dst *)c_depthwise_bias.get_data_handle(), 1., true); - - auto test = [&]() { - mkldnn::post_ops ops; - ops.append_depthwise(p.alg, static_cast(c_depthwise_weights.get_data_handle()), - static_cast(c_depthwise_bias.get_data_handle())); - - mkldnn::primitive_attr attr; - attr.set_post_ops(ops); - - auto conv_desc = with_bias - ? convolution_forward::desc(prop_kind::forward_scoring, - p.aalgorithm, c_src_desc, c_weights_desc, c_bias_desc, - c_dst_desc, { cd.strh, cd.strw }, { cd.dilh, cd.dilw }, - { cd.padh, cd.padw }, padR, padding_kind::zero) - : convolution_forward::desc(prop_kind::forward_scoring, - p.aalgorithm, c_src_desc, c_weights_desc, c_dst_desc, - { cd.strh, cd.strw }, { cd.dilh, cd.dilw }, - { cd.padh, cd.padw }, padR, padding_kind::zero); - - auto conv_primitive_desc = - convolution_forward::primitive_desc(conv_desc, attr, eng); - - auto conv = with_bias - ? convolution_forward(conv_primitive_desc, - c_src, c_weights, c_bias, c_dst) - : convolution_forward(conv_primitive_desc, - c_src, c_weights, c_dst); - std::vector pipeline; - pipeline.push_back(conv); - - stream(stream::kind::lazy).submit(pipeline).wait(); - }; - - if (catch_expected_failures(test, p.expect_to_fail, p.expected_status)) - return; - - compute_ref_conv_depthwise_fwd(cd, c_src, c_weights, c_bias, dst_ref, with_bias, - p.alg, c_depthwise_weights, c_depthwise_bias); - compare_data(dst_ref, c_dst, 1e-3); - } -}; - using convolution_test = convolution_depthwise_test; TEST_P(convolution_test, TestConvolution) @@ -237,8 +31,10 @@ TEST_P(convolution_test, TestConvolution) { mkldnn::memory::format::src, mkldnn::memory::format::weights, \ mkldnn::memory::format::bias, mkldnn::memory::format::dst } -#define FMT_WEIGHTS_BLOCKED OIhw8i8o +#define FMT_WEIGHTS_BLOCKED8 OIhw8i8o +#define FMT_WEIGHTS_BLOCKED8_DW Goihw8g #define FMT_WEIGHTS_BLOCKED16 OIhw16i16o +#define FMT_WEIGHTS_BLOCKED16_DW Goihw16g #define ENGINE mkldnn::engine::kind::cpu #define ALGORITHM mkldnn::convolution_direct @@ -259,7 +55,6 @@ TEST_P(convolution_test, TestConvolution) EXPAND_ARGS(PARAMS_CONV(depthwise_scale_shift, __VA_ARGS__)), \ EXPAND_ARGS(PARAMS_CONV(depthwise_prelu, __VA_ARGS__)) - #define PARAMS_CONV(alg, src, weights, bias, dst, ...) 
\ test_convolution_depthwise_params_t {alg, ENGINE, ALGORITHM, \ EXPAND_FORMATS(src, weights, bias, dst), /* empty attributes */ {}, \ @@ -276,25 +71,25 @@ TEST_P(convolution_test, TestConvolution) 2, 32, 32, 9, 9, 32, 9, 9, 1, 1, 0, 0, 1, 1) ); - INST_TEST_CASE(SimpleSmall_Blocked, - PARAMS(nChw8c, OIhw8i8o, x, nChw8c, + INST_TEST_CASE(SimpleSmall_Blocked8, + PARAMS(nChw8c, FMT_WEIGHTS_BLOCKED8, x, nChw8c, 2, 1, 32, 13, 13, 48, 11, 11, 3, 3, 0, 0, 1, 1), - PARAMS(nChw8c, OIhw8i8o, x, nChw8c, + PARAMS(nChw8c, FMT_WEIGHTS_BLOCKED8, x, nChw8c, 2, 1, 16, 13, 13, 48, 13, 13, 1, 1, 0, 0, 1, 1), - PARAMS(nChw8c, Goihw8g, x, nChw8c, + PARAMS(nChw8c, FMT_WEIGHTS_BLOCKED8_DW, x, nChw8c, 2, 64, 64, 16, 16, 64, 16, 16, 3, 3, 0, 0, 1, 1), - PARAMS(nChw8c, Goihw8g, x, nChw8c, + PARAMS(nChw8c, FMT_WEIGHTS_BLOCKED8_DW, x, nChw8c, 2, 32, 32, 9, 9, 32, 9, 9, 1, 1, 0, 0, 1, 1) ); INST_TEST_CASE(SimpleSmall_Blocked16, - PARAMS(nChw16c, OIhw16i16o, x, nChw16c, + PARAMS(nChw16c, FMT_WEIGHTS_BLOCKED16, x, nChw16c, 2, 1, 32, 13, 13, 48, 11, 11, 3, 3, 0, 0, 1, 1), - PARAMS(nChw16c, OIhw16i16o, x, nChw16c, + PARAMS(nChw16c, FMT_WEIGHTS_BLOCKED16, x, nChw16c, 2, 1, 16, 13, 13, 48, 13, 13, 1, 1, 0, 0, 1, 1), - PARAMS(nChw16c, Goihw16g, x, nChw16c, + PARAMS(nChw16c, FMT_WEIGHTS_BLOCKED16_DW, x, nChw16c, 2, 64, 64, 16, 16, 64, 16, 16, 3, 3, 0, 0, 1, 1), - PARAMS(nChw16c, Goihw16g, x, nChw16c, + PARAMS(nChw16c, FMT_WEIGHTS_BLOCKED16_DW, x, nChw16c, 2, 32, 32, 9, 9, 32, 9, 9, 1, 1, 0, 0, 1, 1) ); } diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_depthwise_forward_x8s8f32s32.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_depthwise_forward_x8s8f32s32.cpp new file mode 100644 index 0000000..79ba406 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_depthwise_forward_x8s8f32s32.cpp @@ -0,0 +1,106 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "mkldnn_test_common.hpp" +#include "gtest/gtest.h" +#include "mkldnn.hpp" +#include "test_convolution_depthwise_forward_common.hpp" + +namespace mkldnn { + +using convolution_test = convolution_depthwise_test; + +TEST_P(convolution_test, TestConvolution) +{ +} + +#define EXPAND_FORMATS(src, weights, bias, dst) \ + { mkldnn::memory::format::src, mkldnn::memory::format::weights, \ + mkldnn::memory::format::bias, mkldnn::memory::format::dst } + +#define FMT_WEIGHTS_BLOCKED8 OhIw8o4i +#define FMT_WEIGHTS_BLOCKED8_DW Goihw8g +#define FMT_WEIGHTS_BLOCKED16 OIhw4i16o4i +#define FMT_WEIGHTS_BLOCKED16_DW Goihw16g + +#define ENGINE mkldnn::engine::kind::cpu +#define ALGORITHM mkldnn::convolution_direct + +#define CONCAT_WITH_UNDERSCORE_(a,b) a ## _ ## b +#define CONCAT_WITH_UNDERSCORE(a,b) CONCAT_WITH_UNDERSCORE_(a,b) + +#define INST_TEST_CASE_(str, ...) 
INSTANTIATE_TEST_CASE_P( \ + str, convolution_test, ::testing::Values(__VA_ARGS__)) + +#define INST_TEST_CASE(str, ...) INST_TEST_CASE_( \ + CONCAT_WITH_UNDERSCORE(CONCAT_WITH_UNDERSCORE(Convolution, \ + str), depthwise), __VA_ARGS__) + +#define EXPAND_ARGS(args) args + +#define PARAMS(...) \ + EXPAND_ARGS(PARAMS_CONV(depthwise_scale_shift, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS_CONV(depthwise_prelu, __VA_ARGS__)) + +#define PARAMS_CONV(alg, src, weights, bias, dst, ...) \ + test_convolution_depthwise_params_t {alg, ENGINE, ALGORITHM, \ + EXPAND_FORMATS(src, weights, bias, dst), /* empty attributes */ {}, \ + {__VA_ARGS__} } + + INST_TEST_CASE(SimpleSmall, + PARAMS(nhwc, oihw, x, nhwc, + 2, 1, 32, 13, 13, 48, 11, 11, 3, 3, 0, 0, 1, 1), + PARAMS(nhwc, oihw, x, nhwc, + 2, 1, 16, 13, 13, 48, 13, 13, 1, 1, 0, 0, 1, 1), + PARAMS(nhwc, goihw, x, nhwc, + 2, 64, 64, 16, 16, 64, 16, 16, 3, 3, 0, 0, 1, 1), + PARAMS(nhwc, goihw, x, nhwc, + 2, 32, 32, 9, 9, 32, 9, 9, 1, 1, 0, 0, 1, 1) + ); + + INST_TEST_CASE(SimpleSmall_Blocked8, + PARAMS(nhwc, FMT_WEIGHTS_BLOCKED8, x, nhwc, + 2, 1, 32, 13, 13, 48, 11, 11, 3, 3, 0, 0, 1, 1), + PARAMS(nhwc, FMT_WEIGHTS_BLOCKED8, x, nhwc, + 2, 1, 16, 13, 13, 48, 13, 13, 1, 1, 0, 0, 1, 1), + PARAMS(nhwc, FMT_WEIGHTS_BLOCKED8_DW, x, nhwc, + 2, 64, 64, 16, 16, 64, 16, 16, 3, 3, 0, 0, 1, 1), + PARAMS(nhwc, FMT_WEIGHTS_BLOCKED8_DW, x, nhwc, + 2, 32, 32, 9, 9, 32, 9, 9, 1, 1, 0, 0, 1, 1) + ); + + INST_TEST_CASE(SimpleSmall_Blocked_Tail8, + PARAMS(nhwc, FMT_WEIGHTS_BLOCKED8, x, nhwc, + 2, 1, 15, 13, 13, 19, 11, 11, 3, 3, 0, 0, 1, 1), + PARAMS(nhwc, FMT_WEIGHTS_BLOCKED8, x, nhwc, + 2, 1, 77, 13, 13, 91, 13, 13, 1, 1, 0, 0, 1, 1), + PARAMS(nhwc, FMT_WEIGHTS_BLOCKED8_DW, x, nhwc, + 2, 21, 21, 16, 16, 21, 16, 16, 3, 3, 0, 0, 1, 1), + PARAMS(nhwc, FMT_WEIGHTS_BLOCKED8_DW, x, nhwc, + 2, 77, 77, 9, 9, 77, 9, 9, 1, 1, 0, 0, 1, 1) + ); + + INST_TEST_CASE(SimpleSmall_Blocked16, + PARAMS(nhwc, FMT_WEIGHTS_BLOCKED16, x, nhwc, + 2, 1, 32, 13, 13, 48, 11, 11, 3, 3, 0, 0, 1, 1), + PARAMS(nhwc, FMT_WEIGHTS_BLOCKED16, x, nhwc, + 2, 1, 16, 13, 13, 48, 13, 13, 1, 1, 0, 0, 1, 1), + PARAMS(nhwc, FMT_WEIGHTS_BLOCKED16_DW, x, nhwc, + 2, 64, 64, 16, 16, 64, 16, 16, 3, 3, 0, 0, 1, 1), + PARAMS(nhwc, FMT_WEIGHTS_BLOCKED16_DW, x, nhwc, + 2, 32, 32, 9, 9, 32, 9, 9, 1, 1, 0, 0, 1, 1) + ); +} diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_dw_conv_common.hpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_dw_conv_common.hpp index 7f3537b..4c8445b 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_dw_conv_common.hpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_dw_conv_common.hpp @@ -25,7 +25,7 @@ template void compute_ref_conv_fwd(const mkldnn_convolution_desc_t &conv_desc, const memory &src, const memory &weights, const memory &bias, const memory &dst, - bool with_relu, float eltwise_alpha) + bool with_relu, float eltwise_alpha, const float* depthwise_weights) { int MB = conv_desc.src_desc.dims[0]; int G = conv_desc.weights_desc.ndims == 5 ? 
conv_desc.weights_desc.dims[0] : 1; @@ -47,7 +47,7 @@ void compute_ref_conv_fwd(const mkldnn_convolution_desc_t &conv_desc, data_t_src *src_data = (data_t_src *)src.get_data_handle(); data_t_wei *weights_data = (data_t_wei *)weights.get_data_handle(); - data_t_dst *bias_data = (data_t_dst *)bias.get_data_handle(); + float *bias_data = (float *)bias.get_data_handle(); data_t_dst *dst_data = (data_t_dst *)dst.get_data_handle(); const memory::desc src_d = src.get_primitive_desc().desc(); @@ -82,8 +82,6 @@ void compute_ref_conv_fwd(const mkldnn_convolution_desc_t &conv_desc, a += src_data[map_index(src_d, iidx)] * weights_data[map_index( weights_d, widx)]; - - } } } @@ -92,11 +90,19 @@ void compute_ref_conv_fwd(const mkldnn_convolution_desc_t &conv_desc, a_fp += bias_data[G > 1 ? g : oc]; + if (depthwise_weights) + a_fp *= depthwise_weights[G > 1 ? g : oc]; + if (with_relu) { a_fp = (a_fp > 0) ? a_fp : eltwise_alpha * a_fp; } - dst_data[map_index(dst_d, oidx)] = (data_t_dst) a_fp; + using D = memory::data_type; + if (data_traits::data_type != D::f32){ + a_fp = nearbyintf(a_fp); + } + + dst_data[map_index(dst_d, oidx)] = (data_t_dst)a_fp; } ); } @@ -115,7 +121,9 @@ protected: memory::data_type data_type_src = data_traits::data_type; memory::data_type data_type_dst = data_traits::data_type; memory::data_type data_type_wei = data_traits::data_type; - memory::data_type data_type_bia = data_traits::data_type; + memory::data_type data_type_bia = data_traits::data_type; + + bool is_int8 = data_type_src == mkldnn_u8 || data_type_src == mkldnn_s8; test_convolution_dw_conv_sizes_t cd = p.sizes; @@ -125,7 +133,7 @@ protected: int conv2_oh = (conv1_oh - ((cd.conv2_kh - 1) + 1) + 2 * cd.conv2_padh) / cd.conv2_strh + 1; int conv2_ow = (conv1_ow - ((cd.conv2_kw - 1) + 1) + 2 * cd.conv2_padw) / cd.conv2_strw + 1; - std::vector conv1_padR = { cd.conv1_padh, cd.conv1_padw }; + std::vector conv1_padR = { cd.conv1_padh, cd.conv1_padw }; conv1_padR[0] += conv2_oh - conv1_oh; conv1_padR[1] += conv2_ow - conv1_ow; @@ -159,27 +167,62 @@ protected: auto conv2_dst = memory({conv2_dst_desc, eng}); fill_data(conv1_src.get_primitive_desc().get_size() - / sizeof(data_t_src), (data_t_src *)conv1_src.get_data_handle(), 1., true); + / sizeof(data_t_src), (data_t_src *)conv1_src.get_data_handle(), (data_t_src)1, (data_t_src)1); fill_data( conv1_weights.get_primitive_desc().get_size() - / sizeof(data_t_wei),(data_t_wei *)conv1_weights.get_data_handle(), 1., true); - fill_data( + / sizeof(data_t_wei),(data_t_wei *)conv1_weights.get_data_handle(), (data_t_wei)1, (data_t_wei)1); + fill_data( conv1_bias.get_primitive_desc().get_size() - / sizeof(data_t_wei),(data_t_wei *)conv1_bias.get_data_handle(), 1., true); + / sizeof(float),(float *)conv1_bias.get_data_handle(), 1., true); fill_data( conv2_weights.get_primitive_desc().get_size() - / sizeof(data_t_wei),(data_t_wei *)conv2_weights.get_data_handle(), 1., true); - fill_data( + / sizeof(data_t_wei),(data_t_wei *)conv2_weights.get_data_handle(), (data_t_wei)1, (data_t_wei)1); + fill_data( conv2_bias.get_primitive_desc().get_size() - / sizeof(data_t_wei),(data_t_wei *)conv2_bias.get_data_handle(), 1., true); + / sizeof(float),(float *)conv2_bias.get_data_handle(), 1., true); + +// auto conv1_depthwise_weights_desc = create_md({ cd.conv2_oc }, mkldnn::memory::data_type::f32, memory::x); +// auto conv1_depthwise_weights = memory({conv1_depthwise_weights_desc, eng}); + std::vector conv1_depthwise_weights; + conv1_depthwise_weights.resize(cd.conv1_oc); + 
fill_data(conv1_depthwise_weights.size(), &conv1_depthwise_weights[0], 1.f / ((float)cd.ic), 1.f / ((float)cd.ic * cd.conv1_kh * cd.conv1_kw)); + + std::vector conv2_depthwise_weights; + conv2_depthwise_weights.resize(cd.conv1_oc); + fill_data(conv2_depthwise_weights.size(), &conv2_depthwise_weights[0], 1.f / ((float)cd.conv2_oc), 1.f / ((float)cd.conv2_oc * cd.conv2_kh * cd.conv2_kw)); + + std::vector conv2_depthwise_bias; + conv2_depthwise_bias.resize(cd.conv1_oc); +// fill_data(conv2_depthwise_bias.size(), &conv2_depthwise_bias[0], 1., true); + memset(&conv2_depthwise_bias[0], 0, conv2_depthwise_bias.size() * sizeof(float)); + +// auto conv2_depthwise_weights_desc = create_md({ cd.conv2_oc }, mkldnn::memory::data_type::f32, memory::x); +// auto conv2_depthwise_bias_desc = create_md({ cd.conv2_oc }, mkldnn::memory::data_type::f32, memory::x); +// +// auto conv2_depthwise_weights = memory({conv2_depthwise_weights_desc, eng}); +// auto conv2_depthwise_bias = memory({conv2_depthwise_bias_desc, eng}); + +// fill_data(conv2_depthwise_weights.get_primitive_desc().get_size() / sizeof(float), +// (float *)conv2_depthwise_weights.get_data_handle(), 1., true); +// memset((float*)conv2_depthwise_bias.get_data_handle(), 0, conv2_depthwise_bias.get_primitive_desc().get_size()); mkldnn::post_ops conv1_post_ops; conv1_post_ops.append_eltwise(1.0, mkldnn::algorithm::eltwise_relu, 0.0f, 0.0f); conv1_post_ops.append_dw_conv(conv1_oh, conv1_ow, cd.conv2_kh, cd.conv2_kw, cd.conv2_strh, cd.conv2_strw, static_cast(conv2_weights.get_data_handle()), static_cast(conv2_bias.get_data_handle())); + + if (is_int8) + conv1_post_ops.append_depthwise(depthwise_scale_shift, &conv2_depthwise_weights[0], &conv2_depthwise_bias[0]); + conv1_post_ops.append_eltwise(1.0, mkldnn::algorithm::eltwise_relu, 0.0f, 0.0f); mkldnn::primitive_attr conv1_attr; + + if (is_int8) { + conv1_attr.set_int_output_round_mode(mkldnn::round_nearest); + conv1_attr.set_output_scales(1 << 1 /*through C dim*/, conv1_depthwise_weights); + } + conv1_attr.set_post_ops(conv1_post_ops); auto conv1_primitive_desc = convolution_forward::primitive_desc(conv1_desc, conv1_attr, eng); @@ -197,8 +240,14 @@ protected: auto conv1_dst_ref = memory({conv1_dst_desc_ref, eng}); auto conv2_dst_ref = memory({conv2_dst_desc, eng}); - compute_ref_conv_fwd(conv1_desc_ref.data, conv1_src, conv1_weights, conv1_bias, conv1_dst_ref, true, 0.0f); - compute_ref_conv_fwd(conv2_desc.data, conv1_dst_ref, conv2_weights, conv2_bias, conv2_dst_ref, true, 0.0f); + + auto conv1_depthwise_weights_data = is_int8 ? &conv1_depthwise_weights[0] : nullptr; + auto conv2_depthwise_weights_data = is_int8 ? 
&conv2_depthwise_weights[0] : nullptr; + + compute_ref_conv_fwd(conv1_desc_ref.data, + conv1_src, conv1_weights, conv1_bias, conv1_dst_ref, true, 0.0f, conv1_depthwise_weights_data); + compute_ref_conv_fwd(conv2_desc.data, + conv1_dst_ref, conv2_weights, conv2_bias, conv2_dst_ref, true, 0.0f, conv2_depthwise_weights_data); compare_data(conv2_dst_ref, conv2_dst); } diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_dw_conv_f32.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_dw_conv_f32.cpp index 4db7a2e..c519533 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_dw_conv_f32.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_dw_conv_f32.cpp @@ -82,7 +82,12 @@ INST_TEST_CASE(Mobilenet_Blocked, PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, 2, 288, 2, 4, 240, 1, 1, 0, 0, 1, 1, 240, 3, 3, 1, 1, 1, 1), // 5_3 PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, - 2, 240, 2, 4, 264, 1, 1, 0, 0, 1, 1, 264, 3, 3, 1, 1, 1, 1) // 5_4 + 2, 240, 2, 4, 264, 1, 1, 0, 0, 1, 1, 264, 3, 3, 1, 1, 1, 1), // 5_4 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 48, 75, 75, 48, 1, 1, 0, 0, 1, 1, 48, 3, 3, 1, 1, 2, 2), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 48, 75, 75, 48, 3, 3, 1, 1, 1, 1, 48, 3, 3, 1, 1, 2, 2) + ); } diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_dw_conv_u8s8s32.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_dw_conv_u8s8s32.cpp new file mode 100644 index 0000000..bed1937 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_dw_conv_u8s8s32.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* +* Copyright 2016-2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
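The int8 variant that follows drives the same MobileNet-style fused cases through quantized data, where the common header additionally attaches per-channel output scales: the mask 1 << 1 selects dimension 1 (channels), so one float scale applies per output channel, and integer rounding is pinned with set_int_output_round_mode(mkldnn::round_nearest). A minimal sketch of that setup, with the 0.5f scale value chosen arbitrarily for illustration:

    #include <vector>
    #include "mkldnn.hpp"

    // One dequantization scale per output channel; mask 1 << 1 means the
    // scales vary along dim 1 (C), so scales.size() must equal oc.
    void set_per_channel_scales(mkldnn::primitive_attr &attr, int oc) {
        std::vector<float> scales(oc, 0.5f);
        attr.set_int_output_round_mode(mkldnn::round_nearest);
        attr.set_output_scales(1 << 1 /* per-C */, scales);
    }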
+*******************************************************************************/ + +#include "mkldnn_test_common.hpp" +#include "gtest/gtest.h" + +#include "mkldnn.hpp" +#include "test_convolution_dw_conv_common.hpp" +namespace mkldnn { + +using convolution_test = convolution_dw_conv_test; + +TEST_P(convolution_test, TestConvolutionDwConv) +{ +} + +#define FMT_BIAS x +#define FMT_DATA_BLOCKED nhwc + +#define EXPAND_FORMATS(src, conv1_weights, conv1_bias, conv2_weights, conv2_bias, dst) \ + { mkldnn::memory::format::src, mkldnn::memory::format::conv1_weights, mkldnn::memory::format::conv1_bias, \ + mkldnn::memory::format::conv2_weights, mkldnn::memory::format::conv2_bias, mkldnn::memory::format::dst } + +#define FMT_WEIGHTS_BLOCKED OhIw8o4i + +#define FMT_WEIGHTS_DW_BLOCKED Goihw8g + +#define ENGINE mkldnn::engine::kind::cpu +#define ALGORITHM mkldnn::convolution_direct + +#define CONCAT_WITH_UNDERSCORE_(a,b) a ## _ ## b +#define CONCAT_WITH_UNDERSCORE(a,b) CONCAT_WITH_UNDERSCORE_(a,b) + +#define INST_TEST_CASE_(str, ...) INSTANTIATE_TEST_CASE_P( \ + str, convolution_test, ::testing::Values(__VA_ARGS__)) + +#define INST_TEST_CASE(str, ...) INST_TEST_CASE_( \ + CONCAT_WITH_UNDERSCORE(CONCAT_WITH_UNDERSCORE(TEST_CASE_NAME_PREFIX, \ + str), dw_conv), __VA_ARGS__) + +#define EXPAND_ARGS(args) args + +#define PARAMS(src, conv1_weights, conv1_bias, conv2_weights, conv2_bias, dst, ...) \ + test_convolution_dw_conv_params_t {ENGINE, ALGORITHM, \ + EXPAND_FORMATS(src, conv1_weights, conv1_bias, conv2_weights, conv2_bias, dst), {__VA_ARGS__} } + +INST_TEST_CASE(Mobilenet_Blocked, + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 8, 19, 33, 56, 3, 3, 1, 1, 2, 2, 56, 3, 3, 1, 1, 1, 1), // 1_1 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 32, 19, 33, 56, 1, 1, 0, 0, 1, 1, 56, 3, 3, 1, 1, 2, 2), // 2_1 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 56, 9, 16, 112, 1, 1, 0, 0, 1, 1, 112, 3, 3, 1, 1, 1, 1), // 2_2 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 112, 9, 16, 112, 1, 1, 0, 0, 1, 1, 112, 3, 3, 1, 1, 2, 2), // 3_1 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 112, 4, 8, 208, 1, 1, 0, 0, 1, 1, 208, 3, 3, 1, 1, 1, 1), // 3_2 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 208, 4, 8, 216, 1, 1, 0, 0, 1, 1, 216, 3, 3, 1, 1, 2, 2), // 4_1 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 216, 2, 4, 328, 1, 1, 0, 0, 1, 1, 328, 3, 3, 1, 1, 1, 1), // 4_2 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 328, 2, 4, 288, 1, 1, 0, 0, 1, 1, 288, 3, 3, 1, 1, 1, 1), // 5_1 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 288, 2, 4, 288, 1, 1, 0, 0, 1, 1, 288, 3, 3, 1, 1, 1, 1), // 5_2 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 288, 2, 4, 240, 1, 1, 0, 0, 1, 1, 240, 3, 3, 1, 1, 1, 1), // 5_3 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 240, 2, 4, 264, 1, 1, 0, 0, 1, 1, 264, 3, 3, 
1, 1, 1, 1), // 5_4 + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 48, 75, 75, 48, 1, 1, 0, 0, 1, 1, 48, 3, 3, 1, 1, 2, 2), + PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_WEIGHTS_DW_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, + 2, 48, 75, 75, 48, 3, 3, 1, 1, 1, 1, 48, 3, 3, 1, 1, 2, 2) +); + +} diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_eltwise_forward_common.hpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_eltwise_forward_common.hpp index 5337807..c0b6e21 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_eltwise_forward_common.hpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_eltwise_forward_common.hpp @@ -16,66 +16,12 @@ #include "mkldnn_test_common.hpp" #include "gtest/gtest.h" - +#include "math_utils.hpp" #include "mkldnn.hpp" -namespace mkldnn { - - -template inline T relu_fwd(T s, A alpha) { - return s > 0 ? s : static_cast(s * alpha); -} - -template T tanh_fwd(T s) { - const float e = ::expf(2*s); /* maybe replace with -2*s? */ - return static_cast((e - 1.0) / (e + 1.0)); -} - -template T elu_fwd(T s, A alpha) { - return s > 0 ? s : static_cast(alpha * (::expf(s) - 1)); -} - -template -T square_fwd(T s) { - return s * s; -} - -template -T abs_fwd(T s) { - return s > 0 ? s : -s;; -} - -template -T sqrt_fwd(T s) { - return s > 0 ? ::sqrtf(s) : 0; -} - -template -T linear_fwd(T s, A alpha, A beta) { - return alpha * s + beta; -} - -template -T bounded_relu_fwd(T s, A alpha) { - s = s > 0 ? s : 0; - return s > alpha ? alpha : s; -} - -template -T soft_relu_fwd(T s) { - return logf(1 + ::expf(s)); -} +using namespace mkldnn::impl::math; -template -T logistic_fwd(T s) { - T v = ::expf(s); - return v / (v + 1); -} - -template -T clamp_fwd(T s, A alpha, A beta) { - return s > alpha ? (T)(alpha) : s < beta ? (T)(beta) : s; -} +namespace mkldnn { template @@ -94,76 +40,60 @@ void compute_ref_conv_eltwise_fwd(const test_convolution_sizes_t &c, const memory::desc weights_d = weights.get_primitive_desc().desc(); const memory::desc dst_d = dst.get_primitive_desc().desc(); + size_t padded_ic = src_d.data.layout_desc.blocking.padding_dims[1]; + size_t padded_oc = dst_d.data.layout_desc.blocking.padding_dims[1]; + + size_t padded_ic_w = weights_d.data.format == mkldnn_OhIw8o4i ? weights_d.data.layout_desc.blocking.padding_dims[1] : + src_d.data.layout_desc.blocking.padding_dims[1]; + size_t padded_oc_w = weights_d.data.format == mkldnn_OhIw8o4i ? weights_d.data.layout_desc.blocking.padding_dims[0] : + dst_d.data.layout_desc.blocking.padding_dims[1]; + mkldnn::impl::parallel_nd(c.mb, c.ng, c.oc / c.ng, c.oh, c.ow, [&](int n, int g, int oc, int oh, int ow) { - int oidx = n * c.oc * c.oh * c.ow - + g * c.oc / c.ng * c.oh * c.ow - + oc * c.oh * c.ow + oh * c.ow + ow; - - int didx = map_index(dst_d, oidx); - dst_data[didx] = bias_data ? 
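
For reference, the hunk above moves the reference convolution from the logical channel counts (c.ic, c.oc) to the padded counts read out of layout_desc.blocking.padding_dims, so that map_index() resolves correctly for blocked layouts whose channel count is not a multiple of the block size (the new *_Tail cases exercise exactly that). A minimal sketch of the destination-offset arithmetic under the same (n, g, oc, oh, ow) decomposition; the free-standing function and its name are mine, not part of the patch:

    #include <cstddef>

    // Linear offset over the *padded* output-channel range; map_index()
    // then translates this logical position into the physical layout.
    static size_t ref_dst_offset(int n, int g, int oc, int oh, int ow,
            size_t padded_oc, int ng, int OH, int OW) {
        return (size_t)n * padded_oc * OH * OW
                + (size_t)g * (padded_oc / ng) * OH * OW
                + (size_t)oc * OH * OW + (size_t)oh * OW + (size_t)ow;
    }
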
- bias_data[map_index( - bias.get_primitive_desc().desc(), - g * c.oc / c.ng + oc)] : - data_t_dst{0}; - for (int ic = 0; ic < c.ic / c.ng; ic++) { - for (int kh = 0; kh < c.kh; kh++) { - for (int kw = 0; kw < c.kw; kw++) { - int iw = ow * c.strw - - c.padw + kw * (1 + c.dilw); - int ih = oh * c.strh - - c.padh + kh * (1 + c.dilh); - if (iw < 0 || iw >= c.iw) continue; - if (ih < 0 || ih >= c.ih) continue; - int iidx = n * c.ic * c.ih * c.iw - + g * c.ic / c.ng * c.ih * c.iw - + ic * c.ih * c.iw + ih * c.iw + iw; - int widx = g * c.oc / c.ng * c.ic - / c.ng * c.kh * c.kw - + oc * c.ic / c.ng * c.kh * c.kw - + ic * c.kh * c.kw + kh * c.kw + kw; - - dst_data[didx] - += src_data[map_index(src_d, iidx)] - * weights_data[map_index( - weights_d, widx)]; - } - } + size_t oidx = n * padded_oc * c.oh * c.ow + + g * padded_oc / c.ng * c.oh * c.ow + + oc * c.oh * c.ow + oh * c.ow + ow; + + size_t didx = map_index(dst_d, oidx); + dst_data[didx] = bias_data + ? bias_data[g * c.oc / c.ng + oc] : data_t_dst{0}; + + for (int ic = 0; ic < c.ic / c.ng; ic++) + for (int kh = 0; kh < c.kh; kh++) + for (int kw = 0; kw < c.kw; kw++) + { + int ih = oh * c.strh - c.padh + kh * (1 + c.dilh); + if (ih < 0 || ih >= c.ih) continue; + int iw = ow * c.strw - c.padw + kw * (1 + c.dilw); + if (iw < 0 || iw >= c.iw) continue; + + size_t iidx = n * padded_ic * c.ih * c.iw + + g * padded_ic / c.ng * c.ih * c.iw + + ic * c.ih * c.iw + ih * c.iw + iw; + size_t widx = g * padded_oc_w / c.ng * padded_ic_w + / c.ng * c.kh * c.kw + + oc * padded_ic_w / c.ng * c.kh * c.kw + + ic * c.kh * c.kw + kh * c.kw + kw; + + dst_data[didx] += src_data[map_index(src_d, iidx)] + * weights_data[map_index(weights_d, widx)]; } + auto &d = dst_data[didx]; switch (elt_alg) { - case eltwise_relu: - dst_data[didx] = relu_fwd(dst_data[didx], elt_alpha); - break; - case eltwise_tanh: - dst_data[didx] = tanh_fwd(dst_data[didx]); - break; - case eltwise_elu: - dst_data[didx] = elu_fwd(dst_data[didx], elt_alpha); - break; - case eltwise_square: - dst_data[didx] = square_fwd(dst_data[didx]); - break; - case eltwise_abs: - dst_data[didx] = abs_fwd(dst_data[didx]); - break; - case eltwise_sqrt: - dst_data[didx] = sqrt_fwd(dst_data[didx]); - break; - case eltwise_linear: - dst_data[didx] = linear_fwd(dst_data[didx], elt_alpha, elt_beta); - break; - case eltwise_bounded_relu: - dst_data[didx] = bounded_relu_fwd(dst_data[didx], elt_alpha); - break; - case eltwise_soft_relu: - dst_data[didx] = soft_relu_fwd(dst_data[didx]); - break; - case eltwise_logistic: - dst_data[didx] = logistic_fwd(dst_data[didx]); - break; - default: - assert(!"unknown alg_kind"); + case eltwise_relu: d = relu_fwd(d, elt_alpha); break; + case eltwise_tanh: d = tanh_fwd(d); break; + case eltwise_elu: d = elu_fwd(d, elt_alpha); break; + case eltwise_square: d = square_fwd(d); break; + case eltwise_abs: d = abs_fwd(d); break; + case eltwise_sqrt: d = sqrt_fwd(d); break; + case eltwise_linear: d = linear_fwd(d, elt_alpha, elt_beta); break; + case eltwise_bounded_relu: d = bounded_relu_fwd(d, elt_alpha); break; + case eltwise_soft_relu: d = soft_relu_fwd(d); break; + case eltwise_logistic: d = logistic_fwd(d); break; + case eltwise_clamp: d = clamp_fwd(d, elt_alpha, elt_beta); break; + case eltwise_exp: d = exp_fwd(d); break; + default: assert(!"unknown alg_kind"); } } ); @@ -174,8 +104,7 @@ template { protected: - virtual void SetUp() - { + virtual void SetUp() { test_convolution_eltwise_params_t p = ::testing::TestWithParam< test_convolution_eltwise_params_t>::GetParam(); @@ -209,11 +138,15 
@@ protected: auto dst_ref = memory({c_dst_desc, eng}); fill_data(c_src.get_primitive_desc().get_size() - / sizeof(data_t_src), (data_t_src *)c_src.get_data_handle(), data_t_src(0), data_t_src(1)); + / sizeof(data_t_src), (data_t_src *)c_src.get_data_handle(), + data_t_src(0), data_t_src(1)); + check_zero_tail(1, c_src); fill_data( c_weights.get_primitive_desc().get_size() - / sizeof(data_t_wei),(data_t_wei *)c_weights.get_data_handle(), data_t_wei(0), data_t_wei(1)); + / sizeof(data_t_wei),(data_t_wei *)c_weights.get_data_handle(), + data_t_wei(0), data_t_wei(1)); + check_zero_tail(1, c_weights); bool with_bias = p.formats.bias_format != memory::format::format_undef; auto c_bias_desc = with_bias ? @@ -226,7 +159,7 @@ protected: (data_t_dst *)c_bias.get_data_handle(), 1., true); } - std::vector padR = { cd.padh, cd.padw }; + std::vector padR = { cd.padh, cd.padw }; for (int i = 0; i < 2; ++i) { if ((cd.ih - ((cd.kh - 1) * (cd.dilh + 1) + 1) + cd.padh + padR[0]) / cd.strh + 1 != cd.oh) @@ -273,7 +206,10 @@ protected: compute_ref_conv_eltwise_fwd(cd, c_src, c_weights, c_bias, dst_ref, with_bias, p.alg, eltwise_alpha, eltwise_beta); - compare_data(dst_ref, c_dst); + check_zero_tail(1, dst_ref); + + compare_data(dst_ref, c_dst, 1e-2); + check_zero_tail(0, c_dst); } }; diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_eltwise_forward_f32.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_eltwise_forward_f32.cpp index 19a6def..9b751cf 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_eltwise_forward_f32.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_eltwise_forward_f32.cpp @@ -18,220 +18,10 @@ #include "gtest/gtest.h" #include "math_utils.hpp" #include "mkldnn.hpp" - -using namespace mkldnn::impl::math; +#include "test_convolution_eltwise_forward_common.hpp" namespace mkldnn { -template -void compute_ref_conv_eltwise_fwd(const test_convolution_sizes_t &c, - const memory &src, const memory &weights, const memory &bias, - const memory &dst, bool w_bias, algorithm elt_alg, - float elt_alpha, float elt_beta) -{ - data_t_src *src_data = (data_t_src *)src.get_data_handle(); - data_t_wei *weights_data = (data_t_wei *)weights.get_data_handle(); - data_t_dst *bias_data - = (data_t_dst *)(w_bias ? bias.get_data_handle() : nullptr); - data_t_dst *dst_data = (data_t_dst *)dst.get_data_handle(); - - const memory::desc src_d = src.get_primitive_desc().desc(); - const memory::desc weights_d = weights.get_primitive_desc().desc(); - const memory::desc dst_d = dst.get_primitive_desc().desc(); - -#pragma omp parallel for collapse(5) schedule(static) - for (int n = 0; n < c.mb; n++) { - for (int g = 0; g < c.ng; g++) { - for (int oc = 0; oc < c.oc / c.ng; oc++) { - for (int oh = 0; oh < c.oh; oh++) { - for (int ow = 0; ow < c.ow; ow++) { - int oidx = n * c.oc * c.oh * c.ow - + g * c.oc / c.ng * c.oh * c.ow - + oc * c.oh * c.ow + oh * c.ow + ow; - - int didx = map_index(dst_d, oidx); - dst_data[didx] = bias_data ? 
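
A note on the check_zero_tail calls this file gains: as I read its definition in mkldnn_test_common.hpp (not part of this patch), a first argument of 1 zero-fills the physical padding tail of the memory, while 0 asserts that the tail is still zero. The pattern the tests settle on around each primitive run is then:

    check_zero_tail(1, c_src);            // zero the padded tail of each input
    check_zero_tail(1, c_weights);
    // ... submit the primitive, compute the reference ...
    check_zero_tail(1, dst_ref);          // reference output gets the same treatment
    compare_data(dst_ref, c_dst, 1e-2);   // relaxed eps for the fused eltwise math
    check_zero_tail(0, c_dst);            // the primitive must not write into the tail
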
- bias_data[map_index( - bias.get_primitive_desc().desc(), - g * c.oc / c.ng + oc)] : - data_t_dst{0}; - for (int ic = 0; ic < c.ic / c.ng; ic++) { - for (int kh = 0; kh < c.kh; kh++) { - for (int kw = 0; kw < c.kw; kw++) { - int iw = ow * c.strw - - c.padw + kw * (1 + c.dilw); - int ih = oh * c.strh - - c.padh + kh * (1 + c.dilh); - if (iw < 0 || iw >= c.iw) continue; - if (ih < 0 || ih >= c.ih) continue; - int iidx = n * c.ic * c.ih * c.iw - + g * c.ic / c.ng * c.ih * c.iw - + ic * c.ih * c.iw + ih * c.iw + iw; - int widx = g * c.oc / c.ng * c.ic - / c.ng * c.kh * c.kw - + oc * c.ic / c.ng * c.kh * c.kw - + ic * c.kh * c.kw + kh * c.kw + kw; - - dst_data[didx] - += src_data[map_index(src_d, iidx)] - * weights_data[map_index( - weights_d, widx)]; - } - } - } - - switch (elt_alg) { - case eltwise_relu: dst_data[didx] = - relu_fwd(dst_data[didx], elt_alpha); - break; - case eltwise_tanh: dst_data[didx] = - tanh_fwd(dst_data[didx]); - break; - case eltwise_elu: dst_data[didx] = - elu_fwd(dst_data[didx], elt_alpha); - break; - case eltwise_square: dst_data[didx] = - square_fwd(dst_data[didx]); - break; - case eltwise_abs: dst_data[didx] = - abs_fwd(dst_data[didx]); - break; - case eltwise_sqrt: dst_data[didx] = - sqrt_fwd(dst_data[didx]); - break; - case eltwise_linear: dst_data[didx] = - linear_fwd(dst_data[didx], elt_alpha, - elt_beta); - break; - case eltwise_bounded_relu: dst_data[didx] = - bounded_relu_fwd(dst_data[didx], elt_alpha); - break; - case eltwise_soft_relu: dst_data[didx] = - soft_relu_fwd(dst_data[didx]); - break; - case eltwise_logistic: dst_data[didx] = - logistic_fwd(dst_data[didx]); - break; - default: assert(!"unknown alg_kind"); - } - } - } - } - } - } -} - -template -class convolution_eltwise_test - : public ::testing::TestWithParam { -protected: - virtual void SetUp() - { - test_convolution_eltwise_params_t p - = ::testing::TestWithParam< - test_convolution_eltwise_params_t>::GetParam(); - - ASSERT_TRUE(p.engine_kind == engine::kind::cpu); - ASSERT_EQ(p.aalgorithm, convolution_direct); - auto eng = engine(p.engine_kind, 0); - float eltwise_alpha = p.eltwise_alpha; - float eltwise_beta = p.eltwise_beta; - - memory::data_type data_type_src = data_traits::data_type; - memory::data_type data_type_dst = data_traits::data_type; - memory::data_type data_type_wei = data_traits::data_type; - - test_convolution_sizes_t cd = p.sizes; - - auto c_src_desc = create_md({ cd.mb, cd.ic, cd.ih, cd.iw }, - data_type_src, p.formats.src_format); - auto c_weights_desc = cd.ng > 1 ? - create_md({ cd.ng, cd.oc / cd.ng, cd.ic / cd.ng, cd.kh, cd.kw }, - data_type_wei, p.formats.weights_format) : - create_md({ cd.oc, cd.ic, cd.kh, cd.kw }, - data_type_wei, p.formats.weights_format); - auto c_dst_desc = create_md({ cd.mb, cd.oc, cd.oh, cd.ow }, - data_type_dst, p.formats.dst_format); - - auto c_src = memory({c_src_desc, eng}); - auto c_weights = memory({c_weights_desc, eng}); - auto c_dst = memory({c_dst_desc, eng}); - - auto dst_ref = memory({c_dst_desc, eng}); - - fill_data(c_src.get_primitive_desc().get_size() - / sizeof(data_t_src), (data_t_src *)c_src.get_data_handle(), - data_t_src(0), data_t_src(1)); - - fill_data( - c_weights.get_primitive_desc().get_size() - / sizeof(data_t_wei),(data_t_wei *)c_weights.get_data_handle(), - data_t_wei(0), data_t_wei(1)); - - bool with_bias = p.formats.bias_format != memory::format::format_undef; - auto c_bias_desc = with_bias ? 
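
The per-type reference deleted below duplicated what the shared header now provides; only the instantiations stay in the .cpp files. The conv+eltwise fusion itself is expressed through post-ops on the primitive attributes. Condensed from the test body (conv_pd is my shorthand for the test's conv_primitive_desc):

    mkldnn::post_ops ops;
    ops.append_eltwise(1.0, p.alg, p.eltwise_alpha, p.eltwise_beta);

    mkldnn::primitive_attr attr;
    attr.set_post_ops(ops);

    auto conv_pd = convolution_forward::primitive_desc(conv_desc, attr, eng);

The reference path applies the same eltwise function on top of a plain convolution, which is what compute_ref_conv_eltwise_fwd checks.
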
- create_md({ cd.oc }, data_type_dst, p.formats.bias_format) : - create_md({}, data_type_dst, p.formats.bias_format); - auto c_bias = memory({c_bias_desc, eng}); - if (with_bias) { - fill_data( - c_bias.get_primitive_desc().get_size() / sizeof(data_t_dst), - (data_t_dst *)c_bias.get_data_handle(), 1., true); - } - - std::vector padR = { cd.padh, cd.padw }; - for (int i = 0; i < 2; ++i) { - if ((cd.ih - ((cd.kh - 1) * (cd.dilh + 1) + 1) + cd.padh + padR[0]) - / cd.strh + 1 != cd.oh) - ++padR[0]; - if ((cd.iw - ((cd.kw - 1) * (cd.dilw + 1) + 1) + cd.padw + padR[1]) - / cd.strw + 1 != cd.ow) - ++padR[1]; - } - - auto test = [&]() { - mkldnn::post_ops ops; - ops.append_eltwise(1.0, p.alg, p.eltwise_alpha, p.eltwise_beta); - - mkldnn::primitive_attr attr; - attr.set_post_ops(ops); - - auto conv_desc = with_bias - ? convolution_forward::desc(prop_kind::forward_scoring, - p.aalgorithm, c_src_desc, c_weights_desc, c_bias_desc, - c_dst_desc, { cd.strh, cd.strw }, { cd.dilh, cd.dilw }, - { cd.padh, cd.padw }, padR, padding_kind::zero) - : convolution_forward::desc(prop_kind::forward_scoring, - p.aalgorithm, c_src_desc, c_weights_desc, c_dst_desc, - { cd.strh, cd.strw }, { cd.dilh, cd.dilw }, - { cd.padh, cd.padw }, padR, padding_kind::zero); - - auto conv_primitive_desc = - convolution_forward::primitive_desc(conv_desc, attr, eng); - - auto conv = with_bias - ? convolution_forward(conv_primitive_desc, - c_src, c_weights, c_bias, c_dst) - : convolution_forward(conv_primitive_desc, - c_src, c_weights, c_dst); - std::vector pipeline; - pipeline.push_back(conv); - - stream(stream::kind::lazy).submit(pipeline).wait(); - }; - - if (catch_expected_failures(test, p.expect_to_fail, p.expected_status)) - return; - - compute_ref_conv_eltwise_fwd(cd, c_src, c_weights, c_bias, dst_ref, with_bias, - p.alg, eltwise_alpha, eltwise_beta); - compare_data(dst_ref, c_dst, 1e-2); - } -}; - using convolution_test = convolution_eltwise_test; TEST_P(convolution_test, TestConvolutionEltwise) @@ -276,33 +66,35 @@ TEST_P(convolution_test, TestConvolutionEltwise) {__VA_ARGS__} } INST_TEST_CASE(SimpleSmall, - PARAMS(nchw, oihw, x, nchw, - 2, 1, 32, 13, 13, 48, 11, 11, 3, 3, 0, 0, 1, 1), - PARAMS(nchw, oihw, x, nchw, - 2, 1, 16, 13, 13, 48, 13, 13, 1, 1, 0, 0, 1, 1), - PARAMS(nchw, goihw, x, nchw, - 2, 64, 64, 16, 16, 64, 16, 16, 3, 3, 0, 0, 1, 1), - PARAMS(nchw, goihw, x, nchw, - 2, 32, 32, 9, 9, 32, 9, 9, 1, 1, 0, 0, 1, 1) + PARAMS(nchw, oihw, x, nchw, 2, 1, 32, 13, 13, 48, 11, 11, 3, 3, 0, 0, 1, 1), + PARAMS(nchw, oihw, x, nchw, 2, 1, 16, 13, 13, 48, 13, 13, 1, 1, 0, 0, 1, 1), + PARAMS(nchw, goihw, x, nchw, 2, 64, 64, 16, 16, 64, 16, 16, 3, 3, 0, 0, 1, 1), + PARAMS(nchw, goihw, x, nchw, 2, 32, 32, 9, 9, 32, 9, 9, 1, 1, 0, 0, 1, 1) ); INST_TEST_CASE(SimpleSmall_Blocked, - PARAMS(nChw8c, Goihw8g, x, nChw8c, - 1, 48, 48, 20, 20, 48, 20, 20, 3, 3, 1, 1, 1, 1), - PARAMS(nChw8c, OIhw8i8o, x, nChw8c, - 1, 1, 48, 20, 20, 48, 20, 20, 1, 1, 0, 0, 1, 1), - PARAMS(nChw8c, OIhw8i8o, x, nChw8c, - 1, 1, 48, 20, 20, 48, 20, 20, 3, 3, 0, 0, 1, 1) + PARAMS(nChw8c, Goihw8g, x, nChw8c, 1, 8, 8, 5, 5, 8, 5, 5, 3, 3, 1, 1, 1, 1), + PARAMS(nChw8c, OIhw8i8o, x, nChw8c, 1, 1, 48, 20, 20, 48, 20, 20, 1, 1, 0, 0, 1, 1), + PARAMS(nChw8c, OIhw8i8o, x, nChw8c, 1, 1, 48, 20, 20, 48, 20, 20, 3, 3, 0, 0, 1, 1) + ); + + INST_TEST_CASE(SimpleSmall_Blocked_Tail, + PARAMS(nChw8c, Goihw8g, x, nChw8c, 1, 47, 47, 20, 20, 47, 20, 20, 3, 3, 1, 1, 1, 1), + PARAMS(nChw8c, OIhw8i8o, x, nChw8c, 1, 1, 47, 20, 20, 47, 20, 20, 1, 1, 0, 0, 1, 1), + PARAMS(nChw8c, OIhw8i8o, 
x, nChw8c, 1, 1, 47, 20, 20, 47, 20, 20, 3, 3, 0, 0, 1, 1)
+ );
 
 INST_TEST_CASE(SimpleSmall_Blocked16,
-    PARAMS(nChw16c, Goihw16g, x, nChw16c,
-        1, 48, 48, 20, 20, 48, 20, 20, 3, 3, 1, 1, 1, 1),
-    PARAMS(nChw16c, OIhw16i16o, x, nChw16c,
-        1, 1, 48, 20, 20, 48, 20, 20, 1, 1, 0, 0, 1, 1),
-    PARAMS(nChw16c, OIhw16i16o, x, nChw16c,
-        1, 1, 48, 20, 20, 48, 20, 20, 3, 3, 0, 0, 1, 1),
-    PARAMS(nChw16c, OIhw16i16o, x, nChw16c,
-        2, 1, 32, 32, 32, 32, 32, 32, 3, 3, 0, 0, 1, 1)
+    PARAMS(nChw16c, Goihw16g, x, nChw16c, 1, 48, 48, 20, 20, 48, 20, 20, 3, 3, 1, 1, 1, 1),
+    PARAMS(nChw16c, OIhw16i16o, x, nChw16c, 1, 1, 48, 20, 20, 48, 20, 20, 1, 1, 0, 0, 1, 1),
+    PARAMS(nChw16c, OIhw16i16o, x, nChw16c, 1, 1, 48, 20, 20, 48, 20, 20, 3, 3, 0, 0, 1, 1),
+    PARAMS(nChw16c, OIhw16i16o, x, nChw16c, 2, 1, 32, 32, 32, 32, 32, 32, 3, 3, 0, 0, 1, 1)
+ );
+
+ INST_TEST_CASE(SimpleSmall_Blocked16_Tail,
+    PARAMS(nChw16c, Goihw16g, x, nChw16c, 1, 47, 47, 20, 20, 47, 20, 20, 3, 3, 1, 1, 1, 1),
+    PARAMS(nChw16c, OIhw16i16o, x, nChw16c, 1, 1, 47, 20, 20, 47, 20, 20, 1, 1, 0, 0, 1, 1),
+    PARAMS(nChw16c, OIhw16i16o, x, nChw16c, 1, 1, 47, 20, 20, 47, 20, 20, 3, 3, 0, 0, 1, 1),
+    PARAMS(nChw16c, OIhw16i16o, x, nChw16c, 2, 1, 32, 32, 32, 32, 32, 32, 3, 3, 0, 0, 1, 1)
+ );
 
 }
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_eltwise_forward_x8s8f32s32.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_eltwise_forward_x8s8f32s32.cpp
new file mode 100644
index 0000000..1e95fc3
--- /dev/null
+++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_eltwise_forward_x8s8f32s32.cpp
@@ -0,0 +1,109 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "mkldnn_test_common.hpp"
+#include "gtest/gtest.h"
+#include "math_utils.hpp"
+#include "mkldnn.hpp"
+#include "test_convolution_eltwise_forward_common.hpp"
+
+namespace mkldnn {
+
+using convolution_test_u8s8s32f32 =
+    convolution_eltwise_test<uint8_t, int8_t, int32_t, float>;
+using convolution_test_s8s8s32f32 =
+    convolution_eltwise_test<int8_t, int8_t, int32_t, float>;
+
+#define EXPAND_FORMATS(src, weights, bias, dst) \
+    { mkldnn::memory::format::src, mkldnn::memory::format::weights, \
+    mkldnn::memory::format::bias, mkldnn::memory::format::dst }
+
+#define CONCAT_WITH_UNDERSCORE_(a,b) a ## _ ## b
+#define CONCAT_WITH_UNDERSCORE(a,b) CONCAT_WITH_UNDERSCORE_(a,b)
+
+#define INST_TEST_CASE_(str, test, ...) INSTANTIATE_TEST_CASE_P( \
+        str, test, ::testing::Values(__VA_ARGS__))
+
+#define INST_TEST_CASE(str, test, ...) INST_TEST_CASE_( \
+        CONCAT_WITH_UNDERSCORE(CONCAT_WITH_UNDERSCORE(Convolution, \
+        str), eltwise), test, __VA_ARGS__)
+
+#define EXPAND_ARGS(args) args
+
+#define PARAMS(...)
\ + EXPAND_ARGS(PARAMS_CONV(eltwise_relu, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS_CONV(eltwise_elu, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS_CONV(eltwise_tanh, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS_CONV(eltwise_square, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS_CONV(eltwise_abs, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS_CONV(eltwise_sqrt, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS_CONV(eltwise_linear, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS_CONV(eltwise_bounded_relu, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS_CONV(eltwise_soft_relu, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS_CONV(eltwise_logistic, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS_CONV(eltwise_clamp, __VA_ARGS__)), \ + EXPAND_ARGS(PARAMS_CONV(eltwise_exp, __VA_ARGS__)) + +#define ELTWISE_ALPHA 0.5f +#define ELTWISE_BETA 0.f + +#define PARAMS_CONV(alg, src, weights, bias, dst, ...) \ + test_convolution_eltwise_params_t {alg, mkldnn::engine::kind::cpu, \ + mkldnn::convolution_direct, ELTWISE_ALPHA, ELTWISE_BETA, \ + EXPAND_FORMATS(src, weights, bias, dst), /* empty attributes */ {}, \ + {__VA_ARGS__} } + +#define INST_TEST_CASE_P_UNSIGNED(test) \ +TEST_P(test, TestConvolutionEltwise) {} \ +INST_TEST_CASE(SimpleSmall_Blocked16, test, \ +PARAMS(nhwc, OIhw4i16o4i, x, nhwc, 2, 1, 32, 13, 13, 32, 13, 13, 1, 1, 0, 0, 1, 1), \ +PARAMS(nhwc, Goihw16g, x, nhwc, 2, 32, 32, 13, 13, 32, 13, 13, 3, 3, 1, 1, 1, 1), \ +PARAMS(nhwc, OIhw4i16o4i, x, nhwc, 2, 1, 32, 13, 13, 32, 13, 13, 3, 3, 1, 1, 1, 1) \ +);\ +\ +INST_TEST_CASE(SimpleSmall_Blocked8, test, \ +PARAMS(nhwc, OhIw8o4i, x, nhwc, 2, 1, 32, 13, 13, 32, 13, 13, 1, 1, 0, 0, 1, 1), \ +PARAMS(nhwc, Goihw8g, x, nhwc, 2, 32, 32, 13, 13, 32, 13, 13, 3, 3, 1, 1, 1, 1), \ +PARAMS(nhwc, OhIw8o4i, x, nhwc, 2, 1, 32, 13, 13, 32, 13, 13, 3, 3, 1, 1, 1, 1) \ +);\ +\ +INST_TEST_CASE(SimpleSmall_Blocked8_Tail, test, \ +PARAMS(nhwc, OhIw8o4i, x, nhwc, 2, 1, 47, 20, 20, 47, 20, 20, 1, 1, 0, 0, 1, 1), \ +PARAMS(nhwc, Goihw8g, x, nhwc, 2, 47, 47, 20, 20, 47, 20, 20, 3, 3, 1, 1, 1, 1), \ +PARAMS(nhwc, OhIw8o4i, x, nhwc, 2, 1, 47, 20, 20, 47, 20, 20, 3, 3, 1, 1, 1, 1) \ +); + +#define INST_TEST_CASE_P_SIGNED(test) \ +TEST_P(test, TestConvolutionEltwise) {} \ +INST_TEST_CASE(SimpleSmall_Blocked16, test, \ +PARAMS(nhwc, OIhw4i16o4i_s8s8, x, nhwc, 2, 1, 32, 13, 13, 32, 12, 12, 3, 3, 0, 0, 1, 1), \ +PARAMS(nhwc, Goihw16g_s8s8, x, nhwc, 2, 32, 32, 13, 13, 32, 13, 13, 1, 1, 0, 0, 1, 1), \ +PARAMS(nhwc, OIhw4i16o4i_s8s8, x, nhwc, 2, 1, 32, 13, 13, 32, 13, 13, 3, 3, 1, 1, 1, 1) \ +);\ +\ +INST_TEST_CASE(SimpleSmall_Blocked8, test, \ +PARAMS(nhwc, OhIw8o4i_s8s8, x, nhwc, 2, 1, 32, 13, 13, 32, 13, 13, 1, 1, 0, 0, 1, 1), \ +PARAMS(nhwc, OhIw8o4i_s8s8, x, nhwc, 2, 1, 32, 13, 13, 32, 13, 13, 3, 3, 1, 1, 1, 1) \ +);\ +\ +INST_TEST_CASE(SimpleSmall_Blocked8_Tail, test, \ +PARAMS(nhwc, OhIw8o4i_s8s8, x, nhwc, 2, 1, 47, 20, 20, 47, 20, 20, 1, 1, 0, 0, 1, 1), \ +PARAMS(nhwc, OhIw8o4i_s8s8, x, nhwc, 2, 1, 47, 20, 20, 47, 20, 20, 3, 3, 1, 1, 1, 1) \ +); + +//INST_TEST_CASE_P_SIGNED(convolution_test_s8s8s32f32); +INST_TEST_CASE_P_UNSIGNED(convolution_test_u8s8s32f32); +} diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_common.hpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_common.hpp index e3f2ac5..b87f354 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_common.hpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_common.hpp @@ -179,7 +179,7 @@ protected: check_zero_tail(1, c_weights.get()); check_zero_tail(1, c_dst.get()); - 
std::vector<int> padR = {
+        std::vector<ptrdiff_t> padR = {
             right_padding(cd.ih, cd.oh, cd.kh, cd.padh, cd.strh, cd.dilh),
             right_padding(cd.iw, cd.ow, cd.kw, cd.padw, cd.strw, cd.dilw)
         };
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_common_3d.hpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_common_3d.hpp
index 8291cdd..9c4691f 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_common_3d.hpp
+++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_common_3d.hpp
@@ -197,7 +197,7 @@ protected:
         check_zero_tail(1, c_weights.get());
         check_zero_tail(1, c_dst.get());
 
-        std::vector<int> padR = {
+        std::vector<ptrdiff_t> padR = {
             right_padding(cd.id, cd.od, cd.kd, cd.padd, cd.strd, cd.dild),
             right_padding(cd.ih, cd.oh, cd.kh, cd.padh, cd.strh, cd.dilh),
             right_padding(cd.iw, cd.ow, cd.kw, cd.padw, cd.strw, cd.dilw)
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_f32_3d.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_f32_3d.cpp
index 632a557..f76a0b7 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_f32_3d.cpp
+++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_f32_3d.cpp
@@ -82,4 +82,34 @@ INST_TEST_CASE_3D(SimpleSmall_Blocked16,
         2, 1, 32, 13, 13, 13, 48, 11, 11, 11, 3, 3, 3, 0, 0, 0, 1, 1, 1)
 );
 
+INST_TEST_CASE_3D(SimpleSmall_NCDHW_PLANAR,
+    PARAMS_3D(ncdhw, oidhw, FMT_BIAS, ncdhw,
+        2, 1, 1, 79, 79, 79, 1, 77, 77, 79, 5, 5, 5, 1, 1, 2, 1, 1, 1, 0, 0, 0),
+    PARAMS_3D(ncdhw, oidhw, FMT_BIAS, ncdhw,
+        2, 1, 1, 79, 79, 79, 1, 75, 79, 75, 5, 5, 5, 2, 0, 2, 1, 1, 1, 0, 0, 0),
+    PARAMS_3D(ncdhw, oidhw, FMT_BIAS, ncdhw,
+        2, 1, 9, 68, 68, 68, 1, 50, 50, 50, 5, 5, 5, 18, 18, 18, 1, 1, 1, 8, 8, 8),
+    PARAMS_3D(ncdhw, oidhw, FMT_BIAS, ncdhw,
+        2, 1, 1, 75, 63, 91, 1, 73, 61, 91, 5, 5, 5, 1, 1, 2, 1, 1, 1, 0, 0, 0),
+    PARAMS_3D(ncdhw, oidhw, FMT_BIAS, ncdhw,
+        2, 1, 1, 58, 41, 37, 1, 58, 37, 37, 5, 5, 5, 2, 0, 2, 1, 1, 1, 0, 0, 0),
+    PARAMS_3D(ncdhw, oidhw, FMT_BIAS, ncdhw,
+        2, 1, 9, 68, 34, 48, 1, 50, 16, 30, 5, 5, 5, 18, 18, 18, 1, 1, 1, 8, 8, 8)
+);
+
+INST_TEST_CASE_3D(SimpleSmall_NCDHW_MSD,
+    PARAMS_3D(ncdhw, oidhw, FMT_BIAS, ncdhw,
+        2, 1, 1, 79, 79, 79, 1, 79, 79, 79, 5, 5, 5, 2, 2, 2, 1, 1, 1, 0, 0, 0),
+    PARAMS_3D(ncdhw, oidhw, FMT_BIAS, ncdhw,
+        2, 1, 2, 77, 77, 77, 1, 77, 77, 77, 5, 5, 5, 4, 4, 4, 1, 1, 1, 1, 1, 1),
+    PARAMS_3D(ncdhw, oidhw, FMT_BIAS, ncdhw,
+        2, 1, 3, 50, 50, 50, 1, 50, 50, 50, 5, 5, 5, 6, 6, 6, 1, 1, 1, 2, 2, 2),
+    PARAMS_3D(ncdhw, oidhw, FMT_BIAS, ncdhw,
+        2, 1, 8, 30, 30, 30, 1, 30, 30, 30, 5, 5, 5, 16, 16, 16, 1, 1, 1, 7, 7, 7),
+    PARAMS_3D(ncdhw, oidhw, FMT_BIAS, ncdhw,
+        2, 1, 40, 15, 15, 15, 1, 15, 15, 15, 5, 5, 5, 20, 20, 20, 1, 1, 1, 9, 9, 9),
+    PARAMS_3D(ncdhw, oidhw, FMT_BIAS, ncdhw,
+        2, 1, 41, 111, 111, 111, 1, 111, 111, 111, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0)
+);
+
 }
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_u8s8fp.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_u8s8fp.cpp
index 7a1618f..785c96e 100644
--- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_u8s8fp.cpp
+++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_u8s8fp.cpp
@@ -28,7 +28,6 @@ TEST_P(convolution_test, TestConvolution)
 {
 }
 
-//#define TEST_PARAM_ATTR
 #define U8S8
 #define DIRECTION_FORWARD
 #include "convolution_common.h"
diff --git
a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_u8s8s32.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_u8s8s32.cpp index bd04f94..6d1d6f7 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_u8s8s32.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_u8s8s32.cpp @@ -28,7 +28,6 @@ TEST_P(convolution_test, TestConvolution) { } -//#define TEST_PARAM_ATTR #define U8S8 #define DIRECTION_FORWARD #include "convolution_common.h" diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_relu_forward_neg_slope_f32.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_u8s8u8.cpp similarity index 81% rename from inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_relu_forward_neg_slope_f32.cpp rename to inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_u8s8u8.cpp index 1c57c30..36c12db 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_relu_forward_neg_slope_f32.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_u8s8u8.cpp @@ -18,18 +18,19 @@ #include "gtest/gtest.h" #include "mkldnn.hpp" -#include "test_convolution_relu_forward_common.hpp" +#include "test_convolution_forward_common.hpp" namespace mkldnn { -using convolution_test = convolution_relu_test; +using convolution_test = convolution_forward_test; TEST_P(convolution_test, TestConvolution) { } -#define FP32 +#define U8S8 #define DIRECTION_FORWARD -#define NEGATIVE_SLOPE 0.2f #include "convolution_common.h" +#undef TEST_PARAM_ATTR } diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_relu_forward_common.hpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_relu_forward_common.hpp deleted file mode 100644 index c5c1ab1..0000000 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_relu_forward_common.hpp +++ /dev/null @@ -1,201 +0,0 @@ -/******************************************************************************* -* Copyright 2016-2018 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "mkldnn_test_common.hpp" -#include "gtest/gtest.h" - -#include "mkldnn.hpp" - -namespace mkldnn { - -template -void compute_ref_conv_relu_fwd(const test_convolution_sizes_t &c, - const memory &src, const memory &weights, const memory &bias, - const memory &dst, bool w_bias, float negative_slope) -{ - data_t_src *src_data = (data_t_src *)src.get_data_handle(); - data_t_wei *weights_data = (data_t_wei *)weights.get_data_handle(); - data_t_dst *bias_data - = (data_t_dst *)(w_bias ? 
bias.get_data_handle() : nullptr); - data_t_dst *dst_data = (data_t_dst *)dst.get_data_handle(); - - const memory::desc src_d = src.get_primitive_desc().desc(); - const memory::desc weights_d = weights.get_primitive_desc().desc(); - const memory::desc dst_d = dst.get_primitive_desc().desc(); - - size_t padded_ic = src_d.data.layout_desc.blocking.padding_dims[1]; - size_t padded_oc = dst_d.data.layout_desc.blocking.padding_dims[1]; - - mkldnn::impl::parallel_nd(c.mb, c.ng, c.oc / c.ng, c.oh, c.ow, - [&](int n, int g, int oc, int oh, int ow) { - size_t oidx = n * padded_oc * c.oh * c.ow - + g * padded_oc / c.ng * c.oh * c.ow - + oc * c.oh * c.ow + oh * c.ow + ow; - dst_data[map_index(dst_d, oidx)] = bias_data ? - bias_data[map_index( - bias.get_primitive_desc().desc(), - g * padded_oc / c.ng + oc)] : - data_t_dst{0}; - for (int ic = 0; ic < c.ic / c.ng; ic++) { - for (int kh = 0; kh < c.kh; kh++) { - for (int kw = 0; kw < c.kw; kw++) { - int iw = ow * c.strw - - c.padw + kw * (1 + c.dilw); - int ih = oh * c.strh - - c.padh + kh * (1 + c.dilh); - if (iw < 0 || iw >= c.iw) continue; - if (ih < 0 || ih >= c.ih) continue; - size_t iidx = n * padded_ic * c.ih * c.iw - + g * padded_ic / c.ng * c.ih * c.iw - + ic * c.ih * c.iw + ih * c.iw + iw; - size_t widx = g * padded_oc / c.ng * padded_ic - / c.ng * c.kh * c.kw - + oc * padded_ic / c.ng * c.kh * c.kw - + ic * c.kh * c.kw + kh * c.kw + kw; - - dst_data[map_index(dst_d, oidx)] - += src_data[map_index(src_d, iidx)] - * weights_data[map_index( - weights_d, widx)]; - } - } - } - - if (dst_data[map_index(dst_d, oidx)] < 0) { - dst_data[map_index(dst_d, oidx)] = - static_cast( negative_slope - * dst_data[map_index(dst_d, oidx)] ); - } - } - ); -} - -template -class convolution_relu_test - : public ::testing::TestWithParam { -protected: - virtual void SetUp() { - auto p = ::testing::TestWithParam::GetParam(); - catch_expected_failures([=](){Test();}, p.expect_to_fail, - p.expected_status); - } - - void Test() { - auto p = ::testing::TestWithParam::GetParam(); - ASSERT_TRUE(p.engine_kind == engine::kind::cpu); - ASSERT_EQ(p.aalgorithm, convolution_direct); - auto eng = engine(p.engine_kind, 0); - float negative_slope = p.relu_negative_slope; - - memory::data_type data_type_src = data_traits::data_type; - memory::data_type data_type_dst = data_traits::data_type; - memory::data_type data_type_wei = data_traits::data_type; - - test_convolution_sizes_t cd = p.sizes; - - auto c_src_desc = create_md({ cd.mb, cd.ic, cd.ih, cd.iw }, - data_type_src, p.formats.src_format); - auto c_weights_desc = cd.ng > 1 ? 
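
Both the deleted file and its forward_common replacement build the weights descriptor by branching on the group count; restated on its own for readability (identical to the test code above):

    auto c_weights_desc = cd.ng > 1
            ? create_md({ cd.ng, cd.oc / cd.ng, cd.ic / cd.ng, cd.kh, cd.kw },
                    data_type_wei, p.formats.weights_format)  // grouped, goihw-style
            : create_md({ cd.oc, cd.ic, cd.kh, cd.kw },
                    data_type_wei, p.formats.weights_format); // plain, oihw-style
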
- create_md({ cd.ng, cd.oc / cd.ng, cd.ic / cd.ng, cd.kh, cd.kw }, - data_type_wei, p.formats.weights_format) : - create_md({ cd.oc, cd.ic, cd.kh, cd.kw }, - data_type_wei, p.formats.weights_format); - auto c_dst_desc = create_md({ cd.mb, cd.oc, cd.oh, cd.ow }, - data_type_dst, p.formats.dst_format); - - auto c_src = memory({c_src_desc, eng}); - auto c_weights = memory({c_weights_desc, eng}); - auto c_dst = memory({c_dst_desc, eng}); - - auto dst_ref = memory({c_dst_desc, eng}); - - fill_data(c_src.get_primitive_desc().get_size() - / sizeof(data_t_src), (data_t_src *)c_src.get_data_handle()); - // TODO: Temporary workaround for testing of convolution + relu - if (cd.mb) { - data_t_src *src_data = (data_t_src *)c_src.get_data_handle(); - const int mb_chunk = static_cast( - (c_src.get_primitive_desc().get_size() / sizeof(data_t_src)) - / cd.mb ); - for (int i = 0; i < cd.mb * mb_chunk; ++i) { - if ((i / mb_chunk) % 2) src_data[i] *= (data_t_src)-1.; - } - } - - fill_data( - c_weights.get_primitive_desc().get_size() - / sizeof(data_t_wei),(data_t_wei *)c_weights.get_data_handle()); - fill_data( - c_dst.get_primitive_desc().get_size() - / sizeof(data_t_dst),(data_t_dst *)c_dst.get_data_handle()); - - bool with_bias = p.formats.bias_format != memory::format::format_undef; - auto c_bias_desc = with_bias ? - create_md({ cd.oc }, data_type_dst, p.formats.bias_format) : - create_md({}, data_type_dst, p.formats.bias_format); - auto c_bias = memory({c_bias_desc, eng}); - if (with_bias) { - fill_data( - c_bias.get_primitive_desc().get_size() / sizeof(data_t_dst), - (data_t_dst *)c_bias.get_data_handle(), 1., true); - } - check_zero_tail(1, c_src); - check_zero_tail(1, c_weights); - check_zero_tail(1, c_dst); - - std::vector padR = { - right_padding(cd.ih, cd.oh, cd.kh, cd.padh, cd.strh, cd.dilh), - right_padding(cd.iw, cd.ow, cd.kw, cd.padw, cd.strw, cd.dilw) - }; - - auto conv_desc = with_bias - ? convolution_forward::desc(prop_kind::forward_scoring, - p.aalgorithm, c_src_desc, c_weights_desc, c_bias_desc, - c_dst_desc, { cd.strh, cd.strw }, { cd.dilh, cd.dilw }, - { cd.padh, cd.padw }, padR, padding_kind::zero) - : convolution_forward::desc(prop_kind::forward_scoring, - p.aalgorithm, c_src_desc, c_weights_desc, c_dst_desc, - { cd.strh, cd.strw }, { cd.dilh, cd.dilw }, - { cd.padh, cd.padw }, padR, padding_kind::zero); - - auto conv_relu_desc = - convolution_relu_forward::desc(conv_desc, negative_slope); - auto conv_primitive_desc = - convolution_relu_forward::primitive_desc(conv_relu_desc, eng); - - auto conv = with_bias - ? 
convolution_relu_forward(conv_primitive_desc, - c_src, c_weights, c_bias, c_dst) - : convolution_relu_forward(conv_primitive_desc, - c_src, c_weights, c_dst); - std::vector pipeline; - pipeline.push_back(conv); - - stream(stream::kind::lazy).submit(pipeline).wait(); - - compute_ref_conv_relu_fwd(cd, c_src, c_weights, c_bias, dst_ref, with_bias, - negative_slope); - check_zero_tail(1, dst_ref); - compare_data(dst_ref, c_dst); - check_zero_tail(0, c_dst); - - } -}; - -} diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_deconvolution.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_deconvolution.cpp index 1c2bac8..71e0675 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_deconvolution.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_deconvolution.cpp @@ -120,7 +120,7 @@ private: std::shared_ptr eng; bool with_bias; - std::vector padR; + std::vector padR; protected: virtual void SetUp() { auto p = ::testing::TestWithParam::GetParam(); diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_depthwise.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_depthwise.cpp index 1325398..932ec73 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_depthwise.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_depthwise.cpp @@ -95,7 +95,8 @@ private: std::shared_ptr bias; std::shared_ptr dst; std::shared_ptr workspace; - std::shared_ptr data_desc; + std::shared_ptr src_desc; + std::shared_ptr dst_desc; std::shared_ptr weights_desc; std::shared_ptr bias_desc; std::shared_ptr depthwise_prim_desc; @@ -126,9 +127,10 @@ protected: memory::dims dims = p.data_format == mkldnn_nc ? memory::dims({p.dims[0], p.dims[1]}) : p.dims; - data_desc.reset(new memory::desc(dims, data_type, p.data_format)); - src.reset(new memory({*data_desc, *eng})); - dst.reset(new memory({*data_desc, *eng})); + src_desc.reset(new memory::desc(dims, data_type, p.data_format)); + dst_desc.reset(new memory::desc(dims, data_type, p.data_format)); + src.reset(new memory({*src_desc, *eng})); + dst.reset(new memory({*dst_desc, *eng})); fill_data(data_size, (data_t *)src->get_data_handle(), data_t(0), data_t(1)); @@ -146,8 +148,8 @@ protected: std::vector pipeline; auto depthwise_desc = with_bias - ? depthwise_forward::desc(prop_kind::forward_training, p.alg_kind, *data_desc, *data_desc, *weights_desc, *bias_desc) - : depthwise_forward::desc(prop_kind::forward_training, p.alg_kind, *data_desc, *data_desc, *weights_desc); + ? 
depthwise_forward::desc(prop_kind::forward_training, p.alg_kind, *src_desc, *dst_desc, *weights_desc, *bias_desc) + : depthwise_forward::desc(prop_kind::forward_training, p.alg_kind, *src_desc, *dst_desc, *weights_desc); depthwise_prim_desc.reset(new depthwise_forward::primitive_desc(depthwise_desc, *eng)); auto depthwise = with_bias @@ -158,7 +160,7 @@ protected: auto s = stream(stream::kind::lazy); s.submit(pipeline).wait(); - check_depthwise_fwd(p, *data_desc, *src, *weights, *bias, with_bias, *dst); + check_depthwise_fwd(p, *src_desc, *src, *weights, *bias, with_bias, *dst); } }; diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_eltwise.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_eltwise.cpp index e75e377..b1e1381 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_eltwise.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_eltwise.cpp @@ -16,116 +16,12 @@ #include "gtest/gtest.h" #include "mkldnn_test_common.hpp" - +#include "math_utils.hpp" #include "mkldnn.hpp" -namespace mkldnn { - -template inline T relu_fwd(T s, A alpha) { - return s > 0 ? s : static_cast(s * alpha); -} -template inline T relu_bwd(T dd, T s, A alpha) { - return s > 0 ? dd : static_cast(dd * alpha); -} -template T tanh_fwd(T s) { - return static_cast(::tanhf((float)s)); -} -template T tanh_bwd(T dd, T s) { - const float th = ::tanhf((float)s); - return static_cast(dd * (1 - th) * (1 + th)); -} - -template T elu_fwd(T s, A alpha) { - return s > 0 ? s : static_cast(alpha * (::expf(s) - 1)); -} -template T elu_bwd(T dd, T s, A alpha) { - return static_cast(dd * (s > 0 ? 1 : alpha * ::expf(s))); -} - -template -T square_fwd(T s) { - return s * s; -} +using namespace mkldnn::impl::math; -template -T square_bwd(T dd, T s) { - return dd * 2*s; -} - -template -T abs_fwd(T s) { - return s > 0 ? s : -s;; -} - -template -T abs_bwd(T dd, T s) { - return dd * (s > 0 ? 1 : s < 0 ? -1 : 0); -} - -template -T sqrt_fwd(T s) { - return s > 0 ? ::sqrtf(s) : 0; -} - -template -T sqrt_bwd(T dd, T s) { - return s > 0 ? dd / (2 * ::sqrtf(s)) : 0; -} - -template -T linear_fwd(T s, A alpha, A beta) { - return alpha * s + beta; -} - -template -T linear_bwd(T dd, T s, A alpha, A beta) { - (void) s; - (void) beta; - return dd * alpha; -} - -template -T bounded_relu_fwd(T s, A alpha) { - s = s > 0 ? s : 0; - return s > alpha ? alpha : s; -} - -template -T bounded_relu_bwd(T dd, T s, A alpha) { - return dd * ((0 < s && s < alpha) ? 1 : 0); -} - -template -T soft_relu_fwd(T s) { - return s < (T)logf(FLT_MAX) ? log1pf(::expf(s)) : s; -} - -template -T soft_relu_bwd(T dd, T s) { - return dd / (1 + ::expf(-s)); -} - -template -T logistic_fwd(T s) { - T v = (T)(::expf(- (float)s)); - return 1 / (1 + v); -} - -template -T logistic_bwd(T dd, T s) { - T v = logistic_fwd(s); - return dd * v * (1 - v); -} - -template -T clamp_fwd(T s, A alpha, A beta) { - return s > alpha ? (T)(alpha) : s < beta ? (T)(beta) : s; -} - -template -T clamp_bwd(T dd, T s, A alpha, A beta) { - return dd * ((beta < s && s < alpha) ? 
1 : 0); -} +namespace mkldnn { template struct eltwise_test_params { @@ -141,7 +37,7 @@ struct eltwise_test_params { size_t n_elems(const memory::desc &md) { size_t p = 1; - const int *pdims = md.data.layout_desc.blocking.padding_dims; + const ptrdiff_t *pdims = md.data.layout_desc.blocking.padding_dims; for (int i = 0; i < md.data.ndims; ++i) p *= (size_t)(pdims[i]); return p; @@ -172,6 +68,8 @@ void check_eltwise_fwd(const eltwise_test_params &p, case eltwise_soft_relu: ref_d = soft_relu_fwd(s); break; case eltwise_logistic: ref_d = logistic_fwd(s); break; case eltwise_clamp: ref_d = clamp_fwd(s, p.alpha, p.beta); break; + case eltwise_exp: ref_d = exp_fwd(s); break; + case eltwise_not: ref_d = not_fwd(s); break; default: assert(!"unknown alg_kind"); } dst_data[i] = ref_d; @@ -236,6 +134,7 @@ void check_eltwise_bwd(const eltwise_test_params &p, break; case eltwise_logistic: ref_ds = logistic_bwd(ref_dd, ref_s); break; case eltwise_clamp: ref_ds = clamp_bwd(ref_dd, ref_s, p.alpha, p.beta); break; + case eltwise_exp: ref_ds = exp_bwd(ref_dd, ref_s); break; default: assert(!"unknown alg_kind"); } EXPECT_NEAR(diff_src_data[map_index(diff_data_d, i)], ref_ds, 1.e-6); @@ -289,7 +188,7 @@ protected: data_t data_median = data_t(0); data_t data_deviation - = p.alg_kind == eltwise_elu ? data_t(1) : data_t(200); + = p.alg_kind == eltwise_elu || p.alg_kind == eltwise_exp ? data_t(1) : data_t(200); fill_data(n_elems(*data_desc), (data_t *)src->get_data_handle(), data_median, data_deviation); check_zero_tail(1, *src); @@ -366,13 +265,16 @@ TEST_P(eltwise_test_float, TestsEltwise) EXPAND(PARAMS(eltwise_square, __VA_ARGS__)), \ EXPAND(PARAMS(eltwise_abs, __VA_ARGS__)) + #define PARAMS_ALL_ALG_SDPART(...) \ EXPAND(PARAMS(eltwise_sqrt, __VA_ARGS__)), \ EXPAND(PARAMS(eltwise_linear, __VA_ARGS__)), \ EXPAND(PARAMS(eltwise_soft_relu, __VA_ARGS__)), \ EXPAND(PARAMS(eltwise_bounded_relu, __VA_ARGS__)), \ EXPAND(PARAMS(eltwise_logistic, __VA_ARGS__)), \ - EXPAND(PARAMS(eltwise_clamp, __VA_ARGS__)) + EXPAND(PARAMS(eltwise_clamp, __VA_ARGS__)), \ + EXPAND(PARAMS(eltwise_exp, __VA_ARGS__)) + #define INST_TEST_CASE(str, ...) 
INSTANTIATE_TEST_CASE_P( \ str, eltwise_test_float, ::testing::Values(__VA_ARGS__)) diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_gemm_common.hpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_gemm_common.hpp index fa8e683..f468d3e 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_gemm_common.hpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_gemm_common.hpp @@ -33,8 +33,14 @@ namespace mkldnn { -struct test_params { +struct test_igemm_params { char offsetc; + bool zero_oa; + bool zero_ob; + bool zero_oc; +}; + +struct test_params { char transA; char transB; int M; @@ -46,6 +52,7 @@ struct test_params { int ldb; int ldc; + test_igemm_params igemm_params; bool expect_to_fail; mkldnn_status_t expected_status; }; @@ -77,9 +84,9 @@ void ref_gemm(const char *transa, const char *transb, int m, int n, int k, template void ref_gemm_s8x8s32(const char *transa, const char *transb, const char *offsetc, int m, int n, int k, const float alpha, - int8_t *A, int lda, const int8_t *ao, b_dt *B, int ldb, - const int8_t *bo, const float beta, int32_t *C, int ldc, - const int32_t *co) { + int8_t *A, int lda, const int8_t *oa, b_dt *B, int ldb, + const int8_t *ob, const float beta, int32_t *C, int ldc, + const int32_t *oc) { bool OCisR = (*offsetc == 'R' || *offsetc == 'r'); bool OCisC = (*offsetc == 'C' || *offsetc == 'c'); @@ -104,14 +111,14 @@ void ref_gemm_s8x8s32(const char *transa, const char *transb, const int a_cols = AisN ? k : m; mkldnn::impl::parallel_nd(a_cols, a_rows, [&](int j, int i) { da_setter(i, j, - static_cast(ia_accessor(i, j)) + static_cast(ao[0])); + static_cast(ia_accessor(i, j)) + static_cast(oa[0])); }); const int b_rows = BisN ? k : n; const int b_cols = BisN ? n : k; mkldnn::impl::parallel_nd(b_cols, b_rows, [&](int j, int i) { db_setter(i, j, - static_cast(ib_accessor(i, j)) + static_cast(bo[0])); + static_cast(ib_accessor(i, j)) + static_cast(ob[0])); }); ref_gemm(transa, transb, m, n, k, 1.0, dA, lda, dB, ldb, 0.0, dC, ldc); @@ -120,7 +127,7 @@ void ref_gemm_s8x8s32(const char *transa, const char *transb, auto f2d = [=] (float v) { return static_cast(v); }; mkldnn::impl::parallel_nd(n, m, [&] (int j, int i) { - double coffset = OCisR ? i2d(co[j]) : OCisC ? i2d(co[i]) : i2d(co[0]); + double coffset = OCisR ? i2d(oc[j]) : OCisC ? i2d(oc[i]) : i2d(oc[0]); double val = ((beta == 0.0f) ? 0.0 : f2d(beta) * i2d(C[i + j * ldc])) + f2d(alpha) * dC[i + j * ldc] + coffset; C[i + j * ldc] = @@ -132,20 +139,31 @@ void ref_gemm_s8x8s32(const char *transa, const char *transb, test_free((char *)dC); } -template -void compare(int M, int N, int ldc, T *C, T *C_ref, int K = 1) { - mkldnn::impl::parallel_nd(N, ldc, [&](int i, int j) { - T ref = C_ref[i*ldc + j]; - T got = C[i*ldc + j]; - T diff = got - ref; - if (data_traits::data_type == memory::data_type::f32) { - T e = (std::abs(ref) > 1e-4) ? diff / ref : diff; - EXPECT_NEAR(e, 0.0, 1e-4) - << "Row: " << j << " Column: " << i; +template +void compare(int m, int n, const c_dt *c, const c_dt *c_ref, int ldc, + float alpha = 1.0f, float beta = 0.0f, int k = 1) { + using data_type = memory::data_type; + mkldnn::impl::parallel_nd(n, ldc, [&](int i, int j) { + c_dt ref = c_ref[i*ldc + j]; + c_dt got = c[i*ldc + j]; + c_dt diff = got - ref; + + if (data_traits::data_type == data_type::f32) { + c_dt e = (std::abs(ref) > 1e-4) ? 
diff / ref : diff; + EXPECT_NEAR(e, 0.0, 1e-4) << "Row: " << j << " Col: " << i; } else { - T eps = K / 1000 + 1; - EXPECT_NEAR(diff, 0, eps) - << "Row: " << j << " Column: " << i; + // igemm + if (alpha == 1.0f) { + EXPECT_NEAR(diff, 0, 1) << "Row: " << j << " Col: " << i; + } else { + if (data_traits::data_type == data_type::u8) { + c_dt eps = k / 1000 + 1; + EXPECT_NEAR(diff, 0, eps) << "Row: " << j << " Col: " << i; + } else if (data_traits::data_type == data_type::s8) { + c_dt eps = k / 500 + 1; + EXPECT_NEAR(diff, 0, eps) << "Row: " << j << " Col: " << i; + } + } } }); } @@ -165,15 +183,23 @@ inline T* get_matrix_buffer(size_t n) { } template -inline void fill_matrix(size_t sizeA, size_t sizeB, size_t sizeC, size_t sizeco, - a_dt *A, b_dt *B, c_dt *C, a_dt *ao, a_dt *bo, c_dt *co) { +inline void fill_matrix(const test_params &p, size_t sizeA, size_t sizeB, + size_t sizeC, size_t sizeco, a_dt *A, b_dt *B, c_dt *C, a_dt *oa, + a_dt *ob, c_dt *oc) { fill_data(sizeA, A); fill_data(sizeB, B); fill_data(sizeC, C); - if (ao != nullptr && bo != nullptr && co != nullptr) { - fill_data(1, ao); - fill_data(1, bo); - fill_data(sizeco, co); + if (oa != nullptr && ob != nullptr && oc != nullptr) { + if (p.igemm_params.zero_oa) (*oa) = 0; + else fill_data(1, oa); + + if (p.igemm_params.zero_ob) (*ob) = 0; + else fill_data(1, ob); + + if (p.igemm_params.zero_oc) { + for (size_t i = 0; i < sizeco; i++) + oc[i] = 0; + } else fill_data(sizeco, oc); } } @@ -190,37 +216,37 @@ void run_test_gemm(const test_params &p) { int32_t *C = get_matrix_buffer(sizeC); int32_t *C_ref = get_matrix_buffer(sizeC); - bool OCisR = (p.offsetc == 'R' || p.offsetc == 'r'); - bool OCisC = (p.offsetc == 'C' || p.offsetc == 'c'); + bool OCisR = (p.igemm_params.offsetc == 'R' || p.igemm_params.offsetc == 'r'); + bool OCisC = (p.igemm_params.offsetc == 'C' || p.igemm_params.offsetc == 'c'); size_t sizeco = OCisR ? p.N : OCisC ? 
p.M : 1; - int8_t ao, bo; - int32_t *co = get_matrix_buffer(sizeco); + int8_t oa, ob; + int32_t *oc = get_matrix_buffer(sizeco); - fill_matrix(sizeA, sizeB, sizeC, sizeco, A, B, C, - &ao, &bo, co); + fill_matrix(p, sizeA, sizeB, sizeC, sizeco, + A, B, C, &oa, &ob, oc); mkldnn::impl::parallel_nd(p.ldc * p.N, [&](int i) { C_ref[i] = static_cast(C[i]); }); - auto status = mkldnn_gemm_s8u8s32(&p.transA, &p.transB, &p.offsetc, - &p.M, &p.N, &p.K, &p.alpha, A, &p.lda, &ao, B, &p.ldb, &bo, - &p.beta, C, &p.ldc, co); + auto status = mkldnn_gemm_s8u8s32(&p.transA, &p.transB, &p.igemm_params.offsetc, + &p.M, &p.N, &p.K, &p.alpha, A, &p.lda, &oa, B, &p.ldb, &ob, + &p.beta, C, &p.ldc, oc); if (status != mkldnn_success) throw error(status, "mkldnn_gemm_s8u8s32 returned error"); - ref_gemm_s8x8s32(&p.transA, &p.transB, &p.offsetc, p.M, p.N, - p.K, p.alpha, A, p.lda, &ao, B, p.ldb, &bo, p.beta, C_ref, - p.ldc, co); + ref_gemm_s8x8s32(&p.transA, &p.transB, &p.igemm_params.offsetc, p.M, p.N, + p.K, p.alpha, A, p.lda, &oa, B, p.ldb, &ob, p.beta, C_ref, + p.ldc, oc); - compare(p.M, p.N, p.ldc, C, C_ref, p.K); + compare(p.M, p.N, C, C_ref, p.ldc, p.alpha, p.beta, p.K); test_free((char *)A); test_free((char *)B); test_free((char *)C); test_free((char *)C_ref); - test_free((char *)co); + test_free((char *)oc); } template <> @@ -233,37 +259,37 @@ void run_test_gemm(const test_params &p) { int32_t *C = get_matrix_buffer(sizeC); int32_t *C_ref = get_matrix_buffer(sizeC); - bool OCisR = (p.offsetc == 'R' || p.offsetc == 'r'); - bool OCisC = (p.offsetc == 'C' || p.offsetc == 'c'); + bool OCisR = (p.igemm_params.offsetc == 'R' || p.igemm_params.offsetc == 'r'); + bool OCisC = (p.igemm_params.offsetc == 'C' || p.igemm_params.offsetc == 'c'); size_t sizeco = OCisR ? p.N : OCisC ? 
p.M : 1; - int8_t ao, bo; - int32_t* co = get_matrix_buffer(sizeco); + int8_t oa, ob; + int32_t* oc = get_matrix_buffer(sizeco); - fill_matrix(sizeA, sizeB, sizeC, sizeco, A, B, C, - &ao, &bo, co); + fill_matrix(p, sizeA, sizeB, sizeC, sizeco, A, B, C, + &oa, &ob, oc); mkldnn::impl::parallel_nd(p.ldc * p.N, [&](int i) { C_ref[i] = static_cast(C[i]); }); - auto status = mkldnn_gemm_s8s8s32(&p.transA, &p.transB, &p.offsetc, - &p.M, &p.N, &p.K, &p.alpha, A, &p.lda, &ao, B, &p.ldb, &bo, - &p.beta, C, &p.ldc, co); + auto status = mkldnn_gemm_s8s8s32(&p.transA, &p.transB, &p.igemm_params.offsetc, + &p.M, &p.N, &p.K, &p.alpha, A, &p.lda, &oa, B, &p.ldb, &ob, + &p.beta, C, &p.ldc, oc); if (status != mkldnn_success) throw error(status, "mkldnn_gemm_s8s8s32 returned error"); - ref_gemm_s8x8s32(&p.transA, &p.transB, &p.offsetc, p.M, p.N, - p.K, p.alpha, A, p.lda, &ao, B, p.ldb, &bo, p.beta, C_ref, - p.ldc, co); + ref_gemm_s8x8s32(&p.transA, &p.transB, &p.igemm_params.offsetc, p.M, p.N, + p.K, p.alpha, A, p.lda, &oa, B, p.ldb, &ob, p.beta, C_ref, + p.ldc, oc); - compare(p.M, p.N, p.ldc, C, C_ref, p.K); + compare(p.M, p.N, C, C_ref, p.ldc, p.alpha, p.beta, p.K); test_free((char *)A); test_free((char *)B); test_free((char *)C); test_free((char *)C_ref); - test_free((char *)co); + test_free((char *)oc); } template <> @@ -276,7 +302,7 @@ void run_test_gemm(const test_params &p) { float *C = get_matrix_buffer(sizeC); float *C_ref = get_matrix_buffer(sizeC); - fill_matrix(sizeA, sizeB, sizeC, 0, A, B, C, + fill_matrix(p, sizeA, sizeB, sizeC, 0, A, B, C, nullptr, nullptr, nullptr); mkldnn::impl::parallel_nd(p.N * p.ldc, [&](int i) { C_ref[i] = C[i]; }); @@ -286,7 +312,7 @@ void run_test_gemm(const test_params &p) { if (status == mkldnn_success) { ref_gemm(&p.transA, &p.transB, p.M, p.N, p.K, p.alpha, A, p.lda, B, p.ldb, p.beta, C_ref, p.ldc); - compare(p.M, p.N, p.ldc, C, C_ref); + compare(p.M, p.N, C, C_ref, p.ldc); } test_free((char *)A); diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_memory.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_memory.cpp index 7de9067..de7b237 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_memory.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_memory.cpp @@ -73,18 +73,29 @@ TEST_F(memory_test, WeightPaddingTest) { data_t *mem0_ptr = (data_t *)mem0.get_data_handle(); fill_data(O_16*I_16*H*W, mem0_ptr); + /* mem1 is OIhw16i16o with fmt = OIhw16i16o */ std::vector mem1_vec(phys_sz); mem1_vec.assign(mem0_ptr, mem0_ptr + mem0.get_primitive_desc().get_size() / sizeof(data_t)); - mkldnn::memory mem1({{{O, I, H, W}, memory::data_type::f32, memory::format::OIhw16i16o}, e}, &mem1_vec[0]); - check_zero_tail(0, mem1); + + /* mem2 is OIhw16i16o with fmt = blocked */ + std::vector mem2_vec(phys_sz); + mem2_vec.assign(mem0_ptr, + mem0_ptr + mem0.get_primitive_desc().get_size() / sizeof(data_t)); + mkldnn::memory::desc mem2_d = mem1.get_primitive_desc().desc(); + mem2_d.data.format = mkldnn_blocked; + mkldnn::memory mem2({mem2_d, e}, &mem2_vec[0]); + check_zero_tail(0, mem2); + check_zero_tail(1, mem0); + for (size_t i = 0; i < phys_sz; ++i) + EXPECT_NEAR(mem0_ptr[i], mem1_vec[i], 1e-7) << i << " :mem1"; for (size_t i = 0; i < phys_sz; ++i) - EXPECT_NEAR(mem0_ptr[i], mem1_vec[i], 1e-7) << i; + EXPECT_NEAR(mem0_ptr[i], mem2_vec[i], 1e-7) << i << " :mem2"; } } diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_pooling_forward.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_pooling_forward.cpp index 
aa1a191..d855301 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_pooling_forward.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_pooling_forward.cpp @@ -190,11 +190,11 @@ protected: check_zero_tail(1, p_dst); // calculate right padding exactly - std::vector padR_2d = { + std::vector padR_2d = { right_padding(pd.ih, pd.oh, pd.kh, pd.padt, pd.strh), right_padding(pd.iw, pd.ow, pd.kw, pd.padl, pd.strw) }; - std::vector padR_3d = { + std::vector padR_3d = { right_padding(pd.id, pd.od, pd.kd, pd.padf, pd.strd), right_padding(pd.ih, pd.oh, pd.kh, pd.padt, pd.strh), right_padding(pd.iw, pd.ow, pd.kw, pd.padl, pd.strw) diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_relu.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_relu.cpp deleted file mode 100644 index a837e3c..0000000 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_relu.cpp +++ /dev/null @@ -1,249 +0,0 @@ -/******************************************************************************* -* Copyright 2016-2018 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "gtest/gtest.h" -#include "mkldnn_test_common.hpp" - -#include "mkldnn.hpp" - -namespace mkldnn { - -template -struct relu_test_params { - engine::kind engine_kind; - memory::format data_format; - memory::format diff_format; - data_t negative_slope; - memory::dims dims; - bool expect_to_fail; - mkldnn_status_t expected_status; -}; - -template -void check_relu_fwd(data_t negative_slope, const memory::desc &md, - const memory &src, const memory &dst) -{ - data_t *src_data = (data_t *)src.get_data_handle(); - data_t *dst_data = (data_t *)dst.get_data_handle(); - - ASSERT_EQ(md.data.ndims, 4); - ASSERT_EQ(md.data.data_type, memory::data_type::f32); // TODO: type assert - - size_t N = md.data.dims[0]; - size_t C = md.data.dims[1]; - size_t H = md.data.dims[2]; - size_t W = md.data.dims[3]; - for (size_t i = 0; i < N * C * H * W; ++i) { - data_t s = src_data[i]; - EXPECT_NEAR(dst_data[i], s > 0 ? 
s : s * negative_slope, 1.e-7); - } -} - -template -void check_relu_bwd(data_t negative_slope, const memory::desc &md, - const memory &src, const memory &diff_dst, const memory &diff_src) -{ - data_t *src_data = (data_t *)src.get_data_handle(); - data_t *diff_dst_data = (data_t *)diff_dst.get_data_handle(); - data_t *diff_src_data = (data_t *)diff_src.get_data_handle(); - - const memory::desc data_d = src.get_primitive_desc().desc(); - const memory::desc diff_data_d = diff_src.get_primitive_desc().desc(); - - ASSERT_EQ(md.data.ndims, 4); - ASSERT_EQ(md.data.data_type, memory::data_type::f32); // TODO: type assert - - size_t N = md.data.dims[0]; - size_t C = md.data.dims[1]; - size_t H = md.data.dims[2]; - size_t W = md.data.dims[3]; - for (size_t i = 0; i < N * C * H * W; ++i) { - data_t ref_s = src_data[map_index(data_d, i)]; - data_t ref_dd = diff_dst_data[map_index(diff_data_d, i)]; - data_t ref_ds = ref_dd * ((ref_s > 0) ? data_t{1} : negative_slope); - EXPECT_NEAR(diff_src_data[map_index(diff_data_d, i)], ref_ds, 1.e-7); - } -} - -template -class relu_test : public ::testing::TestWithParam> { -private: - std::shared_ptr src; - std::shared_ptr diff_src; - std::shared_ptr dst; - std::shared_ptr diff_dst; - std::shared_ptr workspace; - std::shared_ptr data_desc; - std::shared_ptr diff_data_desc; - std::shared_ptr relu_prim_desc; - relu_test_params p; - std::shared_ptr eng; - memory::data_type data_type; - int size; - -protected: - virtual void SetUp() { - p = ::testing::TestWithParam::GetParam(); - catch_expected_failures([=](){Test();}, p.expect_to_fail, - p.expected_status); - } - - void Test() { - p = ::testing::TestWithParam::GetParam(); - - ASSERT_TRUE(p.engine_kind == engine::kind::cpu); - eng.reset(new engine(p.engine_kind, 0)); - - ASSERT_EQ(p.dims.size(), 4U); - - data_type = data_traits::data_type; - ASSERT_EQ(data_type, mkldnn::memory::data_type::f32); - - size = p.dims[0] * p.dims[1] * p.dims[2] * p.dims[3]; - - Forward(); - Backward(); - } - - void Forward() { - data_desc.reset(new memory::desc(p.dims, data_type, - p.data_format)); - diff_data_desc.reset(new memory::desc(p.dims, data_type, - p.diff_format)); - src.reset(new memory({*data_desc, *eng})); - dst.reset(new memory({*data_desc, *eng})); - - fill_data(size, (data_t *)src->get_data_handle(), - data_t(0), data_t(1)); - - auto relu_desc = relu_forward::desc(prop_kind::forward_training, - algorithm::eltwise_relu, *data_desc, p.negative_slope); - relu_prim_desc.reset( - new relu_forward::primitive_desc(relu_desc, *eng)); - auto relu = relu_forward(*relu_prim_desc, *src, *dst); - - std::vector pipeline; - pipeline.push_back(relu); - auto s = stream(stream::kind::lazy); - s.submit(pipeline).wait(); - - check_relu_fwd(p.negative_slope, *data_desc, - *src, *dst); - } - - void Backward() { - diff_src.reset(new memory({*diff_data_desc, *eng})); - diff_dst.reset(new memory({*diff_data_desc, *eng})); - - fill_data(size, (data_t *)diff_dst->get_data_handle(), - data_t(0), data_t(1)); - - auto relu_bwd_desc = relu_backward::desc(algorithm::eltwise_relu, - *diff_data_desc, *data_desc, p.negative_slope); - auto relu_bwd_prim_desc = relu_backward::primitive_desc( - relu_bwd_desc, *eng, *relu_prim_desc); - auto relu_bwd = relu_backward(relu_bwd_prim_desc, *src, *diff_dst, - *diff_src); - - std::vector pipeline; - pipeline.push_back(relu_bwd); - auto s = stream(stream::kind::lazy); - s.submit(pipeline).wait(); - - check_relu_bwd(p.negative_slope, *data_desc, - *src, *diff_dst, *diff_src); - } -}; - -using relu_test_float = relu_test; 
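
test_relu.cpp is removed here; the eltwise_relu cases in test_eltwise.cpp cover the same ground. The deleted checks reduce to the two formulas below, applied element-wise with EXPECT_NEAR at 1e-7 (the standalone float form is mine):

    // forward: y = x > 0 ? x : ns * x;   backward: dx = x > 0 ? dy : ns * dy
    static inline float relu_fwd(float s, float ns) { return s > 0 ? s : ns * s; }
    static inline float relu_bwd(float dd, float s, float ns) { return s > 0 ? dd : ns * dd; }
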
-using relu_test_params_float = relu_test_params; - -TEST_P(relu_test_float, TestsReLU) -{ -} - -#define EXPAND_SIZES(mb, c, h, w) { mb, c, h, w } -#define EXPAND_FORMATS(data) memory::format::data - -#define ENGINE engine::kind::cpu - -#define PARAMS_EF(data, diff_data, ns, mb, c, h, w, ef, es) \ - relu_test_params_float { ENGINE, \ - EXPAND_FORMATS(data), EXPAND_FORMATS(diff_data), \ - ns, EXPAND_SIZES(mb, c, h, w), ef, es} - -#define PARAMS(data, diff_data, ns, mb, c, h, w) \ - PARAMS_EF(data, diff_data, ns, mb, c, h, w, false, mkldnn_success) - -#define INST_TEST_CASE(str, ...) INSTANTIATE_TEST_CASE_P( \ - str, relu_test_float, ::testing::Values(__VA_ARGS__)) - -INST_TEST_CASE(SimpleZeroDim, - PARAMS(nchw, nchw, 0.f, 0, 8, 4, 4), - PARAMS(nchw, nchw, 0.f, 2, 0, 4, 4), - PARAMS(nchw, nchw, 0.f, 2, 8, 0, 4), - PARAMS(nchw, nchw, 0.f, 2, 8, 4, 0) -); - -INST_TEST_CASE(SimpleEF, - PARAMS_EF(nchw, nchw, 0.f, -1, 8, 4, 4, true, mkldnn_invalid_arguments), - PARAMS_EF(nchw, nchw, 0.f, 2, -1, 4, 4, true, mkldnn_invalid_arguments), - PARAMS_EF(nchw, nchw, 0.f, 2, 8, -1, 4, true, mkldnn_invalid_arguments), - PARAMS_EF(nchw, nchw, 0.f, 2, 8, 4, -1, true, mkldnn_invalid_arguments) -); - -INST_TEST_CASE(SimpleZeroNegativeSlope_NCHW, - //PARAMS(nchw, nchw, 0.f, 1, 8, 10000, 10000), // is a tensor of 3 Gb data ok? YES (330 s runtime, slow) - //PARAMS(nchw, nchw, 0.f, 1, 12, 10000, 10000), // is a tensor of >4 Gb data ok? worked once (release mode) - PARAMS(nchw, nchw, 0.f, 2, 8, 4, 4), - PARAMS(nchw, nchw, 0.f, 2, 16, 4, 4), - PARAMS(nchw, nchw, 0.f, 2, 16, 8, 8), - PARAMS(nchw, nchw, 0.f, 2, 16, 16, 8), - PARAMS(nchw, nchw, 0.f, 2, 16, 10, 8), - PARAMS(nchw, nchw, 0.f, 10, 10, 10, 10), - PARAMS(nchw, nchw, 0.f, 256, 64, 8, 16), - PARAMS(nchw, nchw, 0.f, 1, 1, 1, 1), - PARAMS(nchw, nchw, 0.f, 3, 5, 7, 11) -); - -INST_TEST_CASE(Simple_NCHW, - PARAMS(nchw, nchw, 0.1f, 2, 8, 4, 4), - PARAMS(nchw, nchw, 0.1f, 2, 16, 4, 4), - PARAMS(nchw, nchw, 0.1f, 2, 16, 8, 8), - PARAMS(nchw, nchw, 0.1f, 2, 16, 16, 8), - PARAMS(nchw, nchw, 0.1f, 2, 16, 10, 8), - PARAMS(nchw, nchw, 0.1f, 10, 10, 10, 10), - PARAMS(nchw, nchw, 0.1f, 256, 64, 8, 16), - PARAMS(nchw, nchw, 0.1f, 1, 1, 1, 1), - PARAMS(nchw, nchw, 0.1f, 3, 5, 7, 11) -); - -INST_TEST_CASE(Simple, - PARAMS(nchw, nChw8c, 0.1f, 2, 8, 4, 4), - PARAMS(nChw8c, nchw, 0.1f, 2, 16, 4, 4), - PARAMS(nchw, nchw, 0.1f, 2, 16, 8, 8), - PARAMS(nChw8c, nChw8c, 0.1f, 2, 16, 16, 8), - PARAMS(nhwc, nchw, 0.1f, 2, 16, 10, 8), - PARAMS(nchw, nhwc, 0.1f, 10, 10, 10, 10) -); - -INST_TEST_CASE(AlexNet_NCHW, - PARAMS(nchw, nchw, 0.f, 2, 96, 55, 55), - PARAMS(nchw, nchw, 0.f, 2, 256, 27, 27), - PARAMS(nchw, nchw, 0.f, 2, 384, 13, 13) -); - -} diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_reorder.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_reorder.cpp index e182e91..d4b5fbe 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_reorder.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_reorder.cpp @@ -29,7 +29,7 @@ inline void check_reorder(const memory::desc &md_i, const memory::desc &md_o, const data_i_t *src, const data_o_t *dst) { const int ndims = md_i.data.ndims; - const int *dims = md_i.data.dims; + const ptrdiff_t *dims = md_i.data.dims; const size_t nelems = std::accumulate( dims, dims + ndims, size_t(1), std::multiplies()); @@ -333,7 +333,11 @@ TEST_P(reorder_simple_test_weights_f32_f32_1, TestsReorder) { } INSTANTIATE_TEST_CASE_P(TestReorder, reorder_simple_test_weights_f32_f32_1, ::testing::Values( cfg_f32{eng::cpu, 
fmt::goihw, fmt::Goihw16g, {32, 32, 32, 3, 3}}, - cfg_f32{eng::cpu, fmt::Goihw16g, fmt::goihw, {32, 32, 32, 3, 3}} + cfg_f32{eng::cpu, fmt::Goihw16g, fmt::goihw, {32, 32, 32, 3, 3}}, + cfg_f32{eng::cpu, fmt::oihw, fmt::iohw, {32, 32, 3, 3}}, + cfg_f32{eng::cpu, fmt::iohw, fmt::oihw, {32, 32, 3, 3}}, + cfg_f32{eng::cpu, fmt::goihw, fmt::giohw, {2, 32, 32, 3, 3}}, + cfg_f32{eng::cpu, fmt::giohw, fmt::goihw, {2, 32, 32, 3, 3}} ) ); diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_rnn_forward.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_rnn_forward.cpp new file mode 100644 index 0000000..a0614c3 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_rnn_forward.cpp @@ -0,0 +1,243 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include +#include + +#include "gtest/gtest.h" +#include "mkldnn_test_common.hpp" + +#include "mkldnn.hpp" + +namespace mkldnn { + +struct test_rnn_sizes_t { + test_rnn_sizes_t( + int l, int d, int t, int mb, + int slc, int sic, int dlc, int dic) : + l(l), d(d), t(t), mb(mb), + slc(slc), sic(sic), dlc(dlc), dic(dic) {} + int l, d; + int t; + int mb; + int slc, sic, dlc, dic; +}; + +struct test_rnn_formats_t { + mkldnn::memory::format src_layer_fmt; + mkldnn::memory::format src_iter_fmt; + mkldnn::memory::format weights_layer_fmt; + mkldnn::memory::format weights_iter_fmt; + mkldnn::memory::format bias_fmt; + mkldnn::memory::format dst_layer_fmt; + mkldnn::memory::format dst_iter_fmt; +}; + +struct test_rnn_params_t { + const mkldnn::engine::kind engine_kind; + mkldnn::algorithm aalgorithm; + mkldnn::algorithm activation; + mkldnn::rnn_direction direction; + test_rnn_formats_t fmts; + test_rnn_sizes_t sizes; + bool expect_to_fail; + mkldnn_status_t expected_status; +}; + +// We assume uniform data type accross tensors for now +template +class rnn_forward_test + : public ::testing::TestWithParam { +protected: + virtual void SetUp() { + auto p = ::testing::TestWithParam::GetParam(); + catch_expected_failures([=](){Test();}, p.expect_to_fail, + p.expected_status, false); + } + + void Test() { + auto p = ::testing::TestWithParam::GetParam(); + ASSERT_TRUE(p.engine_kind == engine::kind::cpu); + auto eng = engine(p.engine_kind, 0); + //@todo check algorithm is one of the supported by RNN + //ASSERT_EQ(p.aalgorithm, algorithm::vanilla_lstm); + + // Initialize the data + memory::data_type prec = data_traits::data_type; + auto dims = p.sizes; + auto t = dims.t, mb = dims.mb, l = dims.l, d = dims.d; + auto slc = dims.slc, sic = dims.sic, dlc = dims.dlc, dic = dims.dic; + int s, g; + + switch (p.aalgorithm) { + case vanilla_lstm: + g = 4; s = 2; break; + case vanilla_gru: + case gru_linear_before_reset: + g = 3; s = 1; break; + default: + g = 1; s = 1; break; + }; + + mkldnn::memory::dims weights_layer_dims = {l, d, slc, g, dic}; + 
mkldnn::memory::dims weights_iter_dims = {l, d, sic, g, dic}; + mkldnn::memory::dims bias_dims = {l, d, g, dic}; + mkldnn::memory::dims src_layer_dims = {t, mb, slc}; + mkldnn::memory::dims src_iter_dims = {l, d, s, mb, sic}; + mkldnn::memory::dims dst_layer_dims = {t, mb, dlc}; + mkldnn::memory::dims dst_iter_dims = {l, d, s, mb, dic}; + + auto weights_layer_md_any = memory::desc({weights_layer_dims}, prec, memory::format::any); + auto weights_iter_md_any = memory::desc({weights_iter_dims}, prec, memory::format::any); + auto bias_md_any = memory::desc({bias_dims}, prec, memory::format::any); + auto src_layer_md_any = memory::desc({src_layer_dims}, prec, memory::format::any); + auto src_iter_md_any = memory::desc({src_iter_dims}, prec, memory::format::any); + auto dst_layer_md_any = memory::desc({dst_layer_dims}, prec, memory::format::any); + auto dst_iter_md_any = memory::desc({dst_iter_dims}, prec, memory::format::any); + + auto weights_layer_md_tgt = memory::desc({weights_layer_dims}, prec, p.fmts.weights_layer_fmt); + auto weights_iter_md_tgt = memory::desc({weights_iter_dims}, prec, p.fmts.weights_iter_fmt); + auto bias_md_tgt = memory::desc({bias_dims}, prec, p.fmts.bias_fmt); + auto src_layer_md_tgt = memory::desc({src_layer_dims}, prec, p.fmts.src_layer_fmt); + auto src_iter_md_tgt = memory::desc({src_iter_dims}, prec, p.fmts.src_iter_fmt); + auto dst_layer_md_tgt = memory::desc({dst_layer_dims}, prec, p.fmts.dst_layer_fmt); + auto dst_iter_md_tgt = memory::desc({dst_iter_dims}, prec, p.fmts.dst_iter_fmt); + + + // Create the reference descriptor + rnn_cell::desc cell(p.aalgorithm, p.activation); + auto direction = p.direction; + + rnn_forward::desc ref_desc(prop_kind::forward_inference, cell, + direction, src_layer_md_any, src_iter_md_any, + weights_layer_md_any, weights_iter_md_any, bias_md_any, + dst_layer_md_any, dst_iter_md_any); + auto ref_prim_desc = rnn_forward::primitive_desc(ref_desc, eng); + + // Query the descriptor for memory descriptors + auto weights_layer_md_ref = ref_prim_desc.weights_layer_primitive_desc().desc(); + auto weights_iter_md_ref = ref_prim_desc.weights_iter_primitive_desc().desc(); + auto bias_md_ref = ref_prim_desc.bias_primitive_desc().desc(); + auto src_layer_md_ref = ref_prim_desc.src_layer_primitive_desc().desc(); + auto src_iter_md_ref = ref_prim_desc.src_iter_primitive_desc().desc(); + auto dst_layer_md_ref = ref_prim_desc.dst_layer_primitive_desc().desc(); + auto dst_iter_md_ref = ref_prim_desc.dst_iter_primitive_desc().desc(); + + auto are_equal_md = [](memory::desc a, memory::desc b, engine eng){ + return memory::primitive_desc(a, eng) + == memory::primitive_desc(b, eng); + }; + + bool skip_test = + are_equal_md(weights_layer_md_ref, weights_layer_md_tgt, eng) + && are_equal_md(weights_iter_md_ref, weights_iter_md_tgt, eng) + && are_equal_md(bias_md_ref, bias_md_tgt, eng) + && are_equal_md(src_layer_md_ref, src_layer_md_tgt, eng) + && are_equal_md(src_iter_md_ref, src_iter_md_tgt, eng) + && are_equal_md(dst_layer_md_ref, dst_layer_md_tgt, eng) + && are_equal_md(dst_iter_md_ref, dst_iter_md_tgt, eng); + + if (skip_test) return; + + /* initialize data */ + auto weights_layer_ref = memory({weights_layer_md_ref, eng}); + auto weights_iter_ref = memory({weights_iter_md_ref, eng}); + auto bias_ref = memory({bias_md_ref, eng}); + auto src_layer_ref = memory({src_layer_md_ref, eng}); + auto src_iter_ref = memory({src_iter_md_ref, eng}); + auto dst_layer_ref = memory({dst_layer_md_ref, eng}); + auto dst_iter_ref = memory({dst_iter_md_ref, eng}); + + 
auto weights_layer_tgt = memory({weights_layer_md_tgt, eng}); + auto weights_iter_tgt = memory({weights_iter_md_tgt, eng}); + auto bias_tgt = memory({bias_md_tgt, eng}); + auto src_layer_tgt = memory({src_layer_md_tgt, eng}); + auto src_iter_tgt = memory({src_iter_md_tgt, eng}); + auto dst_layer_tgt = memory({dst_layer_md_tgt, eng}); + auto dst_iter_tgt = memory({dst_iter_md_tgt, eng}); + + auto init_tensor = [&](memory a, memory b) { + auto a_ptr = static_cast(a.get_data_handle()); + auto desc = a.get_primitive_desc().desc(); + auto a_dims = desc.data.dims; + auto a_ndims = desc.data.ndims; + auto n_elems = std::accumulate(a_dims, a_dims + a_ndims, size_t(1), + std::multiplies()); + for(size_t i = 0; i < n_elems; i++) + a_ptr[map_index(desc, i, false)] = i; + stream(stream::kind::eager).submit({reorder(a, b)}).wait(); + }; + + init_tensor(weights_layer_ref, weights_layer_tgt); + init_tensor(weights_iter_ref, weights_iter_tgt); + init_tensor(bias_ref, bias_tgt); + init_tensor(src_layer_ref, src_layer_tgt); + init_tensor(src_iter_ref, src_iter_tgt); + + // run the non packed version + auto prim_ref = rnn_forward(ref_prim_desc, src_layer_ref, src_iter_ref, + weights_layer_ref, weights_iter_ref, bias_ref, + dst_layer_ref, dst_iter_ref, null_memory(eng)); + stream(stream::kind::eager).submit({prim_ref}).wait(); + + // run the packed version + rnn_forward::desc tgt_desc(prop_kind::forward_inference, cell, + direction, src_layer_md_tgt, src_iter_md_tgt, + weights_layer_md_tgt, weights_iter_md_tgt, bias_md_tgt, + dst_layer_md_tgt, dst_iter_md_tgt); + auto tgt_prim_desc = rnn_forward::primitive_desc(tgt_desc, eng); + auto prim_tgt = rnn_forward(tgt_prim_desc, src_layer_tgt, src_iter_tgt, + weights_layer_tgt, weights_iter_tgt, bias_tgt, + dst_layer_tgt, dst_iter_tgt, null_memory(eng)); + stream(stream::kind::eager).submit({prim_tgt}).wait(); + + // compare dst_layer and dst_iter + compare_data(dst_layer_ref, dst_layer_tgt, 1e-5); + compare_data(dst_iter_ref, dst_iter_tgt, 1e-5); + } +}; + + using eng = engine::kind; + using fmt = memory::format; + using alg = algorithm; + using dir = rnn_direction; + using rnn_forward_test_f32 = rnn_forward_test; + using cfg_f32 = test_rnn_params_t; + +TEST_P(rnn_forward_test_f32, TestsRnn) { } +INSTANTIATE_TEST_CASE_P(TestRnn, rnn_forward_test_f32, + ::testing::Values( + cfg_f32{eng::cpu, alg::vanilla_rnn, alg::eltwise_tanh, dir::unidirectional_left2right, + {fmt::tnc, fmt::ldsnc, fmt::ldigo, fmt::ldigo, fmt::ldgo, fmt::tnc, fmt::ldsnc}, + test_rnn_sizes_t(1, 1, 10, 16, 100, 100, 100, 100)}, + cfg_f32{eng::cpu, alg::vanilla_lstm, alg::eltwise_tanh, dir::unidirectional_left2right, + {fmt::tnc, fmt::ldsnc, fmt::ldigo, fmt::ldigo, fmt::ldgo, fmt::tnc, fmt::ldsnc}, + test_rnn_sizes_t(1, 1, 10, 16, 100, 100, 100, 100)}, + /* Check for invalid parameters: unsupported unrolling */ + cfg_f32{eng::cpu, alg::vanilla_rnn, alg::eltwise_tanh, dir::unidirectional_left2right, + {fmt::tnc, fmt::ldsnc, fmt::ldigo, fmt::ldigo, fmt::ldgo, fmt::tnc, fmt::ldsnc}, + test_rnn_sizes_t(2, 1, 10, 16, 200, 100, 100, 100), true, mkldnn_invalid_arguments}, + cfg_f32{eng::cpu, alg::vanilla_rnn, alg::eltwise_tanh, dir::unidirectional_left2right, + {fmt::tnc, fmt::ldsnc, fmt::ldigo, fmt::ldigo, fmt::ldgo, fmt::tnc, fmt::ldsnc}, + test_rnn_sizes_t(2, 1, 10, 16, 100, 200, 100, 100), true, mkldnn_invalid_arguments}, + /* Check for invalid parameters: inconsistent dimensions */ + cfg_f32{eng::cpu, alg::vanilla_rnn, alg::eltwise_tanh, dir::unidirectional_left2right, + {fmt::tnc, fmt::ldsnc, 
fmt::ldigo, fmt::ldigo, fmt::ldgo, fmt::tnc, fmt::ldsnc}, + test_rnn_sizes_t(2, 1, 10, 16, 100, 100, 50, 100), true, mkldnn_invalid_arguments} + ) + ); + +} diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_softmax_forward.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_softmax_forward.cpp index e938da6..d9f6d68 100644 --- a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_softmax_forward.cpp +++ b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_softmax_forward.cpp @@ -181,6 +181,7 @@ protected: check_softmax_fwd(p.aprop_kind, src, dst, p.axis); }; + test_with_given_fill(-50, 50); test_with_given_fill(-200, 1); test_with_given_fill( 0, 1); test_with_given_fill( 200, 1); @@ -216,5 +217,9 @@ INSTANTIATE_TEST_CASE_P(TestSoftmaxForward, softmax_forward_test_float, softmax_fwd_test_params_float{prop_kind::forward_scoring, engine::kind::cpu, memory::format::nc, {2, 1000}, 0}, softmax_fwd_test_params_float{prop_kind::forward_scoring, - engine::kind::cpu, memory::format::nc, {2, 1000}, 1})); + engine::kind::cpu, memory::format::nc, {2, 1000}, 1}, + softmax_fwd_test_params_float{prop_kind::forward_scoring, + engine::kind::cpu, memory::format::nc, {1, 256}, 1}, + softmax_fwd_test_params_float{prop_kind::forward_scoring, + engine::kind::cpu, memory::format::nc, {1, 13}, 1})); } diff --git a/inference-engine/thirdparty/mkl-dnn/tests/other/subproject/CMakeLists.txt b/inference-engine/thirdparty/mkl-dnn/tests/other/subproject/CMakeLists.txt new file mode 100644 index 0000000..392a8b3 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/tests/other/subproject/CMakeLists.txt @@ -0,0 +1,33 @@ +#=============================================================================== +# Copyright 2018 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#=============================================================================== + +# Test Intel MKL-DNN for embeddability +# by imitating a project that includes the library +# +# To test run: +# mkdir -p build && cd build && cmake .. && make -j && ./project_app + +cmake_minimum_required(VERSION 2.8) + +set(PROJECT_NAME "Project") + +# include Intel MKL-DNN +set(MKLDNN_DIR "../../..") +add_subdirectory(${MKLDNN_DIR} mkl-dnn) +include_directories(${MKLDNN_DIR}/include) + +add_executable(project_app main.c) +target_link_libraries(project_app mkldnn) diff --git a/inference-engine/thirdparty/mkl-dnn/tests/other/subproject/main.c b/inference-engine/thirdparty/mkl-dnn/tests/other/subproject/main.c new file mode 100644 index 0000000..5d23650 --- /dev/null +++ b/inference-engine/thirdparty/mkl-dnn/tests/other/subproject/main.c @@ -0,0 +1,26 @@ +/******************************************************************************* +* Copyright 2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include +#include "mkldnn.h" + +int main() { + printf("mkldnn_version: %d.%d.%d\n", + MKLDNN_VERSION_MAJOR, MKLDNN_VERSION_MINOR, MKLDNN_VERSION_PATCH); + printf("mkldnn_memory_desc_init = %p, sizeof(mkldnn_memory_desc_t) = %d\n", + mkldnn_memory_desc_init, (int)sizeof(mkldnn_memory_desc_t)); + return 0; +} diff --git a/inference-engine/thirdparty/mkldnn.cmake b/inference-engine/thirdparty/mkldnn.cmake index 0cf5045..d90717c 100644 --- a/inference-engine/thirdparty/mkldnn.cmake +++ b/inference-engine/thirdparty/mkldnn.cmake @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright (c) 2016 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,6 +22,34 @@ set (CMAKE_CXX_STANDARD 11) set (CMAKE_CXX_STANDARD_REQUIRED ON) +set(version_cmake_included true) + +set(TARGET mkldnn) +set(MKLDNN_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/mkl-dnn) + +string(REPLACE "." ";" VERSION_LIST "0.18.0") +list(GET VERSION_LIST 0 MKLDNN_VERSION_MAJOR) +list(GET VERSION_LIST 1 MKLDNN_VERSION_MINOR) +list(GET VERSION_LIST 2 MKLDNN_VERSION_PATCH) + +find_package(Git) +if (GIT_FOUND) + execute_process(COMMAND ${GIT_EXECUTABLE} log -1 --format=%H + WORKING_DIRECTORY ${MKLDNN_ROOT} + RESULT_VARIABLE RESULT + OUTPUT_VARIABLE MKLDNN_VERSION_HASH + OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() + +if(NOT GIT_FOUND OR RESULT) + set(MKLDNN_VERSION_HASH "N/A") +endif() + +configure_file( + "${MKLDNN_ROOT}/include/mkldnn_version.h.in" + "${CMAKE_BINARY_DIR}/include/mkldnn_version.h" +) + function(detect_mkl LIBNAME) message(STATUS "Detecting Intel(R) MKL: trying ${LIBNAME}") find_path(MKLINC mkl_cblas.h ${MKL}/include) @@ -51,9 +79,6 @@ function(detect_mkl LIBNAME) endif() endfunction() -set(TARGET mkldnn) -set(MKLDNN_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/mkl-dnn) - if (THREADING STREQUAL "TBB") add_definitions(-DMKLDNN_THR=MKLDNN_THR_TBB) elseif (THREADING STREQUAL "OMP") @@ -76,7 +101,9 @@ include_directories( ${MKLDNN_ROOT}/include ${MKLDNN_ROOT}/src ${MKLDNN_ROOT}/src/common + ${MKLDNN_ROOT}/src/cpu/ ${MKLDNN_ROOT}/src/cpu/xbyak + ${CMAKE_BINARY_DIR}/include/ ) if(WIN32) @@ -88,6 +115,23 @@ if(WIN32) endif() endif() +# to make build time reasonable, don't use optimizations for s8u8s32 Xbyak +# kernels +file(GLOB FILES_WITHNO_OPT + ${MKLDNN_ROOT}/src/cpu/gemm/s8x8s32/jit_avx512_core_kernel_b0_gemm_s8u8s32_kern.cpp + ${MKLDNN_ROOT}/src/cpu/gemm/s8x8s32/jit_avx512_core_kernel_gemm_s8u8s32_kern.cpp + ${MKLDNN_ROOT}/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_an_kern.cpp + ${MKLDNN_ROOT}/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_at_kern.cpp + ${MKLDNN_ROOT}/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_bn_kern.cpp + ${MKLDNN_ROOT}/src/cpu/gemm/s8x8s32/jit_avx512_core_u8_copy_bt_kern.cpp) +if(WIN32 AND NOT MINGW) + set_source_files_properties(${FILES_WITHNO_OPT} + PROPERTIES COMPILE_FLAGS "/Od") +else() + 
set_source_files_properties(${FILES_WITHNO_OPT}
+            PROPERTIES COMPILE_FLAGS "-O0 -U_FORTIFY_SOURCE")
+endif()
+
 add_library(${TARGET} STATIC ${HDR} ${SRC})
 set_ie_threading_interface_for(${TARGET})
@@ -98,7 +142,7 @@ if(GEMM STREQUAL "OPENBLAS")
 list(APPEND ${TARGET}_LINKER_LIBS ${BLAS_LIBRARIES})
 elseif (GEMM STREQUAL "MKL")
 ## enable cblas_gemm from mlkml package
-if(WIN32)
+if(WIN32 OR APPLE)
 detect_mkl("mklml")
 else()
 if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
@@ -113,4 +157,4 @@ endif()
 endif()
 ## enable jit_gemm from mlk-dnn
-target_link_libraries(${TARGET} PRIVATE ${${TARGET}_LINKER_LIBS})
\ No newline at end of file
+target_link_libraries(${TARGET} PRIVATE ${${TARGET}_LINKER_LIBS})
diff --git a/inference-engine/tools/accuracy_checker_tool/README.md b/inference-engine/tools/accuracy_checker_tool/README.md
new file mode 100644
index 0000000..8dc6511
--- /dev/null
+++ b/inference-engine/tools/accuracy_checker_tool/README.md
@@ -0,0 +1,163 @@
+# Deep Learning accuracy validation framework
+
+#### Usage
+
+You may test your installation and get familiar with the accuracy checker by running the [sample][sample-readme].
+
+Once you have installed the accuracy checker, you can evaluate your configurations with:
+
+```sh
+python3 accuracy_check.py -c path/to/configuration_file -m /path/to/models -s /path/to/source/data -a /path/to/annotation
+```
+
+All relative paths in config files will be prefixed with the values specified in the command line:
+
+- `-c, --config` path to the configuration file.
+- `-m, --models` specifies the directory in which the models and weights declared in the config file will be searched.
+- `-s, --source` specifies the directory in which input images will be searched.
+- `-a, --annotations` specifies the directory in which annotation and meta files will be searched.
+
+You may refer to `-h, --help` for the full list of command-line options. Some optional arguments are:
+
+- `-e, --extensions` directory with InferenceEngine extensions.
+- `-b, --bitstreams` directory with bitstreams (for Inference Engine with the FPGA plugin).
+- `-C, --converted_models` directory to store Model Optimizer converted models (used for the DLSDK launcher only).
+- `-tf, --target_framework` framework for inference.
+- `-td, --target_devices` devices for inference. You can specify several devices, using spaces as delimiters.
+
+#### Configuration
+
+There is a config file which declares the validation process. Every validated model has to have its own entry in the `models` list, with a distinct `name` and the other properties described below.
+
+There is also a definitions file, which declares global options shared across all models. The config file has priority over the definitions file.
+
+example:
+
+```yaml
+models:
+- name: model_name
+  launchers:
+    - framework: caffe
+      model: public/alexnet/caffe/bvlc_alexnet.prototxt
+      weights: public/alexnet/caffe/bvlc_alexnet.caffemodel
+      adapter: classification
+      batch: 128
+  datasets:
+    - name: dataset_name
+```
+
+### Launchers
+
+A launcher is a description of how your model should be executed. Each launcher configuration starts with setting the `framework` name. Currently *caffe* and *dlsdk* are supported. Launcher descriptions can differ between frameworks.
+
+Please view:
+
+- [how to configure Caffe launcher][caffe-launcher-configuration].
+- [how to configure DLSDK launcher][dlsdk-launcher-configuration].
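+
+As noted above, the local config file takes priority over the definitions file. Conceptually, the precedence works like a dictionary merge in which local values win; the sketch below is an illustration only, assuming a simple top-level merge (the tool's real merging logic is more involved):
+
+```python
+import yaml  # PyYAML
+
+def load_merged(config_path, definitions_path):
+    """Merge global definitions into a local config; local values win."""
+    with open(definitions_path) as f:
+        merged = yaml.safe_load(f) or {}
+    with open(config_path) as f:
+        local = yaml.safe_load(f) or {}
+    merged.update(local)  # config file has priority over definitions file
+    return merged
+
+settings = load_merged("config.yml", "definitions.yml")  # hypothetical paths
+print(sorted(settings))
+```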
+
+### Datasets
+
+A dataset entry describes the data on which the model should be evaluated, all required preprocessing and postprocessing/filtering steps, and the metrics that will be used for evaluation.
+
+If your dataset is a well-known competition problem (COCO, Pascal VOC, ...) and/or can potentially be reused for other models, it is reasonable to declare it in a global configuration file (the *definition* file). This way, your local configuration file can provide only the `name`, and all required steps will be picked up from the global one. To pass the path to this global configuration, use the `--definition` argument of the CLI.
+
+Each dataset must have:
+
+- `name` - a unique identifier of your model/topology.
+- `data_source`: path to the directory where input data is stored.
+- `metrics`: list of metrics that should be computed.
+
+And optionally:
+- `preprocessing`: list of preprocessing steps applied to input data. If you want the calculated metrics to match the reported ones, you must reproduce the preprocessing from the canonical paper of your topology, or ask the topology author about the required steps if it is an ICV topology.
+- `postprocessing`: list of postprocessing steps.
+- `reader`: approach for data reading. You can specify `opencv_imread` or `pillow_imread` for reading images and `opencv_capture` for reading frames from video. The default reader is `opencv_imread`.
+
+The dataset entry must also contain annotation-related data.
+You can convert the annotation in place using:
+- `annotation_conversion`: parameters for annotation conversion
+
+
+or use an existing annotation file and dataset meta:
+- `annotation` - path to the annotation file. You must **convert the annotation to the representation of the dataset problem first**; you may choose one of the converters from *annotation-converters* if there is already a converter for your dataset, or write your own.
+- `dataset_meta`: path to the metadata file (generated by the converter).
+You can find more detailed information about annotation conversion [here][converters].
+
+example of dataset definition:
+
+```yaml
+- name: dataset_name
+  annotation: annotation.pickle
+  data_source: images_folder
+
+  preprocessing:
+    - type: resize
+      dst_width: 256
+      dst_height: 256
+
+    - type: normalization
+      mean: imagenet
+
+    - type: crop
+      dst_width: 227
+      dst_height: 227
+
+  metrics:
+    - type: accuracy
+```
+
+### Preprocessing, Metrics, Postprocessing
+
+Each entry of preprocessing, metrics, and postprocessing must have a `type` field; the other options are specific to the type. If you do not provide the other options, they will be picked up from the *definitions* file.
+
+You may find the following instructions useful:
+
+- [how to use preprocessings][preprocessors].
+- [how to use postprocessings][postprocessors].
+- [how to use metrics][metrics].
+
+You may optionally provide a `reference` field for a metric if you want the calculated metric to be tested against a specific value (e.g. the one reported in the canonical paper).
+
+Some metrics support providing vector results (e.g. mAP is able to return the average precision for each detection class). You can change the view mode for metric results using `presenter` (e.g. `print_vector`, `print_scalar`).
+
+example:
+
+```yaml
+metrics:
+- type: accuracy
+  top_k: 5
+  reference: 86.43
+  threshold: 0.005
+```
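+
+To make the `reference`/`threshold` semantics concrete, the following sketch shows the comparison conceptually (an illustration only; the tool's own check may differ in details):
+
+```python
+def within_reference(measured: float, reference: float, threshold: float) -> bool:
+    """True if a measured metric matches its reference within the threshold."""
+    return abs(measured - reference) <= threshold
+
+# A measured accuracy@top5 of 86.428 vs. the reference 86.43 above:
+print(within_reference(86.428, 86.43, 0.005))  # True
+```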
+
+### Testing new models
+
+The typical workflow for testing a new model includes:
+
+1. Convert the annotation of your dataset. Use one of the converters from *annotation-converters*, or write your own if there is no converter for your dataset. You can find detailed instructions on how to use the converters [here][converters].
+
+```bash
+python3 convert_annotation.py converter --converter_specific_parameter --output_dir data/annotations
+```
+
+1. Choose one of the *adapters* or write your own. An adapter converts the raw output produced by the framework into a high-level, problem-specific representation (e.g. *ClassificationPrediction*, *DetectionPrediction*, etc.).
+1. Reproduce the preprocessing, metrics, and postprocessing from the canonical paper.
+1. Create an entry in the config file and execute the tool (see the sketch below).
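+
+A minimal driver for this workflow might chain the two command-line tools via `subprocess` (a sketch only; the converter name, its parameter, and all paths are hypothetical placeholders taken from the examples above):
+
+```python
+import subprocess
+
+# Step 1: convert the dataset annotation (placeholder converter and flags).
+subprocess.run([
+    "python3", "convert_annotation.py", "converter",
+    "--converter_specific_parameter",
+    "--output_dir", "data/annotations",
+], check=True)
+
+# Step 4: run the evaluation against the new config entry (placeholder paths).
+subprocess.run([
+    "python3", "accuracy_check.py",
+    "-c", "path/to/configuration_file",
+    "-m", "/path/to/models",
+    "-s", "/path/to/source/data",
+    "-a", "data/annotations",
+], check=True)
+```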
+
+[sample-readme]: ./tools/accuracy_checker/sample/README.md
+[preprocessors]: ./tools/accuracy_checker/accuracy_checker/preprocessor/README.md
+[postprocessors]: ./tools/accuracy_checker/accuracy_checker/postprocessor/README.md
+[metrics]: ./tools/accuracy_checker/accuracy_checker/metrics/README.md
+[converters]: ./tools/accuracy_checker/accuracy_checker/annotation_converters/README.md
+[caffe-launcher-configuration]: ./tools/accuracy_checker/accuracy_checker/launcher/caffe_launcher_readme.md
+[dlsdk-launcher-configuration]: ./tools/accuracy_checker/accuracy_checker/launcher/dlsdk_launcher_readme.md
diff --git a/inference-engine/tools/accuracy_checker_tool/accuracy_check.py b/inference-engine/tools/accuracy_checker_tool/accuracy_check.py
new file mode 100644
index 0000000..3d4fc2b
--- /dev/null
+++ b/inference-engine/tools/accuracy_checker_tool/accuracy_check.py
@@ -0,0 +1,19 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from openvino.tools.accuracy_checker.accuracy_checker.main import main
+
+main()
diff --git a/inference-engine/tools/accuracy_checker_tool/convert_annotation.py b/inference-engine/tools/accuracy_checker_tool/convert_annotation.py
new file mode 100644
index 0000000..5313d71
--- /dev/null
+++ b/inference-engine/tools/accuracy_checker_tool/convert_annotation.py
@@ -0,0 +1,20 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from openvino.tools.accuracy_checker.accuracy_checker.annotation_converters.convert import main
+
+if __name__ == '__main__':
+    main()
diff --git a/inference-engine/tools/benchmark_tool/README.md b/inference-engine/tools/benchmark_tool/README.md
new file mode 100644
index 0000000..bf11be2
--- /dev/null
+++ b/inference-engine/tools/benchmark_tool/README.md
@@ -0,0 +1,16 @@
+# OpenVINO™ Benchmark Tool
+Inference Engine Benchmark Tool is a Python\* command-line tool, which measures latency in synchronous mode.
+
+Please refer to https://docs.openvinotoolkit.org for details.
+
+## Usage
+
+In general, the Benchmark Tool is configured in the same way as the Accuracy Checker. You can also use additional command-line arguments to define benchmark-specific parameters:
+
+| Argument                          | Type    | Description                                              |
+| --------------------------------- | ------- | -------------------------------------------------------- |
+| -c, --config                      | string  | Required. Path to the YML file with local configuration  |
+| -ic, --benchmark_iterations_count | integer | Optional. Benchmark iterations count (default is 1000)   |
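+
+The tool reports the average latency of one synchronous inference. If you need a rough throughput figure instead, it can be derived from that latency (a sketch, assuming the latency is measured in seconds per request):
+
+```python
+latency_s = 0.0123  # hypothetical latency reported by the tool, in seconds
+throughput_fps = 1.0 / latency_s
+print("~{:.1f} inferences/s".format(throughput_fps))  # ~81.3 inferences/s
+```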
+
+## Hardware requirements
+Hardware requirements depend on the model. Typically, public models require at least 16 GB of RAM, regardless of the operating system.
\ No newline at end of file
diff --git a/inference-engine/tools/benchmark_tool/benchmark.py b/inference-engine/tools/benchmark_tool/benchmark.py
new file mode 100644
index 0000000..0e5280f
--- /dev/null
+++ b/inference-engine/tools/benchmark_tool/benchmark.py
@@ -0,0 +1,22 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import openvino.tools.benchmark as benchmark
+
+if __name__ == '__main__':
+    config = benchmark.CommandLineReader.read()
+    result = benchmark.Benchmark(config).run()
+    print("{0}: {1:.4} ms".format(config.model, result.latency * 1000.0))
\ No newline at end of file
diff --git a/inference-engine/tools/calibration_tool/README.md b/inference-engine/tools/calibration_tool/README.md
new file mode 100644
index 0000000..6402705
--- /dev/null
+++ b/inference-engine/tools/calibration_tool/README.md
@@ -0,0 +1,149 @@
+# OpenVINO™ Calibration Tool
+Inference Engine Calibration Tool calibrates a given FP32 model so that you can run the calibrated model in low-precision 8-bit integer mode while keeping the input data of this model in the original precision.
+Inference Engine Calibration Tool is a Python\* command-line tool, which imports Python types from the `openvino.tools.calibration` package.
+
+Please refer to https://docs.openvinotoolkit.org for details.
+
+## Hardware requirements
+Hardware requirements depend on the model. Typically, public models require at least 16 GB of RAM and at least 30 GB of free drive space, regardless of the operating system. A temporary directory is used to cache layer output during calibration.
+
+## Usage
+The Calibration Tool is configured in the same way as the Accuracy Checker. You can also use additional command-line arguments to define calibration-specific parameters.
+
+### Command-Line Arguments for the Accuracy Checker Tool reused in Calibration Tool
+| Argument                                     | Type   | Description                                              |
+| -------------------------------------------- | ------ | ------------------------------------------------------- |
+| -c, --config                                 | string | Required. Path to the YML file with local configuration |
+| -d, --definitions                            | string | Optional. Path to the YML file with definitions         |
+| -m, --models                                 | string | Optional. Prefix path to the models and weights         |
+| -s, --source                                 | string | Optional. Prefix path to the data source                |
+| -a, --annotations                            | string | Optional. Prefix path to the converted annotations and datasets meta data |
+| -e, --extensions                             | string | Optional. Prefix path to the extensions folder          |
+| --cpu_extensions_mode, --cpu-extensions-mode | string | Optional. Preferred set of processor instructions for automatically searching the CPU extension library: `avx2` or `sse4` |
+| -C, --converted_models, --converted-models   | string | Optional. Directory to store Model Optimizer converted models. Used for DLSDK launcher only |
+| -M, --model_optimizer, --model-optimizer     | string | Optional. Path to the Model Optimizer directory          |
+| --tf_custom_op_config_dir, --tf-custom-op-config-dir | string | Optional. Path to the directory with TensorFlow custom operation configuration files for the Model Optimizer |
+| --tf_obj_detection_api_pipeline_config_path, --tf-obj-detection-api-pipeline-config-path | string | Optional. Path to the directory with TensorFlow object detection API pipeline configuration files for the Model Optimizer |
+| --progress                                   | string | Optional. Progress reporter: `bar`, `print` or `None`   |
+| -td, --target_devices, --target-devices      | string | Optional. Space-separated list of devices for inference |
+| -tt, --target_tags, --target-tags            | string | Optional. Space-separated list of launcher tags for inference |
+
+### Specific Command Line Arguments for Calibration Tool
+| Argument                          | Type    | Description                                               |
+| --------------------------------- | ------- | --------------------------------------------------------- |
+| -p, --precision                   | string  | Optional. Precision to calibrate. Default value is INT8   |
+| --ignore_layer_types, --ignore-layer-types | string | Optional. List of layer types to skip during quantization |
+| --ignore_layer_types_path, --ignore-layer-types-path | string | Optional. Path to a file listing layer types to ignore |
+| --ignore_layer_names, --ignore-layer-names | string | Optional. List of layer names to skip during quantization |
+| --ignore_layer_names_path, --ignore-layer-names-path | string | Optional. Path to a file listing layer names to ignore |
+| --batch_size, --batch-size        | integer | Optional. Batch size value. If not specified, the batch size value is determined from the IR |
+| -th, --threshold                  | float   | Optional. The accuracy drop of the quantized model should not exceed this threshold. Specified in percent, without the percent sign (default is 1%) |
+| -ic, --benchmark_iterations_count, --benchmark-iterations-count | integer | Optional. Benchmark iterations count (default is 1000) |
+| -mn, --metric_name, --metric-name | string  | Optional. Metric name used during calibration             |
+| -mt, --metric_type, --metric-type | string  | Optional. Metric type used during calibration             |
+| -o, --output_dir, --output-dir    | string  | Optional. Directory to store converted models. The original model directory is used if not defined |
+
+## Model calibration flow
+
+### Introduction
+The Calibration Tool reads the original FP32 model and the calibration dataset and creates a low-precision model. The low-precision model has two differences from the original model:
+1. Per-channel statistics are defined. The statistics hold the minimum and maximum values for each layer and each channel. Model statistics are stored in the Inference Engine intermediate representation (IR) file in XML format.
+2. The `quantization_level` layer attribute is defined. The attribute defines the precision used during inference.
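+
+Conceptually, the per-channel statistics from item 1 are just minimum and maximum values reduced over everything except the channel axis. The NumPy sketch below illustrates the idea (an illustration only, not the tool's implementation):
+
+```python
+import numpy as np
+
+# Hypothetical activations of one layer in NCHW layout.
+activations = np.random.rand(8, 16, 32, 32).astype(np.float32)
+
+# Reduce over batch and spatial axes; one (min, max) pair per channel.
+per_channel_min = activations.min(axis=(0, 2, 3))
+per_channel_max = activations.max(axis=(0, 2, 3))
+print(per_channel_min.shape, per_channel_max.shape)  # (16,) (16,)
+```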
+
+### Prerequisites
+* Model: Tensorflow\* Inception v1. You can download the model from here: https://github.com/tensorflow/models/tree/master/research/slim
+* Dataset: ImageNet. You can download ImageNet from here: http://www.image-net.org/download.php
+* YML configuration files: you can find the YML configuration files and the YML definition file used below in the `configs` directory:
+    - `definitions.yml` - definition file
+    - `inception_v1.yml` - configuration file for the Tensorflow\* Inception v1 model
+    - `ncf_config.yml` - configuration file for the NCF model in OpenVINO\* Inference Engine Intermediate Representation format
+    - `ssd_mobilenet_v1_coco.yml` - configuration file for the Tensorflow\* SSD Mobilenet v1 model
+    - `unet2d.yml` - configuration file for the UNet2D model in OpenVINO\* Inference Engine Intermediate Representation format
+
+If you have a custom topology with an unsupported accuracy metric or an unsupported custom dataset, you should implement the missing components in the `openvino.tools.accuracy_checker` Python\* package yourself. Refer to the `openvino.tools.accuracy_checker` documentation for how to implement metric and dataset support.
+
+The steps to calibrate and evaluate the resulting model are:
+- Step #1. Convert data annotation files
+- Optional step for low precision model performance estimation
+- Step #2. Calibration
+- Step #3. Result model evaluation
+
+An additional optional step is available before calibration to roughly estimate the possible INT8 performance.
+
+### Step #1. Convert data annotation files
+The calibration dataset is a subset of the training dataset. Use the Convert Annotation Tool to convert the ImageNet\* dataset to data annotation files readable by the Calibration Tool. Data annotation files describe the subset of images used during calibration. Command line:
+```sh
+python convert_annotation.py imagenet --annotation_file /datasets/ImageNet/val.txt --labels_file /datasets/ImageNet/synset_words.txt -ss 2000 -o ~/annotations -a imagenet.pickle -m imagenet.json
+```
+
+> **NOTE:** For simplicity, all command-line tools in the steps below use the same command-line arguments. In practice, the [Collect Statistics Tool](./inference-engine/tools/collect_statistics_tool/README.md) uses the calibration dataset, but the [Accuracy Checker Tool](./inference-engine/tools/accuracy_checker_tool/README.md) has to use the whole validation dataset.
+
+
+| Argument           | Type   | Description                                                                        |
+| -------------------| ------ | ---------------------------------------------------------------------------------- |
+| --config           | string | Path to the YML file with local configuration                                      |
+| -d                 | string | Path to the YML file with definitions                                              |
+| -M                 | string | Path to the Model Optimizer directory                                              |
+| --models           | string | Prefix path to the models and weights                                              |
+| --source           | string | Prefix path to the data source                                                     |
+| --annotations      | string | Prefix path to the converted annotations and datasets meta data                    |
+| --converted_models | string | Directory to store Model Optimizer converted models. Used for DLSDK launcher only  |
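+
+Conceptually, the `-ss 2000` option above subsamples the full annotation so that calibration runs on a small subset of the dataset. In spirit (an illustration only; the real record structure is defined by the converter):
+
+```python
+import random
+
+# Hypothetical list of annotation records produced by a converter.
+annotation_records = ["image_{:05d}".format(i) for i in range(50000)]
+
+random.seed(0)  # reproducible subset
+calibration_subset = random.sample(annotation_records, 2000)
+print(len(calibration_subset))  # 2000
+```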
+
+### Optional step for low precision model performance estimation
+Before calibration, you can roughly estimate low-precision performance with the [Collect Statistics Tool](./inference-engine/tools/collect_statistics_tool/README.md).
+
+The [Collect Statistics Tool](./inference-engine/tools/collect_statistics_tool/README.md) ignores the metric in the YML configuration file, but you can use the same command-line arguments.
+
+Command line:
+
+```sh
+python collect_statistics.py --config ~/inception_v1.yml -d ~/defenitions.yml -M /home/user/intel/openvino/deployment_tools/model_optimizer --models ~/models --source /media/user/calibration/datasets --annotations ~/annotations --converted_models ~/models
+```
+
+The resulting model has statistics which allow you to infer this model in INT8 precision. To measure performance, you can use the [Benchmark Tool](./inference-engine/tools/benchmark_tool/README.md).
+
+### Step #2. Calibration
+During the calibration process, the model is adjusted for efficient quantization and minimization of the accuracy drop on the calibration dataset. The Calibration Tool produces a calibrated model which will be executed in low-precision 8-bit quantized mode after loading into the CPU plugin.
+
+The [Calibration Tool](./inference-engine/tools/calibration_tool/README.md) has a flexible and extensible mechanism for enabling new datasets and metrics. Each network has its own dedicated network metric and the dataset on which the network was trained. Dataset descriptions and network metrics can be reused for different networks.
+
+To plug in a new dataset, you need to develop a YML file. To develop a new metric, you need to develop a Python\* module implementing the metric and describe it in YML. Please refer to the [Accuracy Checker Tool](./inference-engine/tools/accuracy_checker_tool/README.md) for details.
+
+
+Command line example:
+```sh
+python calibrate.py --config ~/inception_v1.yml --definition ~/defenitions.yml -M /home/user/intel/openvino/deployment_tools/model_optimizer --tf_custom_op_config_dir ~/tf_custom_op_configs --models ~/models --source /media/user/calibration/datasets --annotations ~/annotations
+```
+
+### Step #3. Result model evaluation
+After calibrating the model, it is worth evaluating the network accuracy on the whole validation set using the [Accuracy Checker Tool](./inference-engine/tools/accuracy_checker_tool/README.md).
+
+#### Step #3.1 Check accuracy
+Command line:
+```sh
+python accuracy_check.py --config ~/inception_v1.yml -d ~/defenitions.yml -M /home/user/intel/openvino/deployment_tools/model_optimizer --tf_custom_op_config_dir ~/tf_custom_op_configs --models ~/models --source /media/user/calibration/datasets --annotations ~/annotations -tf dlsdk -td CPU
+```
+
+#### Step #3.2 Check performance
+Use the `benchmark_app` command-line tool to measure latency and throughput in synchronous and asynchronous modes. Note that the `benchmark_app` command-line tool uses the converted OpenVINO\* Intermediate Representation model.
+
+Command line for synchronous mode:
+
+```sh
+./benchmark_app -i /inputImage.bmp -m /inception_v1.xml -d CPU -api sync
+```
+
+Command line for the asynchronous mode:
+```sh
+./benchmark_app -i /inputImage.bmp -m /inception_v1.xml -d CPU -api async
+```
+
+#### Optional step to check performance
+You can use the Python\* [Benchmark Tool](./inference-engine/tools/benchmark_tool/README.md) command-line tool to quickly check performance with the same command-line arguments and configuration YML files as for the [Calibration Tool](./inference-engine/tools/calibration_tool/README.md).
+ +Command line: +```sh +python benchmark.py --config ~/inception_v1.yml -d ~/defenitions.yml -M /home/user/intel/openvino/deployment_tools/model_optimizer --tf_custom_op_config_dir ~/tf_custom_op_configs --models ~/models --source /media/user/calibration/datasets --annotations ~/annotations --converted_models ~/models +``` + diff --git a/inference-engine/tools/calibration_tool/calibrate.py b/inference-engine/tools/calibration_tool/calibrate.py new file mode 100644 index 0000000..c3034bd --- /dev/null +++ b/inference-engine/tools/calibration_tool/calibrate.py @@ -0,0 +1,23 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import openvino.tools.calibration as calibration + +if __name__ == '__main__': + with calibration.CommandLineProcessor.process() as config: + network = calibration.Calibrator(config).run() + if network: + network.serialize(config.output_model) diff --git a/inference-engine/tools/calibration_tool/configs/definitions.yml b/inference-engine/tools/calibration_tool/configs/definitions.yml new file mode 100644 index 0000000..a14d660 --- /dev/null +++ b/inference-engine/tools/calibration_tool/configs/definitions.yml @@ -0,0 +1,202 @@ +launchers: + - framework: dlsdk + device: CPU + - framework: caffe + device: CPU + +datasets: + - name: classification_dataset + data_source: ImageNet/original + annotation: ImageNet/accuracy_checker_annotations/2012/full/imagenet.pickle + dataset_meta: ImageNet/accuracy_checker_annotations/2012/full/imagenet.json + preprocessing: + - type: resize + size: 256 + - type: crop + size: 224 + - type: normalization + mean: IMAGENET + metrics: + - name: accuracy @ top1 + type: accuracy + top_k: 1 + - name: accuracy @ top5 + type: accuracy + top_k: 5 + + - name: classification_dataset_1001classes + data_source: ImageNet/original + annotation: ImageNet/accuracy_checker_annotations/2012_1001classes/full/imagenet.pickle + dataset_meta: ImageNet/accuracy_checker_annotations/2012_1001classes/full/imagenet.json + preprocessing: + - type: bgr_to_rgb + - type: resize + size: 256 + - type: crop + size: 224 + - type: normalization + mean: 127.5 + std: 127.5 + metrics: + - name: accuracy @ top1 + type: accuracy + top_k: 1 + - name: accuracy @ top5 + type: accuracy + top_k: 5 + + - name: classification_dataset_2015 + data_source: ImageNet/original + annotation: ImageNet/accuracy_checker_annotations/2015/full/imagenet.pickle + dataset_meta: ImageNet/accuracy_checker_annotations/2015/full/imagenet.json + preprocessing: + - type: resize + size: 256 + - type: crop + size: 224 + - type: normalization + mean: 104, 117, 123 + metrics: + - name: accuracy @ top1 + type: accuracy + top_k: 1 + - name: accuracy @ top5 + type: accuracy + top_k: 5 + + - name: VOC2007 + data_source: VOC/VOCdevkit/VOC2007/JPEGImages + annotation: VOC/accuracy_checker_annotations/VOC2007/full/voc07.pickle + dataset_meta: VOC/accuracy_checker_annotations/VOC2007/full/voc07.json + preprocessing: + - type: resize + size: 300 + - type: normalization + mean: 104, 117, 123 + 
postprocessing: + - type: resize_prediction_boxes + metrics: + - type: map + integral: 11point + ignore_difficult: True + presenter: print_scalar + + - name: VOC2007_20classes + data_source: VOC/VOCdevkit/VOC2007/JPEGImages + annotation: VOC/accuracy_checker_annotations/VOC2007_20classes/full/voc07.pickle + dataset_meta: VOC/accuracy_checker_annotations/VOC2007_20classes/full/voc07.json + preprocessing: + - type: resize + size: 300 + - type: normalization + mean: 104, 117, 123 + postprocessing: + - type: resize_prediction_boxes + metrics: + - type: map + integral: 11point + ignore_difficult: True + presenter: print_scalar + + - name: VOC2007_Segmentation + data_source: VOC/VOCdevkit/VOC2007 + annotation: VOC/accuracy_checker_annotations/VOC2007/full/voc07_segmentation.pickle + dataset_meta: VOC/accuracy_checker_annotations/VOC2007/full/voc07_segmentation.json + postprocessing: + - type: resize_segmentation_mask + apply_to: prediction + - type: encode_segmentation_mask + metrics: + - type: segmentation_accuracy + - type: mean_iou + - type: mean_accuracy + - type: frequency_weighted_accuracy + + - name: VOC2012_Segmentation + data_source: VOC/VOCdevkit/VOC2012 + annotation: VOC/accuracy_checker_annotations/VOC2012/full/voc12_segmentation.pickle + dataset_meta: VOC/accuracy_checker_annotations/VOC2012/full/voc12_segmentation.json + postprocessing: + - type: resize_segmentation_mask + apply_to: prediction + - type: encode_segmentation_mask + metrics: + - type: segmentation_accuracy + - type: mean_iou + - type: mean_accuracy + - type: frequency_weighted_accuracy + + - name: COCO2014_80cl + data_source: COCO/2014/val2014 + annotation: COCO/accuracy_checker_annotations/2014/full/mscoco_detection_80cl.pickle + dataset_meta: COCO/accuracy_checker_annotations/2014/full/mscoco_detection_80cl.json + preprocessing: + - type: bgr_to_rgb + - type: resize + size: 300 + postprocessing: + - type: resize_prediction_boxes + metrics: + - type: map + integral: 11point + ignore_difficult: True + presenter: print_scalar + + - name: COCO2017_80cl + data_source: COCO/2017/val2017 + annotation: COCO/accuracy_checker_annotations/2017/full/mscoco_detection_80cl.pickle + dataset_meta: COCO/accuracy_checker_annotations/2017/full/mscoco_detection_80cl.json + preprocessing: + - type: bgr_to_rgb + - type: resize + size: 300 + postprocessing: + - type: resize_prediction_boxes + metrics: + - type: map + integral: 11point + ignore_difficult: True + presenter: print_scalar + + - name: COCO2017_80cl_bkgr + data_source: COCO/2017/val2017 + annotation: COCO/accuracy_checker_annotations/2017/full/mscoco_detection_80cl_bkgr.pickle + dataset_meta: COCO/accuracy_checker_annotations/2017/full/mscoco_detection_80cl_bkgr.json + preprocessing: + - type: bgr_to_rgb + - type: resize + size: 300 + postprocessing: + - type: resize_prediction_boxes + metrics: + - type: map + integral: 11point + ignore_difficult: True + presenter: print_scalar + + - name: COCO2017_90cl_bkgr + data_source: COCO/2017/val2017 + annotation: COCO/accuracy_checker_annotations/2017/full/mscoco_detection_90cl_bkgr.pickle + dataset_meta: COCO/accuracy_checker_annotations/2017/full/mscoco_detection_90cl_bkgr.json + preprocessing: + - type: bgr_to_rgb + - type: resize + size: 300 + postprocessing: + - type: resize_prediction_boxes + metrics: + - type: map + integral: 11point + ignore_difficult: True + presenter: print_scalar + + - name: lfw + data_source: LFW/lfw + annotation: LFW/accuracy_checker_annotations/full/lfw.pickle + preprocessing: + - type: point_alignment + 
size: 400 + - type: resize + size: 160 + metrics: + - type: pairwise_accuracy_subsets diff --git a/inference-engine/tools/calibration_tool/configs/inception_v1.yml b/inference-engine/tools/calibration_tool/configs/inception_v1.yml new file mode 100644 index 0000000..86c832c --- /dev/null +++ b/inference-engine/tools/calibration_tool/configs/inception_v1.yml @@ -0,0 +1,29 @@ +models: + - name: GoogleNet_v1 + + # list of launchers for your topology. + launchers: + # launcher framework (e.g. caffe, dlsdk) + - framework: dlsdk + # device for infer (e.g. for dlsdk cpu, gpu, hetero:cpu, gpu ...) + device: CPU + # topology IR (*.prototxt for caffe, *.xml for InferenceEngine, etc) + # path to topology is prefixed with directory, specified in "-m/--models" option + tf_model: inception_v1.pb + # launcher returns raw result, so it should be converted + # to an appropriate representation with adapter + adapter: classification + mo_params: + data_type: FP32 + input_shape: "(1, 224, 224, 3)" + + # metrics, preprocessing and postprocessing are typically dataset specific, so dataset field + # specifies data and all other steps required to validate topology + # there is typically definitions file, which contains options for common datasets and which is merged + # during evaluation, but since "sample_dataset" is not used anywhere else, this config contains full definition + datasets: + # uniquely distinguishable name for dataset + # note that all other steps are specific for this dataset only + # if you need to test topology on multiple datasets, you need to specify + # every step explicitly for each dataset + - name: classification_dataset_1001classes diff --git a/inference-engine/tools/calibration_tool/configs/ncf_config.yml b/inference-engine/tools/calibration_tool/configs/ncf_config.yml new file mode 100644 index 0000000..3ba3d1a --- /dev/null +++ b/inference-engine/tools/calibration_tool/configs/ncf_config.yml @@ -0,0 +1,56 @@ +models: + - name: NCF_example + + # list of launchers for your topology. + launchers: + # launcher framework (e.g. caffe, dlsdk) + - framework: dlsdk + # device for infer (e.g. for dlsdk cpu, gpu, hetero:cpu, gpu ...) 
+ device: CPU + cpu_extensions: libcpu_extension.so + # topology IR (*.prototxt for caffe, *.xml for InferenceEngine, etc) + # path to topology is prefixed with directory, specified in "-m/--models" option + model: graph_frozen.xml + # topology weights binary (*.caffemodel for caffe, *.bin for InferenceEngine) + weights: graph_frozen.bin + # launcher returns raw result, so it should be converted + # to an appropriate representation with adapter + adapter: hit_ratio_adapter + + inputs: + - type: INPUT + value: "u" + name: embedding/embedding_lookup/placeholder_port_1 + - type: INPUT + value: "i" + name: embedding_1/embedding_lookup/placeholder_port_1 + - type: INPUT + value: "u" + name: embedding_2/embedding_lookup/placeholder_port_1 + - type: INPUT + value: "i" + name: embedding_3/embedding_lookup/placeholder_port_1 + + # metrics, preprocessing and postprocessing are typically dataset specific, so dataset field + # specifies data and all other steps required to validate topology + # there is typically definitions file, which contains options for common datasets and which is merged + # during evaluation, but since "sample_dataset" is not used anywhere else, this config contains full definition + datasets: + # uniquely distinguishable name for dataset + # note that all other steps are specific for this dataset only + # if you need to test topology on multiple datasets, you need to specify + # every step explicitly for each dataset + - name: ncf_validation_dataset.npy + # directory where input images are searched. + # prefixed with directory specified in "-s/--source" option + # name of converted annotation file (specified in -a option during annotation conversion) + # prefixed with directory specified in "-a/--annotations" option + annotation: ncf_converter.pickle + dataset_meta: ncf_converter.json + + reader: ncf_data_reader + + # list of metrics, calculated on dataset + metrics: + - type: hit_ratio + - type: ndcg diff --git a/inference-engine/tools/calibration_tool/configs/ssd_mobilenet_v1_coco.yml b/inference-engine/tools/calibration_tool/configs/ssd_mobilenet_v1_coco.yml new file mode 100644 index 0000000..7786213 --- /dev/null +++ b/inference-engine/tools/calibration_tool/configs/ssd_mobilenet_v1_coco.yml @@ -0,0 +1,40 @@ +models: + - name: ssd_mobilenet_v1_coco + + # list of launchers for your topology. + launchers: + # launcher framework (e.g. caffe, dlsdk) + - framework: dlsdk + # device for infer (e.g. for dlsdk cpu, gpu, hetero:cpu, gpu ...) 
+ device: CPU + # topology IR (*.prototxt for caffe, *.xml for InferenceEngine, etc) + # path to topology is prefixed with directory, specified in "-m/--models" option + tf_model: ssd_mobilenet_v1_coco.pb + # launcher returns raw result, so it should be converted + # to an appropriate representation with adapter + adapter: ssd + cpu_extensions: AUTO + mo_params: + data_type: FP32 + tensorflow_use_custom_operations_config: ssd_v2_support.json + tensorflow_object_detection_api_pipeline_config: ssd_mobilenet_v1_coco.config + + # metrics, preprocessing and postprocessing are typically dataset specific, so dataset field + # specifies data and all other steps required to validate topology + # there is typically definitions file, which contains options for common datasets and which is merged + # during evaluation, but since "sample_dataset" is not used anywhere else, this config contains full definition + datasets: + # uniquely distinguishable name for dataset + # note that all other steps are specific for this dataset only + # if you need to test topology on multiple datasets, you need to specify + # every step explicitly for each dataset + - name: COCO2017_90cl_bkgr + + # list of metrics, calculated on dataset + metrics: + - type: map + integral: 11point + ignore_difficult: True + presenter: print_scalar + + - type: coco_precision diff --git a/inference-engine/tools/calibration_tool/configs/unet2d.yml b/inference-engine/tools/calibration_tool/configs/unet2d.yml new file mode 100644 index 0000000..49ed489 --- /dev/null +++ b/inference-engine/tools/calibration_tool/configs/unet2d.yml @@ -0,0 +1,54 @@ +models: + - name: UNet_2D + + # list of launchers for your topology. + launchers: + # launcher framework (e.g. caffe, dlsdk) + - framework: dlsdk + # device for infer (e.g. for dlsdk cpu, gpu, hetero:cpu, gpu ...) + device: CPU + # topology IR (*.prototxt for caffe, *.xml for InferenceEngine, etc) + # path to topology is prefixed with directory, specified in "-m/--models" option + model: model.ckpt.xml + # topology weights binary (*.caffemodel for caffe, *.bin for InferenceEngine) + weights: model.ckpt.bin + # launcher returns raw result, so it should be converted + # to an appropriate representation with adapter + adapter: brain_tumor_segmentation + cpu_extensions: AUTO + + # metrics, preprocessing and postprocessing are typically dataset specific, so dataset field + # specifies data and all other steps required to validate topology + # there is typically definitions file, which contains options for common datasets and which is merged + # during evaluation, but since "sample_dataset" is not used anywhere else, this config contains full definition + datasets: + # uniquely distinguishable name for dataset + # note that all other steps are specific for this dataset only + # if you need to test topology on multiple datasets, you need to specify + # every step explicitly for each dataset + - name: brats + data_source: Task01_BrainTumour + # directory where input images are searched. 
+        # directory where input images are searched,
+        # prefixed with the directory specified in the "-s/--source" option
+        # name of the converted annotation file (specified in the -a option during annotation conversion),
+        # prefixed with the directory specified in the "-a/--annotations" option
+        annotation: annotations/unet/calibration/brats.pickle
+
+        reader: nifti_reader
+        preprocessing:
+          - type: crop3d
+            size: 128
+          - type: normalize3d
+
+        postprocessing:
+          - type: crop_segmentation_mask
+            apply_to: annotation
+            size: 128
+          - type: clip_segmentation_mask
+            apply_to: annotation
+            max_value: 1
+
+        # list of metrics calculated on the dataset
+        metrics:
+          - type: dice
+            presenter: return_value
diff --git a/inference-engine/tools/collect_statistics_tool/README.md b/inference-engine/tools/collect_statistics_tool/README.md
new file mode 100644
index 0000000..e5a73ef
--- /dev/null
+++ b/inference-engine/tools/collect_statistics_tool/README.md
@@ -0,0 +1,7 @@
+# OpenVINO™ Collect Statistics Tool
+The Inference Engine Collect Statistics Tool collects statistics for a given model.
+
+Please refer to https://docs.openvinotoolkit.org for details.
+
+## Hardware requirements
+Hardware requirements depend on the model. Typically, public models require at least 16 GB of RAM, regardless of the operating system.
\ No newline at end of file
diff --git a/inference-engine/tools/collect_statistics_tool/collect_statistics.py b/inference-engine/tools/collect_statistics_tool/collect_statistics.py
new file mode 100644
index 0000000..95b8364
--- /dev/null
+++ b/inference-engine/tools/collect_statistics_tool/collect_statistics.py
@@ -0,0 +1,39 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+""" + + +import os +from openvino.tools.calibration import CalibratorConfiguration, CalibrationConfigurationHelper, CalibratorFactory, CommandLineProcessor +from openvino.tools.utils import Path + +def collect_statistics(): + with CommandLineProcessor.process() as configuration: + calibrator = CalibratorFactory.create(configuration.precision, CalibratorConfiguration(configuration)) + + print("Collecting FP32 statistics for {}...".format(configuration.model)) + fp32_result = calibrator.infer(add_outputs=True, collect_aggregated_statistics=True) + print("FP32 accuracy: {0:.4f}%".format(100.0 * fp32_result.metrics.accuracy)) + + output_model_file_path = Path.get_model(configuration.output_model, "_statistics") + output_weights_file_path = Path.get_weights(configuration.output_weights, "_statistics") + + quantization_levels = calibrator.get_quantization_levels(CalibrationConfigurationHelper.read_ignore_layer_names(configuration)) + statistics = fp32_result.aggregated_statistics.get_node_statistics() + calibrator.save(output_model_file_path, output_weights_file_path, quantization_levels, statistics) + print("Network with statistics was written to {}.(xml|bin) IR file".format(os.path.splitext(output_model_file_path)[0])) + +if __name__ == '__main__': + collect_statistics() diff --git a/model-optimizer/extensions/back/ConvolutionReshaper.py b/model-optimizer/extensions/back/ConvolutionReshaper.py index 155d1eb..9cbbb10 100644 --- a/model-optimizer/extensions/back/ConvolutionReshaper.py +++ b/model-optimizer/extensions/back/ConvolutionReshaper.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,10 +14,11 @@ limitations under the License. """ -import networkx as nx import numpy as np +from extensions.back.ReshapeMutation import ReshapeMutation from mo.back.replacement import BackReplacementPattern +from mo.graph.graph import Graph from mo.ops.reshape import Reshape @@ -30,6 +31,9 @@ class ConvolutionReshaper(BackReplacementPattern): """ enabled = True + def run_before(self): + return [ReshapeMutation] + @staticmethod def pattern(): return dict( @@ -39,7 +43,7 @@ class ConvolutionReshaper(BackReplacementPattern): edges=[] ) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): conv = match['conv'] assert len(conv.out_nodes()) == 1, "Convolution operation {} should have 1 output data node".format(conv.id) diff --git a/model-optimizer/extensions/back/CreateConstNodes.py b/model-optimizer/extensions/back/CreateConstNodes.py new file mode 100644 index 0000000..8dce9e7 --- /dev/null +++ b/model-optimizer/extensions/back/CreateConstNodes.py @@ -0,0 +1,84 @@ +""" + Copyright (c) 2018-2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +from mo.back.replacement import BackReplacementPattern +from mo.front.extractor import update_ie_fields +from mo.graph.graph import * + + +class CreateConstNodesReplacement(BackReplacementPattern): + enabled = False + + @staticmethod + def pattern(): + return dict( + nodes=[ + ('data', dict(kind='data')) + ], + edges=[] + ) + + @staticmethod + def _check_bin_attrs(node): + """Check that at least one output edge from node without 'bin' attribute.""" + out_edges = node.out_edges() + bin_in_out_ports = ['bin' in edge for edge in out_edges] + out_node = [node.has('op') and node.op == 'OpOutput' for node in node.out_nodes()] + return np.any(out_node) or not np.all(bin_in_out_ports) + + @staticmethod + def _check_that_node_from_body(node): + """Check that all output edges from node have 'internal_port_id' + (that shows that this node is from TI body)""" + n_ports = len(node.out_edges()) + internal_port_in_out_ports = ['internal_port_id' in edge for edge in node.out_edges()] + return np.all(internal_port_in_out_ports) and n_ports + + def replace_pattern(self, graph: Graph, match: dict): + """ + Adds layers with type 'Const' that produce blob from 'bin' file. The pass finds data nodes with one output which + doesn't have edge with 'bin' attribute (or with two outputs and at least one output havent 'bin' attr) + and generate Const op node before the node and data node before the Const node. The data node before 'Const' + node is needed because the op node dumps input tensors to bin file. + """ + node = match['data'] + if len(node.in_nodes()) > 0: + return + + if self._check_bin_attrs(node): + if node.has_valid('value'): + const_node_name = graph.unique_id(node.id + '_const') + log.debug("Added Const node '{}'".format(const_node_name)) + graph.add_node(const_node_name, name=const_node_name, type='Const', kind='op', op='Const', + precision="FP32") + update_ie_fields(node.graph.node[const_node_name]) + graph.add_edges_from([(const_node_name, node.id, {'out': 0})]) + + copy_data_node_name = graph.unique_id(node.id + '_copy_') + graph.add_node(copy_data_node_name, kind='data', precision="FP32", shape=np.array(node.shape), + value=np.array(node.value)) + + if node.has_valid('force_precision'): + Node(graph, copy_data_node_name)['force_precision'] = node.force_precision + Node(graph, const_node_name)['force_precision'] = node.force_precision + graph.add_edges_from([(copy_data_node_name, const_node_name, {'in': 0, 'bin': 'custom'})]) + elif not self._check_that_node_from_body(node): + log.debug('node = {}'.format(node.graph.node[node.id])) + raise Error( + 'Discovered data node without inputs and value, node.name = {}, consumer.name = {}. ' + + refer_to_faq_msg(23), + node.soft_get('name'), + node.out_node().soft_get('name') if len(node.out_nodes()) else "" + ) diff --git a/model-optimizer/extensions/back/CreateConstNodes_test.py b/model-optimizer/extensions/back/CreateConstNodes_test.py new file mode 100644 index 0000000..a0a0aec --- /dev/null +++ b/model-optimizer/extensions/back/CreateConstNodes_test.py @@ -0,0 +1,138 @@ +""" + Copyright (c) 2018-2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import unittest +import numpy as np +from extensions.back.CreateConstNodes import CreateConstNodesReplacement +from mo.utils.unittest.graph import build_graph_with_attrs, compare_graphs + + +class CreateConstNodesReplacementTest(unittest.TestCase): + nodes = [ + ('data_node', {'kind': 'data', 'shape': None, 'value': None}), + ('next_node', {'kind': 'op'}), + ] + edges = [ + ('data_node', 'next_node') + ] + + new_nodes = [ + ('const', {'kind': 'op', 'op': 'Const'}), + ('const_data', {'kind': 'data'}) + ] + new_edges = [ + ('const', 'data_node'), + ('const_data', 'const') + ] + + def test_one_node(self): + """We should add Const node and data node.""" + shape = np.array([2, 3, 4]) + data = np.zeros(shape) + graph = build_graph_with_attrs( + nodes_with_attrs=self.nodes, + edges_with_attrs=self.edges, + update_nodes_attributes=[('data_node', {'shape': shape, 'value': data})] + ) + graph_ref = build_graph_with_attrs( + nodes_with_attrs=self.nodes + self.new_nodes, + edges_with_attrs=self.edges + self.new_edges, + update_nodes_attributes=[('data_node', {'shape': shape, 'value': data}), + ('const_data', {'shape': shape, 'value': data})] + ) + tested_pattern = CreateConstNodesReplacement() + tested_pattern.find_and_replace_pattern(graph) + (flag, resp) = compare_graphs(graph, graph_ref, last_node='next_node') + self.assertTrue(flag, resp) + + def test_one_bin_node(self): + """Nothing should happen.""" + shape = np.array([2, 3, 4]) + data = np.zeros(shape) + graph = build_graph_with_attrs( + nodes_with_attrs=self.nodes, + edges_with_attrs=self.edges, + update_nodes_attributes=[('data_node', {'shape': shape, 'value': data})], + update_edge_attrs={('data_node', 'next_node', 0): {'bin': 0}}, + ) + tested_pattern = CreateConstNodesReplacement() + tested_pattern.find_and_replace_pattern(graph) + (flag, resp) = compare_graphs(graph, graph, last_node='next_node') + self.assertTrue(flag, resp) + + def test_force_precision_parameter(self): + precision = 'FP16' + shape = np.array([2, 3, 4]) + data = np.zeros(shape) + graph = build_graph_with_attrs( + nodes_with_attrs=self.nodes, + edges_with_attrs=self.edges, + update_nodes_attributes=[('data_node', {'shape': shape, 'value': data, 'force_precision': precision})] + ) + graph_ref = build_graph_with_attrs( + nodes_with_attrs=self.nodes + self.new_nodes, + edges_with_attrs=self.edges + self.new_edges, + update_nodes_attributes=[('data_node', {'shape': shape, 'value': data}), + ('const_data', {'shape': shape, 'value': data, 'force_precision': precision}), + ('const', {'force_precision': precision})] + ) + tested_pattern = CreateConstNodesReplacement() + tested_pattern.find_and_replace_pattern(graph) + (flag, resp) = compare_graphs(graph, graph_ref, last_node='next_node') + self.assertTrue(flag, resp) + + #check that force precision was added to data and Const nodes + force_precision_const_node = graph.nodes['data_node_const']['force_precision'] + force_precision_new_data = graph.nodes['data_node_copy_']['force_precision'] + self.assertEqual(force_precision_const_node, precision) + self.assertEqual(force_precision_new_data, precision) + + def 
test_two_nodes_with_bin(self):
+        """Test case for a data node with two consumers, both with the 'bin' edge attribute.
+        Nothing should happen."""
+        shape = np.array([2, 3, 4])
+        data = np.zeros(shape)
+        graph = build_graph_with_attrs(
+            nodes_with_attrs=self.nodes + [('next_node_2', {'kind': 'op'})],
+            edges_with_attrs=self.edges + [('data_node', 'next_node_2')],
+            update_nodes_attributes=[('data_node', {'shape': shape, 'value': data})],
+            update_edge_attrs={('data_node', 'next_node', 0): {'bin': 0}, ('data_node', 'next_node_2', 0): {'bin': 0}},
+        )
+        tested_pattern = CreateConstNodesReplacement()
+        tested_pattern.find_and_replace_pattern(graph)
+        (flag, resp) = compare_graphs(graph, graph, last_node='next_node')
+        self.assertTrue(flag, resp)
+
+    def test_two_nodes_one_bin(self):
+        """Test case for two output nodes, one with the 'bin' parameter, the other without."""
+        shape = np.array([2, 3, 4])
+        data = np.zeros(shape)
+        graph = build_graph_with_attrs(
+            nodes_with_attrs=self.nodes + [('next_node_2', {'kind': 'op'})],
+            edges_with_attrs=self.edges + [('data_node', 'next_node_2')],
+            update_nodes_attributes=[('data_node', {'shape': shape, 'value': data})],
+            update_edge_attrs={('data_node', 'next_node', 0): {'bin': 0}},
+        )
+        graph_ref = build_graph_with_attrs(
+            nodes_with_attrs=self.nodes + self.new_nodes + [('next_node_2', {'kind': 'op'})],
+            edges_with_attrs=self.edges + self.new_edges + [('data_node', 'next_node_2')],
+            update_nodes_attributes=[('data_node', {'shape': shape, 'value': data}),
+                                     ('const_data', {'shape': shape, 'value': data})]
+        )
+        tested_pattern = CreateConstNodesReplacement()
+        tested_pattern.find_and_replace_pattern(graph)
+        (flag, resp) = compare_graphs(graph, graph_ref, last_node='next_node')
+        self.assertTrue(flag, resp)
+
diff --git a/model-optimizer/extensions/back/DumpFakeQuantStat.py b/model-optimizer/extensions/back/DumpFakeQuantStat.py
new file mode 100644
index 0000000..b161ceb
--- /dev/null
+++ b/model-optimizer/extensions/back/DumpFakeQuantStat.py
@@ -0,0 +1,57 @@
+"""
+    Copyright (c) 2018-2019 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+""" + +import networkx as nx +import numpy as np + +from mo.back.replacement import BackReplacementPattern +from mo.graph.graph import Node +from mo.middle.passes.eliminate import remove_op_nodes +from mo.utils.graph import pseudo_topological_sort + + +class DumpFakeQuantStat(BackReplacementPattern): + enabled = True + + def find_and_replace_pattern(self, graph: nx.MultiDiGraph): + intervals = {} + for n in pseudo_topological_sort(graph): + node = Node(graph, n) + if not node.has('op') or (node.op != 'FakeQuantWithMinMaxVars' and node.op != 'Quantize'): + continue + if node.op == 'Quantize': + # check if input range matches output range + low_match = np.all(node.in_node(1).value == node.in_node(3).value) + high_match = np.all(node.in_node(2).value == node.in_node(4).value) + if not low_match or not high_match: + continue + + prev_node = node.in_node().in_node() + prev_node_id = prev_node.id + prev_node_out_shape = prev_node.out_node()['shape'] + C = prev_node_out_shape[1] + assert node.in_node(1).value.size == 1 + assert node.in_node(2).value.size == 1 + min = ', '.join([str(node.in_node(1).value.flatten()[0])] * C) + max = ', '.join([str(node.in_node(2).value.flatten()[0])] * C) + intervals[prev_node_id] = {'min': min, 'max': max} + if intervals: + if 'statistics' not in graph.graph: + graph.graph['statistics'] = intervals + else: + graph.graph['statistics'].update(intervals) + remove_op_nodes(graph, {'op': 'FakeQuantWithMinMaxVars'}) + remove_op_nodes(graph, {'op': 'Quantize'}) diff --git a/model-optimizer/extensions/back/EltwiseBroadcast.py b/model-optimizer/extensions/back/EltwiseBroadcast.py index a75974a..fce51d6 100644 --- a/model-optimizer/extensions/back/EltwiseBroadcast.py +++ b/model-optimizer/extensions/back/EltwiseBroadcast.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,7 +20,7 @@ import networkx as nx import numpy as np from mo.back.replacement import BackReplacementPattern -from mo.graph.graph import unique_id, Node +from mo.graph.graph import Node, Graph from mo.ops.tile import Tile @@ -36,7 +36,7 @@ class EltwiseBroadcast(BackReplacementPattern): ) @staticmethod - def replace_pattern(graph: nx.MultiDiGraph, match: dict): + def replace_pattern(graph: Graph, match: dict): node = match['op'] shapes = [in_node.shape for _, in_node in node.in_nodes().items()] out_shape = node.out_node().shape @@ -69,7 +69,7 @@ class EltwiseBroadcast(BackReplacementPattern): if shapes[input_idx][i] == 1 and out_shape[i] > 1: new_op = tile.create_node([input], dict(axis=i, tiles=out_shape[i])) # add a data node following a new operation node - data_id = unique_id(graph, node.name) + data_id = graph.unique_id(node.name) graph.add_node(data_id, kind='data', shape=None, value=None) new_data = Node(graph, data_id) graph.add_edge(new_op.id, new_data.id, **{'out': 0}) diff --git a/model-optimizer/extensions/back/EnableConstantStridedSlice.py b/model-optimizer/extensions/back/EnableConstantStridedSlice.py new file mode 100644 index 0000000..2090d2d --- /dev/null +++ b/model-optimizer/extensions/back/EnableConstantStridedSlice.py @@ -0,0 +1,36 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +from mo.back.replacement import BackReplacementPattern +from mo.graph.graph import Graph + + +class EnableConstantStridedSlice(BackReplacementPattern): + enabled = True + graph_condition = [lambda graph: graph.graph['cmd_params'].keep_shape_ops] + + @staticmethod + def pattern(): + return dict( + nodes=[('const_strided_slice', {'op': 'StridedSlice', 'type': lambda type: type != 'StridedSlice'}), + ('data', {'kind': 'data', 'value': lambda value: value is not None}) + ], + edges=[('const_strided_slice', 'data')], + ) + + @staticmethod + def replace_pattern(graph: Graph, match: dict): + graph.node[match['const_strided_slice'].id]['type'] = 'StridedSlice' diff --git a/model-optimizer/extensions/back/PackBinaryWeights.py b/model-optimizer/extensions/back/PackBinaryWeights.py new file mode 100644 index 0000000..c1b8f63 --- /dev/null +++ b/model-optimizer/extensions/back/PackBinaryWeights.py @@ -0,0 +1,58 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +import logging as log + +import networkx as nx +import numpy as np + +from mo.back.replacement import BackReplacementPattern +from mo.graph.graph import Node, Graph +from mo.ops.tile import Tile + + +class PackBinaryWeights(BackReplacementPattern): + enabled = True + + @staticmethod + def pattern(): + return dict( + nodes=[ + ('op', dict(kind='op', type='BinaryConvolution'))], + edges=[] + ) + + @staticmethod + def replace_pattern(graph: Graph, match: dict): + conv = match['op'] + assert len(conv.in_nodes()) == 2 + weights = conv.in_port(1).data.get_value().flatten() + weights_rounded = np.round(weights) + assert np.all(np.isclose(weights, weights_rounded)) + assert len(conv.in_node(1).out_nodes()) == 1 + weights_rounded = np.array(weights_rounded, dtype=np.int32) + 1 # -1 --> 0 + # Reversing element in chunks by 8 elements to pack bits correctly + # First need to pad data with necessary number of element to make the length dividable by 8 + pad = (-len(weights_rounded))%8 + weights_rounded = np.array(np.concatenate((weights_rounded, np.zeros([pad]))), dtype=np.int32) + assert len(weights_rounded) % 8 == 0 + weights_rounded = weights_rounded.reshape([len(weights_rounded)//8, 8]) + weights_rounded = np.flip(weights_rounded, axis=1) + weights_rounded = weights_rounded.flatten() + packed = np.packbits(weights_rounded) + conv.in_port(1).data.set_value(packed) + conv.in_node(1)['force_precision'] = 'uint8' + conv['packed_weights'] = 1 \ No newline at end of file diff --git a/model-optimizer/extensions/back/PermuteForReshape.py b/model-optimizer/extensions/back/PermuteForReshape.py index f0f14c4..015ddde 100644 --- a/model-optimizer/extensions/back/PermuteForReshape.py +++ b/model-optimizer/extensions/back/PermuteForReshape.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ from extensions.back.ConvolutionReshaper import ConvolutionReshaper from extensions.back.TileReshaper import TileReshaper from mo.back.replacement import BackReplacementPattern from mo.front.common.layout import get_width_dim, get_height_dim, get_features_dim, indices_mapping +from mo.graph.graph import Graph from mo.ops.op import PermuteAttrs from mo.ops.permute import Permute @@ -46,7 +47,7 @@ class PermuteForReshape(BackReplacementPattern): ) @staticmethod - def replace_pattern(graph: nx.MultiDiGraph, match: dict): + def replace_pattern(graph: Graph, match: dict): reshape = match['reshape'] assert len(reshape.in_nodes()) > 0 if graph.graph['layout'] == 'NCHW' or reshape.has_and_set('nchw_layout') or\ diff --git a/model-optimizer/extensions/back/PermuteForReshape_test.py b/model-optimizer/extensions/back/PermuteForReshape_test.py index 6efc482..dc33d37 100644 --- a/model-optimizer/extensions/back/PermuteForReshape_test.py +++ b/model-optimizer/extensions/back/PermuteForReshape_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/back/PermuteForReshape.py b/model-optimizer/extensions/back/PermuteForReshape.py
index f0f14c4..015ddde 100644
--- a/model-optimizer/extensions/back/PermuteForReshape.py
+++ b/model-optimizer/extensions/back/PermuteForReshape.py
@@ -1,5 +1,5 @@
 """
-    Copyright (c) 2018 Intel Corporation
+    Copyright (c) 2018-2019 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@ from extensions.back.ConvolutionReshaper import ConvolutionReshaper
 from extensions.back.TileReshaper import TileReshaper
 from mo.back.replacement import BackReplacementPattern
 from mo.front.common.layout import get_width_dim, get_height_dim, get_features_dim, indices_mapping
+from mo.graph.graph import Graph
 from mo.ops.op import PermuteAttrs
 from mo.ops.permute import Permute
 
@@ -46,7 +47,7 @@ class PermuteForReshape(BackReplacementPattern):
         )
 
     @staticmethod
-    def replace_pattern(graph: nx.MultiDiGraph, match: dict):
+    def replace_pattern(graph: Graph, match: dict):
         reshape = match['reshape']
         assert len(reshape.in_nodes()) > 0
         if graph.graph['layout'] == 'NCHW' or reshape.has_and_set('nchw_layout') or\
diff --git a/model-optimizer/extensions/back/PermuteForReshape_test.py b/model-optimizer/extensions/back/PermuteForReshape_test.py
index 6efc482..dc33d37 100644
--- a/model-optimizer/extensions/back/PermuteForReshape_test.py
+++ b/model-optimizer/extensions/back/PermuteForReshape_test.py
@@ -1,5 +1,5 @@
 """
-    Copyright (c) 2018 Intel Corporation
+    Copyright (c) 2018-2019 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
diff --git a/model-optimizer/extensions/back/RNNSequenceTypeRename.py b/model-optimizer/extensions/back/RNNSequenceTypeRename.py
new file mode 100644
index 0000000..dda3599
--- /dev/null
+++ b/model-optimizer/extensions/back/RNNSequenceTypeRename.py
@@ -0,0 +1,40 @@
+"""
+    Copyright (c) 2019 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+"""
+from mo.back.replacement import BackReplacementPattern
+from mo.graph.graph import Graph
+
+
+class RNNSequence(BackReplacementPattern):
+    """
+    This transform changes the RNNSequence type (the internal MO type for all recurrent layers)
+    to the correct operation name.
+    """
+    enabled = True
+
+    def pattern(self):
+        return dict(
+            nodes=[
+                ('rnn_layer', {'type': 'RNNSequence'})
+            ],
+            edges=[]
+        )
+
+    _supported_ops = ['RNN', 'LSTM', 'GRU']
+
+    def replace_pattern(self, graph: Graph, match: dict):
+        rnn_layer = match['rnn_layer']
+        assert rnn_layer['op'] in self._supported_ops
+        rnn_layer['type'] = rnn_layer['op'] + 'Sequence'
diff --git a/model-optimizer/extensions/back/ReshapeMutation.py b/model-optimizer/extensions/back/ReshapeMutation.py
new file mode 100644
index 0000000..e8365ab
--- /dev/null
+++ b/model-optimizer/extensions/back/ReshapeMutation.py
@@ -0,0 +1,89 @@
+"""
+    Copyright (c) 2019 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+""" +import numpy as np + +from mo.back.replacement import BackReplacementPattern +from mo.graph.graph import Graph, Node +from mo.middle.pattern_match import for_each_sub_graph_recursively + + +class ReshapeMutation(BackReplacementPattern): + enabled = True + force_clean_up = True + + @staticmethod + def pattern(): + return dict( + nodes=[('reshape', {'kind': 'op', 'type': 'Reshape'})], + edges=[], + ) + + @staticmethod + def replace_pattern(graph: Graph, match: dict): + reshape = match['reshape'] + if hasattr(reshape, 'dim') and reshape.dim is not None: + reshape_inputs = reshape.in_nodes() + value = np.array(reshape.dim) + shape = np.array(value.shape) + del reshape.graph.node[reshape.id]['dim'] + + if 1 in reshape_inputs: + reshape_inputs[1].value = value + reshape_inputs[1].shape = shape + else: + const_id = graph.unique_id(reshape.id + '/DimData') + graph.add_node(const_id, + **{'kind': 'data', 'value': value, 'shape': shape, 'name': reshape.id + '/DimData'}) + graph.add_edge(const_id, reshape.id, **{'in': 1}) + + +class DisableReshapeMutationInTensorIterator(BackReplacementPattern): + enabled = True + force_clean_up = True + + def run_after(self): + return [ReshapeMutation] + + @staticmethod + def add_supported_attrs_to_node(node: Node, params: list): + node.graph.node[node.id].update({ + 'IE': [( + 'layer', + [('id', lambda node: node.node), 'name', 'precision', 'type'], + [ + ('data', params, []), + '@ports', + '@consts'])] + }) + + def reshapes_with_two_inputs_to_reshape_with_dim(self, graph: Graph): + reshapes = graph.get_op_nodes(op='Reshape') + + for reshape in reshapes: + in_nodes = reshape.in_nodes() + + if len(in_nodes) == 1: + continue + assert len(in_nodes) == 2, "Reshape operation should have 2 inputs or 1 input and `dim` attribute" + + reshape['dim'] = reshape.in_port(1).get_connection().data.get_value() + reshape.in_port(1).disconnect() + + params = [('dim', lambda node: ','.join(map(str, node['dim'])))] + self.add_supported_attrs_to_node(reshape, params) + + def find_and_replace_pattern(self, graph: Graph): + for_each_sub_graph_recursively(graph, self.reshapes_with_two_inputs_to_reshape_with_dim) diff --git a/model-optimizer/extensions/back/ShufflenetReLUReorder.py b/model-optimizer/extensions/back/ShufflenetReLUReorder.py index 234c78c..2323d64 100644 --- a/model-optimizer/extensions/back/ShufflenetReLUReorder.py +++ b/model-optimizer/extensions/back/ShufflenetReLUReorder.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,10 +14,10 @@ limitations under the License. 
""" -import networkx as nx import numpy as np from mo.back.replacement import BackReplacementPattern +from mo.graph.graph import Graph class ShufflenetReLUReorder(BackReplacementPattern): @@ -50,7 +50,7 @@ class ShufflenetReLUReorder(BackReplacementPattern): ] ) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): relu = match['relu'] reshape1 = match['reshape1'] reshape2_data = match['reshape2_data'] diff --git a/model-optimizer/extensions/back/ShufflenetReLUReorder_test.py b/model-optimizer/extensions/back/ShufflenetReLUReorder_test.py index 27c0f34..fd15e9b 100644 --- a/model-optimizer/extensions/back/ShufflenetReLUReorder_test.py +++ b/model-optimizer/extensions/back/ShufflenetReLUReorder_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/back/TileReshaper.py b/model-optimizer/extensions/back/TileReshaper.py index 7c6e2d6..f1123c8 100644 --- a/model-optimizer/extensions/back/TileReshaper.py +++ b/model-optimizer/extensions/back/TileReshaper.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,11 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. """ -import networkx as nx + import numpy as np from extensions.back.EltwiseBroadcast import EltwiseBroadcast from mo.back.replacement import BackReplacementPattern +from mo.graph.graph import Graph from mo.ops.reshape import Reshape @@ -37,7 +38,7 @@ class TileReshaper(BackReplacementPattern): ) @staticmethod - def replace_pattern(graph: nx.MultiDiGraph, match: dict): + def replace_pattern(graph: Graph, match: dict): """ Workarounds not supported type of Tile in Inference Engine (Tiles are supported for 2-D or 4-D tensors): Searches for Tiles with 3D shapes and covers it with Reshapes. diff --git a/model-optimizer/extensions/back/TileReshaper_test.py b/model-optimizer/extensions/back/TileReshaper_test.py index 5c43219..2fac84f 100644 --- a/model-optimizer/extensions/back/TileReshaper_test.py +++ b/model-optimizer/extensions/back/TileReshaper_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/back/disable_unsupported_ND_operations.py b/model-optimizer/extensions/back/disable_unsupported_ND_operations.py index f657bc7..2b62830 100644 --- a/model-optimizer/extensions/back/disable_unsupported_ND_operations.py +++ b/model-optimizer/extensions/back/disable_unsupported_ND_operations.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -17,7 +17,7 @@ import networkx as nx from mo.back.replacement import BackReplacementPattern -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.utils.error import Error @@ -29,7 +29,7 @@ class DisableUnsupportedNDOperations(BackReplacementPattern): unsupported_operations = ['Convolution', 'Deconvolution', 'Pooling'] - def find_and_replace_pattern(self, graph: nx.MultiDiGraph): + def find_and_replace_pattern(self, graph: Graph): unsupported_nodes = [] for node in graph.nodes(): node = Node(graph, node) diff --git a/model-optimizer/extensions/back/insert_compatibility_l2normalization.py b/model-optimizer/extensions/back/insert_compatibility_l2normalization.py index 4f4dfe9..994b5af 100644 --- a/model-optimizer/extensions/back/insert_compatibility_l2normalization.py +++ b/model-optimizer/extensions/back/insert_compatibility_l2normalization.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ import numpy as np import networkx as nx from mo.ops.op import Op -from mo.graph.graph import create_edge +from mo.graph.graph import Graph from mo.back.replacement import BackReplacementPattern @@ -32,7 +32,7 @@ class CompatibilityL2NormalizationPattern(BackReplacementPattern): ], edges=[]) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): """ Adds Normalize layer weights, which are required by Inference Engine, but do not always exist in MXNet model. @@ -42,7 +42,7 @@ class CompatibilityL2NormalizationPattern(BackReplacementPattern): Parameters ---------- - graph : nx.MultiDiGraph + graph : Graph Graph with loaded model. match : dict Patterns which were found in graph structure. @@ -51,4 +51,4 @@ class CompatibilityL2NormalizationPattern(BackReplacementPattern): if len(l2_normalization_node.in_nodes()) < 2: value = np.full([l2_normalization_node.in_node(0).shape[1]], 1.0, dtype=np.float32) weights_node = Op.create_input_data_node(graph, name=l2_normalization_node['name'] + '_weights', value=value) - create_edge(weights_node, l2_normalization_node, out_port=0, in_port=1, edge_attrs={'bin': 'weights'}) + graph.create_edge(weights_node, l2_normalization_node, out_port=0, in_port=1, edge_attrs={'bin': 'weights'}) diff --git a/model-optimizer/extensions/back/insert_compatibility_l2normalization_test.py b/model-optimizer/extensions/back/insert_compatibility_l2normalization_test.py index a1296ac..2179339 100644 --- a/model-optimizer/extensions/back/insert_compatibility_l2normalization_test.py +++ b/model-optimizer/extensions/back/insert_compatibility_l2normalization_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/back/kaldi_remove_memory_output.py b/model-optimizer/extensions/back/kaldi_remove_memory_output.py index 72e4cb4..3892635 100644 --- a/model-optimizer/extensions/back/kaldi_remove_memory_output.py +++ b/model-optimizer/extensions/back/kaldi_remove_memory_output.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,9 +14,8 @@ limitations under the License. """ -import networkx as nx - from mo.back.replacement import BackReplacementPattern +from mo.graph.graph import Graph class KaldiRemoveMemoryOutputBackReplacementPattern(BackReplacementPattern): @@ -26,16 +25,18 @@ class KaldiRemoveMemoryOutputBackReplacementPattern(BackReplacementPattern): def pattern(): return dict( nodes=[ - ('memory_node', dict(kind='op', op='Memory')), - ('data_node', dict(kind='data')) + ('memory_node', dict(op='Memory')), + ('data_node', dict(kind='data')), + ('op_output', dict(op='OpOutput')) ], edges=[ - ('memory_node', 'data_node', {'out': 0}) + ('memory_node', 'data_node'), + ('data_node', 'op_output') ] ) @staticmethod - def replace_pattern(graph: nx.MultiDiGraph, match: dict): + def replace_pattern(graph: Graph, match: dict): """ Need to find the pattern: Memory -> Data -> OpOutput @@ -47,7 +48,7 @@ class KaldiRemoveMemoryOutputBackReplacementPattern(BackReplacementPattern): Parameters ---------- - graph : nx.MultiDiGraph + graph : Graph Graph with loaded model. match : dict Patterns which were found in graph structure. @@ -55,8 +56,5 @@ class KaldiRemoveMemoryOutputBackReplacementPattern(BackReplacementPattern): memory = match['memory_node'] data = match['data_node'] - # Those Memory nodes that are not output ones, should not be replaced - if not data.has_and_set('is_output'): - return graph.remove_edge(memory.id, data.id) graph.remove_node(data.id) diff --git a/model-optimizer/extensions/back/kaldi_remove_memory_output_test.py b/model-optimizer/extensions/back/kaldi_remove_memory_output_test.py index c72351c..12269c6 100644 --- a/model-optimizer/extensions/back/kaldi_remove_memory_output_test.py +++ b/model-optimizer/extensions/back/kaldi_remove_memory_output_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -31,21 +31,28 @@ class KaldiRemoveMemoryOutputTest(unittest.TestCase): }, 'output_node': { 'kind': 'data' + }, + 'op_output': { + 'kind': 'data', + 'op': 'OpOutput', } } def test_remove_out_data_for_memory(self): - graph = build_graph(self.nodes, [('input_node', 'memory_node')]) - # Need for matching in pattern. 
The edge memory_node->out_node must contain only the attribute 'out' = 0 - # build_graph creates edge memory_node->out_node with attributes 'in' and 'out' - graph.add_node('output_node', is_output=True, **self.nodes['output_node']) - graph.add_edge('memory_node', 'output_node', out=0) + graph = build_graph(self.nodes, + [ + ('input_node', 'memory_node'), + ('memory_node', 'output_node'), + ('output_node', 'op_output') + ]) KaldiRemoveMemoryOutputBackReplacementPattern().find_and_replace_pattern(graph) self.assertNotIn('output_node', graph.node) def test_do_not_remove_out_data_for_memory(self): - graph = build_graph(self.nodes, [('input_node', 'memory_node')]) - graph.add_node('output_node', **self.nodes['output_node']) - graph.add_edge('memory_node', 'output_node', out=0) + graph = build_graph(self.nodes, + [ + ('input_node', 'memory_node'), + ('memory_node', 'output_node'), + ]) KaldiRemoveMemoryOutputBackReplacementPattern().find_and_replace_pattern(graph) self.assertIn('output_node', graph.node) diff --git a/model-optimizer/extensions/back/remove_last_softmax_pattern.py b/model-optimizer/extensions/back/remove_last_softmax_pattern.py index 488e161..243274c 100644 --- a/model-optimizer/extensions/back/remove_last_softmax_pattern.py +++ b/model-optimizer/extensions/back/remove_last_softmax_pattern.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ import networkx as nx from mo.back.replacement import BackReplacementPattern +from mo.graph.graph import Graph from mo.middle.passes.eliminate import remove_op_node_with_data_node @@ -27,27 +28,22 @@ class RemoveLastSoftMaxPattern(BackReplacementPattern): def pattern(): return dict( nodes=[ - ('softmax_node', dict(kind='op', op='SoftMax')) + ('softmax_node', dict(op='SoftMax')), + ('softmax_data', dict(kind='data')), + ('op_output', dict(op='OpOutput')) ], - edges=[] + edges=[ + ('softmax_node', 'softmax_data'), + ('softmax_data', 'op_output') + ] ) @staticmethod - def replace_pattern(graph: nx.MultiDiGraph, match: dict): + def replace_pattern(graph: Graph, match: dict): """ - Need to find the pattern: Parent (any type) -> SoftMAx -> OpOutput - - It is needed to remove output SoftMAx layer - - Parameters - ---------- - graph : nx.MultiDiGraph - Graph with loaded model. - match : dict - Patterns which were found in graph structure. + Removes output SoftMax layer + :param graph: graph to operate on + :param match: dictionary with matched nodes """ - softmax = match['softmax_node'] - child = softmax.out_node() - if not child.has_and_set('is_output'): - return - remove_op_node_with_data_node(graph, softmax) + if len(match['softmax_data'].out_nodes()) == 1: + remove_op_node_with_data_node(graph, match['softmax_node']) diff --git a/model-optimizer/extensions/back/remove_last_softmax_test.py b/model-optimizer/extensions/back/remove_last_softmax_test.py index 29a0173..dd73f13 100644 --- a/model-optimizer/extensions/back/remove_last_softmax_test.py +++ b/model-optimizer/extensions/back/remove_last_softmax_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -31,14 +31,19 @@ class KaldiRemoveLastSoftMaxTest(unittest.TestCase): }, 'output_node': { 'kind': 'data' + }, + 'op_output': { + 'kind': 'op', + 'op': 'OpOutput' } } def test_remove_last_SoftMax(self): graph = build_graph(self.nodes, [ ('input_node', 'softmax_node'), - ('softmax_node', 'output_node') - ], {'output_node': {'is_output': True}}) + ('softmax_node', 'output_node'), + ('output_node', 'op_output') + ]) RemoveLastSoftMaxPattern().find_and_replace_pattern(graph) self.assertNotIn('softmax_node', graph.node) diff --git a/model-optimizer/extensions/front/LRNReplacer.py b/model-optimizer/extensions/front/LRNReplacer.py index 111b598..b844a87 100644 --- a/model-optimizer/extensions/front/LRNReplacer.py +++ b/model-optimizer/extensions/front/LRNReplacer.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ import numpy as np import networkx as nx from mo.front.common.replacement import FrontReplacementOp +from mo.graph.graph import Graph from mo.ops.lin_op import Mul from mo.ops.const import Const @@ -26,7 +27,7 @@ class LRNReplacer(FrontReplacementOp): op = 'LRN' enabled = True - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): node = match['op'] if not node.has_valid('bias') or (node.has_valid('bias') and node.bias == 1): diff --git a/model-optimizer/extensions/front/Pack.py b/model-optimizer/extensions/front/Pack.py index a7defba..160539e 100644 --- a/model-optimizer/extensions/front/Pack.py +++ b/model-optimizer/extensions/front/Pack.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,11 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. """ - -import networkx as nx - from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.concat import Concat from mo.ops.const import Const from mo.ops.expand_dims import ExpandDims @@ -27,14 +24,16 @@ class Pack(FrontReplacementOp): op = "Pack" enabled = True - def replace_op(self, graph: nx.MultiDiGraph, node: Node): + def replace_op(self, graph: Graph, node: Node): expand_dims_nodes = list() expand_axis_node = Const(graph, dict(value=node.axis)).create_node([]) for ind, edge_attrs in node.in_edges().items(): expand_dims_nodes.append(ExpandDims(graph, dict(name=node.name + '/ExpandDims_')). create_node([(node.in_node(ind), edge_attrs['out']), expand_axis_node])) - out_node = Concat(graph, dict(name=node.name + '/Concat_', axis=node.axis)).create_node(expand_dims_nodes) + out_node = Concat(graph, dict(name=node.name + '/Concat_', + axis=node.axis, + in_ports_count=len(expand_dims_nodes))).create_node(expand_dims_nodes) # Replace edge from out port 0 of the matched node with a edge from node out_node.id with port 0. 
# The "explicit" version of the return value is: [(out_node.id, 0)]) return [out_node.id] diff --git a/model-optimizer/extensions/front/caffe/accum_ext.py b/model-optimizer/extensions/front/caffe/accum_ext.py index 9c185cd..1dc74c5 100644 --- a/model-optimizer/extensions/front/caffe/accum_ext.py +++ b/model-optimizer/extensions/front/caffe/accum_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/accum_ext_test.py b/model-optimizer/extensions/front/caffe/accum_ext_test.py index f67e745..ac65ad2 100644 --- a/model-optimizer/extensions/front/caffe/accum_ext_test.py +++ b/model-optimizer/extensions/front/caffe/accum_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/argmax_ext.py b/model-optimizer/extensions/front/caffe/argmax_ext.py index dc5f927..69946ea 100644 --- a/model-optimizer/extensions/front/caffe/argmax_ext.py +++ b/model-optimizer/extensions/front/caffe/argmax_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/argmax_ext_test.py b/model-optimizer/extensions/front/caffe/argmax_ext_test.py index 39547d1..7230844 100644 --- a/model-optimizer/extensions/front/caffe/argmax_ext_test.py +++ b/model-optimizer/extensions/front/caffe/argmax_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/axpy.py b/model-optimizer/extensions/front/caffe/axpy.py index e5f5759..88ef5c0 100644 --- a/model-optimizer/extensions/front/caffe/axpy.py +++ b/model-optimizer/extensions/front/caffe/axpy.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,10 +14,8 @@ limitations under the License. 
""" -import networkx as nx - from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.lin_op import Add from mo.ops.scale_shift import ScaleShiftOp @@ -29,7 +27,7 @@ class AxpyToEltwise(FrontReplacementOp): op = "Axpy" enabled = True - def replace_op(self, graph: nx.MultiDiGraph, node: Node): + def replace_op(self, graph: Graph, node: Node): in_node_0 = node.in_node(0) in_node_1 = node.in_node(1) in_node_2 = node.in_node(2) diff --git a/model-optimizer/extensions/front/caffe/axpy_test.py b/model-optimizer/extensions/front/caffe/axpy_test.py index 01e535c..6cd0bf2 100644 --- a/model-optimizer/extensions/front/caffe/axpy_test.py +++ b/model-optimizer/extensions/front/caffe/axpy_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/bias_ext.py b/model-optimizer/extensions/front/caffe/bias_ext.py new file mode 100644 index 0000000..8cce76b --- /dev/null +++ b/model-optimizer/extensions/front/caffe/bias_ext.py @@ -0,0 +1,37 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +from mo.front.caffe.extractors.utils import embed_input +from mo.front.extractor import FrontExtractorOp +from mo.graph.graph import Node +from mo.ops.lin_op import Add + + +class BiasToAdd(FrontExtractorOp): + """ + Replaces Bias layer with Eltwise. + """ + op = "Bias" + enabled = True + + @staticmethod + def extract(node: Node): + attrs = {'axis': node.pb.bias_param.axis} + embed_input(attrs, 1, 'bias', node.model_pb.blobs[0].data, 'biases') + + Add.update_node_stat(node, attrs) + + return __class__.enabled diff --git a/model-optimizer/extensions/front/caffe/bias_ext_test.py b/model-optimizer/extensions/front/caffe/bias_ext_test.py new file mode 100644 index 0000000..869aae8 --- /dev/null +++ b/model-optimizer/extensions/front/caffe/bias_ext_test.py @@ -0,0 +1,46 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +import unittest +from unittest.mock import patch + +from extensions.front.caffe.bias_ext import BiasToAdd +from mo.utils.unittest.extractors import FakeModelLayer, FakeMultiParam +from mo.utils.unittest.graph import FakeNode + + +class FakeBiasProtoLayer: + def __init__(self, val): + self.bias_param = val + + +class TestBias(unittest.TestCase): + + @patch('extensions.front.caffe.bias_ext.embed_input') + def test_bias(self, embed_input_mock): + embed_input_mock.return_value = {} + params = {'axis': 1} + add_node = FakeNode(FakeBiasProtoLayer(FakeMultiParam(params)), + FakeModelLayer([1, 2, 3, 4, 5])) + BiasToAdd.extract(add_node) + + exp_res = { + 'type': "Eltwise", + 'operation': 'sum', + 'axis': 1 + } + + for key in exp_res.keys(): + self.assertEqual(add_node[key], exp_res[key]) diff --git a/model-optimizer/extensions/front/caffe/binarization.py b/model-optimizer/extensions/front/caffe/binarization.py new file mode 100644 index 0000000..ba69573 --- /dev/null +++ b/model-optimizer/extensions/front/caffe/binarization.py @@ -0,0 +1,43 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import numpy as np + +from extensions.ops.quantize import QuantizeOp +from mo.front.common.replacement import FrontReplacementOp +from mo.graph.graph import Node, Graph +from mo.ops.const import Const + + +class BinarizationToQuantize(FrontReplacementOp): + """ + Replaces Binarization layer with Quantize. + """ + op = "Binarization" + enabled = True + + def replace_op(self, graph: Graph, node: Node): + in_node_0 = node.in_node(0) + + broadcast = lambda x: np.array([x], dtype=np.float32) + threshold = Const(graph, {'name': node.id + "/Input_1", "value": broadcast(0)}).create_node() + in_1 = threshold + in_2 = threshold + in_3 = Const(graph, {'name': node.id + "/Input_3", "value": broadcast(-1)}).create_node() + in_4 = Const(graph, {'name': node.id + "/Input_4", "value": broadcast(+1)}).create_node() + quant = QuantizeOp(graph, {'name': node.id + "/Quantize_", "levels": 2}).create_node( + inputs=[in_node_0, in_1, in_2, in_3, in_4]) + + return [quant.id] diff --git a/model-optimizer/extensions/front/caffe/binary_conv_ext.py b/model-optimizer/extensions/front/caffe/binary_conv_ext.py new file mode 100644 index 0000000..4ba74b5 --- /dev/null +++ b/model-optimizer/extensions/front/caffe/binary_conv_ext.py @@ -0,0 +1,55 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +from extensions.front.caffe.conv_ext import conv_create_attrs, conv_set_params +from mo.front.caffe.extractors.utils import weights_biases +from mo.front.common.extractors.utils import layout_attrs +from mo.front.extractor import FrontExtractorOp +from mo.ops.convolution import Convolution +from mo.utils.error import Error + + +class ConvFrontExtractor(FrontExtractorOp): + op = 'ConvolutionBinary' + enabled = True + + @staticmethod + def extract(node): + proto_layer, model_layer = node.pb, node.model_pb + + if not proto_layer: + raise Error('Protobuf layer can not be empty') + + conv_param = proto_layer.convolution_param + conv_type = 'ConvND' if len(proto_layer.bottom) > 1 else 'Conv2D' + + params = conv_set_params(conv_param, conv_type) + attrs = conv_create_attrs(params) + attrs.update({'op': __class__.op, + 'get_group': lambda node: node.group, + 'get_output_feature_dim': lambda node: node.output + }) + + # Embed weights and biases as attributes + # It will be moved to a separate nodes in special pass + attrs.update( + weights_biases(conv_param.bias_term, model_layer, start_index=len(proto_layer.bottom), proto=conv_param)) + attrs.update(layout_attrs()) + + # update the attributes of the node + Convolution.update_node_stat(node, attrs) + return __class__.enabled + diff --git a/model-optimizer/extensions/front/caffe/bn.py b/model-optimizer/extensions/front/caffe/bn.py index 06ad486..01e52f4 100644 --- a/model-optimizer/extensions/front/caffe/bn.py +++ b/model-optimizer/extensions/front/caffe/bn.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,12 +14,11 @@ limitations under the License. """ -import networkx as nx import numpy as np from mo.front.caffe.extractors.utils import embed_input from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.scale_shift import ScaleShiftOp from mo.utils.error import Error @@ -31,7 +30,7 @@ class BNToScaleShift(FrontReplacementOp): op = "BN" enabled = True - def replace_op(self, graph: nx.MultiDiGraph, node: Node): + def replace_op(self, graph: Graph, node: Node): attrs = {'name': node.id + "/ScaleShift_"} param = graph.node[node.id]['pb'].bn_param diff --git a/model-optimizer/extensions/front/caffe/bn_test.py b/model-optimizer/extensions/front/caffe/bn_test.py index f075e50..ac4ecac 100644 --- a/model-optimizer/extensions/front/caffe/bn_test.py +++ b/model-optimizer/extensions/front/caffe/bn_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/conv_ext.py b/model-optimizer/extensions/front/caffe/conv_ext.py index 8146917..dfd9ed6 100644 --- a/model-optimizer/extensions/front/caffe/conv_ext.py +++ b/model-optimizer/extensions/front/caffe/conv_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/caffe/conv_ext_test.py b/model-optimizer/extensions/front/caffe/conv_ext_test.py index 49c8b0b..22d7d00 100644 --- a/model-optimizer/extensions/front/caffe/conv_ext_test.py +++ b/model-optimizer/extensions/front/caffe/conv_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/correlation_ext.py b/model-optimizer/extensions/front/caffe/correlation_ext.py index c05e04c..066e973 100644 --- a/model-optimizer/extensions/front/caffe/correlation_ext.py +++ b/model-optimizer/extensions/front/caffe/correlation_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/correlation_ext_test.py b/model-optimizer/extensions/front/caffe/correlation_ext_test.py index de4b74c..3ee6006 100644 --- a/model-optimizer/extensions/front/caffe/correlation_ext_test.py +++ b/model-optimizer/extensions/front/caffe/correlation_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/ctcgreedydecoder_ext.py b/model-optimizer/extensions/front/caffe/ctcgreedydecoder_ext.py index 3073128..37c1f2e 100644 --- a/model-optimizer/extensions/front/caffe/ctcgreedydecoder_ext.py +++ b/model-optimizer/extensions/front/caffe/ctcgreedydecoder_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/ctcgreedydecoder_ext_test.py b/model-optimizer/extensions/front/caffe/ctcgreedydecoder_ext_test.py index 07b724e..a01f405 100644 --- a/model-optimizer/extensions/front/caffe/ctcgreedydecoder_ext_test.py +++ b/model-optimizer/extensions/front/caffe/ctcgreedydecoder_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/data_augmentation_ext.py b/model-optimizer/extensions/front/caffe/data_augmentation_ext.py index f7769e7..12f8ad7 100644 --- a/model-optimizer/extensions/front/caffe/data_augmentation_ext.py +++ b/model-optimizer/extensions/front/caffe/data_augmentation_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/caffe/data_augmentation_ext_test.py b/model-optimizer/extensions/front/caffe/data_augmentation_ext_test.py index 0dd0aba..4524ff8 100644 --- a/model-optimizer/extensions/front/caffe/data_augmentation_ext_test.py +++ b/model-optimizer/extensions/front/caffe/data_augmentation_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/detection_output.py b/model-optimizer/extensions/front/caffe/detection_output.py index 296fcf3..57f336a 100644 --- a/model-optimizer/extensions/front/caffe/detection_output.py +++ b/model-optimizer/extensions/front/caffe/detection_output.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -140,6 +140,8 @@ class DetectionOutputFrontExtractor(FrontExtractorOp): attrs['input_height'] = param.input_height if 'normalized' in fields: attrs['normalized'] = int(param.normalized) + if 'objectness_score' in fields: + attrs['objectness_score'] = param.objectness_score mapping_rule = merge_attrs(param, attrs) diff --git a/model-optimizer/extensions/front/caffe/flatten_ext.py b/model-optimizer/extensions/front/caffe/flatten_ext.py index a68d81c..19d4aef 100644 --- a/model-optimizer/extensions/front/caffe/flatten_ext.py +++ b/model-optimizer/extensions/front/caffe/flatten_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/grn_ext.py b/model-optimizer/extensions/front/caffe/grn_ext.py index 4b4cd97..57bf405 100644 --- a/model-optimizer/extensions/front/caffe/grn_ext.py +++ b/model-optimizer/extensions/front/caffe/grn_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/grn_ext_test.py b/model-optimizer/extensions/front/caffe/grn_ext_test.py index e284a8a..9eeba17 100644 --- a/model-optimizer/extensions/front/caffe/grn_ext_test.py +++ b/model-optimizer/extensions/front/caffe/grn_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/interp_ext.py b/model-optimizer/extensions/front/caffe/interp_ext.py index ae8a8da..9bfb33c 100644 --- a/model-optimizer/extensions/front/caffe/interp_ext.py +++ b/model-optimizer/extensions/front/caffe/interp_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/caffe/interp_ext_test.py b/model-optimizer/extensions/front/caffe/interp_ext_test.py index ecbf114..be17dcb 100644 --- a/model-optimizer/extensions/front/caffe/interp_ext_test.py +++ b/model-optimizer/extensions/front/caffe/interp_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/mvn_ext.py b/model-optimizer/extensions/front/caffe/mvn_ext.py index a34e007..cc4fb26 100644 --- a/model-optimizer/extensions/front/caffe/mvn_ext.py +++ b/model-optimizer/extensions/front/caffe/mvn_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/normalize_ext.py b/model-optimizer/extensions/front/caffe/normalize_ext.py index bc411b7..1202f3b 100644 --- a/model-optimizer/extensions/front/caffe/normalize_ext.py +++ b/model-optimizer/extensions/front/caffe/normalize_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/normalize_ext_test.py b/model-optimizer/extensions/front/caffe/normalize_ext_test.py index 4b2c42f..01d6f08 100644 --- a/model-optimizer/extensions/front/caffe/normalize_ext_test.py +++ b/model-optimizer/extensions/front/caffe/normalize_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/pooling_ext.py b/model-optimizer/extensions/front/caffe/pooling_ext.py index 96540a1..b48324e 100644 --- a/model-optimizer/extensions/front/caffe/pooling_ext.py +++ b/model-optimizer/extensions/front/caffe/pooling_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/pooling_ext_test.py b/model-optimizer/extensions/front/caffe/pooling_ext_test.py index f391d93..ec3e74f 100644 --- a/model-optimizer/extensions/front/caffe/pooling_ext_test.py +++ b/model-optimizer/extensions/front/caffe/pooling_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/caffe/power_file_ext.py b/model-optimizer/extensions/front/caffe/power_file_ext.py index cba120b..6284369 100644 --- a/model-optimizer/extensions/front/caffe/power_file_ext.py +++ b/model-optimizer/extensions/front/caffe/power_file_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/power_file_ext_test.py b/model-optimizer/extensions/front/caffe/power_file_ext_test.py index da06fc6..37b04ce 100644 --- a/model-optimizer/extensions/front/caffe/power_file_ext_test.py +++ b/model-optimizer/extensions/front/caffe/power_file_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/prelu_ext.py b/model-optimizer/extensions/front/caffe/prelu_ext.py index 40cff27..039ace3 100644 --- a/model-optimizer/extensions/front/caffe/prelu_ext.py +++ b/model-optimizer/extensions/front/caffe/prelu_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/prelu_ext_test.py b/model-optimizer/extensions/front/caffe/prelu_ext_test.py index 2ed4370..fb0a167 100644 --- a/model-optimizer/extensions/front/caffe/prelu_ext_test.py +++ b/model-optimizer/extensions/front/caffe/prelu_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/priorbox_clustered_ext.py b/model-optimizer/extensions/front/caffe/priorbox_clustered_ext.py index 68e98a4..959bdd1 100644 --- a/model-optimizer/extensions/front/caffe/priorbox_clustered_ext.py +++ b/model-optimizer/extensions/front/caffe/priorbox_clustered_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/priorbox_clustered_ext_test.py b/model-optimizer/extensions/front/caffe/priorbox_clustered_ext_test.py index 4ce3e32..8b02617 100644 --- a/model-optimizer/extensions/front/caffe/priorbox_clustered_ext_test.py +++ b/model-optimizer/extensions/front/caffe/priorbox_clustered_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/caffe/priorbox_ext.py b/model-optimizer/extensions/front/caffe/priorbox_ext.py index ae87dc4..c13e828 100644 --- a/model-optimizer/extensions/front/caffe/priorbox_ext.py +++ b/model-optimizer/extensions/front/caffe/priorbox_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/priorbox_ext_test.py b/model-optimizer/extensions/front/caffe/priorbox_ext_test.py index b93a883..23f46d9 100644 --- a/model-optimizer/extensions/front/caffe/priorbox_ext_test.py +++ b/model-optimizer/extensions/front/caffe/priorbox_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/proposal_ext.py b/model-optimizer/extensions/front/caffe/proposal_ext.py index 059e843..5ecfde5 100644 --- a/model-optimizer/extensions/front/caffe/proposal_ext.py +++ b/model-optimizer/extensions/front/caffe/proposal_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/proposal_ext_test.py b/model-optimizer/extensions/front/caffe/proposal_ext_test.py index ff41fb0..edb9f31 100644 --- a/model-optimizer/extensions/front/caffe/proposal_ext_test.py +++ b/model-optimizer/extensions/front/caffe/proposal_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/proposal_python_ext.py b/model-optimizer/extensions/front/caffe/proposal_python_ext.py index 364611b..3db451f 100644 --- a/model-optimizer/extensions/front/caffe/proposal_python_ext.py +++ b/model-optimizer/extensions/front/caffe/proposal_python_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/proposal_python_ext_test.py b/model-optimizer/extensions/front/caffe/proposal_python_ext_test.py index d47f2b7..4c3ac4d 100644 --- a/model-optimizer/extensions/front/caffe/proposal_python_ext_test.py +++ b/model-optimizer/extensions/front/caffe/proposal_python_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/caffe/psroipooling_ext.py b/model-optimizer/extensions/front/caffe/psroipooling_ext.py index 9dead6d..ffaef79 100644 --- a/model-optimizer/extensions/front/caffe/psroipooling_ext.py +++ b/model-optimizer/extensions/front/caffe/psroipooling_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/psroipooling_ext_test.py b/model-optimizer/extensions/front/caffe/psroipooling_ext_test.py index f175278..5da3c7d 100644 --- a/model-optimizer/extensions/front/caffe/psroipooling_ext_test.py +++ b/model-optimizer/extensions/front/caffe/psroipooling_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/regionyolo_ext.py b/model-optimizer/extensions/front/caffe/regionyolo_ext.py index ca28c14..22bde08 100644 --- a/model-optimizer/extensions/front/caffe/regionyolo_ext.py +++ b/model-optimizer/extensions/front/caffe/regionyolo_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/regionyolo_ext_test.py b/model-optimizer/extensions/front/caffe/regionyolo_ext_test.py index 8c37989..56e451f 100644 --- a/model-optimizer/extensions/front/caffe/regionyolo_ext_test.py +++ b/model-optimizer/extensions/front/caffe/regionyolo_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/reorgyolo_ext.py b/model-optimizer/extensions/front/caffe/reorgyolo_ext.py index d6ee374..57bc30b 100644 --- a/model-optimizer/extensions/front/caffe/reorgyolo_ext.py +++ b/model-optimizer/extensions/front/caffe/reorgyolo_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/reorgyolo_ext_test.py b/model-optimizer/extensions/front/caffe/reorgyolo_ext_test.py index 502c5ad..f5939de 100644 --- a/model-optimizer/extensions/front/caffe/reorgyolo_ext_test.py +++ b/model-optimizer/extensions/front/caffe/reorgyolo_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/caffe/resample_ext.py b/model-optimizer/extensions/front/caffe/resample_ext.py index 8e8bcb5..84f72e5 100644 --- a/model-optimizer/extensions/front/caffe/resample_ext.py +++ b/model-optimizer/extensions/front/caffe/resample_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/resample_ext_test.py b/model-optimizer/extensions/front/caffe/resample_ext_test.py index c1fc3d6..3e56de7 100644 --- a/model-optimizer/extensions/front/caffe/resample_ext_test.py +++ b/model-optimizer/extensions/front/caffe/resample_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/shufflechannel_ext.py b/model-optimizer/extensions/front/caffe/shufflechannel_ext.py index 37b7221..81ffcf8 100644 --- a/model-optimizer/extensions/front/caffe/shufflechannel_ext.py +++ b/model-optimizer/extensions/front/caffe/shufflechannel_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/simplernms_ext.py b/model-optimizer/extensions/front/caffe/simplernms_ext.py index 2d9cbaf..5ad9979 100644 --- a/model-optimizer/extensions/front/caffe/simplernms_ext.py +++ b/model-optimizer/extensions/front/caffe/simplernms_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/simplernms_ext_test.py b/model-optimizer/extensions/front/caffe/simplernms_ext_test.py index 06b298b..8ce238c 100644 --- a/model-optimizer/extensions/front/caffe/simplernms_ext_test.py +++ b/model-optimizer/extensions/front/caffe/simplernms_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/softmax_ext.py b/model-optimizer/extensions/front/caffe/softmax_ext.py index 6bb8d74..972c113 100644 --- a/model-optimizer/extensions/front/caffe/softmax_ext.py +++ b/model-optimizer/extensions/front/caffe/softmax_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/caffe/spatial_transformer_ext.py b/model-optimizer/extensions/front/caffe/spatial_transformer_ext.py index fc27ded..842b2a1 100644 --- a/model-optimizer/extensions/front/caffe/spatial_transformer_ext.py +++ b/model-optimizer/extensions/front/caffe/spatial_transformer_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/spatial_transformer_ext_test.py b/model-optimizer/extensions/front/caffe/spatial_transformer_ext_test.py index 9039cda..8747867 100644 --- a/model-optimizer/extensions/front/caffe/spatial_transformer_ext_test.py +++ b/model-optimizer/extensions/front/caffe/spatial_transformer_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/caffe/split_to_identity.py b/model-optimizer/extensions/front/caffe/split_to_identity.py index d46c1c3..189139b 100644 --- a/model-optimizer/extensions/front/caffe/split_to_identity.py +++ b/model-optimizer/extensions/front/caffe/split_to_identity.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,9 +14,8 @@ limitations under the License. """ -import networkx as nx - from mo.front.common.replacement import FrontReplacementOp +from mo.graph.graph import Graph class SplitToIdentity(FrontReplacementOp): @@ -31,7 +30,7 @@ class SplitToIdentity(FrontReplacementOp): op = "Split" enabled = True - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): split_node = match['op'] split_node.op = 'Identity' for u, v, edge_attrs in split_node.graph.out_edges(split_node.id, data=True): diff --git a/model-optimizer/extensions/front/create_tensor_nodes.py b/model-optimizer/extensions/front/create_tensor_nodes.py new file mode 100644 index 0000000..2417e91 --- /dev/null +++ b/model-optimizer/extensions/front/create_tensor_nodes.py @@ -0,0 +1,34 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +from mo.front.common.replacement import FrontReplacementPattern +from mo.front.extractor import create_tensor_nodes +from mo.graph.graph import Graph + + +class CreateTensorNodes(FrontReplacementPattern): + enabled = True + force_clean_up = True + + def run_before(self): + return [] + + def run_after(self): + from extensions.front.pass_separator import FrontFinish + return [FrontFinish] + + def find_and_replace_pattern(self, graph: Graph): + create_tensor_nodes(graph) + graph.stage = 'middle' diff --git a/model-optimizer/mo/ops/div.py b/model-optimizer/extensions/front/div.py similarity index 54% rename from model-optimizer/mo/ops/div.py rename to model-optimizer/extensions/front/div.py index 4f39e4c..9509d79 100644 --- a/model-optimizer/mo/ops/div.py +++ b/model-optimizer/extensions/front/div.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,10 +15,9 @@ """ import numpy as np -import networkx as nx from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.eltwise import Eltwise from mo.ops.power import Power @@ -27,13 +26,15 @@ class Div(FrontReplacementOp): op = "Div" enabled = True - def replace_op(self, graph: nx.MultiDiGraph, node: Node): - reciprocal = Power(graph, dict(scale=1, power=np.float64(-1), shift=0, name=node.name + '/reciprocal_')) - mul = Eltwise(graph, dict(operation='mul', name=node.name + '/mul_')) + def replace_op(self, graph: Graph, node: Node): + reciprocal = Power(graph, {'scale': 1, 'power': np.float64(-1), 'shift': 0, + 'name': node.name + '/reciprocal_'}).create_node() + mul = Eltwise(graph, {'operation': 'mul', 'name': node.name + '/mul_'}).create_node() + + # Connect nodes + node.in_port(1).get_connection().set_destination(reciprocal.in_port(0)) + node.in_port(0).get_connection().set_destination(mul.in_port(1)) + reciprocal.out_port(0).connect(mul.in_port(0)) - out_node = mul.create_node([(node.in_node(0), node.in_edge(0)['out']), - reciprocal.create_node([(node.in_node(1), node.in_edge(1)['out'])]) - ]) - # Replace edge from out port 0 of the matched node with a edge from node out_node.id with port 0. # The "explicit" version of the return value is: [(out_node.id, 0)]) - return [out_node.id] + return [mul.id] diff --git a/model-optimizer/extensions/front/div_test.py b/model-optimizer/extensions/front/div_test.py new file mode 100644 index 0000000..50ec3e8 --- /dev/null +++ b/model-optimizer/extensions/front/div_test.py @@ -0,0 +1,98 @@ +""" + Copyright (c) 2018-2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +import unittest + +import numpy as np + +from extensions.front.div import Div +from mo.utils.unittest.graph import build_graph, compare_graphs + +nodes_attributes = { + 'placeholder_1': {'shape': None, 'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, + 'placeholder_2': {'shape': None, 'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, + # Div operation + 'Div': {'kind': 'op', 'op': 'Div'}, + # Test operation + 'last': {'type': None, 'value': None, 'kind': 'op', 'op': None}, + # Add and Power operations + 'power_1': {'scale': None, 'power': None, 'shift': None, 'type': 'Power', 'kind': 'op', 'op': 'Power'}, + 'mul_1': {'value': None, 'type': 'Eltwise', 'kind': 'op', 'op': 'Mul'}, +} + + +class TestDiv(unittest.TestCase): + def test_div_test_1(self): + # Test with two different inputs from two placeholders + graph = build_graph(nodes_attributes, + [('placeholder_1', 'Div'), + ('placeholder_2', 'Div'), + ('Div', 'last') + ], + {'placeholder_1': {'shape': np.array([1, 227, 227, 3])}, + 'placeholder_2': {'shape': np.array([1, 227, 227, 3])}, + }, nodes_with_edges_only=True) + + graph_ref = build_graph(nodes_attributes, + [('placeholder_2', 'power_1'), + ('power_1', 'mul_1'), + ('placeholder_1', 'mul_1'), + ('mul_1', 'last'), + ], + {'placeholder_1': {'shape': np.array([1, 227, 227, 3])}, + 'placeholder_2': {'shape': np.array([1, 227, 227, 3])}, + 'power_1': {'scale': np.array(1), 'power': np.array(-1), 'shift': np.array(0), + 'type': 'Power'}, + 'mul_1': {'type': 'Eltwise', 'op': 'Mul'}, + }, nodes_with_edges_only=True) + + graph.stage = 'front' + + tested_class = Div() + tested_class.find_and_replace_pattern(graph) + + (flag, resp) = compare_graphs(graph, graph_ref, 'last', check_op_attrs=True) + self.assertTrue(flag, resp) + + def test_div_test_2(self): + # Test with two same inputs from one placeholder + graph = build_graph(nodes_attributes, + [('placeholder_1', 'Div'), + ('placeholder_1', 'Div'), + ('Div', 'last') + ], + {'placeholder_1': {'shape': np.array([1, 227, 227, 3])}, + }, nodes_with_edges_only=True) + + graph_ref = build_graph(nodes_attributes, + [('power_1', 'mul_1'), + ('placeholder_1', 'mul_1'), + ('placeholder_1', 'power_1'), + ('mul_1', 'last'), + ], + {'placeholder_1': {'shape': np.array([1, 227, 227, 3])}, + 'power_1': {'scale': np.array(1), 'power': np.array(-1), 'shift': np.array(0), + 'type': 'Power'}, + 'mul_1': {'type': 'Eltwise', 'op': 'Mul'}, + }, nodes_with_edges_only=True) + + graph.stage = 'front' + + tested_class = Div() + tested_class.find_and_replace_pattern(graph) + + (flag, resp) = compare_graphs(graph, graph_ref, 'last', check_op_attrs=True) + self.assertTrue(flag, resp) diff --git a/model-optimizer/extensions/front/eltwise_n.py b/model-optimizer/extensions/front/eltwise_n.py index f1a42cb..7501e26 100644 --- a/model-optimizer/extensions/front/eltwise_n.py +++ b/model-optimizer/extensions/front/eltwise_n.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -17,7 +17,7 @@ import networkx as nx from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.eltwise import Eltwise @@ -29,7 +29,7 @@ class EltwiseNReplacement(FrontReplacementOp): op = 'EltwiseN' enabled = True - def replace_op(self, graph: nx.MultiDiGraph, node: Node): + def replace_op(self, graph: Graph, node: Node): out_node = node.in_node(0) operation = node.operation for ind in range(1, len(node.in_nodes())): diff --git a/model-optimizer/extensions/front/eltwise_n_test.py b/model-optimizer/extensions/front/eltwise_n_test.py index 33cedbd..c0e1ad1 100644 --- a/model-optimizer/extensions/front/eltwise_n_test.py +++ b/model-optimizer/extensions/front/eltwise_n_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/freeze_placeholder_value.py b/model-optimizer/extensions/front/freeze_placeholder_value.py index 2775738..cda5a95 100644 --- a/model-optimizer/extensions/front/freeze_placeholder_value.py +++ b/model-optimizer/extensions/front/freeze_placeholder_value.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,11 +16,10 @@ import logging as log -import networkx as nx import numpy as np from mo.front.common.replacement import FrontReplacementSubgraph -from mo.graph.graph import erase_node +from mo.graph.graph import Graph from mo.middle.passes.convert_data_type import SUPPORTED_DATA_TYPES from mo.ops.const import Const from mo.utils.error import Error @@ -28,13 +27,19 @@ from mo.utils.error import Error class FreezePlaceholderValue(FrontReplacementSubgraph): """ - Replaces existing placeholder to Constant node with provided value. It takes value from raplacement_dict as string - and casts it to actual node data type - :param replacement_dict: dictionary with node names as keys and strings as values + Replaces an existing Placeholder with a Const node holding the provided value. The value is taken from
freeze_placeholder as + a string and cast to the actual node data type """ + enabled = True + graph_condition = [lambda graph: graph.graph['freeze_placeholder'] is not None] - enabled = False - replacement_dict = dict() + def run_after(self): + from extensions.front.restore_ports import RestorePorts + return [RestorePorts] + + def run_before(self): + from extensions.front.pass_separator import FrontStart + return [FrontStart] @staticmethod def pattern(): @@ -43,15 +48,15 @@ class FreezePlaceholderValue(FrontReplacementSubgraph): edges=[] ) - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): ph = match['placeholder'] - if ph.name in self.replacement_dict: + if ph.name in graph.graph['freeze_placeholder']: name = ph.name if ph.has_and_set('data_type'): data_type = ph.data_type else: data_type = SUPPORTED_DATA_TYPES[graph.graph['cmd_params'].data_type][0] - string_value = self.replacement_dict[name] + string_value = graph.graph['freeze_placeholder'][name] try: if data_type != np.bool: value = np.array(string_value, dtype=data_type) @@ -76,7 +81,7 @@ class FreezePlaceholderValue(FrontReplacementSubgraph): new_node = Const(graph).create_node( attrs={'value': value, 'data_type': type(value), 'name': name + '/const_placeholder', 'shape': ph.shape}) - erase_node(ph) + graph.erase_node(ph) graph.add_edges_from([(new_node.id, v, attrs) for u, v, attrs in out_edges]) log.info("Placeholder node \"{}\" was replaced with Const node \"{}\" with value \"{}\"".format( name, new_node.name, value)) diff --git a/model-optimizer/extensions/front/freeze_placeholder_value_test.py b/model-optimizer/extensions/front/freeze_placeholder_value_test.py index 5c23291..1eeb535 100644 --- a/model-optimizer/extensions/front/freeze_placeholder_value_test.py +++ b/model-optimizer/extensions/front/freeze_placeholder_value_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
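In the updated FreezePlaceholderValue above, the user-supplied string from graph.graph['freeze_placeholder'] is cast to the placeholder's data type via np.array(string_value, dtype=data_type). A stand-alone illustration of just that casting step (the value and dtype here are made up for the example):

import numpy as np

freeze_placeholder = {'input1': '1.5'}  # user-provided string value
data_type = np.float32                  # the placeholder's data type
value = np.array(freeze_placeholder['input1'], dtype=data_type)
print(value, value.dtype)               # 1.5 float32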
@@ -28,8 +28,8 @@ nodes_bool = { '3': {'name': 'node_2', 'kind': 'op', 'op': 'NotPlaceholder'}, '4': {'name': 'node_3', 'kind': 'op', 'op': 'NotPlaceholder'}, '5': {'name': 'node_4', 'kind': 'op', 'op': 'NotPlaceholder'}, - '6': {'name': 'output1', 'kind': 'op', 'op': 'OpOutput', 'is_output': True}, - '7': {'name': 'output2', 'kind': 'op', 'op': 'OpOutput', 'is_output': True} + '6': {'name': 'output1', 'kind': 'op', 'op': 'OpOutput', 'type': 'OpOutput'}, + '7': {'name': 'output2', 'kind': 'op', 'op': 'OpOutput', 'type': 'OpOutput'} } edges = { @@ -46,7 +46,7 @@ class TestFreezePlaceholderValue(unittest.TestCase): graph = build_graph(nodes_bool, edges) graph.graph['fw'] = 'tf' tested_class = FreezePlaceholderValue() - tested_class.replacement_dict = {'input1': 'True'} + graph.graph['freeze_placeholder'] = {'input1': 'True'} before_pattern = graph.nodes() tested_class.find_and_replace_pattern(graph=graph) after_pattern = graph.nodes() @@ -65,7 +65,7 @@ class TestFreezePlaceholderValue(unittest.TestCase): graph = build_graph(nodes_bool, edges) graph.graph['fw'] = 'tf' tested_class = FreezePlaceholderValue() - tested_class.replacement_dict = {'input1': 'False'} + graph.graph['freeze_placeholder'] = {'input1': 'False'} before_pattern = graph.nodes() tested_class.find_and_replace_pattern(graph=graph) after_pattern = graph.nodes() @@ -84,7 +84,7 @@ class TestFreezePlaceholderValue(unittest.TestCase): graph = build_graph(nodes_bool, edges) graph.graph['fw'] = 'tf' tested_class = FreezePlaceholderValue() - tested_class.replacement_dict = {'input1': 'False', 'input2': 'True'} + graph.graph['freeze_placeholder'] = {'input1': 'False', 'input2': 'True'} before_pattern = graph.nodes() tested_class.find_and_replace_pattern(graph=graph) after_pattern = graph.nodes() diff --git a/model-optimizer/extensions/front/image_scaler.py b/model-optimizer/extensions/front/image_scaler.py index c034256..8ec13c6 100644 --- a/model-optimizer/extensions/front/image_scaler.py +++ b/model-optimizer/extensions/front/image_scaler.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,10 +14,10 @@ limitations under the License. 
""" -import networkx as nx import numpy as np from mo.front.common.replacement import FrontReplacementOp +from mo.graph.graph import Graph from mo.ops.const import Const from mo.ops.lin_op import Mul, Add @@ -26,7 +26,7 @@ class ImageScaler(FrontReplacementOp): op = "ImageScaler" enabled = True - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): # This replacer replace ImageScalar operation to Mul->Add sequence # Also it check that weights and biases are good op = match['op'] @@ -38,28 +38,24 @@ class ImageScaler(FrontReplacementOp): if all([x == 0 for x in np.nditer(op.bias)]): has_bias = False - # Get all outputs for op node - out_nodes = [node for node in op.out_nodes().values()] + assert len(op.in_ports()) == 1 - assert len(op.in_nodes()) == 1 + last_port = op.in_port(0).get_source() - last_node = op.in_node() # Create Mul & Add nodes if has_weights: - mul_weights = Const(graph, dict(value=op.scale, shape=op.scale.shape)) - mul_op = Mul(graph, dict(name=op.id + '/mul_')) - last_node = mul_op.create_node(inputs=[last_node, mul_weights.create_node()]) + mul_weights = Const(graph, dict(value=op.scale, shape=op.scale.shape)).create_node() + mul_op = Mul(graph, dict(name=op.id + '/mul_')).create_node() + op.in_port(0).get_connection().set_destination(mul_op.in_port(0)) + mul_weights.out_port(0).connect(mul_op.in_port(1)) + last_port = mul_op.out_port(0) if has_bias: - add_bias = Const(graph, dict(value=op.bias, shape=op.bias.shape)) - add_op = Add(graph, dict(name=op.id + '/add_')) - last_node = add_op.create_node(inputs=[last_node, add_bias.create_node()]) - - # Move edges from ImageScaler to last_node (Mul or Add) - for out_node in out_nodes: - edge_attrs = graph.get_edge_data(op.id, out_node.id)[0] - graph.remove_edge(op.id, out_node.id) - graph.add_edges_from([(last_node.id, out_node.id, edge_attrs)]) - - # Disconnect ImageScalar node - graph.remove_edge(op.in_node().id, op.id) + add_bias = Const(graph, dict(value=op.bias, shape=op.bias.shape)).create_node() + add_op = Add(graph, dict(name=op.id + '/add_')).create_node() + last_port.get_connection().set_destination(add_op.in_port(0)) + add_bias.out_port(0).connect(add_op.in_port(1)) + last_port = add_op.out_port(0) + + op.in_port(0).disconnect() + op.out_port(0).get_connection().set_source(last_port) diff --git a/model-optimizer/extensions/front/image_scaler_test.py b/model-optimizer/extensions/front/image_scaler_test.py index 2c4ec90..40d7aea 100644 --- a/model-optimizer/extensions/front/image_scaler_test.py +++ b/model-optimizer/extensions/front/image_scaler_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -32,16 +32,59 @@ nodes_attributes = { 'last_data': {'value': None, 'shape': None, 'kind': 'data'}, # Mul and Add operations 'mul_1': {'type': None, 'value': None, 'kind': 'op', 'op': 'Mul'}, - 'mul_1_w': {'value': None, 'shape': None, 'kind': 'op', 'op': 'Const'}, + 'const_mul_1_w': {'type': None, 'value': None, 'kind': 'op', 'op': 'Const'}, + 'mul_1_w': {'value': None, 'shape': None, 'kind': 'data'}, 'mul_1_data': {'value': None, 'shape': None, 'kind': 'data'}, 'add_1': {'type': None, 'value': None, 'kind': 'op', 'op': 'Add'}, - 'add_1_w': {'value': None, 'shape': None, 'kind': 'op', 'op': 'Const'}, + 'const_add_1_w': {'type': None, 'value': None, 'kind': 'op', 'op': 'Const'}, + 'add_1_w': {'value': None, 'shape': None, 'kind': 'data'}, 'add_1_data': {'value': None, 'shape': None, 'kind': 'data'}, } class ImageScalerTest(unittest.TestCase): - def test_image_scaler_test1(self): + # Tests for MIDDLE stage + # Graph with Mul and Add operations + def test_image_scaler_test_1(self): + graph = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'im_scaler'), + ('im_scaler', 'im_scaler_data'), + ('im_scaler_data', 'last'), + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'im_scaler': {'scale': np.array(2.0), 'bias': np.reshape(np.array([1, 2, 3]), [3, 1, 1])}, + }, nodes_with_edges_only=True) + + graph_ref = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), + ('mul_1_w', 'mul_1'), + ('mul_1', 'mul_1_data'), + ('mul_1_data', 'add_1'), + ('const_add_1_w', 'add_1_w'), + ('add_1_w', 'add_1'), + ('add_1', 'add_1_data'), + ('add_1_data', 'last') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array(2.0).shape, 'value': np.array(2.0)}, + 'const_add_1_w': {'shape': np.array([3, 1, 1]), + 'value': np.reshape(np.array([1, 2, 3]), [3, 1, 1])}, + }, nodes_with_edges_only=True) + + graph.graph['layout'] = 'NCHW' + graph.stage = 'middle' + + replacer = ImageScaler() + replacer.find_and_replace_pattern(graph) + + (flag, resp) = compare_graphs(graph, graph_ref, 'last') + self.assertTrue(flag, resp) + + # Graph with Add operation + def test_image_scaler_test_2(self): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'im_scaler'), @@ -55,16 +98,18 @@ class ImageScalerTest(unittest.TestCase): graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'add_1'), + ('const_add_1_w', 'add_1_w'), ('add_1_w', 'add_1'), ('add_1', 'add_1_data'), ('add_1_data', 'last') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, - 'add_1_w': {'shape': np.array([3, 1, 1]), - 'value': np.reshape(np.array([1, 2, 3]), [3, 1, 1])}, + 'const_add_1_w': {'shape': np.array([3, 1, 1]), + 'value': np.reshape(np.array([1, 2, 3]), [3, 1, 1])}, }, nodes_with_edges_only=True) graph.graph['layout'] = 'NCHW' + graph.stage = 'middle' replacer = ImageScaler() replacer.find_and_replace_pattern(graph) @@ -72,7 +117,8 @@ class ImageScalerTest(unittest.TestCase): (flag, resp) = compare_graphs(graph, graph_ref, 'last') self.assertTrue(flag, resp) - def test_image_scaler_test2(self): + # Graph with Mul operation + def test_image_scaler_test_3(self): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'im_scaler'), @@ -86,15 +132,161 @@ class ImageScalerTest(unittest.TestCase): 
graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'last') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, - 'mul_1_w': {'shape': np.array(2.0).shape, 'value': np.array(2.0)}, + 'const_mul_1_w': {'shape': np.array(2.0).shape, 'value': np.array(2.0)}, + }, nodes_with_edges_only=True) + + graph.graph['layout'] = 'NCHW' + graph.stage = 'middle' + + replacer = ImageScaler() + replacer.find_and_replace_pattern(graph) + + (flag, resp) = compare_graphs(graph, graph_ref, 'last') + self.assertTrue(flag, resp) + + # Graph without Mul and Add operations + def test_image_scaler_test_4(self): + graph = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'im_scaler'), + ('im_scaler', 'im_scaler_data'), + ('im_scaler_data', 'last'), + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'im_scaler_data': {'shape': np.array([1, 227, 227, 3])}, + 'im_scaler': {'scale': np.array(1.0), 'bias': np.reshape(np.array([0, 0, 0]), [3, 1, 1])}, + }, nodes_with_edges_only=True) + + graph_ref = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'last') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + }, nodes_with_edges_only=True) + + graph.graph['layout'] = 'NCHW' + graph.stage = 'middle' + + replacer = ImageScaler() + replacer.find_and_replace_pattern(graph) + + (flag, resp) = compare_graphs(graph, graph_ref, 'last') + self.assertTrue(flag, resp) + + # Tests for FRONT stage + # Graph with Mul and Add operations + def test_image_scaler_test_5(self): + graph = build_graph(nodes_attributes, + [('placeholder_1', 'im_scaler'), + ('im_scaler', 'last'), + ], + {'placeholder_1': {'shape': np.array([1, 227, 227, 3])}, + 'im_scaler': {'scale': np.array(2.0), 'bias': np.reshape(np.array([1, 2, 3]), [3, 1, 1])}, + }, nodes_with_edges_only=True) + + graph_ref = build_graph(nodes_attributes, + [('placeholder_1', 'mul_1'), + ('const_mul_1_w', 'mul_1'), + ('mul_1', 'add_1'), + ('const_add_1_w', 'add_1'), + ('add_1', 'last') + ], + {'placeholder_1': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array(2.0).shape, 'value': np.array(2.0)}, + 'const_add_1_w': {'shape': np.array([3, 1, 1]), + 'value': np.reshape(np.array([1, 2, 3]), [3, 1, 1])}, + }, nodes_with_edges_only=True) + + graph.graph['layout'] = 'NCHW' + graph.stage = 'front' + + replacer = ImageScaler() + replacer.find_and_replace_pattern(graph) + + (flag, resp) = compare_graphs(graph, graph_ref, 'last') + self.assertTrue(flag, resp) + + # Graph with Add operation + def test_image_scaler_test_6(self): + graph = build_graph(nodes_attributes, + [('placeholder_1', 'im_scaler'), + ('im_scaler', 'last'), + ], + {'placeholder_1': {'shape': np.array([1, 227, 227, 3])}, + 'im_scaler': {'scale': np.array(1.0), 'bias': np.reshape(np.array([1, 2, 3]), [3, 1, 1])}, + }, nodes_with_edges_only=True) + + graph_ref = build_graph(nodes_attributes, + [('placeholder_1', 'add_1'), + ('const_add_1_w', 'add_1'), + ('add_1', 'last') + ], + {'placeholder_1': {'shape': np.array([1, 227, 227, 3])}, + 'const_add_1_w': {'shape': np.array([3, 1, 1]), + 'value': np.reshape(np.array([1, 2, 3]), [3, 1, 1])}, + }, nodes_with_edges_only=True) + + graph.graph['layout'] = 'NCHW' + graph.stage = 'front' + + replacer = ImageScaler() + replacer.find_and_replace_pattern(graph) + + 
(flag, resp) = compare_graphs(graph, graph_ref, 'last') + self.assertTrue(flag, resp) + + # Graph with Mul operation + def test_image_scaler_test_7(self): + graph = build_graph(nodes_attributes, + [('placeholder_1', 'im_scaler'), + ('im_scaler', 'last'), + ], + {'placeholder_1': {'shape': np.array([1, 227, 227, 3])}, + 'im_scaler': {'scale': np.array(2.0), 'bias': np.reshape(np.array([0, 0, 0]), [3, 1, 1])}, + }, nodes_with_edges_only=True) + + graph_ref = build_graph(nodes_attributes, + [('placeholder_1', 'mul_1'), + ('const_mul_1_w', 'mul_1'), + ('mul_1', 'last') + ], + {'placeholder_1': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array(2.0).shape, 'value': np.array(2.0)}, + }, nodes_with_edges_only=True) + + graph.graph['layout'] = 'NCHW' + graph.stage = 'front' + + replacer = ImageScaler() + replacer.find_and_replace_pattern(graph) + + (flag, resp) = compare_graphs(graph, graph_ref, 'last') + self.assertTrue(flag, resp) + + # Graph without Mul and Add operations + def test_image_scaler_test_8(self): + graph = build_graph(nodes_attributes, + [('placeholder_1', 'im_scaler'), + ('im_scaler', 'last'), + ], + {'placeholder_1': {'shape': np.array([1, 227, 227, 3])}, + 'im_scaler': {'scale': np.array(1.0), 'bias': np.reshape(np.array([0, 0, 0]), [3, 1, 1])}, + }, nodes_with_edges_only=True) + + graph_ref = build_graph(nodes_attributes, + [('placeholder_1', 'last') + ], + {'placeholder_1': {'shape': np.array([1, 227, 227, 3])}, }, nodes_with_edges_only=True) graph.graph['layout'] = 'NCHW' + graph.stage = 'front' replacer = ImageScaler() replacer.find_and_replace_pattern(graph) diff --git a/model-optimizer/extensions/front/input_cut.py b/model-optimizer/extensions/front/input_cut.py new file mode 100644 index 0000000..66e4829 --- /dev/null +++ b/model-optimizer/extensions/front/input_cut.py @@ -0,0 +1,33 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +from mo.front.common.replacement import FrontReplacementPattern +from mo.front.extractor import add_input_ops +from mo.graph.graph import Graph + + +class InputCut(FrontReplacementPattern): + enabled = True + force_clean_up = True + + def run_after(self): + from extensions.front.output_cut import OutputCut + return [OutputCut] + + def run_before(self): + return [] + + def find_and_replace_pattern(self, graph: Graph): + add_input_ops(graph, graph.graph['user_shapes'], True) diff --git a/model-optimizer/extensions/front/instance_normalization.py b/model-optimizer/extensions/front/instance_normalization.py index abcc1e9..c80c65b 100644 --- a/model-optimizer/extensions/front/instance_normalization.py +++ b/model-optimizer/extensions/front/instance_normalization.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
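New front passes in this patch, such as InputCut above and CreateTensorNodes earlier, declare their position through run_after()/run_before() instead of a hard-coded pipeline order. The sketch below shows one way such constraints resolve to an execution order, using Python's graphlib as a stand-in scheduler; the Model Optimizer's actual scheduler is not part of this patch, and the FrontFinish-after-InputCut link is an assumption added to complete the chain:

from graphlib import TopologicalSorter

# each key runs after every pass in its set:
# InputCut.run_after() -> [OutputCut]; CreateTensorNodes.run_after() -> [FrontFinish]
constraints = {
    'OutputCut': set(),
    'InputCut': {'OutputCut'},
    'FrontFinish': {'InputCut'},          # assumed link, not stated in the patch
    'CreateTensorNodes': {'FrontFinish'},
}
print(list(TopologicalSorter(constraints).static_order()))
# ['OutputCut', 'InputCut', 'FrontFinish', 'CreateTensorNodes']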
@@ -17,7 +17,7 @@ import networkx as nx from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.lin_op import Add, Mul from extensions.ops.mvn import MVN @@ -30,7 +30,7 @@ class InstanceNormalization(FrontReplacementOp): op = "InstanceNormalization" enabled = True - def replace_op(self, graph: nx.MultiDiGraph, node: Node): + def replace_op(self, graph: Graph, node: Node): prefix = node.name + '/InstanceNormalization' mvn = MVN(graph, dict( name=prefix + '/MVN', diff --git a/model-optimizer/extensions/front/instance_normalization_test.py b/model-optimizer/extensions/front/instance_normalization_test.py index 90dbe1b..bdfcd55 100644 --- a/model-optimizer/extensions/front/instance_normalization_test.py +++ b/model-optimizer/extensions/front/instance_normalization_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/kaldi/__init__.py b/model-optimizer/extensions/front/kaldi/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/model-optimizer/extensions/front/kaldi/add_permute_after_convolution.py b/model-optimizer/extensions/front/kaldi/add_permute_after_convolution.py new file mode 100644 index 0000000..72b1f0c --- /dev/null +++ b/model-optimizer/extensions/front/kaldi/add_permute_after_convolution.py @@ -0,0 +1,111 @@ +""" + Copyright (c) 2018-2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +from collections import deque + +import numpy as np + +from extensions.front.kaldi.add_reshape_around_convolution import ReplaceConvolutionReshape +from extensions.middle.TensorIteratorMerge import op_type +from mo.front.common.replacement import FrontReplacementSubgraph +from mo.graph.graph import Node, Graph +from mo.ops.permute import Permute + + +class ReplaceConvolutionPermute(FrontReplacementSubgraph): + """ + This pass adds a Permute after a Convolution layer when the Convolution is followed by a sequence of Pooling and/or Activation layers + **IMPORTANT**: This pass must run after inserting Reshapes around Poolings and Convolutions + For example, suppose we have the following graph: + + Convolution -> [Pooling | Activation -> Pooling | Pooling -> Activation | Activation]* -> ... -> (ScaleShift | FullyConnected) + + **NOTE**: Remember that Reshapes are inserted around Poolings and Convolutions; + they are omitted in this example for simplicity. + **NOTE**: The sequence [Pooling | Activation -> Pooling | Pooling -> Activation | Activation]* after the Convolution is optional + + This pass converts such a graph to the following one: + + Convolution -> * -> Permute (order 0, 3, 2, 1) -> Next_Layer -> ...
-> (ScaleShift|FullyConnected) + + """ + enabled = True + + def pattern(self): + return dict( + nodes=[ + ('target_node', dict(op=lambda x: x in ['ScaleShift', 'FullyConnected'])) + ], + edges=[] + ) + + def replace_sub_graph(self, graph: Graph, match: dict): + target_node = match['target_node'] + nodes_with_weights = self.dfs(graph, target_node.name, ('Convolution', 'FullyConnected', 'ScaleShift'), True) + convolution_nodes = [node for node in nodes_with_weights if Node(graph, node).op == 'Convolution'] + for convolution_node in convolution_nodes: + target_node = self.search_target_node(Node(graph, convolution_node)) + permute_op = Permute(graph, {'order': np.array([0, 3, 2, 1])}) + permute_node = permute_op.add_node({'name': '{}/Permute'.format(target_node.name)}) + target_node.insert_node_after( permute_node, 0) + + def run_after(self): + from extensions.front.kaldi.add_reshape_around_pooling import ReplacePoolingReshape + return [ReplaceConvolutionReshape, ReplacePoolingReshape] + + @staticmethod + def search_target_node(node: Node): + target_node = ReplaceConvolutionPermute.skip_reshapes(node) + sequence_layers = ['Pooling', 'Activation'] + if target_node.op not in sequence_layers: + return node + if target_node.op == 'Activation': + sequence_layers.reverse() + if target_node.op == sequence_layers[0]: + next_node = ReplaceConvolutionPermute.skip_reshapes(target_node) + if next_node.op == sequence_layers[1]: + target_node = next_node + + return target_node + + @staticmethod + def skip_reshapes(node: Node): + next_node = node.out_node() + while next_node.op == 'Reshape': + next_node = next_node.out_node() + return next_node + + @staticmethod + def dfs(graph: Graph, node_name: str, stop_nodes: tuple, reverse: bool = False) -> list: + d = deque() + res = [] + visited = set() + visited.add(node_name) + d.appendleft(node_name) + while len(d) != 0: + cur_node = d.popleft() + if reverse: + nodes = graph.in_edges(cur_node) + else: + nodes = graph.out_edges(cur_node) + for in_node_name, _ in nodes: + if in_node_name not in visited: + if op_type(graph, in_node_name) not in stop_nodes: + visited.add(in_node_name) + d.append(in_node_name) + else: + res.append(in_node_name) + return res diff --git a/model-optimizer/extensions/front/kaldi/add_permute_after_convolution_test.py b/model-optimizer/extensions/front/kaldi/add_permute_after_convolution_test.py new file mode 100644 index 0000000..c166fb9 --- /dev/null +++ b/model-optimizer/extensions/front/kaldi/add_permute_after_convolution_test.py @@ -0,0 +1,75 @@ +""" + Copyright (c) 2018-2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +import unittest + +import numpy as np + +from extensions.front.kaldi.add_permute_after_convolution import ReplaceConvolutionPermute +from mo.graph.graph import Node +from mo.utils.unittest.graph import build_graph + + +class ReplaceConvolutionPermuteTests(unittest.TestCase): + nodes_attributes = { + 'conv': {'kind': 'op', 'op': 'Convolution'}, + 'reshape_conv': {'kind': 'op', 'op': 'Reshape'}, + 'reshape_pool': {'kind': 'op', 'op': 'Reshape'}, + 'pool': {'kind': 'op', 'op': 'Pooling'}, + 'reshape_after_pool': {'kind': 'op', 'op': 'Reshape'}, + 'act': {'kind': 'op', 'op': 'Activation'}, + 'fc': {'kind': 'op', 'op': 'FullyConnected'}, + 'scale_shift': {'kind': 'op', 'op': 'ScaleShift'} + } + + def test_simple_convolution(self): + graph = build_graph(self.nodes_attributes, [ + ('conv', 'reshape_conv'), + ('reshape_conv', 'scale_shift'), + ]) + ReplaceConvolutionPermute().find_and_replace_pattern(graph) + conv_node = Node(graph, graph.nodes['conv']['name']) + permute = conv_node.out_node() + self.assertEqual(permute.op, 'Permute') + self.assertTrue(np.array_equal(permute.order, np.array([0, 3, 2, 1]))) + + def test_conv_pool(self): + graph = build_graph(self.nodes_attributes, [ + ('conv', 'reshape_conv'), + ('reshape_conv', 'reshape_pool'), + ('reshape_pool', 'pool'), + ('pool', 'reshape_after_pool'), + ('reshape_after_pool', 'fc'), + ]) + ReplaceConvolutionPermute().find_and_replace_pattern(graph) + pool_node = Node(graph, graph.nodes['pool']['name']) + permute = pool_node.out_node() + self.assertEqual(permute.op, 'Permute') + self.assertTrue(np.array_equal(permute.order, np.array([0, 3, 2, 1]))) + + def test_conv_act_pool(self): + graph = build_graph(self.nodes_attributes, [ + ('conv', 'reshape_conv'), + ('reshape_conv', 'act'), + ('act', 'reshape_pool'), + ('reshape_pool', 'pool'), + ('pool', 'reshape_after_pool'), + ('reshape_after_pool', 'fc'), + ]) + ReplaceConvolutionPermute().find_and_replace_pattern(graph) + pool_node = Node(graph, graph.nodes['pool']['name']) + permute = pool_node.out_node() + self.assertEqual(permute.op, 'Permute') + self.assertTrue(np.array_equal(permute.order, np.array([0, 3, 2, 1]))) diff --git a/model-optimizer/extensions/front/kaldi/add_reshape_around_convolution.py b/model-optimizer/extensions/front/kaldi/add_reshape_around_convolution.py index 02c0e0f..2900da4 100644 --- a/model-optimizer/extensions/front/kaldi/add_reshape_around_convolution.py +++ b/model-optimizer/extensions/front/kaldi/add_reshape_around_convolution.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,10 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
""" -import networkx as nx from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.convolution import Convolution from mo.ops.reshape import Reshape @@ -38,7 +37,7 @@ class ReplaceConvolutionReshape(FrontReplacementOp): op = "Convolution" enabled = True - def replace_op(self, graph: nx.MultiDiGraph, node: Node): + def replace_op(self, graph: Graph, node: Node): input_node = node.in_node(0) port = graph.get_edge_data(input_node.id, node.id)[0]['out'] input_reshape_node = Reshape(graph, diff --git a/model-optimizer/extensions/front/kaldi/add_reshape_around_pooling.py b/model-optimizer/extensions/front/kaldi/add_reshape_around_pooling.py index b7326ad..f17a8ae 100644 --- a/model-optimizer/extensions/front/kaldi/add_reshape_around_pooling.py +++ b/model-optimizer/extensions/front/kaldi/add_reshape_around_pooling.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,10 +14,8 @@ limitations under the License. """ -import networkx as nx - from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.pooling import Pooling from mo.ops.reshape import Reshape @@ -39,7 +37,7 @@ class ReplacePoolingReshape(FrontReplacementOp): op = "Pooling" enabled = True - def replace_op(self, graph: nx.MultiDiGraph, node: Node) -> list: + def replace_op(self, graph: Graph, node: Node) -> list: input_node = node.in_node(0) input_reshape_node = Reshape(graph, @@ -48,7 +46,7 @@ class ReplacePoolingReshape(FrontReplacementOp): 'infer': Reshape.kaldi_infer }).create_node([input_node]) - pooling_node = Pooling(graph, graph.nodes[node.id]).create_node([input_reshape_node]) + pooling_node = Pooling(graph, graph.node[node.id]).create_node([input_reshape_node]) output_reshape_node = Reshape(graph, { diff --git a/model-optimizer/extensions/front/kaldi/eliminate_redundant_reshape.py b/model-optimizer/extensions/front/kaldi/eliminate_redundant_reshape.py index a5c9a8c..64ae694 100644 --- a/model-optimizer/extensions/front/kaldi/eliminate_redundant_reshape.py +++ b/model-optimizer/extensions/front/kaldi/eliminate_redundant_reshape.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,11 +14,11 @@ limitations under the License. 
""" -import networkx as nx import numpy as np from extensions.front.kaldi.fuse_repeated_reshape import FuseRepeatedReshapes from mo.front.common.replacement import FrontReplacementPattern +from mo.graph.graph import Graph from mo.middle.passes.eliminate import remove_op_node_with_data_node @@ -40,7 +40,7 @@ class EliminateRedundantReshape(FrontReplacementPattern): ) @staticmethod - def replace_pattern(graph: nx.MultiDiGraph, match: dict): + def replace_pattern(graph: Graph, match: dict): reshape_node = match['reshape'] in_node = reshape_node.in_node() out_node = reshape_node.out_node() diff --git a/model-optimizer/extensions/front/kaldi/fuse_repeated_reshape.py b/model-optimizer/extensions/front/kaldi/fuse_repeated_reshape.py index 9a8a984..639240e 100644 --- a/model-optimizer/extensions/front/kaldi/fuse_repeated_reshape.py +++ b/model-optimizer/extensions/front/kaldi/fuse_repeated_reshape.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,9 +14,8 @@ limitations under the License. """ -import networkx as nx - from mo.front.common.replacement import FrontReplacementPattern +from mo.graph.graph import Graph from mo.middle.passes.eliminate import remove_op_node_with_data_node @@ -38,7 +37,7 @@ class FuseRepeatedReshapes(FrontReplacementPattern): ) @staticmethod - def replace_pattern(graph: nx.MultiDiGraph, match: dict): + def replace_pattern(graph: Graph, match: dict): node = match['reshape_1'] if (node.has_valid('type') and node.type == 'Reshape' and len(node.out_nodes()) == 1 and node.out_node().has_valid('kind') and node.out_node().kind == 'data' and diff --git a/model-optimizer/extensions/front/kaldi/replace_lstm_node_pattern.py b/model-optimizer/extensions/front/kaldi/replace_lstm_node_pattern.py index bfba4c4..b846a4c 100644 --- a/model-optimizer/extensions/front/kaldi/replace_lstm_node_pattern.py +++ b/model-optimizer/extensions/front/kaldi/replace_lstm_node_pattern.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -15,11 +15,9 @@ """ import numpy as np -import networkx as nx - from mo.front.caffe.extractors.utils import embed_input from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.activation import Activation from mo.ops.clamp import Clamp from mo.ops.eltwise import Eltwise @@ -50,7 +48,15 @@ class ReplaceLSTMNodePattern(FrontReplacementOp): op = "LSTMCell" enabled = True - def replace_op(self, graph: nx.MultiDiGraph, node: Node): + # we need to rewrite this transform to fit the unified pipeline (it should be part of the traditional FRONT phase) + def run_before(self): + from extensions.front.output_cut import OutputCut + return [OutputCut] + + def run_after(self): + return [] + + def replace_op(self, graph: Graph, node: Node): input_node = node.in_node() memory_pair_input = unique_id('id') @@ -102,7 +108,8 @@ class ReplaceLSTMNodePattern(FrontReplacementOp): # |____(4)Eltwise(sum) split_joined_input = Split(graph, {'name': 'join_input_split', 'axis': 1, - 'num_split': 4 + 'num_split': 4, + 'out_ports_count': 4, }).create_node([join_input_prev_state_sum]) prev_lstm_state = Memory(graph, {'name': 'prev_memory_state', diff --git a/model-optimizer/extensions/front/kaldi/replace_splice_node_pattern.py b/model-optimizer/extensions/front/kaldi/replace_splice_node_pattern.py index 360a225..9c14e2c 100644 --- a/model-optimizer/extensions/front/kaldi/replace_splice_node_pattern.py +++ b/model-optimizer/extensions/front/kaldi/replace_splice_node_pattern.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,12 +15,10 @@ """ import numpy as np -import networkx as nx - from extensions.front.kaldi.replace_lstm_node_pattern import unique_id from mo.front.common.partial_infer.utils import int64_array from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.concat import Concat from mo.ops.crop import Crop from mo.ops.memory import Memory @@ -49,7 +47,7 @@ class ReplaceSpliceNodePattern(FrontReplacementOp): op = "Splice" enabled = True - def replace_op(self, graph: nx.MultiDiGraph, node: Node): + def replace_op(self, graph: Graph, node: Node): input_node = node.in_nodes()[0] memory_pair_id = unique_id('id') # Memory(in) @@ -72,6 +70,7 @@ class ReplaceSpliceNodePattern(FrontReplacementOp): # Concat # Input / concat_node = Concat(graph, {'name': 'Splice_Concat', + 'in_ports_count': 2, 'axis': 1}).create_node([crop, input_node]) # Concat -> Memory(out) diff --git a/model-optimizer/extensions/front/kaldi/replace_splice_node_pattern_test.py b/model-optimizer/extensions/front/kaldi/replace_splice_node_pattern_test.py index f967f4b..88e630c 100644 --- a/model-optimizer/extensions/front/kaldi/replace_splice_node_pattern_test.py +++ b/model-optimizer/extensions/front/kaldi/replace_splice_node_pattern_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
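The ReplaceLSTMNodePattern hunk above is the clearest example of the new explicit pass-ordering idiom in this patch: instead of relying on its default position in the pipeline, a front transform now pins itself with run_before()/run_after(). A minimal sketch of the idiom, assuming only the APIs visible in these hunks (MyFrontTransform and 'MyOp' are hypothetical names, not part of the patch):

from mo.front.common.replacement import FrontReplacementOp
from mo.graph.graph import Graph, Node


class MyFrontTransform(FrontReplacementOp):
    op = 'MyOp'       # hypothetical op type this transform rewrites
    enabled = True

    def run_before(self):
        # run before outputs are cut, as ReplaceLSTMNodePattern does above
        from extensions.front.output_cut import OutputCut
        return [OutputCut]

    def run_after(self):
        return []     # no required predecessors

    def replace_op(self, graph: Graph, node: Node):
        # a real transform would build replacement ops here;
        # returning the node id keeps the subgraph unchanged
        return [node.id]

The 'out_ports_count' and 'in_ports_count' attributes added to Split and Concat in the same hunks follow a related theme: ops now declare their port counts up front, apparently so the port-based connection API used later in this patch can operate on them.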
diff --git a/model-optimizer/extensions/front/mxnet/RNN_ext.py b/model-optimizer/extensions/front/mxnet/RNN_ext.py index 1ae8e31..9842838 100644 --- a/model-optimizer/extensions/front/mxnet/RNN_ext.py +++ b/model-optimizer/extensions/front/mxnet/RNN_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,10 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. """ - -from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs +from extensions.ops.GRU import GRU +from extensions.ops.LSTM import LSTM +from extensions.ops.RNN import RNN from mo.front.extractor import FrontExtractorOp -from extensions.ops.lstm_sequence import LSTMSequence +from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs from mo.utils.error import Error from mo.utils.utils import refer_to_faq_msg @@ -32,31 +33,40 @@ class RNNFrontExtractor(FrontExtractorOp): state_size = attrs.int('state_size', None) bidirectional = attrs.bool('bidirectional', False) num_layers = attrs.int('num_layers', 1) + layout = attrs.str('layout', 'TNC') # by default MXNet RNN takes data + # in [seq_len, batch_size, inp_size] format node_attrs = { - 'batch_dim': 1, - 'sequence_dim': 0, + 'batch_dim': layout.index('N'), + 'sequence_dim': layout.index('T'), 'blobs_wrb': False, 'hidden_size': state_size, 'has_num_directions': bidirectional, + 'direction': 'bidirectional' if bidirectional else 'forward', + 'num_layers': num_layers, 'format': 'mxnet', + 'multilayers': num_layers != 1, + 'gate_order': None, } - if bidirectional: - raise Error( - "Operation RNN with bidirectional not supported. num_directions = 1 is supported only " + - refer_to_faq_msg(86)) - - if num_layers > 1: - raise Error( - "Operation RNN with num_layers more then one not supported. num_layers = 1 is supported only " + - refer_to_faq_msg(86)) - - if mode == 'lstm': - LSTMSequence.update_node_stat(node, node_attrs) + if mode == 'rnn_tanh': + node_attrs['gate_order'] = [0] + node_attrs['activations'] = ['tanh'] + RNN.update_node_stat(node, node_attrs) + elif mode == 'rnn_relu': + node_attrs['gate_order'] = [0] + node_attrs['activations'] = ['relu'] + RNN.update_node_stat(node, node_attrs) + elif mode == 'gru': + node_attrs['gate_order'] = [1, 0, 2] + node_attrs['linear_before_reset'] = 1 + GRU.update_node_stat(node, node_attrs) + elif mode == 'lstm': + node_attrs['gate_order'] = [1, 0, 2, 3] + LSTM.update_node_stat(node, node_attrs) else: raise Error( - "Operation RNN with mode '{}' not supported. Please register RNN as custom op. " + + "Operation RNN with mode '{}' not supported." + refer_to_faq_msg(86), mode) return __class__.enabled diff --git a/model-optimizer/extensions/front/mxnet/RNN_ext_test.py b/model-optimizer/extensions/front/mxnet/RNN_ext_test.py new file mode 100644 index 0000000..41ee5b3 --- /dev/null +++ b/model-optimizer/extensions/front/mxnet/RNN_ext_test.py @@ -0,0 +1,99 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import unittest + +import numpy as np + +from extensions.front.mxnet.RNN_ext import RNNFrontExtractor +from mo.utils.error import Error +from mo.utils.unittest.extractors import PB + + +class RNNFrontExtractorTest(unittest.TestCase): + @staticmethod + def _create_node(**attrs): + params = {'attrs': { + **attrs + }} + node = PB({'symbol_dict': params}) + return node + + base_attrs = { + 'batch_dim': 1, + 'sequence_dim': 0, + 'blobs_wrb': False, + 'format': 'mxnet', + 'gate_order': [1, 0, 2, 3], + } + + def test_base_attrs(self): + attrs = { + 'state_size': 128, + 'mode': 'lstm', + } + + additional_attrs = { + 'multilayers': False, + 'hidden_size': 128, + 'has_num_directions': False, + 'direction': 'forward', + 'num_layers': 1, + } + + node = self._create_node(**attrs) + RNNFrontExtractor.extract(node) + + expect_attrs = {**self.base_attrs, **additional_attrs} + + for key in expect_attrs.keys(): + equal = np.all(np.equal(node[key], expect_attrs[key], dtype=object)) + self.assertTrue(equal, 'Values for attr {} are not equal'.format(key)) + + self.assertTrue(node.op == 'LSTM') + + def test_unsupported_mode(self): + attrs = { + 'state_size': 128, + 'mode': 'abracadabra', + } + node = self._create_node(**attrs) + with self.assertRaises(Error): + RNNFrontExtractor.extract(node) + + def test_additional_attrs(self): + attrs = { + 'state_size': 128, + 'mode': 'lstm', + 'bidirectional': True, + 'num_layers': 2, + } + + additional_attrs = { + 'multilayers': True, + 'hidden_size': 128, + 'has_num_directions': True, + 'direction': 'bidirectional', + 'num_layers': 2, + } + + node = self._create_node(**attrs) + RNNFrontExtractor.extract(node) + + expect_attrs = {**self.base_attrs, **additional_attrs} + + for key in expect_attrs.keys(): + equal = np.all(np.equal(node[key], expect_attrs[key], dtype=object)) + self.assertTrue(equal, 'Values for attr {} are not equal'.format(key)) \ No newline at end of file diff --git a/model-optimizer/extensions/front/mxnet/add_input_data_to_prior_boxes.py b/model-optimizer/extensions/front/mxnet/add_input_data_to_prior_boxes.py new file mode 100644 index 0000000..eb6f9e5 --- /dev/null +++ b/model-optimizer/extensions/front/mxnet/add_input_data_to_prior_boxes.py @@ -0,0 +1,62 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +from mo.front.common.replacement import FrontReplacementPattern +from mo.graph.graph import Graph, Node + + +class AddInputDataToPriorBoxes(FrontReplacementPattern): + enabled = True + + def run_before(self): + from extensions.front.create_tensor_nodes import CreateTensorNodes + return [CreateTensorNodes] + + def run_after(self): + from extensions.front.pass_separator import FrontFinish + return [FrontFinish] + + @staticmethod + def add_input_data_to_prior_boxes(graph: Graph, input_names: str = ''): + """ + PriorBox layer has data input unlike mxnet. + Need to add data input to _contrib_MultiBoxPrior for + for correct conversion to PriorBox layer. + + Parameters + ---------- + graph : Graph + Graph with loaded model. + """ + if not input_names: + input_names = ('data',) + else: + input_names = input_names.split(',') + + input_nodes = {} + for node in graph.nodes(): + node = Node(graph, node) + if node.has_valid('op') and node.name in input_names: + input_nodes.update({node.id: node}) + + if len(input_nodes) > 0: + for node in graph.nodes(): + node = Node(graph, node) + if node.has_valid('op') and node.op == '_contrib_MultiBoxPrior': + node.add_input_port(idx=1) + graph.create_edge(list(input_nodes.values())[0], node, out_port=0, in_port=1) + + def find_and_replace_pattern(self, graph: Graph): + self.add_input_data_to_prior_boxes(graph, graph.graph['cmd_params'].input) diff --git a/model-optimizer/mo/pipeline/mx_test.py b/model-optimizer/extensions/front/mxnet/add_input_data_to_prior_boxes_test.py similarity index 85% rename from model-optimizer/mo/pipeline/mx_test.py rename to model-optimizer/extensions/front/mxnet/add_input_data_to_prior_boxes_test.py index 66e49bb..9f0c9cf 100644 --- a/model-optimizer/mo/pipeline/mx_test.py +++ b/model-optimizer/extensions/front/mxnet/add_input_data_to_prior_boxes_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -17,9 +17,10 @@ import unittest import numpy as np +from argparse import Namespace from mo.graph.graph import Node -from mo.pipeline.mx import add_input_data_to_prior_boxes +from extensions.front.mxnet.add_input_data_to_prior_boxes import AddInputDataToPriorBoxes from mo.utils.unittest.graph import build_graph @@ -37,7 +38,8 @@ class TestMxnetPipeline(unittest.TestCase): 'node_2': {'shape': np.array([1, 3, 10, 10])}, }) - add_input_data_to_prior_boxes(graph) + graph.graph['cmd_params'] = Namespace(input=None) + AddInputDataToPriorBoxes().find_and_replace_pattern(graph) node_multi_box = Node(graph, 'node_multi_box') node_input1 = node_multi_box.in_node(0) @@ -58,7 +60,8 @@ class TestMxnetPipeline(unittest.TestCase): 'node_2': {'shape': np.array([1, 3, 10, 10])}, }) - add_input_data_to_prior_boxes(graph, 'node_1') + graph.graph['cmd_params'] = Namespace(input='node_1') + AddInputDataToPriorBoxes().find_and_replace_pattern(graph) node_multi_box = Node(graph, 'node_multi_box') node_input1 = node_multi_box.in_node(0) diff --git a/model-optimizer/extensions/front/mxnet/add_n_ext.py b/model-optimizer/extensions/front/mxnet/add_n_ext.py index 5577983..083e3c4 100644 --- a/model-optimizer/extensions/front/mxnet/add_n_ext.py +++ b/model-optimizer/extensions/front/mxnet/add_n_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/block_grad_ext.py b/model-optimizer/extensions/front/mxnet/block_grad_ext.py index 0d5946e..1cdda0f 100644 --- a/model-optimizer/extensions/front/mxnet/block_grad_ext.py +++ b/model-optimizer/extensions/front/mxnet/block_grad_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/broadcast_mul.py b/model-optimizer/extensions/front/mxnet/broadcast_mul.py index 8f1e064..9b861cb 100644 --- a/model-optimizer/extensions/front/mxnet/broadcast_mul.py +++ b/model-optimizer/extensions/front/mxnet/broadcast_mul.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -17,8 +17,7 @@ import networkx as nx from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import replace_node -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.lin_op import Mul @@ -26,8 +25,8 @@ class BroadcastMulFrontReplacer(FrontReplacementOp): op = 'broadcast_mul' enabled = True - def replace_op(self, graph: nx.MultiDiGraph, node: Node): + def replace_op(self, graph: Graph, node: Node): mul_op = Mul(graph, dict(name=node.id + '/mul_', symbol_dict={'name': node.id + '/mul_'})) mul_node = mul_op.create_node(inputs=[node.in_node(0), node.in_node(1)]) - replace_node(node, mul_node) + node.replace_node(mul_node) return [mul_node.id] diff --git a/model-optimizer/extensions/front/mxnet/broadcast_mul_ext.py b/model-optimizer/extensions/front/mxnet/broadcast_mul_ext.py index 7fd99ee..a37b5be 100644 --- a/model-optimizer/extensions/front/mxnet/broadcast_mul_ext.py +++ b/model-optimizer/extensions/front/mxnet/broadcast_mul_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/check_softmax_node_inputs.py b/model-optimizer/extensions/front/mxnet/check_softmax_node_inputs.py index 1e740ad..a8b6fa0 100644 --- a/model-optimizer/extensions/front/mxnet/check_softmax_node_inputs.py +++ b/model-optimizer/extensions/front/mxnet/check_softmax_node_inputs.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,34 +14,39 @@ limitations under the License. """ -import networkx as nx from mo.front.common.replacement import FrontReplacementPattern +from mo.graph.graph import Graph class CheckSoftmaxNodeInputs(FrontReplacementPattern): - enabled = True + def run_before(self): + from extensions.front.user_data_repack import UserDataRepack + return [UserDataRepack] + + def run_after(self): + return [] + @staticmethod def pattern(): return dict( nodes=[ - ('softmax', dict(op='SoftmaxOutput')) + ('softmax', dict(op=lambda op: op in ['SoftMax', 'SoftmaxActivation', 'SoftmaxOutput'])) ], edges=[]) @staticmethod - def replace_pattern(graph: nx.MultiDiGraph, match: dict): + def replace_pattern(graph: Graph, match: dict): """ Need to remove from softmax layer all unused inputs Parameters ---------- - graph : nx.MultiDiGraph + graph : Graph Graph with loaded model. match : dict Patterns which were found in graph structure. """ - softmax_node = match['softmax'] softmax_nodes_len = len(softmax_node.in_nodes()) for i in reversed(range(1, softmax_nodes_len)): diff --git a/model-optimizer/extensions/front/mxnet/check_softmax_node_inputs_test.py b/model-optimizer/extensions/front/mxnet/check_softmax_node_inputs_test.py index ea7da2a..2e2dc20 100644 --- a/model-optimizer/extensions/front/mxnet/check_softmax_node_inputs_test.py +++ b/model-optimizer/extensions/front/mxnet/check_softmax_node_inputs_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
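CheckSoftmaxNodeInputs above now matches all three softmax variants and strips every input of the matched node except the first. A condensed sketch of that pruning step, assuming only the Node API used throughout this patch (the helper name is ours):

from mo.graph.graph import Node


def prune_extra_inputs(softmax_node: Node):
    # walk the in-ports backwards so the remaining indices stay valid while removing
    for i in reversed(range(1, len(softmax_node.in_nodes()))):
        in_node = softmax_node.in_node(i)
        softmax_node.graph.remove_edge(in_node.id, softmax_node.id)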
diff --git a/model-optimizer/extensions/front/mxnet/conv_ext.py b/model-optimizer/extensions/front/mxnet/conv_ext.py index 6463bfb..1792ff8 100644 --- a/model-optimizer/extensions/front/mxnet/conv_ext.py +++ b/model-optimizer/extensions/front/mxnet/conv_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/conv_ext_test.py b/model-optimizer/extensions/front/mxnet/conv_ext_test.py index ee68688..2a75fce 100644 --- a/model-optimizer/extensions/front/mxnet/conv_ext_test.py +++ b/model-optimizer/extensions/front/mxnet/conv_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/copy_ext.py b/model-optimizer/extensions/front/mxnet/copy_ext.py index cc06a54..0a1fa31 100644 --- a/model-optimizer/extensions/front/mxnet/copy_ext.py +++ b/model-optimizer/extensions/front/mxnet/copy_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/custom.py b/model-optimizer/extensions/front/mxnet/custom.py index 33436c5..f084075 100644 --- a/model-optimizer/extensions/front/mxnet/custom.py +++ b/model-optimizer/extensions/front/mxnet/custom.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/custom_test.py b/model-optimizer/extensions/front/mxnet/custom_test.py index 3d698cf..36bd32a 100644 --- a/model-optimizer/extensions/front/mxnet/custom_test.py +++ b/model-optimizer/extensions/front/mxnet/custom_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/dropout_ext.py b/model-optimizer/extensions/front/mxnet/dropout_ext.py index ee16973..4210498 100644 --- a/model-optimizer/extensions/front/mxnet/dropout_ext.py +++ b/model-optimizer/extensions/front/mxnet/dropout_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/element_wise_sum_ext.py b/model-optimizer/extensions/front/mxnet/element_wise_sum_ext.py index 8ad2e20..ac826d3 100644 --- a/model-optimizer/extensions/front/mxnet/element_wise_sum_ext.py +++ b/model-optimizer/extensions/front/mxnet/element_wise_sum_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/mxnet/exp_ext.py b/model-optimizer/extensions/front/mxnet/exp_ext.py new file mode 100644 index 0000000..05e84a2 --- /dev/null +++ b/model-optimizer/extensions/front/mxnet/exp_ext.py @@ -0,0 +1,28 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +from mo.front.extractor import FrontExtractorOp +from mo.ops.activation import Activation + + +class ExpExtractor(FrontExtractorOp): + op = 'exp' + enabled = True + + @staticmethod + def extract(node): + Activation.update_node_stat(node, {'operation': 'exp'}) + return __class__.enabled diff --git a/model-optimizer/extensions/front/mxnet/flatten_ext.py b/model-optimizer/extensions/front/mxnet/flatten_ext.py index f0c3469..6b02cad 100644 --- a/model-optimizer/extensions/front/mxnet/flatten_ext.py +++ b/model-optimizer/extensions/front/mxnet/flatten_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/gather.py b/model-optimizer/extensions/front/mxnet/gather.py new file mode 100644 index 0000000..c94c332 --- /dev/null +++ b/model-optimizer/extensions/front/mxnet/gather.py @@ -0,0 +1,33 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +from mo.front.common.replacement import FrontReplacementOp +from mo.graph.graph import Node, Graph +from extensions.ops.gather import Gather + + +class GatherFrontReplacer(FrontReplacementOp): + op = 'Embedding' + enabled = True + + def replace_sub_graph(self, graph: Graph, match: dict): + node = match['op'] + gather_node = Gather(graph, dict(name=node.id + '/embedding_', + axis=0, + symbol_dict={'name': node.id + '/embedding_'})).create_node() + node.in_port(0).get_connection().set_destination(gather_node.in_port(1)) + node.in_port(1).get_connection().set_destination(gather_node.in_port(0)) + node.out_port(0).get_connection().set_source(gather_node.out_port(0)) diff --git a/model-optimizer/mo/front/tf/extractors/shape.py b/model-optimizer/extensions/front/mxnet/gather_ext.py similarity index 64% rename from model-optimizer/mo/front/tf/extractors/shape.py rename to model-optimizer/extensions/front/mxnet/gather_ext.py index e174bc9..62e2f96 100644 --- a/model-optimizer/mo/front/tf/extractors/shape.py +++ b/model-optimizer/extensions/front/mxnet/gather_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,14 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. """ -import numpy as np -from mo.front.tf.extractors.utils import tf_dtype_extractor -from mo.ops.shape import Shape +from mo.front.extractor import FrontExtractorOp -def tf_shape_ext(pb): - return { - 'infer': Shape.infer, - 'data_type': tf_dtype_extractor(pb.attr['out_type'].type, np.int32) - } +class GatherFrontExtractor(FrontExtractorOp): + op = 'Embedding' + enabled = True + + @staticmethod + def extract(node): + return __class__.enabled diff --git a/model-optimizer/extensions/front/mxnet/gather_test.py b/model-optimizer/extensions/front/mxnet/gather_test.py new file mode 100644 index 0000000..0056950 --- /dev/null +++ b/model-optimizer/extensions/front/mxnet/gather_test.py @@ -0,0 +1,64 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +import unittest + +import numpy as np + +from extensions.front.mxnet.gather import GatherFrontReplacer +from mo.utils.unittest.graph import build_graph, compare_graphs +from mo.graph.graph import Node + + +class GatherTest(unittest.TestCase): + def test_embedding_replace1(self): + graph = build_graph({'placeholder_1': {'shape': None, 'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, + 'embedding_const': {'value': None, 'shape': None, 'kind': 'op', 'data_type': None, 'type': 'Const', 'op': 'Const'}, + 'embedding': {'type': None, 'kind': 'op', 'op': 'Embedding'}, + 'last': {'type': None, 'kind': 'op', 'op': None}, + }, + [('placeholder_1', 'embedding', {'out': 0, 'in': 0}), + ('embedding_const', 'embedding', {'out': 0, 'in': 1}), + ('embedding', 'last') + ], + {'placeholder_1': {'shape': np.array([32,35])}, + 'embedding_const': {'shape': np.array([2000, 650]), + 'bias': np.array(np.random.random_integers(0, 225, (2000, 650)))}, + }, nodes_with_edges_only=True) + + graph_ref = build_graph({'placeholder_1': {'shape': None, 'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, + 'embedding_const': {'value': None, 'shape': None, 'kind': 'op', 'data_type': None, 'type': 'Const', 'op': 'Const'}, + 'embedding': {'type': None, 'kind': 'op', 'op': 'Gather'}, + 'last': {'type': None, 'kind': 'op', 'op': None}, + }, + [ + ('embedding_const', 'embedding'), + ('placeholder_1', 'embedding'), + ('embedding', 'last') + ], + {'placeholder_1': {'shape': np.array([32,35])}, + 'embedding_const': {'shape': np.array([2000, 650]), + 'bias': np.array(np.random.random_integers(0, 225, (2000, 650)))}, + }, nodes_with_edges_only=True) + + graph.graph['layout'] = 'NCHW' + graph.stage = 'front' + + replacer = GatherFrontReplacer() + replacer.find_and_replace_pattern(graph) + + (flag, resp) = compare_graphs(graph, graph_ref, 'last') + self.assertTrue(flag, resp) diff --git a/model-optimizer/extensions/front/mxnet/instance_norm_ext.py b/model-optimizer/extensions/front/mxnet/instance_norm_ext.py index 26fe674..3a8a1d1 100644 --- a/model-optimizer/extensions/front/mxnet/instance_norm_ext.py +++ b/model-optimizer/extensions/front/mxnet/instance_norm_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/max_ext.py b/model-optimizer/extensions/front/mxnet/max_ext.py index 3db428c..4af1468 100644 --- a/model-optimizer/extensions/front/mxnet/max_ext.py +++ b/model-optimizer/extensions/front/mxnet/max_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ limitations under the License. 
""" +import numpy as np + from mo.front.extractor import FrontExtractorOp from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs from mo.ops.reduce import Reduce @@ -27,7 +29,7 @@ class MaxFrontExtractor(FrontExtractorOp): def extract(node): attrs = get_mxnet_layer_attrs(node.symbol_dict) data = { - 'axis': [attrs.int('axis', 0)], + 'axis': np.array([attrs.int('axis', 0)], dtype=np.int64), 'reduce_type': 'max', 'keep_dims': False } diff --git a/model-optimizer/extensions/front/mxnet/maximum_ext.py b/model-optimizer/extensions/front/mxnet/maximum_ext.py index 573a2dd..913e9b8 100644 --- a/model-optimizer/extensions/front/mxnet/maximum_ext.py +++ b/model-optimizer/extensions/front/mxnet/maximum_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/minimum_ext.py b/model-optimizer/extensions/front/mxnet/minimum_ext.py index fb3d094..c13fe60 100644 --- a/model-optimizer/extensions/front/mxnet/minimum_ext.py +++ b/model-optimizer/extensions/front/mxnet/minimum_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/minus_scalar.py b/model-optimizer/extensions/front/mxnet/minus_scalar.py index b190ebc..116de19 100644 --- a/model-optimizer/extensions/front/mxnet/minus_scalar.py +++ b/model-optimizer/extensions/front/mxnet/minus_scalar.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ import networkx as nx from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.lin_op import Add from mo.ops.const import Const @@ -26,7 +26,7 @@ class MinusScalarFrontReplacer(FrontReplacementOp): op = '_minus_scalar' enabled = True - def replace_op(self, graph: nx.MultiDiGraph, node: Node): + def replace_op(self, graph: Graph, node: Node): in_node = node.in_node() out_nodes = [node for node in node.out_nodes().values()] graph.remove_edge(node.in_node().id, node.id) diff --git a/model-optimizer/extensions/front/mxnet/minus_scalar_ext.py b/model-optimizer/extensions/front/mxnet/minus_scalar_ext.py index 43146fb..d748dbc 100644 --- a/model-optimizer/extensions/front/mxnet/minus_scalar_ext.py +++ b/model-optimizer/extensions/front/mxnet/minus_scalar_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/mxnet/mul_scalar.py b/model-optimizer/extensions/front/mxnet/mul_scalar.py index 24dd307..7d9d863 100644 --- a/model-optimizer/extensions/front/mxnet/mul_scalar.py +++ b/model-optimizer/extensions/front/mxnet/mul_scalar.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,10 +14,8 @@ limitations under the License. """ -import networkx as nx - from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.lin_op import Mul from mo.ops.const import Const @@ -26,7 +24,7 @@ class MulScalarFrontReplacer(FrontReplacementOp): op = '_mul_scalar' enabled = True - def replace_op(self, graph: nx.MultiDiGraph, node: Node): + def replace_op(self, graph: Graph, node: Node): in_node = node.in_node() out_nodes = [node for node in node.out_nodes().values()] graph.remove_edge(node.in_node().id, node.id) diff --git a/model-optimizer/extensions/front/mxnet/mul_scalar_ext.py b/model-optimizer/extensions/front/mxnet/mul_scalar_ext.py index fdee6e1..5c0b457 100644 --- a/model-optimizer/extensions/front/mxnet/mul_scalar_ext.py +++ b/model-optimizer/extensions/front/mxnet/mul_scalar_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/pad_ext.py b/model-optimizer/extensions/front/mxnet/pad_ext.py index a3b3c0e..cd1dad1 100644 --- a/model-optimizer/extensions/front/mxnet/pad_ext.py +++ b/model-optimizer/extensions/front/mxnet/pad_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/pooling_ext.py b/model-optimizer/extensions/front/mxnet/pooling_ext.py index 6a2452f..9710cc5 100644 --- a/model-optimizer/extensions/front/mxnet/pooling_ext.py +++ b/model-optimizer/extensions/front/mxnet/pooling_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/pooling_ext_test.py b/model-optimizer/extensions/front/mxnet/pooling_ext_test.py index 43450a8..9edd583 100644 --- a/model-optimizer/extensions/front/mxnet/pooling_ext_test.py +++ b/model-optimizer/extensions/front/mxnet/pooling_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
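The mul_scalar.py hunk above only shows the signature migration, but the overall shape of these scalar replacers is worth spelling out: the MXNet _mul_scalar node is detached from its input and rebuilt as a Mul fed by the original input plus a Const holding the scalar. A rough sketch under that assumption (the 'scalar' attribute name is our guess; the hunk does not show it):

from mo.graph.graph import Graph, Node
from mo.ops.const import Const
from mo.ops.lin_op import Mul


def replace_mul_scalar(graph: Graph, node: Node) -> list:
    in_node = node.in_node()
    graph.remove_edge(in_node.id, node.id)
    # materialize the scalar as a Const and multiply the input by it
    const_node = Const(graph, dict(value=node.scalar)).create_node()  # attr name assumed
    mul_node = Mul(graph, dict(name=node.id + '/mul_')).create_node(inputs=[in_node, const_node])
    return [mul_node.id]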
diff --git a/model-optimizer/extensions/front/mxnet/proposal_ext.py b/model-optimizer/extensions/front/mxnet/proposal_ext.py index 5e2fa80..32fe32c 100644 --- a/model-optimizer/extensions/front/mxnet/proposal_ext.py +++ b/model-optimizer/extensions/front/mxnet/proposal_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/reshape_ext.py b/model-optimizer/extensions/front/mxnet/reshape_ext.py index 32251fe..0ed3c0f 100644 --- a/model-optimizer/extensions/front/mxnet/reshape_ext.py +++ b/model-optimizer/extensions/front/mxnet/reshape_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/rnn_param_concat.py b/model-optimizer/extensions/front/mxnet/rnn_param_concat.py index 8b21e7e..fb487a4 100644 --- a/model-optimizer/extensions/front/mxnet/rnn_param_concat.py +++ b/model-optimizer/extensions/front/mxnet/rnn_param_concat.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/roi_pooling_ext.py b/model-optimizer/extensions/front/mxnet/roi_pooling_ext.py index f274c41..e17a4df 100644 --- a/model-optimizer/extensions/front/mxnet/roi_pooling_ext.py +++ b/model-optimizer/extensions/front/mxnet/roi_pooling_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/slice_channel_ext.py b/model-optimizer/extensions/front/mxnet/slice_channel_ext.py index 95b1cd8..1724332 100644 --- a/model-optimizer/extensions/front/mxnet/slice_channel_ext.py +++ b/model-optimizer/extensions/front/mxnet/slice_channel_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/slice_channel_ext_test.py b/model-optimizer/extensions/front/mxnet/slice_channel_ext_test.py index 080e871..a6e6194 100644 --- a/model-optimizer/extensions/front/mxnet/slice_channel_ext_test.py +++ b/model-optimizer/extensions/front/mxnet/slice_channel_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/mxnet/softmax.py b/model-optimizer/extensions/front/mxnet/softmax.py index 10991ea..d60c48d 100644 --- a/model-optimizer/extensions/front/mxnet/softmax.py +++ b/model-optimizer/extensions/front/mxnet/softmax.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ import numpy as np import networkx as nx +from mo.graph.graph import Graph from mo.ops.lin_op import Mul from mo.ops.const import Const from mo.front.common.replacement import FrontReplacementSubgraph @@ -33,7 +34,7 @@ class SoftmaxFrontReplacementSubgraph(FrontReplacementSubgraph): edges=[] ) - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): node = match['softmax'] if 'temperature' in node and node['temperature'] != 1.0: in_node = node.in_node() diff --git a/model-optimizer/extensions/front/mxnet/softmax_activation_ext.py b/model-optimizer/extensions/front/mxnet/softmax_activation_ext.py index 2dbb114..93438a8 100644 --- a/model-optimizer/extensions/front/mxnet/softmax_activation_ext.py +++ b/model-optimizer/extensions/front/mxnet/softmax_activation_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/softmax_ext.py b/model-optimizer/extensions/front/mxnet/softmax_ext.py index c2071da..30768fb 100644 --- a/model-optimizer/extensions/front/mxnet/softmax_ext.py +++ b/model-optimizer/extensions/front/mxnet/softmax_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/softmax_output_ext.py b/model-optimizer/extensions/front/mxnet/softmax_output_ext.py index 60a3423..728c309 100644 --- a/model-optimizer/extensions/front/mxnet/softmax_output_ext.py +++ b/model-optimizer/extensions/front/mxnet/softmax_output_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/ssd_pattern_flatten_softmax_activation.py b/model-optimizer/extensions/front/mxnet/ssd_pattern_flatten_softmax_activation.py index d26b544..7da5174 100644 --- a/model-optimizer/extensions/front/mxnet/ssd_pattern_flatten_softmax_activation.py +++ b/model-optimizer/extensions/front/mxnet/ssd_pattern_flatten_softmax_activation.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -19,7 +19,7 @@ import networkx as nx from extensions.front.mxnet.ssd_pattern_remove_flatten import SsdPatternRemoveFlatten from extensions.front.mxnet.ssd_pattern_remove_reshape import SsdPatternRemoveReshape from mo.front.common.replacement import FrontReplacementSubgraph -from mo.graph.graph import create_edge +from mo.graph.graph import Graph from mo.ops.reshape import Reshape @@ -40,7 +40,7 @@ class SsdPatternFlattenSoftmaxActivation(FrontReplacementSubgraph): ] ) - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): """ Need to find the pattern: SoftmaxActivation -> DetectionOutput DetectionOutput in IE expects flattened input from SoftMax, that is why there is the need to add @@ -48,7 +48,7 @@ class SsdPatternFlattenSoftmaxActivation(FrontReplacementSubgraph): Parameters ---------- - graph : nx.MultiDiGraph + graph : Graph Graph with loaded model. match : dict Patterns which were found in graph structure. @@ -70,4 +70,4 @@ class SsdPatternFlattenSoftmaxActivation(FrontReplacementSubgraph): new_reshape_op = Reshape(graph, {'symbol_dict': symbol_node}) new_reshape_node = new_reshape_op.create_node([softmax_activation]) new_reshape_node['dim'] = [0, -1] - create_edge(new_reshape_node, multi_box_detection, in_port=in_port, out_port=out_port) + graph.create_edge(new_reshape_node, multi_box_detection, in_port=in_port, out_port=out_port) diff --git a/model-optimizer/extensions/front/mxnet/ssd_pattern_flatten_softmax_activation_test.py b/model-optimizer/extensions/front/mxnet/ssd_pattern_flatten_softmax_activation_test.py index 7c9bc9e..fe78beb 100644 --- a/model-optimizer/extensions/front/mxnet/ssd_pattern_flatten_softmax_activation_test.py +++ b/model-optimizer/extensions/front/mxnet/ssd_pattern_flatten_softmax_activation_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -42,4 +42,4 @@ class TestSsdPatternFlattenSoftmaxActivation(unittest.TestCase): pattern.find_and_replace_pattern(graph) flatten_name = list(graph.nodes())[-1] self.assertTrue(graph.has_node(flatten_name)) - self.assertFalse(graph.has_edge(Node(graph, 'softmax_activation').id, Node(graph, 'multi_box_detection').id)) + self.assertFalse(graph.has_edge(Node(graph, 'node_softmax_activation').id, Node(graph, 'node_multi_box_detection').id)) diff --git a/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_flatten.py b/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_flatten.py index 5686dc2..6ff1a71 100644 --- a/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_flatten.py +++ b/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_flatten.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -17,8 +17,8 @@ import networkx as nx from extensions.front.mxnet.ssd_pattern_remove_reshape import SsdPatternRemoveReshape +from mo.graph.graph import Graph from mo.front.common.replacement import FrontReplacementSubgraph -from mo.graph.graph import erase_node class SsdPatternRemoveFlatten(FrontReplacementSubgraph): @@ -38,16 +38,16 @@ class SsdPatternRemoveFlatten(FrontReplacementSubgraph): ] ) - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): """ Need to find each occurrence of pattern: _contrib_MultiBoxPrior -> Flatten remove Flatten layer - IE does not expect outputs to be flattened Parameters ---------- - graph : nx.MultiDiGraph + graph : Graph Graph with loaded model. match : dict Patterns which were found in graph structure. """ - erase_node(match['flatten']) + graph.erase_node(match['flatten']) diff --git a/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_flatten_test.py b/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_flatten_test.py index dfd5708..c9cef98 100644 --- a/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_flatten_test.py +++ b/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_flatten_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_reshape.py b/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_reshape.py index cf12e19..6c8e746 100644 --- a/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_reshape.py +++ b/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_reshape.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,9 +16,9 @@ import networkx as nx +from mo.graph.graph import Graph from mo.front.common.replacement import FrontReplacementSubgraph from mo.front.mxnet.extractors.utils import get_json_layer_attrs -from mo.graph.graph import erase_node class SsdPatternRemoveReshape(FrontReplacementSubgraph): @@ -37,19 +37,19 @@ class SsdPatternRemoveReshape(FrontReplacementSubgraph): ] ) - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): """ Need to find each occurrence of pattern: _contrib_MultiBoxPrior(s) -> Concat -> Reshape remove Reshape layer - IE does not expect outputs from concatenation of _contrib_MultiBoxPrior to be reshaped Parameters ---------- - graph : nx.MultiDiGraph + graph : Graph Graph with loaded model. match : dict Patterns which were found in graph structure. 
""" - erase_node(match['reshape']) + graph.erase_node(match['reshape']) # concat should be performed for the third axis concat_node = match['concat'] diff --git a/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_reshape_test.py b/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_reshape_test.py index 40a7649..a72620a 100644 --- a/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_reshape_test.py +++ b/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_reshape_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_transpose.py b/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_transpose.py index a3af10c..70627c1 100644 --- a/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_transpose.py +++ b/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_transpose.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,7 +20,7 @@ from extensions.front.mxnet.ssd_pattern_flatten_softmax_activation import SsdPat from extensions.front.mxnet.ssd_pattern_remove_flatten import SsdPatternRemoveFlatten from extensions.front.mxnet.ssd_pattern_remove_reshape import SsdPatternRemoveReshape from mo.front.common.replacement import FrontReplacementSubgraph -from mo.graph.graph import create_edge +from mo.graph.graph import Graph class SsdPatternRemoveTranspose(FrontReplacementSubgraph): @@ -42,7 +42,7 @@ class SsdPatternRemoveTranspose(FrontReplacementSubgraph): ] ) - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): """ Need to find each occurrence of pattern: transpose -> SoftmaxActivation -> _contrib_MultiBoxDetection @@ -52,7 +52,7 @@ class SsdPatternRemoveTranspose(FrontReplacementSubgraph): Parameters ---------- - graph : nx.MultiDiGraph + graph : Graph Graph with loaded model. match : dict Patterns which were found in graph structure. @@ -64,4 +64,4 @@ class SsdPatternRemoveTranspose(FrontReplacementSubgraph): graph.remove_edge(transpose_in_node.id, transpose_node.id) graph.remove_edge(transpose_node.id, softmax_activation.id) graph.remove_node(transpose_node.id) - create_edge(transpose_in_node, softmax_activation) + graph.create_edge(transpose_in_node, softmax_activation) diff --git a/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_transpose_test.py b/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_transpose_test.py index 576e2f9..38bcd15 100644 --- a/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_transpose_test.py +++ b/model-optimizer/extensions/front/mxnet/ssd_pattern_remove_transpose_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/mxnet/ssd_reorder_detection_out_inputs.py b/model-optimizer/extensions/front/mxnet/ssd_reorder_detection_out_inputs.py index ce9f2cf..533a06c 100644 --- a/model-optimizer/extensions/front/mxnet/ssd_reorder_detection_out_inputs.py +++ b/model-optimizer/extensions/front/mxnet/ssd_reorder_detection_out_inputs.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ import networkx as nx -from mo.graph.graph import create_edge +from mo.graph.graph import Graph from mo.front.common.replacement import FrontReplacementPattern from extensions.front.mxnet.ssd_pattern_remove_transpose import SsdPatternRemoveTranspose from extensions.front.mxnet.ssd_pattern_flatten_softmax_activation import SsdPatternFlattenSoftmaxActivation @@ -38,7 +38,7 @@ class SsdReorderDetectionOutInputs(FrontReplacementPattern): edges=[]) @staticmethod - def replace_pattern(graph: nx.MultiDiGraph, match: dict): + def replace_pattern(graph: Graph, match: dict): """ DetectionOutput layer has another order of inputs unlike mxnet. Need to reorder _contrib_MultiBoxDetection inputs @@ -46,7 +46,7 @@ class SsdReorderDetectionOutInputs(FrontReplacementPattern): Parameters ---------- - graph : nx.MultiDiGraph + graph : Graph Graph with loaded model. """ multi_box_detection_node = match['multi_box_detection'] @@ -64,5 +64,5 @@ class SsdReorderDetectionOutInputs(FrontReplacementPattern): graph.remove_edge(conf_node.id, multi_box_detection_node.id) graph.remove_edge(loc_node.id, multi_box_detection_node.id) - create_edge(loc_node, multi_box_detection_node, in_port=conf_in_port, out_port=conf_out_port) - create_edge(conf_node, multi_box_detection_node, in_port=loc_in_port, out_port=loc_out_port) + graph.create_edge(loc_node, multi_box_detection_node, in_port=conf_in_port, out_port=conf_out_port) + graph.create_edge(conf_node, multi_box_detection_node, in_port=loc_in_port, out_port=loc_out_port) diff --git a/model-optimizer/extensions/front/mxnet/ssd_reorder_detection_out_inputs_test.py b/model-optimizer/extensions/front/mxnet/ssd_reorder_detection_out_inputs_test.py index 6ddde4c..d2beaaf 100644 --- a/model-optimizer/extensions/front/mxnet/ssd_reorder_detection_out_inputs_test.py +++ b/model-optimizer/extensions/front/mxnet/ssd_reorder_detection_out_inputs_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/stack_ext.py b/model-optimizer/extensions/front/mxnet/stack_ext.py index 6b5b79b..5c1d5d0 100644 --- a/model-optimizer/extensions/front/mxnet/stack_ext.py +++ b/model-optimizer/extensions/front/mxnet/stack_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/mxnet/swapaxes_ext.py b/model-optimizer/extensions/front/mxnet/swapaxes_ext.py index 1b34f09..2741f7b 100644 --- a/model-optimizer/extensions/front/mxnet/swapaxes_ext.py +++ b/model-optimizer/extensions/front/mxnet/swapaxes_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/up_sampling_ext.py b/model-optimizer/extensions/front/mxnet/up_sampling_ext.py index a4284b1..cc8d87c 100644 --- a/model-optimizer/extensions/front/mxnet/up_sampling_ext.py +++ b/model-optimizer/extensions/front/mxnet/up_sampling_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/mxnet/zeros_ext.py b/model-optimizer/extensions/front/mxnet/zeros_ext.py index 00923d2..5fec929 100644 --- a/model-optimizer/extensions/front/mxnet/zeros_ext.py +++ b/model-optimizer/extensions/front/mxnet/zeros_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ limitations under the License. """ +import ast import numpy as np from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs @@ -29,13 +30,16 @@ class ZerosFrontExtractor(FrontExtractorOp): def extract(node): attrs = get_mxnet_layer_attrs(node.symbol_dict) shape = list(attrs.tuple('shape', int, None)) + zero_shapes = [] for i, s in enumerate(shape): if s == 0: shape[i] = 1 + zero_shapes.append(i) update_attrs = { 'shape': np.ndarray(shape), 'value': np.zeros(shape), + 'zero_shapes': zero_shapes } # update the attributes of the node diff --git a/model-optimizer/extensions/front/no_op_eraser.py b/model-optimizer/extensions/front/no_op_eraser.py index 7d0b5c0..2c5f4e9 100644 --- a/model-optimizer/extensions/front/no_op_eraser.py +++ b/model-optimizer/extensions/front/no_op_eraser.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
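For reference, a minimal Python sketch (illustrative only, not part of the patch) of the zero-shape handling that ZerosFrontExtractor gains above: dimensions equal to 0 are replaced with 1, and their indices are remembered in the new 'zero_shapes' attribute.

shape = [0, 3, 0]
zero_shapes = []
for i, s in enumerate(shape):
    if s == 0:
        shape[i] = 1          # a zero dim becomes 1 so the blob can be materialized
        zero_shapes.append(i) # but its position is recorded for later passes
assert shape == [1, 3, 1] and zero_shapes == [0, 2]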
@@ -19,7 +19,7 @@ import logging as log import networkx as nx from mo.front.common.replacement import FrontReplacementSubgraph -from mo.graph.graph import erase_node +from mo.graph.graph import Graph class NoOpEraser(FrontReplacementSubgraph): @@ -35,7 +35,7 @@ class NoOpEraser(FrontReplacementSubgraph): ) @staticmethod - def replace_sub_graph(graph: nx.MultiDiGraph, match: dict): - erase_node(match['output']) - erase_node(match['noop']) + def replace_sub_graph(graph: Graph, match: dict): + graph.erase_node(match['output']) + graph.erase_node(match['noop']) log.info("NoOp node \"{}\" was removed from the graph".format(match['noop'].id)) diff --git a/model-optimizer/extensions/front/onnx/add_ext.py b/model-optimizer/extensions/front/onnx/add_ext.py index 42e64d0..efe59b5 100644 --- a/model-optimizer/extensions/front/onnx/add_ext.py +++ b/model-optimizer/extensions/front/onnx/add_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/affine_ext.py b/model-optimizer/extensions/front/onnx/affine_ext.py index 4067f95..237e1d8 100644 --- a/model-optimizer/extensions/front/onnx/affine_ext.py +++ b/model-optimizer/extensions/front/onnx/affine_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/affine_ext_test.py b/model-optimizer/extensions/front/onnx/affine_ext_test.py index 799e643..ea0ad60 100644 --- a/model-optimizer/extensions/front/onnx/affine_ext_test.py +++ b/model-optimizer/extensions/front/onnx/affine_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/argmax.py b/model-optimizer/extensions/front/onnx/argmax.py new file mode 100644 index 0000000..2f30704 --- /dev/null +++ b/model-optimizer/extensions/front/onnx/argmax.py @@ -0,0 +1,46 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +import networkx as nx + +from extensions.ops.argmax import ArgMaxOp +from mo.front.common.replacement import FrontReplacementSubgraph +from mo.front.onnx.extractors.utils import onnx_attr +from mo.graph.graph import Graph +from mo.ops.squeeze import Squeeze + +class Argmax(FrontReplacementSubgraph): + enabled = True + + def pattern(self): + return dict( + nodes=[('argmax', dict(op='ArgMax', keepdims=0))], + edges=[] + ) + + def replace_sub_graph(self, graph: Graph, match: dict): + """ + In ONNX ArgMax operation has keepdims attribute that indicates + whether to stay a dimension along which maximum is computed or not. + In case of keepdims=0 this dimension should be removed but ArgMax operation in IR format + is not designed to cover this case. So we should additionally add Squeeze operation + right after ArgMax for this case. + """ + argmax_node = match['argmax'] + axis = argmax_node.axis + squeeze_node = Squeeze(graph, {'squeeze_dims': [axis]}).create_node() + argmax_node.out_port(0).get_connection().set_source(squeeze_node.out_port(0)) + squeeze_node.in_port(0).connect(argmax_node.out_port(0)) diff --git a/model-optimizer/extensions/front/onnx/argmax_ext.py b/model-optimizer/extensions/front/onnx/argmax_ext.py new file mode 100644 index 0000000..162ee81 --- /dev/null +++ b/model-optimizer/extensions/front/onnx/argmax_ext.py @@ -0,0 +1,42 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +from extensions.ops.argmax import ArgMaxOp +from mo.front.extractor import FrontExtractorOp +from mo.front.onnx.extractors.utils import onnx_attr + +class ArgMaxFrontExtractor(FrontExtractorOp): + op = 'ArgMax' + enabled = True + + @staticmethod + def extract(node): + keepdims = onnx_attr(node, 'keepdims', 'i', default=1) + axis = onnx_attr(node, 'axis', 'i', default=0) + + attrs = { + 'axis': axis, + + # ONNX ArgMax always computes an index of one maximum value + 'top_k' : 1, + 'out_max_val' : 0, + + # Set attribute to trigger ArgMax replacer in case do not keep the dimension + 'keepdims': keepdims + } + + ArgMaxOp.update_node_stat(node, attrs) + return __class__.enabled diff --git a/model-optimizer/extensions/front/onnx/cast_ext.py b/model-optimizer/extensions/front/onnx/cast_ext.py new file mode 100644 index 0000000..b19fb33 --- /dev/null +++ b/model-optimizer/extensions/front/onnx/cast_ext.py @@ -0,0 +1,30 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +from extensions.ops.Cast import Cast +from mo.front.extractor import FrontExtractorOp +from mo.front.onnx.extractors.utils import get_onnx_datatype_as_numpy, onnx_attr + + +class CastFrontExtractor(FrontExtractorOp): + op = 'Cast' + enabled = True + + @staticmethod + def extract(node): + to = onnx_attr(node, 'to', 'i', default=None) + Cast.update_node_stat(node, {'dst_type': get_onnx_datatype_as_numpy(to)}) + return __class__.enabled diff --git a/model-optimizer/extensions/front/onnx/clip_ext.py b/model-optimizer/extensions/front/onnx/clip_ext.py new file mode 100644 index 0000000..4940afd --- /dev/null +++ b/model-optimizer/extensions/front/onnx/clip_ext.py @@ -0,0 +1,33 @@ +""" + Copyright (c) 2018-2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +from mo.front.extractor import FrontExtractorOp +from mo.front.onnx.extractors.utils import onnx_attr +from mo.ops.clamp import Clamp + + +class ClipFrontExtractor(FrontExtractorOp): + op = 'Clip' + enabled = True + + @staticmethod + def extract(node): + attrs = { + 'min': onnx_attr(node, 'min', 'f', -3.4028234663852886e+38), + 'max': onnx_attr(node, 'max', 'f', 3.4028234663852886e+38), + } + Clamp.update_node_stat(node, attrs) + return __class__.enabled diff --git a/model-optimizer/extensions/front/onnx/constant_fill_ext.py b/model-optimizer/extensions/front/onnx/constant_fill_ext.py index e800276..92d0560 100644 --- a/model-optimizer/extensions/front/onnx/constant_fill_ext.py +++ b/model-optimizer/extensions/front/onnx/constant_fill_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/conv_ext.py b/model-optimizer/extensions/front/onnx/conv_ext.py index 262a469..5562f58 100644 --- a/model-optimizer/extensions/front/onnx/conv_ext.py +++ b/model-optimizer/extensions/front/onnx/conv_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -110,27 +110,28 @@ class ConvTransposeFrontExtractor(FrontExtractorOp): @staticmethod def extract(node): + pads = onnx_attr(node, 'pads', 'ints', dst_type=int64_array) + auto_pad = onnx_attr(node, 'auto_pad', 's', default=None, dst_type=get_onnx_autopad) - int64array = lambda x: np.array(x, dtype=np.int64) + if pads is not None: + if len(pads) % 2 != 0: + raise Error( + 'ConvTranspose node {} specifies pads = {} which has odd number of elements. 
The model is not correct.', + node.soft_get('name'), + pads + ) + pads = pads.reshape([2, -1]) + pads = np.transpose(pads) - pads = onnx_attr(node, 'pads', 'ints', dst_type=int64array) - auto_pad = onnx_attr(node, 'auto_pad', 's', default=None, dst_type=get_onnx_autopad) + final_pads = int64_array([[0, 0], [0, 0], *pads]) if pads is not None else None - if pads is None: - pads = np.array([0, 0, 0, 0], dtype=np.int64) + dilations = onnx_attr(node, 'dilations', 'ints', default=None) + final_dilations = int64_array([1, 1, *dilations]) if dilations is not None else None - if len(pads) % 2 != 0: - raise Error( - 'ConvTranspose node {} specifies pads = {} which has odd number of elements. The model is not correct.', - node.soft_get('name'), - pads - ) + strides = onnx_attr(node, 'strides', 'ints', default=None) + final_strides = int64_array([1, 1, *strides]) if strides is not None else None - pads = pads.reshape([2, -1]) - pads = np.transpose(pads) - dilations = int64array(onnx_attr(node, 'dilations', 'ints', default=[1, 1])) - strides = int64array(onnx_attr(node, 'strides', 'ints', default=[1, 1])) - kernel_shape = onnx_attr(node, 'kernel_shape', 'ints', dst_type=int64array) + kernel_shape = onnx_attr(node, 'kernel_shape', 'ints', dst_type=int64_array) if kernel_shape is None: raise Error( @@ -138,9 +139,10 @@ class ConvTransposeFrontExtractor(FrontExtractorOp): node.soft_get('name') ) - output_padding = onnx_attr(node, 'output_padding', 'ints', default=[0, 0]) + output_padding = onnx_attr(node, 'output_padding', 'ints', default=None) + final_output_padding = int64_array([0, 0, *output_padding]) if output_padding is not None else None - output_shape = onnx_attr(node, 'output_shape', 'ints', default=None, dst_type=int64array) + output_shape = onnx_attr(node, 'output_shape', 'ints', default=None, dst_type=int64_array) attrs = { 'type': 'Deconvolution', @@ -148,26 +150,24 @@ class ConvTransposeFrontExtractor(FrontExtractorOp): 'auto_pad': auto_pad, 'bias_addable': True, 'bias_term': None, # will be deduced later; not really needed - 'pad': int64array([[0, 0], [0, 0], pads[0], pads[1]]), - 'pad_spatial_shape': int64array([pads[0], pads[1]]), - 'dilation': int64array([1, 1, dilations[0], dilations[1]]), + 'pad': final_pads, + 'dilation': final_dilations, 'output_spatial_shape': output_shape, 'output_shape': None, - 'output_padding': int64array([0, 0, output_padding[0], output_padding[1]]), - 'stride': int64array([1, 1, strides[0], strides[1]]), + 'output_padding': final_output_padding, + 'stride': final_strides, 'group': onnx_attr(node, 'group', 'i', default=1), 'output': None, - 'spatial_dims': int64array([2, 3]), - 'channel_dims': int64array([1]), - 'batch_dims': int64array([0]), - 'kernel_spatial': int64array([kernel_shape[0], kernel_shape[1]]), # TODO WARNING Don't misuse X/Y + + 'spatial_dims': None, # Will be calculated in infer function + 'channel_dims': int64_array([1]), + 'batch_dims': int64_array([0]), + 'layout': 'NCHW', 'input_feature_channel': 0, 'output_feature_channel': 1, - 'kernel_spatial_idx': np.array([2, 3]), 'get_pad': ConvTransposeFrontExtractor.get_pad } - attrs.update(layout_attrs()) # update the attributes of the node Convolution.update_node_stat(node, attrs) diff --git a/model-optimizer/extensions/front/onnx/conv_ext_test.py b/model-optimizer/extensions/front/onnx/conv_ext_test.py index 937542a..e853c80 100644 --- a/model-optimizer/extensions/front/onnx/conv_ext_test.py +++ b/model-optimizer/extensions/front/onnx/conv_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 
Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -46,13 +46,11 @@ class ConvTransposeONNXExtractorTest(unittest.TestCase): dict( type='Deconvolution', pad=[[0, 0], [0, 0], [1, 3], [2, 4]], - pad_spatial_shape=[[1, 3], [2, 4]], - kernel_spatial=[5, 6], bias_term=None, output_shape=None, - output_padding=[0, 0, 0, 0], - dilation=[1, 1, 1, 1], - stride=[1, 1, 1, 1], + output_padding=None, + dilation=None, + stride=None, output_spatial_shape=None, group=1 ) @@ -74,8 +72,7 @@ class ConvTransposeONNXExtractorTest(unittest.TestCase): def test_all_valid_default(self): inp, ref = self._base_attrs() del inp['pads'] - ref['pad'] = [[0, 0], [0, 0], [0, 0], [0, 0]] - ref['pad_spatial_shape'] = [[0, 0], [0, 0]] + del ref['pad'] out = self._extract(inp) self._match(out, ref) @@ -111,8 +108,7 @@ class ConvTransposeONNXExtractorTest(unittest.TestCase): inp['auto_pad'] = 'SAME_UPPER' ref['auto_pad'] = 'same_upper' - ref['pad'] = [[0, 0], [0, 0], [0, 0], [0, 0]] - ref['pad_spatial_shape'] = [[0, 0], [0, 0]] + del ref['pad'] out = self._extract(inp) self._match(out, ref) diff --git a/model-optimizer/extensions/front/onnx/crop_ext.py b/model-optimizer/extensions/front/onnx/crop_ext.py index d11f79d..1ef2e94 100644 --- a/model-optimizer/extensions/front/onnx/crop_ext.py +++ b/model-optimizer/extensions/front/onnx/crop_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/crop_ext_test.py b/model-optimizer/extensions/front/onnx/crop_ext_test.py index 1696b69..1b06466 100644 --- a/model-optimizer/extensions/front/onnx/crop_ext_test.py +++ b/model-optimizer/extensions/front/onnx/crop_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/detection_output.py b/model-optimizer/extensions/front/onnx/detection_output.py new file mode 100644 index 0000000..8e23cb4 --- /dev/null +++ b/model-optimizer/extensions/front/onnx/detection_output.py @@ -0,0 +1,112 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
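For reference, a minimal numpy sketch (illustrative only, not part of the patch) of the pads handling in ConvTransposeFrontExtractor above: ONNX stores pads as a flat [x1_begin, x2_begin, ..., x1_end, x2_end, ...] list, and the reshape/transpose pair turns it into per-axis [begin, end] rows before the batch and channel axes are prepended.

import numpy as np

pads = np.array([1, 2, 3, 4], dtype=np.int64)        # begins (1, 2), ends (3, 4)
per_axis = np.transpose(pads.reshape([2, -1]))       # [[1, 3], [2, 4]]
final_pads = np.array([[0, 0], [0, 0], *per_axis])   # prepend the N and C axes
assert final_pads.tolist() == [[0, 0], [0, 0], [1, 3], [2, 4]]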
+""" + +from mo.front.extractor import FrontExtractorOp +from mo.front.onnx.extractors.utils import onnx_attr +from mo.ops.op import Op +from mo.utils.error import Error + + +class DetectionOutputFrontExtractor(FrontExtractorOp): + op = 'DetectionOutput' + enabled = True + + @staticmethod + def extract(node): + nms_threshold = onnx_attr(node, 'nms_threshold', 'f', default=0.0) + eta = onnx_attr(node, 'eta', 'f', default=0.0) + top_k = onnx_attr(node, 'top_k', 'i', default=-1) + + code_type_values = { + b"CORNER": "caffe.PriorBoxParameter.CORNER", + b"CENTER_SIZE": "caffe.PriorBoxParameter.CENTER_SIZE", + } + + code_type = onnx_attr(node, 'code_type', 's', default=code_type_values[b"CORNER"]) + try: + code_type = code_type_values[code_type] + except KeyError: + raise Error("Incorrect value of code_type parameter {}".format(code_type)) + + resize_mode_values = { + b"": "", + b"WARP": "caffe.ResizeParameter.WARP", + b"FIT_SMALL_SIZE": "caffe.ResizeParameter.FIT_SMALL_SIZE", + b"FIT_LARGE_SIZE_AND_PAD": "caffe.ResizeParameter.FIT_LARGE_SIZE_AND_PAD", + } + resize_mode = onnx_attr(node, 'resize_mode', 's', default=b"") + try: + resize_mode = resize_mode_values[resize_mode] + except KeyError: + raise Error("Incorrect value of resize_mode parameter {}".format(resize_mode)) + + pad_mode_values = { + b"": "", + b"CONSTANT": "caffe.ResizeParameter.CONSTANT", + b"MIRRORED": "caffe.ResizeParameter.MIRRORED", + b"REPEAT_NEAREST": "caffe.ResizeParameter.REPEAT_NEAREST" + } + pad_mode = onnx_attr(node, 'pad_mode', 's', default=b"") + try: + pad_mode = pad_mode_values[pad_mode] + except KeyError: + raise Error("Incorrect value of pad_mode parameter {}".format(pad_mode)) + + interp_mode_values = { + b"": "", + b"LINEAR": "caffe.ResizeParameter.LINEAR", + b"AREA": "caffe.ResizeParameter.AREA", + b"NEAREST": "caffe.ResizeParameter.NEAREST", + b"CUBIC": "caffe.ResizeParameter.CUBIC", + b"LANCZOS4": "caffe.ResizeParameter.LANCZOS4" + } + interp_mode = onnx_attr(node, 'interp_mode', 's', default=b"") + try: + interp_mode = interp_mode_values[interp_mode] + except KeyError: + raise Error("Incorrect value of interp_mode parameter {}".format(interp_mode)) + + attrs = { + 'num_classes': onnx_attr(node, 'num_classes', 'i', default=0), + 'share_location': onnx_attr(node, 'share_location', 'i', default=0), + 'background_label_id': onnx_attr(node, 'background_label_id', 'i', default=0), + 'code_type': code_type, + 'variance_encoded_in_target': onnx_attr(node, 'variance_encoded_in_target', 'i', default=0), + 'keep_top_k': onnx_attr(node, 'keep_top_k', 'i', default=0), + 'confidence_threshold': onnx_attr(node, 'confidence_threshold', 'f', default=0), + 'visualize_threshold': onnx_attr(node, 'visualize_threshold', 'f', default=0.6), + # nms_param + 'nms_threshold': nms_threshold, + 'top_k': top_k, + 'eta': eta, + # save_output_param.resize_param + 'prob': onnx_attr(node, 'prob', 'f', default=0), + 'resize_mode': resize_mode, + 'height': onnx_attr(node, 'height', 'i', default=0), + 'width': onnx_attr(node, 'width', 'i', default=0), + 'height_scale': onnx_attr(node, 'height_scale', 'i', default=0), + 'width_scale': onnx_attr(node, 'width_scale', 'i', default=0), + 'pad_mode': pad_mode, + 'pad_value': onnx_attr(node, 'pad_value', 's', default=""), + 'interp_mode': interp_mode, + 'input_width': onnx_attr(node, 'input_width', 'i', default=1), + 'input_height': onnx_attr(node, 'input_height', 'i', default=1), + 'normalized': onnx_attr(node, 'normalized', 'i', default=1), + } + + # update the attributes of the node + 
Op.get_op_class_by_name(__class__.op).update_node_stat(node, attrs) + return __class__.enabled diff --git a/model-optimizer/extensions/front/onnx/detection_output_test.py b/model-optimizer/extensions/front/onnx/detection_output_test.py new file mode 100644 index 0000000..f055f00 --- /dev/null +++ b/model-optimizer/extensions/front/onnx/detection_output_test.py @@ -0,0 +1,102 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import onnx +import unittest + +import numpy as np + +from extensions.front.onnx.detection_output import DetectionOutputFrontExtractor +from extensions.ops.DetectionOutput import DetectionOutput +from mo.ops.op import Op +from mo.utils.unittest.extractors import PB + + +class TestDetectionOutputExt(unittest.TestCase): + @staticmethod + def _create_do_node(num_classes=0, share_location=0, background_label_id=0, + code_type="", variance_encoded_in_target=0, keep_top_k=0, + confidence_threshold=0, nms_threshold=0, top_k=0, eta=0): + pb = onnx.helper.make_node( + 'DetectionOutput', + inputs=['x'], + outputs=['y'], + num_classes=num_classes, + share_location=share_location, + background_label_id=background_label_id, + code_type=code_type, + variance_encoded_in_target=variance_encoded_in_target, + keep_top_k=keep_top_k, + confidence_threshold=confidence_threshold, + # nms_param + nms_threshold=nms_threshold, + top_k=top_k, + eta=eta, + ) + + node = PB({'pb': pb}) + return node + + @classmethod + def setUpClass(cls): + Op.registered_ops['DetectionOutput'] = DetectionOutput + + def test_do_no_pb_no_ml(self): + self.assertRaises(AttributeError, DetectionOutputFrontExtractor.extract, None) + + def test_do_ext_ideal_numbers(self): + node = self._create_do_node(num_classes=21, share_location=1, + code_type="CENTER_SIZE", keep_top_k=200, + confidence_threshold=0.01, nms_threshold=0.45, top_k=400, eta=1.0) + + DetectionOutputFrontExtractor.extract(node) + + exp_res = { + 'op': 'DetectionOutput', + 'type': 'DetectionOutput', + 'num_classes': 21, + 'share_location': 1, + 'background_label_id': 0, + 'code_type': "caffe.PriorBoxParameter.CENTER_SIZE", + 'variance_encoded_in_target': 0, + 'keep_top_k': 200, + 'confidence_threshold': 0.01, + 'visualize_threshold': 0.6, + # nms_param + 'nms_threshold': 0.45, + 'top_k': 400, + 'eta': 1.0, + # ONNX does not have such parameters + # save_output_param.resize_param + 'prob': 0, + 'resize_mode': "", + 'height': 0, + 'width': 0, + 'height_scale': 0, + 'width_scale': 0, + 'pad_mode': "", + 'pad_value': "", + 'interp_mode': "", + 'input_width': 1, + 'input_height': 1, + 'normalized': 1, + } + + for key in exp_res.keys(): + if key in ['confidence_threshold', 'visualize_threshold', 'nms_threshold', 'eta']: + np.testing.assert_almost_equal(node[key], exp_res[key]) + else: + self.assertEqual(node[key], exp_res[key]) diff --git a/model-optimizer/extensions/front/onnx/detectionoutput_ext.py b/model-optimizer/extensions/front/onnx/detectionoutput_ext.py new file mode 100644 index 0000000..3d00fc1 --- /dev/null +++
b/model-optimizer/extensions/front/onnx/detectionoutput_ext.py @@ -0,0 +1,42 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +from math import log +import numpy as np + +from extensions.ops.detectionoutput_onnx import ExperimentalDetectronDetectionOutput +from mo.front.extractor import FrontExtractorOp +from mo.front.onnx.extractors.utils import onnx_attr + + +class ExperimentalDetectronDetectionOutputFrontExtractor(FrontExtractorOp): + op = 'ExperimentalDetectronDetectionOutput' + enabled = True + + @staticmethod + def extract(node): + attrs = dict(class_agnostic_box_regression=onnx_attr(node, 'class_agnostic_box_regression', 'i', 0), + max_detections_per_image=onnx_attr(node, 'max_detections_per_image', 'i', 100), + nms_threshold=onnx_attr(node, 'nms_threshold', 'f', 0.5), + num_classes=onnx_attr(node, 'num_classes', 'i', 81), + post_nms_count=onnx_attr(node, 'post_nms_count', 'i', 2000), + score_threshold=onnx_attr(node, 'score_threshold', 'f', 0.05), + max_delta_log_wh=onnx_attr(node, 'max_delta_log_wh', 'f', log(1000. / 16.)), + deltas_weights=np.array(onnx_attr(node, 'deltas_weights', 'floats', [10., 10., 5., 5.]), + dtype=np.float32) + ) + ExperimentalDetectronDetectionOutput.update_node_stat(node, attrs) + return __class__.enabled diff --git a/model-optimizer/extensions/front/onnx/dropout_ext.py b/model-optimizer/extensions/front/onnx/dropout_ext.py new file mode 100644 index 0000000..21292bd --- /dev/null +++ b/model-optimizer/extensions/front/onnx/dropout_ext.py @@ -0,0 +1,36 @@ +""" + Copyright (c) 2018-2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +from mo.front.extractor import FrontExtractorOp +from mo.front.onnx.extractors.utils import onnx_attr +from extensions.ops.identity import IdentityOp +from mo.utils.error import Error + + +class DropoutFrontExtractor(FrontExtractorOp): + op = 'Dropout' + enabled = True + + @staticmethod + def extract(node): + # some Dropout flavors don't have the is_test attribute; when it is missing, interpret it as 1 + is_test = onnx_attr(node, 'is_test', 'i', 1) + if len(node.out_nodes()) > 1: + raise Error('Dropout node {} has more than one consumer. Unsupported.', node.name) + if not is_test: + raise Error('Dropout node {} has is_test: 0.
This means training mode, which is not supported.', node.name) + IdentityOp.update_node_stat(node) + return __class__.enabled diff --git a/model-optimizer/extensions/front/onnx/elu_ext.py b/model-optimizer/extensions/front/onnx/elu_ext.py index 36d66ac..5c1dfd4 100644 --- a/model-optimizer/extensions/front/onnx/elu_ext.py +++ b/model-optimizer/extensions/front/onnx/elu_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/elu_ext_test.py b/model-optimizer/extensions/front/onnx/elu_ext_test.py index e509e4e..1ca029b 100644 --- a/model-optimizer/extensions/front/onnx/elu_ext_test.py +++ b/model-optimizer/extensions/front/onnx/elu_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/exp_ext.py b/model-optimizer/extensions/front/onnx/exp_ext.py new file mode 100644 index 0000000..7716579 --- /dev/null +++ b/model-optimizer/extensions/front/onnx/exp_ext.py @@ -0,0 +1,28 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +from mo.front.extractor import FrontExtractorOp +from mo.ops.activation import Activation + + +class ExpExtractor(FrontExtractorOp): + op = 'Exp' + enabled = True + + @staticmethod + def extract(node): + Activation.update_node_stat(node, {'operation': 'exp'}) + return __class__.enabled diff --git a/model-optimizer/extensions/front/onnx/flatten_ext.py b/model-optimizer/extensions/front/onnx/flatten_ext.py index 11aaa1b..945b59d 100644 --- a/model-optimizer/extensions/front/onnx/flatten_ext.py +++ b/model-optimizer/extensions/front/onnx/flatten_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/flatten_ext_test.py b/model-optimizer/extensions/front/onnx/flatten_ext_test.py index 5498343..de9e9f2 100644 --- a/model-optimizer/extensions/front/onnx/flatten_ext_test.py +++ b/model-optimizer/extensions/front/onnx/flatten_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
diff --git a/model-optimizer/extensions/front/onnx/gather_ext.py b/model-optimizer/extensions/front/onnx/gather_ext.py index 1484bc8..ad639d7 100644 --- a/model-optimizer/extensions/front/onnx/gather_ext.py +++ b/model-optimizer/extensions/front/onnx/gather_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/gather_ext_test.py b/model-optimizer/extensions/front/onnx/gather_ext_test.py index d91c793..5d48ea4 100644 --- a/model-optimizer/extensions/front/onnx/gather_ext_test.py +++ b/model-optimizer/extensions/front/onnx/gather_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/gru_ext.py b/model-optimizer/extensions/front/onnx/gru_ext.py new file mode 100644 index 0000000..a1e2605 --- /dev/null +++ b/model-optimizer/extensions/front/onnx/gru_ext.py @@ -0,0 +1,59 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +import numpy as np + +from extensions.ops.GRU import GRU +from mo.front.extractor import FrontExtractorOp +from mo.front.onnx.extractors.utils import onnx_attr + + +class GRUFrontExtractor(FrontExtractorOp): + op = 'GRU' + enabled = True + + @staticmethod + def extract(node): + activation_alpha = onnx_attr(node, 'activation_alpha', 'floats', + default=None, dst_type=lambda x: np.array(x, dtype=np.float32)) + activation_beta = onnx_attr(node, 'activation_beta', 'floats', + default=None, dst_type=lambda x: np.array(x, dtype=np.float32)) + activations = onnx_attr(node, 'activations', 'strings', default=None, + dst_type=lambda x: list(map(lambda s: s.decode(encoding="utf-8").lower(), list(x)))) + clip = onnx_attr(node, 'clip', 'f', default=None) + linear_before_reset = onnx_attr(node, 'linear_before_reset', 'i', default=0) + + attrs = { + 'batch_dim': 1, + 'sequence_dim': 0, + 'blobs_wrb': True, + 'has_num_directions': True, + 'num_layers': 1, + 'format': 'onnx', + 'multilayers': False, + 'gate_order': [0, 1, 2], + + # ONNX - specific attrs + 'activation_alpha': activation_alpha, + 'activation_beta': activation_beta, + 'activations': activations, + 'clip': clip, + 'direction': onnx_attr(node, 'direction', 's', b'forward').decode().lower(), + 'hidden_size': np.array(onnx_attr(node, 'hidden_size', 'i'), dtype=np.int64), + 'linear_before_reset': linear_before_reset, + } + + GRU.update_node_stat(node, attrs) + return __class__.enabled diff --git a/model-optimizer/extensions/front/onnx/gru_ext_test.py b/model-optimizer/extensions/front/onnx/gru_ext_test.py new file mode 100644 index 0000000..44e2951 --- /dev/null +++ b/model-optimizer/extensions/front/onnx/gru_ext_test.py @@ -0,0 +1,79 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +import unittest + +import numpy as np +import onnx + +from extensions.front.onnx.gru_ext import GRUFrontExtractor +from mo.utils.unittest.extractors import PB + + +class GRUExtractorTest(unittest.TestCase): + @staticmethod + def _create_node(**attrs): + pb = onnx.helper.make_node( + 'GRU', + inputs=['X', 'W', 'R', 'B',], + outputs=['Y', 'Y_h', 'Y_c'], + hidden_size=128, + **attrs, + ) + node = PB({'pb': pb}) + return node + + base_attrs = { + 'type': 'RNNSequence', + 'op': 'GRU', + 'batch_dim': 1, + 'sequence_dim': 0, + 'blobs_wrb': True, + 'has_num_directions': True, + 'num_layers': 1, + 'format': 'onnx', + 'multilayers': False, + 'gate_order': np.array([0, 1, 2]), + 'direction': 'forward', + 'linear_before_reset': 0, + } + + def test_base_attrs(self): + node = self._create_node() + GRUFrontExtractor.extract(node) + + exp_res = self.base_attrs + + for key in exp_res.keys(): + equal = np.all(np.equal(node[key], exp_res[key], dtype=object)) + self.assertTrue(equal, 'Values for attr {} are not equal'.format(key)) + + def test_additional_attributes(self): + additional_attrs = { + 'activation_alpha': [1.0, 0.0, 2.0], + 'activations': [b'relu', b'tanh', b'sigmoid'], + 'clip': 10.0, + 'linear_before_reset': 1, + } + + node = self._create_node(**additional_attrs) + GRUFrontExtractor.extract(node) + + exp_res = {**self.base_attrs, **additional_attrs} + exp_res['activations'] = ['relu', 'tanh', 'sigmoid'] + + for key in exp_res.keys(): + equal = np.all(np.equal(node[key], exp_res[key], dtype=object)) + self.assertTrue(equal, 'Values for attr {} are not equal'.format(key)) diff --git a/model-optimizer/extensions/front/onnx/image_scaler_ext.py b/model-optimizer/extensions/front/onnx/image_scaler_ext.py index 5d46fc6..2bfb181 100644 --- a/model-optimizer/extensions/front/onnx/image_scaler_ext.py +++ b/model-optimizer/extensions/front/onnx/image_scaler_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/image_scaler_ext_test.py b/model-optimizer/extensions/front/onnx/image_scaler_ext_test.py index 8f5fb04..8a1b6ef 100644 --- a/model-optimizer/extensions/front/onnx/image_scaler_ext_test.py +++ b/model-optimizer/extensions/front/onnx/image_scaler_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/instance_normalization_ext.py b/model-optimizer/extensions/front/onnx/instance_normalization_ext.py index 44737b4..2a30ff4 100644 --- a/model-optimizer/extensions/front/onnx/instance_normalization_ext.py +++ b/model-optimizer/extensions/front/onnx/instance_normalization_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/onnx/instance_normalization_ext_test.py b/model-optimizer/extensions/front/onnx/instance_normalization_ext_test.py index c38a30f..60878cf 100644 --- a/model-optimizer/extensions/front/onnx/instance_normalization_ext_test.py +++ b/model-optimizer/extensions/front/onnx/instance_normalization_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/leaky_relu_ext.py b/model-optimizer/extensions/front/onnx/leaky_relu_ext.py index e6694e9..ef8c626 100644 --- a/model-optimizer/extensions/front/onnx/leaky_relu_ext.py +++ b/model-optimizer/extensions/front/onnx/leaky_relu_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/lrn_ext.py b/model-optimizer/extensions/front/onnx/lrn_ext.py index d402a6e..9d89d60 100644 --- a/model-optimizer/extensions/front/onnx/lrn_ext.py +++ b/model-optimizer/extensions/front/onnx/lrn_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/lstm_ext.py b/model-optimizer/extensions/front/onnx/lstm_ext.py index 20bc8ba..6673932 100644 --- a/model-optimizer/extensions/front/onnx/lstm_ext.py +++ b/model-optimizer/extensions/front/onnx/lstm_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,14 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
""" - - import numpy as np -from extensions.ops.lstm_sequence import LSTMSequence +from extensions.ops.LSTM import LSTM from mo.front.extractor import FrontExtractorOp from mo.front.onnx.extractors.utils import onnx_attr -from mo.ops.op import Op class LSTMFrontExtractor(FrontExtractorOp): @@ -29,27 +26,34 @@ class LSTMFrontExtractor(FrontExtractorOp): @staticmethod def extract(node): - - def split_helper(node, index: int, direction: str): - return Op._create_data_node( - node.graph, - name=node.name + '/SplittedBiLSTM/{}/'.format(direction), - attrs={'value': node.value[index], 'shape': np.array(node.value[index].shape, dtype=np.int64)} - ) + activation_alpha = onnx_attr(node, 'activation_alpha', 'floats', + default=None, dst_type=lambda x: np.array(x, dtype=np.float32)) + activation_beta = onnx_attr(node, 'activation_beta', 'floats', + default=None, dst_type=lambda x: np.array(x, dtype=np.float32)) + activations = onnx_attr(node, 'activations', 'strings', default=None, + dst_type=lambda x: list(map(lambda s: s.decode(encoding="utf-8").lower(), list(x)))) + clip = onnx_attr(node, 'clip', 'f', default=None) + input_forget = onnx_attr(node, 'input_forget', 'i', default=0) attrs = { - 'hidden_size': np.array(onnx_attr(node, 'hidden_size', 'i'), dtype=np.int64), 'batch_dim': 1, 'sequence_dim': 0, 'blobs_wrb': True, 'has_num_directions': True, - 'direction': onnx_attr(node, 'direction', 's', b'forward').decode().lower(), + 'num_layers': 1, 'format': 'onnx', - 'blob_bidirectional_split': lambda node: ( - split_helper(node, 0, 'forward'), - split_helper(node, 1, 'reverse') - ) + 'multilayers': False, + 'gate_order': [2, 0, 3, 1], # iofc --> fico + + # ONNX attrs + 'activation_alpha': activation_alpha, + 'activation_beta': activation_beta, + 'activations': activations, + 'clip': clip, + 'direction': onnx_attr(node, 'direction', 's', b'forward').decode().lower(), + 'hidden_size': np.array(onnx_attr(node, 'hidden_size', 'i'), dtype=np.int64), + 'input_forget': input_forget, } - LSTMSequence.update_node_stat(node, attrs) + LSTM.update_node_stat(node, attrs) return __class__.enabled diff --git a/model-optimizer/extensions/front/onnx/lstm_ext_test.py b/model-optimizer/extensions/front/onnx/lstm_ext_test.py new file mode 100644 index 0000000..ea66dfa --- /dev/null +++ b/model-optimizer/extensions/front/onnx/lstm_ext_test.py @@ -0,0 +1,77 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +import unittest + +import numpy as np +import onnx + +from extensions.front.onnx.lstm_ext import LSTMFrontExtractor +from mo.utils.unittest.extractors import PB + + +class LSTMExtractorTest(unittest.TestCase): + @staticmethod + def _create_node(**attrs): + pb = onnx.helper.make_node( + 'LSTM', + inputs=['X', 'W', 'R', 'B',], + outputs=['Y', 'Y_h', 'Y_c'], + hidden_size=128, + **attrs, + ) + node = PB({'pb': pb}) + return node + + base_attrs = { + 'type': 'RNNSequence', + 'op': 'LSTM', + 'batch_dim': 1, + 'sequence_dim': 0, + 'blobs_wrb': True, + 'has_num_directions': True, + 'num_layers': 1, + 'format': 'onnx', + 'multilayers': False, + 'gate_order': np.array([2, 0, 3, 1]), + 'direction': 'forward', + } + + def test_base_attrs(self): + node = self._create_node() + LSTMFrontExtractor.extract(node) + + exp_res = self.base_attrs + + for key in exp_res.keys(): + equal = np.all(np.equal(node[key], exp_res[key], dtype=object)) + self.assertTrue(equal) + + def test_additional_attributes(self): + additional_attrs = { + 'activation_alpha': [1.0, 0.0, 2.0], + 'activations': [b'relu', b'tanh', b'sigmoid'], + 'clip': 10.0, + } + + node = self._create_node(**additional_attrs) + LSTMFrontExtractor.extract(node) + + exp_res = dict(**self.base_attrs, **additional_attrs) + exp_res['activations'] = ['relu', 'tanh', 'sigmoid'] + + for key in exp_res.keys(): + equal = np.all(np.equal(node[key], exp_res[key], dtype=object)) + self.assertTrue(equal, 'Values for attr {} are not equal'.format(key)) diff --git a/model-optimizer/extensions/front/onnx/matmul_ext.py b/model-optimizer/extensions/front/onnx/matmul_ext.py index 38b3189..33e8f47 100644 --- a/model-optimizer/extensions/front/onnx/matmul_ext.py +++ b/model-optimizer/extensions/front/onnx/matmul_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/mul_ext.py b/model-optimizer/extensions/front/onnx/mul_ext.py index f1de122..14af8c8 100644 --- a/model-optimizer/extensions/front/onnx/mul_ext.py +++ b/model-optimizer/extensions/front/onnx/mul_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/neg_ext.py b/model-optimizer/extensions/front/onnx/neg_ext.py index 939c167..33103ca 100644 --- a/model-optimizer/extensions/front/onnx/neg_ext.py +++ b/model-optimizer/extensions/front/onnx/neg_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/pad_ext.py b/model-optimizer/extensions/front/onnx/pad_ext.py index 449949f..f87f726 100644 --- a/model-optimizer/extensions/front/onnx/pad_ext.py +++ b/model-optimizer/extensions/front/onnx/pad_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/onnx/pad_ext_test.py b/model-optimizer/extensions/front/onnx/pad_ext_test.py index 1f4f25d..46de627 100644 --- a/model-optimizer/extensions/front/onnx/pad_ext_test.py +++ b/model-optimizer/extensions/front/onnx/pad_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/pooling_ext.py b/model-optimizer/extensions/front/onnx/pooling_ext.py index 17c894c..5916bbd 100644 --- a/model-optimizer/extensions/front/onnx/pooling_ext.py +++ b/model-optimizer/extensions/front/onnx/pooling_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/pow_ext.py b/model-optimizer/extensions/front/onnx/pow_ext.py index ab8330f..327725f 100644 --- a/model-optimizer/extensions/front/onnx/pow_ext.py +++ b/model-optimizer/extensions/front/onnx/pow_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/priorbox_ext.py b/model-optimizer/extensions/front/onnx/priorbox_ext.py new file mode 100644 index 0000000..6a45003 --- /dev/null +++ b/model-optimizer/extensions/front/onnx/priorbox_ext.py @@ -0,0 +1,51 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +import numpy as np + +from mo.front.extractor import FrontExtractorOp +from mo.ops.op import Op +from mo.front.onnx.extractors.utils import onnx_attr + + +class PriorBoxFrontExtractor(FrontExtractorOp): + op = 'PriorBox' + enabled = True + + @staticmethod + def extract(node): + variance = onnx_attr(node, 'variance', 'floats', default=[], dst_type=lambda x: np.array(x, dtype=np.float32)) + if len(variance) == 0: + variance = [0.1] + + update_attrs = { + 'aspect_ratio': onnx_attr(node, 'aspect_ratio', 'floats', dst_type=lambda x: np.array(x, dtype=np.float32)), + 'min_size': onnx_attr(node, 'min_size', 'floats', dst_type=lambda x: np.array(x, dtype=np.float32)), + 'max_size': onnx_attr(node, 'max_size', 'floats', dst_type=lambda x: np.array(x, dtype=np.float32)), + 'flip': onnx_attr(node, 'flip', 'i', default=0), + 'clip': onnx_attr(node, 'clip', 'i', default=0), + 'variance': list(variance), + 'img_size': onnx_attr(node, 'img_size', 'i', default=0), + 'img_h': onnx_attr(node, 'img_h', 'i', default=0), + 'img_w': onnx_attr(node, 'img_w', 'i', default=0), + 'step': onnx_attr(node, 'step', 'f', default=0.0), + 'step_h': onnx_attr(node, 'step_h', 'f', default=0.0), + 'step_w': onnx_attr(node, 'step_w', 'f', default=0.0), + 'offset': onnx_attr(node, 'offset', 'f', default=0.0), + } + + # update the attributes of the node + Op.get_op_class_by_name(__class__.op).update_node_stat(node, update_attrs) + return __class__.enabled diff --git a/model-optimizer/extensions/front/onnx/priorbox_ext_test.py b/model-optimizer/extensions/front/onnx/priorbox_ext_test.py new file mode 100644 index 0000000..8608fdd --- /dev/null +++ b/model-optimizer/extensions/front/onnx/priorbox_ext_test.py @@ -0,0 +1,89 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +import onnx +import unittest + +import numpy as np + +from extensions.front.onnx.priorbox_ext import PriorBoxFrontExtractor +from extensions.ops.priorbox import PriorBoxOp +from mo.ops.op import Op +from mo.utils.unittest.extractors import PB + + +class TestPriorBoxExt(unittest.TestCase): + @staticmethod + def _create_priorbox_node(aspect_ratio=[], min_size=np.array([]), max_size=np.array([]), + flip=False, clip=False, variance=None, img_size=0, img_h=0, + img_w=0, step=0, step_h=0, step_w=0, offset=0): + pb = onnx.helper.make_node( + 'PriorBox', + inputs=['x'], + outputs=['y'], + aspect_ratio=aspect_ratio, + min_size=min_size, + max_size=max_size, + flip=flip, + clip=clip, + variance=variance, + img_size=img_size, + img_h=img_h, + img_w=img_w, + step=step, + step_h=step_h, + step_w=step_w, + offset=offset, + ) + + node = PB({'pb': pb}) + return node + + @classmethod + def setUpClass(cls): + Op.registered_ops['PriorBox'] = PriorBoxOp + + def test_priorbox_no_pb_no_ml(self): + self.assertRaises(AttributeError, PriorBoxFrontExtractor.extract, None) + + def test_priorbox_ext_ideal_numbers(self): + node = self._create_priorbox_node(aspect_ratio=np.array([2, 3], dtype=np.float), + variance=np.array([0.2, 0.3, 0.2, 0.3]), + img_size=300, step=5.0, offset=0.6, flip=True) + + PriorBoxFrontExtractor.extract(node) + + exp_res = { + 'op': 'PriorBox', + 'type': 'PriorBox', + 'clip': 0, + 'flip': 1, + 'aspect_ratio': np.array([2, 3], dtype=np.float), + 'variance': [0.2, 0.3, 0.2, 0.3], + 'img_size': 300, + 'img_h': 0, + 'img_w': 0, + 'step': 5, + 'step_h': 0, + 'step_w': 0, + 'offset': 0.6 + } + + for key in exp_res.keys(): + if key in ['variance', 'aspect_ratio', 'step_h', 'step_w', 'offset']: + np.testing.assert_almost_equal(node[key], exp_res[key]) + else: + self.assertEqual(node[key], exp_res[key]) diff --git a/model-optimizer/extensions/front/onnx/priorgridgenerator_ext.py b/model-optimizer/extensions/front/onnx/priorgridgenerator_ext.py new file mode 100644 index 0000000..f8db64b --- /dev/null +++ b/model-optimizer/extensions/front/onnx/priorgridgenerator_ext.py @@ -0,0 +1,35 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +from extensions.ops.priorgridgenerator_onnx import ExperimentalDetectronPriorGridGenerator +from mo.front.extractor import FrontExtractorOp +from mo.front.onnx.extractors.utils import onnx_attr + + +class ExperimentalDetectronPriorGridGeneratorFrontExtractor(FrontExtractorOp): + op = 'ExperimentalDetectronPriorGridGenerator' + enabled = True + + @staticmethod + def extract(node): + attrs = dict(h=onnx_attr(node, 'h', 'i', 0), + w=onnx_attr(node, 'w', 'i', 0), + stride_x=onnx_attr(node, 'stride_x', 'f', 0), + stride_y=onnx_attr(node, 'stride_y', 'f', 0), + flatten=onnx_attr(node, 'flatten', 'i', 1) + ) + ExperimentalDetectronPriorGridGenerator.update_node_stat(node, attrs) + return __class__.enabled diff --git a/model-optimizer/extensions/front/onnx/proposal_ext.py b/model-optimizer/extensions/front/onnx/proposal_ext.py new file mode 100644 index 0000000..b82f080 --- /dev/null +++ b/model-optimizer/extensions/front/onnx/proposal_ext.py @@ -0,0 +1,34 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +from extensions.ops.proposal_onnx import ExperimentalDetectronGenerateProposalsSingleImage +from mo.front.extractor import FrontExtractorOp +from mo.front.onnx.extractors.utils import onnx_attr + + +class ExperimentalDetectronGenerateProposalsSingleImageFrontExtractor(FrontExtractorOp): + op = 'ExperimentalDetectronGenerateProposalsSingleImage' + enabled = True + + @staticmethod + def extract(node): + attrs = dict(min_size=onnx_attr(node, 'min_size', 'f', 0.0), + nms_threshold=onnx_attr(node, 'nms_threshold', 'f', 0.7), + post_nms_count=onnx_attr(node, 'post_nms_count', 'i', 1000), + pre_nms_count=onnx_attr(node, 'pre_nms_count', 'i', 1000) + ) + ExperimentalDetectronGenerateProposalsSingleImage.update_node_stat(node, attrs) + return __class__.enabled diff --git a/model-optimizer/extensions/front/onnx/quantize_ext.py b/model-optimizer/extensions/front/onnx/quantize_ext.py new file mode 100644 index 0000000..bcead30 --- /dev/null +++ b/model-optimizer/extensions/front/onnx/quantize_ext.py @@ -0,0 +1,30 @@ +""" + Copyright (c) 2018-2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +from mo.front.extractor import FrontExtractorOp +from mo.front.onnx.extractors.utils import onnx_attr +from extensions.ops.quantize import QuantizeOp + + +class QuantizeFrontExtractor(FrontExtractorOp): + op = 'Quantize' + enabled = True + + @staticmethod + def extract(node): + levels = onnx_attr(node, 'levels', 'i') + QuantizeOp.update_node_stat(node, {'levels' : levels}) + return QuantizeFrontExtractor.enabled diff --git a/model-optimizer/extensions/front/onnx/reduce_mean_ext.py b/model-optimizer/extensions/front/onnx/reduce_mean_ext.py index 174cff1..555ffad 100644 --- a/model-optimizer/extensions/front/onnx/reduce_mean_ext.py +++ b/model-optimizer/extensions/front/onnx/reduce_mean_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/reduce_sum_ext.py b/model-optimizer/extensions/front/onnx/reduce_sum_ext.py index 8886eab..1c04349 100644 --- a/model-optimizer/extensions/front/onnx/reduce_sum_ext.py +++ b/model-optimizer/extensions/front/onnx/reduce_sum_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/rnn_ext.py b/model-optimizer/extensions/front/onnx/rnn_ext.py new file mode 100644 index 0000000..aa8f441 --- /dev/null +++ b/model-optimizer/extensions/front/onnx/rnn_ext.py @@ -0,0 +1,57 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +import numpy as np + +from extensions.ops.RNN import RNN +from mo.front.extractor import FrontExtractorOp +from mo.front.onnx.extractors.utils import onnx_attr + + +class RNNFrontExtractor(FrontExtractorOp): + op = 'RNN' + enabled = True + + @staticmethod + def extract(node): + activation_alpha = onnx_attr(node, 'activation_alpha', 'floats', + default=None, dst_type=lambda x: np.array(x, dtype=np.float32)) + activation_beta = onnx_attr(node, 'activation_beta', 'floats', + default=None, dst_type=lambda x: np.array(x, dtype=np.float32)) + activations = onnx_attr(node, 'activations', 'strings', default=None, + dst_type=lambda x: list(map(lambda s: s.decode(encoding="utf-8").lower(), list(x)))) + clip = onnx_attr(node, 'clip', 'f', default=None) + + attrs = { + 'batch_dim': 1, + 'sequence_dim': 0, + 'blobs_wrb': True, + 'has_num_directions': True, + 'num_layers': 1, + 'format': 'onnx', + 'multilayers': False, + 'gate_order': [0], + + # ONNX attrs + 'activation_alpha': activation_alpha, + 'activation_beta': activation_beta, + 'activations': activations, + 'clip': clip, + 'direction': onnx_attr(node, 'direction', 's', b'forward').decode().lower(), + 'hidden_size': np.array(onnx_attr(node, 'hidden_size', 'i'), dtype=np.int64), + } + + RNN.update_node_stat(node, attrs) + return __class__.enabled diff --git a/model-optimizer/extensions/front/onnx/rnn_ext_test.py b/model-optimizer/extensions/front/onnx/rnn_ext_test.py new file mode 100644 index 0000000..83f7025 --- /dev/null +++ b/model-optimizer/extensions/front/onnx/rnn_ext_test.py @@ -0,0 +1,77 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +import unittest + +import numpy as np +import onnx + +from extensions.front.onnx.rnn_ext import RNNFrontExtractor +from mo.utils.unittest.extractors import PB + + +class RNNExtractorTest(unittest.TestCase): + @staticmethod + def _create_node(**attrs): + pb = onnx.helper.make_node( + 'RNN', + inputs=['X', 'W', 'R', 'B',], + outputs=['Y', 'Y_h', 'Y_c'], + hidden_size=128, + **attrs, + ) + node = PB({'pb': pb}) + return node + + base_attrs = { + 'type': 'RNNSequence', + 'op': 'RNN', + 'batch_dim': 1, + 'sequence_dim': 0, + 'blobs_wrb': True, + 'has_num_directions': True, + 'num_layers': 1, + 'format': 'onnx', + 'multilayers': False, + 'gate_order': np.array([0]), + 'direction': 'forward', + } + + def test_base_attrs(self): + node = self._create_node() + RNNFrontExtractor.extract(node) + + exp_res = self.base_attrs + + for key in exp_res.keys(): + equal = np.all(np.equal(node[key], exp_res[key], dtype=object)) + self.assertTrue(equal) + + def test_additional_attributes(self): + additional_attrs = { + 'activation_alpha': [1.0, 0.0, 2.0], + 'activations': [b'relu', b'tanh', b'sigmoid'], + 'clip': 10.0, + } + + node = self._create_node(**additional_attrs) + RNNFrontExtractor.extract(node) + + exp_res = {**self.base_attrs, **additional_attrs} + exp_res['activations'] = ['relu', 'tanh', 'sigmoid'] + + for key in exp_res.keys(): + equal = np.all(np.equal(node[key], exp_res[key], dtype=object)) + self.assertTrue(equal, 'Values for attr {} are not equal'.format(key)) diff --git a/model-optimizer/extensions/front/onnx/roifeatureextractor_ext.py b/model-optimizer/extensions/front/onnx/roifeatureextractor_ext.py new file mode 100644 index 0000000..99dae31 --- /dev/null +++ b/model-optimizer/extensions/front/onnx/roifeatureextractor_ext.py @@ -0,0 +1,42 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +import numpy as np + +from extensions.ops.roifeatureextractor_onnx import ExperimentalDetectronROIFeatureExtractor +from mo.front.extractor import FrontExtractorOp +from mo.front.onnx.extractors.utils import onnx_attr + + +class ExperimentalDetectronROIFeatureExtractorFrontExtractor(FrontExtractorOp): + op = 'ExperimentalDetectronROIFeatureExtractor' + enabled = True + + @staticmethod + def extract(node): + attrs = dict(output_size=onnx_attr(node, 'output_size', 'i', 7), + sampling_ratio=onnx_attr(node, 'sampling_ratio', 'i', 2), + distribute_rois_between_levels=onnx_attr(node, 'distribute_rois_between_levels', 'i', 1), + preserve_rois_order=onnx_attr(node, 'preserve_rois_order', 'i', 1), + num_classes=onnx_attr(node, 'num_classes', 'i', 81), + post_nms_count=onnx_attr(node, 'post_nms_count', 'i', 2000), + score_threshold=onnx_attr(node, 'score_threshold', 'f', 0.05), + pyramid_scales=np.array(onnx_attr(node, 'pyramid_scales', 'ints', [4, 8, 16, 32, 64]), + dtype=np.int64), + ) + + ExperimentalDetectronROIFeatureExtractor.update_node_stat(node, attrs) + return __class__.enabled diff --git a/model-optimizer/extensions/front/onnx/scale_ext.py b/model-optimizer/extensions/front/onnx/scale_ext.py new file mode 100644 index 0000000..7793ea9 --- /dev/null +++ b/model-optimizer/extensions/front/onnx/scale_ext.py @@ -0,0 +1,35 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import numpy as np + +from mo.front.extractor import FrontExtractorOp +from mo.front.onnx.extractors.utils import onnx_attr + + +class ScaleFrontExtractor(FrontExtractorOp): + op = 'Scale' + enabled = True + + @staticmethod + def extract(node): + scale = onnx_attr(node, 'scale', 'f', default=np.array(1.0), dst_type=lambda x: np.array(x)) + + node['scale'] = scale + node['bias'] = np.array(0) + node['op'] = 'ImageScaler' + + return __class__.enabled diff --git a/model-optimizer/extensions/front/onnx/sigmoid_ext.py b/model-optimizer/extensions/front/onnx/sigmoid_ext.py index 052c9a4..4c4c28c 100644 --- a/model-optimizer/extensions/front/onnx/sigmoid_ext.py +++ b/model-optimizer/extensions/front/onnx/sigmoid_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/sigmoid_ext_test.py b/model-optimizer/extensions/front/onnx/sigmoid_ext_test.py index 3d25ea1..776af04 100644 --- a/model-optimizer/extensions/front/onnx/sigmoid_ext_test.py +++ b/model-optimizer/extensions/front/onnx/sigmoid_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/onnx/slice_ext.py b/model-optimizer/extensions/front/onnx/slice_ext.py index 2cc4b36..93affa0 100644 --- a/model-optimizer/extensions/front/onnx/slice_ext.py +++ b/model-optimizer/extensions/front/onnx/slice_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/slice_ext_test.py b/model-optimizer/extensions/front/onnx/slice_ext_test.py index 74ab96a..7a4de92 100644 --- a/model-optimizer/extensions/front/onnx/slice_ext_test.py +++ b/model-optimizer/extensions/front/onnx/slice_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/softmax_ext.py b/model-optimizer/extensions/front/onnx/softmax_ext.py index 543fd4a..2d09ece 100644 --- a/model-optimizer/extensions/front/onnx/softmax_ext.py +++ b/model-optimizer/extensions/front/onnx/softmax_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/split_ext.py b/model-optimizer/extensions/front/onnx/split_ext.py index 4e9e5ad..0e5db4b 100644 --- a/model-optimizer/extensions/front/onnx/split_ext.py +++ b/model-optimizer/extensions/front/onnx/split_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/squeeze_ext.py b/model-optimizer/extensions/front/onnx/squeeze_ext.py index 8472b87..2478be1 100644 --- a/model-optimizer/extensions/front/onnx/squeeze_ext.py +++ b/model-optimizer/extensions/front/onnx/squeeze_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/squeeze_ext_test.py b/model-optimizer/extensions/front/onnx/squeeze_ext_test.py index 5c69728..209edf8 100644 --- a/model-optimizer/extensions/front/onnx/squeeze_ext_test.py +++ b/model-optimizer/extensions/front/onnx/squeeze_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/tanh_ext.py b/model-optimizer/extensions/front/onnx/tanh_ext.py index 6b88ce2..6199931 100644 --- a/model-optimizer/extensions/front/onnx/tanh_ext.py +++ b/model-optimizer/extensions/front/onnx/tanh_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/onnx/tanh_ext_test.py b/model-optimizer/extensions/front/onnx/tanh_ext_test.py index 25b8586..f5a49e5 100644 --- a/model-optimizer/extensions/front/onnx/tanh_ext_test.py +++ b/model-optimizer/extensions/front/onnx/tanh_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/topkrois_ext.py b/model-optimizer/extensions/front/onnx/topkrois_ext.py new file mode 100644 index 0000000..ab8c9f1 --- /dev/null +++ b/model-optimizer/extensions/front/onnx/topkrois_ext.py @@ -0,0 +1,30 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +from extensions.ops.topkrois_onnx import ExperimentalDetectronTopKROIs +from mo.front.extractor import FrontExtractorOp +from mo.front.onnx.extractors.utils import onnx_attr + + +class ExperimentalDetectronTopKROIsFrontExtractor(FrontExtractorOp): + op = 'ExperimentalDetectronTopKROIs' + enabled = True + + @staticmethod + def extract(node): + attrs = dict(max_rois=onnx_attr(node, 'max_rois', 'i', 1000)) + ExperimentalDetectronTopKROIs.update_node_stat(node, attrs) + return __class__.enabled diff --git a/model-optimizer/extensions/front/onnx/transpose_ext.py b/model-optimizer/extensions/front/onnx/transpose_ext.py index c2ff501..b6b6941 100644 --- a/model-optimizer/extensions/front/onnx/transpose_ext.py +++ b/model-optimizer/extensions/front/onnx/transpose_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/transpose_ext_test.py b/model-optimizer/extensions/front/onnx/transpose_ext_test.py index 2880c2d..d94a339 100644 --- a/model-optimizer/extensions/front/onnx/transpose_ext_test.py +++ b/model-optimizer/extensions/front/onnx/transpose_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/unsqueeze_ext.py b/model-optimizer/extensions/front/onnx/unsqueeze_ext.py index 9348889..92ea63c 100644 --- a/model-optimizer/extensions/front/onnx/unsqueeze_ext.py +++ b/model-optimizer/extensions/front/onnx/unsqueeze_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/onnx/unsqueeze_ext_test.py b/model-optimizer/extensions/front/onnx/unsqueeze_ext_test.py index 7cdcdae..3d55103 100644 --- a/model-optimizer/extensions/front/onnx/unsqueeze_ext_test.py +++ b/model-optimizer/extensions/front/onnx/unsqueeze_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/upsample_ext.py b/model-optimizer/extensions/front/onnx/upsample_ext.py index 867e504..9e8578a 100644 --- a/model-optimizer/extensions/front/onnx/upsample_ext.py +++ b/model-optimizer/extensions/front/onnx/upsample_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/onnx/upsample_ext_test.py b/model-optimizer/extensions/front/onnx/upsample_ext_test.py index e363417..f86f47d 100644 --- a/model-optimizer/extensions/front/onnx/upsample_ext_test.py +++ b/model-optimizer/extensions/front/onnx/upsample_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/output_cut.py b/model-optimizer/extensions/front/output_cut.py new file mode 100644 index 0000000..e55b421 --- /dev/null +++ b/model-optimizer/extensions/front/output_cut.py @@ -0,0 +1,32 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +from mo.front.common.replacement import FrontReplacementPattern +from mo.front.extractor import add_output_ops +from mo.graph.graph import Graph + + +class OutputCut(FrontReplacementPattern): + enabled = True + + def run_after(self): + from extensions.front.user_data_repack import UserDataRepack + return [UserDataRepack] + + def run_before(self): + return [] + + def find_and_replace_pattern(self, graph: Graph): + add_output_ops(graph, graph.graph['packed_outputs'], inputs=graph.graph['user_shapes']) diff --git a/model-optimizer/extensions/front/override_batch.py b/model-optimizer/extensions/front/override_batch.py new file mode 100644 index 0000000..678c83c --- /dev/null +++ b/model-optimizer/extensions/front/override_batch.py @@ -0,0 +1,25 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +from mo.front.common.replacement import FrontReplacementPattern +from mo.graph.graph import Graph +from mo.middle.passes.infer import override_batch + + +class OverrideBatch(FrontReplacementPattern): + enabled = True + + def find_and_replace_pattern(self, graph: Graph): + override_batch(graph, graph.graph['cmd_params'].batch) diff --git a/model-optimizer/extensions/front/pass_separator.py b/model-optimizer/extensions/front/pass_separator.py new file mode 100644 index 0000000..3dcac16 --- /dev/null +++ b/model-optimizer/extensions/front/pass_separator.py @@ -0,0 +1,43 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +from mo.front.common.replacement import FrontReplacementPattern +from mo.graph.graph import Graph + + +class FrontStart(FrontReplacementPattern): + enabled = True + + def run_after(self): + return [] + + def run_before(self): + return [] + + def find_and_replace_pattern(self, graph: Graph): + pass + + +class FrontFinish(FrontReplacementPattern): + enabled = True + + def run_after(self): + return [] + + def run_before(self): + return [] + + def find_and_replace_pattern(self, graph: Graph): + pass diff --git a/model-optimizer/extensions/front/reciprocal.py b/model-optimizer/extensions/front/reciprocal.py index 3c656ea..74fe933 100644 --- a/model-optimizer/extensions/front/reciprocal.py +++ b/model-optimizer/extensions/front/reciprocal.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
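FrontStart and FrontFinish above are deliberately empty: they are anchor passes that other front transformations can order themselves against, since the pass scheduler derives execution order from enabled plus the run_after/run_before constraints. A sketch of a custom pass pinned between the two anchors (MyFrontPass is a placeholder name):

    from mo.front.common.replacement import FrontReplacementPattern
    from mo.graph.graph import Graph

    class MyFrontPass(FrontReplacementPattern):
        enabled = True

        def run_after(self):
            from extensions.front.pass_separator import FrontStart
            return [FrontStart]   # run once the front stage has opened

        def run_before(self):
            from extensions.front.pass_separator import FrontFinish
            return [FrontFinish]  # and before it closes

        def find_and_replace_pattern(self, graph: Graph):
            pass  # the actual transformation would go here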
@@ -17,7 +17,7 @@ import networkx as nx from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.power import Power @@ -25,7 +25,7 @@ class ReciprocalReplacer(FrontReplacementOp): op = "Reciprocal" enabled = True - def replace_op(self, graph: nx.MultiDiGraph, node: Node): + def replace_op(self, graph: Graph, node: Node): reciprocal = Power(graph, dict(scale=1, power=-1, shift=0, name=node.name + '/power_')) out_node = reciprocal.create_node([node.in_node(0)]) diff --git a/model-optimizer/extensions/front/reciprocal_test.py b/model-optimizer/extensions/front/reciprocal_test.py index 527cb7e..1a8df9e 100644 --- a/model-optimizer/extensions/front/reciprocal_test.py +++ b/model-optimizer/extensions/front/reciprocal_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/restore_ports.py b/model-optimizer/extensions/front/restore_ports.py new file mode 100644 index 0000000..7f8fbc8 --- /dev/null +++ b/model-optimizer/extensions/front/restore_ports.py @@ -0,0 +1,42 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +from mo.front.common.replacement import FrontReplacementSubgraph +from mo.graph.graph import Graph + + +class RestorePorts(FrontReplacementSubgraph): + enabled = True + + def run_after(self): + from extensions.front.input_cut import InputCut + return [InputCut] + + def run_before(self): + return [] + + def find_and_replace_pattern(self, graph: Graph): + for node_id, attrs in graph.nodes(data=True): + attrs['_in_ports'] = set() + attrs['_out_ports'] = set() + + for u, v, k, d in graph.edges(data=True, keys=True): + from_node_attrs = graph.node[u] + to_node_attrs = graph.node[v] + from_node_attrs['_out_ports'].add(d['out']) + to_node_attrs['_in_ports'].add(d['in']) + + graph.stage = 'front' diff --git a/model-optimizer/extensions/front/squared_difference.py b/model-optimizer/extensions/front/squared_difference.py index e5c94a6..a53e2ae 100644 --- a/model-optimizer/extensions/front/squared_difference.py +++ b/model-optimizer/extensions/front/squared_difference.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
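RestorePorts above works because the graph is a networkx MultiDiGraph: parallel edges between the same pair of nodes are disambiguated by a key, and each edge carries 'in'/'out' port indices as attributes. The traversal can be reproduced on a bare MultiDiGraph:

    import networkx as nx

    g = nx.MultiDiGraph()
    g.add_edge('a', 'b', **{'out': 0, 'in': 0})
    g.add_edge('a', 'b', **{'out': 1, 'in': 1})  # parallel edge, distinct key

    in_ports = {n: set() for n in g.nodes()}
    out_ports = {n: set() for n in g.nodes()}
    for u, v, k, d in g.edges(keys=True, data=True):
        out_ports[u].add(d['out'])   # port the edge leaves from
        in_ports[v].add(d['in'])     # port the edge arrives at
    print(out_ports['a'], in_ports['b'])  # {0, 1} {0, 1}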
@@ -17,7 +17,7 @@ import networkx as nx from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.eltwise import Eltwise from mo.ops.power import Power @@ -31,7 +31,7 @@ class SquaredDifference(FrontReplacementOp): op = "SquaredDifference" enabled = True - def replace_op(self, graph: nx.MultiDiGraph, node: Node): + def replace_op(self, graph: Graph, node: Node): negate = Power(graph, dict(scale=-1, name=node.name + '/negate_')) add = Eltwise(graph, dict(operation='sum', name=node.name + '/add_')) squared = Power(graph, dict(power=2, name=node.name + '/squared_')) diff --git a/model-optimizer/extensions/front/standalone_const_eraser.py b/model-optimizer/extensions/front/standalone_const_eraser.py index 98ea814..295f9a3 100644 --- a/model-optimizer/extensions/front/standalone_const_eraser.py +++ b/model-optimizer/extensions/front/standalone_const_eraser.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ import logging as log import networkx as nx from mo.front.common.replacement import FrontReplacementSubgraph -from mo.graph.graph import erase_node +from mo.graph.graph import Graph class StandaloneConstEraser(FrontReplacementSubgraph): @@ -35,8 +35,8 @@ class StandaloneConstEraser(FrontReplacementSubgraph): ) @staticmethod - def replace_sub_graph(graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(graph: Graph, match: dict): if not len(match['const'].in_edges()) and len(match['const'].out_edges()) == 1: - erase_node(match['const']) - erase_node(match['output']) + graph.erase_node(match['const']) + graph.erase_node(match['output']) log.info("Standalone Const node \"{}\" was removed from the graph".format(match['const'].id)) diff --git a/model-optimizer/extensions/front/sub.py b/model-optimizer/extensions/front/sub.py index a244078..2097ed0 100644 --- a/model-optimizer/extensions/front/sub.py +++ b/model-optimizer/extensions/front/sub.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ import networkx as nx from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.eltwise import Eltwise from mo.ops.power import Power @@ -26,7 +26,7 @@ class Sub(FrontReplacementOp): op = "Sub" enabled = True - def replace_op(self, graph: nx.MultiDiGraph, node: Node): + def replace_op(self, graph: Graph, node: Node): negate = Power(graph, dict(scale=-1, name=node.name + '/negate_')) add = Eltwise(graph, dict(operation='sum', name=node.name + '/add_')) out_node = add.create_node([(node.in_node(0), node.in_edge(0)['out']), diff --git a/model-optimizer/extensions/front/tf/ArgMaxReshape.py b/model-optimizer/extensions/front/tf/ArgMaxReshape.py index b017684..ed77c2d 100644 --- a/model-optimizer/extensions/front/tf/ArgMaxReshape.py +++ b/model-optimizer/extensions/front/tf/ArgMaxReshape.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
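The Reciprocal, Sub, and SquaredDifference replacers above all compose the same two IE primitives. Assuming the Power layer semantics these passes rely on, out = (scale * x + shift) ** power, the three rewrites check out numerically:

    import numpy as np

    def power(x, scale=1, shift=0, p=1):
        return (scale * x + shift) ** p

    a, b = np.array([3.0, 5.0]), np.array([1.0, 9.0])
    print(power(b, p=-1))                      # Reciprocal: 1 / b
    print(a + power(b, scale=-1))              # Sub: a + (-b) = a - b
    print(power(a + power(b, scale=-1), p=2))  # SquaredDifference: (a - b) ** 2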
@@ -15,13 +15,11 @@ """ import logging as log -import networkx as nx import numpy as np from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.squeeze import Squeeze -from mo.graph.graph import insert_node_after class ArgMaxReshape(FrontReplacementOp): @@ -32,17 +30,17 @@ class ArgMaxReshape(FrontReplacementOp): op = "ArgMax" enabled = True - def nodes_to_remove(self, graph: nx.MultiDiGraph, match: dict): + def nodes_to_remove(self, graph: Graph, match: dict): # do not remove matched node return [] - def replace_op(self, graph: nx.MultiDiGraph, node: Node): + def replace_op(self, graph: Graph, node: Node): squeeze_op = Squeeze(graph, dict()) squeeze_op.attrs['old_infer'] = squeeze_op.attrs['infer'] squeeze_op.attrs['infer'] = __class__.do_infer squeeze_node = squeeze_op.create_node([], dict(name=node.name + '/Squeeze')) - insert_node_after(node, squeeze_node) + node.insert_node_after(squeeze_node) return [] @staticmethod diff --git a/model-optimizer/extensions/front/tf/BlockLSTM.py b/model-optimizer/extensions/front/tf/BlockLSTM.py index 3e1bed4..cd0247f 100644 --- a/model-optimizer/extensions/front/tf/BlockLSTM.py +++ b/model-optimizer/extensions/front/tf/BlockLSTM.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ import logging as log import networkx as nx from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.utils.error import Error @@ -61,11 +61,19 @@ class BlockLSTM(FrontReplacementOp): op = "BlockLSTM" enabled = True - def nodes_to_remove(self, graph: nx.MultiDiGraph, match: dict): + def nodes_to_remove(self, graph: Graph, match: dict): # do not remove matched node return [] - def replace_op(self, graph: nx.MultiDiGraph, node: Node): + @staticmethod + def find_key_by_input_port(u: Node, v: Node, p: int): + key = None + for k, edge_info in u.graph.get_edge_data(u.id, v.id).items(): + if p == edge_info['in']: + return k + return key + + def replace_op(self, graph: Graph, node: Node): if node.use_peephole: raise Error("BlockLSTM operation is not supported with `use_peephole`==True. 
Node: {}" "".format(node.soft_get('name'))) @@ -81,7 +89,12 @@ class BlockLSTM(FrontReplacementOp): {p: o.id for p, o in node.out_nodes().items()})) log.debug("Cutting all inputs for peephole connection (5, 6, 7 input ports) off, as `use_peephole`=False") - [graph.remove_edge(node.in_node(p).id, node.id) for p, input_data in node.in_nodes().items() if p in [5, 6, 7]] + + for p, input_data in node.in_nodes().items(): + if p in [5, 6, 7]: + key = self.find_key_by_input_port(node.in_node(p), node, p) + assert key is not None + graph.remove_edge(node.in_node(p).id, node.id, key=key) log.debug("Cutting seq_len_max input off") graph.remove_edge(node.in_node(0).id, node.id) diff --git a/model-optimizer/extensions/front/tf/BlockLSTM_ext.py b/model-optimizer/extensions/front/tf/BlockLSTM_ext.py index feddc17..cdf46f8 100644 --- a/model-optimizer/extensions/front/tf/BlockLSTM_ext.py +++ b/model-optimizer/extensions/front/tf/BlockLSTM_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/CTCGreedyDecoder.py b/model-optimizer/extensions/front/tf/CTCGreedyDecoder.py index e36bf50..c424bf8 100644 --- a/model-optimizer/extensions/front/tf/CTCGreedyDecoder.py +++ b/model-optimizer/extensions/front/tf/CTCGreedyDecoder.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,11 +14,11 @@ limitations under the License. """ -import networkx as nx import numpy as np +from mo.front.common.partial_infer.utils import int64_array from mo.front.common.replacement import FrontReplacementSubgraph -from mo.graph.graph import replace_node, Node +from mo.graph.graph import Node, Graph from mo.utils.error import Error @@ -52,14 +52,14 @@ class CTCGreedyDecoderReplacement(FrontReplacementSubgraph): ] ) - def nodes_to_remove(self, graph: nx.MultiDiGraph, match: dict): + def nodes_to_remove(self, graph: Graph, match: dict): return [match['cast'].id, match['sparse_to_dense']] - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): decoder_node = match['decoder'] graph.remove_edge(decoder_node.id, match['sparse_to_dense'].id) graph.remove_edge(decoder_node.id, match['cast'].id) - replace_node(match['sparse_to_dense'], decoder_node) + match['sparse_to_dense'].replace_node(decoder_node) # update the TensorFlow infer function for the CTCGreedyDecoder to make necessary changes with the second input decoder_node['old_infer'] = decoder_node.infer @@ -77,6 +77,6 @@ class CTCGreedyDecoderReplacement(FrontReplacementSubgraph): new_value[:, 0] = 0 new_value = np.transpose(new_value) sequence_length_node.value = new_value - sequence_length_node.shape = sequence_length_node.value.shape + sequence_length_node.shape = int64_array(sequence_length_node.value.shape) node.old_infer(node) diff --git a/model-optimizer/extensions/front/tf/CTCGreedyDecoder_ext.py b/model-optimizer/extensions/front/tf/CTCGreedyDecoder_ext.py index 89986e4..ed5a405 100644 --- a/model-optimizer/extensions/front/tf/CTCGreedyDecoder_ext.py +++ b/model-optimizer/extensions/front/tf/CTCGreedyDecoder_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 
Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/Cast_ext.py b/model-optimizer/extensions/front/tf/Cast_ext.py new file mode 100644 index 0000000..2c29f78 --- /dev/null +++ b/model-optimizer/extensions/front/tf/Cast_ext.py @@ -0,0 +1,30 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +from extensions.ops.Cast import Cast +from mo.front.extractor import FrontExtractorOp +from mo.front.tf.common import tf_data_type_decode + + +class CastFrontExtractor(FrontExtractorOp): + op = 'Cast' + enabled = True + + @staticmethod + def extract(node): + cast_dst_type = tf_data_type_decode[node.pb.attr['DstT'].type][0] + Cast.update_node_stat(node, {'dst_type': cast_dst_type}) + return __class__.enabled diff --git a/model-optimizer/extensions/front/tf/ConvFlatten.py b/model-optimizer/extensions/front/tf/ConvFlatten.py index 27282d3..2fd80f2 100644 --- a/model-optimizer/extensions/front/tf/ConvFlatten.py +++ b/model-optimizer/extensions/front/tf/ConvFlatten.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,29 +14,28 @@ limitations under the License. """ -import networkx as nx import numpy as np from mo.front.subgraph_matcher import SubgraphMatch from mo.front.tf.replacement import FrontReplacementFromConfigFileSubGraph -from mo.graph.graph import insert_node_after +from mo.graph.graph import Graph from mo.ops.permute import Permute class ConvFlattenReplacement(FrontReplacementFromConfigFileSubGraph): replacement_id = 'ConvFlatten' - def output_edges_match(self, graph: nx.DiGraph, match: SubgraphMatch, new_sub_graph: dict): + def output_edges_match(self, graph: Graph, match: SubgraphMatch, new_sub_graph: dict): return {} - def input_edges_match(self, graph: nx.DiGraph, match: SubgraphMatch, new_sub_graph: dict): + def input_edges_match(self, graph: Graph, match: SubgraphMatch, new_sub_graph: dict): return {} - def nodes_to_remove(self, graph: nx.MultiDiGraph, match: SubgraphMatch): + def nodes_to_remove(self, graph: Graph, match: SubgraphMatch): # no need to remove any of matched nodes. We just insert 'Permute' node before the matched sub-graph. 
return [] - def generate_sub_graph(self, graph: nx.MultiDiGraph, match: SubgraphMatch): + def generate_sub_graph(self, graph: Graph, match: SubgraphMatch): permute_op = Permute(graph, {'order': np.array([0, 2, 3, 1])}) permute_node = permute_op.add_node({'name': match.scope + '_permute_'}) @@ -44,5 +43,5 @@ class ConvFlattenReplacement(FrontReplacementFromConfigFileSubGraph): # reshape_in_node is the node after which we should insert Permute reshape_in_node = reshape_node.in_nodes()[0] - insert_node_after(reshape_in_node, permute_node, 0) + reshape_in_node.insert_node_after(permute_node, 0) return {} diff --git a/model-optimizer/extensions/front/tf/CropAndResizeReplacement.py b/model-optimizer/extensions/front/tf/CropAndResizeReplacement.py index d02f109..15c1103 100644 --- a/model-optimizer/extensions/front/tf/CropAndResizeReplacement.py +++ b/model-optimizer/extensions/front/tf/CropAndResizeReplacement.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,12 +14,13 @@ limitations under the License. """ -import networkx as nx +import logging as log + import numpy as np -from mo.front.tf.graph_utils import add_convolution_to_swap_xy_coordinates from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import Node, create_edge +from mo.front.tf.graph_utils import add_convolution_to_swap_xy_coordinates +from mo.graph.graph import Node, Graph from mo.ops.concat import Concat from mo.ops.reshape import Reshape from mo.ops.unsqueeze import Unsqueeze @@ -34,16 +35,19 @@ class CropAndResizeReplacement(FrontReplacementOp): op = "CropAndResize" enabled = True - def nodes_to_remove(self, graph: nx.MultiDiGraph, match: dict): + def nodes_to_remove(self, graph: Graph, match: dict): # do not remove matched node return [] - def replace_op(self, graph: nx.MultiDiGraph, node: Node): + def replace_op(self, graph: Graph, node: Node): + if node.has_and_set('inputs_preprocessed'): + log.debug('Node "{}" has already been preprocessed'.format(node.soft_get('name'))) + return [] # reshape tensor with batch indices to 2d unsqueeze_op = Unsqueeze(graph, {'unsqueeze_dims': np.array([1], dtype=np.int64)}) unsqueeze_node = unsqueeze_op.create_node([node.in_node(2)]) - concat_op = Concat(graph, {'axis': 1, 'name': node.name + '/concat_batch_indices_and_boxes'}) + concat_op = Concat(graph, {'axis': 1, 'name': node.name + '/concat_batch_indices_and_boxes', 'in_ports_count': 2}) concat_node = concat_op.create_node([unsqueeze_node, node.in_node(1)]) # do not remove edge with crop_size because it is needed in the partial infer @@ -55,9 +59,11 @@ class CropAndResizeReplacement(FrontReplacementOp): # reshape locations tensor to 2D so it could be passed to Eltwise which will be converted to ScaleShift reshape_2d_op = Reshape(graph, dict(dim=np.array([-1, 5]))) - reshape_2d_node = reshape_2d_op.create_node([swapped_box_coordinates_node], dict(name='reshape_2d_')) - create_edge(reshape_2d_node, node, 0, 1) + + reshape_2d_node = reshape_2d_op.create_node([swapped_box_coordinates_node], + dict(name=swapped_box_coordinates_node.id + '/reshape_2d_', + nchw_layout=True)) + graph.create_edge(reshape_2d_node, node, 0, 1) # do not replace any output edge return [] - diff --git a/model-optimizer/extensions/front/tf/FlattenToReshape.py b/model-optimizer/extensions/front/tf/FlattenToReshape.py new file mode 100644 index 
0000000..7198f5f
--- /dev/null
+++ b/model-optimizer/extensions/front/tf/FlattenToReshape.py
@@ -0,0 +1,91 @@
+"""
+ Copyright (c) 2018-2019 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import logging as log
+
+import numpy as np
+
+from extensions.front.Pack import Pack
+from extensions.front.tf.nearest_neighbor_upsampling import NearestNeighborUpsampling
+from mo.front.common.partial_infer.utils import int64_array
+from mo.front.common.replacement import FrontReplacementSubgraph
+from mo.graph.graph import Graph
+
+
+def is_value_is_constant(val: np.ndarray, const: [int, float]):
+    if val.ndim > 1:
+        return False
+    if val.ndim == 1 and len(val) > 1:
+        return False
+    return val.item() == const
+
+
+class FlattenToReshapeableReshape(FrontReplacementSubgraph):
+    """
+    The TensorFlow implementation of the Flatten operation is not reshape-able because the batch size is hardcoded
+    during the constant propagation. This transform sets the 'dim' attribute for the Reshape to [0, -1].
+    """
+    enabled = True
+
+    def run_after(self):
+        return [NearestNeighborUpsampling]
+
+    def run_before(self):
+        return [Pack]
+
+    def pattern(self):
+        return dict(
+            nodes=[
+                ('shape', dict(op='Shape')),
+                ('strided_slice', dict(op='StridedSlice')),
+                ('pack', dict(op='Pack')),
+                ('const', dict(op='Const')),
+                ('reshape', dict(op='Reshape')),
+            ],
+            edges=[
+                ('shape', 'strided_slice', {'in': 0}),
+                ('strided_slice', 'pack', {'in': 0}),
+                ('const', 'pack', {'in': 1}),
+                ('pack', 'reshape', {'in': 1}),
+            ])
+
+    @staticmethod
+    def replace_sub_graph(graph: Graph, match: dict):
+        strided_slice_node = match['strided_slice']
+        const_node = match['const']
+        reshape_node = match['reshape']
+        pack_node = match['pack']
+
+        if not const_node.has_valid('value') or not is_value_is_constant(const_node.value, -1):
+            log.debug('The pattern does not correspond to flatten. The second reshape dimension is not -1. It is {}'.
+                      format(const_node.soft_get('value')))
+            return
+        if len(pack_node.in_nodes()) != 2:
+            log.debug('The pattern does not correspond to flatten. The "Pack" operation produces a tensor with 3 '
+                      'items but should produce just 2.')
+            return
+
+        expected_values = [0, 1, 1]  # expected begin/end/stride values of the StridedSlice that extracts the batch size
+        for ind in range(3):
+            if not strided_slice_node.in_node(ind + 1).has_valid('value') or \
+                    not is_value_is_constant(strided_slice_node.in_node(ind + 1).value, expected_values[ind]):
+                log.debug('The pattern does not correspond to flatten because of the input with index {}. 
The value is ' + '"{}".'.format(ind, strided_slice_node.soft_get('value'))) + return + + graph.remove_edge(pack_node.id, reshape_node.id) + reshape_node['dim'] = int64_array([0, -1]) + log.debug('The node "{}" is actually a Flatten node'.format(reshape_node.soft_get('name'))) diff --git a/model-optimizer/extensions/front/tf/ObjectDetectionAPI.py b/model-optimizer/extensions/front/tf/ObjectDetectionAPI.py index c62f9f6..c729051 100644 --- a/model-optimizer/extensions/front/tf/ObjectDetectionAPI.py +++ b/model-optimizer/extensions/front/tf/ObjectDetectionAPI.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,39 +17,43 @@ import logging as log from math import sqrt -import networkx as nx import numpy as np +from extensions.front.Pack import Pack +from extensions.front.div import Div from extensions.front.standalone_const_eraser import StandaloneConstEraser from extensions.front.sub import Sub from extensions.front.tf.CropAndResizeReplacement import CropAndResizeReplacement -from extensions.front.Pack import Pack from extensions.front.tf.Unpack import Unpack from extensions.ops.DetectionOutput import DetectionOutput from extensions.ops.priorbox_clustered import PriorBoxClusteredOp from extensions.ops.proposal import ProposalOp +from extensions.ops.psroipooling import PSROIPoolingOp from mo.front.common.layout import get_batch_dim, get_height_dim, get_width_dim +from mo.front.common.partial_infer.utils import int64_array from mo.front.common.weights import swap_weights_xy -from mo.front.extractor import output_user_data_repack, add_output_ops +from mo.front.extractor import output_user_data_repack, add_output_ops, update_attrs from mo.front.subgraph_matcher import SubgraphMatch from mo.front.tf.graph_utils import add_activation_function_after_node, add_convolution_to_swap_xy_coordinates, \ - squeeze_reshape_and_concat + squeeze_reshape_and_concat, add_fake_background_loc from mo.front.tf.replacement import FrontReplacementFromConfigFileSubGraph, FrontReplacementFromConfigFileGeneral -from mo.graph.graph import create_edge, insert_node_after, Node, replace_node +from mo.graph.graph import Graph, Node from mo.ops.activation import Activation from mo.ops.concat import Concat from mo.ops.const import Const from mo.ops.crop import Crop -from mo.ops.div import Div from mo.ops.eltwise import Eltwise +from mo.ops.input import Input from mo.ops.op import PermuteAttrs from mo.ops.output import Output from mo.ops.permute import Permute +from mo.ops.reduce import Reduce from mo.ops.reshape import Reshape from mo.ops.roipooling import ROIPooling +from mo.ops.shape import Shape from mo.ops.softmax import Softmax from mo.utils.error import Error -from mo.utils.graph import backward_bfs_for_operation +from mo.utils.graph import backward_bfs_for_operation, bfs_search from mo.utils.pipeline_config import PipelineConfig missing_param_error = 'To convert the model specify path to the pipeline configuration file which was used to ' \ @@ -82,7 +86,7 @@ def _value_or_raise(match: SubgraphMatch, pipeline_config: PipelineConfig, key: return value -def _find_ssd_head_node(graph: nx.MultiDiGraph, ssd_head_index: int, head_type: str): +def _find_ssd_head_node(graph: Graph, ssd_head_index: int, head_type: str): """ Finds the SSD head node with index 'ssd_head_index' in the topology. 
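To see why the subgraph matched in FlattenToReshape.py is a problem, trace what it computes for a concrete input: Shape yields the full shape, the StridedSlice with begin/end/stride of 0/1/1 cuts out the batch dimension, and Pack appends the constant -1, so the Reshape target bakes in the batch size seen at conversion time. The replacement dim [0, -1] avoids that, since in the IE Reshape layer 0 means "copy this dimension from the input":

    import numpy as np

    x = np.zeros((8, 7, 7, 64))
    shape = np.array(x.shape)                # Shape
    batch = shape[0:1]                       # StridedSlice(begin=0, end=1, stride=1)
    target = np.concatenate([batch, [-1]])   # Pack -> [8, -1], batch hardcoded
    print(x.reshape(target).shape)           # (8, 3136)
    # After the pass, the Reshape carries the static dim [0, -1] instead,
    # so any batch size reshapes correctly.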
The parameter 'head_type' specifies what type of the head is requested: with box predictions or class predictions. @@ -135,7 +139,7 @@ def _skip_node_of_type(node: Node, node_ops_to_skip: list): return node -def _relax_reshape_nodes(graph: nx.MultiDiGraph, pipeline_config: PipelineConfig): +def _relax_reshape_nodes(graph: Graph, pipeline_config: PipelineConfig): """ Finds the 'Reshape' operations following the SSD head nodes which have hard-coded output dimensions and replaces them with new ones with one of the dimensions sizes equal to -1. This function is used to make TF OD API SSD models @@ -155,23 +159,23 @@ def _relax_reshape_nodes(graph: nx.MultiDiGraph, pipeline_config: PipelineConfig assert (input_node is not None) old_reshape_node = _skip_node_of_type(input_node.out_node(), ['Identity']) assert (old_reshape_node.op == 'Reshape') - reshape_size_node = Const(graph, {'value': np.array([0, -1, 1, 4])}).create_node([]) + reshape_size_node = Const(graph, {'value': int64_array([0, -1, 1, 4])}).create_node([]) new_reshape_op = Reshape(graph, {'name': input_node.id + '/Reshape', 'correct_data_layout': True}) new_reshape_node = new_reshape_op.create_node([input_node, reshape_size_node]) - replace_node(old_reshape_node, new_reshape_node) + old_reshape_node.replace_node(new_reshape_node) # fix hard-coded value for the number of items in tensor produced by the convolution to make topology reshapable input_node = _find_ssd_head_node(graph, ssd_head_ind, 'class') assert (input_node is not None) old_reshape_node = _skip_node_of_type(input_node.out_node(), ['Identity']) assert (old_reshape_node.op == 'Reshape') - reshape_size_node_2 = Const(graph, {'value': np.array([0, -1, num_classes + 1])}).create_node([]) + reshape_size_node_2 = Const(graph, {'value': int64_array([0, -1, num_classes + 1])}).create_node([]) new_reshape_op_2 = Reshape(graph, {'name': input_node.id + '/Reshape', 'correct_data_layout': True}) new_reshape_node_2 = new_reshape_op_2.create_node([input_node, reshape_size_node_2]) - replace_node(old_reshape_node, new_reshape_node_2) + old_reshape_node.replace_node(new_reshape_node_2) -def _create_prior_boxes_node(graph: nx.MultiDiGraph, pipeline_config: PipelineConfig): +def _create_prior_boxes_node(graph: Graph, pipeline_config: PipelineConfig): """ The function creates one or several PriorBoxClustered nodes based on information from the pipeline configuration files. The PriorBoxClustered nodes get input data from SSD 'heads' and from the placeholder node (just to get @@ -227,11 +231,11 @@ def _create_prior_boxes_node(graph: nx.MultiDiGraph, pipeline_config: PipelineCo if len(prior_box_nodes) == 1: return prior_box_nodes[0] else: - concat_prior_boxes_op = Concat(graph, {'axis': -1}) + concat_prior_boxes_op = Concat(graph, {'axis': -1, 'in_ports_count': len(prior_box_nodes)}) return concat_prior_boxes_op.create_node(prior_box_nodes, {'name': 'ConcatPriorBoxesClustered'}) -def _create_multiscale_prior_boxes_node(graph: nx.MultiDiGraph, pipeline_config: PipelineConfig): +def _create_multiscale_prior_boxes_node(graph: Graph, pipeline_config: PipelineConfig): """ The function creates one or several PriorBoxClustered nodes based on information from the pipeline configuration files. 
The PriorBoxClustered nodes get input data from SSD 'heads' and from the placeholder node (just to get @@ -272,7 +276,7 @@ def _create_multiscale_prior_boxes_node(graph: nx.MultiDiGraph, pipeline_config: if len(prior_box_nodes) == 1: return prior_box_nodes[0] else: - concat_prior_boxes_op = Concat(graph, {'axis': -1}) + concat_prior_boxes_op = Concat(graph, {'axis': -1, 'in_ports_count': len(prior_box_nodes)}) return concat_prior_boxes_op.create_node(prior_box_nodes, {'name': 'ConcatPriorBoxesClustered'}) @@ -293,7 +297,7 @@ def calculate_shape_keeping_aspect_ratio(height: int, width: int, min_size: int, return int(round(height * ratio)), int(round(width * ratio)) -def calculate_placeholder_spatial_shape(graph: nx.MultiDiGraph, match: SubgraphMatch, pipeline_config: PipelineConfig): +def calculate_placeholder_spatial_shape(graph: Graph, match: SubgraphMatch, pipeline_config: PipelineConfig): """ The function calculates the preprocessed shape of the input image for a TensorFlow Object Detection API model. It uses various sources to calculate it: @@ -388,7 +392,7 @@ class ObjectDetectionAPIPreprocessorReplacement(FrontReplacementFromConfigFileSu def run_before(self): return [Pack, Sub] - def nodes_to_remove(self, graph: nx.MultiDiGraph, match: SubgraphMatch): + def nodes_to_remove(self, graph: Graph, match: SubgraphMatch): new_nodes_to_remove = match.matched_nodes_names() # do not remove nodes that perform input image scaling and mean value subtraction for node_to_keep in ('Preprocessor/sub', 'Preprocessor/sub/y', 'Preprocessor/mul', 'Preprocessor/mul/x'): @@ -396,7 +400,7 @@ class ObjectDetectionAPIPreprocessorReplacement(FrontReplacementFromConfigFileSu new_nodes_to_remove.remove(node_to_keep) return new_nodes_to_remove - def generate_sub_graph(self, graph: nx.MultiDiGraph, match: SubgraphMatch): + def generate_sub_graph(self, graph: Graph, match: SubgraphMatch): argv = graph.graph['cmd_params'] layout = graph.graph['layout'] if argv.tensorflow_object_detection_api_pipeline_config is None: @@ -423,8 +427,6 @@ class ObjectDetectionAPIPreprocessorReplacement(FrontReplacementFromConfigFileSu batch_dim = get_batch_dim(layout, 4) if argv.batch is None and placeholder_node.shape[batch_dim] == -1: placeholder_node.shape[batch_dim] = 1 - if placeholder_node.shape[batch_dim] > 1: - print("[ WARNING ] The batch size more than 1 is supported for SSD topologies only.") height, width = calculate_placeholder_spatial_shape(graph, match, pipeline_config) placeholder_node.shape[get_height_dim(layout, 4)] = height placeholder_node.shape[get_width_dim(layout, 4)] = width @@ -440,9 +442,9 @@ class ObjectDetectionAPIPreprocessorReplacement(FrontReplacementFromConfigFileSu # connect to_float_node directly with node performing scale on mean value subtraction if mul_node is None: - create_edge(to_float_node, sub_node, 0, 0) + graph.create_edge(to_float_node, sub_node, 0, 0) else: - create_edge(to_float_node, mul_node, 0, 1) + graph.create_edge(to_float_node, mul_node, 0, 1) print('The Preprocessor block has been removed. 
Only nodes performing mean value subtraction and scaling (if'
              ' applicable) are kept.')
@@ -465,12 +467,22 @@ class ObjectDetectionAPIDetectionOutputReplacement(FrontReplacementFromConfigFil
     def run_after(self):
         return [ObjectDetectionAPIProposalReplacement, CropAndResizeReplacement]

-    def nodes_to_remove(self, graph: nx.MultiDiGraph, match: SubgraphMatch):
+    def nodes_to_remove(self, graph: Graph, match: SubgraphMatch):
         new_nodes_to_remove = match.matched_nodes_names().copy()
-        new_nodes_to_remove.extend(['detection_boxes', 'detection_scores', 'num_detections'])
+        outputs = ['detection_boxes', 'detection_scores', 'num_detections']
+        for output in outputs:
+            children = Node(graph, output).out_nodes()
+            if len(children) != 1:
+                log.warning('Output {} has {} children. It should have exactly one child with op==`OpOutput`'
+                            ''.format(output, len(children)))
+            elif children[list(children.keys())[0]].op == 'OpOutput':
+                new_nodes_to_remove.append(children[list(children.keys())[0]].id)
+            else:
+                continue
+        new_nodes_to_remove.extend(outputs)
         return new_nodes_to_remove

-    def output_edges_match(self, graph: nx.DiGraph, match: SubgraphMatch, new_sub_graph: dict):
+    def output_edges_match(self, graph: Graph, match: SubgraphMatch, new_sub_graph: dict):
         # the DetectionOutput in IE produces a single tensor, but in TF it produces four tensors, so we need to create
         # only one output edge match
         return {match.output_node(0)[0].id: new_sub_graph['detection_output_node'].id}
@@ -481,62 +493,60 @@ class ObjectDetectionAPIDetectionOutputReplacement(FrontReplacementFromConfigFil
             current_node = current_node.in_node()
         return current_node

-    def generate_sub_graph(self, graph: nx.MultiDiGraph, match: SubgraphMatch):
+    def generate_sub_graph(self, graph: Graph, match: SubgraphMatch):
         argv = graph.graph['cmd_params']
         if argv.tensorflow_object_detection_api_pipeline_config is None:
             raise Error(missing_param_error)
         pipeline_config = PipelineConfig(argv.tensorflow_object_detection_api_pipeline_config)
         num_classes = _value_or_raise(match, pipeline_config, 'num_classes')
-        first_stage_max_proposals = _value_or_raise(match, pipeline_config, 'first_stage_max_proposals')
+        max_proposals = _value_or_raise(match, pipeline_config, 'first_stage_max_proposals')
         activation_function = _value_or_raise(match, pipeline_config, 'postprocessing_score_converter')

         activation_conf_node = add_activation_function_after_node(graph, match.single_input_node(1)[0].in_node(0),
                                                                   activation_function)
-        # IE DetectionOutput layer consumes flattened tensors
-        # reshape operation to flatten confidence tensor
-        reshape_conf_op = Reshape(graph, dict(dim=np.array([1, -1])))
+        # IE DetectionOutput layer consumes flattened tensors, so we need to add a Reshape layer.
+        # The batch value of the input tensor is not equal to the batch of the topology, so it is not possible to use
+        # the "0" value in the Reshape layer attribute to refer to the batch size, but we know how to
+        # calculate the second dimension, so the batch value will be deduced from it with the help of "-1".
+        reshape_conf_op = Reshape(graph, dict(dim=int64_array([-1, (num_classes + 1) * max_proposals])))
         reshape_conf_node = reshape_conf_op.create_node([activation_conf_node], dict(name='do_reshape_conf'))

-        # TF produces locations tensor without boxes for background.
-        # Inference Engine DetectionOutput layer requires background boxes so we generate them with some values
-        # and concatenate with locations tensor
-        fake_background_locs_blob = np.tile([[[1, 1, 2, 2]]], [first_stage_max_proposals, 1, 1])
-        fake_background_locs_const_op = Const(graph, dict(value=fake_background_locs_blob))
-        fake_background_locs_const_node = fake_background_locs_const_op.create_node([])
-
         # Workaround for the PermuteForReshape pass.
         # We look for the first non-Reshape-typed node before match.single_input_node(0)[0].in_node(0)
         # and add the reshape_loc node after it.
         current_node = self.skip_nodes_by_condition(match.single_input_node(0)[0].in_node(0),
                                                     lambda x: x['kind'] == 'op' and x.soft_get('type') == 'Reshape')
-        reshape_loc_op = Reshape(graph, dict(dim=np.array([first_stage_max_proposals, num_classes, 4])))
-        reshape_loc_node = reshape_loc_op.create_node([current_node], dict(name='reshape_loc'))
-
-        concat_loc_op = Concat(graph, dict(axis=1))
-        concat_loc_node = concat_loc_op.create_node([fake_background_locs_const_node, reshape_loc_node],
-                                                    dict(name='concat_fake_loc'))
-        PermuteAttrs.set_permutation(reshape_loc_node, concat_loc_node, None)
-        PermuteAttrs.set_permutation(fake_background_locs_const_node, concat_loc_node, None)
+        reshape_loc_op = Reshape(graph, dict(dim=int64_array([-1, num_classes, 1, 4])))
+        reshape_loc_node = reshape_loc_op.create_node([current_node], dict(name='reshape_loc', nchw_layout=True))
+        update_attrs(reshape_loc_node, 'shape_attrs', 'dim')

         # constant node with variances
         variances_const_op = Const(graph, dict(value=_variance_from_pipeline_config(pipeline_config)))
         variances_const_node = variances_const_op.create_node([])

+        # TF produces locations tensor without boxes for background.
+        # Inference Engine DetectionOutput layer requires background boxes, so we generate them
+        loc_node = add_fake_background_loc(graph, reshape_loc_node)
+        PermuteAttrs.set_permutation(reshape_loc_node, loc_node, None)
+
         # reshape the locations tensor to 2D so it can be passed to Eltwise which will be converted to ScaleShift
-        reshape_loc_2d_op = Reshape(graph, dict(dim=np.array([-1, 4])))
-        reshape_loc_2d_node = reshape_loc_2d_op.create_node([concat_loc_node], dict(name='reshape_locs_2'))
-        PermuteAttrs.set_permutation(concat_loc_node, reshape_loc_2d_node, None)
+        reshape_loc_2d_op = Reshape(graph, dict(dim=int64_array([-1, 4])))
+        reshape_loc_2d_node = reshape_loc_2d_op.create_node([loc_node], dict(name='reshape_locs_2d', nchw_layout=True))
+        PermuteAttrs.set_permutation(loc_node, reshape_loc_2d_node, None)

         # element-wise multiply locations with variances
         eltwise_locs_op = Eltwise(graph, dict(operation='mul'))
         eltwise_locs_node = eltwise_locs_op.create_node([reshape_loc_2d_node, variances_const_node],
                                                         dict(name='scale_locs'))

-        # IE DetectionOutput layer consumes flattened tensors
-        reshape_loc_do_op = Reshape(graph, dict(dim=np.array([1, -1])))
+        # IE DetectionOutput layer consumes flattened tensors, so we need to add a Reshape layer.
+        # The batch value of the input tensor is not equal to the batch of the topology, so it is not possible to use
+        # the "0" value in the Reshape layer attribute to refer to the batch size, but we know how to
+        # calculate the second dimension, so the batch value will be deduced from it with the help of "-1".
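A quick editor's illustration of the "-1" trick described in the comment block above: the first dimension of the incoming tensor is batch * max_proposals rather than the topology batch, so "0" (keep this dimension) cannot be used, but fixing the second dimension lets Reshape deduce the batch. A minimal NumPy sketch; the batch, proposal and class counts below are made-up numbers, not values from this patch:

    import numpy as np

    batch, max_proposals, num_classes = 2, 100, 90

    # confidences as produced upstream: one row per proposal
    conf = np.random.rand(batch * max_proposals, num_classes + 1).astype(np.float32)

    # fix the second dimension and let "-1" recover the batch
    flat = conf.reshape(-1, (num_classes + 1) * max_proposals)
    assert flat.shape == (batch, (num_classes + 1) * max_proposals)

The same arithmetic justifies the [-1, (num_classes + 1) * max_proposals * 4] shape used for the locations tensor right below.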
+ reshape_loc_do_op = Reshape(graph, dict(dim=int64_array([-1, (num_classes + 1) * max_proposals * 4]))) custom_attributes = match.custom_replacement_desc.custom_attributes coordinates_swap_method = 'add_convolution' @@ -564,18 +574,21 @@ class ObjectDetectionAPIDetectionOutputReplacement(FrontReplacementFromConfigFil # find Proposal output which has the data layout as in TF: YXYX coordinates without batch indices. proposal_nodes_ids = [node_id for node_id, attrs in graph.nodes(data=True) - if 'name' in attrs and attrs['name'] == 'proposals'] + if 'name' in attrs and attrs['name'] == 'crop_proposals'] if len(proposal_nodes_ids) != 1: - raise Error("Found the following nodes '{}' with name 'proposals' but there should be exactly 1. " + raise Error("Found the following nodes '{}' with name 'crop_proposals' but there should be exactly 1. " "Looks like ObjectDetectionAPIProposalReplacement replacement didn't work.". format(proposal_nodes_ids)) proposal_node = Node(graph, proposal_nodes_ids[0]) - swapped_proposals_node = add_convolution_to_swap_xy_coordinates(graph, proposal_node, 5) + # check whether it is necessary to permute proposals coordinates before passing them to the DetectionOutput + # currently this parameter is set for the RFCN topologies + if 'swap_proposals' in custom_attributes and custom_attributes['swap_proposals']: + proposal_node = add_convolution_to_swap_xy_coordinates(graph, proposal_node, 4) # reshape priors boxes as Detection Output expects - reshape_priors_op = Reshape(graph, dict(dim=np.array([1, 1, -1]))) - reshape_priors_node = reshape_priors_op.create_node([swapped_proposals_node], + reshape_priors_op = Reshape(graph, dict(dim=int64_array([-1, 1, max_proposals * 4]))) + reshape_priors_node = reshape_priors_op.create_node([proposal_node], dict(name='DetectionOutput_reshape_priors_')) detection_output_op = DetectionOutput(graph, {}) @@ -583,14 +596,16 @@ class ObjectDetectionAPIDetectionOutputReplacement(FrontReplacementFromConfigFil # update infer function to re-pack weights detection_output_op.attrs['old_infer'] = detection_output_op.attrs['infer'] detection_output_op.attrs['infer'] = __class__.do_infer + for key in ('clip_before_nms', 'clip_after_nms'): + if key in match.custom_replacement_desc.custom_attributes: + detection_output_op.attrs[key] = int(match.custom_replacement_desc.custom_attributes[key]) + detection_output_node = detection_output_op.create_node( [reshape_loc_do_node, reshape_conf_node, reshape_priors_node], - dict(name=detection_output_op.attrs['type'], share_location=0, normalized=0, variance_encoded_in_target=1, - clip=1, code_type='caffe.PriorBoxParameter.CENTER_SIZE', pad_mode='caffe.ResizeParameter.CONSTANT', + dict(name=detection_output_op.attrs['type'], share_location=0, variance_encoded_in_target=1, + code_type='caffe.PriorBoxParameter.CENTER_SIZE', pad_mode='caffe.ResizeParameter.CONSTANT', resize_mode='caffe.ResizeParameter.WARP', num_classes=num_classes, - input_height=graph.graph['preprocessed_image_height'], - input_width=graph.graph['preprocessed_image_width'], confidence_threshold=_value_or_raise(match, pipeline_config, 'postprocessing_score_threshold'), top_k=_value_or_raise(match, pipeline_config, 'postprocessing_max_detections_per_class'), keep_top_k=_value_or_raise(match, pipeline_config, 'postprocessing_max_total_detections'), @@ -618,10 +633,13 @@ class ObjectDetectionAPIDetectionOutputReplacement(FrontReplacementFromConfigFil class ObjectDetectionAPIMaskRCNNROIPoolingSecondReplacement(FrontReplacementFromConfigFileSubGraph): 
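The hunk above routes the proposals through add_convolution_to_swap_xy_coordinates before the DetectionOutput. The idea behind that helper, as far as this patch shows it, is that a fixed-weight convolution acting on the coordinate axis is a per-row matrix multiply, so a permutation matrix turns TF's YXYX order into the XYXY order IE expects. An editor's sketch of that permutation in NumPy; the 5-element row layout [id, y1, x1, y2, x2] is an assumption for illustration:

    import numpy as np

    # permutation matrix: keeps column 0, swaps columns (1,2) and (3,4)
    P = np.array([[1, 0, 0, 0, 0],
                  [0, 0, 1, 0, 0],
                  [0, 1, 0, 0, 0],
                  [0, 0, 0, 0, 1],
                  [0, 0, 0, 1, 0]], dtype=np.float32)

    rows = np.array([[7, 0.1, 0.2, 0.3, 0.4]], dtype=np.float32)  # [id, y1, x1, y2, x2]
    swapped = rows @ P                                            # [id, x1, y1, x2, y2]
    assert np.allclose(swapped, [[7, 0.2, 0.1, 0.4, 0.3]])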
    replacement_id = 'ObjectDetectionAPIMaskRCNNROIPoolingSecondReplacement'

-    def output_edges_match(self, graph: nx.DiGraph, match: SubgraphMatch, new_sub_graph: dict):
+    def run_after(self):
+        return [ObjectDetectionAPIProposalReplacement]
+
+    def output_edges_match(self, graph: Graph, match: SubgraphMatch, new_sub_graph: dict):
         return {match.output_node(0)[0].id: new_sub_graph['roi_pooling_node'].id}

-    def generate_sub_graph(self, graph: nx.MultiDiGraph, match: SubgraphMatch):
+    def generate_sub_graph(self, graph: Graph, match: SubgraphMatch):
         argv = graph.graph['cmd_params']
         if argv.tensorflow_object_detection_api_pipeline_config is None:
             raise Error(missing_param_error)
@@ -636,7 +654,7 @@ class ObjectDetectionAPIMaskRCNNROIPoolingSecondReplacement(FrontReplacementFrom
         detection_output_node = Node(graph, detection_output_nodes_ids[0])

         # add reshape of Detection Output so it can be an output of the topology
-        reshape_detection_output_2d_op = Reshape(graph, dict(dim=np.array([-1, 7])))
+        reshape_detection_output_2d_op = Reshape(graph, dict(dim=int64_array([-1, 7])))
         reshape_detection_output_2d_node = reshape_detection_output_2d_op.create_node(
             [detection_output_node], dict(name='reshape_do_2d'))
@@ -648,15 +666,24 @@ class ObjectDetectionAPIMaskRCNNROIPoolingSecondReplacement(FrontReplacementFrom
         output_node.in_edge()['data_attrs'].append('output_sort_order')
         output_node.in_edge()['output_sort_order'] = [('detection_boxes', 0)]

-        # creates the Crop operation that gets input from the DetectionOutput layer, cuts of slices of data with batch
-        # indices and class labels producing a tensor with classes probabilities and bounding boxes only as it is
-        # expected by the ROIPooling layer
-        crop_op = Crop(graph, dict(axis=np.array([3]), offset=np.array([2]), dim=np.array([5]), nchw_layout=True))
-        crop_node = crop_op.create_node([detection_output_node], dict(name='crop_do'))
+        # create two Crop operations which get input from the DetectionOutput layer, cut off the slices of data with
+        # class ids and probabilities, and produce a tensor with batch ids and bounding boxes only (as expected by the
+        # ROIPooling layer)
+        crop_batch_op = Crop(graph, dict(axis=int64_array([3]), offset=int64_array([0]), dim=int64_array([1]),
+                                         nchw_layout=True))
+        crop_batch_node = crop_batch_op.create_node([detection_output_node], dict(name='crop_do_batch_ids'))
+
+        crop_coordinates_op = Crop(graph, dict(axis=int64_array([3]), offset=int64_array([3]), dim=int64_array([4]),
+                                               nchw_layout=True))
+        crop_coordinates_node = crop_coordinates_op.create_node([detection_output_node], dict(name='crop_do_coords'))
+
+        concat_op = Concat(graph, dict(axis=3))
+        concat_node = concat_op.create_node([crop_batch_node, crop_coordinates_node], dict(name='batch_and_coords',
+                                                                                           nchw_layout=True))

         # reshape bounding boxes as required by ROIPooling
-        reshape_do_op = Reshape(graph, dict(dim=np.array([-1, 5])))
-        reshape_do_node = reshape_do_op.create_node([crop_node], dict(name='reshape_do'))
+        reshape_do_op = Reshape(graph, dict(dim=int64_array([-1, 5])))
+        reshape_do_node = reshape_do_op.create_node([concat_node], dict(name='reshape_do'))

         roi_pooling_op = ROIPooling(graph, dict(method="bilinear", spatial_scale=1,
                                                 pooled_h=roi_pool_size, pooled_w=roi_pool_size))
@@ -675,7 +702,7 @@ class ObjectDetectionAPIMaskRCNNSigmoidReplacement(FrontReplacementFromConfigFil
     def run_after(self):
         return [ObjectDetectionAPIMaskRCNNROIPoolingSecondReplacement]

-    def transform_graph(self, graph: nx.MultiDiGraph, replacement_descriptions):
+    def
transform_graph(self, graph: Graph, replacement_descriptions): output_node = None op_outputs = [n for n, d in graph.nodes(data=True) if 'op' in d and d['op'] == 'OpOutput'] for op_output in op_outputs: @@ -711,24 +738,22 @@ class ObjectDetectionAPIProposalReplacement(FrontReplacementFromConfigFileSubGra def run_before(self): return [Sub, CropAndResizeReplacement] - def output_edges_match(self, graph: nx.DiGraph, match: SubgraphMatch, new_sub_graph: dict): + def output_edges_match(self, graph: Graph, match: SubgraphMatch, new_sub_graph: dict): return {match.output_node(0)[0].id: new_sub_graph['proposal_node'].id} - def nodes_to_remove(self, graph: nx.MultiDiGraph, match: SubgraphMatch): + def nodes_to_remove(self, graph: Graph, match: SubgraphMatch): new_list = match.matched_nodes_names().copy() # do not remove nodes that produce box predictions and class predictions new_list.remove(match.single_input_node(0)[0].id) new_list.remove(match.single_input_node(1)[0].id) return new_list - def generate_sub_graph(self, graph: nx.MultiDiGraph, match: SubgraphMatch): + def generate_sub_graph(self, graph: Graph, match: SubgraphMatch): argv = graph.graph['cmd_params'] if argv.tensorflow_object_detection_api_pipeline_config is None: raise Error(missing_param_error) pipeline_config = PipelineConfig(argv.tensorflow_object_detection_api_pipeline_config) - input_height = graph.graph['preprocessed_image_height'] - input_width = graph.graph['preprocessed_image_width'] max_proposals = _value_or_raise(match, pipeline_config, 'first_stage_max_proposals') proposal_ratios = _value_or_raise(match, pipeline_config, 'anchor_generator_aspect_ratios') proposal_scales = _value_or_raise(match, pipeline_config, 'anchor_generator_scales') @@ -737,39 +762,24 @@ class ObjectDetectionAPIProposalReplacement(FrontReplacementFromConfigFileSubGra # Convolution/matmul node that produces classes predictions # Permute result of the tensor with classes permissions so it will be in a correct layout for Softmax predictions_node = backward_bfs_for_operation(match.single_input_node(1)[0], ['Add'])[0] - permute_predictions_op = Permute(graph, dict(order=np.array([0, 2, 3, 1]))) - permute_predictions_node = permute_predictions_op.create_node([], dict(name=predictions_node.name + '/Permute')) - insert_node_after(predictions_node, permute_predictions_node, 0) - - # creates constant input with the image height, width and scale H and scale W (if present) required for Proposal - const_op = Const(graph, dict(value=np.array([[input_height, input_width, 1]], dtype=np.float32))) - const_node = const_op.create_node([], dict(name='proposal_const_image_size')) - - reshape_classes_op = Reshape(graph, dict(dim=np.array([0, -1, 2]))) - reshape_classes_node = reshape_classes_op.create_node([permute_predictions_node], - dict(name='reshape_FirstStageBoxPredictor_class', - nchw_layout=True)) - softmax_conf_op = Softmax(graph, dict(axis=2)) - softmax_conf_node = softmax_conf_op.create_node([reshape_classes_node], - dict(name='FirstStageBoxPredictor_softMax_class')) - PermuteAttrs.set_permutation(reshape_classes_node, softmax_conf_node, None) + reshape_classes_op = Reshape(graph, dict(dim=int64_array([0, anchors_count, 2, -1]))) + reshape_classes_node = reshape_classes_op.create_node([], dict(name='predictions/Reshape', nchw_layout=True)) + predictions_node.insert_node_after(reshape_classes_node, 0) - reshape_softmax_op = Reshape(graph, dict(dim=np.array([1, anchors_count, 2, -1]))) - reshape_softmax_node = reshape_softmax_op.create_node([softmax_conf_node], 
dict(name='reshape_softmax_class')) - PermuteAttrs.set_permutation(softmax_conf_node, reshape_softmax_node, None) + softmax_conf_op = Softmax(graph, dict(axis=2, nchw_layout=True, name=reshape_classes_node.id + '/Softmax')) + softmax_conf_node = softmax_conf_op.create_node([reshape_classes_node]) + permute_reshape_softmax_op = Permute(graph, dict(order=int64_array([0, 2, 1, 3]), nchw_layout=True)) + permute_reshape_softmax_node = permute_reshape_softmax_op.create_node([softmax_conf_node], dict( + name=softmax_conf_node.name + '/Permute')) - permute_reshape_softmax_op = Permute(graph, dict(order=np.array([0, 1, 3, 2]))) - permute_reshape_softmax_node = permute_reshape_softmax_op.create_node([reshape_softmax_node], dict( - name=reshape_softmax_node.name + '/Permute')) + initial_shape_op = Shape(graph, dict(name=predictions_node.id + '/Shape')) + initial_shape_node = initial_shape_op.create_node([predictions_node]) # implement custom reshape infer function because we need to know the input convolution node output dimension # sizes but we can know it only after partial infer - reshape_permute_op = Reshape(graph, - dict(dim=np.ones([4]), anchors_count=anchors_count, conv_node=predictions_node)) - reshape_permute_op.attrs['old_infer'] = reshape_permute_op.attrs['infer'] - reshape_permute_op.attrs['infer'] = __class__.classes_probabilities_reshape_shape_infer - reshape_permute_node = reshape_permute_op.create_node([permute_reshape_softmax_node], + reshape_permute_op = Reshape(graph, dict()) + reshape_permute_node = reshape_permute_op.create_node([permute_reshape_softmax_node, initial_shape_node], dict(name='Reshape_Permute_Class')) variance_height = pipeline_config.get_param('frcnn_variance_height') @@ -805,46 +815,61 @@ class ObjectDetectionAPIProposalReplacement(FrontReplacementFromConfigFileSubGra feat_stride=anchor_generator_height_stride, ratio=proposal_ratios, scale=proposal_scales, + normalize=1, base_size=anchor_generator_height, nms_thresh=_value_or_raise(match, pipeline_config, 'first_stage_nms_iou_threshold'))) + for key in ('clip_before_nms', 'clip_after_nms'): + if key in match.custom_replacement_desc.custom_attributes: + proposal_op.attrs[key] = int(match.custom_replacement_desc.custom_attributes[key]) anchors_node = backward_bfs_for_operation(match.single_input_node(0)[0], ['Add'])[0] - proposal_node = proposal_op.create_node([reshape_permute_node, anchors_node, const_node], - dict(name='proposals')) - # the TF implementation of ROIPooling with bi-linear filtration need proposals scaled by image size - proposal_scale_const = np.array([1.0, 1 / input_height, 1 / input_width, 1 / input_height, 1 / input_width], - dtype=np.float32) - proposal_scale_const_op = Const(graph, dict(value=proposal_scale_const)) - proposal_scale_const_node = proposal_scale_const_op.create_node([], dict(name='Proposal_scale_const')) + # creates input to store input image height, width and scales (usually 1.0s) + # the batch size for this input is fixed because it is allowed to pass images of the same size only as input + input_op_with_image_size = Input(graph, dict(shape=int64_array([1, 3]), fixed_batch=True)) + input_with_image_size_node = input_op_with_image_size.create_node([], dict(name='image_info')) - scale_proposals_op = Eltwise(graph, dict(operation='mul')) - scale_proposals_node = scale_proposals_op.create_node([proposal_node, proposal_scale_const_node], - dict(name='scaled_proposals')) + proposal_node = proposal_op.create_node([reshape_permute_node, anchors_node, input_with_image_size_node], + 
dict(name='proposals')) - proposal_reshape_4d_op = Reshape(graph, dict(dim=np.array([1, 1, max_proposals, 5]), nchw_layout=True)) - proposal_reshape_4d_node = proposal_reshape_4d_op.create_node([scale_proposals_node], - dict(name="reshape_proposals_4d")) + if 'do_not_swap_proposals' in match.custom_replacement_desc.custom_attributes and \ + match.custom_replacement_desc.custom_attributes['do_not_swap_proposals']: + swapped_proposals_node = proposal_node + else: + swapped_proposals_node = add_convolution_to_swap_xy_coordinates(graph, proposal_node, 5) - # creates the Crop operation that gets input from the Proposal layer and gets tensor with bounding boxes only - crop_op = Crop(graph, dict(axis=np.array([3]), offset=np.array([1]), dim=np.array([4]), nchw_layout=True)) - crop_node = crop_op.create_node([proposal_reshape_4d_node], dict(name='crop_proposals')) + proposal_reshape_2d_op = Reshape(graph, dict(dim=int64_array([-1, 5]), nchw_layout=True)) + proposal_reshape_2d_node = proposal_reshape_2d_op.create_node([swapped_proposals_node], + dict(name="reshape_swap_proposals_2d")) - proposal_reshape_3d_op = Reshape(graph, dict(dim=np.array([0, -1, 4]), nchw_layout=True)) - proposal_reshape_3d_node = proposal_reshape_3d_op.create_node([crop_node], dict(name="tf_proposals")) + # feed the CropAndResize node with a correct boxes information produced with the Proposal layer + # find the first CropAndResize node in the BFS order + crop_and_resize_nodes_ids = [node_id for node_id in bfs_search(graph, [match.single_input_node(0)[0].id]) if + graph.node[node_id]['op'] == 'CropAndResize'] + assert len(crop_and_resize_nodes_ids) != 0, "Didn't find any CropAndResize nodes in the graph." + if 'do_not_swap_proposals' not in match.custom_replacement_desc.custom_attributes or not \ + match.custom_replacement_desc.custom_attributes['do_not_swap_proposals']: + crop_and_resize_node = Node(graph, crop_and_resize_nodes_ids[0]) + # set a marker that the input with box coordinates has been pre-processed so the CropAndResizeReplacement + # transform doesn't try to merge the second and the third inputs + crop_and_resize_node['inputs_preprocessed'] = True + graph.remove_edge(crop_and_resize_node.in_node(1).id, crop_and_resize_node.id) + graph.create_edge(proposal_reshape_2d_node, crop_and_resize_node, out_port=0, in_port=1) - return {'proposal_node': proposal_reshape_3d_node} + tf_proposal_reshape_4d_op = Reshape(graph, dict(dim=int64_array([-1, 1, max_proposals, 5]), nchw_layout=True)) + tf_proposal_reshape_4d_node = tf_proposal_reshape_4d_op.create_node([swapped_proposals_node], + dict(name="reshape_proposal_4d")) - @staticmethod - def classes_probabilities_reshape_shape_infer(node: Node): - # now we can determine the reshape dimensions from Convolution node - conv_node = node.conv_node - conv_output_shape = conv_node.out_node().shape + crop_op = Crop(graph, dict(axis=int64_array([3]), offset=int64_array([1]), dim=int64_array([4]), + nchw_layout=True)) + crop_node = crop_op.create_node([tf_proposal_reshape_4d_node], dict(name='crop_proposals')) - # update desired shape of the Reshape node - node.dim = np.array([0, conv_output_shape[1], conv_output_shape[2], node.anchors_count * 2]) - node.old_infer(node) + tf_proposals_crop_reshape_3d_op = Reshape(graph, dict(dim=int64_array([0, -1, 4]), nchw_layout=True)) + tf_proposals_crop_reshape_3d_node = tf_proposals_crop_reshape_3d_op.create_node([crop_node], + dict(name="reshape_crop_3d")) + + return {'proposal_node': tf_proposals_crop_reshape_3d_node} class 
ObjectDetectionAPISSDPostprocessorReplacement(FrontReplacementFromConfigFileSubGraph): @@ -859,12 +884,12 @@ class ObjectDetectionAPISSDPostprocessorReplacement(FrontReplacementFromConfigFi # nodes return [Div, StandaloneConstEraser] - def output_edges_match(self, graph: nx.DiGraph, match: SubgraphMatch, new_sub_graph: dict): + def output_edges_match(self, graph: Graph, match: SubgraphMatch, new_sub_graph: dict): # the DetectionOutput in IE produces single tensor, but in TF it produces two tensors, so create only one output # edge match return {match.output_node(0)[0].id: new_sub_graph['detection_output_node'].id} - def generate_sub_graph(self, graph: nx.MultiDiGraph, match: SubgraphMatch): + def generate_sub_graph(self, graph: Graph, match: SubgraphMatch): argv = graph.graph['cmd_params'] if argv.tensorflow_object_detection_api_pipeline_config is None: raise Error(missing_param_error) @@ -872,7 +897,7 @@ class ObjectDetectionAPISSDPostprocessorReplacement(FrontReplacementFromConfigFi num_classes = _value_or_raise(match, pipeline_config, 'num_classes') # reshapes confidences to 4D before applying activation function - expand_dims_op = Reshape(graph, {'dim': np.array([0, 1, -1, num_classes + 1])}) + expand_dims_op = Reshape(graph, {'dim': int64_array([0, 1, -1, num_classes + 1])}) # do not convert from NHWC to NCHW this node shape expand_dims_node = expand_dims_op.create_node([match.input_nodes(1)[0][0].in_node(0)], dict(name='do_ExpandDims_conf')) @@ -883,13 +908,13 @@ class ObjectDetectionAPISSDPostprocessorReplacement(FrontReplacementFromConfigFi # IE DetectionOutput layer consumes flattened tensors # reshape operation to flatten locations tensor - reshape_loc_op = Reshape(graph, {'dim': np.array([0, -1])}) + reshape_loc_op = Reshape(graph, {'dim': int64_array([0, -1])}) reshape_loc_node = reshape_loc_op.create_node([match.input_nodes(0)[0][0].in_node(0)], dict(name='do_reshape_loc')) # IE DetectionOutput layer consumes flattened tensors # reshape operation to flatten confidence tensor - reshape_conf_op = Reshape(graph, {'dim': np.array([0, -1])}) + reshape_conf_op = Reshape(graph, {'dim': int64_array([0, -1])}) reshape_conf_node = reshape_conf_op.create_node([activation_conf_node], dict(name='do_reshape_conf')) if pipeline_config.get_param('ssd_anchor_generator_num_layers') is not None or \ @@ -933,7 +958,7 @@ class ObjectDetectionAPISSDPostprocessorReplacement(FrontReplacementFromConfigFi variance = _variance_from_pipeline_config(pipeline_config) # replicating the variance values for all prior-boxes variances = np.tile(variance, [prior_boxes.shape[-2], 1]) - # DetectionOutput in the Inference Engine expects the prior-boxes in the following layout: (values, variances) + # DetectionOutput Inference Engine expects the prior-boxes in the following layout: (values, variances) prior_boxes = prior_boxes.reshape([-1, 4]) prior_boxes = np.concatenate((prior_boxes, variances), 0) # compared to the IE's DetectionOutput, the TF keeps the prior-boxes in YXYX, need to get back to the XYXY @@ -941,7 +966,7 @@ class ObjectDetectionAPISSDPostprocessorReplacement(FrontReplacementFromConfigFi prior_boxes[:, 3:4], prior_boxes[:, 2:3]), 1) # adding another dimensions, as the prior-boxes are expected as 3d tensors prior_boxes = prior_boxes.reshape((1, 2, -1)) - node.in_node(2).shape = np.array(prior_boxes.shape, dtype=np.int64) + node.in_node(2).shape = int64_array(prior_boxes.shape) node.in_node(2).value = prior_boxes node.old_infer(node) @@ -977,7 +1002,7 @@ class 
ObjectDetectionAPIOutputReplacement(FrontReplacementFromConfigFileGeneral)
     def run_before(self):
         return [ObjectDetectionAPIPreprocessorReplacement]

-    def transform_graph(self, graph: nx.MultiDiGraph, replacement_descriptions: dict):
+    def transform_graph(self, graph: Graph, replacement_descriptions: dict):
         if graph.graph['cmd_params'].output is not None:
             log.warning('User defined output nodes are specified. Skip the graph cut-off by the '
                         'ObjectDetectionAPIOutputReplacement.')
@@ -993,3 +1018,97 @@ class ObjectDetectionAPIOutputReplacement(FrontReplacementFromConfigFileGeneral)
                 log.debug('A node "{}" does not exist in the graph. Do not add it as output'.format(out_node_name))
         _outputs = output_user_data_repack(graph, outputs)
         add_output_ops(graph, _outputs, graph.graph['inputs'])
+
+
+class ObjectDetectionAPIPSROIPoolingReplacement(FrontReplacementFromConfigFileSubGraph):
+    replacement_id = 'ObjectDetectionAPIPSROIPoolingReplacement'
+
+    def run_after(self):
+        return [ObjectDetectionAPIProposalReplacement]
+
+    def output_edges_match(self, graph: Graph, match: SubgraphMatch, new_sub_graph: dict):
+        return {match.output_node(0)[0].id: new_sub_graph['output_node'].id}
+
+    def generate_sub_graph(self, graph: Graph, match: SubgraphMatch):
+        argv = graph.graph['cmd_params']
+        if argv.tensorflow_object_detection_api_pipeline_config is None:
+            raise Error(missing_param_error)
+        pipeline_config = PipelineConfig(argv.tensorflow_object_detection_api_pipeline_config)
+        num_classes = _value_or_raise(match, pipeline_config, 'num_classes')
+
+        input_node = match.input_nodes(0)[0][0].in_node(0)
+        if 'class_predictions' in input_node.id:
+            psroipooling_output_dim = num_classes + 1
+        else:
+            psroipooling_output_dim = num_classes * 4
+
+        num_spatial_bins_height = pipeline_config.get_param('num_spatial_bins_height')
+        num_spatial_bins_width = pipeline_config.get_param('num_spatial_bins_width')
+        crop_height = pipeline_config.get_param('crop_height')
+        crop_width = pipeline_config.get_param('crop_width')
+        if crop_height != crop_width:
+            raise Error('Different "crop_height" and "crop_width" parameters from the pipeline config are not '
+                        'supported: {} vs {}'.format(crop_height, crop_width))
+        psroipooling_op = PSROIPoolingOp(graph, {'name': input_node.soft_get('name') + '/PSROIPooling',
+                                                 'output_dim': psroipooling_output_dim,
+                                                 'group_size': crop_width / num_spatial_bins_width,
+                                                 'spatial_bins_x': num_spatial_bins_width,
+                                                 'spatial_bins_y': num_spatial_bins_height,
+                                                 'mode': 'bilinear',
+                                                 'spatial_scale': 1,
+                                                 })
+
+        if 'reshape_swap_proposals_2d' in graph.nodes():
+            reshape_swap_proposals_node = Node(graph, 'reshape_swap_proposals_2d')
+        else:
+            swap_proposals_node = add_convolution_to_swap_xy_coordinates(graph, Node(graph, 'proposals'), 5)
+            reshape_swap_proposals_node = Reshape(graph, {'dim': [-1, 5], 'nchw_layout': True,
+                                                          'name': 'reshape_swap_proposals_2d'}).create_node(
+                [swap_proposals_node])
+        psroipooling_node = psroipooling_op.create_node([input_node, reshape_swap_proposals_node])
+
+        reduce_op = Reduce(graph, {'name': 'mean',
+                                   'reduce_type': 'mean',
+                                   'axis': int64_array([1, 2]),
+                                   'keep_dims': True
+                                   })
+        reduce_node = reduce_op.create_node([psroipooling_node])
+
+        graph.erase_node(match.output_node(0)[0].out_node())
+
+        return {'output_node': reduce_node}
+
+
+class ObjectDetectionAPIConstValueOverride(FrontReplacementFromConfigFileGeneral):
+    """
+    This transform allows overriding specific constant values in the topology.
+    The replacement description configuration file contains a list of tuples describing the desired replacements in
+    the "replacements" key of the "custom_attributes". The first element of each tuple is the name of the graph node
+    holding the constant value. The second element is the name of the parameter from the pipeline configuration file
+    which stores the new value.
+
+    Usage example. Faster R-CNN topologies have a constant node with the number specifying the maximum number of
+    generated proposals. This value is specified in the pipeline configuration file in the parameter
+    'first_stage_max_proposals' and is saved as a constant node in the generated topology. If the parameter is
+    modified from its original value, the topology becomes incorrect because the 'first_stage_max_proposals' value
+    used in the transforms of this file is no longer equal to the 'first_stage_max_proposals' saved as a constant.
+    """
+    replacement_id = 'ObjectDetectionAPIConstValueOverride'
+
+    def run_before(self):
+        return [ObjectDetectionAPIPreprocessorReplacement]
+
+    def transform_graph(self, graph: Graph, replacement_descriptions: dict):
+        argv = graph.graph['cmd_params']
+        if argv.tensorflow_object_detection_api_pipeline_config is None:
+            raise Error(missing_param_error)
+        pipeline_config = PipelineConfig(argv.tensorflow_object_detection_api_pipeline_config)
+        for (node_id, pipeline_config_name) in replacement_descriptions['replacements']:
+            if node_id not in graph.nodes():
+                log.debug('Node with id {} does not exist in the graph'.format(node_id))
+                continue
+            node = Node(graph, node_id)
+            if not node.has_valid('value'):
+                log.debug('Node with id {} does not have value'.format(node_id))
+                continue
+            node.value = np.array(pipeline_config.get_param(pipeline_config_name))
+            node.value = node.value.reshape(node.shape)
diff --git a/model-optimizer/extensions/front/tf/ObjectDetectionAPI_test.py b/model-optimizer/extensions/front/tf/ObjectDetectionAPI_test.py
index d9056ef..739d6de 100644
--- a/model-optimizer/extensions/front/tf/ObjectDetectionAPI_test.py
+++ b/model-optimizer/extensions/front/tf/ObjectDetectionAPI_test.py
@@ -1,5 +1,5 @@
 """
-    Copyright (c) 2018 Intel Corporation
+    Copyright (c) 2018-2019 Intel Corporation

     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
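For reference, a replacement description that would drive the ObjectDetectionAPIConstValueOverride transform above might look like the sketch below. Only the "replacements" structure follows from transform_graph; the node name is a hypothetical placeholder, not taken from any shipped configuration file:

    # editor's sketch of one entry of a sub-graph replacement configuration
    example_description = {
        "id": "ObjectDetectionAPIConstValueOverride",
        "match_kind": "general",
        "custom_attributes": {
            # each pair: [id of the constant node, pipeline config parameter holding the new value]
            "replacements": [["some/constant/node", "first_stage_max_proposals"]]
        }
    }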
@@ -16,11 +16,10 @@ import unittest -import networkx as nx - from extensions.front.tf.ObjectDetectionAPI import calculate_shape_keeping_aspect_ratio, \ calculate_placeholder_spatial_shape from mo.front.subgraph_matcher import SubgraphMatch +from mo.graph.graph import Graph from mo.utils.custom_replacement_config import CustomReplacementDescriptor from mo.utils.error import Error @@ -91,7 +90,7 @@ class TestCalculateShape(unittest.TestCase): class TestCalculatePlaceholderSpatialShape(unittest.TestCase): def setUp(self): - self.graph = nx.MultiDiGraph() + self.graph = Graph() self.graph.graph['user_shapes'] = None self.replacement_desc = CustomReplacementDescriptor('dummy_id', {}) self.match = SubgraphMatch(self.graph, self.replacement_desc, [], [], [], '') diff --git a/model-optimizer/extensions/front/tf/RetinaNetFilteredDetectionsReplacement.py b/model-optimizer/extensions/front/tf/RetinaNetFilteredDetectionsReplacement.py index a46bb50..b0f6eae 100644 --- a/model-optimizer/extensions/front/tf/RetinaNetFilteredDetectionsReplacement.py +++ b/model-optimizer/extensions/front/tf/RetinaNetFilteredDetectionsReplacement.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,14 +14,13 @@ limitations under the License. """ -import networkx as nx import numpy as np from extensions.ops.DetectionOutput import DetectionOutput from extensions.ops.splitv import SplitV from mo.front.subgraph_matcher import SubgraphMatch from mo.front.tf.replacement import FrontReplacementFromConfigFileSubGraph -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.concat import Concat from mo.ops.const import Const from mo.ops.eltwise import Eltwise @@ -43,23 +42,23 @@ class RetinaNetFilteredDetectionsReplacement(FrontReplacementFromConfigFileSubGr replacement_id = 'RetinaNetFilteredDetectionsReplacement' @staticmethod - def _create_sub(graph: nx.MultiDiGraph, input_1: Node, port_1: int, input_2: Node, port_2: int): + def _create_sub(graph: Graph, input_1: Node, port_1: int, input_2: Node, port_2: int): negate = Power(graph, dict(scale=-1, name=input_2.name + '/negate_')) add = Eltwise(graph, dict(operation='sum', name=input_1.name + '/add_')) out_node = add.create_node([(input_1, port_1), negate.create_node([(input_2, port_2)])]) return out_node - def output_edges_match(self, graph: nx.DiGraph, match: SubgraphMatch, new_sub_graph: dict): + def output_edges_match(self, graph: Graph, match: SubgraphMatch, new_sub_graph: dict): return {match.output_node(0)[0].id: new_sub_graph['detection_output_node'].id} - def nodes_to_remove(self, graph: nx.MultiDiGraph, match: SubgraphMatch): + def nodes_to_remove(self, graph: Graph, match: SubgraphMatch): new_nodes_to_remove = match.matched_nodes_names() new_nodes_to_remove.remove(match.single_input_node(0)[0].id) new_nodes_to_remove.remove(match.single_input_node(1)[0].id) new_nodes_to_remove.remove(match.single_input_node(2)[0].id) return new_nodes_to_remove - def generate_sub_graph(self, graph: nx.MultiDiGraph, match: SubgraphMatch): + def generate_sub_graph(self, graph: Graph, match: SubgraphMatch): reshape_classes_op = Reshape(graph, {'dim': np.array([0, -1])}) reshape_classes_node = reshape_classes_op.create_node([match.single_input_node(1)[0]], dict(name='do_reshape_classes')) @@ -79,12 +78,12 @@ class 
RetinaNetFilteredDetectionsReplacement(FrontReplacementFromConfigFileSubGr
                                                           [priors_node, priors_scale_const_node])

         # calculate prior boxes widths and heights
-        split_node = SplitV(graph, {'axis': 2, 'size_splits': [1, 1, 1, 1]}).create_node([priors_scale_node])
+        split_node = SplitV(graph, {'axis': 2, 'size_splits': [1, 1, 1, 1], 'out_ports_count': 4}).create_node([priors_scale_node])
         priors_width_node = __class__._create_sub(graph, split_node, 2, split_node, 0)
         priors_height_node = __class__._create_sub(graph, split_node, 3, split_node, 1)

         # concat the widths and heights into a single tensor and multiply with the box coordinates regression values
-        concat_width_height_node = Concat(graph, {'name': 'concat_priors_width_height', 'axis': -1}).create_node(
+        concat_width_height_node = Concat(graph, {'name': 'concat_priors_width_height', 'axis': -1, 'in_ports_count': 4}).create_node(
             [priors_width_node, priors_height_node, priors_width_node, priors_height_node])
         applied_width_height_regressions_node = Eltwise(graph, {'name': 'final_regressions', 'operation': 'mul'}). \
             create_node([concat_width_height_node, match.single_input_node(0)[0]])
diff --git a/model-optimizer/extensions/front/tf/SSDToolboxDetectionOutput.py b/model-optimizer/extensions/front/tf/SSDToolboxDetectionOutput.py
index 278998c..15fa70f 100644
--- a/model-optimizer/extensions/front/tf/SSDToolboxDetectionOutput.py
+++ b/model-optimizer/extensions/front/tf/SSDToolboxDetectionOutput.py
@@ -1,5 +1,5 @@
 """
-    Copyright (c) 2018 Intel Corporation
+    Copyright (c) 2018-2019 Intel Corporation

     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -14,14 +14,13 @@
     limitations under the License.
 """

-import networkx as nx
 import numpy as np

 from extensions.front.standalone_const_eraser import StandaloneConstEraser
 from extensions.ops.DetectionOutput import DetectionOutput
 from mo.front.subgraph_matcher import SubgraphMatch
 from mo.front.tf.replacement import FrontReplacementFromConfigFileSubGraph
-from mo.graph.graph import Node
+from mo.graph.graph import Node, Graph
 from mo.ops.op import PermuteAttrs
 from mo.ops.output import Output
 from mo.ops.reshape import Reshape
@@ -33,16 +32,28 @@ class SSDToolboxDetectionOutputReplacement(FrontReplacementFromConfigFileSubGrap
     def run_before(self):
         return [StandaloneConstEraser]

-    def nodes_to_remove(self, graph: nx.MultiDiGraph, match: SubgraphMatch):
+    def nodes_to_remove(self, graph: Graph, match: SubgraphMatch):
         return []

-    def generate_sub_graph(self, graph: nx.MultiDiGraph, match: SubgraphMatch):
+    def generate_sub_graph(self, graph: Graph, match: SubgraphMatch):
         # IE DetectionOutput layer consumes flattened confidences and locations tensors.
         # That is why we add reshapes before them.
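A note on the width/height computation in the RetinaNet hunk above: the layer set used here has no dedicated subtraction layer, so _create_sub builds a - b as a + (-1 * b) from a Power layer (scale=-1) followed by an Eltwise sum. An editor's NumPy sketch of the same arithmetic, assuming the (x_min, y_min, x_max, y_max) port ordering implied by the _create_sub arguments:

    import numpy as np

    priors = np.array([[[0.1, 0.2, 0.5, 0.8]]], dtype=np.float32)  # (1, num_priors, 4)
    x_min, y_min, x_max, y_max = np.split(priors, 4, axis=2)

    width = x_max + (-1.0 * x_min)    # _create_sub(graph, split_node, 2, split_node, 0)
    height = y_max + (-1.0 * y_min)   # _create_sub(graph, split_node, 3, split_node, 1)
    assert np.allclose(width, 0.4) and np.allclose(height, 0.6)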
locs_node = match.single_input_node(0) conf_node = match.single_input_node(1) prior_boxes_node = match.single_input_node(2) + locs_out_nodes = locs_node[0].out_nodes() + assert len(locs_out_nodes) == 1 + locs_out_node = locs_out_nodes[list(locs_out_nodes.keys())[0]] + assert locs_out_node.op == "OpOutput", locs_out_node.op + graph.remove_node(locs_out_node.id) + + conf_out_nodes = conf_node[0].out_nodes() + assert len(conf_out_nodes) == 1 + conf_out_node = conf_out_nodes[list(conf_out_nodes.keys())[0]] + assert conf_out_node.op == "OpOutput", conf_out_node.op + graph.remove_node(conf_out_node.id) + # reshape operation to flatten confidence tensor reshape_loc_op = Reshape(graph, {'dim': np.array([0, -1])}) reshape_loc_node = reshape_loc_op.create_node([locs_node], dict(name='DetectionOutput_Reshape_loc_')) diff --git a/model-optimizer/extensions/front/tf/TensorArrayExtractors.py b/model-optimizer/extensions/front/tf/TensorArrayExtractors.py index 20e0d69..b7d7d4b 100644 --- a/model-optimizer/extensions/front/tf/TensorArrayExtractors.py +++ b/model-optimizer/extensions/front/tf/TensorArrayExtractors.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/TensorArrayGatherV3.py b/model-optimizer/extensions/front/tf/TensorArrayGatherV3.py index 46c29c2..d4dfcfc 100644 --- a/model-optimizer/extensions/front/tf/TensorArrayGatherV3.py +++ b/model-optimizer/extensions/front/tf/TensorArrayGatherV3.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/Unpack.py b/model-optimizer/extensions/front/tf/Unpack.py index 30af2d3..0054598 100644 --- a/model-optimizer/extensions/front/tf/Unpack.py +++ b/model-optimizer/extensions/front/tf/Unpack.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,10 +14,8 @@ limitations under the License. 
""" -import networkx as nx - from mo.front.common.replacement import FrontReplacementOp -from mo.graph.graph import Node, insert_node_after +from mo.graph.graph import Node, Graph from mo.ops.squeeze import Squeeze @@ -29,14 +27,14 @@ class Unpack(FrontReplacementOp): op = "Unpack" enabled = True - def nodes_to_remove(self, graph: nx.MultiDiGraph, match: dict): + def nodes_to_remove(self, graph: Graph, match: dict): # do not remove matched node return [] - def replace_op(self, graph: nx.MultiDiGraph, node: Node): + def replace_op(self, graph: Graph, node: Node): for ind in range(len(node.out_nodes())): squeeze_node = Squeeze(graph, dict(squeeze_dims=[node.axis], name=node.name + '/Squeeze_')).create_node([]) - insert_node_after(node, squeeze_node, ind) + node.insert_node_after(squeeze_node, ind) # do not replace any output edge return [] diff --git a/model-optimizer/extensions/front/tf/YOLO.py b/model-optimizer/extensions/front/tf/YOLO.py index 2947254..651e5ac 100644 --- a/model-optimizer/extensions/front/tf/YOLO.py +++ b/model-optimizer/extensions/front/tf/YOLO.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,14 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. """ -import networkx as nx from extensions.front.no_op_eraser import NoOpEraser from extensions.front.standalone_const_eraser import StandaloneConstEraser from extensions.ops.regionyolo import RegionYoloOp from mo.front.tf.replacement import FrontReplacementFromConfigFileGeneral -from mo.graph.graph import Node -from mo.middle.passes.eliminate import get_nodes_with_attributes +from mo.graph.graph import Node, Graph from mo.ops.output import Output from mo.utils.error import Error @@ -35,7 +33,7 @@ class YoloRegionAddon(FrontReplacementFromConfigFileGeneral): def run_after(self): return [NoOpEraser, StandaloneConstEraser] - def transform_graph(self, graph: nx.MultiDiGraph, replacement_descriptions): + def transform_graph(self, graph: Graph, replacement_descriptions): op_outputs = [n for n, d in graph.nodes(data=True) if 'op' in d and d['op'] == 'OpOutput'] for op_output in op_outputs: last_node = Node(graph, op_output).in_node(0) @@ -55,8 +53,8 @@ class YoloV3RegionAddon(FrontReplacementFromConfigFileGeneral): """ replacement_id = 'TFYOLOV3' - def transform_graph(self, graph: nx.MultiDiGraph, replacement_descriptions): - graph.remove_nodes_from(get_nodes_with_attributes(graph, is_output=True)) + def transform_graph(self, graph: Graph, replacement_descriptions): + graph.remove_nodes_from(graph.get_nodes_with_attributes(op='OpOutput')) for input_node_name in replacement_descriptions['entry_points']: if input_node_name not in graph.nodes(): raise Error('TensorFlow YOLO V3 conversion mechanism was enabled. 
' @@ -66,7 +64,7 @@ class YoloV3RegionAddon(FrontReplacementFromConfigFileGeneral): 'Refer to documentation about converting YOLO models for more information.'.format( ', '.join(replacement_descriptions['entry_points']), input_node_name)) last_node = Node(graph, input_node_name).in_node(0) - op_params = dict(name=last_node.id + '/YoloRegion', axis=1, end_axis=-1, do_softmax=0, is_output=True) + op_params = dict(name=last_node.id + '/YoloRegion', axis=1, end_axis=-1, do_softmax=0) op_params.update(replacement_descriptions) region_layer_node = RegionYoloOp(graph, op_params).create_node([last_node]) # TODO: do we need change axis for further permutation diff --git a/model-optimizer/extensions/front/tf/ZerosLike.py b/model-optimizer/extensions/front/tf/ZerosLike.py new file mode 100644 index 0000000..e58f323 --- /dev/null +++ b/model-optimizer/extensions/front/tf/ZerosLike.py @@ -0,0 +1,38 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +from mo.front.common.replacement import FrontReplacementOp +from mo.graph.graph import Graph, Node +from mo.ops.power import Power + + +class ZerosLikeReplacer(FrontReplacementOp): + """ + Replace TF operation ZerosLike by multiplying input tensor by zero. + """ + op = "ZerosLike" + enabled = True + + def nodes_to_remove(self, graph: Graph, match: dict): + # do not remove matched node + return [] + + def replace_op(self, graph: Graph, node: Node): + power = Power(graph, dict(scale=0, name=node.name + '/Power/')).create_node() + + # Reconnecting inputs to this new node + node.in_port(0).get_connection().set_destination(power.in_port(0)) + node.out_port(0).get_connection().set_source(power.out_port(0)) + return [power.id] diff --git a/model-optimizer/extensions/front/tf/addn_ext.py b/model-optimizer/extensions/front/tf/addn_ext.py index 78fde8b..f01b16c 100644 --- a/model-optimizer/extensions/front/tf/addn_ext.py +++ b/model-optimizer/extensions/front/tf/addn_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/argmax_ext.py b/model-optimizer/extensions/front/tf/argmax_ext.py index a5a40ae..ef4eb12 100644 --- a/model-optimizer/extensions/front/tf/argmax_ext.py +++ b/model-optimizer/extensions/front/tf/argmax_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
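The ZerosLikeReplacer above relies on a simple identity: scaling a tensor by zero reproduces zeros_like for any finite input. An editor's sketch, assuming the usual Power semantics out = (scale * x + shift) ** power with the defaults shift=0 and power=1:

    import numpy as np

    x = np.array([[1.5, -2.0], [0.25, 3.0]], dtype=np.float32)
    # a Power layer with scale=0 computes the left-hand side element-wise
    assert np.array_equal(0 * x, np.zeros_like(x))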
diff --git a/model-optimizer/extensions/front/tf/assign_elimination.py b/model-optimizer/extensions/front/tf/assign_elimination.py index 2a6dc07..6550c27 100644 --- a/model-optimizer/extensions/front/tf/assign_elimination.py +++ b/model-optimizer/extensions/front/tf/assign_elimination.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ import logging as log import networkx as nx from mo.front.common.replacement import FrontReplacementOp +from mo.graph.graph import Node, Graph from mo.utils.error import Error @@ -26,7 +27,7 @@ class AssignElimination(FrontReplacementOp): op = "Assign" enabled = True - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): node = match['op'] # here we request all data flow output edges (control flow edges will not be listed) out_edges = node.out_edges() @@ -41,7 +42,7 @@ class AssignSubElimination(FrontReplacementOp): op = "AssignSub" enabled = True - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): node = match['op'] # here we request all data flow output edges (control flow edges will not be listed) out_edges = node.out_edges() @@ -56,7 +57,7 @@ class AssignAddElimination(FrontReplacementOp): op = "AssignAdd" enabled = True - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): node = match['op'] # here we request all data flow output edges (control flow edges will not be listed) out_edges = node.out_edges() @@ -65,3 +66,18 @@ class AssignAddElimination(FrontReplacementOp): log.debug('AssignAdd op was removed {}'.format(node.id)) else: raise Error('Data flow edge coming out of AssignAdd node {}'.format(node.id)) + + +class AssertElimination(FrontReplacementOp): + op = "Assert" + enabled = True + + def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + node = match['op'] + # here we request all data flow output edges (control flow edges will not be listed) + out_edges = node.out_edges() + if len(out_edges) == 0: + graph.remove_node(node.id) + log.debug('Assert op was removed {}'.format(node.id)) + else: + raise Error('Data flow edge coming out of Assert node {}'.format(node.id)) diff --git a/model-optimizer/extensions/front/tf/basic_lstm_cell.py b/model-optimizer/extensions/front/tf/basic_lstm_cell.py index 37391ae..fc0b40d 100644 --- a/model-optimizer/extensions/front/tf/basic_lstm_cell.py +++ b/model-optimizer/extensions/front/tf/basic_lstm_cell.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,28 +14,27 @@ limitations under the License. """ -import networkx as nx - from extensions.ops.lstm_cell import LSTMCell from mo.front.common.replacement import FrontReplacementSubgraph -from mo.graph.graph import Node, replace_node, get_inputs_with_ports +from mo.graph.graph import Node, Graph from mo.ops.output import Output class BasicLSTMCell(FrontReplacementSubgraph): enabled = True + # When the deprecated IR version was requested, we configure only those phases that can lead + # to functional regressions in the version 2. 
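The new AssertElimination follows the same pattern as the Assign* eliminations above it: a node may be dropped only when nothing consumes its data-flow outputs. An editor's toy sketch of that check on a bare networkx graph (the real transforms operate on the Model Optimizer Graph wrapper, so this only mirrors the shape of the logic):

    import networkx as nx

    g = nx.MultiDiGraph()
    g.add_node('assert_op', op='Assert')
    g.add_node('producer', op='Const')
    g.add_edge('producer', 'assert_op')   # incoming edge only

    if g.out_degree('assert_op') == 0:    # no data-flow consumers
        g.remove_node('assert_op')

    assert 'assert_op' not in g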
BasicLSTMCell is one such transformation; + # when it is turned off, the body of TF basic_lstm_cell is converted as-is in a decomposed form, + # and should work in version 2. + graph_condition = [lambda graph: graph.graph['ir_version'] != 2] + # list of names of all original nodes that are supported by IE # this list is collected gradually by a separate transformation # original name in this case is a selected node in the pattern # that is returned from anchor() function instances_supported_by_IE = [] - # True if transformation should be activated only for instances collected in supported_by_IE list - # It will be set to True by a separate transformation - second_round = False - - def __init__(self): super().__init__() @@ -50,7 +49,6 @@ class BasicLSTMCell(FrontReplacementSubgraph): __class__.outputs = ['mul_2', 'add_1'] - def pattern(self): return dict( nodes=[ @@ -87,10 +85,10 @@ class BasicLSTMCell(FrontReplacementSubgraph): ('biasadd', 'split', {'in': 1}), # This important block specifies how gates are ordered in TF graph - ('split', 'sigmoid_1', {'out': 0}), # i - ('split', 'tanh_0', {'out': 1}), # c - ('split', 'shift', {'out': 2}), # f (this is unbiased f, there is an extra addition here) - ('split', 'sigmoid_2', {'out': 3}), # o + ('split', 'sigmoid_1', {'out': 0}), # i + ('split', 'tanh_0', {'out': 1}), # c + ('split', 'shift', {'out': 2}), # f (this is unbiased f, there is an extra addition here) + ('split', 'sigmoid_2', {'out': 3}), # o ('shift_const', 'shift', {}), ('shift', 'sigmoid_0', {}), @@ -107,25 +105,6 @@ class BasicLSTMCell(FrontReplacementSubgraph): ('sigmoid_2', 'mul_2', {}), ]) - - @staticmethod - def mark_supported_by_IE(node: Node): - """ Mark a given node as a supported LSTMCell by setting attribute `supported_by_IE`. - The node original name is also included in the list of all supported by IE LSTMCell - instances for possible second round of the network conversion. - """ - assert node.has_valid('original_name'), \ - 'Node {} doesn\'t have a reference to original FW operation name; bad LSTMCell'.format(node.soft_get('name')) - __class__.instances_supported_by_IE.append(node.original_name) - node['supported_by_IE'] = True - - - @staticmethod - def finalize_first_round(): - """ Switch the mode of this pattern into `second stage` where only supported patterns are converted. """ - __class__.second_round = True - - @staticmethod def anchor(): """ Mnemonic name in the pattern that is used as an anchor name for this pattern in the original graph. @@ -133,8 +112,7 @@ class BasicLSTMCell(FrontReplacementSubgraph): """ return 'concat' - - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): # node that is used to identify this pattern application instance for switching between supported # and not supported LSTMCell sub-graphs; this value will be searched in __class__.instances_supported_by_IE. @@ -142,25 +120,17 @@ class BasicLSTMCell(FrontReplacementSubgraph): assert anchor_node.has_valid('name'), \ 'LSTMCell anchor node {} does\'t have attribute name; such nodes are not supported.' 
- if __class__.second_round and anchor_node.name not in __class__.instances_supported_by_IE: - # at the second round of conversion we apply pattern selectively: only instances from - # __class__.instances_supported_by_IE are allowed for conversion; all others should be skipped - return - match['input_op'] = match['concat'].in_node(0) match['input_hidden_state'] = match['concat'].in_node(1) - match['input_cell_state'] = match['mul_0'].in_node(0) if match['mul_0'].in_node(0).id != match['sigmoid_0'].id \ - else match['mul_0'].in_node(1) + match['input_cell_state'] = match['mul_0'].in_node(0) \ + if match['mul_0'].in_node(0).id != match['sigmoid_0'].id else match['mul_0'].in_node(1) pattern_edges = self.pattern()['edges'] pattern_edges.extend([('input_op', 'concat'), ('input_cell_state', 'mul_0'), ('input_hidden_state', 'concat')]) - inputs = get_inputs_with_ports(graph, match, pattern_edges, __class__.inputs + __class__.extra_inputs) + inputs = graph.get_inputs_with_ports(match, pattern_edges, __class__.inputs + __class__.extra_inputs) lstm_op = LSTMCell(graph, dict( - name=match['concat'].name + '/LSTMCell', - mark_supported_by_IE=__class__.mark_supported_by_IE, - original_name=anchor_node.name, - finalize_first_round=__class__.finalize_first_round, + name=match['concat'].name + '/LSTMCell', activations=None, )) lstm_node = lstm_op.create_node(inputs) lstm_node['old_infer'] = lstm_node.infer @@ -172,7 +142,7 @@ class BasicLSTMCell(FrontReplacementSubgraph): graph.remove_node(match['tanh_1'].id) for i, output in enumerate(__class__.outputs): - replace_node(match[output], lstm_node, i) + match[output].replace_node(lstm_node, i) # Because of LSTMCell specification, this layer MUST have 2 outputs. # => we need to create fake consumers for LSTMCell @@ -186,7 +156,6 @@ class BasicLSTMCell(FrontReplacementSubgraph): lstm_node['extra_inputs'] = {name: match[name].id for name in __class__.extra_inputs} lstm_node['inputs'] = {name: match[name].id for name in __class__.inputs} - @staticmethod def infer(node: Node): assert len(node.in_nodes()) == len(__class__.inputs) + len(__class__.extra_inputs) diff --git a/model-optimizer/extensions/front/tf/concat.py b/model-optimizer/extensions/front/tf/concat.py index 87ea9f3..9d3a0d4 100644 --- a/model-optimizer/extensions/front/tf/concat.py +++ b/model-optimizer/extensions/front/tf/concat.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,9 +14,8 @@ limitations under the License. 
""" -import networkx as nx - from mo.front.common.replacement import FrontReplacementSubgraph +from mo.graph.graph import Graph class Concat(FrontReplacementSubgraph): @@ -28,7 +27,7 @@ class Concat(FrontReplacementSubgraph): edges=[] ) - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): """ There are Concat and ConcatV2 operations in TensorFlow The main difference is incoming port of tensor representing axis of concatenation diff --git a/model-optimizer/extensions/front/tf/concat_ext.py b/model-optimizer/extensions/front/tf/concat_ext.py index 95ef262..a9c18ab 100644 --- a/model-optimizer/extensions/front/tf/concat_ext.py +++ b/model-optimizer/extensions/front/tf/concat_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/concat_ext_test.py b/model-optimizer/extensions/front/tf/concat_ext_test.py index 9cf9021..16f96ac 100644 --- a/model-optimizer/extensions/front/tf/concat_ext_test.py +++ b/model-optimizer/extensions/front/tf/concat_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/concat_test.py b/model-optimizer/extensions/front/tf/concat_test.py index abee3b0..7682245 100644 --- a/model-optimizer/extensions/front/tf/concat_test.py +++ b/model-optimizer/extensions/front/tf/concat_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/conv_ext.py b/model-optimizer/extensions/front/tf/conv_ext.py index 00931de..d008ced 100644 --- a/model-optimizer/extensions/front/tf/conv_ext.py +++ b/model-optimizer/extensions/front/tf/conv_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/conv_ext_test.py b/model-optimizer/extensions/front/tf/conv_ext_test.py index d420f91..6813d57 100644 --- a/model-optimizer/extensions/front/tf/conv_ext_test.py +++ b/model-optimizer/extensions/front/tf/conv_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/crop_and_resize_ext.py b/model-optimizer/extensions/front/tf/crop_and_resize_ext.py index 98034c6..11503a7 100644 --- a/model-optimizer/extensions/front/tf/crop_and_resize_ext.py +++ b/model-optimizer/extensions/front/tf/crop_and_resize_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/tf/deconv_ext.py b/model-optimizer/extensions/front/tf/deconv_ext.py index 8838cd5..df046c7 100644 --- a/model-optimizer/extensions/front/tf/deconv_ext.py +++ b/model-optimizer/extensions/front/tf/deconv_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/deconv_ext_test.py b/model-optimizer/extensions/front/tf/deconv_ext_test.py index c11d4da..333c785 100644 --- a/model-optimizer/extensions/front/tf/deconv_ext_test.py +++ b/model-optimizer/extensions/front/tf/deconv_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/depth_to_space.py b/model-optimizer/extensions/front/tf/depth_to_space.py index d422141..53a0d83 100644 --- a/model-optimizer/extensions/front/tf/depth_to_space.py +++ b/model-optimizer/extensions/front/tf/depth_to_space.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/exp_ext.py b/model-optimizer/extensions/front/tf/exp_ext.py new file mode 100644 index 0000000..7716579 --- /dev/null +++ b/model-optimizer/extensions/front/tf/exp_ext.py @@ -0,0 +1,28 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +from mo.front.extractor import FrontExtractorOp +from mo.ops.activation import Activation + + +class ExpExtractor(FrontExtractorOp): + op = 'Exp' + enabled = True + + @staticmethod + def extract(node): + Activation.update_node_stat(node, {'operation': 'exp'}) + return __class__.enabled diff --git a/model-optimizer/extensions/front/tf/extract_image_patches.py b/model-optimizer/extensions/front/tf/extract_image_patches.py index a6e0837..fd544d6 100644 --- a/model-optimizer/extensions/front/tf/extract_image_patches.py +++ b/model-optimizer/extensions/front/tf/extract_image_patches.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
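The ExpExtractor added above follows the FrontExtractorOp recipe that recurs throughout this patch: name the TF op, set enabled, and let extract() push backend attributes onto the node through update_node_stat. A hypothetical sibling extractor written to the same recipe (the 'tanh' operation value is an assumption for illustration, not part of this change):

from mo.front.extractor import FrontExtractorOp
from mo.ops.activation import Activation


class TanhExtractor(FrontExtractorOp):  # hypothetical example, mirrors ExpExtractor
    op = 'Tanh'
    enabled = True

    @staticmethod
    def extract(node):
        # copy Activation's backend attributes onto the matched node
        Activation.update_node_stat(node, {'operation': 'tanh'})
        return __class__.enabled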
diff --git a/model-optimizer/extensions/front/tf/fake_const.py b/model-optimizer/extensions/front/tf/fake_const.py index 2a487ef..0ba7579 100644 --- a/model-optimizer/extensions/front/tf/fake_const.py +++ b/model-optimizer/extensions/front/tf/fake_const.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,11 +15,12 @@ """ import logging as log -import networkx as nx +import numpy as np +from mo.front.common.partial_infer.utils import int64_array from mo.front.common.replacement import FrontReplacementOp from mo.front.tf.extractors.utils import tf_dtype_extractor -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.const import Const @@ -27,7 +28,7 @@ class FakeConstToConst(FrontReplacementOp): op = "FakeConst" enabled = True - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): node = match['op'] if not node.has_valid('value'): log.debug("No value in FakeConst node {}".format(node.id)) @@ -35,7 +36,7 @@ class FakeConstToConst(FrontReplacementOp): node_value = node.value extracted_attrs = { 'data_type': tf_dtype_extractor(node.pb.attr['dtype'].type), - 'shape': node_value.shape, + 'shape': int64_array(node_value.shape), 'value': node_value } Const.update_node_stat(node, extracted_attrs) diff --git a/model-optimizer/extensions/front/tf/faster_rcnn_support.json b/model-optimizer/extensions/front/tf/faster_rcnn_support.json index b2d8b37..c535044 100644 --- a/model-optimizer/extensions/front/tf/faster_rcnn_support.json +++ b/model-optimizer/extensions/front/tf/faster_rcnn_support.json @@ -36,6 +36,8 @@ }, { "custom_attributes": { + "clip_before_nms": true, + "clip_after_nms": false }, "id": "ObjectDetectionAPIProposalReplacement", "include_inputs_to_sub_graph": true, @@ -57,6 +59,8 @@ }, { "custom_attributes": { + "clip_before_nms": true, + "clip_after_nms": false, "coordinates_swap_method": "swap_weights" }, "id": "ObjectDetectionAPIDetectionOutputReplacement", @@ -97,5 +101,13 @@ }, "id": "ObjectDetectionAPIOutputReplacement", "match_kind": "general" + }, + { + "custom_attributes": + { + "replacements": [["mul/y", "first_stage_max_proposals"]] + }, + "id": "ObjectDetectionAPIConstValueOverride", + "match_kind": "general" } -] \ No newline at end of file +] diff --git a/model-optimizer/extensions/front/tf/faster_rcnn_support_api_v1.10.json b/model-optimizer/extensions/front/tf/faster_rcnn_support_api_v1.10.json new file mode 100644 index 0000000..95be086 --- /dev/null +++ b/model-optimizer/extensions/front/tf/faster_rcnn_support_api_v1.10.json @@ -0,0 +1,113 @@ +[ + { + "custom_attributes": { + }, + "id": "ObjectDetectionAPIPreprocessorReplacement", + "inputs": [ + [ + { + "node": "map/Shape$", + "port": 0 + }, + { + "node": "map/TensorArrayUnstack/Shape$", + "port": 0 + }, + { + "node": "map/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3$", + "port": 2 + } + ] + ], + "instances": [ + ".*Preprocessor/" + ], + "match_kind": "scope", + "outputs": [ + { + "node": "sub$", + "port": 0 + }, + { + "node": "map/TensorArrayStack_1/TensorArrayGatherV3$", + "port": 0 + } + ] + }, + { + "custom_attributes": { + "clip_before_nms": false, + "clip_after_nms": true + }, + "id": "ObjectDetectionAPIProposalReplacement", + "include_inputs_to_sub_graph": true, + "include_outputs_to_sub_graph": true, + 
"instances": { + "end_points": [ + "map/TensorArrayStack/TensorArrayGatherV3", + "map_1/TensorArrayStack/TensorArrayGatherV3", + "BatchMultiClassNonMaxSuppression/map/TensorArrayStack_4/TensorArrayGatherV3" + ], + "start_points": [ + "concat", + "concat_1", + "GridAnchorGenerator/Identity", + "Shape" + ] + }, + "match_kind": "points" + }, + { + "custom_attributes": { + "clip_before_nms": false, + "clip_after_nms": true, + "coordinates_swap_method": "swap_weights" + }, + "id": "ObjectDetectionAPIDetectionOutputReplacement", + "inputs": [ + [ + { + "node": "Reshape$", + "port": 0 + } + ], + [ + { + "node": "Reshape_1$", + "port": 0 + } + ], + [ + { + "node": "ExpandDims$", + "port": 0 + } + ] + ], + "instances": [ + ".*SecondStagePostprocessor/" + ], + "match_kind": "scope", + "outputs": [ + { + "node": "BatchMultiClassNonMaxSuppression/map/TensorArrayStack/TensorArrayGatherV3$", + "port": 0 + } + ] + }, + { + "custom_attributes": { + "outputs": "detection_boxes,detection_scores,num_detections" + }, + "id": "ObjectDetectionAPIOutputReplacement", + "match_kind": "general" + }, + { + "custom_attributes": + { + "replacements": [["mul/y", "first_stage_max_proposals"]] + }, + "id": "ObjectDetectionAPIConstValueOverride", + "match_kind": "general" + } +] \ No newline at end of file diff --git a/model-optimizer/extensions/front/tf/faster_rcnn_support_api_v1.7.json b/model-optimizer/extensions/front/tf/faster_rcnn_support_api_v1.7.json index 8f9d74c..6eba96f 100644 --- a/model-optimizer/extensions/front/tf/faster_rcnn_support_api_v1.7.json +++ b/model-optimizer/extensions/front/tf/faster_rcnn_support_api_v1.7.json @@ -36,6 +36,8 @@ }, { "custom_attributes": { + "clip_before_nms": true, + "clip_after_nms": false }, "id": "ObjectDetectionAPIProposalReplacement", "include_inputs_to_sub_graph": true, @@ -57,6 +59,8 @@ }, { "custom_attributes": { + "clip_before_nms": true, + "clip_after_nms": false, "coordinates_swap_method": "swap_weights" }, "id": "ObjectDetectionAPIDetectionOutputReplacement", @@ -97,5 +101,13 @@ }, "id": "ObjectDetectionAPIOutputReplacement", "match_kind": "general" + }, + { + "custom_attributes": + { + "replacements": [["mul/y", "first_stage_max_proposals"]] + }, + "id": "ObjectDetectionAPIConstValueOverride", + "match_kind": "general" } ] \ No newline at end of file diff --git a/model-optimizer/extensions/front/tf/fifo_queue_v2_ext.py b/model-optimizer/extensions/front/tf/fifo_queue_v2_ext.py index 5a2b591..238ae68 100644 --- a/model-optimizer/extensions/front/tf/fifo_queue_v2_ext.py +++ b/model-optimizer/extensions/front/tf/fifo_queue_v2_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -37,6 +37,6 @@ class FIFOQueueV2Extractor(FrontExtractorOp): if len(shape) == 3: result_shapes.append(np.array([1, shape[0].size, shape[1].size, shape[2].size], dtype=np.int64)) else: - result_shapes.append(np.array(shape, dtype=np.int64)) + result_shapes.append(np.array([dim.size for dim in shape], dtype=np.int64)) Op.update_node_stat(node, {'shapes': result_shapes, 'types': extracted_types}) return __class__.enabled diff --git a/model-optimizer/extensions/front/tf/fifo_replacer.py b/model-optimizer/extensions/front/tf/fifo_replacer.py index 576dcf1..9063cf5 100644 --- a/model-optimizer/extensions/front/tf/fifo_replacer.py +++ b/model-optimizer/extensions/front/tf/fifo_replacer.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,17 +15,20 @@ """ import logging as log -import networkx as nx import numpy as np from mo.front.common.replacement import FrontReplacementSubgraph -from mo.graph.graph import create_edge, erase_node, Node +from mo.graph.graph import Graph, Node from mo.ops.input import Input class FIFOQueue(FrontReplacementSubgraph): enabled = True + def run_before(self): + from extensions.front.override_batch import OverrideBatch + return [OverrideBatch] + @staticmethod def pattern(**kwargs): return dict( @@ -43,7 +46,7 @@ class FIFOQueue(FrontReplacementSubgraph): ) @staticmethod - def replace_sub_graph(graph: nx.MultiDiGraph, match: dict, **kwargs): + def replace_sub_graph(graph: Graph, match: dict, **kwargs): """ Usually graph looks like: @@ -70,16 +73,16 @@ class FIFOQueue(FrontReplacementSubgraph): ''.format(match['placeholder'].id, true_placeholder_shape, placeholder_shape)) placeholder_shape = true_placeholder_shape placeholder_name = match['fifo_queue'].name - erase_node(match['fifo_queue']) - erase_node(match['placeholder']) + graph.erase_node(match['fifo_queue']) + graph.erase_node(match['placeholder']) for _, out in match['batch_join'].out_nodes().items(): if out.id != match['image_batch'].id: if out.out_node().op == 'OpOutput': - erase_node(out.out_node()) - erase_node(out) - erase_node(match['batch_join']) + graph.remove_node(out.out_node().id) + graph.remove_node(out.id) + graph.remove_node(match['batch_join'].id) placeholder = Input(graph, {'name': placeholder_name, 'shape': placeholder_shape}).create_node() - create_edge(placeholder, match['image_batch']) + graph.create_edge(placeholder, match['image_batch']) log.info("FIFOQueueV2 pattern was detected. New shape of placeholder {} is {}. 
Use -b to set batch size if " "needed".format(placeholder.id, placeholder['shape'])) @@ -90,6 +93,10 @@ class QueueDequeueManyV2(FrontReplacementSubgraph): """ enabled = True + def run_before(self): + from extensions.front.override_batch import OverrideBatch + return [OverrideBatch] + @staticmethod def pattern(**kwargs): return dict( @@ -103,7 +110,7 @@ class QueueDequeueManyV2(FrontReplacementSubgraph): ) @staticmethod - def replace_sub_graph(graph: nx.MultiDiGraph, match: dict, **kwargs): + def replace_sub_graph(graph: Graph, match: dict, **kwargs): inputs_dict = {} for u, v, edge_attrs in graph.out_edges(match['queue_deque'].id, data=True): out_port = edge_attrs['out'] @@ -111,7 +118,7 @@ class QueueDequeueManyV2(FrontReplacementSubgraph): if out_port not in inputs_dict: input_op = Input(graph, {'shape': shape.copy()}) inputs_dict[out_port] = input_op.create_node([]) - create_edge(inputs_dict[out_port], Node(graph, v), edge_attrs['out'], edge_attrs['in'], edge_attrs) + graph.create_edge(inputs_dict[out_port], Node(graph, v), edge_attrs['out'], edge_attrs['in'], edge_attrs) graph.remove_node(match['queue_deque'].id) graph.remove_node(match['fifo_queue'].id) diff --git a/model-optimizer/extensions/front/tf/fifo_replacer_test.py b/model-optimizer/extensions/front/tf/fifo_replacer_test.py index e1150c2..a7a65d4 100644 --- a/model-optimizer/extensions/front/tf/fifo_replacer_test.py +++ b/model-optimizer/extensions/front/tf/fifo_replacer_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/gather_ext.py b/model-optimizer/extensions/front/tf/gather_ext.py index 0cb924f..1c3a7e2 100644 --- a/model-optimizer/extensions/front/tf/gather_ext.py +++ b/model-optimizer/extensions/front/tf/gather_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
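Both queue replacers above use the common pattern() convention of the front transformations: nodes is a list of (alias, attribute constraints) pairs and edges is a list of (producer, consumer, edge constraints) triples, where an edge dict such as {'out': 0} pins a specific port. A condensed sketch of the FIFOQueue/QueueDequeue case (simplified for illustration; the real patterns constrain more nodes):

pattern = dict(
    nodes=[
        ('fifo_queue', dict(op='FIFOQueueV2')),         # alias -> required node attributes
        ('queue_deque', dict(op='QueueDequeueManyV2')),
    ],
    edges=[
        ('fifo_queue', 'queue_deque', {'out': 0}),      # output port 0 of the queue
    ],
)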
diff --git a/model-optimizer/extensions/front/tf/mask_rcnn_support.json b/model-optimizer/extensions/front/tf/mask_rcnn_support.json index 9ff12e3..383cb94 100644 --- a/model-optimizer/extensions/front/tf/mask_rcnn_support.json +++ b/model-optimizer/extensions/front/tf/mask_rcnn_support.json @@ -36,6 +36,8 @@ }, { "custom_attributes": { + "clip_before_nms": true, + "clip_after_nms": false }, "id": "ObjectDetectionAPIProposalReplacement", "include_inputs_to_sub_graph": true, @@ -57,6 +59,8 @@ }, { "custom_attributes": { + "clip_before_nms": true, + "clip_after_nms": false, "coordinates_swap_method": "swap_weights" }, "id": "ObjectDetectionAPIDetectionOutputReplacement", @@ -104,5 +108,13 @@ }, "id": "ObjectDetectionAPIOutputReplacement", "match_kind": "general" + }, + { + "custom_attributes": + { + "replacements": [["mul/y", "first_stage_max_proposals"]] + }, + "id": "ObjectDetectionAPIConstValueOverride", + "match_kind": "general" } -] \ No newline at end of file +] diff --git a/model-optimizer/extensions/front/tf/mask_rcnn_support_api_v1.11.json b/model-optimizer/extensions/front/tf/mask_rcnn_support_api_v1.11.json index 6220ea1..178b53b 100644 --- a/model-optimizer/extensions/front/tf/mask_rcnn_support_api_v1.11.json +++ b/model-optimizer/extensions/front/tf/mask_rcnn_support_api_v1.11.json @@ -36,6 +36,8 @@ }, { "custom_attributes": { + "clip_before_nms": false, + "clip_after_nms": true }, "id": "ObjectDetectionAPIProposalReplacement", "include_inputs_to_sub_graph": true, @@ -57,6 +59,8 @@ }, { "custom_attributes": { + "clip_before_nms": false, + "clip_after_nms": true, "coordinates_swap_method": "swap_weights" }, "id": "ObjectDetectionAPIDetectionOutputReplacement", @@ -104,5 +108,13 @@ }, "id": "ObjectDetectionAPIOutputReplacement", "match_kind": "general" + }, + { + "custom_attributes": + { + "replacements": [["mul/y", "first_stage_max_proposals"]] + }, + "id": "ObjectDetectionAPIConstValueOverride", + "match_kind": "general" } ] \ No newline at end of file diff --git a/model-optimizer/extensions/front/tf/mask_rcnn_support_api_v1.7.json b/model-optimizer/extensions/front/tf/mask_rcnn_support_api_v1.7.json index 9b59125..3574f7a 100644 --- a/model-optimizer/extensions/front/tf/mask_rcnn_support_api_v1.7.json +++ b/model-optimizer/extensions/front/tf/mask_rcnn_support_api_v1.7.json @@ -36,6 +36,8 @@ }, { "custom_attributes": { + "clip_before_nms": true, + "clip_after_nms": false }, "id": "ObjectDetectionAPIProposalReplacement", "include_inputs_to_sub_graph": true, @@ -57,6 +59,8 @@ }, { "custom_attributes": { + "clip_before_nms": true, + "clip_after_nms": false, "coordinates_swap_method": "swap_weights" }, "id": "ObjectDetectionAPIDetectionOutputReplacement", @@ -104,5 +108,13 @@ }, "id": "ObjectDetectionAPIOutputReplacement", "match_kind": "general" + }, + { + "custom_attributes": + { + "replacements": [["mul/y", "first_stage_max_proposals"]] + }, + "id": "ObjectDetectionAPIConstValueOverride", + "match_kind": "general" } ] \ No newline at end of file diff --git a/model-optimizer/extensions/front/tf/max_ext.py b/model-optimizer/extensions/front/tf/max_ext.py index 34f1baf..a68fea0 100644 --- a/model-optimizer/extensions/front/tf/max_ext.py +++ b/model-optimizer/extensions/front/tf/max_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/tf/mvn.py b/model-optimizer/extensions/front/tf/mvn.py index 0dd00ee..c03cae9 100644 --- a/model-optimizer/extensions/front/tf/mvn.py +++ b/model-optimizer/extensions/front/tf/mvn.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,7 +20,7 @@ import networkx as nx from extensions.front.squared_difference import SquaredDifference from mo.front.common.replacement import FrontReplacementSubgraph -from mo.graph.graph import Node, replace_node +from mo.graph.graph import Node, Graph from mo.ops.eltwise import Eltwise from mo.ops.op import Op @@ -53,7 +53,7 @@ class MVN(FrontReplacementSubgraph): ('squeeze_variance', 'fbn', {'in': 4}), ]) - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): fbn = match['fbn'] input = fbn.in_node(0) log.debug('Found potential MVN pattern after {} with name {}'.format(input.op, input.name)) @@ -87,8 +87,7 @@ class MVN(FrontReplacementSubgraph): ]), input_beta ]) - - replace_node(fbn, new_subgraph) + fbn.replace_node(new_subgraph) @staticmethod def infer(node: Node): diff --git a/model-optimizer/extensions/front/tf/mvn_unrolled.py b/model-optimizer/extensions/front/tf/mvn_unrolled.py index a73ed49..2c33f52 100644 --- a/model-optimizer/extensions/front/tf/mvn_unrolled.py +++ b/model-optimizer/extensions/front/tf/mvn_unrolled.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,13 +16,11 @@ import logging as log -import networkx as nx - from extensions.front.squared_difference import SquaredDifference from extensions.front.sub import Sub from mo.front.common.replacement import FrontReplacementSubgraph -from mo.graph.graph import Node, replace_node -from mo.ops.div import Div +from mo.graph.graph import Node, Graph +from extensions.front.div import Div from mo.ops.op import Op @@ -57,7 +55,7 @@ class MVNUnrolled(FrontReplacementSubgraph): ]) @staticmethod - def replace_sub_graph(graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(graph: Graph, match: dict): MVN = Op.get_op_class_by_name('MVN') mvn = MVN(graph, dict( @@ -74,7 +72,7 @@ class MVNUnrolled(FrontReplacementSubgraph): new_subgraph = mvn.create_node([match['mean'].in_node(0), mean_reduction, variance_reduction, pow2, eps]) - replace_node(match['truediv'], new_subgraph) + match['truediv'].replace_node(new_subgraph) @staticmethod def infer(node: Node): diff --git a/model-optimizer/extensions/front/tf/mvn_unrolled_test.py b/model-optimizer/extensions/front/tf/mvn_unrolled_test.py index de9618b..11dd640 100644 --- a/model-optimizer/extensions/front/tf/mvn_unrolled_test.py +++ b/model-optimizer/extensions/front/tf/mvn_unrolled_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
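Both MVN replacements above collapse the matched sub-graph into a single MVN operation, i.e. mean-variance normalization. As a NumPy sketch of the semantics (where eps lands relative to the square root depends on the matched graph, so the formula below is only the common form):

import numpy as np

def mvn(x, axes, eps=1e-9):
    # normalize to zero mean and unit variance over the given axes
    mean = x.mean(axis=axes, keepdims=True)
    var = x.var(axis=axes, keepdims=True)
    return (x - mean) / np.sqrt(var + eps)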
diff --git a/model-optimizer/extensions/front/tf/nearest_neighbor_upsampling.py b/model-optimizer/extensions/front/tf/nearest_neighbor_upsampling.py index 23b1f45..d42b73b 100644 --- a/model-optimizer/extensions/front/tf/nearest_neighbor_upsampling.py +++ b/model-optimizer/extensions/front/tf/nearest_neighbor_upsampling.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,12 +16,10 @@ import logging as log -import networkx as nx - from extensions.front.Pack import Pack from extensions.ops.resample import ResampleOp from mo.front.common.replacement import FrontReplacementSubgraph -from mo.graph.graph import replace_node +from mo.graph.graph import Node, Graph class NearestNeighborUpsampling(FrontReplacementSubgraph): @@ -56,7 +54,7 @@ class NearestNeighborUpsampling(FrontReplacementSubgraph): ] ) - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): log.debug('Matched NearestNeighborUpsampling pattern: {}'.format([node.id for node in match.values()])) try: input_height = match['pack_1'].in_node(1).value.item() @@ -73,5 +71,5 @@ class NearestNeighborUpsampling(FrontReplacementSubgraph): 'resample_type': 'caffe.ResampleParameter.NEAREST'}) resample_node = resample_op.create_node([match['op']]) - replace_node(match['reshape_2'], resample_node) + match['reshape_2'].replace_node(resample_node) graph.remove_nodes_from([node.id for node in match.values() if node.id != match['op'].id]) diff --git a/model-optimizer/extensions/front/tf/next_iteration_ext.py b/model-optimizer/extensions/front/tf/next_iteration_ext.py index ceb385c..0968b69 100644 --- a/model-optimizer/extensions/front/tf/next_iteration_ext.py +++ b/model-optimizer/extensions/front/tf/next_iteration_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/next_iteration_ext_test.py b/model-optimizer/extensions/front/tf/next_iteration_ext_test.py index 98e0ab6..0d0455c 100644 --- a/model-optimizer/extensions/front/tf/next_iteration_ext_test.py +++ b/model-optimizer/extensions/front/tf/next_iteration_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/pad_ext.py b/model-optimizer/extensions/front/tf/pad_ext.py index 542d9aa..98aabb5 100644 --- a/model-optimizer/extensions/front/tf/pad_ext.py +++ b/model-optimizer/extensions/front/tf/pad_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/tf/pad_ext_test.py b/model-optimizer/extensions/front/tf/pad_ext_test.py index 138b4f0..f1a9302 100644 --- a/model-optimizer/extensions/front/tf/pad_ext_test.py +++ b/model-optimizer/extensions/front/tf/pad_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/pooling_ext.py b/model-optimizer/extensions/front/tf/pooling_ext.py index 772747c..29fd59c 100644 --- a/model-optimizer/extensions/front/tf/pooling_ext.py +++ b/model-optimizer/extensions/front/tf/pooling_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/pooling_ext_test.py b/model-optimizer/extensions/front/tf/pooling_ext_test.py index a03095e..85a13d0 100644 --- a/model-optimizer/extensions/front/tf/pooling_ext_test.py +++ b/model-optimizer/extensions/front/tf/pooling_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/prelu.py b/model-optimizer/extensions/front/tf/prelu.py index bea37f3..15b13bc 100644 --- a/model-optimizer/extensions/front/tf/prelu.py +++ b/model-optimizer/extensions/front/tf/prelu.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,12 +16,10 @@ import logging as log -import networkx as nx - from extensions.front.sub import Sub from extensions.ops.prelu import PreluOp from mo.front.common.replacement import FrontReplacementSubgraph -from mo.graph.graph import replace_node +from mo.graph.graph import Graph from mo.middle.pattern_match import check_node_usages_out_of_match @@ -49,7 +47,7 @@ class PReLU(FrontReplacementSubgraph): ] ) - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): consumers = [n for n in match if n not in ['mul', 'op', 'add'] and not check_node_usages_out_of_match(match, n)] if consumers: log.warning('PReLU pattern was detected. Non pattern consumers of nodes: "{}" were found. 
Won\'t replace' @@ -57,7 +55,7 @@ class PReLU(FrontReplacementSubgraph): return gamma = match['mul'].in_node(0) if match['mul'].in_node(1).id == match['neg_1'].id else match['mul'].in_node(1) prelu_node = PreluOp(graph, {'name': '{}/PReLU'.format(match['add'].id)}).create_node([match['op'], gamma]) - replace_node(match['add'], prelu_node) + match['add'].replace_node(prelu_node) log.debug('PReLU pattern starting from "{}" was collapsed to "{}"'.format(match['op'].id, prelu_node.id)) @@ -89,7 +87,7 @@ class PReLUWithAbs(FrontReplacementSubgraph): ] ) - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): consumers = [n for n in match if n not in ['mul', 'mul_1', 'op', 'add', 'abs', 'sub'] and not check_node_usages_out_of_match(match, n)] @@ -99,5 +97,5 @@ class PReLUWithAbs(FrontReplacementSubgraph): return gamma = match['mul'].in_node(0) if match['mul'].in_node(1).id == match['sub'].id else match['mul'].in_node(1) prelu_node = PreluOp(graph, {'name': '{}/PReLU'.format(match['add'].id)}).create_node([match['op'], gamma]) - replace_node(match['add'], prelu_node) + match['add'].replace_node(prelu_node) log.debug('PReLUWithAbs pattern starting from "{}" was collapsed to "{}"'.format(match['op'].id, prelu_node.id)) diff --git a/model-optimizer/extensions/front/tf/rank_ext.py b/model-optimizer/extensions/front/tf/rank_ext.py index 71ca94d..7ad44b4 100644 --- a/model-optimizer/extensions/front/tf/rank_ext.py +++ b/model-optimizer/extensions/front/tf/rank_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/resize_bilinear.py b/model-optimizer/extensions/front/tf/resize_bilinear.py index f7670ac..9519b84 100644 --- a/model-optimizer/extensions/front/tf/resize_bilinear.py +++ b/model-optimizer/extensions/front/tf/resize_bilinear.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/resize_nearest_neighbor.py b/model-optimizer/extensions/front/tf/resize_nearest_neighbor.py index 0b8b8ec..f86ad58 100644 --- a/model-optimizer/extensions/front/tf/resize_nearest_neighbor.py +++ b/model-optimizer/extensions/front/tf/resize_nearest_neighbor.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/reverse_sequence.py b/model-optimizer/extensions/front/tf/reverse_sequence.py index 2c6491f..75b9d25 100644 --- a/model-optimizer/extensions/front/tf/reverse_sequence.py +++ b/model-optimizer/extensions/front/tf/reverse_sequence.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,8 +14,8 @@ limitations under the License. 
""" -from mo.front.extractor import FrontExtractorOp from extensions.ops.reverse_sequence import ReverseSequence +from mo.front.extractor import FrontExtractorOp class ReverseSequenceFrontExtractor(FrontExtractorOp): @@ -24,8 +24,11 @@ class ReverseSequenceFrontExtractor(FrontExtractorOp): @staticmethod def extract(node): + if node.has_valid('seq_dim'): + return + ReverseSequence.update_node_stat(node, { - 'seq_dim': node.pb.attr['seq_dim'], - 'batch_dim': node.pb.attr['batch_dim'], + 'seq_axis': node.pb.attr['seq_dim'].i, + 'batch_axis': node.pb.attr['batch_dim'].i, }) return __class__.enabled diff --git a/model-optimizer/extensions/front/tf/reverse_v2.py b/model-optimizer/extensions/front/tf/reverse_v2.py index 6254d23..02241ff 100644 --- a/model-optimizer/extensions/front/tf/reverse_v2.py +++ b/model-optimizer/extensions/front/tf/reverse_v2.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,9 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. """ - +from extensions.ops.Reverse import Reverse from mo.front.extractor import FrontExtractorOp -from extensions.ops.reverse_sequence import ReverseSequence class ReverseV2FrontExtractor(FrontExtractorOp): @@ -24,5 +23,5 @@ class ReverseV2FrontExtractor(FrontExtractorOp): @staticmethod def extract(node): - ReverseSequence.update_node_stat(node) + Reverse.update_node_stat(node) return __class__.enabled diff --git a/model-optimizer/extensions/front/tf/rfcn_support.json b/model-optimizer/extensions/front/tf/rfcn_support.json index 3f612fa..2e250c0 100644 --- a/model-optimizer/extensions/front/tf/rfcn_support.json +++ b/model-optimizer/extensions/front/tf/rfcn_support.json @@ -36,6 +36,9 @@ }, { "custom_attributes": { + "clip_before_nms": true, + "clip_after_nms": false, + "do_not_swap_proposals": true }, "id": "ObjectDetectionAPIProposalReplacement", "include_inputs_to_sub_graph": true, @@ -57,7 +60,10 @@ }, { "custom_attributes": { - "coordinates_swap_method": "add_convolution" + "clip_before_nms": true, + "clip_after_nms": false, + "coordinates_swap_method": "add_convolution", + "swap_proposals": true }, "id": "ObjectDetectionAPIDetectionOutputReplacement", "inputs": [ diff --git a/model-optimizer/extensions/front/tf/rfcn_support_api_v1.10.json b/model-optimizer/extensions/front/tf/rfcn_support_api_v1.10.json new file mode 100644 index 0000000..c0ed3be --- /dev/null +++ b/model-optimizer/extensions/front/tf/rfcn_support_api_v1.10.json @@ -0,0 +1,145 @@ +[ + { + "custom_attributes": {}, + "id": "ObjectDetectionAPIPreprocessorReplacement", + "inputs": [ + [ + { + "node": "map/Shape$", + "port": 0 + }, + { + "node": "map/TensorArrayUnstack/Shape$", + "port": 0 + }, + { + "node": "map/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3$", + "port": 2 + } + ] + ], + "instances": [ + ".*Preprocessor/" + ], + "match_kind": "scope", + "outputs": [ + { + "node": "sub$", + "port": 0 + }, + { + "node": "map/TensorArrayStack_1/TensorArrayGatherV3$", + "port": 0 + } + ] + }, + { + "custom_attributes": { + "clip_before_nms": false, + "clip_after_nms": true + }, + "id": "ObjectDetectionAPIProposalReplacement", + "include_inputs_to_sub_graph": true, + "include_outputs_to_sub_graph": true, + "instances": { + "end_points": [ + "map/TensorArrayStack/TensorArrayGatherV3", + 
"map_1/TensorArrayStack/TensorArrayGatherV3", + "BatchMultiClassNonMaxSuppression/map/TensorArrayStack_4/TensorArrayGatherV3" + ], + "start_points": [ + "FirstStageBoxPredictor/Reshape", + "FirstStageBoxPredictor/Reshape_1", + "GridAnchorGenerator/Identity", + "Shape" + ] + }, + "match_kind": "points" + }, + { + "custom_attributes": { + "clip_before_nms": false, + "clip_after_nms": true, + "coordinates_swap_method": "add_convolution" + }, + "id": "ObjectDetectionAPIDetectionOutputReplacement", + "inputs": [ + [ + { + "node": "Reshape$", + "port": 0 + } + ], + [ + { + "node": "Reshape_1$", + "port": 0 + } + ], + [ + { + "node": "ExpandDims$", + "port": 0 + } + ] + ], + "instances": [ + ".*SecondStagePostprocessor/" + ], + "match_kind": "scope", + "outputs": [ + { + "node": "BatchMultiClassNonMaxSuppression/map/TensorArrayStack/TensorArrayGatherV3$", + "port": 0 + } + ] + }, + { + "custom_attributes": {}, + "id": "ObjectDetectionAPIPSROIPoolingReplacement", + "inputs": [ + [ + { + "node": "Shape$", + "port": 0 + }, + { + "node": "TensorArrayUnstack/Shape$", + "port": 0 + }, + { + "node": "TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3$", + "port": 2 + } + ], + [ + { + "node": "TensorArrayUnstack_1/TensorArrayScatter/TensorArrayScatterV3$", + "port": 2 + }, + { + "node": "TensorArrayUnstack_1/Shape$", + "port": 0 + } + ] + ], + "instances": [ + "SecondStageBoxPredictor/map/", + "SecondStageBoxPredictor/map_1/" + ], + "match_kind": "scope", + "outputs": [ + { + "node": "TensorArrayStack/TensorArrayGatherV3$", + "port": 0 + } + ] + }, + { + "custom_attributes": { + "outputs": "detection_boxes" + }, + "id": "ObjectDetectionAPIOutputReplacement", + "match_kind": "general" + } +] diff --git a/model-optimizer/extensions/front/tf/shape_ext.py b/model-optimizer/extensions/front/tf/shape_ext.py new file mode 100644 index 0000000..1a6c0d7 --- /dev/null +++ b/model-optimizer/extensions/front/tf/shape_ext.py @@ -0,0 +1,31 @@ +""" + Copyright (c) 2018-2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import numpy as np + +from mo.front.extractor import FrontExtractorOp +from mo.front.tf.extractors.utils import tf_dtype_extractor +from mo.graph.graph import Node +from mo.ops.shape import Shape + + +class ShapeExtractor(FrontExtractorOp): + op = 'Shape' + enabled = True + + @staticmethod + def extract(node: Node): + Shape.update_node_stat(node, {'data_type': tf_dtype_extractor(node.pb.attr['out_type'].type, np.int32)}) + return __class__.enabled diff --git a/model-optimizer/extensions/front/tf/slice_ext.py b/model-optimizer/extensions/front/tf/slice_ext.py index 54881b3..ab9d053 100644 --- a/model-optimizer/extensions/front/tf/slice_ext.py +++ b/model-optimizer/extensions/front/tf/slice_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/front/tf/softmax_ext.py b/model-optimizer/extensions/front/tf/softmax_ext.py index 8891b5f..6f0b029 100644 --- a/model-optimizer/extensions/front/tf/softmax_ext.py +++ b/model-optimizer/extensions/front/tf/softmax_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/split_ext.py b/model-optimizer/extensions/front/tf/split_ext.py index e316a81..e713c80 100644 --- a/model-optimizer/extensions/front/tf/split_ext.py +++ b/model-optimizer/extensions/front/tf/split_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/sqrt_ext.py b/model-optimizer/extensions/front/tf/sqrt_ext.py index 0886316..d68c270 100644 --- a/model-optimizer/extensions/front/tf/sqrt_ext.py +++ b/model-optimizer/extensions/front/tf/sqrt_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/square_ext.py b/model-optimizer/extensions/front/tf/square_ext.py index 6a3e939..457c82e 100644 --- a/model-optimizer/extensions/front/tf/square_ext.py +++ b/model-optimizer/extensions/front/tf/square_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/stop_gradient_ext.py b/model-optimizer/extensions/front/tf/stop_gradient_ext.py index fd166a7..a7320e5 100644 --- a/model-optimizer/extensions/front/tf/stop_gradient_ext.py +++ b/model-optimizer/extensions/front/tf/stop_gradient_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/stop_gradient_ext_test.py b/model-optimizer/extensions/front/tf/stop_gradient_ext_test.py index 6030393..6b1f7c7 100644 --- a/model-optimizer/extensions/front/tf/stop_gradient_ext_test.py +++ b/model-optimizer/extensions/front/tf/stop_gradient_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/sum_ext.py b/model-optimizer/extensions/front/tf/sum_ext.py new file mode 100644 index 0000000..6394dd9 --- /dev/null +++ b/model-optimizer/extensions/front/tf/sum_ext.py @@ -0,0 +1,28 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +from mo.front.extractor import FrontExtractorOp +from mo.graph.graph import Node +from mo.ops.reduce import Reduce + + +class SumFrontExtractor(FrontExtractorOp): + op = 'Sum' + enabled = True + + @staticmethod + def extract(node: Node): + Reduce.update_node_stat(node, {'keep_dims': node.pb.attr["keep_dims"].b, 'reduce_type': 'sum'}) + return __class__.enabled diff --git a/model-optimizer/extensions/front/tf/tensorflow_custom_operations_config_update.py b/model-optimizer/extensions/front/tf/tensorflow_custom_operations_config_update.py new file mode 100644 index 0000000..bf2e551 --- /dev/null +++ b/model-optimizer/extensions/front/tf/tensorflow_custom_operations_config_update.py @@ -0,0 +1,61 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import json + +from mo.front.common.replacement import FrontReplacementPattern +from mo.graph.graph import Graph +from mo.utils.custom_replacement_config import parse_custom_replacement_config_file +from mo.utils.error import Error +from mo.utils.utils import refer_to_faq_msg + + +class TensorflowCustomOperationsConfigUpdate(FrontReplacementPattern): + enabled = True + graph_condition = [lambda graph: graph.graph['cmd_params'].tensorflow_custom_operations_config_update is not None] + + def run_before(self): + return [] + + def run_after(self): + from extensions.front.freeze_placeholder_value import FreezePlaceholderValue + return [FreezePlaceholderValue] + + @staticmethod + def save_custom_replacement_config_file(descriptions: list, file_name: str): + """ + Save custom layer(s) description(s) to the file. + :param file_name: file to save description information to. + :param descriptions: list with instances of the CustomLayerDescriptor classes. + :return: None; an Error is raised if the file cannot be written. + """ + try: + json.dump([replacement_desc.get_config_file_representation() for replacement_desc in descriptions], + open(file_name, "w"), indent=4, sort_keys=True) + except Exception as ex: + raise Error("failed to update configuration file {}: {}".format(file_name, str(ex))) + + def find_and_replace_pattern(self, graph: Graph): + argv = graph.graph['cmd_params'] + file_name = argv.tensorflow_custom_operations_config_update + + data = parse_custom_replacement_config_file(file_name) + if data is None: + raise Error("Cannot update the file '{}' because it is broken. 
".format(file_name) + refer_to_faq_msg(73)) + + for replacement_desc in data: + replacement_desc.update_custom_replacement_attributes(graph) + + self.save_custom_replacement_config_file(data, file_name) diff --git a/model-optimizer/extensions/front/tf/tensorflow_patterns.py b/model-optimizer/extensions/front/tf/tensorflow_patterns.py new file mode 100644 index 0000000..c4e5673 --- /dev/null +++ b/model-optimizer/extensions/front/tf/tensorflow_patterns.py @@ -0,0 +1,51 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import mo.front.tf.custom_subgraph_call as csc +from mo.front.common.replacement import FrontReplacementPattern +from mo.graph.graph import Graph + + +class TensorflowSubgraphPatterns(FrontReplacementPattern): + enabled = True + graph_condition = [lambda graph: graph.graph['cmd_params'].tensorflow_subgraph_patterns is not None] + + def run_before(self): + return [] + + def run_after(self): + from extensions.front.tf.tensorflow_custom_operations_config_update import \ + TensorflowCustomOperationsConfigUpdate + return [TensorflowCustomOperationsConfigUpdate] + + def find_and_replace_pattern(self, graph: Graph): + argv = graph.graph['cmd_params'] + csc.replace_subgraph_calls(graph, argv.tensorflow_subgraph_patterns) + + +class TensorflowOperationPatterns(FrontReplacementPattern): + enabled = True + graph_condition = [lambda graph: graph.graph['cmd_params'].tensorflow_operation_patterns is not None] + + def run_before(self): + from extensions.front.tf.tensorflow_use_custom_operations_config import TensorflowUseCustomOperationsConfig + return [TensorflowUseCustomOperationsConfig] + + def run_after(self): + return [TensorflowSubgraphPatterns] + + def find_and_replace_pattern(self, graph: Graph): + argv = graph.graph['cmd_params'] + csc.offload_operations_to_tf(graph, argv.tensorflow_operation_patterns) diff --git a/model-optimizer/extensions/front/tf/tensorflow_use_custom_operations_config.py b/model-optimizer/extensions/front/tf/tensorflow_use_custom_operations_config.py new file mode 100644 index 0000000..8438657 --- /dev/null +++ b/model-optimizer/extensions/front/tf/tensorflow_use_custom_operations_config.py @@ -0,0 +1,44 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +from mo.front.common.custom_replacement_registry import CustomReplacementRegistry +from mo.front.common.replacement import FrontReplacementPattern +from mo.front.tf.replacement import FrontReplacementFromConfigFileOp +from mo.graph.graph import Graph +from mo.utils.class_registration import update_registration, get_enabled_and_disabled_transforms + + +class TensorflowUseCustomOperationsConfig(FrontReplacementPattern): + enabled = True + graph_condition = [lambda graph: graph.graph['cmd_params'].tensorflow_use_custom_operations_config is not None] + + def run_before(self): + from extensions.front.pass_separator import FrontStart + return [FrontStart] + + def run_after(self): + return [] + + def find_and_replace_pattern(self, graph: Graph): + argv = graph.graph['cmd_params'] + registry = CustomReplacementRegistry() + registry.add_custom_replacement_description_from_config(argv.tensorflow_use_custom_operations_config) + + # automatically generate sub-classes for custom replacements that replace sub-graph with a single node + for replacement_desc in registry.get_all_replacements_descriptions(): + if replacement_desc.has('op'): + type('FrontReplacementFromConfigFileOp' + replacement_desc.op, (FrontReplacementFromConfigFileOp,), + {'replacement_id': replacement_desc.id}) + update_registration([FrontReplacementFromConfigFileOp], *get_enabled_and_disabled_transforms()) diff --git a/model-optimizer/extensions/front/tf/tile_ext.py b/model-optimizer/extensions/front/tf/tile_ext.py index 7f8e861..1d745b5 100644 --- a/model-optimizer/extensions/front/tf/tile_ext.py +++ b/model-optimizer/extensions/front/tf/tile_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/variable_ext.py b/model-optimizer/extensions/front/tf/variable_ext.py index 7f4c270..2028000 100644 --- a/model-optimizer/extensions/front/tf/variable_ext.py +++ b/model-optimizer/extensions/front/tf/variable_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/front/tf/variables_values_freezing.py b/model-optimizer/extensions/front/tf/variables_values_freezing.py new file mode 100644 index 0000000..c9be92e --- /dev/null +++ b/model-optimizer/extensions/front/tf/variables_values_freezing.py @@ -0,0 +1,36 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +from mo.front.common.replacement import FrontReplacementPattern +from mo.front.tf.loader import variables_to_constants +from mo.graph.graph import Graph + + +class VariablesToConstants(FrontReplacementPattern): + enabled = True + force_clean_up = True + graph_condition = [lambda graph: graph.graph['variables_values']] + + def run_after(self): + from extensions.front.input_cut import InputCut + return [InputCut] + + def run_before(self): + from extensions.front.freeze_placeholder_value import FreezePlaceholderValue + return [FreezePlaceholderValue] + + def find_and_replace_pattern(self, graph: Graph): + variables_to_constants(graph, graph.graph['variables_values']) + del graph.graph['variables_values'] diff --git a/model-optimizer/extensions/front/tf/yolo_v3_tiny.json b/model-optimizer/extensions/front/tf/yolo_v3_tiny.json new file mode 100644 index 0000000..76f0a39 --- /dev/null +++ b/model-optimizer/extensions/front/tf/yolo_v3_tiny.json @@ -0,0 +1,14 @@ +[ + { + "id": "TFYOLOV3", + "match_kind": "general", + "custom_attributes": { + "classes": 80, + "anchors": [10, 14, 23, 27, 37, 58, 81, 82, 135, 169, 344, 319], + "coords": 4, + "num": 6, + "mask": [0, 1, 2], + "entry_points": ["detector/yolo-v3-tiny/Reshape", "detector/yolo-v3-tiny/Reshape_4"] + } + } +] \ No newline at end of file diff --git a/model-optimizer/extensions/front/user_data_repack.py b/model-optimizer/extensions/front/user_data_repack.py new file mode 100644 index 0000000..2e6b88b --- /dev/null +++ b/model-optimizer/extensions/front/user_data_repack.py @@ -0,0 +1,42 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +from mo.front.common.replacement import FrontReplacementPattern +from mo.front.extractor import user_data_repack +from mo.graph.graph import Graph + + +class UserDataRepack(FrontReplacementPattern): + enabled = True + + def run_after(self): + return [] + + def run_before(self): + return [] + + def find_and_replace_pattern(self, graph: Graph): + argv = graph.graph['cmd_params'] + + packed_user_shapes, packed_outputs, freeze_placeholder = user_data_repack( + graph, argv.placeholder_shapes, argv.output, argv.freeze_placeholder_with_value) + + graph.graph['user_shapes'] = packed_user_shapes + graph.graph['packed_outputs'] = packed_outputs + graph.graph['freeze_placeholder'] = freeze_placeholder + + inputs = list(packed_user_shapes.keys()) \ + if packed_user_shapes is not None and isinstance(packed_user_shapes, dict) else None + graph.graph['inputs'] = inputs # save user defined inputs for other extensions diff --git a/model-optimizer/extensions/middle/AddIsCyclicAttribute.py b/model-optimizer/extensions/middle/AddIsCyclicAttribute.py index c2616ad..d70495d 100644 --- a/model-optimizer/extensions/middle/AddIsCyclicAttribute.py +++ b/model-optimizer/extensions/middle/AddIsCyclicAttribute.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,11 +15,22 @@ """ import networkx as nx + +from mo.graph.graph import Graph from mo.middle.replacement import MiddleReplacementPattern class AddIsCyclicAttribute(MiddleReplacementPattern): + enabled = True + + def run_after(self): + from extensions.middle.DeleteControlFlowEdges import DeleteControlFlowEdges + return [DeleteControlFlowEdges] + + def run_before(self): + return [] + @staticmethod - def find_and_replace_pattern(graph: nx.MultiDiGraph): + def find_and_replace_pattern(graph: Graph): is_acyclic = nx.is_directed_acyclic_graph(graph) - graph.graph['is_cyclic'] = not is_acyclic \ No newline at end of file + graph.graph['is_cyclic'] = not is_acyclic diff --git a/model-optimizer/extensions/middle/AddIsCyclicAttribute_test.py b/model-optimizer/extensions/middle/AddIsCyclicAttribute_test.py index 81f4ba7..ddc7b4c 100644 --- a/model-optimizer/extensions/middle/AddIsCyclicAttribute_test.py +++ b/model-optimizer/extensions/middle/AddIsCyclicAttribute_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/middle/AddMeanScaleValues.py b/model-optimizer/extensions/middle/AddMeanScaleValues.py new file mode 100644 index 0000000..a72a9ad --- /dev/null +++ b/model-optimizer/extensions/middle/AddMeanScaleValues.py @@ -0,0 +1,122 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +import numpy as np + +from mo.graph.graph import Graph, Node +from mo.middle.replacement import MiddleReplacementPattern +from mo.ops.lin_op import Add, Mul +from mo.ops.op import Op +from mo.utils.error import Error +from mo.utils.utils import refer_to_faq_msg + + +class AddMeanScaleValues(MiddleReplacementPattern): + enabled = True + + def run_after(self): + return [] + + def run_before(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + @staticmethod + def apply_scale(graph: Graph, input_node: Node, node_mean_scale_values: dict): + if 'scale' in node_mean_scale_values and node_mean_scale_values['scale'] is not None: + if all([x == 1 for x in node_mean_scale_values['scale']]): + return + out_node = input_node.out_node() + if not input_node.has_valid('shape'): + raise Error("Node {} has not valid shape attribute".format(input_node.id)) + input_shape = input_node.shape + + # Create Mul node + value = 1 / np.array(node_mean_scale_values['scale']) + graph.remove_edge(input_node.id, out_node.id) + + mul_node = Mul(graph, dict(name="Mul_")) + mul_data = Op.create_input_data_node(graph, "data_mul_", np.array(value)) + Op.expand_node_shape(mul_data, (len(input_shape) - 2 if graph.graph['layout'] == 'NCHW' else 0)) + mul_input = Op.create_data_node(graph, input_node, {'shape': out_node.shape}) + + mul_node.create_node_with_data(inputs=[mul_input, mul_data], data_nodes=out_node) + + @staticmethod + def apply_mean_value(graph: Graph, input_node: Node, node_mean_scale_values: dict): + if 'mean' in node_mean_scale_values and node_mean_scale_values['mean'] is not None: + if all([x == 0 for x in node_mean_scale_values['mean']]): + return + out_node = input_node.out_node() + if not input_node.has_valid('shape'): + raise Error("Node {} has not valid shape attribute".format(input_node.id)) + input_shape = input_node.shape + # Create Add node + graph.remove_edge(input_node.id, out_node.id) + + value = np.array(node_mean_scale_values['mean']) * (-1) + + add_node = Add(graph, dict(name="Add_")) + add_data = Op.create_input_data_node(graph, "data_add_", np.array(value)) + Op.expand_node_shape(add_data, (len(input_shape) - 2 if graph.graph['layout'] == 'NCHW' else 0)) + add_input = Op.create_data_node(graph, input_node, {'shape': out_node.shape}) + + add_node.create_node_with_data(inputs=[add_input, add_data], data_nodes=out_node) + + def find_and_replace_pattern(self, graph: Graph): + input_nodes = {} + values = graph.graph['cmd_params'].mean_scale_values + for node in graph.nodes(): + node = Node(graph, node) + if node.has_valid('op') and node.op == 'Placeholder': + input_nodes.update({node.id: node}) + + if not isinstance(values, dict): + if len(values) != len(input_nodes): + raise Error('Numbers of inputs and mean/scale values do not match. ' + + refer_to_faq_msg(61)) + + data = np.copy(values) + values = {} + for idx, key in enumerate(input_nodes.keys()): + values.update( + { + input_nodes[key]['name']: { + 'mean': data[idx][0], + 'scale': data[idx][1] + } + } + ) + + for node_name in values: + node_id = graph.get_node_id_by_name(node_name) + node_mean_scale_values = values[node_name] + if node_id not in input_nodes: + # if the user cutted-off input of the network then input node name specified in the --scale_values + # or --mean_values doesn't correspond to a real input node generated by Model Optimizer. 
But + # the information about initial input node name is stored in Placeholder's attribute 'initial_node_name' + new_node_id = None + for placeholder in input_nodes.values(): + if placeholder.has('initial_node_name') and placeholder.initial_node_name == node_name: + new_node_id = placeholder.id + break + if new_node_id is None: + raise Error('Input with name {} wasn\'t found!'.format(node_name) + + refer_to_faq_msg(83)) + node_id = new_node_id + + input_node = Node(graph, node_id) + AddMeanScaleValues.apply_scale(graph, input_node, node_mean_scale_values) + AddMeanScaleValues.apply_mean_value(graph, input_node, node_mean_scale_values) diff --git a/model-optimizer/extensions/middle/AddMeanScaleValues_test.py b/model-optimizer/extensions/middle/AddMeanScaleValues_test.py new file mode 100644 index 0000000..0cfa318 --- /dev/null +++ b/model-optimizer/extensions/middle/AddMeanScaleValues_test.py @@ -0,0 +1,252 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import unittest +from argparse import Namespace + +import numpy as np + +from extensions.middle.AddMeanScaleValues import AddMeanScaleValues +from mo.graph.graph import Node +from mo.utils.cli_parser import get_mean_scale_dictionary, parse_tuple_pairs +from mo.utils.unittest.graph import build_graph + +nodes_attributes = {'node_1': {'type': 'Identity', 'value': None, 'kind': 'op'}, + 'node_1_data': {'value': None, 'kind': 'data', 'data_type': None}, + 'node_2': {'type': 'Identity', 'value': None, 'kind': 'op'}, + 'concat': {'type': 'Concat', 'value': None, 'kind': 'op'}, + 'node_3': {'type': 'Identity', 'value': None, 'kind': 'op'}, + 'node_3_data': {'value': None, 'kind': 'data', 'data_type': None}, + # Placeholders + 'placeholder_1': {'shape': None, 'type': 'Input', 'kind': 'op', 'op': 'Placeholder'}, + 'placeholder_1_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + 'placeholder_2': {'shape': None, 'type': 'Input', 'kind': 'op', 'op': 'Placeholder'}, + 'pl_1': {'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, + 'pl_1_data': {'value': None, 'kind': 'data', 'data_type': None}, + 'pl_2': {'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, + 'pl_2_data': {'value': None, 'kind': 'data', 'data_type': None}, + 'placeholder_2_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + # ScaleShift layer + 'scaleshift_1': {'type': 'ScaleShift', 'kind': 'op', 'op': 'ScaleShift'}, + 'scaleshift_1_w': {'value': None, 'shape': None, 'kind': 'data'}, + 'scaleshift_1_b': {'value': None, 'shape': None, 'kind': 'data'}, + 'scaleshift_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + # Mul op + 'mul_1': {'type': None, 'kind': 'op', 'op': 'Mul'}, + 'mul_1_w': {'value': None, 'shape': None, 'kind': 'data'}, + 'mul_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'op_output': {'kind': 'op', 'op': 'OpOutput', 'infer': lambda x: None} + } + + +class AddMeanScaleValuesTest(unittest.TestCase): + def 
test_add_mean_scale_values_with_data_name(self):
+        graph = build_graph(nodes_attributes,
+                            [('node_1', 'node_2'),
+                             ('node_2', 'op_output')
+                             ],
+                            {'node_2': {'shape': None, 'data_type': None},
+                             'node_1': {'shape': np.array([1, 3, 227, 227]), 'op': 'Placeholder', 'name': 'data',
+                                        'data_type': None}
+                             },
+                            nodes_with_edges_only=True)
+        graph.graph['layout'] = 'NCHW'
+        mean_values = parse_tuple_pairs('(124,117,104)')
+        scale_values = parse_tuple_pairs('')
+
+        # input = 'data'
+        mean_scale = get_mean_scale_dictionary(mean_values, scale_values, None)
+        argv = Namespace(mean_scale_values=mean_scale)
+        graph.graph['cmd_params'] = argv
+        self.assertEqual(len(graph), 3)
+        AddMeanScaleValues().find_and_replace_pattern(graph)
+        self.assertEqual(len(graph), 6)
+
+    def test_add_mean_scale_values_without_data_name(self):
+        graph = build_graph(nodes_attributes,
+                            [('node_1', 'node_2'),
+                             ('node_2', 'op_output')
+                             ],
+                            {'node_2': {'shape': None, 'data_type': None},
+                             'node_1': {'shape': np.array([1, 3, 227, 227]), 'op': 'Placeholder', 'name': 'data',
+                                        'data_type': None}
+                             },
+                            nodes_with_edges_only=True)
+        graph.graph['layout'] = 'NCHW'
+        mean_values = parse_tuple_pairs('(124,117,104)')
+        scale_values = parse_tuple_pairs('')
+        # input = None
+        mean_scale = get_mean_scale_dictionary(mean_values, scale_values, None)
+        argv = Namespace(mean_scale_values=mean_scale)
+        graph.graph['cmd_params'] = argv
+        self.assertEqual(len(graph), 3)
+        AddMeanScaleValues().find_and_replace_pattern(graph)
+        self.assertEqual(len(graph), 6)
+
+    def test_add_mean_scale_values1(self):
+        graph = build_graph(nodes_attributes,
+                            [('pl_1', 'pl_1_data'), ('pl_2', 'pl_2_data')],
+                            {'pl_1_data': {'shape': np.array([1, 3, 38, 38]), 'infer': None},
+                             'pl_2_data': {'shape': np.array([1, 6]), 'infer': None},
+                             'pl_1': {'shape': np.array([1, 3, 38, 38])},
+                             'pl_2': {'shape': np.array([1, 6])},
+                             },
+                            nodes_with_edges_only=True)
+        graph.graph['layout'] = 'NCHW'
+        argv = Namespace(
+            mean_scale_values={'pl_1': {'mean': np.array([1., 2., 3.])}, 'pl_2': {'mean': np.array([0., 0., 0.])}})
+        graph.graph['cmd_params'] = argv
+        AddMeanScaleValues().find_and_replace_pattern(graph)
+        mul_op_cnt = 0
+        add_op_cnt = 0
+        for node in graph.nodes():
+            node = Node(graph, node)
+            if node.has_valid('op') and node.op == 'Mul':
+                mul_op_cnt += 1
+            if node.has_valid('op') and node.op == 'Add':
+                add_op_cnt += 1
+
+        self.assertEqual(add_op_cnt, 1, "Expected exactly one Add op in graph")
+        self.assertEqual(mul_op_cnt, 0, "Found Mul op in graph")
+
+    def test_optimize_scale_and_add_mean_values(self):
+        graph = build_graph(
+            nodes_attributes,
+            [
+                ('pl_1', 'pl_1_data')
+            ],
+            {
+                'pl_1_data': {
+                    'shape': np.array([1, 3, 38, 38]),
+                    'infer': None
+                },
+                'pl_1': {
+                    'shape': np.array([1, 3, 38, 38])
+                }
+            },
+            nodes_with_edges_only=True
+        )
+        graph.graph['layout'] = 'NCHW'
+        argv = Namespace(mean_scale_values={'pl_1': {'scale': np.array([1.]), 'mean': np.array([1., 2., 3.])}})
+        graph.graph['cmd_params'] = argv
+        AddMeanScaleValues().find_and_replace_pattern(graph)
+        mul_op_cnt = 0
+        add_op_cnt = 0
+        for node in graph.nodes():
+            node = Node(graph, node)
+            if node.has_valid('op') and node.op == 'Mul':
+                mul_op_cnt += 1
+            if node.has_valid('op') and node.op == 'Add':
+                add_op_cnt += 1
+
+        self.assertEqual(add_op_cnt, 1, "Expected exactly one Add op in graph")
+        self.assertEqual(mul_op_cnt, 0, "Found Mul op in graph")
+
+    def test_optimize_mean_and_add_scale_values(self):
+        graph = build_graph(
+            nodes_attributes,
+            [
+                ('pl_1', 'pl_1_data')
+            ],
+            {
+                'pl_1_data': {
+                    'shape': np.array([1, 3, 38, 38]),
+                    'infer': None
+                },
+                'pl_1': {
+                    'shape': np.array([1, 3, 38, 38])
+                }
+            },
+            nodes_with_edges_only=True
+        )
+        graph.graph['layout'] = 'NCHW'
+        argv = Namespace(mean_scale_values={'pl_1': {'scale': np.array([1.43]), 'mean': np.array([0., 0., 0.])}})
+        graph.graph['cmd_params'] = argv
+        AddMeanScaleValues().find_and_replace_pattern(graph)
+        mul_op_cnt = 0
+        add_op_cnt = 0
+        for node in graph.nodes():
+            node = Node(graph, node)
+            if node.has_valid('op') and node.op == 'Mul':
+                mul_op_cnt += 1
+            if node.has_valid('op') and node.op == 'Add':
+                add_op_cnt += 1
+
+        self.assertEqual(add_op_cnt, 0, "Found Add op in graph")
+        self.assertEqual(mul_op_cnt, 1, "Expected exactly one Mul op in graph")
+
+    def test_add_mean_scale_values3(self):
+        graph = build_graph(nodes_attributes,
+                            [('pl_1', 'pl_1_data')],
+                            {'pl_1_data': {'shape': np.array([1, 3, 38, 38]), 'infer': None},
+                             'pl_1': {'shape': np.array([1, 3, 38, 38])},
+                             },
+                            nodes_with_edges_only=True)
+        graph.graph['layout'] = 'NCHW'
+        argv = Namespace(mean_scale_values=[[np.array([1., 2., 3.]), np.array([1., 2., 3.])]])
+        graph.graph['cmd_params'] = argv
+        AddMeanScaleValues().find_and_replace_pattern(graph)
+
+        mul_op_cnt = 0
+        add_op_cnt = 0
+        for node in graph.nodes():
+            node = Node(graph, node)
+            if node.has_valid('op') and node.op == 'Mul':
+                mul_op_cnt += 1
+            if node.has_valid('op') and node.op == 'Add':
+                add_op_cnt += 1
+
+        self.assertEqual(add_op_cnt, 1, "Expected exactly one Add op in graph")
+        self.assertEqual(mul_op_cnt, 1, "Expected exactly one Mul op in graph")
+
+    def test_add_mean_scale_values_cut_graph(self):
+        """
+        Test case when the user cut off the start of the network and specified a mean/scale value for the new
+        input node 'node_3'.
+ """ + graph = build_graph(nodes_attributes, + [('pl_1', 'pl_1_data'), + ('pl_2', 'pl_2_data'), + ('pl_2_data', 'node_3'), + ('node_3', 'node_3_data'), + ('pl_1_data', 'node_1'), + ('node_3_data', 'node_1'), + ], + {'pl_1_data': {'shape': np.array([1, 3, 38, 38]), 'infer': None}, + 'pl_2_data': {'shape': np.array([1, 3, 38, 38]), 'infer': None}, + 'pl_2': {'initial_node_name': 'node_3', 'shape': np.array([1, 3, 38, 38])}, + 'pl_1': {'shape': np.array([1, 3, 38, 38])}, + }, + nodes_with_edges_only=True) + graph.graph['layout'] = 'NCHW' + argv = Namespace( + mean_scale_values={'pl_1': {'mean': np.array([1, 2, 3])}, 'node_3': {'scale': np.array([1, 2, 3])}}) + graph.graph['cmd_params'] = argv + AddMeanScaleValues().find_and_replace_pattern(graph) + + mul_op_cnt = 0 + add_op_cnt = 0 + for node in graph.nodes(): + node = Node(graph, node) + if node.has_valid('op') and node.op == 'Mul': + mul_op_cnt += 1 + if node.has_valid('op') and node.op == 'Add': + add_op_cnt += 1 + + self.assertEqual(add_op_cnt, 1, "There should be exactly one Add op") + self.assertEqual(mul_op_cnt, 1, "There should be exactly one Mul op") + self.assertEqual(Node(graph, 'pl_2').out_node().out_node().op, 'Mul', "The Mul op should be added after pl_2") + self.assertEqual(Node(graph, 'pl_1').out_node().out_node().op, 'Add', "The Add op should be added after pl_1") diff --git a/model-optimizer/extensions/middle/AddQuantizeFuse.py b/model-optimizer/extensions/middle/AddQuantizeFuse.py new file mode 100644 index 0000000..7dcbc3e --- /dev/null +++ b/model-optimizer/extensions/middle/AddQuantizeFuse.py @@ -0,0 +1,80 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import logging as log + +from typing import Dict + +from mo.graph.graph import Graph, Node +from mo.middle.passes.conv import get_tensor_in_port, get_value_in_port +from mo.middle.replacement import MiddleReplacementPattern + + +class AddQuantizeFuse(MiddleReplacementPattern): + """ Fuses Add --> Quantize sequence if possible + """ + enabled = False + + def run_after(self): + return [] + + def run_before(self): + return [] + + def pattern(self): + return dict( + nodes=[ + ('preop', dict(op='Add')), + ('preoped', dict()), + ('quantize', dict(op='Quantize')), + ], + edges=[ + ('preop', 'preoped'), + ('preoped', 'quantize', {'in': 0}), + ] + ) + + def replace_pattern(self, graph: Graph, match: Dict[str, Node]): + + quantize = match['quantize'] + preop = match['preop'] + + # Check for total number of Add consumers -- if something else consume its output it cannot be fused + if len(preop.out_node().out_nodes()) > 1: + log.debug('AddQuantizeFuse: cannot fuse because Add have Addtiple consumers') + return + + # If the fusion is applicable, direct modifications to quantize 1-st and 2-nd inputs + # are performed. So the data nodes at those inputs shouldn't have more than 1 consumer + # maximum 2 consumers to the same quantize op (consumed by 1st and 2nd ports). 
+        # TODO: relax this limitation and duplicate data nodes accordingly to modify the input range freely
+
+        # Provisional limitation that is related to binary quantization
+        # TODO: Relax it beyond the binarization case
+        if len(quantize.in_node(1).out_nodes()) != 1 or \
+                len(quantize.in_node(2).out_nodes()) != 1 or \
+                len(quantize.in_node(3).out_nodes()) != 1 or len(quantize.in_node(4).out_nodes()) != 1 or \
+                quantize.levels != 2:
+            log.debug('AddQuantizeFuse: cannot fuse because Quantize op has '
+                      'unexpected number of consumers for ports 1, 2, 3 or 4')
+            return
+
+        tensor_port, value_port = get_tensor_in_port(preop), get_value_in_port(preop)
+
+        quantize.in_port(1).data.set_value(quantize.in_port(1).data.get_value() - value_port.data.get_value())
+        quantize.in_port(2).data.set_value(quantize.in_port(2).data.get_value() - value_port.data.get_value())
+        quantize.in_port(0).disconnect()
+        tensor_port.get_connection().set_destination(quantize.in_port(0))
diff --git a/model-optimizer/extensions/middle/AddReshapeAfterStridedSlice.py b/model-optimizer/extensions/middle/AddReshapeAfterStridedSlice.py
deleted file mode 100644
index 2ed08ff..0000000
--- a/model-optimizer/extensions/middle/AddReshapeAfterStridedSlice.py
+++ /dev/null
@@ -1,124 +0,0 @@
-"""
- Copyright (c) 2018 Intel Corporation
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-
-import logging as log
-import networkx as nx
-import numpy as np
-
-from copy import deepcopy
-from extensions.middle.UselessStridedSlice import UselessStridedSliceEraser
-
-from mo.middle.replacement import MiddleReplacementPattern
-from mo.ops.op import Op
-from mo.ops.reshape import Reshape
-
-
-class AddReshapeAfterStridedSlice(MiddleReplacementPattern):
-    """
-    Transform adds Reshape after StridedSlice layers if new_axis_mask or/and
-    shrink_axis_mask contains True.
After this transform StridedSlice layer - does not change shape dims and new_axis_mask/shrink_axis_mask fulfilled by - False - """ - enabled = True - - # Run before passes that will convert/remove StridedSlice - def run_before(self): - return [UselessStridedSliceEraser] - - def pattern(self): - return dict(nodes=[('strided_slice', dict(kind='op', op='StridedSlice'))], - edges=[]) - - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): - # add Reshape for shrink_axis_mask - if True in match['strided_slice']['shrink_axis_mask']: - log.info("StridedSlice op with shrink mask '{}' has been detected".format(match['strided_slice'].id)) - node = match['strided_slice'] - - if len(node.in_nodes()) != 4 or len(node.out_nodes()) != 1: - return - - shape_in = node.in_node().shape - shape_out = node.out_node().shape - dim = shape_out.copy() - ss_shape = [] - k = 0 - - # Don't permute reshape if channels were squeezed - dont_permute = False - if graph.graph['layout'] == 'NHWC' and node['shrink_axis_mask'][-1] == True: - dont_permute = True - - for i in range(0, len(node['shrink_axis_mask'])): - if not node['shrink_axis_mask'][i]: - ss_shape.append(shape_out[k]) - k = k + 1 - else: - node['shrink_axis_mask'][i] = False - ss_shape.append(1) - - out_node = node.out_node(0) - - # insert data node for StridedSlice - data_node = Op._create_data_node(graph, node.name + "/Reshape_shrink_data", {'shape': ss_shape}) - attrs = deepcopy(graph.get_edge_data(node.id, out_node.id)[0]) - graph.remove_edge(node.id, out_node.id) - graph.add_edge(node.id, data_node.id, **attrs) - - # insert Reshape - if dont_permute: - reshape = Reshape(graph, dict(name=node.name + "/Reshape_shrink", - dim=np.array(dim, dtype=np.int64), nchw_layout=True)) - reshape_data_node = reshape.create_node_with_data([data_node], reshape.attrs, - data_nodes=[out_node]) - reshape_data_node['nchw_layout'] = True - else: - reshape = Reshape(graph, dict(name=node.name + "/Reshape_shrink", - dim=np.array(dim, dtype=np.int64))) - reshape_data_node = reshape.create_node_with_data([data_node], reshape.attrs, - data_nodes=[out_node]) - - # add Reshape for new_axis_mask - if True in match['strided_slice']['new_axis_mask']: - log.info("StridedSlice op with new axis mask '{}' has been detected".format(match['strided_slice'].id)) - node = match['strided_slice'] - - if len(node.in_nodes()) != 4 or len(node.out_nodes()) != 1: - return - - shape_in = node.in_node().shape - shape_out = node.out_node().shape - dim = shape_out.copy() - ss_shape = [] - for i in range(0, len(node['new_axis_mask'])): - if not node['new_axis_mask'][i]: - ss_shape.append(shape_out[i]) - else: - node['new_axis_mask'][i] = False - - out_node = node.out_node(0) - # insert data node for StridedSlice - data_node = Op._create_data_node(graph, node.name + "/Reshape_new_data", {'shape': ss_shape}) - attrs = deepcopy(graph.get_edge_data(node.id, out_node.id)[0]) - graph.remove_edge(node.id, out_node.id) - graph.add_edge(node.id, data_node.id, **attrs) - - # insert Reshape - reshape = Reshape(graph, dict(name=node.name + "/Reshape_new", - dim=np.array(dim, dtype=np.int64))) - reshape_data_node = reshape.create_node_with_data([data_node], reshape.attrs, - data_nodes=[out_node]) diff --git a/model-optimizer/extensions/middle/AddReshapeAfterStridedSlice_test.py b/model-optimizer/extensions/middle/AddReshapeAfterStridedSlice_test.py deleted file mode 100644 index a834d99..0000000 --- a/model-optimizer/extensions/middle/AddReshapeAfterStridedSlice_test.py +++ /dev/null @@ -1,312 +0,0 @@ -""" - 
Copyright (c) 2018 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import numpy as np -import unittest - -from extensions.middle.AddReshapeAfterStridedSlice import AddReshapeAfterStridedSlice -from mo.graph.graph import Node -from mo.middle.passes.fusing.fuse_linear_ops_test import compare_graphs -from mo.middle.passes.eliminate_test import build_graph - -# The dictionary with nodes attributes used to build various graphs. A key is the name of the node and the value is the -# dictionary with node attributes. -nodes_attributes_test = { - 'placeholder_1': {'shape': None, 'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, - 'placeholder_1_data': {'shape': None, 'kind': 'data', 'data_type': None}, - 'placeholder_2': {'shape': None, 'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, - 'placeholder_2_data': {'shape': None, 'kind': 'data', 'data_type': None}, - 'placeholder_begin_data': {'shape': None, 'kind': 'data', 'data_type': None}, - 'placeholder_end_data': {'shape': None, 'kind': 'data', 'data_type': None}, - 'placeholder_stride_data': {'shape': None, 'kind': 'data', 'data_type': None}, - # StridedSlice layers - 'sslice_1': {'type': 'StridedSlice', 'kind': 'op', 'op': 'StridedSlice', 'slices': None, - 'shrink_axis_mask': np.array([False, False, True, False]), - 'new_axis_mask': np.array([False, False, False, False])}, - 'sslice_1_data': {'shape': None, 'kind': 'data'}, - 'sslice_2': {'type': 'StridedSlice', 'kind': 'op', 'op': 'StridedSlice', 'slices': None, - 'shrink_axis_mask': np.array([False, False, True, False]), - 'new_axis_mask': np.array([False, False, False, False])}, - 'sslice_2_data': {'shape': None, 'kind': 'data'}} - -nodes_reshape = { - 'placeholder_1': {'shape': None, 'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, - 'placeholder_1_data': {'shape': None, 'kind': 'data', 'data_type': None}, - 'placeholder_2': {'shape': None, 'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, - 'placeholder_2_data': {'shape': None, 'kind': 'data', 'data_type': None}, - 'placeholder_begin_data': {'shape': None, 'kind': 'data', 'data_type': None}, - 'placeholder_end_data': {'shape': None, 'kind': 'data', 'data_type': None}, - 'placeholder_stride_data': {'shape': None, 'kind': 'data', 'data_type': None}, - # StridedSlice layers - 'sslice_1': {'type': 'StridedSlice', 'value': None, 'kind': 'op', 'op': 'StridedSlice', 'slices': None, - 'shrink_axis_mask': np.array([False, False, True, False]), - 'new_axis_mask': np.array([False, False, False, False])}, - 'sslice_1_data': {'value': None, 'shape': None, 'kind': 'data'}, - 'sslice_2': {'type': 'StridedSlice', 'value': None, 'kind': 'op', 'op': 'StridedSlice', 'slices': None, - 'shrink_axis_mask': np.array([False, False, True, False]), - 'new_axis_mask': np.array([False, False, False, False])}, - 'sslice_2_data': {'value': None, 'shape': None, 'kind': 'data'}, - # Reshape layer - 'sslice_1/Reshape_shrink': {'type': 'Reshape', 'value': None, 'kind': 'op', 'op': 'Reshape'}, - 'sslice_1/Reshape_shrink_data': 
{'value': None, 'shape': None, 'kind': 'data'}, - 'sslice_2/Reshape_shrink': {'type': 'Reshape', 'value': None, 'kind': 'op', 'op': 'Reshape'}, - 'sslice_2/Reshape_shrink_data': {'value': None, 'shape': None, 'kind': 'data'}, - 'sslice_2/Reshape_new': {'type': 'Reshape', 'value': None, 'kind': 'op', 'op': 'Reshape'}, - 'sslice_2/Reshape_new_data': {'value': None, 'shape': None, 'kind': 'data'}, -} - - -class AddReshapeAfterStridedSliceTests(unittest.TestCase): - def test_ss_1_shrink_last(self): - graph = build_graph(nodes_attributes_test, - [('placeholder_1', 'placeholder_1_data'), - ('placeholder_1_data', 'sslice_1'), - ('placeholder_begin_data', 'sslice_1'), - ('placeholder_end_data', 'sslice_1'), - ('placeholder_stride_data', 'sslice_1'), - ('sslice_1', 'sslice_1_data')], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, - 'sslice_1': {'slices': np.array( - [slice(0, 1, 1), slice(0, 227, 1), slice(0, 1, 1), slice(0, 54, 1)])}, - 'sslice_1_data': {'shape': np.array([1, 227, 54]), 'is_output': True}, - }) - graph.graph['layout'] = 'NHWC' - - graph_ref = build_graph(nodes_reshape, - [('placeholder_1', 'placeholder_1_data'), - ('placeholder_1_data', 'sslice_1'), - ('placeholder_begin_data', 'sslice_1'), - ('placeholder_end_data', 'sslice_1'), - ('placeholder_stride_data', 'sslice_1'), - ('sslice_1', 'sslice_1/Reshape_shrink_data'), - ('sslice_1/Reshape_shrink_data', 'sslice_1/Reshape_shrink'), - ('sslice_1/Reshape_shrink', 'sslice_1_data')], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, - 'sslice_1': {'slices': np.array( - [slice(0, 1, 1), slice(0, 227, 1), slice(0, 1, 1), slice(0, 54, 1)]), - 'shrink_axis_mask': np.array([False, False, False, False]), - 'new_axis_mask': np.array([False, False, False, False])}, - 'sslice_1_data': {'shape': np.array([1, 227, 54]), 'is_output': True}, - 'sslice_1/Reshape_shrink': {'dim': np.array([1, 227, 54])}, - 'sslice_1/Reshape_shrink_data': {'shape': np.array([1, 227, 1, 54])} - }) - - pattern = AddReshapeAfterStridedSlice() - pattern.find_and_replace_pattern(graph) - - (flag, resp) = compare_graphs(graph, graph_ref, 'sslice_1_data', check_op_attrs=True) - graph.clear() - graph_ref.clear() - self.assertTrue(flag, resp) - - def test_ss_1_shrink(self): - graph = build_graph(nodes_attributes_test, - [('placeholder_1', 'placeholder_1_data'), - ('placeholder_1_data', 'sslice_2'), - ('placeholder_begin_data', 'sslice_2'), - ('placeholder_end_data', 'sslice_2'), - ('placeholder_stride_data', 'sslice_2'), - ('sslice_2', 'sslice_2_data'), - ('sslice_2_data', 'placeholder_2'), - ('placeholder_2', 'placeholder_2_data'), ], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, - 'sslice_2': {'slices': np.array( - [slice(0, 1, 1), slice(0, 227, 1), slice(0, 1, 1), slice(0, 54, 1)]), }, - 'sslice_2_data': {'shape': np.array([1, 227, 54]), 'is_output': True} - }) - graph.graph['layout'] = 'NHWC' - - graph_ref = build_graph(nodes_reshape, - [('placeholder_1', 'placeholder_1_data'), - ('placeholder_1_data', 'sslice_2'), - ('placeholder_begin_data', 'sslice_2'), - ('placeholder_end_data', 'sslice_2'), - ('placeholder_stride_data', 'sslice_2'), - ('sslice_2', 'sslice_2/Reshape_shrink_data'), - ('sslice_2/Reshape_shrink_data', 'sslice_2/Reshape_shrink'), - ('sslice_2/Reshape_shrink', 'sslice_2_data'), - ('sslice_2_data', 'placeholder_2'), - ('placeholder_2', 'placeholder_2_data')], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, - 'sslice_2': {'slices': np.array( - [slice(0, 1, 1), slice(0, 227, 1), slice(0, 1, 
1), slice(0, 54, 1)]), - 'shrink_axis_mask': np.array([False, False, False, False]), - 'new_axis_mask': np.array([False, False, False, False])}, - 'sslice_2_data': {'shape': np.array([1, 227, 54])}, - 'sslice_2/Reshape_shrink': {'dim': np.array([1, 227, 54])}, - 'sslice_2/Reshape_shrink_data': {'shape': np.array([1, 227, 1, 54])}, - }) - - pattern = AddReshapeAfterStridedSlice() - pattern.find_and_replace_pattern(graph) - - (flag, resp) = compare_graphs(graph, graph_ref, 'sslice_2_data', check_op_attrs=True) - graph.clear() - graph_ref.clear() - self.assertTrue(flag, resp) - - def test_ss_2_shrink(self): - graph = build_graph(nodes_attributes_test, - [('placeholder_1', 'placeholder_1_data'), - ('placeholder_1_data', 'sslice_2'), - ('placeholder_begin_data', 'sslice_2'), - ('placeholder_end_data', 'sslice_2'), - ('placeholder_stride_data', 'sslice_2'), - ('sslice_2', 'sslice_2_data'), - ('sslice_2_data', 'placeholder_2'), - ('placeholder_2', 'placeholder_2_data'), ], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, - 'sslice_2': { - 'slices': np.array([slice(0, 1, 1), slice(0, 1, 1), slice(0, 227, 1), slice(0, 1, 1)]), - 'shrink_axis_mask': np.array([False, True, False, True])}, - 'sslice_2_data': {'shape': np.array([1, 227]), 'is_output': True} - }) - graph.graph['layout'] = 'NHWC' - - graph_ref = build_graph(nodes_reshape, - [('placeholder_1', 'placeholder_1_data'), - ('placeholder_1_data', 'sslice_2'), - ('placeholder_begin_data', 'sslice_2'), - ('placeholder_end_data', 'sslice_2'), - ('placeholder_stride_data', 'sslice_2'), - ('sslice_2', 'sslice_2/Reshape_shrink_data'), - ('sslice_2/Reshape_shrink_data', 'sslice_2/Reshape_shrink'), - ('sslice_2/Reshape_shrink', 'sslice_2_data'), - ('sslice_2_data', 'placeholder_2'), - ('placeholder_2', 'placeholder_2_data')], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, - 'sslice_2': {'slices': np.array( - [slice(0, 1, 1), slice(0, 1, 1), slice(0, 227, 1), slice(0, 1, 1)]), - 'shrink_axis_mask': np.array([False, False, False, False]), - 'new_axis_mask': np.array([False, False, False, False])}, - 'sslice_2_data': {'shape': np.array([1, 227])}, - 'sslice_2/Reshape_shrink': {'dim': np.array([1, 227])}, - 'sslice_2/Reshape_shrink_data': {'shape': np.array([1, 1, 227, 1])}, - }) - - pattern = AddReshapeAfterStridedSlice() - pattern.find_and_replace_pattern(graph) - - (flag, resp) = compare_graphs(graph, graph_ref, 'sslice_2_data', check_op_attrs=True) - graph.clear() - graph_ref.clear() - self.assertTrue(flag, resp) - - def test_ss_1_new(self): - graph = build_graph(nodes_attributes_test, - [('placeholder_1', 'placeholder_1_data'), - ('placeholder_1_data', 'sslice_2'), - ('placeholder_begin_data', 'sslice_2'), - ('placeholder_end_data', 'sslice_2'), - ('placeholder_stride_data', 'sslice_2'), - ('sslice_2', 'sslice_2_data'), - ('sslice_2_data', 'placeholder_2'), - ('placeholder_2', 'placeholder_2_data'), ], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, - 'sslice_2': {'slices': np.array( - [slice(0, 1, 1), slice(0, 1, 1), slice(0, 227, 1), slice(0, 227, 1), slice(0, 54, 1)]), - 'shrink_axis_mask': np.array([False, False, False, False, False]), - 'new_axis_mask': np.array([False, True, False, False, False])}, - 'sslice_2_data': {'shape': np.array([1, 1, 227, 227, 54])} - }) - graph.graph['layout'] = 'NHWC' - - graph_ref = build_graph(nodes_reshape, - [('placeholder_1', 'placeholder_1_data'), - ('placeholder_1_data', 'sslice_2'), - ('placeholder_begin_data', 'sslice_2'), - ('placeholder_end_data', 
'sslice_2'), - ('placeholder_stride_data', 'sslice_2'), - ('sslice_2', 'sslice_2/Reshape_new_data'), - ('sslice_2/Reshape_new_data', 'sslice_2/Reshape_new'), - ('sslice_2/Reshape_new', 'sslice_2_data'), - ('sslice_2_data', 'placeholder_2'), - ('placeholder_2', 'placeholder_2_data')], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, - 'sslice_2': {'slices': np.array( - [slice(0, 1, 1), slice(0, 1, 1), slice(0, 227, 1), slice(0, 227, 1), - slice(0, 54, 1)]), - 'shrink_axis_mask': np.array([False, False, False, False, False]), - 'new_axis_mask': np.array([False, False, False, False, False])}, - 'sslice_2_data': {'shape': np.array([1, 1, 227, 227, 54])}, - 'sslice_2/Reshape_new': {'dim': np.array([1, 1, 227, 227, 54])}, - 'sslice_2/Reshape_new_data': {'shape': np.array([1, 227, 227, 54])}, - }) - - pattern = AddReshapeAfterStridedSlice() - pattern.find_and_replace_pattern(graph) - - (flag, resp) = compare_graphs(graph, graph_ref, 'sslice_2_data', check_op_attrs=True) - graph.clear() - graph_ref.clear() - self.assertTrue(flag, resp) - - def test_ss_shrink_new(self): - graph = build_graph(nodes_attributes_test, - [('placeholder_1', 'placeholder_1_data'), - ('placeholder_1_data', 'sslice_2'), - ('placeholder_begin_data', 'sslice_2'), - ('placeholder_end_data', 'sslice_2'), - ('placeholder_stride_data', 'sslice_2'), - ('sslice_2', 'sslice_2_data'), - ('sslice_2_data', 'placeholder_2'), - ('placeholder_2', 'placeholder_2_data'), ], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, - 'sslice_2': {'slices': np.array( - [slice(0, 1, 1), slice(0, 1, 1), slice(0, 227, 1), slice(0, 1, 1), slice(0, 54, 1)]), - 'shrink_axis_mask': np.array([False, False, False, True, False]), - 'new_axis_mask': np.array([False, True, False, False, False])}, - 'sslice_2_data': {'shape': np.array([1, 1, 227, 54]), 'is_output': True} - }) - graph.graph['layout'] = 'NHWC' - - graph_ref = build_graph(nodes_reshape, - [('placeholder_1', 'placeholder_1_data'), - ('placeholder_1_data', 'sslice_2'), - ('placeholder_begin_data', 'sslice_2'), - ('placeholder_end_data', 'sslice_2'), - ('placeholder_stride_data', 'sslice_2'), - ('sslice_2', 'sslice_2/Reshape_new_data'), - ('sslice_2/Reshape_new_data', 'sslice_2/Reshape_new'), - ('sslice_2/Reshape_new', 'sslice_2/Reshape_shrink_data'), - ('sslice_2/Reshape_shrink_data', 'sslice_2/Reshape_shrink'), - ('sslice_2/Reshape_shrink', 'sslice_2_data'), - ('sslice_2_data', 'placeholder_2'), - ('placeholder_2', 'placeholder_2_data')], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, - 'sslice_2': {'slices': np.array( - [slice(0, 1, 1), slice(0, 1, 1), slice(0, 227, 1), slice(0, 1, 1), - slice(0, 54, 1)]), - 'shrink_axis_mask': np.array([False, False, False, False, False]), - 'new_axis_mask': np.array([False, False, False, False, False])}, - 'sslice_2_data': {'shape': np.array([1, 1, 227, 54])}, - 'sslice_2/Reshape_new': {'dim': np.array([1, 1, 227, 1, 54])}, - 'sslice_2/Reshape_new_data': {'shape': np.array([1, 227, 1, 54])}, - 'sslice_2/Reshape_shrink': {'dim': np.array([1, 1, 227, 54])}, - 'sslice_2/Reshape_shrink_data': {'shape': np.array([1, 1, 227, 1, 54])}, - }) - - pattern = AddReshapeAfterStridedSlice() - pattern.find_and_replace_pattern(graph) - - (flag, resp) = compare_graphs(graph, graph_ref, 'sslice_2_data', check_op_attrs=True) - graph.clear() - graph_ref.clear() - self.assertTrue(flag, resp) - - -if __name__ == '__main__': - unittest.main() diff --git a/model-optimizer/extensions/middle/BinarizeWeightsM1P1.py 
b/model-optimizer/extensions/middle/BinarizeWeightsM1P1.py
new file mode 100644
index 0000000..6700290
--- /dev/null
+++ b/model-optimizer/extensions/middle/BinarizeWeightsM1P1.py
@@ -0,0 +1,154 @@
+"""
+ Copyright (c) 2019 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import logging as log
+
+import numpy as np
+
+from extensions.middle.CheckForCycle import CheckForCycle
+from extensions.middle.DeleteControlFlowEdges import DeleteControlFlowEdges
+from extensions.middle.DeleteNotExecutable import DeleteNotExecutable
+from mo.graph.graph import Graph
+from mo.middle.replacement import MiddleReplacementPattern
+from mo.ops.lin_op import Mul
+from mo.ops.power import Power
+
+
+class BinarizeWeightsM1P1(MiddleReplacementPattern):
+    """ Convert weights to the -1/+1 form
+
+        Applicable for convolutions and other operations whose 'weights' are combined with the input data
+        by means of a multiplication operation, so any linear operator suits. Such operations are detected
+        by the multiplication_transparent attribute: if it is present and set to True, a multiplication term
+        can be passed through the operation. An operation with multiplication_transparent set to True should
+        also have multiplication_transparent_ports: a list of pairs of port indices (in_port, out_port) that
+        defines which port pairs can pass multiplication through.
+
+        For example, for a convolutional operation that has 2 input ports (input tensor and weights) and 1 output
+        port, this list includes [(0, 0), (1, 0)]. If the convolutional operation also has biases at port 2, that
+        port is not included in the list because it is not transparent for a multiplication operation.
+
+        multiplication_transparent_ports can be None if all possible input/output pairs are multiplication
+        transparent.
+
+        #TODO Describe how to apply multiplication at output ports -- this is not specified. In the current
+        definition we can pass through only scalar multiplication, but we already require passing it channel-wise.
+ """ + enabled = True + + def run_after(self): + return [DeleteControlFlowEdges] + + def run_before(self): + # CheckForCycle and DeleteNotExecutable run graph clean up which should not be run before weights binarization + return [CheckForCycle, DeleteNotExecutable] + + def pattern(self): + return dict( + nodes=[ + ('quantize', dict(kind='op', op='Quantize')), + ('quantized', dict()), + ('operator', dict(kind='op', multiplication_transparent=True)), + ], + edges=[ + ('quantize', 'quantized'), + ('quantized', 'operator'), + ] + ) + + def replace_pattern(self, graph: Graph, match: dict): + assert match['operator'].has('multiplication_transparent_ports') + + port = match['operator'].input_ports_with(match['quantized']) + assert len(port) >= 1 + if len(port) > 1: + log.debug('BinarizeWeightsM1P1 cannot apply transformation for data {} because it consumed more' + ' than once'.format(match['quantized'].name)) + return + + assert len(port) == 1 + port = port[0] + applicable = [pair for pair in match['operator'].multiplication_transparent_ports if pair[0] == port] + if len(applicable) == 0: + return + + # Look at 3-rd and 4-th inputs of Quantize -- they have constants that should be passed through. + # Assume that the constant that should be passed through is a scalar. + quantize = match['quantize'] + output_low = quantize.in_node(3) + output_high = quantize.in_node(4) + + if not output_low.has_valid('value') and not output_high.has_valid('value'): + return + + output_low = output_low.value + output_high = output_high.value + + # This pass is applicable for binarization only. Other intX variants are not relevant. + if quantize.levels != 2: + return + + # Recognize two cases: 0/+1 and -1/+1. + zp1 = np.all(output_low == 0) or np.all(output_high == 0) + m1p1 = np.all(-output_low == output_high) + if (not zp1 and not m1p1) or (zp1 and m1p1): + log.debug('BinarizeWeightsM1P1 cannot apply transformation for data {} because it does\'t has one of' + ' 0/+1 or -1/+1 forms.'.format(match['quantized'].name)) + return + + # Recognize scalar + if len(np.unique(output_low)) != 1 or len(np.unique(output_high)) != 1: + log.debug('BinarizeWeightsM1P1 cannot apply transformation for data {} because output_low or output_high ' + 'cannot be interpreted as scalars.'.format(match['quantized'].name)) + return + + # TODO: Extract real scalar from 3rd and 4th inputs; reusing original tensors is dangerous because + # it may have incompatible shape. 
+
+        mult_term = quantize.in_node(3) if np.all(output_high == 0) else quantize.in_node(4)
+
+        # Patch the inflow path (by dividing by mult_term)
+        # Put a new Power/Mul combination here:
+        #       ---->---- (here)---> data ---> [3rd/4th ports] quantize ---> quantized ---> operator
+
+        if len(match['quantized'].out_nodes()) > 1:
+            log.debug('BinarizeWeightsM1P1: len(match[\'quantized\'].out_nodes()) > 1')
+            return
+        div_op = Power(graph, {'name': quantize.name + '/DivNormalize', 'power': -1.0})
+        div_output = div_op.create_node_with_data([mult_term])
+
+        for i in [3, 4]:
+            match['quantize'].insert_node_with_data_before(
+                match['quantize'].in_node(i),
+                Mul,
+                dict(name=quantize.name + '/MulNormalize'),
+                additional_inputs=[div_output],
+            )
+
+        match['quantized'].value = None  # reset the value because it will be recomputed
+        match['quantize'].infer(match['quantize'])
+
+        # Put a complementary new Mul node here:   operator -->---(here)-----> operator.out_node()
+
+        match['operator'].insert_node_with_data_after(
+            match['operator'].out_node(),
+            Mul,
+            dict(name=match['operator'].name + '/MulNormalize'),
+            [mult_term],
+        )
+
+        # Disable 'operator' fusion with linear ops, otherwise it will annihilate the changes that we just made
+        match['operator']['can_be_fused'] = False
diff --git a/model-optimizer/extensions/middle/BlockLSTMtoLSTMSequence.py b/model-optimizer/extensions/middle/BlockLSTMtoLSTMSequence.py
index 9835442..aa4bdf6 100644
--- a/model-optimizer/extensions/middle/BlockLSTMtoLSTMSequence.py
+++ b/model-optimizer/extensions/middle/BlockLSTMtoLSTMSequence.py
@@ -1,5 +1,5 @@
 """
- Copyright (c) 2018 Intel Corporation
+ Copyright (c) 2018-2019 Intel Corporation
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -13,20 +13,17 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-
-import networkx as nx
 import numpy as np
 
-from extensions.middle.FusePermutesSequence import FusePermutesSequence
-from extensions.middle.lstm_sequence_normalize import LSTMSequenceNormalize
-from extensions.middle.lstm_sequence_tensor_iterator import LSTMSequenceTensorIterator
+from extensions.ops.LSTM import LSTM
+from mo.graph.graph import Graph
 from mo.middle.replacement import MiddleReplacementPattern
 from mo.utils.error import Error
 
 
 class BlockLSTMtoLSTMSequence(MiddleReplacementPattern):
     """
-    MO virtual operation LSTMSequence that converts to IE TensorIterator with LSTMCell inside supports 3 outputs:
+    MO virtual operation RNNSequence that converts to IE TensorIterator with LSTMCell inside supports 3 outputs:
     0: concatenated hidden states over the whole time sequence,
     1: last hidden state,
     2: last cell state.
@@ -37,13 +34,21 @@ class BlockLSTMtoLSTMSequence(MiddleReplacementPattern):
     2. Searches for sub-graph, that takes last cell state out of unsupported concatenated cell state output.
     We cut this sub-graph off in case if there are no other consumers of concatenated cell state output and we connect
     BlockLSTM to consumers of this sub-graph by port producing last cell state output
-    3. (Optional. Resolves by multiple checks) We cut the same sug-graph (as in 2) for concatenated cell states check
+    3. Renumber input ports of BlockLSTM to match RNNSequence specification.
+    4. (Optional. Resolved by multiple checks.) We cut the same sub-graph (as in 2) for the concatenated cell states check
     for better performance
     """
     enabled = True
 
     def run_before(self):
-        return [FusePermutesSequence, LSTMSequenceTensorIterator]
+        from extensions.middle.FusePermutesSequence import FusePermutesSequence
+        from extensions.middle.LSTMRNNSequenceToTensorIterator import LSTMToTensorIterator
+        return [FusePermutesSequence, LSTMToTensorIterator]
+
+    def run_after(self):
+        from extensions.middle.pass_separator import MiddleStart
+        from extensions.middle.RNNSequenceNormalizeToIE import RNNSequenceNormalize
+        return [MiddleStart, RNNSequenceNormalize]
 
     def pattern(self):
         return dict(
@@ -96,11 +101,11 @@ class BlockLSTMtoLSTMSequence(MiddleReplacementPattern):
         )
 
     @staticmethod
-    def replace_pattern(graph: nx.MultiDiGraph, match: dict):
+    def replace_pattern(graph: Graph, match: dict):
         time_len = match['concatenated_hidden_states'].shape[0]
         """
         Working with concatenated_cell_states_data part first, because IE TensorIterator primitive doesn't have
-        concatenated cell states output and if we can not collepse it, then we does not support this type of BlockLSTM
+        concatenated cell states output and if we cannot collapse it, then we do not support this type of BlockLSTM
 
         We simplify the sub-graph below by taking another output of BlockLSTM:
         concatenated cell states over the whole time sequence -> last cell state
@@ -156,8 +161,10 @@ class BlockLSTMtoLSTMSequence(MiddleReplacementPattern):
         hidden_size = node.in_node(3).shape[-1]
         weights = weights_node.value
         biases = biases_node.value
-        assert weights.shape[0] == input_size + hidden_size, "weights.shape={} input_size={} hidden_size={}".format(weights.shape, input_size, hidden_size)
-        assert weights.shape[1] == biases.shape[0] == 4 * hidden_size, "weights.shape={} biases.shape={} hidden_size={}".format(weights.shape, biases.shape, hidden_size)
+        assert weights.shape[0] == input_size + hidden_size, \
+            "weights.shape={} input_size={} hidden_size={}".format(weights.shape, input_size, hidden_size)
+        assert weights.shape[1] == biases.shape[0] == 4 * hidden_size, \
+            "weights.shape={} biases.shape={} hidden_size={}".format(weights.shape, biases.shape, hidden_size)
 
         weights = weights.reshape([
             weights.shape[0],
@@ -199,15 +206,35 @@ class BlockLSTMtoLSTMSequence(MiddleReplacementPattern):
 
         graph.add_edge(match['BlockLSTM'].id, match['gather_1_data'].id, **attrs)
 
-        match['BlockLSTM'].op = 'LSTMSequence'
-        match['BlockLSTM']['sequence_dim'] = 0  # TF reference
-        match['BlockLSTM']['batch_dim'] = 1  # TF reference
-        match['BlockLSTM']['direction'] = 'forward'  # TF reference
-        match['BlockLSTM']['hidden_size'] = match['concatenated_hidden_states'].shape[-1]
-        match['BlockLSTM']['format'] = 'tf'
+        """
+        #3 Renumbering h_init_state, c_init_state input ports to match the RNNSequence ports order.
+ """ + h_init_port = 4 + c_init_port = 5 + # c_init_state + if 4 in node.in_nodes(): + assert c_init_port not in node.in_nodes() + cell_state_edge = graph.get_edge_data(node.in_node(4).id, node.id) + cell_state_edge[0]['in'] = c_init_port + + + #h_init_state + if 3 in node.in_nodes(): + assert h_init_port not in node.in_nodes() + hidden_state_edge = graph.get_edge_data(node.in_node(3).id, node.id) + hidden_state_edge[0]['in'] = h_init_port + + new_attrs = {'sequence_dim': 0, + 'batch_dim': 1, + 'direction': 'forward', + 'hidden_size': match['concatenated_hidden_states'].shape[-1], + 'format': 'tf', + } + + LSTM.update_node_stat(match['BlockLSTM'], new_attrs) """ - Optional #3 optimization from class description following + Optional #4 optimization from class description following """ data_to_mul = [n for n in match['mul'].in_nodes().values() if n.id != match['concatenated_hidden_states'].id] if len(data_to_mul) != 1: diff --git a/model-optimizer/extensions/middle/Cast.py b/model-optimizer/extensions/middle/Cast.py new file mode 100644 index 0000000..fad89d7 --- /dev/null +++ b/model-optimizer/extensions/middle/Cast.py @@ -0,0 +1,41 @@ +""" + Copyright (c) 2018-2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import numpy as np + +from extensions.middle.RemoveIdentity import RemoveIdentity +from mo.graph.graph import Graph +from mo.middle.replacement import MiddleReplacementPattern + + +class CastToFloatMark(MiddleReplacementPattern): + enabled = True + + def run_before(self): + return [RemoveIdentity] + + def run_after(self): + from extensions.middle.pass_separator import PreMiddleStart + return [PreMiddleStart] + + def pattern(self): + return dict( + nodes=[('op', dict(op='Cast', dst_type=np.float32))], + edges=[]) + + def replace_pattern(self, graph: Graph, match: dict): + # resulting network is fully floating point, so casts to float are useless + match['op']['identity'] = True + \ No newline at end of file diff --git a/model-optimizer/extensions/middle/ChangePlaceholderTypes.py b/model-optimizer/extensions/middle/ChangePlaceholderTypes.py new file mode 100644 index 0000000..bfba1c1 --- /dev/null +++ b/model-optimizer/extensions/middle/ChangePlaceholderTypes.py @@ -0,0 +1,94 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +import logging as log + +from mo.graph.graph import Graph, Node +from mo.middle.passes.fusing.helpers import get_next_operation +from mo.middle.replacement import MiddleReplacementPattern +from mo.utils.error import Error +from mo.utils.utils import refer_to_faq_msg + + +class ChangePlaceholderTypes(MiddleReplacementPattern): + enabled = True + graph_condition = [lambda graph: graph.graph['fw'] == 'tf'] + force_clean_up = True + + def run_after(self): + return [] + + def run_before(self): + from extensions.middle.ScaleInput import ScaleInput + return [ScaleInput] + + @staticmethod + def change_node_type(node: Node, new_type: type): + node.graph.node[node.id]['pb'].attr['dtype'].type = new_type + + @staticmethod + def is_node_casts_to_float(node: Node): + from tensorflow.core.framework import types_pb2 as tf_types # pylint: disable=no-name-in-module + attrs = node.graph.node[node.id] + return 'pb' in attrs and attrs['pb'].op == 'Cast' and attrs['pb'].attr['DstT'].type == tf_types.DT_FLOAT + + @staticmethod + def remove_node_preserving_edges(pl_node: Node, nodes: list): + graph = pl_node.graph + pl_node_data = pl_node.out_node() + + # Disconnect Placeholder data node from Cast nodes + for out_node in pl_node.out_node().out_nodes(): + graph.remove_edge(pl_node_data.id, out_node.id) + + # Move edges from Cast data nodes to Placeholder data node + for cast_node in nodes: + # it is necessary to create a list from the result of function "graph.out_edges()" because we modify + # the graph during iteration over the list. networkx version 2.1 raises error without creating a list + for u, v, d in list(graph.out_edges(cast_node.out_node().id, data=True)): + graph.remove_edge(u, v) + graph.add_edges_from([(pl_node_data.id, v, d)]) + + @staticmethod + def is_node_gather(node: Node): + attrs = node.graph.node[node.id] + return 'pb' in attrs and attrs['pb'].op == 'GatherV2' and attrs['precision'] == 'FP32' + + def find_and_replace_pattern(self, graph: Graph): + from tensorflow.core.framework import types_pb2 as tf_types # pylint: disable=no-name-in-module + for node_name, node_attrs in list(graph.nodes(data=True)): + node = Node(graph, node_name) + pb = node_attrs.get('pb') + if pb is not None and pb.op == 'Placeholder' and pb.attr['dtype'].type != tf_types.DT_FLOAT: + log.info('Placeholder "{}" has type that is different from DT_FLOAT'.format(node_name)) + next_ops = get_next_operation(node) + # check that all output nodes are nodes of type 'ToFloat' + if all([ChangePlaceholderTypes.is_node_casts_to_float(op) and + len(op.in_nodes()) == 1 for op in next_ops]): + ChangePlaceholderTypes.change_node_type(node, tf_types.DT_FLOAT) + ChangePlaceholderTypes.remove_node_preserving_edges(node, next_ops) # remove 'Cast' nodes + + elif all([ChangePlaceholderTypes.is_node_gather(op) for op in next_ops] for op in next_ops): + ChangePlaceholderTypes.change_node_type(node, tf_types.DT_FLOAT) + + else: + raise Error( + ('Cannot convert type of placeholder "{}" because not all of its outputs are "Cast" to float ' + 'operations: {}. 
' + + refer_to_faq_msg(49)), + node.soft_get('name'), + [op.soft_get('name') for op in next_ops] + ) diff --git a/model-optimizer/extensions/middle/CheckForCycle.py b/model-optimizer/extensions/middle/CheckForCycle.py new file mode 100644 index 0000000..4d8021a --- /dev/null +++ b/model-optimizer/extensions/middle/CheckForCycle.py @@ -0,0 +1,39 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import networkx as nx + +from mo.graph.graph import Graph +from mo.middle.replacement import MiddleReplacementPattern +from mo.utils.error import Error +from mo.utils.utils import refer_to_faq_msg + + +class CheckForCycle(MiddleReplacementPattern): + enabled = True + force_clean_up = True + + def run_after(self): + from extensions.middle.TensorIteratorMerge import TensorIteratorMerge + return [TensorIteratorMerge] + + def run_before(self): + return [] + + def find_and_replace_pattern(self, graph: Graph): + is_acyclic = nx.is_directed_acyclic_graph(graph) + if not is_acyclic: + raise Error('Graph contains a cycle. Can not proceed. ' + refer_to_faq_msg(97)) diff --git a/model-optimizer/extensions/middle/CheckForCycle_test.py b/model-optimizer/extensions/middle/CheckForCycle_test.py new file mode 100644 index 0000000..5ef5214 --- /dev/null +++ b/model-optimizer/extensions/middle/CheckForCycle_test.py @@ -0,0 +1,77 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
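The acyclicity test that CheckForCycle delegates to networkx can be exercised on its own:

    import networkx as nx

    g = nx.MultiDiGraph()
    g.add_edges_from([('node_1', 'node_3'), ('node_3', 'node_1')])  # a two-node cycle
    assert not nx.is_directed_acyclic_graph(g)

    g.remove_edge('node_3', 'node_1')  # break the cycle
    assert nx.is_directed_acyclic_graph(g)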
+""" + +import unittest + +from extensions.middle.CheckForCycle import CheckForCycle +from mo.utils.error import Error +from mo.utils.unittest.graph import build_graph + +nodes_attributes = {'node_1': {'type': 'Identity', 'value': None, 'kind': 'op'}, + 'node_1_data': {'value': None, 'kind': 'data', 'data_type': None}, + 'node_2': {'type': 'Identity', 'value': None, 'kind': 'op'}, + 'concat': {'type': 'Concat', 'value': None, 'kind': 'op'}, + 'node_3': {'type': 'Identity', 'value': None, 'kind': 'op'}, + 'node_3_data': {'value': None, 'kind': 'data', 'data_type': None}, + # Placeholders + 'placeholder_1': {'shape': None, 'type': 'Input', 'kind': 'op', 'op': 'Placeholder'}, + 'placeholder_1_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + 'placeholder_2': {'shape': None, 'type': 'Input', 'kind': 'op', 'op': 'Placeholder'}, + 'pl_1': {'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, + 'pl_1_data': {'value': None, 'kind': 'data', 'data_type': None}, + 'pl_2': {'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, + 'pl_2_data': {'value': None, 'kind': 'data', 'data_type': None}, + 'placeholder_2_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + # ScaleShift layer + 'scaleshift_1': {'type': 'ScaleShift', 'kind': 'op', 'op': 'ScaleShift'}, + 'scaleshift_1_w': {'value': None, 'shape': None, 'kind': 'data'}, + 'scaleshift_1_b': {'value': None, 'shape': None, 'kind': 'data'}, + 'scaleshift_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + # Mul op + 'mul_1': {'type': None, 'kind': 'op', 'op': 'Mul'}, + 'mul_1_w': {'value': None, 'shape': None, 'kind': 'data'}, + 'mul_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'op_output': {'kind': 'op', 'op': 'OpOutput', 'infer': lambda x: None} + } + + +class CycleTest(unittest.TestCase): + def test_check_for_cycle1(self): + # cyclic case + graph = build_graph(nodes_attributes, + [('node_1', 'node_1_data'), + ('node_1_data', 'node_3'), + ('node_3', 'node_3_data'), + ('node_3_data', 'node_1')], + nodes_with_edges_only=True) + with self.assertRaisesRegex(Error, 'Graph contains a cycle. Can not proceed.*'): + CheckForCycle().find_and_replace_pattern(graph) + + def test_check_for_cycle2(self): + # acyclic case + graph = build_graph(nodes_attributes, + [('node_1', 'node_1_data'), + ('node_1_data', 'node_3'), + ('node_3', 'node_3_data'), + ('node_3_data', 'mul_1'), + ('mul_1_w', 'mul_1'), + ('mul_1', 'mul_1_data') + ], + nodes_with_edges_only=True) + try: + CheckForCycle().find_and_replace_pattern(graph) + except Error: + self.fail("Unexpected Error raised") diff --git a/model-optimizer/extensions/middle/ConcatOptimization.py b/model-optimizer/extensions/middle/ConcatOptimization.py new file mode 100644 index 0000000..17f715f --- /dev/null +++ b/model-optimizer/extensions/middle/ConcatOptimization.py @@ -0,0 +1,93 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +import networkx as nx +import logging as log + +from mo.graph.graph import Node +from mo.middle.replacement import MiddleReplacementPattern + + +class ConcatOptimization(MiddleReplacementPattern): + # This optimization reduces number of edges between Concat operations + # that significantly reduce memory consumption + + enabled = False + + def run_after(self): + return [] + + def find_and_replace_pattern(self, graph: nx.MultiDiGraph): + mp = {} + used = {} + for node in graph.nodes(): + node = Node(graph, node) + if node.kind == 'op' and node.soft_get('type') == 'Concat': + in_nodes = tuple([node.in_node(idx).id for idx in range(len(node.in_nodes()))]) + out_node = (node.id, node.out_node().id) + if in_nodes in mp: + log.warning("Something is weird! {} and {}".format(node.id, mp[in_nodes])) + else: + mp.update({in_nodes: out_node}) + used.update({node.id: {x: False for x in in_nodes}}) + + for key in mp.keys(): + replacers = [] + for i in range(len(key)): + for j in range(i + 1, len(key)): + arr = tuple(key[i:j + 1]) + if arr in mp.keys() and arr != key: + # print("Output of {} can be used as input for {} ({})".format(mp[arr][0], mp[key][0], len(arr))) + replacers.append((len(arr), arr)) + + replacers.sort(reverse=True) + + concat_id = mp[key][0] + for ln, arr in replacers: + # Check that we can do it!!! + we_can = True + for x in arr: + if used[concat_id][x]: + # print("Sorry but {} input was already removed from {}".format(x, concat_id)) + we_can = False + break + + if not we_can: + continue + + for x in arr: + used[concat_id][x] = True + + edge_attrs = graph.get_edge_data(arr[0], concat_id)[0] + for in_node in arr: + graph.remove_edge(in_node, concat_id) + + new_input = mp[arr][1] + out_port = len(Node(graph, new_input).out_nodes()) + 1 + edge_attrs['out'] = out_port + graph.add_edge(new_input, concat_id, **edge_attrs) + + # Renumber 'in' attrs + concat_node = Node(graph, concat_id) + ln = len(concat_node.in_nodes()) + ports = [x for x in concat_node.in_nodes().keys()] + ports.sort() + + p_id = 0 + for p in ports: + in_node = concat_node.in_nodes()[p] + graph[in_node.id][concat_id][0]['in'] = p_id + p_id += 1 diff --git a/model-optimizer/extensions/middle/ConstSwitchResolver.py b/model-optimizer/extensions/middle/ConstSwitchResolver.py index 73459b0..ad9171c 100644 --- a/model-optimizer/extensions/middle/ConstSwitchResolver.py +++ b/model-optimizer/extensions/middle/ConstSwitchResolver.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,9 +14,7 @@ limitations under the License. 
""" -import networkx as nx - -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.middle.passes.eliminate import remove_op_node_with_data_node from mo.middle.replacement import MiddleReplacementPattern from mo.utils.graph import pseudo_topological_sort @@ -28,7 +26,11 @@ class ConstSwitchEraser(MiddleReplacementPattern): """ enabled = True - def find_and_replace_pattern(self, graph: nx.MultiDiGraph): + def run_after(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + def find_and_replace_pattern(self, graph: Graph): for n in pseudo_topological_sort(graph): if graph.node[n]['kind'] == 'data' or graph.node[n]['op'] != 'Switch': continue diff --git a/model-optimizer/extensions/middle/ConvToBinaryConv.py b/model-optimizer/extensions/middle/ConvToBinaryConv.py new file mode 100644 index 0000000..d370ecc --- /dev/null +++ b/model-optimizer/extensions/middle/ConvToBinaryConv.py @@ -0,0 +1,129 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import logging as log + +import numpy as np + +from extensions.middle.CheckForCycle import CheckForCycle +from extensions.middle.DeleteControlFlowEdges import DeleteControlFlowEdges +from mo.graph.graph import Graph +from mo.middle.replacement import MiddleReplacementPattern +from mo.ops.const import Const +from mo.ops.lin_op import Mul, Add +from mo.ops.op import Op +from mo.ops.power import Power + + +class ConvToBinaryConv(MiddleReplacementPattern): + """ Transform usual convolution with [0,+1] input and [-1,+1] to BinaryConvolution + + Modifies output terms after the Convolution to be able to apply BinaryConvolution + operation instead that accepts [-1,1] input and [-1,1] weights. It requires modification + channel-wise addition with weights reduced along all axis except output channel dimension. + """ + enabled = True + force_clean_up = True + + def pattern(self): + return dict( + nodes=[ + ('quantize', dict(kind='op', op='Quantize')), + ('quantized', dict()), # input tensor, not weights + ('operator', dict(kind='op', type='Convolution')), + ], + edges=[ + ('quantize', 'quantized'), + ('quantized', 'operator', {'in':0}), + ] + ) + + def replace_pattern(self, graph: Graph, match: dict): + assert match['operator'].has('multiplication_transparent_ports') + + quantize = match['quantize'] + # This pass is applicable for binarization only. Other intX variants are not relevant. + if quantize.levels != 2: + return + + port = match['operator'].input_ports_with(match['quantized']) + assert len(port) >= 1 + if len(port) > 1: + log.debug('BinarizeWeightsM1P1 cannot apply transformation for data {} because it consumed more' + ' than once'.format(match['quantized'].name)) + return + + assert len(port) == 1 + port = port[0] + applicable = [pair for pair in match['operator'].multiplication_transparent_ports if pair[0] == port] + if len(applicable) == 0: + return + + # Look at 3-rd and 4-th inputs of Quantize -- they have constants that should be passed through. 
+ # Assume that the constant that should be passed through is a scalar. + output_low = quantize.in_node(3) + output_high = quantize.in_node(4) + assert len(output_low.out_nodes()) == 1 + assert len(output_high.out_nodes()) == 1 + + if not output_low.has_valid('value') and not output_high.has_valid('value'): + return + + output_low = output_low.value + output_high = output_high.value + + operator = match['operator'] + + if np.all(np.isclose(output_low, 0)) and np.all(np.isclose(output_high, 1)): + + weights = operator.in_node(1).value + reduction_indices = set(range(len(weights.shape))) - set([operator.output_feature_channel]) + weights_reduced = np.add.reduce(weights, axis=tuple(reduction_indices)) + weights_reduced = weights_reduced.reshape([len(weights_reduced), 1, 1]) + + add_term = Const(graph, {'value': weights_reduced}).create_node() + add = Add(graph, {}).create_node() + add.in_port(1).connect(add_term.out_port(0)) + mul_term = Const(graph, {'value': np.array(0.5)}).create_node() + mul = Mul(graph, {}).create_node() + mul.in_port(1).connect(mul_term.out_port(0)) + add.out_port(0).connect(mul.in_port(0)) + + operator.out_port(0).get_connection().set_source(mul.out_port(0)) + add.in_port(0).connect(operator.out_port(0)) + + operator['pad_value'] = float(-1.0) + elif np.all(np.isclose(output_low, -1)) and np.all(np.isclose(output_high, +1)): + pass + else: + log.debug('ConvToBinaryConv: cannot apply transformation because input range is neither in [0, +1] nor ' + 'in [-1, +1].') + return + + operator['type'] = 'BinaryConvolution' + operator['mode'] = 'xnor-popcount' + operator['input'] = operator.in_node(0).shape[1] + # Weights are not bit-packed yet; there should be a separate transformation to do that + + assert output_low.size == 1 + assert output_high.size == 1 + + output_low = quantize.in_node(3) + output_high = quantize.in_node(4) + + # Make sure that low/high values are exactly 0/1 + output_low.value = np.zeros(output_low.shape) + output_high.value = np.ones(output_high.shape) diff --git a/model-optimizer/extensions/middle/ConvertGroupedStridedSlice.py b/model-optimizer/extensions/middle/ConvertGroupedStridedSlice.py index cec09cc..d9906b3 100644 --- a/model-optimizer/extensions/middle/ConvertGroupedStridedSlice.py +++ b/model-optimizer/extensions/middle/ConvertGroupedStridedSlice.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,16 +14,19 @@ limitations under the License. 
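Why the transformation above restores the original result with just an Add and a Mul: BinaryConvolution consumes activations in {-1, +1}, while the matched Quantize produced {0, 1}. A self-contained numeric check of the identity being used (a 1x1 convolution on a single output channel, names invented for illustration):

import numpy as np

np.random.seed(0)
W = np.random.choice([-1.0, 1.0], size=8)   # binary weights of one output channel
xb = np.random.choice([-1.0, 1.0], size=8)  # activations as BinaryConvolution sees them
x01 = (xb + 1) / 2                          # the original {0, 1} activations

# conv(W, x01) == 0.5 * (conv(W, xb) + sum(W)): exactly what the inserted
# Add(weights_reduced) followed by Mul(0.5) computes channel-wise.
assert np.isclose(W @ x01, 0.5 * (W @ xb + W.sum()))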
""" -import numpy as np -import networkx as nx +from copy import deepcopy + import logging as log +import numpy as np +from extensions.middle.SliceConverter import ConvertSlice from extensions.ops.splitv import SplitV -from mo.graph.graph import Node +from mo.front.common.partial_infer.utils import int64_array +from mo.graph.graph import Node, Graph, add_opoutput +from mo.middle.replacement import MiddleReplacementPattern from mo.ops.op import Op from mo.ops.reshape import Reshape -from mo.middle.replacement import MiddleReplacementPattern -from extensions.middle.SliceConverter import ConvertSlice + class ConvertGroupedStridedSlice(MiddleReplacementPattern): """ @@ -50,7 +53,11 @@ class ConvertGroupedStridedSlice(MiddleReplacementPattern): def run_after(self): return [ConvertSlice] - def find_and_replace_pattern(self, graph: nx.MultiDiGraph): + def run_before(self): + from extensions.middle.pass_separator import MiddleFinish + return [MiddleFinish] + + def find_and_replace_pattern(self, graph: Graph): # Iterate over all data nodes and find all with >= 1 consumers data_nodes = [Node(graph, node) for node in graph.node if Node(graph, node).kind == 'data'] for input_data in data_nodes: @@ -61,12 +68,16 @@ class ConvertGroupedStridedSlice(MiddleReplacementPattern): input_shape = np.array(input_data.shape) # Get all StridedSlice consumers - out_nodes = [node for node in input_data.out_nodes() if node.op == 'StridedSlice'] + out_nodes = [node for node in input_data.out_nodes() if node.op == 'StridedSlice' and node.in_node(0).name == input_data.name] if len(out_nodes) < 1: continue valid_for_replacement = True + for node in out_nodes: + if len(node.slices) != len(out_nodes[0].slices): + valid_for_replacement = False + # Detect dimension for splitting split_channel_dim = None for dim_id, s in enumerate(out_nodes[0].slices): @@ -80,9 +91,6 @@ class ConvertGroupedStridedSlice(MiddleReplacementPattern): # split_dims contains tuples with split range and output data node split_dims = [] for out_id, node in enumerate(out_nodes): - # Check that StridedSlice op has no shrink_axis_mask attribute - if not np.all([x == False for x in node.shrink_axis_mask]): - valid_for_replacement = False # Check that StridedSlice op has stride eq 1 and splits only feature channel for id, s in enumerate(node.slices): l, r, stride = s.start, s.stop, s.step @@ -97,7 +105,23 @@ class ConvertGroupedStridedSlice(MiddleReplacementPattern): # Check feature split intersection final_data_nodes_list = [] - sorted_split_dims = sorted(split_dims) + sorted_split_dims = sorted(split_dims, key=lambda item: (item[0], item[1])) + + # check if we have similar StridedSlice operations with different outputs + prev_sd = sorted_split_dims[0] + to_remove = [] + for i in range(1, len(sorted_split_dims)): + if sorted_split_dims[i][0] == prev_sd[0] and sorted_split_dims[i][1] == prev_sd[1] and sorted_split_dims[i][2].name != prev_sd[2].name: + cur_node = sorted_split_dims[i][2] + for out in cur_node.out_nodes(): + attrs = deepcopy(graph.get_edge_data(cur_node.id, out.id)[0]) + graph.remove_edge(cur_node.id, out.id) + graph.add_edge(prev_sd[2].id, out.id, **attrs) + to_remove.append(i) + + for ind in reversed(to_remove): + sorted_split_dims.pop(ind) + size_splits = [] prev_r = 0 for l, r, out in sorted_split_dims: @@ -109,10 +133,10 @@ class ConvertGroupedStridedSlice(MiddleReplacementPattern): shape = np.array(input_shape) size_splits.append(l - prev_r) shape[split_channel_dim] = l - prev_r - data_node = Op._create_data_node(graph, 'fake_data', 
{'shape': shape, 'is_output': True}) + data_node = Op._create_data_node(graph, 'fake_data', {'shape': shape}) + add_opoutput(graph, data_node.id, 0, False) final_data_nodes_list.append(data_node) - prev_r = r size_splits.append(r - l) final_data_nodes_list.append(out) @@ -124,12 +148,26 @@ class ConvertGroupedStridedSlice(MiddleReplacementPattern): shape = input_shape.copy() shape[split_channel_dim] = input_shape[split_channel_dim] - prev_r size_splits.append(input_shape[split_channel_dim] - prev_r) - data_node = Op._create_data_node(graph, 'fake_data', {'shape': shape, 'is_output': True}) + data_node = Op._create_data_node(graph, 'fake_data', {'shape': shape}) + add_opoutput(graph, data_node.id, 0, False) final_data_nodes_list.append(data_node) if not valid_for_replacement: continue + for node in out_nodes: + if not np.all([x == 0 for x in node.shrink_axis_mask]): + out_node = node.out_node() + if np.any(node['shrink_axis_mask']): + self.add_reshape_for_shrink(graph, node) + if np.any(node['new_axis_mask']): + self.add_reshape_for_new(graph, node) + + for i in range(len(final_data_nodes_list)): + if final_data_nodes_list[i].name == out_node.name: + final_data_nodes_list[i] = node.out_node() + break + # Insert Split layer and remove old StridedSlice layers # 1. Remove connections from input_data to StridedSlice ops out_data_nodes = [] @@ -143,5 +181,82 @@ class ConvertGroupedStridedSlice(MiddleReplacementPattern): # 2. Create Split layer and reorder outputs split = SplitV(graph, dict(name=name_for_future_split + "/Split", axis=split_channel_dim, - size_splits=size_splits)) + size_splits=size_splits, out_ports_count=len(size_splits))) split.create_node_with_data(inputs=[input_data], data_nodes=final_data_nodes_list) + + @staticmethod + def add_reshape_for_shrink(graph: Graph, ss_node): + # add Reshape for shrink_axis_mask + log.info("StridedSlice op with shrink mask '{}' has been detected".format(ss_node.id)) + node = ss_node + + if len(node.in_nodes()) != 4 or len(node.out_nodes()) != 1: + return + + shape_out = node.out_node().shape + dim = shape_out.copy() + ss_shape = [] + k = 0 + + # Don't permute reshape if channels were squeezed + dont_permute = False + if graph.graph['layout'] == 'NHWC' and node['shrink_axis_mask'][-1] == 1: + dont_permute = True + + for i in range(0, len(node['shrink_axis_mask'])): + if not node['shrink_axis_mask'][i]: + ss_shape.append(shape_out[k]) + k = k + 1 + else: + node['shrink_axis_mask'][i] = 0 + ss_shape.append(1) + + out_node = node.out_node(0) + + # insert data node for StridedSlice + data_node = Op._create_data_node(graph, node.name + "/Reshape_shrink_data", {'shape': int64_array(ss_shape)}) + attrs = deepcopy(graph.get_edge_data(node.id, out_node.id)[0]) + graph.remove_edge(node.id, out_node.id) + graph.add_edge(node.id, data_node.id, **attrs) + + # insert Reshape + if dont_permute: + reshape = Reshape(graph, dict(name=node.name + "/Reshape_shrink", + dim=np.array(dim, dtype=np.int64), nchw_layout=True)) + reshape_data_node = reshape.create_node_with_data([data_node], reshape.attrs, + data_nodes=[out_node]) + reshape_data_node['nchw_layout'] = True + else: + reshape = Reshape(graph, dict(name=node.name + "/Reshape_shrink", + dim=np.array(dim, dtype=np.int64))) + reshape_data_node = reshape.create_node_with_data([data_node], reshape.attrs, + data_nodes=[out_node]) + + @staticmethod + def add_reshape_for_new(graph: Graph, ss_node): + log.info("StridedSlice op with new axis mask '{}' has been detected".format(ss_node.id)) + node = ss_node + + if 
len(node.in_nodes()) != 4 or len(node.out_nodes()) != 1: + return + + shape_out = node.out_node().shape + dim = shape_out.copy() + ss_shape = [] + for i in range(0, len(node['new_axis_mask'])): + if not node['new_axis_mask'][i]: + ss_shape.append(shape_out[i]) + else: + node['new_axis_mask'][i] = 0 + + out_node = node.out_node(0) + # insert data node for StridedSlice + data_node = Op._create_data_node(graph, node.name + "/Reshape_new_data", {'shape': ss_shape}) + attrs = deepcopy(graph.get_edge_data(node.id, out_node.id)[0]) + graph.remove_edge(node.id, out_node.id) + graph.add_edge(node.id, data_node.id, **attrs) + + # insert Reshape + reshape = Reshape(graph, dict(name=node.name + "/Reshape_new", + dim=np.array(dim, dtype=np.int64))) + reshape.create_node_with_data([data_node], reshape.attrs, data_nodes=[out_node]) diff --git a/model-optimizer/extensions/middle/ConvertGroupedStridedSlice_test.py b/model-optimizer/extensions/middle/ConvertGroupedStridedSlice_test.py index 0ebdb38..24d1ca9 100644 --- a/model-optimizer/extensions/middle/ConvertGroupedStridedSlice_test.py +++ b/model-optimizer/extensions/middle/ConvertGroupedStridedSlice_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,20 +19,26 @@ import unittest import numpy as np from extensions.middle.ConvertGroupedStridedSlice import ConvertGroupedStridedSlice +from mo.graph.graph import Node from mo.utils.unittest.graph import build_graph, compare_graphs nodes_attributes = { 'placeholder_1': {'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, 'placeholder_1_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + 'placeholder_2': {'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, + 'placeholder_2_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + 'placeholder_begin_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + 'placeholder_end_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + 'placeholder_stride_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, # StridedSlice layers 'sslice_1': {'type': None, 'kind': 'op', 'op': 'StridedSlice', 'slices': None, - 'shrink_axis_mask': np.array([False, False, False, False])}, + 'shrink_axis_mask': np.array([0, 0, 0, 0])}, 'sslice_1_data': {'value': None, 'shape': None, 'kind': 'data'}, 'sslice_2': {'type': None, 'kind': 'op', 'op': 'StridedSlice', 'slices': None, - 'shrink_axis_mask': np.array([False, False, False, False])}, + 'shrink_axis_mask': np.array([0, 0, 0, 0])}, 'sslice_2_data': {'value': None, 'shape': None, 'kind': 'data'}, 'sslice_3': {'type': None, 'kind': 'op', 'op': 'StridedSlice', 'slices': None, - 'shrink_axis_mask': np.array([False, False, False, False])}, + 'shrink_axis_mask': np.array([0, 0, 0, 0])}, 'sslice_3_data': {'value': None, 'shape': None, 'kind': 'data'}, # Split layer 'split_1': {'type': 'Split', 'kind': 'op', 'op': 'SplitV'}, @@ -43,6 +49,16 @@ nodes_attributes = { # Concat1 operation 'concat_1': {'type': 'Concat', 'kind': 'op', 'op': 'Concat'}, 'concat_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'op_output': {'kind': 'op', 'op': 'OpOutput'}, + 'op_output_1': {'kind': 'op', 'op': 'OpOutput'}, + 'op_output_2': {'kind': 'op', 'op': 'OpOutput'}, + # Reshape layer + 'sslice_1/Reshape_shrink': {'type': 'Reshape', 'value': None, 
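Taken together, the main pattern above replaces a group of StridedSlice ops that jointly tile one axis with a single SplitV. A compact sketch of the size_splits computation (shapes from the first test below; the gap handling mirrors the 'fake_data' branches):

input_channels = 54
sorted_split_dims = [(0, 18), (18, 36), (36, 54)]  # (begin, end) per StridedSlice, sorted

size_splits, prev_r = [], 0
for l, r in sorted_split_dims:
    if l > prev_r:                       # uncovered range -> extra 'fake_data' output
        size_splits.append(l - prev_r)
    size_splits.append(r - l)
    prev_r = r
if prev_r < input_channels:              # trailing uncovered range
    size_splits.append(input_channels - prev_r)

assert size_splits == [18, 18, 18]       # -> SplitV(axis=3, size_splits=[18, 18, 18])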
'kind': 'op', 'op': 'Reshape'}, + 'sslice_1/Reshape_shrink_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'sslice_2/Reshape_shrink': {'type': 'Reshape', 'value': None, 'kind': 'op', 'op': 'Reshape'}, + 'sslice_2/Reshape_shrink_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'sslice_2/Reshape_new': {'type': 'Reshape', 'value': None, 'kind': 'op', 'op': 'Reshape'}, + 'sslice_2/Reshape_new_data': {'value': None, 'shape': None, 'kind': 'data'}, } @@ -59,7 +75,8 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): ('sslice_1_data', 'concat_1'), ('sslice_2_data', 'concat_1'), ('sslice_3_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, @@ -75,7 +92,7 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): [slice(0, 1, 1), slice(0, 227, 1), slice(0, 227, 1), slice(36, 54, 1)])}, 'sslice_3_data': {'shape': np.array([1, 227, 227, 18])}, - 'concat_1_data': {'shape': np.array([1, 227, 227, 54]), 'is_output': True}, + 'concat_1_data': {'shape': np.array([1, 227, 227, 54])}, }) graph.graph['layout'] = 'NHWC' @@ -88,14 +105,16 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): ('split_1_data', 'concat_1'), ('split_2_data', 'concat_1'), ('split_3_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') + ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, 'split_1': {'axis': 3}, 'split_1_data': {'shape': np.array([1, 227, 227, 18])}, 'split_2_data': {'shape': np.array([1, 227, 227, 18])}, 'split_3_data': {'shape': np.array([1, 227, 227, 18])}, - 'concat_1_data': {'shape': np.array([1, 227, 227, 54]), 'is_output': True}, + 'concat_1_data': {'shape': np.array([1, 227, 227, 54])}, }) pattern = ConvertGroupedStridedSlice() @@ -116,7 +135,8 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): ('sslice_1_data', 'concat_1'), ('sslice_2_data', 'concat_1'), ('sslice_3_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, @@ -132,7 +152,7 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): [slice(0, 1, 1), slice(0, 227, 1), slice(0, 227, 1), slice(0, 19, 1)])}, 'sslice_3_data': {'shape': np.array([1, 227, 227, 19])}, - 'concat_1_data': {'shape': np.array([1, 227, 227, 54]), 'is_output': True}, + 'concat_1_data': {'shape': np.array([1, 227, 227, 54])}, }) graph.graph['layout'] = 'NHWC' @@ -145,14 +165,15 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): ('split_1_data', 'concat_1'), ('split_2_data', 'concat_1'), ('split_3_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, 'split_1': {'axis': 3}, 'split_1_data': {'shape': np.array([1, 227, 227, 18])}, 'split_2_data': {'shape': np.array([1, 227, 227, 17])}, 'split_3_data': {'shape': np.array([1, 227, 227, 19])}, - 'concat_1_data': {'shape': np.array([1, 227, 227, 54]), 'is_output': True}, + 'concat_1_data': {'shape': np.array([1, 227, 227, 54])}, }) pattern = ConvertGroupedStridedSlice() @@ -174,7 +195,8 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): ('sslice_1_data', 'concat_1'), ('sslice_2_data', 'concat_1'), ('sslice_3_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + 
('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, @@ -190,7 +212,7 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): [slice(0, 1, 1), slice(0, 227, 1), slice(0, 227, 1), slice(0, 19, 1)])}, 'sslice_3_data': {'shape': np.array([1, 227, 227, 19])}, - 'concat_1_data': {'shape': np.array([1, 227, 227, 54]), 'is_output': True}, + 'concat_1_data': {'shape': np.array([1, 227, 227, 54])}, }) graph.graph['layout'] = 'NHWC' @@ -205,7 +227,8 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): ('sslice_1_data', 'concat_1'), ('sslice_2_data', 'concat_1'), ('sslice_3_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, @@ -221,7 +244,7 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): [slice(0, 1, 1), slice(0, 227, 1), slice(0, 227, 1), slice(0, 19, 1)])}, 'sslice_3_data': {'shape': np.array([1, 227, 227, 19])}, - 'concat_1_data': {'shape': np.array([1, 227, 227, 54]), 'is_output': True}, + 'concat_1_data': {'shape': np.array([1, 227, 227, 54])}, }) pattern = ConvertGroupedStridedSlice() @@ -243,7 +266,8 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): ('sslice_1_data', 'concat_1'), ('sslice_2_data', 'concat_1'), ('sslice_3_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, @@ -259,7 +283,7 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): [slice(0, 1, 1), slice(0, 227, 1), slice(0, 227, 1), slice(0, 19, 1)])}, 'sslice_3_data': {'shape': np.array([1, 227, 227, 19])}, - 'concat_1_data': {'shape': np.array([1, 227, 227, 54]), 'is_output': True}, + 'concat_1_data': {'shape': np.array([1, 227, 227, 54])}, }) graph.graph['layout'] = 'NHWC' @@ -274,7 +298,8 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): ('sslice_1_data', 'concat_1'), ('sslice_2_data', 'concat_1'), ('sslice_3_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, @@ -290,7 +315,7 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): [slice(0, 1, 1), slice(0, 227, 1), slice(0, 227, 1), slice(0, 19, 1)])}, 'sslice_3_data': {'shape': np.array([1, 227, 227, 19])}, - 'concat_1_data': {'shape': np.array([1, 227, 227, 54]), 'is_output': True}, + 'concat_1_data': {'shape': np.array([1, 227, 227, 54])}, }) pattern = ConvertGroupedStridedSlice() @@ -315,7 +340,8 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): ('sslice_1_data', 'concat_1'), ('sslice_2_data', 'concat_1'), ('sslice_3_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output'), ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, @@ -331,7 +357,7 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): [slice(0, 1, 1), slice(0, 227, 1), slice(0, 227, 1), slice(1, 19, 1)])}, 'sslice_3_data': {'shape': np.array([1, 227, 227, 18])}, - 'concat_1_data': {'shape': np.array([1, 227, 227, 54]), 'is_output': True}, + 'concat_1_data': {'shape': np.array([1, 227, 227, 54])}, }) graph.graph['layout'] = 'NHWC' @@ -345,7 +371,9 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): ('split_2_data', 'concat_1'), ('split_3_data', 'concat_1'), ('split_4_data', 'concat_1'), - ('concat_1', 'concat_1_data') + 
('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output'), + ('split_1_data', 'op_output_1') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, 'split_1': {'axis': 3}, @@ -353,7 +381,7 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): 'split_2_data': {'shape': np.array([1, 227, 227, 18])}, 'split_3_data': {'shape': np.array([1, 227, 227, 17])}, 'split_4_data': {'shape': np.array([1, 227, 227, 18])}, - 'concat_1_data': {'shape': np.array([1, 227, 227, 54]), 'is_output': True}, + 'concat_1_data': {'shape': np.array([1, 227, 227, 54])}, }) pattern = ConvertGroupedStridedSlice() @@ -376,7 +404,8 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): ('sslice_2', 'sslice_2_data'), ('sslice_1_data', 'concat_1'), ('sslice_2_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, @@ -388,7 +417,7 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): [slice(0, 1, 1), slice(0, 227, 1), slice(0, 227, 1), slice(27, 45, 1)])}, 'sslice_2_data': {'shape': np.array([1, 227, 227, 18])}, - 'concat_1_data': {'shape': np.array([1, 227, 227, 54]), 'is_output': True}, + 'concat_1_data': {'shape': np.array([1, 227, 227, 54])}, }) graph.graph['layout'] = 'NHWC' @@ -401,7 +430,10 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): ('split_1', 'split_4_data'), ('split_1_data', 'concat_1'), ('split_3_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output'), + ('split_2_data', 'op_output_1'), + ('split_4_data', 'op_output_2'), ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, 'split_1': {'axis': 3}, @@ -409,7 +441,7 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): 'split_2_data': {'shape': np.array([1, 227, 227, 9])}, 'split_3_data': {'shape': np.array([1, 227, 227, 18])}, 'split_4_data': {'shape': np.array([1, 227, 227, 9])}, - 'concat_1_data': {'shape': np.array([1, 227, 227, 54]), 'is_output': True}, + 'concat_1_data': {'shape': np.array([1, 227, 227, 54])}, }) pattern = ConvertGroupedStridedSlice() @@ -427,7 +459,8 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): ('sslice_2', 'sslice_2_data'), ('sslice_1_data', 'concat_1'), ('sslice_2_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, @@ -439,7 +472,7 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): [slice(0, 1, 1), slice(10, 227, 1), slice(0, 227, 1), slice(27, 45, 1)])}, 'sslice_2_data': {'shape': np.array([1, 217, 227, 18])}, - 'concat_1_data': {'shape': np.array([1, 227, 227, 54]), 'is_output': True}, + 'concat_1_data': {'shape': np.array([1, 227, 227, 54])}, }) graph.graph['layout'] = 'NHWC' @@ -451,7 +484,8 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): ('sslice_2', 'sslice_2_data'), ('sslice_1_data', 'concat_1'), ('sslice_2_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, @@ -463,7 +497,7 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): [slice(0, 1, 1), slice(10, 227, 1), slice(0, 227, 1), slice(27, 45, 1)])}, 'sslice_2_data': {'shape': np.array([1, 217, 227, 18])}, - 'concat_1_data': {'shape': np.array([1, 227, 227, 54]), 'is_output': True}, + 'concat_1_data': {'shape': 
np.array([1, 227, 227, 54])}, }) pattern = ConvertGroupedStridedSlice() @@ -485,7 +519,8 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): ('sslice_2', 'sslice_2_data'), ('sslice_1_data', 'concat_1'), ('sslice_2_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 54, 54, 3])}, @@ -497,7 +532,7 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): [slice(0, 1, 1), slice(18, 36, 1), slice(0, 54, 1), slice(0, 3, 1)])}, 'sslice_2_data': {'shape': np.array([1, 18, 54, 3])}, - 'concat_1_data': {'shape': np.array([1, 54, 54, 3]), 'is_output': True}, + 'concat_1_data': {'shape': np.array([1, 54, 54, 3])}, }) graph.graph['layout'] = 'NHWC' @@ -509,14 +544,336 @@ class ConvertGroupedStridedSliceTests(unittest.TestCase): ('split_1', 'split_3_data'), ('split_1_data', 'concat_1'), ('split_3_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output'), + ('split_2_data', 'op_output_1') ], {'placeholder_1_data': {'shape': np.array([1, 54, 54, 3])}, 'split_1': {'axis': 1}, 'split_1_data': {'shape': np.array([1, 18, 54, 3])}, 'split_2_data': {'shape': np.array([1, 18, 54, 3])}, 'split_3_data': {'shape': np.array([1, 18, 54, 3])}, - 'concat_1_data': {'shape': np.array([1, 54, 54, 3]), 'is_output': True}, + 'concat_1_data': {'shape': np.array([1, 54, 54, 3])}, + }) + + pattern = ConvertGroupedStridedSlice() + pattern.find_and_replace_pattern(graph) + + (flag, resp) = compare_graphs(graph, graph_ref, 'concat_1_data', check_op_attrs=True) + self.assertTrue(flag, resp) + + +class AddReshapeAfterStridedSliceTests(unittest.TestCase): + def test_ss_1_shrink_last(self): + graph = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'sslice_1'), + ('placeholder_begin_data', 'sslice_1'), + ('placeholder_end_data', 'sslice_1'), + ('placeholder_stride_data', 'sslice_1'), + ('sslice_1', 'sslice_1_data'), + ('sslice_1_data', 'op_output') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, + 'sslice_1': {'slices': np.array([slice(0, 1, 1), slice(0, 227, 1), slice(0, 1, 1), slice(0, 54, 1)]), + 'shrink_axis_mask': [0, 0, 1, 0], + 'new_axis_mask': np.array([0, 0, 0, 0])}, + 'sslice_1_data': {'shape': np.array([1, 227, 54])}, + }) + graph.graph['layout'] = 'NHWC' + + graph_ref = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'sslice_1'), + ('placeholder_begin_data', 'sslice_1'), + ('placeholder_end_data', 'sslice_1'), + ('placeholder_stride_data', 'sslice_1'), + ('sslice_1', 'sslice_1/Reshape_shrink_data'), + ('sslice_1/Reshape_shrink_data', 'sslice_1/Reshape_shrink'), + ('sslice_1/Reshape_shrink', 'sslice_1_data'), + ('sslice_1_data', 'op_output') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, + 'sslice_1': {'slices': np.array( + [slice(0, 1, 1), slice(0, 227, 1), slice(0, 1, 1), slice(0, 54, 1)]), + 'shrink_axis_mask': np.array([0, 0, 0, 0]), + 'new_axis_mask': np.array([0, 0, 0, 0])}, + 'sslice_1_data': {'shape': np.array([1, 227, 54])}, + 'sslice_1/Reshape_shrink': {'dim': np.array([1, 227, 54])}, + 'sslice_1/Reshape_shrink_data': {'shape': np.array([1, 227, 1, 54])} + }) + + pattern = ConvertGroupedStridedSlice() + pattern.add_reshape_for_shrink(graph, Node(graph, 'sslice_1')) + + (flag, resp) = compare_graphs(graph, graph_ref, 'sslice_1_data', check_op_attrs=True) + graph.clear() + 
graph_ref.clear() + self.assertTrue(flag, resp) + + def test_ss_1_shrink(self): + graph = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'sslice_2'), + ('placeholder_begin_data', 'sslice_2'), + ('placeholder_end_data', 'sslice_2'), + ('placeholder_stride_data', 'sslice_2'), + ('sslice_2', 'sslice_2_data'), + ('sslice_2_data', 'placeholder_2'), + ('placeholder_2', 'placeholder_2_data'), + ('sslice_2_data', 'op_output') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, + 'sslice_2': {'slices': np.array([slice(0, 1, 1), slice(0, 227, 1), slice(0, 1, 1), slice(0, 54, 1)]), + 'shrink_axis_mask': [0, 0, 1, 0], + 'new_axis_mask': np.array([0, 0, 0, 0])}, + 'sslice_2_data': {'shape': np.array([1, 227, 54])} + }) + graph.graph['layout'] = 'NHWC' + + graph_ref = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'sslice_2'), + ('placeholder_begin_data', 'sslice_2'), + ('placeholder_end_data', 'sslice_2'), + ('placeholder_stride_data', 'sslice_2'), + ('sslice_2', 'sslice_2/Reshape_shrink_data'), + ('sslice_2/Reshape_shrink_data', 'sslice_2/Reshape_shrink'), + ('sslice_2/Reshape_shrink', 'sslice_2_data'), + ('sslice_2_data', 'placeholder_2'), + ('placeholder_2', 'placeholder_2_data'), + ('sslice_2_data', 'op_output') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, + 'sslice_2': {'slices': np.array([slice(0, 1, 1), slice(0, 227, 1), slice(0, 1, 1), slice(0, 54, 1)]), + 'shrink_axis_mask': np.array([0, 0, 0, 0]), + 'new_axis_mask': np.array([0, 0, 0, 0])}, + 'sslice_2_data': {'shape': np.array([1, 227, 54])}, + 'sslice_2/Reshape_shrink': {'dim': np.array([1, 227, 54])}, + 'sslice_2/Reshape_shrink_data': {'shape': np.array([1, 227, 1, 54])}, + }) + + pattern = ConvertGroupedStridedSlice() + pattern.add_reshape_for_shrink(graph, Node(graph, 'sslice_2')) + + (flag, resp) = compare_graphs(graph, graph_ref, 'sslice_2_data', check_op_attrs=True) + graph.clear() + graph_ref.clear() + self.assertTrue(flag, resp) + + def test_ss_2_shrink(self): + graph = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'sslice_2'), + ('placeholder_begin_data', 'sslice_2'), + ('placeholder_end_data', 'sslice_2'), + ('placeholder_stride_data', 'sslice_2'), + ('sslice_2', 'sslice_2_data'), + ('sslice_2_data', 'placeholder_2'), + ('placeholder_2', 'placeholder_2_data'), + ('sslice_2_data', 'op_output') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, + 'sslice_2': { + 'slices': np.array([slice(0, 1, 1), slice(0, 1, 1), slice(0, 227, 1), slice(0, 1, 1)]), + 'shrink_axis_mask': np.array([0, 1, 0, 1]), + 'new_axis_mask': np.array([0, 0, 0, 0])}, + 'sslice_2_data': {'shape': np.array([1, 227])} + }) + graph.graph['layout'] = 'NHWC' + + graph_ref = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'sslice_2'), + ('placeholder_begin_data', 'sslice_2'), + ('placeholder_end_data', 'sslice_2'), + ('placeholder_stride_data', 'sslice_2'), + ('sslice_2', 'sslice_2/Reshape_shrink_data'), + ('sslice_2/Reshape_shrink_data', 'sslice_2/Reshape_shrink'), + ('sslice_2/Reshape_shrink', 'sslice_2_data'), + ('sslice_2_data', 'placeholder_2'), + ('placeholder_2', 'placeholder_2_data'), + ('sslice_2_data', 'op_output') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, + 'sslice_2': {'slices': np.array( + [slice(0, 1, 1), slice(0, 1, 1), slice(0, 227, 1), slice(0, 1, 1)]), + 
'shrink_axis_mask': np.array([0, 0, 0, 0]), + 'new_axis_mask': np.array([0, 0, 0, 0])}, + 'sslice_2_data': {'shape': np.array([1, 227])}, + 'sslice_2/Reshape_shrink': {'dim': np.array([1, 227])}, + 'sslice_2/Reshape_shrink_data': {'shape': np.array([1, 1, 227, 1])}, + }) + + pattern = ConvertGroupedStridedSlice() + pattern.add_reshape_for_shrink(graph, Node(graph, 'sslice_2')) + + (flag, resp) = compare_graphs(graph, graph_ref, 'sslice_2_data', check_op_attrs=True) + graph.clear() + graph_ref.clear() + self.assertTrue(flag, resp) + + def test_ss_1_new(self): + graph = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'sslice_2'), + ('placeholder_begin_data', 'sslice_2'), + ('placeholder_end_data', 'sslice_2'), + ('placeholder_stride_data', 'sslice_2'), + ('sslice_2', 'sslice_2_data'), + ('sslice_2_data', 'placeholder_2'), + ('placeholder_2', 'placeholder_2_data'), ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, + 'sslice_2': {'slices': np.array( + [slice(0, 1, 1), slice(0, 1, 1), slice(0, 227, 1), slice(0, 227, 1), slice(0, 54, 1)]), + 'shrink_axis_mask': np.array([0, 0, 0, 0, 0]), + 'new_axis_mask': np.array([0, 1, 0, 0, 0])}, + 'sslice_2_data': {'shape': np.array([1, 1, 227, 227, 54])} + }) + graph.graph['layout'] = 'NHWC' + + graph_ref = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'sslice_2'), + ('placeholder_begin_data', 'sslice_2'), + ('placeholder_end_data', 'sslice_2'), + ('placeholder_stride_data', 'sslice_2'), + ('sslice_2', 'sslice_2/Reshape_new_data'), + ('sslice_2/Reshape_new_data', 'sslice_2/Reshape_new'), + ('sslice_2/Reshape_new', 'sslice_2_data'), + ('sslice_2_data', 'placeholder_2'), + ('placeholder_2', 'placeholder_2_data')], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, + 'sslice_2': {'slices': np.array( + [slice(0, 1, 1), slice(0, 1, 1), slice(0, 227, 1), slice(0, 227, 1), + slice(0, 54, 1)]), + 'shrink_axis_mask': np.array([0, 0, 0, 0, 0]), + 'new_axis_mask': np.array([0, 0, 0, 0, 0])}, + 'sslice_2_data': {'shape': np.array([1, 1, 227, 227, 54])}, + 'sslice_2/Reshape_new': {'dim': np.array([1, 1, 227, 227, 54])}, + 'sslice_2/Reshape_new_data': {'shape': np.array([1, 227, 227, 54])}, + }) + + pattern = ConvertGroupedStridedSlice() + pattern.add_reshape_for_new(graph, Node(graph, 'sslice_2')) + + (flag, resp) = compare_graphs(graph, graph_ref, 'sslice_2_data', check_op_attrs=True) + graph.clear() + graph_ref.clear() + self.assertTrue(flag, resp) + + def test_ss_shrink_new(self): + graph = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'sslice_2'), + ('placeholder_begin_data', 'sslice_2'), + ('placeholder_end_data', 'sslice_2'), + ('placeholder_stride_data', 'sslice_2'), + ('sslice_2', 'sslice_2_data'), + ('sslice_2_data', 'placeholder_2'), + ('placeholder_2', 'placeholder_2_data'), + ('sslice_2_data', 'op_output') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, + 'sslice_2': {'slices': np.array( + [slice(0, 1, 1), slice(0, 1, 1), slice(0, 227, 1), slice(0, 1, 1), slice(0, 54, 1)]), + 'shrink_axis_mask': np.array([0, 0, 0, 1, 0]), + 'new_axis_mask': np.array([0, 1, 0, 0, 0])}, + 'sslice_2_data': {'shape': np.array([1, 1, 227, 54])} + }) + graph.graph['layout'] = 'NHWC' + + graph_ref = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'sslice_2'), + ('placeholder_begin_data', 'sslice_2'), + 
('placeholder_end_data', 'sslice_2'), + ('placeholder_stride_data', 'sslice_2'), + ('sslice_2', 'sslice_2/Reshape_new_data'), + ('sslice_2/Reshape_new_data', 'sslice_2/Reshape_new'), + ('sslice_2/Reshape_new', 'sslice_2/Reshape_shrink_data'), + ('sslice_2/Reshape_shrink_data', 'sslice_2/Reshape_shrink'), + ('sslice_2/Reshape_shrink', 'sslice_2_data'), + ('sslice_2_data', 'placeholder_2'), + ('placeholder_2', 'placeholder_2_data'), + ('sslice_2_data', 'op_output') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, + 'sslice_2': {'slices': np.array( + [slice(0, 1, 1), slice(0, 1, 1), slice(0, 227, 1), slice(0, 1, 1), + slice(0, 54, 1)]), + 'shrink_axis_mask': np.array([0, 0, 0, 0, 0]), + 'new_axis_mask': np.array([0, 0, 0, 0, 0])}, + 'sslice_2_data': {'shape': np.array([1, 1, 227, 54])}, + 'sslice_2/Reshape_new': {'dim': np.array([1, 1, 227, 1, 54])}, + 'sslice_2/Reshape_new_data': {'shape': np.array([1, 227, 1, 54])}, + 'sslice_2/Reshape_shrink': {'dim': np.array([1, 1, 227, 54])}, + 'sslice_2/Reshape_shrink_data': {'shape': np.array([1, 1, 227, 1, 54])}, + }) + + pattern = ConvertGroupedStridedSlice() + pattern.add_reshape_for_shrink(graph, Node(graph, 'sslice_2')) + pattern.add_reshape_for_new(graph, Node(graph, 'sslice_2')) + + (flag, resp) = compare_graphs(graph, graph_ref, 'sslice_2_data', check_op_attrs=True) + graph.clear() + graph_ref.clear() + self.assertTrue(flag, resp) + + # test case with 2 strided slices with the same parameters but different outputs + def test_1(self): + graph = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'sslice_1'), + ('sslice_1', 'sslice_1_data'), + ('placeholder_1_data', 'sslice_2'), + ('sslice_2', 'sslice_2_data'), + ('placeholder_1_data', 'sslice_3'), + ('sslice_3', 'sslice_3_data'), + ('sslice_1_data', 'concat_1'), + ('sslice_2_data', 'concat_1'), + ('sslice_3_data', 'placeholder_2'), + ('placeholder_2', 'placeholder_2_data'), + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output'), + ('placeholder_2_data', 'op_output') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, + + 'sslice_1': {'slices': np.array( + [slice(0, 1, 1), slice(0, 227, 1), slice(0, 227, 1), slice(0, 27, 1)])}, + 'sslice_1_data': {'shape': np.array([1, 227, 227, 27])}, + + 'sslice_2': {'slices': np.array( + [slice(0, 1, 1), slice(0, 227, 1), slice(0, 227, 1), slice(27, 54, 1)])}, + 'sslice_2_data': {'shape': np.array([1, 227, 227, 27])}, + + 'sslice_3': {'slices': np.array( + [slice(0, 1, 1), slice(0, 227, 1), slice(0, 227, 1), slice(0, 27, 1)])}, + 'sslice_3_data': {'shape': np.array([1, 227, 227, 27])}, + + 'concat_1_data': {'shape': np.array([1, 227, 227, 54])}, + }) + graph.graph['layout'] = 'NHWC' + + graph_ref = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'split_1'), + ('split_1', 'split_1_data'), + ('split_1', 'split_2_data'), + ('split_1_data', 'concat_1'), + ('split_2_data', 'concat_1'), + ('split_1_data', 'placeholder_2'), + ('placeholder_2', 'placeholder_2_data'), + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output'), + ('placeholder_2_data', 'op_output') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 54])}, + 'split_1': {'axis': 3}, + 'split_1_data': {'shape': np.array([1, 227, 227, 27])}, + 'split_2_data': {'shape': np.array([1, 227, 227, 27])}, + 'concat_1_data': {'shape': np.array([1, 227, 227, 54])}, }) pattern = ConvertGroupedStridedSlice() diff --git 
a/model-optimizer/extensions/middle/ConvertLayoutDependentOperations.py b/model-optimizer/extensions/middle/ConvertLayoutDependentOperations.py index 7f2e87c..9308506 100644 --- a/model-optimizer/extensions/middle/ConvertLayoutDependentOperations.py +++ b/model-optimizer/extensions/middle/ConvertLayoutDependentOperations.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,10 +14,8 @@ limitations under the License. """ -import networkx as nx - from mo.front.common.layout import indices_mapping -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.middle.replacement import MiddleReplacementPattern from mo.ops.op import Op, PermuteAttrs from mo.ops.permute import Permute @@ -32,9 +30,10 @@ class ConvertLayoutDependentOperations(MiddleReplacementPattern): enabled = True def run_after(self): - return [] + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] - def find_and_replace_pattern(self, graph: nx.MultiDiGraph): + def find_and_replace_pattern(self, graph: Graph): for node in list(graph.nodes()): node = Node(graph, node) # Check that node layout mismatch with graph layout diff --git a/model-optimizer/extensions/middle/ConvertMultiInputConv.py b/model-optimizer/extensions/middle/ConvertMultiInputConv.py new file mode 100644 index 0000000..8e5fd53 --- /dev/null +++ b/model-optimizer/extensions/middle/ConvertMultiInputConv.py @@ -0,0 +1,75 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
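A convention worth spelling out once, since every pass in this section uses it: run_after() and run_before() do not execute anything; they only declare edges in a dependency graph of replacers, which the pass loader topologically sorts before running (the PreMiddleStart/MiddleStart/MiddleFinish markers from extensions.middle.pass_separator act as phase anchors). A schematic replacer under that assumption:

from mo.graph.graph import Graph
from mo.middle.replacement import MiddleReplacementPattern


class MySchematicPass(MiddleReplacementPattern):  # hypothetical, for illustration
    enabled = True

    def run_after(self):
        from extensions.middle.pass_separator import MiddleStart
        return [MiddleStart]      # run once the middle phase has started...

    def run_before(self):
        from extensions.middle.pass_separator import MiddleFinish
        return [MiddleFinish]     # ...and before it finishes

    def find_and_replace_pattern(self, graph: Graph):
        pass                      # the actual transformation goes here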
+""" + +import copy + +from mo.graph.graph import Graph, Node +from mo.middle.replacement import MiddleReplacementPattern + + +class ConvertMultiInputConv(MiddleReplacementPattern): + enabled = True + force_clean_up = True + + def run_after(self): + from extensions.middle.pass_separator import PreMiddleStart + return [PreMiddleStart] + + def run_before(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + def pattern(self): + return dict( + nodes=[('op', dict(kind='op', op='ConvND'))], + edges=[] + ) + + def replace_pattern(self, graph: Graph, match: dict): + node = match['op'] + node.op = 'Conv2D' + + if node.bias_term: + num_inputs = len(node.in_nodes()) - 2 + w_node = node.in_node(len(node.in_nodes()) - 2) + b_node = node.in_node(len(node.in_nodes()) - 1) + else: + num_inputs = len(node.in_nodes()) - 1 + w_node = node.in_node(len(node.in_nodes()) - 1) + + for i in range(1, num_inputs): + in_i = node.in_node(i) + out_i = node.out_node(i) + conv_id = graph.unique_id(node.id + '__') + graph.add_node(conv_id, **copy.deepcopy(node.get_attrs())) + new_conv = Node(graph, conv_id) + new_conv.name = conv_id + + graph.remove_edge(in_i.id, node.id) + graph.remove_edge(node.id, out_i.id) + graph.add_edges_from([ + (w_node.id, conv_id, {'in': 1, 'bin': 'weights'}), + ]) + + if node.bias_term: + graph.add_edges_from([ + (b_node.id, conv_id, {'in': 2, 'bin': 'biases'}), + ]) + + graph.add_edges_from([ + (in_i.id, conv_id, {'in': 0}), + ]) + graph.add_edge(conv_id, out_i.id, **{'out': 0}) diff --git a/model-optimizer/extensions/middle/CustomSubgraphCall.py b/model-optimizer/extensions/middle/CustomSubgraphCall.py new file mode 100644 index 0000000..f2eba63 --- /dev/null +++ b/model-optimizer/extensions/middle/CustomSubgraphCall.py @@ -0,0 +1,322 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import copy +import logging as log + +import numpy as np + +from mo.front.common.layout import nhwc_to_nchw_permute +from mo.front.common.partial_infer.utils import int64_array +from mo.front.extractor import update_ie_fields +from mo.graph.graph import Graph +from mo.graph.graph import Node, add_opoutput +from mo.middle.replacement import MiddleReplacementPattern + +nchw_to_nhwc_constant_name = 'IE_NCHW_TO_NHWC' +nhwc_to_nchw_constant_name = 'IE_NHWC_TO_NCHW' + + +class CustomSubgraphCall(MiddleReplacementPattern): + enabled = True + force_clean_up = True + graph_condition = [lambda graph: graph.graph['fw'] == 'tf'] + + def run_after(self): + from extensions.middle.pass_separator import PreMiddleStart + return [PreMiddleStart] + + def run_before(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + @staticmethod + def update_placeholders(graph: Graph): + """ + Iterates over all nodes of the graph, find all TF sub-graph call operations and updates placeholders shapes and adds + transpose operation if necessary. 
+        :param graph: graph to operate on
+        :return: None
+        """
+        for node_name in graph.nodes():
+            node = Node(graph, node_name)
+            if node.kind == 'op' and node.has_valid('op') and node.op == 'TFCustomSubgraphCall':
+                CustomSubgraphCall.update_placeholder_shape_and_add_transpose(node)
+
+    @staticmethod
+    def update_placeholder_shape_and_add_transpose(node: Node):
+        """
+        The function changes placeholder shapes from NHWC to NCHW format and adds transpose operations if needed.
+        :param node: node to operate on.
+        :return: None
+        """
+        import tensorflow as tf
+        from mo.front.common.layout import convert_shape, nhwc_to_nchw_permute, nchw_to_nhwc_permute
+        from mo.front.tf.extractors.utils import tf_tensor_shape
+        from mo.front.tf.partial_infer.tf import add_node_def_to_subgraph, update_input_in_pbs
+
+        tf.reset_default_graph()
+
+        inputs_replacements = list()
+
+        # transpose permutation constants
+        nchw_to_nhwc_constant = tf.constant(nchw_to_nhwc_permute, dtype=tf.int32, name=nchw_to_nhwc_constant_name)
+        nhwc_to_nchw_constant = tf.constant(nhwc_to_nchw_permute, dtype=tf.int32, name=nhwc_to_nchw_constant_name)
+
+        for placeholder_name in node['input_nodes_names']:
+            # dummy node which we can refer to as input in the transpose for the output node;
+            # it should be unique for each placeholder
+            dummy_node = tf.constant(value=[[[[1]]]], dtype=tf.float32, name='random_dummy_name_' + placeholder_name)
+
+            placeholder = node['pbs'][placeholder_name]
+            cur_shape = tf_tensor_shape(placeholder.attr['shape'].shape)
+            if len(cur_shape) == 4:  # TODO think about a better check that a transpose is required
+                nchw_shape = convert_shape(cur_shape, nhwc_to_nchw_permute)
+                for ind in range(len(cur_shape)):
+                    placeholder.attr['shape'].shape.dim[ind].size = nchw_shape[ind]
+                transpose_name = placeholder.name + '_transpose'
+                transpose = tf.transpose(dummy_node, nchw_to_nhwc_constant, transpose_name)  # NCHW -> NHWC
+
+                # add transpose operations to the GraphDef after the placeholders
+                add_node_def_to_subgraph(node, transpose.op.node_def, transpose_name, len(node['input_nodes_names']))
+                inputs_replacements.append((placeholder.name, transpose_name))
+                inputs_replacements.append((dummy_node.name, placeholder.name))
+                node['real_input_dims'].append(nchw_shape)
+            else:
+                node['real_input_dims'].append(cur_shape)
+        add_node_def_to_subgraph(node, nchw_to_nhwc_constant.op.node_def)
+        add_node_def_to_subgraph(node, nhwc_to_nchw_constant.op.node_def)
+
+        # update the initial input names to the transposed ones
+        for old_input_tensor_name, new_name in inputs_replacements:
+            update_input_in_pbs(node, old_input_tensor_name, new_name)
+
+    @staticmethod
+    def add_output_nodes_transposes(graph: Graph):
+        """
+        Iterates over all nodes of the graph, finds all TF sub-graph call operations and adds Transpose operations to the
+        output nodes if they are 4D, to convert the output from NHWC to NCHW.
+        :param graph: graph to operate on
+        :return: None
+        """
+        for node_name in graph.nodes():
+            node = Node(graph, node_name)
+            if node.kind == 'op' and node.has_valid('op') and node.op == 'TFCustomSubgraphCall':
+                CustomSubgraphCall.add_sub_graph_call_output_tensors_transposes(node)
+
+    @staticmethod
+    def make_shape_4d(shape: np.array):
+        """
+        Creates a 4D shape from a 1D, 2D or 3D one by adding new dimensions of size 1.
+        :param shape: shape to extend.
+        :return: the extended 4D shape.
+ """ + new_shape = int64_array(shape) + old_shape_len = len(shape) + + for x in range( + 4 - old_shape_len): # TODO think about proper way to add additional dimensions considering layout + if len( + new_shape) <= 1: # if the shape is 0D or 1D then we should add additional dimensions to batch dimension + new_shape = np.insert(new_shape, 0, 1) + # new_shape = np.array([1, shape[0], 1, 1]) + else: + new_shape = np.insert(new_shape, 1, 1) + return new_shape + + @staticmethod + def add_reshape_before_op_node(graph: Graph, data_node_name: str, op_node_name: str, edge_attrs: dict): + """ + Adds reshape operation which expands dimension of the specified data tensor to 4D. + :param graph: graph to operate on. + :param data_node_name: the name of the data node to be reshaped to 4D tensor. + :param op_node_name: name of the TFCustomSubgraphCall node which produces the tensor. + :param edge_attrs: edge attributes which should be preserved. + :return: None + """ + data_node = Node(graph, data_node_name) + + graph.remove_edge(data_node_name, op_node_name) + + assert data_node['shape'] is not None + + new_shape = CustomSubgraphCall.make_shape_4d(data_node['shape']) + + # reshape shape data node + reshape_shape_data_node_name = graph.unique_id("Reshape_shape_") + graph.add_node(reshape_shape_data_node_name, kind='data', precision="FP32", name=reshape_shape_data_node_name, + value=new_shape, shape=[1]) + + # reshape operation node + reshape_node_name = graph.unique_id("Reshape_") + graph.add_node(reshape_node_name, kind='op', precision="FP32", type='Reshape', name=reshape_node_name, + op='Reshape', + data_type=data_node['data_type']) + update_ie_fields(graph.node[reshape_node_name]) + + # reshaped data node + reshaped_value = None + if data_node['value'] is not None: + reshaped_value = np.reshape(data_node['value'], new_shape) + reshaped_data_node_name = graph.unique_id("reshaped_data_") + graph.add_node(reshaped_data_node_name, kind='data', precision="FP32", name=reshaped_data_node_name, + shape=new_shape, value=reshaped_value, nchw_layout=True) + + graph.add_edges_from([ + (data_node_name, reshape_node_name, {'in': 0}), + (reshape_shape_data_node_name, reshape_node_name, {'in': 1}), + (reshape_node_name, reshaped_data_node_name, {'out': 0}), + (reshaped_data_node_name, op_node_name, edge_attrs) + ]) + + @staticmethod + def add_reshape_after_data_node(graph: Graph, data_node_name: str): + """ + Adds reshape operation which changes shape of the tensor produced by TFSubgraphCall from 4D to real dimension + of the tensor. The data_node_name node contains real dimensions of the tensor but they will be changed in the + add_reshapes_for_tf_subgraph_calls function to a 4D because IE TF call layer supports output in 4D only. + :param graph: graph to operate on. + :param data_node_name: name of the data node to be reshaped to correct dimensions. 
+ :return: None + """ + data_node = Node(graph, data_node_name) + + # if the data node was previously marked as output then we need to mark as output new reshaped data node + is_out_node = False + if len(data_node.out_nodes()) == 1 and data_node.out_node().has('op') and data_node.out_node().op == 'OpOutput': + is_out_node = True + graph.remove_node(data_node.out_node().id) + + # save old consumers nodes with edge attributes + old_consumer_nodes_with_attrs = list() + for index, out_op in enumerate(data_node.out_nodes()): + edge_attrs = graph.get_edge_data(data_node_name, out_op.name)[0] + old_consumer_nodes_with_attrs.append((out_op.name, edge_attrs)) + + # remove old consumers from the data node + for out_op in list(data_node.out_nodes()): + graph.remove_edge(data_node_name, out_op.name) + + # reshape operation node + reshape_node_name = graph.unique_id("Reshape_") + graph.add_node(reshape_node_name, kind='op', precision="FP32", type='Reshape', name=reshape_node_name, + op='Reshape', + data_type=data_node['data_type']) + update_ie_fields(graph.node[reshape_node_name]) + + # reshape shape data node + reshape_shape_data_node_name = graph.unique_id("Reshape_shape_") + graph.add_node(reshape_shape_data_node_name, kind='data', precision="FP32", name=reshape_shape_data_node_name, + value=np.array(data_node['shape']), shape=[1]) + + # reshaped data node + reshaped_value = None + if data_node['value'] is not None: + reshaped_value = np.array(data_node['value']) + reshaped_data_node_name = graph.unique_id("reshaped_data_") + graph.add_node(reshaped_data_node_name, kind='data', precision="FP32", name=reshaped_data_node_name, + shape=np.array(data_node['shape']), value=reshaped_value, nchw_layout=True) + + if is_out_node: + add_opoutput(graph, reshaped_data_node_name, 0, False) + + graph.add_edges_from([ + (data_node_name, reshape_node_name, {'in': 0}), + (reshape_shape_data_node_name, reshape_node_name, {'in': 1}), + (reshape_node_name, reshaped_data_node_name, {'out': 0}), + ]) + + for out_node_name, edge_attrs in old_consumer_nodes_with_attrs: + graph.add_edges_from([ + (reshaped_data_node_name, out_node_name, edge_attrs) + ]) + + @staticmethod + def add_reshapes_for_tf_subgraph_calls(graph: Graph): + """ + Input and output tensors of the TFCustomSubgraphCall must be 4D because IE layer accepts and produces only 4D + tensors. This function adds reshape operations where it is necessary. + :param graph: graph to operate on. + :return: None. 
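For orientation, the two permutations used throughout this pass, with their values as defined in mo.front.common.layout (restated here purely for illustration):

import numpy as np

nhwc_to_nchw_permute = np.array([0, 3, 1, 2])  # N,H,W,C -> N,C,H,W
nchw_to_nhwc_permute = np.array([0, 2, 3, 1])  # N,C,H,W -> N,H,W,C

x = np.zeros((1, 227, 227, 54))                # an NHWC activation tensor
assert x.transpose(nhwc_to_nchw_permute).shape == (1, 54, 227, 227)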
+ """ + for src_node_name, dst_node_name, edge_attrs in list(graph.edges(data=True)): + src_node = Node(graph, src_node_name) + dst_node = Node(graph, dst_node_name) + if dst_node.kind == 'op' and dst_node.has_valid('type') and dst_node.type == 'TFCustomSubgraphCall' and \ + src_node.has_valid('shape') and len(src_node.shape) != 4: + log.info("There is an data tensor of shape '{}' which goes into '{}' node".format( + src_node.shape, dst_node.type)) + CustomSubgraphCall.add_reshape_before_op_node(graph, src_node_name, dst_node_name, edge_attrs) + + for node_name in list(graph.nodes()): + node = Node(graph, node_name) + if node['kind'] == 'op' and node.has_and_set('type') and node.type == 'TFCustomSubgraphCall': + for index, data_node in node.out_nodes().items(): + real_dims_count = len(data_node.shape) + if real_dims_count != 4: + log.info( + "There is an data tensor of shape '{}' with real dims count '{}' which goes out of '{}' " + "node".format(data_node.shape, real_dims_count, node.name)) + CustomSubgraphCall.add_reshape_after_data_node(graph, data_node.id) + + # need to update shape of the op so IE generates XML with 4D tensors + out_shape = CustomSubgraphCall.make_shape_4d(data_node['shape']) + + data_node['shape'] = out_shape + + @staticmethod + def add_sub_graph_call_output_tensors_transposes(node: Node): + """ + Adds transpose operations to the output nodes if they are 4D to change layout from NCHW to NHWC. + :param node: the node to add transposes to the output nodes to. + :return: None + """ + import tensorflow as tf + from mo.front.tf.partial_infer.tf import get_subgraph_output_tensors, add_node_def_to_subgraph + _, output_tensors = get_subgraph_output_tensors(node) + + # transpose permutation constant + nhwc_to_nchw_constant = tf.constant(nhwc_to_nchw_permute, dtype=tf.int32, name=nhwc_to_nchw_constant_name) + + # dummy node which we can refer to as input in the transpose for the output node + dummy_node = tf.constant(value=[[[[1]]]], dtype=tf.float32, name='random_dummy_name') + + new_out_tensor_names = list() + for out_tensor_name in node['output_tensors_names']: + out_name, out_port = out_tensor_name.split(':') + if len(output_tensors[ + int(out_port)].shape) == 4: # TODO think about better check whether transpose is required + out_transpose_name = out_name + '_port_' + out_port + '_transpose' + transpose = tf.transpose(dummy_node, nhwc_to_nchw_constant, name=out_transpose_name) + + # starting from TF 1.8 it is not possible to modify the "node_def" of the "tf.op", so we create a copy, + # update it and use further + new_input_names = transpose.op.node_def.input[:] + new_input_names[0] = out_tensor_name + new_node_def = copy.deepcopy(transpose.op.node_def) + new_node_def.input[:] = new_input_names + add_node_def_to_subgraph(node, new_node_def, position=len(node['nodes_order'])) + new_out_tensor_names.append(out_transpose_name) + else: + new_out_tensor_names.append(out_tensor_name) + + # update output tensor names with transposes operations + node['output_tensors_names'] = new_out_tensor_names + + def find_and_replace_pattern(self, graph: Graph): + CustomSubgraphCall.update_placeholders(graph) + CustomSubgraphCall.add_output_nodes_transposes(graph) + CustomSubgraphCall.add_reshapes_for_tf_subgraph_calls(graph) diff --git a/model-optimizer/extensions/middle/DecomposeBidirectionalRNNSequence.py b/model-optimizer/extensions/middle/DecomposeBidirectionalRNNSequence.py new file mode 100644 index 0000000..5828c16 --- /dev/null +++ 
diff --git a/model-optimizer/extensions/middle/DecomposeBidirectionalRNNSequence.py b/model-optimizer/extensions/middle/DecomposeBidirectionalRNNSequence.py new file mode 100644 index 0000000..5828c16 --- /dev/null +++ b/model-optimizer/extensions/middle/DecomposeBidirectionalRNNSequence.py @@ -0,0 +1,213 @@ +""" + Copyright (c) 2018-2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import numpy as np + +from mo.graph.graph import Node, Graph +from mo.middle.replacement import MiddleReplacementPattern +from mo.ops.concat import Concat +from mo.ops.op import Op +from mo.ops.split import Split + + +class DecomposeBidirectionalRNNSequence(MiddleReplacementPattern): + """ + Decomposes a bidirectional RNNSequence into forward and reverse RNNSequence ops. + + Both initial states are split into two parts, and the two parts of the results are concatenated. + + The split/concat axis is completely defined by the ONNX recurrent layers specification. + """ + enabled = True + + def run_after(self): + from extensions.middle.MXNetRNNSequenceNormalize import MXNetRNNSequenceNormalize + from extensions.middle.ONNXRNNSequenceNormalize import ONNXRNNSequenceNormalize + return [ONNXRNNSequenceNormalize, MXNetRNNSequenceNormalize] + + def pattern(self): + return dict( + nodes=[ + ('lstm', dict(kind='op', type='RNNSequence', direction='bidirectional')), + ('input', dict(kind='data')), + ('W', dict(kind='data')), + ('R', dict(kind='data')), + ('B', dict(kind='data')), + ], + edges=[ + ('input', 'lstm', {'in': 0}), + ('W', 'lstm', {'in': 1}), + ('R', 'lstm', {'in': 2}), + ('B', 'lstm', {'in': 3}), + ] + ) + + @staticmethod + def split_helper(node: Node, index: int, direction: str, axis: int=0): + return Op._create_data_node( + node.graph, + name=node.name + '/SplittedBiLSTM/{}/'.format(direction), + attrs={'value': np.take(node.value, [index], axis), + 'shape': np.array(np.take(node.value, [index], axis).shape, dtype=np.int64)} + ) + + def split_data(self, data: Node): + """ Helper.
Split a data node into two parts along axis 0 """ + assert len(data.shape) == 3 + assert data.shape[0] == 2 + + output_data = [Op._create_data_node(data.graph, + name=data.name + '/SplittedBiLSTM/{}'.format(['forward', 'reverse'][i])) for i in [0, 1]] + split_op = Split(data.graph, dict(name=data.name + '/DecomposedBiLSTM_0', axis=0, num_split=2, + out_ports_count=2)) + return split_op.create_node_with_data([data], data_nodes=output_data) + + def replace_pattern(self, graph: Graph, match: dict): + bidirectional_cell = match['lstm'] + new_init_hiddens = self.split_data(bidirectional_cell.in_node(5)) + new_init_cells = self.split_data(bidirectional_cell.in_node(6)) if 6 in bidirectional_cell.in_nodes()\ + else (None, None) + + blob_bidirectional_split = lambda node: ( + self.split_helper(node, 0, 'forward'), + self.split_helper(node, 1, 'reverse') + ) + + splitted_W = blob_bidirectional_split(bidirectional_cell.in_node(1)) + splitted_R = blob_bidirectional_split(bidirectional_cell.in_node(2)) + splitted_B = blob_bidirectional_split(bidirectional_cell.in_node(3)) + + outputs = self.split_bidirectional( + bidirectional_cell, + new_init_hiddens, + new_init_cells, + splitted_W, + splitted_R, + splitted_B, + ) + + self.concat_outputs(bidirectional_cell, outputs[0], outputs[1], bidirectional_cell.out_nodes()) + + @staticmethod + def get_new_cell(bidirectional_cell: Node, direction: str): + assert direction in ['forward', 'reverse'] + + cell_class = Op.get_op_class_by_name(bidirectional_cell.op) + new_cell = lambda graph, attrs: cell_class(graph, attrs) + attrs = bidirectional_cell.attrs().copy() + new_attrs = { + 'direction': direction, + 'name': bidirectional_cell.name + '/Split/' + direction, + } + attrs.update(new_attrs) + return new_cell(bidirectional_cell.graph, attrs) + + def split_bidirectional(self, + bidirectional_cell: Node, + new_init_hiddens: list, + new_init_cells: list, + splitted_W: tuple, + splitted_R: tuple, + splitted_B: tuple): + """ + Split one bidirectional RNNSequence node into 2 one-directional RNNSequence nodes. + + All input data nodes should be already prepared, i.e. they + have 2 in the num_dir dimension.
+ """ + all_outputs = [] + for i in [0, 1]: + direction = ['forward', 'reverse'][i] + op = self.get_new_cell(bidirectional_cell, direction) + + output_data = Op._create_data_node( + bidirectional_cell.graph, + name=bidirectional_cell.out_node(0).name + '/Split/' + str(i), + attrs={'shape': bidirectional_cell.out_node(0).shape.copy()} + ) + + assert output_data.shape[1] == 2 + output_data.shape[1] = 1 + + output_hidden = Op._create_data_node( + bidirectional_cell.graph, + name=bidirectional_cell.out_node(1).name + '/Split/' + str(i), + attrs={'shape': bidirectional_cell.out_node(1).shape.copy()} + ) + + assert output_hidden.shape[0] == 2 + output_hidden.shape[0] = 1 + + data_nodes = [ + output_data, + output_hidden, + ] + + if bidirectional_cell.op == 'LSTM': + output_cell = Op._create_data_node( + bidirectional_cell.graph, + name=bidirectional_cell.out_node(2).name + '/Split/' + str(i), + attrs={'shape': bidirectional_cell.out_node(2).shape.copy()} + ) + + assert output_cell.shape[0] == 2 + output_cell.shape[0] = 1 + + data_nodes.append(output_cell) + + all_outputs.append( + op.create_node_with_data( + inputs=[ + bidirectional_cell.in_node(0), + splitted_W[i], + splitted_R[i], + splitted_B[i], + None, + new_init_hiddens[i], + new_init_cells[i] if bidirectional_cell.op == 'LSTM' else None, + ], + data_nodes=data_nodes + ) + ) + return all_outputs + + @staticmethod + def concat_outputs(bi_rnn, forward_outputs, reverse_outputs, final_outputs): + """ Concatenates two set of outputs from bidirectiondl RNNSequence nodes """ + concat_ops = [ + Concat(bi_rnn.graph, { + 'name': bi_rnn.name + '/FinalConcat/Data', + 'axis': 1, + 'in_ports_count': 2, + }), + Concat(bi_rnn.graph, { + 'name': bi_rnn.name + '/FinalConcat/HiddenState', + 'axis': 0, + 'in_ports_count': 2, + }), + Concat(bi_rnn.graph, { + 'name': bi_rnn.name + '/FinalConcat/CellState', + 'axis': 0, + 'in_ports_count': 2, + }) + ] + + bi_rnn.graph.remove_node(bi_rnn.id) + + for i in final_outputs: + concat_ops[i].create_node_with_data( + [forward_outputs[i], reverse_outputs[i]], + data_nodes=[final_outputs[i]] + ) diff --git a/model-optimizer/extensions/middle/DeleteControlFlowEdges.py b/model-optimizer/extensions/middle/DeleteControlFlowEdges.py new file mode 100644 index 0000000..4ae88a2 --- /dev/null +++ b/model-optimizer/extensions/middle/DeleteControlFlowEdges.py @@ -0,0 +1,37 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +import logging as log + +from mo.graph.graph import Graph +from mo.middle.replacement import MiddleReplacementPattern + + +class DeleteControlFlowEdges(MiddleReplacementPattern): + enabled = True + + def run_after(self): + from extensions.middle.PartialInfer import PartialInfer + return [PartialInfer] + + def run_before(self): + return [] + + def find_and_replace_pattern(self, graph: Graph): + for u, v, k, attrs in list(graph.edges(keys=True, data=True)): + if 'control_flow_edge' in attrs and attrs['control_flow_edge']: + graph.remove_edge(u, v, k) + log.debug('Removing control flow edge from {} to {}'.format(u, v)) diff --git a/model-optimizer/extensions/middle/DeleteNotExecutable.py b/model-optimizer/extensions/middle/DeleteNotExecutable.py new file mode 100644 index 0000000..157cbe1 --- /dev/null +++ b/model-optimizer/extensions/middle/DeleteNotExecutable.py @@ -0,0 +1,42 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import logging as log + +from mo.graph.graph import Graph +from mo.middle.replacement import MiddleReplacementPattern + + +class DeleteNotExecutable(MiddleReplacementPattern): + enabled = True + force_clean_up = True + + def run_after(self): + from extensions.middle.TensorIteratorConditionChecker import ConditionChecks + return [ConditionChecks] + + def run_before(self): + return [] + + def find_and_replace_pattern(self, graph: Graph): + nodes_to_remove = set() + for node_name, node_attrs in list(graph.nodes(data=True)): + if node_attrs['kind'] == 'data' and 'executable' in node_attrs and not node_attrs['executable']: + [nodes_to_remove.add(op) for op, _ in graph.in_edges(node_name)] + nodes_to_remove.add(node_name) + log.debug('Removing the following not executable nodes: {}' + ''.format('\n'.join(sorted(map(str, nodes_to_remove))))) + graph.remove_nodes_from(nodes_to_remove) diff --git a/model-optimizer/extensions/middle/DepthToSpace.py b/model-optimizer/extensions/middle/DepthToSpace.py index 1e05c8a..6470b23 100644 --- a/model-optimizer/extensions/middle/DepthToSpace.py +++ b/model-optimizer/extensions/middle/DepthToSpace.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,12 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
""" - -import logging as log -from copy import deepcopy - -import networkx as nx - +from mo.front.common.partial_infer.utils import int64_array +from mo.graph.graph import Graph from mo.middle.replacement import MiddleReplacementPattern from mo.ops.permute import Permute from mo.ops.reshape import Reshape @@ -31,6 +27,14 @@ class DepthToSpace(MiddleReplacementPattern): enabled = True + def run_after(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + def run_before(self): + from extensions.middle.pass_separator import MiddleFinish + return [MiddleFinish] + def pattern(self): return dict( nodes=[ @@ -43,7 +47,7 @@ class DepthToSpace(MiddleReplacementPattern): ('op', 'out_data') ]) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): node = match['op'] N, H, W, C = match['in_data'].shape @@ -52,13 +56,17 @@ class DepthToSpace(MiddleReplacementPattern): graph.remove_edge(match['in_data'].id, match['op'].id) graph.remove_edge(match['op'].id, match['out_data'].id) - dim_6D = [N, block_size, block_size, int(C / (block_size ** 2)), H, W] - order_6D = [0, 3, 4, 1, 5, 2] - dim_4D = [N, int(H * block_size), int(W * block_size), int(C / (block_size ** 2))] - - reshape_data_node = Reshape(graph=graph, attrs={'name': match['op'].id + '/Reshape_to_6D', 'dim': dim_6D}).create_node_with_data([match['in_data']]) - permute_data_node = Permute(graph=graph, attrs={'name': match['op'].id + '/Permute', 'order': order_6D}).create_node_with_data([reshape_data_node]) - reshape_node = Reshape(graph=graph, attrs={'infer': None, 'name': match['op'].id + '/Reshape_to_4D', 'dim': dim_4D}).create_node_with_data([permute_data_node], data_nodes=[match['out_data']]) + dim_6D = int64_array([N, block_size, block_size, int(C / (block_size ** 2)), H, W]) + order_6D = int64_array([0, 3, 4, 1, 5, 2]) + dim_4D = int64_array([N, int(H * block_size), int(W * block_size), int(C / (block_size ** 2))]) + + reshape_data_node = Reshape(graph=graph, attrs={'name': match['op'].id + '/Reshape_to_6D', + 'dim': dim_6D}).create_node_with_data([match['in_data']]) + permute_data_node = Permute(graph=graph, attrs={'name': match['op'].id + '/Permute', + 'order': order_6D}).create_node_with_data([reshape_data_node]) + reshape_node = Reshape(graph=graph, attrs={'name': match['op'].id + '/Reshape_to_4D', + 'dim': dim_4D}).create_node_with_data([permute_data_node], + data_nodes=[match['out_data']]) reshape_data_node.in_node()['nchw_layout'] = True reshape_data_node['nchw_layout'] = True diff --git a/model-optimizer/extensions/middle/DilatedConvolution.py b/model-optimizer/extensions/middle/DilatedConvolution.py new file mode 100644 index 0000000..1177624 --- /dev/null +++ b/model-optimizer/extensions/middle/DilatedConvolution.py @@ -0,0 +1,89 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +from mo.graph.graph import Graph +from mo.middle.replacement import MiddleReplacementPattern + + +class DilatedConvolutionConverter(MiddleReplacementPattern): + enabled = True + force_clean_up = True + + def run_after(self): + from extensions.middle.pass_separator import PreMiddleStart + return [PreMiddleStart] + + def run_before(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + def pattern(self): + return dict( + nodes=[ + ('conv', dict(kind='op', op=lambda value: value in ['Conv2D', 'DepthwiseConv2dNative', 'Conv3D'])), + ('space_to_batch', dict(kind='op', op='SpaceToBatchND')), + ('batch_to_space', dict(kind='op', op='BatchToSpaceND')), + ('input', dict(kind='data')), + ('output', dict(kind='data')), + ('conv_output', dict(kind='data')), + ('stb_output', dict(kind='data')), + ('stb_bs', dict(kind='data')), + ('stb_pad', dict(kind='data')), + ('bts_bs', dict(kind='data')), + ('bts_crop', dict(kind='data')) + ], + edges=[ + ('input', 'space_to_batch', {'in': 0}), + ('stb_bs', 'space_to_batch', {'in': 1}), + ('stb_pad', 'space_to_batch', {'in': 2}), + ('space_to_batch', 'stb_output', {'out': 0}), + ('stb_output', 'conv', {'in': 0}), + ('conv', 'conv_output', {'out': 0}), + ('conv_output', 'batch_to_space', {'in': 0}), + ('bts_bs', 'batch_to_space', {'in': 1}), + ('bts_crop', 'batch_to_space', {'in': 2}), + ('batch_to_space', 'output', {'out': 0}), + ]) + + def replace_pattern(self, graph: Graph, match: dict): + conv = match['conv'] + stb = match['space_to_batch'] + bts = match['batch_to_space'] + + block_size = match['stb_bs'] + + input = match['input'] + output = match['output'] + stb_out = match['stb_output'] + conv_out = match['conv_output'] + + in_edge_attrs = graph.get_edge_data(input.id, stb.id)[0] + out_edge_attrs = graph.get_edge_data(bts.id, output.id)[0] + + graph.remove_edge(input.id, stb.id) + graph.remove_edge(stb_out.id, conv.id) + graph.remove_edge(conv.id, conv_out.id) + graph.remove_edge(bts.id, output.id) + + conv.dilation[conv.spatial_dims] = block_size.value + + pad = match['stb_pad'].value - match['bts_crop'].value + conv.pad[conv.spatial_dims] = [[pad[x][0], pad[x][1]] for x in range(len(pad))] + conv['auto_pad'] = None + + graph.add_edges_from([ + (input.id, conv.id, {'in': 0, **in_edge_attrs}), + (conv.id, output.id, {'out': 0, **out_edge_attrs}), + ]) diff --git a/model-optimizer/extensions/middle/EltwiseChecker.py b/model-optimizer/extensions/middle/EltwiseChecker.py index 751f5c7..abbcd8b 100644 --- a/model-optimizer/extensions/middle/EltwiseChecker.py +++ b/model-optimizer/extensions/middle/EltwiseChecker.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,18 +14,10 @@ limitations under the License. 
""" -import networkx as nx import numpy as np -from copy import deepcopy -from extensions.middle.EltwiseInputNormalization import EltwiseInputNormalize -from extensions.middle.EltwiseInputReshape import EltwiseInputReshape, Eltwise1DInputReshape -from mo.front.common.layout import get_features_dim, shape_for_layout -from mo.graph.graph import Node, get_sorted_inputs -from mo.middle.passes.fusing.helpers import get_value_id +from mo.graph.graph import Node, Graph from mo.middle.replacement import MiddleReplacementPattern -from mo.ops.op import Op -from mo.ops.reshape import Reshape class EltwiseChecker(MiddleReplacementPattern): @@ -33,12 +25,17 @@ class EltwiseChecker(MiddleReplacementPattern): enabled = True def run_after(self): + from extensions.middle.EltwiseInputReshape import Eltwise1DInputReshape return [Eltwise1DInputReshape] - def find_and_replace_pattern(self, graph: nx.MultiDiGraph): + def run_before(self): + from extensions.middle.pass_separator import MiddleFinish + return [MiddleFinish] + + def find_and_replace_pattern(self, graph: Graph): eltwise_nodes = [Node(graph, node) for node in graph.node if Node(graph, node).soft_get('type') == 'Eltwise'] for node in eltwise_nodes: - raw_inputs = [(inp, attr) for inp, attr in get_sorted_inputs(node) + raw_inputs = [(inp, attr) for inp, attr in node.get_sorted_inputs() if 'control_flow_edge' not in attr or not attr['control_flow_edge']] shapes = [node.graph.node[inp]['shape'] for inp, attr in raw_inputs] diff --git a/model-optimizer/extensions/middle/EltwiseInputNormalization.py b/model-optimizer/extensions/middle/EltwiseInputNormalization.py index 6f5687e..c7fe206 100644 --- a/model-optimizer/extensions/middle/EltwiseInputNormalization.py +++ b/model-optimizer/extensions/middle/EltwiseInputNormalization.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,22 +16,17 @@ import networkx as nx import numpy as np -from copy import deepcopy from extensions.middle.EltwiseInputReshape import EltwiseInputReshape -from mo.front.common.layout import get_features_dim, shape_for_layout -from mo.graph.graph import Node -from mo.middle.passes.fusing.helpers import get_value_id +from mo.graph.graph import Node, Graph from mo.middle.replacement import MiddleReplacementPattern -from mo.ops.op import Op -from mo.ops.reshape import Reshape class EltwiseInputNormalize(EltwiseInputReshape, MiddleReplacementPattern): # This pass should be called directly from pipeline before layout change and other permutations enabled = False - def find_and_replace_pattern(self, graph: nx.MultiDiGraph): + def find_and_replace_pattern(self, graph: Graph): eltwise_nodes = [Node(graph, node) for node in graph.node if Node(graph, node).soft_get('type') == 'Eltwise'] # Iterate over all Eltwise operations and check that every input has a similar shape # in case of different shapes, we insert the new_shape attribute and then call the EltwiseInputReshape extension diff --git a/model-optimizer/extensions/middle/EltwiseInputNormalization_test.py b/model-optimizer/extensions/middle/EltwiseInputNormalization_test.py index 829b13b..1608d21 100644 --- a/model-optimizer/extensions/middle/EltwiseInputNormalization_test.py +++ b/model-optimizer/extensions/middle/EltwiseInputNormalization_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
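The new_shape mechanism referenced in the comment above aligns lower-rank eltwise inputs so that broadcasting happens on the intended axis. A hedged numpy sketch of the effect (hypothetical NCHW shapes):

import numpy as np

# A 4D NCHW tensor and a per-channel 1D constant feeding one Eltwise op
data = np.zeros((1, 64, 28, 28))
scale = np.ones(64)

# To broadcast over H and W in NCHW, the 1D input is viewed with its only
# meaningful dimension on the C axis -- the effect the inserted Reshape
# node with the computed new_shape achieves
new_shape = np.array([1, 64, 1, 1], dtype=np.int64)
result = data * scale.reshape(new_shape)

assert result.shape == (1, 64, 28, 28)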
""" -import networkx as nx -import numpy as np from copy import deepcopy + +import numpy as np + from mo.front.common.layout import get_features_dim, shape_for_layout -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.middle.passes.fusing.helpers import get_value_id from mo.middle.replacement import MiddleReplacementPattern from mo.ops.op import Op @@ -46,7 +47,7 @@ class Eltwise1DInputReshape(MiddleReplacementPattern): def run_after(self): return [EltwiseInputReshape] - def find_and_replace_pattern(self, graph: nx.MultiDiGraph): + def find_and_replace_pattern(self, graph: Graph): layout = graph.graph['layout'] for n in list(graph.nodes()): if 'type' in graph.node[n] and graph.node[n]['type'] == 'Eltwise' and get_value_id(Node(graph, n)) is None: @@ -68,7 +69,11 @@ class Eltwise1DInputReshape(MiddleReplacementPattern): class EltwiseInputReshape(MiddleReplacementPattern): enabled = True - def find_and_replace_pattern(self, graph: nx.MultiDiGraph): + def run_after(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + def find_and_replace_pattern(self, graph: Graph): data_nodes = [Node(graph, node) for node in graph.node if Node(graph, node).kind == 'data'] for node in data_nodes: # Get all requested shapes for current node @@ -113,4 +118,4 @@ class EltwiseInputReshape(MiddleReplacementPattern): # Reconnect edge from original data node to Reshape output datanode graph.remove_edge(node.id, consumer.id) - graph.add_edge(reshape_data.id, consumer.id, **edge_attrs) \ No newline at end of file + graph.add_edge(reshape_data.id, consumer.id, **edge_attrs) diff --git a/model-optimizer/extensions/middle/EltwiseInputReshape_test.py b/model-optimizer/extensions/middle/EltwiseInputReshape_test.py index 24c727d..abf3790 100644 --- a/model-optimizer/extensions/middle/EltwiseInputReshape_test.py +++ b/model-optimizer/extensions/middle/EltwiseInputReshape_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/middle/FusePermutesSequence.py b/model-optimizer/extensions/middle/FusePermutesSequence.py index ea5c1c1..a230c6b 100644 --- a/model-optimizer/extensions/middle/FusePermutesSequence.py +++ b/model-optimizer/extensions/middle/FusePermutesSequence.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,12 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
""" - -import networkx as nx import numpy as np from extensions.middle.ConvertLayoutDependentOperations import ConvertLayoutDependentOperations -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.middle.passes.eliminate import merge_data_nodes, graph_clean_up_tf from mo.middle.passes.fusing.helpers import get_next_operation from mo.middle.replacement import MiddleReplacementPattern @@ -32,11 +30,12 @@ class FusePermutesSequence(MiddleReplacementPattern): """ enabled = True + graph_condition = [lambda graph: graph.graph['fw'] != 'caffe'] def run_after(self): return [ConvertLayoutDependentOperations] - def find_and_replace_pattern(self, graph: nx.MultiDiGraph): + def find_and_replace_pattern(self, graph: Graph): for node in list(graph.nodes()): if node not in graph.nodes(): continue diff --git a/model-optimizer/extensions/middle/FusePermutesSequence_test.py b/model-optimizer/extensions/middle/FusePermutesSequence_test.py index 850cf17..d852186 100644 --- a/model-optimizer/extensions/middle/FusePermutesSequence_test.py +++ b/model-optimizer/extensions/middle/FusePermutesSequence_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ import unittest import numpy as np +from argparse import Namespace from extensions.middle.FusePermutesSequence import FusePermutesSequence from mo.middle.passes.eliminate_test import build_graph @@ -38,6 +39,7 @@ nodes_attributes = { 'permute_3': {'type': 'Permute', 'value': None, 'kind': 'op', 'op': 'Permute'}, 'permute_3_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'op_output': { 'op': 'OpOutput', 'kind': 'op'} } @@ -52,7 +54,8 @@ class FusePermutesSequenceTest(unittest.TestCase): ('placeholder_1_data', 'permute_1'), ('permute_1', 'permute_1_data'), ('permute_1_data', 'permute_2'), - ('permute_2', 'permute_2_data') + ('permute_2', 'permute_2_data'), + ('permute_2_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -60,14 +63,18 @@ class FusePermutesSequenceTest(unittest.TestCase): 'permute_1_data': {'shape': np.array([1, 3, 227, 227])}, 'permute_2': {'order': np.array([0, 2, 3, 1])}, - 'permute_2_data': {'shape': np.array([1, 227, 227, 3]), 'is_output': True}, + 'permute_2_data': {'shape': np.array([1, 227, 227, 3])}, }, nodes_with_edges_only=True) graph.graph['layout'] = 'NHWC' + graph.graph['cmd_params'] = Namespace(keep_shape_ops=False) graph_ref = build_graph(nodes_attributes, - [('placeholder_1', 'placeholder_1_data')], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}}, nodes_with_edges_only=True) + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'op_output') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}}, + nodes_with_edges_only=True) pattern = FusePermutesSequence() pattern.find_and_replace_pattern(graph) @@ -84,7 +91,8 @@ class FusePermutesSequenceTest(unittest.TestCase): ('placeholder_1_data', 'permute_1'), ('permute_1', 'permute_1_data'), ('permute_1_data', 'permute_2'), - ('permute_2', 'permute_2_data') + ('permute_2', 'permute_2_data'), + ('permute_2_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -92,20 +100,22 @@ class FusePermutesSequenceTest(unittest.TestCase): 'permute_1_data': {'shape': np.array([1, 3, 227, 227])}, 'permute_2': {'order': np.array([0, 1, 2, 3])}, - 
'permute_2_data': {'shape': np.array([1, 3, 227, 227]), 'is_output': True}, + 'permute_2_data': {'shape': np.array([1, 3, 227, 227])}, }, nodes_with_edges_only=True) graph.graph['layout'] = 'NHWC' + graph.graph['cmd_params'] = Namespace(keep_shape_ops=False) graph_ref = build_graph(nodes_attributes, - [('placeholder_1', 'placeholder_1_data'), - ('placeholder_1_data', 'permute_1'), - ('permute_1', 'permute_1_data'), - ], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, - 'permute_1': {'order': np.array([0, 3, 1, 2])}, - 'permute_1_data': {'shape': np.array([1, 3, 227, 227])}, - }, nodes_with_edges_only=True) + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'permute_1'), + ('permute_1', 'permute_1_data'), + ('permute_1_data', 'op_output') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'permute_1': {'order': np.array([0, 3, 1, 2])}, + 'permute_1_data': {'shape': np.array([1, 3, 227, 227])}, + }, nodes_with_edges_only=True) pattern = FusePermutesSequence() pattern.find_and_replace_pattern(graph) diff --git a/model-optimizer/extensions/middle/FusedBatchNormNonConstant.py b/model-optimizer/extensions/middle/FusedBatchNormNonConstant.py index 6b0ed8e..d3a84ab 100644 --- a/model-optimizer/extensions/middle/FusedBatchNormNonConstant.py +++ b/model-optimizer/extensions/middle/FusedBatchNormNonConstant.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,9 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. """ - -import networkx as nx - +from mo.graph.graph import Graph from mo.middle.replacement import MiddleReplacementPattern from mo.ops.eltwise import Eltwise from mo.ops.power import Power @@ -30,6 +28,14 @@ class FusedBatchNormNonConstant(MiddleReplacementPattern): enabled = True + def run_after(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + def run_before(self): + from extensions.middle.pass_separator import MiddleFinish + return [MiddleFinish] + def pattern(self): return dict( nodes=[ @@ -37,7 +43,7 @@ class FusedBatchNormNonConstant(MiddleReplacementPattern): edges=[] ) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): node = match['op'] if (node.data_format != b'NHWC' or len(node.in_nodes()) != 5 or diff --git a/model-optimizer/extensions/middle/FusedBatchNormTrainingCatch.py b/model-optimizer/extensions/middle/FusedBatchNormTrainingCatch.py index 90fedc9..93749e3 100644 --- a/model-optimizer/extensions/middle/FusedBatchNormTrainingCatch.py +++ b/model-optimizer/extensions/middle/FusedBatchNormTrainingCatch.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,8 +14,7 @@ limitations under the License. 
""" -import networkx as nx - +from mo.graph.graph import Graph from mo.middle.replacement import MiddleReplacementPattern from mo.utils.error import Error @@ -30,6 +29,14 @@ class FusedBatchNormTrainingCatch(MiddleReplacementPattern): enabled = True replacement_id = "Fused_Batch_Norm_is_training_true_catcher" + def run_after(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + def run_before(self): + from extensions.middle.pass_separator import MiddleFinish + return [MiddleFinish] + def pattern(self): return dict( nodes=[ @@ -37,5 +44,5 @@ class FusedBatchNormTrainingCatch(MiddleReplacementPattern): edges=[] ) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): raise Error('FusedBatchNorm doesn\'t support is_training=True. Node {}'.format(match['op'].id)) diff --git a/model-optimizer/extensions/middle/GRURNNSequenceToTensorIterator.py b/model-optimizer/extensions/middle/GRURNNSequenceToTensorIterator.py new file mode 100644 index 0000000..1b2bdc6 --- /dev/null +++ b/model-optimizer/extensions/middle/GRURNNSequenceToTensorIterator.py @@ -0,0 +1,223 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import numpy as np + +from extensions.ops.tensor_iterator import TensorIterator +from mo.graph.graph import Graph, add_opoutput +from mo.middle.replacement import MiddleReplacementPattern +from mo.ops.op import Op +from mo.ops.reshape import Reshape + + +class GRUAndRNNToTensorIterator(MiddleReplacementPattern): + """ Converts normalized RNNSequence with op=GRU/RNN to TensorIterator. + + Normalized RNNSequence means that it should be processed by + RNNSequenceNormalize transform that ensures its strict form. + + This transformation builds an alternative sub-graph for GRUSequence + with TensorIterator connected in the same way as an original GRUSequence + node and with internal body represented as GRUCell op node with necessary + squeezes and unsqueezes around. 
+ """ + + enabled = True + id = 'gru_and_rnn_to_tensor_iterator' + + def run_after(self): + from extensions.middle.RNNSequenceNormalizeToIE import RNNSequenceNormalize + return [RNNSequenceNormalize] + + def run_before(self): + from extensions.middle.FusePermutesSequence import FusePermutesSequence + return [FusePermutesSequence] + + def pattern(self): + return dict( + nodes=[ + ('rnn_layer', dict(kind='op', type='RNNSequence')), + ('input', dict(kind='data')), + ('weights', dict(kind='data')), + ('biases', dict(kind='data')), + # don't capture optional input initial states here + ('output', dict(kind='data')), + # don't capture optional output last states here + ], + edges=[ + ('input', 'rnn_layer', {'in': 0}), + ('weights', 'rnn_layer', {'bin': 'weights', 'in': 1}), + ('biases', 'rnn_layer', {'bin': 'biases', 'in': 2}), + ('rnn_layer', 'output', {'out': 0}), + ] + ) + + @staticmethod + def get_rnn_cell(name: str): + op = Op.get_op_class_by_name(name + 'Cell') + return op + + def replace_pattern(self, graph: Graph, match: dict): + if match['rnn_layer']['op'] == 'LSTM': + return + + rnn_layer = match['rnn_layer'] + + # Build TensorIterator body first + body = Graph(name=rnn_layer.name + '/sub_graph') + body.graph = graph.graph + + # 1. Input squeeze Reshape + inputs = [Op._create_data_node(body, rnn_layer.name + '/inport/' + str(inp), + {'shape': rnn_layer.in_node(inp).shape.copy(), + 'value': rnn_layer.in_node(inp).value.copy() + if rnn_layer.in_node(inp).value is not None and inp in [1, 2] else None}) + for inp in [0, 4, 1, 2]] # X, h_init, WR, B + + inputs[0].shape[rnn_layer.sequence_dim] = 1 + reshape_dim = inputs[0].shape.copy() + reshape_dim[rnn_layer.batch_dim] = -1 + reshape_dim = np.delete(reshape_dim, rnn_layer.sequence_dim) + input_squeeze = Reshape( + body, + dict(name=rnn_layer.name + '/input_squeeze', internal_layer_id=0, dim=reshape_dim) + ) + inputs[0] = input_squeeze.create_node_with_data([inputs[0]], edge_attrs=[{'internal_port_id': 0}]) + + # 2. Output unsqueeze Reshape + outputs = [Op._create_data_node(body, rnn_layer.name + '/outport/' + str(out), + {'shape': rnn_layer.out_node(out).shape.copy() if out in rnn_layer.out_nodes() else None}) + for out in [0]] + for out in outputs: + add_opoutput(body, out.id, 0, False) + + unsqueezed_output_shape = outputs[0].shape.copy() + unsqueezed_output_shape[rnn_layer.sequence_dim] = 1 + squeezed_output_shape = np.delete(unsqueezed_output_shape, rnn_layer.sequence_dim) + outputs[0].shape = squeezed_output_shape + unsqueezed_output_shape[rnn_layer.batch_dim] = -1 + output_unsqueeze = Reshape(body, dict(name=rnn_layer.name + '/output_unsqueeze/', dim=unsqueezed_output_shape, + internal_layer_id=2)) + + additional_attrs = dict(activations=rnn_layer.activations, + activation_alpha=rnn_layer.activation_alpha, + activation_beta=rnn_layer.activation_beta, + clip=rnn_layer.clip) + if rnn_layer.op == 'GRU': + additional_attrs['linear_before_reset'] = rnn_layer.linear_before_reset + + # 3. 
GRUCell / RNNCell + rnn_cell_op = self.get_rnn_cell(rnn_layer['op'])(body, dict(hidden_size=rnn_layer.hidden_size, + name=rnn_layer.name + '/{}Cell'.format(rnn_layer.op), + **additional_attrs, + internal_layer_id=1)) + + gru_cell = rnn_cell_op.create_node_with_data(inputs, data_nodes=outputs, + edge_attrs=[{}, {'internal_port_id': 1}, + {'internal_port_id': 2}, {'bin': 'weights'}, + {'bin': 'biases'}]) + + # internal ports for outputs of cell + gru_cell.in_node().out_edge(0)['internal_port_id'] = 4 # h_state + + gru_cell = output_unsqueeze.create_node_with_data([gru_cell]) + gru_cell.in_node().out_edge(0)['internal_port_id'] = 3 + add_opoutput(body, gru_cell.id, 0, False) + + # 4. TensorIterator layer creating + assert rnn_layer.direction in ['forward', 'reverse'] + if rnn_layer.direction == 'forward': + stride = 1 + start = None + end = None + else: + assert rnn_layer.direction == 'reverse' + stride = -1 + start = -1 + end = 0 + + # stacked h_state + output_port_map = [{ + 'external_port_id': 3, + 'internal_layer_id': 2, + 'internal_port_id': 3, + + 'axis': rnn_layer.sequence_dim, + 'stride': stride, + 'start': start, + 'end': end, + 'part_size': 1, + }] + + # Adding last h_state to outputs + if len(rnn_layer.out_nodes()) == 2: + output_port_map.extend([{ + 'external_port_id': 4, + 'internal_layer_id': 1, + 'internal_port_id': 4, + }]) + + ti_op = TensorIterator(graph, { + 'name': rnn_layer.name + '/TensorIterator', + 'body': body, + 'in_ports_count': 4, + 'out_ports_count': len(rnn_layer.out_nodes()), + + 'input_port_map': [ + { + 'external_port_id': 0, + 'internal_layer_id': 0, + 'internal_port_id': 0, + + 'axis': rnn_layer.sequence_dim, + 'stride': stride, + 'start': start, + 'end': end, + 'part_size': 1, + }, + { + 'external_port_id': 1, + 'internal_layer_id': 1, + 'internal_port_id': 1, + }, + ], + + 'output_port_map': output_port_map, + # only for h state + 'back_edges': [ + { + 'from_layer': 1, + 'from_port': 4, + 'to_layer': 1, + 'to_port': 1, + }, + ] + }) + + assert sorted(rnn_layer.out_nodes().keys()) == list(range(len(rnn_layer.out_nodes()))), \ + "There are gaps in output ports of GRUSequence operation. Node {}".format(rnn_layer.id) + + outs = ti_op.create_node_with_data([rnn_layer.in_node(i) for i in [0, 4]], # X, h_init + data_nodes=[rnn_layer.out_node(i) for i in range(len(rnn_layer.out_nodes()))], + edge_attrs=[{'external_port_id': 0}, {'external_port_id': 1}]) + + if not isinstance(outs, list): + outs = list([outs]) + + graph.remove_node(rnn_layer.id) + outs[0].in_edge(0)['external_port_id'] = 3 + for i, out in enumerate(outs[1:]): + external_port_id = 4 + i + out.in_edge()['external_port_id'] = external_port_id
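The input/output port maps above tell TensorIterator how to slice the sequence tensor: axis selects the dimension to iterate over, part_size the slice width, and stride/start/end the direction. Loosely, in numpy terms (a sketch with illustrative values):

import numpy as np

seq = np.arange(10)  # 10 steps along the iteration axis

# forward: stride 1, start/end unset -- steps 0, 1, ..., 9
forward_steps = [seq[i:i + 1] for i in range(10)]           # part_size = 1

# reverse: stride -1, start -1, end 0 -- steps 9, 8, ..., 0
reverse_steps = [seq[i:i + 1] for i in range(9, -1, -1)]

assert forward_steps[0][0] == 0 and reverse_steps[0][0] == 9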
+""" +import logging as log + +import numpy as np + +from extensions.ops.gather import Gather +from mo.front.common.partial_infer.utils import int64_array +from mo.graph.graph import Graph +from mo.middle.replacement import MiddleReplacementPattern +from mo.ops.const import Const +from mo.ops.reshape import Reshape + + +class GatherNdNormalize(MiddleReplacementPattern): + """ + Hot fix for new speech-to-text model enabling while GatherND is not implemented in IE. + We can replace GatherNd to Reshape + Gather in case when GatherNd indices have just one + meaningful dimension. + """ + enabled = True + force_clean_up = True + + def run_before(self): + from extensions.middle.BlockLSTMtoLSTMSequence import BlockLSTMtoLSTMSequence + return [BlockLSTMtoLSTMSequence] + + def run_after(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + def pattern(self): + return dict( + nodes=[('GatherNd', dict(kind='op', op='GatherNd'))], + edges=[] + ) + + @staticmethod + def indices_check(indices: np.array, input_shape: tuple): + """ + Check that indices have just one meaningful dimension and all other dimensions of input have size 1. + """ + n_dims = indices.shape[-1] + non_zero = None + for i in range(n_dims): + if not all(np.take(indices, indices=[i], axis=-1) == 0): + if non_zero is None: + non_zero = i + else: + return None + else: + if input_shape[i] != 1: + return None + return non_zero + + def replace_pattern(self, graph: Graph, match: dict): + gather = match['GatherNd'] + input_shape = gather.in_node(0).shape + indices = gather.in_node(1).value + if indices is None: + # We can't do such special pass without indices value + return + + # 0. All needed checks that we can replace GatherNd by Gather + gather_idx = self.indices_check(indices, input_shape) + if gather_idx is None: + log.warning('Node {} with op=GatherNd can\'t be normalized to op=Gather.'.format(gather.name)) + return + + # 1. Add Reshape and connect + new_shape = int64_array([-1] + list(input_shape[indices.shape[-1]:])) + reshape = Reshape(graph, {'name': gather.name + '/Reshape_for_GatherNd/', 'dim': new_shape, }).create_node() + gather.in_port(0).get_connection().set_destination(reshape.in_port(0)) + + # 2. Change indices from Nd to 1d: + new_indices = np.reshape(np.take(indices, indices=[gather_idx], axis=-1), [-1]) + new_indices_const = Const(graph, dict(value=new_indices)).create_node() + + # 3. Create new Gather operation and reconnect all inputs/outputs + new_gather = Gather(graph, {'name': gather.name + '/NewGather/', 'axis': 0}).create_node() + reshape.out_port(0).connect(new_gather.in_port(0)) + new_indices_const.out_port(0).connect(new_gather.in_port(1)) + + gather.out_port(0).get_connection().set_source(new_gather.out_port(0)) + + # 4. Remove old Gather node + graph.remove_node(gather.id) diff --git a/model-optimizer/extensions/middle/GemmResolver.py b/model-optimizer/extensions/middle/GemmResolver.py index 29a39b9..edef22a 100644 --- a/model-optimizer/extensions/middle/GemmResolver.py +++ b/model-optimizer/extensions/middle/GemmResolver.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,27 +14,31 @@ limitations under the License. 
""" -import networkx as nx - -from extensions.middle.NormalizeFullyConnected import NormalizeFullyConnected from mo.front.common.partial_infer.utils import mark_input_bins, assign_dims_to_weights, int64_array +from mo.graph.graph import Graph from mo.middle.replacement import MiddleReplacementPattern from mo.ops.op import PermuteAttrs class GemmResolver(MiddleReplacementPattern): enabled = True + graph_condition = [lambda graph: graph.graph['fw'] != 'tf'] def run_before(self): + from extensions.middle.NormalizeFullyConnected import NormalizeFullyConnected return [NormalizeFullyConnected] + def run_after(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + def pattern(self): return dict( nodes=[ - ('input_0', dict(kind='data')), - ('input_1', dict(kind='data')), - ('fc', dict(op='MatMul')), - ('fc_data', dict(kind='data'))], + ('input_0', dict(kind='data')), + ('input_1', dict(kind='data')), + ('fc', dict(op='MatMul')), + ('fc_data', dict(kind='data'))], edges=[ ('input_0', 'fc', {'in': 0}), ('input_1', 'fc', {'in': 1}), @@ -42,9 +46,10 @@ class GemmResolver(MiddleReplacementPattern): ] ) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): if not match['input_0'].has_valid('value') and not match['input_1'].has_valid('value') or \ - not match['input_0'].has_valid('value') and match['input_1'].has_valid('value') and match['input_1'].shape.size > 2: + not match['input_0'].has_valid('value') and match['input_1'].has_valid('value') and match[ + 'input_1'].shape.size > 2: match['fc']['type'] = 'GEMM' elif not match['input_0'].has_valid('value') and match['input_1'].has_valid('value'): match['fc']['type'] = 'FullyConnected' @@ -57,6 +62,3 @@ class GemmResolver(MiddleReplacementPattern): weights_shape = weights_node.shape node['out-size'] = weights_shape[1] - - - diff --git a/model-optimizer/extensions/middle/GemmToFullyConnected.py b/model-optimizer/extensions/middle/GemmToFullyConnected.py new file mode 100644 index 0000000..1cba6b4 --- /dev/null +++ b/model-optimizer/extensions/middle/GemmToFullyConnected.py @@ -0,0 +1,88 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +import logging as log + +import numpy as np + +from typing import Dict +from mo.front.common.partial_infer.utils import assign_dims_to_weights +from mo.graph.graph import Graph, Node +from mo.middle.replacement import MiddleReplacementPattern +from mo.ops.lin_op import Add + + +class GemmToFullyConnected(MiddleReplacementPattern): + enabled = True + graph_condition = [lambda graph: graph.graph['fw'] == 'onnx'] + + def run_after(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + def run_before(self): + from extensions.middle.pass_separator import MiddleFinish + return [MiddleFinish] + + def pattern(self): + return dict( + nodes=[ + ('gemm', dict(kind='op', op='Gemm')), + ('output', dict(kind='data'))], + edges=[('gemm', 'output')] + ) + + def replace_pattern(self, graph: Graph, match: Dict[str, Node]): + log.debug('GemmToFullyConnected is triggered') + gemm = match['gemm'] + A = gemm.in_node(0) + B = gemm.in_node(1) + B_consumers = graph.out_edges(B.node) + C = gemm.in_node(2) + + if not (B.value is not None and + C.value is not None and + A.shape is not None and + not gemm.transpose_a and + (len(B_consumers) == 1 or not gemm.transpose_b)): + log.warning('Cannot convert Gemm to FullyConnected') + return + + if gemm.transpose_b: + # B.value = B.value.transpose() + # B.shape = np.array(B.value.shape, dtype=np.int64) + gemm.transpose_b = 0 + else: + B.value = B.value.transpose() + B.shape = np.array(B.value.shape, dtype=np.int64) + + gemm['out-size'] = gemm.out_port(0).data.get_shape()[-1] + gemm['type'] = 'FullyConnected' + gemm['channel_dims'] = len(match['output'].shape) - 1 + gemm['bias_addable'] = True + gemm['input_channel_dim'] = 1 # MatMul weights in IO + gemm['output_channel_dim'] = 0 + gemm['layout'] = 'NCHW' + + gemm.in_port(1).bin = 'weights' + + bias_node = Add(graph, {}).create_node() + gemm.out_port(0).get_connection().set_source(bias_node.out_port(0)) + gemm.in_port(2).get_connection().set_destination(bias_node.in_port(1)) + gemm.out_port(0).connect(bias_node.in_port(0)) + + assign_dims_to_weights(gemm.in_node(1), None, 1, 0, 2) + # Do not transpose weights in this pass, it will be done as a separate pass diff --git a/model-optimizer/extensions/middle/InputCut.py b/model-optimizer/extensions/middle/InputCut.py new file mode 100644 index 0000000..902cd2d --- /dev/null +++ b/model-optimizer/extensions/middle/InputCut.py @@ -0,0 +1,34 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +from mo.front.extractor import add_input_ops +from mo.graph.graph import Graph +from mo.middle.replacement import MiddleReplacementPattern + + +class MiddleInputCut(MiddleReplacementPattern): + enabled = True + force_clean_up = True + + def run_after(self): + from extensions.middle.pass_separator import PreMiddleStart + return [PreMiddleStart] + + def run_before(self): + from extensions.middle.ChangePlaceholderTypes import ChangePlaceholderTypes + return [ChangePlaceholderTypes] + + def find_and_replace_pattern(self, graph: Graph): + add_input_ops(graph, graph.graph['user_shapes'], False) diff --git a/model-optimizer/extensions/middle/L2NormToNorm.py b/model-optimizer/extensions/middle/L2NormToNorm.py new file mode 100644 index 0000000..b440c1b --- /dev/null +++ b/model-optimizer/extensions/middle/L2NormToNorm.py @@ -0,0 +1,107 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import numpy as np + +from mo.front.extractor import add_attrs_props +from mo.front.extractor import update_ie_fields +from mo.graph.graph import Node, Graph +from mo.middle.replacement import MiddleReplacementPattern + + +class L2NormToNorm(MiddleReplacementPattern): + enabled = True + force_clean_up = True + + def run_after(self): + from extensions.middle.pass_separator import PreMiddleStart + return [PreMiddleStart] + + def run_before(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + def pattern(self): + return dict( + nodes=[ + ('input', dict(kind='data')), + ('l2_normalize', dict(kind='op', op='Mul')), + ('l2_normalize_data', dict(kind='data')), + ('maximum', dict(kind='op', op='Maximum')), + ('maximum_data', dict(kind='data')), + ('maximum_y_data', dict(kind='data')), + ('rsqrt', dict(kind='op', op='Rsqrt')), + ('rsqrt_data', dict(kind='data')), + ('square', dict(kind='op', op='Square')), + ('square_data', dict(kind='data')), + ('sum', dict(kind='op', op='Reduce', reduce_type='sum')), + ('sum_data', dict(kind='data')), + ], + edges=[ + ('input', 'square'), + ('square', 'square_data'), + ('square_data', 'sum'), + ('sum', 'sum_data'), + ('maximum_y_data', 'maximum'), + ('sum_data', 'maximum'), + ('maximum', 'maximum_data'), + ('maximum_data', 'rsqrt'), + ('rsqrt', 'rsqrt_data'), + ('rsqrt_data', 'l2_normalize'), + ('input', 'l2_normalize'), + ('l2_normalize', 'l2_normalize_data'), + ] + ) + + def replace_pattern(self, graph: Graph, match: dict): + input_data_name = match['input'].node + output_data_name = match['l2_normalize_data'].node + + if not match['maximum_y_data'].has_valid('value'): + return + if match['maximum_y_data'].value.shape != (): + return + y = match['maximum_y_data'].value + + normalize_id = graph.unique_id() + graph.add_node(normalize_id, + **add_attrs_props( + dict(kind='op', precision="FP32", type='Normalize', name=str(graph.unique_id('normalize')), + op='Normalize', shape=None, eps=str(y), across_spatial=str(0), channel_shared=str(0), + data_type=None, infer=None, in_ports_count=2, out_ports_count=1))) + 
normalize_data_id = graph.unique_id() + + graph.add_node(normalize_data_id, **add_attrs_props(graph.node[output_data_name])) + update_ie_fields(graph.node[normalize_id]) + weights_id = graph.unique_id('weights_') + graph.add_node(weights_id, **add_attrs_props( + dict(kind='data', precision="FP32", name=weights_id, value=None, shape=None, data_type=None, infer=None))) + wnode = Node(graph, weights_id) + wnode['value'] = np.ones(shape=match['input'].shape[-1], + dtype=match['input'].data_type) # TODO feature dim instead of -1 + wnode['shape'] = np.array(wnode['value'].shape) + output_edges = list(graph.out_edges(output_data_name, data=True)) + graph.remove_edges_from([ + (input_data_name, match['l2_normalize'].id), + (input_data_name, match['square'].id) + ]) + graph.remove_edges_from(list(graph.out_edges(output_data_name))) + graph.remove_node(output_data_name) + graph.add_edge(input_data_name, normalize_id, **{'in': 0}) + graph.add_edge(weights_id, normalize_id, **{'in': 1, 'bin': 'weights'}) + graph.add_edge(normalize_id, normalize_data_id, **{'out': 0}) + for data, owner, attr in output_edges: + graph.add_edge(normalize_data_id, owner, **attr)
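The pattern above matches TF's l2_normalize lowering (Square, Sum, Maximum(eps), Rsqrt, Mul) and collapses it into a single Normalize layer with all-ones weights. The identity it relies on, restated in a minimal numpy sketch (eps value illustrative):

import numpy as np

x = np.random.rand(8).astype(np.float32)
eps = np.float32(1e-10)

# the matched subgraph: Square -> Sum -> Maximum(eps) -> Rsqrt -> Mul
subgraph_out = x * (1.0 / np.sqrt(np.maximum(np.sum(x ** 2), eps)))

# a single normalization by the (clamped) L2 norm computes the same value
fused_out = x / np.sqrt(np.maximum(np.sum(x ** 2), eps))

assert np.allclose(subgraph_out, fused_out)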
""" enabled = True - + force_clean_up = True + id = 'lstm_to_tensor_iterator' + def run_after(self): - return [LSTMSequenceNormalize, MXNetLSTMSequenceNormalize] + return [RNNSequenceNormalize] def run_before(self): return [FusePermutesSequence] @@ -50,7 +50,7 @@ class LSTMSequenceTensorIterator(MiddleReplacementPattern): def pattern(self): return dict( nodes=[ - ('lstm', dict(kind='op', op='LSTMSequence')), + ('lstm', dict(kind='op', op='LSTM', type='RNNSequence')), ('input', dict(kind='data')), ('weights', dict(kind='data')), ('biases', dict(kind='data')), @@ -66,16 +66,20 @@ class LSTMSequenceTensorIterator(MiddleReplacementPattern): ] ) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): lstm = match['lstm'] # Build TensorIterator body first - body = nx.MultiDiGraph(name=lstm.name + '/sub_graph', layout=graph.graph['layout']) + body = Graph(name=lstm.name + '/sub_graph') + body.graph = graph.graph + + # 1. Input squeeze Reshape inputs = [Op._create_data_node(body, lstm.name + '/inport/' + str(inp), {'shape': lstm.in_node(inp).shape.copy(), 'value': lstm.in_node(inp).value.copy() if lstm.in_node(inp).value is not None and inp in [1, 2] else None}) - for inp in [0, 3, 4, 1, 2]] + for inp in [0, 4, 5, 1, 2]] # X, WR, B, h_init, c_init + inputs[0].shape[lstm.sequence_dim] = 1 reshape_dim = inputs[0].shape.copy() reshape_dim[lstm.batch_dim] = -1 @@ -85,11 +89,14 @@ class LSTMSequenceTensorIterator(MiddleReplacementPattern): dict(name=lstm.name + '/input_squeeze', internal_layer_id=0, dim=reshape_dim) ) inputs[0] = input_squeeze.create_node_with_data([inputs[0]], edge_attrs=[{'internal_port_id': 0}]) - lstm_cell_op = LSTMCell(body, dict(hidden_size=match['lstm'].hidden_size, name=lstm.name + '/LSTMCell', - internal_layer_id=1)) + + # 2. Output unsqueeze Reshape outputs = [Op._create_data_node(body, lstm.name + '/outport/' + str(out), {'shape': lstm.out_node(out).shape.copy() if out in lstm.out_nodes() - else lstm.in_node(3).shape.copy(), 'is_output': True}) for out in [0, 1]] + else lstm.in_node(4).shape.copy()}) for out in [0, 1]] + for out in outputs: + add_opoutput(body, out.id, 0, False) + unsqueezed_output_shape = outputs[0].shape.copy() unsqueezed_output_shape[lstm.sequence_dim] = 1 squeezed_output_shape = np.delete(unsqueezed_output_shape, lstm.sequence_dim) @@ -97,7 +104,16 @@ class LSTMSequenceTensorIterator(MiddleReplacementPattern): unsqueezed_output_shape[lstm.batch_dim] = -1 output_unsqueeze = Reshape(body, dict(name=lstm.name + 'output_unsqueeze', dim=unsqueezed_output_shape, internal_layer_id=2)) - # TODO edge attributes should be assigned by the op itself + + # 3. LSTMCell + lstm_cell_op = LSTMCell(body, dict(hidden_size=lstm.hidden_size, + activations=lstm.activations, + activation_alpha=lstm.activation_alpha, + activation_beta=lstm.activation_beta, + clip=lstm.clip, + input_forget=lstm.input_forget, + name=lstm.name + '/LSTMCell', + internal_layer_id=1)) lstm_cell_node = lstm_cell_op.create_node_with_data(inputs, data_nodes=outputs, edge_attrs=[{}, {'internal_port_id': 1}, {'internal_port_id': 2}, {'bin': 'weights'}, @@ -106,8 +122,9 @@ class LSTMSequenceTensorIterator(MiddleReplacementPattern): lstm_cell_node[0].in_node().out_edge(1)['internal_port_id'] = 5 lstm_cell_node[0] = output_unsqueeze.create_node_with_data([lstm_cell_node[0]]) lstm_cell_node[0].in_node().out_edge(0)['internal_port_id'] = 3 - lstm_cell_node[0]['is_output'] = True + add_opoutput(body, lstm_cell_node[0].id, 0, False) + # 4. 
TensorIterator layer creating assert lstm.direction in ['forward', 'reverse'] if lstm.direction == 'forward': stride = 1 @@ -123,6 +140,7 @@ class LSTMSequenceTensorIterator(MiddleReplacementPattern): 'external_port_id': 3, 'internal_layer_id': 2, 'internal_port_id': 3, + 'axis': lstm.sequence_dim, 'stride': stride, 'start': start, @@ -130,6 +148,7 @@ class LSTMSequenceTensorIterator(MiddleReplacementPattern): 'part_size': 1, }] + # Adding h_state, c_state to outputs if len(lstm.out_nodes()) == 3: output_port_map.extend([{ 'external_port_id': 4, @@ -144,12 +163,15 @@ class LSTMSequenceTensorIterator(MiddleReplacementPattern): ti_op = TensorIterator(graph, { 'name': lstm.name + '/TensorIterator', 'body': body, + 'in_ports_count': 3, + 'out_ports_count': len(lstm.out_nodes()), 'input_port_map': [ { 'external_port_id': 0, 'internal_layer_id': 0, 'internal_port_id': 0, + 'axis': lstm.sequence_dim, 'stride': stride, 'start': start, @@ -188,7 +210,8 @@ class LSTMSequenceTensorIterator(MiddleReplacementPattern): assert sorted(lstm.out_nodes().keys()) == list(range(len(lstm.out_nodes()))), \ "There are gaps in output ports of LSTMSequence operation. Node {}".format(lstm.id) - outs = ti_op.create_node_with_data([lstm.in_node(i) for i in [0, 3, 4]], + + outs = ti_op.create_node_with_data([lstm.in_node(i) for i in [0, 4, 5]], # X, h_init, c_init data_nodes=[lstm.out_node(i) for i in range(len(lstm.out_nodes()))], edge_attrs=[{'external_port_id': 0}, {'external_port_id': 1}, {'external_port_id': 2}]) diff --git a/model-optimizer/extensions/middle/LayoutChangeForConstantShapePaths.py b/model-optimizer/extensions/middle/LayoutChangeForConstantShapePaths.py new file mode 100644 index 0000000..37d1dd8 --- /dev/null +++ b/model-optimizer/extensions/middle/LayoutChangeForConstantShapePaths.py @@ -0,0 +1,113 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" +import numpy as np + +from extensions.ops.gather import Gather +from mo.back.replacement import BackReplacementPattern +from mo.graph.graph import Graph, Node +from mo.ops.const import Const + + +class LayoutChangeForConstantShapePaths(BackReplacementPattern): + enabled = False + graph_condition = [lambda graph: graph.graph['fw'] == 'tf', + lambda graph: graph.graph['cmd_params'].keep_shape_ops] + force_clean_up = True + + @staticmethod + def if_has_value(graph: Graph, node_name: str): + return Node(graph, node_name).has_valid('value') + + def search_of_constant_path_end(self, graph: Graph, node_name: str, visited: set): + from collections import deque + d = deque() + d.appendleft(node_name) + ends = set() + while len(d) != 0: + cur_node = d.popleft() + node = Node(graph, cur_node) + if node.has_valid('permute_attrs'): + node['permute_attrs'] = None + for _, out_node_name in graph.out_edges(cur_node): + if out_node_name not in visited: + if self.if_has_value(graph, out_node_name): + visited.add(cur_node) + d.extend([op for _, op in graph.out_edges(out_node_name)]) + else: + ends.add(cur_node) + return ends + + def find_and_replace_pattern(self, graph: Graph): + # 1. Inserting Gather to N*C format on constant shape paths + # - Search for Shape ops + # - Inserting Gather after them in case of [4] or [5] output shape + + shape_ops = graph.get_op_nodes(op='Shape') + constant_shape_paths = set() + gather_inserted = [] + + for shape in shape_ops: + shape_of_shape_op_output = shape.out_node().shape + + if np.array_equal(shape_of_shape_op_output, [4]): + index = np.array([0, 2, 3, 1]) + elif np.array_equal(shape_of_shape_op_output, [5]): + index = np.array([0, 2, 3, 4, 1]) + else: + continue + + const = Const(graph, {'value': index}).create_node() + gather = Gather(graph, {}).create_node() + + shape.out_port(0).get_connection().set_source(gather.out_port(0)) + shape.out_port(0).connect(gather.in_port(0)) + const.out_port(0).connect(gather.in_port(1)) + + constant_shape_paths.add(gather.id) + gather_inserted.append(gather.id) + + # 2. 
Inserting Gather to NC* format
+        #    - Search from the Shape ops found in the previous step for nodes without value that are n-th children of a Shape op
+        #      * MO can not propagate value there, since there is a data path
+        #    - Inserting Gather on ports which come from operations in the `constant_shape_paths` list
+
+        constant_shape_ends = []
+
+        for shape in shape_ops:
+            constant_shape_ends.extend(self.search_of_constant_path_end(graph, node_name=shape.id,
+                                                                        visited=constant_shape_paths))
+
+        for end in constant_shape_ends:
+            node = Node(graph, end)
+            in_ports = [in_port for in_port in node.in_ports().values()
+                        if in_port.get_source().node.id in constant_shape_paths]
+
+            for in_port in in_ports:
+                shape = in_port.data.get_shape()
+
+                if np.array_equal(shape, [4]):
+                    index = np.array([0, 3, 1, 2])
+                elif np.array_equal(shape, [5]):
+                    index = np.array([0, 2, 3, 4, 1])
+                else:
+                    continue
+
+                const = Const(graph, {'value': np.array(index)}).create_node()
+                gather = Gather(graph, {}).create_node()
+
+                in_port.get_connection().set_destination(gather.in_port(0))
+                const.out_port(0).connect(gather.in_port(1))
+                gather.out_port(0).connect(in_port)
diff --git a/model-optimizer/extensions/middle/MXNetRNNSequenceNormalize.py b/model-optimizer/extensions/middle/MXNetRNNSequenceNormalize.py
new file mode 100644
index 0000000..78235cd
--- /dev/null
+++ b/model-optimizer/extensions/middle/MXNetRNNSequenceNormalize.py
@@ -0,0 +1,229 @@
+"""
+ Copyright (c) 2018-2019 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import numpy as np
+
+from mo.graph.graph import Graph
+from mo.middle.replacement import MiddleReplacementPattern
+from mo.ops.op import Op
+from mo.ops.permute import Permute
+from mo.ops.reshape import Reshape
+
+
+class MXNetRNNSequenceNormalize(MiddleReplacementPattern):
+    """
+    Convert blobs and shapes of an MXNet-like RNN cell to the IE compatible form.
+
+    The target form of this operation is not normally covered by a dedicated
+    layer in IE. It should be further transformed to some other layer
+    that is supported by IE. This transformation pass involves weights and
+    shapes processing only.
+
+    Post-conditions:
+    Inputs:
+        0: X input data, shape [batch_size, seq_len, input_size]
+           (or [seq_len, batch_size, input_size], depending on the batch_dim param)
+        1: W weights blob, shape [num_dir, n_cells, M, hidden_size, input_size]
+        2: R weights blob, shape [num_dir, n_cells, M, hidden_size, hidden_size]
+        3: B biases blob, shape [num_dir, n_cells, 2, M, hidden_size]
+        4: (optional) sequence_length, shape [batch_size]
+        5: initial hidden state, shape [num_dir, batch_size, hidden_size]
+                                 ([num_dir, n_cells, batch_size, hidden_size] if num_cells != 1)
+        6: (only for LSTM) initial cell state, shape [num_dir, batch_size, hidden_size]
+        7: (optional for LSTM) Peepholes weights, shape [num_dir, n_cells, (M - 1) * hidden_size]
+
+    Outputs:
+        0: Y output blob, shape [batch_size, num_dir, seq_len, hidden_size]
+        1: (optional) Y_h, shape [num_dir, batch_size, hidden_size]
+        2: (optional for LSTM) Y_c, shape [num_dir, batch_size, hidden_size]
+
+    Where:
+        M -- number of gates in this cell (4 for LSTM, 3 for GRU, 1 for RNN).
+        num_dir -- number of directions ('forward', 'bidirectional', 'reverse')
+        n_cells -- number of cells in layer (always 1 for ONNX).
+
+    """
+    enabled = True
+
+    def run_after(self):
+        from extensions.middle.MXNetSplitMultiLayers import MXNetSplitLayersToRNNSequence
+        return [MXNetSplitLayersToRNNSequence]
+
+    def pattern(self):
+        return dict(
+            nodes=[
+                ('rnn_layer', dict(kind='op', type='RNNSequence', format='mxnet')),
+                ('input', dict(kind='data')),
+                ('params', dict(kind='data')),
+            ],
+            edges=[
+                ('input', 'rnn_layer', {'in': 0}),
+                ('params', 'rnn_layer', {'in': 1}),
+            ]
+        )
+
+    def replace_pattern(self, graph: Graph, match: dict):
+        rnn_layer = match['rnn_layer']
+
+        self.check_init_states(graph, match)
+        self.repack_weights(graph, match)
+        self.add_output_reshape(graph, match)
+        self.check_input_ports(graph, match)
+        rnn_layer['normalized'] = True
+
+    @staticmethod
+    def repack_weights(graph: Graph, match: dict):
+        input = match['input']
+        rnn_layer = match['rnn_layer']
+        params = match['params'].value.copy()
+
+        graph.remove_edge(match['params'].id, rnn_layer.id)
+
+        input_size = input.shape[2]
+        direction = 2 if rnn_layer.has_num_directions else 1
+        bsize = (2 * rnn_layer.hidden_size * direction * 1) * rnn_layer.multiplier
+
+        W = np.array(params[0:len(params) - bsize])
+        B = np.array(params[len(params) - bsize:])
+
+        W = W.reshape((direction, -1))
+        B = B.reshape((direction, -1))
+
+        W, R = np.array(W[:, 0:rnn_layer.hidden_size * rnn_layer.multiplier * input_size]), np.array(W[:, rnn_layer.hidden_size * rnn_layer.multiplier * input_size:])
+
+        W, R = [x.reshape([
+                    direction,  # 0: num of directions
+                    1,  # 1: num_cells
+                    rnn_layer.multiplier,  # 2: four output parts of the matrix for all gates
+                    rnn_layer.hidden_size,  # 3: output size per direction and gate
+                    -1])  # 4: input size/hidden size in W/R correspondingly
+                for x in (W, R)]
+
+        assert W.shape[-1] == input_size
+        assert R.shape[-1] == rnn_layer.hidden_size
+
+        B = B.reshape([
+                direction,  # 0: num of directions, limitation: should be 1
+                1,  # 1: num_cells
+                2,  # 2: num of components of B
+                rnn_layer.multiplier,  # 3: four output parts of the matrix for all gates in order: i, f, c, o
+                rnn_layer.hidden_size,  # 4: output size per direction and gate
+        ])
+
+        # Reorder gates: ifco --> fico
+        gate_reorder = rnn_layer.gate_order
+        W = np.take(W, gate_reorder, axis=2)
+        R = np.take(R, gate_reorder, axis=2)
+        B = np.take(B, gate_reorder, axis=3)
+
+        for blob, port in [(W, 1), (R, 2), (B, 3)]:
+            Op.create_and_connect_input_data_node(
+                graph,
+                rnn_layer,
+                {'value': blob, 'shape': np.array(blob.shape, dtype=np.int64)},
+                {'in': port, 'permutation': None}
+            )
+
+    @staticmethod
+    def check_init_states(graph: Graph, match: dict):
+        """
+        Check if the cell has initial states; create zero states if not,
+        and renumber the ports for these states.
+        """
+        rnn_cell = match['rnn_layer']
+        num_directions = 2 if rnn_cell.direction == 'bidirectional' else 1
+        batch_size = rnn_cell.in_node(0).shape[rnn_cell.batch_dim]
+
+        h_init_port = 5
+        c_init_port = 6
+
+        if 2 not in rnn_cell.in_nodes():
+            h_shape = [num_directions, batch_size, rnn_cell.hidden_size]  # from ONNX spec
+            h_init = np.full(h_shape, 0, dtype=np.float32)
+            Op.create_and_connect_input_data_node(
+                graph,
+                rnn_cell,
+                {'value': h_init, 'shape': np.array(h_init.shape, dtype=np.int64)},
+                {'in': h_init_port, 'permutation': None}
+            )
+        else:
+            hidden_state_edge = graph.get_edge_data(rnn_cell.in_node(2).id, rnn_cell.id)
+            hidden_state_edge[0]['in'] = h_init_port
+
+        if rnn_cell.op == 'LSTM':
+            if 3 not in rnn_cell.in_nodes():
+                c_shape = [num_directions, batch_size, rnn_cell.hidden_size]  # from ONNX spec
+                c_init = np.full(c_shape, 0, dtype=np.float32)
+                Op.create_and_connect_input_data_node(
+                    graph,
+                    rnn_cell,
+                    {'value': c_init, 'shape': np.array(c_init.shape, dtype=np.int64)},
+                    {'in': c_init_port, 'permutation': None}
+                )
+            else:
+                cell_state_edge = graph.get_edge_data(rnn_cell.in_node(3).id, rnn_cell.id)
+                cell_state_edge[0]['in'] = c_init_port
+
+    @staticmethod
+    def add_output_reshape(graph: Graph, match: dict):
+        """
+        Since the MXNet Y output shape is [batch_size, seq_len, hidden_size * num_directions], we need to add a reshape
+        from the common format above, [batch_size, num_directions, seq_len, hidden_size], back to the MXNet format.
+        """
+        lstm = match['rnn_layer']
+        input = match['input']
+        if not lstm.has_num_directions:
+            return
+        old_data_node = lstm.out_node(0)
+        num_directions = 2 if lstm.direction in ['bidirectional'] else 1
+        mxnet_shape = lstm.out_node(0).shape.copy()
+
+        if lstm.batch_dim == 0:
+            mo_shape = np.array([input.shape[lstm.batch_dim], input.shape[lstm.sequence_dim], lstm.hidden_size],
+                                dtype=np.int64)
+        else:
+            mo_shape = np.array([input.shape[lstm.sequence_dim], input.shape[lstm.batch_dim], lstm.hidden_size],
+                                dtype=np.int64)
+
+        if lstm.has_num_directions:
+            mo_shape = np.insert(mo_shape, 1, np.int64(num_directions))
+
+        new_data = Op._create_data_node(graph, name=lstm.name + '/Data/Reshape_mxnet/', attrs={'shape': mo_shape})
+        graph.remove_edge(lstm.id, old_data_node.id)
+        graph.add_edge(lstm.id, new_data.id, key=0, out=0)
+
+        # Add Permute
+        permute_order = np.array([0, 2, 1, 3], dtype=np.int64)
+        permute = Permute(graph, dict(order=permute_order))
+        permute_data = permute.create_node_with_data([new_data], dict(name=lstm.name + '/Permute_mxnet/'))
+
+        # Add Reshape
+        reshape = Reshape(graph, dict(dim=mxnet_shape))
+        reshape.create_node_with_data([permute_data], dict(name=lstm.name + '/Reshape_mxnet/'),
+                                      data_nodes=[old_data_node])
+
+    @staticmethod
+    def check_input_ports(graph: Graph, match: dict):
+        """
+        Check that all mandatory ports are present.
+ """ + rnn_layer = match['rnn_layer'] + mandatory_ports = [0, 1, 2, 3, 5] + + if rnn_layer.op == 'LSTM': + mandatory_ports.append(6) + + assert set(rnn_layer.in_nodes().keys()) >= set(mandatory_ports) diff --git a/model-optimizer/extensions/middle/MXNetSplitMultiLayers.py b/model-optimizer/extensions/middle/MXNetSplitMultiLayers.py new file mode 100644 index 0000000..0749308 --- /dev/null +++ b/model-optimizer/extensions/middle/MXNetSplitMultiLayers.py @@ -0,0 +1,206 @@ +""" + Copyright (c) 2018-2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import numpy as np + +from mo.graph.graph import Graph, Node +from mo.middle.replacement import MiddleReplacementPattern +from mo.ops.concat import Concat +from mo.ops.op import Op + + +class MXNetSplitLayersToRNNSequence(MiddleReplacementPattern): + """ + Split MXNet multilayer cell to multiple one-layers cells LSTM/GRU/RNN. + Also concatenate output hiddens and cells states of this layers. + """ + enabled = True + + def pattern(self): + return dict( + nodes=[ + ('rnn_layer', dict(kind='op', type='RNNSequence', format='mxnet', multilayers=True)), + ('input', dict(kind='data')), + ('params', dict(kind='data')), + ], + edges=[ + ('input', 'rnn_layer', {'in': 0}), + ('params', 'rnn_layer', {'in': 1}), + ] + ) + + def replace_pattern(self, graph: Graph, match: dict): + output_states = self.split_multilayer_cell(graph, match) + + rnn_layer = match['rnn_layer'] + self.concat_output_states(graph, match, output_states) + rnn_layer.graph.remove_node(rnn_layer.id) + + @staticmethod + def get_new_cell(multilayer_cell: Node, number: int): + cell_class = Op.get_op_class_by_name(multilayer_cell.op) + new_cell = lambda graph, attrs: cell_class(graph, attrs) + attrs = multilayer_cell.attrs().copy() + new_attrs = { + 'num_layers': 1, + 'multilayers': False, + 'name': multilayer_cell.name + '/LayerSplittedLSTM/{}'.format(number), + } + attrs.update(new_attrs) + return new_cell(multilayer_cell.graph, attrs) + + def split_multilayer_cell(self, graph: Graph, match: dict): + """ + Split one multilayer type=RNNSequence cell to num_layers consecutive cells. + All parameters splits to parts for new num_layers cells. 
+ """ + input = match['input'] + rnn_layer = match['rnn_layer'] + params = match['params'].value.copy() + + have_hidden = False + if 2 in rnn_layer.in_nodes(): + hidden_state_value = rnn_layer.in_node(2).value + have_hidden = True + + have_cell = False + if 3 in rnn_layer.in_nodes(): + cell_state_value = rnn_layer.in_node(3).value + have_cell = True + + direction = 2 if rnn_layer.has_num_directions else 1 + num_layers = rnn_layer.num_layers + input_size = input.shape[2] + bsize = (2 * rnn_layer.hidden_size * direction * num_layers) * rnn_layer.multiplier + + size = rnn_layer.hidden_size * direction * rnn_layer.multiplier + first_layer_params_size = (input_size + rnn_layer.hidden_size + 2) * size + other_layer_params_size = (rnn_layer.hidden_size * direction + rnn_layer.hidden_size + 2) * size + assert params.size == (first_layer_params_size + (num_layers - 1) * other_layer_params_size) + + input_node = input + params_layer_size_count = 0 + output_states = [[], []] + + param_w = params[0:len(params)-bsize] + param_b = params[len(params) - bsize:] + layer_bsize = (2 * rnn_layer.hidden_size * direction) * rnn_layer.multiplier + + for l in range(num_layers): + params_layer_size = first_layer_params_size if l == 0 else other_layer_params_size + + layer_params_w = param_w[params_layer_size_count: params_layer_size_count + + (params_layer_size - layer_bsize)].copy() + layer_params_b = param_b[layer_bsize*l: layer_bsize*l+layer_bsize].copy() + layer_params = np.concatenate((layer_params_w, layer_params_b), axis=0) + params_layer_size_count = params_layer_size_count + params_layer_size - layer_bsize + + op = self.get_new_cell(rnn_layer, l) + + params_value_node = Op._create_data_node( + rnn_layer.graph, + name=rnn_layer.name + '/LayerSplittedParamsLSTM/{}/'.format(l), + attrs={'value': layer_params, 'shape': np.array(layer_params.shape, dtype=np.int64)} + ) + if have_hidden: + layer_hidden_state = hidden_state_value[l * direction: l * direction + direction] + hidden_state_value_node = Op._create_data_node( + rnn_layer.graph, + name=str(rnn_layer.name) + '/LayerSplittedHiddenState/{}/'.format(l), + attrs={'value': layer_hidden_state, 'shape': np.array(layer_hidden_state.shape, dtype=np.int64)} + ) + else: + hidden_state_value_node = None + + if have_cell: + layer_cell_state = cell_state_value[l * direction: l * direction + direction] + cell_state_value_node = Op._create_data_node( + rnn_layer.graph, + name=str(rnn_layer.name) + '/LayerSplittedCellState/{}/'.format(l), + attrs={'value': layer_cell_state, 'shape': np.array(layer_cell_state.shape, dtype=np.int64)} + ) + else: + cell_state_value_node = None + + if l < num_layers-1: + output_data = Op._create_data_node( + rnn_layer.graph, + name=rnn_layer.out_node(0).name + '/LayerSplit/' + str(l), + attrs={'shape': rnn_layer.out_node(0).shape.copy()} + ) + else: + output_data = rnn_layer.out_node(0) + + # Output nodes creating: + state_size = np.array([input.shape[rnn_layer.batch_dim], rnn_layer.hidden_size], dtype=np.int64) + if rnn_layer.has_num_directions: + state_size = np.insert(state_size, 0, direction) + + output_hidden = Op._create_data_node( + rnn_layer.graph, + name=rnn_layer.out_node(1).name + '/LayerSplit/' + str(l), + attrs={'shape': np.array(state_size)} + ) + + current_data_nodes = [output_data, output_hidden] + + if rnn_layer.op == 'LSTM': + output_cell = Op._create_data_node( + rnn_layer.graph, + name=rnn_layer.out_node(2).name + '/LayerSplit/' + str(l), + attrs={'shape': np.array(state_size)} + ) + 
current_data_nodes.append(output_cell) + + data_nodes = op.create_node_with_data( + inputs=[ + input_node, + params_value_node, + hidden_state_value_node, + cell_state_value_node + ], + data_nodes=current_data_nodes, + ) + + input_node = data_nodes[0] + output_states[0].append(data_nodes[1]) + + if rnn_layer.op =='LSTM': + output_states[1].append(data_nodes[2]) + + return output_states + + @staticmethod + def concat_output_states(graph: Graph, match: dict, new_states: list): + """ Concatenates output states from multilayer layer. """ + rnn_layer = match['rnn_layer'] + original_states = [rnn_layer.out_node(i) if i in rnn_layer.out_nodes() else None for i in [1, 2]] + + concat_ops = [ + Concat(rnn_layer.graph, { + 'name': rnn_layer.name + '/FinalLayerSplitConcat/HiddenState', + 'axis': -1 + }), + Concat(rnn_layer.graph, { + 'name': rnn_layer.name + '/FinalLayerSplitConcat/CellState', + 'axis': -1 + }) + ] + + for i in range(len(original_states)): # [0] or [0, 1] + if original_states[i] is None: + continue + concat_ops[i].attrs.update({'in_ports_count': len(new_states[i])}) + concat_ops[i].create_node_with_data(inputs=new_states[i], data_nodes=[original_states[i]]) diff --git a/model-optimizer/extensions/middle/MeanToAvgPool.py b/model-optimizer/extensions/middle/MeanToAvgPool.py new file mode 100644 index 0000000..fafc503 --- /dev/null +++ b/model-optimizer/extensions/middle/MeanToAvgPool.py @@ -0,0 +1,95 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +import numpy as np + +from mo.graph.graph import create_edge, Graph +from mo.middle.replacement import MiddleReplacementPattern +from mo.ops.op import PermuteAttrs, Op +from mo.ops.reshape import Reshape + + +class MeanToAvgPool(MiddleReplacementPattern): + enabled = True + + def run_after(self): + from extensions.middle.pass_separator import MiddleFinish + return [MiddleFinish] + + def run_before(self): + return [] + + def pattern(self): + return dict( + nodes=[ + ('input', dict(kind='data')), + ('axis', dict(kind='data')), + ('mean', dict(kind='op', op='Mean')) + ], + edges=[ + ('input', 'mean', {'in': 0}), + ('axis', 'mean', {'in': 1}) + ] + ) + + def replace_pattern(self, graph: Graph, match: dict): + if match['axis'].value is None or match['input'].shape is None: + return + dims = len(match['input'].shape) + ones = np.ones(dims, dtype=np.int64) + axis = np.array(match['axis'].value) + axis = axis if axis.ndim != 0 else np.array([axis], dtype=np.int64) + + mean = graph.node[match['mean'].node] + mean['stride'] = np.array(ones) + # TODO: need to check axis with real layout + spatial_dims = np.array(axis) + mean['spatial_dims'] = spatial_dims + mean['pad'] = np.zeros((dims, 2), np.int64) + mean['pad_spatial_shape'] = np.array(mean['pad'][spatial_dims]) + window = np.array(ones) + window[spatial_dims] = match['input'].shape[spatial_dims] + mean['window'] = window + mean['TF_op'] = mean['op'] + mean['op'] = 'AvgPool' + mean['pool_method'] = 'avg' + mean['rounding_type'] = 'ceil' + mean['exclude_pad'] = 'true' + mean['kernel_spatial'] = window[spatial_dims] + graph.remove_edge(match['axis'].node, match['mean'].node) + mean['permute_attrs'] = PermuteAttrs().update_attrs(attrs=[('pad', 'input:0'), + ('stride', 'input:0'), + ('window', 'input:0'), + ('spatial_dims', 'input:0')]) + + if match['mean'].keep_dims == False: + output = match['mean'].out_node() + pool_node = match['mean'] + + # Keep dims for AvgPool + shape = np.array(output.shape) + for idx in spatial_dims: + shape = np.insert(shape, idx, 1) + + graph.remove_edge(pool_node.id, output.id) + # Create new data for pool with all dims + pool_data = Op.create_data_node(graph, pool_node, {'shape': np.array(shape)}) + # Create and connect reshape node + reshape_op = Reshape(graph, {'dim': np.array(output.shape)}) + reshape_node = reshape_op.create_node([pool_data], dict(name='Reshape_', + permute_attrs=PermuteAttrs().update_attrs( + attrs=[('dim', 'output:0')]))) + graph.create_edge(reshape_node, output) diff --git a/model-optimizer/mo/middle/passes/pool_test.py b/model-optimizer/extensions/middle/MeanToAvgPool_test.py similarity index 87% rename from model-optimizer/mo/middle/passes/pool_test.py rename to model-optimizer/extensions/middle/MeanToAvgPool_test.py index 1473f1e..16952c2 100644 --- a/model-optimizer/mo/middle/passes/pool_test.py +++ b/model-optimizer/extensions/middle/MeanToAvgPool_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -19,7 +19,7 @@ import unittest import numpy as np from mo.middle.passes.eliminate import graph_clean_up -from mo.middle.passes.pool import mean_to_avgpool +from extensions.middle.MeanToAvgPool import MeanToAvgPool from mo.utils.unittest.graph import build_graph, compare_graphs nodes_attributes = { @@ -35,6 +35,8 @@ nodes_attributes = { # Reshape layer 'reshape_1': {'type': 'Reshape', 'kind': 'op', 'op': 'Reshape'}, 'reshape_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + # OpOutput + 'op_output': {'kind': 'op', 'op': 'OpOutput', 'type': 'OpOutput'} } @@ -43,14 +45,16 @@ class MeanToAvgPoolTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mean_1'), - ('mean_1', 'mean_1_data'), ('mean_axis', 'mean_1'), + ('mean_1', 'mean_1_data'), + ('mean_1_data', 'op_output') + ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mean_1': {'shape': np.array([1, 227, 227, 3]), 'keep_dims': keep_dims}, 'mean_axis': {'shape': np.array(axis.shape) if axis is not None else None, 'value': np.array(axis) if axis is not None else None}, - 'mean_1_data': {'shape': mean_out_shape, 'is_output': True}, + 'mean_1_data': {'shape': mean_out_shape}, }) del graph['mean_1']['mean_1_data'][0]['in'] return graph @@ -62,13 +66,14 @@ class MeanToAvgPoolTests(unittest.TestCase): [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'pool_1'), ('pool_1', 'pool_1_data'), + ('pool_1_data', 'op_output'), ], {'pool_1': {'pool_method': 'avg', 'rounding_type': 'ceil', 'exclude_pad': 'true', 'op': 'AvgPool', 'shape': np.array([1, 227, 227, 3])}, - 'pool_1_data': {'is_output': True, 'shape': np.array([1, 227, 227, 3])} + 'pool_1_data': {'shape': np.array([1, 227, 227, 3])} }) - mean_to_avgpool(graph) + MeanToAvgPool().find_and_replace_pattern(graph) graph_clean_up(graph) (flag, resp) = compare_graphs(graph, graph_ref, 'mean_1_data', 'pool_1_data', check_op_attrs=True) self.assertTrue(flag, resp) @@ -82,15 +87,16 @@ class MeanToAvgPoolTests(unittest.TestCase): ('placeholder_1_data', 'pool_1'), ('pool_1', 'pool_1_data'), ('pool_1_data', 'reshape_1'), - ('reshape_1', 'reshape_1_data') + ('reshape_1', 'reshape_1_data'), + ('reshape_1_data', 'op_output') ], {'pool_1': {'pool_method': 'avg', 'rounding_type': 'ceil', 'exclude_pad': 'true', 'op': 'AvgPool', 'shape': np.array([1, 227, 227, 3])}, 'pool_1_data': {'shape': np.array([1, 227, 227, 3])}, - 'reshape_1_data': {'is_output': True, 'shape': np.array([227, 227, 3])}, + 'reshape_1_data': {'shape': np.array([227, 227, 3])}, }) - mean_to_avgpool(graph) + MeanToAvgPool().find_and_replace_pattern(graph) graph_clean_up(graph) (flag, resp) = compare_graphs(graph, graph_ref, 'mean_1_data', 'reshape_1_data', check_op_attrs=True) self.assertTrue(flag, resp) diff --git a/model-optimizer/extensions/middle/MinimumMiddleReplacer.py b/model-optimizer/extensions/middle/MinimumMiddleReplacer.py index d215637..100755e 100644 --- a/model-optimizer/extensions/middle/MinimumMiddleReplacer.py +++ b/model-optimizer/extensions/middle/MinimumMiddleReplacer.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,8 +14,7 @@ limitations under the License. 
""" -import networkx as nx - +from mo.graph.graph import Graph from mo.middle.replacement import MiddleReplacementPattern from mo.ops.eltwise import Eltwise from mo.ops.power import Power @@ -25,6 +24,14 @@ class MinimumMiddleReplacer(MiddleReplacementPattern): op = "Minimum" enabled = True + def run_after(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + def run_before(self): + from extensions.middle.pass_separator import MiddleFinish + return [MiddleFinish] + def pattern(self): return dict( nodes=[ @@ -33,7 +40,7 @@ class MinimumMiddleReplacer(MiddleReplacementPattern): edges=[] ) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): node = match['minimum'] # Constant propagation case if node.in_node(0).value is not None and node.in_node(1).value is not None: diff --git a/model-optimizer/extensions/middle/MinumumMiddleReplacer_test.py b/model-optimizer/extensions/middle/MinumumMiddleReplacer_test.py index eb04cda..96555cf 100644 --- a/model-optimizer/extensions/middle/MinumumMiddleReplacer_test.py +++ b/model-optimizer/extensions/middle/MinumumMiddleReplacer_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/middle/MulQuantizeFuse.py b/model-optimizer/extensions/middle/MulQuantizeFuse.py new file mode 100644 index 0000000..0bfdc65 --- /dev/null +++ b/model-optimizer/extensions/middle/MulQuantizeFuse.py @@ -0,0 +1,90 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import logging as log +from typing import Dict + +import numpy as np + +from mo.graph.graph import Graph, Node +from mo.middle.passes.conv import get_tensor_in_port, get_value_in_port +from mo.middle.replacement import MiddleReplacementPattern + + +class MulQuantizeFuse(MiddleReplacementPattern): + """ Fuses Mul --> Quantize sequence if possible + """ + enabled = False + + def run_after(self): + return [] + + def run_before(self): + return [] + + def pattern(self): + return dict( + nodes=[ + ('preop', dict(op='Mul')), + ('preoped', dict()), + ('quantize', dict(op='Quantize')), + ], + edges=[ + ('preop', 'preoped'), + ('preoped', 'quantize', {'in': 0}), + ] + ) + + def replace_pattern(self, graph: Graph, match: Dict[str, Node]): + quantize = match['quantize'] + preop = match['preop'] + + # Check for total number of Mul consumers -- if something else consume its output it cannot be fused + if len(preop.out_node().out_nodes()) > 1: + log.debug('MulQuantizeFuse: cannot fuse because Mul have multiple consumers') + return + + # If the fusion is applicable, direct modifications to quantize 1-st and 2-nd inputs + # are performed. 
+        # So the data nodes at those inputs shouldn't have more than 1 consumer
+        # (maximum 2 consumers, both on the same Quantize op, consumed by its 1st and 2nd ports).
+        # TODO: relax this limitation and duplicate data nodes accordingly to modify the input range freely
+
+        # Provisional limitation related to binary quantization
+        # TODO: Relax it beyond the binarization case
+        if len(quantize.in_node(1).out_nodes()) != 1 or \
+                len(quantize.in_node(2).out_nodes()) != 1 or \
+                len(quantize.in_node(3).out_nodes()) != 1 or len(quantize.in_node(4).out_nodes()) != 1 or \
+                quantize.levels != 2:
+            log.debug('MulQuantizeFuse: cannot fuse because Quantize op has '
+                      'unexpected number of consumers for ports 1, 2, 3 or 4')
+            return
+
+        tensor_port, value_port = get_tensor_in_port(preop), get_value_in_port(preop)
+
+        # Need to flip output_low and output_high for those elements that have multiplier < 0
+        # TODO: need some special processing for values that are exactly equal to the threshold
+        if np.all(value_port.data.get_value() <= 0):
+            log.debug('MulQuantizeFuse: cannot fuse because Mul op has non-positive multipliers.')
+            return
+
+        quantize.in_port(1).data.set_value(quantize.in_port(1).data.get_value() / value_port.data.get_value())
+        quantize.in_port(2).data.set_value(quantize.in_port(2).data.get_value() / value_port.data.get_value())
+
+        # Remove Mul as it is no longer needed
+        quantize.in_port(0).disconnect()
+        tensor_port.get_connection().set_destination(quantize.in_port(0))
diff --git a/model-optimizer/extensions/middle/NasNet.py b/model-optimizer/extensions/middle/NasNet.py
new file mode 100644
index 0000000..1280923
--- /dev/null
+++ b/model-optimizer/extensions/middle/NasNet.py
@@ -0,0 +1,146 @@
+"""
+ Copyright (c) 2019 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+""" + +import logging as log + +import numpy as np + +from mo.front.extractor import add_attrs_props, update_ie_fields +from mo.graph.graph import Node, Graph +from mo.middle.replacement import MiddleReplacementPattern +from mo.ops.op import Op + + +class NasNet(MiddleReplacementPattern): + enabled = True + + def run_after(self): + from extensions.middle.pass_separator import MiddleFinish + return [MiddleFinish] + + def run_before(self): + return [] + + def pattern(self): + return dict( + nodes=[ + ('input', dict(kind='data')), + ('pad_op', dict(kind='op', op='Pad')), + ('pad_out', dict(kind='data')), + + ('begin', dict(kind='data')), + ('end', dict(kind='data')), + ('stride', dict(kind='data')), + + ('sslice', dict(kind='op', op='StridedSlice')), + ('sslice_out', dict(kind='data')), + + ('avg_pool', dict(kind='op', op='AvgPool')), + ('output', dict(kind='data')), + ], + edges=[ + ('input', 'pad_op', {'in': 0}), + ('pad_op', 'pad_out'), + + ('begin', 'sslice', {'in': 1}), + ('end', 'sslice', {'in': 2}), + ('stride', 'sslice', {'in': 3}), + + ('pad_out', 'sslice', {'in': 0}), + ('sslice', 'sslice_out'), + + ('sslice_out', 'avg_pool', {'in': 0}), + ('avg_pool', 'output') + ] + ) + + def replace_pattern(self, graph: Graph, match: dict): + """ + Converts specific for NasNet topology subgraph Pad->StridedSlice->AvgPool to Conv->Crop->AvgPool + """ + input = match['input'] + + pad_op = match['pad_op'] + + sslice = match['sslice'] + sslice_out = match['sslice_out'] + begin = [] + end = [] + stride = [] + for s in sslice.slices: + begin.append(s.start) + end.append(s.stop) + stride.append(s.step) + + if not np.array_equal(pad_op.pads, np.array([[0, 0], [0, 1], [0, 1], [0, 0]])): + log.error(" Pad values doesn't match!") + return + + if not np.array_equal(begin, np.array([0, 1, 1, 0])): + log.error("StridedSlice has wrong begin") + return + + if not np.array_equal(sslice.end_mask, np.array([0, 0, 0, 0])) or not np.array_equal(sslice.begin_mask, np.array([0, 1, 1, 0])): + log.error("StridedSlice has wrong masks") + return + + # Cut Smth-x->Pad->StrudedSlice-x->AvgPool + graph.remove_edge(input.id, pad_op.id) + graph.remove_edge(sslice.id, sslice_out.id) + + # Pad -> Conv + conv_node = graph.unique_id(pad_op.name + '/Conv_') + conv_weights_node = graph.unique_id(pad_op.name + '/ConvW_') + conv_weights = np.ones((1, 1, input.shape[3], 1)) + conv_output = graph.unique_id(pad_op.name + '/ConvOut_') + output_shape = np.array([input.shape[0], input.shape[1] + 1, input.shape[2] + 1, input.shape[3]]) + + graph.add_node(conv_node, + **add_attrs_props( + dict(kind='op', precision="FP32", type='Convolution', name=conv_node, op='Conv2D', + stride=np.array([1, 1, 1, 1]), dilation=np.array([1, 1, 1, 1]), + group=input.shape[3], bias_addable=True, bias_term=False, + spatial_dims=np.array([1, 2]), + kernel_spatial=np.array([1, 1]), + pad=np.array([[0, 0], [0, 0], [0, 0], [0, 0]]), output_shape=output_shape, + channel_dims=np.array([3]), + in_ports_count=3, out_ports_count=1))) + + graph.add_node(conv_weights_node, **add_attrs_props( + dict(kind='data', precision="FP32", name=conv_weights_node, value=np.array(conv_weights), + shape=np.array(conv_weights.shape), + data_type=input.data_type, infer=None, + spatial_dims=np.array([0, 1]), + input_channel_dim=2, + output_channel_dim=3, + dims_number=4, can_be_bias=True))) + graph.add_node(conv_output, **add_attrs_props( + dict(kind='data', precision="FP32", name=conv_output, value=None, shape=output_shape, + data_type=input.data_type))) + + # StridedSlice -> Crop + 
crop_cls = Op.get_op_class_by_name('Crop') + crop = crop_cls(graph, dict(name=sslice.name + '/Crop_', axis=np.array([1, 2]), + dim=np.array([output_shape[1] - 1, output_shape[2] - 1]), offset=np.array([1, 1]))) + crop.create_node_with_data([Node(graph, conv_output)], data_nodes=sslice_out) + + # Connect : Conv->Crop->AvgPool + graph.add_edges_from([ + (input.id, conv_node, {'in': 0}), + (conv_weights_node, conv_node, {'in': 1, 'bin': 'weights'}), + (conv_node, conv_output, {'out': 0}), + ]) + update_ie_fields(graph.node[conv_node], graph.graph['ir_version']) diff --git a/model-optimizer/extensions/middle/NormalizeFullyConnected.py b/model-optimizer/extensions/middle/NormalizeFullyConnected.py index 9452486..991f816 100644 --- a/model-optimizer/extensions/middle/NormalizeFullyConnected.py +++ b/model-optimizer/extensions/middle/NormalizeFullyConnected.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,16 +14,25 @@ limitations under the License. """ -import networkx as nx import numpy as np +from mo.graph.graph import Graph from mo.middle.replacement import MiddleReplacementPattern from mo.ops.op import Op from mo.ops.reshape import Reshape class NormalizeFullyConnected(MiddleReplacementPattern): - enabled = False + enabled = True + graph_condition = [lambda graph: graph.graph['fw'] == 'onnx'] + + def run_after(self): + from extensions.middle.GemmToFullyConnected import GemmToFullyConnected + return [GemmToFullyConnected] + + def run_before(self): + from extensions.middle.pass_separator import MiddleFinish + return [MiddleFinish] def pattern(self): return dict( @@ -33,7 +42,7 @@ class NormalizeFullyConnected(MiddleReplacementPattern): edges=[('fc', 'fc_output')], ) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): """ This pass normalize FC layer Example: diff --git a/model-optimizer/extensions/middle/NormalizeFullyConnected_test.py b/model-optimizer/extensions/middle/NormalizeFullyConnected_test.py index de6a73a..1cb2b35 100644 --- a/model-optimizer/extensions/middle/NormalizeFullyConnected_test.py +++ b/model-optimizer/extensions/middle/NormalizeFullyConnected_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/middle/NormalizePad.py b/model-optimizer/extensions/middle/NormalizePad.py index 2e9e89f..5e4ae17 100644 --- a/model-optimizer/extensions/middle/NormalizePad.py +++ b/model-optimizer/extensions/middle/NormalizePad.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,9 +14,9 @@ limitations under the License. 
""" -import networkx as nx import numpy as np +from mo.graph.graph import Graph from mo.middle.passes.eliminate import remove_op_node_with_data_node from mo.middle.replacement import MiddleReplacementPattern @@ -30,6 +30,14 @@ class NormalizePad(MiddleReplacementPattern): """ enabled = True + def run_after(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + def run_before(self): + from extensions.middle.pass_separator import MiddleFinish + return [MiddleFinish] + def pattern(self): return dict( nodes=[ @@ -38,7 +46,7 @@ class NormalizePad(MiddleReplacementPattern): edges=[] ) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): node = match['pad'] for port, input_node in node.in_nodes().items(): if port != 0: diff --git a/model-optimizer/extensions/middle/ONNXRNNSequenceNormalize.py b/model-optimizer/extensions/middle/ONNXRNNSequenceNormalize.py new file mode 100644 index 0000000..3442497 --- /dev/null +++ b/model-optimizer/extensions/middle/ONNXRNNSequenceNormalize.py @@ -0,0 +1,234 @@ +""" + Copyright (c) 2018-2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +from copy import deepcopy + +import numpy as np + +from mo.graph.graph import Node, Graph +from mo.middle.replacement import MiddleReplacementPattern +from mo.ops.op import Op +from mo.ops.permute import Permute + + +def permute_before_and_after(inp: Node, middle: Node, out: Node, input_order, output_order): + """ + Insert two permutes: before middle node and after middle node. + + Both permutes has a given order (input/output). + """ + # Permute before input + permute = Permute(middle.graph, dict(order=np.array(input_order))) + + edge_attrs = deepcopy(middle.graph.get_edge_data(inp.id, middle.id)[0]) + middle.graph.remove_edge(inp.id, middle.id) + new_inp = permute.create_node_with_data([inp], dict(name=middle.name + '/InputPermute')) + middle.graph.add_edge(new_inp.id, middle.id, **edge_attrs) + + # Permute after output + permute = Permute(middle.graph, dict(order=output_order)) + + middle.graph.remove_edge(middle.id, out.id) + new_out = Op._create_data_node(middle.graph, name=middle.name + '/WithoutPermute', + attrs={'shape': out.shape[output_order]}) + middle.graph.add_edge(middle.id, new_out.id, key=0, out=0) + permute.create_node_with_data([new_out], dict(name=middle.name + '/OutputPermute'), data_nodes=out) + + +class ONNXRNNSequenceNormalize(MiddleReplacementPattern): + """ + Convert blobs and shapes of ONNX-like LSTM, GRU, RNN cells to common form (internal for MO). + After this normalization pass passes for spliting bidirectional calls and + multilayer cells will be applied. + + This transformation pass involves weights and shapes processing only: + 1. Weights reshaping and reordering + 2. 
+        2. Gates reordering
+
+    Inputs will have the following order after normalizing:
+        0: X input data, shape [batch_size, seq_len, input_size]
+        1: W weights blob, shape [num_dir, n_cells, M, hidden_size, input_size]
+        2: R weights blob, shape [num_dir, n_cells, M, hidden_size, hidden_size]
+        3: B biases blob, shape [num_dir, n_cells, 2, M, hidden_size]
+        4: (optional) sequence_length, shape [batch_size]
+        5: initial hidden state, shape [num_dir, batch_size, hidden_size]
+                                 ([num_dir, n_cells, batch_size, hidden_size] if num_cells != 1)
+        6: (only for LSTM) initial cell state, shape [num_dir, batch_size, hidden_size]
+        7: (optional for LSTM) Peepholes weights, shape [num_dir, n_cells, (M - 1) * hidden_size]
+
+    Outputs:
+        0: Y output blob, shape [batch_size, num_dir, seq_len, hidden_size]
+        1: (optional) Y_h, shape [num_dir, batch_size, hidden_size]
+        2: (optional for LSTM) Y_c, shape [num_dir, batch_size, hidden_size]
+
+    Where:
+        M -- number of gates in this cell (4 for LSTM, 3 for GRU, 1 for RNN).
+        num_dir -- number of directions ('forward', 'bidirectional', 'reverse')
+        n_cells -- number of cells in layer (always 1 for ONNX).
+    """
+
+    enabled = True
+
+    def pattern(self):
+        return dict(
+            nodes=[
+                ('rnn_layer', dict(kind='op', type='RNNSequence', format='onnx')),
+                ('input', dict(kind='data')),
+                ('W', dict(kind='data')),
+                ('R', dict(kind='data')),
+            ],
+            # We are not handling optional inputs
+            edges=[
+                ('input', 'rnn_layer', {'in': 0}),
+                ('W', 'rnn_layer', {'bin': 'W'}),
+                ('R', 'rnn_layer', {'bin': 'R'}),
+            ]
+        )
+
+    def replace_pattern(self, graph: Graph, match: dict):
+        self.repack_weights(graph, match)
+        self.check_init_states(graph, match)
+        self.check_input_ports(graph, match)
+        match['rnn_layer']['normalized'] = True
+
+    @staticmethod
+    def repack_weights(graph: Graph, match: dict):
+        """
+        Repack weights into the general format (described above) and reorder gates.
+ """ + rnn_layer = match['rnn_layer'] + W = match['W'].value.copy() + R = match['R'].value.copy() + num_directions = 2 if rnn_layer.direction == 'bidirectional' else 1 + + graph.remove_edge(match['W'].id, rnn_layer.id) + graph.remove_edge(match['R'].id, rnn_layer.id) + + # find optional 'B' biases blob + if 3 in rnn_layer.in_nodes(): + # TODO: check if 'bin': 'B' attribute is assigned to this edge + B = rnn_layer.in_node(3).value.copy() + graph.remove_edge(rnn_layer.in_node(3).id, rnn_layer.id) + else: + B_shape = [num_directions, 2 * rnn_layer.multiplier * rnn_layer.hidden_size] # from ONNX spec + B = np.full(B_shape, 0, dtype=np.float32) + + # Add extra dimensions for W, R and B for easier repacking and reordering + B = B.reshape([ + num_directions, # 0: num of directions + rnn_layer.num_layers, # 1: num_layers + 2, # 2: two input parts of the matrix: W, R + rnn_layer.multiplier, # 3: four output parts of the matrix for all gates in order: i, o, f, c + rnn_layer.hidden_size, # 4: output size per direction and gate + ]) + + W, R = [x.reshape([ + num_directions, # 0: num of directions + rnn_layer.num_layers, # 1: num_layers + rnn_layer.multiplier, # 2: four output parts of the matrix for all gates in order: i, o, f, c + rnn_layer.hidden_size, # 3: output size per direction and gate + -1]) # 4: input size/hidden size in W/R correspondingly + for x in (W, R)] + + input_size = match['input'].shape[2] + assert input_size == W.shape[-1] + + # Reorder gates: iofc --> fico + gate_reorder = rnn_layer.gate_order + W, R = (np.take(x, gate_reorder, axis=2) for x in (W, R)) + B = np.take(B, gate_reorder, axis=3) + + for blob, port in [(W, 1), (R, 2), (B, 3)]: + Op.create_and_connect_input_data_node( + graph, + rnn_layer, + {'value': blob, 'shape': np.array(blob.shape, dtype=np.int64)}, + {'in': port, 'permutation': None} + ) + + @staticmethod + def batch_sequence_transpose(graph: Graph, match: dict): + """ + + """ + rnn_layer = match['rnn_layer'] + inp = match['input'] + out = rnn_layer.out_node(0) + + if rnn_layer.batch_dim == 0: + assert rnn_layer.sequence_dim == 1 + # nothing to do -- it's already in normal form + return + + assert rnn_layer.sequence_dim == 0 + assert rnn_layer.batch_dim == 1 + assert len(inp.shape) == 3 + + # Reorder the first two dimensions on both ends: input and output. + # Two Permute ops are inserted before and after the LSTM node. + # In this transformation we don't analyze the rest of the model around + # LSTM cell, so these Permute ops are not fused to some other layers here. + # But other transformations in the pipeline may optimize the Permute ops out. + + rnn_layer.batch_dim, rnn_layer.sequence_dim = rnn_layer.sequence_dim, rnn_layer.batch_dim + permute_before_and_after(inp, rnn_layer, out, [1, 0, 2], [2, 1, 0, 3]) + + @staticmethod + def check_init_states(graph: Graph, match: dict): + """ + Check if cell have initial states and create zeros states if not. 
+ """ + rnn_layer = match['rnn_layer'] + num_directions = 2 if rnn_layer.direction == 'bidirectional' else 1 + batch_size = rnn_layer.in_node(0).shape[rnn_layer.batch_dim] + + h_init_port = 5 + c_init_port = 6 + + if h_init_port not in rnn_layer.in_nodes(): + h_shape = [num_directions, batch_size, rnn_layer.hidden_size] # from ONNX spec + h_init = np.full(h_shape, 0, dtype=np.float32) + Op.create_and_connect_input_data_node( + graph, + rnn_layer, + {'value': h_init, 'shape': np.array(h_init.shape, dtype=np.int64)}, + {'in': h_init_port, 'permutation': None} + ) + + if rnn_layer.op == 'LSTM': + if c_init_port not in rnn_layer.in_nodes(): + c_shape = [num_directions, batch_size, rnn_layer.hidden_size] # from ONNX spec + c_init = np.full(c_shape, 0, dtype=np.float32) + Op.create_and_connect_input_data_node( + graph, + rnn_layer, + {'value': c_init, 'shape': np.array(c_init.shape, dtype=np.int64)}, + {'in': c_init_port, 'permutation': None} + ) + + @staticmethod + def check_input_ports(graph: Graph, match: dict): + """ + Check that all mandatory ports is present. + """ + rnn_layer = match['rnn_layer'] + mandatory_ports = [0, 1, 2, 3, 5] + + if rnn_layer.op == 'LSTM': + mandatory_ports.extend([6]) + + assert set(rnn_layer.in_nodes().keys()) >= set(mandatory_ports) diff --git a/model-optimizer/extensions/middle/PartialInfer.py b/model-optimizer/extensions/middle/PartialInfer.py new file mode 100644 index 0000000..d5d519c --- /dev/null +++ b/model-optimizer/extensions/middle/PartialInfer.py @@ -0,0 +1,31 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +from mo.graph.graph import Graph +from mo.middle.passes.infer import partial_infer +from mo.middle.replacement import MiddleReplacementPattern + + +class PartialInfer(MiddleReplacementPattern): + enabled = True + + def run_after(self): + return [] + + def run_before(self): + return [] + + def find_and_replace_pattern(self, graph: Graph): + partial_infer(graph) diff --git a/model-optimizer/extensions/middle/PixelLinkReshape.py b/model-optimizer/extensions/middle/PixelLinkReshape.py index 9564b5d..9c6cceb 100644 --- a/model-optimizer/extensions/middle/PixelLinkReshape.py +++ b/model-optimizer/extensions/middle/PixelLinkReshape.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -15,14 +15,14 @@ """ import logging as log -import networkx as nx import numpy as np from copy import deepcopy -from extensions.middle.AddReshapeAfterStridedSlice import AddReshapeAfterStridedSlice +from extensions.middle.ConvertGroupedStridedSlice import ConvertGroupedStridedSlice from extensions.middle.FusePermutesSequence import FusePermutesSequence from extensions.middle.ShufflenetReshape import ReshapeSoftmaxReshape +from mo.graph.graph import Graph from mo.middle.replacement import MiddleReplacementPattern from mo.ops.op import Op from mo.ops.permute import Permute @@ -30,16 +30,17 @@ from mo.ops.permute import Permute class PixelLinkReshape(MiddleReplacementPattern): """ - Transform adds Permutes around Reshapes that pack 4 dimensions in 2, than - do Softmax and then unpack it back to 5 dims. + Transform adds Permutes around Reshapes that pack 4 dimensions in 2, than + do Softmax and then unpack it back to 5 dims. """ enabled = True def run_before(self): - return [FusePermutesSequence, ReshapeSoftmaxReshape, AddReshapeAfterStridedSlice] + return [FusePermutesSequence, ReshapeSoftmaxReshape, ConvertGroupedStridedSlice] def run_after(self): - return [] + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] def pattern(self): return dict(nodes=[('reshape_split', dict(kind='op', type='Reshape')), @@ -51,7 +52,7 @@ class PixelLinkReshape(MiddleReplacementPattern): ('reshape_unpack', dict(kind='op', type='Reshape')), ('reshape_unpack_data', dict(kind='data')), ('strided_slice', dict(kind='op', op='StridedSlice')), - ], + ], edges=[('reshape_split', 'reshape_split_data'), ('reshape_split_data', 'reshape_pack'), ('reshape_pack', 'reshape_data'), @@ -84,7 +85,7 @@ class PixelLinkReshape(MiddleReplacementPattern): else: return False - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): if graph.graph['layout'] != 'NHWC': return @@ -120,55 +121,72 @@ class PixelLinkReshape(MiddleReplacementPattern): attrs = deepcopy(graph.get_edge_data(node.id, out_node.id)[0]) graph.remove_edge(node.id, out_node.id) - permute_after_node = permute_after.create_node_with_data([data_node], permute_after.attrs, - data_nodes=[out_node]) + permute_after.create_node_with_data([data_node], permute_after.attrs, + data_nodes=[out_node]) graph.add_edge(node.id, data_node.id, **attrs) # update softmax shape node_softmax = match['softmax'] node_softmax.out_node(0).shape = out_node.shape - # revert strided slice and reshape - node_ss = match['strided_slice'] - node_unpack = match['reshape_unpack'] - - unpack_out = node_unpack.out_node(0).id - ss_out = node_ss.out_node(0).id - - #gather edge attributes - soft_reshape_attrs = deepcopy(graph.get_edge_data(node_softmax.out_node(0).id, node_unpack.id)[0]) - reshape_data_attrs = deepcopy(graph.get_edge_data(node_unpack.id, unpack_out)[0]) - reshape_ss_attrs = deepcopy(graph.get_edge_data(unpack_out, node_ss.id)[0]) - ss_out_attrs = deepcopy(graph.get_edge_data(node_ss.id, ss_out)[0]) - - #remove all edges in Softmax->Reshape->StridedSlice chain - graph.remove_edge(node_softmax.out_node(0).id, node_unpack.id) - graph.remove_edge(node_unpack.id, unpack_out) - graph.remove_edge(unpack_out, node_ss.id) - graph.remove_edge(node_ss.id, ss_out) - - #add new edges to get chain Softmax->StridedSlice->Reshape - graph.add_edge(node_softmax.out_node(0).id, node_ss.id, **soft_reshape_attrs) - graph.add_edge(node_ss.id, unpack_out, **reshape_data_attrs) - graph.add_edge(unpack_out, 
node_unpack.id, **reshape_ss_attrs) - graph.add_edge(node_unpack.id, ss_out, **ss_out_attrs) - - #update output shape and parameters for StridedSlice - node_ss.out_node(0).shape = np.zeros(3) - node_ss.out_node(0).shape[0] = out_node.shape[0] - node_ss.out_node(0).shape[1] = 1 - node_ss.out_node(0).shape[2] = out_node.shape[2] - - old_slices = node_ss.slices.copy() - node_ss.slices = [] - node_ss.slices.append(old_slices[0]) - node_ss.slices.append(old_slices[-1]) - node_ss.slices.append(slice(0, out_node.shape[2], 1)) - node_ss.shrink_axis_mask = [False, False, False] - node_ss.new_axis_mask = [False, False, False] - - #update Reshape attribute - node_unpack.dim = np.delete(node_unpack.dim, 4) - #prevent permute for reshape because it gives wrong result - node_unpack['nchw_layout'] = True - node_unpack.out_node(0)['nchw_layout'] = True + if ConvertGroupedStridedSlice.enabled is True: + # revert strided slice and reshape + node_ss = match['strided_slice'] + node_unpack = match['reshape_unpack'] + + unpack_out = node_unpack.out_node(0).id + ss_out = node_ss.out_node(0).id + + # gather edge attributes + soft_reshape_attrs = deepcopy(graph.get_edge_data(node_softmax.out_node(0).id, node_unpack.id)[0]) + reshape_data_attrs = deepcopy(graph.get_edge_data(node_unpack.id, unpack_out)[0]) + reshape_ss_attrs = deepcopy(graph.get_edge_data(unpack_out, node_ss.id)[0]) + ss_out_attrs = deepcopy(graph.get_edge_data(node_ss.id, ss_out)[0]) + + # remove all edges in Softmax->Reshape->StridedSlice chain + graph.remove_edge(node_softmax.out_node(0).id, node_unpack.id) + graph.remove_edge(node_unpack.id, unpack_out) + graph.remove_edge(unpack_out, node_ss.id) + graph.remove_edge(node_ss.id, ss_out) + + # add new edges to get chain Softmax->StridedSlice->Reshape + graph.add_edge(node_softmax.out_node(0).id, node_ss.id, **soft_reshape_attrs) + graph.add_edge(node_ss.id, unpack_out, **reshape_data_attrs) + graph.add_edge(unpack_out, node_unpack.id, **reshape_ss_attrs) + graph.add_edge(node_unpack.id, ss_out, **ss_out_attrs) + + # update output shape and parameters for StridedSlice + node_ss.out_node(0).shape = np.zeros(3) + node_ss.out_node(0).shape[0] = out_node.shape[0] + node_ss.out_node(0).shape[1] = 1 + node_ss.out_node(0).shape[2] = out_node.shape[2] + + old_slices = node_ss.slices.copy() + node_ss.slices = [] + node_ss.slices.append(old_slices[0]) + node_ss.slices.append(old_slices[-1]) + node_ss.slices.append(slice(0, out_node.shape[2], 1)) + node_ss.shrink_axis_mask = np.array([0, 0, 0], dtype=np.int64) + node_ss.new_axis_mask = np.array([0, 0, 0], dtype=np.int64) + node_ss.ellipsis_mask = np.array([0, 0, 0], dtype=np.int64) + node_ss.begin_mask = np.array([0, 1, 0], dtype=np.int64) + node_ss.end_mask = np.array([0, 1, 0], dtype=np.int64) + + # update Reshape attribute + node_unpack.dim = np.delete(node_unpack.dim, 4) + # prevent permute for reshape because it gives wrong result + node_unpack['nchw_layout'] = True + node_unpack.out_node(0)['nchw_layout'] = True + else: + # reshape unpack: permute correctly + node_unpack = match['reshape_unpack'] + data_node = Op._create_data_node(graph, node.name + "/Permute_after_unpack_data", {'shape': node_unpack.out_node().shape}) + permute_after_unpack = Permute(graph, dict(name=node.name + "/Permute_after_unpack", + order=np.array([0, 3, 1, 2, 4]))) + out_node = node_unpack.out_node(0) + out_node.shape = out_node.shape[np.array([0, 3, 1, 2, 4], dtype=np.int)] + attrs = deepcopy(graph.get_edge_data(node_unpack.id, out_node.id)[0]) + 
graph.remove_edge(node_unpack.id, out_node.id)
+            permute_after_unpack.create_node_with_data([data_node], permute_after_unpack.attrs,
+                                                       data_nodes=[out_node])
+            graph.add_edge(node_unpack.id, data_node.id, **attrs)
diff --git a/model-optimizer/extensions/middle/PixelLinkReshape_test.py b/model-optimizer/extensions/middle/PixelLinkReshape_test.py
index e281f60..11a41c5 100644
--- a/model-optimizer/extensions/middle/PixelLinkReshape_test.py
+++ b/model-optimizer/extensions/middle/PixelLinkReshape_test.py
@@ -1,5 +1,5 @@
 """
-    Copyright (c) 2018 Intel Corporation
+    Copyright (c) 2018-2019 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -38,6 +38,9 @@ nodes_attributes = {
     'reshape_split/Permute_before_data': {'value': None, 'shape': None, 'kind': 'data'},
     'reshape_pack/Permute_after': {'type': 'Permute', 'kind': 'op', 'op': 'Permute'},
     'reshape_pack/Permute_after_data': {'value': None, 'shape': None, 'kind': 'data'},
+    # uncomment when strided slice will be enabled
+    # 'reshape_unpack/Permute_after_unpack': {'type': 'Permute', 'kind': 'op', 'op': 'Permute'},
+    # 'reshape_unpack/Permute_after_unpack_data': {'value': None, 'shape': None, 'kind': 'data'},
     # Softmax layer
     'softmax_1': {'type': 'SoftMax', 'kind': 'op', 'op': 'SoftMax'},
     'softmax_1_data': {'value': None, 'shape': None, 'kind': 'data'},
@@ -70,8 +73,11 @@ class ReshapeSoftmaxReshapeTests(unittest.TestCase):
                              'strided_slice': {
                                  'slices': [slice(0, 1, 1), slice(0, 227, 1), slice(0, 227, 1), slice(0, 8, 1),
                                             slice(1, 2, 1)],
-                                 'shrink_axis_mask': [False, False, False, False, True],
-                                 'new_axis_mask': [False, False, False, False, False]},
+                                 'shrink_axis_mask': [0, 0, 0, 0, 1],
+                                 'new_axis_mask': [0, 0, 0, 0, 0],
+                                 'ellipsis_mask': [0, 0, 0, 0, 0],
+                                 'begin_mask': [1, 1, 1, 1, 1],
+                                 'end_mask': [1, 1, 1, 1, 1], },
                              'strided_slice_data': {'shape': np.array([1, 227, 227, 8])},
                              })
         graph.graph['layout'] = 'NHWC'
@@ -88,10 +94,18 @@ class ReshapeSoftmaxReshapeTests(unittest.TestCase):
                                  ('reshape_pack/Permute_after', 'reshape_pack_data'),
                                  ('reshape_pack_data', 'softmax_1'),
                                  ('softmax_1', 'softmax_1_data'),
+                                 # comment when strided slice will be enabled
                                  ('softmax_1_data', 'strided_slice'),
                                  ('strided_slice', 'reshape_unpack_data'),
                                  ('reshape_unpack_data', 'reshape_unpack'),
-                                 ('reshape_unpack', 'strided_slice_data')
+                                 ('reshape_unpack', 'strided_slice_data'),
+                                 # uncomment when strided slice will be enabled
+                                 # ('softmax_1_data', 'reshape_unpack'),
+                                 # ('reshape_unpack', 'reshape_unpack/Permute_after_unpack_data'),
+                                 # ('reshape_unpack/Permute_after_unpack_data', 'reshape_unpack/Permute_after_unpack'),
+                                 # ('reshape_unpack/Permute_after_unpack', 'reshape_unpack_data'),
+                                 # ('reshape_unpack_data', 'strided_slice'),
+                                 # ('strided_slice', 'strided_slice_data'),
                                  ],
                                 {'placeholder_1_data': {'shape': np.array([1, 227, 227, 16])},
                                  'reshape_split/Permute_before_data': {'shape': np.array([1, 227, 16, 227])},
@@ -99,7 +113,11 @@ class ReshapeSoftmaxReshapeTests(unittest.TestCase):
                                  'reshape_pack_data': {'shape': np.array([1, 2, 1 * 227 * 227 * 8])},
                                  'reshape_pack/Permute_after_data': {'shape': np.array([1, 227 * 227 * 8, 2])},
                                  'softmax_1_data': {'shape': np.array([1, 2, 1 * 227 * 227 * 8])},
+                                 # comment when strided slice will be enabled
                                  'reshape_unpack_data': {'shape': np.array([1, 1, 227 * 227 * 8])},
+                                 # uncomment when strided slice will be enabled
+                                 # 'reshape_unpack_data': {'shape': np.array([1, 8, 227, 227, 2])},
+                                 # 'reshape_unpack/Permute_after_unpack_data': 
{'shape': np.array([1, 227, 227, 8, 2])},
                                 'strided_slice_data': {'shape': np.array([1, 227, 227, 8])}
                                 })
diff --git a/model-optimizer/extensions/middle/RNNSequenceNormalizeToIE.py b/model-optimizer/extensions/middle/RNNSequenceNormalizeToIE.py
new file mode 100644
index 0000000..0809c21
--- /dev/null
+++ b/model-optimizer/extensions/middle/RNNSequenceNormalizeToIE.py
@@ -0,0 +1,215 @@
+"""
+    Copyright (c) 2019 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+"""
+from copy import deepcopy
+
+import numpy as np
+
+from mo.graph.graph import Graph
+from mo.middle.replacement import MiddleReplacementPattern
+from mo.ops.op import Op
+from mo.ops.reshape import Reshape
+
+
+class RNNSequenceNormalize(MiddleReplacementPattern):
+    """
+    This class normalizes RNNSequence layers to the IE-compatible form of weights, inputs and outputs.
+
+    This pass performs the following steps:
+        1. Repack weights (squeeze all useless dimensions in all blobs and concatenate W and R together;
+        also add the bin attribute and similar metadata)
+        2. Unsqueeze num_directions (in outputs and states)
+        3. Squeeze initial states
+        4. Renumber inputs
+        5. Check ports
+
+    After this normalization the layer has the following input format:
+        0: X input data, shape [batch_size, seq_len, input_size]
+        1: WR weights blob, shape [M * hidden_size, hidden_size + input_size]
+        2: B biases blob, shape [M * hidden_size]
+        3: (optional) sequence_length, shape [batch_size]
+        4: initial hidden state, shape [batch_size, hidden_size]
+        5: (only for LSTM) initial cell state, shape [batch_size, hidden_size]
+        6: (optional for LSTM) Peepholes weights, shape [(M - 1) * hidden_size]
+
+    """
+    def run_after(self):
+        from extensions.middle.DecomposeBidirectionalRNNSequence import DecomposeBidirectionalRNNSequence
+        return [DecomposeBidirectionalRNNSequence]
+
+    def pattern(self):
+        return dict(
+            nodes=[
+                ('rnn_layer', dict(kind='op', type='RNNSequence')),
+                ('input', dict(kind='data')),
+                ('W', dict(kind='data')),
+                ('R', dict(kind='data')),
+                ('B', dict(kind='data')),
+            ],
+            edges=[
+                ('input', 'rnn_layer', {'in': 0}),
+                ('W', 'rnn_layer', {'in': 1}),
+                ('R', 'rnn_layer', {'in': 2}),
+                ('B', 'rnn_layer', {'in': 3}),
+            ],
+        )
+
+    def replace_pattern(self, graph: Graph, match: dict):
+        self.repack_weights(graph, match)
+        if match['rnn_layer'].has_num_directions:
+            self.unsqueeze_num_directions(graph, match)
+        self.squeeze_initial_states(graph, match)
+        self.reordering_inputs(graph, match)
+        # some additional checks for ports number and similar stuff
+
+    def repack_weights(self, graph: Graph, match: dict):
+        # Concatenate W and R in IE format
+        # Delete useless num_dir dimensions and n_cells dimensions in W, R, B (peepholes?)
+        lstm = match['rnn_layer']
+        W, R, B = match['W'].value.copy(), match['R'].value.copy(), match['B'].value.copy()
+
+        graph.remove_edge(match['W'].id, lstm.id)
+        graph.remove_edge(match['R'].id, lstm.id)
+        graph.remove_edge(match['B'].id, lstm.id)
+
+        # Sum the components of B that correspond to W and R
+        if lstm.op == 'GRU' and lstm.linear_before_reset:
+            B_shape = np.array(B.shape)
+            B_shape[3] = 4
+            B_shape[2] = 1
+            B_tmp = np.zeros(shape=B_shape)
+            B_tmp[:, :, :, 0, :] = B[:, :, 0, 0, :] + B[:, :, 1, 0, :]
+            B_tmp[:, :, :, 1, :] = B[:, :, 0, 1, :] + B[:, :, 1, 1, :]
+            B_tmp[:, :, :, 2, :] = B[:, :, 0, 2, :][:, :, np.newaxis, :]
+            B_tmp[:, :, :, 3, :] = B[:, :, 1, 2, :][:, :, np.newaxis, :]
+            B = B_tmp
+        else:
+            B = np.add.reduce(B, axis=2, keepdims=True)
+
+        # Concatenate W, R to IE-compatible format
+        assert len(W.shape) == 5
+        assert len(R.shape) == 5
+        WR = np.concatenate([W, R], axis=4)
+
+        # Squeeze useless dimensions
+        assert WR.shape[0] == 1  # num_dir == 1
+        assert WR.shape[1] == 1  # num_cells == 1
+        assert B.shape[0] == 1
+        assert B.shape[1] == 1
+        WR = WR.squeeze(axis=(0, 1))
+        B = B.squeeze(axis=(0, 1))
+
+        # Flatten all output (0, 1) and input dimensions (2, 3)
+        final_shape_WR = [WR.shape[0] * WR.shape[1], -1]
+        assert final_shape_WR[0] == lstm.hidden_size * lstm.multiplier
+        WR = WR.reshape(final_shape_WR)
+
+        final_shape_B = final_shape_WR
+        if lstm.op == 'GRU' and lstm.linear_before_reset:
+            final_shape_B[0] = lstm.hidden_size * 4
+        B = B.reshape(final_shape_B)
+
+        # Squeeze fake dimension in B
+        B = B.squeeze(axis=-1)
+
+        for blob, port, name in [(WR, 1, 'weights'), (B, 2, 'biases')]:
+            Op.create_and_connect_input_data_node(
+                graph,
+                lstm,
+                {'value': blob, 'shape': np.array(blob.shape, dtype=np.int64)},
+                {'in': port, 'bin': name, 'permutation': None}
+            )
+
+    @staticmethod
+    def unsqueeze_num_directions(graph: Graph, match: dict):
+        """ Assuming the considered LSTM/GRU/RNN node has num_directions in its output shape, add a Reshape
+        to match it.
+        """
+
+        rnn_layer = match['rnn_layer']
+        # num_directions is at the 1st position in the output shape, and at the 0th position in hidden and cell states
+        # please refer to docs in this transform
+
+        direction_dim = [1, 0, 0]  # index of dimension with direction index
+        for i in rnn_layer.out_nodes():
+            old_data_node = rnn_layer.out_node(i)
+            old_shape = old_data_node.shape.copy()
+            new_shape = np.delete(old_shape, direction_dim[i])
+
+            data = Op._create_data_node(graph, name=rnn_layer.name + '/Out/{}/'.format(i), attrs={'shape': new_shape})
+            graph.remove_edge(rnn_layer.id, old_data_node.id)
+            graph.add_edge(rnn_layer.id, data.id, key=0, out=i)
+
+            reshape = Reshape(graph, dict(dim=old_shape))
+            reshape.create_node_with_data([data], dict(name=rnn_layer.name + '/SqueezeNumDirections/{}'.format(i)),
+                                          data_nodes=[old_data_node])
+
+    @staticmethod
+    def squeeze_initial_states(graph: Graph, match: dict):
+        """
+        Squeeze the input initial states of the recurrent node to 2-D shape.
+        """
+        hidden_init_port = 5
+        cell_init_port = 6
+
+        rnn_layer = match['rnn_layer']
+
+        reshape = Reshape(graph, dict(dim=[rnn_layer.in_node(0).shape[rnn_layer.batch_dim], rnn_layer.hidden_size]))
+
+        assert hidden_init_port in rnn_layer.in_nodes()
+        init_h = rnn_layer.in_node(hidden_init_port)
+        edge_attrs = deepcopy(graph.get_edge_data(init_h.id, rnn_layer.id)[0])
+        edge_attrs['in'] = hidden_init_port
+        graph.remove_edge(init_h.id, rnn_layer.id)
+        new_init_h = reshape.create_node_with_data([init_h], dict(name=rnn_layer.name + '/HiddenStateResize'))
+        graph.add_edge(new_init_h.id, rnn_layer.id, **edge_attrs)
+
+        if rnn_layer.op == 'LSTM':
+            assert cell_init_port in rnn_layer.in_nodes()
+
+            init_c = rnn_layer.in_node(cell_init_port)
+            edge_attrs = deepcopy(graph.get_edge_data(init_c.id, rnn_layer.id)[0])
+            edge_attrs['in'] = cell_init_port
+            graph.remove_edge(init_c.id, rnn_layer.id)
+            new_init_c = reshape.create_node_with_data([init_c], dict(name=rnn_layer.name + '/CellStateResize'))
+            graph.add_edge(new_init_c.id, rnn_layer.id, **edge_attrs)
+
+    @staticmethod
+    def reordering_inputs(graph: Graph, match: dict):
+        """
+        Reorder (renumber) inputs to the format described above. We need to renumber the initial state ports.
+        """
+        rnn_layer = match['rnn_layer']
+        assert 5 in rnn_layer.in_nodes()
+        hidden_state_edge = graph.get_edge_data(rnn_layer.in_node(5).id, rnn_layer.id)
+        hidden_state_edge[0]['in'] = 4
+
+        if rnn_layer.op == 'LSTM':
+            assert 6 in rnn_layer.in_nodes()
+            cell_state_edge = graph.get_edge_data(rnn_layer.in_node(6).id, rnn_layer.id)
+            cell_state_edge[0]['in'] = 5
+
+    @staticmethod
+    def ports_checks(graph: Graph, match: dict):
+        """
+        Check that all mandatory ports are present.
+        """
+        rnn_layer = match['rnn_layer']
+        mandatory_ports = [0, 1, 2, 4]
+
+        if rnn_layer.op == 'LSTM':
+            mandatory_ports.append(5)
+
+        assert set(rnn_layer.in_nodes().keys()) >= set(mandatory_ports)
\ No newline at end of file
diff --git a/model-optimizer/extensions/middle/Reduce.py b/model-optimizer/extensions/middle/Reduce.py
index 6c6c91d..1dedf83 100644
--- a/model-optimizer/extensions/middle/Reduce.py
+++ b/model-optimizer/extensions/middle/Reduce.py
@@ -1,5 +1,5 @@
 """
-    Copyright (c) 2018 Intel Corporation
+    Copyright (c) 2018-2019 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
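To make the repack_weights step of RNNSequenceNormalize above easier to follow, here is the shape bookkeeping as a standalone numpy sketch; M is the gate multiplier (4 for LSTM) and all sizes are illustrative:

    import numpy as np

    M, hidden_size, input_size = 4, 64, 32

    # Per-direction blobs after decomposition: [num_dir, num_cells, M, hidden, X]
    W = np.zeros((1, 1, M, hidden_size, input_size), dtype=np.float32)
    R = np.zeros((1, 1, M, hidden_size, hidden_size), dtype=np.float32)

    # Concatenate along the input dimension, squeeze num_dir/num_cells,
    # then flatten the gate and hidden dimensions together.
    WR = np.concatenate([W, R], axis=4).squeeze(axis=(0, 1))
    WR = WR.reshape([M * hidden_size, -1])

    # Matches the documented IE input format: [M * hidden_size, hidden_size + input_size]
    assert WR.shape == (M * hidden_size, hidden_size + input_size)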
@@ -16,11 +16,11 @@ import logging as log -import networkx as nx import numpy as np from mo.front.caffe.extractors.utils import get_canonical_axis_index from mo.front.common.layout import get_batch_dim, get_features_dim +from mo.graph.graph import Graph from mo.middle.replacement import MiddleReplacementPattern from mo.ops.pooling import Pooling from mo.ops.power import Power @@ -39,6 +39,14 @@ class ReduceReplacer(MiddleReplacementPattern): 'sum': 'avg' } + def run_after(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + def run_before(self): + from extensions.middle.pass_separator import MiddleFinish + return [MiddleFinish] + def pattern(self): return dict( nodes=[ @@ -47,7 +55,7 @@ class ReduceReplacer(MiddleReplacementPattern): edges=[] ) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): node = match['reduce'] if not node.has_valid('reduce_type') or node.reduce_type.lower() not in self.supported_reduce_types: log.error("Reduce type {} is not supported for node {}".format(node.soft_get('reduce_type'), node.id)) diff --git a/model-optimizer/extensions/middle/Reduce_test.py b/model-optimizer/extensions/middle/Reduce_test.py index 1925df1..f708e0a 100644 --- a/model-optimizer/extensions/middle/Reduce_test.py +++ b/model-optimizer/extensions/middle/Reduce_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/middle/ReluQuantizeFuse.py b/model-optimizer/extensions/middle/ReluQuantizeFuse.py new file mode 100644 index 0000000..116a493 --- /dev/null +++ b/model-optimizer/extensions/middle/ReluQuantizeFuse.py @@ -0,0 +1,90 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import logging as log + +from extensions.middle.BinarizeWeightsM1P1 import BinarizeWeightsM1P1 +from mo.graph.graph import Graph +from mo.middle.passes.eliminate import remove_op_node_with_data_node +from mo.middle.replacement import MiddleReplacementPattern + + +class ReluQuantizeFuse(MiddleReplacementPattern): + """ Fuses ReLU --> Quantize sequence if possible + + Relu --> Quantize fusion is possible if: + 1. Relu is consumed to 0-th port of Quantize + 2. 
Quantize ports 1 and 2 define an input range that does not include 0
+    """
+    enabled = True
+
+    def run_after(self):
+        return [BinarizeWeightsM1P1]
+
+    def run_before(self):
+        from extensions.middle.SharedWeightsDuplication import SharedWeightsDuplication
+        return [SharedWeightsDuplication]
+
+    def pattern(self):
+        return dict(
+            nodes=[
+                ('relu', dict(op='Relu')),
+                ('relued', dict()),
+                ('quantize', dict(op='Quantize')),
+            ],
+            edges=[
+                ('relu', 'relued'),
+                ('relued', 'quantize', {'in': 0}),
+            ]
+        )
+
+    def replace_pattern(self, graph: Graph, match: dict):
+
+        quantize = match['quantize']
+
+        # Check the total number of ReLU consumers -- if something else consumes its output it cannot be fused
+        if len(match['relu'].out_node().out_nodes()) > 1:
+            log.debug('ReluQuantizeFuse: cannot fuse because ReLU has multiple consumers')
+            return
+
+        # If the fusion is applicable, direct modifications to the 1st and 2nd inputs of Quantize
+        # are performed, so the data nodes at those inputs shouldn't have more than 2 consumers,
+        # both of them being this very Quantize op (its 1st and 2nd ports).
+        # TODO: relax this limitation and duplicate data nodes accordingly to modify the input range freely
+
+        # Provisional limitation related to binary quantization
+        # TODO: Relax it beyond binarization case
+        if len(quantize.in_node(1).out_nodes()) != 2 or \
+                len(quantize.in_node(2).out_nodes()) != 2 or \
+                quantize.in_node(1).id != quantize.in_node(2).id or \
+                quantize.levels != 2:
+            log.debug('ReluQuantizeFuse: cannot fuse because Quantize op has '
+                      'unexpected number of consumers for ports 1 and 2')
+            return
+
+        threshold = quantize.in_node(1)
+
+        # As we are restricted to the binarization case only, we need to detect on
+        # which side of 0 the Quantize threshold resides:
+        #   if the threshold > 0, it remains the same;
+        #   if the threshold == 0, it also remains the same;
+        #   if the threshold < 0, it should be modified to -infinity so that all inputs map to output_high
+
+        modification_mask = threshold.value < 0
+        threshold.value[modification_mask] = float('-inf')
+
+        # Remove ReLU as it is no longer needed
+        remove_op_node_with_data_node(graph, match['relu'])
diff --git a/model-optimizer/extensions/middle/RemoveIdentity.py b/model-optimizer/extensions/middle/RemoveIdentity.py
new file mode 100644
index 0000000..ba7535c
--- /dev/null
+++ b/model-optimizer/extensions/middle/RemoveIdentity.py
@@ -0,0 +1,83 @@
+"""
+    Copyright (c) 2019 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+""" + +from mo.graph.graph import Graph +from mo.middle.passes.eliminate import remove_op_node_with_data_node +from mo.middle.replacement import MiddleReplacementPattern + + +class RemoveIdentity(MiddleReplacementPattern): + enabled = True + + def run_after(self): + from extensions.middle.InputCut import MiddleInputCut + return [MiddleInputCut] + + def run_before(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + def pattern(self): + return dict( + nodes=[('op', dict(kind='op', identity=True))], + edges=[] + ) + + def replace_pattern(self, graph: Graph, match: dict): + remove_op_node_with_data_node(graph, match['op']) + + +class RemoveDropout(MiddleReplacementPattern): + enabled = True + + def run_after(self): + from extensions.middle.pass_separator import PreMiddleStart + return [PreMiddleStart] + + def run_before(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + def pattern(self): + return dict( + nodes=[('op', dict(op='Dropout'))], + edges=[] + ) + + def replace_pattern(self, graph: Graph, match: dict): + remove_op_node_with_data_node(graph, match['op']) + + +class RemoveNodesWithZeroPhase(MiddleReplacementPattern): + enabled = True + force_clean_up = True + + def run_after(self): + from extensions.middle.pass_separator import PreMiddleStart + return [PreMiddleStart] + + def run_before(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + def pattern(self): + return dict( + nodes=[('op', dict(kind='op', phase=0))], + edges=[] + ) + + def replace_pattern(self, graph: Graph, match: dict): + remove_op_node_with_data_node(graph, match['op']) diff --git a/model-optimizer/extensions/middle/RemoveRedundantReshapeAfterCropAndResize.py b/model-optimizer/extensions/middle/RemoveRedundantReshapeAfterCropAndResize.py new file mode 100644 index 0000000..9f54165 --- /dev/null +++ b/model-optimizer/extensions/middle/RemoveRedundantReshapeAfterCropAndResize.py @@ -0,0 +1,68 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +import logging as log + +import numpy as np + +from mo.graph.graph import Graph +from mo.middle.replacement import MiddleReplacementPattern + + +class RemoveRedundantReshapeAfterCropAndResize(MiddleReplacementPattern): + enabled = True + force_clean_up = True + + def run_after(self): + from extensions.middle.pass_separator import MiddleFinish + return [MiddleFinish] + + def run_before(self): + return [] + + def pattern(self): + return dict( + nodes=[ + ('crop_and_resize', dict(kind='op', op='CropAndResize')), + ('crop_and_resize_data', dict(kind='data')), + ('reshape_1', dict(kind='op', op='Reshape')), + ('reshape_1_data', dict(kind='data')), + ('reshape_2', dict(kind='op', op='Reshape')), + ], + edges=[ + ('crop_and_resize', 'crop_and_resize_data'), + ('crop_and_resize_data', 'reshape_1'), + ('reshape_1', 'reshape_1_data'), + ('reshape_1_data', 'reshape_2'), + ] + ) + + def replace_pattern(self, graph: Graph, match: dict): + car_node = match['crop_and_resize'] + reshape_2_node = match['reshape_2'] + + shape_1 = match['crop_and_resize_data'].shape + shape_2 = match['reshape_2'].out_node().shape + if not np.all(shape_1 == shape_2): + log.debug('Cannot remove reshape operations after CropAndResize due to different shapes: {} vs {}'.format( + shape_1, shape_2 + )) + return + + car_node.out_port(0).disconnect() + consumer_port_node = reshape_2_node.out_port(0).get_connection().get_destination() + consumer_port_node.disconnect() + car_node.out_port(0).connect(consumer_port_node) diff --git a/model-optimizer/extensions/middle/ReverseV2ToReverseSequence.py b/model-optimizer/extensions/middle/ReverseV2ToReverseSequence.py new file mode 100644 index 0000000..a1aa418 --- /dev/null +++ b/model-optimizer/extensions/middle/ReverseV2ToReverseSequence.py @@ -0,0 +1,62 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import numpy as np + +from extensions.ops.reverse_sequence import ReverseSequence +from mo.graph.graph import Graph +from mo.middle.replacement import MiddleReplacementPattern +from mo.ops.const import Const + + +class ReverseToReverseSequence(MiddleReplacementPattern): + enabled = True + + def run_after(self): + from extensions.middle.PartialInfer import PartialInfer + return [PartialInfer] + + def run_before(self): + from extensions.middle.reverse_tensor_iterator import ReverseTensorIteratorLSTM + return [ReverseTensorIteratorLSTM] + + def pattern(self): + return dict( + nodes=[ + ('reverse', dict(kind='op', op='Reverse')) + ], + edges=[] + ) + + def replace_pattern(self, graph: Graph, match: dict): + reverse = match['reverse'] + input_data_shape = reverse.in_node(0).shape + + assert reverse.in_port(1).disconnected() + + # 1. For ReverseSequence 1-port input is seq_lengths => create this input node + seq_lengths = np.ones(input_data_shape[0]) * input_data_shape[reverse['axis']] + const = Const(graph, dict(value=seq_lengths)).create_node() + + # 2. 
Create new ReverseSequence node and reconnect all inputs/outputs to it + reverse_sequence = ReverseSequence(graph, {'name': reverse.name + '/ReverseSequence/', + 'seq_axis': reverse['axis']}).create_node() + + reverse.in_port(0).get_connection().set_destination(reverse_sequence.in_port(0)) + const.out_port(0).connect(reverse_sequence.in_port(1)) + reverse.out_port(0).get_connection().set_source(reverse_sequence.out_port(0)) + + # 3. Delete old Reverse node + graph.remove_node(reverse.id) diff --git a/model-optimizer/extensions/middle/ScaleInput.py b/model-optimizer/extensions/middle/ScaleInput.py new file mode 100644 index 0000000..ad04300 --- /dev/null +++ b/model-optimizer/extensions/middle/ScaleInput.py @@ -0,0 +1,71 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import numpy as np + +from mo.graph.graph import Graph +from mo.middle.replacement import MiddleReplacementPattern +from mo.ops.lin_op import Mul +from mo.ops.op import Op +from mo.utils.error import Error + + +class ScaleInput(MiddleReplacementPattern): + enabled = True + + def run_after(self): + from extensions.middle.pass_separator import PreMiddleStart + return [PreMiddleStart] + + def run_before(self): + from extensions.middle.AddMeanScaleValues import AddMeanScaleValues + return [AddMeanScaleValues] + + def pattern(self): + return dict( + nodes=[ + ('placeholder', dict(kind='op', op='Placeholder')), + ('data', dict(kind='data'))], + edges=[ + ('placeholder', 'data'), + ], + ) + + def replace_pattern(self, graph: Graph, match: dict): + scale = graph.graph['cmd_params'].scale + if scale is None or scale == 1: + return + assert (len(match['placeholder'].out_nodes())) + + tinput = match['placeholder'] + if not tinput.has_valid('shape'): + raise Error("Node {} has not valid shape attribute".format(tinput.id)) + + input_shape = tinput.shape + toutput = match['data'] + + # Create Mul node + value = np.array([1 / scale]) + + # Disconnect input with data node + graph.remove_edge(tinput.id, toutput.id) + + # Create Mul node + mul_node = Mul(graph, dict(name="Mul1_")) + mul_data = Op.create_input_data_node(graph, "data_mul_scale_", np.array(value)) + Op.expand_node_shape(mul_data, len(input_shape) - 2 if graph.graph['layout'] == 'NCHW' else 0) + mul_input = Op.create_data_node(graph, tinput, {'shape': toutput.shape}) + + mul_node.create_node_with_data(inputs=[mul_input, mul_data], data_nodes=toutput) diff --git a/model-optimizer/extensions/middle/ScaleInput_test.py b/model-optimizer/extensions/middle/ScaleInput_test.py new file mode 100644 index 0000000..2dac2da --- /dev/null +++ b/model-optimizer/extensions/middle/ScaleInput_test.py @@ -0,0 +1,91 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import unittest +from argparse import Namespace + +import numpy as np + +from extensions.middle.ScaleInput import ScaleInput +from mo.utils.unittest.graph import build_graph, compare_graphs + +nodes_attributes = {'node_1': {'type': 'Identity', 'value': None, 'kind': 'op'}, + 'node_1_data': {'value': None, 'kind': 'data', 'data_type': None}, + 'node_2': {'type': 'Identity', 'value': None, 'kind': 'op'}, + 'concat': {'type': 'Concat', 'value': None, 'kind': 'op'}, + 'node_3': {'type': 'Identity', 'value': None, 'kind': 'op'}, + 'node_3_data': {'value': None, 'kind': 'data', 'data_type': None}, + # Placeholders + 'placeholder_1': {'shape': None, 'type': 'Input', 'kind': 'op', 'op': 'Placeholder'}, + 'placeholder_1_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + 'placeholder_2': {'shape': None, 'type': 'Input', 'kind': 'op', 'op': 'Placeholder'}, + 'pl_1': {'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, + 'pl_1_data': {'value': None, 'kind': 'data', 'data_type': None}, + 'pl_2': {'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, + 'pl_2_data': {'value': None, 'kind': 'data', 'data_type': None}, + 'placeholder_2_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + # ScaleShift layer + 'scaleshift_1': {'type': 'ScaleShift', 'kind': 'op', 'op': 'ScaleShift'}, + 'scaleshift_1_w': {'value': None, 'shape': None, 'kind': 'data'}, + 'scaleshift_1_b': {'value': None, 'shape': None, 'kind': 'data'}, + 'scaleshift_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + # Mul op + 'mul_1': {'type': None, 'kind': 'op', 'op': 'Mul'}, + 'mul_1_w': {'value': None, 'shape': None, 'kind': 'data'}, + 'mul_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'op_output': {'kind': 'op', 'op': 'OpOutput', 'infer': lambda x: None} + } + + +class ScaleInputTests(unittest.TestCase): + def test_scale_input_1(self): + graph = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'op_output') + ], + {'placeholder_1': {'shape': np.array([1, 3, 224, 224])}}, + nodes_with_edges_only=True) + + graph_ref = build_graph(nodes_attributes, + [('placeholder_1', 'mul_1_data'), + ('mul_1_data', 'mul_1'), + ('mul_1_w', 'mul_1'), + ('mul_1', 'placeholder_1_data'), + ('placeholder_1_data', 'op_output') + ], + {'mul_1_w': {'shape': np.array([1, 1, 1]), 'value': np.array([1 / 255])}}, + nodes_with_edges_only=True) + graph.graph['layout'] = 'NCHW' + graph.graph['cmd_params'] = Namespace(scale=255) + ScaleInput().find_and_replace_pattern(graph) + (flag, resp) = compare_graphs(graph, graph_ref, 'placeholder_1_data') + self.assertTrue(flag, resp) + + def test_scale_input_2(self): + graph = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'op_output') + ], + nodes_with_edges_only=True) + + graph_ref = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'op_output') + ], + nodes_with_edges_only=True) + graph.graph['cmd_params'] = Namespace(scale=1) + ScaleInput().find_and_replace_pattern(graph) + (flag, resp) = 
compare_graphs(graph, graph_ref, 'placeholder_1_data')
+        self.assertTrue(flag, resp)
diff --git a/model-optimizer/extensions/middle/SharedWeightsDuplication.py b/model-optimizer/extensions/middle/SharedWeightsDuplication.py
new file mode 100644
index 0000000..d1f67ea
--- /dev/null
+++ b/model-optimizer/extensions/middle/SharedWeightsDuplication.py
@@ -0,0 +1,54 @@
+"""
+    Copyright (c) 2019 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+"""
+import numpy as np
+
+from mo.graph.graph import Graph, Node
+from mo.middle.replacement import MiddleReplacementPattern
+from mo.ops.op import Op
+
+
+class SharedWeightsDuplication(MiddleReplacementPattern):
+    enabled = True
+    force_clean_up = True
+
+    def run_after(self):
+        from extensions.middle.CheckForCycle import CheckForCycle
+        return [CheckForCycle]
+
+    def run_before(self):
+        from extensions.middle.pass_separator import PreMiddleStart
+        return [PreMiddleStart]
+
+    def find_and_replace_pattern(self, graph: Graph):
+        """
+        This function finds all const data nodes that have more than one consumer and then duplicates them
+        """
+        data_nodes = [Node(graph, id) for id in graph.nodes() if Node(graph, id).soft_get('kind') == 'data']
+        for node in data_nodes:
+            # Check that node has const values and more than one consumer
+            if len(node.in_nodes()) and node.in_node().soft_get('type') == 'Const' and len(node.out_nodes()) > 1 and \
+                    node.value is not None:
+                # Here we delete all edges between the base node and its consumers (except the first), and then
+                # duplicate this node to connect to the other consumers
+                for v, d in node.get_outputs():
+                    out_node = Node(graph, v)
+                    e_attrs = d
+                    graph.remove_edge(node.id, out_node.id)
+                    data = Op.create_input_data_node(graph, "Copy_{}".format(node.id), np.array(node.value),
+                                                     graph.node[node.id])
+
+                    graph.add_edges_from([(data.id, out_node.id, e_attrs)])
+
diff --git a/model-optimizer/mo/middle/passes/shared_weights_duplication_test.py b/model-optimizer/extensions/middle/SharedWeightsDuplication_test.py
similarity index 72%
rename from model-optimizer/mo/middle/passes/shared_weights_duplication_test.py
rename to model-optimizer/extensions/middle/SharedWeightsDuplication_test.py
index ef48276..49f571f 100644
--- a/model-optimizer/mo/middle/passes/shared_weights_duplication_test.py
+++ b/model-optimizer/extensions/middle/SharedWeightsDuplication_test.py
@@ -1,5 +1,5 @@
 """
-    Copyright (c) 2018 Intel Corporation
+    Copyright (c) 2018-2019 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
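The effect of SharedWeightsDuplication above, reduced to plain numpy; the dict below stands in for the real Graph and data-node machinery and is purely illustrative:

    import numpy as np

    def duplicate_shared(value, consumers):
        # Every consumer beyond the first gets its own private copy of the
        # constant, so later in-place weight transformations stay independent.
        return {name: (value if i == 0 else np.array(value))
                for i, name in enumerate(consumers)}

    weights = np.array([1, 2, 3])
    copies = duplicate_shared(weights, ['mul_1', 'mul_2', 'mul_3'])
    assert copies['mul_1'] is weights
    assert copies['mul_2'] is not weights and np.array_equal(copies['mul_2'], weights)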
@@ -18,10 +18,12 @@ import unittest import numpy as np -from mo.middle.passes.shared_weights_duplication import duplicate_shared_weights +from extensions.middle.SharedWeightsDuplication import SharedWeightsDuplication +from mo.middle.passes.eliminate import graph_clean_up from mo.utils.unittest.graph import build_graph, compare_graphs nodes_attributes = { + 'const': {'shape': None, 'type': 'Const', 'kind': 'op', 'op': 'Const'}, # Mul and Add operations 'mul_1': {'type': None, 'kind': 'op', 'op': 'Mul'}, 'mul_1_w': {'value': None, 'shape': None, 'kind': 'data'}, @@ -35,13 +37,15 @@ nodes_attributes = { # Concat1 operation 'concat_1': {'type': 'Concat', 'kind': 'op', 'op': 'Concat'}, 'concat_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'op_output': {'op': 'OpOutput', 'kind': 'op'} } class DuplicateSharedWeightsTests(unittest.TestCase): def test_duplicate_shared_weights_1(self): graph = build_graph(nodes_attributes, - [('mul_1_w', 'mul_1'), + [('const', 'mul_1_w'), + ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_w', 'mul_2'), ('mul_2', 'mul_2_data'), @@ -50,12 +54,16 @@ class DuplicateSharedWeightsTests(unittest.TestCase): ('mul_1_data', 'concat_1'), ('mul_2_data', 'concat_1'), ('mul_3_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], - {'mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}}) + {'mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}}, + nodes_with_edges_only=True + ) graph_ref = build_graph(nodes_attributes, - [('mul_1_w', 'mul_1'), + [ + ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_2_w', 'mul_2'), ('mul_2', 'mul_2_data'), @@ -64,14 +72,16 @@ class DuplicateSharedWeightsTests(unittest.TestCase): ('mul_1_data', 'concat_1'), ('mul_2_data', 'concat_1'), ('mul_3_data', 'concat_1'), - ('concat_1', 'concat_1_data') - ], + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') + ], {'mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_2_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_3_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, - }) - - duplicate_shared_weights(graph) + }, nodes_with_edges_only=True) + SharedWeightsDuplication().find_and_replace_pattern(graph) + graph_clean_up(graph) + graph_clean_up(graph_ref) (flag, resp) = compare_graphs(graph, graph_ref, 'concat_1_data') - self.assertTrue(flag, resp) + self.assertTrue(flag, resp) \ No newline at end of file diff --git a/model-optimizer/extensions/middle/ShuffleChannel.py b/model-optimizer/extensions/middle/ShuffleChannel.py index 5370aeb..d5e85fa 100644 --- a/model-optimizer/extensions/middle/ShuffleChannel.py +++ b/model-optimizer/extensions/middle/ShuffleChannel.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
""" -import networkx as nx import numpy as np from extensions.middle.ShufflenetReshape import FeatureShuffleReshape +from mo.graph.graph import Graph from mo.middle.replacement import MiddleReplacementPattern from mo.ops.permute import Permute from mo.ops.reshape import Reshape @@ -33,6 +33,10 @@ class ShuffleChannel(MiddleReplacementPattern): def run_after(self): return [FeatureShuffleReshape] + def run_before(self): + from extensions.middle.pass_separator import MiddleFinish + return [MiddleFinish] + def pattern(self): return dict( nodes=[ @@ -41,7 +45,7 @@ class ShuffleChannel(MiddleReplacementPattern): edges=[ ]) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): if graph.graph['layout'] != "NCHW": return @@ -58,7 +62,8 @@ class ShuffleChannel(MiddleReplacementPattern): cols = in_node.shape[1] // group if rows * cols != in_node.shape[1]: - raise Error("Group {} should divide input channels number {} without reminder for node {}".format(group, in_node.shape[1], node.id)) + raise Error("Group {} should divide input channels number {} without reminder for node {}" + "".format(group, in_node.shape[1], node.id)) reshape_split = Reshape(graph, attrs={'name': node.id + '/Reshape_split_', 'dim': np.array([in_node.shape[0], rows, cols, -1])}) diff --git a/model-optimizer/extensions/middle/ShuffleChannel_test.py b/model-optimizer/extensions/middle/ShuffleChannel_test.py index 4b1e7e4..2cd6dd1 100644 --- a/model-optimizer/extensions/middle/ShuffleChannel_test.py +++ b/model-optimizer/extensions/middle/ShuffleChannel_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/middle/ShufflenetReshape.py b/model-optimizer/extensions/middle/ShufflenetReshape.py index f85d60d..b25eb09 100644 --- a/model-optimizer/extensions/middle/ShufflenetReshape.py +++ b/model-optimizer/extensions/middle/ShufflenetReshape.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,11 +16,11 @@ import logging as log -import networkx as nx import numpy as np from mo.front.common.layout import get_features_dim, get_height_dim, get_width_dim from mo.front.common.partial_infer.utils import int64_array +from mo.graph.graph import Graph from mo.middle.replacement import MiddleReplacementPattern from mo.ops.reshape import Reshape @@ -33,6 +33,10 @@ class FeatureShuffleReshape(MiddleReplacementPattern): enabled = True + def run_after(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + def pattern(self): return dict( nodes=[ @@ -51,7 +55,7 @@ class FeatureShuffleReshape(MiddleReplacementPattern): ] ) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): reshape1 = match['reshape1'] reshape2 = match['reshape2'] transpose = match['transpose'] @@ -117,6 +121,8 @@ class FeatureShuffleReshape(MiddleReplacementPattern): new_transpose_shape = np.array(new_reshape1_shape[new_transpose_order]) reshape1.out_node().shape = new_reshape1_shape + reshape1.dim = np.copy(new_reshape1_shape) + transpose.order = new_transpose_order transpose.out_node().shape = new_transpose_shape @@ -137,6 +143,10 @@ class ReshapeSoftmaxReshape(MiddleReplacementPattern): enabled = True + def run_before(self): + from extensions.middle.pass_separator import MiddleFinish + return [MiddleFinish] + def pattern(self): return dict( nodes=[ @@ -150,7 +160,7 @@ class ReshapeSoftmaxReshape(MiddleReplacementPattern): ('softmax', 'softmax_data'), ]) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): layout = graph.graph['layout'] if layout != 'NHWC': return diff --git a/model-optimizer/extensions/middle/ShufflenetReshape_test.py b/model-optimizer/extensions/middle/ShufflenetReshape_test.py index d75c83d..1bd8b2a 100644 --- a/model-optimizer/extensions/middle/ShufflenetReshape_test.py +++ b/model-optimizer/extensions/middle/ShufflenetReshape_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/middle/SliceConvert_test.py b/model-optimizer/extensions/middle/SliceConvert_test.py index f282d5e..745ca42 100644 --- a/model-optimizer/extensions/middle/SliceConvert_test.py +++ b/model-optimizer/extensions/middle/SliceConvert_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
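Several passes touched by this patch, like ShufflenetReshape above, now pin themselves between the new pass_separator anchors instead of returning empty dependency lists. A minimal sketch of the convention (the pass itself is hypothetical):

    from extensions.middle.pass_separator import MiddleStart, MiddleFinish
    from mo.middle.replacement import MiddleReplacementPattern


    class MyMiddlePass(MiddleReplacementPattern):
        enabled = True

        def run_after(self):
            # run anywhere after the middle phase has started...
            return [MiddleStart]

        def run_before(self):
            # ...but before it finishes; the scheduler topologically sorts
            # passes along these run_after/run_before edges.
            return [MiddleFinish]

        def find_and_replace_pattern(self, graph):
            pass  # the actual transformation goes here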
@@ -25,18 +25,23 @@ from mo.ops.slice import Slice nodes_attributes = { # input data 'placeholder_1': {'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, + 'placeholder_2': {'type': 'Const', 'kind': 'op', 'op': 'Const'}, + 'placeholder_3': {'type': 'Const', 'kind': 'op', 'op': 'Const'}, 'placeholder_1_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + 'placeholder_2_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + 'placeholder_3_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, # Slice layer 'slice': {'type': 'Slice', 'kind': 'op', 'op': 'Slice'}, 'slice_data': {'value': None, 'shape': None, 'kind': 'data'}, # Output operation 'output_op': {'type': 'Const', 'value': None, 'kind': 'op', 'op': 'Const'}, 'output_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'}, # Crop layer 'crop': {'type': 'Crop', 'kind': 'op', 'op': 'Crop', 'axis': None, 'offset': None, 'dim': None}, 'dim': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, # StridedSlice layer - 'strided_slice': {'type': 'StridedSlice', 'kind': 'op', 'op': 'StridedSlice', 'slices': None, + 'strided_slice': {'kind': 'op', 'op': 'StridedSlice', 'slices': None, 'shrink_axis_mask': None} } @@ -53,11 +58,11 @@ class ConvertSliceTests(unittest.TestCase): ('placeholder_1_data', 'slice'), ('slice', 'slice_data'), ('slice_data', 'output_op'), - ('output_op', 'output_data') + ('output_op', 'output_data'), + ('output_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([4, 5, 6])}, 'slice': {'start': np.array([1, 2, 3]), 'end': np.array([3, 4, 4]), 'axis': None}, - 'output_op': {'is_output': True}, } ) slice_node = Node(graph, 'slice') @@ -71,12 +76,11 @@ class ConvertSliceTests(unittest.TestCase): ('placeholder_1_data', 'crop'), ('crop', 'slice_data'), ('slice_data', 'output_op'), - ('output_op', 'output_data') + ('output_op', 'output_data'), + ('output_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([4, 5, 6])}, - 'crop': {'axis': np.array([0, 1, 2]), 'offset': np.array([1, 2, 3]), - }, - 'output_op': {'is_output': True}, + 'crop': {'axis': np.array([0, 1, 2]), 'offset': np.array([1, 2, 3])}, 'dim': {'dim': np.array([2, 2, 1])}, } ) @@ -93,11 +97,11 @@ class ConvertSliceTests(unittest.TestCase): ('placeholder_1_data', 'slice'), ('slice', 'slice_data'), ('slice_data', 'output_op'), - ('output_op', 'output_data') + ('output_op', 'output_data'), + ('output_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([4, 5, 6])}, - 'slice': {'start': np.array([1]), 'end': np.array([3]), 'axis': None}, - 'output_op': {'is_output': True} + 'slice': {'start': np.array([1]), 'end': np.array([3]), 'axis': None} } ) slice_node = Node(graph, 'slice') @@ -108,15 +112,19 @@ class ConvertSliceTests(unittest.TestCase): graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), + ('placeholder_2', 'placeholder_2_data'), + ('placeholder_3', 'placeholder_3_data'), ('placeholder_1_data', 'strided_slice'), + ('placeholder_2_data', 'strided_slice'), + ('placeholder_3_data', 'strided_slice'), ('strided_slice', 'slice_data'), ('slice_data', 'output_op'), - ('output_op', 'output_data') + ('output_op', 'output_data'), + ('output_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([4, 5, 6])}, 'strided_slice': {'slices': np.array([slice(1, 3, 1),slice(0, 5, 1),slice(0, 6, 1)]), 'shrink_axis_mask': np.array([False, False, False])}, - 
'output_op': {'is_output': True}
                                  }
                                 )
diff --git a/model-optimizer/extensions/middle/SliceConverter.py b/model-optimizer/extensions/middle/SliceConverter.py
index f6e925b..e4c0266 100644
--- a/model-optimizer/extensions/middle/SliceConverter.py
+++ b/model-optimizer/extensions/middle/SliceConverter.py
@@ -1,5 +1,5 @@
 """
-    Copyright (c) 2018 Intel Corporation
+    Copyright (c) 2018-2019 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -14,12 +14,13 @@
     limitations under the License.
 """
 
-import networkx as nx
 import numpy as np
 
+from mo.graph.graph import Graph
 from mo.middle.replacement import MiddleReplacementPattern
+from mo.ops.const import Const
 from mo.ops.crop import Crop
-from mo.ops.op import Op
+from mo.ops.strided_slice import StridedSlice
 
 
 def convert_negative_indices(indices: np.array, shape: np.array):
@@ -36,6 +37,10 @@ class ConvertSlice(MiddleReplacementPattern):
     enabled = True
     op = "Slice"
 
+    def run_after(self):
+        from extensions.middle.pass_separator import MiddleStart
+        return [MiddleStart]
+
     def pattern(self):
         return dict(
             nodes=[
@@ -44,7 +49,7 @@ class ConvertSlice(MiddleReplacementPattern):
             edges=[]
         )
 
-    def replace_pattern(self, graph: nx.MultiDiGraph, match: dict):
+    def replace_pattern(self, graph: Graph, match: dict):
         node = match['slice']
         # Caffe case
         if not node.has_valid('start') or not node.has_valid('end'):
@@ -52,31 +57,50 @@ class ConvertSlice(MiddleReplacementPattern):
 
         begin = node.start
         end = node.end
+        axis = node.axis if node.has_valid('axis') else range(begin.size)
+
         input = node.in_node(0)
         output_data = node.out_node()
 
         # Check whether operation use only one axis or not
+        axes_begin = np.zeros(len(input.shape), dtype=np.int32)
+        axes_end = np.zeros(len(input.shape), dtype=np.int32)
+        begin_ext = np.zeros(len(input.shape), dtype=np.int32)
+        end_ext = np.zeros(len(input.shape), dtype=np.int32)
         dims = 0
         axes = np.zeros(begin.size)
-        for i in range(begin.size):
-            if begin[i] != 0 or end[i] != input.shape[i]:
+        for i in range(len(axis)):
+            if begin[i] != 0 or end[i] < input.shape[i]:
                 dims += 1
                 axes[i] = 1
+                if begin[i] != 0:
+                    axes_begin[axis[i]] = 1
+                    begin_ext[axis[i]] = begin[i]
+                if end[i] < input.shape[i]:
+                    axes_end[axis[i]] = 1
+                    end_ext[axis[i]] = end[i]
         axes = np.array(axes, dtype=bool)
 
-        if dims == 0:
-            return
-        elif dims == 1:
-            # If Slice use only one axis, than
+
+        if dims == 1 or dims == 0:
+            # If Slice uses only one axis or no axis, then
             # convert Slice to StridedSlice
+            ss = StridedSlice(graph, dict(new_axis_mask=np.zeros(len(output_data.shape), dtype=np.int32),
+                                          shrink_axis_mask=np.zeros(len(output_data.shape), dtype=np.int32),
+                                          ellipsis_mask=np.zeros(len(output_data.shape), dtype=np.int32),
+                                          begin_mask=axes_begin,
+                                          end_mask=axes_end))
+
+            convert_negative_indices(begin_ext, input.shape)
+            convert_negative_indices(end_ext, input.shape)
 
-            node['op'] = 'StridedSlice'
-            node['type'] = 'StridedSlice'
-            node['new_axis_mask'] = np.zeros(len(output_data.shape), dtype=np.bool)
-            node['shrink_axis_mask'] = np.zeros(len(output_data.shape), dtype=np.bool)
+            begin_node = Const(graph, {'name': 'begin', 'value': begin_ext, 'force_precision': 'I32'}).create_node_with_data()
+            end_node = Const(graph, {'name': 'end', 'value': end_ext, 'force_precision': 'I32'}).create_node_with_data()
 
-            convert_negative_indices(begin, input.shape)
-            convert_negative_indices(end, input.shape)
+            ss.create_node_with_data(inputs=[input, begin_node, end_node], 
data_nodes=[output_data])
+            # Remove unnecessary edges from and to the Slice vertex
+            graph.remove_edge(input.id, node.id)
+            graph.remove_edge(node.id, output_data.id)
         else:
             # If Slice use more than one axis use Crop layer
             crop = Crop(graph, dict(axis=np.arange(begin.size)[axes],
diff --git a/model-optimizer/extensions/middle/SwapAxesMiddleReplacer.py b/model-optimizer/extensions/middle/SwapAxesMiddleReplacer.py
index 276ff7f..409bdee 100644
--- a/model-optimizer/extensions/middle/SwapAxesMiddleReplacer.py
+++ b/model-optimizer/extensions/middle/SwapAxesMiddleReplacer.py
@@ -1,5 +1,5 @@
 """
-    Copyright (c) 2018 Intel Corporation
+    Copyright (c) 2018-2019 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -14,11 +14,10 @@
     limitations under the License.
 """
 
-import networkx as nx
 import numpy as np
 
+from mo.graph.graph import Graph
 from mo.middle.replacement import MiddleReplacementPattern
-from mo.ops.op import Op
 from mo.ops.reshape import Reshape
 
 
@@ -31,7 +30,7 @@ class SwapAxesMiddleReplacer(MiddleReplacementPattern):
             edges=[],
         )
 
-    def replace_pattern(self, graph: nx.MultiDiGraph, match: dict):
+    def replace_pattern(self, graph: Graph, match: dict):
         """
         Replace swapaxes layer:
             swapaxes -> Reshape
@@ -47,5 +46,6 @@ class SwapAxesMiddleReplacer(MiddleReplacementPattern):
         graph.remove_edge(swapaxes_in_node.id, swapaxes.id)
         graph.remove_edge(swapaxes.id, swapaxes_out_node.id)
         Reshape(graph, {'dim': np.array(swapaxes_in_node.shape)}).create_node_with_data(inputs=[swapaxes_in_node],
-                                                                                  data_nodes=[swapaxes_out_node],
-                                                                                  edge_attrs=[input_edge_attrs, output_edge_attrs])
+                                                                                        data_nodes=[swapaxes_out_node],
+                                                                                        edge_attrs=[input_edge_attrs,
+                                                                                                    output_edge_attrs])
diff --git a/model-optimizer/extensions/middle/TF_lstm_cell_to_generic.py b/model-optimizer/extensions/middle/TF_lstm_cell_to_generic.py
index b029b45..20faa4e 100644
--- a/model-optimizer/extensions/middle/TF_lstm_cell_to_generic.py
+++ b/model-optimizer/extensions/middle/TF_lstm_cell_to_generic.py
@@ -1,5 +1,5 @@
 """
-    Copyright (c) 2018 Intel Corporation
+    Copyright (c) 2018-2019 Intel Corporation
 
     Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -14,10 +14,10 @@
     limitations under the License.
""" -import networkx as nx import numpy as np from extensions.middle.FusePermutesSequence import FusePermutesSequence +from mo.graph.graph import Graph from mo.middle.replacement import MiddleReplacementPattern @@ -31,7 +31,8 @@ class TensorFlowLSTMtoGeneric(MiddleReplacementPattern): enabled = True def run_after(self): - return [] + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] def run_before(self): return [ @@ -44,7 +45,7 @@ class TensorFlowLSTMtoGeneric(MiddleReplacementPattern): edges=[] ) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): weights_node = match['lstm'].in_node(3) biases_node = match['lstm'].in_node(4) node = match['lstm'] @@ -61,9 +62,9 @@ class TensorFlowLSTMtoGeneric(MiddleReplacementPattern): hidden_size = node.in_node(1).shape[1] weights = weights_node.value biases = biases_node.value - assert weights.shape[0] == input_size + hidden_size, "weights.shape={} input_size={} hidden_size={}".format( - weights.shape, input_size, hidden_size) - assert weights.shape[1] == biases.shape[0] == 4 * hidden_size,\ + assert weights.shape[0] == input_size + hidden_size, \ + "weights.shape={} input_size={} hidden_size={}".format(weights.shape, input_size, hidden_size) + assert weights.shape[1] == biases.shape[0] == 4 * hidden_size, \ "weights.shape={} biases.shape={} hidden_size={}".format(weights.shape, biases.shape, hidden_size) weights = weights.reshape([ diff --git a/model-optimizer/extensions/middle/TensorIteratorBackEdge.py b/model-optimizer/extensions/middle/TensorIteratorBackEdge.py index 868b38c..2ae1fe9 100644 --- a/model-optimizer/extensions/middle/TensorIteratorBackEdge.py +++ b/model-optimizer/extensions/middle/TensorIteratorBackEdge.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,9 +16,8 @@ import logging as log -import networkx as nx - from extensions.ops.TensorIterator_ops import TensorIteratorBackEdge, TensorIteratorOutput +from mo.graph.graph import Graph from mo.middle.replacement import MiddleReplacementPattern @@ -44,6 +43,15 @@ class BackEdgesMatching(MiddleReplacementPattern): TensorIteratorCondition-- """ enabled = True + graph_condition = [lambda graph: graph.graph['is_cyclic']] + + def run_after(self): + from extensions.middle.TensorIteratorCondition import SimpleConditionMatcher + return [SimpleConditionMatcher] + + def run_before(self): + from extensions.middle.TensorIteratorMerge import TensorIteratorMerge + return [TensorIteratorMerge] @staticmethod def pattern(): @@ -83,7 +91,7 @@ class BackEdgesMatching(MiddleReplacementPattern): ] ) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): log.debug('================== BackEdgeFind ===============') nodes_for_remove = [] diff --git a/model-optimizer/extensions/middle/TensorIteratorBackEdge_test.py b/model-optimizer/extensions/middle/TensorIteratorBackEdge_test.py index c4482c4..d9cc63f 100644 --- a/model-optimizer/extensions/middle/TensorIteratorBackEdge_test.py +++ b/model-optimizer/extensions/middle/TensorIteratorBackEdge_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/middle/TensorIteratorCondition.py b/model-optimizer/extensions/middle/TensorIteratorCondition.py index 70b169f..435a686 100644 --- a/model-optimizer/extensions/middle/TensorIteratorCondition.py +++ b/model-optimizer/extensions/middle/TensorIteratorCondition.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
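# Side note: the additions above follow the scheduling idiom used by all of
# these TensorIterator passes. A minimal sketch of the same idiom — the class
# and pattern are hypothetical, the imports assume the Model Optimizer tree
# this patch edits:
from mo.graph.graph import Graph
from mo.middle.replacement import MiddleReplacementPattern


class ExampleCyclicPass(MiddleReplacementPattern):
    enabled = True
    # run only on graphs previously marked as containing cycles
    graph_condition = [lambda graph: graph.graph['is_cyclic']]

    def run_after(self):
        from extensions.middle.TensorIteratorCondition import SimpleConditionMatcher
        return [SimpleConditionMatcher]   # schedule after condition matching

    def run_before(self):
        from extensions.middle.TensorIteratorMerge import TensorIteratorMerge
        return [TensorIteratorMerge]      # but before TI sub-graphs are merged

    @staticmethod
    def pattern():
        return dict(nodes=[('enter', dict(kind='op', op='Enter'))], edges=[])

    @staticmethod
    def replace_pattern(graph: Graph, match: dict):
        pass  # the matched sub-graph would be rewritten here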
@@ -16,10 +16,10 @@ import logging as log -import networkx as nx - from extensions.ops.TensorIterator_ops import TensorIteratorCondition +from mo.graph.graph import Graph from mo.middle.replacement import MiddleReplacementPattern +import numpy as np class LoopConditionMatcher(MiddleReplacementPattern): @@ -46,6 +46,14 @@ Shape -> StridedSlice -> Enter -| LogicalAnd --> LoopCond (data) Const---- """ enabled = True + graph_condition = [lambda graph: graph.graph['is_cyclic']] + + def run_after(self): + return [] + + def run_before(self): + from extensions.middle.TensorIteratorMerge import TensorIteratorMerge + return [TensorIteratorMerge] @staticmethod def pattern(): @@ -69,7 +77,6 @@ Shape -> StridedSlice -> Enter -| LogicalAnd --> LoopCond (data) ('Enter_2_less', dict(kind='op', op='Enter')), ('Enter_2_less_data', dict(kind='data')), - ('minimum', dict(kind='op', op='Minimum')), ('minimum_data', dict(kind='data')), ('and', dict(kind='op', op='LogicalAnd')), @@ -78,9 +85,9 @@ Shape -> StridedSlice -> Enter -| LogicalAnd --> LoopCond (data) ('loop_cond_data', dict(kind='data')), ('init_1', dict(kind='op', op='Const')), - ('init_1_data', dict(kind='data')), + ('init_1_data', dict(kind='data')), ('Enter_1', dict(kind='op', op='Enter')), - ('Enter_1_data', dict(kind='data')), + ('Enter_1_data', dict(kind='data')), ('init_2', dict(kind='op', op='Const')), ('init_2_data', dict(kind='data')), @@ -92,7 +99,7 @@ Shape -> StridedSlice -> Enter -| LogicalAnd --> LoopCond (data) ('Identity_1', dict(kind='op', op='Identity')), ('Identity_1_data', dict(kind='data')), ('add_1', dict(kind='op', op='Add')), - ('add_1_y', dict(kind='op', op='Const')), + ('add_1_y', dict(kind='op', op='Const')), ('add_1_y_data', dict(kind='data')), ('add_1_data', dict(kind='data')), ('NextIteration_1', dict(kind='op', op='NextIteration')), @@ -111,7 +118,6 @@ Shape -> StridedSlice -> Enter -| LogicalAnd --> LoopCond (data) edges=[ ('Strided_slice', 'Strided_slice_data'), ('Strided_slice_data', 'Enter_1_less'), - ('Strided_slice_data', 'minimum'), ('Enter_1_less', 'Enter_1_less_data'), ('Enter_1_less_data', 'Less_1'), ('Less_1', 'Less_1_data'), @@ -150,7 +156,6 @@ Shape -> StridedSlice -> Enter -| LogicalAnd --> LoopCond (data) ('add_2', 'add_2_data'), ('add_2_data', 'NextIteration_2'), - ('minimum', 'minimum_data'), ('minimum_data', 'Enter_2_less'), ('Enter_2_less', 'Enter_2_less_data'), ('Enter_2_less_data', 'Less_2'), @@ -168,26 +173,35 @@ Shape -> StridedSlice -> Enter -| LogicalAnd --> LoopCond (data) ) @staticmethod - def replace_pattern(graph: nx.MultiDiGraph, match: dict): + def looking_for_iteration_counter(graph: Graph, match: dict): + types = ['TensorIteratorInput', 'TensorIteratorOutput'] + candidates = np.array([match['Identity_1_data'], match['Identity_2_data']]) + results = np.array([False for i in range(len(candidates))]) + for i, candidate in enumerate(candidates): + for node in candidate.out_nodes(): + if node['op'] in types: + results[i] = True + assert not np.all(results) + assert sum(results) == 1 + return candidates[results == True][0] + + def replace_pattern(self, graph: Graph, match: dict): log.debug('================== ConditionFind ===============') - max_node = match['minimum'].in_node(1).in_node() - assert max_node['kind'] == 'op' and max_node['op'] == 'Maximum' - - #init_1 + # init_1 init_1 = match['init_1_data'].value assert init_1 is not None init_1 = int(init_1) - #init_2 + # init_2 init_2 = match['init_2_data'].value assert init_2 is not None init_2 = int(init_2) - #step_1 + # step_1 assert match['add_1_y_data'].value is not None step_1 = int(match['add_1_y_data'].value) - #step_2 + # step_2 assert match['add_2_y_data'].value is not None step_2 = int(match['add_2_y_data'].value) @@ -195,14 +209,17 @@ Shape -> StridedSlice -> Enter -| LogicalAnd --> LoopCond (data) match['Identity_2_data'].value = None # Create condition node and delete all useless nodes from condition pattern - condition_attrs = dict(time=dict(init=init_2, step=step_2), iter=dict(init=init_1, step=step_1), \ + loop_condition = match['loop_cond_data'] + iterator_data = self.looking_for_iteration_counter(graph, match) + + condition_attrs = dict(time=dict(init=init_2, step=step_2), iter=dict(init=init_1, step=step_1), name=match['loop_cond'].name + '/TensorIteratorCondition_') condition = TensorIteratorCondition(graph, attrs=condition_attrs) condition.create_node_with_data(inputs=[match['Strided_slice_data'], match['minimum_data']], - data_nodes=[match['loop_cond_data'], match['Identity_2_data']]) + data_nodes=[loop_condition, iterator_data]) # Delete useless nodes - safe_nodes = ['loop_cond_data', 'Identity_2_data', 'Strided_slice', 'Strided_slice_data', + safe_nodes = ['loop_cond_data', 'Identity_1_data', 'Identity_2_data', 'Strided_slice', 'Strided_slice_data', 'minimum', 'minimum_data'] nodes_for_remove = [] for node in match.keys(): @@ -211,7 +228,17 @@ Shape -> StridedSlice -> Enter -| LogicalAnd --> LoopCond (data) graph.remove_nodes_from(nodes_for_remove) -class SimpleConditionMather(MiddleReplacementPattern): +class SimpleConditionMatcher(MiddleReplacementPattern): + enabled = True + graph_condition = [lambda graph: graph.graph['is_cyclic']] + + def run_after(self): + return [LoopConditionMatcher] + + def run_before(self): + from extensions.middle.TensorIteratorMerge import TensorIteratorMerge + return [TensorIteratorMerge] + @staticmethod def pattern(): log.debug('+++++++++++++++ SimpleConditionMatching ++++++++++++++++') @@ -231,17 +258,16 @@ class SimpleConditionMather(MiddleReplacementPattern): ('loop_cond_data', dict(kind='data')), ('init_1', dict(kind='op', op='Const')), - ('init_1_data', dict(kind='data')), + ('init_1_data', dict(kind='data')), ('Enter_1', dict(kind='op', op='Enter')), - ('Enter_1_data', dict(kind='data')), - + ('Enter_1_data', dict(kind='data')), ('Switch_1', dict(kind='op', op='Switch')), ('Switch_1_data', dict(kind='data')), ('Identity_1', dict(kind='op', op='Identity')), ('Identity_1_data', dict(kind='data')), ('add_1', dict(kind='op', op='Add')), - ('add_1_y', dict(kind='op', op='Const')), + ('add_1_y', dict(kind='op', op='Const')), ('add_1_y_data', dict(kind='data')), ('add_1_data', dict(kind='data')), ('NextIteration_1', dict(kind='op', op='NextIteration')), @@ -278,7 +304,7 @@ class SimpleConditionMather(MiddleReplacementPattern): ) @staticmethod - def replace_pattern(graph: nx.MultiDiGraph, match: dict): + def replace_pattern(graph: Graph, match: dict): log.debug('================== SimpleConditionFind ===============') # init_1 init_1 = match['init_1_data'].value @@ -292,7 +318,7 @@ class SimpleConditionMather(MiddleReplacementPattern): match['loop_cond_data'].value = None # Create condition node and delete all useless nodes from condition pattern - condition_attrs = dict(iter=dict(init=init_1, step=step_1), \ + condition_attrs = dict(iter=dict(init=init_1, step=step_1), name=match['loop_cond'].name + '/TensorIteratorCondition_') condition = TensorIteratorCondition(graph, attrs=condition_attrs) condition.create_node_with_data(inputs=[match['Strided_slice_data']], @@ 
-304,4 +330,4 @@ class SimpleConditionMather(MiddleReplacementPattern): for node in match.keys(): if node not in safe_nodes: nodes_for_remove.append(match[node].id) - graph.remove_nodes_from(nodes_for_remove) \ No newline at end of file + graph.remove_nodes_from(nodes_for_remove) diff --git a/model-optimizer/extensions/middle/TensorIteratorConditionChecker.py b/model-optimizer/extensions/middle/TensorIteratorConditionChecker.py index 5dfea5b..80351f9 100644 --- a/model-optimizer/extensions/middle/TensorIteratorConditionChecker.py +++ b/model-optimizer/extensions/middle/TensorIteratorConditionChecker.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,14 +13,24 @@ See the License for the specific language governing permissions and limitations under the License. """ +import logging as log import numpy as np -import logging as log + from mo.middle.replacement import MiddleReplacementPattern class ConditionChecks(MiddleReplacementPattern): enabled = True + graph_condition = [lambda graph: graph.graph['is_cyclic']] + + def run_after(self): + from extensions.middle.TensorIteratorBackEdge import BackEdgesMatching + return [BackEdgesMatching] + + def run_before(self): + from extensions.middle.TensorIteratorMerge import TensorIteratorMerge + return [TensorIteratorMerge] @staticmethod def pattern(): @@ -54,7 +64,7 @@ class ConditionChecks(MiddleReplacementPattern): @staticmethod def replace_pattern(graph, match: dict): - #Check for SS params + # Check for SS params # Sanity check that we iterate over axis of some tensor ss = match['Strided_slice'] params = ss.in_nodes() @@ -62,7 +72,7 @@ class ConditionChecks(MiddleReplacementPattern): assert np.all(params[2].in_node().value == 1) assert np.all(params[3].in_node().value == 1) - #Check Maximum/Minimum params + # Check Maximum/Minimum params # Check for comparing SS and seq_length source (it should be one tensor) # SIMPLE CHECK @@ -71,10 +81,9 @@ class ConditionChecks(MiddleReplacementPattern): log.warning('TF loop doesn\'t have a constant upper bound produced by node {}, or ModelOptimizer ' 'cannot detect a constant in this case. 
Loops with a dynamic number of iterations are not ' 'supported, so in the resulting IR, generated TensorIterator will have ' - 'a maximum number of iterations determined by input tensor size: {}', - match['minimum_data'].soft_get('name'), - match['Strided_slice_data'].value - ) + 'a maximum number of iterations determined by input tensor size: {}' + ''.format(match['minimum_data'].soft_get('name'), match['Strided_slice_data'].value) + ) else: assert match['Strided_slice_data'].value == match['minimum_data'].value, \ 'Values do not match: {} and {}'.format(match['Strided_slice_data'].value, match['minimum_data'].value) @@ -82,7 +91,7 @@ class ConditionChecks(MiddleReplacementPattern): # SMART CHECK # TODO: add here some smart check for tensors equality - #Check that bound for Condition and Inputs/Outputs sizes match + # Check that bound for Condition and Inputs/Outputs sizes match condition_time = match['condition'].out_node(0) inputs_and_outputs = condition_time.out_nodes() type_list = ['TensorIteratorInput', 'TensorIteratorOutput'] diff --git a/model-optimizer/extensions/middle/TensorIteratorCondition_test.py b/model-optimizer/extensions/middle/TensorIteratorCondition_test.py index 8ebd9dd..2085b67 100644 --- a/model-optimizer/extensions/middle/TensorIteratorCondition_test.py +++ b/model-optimizer/extensions/middle/TensorIteratorCondition_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -27,18 +27,18 @@ class TensorIteratorConditionTests(unittest.TestCase): pattern = pattern_matcher.pattern() graph = build_graph_with_attrs(nodes_with_attrs=pattern['nodes'], edges_with_attrs=pattern['edges'], - new_nodes_with_attrs=[('maximum', {'kind':'op', 'op': 'Maximum'}), - ('maximum_data', {'kind': 'data'})], + new_nodes_with_attrs=[('maximum', {'kind': 'op', 'op': 'Maximum'}), + ('maximum_data', {'kind': 'data'}), + ('TensorIteratorInput', {'kind': 'op', 'op': 'TensorIteratorInput'})], new_edges_with_attrs=[('maximum', 'maximum_data'), - ('maximum_data', 'minimum', {'in':1})], + ('Identity_1_data', 'TensorIteratorInput')], update_nodes_attributes=[('init_1_data', {'value': np.array([0])}), ('init_2_data', {'value': np.array([0])}), ('add_1_y_data', {'value': np.array(1)}), ('add_2_y_data', {'value': np.array(1)}), ('loop_cond_data', {'value': None}), ('Identity_2_data', {'value': None}), - ], - update_edge_attrs={('Strided_slice_data', 'minimum',0): {'in': 0}}) + ]) pattern_matcher.find_and_replace_pattern(graph) graph_ref = build_graph_with_attrs( @@ -49,18 +49,16 @@ class TensorIteratorConditionTests(unittest.TestCase): ('StridedSlice_data', {'kind': 'data'}), ('Maximum', {'kind': 'op', 'op': 'Maximum'}), ('Maximum_data', {'kind': 'data'}), - ('minimum', {'kind': 'op', 'op': 'Minimum'}), ('minimum_data', {'kind': 'data'}), + ('TensorIteratorInput', {'kind': 'op', 'op': 'TensorIteratorInput'}) ], edges_with_attrs=[('Maximum', 'Maximum_data'), - ('Maximum_data', 'minimum'), ('StridedSlice', 'StridedSlice_data'), ('StridedSlice_data', 'TensorIteratorCondition', {'in':0}), - ('StridedSlice_data', 'minimum'), - ('minimum', 'minimum_data'), ('minimum_data', 'TensorIteratorCondition', {'in':1}), ('TensorIteratorCondition', 'loop_cond_data'), ('TensorIteratorCondition', 'identity_data'), + ('identity_data', 'TensorIteratorInput'), ], update_edge_attrs=None, new_nodes_with_attrs=[], diff --git 
a/model-optimizer/extensions/middle/TensorIteratorInput.py b/model-optimizer/extensions/middle/TensorIteratorInput.py index 65cdb40..93d63fa 100644 --- a/model-optimizer/extensions/middle/TensorIteratorInput.py +++ b/model-optimizer/extensions/middle/TensorIteratorInput.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,10 +15,11 @@ """ import logging as log -import networkx as nx + import numpy as np from extensions.ops.TensorIterator_ops import TensorIteratorInput +from mo.graph.graph import Graph from mo.middle.replacement import MiddleReplacementPattern @@ -38,7 +39,16 @@ class SmartInputMatcher(MiddleReplacementPattern): |__________________________________________________| """ - enabled = False # called from mo.pipeline.tf directly + enabled = True + graph_condition = [lambda graph: graph.graph['is_cyclic']] + + def run_after(self): + from extensions.middle.TensorIterator_utils import DeleteSelect + return [DeleteSelect] + + def run_before(self): + from extensions.middle.TensorIteratorMerge import TensorIteratorMerge + return [TensorIteratorMerge] @staticmethod def pattern(): @@ -115,7 +125,7 @@ class SmartInputMatcher(MiddleReplacementPattern): ) @staticmethod - def replace_pattern(graph: nx.MultiDiGraph, match: dict): + def replace_pattern(graph: Graph, match: dict): log.debug('================== SmartInputFind ===============') assert match['Enter_data'].value is not None @@ -141,12 +151,12 @@ class SmartInputMatcher(MiddleReplacementPattern): # axis == 0 because in TensorArray we ALWAYS iterate over 0 axis, other params will be filled later (with # condition) input_node = TensorIteratorInput(graph, dict(axis=0, start=start, stride=None, part_size=None, - external_port_id=str(match['Enter_data'].value), - internal_layer_id=match['TensorArrayRead_data'].id, - name=match['TensorArrayRead'].name + '/TensorIteratorInput_' - )) + external_port_id=str(match['Enter_data'].value), + internal_layer_id=match['TensorArrayRead_data'].id, + name=match['TensorArrayRead'].name + '/TensorIteratorInput_' + )) input_node.create_node_with_data(inputs=[ta_size_data, value, match['Condition_data']], - data_nodes=[match['TensorArrayRead_data']]) + data_nodes=[match['TensorArrayRead_data']]) # Delete useless nodes safe_nodes = ['TensorArrayRead_data', 'Condition', 'Condition_data'] @@ -158,12 +168,21 @@ class SmartInputMatcher(MiddleReplacementPattern): class SimpleInputMatcher(MiddleReplacementPattern): + enabled = True + graph_condition = [lambda graph: graph.graph['is_cyclic']] - enabled = False # called from mo.pipeline.tf directly + def run_after(self): + from extensions.middle.DeleteNotExecutable import DeleteNotExecutable + return [DeleteNotExecutable] + + def run_before(self): + from extensions.middle.TensorIteratorMerge import TensorIteratorMerge + return [TensorIteratorMerge] """ This pattern matches simple inputs (without partitions) in TF while loops (these inputs are set by Enter nodes). 
""" + @staticmethod def pattern(): return dict( @@ -175,13 +194,13 @@ class SimpleInputMatcher(MiddleReplacementPattern): ) @staticmethod - def replace_pattern(graph: nx.MultiDiGraph, match: dict): + def replace_pattern(graph: Graph, match: dict): log.debug('================== SimpletInputFind ===============') input_node = TensorIteratorInput(graph, dict(external_port_id=None, - internal_layer_id=None, - name=match['Enter'].name + '/TensorIteratorInput_' - )) + internal_layer_id=None, + name=match['Enter'].name + '/TensorIteratorInput_' + )) input_node.create_node_with_data(inputs=[match['Enter'].in_node()], data_nodes=[match['Enter'].out_node()]) # Delete useless nodes @@ -189,8 +208,15 @@ class SimpleInputMatcher(MiddleReplacementPattern): class BackEdgeSimpleInputMatcher(MiddleReplacementPattern): + enabled = True + graph_condition = [lambda graph: graph.graph['is_cyclic']] - enabled = False # called from mo.pipeline.tf directly + def run_after(self): + return [SimpleInputMatcher] + + def run_before(self): + from extensions.middle.TensorIteratorMerge import TensorIteratorMerge + return [TensorIteratorMerge] @staticmethod def pattern(): @@ -203,7 +229,7 @@ class BackEdgeSimpleInputMatcher(MiddleReplacementPattern): ) @staticmethod - def replace_pattern(graph: nx.MultiDiGraph, match: dict): + def replace_pattern(graph: Graph, match: dict): log.debug('================== SimpleBackEdgeInputFind ===============') assert len(match['BackEdge'].in_nodes()) == 3 @@ -212,11 +238,18 @@ class BackEdgeSimpleInputMatcher(MiddleReplacementPattern): cycle_input = match['BackEdge'].in_node(1) # We need to create new TensorItertorInput node only if this node doesn't exist already. - if len(init_input.in_nodes()) == 0: + if len(init_input.in_nodes()) == 0 or\ + (len(init_input.in_nodes()) == 1 and init_input.has_valid('value')): + input_node = TensorIteratorInput(graph, dict(external_port_id=None, - internal_layer_id=None, - name=match['BackEdge'].name + '/TensorIteratorInput_' - )) + internal_layer_id=None, + name=match['BackEdge'].name + '/TensorIteratorInput_' + )) + + # In case if data node has Constant producer + if len(init_input.in_nodes()) == 1: + graph.remove_edge(init_input.in_node(0).id, init_input.id) + input_data_node = input_node.create_node_with_data(inputs=[init_input]) input_data_node.shape = np.array(init_input.shape, dtype=np.int64) graph.remove_edges_from([(init_input.id, match['BackEdge'].id)]) diff --git a/model-optimizer/extensions/middle/TensorIteratorInput_test.py b/model-optimizer/extensions/middle/TensorIteratorInput_test.py index efd560c..3d5b738 100644 --- a/model-optimizer/extensions/middle/TensorIteratorInput_test.py +++ b/model-optimizer/extensions/middle/TensorIteratorInput_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/middle/lstm_tensor_iterator_to_lstm_sequence.py b/model-optimizer/extensions/middle/TensorIteratorLSTMToLSTMSequence.py similarity index 67% rename from model-optimizer/extensions/middle/lstm_tensor_iterator_to_lstm_sequence.py rename to model-optimizer/extensions/middle/TensorIteratorLSTMToLSTMSequence.py index a7b6b56..95edf9a 100644 --- a/model-optimizer/extensions/middle/lstm_tensor_iterator_to_lstm_sequence.py +++ b/model-optimizer/extensions/middle/TensorIteratorLSTMToLSTMSequence.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,17 +14,13 @@ limitations under the License. """ -import networkx as nx - -from mo.graph.graph import copy_node -from mo.utils.error import Error +from extensions.middle.TF_lstm_cell_to_generic import TensorFlowLSTMtoGeneric +from extensions.middle.TensorIteratorMerge import TensorIteratorMerge +from mo.graph.graph import Graph from mo.middle.pattern_match import find_isomorphisms from mo.middle.replacement import MiddleReplacementPattern -from extensions.ops.lstm_sequence import LSTMSequence -from extensions.middle.TensorIteratorMerge import TensorIteratorMerge -from extensions.middle.lstm_sequence_normalize import LSTMSequenceNormalize, permute_before_and_after -from extensions.middle.lstm_sequence_tensor_iterator import LSTMSequenceTensorIterator -from extensions.middle.TF_lstm_cell_to_generic import TensorFlowLSTMtoGeneric +from mo.utils.error import Error +from extensions.middle.ONNXRNNSequenceNormalize import ONNXRNNSequenceNormalize, permute_before_and_after class TensorIteratorLSTM(MiddleReplacementPattern): @@ -40,7 +36,7 @@ class TensorIteratorLSTM(MiddleReplacementPattern): enabled = False def run_after(self): - return [TensorIteratorMerge, LSTMSequenceNormalize, LSTMSequenceTensorIterator, TensorFlowLSTMtoGeneric] + return [TensorIteratorMerge, ONNXRNNSequenceNormalize, TensorFlowLSTMtoGeneric] def pattern(self): return dict( @@ -52,8 +48,8 @@ class TensorIteratorLSTM(MiddleReplacementPattern): ) @staticmethod - def replace_pattern(graph: nx.MultiDiGraph, match: dict): - nodes=[ + def replace_pattern(graph: Graph, match: dict): + nodes = [ ('input_unsqueezed'), ('squeeze', dict(op='Reshape')), ('input_squeezed'), @@ -69,7 +65,7 @@ class TensorIteratorLSTM(MiddleReplacementPattern): ('unsqueeze', dict(op='Reshape')), ('output_unsqueezed'), ] - edges=[ + edges = [ ('input_unsqueezed', 'squeeze'), ('squeeze', 'input_squeezed'), @@ -101,37 +97,3 @@ class TensorIteratorLSTM(MiddleReplacementPattern): 'Please modify the original network ' 'to meet the requirements.'.format(ti.soft_get('name'))) # TODO Additional checks for port indices - if body_match['lstm'].has_valid('mark_supported_by_IE'): - body_match['lstm'].mark_supported_by_IE(body_match['lstm']) - - -class CheckUnsupportedLSTMCell(MiddleReplacementPattern): - """ Finds all unsupported LSTMCell. - - Initiates the second translation round if find any not supported LSTMCell instances. 
- """ - - enabled = False - - def run_after(self): - return [TensorIteratorLSTM] - - def pattern(self): - return dict( - nodes=[ - ('lstm', dict(op='LSTMCell')), - ], - edges=[ - ] - ) - - @staticmethod - def replace_pattern(graph: nx.MultiDiGraph, match: dict): - lstmcell = match['lstm'] - if lstmcell.has_valid('finalize_first_round'): - lstmcell.finalize_first_round() - if not lstmcell.has_and_set('supported_by_IE'): - # this is a signal for the main translation pipeline to repeat the entire conversion process - graph.graph['repeat_conversion'] = True - # in case when there is no lstmcell.finalize_first_round then this cell wasn't created with the pattern - # (for example in ONNX) and we don't initiate the second round. diff --git a/model-optimizer/extensions/middle/TensorIteratorMerge.py b/model-optimizer/extensions/middle/TensorIteratorMerge.py index 218b129..29e9749 100644 --- a/model-optimizer/extensions/middle/TensorIteratorMerge.py +++ b/model-optimizer/extensions/middle/TensorIteratorMerge.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,22 +14,21 @@ limitations under the License. """ - from collections import deque from copy import deepcopy -import networkx as nx import numpy as np -from mo.graph.graph import Node -from mo.utils.graph import sub_graph_between_nodes -from mo.middle.replacement import MiddleReplacementPattern from extensions.ops.tensor_iterator import TensorIterator +from mo.graph.graph import Node, Graph, add_opoutput +from mo.middle.replacement import MiddleReplacementPattern from mo.ops.op import Op from mo.ops.reshape import Reshape +from mo.utils.graph import sub_graph_between_nodes stop_nodes = ['TensorIteratorInput', 'TensorIteratorOutput', 'TensorIteratorBackEdge', 'TensorIteratorCondition'] + def op_type(graph, node_name: str): node = Node(graph, node_name) if node.has_valid('kind') and node['kind'] == 'op': @@ -45,7 +44,7 @@ def update_inputs(graph, inputs: list, node_name: str): inputs.append(node_name) -def reverse_dfs(graph: nx.MultiDiGraph, node_name: str, stop_nodes: list, inputs: list, visited: set = None): +def reverse_dfs(graph: Graph, node_name: str, stop_nodes: list, inputs: list, visited: set = None): d = deque() if visited is None: @@ -62,7 +61,8 @@ def reverse_dfs(graph: nx.MultiDiGraph, node_name: str, stop_nodes: list, inputs else: update_inputs(graph, inputs, in_node_name) -def dfs(graph: nx.MultiDiGraph, node_name: str, stop_nodes: list, visited: set = None): + +def dfs(graph: Graph, node_name: str, stop_nodes: list, visited: set = None): d = deque() visited.add(node_name) @@ -75,18 +75,28 @@ def dfs(graph: nx.MultiDiGraph, node_name: str, stop_nodes: list, visited: set = visited.add(out_node_name) d.append(out_node_name) + def get_body(graph, inputs, outputs): nodes, extra_inputs = sub_graph_between_nodes( graph, inputs, outputs, - lambda node: node.soft_get('op') == 'TensorIteratorInput' + lambda node: node.soft_get('op') == 'TensorIteratorInput' ) nodes = list(set(nodes) - set(inputs) - set(outputs) - set(extra_inputs)) return nodes, extra_inputs class TensorIteratorMerge(MiddleReplacementPattern): + enabled = True + graph_condition = [lambda graph: graph.graph['is_cyclic']] + + def run_after(self): + return [] + + def run_before(self): + return [] + @staticmethod def pattern(): return dict( @@ -144,7 +154,7 @@ class 
TensorIteratorMerge(MiddleReplacementPattern): inputs = [Node(graph, node) for node in inputs] outputs = [Node(graph, node) for node in outputs] back_edges = [Node(graph, node) for node in back_edges] - + external_inputs = [ { 'external_data_id': node.in_node(1 if node.has_valid('axis') else 0), @@ -156,7 +166,6 @@ class TensorIteratorMerge(MiddleReplacementPattern): 'part_size': node.part_size } for node in inputs] - external_outputs = [ { 'external_data_id': node.out_node(0), @@ -168,7 +177,6 @@ class TensorIteratorMerge(MiddleReplacementPattern): 'part_size': node.part_size } for node in outputs] - back_edges_data = [ { 'from_data_id': node.in_node(1), @@ -177,12 +185,14 @@ class TensorIteratorMerge(MiddleReplacementPattern): } for node in back_edges ] - body = nx.MultiDiGraph(name='body') - body.graph['layout'] = graph.graph['layout'] + body = Graph(name='body') + body.graph = graph.graph body.add_nodes_from([(node, graph.node[node]) for node in body_nodes]) - body.add_edges_from([(u,v,k,d)for u,v,k,d in graph.edges(data=True, keys=True) if u in body_nodes and v in body_nodes]) + body.add_edges_from( + [(u, v, k, d) for u, v, k, d in graph.edges(data=True, keys=True) if u in body_nodes and v in body_nodes]) - graph.remove_nodes_from(body_nodes + [match['condition'].id] + [inp.id for inp in inputs] + [out.id for out in outputs]) + graph.remove_nodes_from( + body_nodes + [match['condition'].id] + [inp.id for inp in inputs] + [out.id for out in outputs]) internal_id_count = 0 real_back_edges = [] for edge in back_edges_data: @@ -192,7 +202,7 @@ class TensorIteratorMerge(MiddleReplacementPattern): edge['from_data_id'] = Node(body, edge['from_data_id'].id) edge['to_data_id'] = Node(body, edge['to_data_id'].id) edge['init_data_id'] = Node(body, edge['init_data_id'].id) - edge['from_data_id']['is_output'] = True + add_opoutput(body, edge['from_data_id'].id, 0, False) # Assign/reuse ids for the back-edge start; it comes from from_data_id assert len(edge['from_data_id'].in_nodes()) == 1 @@ -214,13 +224,14 @@ class TensorIteratorMerge(MiddleReplacementPattern): for _, consumer, key, edge_attrs in body.out_edges(edge['to_data_id'].id, data=True, keys=True): real_edge = {} - real_edge.update(edge) # all real back_edges have the same back-edge start + real_edge.update(edge) # all real back_edges have the same back-edge start consumer = Node(body, consumer) if real_edge['to_data_id'].in_node().has_valid('internal_layer_id'): assert False - real_edge['to_data_id'].out_node()['internal_layer_id'] = real_edge['to_data_id'].in_node().internal_layer_id + real_edge['to_data_id'].out_node()['internal_layer_id'] = \ + real_edge['to_data_id'].in_node().internal_layer_id elif not consumer.has_valid('internal_layer_id'): consumer['internal_layer_id'] = internal_id_count internal_id_count += 1 @@ -245,7 +256,7 @@ class TensorIteratorMerge(MiddleReplacementPattern): real_edge['consumer'].id, real_edge['consumer_key'], real_edge['attrs']) - for real_edge in current_real_back_edges]) + for real_edge in current_real_back_edges]) body.remove_nodes_from([edge['to_data_id'].id, edge['to_data_id'].in_node().id]) real_back_edges += current_real_back_edges @@ -261,7 +272,8 @@ class TensorIteratorMerge(MiddleReplacementPattern): # Insert squeezing resize at input port that has partitioning shape = ext_inp['internal_data_id'].shape.copy() assert not ext_inp['internal_data_id'].has_valid('value') - new_input_data = Op._create_data_node(body, ext_inp['internal_data_id'].name + '/UnsqueezedInput', dict(shape=np.insert(shape, 
ext_inp['axis'], 1))) + new_input_data = Op._create_data_node(body, ext_inp['internal_data_id'].name + '/UnsqueezedInput', + dict(shape=np.insert(shape, ext_inp['axis'], 1))) dim = shape.copy() # try to do it dynamically reshapable along one of the axis # it is practically useful to reshape along batch dimension, but here we cannot detect where it is @@ -300,13 +312,14 @@ class TensorIteratorMerge(MiddleReplacementPattern): # trying to make it dynamically reshapable (see related comment above for the first Reshape) dim[0] = -1 assert not ext_out['internal_data_id'].has_valid('value') - reshape_op = Reshape(body, dict(name=ext_out['internal_data_id'].name + '/OutputUnsqueeze', dim=np.insert(dim, ext_out['axis'], 1))) + reshape_op = Reshape(body, dict(name=ext_out['internal_data_id'].name + '/OutputUnsqueeze', + dim=np.insert(dim, ext_out['axis'], 1))) ext_out['internal_data_id'] = reshape_op.create_node_with_data([ext_out['internal_data_id']]) # TODO: add here working with simple outputs - ext_out['internal_data_id']['is_output'] = True - #assert len(ext_out['internal_data_id'].out_nodes()) == 0 + add_opoutput(body, ext_out['internal_data_id'].id, 0, False) + # assert len(ext_out['internal_data_id'].out_nodes()) == 0 assert len(ext_out['internal_data_id'].in_nodes()) == 1 if not 'internal_layer_id' in ext_out['internal_data_id'].in_node(): ext_out['internal_data_id'].in_node()['internal_layer_id'] = internal_id_count @@ -322,16 +335,22 @@ class TensorIteratorMerge(MiddleReplacementPattern): ti_op = TensorIterator(graph, { 'name': name + '/TensorIterator', 'body': body, + 'in_ports_count': len(external_inputs), + 'out_ports_count': len(external_outputs), 'input_port_map': [ - {field: external_input[field] for field in [ 'external_port_id', 'internal_layer_id', 'internal_port_id', 'axis', 'stride', 'part_size', 'start', 'end']} + {field: external_input[field] for field in + ['external_port_id', 'internal_layer_id', 'internal_port_id', 'axis', 'stride', 'part_size', 'start', + 'end']} for external_input in real_external_inputs], 'output_port_map': [ - {field: external_output[field] for field in [ 'external_port_id', 'internal_layer_id', 'internal_port_id', 'axis', 'stride', 'part_size', 'start', 'end']} + {field: external_output[field] for field in + ['external_port_id', 'internal_layer_id', 'internal_port_id', 'axis', 'stride', 'part_size', 'start', + 'end']} for external_output in external_outputs], 'back_edges': [ - {field: edge[field] for field in [ 'from_layer', 'from_port', 'to_layer', 'to_port']} + {field: edge[field] for field in ['from_layer', 'from_port', 'to_layer', 'to_port']} for edge in real_back_edges], }) @@ -346,7 +365,3 @@ class TensorIteratorMerge(MiddleReplacementPattern): for i, out in enumerate(ti_outs): out.in_edge()['external_port_id'] = external_outputs[i]['external_port_id'] - - - - # Create TI operation diff --git a/model-optimizer/extensions/middle/TensorIteratorOutput.py b/model-optimizer/extensions/middle/TensorIteratorOutput.py index 695e776..07b64db 100644 --- a/model-optimizer/extensions/middle/TensorIteratorOutput.py +++ b/model-optimizer/extensions/middle/TensorIteratorOutput.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
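# Side note: a pure-numpy illustration of the shape bookkeeping above — the
# iteration axis of size 1 is re-inserted with np.insert(), and one dimension
# is set to -1 so the Reshape stays valid if that dimension changes. The
# example shape and axis are made up.
import numpy as np

shape = np.array([8, 10, 64])              # squeezed per-iteration shape
axis = 1                                   # axis the TensorIterator slices over
unsqueezed = np.insert(shape, axis, 1)     # -> array([ 8,  1, 10, 64])

dim = shape.copy()
dim[0] = -1                                # dynamically reshapable along axis 0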
@@ -16,9 +16,8 @@ import logging as log -import networkx as nx - from extensions.ops.TensorIterator_ops import TensorIteratorOutput +from mo.graph.graph import Graph from mo.middle.replacement import MiddleReplacementPattern @@ -40,6 +39,15 @@ class SmartOutputMatcher(MiddleReplacementPattern): --------> Identity -> TensorArrayWrite -> NextIteration """ enabled = True + graph_condition = [lambda graph: graph.graph['is_cyclic']] + + def run_after(self): + from extensions.middle.TensorIteratorInput import SmartInputMatcher + return [SmartInputMatcher] + + def run_before(self): + from extensions.middle.TensorIteratorMerge import TensorIteratorMerge + return [TensorIteratorMerge] @staticmethod def pattern(): @@ -121,7 +129,7 @@ class SmartOutputMatcher(MiddleReplacementPattern): ) @staticmethod - def replace_pattern(graph: nx.MultiDiGraph, match: dict): + def replace_pattern(graph: Graph, match: dict): log.debug('================== SmartOutputFind ===============') assert match['WriteEnter_data'].value is not None @@ -149,3 +157,132 @@ class SmartOutputMatcher(MiddleReplacementPattern): if node not in safe_nodes: nodes_for_remove.append(match[node].id) graph.remove_nodes_from(nodes_for_remove) + + +class SimpleOutputMatcher(MiddleReplacementPattern): + """ + This pattern matches partitioned outputs for TensorIterator in dynamic_rnn loops in TF. + The structure of the pattern is shown without Data nodes between ops. Every node is named by the op attribute of this node + (data nodes are marked by (data)): + TensorArray + | | + Flow(data) Handle(data)------------------------------ + | | | + v v v + Enter -> Merge -> Switch -> Exit -> TensorArrayRead + | + | + | + | + --------> Identity -> TensorArrayWrite -> NextIteration + """ + enabled = True + graph_condition = [lambda graph: graph.graph['is_cyclic']] + + def run_after(self): + return [SmartOutputMatcher] + + def run_before(self): + from extensions.middle.TensorIteratorMerge import TensorIteratorMerge + from extensions.middle.TensorIteratorCondition import LoopConditionMatcher + return [TensorIteratorMerge, LoopConditionMatcher] + + @staticmethod + def pattern(): + return dict( + nodes=[ + ('TensorArray', dict(kind='op', op='TensorArrayV3')), + ('TensorArray_data', dict(kind='data')), + ('TensorArray_flow_data', dict(kind='data')), + + ('TensorArrayWrite', dict(kind='op', op='TensorArrayWriteV3')), + ('TensorArrayWrite_data', dict(kind='data')), + + ('NextIteration', dict(kind='op', op='NextIteration')), + ('NextIteration_data', dict(kind='data')), + + ('Condition_data', dict(kind='data')), + + ('Identity_2', dict(kind='op', op='Identity')), + ('Identity_2_data', dict(kind='data')), + + ('Switch_2', dict(kind='op', op='Switch')), + ('Switch_2_data', dict(kind='data')), + ('Switch_2_data_exit', dict(kind='data')), + + ('Merge_2', dict(kind='op', op='Merge')), + ('Merge_2_data', dict(kind='data')), + + ('Enter_2', dict(kind='op', op='Enter')), + ('Enter_2_data', dict(kind='data')), + + ('WriteEnter', dict(kind='op', op='Enter')), + ('WriteEnter_data', dict(kind='data')), + + ('Exit', dict(kind='op', op='Exit')), + ('Exit_data', dict(kind='data')), + # + ('TensorArrayRead', dict(op='TensorArrayReadV3')), + ('TensorArrayRead_data', dict(kind='data')), + ], + edges=[ + ('TensorArray', 'TensorArray_data'), + ('TensorArray', 'TensorArray_flow_data'), + ('TensorArray_flow_data', 'Enter_2'), + ('TensorArray_data', 'WriteEnter'), + + + ('Enter_2', 'Enter_2_data'), + ('Enter_2_data', 'Merge_2'), + ('Merge_2', 'Merge_2_data'), + ('Merge_2_data', 'Switch_2'), + ('Switch_2', 
'Switch_2_data'), + ('Switch_2', 'Switch_2_data_exit'), + ('Switch_2_data', 'Identity_2'), + ('Identity_2', 'Identity_2_data'), + + ('Switch_2_data_exit', 'Exit'), + ('Exit', 'Exit_data'), + ('Exit_data', 'TensorArrayRead'), + + ('WriteEnter', 'WriteEnter_data'), + ('WriteEnter_data', 'TensorArrayWrite', {'in': 0}), + + ('Identity_2_data', 'TensorArrayWrite', {'in': 3}), + # + ('TensorArrayWrite', 'TensorArrayWrite_data'), + ('TensorArrayWrite_data', 'NextIteration'), + ('Condition_data', 'Switch_2'), + # + ('TensorArray_data', 'TensorArrayRead'), + ('TensorArrayRead', 'TensorArrayRead_data'), + ('NextIteration', 'NextIteration_data'), + ('NextIteration_data', 'Merge_2'), + ], + ) + + @staticmethod + def replace_pattern(graph: Graph, match: dict): + log.debug('================== SimpleOutputFind ===============') + assert match['WriteEnter_data'].value is not None + + index = match['TensorArrayWrite'].in_node(1) + value = match['TensorArrayWrite'].in_node(2) + + # axis == 0 because in TensorArray we ALWAYS iterate over 0 axis, other params will be filled later (with + # condition) + output = TensorIteratorOutput(graph, dict( + external_port_id=str(match['WriteEnter_data'].value), + internal_layer_id=value.id, + name=match['TensorArrayWrite'].name + '/TensorIteratorOutput_' + )) + output.create_node_with_data(inputs=[value, index], + data_nodes=[match['TensorArrayRead_data']]) + + # Delete useless nodes + safe_nodes = ['TensorArrayRead_data', 'Condition_data'] + nodes_for_remove = [] + for node in match.keys(): + if node not in safe_nodes: + nodes_for_remove.append(match[node].id) + graph.remove_nodes_from(nodes_for_remove) diff --git a/model-optimizer/extensions/middle/TensorIteratorOutput_test.py b/model-optimizer/extensions/middle/TensorIteratorOutput_test.py index d6aa940..f141e99 100644 --- a/model-optimizer/extensions/middle/TensorIteratorOutput_test.py +++ b/model-optimizer/extensions/middle/TensorIteratorOutput_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/middle/TensorIterator_utils.py b/model-optimizer/extensions/middle/TensorIterator_utils.py index 40e0efc..f058758 100644 --- a/model-optimizer/extensions/middle/TensorIterator_utils.py +++ b/model-optimizer/extensions/middle/TensorIterator_utils.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,13 +13,22 @@ See the License for the specific language governing permissions and limitations under the License. 
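# Side note: the clean-up idiom shared by these matchers, written out as a
# generic sketch. `match` maps pattern names to Node-like objects with an
# `.id`; this helper is hypothetical, not part of the patch.
def remove_matched_except(graph, match: dict, safe_nodes: list):
    # drop every matched node except the explicitly whitelisted ones
    nodes_for_remove = [match[name].id for name in match if name not in safe_nodes]
    graph.remove_nodes_from(nodes_for_remove)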
""" - from mo.middle.replacement import MiddleReplacementPattern next_ops = ['NextIteration', 'TensorArrayWriteV3'] class DeleteSelect(MiddleReplacementPattern): + enabled = True + graph_condition = [lambda graph: graph.graph['is_cyclic']] + + def run_after(self): + from extensions.middle.AddIsCyclicAttribute import AddIsCyclicAttribute + return [AddIsCyclicAttribute] + + def run_before(self): + return [] + @staticmethod def pattern(): return dict( diff --git a/model-optimizer/extensions/middle/UselessMerge.py b/model-optimizer/extensions/middle/UselessMerge.py index b0923bc..d3ef24a 100644 --- a/model-optimizer/extensions/middle/UselessMerge.py +++ b/model-optimizer/extensions/middle/UselessMerge.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,9 +16,8 @@ import logging as log -import networkx as nx - from extensions.middle.ConstSwitchResolver import ConstSwitchEraser +from mo.graph.graph import Graph from mo.middle.passes.eliminate import remove_op_node_with_data_node from mo.middle.replacement import MiddleReplacementPattern @@ -29,13 +28,17 @@ class UselessMergeEraser(MiddleReplacementPattern): def run_after(self): return [ConstSwitchEraser] + def run_before(self): + from extensions.middle.pass_separator import MiddleFinish + return [MiddleFinish] + def pattern(self): return dict( nodes=[('merge', dict(kind='op', op='Merge'))], edges=[] ) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): if len(graph.in_edges(match['merge'].id)) <= 1: remove_op_node_with_data_node(graph, match['merge']) log.info("Useles Merge op and data nodes was deleted op='{}'".format(match['merge'].id)) diff --git a/model-optimizer/extensions/middle/UselessSplitEraser.py b/model-optimizer/extensions/middle/UselessSplitEraser.py new file mode 100644 index 0000000..4c8d318 --- /dev/null +++ b/model-optimizer/extensions/middle/UselessSplitEraser.py @@ -0,0 +1,46 @@ +""" + Copyright (c) 2018-2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +from mo.graph.graph import Graph +from mo.middle.replacement import MiddleReplacementPattern + + +class UselessSplitEraser(MiddleReplacementPattern): + enabled = True + + def run_after(self): + from extensions.middle.pass_separator import PreMiddleStart + return [PreMiddleStart] + + def run_before(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + def pattern(self): + return dict( + nodes=[('split', {'kind': 'op', 'op': 'Split', 'num_split': 1})], + edges=[] + ) + + def replace_pattern(self, graph: Graph, match: dict): + split_node = match['split'] + input = split_node.in_node(1) + output = split_node.out_node() + graph.remove_edge(input.id, split_node.id) + + for u, v, d in list(graph.out_edges(output.id, data=True)): + graph.add_edges_from([(input.id, v, d)]) + graph.remove_edge(u, v) diff --git a/model-optimizer/extensions/middle/UselessSridedSlice_test.py b/model-optimizer/extensions/middle/UselessSridedSlice_test.py index 8fbf240..5c4a25b 100644 --- a/model-optimizer/extensions/middle/UselessSridedSlice_test.py +++ b/model-optimizer/extensions/middle/UselessSridedSlice_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -36,7 +36,7 @@ nodes_attributes = { 'slices': [slice(0, 4, 1), slice(0, 5, 1), slice(0, 6, 1)]}, 'strided_slice_2_data': {'value': None, 'shape': np.array([4, 5, 6]), 'kind': 'data'}, # Output operation - 'output_op': {'type': 'OpOutput', 'kind': 'op', 'op': 'OpOutput', 'output_op': {'is_output': True}}, + 'output_op': {'kind': 'op', 'op': 'OpOutput'}, } diff --git a/model-optimizer/extensions/middle/UselessStridedSlice.py b/model-optimizer/extensions/middle/UselessStridedSlice.py index b8272ea..6860a5a 100644 --- a/model-optimizer/extensions/middle/UselessStridedSlice.py +++ b/model-optimizer/extensions/middle/UselessStridedSlice.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,11 +16,11 @@ import logging as log -import networkx as nx import numpy as np from extensions.middle.ConvertGroupedStridedSlice import ConvertGroupedStridedSlice from extensions.middle.SliceConverter import ConvertSlice +from mo.graph.graph import Graph from mo.middle.passes.eliminate import remove_op_node_with_data_node from mo.middle.replacement import MiddleReplacementPattern @@ -40,7 +40,7 @@ class UselessStridedSliceEraser(MiddleReplacementPattern): edges=[] ) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): output_data_node = match['strided_slice'].out_node(0) input_data_node = match['strided_slice'].in_node(0) if np.array_equal(input_data_node.shape, output_data_node.shape) and \ @@ -49,6 +49,7 @@ class UselessStridedSliceEraser(MiddleReplacementPattern): # remove inputs to Strided Slice so it has just one input with data so we can use 'remove_op_node' function graph.remove_edge(match['strided_slice'].in_node(1).id, match['strided_slice'].id) graph.remove_edge(match['strided_slice'].in_node(2).id, match['strided_slice'].id) - graph.remove_edge(match['strided_slice'].in_node(3).id, match['strided_slice'].id) + if len(match['strided_slice'].in_nodes()) > 3: + graph.remove_edge(match['strided_slice'].in_node(3).id, match['strided_slice'].id) remove_op_node_with_data_node(graph, match['strided_slice']) diff --git a/model-optimizer/extensions/middle/decompose_bi_lstm.py b/model-optimizer/extensions/middle/decompose_bi_lstm.py deleted file mode 100644 index 0cfad4e..0000000 --- a/model-optimizer/extensions/middle/decompose_bi_lstm.py +++ /dev/null @@ -1,188 +0,0 @@ -""" - Copyright (c) 2018 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import networkx as nx -import numpy as np -from copy import deepcopy - -from extensions.ops.lstm_sequence import LSTMSequence -from mo.utils.error import Error -from mo.middle.replacement import MiddleReplacementPattern -from mo.ops.concat import Concat -from mo.ops.op import Op -from mo.ops.split import Split -from mo.graph.graph import Node - - -class DecomposeBiLSTM(MiddleReplacementPattern): - ''' Decomposes bidirectional LSTMSequence to forward and reverse LSTM ops. - - To extract forward and reverse parts from initial blobs, the helper - functions used that should be already built-in into the operation attributes. - - Both initial state are split to two part, two parts of the results are concatenated. - Axis of split/concat is completelly defined by ONNX/LSTM specification. 
- ''' - - enabled = True - - def pattern(self): - return dict( - nodes=[ - ('lstm', dict(kind='op', op='LSTMSequence', format='onnx', direction='bidirectional')), - ('input', dict(kind='data')), - ('W', dict(kind='data')), - ('R', dict(kind='data')), - ], - edges=[ - ('input', 'lstm', {'in': 0}), - ('W', 'lstm', {'bin': 'W'}), - ('R', 'lstm', {'bin': 'R'}), - ] - ) - - - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): - bilstm = match['lstm'] - new_init_hiddens = self.split_data(bilstm.in_node(5)) - new_init_cells = self.split_data(bilstm.in_node(6)) - assert bilstm.has_valid('blob_bidirectional_split'), \ - 'Node {} doesnt\'t have blob_bidirectional_split attribute defined.'.format(bilstm.soft_get('name')) - splitted_W = bilstm.blob_bidirectional_split(bilstm.in_node(1)) - splitted_R = bilstm.blob_bidirectional_split(bilstm.in_node(2)) - splitted_B = bilstm.blob_bidirectional_split(bilstm.in_node(3)) if 3 in bilstm.in_nodes() else (None, None) - - outputs = self.split_bilstm( - bilstm, - new_init_hiddens, - new_init_cells, - splitted_W, - splitted_R, - splitted_B, - ) - - self.concat(bilstm, outputs[0], outputs[1], bilstm.out_nodes()) - - def split_data(self, data: Node): - """ Split data node into two part along 0 axis """ - assert len(data.shape) == 3 - assert data.shape[0] == 2 - - output_data = [Op._create_data_node(data.graph, name=data.name + '/SplittedBiLSTM/{}'.format(['forward', 'reverse'][i])) for i in [0, 1]] - split_op = Split(data.graph, dict(name=data.name + '/DecomposedBiLSTM_0', axis=0, num_split=2)) - return split_op.create_node_with_data([data], data_nodes=output_data) - - - def split_bilstm(self, - bilstm, - new_init_hiddens, - new_init_cells, - splitted_W, - splitted_R, - splitted_B): - """ Split one bilstm node into 2 one-directional lstm nodes. - - All input data nodes should be already prepared; they are - have 2 in the major dimension. 
- """ - assert len(bilstm.out_nodes()) == 3 - all_outputs = [] - for i in [0, 1]: - direction = ['forward', 'reverse'][i] - op = LSTMSequence(bilstm.graph, { - 'hidden_size': bilstm.hidden_size, - 'direction': direction, - 'batch_dim': bilstm.batch_dim, - 'sequence_dim': bilstm.sequence_dim, - 'blobs_wrb': bilstm.blobs_wrb, - 'has_num_directions': bilstm.has_num_directions, - 'format': bilstm.format, - 'name': bilstm.name + '/Split/' + direction, - }) - - output_data = Op._create_data_node( - bilstm.graph, - name=bilstm.out_node(0).name + '/Split/' + str(i), - attrs = {'shape': bilstm.out_node(0).shape.copy()} - ) - - assert output_data.shape[1] == 2 - output_data.shape[1] = 1 - - output_hidden = Op._create_data_node( - bilstm.graph, - name=bilstm.out_node(1).name + '/Split/' + str(i), - attrs = {'shape': bilstm.out_node(1).shape.copy()} - ) - - assert output_hidden.shape[0] == 2 - output_hidden.shape[0] = 1 - - output_cell = Op._create_data_node( - bilstm.graph, - name=bilstm.out_node(2).name + '/Split/' + str(i), - attrs = {'shape': bilstm.out_node(2).shape.copy()} - ) - - assert output_cell.shape[0] == 2 - output_cell.shape[0] = 1 - - all_outputs.append( - op.create_node_with_data( - inputs = [ - bilstm.in_node(0), - splitted_W[i], - splitted_R[i], - splitted_B[i], - None, - new_init_hiddens[i], - new_init_cells[i], - ], - data_nodes = [ - output_data, - output_hidden, - output_cell - ] - ) - ) - return all_outputs - - - def concat(self, bilstm, forward_outputs, reverse_outputs, final_outputs): - """ Concatenates two set of outputs from BiLSTM """ - - concat_ops = [ - Concat(bilstm.graph, { - 'name': bilstm.name + '/FinalConcat/Data', - 'axis': 1 - }), - Concat(bilstm.graph, { - 'name': bilstm.name + '/FinalConcat/HiddenState', - 'axis': 0 - }), - Concat(bilstm.graph, { - 'name': bilstm.name + '/FinalConcat/CellState', - 'axis': 0 - }) - ] - - bilstm.graph.remove_node(bilstm.id) - - for i in final_outputs: - concat_ops[i].create_node_with_data( - [forward_outputs[i], reverse_outputs[i]], - data_nodes=[final_outputs[i]] - ) diff --git a/model-optimizer/extensions/middle/lstm_sequence_normalize.py b/model-optimizer/extensions/middle/lstm_sequence_normalize.py deleted file mode 100644 index f2fe561..0000000 --- a/model-optimizer/extensions/middle/lstm_sequence_normalize.py +++ /dev/null @@ -1,281 +0,0 @@ -""" - Copyright (c) 2018 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -import networkx as nx -import numpy as np -from copy import deepcopy - -from extensions.middle.decompose_bi_lstm import DecomposeBiLSTM -from mo.utils.error import Error -from mo.middle.replacement import MiddleReplacementPattern -from mo.ops.op import Op -from mo.ops.permute import Permute -from mo.ops.reshape import Reshape -from mo.graph.graph import Node - - -def inverse_perm(order: np.array): - indices = np.empty(order.size, dtype=np.int64) - indices[order] = np.arange(order.size) - return indices - - -def permute_before_and_after(inp: Node, middle: Node, out: Node, order): - ''' Insert two permutes: before middle node and after middle node. - - The first permute has a given order, the second permute has an - inversed order. - ''' - - permute = Permute(middle.graph, dict(order=np.array(order))) - - edge_attrs = deepcopy(middle.graph.get_edge_data(inp.id, middle.id)[0]) - middle.graph.remove_edge(inp.id, middle.id) - new_inp = permute.create_node_with_data([inp], dict(name=middle.name + '/InputPermute')) - middle.graph.add_edge(new_inp.id, middle.id, **edge_attrs) - - permute = Permute(middle.graph, dict(order=inverse_perm(np.array(order)))) - - middle.graph.remove_edge(middle.id, out.id) - new_out = Op._create_data_node(middle.graph, name=middle.name + '/WithoutPermute', attrs={'shape': out.shape[order]}) - middle.graph.add_edge(middle.id, new_out.id, key=0, out=0) - permute.create_node_with_data([new_out], dict(name=middle.name + '/OutputPermute'), data_nodes=out) - - -class LSTMSequenceNormalize(MiddleReplacementPattern): - ''' Convert blobs and shapes of ONNX-like LSTM to IE compatible form. - - Fuse W, R and optional B input blobs to weights and biases according - to IE LSTM specification. In case of bidirectional LSTM, the resulting - blobs are not directly supported by IE, but it will be further processed - by a separate transformation to break down to one-directional LSTMs. - - The target form of this operation is not normally covered by a dedicated - layer in IE. It should be further transformed to some other layer - that are supported by IE. This transformation pass involves weights and - shapes processing only. - - Post-conditions: - - Inputs have the following order: - 0: input data - 1: weights blob - 2: biases blob - 3: initial hidden state [optional] - 4: initial cell state [optional] - ''' - - enabled = True - - - def run_after(self): - return [ - DecomposeBiLSTM - ] - - - def pattern(self): - return dict( - nodes=[ - ('lstm', dict(kind='op', op='LSTMSequence', format='onnx')), - ('input', dict(kind='data')), - ('W', dict(kind='data')), - ('R', dict(kind='data')), - ], - edges=[ - ('input', 'lstm', {'in': 0}), - ('W', 'lstm', {'bin': 'W'}), - ('R', 'lstm', {'bin': 'R'}), - ] - ) - - - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): - self.repack_weights(graph, match) - if match['lstm'].has_num_directions: - self.squeeze_num_directions(graph, match) - self.batch_sequence_transpose(graph, match) - self.check_not_supported_ports(graph, match) - self.states_squeeze(graph, match) - - - def repack_weights(self, graph: nx.MultiDiGraph, match: dict): - - lstm = match['lstm'] - W = match['W'].value.copy() - R = match['R'].value.copy() - - # bidirectional case should be processed separately before this transformation - if lstm.direction not in ['forward', 'reverse']: - raise Error('ONNX/LSTM operator with `forward` or `reverse` is supported only. 
' - 'Node {} has direction = {} which is not supported.'.format(lstm.name, lstm.direction)) - - graph.remove_edge(match['W'].id, lstm.id) - graph.remove_edge(match['R'].id, lstm.id) - - # find optional 'B' - if 3 in lstm.in_nodes(): - # TODO: check if 'bin': 'B' attribute is assigned to this edge - B = lstm.in_node(3).value.copy() - graph.remove_edge(lstm.in_node(3).id, lstm.id) - else: - B = np.full([1, lstm.hidden_size*8], 0, dtype=np.float32) - - # Add extra dimensions for W, R and B for easier repacking - - B = B.reshape([ - 1, # 0: num of directions, limitation: should be 1 - 2, # 1: two input parts of the matrix: W, R - 4, # 2: four output parts of the matrix for all gates in order: i, o, f, c - lstm.hidden_size, # 3: output size per direction and gate - 1, # 4: fake dimension to match the input dimension in W and R for shorter code - ]) - - W, R = [x.reshape([ - 1, # 0: num of directions, limitation: should be 1 - 1, # 1: dummy dimension to be aligned with B - 4, # 2: four output parts of the matrix for all gates in order: i, o, f, c - lstm.hidden_size, # 3: output size per direction and gate - -1]) # 4: input size/hidden size in W/R - for x in (W, R)] - - input_size = match['input'].shape[2] - assert input_size == W.shape[-1] - - WR = np.concatenate([W, R], axis=4) - - # Reorder gates: iofc --> fico - gate_reorder = [2, 0, 3, 1] - WR = np.take(WR, gate_reorder, axis=2) - B = np.take(B, gate_reorder, axis=2) - - # Sum component of B that correspond to W and R - B = np.add.reduce(B, axis=1, keepdims=True) - - # Reorder dimensions by collection output dimensions first, then input dimension - # Interpret the numbers below by looking at W, R and B reshape above in the code - inout_reorder = [0, 2, 3, 1, 4] - WR = WR.transpose(inout_reorder) - B = B.transpose(inout_reorder) - - # Supposing it is unidirectional LSTM, squeeze 'direction' dimension - assert WR.shape[0] == 1 - assert B.shape[0] == 1 - WR = WR.squeeze(axis=0) - B = B.squeeze(axis=0) - - # Flatten all output (0, 1) and input dimensions (2, 3) - final_shape = [WR.shape[0] * WR.shape[1], -1] - WR = WR.reshape(final_shape) - B = B.reshape(final_shape) - - # Squeeze fake dimension in B - B = B.squeeze(axis=-1) - - assert WR.ndim == 2 - assert B.ndim == 1 - assert WR.shape[0] == lstm.hidden_size*4 - assert B.shape[0] == lstm.hidden_size*4 - assert WR.shape[1] == lstm.hidden_size + input_size - - for blob, port, name in [(WR, 1, 'weights'), (B, 2, 'biases')]: - Op.create_and_connect_input_data_node( - graph, - lstm, - {'value': blob, 'shape': np.array(blob.shape, dtype=np.int64)}, - {'in': port, 'bin': name, 'permutation': None} - ) - - - def squeeze_num_directions(self, graph: nx.MultiDiGraph, match: dict): - """ Assuming considered LSTM node has num_directions in output shape, remove it. 
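A rough sketch of the reshuffling performed in repack_weights above, on toy arrays; the shapes follow the inline comments in that method and the gate letters are illustrative (iofc is the ONNX gate order, fico the IE one):
import numpy as np
hidden_size, input_size = 2, 3
W = np.zeros((1, 1, 4, hidden_size, input_size))        # W after the reshape above
R = np.zeros((1, 1, 4, hidden_size, hidden_size))       # R after the reshape above
WR = np.concatenate([W, R], axis=4)                     # (1, 1, 4, 2, 5): input and hidden parts fused
gates = np.array(['i', 'o', 'f', 'c'])
print(np.take(gates, [2, 0, 3, 1]))                     # ['f' 'i' 'c' 'o'] -- the iofc --> fico reorder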
""" - lstm = match['lstm'] - # num_directions is at 1st position in output shape, please refer to LSTMSequence op definition - - direction_dim = [1, 0, 0] # index of dimension with direction index - for i in lstm.out_nodes(): - old_data_node = lstm.out_node(i) - old_shape = old_data_node.shape.copy() - new_shape = np.delete(old_shape, direction_dim[i]) - data = Op._create_data_node(graph, name=lstm.name + '/Out/{}/'.format(i), attrs={'shape': new_shape}) - graph.remove_edge(lstm.id, old_data_node.id) - graph.add_edge(lstm.id, data.id, key=0, out=i) - reshape = Reshape(graph, dict(dim=old_shape)) - reshape.create_node_with_data([data], dict(name=lstm.name + '/SqueezeNumDirections/{}'.format(i)), data_nodes=[old_data_node]) - - - def batch_sequence_transpose(self, graph: nx.MultiDiGraph, match: dict): - - lstm = match['lstm'] - inp = match['input'] - out = lstm.out_node(0) - - if lstm.batch_dim == 0: - assert lstm.sequence_dim == 1 - # nothing to do -- it's already in normal form - return - - assert lstm.sequence_dim == 0 - assert lstm.batch_dim == 1 - assert len(inp.shape) == 3 - - # Reorder the first two dimensions on both ends: input and output. - # Two Permute ops are inserted before and after the LSTM node. - # In this transformation we don't analyze the rest of the model around - # LSTM cell, so these Permute ops are not fused to some other layers here. - # But other transformations in the pipeline may optimize the Permute ops out. - - lstm.batch_dim, lstm.sequence_dim = lstm.sequence_dim, lstm.batch_dim - permute_before_and_after(inp, lstm, out, [1, 0, 2]) - - - def check_not_supported_ports(self, graph: nx.MultiDiGraph, match: dict): - lstm = match['lstm'] - inputs = lstm.in_edges() - assert 0 in inputs - assert 1 in inputs and inputs[1]['bin'] == 'weights' - assert 2 in inputs and inputs[2]['bin'] == 'biases' - assert 3 not in inputs - - if not(set(list(inputs.keys())) <= set([0, 1, 2, 5, 6])): - raise Error('Node {} that is interpreted as {} operation has ' - 'some unexpected inputs initialized, ' - 'they can include: sequence_lenght, ' - 'and weight tensor for peepholes. ' - 'This is not supported.'.format(lstm.name, lstm.op)) - - - def states_squeeze(self, graph: nx.MultiDiGraph, match: dict): - - lstm = match['lstm'] - - reshape = Reshape(graph, dict(dim=[lstm.in_node(0).shape[0], lstm.hidden_size])) - - if len(lstm.in_nodes()) > 3: - init_h = lstm.in_node(5) - edge_attrs = deepcopy(graph.get_edge_data(init_h.id, lstm.id)[0]) - edge_attrs['in'] = 3 - graph.remove_edge(init_h.id, lstm.id) - new_init_h = reshape.create_node_with_data([init_h], dict(name=lstm.name + '/HiddenStateResize')) - graph.add_edge(new_init_h.id, lstm.id, **edge_attrs) - - if len(lstm.in_nodes()) > 4: - init_c = lstm.in_node(6) - edge_attrs = deepcopy(graph.get_edge_data(init_c.id, lstm.id)[0]) - edge_attrs['in'] = 4 - graph.remove_edge(init_c.id, lstm.id) - new_init_c = reshape.create_node_with_data([init_c], dict(name=lstm.name + '/CellStateResize')) - graph.add_edge(new_init_c.id, lstm.id, **edge_attrs) diff --git a/model-optimizer/extensions/middle/lstm_sequence_normalize_test.py b/model-optimizer/extensions/middle/lstm_sequence_normalize_test.py deleted file mode 100644 index d15e680..0000000 --- a/model-optimizer/extensions/middle/lstm_sequence_normalize_test.py +++ /dev/null @@ -1,55 +0,0 @@ - -""" - Copyright (c) 2018 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import unittest -import numpy as np - -from extensions.middle.lstm_sequence_normalize import LSTMSequenceNormalize -from mo.utils.unittest.graph import compare_graphs, build_graph_with_attrs -from mo.graph.graph import Node - - -class LSTMSequenceNormalizeTest(unittest.TestCase): - - def test_squeeze_num_directions(self): - tested_obj = LSTMSequenceNormalize() - pattern = tested_obj.pattern() - orig_shape = np.array([10, 1, 20, 128], dtype=np.int64) # seq_length, num_dims, batch_size, data_size - new_shape = np.array([10, 20, 128], dtype=np.int64) - graph = build_graph_with_attrs( - nodes_with_attrs=pattern['nodes'], - edges_with_attrs=pattern['edges'], - update_edge_attrs={ - ('W', 'lstm', 0): {'in': 1}, - ('R', 'lstm', 0): {'in': 2}, - }, - new_nodes_with_attrs=[ - ('output', {'shape': orig_shape}), - ], - new_edges_with_attrs=[ - ('lstm', 'output', {'out': 0}), - ], - ) - - lstm = Node(graph, 'lstm') - match = {'lstm': lstm} - tested_obj.squeeze_num_directions(graph, match) - self.assertTrue(np.array_equal(lstm.out_node(0).shape, new_shape)) - reshape_node = lstm.out_node(0).out_node(0) - self.assertTrue(reshape_node.op == 'Reshape') - self.assertTrue(np.array_equal(reshape_node.dim, orig_shape)) - self.assertTrue(reshape_node.out_node(0).id == 'output') diff --git a/model-optimizer/extensions/middle/mxnet_lstm_sequence_normalize.py b/model-optimizer/extensions/middle/mxnet_lstm_sequence_normalize.py deleted file mode 100644 index 17fb9b1..0000000 --- a/model-optimizer/extensions/middle/mxnet_lstm_sequence_normalize.py +++ /dev/null @@ -1,168 +0,0 @@ -""" - Copyright (c) 2018 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import networkx as nx -import numpy as np -from copy import deepcopy - -from mo.middle.replacement import MiddleReplacementPattern -from mo.ops.op import Op -from mo.ops.reshape import Reshape -from mo.graph.graph import Node - - -class MXNetLSTMSequenceNormalize(MiddleReplacementPattern): - ''' Convert blobs and shapes of MXNet-like LSTM to IE compatible form. - - The target form of this operation is not normally covered by a dedicated - layer in IE. It should be further transformed to some other layer - that are supported by IE. This transformation pass involves weights and - shapes processing only. 
- - Post-conditions: - - Inputs have the following order: - 0: input data - 1: weights blob - 2: biases blob - 3: initial hidden state [optional] - 4: initial cell state [optional] - ''' - enabled = True - - def pattern(self): - return dict( - nodes=[ - ('lstm', dict(kind='op', op='LSTMSequence', format='mxnet')), - ('input', dict(kind='data')), - ('hidden_state', dict(kind='data')), - ('cell_state', dict(kind='data')), - ('params', dict(kind='data')), - ], - edges=[ - ('input', 'lstm', {'in': 0}), - ('hidden_state', 'lstm', {'in': 2}), - ('cell_state', 'lstm', {'in': 3}), - ('params', 'lstm', {'in': 1}), - ] - ) - - - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): - input = match['input'] - lstm = match['lstm'] - params = match['params'].value.copy() - hidden_state = match['hidden_state'] - cell_state = match['cell_state'] - - hidden_state_edge_attrs = deepcopy(graph.get_edge_data(hidden_state.id, lstm.id)[0]) - cell_state_edge_attrs = deepcopy(graph.get_edge_data(cell_state.id, lstm.id)[0]) - - graph.remove_edge(match['params'].id, lstm.id) - graph.remove_edge(match['hidden_state'].id, lstm.id) - graph.remove_edge(match['cell_state'].id, lstm.id) - - self.repack_weights(graph, input, lstm, params) - - reshape = Reshape(graph, dict(dim=[lstm.in_node(0).shape[0], lstm.hidden_size])) - - if len(lstm.in_nodes()) > 2: - hidden_state_edge_attrs['in'] = 3 - new_init_h = reshape.create_node_with_data([hidden_state], attrs=dict(name=lstm.name + '/HiddenStateResize')) - graph.add_edge(new_init_h.id, lstm.id, **hidden_state_edge_attrs) - - if len(lstm.in_nodes()) > 3: - cell_state_edge_attrs['in'] = 4 - new_init_c = reshape.create_node_with_data([cell_state], attrs=dict(name=lstm.name + '/CellStateResize')) - graph.add_edge(new_init_c.id, lstm.id, **cell_state_edge_attrs) - - - def repack_weights(self, graph: nx.MultiDiGraph, input: Node, lstm: Node, params: np.array): - input_size = input.shape[2] - - direction = 2 if lstm.has_num_directions else 1 - bsize = (2*lstm.hidden_size*direction*1)*4 - - assert direction == 1 - - W = np.array(params[0:len(params) - bsize]) - B = np.array(params[len(params) - bsize:]) - - WX = np.array(W[0:lstm.hidden_size*4*input_size]) - WH = np.array(W[lstm.hidden_size*4*input_size:]) - - WX = WX.reshape([lstm.hidden_size*4, input_size]) - WH = WH.reshape([lstm.hidden_size*4, lstm.hidden_size]) - - WX = WX.transpose([1, 0]) - WH = WH.transpose([1, 0]) - - WX = WX.reshape([ - 1, # 0: num of directions, limitation: should be 1 - -1, # 3: input size - 4, # 1: four output parts of the matrix for all gates in order: i, f, c, o - lstm.hidden_size, # 2: output size per direction and gate - ]) - - WH = WH.reshape([ - 1, # 0: num of directions, limitation: should be 1 - -1, # 3: hidden state size - 4, # 1: four output parts of the matrix for all gates in order: i, f, c, o - lstm.hidden_size, # 2: output size per direction and gate - ]) - - B = B.reshape([ - 1, # 0: num of directions, limitation: should be 1 - 2, # 3: num of component B - 4, # 1: four output parts of the matrix for all gates in order: i, f, c, o - lstm.hidden_size, # 2: output size per direction and gate - ]) - - assert WX.shape[1] == input_size - assert WH.shape[1] == lstm.hidden_size - - W = np.concatenate([WX, WH], axis=1) - - # Reorder gates: ifco --> fico - gate_reorder = [1, 0, 2, 3] - W = np.take(W, gate_reorder, axis=2) - B = np.take(B, gate_reorder, axis=2) - - inout_reorder = [0, 2, 3, 1] - W = W.transpose(inout_reorder) - B = B.transpose(inout_reorder) - - final_shape = [W.shape[0] * 
W.shape[1] * lstm.hidden_size, -1] - W = W.reshape(final_shape) - B = B.reshape(final_shape) - - # Sum component of B - B = np.add.reduce(B, axis=1, keepdims=True) - B = B.squeeze(axis=1) - - assert W.ndim == 2 - assert B.ndim == 1 - assert W.shape[0] == lstm.hidden_size * 4 - assert B.shape[0] == lstm.hidden_size * 4 - assert W.shape[1] == lstm.hidden_size + input_size - - for blob, port, name in [(W, 1, 'weights'), (B, 2, 'biases')]: - Op.create_and_connect_input_data_node( - graph, - lstm, - {'value': blob, 'shape': np.array(blob.shape, dtype=np.int64)}, - {'in': port, 'bin': name, 'permutation': None} - ) diff --git a/model-optimizer/extensions/middle/pass_separator.py b/model-optimizer/extensions/middle/pass_separator.py new file mode 100644 index 0000000..1b7e0aa --- /dev/null +++ b/model-optimizer/extensions/middle/pass_separator.py @@ -0,0 +1,58 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +from mo.graph.graph import Graph +from mo.middle.replacement import MiddleReplacementPattern + + +class PreMiddleStart(MiddleReplacementPattern): + enabled = True + + def run_after(self): + return [] + + def run_before(self): + return [] + + def find_and_replace_pattern(self, graph: Graph): + pass + + +class MiddleStart(MiddleReplacementPattern): + enabled = True + + def run_after(self): + return [] + + def run_before(self): + + return [] + + def find_and_replace_pattern(self, graph: Graph): + pass + + +class MiddleFinish(MiddleReplacementPattern): + enabled = True + + def run_after(self): + return [] + + def run_before(self): + return [] + + def find_and_replace_pattern(self, graph: Graph): + pass + diff --git a/model-optimizer/extensions/middle/permute_tensor_iterator.py b/model-optimizer/extensions/middle/permute_tensor_iterator.py index fbd3d63..7696660 100644 --- a/model-optimizer/extensions/middle/permute_tensor_iterator.py +++ b/model-optimizer/extensions/middle/permute_tensor_iterator.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,35 +14,33 @@ limitations under the License. 
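The slicing arithmetic of the MXNet repack_weights above (under the direction == 1 limitation it asserts) can be sanity-checked with toy sizes; all names and sizes here are illustrative:
import numpy as np
hidden_size, input_size = 2, 3
bsize = 2 * hidden_size * 4                                                    # bias block, direction == 1
params = np.arange(4 * hidden_size * (input_size + hidden_size) + bsize, dtype=np.float32)
W, B = params[:-bsize], params[-bsize:]                                        # weights first, biases at the tail
WX = W[:hidden_size * 4 * input_size].reshape(hidden_size * 4, input_size)     # input weights
WH = W[hidden_size * 4 * input_size:].reshape(hidden_size * 4, hidden_size)    # recurrent weights
gates = np.array(['i', 'f', 'c', 'o'])
print(np.take(gates, [1, 0, 2, 3]))                                            # ['f' 'i' 'c' 'o'] -- ifco --> fico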
""" -import networkx as nx import numpy as np -from copy import deepcopy -from mo.graph.graph import copy_node, Node, dict_includes -from mo.utils.error import Error -from mo.middle.passes.eliminate import remove_op_node_with_data_node -from mo.middle.pattern_match import find_isomorphisms, find_pattern_matches -from mo.middle.replacement import MiddleReplacementPattern -from mo.ops.op import Op -from extensions.ops.lstm_sequence import LSTMSequence from extensions.middle.FusePermutesSequence import FusePermutesSequence +from extensions.middle.LSTMRNNSequenceToTensorIterator import LSTMToTensorIterator +from extensions.middle.ONNXRNNSequenceNormalize import ONNXRNNSequenceNormalize from extensions.middle.TensorIteratorMerge import TensorIteratorMerge -from extensions.middle.lstm_sequence_normalize import LSTMSequenceNormalize, permute_before_and_after -from extensions.middle.lstm_sequence_tensor_iterator import LSTMSequenceTensorIterator -from extensions.middle.decompose_bi_lstm import DecomposeBiLSTM +from mo.graph.graph import dict_includes, Graph +from mo.middle.passes.eliminate import remove_op_node_with_data_node +from mo.middle.pattern_match import find_isomorphisms +from mo.middle.replacement import MiddleReplacementPattern class PermuteTensorIteratorLSTM(MiddleReplacementPattern): - ''' Fuses Permute(1,0,2) --> TI --> Permute(1,0,2) pattern to a single TI with changed axis. + """ Fuses Permute(1,0,2) --> TI --> Permute(1,0,2) pattern to a single TI with changed axis. WARNING This transformation is limited to support of very special case of TI but code doesn't check all the cases. - ''' + """ enabled = True def run_after(self): - return [TensorIteratorMerge, LSTMSequenceNormalize, LSTMSequenceTensorIterator, FusePermutesSequence, DecomposeBiLSTM] + return [TensorIteratorMerge, ONNXRNNSequenceNormalize, LSTMToTensorIterator, FusePermutesSequence] + + + def run_before(self): + return [] def pattern(self): return dict( @@ -63,21 +61,21 @@ class PermuteTensorIteratorLSTM(MiddleReplacementPattern): ('input', 'direct_permute'), ('direct_permute', 'input_permuted'), - ('input_permuted', 'ti', {'in': 0}), # affected by permute + ('input_permuted', 'ti', {'in': 0}), # affected by permute ('init_hidden', 'ti', {'in': 1}), ('init_cell', 'ti', {'in': 2}), - ('ti', 'output_permuted', {'out': 0}), # affected by permute + ('ti', 'output_permuted', {'out': 0}), # affected by permute ('output_permuted', 'inverse_permute'), ('inverse_permute', 'output'), ] ) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): # This transformation works if and only if a body of TI # matches the following topology (Reshape -> LSTMCell -> Reshape) - nodes=[ + nodes = [ ('input_unsqueezed'), ('squeeze', dict(op='Reshape')), ('input_squeezed'), @@ -92,8 +90,16 @@ class PermuteTensorIteratorLSTM(MiddleReplacementPattern): ('output_cell'), ('unsqueeze', dict(op='Reshape')), ('output_unsqueezed'), + + ('const_w', dict(op='Const')), + ('const_b', dict(op='Const')), + + ('op_output', dict(op='OpOutput')), + ('op_output_1', dict(op='OpOutput')), + ('op_output_2', dict(op='OpOutput')) + ] - edges=[ + edges = [ ('input_unsqueezed', 'squeeze'), ('squeeze', 'input_squeezed'), @@ -103,11 +109,19 @@ class PermuteTensorIteratorLSTM(MiddleReplacementPattern): ('weights', 'lstm', {'in': 3}), ('biases', 'lstm', {'in': 4}), + ('const_w', 'weights'), + ('const_b', 'biases'), + ('lstm', 'output_hidden', {'out': 0}), ('lstm', 'output_cell', {'out': 1}), ('output_hidden', 
'unsqueeze'), ('unsqueeze', 'output_unsqueezed'), + + ('output_unsqueezed', 'op_output'), + ('output_hidden', 'op_output_1'), + ('output_cell', 'op_output_2'), + ] ti = match['ti'] isomorphisms = find_isomorphisms(ti.body, nodes, edges) @@ -126,7 +140,6 @@ class PermuteTensorIteratorLSTM(MiddleReplacementPattern): if not inverse_permute.has_valid('order') or not np.array_equal(inverse_permute.order, permute_order): return - def find_ports(port_map: list, attrs: dict): """ Find all ports in a given port map with specified attributes """ result = [] diff --git a/model-optimizer/extensions/middle/reverse_tensor_iterator.py b/model-optimizer/extensions/middle/reverse_tensor_iterator.py index 7cd529b..62f5133 100644 --- a/model-optimizer/extensions/middle/reverse_tensor_iterator.py +++ b/model-optimizer/extensions/middle/reverse_tensor_iterator.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,15 +14,10 @@ limitations under the License. """ -import networkx as nx - -from mo.middle.replacement import MiddleReplacementPattern -from extensions.ops.lstm_sequence import LSTMSequence from extensions.middle.FusePermutesSequence import FusePermutesSequence -from extensions.middle.TensorIteratorMerge import TensorIteratorMerge -from extensions.middle.lstm_sequence_normalize import LSTMSequenceNormalize -from extensions.middle.lstm_sequence_tensor_iterator import LSTMSequenceTensorIterator +from extensions.middle.ONNXRNNSequenceNormalize import ONNXRNNSequenceNormalize from extensions.middle.permute_tensor_iterator import PermuteTensorIteratorLSTM +from mo.graph.graph import Graph from mo.middle.passes.eliminate import remove_op_node_with_data_node from mo.middle.replacement import MiddleReplacementPattern @@ -38,13 +33,16 @@ class ReverseTensorIteratorLSTM(MiddleReplacementPattern): def run_after(self): return [ - TensorIteratorMerge, - LSTMSequenceNormalize, - LSTMSequenceTensorIterator, + ONNXRNNSequenceNormalize, + FusePermutesSequence, PermuteTensorIteratorLSTM, ] + def run_before(self): + from extensions.middle.pass_separator import MiddleFinish + return [MiddleFinish] + def pattern(self): return dict( nodes=[ @@ -52,7 +50,6 @@ class ReverseTensorIteratorLSTM(MiddleReplacementPattern): ('direct_reverse', dict(op='ReverseSequence')), ('input_reversed'), ('init_hidden'), - ('init_cell'), ('ti', dict(kind='op', op='TensorIterator')), @@ -66,7 +63,6 @@ class ReverseTensorIteratorLSTM(MiddleReplacementPattern): ('input_reversed', 'ti', {'in': 0}), ('init_hidden', 'ti', {'in': 1}), - ('init_cell', 'ti', {'in': 2}), ('ti', 'output_reversed', {'out': 0}), ('output_reversed', 'inverse_reverse', {'in': 0}), @@ -74,21 +70,21 @@ class ReverseTensorIteratorLSTM(MiddleReplacementPattern): ] ) - def replace_pattern(self, graph: nx.MultiDiGraph, match: dict): + def replace_pattern(self, graph: Graph, match: dict): ti = match['ti'] direct_reverse = match['direct_reverse'] inverse_reverse = match['inverse_reverse'] - assert direct_reverse.seq_dim == inverse_reverse.seq_dim - assert direct_reverse.batch_dim is None and inverse_reverse.batch_dim is None or \ - direct_reverse.batch_dim == inverse_reverse.batch_dim + assert direct_reverse.seq_axis == inverse_reverse.seq_axis + assert direct_reverse.batch_axis is None and inverse_reverse.batch_axis is None or \ + direct_reverse.batch_axis == inverse_reverse.batch_axis # Modify 
stride in TI for port_map in [ti.input_port_map, ti.output_port_map]: for port in port_map: if 'axis' in port and port['axis'] is not None and 'external_port_id' in port: - assert port['axis'] == direct_reverse.seq_dim, \ - 'axis == {} != {} == direct_reverse.seq_dim'.format(port['axis'], direct_reverse.seq_dim) + assert port['axis'] == direct_reverse.seq_axis, \ + 'axis == {} != {} == direct_reverse.seq_dim'.format(port['axis'], direct_reverse.seq_axis) if 'stride' not in port or port['stride'] is None: port['stride'] = 1 assert port['stride'] in [-1, 1] diff --git a/model-optimizer/extensions/ops/BlockLSTM.py b/model-optimizer/extensions/ops/BlockLSTM.py index 8e3ac7f..3e28b17 100644 --- a/model-optimizer/extensions/ops/BlockLSTM.py +++ b/model-optimizer/extensions/ops/BlockLSTM.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ import networkx as nx from mo.front.common.partial_infer.utils import mark_input_bins -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op import numpy as np @@ -25,10 +25,11 @@ import numpy as np class BlockLSTM(Op): op = 'BlockLSTM' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'op': __class__.op, - 'infer': __class__.infer + 'infer': __class__.infer, + 'type': __class__.op, } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/Cast.py b/model-optimizer/extensions/ops/Cast.py new file mode 100644 index 0000000..5176994 --- /dev/null +++ b/model-optimizer/extensions/ops/Cast.py @@ -0,0 +1,40 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import numpy as np + +from mo.front.common.partial_infer.elemental import copy_shape_infer +from mo.graph.graph import Node, Graph +from mo.ops.op import Op + + +class Cast(Op): + op = 'Cast' + + def __init__(self, graph: Graph, attrs: dict): + mandatory_props = { + 'op': __class__.op, + 'infer': __class__.infer, + 'dst_type': None, + 'in_ports_count': 1, + 'out_ports_count': 1, + } + super().__init__(graph, mandatory_props, attrs) + + @staticmethod + def infer(node: Node): + assert node.has_valid('dst_type'), 'Destination type of "Cast" operation should be extracted earlier' + copy_shape_infer(node, lambda n: n.in_node().value.astype(n.dst_type)) diff --git a/model-optimizer/extensions/ops/DetectionOutput.py b/model-optimizer/extensions/ops/DetectionOutput.py index 6eb3d93..fb2f91d 100644 --- a/model-optimizer/extensions/ops/DetectionOutput.py +++ b/model-optimizer/extensions/ops/DetectionOutput.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
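The new Cast operation above delegates value inference to numpy via copy_shape_infer; a minimal illustration of the astype semantics it relies on (toy values, not from the patch):
import numpy as np
x = np.array([1.7, -0.3], dtype=np.float32)
print(x.astype(np.int32))    # [ 1  0] -- conversion truncates toward zero
print(x.astype(np.float16))  # dtype changes only; the shape is copied through unchanged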
@@ -14,9 +14,8 @@ limitations under the License. """ -import networkx as nx - from mo.front.common.partial_infer.multi_box_detection import multi_box_detection_infer +from mo.graph.graph import Graph from mo.ops.op import Op @@ -24,10 +23,12 @@ class DetectionOutput(Op): op = 'DetectionOutput' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'type': __class__.op, 'op': __class__.op, + 'in_ports_count': 3, + 'out_ports_count': 1, 'infer': multi_box_detection_infer, 'input_width': 1, 'input_height': 1, @@ -39,7 +40,8 @@ class DetectionOutput(Op): def supported_attrs(self): return [ 'background_label_id', - 'clip', + 'clip_after_nms', + 'clip_before_nms', 'code_type', 'confidence_threshold', 'eta', @@ -70,4 +72,5 @@ class DetectionOutput(Op): 'visualize_threshold', 'width', 'width_scale', + 'objectness_score', ] diff --git a/model-optimizer/extensions/ops/Enter.py b/model-optimizer/extensions/ops/Enter.py index edc27d5..73bda61 100644 --- a/model-optimizer/extensions/ops/Enter.py +++ b/model-optimizer/extensions/ops/Enter.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op from mo.utils.error import Error @@ -25,10 +25,11 @@ from mo.utils.error import Error class Enter(Op): op = "Enter" - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, + 'in_ports_count': 1, 'infer': Enter.enter_infer, } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/Exit.py b/model-optimizer/extensions/ops/Exit.py index 6f5c8d9..a06f6ef 100644 --- a/model-optimizer/extensions/ops/Exit.py +++ b/model-optimizer/extensions/ops/Exit.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,10 +14,8 @@ limitations under the License. """ - -import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op from mo.utils.error import Error @@ -25,11 +23,12 @@ from mo.utils.error import Error class Exit(Op): op = "Exit" - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, 'infer': Exit.exit_infer, + 'in_ports_count': 1, } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/GRU.py b/model-optimizer/extensions/ops/GRU.py new file mode 100644 index 0000000..16b1909 --- /dev/null +++ b/model-optimizer/extensions/ops/GRU.py @@ -0,0 +1,81 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +from extensions.ops.RNN import rnn_infer +from mo.graph.graph import Node, Graph +from mo.ops.op import Op +import numpy as np + + +class GRU(Op): + op = 'GRU' + + def __init__(self, graph: Graph, attrs: dict): + mandatory_props = { + 'type': 'RNNSequence', # should never be emitted to IR; for debugging purposes + 'op': __class__.op, + 'blobs_wrb': False, + 'has_num_directions': False, + 'direction': 'forward', + 'infer': __class__.infer, + 'multiplier': 3, + 'multilayers': False, + 'gate_order': np.array([0, 1, 2]), # TODO: change it later + 'normalized': False, + + 'activation_alpha': None, + 'activation_beta': None, + 'activations': None, + 'clip': None, + 'linear_before_reset': None, + 'in_ports_count': 6, + 'out_ports_count': 2, + } + super().__init__(graph, mandatory_props, attrs) + + @staticmethod + def supported_attrs(): + return [ + 'hidden_size', # number of elements in the hidden state + 'direction', # one of 'forward', 'reverse', or 'bidirectional' + 'axis', + + 'activation_alpha', + 'activation_beta', + 'activations', + 'clip', + 'linear_before_reset', + ] + + def backend_attrs(self): + return [ + 'hidden_size', # number of elements in the hidden state + 'direction', # one of 'forward', 'reverse', or 'bidirectional' + 'axis', + + 'activation_alpha', + 'activation_beta', + ('activations', lambda node: ','.join(node.activations) if node.activations is not None else None), + 'clip', + 'linear_before_reset', + ] + + @staticmethod + def infer(node: Node): + assert len(node.in_nodes()) >= 3 # X, W and R + assert len(node.in_nodes()) <= 5 + assert len(node.out_nodes()) <= 2 + + rnn_infer(node, [1]) diff --git a/model-optimizer/extensions/ops/GRUCell.py b/model-optimizer/extensions/ops/GRUCell.py new file mode 100644 index 0000000..120aedd --- /dev/null +++ b/model-optimizer/extensions/ops/GRUCell.py @@ -0,0 +1,83 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +from mo.front.common.partial_infer.utils import mark_input_bins +from mo.graph.graph import Node, Graph +from mo.ops.op import Op +from mo.utils.error import Error + + +class GRUCell(Op): + """ A single GRU cell (without a loop). 
+ + 2 inputs: + - [0, required] input data (2D), + - [1, required] initial hidden state (2D), + + 2 blobs: + - [2, required] cell FC weights + - [3, required] cell FC biases + + 1 output: + - [required] output data / resulting hidden state (2D) + """ + op = 'GRUCell' + + def __init__(self, graph: Graph, attrs: dict): + mandatory_props = { + 'type': __class__.op, + 'op': __class__.op, + 'infer': __class__.infer, + 'in_ports_count': 4, + 'out_ports_count': 1, + } + super().__init__(graph, mandatory_props, attrs) + + def supported_attrs(self): + return [ + 'hidden_size', # number of elements in the hidden state + 'activations', + 'activation_alpha', + 'activation_beta', + 'clip', + 'linear_before_reset', + ] + + def backend_attrs(self): + return [ + 'hidden_size', # number of elements in the hidden state + ('activations', lambda node: ','.join(node.activations) if node.activations is not None else None), + 'activation_alpha', + 'activation_beta', + 'clip', + 'linear_before_reset', + ] + + @staticmethod + def infer(node: Node): + assert len(node.out_nodes()) in [1, 2] + + hidden_shape = node.in_node(1).shape.copy() + + mark_input_bins(node, start_port=2) + node.out_node(0).shape = hidden_shape + + hidden_size = hidden_shape[1] + if node.has_valid('hidden_size'): + if node.hidden_size != hidden_size: + raise Error("Input shape {} for hidden size doesn't match pre-defined hidden_size in node {}".format( + node.in_node(1).shape, node.soft_get('name'))) + else: + node['hidden_size'] = hidden_size diff --git a/model-optimizer/extensions/ops/GatherNd.py b/model-optimizer/extensions/ops/GatherNd.py new file mode 100644 index 0000000..9a4de3e --- /dev/null +++ b/model-optimizer/extensions/ops/GatherNd.py @@ -0,0 +1,47 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import numpy as np + +from mo.graph.graph import Node, Graph +from mo.ops.op import Op + + +class GatherNd(Op): + op = 'GatherNd' + + def __init__(self, graph: Graph, attrs: dict): + mandatory_props = { + 'op': __class__.op, + 'infer': __class__.infer, + 'in_ports_count': 2, + 'out_ports_count': 1, + } + super().__init__(graph, mandatory_props, attrs) + + def supported_attrs(self): + return [] + + @staticmethod + def infer(node: Node): + input_node = node.in_node(0) + indices = node.in_node(1).value + + assert indices is not None + + output_shape = list(indices.shape[:-1]) + list(input_node.shape[indices.shape[-1]:]) + node.out_node().shape = np.array(output_shape, dtype=np.int64) + # TODO: implement constant path diff --git a/model-optimizer/extensions/ops/LSTM.py b/model-optimizer/extensions/ops/LSTM.py new file mode 100644 index 0000000..196d653 --- /dev/null +++ b/model-optimizer/extensions/ops/LSTM.py @@ -0,0 +1,82 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
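The GatherNd shape rule introduced above (output shape = indices.shape[:-1] + data.shape[indices.shape[-1]:]) on small, assumed shapes:
import numpy as np
data_shape = np.array([4, 5, 6], dtype=np.int64)
indices = np.zeros((2, 3, 2), dtype=np.int64)   # the trailing 2 indexes the first two data axes
out = list(indices.shape[:-1]) + list(data_shape[indices.shape[-1]:])
print(out)   # [2, 3, 6]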
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +from extensions.ops.RNN import rnn_infer +from mo.graph.graph import Node, Graph +from mo.ops.op import Op + + +class LSTM(Op): + op = 'LSTM' + + def __init__(self, graph: Graph, attrs: dict): + mandatory_props = { + 'type': 'RNNSequence', # should never be emitted to IR; for debugging purposes + 'op': __class__.op, + 'blobs_wrb': False, # input blobs have three separate components W, R and B like in ONNX/LSTM + 'has_num_directions': False, # if True, output shape has 4 dimensions; 3D otherwise + 'direction': 'forward', + 'infer': __class__.infer, + 'multiplier': 4, + 'gate_order': None, + 'normalized': False, + 'multilayers': False, + 'format': None, # format type of input blobs for different frameworks (onnx, tf, mxnet), + + 'activation_alpha': None, + 'activation_beta': None, + 'activations': None, + 'clip': None, + 'input_forget': None, + 'in_ports_count': 7, + 'out_ports_count': 3, + } + super().__init__(graph, mandatory_props, attrs) + + @staticmethod + def supported_attrs(): + return [ + 'hidden_size', # number of elements in the hidden state + 'direction', # one of 'forward', 'reverse', or 'bidirectional' + 'axis', + + 'activation_alpha', + 'activation_beta', + 'activations', + 'clip', + # 'input_forget', # Not supported yet + ] + + def backend_attrs(self): + return [ + 'hidden_size', # number of elements in the hidden state + 'direction', # one of 'forward', 'reverse', or 'bidirectional' + 'axis', + + 'activation_alpha', + 'activation_beta', + ('activations', lambda node: ','.join(node.activations) if node.activations is not None else None), + 'clip', + # 'input_forget', # Not supported yet + ] + + @staticmethod + def infer(node: Node): + # there are limitations coming from ONNX LSTM definition and normalization rules + assert len(node.in_nodes()) >= 3 # X, W and R + assert len(node.in_nodes()) <= 7 + assert len(node.out_nodes()) <= 3 + + rnn_infer(node, [1, 2]) diff --git a/model-optimizer/extensions/ops/NextIteration.py b/model-optimizer/extensions/ops/NextIteration.py index 5ee49af..3a4a5fe 100644 --- a/model-optimizer/extensions/ops/NextIteration.py +++ b/model-optimizer/extensions/ops/NextIteration.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,20 +14,20 @@ limitations under the License. 
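The ('activations', lambda node: ...) entries in backend_attrs of the new recurrent sequence ops in this patch (GRU and LSTM above, RNN just below) serialize the activation list into the comma-separated string the IR expects; a standalone sketch of that serializer, with assumed inputs:
serialize = lambda activations: ','.join(activations) if activations is not None else None
print(serialize(['sigmoid', 'tanh', 'tanh']))  # 'sigmoid,tanh,tanh'
print(serialize(None))                         # None -> the attribute is simply omitted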
""" -import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class NextIteration(Op): op = "NextIteration" - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, + 'in_ports_count': 1, 'infer': NextIteration.enter_infer, } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/RNN.py b/model-optimizer/extensions/ops/RNN.py new file mode 100644 index 0000000..ba0a024 --- /dev/null +++ b/model-optimizer/extensions/ops/RNN.py @@ -0,0 +1,154 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import numpy as np + +from mo.front.common.partial_infer.utils import mark_input_bins +from mo.graph.graph import Node, Graph, add_opoutput +from mo.ops.op import Op + + +class RNN(Op): + op = 'RNN' + + def __init__(self, graph: Graph, attrs: dict): + mandatory_props = { + 'type': 'RNNSequence', # should be never emitted to IR; for debugging purposes + 'op': __class__.op, + 'blobs_wrb': False, + 'has_num_directions': False, + 'direction': 'forward', + 'infer': __class__.infer, + 'multiplier': 1, + 'gate_order': np.array([0]), # Only one gate in this cell + 'normalized': False, + + 'activation_alpha': None, + 'activation_beta': None, + 'activations': None, + 'clip': None, + 'in_ports_count': 6, + 'out_ports_count': 2, + } + super().__init__(graph, mandatory_props, attrs) + + @staticmethod + def supported_attrs(): + return [ + 'hidden_size', # number of the elements in hidden cell size + 'direction', # one of 'forward', 'reverse', or 'bidirectional' + 'axis', + + # Additional attributes + 'activation_alpha', + 'activation_beta', + 'activations', + 'clip', + ] + + def backend_attrs(self): + return [ + 'hidden_size', # number of the elements in hidden cell size + 'direction', # one of 'forward', 'reverse', or 'bidirectional' + 'axis', + + # Additional attributes + 'activation_alpha', + 'activation_beta', + ('activations', lambda node: ','.join(node.activations) if node.activations is not None else None), + 'clip', + ] + + @staticmethod + def infer(node: Node): + assert len(node.in_nodes()) >= 3 # X, W and R + assert len(node.in_nodes()) <= 5 + assert len(node.out_nodes()) <= 2 + + rnn_infer(node, [1]) + + +def rnn_infer(node: Node, out_ports=None): + """ + General infer function for RNN, GRU, LSTM layers. + Assume that 0-port input of node is input data for recurrent layer and node have attrs: + hidden_size, + """ + if out_ports is None: + out_ports = [] + + # 1. Necessary checks (from ONNX specification) + assert node.batch_dim <= 1 + assert node.sequence_dim <= 1 + assert node.batch_dim != node.sequence_dim + assert node.direction in ['forward', 'reverse', 'bidirectional'] + + if node.blobs_wrb: + mark_input_bins(node, ['W', 'R', 'B']) + else: + mark_input_bins(node) + + # 2. 
Output shape calculations + input_shape = node.in_node(0).shape + assert len(input_shape) == 3 + + # Reshape input nodes + for port in [2, 3]: + if port in node.in_nodes() and len(node.in_node(port).in_nodes()) > 0 and \ + 'zero_shapes' in node.in_node(port).in_node(): + for i in node.in_node(port).in_node().zero_shapes: + if node.in_node(port).shape[i] != input_shape[i]: + node.in_node(port).value = np.repeat(node.in_node(port).value, input_shape[i], axis=i) + node.in_node(port).shape[i] = input_shape[i] + + out_shape = np.array([input_shape[node.sequence_dim], input_shape[node.batch_dim], node.hidden_size], dtype=np.int64) + + if node.batch_dim == 0: + out_shape = np.array([input_shape[node.batch_dim], input_shape[node.sequence_dim], node.hidden_size], dtype=np.int64) + + num_directions = 2 if node.direction in ['bidirectional'] else 1 + if node.has_num_directions: + if node.format == 'mxnet' and node.normalized is False: + # MXNet RNN layers return output with shape [seq_len, batch_size, hidden_size * num_directions] + out_shape[-1] *= num_directions + else: + # ONNX-like, insert extra dimension to output shape for num_directions + out_shape = np.insert(out_shape, 1, np.int64(num_directions)) + node.out_node(0).shape = out_shape + + # 3. Extra outputs for hidden/cell states shape calculations (optional) + state_size = np.array([input_shape[node.batch_dim], node.hidden_size], dtype=np.int64) + if node.has_num_directions: + state_size = np.insert(state_size, 0, num_directions) + + if node.multilayers: + # For the multilayer case, state sizes from every layer are concatenated along the last axis + num_layers = node.num_layers + state_size[-1] *= num_layers + + for i in out_ports: + # If the node has no consumers for the hidden/cell state -> create them + if i not in node.out_nodes(): + data_node = Op._create_data_node( + node.graph, + name=node.node + '/ExtraOutput/' + str(i), + attrs={'executable': True} + ) + node.add_output_port(i) + node.graph.add_edge(node.id, data_node.id, key=0, out=i) + add_opoutput(node.graph, data_node.id, 0, False) + else: + data_node = node.out_node(i) + data_node.shape = state_size.copy() diff --git a/model-optimizer/extensions/ops/RNNCell.py b/model-optimizer/extensions/ops/RNNCell.py new file mode 100644 index 0000000..0fd71ed --- /dev/null +++ b/model-optimizer/extensions/ops/RNNCell.py @@ -0,0 +1,81 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +from mo.front.common.partial_infer.utils import mark_input_bins +from mo.graph.graph import Graph, Node +from mo.ops.op import Op +from mo.utils.error import Error + + +class RNNCell(Op): + """ A single RNN cell (without a loop). 
+ + 2 inputs: + - [0, required] input data (2D), + - [1, required] initial hidden state (2D), + + 2 blobs: + - [2, required] cell FC weights + - [3, required] cell FC biases + + 1 output: + - [required] output data / resulting hidden state (2D) + """ + op = 'RNNCell' + + def __init__(self, graph: Graph, attrs: dict): + mandatory_props = { + 'type': __class__.op, + 'op': __class__.op, + 'infer': __class__.infer, + 'in_ports_count': 4, + 'out_ports_count': 1, + } + super().__init__(graph, mandatory_props, attrs) + + def supported_attrs(self): + return [ + 'hidden_size', # number of elements in the hidden state + 'activations', + 'activation_alpha', + 'activation_beta', + 'clip', + ] + + def backend_attrs(self): + return [ + 'hidden_size', # number of elements in the hidden state + ('activations', lambda node: ','.join(node.activations) if node.activations is not None else None), + 'activation_alpha', + 'activation_beta', + 'clip', + ] + + @staticmethod + def infer(node: Node): + assert len(node.out_nodes()) in [1, 2] + + hidden_shape = node.in_node(1).shape.copy() + + mark_input_bins(node, start_port=2) + node.out_node(0).shape = hidden_shape + + hidden_size = hidden_shape[1] + if node.has_valid('hidden_size'): + if node.hidden_size != hidden_size: + raise Error("Input shape {} for hidden size doesn't match pre-defined hidden_size in node {}".format( + node.in_node(1).shape, node.soft_get('name'))) + else: + node['hidden_size'] = hidden_size diff --git a/model-optimizer/extensions/ops/Reverse.py b/model-optimizer/extensions/ops/Reverse.py new file mode 100644 index 0000000..66dcf4e --- /dev/null +++ b/model-optimizer/extensions/ops/Reverse.py @@ -0,0 +1,47 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
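Shape arithmetic of rnn_infer above, traced for an assumed ONNX-like bidirectional case (seq_len=10, batch=2, hidden_size=32, has_num_directions=True, sequence_dim=0, batch_dim=1):
import numpy as np
seq_len, batch, hidden = 10, 2, 32
out_shape = np.array([seq_len, batch, hidden], dtype=np.int64)  # input layout preserved
num_directions = 2                                              # direction == 'bidirectional'
out_shape = np.insert(out_shape, 1, np.int64(num_directions))   # -> [10, 2, 2, 32]
state_size = np.array([batch, hidden], dtype=np.int64)
state_size = np.insert(state_size, 0, num_directions)           # -> [2, 2, 32] per state output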
+""" +from mo.graph.graph import Graph +from mo.ops.op import Op + + +class Reverse(Op): + op = 'Reverse' + + def __init__(self, graph: Graph, attrs: dict): + mandatory_props = { + # 'type': __class__.op, # Internal MO primitive + 'axis': None, + 'op': __class__.op, + 'infer': __class__.infer, + } + super().__init__(graph, mandatory_props, attrs) + + @staticmethod + def infer(node): + input_data_shape = node.in_node(0).shape + assert input_data_shape is not None + if not node.has_valid('axis'): + assert 1 in node.in_nodes() + assert node.in_node(1).has_valid('value') + assert node.in_node(1).value.size == 1 + + node['axis'] = node.in_node(1).value.item() + node.in_port(1).disconnect() + + assert node.has_valid('axis') + + assert len(node.out_nodes()) == 1 + node.out_node().shape = input_data_shape.copy() diff --git a/model-optimizer/extensions/ops/SquaredDifference.py b/model-optimizer/extensions/ops/SquaredDifference.py index f1c3735..ce85851 100644 --- a/model-optimizer/extensions/ops/SquaredDifference.py +++ b/model-optimizer/extensions/ops/SquaredDifference.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ import networkx as nx from mo.front.common.partial_infer.eltwise import eltwise_infer +from mo.graph.graph import Graph from mo.ops.op import Op @@ -24,9 +25,11 @@ class SquaredDifference(Op): op = 'SquaredDifference' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'type': __class__.op, # IE layer type, not required if this op won't be dumped to IE 'op': __class__.op, # internal MO name for the operation, can be the same as type; required + 'in_ports_count': 2, + 'out_ports_count': 1, 'infer': lambda node: eltwise_infer(node, lambda a, b: (a - b) ** 2)}, attrs) diff --git a/model-optimizer/extensions/ops/TensorArray.py b/model-optimizer/extensions/ops/TensorArray.py index 9108e05..6fd80f5 100644 --- a/model-optimizer/extensions/ops/TensorArray.py +++ b/model-optimizer/extensions/ops/TensorArray.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,17 +14,16 @@ limitations under the License. """ -import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class TensorArray(Op): op = "TensorArrayV3" - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, diff --git a/model-optimizer/extensions/ops/TensorArrayGather.py b/model-optimizer/extensions/ops/TensorArrayGather.py index 221c0c9..ef6a05a 100644 --- a/model-optimizer/extensions/ops/TensorArrayGather.py +++ b/model-optimizer/extensions/ops/TensorArrayGather.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -17,7 +17,7 @@ import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op from mo.utils.utils import symm_match_shapes @@ -25,7 +25,7 @@ from mo.utils.utils import symm_match_shapes class TensorArrayGather(Op): op = "TensorArrayGatherV3" - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, diff --git a/model-optimizer/extensions/ops/TensorArrayRead.py b/model-optimizer/extensions/ops/TensorArrayRead.py index 2b35159..6184e45 100644 --- a/model-optimizer/extensions/ops/TensorArrayRead.py +++ b/model-optimizer/extensions/ops/TensorArrayRead.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,14 +17,14 @@ import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class TensorArrayReader(Op): op = "TensorArrayReadV3" - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, diff --git a/model-optimizer/extensions/ops/TensorArrayScatter.py b/model-optimizer/extensions/ops/TensorArrayScatter.py index cb30e87..4f46007 100644 --- a/model-optimizer/extensions/ops/TensorArrayScatter.py +++ b/model-optimizer/extensions/ops/TensorArrayScatter.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,10 +14,9 @@ limitations under the License. """ -import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op from mo.utils.utils import match_shapes @@ -25,7 +24,7 @@ from mo.utils.utils import match_shapes class TensorArrayScatter(Op): op = "TensorArrayScatterV3" - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, diff --git a/model-optimizer/extensions/ops/TensorArraySize.py b/model-optimizer/extensions/ops/TensorArraySize.py index a16a06a..b5feac8 100644 --- a/model-optimizer/extensions/ops/TensorArraySize.py +++ b/model-optimizer/extensions/ops/TensorArraySize.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,17 +14,16 @@ limitations under the License. 
""" -import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class TensorArraySize(Op): op = "TensorArraySizeV3" - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, diff --git a/model-optimizer/extensions/ops/TensorArrayWrite.py b/model-optimizer/extensions/ops/TensorArrayWrite.py index 4330460..d9ace73 100644 --- a/model-optimizer/extensions/ops/TensorArrayWrite.py +++ b/model-optimizer/extensions/ops/TensorArrayWrite.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,10 +14,9 @@ limitations under the License. """ -import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op from mo.utils.utils import match_shapes @@ -25,7 +24,7 @@ from mo.utils.utils import match_shapes class TensorArrayWriter(Op): op = "TensorArrayWriteV3" - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, diff --git a/model-optimizer/extensions/ops/TensorIterator_ops.py b/model-optimizer/extensions/ops/TensorIterator_ops.py index 8e408b8..bac24af 100644 --- a/model-optimizer/extensions/ops/TensorIterator_ops.py +++ b/model-optimizer/extensions/ops/TensorIterator_ops.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,15 +16,16 @@ import networkx as nx -import numpy as np -from mo.graph.graph import Node + +from mo.graph.graph import Node, Graph from mo.ops.op import Op + # TODO: check all supported attributes in this file class TensorIteratorInput(Op): op = "TensorIteratorInput" - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'op': __class__.op, 'axis': None, @@ -32,6 +33,8 @@ class TensorIteratorInput(Op): 'end': None, 'stride': None, 'part_size': None, + 'in_ports_count': 3, + 'out_ports_count': 1, 'infer': TensorIteratorInput.input_infer, } super().__init__(graph, mandatory_props, attrs) @@ -47,7 +50,7 @@ class TensorIteratorInput(Op): class TensorIteratorOutput(Op): op = "TensorIteratorOutput" - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'op': __class__.op, 'axis': None, @@ -55,6 +58,8 @@ class TensorIteratorOutput(Op): 'end': None, 'stride': None, 'part_size': None, + 'in_ports_count': 3, + 'out_ports_count': 1, 'infer': TensorIteratorOutput.input_infer, } super().__init__(graph, mandatory_props, attrs) @@ -70,16 +75,15 @@ class TensorIteratorOutput(Op): class TensorIteratorCondition(Op): op = "TensorIteratorCondition" - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'op': __class__.op, + 'in_ports_count': 2, + 'out_ports_count': 2, 'infer': TensorIteratorCondition.input_infer, } super().__init__(graph, mandatory_props, attrs) - def supported_attrs(self): - return ['time', 'iter'] - @staticmethod def input_infer(node: Node): pass @@ -88,17 +92,15 @@ class TensorIteratorCondition(Op): class TensorIteratorBackEdge(Op): op = 'TensorIteratorBackEdge' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'op': __class__.op, + 'in_ports_count': 3, + 'out_ports_count': 1, 'infer': TensorIteratorBackEdge.input_infer, } super().__init__(graph, mandatory_props, attrs) @staticmethod - def supported_attrs(): - return ['is_output'] - - @staticmethod def input_infer(node: Node): pass diff --git a/model-optimizer/extensions/ops/accum.py b/model-optimizer/extensions/ops/accum.py index b361c01..04446ab 100644 --- a/model-optimizer/extensions/ops/accum.py +++ b/model-optimizer/extensions/ops/accum.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,17 +14,16 @@ limitations under the License. 
""" -import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class AccumOp(Op): op = 'Accum' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, @@ -32,6 +31,7 @@ class AccumOp(Op): 'top_width': 0, 'size_divisible_by': 0, 'have_reference': 0, + 'out_ports_count': 1, 'infer': AccumOp.accum_infer } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/accum_test.py b/model-optimizer/extensions/ops/accum_test.py index b2762f3..d949b59 100644 --- a/model-optimizer/extensions/ops/accum_test.py +++ b/model-optimizer/extensions/ops/accum_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -24,20 +24,26 @@ from mo.utils.unittest.graph import build_graph wrong_attrs_graph = {'node_1': {'type': 'Identity', 'kind': 'op'}, 'accum': {'type': 'Accum', 'kind': 'op'}, - 'node_3': {'type': 'Identity', 'kind': 'op'}} + 'node_3': {'type': 'Identity', 'kind': 'op'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} + } nodes_attributes = {'node_1': {'type': 'Identity', 'kind': 'op'}, 'node_2': {'type': 'Identity', 'kind': 'op'}, 'accum': {'type': 'Accum', 'kind': 'op'}, - 'node_3': {'type': 'Identity', 'kind': 'op'}} + 'node_3': {'type': 'Identity', 'kind': 'op'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} + } class TestAccumOp(unittest.TestCase): def test_accum_infer_assertion(self): graph = build_graph(wrong_attrs_graph, [('node_1', 'accum'), - ('accum', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('accum', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'accum': { 'top_height': 0, @@ -54,8 +60,10 @@ class TestAccumOp(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'accum'), ('node_2', 'accum'), - ('accum', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('accum', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'node_2': {'shape': np.array([1, 3, 227, 227])}, 'accum': { @@ -77,8 +85,10 @@ class TestAccumOp(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'accum'), ('node_2', 'accum'), - ('accum', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('accum', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'node_2': {'shape': np.array([1, 3, 227, 227])}, 'accum': { @@ -100,8 +110,10 @@ class TestAccumOp(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'accum'), ('node_2', 'accum'), - ('accum', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('accum', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'node_2': {'shape': np.array([1, 3, 227, 227])}, 'accum': { diff --git a/model-optimizer/extensions/ops/argmax.py b/model-optimizer/extensions/ops/argmax.py index 41435cc..73cd955 100644 --- a/model-optimizer/extensions/ops/argmax.py +++ b/model-optimizer/extensions/ops/argmax.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel 
Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,22 +15,23 @@ """ import logging as log -import networkx as nx import numpy as np from mo.front.caffe.extractors.utils import get_canonical_axis_index -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op, PermuteAttrs class ArgMaxOp(Op): op = 'ArgMax' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, - 'infer': ArgMaxOp.argmax_infer + 'infer': ArgMaxOp.argmax_infer, + 'in_ports_count': 2, + 'out_ports_count': 1, } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/argmax_test.py b/model-optimizer/extensions/ops/argmax_test.py index 14edf5e..105441e 100644 --- a/model-optimizer/extensions/ops/argmax_test.py +++ b/model-optimizer/extensions/ops/argmax_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -24,7 +24,8 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'type': 'Identity', 'kind': 'op'}, 'argmax': {'type': 'ArgMax', 'kind': 'op'}, - 'node_3': {'type': 'Identity', 'kind': 'op'} + 'node_3': {'type': 'Identity', 'kind': 'op'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} } @@ -32,8 +33,10 @@ class TestArgMaxOp(unittest.TestCase): def test_caffe_argmax_axis(self): graph = build_graph(nodes_attributes, [('node_1', 'argmax'), - ('argmax', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('argmax', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 1025, 2049])}, 'argmax': { 'out_max_val': True, @@ -52,8 +55,10 @@ class TestArgMaxOp(unittest.TestCase): def test_caffe_argmax_axis_negative(self): graph = build_graph(nodes_attributes, [('node_1', 'argmax'), - ('argmax', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('argmax', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 1025, 2049])}, 'argmax': { 'out_max_val': True, @@ -73,8 +78,10 @@ class TestArgMaxOp(unittest.TestCase): def test_caffe_argmax_no_axis(self): graph = build_graph(nodes_attributes, [('node_1', 'argmax'), - ('argmax', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('argmax', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 1025, 2049])}, 'argmax': { 'out_max_val': True, @@ -92,8 +99,10 @@ class TestArgMaxOp(unittest.TestCase): def test_caffe_argmax_extend_shape(self): graph = build_graph(nodes_attributes, [('node_1', 'argmax'), - ('argmax', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('argmax', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3])}, 'argmax': { 'out_max_val': True, @@ -111,8 +120,10 @@ class TestArgMaxOp(unittest.TestCase): def test_caffe_argmax_out_max_val_false(self): graph = build_graph(nodes_attributes, [('node_1', 'argmax'), - ('argmax', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('argmax', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3])}, 'argmax': { 
'out_max_val': False, @@ -130,8 +141,10 @@ class TestArgMaxOp(unittest.TestCase): def test_caffe_argmax_no_shape(self): graph = build_graph(nodes_attributes, [('node_1', 'argmax'), - ('argmax', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('argmax', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': None}, 'argmax': { 'out_max_val': False, diff --git a/model-optimizer/extensions/ops/assert_op.py b/model-optimizer/extensions/ops/assert_op.py index 249f8fb..f79808e 100644 --- a/model-optimizer/extensions/ops/assert_op.py +++ b/model-optimizer/extensions/ops/assert_op.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,9 +14,7 @@ limitations under the License. """ -import networkx as nx - -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op from mo.utils.error import Error @@ -24,7 +22,7 @@ from mo.utils.error import Error class Assert(Op): op = 'Assert' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'op': __class__.op, 'infer': Assert.assert_infer, diff --git a/model-optimizer/extensions/ops/assert_test.py b/model-optimizer/extensions/ops/assert_test.py index 37417d5..9d83df7 100644 --- a/model-optimizer/extensions/ops/assert_test.py +++ b/model-optimizer/extensions/ops/assert_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/ops/axpy.py b/model-optimizer/extensions/ops/axpy.py index 26e15cd..6534ed4 100644 --- a/model-optimizer/extensions/ops/axpy.py +++ b/model-optimizer/extensions/ops/axpy.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,8 +14,7 @@ limitations under the License. """ -import networkx as nx - +from mo.graph.graph import Graph from mo.ops.op import Op @@ -26,7 +25,7 @@ class AxpyOp(Op): op = 'Axpy' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'type': __class__.op, 'op': __class__.op, diff --git a/model-optimizer/extensions/ops/binarization.py b/model-optimizer/extensions/ops/binarization.py new file mode 100644 index 0000000..ab2c0e3 --- /dev/null +++ b/model-optimizer/extensions/ops/binarization.py @@ -0,0 +1,32 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
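The test migrations above replace the former 'is_output': True marker on the last data node with an explicit OpOutput sink node plus an extra edge. A condensed sketch of the new build_graph call, reusing the attributes from the argmax test above:

    import numpy as np

    from mo.utils.unittest.graph import build_graph

    graph = build_graph(
        {'node_1': {'type': 'Identity', 'kind': 'op'},
         'argmax': {'type': 'ArgMax', 'kind': 'op'},
         'node_3': {'type': 'Identity', 'kind': 'op'},
         'op_output': {'kind': 'op', 'op': 'OpOutput'}},
        [('node_1', 'argmax'),
         ('argmax', 'node_3'),
         ('node_3', 'op_output')],   # explicit sink edge instead of 'is_output': True
        {'node_3': {'shape': None},
         'node_1': {'shape': np.array([1, 3, 1025, 2049])}})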
+""" + +from mo.graph.graph import Graph +from mo.ops.op import Op + + +class Binarization(Op): + op = 'Binarization' + + def __init__(self, graph: Graph, attrs: dict): + mandatory_props = { + 'op': __class__.op, + 'infer': None, + 'dst_type': None, + 'in_ports_count': 1, + 'out_ports_count': 1, + } + super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/bn.py b/model-optimizer/extensions/ops/bn.py index 69f7bf1..4b7cd86 100644 --- a/model-optimizer/extensions/ops/bn.py +++ b/model-optimizer/extensions/ops/bn.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,8 +14,7 @@ limitations under the License. """ -import networkx as nx - +from mo.graph.graph import Graph from mo.ops.op import Op @@ -26,9 +25,11 @@ class BNOp(Op): op = 'BN' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'type': __class__.op, 'op': __class__.op, + 'in_ports_count': 5, + 'out_ports_count': 1, 'infer': None }, attrs) diff --git a/model-optimizer/extensions/ops/constant_fill.py b/model-optimizer/extensions/ops/constant_fill.py index 0a51160..1f9655f 100644 --- a/model-optimizer/extensions/ops/constant_fill.py +++ b/model-optimizer/extensions/ops/constant_fill.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,29 +14,27 @@ limitations under the License. """ -import logging as log - -import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op -from mo.utils.utils import refer_to_faq_msg class ConstantFill(Op): - ''' Constant blob generation by broadcasting specified value to a given shape. + """ Constant blob generation by broadcasting specified value to a given shape. It is assumed that there is no equivalent of this op in IE, so it is usually relevant to constant folding. - ''' + """ op = 'ConstantFill' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { - 'type': None, # do not set type as there is no IE equivalent + 'type': __class__.op, 'op': __class__.op, 'input_as_shape': 1, + 'in_ports_count': 1, + 'out_ports_count': 1, 'infer': __class__.infer } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/correlation.py b/model-optimizer/extensions/ops/correlation.py index b61ed48..715830f 100644 --- a/model-optimizer/extensions/ops/correlation.py +++ b/model-optimizer/extensions/ops/correlation.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,22 +16,23 @@ from math import ceil -import networkx as nx # Concat infer : N - number of inputs to concat # axis - dimension number for tensors concatenation import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class CorrelationOp(Op): op = 'Correlation' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, + 'in_ports_count': 1, + 'out_ports_count': 1, 'infer': CorrelationOp.corr_infer } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/correlation_test.py b/model-optimizer/extensions/ops/correlation_test.py index a47aec2..0ec121d 100644 --- a/model-optimizer/extensions/ops/correlation_test.py +++ b/model-optimizer/extensions/ops/correlation_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,7 +25,8 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'type': 'Identity', 'kind': 'op'}, 'node_2': {'type': 'Identity', 'kind': 'op'}, 'corr': {'type': 'Correlation', 'kind': 'op'}, - 'node_3': {'type': 'Identity', 'kind': 'op'} + 'node_3': {'type': 'Identity', 'kind': 'op'}, + 'op_output': {'kind': 'op', 'op': 'OpOutput'} } @@ -35,9 +36,11 @@ class TestConcatPartialInfer(unittest.TestCase): [ ('node_1', 'corr'), ('node_2', 'corr'), - ('corr', 'node_3')], + ('corr', 'node_3'), + ('node_3', 'op_output') + ], { - 'node_3': {'is_output': True, 'shape': None}, + 'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'node_2': {'shape': np.array([1, 3, 227, 227])}, 'corr': {'pad': 20, diff --git a/model-optimizer/extensions/ops/ctc_greedy_decoder.py b/model-optimizer/extensions/ops/ctc_greedy_decoder.py index 1d032cc..fb6dabd 100644 --- a/model-optimizer/extensions/ops/ctc_greedy_decoder.py +++ b/model-optimizer/extensions/ops/ctc_greedy_decoder.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,20 +14,21 @@ limitations under the License. """ -import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class CTCGreedyDecoderOp(Op): op = 'CTCGreedyDecoder' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, + 'in_ports_count': 2, + 'out_ports_count': 1, 'infer': CTCGreedyDecoderOp.ctc_greedy_decoder_infer } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/ctc_greedy_decoder_test.py b/model-optimizer/extensions/ops/ctc_greedy_decoder_test.py index b5a9217..40e3794 100644 --- a/model-optimizer/extensions/ops/ctc_greedy_decoder_test.py +++ b/model-optimizer/extensions/ops/ctc_greedy_decoder_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -25,7 +25,8 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'type': 'Identity', 'kind': 'op'}, 'node_2': {'type': 'Identity', 'kind': 'op'}, 'ctc': {'type': 'CTCGreedyDecoder', 'kind': 'op'}, - 'node_3': {'type': 'Identity', 'kind': 'op'} + 'node_3': {'type': 'Identity', 'kind': 'op'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'}, } @@ -35,9 +36,11 @@ class TestConcatPartialInfer(unittest.TestCase): [ ('node_1', 'ctc'), ('node_2', 'ctc'), - ('ctc', 'node_3')], + ('ctc', 'node_3'), + ('node_3', 'op_output') + ], { - 'node_3': {'is_output': True, 'shape': None}, + 'node_3': {'shape': None}, 'node_1': {'shape': np.array([88, 2, 71])}, 'node_2': {'shape': np.array([88, 2])}, 'ctc': {'ctc_merge_repeated': 1} diff --git a/model-optimizer/extensions/ops/data_augmentation.py b/model-optimizer/extensions/ops/data_augmentation.py index c49ff92..46d99bf 100644 --- a/model-optimizer/extensions/ops/data_augmentation.py +++ b/model-optimizer/extensions/ops/data_augmentation.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,17 +20,19 @@ import copy import networkx as nx -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class DataAugmentationOp(Op): op = 'DataAugmentation' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, + 'in_ports_count': 1, + 'out_ports_count': 1, 'infer': DataAugmentationOp.data_augmentation_infer } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/data_augmentation_test.py b/model-optimizer/extensions/ops/data_augmentation_test.py index d8b30e3..6d570a8 100644 --- a/model-optimizer/extensions/ops/data_augmentation_test.py +++ b/model-optimizer/extensions/ops/data_augmentation_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,7 +25,8 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = { 'node_1': {'type': 'Identity', 'kind': 'op'}, 'da': {'type': 'DataAugmentation', 'kind': 'op'}, - 'node_3': {'type': 'Identity', 'kind': 'op'} + 'node_3': {'type': 'Identity', 'kind': 'op'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} } @@ -34,9 +35,11 @@ class TestConcatPartialInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [ ('node_1', 'da'), - ('da', 'node_3')], + ('da', 'node_3'), + ('node_3', 'op_output') + ], { - 'node_3': {'is_output': True, 'shape': None}, + 'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'da': {'crop_width': 225, 'crop_height': 225, diff --git a/model-optimizer/extensions/ops/depth_to_space.py b/model-optimizer/extensions/ops/depth_to_space.py index 0e75495..5de83b2 100644 --- a/model-optimizer/extensions/ops/depth_to_space.py +++ b/model-optimizer/extensions/ops/depth_to_space.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,19 +16,21 @@ import logging as log -import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.front.common.partial_infer.utils import int64_array +from mo.graph.graph import Node, Graph from mo.ops.op import Op class DepthToSpaceOp(Op): op = 'DepthToSpace' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'op': __class__.op, + 'in_ports_count': 1, + 'out_ports_count': 1, 'infer': DepthToSpaceOp.depth_to_space_infer } super().__init__(graph, mandatory_props, attrs) @@ -50,4 +52,4 @@ class DepthToSpaceOp(Op): out_shape = [N, int(H * block_size), int(W * block_size), int(C / (block_size ** 2))] if np.prod(in_shape) != np.prod(out_shape): return - node.out_node().shape = out_shape + node.out_node().shape = int64_array(out_shape) diff --git a/model-optimizer/extensions/ops/depth_to_space_test.py b/model-optimizer/extensions/ops/depth_to_space_test.py index 26b3c4e..b0a0b68 100644 --- a/model-optimizer/extensions/ops/depth_to_space_test.py +++ b/model-optimizer/extensions/ops/depth_to_space_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/ops/detectionoutput_onnx.py b/model-optimizer/extensions/ops/detectionoutput_onnx.py new file mode 100644 index 0000000..8566e8a --- /dev/null +++ b/model-optimizer/extensions/ops/detectionoutput_onnx.py @@ -0,0 +1,59 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +import numpy as np + +from mo.ops.op import Op + + +class ExperimentalDetectronDetectionOutput(Op): + op = 'ExperimentalDetectronDetectionOutput' + enabled = True + + def __init__(self, graph, attrs): + mandatory_props = dict( + type=__class__.op, + op=__class__.op, + infer=__class__.infer + ) + + super().__init__(graph, mandatory_props, attrs) + + def backend_attrs(self): + return [ + 'class_agnostic_box_regression', + 'max_detections_per_image', + 'nms_threshold', + 'num_classes', + 'post_nms_count', + 'score_threshold', + 'max_delta_log_wh', + ('deltas_weights', lambda node: ','.join(map(str, node['deltas_weights'])))] + + @staticmethod + def infer(node): + rois_num = node.max_detections_per_image + # boxes + node.out_node(0).shape = np.array([rois_num, 4], dtype=np.int64) + try: + # classes + node.out_node(1).shape = np.array([rois_num], dtype=np.int64) + # scores + node.out_node(2).shape = np.array([rois_num], dtype=np.int64) + # batch_ids + node.out_node(3).shape = np.array([rois_num], dtype=np.int64) + except Exception as ex: + print(ex) diff --git a/model-optimizer/extensions/ops/exp.py b/model-optimizer/extensions/ops/exp.py new file mode 100644 index 0000000..8130c1f --- /dev/null +++ b/model-optimizer/extensions/ops/exp.py @@ -0,0 +1,47 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import logging as log +import networkx as nx +import numpy as np + +from mo.front.caffe.extractors.utils import get_canonical_axis_index +from mo.graph.graph import Node, Graph +from mo.ops.op import Op, PermuteAttrs + + +class ExpOp(Op): + op = 'Exp' + + def __init__(self, graph: Graph, attrs: dict): + mandatory_props = { + 'type': __class__.op, + 'op': __class__.op, + 'infer': __class__.infer, + 'in_ports_count': 1, + 'out_ports_count': 1, + } + super().__init__(graph, mandatory_props, attrs) + + @staticmethod + def infer(node: Node): + assert len(node.in_nodes()) == 1 + assert len(node.out_nodes()) == 1 + input_node = node.in_node() + assert input_node.has_valid('shape') + node.out_node().shape = input_node.shape.copy() + if input_node.has_valid('value'): + node.out_node().value = np.exp(input_node.value) diff --git a/model-optimizer/extensions/ops/exp_test.py b/model-optimizer/extensions/ops/exp_test.py new file mode 100644 index 0000000..882f9e8 --- /dev/null +++ b/model-optimizer/extensions/ops/exp_test.py @@ -0,0 +1,76 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +import unittest + +import numpy as np + +from extensions.ops.exp import ExpOp +from mo.graph.graph import Node +from mo.utils.unittest.graph import build_graph + +nodes_attributes = {'node_1': {'op': 'Identity', 'kind': 'op'}, + 'exp': {'op': 'Exp', 'kind': 'op'}, + 'node_3': {'op': 'Identity', 'kind': 'op'}, + 'op_output': {'kind': 'op', 'op': 'OpOutput'} + } + + +class TestExpOp(unittest.TestCase): + def test_shape_only(self): + graph = build_graph(nodes_attributes, + [('node_1', 'exp'), + ('exp', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, + 'node_1': {'shape': np.array([1, 3, 10, 20])}, + }) + + exp_node = Node(graph, 'exp') + ExpOp.infer(exp_node) + exp_shape = np.array([1, 3, 10, 20]) + res_shape = graph.node['node_3']['shape'] + for i in range(0, len(exp_shape)): + self.assertEqual(exp_shape[i], res_shape[i]) + + def test_shape_and_value(self): + graph = build_graph(nodes_attributes, + [('node_1', 'exp'), + ('exp', 'node_3'), + ('node_3', 'op_output') + ], + { + 'node_3': { + 'shape': None, + 'value': None, + }, + 'node_1': { + 'shape': np.array([2]), + 'value': np.array([0, 1], dtype=np.float32), + }, + }) + + exp_node = Node(graph, 'exp') + ExpOp.infer(exp_node) + exp_shape = np.array([2]) + exp_value = np.array([1, 2.7182818], dtype=np.float32) + res_shape = graph.node['node_3']['shape'] + res_value = graph.node['node_3']['value'] + for i in range(0, len(exp_shape)): + self.assertEqual(exp_shape[i], res_shape[i]) + for i in range(0, len(exp_value)): + self.assertAlmostEqual(exp_value[i], res_value[i], places=6) diff --git a/model-optimizer/extensions/ops/gather.py b/model-optimizer/extensions/ops/gather.py index 255fd1f..210633d 100644 --- a/model-optimizer/extensions/ops/gather.py +++ b/model-optimizer/extensions/ops/gather.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,22 +14,26 @@ limitations under the License. """ +import logging as log + import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class Gather(Op): op = 'Gather' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, 'axis': 0, - 'infer': __class__.infer + 'in_ports_count': 3, + 'out_ports_count': 1, + 'infer': __class__.infer, } super().__init__(graph, mandatory_props, attrs) @@ -62,6 +66,6 @@ class Gather(Op): shape = np.concatenate((data.shape[:axis], indices.shape)) if axis < len(data.shape) - 1: - shape = np.concatenate((shape, data.shape[axis+1:])) + shape = np.concatenate((shape, data.shape[axis + 1:])) node.out_node(0).shape = np.array(shape, dtype=np.int64) diff --git a/model-optimizer/extensions/ops/gather_test.py b/model-optimizer/extensions/ops/gather_test.py index 4f749f7..1a6c5ce 100644 --- a/model-optimizer/extensions/ops/gather_test.py +++ b/model-optimizer/extensions/ops/gather_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/extensions/ops/grn.py b/model-optimizer/extensions/ops/grn.py index 3a8df99..33d3c64 100644 --- a/model-optimizer/extensions/ops/grn.py +++ b/model-optimizer/extensions/ops/grn.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,16 +17,19 @@ import networkx as nx from mo.front.common.partial_infer.elemental import copy_shape_infer +from mo.graph.graph import Graph from mo.ops.op import Op class GRNOp(Op): op = 'GRN' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, + 'in_ports_count': 1, + 'out_ports_count': 1, 'infer': copy_shape_infer } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/grn_test.py b/model-optimizer/extensions/ops/grn_test.py index 351023f..6781dea 100644 --- a/model-optimizer/extensions/ops/grn_test.py +++ b/model-optimizer/extensions/ops/grn_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -24,15 +24,19 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'type': 'Identity', 'kind': 'op'}, 'grn': {'type': 'GRN', 'kind': 'op'}, - 'node_3': {'type': 'Identity', 'kind': 'op'}} + 'node_3': {'type': 'Identity', 'kind': 'op'}, + 'op_output': {'kind': 'op', 'op': 'OpOutput'}, + } class TestGRNOp(unittest.TestCase): def test_grn_infer(self): graph = build_graph(nodes_attributes, [('node_1', 'grn'), - ('grn', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('grn', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'grn': {'bias': 1} }) diff --git a/model-optimizer/extensions/ops/identity.py b/model-optimizer/extensions/ops/identity.py index 30995a1..dbc27b7 100644 --- a/model-optimizer/extensions/ops/identity.py +++ b/model-optimizer/extensions/ops/identity.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,27 +13,24 @@ See the License for the specific language governing permissions and limitations under the License. 
""" - -import networkx as nx - -from mo.front.common.partial_infer.elemental import copy_shape_infer +from mo.front.common.partial_infer.elemental import copy_shape_infer, copy_value +from mo.graph.graph import Graph from mo.ops.op import Op -from mo.front.common.partial_infer.utils import mark_input_bins class IdentityOp(Op): op = 'Identity' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { - 'type': __class__.op, 'op': __class__.op, 'identity': True, + 'in_ports_count': 1, + 'out_ports_count': 1, 'infer': IdentityOp.shape_infer }, attrs) @staticmethod def shape_infer(node): - copy_shape_infer(node) - + copy_shape_infer(node, value_infer=copy_value) diff --git a/model-optimizer/extensions/ops/instance_normalization.py b/model-optimizer/extensions/ops/instance_normalization.py index b1c9b37..9e2deb7 100644 --- a/model-optimizer/extensions/ops/instance_normalization.py +++ b/model-optimizer/extensions/ops/instance_normalization.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ import networkx as nx +from mo.graph.graph import Graph from mo.ops.op import Op @@ -29,7 +30,7 @@ class InstanceNormalization(Op): op = 'InstanceNormalization' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'op': __class__.op, 'epsilon': None, diff --git a/model-optimizer/extensions/ops/instance_normalization_test.py b/model-optimizer/extensions/ops/instance_normalization_test.py index e106f47..5318f2f 100644 --- a/model-optimizer/extensions/ops/instance_normalization_test.py +++ b/model-optimizer/extensions/ops/instance_normalization_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,12 +17,12 @@ import unittest import networkx as nx - +from mo.graph.graph import Graph from extensions.ops.instance_normalization import InstanceNormalization class InstanceNormalizationOp(unittest.TestCase): def test_constructor_supported_attrs(self): - graph = nx.MultiDiGraph() + graph = Graph() op = InstanceNormalization(graph, attrs={'epsilon': 0.1}) self.assertEqual(op.supported_attrs(), ['epsilon']) diff --git a/model-optimizer/extensions/ops/interp.py b/model-optimizer/extensions/ops/interp.py index 8768582..b7eefc7 100644 --- a/model-optimizer/extensions/ops/interp.py +++ b/model-optimizer/extensions/ops/interp.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -17,11 +17,9 @@ import inspect import logging as log -import networkx as nx - from extensions.ops.resize_factor_utils import factor_update from mo.front.common.layout import get_batch_dim, get_features_dim, get_height_dim, get_width_dim, shape_for_layout -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op from mo.utils.utils import refer_to_faq_msg @@ -29,13 +27,15 @@ from mo.utils.utils import refer_to_faq_msg class InterpOp(Op): op = 'Interp' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, 'factor': None, 'align_corners': 1, 'parse_2nd_input': 'value', + 'in_ports_count': 2, + 'out_ports_count': 1, 'infer': InterpOp.interp_infer } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/interp_test.py b/model-optimizer/extensions/ops/interp_test.py index cf2bbc9..b2670d2 100644 --- a/model-optimizer/extensions/ops/interp_test.py +++ b/model-optimizer/extensions/ops/interp_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,7 +25,8 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'type': 'Identity', 'kind': 'op'}, 'node_2': {'type': 'Identity', 'value': None, 'kind': 'data'}, 'interp': {'type': 'Interp', 'kind': 'op', 'factor': None, 'parse_2nd_input': 'value'}, - 'node_3': {'type': 'Identity', 'shape': None, 'value': None, 'kind': 'data'} + 'node_3': {'type': 'Identity', 'shape': None, 'value': None, 'kind': 'data'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} } @@ -33,8 +34,10 @@ class TestInterpOp(unittest.TestCase): def test_caffe_interp_infer_shrink(self): graph = build_graph(nodes_attributes, [('node_1', 'interp'), - ('interp', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('interp', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 1025, 2049])}, 'interp': {'shrink_factor': 2, 'height': 0, @@ -55,8 +58,10 @@ class TestInterpOp(unittest.TestCase): def test_caffe_interp_infer_wh(self): graph = build_graph(nodes_attributes, [('node_1', 'interp'), - ('interp', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('interp', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 1024, 1, 1])}, 'interp': {'width': 65, 'height': 33, @@ -77,8 +82,10 @@ class TestInterpOp(unittest.TestCase): def test_caffe_interp_infer_zoom(self): graph = build_graph(nodes_attributes, [('node_1', 'interp'), - ('interp', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('interp', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 256, 33, 65])}, 'interp': {'zoom_factor': 2, 'height': 0, @@ -99,8 +106,10 @@ class TestInterpOp(unittest.TestCase): def test_caffe_interp_infer_zoom_shrink(self): graph = build_graph(nodes_attributes, [('node_1', 'interp'), - ('interp', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('interp', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 256, 33, 65])}, 'interp': {'zoom_factor': 2, 'height': 0, @@ -121,8 +130,10 @@ class TestInterpOp(unittest.TestCase): def 
test_caffe_interp_infer_zoom_shrink_error(self): graph = build_graph(nodes_attributes, [('node_1', 'interp'), - ('interp', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('interp', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 256, 33, 65])}, 'interp': {'zoom_factor': 0, 'height': 0, @@ -140,8 +151,10 @@ class TestInterpOp(unittest.TestCase): def test_caffe_interp_infer_zoom_default(self): graph = build_graph(nodes_attributes, [('node_1', 'interp'), - ('interp', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('interp', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 256, 33, 65])}, 'interp': {'zoom_factor': 1, 'height': 0, @@ -164,8 +177,10 @@ class TestInterpOp(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'interp'), ('node_2', 'interp'), - ('interp', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('interp', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 256, 33, 66])}, 'node_2': {'shape': np.array([1, 1, 3, 6])}, 'interp': {'zoom_factor': 1, diff --git a/model-optimizer/extensions/ops/lstm_cell.py b/model-optimizer/extensions/ops/lstm_cell.py index 1d1c545..75811c4 100644 --- a/model-optimizer/extensions/ops/lstm_cell.py +++ b/model-optimizer/extensions/ops/lstm_cell.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ import networkx as nx from mo.front.common.partial_infer.utils import mark_input_bins -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op from mo.utils.error import Error @@ -40,17 +40,32 @@ class LSTMCell(Op): ''' op = 'LSTMCell' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, - 'infer': __class__.infer + 'infer': __class__.infer, + 'in_ports_count': 5, + 'out_ports_count': 2, } super().__init__(graph, mandatory_props, attrs) def supported_attrs(self): return [ 'hidden_size', # number of the elements in hidden cell size + 'activations', + 'activation_alpha', + 'activation_beta', + 'clip', + ] + + def backend_attrs(self): + return [ + 'hidden_size', # number of the elements in hidden cell size + ('activations', lambda node: ','.join(node.activations) if node.activations is not None else None), + 'activation_alpha', + 'activation_beta', + 'clip', ] @staticmethod diff --git a/model-optimizer/extensions/ops/lstm_sequence.py b/model-optimizer/extensions/ops/lstm_sequence.py index 0f3c63b..ad590bb 100644 --- a/model-optimizer/extensions/ops/lstm_sequence.py +++ b/model-optimizer/extensions/ops/lstm_sequence.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,15 +14,11 @@ limitations under the License. 
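The LSTMCell hunk above adds a serializer for the new 'activations' attribute to backend_attrs(): it joins the activation names for the IR, or passes None through. Its behaviour in isolation (the activation list is illustrative):

    # mirrors: ('activations', lambda node: ','.join(node.activations)
    #                            if node.activations is not None else None)
    render = lambda activations: ','.join(activations) if activations is not None else None
    print(render(['sigmoid', 'tanh', 'tanh']))  # -> sigmoid,tanh,tanh
    print(render(None))                         # -> None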
""" -import logging as log - -import networkx as nx import numpy as np from mo.front.common.partial_infer.utils import mark_input_bins -from mo.graph.graph import Node +from mo.graph.graph import Node, add_opoutput, Graph from mo.ops.op import Op -from mo.utils.utils import refer_to_faq_msg class LSTMSequence(Op): @@ -46,14 +42,19 @@ class LSTMSequence(Op): """ op = 'LSTMSequence' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': '__LSTMSequence', # should be never emitted to IR; for debugging purposes 'op': __class__.op, 'blobs_wrb': False, 'has_num_directions': False, 'direction': 'forward', - 'infer': __class__.infer + 'num_layers': 1, + 'infer': __class__.infer, + 'blob_bidirectional_split': lambda node: ( + LSTMSequence.split_helper(node, 0, 'forward'), + LSTMSequence.split_helper(node, 1, 'reverse') + ) } super().__init__(graph, mandatory_props, attrs) @@ -74,13 +75,21 @@ class LSTMSequence(Op): ] @staticmethod + def split_helper(node, index: int, direction: str): + return Op._create_data_node( + node.graph, + name=node.name + '/SplittedBiLSTM/{}/'.format(direction), + attrs={'value': node.value[index], 'shape': np.array(node.value[index].shape, dtype=np.int64)} + ) + + @staticmethod def infer(node: Node): # there are limitations coming from ONNX LSTM definition and normalization rules assert len(node.in_nodes()) >= 3 # X, W and R assert len(node.in_nodes()) <= 7 assert len(node.out_nodes()) <= 3 assert node.batch_dim <= 1 - assert node.sequence_dim <=1 + assert node.sequence_dim <= 1 assert node.batch_dim != node.sequence_dim assert node.direction in ['forward', 'reverse', 'bidirectional'] @@ -91,11 +100,21 @@ class LSTMSequence(Op): mark_input_bins(node) input_shape = node.in_node(0).shape assert len(input_shape) == 3 + + for port in [2, 3]: + if port in node.in_nodes() and len(node.in_node(port).in_nodes()) > 0 and \ + 'zero_shapes' in node.in_node(port).in_node(): + for i in node.in_node(port).in_node().zero_shapes: + if node.in_node(port).shape[i] != input_shape[i]: + node.in_node(port).value = np.repeat(node.in_node(port).value, input_shape[i], axis=i) + node.in_node(port).shape[i] = input_shape[i] + out_shape = np.array([input_shape[node.sequence_dim], input_shape[node.batch_dim], node.hidden_size], dtype=np.int64) assert not node.has_num_directions or node.sequence_dim == 0, \ 'If has_num_directions == True, then node.sequence_dim should be equal 0, but it is {}'.format( node.sequence_dim) num_directions = 2 if node.direction in ['bidirectional'] else 1 + num_layers = node.num_layers if node.has_num_directions: # insert extra dimension to output shape for num_directions out_shape = np.insert(out_shape, 1, np.int64(num_directions)) @@ -103,15 +122,16 @@ class LSTMSequence(Op): # extra outputs for hidden/cell states state_size = np.array([input_shape[1], node.hidden_size], dtype=np.int64) if node.has_num_directions: - state_size = np.insert(state_size, 0, num_directions) + state_size = np.insert(state_size, 0, num_directions*num_layers) for i in [1,2]: if i not in node.out_nodes(): data_node = Op._create_data_node( node.graph, name=node.node+'/ExtraOutput/' + str(i), - attrs={'is_output': True, 'executable': None} + attrs={'executable': True} ) node.graph.add_edge(node.id, data_node.id, key=0, out=i) + add_opoutput(node.graph, data_node.id, 0, False) else: data_node = node.out_node(i) data_node.shape = state_size.copy() diff --git a/model-optimizer/extensions/ops/merge.py 
b/model-optimizer/extensions/ops/merge.py index 040cbf5..a106c90 100644 --- a/model-optimizer/extensions/ops/merge.py +++ b/model-optimizer/extensions/ops/merge.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,17 +14,17 @@ limitations under the License. """ -import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.front.common.partial_infer.utils import int64_array +from mo.graph.graph import Node, Graph from mo.ops.op import Op class Merge(Op): op = 'Merge' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'op': __class__.op, 'infer': __class__.merge_infer @@ -51,4 +51,4 @@ class Merge(Op): node.out_node().value = tensor.value.copy() if tensor.has_valid('value') else None tensor = inferred_nodes[0] - node.out_node().shape = tensor.shape + node.out_node().shape = int64_array(tensor.shape) diff --git a/model-optimizer/extensions/ops/merge_test.py b/model-optimizer/extensions/ops/merge_test.py index 755da1a..f6ee19d 100644 --- a/model-optimizer/extensions/ops/merge_test.py +++ b/model-optimizer/extensions/ops/merge_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/ops/mvn.py b/model-optimizer/extensions/ops/mvn.py index a00d935..bcf65a2 100644 --- a/model-optimizer/extensions/ops/mvn.py +++ b/model-optimizer/extensions/ops/mvn.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ import networkx as nx from mo.front.common.partial_infer.elemental import copy_shape_infer +from mo.graph.graph import Graph from mo.ops.op import Op @@ -24,7 +25,7 @@ class MVN(Op): op = 'MVN' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'kind': 'op', 'type': __class__.op, @@ -32,6 +33,8 @@ class MVN(Op): 'eps': None, 'across_channels': 0, 'normalize_variance': 1, + 'in_ports_count': 1, + 'out_ports_count': 1, 'infer': copy_shape_infer }, attrs) diff --git a/model-optimizer/extensions/ops/normalize.py b/model-optimizer/extensions/ops/normalize.py index cb6a844..c7cad95 100644 --- a/model-optimizer/extensions/ops/normalize.py +++ b/model-optimizer/extensions/ops/normalize.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,9 +14,8 @@ limitations under the License. 
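The Merge hunk above routes the inferred shape through int64_array so downstream passes always see an int64 shape rather than whatever dtype the source tensor carried. A sketch of the helper's effect, assuming it wraps np.array with dtype=np.int64 as its name and usage here suggest:

    import numpy as np

    from mo.front.common.partial_infer.utils import int64_array

    shape = int64_array([1, 3, 227, 227])
    assert shape.dtype == np.int64  # assumption: int64_array normalizes the dtype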
""" -import networkx as nx - from mo.front.common.partial_infer.elemental import copy_shape_infer +from mo.graph.graph import Graph from mo.ops.op import Op @@ -24,12 +23,14 @@ class NormalizeOp(Op): op = 'Normalize' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'kind': 'op', 'type': __class__.op, 'op': __class__.op, 'eps': None, + 'in_ports_count': 1, + 'out_ports_count': 1, 'infer': copy_shape_infer }, attrs) diff --git a/model-optimizer/extensions/ops/normalize_test.py b/model-optimizer/extensions/ops/normalize_test.py index 8a15fd6..5a57e5e 100644 --- a/model-optimizer/extensions/ops/normalize_test.py +++ b/model-optimizer/extensions/ops/normalize_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -24,7 +24,8 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'type': 'Identity', 'kind': 'op'}, 'norm': {'type': 'Normalize', 'kind': 'op'}, - 'node_3': {'type': 'Identity', 'kind': 'op'} + 'node_3': {'type': 'Identity', 'kind': 'op'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} } @@ -32,8 +33,10 @@ class TestNormalize(unittest.TestCase): def test_region_infer(self): graph = build_graph(nodes_attributes, [('node_1', 'norm'), - ('norm', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('norm', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'norm': {} }) diff --git a/model-optimizer/extensions/ops/pack.py b/model-optimizer/extensions/ops/pack.py index c6a241d..705f5bd 100644 --- a/model-optimizer/extensions/ops/pack.py +++ b/model-optimizer/extensions/ops/pack.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,16 +17,17 @@ import numpy as np import networkx as nx -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class PackOp(Op): op = 'Pack' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { - 'op': __class__.op + 'op': __class__.op, + 'out_ports_count': 1, } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/power_file.py b/model-optimizer/extensions/ops/power_file.py index 50177f9..bfe9aab 100644 --- a/model-optimizer/extensions/ops/power_file.py +++ b/model-optimizer/extensions/ops/power_file.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ import networkx as nx from mo.front.common.partial_infer.elemental import copy_shape_infer +from mo.graph.graph import Graph from mo.ops.op import Op @@ -24,10 +25,12 @@ class PowerFileOp(Op): op = 'PowerFile' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'type': __class__.op, 'op': __class__.op, + 'in_ports_count': 1, + 'out_ports_count': 1, 'infer': copy_shape_infer }, attrs) diff --git a/model-optimizer/extensions/ops/prediction_heatmap.py b/model-optimizer/extensions/ops/prediction_heatmap.py index 0db515c..35e334b 100644 --- a/model-optimizer/extensions/ops/prediction_heatmap.py +++ b/model-optimizer/extensions/ops/prediction_heatmap.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,17 +17,19 @@ import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class PredictionHeatmapOp(Op): op = 'PredictionHeatmap' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, + 'in_ports_count': 1, + 'out_ports_count': 1, 'infer': PredictionHeatmapOp.infer } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/prelu.py b/model-optimizer/extensions/ops/prelu.py index 2825ae0..2aa02df 100644 --- a/model-optimizer/extensions/ops/prelu.py +++ b/model-optimizer/extensions/ops/prelu.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ import networkx as nx from mo.front.common.partial_infer.elemental import copy_shape_infer +from mo.graph.graph import Graph from mo.ops.op import Op from mo.front.common.partial_infer.utils import mark_input_bins @@ -25,10 +26,12 @@ class PreluOp(Op): op = 'PReLU' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'type': __class__.op, 'op': __class__.op, + 'in_ports_count': 1, + 'out_ports_count': 1, 'infer': PreluOp.prelu_shape_infer }, attrs) diff --git a/model-optimizer/extensions/ops/priorbox.py b/model-optimizer/extensions/ops/priorbox.py index e494097..1793c62 100644 --- a/model-optimizer/extensions/ops/priorbox.py +++ b/model-optimizer/extensions/ops/priorbox.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,19 +14,18 @@ limitations under the License. 
""" -import networkx as nx import numpy as np from mo.front.common.layout import get_width_dim, get_height_dim from mo.front.extractor import attr_getter -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class PriorBoxOp(Op): op = 'PriorBox' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, @@ -34,6 +33,8 @@ class PriorBoxOp(Op): 'max_size': np.array([]), 'min_size': np.array([]), 'aspect_ratio': np.array([]), + 'in_ports_count': 2, + 'out_ports_count': 1, 'infer': PriorBoxOp.priorbox_infer } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/priorbox_clustered.py b/model-optimizer/extensions/ops/priorbox_clustered.py index e1fe983..f26d905 100644 --- a/model-optimizer/extensions/ops/priorbox_clustered.py +++ b/model-optimizer/extensions/ops/priorbox_clustered.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,17 +19,19 @@ import numpy as np from mo.front.common.layout import get_width_dim, get_height_dim from mo.front.extractor import attr_getter -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class PriorBoxClusteredOp(Op): op = 'PriorBoxClustered' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, + 'in_ports_count': 2, + 'out_ports_count': 1, 'infer': PriorBoxClusteredOp.priorbox_clustered_infer } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/priorbox_clustered_test.py b/model-optimizer/extensions/ops/priorbox_clustered_test.py index 849ba7e..35bb306 100644 --- a/model-optimizer/extensions/ops/priorbox_clustered_test.py +++ b/model-optimizer/extensions/ops/priorbox_clustered_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -25,7 +25,8 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'type': 'Identity', 'value': None, 'kind': 'data'}, 'node_2': {'type': 'Identity', 'value': None, 'kind': 'data'}, 'pbc': {'type': 'PriorBoxClustered', 'value': None, 'kind': 'op'}, - 'node_3': {'type': 'Identity', 'value': None, 'kind': 'data'} + 'node_3': {'type': 'Identity', 'value': None, 'kind': 'data'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} } @@ -35,9 +36,11 @@ class TestPriorBoxClusteredPartialInfer(unittest.TestCase): [ ('node_1', 'pbc'), ('node_2', 'pbc'), - ('pbc', 'node_3')], + ('pbc', 'node_3'), + ('node_3', 'op_output') + ], { - 'node_3': {'is_output': True, 'shape': None}, + 'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 384, 19, 19])}, 'node_2': {'shape': np.array([1, 3, 300, 300])}, 'pbc': {'flip': 0, 'clip': 0, 'variance': [0.1, 0.1, 0.2, 0.2], @@ -58,9 +61,11 @@ class TestPriorBoxClusteredPartialInfer(unittest.TestCase): [ ('node_1', 'pbc'), ('node_2', 'pbc'), - ('pbc', 'node_3')], + ('pbc', 'node_3'), + ('node_3', 'op_output') + ], { - 'node_3': {'is_output': True, 'shape': None}, + 'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 19, 19, 384])}, 'node_2': {'shape': np.array([1, 300, 300, 3])}, 'pbc': {'flip': 0, 'clip': 0, 'variance': [0.1, 0.1, 0.2, 0.2], diff --git a/model-optimizer/extensions/ops/priorbox_test.py b/model-optimizer/extensions/ops/priorbox_test.py index fbb42a4..74e7e1a 100644 --- a/model-optimizer/extensions/ops/priorbox_test.py +++ b/model-optimizer/extensions/ops/priorbox_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
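The test updates above and below all apply one pattern: graph outputs are no longer flagged with 'is_output': True on the result data node; instead an explicit 'OpOutput' op node is wired after it. A minimal sketch, assuming the build_graph helper and nodes_attributes dictionaries shown in these tests:

graph = build_graph(
    nodes_attributes,
    [('node_1', 'pb'),
     ('pb', 'node_3'),
     ('node_3', 'op_output')],   # explicit output-marker edge replaces the flag
    {'node_3': {'shape': None}}  # note: no 'is_output': True here any more
)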
@@ -24,8 +24,9 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'type': 'Identity', 'value': None, 'kind': 'data'}, 'pb': {'type': 'PriorBox', 'value': None, 'kind': 'op'}, - 'node_3': {'type': 'Identity', 'value': None, 'kind': 'data'} - } + 'node_3': {'type': 'Identity', 'value': None, 'kind': 'data'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} + } class TestPriorBoxPartialInfer(unittest.TestCase): @@ -33,9 +34,11 @@ class TestPriorBoxPartialInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [ ('node_1', 'pb'), - ('pb', 'node_3')], + ('pb', 'node_3'), + ('node_3', 'op_output') + ], { - 'node_3': {'is_output': True, 'shape': None}, + 'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 384, 19, 19])}, 'pb': { 'aspect_ratio': np.array([1]), @@ -47,7 +50,7 @@ class TestPriorBoxPartialInfer(unittest.TestCase): graph.graph['layout'] = 'NCHW' pb_node = Node(graph, 'pb') PriorBoxOp.priorbox_infer(pb_node) - exp_shape = np.array([1, 2, 4*19*19*2]) + exp_shape = np.array([1, 2, 4 * 19 * 19 * 2]) res_shape = graph.node['node_3']['shape'] for i in range(0, len(exp_shape)): self.assertEqual(exp_shape[i], res_shape[i]) @@ -56,9 +59,11 @@ class TestPriorBoxPartialInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [ ('node_1', 'pb'), - ('pb', 'node_3')], + ('pb', 'node_3'), + ('node_3', 'op_output') + ], { - 'node_3': {'is_output': True, 'shape': None}, + 'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 384, 19, 19])}, 'pb': { 'aspect_ratio': np.array([1, 2, 0.5]), @@ -70,7 +75,7 @@ class TestPriorBoxPartialInfer(unittest.TestCase): graph.graph['layout'] = 'NCHW' pb_node = Node(graph, 'pb') PriorBoxOp.priorbox_infer(pb_node) - exp_shape = np.array([1, 2, 4*19*19*4]) + exp_shape = np.array([1, 2, 4 * 19 * 19 * 4]) res_shape = graph.node['node_3']['shape'] for i in range(0, len(exp_shape)): self.assertEqual(exp_shape[i], res_shape[i]) @@ -79,9 +84,11 @@ class TestPriorBoxPartialInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [ ('node_1', 'pb'), - ('pb', 'node_3')], + ('pb', 'node_3'), + ('node_3', 'op_output') + ], { - 'node_3': {'is_output': True, 'shape': None}, + 'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 19, 19, 384])}, 'pb': { 'aspect_ratio': np.array([1]), @@ -93,7 +100,7 @@ class TestPriorBoxPartialInfer(unittest.TestCase): graph.graph['layout'] = 'NHWC' pb_node = Node(graph, 'pb') PriorBoxOp.priorbox_infer(pb_node) - exp_shape = np.array([1, 2, 4*19*19*2]) + exp_shape = np.array([1, 2, 4 * 19 * 19 * 2]) res_shape = graph.node['node_3']['shape'] for i in range(0, len(exp_shape)): self.assertEqual(exp_shape[i], res_shape[i]) @@ -102,9 +109,11 @@ class TestPriorBoxPartialInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [ ('node_1', 'pb'), - ('pb', 'node_3')], + ('pb', 'node_3'), + ('node_3', 'op_output') + ], { - 'node_3': {'is_output': True, 'shape': None}, + 'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 19, 19, 384])}, 'pb': { 'aspect_ratio': np.array([1, 2, 0.5]), @@ -116,7 +125,7 @@ class TestPriorBoxPartialInfer(unittest.TestCase): graph.graph['layout'] = 'NHWC' pb_node = Node(graph, 'pb') PriorBoxOp.priorbox_infer(pb_node) - exp_shape = np.array([1, 2, 4*19*19*4]) + exp_shape = np.array([1, 2, 4 * 19 * 19 * 4]) res_shape = graph.node['node_3']['shape'] for i in range(0, len(exp_shape)): self.assertEqual(exp_shape[i], res_shape[i]) diff --git a/model-optimizer/extensions/ops/priorgridgenerator_onnx.py 
b/model-optimizer/extensions/ops/priorgridgenerator_onnx.py new file mode 100644 index 0000000..7bfdba8 --- /dev/null +++ b/model-optimizer/extensions/ops/priorgridgenerator_onnx.py @@ -0,0 +1,52 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import numpy as np + +from mo.ops.op import Op + + +class ExperimentalDetectronPriorGridGenerator(Op): + op = 'ExperimentalDetectronPriorGridGenerator' + + def __init__(self, graph, attrs): + mandatory_props = dict( + type=__class__.op, + op=__class__.op, + infer=__class__.infer, + ) + super().__init__(graph, mandatory_props, attrs) + + def backend_attrs(self): + return [ + 'flatten', + 'h', + 'w', + 'stride_x', + 'stride_y', + ] + + @staticmethod + def infer(node): + input_shape = node.in_node(0).shape + priors_num = input_shape[0] + grid_h = node.in_node(1).shape[2] + grid_w = node.in_node(1).shape[3] + if node.flatten: + out_shape = np.array([grid_h * grid_w * priors_num, 4], dtype=np.int64) + else: + out_shape = np.array([grid_h, grid_w, priors_num, 4], dtype=np.int64) + node.out_node(0).shape = out_shape diff --git a/model-optimizer/extensions/ops/proposal.py b/model-optimizer/extensions/ops/proposal.py index 7eba530..8b7891b 100644 --- a/model-optimizer/extensions/ops/proposal.py +++ b/model-optimizer/extensions/ops/proposal.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,19 +18,21 @@ import networkx as nx import numpy as np from mo.front.extractor import attr_getter -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class ProposalOp(Op): op = 'Proposal' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, 'post_nms_topn': 300, # default in caffe-shared - 'infer': ProposalOp.proposal_infer + 'infer': ProposalOp.proposal_infer, + 'in_ports_count': 3, + 'out_ports_count': 1, } super().__init__(graph, mandatory_props, attrs) @@ -59,6 +61,9 @@ class ProposalOp(Op): 'framework', 'box_coordinate_scale', 'box_size_scale', + 'normalize', + 'clip_after_nms', + 'clip_before_nms', ] @staticmethod diff --git a/model-optimizer/extensions/ops/proposal_onnx.py b/model-optimizer/extensions/ops/proposal_onnx.py new file mode 100644 index 0000000..56f78f5 --- /dev/null +++ b/model-optimizer/extensions/ops/proposal_onnx.py @@ -0,0 +1,45 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
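Worked numbers for the ExperimentalDetectronPriorGridGenerator shape inference above (illustrative values, not taken from the patch): 3 priors over a 25x38 feature map.

import numpy as np

priors_num, grid_h, grid_w = 3, 25, 38
# flatten=True: one row of 4 box coordinates per grid cell per prior
flat_shape = np.array([grid_h * grid_w * priors_num, 4], dtype=np.int64)  # [2850, 4]
# flatten=False: the grid structure is kept
grid_shape = np.array([grid_h, grid_w, priors_num, 4], dtype=np.int64)    # [25, 38, 3, 4]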
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import numpy as np + +from mo.ops.op import Op + + +class ExperimentalDetectronGenerateProposalsSingleImage(Op): + op = 'ExperimentalDetectronGenerateProposalsSingleImage' + + def __init__(self, graph, attrs): + mandatory_props = dict( + type=__class__.op, + op=__class__.op, + infer=__class__.infer + ) + + super().__init__(graph, mandatory_props, attrs) + + def backend_attrs(self): + return [ + 'min_size', + 'nms_threshold', + 'post_nms_count', + 'pre_nms_count' + ] + + @staticmethod + def infer(node): + node.out_node(0).shape = np.array([node.post_nms_count, 4], dtype=np.int64) + node.out_node(1).shape = np.array([node.post_nms_count], dtype=np.int64) diff --git a/model-optimizer/extensions/ops/proposal_python_example.py b/model-optimizer/extensions/ops/proposal_python_example.py index 80c7a5b..cf9bcae 100644 --- a/model-optimizer/extensions/ops/proposal_python_example.py +++ b/model-optimizer/extensions/ops/proposal_python_example.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,13 +18,14 @@ import networkx as nx from extensions.ops.proposal import ProposalOp from mo.front.caffe.extractor import register_caffe_python_extractor +from mo.graph.graph import Graph from mo.ops.op import Op class ProposalPythonExampleOp(Op): op = 'Proposal' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, diff --git a/model-optimizer/extensions/ops/proposal_test.py b/model-optimizer/extensions/ops/proposal_test.py index 0298468..e16b147 100644 --- a/model-optimizer/extensions/ops/proposal_test.py +++ b/model-optimizer/extensions/ops/proposal_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
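The ExperimentalDetectronGenerateProposalsSingleImage inference above always reports two fixed-size outputs derived from post_nms_count. Illustrative numbers (the count is an assumed example value):

import numpy as np

post_nms_count = 1000
rois_shape = np.array([post_nms_count, 4], dtype=np.int64)  # box corners per kept proposal
scores_shape = np.array([post_nms_count], dtype=np.int64)   # one score per kept proposal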
@@ -25,7 +25,8 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'type': 'Identity', 'kind': 'op'}, 'proposal': {'type': 'proposal', 'kind': 'op'}, - 'node_3': {'type': 'Identity', 'kind': 'op'} + 'node_3': {'type': 'Identity', 'kind': 'op'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} } @@ -33,8 +34,10 @@ class TestProposal(unittest.TestCase): def test_proposal_infer(self): graph = build_graph(nodes_attributes, [('node_1', 'proposal'), - ('proposal', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('proposal', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'proposal': {'post_nms_topn': 2, **layout_attrs()} }) diff --git a/model-optimizer/extensions/ops/psroipooling.py b/model-optimizer/extensions/ops/psroipooling.py index 246e601..da84db8 100644 --- a/model-optimizer/extensions/ops/psroipooling.py +++ b/model-optimizer/extensions/ops/psroipooling.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,17 +17,20 @@ import networkx as nx from mo.front.common.layout import get_batch_dim, shape_for_layout -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class PSROIPoolingOp(Op): op = 'PSROIPooling' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, + 'mode': 'average', + 'in_ports_count': 2, + 'out_ports_count': 1, 'infer': PSROIPoolingOp.psroipooling_infer } super().__init__(graph, mandatory_props, attrs) @@ -36,7 +39,10 @@ class PSROIPoolingOp(Op): return [ 'spatial_scale', 'output_dim', - 'group_size' + 'group_size', + 'mode', + 'spatial_bins_x', + 'spatial_bins_y', ] @staticmethod diff --git a/model-optimizer/extensions/ops/psroipooling_test.py b/model-optimizer/extensions/ops/psroipooling_test.py index 10cdee1..c55bef8 100644 --- a/model-optimizer/extensions/ops/psroipooling_test.py +++ b/model-optimizer/extensions/ops/psroipooling_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
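For the PSROIPooling op above, the inferred output shape follows [num_rois, output_dim, group_size, group_size] in NCHW layout; the tests in the following hunks use 100 ROIs with output_dim=4 and group_size=15. Illustrative check:

import numpy as np

num_rois, output_dim, group_size = 100, 4, 15
out_shape = np.array([num_rois, output_dim, group_size, group_size], dtype=np.int64)  # [100, 4, 15, 15]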
@@ -25,7 +25,8 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'type': 'Identity', 'kind': 'op'}, 'node_2': {'type': 'Identity', 'kind': 'op'}, 'psroipool': {'type': 'PSROIPooling', 'kind': 'op'}, - 'node_3': {'type': 'Identity', 'kind': 'op'} + 'node_3': {'type': 'Identity', 'kind': 'op'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} } @@ -34,8 +35,10 @@ class TestPSROIPooling(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'psroipool'), ('node_2', 'psroipool'), - ('psroipool', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('psroipool', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'node_2': {'shape': np.array([100, 5])}, 'psroipool': {'output_dim': 4, 'group_size': 15} @@ -52,8 +55,10 @@ class TestPSROIPooling(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'psroipool'), ('node_2', 'psroipool'), - ('psroipool', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('psroipool', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 227, 227, 3])}, 'node_2': {'shape': np.array([100, 5])}, 'psroipool': {'output_dim': 4, 'group_size': 15} @@ -70,8 +75,10 @@ class TestPSROIPooling(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'psroipool'), ('node_2', 'psroipool'), - ('psroipool', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('psroipool', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': None}, 'node_2': {'shape': np.array([100, 5])}, 'psroipool': {'output_dim': 4, 'group_size': 224} diff --git a/model-optimizer/extensions/ops/quantize.py b/model-optimizer/extensions/ops/quantize.py new file mode 100644 index 0000000..1bd7995 --- /dev/null +++ b/model-optimizer/extensions/ops/quantize.py @@ -0,0 +1,98 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +import numpy as np + +from mo.graph.graph import Node, Graph +from mo.ops.op import Op + + +def broadcastable(broadcast_from, broadcast_to): + """Check if shape broadcast_from can be broadcasted to broadcast_to""" + broadcast_to = np.array(broadcast_to, dtype=np.int64) + broadcast_from = np.array(broadcast_from, dtype=np.int64) + if broadcast_from.size > broadcast_to.size: + return False + broadcast_from = np.concatenate( + (np.array([1] * (broadcast_to.size - broadcast_from.size), dtype=np.int64), broadcast_from)) + return np.all(np.logical_or(broadcast_from == 1, broadcast_from == broadcast_to)) + + +class QuantizeOp(Op): + op = 'Quantize' + + def __init__(self, graph: Graph, attrs: dict): + mandatory_props = { + 'type': __class__.op, + 'op': __class__.op, + 'levels': None, + 'infer': __class__.infer, + 'in_ports_count': 1, + 'out_ports_count': 1, + } + super().__init__(graph, mandatory_props, attrs) + + def supported_attrs(self): + return [ + 'levels', + ] + + @staticmethod + def infer(node: Node): + assert len(node.in_nodes()) == 5 + assert len(node.out_nodes()) == 1 + inputs = [node.in_node(i) for i in range(5)] + x, input_low, input_high, output_low, output_high = inputs + assert x.has_valid('shape') + # TODO Check all input[1..4] shapes are broadcastable to intput[0] shape + assert all([broadcastable(inputs[i].shape, inputs[0].shape) for i in range(1, 5)]), \ + "Not all shapes from Quantize inputs can be broadcasted to input[0] for node {}".format( + node.soft_get('name')) + node.out_node().shape = x.shape.copy() + + if all([node.in_node(i).has_valid('value') for i in range(5)]): + x, input_low, input_high, output_low, output_high = \ + [np.array(np.broadcast_to(node.value, x.value.shape)) for node in inputs] + + assert node.has_valid('levels') + assert isinstance(node.levels, int) + + underflow_mask = x <= input_low + overflow_mask = x > input_high + # pylint: disable=assignment-from-no-return + middle_mask = np.logical_not(np.logical_or(underflow_mask, overflow_mask)) + + def middle_part(x, input_low, input_high, output_low, output_high): + return np.round( + (x - input_low) / (input_high - input_low) * (node.levels - 1) + ) / (node.levels - 1) * (output_high - output_low) + output_low + + output = np.zeros_like(x) + # pylint: disable=unsupported-assignment-operation + output[middle_mask] = middle_part( + x[middle_mask], + input_low[middle_mask], + input_high[middle_mask], + output_low[middle_mask], + output_high[middle_mask], + ) + + # pylint: disable=unsupported-assignment-operation + output[overflow_mask] = output_high[overflow_mask] + # pylint: disable=unsupported-assignment-operation + output[underflow_mask] = output_low[underflow_mask] + + node.out_node().value = output diff --git a/model-optimizer/extensions/ops/quantize_test.py b/model-optimizer/extensions/ops/quantize_test.py new file mode 100644 index 0000000..a3fae97 --- /dev/null +++ b/model-optimizer/extensions/ops/quantize_test.py @@ -0,0 +1,135 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +""" + +import unittest + +import numpy as np + +from extensions.ops.quantize import QuantizeOp, broadcastable +from mo.graph.graph import Node +from mo.utils.unittest.graph import build_graph + + +class TestBroadcastable(unittest.TestCase): + def test_matching(self): + self.assertTrue(broadcastable([1, 2, 3], [1, 2, 3])) + + def test_incomplete(self): + self.assertTrue(broadcastable([1, 1, 1], [1, 2, 3])) + self.assertTrue(broadcastable([2, 3], [1, 2, 3])) + self.assertTrue(broadcastable([1, 3], [1, 2, 3])) + self.assertTrue(broadcastable([1, 1], [1, 2, 3])) + self.assertTrue(broadcastable([], [1, 2, 3])) + self.assertTrue(broadcastable([1], [1, 2, 3])) + + def test_reverse_incomplete(self): + self.assertFalse(broadcastable([1, 2, 3], [1, 1, 1])) + self.assertFalse(broadcastable([1, 2, 3], [2, 3])) + self.assertFalse(broadcastable([1, 2, 3], [1, 3])) + self.assertFalse(broadcastable([1, 2, 3], [1, 1])) + self.assertFalse(broadcastable( [1, 2, 3], [])) + self.assertFalse(broadcastable([1, 2, 3], [1])) + + def test_invalid(self): + self.assertFalse(broadcastable([3, 2, 1], [1, 2, 3])) + self.assertFalse(broadcastable([5], [6])) + self.assertFalse(broadcastable([5], [1])) + + +nodes_attributes = {'node_in_1': {'op': 'Identity', 'kind': 'op'}, + 'node_in_2': {'op': 'Identity', 'kind': 'op'}, + 'node_in_3': {'op': 'Identity', 'kind': 'op'}, + 'node_in_4': {'op': 'Identity', 'kind': 'op'}, + 'node_in_5': {'op': 'Identity', 'kind': 'op'}, + 'quantize': {'op': 'Quantize', 'kind': 'op', 'levels': 2}, + 'node_out_1': {'op': 'Identity', 'kind': 'op'}, + 'op_output': {'kind': 'op', 'op': 'OpOutput'} + } + + +class TestQuantizeOp(unittest.TestCase): + def test_shape_only(self): + graph = build_graph(nodes_attributes, + [('node_in_1', 'quantize'), + ('node_in_2', 'quantize'), + ('node_in_3', 'quantize'), + ('node_in_4', 'quantize'), + ('node_in_5', 'quantize'), + ('quantize', 'node_out_1'), + ('node_out_1', 'op_output') + ], + {'node_out_1': {'shape': None}, + 'node_in_1': {'shape': np.array([1, 3, 10, 20])}, + 'node_in_2': {'shape': np.array([1, 3, 10, 20])}, + 'node_in_3': {'shape': np.array([1, 3, 10, 20])}, + 'node_in_4': {'shape': np.array([1, 3, 10, 20])}, + 'node_in_5': {'shape': np.array([1, 3, 10, 20])}, + }) + + quantize_node = Node(graph, 'quantize') + QuantizeOp.infer(quantize_node) + quantize_shape = np.array([1, 3, 10, 20]) + res_shape = graph.node['node_out_1']['shape'] + for i in range(0, len(quantize_shape)): + self.assertEqual(quantize_shape[i], res_shape[i]) + + def test_shape_and_value(self): + graph = build_graph(nodes_attributes, + [('node_in_1', 'quantize'), + ('node_in_2', 'quantize'), + ('node_in_3', 'quantize'), + ('node_in_4', 'quantize'), + ('node_in_5', 'quantize'), + ('quantize', 'node_out_1'), + ('node_out_1', 'op_output') + ], + { + 'node_out_1': { + 'shape': None, + 'value': None, + }, + 'node_in_1': { + 'shape': np.array([4]), + 'value': np.array([5, 17, 0, 100], dtype=np.float32), + }, + 'node_in_2': { + 'shape': np.array([4]), + 'value': np.array([0, 12, 12, 12], dtype=np.float32), + }, + 'node_in_3': { + 'shape': np.array([4]), + 'value': np.array([10, 20, 20, 20], dtype=np.float32), + }, + 'node_in_4': { + 'shape': np.array([4]), + 'value': np.array([0, 0, 0, 0], dtype=np.float32), + }, + 'node_in_5': { + 'shape': np.array([4]), + 'value': np.array([1, 1, 1, 1], dtype=np.float32), + }, + }) + + exp_node = Node(graph, 'quantize') + QuantizeOp.infer(exp_node) + 
quantize_shape = np.array([4]) + quantize_value = np.array([0, 1, 0, 1], dtype=np.float32) + res_shape = graph.node['node_out_1']['shape'] + res_value = graph.node['node_out_1']['value'] + for i in range(0, len(quantize_shape)): + self.assertEqual(quantize_shape[i], res_shape[i]) + for i in range(0, len(quantize_value)): + self.assertAlmostEqual(quantize_value[i], res_value[i], places=6) diff --git a/model-optimizer/extensions/ops/range.py b/model-optimizer/extensions/ops/range.py new file mode 100644 index 0000000..2b02ce1 --- /dev/null +++ b/model-optimizer/extensions/ops/range.py @@ -0,0 +1,71 @@ +""" + Copyright (c) 2018-2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import logging as log + +import numpy as np + +from mo.graph.graph import Node, Graph +from mo.ops.op import Op + + +class Range(Op): + op = 'Range' + + def __init__(self, graph: Graph, attrs: dict): + mandatory_props = { + 'type': __class__.op, + 'op': __class__.op, + 'in_ports_count': 3, + 'out_ports_count': 1, + 'infer': __class__.infer, + } + super().__init__(graph, mandatory_props, attrs) + + @staticmethod + def infer(node: Node): + start = node.in_node(0) + limit = node.in_node(1) + delta = node.in_node(2) + output = node.out_node() + + if not start.has_valid('value') or not limit.has_valid('value') or not delta.has_valid('value'): + log.error("Range operation is supported with constant inputs only") + return + if 'type' in node.pb.attr: + from mo.front.tf.extractors.utils import tf_dtype_extractor + result_data_type = tf_dtype_extractor(node.pb.attr["type"].type) + else: + result_data_type = start.value.dtype + output.value = np.arange(start.value, limit.value, delta.value, dtype=result_data_type) + output.shape = np.array(output.value.shape, dtype=np.int64) + + # Some notes on the automatic result data type infer. tf.range does this differently than np.arange. NumPy + # by default creates arrays with elements of type int64 and float64, but TF does not widen data types and keeps + # them int32 and float32. + # Compare: + + # >>> tf.range(1, 5, 0.5) + # <tf.Tensor ... shape=(8,) dtype=float32> + # >>> tf.range(1, 5, 2) + # <tf.Tensor ... shape=(2,) dtype=int32> + + # >>> np.array([0.5], dtype=np.float32) + # array([0.5], dtype=float32) + # >>> np.arange(np.array([1], dtype=np.int32), np.array([5], dtype=np.int32), np.array([2], dtype=np.int32)).dtype + # dtype('int64') + # >>> np.arange(np.array([1], dtype=np.int32), np.array([5], dtype=np.int32), np.array([0.5], dtype=np.float32)).dtype + # dtype('float64') diff --git a/model-optimizer/extensions/ops/rank.py b/model-optimizer/extensions/ops/rank.py index ed17048..f6ee0cf 100644 --- a/model-optimizer/extensions/ops/rank.py +++ b/model-optimizer/extensions/ops/rank.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
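Hand-evaluating QuantizeOp.infer on the Quantize test vector above shows where each output element comes from; note that np.round uses banker's rounding, so round(0.5) == 0. A standalone sketch mirroring the formula in quantize.py:

import numpy as np

x           = np.array([5, 17, 0, 100], dtype=np.float32)
input_low   = np.array([0, 12, 12, 12], dtype=np.float32)
input_high  = np.array([10, 20, 20, 20], dtype=np.float32)
output_low  = np.array([0, 0, 0, 0], dtype=np.float32)
output_high = np.array([1, 1, 1, 1], dtype=np.float32)
levels = 2

under = x <= input_low   # element 2: x=0   -> output_low
over = x > input_high    # element 3: x=100 -> output_high
mid = ~(under | over)    # elements 0 and 1

out = np.zeros_like(x)
out[mid] = (np.round((x[mid] - input_low[mid]) / (input_high[mid] - input_low[mid]) * (levels - 1))
            / (levels - 1) * (output_high[mid] - output_low[mid]) + output_low[mid])
out[over] = output_high[over]
out[under] = output_low[under]
print(out)  # [0. 1. 0. 1.]: round(0.5)=0 for x=5, round(0.625)=1 for x=17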
@@ -17,7 +17,8 @@ import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.front.common.partial_infer.utils import int64_array +from mo.graph.graph import Node, Graph from mo.ops.op import Op from mo.utils.error import Error @@ -25,9 +26,11 @@ from mo.utils.error import Error class Rank(Op): op = 'Rank' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'op': __class__.op, + 'in_ports_count': 1, + 'out_ports_count': 1, 'infer': __class__.infer, } super().__init__(graph, mandatory_props, attrs) @@ -37,4 +40,4 @@ class Rank(Op): rank = len(node.in_node(0).shape) out_value = np.array(rank) node.out_node().value = out_value - node.out_node().shape = out_value.shape + node.out_node().shape = int64_array(out_value.shape) diff --git a/model-optimizer/extensions/ops/regionyolo.py b/model-optimizer/extensions/ops/regionyolo.py index f47245e..b35af53 100644 --- a/model-optimizer/extensions/ops/regionyolo.py +++ b/model-optimizer/extensions/ops/regionyolo.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,23 +14,24 @@ limitations under the License. """ -import networkx as nx import numpy as np from mo.front.caffe.extractors.utils import get_canonical_axis_index from mo.front.common.layout import get_batch_dim, get_height_dim, get_width_dim, shape_for_layout from mo.front.extractor import attr_getter -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class RegionYoloOp(Op): op = 'RegionYolo' - def __init__(self, graph: nx.MultiDiGraph, attrs: Node): + def __init__(self, graph: Graph, attrs: Node): mandatory_props = { 'type': __class__.op, 'op': __class__.op, + 'in_ports_count': 1, + 'out_ports_count': 1, 'infer': RegionYoloOp.regionyolo_infer } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/regionyolo_test.py b/model-optimizer/extensions/ops/regionyolo_test.py index 715163a..070837b 100644 --- a/model-optimizer/extensions/ops/regionyolo_test.py +++ b/model-optimizer/extensions/ops/regionyolo_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -25,7 +25,8 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'type': 'Identity', 'kind': 'op'}, 'region': {'type': 'RegionYolo', 'kind': 'op'}, - 'node_3': {'type': 'Identity', 'kind': 'op'} + 'node_3': {'type': 'Identity', 'kind': 'op'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} } @@ -33,8 +34,10 @@ class TestRegionYOLOCaffe(unittest.TestCase): def test_region_infer(self): graph = build_graph(nodes_attributes, [('node_1', 'region'), - ('region', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('region', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'region': {'axis': 1, 'end_axis': -1, 'do_softmax': 1, **layout_attrs()} }) @@ -49,8 +52,10 @@ class TestRegionYOLOCaffe(unittest.TestCase): def test_region_infer_flatten(self): graph = build_graph(nodes_attributes, [('node_1', 'region'), - ('region', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('region', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'region': {'end_axis': 1, 'axis': 0, 'do_softmax': 1, **layout_attrs()} }) @@ -65,8 +70,10 @@ class TestRegionYOLOCaffe(unittest.TestCase): def test_region_infer_flatten_again(self): graph = build_graph(nodes_attributes, [('node_1', 'region'), - ('region', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('region', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'region': {'end_axis': 2, 'axis': 0, 'do_softmax': 1, **layout_attrs()} }) @@ -81,8 +88,10 @@ class TestRegionYOLOCaffe(unittest.TestCase): def test_region_infer_do_softmax(self): graph = build_graph(nodes_attributes, [('node_1', 'region'), - ('region', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('region', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'region': {'do_softmax': 0, 'end_axis': -1, 'axis': 1, 'classes': 80, 'coords': 4, 'mask': np.array([6, 7, 8]), **layout_attrs()} @@ -101,8 +110,10 @@ class TestRegionYOLOTF(unittest.TestCase): def test_region_infer(self): graph = build_graph(nodes_attributes, [('node_1', 'region'), - ('region', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('region', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 227, 227, 3])}, 'region': {'axis': 1, 'end_axis': -1, 'do_softmax': 1, **layout_attrs()} }) @@ -117,8 +128,10 @@ class TestRegionYOLOTF(unittest.TestCase): def test_region_infer_do_softmax(self): graph = build_graph(nodes_attributes, [('node_1', 'region'), - ('region', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('region', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 227, 227, 3])}, 'region': {'do_softmax': 0, 'end_axis': -1, 'axis': 1, 'classes': 80, 'coords': 4, 'mask': np.array([6, 7, 8]), **layout_attrs()} diff --git a/model-optimizer/extensions/ops/reorgyolo.py b/model-optimizer/extensions/ops/reorgyolo.py index 51a2c20..e5bb9ac 100644 --- a/model-optimizer/extensions/ops/reorgyolo.py +++ b/model-optimizer/extensions/ops/reorgyolo.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance 
with the License. @@ -17,14 +17,14 @@ import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op, PermuteAttrs class ReorgYoloOp(Op): op = 'ReorgYolo' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, @@ -46,10 +46,10 @@ class ReorgYoloOp(Op): stride = node.stride output_shape = np.full_like(input_shape, -1, dtype=np.int64) - output_shape[node.batch_dims] = input_shape[node.batch_dims] - output_shape[node.channel_dims] = input_shape[node.channel_dims] * stride ** 2 + output_shape[node.batch_dims] = input_shape[node.batch_dims] # pylint: disable=unsupported-assignment-operation + output_shape[node.channel_dims] = input_shape[node.channel_dims] * stride ** 2 # pylint: disable=unsupported-assignment-operation # Round as in caffe - output_shape[node.spatial_dims] = np.round(input_shape[node.spatial_dims] / stride) + output_shape[node.spatial_dims] = np.round(input_shape[node.spatial_dims] / stride) # pylint: disable=unsupported-assignment-operation node.out_node().shape = output_shape PermuteAttrs.create_permute_attrs(node, attrs=[('channel_dims', 'input:0'), ('spatial_dims', 'input:0')]) diff --git a/model-optimizer/extensions/ops/reorgyolo_test.py b/model-optimizer/extensions/ops/reorgyolo_test.py index 7021fd5..696465c 100644 --- a/model-optimizer/extensions/ops/reorgyolo_test.py +++ b/model-optimizer/extensions/ops/reorgyolo_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,7 +25,8 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'type': 'Identity', 'kind': 'op'}, 'reorg': {'type': 'ReorgYolo', 'kind': 'op'}, - 'node_3': {'type': 'Identity', 'kind': 'op'} + 'node_3': {'type': 'Identity', 'kind': 'op'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} } @@ -42,8 +43,10 @@ class TestReorgYOLO(unittest.TestCase): def test_reorgyolo_infer(self): graph = build_graph(nodes_attributes, [('node_1', 'reorg'), - ('reorg', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('reorg', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'reorg': {'stride': 2, **layout_attrs()} diff --git a/model-optimizer/extensions/ops/resample.py b/model-optimizer/extensions/ops/resample.py index b227c00..331ab67 100644 --- a/model-optimizer/extensions/ops/resample.py +++ b/model-optimizer/extensions/ops/resample.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,22 +16,22 @@ import logging as log -import networkx as nx - from extensions.ops.resize_factor_utils import factor_update from mo.front.common.layout import get_batch_dim, get_features_dim, get_height_dim, get_width_dim, shape_for_layout -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class ResampleOp(Op): op = 'Resample' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, 'factor': None, + 'in_ports_count': 2, + 'out_ports_count': 1, 'infer': ResampleOp.resample_infer } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/resample_test.py b/model-optimizer/extensions/ops/resample_test.py index bf4c4f0..b33ba71 100644 --- a/model-optimizer/extensions/ops/resample_test.py +++ b/model-optimizer/extensions/ops/resample_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -24,7 +24,8 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'type': 'Identity', 'kind': 'op'}, 'resample': {'type': 'Resample', 'kind': 'op'}, - 'node_3': {'type': 'Identity', 'kind': 'op'} + 'node_3': {'type': 'Identity', 'kind': 'op'}, + 'op_output': {'kind': 'op', 'op': 'OpOutput'}, } @@ -32,8 +33,10 @@ class TestResampleOp(unittest.TestCase): def test_tf_resample_infer(self): graph = build_graph(nodes_attributes, [('node_1', 'resample'), - ('resample', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('resample', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'resample': {'antialias': 1, 'height': 384, @@ -54,8 +57,10 @@ class TestResampleOp(unittest.TestCase): factor = 3.0 graph = build_graph(nodes_attributes, [('node_1', 'resample'), - ('resample', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('resample', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 224, 227])}, 'resample': {'antialias': 1, 'resample_type': 'LINEAR', @@ -77,8 +82,10 @@ class TestResampleOp(unittest.TestCase): graph = build_graph(new_attrs, [('node_1', 'resample'), ('new_shape', 'resample'), - ('resample', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('resample', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 224, 227, 3])}, 'resample': {'antialias': 1, 'resample_type': 'LINEAR', diff --git a/model-optimizer/extensions/ops/resize_factor_utils.py b/model-optimizer/extensions/ops/resize_factor_utils.py index 28424d3..09a3557 100644 --- a/model-optimizer/extensions/ops/resize_factor_utils.py +++ b/model-optimizer/extensions/ops/resize_factor_utils.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
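The Resample changes above add a 'factor' attribute alongside the explicit height/width path; when factor drives the inference, the spatial dimensions are scaled by it. A sketch with the shapes from the factor test (the exact rounding is handled inside resample_infer):

import numpy as np

input_shape = np.array([1, 3, 224, 227])  # NCHW
factor = 3.0
out_hw = (input_shape[2:] * factor).astype(np.int64)  # [672, 681]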
diff --git a/model-optimizer/extensions/ops/reverse_sequence.py b/model-optimizer/extensions/ops/reverse_sequence.py index ff7329d..938eba1 100644 --- a/model-optimizer/extensions/ops/reverse_sequence.py +++ b/model-optimizer/extensions/ops/reverse_sequence.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,38 +14,36 @@ limitations under the License. """ -import logging as log -import networkx as nx -import numpy as np - -from mo.graph.graph import Node -from mo.ops.op import Op, PermuteAttrs +from mo.graph.graph import Graph +from mo.ops.op import Op class ReverseSequence(Op): op = 'ReverseSequence' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { - #'type': not set, there shouldn't be translated to real layer - 'seq_dim': None, - 'batch_dim': None, + 'type': __class__.op, + 'seq_axis': None, + 'batch_axis': 0, 'op': __class__.op, + 'in_ports_count': 2, + 'out_ports_count': 1, 'infer': __class__.infer, } super().__init__(graph, mandatory_props, attrs) def supported_attrs(self): return [ + 'seq_axis', 'batch_axis', ] - + @staticmethod def infer(node): - if not node.has_valid('seq_dim'): - assert 1 in node.in_nodes() - assert node.in_node(1).has_valid('value') - assert node.in_node(1).value.size == 1 - node['seq_dim'] = node.in_node(1).value.item() - node.graph.remove_edge(node.in_node(1).id, node.id) + input_data_shape = node.in_node(0).shape + assert input_data_shape is not None + assert node.has_valid('seq_axis') + assert node.has_valid('batch_axis') + assert len(node.out_nodes()) == 1 - node.out_node().shape = node.in_node().shape.copy() + node.out_node().shape = input_data_shape.copy() diff --git a/model-optimizer/extensions/ops/roifeatureextractor_onnx.py b/model-optimizer/extensions/ops/roifeatureextractor_onnx.py new file mode 100644 index 0000000..5477d9b --- /dev/null +++ b/model-optimizer/extensions/ops/roifeatureextractor_onnx.py @@ -0,0 +1,53 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
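For reference, the semantics behind the reworked ReverseSequence attributes: each batch item is reversed along seq_axis up to its own sequence length (a NumPy sketch of the TensorFlow-style behaviour; the op above only infers the output shape, which equals the input shape):

import numpy as np

x = np.arange(12).reshape(2, 6)  # batch_axis=0, seq_axis=1
seq_lengths = np.array([4, 6])
out = x.copy()
for b, n in enumerate(seq_lengths):
    out[b, :n] = x[b, :n][::-1]
# row 0: first 4 elements reversed, last 2 untouched; row 1: fully reversed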
+""" + +import numpy as np + +from mo.ops.op import Op + + +class ExperimentalDetectronROIFeatureExtractor(Op): + op = 'ExperimentalDetectronROIFeatureExtractor' + + def __init__(self, graph, attrs): + mandatory_props = dict( + type=__class__.op, + op=__class__.op, + infer=__class__.infer + ) + + super().__init__(graph, mandatory_props, attrs) + + def backend_attrs(self): + return [ + 'distribute_rois_between_levels', + ('pyramid_scales', lambda node: ','.join(map(str, node['pyramid_scales']))), + 'image_id', + 'output_size', + 'sampling_ratio', + 'preserve_rois_order'] + + @staticmethod + def infer(node): + input_rois_shape = node.in_node(0).shape + rois_num = input_rois_shape[0] + input_features_level_0_shape = node.in_node(1).shape + channels_num = input_features_level_0_shape[1] + node.out_node(0).shape = np.array([rois_num, channels_num, node.output_size, node.output_size], dtype=np.int64) + try: + node.out_node(1).shape = np.array([rois_num, 4], dtype=np.int64) + except Exception as ex: + print(ex) diff --git a/model-optimizer/extensions/ops/select.py b/model-optimizer/extensions/ops/select.py index b377eb2..4af65dc 100644 --- a/model-optimizer/extensions/ops/select.py +++ b/model-optimizer/extensions/ops/select.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op from mo.utils.error import Error @@ -25,9 +25,11 @@ from mo.utils.error import Error class Select(Op): op = 'Select' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'op': __class__.op, + 'in_ports_count': 3, + 'out_ports_count': 1, 'infer': __class__.infer, } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/select_test.py b/model-optimizer/extensions/ops/select_test.py index 15578d3..5fa1547 100644 --- a/model-optimizer/extensions/ops/select_test.py +++ b/model-optimizer/extensions/ops/select_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/ops/shufflechannel.py b/model-optimizer/extensions/ops/shufflechannel.py index bb10360..8577d0b 100644 --- a/model-optimizer/extensions/ops/shufflechannel.py +++ b/model-optimizer/extensions/ops/shufflechannel.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ import networkx as nx from mo.front.common.partial_infer.elemental import copy_shape_infer +from mo.graph.graph import Graph from mo.ops.op import Op @@ -27,9 +28,11 @@ class ShuffleChannelOp(Op): op = 'ShuffleChannel' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'type': None, 'op': __class__.op, + 'in_ports_count': 1, + 'out_ports_count': 1, 'infer': copy_shape_infer }, attrs) diff --git a/model-optimizer/extensions/ops/simplernms.py b/model-optimizer/extensions/ops/simplernms.py index 15d5298..cd1352a 100644 --- a/model-optimizer/extensions/ops/simplernms.py +++ b/model-optimizer/extensions/ops/simplernms.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,17 +20,19 @@ import networkx as nx import numpy as np from mo.front.extractor import attr_getter -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class SimplerNMSOp(Op): op = 'SimplerNMS' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, + 'in_ports_count': 3, + 'out_ports_count': 1, 'infer': SimplerNMSOp.simplernms_infer } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/simplernms_test.py b/model-optimizer/extensions/ops/simplernms_test.py index 08cbf53..6c44035 100644 --- a/model-optimizer/extensions/ops/simplernms_test.py +++ b/model-optimizer/extensions/ops/simplernms_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -23,15 +23,18 @@ from mo.graph.graph import Node from mo.utils.unittest.graph import build_graph nodes_attributes = {'SimplerNMS_1': {'type': 'SimplerNMS', 'kind': 'op'}, - 'node_1': {'type': 'Identity', 'kind': 'op'} + 'node_1': {'type': 'Identity', 'kind': 'op'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} } class TestSimplerNMSInfer(unittest.TestCase): def test_simplernms_infer_ideal(self): graph = build_graph(nodes_attributes, - [('SimplerNMS_1', 'node_1')], - {'node_1': {'is_output': True, 'shape': None}, + [('SimplerNMS_1', 'node_1'), + ('node_1', 'op_output') + ], + {'node_1': {'shape': None}, 'SimplerNMS_1': {'feat_stride': 16, 'post_nms_topn': 150, 'scale': [1, 2, 3]} }) @@ -46,8 +49,10 @@ class TestSimplerNMSInfer(unittest.TestCase): def test_simplernms_infer_no_shape(self): graph = build_graph(nodes_attributes, - [('SimplerNMS_1', 'node_1')], - {'node_1': {'is_output': True, 'shape': None}, + [('SimplerNMS_1', 'node_1'), + ('node_1', 'op_output') + ], + {'node_1': {'shape': None}, 'SimplerNMS_1': {'feat_stride': 12, 'post_nms_topn': 150, 'scale': [1, 2, 3]} }) diff --git a/model-optimizer/extensions/ops/spatial_transformer.py b/model-optimizer/extensions/ops/spatial_transformer.py index 3ab42a9..d914830 100644 --- a/model-optimizer/extensions/ops/spatial_transformer.py +++ b/model-optimizer/extensions/ops/spatial_transformer.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,19 +16,19 @@ import copy -import networkx as nx - -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class SpatialTransformOp(Op): op = 'SpatialTransformer' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, + 'in_ports_count': 1, + 'out_ports_count': 1, 'infer': SpatialTransformOp.sp_infer } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/spatial_transformer_test.py b/model-optimizer/extensions/ops/spatial_transformer_test.py index 86b7ec2..eac48b0 100644 --- a/model-optimizer/extensions/ops/spatial_transformer_test.py +++ b/model-optimizer/extensions/ops/spatial_transformer_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -25,7 +25,8 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'type': 'Identity', 'kind': 'op'}, 'node_2': {'type': 'Identity', 'kind': 'op'}, 'st': {'type': 'SpatialTransform', 'kind': 'op'}, - 'node_3': {'type': 'Identity', 'kind': 'op'} + 'node_3': {'type': 'Identity', 'kind': 'op'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} } @@ -35,9 +36,11 @@ class TestSpatialTransformInfer(unittest.TestCase): [ ('node_1', 'st'), ('node_2', 'st'), - ('st', 'node_3')], + ('st', 'node_3'), + ('node_3', 'op_output') + ], { - 'node_3': {'is_output': True, 'shape': None}, + 'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'node_2': {'shape': np.array([1, 3, 227, 227])}, 'st': {} @@ -55,9 +58,11 @@ class TestSpatialTransformInfer(unittest.TestCase): [ ('node_1', 'st'), ('node_2', 'st'), - ('st', 'node_3')], + ('st', 'node_3'), + ('node_3', 'op_output') + ], { - 'node_3': {'is_output': True, 'shape': None}, + 'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'node_2': {'shape': np.array([1, 3, 227, 227])}, 'st': {'output_H': 200, 'output_W': 15} diff --git a/model-optimizer/extensions/ops/splice.py b/model-optimizer/extensions/ops/splice.py index 381559e..e1fd72e 100644 --- a/model-optimizer/extensions/ops/splice.py +++ b/model-optimizer/extensions/ops/splice.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,16 +15,20 @@ """ import networkx as nx + +from mo.graph.graph import Graph from mo.ops.op import Op class Splice(Op): op = 'Splice' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': None, 'op': __class__.op, + 'in_ports_count': 1, + 'out_ports_count': 1, } super().__init__(graph, mandatory_props, attrs) diff --git a/model-optimizer/extensions/ops/splitv.py b/model-optimizer/extensions/ops/splitv.py index 7c1fd42..67428e9 100644 --- a/model-optimizer/extensions/ops/splitv.py +++ b/model-optimizer/extensions/ops/splitv.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ import networkx as nx from mo.front.common.partial_infer.split import tf_split_v_infer +from mo.graph.graph import Graph from mo.ops.op import Op @@ -24,12 +25,13 @@ class SplitV(Op): op = 'SplitV' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'type': 'Split', 'op': 'SplitV', 'axis': 1, 'input_port': 0, + 'in_ports_count': 3, 'infer': tf_split_v_infer }, attrs) diff --git a/model-optimizer/extensions/ops/stop_gradient.py b/model-optimizer/extensions/ops/stop_gradient.py index 58ad9bc..8db3eea 100644 --- a/model-optimizer/extensions/ops/stop_gradient.py +++ b/model-optimizer/extensions/ops/stop_gradient.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,9 +14,8 @@ limitations under the License. 
""" -import networkx as nx - from mo.front.common.partial_infer.elemental import copy_shape_infer +from mo.graph.graph import Graph from mo.ops.op import Op from mo.front.common.partial_infer.utils import mark_input_bins @@ -25,11 +24,13 @@ class StopGradientOp(Op): op = 'StopGradient' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'type': __class__.op, 'op': __class__.op, 'identity': True, + 'in_ports_count': 1, + 'out_ports_count': 1, 'infer': StopGradientOp.shape_infer }, attrs) diff --git a/model-optimizer/extensions/ops/swapaxes.py b/model-optimizer/extensions/ops/swapaxes.py index e8507fd..0029785 100644 --- a/model-optimizer/extensions/ops/swapaxes.py +++ b/model-optimizer/extensions/ops/swapaxes.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,9 +14,7 @@ limitations under the License. """ -import networkx as nx - -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.permute import Permute @@ -24,7 +22,7 @@ class SwapAxes(Permute): op = 'SwapAxis' enabled = False - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): attrs.update({'infer': SwapAxes.infer}) super().__init__(graph, attrs) diff --git a/model-optimizer/extensions/ops/switch.py b/model-optimizer/extensions/ops/switch.py index b6fa822..630c051 100644 --- a/model-optimizer/extensions/ops/switch.py +++ b/model-optimizer/extensions/ops/switch.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,17 +14,16 @@ limitations under the License. """ -import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op class Switch(Op): op = 'Switch' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'op': __class__.op, 'infer': __class__.infer, diff --git a/model-optimizer/extensions/ops/switch_test.py b/model-optimizer/extensions/ops/switch_test.py index c5bb759..73bbf55 100644 --- a/model-optimizer/extensions/ops/switch_test.py +++ b/model-optimizer/extensions/ops/switch_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/extensions/ops/tensor_iterator.py b/model-optimizer/extensions/ops/tensor_iterator.py index faaf9a7..c5bc888 100644 --- a/model-optimizer/extensions/ops/tensor_iterator.py +++ b/model-optimizer/extensions/ops/tensor_iterator.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -20,7 +20,7 @@ import networkx as nx import numpy as np from mo.utils.error import Error -from mo.graph.graph import Node, dict_includes +from mo.graph.graph import Node, dict_includes, Graph from mo.ops.op import Op from mo.utils.utils import refer_to_faq_msg @@ -32,14 +32,14 @@ class TensorIterator(Op): op = 'TensorIterator' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): mandatory_props = { 'type': __class__.op, 'op': __class__.op, 'input_port_map': [], # a list of dicts with such attrs as external_port_id, etc. 'output_port_map': [], # a list of dicts with such attrs as external_port_id, etc. 'back_edges': [], # a list of dicts with such attrs as from_layer, from_port, etc. - 'body': None, # an nx.MultiDiGraph object with a body sub-graph + 'body': None, # a Graph object with a body sub-graph 'sub_graphs': ['body'], # built-in attribute with all sub-graphs 'infer': __class__.infer } @@ -96,14 +96,14 @@ class TensorIterator(Op): @staticmethod - def find_internal_layer_id(graph: nx.MultiDiGraph, virtual_id): + def find_internal_layer_id(graph: Graph, virtual_id): internal_nodes = list(filter(lambda d: dict_includes(d[1], {'internal_layer_id': virtual_id}), graph.nodes(data=True))) assert len(internal_nodes) == 1, 'Nodes: {}, virtual_id: {}'.format(internal_nodes, virtual_id) return internal_nodes[0][0] @staticmethod - def find_internal_layer_and_port(graph: nx.MultiDiGraph, virtual_layer_id, virtual_port_id): + def find_internal_layer_and_port(graph: Graph, virtual_layer_id, virtual_port_id): internal_layer_id = __class__.find_internal_layer_id(graph, virtual_layer_id) internal_port_id = __class__.find_port_id(Node(graph, internal_layer_id), virtual_port_id, 'internal_port_id') return internal_layer_id, internal_port_id @@ -111,11 +111,11 @@ class TensorIterator(Op): @staticmethod def generate_port_map(node: Node, src_port_map): - ''' Extract port_map attributes from node and node.body attributes. + """ Extract port_map attributes from node and node.body attributes. It iterates over src_port_map and substitutes external_port_id, internal_port_id and internal_layer_id with real values queried from node ports and node.body attributes. - ''' + """ result_list = [] for map_item in src_port_map: result = dict(map_item) diff --git a/model-optimizer/extensions/ops/topkrois_onnx.py b/model-optimizer/extensions/ops/topkrois_onnx.py new file mode 100644 index 0000000..d6bba13 --- /dev/null +++ b/model-optimizer/extensions/ops/topkrois_onnx.py @@ -0,0 +1,38 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
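A hypothetical example of the records that generate_port_map() above fills in: external ids come from the TensorIterator node's own edges, internal ids from the body sub-graph's attributes. The field names follow the IR TensorIterator port_map conventions; the concrete id values here are invented for illustration:

input_port_map = [{
    'external_port_id': 0,   # port on the TensorIterator layer itself
    'internal_layer_id': 3,  # resolved via find_internal_layer_id(body, virtual_id)
    'internal_port_id': 0,   # resolved via find_internal_layer_and_port(...)
    'axis': 1, 'start': 0, 'end': -1, 'stride': 1,  # how the input is sliced per iteration
}]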
+""" + +import numpy as np + +from mo.ops.op import Op + + +class ExperimentalDetectronTopKROIs(Op): + op = 'ExperimentalDetectronTopKROIs' + + def __init__(self, graph, attrs): + mandatory_props = dict( + type=__class__.op, + op=__class__.op, + infer=__class__.infer + ) + super().__init__(graph, mandatory_props, attrs) + + def backend_attrs(self): + return ['max_rois', ] + + @staticmethod + def infer(node): + node.out_node(0).shape = np.array([node.max_rois, 4], dtype=np.int64) diff --git a/model-optimizer/install_prerequisites/install_prerequisites.sh b/model-optimizer/install_prerequisites/install_prerequisites.sh index cb6da98..8c78058 100755 --- a/model-optimizer/install_prerequisites/install_prerequisites.sh +++ b/model-optimizer/install_prerequisites/install_prerequisites.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2018 Intel Corporation +# Copyright (c) 2019 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -82,7 +82,11 @@ if [[ $V_ENV -eq 1 ]]; then echo echo "Before running the Model Optimizer, please activate virtualenv environment by running \"source ${SCRIPTDIR}/../venv/bin/activate\"" else - sudo -E $python_binary -m pip install -r $SCRIPTDIR/../requirements${postfix}.txt + if [[ "$OSTYPE" == "darwin"* ]]; then + python3 -m pip install -r $SCRIPTDIR/../requirements${postfix}.txt + else + sudo -E $python_binary -m pip install -r $SCRIPTDIR/../requirements${postfix}.txt + fi echo [WARNING] All Model Optimizer dependencies are installed globally. echo [WARNING] If you want to keep Model Optimizer in separate sandbox echo [WARNING] run install_prerequisites.sh venv "{caffe|tf|mxnet|kaldi|onnx}" diff --git a/model-optimizer/install_prerequisites/install_prerequisites_caffe.sh b/model-optimizer/install_prerequisites/install_prerequisites_caffe.sh index 9ea2518..0348223 100755 --- a/model-optimizer/install_prerequisites/install_prerequisites_caffe.sh +++ b/model-optimizer/install_prerequisites/install_prerequisites_caffe.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2018 Intel Corporation +# Copyright (c) 2019 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/model-optimizer/install_prerequisites/install_prerequisites_kaldi.sh b/model-optimizer/install_prerequisites/install_prerequisites_kaldi.sh index bcdd0e2..2996dc3 100755 --- a/model-optimizer/install_prerequisites/install_prerequisites_kaldi.sh +++ b/model-optimizer/install_prerequisites/install_prerequisites_kaldi.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2018 Intel Corporation +# Copyright (c) 2019 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/model-optimizer/install_prerequisites/install_prerequisites_mxnet.sh b/model-optimizer/install_prerequisites/install_prerequisites_mxnet.sh index 2cf20d9..da41693 100755 --- a/model-optimizer/install_prerequisites/install_prerequisites_mxnet.sh +++ b/model-optimizer/install_prerequisites/install_prerequisites_mxnet.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2018 Intel Corporation +# Copyright (c) 2019 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/install_prerequisites/install_prerequisites_onnx.sh b/model-optimizer/install_prerequisites/install_prerequisites_onnx.sh index 97ea4f0..d9c9d77 100755 --- a/model-optimizer/install_prerequisites/install_prerequisites_onnx.sh +++ b/model-optimizer/install_prerequisites/install_prerequisites_onnx.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2018 Intel Corporation +# Copyright (c) 2019 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/model-optimizer/install_prerequisites/install_prerequisites_tf.sh b/model-optimizer/install_prerequisites/install_prerequisites_tf.sh index 3d7d58f..ce67a03 100755 --- a/model-optimizer/install_prerequisites/install_prerequisites_tf.sh +++ b/model-optimizer/install_prerequisites/install_prerequisites_tf.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2018 Intel Corporation +# Copyright (c) 2019 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo.py b/model-optimizer/mo.py index 7b8cc06..5c6f305 100755 --- a/model-optimizer/mo.py +++ b/model-optimizer/mo.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/back/ie_ir_ver_2/emitter.py b/model-optimizer/mo/back/ie_ir_ver_2/emitter.py index e72d1fd..3763c2d 100644 --- a/model-optimizer/mo/back/ie_ir_ver_2/emitter.py +++ b/model-optimizer/mo/back/ie_ir_ver_2/emitter.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,60 +15,16 @@ """ import hashlib -import xml.dom.minidom +from defusedxml.minidom import parseString from xml.etree.ElementTree import Element, SubElement, tostring -from mo.front.extractor import update_ie_fields from mo.graph.graph import * from mo.utils.unsupported_ops import UnsupportedOps from mo.utils.utils import refer_to_faq_msg from mo.utils.version import get_version -def create_const_nodes(graph: nx.MultiDiGraph, start_data_nodes_are_not_allowed: bool=True): - """ - Adds layers with type 'Const' that produce blob from 'bin' file. The pass finds data nodes with one output which - doesn't have edge with 'bin' attribute and generate Const op node before the node and data node before the Const - node. The data node before 'Const' node is needed because the op node dumps input tensors to bin file. - :param graph: input graph. 
- :return: None - """ - for node_name in list(graph.nodes()): - node = NodeWrap(graph, node_name) - if ( - node.has('kind') and - node.kind == 'data' and ( - (len(node.out_edges()) == 1 and 'bin' not in node.out_edge(0)) or - node.has_and_set('is_output') - ) and - len(node.in_nodes()) == 0): - - if node.has_valid('value'): - const_node_name = node.id + '_const' - log.debug("Added Const node '{}'".format(const_node_name)) - graph.add_node(const_node_name, name=const_node_name, type='Const', kind='op', op='Const', - precision="FP32") - update_ie_fields(node.graph.node[const_node_name]) - graph.add_edges_from([(const_node_name, node.id, {'out': 0})]) - copy_data_node_name = unique_id(graph, node.id + '_copy_') - graph.add_node(copy_data_node_name, kind='data', precision="FP32", shape=np.array(node.shape), - value=np.array(node.value)) - if node.has_valid('force_precision'): - Node(graph, copy_data_node_name)['force_precision'] = node.force_precision - Node(graph, const_node_name)['force_precision'] = node.force_precision - graph.add_edges_from([(copy_data_node_name, const_node_name, {'in': 0, 'bin': 'custom'})]) - elif start_data_nodes_are_not_allowed: - log.debug('node = {}'.format(node.graph.node[node.id])) - # TODO for body sub-graph it shouldn't be reported as an error - raise Error( - 'Discovered data node without inputs and value, node.name = {}, consumer.name = {}. ' + - refer_to_faq_msg(23), - node.soft_get('name'), - node.out_node().soft_get('name') if len(node.out_nodes()) else "" - ) - - -def serialize_constants(graph: nx.MultiDiGraph, bin_file_name:str, data_type=np.float32): +def serialize_constants(graph: Graph, bin_file_name:str, data_type=np.float32): """ Finds all data constants that have output edges with the 'bin' attribute. Serializes content for such constants to a binary file with name bin_file_name in @@ -86,10 +42,10 @@ def serialize_constants(graph: nx.MultiDiGraph, bin_file_name:str, data_type=np.
serialize_constants_recursively(graph, bin_file, data_type, bin_hashes) -def serialize_constants_recursively(graph: nx.MultiDiGraph, bin_file, data_type, bin_hashes): +def serialize_constants_recursively(graph: Graph, bin_file, data_type, bin_hashes): nodes = sorted(graph.nodes()) for node in nodes: - node = NodeWrap(graph, node) + node = Node(graph, node) if node.kind == 'data' and node.value is not None and any('bin' in d for u, v, d in graph.out_edges(node.node, data=True)): blob = node.value @@ -118,7 +74,7 @@ def serialize_constants_recursively(graph: nx.MultiDiGraph, bin_file, data_type, # separate loop for sub-graph to dump them after all blobs for more natural blob offset ordering # TODO: implement strict order for all blobs in entire IR for node in nodes: - node = NodeWrap(graph, node) + node = Node(graph, node) # Dump blobs recursively if sub-graphs are present in the node if node.has_valid('sub_graphs'): for sub_graph_attr_name in node.sub_graphs: @@ -140,7 +96,7 @@ def serialize_mean_image(bin_file_name: str, mean_data=[]): return mean_offset, mean_size -def xml_shape(shape: np.ndarray, element: xml.etree.ElementTree.Element): +def xml_shape(shape: np.ndarray, element: Element): for d in shape: dim = SubElement(element, 'dim') if d <= 0: @@ -154,10 +110,10 @@ def xml_shape(shape: np.ndarray, element: xml.etree.ElementTree.Element): dim.text = str(d) -def xml_ports(node: Node, element: xml.etree.ElementTree.Element, edges: xml.etree.ElementTree.Element): +def xml_ports(node: Node, element: Element, edges: Element): # input ports inputs = None # will create input section only if at least one input is available - for u, d in get_sorted_inputs(node): + for u, d in node.get_sorted_inputs(): if 'bin' not in d and ('xml_skip' not in d or not d['xml_skip']): if inputs is None: inputs = SubElement(element, 'input') @@ -180,7 +136,7 @@ def xml_ports(node: Node, element: xml.etree.ElementTree.Element, edges: xml.etr # output ports outputs = None - for v, d in get_sorted_outputs(node): + for v, d in node.get_sorted_outputs(): if 'xml_skip' not in d or not d['xml_skip']: if outputs is None: outputs = SubElement(element, 'output') @@ -192,9 +148,9 @@ def xml_ports(node: Node, element: xml.etree.ElementTree.Element, edges: xml.etr xml_shape(node.graph.node[v]['shape'], port) -def xml_consts(graph: nx.MultiDiGraph, node: Node, element: xml.etree.ElementTree.Element): +def xml_consts(graph: Graph, node: Node, element: Element): blobs = None # sub-element that will be created on-demand - for u, d in get_sorted_inputs(node): + for u, d in node.get_sorted_inputs(): if 'bin' in d: if not blobs: blobs = SubElement(element, 'blobs') @@ -213,11 +169,11 @@ def soft_get(node, attr): def serialize_element( - graph: nx.MultiDiGraph, + graph: Graph, node, schema: list, - parent_element: xml.etree.ElementTree.Element, - edges: xml.etree.ElementTree.Element, + parent_element: Element, + edges: Element, unsupported): name, attrs, subelements = schema @@ -265,11 +221,11 @@ def serialize_meta_list(graph, node, schema, element, edges, unsupported): def serialize_node_attributes( - graph: nx.MultiDiGraph, # the current network graph + graph: Graph, # the current network graph node, # dictionary-like object that should be serialized schema: list, - parent_element: xml.etree.ElementTree.Element, - edges: xml.etree.ElementTree.Element, + parent_element: Element, + edges: Element, unsupported): try: @@ -303,7 +259,7 @@ def serialize_node_attributes( ) from e -def create_pre_process_block_for_image(net:
xml.etree.ElementTree.Element, ref_layer_names: list, mean_offset: tuple, +def create_pre_process_block_for_image(net: Element, ref_layer_names: list, mean_offset: tuple, mean_size: tuple): pre_process = SubElement(net, 'pre-process') pre_process.set('mean-precision', 'FP32') # TODO: to think about need to output FP16 mean values @@ -346,7 +302,21 @@ def create_pre_process_block(net, ref_layer_name, means, scales=None): return pre_process -def add_meta_data(net: xml.etree.ElementTree.Element, meta_info: dict): +def add_quantization_statistics(graph, net_element): + if 'statistics' in graph.graph: + stats = SubElement(net_element, 'statistics') + for tensor, interval in graph.graph['statistics'].items(): + layer = SubElement(stats, 'layer') + name = SubElement(layer, 'name') + name.text = tensor + min = SubElement(layer, 'min') + min.text = interval['min'] + max = SubElement(layer, 'max') + max.text = interval['max'] + log.info('Statistics were inserted to IR') + + +def add_meta_data(net: Element, meta_info: dict): meta = SubElement(net, 'meta_data') SubElement(meta, 'MO_version').set('value', get_version()) parameters = SubElement(meta, 'cli_parameters') @@ -355,7 +325,6 @@ def add_meta_data(net: xml.etree.ElementTree.Element, meta_info: dict): SubElement(parameters, 'unset').set('unset_cli_parameters', ', '.join(sorted(meta_info['unset']))) - def serialize_network(graph, net_element, unsupported): layers = SubElement(net_element, 'layers') edges = SubElement(net_element, 'edges') @@ -363,7 +332,7 @@ def serialize_network(graph, net_element, unsupported): return nodes = sorted(graph.nodes()) for node in nodes: - node = NodeWrap(graph, node) + node = Node(graph, node) if not node.has('IE'): continue if node.kind == 'op' and (not node.has('type') or node.type is None): @@ -375,7 +344,7 @@ def serialize_network(graph, net_element, unsupported): raise Error(str(e).replace('', '{} (id = {})'.format(node.soft_get('name'), node.id))) from e -def generate_ie_ir(graph: nx.MultiDiGraph, file_name: str, input_names: tuple = (), mean_offset: tuple = (), +def generate_ie_ir(graph: Graph, file_name: str, input_names: tuple = (), mean_offset: tuple = (), mean_size: tuple = (), meta_info: dict = dict()): """ Extracts IE/IR attributes from kind='op' nodes in three ways: @@ -408,27 +377,28 @@ def generate_ie_ir(graph: nx.MultiDiGraph, file_name: str, input_names: tuple = unsupported = UnsupportedOps(graph) serialize_network(graph, net, unsupported) + add_quantization_statistics(graph, net) add_meta_data(net, meta_info) xml_string = tostring(net) - xml_doc = xml.dom.minidom.parseString(xml_string) # ugly? + xml_doc = parseString(xml_string) pretty_xml_as_string = xml_doc.toprettyxml() if len(unsupported.unsupported): log.debug('Partially correct IR XML:\n{}'.format(pretty_xml_as_string)) - unsupported.report(log.error, "List of operations that cannot be converted to IE IR:") - raise Error('Part of the nodes was not translated to IE. Stopped. ' + + unsupported.report(log.error, "List of operations that cannot be converted to Inference Engine IR:") + raise Error('Part of the nodes was not converted to IR. Stopped. 
' + refer_to_faq_msg(24)) with open(file_name, 'w') as file: file.write(pretty_xml_as_string) -def port_renumber(graph: nx.MultiDiGraph): +def port_renumber(graph: Graph): for node in list(graph.nodes()): - node = NodeWrap(graph, node) + node = Node(graph, node) if node.kind == 'op': base = 0 - for u, d in get_sorted_inputs(node): + for u, d in node.get_sorted_inputs(): d['in'] = base base += 1 - for v, d in get_sorted_outputs(node): + for v, d in node.get_sorted_outputs(): d['out'] = base base += 1 diff --git a/model-optimizer/mo/back/ie_ir_ver_2/emitter_test.py b/model-optimizer/mo/back/ie_ir_ver_2/emitter_test.py index 44830dd..bb39758 100644 --- a/model-optimizer/mo/back/ie_ir_ver_2/emitter_test.py +++ b/model-optimizer/mo/back/ie_ir_ver_2/emitter_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/back/replacement.py b/model-optimizer/mo/back/replacement.py index c55c074..e47e6fe 100644 --- a/model-optimizer/mo/back/replacement.py +++ b/model-optimizer/mo/back/replacement.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/collect_attributes.py b/model-optimizer/mo/front/caffe/collect_attributes.py index 0ce7054..1855d20 100644 --- a/model-optimizer/mo/front/caffe/collect_attributes.py +++ b/model-optimizer/mo/front/caffe/collect_attributes.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/custom_layers_mapping.py b/model-optimizer/mo/front/caffe/custom_layers_mapping.py index 65500da..f9ecae3 100644 --- a/model-optimizer/mo/front/caffe/custom_layers_mapping.py +++ b/model-optimizer/mo/front/caffe/custom_layers_mapping.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ """ import logging as log from builtins import AttributeError -from xml.etree import ElementTree +from defusedxml import ElementTree from mo.front.caffe.collect_attributes import collect_attributes from mo.front.caffe.extractor import node_pb_arg diff --git a/model-optimizer/mo/front/caffe/custom_layers_mapping_test.py b/model-optimizer/mo/front/caffe/custom_layers_mapping_test.py index 84ce9b5..c9efbc4 100644 --- a/model-optimizer/mo/front/caffe/custom_layers_mapping_test.py +++ b/model-optimizer/mo/front/caffe/custom_layers_mapping_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
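The emitter hunks above swap xml.dom.minidom for defusedxml's parseString and add a <statistics> subtree carrying quantization intervals, while xml_shape keeps emitting one <dim> child per axis. A minimal, self-contained sketch of that emitted layout using plain ElementTree; the layer name 'conv1' and the interval strings are invented example values:

    from xml.etree.ElementTree import Element, SubElement, tostring
    from defusedxml.minidom import parseString  # the safe parser this patch switches to

    net = Element('net')

    # What xml_shape does for one port: a <dim> child per axis of the shape.
    port = SubElement(net, 'port', id='0')
    for d in (1, 3, 224, 224):
        SubElement(port, 'dim').text = str(d)

    # The subtree add_quantization_statistics emits when graph.graph['statistics'] is set.
    stats = SubElement(net, 'statistics')
    layer = SubElement(stats, 'layer')
    SubElement(layer, 'name').text = 'conv1'  # tensor name (example value)
    SubElement(layer, 'min').text = '-1.0'    # interval['min'] (example value)
    SubElement(layer, 'max').text = '1.0'     # interval['max'] (example value)

    print(parseString(tostring(net)).toprettyxml())

Note also the detail in port_renumber above: a single base counter runs over the sorted inputs first and then continues over the outputs, so output port ids follow on from the input ids (a node with two inputs and one output gets input ports 0 and 1 and output port 2).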
diff --git a/model-optimizer/mo/front/caffe/extractor.py b/model-optimizer/mo/front/caffe/extractor.py index 72e3283..6d7f777 100644 --- a/model-optimizer/mo/front/caffe/extractor.py +++ b/model-optimizer/mo/front/caffe/extractor.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractor_test.py b/model-optimizer/mo/front/caffe/extractor_test.py index b5b2925..9b4d0ce 100644 --- a/model-optimizer/mo/front/caffe/extractor_test.py +++ b/model-optimizer/mo/front/caffe/extractor_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/batchnorm.py b/model-optimizer/mo/front/caffe/extractors/batchnorm.py index c4bb8cb..5c71a19 100644 --- a/model-optimizer/mo/front/caffe/extractors/batchnorm.py +++ b/model-optimizer/mo/front/caffe/extractors/batchnorm.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/batchnorm_test.py b/model-optimizer/mo/front/caffe/extractors/batchnorm_test.py index eeb441d..a8f122f 100644 --- a/model-optimizer/mo/front/caffe/extractors/batchnorm_test.py +++ b/model-optimizer/mo/front/caffe/extractors/batchnorm_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/concat.py b/model-optimizer/mo/front/caffe/extractors/concat.py index e3bfd7b..cd67d65 100644 --- a/model-optimizer/mo/front/caffe/extractors/concat.py +++ b/model-optimizer/mo/front/caffe/extractors/concat.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/concat_test.py b/model-optimizer/mo/front/caffe/extractors/concat_test.py index 117ce04..a82633f 100644 --- a/model-optimizer/mo/front/caffe/extractors/concat_test.py +++ b/model-optimizer/mo/front/caffe/extractors/concat_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/crop.py b/model-optimizer/mo/front/caffe/extractors/crop.py index 4c82d6a..7eadf4a 100644 --- a/model-optimizer/mo/front/caffe/extractors/crop.py +++ b/model-optimizer/mo/front/caffe/extractors/crop.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/front/caffe/extractors/crop_test.py b/model-optimizer/mo/front/caffe/extractors/crop_test.py index 9405e70..cc764fb 100644 --- a/model-optimizer/mo/front/caffe/extractors/crop_test.py +++ b/model-optimizer/mo/front/caffe/extractors/crop_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/eltwise.py b/model-optimizer/mo/front/caffe/extractors/eltwise.py index 2365303..bf57976 100644 --- a/model-optimizer/mo/front/caffe/extractors/eltwise.py +++ b/model-optimizer/mo/front/caffe/extractors/eltwise.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/eltwise_test.py b/model-optimizer/mo/front/caffe/extractors/eltwise_test.py index e077c42..86f9172 100644 --- a/model-optimizer/mo/front/caffe/extractors/eltwise_test.py +++ b/model-optimizer/mo/front/caffe/extractors/eltwise_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/elu.py b/model-optimizer/mo/front/caffe/extractors/elu.py index 464a77f..e52d933 100644 --- a/model-optimizer/mo/front/caffe/extractors/elu.py +++ b/model-optimizer/mo/front/caffe/extractors/elu.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/elu_test.py b/model-optimizer/mo/front/caffe/extractors/elu_test.py index c482888..4df18b0 100644 --- a/model-optimizer/mo/front/caffe/extractors/elu_test.py +++ b/model-optimizer/mo/front/caffe/extractors/elu_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/inner_product.py b/model-optimizer/mo/front/caffe/extractors/inner_product.py index bac429c..f6ee212 100644 --- a/model-optimizer/mo/front/caffe/extractors/inner_product.py +++ b/model-optimizer/mo/front/caffe/extractors/inner_product.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/front/caffe/extractors/inner_product_test.py b/model-optimizer/mo/front/caffe/extractors/inner_product_test.py index 44501c3..f70bef9 100644 --- a/model-optimizer/mo/front/caffe/extractors/inner_product_test.py +++ b/model-optimizer/mo/front/caffe/extractors/inner_product_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/input.py b/model-optimizer/mo/front/caffe/extractors/input.py index 94d1822..743e6ea 100644 --- a/model-optimizer/mo/front/caffe/extractors/input.py +++ b/model-optimizer/mo/front/caffe/extractors/input.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/input_test.py b/model-optimizer/mo/front/caffe/extractors/input_test.py index 37d1fc1..ea54f43 100644 --- a/model-optimizer/mo/front/caffe/extractors/input_test.py +++ b/model-optimizer/mo/front/caffe/extractors/input_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/lrn.py b/model-optimizer/mo/front/caffe/extractors/lrn.py index 669e337..3d5ba4d 100644 --- a/model-optimizer/mo/front/caffe/extractors/lrn.py +++ b/model-optimizer/mo/front/caffe/extractors/lrn.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/lrn_test.py b/model-optimizer/mo/front/caffe/extractors/lrn_test.py index e5c7f8b..ef9a419 100644 --- a/model-optimizer/mo/front/caffe/extractors/lrn_test.py +++ b/model-optimizer/mo/front/caffe/extractors/lrn_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/native_caffe.py b/model-optimizer/mo/front/caffe/extractors/native_caffe.py index 6e96b17..db13d5a 100644 --- a/model-optimizer/mo/front/caffe/extractors/native_caffe.py +++ b/model-optimizer/mo/front/caffe/extractors/native_caffe.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/front/caffe/extractors/permute.py b/model-optimizer/mo/front/caffe/extractors/permute.py index 34dcd5f..2a5e617 100644 --- a/model-optimizer/mo/front/caffe/extractors/permute.py +++ b/model-optimizer/mo/front/caffe/extractors/permute.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/permute_test.py b/model-optimizer/mo/front/caffe/extractors/permute_test.py index 232e520..f6faf9e 100644 --- a/model-optimizer/mo/front/caffe/extractors/permute_test.py +++ b/model-optimizer/mo/front/caffe/extractors/permute_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/power.py b/model-optimizer/mo/front/caffe/extractors/power.py index 0f44824..2a06da2 100644 --- a/model-optimizer/mo/front/caffe/extractors/power.py +++ b/model-optimizer/mo/front/caffe/extractors/power.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/power_test.py b/model-optimizer/mo/front/caffe/extractors/power_test.py index 5281bbb..a39e5b2 100644 --- a/model-optimizer/mo/front/caffe/extractors/power_test.py +++ b/model-optimizer/mo/front/caffe/extractors/power_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/relu.py b/model-optimizer/mo/front/caffe/extractors/relu.py index 4e2ca88..100b553 100644 --- a/model-optimizer/mo/front/caffe/extractors/relu.py +++ b/model-optimizer/mo/front/caffe/extractors/relu.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/relu6.py b/model-optimizer/mo/front/caffe/extractors/relu6.py index e66d3a6..6a3f925 100644 --- a/model-optimizer/mo/front/caffe/extractors/relu6.py +++ b/model-optimizer/mo/front/caffe/extractors/relu6.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/relu_test.py b/model-optimizer/mo/front/caffe/extractors/relu_test.py index b807166..aa4b7bf 100644 --- a/model-optimizer/mo/front/caffe/extractors/relu_test.py +++ b/model-optimizer/mo/front/caffe/extractors/relu_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/front/caffe/extractors/reshape.py b/model-optimizer/mo/front/caffe/extractors/reshape.py index 13deb99..c7893c1 100644 --- a/model-optimizer/mo/front/caffe/extractors/reshape.py +++ b/model-optimizer/mo/front/caffe/extractors/reshape.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/reshape_test.py b/model-optimizer/mo/front/caffe/extractors/reshape_test.py index 4551eb7..8738d44 100644 --- a/model-optimizer/mo/front/caffe/extractors/reshape_test.py +++ b/model-optimizer/mo/front/caffe/extractors/reshape_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/roipooling.py b/model-optimizer/mo/front/caffe/extractors/roipooling.py index 8d6dc7c..3a56297 100644 --- a/model-optimizer/mo/front/caffe/extractors/roipooling.py +++ b/model-optimizer/mo/front/caffe/extractors/roipooling.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/scale.py b/model-optimizer/mo/front/caffe/extractors/scale.py index 196b7d5..cc7e46f 100644 --- a/model-optimizer/mo/front/caffe/extractors/scale.py +++ b/model-optimizer/mo/front/caffe/extractors/scale.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/scale_test.py b/model-optimizer/mo/front/caffe/extractors/scale_test.py index 9258295..19cfd62 100644 --- a/model-optimizer/mo/front/caffe/extractors/scale_test.py +++ b/model-optimizer/mo/front/caffe/extractors/scale_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/sigmoid.py b/model-optimizer/mo/front/caffe/extractors/sigmoid.py index 5594c83..851d599 100644 --- a/model-optimizer/mo/front/caffe/extractors/sigmoid.py +++ b/model-optimizer/mo/front/caffe/extractors/sigmoid.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/slice.py b/model-optimizer/mo/front/caffe/extractors/slice.py index 953f88c..3927e99 100644 --- a/model-optimizer/mo/front/caffe/extractors/slice.py +++ b/model-optimizer/mo/front/caffe/extractors/slice.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/front/caffe/extractors/slice_test.py b/model-optimizer/mo/front/caffe/extractors/slice_test.py index 22b43b8..b2a9215 100644 --- a/model-optimizer/mo/front/caffe/extractors/slice_test.py +++ b/model-optimizer/mo/front/caffe/extractors/slice_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/tanh.py b/model-optimizer/mo/front/caffe/extractors/tanh.py index 97bfb89..9d75264 100644 --- a/model-optimizer/mo/front/caffe/extractors/tanh.py +++ b/model-optimizer/mo/front/caffe/extractors/tanh.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/tile.py b/model-optimizer/mo/front/caffe/extractors/tile.py index 63b4c56..f9d3319 100644 --- a/model-optimizer/mo/front/caffe/extractors/tile.py +++ b/model-optimizer/mo/front/caffe/extractors/tile.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/utils.py b/model-optimizer/mo/front/caffe/extractors/utils.py index 416598a..32d0cef 100644 --- a/model-optimizer/mo/front/caffe/extractors/utils.py +++ b/model-optimizer/mo/front/caffe/extractors/utils.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/extractors/utils_test.py b/model-optimizer/mo/front/caffe/extractors/utils_test.py index 6983a0f..7a98511 100644 --- a/model-optimizer/mo/front/caffe/extractors/utils_test.py +++ b/model-optimizer/mo/front/caffe/extractors/utils_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/loader.py b/model-optimizer/mo/front/caffe/loader.py index 69f63f2..40dd09c 100644 --- a/model-optimizer/mo/front/caffe/loader.py +++ b/model-optimizer/mo/front/caffe/loader.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,14 +18,12 @@ import logging as log import mmap import os - -import networkx as nx import numpy as np from google.protobuf import text_format from google.protobuf.internal import api_implementation from mo.front.caffe.proto import caffe_pb2 -from mo.graph.graph import Node, unique_id +from mo.graph.graph import Node, Graph from mo.utils.error import Error, FrameworkError from mo.utils.utils import refer_to_faq_msg @@ -165,10 +163,10 @@ def caffe_pb_to_nx(proto, model): Returns ---------- - nx.MultiDiGraph + Graph built NX Directed graph. 
""" - graph = nx.MultiDiGraph() + graph = Graph() # Blobs in prototxt model can be reused by inplace layer. # This requires loading of pb layers in order and tracking the latest # layer that writes a particular blob. @@ -282,7 +280,7 @@ def caffe_pb_to_nx(proto, model): input_dims.append(np.array(list(dims), dtype=np.int64)) input_names.append(layer.name) - layer.name = unique_id(graph, layer.name) + layer.name = graph.unique_id(layer.name) graph.add_node(layer.name, pb=layer, model_pb=model_layer, kind='op') # connect inputs based on blob_producers dictionary @@ -307,27 +305,6 @@ def caffe_pb_to_nx(proto, model): log.debug("Detected reuse of blob {} by layer {}".format(top, layer.name)) blob_producers[top] = (layer.name, src_port) - # Find all nodes that do not have consumers. - # Add identity ops as a consumers for each output port for such nodes. - for node in list(graph.nodes()): - node = Node(graph, node) - if len(node.out_nodes()) == 0: - if not node.has_valid('pb') or not hasattr(node.pb, 'top'): - continue - for port, top in enumerate(node.pb.top): - new_id = unique_id(graph, 'TerminalIdentity_') - graph.add_node(new_id, op='Identity', type='Identity', kind='op') - edge_attrs = { - 'out': port, - 'in': 0, - 'name': top, - 'fw_tensor_debug_info': [(node.id, top)], # debug anchor for a framework tensor name and port - 'in_attrs': ['in', 'name'], - 'out_attrs': ['out', 'name'], - 'data_attrs': ['fw_tensor_debug_info'] - } - graph.add_edge(node.id, new_id, **edge_attrs) - if len(input_names) <= 0: raise Error('The topology contains no "input" layers. ' + refer_to_faq_msg(79)) diff --git a/model-optimizer/mo/front/caffe/loader_test.py b/model-optimizer/mo/front/caffe/loader_test.py index b61f6d3..9128730 100644 --- a/model-optimizer/mo/front/caffe/loader_test.py +++ b/model-optimizer/mo/front/caffe/loader_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -156,5 +156,5 @@ class TestLoader(unittest.TestCase): proto = caffe_pb2.NetParameter() text_format.Merge(proto_str_multi_input + proto_same_name_layers, proto) graph, input_shapes = caffe_pb_to_nx(proto, None) - # 6 nodes because: 2 inputs + 2 convolutions + 2 output nodes - np.testing.assert_equal(len(graph.nodes()), 6) + # 4 nodes because: 2 inputs + 2 convolutions + np.testing.assert_equal(len(graph.nodes()), 4) diff --git a/model-optimizer/mo/front/caffe/proto/caffe_pb2.py b/model-optimizer/mo/front/caffe/proto/caffe_pb2.py index c32fa78..6e14d46 100644 --- a/model-optimizer/mo/front/caffe/proto/caffe_pb2.py +++ b/model-optimizer/mo/front/caffe/proto/caffe_pb2.py @@ -19,7 +19,7 @@ _sym_db = _symbol_database.Default() DESCRIPTOR = _descriptor.FileDescriptor( name='mo_caffe.proto', package='mo_caffe', - serialized_pb=_b('\n\x0emo_caffe.proto\x12\x08mo_caffe\"\x1c\n\tBlobShape\x12\x0f\n\x03\x64im\x18\x01 \x03(\x03\x42\x02\x10\x01\"\xcf\x01\n\tBlobProto\x12\"\n\x05shape\x18\x07 \x01(\x0b\x32\x13.mo_caffe.BlobShape\x12\x10\n\x04\x64\x61ta\x18\x05 \x03(\x02\x42\x02\x10\x01\x12\x10\n\x04\x64iff\x18\x06 \x03(\x02\x42\x02\x10\x01\x12\x17\n\x0b\x64ouble_data\x18\x08 \x03(\x01\x42\x02\x10\x01\x12\x17\n\x0b\x64ouble_diff\x18\t \x03(\x01\x42\x02\x10\x01\x12\x0e\n\x03num\x18\x01 \x01(\x05:\x01\x30\x12\x13\n\x08\x63hannels\x18\x02 \x01(\x05:\x01\x30\x12\x11\n\x06height\x18\x03 \x01(\x05:\x01\x30\x12\x10\n\x05width\x18\x04 \x01(\x05:\x01\x30\"5\n\x0f\x42lobProtoVector\x12\"\n\x05\x62lobs\x18\x01 \x03(\x0b\x32\x13.mo_caffe.BlobProto\"M\n\x1e\x43osineSimilarityBatchParameter\x12\x14\n\tpos_label\x18\x01 \x01(\x01:\x01\x31\x12\x15\n\tneg_label\x18\x02 \x01(\x01:\x02-1\"\x81\x01\n\x05\x44\x61tum\x12\x10\n\x08\x63hannels\x18\x01 \x01(\x05\x12\x0e\n\x06height\x18\x02 \x01(\x05\x12\r\n\x05width\x18\x03 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x04 \x01(\x0c\x12\r\n\x05label\x18\x05 \x01(\x05\x12\x12\n\nfloat_data\x18\x06 \x03(\x02\x12\x16\n\x07\x65ncoded\x18\x07 \x01(\x08:\x05\x66\x61lse\"A\n\x0cLabelMapItem\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05label\x18\x02 \x01(\x05\x12\x14\n\x0c\x64isplay_name\x18\x03 \x01(\t\"0\n\x08LabelMap\x12$\n\x04item\x18\x01 \x03(\x0b\x32\x16.mo_caffe.LabelMapItem\"\x87\x01\n\x0eNormalizedBBox\x12\x0c\n\x04xmin\x18\x01 \x01(\x02\x12\x0c\n\x04ymin\x18\x02 \x01(\x02\x12\x0c\n\x04xmax\x18\x03 \x01(\x02\x12\x0c\n\x04ymax\x18\x04 \x01(\x02\x12\r\n\x05label\x18\x05 \x01(\x05\x12\x11\n\tdifficult\x18\x06 \x01(\x08\x12\r\n\x05score\x18\x07 \x01(\x02\x12\x0c\n\x04size\x18\x08 \x01(\x02\"\xad\x02\n\x0f\x46illerParameter\x12\x16\n\x04type\x18\x01 \x01(\t:\x08\x63onstant\x12\x10\n\x05value\x18\x02 \x01(\x02:\x01\x30\x12\x0e\n\x03min\x18\x03 \x01(\x02:\x01\x30\x12\x0e\n\x03max\x18\x04 \x01(\x02:\x01\x31\x12\x0f\n\x04mean\x18\x05 \x01(\x02:\x01\x30\x12\x0e\n\x03std\x18\x06 \x01(\x02:\x01\x31\x12\x12\n\x06sparse\x18\x07 \x01(\x05:\x02-1\x12\x45\n\rvariance_norm\x18\x08 \x01(\x0e\x32&.mo_caffe.FillerParameter.VarianceNorm:\x06\x46\x41N_IN\x12\x0c\n\x04\x66ile\x18\t \x01(\t\x12\x10\n\x08\x64iag_val\x18\n \x03(\x02\"4\n\x0cVarianceNorm\x12\n\n\x06\x46\x41N_IN\x10\x00\x12\x0b\n\x07\x46\x41N_OUT\x10\x01\x12\x0b\n\x07\x41VERAGE\x10\x02\"\xed\x02\n\x0cNetParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05input\x18\x03 \x03(\t\x12(\n\x0binput_shape\x18\x08 \x03(\x0b\x32\x13.mo_caffe.BlobShape\x12\x11\n\tinput_dim\x18\x04 \x03(\x05\x12\x1d\n\x0e\x66orce_backward\x18\x05 \x01(\x08:\x05\x66\x61lse\x12!\n\x05state\x18\x06 \x01(\x0b\x32\x12.mo_caffe.NetState\x12\x19\n\ndebug_info\x18\x07
\x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0cprofile_info\x18\t \x01(\x08:\x05\x66\x61lse\x12\x18\n\x0cprofile_iter\x18\n \x01(\x05:\x02\x35\x30\x12\x1a\n\x0eprofile_warmup\x18\x0b \x01(\x05:\x02\x31\x30\x12\'\n\x05layer\x18\x64 \x03(\x0b\x32\x18.mo_caffe.LayerParameter\x12*\n\x06layers\x18\x02 \x03(\x0b\x32\x1a.mo_caffe.V1LayerParameter\"\xf4\n\n\x0fSolverParameter\x12\x0b\n\x03net\x18\x18 \x01(\t\x12)\n\tnet_param\x18\x19 \x01(\x0b\x32\x16.mo_caffe.NetParameter\x12\x11\n\ttrain_net\x18\x01 \x01(\t\x12\x10\n\x08test_net\x18\x02 \x03(\t\x12/\n\x0ftrain_net_param\x18\x15 \x01(\x0b\x32\x16.mo_caffe.NetParameter\x12.\n\x0etest_net_param\x18\x16 \x03(\x0b\x32\x16.mo_caffe.NetParameter\x12\'\n\x0btrain_state\x18\x1a \x01(\x0b\x32\x12.mo_caffe.NetState\x12&\n\ntest_state\x18\x1b \x03(\x0b\x32\x12.mo_caffe.NetState\x12\x11\n\ttest_iter\x18\x03 \x03(\x05\x12\x18\n\rtest_interval\x18\x04 \x01(\x05:\x01\x30\x12 \n\x11test_compute_loss\x18\x13 \x01(\x08:\x05\x66\x61lse\x12!\n\x13test_initialization\x18 \x01(\x08:\x04true\x12\x0f\n\x07\x62\x61se_lr\x18\x05 \x01(\x02\x12\x0f\n\x07\x64isplay\x18\x06 \x01(\x05\x12\x17\n\x0c\x61verage_loss\x18! \x01(\x05:\x01\x31\x12\x10\n\x08max_iter\x18\x07 \x01(\x05\x12\x14\n\titer_size\x18$ \x01(\x05:\x01\x31\x12\x11\n\tlr_policy\x18\x08 \x01(\t\x12\r\n\x05gamma\x18\t \x01(\x02\x12\r\n\x05power\x18\n \x01(\x02\x12\x10\n\x08momentum\x18\x0b \x01(\x02\x12\x14\n\x0cweight_decay\x18\x0c \x01(\x02\x12\x1f\n\x13regularization_type\x18\x1d \x01(\t:\x02L2\x12\x10\n\x08stepsize\x18\r \x01(\x05\x12\x11\n\tstepvalue\x18\" \x03(\x05\x12\x17\n\x0fplateau_winsize\x18* \x03(\x05\x12\x1a\n\x0e\x63lip_gradients\x18# \x01(\x02:\x02-1\x12\x13\n\x08snapshot\x18\x0e \x01(\x05:\x01\x30\x12\x17\n\x0fsnapshot_prefix\x18\x0f \x01(\t\x12\x1c\n\rsnapshot_diff\x18\x10 \x01(\x08:\x05\x66\x61lse\x12N\n\x0fsnapshot_format\x18% \x01(\x0e\x32(.mo_caffe.SolverParameter.SnapshotFormat:\x0b\x42INARYPROTO\x12>\n\x0bsolver_mode\x18\x11 \x01(\x0e\x32$.mo_caffe.SolverParameter.SolverMode:\x03GPU\x12\x14\n\tdevice_id\x18\x12 \x01(\x05:\x01\x30\x12\x17\n\x0brandom_seed\x18\x14 \x01(\x03:\x02-1\x12\x11\n\x04type\x18( \x01(\t:\x03SGD\x12\x14\n\x05\x64\x65lta\x18\x1f \x01(\x02:\x05\x31\x65-08\x12\x18\n\tmomentum2\x18\' \x01(\x02:\x05\x30.999\x12\x17\n\trms_decay\x18& \x01(\x02:\x04\x30.99\x12\x19\n\ndebug_info\x18\x17 \x01(\x08:\x05\x66\x61lse\x12\"\n\x14snapshot_after_train\x18\x1c \x01(\x08:\x04true\x12>\n\x0bsolver_type\x18\x1e \x01(\x0e\x32$.mo_caffe.SolverParameter.SolverType:\x03SGD\x12\x1f\n\x11layer_wise_reduce\x18) \x01(\x08:\x04true\"+\n\x0eSnapshotFormat\x12\x08\n\x04HDF5\x10\x00\x12\x0f\n\x0b\x42INARYPROTO\x10\x01\"\x1e\n\nSolverMode\x12\x07\n\x03\x43PU\x10\x00\x12\x07\n\x03GPU\x10\x01\"U\n\nSolverType\x12\x07\n\x03SGD\x10\x00\x12\x0c\n\x08NESTEROV\x10\x01\x12\x0b\n\x07\x41\x44\x41GRAD\x10\x02\x12\x0b\n\x07RMSPROP\x10\x03\x12\x0c\n\x08\x41\x44\x41\x44\x45LTA\x10\x04\x12\x08\n\x04\x41\x44\x41M\x10\x05\"\xa8\x01\n\x0bSolverState\x12\x0c\n\x04iter\x18\x01 \x01(\x05\x12\x13\n\x0blearned_net\x18\x02 \x01(\t\x12$\n\x07history\x18\x03 \x03(\x0b\x32\x13.mo_caffe.BlobProto\x12\x17\n\x0c\x63urrent_step\x18\x04 \x01(\x05:\x01\x30\x12\x1b\n\x0cminimum_loss\x18\x05 \x01(\x02:\x05\x31\x65+38\x12\x1a\n\x0fiter_last_event\x18\x06 \x01(\x05:\x01\x30\"Q\n\x08NetState\x12$\n\x05phase\x18\x01 \x01(\x0e\x32\x0f.mo_caffe.Phase:\x04TEST\x12\x10\n\x05level\x18\x02 \x01(\x05:\x01\x30\x12\r\n\x05stage\x18\x03 \x03(\t\"v\n\x0cNetStateRule\x12\x1e\n\x05phase\x18\x01 \x01(\x0e\x32\x0f.mo_caffe.Phase\x12\x11\n\tmin_level\x18\x02 
\x01(\x05\x12\x11\n\tmax_level\x18\x03 \x01(\x05\x12\r\n\x05stage\x18\x04 \x03(\t\x12\x11\n\tnot_stage\x18\x05 \x03(\t\"\xad\x02\n\x1bSpatialTransformerParameter\x12\x1e\n\x0etransform_type\x18\x01 \x01(\t:\x06\x61\x66\x66ine\x12\x1e\n\x0csampler_type\x18\x02 \x01(\t:\x08\x62ilinear\x12\x10\n\x08output_H\x18\x03 \x01(\x05\x12\x10\n\x08output_W\x18\x04 \x01(\x05\x12\x1b\n\rto_compute_dU\x18\x05 \x01(\x08:\x04true\x12\x11\n\ttheta_1_1\x18\x06 \x01(\x01\x12\x11\n\ttheta_1_2\x18\x07 \x01(\x01\x12\x11\n\ttheta_1_3\x18\x08 \x01(\x01\x12\x11\n\ttheta_2_1\x18\t \x01(\x01\x12\x11\n\ttheta_2_2\x18\n \x01(\x01\x12\x11\n\ttheta_2_3\x18\x0b \x01(\x01\x12\x1b\n\x0c\x64\x65_transform\x18\x0c \x01(\x08:\x05\x66\x61lse\"(\n\x12PowerFileParameter\x12\x12\n\nshift_file\x18\x01 \x01(\t\"5\n\x0fSTLossParameter\x12\x10\n\x08output_H\x18\x01 \x02(\x05\x12\x10\n\x08output_W\x18\x02 \x02(\x05\"%\n\x10LocLossParameter\x12\x11\n\tthreshold\x18\x01 \x02(\x01\"\xa6\x01\n\tParamSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x34\n\nshare_mode\x18\x02 \x01(\x0e\x32 .mo_caffe.ParamSpec.DimCheckMode\x12\x12\n\x07lr_mult\x18\x03 \x01(\x02:\x01\x31\x12\x15\n\ndecay_mult\x18\x04 \x01(\x02:\x01\x31\"*\n\x0c\x44imCheckMode\x12\n\n\x06STRICT\x10\x00\x12\x0e\n\nPERMISSIVE\x10\x01\"\xf4#\n\x0eLayerParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x0e\n\x06\x62ottom\x18\x03 \x03(\t\x12\x0b\n\x03top\x18\x04 \x03(\t\x12\x1e\n\x05phase\x18\n \x01(\x0e\x32\x0f.mo_caffe.Phase\x12\x13\n\x0bloss_weight\x18\x05 \x03(\x02\x12\"\n\x05param\x18\x06 \x03(\x0b\x32\x13.mo_caffe.ParamSpec\x12\"\n\x05\x62lobs\x18\x07 \x03(\x0b\x32\x13.mo_caffe.BlobProto\x12\x16\n\x0epropagate_down\x18\x0b \x03(\x08\x12\'\n\x07include\x18\x08 \x03(\x0b\x32\x16.mo_caffe.NetStateRule\x12\'\n\x07\x65xclude\x18\t \x03(\x0b\x32\x16.mo_caffe.NetStateRule\x12:\n\x0ftransform_param\x18\x64 \x01(\x0b\x32!.mo_caffe.TransformationParameter\x12+\n\nloss_param\x18\x65 \x01(\x0b\x32\x17.mo_caffe.LossParameter\x12\x33\n\x0e\x61\x63\x63uracy_param\x18\x66 \x01(\x0b\x32\x1b.mo_caffe.AccuracyParameter\x12/\n\x0c\x61rgmax_param\x18g \x01(\x0b\x32\x19.mo_caffe.ArgMaxParameter\x12\x37\n\x10\x62\x61tch_norm_param\x18\x8b\x01 \x01(\x0b\x32\x1c.mo_caffe.BatchNormParameter\x12,\n\nbias_param\x18\x8d\x01 \x01(\x0b\x32\x17.mo_caffe.BiasParameter\x12I\n\x19\x63hannel_permutation_param\x18\x92? 
\x01(\x0b\x32%.mo_caffe.ChannelPermutationParameter\x12/\n\x0c\x63oncat_param\x18h \x01(\x0b\x32\x19.mo_caffe.ConcatParameter\x12\x42\n\x16\x63ontrastive_loss_param\x18i \x01(\x0b\x32\".mo_caffe.ContrastiveLossParameter\x12\x39\n\x11\x63onvolution_param\x18j \x01(\x0b\x32\x1e.mo_caffe.ConvolutionParameter\x12,\n\ncrop_param\x18\x90\x01 \x01(\x0b\x32\x17.mo_caffe.CropParameter\x12\x39\n\x11\x63tc_decoder_param\x18\x95\x01 \x01(\x0b\x32\x1d.mo_caffe.CTCDecoderParameter\x12\x33\n\x0e\x63tc_loss_param\x18\x94\x01 \x01(\x0b\x32\x1a.mo_caffe.CTCLossParameter\x12+\n\ndata_param\x18k \x01(\x0b\x32\x17.mo_caffe.DataParameter\x12\x31\n\rdropout_param\x18l \x01(\x0b\x32\x1a.mo_caffe.DropoutParameter\x12\x36\n\x10\x64ummy_data_param\x18m \x01(\x0b\x32\x1c.mo_caffe.DummyDataParameter\x12\x31\n\reltwise_param\x18n \x01(\x0b\x32\x1a.mo_caffe.EltwiseParameter\x12*\n\telu_param\x18\x8c\x01 \x01(\x0b\x32\x16.mo_caffe.ELUParameter\x12.\n\x0b\x65mbed_param\x18\x89\x01 \x01(\x0b\x32\x18.mo_caffe.EmbedParameter\x12)\n\texp_param\x18o \x01(\x0b\x32\x16.mo_caffe.ExpParameter\x12\x32\n\rflatten_param\x18\x87\x01 \x01(\x0b\x32\x1a.mo_caffe.FlattenParameter\x12*\n\tgrn_param\x18\xd5\x01 \x01(\x0b\x32\x16.mo_caffe.GRNParameter\x12\x34\n\x0fhdf5_data_param\x18p \x01(\x0b\x32\x1b.mo_caffe.HDF5DataParameter\x12\x38\n\x11hdf5_output_param\x18q \x01(\x0b\x32\x1d.mo_caffe.HDF5OutputParameter\x12\x36\n\x10hinge_loss_param\x18r \x01(\x0b\x32\x1c.mo_caffe.HingeLossParameter\x12\x36\n\x10image_data_param\x18s \x01(\x0b\x32\x1c.mo_caffe.ImageDataParameter\x12<\n\x13infogain_loss_param\x18t \x01(\x0b\x32\x1f.mo_caffe.InfogainLossParameter\x12<\n\x13inner_product_param\x18u \x01(\x0b\x32\x1f.mo_caffe.InnerProductParameter\x12.\n\x0binput_param\x18\x8f\x01 \x01(\x0b\x32\x18.mo_caffe.InputParameter\x12*\n\tlog_param\x18\x86\x01 \x01(\x0b\x32\x16.mo_caffe.LogParameter\x12)\n\tlrn_param\x18v \x01(\x0b\x32\x16.mo_caffe.LRNParameter\x12\x38\n\x11memory_data_param\x18w \x01(\x0b\x32\x1d.mo_caffe.MemoryDataParameter\x12)\n\tmvn_param\x18x \x01(\x0b\x32\x16.mo_caffe.MVNParameter\x12\x36\n\x0fparameter_param\x18\x91\x01 \x01(\x0b\x32\x1c.mo_caffe.ParameterParameter\x12\x31\n\rpooling_param\x18y \x01(\x0b\x32\x1a.mo_caffe.PoolingParameter\x12\x32\n\rpermute_param\x18\x9a\x01 \x01(\x0b\x32\x1a.mo_caffe.PermuteParameter\x12-\n\x0bpower_param\x18z \x01(\x0b\x32\x18.mo_caffe.PowerParameter\x12.\n\x0bprelu_param\x18\x83\x01 \x01(\x0b\x32\x18.mo_caffe.PReLUParameter\x12\x30\n\x0cpython_param\x18\x82\x01 \x01(\x0b\x32\x19.mo_caffe.PythonParameter\x12\x36\n\x0frecurrent_param\x18\x92\x01 \x01(\x0b\x32\x1c.mo_caffe.RecurrentParameter\x12\x36\n\x0freduction_param\x18\x88\x01 \x01(\x0b\x32\x1c.mo_caffe.ReductionParameter\x12+\n\nrelu_param\x18{ \x01(\x0b\x32\x17.mo_caffe.ReLUParameter\x12\x32\n\rreshape_param\x18\x85\x01 \x01(\x0b\x32\x1a.mo_caffe.ReshapeParameter\x12\x32\n\rreverse_param\x18\x93\x01 \x01(\x0b\x32\x1a.mo_caffe.ReverseParameter\x12.\n\x0bscale_param\x18\x8e\x01 \x01(\x0b\x32\x18.mo_caffe.ScaleParameter\x12\x31\n\rsigmoid_param\x18| \x01(\x0b\x32\x1a.mo_caffe.SigmoidParameter\x12\x31\n\rsoftmax_param\x18} \x01(\x0b\x32\x1a.mo_caffe.SoftmaxParameter\x12*\n\tspp_param\x18\x84\x01 \x01(\x0b\x32\x16.mo_caffe.SPPParameter\x12-\n\x0bslice_param\x18~ \x01(\x0b\x32\x18.mo_caffe.SliceParameter\x12+\n\ntanh_param\x18\x7f \x01(\x0b\x32\x17.mo_caffe.TanHParameter\x12\x36\n\x0fthreshold_param\x18\x80\x01 \x01(\x0b\x32\x1c.mo_caffe.ThresholdParameter\x12,\n\ntile_param\x18\x8a\x01 
\x01(\x0b\x32\x17.mo_caffe.TileParameter\x12\x39\n\x11window_data_param\x18\x81\x01 \x01(\x0b\x32\x1d.mo_caffe.WindowDataParameter\x12\x38\n\x08st_param\x18\x96\x01 \x01(\x0b\x32%.mo_caffe.SpatialTransformerParameter\x12\x31\n\rst_loss_param\x18\x97\x01 \x01(\x0b\x32\x19.mo_caffe.STLossParameter\x12\x37\n\x10power_file_param\x18\x98\x01 \x01(\x0b\x32\x1c.mo_caffe.PowerFileParameter\x12\x33\n\x0eloc_loss_param\x18\x99\x01 \x01(\x0b\x32\x1a.mo_caffe.LocLossParameter\x12\x34\n\x0eproposal_param\x18\xc9\x01 \x01(\x0b\x32\x1b.mo_caffe.ProposalParameter\x12P\n\x1d\x63osine_similarity_batch_param\x18\xca\x01 \x01(\x0b\x32(.mo_caffe.CosineSimilarityBatchParameter\x12\x45\n\x0erss_loss_param\x18\xcb\x01 \x01(\x0b\x32,.mo_caffe.RandomSamplingSoftmaxLossParameter\x12\x31\n\nnorm_param\x18\xcc\x01 \x01(\x0b\x32\x1c.mo_caffe.NormalizeParameter\x12\x39\n\x11roi_warping_param\x18\xcd\x01 \x01(\x0b\x32\x1d.mo_caffe.ROIWarpingParameter\x12=\n\x13psroi_pooling_param\x18\xcf\x01 \x01(\x0b\x32\x1f.mo_caffe.PSROIPoolingParameter\x12\x39\n\x11roi_pooling_param\x18\xd0\x01 \x01(\x0b\x32\x1d.mo_caffe.ROIPoolingParameter\x12>\n\x14smooth_l1_loss_param\x18\xd1\x01 \x01(\x0b\x32\x1f.mo_caffe.SmoothL1LossParameter\x12\x46\n\x18\x62ox_annotator_ohem_param\x18\xd2\x01 \x01(\x0b\x32#.mo_caffe.BoxAnnotatorOHEMParameter\x12\x43\n\x16\x64\x65tection_output_param\x18\xd3\x01 \x01(\x0b\x32\".mo_caffe.DetectionOutputParameter\x12\x35\n\x0fprior_box_param\x18\xd4\x01 \x01(\x0b\x32\x1b.mo_caffe.PriorBoxParameter\x12\x39\n\x11region_yolo_param\x18\xd6\x01 \x01(\x0b\x32\x1d.mo_caffe.RegionYoloParameter\x12\x37\n\x10reorg_yolo_param\x18\xd7\x01 \x01(\x0b\x32\x1c.mo_caffe.ReorgYoloParameter\x12.\n\x0brelu6_param\x18\xd8\x01 \x01(\x0b\x32\x18.mo_caffe.ReLU6Parameter\x12\x30\n\x0cinterp_param\x18\xd9\x01 \x01(\x0b\x32\x19.mo_caffe.InterpParameter\x12<\n\x12\x61ugmentation_param\x18\xda\x01 \x01(\x0b\x32\x1f.mo_caffe.AugmentationParameter\x12:\n\x11\x63orrelation_param\x18\xdb\x01 \x01(\x0b\x32\x1e.mo_caffe.CorrelationParameter\x12\x34\n\x0eresample_param\x18\xdc\x01 \x01(\x0b\x32\x1b.mo_caffe.ResampleParameter\x12\x35\n\x0f\x66low_warp_param\x18\xdd\x01 \x01(\x0b\x32\x1b.mo_caffe.FlowWarpParameter\x12.\n\x0b\x61\x63\x63um_param\x18\xde\x01 \x01(\x0b\x32\x18.mo_caffe.AccumParameter\x12?\n\x14\x63oeff_schedule_param\x18\xdf\x01 \x01(\x0b\x32 .mo_caffe.CoeffScheduleParameter\x12\x41\n\x15shuffle_channel_param\x18\xe0\x01 \x01(\x0b\x32!.mo_caffe.ShuffleChannelParameter\"\x90\x01\n\x0fInterpParameter\x12\x11\n\x06height\x18\x01 \x01(\x05:\x01\x30\x12\x10\n\x05width\x18\x02 \x01(\x05:\x01\x30\x12\x16\n\x0bzoom_factor\x18\x03 \x01(\x05:\x01\x31\x12\x18\n\rshrink_factor\x18\x04 \x01(\x05:\x01\x31\x12\x12\n\x07pad_beg\x18\x05 \x01(\x05:\x01\x30\x12\x12\n\x07pad_end\x18\x06 \x01(\x05:\x01\x30\"n\n\"RandomSamplingSoftmaxLossParameter\x12 \n\x13random_sampling_num\x18\x01 \x01(\x05:\x03\x31\x30\x30\x12&\n\x16random_sampling_policy\x18\x02 \x01(\t:\x06random\"\xc8\x01\n\x11ProposalParameter\x12\x17\n\x0b\x66\x65\x61t_stride\x18\x01 \x01(\r:\x02\x31\x36\x12\x15\n\tbase_size\x18\x02 \x01(\r:\x02\x31\x36\x12\x14\n\x08min_size\x18\x03 \x01(\r:\x02\x31\x36\x12\r\n\x05ratio\x18\x04 \x03(\x02\x12\r\n\x05scale\x18\x05 \x03(\x02\x12\x1a\n\x0cpre_nms_topn\x18\x06 \x01(\r:\x04\x36\x30\x30\x30\x12\x1a\n\rpost_nms_topn\x18\x07 \x01(\r:\x03\x33\x30\x30\x12\x17\n\nnms_thresh\x18\x08 \x01(\x02:\x03\x30.7\"\x95\x01\n\x12NormalizeParameter\x12\x1c\n\x0e\x61\x63ross_spatial\x18\x01 \x01(\x08:\x04true\x12/\n\x0cscale_filler\x18\x02 
\x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x1c\n\x0e\x63hannel_shared\x18\x03 \x01(\x08:\x04true\x12\x12\n\x03\x65ps\x18\x04 \x01(\x02:\x05\x31\x65-10\"!\n\x10PermuteParameter\x12\r\n\x05order\x18\x01 \x03(\r\"\xb6\x01\n\x17TransformationParameter\x12\x10\n\x05scale\x18\x01 \x01(\x02:\x01\x31\x12\x15\n\x06mirror\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x14\n\tcrop_size\x18\x03 \x01(\r:\x01\x30\x12\x11\n\tmean_file\x18\x04 \x01(\t\x12\x12\n\nmean_value\x18\x05 \x03(\x02\x12\x1a\n\x0b\x66orce_color\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x19\n\nforce_gray\x18\x07 \x01(\x08:\x05\x66\x61lse\"\xb4\x02\n\rLossParameter\x12\x14\n\x0cignore_label\x18\x01 \x01(\x05\x12G\n\rnormalization\x18\x03 \x01(\x0e\x32).mo_caffe.LossParameter.NormalizationMode:\x05VALID\x12\x11\n\tnormalize\x18\x02 \x01(\x08\x12\x1f\n\x14pre_fixed_normalizer\x18\x04 \x01(\x02:\x01\x31\x12$\n\x15weight_by_label_freqs\x18\x05 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63lass_weighting\x18\x06 \x03(\x02\"Q\n\x11NormalizationMode\x12\x08\n\x04\x46ULL\x10\x00\x12\t\n\x05VALID\x10\x01\x12\x0e\n\nBATCH_SIZE\x10\x02\x12\r\n\tPRE_FIXED\x10\x03\x12\x08\n\x04NONE\x10\x04\"L\n\x11\x41\x63\x63uracyParameter\x12\x10\n\x05top_k\x18\x01 \x01(\r:\x01\x31\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\x12\x14\n\x0cignore_label\x18\x03 \x01(\x05\"M\n\x0f\x41rgMaxParameter\x12\x1a\n\x0bout_max_val\x18\x01 \x01(\x08:\x05\x66\x61lse\x12\x10\n\x05top_k\x18\x02 \x01(\r:\x01\x31\x12\x0c\n\x04\x61xis\x18\x03 \x01(\x05\"D\n\x18\x43hannelPermutationAction\x12\x0c\n\x04\x63han\x18\x01 \x02(\r\x12\x0c\n\x04\x63opy\x18\x02 \x01(\r\x12\x0c\n\x04\x66ill\x18\x03 \x01(\x02\"\x9a\x01\n\x1b\x43hannelPermutationParameter\x12\x32\n\x06\x61\x63tion\x18\x01 \x03(\x0b\x32\".mo_caffe.ChannelPermutationAction\x12\x12\n\nnum_output\x18\x10 \x02(\r\x12\x1f\n\x10inplace_possible\x18\x11 \x01(\x08:\x05\x66\x61lse\x12\x12\n\x07version\x18\x12 \x01(\x05:\x01\x30\"9\n\x0f\x43oncatParameter\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\x12\x15\n\nconcat_dim\x18\x01 \x01(\r:\x01\x31\"j\n\x12\x42\x61tchNormParameter\x12\x18\n\x10use_global_stats\x18\x01 \x01(\x08\x12&\n\x17moving_average_fraction\x18\x02 \x01(\x02:\x05\x30.999\x12\x12\n\x03\x65ps\x18\x03 \x01(\x02:\x05\x31\x65-05\"J\n\x19\x42oxAnnotatorOHEMParameter\x12\x13\n\x0broi_per_img\x18\x01 \x02(\r\x12\x18\n\x0cignore_label\x18\x02 \x01(\x05:\x02-1\"`\n\rBiasParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x13\n\x08num_axes\x18\x02 \x01(\x05:\x01\x31\x12)\n\x06\x66iller\x18\x03 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\"L\n\x18\x43ontrastiveLossParameter\x12\x11\n\x06margin\x18\x01 \x01(\x02:\x01\x31\x12\x1d\n\x0elegacy_version\x18\x02 \x01(\x08:\x05\x66\x61lse\"\x85\x04\n\x14\x43onvolutionParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x17\n\tbias_term\x18\x02 \x01(\x08:\x04true\x12\x0b\n\x03pad\x18\x03 \x03(\r\x12\x13\n\x0bkernel_size\x18\x04 \x03(\r\x12\x0e\n\x06stride\x18\x06 \x03(\r\x12\x10\n\x08\x64ilation\x18\x12 \x03(\r\x12\x10\n\x05pad_h\x18\t \x01(\r:\x01\x30\x12\x10\n\x05pad_w\x18\n \x01(\r:\x01\x30\x12\x10\n\x08kernel_h\x18\x0b \x01(\r\x12\x10\n\x08kernel_w\x18\x0c \x01(\r\x12\x10\n\x08stride_h\x18\r \x01(\r\x12\x10\n\x08stride_w\x18\x0e \x01(\r\x12\x10\n\x05group\x18\x05 \x01(\r:\x01\x31\x12\x30\n\rweight_filler\x18\x07 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x08 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12>\n\x06\x65ngine\x18\x0f \x01(\x0e\x32%.mo_caffe.ConvolutionParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x0f\n\x04\x61xis\x18\x10 
\x01(\x05:\x01\x31\x12\x1e\n\x0f\x66orce_nd_im2col\x18\x11 \x01(\x08:\x05\x66\x61lse\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"A\n\rCropParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x32\x12\x0e\n\x06offset\x18\x02 \x03(\r\x12\x0f\n\x07\x64imsize\x18\x03 \x03(\r\"P\n\x13\x43TCDecoderParameter\x12\x17\n\x0b\x62lank_index\x18\x01 \x01(\x05:\x02-1\x12 \n\x12\x63tc_merge_repeated\x18\x02 \x01(\x08:\x04true\"\xb2\x01\n\x10\x43TCLossParameter\x12\x17\n\x0coutput_delay\x18\x01 \x01(\x05:\x01\x30\x12\x17\n\x0b\x62lank_index\x18\x02 \x01(\x05:\x02-1\x12+\n\x1cpreprocess_collapse_repeated\x18\x03 \x01(\x08:\x05\x66\x61lse\x12 \n\x12\x63tc_merge_repeated\x18\x04 \x01(\x08:\x04true\x12\x1d\n\x12loss_calculation_t\x18\x05 \x01(\x05:\x01\x30\"\xa7\x02\n\rDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\r\x12\x14\n\trand_skip\x18\x07 \x01(\r:\x01\x30\x12\x34\n\x07\x62\x61\x63kend\x18\x08 \x01(\x0e\x32\x1a.mo_caffe.DataParameter.DB:\x07LEVELDB\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\"\n\x13\x66orce_encoded_color\x18\t \x01(\x08:\x05\x66\x61lse\x12\x13\n\x08prefetch\x18\n \x01(\r:\x01\x34\"\x1b\n\x02\x44\x42\x12\x0b\n\x07LEVELDB\x10\x00\x12\x08\n\x04LMDB\x10\x01\"[\n\x1eNonMaximumSuppressionParameter\x12\x1a\n\rnms_threshold\x18\x01 \x01(\x02:\x03\x30.3\x12\r\n\x05top_k\x18\x02 \x01(\x05\x12\x0e\n\x03\x65ta\x18\x03 \x01(\x02:\x01\x31\"\x99\x04\n\x0fResizeParameter\x12\x0f\n\x04prob\x18\x01 \x01(\x02:\x01\x31\x12@\n\x0bresize_mode\x18\x02 \x01(\x0e\x32%.mo_caffe.ResizeParameter.Resize_mode:\x04WARP\x12\x11\n\x06height\x18\x03 \x01(\r:\x01\x30\x12\x10\n\x05width\x18\x04 \x01(\r:\x01\x30\x12\x17\n\x0cheight_scale\x18\x08 \x01(\r:\x01\x30\x12\x16\n\x0bwidth_scale\x18\t \x01(\r:\x01\x30\x12>\n\x08pad_mode\x18\x05 \x01(\x0e\x32\".mo_caffe.ResizeParameter.Pad_mode:\x08\x43ONSTANT\x12\x11\n\tpad_value\x18\x06 \x03(\x02\x12:\n\x0binterp_mode\x18\x07 \x03(\x0e\x32%.mo_caffe.ResizeParameter.Interp_mode\"G\n\x0bResize_mode\x12\x08\n\x04WARP\x10\x01\x12\x12\n\x0e\x46IT_SMALL_SIZE\x10\x02\x12\x1a\n\x16\x46IT_LARGE_SIZE_AND_PAD\x10\x03\":\n\x08Pad_mode\x12\x0c\n\x08\x43ONSTANT\x10\x01\x12\x0c\n\x08MIRRORED\x10\x02\x12\x12\n\x0eREPEAT_NEAREST\x10\x03\"I\n\x0bInterp_mode\x12\n\n\x06LINEAR\x10\x01\x12\x08\n\x04\x41REA\x10\x02\x12\x0b\n\x07NEAREST\x10\x03\x12\t\n\x05\x43UBIC\x10\x04\x12\x0c\n\x08LANCZOS4\x10\x05\"\xdb\x01\n\x13SaveOutputParameter\x12\x18\n\x10output_directory\x18\x01 \x01(\t\x12\x1a\n\x12output_name_prefix\x18\x02 \x01(\t\x12\x15\n\routput_format\x18\x03 \x01(\t\x12\x16\n\x0elabel_map_file\x18\x04 \x01(\t\x12\x16\n\x0ename_size_file\x18\x05 \x01(\t\x12\x16\n\x0enum_test_image\x18\x06 \x01(\r\x12/\n\x0cresize_param\x18\x07 \x01(\x0b\x32\x19.mo_caffe.ResizeParameter\"\x9d\x04\n\x18\x44\x65tectionOutputParameter\x12\x13\n\x0bnum_classes\x18\x01 \x01(\r\x12\x1c\n\x0eshare_location\x18\x02 \x01(\x08:\x04true\x12\x1e\n\x13\x62\x61\x63kground_label_id\x18\x03 \x01(\x05:\x01\x30\x12;\n\tnms_param\x18\x04 \x01(\x0b\x32(.mo_caffe.NonMaximumSuppressionParameter\x12\x38\n\x11save_output_param\x18\x05 \x01(\x0b\x32\x1d.mo_caffe.SaveOutputParameter\x12?\n\tcode_type\x18\x06 \x01(\x0e\x32$.mo_caffe.PriorBoxParameter.CodeType:\x06\x43ORNER\x12)\n\x1avariance_encoded_in_target\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x16\n\nkeep_top_k\x18\x07 
\x01(\x05:\x02-1\x12\x1c\n\x14\x63onfidence_threshold\x18\t \x01(\x02\x12\x18\n\tvisualize\x18\n \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x13visualize_threshold\x18\x0b \x01(\x02\x12\x11\n\tsave_file\x18\x0c \x01(\t\x12\x17\n\x0binput_width\x18\r \x01(\x05:\x02-1\x12\x18\n\x0cinput_height\x18\x0e \x01(\x05:\x02-1\x12\x18\n\nnormalized\x18\x0f \x01(\x08:\x04true\".\n\x10\x44ropoutParameter\x12\x1a\n\rdropout_ratio\x18\x01 \x01(\x02:\x03\x30.5\"\xa6\x01\n\x12\x44ummyDataParameter\x12.\n\x0b\x64\x61ta_filler\x18\x01 \x03(\x0b\x32\x19.mo_caffe.FillerParameter\x12\"\n\x05shape\x18\x06 \x03(\x0b\x32\x13.mo_caffe.BlobShape\x12\x0b\n\x03num\x18\x02 \x03(\r\x12\x10\n\x08\x63hannels\x18\x03 \x03(\r\x12\x0e\n\x06height\x18\x04 \x03(\r\x12\r\n\x05width\x18\x05 \x03(\r\"\xa8\x01\n\x10\x45ltwiseParameter\x12<\n\toperation\x18\x01 \x01(\x0e\x32$.mo_caffe.EltwiseParameter.EltwiseOp:\x03SUM\x12\r\n\x05\x63oeff\x18\x02 \x03(\x02\x12\x1e\n\x10stable_prod_grad\x18\x03 \x01(\x08:\x04true\"\'\n\tEltwiseOp\x12\x08\n\x04PROD\x10\x00\x12\x07\n\x03SUM\x10\x01\x12\x07\n\x03MAX\x10\x02\" \n\x0c\x45LUParameter\x12\x10\n\x05\x61lpha\x18\x01 \x01(\x02:\x01\x31\"\xb2\x01\n\x0e\x45mbedParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x11\n\tinput_dim\x18\x02 \x01(\r\x12\x17\n\tbias_term\x18\x03 \x01(\x08:\x04true\x12\x30\n\rweight_filler\x18\x04 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x05 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\"D\n\x0c\x45xpParameter\x12\x10\n\x04\x62\x61se\x18\x01 \x01(\x02:\x02-1\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"9\n\x10\x46lattenParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x14\n\x08\x65nd_axis\x18\x02 \x01(\x05:\x02-1\"O\n\x11HDF5DataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x12\n\nbatch_size\x18\x02 \x01(\r\x12\x16\n\x07shuffle\x18\x03 \x01(\x08:\x05\x66\x61lse\"(\n\x13HDF5OutputParameter\x12\x11\n\tfile_name\x18\x01 \x01(\t\"a\n\x12HingeLossParameter\x12\x33\n\x04norm\x18\x01 \x01(\x0e\x32!.mo_caffe.HingeLossParameter.Norm:\x02L1\"\x16\n\x04Norm\x12\x06\n\x02L1\x10\x01\x12\x06\n\x02L2\x10\x02\"\x97\x02\n\x12ImageDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x15\n\nbatch_size\x18\x04 \x01(\r:\x01\x31\x12\x14\n\trand_skip\x18\x07 \x01(\r:\x01\x30\x12\x16\n\x07shuffle\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x15\n\nnew_height\x18\t \x01(\r:\x01\x30\x12\x14\n\tnew_width\x18\n \x01(\r:\x01\x30\x12\x16\n\x08is_color\x18\x0b \x01(\x08:\x04true\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x15\n\x0broot_folder\x18\x0c \x01(\t:\x00\"\'\n\x15InfogainLossParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\"\xd1\x01\n\x15InnerProductParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x17\n\tbias_term\x18\x02 \x01(\x08:\x04true\x12\x30\n\rweight_filler\x18\x03 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x04 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x0f\n\x04\x61xis\x18\x05 \x01(\x05:\x01\x31\x12\x18\n\ttranspose\x18\x06 \x01(\x08:\x05\x66\x61lse\"4\n\x0eInputParameter\x12\"\n\x05shape\x18\x01 \x03(\x0b\x32\x13.mo_caffe.BlobShape\"D\n\x0cLogParameter\x12\x10\n\x04\x62\x61se\x18\x01 \x01(\x02:\x02-1\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"\xbe\x02\n\x0cLRNParameter\x12\x15\n\nlocal_size\x18\x01 \x01(\r:\x01\x35\x12\x10\n\x05\x61lpha\x18\x02 
\x01(\x02:\x01\x31\x12\x12\n\x04\x62\x65ta\x18\x03 \x01(\x02:\x04\x30.75\x12G\n\x0bnorm_region\x18\x04 \x01(\x0e\x32!.mo_caffe.LRNParameter.NormRegion:\x0f\x41\x43ROSS_CHANNELS\x12\x0c\n\x01k\x18\x05 \x01(\x02:\x01\x31\x12\x36\n\x06\x65ngine\x18\x06 \x01(\x0e\x32\x1d.mo_caffe.LRNParameter.Engine:\x07\x44\x45\x46\x41ULT\"5\n\nNormRegion\x12\x13\n\x0f\x41\x43ROSS_CHANNELS\x10\x00\x12\x12\n\x0eWITHIN_CHANNEL\x10\x01\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\x1f\n\x0cGRNParameter\x12\x0f\n\x04\x62ias\x18\x01 \x01(\x02:\x01\x31\"Z\n\x13MemoryDataParameter\x12\x12\n\nbatch_size\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\x12\x0e\n\x06height\x18\x03 \x01(\r\x12\r\n\x05width\x18\x04 \x01(\r\"d\n\x0cMVNParameter\x12 \n\x12normalize_variance\x18\x01 \x01(\x08:\x04true\x12\x1e\n\x0f\x61\x63ross_channels\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x12\n\x03\x65ps\x18\x03 \x01(\x02:\x05\x31\x65-09\"8\n\x12ParameterParameter\x12\"\n\x05shape\x18\x01 \x01(\x0b\x32\x13.mo_caffe.BlobShape\"\xc1\x03\n\x10PoolingParameter\x12\x38\n\x04pool\x18\x01 \x01(\x0e\x32%.mo_caffe.PoolingParameter.PoolMethod:\x03MAX\x12\x0e\n\x03pad\x18\x04 \x01(\r:\x01\x30\x12\x10\n\x05pad_h\x18\t \x01(\r:\x01\x30\x12\x10\n\x05pad_w\x18\n \x01(\r:\x01\x30\x12\x13\n\x0bkernel_size\x18\x02 \x01(\r\x12\x10\n\x08kernel_h\x18\x05 \x01(\r\x12\x10\n\x08kernel_w\x18\x06 \x01(\r\x12\x11\n\x06stride\x18\x03 \x01(\r:\x01\x31\x12\x10\n\x08stride_h\x18\x07 \x01(\r\x12\x10\n\x08stride_w\x18\x08 \x01(\r\x12:\n\x06\x65ngine\x18\x0b \x01(\x0e\x32!.mo_caffe.PoolingParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x1d\n\x0eglobal_pooling\x18\x0c \x01(\x08:\x05\x66\x61lse\x12\x17\n\tceil_mode\x18\r \x01(\x08:\x04true\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"F\n\x0ePowerParameter\x12\x10\n\x05power\x18\x01 \x01(\x02:\x01\x31\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"\xd4\x02\n\x11PriorBoxParameter\x12\x10\n\x08min_size\x18\x01 \x03(\x02\x12\x10\n\x08max_size\x18\x02 \x03(\x02\x12\x14\n\x0c\x61spect_ratio\x18\x03 \x03(\x02\x12\x12\n\x04\x66lip\x18\x04 \x01(\x08:\x04true\x12\x13\n\x04\x63lip\x18\x05 \x01(\x08:\x05\x66\x61lse\x12\x10\n\x08variance\x18\x06 \x03(\x02\x12\x10\n\x08img_size\x18\x07 \x01(\r\x12\r\n\x05img_h\x18\x08 \x01(\r\x12\r\n\x05img_w\x18\t \x01(\r\x12\x0c\n\x04step\x18\n \x01(\x02\x12\x0e\n\x06step_h\x18\x0b \x01(\x02\x12\x0e\n\x06step_w\x18\x0c \x01(\x02\x12\x13\n\x06offset\x18\r \x01(\x02:\x03\x30.5\x12\r\n\x05width\x18\x0e \x03(\x02\x12\x0e\n\x06height\x18\x0f \x03(\x02\"8\n\x08\x43odeType\x12\n\n\x06\x43ORNER\x10\x01\x12\x0f\n\x0b\x43\x45NTER_SIZE\x10\x02\x12\x0f\n\x0b\x43ORNER_SIZE\x10\x03\"V\n\x15PSROIPoolingParameter\x12\x15\n\rspatial_scale\x18\x01 \x02(\x02\x12\x12\n\noutput_dim\x18\x02 \x02(\x05\x12\x12\n\ngroup_size\x18\x03 \x02(\x05\"g\n\x0fPythonParameter\x12\x0e\n\x06module\x18\x01 \x01(\t\x12\r\n\x05layer\x18\x02 \x01(\t\x12\x13\n\tparam_str\x18\x03 \x01(\t:\x00\x12 \n\x11share_in_parallel\x18\x04 \x01(\x08:\x05\x66\x61lse\"\xc6\x01\n\x12RecurrentParameter\x12\x15\n\nnum_output\x18\x01 \x01(\r:\x01\x30\x12\x30\n\rweight_filler\x18\x02 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x03 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x19\n\ndebug_info\x18\x04 
\x01(\x08:\x05\x66\x61lse\x12\x1c\n\rexpose_hidden\x18\x05 \x01(\x08:\x05\x66\x61lse\"\xb0\x01\n\x12ReductionParameter\x12@\n\toperation\x18\x01 \x01(\x0e\x32(.mo_caffe.ReductionParameter.ReductionOp:\x03SUM\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x30\x12\x10\n\x05\x63oeff\x18\x03 \x01(\x02:\x01\x31\"5\n\x0bReductionOp\x12\x07\n\x03SUM\x10\x01\x12\x08\n\x04\x41SUM\x10\x02\x12\t\n\x05SUMSQ\x10\x03\x12\x08\n\x04MEAN\x10\x04\"\x90\x01\n\rReLUParameter\x12\x19\n\x0enegative_slope\x18\x01 \x01(\x02:\x01\x30\x12\x37\n\x06\x65ngine\x18\x02 \x01(\x0e\x32\x1e.mo_caffe.ReLUParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\x1e\n\x0eReLU6Parameter\x12\x0c\n\x01n\x18\x01 \x01(\x02:\x01\x36\"]\n\x10ReshapeParameter\x12\"\n\x05shape\x18\x01 \x01(\x0b\x32\x13.mo_caffe.BlobShape\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x30\x12\x14\n\x08num_axes\x18\x03 \x01(\x05:\x02-1\"#\n\x10ReverseParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x30\"Y\n\x13ROIPoolingParameter\x12\x13\n\x08pooled_h\x18\x01 \x01(\r:\x01\x30\x12\x13\n\x08pooled_w\x18\x02 \x01(\r:\x01\x30\x12\x18\n\rspatial_scale\x18\x03 \x01(\x02:\x01\x31\"]\n\x17ROIWarpingTestParameter\x12\x13\n\x08pooled_h\x18\x01 \x01(\r:\x01\x30\x12\x13\n\x08pooled_w\x18\x02 \x01(\r:\x01\x30\x12\x18\n\rspatial_scale\x18\x03 \x01(\x02:\x01\x31\"Y\n\x13ROIWarpingParameter\x12\x13\n\x08pooled_h\x18\x01 \x01(\r:\x01\x30\x12\x13\n\x08pooled_w\x18\x02 \x01(\r:\x01\x30\x12\x18\n\rspatial_scale\x18\x03 \x01(\x02:\x01\x31\"\xab\x01\n\x0eScaleParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x13\n\x08num_axes\x18\x02 \x01(\x05:\x01\x31\x12)\n\x06\x66iller\x18\x03 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x18\n\tbias_term\x18\x04 \x01(\x08:\x05\x66\x61lse\x12.\n\x0b\x62ias_filler\x18\x05 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\"{\n\x10SigmoidParameter\x12:\n\x06\x65ngine\x18\x01 \x01(\x0e\x32!.mo_caffe.SigmoidParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"L\n\x0eSliceParameter\x12\x0f\n\x04\x61xis\x18\x03 \x01(\x05:\x01\x31\x12\x13\n\x0bslice_point\x18\x02 \x03(\r\x12\x14\n\tslice_dim\x18\x01 \x01(\r:\x01\x31\")\n\x15SmoothL1LossParameter\x12\x10\n\x05sigma\x18\x01 \x01(\x02:\x01\x31\"\x8c\x01\n\x10SoftmaxParameter\x12:\n\x06\x65ngine\x18\x01 \x01(\x0e\x32!.mo_caffe.SoftmaxParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"u\n\rTanHParameter\x12\x37\n\x06\x65ngine\x18\x01 \x01(\x0e\x32\x1e.mo_caffe.TanHParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"/\n\rTileParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\r\n\x05tiles\x18\x02 \x01(\x05\"*\n\x12ThresholdParameter\x12\x14\n\tthreshold\x18\x01 \x01(\x02:\x01\x30\"\xc1\x02\n\x13WindowDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\r\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x19\n\x0c\x66g_threshold\x18\x07 \x01(\x02:\x03\x30.5\x12\x19\n\x0c\x62g_threshold\x18\x08 
\x01(\x02:\x03\x30.5\x12\x19\n\x0b\x66g_fraction\x18\t \x01(\x02:\x04\x30.25\x12\x16\n\x0b\x63ontext_pad\x18\n \x01(\r:\x01\x30\x12\x17\n\tcrop_mode\x18\x0b \x01(\t:\x04warp\x12\x1b\n\x0c\x63\x61\x63he_images\x18\x0c \x01(\x08:\x05\x66\x61lse\x12\x15\n\x0broot_folder\x18\r \x01(\t:\x00\"\xf1\x01\n\x0cSPPParameter\x12\x16\n\x0epyramid_height\x18\x01 \x01(\r\x12\x34\n\x04pool\x18\x02 \x01(\x0e\x32!.mo_caffe.SPPParameter.PoolMethod:\x03MAX\x12\x36\n\x06\x65ngine\x18\x06 \x01(\x0e\x32\x1d.mo_caffe.SPPParameter.Engine:\x07\x44\x45\x46\x41ULT\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\xcc\x14\n\x10V1LayerParameter\x12\x0e\n\x06\x62ottom\x18\x02 \x03(\t\x12\x0b\n\x03top\x18\x03 \x03(\t\x12\x0c\n\x04name\x18\x04 \x01(\t\x12\'\n\x07include\x18 \x03(\x0b\x32\x16.mo_caffe.NetStateRule\x12\'\n\x07\x65xclude\x18! \x03(\x0b\x32\x16.mo_caffe.NetStateRule\x12\x32\n\x04type\x18\x05 \x01(\x0e\x32$.mo_caffe.V1LayerParameter.LayerType\x12\"\n\x05\x62lobs\x18\x06 \x03(\x0b\x32\x13.mo_caffe.BlobProto\x12\x0e\n\x05param\x18\xe9\x07 \x03(\t\x12\x41\n\x0f\x62lob_share_mode\x18\xea\x07 \x03(\x0e\x32\'.mo_caffe.V1LayerParameter.DimCheckMode\x12\x10\n\x08\x62lobs_lr\x18\x07 \x03(\x02\x12\x14\n\x0cweight_decay\x18\x08 \x03(\x02\x12\x13\n\x0bloss_weight\x18# \x03(\x02\x12\x33\n\x0e\x61\x63\x63uracy_param\x18\x1b \x01(\x0b\x32\x1b.mo_caffe.AccuracyParameter\x12/\n\x0c\x61rgmax_param\x18\x17 \x01(\x0b\x32\x19.mo_caffe.ArgMaxParameter\x12/\n\x0c\x63oncat_param\x18\t \x01(\x0b\x32\x19.mo_caffe.ConcatParameter\x12\x42\n\x16\x63ontrastive_loss_param\x18( \x01(\x0b\x32\".mo_caffe.ContrastiveLossParameter\x12\x39\n\x11\x63onvolution_param\x18\n \x01(\x0b\x32\x1e.mo_caffe.ConvolutionParameter\x12+\n\ndata_param\x18\x0b \x01(\x0b\x32\x17.mo_caffe.DataParameter\x12\x31\n\rdropout_param\x18\x0c \x01(\x0b\x32\x1a.mo_caffe.DropoutParameter\x12\x36\n\x10\x64ummy_data_param\x18\x1a \x01(\x0b\x32\x1c.mo_caffe.DummyDataParameter\x12\x31\n\reltwise_param\x18\x18 \x01(\x0b\x32\x1a.mo_caffe.EltwiseParameter\x12)\n\texp_param\x18) \x01(\x0b\x32\x16.mo_caffe.ExpParameter\x12\x34\n\x0fhdf5_data_param\x18\r \x01(\x0b\x32\x1b.mo_caffe.HDF5DataParameter\x12\x38\n\x11hdf5_output_param\x18\x0e \x01(\x0b\x32\x1d.mo_caffe.HDF5OutputParameter\x12\x36\n\x10hinge_loss_param\x18\x1d \x01(\x0b\x32\x1c.mo_caffe.HingeLossParameter\x12\x36\n\x10image_data_param\x18\x0f \x01(\x0b\x32\x1c.mo_caffe.ImageDataParameter\x12<\n\x13infogain_loss_param\x18\x10 \x01(\x0b\x32\x1f.mo_caffe.InfogainLossParameter\x12<\n\x13inner_product_param\x18\x11 \x01(\x0b\x32\x1f.mo_caffe.InnerProductParameter\x12)\n\tlrn_param\x18\x12 \x01(\x0b\x32\x16.mo_caffe.LRNParameter\x12\x38\n\x11memory_data_param\x18\x16 \x01(\x0b\x32\x1d.mo_caffe.MemoryDataParameter\x12)\n\tmvn_param\x18\" \x01(\x0b\x32\x16.mo_caffe.MVNParameter\x12\x31\n\rpooling_param\x18\x13 \x01(\x0b\x32\x1a.mo_caffe.PoolingParameter\x12-\n\x0bpower_param\x18\x15 \x01(\x0b\x32\x18.mo_caffe.PowerParameter\x12+\n\nrelu_param\x18\x1e \x01(\x0b\x32\x17.mo_caffe.ReLUParameter\x12\x31\n\rsigmoid_param\x18& \x01(\x0b\x32\x1a.mo_caffe.SigmoidParameter\x12\x31\n\rsoftmax_param\x18\' \x01(\x0b\x32\x1a.mo_caffe.SoftmaxParameter\x12-\n\x0bslice_param\x18\x1f \x01(\x0b\x32\x18.mo_caffe.SliceParameter\x12+\n\ntanh_param\x18% \x01(\x0b\x32\x17.mo_caffe.TanHParameter\x12\x35\n\x0fthreshold_param\x18\x19 
\x01(\x0b\x32\x1c.mo_caffe.ThresholdParameter\x12\x38\n\x11window_data_param\x18\x14 \x01(\x0b\x32\x1d.mo_caffe.WindowDataParameter\x12:\n\x0ftransform_param\x18$ \x01(\x0b\x32!.mo_caffe.TransformationParameter\x12+\n\nloss_param\x18* \x01(\x0b\x32\x17.mo_caffe.LossParameter\x12)\n\x05layer\x18\x01 \x01(\x0b\x32\x1a.mo_caffe.V0LayerParameter\"\xd8\x04\n\tLayerType\x12\x08\n\x04NONE\x10\x00\x12\n\n\x06\x41\x42SVAL\x10#\x12\x0c\n\x08\x41\x43\x43URACY\x10\x01\x12\n\n\x06\x41RGMAX\x10\x1e\x12\x08\n\x04\x42NLL\x10\x02\x12\n\n\x06\x43ONCAT\x10\x03\x12\x14\n\x10\x43ONTRASTIVE_LOSS\x10%\x12\x0f\n\x0b\x43ONVOLUTION\x10\x04\x12\x08\n\x04\x44\x41TA\x10\x05\x12\x11\n\rDECONVOLUTION\x10\'\x12\x0b\n\x07\x44ROPOUT\x10\x06\x12\x0e\n\nDUMMY_DATA\x10 \x12\x12\n\x0e\x45UCLIDEAN_LOSS\x10\x07\x12\x0b\n\x07\x45LTWISE\x10\x19\x12\x07\n\x03\x45XP\x10&\x12\x0b\n\x07\x46LATTEN\x10\x08\x12\r\n\tHDF5_DATA\x10\t\x12\x0f\n\x0bHDF5_OUTPUT\x10\n\x12\x0e\n\nHINGE_LOSS\x10\x1c\x12\n\n\x06IM2COL\x10\x0b\x12\x0e\n\nIMAGE_DATA\x10\x0c\x12\x11\n\rINFOGAIN_LOSS\x10\r\x12\x11\n\rINNER_PRODUCT\x10\x0e\x12\x07\n\x03LRN\x10\x0f\x12\x0f\n\x0bMEMORY_DATA\x10\x1d\x12\x1d\n\x19MULTINOMIAL_LOGISTIC_LOSS\x10\x10\x12\x07\n\x03MVN\x10\"\x12\x0b\n\x07POOLING\x10\x11\x12\t\n\x05POWER\x10\x1a\x12\x08\n\x04RELU\x10\x12\x12\x0b\n\x07SIGMOID\x10\x13\x12\x1e\n\x1aSIGMOID_CROSS_ENTROPY_LOSS\x10\x1b\x12\x0b\n\x07SILENCE\x10$\x12\x0b\n\x07SOFTMAX\x10\x14\x12\x10\n\x0cSOFTMAX_LOSS\x10\x15\x12\t\n\x05SPLIT\x10\x16\x12\t\n\x05SLICE\x10!\x12\x08\n\x04TANH\x10\x17\x12\x0f\n\x0bWINDOW_DATA\x10\x18\x12\r\n\tTHRESHOLD\x10\x1f\"*\n\x0c\x44imCheckMode\x12\n\n\x06STRICT\x10\x00\x12\x0e\n\nPERMISSIVE\x10\x01\"\x8c\x08\n\x10V0LayerParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x12\n\nnum_output\x18\x03 \x01(\r\x12\x16\n\x08\x62iasterm\x18\x04 \x01(\x08:\x04true\x12\x30\n\rweight_filler\x18\x05 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x06 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x0e\n\x03pad\x18\x07 \x01(\r:\x01\x30\x12\x12\n\nkernelsize\x18\x08 \x01(\r\x12\x10\n\x05group\x18\t \x01(\r:\x01\x31\x12\x11\n\x06stride\x18\n \x01(\r:\x01\x31\x12\x38\n\x04pool\x18\x0b \x01(\x0e\x32%.mo_caffe.V0LayerParameter.PoolMethod:\x03MAX\x12\x1a\n\rdropout_ratio\x18\x0c \x01(\x02:\x03\x30.5\x12\x15\n\nlocal_size\x18\r \x01(\r:\x01\x35\x12\x10\n\x05\x61lpha\x18\x0e \x01(\x02:\x01\x31\x12\x12\n\x04\x62\x65ta\x18\x0f \x01(\x02:\x04\x30.75\x12\x0c\n\x01k\x18\x16 \x01(\x02:\x01\x31\x12\x0e\n\x06source\x18\x10 \x01(\t\x12\x10\n\x05scale\x18\x11 \x01(\x02:\x01\x31\x12\x10\n\x08meanfile\x18\x12 \x01(\t\x12\x11\n\tbatchsize\x18\x13 \x01(\r\x12\x13\n\x08\x63ropsize\x18\x14 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x15 \x01(\x08:\x05\x66\x61lse\x12\"\n\x05\x62lobs\x18\x32 \x03(\x0b\x32\x13.mo_caffe.BlobProto\x12\x10\n\x08\x62lobs_lr\x18\x33 \x03(\x02\x12\x14\n\x0cweight_decay\x18\x34 \x03(\x02\x12\x14\n\trand_skip\x18\x35 \x01(\r:\x01\x30\x12\x1d\n\x10\x64\x65t_fg_threshold\x18\x36 \x01(\x02:\x03\x30.5\x12\x1d\n\x10\x64\x65t_bg_threshold\x18\x37 \x01(\x02:\x03\x30.5\x12\x1d\n\x0f\x64\x65t_fg_fraction\x18\x38 \x01(\x02:\x04\x30.25\x12\x1a\n\x0f\x64\x65t_context_pad\x18: \x01(\r:\x01\x30\x12\x1b\n\rdet_crop_mode\x18; \x01(\t:\x04warp\x12\x12\n\x07new_num\x18< \x01(\x05:\x01\x30\x12\x17\n\x0cnew_channels\x18= \x01(\x05:\x01\x30\x12\x15\n\nnew_height\x18> \x01(\x05:\x01\x30\x12\x14\n\tnew_width\x18? 
\x01(\x05:\x01\x30\x12\x1d\n\x0eshuffle_images\x18@ \x01(\x08:\x05\x66\x61lse\x12\x15\n\nconcat_dim\x18\x41 \x01(\r:\x01\x31\x12\x39\n\x11hdf5_output_param\x18\xe9\x07 \x01(\x0b\x32\x1d.mo_caffe.HDF5OutputParameter\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"Z\n\x0ePReLUParameter\x12)\n\x06\x66iller\x18\x01 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x1d\n\x0e\x63hannel_shared\x18\x02 \x01(\x08:\x05\x66\x61lse\"\x86\x01\n\x13RegionYoloParameter\x12\x11\n\x06\x63oords\x18\x01 \x01(\x05:\x01\x34\x12\x13\n\x07\x63lasses\x18\x02 \x01(\x05:\x02\x32\x30\x12\x0e\n\x03num\x18\x03 \x01(\x05:\x01\x31\x12\x18\n\ndo_softmax\x18\x04 \x01(\x08:\x04true\x12\x0f\n\x07\x61nchors\x18\x05 \x03(\x02\x12\x0c\n\x04mask\x18\x06 \x03(\x05\"\'\n\x12ReorgYoloParameter\x12\x11\n\x06stride\x18\x01 \x01(\x05:\x01\x31\"\xcf\x01\n\x18RandomGeneratorParameter\x12\x1a\n\trand_type\x18\x01 \x01(\t:\x07uniform\x12\x12\n\x03\x65xp\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x0f\n\x04mean\x18\x04 \x01(\x02:\x01\x30\x12\x11\n\x06spread\x18\x05 \x01(\x02:\x01\x30\x12\x0f\n\x04prob\x18\x06 \x01(\x02:\x01\x31\x12\x1c\n\x0e\x61pply_schedule\x18\x07 \x01(\x08:\x04true\x12\x19\n\ndiscretize\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x15\n\nmultiplier\x18\t \x01(\x02:\x01\x31\"`\n\x16\x43oeffScheduleParameter\x12\x14\n\thalf_life\x18\x01 \x01(\x02:\x01\x31\x12\x18\n\rinitial_coeff\x18\x02 \x01(\x02:\x01\x31\x12\x16\n\x0b\x66inal_coeff\x18\x03 \x01(\x02:\x01\x31\"\xde\x07\n\x11\x41ugmentationCoeff\x12\x11\n\x06mirror\x18\x01 \x01(\x02:\x01\x30\x12\r\n\x02\x64x\x18\x02 \x01(\x02:\x01\x30\x12\r\n\x02\x64y\x18\x03 \x01(\x02:\x01\x30\x12\x10\n\x05\x61ngle\x18\x04 \x01(\x02:\x01\x30\x12\x11\n\x06zoom_x\x18\x05 \x01(\x02:\x01\x31\x12\x11\n\x06zoom_y\x18\x06 \x01(\x02:\x01\x31\x12\x10\n\x05gamma\x18\x64 \x01(\x02:\x01\x31\x12\x15\n\nbrightness\x18\x65 \x01(\x02:\x01\x30\x12\x13\n\x08\x63ontrast\x18\x66 \x01(\x02:\x01\x31\x12\x11\n\x06\x63olor1\x18g \x01(\x02:\x01\x31\x12\x11\n\x06\x63olor2\x18h \x01(\x02:\x01\x31\x12\x11\n\x06\x63olor3\x18i \x01(\x02:\x01\x31\x12\x16\n\x0bpow_nomean0\x18\n \x01(\x02:\x01\x31\x12\x16\n\x0bpow_nomean1\x18\x0b \x01(\x02:\x01\x31\x12\x16\n\x0bpow_nomean2\x18\x0c \x01(\x02:\x01\x31\x12\x16\n\x0b\x61\x64\x64_nomean0\x18\r \x01(\x02:\x01\x30\x12\x16\n\x0b\x61\x64\x64_nomean1\x18\x0e \x01(\x02:\x01\x30\x12\x16\n\x0b\x61\x64\x64_nomean2\x18\x0f \x01(\x02:\x01\x30\x12\x17\n\x0cmult_nomean0\x18\x10 \x01(\x02:\x01\x31\x12\x17\n\x0cmult_nomean1\x18\x11 \x01(\x02:\x01\x31\x12\x17\n\x0cmult_nomean2\x18\x12 \x01(\x02:\x01\x31\x12\x18\n\rpow_withmean0\x18\x13 \x01(\x02:\x01\x31\x12\x18\n\rpow_withmean1\x18\x14 \x01(\x02:\x01\x31\x12\x18\n\rpow_withmean2\x18\x15 \x01(\x02:\x01\x31\x12\x18\n\radd_withmean0\x18\x16 \x01(\x02:\x01\x30\x12\x18\n\radd_withmean1\x18\x17 \x01(\x02:\x01\x30\x12\x18\n\radd_withmean2\x18\x18 \x01(\x02:\x01\x30\x12\x19\n\x0emult_withmean0\x18\x19 \x01(\x02:\x01\x31\x12\x19\n\x0emult_withmean1\x18\x1a \x01(\x02:\x01\x31\x12\x19\n\x0emult_withmean2\x18\x1b \x01(\x02:\x01\x31\x12\x14\n\tlmult_pow\x18\x1c \x01(\x02:\x01\x31\x12\x14\n\tlmult_add\x18\x1d \x01(\x02:\x01\x30\x12\x15\n\nlmult_mult\x18\x1e \x01(\x02:\x01\x31\x12\x14\n\tcol_angle\x18\x1f \x01(\x02:\x01\x30\x12\x15\n\nfog_amount\x18& \x01(\x02:\x01\x30\x12\x13\n\x08\x66og_size\x18\' \x01(\x02:\x01\x30\x12\x1c\n\x11motion_blur_angle\x18( \x01(\x02:\x01\x30\x12\x1b\n\x10motion_blur_size\x18) \x01(\x02:\x01\x30\x12\x17\n\x0cshadow_angle\x18* \x01(\x02:\x01\x30\x12\x1a\n\x0fshadow_distance\x18+ 
\x01(\x02:\x01\x30\x12\x1a\n\x0fshadow_strength\x18, \x01(\x02:\x01\x30\x12\x10\n\x05noise\x18- \x01(\x02:\x01\x30\"\xcc\x10\n\x15\x41ugmentationParameter\x12\x15\n\ncrop_width\x18! \x01(\r:\x01\x30\x12\x16\n\x0b\x63rop_height\x18\" \x01(\r:\x01\x30\x12\x19\n\x0fwrite_augmented\x18\x02 \x01(\t:\x00\x12\x1b\n\x0emax_multiplier\x18\x03 \x01(\x02:\x03\x32\x35\x35\x12\"\n\x13\x61ugment_during_test\x18\x04 \x01(\x08:\x05\x66\x61lse\x12\x19\n\x0erecompute_mean\x18\x05 \x01(\r:\x01\x30\x12\x14\n\nwrite_mean\x18\x06 \x01(\t:\x00\x12\x1c\n\x0emean_per_pixel\x18\x07 \x01(\x08:\x04true\x12\x0c\n\x04mean\x18\x12 \x03(\x02\x12\x11\n\x04mode\x18\x08 \x01(\t:\x03\x61\x64\x64\x12\x16\n\x0b\x62ottomwidth\x18P \x01(\r:\x01\x30\x12\x17\n\x0c\x62ottomheight\x18Q \x01(\r:\x01\x30\x12\x0e\n\x03num\x18R \x01(\r:\x01\x30\x12\x18\n\x10\x63hromatic_eigvec\x18S \x03(\x02\x12\x32\n\x06mirror\x18\n \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x35\n\ttranslate\x18\x0b \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x32\n\x06rotate\x18\x0c \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x30\n\x04zoom\x18\r \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07squeeze\x18\x0e \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x37\n\x0btranslate_x\x18\x0f \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x37\n\x0btranslate_y\x18\x10 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x31\n\x05gamma\x18# \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x36\n\nbrightness\x18$ \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08\x63ontrast\x18% \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x31\n\x05\x63olor\x18& \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x35\n\tlmult_pow\x18\x14 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x36\n\nlmult_mult\x18\x15 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x35\n\tlmult_add\x18\x16 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07sat_pow\x18\x17 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08sat_mult\x18\x18 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07sat_add\x18\x19 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07\x63ol_pow\x18\x1a \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08\x63ol_mult\x18\x1b \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07\x63ol_add\x18\x1c \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08ladd_pow\x18\x1d \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x35\n\tladd_mult\x18\x1e \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08ladd_add\x18\x1f \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x36\n\ncol_rotate\x18 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x36\n\nfog_amount\x18\x64 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08\x66og_size\x18\x65 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12=\n\x11motion_blur_angle\x18\x66 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12<\n\x10motion_blur_size\x18g \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x38\n\x0cshadow_angle\x18h \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12;\n\x0fshadow_distance\x18i \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12;\n\x0fshadow_strength\x18j \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x31\n\x05noise\x18k \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\"\x85\x01\n\x11\x46lowWarpParameter\x12\x43\n\nfill_value\x18\x01 
\x01(\x0e\x32).mo_caffe.FlowWarpParameter.FillParameter:\x04ZERO\"+\n\rFillParameter\x12\x08\n\x04ZERO\x10\x01\x12\x10\n\x0cNOT_A_NUMBER\x10\x02\"\xb6\x02\n\x14\x43orrelationParameter\x12\x0e\n\x03pad\x18\x02 \x01(\r:\x01\x30\x12\x13\n\x0bkernel_size\x18\x03 \x01(\r\x12\x18\n\x10max_displacement\x18\x04 \x01(\r\x12\x13\n\x08stride_1\x18\x05 \x01(\r:\x01\x31\x12\x13\n\x08stride_2\x18\x06 \x01(\r:\x01\x31\x12\x1b\n\x10single_direction\x18\x08 \x01(\x05:\x01\x30\x12\x15\n\x06\x64o_abs\x18\x07 \x01(\x08:\x05\x66\x61lse\x12R\n\x10\x63orrelation_type\x18\x0f \x01(\x0e\x32..mo_caffe.CorrelationParameter.CorrelationType:\x08MULTIPLY\"-\n\x0f\x43orrelationType\x12\x0c\n\x08MULTIPLY\x10\x00\x12\x0c\n\x08SUBTRACT\x10\x01\"\xdc\x01\n\x11ResampleParameter\x12\x17\n\tantialias\x18\x04 \x01(\x08:\x04true\x12\r\n\x05width\x18\x01 \x01(\r\x12\x0e\n\x06height\x18\x02 \x01(\r\x12>\n\x04type\x18\x03 \x01(\x0e\x32(.mo_caffe.ResampleParameter.ResampleType:\x06LINEAR\x12\x11\n\x06\x66\x61\x63tor\x18\x05 \x01(\x02:\x01\x31\"<\n\x0cResampleType\x12\x0b\n\x07NEAREST\x10\x01\x12\n\n\x06LINEAR\x10\x02\x12\t\n\x05\x43UBIC\x10\x03\x12\x08\n\x04\x41REA\x10\x04\"z\n\x0e\x41\x63\x63umParameter\x12\x15\n\ntop_height\x18\x01 \x01(\r:\x01\x30\x12\x14\n\ttop_width\x18\x02 \x01(\r:\x01\x30\x12\x1c\n\x11size_divisible_by\x18\x03 \x01(\r:\x01\x30\x12\x1d\n\x0ehave_reference\x18\x04 \x01(\x08:\x05\x66\x61lse\"(\n\x17ShuffleChannelParameter\x12\r\n\x05group\x18\x01 \x02(\r*\x1c\n\x05Phase\x12\t\n\x05TRAIN\x10\x00\x12\x08\n\x04TEST\x10\x01') + serialized_pb=_b('\n\x0emo_caffe.proto\x12\x08mo_caffe\"\x1c\n\tBlobShape\x12\x0f\n\x03\x64im\x18\x01 \x03(\x03\x42\x02\x10\x01\"\xcf\x01\n\tBlobProto\x12\"\n\x05shape\x18\x07 \x01(\x0b\x32\x13.mo_caffe.BlobShape\x12\x10\n\x04\x64\x61ta\x18\x05 \x03(\x02\x42\x02\x10\x01\x12\x10\n\x04\x64iff\x18\x06 \x03(\x02\x42\x02\x10\x01\x12\x17\n\x0b\x64ouble_data\x18\x08 \x03(\x01\x42\x02\x10\x01\x12\x17\n\x0b\x64ouble_diff\x18\t \x03(\x01\x42\x02\x10\x01\x12\x0e\n\x03num\x18\x01 \x01(\x05:\x01\x30\x12\x13\n\x08\x63hannels\x18\x02 \x01(\x05:\x01\x30\x12\x11\n\x06height\x18\x03 \x01(\x05:\x01\x30\x12\x10\n\x05width\x18\x04 \x01(\x05:\x01\x30\"5\n\x0f\x42lobProtoVector\x12\"\n\x05\x62lobs\x18\x01 \x03(\x0b\x32\x13.mo_caffe.BlobProto\"M\n\x1e\x43osineSimilarityBatchParameter\x12\x14\n\tpos_label\x18\x01 \x01(\x01:\x01\x31\x12\x15\n\tneg_label\x18\x02 \x01(\x01:\x02-1\"\x81\x01\n\x05\x44\x61tum\x12\x10\n\x08\x63hannels\x18\x01 \x01(\x05\x12\x0e\n\x06height\x18\x02 \x01(\x05\x12\r\n\x05width\x18\x03 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x04 \x01(\x0c\x12\r\n\x05label\x18\x05 \x01(\x05\x12\x12\n\nfloat_data\x18\x06 \x03(\x02\x12\x16\n\x07\x65ncoded\x18\x07 \x01(\x08:\x05\x66\x61lse\"A\n\x0cLabelMapItem\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05label\x18\x02 \x01(\x05\x12\x14\n\x0c\x64isplay_name\x18\x03 \x01(\t\"0\n\x08LabelMap\x12$\n\x04item\x18\x01 \x03(\x0b\x32\x16.mo_caffe.LabelMapItem\"\x87\x01\n\x0eNormalizedBBox\x12\x0c\n\x04xmin\x18\x01 \x01(\x02\x12\x0c\n\x04ymin\x18\x02 \x01(\x02\x12\x0c\n\x04xmax\x18\x03 \x01(\x02\x12\x0c\n\x04ymax\x18\x04 \x01(\x02\x12\r\n\x05label\x18\x05 \x01(\x05\x12\x11\n\tdifficult\x18\x06 \x01(\x08\x12\r\n\x05score\x18\x07 \x01(\x02\x12\x0c\n\x04size\x18\x08 \x01(\x02\"\xad\x02\n\x0f\x46illerParameter\x12\x16\n\x04type\x18\x01 \x01(\t:\x08\x63onstant\x12\x10\n\x05value\x18\x02 \x01(\x02:\x01\x30\x12\x0e\n\x03min\x18\x03 \x01(\x02:\x01\x30\x12\x0e\n\x03max\x18\x04 \x01(\x02:\x01\x31\x12\x0f\n\x04mean\x18\x05 \x01(\x02:\x01\x30\x12\x0e\n\x03std\x18\x06 
\x01(\x02:\x01\x31\x12\x12\n\x06sparse\x18\x07 \x01(\x05:\x02-1\x12\x45\n\rvariance_norm\x18\x08 \x01(\x0e\x32&.mo_caffe.FillerParameter.VarianceNorm:\x06\x46\x41N_IN\x12\x0c\n\x04\x66ile\x18\t \x01(\t\x12\x10\n\x08\x64iag_val\x18\n \x03(\x02\"4\n\x0cVarianceNorm\x12\n\n\x06\x46\x41N_IN\x10\x00\x12\x0b\n\x07\x46\x41N_OUT\x10\x01\x12\x0b\n\x07\x41VERAGE\x10\x02\"\xed\x02\n\x0cNetParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05input\x18\x03 \x03(\t\x12(\n\x0binput_shape\x18\x08 \x03(\x0b\x32\x13.mo_caffe.BlobShape\x12\x11\n\tinput_dim\x18\x04 \x03(\x05\x12\x1d\n\x0e\x66orce_backward\x18\x05 \x01(\x08:\x05\x66\x61lse\x12!\n\x05state\x18\x06 \x01(\x0b\x32\x12.mo_caffe.NetState\x12\x19\n\ndebug_info\x18\x07 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0cprofile_info\x18\t \x01(\x08:\x05\x66\x61lse\x12\x18\n\x0cprofile_iter\x18\n \x01(\x05:\x02\x35\x30\x12\x1a\n\x0eprofile_warmup\x18\x0b \x01(\x05:\x02\x31\x30\x12\'\n\x05layer\x18\x64 \x03(\x0b\x32\x18.mo_caffe.LayerParameter\x12*\n\x06layers\x18\x02 \x03(\x0b\x32\x1a.mo_caffe.V1LayerParameter\"\xf4\n\n\x0fSolverParameter\x12\x0b\n\x03net\x18\x18 \x01(\t\x12)\n\tnet_param\x18\x19 \x01(\x0b\x32\x16.mo_caffe.NetParameter\x12\x11\n\ttrain_net\x18\x01 \x01(\t\x12\x10\n\x08test_net\x18\x02 \x03(\t\x12/\n\x0ftrain_net_param\x18\x15 \x01(\x0b\x32\x16.mo_caffe.NetParameter\x12.\n\x0etest_net_param\x18\x16 \x03(\x0b\x32\x16.mo_caffe.NetParameter\x12\'\n\x0btrain_state\x18\x1a \x01(\x0b\x32\x12.mo_caffe.NetState\x12&\n\ntest_state\x18\x1b \x03(\x0b\x32\x12.mo_caffe.NetState\x12\x11\n\ttest_iter\x18\x03 \x03(\x05\x12\x18\n\rtest_interval\x18\x04 \x01(\x05:\x01\x30\x12 \n\x11test_compute_loss\x18\x13 \x01(\x08:\x05\x66\x61lse\x12!\n\x13test_initialization\x18 \x01(\x08:\x04true\x12\x0f\n\x07\x62\x61se_lr\x18\x05 \x01(\x02\x12\x0f\n\x07\x64isplay\x18\x06 \x01(\x05\x12\x17\n\x0c\x61verage_loss\x18! 
\x01(\x05:\x01\x31\x12\x10\n\x08max_iter\x18\x07 \x01(\x05\x12\x14\n\titer_size\x18$ \x01(\x05:\x01\x31\x12\x11\n\tlr_policy\x18\x08 \x01(\t\x12\r\n\x05gamma\x18\t \x01(\x02\x12\r\n\x05power\x18\n \x01(\x02\x12\x10\n\x08momentum\x18\x0b \x01(\x02\x12\x14\n\x0cweight_decay\x18\x0c \x01(\x02\x12\x1f\n\x13regularization_type\x18\x1d \x01(\t:\x02L2\x12\x10\n\x08stepsize\x18\r \x01(\x05\x12\x11\n\tstepvalue\x18\" \x03(\x05\x12\x17\n\x0fplateau_winsize\x18* \x03(\x05\x12\x1a\n\x0e\x63lip_gradients\x18# \x01(\x02:\x02-1\x12\x13\n\x08snapshot\x18\x0e \x01(\x05:\x01\x30\x12\x17\n\x0fsnapshot_prefix\x18\x0f \x01(\t\x12\x1c\n\rsnapshot_diff\x18\x10 \x01(\x08:\x05\x66\x61lse\x12N\n\x0fsnapshot_format\x18% \x01(\x0e\x32(.mo_caffe.SolverParameter.SnapshotFormat:\x0b\x42INARYPROTO\x12>\n\x0bsolver_mode\x18\x11 \x01(\x0e\x32$.mo_caffe.SolverParameter.SolverMode:\x03GPU\x12\x14\n\tdevice_id\x18\x12 \x01(\x05:\x01\x30\x12\x17\n\x0brandom_seed\x18\x14 \x01(\x03:\x02-1\x12\x11\n\x04type\x18( \x01(\t:\x03SGD\x12\x14\n\x05\x64\x65lta\x18\x1f \x01(\x02:\x05\x31\x65-08\x12\x18\n\tmomentum2\x18\' \x01(\x02:\x05\x30.999\x12\x17\n\trms_decay\x18& \x01(\x02:\x04\x30.99\x12\x19\n\ndebug_info\x18\x17 \x01(\x08:\x05\x66\x61lse\x12\"\n\x14snapshot_after_train\x18\x1c \x01(\x08:\x04true\x12>\n\x0bsolver_type\x18\x1e \x01(\x0e\x32$.mo_caffe.SolverParameter.SolverType:\x03SGD\x12\x1f\n\x11layer_wise_reduce\x18) \x01(\x08:\x04true\"+\n\x0eSnapshotFormat\x12\x08\n\x04HDF5\x10\x00\x12\x0f\n\x0b\x42INARYPROTO\x10\x01\"\x1e\n\nSolverMode\x12\x07\n\x03\x43PU\x10\x00\x12\x07\n\x03GPU\x10\x01\"U\n\nSolverType\x12\x07\n\x03SGD\x10\x00\x12\x0c\n\x08NESTEROV\x10\x01\x12\x0b\n\x07\x41\x44\x41GRAD\x10\x02\x12\x0b\n\x07RMSPROP\x10\x03\x12\x0c\n\x08\x41\x44\x41\x44\x45LTA\x10\x04\x12\x08\n\x04\x41\x44\x41M\x10\x05\"\xa8\x01\n\x0bSolverState\x12\x0c\n\x04iter\x18\x01 \x01(\x05\x12\x13\n\x0blearned_net\x18\x02 \x01(\t\x12$\n\x07history\x18\x03 \x03(\x0b\x32\x13.mo_caffe.BlobProto\x12\x17\n\x0c\x63urrent_step\x18\x04 \x01(\x05:\x01\x30\x12\x1b\n\x0cminimum_loss\x18\x05 \x01(\x02:\x05\x31\x65+38\x12\x1a\n\x0fiter_last_event\x18\x06 \x01(\x05:\x01\x30\"Q\n\x08NetState\x12$\n\x05phase\x18\x01 \x01(\x0e\x32\x0f.mo_caffe.Phase:\x04TEST\x12\x10\n\x05level\x18\x02 \x01(\x05:\x01\x30\x12\r\n\x05stage\x18\x03 \x03(\t\"v\n\x0cNetStateRule\x12\x1e\n\x05phase\x18\x01 \x01(\x0e\x32\x0f.mo_caffe.Phase\x12\x11\n\tmin_level\x18\x02 \x01(\x05\x12\x11\n\tmax_level\x18\x03 \x01(\x05\x12\r\n\x05stage\x18\x04 \x03(\t\x12\x11\n\tnot_stage\x18\x05 \x03(\t\"\xad\x02\n\x1bSpatialTransformerParameter\x12\x1e\n\x0etransform_type\x18\x01 \x01(\t:\x06\x61\x66\x66ine\x12\x1e\n\x0csampler_type\x18\x02 \x01(\t:\x08\x62ilinear\x12\x10\n\x08output_H\x18\x03 \x01(\x05\x12\x10\n\x08output_W\x18\x04 \x01(\x05\x12\x1b\n\rto_compute_dU\x18\x05 \x01(\x08:\x04true\x12\x11\n\ttheta_1_1\x18\x06 \x01(\x01\x12\x11\n\ttheta_1_2\x18\x07 \x01(\x01\x12\x11\n\ttheta_1_3\x18\x08 \x01(\x01\x12\x11\n\ttheta_2_1\x18\t \x01(\x01\x12\x11\n\ttheta_2_2\x18\n \x01(\x01\x12\x11\n\ttheta_2_3\x18\x0b \x01(\x01\x12\x1b\n\x0c\x64\x65_transform\x18\x0c \x01(\x08:\x05\x66\x61lse\"(\n\x12PowerFileParameter\x12\x12\n\nshift_file\x18\x01 \x01(\t\"5\n\x0fSTLossParameter\x12\x10\n\x08output_H\x18\x01 \x02(\x05\x12\x10\n\x08output_W\x18\x02 \x02(\x05\"%\n\x10LocLossParameter\x12\x11\n\tthreshold\x18\x01 \x02(\x01\"\xa6\x01\n\tParamSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x34\n\nshare_mode\x18\x02 \x01(\x0e\x32 .mo_caffe.ParamSpec.DimCheckMode\x12\x12\n\x07lr_mult\x18\x03 
\x01(\x02:\x01\x31\x12\x15\n\ndecay_mult\x18\x04 \x01(\x02:\x01\x31\"*\n\x0c\x44imCheckMode\x12\n\n\x06STRICT\x10\x00\x12\x0e\n\nPERMISSIVE\x10\x01\"\xf4#\n\x0eLayerParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x0e\n\x06\x62ottom\x18\x03 \x03(\t\x12\x0b\n\x03top\x18\x04 \x03(\t\x12\x1e\n\x05phase\x18\n \x01(\x0e\x32\x0f.mo_caffe.Phase\x12\x13\n\x0bloss_weight\x18\x05 \x03(\x02\x12\"\n\x05param\x18\x06 \x03(\x0b\x32\x13.mo_caffe.ParamSpec\x12\"\n\x05\x62lobs\x18\x07 \x03(\x0b\x32\x13.mo_caffe.BlobProto\x12\x16\n\x0epropagate_down\x18\x0b \x03(\x08\x12\'\n\x07include\x18\x08 \x03(\x0b\x32\x16.mo_caffe.NetStateRule\x12\'\n\x07\x65xclude\x18\t \x03(\x0b\x32\x16.mo_caffe.NetStateRule\x12:\n\x0ftransform_param\x18\x64 \x01(\x0b\x32!.mo_caffe.TransformationParameter\x12+\n\nloss_param\x18\x65 \x01(\x0b\x32\x17.mo_caffe.LossParameter\x12\x33\n\x0e\x61\x63\x63uracy_param\x18\x66 \x01(\x0b\x32\x1b.mo_caffe.AccuracyParameter\x12/\n\x0c\x61rgmax_param\x18g \x01(\x0b\x32\x19.mo_caffe.ArgMaxParameter\x12\x37\n\x10\x62\x61tch_norm_param\x18\x8b\x01 \x01(\x0b\x32\x1c.mo_caffe.BatchNormParameter\x12,\n\nbias_param\x18\x8d\x01 \x01(\x0b\x32\x17.mo_caffe.BiasParameter\x12I\n\x19\x63hannel_permutation_param\x18\x92? \x01(\x0b\x32%.mo_caffe.ChannelPermutationParameter\x12/\n\x0c\x63oncat_param\x18h \x01(\x0b\x32\x19.mo_caffe.ConcatParameter\x12\x42\n\x16\x63ontrastive_loss_param\x18i \x01(\x0b\x32\".mo_caffe.ContrastiveLossParameter\x12\x39\n\x11\x63onvolution_param\x18j \x01(\x0b\x32\x1e.mo_caffe.ConvolutionParameter\x12,\n\ncrop_param\x18\x90\x01 \x01(\x0b\x32\x17.mo_caffe.CropParameter\x12\x39\n\x11\x63tc_decoder_param\x18\x95\x01 \x01(\x0b\x32\x1d.mo_caffe.CTCDecoderParameter\x12\x33\n\x0e\x63tc_loss_param\x18\x94\x01 \x01(\x0b\x32\x1a.mo_caffe.CTCLossParameter\x12+\n\ndata_param\x18k \x01(\x0b\x32\x17.mo_caffe.DataParameter\x12\x31\n\rdropout_param\x18l \x01(\x0b\x32\x1a.mo_caffe.DropoutParameter\x12\x36\n\x10\x64ummy_data_param\x18m \x01(\x0b\x32\x1c.mo_caffe.DummyDataParameter\x12\x31\n\reltwise_param\x18n \x01(\x0b\x32\x1a.mo_caffe.EltwiseParameter\x12*\n\telu_param\x18\x8c\x01 \x01(\x0b\x32\x16.mo_caffe.ELUParameter\x12.\n\x0b\x65mbed_param\x18\x89\x01 \x01(\x0b\x32\x18.mo_caffe.EmbedParameter\x12)\n\texp_param\x18o \x01(\x0b\x32\x16.mo_caffe.ExpParameter\x12\x32\n\rflatten_param\x18\x87\x01 \x01(\x0b\x32\x1a.mo_caffe.FlattenParameter\x12*\n\tgrn_param\x18\xd5\x01 \x01(\x0b\x32\x16.mo_caffe.GRNParameter\x12\x34\n\x0fhdf5_data_param\x18p \x01(\x0b\x32\x1b.mo_caffe.HDF5DataParameter\x12\x38\n\x11hdf5_output_param\x18q \x01(\x0b\x32\x1d.mo_caffe.HDF5OutputParameter\x12\x36\n\x10hinge_loss_param\x18r \x01(\x0b\x32\x1c.mo_caffe.HingeLossParameter\x12\x36\n\x10image_data_param\x18s \x01(\x0b\x32\x1c.mo_caffe.ImageDataParameter\x12<\n\x13infogain_loss_param\x18t \x01(\x0b\x32\x1f.mo_caffe.InfogainLossParameter\x12<\n\x13inner_product_param\x18u \x01(\x0b\x32\x1f.mo_caffe.InnerProductParameter\x12.\n\x0binput_param\x18\x8f\x01 \x01(\x0b\x32\x18.mo_caffe.InputParameter\x12*\n\tlog_param\x18\x86\x01 \x01(\x0b\x32\x16.mo_caffe.LogParameter\x12)\n\tlrn_param\x18v \x01(\x0b\x32\x16.mo_caffe.LRNParameter\x12\x38\n\x11memory_data_param\x18w \x01(\x0b\x32\x1d.mo_caffe.MemoryDataParameter\x12)\n\tmvn_param\x18x \x01(\x0b\x32\x16.mo_caffe.MVNParameter\x12\x36\n\x0fparameter_param\x18\x91\x01 \x01(\x0b\x32\x1c.mo_caffe.ParameterParameter\x12\x31\n\rpooling_param\x18y \x01(\x0b\x32\x1a.mo_caffe.PoolingParameter\x12\x32\n\rpermute_param\x18\x9a\x01 
\x01(\x0b\x32\x1a.mo_caffe.PermuteParameter\x12-\n\x0bpower_param\x18z \x01(\x0b\x32\x18.mo_caffe.PowerParameter\x12.\n\x0bprelu_param\x18\x83\x01 \x01(\x0b\x32\x18.mo_caffe.PReLUParameter\x12\x30\n\x0cpython_param\x18\x82\x01 \x01(\x0b\x32\x19.mo_caffe.PythonParameter\x12\x36\n\x0frecurrent_param\x18\x92\x01 \x01(\x0b\x32\x1c.mo_caffe.RecurrentParameter\x12\x36\n\x0freduction_param\x18\x88\x01 \x01(\x0b\x32\x1c.mo_caffe.ReductionParameter\x12+\n\nrelu_param\x18{ \x01(\x0b\x32\x17.mo_caffe.ReLUParameter\x12\x32\n\rreshape_param\x18\x85\x01 \x01(\x0b\x32\x1a.mo_caffe.ReshapeParameter\x12\x32\n\rreverse_param\x18\x93\x01 \x01(\x0b\x32\x1a.mo_caffe.ReverseParameter\x12.\n\x0bscale_param\x18\x8e\x01 \x01(\x0b\x32\x18.mo_caffe.ScaleParameter\x12\x31\n\rsigmoid_param\x18| \x01(\x0b\x32\x1a.mo_caffe.SigmoidParameter\x12\x31\n\rsoftmax_param\x18} \x01(\x0b\x32\x1a.mo_caffe.SoftmaxParameter\x12*\n\tspp_param\x18\x84\x01 \x01(\x0b\x32\x16.mo_caffe.SPPParameter\x12-\n\x0bslice_param\x18~ \x01(\x0b\x32\x18.mo_caffe.SliceParameter\x12+\n\ntanh_param\x18\x7f \x01(\x0b\x32\x17.mo_caffe.TanHParameter\x12\x36\n\x0fthreshold_param\x18\x80\x01 \x01(\x0b\x32\x1c.mo_caffe.ThresholdParameter\x12,\n\ntile_param\x18\x8a\x01 \x01(\x0b\x32\x17.mo_caffe.TileParameter\x12\x39\n\x11window_data_param\x18\x81\x01 \x01(\x0b\x32\x1d.mo_caffe.WindowDataParameter\x12\x38\n\x08st_param\x18\x96\x01 \x01(\x0b\x32%.mo_caffe.SpatialTransformerParameter\x12\x31\n\rst_loss_param\x18\x97\x01 \x01(\x0b\x32\x19.mo_caffe.STLossParameter\x12\x37\n\x10power_file_param\x18\x98\x01 \x01(\x0b\x32\x1c.mo_caffe.PowerFileParameter\x12\x33\n\x0eloc_loss_param\x18\x99\x01 \x01(\x0b\x32\x1a.mo_caffe.LocLossParameter\x12\x34\n\x0eproposal_param\x18\xc9\x01 \x01(\x0b\x32\x1b.mo_caffe.ProposalParameter\x12P\n\x1d\x63osine_similarity_batch_param\x18\xca\x01 \x01(\x0b\x32(.mo_caffe.CosineSimilarityBatchParameter\x12\x45\n\x0erss_loss_param\x18\xcb\x01 \x01(\x0b\x32,.mo_caffe.RandomSamplingSoftmaxLossParameter\x12\x31\n\nnorm_param\x18\xcc\x01 \x01(\x0b\x32\x1c.mo_caffe.NormalizeParameter\x12\x39\n\x11roi_warping_param\x18\xcd\x01 \x01(\x0b\x32\x1d.mo_caffe.ROIWarpingParameter\x12=\n\x13psroi_pooling_param\x18\xcf\x01 \x01(\x0b\x32\x1f.mo_caffe.PSROIPoolingParameter\x12\x39\n\x11roi_pooling_param\x18\xd0\x01 \x01(\x0b\x32\x1d.mo_caffe.ROIPoolingParameter\x12>\n\x14smooth_l1_loss_param\x18\xd1\x01 \x01(\x0b\x32\x1f.mo_caffe.SmoothL1LossParameter\x12\x46\n\x18\x62ox_annotator_ohem_param\x18\xd2\x01 \x01(\x0b\x32#.mo_caffe.BoxAnnotatorOHEMParameter\x12\x43\n\x16\x64\x65tection_output_param\x18\xd3\x01 \x01(\x0b\x32\".mo_caffe.DetectionOutputParameter\x12\x35\n\x0fprior_box_param\x18\xd4\x01 \x01(\x0b\x32\x1b.mo_caffe.PriorBoxParameter\x12\x39\n\x11region_yolo_param\x18\xd6\x01 \x01(\x0b\x32\x1d.mo_caffe.RegionYoloParameter\x12\x37\n\x10reorg_yolo_param\x18\xd7\x01 \x01(\x0b\x32\x1c.mo_caffe.ReorgYoloParameter\x12.\n\x0brelu6_param\x18\xd8\x01 \x01(\x0b\x32\x18.mo_caffe.ReLU6Parameter\x12\x30\n\x0cinterp_param\x18\xd9\x01 \x01(\x0b\x32\x19.mo_caffe.InterpParameter\x12<\n\x12\x61ugmentation_param\x18\xda\x01 \x01(\x0b\x32\x1f.mo_caffe.AugmentationParameter\x12:\n\x11\x63orrelation_param\x18\xdb\x01 \x01(\x0b\x32\x1e.mo_caffe.CorrelationParameter\x12\x34\n\x0eresample_param\x18\xdc\x01 \x01(\x0b\x32\x1b.mo_caffe.ResampleParameter\x12\x35\n\x0f\x66low_warp_param\x18\xdd\x01 \x01(\x0b\x32\x1b.mo_caffe.FlowWarpParameter\x12.\n\x0b\x61\x63\x63um_param\x18\xde\x01 \x01(\x0b\x32\x18.mo_caffe.AccumParameter\x12?\n\x14\x63oeff_schedule_param\x18\xdf\x01 
\x01(\x0b\x32 .mo_caffe.CoeffScheduleParameter\x12\x41\n\x15shuffle_channel_param\x18\xe0\x01 \x01(\x0b\x32!.mo_caffe.ShuffleChannelParameter\"\x90\x01\n\x0fInterpParameter\x12\x11\n\x06height\x18\x01 \x01(\x05:\x01\x30\x12\x10\n\x05width\x18\x02 \x01(\x05:\x01\x30\x12\x16\n\x0bzoom_factor\x18\x03 \x01(\x05:\x01\x31\x12\x18\n\rshrink_factor\x18\x04 \x01(\x05:\x01\x31\x12\x12\n\x07pad_beg\x18\x05 \x01(\x05:\x01\x30\x12\x12\n\x07pad_end\x18\x06 \x01(\x05:\x01\x30\"n\n\"RandomSamplingSoftmaxLossParameter\x12 \n\x13random_sampling_num\x18\x01 \x01(\x05:\x03\x31\x30\x30\x12&\n\x16random_sampling_policy\x18\x02 \x01(\t:\x06random\"\xc8\x01\n\x11ProposalParameter\x12\x17\n\x0b\x66\x65\x61t_stride\x18\x01 \x01(\r:\x02\x31\x36\x12\x15\n\tbase_size\x18\x02 \x01(\r:\x02\x31\x36\x12\x14\n\x08min_size\x18\x03 \x01(\r:\x02\x31\x36\x12\r\n\x05ratio\x18\x04 \x03(\x02\x12\r\n\x05scale\x18\x05 \x03(\x02\x12\x1a\n\x0cpre_nms_topn\x18\x06 \x01(\r:\x04\x36\x30\x30\x30\x12\x1a\n\rpost_nms_topn\x18\x07 \x01(\r:\x03\x33\x30\x30\x12\x17\n\nnms_thresh\x18\x08 \x01(\x02:\x03\x30.7\"\x95\x01\n\x12NormalizeParameter\x12\x1c\n\x0e\x61\x63ross_spatial\x18\x01 \x01(\x08:\x04true\x12/\n\x0cscale_filler\x18\x02 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x1c\n\x0e\x63hannel_shared\x18\x03 \x01(\x08:\x04true\x12\x12\n\x03\x65ps\x18\x04 \x01(\x02:\x05\x31\x65-10\"!\n\x10PermuteParameter\x12\r\n\x05order\x18\x01 \x03(\r\"\xb6\x01\n\x17TransformationParameter\x12\x10\n\x05scale\x18\x01 \x01(\x02:\x01\x31\x12\x15\n\x06mirror\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x14\n\tcrop_size\x18\x03 \x01(\r:\x01\x30\x12\x11\n\tmean_file\x18\x04 \x01(\t\x12\x12\n\nmean_value\x18\x05 \x03(\x02\x12\x1a\n\x0b\x66orce_color\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x19\n\nforce_gray\x18\x07 \x01(\x08:\x05\x66\x61lse\"\xb4\x02\n\rLossParameter\x12\x14\n\x0cignore_label\x18\x01 \x01(\x05\x12G\n\rnormalization\x18\x03 \x01(\x0e\x32).mo_caffe.LossParameter.NormalizationMode:\x05VALID\x12\x11\n\tnormalize\x18\x02 \x01(\x08\x12\x1f\n\x14pre_fixed_normalizer\x18\x04 \x01(\x02:\x01\x31\x12$\n\x15weight_by_label_freqs\x18\x05 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63lass_weighting\x18\x06 \x03(\x02\"Q\n\x11NormalizationMode\x12\x08\n\x04\x46ULL\x10\x00\x12\t\n\x05VALID\x10\x01\x12\x0e\n\nBATCH_SIZE\x10\x02\x12\r\n\tPRE_FIXED\x10\x03\x12\x08\n\x04NONE\x10\x04\"L\n\x11\x41\x63\x63uracyParameter\x12\x10\n\x05top_k\x18\x01 \x01(\r:\x01\x31\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\x12\x14\n\x0cignore_label\x18\x03 \x01(\x05\"M\n\x0f\x41rgMaxParameter\x12\x1a\n\x0bout_max_val\x18\x01 \x01(\x08:\x05\x66\x61lse\x12\x10\n\x05top_k\x18\x02 \x01(\r:\x01\x31\x12\x0c\n\x04\x61xis\x18\x03 \x01(\x05\"D\n\x18\x43hannelPermutationAction\x12\x0c\n\x04\x63han\x18\x01 \x02(\r\x12\x0c\n\x04\x63opy\x18\x02 \x01(\r\x12\x0c\n\x04\x66ill\x18\x03 \x01(\x02\"\x9a\x01\n\x1b\x43hannelPermutationParameter\x12\x32\n\x06\x61\x63tion\x18\x01 \x03(\x0b\x32\".mo_caffe.ChannelPermutationAction\x12\x12\n\nnum_output\x18\x10 \x02(\r\x12\x1f\n\x10inplace_possible\x18\x11 \x01(\x08:\x05\x66\x61lse\x12\x12\n\x07version\x18\x12 \x01(\x05:\x01\x30\"9\n\x0f\x43oncatParameter\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\x12\x15\n\nconcat_dim\x18\x01 \x01(\r:\x01\x31\"j\n\x12\x42\x61tchNormParameter\x12\x18\n\x10use_global_stats\x18\x01 \x01(\x08\x12&\n\x17moving_average_fraction\x18\x02 \x01(\x02:\x05\x30.999\x12\x12\n\x03\x65ps\x18\x03 \x01(\x02:\x05\x31\x65-05\"J\n\x19\x42oxAnnotatorOHEMParameter\x12\x13\n\x0broi_per_img\x18\x01 \x02(\r\x12\x18\n\x0cignore_label\x18\x02 
\x01(\x05:\x02-1\"`\n\rBiasParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x13\n\x08num_axes\x18\x02 \x01(\x05:\x01\x31\x12)\n\x06\x66iller\x18\x03 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\"L\n\x18\x43ontrastiveLossParameter\x12\x11\n\x06margin\x18\x01 \x01(\x02:\x01\x31\x12\x1d\n\x0elegacy_version\x18\x02 \x01(\x08:\x05\x66\x61lse\"\x85\x04\n\x14\x43onvolutionParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x17\n\tbias_term\x18\x02 \x01(\x08:\x04true\x12\x0b\n\x03pad\x18\x03 \x03(\r\x12\x13\n\x0bkernel_size\x18\x04 \x03(\r\x12\x0e\n\x06stride\x18\x06 \x03(\r\x12\x10\n\x08\x64ilation\x18\x12 \x03(\r\x12\x10\n\x05pad_h\x18\t \x01(\r:\x01\x30\x12\x10\n\x05pad_w\x18\n \x01(\r:\x01\x30\x12\x10\n\x08kernel_h\x18\x0b \x01(\r\x12\x10\n\x08kernel_w\x18\x0c \x01(\r\x12\x10\n\x08stride_h\x18\r \x01(\r\x12\x10\n\x08stride_w\x18\x0e \x01(\r\x12\x10\n\x05group\x18\x05 \x01(\r:\x01\x31\x12\x30\n\rweight_filler\x18\x07 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x08 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12>\n\x06\x65ngine\x18\x0f \x01(\x0e\x32%.mo_caffe.ConvolutionParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x0f\n\x04\x61xis\x18\x10 \x01(\x05:\x01\x31\x12\x1e\n\x0f\x66orce_nd_im2col\x18\x11 \x01(\x08:\x05\x66\x61lse\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"A\n\rCropParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x32\x12\x0e\n\x06offset\x18\x02 \x03(\r\x12\x0f\n\x07\x64imsize\x18\x03 \x03(\r\"P\n\x13\x43TCDecoderParameter\x12\x17\n\x0b\x62lank_index\x18\x01 \x01(\x05:\x02-1\x12 \n\x12\x63tc_merge_repeated\x18\x02 \x01(\x08:\x04true\"\xb2\x01\n\x10\x43TCLossParameter\x12\x17\n\x0coutput_delay\x18\x01 \x01(\x05:\x01\x30\x12\x17\n\x0b\x62lank_index\x18\x02 \x01(\x05:\x02-1\x12+\n\x1cpreprocess_collapse_repeated\x18\x03 \x01(\x08:\x05\x66\x61lse\x12 \n\x12\x63tc_merge_repeated\x18\x04 \x01(\x08:\x04true\x12\x1d\n\x12loss_calculation_t\x18\x05 \x01(\x05:\x01\x30\"\xa7\x02\n\rDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\r\x12\x14\n\trand_skip\x18\x07 \x01(\r:\x01\x30\x12\x34\n\x07\x62\x61\x63kend\x18\x08 \x01(\x0e\x32\x1a.mo_caffe.DataParameter.DB:\x07LEVELDB\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\"\n\x13\x66orce_encoded_color\x18\t \x01(\x08:\x05\x66\x61lse\x12\x13\n\x08prefetch\x18\n \x01(\r:\x01\x34\"\x1b\n\x02\x44\x42\x12\x0b\n\x07LEVELDB\x10\x00\x12\x08\n\x04LMDB\x10\x01\"[\n\x1eNonMaximumSuppressionParameter\x12\x1a\n\rnms_threshold\x18\x01 \x01(\x02:\x03\x30.3\x12\r\n\x05top_k\x18\x02 \x01(\x05\x12\x0e\n\x03\x65ta\x18\x03 \x01(\x02:\x01\x31\"\x99\x04\n\x0fResizeParameter\x12\x0f\n\x04prob\x18\x01 \x01(\x02:\x01\x31\x12@\n\x0bresize_mode\x18\x02 \x01(\x0e\x32%.mo_caffe.ResizeParameter.Resize_mode:\x04WARP\x12\x11\n\x06height\x18\x03 \x01(\r:\x01\x30\x12\x10\n\x05width\x18\x04 \x01(\r:\x01\x30\x12\x17\n\x0cheight_scale\x18\x08 \x01(\r:\x01\x30\x12\x16\n\x0bwidth_scale\x18\t \x01(\r:\x01\x30\x12>\n\x08pad_mode\x18\x05 \x01(\x0e\x32\".mo_caffe.ResizeParameter.Pad_mode:\x08\x43ONSTANT\x12\x11\n\tpad_value\x18\x06 \x03(\x02\x12:\n\x0binterp_mode\x18\x07 
\x03(\x0e\x32%.mo_caffe.ResizeParameter.Interp_mode\"G\n\x0bResize_mode\x12\x08\n\x04WARP\x10\x01\x12\x12\n\x0e\x46IT_SMALL_SIZE\x10\x02\x12\x1a\n\x16\x46IT_LARGE_SIZE_AND_PAD\x10\x03\":\n\x08Pad_mode\x12\x0c\n\x08\x43ONSTANT\x10\x01\x12\x0c\n\x08MIRRORED\x10\x02\x12\x12\n\x0eREPEAT_NEAREST\x10\x03\"I\n\x0bInterp_mode\x12\n\n\x06LINEAR\x10\x01\x12\x08\n\x04\x41REA\x10\x02\x12\x0b\n\x07NEAREST\x10\x03\x12\t\n\x05\x43UBIC\x10\x04\x12\x0c\n\x08LANCZOS4\x10\x05\"\xdb\x01\n\x13SaveOutputParameter\x12\x18\n\x10output_directory\x18\x01 \x01(\t\x12\x1a\n\x12output_name_prefix\x18\x02 \x01(\t\x12\x15\n\routput_format\x18\x03 \x01(\t\x12\x16\n\x0elabel_map_file\x18\x04 \x01(\t\x12\x16\n\x0ename_size_file\x18\x05 \x01(\t\x12\x16\n\x0enum_test_image\x18\x06 \x01(\r\x12/\n\x0cresize_param\x18\x07 \x01(\x0b\x32\x19.mo_caffe.ResizeParameter\"\xbd\x04\n\x18\x44\x65tectionOutputParameter\x12\x13\n\x0bnum_classes\x18\x01 \x01(\r\x12\x1c\n\x0eshare_location\x18\x02 \x01(\x08:\x04true\x12\x1e\n\x13\x62\x61\x63kground_label_id\x18\x03 \x01(\x05:\x01\x30\x12;\n\tnms_param\x18\x04 \x01(\x0b\x32(.mo_caffe.NonMaximumSuppressionParameter\x12\x38\n\x11save_output_param\x18\x05 \x01(\x0b\x32\x1d.mo_caffe.SaveOutputParameter\x12?\n\tcode_type\x18\x06 \x01(\x0e\x32$.mo_caffe.PriorBoxParameter.CodeType:\x06\x43ORNER\x12)\n\x1avariance_encoded_in_target\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x16\n\nkeep_top_k\x18\x07 \x01(\x05:\x02-1\x12\x1c\n\x14\x63onfidence_threshold\x18\t \x01(\x02\x12\x18\n\tvisualize\x18\n \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x13visualize_threshold\x18\x0b \x01(\x02\x12\x11\n\tsave_file\x18\x0c \x01(\t\x12\x17\n\x0binput_width\x18\r \x01(\x05:\x02-1\x12\x18\n\x0cinput_height\x18\x0e \x01(\x05:\x02-1\x12\x18\n\nnormalized\x18\x0f \x01(\x08:\x04true\x12\x1e\n\x10objectness_score\x18\x10 \x01(\x02:\x04\x30.01\".\n\x10\x44ropoutParameter\x12\x1a\n\rdropout_ratio\x18\x01 \x01(\x02:\x03\x30.5\"\xa6\x01\n\x12\x44ummyDataParameter\x12.\n\x0b\x64\x61ta_filler\x18\x01 \x03(\x0b\x32\x19.mo_caffe.FillerParameter\x12\"\n\x05shape\x18\x06 \x03(\x0b\x32\x13.mo_caffe.BlobShape\x12\x0b\n\x03num\x18\x02 \x03(\r\x12\x10\n\x08\x63hannels\x18\x03 \x03(\r\x12\x0e\n\x06height\x18\x04 \x03(\r\x12\r\n\x05width\x18\x05 \x03(\r\"\xa8\x01\n\x10\x45ltwiseParameter\x12<\n\toperation\x18\x01 \x01(\x0e\x32$.mo_caffe.EltwiseParameter.EltwiseOp:\x03SUM\x12\r\n\x05\x63oeff\x18\x02 \x03(\x02\x12\x1e\n\x10stable_prod_grad\x18\x03 \x01(\x08:\x04true\"\'\n\tEltwiseOp\x12\x08\n\x04PROD\x10\x00\x12\x07\n\x03SUM\x10\x01\x12\x07\n\x03MAX\x10\x02\" \n\x0c\x45LUParameter\x12\x10\n\x05\x61lpha\x18\x01 \x01(\x02:\x01\x31\"\xb2\x01\n\x0e\x45mbedParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x11\n\tinput_dim\x18\x02 \x01(\r\x12\x17\n\tbias_term\x18\x03 \x01(\x08:\x04true\x12\x30\n\rweight_filler\x18\x04 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x05 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\"D\n\x0c\x45xpParameter\x12\x10\n\x04\x62\x61se\x18\x01 \x01(\x02:\x02-1\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"9\n\x10\x46lattenParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x14\n\x08\x65nd_axis\x18\x02 \x01(\x05:\x02-1\"O\n\x11HDF5DataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x12\n\nbatch_size\x18\x02 \x01(\r\x12\x16\n\x07shuffle\x18\x03 \x01(\x08:\x05\x66\x61lse\"(\n\x13HDF5OutputParameter\x12\x11\n\tfile_name\x18\x01 \x01(\t\"a\n\x12HingeLossParameter\x12\x33\n\x04norm\x18\x01 
\x01(\x0e\x32!.mo_caffe.HingeLossParameter.Norm:\x02L1\"\x16\n\x04Norm\x12\x06\n\x02L1\x10\x01\x12\x06\n\x02L2\x10\x02\"\x97\x02\n\x12ImageDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x15\n\nbatch_size\x18\x04 \x01(\r:\x01\x31\x12\x14\n\trand_skip\x18\x07 \x01(\r:\x01\x30\x12\x16\n\x07shuffle\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x15\n\nnew_height\x18\t \x01(\r:\x01\x30\x12\x14\n\tnew_width\x18\n \x01(\r:\x01\x30\x12\x16\n\x08is_color\x18\x0b \x01(\x08:\x04true\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x15\n\x0broot_folder\x18\x0c \x01(\t:\x00\"\'\n\x15InfogainLossParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\"\xd1\x01\n\x15InnerProductParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x17\n\tbias_term\x18\x02 \x01(\x08:\x04true\x12\x30\n\rweight_filler\x18\x03 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x04 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x0f\n\x04\x61xis\x18\x05 \x01(\x05:\x01\x31\x12\x18\n\ttranspose\x18\x06 \x01(\x08:\x05\x66\x61lse\"4\n\x0eInputParameter\x12\"\n\x05shape\x18\x01 \x03(\x0b\x32\x13.mo_caffe.BlobShape\"D\n\x0cLogParameter\x12\x10\n\x04\x62\x61se\x18\x01 \x01(\x02:\x02-1\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"\xbe\x02\n\x0cLRNParameter\x12\x15\n\nlocal_size\x18\x01 \x01(\r:\x01\x35\x12\x10\n\x05\x61lpha\x18\x02 \x01(\x02:\x01\x31\x12\x12\n\x04\x62\x65ta\x18\x03 \x01(\x02:\x04\x30.75\x12G\n\x0bnorm_region\x18\x04 \x01(\x0e\x32!.mo_caffe.LRNParameter.NormRegion:\x0f\x41\x43ROSS_CHANNELS\x12\x0c\n\x01k\x18\x05 \x01(\x02:\x01\x31\x12\x36\n\x06\x65ngine\x18\x06 \x01(\x0e\x32\x1d.mo_caffe.LRNParameter.Engine:\x07\x44\x45\x46\x41ULT\"5\n\nNormRegion\x12\x13\n\x0f\x41\x43ROSS_CHANNELS\x10\x00\x12\x12\n\x0eWITHIN_CHANNEL\x10\x01\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\x1f\n\x0cGRNParameter\x12\x0f\n\x04\x62ias\x18\x01 \x01(\x02:\x01\x31\"Z\n\x13MemoryDataParameter\x12\x12\n\nbatch_size\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\x12\x0e\n\x06height\x18\x03 \x01(\r\x12\r\n\x05width\x18\x04 \x01(\r\"d\n\x0cMVNParameter\x12 \n\x12normalize_variance\x18\x01 \x01(\x08:\x04true\x12\x1e\n\x0f\x61\x63ross_channels\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x12\n\x03\x65ps\x18\x03 \x01(\x02:\x05\x31\x65-09\"8\n\x12ParameterParameter\x12\"\n\x05shape\x18\x01 \x01(\x0b\x32\x13.mo_caffe.BlobShape\"\xc1\x03\n\x10PoolingParameter\x12\x38\n\x04pool\x18\x01 \x01(\x0e\x32%.mo_caffe.PoolingParameter.PoolMethod:\x03MAX\x12\x0e\n\x03pad\x18\x04 \x01(\r:\x01\x30\x12\x10\n\x05pad_h\x18\t \x01(\r:\x01\x30\x12\x10\n\x05pad_w\x18\n \x01(\r:\x01\x30\x12\x13\n\x0bkernel_size\x18\x02 \x01(\r\x12\x10\n\x08kernel_h\x18\x05 \x01(\r\x12\x10\n\x08kernel_w\x18\x06 \x01(\r\x12\x11\n\x06stride\x18\x03 \x01(\r:\x01\x31\x12\x10\n\x08stride_h\x18\x07 \x01(\r\x12\x10\n\x08stride_w\x18\x08 \x01(\r\x12:\n\x06\x65ngine\x18\x0b \x01(\x0e\x32!.mo_caffe.PoolingParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x1d\n\x0eglobal_pooling\x18\x0c \x01(\x08:\x05\x66\x61lse\x12\x17\n\tceil_mode\x18\r \x01(\x08:\x04true\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"F\n\x0ePowerParameter\x12\x10\n\x05power\x18\x01 
\x01(\x02:\x01\x31\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"\xd4\x02\n\x11PriorBoxParameter\x12\x10\n\x08min_size\x18\x01 \x03(\x02\x12\x10\n\x08max_size\x18\x02 \x03(\x02\x12\x14\n\x0c\x61spect_ratio\x18\x03 \x03(\x02\x12\x12\n\x04\x66lip\x18\x04 \x01(\x08:\x04true\x12\x13\n\x04\x63lip\x18\x05 \x01(\x08:\x05\x66\x61lse\x12\x10\n\x08variance\x18\x06 \x03(\x02\x12\x10\n\x08img_size\x18\x07 \x01(\r\x12\r\n\x05img_h\x18\x08 \x01(\r\x12\r\n\x05img_w\x18\t \x01(\r\x12\x0c\n\x04step\x18\n \x01(\x02\x12\x0e\n\x06step_h\x18\x0b \x01(\x02\x12\x0e\n\x06step_w\x18\x0c \x01(\x02\x12\x13\n\x06offset\x18\r \x01(\x02:\x03\x30.5\x12\r\n\x05width\x18\x0e \x03(\x02\x12\x0e\n\x06height\x18\x0f \x03(\x02\"8\n\x08\x43odeType\x12\n\n\x06\x43ORNER\x10\x01\x12\x0f\n\x0b\x43\x45NTER_SIZE\x10\x02\x12\x0f\n\x0b\x43ORNER_SIZE\x10\x03\"V\n\x15PSROIPoolingParameter\x12\x15\n\rspatial_scale\x18\x01 \x02(\x02\x12\x12\n\noutput_dim\x18\x02 \x02(\x05\x12\x12\n\ngroup_size\x18\x03 \x02(\x05\"g\n\x0fPythonParameter\x12\x0e\n\x06module\x18\x01 \x01(\t\x12\r\n\x05layer\x18\x02 \x01(\t\x12\x13\n\tparam_str\x18\x03 \x01(\t:\x00\x12 \n\x11share_in_parallel\x18\x04 \x01(\x08:\x05\x66\x61lse\"\xc6\x01\n\x12RecurrentParameter\x12\x15\n\nnum_output\x18\x01 \x01(\r:\x01\x30\x12\x30\n\rweight_filler\x18\x02 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x03 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x19\n\ndebug_info\x18\x04 \x01(\x08:\x05\x66\x61lse\x12\x1c\n\rexpose_hidden\x18\x05 \x01(\x08:\x05\x66\x61lse\"\xb0\x01\n\x12ReductionParameter\x12@\n\toperation\x18\x01 \x01(\x0e\x32(.mo_caffe.ReductionParameter.ReductionOp:\x03SUM\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x30\x12\x10\n\x05\x63oeff\x18\x03 \x01(\x02:\x01\x31\"5\n\x0bReductionOp\x12\x07\n\x03SUM\x10\x01\x12\x08\n\x04\x41SUM\x10\x02\x12\t\n\x05SUMSQ\x10\x03\x12\x08\n\x04MEAN\x10\x04\"\x90\x01\n\rReLUParameter\x12\x19\n\x0enegative_slope\x18\x01 \x01(\x02:\x01\x30\x12\x37\n\x06\x65ngine\x18\x02 \x01(\x0e\x32\x1e.mo_caffe.ReLUParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\x1e\n\x0eReLU6Parameter\x12\x0c\n\x01n\x18\x01 \x01(\x02:\x01\x36\"]\n\x10ReshapeParameter\x12\"\n\x05shape\x18\x01 \x01(\x0b\x32\x13.mo_caffe.BlobShape\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x30\x12\x14\n\x08num_axes\x18\x03 \x01(\x05:\x02-1\"#\n\x10ReverseParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x30\"Y\n\x13ROIPoolingParameter\x12\x13\n\x08pooled_h\x18\x01 \x01(\r:\x01\x30\x12\x13\n\x08pooled_w\x18\x02 \x01(\r:\x01\x30\x12\x18\n\rspatial_scale\x18\x03 \x01(\x02:\x01\x31\"]\n\x17ROIWarpingTestParameter\x12\x13\n\x08pooled_h\x18\x01 \x01(\r:\x01\x30\x12\x13\n\x08pooled_w\x18\x02 \x01(\r:\x01\x30\x12\x18\n\rspatial_scale\x18\x03 \x01(\x02:\x01\x31\"Y\n\x13ROIWarpingParameter\x12\x13\n\x08pooled_h\x18\x01 \x01(\r:\x01\x30\x12\x13\n\x08pooled_w\x18\x02 \x01(\r:\x01\x30\x12\x18\n\rspatial_scale\x18\x03 \x01(\x02:\x01\x31\"\xab\x01\n\x0eScaleParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x13\n\x08num_axes\x18\x02 \x01(\x05:\x01\x31\x12)\n\x06\x66iller\x18\x03 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x18\n\tbias_term\x18\x04 \x01(\x08:\x05\x66\x61lse\x12.\n\x0b\x62ias_filler\x18\x05 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\"{\n\x10SigmoidParameter\x12:\n\x06\x65ngine\x18\x01 
\x01(\x0e\x32!.mo_caffe.SigmoidParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"L\n\x0eSliceParameter\x12\x0f\n\x04\x61xis\x18\x03 \x01(\x05:\x01\x31\x12\x13\n\x0bslice_point\x18\x02 \x03(\r\x12\x14\n\tslice_dim\x18\x01 \x01(\r:\x01\x31\")\n\x15SmoothL1LossParameter\x12\x10\n\x05sigma\x18\x01 \x01(\x02:\x01\x31\"\x8c\x01\n\x10SoftmaxParameter\x12:\n\x06\x65ngine\x18\x01 \x01(\x0e\x32!.mo_caffe.SoftmaxParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"u\n\rTanHParameter\x12\x37\n\x06\x65ngine\x18\x01 \x01(\x0e\x32\x1e.mo_caffe.TanHParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"/\n\rTileParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\r\n\x05tiles\x18\x02 \x01(\x05\"*\n\x12ThresholdParameter\x12\x14\n\tthreshold\x18\x01 \x01(\x02:\x01\x30\"\xc1\x02\n\x13WindowDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\r\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x19\n\x0c\x66g_threshold\x18\x07 \x01(\x02:\x03\x30.5\x12\x19\n\x0c\x62g_threshold\x18\x08 \x01(\x02:\x03\x30.5\x12\x19\n\x0b\x66g_fraction\x18\t \x01(\x02:\x04\x30.25\x12\x16\n\x0b\x63ontext_pad\x18\n \x01(\r:\x01\x30\x12\x17\n\tcrop_mode\x18\x0b \x01(\t:\x04warp\x12\x1b\n\x0c\x63\x61\x63he_images\x18\x0c \x01(\x08:\x05\x66\x61lse\x12\x15\n\x0broot_folder\x18\r \x01(\t:\x00\"\xf1\x01\n\x0cSPPParameter\x12\x16\n\x0epyramid_height\x18\x01 \x01(\r\x12\x34\n\x04pool\x18\x02 \x01(\x0e\x32!.mo_caffe.SPPParameter.PoolMethod:\x03MAX\x12\x36\n\x06\x65ngine\x18\x06 \x01(\x0e\x32\x1d.mo_caffe.SPPParameter.Engine:\x07\x44\x45\x46\x41ULT\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\xcc\x14\n\x10V1LayerParameter\x12\x0e\n\x06\x62ottom\x18\x02 \x03(\t\x12\x0b\n\x03top\x18\x03 \x03(\t\x12\x0c\n\x04name\x18\x04 \x01(\t\x12\'\n\x07include\x18 \x03(\x0b\x32\x16.mo_caffe.NetStateRule\x12\'\n\x07\x65xclude\x18! 
\x03(\x0b\x32\x16.mo_caffe.NetStateRule\x12\x32\n\x04type\x18\x05 \x01(\x0e\x32$.mo_caffe.V1LayerParameter.LayerType\x12\"\n\x05\x62lobs\x18\x06 \x03(\x0b\x32\x13.mo_caffe.BlobProto\x12\x0e\n\x05param\x18\xe9\x07 \x03(\t\x12\x41\n\x0f\x62lob_share_mode\x18\xea\x07 \x03(\x0e\x32\'.mo_caffe.V1LayerParameter.DimCheckMode\x12\x10\n\x08\x62lobs_lr\x18\x07 \x03(\x02\x12\x14\n\x0cweight_decay\x18\x08 \x03(\x02\x12\x13\n\x0bloss_weight\x18# \x03(\x02\x12\x33\n\x0e\x61\x63\x63uracy_param\x18\x1b \x01(\x0b\x32\x1b.mo_caffe.AccuracyParameter\x12/\n\x0c\x61rgmax_param\x18\x17 \x01(\x0b\x32\x19.mo_caffe.ArgMaxParameter\x12/\n\x0c\x63oncat_param\x18\t \x01(\x0b\x32\x19.mo_caffe.ConcatParameter\x12\x42\n\x16\x63ontrastive_loss_param\x18( \x01(\x0b\x32\".mo_caffe.ContrastiveLossParameter\x12\x39\n\x11\x63onvolution_param\x18\n \x01(\x0b\x32\x1e.mo_caffe.ConvolutionParameter\x12+\n\ndata_param\x18\x0b \x01(\x0b\x32\x17.mo_caffe.DataParameter\x12\x31\n\rdropout_param\x18\x0c \x01(\x0b\x32\x1a.mo_caffe.DropoutParameter\x12\x36\n\x10\x64ummy_data_param\x18\x1a \x01(\x0b\x32\x1c.mo_caffe.DummyDataParameter\x12\x31\n\reltwise_param\x18\x18 \x01(\x0b\x32\x1a.mo_caffe.EltwiseParameter\x12)\n\texp_param\x18) \x01(\x0b\x32\x16.mo_caffe.ExpParameter\x12\x34\n\x0fhdf5_data_param\x18\r \x01(\x0b\x32\x1b.mo_caffe.HDF5DataParameter\x12\x38\n\x11hdf5_output_param\x18\x0e \x01(\x0b\x32\x1d.mo_caffe.HDF5OutputParameter\x12\x36\n\x10hinge_loss_param\x18\x1d \x01(\x0b\x32\x1c.mo_caffe.HingeLossParameter\x12\x36\n\x10image_data_param\x18\x0f \x01(\x0b\x32\x1c.mo_caffe.ImageDataParameter\x12<\n\x13infogain_loss_param\x18\x10 \x01(\x0b\x32\x1f.mo_caffe.InfogainLossParameter\x12<\n\x13inner_product_param\x18\x11 \x01(\x0b\x32\x1f.mo_caffe.InnerProductParameter\x12)\n\tlrn_param\x18\x12 \x01(\x0b\x32\x16.mo_caffe.LRNParameter\x12\x38\n\x11memory_data_param\x18\x16 \x01(\x0b\x32\x1d.mo_caffe.MemoryDataParameter\x12)\n\tmvn_param\x18\" \x01(\x0b\x32\x16.mo_caffe.MVNParameter\x12\x31\n\rpooling_param\x18\x13 \x01(\x0b\x32\x1a.mo_caffe.PoolingParameter\x12-\n\x0bpower_param\x18\x15 \x01(\x0b\x32\x18.mo_caffe.PowerParameter\x12+\n\nrelu_param\x18\x1e \x01(\x0b\x32\x17.mo_caffe.ReLUParameter\x12\x31\n\rsigmoid_param\x18& \x01(\x0b\x32\x1a.mo_caffe.SigmoidParameter\x12\x31\n\rsoftmax_param\x18\' \x01(\x0b\x32\x1a.mo_caffe.SoftmaxParameter\x12-\n\x0bslice_param\x18\x1f \x01(\x0b\x32\x18.mo_caffe.SliceParameter\x12+\n\ntanh_param\x18% \x01(\x0b\x32\x17.mo_caffe.TanHParameter\x12\x35\n\x0fthreshold_param\x18\x19 \x01(\x0b\x32\x1c.mo_caffe.ThresholdParameter\x12\x38\n\x11window_data_param\x18\x14 \x01(\x0b\x32\x1d.mo_caffe.WindowDataParameter\x12:\n\x0ftransform_param\x18$ \x01(\x0b\x32!.mo_caffe.TransformationParameter\x12+\n\nloss_param\x18* \x01(\x0b\x32\x17.mo_caffe.LossParameter\x12)\n\x05layer\x18\x01 \x01(\x0b\x32\x1a.mo_caffe.V0LayerParameter\"\xd8\x04\n\tLayerType\x12\x08\n\x04NONE\x10\x00\x12\n\n\x06\x41\x42SVAL\x10#\x12\x0c\n\x08\x41\x43\x43URACY\x10\x01\x12\n\n\x06\x41RGMAX\x10\x1e\x12\x08\n\x04\x42NLL\x10\x02\x12\n\n\x06\x43ONCAT\x10\x03\x12\x14\n\x10\x43ONTRASTIVE_LOSS\x10%\x12\x0f\n\x0b\x43ONVOLUTION\x10\x04\x12\x08\n\x04\x44\x41TA\x10\x05\x12\x11\n\rDECONVOLUTION\x10\'\x12\x0b\n\x07\x44ROPOUT\x10\x06\x12\x0e\n\nDUMMY_DATA\x10 
\x12\x12\n\x0e\x45UCLIDEAN_LOSS\x10\x07\x12\x0b\n\x07\x45LTWISE\x10\x19\x12\x07\n\x03\x45XP\x10&\x12\x0b\n\x07\x46LATTEN\x10\x08\x12\r\n\tHDF5_DATA\x10\t\x12\x0f\n\x0bHDF5_OUTPUT\x10\n\x12\x0e\n\nHINGE_LOSS\x10\x1c\x12\n\n\x06IM2COL\x10\x0b\x12\x0e\n\nIMAGE_DATA\x10\x0c\x12\x11\n\rINFOGAIN_LOSS\x10\r\x12\x11\n\rINNER_PRODUCT\x10\x0e\x12\x07\n\x03LRN\x10\x0f\x12\x0f\n\x0bMEMORY_DATA\x10\x1d\x12\x1d\n\x19MULTINOMIAL_LOGISTIC_LOSS\x10\x10\x12\x07\n\x03MVN\x10\"\x12\x0b\n\x07POOLING\x10\x11\x12\t\n\x05POWER\x10\x1a\x12\x08\n\x04RELU\x10\x12\x12\x0b\n\x07SIGMOID\x10\x13\x12\x1e\n\x1aSIGMOID_CROSS_ENTROPY_LOSS\x10\x1b\x12\x0b\n\x07SILENCE\x10$\x12\x0b\n\x07SOFTMAX\x10\x14\x12\x10\n\x0cSOFTMAX_LOSS\x10\x15\x12\t\n\x05SPLIT\x10\x16\x12\t\n\x05SLICE\x10!\x12\x08\n\x04TANH\x10\x17\x12\x0f\n\x0bWINDOW_DATA\x10\x18\x12\r\n\tTHRESHOLD\x10\x1f\"*\n\x0c\x44imCheckMode\x12\n\n\x06STRICT\x10\x00\x12\x0e\n\nPERMISSIVE\x10\x01\"\x8c\x08\n\x10V0LayerParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x12\n\nnum_output\x18\x03 \x01(\r\x12\x16\n\x08\x62iasterm\x18\x04 \x01(\x08:\x04true\x12\x30\n\rweight_filler\x18\x05 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x06 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x0e\n\x03pad\x18\x07 \x01(\r:\x01\x30\x12\x12\n\nkernelsize\x18\x08 \x01(\r\x12\x10\n\x05group\x18\t \x01(\r:\x01\x31\x12\x11\n\x06stride\x18\n \x01(\r:\x01\x31\x12\x38\n\x04pool\x18\x0b \x01(\x0e\x32%.mo_caffe.V0LayerParameter.PoolMethod:\x03MAX\x12\x1a\n\rdropout_ratio\x18\x0c \x01(\x02:\x03\x30.5\x12\x15\n\nlocal_size\x18\r \x01(\r:\x01\x35\x12\x10\n\x05\x61lpha\x18\x0e \x01(\x02:\x01\x31\x12\x12\n\x04\x62\x65ta\x18\x0f \x01(\x02:\x04\x30.75\x12\x0c\n\x01k\x18\x16 \x01(\x02:\x01\x31\x12\x0e\n\x06source\x18\x10 \x01(\t\x12\x10\n\x05scale\x18\x11 \x01(\x02:\x01\x31\x12\x10\n\x08meanfile\x18\x12 \x01(\t\x12\x11\n\tbatchsize\x18\x13 \x01(\r\x12\x13\n\x08\x63ropsize\x18\x14 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x15 \x01(\x08:\x05\x66\x61lse\x12\"\n\x05\x62lobs\x18\x32 \x03(\x0b\x32\x13.mo_caffe.BlobProto\x12\x10\n\x08\x62lobs_lr\x18\x33 \x03(\x02\x12\x14\n\x0cweight_decay\x18\x34 \x03(\x02\x12\x14\n\trand_skip\x18\x35 \x01(\r:\x01\x30\x12\x1d\n\x10\x64\x65t_fg_threshold\x18\x36 \x01(\x02:\x03\x30.5\x12\x1d\n\x10\x64\x65t_bg_threshold\x18\x37 \x01(\x02:\x03\x30.5\x12\x1d\n\x0f\x64\x65t_fg_fraction\x18\x38 \x01(\x02:\x04\x30.25\x12\x1a\n\x0f\x64\x65t_context_pad\x18: \x01(\r:\x01\x30\x12\x1b\n\rdet_crop_mode\x18; \x01(\t:\x04warp\x12\x12\n\x07new_num\x18< \x01(\x05:\x01\x30\x12\x17\n\x0cnew_channels\x18= \x01(\x05:\x01\x30\x12\x15\n\nnew_height\x18> \x01(\x05:\x01\x30\x12\x14\n\tnew_width\x18? 
\x01(\x05:\x01\x30\x12\x1d\n\x0eshuffle_images\x18@ \x01(\x08:\x05\x66\x61lse\x12\x15\n\nconcat_dim\x18\x41 \x01(\r:\x01\x31\x12\x39\n\x11hdf5_output_param\x18\xe9\x07 \x01(\x0b\x32\x1d.mo_caffe.HDF5OutputParameter\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"Z\n\x0ePReLUParameter\x12)\n\x06\x66iller\x18\x01 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x1d\n\x0e\x63hannel_shared\x18\x02 \x01(\x08:\x05\x66\x61lse\"\x86\x01\n\x13RegionYoloParameter\x12\x11\n\x06\x63oords\x18\x01 \x01(\x05:\x01\x34\x12\x13\n\x07\x63lasses\x18\x02 \x01(\x05:\x02\x32\x30\x12\x0e\n\x03num\x18\x03 \x01(\x05:\x01\x31\x12\x18\n\ndo_softmax\x18\x04 \x01(\x08:\x04true\x12\x0f\n\x07\x61nchors\x18\x05 \x03(\x02\x12\x0c\n\x04mask\x18\x06 \x03(\x05\"\'\n\x12ReorgYoloParameter\x12\x11\n\x06stride\x18\x01 \x01(\x05:\x01\x31\"\xcf\x01\n\x18RandomGeneratorParameter\x12\x1a\n\trand_type\x18\x01 \x01(\t:\x07uniform\x12\x12\n\x03\x65xp\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x0f\n\x04mean\x18\x04 \x01(\x02:\x01\x30\x12\x11\n\x06spread\x18\x05 \x01(\x02:\x01\x30\x12\x0f\n\x04prob\x18\x06 \x01(\x02:\x01\x31\x12\x1c\n\x0e\x61pply_schedule\x18\x07 \x01(\x08:\x04true\x12\x19\n\ndiscretize\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x15\n\nmultiplier\x18\t \x01(\x02:\x01\x31\"`\n\x16\x43oeffScheduleParameter\x12\x14\n\thalf_life\x18\x01 \x01(\x02:\x01\x31\x12\x18\n\rinitial_coeff\x18\x02 \x01(\x02:\x01\x31\x12\x16\n\x0b\x66inal_coeff\x18\x03 \x01(\x02:\x01\x31\"\xde\x07\n\x11\x41ugmentationCoeff\x12\x11\n\x06mirror\x18\x01 \x01(\x02:\x01\x30\x12\r\n\x02\x64x\x18\x02 \x01(\x02:\x01\x30\x12\r\n\x02\x64y\x18\x03 \x01(\x02:\x01\x30\x12\x10\n\x05\x61ngle\x18\x04 \x01(\x02:\x01\x30\x12\x11\n\x06zoom_x\x18\x05 \x01(\x02:\x01\x31\x12\x11\n\x06zoom_y\x18\x06 \x01(\x02:\x01\x31\x12\x10\n\x05gamma\x18\x64 \x01(\x02:\x01\x31\x12\x15\n\nbrightness\x18\x65 \x01(\x02:\x01\x30\x12\x13\n\x08\x63ontrast\x18\x66 \x01(\x02:\x01\x31\x12\x11\n\x06\x63olor1\x18g \x01(\x02:\x01\x31\x12\x11\n\x06\x63olor2\x18h \x01(\x02:\x01\x31\x12\x11\n\x06\x63olor3\x18i \x01(\x02:\x01\x31\x12\x16\n\x0bpow_nomean0\x18\n \x01(\x02:\x01\x31\x12\x16\n\x0bpow_nomean1\x18\x0b \x01(\x02:\x01\x31\x12\x16\n\x0bpow_nomean2\x18\x0c \x01(\x02:\x01\x31\x12\x16\n\x0b\x61\x64\x64_nomean0\x18\r \x01(\x02:\x01\x30\x12\x16\n\x0b\x61\x64\x64_nomean1\x18\x0e \x01(\x02:\x01\x30\x12\x16\n\x0b\x61\x64\x64_nomean2\x18\x0f \x01(\x02:\x01\x30\x12\x17\n\x0cmult_nomean0\x18\x10 \x01(\x02:\x01\x31\x12\x17\n\x0cmult_nomean1\x18\x11 \x01(\x02:\x01\x31\x12\x17\n\x0cmult_nomean2\x18\x12 \x01(\x02:\x01\x31\x12\x18\n\rpow_withmean0\x18\x13 \x01(\x02:\x01\x31\x12\x18\n\rpow_withmean1\x18\x14 \x01(\x02:\x01\x31\x12\x18\n\rpow_withmean2\x18\x15 \x01(\x02:\x01\x31\x12\x18\n\radd_withmean0\x18\x16 \x01(\x02:\x01\x30\x12\x18\n\radd_withmean1\x18\x17 \x01(\x02:\x01\x30\x12\x18\n\radd_withmean2\x18\x18 \x01(\x02:\x01\x30\x12\x19\n\x0emult_withmean0\x18\x19 \x01(\x02:\x01\x31\x12\x19\n\x0emult_withmean1\x18\x1a \x01(\x02:\x01\x31\x12\x19\n\x0emult_withmean2\x18\x1b \x01(\x02:\x01\x31\x12\x14\n\tlmult_pow\x18\x1c \x01(\x02:\x01\x31\x12\x14\n\tlmult_add\x18\x1d \x01(\x02:\x01\x30\x12\x15\n\nlmult_mult\x18\x1e \x01(\x02:\x01\x31\x12\x14\n\tcol_angle\x18\x1f \x01(\x02:\x01\x30\x12\x15\n\nfog_amount\x18& \x01(\x02:\x01\x30\x12\x13\n\x08\x66og_size\x18\' \x01(\x02:\x01\x30\x12\x1c\n\x11motion_blur_angle\x18( \x01(\x02:\x01\x30\x12\x1b\n\x10motion_blur_size\x18) \x01(\x02:\x01\x30\x12\x17\n\x0cshadow_angle\x18* \x01(\x02:\x01\x30\x12\x1a\n\x0fshadow_distance\x18+ 
\x01(\x02:\x01\x30\x12\x1a\n\x0fshadow_strength\x18, \x01(\x02:\x01\x30\x12\x10\n\x05noise\x18- \x01(\x02:\x01\x30\"\xcc\x10\n\x15\x41ugmentationParameter\x12\x15\n\ncrop_width\x18! \x01(\r:\x01\x30\x12\x16\n\x0b\x63rop_height\x18\" \x01(\r:\x01\x30\x12\x19\n\x0fwrite_augmented\x18\x02 \x01(\t:\x00\x12\x1b\n\x0emax_multiplier\x18\x03 \x01(\x02:\x03\x32\x35\x35\x12\"\n\x13\x61ugment_during_test\x18\x04 \x01(\x08:\x05\x66\x61lse\x12\x19\n\x0erecompute_mean\x18\x05 \x01(\r:\x01\x30\x12\x14\n\nwrite_mean\x18\x06 \x01(\t:\x00\x12\x1c\n\x0emean_per_pixel\x18\x07 \x01(\x08:\x04true\x12\x0c\n\x04mean\x18\x12 \x03(\x02\x12\x11\n\x04mode\x18\x08 \x01(\t:\x03\x61\x64\x64\x12\x16\n\x0b\x62ottomwidth\x18P \x01(\r:\x01\x30\x12\x17\n\x0c\x62ottomheight\x18Q \x01(\r:\x01\x30\x12\x0e\n\x03num\x18R \x01(\r:\x01\x30\x12\x18\n\x10\x63hromatic_eigvec\x18S \x03(\x02\x12\x32\n\x06mirror\x18\n \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x35\n\ttranslate\x18\x0b \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x32\n\x06rotate\x18\x0c \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x30\n\x04zoom\x18\r \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07squeeze\x18\x0e \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x37\n\x0btranslate_x\x18\x0f \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x37\n\x0btranslate_y\x18\x10 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x31\n\x05gamma\x18# \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x36\n\nbrightness\x18$ \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08\x63ontrast\x18% \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x31\n\x05\x63olor\x18& \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x35\n\tlmult_pow\x18\x14 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x36\n\nlmult_mult\x18\x15 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x35\n\tlmult_add\x18\x16 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07sat_pow\x18\x17 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08sat_mult\x18\x18 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07sat_add\x18\x19 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07\x63ol_pow\x18\x1a \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08\x63ol_mult\x18\x1b \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07\x63ol_add\x18\x1c \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08ladd_pow\x18\x1d \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x35\n\tladd_mult\x18\x1e \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08ladd_add\x18\x1f \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x36\n\ncol_rotate\x18 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x36\n\nfog_amount\x18\x64 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08\x66og_size\x18\x65 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12=\n\x11motion_blur_angle\x18\x66 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12<\n\x10motion_blur_size\x18g \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x38\n\x0cshadow_angle\x18h \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12;\n\x0fshadow_distance\x18i \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12;\n\x0fshadow_strength\x18j \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x31\n\x05noise\x18k \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\"\x85\x01\n\x11\x46lowWarpParameter\x12\x43\n\nfill_value\x18\x01 
\x01(\x0e\x32).mo_caffe.FlowWarpParameter.FillParameter:\x04ZERO\"+\n\rFillParameter\x12\x08\n\x04ZERO\x10\x01\x12\x10\n\x0cNOT_A_NUMBER\x10\x02\"\xb6\x02\n\x14\x43orrelationParameter\x12\x0e\n\x03pad\x18\x02 \x01(\r:\x01\x30\x12\x13\n\x0bkernel_size\x18\x03 \x01(\r\x12\x18\n\x10max_displacement\x18\x04 \x01(\r\x12\x13\n\x08stride_1\x18\x05 \x01(\r:\x01\x31\x12\x13\n\x08stride_2\x18\x06 \x01(\r:\x01\x31\x12\x1b\n\x10single_direction\x18\x08 \x01(\x05:\x01\x30\x12\x15\n\x06\x64o_abs\x18\x07 \x01(\x08:\x05\x66\x61lse\x12R\n\x10\x63orrelation_type\x18\x0f \x01(\x0e\x32..mo_caffe.CorrelationParameter.CorrelationType:\x08MULTIPLY\"-\n\x0f\x43orrelationType\x12\x0c\n\x08MULTIPLY\x10\x00\x12\x0c\n\x08SUBTRACT\x10\x01\"\xdc\x01\n\x11ResampleParameter\x12\x17\n\tantialias\x18\x04 \x01(\x08:\x04true\x12\r\n\x05width\x18\x01 \x01(\r\x12\x0e\n\x06height\x18\x02 \x01(\r\x12>\n\x04type\x18\x03 \x01(\x0e\x32(.mo_caffe.ResampleParameter.ResampleType:\x06LINEAR\x12\x11\n\x06\x66\x61\x63tor\x18\x05 \x01(\x02:\x01\x31\"<\n\x0cResampleType\x12\x0b\n\x07NEAREST\x10\x01\x12\n\n\x06LINEAR\x10\x02\x12\t\n\x05\x43UBIC\x10\x03\x12\x08\n\x04\x41REA\x10\x04\"z\n\x0e\x41\x63\x63umParameter\x12\x15\n\ntop_height\x18\x01 \x01(\r:\x01\x30\x12\x14\n\ttop_width\x18\x02 \x01(\r:\x01\x30\x12\x1c\n\x11size_divisible_by\x18\x03 \x01(\r:\x01\x30\x12\x1d\n\x0ehave_reference\x18\x04 \x01(\x08:\x05\x66\x61lse\"(\n\x17ShuffleChannelParameter\x12\r\n\x05group\x18\x01 \x02(\r*\x1c\n\x05Phase\x12\t\n\x05TRAIN\x10\x00\x12\x08\n\x04TEST\x10\x01') ) _sym_db.RegisterFileDescriptor(DESCRIPTOR) @@ -40,8 +40,8 @@ _PHASE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=26741, - serialized_end=26769, + serialized_start=26773, + serialized_end=26801, ) _sym_db.RegisterEnumDescriptor(_PHASE) @@ -369,8 +369,8 @@ _ELTWISEPARAMETER_ELTWISEOP = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=13284, - serialized_end=13323, + serialized_start=13316, + serialized_end=13355, ) _sym_db.RegisterEnumDescriptor(_ELTWISEPARAMETER_ELTWISEOP) @@ -391,8 +391,8 @@ _HINGELOSSPARAMETER_NORM = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=13867, - serialized_end=13889, + serialized_start=13899, + serialized_end=13921, ) _sym_db.RegisterEnumDescriptor(_HINGELOSSPARAMETER_NORM) @@ -413,8 +413,8 @@ _LRNPARAMETER_NORMREGION = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=14771, - serialized_end=14824, + serialized_start=14803, + serialized_end=14856, ) _sym_db.RegisterEnumDescriptor(_LRNPARAMETER_NORMREGION) @@ -465,8 +465,8 @@ _POOLINGPARAMETER_POOLMETHOD = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=15515, - serialized_end=15561, + serialized_start=15547, + serialized_end=15593, ) _sym_db.RegisterEnumDescriptor(_POOLINGPARAMETER_POOLMETHOD) @@ -517,8 +517,8 @@ _PRIORBOXPARAMETER_CODETYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=15965, - serialized_end=16021, + serialized_start=15997, + serialized_end=16053, ) _sym_db.RegisterEnumDescriptor(_PRIORBOXPARAMETER_CODETYPE) @@ -547,8 +547,8 @@ _REDUCTIONPARAMETER_REDUCTIONOP = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=16541, - serialized_end=16594, + serialized_start=16573, + serialized_end=16626, ) _sym_db.RegisterEnumDescriptor(_REDUCTIONPARAMETER_REDUCTIONOP) @@ -677,8 +677,8 @@ _SPPPARAMETER_POOLMETHOD = _descriptor.EnumDescriptor( ], 
containing_type=None, options=None, - serialized_start=15515, - serialized_end=15561, + serialized_start=15547, + serialized_end=15593, ) _sym_db.RegisterEnumDescriptor(_SPPPARAMETER_POOLMETHOD) @@ -877,8 +877,8 @@ _V1LAYERPARAMETER_LAYERTYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=20520, - serialized_end=21120, + serialized_start=20552, + serialized_end=21152, ) _sym_db.RegisterEnumDescriptor(_V1LAYERPARAMETER_LAYERTYPE) @@ -925,8 +925,8 @@ _V0LAYERPARAMETER_POOLMETHOD = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=15515, - serialized_end=15561, + serialized_start=15547, + serialized_end=15593, ) _sym_db.RegisterEnumDescriptor(_V0LAYERPARAMETER_POOLMETHOD) @@ -947,8 +947,8 @@ _FLOWWARPPARAMETER_FILLPARAMETER = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=25994, - serialized_end=26037, + serialized_start=26026, + serialized_end=26069, ) _sym_db.RegisterEnumDescriptor(_FLOWWARPPARAMETER_FILLPARAMETER) @@ -969,8 +969,8 @@ _CORRELATIONPARAMETER_CORRELATIONTYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=26305, - serialized_end=26350, + serialized_start=26337, + serialized_end=26382, ) _sym_db.RegisterEnumDescriptor(_CORRELATIONPARAMETER_CORRELATIONTYPE) @@ -999,8 +999,8 @@ _RESAMPLEPARAMETER_RESAMPLETYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=26513, - serialized_end=26573, + serialized_start=26545, + serialized_end=26605, ) _sym_db.RegisterEnumDescriptor(_RESAMPLEPARAMETER_RESAMPLETYPE) @@ -4492,6 +4492,13 @@ _DETECTIONOUTPUTPARAMETER = _descriptor.Descriptor( message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), + _descriptor.FieldDescriptor( + name='objectness_score', full_name='mo_caffe.DetectionOutputParameter.objectness_score', index=15, + number=16, type=2, cpp_type=6, label=1, + has_default_value=True, default_value=0.01, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), ], extensions=[ ], @@ -4504,7 +4511,7 @@ _DETECTIONOUTPUTPARAMETER = _descriptor.Descriptor( oneofs=[ ], serialized_start=12394, - serialized_end=12935, + serialized_end=12967, ) @@ -4533,8 +4540,8 @@ _DROPOUTPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=12937, - serialized_end=12983, + serialized_start=12969, + serialized_end=13015, ) @@ -4598,8 +4605,8 @@ _DUMMYDATAPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=12986, - serialized_end=13152, + serialized_start=13018, + serialized_end=13184, ) @@ -4643,8 +4650,8 @@ _ELTWISEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=13155, - serialized_end=13323, + serialized_start=13187, + serialized_end=13355, ) @@ -4673,8 +4680,8 @@ _ELUPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=13325, - serialized_end=13357, + serialized_start=13357, + serialized_end=13389, ) @@ -4731,8 +4738,8 @@ _EMBEDPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=13360, - serialized_end=13538, + serialized_start=13392, + serialized_end=13570, ) @@ -4775,8 +4782,8 @@ _EXPPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=13540, - serialized_end=13608, + serialized_start=13572, + serialized_end=13640, ) @@ -4812,8 +4819,8 
@@ _FLATTENPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=13610, - serialized_end=13667, + serialized_start=13642, + serialized_end=13699, ) @@ -4856,8 +4863,8 @@ _HDF5DATAPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=13669, - serialized_end=13748, + serialized_start=13701, + serialized_end=13780, ) @@ -4886,8 +4893,8 @@ _HDF5OUTPUTPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=13750, - serialized_end=13790, + serialized_start=13782, + serialized_end=13822, ) @@ -4917,8 +4924,8 @@ _HINGELOSSPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=13792, - serialized_end=13889, + serialized_start=13824, + serialized_end=13921, ) @@ -5024,8 +5031,8 @@ _IMAGEDATAPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=13892, - serialized_end=14171, + serialized_start=13924, + serialized_end=14203, ) @@ -5054,8 +5061,8 @@ _INFOGAINLOSSPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=14173, - serialized_end=14212, + serialized_start=14205, + serialized_end=14244, ) @@ -5119,8 +5126,8 @@ _INNERPRODUCTPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=14215, - serialized_end=14424, + serialized_start=14247, + serialized_end=14456, ) @@ -5149,8 +5156,8 @@ _INPUTPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=14426, - serialized_end=14478, + serialized_start=14458, + serialized_end=14510, ) @@ -5193,8 +5200,8 @@ _LOGPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=14480, - serialized_end=14548, + serialized_start=14512, + serialized_end=14580, ) @@ -5260,8 +5267,8 @@ _LRNPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=14551, - serialized_end=14869, + serialized_start=14583, + serialized_end=14901, ) @@ -5290,8 +5297,8 @@ _GRNPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=14871, - serialized_end=14902, + serialized_start=14903, + serialized_end=14934, ) @@ -5341,8 +5348,8 @@ _MEMORYDATAPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=14904, - serialized_end=14994, + serialized_start=14936, + serialized_end=15026, ) @@ -5385,8 +5392,8 @@ _MVNPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=14996, - serialized_end=15096, + serialized_start=15028, + serialized_end=15128, ) @@ -5415,8 +5422,8 @@ _PARAMETERPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=15098, - serialized_end=15154, + serialized_start=15130, + serialized_end=15186, ) @@ -5531,8 +5538,8 @@ _POOLINGPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=15157, - serialized_end=15606, + serialized_start=15189, + serialized_end=15638, ) @@ -5575,8 +5582,8 @@ _POWERPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=15608, - serialized_end=15678, + serialized_start=15640, + serialized_end=15710, ) @@ -5704,8 +5711,8 @@ _PRIORBOXPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=15681, - serialized_end=16021, + serialized_start=15713, + serialized_end=16053, ) @@ -5748,8 +5755,8 @@ _PSROIPOOLINGPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=16023, - 
serialized_end=16109, + serialized_start=16055, + serialized_end=16141, ) @@ -5799,8 +5806,8 @@ _PYTHONPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=16111, - serialized_end=16214, + serialized_start=16143, + serialized_end=16246, ) @@ -5857,8 +5864,8 @@ _RECURRENTPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=16217, - serialized_end=16415, + serialized_start=16249, + serialized_end=16447, ) @@ -5902,8 +5909,8 @@ _REDUCTIONPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=16418, - serialized_end=16594, + serialized_start=16450, + serialized_end=16626, ) @@ -5940,8 +5947,8 @@ _RELUPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=16597, - serialized_end=16741, + serialized_start=16629, + serialized_end=16773, ) @@ -5970,8 +5977,8 @@ _RELU6PARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=16743, - serialized_end=16773, + serialized_start=16775, + serialized_end=16805, ) @@ -6014,8 +6021,8 @@ _RESHAPEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=16775, - serialized_end=16868, + serialized_start=16807, + serialized_end=16900, ) @@ -6044,8 +6051,8 @@ _REVERSEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=16870, - serialized_end=16905, + serialized_start=16902, + serialized_end=16937, ) @@ -6088,8 +6095,8 @@ _ROIPOOLINGPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=16907, - serialized_end=16996, + serialized_start=16939, + serialized_end=17028, ) @@ -6132,8 +6139,8 @@ _ROIWARPINGTESTPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=16998, - serialized_end=17091, + serialized_start=17030, + serialized_end=17123, ) @@ -6176,8 +6183,8 @@ _ROIWARPINGPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=17093, - serialized_end=17182, + serialized_start=17125, + serialized_end=17214, ) @@ -6234,8 +6241,8 @@ _SCALEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=17185, - serialized_end=17356, + serialized_start=17217, + serialized_end=17388, ) @@ -6265,8 +6272,8 @@ _SIGMOIDPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=17358, - serialized_end=17481, + serialized_start=17390, + serialized_end=17513, ) @@ -6309,8 +6316,8 @@ _SLICEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=17483, - serialized_end=17559, + serialized_start=17515, + serialized_end=17591, ) @@ -6339,8 +6346,8 @@ _SMOOTHL1LOSSPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=17561, - serialized_end=17602, + serialized_start=17593, + serialized_end=17634, ) @@ -6377,8 +6384,8 @@ _SOFTMAXPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=17605, - serialized_end=17745, + serialized_start=17637, + serialized_end=17777, ) @@ -6408,8 +6415,8 @@ _TANHPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=17747, - serialized_end=17864, + serialized_start=17779, + serialized_end=17896, ) @@ -6445,8 +6452,8 @@ _TILEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=17866, - serialized_end=17913, + serialized_start=17898, + serialized_end=17945, ) @@ -6475,8 +6482,8 @@ _THRESHOLDPARAMETER = 
_descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=17915, - serialized_end=17957, + serialized_start=17947, + serialized_end=17989, ) @@ -6589,8 +6596,8 @@ _WINDOWDATAPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=17960, - serialized_end=18281, + serialized_start=17992, + serialized_end=18313, ) @@ -6635,8 +6642,8 @@ _SPPPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=18284, - serialized_end=18525, + serialized_start=18316, + serialized_end=18557, ) @@ -6961,8 +6968,8 @@ _V1LAYERPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=18528, - serialized_end=21164, + serialized_start=18560, + serialized_end=21196, ) @@ -7251,8 +7258,8 @@ _V0LAYERPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=21167, - serialized_end=22203, + serialized_start=21199, + serialized_end=22235, ) @@ -7288,8 +7295,8 @@ _PRELUPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=22205, - serialized_end=22295, + serialized_start=22237, + serialized_end=22327, ) @@ -7353,8 +7360,8 @@ _REGIONYOLOPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=22298, - serialized_end=22432, + serialized_start=22330, + serialized_end=22464, ) @@ -7383,8 +7390,8 @@ _REORGYOLOPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=22434, - serialized_end=22473, + serialized_start=22466, + serialized_end=22505, ) @@ -7462,8 +7469,8 @@ _RANDOMGENERATORPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=22476, - serialized_end=22683, + serialized_start=22508, + serialized_end=22715, ) @@ -7506,8 +7513,8 @@ _COEFFSCHEDULEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=22685, - serialized_end=22781, + serialized_start=22717, + serialized_end=22813, ) @@ -7823,8 +7830,8 @@ _AUGMENTATIONCOEFF = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=22784, - serialized_end=23774, + serialized_start=22816, + serialized_end=23806, ) @@ -8168,8 +8175,8 @@ _AUGMENTATIONPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=23777, - serialized_end=25901, + serialized_start=23809, + serialized_end=25933, ) @@ -8199,8 +8206,8 @@ _FLOWWARPPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=25904, - serialized_end=26037, + serialized_start=25936, + serialized_end=26069, ) @@ -8279,8 +8286,8 @@ _CORRELATIONPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=26040, - serialized_end=26350, + serialized_start=26072, + serialized_end=26382, ) @@ -8338,8 +8345,8 @@ _RESAMPLEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=26353, - serialized_end=26573, + serialized_start=26385, + serialized_end=26605, ) @@ -8389,8 +8396,8 @@ _ACCUMPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=26575, - serialized_end=26697, + serialized_start=26607, + serialized_end=26729, ) @@ -8419,8 +8426,8 @@ _SHUFFLECHANNELPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=26699, - serialized_end=26739, + serialized_start=26731, + serialized_end=26771, ) _BLOBPROTO.fields_by_name['shape'].message_type = _BLOBSHAPE diff --git a/model-optimizer/mo/front/caffe/proto/mo_caffe.proto 
b/model-optimizer/mo/front/caffe/proto/mo_caffe.proto index 82f83a5..d25ec87 100644 --- a/model-optimizer/mo/front/caffe/proto/mo_caffe.proto +++ b/model-optimizer/mo/front/caffe/proto/mo_caffe.proto @@ -1031,6 +1031,8 @@ message DetectionOutputParameter { optional int32 input_height = 14 [default = -1]; // If false, bboxes need to be normalized optional bool normalized = 15 [default = true]; + // The objectness score is used by the anchor refinement module to filter out easy negative anchors. + optional float objectness_score = 16 [default = 0.01]; } message DropoutParameter { diff --git a/model-optimizer/mo/front/caffe/python_layer_extractor.py b/model-optimizer/mo/front/caffe/python_layer_extractor.py index 0908e69..0a693a9 100644 --- a/model-optimizer/mo/front/caffe/python_layer_extractor.py +++ b/model-optimizer/mo/front/caffe/python_layer_extractor.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/python_layer_extractor_test.py b/model-optimizer/mo/front/caffe/python_layer_extractor_test.py index 35f6760..bdb0a8c 100644 --- a/model-optimizer/mo/front/caffe/python_layer_extractor_test.py +++ b/model-optimizer/mo/front/caffe/python_layer_extractor_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/caffe/register_custom_ops.py b/model-optimizer/mo/front/caffe/register_custom_ops.py index fb8ea57..1a89012 100644 --- a/model-optimizer/mo/front/caffe/register_custom_ops.py +++ b/model-optimizer/mo/front/caffe/register_custom_ops.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,15 +14,11 @@ limitations under the License. """ -from mo.back.replacement import BackReplacementPattern from mo.front.common.replacement import FrontReplacementOp, FrontReplacementPattern, FrontReplacementSubgraph from mo.front.extractor import FrontExtractorOp, CaffePythonFrontExtractorOp -from mo.middle.replacement import MiddleReplacementPattern -from mo.ops.op import Op -from mo.utils import class_registration -def update_registration(): - class_registration.update_registration([Op, FrontExtractorOp, CaffePythonFrontExtractorOp, FrontReplacementOp, - FrontReplacementPattern, FrontReplacementSubgraph, MiddleReplacementPattern, - BackReplacementPattern]) +def get_front_classes(): + front_classes = [FrontExtractorOp, CaffePythonFrontExtractorOp, FrontReplacementOp, + FrontReplacementPattern, FrontReplacementSubgraph] + return front_classes diff --git a/model-optimizer/mo/front/common/custom_replacement_registry.py b/model-optimizer/mo/front/common/custom_replacement_registry.py index cc1dd0e..87410d8 100644 --- a/model-optimizer/mo/front/common/custom_replacement_registry.py +++ b/model-optimizer/mo/front/common/custom_replacement_registry.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
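A minimal sketch of the new `DetectionOutputParameter.objectness_score` field added in the mo_caffe.proto hunk above, exercised through the regenerated `mo_caffe_pb2` bindings. It assumes the `model-optimizer` directory is on `PYTHONPATH`; the threshold value 0.05 is illustrative only:

```python
# Sketch, assuming the model-optimizer directory is importable: the new
# objectness_score field on DetectionOutputParameter, regenerated above.
from mo.front.caffe.proto import mo_caffe_pb2

param = mo_caffe_pb2.DetectionOutputParameter()
print(param.objectness_score)   # default declared in the .proto: 0.01
param.objectness_score = 0.05   # a stricter threshold drops more easy negative anchors
```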
diff --git a/model-optimizer/mo/front/common/extractors/utils.py b/model-optimizer/mo/front/common/extractors/utils.py index e82f89e..e4d0dcd 100644 --- a/model-optimizer/mo/front/common/extractors/utils.py +++ b/model-optimizer/mo/front/common/extractors/utils.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/find_unsupported_ops.py b/model-optimizer/mo/front/common/find_unsupported_ops.py index 8706706..8b632c2 100644 --- a/model-optimizer/mo/front/common/find_unsupported_ops.py +++ b/model-optimizer/mo/front/common/find_unsupported_ops.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,15 +16,12 @@ import logging as log -import networkx as nx import numpy as np -from mo.graph.graph import Node -from mo.utils.dsu import DSU, DSUElem -from mo.utils.graph import bfs_search +from mo.graph.graph import Node, Graph -def find_unsupported_ops(graph: nx.MultiDiGraph): +def find_unsupported_ops(graph: Graph): """ The function returns a list of node names that are not supported. Currently, nodes that produce non-FP32 data tensors or have an undefined 'type' attribute are considered unsupported. @@ -36,57 +33,13 @@ def find_unsupported_ops(graph: nx.MultiDiGraph): node = Node(graph, node_name) # op nodes that produce non-FP32 data or have no type are considered unsupported if node.kind == 'op': - if not node.has_valid('type'): - log.info('Node "{}" does not have type. Consider it unsupported'.format(node_name)) - unsupported.append(node.id) - else: + if node.has_valid('type') or (node.has_valid('op') and node.op == 'OpOutput'): for out_data_node in node.out_nodes().values(): if out_data_node.has_valid('data_type') and out_data_node.data_type != np.float32: log.info('Node "{}" produces output as non FP32. Consider it unsupported'.format(node_name)) unsupported.append(node.id) + else: + log.info('Node "{}" does not have type.
Consider it unsupported'.format(node_name)) + unsupported.append(node.id) return unsupported - -def find_unsupported_ops_subgraphs(graph: nx.MultiDiGraph, unsupported_nodes: list, - find_constant_input_fn: callable = lambda node: node): - bfs_nodes = bfs_search(graph, list()) - visited = set() - # mark initial set of nodes as not supported - for node_name in unsupported_nodes: - graph.node[node_name]['supported'] = False - - for node_name in bfs_nodes: - if node_name in visited: - continue - - node = Node(graph, node_name) - if node.has_valid('supported') and not node['supported']: - added_nodes = find_constant_input_fn(node) - visited.update(added_nodes) - for node in added_nodes: - node['supported'] = False - - dsu_elems = list() - for node_name in bfs_nodes: - node = Node(graph, node_name) - if node.has_valid('supported') and not node['supported']: - dsu_elems.append(DSUElem(node_name)) - - dsu = DSU(dsu_elems) - - # merge adjacent unsupported nodes - for dsu_elem in dsu_elems: - node = Node(graph, dsu_elem.name) - if not node['supported']: - for out_node in node.out_nodes().values(): - if out_node.has_valid('supported') and not out_node['supported']: - dsu.union(dsu_elem, dsu.find_elem(out_node.id)) - - subgraph_id = dict() # key is the name of the node, value is the set of nodes that belong to this subgraph - for dsu_elem in dsu.map.values(): - parent = dsu.find_parent(dsu_elem).name - if parent not in subgraph_id.keys(): - subgraph_id[parent] = set() - subgraph_id[parent].add(dsu_elem.name) - - return [list(s) for s in subgraph_id.values()] diff --git a/model-optimizer/mo/front/common/layout.py b/model-optimizer/mo/front/common/layout.py index 6da7861..b95677d 100644 --- a/model-optimizer/mo/front/common/layout.py +++ b/model-optimizer/mo/front/common/layout.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/layout_test.py b/model-optimizer/mo/front/common/layout_test.py index e3865e4..1f0a288 100644 --- a/model-optimizer/mo/front/common/layout_test.py +++ b/model-optimizer/mo/front/common/layout_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/partial_infer/batch_norm.py b/model-optimizer/mo/front/common/partial_infer/batch_norm.py index e20e961..6b68628 100644 --- a/model-optimizer/mo/front/common/partial_infer/batch_norm.py +++ b/model-optimizer/mo/front/common/partial_infer/batch_norm.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
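A hypothetical smoke test for the refactored `find_unsupported_ops` above. It assumes the model-optimizer package and its unit-test helper `build_graph` are importable; the node names and attribute sets are illustrative, and the exact attributes `build_graph` requires may differ:

```python
import numpy as np

from mo.front.common.find_unsupported_ops import find_unsupported_ops
from mo.utils.unittest.graph import build_graph

# 'op_1' carries no 'type' attribute, so it should be reported as unsupported;
# 'op_2' is typed and feeds an FP32 data node, so it should pass the check.
nodes = {
    'op_1': {'kind': 'op'},
    'op_2': {'kind': 'op', 'type': 'Identity'},
    'data_2': {'kind': 'data', 'data_type': np.float32},
}
graph = build_graph(nodes, [('op_2', 'data_2')])
print(find_unsupported_ops(graph))  # expected under these assumptions: ['op_1']
```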
diff --git a/model-optimizer/mo/front/common/partial_infer/caffe_fallback.py b/model-optimizer/mo/front/common/partial_infer/caffe_fallback.py index b8bcdce..d750af9 100644 --- a/model-optimizer/mo/front/common/partial_infer/caffe_fallback.py +++ b/model-optimizer/mo/front/common/partial_infer/caffe_fallback.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,18 +20,18 @@ import os import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.utils.error import Error from mo.utils.find_inputs import find_inputs from mo.utils.utils import refer_to_faq_msg -def get_node_top(graph: nx.MultiDiGraph, name: str): +def get_node_top(graph: Graph, name: str): node = Node(graph, name) return node.out_edge()['name'] if node else None -def build_net(graph: nx.DiGraph): +def build_net(graph: Graph): try: if not hasattr(os.environ, 'GLOG_minloglevel'): os.environ['GLOG_minloglevel'] = '2' @@ -80,7 +80,7 @@ def build_net(graph: nx.DiGraph): graph.__setattr__('caffe_net', net) -def get_net(graph: nx.DiGraph): +def get_net(graph: Graph): if not graph: return None @@ -101,6 +101,9 @@ def caffe_native_node_infer(node: Node): node node to infer the shape for """ + log.error("Caffe fallback is deprecated. It will be removed in future releases. Please use extensions for unsupported layers.\n" + + "See more information in the \"Custom Layers in the Model Optimizer\" chapter of the Model Optimizer Developer Guide", + extra={'is_warning': True}) log.info('Called "caffe_native_node_infer" for node "{}"'.format(node.id)) graph = node.graph diff --git a/model-optimizer/mo/front/common/partial_infer/caffe_fallback_test.py b/model-optimizer/mo/front/common/partial_infer/caffe_fallback_test.py index 1d03857..c2bfc0c 100644 --- a/model-optimizer/mo/front/common/partial_infer/caffe_fallback_test.py +++ b/model-optimizer/mo/front/common/partial_infer/caffe_fallback_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -52,7 +52,8 @@ class TestCaffeNativePartialInfer(unittest.TestCase): 'node_1': {'type': 'Input', 'kind': 'op'}, 'node_2': {'type': 'Input', 'kind': 'op'}, 'node_3': {'type': 'Identity', 'kind': 'op'}, - 'node_4': {'type': 'Identity', 'kind': 'op'} + 'node_4': {'type': 'Identity', 'kind': 'op'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} } def test_build_net_equal_inputs(self): @@ -66,10 +67,11 @@ class TestCaffeNativePartialInfer(unittest.TestCase): [ ('node_1', 'node_3'), ('node_2', 'node_3'), - ('node_3', 'node_4') + ('node_3', 'node_4'), + ('node_4', 'op_output') ], { - 'node_4': {'is_output': True, 'shape': None}, + 'node_4': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'node_2': {'shape': np.array([1, 3, 224, 224])}, 'node_3': {'top': 'top_node'} @@ -94,9 +96,10 @@ class TestCaffeNativePartialInfer(unittest.TestCase): graph = build_graph(self.nodes_attributes, [ ('node_1', 'node_3'), - ('node_3', 'node_4') + ('node_3', 'node_4'), + ('node_4', 'op_output') ], - {'node_4': {'is_output': True, 'shape': None}, + {'node_4': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'node_3': {'top': 'top_node'} }, diff --git a/model-optimizer/mo/front/common/partial_infer/concat.py b/model-optimizer/mo/front/common/partial_infer/concat.py index f041c28..372a124 100644 --- a/model-optimizer/mo/front/common/partial_infer/concat.py +++ b/model-optimizer/mo/front/common/partial_infer/concat.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -46,8 +46,8 @@ def concat_infer(node): node.axis = axis mask = np.zeros_like(shape, dtype=np.bool) - mask[axis] = True - not_mask = np.logical_not(mask) + mask[axis] = True # pylint: disable=unsupported-assignment-operation + not_mask = np.logical_not(mask) # pylint: disable=assignment-from-no-return for s in shapes[1:]: if np.all(shape[not_mask] == s[not_mask]): # TODO handle -1 in a special way shape[mask] += s[mask] diff --git a/model-optimizer/mo/front/common/partial_infer/concat_test.py b/model-optimizer/mo/front/common/partial_infer/concat_test.py index 07b53a1..8644983 100644 --- a/model-optimizer/mo/front/common/partial_infer/concat_test.py +++ b/model-optimizer/mo/front/common/partial_infer/concat_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -25,7 +25,8 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'kind': 'data', 'value': None}, 'node_2': {'kind': 'data', 'value': None}, 'concat': {'type': 'Concat', 'kind': 'op'}, - 'node_3': {'kind': 'data'} + 'node_3': {'kind': 'data'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'}, } @@ -34,8 +35,10 @@ class TestConcatPartialInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'concat'), ('node_2', 'concat'), - ('concat', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('concat', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'node_2': {'shape': np.array([1, 3, 227, 227])}, 'concat': {'axis': 2} @@ -52,8 +55,10 @@ class TestConcatPartialInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'concat'), ('node_2', 'concat'), - ('concat', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('concat', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'node_2': {'shape': np.array([1, 3, 227, 227])}, 'concat': {'axis': -1} @@ -70,8 +75,10 @@ class TestConcatPartialInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'concat'), ('node_2', 'concat'), - ('concat', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('concat', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'node_2': {'shape': np.array([1, 2, 227, 227])}, 'concat': {'axis': 2} @@ -86,8 +93,10 @@ class TestConcatPartialInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'concat'), ('node_2', 'concat'), - ('concat', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('concat', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227])}, 'node_2': {'shape': None}, 'concat': {'axis': 2} diff --git a/model-optimizer/mo/front/common/partial_infer/const.py b/model-optimizer/mo/front/common/partial_infer/const.py index 0ceb880..ebc3f6e 100644 --- a/model-optimizer/mo/front/common/partial_infer/const.py +++ b/model-optimizer/mo/front/common/partial_infer/const.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/partial_infer/crop.py b/model-optimizer/mo/front/common/partial_infer/crop.py index e097ec9..5c11617 100644 --- a/model-optimizer/mo/front/common/partial_infer/crop.py +++ b/model-optimizer/mo/front/common/partial_infer/crop.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
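The pylint-annotated mask arithmetic in the `concat_infer` hunk above can be illustrated standalone: all non-axis dimensions must match, and the sizes along the concat axis are summed. A self-contained sketch with made-up shapes (modern NumPy spells the dtype plain `bool`, since `np.bool` is deprecated):

```python
import numpy as np

# Shapes of two inputs to a concat along axis 1 (illustrative values).
shape = np.array([1, 3, 227, 227])
other = np.array([1, 5, 227, 227])
axis = 1

mask = np.zeros_like(shape, dtype=bool)  # True only at the concat axis
mask[axis] = True
not_mask = np.logical_not(mask)

assert np.all(shape[not_mask] == other[not_mask])  # non-axis dims must agree
shape[mask] += other[mask]                         # accumulate along the axis
print(shape)  # -> [  1   8 227 227]
```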
diff --git a/model-optimizer/mo/front/common/partial_infer/crop_test.py b/model-optimizer/mo/front/common/partial_infer/crop_test.py index d1eb97b..d070592 100644 --- a/model-optimizer/mo/front/common/partial_infer/crop_test.py +++ b/model-optimizer/mo/front/common/partial_infer/crop_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,7 +25,8 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'value': None, 'kind': 'data'}, 'node_2': {'value': None, 'kind': 'data'}, 'crop_1': {'type': 'Crop', 'kind': 'op'}, - 'node_3': {'value': None, 'kind': 'data'} + 'node_3': {'value': None, 'kind': 'data'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} } @@ -34,8 +35,10 @@ class TestCropInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'crop_1'), ('node_2', 'crop_1'), - ('crop_1', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('crop_1', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 2, 500, 500])}, 'node_2': {'shape': np.array([1, 2, 256, 256])}, 'crop_1': {'axis': 2, 'offset': [0, 0], 'dim': None} @@ -57,8 +60,10 @@ class TestCropInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'crop_1'), ('node_2', 'crop_1'), - ('crop_1', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('crop_1', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 2, 500, 500])}, 'node_2': {'shape': np.array([1, 2, 256, 256])}, 'crop_1': {'axis': -1, 'offset': [0, 0], 'dim': None} @@ -80,8 +85,10 @@ class TestCropInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'crop_1'), ('node_2', 'crop_1'), - ('crop_1', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('crop_1', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 2, 500, 500])}, 'node_2': {'shape': None}, 'crop_1': {'axis': 2, 'offset': [0, 0], 'dim': None} @@ -95,8 +102,10 @@ class TestCropInfer(unittest.TestCase): def test_crop_infer_one_shape(self): graph = build_graph(nodes_attributes, [('node_1', 'crop_1'), - ('crop_1', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('crop_1', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 2, 500, 500])}, 'crop_1': {'axis': 2, 'offset': [0], 'dim': None} }) @@ -110,8 +119,10 @@ class TestCropInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'crop_1'), ('node_2', 'crop_1'), - ('crop_1', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('crop_1', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 2, 500, 500])}, 'node_2': {'shape': np.array([1, 2, 256, 256])}, 'crop_1': {'axis': 2, 'offset': [300], 'dim': None} diff --git a/model-optimizer/mo/front/common/partial_infer/elemental.py b/model-optimizer/mo/front/common/partial_infer/elemental.py index 99adb1f..c33a356 100644 --- a/model-optimizer/mo/front/common/partial_infer/elemental.py +++ b/model-optimizer/mo/front/common/partial_infer/elemental.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may 
not use this file except in compliance with the License. @@ -19,14 +19,19 @@ def single_output_infer(node, shape_infer, value_infer=None): node.out_node(0).shape = shape_infer(node) if value_infer is not None and \ - 'value' in node.in_node() and \ - node.in_node().value is not None: + 'value' in node.in_node() and \ + node.in_node().value is not None: node.out_node(0).value = value_infer(node) -def copy_shape_infer(node): + +def copy_shape_infer(node, value_infer=None): """ Sets output dimensions of node equal to input ones Args: node: graph node """ - single_output_infer(node, lambda n: n.in_node().shape) + single_output_infer(node, lambda n: n.in_node().shape, value_infer) + + +def copy_value(node): + return None if node.in_node().value is None else node.in_node().value.copy() diff --git a/model-optimizer/mo/front/common/partial_infer/elemental_test.py b/model-optimizer/mo/front/common/partial_infer/elemental_test.py index 78d1dae..a1c6985 100644 --- a/model-optimizer/mo/front/common/partial_infer/elemental_test.py +++ b/model-optimizer/mo/front/common/partial_infer/elemental_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/partial_infer/eltwise.py b/model-optimizer/mo/front/common/partial_infer/eltwise.py index 7d19907..f0d96e1 100644 --- a/model-optimizer/mo/front/common/partial_infer/eltwise.py +++ b/model-optimizer/mo/front/common/partial_infer/eltwise.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,11 +19,11 @@ import logging as log import networkx as nx from mo.front.common.partial_infer.utils import int64_array -from mo.graph.graph import get_sorted_inputs, Node +from mo.graph.graph import Node def eltwise_infer(node, op=None, **kwargs): - raw_inputs = [(inp, attr) for inp, attr in get_sorted_inputs(node) + raw_inputs = [(inp, attr) for inp, attr in node.get_sorted_inputs() if 'control_flow_edge' not in attr or not attr['control_flow_edge']] inputs = [Node(node.graph, inp) for inp, attr in raw_inputs] shapes = [node.graph.node[inp]['shape'] for inp, attr in raw_inputs] diff --git a/model-optimizer/mo/front/common/partial_infer/eltwise_test.py b/model-optimizer/mo/front/common/partial_infer/eltwise_test.py index 5b57bf6..0bd0a24 100644 --- a/model-optimizer/mo/front/common/partial_infer/eltwise_test.py +++ b/model-optimizer/mo/front/common/partial_infer/eltwise_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
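The `elemental.py` hunk threads an optional `value_infer` callback through `copy_shape_infer`, so identity-like ops can fold constant values as well as shapes; `copy_value` is the stock callback. A toy demonstration with hypothetical stand-ins for the mo `Node` API:

```python
import numpy as np

class FakeData:  # hypothetical stand-in for a mo data node
    def __init__(self, shape=None, value=None):
        self.shape, self.value = shape, value

class FakeNode:  # hypothetical stand-in for a mo op node
    def __init__(self, inp, out):
        self._in, self._out = inp, out
    def in_node(self, idx=0):
        return self._in
    def out_node(self, idx=0):
        return self._out

def copy_value(node):  # as added in elemental.py
    return None if node.in_node().value is None else node.in_node().value.copy()

node = FakeNode(FakeData(np.array([2, 3]), np.array([[1, 2, 3], [4, 5, 6]])), FakeData())
# copy_shape_infer(node, value_infer=copy_value) now produces both effects:
node.out_node().shape = node.in_node().shape  # shape propagation (as before)
node.out_node().value = copy_value(node)      # new: constant propagation
print(node.out_node().shape, node.out_node().value, sep='\n')
```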
@@ -25,7 +25,8 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'value': 2, 'kind': 'data'}, 'node_2': {'value': 3, 'kind': 'data'}, 'eltw_1': {'type': 'Eltwise', 'kind': 'op'}, - 'node_3': {'value': None, 'kind': 'data'} + 'node_3': {'value': None, 'kind': 'data'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'}, } @@ -34,8 +35,10 @@ class TestEltwiseInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'eltw_1'), ('node_2', 'eltw_1'), - ('eltw_1', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('eltw_1', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 256, 256])}, 'node_2': {'shape': np.array([1, 3, 256, 256])}, 'eltw_1': {} @@ -59,8 +62,10 @@ class TestEltwiseInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'eltw_1'), ('node_2', 'eltw_1'), - ('eltw_1', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('eltw_1', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 256, 256])}, 'node_2': {'shape': np.array([1, 3, 256, 256])} }) @@ -81,8 +86,10 @@ class TestEltwiseInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'eltw_1'), ('node_2', 'eltw_1'), - ('eltw_1', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('eltw_1', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 256, 256])}, 'node_2': {'shape': np.array([1, 3, 256, 256])} }) @@ -103,8 +110,10 @@ class TestEltwiseInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'eltw_1'), ('node_2', 'eltw_1'), - ('eltw_1', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('eltw_1', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 256, 256]), 'value': None}, 'node_2': {'shape': np.array([1, 3, 256, 256])} }) @@ -124,8 +133,10 @@ class TestEltwiseInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'eltw_1'), ('node_2', 'eltw_1'), - ('eltw_1', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('eltw_1', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 257, 256])}, 'node_2': {'shape': np.array([1, 3, 256, 257])} }) diff --git a/model-optimizer/mo/front/common/partial_infer/expand_dims.py b/model-optimizer/mo/front/common/partial_infer/expand_dims.py index 50ac4f0..dbdebd5 100644 --- a/model-optimizer/mo/front/common/partial_infer/expand_dims.py +++ b/model-optimizer/mo/front/common/partial_infer/expand_dims.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
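The final case added to the eltwise tests ([1, 3, 257, 256] vs [1, 3, 256, 257]) presumably exercises the usual broadcasting rule: two shapes are compatible when every trailing axis pair is equal or contains a 1. A standalone check mirroring that rule (the real logic lives in `eltwise_infer`):

```python
def broadcastable(a, b):
    """True if shapes a and b are numpy-broadcast compatible."""
    return all(x == y or x == 1 or y == 1 for x, y in zip(a[::-1], b[::-1]))

print(broadcastable([1, 3, 256, 256], [1, 3, 256, 256]))  # True
print(broadcastable([1, 3, 257, 256], [1, 3, 256, 257]))  # False -> infer error
```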
@@ -44,6 +44,12 @@ def tf_expand_dims_infer(node): if expand_axis is None: return + # expand_axis is the position where the new axis is placed, + # so for a negative axis expand_dims behaves differently + # from a plain insert operation + if expand_axis < 0: + expand_axis += len(input_node.shape) + 1 + output_node.shape = np.insert(input_node.shape, expand_axis, [1]) # convert data type of the shape to int64 explicitly output_node.shape = output_node.shape.astype(np.int64) diff --git a/model-optimizer/mo/front/common/partial_infer/expand_dims_test.py b/model-optimizer/mo/front/common/partial_infer/expand_dims_test.py index 69dbc44..119c3c2 100644 --- a/model-optimizer/mo/front/common/partial_infer/expand_dims_test.py +++ b/model-optimizer/mo/front/common/partial_infer/expand_dims_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -146,6 +146,40 @@ class TestExpandDimsInfer(unittest.TestCase): for i in range(0, len(exp_shape)): self.assertEqual(exp_shape[i], res_shape[i]) + def test_expand_dims_infer_one_input_3(self): + graph = build_graph(nodes_attributes, + [('input_1', 'expand_dims'), + ('expand_dims', 'out')], + {'input_1': {'shape': np.array([3, 256, 256])}, + 'expand_dims': {'expand_axis': -1} + }) + + expand_dims_node = Node(graph, 'expand_dims') + + tf_expand_dims_infer(expand_dims_node) + exp_shape = np.array([3, 256, 256, 1]) + res_shape = expand_dims_node.out_node().shape + self.assertEqual(len(exp_shape), len(res_shape)) + for i in range(0, len(exp_shape)): + self.assertEqual(exp_shape[i], res_shape[i]) + + def test_expand_dims_infer_one_input_4(self): + graph = build_graph(nodes_attributes, + [('input_1', 'expand_dims'), + ('expand_dims', 'out')], + {'input_1': {'shape': np.array([3, 256, 256])}, + 'expand_dims': {'expand_axis': -2} + }) + + expand_dims_node = Node(graph, 'expand_dims') + + tf_expand_dims_infer(expand_dims_node) + exp_shape = np.array([3, 256, 1, 256]) + res_shape = expand_dims_node.out_node().shape + self.assertEqual(len(exp_shape), len(res_shape)) + for i in range(0, len(exp_shape)): + self.assertEqual(exp_shape[i], res_shape[i]) + def test_expand_dims_infer_one_input_negative(self): graph = build_graph(nodes_attributes, [('input_1', 'expand_dims'), diff --git a/model-optimizer/mo/front/common/partial_infer/inner_product.py b/model-optimizer/mo/front/common/partial_infer/inner_product.py index 765363b..a92f2ec 100644 --- a/model-optimizer/mo/front/common/partial_infer/inner_product.py +++ b/model-optimizer/mo/front/common/partial_infer/inner_product.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/partial_infer/inner_product_test.py b/model-optimizer/mo/front/common/partial_infer/inner_product_test.py index 8b39312..4858893 100644 --- a/model-optimizer/mo/front/common/partial_infer/inner_product_test.py +++ b/model-optimizer/mo/front/common/partial_infer/inner_product_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
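The negative-axis branch added to `tf_expand_dims_infer` can be verified in isolation: the axis is offset by `len(shape) + 1` before `np.insert`, matching TensorFlow's `expand_dims` semantics rather than plain insert semantics, which is exactly what the two new tests assert:

```python
import numpy as np

shape = np.array([3, 256, 256])
for expand_axis in (-1, -2):
    axis = expand_axis + len(shape) + 1 if expand_axis < 0 else expand_axis
    print(np.insert(shape, axis, 1))
# [  3 256 256   1]   expand_axis = -1  (test_expand_dims_infer_one_input_3)
# [  3 256   1 256]   expand_axis = -2  (test_expand_dims_infer_one_input_4)
```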
@@ -26,7 +26,8 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'value': None, 'kind': 'data'}, 'inner': {'type': 'FullyConnected', 'value': None, 'kind': 'op'}, 'node_2': {'value': FakeValue(None), 'kind': 'data'}, - 'node_3': {'value': None, 'kind': 'data'} + 'node_3': {'value': None, 'kind': 'data'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} } @@ -35,8 +36,10 @@ class TestInnerPartialInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'inner'), ('node_2', 'inner'), - ('inner', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('inner', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 256, 256])}, 'node_2': {'shape': np.array([1, 3, 256, 256]), 'dim_attrs': ['spatial_dims', 'channel_dims', 'batch_dims', 'axis']}, @@ -60,8 +63,10 @@ class TestInnerPartialInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'inner'), ('node_2', 'inner'), - ('inner', 'node_3')], - {'node_3': {'is_output': True, 'shape': None}, + ('inner', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'shape': None}, 'node_1': {'shape': None}, 'node_2': {'shape': np.array([1, 3, 256, 256])}, 'inner': {'out-size': 4} diff --git a/model-optimizer/mo/front/common/partial_infer/matmul.py b/model-optimizer/mo/front/common/partial_infer/matmul.py index 157402c..e615dcf 100644 --- a/model-optimizer/mo/front/common/partial_infer/matmul.py +++ b/model-optimizer/mo/front/common/partial_infer/matmul.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/partial_infer/multi_box_detection.py b/model-optimizer/mo/front/common/partial_infer/multi_box_detection.py index eb076aa..755451a 100644 --- a/model-optimizer/mo/front/common/partial_infer/multi_box_detection.py +++ b/model-optimizer/mo/front/common/partial_infer/multi_box_detection.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/partial_infer/multi_box_detection_test.py b/model-optimizer/mo/front/common/partial_infer/multi_box_detection_test.py index ad9859f..f82a494 100644 --- a/model-optimizer/mo/front/common/partial_infer/multi_box_detection_test.py +++ b/model-optimizer/mo/front/common/partial_infer/multi_box_detection_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/partial_infer/multi_box_prior.py b/model-optimizer/mo/front/common/partial_infer/multi_box_prior.py index 4b70e37..8510b2d 100644 --- a/model-optimizer/mo/front/common/partial_infer/multi_box_prior.py +++ b/model-optimizer/mo/front/common/partial_infer/multi_box_prior.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
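For context on the inner-product tests (the expected arrays themselves fall outside the shown hunks): fully-connected shape inference keeps the batch dimension and replaces the rest with the layer's `out-size` attribute. A hedged sketch of that rule, consistent with the test inputs above:

```python
import numpy as np

def fc_out_shape(input_shape, out_size):
    # batch dim stays, remaining dims collapse to out_size
    return np.array([input_shape[0], out_size], dtype=np.int64)

print(fc_out_shape(np.array([1, 3, 256, 256]), 4))  # [1 4]
```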
diff --git a/model-optimizer/mo/front/common/partial_infer/multi_box_prior_test.py b/model-optimizer/mo/front/common/partial_infer/multi_box_prior_test.py index 6e1ce7c..f50dd71 100644 --- a/model-optimizer/mo/front/common/partial_infer/multi_box_prior_test.py +++ b/model-optimizer/mo/front/common/partial_infer/multi_box_prior_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/partial_infer/random_uniform.py b/model-optimizer/mo/front/common/partial_infer/random_uniform.py index 0d33882..a720c55 100644 --- a/model-optimizer/mo/front/common/partial_infer/random_uniform.py +++ b/model-optimizer/mo/front/common/partial_infer/random_uniform.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/partial_infer/range.py b/model-optimizer/mo/front/common/partial_infer/range.py index de18323..ac7c135 100644 --- a/model-optimizer/mo/front/common/partial_infer/range.py +++ b/model-optimizer/mo/front/common/partial_infer/range.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/partial_infer/range_test.py b/model-optimizer/mo/front/common/partial_infer/range_test.py index 113c49b..3ea693e 100644 --- a/model-optimizer/mo/front/common/partial_infer/range_test.py +++ b/model-optimizer/mo/front/common/partial_infer/range_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/partial_infer/reduce.py b/model-optimizer/mo/front/common/partial_infer/reduce.py index 627badc..b65f914 100644 --- a/model-optimizer/mo/front/common/partial_infer/reduce.py +++ b/model-optimizer/mo/front/common/partial_infer/reduce.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -35,6 +35,6 @@ def tf_reduce_infer(node, op=None): output_shape = np.delete(output_shape, axis) node.out_node().shape = output_shape if op is not None and node.in_node(0).value is not None: - node.out_node(0).value = np.array([op(node.in_node(0).value, (*axis,))], + node.out_node(0).value = np.array(op(node.in_node(0).value, (*axis,)), dtype=node.in_node(0).value.dtype) # TODO extend to multi-dimensional axis log.debug("value: {}".format(node.out_node(0).value)) \ No newline at end of file diff --git a/model-optimizer/mo/front/common/partial_infer/reshape.py b/model-optimizer/mo/front/common/partial_infer/reshape.py index ae61602..bb752a4 100644 --- a/model-optimizer/mo/front/common/partial_infer/reshape.py +++ b/model-optimizer/mo/front/common/partial_infer/reshape.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -30,6 +30,10 @@ def tf_reshape_shape_infer(node): input_shape = node.in_node(0).shape reshape_output = node.in_node(1).value if len(node.in_nodes()) > 1 else node.dim + # If the Reshape operation was created with two inputs and the dim attribute wasn't set, set it automatically + if not node.has_valid('dim'): + node['dim'] = reshape_output.copy() + if node.in_node(0).shape is None: return None @@ -68,8 +72,4 @@ def tf_reshape_shape_infer(node): output_shape = int64_array(output_shape) - # In case if Reshape operation was created with two inputs and dim attr wasn't set, we set in automatically - if not node.has_valid('dim'): - node['dim'] = output_shape - return output_shape diff --git a/model-optimizer/mo/front/common/partial_infer/roipooling.py b/model-optimizer/mo/front/common/partial_infer/roipooling.py index 48d2d32..115f923 100644 --- a/model-optimizer/mo/front/common/partial_infer/roipooling.py +++ b/model-optimizer/mo/front/common/partial_infer/roipooling.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/partial_infer/roipooling_test.py b/model-optimizer/mo/front/common/partial_infer/roipooling_test.py index f6b9eba..b56d21b 100644 --- a/model-optimizer/mo/front/common/partial_infer/roipooling_test.py +++ b/model-optimizer/mo/front/common/partial_infer/roipooling_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
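The small-looking `tf_reduce_infer` fix is behavioral: wrapping `op(...)` in an extra list added a spurious leading dimension to the folded value, so it no longer matched the already-reduced `output_shape`. A standalone reproduction with `op=np.sum`:

```python
import numpy as np

value = np.ones((2, 3), dtype=np.float32)
axis = (1,)

old = np.array([np.sum(value, axis)], dtype=value.dtype)  # shape (1, 2): extra dim
new = np.array(np.sum(value, axis), dtype=value.dtype)    # shape (2,): correct
print(old.shape, new.shape)
```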
@@ -27,7 +27,8 @@ nodes_attributes = {'node_1': {'kind': 'data'}, 'node_3': {'kind': 'data'}, 'node_4': {'kind': 'data'}, 'roipool': {'type': 'ROIPooling', 'kind': 'op', 'pooled_h': None, 'pooled_w': None}, - 'output': {'value': None, 'kind': 'data'} + 'output': {'value': None, 'kind': 'data'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'}, } @@ -36,8 +37,10 @@ class TestRoipoolingInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'roipool'), ('node_2', 'roipool'), - ('roipool', 'output')], - {'output': {'is_output': True, 'shape': None}, + ('roipool', 'output'), + ('output', 'op_output') + ], + {'output': {'shape': None}, 'node_1': {'shape': np.array([1, 256, 20, 20])}, 'node_2': {'shape': np.array([150, 5])}, 'roipool': {'pooled_h': 6, 'pooled_w': 6} @@ -55,8 +58,10 @@ class TestRoipoolingInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'roipool'), ('node_2', 'roipool'), - ('roipool', 'output')], - {'output': {'is_output': True, 'shape': None}, + ('roipool', 'output'), + ('output', 'op_output') + ], + {'output': {'shape': None}, 'node_1': {'shape': None}, 'node_2': {'shape': np.array([1, 256])}, 'roipool': {'pooled_h': 6, 'pooled_w': 6} @@ -74,8 +79,10 @@ class TestRoipoolingInfer(unittest.TestCase): ('node_2', 'roipool'), ('node_3', 'roipool'), ('node_4', 'roipool'), - ('roipool', 'output')], - {'output': {'is_output': True, 'shape': None}, + ('roipool', 'output'), + ('output', 'op_output') + ], + {'output': {'shape': None}, 'node_1': {'shape': np.array([1, 20, 20, 256])}, 'node_2': {'shape': np.array([150, 5])}, 'node_3': {'shape': np.array([150])}, diff --git a/model-optimizer/mo/front/common/partial_infer/slice.py b/model-optimizer/mo/front/common/partial_infer/slice.py index bf23763..a63658a 100644 --- a/model-optimizer/mo/front/common/partial_infer/slice.py +++ b/model-optimizer/mo/front/common/partial_infer/slice.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
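For the ROIPooling tests above, the inferred output shape follows the usual [num_rois, channels, pooled_h, pooled_w] convention; the expected arrays live outside the shown hunks, so the sketch below is an assumption consistent with the test inputs:

```python
import numpy as np

def roipooling_out_shape(feature_shape, rois_shape, pooled_h, pooled_w):
    # feature map [N, C, H, W], ROIs [num_rois, 5]
    return np.array([rois_shape[0], feature_shape[1], pooled_h, pooled_w])

print(roipooling_out_shape(np.array([1, 256, 20, 20]), np.array([150, 5]), 6, 6))
# [150 256   6   6]
```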
@@ -16,13 +16,20 @@ import numpy as np -from mo.graph.graph import erase_node from mo.utils.error import Error + def tf_strided_slice_infer(node): + if node.in_node(1).value is None or node.in_node(2).value is None: + raise Error('Strided slice layer supports only constant begin and end inputs') begin_id = node.in_node(1).value end_id = node.in_node(2).value - stride = node.in_node(3).value + if len(node.in_nodes()) > 3: + if node.in_node(3).value is None: + raise Error('Strided slice layer supports only constant stride input') + stride = node.in_node(3).value + else: + stride = [] shape = node.in_node(0).shape @@ -32,63 +39,79 @@ def tf_strided_slice_infer(node): convert_negative_indices(begin_id, shape) convert_negative_indices(end_id, shape) - test_bit = lambda val, offset: ((1 << offset) & val != 0) - slice_idx = [] - shrink_axis_mask = [] - ellipsis_mask = [] - new_axis_mask = [] - dims = len(begin_id) - + dims = np.amax(np.array([len(begin_id), len(end_id), len(stride), + len(node.shrink_axis_mask), len(node.new_axis_mask), len(node.ellipsis_mask), + len(node.begin_mask), len(node.end_mask)])) + + # make mask correct length + def extend_mask(in_mask, fin_len, zeros=True): + mask = list(in_mask) + if len(mask) < fin_len: + if zeros: + mask.extend(np.zeros(dims-len(mask), dtype=np.int32)) + else: + mask.extend(np.ones(dims-len(mask), dtype=np.int32)) + return np.array(mask, dtype=np.int32) + + for mask in {'new_axis_mask', 'shrink_axis_mask', 'ellipsis_mask'}: + node[mask] = extend_mask(node[mask], dims) + node.begin_mask = extend_mask(node.begin_mask, dims, False) + node.end_mask = extend_mask(node.end_mask, dims, False) + + old_idx = 0 + ellips_ext = 0 + id_em = 0 for idx in range(dims): - def_beg = 0 if stride[idx] > 0 else -1 - def_end = shape[idx] if stride[idx] > 0 else -shape[idx]-1 - l = begin_id[idx] if not test_bit(node.begin_mask, idx) else def_beg - r = end_id[idx] if not test_bit(node.end_mask, idx) else def_end - - # Check shrink_axis_mask - shrink_axis_mask.append(test_bit(node.shrink_axis_mask, idx)) - if shrink_axis_mask[idx]: - l, r = l, l + 1 - - # Check new_axis_mask - new_axis_mask.append(test_bit(node.new_axis_mask, idx)) - if new_axis_mask[idx]: + if node.new_axis_mask[idx]: slice_idx.append(np.newaxis) - - # Check ellipsis_mask - ellipsis_mask.append(test_bit(node.ellipsis_mask, idx)) - if ellipsis_mask[idx]: - shrink_axis_mask[idx] = False - l, r = 0, shape[idx] - - slice_idx.append(slice(l, r, stride[idx])) - - # if masks length are less than input dims length than add slices and masks for such dims - for idx in range(dims, len(shape)): - slice_idx.append(slice(0, shape[idx], 1)) - shrink_axis_mask.append(False) - new_axis_mask.append(False) + elif node.ellipsis_mask[idx]: + ellips_ext = len(shape) - (dims - np.count_nonzero(node.new_axis_mask) - 1) + id_em = idx + for i in range(0, ellips_ext): + slice_idx.append(slice(0, shape[old_idx], 1)) + old_idx = old_idx + 1 + else: + s = stride[idx] if len(stride) > idx else 1 + def_beg = 0 if s > 0 else -1 + def_end = shape[old_idx] if s > 0 else -shape[old_idx]-1 + l = begin_id[idx] if node.begin_mask[idx] and idx < len(begin_id) else def_beg + r = end_id[idx] if node.end_mask[idx] and idx < len(end_id) else def_end + + # Check shrink_axis_mask + if node.shrink_axis_mask[idx] and idx < len(shape): + slice_idx.append(slice(l, l+1, s)) + else: + slice_idx.append(slice(l, r, s)) + old_idx = old_idx + 1 value = node.in_node(0).value if node.in_node(0).value is not None else np.zeros(shape) # fix for the warning: 
"FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated use # `arr[tuple(seq)]` instead of `arr[seq]`" value = value[tuple(slice_idx)] - for idx, flag in reversed(list(enumerate(shrink_axis_mask))): + for idx, flag in reversed(list(enumerate(node.shrink_axis_mask))): if flag: - value = np.squeeze(value, idx) + if ellips_ext > 0 and idx > id_em: + idx = idx + ellips_ext - 1 + try: + value = np.squeeze(value, idx) + except ValueError: + # ignore this error + continue node['slices'] = np.array(slice_idx) - node['shrink_axis_mask'] = np.array(shrink_axis_mask) - node['new_axis_mask'] = np.array(new_axis_mask) + for attr in ('shrink_axis_mask', 'new_axis_mask', 'ellipsis_mask', 'begin_mask', 'end_mask'): + node[attr] = np.array(node[attr], dtype=np.int32) node.out_node().value = np.array(value) if node.in_node(0).value is not None else None - node.out_node().shape = np.array(value.shape) + node.out_node().shape = np.array(value.shape, dtype=np.int64) + + # change precision to I32 for begin, end, stride inputs + for i in range(1, len(node.in_nodes())): + inp = node.in_node(i) + inp["force_precision"] = "I32" - #remove inputs converted in attributes - #for i in range(1,4): - # node.graph.remove_edge(node.in_node(i).id, node.id) def convert_negative_indices(indices: np.array, shape: np.array): for ind, value in enumerate(indices): diff --git a/model-optimizer/mo/front/common/partial_infer/slice_test.py b/model-optimizer/mo/front/common/partial_infer/slice_test.py index cdd674d..af54493 100644 --- a/model-optimizer/mo/front/common/partial_infer/slice_test.py +++ b/model-optimizer/mo/front/common/partial_infer/slice_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -41,6 +41,9 @@ nodes_attributes = {'node_1': {'value': None, 'kind': 'data'}, 'tf_slice_size': {'value': None, 'shape': None, 'kind': 'data'}, 'tf_slice': {'kind': 'op'}, 'tf_slice_output': {'value': None, 'shape': None, 'kind': 'data'}, + 'op_output': {'kind': 'op', 'op': 'OpOutput'}, + 'op_output_1': {'kind': 'op', 'op': 'OpOutput'}, + 'op_output_2': {'kind': 'op', 'op': 'OpOutput'} } tf_slice_edges = [('tf_slice_input', 'tf_slice'), ('tf_slice_begin', 'tf_slice'), ('tf_slice_size', 'tf_slice'), @@ -52,10 +55,13 @@ class TestSSliceInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'Slice_node'), ('Slice_node', 'node_2'), - ('Slice_node', 'node_3')], + ('Slice_node', 'node_3'), + ('node_2', 'op_output'), + ('node_3', 'op_output_1') + ], {'node_1': {'shape': np.array([1, 288, 56, 56])}, - 'node_2': {'is_output': True, 'shape': None}, - 'node_3': {'is_output': True, 'shape': None}, + 'node_2': {'shape': None}, + 'node_3': {'shape': None}, 'Slice_node': {'axis': 1, 'slice_point': np.array([256])} }) @@ -77,10 +83,13 @@ class TestSSliceInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'Slice_node'), ('Slice_node', 'node_2'), - ('Slice_node', 'node_3')], + ('Slice_node', 'node_3'), + ('node_2', 'op_output'), + ('node_3', 'op_output_1') + ], {'node_1': {'shape': np.array([1, 288, 56, 56])}, - 'node_2': {'is_output': True, 'shape': None}, - 'node_3': {'is_output': True, 'shape': None}, + 'node_2': {'shape': None}, + 'node_3': {'shape': None}, 'Slice_node': {'axis': 1, 'slice_point': []} }) @@ -102,11 +111,15 @@ class TestSSliceInfer(unittest.TestCase): [('node_1', 'Slice_node'), ('Slice_node', 'node_2'), ('Slice_node', 'node_3'), - ('Slice_node', 'node_4')], + ('Slice_node', 'node_4'), + ('node_2', 'op_output'), + ('node_3', 'op_output_1'), + ('node_2', 'op_output_2') + ], {'node_1': {'shape': np.array([1, 288, 56, 56])}, - 'node_2': {'is_output': True, 'shape': None}, - 'node_3': {'is_output': True, 'shape': None}, - 'node_4': {'is_output': True, 'shape': None}, + 'node_2': {'shape': None}, + 'node_3': {'shape': None}, + 'node_4': {'shape': None}, 'Slice_node': {'axis': 1, 'slice_point': []} }) @@ -132,11 +145,15 @@ class TestSSliceInfer(unittest.TestCase): [('node_1', 'Slice_node'), ('Slice_node', 'node_2'), ('Slice_node', 'node_3'), - ('Slice_node', 'node_4')], + ('Slice_node', 'node_4'), + ('node_2', 'op_output'), + ('node_3', 'op_output_1'), + ('node_2', 'op_output_2') + ], {'node_1': {'shape': np.array([1, 288, 56, 56])}, - 'node_2': {'is_output': True, 'shape': None}, - 'node_3': {'is_output': True, 'shape': None}, - 'node_4': {'is_output': True, 'shape': None}, + 'node_2': {'shape': None}, + 'node_3': {'shape': None}, + 'node_4': {'shape': None}, 'Slice_node': {'axis': 1, 'slice_point': [100, 150]} }) @@ -168,15 +185,16 @@ class TestTFStridedSliceInfer(unittest.TestCase): ('sslice_end_1', 'sslice_1'), ('sslice_stride_1', 'sslice_1'), ('sslice_1', 'sslice_data_1'), + ('sslice_data_1', 'op_output') ], - {'sslice_data_1': {'is_output': True}, + { 'sslice_input': {'value': np.array([1, 34, 34, 62]), 'shape': np.array([3])}, 'sslice_begin_1': {'value': np.array([0]), 'shape': np.array([1])}, 'sslice_end_1': {'value': np.array([4]), 'shape': np.array([1])}, 'sslice_stride_1': {'value': np.array([1]), 'shape': np.array([1])}, - 'sslice_1': {'shrink_axis_mask': 0, 'ellipsis_mask': 0, 'new_axis_mask': 0, - 'begin_mask': 0, 'end_mask': 0}, + 'sslice_1': {'shrink_axis_mask': [0], 'ellipsis_mask': [0], 'new_axis_mask': [0], + 'begin_mask': [1], 
'end_mask': [1]}, }) def build_test_graph(self): @@ -186,17 +204,18 @@ class TestTFStridedSliceInfer(unittest.TestCase): ('sslice_end_1', 'sslice_1'), ('sslice_stride_1', 'sslice_1'), ('sslice_1', 'sslice_data_1'), + ('sslice_data_1', 'op_output') ], - {'sslice_data_1': {'is_output': True}, + { 'sslice_input': {'value': None, 'shape': np.array([1, 35, 35, 3])}, 'sslice_begin_1': {'value': np.array([0, 0, 0, 0]), 'shape': np.array([4])}, 'sslice_end_1': {'value': np.array([1, 34, 30, 2]), 'shape': np.array([4])}, 'sslice_stride_1': {'value': np.array([1, 1, 1, 1]), 'shape': np.array([4])}, - 'sslice_1': {'shrink_axis_mask': 0, 'ellipsis_mask': 0, 'new_axis_mask': 0, - 'begin_mask': 0, 'end_mask': 0}, + 'sslice_1': {'shrink_axis_mask': [0], 'ellipsis_mask': [0], 'new_axis_mask': [0], + 'begin_mask': [1], 'end_mask': [1]}, }) - + def build_test_graph_dim_beg(self): return build_graph(nodes_attributes, [('sslice_input', 'sslice_1'), @@ -204,18 +223,18 @@ class TestTFStridedSliceInfer(unittest.TestCase): ('sslice_end_1', 'sslice_1'), ('sslice_stride_1', 'sslice_1'), ('sslice_1', 'sslice_data_1'), + ('sslice_data_1', 'op_output') ], - {'sslice_data_1': {'is_output': True}, + { 'sslice_input': {'value': np.array([[1, 34, 34, 62]]), 'shape': np.array([1, 4])}, 'sslice_begin_1': {'value': np.array([0]), 'shape': np.array([1])}, 'sslice_end_1': {'value': np.array([4]), 'shape': np.array([1])}, 'sslice_stride_1': {'value': np.array([1]), 'shape': np.array([1])}, - 'sslice_1': {'shrink_axis_mask': 0, 'ellipsis_mask': 0, 'new_axis_mask': 0, - 'begin_mask': 0, 'end_mask': 0}, + 'sslice_1': {'shrink_axis_mask': [0], 'ellipsis_mask': [0], 'new_axis_mask': [0], + 'begin_mask': [1], 'end_mask': [1]}, }) - def test_slice_infer_1(self): graph = self.build_test_graph() node = Node(graph, 'sslice_1') @@ -225,7 +244,7 @@ class TestTFStridedSliceInfer(unittest.TestCase): def test_slice_infer_2(self): graph = self.build_test_graph() node = Node(graph, 'sslice_1') - node.end_mask = 6 # 0110 + node.end_mask = [1, 0, 0, 1] # 6 tf_strided_slice_infer(node) self.assertTrue(np.array_equal(node.out_node().shape, np.array([1, 35, 35, 2])), 'Wrong output shape detected') @@ -233,7 +252,7 @@ class TestTFStridedSliceInfer(unittest.TestCase): graph = self.build_test_graph() node = Node(graph, 'sslice_1') node.in_node(1).value = np.array([0, 10, 10, 0]) - node.end_mask = 6 # 0110 + node.end_mask = [1, 0, 0, 1] # 6 tf_strided_slice_infer(node) self.assertTrue(np.array_equal(node.out_node().shape, np.array([1, 25, 25, 2])), 'Wrong output shape detected') @@ -241,7 +260,7 @@ class TestTFStridedSliceInfer(unittest.TestCase): graph = self.build_test_graph() node = Node(graph, 'sslice_1') node.in_node(1).value = np.array([0, 10, 10, 0]) - node.begin_mask = 6 # 0110 + node.begin_mask = [1, 0, 0, 1] # 6 tf_strided_slice_infer(node) self.assertTrue(np.array_equal(node.out_node().shape, np.array([1, 34, 30, 2])), 'Wrong output shape detected') @@ -249,8 +268,8 @@ class TestTFStridedSliceInfer(unittest.TestCase): graph = self.build_test_graph() node = Node(graph, 'sslice_1') node.in_node(1).value = np.array([0, 10, 10, 0]) - node.begin_mask = 15 # 1111 - node.end_mask = 15 # 1111 + node.begin_mask = [0, 0, 0, 0] # 15 + node.end_mask = [0, 0, 0, 0] # 15 tf_strided_slice_infer(node) self.assertTrue(np.array_equal(node.out_node().shape, np.array([1, 35, 35, 3])), 'Wrong output shape detected') @@ -273,7 +292,7 @@ class TestTFStridedSliceInfer(unittest.TestCase): def test_slice_infer_8(self): graph = self.build_test_graph2() node = 
Node(graph, 'sslice_1') - node.new_axis_mask = 1 + node.new_axis_mask = [1] tf_strided_slice_infer(node) self.assertTrue(np.array_equal(node.out_node().shape, np.array([1, 4])), 'Wrong output shape detected') self.assertTrue(np.array_equal(node.out_node().value, np.array([[1, 34, 34, 62]])), @@ -282,59 +301,57 @@ class TestTFStridedSliceInfer(unittest.TestCase): def test_slice_infer_9(self): graph = self.build_test_graph() node = Node(graph, 'sslice_1') - node.begin_mask = 15 # 1111 - node.end_mask = 15 # 1111 - node.shrink_axis_mask = 1 + node.begin_mask = [0, 0, 0, 0] # 15 + node.end_mask = [0, 0, 0, 0] # 15 + node.shrink_axis_mask = [1] tf_strided_slice_infer(node) self.assertTrue(np.array_equal(node.out_node().shape, np.array([35, 35, 3])), 'Wrong output shape detected') def test_slice_infer_10(self): graph = self.build_test_graph() node = Node(graph, 'sslice_1') - node.begin_mask = 15 # 1111 - node.end_mask = 15 # 1111 - node.shrink_axis_mask = 1 - node.new_axis_mask = 8 + node.begin_mask = [0, 0, 0, 0] # 15 + node.end_mask = [0, 0, 0, 0] # 15 + node.shrink_axis_mask = [1, 0, 0, 0] + node.new_axis_mask = [0, 0, 0, 1] # 8 tf_strided_slice_infer(node) self.assertTrue(np.array_equal(node.out_node().shape, np.array([35, 35, 1, 3])), 'Wrong output shape detected') def test_slice_infer_11(self): graph = self.build_test_graph() node = Node(graph, 'sslice_1') - node.begin_mask = 15 # 1111 - node.end_mask = 15 # 1111 - node.shrink_axis_mask = 5 # 0101 + node.begin_mask = [0, 0, 0, 0] # 15 + node.end_mask = [0, 0, 0, 0] # 15 + node.shrink_axis_mask = [1, 0, 1, 0] # 5 tf_strided_slice_infer(node) self.assertTrue(np.array_equal(node.out_node().shape, np.array([35, 3])), 'Wrong output shape detected') def test_slice_infer_12(self): graph = self.build_test_graph() node = Node(graph, 'sslice_1') - node.begin_mask = 15 # 1111 - node.end_mask = 15 # 1111 - node.shrink_axis_mask = 7 # 0111 + node.begin_mask = [0, 0, 0, 0] # 15 + node.end_mask = [0, 0, 0, 0] # 15 + node.shrink_axis_mask = [1, 1, 1, 0] # 7 tf_strided_slice_infer(node) self.assertTrue(np.array_equal(node.out_node().shape, np.array([3])), 'Wrong output shape detected') def test_slice_infer_13(self): graph = self.build_test_graph2() node = Node(graph, 'sslice_1') - # node.in_node(0).value = np.array([1]) node.in_node(1).value = np.array([1]) - node.shrink_axis_mask = 1 + node.shrink_axis_mask = [1] tf_strided_slice_infer(node) self.assertTrue(np.array_equal(node.out_node().shape, np.array([])), 'Wrong output shape detected') self.assertTrue(np.array_equal(node.out_node().value, np.array(34)), 'Wrong output shape detected') - def test_slice_infer_14(self): + def test_slice_infer_14(self): graph = self.build_test_graph2() node = Node(graph, 'sslice_1') - # node.in_node(0).value = np.array([1]) node.in_node(3).value = np.array([-1]) - node.end_mask=1 - node.begin_mask=1 - node.in_node(0).shape=[4] - tf_strided_slice_infer(node) + node.end_mask = [0] + node.begin_mask = [0] + node.in_node(0).shape = [4] + tf_strided_slice_infer(node) self.assertTrue(np.array_equal(node.out_node().shape, np.array([4])), 'Wrong output shape detected') print(node.out_node().value) self.assertTrue(np.array_equal(node.out_node().value, np.array([62, 34, 34, 1])), 'Wrong output shape detected') @@ -342,8 +359,7 @@ class TestTFStridedSliceInfer(unittest.TestCase): def test_slice_infer_dim_beg(self): graph = self.build_test_graph_dim_beg() node = Node(graph, 'sslice_1') - # node.in_node(0).value = np.array([1]) - node.shrink_axis_mask = 1 + node.shrink_axis_mask = [1] 
tf_strided_slice_infer(node) self.assertTrue(np.array_equal(node.out_node().shape, np.array([4])), 'Wrong output shape detected') self.assertTrue(np.array_equal(node.out_node().value, np.array([1, 34, 34, 62])), 'Wrong output shape detected') diff --git a/model-optimizer/mo/front/common/partial_infer/space_to_batch.py b/model-optimizer/mo/front/common/partial_infer/space_to_batch.py index 39083eb..d157364 100644 --- a/model-optimizer/mo/front/common/partial_infer/space_to_batch.py +++ b/model-optimizer/mo/front/common/partial_infer/space_to_batch.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/partial_infer/split.py b/model-optimizer/mo/front/common/partial_infer/split.py index ff8abb8..0339147 100644 --- a/model-optimizer/mo/front/common/partial_infer/split.py +++ b/model-optimizer/mo/front/common/partial_infer/split.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/partial_infer/split_test.py b/model-optimizer/mo/front/common/partial_infer/split_test.py index a81b57a..60e399e 100644 --- a/model-optimizer/mo/front/common/partial_infer/split_test.py +++ b/model-optimizer/mo/front/common/partial_infer/split_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/partial_infer/squeeze.py b/model-optimizer/mo/front/common/partial_infer/squeeze.py index 574ba85..ffdfdc7 100644 --- a/model-optimizer/mo/front/common/partial_infer/squeeze.py +++ b/model-optimizer/mo/front/common/partial_infer/squeeze.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/partial_infer/transpose.py b/model-optimizer/mo/front/common/partial_infer/transpose.py index 6cef48d..b2bee1f 100644 --- a/model-optimizer/mo/front/common/partial_infer/transpose.py +++ b/model-optimizer/mo/front/common/partial_infer/transpose.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/partial_infer/utils.py b/model-optimizer/mo/front/common/partial_infer/utils.py index 0056a0a..97ce37a 100644 --- a/model-optimizer/mo/front/common/partial_infer/utils.py +++ b/model-optimizer/mo/front/common/partial_infer/utils.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
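The expectation in `test_slice_infer_14` above follows from the negative-stride defaults in the rewritten infer (`def_beg = -1`, `def_end = -shape - 1` when the stride is negative), which makes a fully-defaulted slice with stride -1 a reversal. A standalone numpy check:

```python
import numpy as np

v = np.array([1, 34, 34, 62])
# stride -1 with begin_mask = end_mask = [0] -> slice(-1, -len(v) - 1, -1)
print(v[slice(-1, -len(v) - 1, -1)])  # [62 34 34  1]
```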
@@ -18,8 +18,10 @@ import logging as log import numpy as np +from typing import Iterable -def int64_array(l: list): + +def int64_array(l: Iterable): return np.array(l, dtype=np.int64) @@ -58,26 +60,31 @@ def convert_tf_padding_to_str(padding): # TODO eliminate this dependency and pass necessary function as an argument -def tf_window_op_pad_infer(input, window, stride, auto_pad): +def tf_window_op_pad_infer(input, window, stride, auto_pad, is_deconv=False): if input is None or window is None or stride is None or auto_pad is None: return (None, None) + + normalized_stride = stride + if is_deconv: + normalized_stride = 1 / stride + if auto_pad in ['same_lower', 'same_upper']: if auto_pad == 'same_upper': - output = np.int64(np.ceil(input / stride)) + output = np.int64(np.ceil(input / normalized_stride)) else: - output = np.int64(np.floor(input / stride)) + output = np.int64(np.floor(input / normalized_stride)) residual = input % stride mask = residual == 0 full_pad = window.copy() full_pad[mask] -= stride[mask] - mask = np.logical_not(mask) + mask = np.logical_not(mask) # pylint: disable=assignment-from-no-return full_pad[mask] -= input[mask] % stride[mask] - full_pad = np.maximum(full_pad, 0) + full_pad = np.maximum(full_pad, 0) # pylint: disable=assignment-from-no-return low_pad = np.int64(full_pad / 2) high_pad = full_pad - low_pad pad = np.array([low_pad, high_pad]).transpose() elif auto_pad == 'valid': - output = np.int64(np.ceil((input - window + 1) / stride)) + output = np.int64(np.ceil((input - window + 1) / normalized_stride)) pad = np.zeros((len(output), 2), dtype=np.int64) else: log.error("Unsupported padding scheme: {}".format(auto_pad)) diff --git a/model-optimizer/mo/front/common/register_custom_ops.py b/model-optimizer/mo/front/common/register_custom_ops.py index 1172bf3..fdaa392 100644 --- a/model-optimizer/mo/front/common/register_custom_ops.py +++ b/model-optimizer/mo/front/common/register_custom_ops.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/common/replacement.py b/model-optimizer/mo/front/common/replacement.py index 6a2874d..6b86689 100644 --- a/model-optimizer/mo/front/common/replacement.py +++ b/model-optimizer/mo/front/common/replacement.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
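The `is_deconv` branch added to `tf_window_op_pad_infer` inverts the stride so the same ceil/floor arithmetic yields the enlarged deconvolution output. A quick numeric check of the `same_upper` branch (values hypothetical):

```python
import numpy as np

input, stride = np.array([56]), np.array([2])
normalized_stride = 1 / stride                       # is_deconv=True path
print(np.int64(np.ceil(input / normalized_stride)))  # [112]
# The convolution path (is_deconv=False) keeps stride as-is and gives [28].
```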
@@ -18,7 +18,7 @@ import logging as log import networkx as nx from mo.front.subgraph_matcher import SubgraphMatch -from mo.graph.graph import Node, merge_edge_props, get_sorted_inputs +from mo.graph.graph import Node, merge_edge_props, Graph from mo.middle.pattern_match import apply_pattern from mo.utils import class_registration from mo.utils.replacement_pattern import ReplacementPattern @@ -28,6 +28,14 @@ class FrontReplacementPattern(ReplacementPattern): registered_ops = {} registered_cls = [] + def run_after(self): + from extensions.front.pass_separator import FrontStart + return [FrontStart] + + def run_before(self): + from extensions.front.pass_separator import FrontFinish + return [FrontFinish] + def pattern(self): raise Exception('Function "pattern" must be overridden in the sub-class') @@ -45,6 +53,14 @@ class FrontReplacementSubgraph(FrontReplacementPattern): """ replacement_id = 'None' + def run_after(self): + from extensions.front.pass_separator import FrontStart + return [FrontStart] + + def run_before(self): + from extensions.front.pass_separator import FrontFinish + return [FrontFinish] + def __init__(self): pass @@ -53,7 +69,7 @@ class FrontReplacementSubgraph(FrontReplacementPattern): return node_port if isinstance(node_port, tuple) else (node_port, 0) @staticmethod - def replace_input_edges(graph: nx.DiGraph, input_edges_match: dict): + def replace_input_edges(graph: Graph, input_edges_match: dict): """ Replacing existing input/output edges with a new ones to a new sub-graph. :param graph: networkX graph to operate on. @@ -64,14 +80,14 @@ class FrontReplacementSubgraph(FrontReplacementPattern): old_node_name, old_in_port = __class__.extract_port(old_name_port) new_node_name, new_in_port = __class__.extract_port(new_name_port) old_node = Node(graph, old_node_name) - src_node_name = get_sorted_inputs(old_node)[old_in_port][0] + src_node_name = old_node.get_sorted_inputs()[old_in_port][0] edge_attrs = graph[src_node_name][old_node_name][0].copy() edge_attrs['in'] = new_in_port graph.add_edge(src_node_name, new_node_name, **edge_attrs) log.debug("Created edge from {} to {} with attrs: {}".format(src_node_name, new_node_name, edge_attrs)) @staticmethod - def replace_output_edges(graph: nx.DiGraph, output_edges_match: dict): + def replace_output_edges(graph: Graph, output_edges_match: dict): """ Replacing existing input/output edges with a new ones to a new sub-graph. :param graph: networkX graph to operate on. @@ -88,28 +104,28 @@ class FrontReplacementSubgraph(FrontReplacementPattern): graph.add_edge(new_node_name, dst, **new_edge_attrs) log.debug("Created edge from {} to {} with attrs: {}".format(new_node_name, dst, new_edge_attrs)) - def input_edges_match(self, graph: nx.MultiDiGraph, match: object, new_sub_graph: dict): + def input_edges_match(self, graph: Graph, match: object, new_sub_graph: dict): """ Default implementation doesn't add new input edges automatically. """ return {} - def output_edges_match(self, graph: nx.MultiDiGraph, match: object, new_sub_graph: dict): + def output_edges_match(self, graph: Graph, match: object, new_sub_graph: dict): """ Default implementation doesn't add new output edges automatically. 
""" return {} - def generate_sub_graph(self, graph: nx.MultiDiGraph, match: object): + def generate_sub_graph(self, graph: Graph, match: object): raise Exception("The function 'generate_sub_graph' must be implemented in the sub-class.") - def nodes_to_remove(self, graph: nx.MultiDiGraph, match: dict): + def nodes_to_remove(self, graph: Graph, match: dict): """ Default implementation generates list of all matched nodes. So all matched nodes will be removed. """ return [node.id for node in match.values()] - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: [dict, SubgraphMatch]): + def replace_sub_graph(self, graph: Graph, match: [dict, SubgraphMatch]): log.debug('replace_sub_graph: "{}" matched nodes: {}'.format(self.replacement_id, '\n'.join(sorted(match.matched_nodes_names())))) new_sub_graph = self.generate_sub_graph(graph, match) # pylint: disable=assignment-from-no-return @@ -121,7 +137,7 @@ class FrontReplacementSubgraph(FrontReplacementPattern): 'replace_sub_graph: "{}" removing nodes: {}'.format(self.replacement_id, '\n'.join(sorted(remove_nodes)))) graph.remove_nodes_from(remove_nodes) - def find_and_replace_pattern(self, graph: nx.MultiDiGraph): + def find_and_replace_pattern(self, graph: Graph): apply_pattern(graph, action=self.replace_sub_graph, **self.pattern()) registered_ops = {} @@ -143,6 +159,14 @@ class FrontReplacementOp(FrontReplacementSubgraph): """ op = 'UnknownOp' + def run_after(self): + from extensions.front.pass_separator import FrontStart + return [FrontStart] + + def run_before(self): + from extensions.front.pass_separator import FrontFinish + return [FrontFinish] + def pattern(self): return dict( nodes=[ @@ -150,7 +174,7 @@ class FrontReplacementOp(FrontReplacementSubgraph): edges=[] ) - def replace_op(self, graph: nx.MultiDiGraph, node: Node): + def replace_op(self, graph: Graph, node: Node): raise Exception("The function 'replace_op' must be implemented in the sub-class.") @staticmethod @@ -167,7 +191,7 @@ class FrontReplacementOp(FrontReplacementSubgraph): return out_edges_match_dict @staticmethod - def update_input_edges_attrs(graph: nx.MultiDiGraph, node: Node, added_nodes: list): + def update_input_edges_attrs(graph: Graph, node: Node, added_nodes: list): """ Copy edge attributes from 'old' input edges of node 'node' to new input sub-graph edges. :param graph: graph to operate on @@ -181,7 +205,7 @@ class FrontReplacementOp(FrontReplacementSubgraph): if old_u == new_u and old_edge_attrs['out'] == new_edge_attrs['out']: merge_edge_props(new_edge_attrs, old_edge_attrs) # copy old edge attributes - def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict): + def replace_sub_graph(self, graph: Graph, match: dict): assert 'op' in match assert len(match) == 1 node = match['op'] diff --git a/model-optimizer/mo/front/common/weights.py b/model-optimizer/mo/front/common/weights.py index 84e0679..486e8da 100644 --- a/model-optimizer/mo/front/common/weights.py +++ b/model-optimizer/mo/front/common/weights.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/front/extractor.py b/model-optimizer/mo/front/extractor.py index 6ba1ea4..0b68294 100644 --- a/model-optimizer/mo/front/extractor.py +++ b/model-optimizer/mo/front/extractor.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,8 +22,8 @@ import networkx as nx import numpy as np from mo.front.onnx.extractors.utils import get_backend_pad -from mo.graph.graph import Node, unique_id, get_node_id_by_name -from mo.middle.passes.eliminate import reverse_dfs, get_nodes_with_attributes +from mo.graph.graph import Node, Graph, add_opoutput +from mo.middle.passes.eliminate import reverse_dfs from mo.utils import class_registration from mo.utils.error import Error from mo.utils.graph import dfs @@ -31,15 +31,14 @@ from mo.utils.unsupported_ops import UnsupportedOps from mo.utils.utils import refer_to_faq_msg -def restore_edges(graph: nx.DiGraph, get_edges: callable): +def restore_edges(graph: Graph, get_edges: callable): """ Take a graph without edges and extract dependencies between nodes with the help of get_edges function. For a given node n the get_edges function returns a list of tuples (n1, n2, attrs), that is used to create n1 --> n2 edge with attributes attrs. - It is possible that two nodes n1 and n2 have more than one n1 --> n2 edges, so the resulting graph is - nx.MultiDiGraph. + It is possible that two nodes n1 and n2 have more than one n1 --> n2 edges, so the resulting graph is Graph. """ - graph = nx.MultiDiGraph(graph) + graph = Graph(graph) for node in list(graph.nodes()): edges = get_edges(Node(graph, node)) for u, v, d in edges: @@ -56,7 +55,7 @@ def restore_edges(graph: nx.DiGraph, get_edges: callable): return graph -def remove_control_dependency_inputs(graph: nx.MultiDiGraph): +def remove_control_dependency_inputs(graph: Graph): """ Delete control dependency inputs from pb all over the graph :param graph: graph to operate on @@ -473,6 +472,7 @@ def update_ie_fields(attrs: dict, ir_version = None): ir_version_mapping = { # Default behaviour is IR V3 attributes None: ir_v3_attrs, + 5: ir_v3_attrs, 4: ir_v3_attrs, 3: ir_v3_attrs, 2: ir_v2_attrs @@ -484,7 +484,7 @@ def update_ie_fields(attrs: dict, ir_version = None): attrs.update(ir_version_mapping[ir_version]) -def create_tensor_nodes(graph: nx.MultiDiGraph): +def create_tensor_nodes(graph: Graph): """ Creates nodes between ops to represent intermediate data that flows from one op to another. For each edge with unique out attribute that goes from a given node, @@ -528,7 +528,7 @@ def create_tensor_nodes(graph: nx.MultiDiGraph): node_name = str(smart_node.name) if smart_node.has_valid('name') else str(smart_node.id) # assign to each output port a tensor unique id in the graph - out_tensor_dict = {port: unique_id(graph, '{}/Output_{}/Data_'.format(node_name, port)) for port in out_ports} + out_tensor_dict = {port: graph.unique_id('{}/Output_{}/Data_'.format(node_name, port)) for port in out_ports} # add a new node with kind='data' per each tensor graph.add_nodes_from([(uid, @@ -561,7 +561,7 @@ def create_tensor_nodes(graph: nx.MultiDiGraph): # data node content (numpy array). Shape is initialized by this array. 
if 'embedded_inputs' in node_attr: for port_index, value_attr, attrs in node_attr['embedded_inputs']: - input_node_id = unique_id(graph, 'embedded_input_') + input_node_id = graph.unique_id('embedded_input_') value = node_attr[value_attr] shape = np.array(value.shape, dtype=np.int64) graph.add_node(input_node_id, **add_attrs_props( @@ -569,6 +569,9 @@ def create_tensor_nodes(graph: nx.MultiDiGraph): edge_attrs = {'in': port_index, 'name': value_attr} edge_attrs.update(attrs) graph.add_edge(input_node_id, node, **edge_attrs) + op_node = Node(graph, node) + if not op_node.has_port(port_type='in', idx=edge_attrs['in']): + op_node.add_input_port(edge_attrs['in']) del node_attr[value_attr] return graph @@ -586,7 +589,7 @@ def get_specific_edge_attrs(attrs: dict, attrs_type: str, additional_attrs=None) return new_attrs -def extract_node_attrs(graph: nx.MultiDiGraph, extractor: callable): +def extract_node_attrs(graph: Graph, extractor: callable): """ For each node produce new entries in a node attributes dictionary by existing attributes. Old attributes are not removed but merged with new ones. @@ -652,7 +655,7 @@ def extract_port_from_string(node_name: str): return name, in_port, out_port -def get_node_id_with_ports(graph: nx.MultiDiGraph, name: str): +def get_node_id_with_ports(graph: Graph, name: str): """ Extracts port and node ID out of user provided name :param graph: graph to operate on @@ -660,7 +663,7 @@ def get_node_id_with_ports(graph: nx.MultiDiGraph, name: str): :return: node ID, direction of port ('in', 'out', 'port') and port number or None """ node_name, in_port, out_port = extract_port_from_string(name) - node_id = get_node_id_by_name(graph, node_name) + node_id = graph.get_node_id_by_name(node_name) if in_port is not None: direction = 'in' port = in_port @@ -673,7 +676,7 @@ def get_node_id_with_ports(graph: nx.MultiDiGraph, name: str): return node_id, direction, port -def input_user_data_repack(graph: nx.MultiDiGraph, input_user_shapes: [None, list, dict, np.ndarray], freeze_placeholder: dict): +def input_user_data_repack(graph: Graph, input_user_shapes: [None, list, dict, np.ndarray], freeze_placeholder: dict): """ Restructures user input cutting request. Splits ports out of node names. Transforms node names to node ids. :param graph: graph to operate on @@ -712,12 +715,12 @@ def input_user_data_repack(graph: nx.MultiDiGraph, input_user_shapes: [None, lis _freeze_placeholder = dict() # freeze placeholder restructure # Replaces placeholder name with placeholder id. Raises if there is no placeholder with such ID - placeholders_ids = get_nodes_with_attributes(graph, op='Placeholder') + placeholders_ids = graph.get_nodes_with_attributes(op='Placeholder') if freeze_placeholder is None: _freeze_placeholder = None else: for placeholder_name, value in freeze_placeholder.items(): - placeholder_id = get_node_id_by_name(graph, placeholder_name) + placeholder_id = graph.get_node_id_by_name(placeholder_name) if placeholder_id not in placeholders_ids: raise Error( 'There is no placeholder with name {}. 
Can not freeze it with value.'.format(placeholder_name)) @@ -761,7 +764,7 @@ def input_user_data_repack(graph: nx.MultiDiGraph, input_user_shapes: [None, lis return _input_shapes, _freeze_placeholder -def output_user_data_repack(graph: nx.MultiDiGraph, outputs: list): +def output_user_data_repack(graph: Graph, outputs: list): """ :param graph: graph to operate on @@ -795,7 +798,7 @@ def output_user_data_repack(graph: nx.MultiDiGraph, outputs: list): return _outputs -def user_data_repack(graph: nx.MultiDiGraph, input_user_shapes: [None, list, dict, np.array], outputs: list, +def user_data_repack(graph: Graph, input_user_shapes: [None, list, dict, np.array], outputs: list, freeze_placeholder: dict): """ :param graph: graph to operate on @@ -809,41 +812,17 @@ def user_data_repack(graph: nx.MultiDiGraph, input_user_shapes: [None, list, dic return _input_shapes, _outputs, _freeze_placeholder -def add_opoutput(graph: nx.MultiDiGraph, node_name: str, port: int, cut: bool = True): - """ - Creates and connects OpOutput node to node_name port. Cuts existing port if requested. - :param graph: graph to operate with - :param node_name: name of existing node in the graph that we want to add OpOutput to - :param port: output port of node to connect OpOutput to - :param cut: determines way of operating with edge specified by node_name and port - """ - # we import it here because Op imports add_attrs_props and update_ie_fields from this file - from mo.ops.output import Output - if cut and len(Node(graph, node_name).out_edges()) != 0: - opoutput_node = Output(graph).cut_edge_and_create_node(Node(graph, node_name), port, - {'name': '{}/sink_port_{}'.format(node_name, port)}) - else: - opoutput_node = Output(graph).create_node([(Node(graph, node_name), port)], - {'name': '{}/sink_port_{}'.format(node_name, port)}) - opoutput_node.in_edge()['data_attrs'] = ['fw_tensor_debug_info'] - opoutput_node.in_edge()['fw_tensor_debug_info'] = [(node_name, port)] - log.debug('Sink: {} for node {}'.format(opoutput_node.id, node_name)) - log.debug(str(graph.node[opoutput_node.id])) - log.debug("Add edge from {} to {}".format(node_name, opoutput_node.id)) - return opoutput_node.id - - -def add_output_ops(graph: nx.MultiDiGraph, user_defined_outputs: dict, inputs: dict = None): +def add_output_ops(graph: Graph, user_defined_outputs: dict, inputs: dict = None): sinks = [] # func sets all layers as outputs in case of empty user_defined_outputs list (it's impossible to reach by cli) assert not (isinstance(user_defined_outputs, list) and not len(user_defined_outputs)) # remove previously generated OpOutput if any graph.remove_nodes_from([node_name for node_name in graph.nodes() if - 'type' in graph.node[node_name] and graph.node[node_name]['type'] == 'OpOutput']) + 'op' in graph.node[node_name] and graph.node[node_name]['op'] == 'OpOutput']) if user_defined_outputs is None: - inputs = get_nodes_with_attributes(graph, op='Placeholder') if inputs is None else list(inputs.keys()) + inputs = graph.get_nodes_with_attributes(op='Placeholder') if inputs is None else list(inputs.keys()) input_reachable, dead_outputs, undead_outputs = set(), [], [] for input in inputs: dfs(graph=graph, node_name=input, visited=input_reachable) @@ -885,12 +864,12 @@ def add_output_ops(graph: nx.MultiDiGraph, user_defined_outputs: dict, inputs: d return sinks -def set_is_input(graph: nx.MultiDiGraph, placeholders: list, is_input: bool): +def set_is_input(graph: Graph, placeholders: list, is_input: bool): for placeholder in placeholders: 
graph.node[placeholder]['is_input'] = is_input -def check_input(graph: nx.MultiDiGraph, node_name: str): +def check_input(graph: Graph, node_name: str): node = Node(graph, node_name) if node['kind'] == 'op' and node['op'] == 'Placeholder' and not len(graph.in_edges(node_name)) and not node[ 'is_input']: @@ -914,7 +893,7 @@ def split_node_in_port(node_id: str): return node_id, None -def add_input_op_input_port_without_data(graph: nx.MultiDiGraph, node_id: str, input_op, edge_attrs: dict): +def add_input_op_input_port_without_data(graph: Graph, node_id: str, input_op, edge_attrs: dict): input_node = input_op.create_node() graph.add_edge(input_node.id, node_id, **edge_attrs) log.debug('Input: {} for node {}'.format(input_node.id, node_id)) @@ -922,7 +901,7 @@ def add_input_op_input_port_without_data(graph: nx.MultiDiGraph, node_id: str, i return input_node.id -def add_input_op_input_port_with_data(graph: nx.MultiDiGraph, node_id: str, input_op, edge_attrs: dict): +def add_input_op_input_port_with_data(graph: Graph, node_id: str, input_op, edge_attrs: dict): input_data_node = input_op.create_node_with_data() input_node = input_data_node.in_node() graph.add_edge(input_data_node.id, node_id, **edge_attrs) @@ -933,7 +912,7 @@ def add_input_op_input_port_with_data(graph: nx.MultiDiGraph, node_id: str, inpu return input_node.id -def add_input_op_output_port_without_data(graph: nx.MultiDiGraph, node_id: str, input_op, port: int): +def add_input_op_output_port_without_data(graph: Graph, node_id: str, input_op, port: int): input_node = input_op.create_node() # In this case it can be more than one out edge from one port and we should iterate over all output edges for _, out_node, attrs in graph.out_edges(node_id, data=True): @@ -947,7 +926,7 @@ def add_input_op_output_port_without_data(graph: nx.MultiDiGraph, node_id: str, return input_node.id -def add_input_op_output_port_with_data(graph: nx.MultiDiGraph, node_id: str, input_op, port: int): +def add_input_op_output_port_with_data(graph: Graph, node_id: str, input_op, port: int): # we assume that after op always data node data_node = Node(graph, node_id).out_node(port) assert data_node.has_valid('kind') and data_node.kind == 'data' @@ -959,7 +938,7 @@ def add_input_op_output_port_with_data(graph: nx.MultiDiGraph, node_id: str, inp return input_node.id -def add_input_op(graph: nx.MultiDiGraph, node_id: str, port: int = 0, data: bool = False, shape=None, +def add_input_op(graph: Graph, node_id: str, port: int = 0, data: bool = False, shape=None, is_out_port: bool = False): """ This function adds Input node to node with id==node_id to specified port (in or out defined with is_out_port). 
@@ -996,7 +975,7 @@ def add_input_op(graph: nx.MultiDiGraph, node_id: str, port: int = 0, data: bool return new_input_id -def add_input_ops_helper_before_infer_input_port(graph: nx.MultiDiGraph, smart_node: Node, port: int, node_id: str, +def add_input_ops_helper_before_infer_input_port(graph: Graph, smart_node: Node, port: int, node_id: str, shape: np.array, inputs: list, edges_to_remove: list): n_inputs = len(smart_node.in_nodes()) if n_inputs > 1 and port is None: @@ -1010,7 +989,7 @@ def add_input_ops_helper_before_infer_input_port(graph: nx.MultiDiGraph, smart_n shape=shape)) -def add_input_ops_helper_after_infer_input_port(graph: nx.MultiDiGraph, smart_node: Node, port:int, node_id: str, +def add_input_ops_helper_after_infer_input_port(graph: Graph, smart_node: Node, port:int, node_id: str, inputs: list, edges_to_remove: list): n_inputs = len(smart_node.in_nodes()) if n_inputs > 1 and port is not None and port != 0: @@ -1029,7 +1008,7 @@ def add_input_ops_helper_after_infer_input_port(graph: nx.MultiDiGraph, smart_no edges_to_remove.append((in_node.id, node_id)) -def add_input_ops_helper_before_infer_output_port(graph: nx.MultiDiGraph, port:int, node_id: str, +def add_input_ops_helper_before_infer_output_port(graph: Graph, port:int, node_id: str, shape: np.array, inputs: list, edges_to_remove: list): for u, v, edge_attrs in graph.out_edges(node_id, data=True): if edge_attrs['out'] == port: @@ -1037,7 +1016,7 @@ def add_input_ops_helper_before_infer_output_port(graph: nx.MultiDiGraph, port:i inputs.append(add_input_op(graph=graph, node_id=node_id, port=port, data=False, shape=shape, is_out_port=True)) -def add_input_ops_helper_after_infer_output_port(graph: nx.MultiDiGraph, smart_node: Node, port:int, node_id: str, +def add_input_ops_helper_after_infer_output_port(graph: Graph, smart_node: Node, port:int, node_id: str, inputs: list, edges_to_remove: list): out_node = smart_node.out_node(port) shape = out_node['shape'] if 'shape' in out_node else None @@ -1049,7 +1028,7 @@ def add_input_ops_helper_after_infer_output_port(graph: nx.MultiDiGraph, smart_n edges_to_remove.append((node_id, out_node.id)) -def add_input_ops(graph: nx.MultiDiGraph, user_defined_inputs: dict, before_infer: bool): +def add_input_ops(graph: Graph, user_defined_inputs: dict, before_infer: bool): """ This function add user defined input operations. For cutting without port: @@ -1067,9 +1046,9 @@ def add_input_ops(graph: nx.MultiDiGraph, user_defined_inputs: dict, before_infe For case with before_infer=False data nodes are added to this schemes. 
""" inputs = [] - set_is_input(graph, get_nodes_with_attributes(graph, op='Placeholder'), False) + set_is_input(graph, graph.get_nodes_with_attributes(op='Placeholder'), False) if user_defined_inputs is None: - inputs = get_nodes_with_attributes(graph, op='Placeholder') + inputs = graph.get_nodes_with_attributes(op='Placeholder') else: # cutting the net by inputs assert isinstance(user_defined_inputs, dict) @@ -1137,7 +1116,7 @@ def add_input_ops(graph: nx.MultiDiGraph, user_defined_inputs: dict, before_infe if len(inputs): set_is_input(graph, inputs, True) # Check if there are inputs that are not listed in user_defined_inputs and are needed to calculate outputs - outputs = get_nodes_with_attributes(graph, is_output=True) + outputs = graph.get_nodes_with_attributes(op='OpOutput') visited = set() for output_name in outputs: reverse_dfs(graph, output_name, check_input, visited) @@ -1145,13 +1124,12 @@ def add_input_ops(graph: nx.MultiDiGraph, user_defined_inputs: dict, before_infe return inputs -def remove_output_ops(graph: nx.MultiDiGraph): +def remove_output_ops(graph: Graph): for node in list(graph.nodes()): node = Node(graph, node) if node.has_valid('op') and node.op == 'OpOutput': if len(node.in_nodes()) > 0: assert (len(node.in_nodes()) == 1) - list(node.in_nodes().values())[0]['is_output'] = node.is_output graph.remove_node(node.id) diff --git a/model-optimizer/mo/front/extractor_test.py b/model-optimizer/mo/front/extractor_test.py index 5fcb5eb..1d6840a 100644 --- a/model-optimizer/mo/front/extractor_test.py +++ b/model-optimizer/mo/front/extractor_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -37,7 +37,8 @@ class FakePythonParam: nodes_attributes = {'input': {'kind': 'data'}, 'pool_1': {'type': 'Pooling', 'kind': 'op'}, - 'output': {'kind': 'data'} + 'output': {'kind': 'data'}, + 'op_output': {'kind': 'op', 'op': 'OpOutput'}, } @@ -60,10 +61,12 @@ class TestExtractor(unittest.TestCase): } graph = build_graph(nodes_attributes, [('input', 'pool_1'), - ('pool_1', 'output')], + ('pool_1', 'output'), + ('output', 'op_output') + ], {'input': {'shape': input_shape}, 'pool_1': {**params, 'spatial_dims': [2, 3]}, - 'output': {'is_output': True, 'shape': None}}) + 'output': {'shape': None}}) pool_1_node = Node(graph, 'pool_1') for param in params.keys(): if type(params[param]) is np.ndarray: @@ -89,10 +92,12 @@ class TestExtractor(unittest.TestCase): } graph = build_graph(nodes, [('input', 'reshape'), - ('reshape', 'output')], + ('reshape', 'output'), + ('output', 'op_output') + ], {'input': {'shape': input_shape}, 'reshape': {**params, 'spatial_dims': [2, 3]}, - 'output': {'is_output': True, 'shape': None}}) + 'output': {'shape': None}}) pool_1_node = Node(graph, 'reshape') for param in params.keys(): if type(params[param]) is list: @@ -244,8 +249,9 @@ class TestInputAddition(unittest.TestCase): 'conv_1_data': {'kind': 'data', 'value': True, 'shape': np.array([-1, 224, 224, 3])}, 'relu_1': {'type': 'ReLU', 'kind': 'op', 'op': 'NotPlaceholder'}, 'relu_1_data': {'kind': 'data', 'value': None, 'shape': np.array([-1, 112, 112, 64])}, - 'output': {'type': 'SoftMax', 'kind': 'op', 'op': 'NotPlaceholder', 'is_output': True}, - 'output_data': {'name': 'output_data', 'kind': 'data', 'shape': np.array([-1, 112, 112, 64])} + 'output': {'type': 'SoftMax', 'kind': 'op', 'op': 'NotPlaceholder'}, + 'output_data': {'name': 'output_data', 'kind': 'data', 'shape': np.array([-1, 112, 112, 64])}, + 'op_output': {'kind': 'op', 'op': 'OpOutput'} } edges = [ ('old_input', 'old_input_data'), @@ -254,7 +260,8 @@ class TestInputAddition(unittest.TestCase): ('conv_1_data', 'relu_1'), ('relu_1', 'relu_1_data'), ('relu_1_data', 'output'), - ('output', 'output_data') + ('output', 'output_data'), + ('output_data', 'op_output') ] graph = build_graph(nodes, edges) add_input_ops(graph=graph, user_defined_inputs=inputs, before_infer=False) @@ -277,7 +284,7 @@ class TestInputAddition(unittest.TestCase): 'node_2': {'type': 'Identity', 'kind': 'op', 'op': 'NotPlaceholder'}, 'node_3': {'type': 'Identity', 'kind': 'op', 'op': 'NotPlaceholder'}, 'node_4': {'type': 'Identity', 'kind': 'op', 'op': 'NotPlaceholder'}, - 'output': {'type': 'Identity', 'kind': 'op', 'op': 'OpOutput', 'is_output': True} + 'output': {'kind': 'op', 'op': 'OpOutput'} } edges = [ ('input_1', 'node_1'), @@ -309,7 +316,7 @@ class TestInputAddition(unittest.TestCase): 'node_2': {'type': 'Identity', 'kind': 'op', 'op': 'NotPlaceholder'}, 'node_3': {'type': 'Identity', 'kind': 'op', 'op': 'NotPlaceholder'}, 'node_4': {'type': 'Identity', 'kind': 'op', 'op': 'NotPlaceholder'}, - 'output': {'type': 'Identity', 'kind': 'op', 'op': 'OpOutput', 'is_output': True}, + 'output': { 'kind': 'op', 'op': 'OpOutput'}, 'input_3': {'type': 'Identity', 'kind': 'op', 'op': 'Placeholder'} } edges = [ diff --git a/model-optimizer/mo/front/kaldi/extractor.py b/model-optimizer/mo/front/kaldi/extractor.py index f0e3b3b..d970019 100644 --- a/model-optimizer/mo/front/kaldi/extractor.py +++ b/model-optimizer/mo/front/kaldi/extractor.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 
2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/add_shift_ext.py b/model-optimizer/mo/front/kaldi/extractors/add_shift_ext.py index ff5dff9..6c9d566 100644 --- a/model-optimizer/mo/front/kaldi/extractors/add_shift_ext.py +++ b/model-optimizer/mo/front/kaldi/extractors/add_shift_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/add_shift_ext_test.py b/model-optimizer/mo/front/kaldi/extractors/add_shift_ext_test.py index 08703d2..0b5f46a 100644 --- a/model-optimizer/mo/front/kaldi/extractors/add_shift_ext_test.py +++ b/model-optimizer/mo/front/kaldi/extractors/add_shift_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/affine_component_ext.py b/model-optimizer/mo/front/kaldi/extractors/affine_component_ext.py index 7900639..347b4fe 100644 --- a/model-optimizer/mo/front/kaldi/extractors/affine_component_ext.py +++ b/model-optimizer/mo/front/kaldi/extractors/affine_component_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/affine_component_ext_test.py b/model-optimizer/mo/front/kaldi/extractors/affine_component_ext_test.py index 14b083b..691525a 100644 --- a/model-optimizer/mo/front/kaldi/extractors/affine_component_ext_test.py +++ b/model-optimizer/mo/front/kaldi/extractors/affine_component_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/affine_component_preconditioned_online_ext.py b/model-optimizer/mo/front/kaldi/extractors/affine_component_preconditioned_online_ext.py index 70a8c41..7aa11d1 100644 --- a/model-optimizer/mo/front/kaldi/extractors/affine_component_preconditioned_online_ext.py +++ b/model-optimizer/mo/front/kaldi/extractors/affine_component_preconditioned_online_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/affine_transform_ext.py b/model-optimizer/mo/front/kaldi/extractors/affine_transform_ext.py index 8175fb1..cb807a7 100644 --- a/model-optimizer/mo/front/kaldi/extractors/affine_transform_ext.py +++ b/model-optimizer/mo/front/kaldi/extractors/affine_transform_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/front/kaldi/extractors/affine_transform_ext_test.py b/model-optimizer/mo/front/kaldi/extractors/affine_transform_ext_test.py index 7b9f41c..6a4925e 100644 --- a/model-optimizer/mo/front/kaldi/extractors/affine_transform_ext_test.py +++ b/model-optimizer/mo/front/kaldi/extractors/affine_transform_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/common_ext_test.py b/model-optimizer/mo/front/kaldi/extractors/common_ext_test.py index e9cdb98..24e9077 100644 --- a/model-optimizer/mo/front/kaldi/extractors/common_ext_test.py +++ b/model-optimizer/mo/front/kaldi/extractors/common_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,12 +21,12 @@ import numpy as np from mo.front.common.partial_infer.utils import int64_array from mo.front.kaldi.loader.utils_test import TestKaldiUtilsLoading -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.utils.unittest.graph import build_graph class KaldiFrontExtractorTest(unittest.TestCase): - graph = nx.MultiDiGraph() + graph = Graph() @classmethod def setUp(cls): diff --git a/model-optimizer/mo/front/kaldi/extractors/concat_ext.py b/model-optimizer/mo/front/kaldi/extractors/concat_ext.py index 9299c7c..aa339cb 100644 --- a/model-optimizer/mo/front/kaldi/extractors/concat_ext.py +++ b/model-optimizer/mo/front/kaldi/extractors/concat_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/concat_ext_test.py b/model-optimizer/mo/front/kaldi/extractors/concat_ext_test.py index b2274ba..b0f05cb 100644 --- a/model-optimizer/mo/front/kaldi/extractors/concat_ext_test.py +++ b/model-optimizer/mo/front/kaldi/extractors/concat_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/convolutional_1d_component_ext.py b/model-optimizer/mo/front/kaldi/extractors/convolutional_1d_component_ext.py index d77eeb3..fa46c97 100644 --- a/model-optimizer/mo/front/kaldi/extractors/convolutional_1d_component_ext.py +++ b/model-optimizer/mo/front/kaldi/extractors/convolutional_1d_component_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/front/kaldi/extractors/convolutional_component_ext.py b/model-optimizer/mo/front/kaldi/extractors/convolutional_component_ext.py index 21a1e33..af9fa91 100644 --- a/model-optimizer/mo/front/kaldi/extractors/convolutional_component_ext.py +++ b/model-optimizer/mo/front/kaldi/extractors/convolutional_component_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/convolutional_component_ext_test.py b/model-optimizer/mo/front/kaldi/extractors/convolutional_component_ext_test.py index 50fef84..b030422 100644 --- a/model-optimizer/mo/front/kaldi/extractors/convolutional_component_ext_test.py +++ b/model-optimizer/mo/front/kaldi/extractors/convolutional_component_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/copy_ext.py b/model-optimizer/mo/front/kaldi/extractors/copy_ext.py index 3348ef1..6237e9a 100644 --- a/model-optimizer/mo/front/kaldi/extractors/copy_ext.py +++ b/model-optimizer/mo/front/kaldi/extractors/copy_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/fixed_affine_component_ext.py b/model-optimizer/mo/front/kaldi/extractors/fixed_affine_component_ext.py index eee267f..799971b 100644 --- a/model-optimizer/mo/front/kaldi/extractors/fixed_affine_component_ext.py +++ b/model-optimizer/mo/front/kaldi/extractors/fixed_affine_component_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/fixed_affine_component_ext_test.py b/model-optimizer/mo/front/kaldi/extractors/fixed_affine_component_ext_test.py index e03f698..731c436 100644 --- a/model-optimizer/mo/front/kaldi/extractors/fixed_affine_component_ext_test.py +++ b/model-optimizer/mo/front/kaldi/extractors/fixed_affine_component_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/lstm_projected_streams_ext.py b/model-optimizer/mo/front/kaldi/extractors/lstm_projected_streams_ext.py index 09e8061..a18c384 100644 --- a/model-optimizer/mo/front/kaldi/extractors/lstm_projected_streams_ext.py +++ b/model-optimizer/mo/front/kaldi/extractors/lstm_projected_streams_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/front/kaldi/extractors/max_pooling_ext.py b/model-optimizer/mo/front/kaldi/extractors/max_pooling_ext.py index 0e38dd3..a1c8cf9 100644 --- a/model-optimizer/mo/front/kaldi/extractors/max_pooling_ext.py +++ b/model-optimizer/mo/front/kaldi/extractors/max_pooling_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/max_pooling_ext_test.py b/model-optimizer/mo/front/kaldi/extractors/max_pooling_ext_test.py index b3e7ad1..4b68387 100644 --- a/model-optimizer/mo/front/kaldi/extractors/max_pooling_ext_test.py +++ b/model-optimizer/mo/front/kaldi/extractors/max_pooling_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/normalize_component_ext.py b/model-optimizer/mo/front/kaldi/extractors/normalize_component_ext.py index 4d1e9e9..c5b397c 100644 --- a/model-optimizer/mo/front/kaldi/extractors/normalize_component_ext.py +++ b/model-optimizer/mo/front/kaldi/extractors/normalize_component_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -35,8 +35,8 @@ class NormalizeComponentFrontExtractor(FrontExtractorOp): d_scaled = dim * target_rms ** 2 in_norm = np.zeros([dim], np.float64) in_norm += 1.0 / d_scaled - in_norm = np.maximum(in_norm, 2. ** (-66)) - in_norm = np.power(in_norm, -0.5) + in_norm = np.maximum(in_norm, 2. ** (-66)) # pylint: disable=assignment-from-no-return + in_norm = np.power(in_norm, -0.5) # pylint: disable=assignment-from-no-return attrs = {} embed_input(attrs, 1, 'weights', in_norm) ScaleShiftOp.update_node_stat(node, attrs) diff --git a/model-optimizer/mo/front/kaldi/extractors/rectified_linear_component_ext.py b/model-optimizer/mo/front/kaldi/extractors/rectified_linear_component_ext.py index 713db4b..4b09cd3 100644 --- a/model-optimizer/mo/front/kaldi/extractors/rectified_linear_component_ext.py +++ b/model-optimizer/mo/front/kaldi/extractors/rectified_linear_component_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/rescale_ext.py b/model-optimizer/mo/front/kaldi/extractors/rescale_ext.py index 459e558..ff2c57d 100644 --- a/model-optimizer/mo/front/kaldi/extractors/rescale_ext.py +++ b/model-optimizer/mo/front/kaldi/extractors/rescale_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
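Editor's note: the `# pylint: disable=assignment-from-no-return` comments added to normalize_component_ext.py above silence a pylint false positive (it cannot infer that `np.maximum` and `np.power` return arrays). The computation itself folds Kaldi's NormalizeComponent into a ScaleShift with a constant per-channel weight of `sqrt(dim) * target_rms`; a standalone check under hypothetical values of `dim` and `target_rms`:

```python
import numpy as np

dim, target_rms = 4, 1.0
d_scaled = dim * target_rms ** 2
in_norm = np.zeros([dim], np.float64) + 1.0 / d_scaled
in_norm = np.maximum(in_norm, 2. ** (-66))  # guard against division blow-up
in_norm = np.power(in_norm, -0.5)           # per-channel scale

# When the 2**-66 clamp is inactive, the weight is exactly sqrt(dim) * target_rms.
assert np.allclose(in_norm, np.sqrt(dim) * target_rms)
```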
diff --git a/model-optimizer/mo/front/kaldi/extractors/rescale_ext_test.py b/model-optimizer/mo/front/kaldi/extractors/rescale_ext_test.py index b7628bb..c0a160f 100644 --- a/model-optimizer/mo/front/kaldi/extractors/rescale_ext_test.py +++ b/model-optimizer/mo/front/kaldi/extractors/rescale_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/sigmoid_ext.py b/model-optimizer/mo/front/kaldi/extractors/sigmoid_ext.py index a68ad4f..36bd4b3 100644 --- a/model-optimizer/mo/front/kaldi/extractors/sigmoid_ext.py +++ b/model-optimizer/mo/front/kaldi/extractors/sigmoid_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/sigmoid_ext_test.py b/model-optimizer/mo/front/kaldi/extractors/sigmoid_ext_test.py index 521ac06..638ed6e 100644 --- a/model-optimizer/mo/front/kaldi/extractors/sigmoid_ext_test.py +++ b/model-optimizer/mo/front/kaldi/extractors/sigmoid_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/slice_ext.py b/model-optimizer/mo/front/kaldi/extractors/slice_ext.py index 4235c0d..379571c 100644 --- a/model-optimizer/mo/front/kaldi/extractors/slice_ext.py +++ b/model-optimizer/mo/front/kaldi/extractors/slice_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/slice_ext_test.py b/model-optimizer/mo/front/kaldi/extractors/slice_ext_test.py index 0c2a16c..47ae3ed 100644 --- a/model-optimizer/mo/front/kaldi/extractors/slice_ext_test.py +++ b/model-optimizer/mo/front/kaldi/extractors/slice_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/softmax_ext.py b/model-optimizer/mo/front/kaldi/extractors/softmax_ext.py index da9f0a1..1dee868 100644 --- a/model-optimizer/mo/front/kaldi/extractors/softmax_ext.py +++ b/model-optimizer/mo/front/kaldi/extractors/softmax_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/front/kaldi/extractors/splice_component_ext.py b/model-optimizer/mo/front/kaldi/extractors/splice_component_ext.py index 47cbc23..da39914 100644 --- a/model-optimizer/mo/front/kaldi/extractors/splice_component_ext.py +++ b/model-optimizer/mo/front/kaldi/extractors/splice_component_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/tanh_component_ext.py b/model-optimizer/mo/front/kaldi/extractors/tanh_component_ext.py index e67f9c4..e75ed77 100644 --- a/model-optimizer/mo/front/kaldi/extractors/tanh_component_ext.py +++ b/model-optimizer/mo/front/kaldi/extractors/tanh_component_ext.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/extractors/tanh_ext_test.py b/model-optimizer/mo/front/kaldi/extractors/tanh_ext_test.py index 4604022..3fb0daf 100644 --- a/model-optimizer/mo/front/kaldi/extractors/tanh_ext_test.py +++ b/model-optimizer/mo/front/kaldi/extractors/tanh_ext_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/loader/loader.py b/model-optimizer/mo/front/kaldi/loader/loader.py index 8bf9085..9f0bdf3 100644 --- a/model-optimizer/mo/front/kaldi/loader/loader.py +++ b/model-optimizer/mo/front/kaldi/loader/loader.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ import logging as log from mo.front.kaldi.loader.utils import find_next_tag, read_placeholder, find_next_component, get_name_from_path, \ find_end_of_component, end_of_nnet_tag, read_binary_integer32_token, get_parameters, read_token_value, collect_until_token, \ create_edge_attrs -from mo.graph.graph import unique_id, Node +from mo.graph.graph import Node, Graph from mo.utils.error import Error from mo.utils.utils import refer_to_faq_msg @@ -39,7 +39,7 @@ def read_counts_file(file_path): counts_line = file_content[0].strip().replace('[', '').replace(']', '') try: - counts = np.fromstring(counts_line, dtype=int, sep=' ') + counts = np.fromstring(counts_line, dtype=float, sep=' ') except TypeError: raise Error('Expect counts file to contain list of integers.' + refer_to_faq_msg(90)) @@ -47,12 +47,12 @@ def read_counts_file(file_path): cutoff_idxs = np.where(counts < cutoff) counts[cutoff_idxs] = cutoff scale = 1.0 / np.sum(counts) - counts = np.log(counts * scale) + counts = np.log(counts * scale) # pylint: disable=assignment-from-no-return counts[cutoff_idxs] += np.finfo(np.float32).max / 2 return counts -def load_parallel_component(file_descr, graph: nx.MultiDiGraph, prev_layer_id): +def load_parallel_component(file_descr, graph: Graph, prev_layer_id): """ Load ParallelComponent of the Kaldi model. ParallelComponent contains parallel nested networks. 
@@ -67,7 +67,7 @@ def load_parallel_component(file_descr, graph: nx.MultiDiGraph, prev_layer_id): nnet_count = read_token_value(file_descr, b'') log.debug('Model contains parallel component with {} nested networks'.format(nnet_count)) - slice_id = unique_id(graph, prefix='Slice') + slice_id = graph.unique_id(prefix='Slice') graph.add_node(slice_id, parameters=None, op='slice', kind='op') slice_node = Node(graph, slice_id) @@ -84,7 +84,7 @@ def load_parallel_component(file_descr, graph: nx.MultiDiGraph, prev_layer_id): if i != nnet_count - 1: slices_points.append(shape[1]) g.remove_node(input_nodes[0][0]) - mapping = {node: unique_id(graph, node) for node in g.nodes(data=False) if node in graph} + mapping = {node: graph.unique_id(node) for node in g.nodes(data=False) if node in graph} g = nx.relabel_nodes(g, mapping) for val in mapping.values(): g.node[val]['name'] = val @@ -99,7 +99,7 @@ def load_parallel_component(file_descr, graph: nx.MultiDiGraph, prev_layer_id): for i in slices_points: packed_sp += struct.pack("I", i) slice_node.parameters = io.BytesIO(packed_sp) - concat_id = unique_id(graph, prefix='Concat') + concat_id = graph.unique_id(prefix='Concat') graph.add_node(concat_id, parameters=None, op='concat', kind='op') for i, output in enumerate(outputs): edge_attrs = create_edge_attrs(output, concat_id) @@ -113,7 +113,6 @@ def load_kaldi_model(nnet_path): Structure of the file is the following: magic-number(16896) weights etc. :param nnet_path: - :param check_sum: :return: """ nnet_name = None @@ -140,7 +139,7 @@ def load_kaldi_model(nnet_path): def load_kalid_nnet1_model(file_descr, name): - graph = nx.MultiDiGraph(name=name) + graph = Graph(name=name) prev_layer_id = 'Input' graph.add_node(prev_layer_id, name=prev_layer_id, kind='op', op='Input', parameters=None) @@ -161,7 +160,7 @@ def load_kalid_nnet1_model(file_descr, name): start_index = file_descr.tell() end_tag, end_index = find_end_of_component(file_descr, component_type) end_index -= len(end_tag) - layer_id = unique_id(graph, prefix=component_type) + layer_id = graph.unique_id(prefix=component_type) graph.add_node(layer_id, parameters=get_parameters(file_descr, start_index, end_index), op=component_type, @@ -180,8 +179,9 @@ def load_kalid_nnet1_model(file_descr, name): def load_kalid_nnet2_model(file_descr, nnet_name): - graph = nx.MultiDiGraph(name=nnet_name) + graph = Graph(name=nnet_name) input_name = 'Input' + input_shape = np.array([]) graph.add_node(input_name, name=input_name, kind='op', op='Input', parameters=None, shape=None) prev_layer_id = input_name @@ -197,7 +197,7 @@ def load_kalid_nnet2_model(file_descr, nnet_name): break start_index = file_descr.tell() end_tag, end_index = find_end_of_component(file_descr, component_type) - layer_id = unique_id(graph, prefix=component_type) + layer_id = graph.unique_id(prefix=component_type) graph.add_node(layer_id, parameters=get_parameters(file_descr, start_index, end_index), op=component_type, diff --git a/model-optimizer/mo/front/kaldi/loader/utils.py b/model-optimizer/mo/front/kaldi/loader/utils.py index 4dbba94..55f46a4 100644 --- a/model-optimizer/mo/front/kaldi/loader/utils.py +++ b/model-optimizer/mo/front/kaldi/loader/utils.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
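Editor's note: besides the `graph.unique_id` migration, the loader.py hunk above changes `read_counts_file` to parse with `dtype=float`; Kaldi counts files commonly store values in scientific notation (e.g. `1.5e+06`), which `np.fromstring(..., dtype=int)` cannot parse. A condensed, self-contained sketch of the transformation follows; the `cutoff` value is hypothetical, since the patch does not show its definition.

```python
import numpy as np


def log_priors(counts_line: str, cutoff: float = 1e-15):  # cutoff value is hypothetical
    # dtype=float, not int: counts may be written in scientific notation
    counts = np.fromstring(counts_line.strip().replace('[', '').replace(']', ''),
                           dtype=float, sep=' ')
    cutoff_idxs = np.where(counts < cutoff)
    counts[cutoff_idxs] = cutoff          # clamp pruned classes
    scale = 1.0 / np.sum(counts)
    counts = np.log(counts * scale)       # convert counts to log-priors
    counts[cutoff_idxs] += np.finfo(np.float32).max / 2  # offset pruned classes for downstream handling
    return counts


print(log_priors('[ 1.5e+06 3e+06 0 ]'))
```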
diff --git a/model-optimizer/mo/front/kaldi/loader/utils_test.py b/model-optimizer/mo/front/kaldi/loader/utils_test.py index ba5b06b..b026069 100644 --- a/model-optimizer/mo/front/kaldi/loader/utils_test.py +++ b/model-optimizer/mo/front/kaldi/loader/utils_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/kaldi/register_custom_ops.py b/model-optimizer/mo/front/kaldi/register_custom_ops.py index 237ee91..719c6df 100644 --- a/model-optimizer/mo/front/kaldi/register_custom_ops.py +++ b/model-optimizer/mo/front/kaldi/register_custom_ops.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,14 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. """ -from mo.back.replacement import BackReplacementPattern -from mo.front.common.replacement import FrontReplacementOp, FrontReplacementSubgraph +from mo.front.common.replacement import FrontReplacementOp, FrontReplacementSubgraph, FrontReplacementPattern from mo.front.extractor import FrontExtractorOp -from mo.middle.replacement import MiddleReplacementPattern -from mo.ops.op import Op -from mo.utils import class_registration -def update_registration(): - class_registration.update_registration([Op, FrontExtractorOp, FrontReplacementOp, FrontReplacementSubgraph, - MiddleReplacementPattern, BackReplacementPattern]) +def get_front_classes(): + front_classes = [FrontExtractorOp, FrontReplacementOp, FrontReplacementPattern, FrontReplacementSubgraph] + return front_classes diff --git a/model-optimizer/mo/front/kaldi/utils.py b/model-optimizer/mo/front/kaldi/utils.py index f29a643..76af016 100644 --- a/model-optimizer/mo/front/kaldi/utils.py +++ b/model-optimizer/mo/front/kaldi/utils.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractor.py b/model-optimizer/mo/front/mxnet/extractor.py index ad613f8..c6e2d0c 100644 --- a/model-optimizer/mo/front/mxnet/extractor.py +++ b/model-optimizer/mo/front/mxnet/extractor.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/activation.py b/model-optimizer/mo/front/mxnet/extractors/activation.py index 21b5635..fe23d75 100644 --- a/model-optimizer/mo/front/mxnet/extractors/activation.py +++ b/model-optimizer/mo/front/mxnet/extractors/activation.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
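Editor's note: the register_custom_ops.py change above (mirrored for MXNet further below) replaces an eager `update_registration()` call with a `get_front_classes()` accessor, so each framework module only lists its front-phase classes and a central driver decides when to register them. A hypothetical, self-contained illustration of that contract, with stub classes standing in for the real ones:

```python
# Stub front-phase classes standing in for mo.front.* base classes.
class FrontExtractorOp:
    pass


class FrontReplacementOp:
    pass


def get_front_classes():
    # What kaldi/mxnet register_custom_ops now export: just the class list.
    return [FrontExtractorOp, FrontReplacementOp]


def update_registration(classes):
    # Stand-in for mo.utils.class_registration.update_registration.
    for cls in classes:
        print('registering', cls.__name__)


# The driver, not the framework module, now triggers registration.
update_registration(get_front_classes())
```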
diff --git a/model-optimizer/mo/front/mxnet/extractors/activation_test.py b/model-optimizer/mo/front/mxnet/extractors/activation_test.py index d7e034c..eda8a0b 100644 --- a/model-optimizer/mo/front/mxnet/extractors/activation_test.py +++ b/model-optimizer/mo/front/mxnet/extractors/activation_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/add_n.py b/model-optimizer/mo/front/mxnet/extractors/add_n.py index a1fe83c..b77705f 100644 --- a/model-optimizer/mo/front/mxnet/extractors/add_n.py +++ b/model-optimizer/mo/front/mxnet/extractors/add_n.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/batchnorm.py b/model-optimizer/mo/front/mxnet/extractors/batchnorm.py index 0d81625..5b3cc8f 100644 --- a/model-optimizer/mo/front/mxnet/extractors/batchnorm.py +++ b/model-optimizer/mo/front/mxnet/extractors/batchnorm.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/concat.py b/model-optimizer/mo/front/mxnet/extractors/concat.py index 84c651f..85c0c14 100644 --- a/model-optimizer/mo/front/mxnet/extractors/concat.py +++ b/model-optimizer/mo/front/mxnet/extractors/concat.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/crop.py b/model-optimizer/mo/front/mxnet/extractors/crop.py index 28cb464..a5cf6d3 100644 --- a/model-optimizer/mo/front/mxnet/extractors/crop.py +++ b/model-optimizer/mo/front/mxnet/extractors/crop.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/crop_test.py b/model-optimizer/mo/front/mxnet/extractors/crop_test.py index 06b839c..50fbb6c 100644 --- a/model-optimizer/mo/front/mxnet/extractors/crop_test.py +++ b/model-optimizer/mo/front/mxnet/extractors/crop_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/eltwise.py b/model-optimizer/mo/front/mxnet/extractors/eltwise.py index 61f2065..91c74b7 100644 --- a/model-optimizer/mo/front/mxnet/extractors/eltwise.py +++ b/model-optimizer/mo/front/mxnet/extractors/eltwise.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/front/mxnet/extractors/eltwise_test.py b/model-optimizer/mo/front/mxnet/extractors/eltwise_test.py index 4d07e57..46d0f88 100644 --- a/model-optimizer/mo/front/mxnet/extractors/eltwise_test.py +++ b/model-optimizer/mo/front/mxnet/extractors/eltwise_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/fully_connected.py b/model-optimizer/mo/front/mxnet/extractors/fully_connected.py index 9322990..78dae8d 100644 --- a/model-optimizer/mo/front/mxnet/extractors/fully_connected.py +++ b/model-optimizer/mo/front/mxnet/extractors/fully_connected.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/l2_normalization.py b/model-optimizer/mo/front/mxnet/extractors/l2_normalization.py index f73cf9e..0166230 100644 --- a/model-optimizer/mo/front/mxnet/extractors/l2_normalization.py +++ b/model-optimizer/mo/front/mxnet/extractors/l2_normalization.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/leaky_relu.py b/model-optimizer/mo/front/mxnet/extractors/leaky_relu.py index a204643..9537bbb 100644 --- a/model-optimizer/mo/front/mxnet/extractors/leaky_relu.py +++ b/model-optimizer/mo/front/mxnet/extractors/leaky_relu.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/leaky_relu_test.py b/model-optimizer/mo/front/mxnet/extractors/leaky_relu_test.py index f3fab2b..7d660ea 100644 --- a/model-optimizer/mo/front/mxnet/extractors/leaky_relu_test.py +++ b/model-optimizer/mo/front/mxnet/extractors/leaky_relu_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/lrn.py b/model-optimizer/mo/front/mxnet/extractors/lrn.py index b6dbf34..c313f92 100644 --- a/model-optimizer/mo/front/mxnet/extractors/lrn.py +++ b/model-optimizer/mo/front/mxnet/extractors/lrn.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/front/mxnet/extractors/multibox_detection.py b/model-optimizer/mo/front/mxnet/extractors/multibox_detection.py index 0e81c97..6245904 100644 --- a/model-optimizer/mo/front/mxnet/extractors/multibox_detection.py +++ b/model-optimizer/mo/front/mxnet/extractors/multibox_detection.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/multibox_detection_test.py b/model-optimizer/mo/front/mxnet/extractors/multibox_detection_test.py index c6e4c0c..5f1f1ab 100644 --- a/model-optimizer/mo/front/mxnet/extractors/multibox_detection_test.py +++ b/model-optimizer/mo/front/mxnet/extractors/multibox_detection_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/multibox_prior.py b/model-optimizer/mo/front/mxnet/extractors/multibox_prior.py index 7284eb7..7e69277 100644 --- a/model-optimizer/mo/front/mxnet/extractors/multibox_prior.py +++ b/model-optimizer/mo/front/mxnet/extractors/multibox_prior.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/multibox_prior_test.py b/model-optimizer/mo/front/mxnet/extractors/multibox_prior_test.py index cc2cc8f..38501fd 100644 --- a/model-optimizer/mo/front/mxnet/extractors/multibox_prior_test.py +++ b/model-optimizer/mo/front/mxnet/extractors/multibox_prior_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/null.py b/model-optimizer/mo/front/mxnet/extractors/null.py index c53da6d..a49f69d 100644 --- a/model-optimizer/mo/front/mxnet/extractors/null.py +++ b/model-optimizer/mo/front/mxnet/extractors/null.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/relu.py b/model-optimizer/mo/front/mxnet/extractors/relu.py index 41400c5..71693d6 100644 --- a/model-optimizer/mo/front/mxnet/extractors/relu.py +++ b/model-optimizer/mo/front/mxnet/extractors/relu.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/front/mxnet/extractors/relu_test.py b/model-optimizer/mo/front/mxnet/extractors/relu_test.py index c045d86..881a309 100644 --- a/model-optimizer/mo/front/mxnet/extractors/relu_test.py +++ b/model-optimizer/mo/front/mxnet/extractors/relu_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/scaleshift.py b/model-optimizer/mo/front/mxnet/extractors/scaleshift.py index 23d4b5d..dbc89e0 100644 --- a/model-optimizer/mo/front/mxnet/extractors/scaleshift.py +++ b/model-optimizer/mo/front/mxnet/extractors/scaleshift.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/sigmoid.py b/model-optimizer/mo/front/mxnet/extractors/sigmoid.py index 79b0c67..834b0a5 100644 --- a/model-optimizer/mo/front/mxnet/extractors/sigmoid.py +++ b/model-optimizer/mo/front/mxnet/extractors/sigmoid.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/sigmoid_test.py b/model-optimizer/mo/front/mxnet/extractors/sigmoid_test.py index fcf5893..ba73f6c 100644 --- a/model-optimizer/mo/front/mxnet/extractors/sigmoid_test.py +++ b/model-optimizer/mo/front/mxnet/extractors/sigmoid_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/slice_axis.py b/model-optimizer/mo/front/mxnet/extractors/slice_axis.py index 956c177..046c410 100644 --- a/model-optimizer/mo/front/mxnet/extractors/slice_axis.py +++ b/model-optimizer/mo/front/mxnet/extractors/slice_axis.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/slice_axis_test.py b/model-optimizer/mo/front/mxnet/extractors/slice_axis_test.py index 435044d..246d88b 100644 --- a/model-optimizer/mo/front/mxnet/extractors/slice_axis_test.py +++ b/model-optimizer/mo/front/mxnet/extractors/slice_axis_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/front/mxnet/extractors/transpose.py b/model-optimizer/mo/front/mxnet/extractors/transpose.py index 985f40c..d0d7b32 100644 --- a/model-optimizer/mo/front/mxnet/extractors/transpose.py +++ b/model-optimizer/mo/front/mxnet/extractors/transpose.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/extractors/utils.py b/model-optimizer/mo/front/mxnet/extractors/utils.py index 8c8d23d..3358ccd 100644 --- a/model-optimizer/mo/front/mxnet/extractors/utils.py +++ b/model-optimizer/mo/front/mxnet/extractors/utils.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -87,10 +87,11 @@ class AttrDictionary(object): def val(self, key, valtype, default=None): attr = self.str(key, default) + attr = None if attr == 'None' else attr if valtype is None: return attr else: - if not isinstance(attr, valtype): + if not isinstance(attr, valtype) and attr is not None: return valtype(attr) else: return attr @@ -178,3 +179,15 @@ def load_params(input_model, data_names = ('data',)): model_params._param_names = arg_keys model_params._aux_names = aux_keys return model_params + + +def init_rnn_states(model_nodes): + states = {} + for i, node in enumerate(model_nodes): + if node['op'] == 'RNN': + for i in node['inputs'][2:]: + attrs = get_mxnet_layer_attrs(model_nodes[i[0]]) + shape = attrs.tuple('__shape__', int, None) + if shape: + states.update({model_nodes[i[0]]['name']: shape}) + return states \ No newline at end of file diff --git a/model-optimizer/mo/front/mxnet/extractors/utils_test.py b/model-optimizer/mo/front/mxnet/extractors/utils_test.py index 070d532..b523162 100644 --- a/model-optimizer/mo/front/mxnet/extractors/utils_test.py +++ b/model-optimizer/mo/front/mxnet/extractors/utils_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -169,6 +169,15 @@ class TestAttrDictionary(unittest.TestCase): self.assertEqual(2, l[1]) self.assertEqual(3, l[2]) + def testIntWithAttrNone(self): + attrs = { + "something": "None" + } + + attr_dict = AttrDictionary(attrs) + attr = attr_dict.int("something", None) + self.assertEqual(None, attr) + class TestUtils(unittest.TestCase): @patch('mxnet.nd.load') diff --git a/model-optimizer/mo/front/mxnet/loader.py b/model-optimizer/mo/front/mxnet/loader.py index 219abb1..4bf85ba 100644 --- a/model-optimizer/mo/front/mxnet/loader.py +++ b/model-optimizer/mo/front/mxnet/loader.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
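Editor's note: the mxnet extractors/utils.py hunk above makes `AttrDictionary.val()` treat the literal string `'None'` (as MXNet serializes absent attribute values) as Python `None` and skip the `valtype` conversion, which previously raised on calls like `int('None')`; the new `testIntWithAttrNone` covers exactly this. A condensed sketch of the patched behavior:

```python
class AttrDictionary:
    """Condensed sketch of the patched mo.front.mxnet AttrDictionary.val() logic."""

    def __init__(self, attrs):
        self._attrs = attrs

    def str(self, key, default=None):
        return self._attrs.get(key, default)

    def val(self, key, valtype, default=None):
        attr = self.str(key, default)
        attr = None if attr == 'None' else attr  # normalize serialized 'None'
        if valtype is None:
            return attr
        if not isinstance(attr, valtype) and attr is not None:
            return valtype(attr)  # conversion is now skipped for None
        return attr

    def int(self, key, default=None):
        return self.val(key, int, default)


attrs = AttrDictionary({'something': 'None', 'num_layers': '2'})
assert attrs.int('something', None) is None  # previously int('None') -> ValueError
assert attrs.int('num_layers', None) == 2
```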
@@ -17,16 +17,14 @@ import os import json -import networkx as nx import numpy as np import mxnet as mx import logging as log -from mo.front.mxnet.extractors.utils import get_mxnet_node_edges, load_params +from mo.front.mxnet.extractors.utils import get_mxnet_node_edges, load_params, init_rnn_states from mo.front.mxnet.extractor import common_mxnet_fields from mo.front.mxnet.nd_to_params import build_params_file -from mo.graph.graph import Node -from mo.graph.graph import unique_id +from mo.graph.graph import Node, Graph from mo.utils.error import Error from mo.utils.utils import refer_to_faq_msg @@ -97,7 +95,10 @@ def symbol2nx(model_nodes, model_params, input_names: str = ''): else: input_names = input_names.split(',') - graph = nx.MultiDiGraph() + rnn_states = init_rnn_states(model_nodes) + names_rnn_states = list(rnn_states.keys()) + + graph = Graph() # as mxnet contain input layers as index of layer, for correct set up edges, we need provide index of layer with name of graph node index_node_keys = {} for i, node in enumerate(model_nodes): @@ -105,7 +106,9 @@ def symbol2nx(model_nodes, model_params, input_names: str = ''): node['value'] = np.array(model_params._arg_params[node['name']].asnumpy(), dtype=np.float32) elif node['name'] in model_params._aux_params and node['name'] not in input_names: node['value'] = np.array(model_params._aux_params[node['name']].asnumpy(), dtype=np.float32) - node_name = unique_id(graph, node['name']) + elif node['name'] in names_rnn_states: + node['value'] = np.zeros(rnn_states[node['name']]) + node_name = graph.unique_id(node['name']) graph.add_node(node_name, **symbol_attrs(node)) graph.node[node_name].update(common_mxnet_fields(Node(graph, node_name))) index_node_keys[i] = node_name @@ -119,7 +122,7 @@ def symbol2nx(model_nodes, model_params, input_names: str = ''): return graph -def find_output_node(graph: nx.MultiDiGraph, src_input_index): +def find_output_node(graph: Graph, src_input_index): for i, attrs in (list(graph.nodes(data=True))[src_input_index + 1:]): for input_index in attrs['symbol_dict']['inputs']: if input_index[0] == src_input_index: diff --git a/model-optimizer/mo/front/mxnet/loader_test.py b/model-optimizer/mo/front/mxnet/loader_test.py index 2c77d7e..cb52cb2 100644 --- a/model-optimizer/mo/front/mxnet/loader_test.py +++ b/model-optimizer/mo/front/mxnet/loader_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/mxnet/nd_to_params.py b/model-optimizer/mo/front/mxnet/nd_to_params.py index e4a66cc..a0f1fdc 100644 --- a/model-optimizer/mo/front/mxnet/nd_to_params.py +++ b/model-optimizer/mo/front/mxnet/nd_to_params.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
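The extractors/utils.py hunk above does two things: AttrDictionary.val now treats the literal string 'None' as a missing value (with a matching testIntWithAttrNone case in utils_test.py), and the new init_rnn_states helper records the declared shapes of RNN state inputs so that symbol2nx can seed them with zero tensors. A standalone sketch of the state-collection logic, rewritten without the shadowed loop index; the node dictionaries mimic the MXNet symbol JSON layout ('op', 'name', 'inputs' keys and the '__shape__' attribute are taken from the hunks above, and ast.literal_eval stands in for AttrDictionary.tuple):

import ast
import numpy as np

def init_rnn_states_sketch(model_nodes):
    # For every RNN node, inputs[2:] reference its state tensors by node
    # index; record each state's declared shape so the loader can later
    # materialize it as np.zeros(shape).
    states = {}
    for node in model_nodes:
        if node['op'] == 'RNN':
            for src in node['inputs'][2:]:
                state_node = model_nodes[src[0]]
                shape = state_node.get('attrs', {}).get('__shape__')
                if shape:
                    states[state_node['name']] = ast.literal_eval(shape)
    return states

# Hypothetical three-node symbol graph: data, one RNN state, one RNN op.
model_nodes = [
    {'op': 'null', 'name': 'data', 'inputs': [], 'attrs': {}},
    {'op': 'null', 'name': 'rnn_state', 'inputs': [], 'attrs': {'__shape__': '(1, 2, 128)'}},
    {'op': 'RNN', 'name': 'rnn0', 'inputs': [[0, 0], [1, 0], [1, 0]], 'attrs': {}},
]
states = init_rnn_states_sketch(model_nodes)   # {'rnn_state': (1, 2, 128)}
zeros = {name: np.zeros(shape) for name, shape in states.items()}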
diff --git a/model-optimizer/mo/front/mxnet/register_custom_ops.py b/model-optimizer/mo/front/mxnet/register_custom_ops.py index a699222..a07bf0e 100644 --- a/model-optimizer/mo/front/mxnet/register_custom_ops.py +++ b/model-optimizer/mo/front/mxnet/register_custom_ops.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,13 +16,9 @@ from mo.front.common.replacement import FrontReplacementOp, FrontReplacementSubgraph, FrontReplacementPattern from mo.front.extractor import FrontExtractorOp, MXNetCustomFrontExtractorOp -from mo.ops.op import Op -from mo.utils import class_registration -from mo.middle.replacement import MiddleReplacementPattern -from mo.back.replacement import BackReplacementPattern -def update_registration(): - class_registration.update_registration([Op, FrontExtractorOp, FrontReplacementOp, FrontReplacementSubgraph, - MXNetCustomFrontExtractorOp, MiddleReplacementPattern, - BackReplacementPattern, FrontReplacementPattern]) +def get_front_classes(): + front_classes = [FrontExtractorOp, FrontReplacementOp, FrontReplacementSubgraph, MXNetCustomFrontExtractorOp, + FrontReplacementPattern] + return front_classes diff --git a/model-optimizer/mo/front/onnx/extractor.py b/model-optimizer/mo/front/onnx/extractor.py index 76a666c..00cefbe 100644 --- a/model-optimizer/mo/front/onnx/extractor.py +++ b/model-optimizer/mo/front/onnx/extractor.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,7 +19,6 @@ import numpy as np from mo.front.onnx.extractors.concat import concat_ext from mo.front.onnx.extractors.const import onnx_const_ext from mo.front.onnx.extractors.constant import onnx_constant_ext -from mo.front.onnx.extractors.dropout import dropout_ext from mo.front.onnx.extractors.eltwise import make_tf_eltwise from mo.front.onnx.extractors.fused_bn import tf_fused_bn_extractor from mo.front.onnx.extractors.matmul import onnx_gemm_ext @@ -39,8 +38,7 @@ onnx_op_extractors = { 'Concat': concat_ext, 'Const': onnx_const_ext, 'Constant': onnx_constant_ext, - 'Identity': node_pb_arg(make_tf_eltwise(lambda v: v)), - 'Dropout': dropout_ext, + 'Identity': node_pb_arg(make_tf_eltwise(lambda v: v, attrs={'identity': True})), 'Sum': node_pb_arg( make_tf_eltwise(lambda a, b: a + b, attrs={'type': 'Eltwise', 'operation': 'sum', 'can_be_bias': True})), 'Relu': node_pb_arg(make_tf_eltwise(lambda v: np.maximum(0, v), attrs={'type': 'ReLU'})), # 0 is an integer diff --git a/model-optimizer/mo/front/onnx/extractors/concat.py b/model-optimizer/mo/front/onnx/extractors/concat.py index c99aa42..4cb510e 100644 --- a/model-optimizer/mo/front/onnx/extractors/concat.py +++ b/model-optimizer/mo/front/onnx/extractors/concat.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
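The register_custom_ops change narrows each framework module to a get_front_classes() list; the removed update_registration() shows that Op, middle, and back replacement classes used to be registered here as well, so they are presumably now collected once in framework-independent code. A hedged sketch of what such a shared caller might look like (the caller itself is an assumption and does not appear in this patch; only class_registration.update_registration and the class lists do):

from mo.front.mxnet.register_custom_ops import get_front_classes
from mo.middle.replacement import MiddleReplacementPattern
from mo.back.replacement import BackReplacementPattern
from mo.ops.op import Op
from mo.utils import class_registration

def update_registration_sketch():
    # Hypothetical shared caller: framework-specific front classes plus the
    # common Op/middle/back classes, registered in one place instead of
    # being duplicated per framework module.
    class_registration.update_registration(
        [Op] + get_front_classes() + [MiddleReplacementPattern, BackReplacementPattern])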
diff --git a/model-optimizer/mo/front/onnx/extractors/const.py b/model-optimizer/mo/front/onnx/extractors/const.py index 2bfe163..254a843 100644 --- a/model-optimizer/mo/front/onnx/extractors/const.py +++ b/model-optimizer/mo/front/onnx/extractors/const.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/onnx/extractors/constant.py b/model-optimizer/mo/front/onnx/extractors/constant.py index aa78db7..9339f01 100644 --- a/model-optimizer/mo/front/onnx/extractors/constant.py +++ b/model-optimizer/mo/front/onnx/extractors/constant.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/onnx/extractors/constant_test.py b/model-optimizer/mo/front/onnx/extractors/constant_test.py index 8204966..6399039 100644 --- a/model-optimizer/mo/front/onnx/extractors/constant_test.py +++ b/model-optimizer/mo/front/onnx/extractors/constant_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/onnx/extractors/dropout.py b/model-optimizer/mo/front/onnx/extractors/dropout.py deleted file mode 100644 index dff586a..0000000 --- a/model-optimizer/mo/front/onnx/extractors/dropout.py +++ /dev/null @@ -1,32 +0,0 @@ -""" - Copyright (c) 2018 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -from mo.front.onnx.extractors.utils import onnx_attr -from mo.utils.error import Error - -def dropout_ext(node): - # some Dropout flavors doesn't have is_test attribute; when it is missing, interpret it as 1 - is_test = onnx_attr(node, 'is_test', 'i', 1) - if len(node.out_nodes()) > 1: - raise Error('Dropout node {} has more than one consumer. Unsupported.', node.name) - if not is_test: - raise Error('Dropout node {} has is_test: 0. This means training mode which is not supported.', node.name) - - return { - # redefine op to automatically remove a node in the next tranformations - 'op': 'Identity', - } - diff --git a/model-optimizer/mo/front/onnx/extractors/eltwise.py b/model-optimizer/mo/front/onnx/extractors/eltwise.py index 9a096a9..b33b877 100644 --- a/model-optimizer/mo/front/onnx/extractors/eltwise.py +++ b/model-optimizer/mo/front/onnx/extractors/eltwise.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/front/onnx/extractors/fused_bn.py b/model-optimizer/mo/front/onnx/extractors/fused_bn.py index b167da6..73db9ca 100644 --- a/model-optimizer/mo/front/onnx/extractors/fused_bn.py +++ b/model-optimizer/mo/front/onnx/extractors/fused_bn.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/onnx/extractors/matmul.py b/model-optimizer/mo/front/onnx/extractors/matmul.py index f04890f..79a61ef 100644 --- a/model-optimizer/mo/front/onnx/extractors/matmul.py +++ b/model-optimizer/mo/front/onnx/extractors/matmul.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/onnx/extractors/placeholder.py b/model-optimizer/mo/front/onnx/extractors/placeholder.py index 78a8e59..cd92940 100644 --- a/model-optimizer/mo/front/onnx/extractors/placeholder.py +++ b/model-optimizer/mo/front/onnx/extractors/placeholder.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/onnx/extractors/reshape.py b/model-optimizer/mo/front/onnx/extractors/reshape.py index 1ef7995..19c13e0 100644 --- a/model-optimizer/mo/front/onnx/extractors/reshape.py +++ b/model-optimizer/mo/front/onnx/extractors/reshape.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/onnx/extractors/utils.py b/model-optimizer/mo/front/onnx/extractors/utils.py index da28d64..9315f4a 100644 --- a/model-optimizer/mo/front/onnx/extractors/utils.py +++ b/model-optimizer/mo/front/onnx/extractors/utils.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ limitations under the License. 
""" +import numpy as np + from mo.graph.graph import Node from mo.utils.error import Error @@ -47,3 +49,25 @@ def get_onnx_autopad(auto_pad): if auto_pad == 'notset': auto_pad = None return auto_pad + + +def get_onnx_datatype_as_numpy(value): + datatype_to_numpy = { + 1: np.float32, + 9: np.bool, + 11: np.double, + 10: np.float16, + 5: np.int16, + 6: np.int32, + 7: np.int64, + 3: np.int8, + 8: np.ubyte, + 4: np.uint16, + 12: np.uint32, + 13: np.uint64, + 2: np.uint8, + } + try: + return datatype_to_numpy[value] + except KeyError: + raise Error("Incorrect value {} for Datatype enum".format(value)) diff --git a/model-optimizer/mo/front/onnx/loader.py b/model-optimizer/mo/front/onnx/loader.py index 0da413f..d90f228 100644 --- a/model-optimizer/mo/front/onnx/loader.py +++ b/model-optimizer/mo/front/onnx/loader.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -23,7 +23,7 @@ import logging as log import networkx as nx import onnx -from mo.graph.graph import create_graph_with_nodes, unique_id +from mo.graph.graph import create_graph_with_nodes, Graph from mo.utils.error import Error, FrameworkError @@ -64,7 +64,7 @@ def protobuf2nx(pb): # convert initializers to a NX graph for easier control of model consistency and to use it as a dictionary later initializers = create_graph_with_nodes(pb.graph.initializer, get_id=lambda pb: pb.name, get_attrs=protobuf_attrs) - graph = nx.MultiDiGraph() + graph = Graph() # maps a tensor name to a node produced it and the node port: str -> (node_id, node_port) data_nodes_map = {} @@ -95,7 +95,7 @@ def protobuf2nx(pb): # important) for node in pb.graph.node: # create an NX node - id = unique_id(graph, node_id(node)) + id = graph.unique_id(node_id(node)) graph.add_node(id, pb=node, kind='op') # add incoming edges based on data_nodes_map diff --git a/model-optimizer/mo/front/onnx/register_custom_ops.py b/model-optimizer/mo/front/onnx/register_custom_ops.py index d3ec4ea..7ded9e1 100644 --- a/model-optimizer/mo/front/onnx/register_custom_ops.py +++ b/model-optimizer/mo/front/onnx/register_custom_ops.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,14 +14,10 @@ limitations under the License. 
""" -from mo.back.replacement import BackReplacementPattern from mo.front.common.replacement import FrontReplacementOp, FrontReplacementPattern, FrontReplacementSubgraph from mo.front.extractor import FrontExtractorOp -from mo.middle.replacement import MiddleReplacementPattern -from mo.ops.op import Op -from mo.utils import class_registration -def update_registration(): - class_registration.update_registration([Op, FrontExtractorOp, FrontReplacementOp, FrontReplacementPattern, - FrontReplacementSubgraph, MiddleReplacementPattern, BackReplacementPattern]) +def get_front_classes(): + front_classes = [FrontExtractorOp, FrontReplacementOp, FrontReplacementPattern, FrontReplacementSubgraph] + return front_classes diff --git a/model-optimizer/mo/front/subgraph_matcher.py b/model-optimizer/mo/front/subgraph_matcher.py index 410e2fe..6149098 100644 --- a/model-optimizer/mo/front/subgraph_matcher.py +++ b/model-optimizer/mo/front/subgraph_matcher.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,9 +16,7 @@ import logging as log import re -import networkx as nx - -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.utils.custom_replacement_config import CustomReplacementDescriptor from mo.utils.error import Error from mo.utils.graph import nodes_matching_name_pattern, sub_graph_between_nodes @@ -40,7 +38,7 @@ class SubgraphMatch(object): Class providing information about matched sub-graph. """ - def __init__(self, graph: nx.DiGraph, replacement_desc: CustomReplacementDescriptor, matched_nodes: list, + def __init__(self, graph: Graph, replacement_desc: CustomReplacementDescriptor, matched_nodes: list, inputs_order: list, outputs_order: list, prefix: str): """ Creates instance of a SubgraphMatch class from the provided configuration. @@ -164,7 +162,7 @@ class SubgraphMatcher(object): def __init__(self, replacement_descriptor: CustomReplacementDescriptor): self.replacement_desc = replacement_descriptor - def _match_sub_graph_for_scope(self, graph: nx.MultiDiGraph, scope_pattern: str): + def _match_sub_graph_for_scope(self, graph: Graph, scope_pattern: str): """ :param graph: networkx graph to find sub-graph in. :param scope_pattern: regular expression specifying sub-graph scope. @@ -187,7 +185,7 @@ class SubgraphMatcher(object): return SubgraphMatch(graph, self.replacement_desc, matched_nodes, inputs_order, outputs_order, scope_pattern) - def _match_sub_graph_for_points(self, graph: nx.MultiDiGraph): + def _match_sub_graph_for_points(self, graph: Graph): """ :param graph: networkx graph to find sub-graph in. :return: an object describing matched sub-graph. @@ -206,7 +204,7 @@ class SubgraphMatcher(object): self.replacement_desc.get_inputs_description(), self.replacement_desc.get_outputs_description(), '') - def matched_sub_graph_instances(self, graph: nx.MultiDiGraph): + def matched_sub_graph_instances(self, graph: Graph): """ Generator to product all instances of matched sub-graphs. :param graph: graph to find instances in. 
diff --git a/model-optimizer/mo/front/tf/change_placeholder_type.py b/model-optimizer/mo/front/tf/change_placeholder_type.py deleted file mode 100644 index 8c35bc3..0000000 --- a/model-optimizer/mo/front/tf/change_placeholder_type.py +++ /dev/null @@ -1,80 +0,0 @@ -""" - Copyright (c) 2018 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import logging as log - -import networkx as nx -from tensorflow.core.framework import types_pb2 as tf_types # pylint: disable=no-name-in-module - -from mo.graph.graph import Node -from mo.middle.passes.fusing.helpers import get_next_operation -from mo.utils.error import Error -from mo.utils.utils import refer_to_faq_msg - - -def change_placeholders_types_to_FP32(graph: nx.MultiDiGraph): - for node_name, node_attrs in list(graph.nodes(data=True)): - node = Node(graph, node_name) - pb = node_attrs.get('pb') - if pb is not None and pb.op == 'Placeholder' and pb.attr['dtype'].type != tf_types.DT_FLOAT: - log.info('Placeholder "{}" has type that is different from DT_FLOAT'.format(node_name)) - next_ops = get_next_operation(node) - # check that all output nodes are nodes of type 'ToFloat' - if all([is_node_casts_to_float(op) and len(op.in_nodes()) == 1 for op in next_ops]): - change_node_type(node, tf_types.DT_FLOAT) - remove_node_preserving_edges(node, next_ops) # remove 'Cast' nodes - elif all([is_node_gather(op) for op in next_ops] for op in next_ops): - change_node_type(node, tf_types.DT_FLOAT) - else: - raise Error( - ('Cannot convert type of placeholder "{}" because not all of its outputs are "Cast" to float ' - 'operations: {}. ' + - refer_to_faq_msg(49)), - node.soft_get('name'), - [op.soft_get('name') for op in next_ops] - ) - return graph - - -def is_node_casts_to_float(node: Node): - attrs = node.graph.node[node.id] - return 'pb' in attrs and attrs['pb'].op == 'Cast' and attrs['pb'].attr['DstT'].type == tf_types.DT_FLOAT - - -def is_node_gather(node: Node): - attrs = node.graph.node[node.id] - return 'pb' in attrs and attrs['pb'].op == 'GatherV2' and attrs['precision'] == 'FP32' - - -def change_node_type(node: Node, new_type: type): - node.graph.node[node.id]['pb'].attr['dtype'].type = new_type - - -def remove_node_preserving_edges(pl_node: Node, nodes: list): - graph = pl_node.graph - pl_node_data = pl_node.out_node() - - # Disconnect Placeholder data node from Cast nodes - for out_node in pl_node.out_node().out_nodes(): - graph.remove_edge(pl_node_data.id, out_node.id) - - # Move edges from Cast data nodes to Placeholder data node - for cast_node in nodes: - # it is necessary to create a list from the result of function "graph.out_edges()" because we modify the graph - # during iteration over the list. 
networkx version 2.1 raises error without creating a list - for u, v, d in list(graph.out_edges(cast_node.out_node().id, data=True)): - graph.remove_edge(u, v) - graph.add_edges_from([(pl_node_data.id, v, d)]) diff --git a/model-optimizer/mo/front/tf/common.py b/model-optimizer/mo/front/tf/common.py index 72f85f7..a00274d 100644 --- a/model-optimizer/mo/front/tf/common.py +++ b/model-optimizer/mo/front/tf/common.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/custom_subgraph_call.py b/model-optimizer/mo/front/tf/custom_subgraph_call.py index 2a66ca5..8cd5fd5 100644 --- a/model-optimizer/mo/front/tf/custom_subgraph_call.py +++ b/model-optimizer/mo/front/tf/custom_subgraph_call.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,26 +17,15 @@ import logging as log from re import compile, match, findall -import copy import networkx as nx -import numpy as np -import tensorflow as tf -from mo.front.common.find_unsupported_ops import find_unsupported_ops_subgraphs -from mo.front.common.layout import convert_shape, nhwc_to_nchw_permute, nchw_to_nhwc_permute -from mo.front.common.partial_infer.utils import int64_array from mo.front.extractor import update_ie_fields -from mo.front.tf.extractors.utils import tf_tensor_shape -from mo.front.tf.partial_infer.tf import get_subgraph_output_tensors, tf_subgraph_infer, \ - add_node_def_to_subgraph, update_input_in_pbs -from mo.graph.graph import dump_graph_for_graphviz, unique_id, Node, get_outputs, get_inputs, merge_edge_props +from mo.front.tf.partial_infer.tf import tf_subgraph_infer +from mo.graph.graph import Node, merge_edge_props, Graph from mo.utils.graph import nodes_matching_name_pattern, is_connected_component -nchw_to_nhwc_constant_name = 'IE_NCHW_TO_NHWC' -nhwc_to_nchw_constant_name = 'IE_NHWC_TO_NCHW' - -def replace_subgraph_calls(graph: nx.MultiDiGraph, patterns_string: str): +def replace_subgraph_calls(graph: Graph, patterns_string: str): """ The function replaces sub-graphs defined by the node names with single nodes that are executed using the TensorFlow. The patterns applied independently, so N patterns produce N TensorFlow call nodes. @@ -59,18 +48,11 @@ def replace_subgraph_calls(graph: nx.MultiDiGraph, patterns_string: str): if cycle_exist: log.warning("Graph contains a cycle after merging nodes using pattern '{}'".format(pattern)) if cycle_exist: - dump_graph_for_graphviz(graph) + graph.dump_graph_for_graphviz() log.error('graph contains cycle after applying all merge node patterns') + - -def offload_unsupported_operations_to_tf(graph: nx.MultiDiGraph, unsupported_nodes: list): - assert len(unsupported_nodes) != 0 - sub_graphs_list = find_unsupported_ops_subgraphs(graph, unsupported_nodes, tf_find_constant_inputs) - for nodes_set in sub_graphs_list: - merge_nodes(graph, nodes_set) - - -def offload_operations_to_tf(graph: nx.MultiDiGraph, op_names_patterns: str): +def offload_operations_to_tf(graph: Graph, op_names_patterns: str): """ The function accepts the list of strings with operation names patterns. The patterns applied independently and nodes matching specific pattern are executed using the TF runtime. 
@@ -89,158 +71,6 @@ def offload_operations_to_tf(graph: nx.MultiDiGraph, op_names_patterns: str): merge_nodes(graph, [node_name]) -def make_shape_4d(shape: np.array): - """ - Create 4D tensor from 1D, 2D or 3D by adding new dimensions of size 1. - :param shape: shape to extend. - :return: 4D tensor. - """ - new_shape = int64_array(shape) - old_shape_len = len(shape) - - for x in range(4 - old_shape_len): # TODO think about proper way to add additional dimensions considering layout - if len(new_shape) <= 1: # if the shape is 0D or 1D then we should add additional dimensions to batch dimension - new_shape = np.insert(new_shape, 0, 1) - # new_shape = np.array([1, shape[0], 1, 1]) - else: - new_shape = np.insert(new_shape, 1, 1) - return new_shape - - -def add_reshape_before_op_node(graph: nx.MultiDiGraph, data_node_name: str, op_node_name: str, edge_attrs: dict): - """ - Adds reshape operation which expands dimension of the specified data tensor to 4D. - :param graph: graph to operate on. - :param data_node_name: the name of the data node to be reshaped to 4D tensor. - :param op_node_name: name of the TFCustomSubgraphCall node which produces the tensor. - :param edge_attrs: edge attributes which should be preserved. - :return: None - """ - data_node = Node(graph, data_node_name) - - graph.remove_edge(data_node_name, op_node_name) - - assert data_node['shape'] is not None - - new_shape = make_shape_4d(data_node['shape']) - - # reshape shape data node - reshape_shape_data_node_name = unique_id(graph, "Reshape_shape_") - graph.add_node(reshape_shape_data_node_name, kind='data', precision="FP32", name=reshape_shape_data_node_name, - value=new_shape, shape=[1]) - - # reshape operation node - reshape_node_name = unique_id(graph, "Reshape_") - graph.add_node(reshape_node_name, kind='op', precision="FP32", type='Reshape', name=reshape_node_name, op='Reshape', - data_type=data_node['data_type']) - update_ie_fields(graph.node[reshape_node_name]) - - # reshaped data node - reshaped_value = None - if data_node['value'] is not None: - reshaped_value = np.reshape(data_node['value'], new_shape) - reshaped_data_node_name = unique_id(graph, "reshaped_data_") - graph.add_node(reshaped_data_node_name, kind='data', precision="FP32", name=reshaped_data_node_name, - shape=new_shape, value=reshaped_value, nchw_layout=True) - - graph.add_edges_from([ - (data_node_name, reshape_node_name, {'in': 0}), - (reshape_shape_data_node_name, reshape_node_name, {'in': 1}), - (reshape_node_name, reshaped_data_node_name, {'out': 0}), - (reshaped_data_node_name, op_node_name, edge_attrs) - ]) - - -def add_reshape_after_data_node(graph: nx.MultiDiGraph, data_node_name: str): - """ - Adds reshape operation which changes shape of the tensor produced by TFSubgraphCall from 4D to real dimension - of the tensor. The data_node_name node contains real dimensions of the tensor but they will be changed in the - add_reshapes_for_tf_subgraph_calls function to a 4D because IE TF call layer supports output in 4D only. - :param graph: graph to operate on. - :param data_node_name: name of the data node to be reshaped to correct dimensions. 
- :return: None - """ - data_node = Node(graph, data_node_name) - - # if the data node was previously marked as output then we need to mark as output new reshaped data node - is_output = False - if data_node.has_and_set('is_output'): - is_output = data_node['is_output'] - data_node['is_output'] = False - - # save old consumers nodes with edge attributes - old_consumer_nodes_with_attrs = list() - for index, out_op in enumerate(data_node.out_nodes()): - edge_attrs = graph.get_edge_data(data_node_name, out_op.name)[0] - old_consumer_nodes_with_attrs.append((out_op.name, edge_attrs)) - - # remove old consumers from the data node - for out_op in list(data_node.out_nodes()): - graph.remove_edge(data_node_name, out_op.name) - - # reshape operation node - reshape_node_name = unique_id(graph, "Reshape_") - graph.add_node(reshape_node_name, kind='op', precision="FP32", type='Reshape', name=reshape_node_name, op='Reshape', - data_type=data_node['data_type']) - update_ie_fields(graph.node[reshape_node_name]) - - # reshape shape data node - reshape_shape_data_node_name = unique_id(graph, "Reshape_shape_") - graph.add_node(reshape_shape_data_node_name, kind='data', precision="FP32", name=reshape_shape_data_node_name, - value=np.array(data_node['shape']), shape=[1]) - - # reshaped data node - reshaped_value = None - if data_node['value'] is not None: - reshaped_value = np.array(data_node['value']) - reshaped_data_node_name = unique_id(graph, "reshaped_data_") - graph.add_node(reshaped_data_node_name, kind='data', precision="FP32", name=reshaped_data_node_name, - shape=np.array(data_node['shape']), value=reshaped_value, is_output=is_output, nchw_layout=True) - - graph.add_edges_from([ - (data_node_name, reshape_node_name, {'in': 0}), - (reshape_shape_data_node_name, reshape_node_name, {'in': 1}), - (reshape_node_name, reshaped_data_node_name, {'out': 0}), - ]) - - for out_node_name, edge_attrs in old_consumer_nodes_with_attrs: - graph.add_edges_from([ - (reshaped_data_node_name, out_node_name, edge_attrs) - ]) - - -def add_reshapes_for_tf_subgraph_calls(graph: nx.MultiDiGraph): - """ - Input and output tensors of the TFCustomSubgraphCall must be 4D because IE layer accepts and produces only 4D - tensors. This function adds reshape operations where it is necessary. - :param graph: graph to operate on. - :return: None. 
- """ - for src_node_name, dst_node_name, edge_attrs in list(graph.edges(data=True)): - src_node = Node(graph, src_node_name) - dst_node = Node(graph, dst_node_name) - if dst_node.kind == 'op' and dst_node.has_valid('type') and dst_node.type == 'TFCustomSubgraphCall' and \ - src_node.has_valid('shape') and len(src_node.shape) != 4: - log.info("There is an data tensor of shape '{}' which goes into '{}' node".format( - src_node.shape, dst_node.type)) - add_reshape_before_op_node(graph, src_node_name, dst_node_name, edge_attrs) - - for node_name in list(graph.nodes()): - node = Node(graph, node_name) - if node['kind'] == 'op' and node.has_and_set('type') and node.type == 'TFCustomSubgraphCall': - for index, data_node in node.out_nodes().items(): - real_dims_count = len(data_node.shape) - if real_dims_count != 4: - log.info("There is an data tensor of shape '{}' with real dims count '{}' which goes out of '{}' " - "node".format(data_node.shape, real_dims_count, node.name)) - add_reshape_after_data_node(graph, data_node.id) - - # need to update shape of the op so IE generates XML with 4D tensors - out_shape = make_shape_4d(data_node['shape']) - - data_node['shape'] = out_shape - - def internal_output_name_for_node(node_name: str, output_port: int): return node_name + ":" + str(output_port) @@ -273,7 +103,7 @@ def find_output_port(node: Node, output_desc: list, search_node_name: str, searc search_node_port)) -def merge_nodes(graph: nx.MultiDiGraph, nodes_to_merge_names: list, inputs_desc: list = None, +def merge_nodes(graph: Graph, nodes_to_merge_names: list, inputs_desc: list = None, outputs_desc: list = None): """ Merges nodes specified in the set 'nodes_to_merge_names' into one mega-node, creating new edges between mega-node @@ -288,9 +118,9 @@ def merge_nodes(graph: nx.MultiDiGraph, nodes_to_merge_names: list, inputs_desc: """ if not is_connected_component(graph, nodes_to_merge_names): log.warning("The following nodes do not form connected sub-graph: {}".format(nodes_to_merge_names)) - dump_graph_for_graphviz(graph, nodes_to_dump=nodes_to_merge_names) + graph.dump_graph_for_graphviz(nodes_to_dump=nodes_to_merge_names) - new_node_name = unique_id(graph, "TFSubgraphCall_") + new_node_name = graph.unique_id("TFSubgraphCall_") log.info("Create new node with name '{}' for nodes '{}'".format(new_node_name, ', '.join(nodes_to_merge_names))) graph.add_node(new_node_name) new_node_attrs = graph.node[new_node_name] @@ -305,7 +135,8 @@ def merge_nodes(graph: nx.MultiDiGraph, nodes_to_merge_names: list, inputs_desc: for node_name in nodes_to_merge_names: node = Node(graph, node_name) add_node_pb_if_not_yet_added(node, new_node) - for in_node_name, edge_attrs in get_inputs(graph, node_name): + # TODO: any improvements? 
+ for in_node_name, edge_attrs in Node(graph, node_name).get_inputs(): in_node = Node(graph, in_node_name) # internal edges between nodes of the sub-graph @@ -336,7 +167,7 @@ def merge_nodes(graph: nx.MultiDiGraph, nodes_to_merge_names: list, inputs_desc: added_input_tensors_names.add(input_tensor_name) # edge from inside sub-graph to outside sub-graph - for out_node_name, edge_attrs in get_outputs(graph, node_name): + for out_node_name, edge_attrs in Node(graph, node_name).get_outputs(): if out_node_name not in nodes_to_merge_names: log.debug("Creating edge from inside of sub-graph to outside sub-graph: {} -> {}".format( new_node_name, out_node_name)) @@ -378,122 +209,6 @@ def set_tf_custom_call_node_attrs(node_attrs: dict): node_attrs['kind'] = 'op' -def prepare_tf_call_nodes(graph: nx.MultiDiGraph): - """ - The function performs preparation of the TF call nodes. Details are provided in the description of called functions. - :param graph: graph to operate on. - :return: None - """ - update_placeholders(graph) - add_output_nodes_transposes(graph) - add_reshapes_for_tf_subgraph_calls(graph) - - -def update_placeholders(graph: nx.MultiDiGraph): - """ - Iterates over all nodes of the graph, find all TF sub-graph call operations and updates placeholders shapes and adds - transpose operation if necessary. - :param graph: graph to operate on - :return: None - """ - for node_name in graph.nodes(): - node = Node(graph, node_name) - if node.kind == 'op' and node.has_valid('op') and node.op == 'TFCustomSubgraphCall': - update_placeholder_shape_and_add_transpose(node) - - -def update_placeholder_shape_and_add_transpose(node: Node): - """ - The function changes placeholders shapes from NHWC to NCHW format and add transpose operations if needed. - :param node: node to operate on. 
- :return: None - """ - tf.reset_default_graph() - - inputs_replacements = list() - - # transpose permutation constant - nchw_to_nhwc_constant = tf.constant(nchw_to_nhwc_permute, dtype=tf.int32, name=nchw_to_nhwc_constant_name) - nhwc_to_nchw_constant = tf.constant(nhwc_to_nchw_permute, dtype=tf.int32, name=nhwc_to_nchw_constant_name) - - for placeholder_name in node['input_nodes_names']: - # dummy node which we can refer to as input in the transpose for the output node - # dummy node should be unique for each placeholder - dummy_node = tf.constant(value=[[[[1]]]], dtype=tf.float32, name='random_dummy_name_' + placeholder_name) - - placeholder = node['pbs'][placeholder_name] - cur_shape = tf_tensor_shape(placeholder.attr['shape'].shape) - if len(cur_shape) == 4: # TODO think about better check that transpose is required - nchw_shape = convert_shape(cur_shape, nhwc_to_nchw_permute) - for ind in range(len(cur_shape)): - placeholder.attr['shape'].shape.dim[ind].size = nchw_shape[ind] - transpose_name = placeholder.name + '_transpose' - transpose = tf.transpose(dummy_node, nchw_to_nhwc_constant, transpose_name) # NCHW -> NHWC - - # add transpose operations to GraphDef after placeholders - add_node_def_to_subgraph(node, transpose.op.node_def, transpose_name, len(node['input_nodes_names'])) - inputs_replacements.append((placeholder.name, transpose_name)) - inputs_replacements.append((dummy_node.name, placeholder.name)) - node['real_input_dims'].append(nchw_shape) - else: - node['real_input_dims'].append(cur_shape) - add_node_def_to_subgraph(node, nchw_to_nhwc_constant.op.node_def) - add_node_def_to_subgraph(node, nhwc_to_nchw_constant.op.node_def) - - # update initial input names to a transposed ones - for old_input_tensor_name, new_name in inputs_replacements: - update_input_in_pbs(node, old_input_tensor_name, new_name) - - -def add_output_nodes_transposes(graph: nx.MultiDiGraph): - """ - Iterates over all nodes of the graph, find all TF sub-graph call operations and adds Transpose operations to the - output nodes if they are 4D to covert output from NHWC to NCHW. - :param graph: graph to operate on - :return: None - """ - for node_name in graph.nodes(): - node = Node(graph, node_name) - if node.kind == 'op' and node.has_valid('op') and node.op == 'TFCustomSubgraphCall': - add_sub_graph_call_output_tensors_transposes(node) - - -def add_sub_graph_call_output_tensors_transposes(node: Node): - """ - Adds transpose operations to the output nodes if they are 4D to change layout from NCHW to NHWC. - :param node: the node to add transposes to the output nodes to. 
- :return: None - """ - _, output_tensors = get_subgraph_output_tensors(node) - - # transpose permutation constant - nhwc_to_nchw_constant = tf.constant(nhwc_to_nchw_permute, dtype=tf.int32, name=nhwc_to_nchw_constant_name) - - # dummy node which we can refer to as input in the transpose for the output node - dummy_node = tf.constant(value=[[[[1]]]], dtype=tf.float32, name='random_dummy_name') - - new_out_tensor_names = list() - for out_tensor_name in node['output_tensors_names']: - out_name, out_port = out_tensor_name.split(':') - if len(output_tensors[int(out_port)].shape) == 4: # TODO think about better check whether transpose is required - out_transpose_name = out_name + '_port_' + out_port + '_transpose' - transpose = tf.transpose(dummy_node, nhwc_to_nchw_constant, name=out_transpose_name) - - # starting from TF 1.8 it is not possible to modify the "node_def" of the "tf.op", so we create a copy, - # update it and use further - new_input_names = transpose.op.node_def.input[:] - new_input_names[0] = out_tensor_name - new_node_def = copy.deepcopy(transpose.op.node_def) - new_node_def.input[:] = new_input_names - add_node_def_to_subgraph(node, new_node_def, position=len(node['nodes_order'])) - new_out_tensor_names.append(out_transpose_name) - else: - new_out_tensor_names.append(out_tensor_name) - - # update output tensor names with transposes operations - node['output_tensors_names'] = new_out_tensor_names - - def tf_find_constant_inputs(node: Node): """ The function finds constant inputs of the node and nodes with Identity operation. diff --git a/model-optimizer/mo/front/tf/extractor.py b/model-optimizer/mo/front/tf/extractor.py index d7af0d5..50ae67e 100644 --- a/model-optimizer/mo/front/tf/extractor.py +++ b/model-optimizer/mo/front/tf/extractor.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -33,12 +33,9 @@ from mo.front.tf.extractors.prod import tf_reduce_prod_ext from mo.front.tf.extractors.random_uniform import tf_random_uniform_ext from mo.front.tf.extractors.range import tf_range_ext from mo.front.tf.extractors.reshape import tf_reshape_ext -from mo.front.tf.extractors.shape import tf_shape_ext from mo.front.tf.extractors.space_to_batch import tf_space_to_batch_ext, tf_batch_to_space_ext from mo.front.tf.extractors.split import tf_split_ext from mo.front.tf.extractors.squeeze import tf_squeeze_ext -from mo.front.tf.extractors.strided_slice import tf_strided_slice_ext -from mo.front.tf.extractors.sum import tf_sum_ext from mo.front.tf.extractors.transpose import tf_transpose_ext from mo.front.tf.extractors.unpack import tf_unpack_ext from mo.front.tf.extractors.utils import get_tf_node_port @@ -90,7 +87,6 @@ tf_op_extractors = { 'MatMul': node_pb_arg(tf_matmul_ext), 'Pack': node_pb_arg(tf_pack_ext), 'Unpack': node_pb_arg(tf_unpack_ext), - 'StridedSlice': node_pb_arg(tf_strided_slice_ext), 'Prod': node_pb_arg(tf_reduce_prod_ext), 'Const': node_pb_arg(tf_const_ext), 'Placeholder': node_pb_arg(tf_placeholder_ext), @@ -109,15 +105,12 @@ tf_op_extractors = { 'BiasAdd': node_pb_arg(tf_bias_add_ext), 'Reshape': node_pb_arg(tf_reshape_ext), 'Squeeze': node_pb_arg(tf_squeeze_ext), - 'Shape': node_pb_arg(tf_shape_ext), 'SpaceToBatchND': node_pb_arg(tf_space_to_batch_ext), 'BatchToSpaceND': node_pb_arg(tf_batch_to_space_ext), 'Square': node_pb_arg(make_tf_eltwise(lambda a: a * a)), 'Minimum': node_pb_arg(make_tf_eltwise(lambda a, b: np.minimum(a, b))), # can use clamp if one argument is const 'Maximum': node_pb_arg(make_tf_eltwise(lambda a, b: np.maximum(a, b), attrs={'type': 'Eltwise', 'operation': 'max'})), - 'Sum': node_pb_arg(tf_sum_ext), - 'Range': node_pb_arg(tf_range_ext), 'ReadVariableOp': node_pb_arg(make_tf_eltwise(lambda v: v, attrs={'identity': True})), 'PlaceholderWithDefault': node_pb_arg(make_tf_eltwise(lambda v: v, attrs={'identity': True})) } diff --git a/model-optimizer/mo/front/tf/extractors/bias_add.py b/model-optimizer/mo/front/tf/extractors/bias_add.py index 883440c..d669a37 100644 --- a/model-optimizer/mo/front/tf/extractors/bias_add.py +++ b/model-optimizer/mo/front/tf/extractors/bias_add.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/concat.py b/model-optimizer/mo/front/tf/extractors/concat.py index 376a1f0..18a1531 100644 --- a/model-optimizer/mo/front/tf/extractors/concat.py +++ b/model-optimizer/mo/front/tf/extractors/concat.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/concat_test.py b/model-optimizer/mo/front/tf/extractors/concat_test.py index 054da61..517c2df 100644 --- a/model-optimizer/mo/front/tf/extractors/concat_test.py +++ b/model-optimizer/mo/front/tf/extractors/concat_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
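A pattern running through these hunks: the loose helpers that operated on a bare nx.MultiDiGraph (unique_id(graph, ...), get_inputs/get_outputs(graph, name), dump_graph_for_graphviz(graph)) become methods of the new mo.graph.graph.Graph and Node classes. A side-by-side sketch of the calling convention, with the old forms taken from the removed lines above (graph construction details elided):

from mo.graph.graph import Graph, Node

graph = Graph()                         # was: graph = nx.MultiDiGraph()
node_id = graph.unique_id('Reshape_')   # was: unique_id(graph, 'Reshape_')
graph.add_node(node_id, kind='op')

node = Node(graph, node_id)
inputs = node.get_inputs()              # was: get_inputs(graph, node_id)
outputs = node.get_outputs()            # was: get_outputs(graph, node_id)
graph.dump_graph_for_graphviz()         # was: dump_graph_for_graphviz(graph)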
diff --git a/model-optimizer/mo/front/tf/extractors/const.py b/model-optimizer/mo/front/tf/extractors/const.py index 8977a85..1924b43 100644 --- a/model-optimizer/mo/front/tf/extractors/const.py +++ b/model-optimizer/mo/front/tf/extractors/const.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/const_test.py b/model-optimizer/mo/front/tf/extractors/const_test.py index 5caafa4..c73e90b 100644 --- a/model-optimizer/mo/front/tf/extractors/const_test.py +++ b/model-optimizer/mo/front/tf/extractors/const_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/eltwise.py b/model-optimizer/mo/front/tf/extractors/eltwise.py index c45f769..3fc56f7 100644 --- a/model-optimizer/mo/front/tf/extractors/eltwise.py +++ b/model-optimizer/mo/front/tf/extractors/eltwise.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/eltwise_test.py b/model-optimizer/mo/front/tf/extractors/eltwise_test.py index 0a0f1e3..2cf897d 100644 --- a/model-optimizer/mo/front/tf/extractors/eltwise_test.py +++ b/model-optimizer/mo/front/tf/extractors/eltwise_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/elu.py b/model-optimizer/mo/front/tf/extractors/elu.py index 192250c..500df47 100644 --- a/model-optimizer/mo/front/tf/extractors/elu.py +++ b/model-optimizer/mo/front/tf/extractors/elu.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/expand_dims.py b/model-optimizer/mo/front/tf/extractors/expand_dims.py index 0386a16..5363bf4 100644 --- a/model-optimizer/mo/front/tf/extractors/expand_dims.py +++ b/model-optimizer/mo/front/tf/extractors/expand_dims.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/expand_dims_test.py b/model-optimizer/mo/front/tf/extractors/expand_dims_test.py index dd1f1d8..ef2c344 100644 --- a/model-optimizer/mo/front/tf/extractors/expand_dims_test.py +++ b/model-optimizer/mo/front/tf/extractors/expand_dims_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/front/tf/extractors/fused_bn.py b/model-optimizer/mo/front/tf/extractors/fused_bn.py index 31b4a12..96b2688 100644 --- a/model-optimizer/mo/front/tf/extractors/fused_bn.py +++ b/model-optimizer/mo/front/tf/extractors/fused_bn.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/identity.py b/model-optimizer/mo/front/tf/extractors/identity.py index 9211da3..8d8832f 100644 --- a/model-optimizer/mo/front/tf/extractors/identity.py +++ b/model-optimizer/mo/front/tf/extractors/identity.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/identity_test.py b/model-optimizer/mo/front/tf/extractors/identity_test.py index 1a6a84f..ad29c4a 100644 --- a/model-optimizer/mo/front/tf/extractors/identity_test.py +++ b/model-optimizer/mo/front/tf/extractors/identity_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/lrn.py b/model-optimizer/mo/front/tf/extractors/lrn.py index e4a7d57..8ebc3a7 100644 --- a/model-optimizer/mo/front/tf/extractors/lrn.py +++ b/model-optimizer/mo/front/tf/extractors/lrn.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/lrn_test.py b/model-optimizer/mo/front/tf/extractors/lrn_test.py index b4855b6..bb28656 100644 --- a/model-optimizer/mo/front/tf/extractors/lrn_test.py +++ b/model-optimizer/mo/front/tf/extractors/lrn_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/matmul.py b/model-optimizer/mo/front/tf/extractors/matmul.py index e0c763d..5fd6711 100644 --- a/model-optimizer/mo/front/tf/extractors/matmul.py +++ b/model-optimizer/mo/front/tf/extractors/matmul.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/matmul_test.py b/model-optimizer/mo/front/tf/extractors/matmul_test.py index e7bd524..19cac79 100644 --- a/model-optimizer/mo/front/tf/extractors/matmul_test.py +++ b/model-optimizer/mo/front/tf/extractors/matmul_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/front/tf/extractors/mean.py b/model-optimizer/mo/front/tf/extractors/mean.py index 46453b4..ac74b4e 100644 --- a/model-optimizer/mo/front/tf/extractors/mean.py +++ b/model-optimizer/mo/front/tf/extractors/mean.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/mean_test.py b/model-optimizer/mo/front/tf/extractors/mean_test.py index 7430bae..cad5ed3 100644 --- a/model-optimizer/mo/front/tf/extractors/mean_test.py +++ b/model-optimizer/mo/front/tf/extractors/mean_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/native_tf.py b/model-optimizer/mo/front/tf/extractors/native_tf.py index ef2dcb3..0b20226 100644 --- a/model-optimizer/mo/front/tf/extractors/native_tf.py +++ b/model-optimizer/mo/front/tf/extractors/native_tf.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/pack.py b/model-optimizer/mo/front/tf/extractors/pack.py index 06fedf3..2453590 100644 --- a/model-optimizer/mo/front/tf/extractors/pack.py +++ b/model-optimizer/mo/front/tf/extractors/pack.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/placeholder.py b/model-optimizer/mo/front/tf/extractors/placeholder.py index c87112f..a0da30e 100644 --- a/model-optimizer/mo/front/tf/extractors/placeholder.py +++ b/model-optimizer/mo/front/tf/extractors/placeholder.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/prod.py b/model-optimizer/mo/front/tf/extractors/prod.py index 18947f6..70151f9 100644 --- a/model-optimizer/mo/front/tf/extractors/prod.py +++ b/model-optimizer/mo/front/tf/extractors/prod.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/prod_test.py b/model-optimizer/mo/front/tf/extractors/prod_test.py index 53b974d..a197b82 100644 --- a/model-optimizer/mo/front/tf/extractors/prod_test.py +++ b/model-optimizer/mo/front/tf/extractors/prod_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/front/tf/extractors/random_uniform.py b/model-optimizer/mo/front/tf/extractors/random_uniform.py index e86936e..17bee99 100644 --- a/model-optimizer/mo/front/tf/extractors/random_uniform.py +++ b/model-optimizer/mo/front/tf/extractors/random_uniform.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/range.py b/model-optimizer/mo/front/tf/extractors/range.py index 73dffeb..d6807a4 100644 --- a/model-optimizer/mo/front/tf/extractors/range.py +++ b/model-optimizer/mo/front/tf/extractors/range.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/reshape.py b/model-optimizer/mo/front/tf/extractors/reshape.py index e95920e..aeed5fe 100644 --- a/model-optimizer/mo/front/tf/extractors/reshape.py +++ b/model-optimizer/mo/front/tf/extractors/reshape.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/sigmoid.py b/model-optimizer/mo/front/tf/extractors/sigmoid.py index 4a43ee3..4093659 100644 --- a/model-optimizer/mo/front/tf/extractors/sigmoid.py +++ b/model-optimizer/mo/front/tf/extractors/sigmoid.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/space_to_batch.py b/model-optimizer/mo/front/tf/extractors/space_to_batch.py index d87f9c8..70c7830 100644 --- a/model-optimizer/mo/front/tf/extractors/space_to_batch.py +++ b/model-optimizer/mo/front/tf/extractors/space_to_batch.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/split.py b/model-optimizer/mo/front/tf/extractors/split.py index f112102..a550599 100644 --- a/model-optimizer/mo/front/tf/extractors/split.py +++ b/model-optimizer/mo/front/tf/extractors/split.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/squeeze.py b/model-optimizer/mo/front/tf/extractors/squeeze.py index 95ccefa..0054143 100644 --- a/model-optimizer/mo/front/tf/extractors/squeeze.py +++ b/model-optimizer/mo/front/tf/extractors/squeeze.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/front/tf/extractors/squeeze_test.py b/model-optimizer/mo/front/tf/extractors/squeeze_test.py index ccc0ff1..d5e42d7 100644 --- a/model-optimizer/mo/front/tf/extractors/squeeze_test.py +++ b/model-optimizer/mo/front/tf/extractors/squeeze_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/strided_slice.py b/model-optimizer/mo/front/tf/extractors/strided_slice.py index cc2ecd2..909c10f 100644 --- a/model-optimizer/mo/front/tf/extractors/strided_slice.py +++ b/model-optimizer/mo/front/tf/extractors/strided_slice.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,16 +13,37 @@ See the License for the specific language governing permissions and limitations under the License. """ +import numpy as np -from mo.front.common.partial_infer.slice import tf_strided_slice_infer +from mo.front.extractor import FrontExtractorOp +from mo.ops.op import Op -def tf_strided_slice_ext(pb): - return { - 'begin_mask': pb.attr["begin_mask"].i, - 'end_mask': pb.attr["end_mask"].i, - 'ellipsis_mask': pb.attr["ellipsis_mask"].i, - 'new_axis_mask': pb.attr["new_axis_mask"].i, - 'shrink_axis_mask': pb.attr["shrink_axis_mask"].i, - 'infer': tf_strided_slice_infer - } +def int_to_array_bit_mask(im): + list_repr = list(np.binary_repr(im)) + list_repr.reverse() + list_repr = [int(li) for li in list_repr] + return np.array(list_repr, dtype=np.int32) + + +class StridedSliceFrontExtractor(FrontExtractorOp): + op = 'StridedSlice' + enabled = True + + @staticmethod + def extract(node): + pb = node.pb + bm = int_to_array_bit_mask(pb.attr["begin_mask"].i) + bm = np.array([1 - b for b in bm], dtype=np.int32) + em = int_to_array_bit_mask(pb.attr["end_mask"].i) + em = np.array([1 - b for b in em], dtype=np.int32) + attrs = { + 'begin_mask': bm, + 'end_mask': em, + 'ellipsis_mask': int_to_array_bit_mask(pb.attr["ellipsis_mask"].i), + 'new_axis_mask': int_to_array_bit_mask(pb.attr["new_axis_mask"].i), + 'shrink_axis_mask': int_to_array_bit_mask(pb.attr["shrink_axis_mask"].i), + } + + Op.get_op_class_by_name(__class__.op).update_node_stat(node, attrs) + return __class__.enabled diff --git a/model-optimizer/mo/front/tf/extractors/tanh.py b/model-optimizer/mo/front/tf/extractors/tanh.py index 9ee14a4..e640d44 100644 --- a/model-optimizer/mo/front/tf/extractors/tanh.py +++ b/model-optimizer/mo/front/tf/extractors/tanh.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/transpose.py b/model-optimizer/mo/front/tf/extractors/transpose.py index 7d4d6db..90bc5bb 100644 --- a/model-optimizer/mo/front/tf/extractors/transpose.py +++ b/model-optimizer/mo/front/tf/extractors/transpose.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
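In the new StridedSlice extractor above, int_to_array_bit_mask expands TensorFlow's integer bit masks into per-dimension 0/1 arrays, least significant bit first; begin_mask and end_mask are then inverted so that 1 marks a dimension whose bound is actually taken from the begin/end inputs (in TensorFlow a set bit means the bound is ignored). A standalone check of the helper:

import numpy as np

def int_to_array_bit_mask(im):
    # Binary representation, least significant bit first, as an int32 array.
    list_repr = list(np.binary_repr(im))
    list_repr.reverse()
    return np.array([int(li) for li in list_repr], dtype=np.int32)

bm = int_to_array_bit_mask(5)   # 0b101 -> [1 0 1]: TF ignores begin on dims 0 and 2
print(1 - bm)                   # [0 1 0]: as stored, 1 = begin value is applied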
diff --git a/model-optimizer/mo/front/tf/extractors/unpack.py b/model-optimizer/mo/front/tf/extractors/unpack.py index 2ff831c..5d1bee1 100644 --- a/model-optimizer/mo/front/tf/extractors/unpack.py +++ b/model-optimizer/mo/front/tf/extractors/unpack.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/utils.py b/model-optimizer/mo/front/tf/extractors/utils.py index 5b736df..0b71a94 100644 --- a/model-optimizer/mo/front/tf/extractors/utils.py +++ b/model-optimizer/mo/front/tf/extractors/utils.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/extractors/utils_test.py b/model-optimizer/mo/front/tf/extractors/utils_test.py index 51544cd..d278ccf 100644 --- a/model-optimizer/mo/front/tf/extractors/utils_test.py +++ b/model-optimizer/mo/front/tf/extractors/utils_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/graph_utils.py b/model-optimizer/mo/front/tf/graph_utils.py index 2a8454c..72891c0 100644 --- a/model-optimizer/mo/front/tf/graph_utils.py +++ b/model-optimizer/mo/front/tf/graph_utils.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,14 +17,15 @@ import collections import logging as log -import networkx as nx import numpy as np from mo.front.extractor import update_attrs -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.activation import Activation +from mo.ops.concat import Concat from mo.ops.const import Const from mo.ops.convolution import Convolution +from mo.ops.crop import Crop from mo.ops.reshape import Reshape from mo.ops.softmax import Softmax from mo.utils.error import Error @@ -55,6 +56,7 @@ def squeeze_reshape_and_concat(start_nodes: list): assert new_shape[2] == 1 new_shape = np.delete(new_shape, 2) cur_node.in_node(1).value = new_shape + cur_node.in_node(1).shape = np.array(new_shape.shape, dtype=np.int64) cur_node['dim'] = new_shape.copy() # run infer function once again cur_node.infer(cur_node) @@ -72,7 +74,7 @@ def squeeze_reshape_and_concat(start_nodes: list): q.append(node) -def add_convolution_to_swap_xy_coordinates(graph: nx.MultiDiGraph, input_node: Node, coordinates_size: int): +def add_convolution_to_swap_xy_coordinates(graph: Graph, input_node: Node, coordinates_size: int): """ The function add convolution node after the node 'input_node' to swap xy coordinates of the boxes produced by the node 'input_node'. 
It is expected that box coordinates are located in the fastest changing dimension of the @@ -121,7 +123,26 @@ def add_convolution_to_swap_xy_coordinates(graph: nx.MultiDiGraph, input_node: N return conv_op.create_node([input_reshape_4d_node, conv_filter_const_node], dict(name=input_node.name + "/conv")) -def add_activation_function_after_node(graph: nx.MultiDiGraph, node: Node, activation_function: str): +def add_fake_background_loc(graph: Graph, input_node: Node): + """ + The DetectionOutput layer expects that box coordinates contain coordinates of boxes for the "background" class as + well, but in the TensorFlow\* Object Detection API the tensor contains information about real object classes only. + The function copies a slice of the output data of the node 'input_node' and then concats it to the beginning of the + data; this slice serves as the fake "background" class box coordinates. The data in this slice is not used by the + DetectionOutput layer so the actual values are not important. This approach keeps the model reshape-able and does + not introduce many layers. + :param graph: graph to operate on. + :param input_node: node producing the boxes coordinates. + :return: concat node that prepends a slice of data for the "background" class. + """ + crop_op = Crop(graph, dict(axis=np.array([1]), offset=np.array([0]), dim=np.array([1]), nchw_layout=True)) + crop_node = crop_op.create_node([input_node], dict(name='crop_locs')) + + concat_op = Concat(graph, dict(axis=1, in_ports_count=2, nchw_layout=True)) + return concat_op.create_node([crop_node, input_node], dict(name=input_node.id + '/locs_with_fake_background')) + + +def add_activation_function_after_node(graph: Graph, node: Node, activation_function: str): """ The function adds node with activation function defined by string 'activation_function' which gets input from the node 'node'. diff --git a/model-optimizer/mo/front/tf/loader.py b/model-optimizer/mo/front/tf/loader.py index 8310e0a..ea93396 100644 --- a/model-optimizer/mo/front/tf/loader.py +++ b/model-optimizer/mo/front/tf/loader.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
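At the tensor level, the add_fake_background_loc helper above works like this: Crop(axis=1, offset=0, dim=1) takes a one-element slice along the class axis, and Concat(axis=1) prepends it, so e.g. a locations tensor of shape [N, num_classes, 4] becomes [N, num_classes + 1, 4] with the first slice acting as the ignored "background" entry. A rough numpy illustration of the effect (not Model Optimizer API; the function name below is hypothetical):

import numpy as np

def prepend_fake_background_locs(locs):
    # locs: [batch, num_classes, 4] box coordinates without a background class.
    # Crop(axis=1, offset=0, dim=1) keeps a single slice along axis 1 ...
    fake_background = locs[:, 0:1, :]
    # ... and Concat(axis=1) prepends it; DetectionOutput ignores these values.
    return np.concatenate([fake_background, locs], axis=1)

locs = np.zeros((2, 90, 4), dtype=np.float32)
assert prepend_fake_background_locs(locs).shape == (2, 91, 4)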
@@ -18,8 +18,6 @@ import logging as log import os import re -import networkx as nx - from mo.utils.error import Error, FrameworkError from mo.utils.utils import refer_to_faq_msg @@ -30,7 +28,7 @@ except ImportError: refer_to_faq_msg(42)) from google.protobuf import text_format -from mo.graph.graph import create_graph_with_nodes +from mo.graph.graph import create_graph_with_nodes, Graph from mo.utils.summarize_graph import summarize_graph @@ -258,22 +256,17 @@ def protobuf2nx(pb: tf.GraphDef): return graph -def variables_to_constants(graph: nx.MultiDiGraph, variables_values: dict): +def variables_to_constants(graph: Graph, variables_values: dict): """ Converts `Variable` operations to FakeConst operations with `value` from `variables_values` dictionary :param graph: graph to operate on :param variables_values: dictionary with variable names as keys and np.array data as values """ - variable_operations = ['Variable', 'VariableV2'] - for node_name in graph.nodes(): - node_attr_dict = graph.node[node_name] - if 'op' not in node_attr_dict: - continue - op_name = node_attr_dict['op'] - if op_name not in variable_operations: - continue + for node in graph.get_op_nodes(op='FakeConst'): + node_name = node.name + if node_name not in variables_values: - log.debug("There is no value for '{}': {} in checkpoint variable values".format(op_name, node_name)) + log.debug("There is no value for '{}': {} in checkpoint variable values".format(node.op, node_name)) continue - graph.node[node_name]['op'] = 'FakeConst' - graph.node[node_name]['value'] = variables_values[node_name] + + node['value'] = variables_values[node_name] diff --git a/model-optimizer/mo/front/tf/loader_test.py b/model-optimizer/mo/front/tf/loader_test.py index 326849f..58f4254 100644 --- a/model-optimizer/mo/front/tf/loader_test.py +++ b/model-optimizer/mo/front/tf/loader_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/front/tf/partial_infer/tf.py b/model-optimizer/mo/front/tf/partial_infer/tf.py index a7247b9..ef35889 100644 --- a/model-optimizer/mo/front/tf/partial_infer/tf.py +++ b/model-optimizer/mo/front/tf/partial_infer/tf.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -23,7 +23,7 @@ from google.protobuf import text_format from mo.front.extractor import node_defs_to_str from mo.front.tf.extractors.utils import tf_dtype_extractor, tf_tensor_shape, get_tf_node_port -from mo.graph.graph import Node, get_sorted_inputs, get_inputs, create_sub_graph_copy +from mo.graph.graph import Node from mo.utils.graph import node_incoming_neighbourhood, node_outcoming_neighbourhood @@ -41,7 +41,7 @@ def tf_native_tf_node_infer(node: Node): # Also the sub-graph contains names of the output nodes of the node to perform native infer. 
nodes_to_extract = node_incoming_neighbourhood(node.graph, node.id, 10) + node_outcoming_neighbourhood(node.graph, node.id, 1) - tmp_graph = create_sub_graph_copy(node.graph, nodes_to_extract) + tmp_graph = node.graph.create_sub_graph_copy(nodes_to_extract) tmp_node_attrs = tmp_graph.node[node.id] tmp_node = Node(tmp_graph, node.id) @@ -82,7 +82,7 @@ def generate_feed_dict(graph: tf.Graph, node: Node): """ all_constants = True feed_dict = dict() - for in_data_node_name, edge_attrs in get_inputs(node.graph, node.id): + for in_data_node_name, edge_attrs in node.get_inputs(): if 'control_flow_edge' in edge_attrs and edge_attrs['control_flow_edge']: continue value = node.in_node(edge_attrs['in']).value @@ -198,7 +198,7 @@ def add_placeholders_to_subgraph(node: Node): :return: None """ inputs_replacements = list() - for index, (in_data_node, edge_attrs) in enumerate(get_sorted_inputs(node)): + for index, (in_data_node, edge_attrs) in enumerate(node.get_sorted_inputs()): if 'control_flow_edge' in edge_attrs and edge_attrs['control_flow_edge']: continue diff --git a/model-optimizer/mo/front/tf/register_custom_ops.py b/model-optimizer/mo/front/tf/register_custom_ops.py index 70ebe56..7a11e9c 100644 --- a/model-optimizer/mo/front/tf/register_custom_ops.py +++ b/model-optimizer/mo/front/tf/register_custom_ops.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,18 +14,14 @@ limitations under the License. """ -from mo.back.replacement import BackReplacementPattern from mo.front.common.replacement import FrontReplacementOp, FrontReplacementPattern, FrontReplacementSubgraph from mo.front.extractor import FrontExtractorOp from mo.front.tf.replacement import FrontReplacementFromConfigFileSubGraph, FrontReplacementFromConfigFileOp, \ FrontReplacementFromConfigFileGeneral -from mo.middle.replacement import MiddleReplacementPattern -from mo.ops.op import Op -from mo.utils import class_registration -def update_registration(): - class_registration.update_registration([Op, FrontExtractorOp, FrontReplacementOp, FrontReplacementPattern, - FrontReplacementSubgraph, FrontReplacementFromConfigFileSubGraph, - FrontReplacementFromConfigFileOp, MiddleReplacementPattern, - BackReplacementPattern, FrontReplacementFromConfigFileGeneral]) +def get_front_classes(): + front_classes = [FrontExtractorOp, FrontReplacementOp, FrontReplacementPattern, FrontReplacementSubgraph, + FrontReplacementFromConfigFileSubGraph, FrontReplacementFromConfigFileOp, + FrontReplacementFromConfigFileGeneral] + return front_classes diff --git a/model-optimizer/mo/front/tf/replacement.py b/model-optimizer/mo/front/tf/replacement.py index b9e1e60..c9b48ee 100644 --- a/model-optimizer/mo/front/tf/replacement.py +++ b/model-optimizer/mo/front/tf/replacement.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2017-2018 Intel Corporation + Copyright (c) 2017-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
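The loader and partial-infer changes above follow the refactoring pattern that runs through this patch: module-level helpers operating on a bare nx.MultiDiGraph (get_inputs, get_sorted_inputs, create_sub_graph_copy, ...) become methods of the Graph and Node classes introduced in mo/graph/graph.py below, which is also why variables_to_constants can now simply query graph.get_op_nodes(op='FakeConst'). A self-contained toy mirroring Graph.get_nodes_with_attributes from the new graph.py (standalone illustration built on networkx, not an import of the real class):

import networkx as nx

class Graph(nx.MultiDiGraph):
    # Mirrors the new Graph.get_nodes_with_attributes: attribute filtering that
    # used to live in free functions is now a method on a Graph subclass.
    def get_nodes_with_attributes(self, **attrs):
        node_attrs = self.nodes(data=True)
        return [n for n, d in node_attrs if all(a in d.items() for a in attrs.items())]

g = Graph()
g.add_node('variable_1', kind='op', op='FakeConst')
g.add_node('data_1', kind='data')
assert g.get_nodes_with_attributes(kind='op', op='FakeConst') == ['variable_1']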
@@ -15,13 +15,11 @@ """ import logging as log -import networkx as nx - from mo.front.common.custom_replacement_registry import CustomReplacementRegistry from mo.front.common.replacement import FrontReplacementSubgraph, FrontReplacementPattern from mo.front.subgraph_matcher import SubgraphMatcher, SubgraphMatch from mo.front.tf.custom_subgraph_call import merge_nodes -from mo.graph.graph import dump_graph_for_graphviz, unique_id +from mo.graph.graph import Graph from mo.ops.op import Op from mo.utils import class_registration from mo.utils.graph import is_connected_component @@ -40,7 +38,7 @@ class FrontReplacementFromConfigFileGeneral(FrontReplacementPattern): def transform_graph(self, graph, replacement_descriptions): raise Exception('Function "transform_graph" must be overridden in the sub-class') - def find_and_replace_pattern(self, graph: nx.MultiDiGraph): + def find_and_replace_pattern(self, graph: Graph): replacement_descriptions = CustomReplacementRegistry().get_custom_replacement_description(self.replacement_id) if replacement_descriptions is None or len(replacement_descriptions) < 1: log.info("Failed to find custom replacement description with id '{}'".format(self.replacement_id)) @@ -72,10 +70,10 @@ class FrontReplacementFromConfigFileSubGraph(FrontReplacementSubgraph): def __init__(self): super().__init__() - def nodes_to_remove(self, graph: nx.MultiDiGraph, match: SubgraphMatch): + def nodes_to_remove(self, graph: Graph, match: SubgraphMatch): return match.matched_nodes_names() - def find_and_replace_pattern(self, graph: nx.MultiDiGraph): + def find_and_replace_pattern(self, graph: Graph): replacement_descriptions = CustomReplacementRegistry().get_custom_replacement_description(self.replacement_id) if replacement_descriptions is None: log.info("Failed to find custom replacement description with id '{}'".format(self.replacement_id)) @@ -87,7 +85,7 @@ class FrontReplacementFromConfigFileSubGraph(FrontReplacementSubgraph): if not is_connected_component(graph, match.matched_nodes_names()): log.warning("The following nodes don't form connected sub-graph: {}".format( match.matched_nodes_names())) - dump_graph_for_graphviz(graph, match.matched_nodes_names()) + graph.dump_graph_for_graphviz(match.matched_nodes_names()) self.replace_sub_graph(graph, match) registered_ops = {} @@ -111,7 +109,7 @@ class FrontReplacementFromConfigFileOp(FrontReplacementFromConfigFileSubGraph): super().__init__() def input_edges_match(self, # pylint: disable=method-hidden - graph: nx.DiGraph, + graph: Graph, match: SubgraphMatch, new_sub_graph: dict): """ @@ -131,7 +129,7 @@ class FrontReplacementFromConfigFileOp(FrontReplacementFromConfigFileSubGraph): return input_edges_match def output_edges_match(self, # pylint: disable=method-hidden - graph: nx.DiGraph, + graph: Graph, match: SubgraphMatch, new_sub_graph: dict): """ @@ -150,7 +148,7 @@ class FrontReplacementFromConfigFileOp(FrontReplacementFromConfigFileSubGraph): output_edges_match[(output_node.id, output_port)] = (new_sub_graph['new_node'].id, sub_graph_output_port) return output_edges_match - def generate_sub_graph(self, graph: nx.MultiDiGraph, match: SubgraphMatch): + def generate_sub_graph(self, graph: Graph, match: SubgraphMatch): replacement_desc = match.custom_replacement_desc op = Op.get_op_class_by_name(replacement_desc.op)(graph, match.custom_replacement_desc.custom_attributes) op.default_backend_attrs = list(match.custom_replacement_desc.custom_attributes.keys()) @@ -159,7 +157,7 @@ class 
FrontReplacementFromConfigFileOp(FrontReplacementFromConfigFileSubGraph): op.substitute_ie_attrs(op.attrs) node = merge_nodes(graph, match.matched_nodes_names(), replacement_desc.get_inputs_description(), replacement_desc.get_outputs_description()) - node.name = unique_id(graph, op.attrs['type']) + node.name = graph.unique_id(op.attrs['type']) node_attrs = graph.node[node.id] # copy attributes which are defined in the custom operation for key in op.attrs.keys(): diff --git a/model-optimizer/mo/graph/connection.py b/model-optimizer/mo/graph/connection.py new file mode 100644 index 0000000..0af1973 --- /dev/null +++ b/model-optimizer/mo/graph/connection.py @@ -0,0 +1,221 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +from collections import namedtuple +from copy import deepcopy +from mo.utils.error import Error + + +class Connection: + def __init__(self, graph, source, destinations: list): + self.graph = graph + self.source = source + self.destinations = destinations + self.data = namedtuple('Data', ['get_value', 'get_shape']) + self.data.get_value = self._get_value + self.data.get_shape = self._get_shape + + def _get_value(self): + if self.graph.stage == 'front': + return None + return self.source.node.out_node().value + + def _get_shape(self): + if self.graph.stage == 'front': + return None + return self.source.node.out_node().shape + + def get_source(self): + return self.source + + def get_destination(self): + if self.destinations and len(self.destinations) > 1: + raise Error("Connection has more than one destination: {}".format(len(self.destinations))) + return self.destinations[0] if self.destinations else None + + def get_destinations(self): + return self.destinations + + def set_source(self, port): + # In this method we are changing source for a connection with given port. + # See detailed example below. + # + # SOURCE - Op1(out_port:0) + # + # | Op4(in_port:0) + # DESTINATIONS - | Op3(in_port:0) + # | Op2(in_port:0) + # + # NEW PORT - Op5(out_port:0) + # + # ,--->Op4(in_port:0) + # CONNECTION ,--->Op3(in_port:0) + # Op1(out_port:0)--->Op2(in_port:0) + # + # When we set source for connection we disconnect existing source and reconnect all consumers to + # the new given port with type='out'. + # + # UPDATED CONNECTION ,--->Op4(in_port:0) + # ,--->Op3(in_port:0) + # Op5(out_port:0)--->Op2(in_port:0) + # + + if port.type == 'in': + raise Error("Wrong port type in set_source method. 
Should be 'out' but given 'in'") + + if self.graph.stage == 'front': + src_node = port.node + # Reconnecting all destinations as consumers to the source port preserving edge attrs + for dst_port in self.destinations: + edge_attrs, u, v, key = dst_port.get_in_edge_attrs(data=True) + if u is not None: + edge_attrs['out'] = port.idx + self.graph.remove_edge(u, v, key=key) + self.graph.add_edge(src_node.id, v, **edge_attrs) + else: + self.graph.create_edge(src_node, dst_port.node, port.idx, dst_port.idx) + else: + # Create the out data node if it does not exist and mark the node with need_shape_inference = True. + # If the data node already exists, just use it. + port._create_data_if_necessary() + port_out_data = port.node.out_node(port.idx) + + if self.source is not None and self.source.idx in self.source.node.out_nodes(): + source_out_data = self.source.node.out_node(self.source.idx) + # Copy attrs from source_out_data to port_out_data + attrs = deepcopy(source_out_data.attrs()) + for attr in attrs: + port_out_data[attr] = attrs[attr] + + for dst_port in self.destinations: + edge_attrs, u, v, key = dst_port.get_in_edge_attrs(data=True) + if u is not None: + self.graph.remove_edge(u, v, key=key) + self.graph.add_edge(port_out_data.id, v, **edge_attrs) + else: + self.graph.add_edge(port_out_data.id, dst_port.node.id, **{'in': dst_port.idx}) + + def set_destination(self, port): + # In this method we are changing destination for a connection with given port with type 'in'. + # This method requires exactly one destination or empty destinations list. + # See detailed example below. + # + # SOURCE - Op1(out_port:0) + # + # DESTINATIONS - Op2(in_port:0) + # + # NEW PORT - Op3(in_port:0) + # + # CONNECTION + # Op1(out_port:0)--->Op2(in_port:0) + # + # When we set destination for connection we disconnect destination port if exists and connect source to + # the new given port with type='in'. + # + # UPDATED CONNECTION + # + # Op1(out_port:0)--->Op3(in_port:0) + # + + def check_and_remove_edge(): + if self.destinations: + for destination in self.destinations: + edge_attrs, u, v, key = destination.get_in_edge_attrs(data=True) + if u is None: + raise Error( + "Broken Connection object! Destination (node:{}) is not connected to source.".format( + destination.node.name)) + destination.disconnect() + + if self.destinations and len(self.destinations) > 1: + raise Error("set_destination is applicable only for connections that have exactly one destination or \ + no destinations at all") + + if port.type == 'out': + raise Error("Wrong port type in set_destination method. Should be 'in' but given 'out'") + + if self.graph.stage == 'front': + if self.source is not None: + node = self.source.node + check_and_remove_edge() + self.graph.create_edge(node, port.node, out_port=self.source.idx, in_port=port.idx) + self.destinations = [port] + else: + # create the out data node if it does not exist and mark the node with need_shape_inference = True + # if the data node already exists, just use it as is + if self.source is not None: + data_node = self.source._create_data_if_necessary() + check_and_remove_edge() + self.graph.add_edge(data_node.id, port.node.id, **{'in': port.idx}) + self.destinations = [port] + + def add_destination(self, port): + # In this method we are adding destination port with type 'in' for a connection. + # See detailed example below.
+ # + # SOURCE - Op1(out_port:0) + # + # DESTINATIONS - Op2(in_port:0) + # + # NEW PORT - Op3(in_port:0) + # + # CONNECTION + # Op1(out_port:0)--->Op2(in_port:0) + # + # When we add a destination to a connection we connect the source to the new given port with type='in', + # keeping all existing destinations. + # + # UPDATED CONNECTION + # ,-->Op3(in_port:0) + # Op1(out_port:0)--->Op2(in_port:0) + # + + if self.source is None: + raise Error("Can not add destination for connection without source port!") + + if self.graph.stage == 'front': + node = self.source.node + self.graph.create_edge(node, port.node, out_port=self.source.idx, in_port=port.idx) + else: + data_node = self.source._create_data_if_necessary() + self.graph.add_edge(data_node.id, port.node.id, **{'in': port.idx}) + + self.destinations.append(port) + + def remove(self): + # This method deletes all edges in connection. After that the connection is no longer accessible. + # See detailed example below. + # + # SOURCE - Op1(out_port:0) + # + # | Op4(in_port:0) + # DESTINATIONS - | Op3(in_port:0) + # | Op2(in_port:0) + # + # ,--->Op4(in_port:0) + # CONNECTION ,--->Op3(in_port:0) + # Op1(out_port:0)--->Op2(in_port:0) + # + # After removing the edges the connection is empty + # + # REMOVED CONNECTION + # Op1(out_port:0) Op4(in_port:0) Op2(in_port:0) Op3(in_port:0) + # + + if self.destinations: + for dst_port in self.destinations: + dst_port.disconnect() + self.source = None + self.destinations = [] diff --git a/model-optimizer/mo/graph/graph.py b/model-optimizer/mo/graph/graph.py index e44b108..12cbbed 100644 --- a/model-optimizer/mo/graph/graph.py +++ b/model-optimizer/mo/graph/graph.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,239 +21,28 @@ from copy import deepcopy import networkx as nx import numpy as np +from mo.graph.port import Port from mo.utils.error import Error -from mo.utils.utils import refer_to_faq_msg +from mo.utils.utils import refer_to_faq_msg, deprecated_api, shrink_str_value -def unique_id(graph: nx.MultiDiGraph, prefix: str = ""): - """ - Generates a unique node id for a new node in a given graph. - The optional string prefix can be specified. - """ - # TODO thread safety? - unique_id.count = max(unique_id.count, graph.number_of_nodes()) + 1 - if prefix and not graph.has_node(prefix): - return str(prefix) - while graph.has_node(prefix + str(unique_id.count)): - unique_id.count += 1 - return prefix + str(unique_id.count) - - -unique_id.count = 0 - - -def get_node_id_by_name(graph: nx.MultiDiGraph, name: str): - for node in graph.nodes(): - if 'name' in graph.node[node] and graph.node[node]['name'] == name: - return node - raise Error('No node with name {}. ' + - refer_to_faq_msg(51), name) - - -def create_graph_with_nodes(src_nodes, get_id: callable, get_attrs: callable): - """ - Go over all nodes in src_nodes that should be enumerable and create new NX nodes - using get_id and get_attrs functions to create node id and node attributes correspondingly. - """ - graph = nx.MultiDiGraph() - for node in src_nodes: - graph.add_node(get_id(node), **get_attrs(node)) - return graph - - -# TODO implement merging for keys with dictionary values? -def merge_edge_props(attrs: dict, additional_attrs: dict): - """ - Update edge attributes without changing 'in' and 'out' keys.
- It is necessary to copy edge attributes during merging of nodes when - result of one subgraph call is passed as input to another subgraph call - """ - result = attrs - for (key, value) in additional_attrs.items(): - if key not in ['in', 'out']: - if type(additional_attrs[key]) is list: - if key not in result: - result[key] = [] - result[key].extend(additional_attrs[key]) - result[key] = list(set(result[key])) # silly solution to find unique elements - else: - result[key] = value - return result - - -def print_graph_stat(graph: nx.MultiDiGraph): - log.debug('Number of nodes in graph: {}'.format(graph.number_of_nodes())) - log.debug('Number of edges in graph: {}'.format(len(list(graph.edges())))) - ops = collections.defaultdict(int) - for _node in graph.nodes(): - node = NodeWrap(graph, _node) - kind = node.kind if node.has('kind') else '' - if node.has('op'): - ops['op/' + node.op] += 1 - else: - ops[kind] += 1 - if node.has('shape') and np.any(node.shape == 0): - log.error("Found bad shape: '{}' for node '{}'".format(node.shape, node.node)) - for k, v in ops.items(): - log.debug(' {} : {}'.format(k, v)) - - -def get_inputs_with_ports(graph, match, pattern_edges, input_names_in_pattern): - """ - Front replacements of multi-input nodes should specify output port to add_node-like functions - This function is a helper to get such information out of matched nodes - :param graph: graph to operate on - :param match: dictionary returned by matching function - :param pattern_edges: edges that are specified in pattern - :param input_names_in_pattern: names of matched nodes as they were specified in pattern that should be in - resulting list - :return: list of tuples of node and output port - """ - inputs = [] - for name in input_names_in_pattern: - assert name in match, "node named {} not in match {}".format(name, match) - src = match[name] - dst = [] - for edge in pattern_edges: - if edge[0] == name: - assert edge[1] in match, "name from pattern_edges {} not in match {}".format(edge[1], match) - dst.append(match[edge[1]]) - if len(dst) != 1: - raise Error('Multiple output ports detected for node {} as {} in pattern'.format(match[name].id, name)) - dst = dst[0] - out_port = graph.get_edge_data(src.id, dst.id)[0]['out'] - inputs.append((src, out_port)) - return inputs - - -def dump_graph_for_graphviz(graph: nx.MultiDiGraph, node_attrs: list = ['kind', 'op', 'shape'], - edge_attrs: list = ['in', 'out'], - nodes_to_dump: list = None, save_to_svg = False): - log.debug("---- GRAPHVIZ OUTPUT STARTS ----") - if nodes_to_dump is None: - nodes_to_dump = graph.nodes() - string = '\ndigraph {\n' - visited_nodes = set() - for src_node_name, dst_node_name, attrs in graph.edges(data=True): - visited_nodes.add(src_node_name) - visited_nodes.add(dst_node_name) - if src_node_name not in nodes_to_dump or dst_node_name not in nodes_to_dump: - continue - src_node = graph.node[src_node_name] - dst_node = graph.node[dst_node_name] - src_node_string = str(src_node_name) + '\\n' + '\\n'.join( - [str(key) + '=' + str(src_node.get(key, 'None')) for key in node_attrs if key in src_node]) - dst_node_string = str(dst_node_name) + '\\n' + '\\n'.join( - [str(key) + '=' + str(dst_node.get(key, 'None')) for key in node_attrs if key in dst_node]) - edge_string = ' '.join([str(key) + '=' + str(attrs.get(key, 'None')) for key in edge_attrs if key in attrs]) - string += '"{}" -> "{}" [label = "{}"];\n'.format(src_node_string, dst_node_string, edge_string) - for node in nodes_to_dump: - if node not in visited_nodes: - string += 
'"{}"'.format(node) # TODO: add attributes like it was done in the loop above - visited_nodes.add(node) - string += '}' - log.debug(string) - log.debug("---- GRAPHVIZ OUTPUT ENDS ----") - - if save_to_svg: - try: - import graphviz - import os - file_name = "{}_{}.txt".format(graph.name.replace('/', '_'), 0) - id = 1 - while os.path.exists(file_name): - file_name = "{}_{}.txt".format(graph.name.replace('/', '_'), id) - id += 1 - with open(file_name, "w") as f: - f.write(string) - graphviz.render('dot','svg', file_name) - print('Graph was saved to {}.{}'.format(file_name, 'svg')) - except ImportError: - raise ImportError('Can\'t import graphviz') - except Exception as e: - raise Error('Can\'t save graph to svg') from e - - return string - - -def create_sub_graph_copy(graph: nx.MultiDiGraph, nodes_to_extract: list): - """ - Create new graph which is a sub-graph of the 'graph' that contains just nodes from 'nodes_to_extract' list. The - returned sub-graph is a deep copy of the provided graph nodes. - :param graph: graph to create a sub-graph from. - :param nodes_to_extract: list of node names to extract. - :return: new graph. - """ - return graph.subgraph(nodes_to_extract).copy() - - -def get_inputs(graph: nx.MultiDiGraph, node: str, edge_attr: dict = {}, control_flow: bool = False): - in_edges = graph.in_edges(node, data=True) - if not control_flow: - in_edges = [(u, v, d) for u, v, d in in_edges if 'control_flow_edge' not in d or not d['control_flow_edge']] - return [(u, d) for u, v, d in in_edges if all([attr in d and d[attr] == edge_attr[attr] for attr in edge_attr])] - - -def get_outputs(graph: nx.MultiDiGraph, node: str, edge_attr: dict = {}, control_flow: bool = False): - out_edges = graph.out_edges(node, data=True) - if not control_flow: - out_edges = [(u, v, d) for u, v, d in out_edges if 'control_flow_edge' not in d or not d['control_flow_edge']] - return [(v, d) for u, v, d in out_edges if all([attr in d and d[attr] == edge_attr[attr] for attr in edge_attr])] - - -def get_single_input(graph: nx.MultiDiGraph, node: str, edge_attr: dict = {}): - """ - Searches for all edges that have given attributes. - If there no such edges or there are multiple edges, raise exception. - If there is only one edge, returns the source node for this edge - and the edge attributes themselves. - """ - inputs = get_inputs(graph, node, edge_attr) - if len(inputs) != 1: - log.debug("Node '{}' has {} inputs with edge attributes '{}'".format(node, inputs, str(edge_attr))) - raise AttributeError( - "None or multiple inputs satisfy given attributes. Node: " + str(node) + ", edge_attr: " + str(edge_attr)) - return inputs[0] +def dict_to_ordered_dict(d: dict): + return collections.OrderedDict(sorted(d.items(), key=lambda t: t[0])) -def get_single_output(graph: nx.MultiDiGraph, node: str, edge_attr: dict = {}): - outputs = get_outputs(graph, node, edge_attr) - if len(outputs) != 1: - log.debug("Node '{}' has {} outputs with edge attributes '{}'".format(node, outputs, str(edge_attr))) - raise AttributeError( - "None or multiple outputs satisfy given attributes. 
Node: " + str(node) + ", edge_attr: " + str(edge_attr)) - return outputs[0] +class Node: + def __init__(self, graph, node: str): + if node not in graph: + raise AttributeError("Attempt to access node {} that not in graph".format(node)) + super(Node, self).__setattr__('graph', graph) + super(Node, self).__setattr__('node', node) # obsolete + super(Node, self).__setattr__('id', node) -def get_graph_ops(graph: nx.MultiDiGraph): - return [Node(graph, node) for node in graph.nodes() if Node(graph, node).soft_get('kind') == 'op'] - - -def dict_includes_compare_attrs(attr, attr_probe): - if callable(attr_probe) and not isinstance(attr_probe, type): - return attr_probe(attr) - else: - return attr == attr_probe - -def dict_includes(big: dict, sub_dict: dict): - ''' Searches attributes from sub_dict in big and ensures that all values match. - - Entries in sub_dict can be of two types: callable or not callable. If callable is specified - it is treated as probing function for attribute value from big dictionary by callable(attr) expression. - If it is not callable, the values are compared with == operator. - ''' - return all( - dict_includes_compare_attrs(big.get(attr, None), sub_dict[attr]) - for attr in sub_dict.keys() - ) - - -class NodeWrap: - - def __init__(self, graph: nx.MultiDiGraph, node: str): - super(NodeWrap, self).__setattr__('graph', graph) - super(NodeWrap, self).__setattr__('node', node) # obsolete - super(NodeWrap, self).__setattr__('id', node) + def __str__(self, max_length: int = 100): + node_dict = self.graph.node[self.id] + print_dict = {k: v if k != 'value' else shrink_str_value(v, max_symbols=max_length) for k, v in node_dict.items()} + return str(print_dict) def __setattr__(self, k, v): # you can assign only existing attributes @@ -266,6 +55,61 @@ class NodeWrap: # hope it raises AttributeError if k is not in the dict return self.graph.node[self.node][k] + def __getitem__(self, k): + return self.graph.node[self.node][k] + + def __setitem__(self, k, v): + self.graph.node[self.node][k] = v + + def __contains__(self, k): + return self.has(k) + + def add_input_port(self, idx): + if not self.has_valid('_in_ports'): + Node(self.graph, self.id)['_in_ports'] = set() + if idx in self.in_ports(): + raise Error("Input port with {} index already exists for {} node.".format(idx, self.name)) + self._in_ports.add(idx) + + def add_output_port(self, idx): + if not self.has_valid('_out_ports'): + Node(self.graph, self.id)['_out_ports'] = set() + if idx in self.out_ports(): + raise Error("Output port with {} index already exists for {} node.".format(idx, self.name)) + self._out_ports.add(idx) + + def in_port(self, idx=None) -> Port: + if not self.has_valid('_in_ports'): + raise Error("Operation {} {} has no _in_ports attribute", self.op, self.name) + if idx not in self._in_ports: + raise Error("Input port with index {} is not in node {}".format(idx, self.name)) + return Port(node=self, idx=idx, type='in') + + def in_ports(self): + if not self.has_valid('_in_ports'): + raise Error("Operation {} {} has no _in_ports attribute", self.op, self.name) + return dict_to_ordered_dict({idx: self.in_port(idx) for idx in self._in_ports}) + + def out_port(self, idx=None) -> Port: + if not self.has_valid('_out_ports'): + raise Error("Operation {} {} has no _out_ports attribute", self.op, self.name) + if idx not in self._out_ports: + raise Error("Output port with index {} is not in node {}".format(idx, self.name)) + return Port(node=self, idx=idx, type='out') + + def out_ports(self): + if not 
self.has_valid('_out_ports'): + raise Error("Operation {} {} has no _out_ports attribute", self.op, self.name) + return dict_to_ordered_dict({idx: self.out_port(idx) for idx in self._out_ports}) + + def has_port(self, port_type, idx): + assert port_type in ['in', 'out'], "Invalid usage of has_port method" + + if port_type == 'in': + return self.has_valid('_in_ports') and idx in self.in_ports() + else: + return self.has_valid('_out_ports') and idx in self.out_ports() + def attrs(self): return self.graph.node[self.node] @@ -278,55 +122,50 @@ class NodeWrap: def has_and_set(self, k): return self.has_valid(k) and self[k] - def __getitem__(self, k): - return self.graph.node[self.node][k] - - def __setitem__(self, k, v): - self.graph.node[self.node][k] = v - - def __contains__(self, k): - return self.has(k) - def in_nodes_edges(self, control_flow: bool=False): - return {x[1]['in']: (NodeWrap(self.graph, x[0]), x[1]) for x in get_inputs(self.graph, self.node, control_flow=control_flow)} + return dict_to_ordered_dict({x[1]['in']: (Node(self.graph, x[0]), x[1]) for x in + self.get_inputs(control_flow=control_flow)}) def in_nodes(self, control_flow: bool=False): - assert self.has('kind') - assert self.kind in ['op', 'data'] + assert self.has('kind') # TODO: remove as it always exists + assert self.kind in ['op', 'data'] # TODO: remove as it always exists if self.kind == 'op': - return {x[1]['in']: NodeWrap(self.graph, x[0]) for x in get_inputs(self.graph, self.node, control_flow=control_flow)} + return dict_to_ordered_dict({x[1]['in']: Node(self.graph, x[0]) for x in + self.get_inputs(control_flow=control_flow)}) elif self.kind == 'data': - return [NodeWrap(self.graph, n) for n, d in get_inputs(self.graph, self.node, control_flow=control_flow)] + return [Node(self.graph, n) for n, d in self.get_inputs(control_flow=control_flow)] + + def in_node(self, key=0, control_flow: bool=False): + return self.in_nodes(control_flow=control_flow)[key] def in_edges(self, control_flow: bool=False): assert self.has('kind') assert self.kind in ['op', 'data'] if self.kind == 'op': - return {x[1]['in']: x[1] for x in get_inputs(self.graph, self.node, control_flow=control_flow)} + return dict_to_ordered_dict({x[1]['in']: x[1] for x in self.get_inputs(control_flow=control_flow)}) elif self.kind == 'data': - return [d for n, d in get_inputs(self.graph, self.node, control_flow=control_flow)] + return [d for n, d in self.get_inputs(control_flow=control_flow)] def out_nodes_edges(self, control_flow: bool=False): - return {x[1]['out']: (NodeWrap(self.graph, x[0]), x[1]) for x in get_outputs(self.graph, self.node, control_flow=control_flow)} + return dict_to_ordered_dict({x[1]['out']: (Node(self.graph, x[0]), x[1]) for x in + self.get_outputs(control_flow=control_flow)}) def out_nodes(self, control_flow: bool=False): assert self.has('kind') assert self.kind in ['op', 'data'] if self.kind == 'op': - return {x[1]['out']: NodeWrap(self.graph, x[0]) for x in get_outputs(self.graph, self.node, control_flow=control_flow)} + return dict_to_ordered_dict({x[1]['out']: Node(self.graph, x[0]) for x in + self.get_outputs(control_flow=control_flow)}) elif self.kind == 'data': - return [NodeWrap(self.graph, n) for n, d in get_outputs(self.graph, self.node, control_flow=control_flow)] + return [Node(self.graph, n) for n, d in self.get_outputs(control_flow=control_flow)] def out_edges(self, control_flow: bool=False): assert self.has('kind') assert self.kind in ['op', 'data'] if self.kind == 'op': - return {x[1]['out']: x[1] for x in 
get_outputs(self.graph, self.node, control_flow=control_flow)} + return dict_to_ordered_dict({x[1]['out']: x[1] for x in self.get_outputs(control_flow=control_flow)}) elif self.kind == 'data': - return [d for n, d in get_outputs(self.graph, self.node, control_flow=control_flow)] - - def in_node(self, key=0, control_flow: bool=False): - return self.in_nodes(control_flow=control_flow)[key] def out_node(self, key=0, control_flow: bool=False): return self.out_nodes(control_flow=control_flow)[key] @@ -340,32 +179,71 @@ class NodeWrap: def get_attrs(self): return self.graph.node[self.node] + def get_inputs(self, edge_attr: dict = None, control_flow: bool = False): + if edge_attr is None: + edge_attr = {} + in_edges = self.graph.in_edges(self.id, data=True) + if not control_flow: + in_edges = [(u, v, d) for u, v, d in in_edges if 'control_flow_edge' not in d or not d['control_flow_edge']] + return [(u, d) for u, v, d in in_edges if all([attr in d and d[attr] == edge_attr[attr] for attr in edge_attr])] + + def get_outputs(self, edge_attr: dict = None, control_flow: bool = False): + if edge_attr is None: + edge_attr = {} + out_edges = self.graph.out_edges(self.id, data=True) + if not control_flow: + out_edges = [(u, v, d) for u, v, d in out_edges if + 'control_flow_edge' not in d or not d['control_flow_edge']] + return [(v, d) for u, v, d in out_edges if + all([attr in d and d[attr] == edge_attr[attr] for attr in edge_attr])] + + def get_sorted_inputs(self, control_flow: bool = False): + return sorted([x for x in self.get_inputs(control_flow=control_flow) if 'in' in x[1]], + key=lambda x: x[1]['in']) + + def get_sorted_outputs(self, control_flow: bool = False): + return sorted([x for x in self.get_outputs(control_flow=control_flow) if 'out' in x[1]], + key=lambda x: x[1]['out']) + def soft_get(self, k): return self[k] if self.has_valid(k) else '' def edges(self, attrs: dict=None): - ''' Get a list of all edges with specified set of attributes. + """ Get a list of all edges with specified set of attributes. Edge is represented as tuple (u, v, d), where u is source node, - v is destination node and d is edge attributes. The function - returns a list of such tuples. - ''' + v is destination node and d is edge attributes. The function + returns a list of such tuples. + """ edges = list(self.graph.in_edges([self.id], data=True)) + list(self.graph.out_edges([self.id], data=True)) - return [(u, v, d) for u,v,d in edges if dict_includes(d, attrs)] + return [(u, v, d) for u, v, d in edges if dict_includes(d, attrs)] def edge(self, attrs: dict=None): - ''' Get a single edge with specified set of attributes. + """ Get a single edge with specified set of attributes. If none or multiple edges satisfies this criteria, exception is raised Edge is represented as tuple (u, v, d), where u is source node, v is destination node and d is edge attributes.
- ''' + """ edges = self.edges(attrs) assert len(edges) == 1, 'edges: {}, required attributes: {}'.format(edges, attrs) return edges[0] + def copy_node(self, new_attrs: dict = None, dst_graph=None): + ''' Copies node with all attributes (optionally updated) within the same graph or to different graph.''' + if new_attrs is None: + new_attrs = {} + if dst_graph is None: + dst_graph = self.graph + + attrs = deepcopy(self.attrs()) + attrs.update(new_attrs) + new_id = dst_graph.unique_id() + dst_graph.add_node(new_id, **attrs) + return Node(dst_graph, new_id) + def insert_node_with_data_before(self, inp, new_op_class: callable, op_before_params: dict = None, - infer_current: bool = False): + infer_current: bool = False, additional_inputs: list = None): """ Inserts operation node with op_before_params and data node before current operation @@ -389,18 +267,26 @@ class NodeWrap: new_op_before = new_op_class(graph, op_before_params) edge_attrs = deepcopy(graph.get_edge_data(inp.id, node.id)[0]) graph.remove_edge(inp.id, node.id) - new_inp = new_op_before.create_node_with_data([inp], {'name': node.name + cls_name + '/Before'}) + # form a list of input nodes for a new op node combining new_out and additional_inputs + inputs = [inp] + (additional_inputs if additional_inputs else []) + new_inp = new_op_before.create_node_with_data(inputs, {'name': node.name + cls_name + '/Before'}) graph.add_edge(new_inp.id, node.id, **edge_attrs) if infer_current: node.infer(node) - def insert_node_with_data_after(self, out, new_op_class: callable, op_after_params: dict = None): + def insert_node_with_data_after(self, out, new_op_class: callable, op_after_params: dict = None, + additional_inputs: list = None): """ Inserts operation node with op_after_params and data node after current operation :param out: output data node of current node :param new_op_class: class of operation that will be inserted after current operation node :param op_after_params: parameters to be added to operation that will be inserted after current operation + :param additional_inputs: other parameters for a new operation node in addition to one that is created + at the 'out' placed; new nodes are added after 0-th input + + TODO Allow indexing for input parameters as well as for 'out' data node to explicitly + specify ports that are connected to. Before calling: [...] -> Cur_Op -> Cur_Data -> [...] @@ -421,7 +307,9 @@ class NodeWrap: graph.remove_edge(node.id, out.id) new_out = Op.create_data_node(graph, node) node.infer(node) - new_op_after.create_node_with_data([new_out], {'name': node.name + cls_name + '/After'}, data_nodes=out) + # form a list of input nodes for a new op node combining new_out and additional_inputs + inputs = [new_out] + (additional_inputs if additional_inputs else []) + new_op_after.create_node_with_data(inputs, {'name': node.name + cls_name + '/After'}, data_nodes=out) def bracket_with_different_nodes_with_data(self, inp, out, new_op_class_before: callable, new_op_class_after: callable, @@ -469,19 +357,499 @@ class NodeWrap: new_op_class_before=new_op_class, new_op_class_after=new_op_class, op_before_params=op_before_params, op_after_params=op_after_params) + def insert_node_after(self, new_node, node_out_port: int = 0): + """ + Insert node 'new_node' after output with index 'node_out_port' of the node 'node'. All consumers of node 'node' + output with index 'node_out_port' will be changed to consume node 'new_node'. + The function should be used when graph doesn't contain data nodes yet. 
+ :param new_node: node to be inserted. + :param node_out_port: the output port index of this node after which to insert + :return: None + """ + assert self.graph is new_node.graph + assert (len([name for name in self.graph.nodes() if Node(self.graph, name).soft_get('kind') == 'data']) == 0) -class Node(NodeWrap): - pass + graph = self.graph + old_edges = list(graph.out_edges(self.id, data=True, keys=True)) + # create new edges first and then remove all old edges. This is needed for case when 'node' has several consumers + # getting input from 'node_out_port'. + # save tuple ("name of the destination edge", "edge key") to be removed + node_name_and_edge_key = [] + for _, dst_name, edge_key, edge_attrs in old_edges: + if edge_attrs['out'] == node_out_port: + log.debug('Create edge from "{}" to "{}"'.format(new_node.name, dst_name)) + graph.create_edge(new_node, Node(graph, dst_name), 0, edge_attrs['in']) + node_name_and_edge_key.append((dst_name, edge_key)) + for dst_name, edge_key in node_name_and_edge_key: + log.debug('Remove edge from "{}" to "{}"'.format(self.id, dst_name)) + graph.remove_edge(self.id, dst_name, edge_key) + graph.create_edge(self, new_node, node_out_port, 0, {}) + + def replace_node(self, new_node, new_node_out_port: int=None): + """ + Replaces this node with a node 'new_node' preserving edge attributes. + :param new_node: node to replace with. + :param new_node_out_port: output port index on 'new_node' recorded in the reconnected output edges. + :return: None + """ + assert self.graph is new_node.graph + assert self.id != new_node.id, "New node and replaceable node are the same" + graph = self.graph + # save output edges and reconnect them to new node + for _, dst_node_name, edge_attrs in graph.out_edges(self.id, data=True): + new_edge_attrs = deepcopy(edge_attrs) + if new_node_out_port is not None: + assert 'out' not in edge_attrs or edge_attrs['out'] == 0, \ + 'replace_node function can replace old node with a single output port only if new_node_out_port is ' \ + 'specified' + new_edge_attrs.update({'out': new_node_out_port}) + graph.add_edge(new_node.id, dst_node_name, **new_edge_attrs) + + # if the node for replace is output node then we propagate this attribute to a new node + if len(self.out_nodes()) == 1 and self.out_node().has('op') and self.out_node().op == 'OpOutput': + graph.remove_node(self.out_node().id) + add_opoutput(graph, new_node.id, 0, False) + graph.remove_node(self.id) + + def input_ports_with(self, node): + """ + Returns a list of integers that specify input ports that are connected to a given node.
+ :param node: node in the graph that is expected to appear at input port for self node + :return: a list of integers with port indices that are connected to self node + """ + return [i for i in range(len(self.in_nodes())) if self.in_node(i).id == node.id] + +class Graph(nx.MultiDiGraph): + def __init__(self, data=None, **attr): + self.stage = None + super().__init__(data, **attr) + + unique_id_count = 0 + + # SAFE API DESCRIPTION + # all provided methods below are designed to be more safe and convenient + # be careful while using other methods from nx.MultiDiGraph + + def add_node(self, node_for_adding, **attrs): + # TODO: check required attrs for node + super().add_node(node_for_adding, **attrs) + node = Node(self, node_for_adding) + + in_ports_count = node.in_ports_count if node.has_valid('in_ports_count') else None + out_ports_count = node.out_ports_count if node.has_valid('out_ports_count') else None + + node['_in_ports'] = set() + node['_out_ports'] = set() + + if in_ports_count is not None: + for idx in range(in_ports_count): + node.add_input_port(idx=idx) + + if out_ports_count is not None: + for idx in range(out_ports_count): + node.add_output_port(idx=idx) + + def add_edge(self, u_for_edge, v_for_edge, key=None, **attr): + return super().add_edge(u_for_edge, v_for_edge, key=key, **attr) + + def add_edges_from(self, ebunch_to_add, **attr): + for e in ebunch_to_add: + ne = len(e) + if ne == 4: + u, v, key, dd = e + elif ne == 3: + u, v, dd = e + key = None + elif ne == 2: + u, v = e + dd = {} + key = None + else: + raise Error("Edge tuple %s must be a 2-tuple, 3-tuple or 4-tuple." % (e,)) + ddd = attr.copy() + ddd.update(dd) + self.add_edge(u, v, key=key, **ddd) + def remove_edge(self, u, v, key=None): + return super().remove_edge(u, v, key=key) -def get_sorted_inputs(node: Node, control_flow: bool=False): - return sorted([x for x in get_inputs(node.graph, node.node, control_flow=control_flow) if 'in' in x[1]], key=lambda x: x[1]['in']) + def erase_node(self, node: Node): + """ + Erases node from the graph and reconnect edges from input node(s) to output node(s) + Produces assertion error if the node being removed has multiple inputs or outputs. + The function can be used in the front phase only (when there are no data nodes in the graph). + :param node: Node to erase + """ + node_id = node.id + + inputs = list(self.in_edges(node_id, data=True)) + outputs = list(self.out_edges(node_id, data=True)) + + assert node.kind == 'op' and (len(node.out_nodes()) == 0 or list(node.out_nodes().values())[0].kind != 'data'), \ + "The function must be used before the partial infer when graph doesn't contain data nodes." 
+ assert len(node.out_nodes()) <= 1, "The node {} must produce just one output tensor".format( + node.soft_get('name')) + assert len(inputs) <= 1, "The node {} must have just one input".format(node.soft_get('name')) + + if len(outputs) == 0 and len(inputs) != 0: + from mo.front.extractor import add_output_ops + input_ids = {input_node_id: {'port': {'out': [attrs['out']]}} for input_node_id, _, attrs in inputs} + if node.has('op') and node.op == 'OpOutput': + add_output_ops(self, input_ids) + + if len(outputs) == 0 or len(inputs) == 0: + self.remove_node(node_id) + return + + input_node_id = inputs[0][0] + for src, dst, attrs in outputs: + self.remove_edge(src, dst) + # update the 'out' attribute of the edge from the node being removed + attrs['out'] = inputs[0][2]['out'] + self.add_edge(input_node_id, dst, **attrs) + self.remove_node(node_id) + + def get_edge_data(self, u, v, key=None, default=None): + return super().get_edge_data(u, v, key=key, default=default) + + def get_inputs_with_ports(self, match, pattern_edges, input_names_in_pattern): + """ + Front replacements of multi-input nodes should specify output port to add_node-like functions + This function is a helper to get such information out of matched nodes + :param graph: graph to operate on + :param match: dictionary returned by matching function + :param pattern_edges: edges that are specified in pattern + :param input_names_in_pattern: names of matched nodes as they were specified in pattern that should be in + resulting list + :return: list of tuples of node and output port + """ + inputs = [] + for name in input_names_in_pattern: + assert name in match, "node named {} not in match {}".format(name, match) + src = match[name] + dst = [] + for edge in pattern_edges: + if edge[0] == name: + assert edge[1] in match, "name from pattern_edges {} not in match {}".format(edge[1], match) + dst.append(match[edge[1]]) + if len(dst) != 1: + raise Error('Multiple output ports detected for node {} as {} in pattern'.format(match[name].id, name)) + dst = dst[0] + out_port = self.get_edge_data(src.id, dst.id)[0]['out'] + inputs.append((src, out_port)) + return inputs + + def get_node_id_by_name(self, name: str): + for node in self.nodes(): + if 'name' in self.node[node] and self.node[node]['name'] == name: + return node + raise Error('No node with name {}. ' + + refer_to_faq_msg(51), name) + + def get_op_nodes(self, **attrs): + nodes = self.get_nodes_with_attributes(**dict(kind='op', **attrs)) + return [Node(self, node) for node in nodes] + + def get_data_nodes(self, has_value=None): + """ + Returns list of data nodes. + If has_value = True, returns data nodes with value + If has_value = False, returns data nodes without value + """ + data_nodes = [Node(self, node) for node in self.nodes() if Node(self, node).soft_get('kind') == 'data'] + return [node for node in data_nodes if has_value is None or node.has_valid('value') == has_value] + + def get_nodes_with_attributes(self, **attrs: dict): + node_attrs = self.nodes(data=True) + return [n for n, d in node_attrs if all(a in d.items() for a in attrs.items())] + + def unique_id(self, prefix: str = ""): + """ + Generates a unique node id for a new node in a given graph. + The optional string prefix can be specified. + """ + # TODO thread safety? 
+ self.unique_id_count = max(self.unique_id_count, self.number_of_nodes()) + 1 + if prefix and not self.has_node(prefix): + return str(prefix) + while self.has_node(prefix + str(self.unique_id_count)): + self.unique_id_count += 1 + return prefix + str(self.unique_id_count) + + def check_empty_graph(self, description: str): + if len(self.nodes()) <= 1: + raise Error( + "Graph contains {} node after executing {}. This is considered an error because the resulting IR " + "will be empty, which is not expected".format(len(self.nodes()), description)) + + def check_shapes_consistency(self): + data_nodes = self.get_data_nodes() + data_nodes_with_wrong_shapes = [] + for data_node in data_nodes: + if not data_node.has('shape'): + data_nodes_with_wrong_shapes.append((data_node.name, "no shape attribute")) + continue + if data_node.shape is not None and not isinstance(data_node.shape, np.ndarray): + data_nodes_with_wrong_shapes.append((data_node.name, type(data_node.shape))) + if len(data_nodes_with_wrong_shapes) > 0: + raise Error("Graph contains data nodes ({}) with inconsistent shapes: {}".format( + len(data_nodes_with_wrong_shapes), + data_nodes_with_wrong_shapes + )) + + def check_nodes_ports_are_consecutive(self): + # Check that all operation nodes have consecutive port indexes + op_nodes = self.get_op_nodes() + for node in op_nodes: + for idx in range(len(node.in_ports())): + if idx not in node.in_ports(): + raise Error("Node {} does not have consecutive in port indexes: {}".format(node.name, + list(node.in_ports().keys()))) + for idx in range(len(node.out_ports())): + if idx not in node.out_ports(): + raise Error("Node {} does not have consecutive out port indexes: {}".format(node.name, + list(node.out_ports().keys()))) + + def dump_graph_for_graphviz(self, node_attrs: list = ['kind', 'op', 'shape'], + edge_attrs: list = ['in', 'out'], + nodes_to_dump: list = None, save_to_svg=False): + log.debug("---- GRAPHVIZ OUTPUT STARTS ----") + if nodes_to_dump is None: + nodes_to_dump = self.nodes() + string = '\ndigraph {\n' + visited_nodes = set() + for src_node_name, dst_node_name, attrs in self.edges(data=True): + visited_nodes.add(src_node_name) + visited_nodes.add(dst_node_name) + if src_node_name not in nodes_to_dump or dst_node_name not in nodes_to_dump: + continue + src_node = self.node[src_node_name] + dst_node = self.node[dst_node_name] + src_node_string = str(src_node_name) + '\\n' + '\\n'.join( + [str(key) + '=' + str(src_node.get(key, 'None')) for key in node_attrs if key in src_node]) + dst_node_string = str(dst_node_name) + '\\n' + '\\n'.join( + [str(key) + '=' + str(dst_node.get(key, 'None')) for key in node_attrs if key in dst_node]) + edge_string = ' '.join([str(key) + '=' + str(attrs.get(key, 'None')) for key in edge_attrs if key in attrs]) + string += '"{}" -> "{}" [label = "{}"];\n'.format(src_node_string, dst_node_string, edge_string) + for node in nodes_to_dump: + if node not in visited_nodes: + string += '"{}"'.format(node) # TODO: add attributes like it was done in the loop above + visited_nodes.add(node) + string += '}' + log.debug(string) + log.debug("---- GRAPHVIZ OUTPUT ENDS ----") + + if save_to_svg: + try: + import graphviz + import os + file_name = "{}_{}.txt".format(self.name.replace('/', '_'), 0) + id = 1 + while os.path.exists(file_name): + file_name = "{}_{}.txt".format(self.name.replace('/', '_'), id) + id += 1 + with open(file_name, "w") as f: + f.write(string) + graphviz.render('dot', 'svg', file_name) + print('Graph was saved to {}.{}'.format(file_name, 'svg')) + except ImportError: +
raise ImportError('Can\'t import graphviz') + except Exception as e: + raise Error('Can\'t save graph to svg') from e + + return string + + def print_graph_stat(self): + log.debug('Number of nodes in graph: {}'.format(self.number_of_nodes())) + log.debug('Number of edges in graph: {}'.format(len(list(self.edges())))) + ops = collections.defaultdict(int) + for _node in self.nodes(): + node = Node(self, _node) + kind = node.kind if node.has('kind') else '' + if node.has('op'): + ops['op/' + node.op] += 1 + else: + ops[kind] += 1 + if node.has('shape') and np.any(node.shape == 0): + log.error("Found bad shape: '{}' for node '{}'".format(node.shape, node.node)) + for k, v in ops.items(): + log.debug(' {} : {}'.format(k, v)) + + def create_sub_graph_copy(self, nodes_to_extract: list): + """ + Create new graph which is a sub-graph of the 'graph' that contains just nodes from 'nodes_to_extract' list. The + returned sub-graph is a deep copy of the provided graph nodes. + :param graph: graph to create a sub-graph from. + :param nodes_to_extract: list of node names to extract. + :return: new graph. + """ + return self.subgraph(nodes_to_extract).copy() + + def create_edge(self, src_node: Node, dst_node: Node, out_port: int = 0, in_port: int = 0, edge_attrs: dict = None): + """ + Creates edge from node 'src_node' from output with index 'out_port' to node 'dst_node' with input index 'in_port'. + :param src_node: node to create edge from. + :param dst_node: node to create edge to. + :param out_port: the index of output tensor of the 'src_node'. + :param in_port: the input index of the node 'dst_node'. + :param edge_attrs: dictionary with edge attrs. + :return: None + """ + # edges must belong to the same graph + assert src_node.graph is dst_node.graph + graph = src_node.graph + + if edge_attrs is None: + edge_attrs = dict() + else: + edge_attrs = edge_attrs.copy() + edge_attrs.update( + {'in': in_port, 'out': out_port, 'in_attrs': ['in', 'permutation'], 'out_attrs': ['out', 'permutation'], + 'data_attrs': ['fw_tensor_debug_info']}) + + # TODO: in case if in_port do not exists, we should raise an Exception here + graph.add_edges_from([(src_node.id, dst_node.id, edge_attrs)]) + + +def create_graph_with_nodes(src_nodes, get_id: callable, get_attrs: callable): + """ + Go over all nodes in src_nodes that should be enumerable and create new NX nodes + using get_id and get_attrs functions to create node id and node attributes correspondingly. + """ + graph = Graph() + for node in src_nodes: + graph.add_node(get_id(node), **get_attrs(node)) + return graph + + +def dict_includes_compare_attrs(attr, attr_probe): + if callable(attr_probe) and not isinstance(attr_probe, type): + return attr_probe(attr) + else: + return attr == attr_probe + + +def dict_includes(big: dict, sub_dict: dict, skip_attr_names=[]): + """ Searches attributes from sub_dict in big and ensures that all values match. + + Entries in sub_dict can be of two types: callable or not callable. If callable is specified + it is treated as probing function for attribute value from big dictionary by callable(attr) expression. + If it is not callable, the values are compared with == operator. + """ + return all( + dict_includes_compare_attrs(big.get(attr, None), sub_dict[attr]) + for attr in sub_dict.keys() if attr not in skip_attr_names + ) + + +def add_opoutput(graph: Graph, node_name: str, port: int, cut: bool = True): + """ + Creates and connects OpOutput node to node_name port. Cuts existing port if requested. 
+ :param graph: graph to operate with + :param node_name: name of existing node in the graph that we want to add OpOutput to + :param port: output port of node to connect OpOutput to + :param cut: determines way of operating with edge specified by node_name and port + """ + # we import it here because Op imports add_attrs_props and update_ie_fields from this file + from mo.ops.output import Output + node = Node(graph, node_name) + if cut and len(node.out_edges()) != 0: + opoutput_node = Output(graph).create_node_on_port(node, port, {'name': node_name + '/sink_port_' + str(port)}) + else: + opoutput_node = Output(graph).create_node([(node, port)], {'name': node_name + '/sink_port_' + str(port)}) + opoutput_node.in_edge()['data_attrs'] = ['fw_tensor_debug_info'] + opoutput_node.in_edge()['fw_tensor_debug_info'] = [(node_name, port)] + log.debug('Sink: {} for node {}'.format(opoutput_node.id, node_name)) + log.debug(str(graph.node[opoutput_node.id])) + log.debug("Add edge from {} to {}".format(node_name, opoutput_node.id)) + return opoutput_node.id + + +# TODO implement merging for keys with dictionary values? +def merge_edge_props(attrs: dict, additional_attrs: dict): + """ + Update edge attributes without changing 'in' and 'out' keys. + It is necessary to copy edge attributes during merging of nodes when + result of one subgraph call is passed as input to another subgraph call + """ + result = attrs + for (key, value) in additional_attrs.items(): + if key not in ['in', 'out']: + if type(additional_attrs[key]) is list: + if key not in result: + result[key] = [] + result[key].extend(additional_attrs[key]) + result[key] = list(set(result[key])) # silly solution to find unique elements + else: + result[key] = value + return result + + +# All functions below are deprecated and will be removed in next release +# Please, use methods from Graph/Node classes instead + + +@deprecated_api(Graph) +def get_node_id_by_name(graph: Graph, name: str): + return graph.get_node_id_by_name(name=name) + + +@deprecated_api(Graph) +def print_graph_stat(graph: Graph): + return graph.print_graph_stat() + + +@deprecated_api(Graph) +def get_inputs_with_ports(graph: Graph, match, pattern_edges, input_names_in_pattern): + """ + Front replacements of multi-input nodes should specify output port to add_node-like functions + This function is a helper to get such information out of matched nodes + :param graph: graph to operate on + :param match: dictionary returned by matching function + :param pattern_edges: edges that are specified in pattern + :param input_names_in_pattern: names of matched nodes as they were specified in pattern that should be in + resulting list + :return: list of tuples of node and output port + """ + return graph.get_inputs_with_ports(match=match, + pattern_edges=pattern_edges, + input_names_in_pattern=input_names_in_pattern) + + +@deprecated_api(Graph) +def dump_graph_for_graphviz(graph: Graph, node_attrs: list = ['kind', 'op', 'shape'], + edge_attrs: list = ['in', 'out'], + nodes_to_dump: list = None, save_to_svg=False): + return graph.dump_graph_for_graphviz(node_attrs=node_attrs, + edge_attrs=edge_attrs, + nodes_to_dump=nodes_to_dump, + save_to_svg=save_to_svg) + + +@deprecated_api(Graph) +def create_sub_graph_copy(graph: Graph, nodes_to_extract: list): + """ + Create new graph which is a sub-graph of the 'graph' that contains just nodes from 'nodes_to_extract' list. The + returned sub-graph is a deep copy of the provided graph nodes. + :param graph: graph to create a sub-graph from. 
+ :param nodes_to_extract: list of node names to extract. + :return: new graph. + """ + return graph.create_sub_graph_copy(nodes_to_extract=nodes_to_extract) -def get_sorted_outputs(node: Node, control_flow: bool=False): - return sorted([x for x in get_outputs(node.graph, node.node, control_flow=control_flow) if 'out' in x[1]], key=lambda x: x[1]['out']) +@deprecated_api(Graph) +def get_graph_ops(graph: Graph): + return graph.get_op_nodes() +@deprecated_api(Graph) +def check_empty_graph(graph: Graph, description: str): + return graph.check_empty_graph(description=description) + + +@deprecated_api(Graph) def create_edge(src_node: Node, dst_node: Node, out_port: int = 0, in_port: int = 0, edge_attrs: dict = None): """ Creates edge from node 'src_node' from output with index 'out_port' to node 'dst_node' with input index 'in_port'. @@ -492,20 +860,35 @@ def create_edge(src_node: Node, dst_node: Node, out_port: int = 0, in_port: int :param edge_attrs: dictionary with edge attrs. :return: None """ - # edges must belong to the same graph assert src_node.graph is dst_node.graph graph = src_node.graph + return graph.create_edge(src_node=src_node, dst_node=dst_node, out_port=out_port, in_port=in_port, + edge_attrs=edge_attrs) - if edge_attrs is None: - edge_attrs = dict() - else: - edge_attrs = edge_attrs.copy() - edge_attrs.update({'in': in_port, 'out': out_port, 'in_attrs': ['in', 'permutation'], 'out_attrs': ['out', 'permutation'], - 'data_attrs': ['fw_tensor_debug_info']}) - graph.add_edges_from([(src_node.id, dst_node.id, edge_attrs)]) +@deprecated_api(Graph) +def erase_node(node: Node): + """ + Erases node from the graph and reconnect edges from input node(s) to output node(s) + Produces assertion error if the node being removed has multiple inputs or outputs. + The function can be used in the front phase only (when there are no data nodes in the graph). + :param node: Node to erase + """ + graph = node.graph + return graph.erase_node(node) + + +@deprecated_api(Node) +def get_sorted_inputs(node: Node, control_flow: bool = False): + return node.get_sorted_inputs(control_flow=control_flow) +@deprecated_api(Node) +def get_sorted_outputs(node: Node, control_flow: bool = False): + return node.get_sorted_outputs(control_flow=control_flow) + + +@deprecated_api(Node) def insert_node_after(node: Node, new_node: Node, node_out_port: int = 0): """ Insert node 'new_node' after output with index 'node_out_port' of the node 'node'. All consumers of node 'node' @@ -516,67 +899,10 @@ def insert_node_after(node: Node, new_node: Node, node_out_port: int = 0): :param node_out_port: the output index for the node 'node' to insert :return: None """ - assert node.graph is new_node.graph - assert (len([name for name in node.graph.nodes() if Node(node.graph, name).soft_get('kind') == 'data']) == 0) - - graph = node.graph - old_edges = list(graph.out_edges(node.id, data=True, keys=True)) - # create new edges first and then remove all old edges. This is needed for case when 'node' has several consumers - # getting input from 'node_out_port'. 
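Editor's aside (illustrative, not code from the patch): every free function deprecated in this refactoring maps one-to-one onto a Graph or Node method, so old call sites keep working through the @deprecated_api shims while new code targets the methods directly, e.g.:

    # deprecated module-level spelling       replacement after this patch
    # erase_node(Node(graph, 'noop'))    ->  graph.erase_node(Node(graph, 'noop'))
    # create_edge(src_node, dst_node)    ->  graph.create_edge(src_node, dst_node)
    # get_sorted_outputs(node)           ->  node.get_sorted_outputs()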
- # save tuple ("name of the destination edge", "edge key") to be removed - node_name_and_edge_key = [] - for _, dst_name, edge_key, edge_attrs in old_edges: - if edge_attrs['out'] == node_out_port: - log.debug('Create edge from "{}" to "{}"'.format(new_node.name, dst_name)) - create_edge(new_node, Node(graph, dst_name), 0, edge_attrs['in']) - node_name_and_edge_key.append((dst_name, edge_key)) - for dst_name, edge_key in node_name_and_edge_key: - log.debug('Remove edge from "{}" to "{}"'.format(node.id, dst_name)) - graph.remove_edge(node.id, dst_name, edge_key) - create_edge(node, new_node, node_out_port, 0, {}) - - -def erase_node(node: Node): - """ - Erases node from the graph and reconnect edges from input node(s) to output node(s) - Produces assertion error if the node being removed has multiple inputs or outputs. - The function can be used in the front phase only (when there are no data nodes in the graph). - :param node: Node to erase - """ - graph = node.graph - node_id = node.id - - inputs = list(graph.in_edges(node_id, data=True)) - outputs = list(graph.out_edges(node_id, data=True)) - - assert node.kind == 'op' and (len(node.out_nodes()) == 0 or list(node.out_nodes().values())[0].kind != 'data'), \ - "The function must be used before the partial infer when graph doesn't contain data nodes." - assert len(node.out_nodes()) <= 1, "The node {} must produce just one output tensor".format(node.soft_get('name')) - assert len(inputs) <= 1, "The node {} must have just one input".format(node.soft_get('name')) - - if len(outputs) == 0 and len(inputs) != 0: - for input_node_id, _, __ in inputs: - if node.has_and_set('is_output'): - if graph.node[input_node_id]['kind'] == 'op': - data_nodes = [u for u, v in graph.in_edges(input_node_id)] - for data in data_nodes: - graph.node[data]['is_output'] = graph.node[node_id]['is_output'] - else: - graph.node[input_node_id]['is_output'] = graph.node[node_id]['is_output'] - - if len(outputs) == 0 or len(inputs) == 0: - graph.remove_node(node_id) - return - - input_node_id = inputs[0][0] - for src, dst, attrs in outputs: - graph.remove_edge(src, dst) - # update the 'out' attribute of the edge from the node being removed - attrs['out'] = inputs[0][2]['out'] - graph.add_edge(input_node_id, dst, **attrs) - graph.remove_node(node_id) + return node.insert_node_after(new_node=new_node, node_out_port=node_out_port) +@deprecated_api(Node) def replace_node(old_node: Node, new_node: Node, new_node_out_port: int=None): """ Replaces node 'old_node' with a node 'new_node' preserving edge attributes. @@ -584,40 +910,20 @@ def replace_node(old_node: Node, new_node: Node, new_node_out_port: int=None): :param new_node: node to replace with. 
:return: None """ - assert old_node.graph is new_node.graph - graph = old_node.graph - # save output edges and reconnect them to new node - for _, dst_node_name, edge_attrs in graph.out_edges(old_node.id, data=True): - new_edge_attrs = deepcopy(edge_attrs) - if new_node_out_port is not None: - assert 'out' not in edge_attrs or edge_attrs['out'] == 0, \ - 'replace_node function can replace old node with a single output port only if new_node_out_port is ' \ - 'specified' - new_edge_attrs.update({'out': new_node_out_port}) - graph.add_edge(new_node.id, dst_node_name, **new_edge_attrs) - - # if the node for replace is output node then we propagate this attribute to a new node - if old_node.has_valid('is_output') and old_node.is_output: - old_node.is_output = False - new_node['is_output'] = True - graph.remove_node(old_node.id) - - -def check_empty_graph(graph: nx.MultiDiGraph, description: str): - if len(graph.nodes()) <= 1: - raise Error("Graph contains {} node after executing {}. It considered as error because resulting IR will be " - "empty which is not usual".format(len(graph.nodes()), description)) - - -def copy_node(src_node: Node, new_attrs: dict=None, dst_graph: nx.MultiDiGraph=None): - ''' Copies node with all attributes (optionally updated) within the same graph or to different graph.''' - if new_attrs is None: - new_attrs = {} - if dst_graph is None: - dst_graph = src_node.graph - - attrs = deepcopy(src_node.attrs()) - attrs.update(new_attrs) - new_id = unique_id(dst_graph) - dst_graph.add_node(new_id, attrs) - return Node(dst_graph, new_id) + return old_node.replace_node(new_node=new_node, new_node_out_port=new_node_out_port) + + +@deprecated_api(Node) +def copy_node(src_node: Node, new_attrs: dict=None, dst_graph: nx.MultiDiGraph = None): + """ Copies node with all attributes (optionally updated) within the same graph or to different graph.""" + return src_node.copy_node(new_attrs=new_attrs, dst_graph=dst_graph) + + +@deprecated_api(Node) +def get_inputs(graph: Graph, node: str, edge_attr: dict = None, control_flow: bool = False): + return Node(graph, node).get_inputs(edge_attr=edge_attr, control_flow=control_flow) + + +@deprecated_api(Node) +def get_outputs(graph: Graph, node: str, edge_attr: dict = None, control_flow: bool = False): + return Node(graph, node).get_outputs(edge_attr=edge_attr, control_flow=control_flow) diff --git a/model-optimizer/mo/graph/graph_test.py b/model-optimizer/mo/graph/graph_test.py index 6b5d990..91131a6 100644 --- a/model-optimizer/mo/graph/graph_test.py +++ b/model-optimizer/mo/graph/graph_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,11 +16,16 @@ import unittest -from mo.graph.graph import erase_node, get_node_id_by_name, Node, replace_node, get_inputs_with_ports +import numpy as np + +from generator import generator, generate + +from mo.graph.graph import Node, Graph, add_opoutput from mo.ops.const import Const from mo.utils.error import Error from mo.utils.unittest.graph import build_graph, compare_graphs + nodes = { '0': {'name': 'input1', 'type': 'Identity', 'value': None, 'kind': 'op', 'op': 'Placeholder'}, '1': {'name': 'input2', 'type': 'Identity', 'value': None, 'kind': 'op', 'op': 'Placeholder'}, @@ -28,8 +33,7 @@ nodes = { '3': {'name': 'node_2', 'type': 'Identity', 'value': None, 'kind': 'op', 'op': 'NotPlaceholder'}, '4': {'name': 'node_3', 'type': 'Identity', 'value': None, 'kind': 'op', 'op': 'NotPlaceholder'}, '5': {'name': 'node_4', 'type': 'Identity', 'value': None, 'kind': 'op', 'op': 'NotPlaceholder'}, - '6': {'name': 'output', 'type': 'Identity', 'value': None, 'kind': 'op', 'op': 'OpOutput', - 'is_output': True}, + '6': {'name': 'output', 'value': None, 'kind': 'op', 'op': 'OpOutput'}, 'input_3': {'name': 'input_3', 'type': 'Identity', 'value': None, 'kind': 'op', 'op': 'Placeholder'} } edges = { @@ -47,31 +51,31 @@ class TestGetNodeById(unittest.TestCase): self.graph = build_graph(nodes, edges) def test_get_node_id_by_name(self): - self.assertEqual(get_node_id_by_name(self.graph, 'input1'), '0') + self.assertEqual(self.graph.get_node_id_by_name('input1'), '0') def test_get_node_id_by_name_1(self): - self.assertEqual(get_node_id_by_name(self.graph, 'input2'), '1') + self.assertEqual(self.graph.get_node_id_by_name('input2'), '1') def test_get_node_id_by_name_2(self): - self.assertEqual(get_node_id_by_name(self.graph, 'node_1'), '2') + self.assertEqual(self.graph.get_node_id_by_name('node_1'), '2') def test_get_node_id_by_name_3(self): - self.assertEqual(get_node_id_by_name(self.graph, 'node_2'), '3') + self.assertEqual(self.graph.get_node_id_by_name('node_2'), '3') def test_get_node_id_by_name_4(self): - self.assertEqual(get_node_id_by_name(self.graph, 'node_3'), '4') + self.assertEqual(self.graph.get_node_id_by_name('node_3'), '4') def test_get_node_id_by_name_5(self): - self.assertEqual(get_node_id_by_name(self.graph, 'node_4'), '5') + self.assertEqual(self.graph.get_node_id_by_name('node_4'), '5') def test_get_node_id_by_name_6(self): - self.assertEqual(get_node_id_by_name(self.graph, 'output'), '6') + self.assertEqual(self.graph.get_node_id_by_name('output'), '6') def test_get_node_id_by_name_7(self): - self.assertEqual(get_node_id_by_name(self.graph, 'input_3'), 'input_3') + self.assertEqual(self.graph.get_node_id_by_name('input_3'), 'input_3') def test_get_node_id_by_name_8(self): - self.assertRaises(Error, get_node_id_by_name, self.graph, '1') + self.assertRaises(Error, self.graph.get_node_id_by_name, '1') class TestEraseNode(unittest.TestCase): @@ -89,7 +93,7 @@ class TestEraseNode(unittest.TestCase): self.assertEqual(len(graph.edges()), 2) self.assertListEqual(list(graph.out_edges('input')), [('input', 'noop')]) - erase_node(Node(graph, 'noop')) + graph.erase_node(Node(graph, 'noop')) self.assertEqual(len(graph.nodes()), 2) self.assertEqual(len(graph.edges()), 1) @@ -121,7 +125,7 @@ class TestEraseNode(unittest.TestCase): ('input', 'output_3', {'in': 10, 'out': 0})], nodes_with_edges_only=True) - erase_node(Node(graph, 'noop')) + graph.erase_node(Node(graph, 'noop')) compare_graphs(graph, ref_graph, 'output_1') @@ -151,7 +155,7 @@ class TestEraseNode(unittest.TestCase): ('input', 
'output_3', {'in': 10, 'out': 0})], nodes_with_edges_only=True) - erase_node(Node(graph, 'noop')) + graph.erase_node(Node(graph, 'noop')) compare_graphs(graph, ref_graph, 'output_1') @@ -169,7 +173,7 @@ class TestEraseNode(unittest.TestCase): ('noop', 'output_2', {'in': 2, 'out': 1}), ('noop', 'output_3', {'in': 10, 'out': 0})]) - self.assertRaises(AssertionError, erase_node, Node(graph, 'noop')) + self.assertRaises(AssertionError, graph.erase_node, Node(graph, 'noop')) def test_remove_noop_nodes_front(self): graph = build_graph( @@ -184,7 +188,7 @@ class TestEraseNode(unittest.TestCase): self.assertEqual(len(graph.edges()), 1) self.assertListEqual(list(graph.out_edges('noop')), [('noop', 'output')]) - erase_node(Node(graph, 'noop')) + graph.erase_node(Node(graph, 'noop')) self.assertEqual(len(graph.nodes()), 1) self.assertEqual(len(graph.edges()), 0) @@ -203,21 +207,20 @@ class TestEraseNode(unittest.TestCase): self.assertEqual(len(graph.edges()), 1) self.assertListEqual(list(graph.in_edges('noop')), [('input', 'noop')]) - erase_node(Node(graph, 'noop')) + graph.erase_node(Node(graph, 'noop')) self.assertEqual(len(graph.nodes()), 1) self.assertEqual(len(graph.edges()), 0) self.assertEqual(len(graph.in_edges('input')), 0) def test_remove_noop_nodes_noop_only(self): - import networkx as nx - graph = nx.MultiDiGraph() + graph = Graph() graph.add_node('noop', **{'type': 'NoOp', 'value': None, 'kind': 'op'}) self.assertEqual(len(graph.nodes()), 1) self.assertEqual(len(graph.edges()), 0) - erase_node(Node(graph, 'noop')) + graph.erase_node(Node(graph, 'noop')) self.assertEqual(len(graph.nodes()), 0) self.assertEqual(len(graph.edges()), 0) @@ -239,7 +242,7 @@ class TestEraseNode(unittest.TestCase): ('noop', 'output_1'), ('noop', 'output_2'), ('noop', 'output_3')]) - self.assertRaises(AssertionError, erase_node, Node(graph, 'noop')) + self.assertRaises(AssertionError, graph.erase_node, Node(graph, 'noop')) class TestReplaceNode(unittest.TestCase): @@ -248,20 +251,22 @@ class TestReplaceNode(unittest.TestCase): { 'input_1': {'type': 'Placeholder', 'value': None, 'kind': 'op'}, 'input_2': {'type': 'Placeholder', 'value': None, 'kind': 'op'}, - 'old': {'type': 'Identity', 'value': None, 'kind': 'op', 'is_output': True}, - 'output': {'type': 'OpOutput', 'value': None, 'kind': 'op'}, + 'old': {'type': 'Identity', 'value': None, 'kind': 'op'}, + 'output': {'op': 'OpOutput', 'value': None, 'kind': 'op'}, }, [('input_1', 'old'), ('input_2', 'old'), ('old', 'output')]) new_node = Const(graph, {'name': 'new'}).create_node([Node(graph, 'input_1'), Node(graph, 'input_2')]) - replace_node(Node(graph, 'old'), new_node) + + old_node = Node(graph, 'old') + old_node.replace_node(new_node) self.assertEqual(len(graph.nodes()), 4) self.assertEqual(len(graph.edges()), 3) - self.assertEqual(new_node['is_output'], True) - self.assertListEqual(list(graph.out_edges('new')), [('new', 'output')]) + self.assertEqual(new_node.out_node().op, 'OpOutput') + self.assertEqual(len(graph.out_edges('new')), 1) def test_replace_node_several_consumers(self): graph = build_graph( @@ -281,7 +286,7 @@ class TestReplaceNode(unittest.TestCase): ]) new_node = Const(graph, {'name': 'new'}).create_node([Node(graph, 'input_1'), Node(graph, 'input_2')]) - replace_node(Node(graph, 'old'), new_node) + Node(graph, 'old').replace_node(new_node) self.assertEqual(len(graph.nodes()), 6) self.assertEqual(len(graph.edges()), 5) @@ -319,6 +324,1154 @@ class GetNodesWithPorts(unittest.TestCase): } input_names_in_pattern = ['one', 'three'] - result = 
get_inputs_with_ports(graph=graph, match=match, pattern_edges=edges, + result = graph.get_inputs_with_ports(match=match, pattern_edges=edges, input_names_in_pattern=input_names_in_pattern) self.assertListEqual([(match['one'], 0), (match['three'], 0)], result) + + +class TestGraphShapeChecker(unittest.TestCase): + nodes = { + '0': {'type': 'Placeholder', 'value': None, 'kind': 'op', 'op': 'Placeholder'}, + '0_data': {'value': None, 'shape': None, 'kind': 'data'}, + + '1': {'type': 'Placeholder', 'value': None, 'kind': 'op', 'op': 'Placeholder'}, + '1_data': {'value': None, 'shape': None, 'kind': 'data'}, + + '2': {'type': 'Placeholder', 'value': None, 'kind': 'op', 'op': 'Placeholder'}, + '2_data': {'value': None, 'shape': None, 'kind': 'data'}, + } + + def test_check_shape_consistency_1(self): + # No shape attr in data node + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + + ('0_data', '1'), + ('0_data', '2') + ]) + + del graph.node['2_data']['shape'] + + with self.assertRaisesRegex(Error, "Graph contains data nodes \(1\) with inconsistent shapes:.*"): + graph.check_shapes_consistency() + + def test_check_shape_consistency_2(self): + # No shape attr in data node + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + + ('0_data', '1'), + ('0_data', '2') + ]) + + graph.node['1_data']['shape'] = (1, 2, 3) + graph.node['2_data']['shape'] = (1, 2, 3) + + with self.assertRaisesRegex(Error, "Graph contains data nodes \(2\) with inconsistent shapes:.*"): + graph.check_shapes_consistency() + + +@generator +class TestGraphPortsChecker(unittest.TestCase): + nodes = { + '0': {'type': 'Placeholder', 'value': None, 'kind': 'op', 'op': 'Placeholder'}, + '0_data': {'value': None, 'shape': None, 'kind': 'data'}, + + '1': {'type': 'Placeholder', 'value': None, 'kind': 'op', 'op': 'Placeholder'}, + '1_data': {'value': None, 'shape': None, 'kind': 'data'}, + + '2': {'type': 'Placeholder', 'value': None, 'kind': 'op', 'op': 'Placeholder'}, + '2_data': {'value': None, 'shape': None, 'kind': 'data'}, + + '3': {'type': 'Placeholder', 'value': None, 'kind': 'op', 'op': 'Placeholder'}, + '3_data': {'value': None, 'shape': None, 'kind': 'data'}, + } + + @generate(*[('0', 'in', 1), ('0', 'out', 2), ('1', 'in', 2), ('3', 'out', 2)]) + def test_check_shape_consistency_1(self, node_id: str, port_type: str, port_idx: int): + # + # ,->2-->2_data---,->3-->3_data + # 0-->0_data-/-->1-->1_data--/ + # + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + ('3', '3_data'), + + ('0_data', '1'), + ('0_data', '2'), + ('1_data', '3'), + ('2_data', '3'), + ]) + + node = Node(graph, node_id) + if port_type == 'in': + node.add_input_port(idx=port_idx) + else: + node.add_output_port(idx=port_idx) + + with self.assertRaisesRegex(Error, "Node {} has not consecutive {} ports indexes:.*".format(node_id, + port_type)): + graph.check_nodes_ports_are_consecutive() + + +class TestNewGraphAPIMiddle(unittest.TestCase): + + nodes = { + '0': {'type': 'Placeholder', 'value': None, 'kind': 'op', 'op': 'Placeholder'}, + '0_data': {'value': None, 'shape': None, 'kind': 'data'}, + + '1': {'type': 'Placeholder', 'value': None, 'kind': 'op', 'op': 'Placeholder'}, + '1_data': {'value': None, 'shape': None, 'kind': 'data'}, + + '2': {'type': 'Placeholder', 'value': None, 'kind': 'op', 'op': 'Placeholder'}, + '2_data': {'value': None, 'shape': None, 'kind': 'data'}, + + '3': {'type': 'Placeholder', 'value': None, 'kind': 'op', 'op': 
'Placeholder'}, + '3_data': {'value': None, 'shape': None, 'kind': 'data'}, + + '4': {'type': 'Placeholder', 'value': None, 'kind': 'op', 'op': 'Placeholder'}, + '4_data': {'value': None, 'shape': None, 'kind': 'data'}, + + 'const_1': {'type': 'Const', 'value': None, 'kind': 'op', 'op': 'Const'}, + 'const_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + } + + ########################################### + ###### TESTS FOR PORT CLASS METHODS ####### + ########################################### + + def test_port_get_destinations_1(self): + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + + ('0_data', '1'), + ('0_data', '2') + ]) + graph.__setattr__('stage', 'middle') + + node_0_out_port = Node(graph, '0').out_port(0) + + node_1_in_port = Node(graph, '1').in_port(0) + node_2_in_port = Node(graph, '2').in_port(0) + + ports = node_0_out_port.get_destinations() + + self.assertTrue(len(ports) == 2) + for port in ports: + self.assertTrue(port in [node_1_in_port, node_2_in_port]) + + def test_port_get_destination_1(self): + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + + ('0_data', '1'), + ('0_data', '2') + ]) + graph.__setattr__('stage', 'middle') + + node_0_out_port = Node(graph, '0').out_port(0) + + node_1_in_port = Node(graph, '1').in_port(0) + node_2_in_port = Node(graph, '2').in_port(0) + + with self.assertRaises(Error): + node_0_out_port.get_destination() + + def test_port_get_destination_2(self): + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('0_data', '1'), + ]) + graph.__setattr__('stage', 'middle') + + node_0_out_port = Node(graph, '0').out_port(0) + + node_1_in_port = Node(graph, '1').in_port(0) + + self.assertEqual(node_0_out_port.get_destination(), node_1_in_port) + + def test_port_get_source_1(self): + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('0_data', '1'), + ]) + graph.__setattr__('stage', 'middle') + + node_0_out_port = Node(graph, '0').out_port(0) + + node_1_in_port = Node(graph, '1').in_port(0) + + self.assertEqual(node_1_in_port.get_source(), node_0_out_port) + + def test_port_get_source_2(self): + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + ('0_data', '1'), + ('2_data', '1') + ]) + graph.__setattr__('stage', 'middle') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + + self.assertEqual(node_1.in_port(0).get_source(), node_0.out_port(0)) + self.assertEqual(node_1.in_port(1).get_source(), node_2.out_port(0)) + + def test_port_get_source_3(self): + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + ]) + graph.__setattr__('stage', 'middle') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + + node_0.add_input_port(0) + node_1.add_input_port(0) + node_2.add_input_port(0) + + self.assertEqual(node_0.in_port(0).get_source(), None) + self.assertEqual(node_1.in_port(0).get_source(), None) + self.assertEqual(node_2.in_port(0).get_source(), None) + + def test_port_disconnect_1(self): + # ,-->1-->1_data 0-->0_data + # 0-->0_data/--->2-->2_data ==> 0-->0_data 1-->1_data + # 2-->2_data + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + ('0_data', '1'), + ('0_data', '2') + ]) + graph.__setattr__('stage', 'middle') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + + 
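+        # disconnect() on an output port unhooks every consumer: in the middle
+        # phase it removes the edges from the port's data node ('0_data') to
+        # nodes '1' and '2', so both in-port sources below become None.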
node_0.out_port(0).disconnect() + + self.assertEqual(node_1.in_port(0).get_source(), None) + self.assertEqual(node_2.in_port(0).get_source(), None) + + self.assertTrue(len(node_1.in_nodes()) == 0) + self.assertTrue(len(node_2.in_nodes()) == 0) + + def test_port_disconnect_2(self): + # ,-->1-->1_data ,-->1-->1_data + # 0-->0_data/--->2-->2_data ==> 0-->0_data/ 2-->2_data + # + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + ('0_data', '1'), + ('0_data', '2') + ]) + graph.__setattr__('stage', 'middle') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + + node_2.in_port(0).disconnect() + + self.assertEqual(node_0.out_port(0).get_destination(), node_1.in_port(0)) + self.assertEqual(node_1.in_port(0).get_source(), node_0.out_port(0)) + self.assertEqual(node_2.out_port(0).get_destination(), None) + self.assertEqual(node_2.in_port(0).get_source(), None) + + self.assertTrue(len(node_0.out_nodes()) == 1) + self.assertTrue(len(node_1.in_nodes()) == 1) + self.assertTrue(len(node_2.in_nodes()) == 0) + + def test_port_disconnect_3(self): + # 1-->1_data---\ 1-->1_data + # 0-->0_data---->2-->2_data ==> 0-->0_data-->2-->2_data + # + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + ('0_data', '2'), + ('1_data', '2') + ]) + graph.__setattr__('stage', 'middle') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + + node_2.in_port(1).disconnect() + + self.assertEqual(node_0.out_port(0).get_destination(), node_2.in_port(0)) + self.assertEqual(node_2.in_port(0).get_source(), node_0.out_port(0)) + self.assertEqual(node_1.out_port(0).get_destination(), None) + + self.assertTrue(len(node_0.out_nodes()) == 1) + self.assertTrue(len(node_1.in_nodes()) == 0) + self.assertTrue(len(node_2.in_nodes()) == 1) + + def test_port_disconnect_4(self): + # 1-->1_data---\ 0-->0_data + # 0-->0_data---->2-->2_data ==> 1-->1_data-->2-->2_data + # + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + ('0_data', '2'), + ('1_data', '2') + ]) + graph.__setattr__('stage', 'middle') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + + node_2.in_port(0).disconnect() + + self.assertEqual(node_1.out_port(0).get_destination(), node_2.in_port(1)) + self.assertEqual(node_2.in_port(1).get_source(), node_1.out_port(0)) + self.assertEqual(node_2.in_port(0).get_source(), None) + self.assertEqual(node_0.out_port(0).get_destination(), None) + # + # self.assertTrue(len(node_0.out_nodes()) == 1) + # self.assertTrue(len(node_1.in_nodes()) == 0) + # self.assertTrue(len(node_2.in_nodes()) == 1) + + ########################################### + ### TESTS FOR CONNECTION CLASS METHODS #### + ########################################### + + def test_connection_set_source_1(self): + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + ('3', '3_data'), + ('4', '4_data'), + + ('0_data', '1'), + ('0_data', '2'), + ('3_data', '4'), + ]) + graph.__setattr__('stage', 'middle') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + node_3 = Node(graph, '3') + node_4 = Node(graph, '4') + + c = node_0.out_port(0).get_connection() + c.set_source(node_3.out_port(0)) + + self.assertTrue(node_0.out_node().kind == 'data') + + self.assertEqual(node_0.out_port(0).get_destinations(), []) + destinations = node_3.out_port(0).get_destinations() + for port in destinations: + 
self.assertTrue(port in [node_1.in_port(0), node_2.in_port(0), node_4.in_port(0)]) + + def test_connection_set_source_2(self): + # 2-->2_data ,->2-->2_data + # 0-->0_data-->1-->1_data ==> 0-->0_data/-->1-->1_data + # + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + + ('0_data', '1'), + ]) + graph.__setattr__('stage', 'middle') + + node_0 = Node(graph, '0') + node_2 = Node(graph, '2') + node_2.add_input_port(0) + + node_2.in_port(0).get_connection().set_source(node_0.out_port(0)) + + graph_ref = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + + ('0_data', '1'), + ('0_data', '2'), + ]) + + (flag, resp) = compare_graphs(graph, graph_ref, '0', check_op_attrs=True) + self.assertTrue(flag, resp) + + def test_connection_set_source_3(self): + # ,->2-->2_data 0-->0_data-->1-->1_data + # 0-->0_data/-->1-->1_data => 3-->3_data-->2-->2_data + # 3-->3_data + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + ('3', '3_data'), + + ('0_data', '1'), + ('0_data', '2'), + ]) + graph.__setattr__('stage', 'middle') + + node_0 = Node(graph, '0') + node_2 = Node(graph, '2') + node_3 = Node(graph, '3') + + node_2.in_port(0).get_connection().set_source(node_3.out_port(0)) + + graph_ref = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + ('3', '3_data'), + + ('0_data', '1'), + ('3_data', '2'), + ]) + + (flag, resp) = compare_graphs(graph, graph_ref, '0', check_op_attrs=True) + self.assertTrue(flag, resp) + + (flag, resp) = compare_graphs(graph, graph_ref, '2', check_op_attrs=True) + self.assertTrue(flag, resp) + + def test_connection_set_source_4(self): + # 0 1 ==> 0-->1 + graph = build_graph(self.nodes, []) + graph.__setattr__('stage', 'middle') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + + node_0.add_output_port(0) + node_1.add_input_port(0) + + node_1.in_port(0).get_connection().set_source(node_0.out_port(0)) + + graph_ref = build_graph(self.nodes, [ + ('0', '0_data'), + ('0_data', '1'), + ]) + + (flag, resp) = compare_graphs(graph, graph_ref, '0', check_op_attrs=True) + self.assertTrue(flag, resp) + + def test_connection_set_destination(self): + # ,->2-->2_data-->3-->3_data ,->2-->2_data + # 0-->0_data/-->1-->1_data ==> 0-->0_data/-->3-->3_data + # + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + ('3', '3_data'), + + ('0_data', '1'), + ('0_data', '2'), + ('2_data', '3'), + ]) + graph.__setattr__('stage', 'middle') + + graph_ref = build_graph(self.nodes, [ + ('0', '0_data'), + ('2', '2_data'), + ('3', '3_data'), + + ('0_data', '3'), + ('0_data', '2'), + ]) + + node_1 = Node(graph, '1') + node_3 = Node(graph, '3') + + node_3.in_port(0).disconnect() + node_1.in_port(0).get_connection().set_destination(node_3.in_port(0)) + + (flag, resp) = compare_graphs(graph, graph_ref, '0', check_op_attrs=True) + self.assertTrue(flag, resp) + + def test_connection_add_destination_1(self): + # 3-->3_data ,-->3-->3_data + # ,->2-->2_data ,-->2-->2_data + # 0-->0_data/-->1-->1_data ==> 0-->0_data/-->1-->1_data + # + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + ('3', '3_data'), + + ('0_data', '1'), + ('0_data', '2'), + ]) + graph.__setattr__('stage', 'middle') + + graph_ref = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + ('3', '3_data'), + + ('0_data', '1'), + ('0_data', '2'), + ('0_data', '3'), + ]) + + node_0 = Node(graph, '0') 
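+        # add_destination() appends one more consumer to the existing
+        # connection, whereas set_destination() (tested above) moves the single
+        # destination; node_3 therefore needs an input port created explicitly
+        # below, because it starts out disconnected.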
+ node_3 = Node(graph, '3') + node_3.add_input_port(idx=0) + + node_0.out_port(0).get_connection().add_destination(node_3.in_port(0)) + + (flag, resp) = compare_graphs(graph, graph_ref, '0', check_op_attrs=True) + self.assertTrue(flag, resp) + + def test_connection_add_destination_2(self): + # 0 + # 1-->1_data ==> 0-->0_data-->1-->1_data + graph = build_graph(self.nodes, [ + ('1', '1_data'), + ]) + graph.__setattr__('stage', 'middle') + + graph_ref = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('0_data', '1'), + ]) + + node_0 = Node(graph, '0') + node_0.add_output_port(idx=0) + + node_1 = Node(graph, '1') + node_1.add_input_port(idx=0) + + node_0.out_port(0).get_connection().add_destination(node_1.in_port(0)) + + (flag, resp) = compare_graphs(graph, graph_ref, '0', check_op_attrs=True) + self.assertTrue(flag, resp) + + def test_connection_get_source_destinations_1(self): + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + + ('0_data', '1'), + ('0_data', '2') + ]) + graph.__setattr__('stage', 'middle') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + + source = node_0.out_port(0).get_connection().get_source() + destinations = node_0.out_port(0).get_connection().get_destinations() + + self.assertEqual(source, node_0.out_port(0)) + for port in destinations: + self.assertTrue(port in [node_1.in_port(0), node_2.in_port(0)]) + + self.assertEqual(node_1.out_port(0).get_connection().get_destination(), None) + self.assertEqual(node_1.out_port(0).get_destination(), None) + + self.assertEqual(node_2.out_port(0).get_connection().get_destination(), None) + self.assertEqual(node_2.out_port(0).get_destination(), None) + + def test_connection_remove_1(self): + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + + ('0_data', '1'), + ('0_data', '2') + ]) + graph.__setattr__('stage', 'middle') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + + node_1.in_port(0).get_connection().remove() + + self.assertEqual(node_0.out_port(0).get_destinations(), [node_2.in_port(0)]) + self.assertEqual(node_1.in_port(0).get_source(), None) + self.assertEqual(node_2.in_port(0).get_source(), node_0.out_port(0)) + + def test_connection_remove_2(self): + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + + ('0_data', '1'), + ('0_data', '2') + ]) + graph.__setattr__('stage', 'middle') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + + node_0.out_port(0).get_connection().remove() + + self.assertEqual(node_0.out_port(0).get_destinations(), []) + self.assertEqual(node_1.out_port(0).get_destinations(), []) + self.assertEqual(node_2.out_port(0).get_destinations(), []) + + def test_connection_data_1(self): + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + + ('0_data', '1'), + ('0_data', '2') + ], {'0_data': {'value': np.ones((1,3,64,64)), 'shape': np.array([1, 3, 64, 64])}}) + + graph.__setattr__('stage', 'middle') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + + self.assertTrue(np.array_equal(node_0.out_port(0).get_connection().data.get_shape(), (1, 3, 64, 64))) + self.assertTrue(np.array_equal(node_0.out_port(0).get_connection().data.get_value(), np.ones((1, 3, 64, 64)))) + + self.assertEqual(node_1.out_port(0).get_connection().data.get_shape(), None) + 
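+        # Only '0_data' was given a value and a shape when the graph was
+        # built, so the data accessors on every other port report None.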
self.assertEqual(node_1.out_port(0).get_connection().data.get_value(), None) + + self.assertEqual(node_2.out_port(0).get_connection().data.get_shape(), None) + self.assertEqual(node_2.out_port(0).get_connection().data.get_value(), None) + + ########################################### + ################## OTHER ################## + ########################################### + + def test_graph_cleanup_that_restores_const_operations(self): + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + ('3', '3_data'), + + ('0_data', '1'), + ('2_data', '1'), + ('3_data', '2'), + ], { + '3': {'shape': np.array([1, 227, 227, 3]), 'value': np.ones((1, 227, 227, 3))}, + '3_data': {'shape': np.array([1, 227, 227, 3]), 'value': np.ones((1, 227, 227, 3))}, + '2': {'shape': np.array([1, 227, 227, 3]), 'value': np.ones((1, 227, 227, 3))}, + '2_data': {'shape': np.array([1, 227, 227, 3]), 'value': np.ones((1, 227, 227, 3))}, + }, nodes_with_edges_only=True) + add_opoutput(graph, '1_data', 0, False) + + graph_ref = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('const_1', '2_data'), + + ('0_data', '1'), + ('2_data', '1'), + ], { + 'const_1': {'shape': np.array([1, 227, 227, 3]), 'value': np.ones((1, 227, 227, 3))}, + '2_data': {'shape': np.array([1, 227, 227, 3]), 'value': np.ones((1, 227, 227, 3))}, + }, nodes_with_edges_only=True) + add_opoutput(graph_ref, '1_data', 0, False) + + from mo.middle.passes.eliminate import graph_clean_up + graph_clean_up(graph) + graph_clean_up(graph_ref) + + (flag, resp) = compare_graphs(graph, graph_ref, '1_data', check_op_attrs=True) + self.assertTrue(flag, resp) + + def test_node_in_out_ports_order(self): + # + # ,->2-->2_data---,->3-->3_data + # 0-->0_data-/-->1-->1_data--/ + # + graph = build_graph(self.nodes, [ + ('0', '0_data'), + ('1', '1_data'), + ('2', '2_data'), + ('3', '3_data'), + + ('0_data', '1'), + ('0_data', '2'), + ('1_data', '3'), + ('2_data', '3'), + ]) + + for id in ['0', '1', '2', '3']: + node = Node(graph, id) + for idx in range(len(node.in_ports())): + self.assertEqual(node.in_port(idx), node.in_ports()[idx]) + for idx in range(len(node.out_ports())): + self.assertEqual(node.out_port(idx), node.out_ports()[idx]) + + +class TestNewGraphAPIFront(unittest.TestCase): + nodes = { + '0': {'type': 'Placeholder', 'value': None, 'kind': 'op', 'op': 'Placeholder'}, + '1': {'type': 'Placeholder', 'value': None, 'kind': 'op', 'op': 'Placeholder'}, + '2': {'type': 'Placeholder', 'value': None, 'kind': 'op', 'op': 'Placeholder'}, + '3': {'type': 'Placeholder', 'value': None, 'kind': 'op', 'op': 'Placeholder'}, + '4': {'type': 'Placeholder', 'value': None, 'kind': 'op', 'op': 'Placeholder'}, + 'const_1': {'type': 'Const', 'value': None, 'kind': 'op', 'op': 'Const'}, + } + + ########################################### + ###### TESTS FOR PORT CLASS METHODS ####### + ########################################### + + def test_port_get_destinations_1(self): + # ,->2 + # /-->1 + # 0 + graph = build_graph(self.nodes, [ + ('0', '1'), + ('0', '2'), + ]) + graph.__setattr__('stage', 'front') + + node_0_out_port = Node(graph, '0').out_port(0) + + node_1_in_port = Node(graph, '1').in_port(0) + node_2_in_port = Node(graph, '2').in_port(0) + + ports = node_0_out_port.get_destinations() + + self.assertTrue(len(ports) == 2) + for port in ports: + self.assertTrue(port in [node_1_in_port, node_2_in_port]) + + def test_port_get_destination_1(self): + # ,->2 + # /-->1 + # 0 + graph = build_graph(self.nodes, [ + ('0', '1'), + ('0', 
'2'), + ]) + graph.__setattr__('stage', 'front') + + node_0_out_port = Node(graph, '0').out_port(0) + + node_1_in_port = Node(graph, '1').in_port(0) + node_2_in_port = Node(graph, '2').in_port(0) + + with self.assertRaises(Error): + node_0_out_port.get_destination() + + def test_port_get_destination_2(self): + graph = build_graph(self.nodes, [ + ('0', '1'), + ]) + graph.__setattr__('stage', 'front') + + node_0_out_port = Node(graph, '0').out_port(0) + + node_1_in_port = Node(graph, '1').in_port(0) + + self.assertEqual(node_0_out_port.get_destination(), node_1_in_port) + + def test_port_get_source_1(self): + graph = build_graph(self.nodes, [ + ('0', '1'), + ]) + graph.__setattr__('stage', 'front') + + node_0_out_port = Node(graph, '0').out_port(0) + + node_1_in_port = Node(graph, '1').in_port(0) + + self.assertEqual(node_1_in_port.get_source(), node_0_out_port) + + def test_port_get_source_2(self): + graph = build_graph(self.nodes, [ + ('0', '1'), + ('2', '1') + ]) + graph.__setattr__('stage', 'front') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + + self.assertEqual(node_1.in_port(0).get_source(), node_0.out_port(0)) + self.assertEqual(node_1.in_port(1).get_source(), node_2.out_port(0)) + + def test_port_get_source_3(self): + graph = build_graph(self.nodes, []) + graph.__setattr__('stage', 'front') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + + node_0.add_input_port(0) + node_1.add_input_port(0) + node_2.add_input_port(0) + + self.assertEqual(node_0.in_port(0).get_source(), None) + self.assertEqual(node_1.in_port(0).get_source(), None) + self.assertEqual(node_2.in_port(0).get_source(), None) + + def test_port_disconnect_1(self): + # ,-->1-->1_data 0-->0_data + # 0-->0_data/--->2-->2_data ==> 0-->0_data 1-->1_data + # 2-->2_data + graph = build_graph(self.nodes, [ + ('0', '1'), + ('0', '2') + ]) + graph.__setattr__('stage', 'front') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + + node_0.out_port(0).disconnect() + + self.assertEqual(node_1.in_port(0).get_source(), None) + self.assertEqual(node_2.in_port(0).get_source(), None) + + self.assertTrue(len(node_1.in_nodes()) == 0) + self.assertTrue(len(node_2.in_nodes()) == 0) + + def test_port_disconnect_2(self): + # ,-->1 ,-->1 + # 0-->/--->2 ==> 0-->/ 2 + # + graph = build_graph(self.nodes, [ + ('0', '1'), + ('0', '2') + ]) + graph.__setattr__('stage', 'front') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + + node_2.in_port(0).disconnect() + + self.assertEqual(node_0.out_port(0).get_destination(), node_1.in_port(0)) + self.assertEqual(node_1.in_port(0).get_source(), node_0.out_port(0)) + self.assertEqual(node_2.in_port(0).get_source(), None) + + self.assertTrue(len(node_0.out_nodes()) == 1) + self.assertTrue(len(node_1.in_nodes()) == 1) + self.assertTrue(len(node_2.in_nodes()) == 0) + + def test_port_disconnect_3(self): + # 1---\ 1 + # 0---->2 ==> 0-->2 + # + graph = build_graph(self.nodes, [ + ('0', '2'), + ('1', '2') + ]) + graph.__setattr__('stage', 'front') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + + node_2.in_port(1).disconnect() + + self.assertEqual(node_0.out_port(0).get_destination(), node_2.in_port(0)) + self.assertEqual(node_2.in_port(0).get_source(), node_0.out_port(0)) + self.assertEqual(node_1.out_port(0).get_destination(), None) + + self.assertTrue(len(node_0.out_nodes()) == 1) + self.assertTrue(len(node_1.in_nodes()) == 
0) + self.assertTrue(len(node_2.in_nodes()) == 1) + + def test_port_disconnect_4(self): + # 1-----\ 0 + # 0------>2 ==> 1--->2 + # + graph = build_graph(self.nodes, [ + ('0', '2'), + ('1', '2') + ]) + graph.__setattr__('stage', 'front') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + + node_2.in_port(0).disconnect() + + self.assertEqual(node_1.out_port(0).get_destination(), node_2.in_port(1)) + self.assertEqual(node_2.in_port(1).get_source(), node_1.out_port(0)) + self.assertEqual(node_2.in_port(0).get_source(), None) + self.assertEqual(node_0.out_port(0).get_destination(), None) + + def test_port_disconnected_1(self): + graph = build_graph(self.nodes, [ + ('0', '1'), + ('1', '2') + ]) + graph.__setattr__('stage', 'front') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + node_2.add_output_port(0) + node_0.add_input_port(0) + + self.assertTrue(not node_0.out_port(0).disconnected()) + self.assertTrue(not node_1.out_port(0).disconnected()) + self.assertTrue(not node_1.in_port(0).disconnected()) + self.assertTrue(node_2.out_port(0).disconnected()) + self.assertTrue(node_0.in_port(0).disconnected()) + + def test_port_get_connection_1(self): + graph = build_graph(self.nodes, [ + ('0', '1'), + ('1', '2'), + ('1', '3'), + ]) + graph.__setattr__('stage', 'front') + + node_1 = Node(graph, '1') + node_2 = Node(graph, '3') + node_3 = Node(graph, '2') + + c = node_1.out_port(0).get_connection() + + self.assertTrue(c.get_source() == node_1.out_port(0)) + for port in c.get_destinations(): + self.assertTrue(port in [node_2.in_port(0), node_3.in_port(0)]) + + ########################################### + ### TESTS FOR CONNECTION CLASS METHODS #### + ########################################### + + def test_connection_set_source_1(self): + graph = build_graph(self.nodes, [ + ('0', '1'), + ('0', '2'), + ('3', '4'), + ]) + graph.__setattr__('stage', 'front') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + node_3 = Node(graph, '3') + node_4 = Node(graph, '4') + + c = node_0.out_port(0).get_connection() + c.set_source(node_3.out_port(0)) + + self.assertEqual(node_0.out_port(0).get_destinations(), []) + destinations = node_3.out_port(0).get_destinations() + for port in destinations: + self.assertTrue(port in [node_1.in_port(0), node_2.in_port(0), node_4.in_port(0)]) + + def test_connection_set_source_2(self): + # 2 ,->2 + # 0-->1 ==> 0/-->1 + # + graph = build_graph(self.nodes, [ + ('0', '1'), + ]) + graph.__setattr__('stage', 'front') + + node_0 = Node(graph, '0') + node_2 = Node(graph, '2') + node_2.add_input_port(0) + + node_2.in_port(0).get_connection().set_source(node_0.out_port(0)) + + graph_ref = build_graph(self.nodes, [ + ('0', '1', {'out': 0, 'in': 0}), + ('0', '2', {'out': 0, 'in': 0}), + ]) + + (flag, resp) = compare_graphs(graph, graph_ref, '0', check_op_attrs=True) + self.assertTrue(flag, resp) + + def test_connection_set_source_3(self): + # 0 1 ==> 0-->1 + graph = build_graph(self.nodes, []) + graph.__setattr__('stage', 'front') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + + node_0.add_output_port(0) + node_1.add_input_port(0) + + node_1.in_port(0).get_connection().set_source(node_0.out_port(0)) + + graph_ref = build_graph(self.nodes, [ + ('0', '1', {'out': 0, 'in': 0}), + ]) + + (flag, resp) = compare_graphs(graph, graph_ref, '0', check_op_attrs=True) + self.assertTrue(flag, resp) + + def test_connection_set_destination(self): + # ,->2-->2_data-->3-->3_data 
,->2-->2_data + # 0-->0_data/-->1-->1_data ==> 0-->0_data/-->3-->3_data + # + graph = build_graph(self.nodes, [ + ('0', '1'), + ('0', '2'), + ('2', '3'), + ]) + graph.__setattr__('stage', 'front') + + graph_ref = build_graph(self.nodes, [ + ('0', '3'), + ('0', '2'), + ]) + + node_1 = Node(graph, '1') + node_3 = Node(graph, '3') + + node_3.in_port(0).disconnect() + node_1.in_port(0).get_connection().set_destination(node_3.in_port(0)) + + (flag, resp) = compare_graphs(graph, graph_ref, '0', check_op_attrs=True) + self.assertTrue(flag, resp) + + def test_connection_add_destination_1(self): + # 3 ,-->3 + # ,->2 ,-->2 + # 0--/-->1 ==> 0--/-->1 + # + graph = build_graph(self.nodes, [ + ('0', '1', {'in': 0, 'out': 0}), + ('0', '2', {'in': 0, 'out': 0}), + ]) + graph.__setattr__('stage', 'front') + + graph_ref = build_graph(self.nodes, [ + ('0', '1', {'in': 0, 'out': 0}), + ('0', '2', {'in': 0, 'out': 0}), + ('0', '3', {'in': 0, 'out': 0}), + ]) + + node_0 = Node(graph, '0') + node_3 = Node(graph, '3') + node_3.add_input_port(idx=0) + + node_0.out_port(0).get_connection().add_destination(node_3.in_port(0)) + + (flag, resp) = compare_graphs(graph, graph_ref, '0', check_op_attrs=True) + self.assertTrue(flag, resp) + + def test_connection_add_destination_2(self): + # 0 + # 1 ==> 0-->1 + graph = build_graph(self.nodes, []) + graph.__setattr__('stage', 'front') + + graph_ref = build_graph(self.nodes, [ + ('0', '1'), + ]) + + node_0 = Node(graph, '0') + node_0.add_output_port(idx=0) + + node_1 = Node(graph, '1') + node_1.add_input_port(idx=0) + + node_0.out_port(0).get_connection().add_destination(node_1.in_port(0)) + + (flag, resp) = compare_graphs(graph, graph_ref, '0', check_op_attrs=True) + self.assertTrue(flag, resp) + + def test_connection_get_source_destinations_1(self): + graph = build_graph(self.nodes, [ + ('0', '1'), + ('0', '2') + ]) + graph.__setattr__('stage', 'front') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + node_1.add_output_port(idx=0) + node_2.add_output_port(idx=0) + + source = node_0.out_port(0).get_connection().get_source() + destinations = node_0.out_port(0).get_connection().get_destinations() + + self.assertEqual(source, node_0.out_port(0)) + for port in destinations: + self.assertTrue(port in [node_1.in_port(0), node_2.in_port(0)]) + + self.assertEqual(node_1.out_port(0).get_connection().get_destination(), None) + self.assertEqual(node_1.out_port(0).get_destination(), None) + + self.assertEqual(node_2.out_port(0).get_connection().get_destination(), None) + self.assertEqual(node_2.out_port(0).get_destination(), None) + + def test_connection_remove_1(self): + graph = build_graph(self.nodes, [ + ('0', '1', {'in': 0, 'out': 0}), + ('0', '2', {'in': 0, 'out': 0}) + ]) + graph.__setattr__('stage', 'front') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + + node_1.in_port(0).get_connection().remove() + + self.assertEqual(node_0.out_port(0).get_destinations(), [node_2.in_port(0)]) + self.assertEqual(node_1.in_port(0).get_source(), None) + self.assertEqual(node_2.in_port(0).get_source(), node_0.out_port(0)) + + def test_connection_remove_2(self): + graph = build_graph(self.nodes, [ + ('0', '1', {'in': 0, 'out': 0}), + ('0', '2', {'in': 0, 'out': 0}) + ]) + graph.__setattr__('stage', 'front') + + node_0 = Node(graph, '0') + node_1 = Node(graph, '1') + node_2 = Node(graph, '2') + + node_0.out_port(0).get_connection().remove() + + self.assertEqual(node_0.out_port(0).get_destinations(), []) + 
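+        # Connection.remove() tears down the whole connection: the producer
+        # keeps its (now empty) output port and every former consumer is left
+        # without a source, as these assertions verify.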
self.assertEqual(node_1.in_port(0).get_source(), None) + self.assertEqual(node_2.in_port(0).get_source(), None) diff --git a/model-optimizer/mo/graph/port.py b/model-optimizer/mo/graph/port.py new file mode 100644 index 0000000..4584cfc --- /dev/null +++ b/model-optimizer/mo/graph/port.py @@ -0,0 +1,275 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +from copy import deepcopy + +import numpy as np +import networkx as nx + +from collections import namedtuple + +from mo.front.common.partial_infer.utils import int64_array +from mo.graph.connection import Connection +from mo.utils.error import Error + + +class Port: + def __init__(self, node, idx: int, type: str): + if type not in ['in', 'out']: + raise Error("Inappropriate port type: {}".format(type)) + + # We use self.__dict__ only to not to call __setattr__ method from __init__ function + self.__dict__['node'] = node + self.__dict__['idx'] = idx + self.__dict__['type'] = type + self.__dict__['data'] = namedtuple('Data', ['get_value', 'get_shape', 'get_attr', 'set_value', 'set_shape', 'set_attr', 'has_valid']) + + self.data.get_shape = self._get_shape + self.data.set_shape = self._set_shape + + self.data.get_value = self._get_value + self.data.set_value = self._set_value + + self.data.get_attr = self._get_attr + self.data.set_attr = self._set_attr + + self.data.has_valid = self._has_valid + + def __eq__(self, other): + return ( + self.__class__ == other.__class__ and + self.node.graph == other.node.graph and + self.node.id == other.node.id and + self.type == other.type and + self.idx == other.idx + ) + + def __deepcopy__(self, memo): + cls = self.__class__ + result = cls.__new__(cls) + memo[id(self)] = result + for k, v in self.__dict__.items(): + result.__dict__[k] = v if k in ['graph', 'node'] else deepcopy(v) + return result + + def __setattr__(self, key, value): + edge = self.node.in_edge(self.idx) if self.type == 'in' else self.node.out_edge(self.idx) + edge[key] = value + + def __getattr__(self, item): + edge = self.node.in_edge(self.idx) if self.type == 'in' else self.node.out_edge(self.idx) + + def _create_data_if_necessary(self): + if self.node.graph.stage == 'front': + raise Error("_create_data_if_necessary method is not applicable for front Graph phase!") + if self.type == 'in': + raise Error("_create_data_if_necessary method is not applicable for 'in' Port type!") + + if self.idx not in self.node.out_nodes(): + from mo.ops.op import Op + Op.create_data_node(self.node.graph, self.node, out_port=self.idx) + self.node['need_shape_inference'] = True + return self.node.out_node(self.idx) + + def _get_shape(self): + if self.node.graph.stage == 'front': + return None + else: + if self.type == 'in': + return self.node.in_node(self.idx).shape + else: + return self.node.out_node(self.idx).shape + + def _set_shape(self, shape): + if self.node.graph.stage == 'front': + raise NotImplementedError("set_shape not implemented for front phase") + else: + if self.type == 'in': + assert 
self.node.in_node(self.idx).value is None + self.node.in_node(self.idx).shape = int64_array(shape) + else: + assert self.node.out_node(self.idx).value is None + self.node.out_node(self.idx).shape = int64_array(shape) + + def _get_value(self): + if self.node.graph.stage == 'front': + return None + else: + if self.type == 'in': + if self.idx in self.node.in_nodes() and self.node.in_node(self.idx).has_valid('value'): + return self.node.in_node(self.idx).value + else: + if self.idx in self.node.out_nodes() and self.node.out_node(self.idx).has_valid('value'): + return self.node.out_node(self.idx).value + return None + + def _set_value(self, value): + if self.node.graph.stage == 'front': + raise Error("set_value is not applicable for graph front phase") + else: + if self.type == 'in': + self.node.in_node(self.idx).value = value + self.node.in_node(self.idx).shape = int64_array(value.shape) + else: + self.node.out_node(self.idx).value = value + self.node.out_node(self.idx).shape = int64_array(value.shape) + + def _get_attr(self, item: str): + if self.node.graph.stage == 'front': + return None + else: + if self.type == 'in': + if self.idx in self.node.in_nodes() and self.node.in_node(self.idx).has_valid(item): + return self.node.in_node(self.idx)[item] + else: + if self.idx in self.node.out_nodes() and self.node.out_node(self.idx).has_valid(item): + return self.node.out_node(self.idx)[item] + return None + + def _set_attr(self, item, value): + raise NotImplementedError() + + def get_in_edge_attrs(self, data=False): + assert self.type == 'in' + for u, v, d in list(self.node.graph.in_edges(self.node.id, data=True)): + if d['in'] == self.idx: + edge_attrs = self.node.graph.get_edge_data(u, v) + for key in edge_attrs: + if edge_attrs[key]['in'] == self.idx: + if data: + return edge_attrs[key], u, v, key + else: + return edge_attrs[key] + if data: + return None, None, None, None + else: + return None + + def _has_valid(self, item): + if self.node.graph.stage == 'front': + raise NotImplementedError + else: + if self.type == 'in': + if self.idx in self.node.in_nodes() and self.node.in_node(self.idx).has_valid(item): + return True + else: + if self.idx in self.node.out_nodes() and self.node.out_node(self.idx).has_valid(item): + return True + return False + + def disconnected(self): + # This method returns False if port connected with some other port + # otherwise it returns True + + if self.type == 'in': + return self.get_source() is None + else: + return len(self.get_destinations()) == 0 + + def get_source(self): + # This method returns Port object that is producer (source) port for out port. + # In case if out port has no source port return None + + assert self.type != 'out', "Can't get source for output port at {} node".format(self.node.name) + + from mo.graph.graph import Node + producer_ports = [] + + has_producer = False + if self.node.graph.stage == 'front': + for n, d in self.node.get_inputs(): + if d['in'] == self.idx: + node = Node(self.node.graph, n) + producer_ports.append(node.out_port(d['out'])) + has_producer = True + if not has_producer: + return None + else: + if self.idx not in self.node.in_nodes(): + return None + + in_data = self.node.in_node(self.idx) + for n, d in in_data.get_inputs(): + node = Node(self.node.graph, n) + producer_ports.append(node.out_port(d['out'])) + + if len(producer_ports) != 1: + raise Error("Something happened with graph! 
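The `Port.data` bundle defined above is the phase-aware accessor surface: in the middle phase the getters read the adjacent data node, while in the front phase they return `None` and the setters raise. Note that `__getattr__` as printed computes `edge` and falls through without a return; the sketch below assumes the intended behavior is to return the edge attribute (e.g. `return edge[item]`), mirroring `__setattr__`:

```python
# A minimal sketch, assuming a middle-phase graph where `node` already has a
# data node attached to input port 1 (graph construction not shown).
import numpy as np
from mo.graph.graph import Node

def describe_value_port(node: Node):
    port = node.in_port(1)

    # Phase-aware getters: both return None while graph.stage == 'front'.
    shape = port.data.get_shape()   # shape of the attached data node
    value = port.data.get_value()   # constant value, or None if not constant

    if value is not None:
        # set_value keeps shape consistent: it also refreshes the data node's
        # shape from value.shape (see _set_value above).
        port.data.set_value(value.astype(np.float32))

    # Plain attribute access on a port proxies the underlying edge attributes
    # via __setattr__ (and, assumed here, a `return edge[item]` in __getattr__).
    port.bin = 'weights'
    return shape, value
```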
data node has {} producers".format(len(producer_ports))) + + return producer_ports[0] + + def get_destination(self): + # This method returns Port that is consumer (destination) port for in port. + # In case if in port has no consumer return None + + consumer_ports = self.get_destinations() + if not consumer_ports: + return None + + if len(consumer_ports) > 1: + raise Error("The number of destinations for {} node at {} port is {}".format(self.node.name, + self.idx, + len(consumer_ports))) + return consumer_ports[0] + + def get_destinations(self): + assert self.type != 'in', "Can't get destinations for input port at {} node".format(self.node.name) + + from mo.graph.graph import Node + consumer_ports = [] + if self.node.graph.stage == 'front': + producer_node = self.node + else: + # In case if node has no output data node in given port, we return None + if self.idx not in self.node.out_nodes(): + return [] + producer_node = self.node.out_node(self.idx) + + for n, d in producer_node.get_outputs(): + node = Node(self.node.graph, n) + consumer_ports.append(node.in_port(d['in'])) + return consumer_ports + + def disconnect(self): + if self.type == 'out': + consumer_ports = self.get_destinations() + if self.node.graph.stage == 'front': + for port in consumer_ports: + self.node.graph.remove_edge(self.node.id, port.node.id) + else: + for port in consumer_ports: + self.node.graph.remove_edge(port.node.in_node(port.idx).id, port.node.id) + else: + source_port = self.get_source() + if source_port is None: + return + for u, v, d in list(self.node.graph.in_edges(self.node.id, data=True)): + if d['in'] == self.idx: + for key in self.node.graph.get_edge_data(u, v): + if self.node.graph.get_edge_data(u, v)[key]['in'] == self.idx: + self.node.graph.remove_edge(u, v, key=key) + return + + def get_connection(self): + if self.type == 'in': + return Connection(self.node.graph, self.get_source(), [self]) + else: + return Connection(self.node.graph, self, self.get_destinations()) + + def connect(self, port): + if self.type == 'in': + self.get_connection().set_source(port) + else: + self.get_connection().add_destination(port) diff --git a/model-optimizer/mo/main.py b/model-optimizer/mo/main.py index f843c5d..ac96364 100644 --- a/model-optimizer/mo/main.py +++ b/model-optimizer/mo/main.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -210,16 +210,14 @@ def driver(argv: argparse.Namespace): raise Error('Incorrect saved model tag was provided. Specify --saved_model_tags with no spaces in it') argv.saved_model_tags = argv.saved_model_tags.split(',') - outputs = None + argv.output = argv.output.split(',') if argv.output else None - if argv.output: - outputs = argv.output.split(',') - - placeholder_shapes = get_placeholder_shapes(argv.input, argv.input_shape, argv.batch) + argv.placeholder_shapes = get_placeholder_shapes(argv.input, argv.input_shape, argv.batch) mean_values = parse_tuple_pairs(argv.mean_values) scale_values = parse_tuple_pairs(argv.scale_values) mean_scale = get_mean_scale_dictionary(mean_values, scale_values, argv.input) + argv.mean_scale_values = mean_scale if not os.path.exists(argv.output_dir): try: @@ -233,7 +231,7 @@ def driver(argv: argparse.Namespace): raise Error("Output directory {} is not writable for current user. 
" + refer_to_faq_msg(22), argv.output_dir) - log.debug("Placeholder shapes : {}".format(placeholder_shapes)) + log.debug("Placeholder shapes : {}".format(argv.placeholder_shapes)) ret_res = 1 if hasattr(argv, 'extensions') and argv.extensions and argv.extensions != '': @@ -259,47 +257,36 @@ def driver(argv: argparse.Namespace): if is_tf: import mo.pipeline.tf as mo_tf - from mo.front.tf.register_custom_ops import update_registration - import_extensions.load_dirs(argv.framework, extensions, update_registration) - ret_res = mo_tf.tf2nx(argv, argv.input_model, model_name, outputs, argv.output_dir, argv.scale, - is_binary=not argv.input_model_is_text, - user_shapes=placeholder_shapes, - mean_scale_values=mean_scale) + from mo.front.tf.register_custom_ops import get_front_classes + import_extensions.load_dirs(argv.framework, extensions, get_front_classes) + ret_res = mo_tf.tf2nx(argv, argv.input_model, model_name, argv.output_dir, + is_binary=not argv.input_model_is_text) elif is_caffe: import mo.pipeline.caffe as mo_caffe - from mo.front.caffe.register_custom_ops import update_registration - import_extensions.load_dirs(argv.framework, extensions, update_registration) - ret_res = mo_caffe.driver(argv, argv.input_proto, argv.input_model, model_name, outputs, argv.output_dir, - argv.scale, - user_shapes=placeholder_shapes, - mean_scale_values=mean_scale, + from mo.front.caffe.register_custom_ops import get_front_classes + import_extensions.load_dirs(argv.framework, extensions, get_front_classes) + ret_res = mo_caffe.driver(argv, argv.input_proto, argv.input_model, model_name, argv.output_dir, mean_file=argv.mean_file, mean_file_offsets=mean_file_offsets, custom_layers_mapping_path=custom_layers_mapping_path) elif is_mxnet: import mo.pipeline.mx as mo_mxnet - from mo.front.mxnet.register_custom_ops import update_registration - import_extensions.load_dirs(argv.framework, extensions, update_registration) - ret_res = mo_mxnet.driver(argv, argv.input_model, model_name, outputs, argv.output_dir, argv.scale, - placeholder_shapes=placeholder_shapes, - mean_scale_values=mean_scale) + from mo.front.mxnet.register_custom_ops import get_front_classes + import_extensions.load_dirs(argv.framework, extensions, get_front_classes) + ret_res = mo_mxnet.driver(argv, argv.input_model, model_name, argv.output_dir) elif is_kaldi: import mo.pipeline.kaldi as mo_kaldi - from mo.front.kaldi.register_custom_ops import update_registration - import_extensions.load_dirs(argv.framework, extensions, update_registration) - ret_res = mo_kaldi.driver(argv, argv.input_model, model_name, outputs, argv.output_dir, argv.scale, - placeholder_shapes=placeholder_shapes, - mean_scale_values=mean_scale) + from mo.front.kaldi.register_custom_ops import get_front_classes + import_extensions.load_dirs(argv.framework, extensions, get_front_classes) + ret_res = mo_kaldi.driver(argv, argv.input_model, model_name, argv.output_dir) elif is_onnx: import mo.pipeline.onnx as mo_onnx - from mo.front.onnx.register_custom_ops import update_registration - import_extensions.load_dirs(argv.framework, extensions, update_registration) - ret_res = mo_onnx.driver(argv, argv.input_model, model_name, outputs, argv.output_dir, argv.scale, - user_shapes=placeholder_shapes, - mean_scale_values=mean_scale) + from mo.front.onnx.register_custom_ops import get_front_classes + import_extensions.load_dirs(argv.framework, extensions, get_front_classes) + ret_res = mo_onnx.driver(argv, argv.input_model, model_name, argv.output_dir) if ret_res != 0: return ret_res diff 
--git a/model-optimizer/mo/main_test.py b/model-optimizer/mo/main_test.py index 79f9feb..a1501a6 100644 --- a/model-optimizer/mo/main_test.py +++ b/model-optimizer/mo/main_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/middle/passes/conv.py b/model-optimizer/mo/middle/passes/conv.py index 9c6654f..2d4160b 100644 --- a/model-optimizer/mo/middle/passes/conv.py +++ b/model-optimizer/mo/middle/passes/conv.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,23 +14,23 @@ limitations under the License. """ -import copy import logging as log -import networkx as nx import numpy as np from mo.front.common.layout import get_batch_dim, get_features_dim from mo.front.common.partial_infer.utils import assign_dims_to_weights from mo.front.extractor import add_attrs_props from mo.front.extractor import update_ie_fields -from mo.graph.graph import Node, unique_id +from mo.graph.graph import Node, Graph +from mo.graph.port import Port from mo.middle.passes.fusing.helpers import get_value_id, get_tensor_id from mo.middle.pattern_match import apply_pattern -from mo.ops.op import Op +from mo.ops.const import Const +from mo.ops.scale_shift import ScaleShiftOp -def pad_op_transform(graph: nx.MultiDiGraph, match: dict): +def pad_op_transform(graph: Graph, match: dict): op = match['op'] pad_op = match['pad_op'] input_data = pad_op.in_node(0) @@ -45,7 +45,7 @@ def pad_op_transform(graph: nx.MultiDiGraph, match: dict): return input_tensor_dims = len(match['pad_output'].shape) - if np.any(pads[get_features_dim(op.graph.graph['layout'],input_tensor_dims)] != 0) or \ + if np.any(pads[get_features_dim(op.graph.graph['layout'], input_tensor_dims)] != 0) or \ np.any(pads[get_batch_dim(op.graph.graph['layout'], input_tensor_dims)] != 0): log.info('The pad node "{}" with padding over feature/batch dimension cannot be fused.'.format( pad_op.soft_get('name'))) @@ -60,7 +60,7 @@ def pad_op_transform(graph: nx.MultiDiGraph, match: dict): graph.add_edge(input_data.id, match['op'].id, **{'in': 0, **edge_attrs}) -def fuse_pad(graph: nx.MultiDiGraph): +def fuse_pad(graph: Graph): for op_type in ['Convolution', 'Pooling', 'Deconvolution']: apply_pattern( graph, @@ -74,7 +74,7 @@ def fuse_pad(graph: nx.MultiDiGraph): ) -def convert_matmul_to_fully_connected(graph: nx.MultiDiGraph): +def convert_matmul_to_fully_connected(graph: Graph): apply_pattern( graph, nodes=[ @@ -85,7 +85,7 @@ def convert_matmul_to_fully_connected(graph: nx.MultiDiGraph): ) -def matmul_to_fully_connected_action(graph: nx.MultiDiGraph, match: dict): +def matmul_to_fully_connected_action(graph: Graph, match: dict): log.debug('fully_connected_matched') matmul = match['matmul'] input = matmul.in_node(0) @@ -96,11 +96,11 @@ def matmul_to_fully_connected_action(graph: nx.MultiDiGraph, match: dict): len(weights_consumers) if weights_consumers is not None else None)) if not (weights.value is not None and - input.shape is not None and - len(input.shape) >= 2 and - weights.shape is not None and - len(weights.shape) == 2 and - len(weights_consumers) >= 1): + input.shape is not None and + len(input.shape) >= 2 and + weights.shape is not None and + len(weights.shape) 
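The conv.py hunks above retype the passes from `nx.MultiDiGraph` to the new `Graph` class, but the structure of a pattern pass is unchanged: declare named op/data nodes, the edges between them, and an action callback that receives the match dictionary. A stripped-down sketch of that registration shape (the op name `'MatMul'` is assumed from the function names; the real action body is the one in the hunk):

```python
# A minimal sketch of the apply_pattern idiom used by the passes above.
from mo.graph.graph import Graph
from mo.middle.pattern_match import apply_pattern

def matmul_action(graph: Graph, match: dict):
    # `match` maps pattern node names to Node objects, e.g. match['matmul'];
    # the real action validates shapes and may set can_be_fused = False.
    match['matmul']['can_be_fused'] = True  # illustrative only

def convert_matmul(graph: Graph):
    apply_pattern(
        graph,
        nodes=[('matmul', dict(kind='op', op='MatMul')),   # op name assumed
               ('output', dict(kind='data'))],
        edges=[('matmul', 'output')],
        action=matmul_action,
    )
```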
== 2 and + len(weights_consumers) >= 1): matmul['can_be_fused'] = False return @@ -116,7 +116,7 @@ def matmul_to_fully_connected_action(graph: nx.MultiDiGraph, match: dict): # Do not transpose weights in this pass, it will be done as a separate pass -def transpose_fully_connected_weights(graph: nx.MultiDiGraph): +def transpose_fully_connected_weights(graph: Graph): transposed_for_IE = 'transposed_for_IE' for node in graph.nodes(): node = Node(graph, node) @@ -133,58 +133,7 @@ def transpose_fully_connected_weights(graph: nx.MultiDiGraph): weights.shape = np.array(weights.value.shape) -def gemm_to_fully_connected_action(graph: nx.MultiDiGraph, match: dict): - log.debug('gemm_to_fully_connected_action is triggered') - gemm = match['gemm'] - A = gemm.in_node(0) - B = gemm.in_node(1) - B_consumers = graph.out_edges(B.node) - C = gemm.in_node(2) - C_consumers = graph.out_edges(C.node) - - if not (B.value is not None and - C.value is not None and - A.shape is not None and - C.shape.size == 1 and - not gemm.transpose_a and - (len(B_consumers) == 1 or not gemm.transpose_b)): - log.warning('Cannot convert Gemm to FullyConnected') - return - - if gemm.transpose_b: - # B.value = B.value.transpose() - # B.shape = np.array(B.value.shape, dtype=np.int64) - gemm.transpose_b = 0 - else: - B.value = B.value.transpose() - B.shape = np.array(B.value.shape, dtype=np.int64) - - gemm['out-size'] = gemm.out_node().shape[-1] - gemm['type'] = 'FullyConnected' - gemm['channel_dims'] = len(match['output'].shape) - 1 - gemm['bias_addable'] = True - gemm['input_channel_dim'] = 1 # MatMul weights in IO - gemm['output_channel_dim'] = 0 - gemm['layout'] = 'NCHW' - gemm.in_edge(1)['bin'] = 'weights' - gemm.in_edge(2)['bin'] = 'biases' - - assign_dims_to_weights(gemm.in_node(1), None, 1, 0, 2) - # Do not transpose weights in this pass, it will be done as a separate pass - - -def convert_gemm_to_fully_connected(graph: nx.MultiDiGraph): - apply_pattern( - graph, - nodes=[ - ('gemm', dict(kind='op', op='Gemm')), - ('output', dict(kind='data'))], - edges=[('gemm', 'output')], - action=gemm_to_fully_connected_action - ) - - -def muladd_to_scaleshift_action(graph: nx.MultiDiGraph, match: dict): +def muladd_to_scaleshift_action(graph: Graph, match: dict): mul = match['mul'] add = match['add'] output = match['output'] @@ -212,15 +161,15 @@ def muladd_to_scaleshift_action(graph: nx.MultiDiGraph, match: dict): # Transform values weights.value = np.squeeze(weights.value) - weights.shape = weights.value.shape + weights.shape = np.array(weights.value.shape, dtype=np.int64) bias.value = np.squeeze(bias.value) - bias.shape = bias.value.shape + bias.shape = np.array(bias.value.shape, dtype=np.int64) # Broadcast weights if they are scalar if weights.value.ndim == 0 and bias.value.ndim == 1: weights.value = np.full(bias.shape, weights.value.item()) - weights.shape = weights.value.shape + weights.shape = np.array(weights.value.shape, dtype=np.int64) if bias.shape != weights.shape: log.warning('Mul->Add to ScaleShift conversion stoped {} != {}'.format(weights.shape, bias.shape)) @@ -243,7 +192,7 @@ def muladd_to_scaleshift_action(graph: nx.MultiDiGraph, match: dict): graph.remove_edge(bias.node, add.id) graph.remove_edge(add.node, output.id) - op_node = unique_id(graph, mul.name + '/Fused{}_'.format(op_name)) + op_node = graph.unique_id(mul.name + '/Fused{}_'.format(op_name)) if op_name == 'ScaleShift': graph.add_node(op_node, **add_attrs_props(dict(kind='op', precision="FP32", type=op_name, name=op_node, op=op_name, 
data_type=input.data_type))) @@ -254,6 +203,10 @@ def muladd_to_scaleshift_action(graph: nx.MultiDiGraph, match: dict): (bias.node, op_node, {'in': 2, 'bin': 'biases'}), (op_node, output.node, {'out': 0}) ]) + scsh = Node(graph, op_node) + scsh.add_input_port(0) + scsh.add_input_port(1) + scsh.add_output_port(0) else: graph.add_node(op_node, **add_attrs_props(dict(kind='op', precision="FP32", type=op_name, name=op_node, op=op_name, data_type=input.data_type, power=1, @@ -263,11 +216,13 @@ def muladd_to_scaleshift_action(graph: nx.MultiDiGraph, match: dict): (input.node, op_node, {'in': 0}), (op_node, output.node, {'out': 0}) ]) - + scsh = Node(graph, op_node) + scsh.add_input_port(0) + scsh.add_output_port(0) return -def convert_muladd_to_scaleshift_or_power(graph: nx.MultiDiGraph): +def convert_muladd_to_scaleshift_or_power(graph: Graph): apply_pattern( graph, nodes=[ @@ -291,7 +246,7 @@ def convert_muladd_to_scaleshift_or_power(graph: nx.MultiDiGraph): ) -def batch_norm_fuse_action(graph: nx.MultiDiGraph, match: dict): +def batch_norm_fuse_action(graph: Graph, match: dict): """ Multiply convolution kernel by batch normalization coefficient and remove mul op. """ @@ -309,7 +264,7 @@ def batch_norm_fuse_action(graph: nx.MultiDiGraph, match: dict): graph.add_edge(match['conv'].node, match['mul_output'].node, out=0) -def batch_norm_fuse(graph: nx.MultiDiGraph): +def batch_norm_fuse(graph: Graph): apply_pattern( graph, nodes=[ @@ -330,296 +285,60 @@ def batch_norm_fuse(graph: nx.MultiDiGraph): return graph -def convert_add_to_scaleshift(graph: nx.MultiDiGraph): - for n in list(graph.nodes()): - node = Node(graph, n) - if node.has('op') and (node.op == 'BiasAdd' or node.op == 'Add') and len(node.in_nodes()) == 2: - tensor_id, value_id = get_tensor_id(node), get_value_id(node) - if tensor_id is not None and value_id is not None and node.soft_get('can_be_scaleshift') is not False: - node['type'] = 'ScaleShift' - node['op'] = 'ScaleShift' - node.in_node(value_id).value = np.squeeze(node.in_node(value_id).value) - node.in_node(value_id).shape = node.in_node(value_id).value.shape - - # if the node was created with eltwise then it has attribute 'operation' which should be removed from - # the IR - if node.has('operation'): - del graph.node[n]['operation'] - - bias_data = node.in_node(value_id) - graph[bias_data.node][node.node][0]['in'] = 2 - graph[bias_data.node][node.node][0]['bin'] = 'biases' - - input_data = node.in_node(tensor_id) - graph[input_data.node][node.node][0]['in'] = 0 - - update_ie_fields(graph.node[node.id]) - - weights_id = unique_id(graph, 'weights_') - graph.add_node(weights_id, **add_attrs_props( - dict(kind='data', precision="FP32", name=weights_id, value=None, shape=None, data_type=None, - infer=None))) - wnode = Node(graph, weights_id) - - wnode['value'] = np.full_like(bias_data.value, 1, dtype=np.float32) - wnode['shape'] = np.array(wnode['value'].shape) - - graph.add_edges_from([ - (weights_id, node.node, {'in': 1, 'bin': 'weights'}), - ]) - - -def convert_mul_to_scaleshift(graph: nx.MultiDiGraph): - for n in list(graph.nodes()): - node = Node(graph, n) - if node.has('op') and node.op == 'Mul' and len(node.in_nodes()) == 2: - tensor_id, value_id = get_tensor_id(node), get_value_id(node) - if tensor_id is not None and value_id is not None and node.soft_get('can_be_scaleshift') is not False: - node['type'] = 'ScaleShift' - node['op'] = 'ScaleShift' - node.in_node(value_id).value = np.squeeze(node.in_node(value_id).value) - node.in_node(value_id).shape = 
node.in_node(value_id).value.shape - - # if the node was created with eltwise then it has attribute 'operation' which should be removed from - # the IR - if node.has('operation'): - del graph.node[n]['operation'] - - scale_data = node.in_node(value_id) - graph[scale_data.node][node.node][0]['in'] = 1 - graph[scale_data.node][node.node][0]['bin'] = 'weights' - - input_data = node.in_node(tensor_id) - graph[input_data.node][node.node][0]['in'] = 0 - - update_ie_fields(graph.node[node.id]) - - bias_id = unique_id(graph, 'bias_') - graph.add_node(bias_id, **add_attrs_props( - dict(kind='data', precision="FP32", name=bias_id, value=None, shape=None, data_type=None, - infer=None))) - wnode = Node(graph, bias_id) - - wnode['value'] = np.full_like(scale_data.value, 0, dtype=np.float32) - wnode['shape'] = np.array(wnode['value'].shape) - - graph.add_edges_from([ - (bias_id, node.node, {'in': 2, 'bin': 'biases'}), - ]) - - -def convert_nasnet_action(graph: nx.MultiDiGraph, matches: dict): - """ - This function converts speciefic for NasNet topology subgraph Pad->StridedSlice->AvgPool to Conv->Crop->AvgPool - """ - input = matches['input'] - - pad_op = matches['pad_op'] - - sslice = matches['sslice'] - sslice_out = matches['sslice_out'] - begin = [] - end = [] - stride = [] - for s in sslice.slices: - begin.append(s.start) - end.append(s.stop) - stride.append(s.step) - - if not np.array_equal(pad_op.pads, np.array([[0, 0], [0, 1], [0, 1], [0, 0]])): - log.error(" Pad values doesn't match!") - return - - if not np.array_equal(begin, np.array([0, 1, 1, 0])): - log.error("StridedSlice has wrong begin") - return - - if sslice.end_mask != 15 or sslice.begin_mask != 9: - log.error("StridedSlice has wrong masks") - return - - # Cut Smth-x->Pad->StrudedSlice-x->AvgPool - graph.remove_edge(input.id, pad_op.id) - graph.remove_edge(sslice.id, sslice_out.id) - - # Pad -> Conv - conv_node = unique_id(graph, pad_op.name + '/Conv_') - conv_weights_node = unique_id(graph, pad_op.name + '/ConvW_') - conv_weights = np.ones((1, 1, input.shape[3], 1)) - conv_output = unique_id(graph, pad_op.name + '/ConvOut_') - output_shape = np.array([input.shape[0], input.shape[1] + 1, input.shape[2] + 1, input.shape[3]]) - - graph.add_node(conv_node, - **add_attrs_props(dict(kind='op', precision="FP32", type='Convolution', name=conv_node, op='Conv2D', - stride=np.array([1, 1, 1, 1]), dilation=np.array([1, 1, 1, 1]), - group=input.shape[3], bias_addable=True, bias_term=False, - spatial_dims=np.array([1, 2]), - kernel_spatial=np.array([1, 1]), - pad=np.array([[0, 0], [0, 0], [0, 0], [0, 0]]), output_shape=output_shape, - channel_dims=np.array([3])))) - - graph.add_node(conv_weights_node, **add_attrs_props( - dict(kind='data', precision="FP32", name=conv_weights_node, value=np.array(conv_weights), - shape=np.array(conv_weights.shape), - data_type=input.data_type, infer=None, - spatial_dims=np.array([0, 1]), - input_channel_dim=2, - output_channel_dim=3, - dims_number=4, can_be_bias=True))) - graph.add_node(conv_output, **add_attrs_props( - dict(kind='data', precision="FP32", name=conv_output, value=None, shape=output_shape, - data_type=input.data_type))) - - # StridedSlice -> Crop - crop_cls = Op.get_op_class_by_name('Crop') - crop = crop_cls(graph, dict(name=sslice.name + '/Crop_', axis=np.array([1, 2]), - dim=np.array([output_shape[1] - 1, output_shape[2] - 1]), offset=np.array([1, 1]))) - crop.create_node_with_data([Node(graph, conv_output)], data_nodes=sslice_out) - - # Connect : Conv->Crop->AvgPool - graph.add_edges_from([ - 
(input.id, conv_node, {'in': 0}), - (conv_weights_node, conv_node, {'in': 1, 'bin': 'weights'}), - (conv_node, conv_output, {'out': 0}), - ]) - update_ie_fields(graph.node[conv_node], graph.graph['ir_version']) - - -def convert_nasnet(graph: nx.MultiDiGraph): - apply_pattern( - graph, - nodes=[ - ('input', dict(kind='data')), - ('pad_op', dict(kind='op', op='Pad')), - ('pad_out', dict(kind='data')), - - ('begin', dict(kind='data')), - ('end', dict(kind='data')), - ('stride', dict(kind='data')), - - ('sslice', dict(kind='op', op='StridedSlice')), - ('sslice_out', dict(kind='data')), - - ('avg_pool', dict(kind='op', op='AvgPool')), - ('output', dict(kind='data')), - ], - edges=[ - ('input', 'pad_op', {'in': 0}), - ('pad_op', 'pad_out'), - - ('begin', 'sslice', {'in': 1}), - ('end', 'sslice', {'in': 2}), - ('stride', 'sslice', {'in': 3}), - - ('pad_out', 'sslice', {'in': 0}), - ('sslice', 'sslice_out'), - - ('sslice_out', 'avg_pool', {'in': 0}), - ('avg_pool', 'output') - ], - action=convert_nasnet_action - ) - return graph - - -def dilated_convolution_action(graph: nx.MultiDiGraph, match: dict): - conv = match['conv'] - stb = match['space_to_batch'] - bts = match['batch_to_space'] - - block_size = match['stb_bs'] - - input = match['input'] - output = match['output'] - stb_out = match['stb_output'] - conv_out = match['conv_output'] - - in_edge_attrs = graph.get_edge_data(input.id, stb.id)[0] - out_edge_attrs = graph.get_edge_data(bts.id, output.id)[0] - - graph.remove_edge(input.id, stb.id) - graph.remove_edge(stb_out.id, conv.id) - graph.remove_edge(conv.id, conv_out.id) - graph.remove_edge(bts.id, output.id) - - conv.dilation[conv.spatial_dims] = block_size.value - - pad = match['stb_pad'].value - match['bts_crop'].value - conv.pad[conv.spatial_dims] = [[pad[x][0], pad[x][1]] for x in range(len(pad))] - conv['auto_pad'] = None - - graph.add_edges_from([ - (input.id, conv.id, {'in': 0, **in_edge_attrs}), - (conv.id, output.id, {'out': 0, **out_edge_attrs}), - ]) - - -def convert_dilated_convolution(graph: nx.MultiDiGraph): - for op in ['Conv2D', 'DepthwiseConv2dNative', 'Conv3D']: - apply_pattern( - graph, - nodes=[ - ('conv', dict(kind='op', op=op)), - ('space_to_batch', dict(kind='op', op='SpaceToBatchND')), - ('batch_to_space', dict(kind='op', op='BatchToSpaceND')), - ('input', dict(kind='data')), - ('output', dict(kind='data')), - ('conv_output', dict(kind='data')), - ('stb_output', dict(kind='data')), - ('stb_bs', dict(kind='data')), - ('stb_pad', dict(kind='data')), - ('bts_bs', dict(kind='data')), - ('bts_crop', dict(kind='data')) - ], - edges=[ - ('input', 'space_to_batch', {'in': 0}), - ('stb_bs', 'space_to_batch', {'in': 1}), - ('stb_pad', 'space_to_batch', {'in': 2}), - ('space_to_batch', 'stb_output', {'out': 0}), - ('stb_output', 'conv', {'in': 0}), - ('conv', 'conv_output', {'out': 0}), - ('conv_output', 'batch_to_space', {'in': 0}), - ('bts_bs', 'batch_to_space', {'in': 1}), - ('bts_crop', 'batch_to_space', {'in': 2}), - ('batch_to_space', 'output', {'out': 0}), - ], - action=dilated_convolution_action - ) - - -def convert_multi_input_conv(graph: nx.MultiDiGraph): - for node in list(graph.nodes()): - node = Node(graph, node) - if node.kind == 'op' and node.op == 'ConvND': - node.op = 'Conv2D' - if node.bias_term == True: - num_inputs = len(node.in_nodes()) - 2 - w_node = node.in_node(len(node.in_nodes()) - 2) - b_node = node.in_node(len(node.in_nodes()) - 1) - else: - num_inputs = len(node.in_nodes()) - 1 - w_node = node.in_node(len(node.in_nodes()) - 1) - - for i in range(1, 
num_inputs): - in_i = node.in_node(i) - out_i = node.out_node(i) - conv_id = unique_id(graph, node.id + '__') - graph.add_node(conv_id, **copy.deepcopy(node.get_attrs())) - new_conv = Node(graph, conv_id) - new_conv.name = conv_id - - graph.remove_edge(in_i.id, node.id) - graph.remove_edge(node.id, out_i.id) - graph.add_edges_from([ - (w_node.id, conv_id, {'in': 1, 'bin': 'weights'}), - ]) - - if node.bias_term == True: - graph.add_edges_from([ - (b_node.id, conv_id, {'in': 2, 'bin': 'biases'}), - ]) - - graph.add_edges_from([ - (in_i.id, conv_id, {'in': 0}), - ]) - graph.add_edge(conv_id, out_i.id, **{'out': 0}) +def get_tensor_in_port(node) -> Port: + tensor_ports = [] + for port in node.in_ports().values(): + if port.data.get_value() is None: + tensor_ports.append(port) + return None if len(tensor_ports) != 1 else tensor_ports[0] + + +def get_value_in_port(node) -> Port: + value_ports = [] + for port in node.in_ports().values(): + if port.data.get_value() is not None: + value_ports.append(port) + return None if len(value_ports) != 1 else value_ports[0] + + +def convert_add_or_mul_to_scaleshift(graph: Graph): + op_nodes = graph.get_op_nodes() + for node in op_nodes: + if node.soft_get('op') in ['BiasAdd', 'Add', 'Mul'] and len(node.in_ports()) == 2: + tensor_port, value_port = get_tensor_in_port(node), get_value_in_port(node) + + if tensor_port is not None and value_port is not None and node.soft_get('can_be_scaleshift') is not False: + # Remove 1 dims from value array (should be 1D) + value_port.data.set_value(np.squeeze(value_port.data.get_value())) # Updated shapes accordingly + + # Create ScaleShift operation + scsh_op = ScaleShiftOp(graph, dict(name='ScaleShift/{}'.format(node.name))).create_node() + + if node.op == 'Mul': + # Create fake biases for scale shift node + const_op = Const(graph, dict(name='{}/biases'.format(scsh_op.name), + value=np.zeros(value_port.data.get_shape(), dtype=np.float32), + shape=np.array(value_port.data.get_shape()), + )).create_node() + + # Reconnect input and weights to scale shift node + tensor_port.get_connection().set_destination(scsh_op.in_port(0)) + value_port.get_connection().set_destination(scsh_op.in_port(1)) + const_op.out_port(0).connect(scsh_op.in_port(2)) + else: + # Create fake weights for scale shift node + const_op = Const(graph, dict(name='{}/weights'.format(scsh_op.name), + value=np.ones(value_port.data.get_shape(), dtype=np.float32), + shape=np.array(value_port.data.get_shape()), + )).create_node() + + # Reconnect input and biases to scale shift node + tensor_port.get_connection().set_destination(scsh_op.in_port(0)) + const_op.out_port(0).connect(scsh_op.in_port(1)) + value_port.get_connection().set_destination(scsh_op.in_port(2)) + + node.out_port(0).get_connection().set_source(scsh_op.out_port(0)) + + # Set bin attribute to ScaleShift input ports + scsh_op.in_port(1).bin = 'weights' + scsh_op.in_port(2).bin = 'biases' diff --git a/model-optimizer/mo/middle/passes/conv_test.py b/model-optimizer/mo/middle/passes/conv_test.py index ad4e3aa..9b1fd73 100644 --- a/model-optimizer/mo/middle/passes/conv_test.py +++ b/model-optimizer/mo/middle/passes/conv_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
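The rewritten pass above classifies a node's two inputs by constness: `get_tensor_in_port` returns the single non-constant input, `get_value_in_port` the single constant one, and the missing ScaleShift operand is synthesized as a `Const` of ones (weights, for Mul) or zeros (biases, for Add). A sketch of the helpers in use, assuming a middle-phase graph with an `Add(tensor, const)` node:

```python
# A minimal sketch, assuming `node` is an Add/Mul op node in a middle-phase graph.
import numpy as np
from mo.middle.passes.conv import get_tensor_in_port, get_value_in_port

def classify_inputs(node):
    # Exactly one port must carry a constant and one a runtime tensor;
    # otherwise a helper returns None and the pass skips the node.
    tensor_port = get_tensor_in_port(node)
    value_port = get_value_in_port(node)
    if tensor_port is None or value_port is None:
        return None

    # For Add the synthesized weights are ones; for Mul the biases are zeros.
    fake_weights = np.ones(value_port.data.get_shape(), dtype=np.float32)
    return tensor_port, value_port, fake_weights
```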
@@ -18,7 +18,8 @@ import unittest import numpy as np -from mo.middle.passes.conv import convert_muladd_to_scaleshift_or_power +from mo.graph.graph import Node +from mo.middle.passes.conv import convert_muladd_to_scaleshift_or_power, convert_add_or_mul_to_scaleshift from mo.middle.passes.eliminate import graph_clean_up from mo.utils.unittest.graph import build_graph, compare_graphs @@ -27,19 +28,24 @@ nodes_attributes = { 'placeholder_1_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, # ScaleShift layer 'scaleshift_1': {'type': 'ScaleShift', 'value': None, 'kind': 'op', 'op': 'ScaleShift'}, + 'const_scaleshift_1_w': {'value': None, 'shape': None, 'kind': 'op'}, 'scaleshift_1_w': {'value': None, 'shape': None, 'kind': 'data'}, + 'const_scaleshift_1_b': {'value': None, 'shape': None, 'kind': 'op'}, 'scaleshift_1_b': {'value': None, 'shape': None, 'kind': 'data'}, 'scaleshift_1_data': {'value': None, 'shape': None, 'kind': 'data'}, # Mul and Add operations 'mul_1': {'value': None, 'kind': 'op', 'op': 'Mul'}, + 'const_mul_1_w': {'value': None, 'shape': None, 'kind': 'op'}, 'mul_1_w': {'value': None, 'shape': None, 'kind': 'data'}, 'mul_1_data': {'value': None, 'shape': None, 'kind': 'data'}, 'add_1': {'value': None, 'kind': 'op', 'op': 'Add'}, + 'const_add_1_w': {'value': None, 'shape': None, 'kind': 'op'}, 'add_1_w': {'value': None, 'shape': None, 'kind': 'data'}, 'add_1_data': {'value': None, 'shape': None, 'kind': 'data'}, # Power layer 'power_1': {'type': 'Power', 'kind': 'op', 'op': 'Power', 'scale': None, 'shift': None, 'power': None}, 'power_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'op_output': {'kind': 'op', 'op': 'OpOutput'}, } @@ -48,17 +54,24 @@ class MulAddToScaleShiftOrPower(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'add_1'), + ('const_add_1_w', 'add_1_w'), ('add_1_w', 'add_1'), ('add_1', 'add_1_data'), + ('add_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, - 'add_1_data': {'shape': np.array([1, 227, 227, 3]), 'is_output': True}, + 'add_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array(mul_w.shape) if mul_w is not None else None, + 'value': np.array(mul_w) if mul_w is not None else None}, 'mul_1_w': {'shape': np.array(mul_w.shape) if mul_w is not None else None, 'value': np.array(mul_w) if mul_w is not None else None}, + 'const_add_1_w': {'shape': np.array(add_w.shape) if add_w is not None else None, + 'value': np.array(add_w) if add_w is not None else None}, 'add_1_w': {'shape': np.array(add_w.shape) if add_w is not None else None, 'value': np.array(add_w) if add_w is not None else None}, }) @@ -72,13 +85,18 @@ class MulAddToScaleShiftOrPower(unittest.TestCase): graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'scaleshift_1'), + ('const_scaleshift_1_w', 'scaleshift_1_w'), ('scaleshift_1_w', 'scaleshift_1'), + ('const_scaleshift_1_b', 'scaleshift_1_b'), ('scaleshift_1_b', 'scaleshift_1'), ('scaleshift_1', 'scaleshift_1_data'), + ('scaleshift_1_data', 'op_output'), ], - {'scaleshift_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, + {'const_scaleshift_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, + 'scaleshift_1_w': {'shape': np.array([3]), 
'value': np.array([1, 2, 3])}, + 'const_scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, - 'scaleshift_1_data': {'is_output': True} + 'scaleshift_1_data': {} }) convert_muladd_to_scaleshift_or_power(graph) @@ -93,9 +111,10 @@ class MulAddToScaleShiftOrPower(unittest.TestCase): [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'power_1'), ('power_1', 'power_1_data'), + ('power_1_data', 'op_output'), ], {'power_1': {'scale': 3, 'shift': 2, 'power': 1}, - 'power_1_data': {'is_output': True} + 'power_1_data': {} }) convert_muladd_to_scaleshift_or_power(graph) @@ -144,13 +163,17 @@ class MulAddToScaleShiftOrPower(unittest.TestCase): graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'scaleshift_1'), + ('const_scaleshift_1_w', 'scaleshift_1_w'), ('scaleshift_1_w', 'scaleshift_1'), + ('const_scaleshift_1_b', 'scaleshift_1_b'), ('scaleshift_1_b', 'scaleshift_1'), ('scaleshift_1', 'add_1_data'), + ('add_1_data', 'op_output'), ], - {'scaleshift_1_w': {'shape': np.array([3]), 'value': np.array([3, 3, 3])}, + {'const_scaleshift_1_w': {'shape': np.array([3]), 'value': np.array([3, 3, 3])}, + 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.array([3, 3, 3])}, + 'const_scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([3, 2, 1])}, 'scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([3, 2, 1])}, - 'add_1_data': {'is_output': True} }) convert_muladd_to_scaleshift_or_power(graph) @@ -159,5 +182,118 @@ class MulAddToScaleShiftOrPower(unittest.TestCase): self.assertTrue(flag, resp) +class AddToScaleShift(unittest.TestCase): + @staticmethod + def _create_graph_with_add(add_w: np.ndarray): + graph = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'add_1'), + ('const_add_1_w', 'add_1_w'), + ('add_1_w', 'add_1'), + ('add_1', 'add_1_data'), + ('add_1_data', 'op_output') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'add_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_add_1_w': {'shape': np.array(add_w.shape) if add_w is not None else None, + 'value': np.array(add_w) if add_w is not None else None}, + 'add_1_w': {'shape': np.array(add_w.shape) if add_w is not None else None, + 'value': np.array(add_w) if add_w is not None else None}, + }, nodes_with_edges_only=True) + del graph['add_1']['add_1_data'][0]['in'] + return graph + + @staticmethod + def _create_graph_with_mul(mul_w: np.ndarray): + graph = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), + ('mul_1_w', 'mul_1'), + ('mul_1', 'mul_1_data'), + ('mul_1_data', 'op_output') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array(mul_w.shape) if mul_w is not None else None, + 'value': np.array(mul_w) if mul_w is not None else None}, + 'mul_1_w': {'shape': np.array(mul_w.shape) if mul_w is not None else None, + 'value': np.array(mul_w) if mul_w is not None else None}, + }, nodes_with_edges_only=True) + del graph['mul_1']['mul_1_data'][0]['in'] + return graph + + def test_add_to_scaleshift_1(self): + graph = AddToScaleShift._create_graph_with_add(np.array([1, 2, 3], dtype=np.float32)) + graph.stage = 'middle' + + graph_ref = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), 
+ ('placeholder_1_data', 'scaleshift_1'), + ('const_scaleshift_1_w', 'scaleshift_1_w'), + ('const_scaleshift_1_b', 'scaleshift_1_b'), + ('scaleshift_1_w', 'scaleshift_1'), + ('scaleshift_1_b', 'scaleshift_1'), + ('scaleshift_1', 'scaleshift_1_data'), + ('scaleshift_1_data', 'op_output') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'scaleshift_1_data': {'shape': np.array([1, 227, 227, 3])}, + + 'const_scaleshift_1_w': {'shape': np.array([3]), 'value': np.array([1, 1, 1])}, + 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.array([1, 1, 1])}, + + 'const_scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, + 'scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, + }, nodes_with_edges_only=True) + + convert_add_or_mul_to_scaleshift(graph) + graph_clean_up(graph) + + (flag, resp) = compare_graphs(graph, graph_ref, 'op_output') + self.assertTrue(flag, resp) + + scsh_node = Node(graph, 'op_output').in_port(0).get_source().node + + self.assertTrue(graph.get_edge_data(scsh_node.in_node(1).id, scsh_node.id)[0]['bin'] == 'weights') + self.assertTrue(graph.get_edge_data(scsh_node.in_node(2).id, scsh_node.id)[0]['bin'] == 'biases') + + def test_mul_to_scaleshift_1(self): + graph = AddToScaleShift._create_graph_with_mul(np.array([1, 2, 3], dtype=np.float32)) + graph.stage = 'middle' + + graph_ref = build_graph(nodes_attributes, + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'scaleshift_1'), + ('const_scaleshift_1_w', 'scaleshift_1_w'), + ('const_scaleshift_1_b', 'scaleshift_1_b'), + ('scaleshift_1_w', 'scaleshift_1'), + ('scaleshift_1_b', 'scaleshift_1'), + ('scaleshift_1', 'scaleshift_1_data'), + ('scaleshift_1_data', 'op_output') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'scaleshift_1_data': {'shape': np.array([1, 227, 227, 3])}, + + 'const_scaleshift_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, + 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, + + 'const_scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([0, 0, 0])}, + 'scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([0, 0, 0])}, + }, nodes_with_edges_only=True) + + convert_add_or_mul_to_scaleshift(graph) + graph_clean_up(graph) + + (flag, resp) = compare_graphs(graph, graph_ref, 'op_output') + self.assertTrue(flag, resp) + + scsh_node = Node(graph, 'op_output').in_port(0).get_source().node + + self.assertTrue(graph.get_edge_data(scsh_node.in_node(1).id, scsh_node.id)[0]['bin'] == 'weights') + self.assertTrue(graph.get_edge_data(scsh_node.in_node(2).id, scsh_node.id)[0]['bin'] == 'biases') + + + if __name__ == '__main__': unittest.main() diff --git a/model-optimizer/mo/middle/passes/convert_data_type.py b/model-optimizer/mo/middle/passes/convert_data_type.py index daa1782..5f0d50e 100644 --- a/model-optimizer/mo/middle/passes/convert_data_type.py +++ b/model-optimizer/mo/middle/passes/convert_data_type.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -15,11 +15,9 @@ """ import logging as log - -import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.utils.error import Error from mo.utils.utils import refer_to_faq_msg @@ -28,6 +26,8 @@ SUPPORTED_DATA_TYPES = { 'half': (np.float16, 'FP16'), 'FP32': (np.float32, 'FP32'), 'FP16': (np.float16, 'FP16'), + 'I32': (np.int32, 'I32'), + 'uint8': (np.uint8, 'UI8'), } @@ -39,7 +39,7 @@ def data_type_str_to_precision(data_type_str: str): return SUPPORTED_DATA_TYPES[data_type_str][1] if data_type_str in SUPPORTED_DATA_TYPES else None -def convert_blob(graph: nx.MultiDiGraph, node: Node, data_type: type): +def convert_blob(graph: Graph, node: Node, data_type: type): out_edges = graph.out_edges(node.node, data=True) # if the data.value is used as binary weights @@ -70,7 +70,7 @@ def convert_blob(graph: nx.MultiDiGraph, node: Node, data_type: type): node.value = new_blob -def convert(graph: nx.MultiDiGraph, data_type_str: str): +def convert(graph: Graph, data_type_str: str): for node_name, node_attrs in graph.nodes(data=True): node = Node(graph, node_name) # if the data type is forcibly set then use it diff --git a/model-optimizer/mo/middle/passes/debug.py b/model-optimizer/mo/middle/passes/debug.py index 28c0023..e0f20be 100644 --- a/model-optimizer/mo/middle/passes/debug.py +++ b/model-optimizer/mo/middle/passes/debug.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -46,7 +46,7 @@ def debug_ir_emitter(graph, exclude_attrs: list = []): print("--- DEBUG IR END ---") -def get_output_node_names(graph: nx.MultiDiGraph): +def get_output_node_names(graph: Graph): result = [] for node in graph.nodes(): node = Node(graph, node) diff --git a/model-optimizer/mo/middle/passes/eliminate.py b/model-optimizer/mo/middle/passes/eliminate.py index 2878add..d131875 100644 --- a/model-optimizer/mo/middle/passes/eliminate.py +++ b/model-optimizer/mo/middle/passes/eliminate.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
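The convert_data_type.py hunk above adds `I32` and `uint8` entries; each key in `SUPPORTED_DATA_TYPES` maps to a `(numpy type, IR precision)` pair, and `data_type_str_to_precision` is a plain dictionary lookup over it. For illustration (only the entries visible in the hunk; earlier entries are elided here as in the diff context):

```python
import numpy as np

# The visible part of the mapping from the hunk, reproduced for illustration.
SUPPORTED_DATA_TYPES = {
    'half': (np.float16, 'FP16'),
    'FP32': (np.float32, 'FP32'),
    'FP16': (np.float16, 'FP16'),
    'I32': (np.int32, 'I32'),
    'uint8': (np.uint8, 'UI8'),
}

assert SUPPORTED_DATA_TYPES['I32'][0] is np.int32   # numpy dtype for blobs
assert SUPPORTED_DATA_TYPES['uint8'][1] == 'UI8'    # precision string for the IR
```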
@@ -17,18 +17,21 @@ import logging as log from collections import deque import networkx as nx +import numpy as np -from mo.graph.graph import Node, create_edge +from mo.graph.graph import Node, Graph from mo.middle.pattern_match import apply_pattern +from mo.utils.error import Error from mo.utils.graph import bfs_search, pseudo_topological_sort -def get_nodes_with_attributes(graph: nx.MultiDiGraph, **attrs: dict): +# TODO: dep warning +def get_nodes_with_attributes(graph: Graph, **attrs: dict): node_attrs = graph.nodes(data=True) return [n for n, d in node_attrs if all(a in d.items() for a in attrs.items())] -def reverse_dfs(graph: nx.MultiDiGraph, node_name: str, update_func: callable, visited: set = None): +def reverse_dfs(graph: Graph, node_name: str, update_func: callable, visited: set = None): d = deque() if visited is None: @@ -44,23 +47,23 @@ def reverse_dfs(graph: nx.MultiDiGraph, node_name: str, update_func: callable, v d.append(in_node_name) -def mark_input_nodes(graph: nx.MultiDiGraph, node_name: str, key: str, value): +def mark_input_nodes(graph: Graph, node_name: str, key: str, value): for input, _ in graph.in_edges(node_name): graph.node[input][key] = value -def mark_output_nodes(graph: nx.MultiDiGraph, node_name: str, key: str, value): +def mark_output_nodes(graph: Graph, node_name: str, key: str, value): for output, _ in graph.out_edges(node_name): graph.node[output][key] = value -def mark_output_reachable_nodes(graph: nx.MultiDiGraph): +def mark_output_reachable_nodes(graph: Graph): """ Mark nodes whether they are outputs reachable or not. The node is considered output reachable if it is connected to - one of the nodes that has attribute is_output=True. + one of the nodes that has attribute op=OpOutput. """ nx.set_node_attributes(G=graph, name='is_output_reachable', values=False) - outputs = get_nodes_with_attributes(graph, is_output=True) + outputs = graph.get_nodes_with_attributes(op='OpOutput') log.debug('The following nodes are seeded as output reachable:\n{}'.format('\n'.join(sorted(map(str, outputs))))) nx.set_node_attributes(G=graph, name='is_output_reachable', values={n: True for n in outputs}) visited = set() @@ -69,7 +72,7 @@ def mark_output_reachable_nodes(graph: nx.MultiDiGraph): lambda graph, node_name: mark_input_nodes(graph, node_name, 'is_output_reachable', True), visited) -def mark_undead_nodes(graph: nx.MultiDiGraph, undead_types: list): +def mark_undead_nodes(graph: Graph, undead_types: list): """ Mark output nodes and nodes of the specific type as undead, meaning that they should survive the dead nodes elimination phase. Then mark all children nodes of the undead nodes (except children of inputs) as undead. 
@@ -80,29 +83,30 @@ def mark_undead_nodes(graph: nx.MultiDiGraph, undead_types: list): nx.set_node_attributes(G=graph, name='is_undead', values=False) # mark output nodes as undead - outputs = get_nodes_with_attributes(graph, is_output=True) + outputs = graph.get_nodes_with_attributes(op='OpOutput') nx.set_node_attributes(G=graph, name='is_undead', values={n: True for n in outputs}) # mark specifically defined with node type set of nodes for type in undead_types: - node_of_specific_type = get_nodes_with_attributes(graph, type=type) + node_of_specific_type = graph.get_nodes_with_attributes(type=type) nx.set_node_attributes(G=graph, name='is_undead', values={n: True for n in node_of_specific_type}) - undead_nodes = get_nodes_with_attributes(graph, is_undead=True) + undead_nodes = graph.get_nodes_with_attributes(is_undead=True) # propagate 'undead' attribute to children nodes of undead nodes if the node produces constant value for node_name in bfs_search(graph, undead_nodes): if graph.node[node_name]['is_undead']: for _, dst_node_name in graph.out_edges(node_name): node_attrs = graph.node[dst_node_name] - if 'kind' in node_attrs and node_attrs['kind'] == 'data' and node_attrs['value'] is not None: + if 'kind' in node_attrs and ( + node_attrs['kind'] == 'data' and node_attrs['value'] is not None or node_attrs['kind'] == 'op'): graph.node[dst_node_name]['is_undead'] = True # mark input nodes as undead - inputs = get_nodes_with_attributes(graph, is_input=True) + inputs = graph.get_nodes_with_attributes(is_input=True) nx.set_node_attributes(G=graph, name='is_undead', values={n: True for n in inputs}) -def mark_const_producer_nodes(graph: nx.MultiDiGraph): +def mark_const_producer_nodes(graph: Graph): """ Mark nodes that produce constant values. :param graph: graph to operate on. 
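Throughout eliminate.py the old `is_output=True` data-node attribute is replaced by a dedicated `OpOutput` op node hanging off each result, and the module-level `get_nodes_with_attributes` helper becomes a `Graph` method (the free function is kept only with a deprecation TODO). The lookup change in sketch form (graph construction assumed):

```python
# A minimal sketch, assuming `graph` is a mo Graph with an OpOutput node wired
# to each result, as in the updated tests below.
def output_seed_nodes(graph):
    # Old style (removed):  get_nodes_with_attributes(graph, is_output=True)
    # New style (this patch):
    return graph.get_nodes_with_attributes(op='OpOutput')
```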
@@ -122,7 +126,7 @@ def mark_const_producer_nodes(graph: nx.MultiDiGraph): graph.node[input]['is_const_producer'] = False -def eliminate_dead_nodes(graph: nx.MultiDiGraph): +def eliminate_dead_nodes(graph: Graph): nodes_to_remove = set() for node_name, node_attrs in graph.nodes(data=True): if not node_attrs['is_output_reachable'] or (node_attrs['is_const_producer'] and not node_attrs['is_undead']): @@ -131,25 +135,69 @@ def eliminate_dead_nodes(graph: nx.MultiDiGraph): graph.remove_nodes_from(nodes_to_remove) -def graph_clean_up(graph: nx.MultiDiGraph, undead_node_types: list = []): +def add_constant_operations(graph: Graph): + data_nodes = graph.get_data_nodes(has_value=True) + for node in data_nodes: + # If data node has no producers we create Const operation + if len(node.in_nodes()) == 0 and len(node.out_nodes()) != 0: + # It's necessary to import here due to cycle dependencies + from mo.ops.const import Const + Const(graph, dict(value=node.value, shape=np.array(node.value.shape))).create_node_with_data(data_nodes=node) + + +def remove_const_ops(graph: Graph): + ops = [node for node in graph.get_op_nodes() if node.soft_get('type') == 'Const'] + for node in ops: + graph.remove_edge(node.id, node.out_node().id) + graph.remove_node(node.id) + + +def shape_inference(graph: Graph): + nodes = pseudo_topological_sort(graph) + for node in nodes: + node = Node(graph, node) + if node.has_and_set('need_shape_inference'): + old_out_shapes = [port.data.get_shape() for port in node.out_ports().values()] + node.infer(node) + new_out_shapes = [port.data.get_shape() for port in node.out_ports().values()] + for shape1, shape2 in zip(old_out_shapes, new_out_shapes): + if shape1 is not None and not np.array_equal(shape1, shape2): + raise Error("After partial shape inference were found shape collision for node {} (old shape: {}, new shape: {})".format(node.name, shape1, shape2)) + node.need_shape_inference = False + + +def graph_clean_up(graph: Graph, undead_node_types: list = None): + if undead_node_types is None: + undead_node_types = [] + + if 'Shape' in undead_node_types and not graph.graph['cmd_params'].keep_shape_ops: + undead_node_types.remove('Shape') + mark_output_reachable_nodes(graph) mark_undead_nodes(graph, undead_node_types) mark_const_producer_nodes(graph) eliminate_dead_nodes(graph) + # Add Const op for constant data nodes + add_constant_operations(graph) + shape_inference(graph) + +def graph_clean_up_tf(graph: Graph): + graph_clean_up(graph, ['TFCustomSubgraphCall', 'Shape']) -def graph_clean_up_tf(graph: nx.MultiDiGraph): - graph_clean_up(graph, ['TFCustomSubgraphCall']) +def graph_clean_up_onnx(graph: Graph): + graph_clean_up(graph, ['Shape']) -def remove_identity_action(graph: nx.MultiDiGraph, matches: dict): + +def remove_identity_action(graph: Graph, matches: dict): remove_op_node_with_data_node(graph, matches['identity']) # TODO: unit tests -def merge_data_nodes(graph: nx.MultiDiGraph, survived: Node, removed: Node): - if survived.has_and_set('is_output'): - graph.node[removed.id].update({'is_output': True}) +def merge_data_nodes(graph: Graph, survived: Node, removed: Node): + if survived.has_and_set('op') and survived.op == 'OpOutput': + graph.node[removed.id].update({'op': 'OpOutput'}) for u, v, d in list(graph.in_edges(removed.id, data=True)): graph.add_edges_from([(u, survived.id, d)]) @@ -172,7 +220,7 @@ def merge_data_nodes(graph: nx.MultiDiGraph, survived: Node, removed: Node): # TODO: unit tests -def remove_op_node_with_data_node(graph: nx.MultiDiGraph, node_to_remove: Node): 
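As the hunk above shows, `graph_clean_up` now runs a fixed sequence: reachability marking, undead marking, const-producer marking, dead-node elimination, then `add_constant_operations` and a `shape_inference` sweep that re-infers every node flagged `need_shape_inference` and raises on any silent shape change. A sketch of how a transformation is expected to cooperate with that sweep (the graph and node are assumed to exist):

```python
# A minimal sketch: after rewiring a node's inputs, flag it so the clean-up
# machinery re-runs partial inference over the graph.
from mo.middle.passes.eliminate import shape_inference

def reinfer(graph, node):
    node['need_shape_inference'] = True  # consumed in pseudo-topological order
    shape_inference(graph)               # raises Error on a shape collision
```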
+def remove_op_node_with_data_node(graph: Graph, node_to_remove: Node): assert node_to_remove.kind == 'op' input_data_node = node_to_remove.in_node() output_node = [v for _, v in graph.out_edges(node_to_remove.id)] @@ -190,7 +238,7 @@ def remove_op_node_with_data_node(graph: nx.MultiDiGraph, node_to_remove: Node): graph.remove_nodes_from([node_to_remove.id, input_data_node.id]) -def remove_op_nodes(graph: nx.MultiDiGraph, attrs: dict): +def remove_op_nodes(graph: Graph, attrs: dict): op_attrs = {'kind': 'op'} op_attrs.update(attrs) apply_pattern( @@ -201,7 +249,7 @@ def remove_op_nodes(graph: nx.MultiDiGraph, attrs: dict): ) -def remove_edges_for_nodes(graph: nx.MultiDiGraph, node_attrs: dict, edge_attrs: dict): +def remove_edges_for_nodes(graph: Graph, node_attrs: dict, edge_attrs: dict): for node in graph.nodes(): node = Node(graph, node) if all([node.has(attr) and node[attr] == node_attrs[attr] for attr in node_attrs]): @@ -212,21 +260,3 @@ def remove_edges_for_nodes(graph: nx.MultiDiGraph, node_attrs: dict, edge_attrs: graph.remove_edge(src_node.id, node.id) -def remove_useless_split_action(graph: nx.MultiDiGraph, matches: dict): - split_node = matches['split'] - input = split_node.in_node(1) - output = split_node.out_node() - graph.remove_edge(input.id, split_node.id) - - for u, v, d in list(graph.out_edges(output.id, data=True)): - graph.add_edges_from([(input.id, v, d)]) - graph.remove_edge(u, v) - - -def remove_useless_split(graph: nx.MultiDiGraph): - apply_pattern( - graph, - nodes=[('split', {'kind': 'op', 'op': 'Split', 'num_split': 1})], - edges=[], - action=remove_useless_split_action - ) diff --git a/model-optimizer/mo/middle/passes/eliminate_test.py b/model-optimizer/mo/middle/passes/eliminate_test.py index 79b892c..f253dde 100644 --- a/model-optimizer/mo/middle/passes/eliminate_test.py +++ b/model-optimizer/mo/middle/passes/eliminate_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -18,9 +18,8 @@ import unittest import numpy as np -from mo.graph.graph import Node, erase_node -from mo.middle.passes.eliminate import mark_output_reachable_nodes, graph_clean_up, \ - get_nodes_with_attributes, mark_const_producer_nodes +from mo.graph.graph import Node, Graph +from mo.middle.passes.eliminate import mark_output_reachable_nodes, graph_clean_up, mark_const_producer_nodes from mo.utils.unittest.graph import build_graph nodes_attributes = {'placeholder_1': {'type': 'Placeholder', 'kind': 'op'}, @@ -38,11 +37,14 @@ nodes_attributes = {'placeholder_1': {'type': 'Placeholder', 'kind': 'op'}, 'data_node_3': {'value': None, 'kind': 'data'}, 'data_node_3_2': {'value': None, 'kind': 'data'}, 'data_node_4': {'value': None, 'kind': 'data'}, - 'data_node_5': {'value': None, 'kind': 'data'}, - 'data_node_6': {'value': None, 'kind': 'data'}, + 'data_node_5': {'value': None, 'shape': None, 'kind': 'data'}, + 'data_node_6': {'value': None, 'shape': None, 'kind': 'data'}, 'tf_call_1': {'type': 'TFCustomSubgraphCall', 'kind': 'op'}, 'tf_call_2': {'type': 'TFCustomSubgraphCall', 'kind': 'op'}, 'tf_call_3': {'type': 'TFCustomSubgraphCall', 'kind': 'op'}, + 'op_output': {'kind': 'op', 'op': 'OpOutput'}, + 'op_output_1': {'kind': 'op', 'op': 'OpOutput'}, + 'op_output_2': {'kind': 'op', 'op': 'OpOutput'} } @@ -63,15 +65,17 @@ class TestEliminatePass(unittest.TestCase): [('placeholder_1', 'node_1'), ('node_1', 'node_2'), ('placeholder_1', 'node_3'), - ('node_3', 'node_4')], - {'node_4': {'is_output': True}}, + ('node_3', 'node_4'), + ('node_4', 'op_output') + ], + {'node_4': {}}, nodes_with_edges_only=True) mark_output_reachable_nodes(graph) - self.assertListEqual(sorted(['placeholder_1', 'node_3', 'node_4']), - sorted(get_nodes_with_attributes(graph, is_output_reachable=True))) + self.assertListEqual(sorted(['placeholder_1', 'node_3', 'op_output', 'node_4']), + sorted(graph.get_nodes_with_attributes(is_output_reachable=True))) self.assertListEqual(sorted(['node_1', 'node_2']), - sorted(get_nodes_with_attributes(graph, is_output_reachable=False))) + sorted(graph.get_nodes_with_attributes(is_output_reachable=False))) def test_mark_output_unreachable_nodes_behind_output(self): """ @@ -86,13 +90,15 @@ class TestEliminatePass(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'node_1'), ('node_1', 'node_2'), - ('node_2', 'node_3')], - {'node_2': {'is_output': True}}, + ('node_2', 'node_3'), + ('node_2', 'op_output') + ], + {'node_2': {}}, nodes_with_edges_only=True) mark_output_reachable_nodes(graph) - self.assertListEqual(sorted(['placeholder_1', 'node_1', 'node_2']), - sorted(get_nodes_with_attributes(graph, is_output_reachable=True))) + self.assertListEqual(sorted(['node_1', 'node_2', 'op_output', 'placeholder_1']), + sorted(graph.get_nodes_with_attributes(is_output_reachable=True))) self.assertFalse(graph.node['node_3']['is_output_reachable']) def test_mark_ops_producing_constant_values(self): @@ -128,16 +134,19 @@ class TestEliminatePass(unittest.TestCase): ('data_node_3_2', 'node_5'), ('node_5', 'data_node_5'), ('data_node_3', 'node_4'), - ('data_node_4', 'node_1')], - {'data_node_2': {'is_output': True}, - 'data_node_5': {'is_output': True}, + ('data_node_4', 'node_1'), + ('data_node_2', 'op_output'), + ('data_node_5', 'op_output_1') + ], + {'data_node_2': {}, + 'data_node_5': {}, 'data_node_3': {'value': np.array(1)}, 'data_node_6': {'value': np.array(1)}}, nodes_with_edges_only=True) mark_const_producer_nodes(graph) 
         self.assertTrue((graph.node['node_6']['is_const_producer']))
         self.assertListEqual(sorted(['node_1', 'node_2', 'node_3', 'node_5', 'placeholder_1']),
-                             sorted(get_nodes_with_attributes(graph, is_const_producer=False, kind='op')))
+                             sorted(graph.get_nodes_with_attributes(is_const_producer=False, kind='op')))
 
         graph_clean_up(graph)
         self.assertTrue('node_3' in graph.nodes())
@@ -166,6 +175,6 @@ class TestEliminatePass(unittest.TestCase):
                              ('node_1', 'node_2'),
                              ('node_2', 'node_3')],
                             nodes_with_edges_only=True)
-        erase_node(Node(graph, 'node_2'))
+        graph.erase_node(Node(graph, 'node_2'))
 
         self.assertListEqual(sorted(['placeholder_1', 'node_1', 'node_3']), sorted(graph.nodes()))
diff --git a/model-optimizer/mo/middle/passes/fusing/decomposition.py b/model-optimizer/mo/middle/passes/fusing/decomposition.py
index 737074f..cf6739d 100644
--- a/model-optimizer/mo/middle/passes/fusing/decomposition.py
+++ b/model-optimizer/mo/middle/passes/fusing/decomposition.py
@@ -1,5 +1,5 @@
 """
- Copyright (c) 2018 Intel Corporation
+ Copyright (c) 2018-2019 Intel Corporation
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@ import logging as log
 import networkx as nx
 import numpy as np
 
-from mo.graph.graph import Node
+from mo.graph.graph import Node, Graph
 from mo.middle.passes.eliminate import merge_data_nodes
 from mo.middle.pattern_match import apply_pattern
 from mo.ops.lin_op import Mul, Add
@@ -27,7 +27,7 @@ from mo.ops.op import Op
 from mo.ops.reshape import Reshape
 
 
-def convert_batch_norm(graph: nx.MultiDiGraph):
+def convert_batch_norm(graph: Graph):
     """
     This function finds FusedBatchNorm layer (or BatchNorm for MXNet) and replaces with Mul->Add->Mul->Add sequence.
     """
@@ -78,7 +78,7 @@ def convert_batch_norm(graph: nx.MultiDiGraph):
         _fused_batch_norm_decomposition(graph, tinput, toutput, const, beta, scale, shift, can_be_fused)
 
 
-def _fused_batch_norm_decomposition(graph: nx.MultiDiGraph, tinput: Node, toutput: Node, gamma: Node, beta: Node,
+def _fused_batch_norm_decomposition(graph: Graph, tinput: Node, toutput: Node, gamma: Node, beta: Node,
                                     mean: np.ndarray, variance: np.ndarray, can_be_fused=True):
     """
     This is common function for TF, Caffe and MXNet
@@ -113,64 +113,108 @@ def _fused_batch_norm_decomposition(graph: nx.MultiDiGraph, tinput: Node, toutpu
         data_nodes=toutput)
 
 
-def convert_scale_shift_to_mul_add(graph: nx.MultiDiGraph):
-    nodes = [Node(graph, node) for node in graph.nodes() if Node(graph, node).soft_get('op') == 'ScaleShift']
+def convert_scale_shift_to_mul_add(graph: Graph):
+    nodes = graph.get_op_nodes(op='ScaleShift')
     for node in nodes:
         if node.soft_get('can_be_fused') is False:
             continue
 
+        ports_count = len(node.in_ports())
+
+        input_port = node.in_port(0)
+        scale_port = node.in_port(1) if ports_count > 1 and not node.in_port(1).disconnected() else None
+        shift_port = node.in_port(2) if ports_count > 2 and not node.in_port(2).disconnected() else None
+        output_port = node.out_port(0)
+
         has_biases = True
         has_weights = True
+
         # We don't need zero biases
-        if len(node.in_nodes()) < 3 or all([x == 0 for x in node.in_node(2).value]):
+        if shift_port is None or (shift_port.data.get_value() is not None and all([x == 0 for x in shift_port.data.get_value()])):
            has_biases = False
 
-        input_node = node.in_node(0)
-        scale_node = node.in_node(1)
-        shift_node = node.in_node(2) if has_biases else None
-        output_node = node.out_node()
-
-        if scale_node.has_valid("value") and all([x == 1 for x in scale_node.value]):
+        # We don't need weights with ones
+        if scale_port is None or (scale_port.data.get_value() is not None and all([x == 1 for x in scale_port.data.get_value()])):
             has_weights = False
 
-        mul_node = Mul(graph, dict(name=node.name + "/Mul_"))
-        add_node = Add(graph, dict(name=node.name + "/Add_"))
-
-        # Disconnect ScaleShift node
-        graph.remove_edge(input_node.id, node.id)
-        graph.remove_edge(node.id, output_node.id)
+        mul_op = Mul(graph, dict(name=node.name + "/Mul_"))
+        add_op = Add(graph, dict(name=node.name + "/Add_"))
 
         # Expand dims for current layout
-        broadcast_dims_cnt = len(input_node.shape) - 2 if graph.graph['layout'] == 'NCHW' else 0
-        if scale_node.has_valid("value"):
-            Op.expand_node_shape(scale_node, broadcast_dims_cnt)
-        else:
-            # insert reshape to make shapes similar
-            reshape_dims = np.zeros(len(input_node.shape), dtype=np.int64)
+        broadcast_dims_cnt = len(input_port.data.get_shape()) - 2 if graph.graph['layout'] == 'NCHW' else 0
+
+        # In case if we have constant weights/biases we have to broadcast them according to graph layout
+        # otherwise we insert Reshape with broadcast dim attribute.
+        def broadcast_value(port):
+            value = np.array(port.data.get_value())
+            for idx in range(broadcast_dims_cnt):
+                value = np.expand_dims(value, axis=-1)
+            port.data.set_value(value)
+
+        def broadcast_with_reshape(port):
+            input_shape = input_port.data.get_shape()
+            reshape_dims = np.zeros(len(input_shape), dtype=np.int64)
             for i in range(0, node.axis):
                 reshape_dims[i] = 1
-            for i in range(node.axis, node.axis + len(scale_node.shape)):
-                reshape_dims[i] = scale_node.shape[i-node.axis]
-            for i in range(node.axis + len(scale_node.shape), len(input_node.shape)):
+            data_shape = port.data.get_shape()
+            for i in range(node.axis, node.axis + len(data_shape)):
+                reshape_dims[i] = data_shape[i - node.axis]
+            for i in range(node.axis + len(data_shape), len(input_shape)):
                 reshape_dims[i] = 1
-            reshape = Reshape(graph, dict(name=scale_node.name+"/Broadcast_",
-                                          dim=reshape_dims))
-            scale_node = reshape.create_node_with_data(inputs=[scale_node])
+            reshape = Reshape(graph, dict(name=port.node.name + "/Broadcast_", dim=reshape_dims)).create_node()
+            port.get_connection().set_destination(reshape.in_port(0))
+            reshape.out_port(0).connect(port)
 
-        Op.expand_node_shape(shift_node, broadcast_dims_cnt)
+        if has_weights and scale_port.data.get_value() is not None:
+            broadcast_value(scale_port)
+        elif has_weights:
+            broadcast_with_reshape(scale_port)
 
-        # Connect input->mul->out->add->out
-        if has_biases:
-            add_node.create_node_with_data(
-                inputs=[mul_node.create_node_with_data(inputs=[input_node, scale_node]), shift_node],
                data_nodes=output_node)
+        if has_biases and shift_port.data.get_value() is not None:
+            broadcast_value(shift_port)
+        elif has_biases:
+            broadcast_with_reshape(shift_port)
+
+        if has_biases and has_weights:
+            # Connect input->mul->out->add->out
+            add_node = add_op.create_node()
+            mul_node = mul_op.create_node()
+
+            # Connect Mul operation with inputs
+            input_port.get_connection().set_destination(mul_node.in_port(0))
+            scale_port.get_connection().set_destination(mul_node.in_port(1))
+
+            # Connect Add operation with inputs
+            mul_node.out_port(0).connect(add_node.in_port(0))
+            shift_port.get_connection().set_destination(add_node.in_port(1))
+
+            output_port.get_connection().set_source(add_node.out_port(0))
         elif has_weights:
-            mul_node.create_node_with_data(inputs=[input_node, scale_node], data_nodes=output_node)
+            # Connect input->mul->out
+            mul_node = mul_op.create_node()
+
+            # Connect Mul operation with inputs
+            input_port.get_connection().set_destination(mul_node.in_port(0))
+            scale_port.get_connection().set_destination(mul_node.in_port(1))
+
+            output_port.get_connection().set_source(mul_node.out_port(0))
+        elif has_biases:
+            # Connect input->add->out
+            add_node = add_op.create_node()
+
+            # Connect Add operation with inputs
+            input_port.get_connection().set_destination(add_node.in_port(0))
+            shift_port.get_connection().set_destination(add_node.in_port(1))
+
+            output_port.get_connection().set_source(add_node.out_port(0))
         else:
-            merge_data_nodes(graph, input_node, output_node)
-            graph.remove_node(output_node.id)
+            # Connect input->out
+            producer_port = input_port.get_source()
+            input_port.disconnect()
+            output_port.get_connection().set_source(producer_port)
 
 
-def _bn_to_mul_add_action(graph: nx.MultiDiGraph, match: dict):
+def _bn_to_mul_add_action(graph: Graph, match: dict):
     # Data nodes
     tinput = match['input']
     toutput = match['output']
@@ -209,7 +253,7 @@ def _bn_to_mul_add_action(graph: nx.MultiDiGraph, match: dict):
         data_nodes=toutput)
 
 
-def convert_bn_to_mul_add(graph: nx.MultiDiGraph):
+def convert_bn_to_mul_add(graph: Graph):
     apply_pattern(
         graph,
         nodes=[
diff --git a/model-optimizer/mo/middle/passes/fusing/decomposition_test.py b/model-optimizer/mo/middle/passes/fusing/decomposition_test.py
index 2179f21..0fa1ff2 100644
--- a/model-optimizer/mo/middle/passes/fusing/decomposition_test.py
+++ b/model-optimizer/mo/middle/passes/fusing/decomposition_test.py
@@ -1,5 +1,5 @@
 """
- Copyright (c) 2018 Intel Corporation
+ Copyright (c) 2018-2019 Intel Corporation
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -30,21 +30,27 @@ nodes_attributes = {
     'placeholder_2_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None},
     # ScaleShift layer
     'scaleshift_1': {'type': 'ScaleShift', 'kind': 'op', 'op': 'ScaleShift', 'axis': 0},
+    'const_scaleshift_1_w': {'value': None, 'shape': None, 'kind': 'op'},
     'scaleshift_1_w': {'value': None, 'shape': None, 'kind': 'data'},
+    'const_scaleshift_1_b': {'value': None, 'shape': None, 'kind': 'op'},
     'scaleshift_1_b': {'value': None, 'shape': None, 'kind': 'data'},
     'scaleshift_1_data': {'value': None, 'shape': None, 'kind': 'data'},
     # Mul and Add operations
     'mul_1': {'type': None, 'value': None, 'kind': 'op', 'op': 'Mul'},
+    'const_mul_1_w': {'value': None, 'shape': None, 'kind': 'op'},
     'mul_1_w': {'value': None, 'shape': None, 'kind': 'data'},
     'mul_1_data': {'value': None, 'shape': None, 'kind': 'data'},
     'add_1': {'type': None, 'kind': 'op', 'op': 'Add'},
+    'const_add_1_w': {'value': None, 'shape': None, 'kind': 'op'},
     'add_1_w': {'value': None, 'shape': None, 'kind': 'data'},
     'add_1_data': {'value': None, 'shape': None, 'kind': 'data'},
     # Mul and Add operations
     'mul_2': {'type': None, 'kind': 'op', 'op': 'Mul'},
+    'const_mul_2_w': {'value': None, 'shape': None, 'kind': 'op'},
     'mul_2_w': {'value': None, 'shape': None, 'kind': 'data'},
     'mul_2_data': {'value': None, 'shape': None, 'kind': 'data'},
     'add_2': {'type': None, 'kind': 'op', 'op': 'Add'},
+    'const_add_2_w': {'value': None, 'shape': None, 'kind': 'op'},
     'add_2_w': {'value': None, 'shape': None, 'kind': 'data'},
     'add_2_data': {'value': None, 'shape': None, 'kind': 'data'},
     # Reshape
@@ -60,6 +66,7 @@ nodes_attributes = {
     # Concat1 operation
     'concat': {'type': 'Concat', 'kind': 'op', 'op': 'Concat'},
     'concat_data': {'value': None, 'shape': None, 'kind': 'data'},
+    'op_output':
{'kind': 'op', 'op': 'OpOutput'} } @@ -69,30 +76,35 @@ class ScaleShiftToMulAdd(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'scaleshift_1'), + ('const_scaleshift_1_w', 'scaleshift_1_w'), ('scaleshift_1_w', 'scaleshift_1'), ('scaleshift_1', 'scaleshift_1_data'), + ('scaleshift_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, - 'scaleshift_1_data': {'is_output': True} + 'scaleshift_1_data': {} }) graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'scaleshift_1_data'), + ('scaleshift_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_1': {'can_be_fused': True}, - 'scaleshift_1_data': {'is_output': True} + 'scaleshift_1_data': {} }) graph.graph['layout'] = 'NHWC' convert_scale_shift_to_mul_add(graph) graph_clean_up(graph) - (flag, resp) = compare_graphs(graph, graph_ref, 'scaleshift_1_data') + (flag, resp) = compare_graphs(graph, graph_ref, 'placeholder_1') self.assertTrue(flag, resp) # ScaleShift 2 inputs-> Mul @@ -103,10 +115,11 @@ class ScaleShiftToMulAdd(unittest.TestCase): ('placeholder_1_data', 'scaleshift_1'), ('placeholder_2_data', 'scaleshift_1'), ('scaleshift_1', 'scaleshift_1_data'), + ('scaleshift_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'placeholder_2_data': {'shape': np.array([1, 227])}, - 'scaleshift_1_data': {'is_output': True} + 'scaleshift_1_data': {} }) graph_ref = build_graph(nodes_attributes, @@ -117,19 +130,20 @@ class ScaleShiftToMulAdd(unittest.TestCase): ('placeholder_1_data', 'mul_1'), ('placeholder_2/Reshape_data', 'mul_1'), ('mul_1', 'scaleshift_1_data'), + ('scaleshift_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'placeholder_2_data': {'shape': np.array([1, 227])}, 'placeholder_2/Reshape_': {'dim': np.array([1, 227, 1, 1])}, 'placeholder_2/Reshape_data': {'shape': np.array([1, 227, 1, 1])}, 'mul_1': {'can_be_fused': True}, - 'scaleshift_1_data': {'is_output': True} + 'scaleshift_1_data': {} }) graph.graph['layout'] = 'NHWC' convert_scale_shift_to_mul_add(graph) graph_clean_up(graph) - (flag, resp) = compare_graphs(graph, graph_ref, 'scaleshift_1_data') + (flag, resp) = compare_graphs(graph, graph_ref, 'placeholder_1') self.assertTrue(flag, resp) # ScaleShift 2 inputs-> Mul (axis = 1) @@ -140,11 +154,12 @@ class ScaleShiftToMulAdd(unittest.TestCase): ('placeholder_1_data', 'scaleshift_1'), ('placeholder_2_data', 'scaleshift_1'), ('scaleshift_1', 'scaleshift_1_data'), + ('scaleshift_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'placeholder_2_data': {'shape': np.array([227])}, 'scaleshift_1': {'axis': 1}, - 'scaleshift_1_data': {'is_output': True} + 'scaleshift_1_data': {} }) graph_ref = build_graph(nodes_attributes, @@ -155,53 +170,59 @@ class ScaleShiftToMulAdd(unittest.TestCase): ('placeholder_1_data', 'mul_1'), ('placeholder_2/Reshape_data', 'mul_1'), ('mul_1', 'scaleshift_1_data'), + ('scaleshift_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'placeholder_2_data': {'shape': np.array([227])}, 
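The reference entries just below show the broadcast_with_reshape() path from decomposition.py at work: a scale that comes from another operation (so its value is not constant) with shape [227], applied at axis 1 of a [1, 227, 227, 3] input, is routed through a 'placeholder_2/Reshape_' node with dim [1, 227, 1, 1], that is, ones before the axis, the scale shape at the axis, and ones after it.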
'placeholder_2/Reshape_': {'dim': np.array([1, 227, 1, 1])}, 'placeholder_2/Reshape_data': {'shape': np.array([1, 227, 1, 1])}, 'mul_1': {'can_be_fused': True}, - 'scaleshift_1_data': {'is_output': True} + 'scaleshift_1_data': {} }) graph.graph['layout'] = 'NHWC' convert_scale_shift_to_mul_add(graph) graph_clean_up(graph) - (flag, resp) = compare_graphs(graph, graph_ref, 'scaleshift_1_data') + (flag, resp) = compare_graphs(graph, graph_ref, 'placeholder_1') self.assertTrue(flag, resp) - # ScaleShift -> Mul (Zero biases) def test_scaleshift_to_mul_2(self): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'scaleshift_1'), + ('const_scaleshift_1_w', 'scaleshift_1_w'), + ('const_scaleshift_1_b', 'scaleshift_1_b'), ('scaleshift_1_w', 'scaleshift_1'), ('scaleshift_1_b', 'scaleshift_1'), ('scaleshift_1', 'scaleshift_1_data'), + ('scaleshift_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([0, 0, 0])}, - 'scaleshift_1_data': {'is_output': True} + 'scaleshift_1_data': {} }) graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'scaleshift_1_data'), + ('scaleshift_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_1': {'can_be_fused': True}, - 'scaleshift_1_data': {'is_output': True} + 'scaleshift_1_data': {} }) graph.graph['layout'] = 'NHWC' convert_scale_shift_to_mul_add(graph) graph_clean_up(graph) - (flag, resp) = compare_graphs(graph, graph_ref, 'scaleshift_1_data') + (flag, resp) = compare_graphs(graph, graph_ref, 'placeholder_1') self.assertTrue(flag, resp) # ScaleShift -> Mul->Add @@ -209,38 +230,46 @@ class ScaleShiftToMulAdd(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'scaleshift_1'), + ('const_scaleshift_1_w', 'scaleshift_1_w'), + ('const_scaleshift_1_b', 'scaleshift_1_b'), ('scaleshift_1_w', 'scaleshift_1'), ('scaleshift_1_b', 'scaleshift_1'), ('scaleshift_1', 'scaleshift_1_data'), + ('scaleshift_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([3, 2, 1])}, - 'scaleshift_1_data': {'is_output': True} + 'scaleshift_1_data': {} }) graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'add_1'), + ('const_add_1_w', 'add_1_w'), ('add_1_w', 'add_1'), ('add_1', 'scaleshift_1_data'), + ('scaleshift_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, + 'const_add_1_w': {'shape': np.array([3]), 'value': np.array([3, 2, 1])}, 'add_1_w': {'shape': np.array([3]), 'value': np.array([3, 2, 1])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, 'add_1': {'can_be_fused': True}, 'mul_1': 
{'can_be_fused': True}, - 'scaleshift_1_data': {'is_output': True} + 'scaleshift_1_data': {} }) graph.graph['layout'] = 'NHWC' convert_scale_shift_to_mul_add(graph) graph_clean_up(graph) - (flag, resp) = compare_graphs(graph, graph_ref, 'scaleshift_1_data') + (flag, resp) = compare_graphs(graph, graph_ref, 'placeholder_1') self.assertTrue(flag, resp) # ScaleShift -> None (Zero weights and biases) @@ -248,24 +277,30 @@ class ScaleShiftToMulAdd(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'scaleshift_1'), + ('const_scaleshift_1_w', 'scaleshift_1_w'), + ('const_scaleshift_1_b', 'scaleshift_1_b'), ('scaleshift_1_w', 'scaleshift_1'), ('scaleshift_1_b', 'scaleshift_1'), ('scaleshift_1', 'scaleshift_1_data'), + ('scaleshift_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.array([1, 1, 1])}, 'scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([0, 0, 0])}, - 'scaleshift_1_data': {'shape': np.array([1, 227, 227, 3]), 'is_output': True} - }) + 'scaleshift_1_data': {'shape': np.array([1, 227, 227, 3])} + }, nodes_with_edges_only=True) graph_ref = build_graph(nodes_attributes, - [('placeholder_1', 'placeholder_1_data')], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3]), 'is_output': True}}) + [('placeholder_1', 'placeholder_1_data'), + ('placeholder_1_data', 'op_output') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}} + ,nodes_with_edges_only=True) graph.graph['layout'] = 'NHWC' convert_scale_shift_to_mul_add(graph) graph_clean_up(graph) - (flag, resp) = compare_graphs(graph, graph_ref, 'placeholder_1_data') + (flag, resp) = compare_graphs(graph, graph_ref, 'placeholder_1') self.assertTrue(flag, resp) # ScaleShift -> ScaleShift (can_be_fused=False) @@ -273,29 +308,37 @@ class ScaleShiftToMulAdd(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'scaleshift_1'), + ('const_scaleshift_1_w', 'scaleshift_1_w'), + ('const_scaleshift_1_b', 'scaleshift_1_b'), ('scaleshift_1_w', 'scaleshift_1'), ('scaleshift_1_b', 'scaleshift_1'), ('scaleshift_1', 'scaleshift_1_data'), + ('scaleshift_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.array([1, 1, 1])}, 'scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([0, 0, 0])}, 'scaleshift_1': {'can_be_fused': False}, - 'scaleshift_1_data': {'shape': np.array([1, 227, 227, 3]), 'is_output': True} + 'scaleshift_1_data': {'shape': np.array([1, 227, 227, 3])} }) graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'scaleshift_1'), + ('const_scaleshift_1_w', 'scaleshift_1_w'), + ('const_scaleshift_1_b', 'scaleshift_1_b'), ('scaleshift_1_w', 'scaleshift_1'), ('scaleshift_1_b', 'scaleshift_1'), ('scaleshift_1', 'scaleshift_1_data'), + ('scaleshift_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_scaleshift_1_w': {'shape': np.array([3]), 'value': np.array([1, 1, 1])}, 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.array([1, 1, 1])}, + 'const_scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([0, 0, 0])}, 'scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([0, 0, 0])}, 'scaleshift_1': {'can_be_fused': False}, - 'scaleshift_1_data': {'shape': np.array([1, 227, 227, 3]), 
'is_output': True} + 'scaleshift_1_data': {'shape': np.array([1, 227, 227, 3])} }) convert_scale_shift_to_mul_add(graph) @@ -316,7 +359,8 @@ class BatchNormDecomposition(unittest.TestCase): ('bn_var', 'bn_op'), ('bn_op', 'bn_data'), ('concat', 'concat_data'), - ('bn_data', 'concat') + ('bn_data', 'concat'), + ('concat_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'bn_op': {'eps': 1.2}, @@ -325,39 +369,50 @@ class BatchNormDecomposition(unittest.TestCase): 'bn_mean': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'bn_var': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'bn_data': {'shape': np.array([1, 227, 227, 3])}, - 'concat_data': {'is_output': True} + 'concat_data': {} }) graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'add_1'), + ('const_add_1_w', 'add_1_w'), ('add_1_w', 'add_1'), ('add_1', 'add_1_data'), ('add_1_data', 'mul_2'), + ('const_mul_2_w', 'mul_2_w'), ('mul_2_w', 'mul_2'), ('mul_2', 'mul_2_data'), ('mul_2_data', 'add_2'), + ('const_add_2_w', 'add_2_w'), ('add_2_w', 'add_2'), ('add_2', 'add_2_data'), ('concat', 'concat_data'), - ('add_2_data', 'concat') + ('add_2_data', 'concat'), + ('concat_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array([3]), + 'value': np.array([0.67419986, 0.55901699, 0.48795004])}, 'mul_1_w': {'shape': np.array([3]), 'value': np.array([0.67419986, 0.55901699, 0.48795004])}, + 'const_mul_2_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_2_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, + 'const_add_1_w': {'shape': np.array([3]), + 'value': np.array([-0.67419986, -1.11803399, -1.46385011])}, 'add_1_w': {'shape': np.array([3]), 'value': np.array([-0.67419986, -1.11803399, -1.46385011])}, + 'const_add_2_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'add_2_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'add_2_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1': {'can_be_fused': True}, 'mul_2': {'can_be_fused': True}, 'add_1': {'can_be_fused': True}, 'add_2': {'can_be_fused': True}, - 'concat_data': {'is_output': True} + 'concat_data': {} }) graph.graph['layout'] = 'NHWC' @@ -378,7 +433,8 @@ class BatchNormDecomposition(unittest.TestCase): ('bn_var', 'bn_op'), ('bn_op', 'bn_data'), ('concat', 'concat_data'), - ('bn_data', 'concat') + ('bn_data', 'concat'), + ('concat_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'bn_op': {'eps': 1.2, 'can_be_fused': False}, @@ -387,39 +443,50 @@ class BatchNormDecomposition(unittest.TestCase): 'bn_mean': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'bn_var': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'bn_data': {'shape': np.array([1, 227, 227, 3])}, - 'concat_data': {'is_output': True} + 'concat_data': {} }) graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'add_1'), + ('const_add_1_w', 'add_1_w'), ('add_1_w', 'add_1'), ('add_1', 'add_1_data'), ('add_1_data', 'mul_2'), + ('const_mul_2_w', 'mul_2_w'), ('mul_2_w', 'mul_2'), ('mul_2', 'mul_2_data'), ('mul_2_data', 'add_2'), + ('const_add_2_w', 'add_2_w'), ('add_2_w', 'add_2'), ('add_2', 'add_2_data'), ('concat', 
'concat_data'), - ('add_2_data', 'concat') + ('add_2_data', 'concat'), + ('concat_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array([3]), + 'value': np.array([0.67419986, 0.55901699, 0.48795004])}, 'mul_1_w': {'shape': np.array([3]), 'value': np.array([0.67419986, 0.55901699, 0.48795004])}, + 'const_mul_2_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_2_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, + 'const_add_1_w': {'shape': np.array([3]), + 'value': np.array([-0.67419986, -1.11803399, -1.46385011])}, 'add_1_w': {'shape': np.array([3]), 'value': np.array([-0.67419986, -1.11803399, -1.46385011])}, + 'const_add_2_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'add_2_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'add_2_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1': {'can_be_fused': False}, 'mul_2': {'can_be_fused': False}, 'add_1': {'can_be_fused': False}, 'add_2': {'can_be_fused': False}, - 'concat_data': {'is_output': True} + 'concat_data': {} }) graph.graph['layout'] = 'NHWC' @@ -437,14 +504,15 @@ class BatchNormDecomposition(unittest.TestCase): ('bn_var', 'bn_op'), ('bn_op', 'bn_data'), ('concat', 'concat_data'), - ('bn_data', 'concat') + ('bn_data', 'concat'), + ('concat_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'bn_op': {'epsilon': 1.2, 'op': 'BatchNormalization'}, 'bn_mean': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'bn_var': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'bn_data': {'shape': np.array([1, 227, 227, 3])}, - 'concat_data': {'is_output': True} + 'concat_data': {} }) del graph['placeholder_1']['placeholder_1_data'][0]['in'] @@ -453,23 +521,30 @@ class BatchNormDecomposition(unittest.TestCase): graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'add_1'), + ('const_add_1_w', 'add_1_w'), ('add_1_w', 'add_1'), ('add_1', 'add_1_data'), ('concat', 'concat_data'), - ('add_1_data', 'concat') + ('add_1_data', 'concat'), + ('concat_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array([3]), + 'value': np.array([0.67419986, 0.55901699, 0.48795004])}, 'mul_1_w': {'shape': np.array([3]), 'value': np.array([0.67419986, 0.55901699, 0.48795004])}, + 'const_add_1_w': {'shape': np.array([3]), + 'value': np.array([-0.67419986, -1.11803399, -1.46385011])}, 'add_1_w': {'shape': np.array([3]), 'value': np.array([-0.67419986, -1.11803399, -1.46385011])}, 'add_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1': {'can_be_fused': True}, 'add_1': {'can_be_fused': True}, - 'concat_data': {'is_output': True} + 'concat_data': {} }) graph.graph['layout'] = 'NHWC' @@ -488,14 +563,15 @@ class BatchNormDecomposition(unittest.TestCase): ('bn_var', 'bn_op'), ('bn_op', 'bn_data'), ('concat', 'concat_data'), - ('bn_data', 'concat') + ('bn_data', 'concat'), + ('concat_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'bn_op': {'epsilon': 1.2, 'op': 'BatchNormalization', 'can_be_fused': False}, 'bn_mean': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'bn_var': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'bn_data': {'shape': np.array([1, 227, 227, 3])}, - 'concat_data': {'is_output': True} + 'concat_data': {} }) del 
graph['placeholder_1']['placeholder_1_data'][0]['in'] @@ -504,23 +580,30 @@ class BatchNormDecomposition(unittest.TestCase): graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'add_1'), + ('const_add_1_w', 'add_1_w'), ('add_1_w', 'add_1'), ('add_1', 'add_1_data'), ('concat', 'concat_data'), - ('add_1_data', 'concat') + ('add_1_data', 'concat'), + ('concat_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array([3]), + 'value': np.array([0.67419986, 0.55901699, 0.48795004])}, 'mul_1_w': {'shape': np.array([3]), 'value': np.array([0.67419986, 0.55901699, 0.48795004])}, + 'const_add_1_w': {'shape': np.array([3]), + 'value': np.array([-0.67419986, -1.11803399, -1.46385011])}, 'add_1_w': {'shape': np.array([3]), 'value': np.array([-0.67419986, -1.11803399, -1.46385011])}, 'add_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1': {'can_be_fused': False}, 'add_1': {'can_be_fused': False}, - 'concat_data': {'is_output': True} + 'concat_data': {} }) graph.graph['layout'] = 'NHWC' diff --git a/model-optimizer/mo/middle/passes/fusing/fuse_grouped_conv.py b/model-optimizer/mo/middle/passes/fusing/fuse_grouped_conv.py index 976dcb5..ee66dda 100644 --- a/model-optimizer/mo/middle/passes/fusing/fuse_grouped_conv.py +++ b/model-optimizer/mo/middle/passes/fusing/fuse_grouped_conv.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,14 +21,14 @@ import networkx as nx import numpy as np from mo.front.extractor import add_attrs_props -from mo.graph.graph import Node, unique_id, get_inputs +from mo.graph.graph import Node, Graph from mo.middle.passes.eliminate import graph_clean_up from mo.utils.graph import pseudo_topological_sort from mo.middle.passes.fusing.helpers import get_next_operation, get_tensor_id # TODO: unit tests -def concat_convolutions(graph: nx.MultiDiGraph, start_node: Node, last_node: Node): +def concat_convolutions(graph: Graph, start_node: Node, last_node: Node): """ This function converts group of convolutions into one """ @@ -130,10 +130,10 @@ def concat_convolutions(graph: nx.MultiDiGraph, start_node: Node, last_node: Nod # TODO: unit tests -def grouped_convolutions_fusing(graph: nx.MultiDiGraph): +def grouped_convolutions_fusing(graph: Graph): while True: is_fused = False - graph_clean_up(graph, ['TFCustomSubgraphCall']) + graph_clean_up(graph, ['TFCustomSubgraphCall', 'Shape']) nodes = pseudo_topological_sort(graph) for idx in nodes: node = Node(graph, idx) diff --git a/model-optimizer/mo/middle/passes/fusing/fuse_linear_ops.py b/model-optimizer/mo/middle/passes/fusing/fuse_linear_ops.py index ade7a3c..9700a3e 100644 --- a/model-optimizer/mo/middle/passes/fusing/fuse_linear_ops.py +++ b/model-optimizer/mo/middle/passes/fusing/fuse_linear_ops.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
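The BatchNormDecomposition reference constants above follow from the standard decomposition into a Mul->Add->Mul->Add sequence: the first pair applies scale = 1 / sqrt(variance + eps) and shift = -mean * scale, while the second pair carries the gamma/beta parameters when present. A quick numeric check against the values used in those tests (eps = 1.2, mean = variance = [1, 2, 3]):

    import numpy as np

    eps = 1.2
    mean = np.array([1.0, 2.0, 3.0])
    variance = np.array([1.0, 2.0, 3.0])
    scale = 1.0 / np.sqrt(variance + eps)  # [0.67419986, 0.55901699, 0.48795004] -> mul_1_w
    shift = -mean * scale                  # [-0.67419986, -1.11803399, -1.46385011] -> add_1_w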
@@ -17,20 +17,19 @@
 import logging as log
 from collections import deque
 
-import networkx as nx
 import numpy as np
 
+from mo.front.common.partial_infer.utils import int64_array
 from mo.front.extractor import add_attrs_props
-from mo.graph.graph import Node, unique_id
+from mo.graph.graph import Node, Graph
 from mo.middle.passes.eliminate import graph_clean_up
 from mo.utils.graph import pseudo_topological_sort
 from mo.ops.lin_op import Mul, Add
 from mo.ops.op import Op
-from mo.graph.graph import dump_graph_for_graphviz
 from mo.middle.passes.fusing.helpers import backward_bfs, forward_bfs, get_tensor_id, get_value_id
 
 
-def _fuse_mul(graph: nx.MultiDiGraph, node: Node, fuse_nodes: list, backward: bool = True):
+def _fuse_mul(graph: Graph, node: Node, fuse_nodes: list, backward: bool = True):
     """
     This function takes Mul node and array of convolution/fc nodes for further fusion
     Parameters
@@ -143,7 +142,7 @@ def _fuse_mul(graph: nx.MultiDiGraph, node: Node, fuse_nodes: list, backward: bo
     return is_fused
 
 
-def _fuse_add(graph: nx.MultiDiGraph, node: Node, fuse_nodes: list, backward: bool = True):
+def _fuse_add(graph: Graph, node: Node, fuse_nodes: list, backward: bool = True):
     """
     This function takes Add node and Convolution/FC nodes for further fusion and then deletes Add node
     In case if Convolution/FC Bias absence it will be created
@@ -188,7 +187,7 @@ def _fuse_add(graph: nx.MultiDiGraph, node: Node, fuse_nodes: list, backward: bo
 
         # Create BIAS data node if not exists
         if len(fuse_node.in_nodes()) <= 2:
-            bias_data = unique_id(graph, "bias_data")
+            bias_data = graph.unique_id("bias_data")
             data_type = fuse_node.in_node(1).data_type
             # Broadcast if scalar
             if value.size == 1:
@@ -199,7 +198,7 @@ def _fuse_add(graph: nx.MultiDiGraph, node: Node, fuse_nodes: list, backward: bo
             if not backward:
                 value = np.dot(fuse_node.in_node(1).value, value)
 
-            shape = value.shape
+            shape = int64_array(value.shape)
             graph.add_node(bias_data, **add_attrs_props(
                 dict(kind='data', precision="FP32", name=bias_data, value=value, shape=shape,
                      data_type=data_type)))
@@ -235,7 +234,7 @@ def _fuse_add(graph: nx.MultiDiGraph, node: Node, fuse_nodes: list, backward: bo
     return is_fused
 
 
-def fuse_linear_ops(graph: nx.MultiDiGraph):
+def fuse_linear_ops(graph: Graph):
     """
     This function makes fusing of linear operations (Mul,Add) to Convolution/FC.
     """
diff --git a/model-optimizer/mo/middle/passes/fusing/fuse_linear_ops_test.py b/model-optimizer/mo/middle/passes/fusing/fuse_linear_ops_test.py
index 30948e2..a73bdd4 100644
--- a/model-optimizer/mo/middle/passes/fusing/fuse_linear_ops_test.py
+++ b/model-optimizer/mo/middle/passes/fusing/fuse_linear_ops_test.py
@@ -1,5 +1,5 @@
 """
- Copyright (c) 2018 Intel Corporation
+ Copyright (c) 2018-2019 Intel Corporation
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
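The reference graphs in fuse_linear_ops_test.py below encode the algebra behind _fuse_mul and _fuse_add: fusing a preceding per-channel Mul into a Convolution/FC scales the weights along the input-channel dimension, and fusing a preceding Add into an FC folds the offset into the bias, since W(x + a) + b = Wx + (Wa + b). A small numpy sketch of both identities, with shapes borrowed from the FC tests:

    import numpy as np

    x = np.random.rand(1, 2048)
    W = np.ones((10260, 2048))
    b = np.ones(10260)
    s = np.arange(2048, dtype=np.float64)  # per-channel scale
    a = np.arange(2048, dtype=np.float64)  # per-channel offset

    # Mul folded into the weights: scale each input-channel column of W.
    assert np.allclose((x * s) @ W.T + b, x @ (W * s).T + b)

    # Add folded into the bias: b' = b + W @ a (cf. ref_biases in FuseAddTests).
    assert np.allclose((x + a) @ W.T + b, x @ W.T + (b + W @ a))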
@@ -30,20 +30,26 @@ nodes_attributes = { 'scaleshift_1': {'type': 'ScaleShift', 'kind': 'op', 'op': 'ScaleShift'}, 'scaleshift_1_w': {'value': None, 'shape': None, 'kind': 'data'}, 'scaleshift_1_b': {'value': None, 'shape': None, 'kind': 'data'}, + 'const_scaleshift_1_w': {'value': None, 'shape': None, 'kind': 'op', 'data_type': None}, + 'const_scaleshift_1_b': {'value': None, 'shape': None, 'kind': 'op', 'data_type': None}, 'scaleshift_1_data': {'value': None, 'shape': None, 'kind': 'data'}, # Mul and Add operations 'mul_1': {'type': 'Mul', 'kind': 'op', 'op': 'Mul', 'can_be_fused': True}, 'mul_1_w': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + 'const_mul_1_w': {'value': None, 'shape': None, 'kind': 'op', 'data_type': None}, 'mul_1_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, 'add_1': {'type': 'Add', 'kind': 'op', 'op': 'Add', 'can_be_fused': True}, 'add_1_w': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + 'const_add_1_w': {'value': None, 'shape': None, 'kind': 'op', 'data_type': None}, 'add_1_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, # Mul2 and Add2 operations 'mul_2': {'type': 'Mul', 'kind': 'op', 'op': 'Mul', 'can_be_fused': True}, 'mul_2_w': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + 'const_mul_2_w': {'value': None, 'shape': None, 'kind': 'op', 'data_type': None}, 'mul_2_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, 'add_2': {'type': 'Add', 'kind': 'op', 'op': 'Add', 'can_be_fused': True}, 'add_2_w': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + 'const_add_2_w': {'value': None, 'shape': None, 'kind': 'op', 'data_type': None}, 'add_2_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, # Concat1 operation 'concat_1': {'type': 'Concat', 'kind': 'op', 'op': 'Concat'}, @@ -52,21 +58,30 @@ nodes_attributes = { 'conv_1': {'type': 'Convolution', 'kind': 'op', 'op': 'Conv2D', 'layout': 'NHWC'}, 'conv_1_w': {'value': None, 'shape': None, 'kind': 'data'}, 'conv_1_b': {'value': None, 'shape': None, 'kind': 'data'}, + 'const_conv_1_w': {'value': None, 'shape': None, 'kind': 'op', 'data_type': None}, + 'const_conv_1_b': {'value': None, 'shape': None, 'kind': 'op', 'data_type': None}, 'conv_1_data': {'value': None, 'shape': None, 'kind': 'data'}, 'conv_2': {'type': 'Convolution', 'kind': 'op', 'op': 'Conv2D', 'layout': 'NHWC'}, 'conv_2_w': {'value': None, 'shape': None, 'kind': 'data'}, 'conv_2_b': {'value': None, 'shape': None, 'kind': 'data'}, + 'const_conv_2_w': {'value': None, 'shape': None, 'kind': 'op', 'data_type': None}, + 'const_conv_2_b': {'value': None, 'shape': None, 'kind': 'op', 'data_type': None}, 'conv_2_data': {'value': None, 'shape': None, 'kind': 'data'}, # FullyConnected 'fc_1': {'type': 'FullyConnected', 'kind': 'op', 'op': 'InnerProduct', 'layout': 'NHWC'}, 'fc_1_w': {'value': None, 'shape': None, 'kind': 'data'}, 'fc_1_b': {'value': None, 'shape': None, 'kind': 'data'}, + 'const_fc_1_w': {'value': None, 'shape': None, 'kind': 'op', 'data_type': None}, + 'const_fc_1_b': {'value': None, 'shape': None, 'kind': 'op', 'data_type': None}, 'fc_1_data': {'value': None, 'shape': None, 'kind': 'data'}, # Placeholders 'placeholder_2': {'shape': None, 'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, 'placeholder_2_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, 'placeholder_3': {'shape': None, 'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, 
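Note the pattern in the fixture dictionary being extended here: every weight/bias data node ('*_w', '*_b') gains an explicit constant producer op ('const_*', kind 'op'), consistent with the move away from free-standing constant data nodes visible throughout this patch.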
'placeholder_3_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + 'op_output': {'kind': 'op', 'op': 'OpOutput'}, + 'op_output_1': {'kind': 'op', 'op': 'OpOutput'}, + 'op_output_2': {'kind': 'op', 'op': 'OpOutput'} } @@ -78,37 +93,49 @@ class FuseMulTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), + ('conv_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, + 'const_conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, - 'conv_1_data': {'is_output': True} + 'conv_1_data': {} }) ref_weights = np.ones((11, 11, 3, 96)) * np.reshape(np.array([1, 2, 3]), (3, 1)) graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), + ('conv_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights}, 'conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights, 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, - 'conv_1_data': {'is_output': True} + 'conv_1_data': {} }) _fuse_mul(graph, Node(graph, 'mul_1'), [Node(graph, 'conv_1')], backward=False) @@ -123,37 +150,49 @@ class FuseMulTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), + ('conv_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array([1]), 'value': 6}, 'mul_1_w': {'shape': np.array([1]), 'value': 6}, + 'const_conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, - 'conv_1_data': {'is_output': True} + 'conv_1_data': {} }) ref_weights = np.ones((11, 11, 3, 96)) * np.reshape(np.array([6, 6, 6]), (3, 1)) graph_ref = build_graph(nodes_attributes, 
[('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), + ('conv_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights}, 'conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights, 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, - 'conv_1_data': {'is_output': True} + 'conv_1_data': {} }) _fuse_mul(graph, Node(graph, 'mul_1'), [Node(graph, 'conv_1')], backward=False) @@ -168,20 +207,27 @@ class FuseMulTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), ('conv_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), + ('mul_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.ones(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.ones(96)}, 'conv_1_data': {'shape': np.array([1, 55, 55, 96])}, - 'mul_1_data': {'shape': np.array([1, 55, 55, 96]), 'is_output': True}, + 'mul_1_data': {'shape': np.array([1, 55, 55, 96])}, + 'const_mul_1_w': {'shape': np.array([96]), 'value': np.array([x for x in range(96)])}, 'mul_1_w': {'shape': np.array([96]), 'value': np.array([x for x in range(96)])}, }) ref_weights = np.ones((11, 11, 3, 96)) * np.reshape(np.array([x for x in range(96)]), 96) @@ -190,16 +236,21 @@ class FuseMulTests(unittest.TestCase): graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), + ('conv_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights}, 'conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights, 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': ref_biases.shape, 'value': ref_biases}, 'conv_1_b': {'shape': ref_biases.shape, 'value': ref_biases}, - 'conv_1_data': {'shape': np.array([1, 55, 55, 96]), 'is_output': True} + 'conv_1_data': {'shape': np.array([1, 55, 55, 96])} }) _fuse_mul(graph, Node(graph, 'mul_1'), [Node(graph, 'conv_1')], backward=True) @@ -214,20 +265,27 @@ class FuseMulTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), ('conv_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), + ('mul_1_data', 'op_output') ], 
{'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.ones(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.ones(96)}, 'conv_1_data': {'shape': np.array([1, 55, 55, 96])}, - 'mul_1_data': {'shape': np.array([1, 55, 55, 96]), 'is_output': True}, + 'mul_1_data': {'shape': np.array([1, 55, 55, 96])}, + 'const_mul_1_w': {'shape': np.array([1]), 'value': 6}, 'mul_1_w': {'shape': np.array([1]), 'value': 6}, }) ref_weights = np.ones((11, 11, 3, 96)) * np.array([6]) @@ -236,16 +294,21 @@ class FuseMulTests(unittest.TestCase): graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), + ('conv_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights}, 'conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights, 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': ref_biases.shape, 'value': ref_biases}, 'conv_1_b': {'shape': ref_biases.shape, 'value': ref_biases}, - 'conv_1_data': {'shape': np.array([1, 55, 55, 96]), 'is_output': True} + 'conv_1_data': {'shape': np.array([1, 55, 55, 96])} }) _fuse_mul(graph, Node(graph, 'mul_1'), [Node(graph, 'conv_1')], backward=True) @@ -262,9 +325,12 @@ class FuseMulTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), @@ -276,21 +342,28 @@ class FuseMulTests(unittest.TestCase): ('placeholder_3_data', 'concat_1'), ('conv_1_data', 'concat_1'), ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') + ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array([1]), 'value': 6}, 'mul_1_w': {'shape': np.array([1]), 'value': 6}, + 'const_conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, - 'concat_1_data': {'is_output': True} + 'concat_1_data': {} }) ref_weights = np.ones((11, 11, 3, 96)) * np.reshape(np.array([6, 6, 6]), (3, 1)) graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), @@ -302,15 +375,18 @@ class FuseMulTests(unittest.TestCase): ('placeholder_3_data', 'concat_1'), ('conv_1_data', 'concat_1'), ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output'), 
], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights}, 'conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights, 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, - 'conv_1_data': {'is_output': True}, - 'placeholder_2_data': {'is_output': True}, - 'placeholder_3_data': {'is_output': True}, + 'conv_1_data': {}, + 'placeholder_2_data': {}, + 'placeholder_3_data': {}, }) _fuse_mul(graph, Node(graph, 'mul_1'), [Node(graph, 'conv_1')], backward=False) @@ -323,9 +399,12 @@ class FuseMulTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), @@ -337,21 +416,28 @@ class FuseMulTests(unittest.TestCase): ('placeholder_3_data', 'concat_1'), ('conv_1_data', 'concat_1'), ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output'), + ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array([1]), 'value': np.array([6])}, 'mul_1_w': {'shape': np.array([1]), 'value': np.array([6])}, + 'const_conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, - 'concat_1_data': {'is_output': True} + 'concat_1_data': {} }) ref_weights = np.ones((11, 11, 3, 96)) * np.reshape(np.array([6, 6, 6]), (3, 1)) graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), @@ -363,15 +449,18 @@ class FuseMulTests(unittest.TestCase): ('placeholder_3_data', 'concat_1'), ('conv_1_data', 'concat_1'), ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output'), ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights}, 'conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights, 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, - 'conv_1_data': {'is_output': True}, - 'placeholder_2_data': {'is_output': True}, - 'placeholder_3_data': {'is_output': True}, + 'conv_1_data': {}, + 'placeholder_2_data': {}, + 'placeholder_3_data': {}, }) _fuse_mul(graph, Node(graph, 'mul_1'), [Node(graph, 'conv_1')], backward=False) @@ -387,60 +476,80 @@ class FuseMulTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 
'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), ('mul_1_data', 'conv_2'), + ('const_conv_2_w', 'conv_2_w'), + ('const_conv_2_b', 'conv_2_b'), ('conv_2_w', 'conv_2'), ('conv_2_b', 'conv_2'), ('conv_2', 'conv_2_data'), ('conv_1_data', 'concat_1'), ('conv_2_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, + 'const_conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_data': {'shape': np.array([1, 55, 55, 96])}, + 'const_conv_2_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_2_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_2_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_2_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_2_data': {'shape': np.array([1, 55, 55, 96])}, - 'concat_1_data': {'is_output': True} + 'concat_1_data': {} }) ref_weights = np.ones((11, 11, 3, 96)) * np.reshape(np.array([1, 2, 3]), (3, 1)) graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), ('placeholder_1_data', 'conv_2'), + ('const_conv_2_w', 'conv_2_w'), + ('const_conv_2_b', 'conv_2_b'), ('conv_2_w', 'conv_2'), ('conv_2_b', 'conv_2'), ('conv_2', 'conv_2_data'), ('conv_1_data', 'concat_1'), ('conv_2_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights}, 'conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights, 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_data': {'shape': np.array([1, 55, 55, 96])}, + 'const_conv_2_w': {'shape': ref_weights.shape, 'value': ref_weights}, 'conv_2_w': {'shape': ref_weights.shape, 'value': ref_weights, 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_2_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_2_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_2_data': {'shape': np.array([1, 55, 55, 96])}, }) @@ -457,37 +566,50 @@ class FuseMulTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'fc_1'), + ('const_fc_1_w', 'fc_1_w'), + ('const_fc_1_b', 'fc_1_b'), ('fc_1_w', 'fc_1'), ('fc_1_b', 'fc_1'), ('fc_1', 'fc_1_data'), + ('fc_1_data', 'op_output') ], 
{'placeholder_1_data': {'shape': np.array([1, 2048])}, 'mul_1_data': {'shape': np.array([1, 2048])}, + 'const_mul_1_w': {'shape': np.array([2048]), 'value': np.array([x for x in range(2048)])}, 'mul_1_w': {'shape': np.array([2048]), 'value': np.array([x for x in range(2048)])}, + 'const_fc_1_w': {'shape': np.array([10260, 2048]), 'value': np.ones((10260, 2048))}, 'fc_1_w': {'shape': np.array([10260, 2048]), 'value': np.ones((10260, 2048)), 'output_channel_dim': 0, 'input_channel_dim': 1, 'dims_number': 2}, + 'const_fc_1_b': {'shape': np.array([10260]), 'value': np.ones(10260)}, 'fc_1_b': {'shape': np.array([10260]), 'value': np.ones(10260)}, - 'fc_1_data': {'shape': np.array([1, 10260]), 'is_output': True}, + 'fc_1_data': {'shape': np.array([1, 10260])}, }) ref_weights = np.ones((10260, 2048)) * np.array([x for x in range(2048)]) graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'fc_1'), + ('const_fc_1_w', 'fc_1_w'), + ('const_fc_1_b', 'fc_1_b'), ('fc_1_w', 'fc_1'), ('fc_1_b', 'fc_1'), ('fc_1', 'fc_1_data'), + ('fc_1_data', 'op_output') + ], {'placeholder_1_data': {'shape': np.array([1, 2048])}, + 'const_fc_1_w': {'shape': ref_weights.shape, 'value': ref_weights}, 'fc_1_w': {'shape': ref_weights.shape, 'value': ref_weights, 'output_channel_dim': 0, 'input_channel_dim': 1, 'dims_number': 2}, + 'const_fc_1_b': {'shape': np.array([10260]), 'value': np.ones(10260)}, 'fc_1_b': {'shape': np.array([10260]), 'value': np.ones(10260)}, - 'fc_1_data': {'shape': np.array([1, 10260]), 'is_output': True}, + 'fc_1_data': {'shape': np.array([1, 10260])}, }) _fuse_mul(graph, Node(graph, 'mul_1'), [Node(graph, 'fc_1')], backward=False) @@ -502,43 +624,57 @@ class FuseMulTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), + ('conv_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, - 'mul_1_w': {'shape': np.array([1]), 'value': 6}, + 'const_mul_1_w': {'shape': np.array([]), 'value': np.array(6)}, + 'mul_1_w': {'shape': np.array([]), 'value': np.array(6)}, 'conv_1': {'can_be_fused': False}, + 'const_conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, - 'conv_1_data': {'is_output': True} + 'conv_1_data': {} }) graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), + ('conv_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, - 'mul_1_w': {'shape': np.array([1]), 'value': 6}, + 'const_mul_1_w': {'shape': np.array([]), 'value': np.array(6)}, 
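In the can_be_fused=False fixture being rewritten here, the scalar multiplier is now modelled as a genuine 0-d array (shape np.array([]), value np.array(6)) rather than the earlier one-element vector (shape np.array([1]), value 6), presumably to represent a true scalar operand on the Mul that is left unfused.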
+ 'mul_1_w': {'shape': np.array([]), 'value': np.array(6)}, 'conv_1': {'can_be_fused': False}, + 'const_conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, - 'conv_1_data': {'is_output': True} + 'conv_1_data': {} }) _fuse_mul(graph, Node(graph, 'mul_1'), [Node(graph, 'conv_1')], backward=False) @@ -553,33 +689,41 @@ class FuseMulTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), ('conv_1_w', 'conv_1'), ('conv_1', 'conv_1_data'), + ('conv_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 112, 112, 6])}, 'mul_1_data': {'shape': np.array([1, 112, 112, 6])}, + 'const_mul_1_w': {'shape': np.array([6]), 'value': np.array([1, 2, 3, 4, 5, 6])}, 'mul_1_w': {'shape': np.array([6]), 'value': np.array([1, 2, 3, 4, 5, 6])}, + 'const_conv_1_w': {'shape': np.array([3, 3, 6, 1]), 'value': np.ones((3, 3, 6, 1))}, 'conv_1_w': {'shape': np.array([3, 3, 6, 1]), 'value': np.ones((3, 3, 6, 1)), 'output_channel_dim': 2, 'input_channel_dim': 2, 'dims_number': 4}, - 'conv_1_data': {'is_output': True} + 'conv_1_data': {} }) ref_weights = np.ones((3, 3, 6, 1)) * np.reshape(np.array([1, 2, 3, 4, 5, 6]), (6, 1)) graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), ('conv_1_w', 'conv_1'), ('conv_1', 'conv_1_data'), + ('conv_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 112, 112, 6])}, + 'const_conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights}, 'conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights, 'output_channel_dim': 2, 'input_channel_dim': 2, 'dims_number': 4}, - 'conv_1_data': {'is_output': True} + 'conv_1_data': {} }) _fuse_mul(graph, Node(graph, 'mul_1'), [Node(graph, 'conv_1')], backward=False) @@ -594,19 +738,24 @@ class FuseMulTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), ('conv_1_w', 'conv_1'), ('conv_1', 'conv_1_data'), ('conv_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), + ('mul_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 112, 112, 6])}, - 'mul_1_data': {'shape': np.array([1, 112, 112, 6]), 'is_output': True}, + 'mul_1_data': {'shape': np.array([1, 112, 112, 6])}, + 'const_mul_1_w': {'shape': np.array([6]), 'value': np.array([1, 2, 3, 4, 5, 6])}, 'mul_1_w': {'shape': np.array([6]), 'value': np.array([1, 2, 3, 4, 5, 6])}, + 'const_conv_1_w': {'shape': np.array([3, 3, 6, 1]), 'value': np.ones((3, 3, 6, 1))}, 'conv_1_w': {'shape': np.array([3, 3, 6, 1]), 'value': np.ones((3, 3, 6, 1)), 'output_channel_dim': 2, 'input_channel_dim': 2, 'dims_number': 4}, - 'conv_1_data': {'is_output': True} + 'conv_1_data': {} }) ref_weights = np.ones((3, 3, 6, 1)) * np.reshape(np.array([1, 2, 3, 4, 5, 6]), (6, 1)) @@ -614,10 +763,13 @@ class FuseMulTests(unittest.TestCase): graph_ref = build_graph(nodes_attributes, 
[('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), ('conv_1_w', 'conv_1'), ('conv_1', 'conv_1_data'), + ('conv_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 112, 112, 6])}, + 'const_conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights}, 'conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights, 'output_channel_dim': 2, 'input_channel_dim': 2, 'dims_number': 4}, @@ -638,21 +790,29 @@ class FuseAddTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'add_1'), + ('const_add_1_w', 'add_1_w'), ('add_1_w', 'add_1'), ('add_1', 'add_1_data'), ('add_1_data', 'fc_1'), + ('const_fc_1_w', 'fc_1_w'), + ('const_fc_1_b', 'fc_1_b'), ('fc_1_w', 'fc_1'), ('fc_1_b', 'fc_1'), ('fc_1', 'fc_1_data'), + ('fc_1_data', 'op_output') + ], {'placeholder_1_data': {'shape': np.array([1, 2048])}, 'add_1_data': {'shape': np.array([1, 2048])}, + 'const_add_1_w': {'shape': np.array([2048]), 'value': np.array([x for x in range(2048)])}, 'add_1_w': {'shape': np.array([2048]), 'value': np.array([x for x in range(2048)])}, + 'const_fc_1_w': {'shape': np.array([10260, 2048]), 'value': np.ones((10260, 2048))}, 'fc_1_w': {'shape': np.array([10260, 2048]), 'value': np.ones((10260, 2048)), 'output_channel_dim': 0, 'input_channel_dim': 1, 'dims_number': 2}, + 'const_fc_1_b': {'shape': np.array([10260]), 'value': np.ones(10260)}, 'fc_1_b': {'shape': np.array([10260]), 'value': np.ones(10260)}, - 'fc_1_data': {'shape': np.array([1, 10260]), 'is_output': True}, + 'fc_1_data': {'shape': np.array([1, 10260])}, }) ref_weights = np.ones((10260, 2048)) ref_biases = np.ones(10260) + np.dot(np.ones((10260, 2048)), np.array([x for x in range(2048)])) @@ -660,16 +820,21 @@ class FuseAddTests(unittest.TestCase): graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'fc_1'), + ('const_fc_1_w', 'fc_1_w'), + ('const_fc_1_b', 'fc_1_b'), ('fc_1_w', 'fc_1'), ('fc_1_b', 'fc_1'), ('fc_1', 'fc_1_data'), + ('fc_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 2048])}, + 'const_fc_1_w': {'shape': ref_weights.shape, 'value': ref_weights}, 'fc_1_w': {'shape': ref_weights.shape, 'value': ref_weights, 'output_channel_dim': 0, 'input_channel_dim': 1, 'dims_number': 2}, + 'const_fc_1_b': {'shape': ref_biases.shape, 'value': ref_biases}, 'fc_1_b': {'shape': ref_biases.shape, 'value': ref_biases}, - 'fc_1_data': {'shape': np.array([1, 10260]), 'is_output': True}, + 'fc_1_data': {'shape': np.array([1, 10260])}, }) _fuse_add(graph, Node(graph, 'add_1'), [Node(graph, 'fc_1')], backward=False) @@ -684,16 +849,21 @@ class FuseAddTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'fc_1'), + ('const_fc_1_w', 'fc_1_w'), ('fc_1_w', 'fc_1'), ('fc_1', 'fc_1_data'), ('fc_1_data', 'add_1'), + ('const_add_1_w', 'add_1_w'), ('add_1_w', 'add_1'), ('add_1', 'add_1_data'), + ('add_1_data', 'op_output_1') ], {'placeholder_1_data': {'shape': np.array([1, 2048])}, - 'add_1_data': {'shape': np.array([1, 10260]), 'is_output': True}, + 'add_1_data': {'shape': np.array([1, 10260])}, + 'const_add_1_w': {'shape': np.array([10260]), 'value': np.array([x for x in range(10260)])}, 'add_1_w': {'shape': np.array([10260]), 'value': np.array([x for x in range(10260)]), 'data_type': None}, + 'const_fc_1_w': {'shape': np.array([10260, 2048]), 'value': np.ones((10260, 2048))}, 
'fc_1_w': {'shape': np.array([10260, 2048]), 'value': np.ones((10260, 2048)), 'output_channel_dim': 0, 'input_channel_dim': 1, 'dims_number': 2, 'data_type': None}, @@ -706,16 +876,21 @@ class FuseAddTests(unittest.TestCase): graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'fc_1'), + ('const_fc_1_w', 'fc_1_w'), + ('const_fc_1_b', 'fc_1_b'), ('fc_1_w', 'fc_1'), ('fc_1_b', 'fc_1'), ('fc_1', 'fc_1_data'), + ('fc_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 2048])}, + 'const_fc_1_w': {'shape': ref_weights.shape, 'value': ref_weights}, 'fc_1_w': {'shape': ref_weights.shape, 'value': ref_weights, 'output_channel_dim': 0, 'input_channel_dim': 1, 'dims_number': 2}, + 'const_fc_1_b': {'shape': ref_biases.shape, 'value': ref_biases}, 'fc_1_b': {'shape': ref_biases.shape, 'value': ref_biases}, - 'fc_1_data': {'shape': np.array([1, 10260]), 'is_output': True}, + 'fc_1_data': {'shape': np.array([1, 10260])}, }) _fuse_add(graph, Node(graph, 'add_1'), [Node(graph, 'fc_1')], backward=True) @@ -730,15 +905,20 @@ class FuseAddTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'fc_1'), + ('const_fc_1_w', 'fc_1_w'), ('fc_1_w', 'fc_1'), ('fc_1', 'fc_1_data'), ('fc_1_data', 'add_1'), + ('const_add_1_w', 'add_1_w'), ('add_1_w', 'add_1'), ('add_1', 'add_1_data'), + ('add_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 2048])}, - 'add_1_data': {'shape': np.array([1, 10260]), 'is_output': True}, + 'add_1_data': {'shape': np.array([1, 10260])}, + 'const_add_1_w': {'shape': np.array([1]), 'value': 6, 'data_type': None}, 'add_1_w': {'shape': np.array([1]), 'value': 6, 'data_type': None}, + 'const_fc_1_w': {'shape': np.array([10260, 2048]), 'value': np.ones((10260, 2048))}, 'fc_1_w': {'shape': np.array([10260, 2048]), 'value': np.ones((10260, 2048)), 'output_channel_dim': 0, 'input_channel_dim': 1, 'dims_number': 2, 'data_type': None}, @@ -751,16 +931,22 @@ class FuseAddTests(unittest.TestCase): graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'fc_1'), + ('const_fc_1_w', 'fc_1_w'), + ('const_fc_1_b', 'fc_1_b'), ('fc_1_w', 'fc_1'), ('fc_1_b', 'fc_1'), ('fc_1', 'fc_1_data'), + ('fc_1_data', 'op_output') + ], {'placeholder_1_data': {'shape': np.array([1, 2048])}, + 'const_fc_1_w': {'shape': ref_weights.shape, 'value': ref_weights}, 'fc_1_w': {'shape': ref_weights.shape, 'value': ref_weights, 'output_channel_dim': 0, 'input_channel_dim': 1, 'dims_number': 2}, + 'const_fc_1_b': {'shape': ref_biases.shape, 'value': ref_biases}, 'fc_1_b': {'shape': ref_biases.shape, 'value': ref_biases}, - 'fc_1_data': {'shape': np.array([1, 10260]), 'is_output': True}, + 'fc_1_data': {'shape': np.array([1, 10260])}, }) _fuse_add(graph, Node(graph, 'add_1'), [Node(graph, 'fc_1')], backward=True) @@ -775,43 +961,58 @@ class FuseAddTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'add_1'), + ('const_add_1_w', 'add_1_w'), ('add_1_w', 'add_1'), ('add_1', 'add_1_data'), ('add_1_data', 'fc_1'), + ('const_fc_1_w', 'fc_1_w'), + ('const_fc_1_b', 'fc_1_b'), ('fc_1_w', 'fc_1'), ('fc_1_b', 'fc_1'), ('fc_1', 'fc_1_data'), + ('fc_1_data', 'op_output') + ], {'placeholder_1_data': {'shape': np.array([1, 2048])}, 'add_1_data': {'shape': np.array([1, 2048])}, + 'const_add_1_w': {'shape': np.array([2048]), 'value': np.array([x for x in 
range(2048)])}, 'add_1_w': {'shape': np.array([2048]), 'value': np.array([x for x in range(2048)])}, 'fc_1': {'can_be_fused': False}, + 'const_fc_1_w': {'shape': np.array([10260, 2048]), 'value': np.ones((10260, 2048))}, 'fc_1_w': {'shape': np.array([10260, 2048]), 'value': np.ones((10260, 2048)), 'output_channel_dim': 0, 'input_channel_dim': 1, 'dims_number': 2}, + 'const_fc_1_b': {'shape': np.array([10260]), 'value': np.ones(10260)}, 'fc_1_b': {'shape': np.array([10260]), 'value': np.ones(10260)}, - 'fc_1_data': {'shape': np.array([1, 10260]), 'is_output': True}, + 'fc_1_data': {'shape': np.array([1, 10260])}, }) graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'add_1'), + ('const_add_1_w', 'add_1_w'), ('add_1_w', 'add_1'), ('add_1', 'add_1_data'), ('add_1_data', 'fc_1'), + ('const_fc_1_w', 'fc_1_w'), + ('const_fc_1_b', 'fc_1_b'), ('fc_1_w', 'fc_1'), ('fc_1_b', 'fc_1'), ('fc_1', 'fc_1_data'), + ('fc_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 2048])}, 'add_1_data': {'shape': np.array([1, 2048])}, + 'const_add_1_w': {'shape': np.array([2048]), 'value': np.array([x for x in range(2048)])}, 'add_1_w': {'shape': np.array([2048]), 'value': np.array([x for x in range(2048)])}, 'fc_1': {'can_be_fused': False}, + 'const_fc_1_w': {'shape': np.array([10260, 2048]), 'value': np.ones((10260, 2048))}, 'fc_1_w': {'shape': np.array([10260, 2048]), 'value': np.ones((10260, 2048)), 'output_channel_dim': 0, 'input_channel_dim': 1, 'dims_number': 2}, + 'const_fc_1_b': {'shape': np.array([10260]), 'value': np.ones(10260)}, 'fc_1_b': {'shape': np.array([10260]), 'value': np.ones(10260)}, - 'fc_1_data': {'shape': np.array([1, 10260]), 'is_output': True}, + 'fc_1_data': {'shape': np.array([1, 10260])}, }) _fuse_add(graph, Node(graph, 'add_1'), [Node(graph, 'fc_1')], backward=False) @@ -830,60 +1031,80 @@ class FuseLinOpsTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), ('mul_1_data', 'conv_2'), + ('const_conv_2_w', 'conv_2_w'), + ('const_conv_2_b', 'conv_2_b'), ('conv_2_w', 'conv_2'), ('conv_2_b', 'conv_2'), ('conv_2', 'conv_2_data'), ('conv_1_data', 'concat_1'), ('conv_2_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, + 'const_conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_data': {'shape': np.array([1, 55, 55, 96])}, + 'const_conv_2_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_2_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 
'dims_number': 4}, + 'const_conv_2_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_2_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_2_data': {'shape': np.array([1, 55, 55, 96])}, - 'concat_1_data': {'is_output': True} + 'concat_1_data': {} }) ref_weights = np.ones((11, 11, 3, 96)) * np.reshape(np.array([1, 2, 3]), (3, 1)) graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), ('placeholder_1_data', 'conv_2'), + ('const_conv_2_w', 'conv_2_w'), + ('const_conv_2_b', 'conv_2_b'), ('conv_2_w', 'conv_2'), ('conv_2_b', 'conv_2'), ('conv_2', 'conv_2_data'), ('conv_1_data', 'concat_1'), ('conv_2_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights}, 'conv_1_w': {'shape': ref_weights.shape, 'value': ref_weights, 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_data': {'shape': np.array([1, 55, 55, 96])}, + 'const_conv_2_w': {'shape': ref_weights.shape, 'value': ref_weights}, 'conv_2_w': {'shape': ref_weights.shape, 'value': ref_weights, 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_2_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_2_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_2_data': {'shape': np.array([1, 55, 55, 96])}, }) @@ -900,37 +1121,49 @@ class FuseLinOpsTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'fc_1'), + ('const_fc_1_w', 'fc_1_w'), + ('const_fc_1_b', 'fc_1_b'), ('fc_1_w', 'fc_1'), ('fc_1_b', 'fc_1'), ('fc_1', 'fc_1_data'), + ('fc_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 2048])}, 'mul_1_data': {'shape': np.array([1, 2048])}, + 'const_mul_1_w': {'shape': np.array([2048]), 'value': np.array([x for x in range(2048)])}, 'mul_1_w': {'shape': np.array([2048]), 'value': np.array([x for x in range(2048)])}, + 'const_fc_1_w': {'shape': np.array([10260, 2048]), 'value': np.ones((10260, 2048))}, 'fc_1_w': {'shape': np.array([10260, 2048]), 'value': np.ones((10260, 2048)), 'output_channel_dim': 0, 'input_channel_dim': 1, 'dims_number': 2}, + 'const_fc_1_b': {'shape': np.array([10260]), 'value': np.ones(10260)}, 'fc_1_b': {'shape': np.array([10260]), 'value': np.ones(10260)}, - 'fc_1_data': {'shape': np.array([1, 10260]), 'is_output': True}, + 'fc_1_data': {'shape': np.array([1, 10260])}, }) ref_weights = np.ones((10260, 2048)) * np.array([x for x in range(2048)]) graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'fc_1'), + ('const_fc_1_w', 'fc_1_w'), + ('const_fc_1_b', 'fc_1_b'), ('fc_1_w', 'fc_1'), ('fc_1_b', 'fc_1'), ('fc_1', 'fc_1_data'), + ('fc_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 2048])}, + 'const_fc_1_w': {'shape': ref_weights.shape, 'value': ref_weights}, 'fc_1_w': {'shape': ref_weights.shape, 'value': ref_weights, 'output_channel_dim': 0, 
'input_channel_dim': 1, 'dims_number': 2}, + 'const_fc_1_b': {'shape': np.array([10260]), 'value': np.ones(10260)}, 'fc_1_b': {'shape': np.array([10260]), 'value': np.ones(10260)}, - 'fc_1_data': {'shape': np.array([1, 10260]), 'is_output': True}, + 'fc_1_data': {'shape': np.array([1, 10260])}, }) fuse_linear_ops(graph) @@ -945,15 +1178,20 @@ class FuseLinOpsTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'fc_1'), + ('const_fc_1_w', 'fc_1_w'), ('fc_1_w', 'fc_1'), ('fc_1', 'fc_1_data'), ('fc_1_data', 'add_1'), + ('const_add_1_w', 'add_1_w'), ('add_1_w', 'add_1'), ('add_1', 'add_1_data'), + ('add_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 2048])}, - 'add_1_data': {'shape': np.array([1, 10260]), 'is_output': True}, + 'add_1_data': {'shape': np.array([1, 10260])}, + 'const_add_1_w': {'shape': np.array([1]), 'value': np.array([6]), 'data_type': None}, 'add_1_w': {'shape': np.array([1]), 'value': np.array([6]), 'data_type': None}, + 'const_fc_1_w': {'shape': np.array([10260, 2048]), 'value': np.ones((10260, 2048))}, 'fc_1_w': {'shape': np.array([10260, 2048]), 'value': np.ones((10260, 2048)), 'output_channel_dim': 0, 'input_channel_dim': 1, 'dims_number': 2, 'data_type': None}, @@ -966,16 +1204,21 @@ class FuseLinOpsTests(unittest.TestCase): graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'fc_1'), + ('const_fc_1_w', 'fc_1_w'), + ('const_fc_1_b', 'fc_1_b'), ('fc_1_w', 'fc_1'), ('fc_1_b', 'fc_1'), ('fc_1', 'fc_1_data'), + ('fc_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 2048])}, + 'const_fc_1_w': {'shape': ref_weights.shape, 'value': ref_weights}, 'fc_1_w': {'shape': ref_weights.shape, 'value': ref_weights, 'output_channel_dim': 0, 'input_channel_dim': 1, 'dims_number': 2}, + 'const_fc_1_b': {'shape': ref_biases.shape, 'value': ref_biases}, 'fc_1_b': {'shape': ref_biases.shape, 'value': ref_biases}, - 'fc_1_data': {'shape': np.array([1, 10260]), 'is_output': True}, + 'fc_1_data': {'shape': np.array([1, 10260])}, }) fuse_linear_ops(graph) @@ -991,51 +1234,68 @@ class FuseLinOpsTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1_data', 'conv_1'), ('conv_1', 'conv_1_data'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1_data', 'add_1'), + ('const_add_1_w', 'add_1_w'), ('add_1_w', 'add_1'), ('add_1', 'add_1_data'), ('concat_1', 'concat_1_data'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('add_1_data', 'concat_1'), ('mul_1_data', 'concat_1'), - ('add_1_data', 'mul_1')], - + ('add_1_data', 'mul_1'), + ('concat_1_data', 'op_output') + ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_conv_1_w': {'shape': np.array([1, 1, 3, 3]), 'value': np.zeros((1, 1, 3, 3))}, 'conv_1_w': {'shape': np.array([1, 1, 3, 3]), 'value': np.zeros((1, 1, 3, 3)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([3]), 'value': np.zeros(3)}, 'conv_1_b': {'shape': np.array([3]), 'value': np.zeros(3)}, 'conv_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, 'add_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array([1]), 'value': np.array([6])}, 'mul_1_w': {'shape': np.array([1]), 'value': np.array([6])}, + 'const_add_1_w': {'shape': 
np.array([1]), 'value': np.array([1])}, 'add_1_w': {'shape': np.array([1]), 'value': np.array([1])}, - 'concat_1_data': {'is_output': True} + 'concat_1_data': {} }) graph_ref = build_graph(nodes_attributes, [('placeholder_1_data', 'conv_1'), ('conv_1', 'conv_1_data'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1_data', 'concat_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('conv_1_data', 'mul_1'), ('concat_1', 'concat_1_data'), ('mul_1', 'mul_1_data'), - ('mul_1_data', 'concat_1')], + ('mul_1_data', 'concat_1'), + ('concat_1_data', 'op_output') + ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_conv_1_w': {'shape': np.array([1, 1, 3, 3]), 'value': np.zeros((1, 1, 3, 3))}, 'conv_1_w': {'shape': np.array([1, 1, 3, 3]), 'value': np.zeros((1, 1, 3, 3)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([3]), 'value': np.ones(3)}, 'conv_1_b': {'shape': np.array([3]), 'value': np.ones(3)}, 'conv_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array([1]), 'value': np.array([6])}, 'mul_1_w': {'shape': np.array([1]), 'value': np.array([6])}, - 'concat_1_data': {'is_output': True} + 'concat_1_data': {} }) fuse_linear_ops(graph) @@ -1051,69 +1311,92 @@ class FuseLinOpsTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), ('mul_1_data', 'conv_2'), + ('const_conv_2_w', 'conv_2_w'), + ('const_conv_2_b', 'conv_2_b'), ('conv_2_w', 'conv_2'), ('conv_2_b', 'conv_2'), ('conv_2', 'conv_2_data'), ('conv_1_data', 'concat_1'), ('conv_2_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') + ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, + 'const_conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_data': {'shape': np.array([1, 55, 55, 96])}, 'conv_2': {'can_be_fused': False}, + 'const_conv_2_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_2_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_2_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_2_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_2_data': {'shape': np.array([1, 55, 55, 96])}, - 'concat_1_data': {'is_output': True} + 'concat_1_data': {} }) graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 
'mul_1_data'), ('mul_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), ('mul_1_data', 'conv_2'), + ('const_conv_2_w', 'conv_2_w'), + ('const_conv_2_b', 'conv_2_b'), ('conv_2_w', 'conv_2'), ('conv_2_b', 'conv_2'), ('conv_2', 'conv_2_data'), ('conv_1_data', 'concat_1'), ('conv_2_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, + 'const_conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_data': {'shape': np.array([1, 55, 55, 96])}, 'conv_2': {'can_be_fused': False}, + 'const_conv_2_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_2_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_2_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_2_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_2_data': {'shape': np.array([1, 55, 55, 96])}, - 'concat_1_data': {'is_output': True} + 'concat_1_data': {} }) fuse_linear_ops(graph) @@ -1129,69 +1412,91 @@ class FuseLinOpsTests(unittest.TestCase): graph = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), ('mul_1_data', 'conv_2'), + ('const_conv_2_w', 'conv_2_w'), + ('const_conv_2_b', 'conv_2_b'), ('conv_2_w', 'conv_2'), ('conv_2_b', 'conv_2'), ('conv_2', 'conv_2_data'), ('conv_1_data', 'concat_1'), ('conv_2_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1': {'can_be_fused': False}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, + 'const_conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_data': {'shape': np.array([1, 55, 55, 96])}, + 'const_conv_2_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_2_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_2_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_2_b': 
{'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_2_data': {'shape': np.array([1, 55, 55, 96])}, - 'concat_1_data': {'is_output': True} + 'concat_1_data': {} }) graph_ref = build_graph(nodes_attributes, [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'mul_1'), + ('const_mul_1_w', 'mul_1_w'), ('mul_1_w', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'conv_1'), + ('const_conv_1_w', 'conv_1_w'), + ('const_conv_1_b', 'conv_1_b'), ('conv_1_w', 'conv_1'), ('conv_1_b', 'conv_1'), ('conv_1', 'conv_1_data'), ('mul_1_data', 'conv_2'), + ('const_conv_2_w', 'conv_2_w'), + ('const_conv_2_b', 'conv_2_b'), ('conv_2_w', 'conv_2'), ('conv_2_b', 'conv_2'), ('conv_2', 'conv_2_data'), ('conv_1_data', 'concat_1'), ('conv_2_data', 'concat_1'), - ('concat_1', 'concat_1_data') + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1': {'can_be_fused': False}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, + 'const_mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, + 'const_conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_1_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_1_data': {'shape': np.array([1, 55, 55, 96])}, + 'const_conv_2_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96))}, 'conv_2_w': {'shape': np.array([11, 11, 3, 96]), 'value': np.ones((11, 11, 3, 96)), 'output_channel_dim': 3, 'input_channel_dim': 2, 'dims_number': 4}, + 'const_conv_2_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_2_b': {'shape': np.array([96]), 'value': np.zeros(96)}, 'conv_2_data': {'shape': np.array([1, 55, 55, 96])}, - 'concat_1_data': {'is_output': True} + 'concat_1_data': {} }) fuse_linear_ops(graph) diff --git a/model-optimizer/mo/middle/passes/fusing/fuse_linear_seq.py b/model-optimizer/mo/middle/passes/fusing/fuse_linear_seq.py index e608daf..1c96f6b 100644 --- a/model-optimizer/mo/middle/passes/fusing/fuse_linear_seq.py +++ b/model-optimizer/mo/middle/passes/fusing/fuse_linear_seq.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,17 +21,16 @@ import networkx as nx import numpy as np from mo.front.extractor import add_attrs_props -from mo.graph.graph import Node, unique_id from mo.middle.passes.eliminate import graph_clean_up from mo.utils.graph import pseudo_topological_sort from mo.ops.lin_op import Mul, Add from mo.middle.passes.eliminate import merge_data_nodes from mo.ops.op import Op -from mo.graph.graph import dump_graph_for_graphviz +from mo.graph.graph import Node, Graph from mo.middle.passes.fusing.helpers import backward_bfs, forward_bfs, get_tensor_id, get_value_id -def _fuse_linear_sequence(graph: nx.MultiDiGraph, start_node: Node): +def _fuse_linear_sequence(graph: Graph, start_node: Node): """ This function finds the sequence of Mul/Add operations and replaces this sequence with two ops (Mul->Add). 
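The docstring above captures the whole pass in one sentence: a chain of elementwise Mul and Add operations is equivalent to a single Mul followed by a single Add. A minimal standalone sketch of that algebra, using plain numpy constants instead of mo graph nodes (collapse_linear_sequence is a hypothetical helper written only to illustrate what the tests in this file check):

import numpy as np

def collapse_linear_sequence(ops):
    # ops: list of ('Mul' | 'Add', constant) pairs applied left to right.
    # Returns (scale, shift) such that the whole chain computes y = x * scale + shift.
    scale, shift = np.float64(1), np.float64(0)
    for kind, value in ops:
        if kind == 'Mul':
            scale = scale * value
            shift = shift * value   # a later Mul rescales the accumulated shift too
        else:  # 'Add'
            shift = shift + value
    return scale, shift

# Mirrors the Mul(6) -> Add(6) -> Mul(6) sequence exercised by the tests below:
# ((x * 6) + 6) * 6 == 36 * x + 36, so both fused constants become 36.
scale, shift = collapse_linear_sequence([('Mul', 6), ('Add', 6), ('Mul', 6)])
assert scale == 36 and shift == 36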
:param graph: @@ -125,7 +124,7 @@ def _fuse_linear_sequence(graph: nx.MultiDiGraph, start_node: Node): return True -def fuse_mul_add_sequence(graph: nx.MultiDiGraph): +def fuse_mul_add_sequence(graph: Graph): """ This function finds first valid Mul/Add node and pass it to fuse_linear_sequence where full sequence will be found """ diff --git a/model-optimizer/mo/middle/passes/fusing/fuse_linear_seq_test.py b/model-optimizer/mo/middle/passes/fusing/fuse_linear_seq_test.py index d320b57..c58ade4 100644 --- a/model-optimizer/mo/middle/passes/fusing/fuse_linear_seq_test.py +++ b/model-optimizer/mo/middle/passes/fusing/fuse_linear_seq_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -79,6 +79,7 @@ nodes_attributes = { 'placeholder_2_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, 'placeholder_3': {'shape': None, 'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, 'placeholder_3_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} } @@ -102,6 +103,7 @@ class LinSeqFusingTests(unittest.TestCase): ('mul_2_data', 'concat_1'), ('concat_1', 'concat_1_data'), ('placeholder_1_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -110,7 +112,6 @@ class LinSeqFusingTests(unittest.TestCase): 'mul_1_w': {'shape': np.array([1]), 'value': 6}, 'add_1_w': {'shape': np.array([1]), 'value': 6}, 'mul_2_w': {'shape': np.array([1]), 'value': 6}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -125,6 +126,7 @@ class LinSeqFusingTests(unittest.TestCase): ('add_1_data', 'concat_1'), ('concat_1', 'concat_1_data'), ('placeholder_1_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -133,7 +135,6 @@ class LinSeqFusingTests(unittest.TestCase): 'add_1_w': {'shape': np.array([1]), 'value': np.array([36])}, 'mul_1': {'can_be_fused': True}, 'add_1': {'can_be_fused': True}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -167,7 +168,8 @@ class LinSeqFusingTests(unittest.TestCase): ('placeholder_1_data', 'concat_1'), ('mul_2_data', 'placeholder_2'), ('placeholder_2', 'placeholder_2_data'), - ('placeholder_2_data', 'concat_1') + ('placeholder_2_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'placeholder_2_data': {'shape': np.array([1, 227, 227, 3])}, @@ -177,7 +179,6 @@ class LinSeqFusingTests(unittest.TestCase): 'mul_1_w': {'shape': np.array([1]), 'value': 6}, 'add_1_w': {'shape': np.array([1]), 'value': 6}, 'mul_2_w': {'shape': np.array([1]), 'value': 6}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -194,7 +195,8 @@ class LinSeqFusingTests(unittest.TestCase): ('placeholder_1_data', 'concat_1'), ('add_1_data', 'placeholder_2'), ('placeholder_2', 'placeholder_2_data'), - ('placeholder_2_data', 'concat_1') + ('placeholder_2_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'placeholder_2_data': {'shape': np.array([1, 227, 227, 3])}, @@ -204,7 +206,6 @@ class 
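Nearly every hunk in these test files performs the same mechanical refactor, visible again just above: the per-data-node flag 'is_output': True is dropped, an 'op_output' entry of op type 'OpOutput' is added once to nodes_attributes, and each former output data node gains an edge into it. Reduced to the smallest build_graph inputs, the change looks like this (a sketch of the test-spec shape only, not of build_graph itself):

# Old style: the result is flagged with a data-node attribute.
edges_old = [('concat_1', 'concat_1_data')]
attrs_old = {'concat_1_data': {'is_output': True}}

# New style: the result is an explicit consumer op, declared once per test module.
nodes_attributes = {'op_output': {'kind': 'op', 'op': 'OpOutput'}}
edges_new = [('concat_1', 'concat_1_data'),
             ('concat_1_data', 'op_output')]
attrs_new = {'concat_1_data': {}}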
LinSeqFusingTests(unittest.TestCase): 'add_1_w': {'shape': np.array([1]), 'value': np.array([36])}, 'mul_1': {'can_be_fused': True}, 'add_1': {'can_be_fused': True}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) graph.graph['layout'] = 'NHWC' @@ -234,7 +235,8 @@ class LinSeqFusingTests(unittest.TestCase): ('concat_1', 'concat_1_data'), ('add_1_data', 'placeholder_2'), ('placeholder_2', 'placeholder_2_data'), - ('placeholder_2_data', 'concat_1') + ('placeholder_2_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'placeholder_2_data': {'shape': np.array([1, 227, 227, 3])}, @@ -244,7 +246,6 @@ class LinSeqFusingTests(unittest.TestCase): 'mul_1_w': {'shape': np.array([1]), 'value': 6}, 'add_1_w': {'shape': np.array([1]), 'value': 6}, 'mul_2_w': {'shape': np.array([1]), 'value': 6}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -263,7 +264,8 @@ class LinSeqFusingTests(unittest.TestCase): ('concat_1', 'concat_1_data'), ('add_1_data', 'placeholder_2'), ('placeholder_2', 'placeholder_2_data'), - ('placeholder_2_data', 'concat_1') + ('placeholder_2_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'placeholder_2_data': {'shape': np.array([1, 227, 227, 3])}, @@ -273,7 +275,6 @@ class LinSeqFusingTests(unittest.TestCase): 'mul_1_w': {'shape': np.array([1]), 'value': 6}, 'add_1_w': {'shape': np.array([1]), 'value': 6}, 'mul_2_w': {'shape': np.array([1]), 'value': 6}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -303,7 +304,8 @@ class LinSeqFusingTests(unittest.TestCase): ('concat_1', 'concat_1_data'), ('mul_1_data', 'placeholder_2'), ('placeholder_2', 'placeholder_2_data'), - ('placeholder_2_data', 'concat_1') + ('placeholder_2_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'placeholder_2_data': {'shape': np.array([1, 227, 227, 3])}, @@ -313,7 +315,6 @@ class LinSeqFusingTests(unittest.TestCase): 'mul_1_w': {'shape': np.array([1]), 'value': 6}, 'add_1_w': {'shape': np.array([1]), 'value': 6}, 'mul_2_w': {'shape': np.array([1]), 'value': 6}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -332,7 +333,8 @@ class LinSeqFusingTests(unittest.TestCase): ('concat_1', 'concat_1_data'), ('mul_1_data', 'placeholder_2'), ('placeholder_2', 'placeholder_2_data'), - ('placeholder_2_data', 'concat_1') + ('placeholder_2_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'placeholder_2_data': {'shape': np.array([1, 227, 227, 3])}, @@ -342,7 +344,6 @@ class LinSeqFusingTests(unittest.TestCase): 'mul_1_w': {'shape': np.array([1]), 'value': 6}, 'add_1_w': {'shape': np.array([1]), 'value': np.array([36])}, 'mul_2_w': {'shape': np.array([1]), 'value': np.array([6])}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -373,7 +374,8 @@ class LinSeqFusingTests(unittest.TestCase): ('concat_1', 'concat_1_data'), ('mul_1_data', 'placeholder_2'), ('placeholder_2', 'placeholder_2_data'), - ('placeholder_2_data', 'concat_1') + ('placeholder_2_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'placeholder_2_data': {'shape': np.array([1, 227, 227, 3])}, @@ -383,7 +385,6 @@ class LinSeqFusingTests(unittest.TestCase): 'mul_1_w': {'shape': np.array([1]), 'value': 6}, 'add_1_w': 
{'shape': np.array([1]), 'value': 0}, 'mul_2_w': {'shape': np.array([1]), 'value': 1}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -396,13 +397,13 @@ class LinSeqFusingTests(unittest.TestCase): ('concat_1', 'concat_1_data'), ('mul_1_data', 'placeholder_2'), ('placeholder_2', 'placeholder_2_data'), - ('placeholder_2_data', 'concat_1') + ('placeholder_2_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'placeholder_2_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_w': {'shape': np.array([1]), 'value': 6}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -434,7 +435,8 @@ class LinSeqFusingTests(unittest.TestCase): ('concat_1', 'concat_1_data'), ('mul_1_data', 'placeholder_2'), ('placeholder_2', 'placeholder_2_data'), - ('placeholder_2_data', 'concat_1') + ('placeholder_2_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'placeholder_2_data': {'shape': np.array([1, 227, 227, 3])}, @@ -444,7 +446,6 @@ class LinSeqFusingTests(unittest.TestCase): 'mul_1_w': {'shape': np.array([1]), 'value': 6}, 'add_1_w': {'shape': np.array([1]), 'value': 6}, 'mul_2_w': {'shape': np.array([1]), 'value': 1}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -460,14 +461,14 @@ class LinSeqFusingTests(unittest.TestCase): ('concat_1', 'concat_1_data'), ('mul_1_data', 'placeholder_2'), ('placeholder_2', 'placeholder_2_data'), - ('placeholder_2_data', 'concat_1') + ('placeholder_2_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'placeholder_2_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_w': {'shape': np.array([1]), 'value': 6}, 'add_1_w': {'shape': np.array([1]), 'value': np.array([6])}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -498,7 +499,8 @@ class LinSeqFusingTests(unittest.TestCase): ('concat_1', 'concat_1_data'), ('mul_1_data', 'placeholder_2'), ('placeholder_2', 'placeholder_2_data'), - ('placeholder_2_data', 'concat_1') + ('placeholder_2_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'placeholder_2_data': {'shape': np.array([1, 227, 227, 3])}, @@ -508,7 +510,6 @@ class LinSeqFusingTests(unittest.TestCase): 'mul_1_w': {'shape': np.array([1]), 'value': 6}, 'add_1_w': {'shape': np.array([1]), 'value': 0}, 'mul_2_w': {'shape': np.array([1]), 'value': 6}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -524,14 +525,14 @@ class LinSeqFusingTests(unittest.TestCase): ('concat_1', 'concat_1_data'), ('mul_1_data', 'placeholder_2'), ('placeholder_2', 'placeholder_2_data'), - ('placeholder_2_data', 'concat_1') + ('placeholder_2_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'placeholder_2_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_w': {'shape': np.array([1]), 'value': 6}, 'mul_2_w': {'shape': np.array([1]), 'value': np.array([6])}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -558,6 +559,7 @@ class LinSeqFusingTests(unittest.TestCase): ('mul_2', 'mul_2_data'), ('mul_2_data', 'concat_1'), ('concat_1', 'concat_1_data'), + ('concat_1_data', 
'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -566,7 +568,6 @@ class LinSeqFusingTests(unittest.TestCase): 'mul_1_w': {'shape': np.array([1]), 'value': 1}, 'add_1_w': {'shape': np.array([1]), 'value': 0}, 'mul_2_w': {'shape': np.array([1]), 'value': 1}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -574,10 +575,9 @@ class LinSeqFusingTests(unittest.TestCase): [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'concat_1'), ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, - 'concat_1_data': {'is_output': True} - }, + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}}, nodes_with_edges_only=True) graph.graph['layout'] = 'NHWC' @@ -603,6 +603,7 @@ class LinSeqFusingTests(unittest.TestCase): ('mul_2', 'mul_2_data'), ('mul_2_data', 'concat_1'), ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -611,7 +612,6 @@ class LinSeqFusingTests(unittest.TestCase): 'mul_1_w': {'shape': np.array([1]), 'value': 6}, 'add_1_w': {'shape': np.array([1]), 'value': 6}, 'mul_2_w': {'shape': np.array([1]), 'value': 6}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -625,13 +625,13 @@ class LinSeqFusingTests(unittest.TestCase): ('add_1', 'add_1_data'), ('add_1_data', 'concat_1'), ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, 'add_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_w': {'shape': np.array([1]), 'value': np.array([36])}, 'add_1_w': {'shape': np.array([1]), 'value': np.array([36])}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -658,6 +658,7 @@ class LinSeqFusingTests(unittest.TestCase): ('mul_2', 'mul_2_data'), ('mul_2_data', 'concat_1'), ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -666,7 +667,6 @@ class LinSeqFusingTests(unittest.TestCase): 'mul_1_w': {'shape': np.array([1]), 'value': 6}, 'add_1_w': {'shape': np.array([3]), 'value': np.array([6, 6, 6])}, 'mul_2_w': {'shape': np.array([1]), 'value': 6}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -680,13 +680,13 @@ class LinSeqFusingTests(unittest.TestCase): ('add_1', 'add_1_data'), ('add_1_data', 'concat_1'), ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, 'add_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_w': {'shape': np.array([3]), 'value': np.array([36, 36, 36])}, 'add_1_w': {'shape': np.array([3]), 'value': np.array([36, 36, 36])}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -716,6 +716,7 @@ class LinSeqFusingTests(unittest.TestCase): ('mul_2_data', 'concat_1'), ('concat_1', 'concat_1_data'), ('placeholder_1_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -726,7 +727,6 @@ class LinSeqFusingTests(unittest.TestCase): 'mul_2_w': {'shape': np.array([1]), 
'value': 6}, 'mul_1': {'can_be_fused': False}, 'add_1': {'can_be_fused': False}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -744,6 +744,7 @@ class LinSeqFusingTests(unittest.TestCase): ('mul_2_data', 'concat_1'), ('concat_1', 'concat_1_data'), ('placeholder_1_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -754,7 +755,6 @@ class LinSeqFusingTests(unittest.TestCase): 'mul_2_w': {'shape': np.array([1]), 'value': 6}, 'mul_1': {'can_be_fused': False}, 'add_1': {'can_be_fused': False}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -784,6 +784,7 @@ class LinSeqFusingTests(unittest.TestCase): ('mul_2_data', 'concat_1'), ('concat_1', 'concat_1_data'), ('placeholder_1_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -793,7 +794,6 @@ class LinSeqFusingTests(unittest.TestCase): 'add_1_w': {'shape': np.array([1]), 'value': 6}, 'mul_2_w': {'shape': np.array([1]), 'value': 6}, 'add_1': {'can_be_fused': False}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -811,6 +811,7 @@ class LinSeqFusingTests(unittest.TestCase): ('mul_2_data', 'concat_1'), ('concat_1', 'concat_1_data'), ('placeholder_1_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -820,7 +821,6 @@ class LinSeqFusingTests(unittest.TestCase): 'add_1_w': {'shape': np.array([1]), 'value': 6}, 'mul_2_w': {'shape': np.array([1]), 'value': 6}, 'add_1': {'can_be_fused': False}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -856,6 +856,7 @@ class LinSeqFusingTests(unittest.TestCase): ('mul_2_w', 'mul_4'), ('mul_4', 'mul_4_data'), ('mul_4_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -866,7 +867,6 @@ class LinSeqFusingTests(unittest.TestCase): 'mul_1_w': {'shape': np.array([1]), 'value': 6}, 'add_1_w': {'shape': np.array([1]), 'value': 6}, 'mul_2_w': {'shape': np.array([1]), 'value': 6}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) @@ -884,6 +884,7 @@ class LinSeqFusingTests(unittest.TestCase): ('mul_3', 'mul_3_data'), ('mul_3_w', 'mul_3'), ('mul_3_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -894,7 +895,6 @@ class LinSeqFusingTests(unittest.TestCase): 'add_1_w': {'shape': np.array([1]), 'value': np.array([36])}, 'mul_1': {'can_be_fused': True}, 'add_1': {'can_be_fused': True}, - 'concat_1_data': {'is_output': True} }, nodes_with_edges_only=True) diff --git a/model-optimizer/mo/middle/passes/fusing/helpers.py b/model-optimizer/mo/middle/passes/fusing/helpers.py index c743c70..f07331b 100644 --- a/model-optimizer/mo/middle/passes/fusing/helpers.py +++ b/model-optimizer/mo/middle/passes/fusing/helpers.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
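The helpers_test.py hunks that follow exercise forward_bfs and backward_bfs, which walk from a node through a whitelist of op types until a target type is reached. A rough sketch of the forward direction under that reading, with plain dicts standing in for the mo graph; the stop-at-target and pass-through-data-nodes behavior is inferred from the assertions below, not copied from helpers.py:

from collections import deque

def forward_bfs_sketch(succ, ops, start, allowed, targets):
    # succ: node id -> list of successor node ids
    # ops:  node id -> op type, or None for data nodes (which pass through freely)
    visited, found = {start}, []
    queue = deque(succ.get(start, []))
    while queue:
        node = queue.popleft()
        if node in visited:
            continue
        visited.add(node)
        op = ops.get(node)
        if op in targets:
            found.append(node)            # stop exploring past a target op
        elif op is None or op in allowed:
            queue.extend(succ.get(node, []))
    return found

# Shape of the first BFS test below: Placeholder -> ScaleShift -> Mul -> Add.
succ = {'p': ['ss'], 'ss': ['m'], 'm': ['a'], 'a': []}
ops = {'p': 'Placeholder', 'ss': 'ScaleShift', 'm': 'Mul', 'a': 'Add'}
assert forward_bfs_sketch(succ, ops, 'p', ['ScaleShift', 'Mul'], ['Add']) == ['a']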
diff --git a/model-optimizer/mo/middle/passes/fusing/helpers_test.py b/model-optimizer/mo/middle/passes/fusing/helpers_test.py index feb2020..365ba10 100644 --- a/model-optimizer/mo/middle/passes/fusing/helpers_test.py +++ b/model-optimizer/mo/middle/passes/fusing/helpers_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -64,6 +64,7 @@ nodes_attributes = { 'placeholder_2_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, 'placeholder_3': {'shape': None, 'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, 'placeholder_3_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} } @@ -79,9 +80,9 @@ class BFSTests(unittest.TestCase): ('scaleshift_1_data', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'add_1'), - ('add_1', 'add_1_data') - ], - {'add_1_data': {'is_output': True}}) + ('add_1', 'add_1_data'), + ('add_1_data', 'op_output') + ]) res = forward_bfs(Node(graph, 'placeholder_1'), ['ScaleShift', 'Mul'], ['Add']) self.assertTrue(len(res) == 1 and res[0].id == 'add_1', 'Add operation was not found by bfs') @@ -105,9 +106,9 @@ class BFSTests(unittest.TestCase): ('scaleshift_1_data', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'add_1'), - ('add_1', 'add_1_data') - ], - {'add_1_data': {'is_output': True}}) + ('add_1', 'add_1_data'), + ('add_1_data', 'op_output') + ]) res = backward_bfs(Node(graph, 'add_1_data'), ['Add', 'ScaleShift', 'Mul'], ['Placeholder']) self.assertTrue(len(res) == 1 and res[0].id == 'placeholder_1', 'Placeholder operation was not found by bfs') @@ -139,9 +140,9 @@ class BFSTests(unittest.TestCase): ('mul_2', 'mul_2_data'), ('add_1_data', 'concat_1'), ('mul_2_data', 'concat_1'), - ('concat_1', 'concat_1_data') - ], - {'concat_1_data': {'is_output': True}}) + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') + ]) res = forward_bfs(Node(graph, 'placeholder_1'), ['ScaleShift', 'Mul', 'Add'], ['Concat']) self.assertTrue(len(res) == 1 and res[0].id == 'concat_1', 'Probably Concat operation was not found by bfs') @@ -178,9 +179,9 @@ class BFSTests(unittest.TestCase): ('mul_2', 'mul_2_data'), ('add_1_data', 'concat_1'), ('mul_2_data', 'concat_1'), - ('concat_1', 'concat_1_data') - ], - {'concat_1_data': {'is_output': True}}) + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') + ]) res = backward_bfs(Node(graph, 'concat_1'), ['ScaleShift', 'Mul', 'Add'], ['Placeholder']) self.assertTrue(len(res) == 0, 'Smth went wrong with bfs') @@ -216,9 +217,9 @@ class BFSTests(unittest.TestCase): ('mul_2', 'mul_2_data'), ('add_1_data', 'concat_1'), ('mul_2_data', 'concat_1'), - ('concat_1', 'concat_1_data') - ], - {'concat_1_data': {'is_output': True}}) + ('concat_1', 'concat_1_data'), + ('concat_1_data', 'op_output') + ]) res = backward_bfs(Node(graph, 'concat_1'), ['Mul', 'Add'], ['Placeholder']) self.assertTrue(len(res) == 0, 'Smth went wrong with bfs') @@ -248,9 +249,9 @@ class BFSTests(unittest.TestCase): ('mul_1', 'mul_1_data'), ('mul_1_data', 'add_1'), ('add_1', 'add_1_data'), - ('add_1_data', 'placeholder_1') - ], - {'add_1_data': {'is_output': True}}) + ('add_1_data', 'placeholder_1'), + ('add_1_data', 'op_output') + ]) res = backward_bfs(Node(graph, 'add_1_data'), ['Add', 'ScaleShift', 'Mul', 'Placeholder'], ['Conv2D']) self.assertTrue(len(res) == 0, 'Sholdn\'t 
find any nodes due to cycle in graph') @@ -268,9 +269,9 @@ class GetNextOperationTests(unittest.TestCase): ('scaleshift_1_data', 'mul_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'add_1'), - ('add_1', 'add_1_data') - ], - {'add_1_data': {'is_output': True}}) + ('add_1', 'add_1_data'), + ('add_1_data', 'op_output') + ]) res = get_next_operation(Node(graph, 'mul_1')) self.assertTrue(len(res) == 1 and res[0].id == 'add_1', 'get_nex_operation returned wrong op') @@ -283,9 +284,9 @@ class GetNextOperationTests(unittest.TestCase): ('placeholder_1_data', 'add_1'), ('mul_1', 'mul_1_data'), ('mul_1_data', 'add_1'), - ('add_1', 'add_1_data') - ], - {'add_1_data': {'is_output': True}}) + ('add_1', 'add_1_data'), + ('add_1_data', 'op_output') + ]) res = get_next_operation(Node(graph, 'placeholder_1')) self.assertTrue(len(res) == 2 and all([x.id in ['add_1', 'mul_1'] for x in res]), @@ -300,8 +301,8 @@ class GetNextOperationTests(unittest.TestCase): ('placeholder_1_data', 'mul_1'), ('placeholder_2_data', 'mul_1'), ('mul_1', 'mul_1_data'), - ], - {'mul_1_data': {'is_output': True}}) + ('mul_1_data', 'op_output') + ]) res = get_next_operation(Node(graph, 'placeholder_1')) self.assertTrue(len(res) == 1 and res[0].id == 'mul_1', 'get_nex_operation returned wrong op') diff --git a/model-optimizer/mo/middle/passes/fusing/mark_unfused_nodes.py b/model-optimizer/mo/middle/passes/fusing/mark_unfused_nodes.py index d61c313..a67897b 100644 --- a/model-optimizer/mo/middle/passes/fusing/mark_unfused_nodes.py +++ b/model-optimizer/mo/middle/passes/fusing/mark_unfused_nodes.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,10 +15,9 @@ """ import logging as log -import networkx as nx import re -from mo.graph.graph import get_graph_ops, Node +from mo.graph.graph import Node, Graph from mo.middle.passes.fusing.helpers import get_value_id @@ -36,9 +35,9 @@ def _check_lin_op(node: Node, layout: str): log.info('[ FUSING ] Node {} marked as fusable'.format(node.id)) -def mark_unfused_nodes(graph: nx.MultiDiGraph, regex_masks: str): +def mark_unfused_nodes(graph: Graph, regex_masks: str): regex_masks = [] if not regex_masks else regex_masks.split(',') - nodes = get_graph_ops(graph) + nodes = graph.get_op_nodes() for node in nodes: if node.has_valid('can_be_fused'): continue diff --git a/model-optimizer/mo/middle/passes/fusing/mark_unfused_nodes_test.py b/model-optimizer/mo/middle/passes/fusing/mark_unfused_nodes_test.py index f68c7ed..6224b9b 100644 --- a/model-optimizer/mo/middle/passes/fusing/mark_unfused_nodes_test.py +++ b/model-optimizer/mo/middle/passes/fusing/mark_unfused_nodes_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
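The mark_unfused_nodes hunk above shows the entry point of the marking pass: the comma-separated regex_masks string is split, and ops that already carry a can_be_fused verdict are skipped. The hunk cuts off before the matching itself, so the sketch below fills that part in as an assumption (matching masks against the node name and pinning can_be_fused = False); only the mask split and the skip guard are taken verbatim:

import re

class _Op:
    # Stand-in for an mo op node; only the fields the sketch touches.
    def __init__(self, name):
        self.name = name
        self.can_be_fused = None

def mark_unfused_nodes_sketch(op_nodes, regex_masks: str):
    masks = [] if not regex_masks else regex_masks.split(',')
    for node in op_nodes:
        if node.can_be_fused is not None:
            continue                      # verdict already set elsewhere
        if any(re.match(mask, node.name) for mask in masks):
            node.can_be_fused = False     # fusing disabled by a user-supplied mask

nodes = [_Op('conv2d_1'), _Op('fc6')]
mark_unfused_nodes_sketch(nodes, 'conv.*')
assert nodes[0].can_be_fused is False and nodes[1].can_be_fused is None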
@@ -65,6 +65,7 @@ nodes_attributes = { 'placeholder_2_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, 'placeholder_3': {'shape': None, 'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'}, 'placeholder_3_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'} } @@ -86,6 +87,7 @@ class MarkFusedNodes(unittest.TestCase): ('mul_2_data', 'concat_1'), ('concat_1', 'concat_1_data'), ('placeholder_1_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -94,7 +96,6 @@ class MarkFusedNodes(unittest.TestCase): 'mul_1_w': {'shape': np.array([1]), 'value': 6}, 'add_1_w': {'shape': np.array([1]), 'value': 6}, 'mul_2_w': {'shape': np.array([1]), 'value': 6}, - 'concat_1_data': {'is_output': True} }) graph.graph['layout'] = 'NHWC' @@ -121,6 +122,7 @@ class MarkFusedNodes(unittest.TestCase): ('mul_2_data', 'concat_1'), ('concat_1', 'concat_1_data'), ('placeholder_1_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -129,7 +131,6 @@ class MarkFusedNodes(unittest.TestCase): 'mul_1_w': {'shape': np.array([1]), 'value': 6}, 'add_1_w': {'shape': np.array([1]), 'value': 6}, 'mul_2_w': {'shape': np.array([1]), 'value': 6}, - 'concat_1_data': {'is_output': True} }) graph.graph['layout'] = 'NHWC' @@ -157,6 +158,7 @@ class MarkFusedNodes(unittest.TestCase): ('mul_2_data', 'concat_1'), ('concat_1', 'concat_1_data'), ('placeholder_1_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -165,7 +167,6 @@ class MarkFusedNodes(unittest.TestCase): 'mul_1_w': {'shape': np.array([1]), 'value': 6}, 'add_1_w': {'shape': np.array([1]), 'value': 6}, 'mul_2_w': {'shape': np.array([1]), 'value': 6}, - 'concat_1_data': {'is_output': True} }) graph.graph['layout'] = 'NHWC' @@ -191,6 +192,8 @@ class MarkFusedNodes(unittest.TestCase): ('mul_2_data', 'concat_1'), ('concat_1', 'concat_1_data'), ('placeholder_1_data', 'concat_1'), + ('concat_1_data', 'op_output') + ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -199,7 +202,6 @@ class MarkFusedNodes(unittest.TestCase): 'mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'add_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_2_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, - 'concat_1_data': {'is_output': True} }) graph.graph['layout'] = 'NHWC' @@ -225,6 +227,7 @@ class MarkFusedNodes(unittest.TestCase): ('mul_2_data', 'concat_1'), ('concat_1', 'concat_1_data'), ('placeholder_1_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -233,7 +236,6 @@ class MarkFusedNodes(unittest.TestCase): 'mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'add_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_2_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, - 'concat_1_data': {'is_output': True} }) graph.graph['layout'] = 'NCHW' @@ -259,6 +261,7 @@ class MarkFusedNodes(unittest.TestCase): ('mul_2_data', 'concat_1'), ('concat_1', 'concat_1_data'), ('placeholder_1_data', 
'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -267,7 +270,6 @@ class MarkFusedNodes(unittest.TestCase): 'mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'add_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_2_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, - 'concat_1_data': {'is_output': True} }) graph.graph['layout'] = 'NCHW' @@ -293,6 +295,7 @@ class MarkFusedNodes(unittest.TestCase): ('mul_2_data', 'concat_1'), ('concat_1', 'concat_1_data'), ('placeholder_1_data', 'concat_1'), + ('concat_1_data', 'op_output') ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'mul_1_data': {'shape': np.array([1, 227, 227, 3])}, @@ -301,7 +304,6 @@ class MarkFusedNodes(unittest.TestCase): 'mul_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'add_1_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, 'mul_2_w': {'shape': np.array([3]), 'value': np.array([1, 2, 3])}, - 'concat_1_data': {'is_output': True} }) graph.graph['layout'] = 'NHWC' diff --git a/model-optimizer/mo/middle/passes/fusing/resnet_optimization.py b/model-optimizer/mo/middle/passes/fusing/resnet_optimization.py index 8e6481a..6f78a39 100644 --- a/model-optimizer/mo/middle/passes/fusing/resnet_optimization.py +++ b/model-optimizer/mo/middle/passes/fusing/resnet_optimization.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,10 +16,9 @@ import logging as log -import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.middle.passes.fusing.helpers import get_next_operation from mo.ops.pooling import Pooling from mo.utils.graph import pseudo_topological_sort @@ -32,7 +31,7 @@ def _clean_fw_tensor_attrs(node: Node): node[attr] = None -def _insert_pooling(graph: nx.MultiDiGraph, first_node: Node, second_node: Node, spatial_dims): +def _insert_pooling(graph: Graph, first_node: Node, second_node: Node, spatial_dims): """ This function inserts point wise pooling layer between two nodes """ @@ -70,7 +69,7 @@ def _check_next_ops(next_ops: list): return stride_props, status -def _simple_stride_prop(graph: nx.MultiDiGraph, node: Node, spatial_dims, supported=True): +def _simple_stride_prop(graph: Graph, node: Node, spatial_dims, supported=True): """ This function handles stride propagation for op nodes. If node is in supported ops dict so this is supported operation and we can propagate stride directly via this op (stride_prop will be set by using bottom stride_prop), otherwise we can't and @@ -99,7 +98,7 @@ def _simple_stride_prop(graph: nx.MultiDiGraph, node: Node, spatial_dims, suppor _clean_fw_tensor_attrs(node.out_node()) -def _conv_stride_prop(graph: nx.MultiDiGraph, node: Node, spatial_dims, supported=True): +def _conv_stride_prop(graph: Graph, node: Node, spatial_dims, supported=True): """ This function handles convolution stride propagation. There is two cases: conv->(op) and conv->conv. 
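# The stride propagation implemented by _simple_stride_prop()/_conv_stride_prop() above
# relies on a simple identity: for a point-wise operation, subsampling the output equals
# applying the operation to a subsampled input. A small numpy check of that identity
# (an illustration, not Model Optimizer code):
import numpy as np

x = np.random.randn(1, 8, 8, 3)           # NHWC tensor
relu = lambda t: np.maximum(t, 0)         # a point-wise op
subsample = lambda t: t[:, ::2, ::2, :]   # spatial stride 2
# Striding after the op gives the same tensor as the op after striding,
# which is why a stride can be moved up through such ops safely.
assert np.array_equal(subsample(relu(x)), relu(subsample(x)))
print('point-wise ops commute with spatial subsampling')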
In first case we propagate stride from op, and in second case we also change stride for second conv @@ -138,11 +137,12 @@ supported_ops = { } -def _stride_propagation(graph: nx.MultiDiGraph, spatial_dims): +def _stride_propagation(graph: Graph, spatial_dims): """ This function do stride propagation for all op nodes """ - nodes = [Node(graph, x) for x in pseudo_topological_sort(graph, reverse=True) if Node(graph, x).kind == 'op'] + nodes = [Node(graph, x) for x in pseudo_topological_sort(graph, reverse=True) if + Node(graph, x).kind == 'op' and Node(graph, x).soft_get('type') != 'Const'] for node in nodes: if node.soft_get('type') in supported_ops: @@ -155,7 +155,7 @@ def _stride_propagation(graph: nx.MultiDiGraph, spatial_dims): _simple_stride_prop(graph, node, spatial_dims, False) -def stride_optimization(graph: nx.MultiDiGraph): +def stride_optimization(graph: Graph): """ This is main function for stride optimization pass """ diff --git a/model-optimizer/mo/middle/passes/fusing/resnet_optimization_test.py b/model-optimizer/mo/middle/passes/fusing/resnet_optimization_test.py index 0065775..ca68f52 100644 --- a/model-optimizer/mo/middle/passes/fusing/resnet_optimization_test.py +++ b/model-optimizer/mo/middle/passes/fusing/resnet_optimization_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/middle/passes/infer.py b/model-optimizer/mo/middle/passes/infer.py index e6f46e8..9d75b9a 100644 --- a/model-optimizer/mo/middle/passes/infer.py +++ b/model-optimizer/mo/middle/passes/infer.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,22 +21,16 @@ import numpy as np # TODO remove it from mo.front.extractor import update_ie_fields -from mo.graph.graph import Node, get_outputs, get_node_id_by_name, dump_graph_for_graphviz -from mo.middle.passes.eliminate import get_nodes_with_attributes -from mo.middle.pattern_match import apply_pattern, for_each_sub_graph -from mo.ops.lin_op import Mul, Add -from mo.ops.op import Op -from mo.utils.error import Error -from mo.utils.utils import refer_to_faq_msg +from mo.graph.graph import Node, Graph from mo.graph.graph import dict_includes +from mo.middle.pattern_match import for_each_sub_graph +from mo.utils.error import Error +from mo.utils.utils import refer_to_faq_msg, shrink_str_value def log_debug_dict(nodes_per_port: dict, direction_name: str): for port, node in nodes_per_port.items(): - value = str(node.soft_get('value')) - max_symbols = 100 - if len(value) > max_symbols: - value = value.strip('\n')[:max_symbols - 3] + '...' + value = shrink_str_value(node.soft_get('value')) log.debug('{}[{}]: shape = {}, value = {}'.format(direction_name, port, node.soft_get('shape'), value)) @@ -46,7 +40,7 @@ def is_fully_defined_shape(shape: np.ndarray): return True -def control_flow_infer(graph: nx.MultiDiGraph, node_name: str): +def control_flow_infer(graph: Graph, node_name: str): """ Executes constant control flow. 
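# log_debug_dict() now delegates value truncation to shrink_str_value() from
# mo.utils.utils. A sketch of equivalent behaviour, reconstructed from the inline code
# it replaces; the actual helper may differ in details such as the default limit.
def shrink_str_value_sketch(value, max_symbols: int = 100) -> str:
    value = str(value)
    if len(value) > max_symbols:
        value = value.strip('\n')[:max_symbols - 3] + '...'
    return value

print(shrink_str_value_sketch('x' * 500))  # 97 characters followed by '...'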
Propagates nodes executability """ @@ -77,24 +71,7 @@ def control_flow_infer(graph: nx.MultiDiGraph, node_name: str): mark_executability(out_data, is_executable) -def delete_not_executable(graph: nx.MultiDiGraph): - nodes_to_remove = set() - for node_name, node_attrs in graph.nodes(data=True): - if node_attrs['kind'] == 'data' and 'executable' in node_attrs and not node_attrs['executable']: - [nodes_to_remove.add(op) for op, _ in graph.in_edges(node_name)] - nodes_to_remove.add(node_name) - log.debug('Removing the following not executable nodes: {}'.format('\n'.join(sorted(map(str, nodes_to_remove))))) - graph.remove_nodes_from(nodes_to_remove) - - -def delete_control_flow_edges(graph: nx.MultiDiGraph): - for u, v, k, attrs in list(graph.edges(keys=True, data=True)): - if 'control_flow_edge' in attrs and attrs['control_flow_edge']: - graph.remove_edge(u, v, k) - log.debug('Removing control flow edge from {} to {}'.format(u, v)) - - -def exit_bound_edges(graph: nx.MultiDiGraph, sources: list, end_node_attrs: dict): +def exit_bound_edges(graph: Graph, sources: list, end_node_attrs: dict): """ Finds all descendant nodes for each node from 'sources' that have given attributes from end_node_attrs. For each found node, create a tuple with a given element from 'source' and the node. @@ -107,14 +84,14 @@ def exit_bound_edges(graph: nx.MultiDiGraph, sources: list, end_node_attrs: dict return result -def partial_infer(graph: nx.MultiDiGraph, start_node: str = None): +def partial_infer(graph: Graph, start_node: str = None): """ Tries to execute constant parts of the graph and deduce as much as possible information following the data flow, e.g. calculate and propagate shapes and constant values. Partially or completely defined values are stored in data nodes (kind='data'). """ - cycle_nodes = get_nodes_with_attributes(graph, is_cyclic=True) + cycle_nodes = graph.get_nodes_with_attributes(is_cyclic=True) cycle_nodes = [Node(graph, node).out_node().id for node in cycle_nodes] ebunch_cyclic = list(graph.out_edges(nbunch=cycle_nodes, data=True, keys=True)) ebunch_reconnected = exit_bound_edges(graph, sources=cycle_nodes, end_node_attrs={'op': 'Exit'}) @@ -138,7 +115,7 @@ def partial_infer(graph: nx.MultiDiGraph, start_node: str = None): debug_logger = log.getLogger().isEnabledFor(log.DEBUG) nx.set_node_attributes(G=graph, name='executable', - values={n: True for n in get_nodes_with_attributes(graph, kind='data')}) + values={n: True for n in graph.get_nodes_with_attributes(kind='data')}) for n in nodes: # Data Flow Infer @@ -165,6 +142,8 @@ def partial_infer(graph: nx.MultiDiGraph, start_node: str = None): log.debug('Outputs:') log_debug_dict(node.out_nodes(), 'output') + not_all_output_shapes = False + for out_port, out_node in out_nodes.items(): not_all_output_shapes = False if not out_node.has_valid('shape'): @@ -217,30 +196,16 @@ def partial_infer(graph: nx.MultiDiGraph, start_node: str = None): refer_to_faq_msg(38)) from err control_flow_infer(graph, n) - not_fully_inferred = get_nodes_with_attributes(graph, is_not_fully_inferred=True) + not_fully_inferred = graph.get_nodes_with_attributes(is_not_fully_inferred=True) for n in not_fully_inferred: node = Node(graph, n) if node.has('infer') and not node.infer is None: node.infer(node) - #delete_not_executable(graph) return graph -def check_for_cycle(graph: nx.MultiDiGraph): - is_acyclic = nx.is_directed_acyclic_graph(graph) - if not is_acyclic: - raise Error('Graph contains a cycle. Can not proceed. 
' + refer_to_faq_msg(97)) - - -def mark_outputs(graph: nx.MultiDiGraph): - nx.set_node_attributes(G=graph, name='is_output', values=False) - for node in graph.nodes(): - if graph.node[node]['kind'] == 'data' and len(get_outputs(graph, node)) == 0: - nx.set_node_attributes(G=graph, name='is_output', values={node: True}) - - -def override_batch(graph: nx.MultiDiGraph, batch: int): +def override_batch(graph: Graph, batch: int): """ Overrides batch for nodes with 'op' param set to 'Placeholder' Parameters @@ -250,7 +215,7 @@ def override_batch(graph: nx.MultiDiGraph, batch: int): """ if batch is not None: for node_id, data in graph.nodes(data=True): - if 'op' in data and data['op'] == 'Placeholder': + if 'op' in data and data['op'] == 'Placeholder' and not data.get('fixed_batch', False): if len(data['shape']) == 0 or data['shape'][0] not in (-1, 0, 1): raise Error(('The input layer {} has a shape {} defined in the model. \n\n' + 'When you use -b (--batch) option, Model Optimizer applies its value to the first ' + @@ -264,7 +229,7 @@ def override_batch(graph: nx.MultiDiGraph, batch: int): data['shape'][0] = batch -def override_placeholder_shapes(graph: nx.MultiDiGraph, user_shapes: dict, batch=None): +def override_placeholder_shapes(graph: Graph, user_shapes: dict, batch=None): """ This function overrides shapes for nodes with 'op' param set to 'Placeholder' with shapes defined by users (only for inputs without in/out port specified). @@ -277,7 +242,7 @@ def override_placeholder_shapes(graph: nx.MultiDiGraph, user_shapes: dict, batch # DON'T MOVE UPPER!!! WE NEED TO SET BATCH FIRST # user did not specify neither shapes nor inputs, keep models values return - placeholders = get_nodes_with_attributes(graph, kind='op', op='Placeholder') + placeholders = graph.get_nodes_with_attributes(kind='op', op='Placeholder') for node_id in placeholders: node_attrs = graph.node[node_id] shape = None @@ -293,141 +258,7 @@ def override_placeholder_shapes(graph: nx.MultiDiGraph, user_shapes: dict, batch node_attrs['shape'][0] = batch -def _scale_input_action_mul(graph: nx.MultiDiGraph, match: dict, scale: float): - assert (len(match['placeholder'].out_nodes())) - - tinput = match['placeholder'] - if not tinput.has_valid('shape'): - raise Error("Node {} has not valid shape attribute".format(tinput.id)) - - input_shape = tinput.shape - toutput = match['data'] - - # Create Mul node - value = np.array([1 / scale]) - - # Disconnect input with data node - graph.remove_edge(tinput.id, toutput.id) - - # Create Mul node - mul_node = Mul(graph, dict(name="Mul1_")) - mul_data = Op.create_input_data_node(graph, "data_mul_scale_", np.array(value)) - Op.expand_node_shape(mul_data, len(input_shape) - 2 if graph.graph['layout'] == 'NCHW' else 0) - mul_input = Op.create_data_node(graph, tinput, {'shape': toutput.shape}) - - mul_node.create_node_with_data(inputs=[mul_input, mul_data], data_nodes=toutput) - - -def scale_input(graph: nx.MultiDiGraph, scale: float): - """ - Searches for all entries of Placeholder in graph and passes it to the the replace transform - Args: - graph: an instance of nx graph - scale: integer value for the scale - """ - if scale is None or scale == 1: - return - - apply_pattern( - graph, - nodes=[ - ('placeholder', dict(kind='op', op='Placeholder')), - ('data', dict(kind='data'))], - edges=[ - ('placeholder', 'data'), ], - action=lambda graph, match: _scale_input_action_mul(graph, match, scale) - ) - - -def add_mean_scale_values(graph: nx.MultiDiGraph, values): - input_nodes = {} - for node in 
graph.nodes(): - node = Node(graph, node) - if node.has_valid('op') and node.op == 'Placeholder': - input_nodes.update({node.id: node}) - - if not isinstance(values, dict): - if len(values) != len(input_nodes): - raise Error('Numbers of inputs and mean/scale values do not match. ' + - refer_to_faq_msg(61)) - - data = np.copy(values) - values = {} - for idx, key in enumerate(input_nodes.keys()): - values.update( - { - input_nodes[key]['name']: { - 'mean': data[idx][0], - 'scale': data[idx][1] - } - } - ) - - for node_name in values: - node_id = get_node_id_by_name(graph, node_name) - node_mean_scale_values = values[node_name] - if node_id not in input_nodes: - # if the user cutted-off input of the network then input node name specified in the --scale_values - # or --mean_values doesn't correspond to a real input node generated by Model Optimizer. But the information - # about initial input node name is stored in Placeholder's attribute 'initial_node_name' - new_node_id = None - for placeholder in input_nodes.values(): - if placeholder.has('initial_node_name') and placeholder.initial_node_name == node_name: - new_node_id = placeholder.id - break - if new_node_id is None: - raise Error('Input with name {} wasn\'t found!'.format(node_name) + - refer_to_faq_msg(83)) - node_id = new_node_id - - input_node = Node(graph, node_id) - apply_scale(graph, input_node, node_mean_scale_values) - apply_mean_value(graph, input_node, node_mean_scale_values) - - -def apply_scale(graph: nx.MultiDiGraph, input_node: Node, node_mean_scale_values: dict): - if 'scale' in node_mean_scale_values and node_mean_scale_values['scale'] is not None: - if all([x == 1 for x in node_mean_scale_values['scale']]): - return - out_node = input_node.out_node() - if not input_node.has_valid('shape'): - raise Error("Node {} has not valid shape attribute".format(input_node.id)) - input_shape = input_node.shape - - # Create Mul node - value = 1 / np.array(node_mean_scale_values['scale']) - graph.remove_edge(input_node.id, out_node.id) - - mul_node = Mul(graph, dict(name="Mul_")) - mul_data = Op.create_input_data_node(graph, "data_mul_", np.array(value)) - Op.expand_node_shape(mul_data, (len(input_shape) - 2 if graph.graph['layout'] == 'NCHW' else 0)) - mul_input = Op.create_data_node(graph, input_node, {'shape': out_node.shape}) - - mul_node.create_node_with_data(inputs=[mul_input, mul_data], data_nodes=out_node) - - -def apply_mean_value(graph: nx.MultiDiGraph, input_node: Node, node_mean_scale_values: dict): - if 'mean' in node_mean_scale_values and node_mean_scale_values['mean'] is not None: - if all([x == 0 for x in node_mean_scale_values['mean']]): - return - out_node = input_node.out_node() - if not input_node.has_valid('shape'): - raise Error("Node {} has not valid shape attribute".format(input_node.id)) - input_shape = input_node.shape - # Create Add node - graph.remove_edge(input_node.id, out_node.id) - - value = np.array(node_mean_scale_values['mean']) * (-1) - - add_node = Add(graph, dict(name="Add_")) - add_data = Op.create_input_data_node(graph, "data_add_", np.array(value)) - Op.expand_node_shape(add_data, (len(input_shape) - 2 if graph.graph['layout'] == 'NCHW' else 0)) - add_input = Op.create_data_node(graph, input_node, {'shape': out_node.shape}) - - add_node.create_node_with_data(inputs=[add_input, add_data], data_nodes=out_node) - - -def update_fully_connected_shapes(graph: nx.MultiDiGraph): +def update_fully_connected_shapes(graph: Graph): nodes = nx.topological_sort(graph) while True: should_infer = False @@ 
-453,7 +284,7 @@ def update_fully_connected_shapes(graph: nx.MultiDiGraph): # Convert MUL operation to Power layer in case when # mul op takes two inputs (scalar constant and tensor) -def convert_mul_add_to_power(graph: nx.MultiDiGraph): +def convert_mul_add_to_power(graph: Graph): for_each_sub_graph(graph, convert_mul_add_to_power) nodes = list(graph.nodes()) for n in nodes: diff --git a/model-optimizer/mo/middle/passes/infer_test.py b/model-optimizer/mo/middle/passes/infer_test.py index d3b7e65..794221a 100644 --- a/model-optimizer/mo/middle/passes/infer_test.py +++ b/model-optimizer/mo/middle/passes/infer_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,11 +20,9 @@ import numpy as np from mo.front.common.partial_infer.concat import concat_infer from mo.graph.graph import Node -from mo.middle.passes.infer import override_placeholder_shapes, partial_infer, add_mean_scale_values, scale_input, \ - check_for_cycle -from mo.utils.cli_parser import get_mean_scale_dictionary, parse_tuple_pairs +from mo.middle.passes.infer import override_placeholder_shapes, partial_infer from mo.utils.error import Error -from mo.utils.unittest.graph import build_graph, compare_graphs +from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'type': 'Identity', 'value': None, 'kind': 'op'}, 'node_1_data': {'value': None, 'kind': 'data', 'data_type': None}, @@ -50,6 +48,7 @@ nodes_attributes = {'node_1': {'type': 'Identity', 'value': None, 'kind': 'op'}, 'mul_1': {'type': None, 'kind': 'op', 'op': 'Mul'}, 'mul_1_w': {'value': None, 'shape': None, 'kind': 'data'}, 'mul_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput', 'infer': lambda x: None} } @@ -59,8 +58,10 @@ class TestInferPass(unittest.TestCase): Test for overriding shape in placeholder by shape from user_shapes. """ graph = build_graph(nodes_attributes, - [('node_1', 'node_2')], - {'node_2': {'is_output': True, 'shape': None}, + [('node_1', 'node_2'), + ('node_2', 'op_output') + ], + {'node_2': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227]), 'op': 'Placeholder'} }, nodes_with_edges_only=True) @@ -76,8 +77,10 @@ class TestInferPass(unittest.TestCase): Test for case when user_shapes is not defined. """ graph = build_graph(nodes_attributes, - [('node_1', 'node_2')], - {'node_2': {'is_output': True, 'shape': None, 'op': 'Placeholder'}, + [('node_1', 'node_2'), + ('node_2', 'op_output') + ], + {'node_2': {'shape': None, 'op': 'Placeholder'}, 'node_1': {'shape': np.array([1, 3, 227, 227]), 'op': 'Placeholder'} }, nodes_with_edges_only=True) @@ -92,8 +95,10 @@ class TestInferPass(unittest.TestCase): Test for case when user_shapes is not None, but it shouldn't rewrite shapes. 
""" graph = build_graph(nodes_attributes, - [('node_1', 'node_2')], - {'node_2': {'is_output': True, 'shape': None}, + [('node_1', 'node_2'), + ('node_2', 'op_output') + ], + {'node_2': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 227, 227]), 'op': 'Placeholder'} }, nodes_with_edges_only=True) @@ -106,8 +111,10 @@ class TestInferPass(unittest.TestCase): def test_override_placeholder_shapes_dict(self): graph = build_graph(nodes_attributes, - [('node_1', 'node_2')], - {'node_2': {'is_output': True, 'shape': None, 'op': 'Placeholder'}, + [('node_1', 'node_2'), + ('node_2', 'op_output') + ], + {'node_2': {'shape': None, 'op': 'Placeholder'}, 'node_1': {'shape': np.array([1, 3, 227, 227]), 'op': 'Placeholder'} }, nodes_with_edges_only=True) @@ -185,8 +192,10 @@ class TestInferPass(unittest.TestCase): graph = build_graph(nodes_attributes, [('node_1', 'concat'), ('node_2', 'concat'), - ('concat', 'node_3')], - {'node_3': {'kind': 'data', 'is_output': True, 'shape': None, 'infer': None}, + ('concat', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'kind': 'data', 'shape': None, 'infer': None}, 'node_1': {'kind': 'data', 'shape': np.array([1, 3, 227, 227]), 'infer': None}, 'node_2': {'kind': 'data', 'shape': np.array([1, 3, 227, 227]), 'infer': None}, 'concat': {'kind': 'op', 'axis': 2, 'infer': concat_infer} @@ -219,8 +228,10 @@ class TestInferPass(unittest.TestCase): def test_partial_infer_no_shape(self): graph = build_graph(nodes_attributes, - [('node_1', 'node_2')], - {'node_2': {'is_output': True, 'shape': None, 'infer': None}, + [('node_1', 'node_2'), + ('node_2', 'op_output') + ], + {'node_2': {'shape': None, 'infer': None}, 'node_1': {'shape': None, 'infer': None} }, nodes_with_edges_only=True) @@ -231,8 +242,10 @@ class TestInferPass(unittest.TestCase): [('node_1', 'concat'), ('node_2', 'concat'), ('concat', 'node_3'), - ('node_3', 'concat')], - {'node_3': {'kind': 'data', 'is_output': True, 'shape': None, 'infer': None}, + ('node_3', 'concat'), + ('node_3', 'op_output') + ], + {'node_3': {'kind': 'data', 'shape': None, 'infer': None}, 'node_1': {'kind': 'data', 'shape': np.array([1, 3, 227, 227]), 'infer': None}, 'node_2': {'kind': 'data', 'shape': np.array([1, 3, 227, 227]), 'infer': None}, 'concat': {'kind': 'op', 'axis': 2, 'infer': concat_infer} @@ -242,268 +255,17 @@ class TestInferPass(unittest.TestCase): start_node = 'concat' self.assertRaises(Error, partial_infer, graph, start_node) - def test_add_mean_scale_values_with_data_name(self): - graph = build_graph(nodes_attributes, - [('node_1', 'node_2')], - {'node_2': {'is_output': True, 'shape': None, 'data_type': None}, - 'node_1': {'shape': np.array([1, 3, 227, 227]), 'op': 'Placeholder', 'name': 'data', - 'data_type': None} - }, - nodes_with_edges_only=True) - graph.graph['layout'] = 'NCHW' - mean_values = parse_tuple_pairs('(124,117,104)') - scale_values = parse_tuple_pairs('') - - # input = 'data' - mean_scale = get_mean_scale_dictionary(mean_values, scale_values, None) - self.assertEqual(len(graph), 2) - add_mean_scale_values(graph, mean_scale) - self.assertEqual(len(graph), 5) - - def test_add_mean_scale_values_without_data_name(self): - graph = build_graph(nodes_attributes, - [('node_1', 'node_2')], - {'node_2': {'is_output': True, 'shape': None, 'data_type': None}, - 'node_1': {'shape': np.array([1, 3, 227, 227]), 'op': 'Placeholder', 'name': 'data', - 'data_type': None} - }, - nodes_with_edges_only=True) - graph.graph['layout'] = 'NCHW' - mean_values = parse_tuple_pairs('(124,117,104)') - scale_values = 
parse_tuple_pairs('') - # input = None - mean_scale = get_mean_scale_dictionary(mean_values, scale_values, None) - self.assertEqual(len(graph), 2) - add_mean_scale_values(graph, mean_scale) - self.assertEqual(len(graph), 5) - - def test_add_mean_scale_values1(self): - graph = build_graph(nodes_attributes, - [('pl_1', 'pl_1_data'), ('pl_2', 'pl_2_data')], - {'pl_1_data': {'shape': np.array([1, 3, 38, 38]), 'infer': None}, - 'pl_2_data': {'shape': np.array([1, 6]), 'infer': None}, - 'pl_1': {'shape': np.array([1,3,38,38])}, - 'pl_2': {'shape': np.array([1,6])}, - }, - nodes_with_edges_only=True) - graph.graph['layout'] = 'NCHW' - add_mean_scale_values(graph, - {'pl_1': {'mean': np.array([1., 2., 3.])}, 'pl_2': {'mean': np.array([0., 0., 0.])}}) - mul_op_cnt = 0 - add_op_cnt = 0 - for node in graph.nodes(): - node = Node(graph, node) - if node.has_valid('op') and node.op == 'Mul': - mul_op_cnt += 1 - if node.has_valid('op') and node.op == 'Add': - add_op_cnt += 1 - - self.assertEqual(add_op_cnt, 1, "Found more than one Add op in graph") - self.assertEqual(mul_op_cnt, 0, "Found Mul op in graph") - - def test_optimize_scale_and_add_mean_values(self): - graph = build_graph( - nodes_attributes, - [ - ('pl_1', 'pl_1_data') - ], - { - 'pl_1_data': { - 'shape': np.array([1, 3, 38, 38]), - 'infer': None - }, - 'pl_1': { - 'shape': np.array([1,3,38,38]) - } - }, - nodes_with_edges_only=True - ) - graph.graph['layout'] = 'NCHW' - add_mean_scale_values(graph, - { - 'pl_1': { - 'scale': np.array([1.]), - 'mean': np.array([1., 2., 3.]) - } - }) - mul_op_cnt = 0 - add_op_cnt = 0 - for node in graph.nodes(): - node = Node(graph, node) - if node.has_valid('op') and node.op == 'Mul': - mul_op_cnt += 1 - if node.has_valid('op') and node.op == 'Add': - add_op_cnt += 1 - - self.assertEqual(add_op_cnt, 1, "Found more than one Add op in graph") - self.assertEqual(mul_op_cnt, 0, "Found Mul op in graph") - - def test_optimize_mean_and_add_scale_values(self): - graph = build_graph( - nodes_attributes, - [ - ('pl_1', 'pl_1_data') - ], - { - 'pl_1_data': { - 'shape': np.array([1, 3, 38, 38]), - 'infer': None - }, - 'pl_1': { - 'shape': np.array([1,3,38,38]) - } - }, - nodes_with_edges_only=True - ) - graph.graph['layout'] = 'NCHW' - add_mean_scale_values(graph, - { - 'pl_1': { - 'scale': np.array([1.43]), - 'mean': np.array([0., 0., 0.]) - } - }) - mul_op_cnt = 0 - add_op_cnt = 0 - for node in graph.nodes(): - node = Node(graph, node) - if node.has_valid('op') and node.op == 'Mul': - mul_op_cnt += 1 - if node.has_valid('op') and node.op == 'Add': - add_op_cnt += 1 - - self.assertEqual(add_op_cnt, 0, "Found more than one Add op in graph") - self.assertEqual(mul_op_cnt, 1, "Found Mul op in graph") - - def test_add_mean_scale_values3(self): - graph = build_graph(nodes_attributes, - [('pl_1', 'pl_1_data')], - {'pl_1_data': {'shape': np.array([1, 3, 38, 38]), 'infer': None}, - 'pl_1': {'shape': np.array([1,3,38,38])}, - }, - nodes_with_edges_only=True) - graph.graph['layout'] = 'NCHW' - add_mean_scale_values(graph, [[np.array([1., 2., 3.]), np.array([1., 2., 3.])]]) - - mul_op_cnt = 0 - add_op_cnt = 0 - for node in graph.nodes(): - node = Node(graph, node) - if node.has_valid('op') and node.op == 'Mul': - mul_op_cnt += 1 - if node.has_valid('op') and node.op == 'Add': - add_op_cnt += 1 - - self.assertEqual(add_op_cnt, 1, "Found more than one Add op in graph") - self.assertEqual(mul_op_cnt, 1, "Found more than one Nul op in graph") - - def test_add_mean_scale_values_cut_graph(self): - """ - Test case when user cutted start 
of the network and specified mean/scale value to the new input node 'node_3'. - """ - graph = build_graph(nodes_attributes, - [('pl_1', 'pl_1_data'), - ('pl_2', 'pl_2_data'), - ('pl_2_data', 'node_3'), - ('node_3', 'node_3_data'), - ('pl_1_data', 'node_1'), - ('node_3_data', 'node_1'), - ], - {'pl_1_data': {'shape': np.array([1, 3, 38, 38]), 'infer': None}, - 'pl_2_data': {'shape': np.array([1, 3, 38, 38]), 'infer': None}, - 'pl_2': {'initial_node_name': 'node_3', 'shape': np.array([1,3,38,38])}, - 'pl_1': {'shape': np.array([1,3,38,38])}, - }, - nodes_with_edges_only=True) - graph.graph['layout'] = 'NCHW' - add_mean_scale_values(graph, {'pl_1': {'mean': np.array([1, 2, 3])}, 'node_3': {'scale': np.array([1, 2, 3])}}) - - mul_op_cnt = 0 - add_op_cnt = 0 - for node in graph.nodes(): - node = Node(graph, node) - if node.has_valid('op') and node.op == 'Mul': - mul_op_cnt += 1 - if node.has_valid('op') and node.op == 'Add': - add_op_cnt += 1 - - self.assertEqual(add_op_cnt, 1, "There should be exactly one Add op") - self.assertEqual(mul_op_cnt, 1, "There should be exactly one Mul op") - self.assertEqual(Node(graph, 'pl_2').out_node().out_node().op, 'Mul', "The Mul op should be added after pl_2") - self.assertEqual(Node(graph, 'pl_1').out_node().out_node().op, 'Add', "The Add op should be added after pl_1") - - -class ScaleInputTests(unittest.TestCase): - def test_scale_input_1(self): - graph = build_graph(nodes_attributes, - [('placeholder_1', 'placeholder_1_data')], - {'placeholder_1_data': {'is_output': True}, - 'placeholder_1': {'shape': np.array([1, 3, 224, 224])} - }, - nodes_with_edges_only=True) - - graph_ref = build_graph(nodes_attributes, - [('placeholder_1', 'mul_1_data'), - ('mul_1_data', 'mul_1'), - ('mul_1_w', 'mul_1'), - ('mul_1', 'placeholder_1_data')], - {'mul_1_w': {'shape': np.array([1, 1, 1]), 'value': np.array([1 / 255])}, - 'placeholder_1_data': {'is_output': True}}, - nodes_with_edges_only=True) - graph.graph['layout'] = 'NCHW' - scale_input(graph, 255) - (flag, resp) = compare_graphs(graph, graph_ref, 'placeholder_1_data') - self.assertTrue(flag, resp) - - def test_scale_input_2(self): - graph = build_graph(nodes_attributes, - [('placeholder_1', 'placeholder_1_data')], - {'placeholder_1_data': {'is_output': True}}, - nodes_with_edges_only=True) - - graph_ref = build_graph(nodes_attributes, - [('placeholder_1', 'placeholder_1_data')], - {'placeholder_1_data': {'is_output': True}}, - nodes_with_edges_only=True) - - scale_input(graph, 1) - (flag, resp) = compare_graphs(graph, graph_ref, 'placeholder_1_data') - self.assertTrue(flag, resp) - - def test_check_for_cycle1(self): - # cyclic case - graph = build_graph(nodes_attributes, - [('node_1', 'node_1_data'), - ('node_1_data', 'node_3'), - ('node_3', 'node_3_data'), - ('node_3_data', 'node_1')], - nodes_with_edges_only=True) - with self.assertRaisesRegex(Error, 'Graph contains a cycle. 
Can not proceed.*'): - check_for_cycle(graph) - - def test_check_for_cycle2(self): - # acyclic case - graph = build_graph(nodes_attributes, - [('node_1', 'node_1_data'), - ('node_1_data', 'node_3'), - ('node_3', 'node_3_data'), - ('node_3_data', 'mul_1'), - ('mul_1_w', 'mul_1'), - ('mul_1', 'mul_1_data') - ], - nodes_with_edges_only=True) - try: - check_for_cycle(graph) - except Error: - self.fail("Unexpected Error raised") +class CycleTest(unittest.TestCase): def test_is_not_fully_inferred_param(self): # Node that have is_not_fully_inferred=True graph = build_graph(nodes_attributes, [('node_1', 'concat'), ('node_2', 'concat'), - ('concat', 'node_3')], - {'node_3': {'kind': 'data', 'is_output': True, 'shape': None, 'infer': None}, + ('concat', 'node_3'), + ('node_3', 'op_output') + ], + {'node_3': {'kind': 'data', 'shape': None, 'infer': None}, 'node_1': {'kind': 'data', 'shape': np.array([1, 3, 227, 227]), 'infer': None}, 'node_2': {'kind': 'data', 'shape': np.array([1, 3, 227, 227]), 'infer': None}, 'concat': {'kind': 'op', 'axis': 2, 'infer': concat_infer, 'is_not_fully_inferred': True} diff --git a/model-optimizer/mo/middle/passes/l2normalization.py b/model-optimizer/mo/middle/passes/l2normalization.py index 6e80ffb..9edcdc1 100644 --- a/model-optimizer/mo/middle/passes/l2normalization.py +++ b/model-optimizer/mo/middle/passes/l2normalization.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,16 +14,15 @@ limitations under the License. """ -import networkx as nx import numpy as np from mo.front.extractor import add_attrs_props from mo.front.extractor import update_ie_fields -from mo.graph.graph import Node, unique_id +from mo.graph.graph import Node, Graph from mo.middle.pattern_match import apply_pattern -def l2_norm_to_norm_action(graph: nx.MultiDiGraph, match: dict): +def l2_norm_to_norm_action(graph: Graph, match: dict): input_data_name = match['input'].node output_data_name = match['l2_normalize_data'].node @@ -33,18 +32,17 @@ def l2_norm_to_norm_action(graph: nx.MultiDiGraph, match: dict): return 1 y = match['maximum_y_data'].value - normalize_id = unique_id(graph) + normalize_id = graph.unique_id() graph.add_node(normalize_id, **add_attrs_props( - dict(kind='op', precision="FP32", type='Normalize', name=str(unique_id(graph, 'normalize')), + dict(kind='op', precision="FP32", type='Normalize', name=str(graph.unique_id('normalize')), op='Normalize', shape=None, eps=str(y), across_spatial=str(0), channel_shared=str(0), - data_type=None, - infer=None))) - normalize_data_id = unique_id(graph) + data_type=None, infer=None, in_ports_count=2, out_ports_count=1))) + normalize_data_id = graph.unique_id() graph.add_node(normalize_data_id, **add_attrs_props(graph.node[output_data_name])) update_ie_fields(graph.node[normalize_id]) - weights_id = unique_id(graph, 'weights_') + weights_id = graph.unique_id('weights_') graph.add_node(weights_id, **add_attrs_props( dict(kind='data', precision="FP32", name=weights_id, value=None, shape=None, data_type=None, infer=None))) wnode = Node(graph, weights_id) @@ -65,7 +63,7 @@ def l2_norm_to_norm_action(graph: nx.MultiDiGraph, match: dict): graph.add_edge(normalize_data_id, owner, **attr) -def l2_norm_to_norm(graph: nx.MultiDiGraph): +def l2_norm_to_norm(graph: Graph): apply_pattern( graph, nodes=[ @@ -79,13 +77,10 @@ def l2_norm_to_norm(graph: nx.MultiDiGraph): 
('rsqrt_data', dict(kind='data')), ('square', dict(kind='op', op='Square')), ('square_data', dict(kind='data')), - ('sum', dict(kind='op', op='Sum')), + ('sum', dict(kind='op', op='Reduce', reduce_type='sum')), ('sum_data', dict(kind='data')), - ('range_data', dict(kind='data')), - ], edges=[ - ('range_data', 'sum'), ('input', 'square'), ('square', 'square_data'), ('square_data', 'sum'), diff --git a/model-optimizer/mo/middle/passes/leaky_relu.py b/model-optimizer/mo/middle/passes/leaky_relu.py index 1ff04b2..60fb42b 100644 --- a/model-optimizer/mo/middle/passes/leaky_relu.py +++ b/model-optimizer/mo/middle/passes/leaky_relu.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,14 +16,14 @@ import logging as log -import networkx as nx import numpy as np +from mo.graph.graph import Graph from mo.middle.pattern_match import apply_pattern from mo.ops.relu import ReLU -def _convert_to_leaky_relu_action(graph: nx.MultiDiGraph, matches: dict): +def _convert_to_leaky_relu_action(graph: Graph, matches: dict): """ This function checks given patten and if pattern satisfies all requirements, converts to ReLU with negative slope """ @@ -73,7 +73,7 @@ def _convert_to_leaky_relu_action(graph: nx.MultiDiGraph, matches: dict): ''.format(eltwise_op.id, power_op.id)) -def convert_mul_eltwise_to_leaky_relu(graph: nx.MultiDiGraph): +def convert_mul_eltwise_to_leaky_relu(graph: Graph): """ This function finds next subgraph: -->Data-------->Eltwise(Max)-->Data diff --git a/model-optimizer/mo/middle/passes/mean_scale_values.py b/model-optimizer/mo/middle/passes/mean_scale_values.py index ec53fc0..64c86a2 100644 --- a/model-optimizer/mo/middle/passes/mean_scale_values.py +++ b/model-optimizer/mo/middle/passes/mean_scale_values.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,9 +14,9 @@ limitations under the License. """ -import networkx as nx import numpy as np +from mo.graph.graph import Graph from mo.middle.pattern_match import apply_pattern @@ -52,7 +52,7 @@ def move_scaleshift_to_preprocess_action(graph, match): graph.graph['mean_values'] = mean_values -def move_scaleshift_to_preprocess(graph: nx.MultiDiGraph): +def move_scaleshift_to_preprocess(graph: Graph): """ This function finds scaleshift layer after input layer and if it has weights with ones, it deletes scaleshift layer and creates graph dict attribute : {'input':np.array(...), 'input2': ... } diff --git a/model-optimizer/mo/middle/passes/mean_scale_values_test.py b/model-optimizer/mo/middle/passes/mean_scale_values_test.py index 9bc7b6b..9aa3018 100644 --- a/model-optimizer/mo/middle/passes/mean_scale_values_test.py +++ b/model-optimizer/mo/middle/passes/mean_scale_values_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
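# The LeakyReLU rewrite in convert_mul_eltwise_to_leaky_relu() above is justified by the
# identity max(x, a*x) == leaky_relu(x, a) for 0 <= a <= 1. A quick numpy check
# (an illustration, not Model Optimizer code):
import numpy as np

def leaky_relu(x, negative_slope):
    return np.where(x >= 0, x, negative_slope * x)

x = np.linspace(-3, 3, 13)
a = 0.1
assert np.allclose(np.maximum(x, a * x), leaky_relu(x, a))
print('Mul + Eltwise(Max) pattern == ReLU with negative slope', a)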
@@ -35,6 +35,9 @@ nodes_attributes = {'node_1': {'type': 'Identity', 'value': None, 'kind': 'op'}, 'scaleshift_1_w': {'value': None, 'shape': None, 'kind': 'data'}, 'scaleshift_1_b': {'value': None, 'shape': None, 'kind': 'data'}, 'scaleshift_1_data': {'value': None, 'shape': None, 'kind': 'data'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'}, + 'op_output_1': { 'kind': 'op', 'op': 'OpOutput'} + } @@ -45,19 +48,21 @@ class TestScaleShift_To_Preprocess(unittest.TestCase): ('placeholder_1_data', 'scaleshift_1'), ('scaleshift_1', 'scaleshift_1_data'), ('scaleshift_1_w', 'scaleshift_1'), - ('scaleshift_1_b', 'scaleshift_1')], + ('scaleshift_1_b', 'scaleshift_1'), + ('scaleshift_1_data', 'op_output') + ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.ones(3)}, 'scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([-1, -2, -3])}, - 'scaleshift_1_data': {'is_output': True} }) del graph['placeholder_1']['placeholder_1_data'][0]['in'] del graph['scaleshift_1']['scaleshift_1_data'][0]['in'] graph_ref = build_graph(nodes_attributes, - [('placeholder_1', 'scaleshift_1_data')], - {'scaleshift_1_data': {'is_output': True}}) + [('placeholder_1', 'scaleshift_1_data'), + ('scaleshift_1_data', 'op_output') + ]) move_scaleshift_to_preprocess(graph) self.assertTrue(graph.graph['mean_values'] is not None) @@ -72,11 +77,13 @@ class TestScaleShift_To_Preprocess(unittest.TestCase): ('placeholder_1_data', 'scaleshift_1'), ('scaleshift_1', 'scaleshift_1_data'), ('scaleshift_1_w', 'scaleshift_1'), - ('scaleshift_1_b', 'scaleshift_1')], + ('scaleshift_1_b', 'scaleshift_1'), + ('scaleshift_1_data', 'op_output'), + ('placeholder_1_data', 'op_output_1') + ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.array((1, 2, 3))}, 'scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([-1, -2, -3])}, - 'scaleshift_1_data': {'is_output': True} }) del graph['placeholder_1']['placeholder_1_data'][0]['in'] @@ -87,11 +94,13 @@ class TestScaleShift_To_Preprocess(unittest.TestCase): ('placeholder_1_data', 'scaleshift_1'), ('scaleshift_1', 'scaleshift_1_data'), ('scaleshift_1_w', 'scaleshift_1'), - ('scaleshift_1_b', 'scaleshift_1')], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3]), 'is_output': True}, + ('scaleshift_1_b', 'scaleshift_1'), + ('placeholder_1_data', 'op_output_1'), + ('scaleshift_1_data', 'op_output') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.array((1, 2, 3))}, 'scaleshift_1_b': {'shape': np.array([3]), 'value': np.array([-1, -2, -3])}, - 'scaleshift_1_data': {'is_output': True} }) move_scaleshift_to_preprocess(graph) @@ -105,10 +114,12 @@ class TestScaleShift_To_Preprocess(unittest.TestCase): [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'scaleshift_1'), ('scaleshift_1', 'scaleshift_1_data'), - ('scaleshift_1_w', 'scaleshift_1'), ], + ('scaleshift_1_w', 'scaleshift_1'), + ('scaleshift_1_data', 'op_output'), + ('placeholder_1_data', 'op_output_1') + ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.array((1, 2, 3))}, - 'scaleshift_1_data': {'is_output': True} }) del graph['placeholder_1']['placeholder_1_data'][0]['in'] @@ -118,10 +129,12 @@ class TestScaleShift_To_Preprocess(unittest.TestCase): [('placeholder_1', 'placeholder_1_data'), ('placeholder_1_data', 'scaleshift_1'), 
('scaleshift_1', 'scaleshift_1_data'), - ('scaleshift_1_w', 'scaleshift_1')], - {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3]), 'is_output': True}, + ('scaleshift_1_w', 'scaleshift_1'), + ('scaleshift_1_data', 'op_output'), + ('placeholder_1_data', 'op_output_1') + ], + {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.array((1, 2, 3))}, - 'scaleshift_1_data': {'is_output': True} }) move_scaleshift_to_preprocess(graph) @@ -136,19 +149,21 @@ class TestScaleShift_To_Preprocess(unittest.TestCase): ('placeholder_1_data', 'scaleshift_1'), ('scaleshift_1', 'scaleshift_1_data'), ('scaleshift_1_w', 'scaleshift_1'), - ('scaleshift_1_b', 'scaleshift_1')], + ('scaleshift_1_b', 'scaleshift_1'), + ('scaleshift_1_data', 'op_output') + ], {'placeholder_1_data': {'shape': np.array([1, 227, 227, 3])}, 'scaleshift_1_w': {'shape': np.array([3]), 'value': np.ones(3)}, 'scaleshift_1_b': {'shape': np.array([3]), 'value': np.zeros(3)}, - 'scaleshift_1_data': {'is_output': True} }) del graph['placeholder_1']['placeholder_1_data'][0]['in'] del graph['scaleshift_1']['scaleshift_1_data'][0]['in'] graph_ref = build_graph(nodes_attributes, - [('placeholder_1', 'scaleshift_1_data')], - {'scaleshift_1_data': {'is_output': True}}) + [('placeholder_1', 'scaleshift_1_data'), + ('scaleshift_1_data', 'op_output') + ]) move_scaleshift_to_preprocess(graph) self.assertTrue(graph.graph.get('mean_values', None) is None) diff --git a/model-optimizer/mo/middle/passes/pool.py b/model-optimizer/mo/middle/passes/pool.py deleted file mode 100644 index a819cda..0000000 --- a/model-optimizer/mo/middle/passes/pool.py +++ /dev/null @@ -1,90 +0,0 @@ -""" - Copyright (c) 2018 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
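# move_scaleshift_to_preprocess() above folds an input-side ScaleShift with unit weights
# into mean-value preprocessing. The arithmetic it relies on, checked in numpy; the sign
# convention (mean = -bias) is an assumption consistent with the test data above, where
# the bias is [-1, -2, -3].
import numpy as np

x = np.random.randn(1, 227, 227, 3)
weights = np.ones(3)
bias = np.array([-1., -2., -3.])
scaleshift_out = x * weights + bias
mean_values = -bias  # assumed convention; the pass records it in graph.graph['mean_values']
assert np.allclose(scaleshift_out, x - mean_values)
print('ScaleShift(w=1, b) on the input == subtracting mean values -b')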
-""" - -import networkx as nx -import numpy as np - -from mo.graph.graph import create_edge -from mo.middle.pattern_match import apply_pattern -from mo.ops.op import Op, PermuteAttrs -from mo.ops.reshape import Reshape - - -def mean_to_avgpool_action(graph: nx.MultiDiGraph, matches: dict): - if matches['axis'].value is None or matches['input'].shape is None: - return - dims = len(matches['input'].shape) - ones = np.ones(dims, dtype=np.int64) - axis = np.array(matches['axis'].value) - axis = axis if axis.ndim != 0 else np.array([axis], dtype=np.int64) - - mean = graph.node[matches['mean'].node] - mean['stride'] = np.array(ones) - # TODO: need to check axis with real layout - spatial_dims = np.array(axis) - mean['spatial_dims'] = spatial_dims - mean['pad'] = np.zeros((dims, 2), np.int64) - mean['pad_spatial_shape'] = np.array(mean['pad'][spatial_dims]) - window = np.array(ones) - window[spatial_dims] = matches['input'].shape[spatial_dims] - mean['window'] = window - mean['TF_op'] = mean['op'] - mean['op'] = 'AvgPool' - mean['pool_method'] = 'avg' - mean['rounding_type'] = 'ceil' - mean['exclude_pad'] = 'true' - mean['kernel_spatial'] = window[spatial_dims] - graph.remove_edge(matches['axis'].node, matches['mean'].node) - mean['permute_attrs'] = PermuteAttrs().update_attrs(attrs=[('pad', 'input:0'), - ('stride', 'input:0'), - ('window', 'input:0'), - ('spatial_dims', 'input:0')]) - - if matches['mean'].keep_dims == False: - output = matches['mean'].out_node() - pool_node = matches['mean'] - - # Keep dims for AvgPool - shape = np.array(output.shape) - for idx in spatial_dims: - shape = np.insert(shape, idx, 1) - - graph.remove_edge(pool_node.id, output.id) - # Create new data for pool with all dims - pool_data = Op.create_data_node(graph, pool_node, {'shape': np.array(shape)}) - # Create and connect reshape node - reshape_op = Reshape(graph, {'dim': np.array(output.shape)}) - reshape_node = reshape_op.create_node([pool_data], dict(name='Reshape_', - permute_attrs=PermuteAttrs().update_attrs(attrs=[('dim', 'output:0')]))) - create_edge(reshape_node, output) - - -def mean_to_avgpool(graph: nx.MultiDiGraph): - """ - Translate Mean as a average pooling with kernel size equals to reduced dimensions and with no padding. - """ - apply_pattern( - graph, - nodes=[ - ('input', dict(kind='data')), - ('axis', dict(kind='data')), - ('mean', dict(kind='op', op='Mean'))], - edges=[ - ('input', 'mean', {'in': 0}), - ('axis', 'mean', {'in': 1})], - action=mean_to_avgpool_action - ) - return graph diff --git a/model-optimizer/mo/middle/passes/shape.py b/model-optimizer/mo/middle/passes/shape.py index 647502b..e98a2ac 100644 --- a/model-optimizer/mo/middle/passes/shape.py +++ b/model-optimizer/mo/middle/passes/shape.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,12 +16,12 @@ import logging as log -import networkx as nx import numpy as np +from mo.front.common.partial_infer.utils import int64_array from mo.front.extractor import update_attrs -from mo.graph.graph import Node, create_edge -from mo.middle.passes.eliminate import remove_op_node_with_data_node, merge_data_nodes, graph_clean_up_tf, get_nodes_with_attributes +from mo.graph.graph import Node, Graph +from mo.middle.passes.eliminate import remove_op_node_with_data_node, merge_data_nodes, graph_clean_up_tf from mo.middle.passes.fusing.helpers import get_next_operation from mo.middle.pattern_match import apply_pattern from mo.ops.op import PermuteAttrs, Op @@ -30,7 +30,7 @@ from mo.utils.error import Error from mo.utils.utils import refer_to_faq_msg -def reshape_squeeze_transform(graph: nx.MultiDiGraph, match: dict): +def reshape_squeeze_transform(graph: Graph, match: dict): reshape = match['reshape'] output = match['output'] if output.shape is None: @@ -42,11 +42,9 @@ def reshape_squeeze_transform(graph: nx.MultiDiGraph, match: dict): # do not override value 'dim' if it is set. It may contain specific values like -1 and 0 reshape['dim'] = reshape.shape.copy() update_attrs(reshape, 'shape_attrs', 'dim') - if 'shape' in match: - graph.remove_edge(match['shape'].node, match['reshape'].node) -def convert_squeeze(graph: nx.MultiDiGraph): +def convert_squeeze(graph: Graph): apply_pattern( graph, nodes=[ @@ -57,7 +55,7 @@ def convert_squeeze(graph: nx.MultiDiGraph): ) -def convert_reshape(graph: nx.MultiDiGraph): +def convert_reshape(graph: Graph): apply_pattern( graph, nodes=[ @@ -107,12 +105,12 @@ def can_repack_fully_connected_weights_nhwc_to_nchw(fc_node: Node): return False -def repack_fully_connected_weights_nhwc_to_nchw(graph: nx.MultiDiGraph): +def repack_fully_connected_weights_nhwc_to_nchw(graph: Graph): """ Repack weights of FullyConnected layer as a part of nhwc_to_nchw translation if Reshape of that involves dimensions that we are repacking appears right before FullyConnected layer. 
""" - for node_id in get_nodes_with_attributes(graph, type='FullyConnected'): + for node_id in graph.get_nodes_with_attributes(type='FullyConnected'): fc_node = Node(graph, node_id) if not can_repack_fully_connected_weights_nhwc_to_nchw(fc_node): @@ -146,7 +144,7 @@ def repack_fully_connected_weights_nhwc_to_nchw(graph: nx.MultiDiGraph): weights.value = np.transpose(weights.value.reshape(tmp_shape), (2, 0, 1, 3)).reshape(weights.shape) -def apply_nhwc_to_nchw_permutation(graph: nx.MultiDiGraph): +def apply_nhwc_to_nchw_permutation(graph: Graph): # Add NHWC to NCHW permutation for all data nodes (only for nodes without permutation) if graph.graph['layout'] == 'NCHW': return @@ -181,7 +179,7 @@ def apply_nhwc_to_nchw_permutation(graph: nx.MultiDiGraph): PermuteAttrs.set_permutation(node, out_node, permutation) -def merge_nodes_permutations(graph: nx.MultiDiGraph): +def merge_nodes_permutations(graph: Graph): # Iterate over all data nodes and check all permutations for similarity # In case of equal permutations, this permutation will be set as attribute for data node # otherwise exception will be raised @@ -228,7 +226,7 @@ def merge_nodes_permutations(graph: nx.MultiDiGraph): node.permutation = None -def permute_data_nodes_attrs(graph: nx.MultiDiGraph): +def permute_data_nodes_attrs(graph: Graph): # Iterate over all data nodes and apply permutation if exists for node in graph.nodes(): node = Node(graph, node) @@ -245,7 +243,7 @@ def permute_data_nodes_attrs(graph: nx.MultiDiGraph): node.value = np.array(node.value.transpose(node.permutation.perm)) -def permute_op_nodes_attrs(graph: nx.MultiDiGraph): +def permute_op_nodes_attrs(graph: Graph): for node in graph.nodes(): node = Node(graph, node) if node.kind == 'op' and node.has_valid('permute_attrs'): @@ -255,7 +253,7 @@ def permute_op_nodes_attrs(graph: nx.MultiDiGraph): raise Error('Can\'t permute attrs for node {}. 
Error message: {}'.format(node.id, e)) -def reverse_input_channels(graph: nx.MultiDiGraph): +def reverse_input_channels(graph: Graph): """ Searchers for all type=Input nodes with 4D output tensors, tracks tensors down through non-shape-changing ops to the first type=Convolution or other channel-dependent nodes @@ -311,6 +309,8 @@ def reverse_input_channels(graph: nx.MultiDiGraph): if conv.op == 'DepthwiseConv2dNative': log.debug('out nodes: {}'.format(conv.out_node())) bottoms = conv.out_node().out_nodes() + if len(bottoms) == 1 and bottoms[0].op == 'FakeQuantWithMinMaxVars': + bottoms = bottoms[0].out_node().out_nodes() log.debug('bottoms: {}'.format(bottoms)) log.debug('assumed conv: name = {}, op = {}'.format(bottoms[0].name, bottoms[0].op)) if len(bottoms) > 0 and bottoms[0].op == 'Conv2D': @@ -349,12 +349,13 @@ def reverse_input_channels(graph: nx.MultiDiGraph): 'complete the flip') conv.in_node(1).value = np.flip(conv.in_node(1).value, conv.in_node(1).input_channel_dim) + conv.in_node(1).shape = int64_array(conv.in_node(1).value.shape) log.debug('Applied reversing input channels for weights of convolution {}'.format(conv.id)) log.debug('Shape was (shape){}, (value.shape){}'.format(conv.in_node(1).shape, conv.in_node(1).value.shape)) log.debug('Flipped dim: {}'.format(conv.in_node(1).input_channel_dim)) -def conv_flatten_concat_action(graph: nx.MultiDiGraph, match: dict): +def conv_flatten_concat_action(graph: Graph, match: dict): assert graph.graph['layout'] == 'NHWC' reshape_node = match['reshape'] reshape_data_node = match['reshape_data'] @@ -370,18 +371,18 @@ def conv_flatten_concat_action(graph: nx.MultiDiGraph, match: dict): log.info('There is a FullyConnected layer after the node "{}" which weights will be repacked. So there is no ' 'need to insert Permute'.format(reshape_node.soft_get('name'))) return - assert len(graph.in_edges(reshape_node.id)) == 1 graph.remove_edge(conv_data_node.id, reshape_node.id) permutation_order = PermuteAttrs.get_nchw_to_nhwc_permutation(len(conv_data_node.shape)).perm new_permute_op = Permute(graph, {'order': permutation_order}) permute_data_node = new_permute_op.create_node_with_data([conv_data_node], dict(name=conv_name + '/Permute_')) - create_edge(permute_data_node, reshape_node) + graph.create_edge(permute_data_node, reshape_node) # Disable permutation for Reshape and Concat layers attributes PermuteAttrs.set_permutation(reshape_node, reshape_data_node, None) + reshape_node['nchw_layout'] = True -def conv_flatten_concat(graph: nx.MultiDiGraph): +def conv_flatten_concat(graph: Graph): apply_pattern( graph, nodes=[ @@ -419,12 +420,12 @@ def conv_flatten_concat(graph: nx.MultiDiGraph): ) -def fuse_sequence_of_reshapes(graph: nx.MultiDiGraph): +def fuse_sequence_of_reshapes(graph: Graph): for node in list(graph.nodes()): - node = Node(graph, node) - if not graph.has_node(node.id): + if not graph.has_node(node): # data node can be already removed continue + node = Node(graph, node) if ( node.has_valid('type') and node.type == 'Reshape' and len(node.out_nodes()) == 1 and node.out_node().has_valid('kind') and node.out_node().kind == 'data' and @@ -439,3 +440,22 @@ def fuse_sequence_of_reshapes(graph: nx.MultiDiGraph): # Remove Reshape1 log.debug('Second phase for Reshape: {}'.format(node.name)) remove_op_node_with_data_node(graph, node) + + reshape_nodes = graph.get_op_nodes(op='Reshape') + for reshape_node in reshape_nodes: + in_ports = [port for port in reshape_node.in_ports().values() if not port.disconnected()] + assert len(in_ports) in [1, 2], 
"`Reshape` node must have 2 inputs or 1 input with `dim`" + if len(in_ports) == 2: + previous_dim_op = reshape_node.in_port(1).get_source().node.op + if previous_dim_op != 'Const': + continue + dim = reshape_node.in_port(1).get_connection().data.get_value() + else: + assert reshape_node.has_valid('dim'), "`Reshape` node with 1 input must have `dim` attribute" + dim = reshape_node.dim + + in_shape = reshape_node.in_port(0).get_connection().data.get_shape() + + if np.array_equal(dim, in_shape) and len(reshape_node.out_nodes()): + log.debug("Useless reshape with dim {} was deleted: {}".format(str(dim), reshape_node.name)) + reshape_node.out_port(0).get_connection().set_source(reshape_node.in_port(0).get_source()) diff --git a/model-optimizer/mo/middle/passes/shared_weights_duplication.py b/model-optimizer/mo/middle/passes/shared_weights_duplication.py deleted file mode 100644 index 9458386..0000000 --- a/model-optimizer/mo/middle/passes/shared_weights_duplication.py +++ /dev/null @@ -1,45 +0,0 @@ -""" - Copyright (c) 2018 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import networkx as nx -import numpy as np - -from mo.graph.graph import Node -from mo.ops.op import Op -from mo.utils.error import Error - - -def duplicate_shared_weights(graph: nx.MultiDiGraph): - """ - This function finds all const data nodes that have more that one consumer and then duplicate them - """ - data_nodes = [Node(graph, id) for id in graph.nodes() if Node(graph, id).soft_get('kind') == 'data'] - for node in data_nodes: - # Check that node has const values and more than one consumer - if len(node.out_nodes()) > 1 and node.value is not None: - # Here we delete all edges between base node and it's consumers (except first), and then duplicate this - # node to connect with other consumers - while len(node.out_nodes()) > 1: - out_node = node.out_node(1) - - if len(graph.get_edge_data(node.id, out_node.id)) != 1: - raise Error('There is more than one edge from {} node to {} node.'.format(node.id, out_node.id)) - e_attrs = graph.get_edge_data(node.id, out_node.id)[0] - - graph.remove_edge(node.id, out_node.id) - data = Op.create_input_data_node(graph, "Copy_{}".format(node.id), np.array(node.value), graph.node[node.id]) - - graph.add_edges_from([(data.id, out_node.id, e_attrs)]) diff --git a/model-optimizer/mo/middle/passes/tensor_names.py b/model-optimizer/mo/middle/passes/tensor_names.py index 97efb3d..7b8abb2 100644 --- a/model-optimizer/mo/middle/passes/tensor_names.py +++ b/model-optimizer/mo/middle/passes/tensor_names.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,17 +14,14 @@ limitations under the License. 
""" -import json -from collections import defaultdict -from xml.etree.ElementTree import Element, SubElement, tostring -from xml.dom.minidom import parseString -import networkx as nx +from defusedxml.minidom import parseString +from xml.etree.ElementTree import Element, SubElement, tostring -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph -def propagate_op_name_to_tensor(graph: nx.MultiDiGraph): +def propagate_op_name_to_tensor(graph: Graph): for node in graph.nodes(): node = Node(graph, node) if node.kind == 'op' and node.has_valid('name'): @@ -35,7 +32,7 @@ def propagate_op_name_to_tensor(graph: nx.MultiDiGraph): out_node['ie_tensor_id'] = node.node -def output_tensor_names_map(graph: nx.MultiDiGraph, xml_file_name: str): +def output_tensor_names_map(graph: Graph, xml_file_name: str): mapping = Element('mapping') for node in graph: node = Node(graph, node) diff --git a/model-optimizer/mo/middle/pattern_match.py b/model-optimizer/mo/middle/pattern_match.py index f1ea8cf..0e260f4 100644 --- a/model-optimizer/mo/middle/pattern_match.py +++ b/model-optimizer/mo/middle/pattern_match.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,14 +19,14 @@ import logging as log import networkx as nx from networkx.algorithms import isomorphism as ism -from mo.graph.graph import Node, dict_includes +from mo.graph.graph import Node, dict_includes, Graph def inverse_dict(d: dict): return {v: k for k, v in d.items()} -def for_each_sub_graph(graph: nx.MultiDiGraph, func: callable): +def for_each_sub_graph(graph: Graph, func: callable): """ Run a given function `func` for each sub-graph in a given graph not recursively. It doesn't search for sub-graphs in found sub-graphs recursively. If the recursion is required, @@ -39,7 +39,7 @@ def for_each_sub_graph(graph: nx.MultiDiGraph, func: callable): func(node[sub_graph_name]) -def for_each_sub_graph_recursively(graph: nx.MultiDiGraph, func: callable): +def for_each_sub_graph_recursively(graph: Graph, func: callable): """ Run a given function `func` for each sub-graph in a given graph `graph` recursively. A given function `func` shouldn't contain a recursion for sub-graphs of the second level. @@ -53,7 +53,7 @@ def for_each_sub_graph_recursively(graph: nx.MultiDiGraph, func: callable): for_each_sub_graph(graph, recursive_helper) -def for_graph_and_each_sub_graph_recursively(graph: nx.MultiDiGraph, func: callable): +def for_graph_and_each_sub_graph_recursively(graph: Graph, func: callable): """ Run a given function `func` for a given graph `graph` and each sub-graph recursively. 
""" func(graph) for_each_sub_graph_recursively(graph, func) @@ -63,7 +63,7 @@ def all_edges_in_nodes(nodes: list, edges: list): return all([edge[0] in nodes and edge[1] in nodes for edge in edges]) -def apply_pattern(graph: nx.MultiDiGraph, nodes: list, edges: list, action: callable, node_attrs: list = None, +def apply_pattern(graph: Graph, nodes: list, edges: list, action: callable, node_attrs: list = None, edge_attrs: list = None): """ Search for all matches of a given subgraph defined by [nodes, edges] in graph, @@ -114,7 +114,8 @@ def check_node_usages_out_of_match(match: dict, node_name_in_match_group: str): def node_match(data1: dict, data2: dict): - return dict_includes(data1, data2) + # We have to skip _in_ports/_out_ports attributes for comparision as they are not comparable + return dict_includes(data1, data2, skip_attr_names=['_in_ports', '_out_ports']) def edge_match(datasets1, datasets2): @@ -130,7 +131,7 @@ def edge_match(datasets1, datasets2): return values1 == values2 -def build_matcher(graph: nx.MultiDiGraph, nodes: list, edges: list, node_attrs: list = None, +def build_matcher(graph: Graph, nodes: list, edges: list, node_attrs: list = None, edge_attrs: list = None): if node_attrs is not None or edge_attrs is not None: log.warning('\'edge_attrs\' or `\'node_attrs\'` parameter was passed to function \'find_pattern_matches\', ' @@ -139,13 +140,13 @@ def build_matcher(graph: nx.MultiDiGraph, nodes: list, edges: list, node_attrs: 'matching function like \'find_pattern_matches\', \'apply_pattern\' and \'pattern\' because it ' 'will be deprecated in the next release.') - subgraph = nx.MultiDiGraph(name='pattern') + subgraph = Graph(name='pattern') subgraph.add_nodes_from(nodes) subgraph.add_edges_from(edges) return ism.MultiDiGraphMatcher(graph, subgraph, node_match, edge_match) -def find_pattern_matches(graph: nx.MultiDiGraph, nodes: list, edges: list, node_attrs: list = None, +def find_pattern_matches(graph: Graph, nodes: list, edges: list, node_attrs: list = None, edge_attrs: list = None): """ Find all matches of a given sub-graph defined by [nodes, edges] in graph. @@ -154,7 +155,7 @@ def find_pattern_matches(graph: nx.MultiDiGraph, nodes: list, edges: list, node_ return matcher.subgraph_isomorphisms_iter() -def find_isomorphisms(graph: nx.MultiDiGraph, nodes: list, edges: list): +def find_isomorphisms(graph: Graph, nodes: list, edges: list): ''' Find for isomorphism between a given graph and a pattern specified by a given nodes and edges. Applies the same rules as apply_pattern. ''' diff --git a/model-optimizer/mo/middle/replacement.py b/model-optimizer/mo/middle/replacement.py index 82cadc5..752d544 100644 --- a/model-optimizer/mo/middle/replacement.py +++ b/model-optimizer/mo/middle/replacement.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -22,6 +22,14 @@ class MiddleReplacementPattern(ReplacementPattern): registered_ops = {} registered_cls = [] + def run_after(self): + from extensions.middle.pass_separator import MiddleStart + return [MiddleStart] + + def run_before(self): + from extensions.middle.pass_separator import MiddleFinish + return [MiddleFinish] + @classmethod def class_type(cls): return class_registration.ClassType.MIDDLE_REPLACER diff --git a/model-optimizer/mo/ops/activation.py b/model-optimizer/mo/ops/activation.py index 95111f7..971a3de 100644 --- a/model-optimizer/mo/ops/activation.py +++ b/model-optimizer/mo/ops/activation.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,11 +14,10 @@ limitations under the License. """ -import networkx as nx import numpy as np from mo.front.common.partial_infer.eltwise import eltwise_infer -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op @@ -37,14 +36,17 @@ class Activation(Op): 'tanh': lambda x: np.tanh(x), 'elu': lambda x, alpha: Activation.elu(x, alpha), 'sigmoid': lambda x: 1 / (1 + np.exp(-x)), - 'relu6': lambda x: np.maximum(0, np.minimum(x, 6)) + 'relu6': lambda x: np.maximum(0, np.minimum(x, 6)), + 'exp': lambda x: np.exp(x), } - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'type': __class__.op, 'op': __class__.op, - 'infer': Activation.infer + 'infer': Activation.infer, + 'in_ports_count': 1, + 'out_ports_count': 1, }, attrs) @classmethod diff --git a/model-optimizer/mo/ops/activation_test.py b/model-optimizer/mo/ops/activation_test.py index b289b96..5dbc07b 100644 --- a/model-optimizer/mo/ops/activation_test.py +++ b/model-optimizer/mo/ops/activation_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/ops/clamp.py b/model-optimizer/mo/ops/clamp.py index ce6bfc5..05e551c 100644 --- a/model-optimizer/mo/ops/clamp.py +++ b/model-optimizer/mo/ops/clamp.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,21 +14,22 @@ limitations under the License. 
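The new run_after/run_before defaults pin every middle transform between the MiddleStart and MiddleFinish separator passes, and a subclass can override them to order itself against specific passes instead. A sketch of a hypothetical pass, assuming it lives inside the model-optimizer source tree (the class name and the find_and_replace_pattern override are illustrative):

from extensions.middle.pass_separator import MiddleStart, MiddleFinish
from mo.middle.replacement import MiddleReplacementPattern

class MyMiddlePass(MiddleReplacementPattern):  # hypothetical pass
    enabled = True

    def run_after(self):
        return [MiddleStart]   # same as the new inherited default

    def run_before(self):
        return [MiddleFinish]

    def find_and_replace_pattern(self, graph):
        pass  # the actual graph transformation would go here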
""" -import networkx as nx - from mo.front.common.partial_infer.elemental import copy_shape_infer +from mo.graph.graph import Graph from mo.ops.op import Op class Clamp(Op): op = 'Clamp' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'kind': 'op', 'type': __class__.op, 'op': __class__.op, - 'infer': copy_shape_infer + 'infer': copy_shape_infer, + 'in_ports_count': 1, + 'out_ports_count': 1, }, attrs) def supported_attrs(self): diff --git a/model-optimizer/mo/ops/clamp_test.py b/model-optimizer/mo/ops/clamp_test.py index 66e38e2..0cdf556 100644 --- a/model-optimizer/mo/ops/clamp_test.py +++ b/model-optimizer/mo/ops/clamp_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/ops/concat.py b/model-optimizer/mo/ops/concat.py index b13c19f..1e04f01 100644 --- a/model-optimizer/mo/ops/concat.py +++ b/model-optimizer/mo/ops/concat.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ class Concat(Op): 'op': __class__.op, 'axis': 1, 'infer': concat_infer, + 'out_ports_count': 1, }, attrs) def supported_attrs(self): diff --git a/model-optimizer/mo/ops/concat_test.py b/model-optimizer/mo/ops/concat_test.py index 7f39236..c03877d 100644 --- a/model-optimizer/mo/ops/concat_test.py +++ b/model-optimizer/mo/ops/concat_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/ops/const.py b/model-optimizer/mo/ops/const.py index 3511a1b..adfcccc 100644 --- a/model-optimizer/mo/ops/const.py +++ b/model-optimizer/mo/ops/const.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -34,8 +34,9 @@ class Const(Op): 'value': None, 'shape': None, 'data_type': None, + 'out_ports_count': 1, }, attrs) if not isinstance(self.attrs['value'], np.ndarray): - self.attrs['value'] = np.array([self.attrs['value']]) + self.attrs['value'] = np.array(self.attrs['value']) self.attrs['shape'] = np.array(self.attrs['value'].shape, dtype=np.int64) diff --git a/model-optimizer/mo/ops/convolution.py b/model-optimizer/mo/ops/convolution.py index e6bcdee..96855eb 100644 --- a/model-optimizer/mo/ops/convolution.py +++ b/model-optimizer/mo/ops/convolution.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,14 +16,13 @@ import logging as log -import networkx as nx import numpy as np from mo.front.common.partial_infer.utils import int64_array, float_array, mark_input_bins, assign_dims_to_weights, \ tf_window_op_pad_infer from mo.front.extractor import spatial_getter from mo.front.onnx.extractors.utils import get_backend_pad -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op, PermuteAttrs from mo.utils.error import Error @@ -31,12 +30,16 @@ from mo.utils.error import Error class Convolution(Op): op = 'Convolution' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'kind': 'op', 'type': __class__.op, 'op': __class__.op, 'infer': __class__.infer, + 'multiplication_transparent': True, + 'multiplication_transparent_ports': [(0, 0), (1, 0)], + 'in_ports_count': 3, + 'out_ports_count': 1, }, attrs) def backend_attrs(self): @@ -49,7 +52,10 @@ class Convolution(Op): ('pads_begin', lambda node: ','.join(map(str, get_backend_pad(node.pad, node.spatial_dims, 0)))), ('pads_end', lambda node: ','.join(map(str, get_backend_pad(node.pad, node.spatial_dims, 1)))), - 'output' + 'output', + 'pad_value', + 'mode', + 'input', ] def backend_attrs_v2(self): @@ -176,6 +182,9 @@ class Convolution(Op): node['pad'] = np.array([[0, 0]] * len(input_shape), dtype=np.int64) node['pad_spatial_shape'] = node.pad[node.spatial_dims] + if not node.has_valid('output_padding'): + node['output_padding'] = np.full([len(input_shape)], 0, dtype=np.int64) + input_spatial_shape = input_shape[node.spatial_dims] stride_spatial_shape = node.stride[node.spatial_dims] @@ -185,9 +194,11 @@ class Convolution(Op): # Caffe do not use auto_pad attribute if node.has_valid('auto_pad') and not node.has_valid('output_spatial_shape'): node['pad_spatial_shape'], node['output_spatial_shape'] = tf_window_op_pad_infer(input_spatial_shape, - kernel_extent, - stride_spatial_shape, - node.auto_pad) + kernel_extent, + stride_spatial_shape, + node.auto_pad, + node.type == 'Deconvolution') + pad = np.zeros((len(input_shape), 2), dtype=np.int64) pad[node.spatial_dims] = node.pad_spatial_shape node.pad = pad @@ -208,7 +219,7 @@ class Convolution(Op): return else: output_padding = node.output_padding[node.spatial_dims] if node.has_valid('output_padding') else None - if output_padding is not None: + if output_padding is not None and any(output_padding): pad_spatial_shape -= output_padding for dim in range(len(pad_spatial_shape)): node.pad_spatial_shape[dim][1] -= pad_spatial_shape[dim] @@ -226,14 +237,14 @@ class Convolution(Op): if node.has_valid('get_group'): node['group'] = node.get_group(node) output_shape = np.full_like(input_shape, -1, dtype=np.int64) - output_shape[node.batch_dims] = input_shape[node.batch_dims] - output_shape[node.spatial_dims] = node.output_spatial_shape + output_shape[node.batch_dims] = input_shape[node.batch_dims] # pylint: disable=unsupported-assignment-operation + output_shape[node.spatial_dims] = node.output_spatial_shape # pylint: disable=unsupported-assignment-operation # For cases when output attribute wasn't set in extractor we should specify get_output_feature_dim attribute # this attribute should store lambda node: ... 
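For context on the inference code above: in the explicit-padding branch, each output spatial dimension follows the usual floor formula, with the kernel extent assumed to already include dilation, i.e. (K - 1) * d + 1. A self-contained sketch (the helper name is illustrative):

import numpy as np

def conv_output_spatial(input_spatial, kernel_extent, stride, pads_begin, pads_end):
    # floor((I + pad_begin + pad_end - K_ext) / S) + 1, computed per spatial axis
    i = np.array(input_spatial, dtype=np.int64)
    return (i + pads_begin + pads_end - kernel_extent) // stride + 1

# 227x227 input, 3x3 kernel, stride 1, no padding -> 225x225
print(conv_output_spatial([227, 227], np.array([3, 3]), np.array([1, 1]),
                          np.array([0, 0]), np.array([0, 0])))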
(check tf convolution extractor) if node.has_valid('get_output_feature_dim'): node['output'] = node.get_output_feature_dim(node) - output_shape[node.channel_dims] = node.output + output_shape[node.channel_dims] = node.output # pylint: disable=unsupported-assignment-operation node['output_shape'] = output_shape for n in node.out_nodes(): diff --git a/model-optimizer/mo/ops/convolution_test.py b/model-optimizer/mo/ops/convolution_test.py index 6f009b5..51d0395 100644 --- a/model-optimizer/mo/ops/convolution_test.py +++ b/model-optimizer/mo/ops/convolution_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -27,7 +27,8 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'conv_input': {'value': None, 'kind': 'data'}, 'conv_node': {'type': 'Convolution', 'kind': 'op'}, 'conv_weights': {'value': FakeValue(None), 'kind': 'data'}, - 'conv_output': {'value': None, 'kind': 'data'} + 'conv_output': {'value': None, 'kind': 'data'}, + 'output_op': { 'kind': 'op', 'op': 'OpOutput'} } @@ -36,8 +37,10 @@ class TestConvolutionPartialInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('conv_input', 'conv_node'), ('conv_weights', 'conv_node'), - ('conv_node', 'conv_output')], - {'conv_output': {'is_output': True, 'shape': None}, + ('conv_node', 'conv_output'), + ('conv_output', 'op_output') + ], + {'conv_output': {'shape': None}, 'conv_input': {'shape': np.array([1, 3, 227, 227])}, 'conv_weights': {'shape': np.array([64, 3, 3, 3]), 'dim_attrs': ['spatial_dims', 'channel_dims', 'batch_dims', 'axis']}, @@ -65,8 +68,10 @@ class TestConvolutionPartialInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('conv_input', 'conv_node'), ('conv_weights', 'conv_node'), - ('conv_node', 'conv_output')], - {'conv_output': {'is_output': True, 'shape': None}, + ('conv_node', 'conv_output'), + ('conv_output', 'op_output') + ], + {'conv_output': {'shape': None}, 'conv_input': {'shape': None}, 'conv_weights': {'shape': None, 'dim_attrs': ['spatial_dims', 'channel_dims', 'batch_dims', 'axis']}, @@ -89,8 +94,10 @@ class TestConvolutionPartialInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('conv_input', 'conv_node'), ('conv_weights', 'conv_node'), - ('conv_node', 'conv_output')], - {'conv_output': {'is_output': True, 'shape': None}, + ('conv_node', 'conv_output'), + ('conv_output', 'op_output') + ], + {'conv_output': {'shape': None}, 'conv_input': {'shape': np.array([1, 21, 16, 16])}, 'conv_weights': {'shape': np.array([1, 21, 4, 4]), 'dim_attrs': ['spatial_dims', 'channel_dims', 'batch_dims', 'axis']}, @@ -127,8 +134,10 @@ class TestConvolutionPartialInfer(unittest.TestCase): graph = build_graph(nodes_attributes, [('conv_input', 'conv_node'), ('conv_weights', 'conv_node'), - ('conv_node', 'conv_output')], - {'conv_output': {'is_output': True, 'shape': None}, + ('conv_node', 'conv_output'), + ('conv_output', 'op_output') + ], + {'conv_output': {'shape': None}, 'conv_input': {'shape': None}, 'conv_weights': {'shape': np.array([1, 21, 16, 16]), 'dim_attrs': ['spatial_dims', 'channel_dims', 'batch_dims', 'axis']}, @@ -153,11 +162,11 @@ class TestConvolutionPartialInfer(unittest.TestCase): [ ('conv_input', 'conv_node'), ('conv_weights', 'conv_node'), - ('conv_node', 'conv_output') + ('conv_node', 'conv_output'), + ('conv_output', 'op_output') ], { 'conv_output': { - 'is_output': True, 
'shape': None }, 'conv_input': { @@ -227,11 +236,11 @@ class TestConvolutionPartialInfer(unittest.TestCase): [ ('conv_input', 'conv_node'), ('conv_weights', 'conv_node'), - ('conv_node', 'conv_output') + ('conv_node', 'conv_output'), + ('conv_output', 'op_output') ], { 'conv_output': { - 'is_output': True, 'shape': None }, 'conv_input': { @@ -301,11 +310,11 @@ class TestConvolutionPartialInfer(unittest.TestCase): [ ('conv_input', 'conv_node'), ('conv_weights', 'conv_node'), - ('conv_node', 'conv_output') + ('conv_node', 'conv_output'), + ('conv_output', 'op_output') ], { 'conv_output': { - 'is_output': True, 'shape': None }, 'conv_input': { diff --git a/model-optimizer/mo/ops/crop.py b/model-optimizer/mo/ops/crop.py index 4c1875f..1f660c9 100644 --- a/model-optimizer/mo/ops/crop.py +++ b/model-optimizer/mo/ops/crop.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,23 +16,24 @@ import logging as log -import networkx as nx import numpy as np from mo.front.caffe.extractors.utils import get_canonical_axis_index -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op, PermuteAttrs class Crop(Op): op = 'Crop' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'kind': 'op', 'type': __class__.op, 'op': __class__.op, - 'infer': __class__.infer + 'infer': __class__.infer, + 'in_ports_count': 2, + 'out_ports_count': 1, }, attrs) def backend_attrs(self): diff --git a/model-optimizer/mo/ops/crop_test.py b/model-optimizer/mo/ops/crop_test.py index 9eb5412..e93e936 100644 --- a/model-optimizer/mo/ops/crop_test.py +++ b/model-optimizer/mo/ops/crop_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/ops/deconvolution.py b/model-optimizer/mo/ops/deconvolution.py index b4fe12b..829161e 100644 --- a/model-optimizer/mo/ops/deconvolution.py +++ b/model-optimizer/mo/ops/deconvolution.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
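All of the test rewrites in this file follow one convention change: a graph output is now modeled as an explicit terminal node with op 'OpOutput' wired after the last data node, instead of an 'is_output': True flag on that data node. Sketched with build_graph (node names are illustrative):

nodes = {
    'in_data': {'kind': 'data', 'shape': None, 'value': None},
    'some_op': {'kind': 'op', 'type': 'Flatten'},
    'out_data': {'kind': 'data', 'shape': None, 'value': None},
    'op_output': {'kind': 'op', 'op': 'OpOutput'},  # replaces 'is_output': True
}
edges = [('in_data', 'some_op'), ('some_op', 'out_data'), ('out_data', 'op_output')]
# graph = build_graph(nodes, edges)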
@@ -15,7 +15,6 @@ """ import logging as log -import networkx as nx import numpy as np from mo.front.common.partial_infer.utils import int64_array, float_array, mark_input_bins, assign_dims_to_weights, \ @@ -23,19 +22,21 @@ from mo.front.common.partial_infer.utils import int64_array, float_array, mark_i from mo.front.onnx.extractors.utils import get_backend_pad from mo.front.extractor import spatial_getter from mo.utils.error import Error -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op, PermuteAttrs class Deconvolution(Op): op = 'Deconvolution' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'kind': 'op', 'type': __class__.op, 'op': __class__.op, 'infer': __class__.infer, + 'in_ports_count': 3, + 'out_ports_count': 1, }, attrs) def backend_attrs(self): diff --git a/model-optimizer/mo/ops/eltwise.py b/model-optimizer/mo/ops/eltwise.py index 18185f6..eba1956 100644 --- a/model-optimizer/mo/ops/eltwise.py +++ b/model-optimizer/mo/ops/eltwise.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,17 +14,17 @@ limitations under the License. """ -import networkx as nx import numpy as np from mo.front.common.partial_infer.eltwise import eltwise_infer +from mo.graph.graph import Graph from mo.ops.op import Op class Eltwise(Op): op = 'Eltwise' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): operations = { 'sum': ('Add', lambda a, b: a + b), 'mul': ('Mul', lambda a, b: a * b), @@ -35,6 +35,8 @@ class Eltwise(Op): 'type': 'Eltwise', # a property of IE supported layer 'op': operations[attrs['operation']][0], 'infer': lambda node: eltwise_infer(node, operations[node.operation][1]), + 'in_ports_count': 2, + 'out_ports_count': 1, }, attrs) def supported_attrs(self): diff --git a/model-optimizer/mo/ops/eltwise_n.py b/model-optimizer/mo/ops/eltwise_n.py index 8f5eb03..e2060b3 100644 --- a/model-optimizer/mo/ops/eltwise_n.py +++ b/model-optimizer/mo/ops/eltwise_n.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,8 +14,7 @@ limitations under the License. 
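The Eltwise constructor above dispatches through an operations table that maps the Model Optimizer operation name to an IE layer type plus a numpy lambda used by eltwise_infer; only the 'sum' and 'mul' entries are visible in the hunk, so the table below is a trimmed illustration:

import numpy as np

operations = {
    'sum': ('Add', lambda a, b: a + b),
    'mul': ('Mul', lambda a, b: a * b),
}
ie_type, fn = operations['sum']
print(ie_type, fn(np.array([1, 2]), np.array([3, 4])))  # Add [4 6]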
""" -import networkx as nx - +from mo.graph.graph import Graph from mo.ops.op import Op from mo.utils.error import Error @@ -27,11 +26,12 @@ class EltwiseN(Op): """ op = 'EltwiseN' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'op': __class__.op, 'type': None, # type is None because this operation should not appear in IR 'infer': None, + 'out_ports_count': 1, }, attrs) if 'operation' not in self.attrs: raise Error('"operation" attribute is not set for operation "{}".'.format(__class__.op)) diff --git a/model-optimizer/mo/ops/expand_dims.py b/model-optimizer/mo/ops/expand_dims.py index ce790bb..c64b3e8 100644 --- a/model-optimizer/mo/ops/expand_dims.py +++ b/model-optimizer/mo/ops/expand_dims.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,8 +14,8 @@ limitations under the License. """ -from mo.ops.op import Op from mo.front.common.partial_infer.expand_dims import tf_expand_dims_infer +from mo.ops.op import Op class ExpandDims(Op): @@ -28,8 +28,6 @@ class ExpandDims(Op): 'op': __class__.op, 'infer': tf_expand_dims_infer, 'expand_axis': None, + 'in_ports_count': 2, + 'out_ports_count': 1, }, attrs) - - def supported_attrs(self): - # TODO ugly copying from Reshape op - return [('dim', lambda node: ', '.join(map(str, node['dim'])))] diff --git a/model-optimizer/mo/ops/flatten.py b/model-optimizer/mo/ops/flatten.py index 96408e4..05b5412 100644 --- a/model-optimizer/mo/ops/flatten.py +++ b/model-optimizer/mo/ops/flatten.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,12 +14,13 @@ limitations under the License. """ -import networkx as nx -import numpy as np import logging as log +import numpy as np + from mo.front.caffe.extractors.utils import get_canonical_axis_index from mo.front.common.partial_infer.utils import int64_array +from mo.graph.graph import Graph from mo.ops.op import Op @@ -27,11 +28,13 @@ class Flatten(Op): op = 'Flatten' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'type': __class__.op, 'op': __class__.op, 'infer': __class__.infer, + 'in_ports_count': 1, + 'out_ports_count': 1, }, attrs) def supported_attrs(self): diff --git a/model-optimizer/mo/ops/flatten_onnx.py b/model-optimizer/mo/ops/flatten_onnx.py index 07a40c7..e997437 100644 --- a/model-optimizer/mo/ops/flatten_onnx.py +++ b/model-optimizer/mo/ops/flatten_onnx.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,10 +14,11 @@ limitations under the License. 
""" -import networkx as nx -import numpy as np import logging as log +import numpy as np + +from mo.graph.graph import Graph from mo.ops.op import Op @@ -25,16 +26,15 @@ class FlattenONNX(Op): op = 'FlattenONNX' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'type': 'Reshape', 'op': __class__.op, 'infer': __class__.infer, + 'in_ports_count': 2, + 'out_ports_count': 1, }, attrs) - def supported_attrs(self): - return [('dim', lambda node: ','.join(map(str, node['dim'])))] - @staticmethod def infer(node): """ @@ -51,7 +51,9 @@ class FlattenONNX(Op): return if len(node.in_nodes()) != 1: - log.debug('Can\'t calculate output shape for {} node. Number of input nodes should be equal 1 instead of {}'.format(node.name, len(node.in_nodes()))) + log.debug( + 'Can\'t calculate output shape for {} node. Number of input nodes should be equal 1 instead of {}'.format( + node.name, len(node.in_nodes()))) return axis = node.axis @@ -60,5 +62,4 @@ class FlattenONNX(Op): node['dim'] = np.array(dim) node.out_node().shape = np.array(dim) if node.in_node(0).has_valid('value'): - node.out_node().value = node.in_node(0).value - node.out_node().value.shape = np.array(dim) + node.out_node().value = np.reshape(node.in_node(0).value, dim) diff --git a/model-optimizer/mo/ops/flatten_onnx_test.py b/model-optimizer/mo/ops/flatten_onnx_test.py index 1e68fbb..a73aa7f 100644 --- a/model-optimizer/mo/ops/flatten_onnx_test.py +++ b/model-optimizer/mo/ops/flatten_onnx_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/ops/flatten_test.py b/model-optimizer/mo/ops/flatten_test.py index 9d58401..75de344 100644 --- a/model-optimizer/mo/ops/flatten_test.py +++ b/model-optimizer/mo/ops/flatten_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -24,7 +24,8 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'value': None, 'kind': 'data'}, 'flatten_1': {'type': 'Flatten', 'value': None, 'kind': 'op'}, - 'node_2': {'value': None, 'kind': 'data'} + 'node_2': {'value': None, 'kind': 'data'}, + 'output_op': { 'kind': 'op', 'op': 'OpOutput'}, } @@ -32,8 +33,10 @@ class TestFlattenPartialInfer(unittest.TestCase): def test_flatten_infer(self): graph = build_graph(nodes_attributes, [('node_1', 'flatten_1'), - ('flatten_1', 'node_2')], - {'node_2': {'is_output': True, 'shape': np.array([1, 3 * 256 * 256])}, + ('flatten_1', 'node_2'), + ('node_2', 'op_output') + ], + {'node_2': {'shape': np.array([1, 3 * 256 * 256])}, 'node_1': {'shape': np.array([1, 3, 256, 256])}, 'flatten_1': {'axis': 1, 'dim': []} }) @@ -49,8 +52,10 @@ class TestFlattenPartialInfer(unittest.TestCase): def test_flatten_infer_no_shape(self): graph = build_graph(nodes_attributes, [('node_1', 'flatten_1'), - ('flatten_1', 'node_2')], - {'node_2': {'is_output': True, 'shape': None}, + ('flatten_1', 'node_2'), + ('node_2', 'op_output') + ], + {'node_2': {'shape': None}, 'node_1': {'shape': None}, 'flatten_1': {'axis': 1} }) diff --git a/model-optimizer/mo/ops/inner_product.py b/model-optimizer/mo/ops/inner_product.py index 291af9c..3dcf082 100644 --- a/model-optimizer/mo/ops/inner_product.py +++ b/model-optimizer/mo/ops/inner_product.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,9 +14,8 @@ limitations under the License. """ -import networkx as nx - from mo.front.common.partial_infer.inner_product import caffe_inner_product +from mo.graph.graph import Graph from mo.ops.op import Op @@ -24,12 +23,14 @@ class InnerProduct(Op): op = 'FullyConnected' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'type': 'FullyConnected', 'op': 'FullyConnected', 'out-size': None, 'layout': 'NCHW', + 'in_ports_count': 3, + 'out_ports_count': 1, 'infer': caffe_inner_product }, attrs) diff --git a/model-optimizer/mo/ops/inner_product_test.py b/model-optimizer/mo/ops/inner_product_test.py index 22d3c4a..2151ed3 100644 --- a/model-optimizer/mo/ops/inner_product_test.py +++ b/model-optimizer/mo/ops/inner_product_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/ops/input.py b/model-optimizer/mo/ops/input.py index 1aa76af..47b035d 100644 --- a/model-optimizer/mo/ops/input.py +++ b/model-optimizer/mo/ops/input.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,20 +14,20 @@ limitations under the License. 
""" -import networkx as nx - from mo.front.common.partial_infer.elemental import single_output_infer +from mo.graph.graph import Graph from mo.ops.op import Op class Input(Op): op = 'Input' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'kind': 'op', 'type': __class__.op, 'op': 'Placeholder', 'infer': lambda node: single_output_infer(node, lambda n: n.shape), + 'out_ports_count': 1, 'is_input': True }, attrs) diff --git a/model-optimizer/mo/ops/lin_op.py b/model-optimizer/mo/ops/lin_op.py index ff1ec6b..3a3c7b7 100644 --- a/model-optimizer/mo/ops/lin_op.py +++ b/model-optimizer/mo/ops/lin_op.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,32 +17,40 @@ import networkx as nx import numpy as np +from mo.graph.graph import Graph from mo.ops.op import Op from mo.front.common.partial_infer.eltwise import eltwise_infer class LinOp(Op): enabled = False - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'can_be_bias': True, 'can_be_fused': True, 'type': 'Eltwise', 'infer': None, + 'in_ports_count': 2, + 'out_ports_count': 1, }, attrs) def supported_attrs(self): return ['operation'] + class Add(LinOp): enabled = False - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + op = 'Add' + + def __init__(self, graph: Graph, attrs: dict): attrs.update({'op': 'Add', 'operation': 'sum', 'infer': lambda node: eltwise_infer(node, lambda a, b: a + b)}) super().__init__(graph, attrs) class Mul(LinOp): enabled = False - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + op = 'Mul' + + def __init__(self, graph: Graph, attrs: dict): attrs.update({'op': 'Mul', 'operation': 'mul', 'infer': lambda node: eltwise_infer(node, lambda a, b: a*b)}) super().__init__(graph, attrs) diff --git a/model-optimizer/mo/ops/lrn.py b/model-optimizer/mo/ops/lrn.py index f7dc110..f0e65cf 100644 --- a/model-optimizer/mo/ops/lrn.py +++ b/model-optimizer/mo/ops/lrn.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,9 +14,8 @@ limitations under the License. """ -import networkx as nx - from mo.front.common.partial_infer.elemental import copy_shape_infer +from mo.graph.graph import Graph from mo.ops.op import Op @@ -24,10 +23,12 @@ class LRN(Op): op = 'LRN' enabled = False - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'type': 'Norm', 'op': __class__.op, + 'in_ports_count': 1, + 'out_ports_count': 1, 'infer': copy_shape_infer }, attrs) diff --git a/model-optimizer/mo/ops/memory.py b/model-optimizer/mo/ops/memory.py index 745efff..269a8ab 100644 --- a/model-optimizer/mo/ops/memory.py +++ b/model-optimizer/mo/ops/memory.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,9 +14,7 @@ limitations under the License. 
""" -import networkx as nx - -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op from mo.front.common.partial_infer.elemental import copy_shape_infer from mo.utils.error import Error @@ -27,7 +25,7 @@ class Memory(Op): op = 'Memory' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'type': 'Memory', 'op': 'Memory', @@ -35,6 +33,8 @@ class Memory(Op): 'size': None, 'index': None, 'infer': Memory.infer, + 'in_ports_count': 1, + 'out_ports_count': 1, }, attrs) def supported_attrs(self): diff --git a/model-optimizer/mo/ops/op.py b/model-optimizer/mo/ops/op.py index 83d80fb..2028acc 100644 --- a/model-optimizer/mo/ops/op.py +++ b/model-optimizer/mo/ops/op.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ limitations under the License. """ +import copy import logging as log from collections import namedtuple @@ -22,7 +23,8 @@ import numpy as np from mo.front.extractor import add_attrs_props from mo.front.extractor import update_ie_fields -from mo.graph.graph import Node, unique_id +from mo.graph.graph import Node, Graph +from mo.graph.port import Port from mo.utils import class_registration from mo.utils.error import Error @@ -33,7 +35,7 @@ class Op(object): # Add the derived class to excluded_classes if one should not be registered in registered_ops excluded_classes = [] - def __init__(self, graph: nx.MultiDiGraph, attrs1: dict = None, attrs2: dict = None): + def __init__(self, graph: Graph, attrs1: dict = None, attrs2: dict = None): self.graph = graph try: self.ir_version = graph.graph['ir_version'] @@ -56,13 +58,15 @@ class Op(object): if attrs is not None: new_attrs.update(attrs) id_prefix = new_attrs['name'] if 'name' in new_attrs else '' - id = unique_id(self.graph, id_prefix) + id = self.graph.unique_id(id_prefix) new_attrs['name'] = id new_attrs = add_attrs_props(new_attrs) update_ie_fields(new_attrs, self.ir_version) self.substitute_ie_attrs(new_attrs) self.graph.add_node(id, **new_attrs) - return Node(self.graph, id) + + node = Node(self.graph, id) + return node def substitute_ie_attrs(self, new_attrs: dict): """ @@ -71,6 +75,7 @@ class Op(object): """ backend_attrs_mapping = { None: self.backend_attrs, + 5: self.backend_attrs, 4: self.backend_attrs, 3: self.backend_attrs, 2: self.backend_attrs_v2 @@ -103,23 +108,25 @@ class Op(object): raise Error('Node {} has more than one outputs. Provide output port explicitly. '.format(node.name)) return node, port - def cut_edge_and_create_node(self, node: Node, out_port: int, attrs: dict = None): + def create_node_on_port(self, node: Node, out_port: int, attrs: dict = None, edge_attrs: dict = None): """ Removes an edge, that is connected to nodes out_port. Creates new_node with attrs attributes and connects it to node by edge that stores the same information as cutted edge. 
:param node: Input node, to cut the edge from :param out_port: output port of edge to cut :param attrs: attributes of new node + :param edge_attrs: attributes to be changed/added to new edge :return: Node instance of created new_node """ - edges = [(u, v, keys, params) for u, v, keys, params in node.graph.out_edges(node.id, data=True, keys=True) - if 'out' in params and params['out'] == out_port] - edge_attrs = edges[0][3] - [self.graph.remove_edge(u, v, key=key) for u, v, key, params in edges] + if edge_attrs is None: + edge_attrs = {'in': 0} + prev_edge_attrs = copy.deepcopy(node.out_edge(out_port)) + prev_edge_attrs.update(edge_attrs) + new_edge_attrs = prev_edge_attrs if attrs is None: attrs = dict() new_node = self.add_node(attrs) - self.graph.add_edge(node.id, new_node.id, **edge_attrs) + self.graph.add_edge(node.id, new_node.id, **new_edge_attrs) return new_node def create_node(self, inputs: list = None, attrs: dict = None, edge_attrs: dict = None): @@ -176,7 +183,7 @@ class Op(object): old_data_value = [None] old_data_shape = [None] if data_nodes is None: - data_node = unique_id(self.graph) + data_node = self.graph.unique_id() self.graph.add_node(data_node, **add_attrs_props( dict(kind='data', precision="FP32", name=data_node, value=None, shape=None, data_type=None, infer=None))) @@ -190,9 +197,11 @@ class Op(object): data_nodes] for id, data_node in enumerate(data_nodes): self.graph.add_edges_from([(new_op_node.id, data_node.id, {'out': id})]) + if new_op_node.has_valid('infer'): - log.debug('Start running infer function for individual op node with attributes: {}'.format( - new_op_node.graph.node[new_op_node.id])) + if log.getLogger().isEnabledFor(log.DEBUG): + log.debug('Start running infer function for individual op node with attributes: {}' + ''.format(str(new_op_node))) new_op_node.infer(new_op_node) assert all(old_value is None for old_value in old_data_value) or all( [np.array_equal(old_data_value[id], data_node.value) for id, data_node in enumerate(data_nodes)]) @@ -203,36 +212,36 @@ class Op(object): [old_data_shape[id] for id in range(len(data_nodes))], [data_node.shape for data_node in data_nodes]) for data_node in data_nodes: - log.debug( - 'Finished running infer function, data nodes attributes: {}'.format( - data_node.graph.node[data_node.id])) + if log.getLogger().isEnabledFor(log.DEBUG): + log.debug( + 'Finished running infer function, data nodes attributes: {}'.format(data_node)) return data_nodes[0] if len(data_nodes) == 1 else data_nodes @staticmethod - def create_data_node(graph: nx.MultiDiGraph, op_node: Node, attrs: dict = None, edge_attrs: dict = None): + def create_data_node(graph: Graph, op_node: Node, attrs: dict = None, edge_attrs: dict = None, out_port=0): assert op_node is not None and op_node.kind == 'op' assert len(op_node.out_nodes()) == 0 if attrs is None: attrs = {} - data_node = unique_id(graph, op_node.id) + data_node = graph.unique_id(op_node.id) defaul_attrs = dict(kind='data', precision="FP32", name=data_node, value=None, shape=None, data_type=None, infer=None) defaul_attrs.update(attrs) graph.add_node(data_node, **add_attrs_props(defaul_attrs)) data_node = Node(graph, data_node) if edge_attrs is not None: - graph.add_edges_from([(op_node.id, data_node.id, {'out': 0, **edge_attrs})]) + graph.add_edges_from([(op_node.id, data_node.id, {'out': out_port, **edge_attrs})]) else: - graph.add_edges_from([(op_node.id, data_node.id, {'out': 0})]) + graph.add_edges_from([(op_node.id, data_node.id, {'out': out_port})]) return data_node @staticmethod - 
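The rewritten create_node_on_port no longer removes and re-adds the edge by hand; it deep-copies the attributes of the existing out-edge and overlays the caller-supplied edge_attrs (defaulting to {'in': 0}), so the caller's keys win. The merge order in isolation, with illustrative attribute values:

import copy

prev_edge_attrs = {'out': 0, 'in': 2, 'name': 'weights'}  # attributes copied from the old edge
new_edge_attrs = copy.deepcopy(prev_edge_attrs)
new_edge_attrs.update({'in': 0})                          # caller-supplied keys override the copies
print(new_edge_attrs)                                     # {'out': 0, 'in': 0, 'name': 'weights'}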
def _create_data_node(graph: nx.MultiDiGraph, name: str, attrs: dict = None): + def _create_data_node(graph: Graph, name: str, attrs: dict = None): if attrs is None: attrs = {} - data_node = unique_id(graph, name) + data_node = graph.unique_id(name) defaul_attrs = dict(kind='data', precision="FP32", name=data_node, value=None, shape=None, data_type=None, infer=None) defaul_attrs.update(attrs) @@ -241,23 +250,24 @@ class Op(object): return data_node @staticmethod - def create_input_data_node(graph: nx.MultiDiGraph, name: str, value: np.array, attrs: dict = {}): - data_node = unique_id(graph, name) - defaul_attrs = dict(kind='data', precision="FP32", name=data_node, value=np.array(value), shape=value.shape, + def create_input_data_node(graph: Graph, name: str, value: np.array, attrs: dict = {}): + data_node = graph.unique_id(name) + defaul_attrs = dict(kind='data', precision="FP32", name=data_node, value=np.array(value), + shape=np.array(value.shape), data_type=None, infer=None) defaul_attrs.update(attrs) graph.add_node(data_node, **add_attrs_props(defaul_attrs)) return Node(graph, data_node) @staticmethod - def create_and_connect_input_data_node(graph: nx.MultiDiGraph, op_node: Node, attrs: dict = None, edge_attrs: dict = None): + def create_and_connect_input_data_node(graph: Graph, op_node: Node, attrs: dict = None, edge_attrs: dict = None): assert op_node is not None and op_node.kind == 'op' if attrs is None: attrs = {} if edge_attrs is None: edge_attrs = {} - data_node = unique_id(graph, op_node.id) + data_node = graph.unique_id(op_node.id) defaul_attrs = dict(kind='data', precision="FP32", name=data_node, value=None, shape=None, data_type=None, infer=None) defaul_attrs.update(attrs) diff --git a/model-optimizer/mo/ops/output.py b/model-optimizer/mo/ops/output.py index 8a4f578..8b77397 100644 --- a/model-optimizer/mo/ops/output.py +++ b/model-optimizer/mo/ops/output.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,8 +14,7 @@ limitations under the License. """ -import networkx as nx - +from mo.graph.graph import Graph from mo.ops.op import Op @@ -26,12 +25,11 @@ class Output(Op): """ op = 'OpOutput' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict = None): + def __init__(self, graph: Graph, attrs: dict = None): super().__init__(graph, { - 'type': __class__.op, 'op': __class__.op, - 'is_output': True, - 'infer': None, + 'infer': lambda x: None, 'value': None, 'data_type': None, + 'in_ports_count': 1, }, attrs) diff --git a/model-optimizer/mo/ops/pad.py b/model-optimizer/mo/ops/pad.py index 739b886..47377d0 100644 --- a/model-optimizer/mo/ops/pad.py +++ b/model-optimizer/mo/ops/pad.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,9 +16,9 @@ import logging as log -import networkx as nx import numpy as np +from mo.graph.graph import Graph from mo.ops.op import Op, PermuteAttrs @@ -50,11 +50,13 @@ class Pad(Op): op = 'Pad' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'op': __class__.op, 'type': __class__.op, 'infer': __class__.infer, + 'in_ports_count': 2, + 'out_ports_count': 1, 'mode': 'constant', 'fill_value': float(0), 'pads': None diff --git a/model-optimizer/mo/ops/pad_test.py b/model-optimizer/mo/ops/pad_test.py index 0013ed3..bcd0fdd 100644 --- a/model-optimizer/mo/ops/pad_test.py +++ b/model-optimizer/mo/ops/pad_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/ops/permute.py b/model-optimizer/mo/ops/permute.py index 57158d1..4f2c089 100644 --- a/model-optimizer/mo/ops/permute.py +++ b/model-optimizer/mo/ops/permute.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,10 +14,9 @@ limitations under the License. """ -import networkx as nx - from mo.front.common.partial_infer.transpose import transpose_infer from mo.front.extractor import attr_getter +from mo.graph.graph import Graph from mo.ops.op import Op @@ -25,12 +24,14 @@ class Permute(Op): op = 'Permute' enabled = False - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'order': None, 'type': __class__.op, 'op': __class__.op, 'infer': self.infer, + 'in_ports_count': 1, + 'out_ports_count': 1, }, attrs) def supported_attrs(self): diff --git a/model-optimizer/mo/ops/permute_test.py b/model-optimizer/mo/ops/permute_test.py index cf26cc7..a586438 100644 --- a/model-optimizer/mo/ops/permute_test.py +++ b/model-optimizer/mo/ops/permute_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/ops/pooling.py b/model-optimizer/mo/ops/pooling.py index a26ab7d..4af5f6c 100644 --- a/model-optimizer/mo/ops/pooling.py +++ b/model-optimizer/mo/ops/pooling.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ limitations under the License. 
""" -import networkx as nx import numpy as np from mo.front.common.partial_infer.utils import tf_window_op_pad_infer @@ -22,19 +21,21 @@ from mo.front.extractor import attr_getter # from mo.front.common.partial_infer.pooling import pool_explicit_padding_infer from mo.front.extractor import spatial_getter from mo.front.onnx.extractors.utils import get_backend_pad -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op, PermuteAttrs class Pooling(Op): op = 'Pooling' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'kind': 'op', 'type': __class__.op, 'op': __class__.op, 'infer': __class__.infer, + 'in_ports_count': 1, + 'out_ports_count': 1, }, attrs) def backend_attrs(self): diff --git a/model-optimizer/mo/ops/pooling_test.py b/model-optimizer/mo/ops/pooling_test.py index ea11b72..78c6268 100644 --- a/model-optimizer/mo/ops/pooling_test.py +++ b/model-optimizer/mo/ops/pooling_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ from mo.utils.unittest.graph import build_graph nodes_attributes = {'node_1': {'value': None, 'kind': 'data'}, 'pool': {'type': 'Pooling', 'value': None, 'kind': 'op'}, 'node_2': {'value': None, 'kind': 'data'}, + 'op_output': { 'kind': 'op', 'op': 'OpOutput'}, } @@ -32,8 +33,10 @@ class TestPoolingPartialInfer(unittest.TestCase): def test_pooling_infer(self): graph = build_graph(nodes_attributes, [('node_1', 'pool'), - ('pool', 'node_2')], - {'node_2': {'is_output': True, 'shape': None}, + ('pool', 'node_2'), + ('node_2', 'op_output') + ], + {'node_2': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 256, 256])}, 'pool': {'window': np.array([1, 1, 1, 1]), 'stride': np.array([1, 1, 2, 2]), 'pad': np.array([[0, 0], [0, 0], [3, 3], [3, 3]]), @@ -56,8 +59,10 @@ class TestPoolingPartialInfer(unittest.TestCase): def test_pooling_infer_decrement_input_spatial(self): graph = build_graph(nodes_attributes, [('node_1', 'pool'), - ('pool', 'node_2')], - {'node_2': {'is_output': True, 'shape': None}, + ('pool', 'node_2'), + ('node_2', 'op_output') + ], + {'node_2': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 224, 224])}, 'pool': {'window': np.array([1, 1, 1, 1]), 'stride': np.array([1, 1, 3, 3]), 'pad': np.array([[0, 0], [0, 0], [3, 3], [3, 3]]), @@ -80,8 +85,10 @@ class TestPoolingPartialInfer(unittest.TestCase): def test_pooling_infer_no_convention(self): graph = build_graph(nodes_attributes, [('node_1', 'pool'), - ('pool', 'node_2')], - {'node_2': {'is_output': True, 'shape': None}, + ('pool', 'node_2'), + ('node_2', 'op_output') + ], + {'node_2': {'shape': None}, 'node_1': {'shape': np.array([1, 3, 256, 256])}, 'pool': {'window': np.array([1, 1, 1, 1]), 'stride': np.array([1, 1, 2, 2]), 'pad': np.array([[0, 0], [0, 0], [3, 3], [3, 3]]), @@ -103,8 +110,10 @@ class TestPoolingPartialInfer(unittest.TestCase): def test_pooling_infer_no_shape(self): graph = build_graph(nodes_attributes, [('node_1', 'pool'), - ('pool', 'node_2')], - {'node_2': {'is_output': True, 'shape': None}, + ('pool', 'node_2'), + ('node_2', 'op_output') + ], + {'node_2': {'shape': None}, 'node_1': {'shape': None}, 'pool': {'window': np.array([1, 1, 1, 1]), 'stride': np.array([1, 1, 2, 2]), 'pad': np.array([[0, 0], [0, 0], [3, 3], [3, 3]]), diff --git 
a/model-optimizer/mo/ops/power.py b/model-optimizer/mo/ops/power.py index c4d1ca0..41a2c38 100644 --- a/model-optimizer/mo/ops/power.py +++ b/model-optimizer/mo/ops/power.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,11 +16,10 @@ import logging as log -import networkx as nx import numpy as np from mo.front.common.partial_infer.eltwise import eltwise_infer -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op @@ -28,7 +27,7 @@ class Power(Op): enabled = False op = 'Power' - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'type': 'Power', 'op': __class__.op, @@ -36,6 +35,8 @@ class Power(Op): 'scale': 1, 'shift': 0, 'infer': __class__.infer, + 'in_ports_count': 1, + 'out_ports_count': 1, }, attrs) def supported_attrs(self): diff --git a/model-optimizer/mo/ops/power_test.py b/model-optimizer/mo/ops/power_test.py index e0a3b97..c77ab3c 100644 --- a/model-optimizer/mo/ops/power_test.py +++ b/model-optimizer/mo/ops/power_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/ops/reduce.py b/model-optimizer/mo/ops/reduce.py index 1237928..41457cd 100644 --- a/model-optimizer/mo/ops/reduce.py +++ b/model-optimizer/mo/ops/reduce.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,11 +16,10 @@ import logging as log -import networkx as nx import numpy as np from mo.front.common.partial_infer.utils import int64_array -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op from mo.utils.error import Error @@ -34,11 +33,13 @@ class Reduce(Op): 'sum': np.sum, } - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'op': 'Reduce', 'reduce_type': None, 'infer': __class__.infer, + 'in_ports_count': 2, + 'out_ports_count': 1, }, attrs) @staticmethod @@ -71,7 +72,7 @@ class Reduce(Op): output_node.value = Reduce.reduce_method_map[reduce_type.lower()](input_node.value, axis=tuple(node.axis), keepdims=node.keep_dims) - output_node.shape = output_node.value.shape + output_node.shape = np.array(output_node.value.shape, dtype=np.int64) else: log.error('Reduce type {} is not supported for node {}'.format(reduce_type, node.id)) return diff --git a/model-optimizer/mo/ops/relu.py b/model-optimizer/mo/ops/relu.py index db3ae7d..3ee6d14 100644 --- a/model-optimizer/mo/ops/relu.py +++ b/model-optimizer/mo/ops/relu.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,9 +14,8 @@ limitations under the License. 
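In the Reduce hunk, the one-line inference fix matters downstream: numpy's .shape is a plain tuple, while the rest of the Model Optimizer expects shapes as int64 numpy arrays. The constant-folding path in isolation:

import numpy as np

reduce_method_map = {'max': np.max, 'mean': np.mean, 'sum': np.sum}

x = np.arange(24).reshape(2, 3, 4)
out = reduce_method_map['sum'](x, axis=(1,), keepdims=True)
shape = np.array(out.shape, dtype=np.int64)  # the fix: an int64 array instead of a raw tuple
print(out.shape, shape)                      # (2, 1, 4) [2 1 4]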
""" -import networkx as nx - from mo.front.common.partial_infer.elemental import copy_shape_infer +from mo.graph.graph import Graph from mo.ops.op import Op @@ -24,11 +23,13 @@ class ReLU(Op): op = 'ReLU' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'type': __class__.op, 'op': __class__.op, - 'infer': copy_shape_infer + 'infer': copy_shape_infer, + 'in_ports_count': 1, + 'out_ports_count': 1, }, attrs) def supported_attrs(self): diff --git a/model-optimizer/mo/ops/reshape.py b/model-optimizer/mo/ops/reshape.py index f616c8d..8cc24f1 100644 --- a/model-optimizer/mo/ops/reshape.py +++ b/model-optimizer/mo/ops/reshape.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,12 +15,11 @@ """ import math -import networkx as nx import numpy as np from mo.front.common.partial_infer.elemental import single_output_infer from mo.front.common.partial_infer.reshape import tf_reshape_shape_infer -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op from mo.utils.error import Error @@ -29,19 +28,18 @@ class Reshape(Op): op = 'Reshape' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'kind': 'op', 'type': __class__.op, 'op': __class__.op, + 'in_ports_count': 2, + 'out_ports_count': 1, 'infer': lambda node: single_output_infer(node, tf_reshape_shape_infer, lambda node: np.reshape(node.in_node().value, node.out_node().shape)) }, attrs) - def supported_attrs(self): - return [('dim', lambda node: ','.join(map(str, node['dim'])))] - @staticmethod def kaldi_infer(node: Node): in_node = node.in_node().in_node() # prev_layer_node -> data -> this_node @@ -50,7 +48,7 @@ class Reshape(Op): # Convolution/Pooling layers. Therefore there are 4 cases with different # partial inference. batch = input_shape[0] - if in_node.op == 'Convolution' or in_node.op == 'Pooling': + if in_node.op in ['Convolution', 'Pooling', 'Permute']: output_spatial = np.array([batch, np.prod(input_shape[1:])], dtype=np.int64) return Reshape.set_shape_and_dim(node, output_spatial) # Supports ONLY NCHW and NH layouts diff --git a/model-optimizer/mo/ops/roipooling.py b/model-optimizer/mo/ops/roipooling.py index 3b345c3..a5d8064 100644 --- a/model-optimizer/mo/ops/roipooling.py +++ b/model-optimizer/mo/ops/roipooling.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -29,7 +29,9 @@ class ROIPooling(Op): 'spatial_scale': 0.0625, 'type': __class__.op, 'op': __class__.op, - 'infer': roipooling_infer + 'infer': roipooling_infer, + 'in_ports_count': 2, + 'out_ports_count': 1, }, attrs) def supported_attrs(self): diff --git a/model-optimizer/mo/ops/scale_shift.py b/model-optimizer/mo/ops/scale_shift.py index 835b626..4642bfc 100644 --- a/model-optimizer/mo/ops/scale_shift.py +++ b/model-optimizer/mo/ops/scale_shift.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -14,9 +14,8 @@ limitations under the License. """ -import networkx as nx - from mo.front.common.partial_infer.elemental import copy_shape_infer +from mo.graph.graph import Graph from mo.ops.op import Op @@ -24,10 +23,12 @@ class ScaleShiftOp(Op): op = 'ScaleShift' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'infer': copy_shape_infer, 'kind': 'op', 'type': __class__.op, 'op': __class__.op, + 'in_ports_count': 3, + 'out_ports_count': 1, }, attrs) diff --git a/model-optimizer/mo/ops/shape.py b/model-optimizer/mo/ops/shape.py index 75f4353..475d261 100644 --- a/model-optimizer/mo/ops/shape.py +++ b/model-optimizer/mo/ops/shape.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,10 +14,10 @@ limitations under the License. """ -import networkx as nx import numpy as np import logging as log +from mo.graph.graph import Graph from mo.ops.op import Op @@ -25,12 +25,18 @@ class Shape(Op): op = 'Shape' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { + 'type': __class__.op, 'op': __class__.op, 'infer': __class__.infer, + 'in_ports_count': 1, + 'out_ports_count': 1, }, attrs) + def supported_attrs(self): + return [] + @staticmethod def infer(node): if len(node.in_nodes()) != 1: @@ -44,6 +50,7 @@ class Shape(Op): node.out_node().value = np.array(value, dtype=node.data_type) else: node.out_node().value = np.array(value) + node.out_node().shape = np.array(node.out_node().value.shape, dtype=np.int64) else: log.info('Can\'t infer shape and value for shape operation due to undefined input shape') diff --git a/model-optimizer/mo/ops/slice.py b/model-optimizer/mo/ops/slice.py index 5f6145d..fda2acd 100644 --- a/model-optimizer/mo/ops/slice.py +++ b/model-optimizer/mo/ops/slice.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
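The one-line addition to Shape.infer closes a gap: the op produced an output value (the input's shape) but never set the output's own shape, which for a shape-of result is a rank-1 array. Outside the graph machinery:

import numpy as np

input_shape = np.array([1, 3, 224, 224], dtype=np.int64)
value = np.array(input_shape)                   # the Shape op's output value
shape = np.array(value.shape, dtype=np.int64)   # the newly set output shape: [4]
print(value, shape)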
@@ -16,10 +16,9 @@ import logging as log -import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op @@ -27,13 +26,18 @@ class Slice(Op): op = 'Slice' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'type': __class__.op, 'op': 'Slice', + 'in_ports_count': 3, + 'out_ports_count': 1, 'infer': __class__.infer }, attrs) + def supported_attrs(self): + return ['start', 'end', 'axis'] + @staticmethod def infer(node: Node): if len(node.in_nodes()) == 1: @@ -52,7 +56,7 @@ class Slice(Op): from mo.front.common.partial_infer.slice import caffe_slice_infer caffe_slice_infer(node) elif len(node.in_nodes()) == 3: - #TF case + # TF case start_node = node.in_node(1) size_node = node.in_node(2) if start_node.has_valid('value') and size_node.has_valid('value'): @@ -104,10 +108,10 @@ class Slice(Op): if s is None: slice_idx[axis] = slice(0, input_shape[axis], 1) - #Add new parameters to node + # Add new parameters to node node['slices'] = np.array(slice_idx) node['shrink_axis_mask'] = np.array(shrink_axis_mask) - value = value[slice_idx] + value = value[tuple(slice_idx)] node.out_node().value = np.array(value) if node.in_node(0).value is not None else None node.out_node().shape = np.array(value.shape) diff --git a/model-optimizer/mo/ops/slice_test.py b/model-optimizer/mo/ops/slice_test.py index 2061e30..edc9124 100644 --- a/model-optimizer/mo/ops/slice_test.py +++ b/model-optimizer/mo/ops/slice_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/ops/softmax.py b/model-optimizer/mo/ops/softmax.py index eaf6bc0..0b7ff37 100644 --- a/model-optimizer/mo/ops/softmax.py +++ b/model-optimizer/mo/ops/softmax.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,10 +14,8 @@ limitations under the License. """ -import networkx as nx - from mo.front.common.partial_infer.elemental import copy_shape_infer -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op @@ -25,13 +23,15 @@ class Softmax(Op): op = 'SoftMax' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'infer': Softmax.infer, 'kind': 'op', 'axis': 1, 'type': __class__.op, 'op': __class__.op, + 'in_ports_count': 1, + 'out_ports_count': 1, }, attrs) def supported_attrs(self): diff --git a/model-optimizer/mo/ops/split.py b/model-optimizer/mo/ops/split.py index 5ce6b0f..62c3951 100644 --- a/model-optimizer/mo/ops/split.py +++ b/model-optimizer/mo/ops/split.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
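The value = value[tuple(slice_idx)] fix in Slice.infer above tracks NumPy's deprecation of multi-dimensional indexing with anything but a tuple: a plain list of slice objects triggers a FutureWarning on NumPy 1.15+ and is rejected by later releases. A standalone illustration:

import numpy as np

value = np.arange(24).reshape(2, 3, 4)
slice_idx = [slice(0, 1), slice(0, 2), slice(0, 4)]

# value[slice_idx] is deprecated list-of-slices indexing; a tuple performs
# ordinary basic slicing.
sliced = value[tuple(slice_idx)]
print(sliced.shape)  # (1, 2, 4)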
@@ -15,9 +15,8 @@ """ import copy -import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op, PermuteAttrs @@ -25,12 +24,13 @@ class Split(Op): op = 'Split' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'type': 'Split', 'op': 'Split', 'axis': 1, 'input_port': 0, + 'in_ports_count': 1, 'infer': Split.infer }, attrs) diff --git a/model-optimizer/mo/ops/squeeze.py b/model-optimizer/mo/ops/squeeze.py index ef215c9..ad56f99 100644 --- a/model-optimizer/mo/ops/squeeze.py +++ b/model-optimizer/mo/ops/squeeze.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -28,8 +28,7 @@ class Squeeze(Op): 'kind': 'op', 'type': 'Reshape', 'op': __class__.op, - 'infer': tf_squeeze_infer + 'infer': tf_squeeze_infer, + 'in_ports_count': 2, + 'out_ports_count': 1, }, attrs) - - def supported_attrs(self): - return [('dim', lambda node: ', '.join(map(str, node['dim'])))] diff --git a/model-optimizer/mo/ops/strided_slice.py b/model-optimizer/mo/ops/strided_slice.py new file mode 100644 index 0000000..50f1f93 --- /dev/null +++ b/model-optimizer/mo/ops/strided_slice.py @@ -0,0 +1,114 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import numpy as np + +from mo.front.common.partial_infer.slice import tf_strided_slice_infer +from mo.graph.graph import Node, Graph +from mo.ops.op import Op, PermuteAttrs +from mo.utils.utils import array_to_str + + +def permute_array_with_ellipsis(node: Node, permutation: PermuteAttrs.Permutation, array: np.array, ins_value: int): + """ + This function permutes masks according to permutation parameter. 
Several cases should be processed: + * Some dimensions can be omitted in the mask according to the ellipsis mask + * The mask can be shorter than the output rank plus the number of shrunk dimensions + * The mask can have the same or greater length than the output + """ + attr_mask_extended = list(array) + + # If both input and output have rank less than 4, no permutation is needed + if len(node.in_node().shape) < 4 and len(node.out_node().shape) < 4: + return attr_mask_extended + + # If the mask is shorter than the output rank plus the number of shrunk dimensions, extend it before permutation + if len(attr_mask_extended) < len(node.out_node(0).shape) + np.count_nonzero(node.shrink_axis_mask): + # if ellipsis is set, add dimensions at the right place; otherwise insert at the end + if np.any(node.ellipsis_mask): + idx = np.nonzero(node.ellipsis_mask) + assert len(idx[0]) == 1 + id = idx[0][0] + else: + id = len(attr_mask_extended) - 1 + + ellips_ext = len(node.out_node(0).shape) + np.count_nonzero(node.shrink_axis_mask) - len(attr_mask_extended) + for i in range(0, ellips_ext): + attr_mask_extended.insert(id + i + 1, ins_value) + # permute the extended mask + perm = PermuteAttrs.get_nhwc_to_nchw_permutation(len(attr_mask_extended)) + attr_mask_extended = np.array(attr_mask_extended)[perm.perm] + return attr_mask_extended + else: + perm_len = len(node.out_node(0).shape) + np.count_nonzero(node.shrink_axis_mask) + perm = PermuteAttrs.get_nhwc_to_nchw_permutation(perm_len) + perm_list = list(perm.perm) + # if the mask is longer than the output, append a tail that is not permuted to avoid an error + for i in range(perm_len, len(attr_mask_extended)): + perm_list.append(i) + return np.array(attr_mask_extended, dtype=np.int64)[np.array(perm_list)] + + +def permute_masks(node: Node, permutation: PermuteAttrs.Permutation, attr: str): + if not node.has_valid(attr): + return None + + node[attr] = permute_array_with_ellipsis(node, permutation, node[attr], + attr in ['begin_mask', 'end_mask']) + return node[attr] + + +class StridedSlice(Op): + op = 'StridedSlice' + enabled = True + + def __init__(self, graph: Graph, attrs: dict): + super().__init__(graph, { + 'type': __class__.op, + 'op': 'StridedSlice', + 'in_ports_count': 4, + 'out_ports_count': 1, + 'infer': __class__.infer + }, attrs) + + def backend_attrs(self): + al = list() + + def convert(attr): + return lambda node: array_to_str(node, attr) + for a in list(['new_axis_mask', 'shrink_axis_mask', 'ellipsis_mask', 'begin_mask', 'end_mask']): + al.append((a, convert(a))) + return al + + @staticmethod + def infer(node: Node): + tf_strided_slice_infer(node) + + PermuteAttrs.create_permute_attrs(node, attrs=[('shrink_axis_mask', 'input:0', permute_masks), + ('new_axis_mask', 'input:0', permute_masks), + ('ellipsis_mask', 'input:0', permute_masks), + ('begin_mask', 'input:0', permute_masks), + ('end_mask', 'input:0', permute_masks), + ]) + + for i in range(1, len(node.in_nodes())): + if node.in_node(i).value is not None and node.in_node(i).shape[0] > 3: + perm = PermuteAttrs.get_nhwc_to_nchw_permutation(len(node.in_node(0).shape)) + node.in_node(i).value = permute_array_with_ellipsis(node, perm, node.in_node(i).value, 0) + + # masks and inputs were extended for the NHWC to NCHW permutation, so clear the ellipsis mask + idx = np.nonzero(node.ellipsis_mask) + node.ellipsis_mask[idx] = 0 diff --git a/model-optimizer/mo/ops/strided_slice_test.py b/model-optimizer/mo/ops/strided_slice_test.py new file mode 100644 index 0000000..c933b4e --- /dev/null +++ b/model-optimizer/mo/ops/strided_slice_test.py @@ -0,0
+1,290 @@ +""" + Copyright (c) 2019 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import unittest + +import numpy as np +from generator import generator + +from mo.graph.graph import Node +from mo.ops.op import PermuteAttrs +from mo.ops.strided_slice import permute_masks, permute_array_with_ellipsis +from mo.utils.unittest.graph import build_graph + +nodes_attributes = { + 'data_1': { + 'kind': 'data', + 'shape': None, + 'value': None, + }, + 'begin': { + 'kind': 'data', + 'shape': None, + 'value': None, + }, + 'end': { + 'kind': 'data', + 'shape': None, + 'value': None, + }, + 'stride': { + 'kind': 'data', + 'shape': None, + 'value': None, + }, + 'strided_slice': { + 'op': 'StridedSlice', + 'begin_mask': None, + 'end_mask': None, + 'new_axis_mask': None, + 'shrink_axis_mask': None, + 'ellipsis_mask': None, + 'kind': 'op', + }, + 'data_2': { + 'kind': 'data', + 'shape': None, + 'value': None, + } +} + + +@generator +class TestPermutationStridedSlice(unittest.TestCase): + def test_permute_begin_end(self): + # Testing constant path case + graph = build_graph(nodes_attributes, + [('data_1', 'strided_slice'), + ('begin', 'strided_slice'), + ('end', 'strided_slice'), + ('stride', 'strided_slice'), + ('strided_slice', 'data_2')], + {'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None}, + 'strided_slice': {'begin_mask': np.array([1, 1, 0, 0]), 'end_mask': np.array([0, 1, 0, 0]), + 'new_axis_mask': np.array([0, 0, 0]), 'shrink_axis_mask': [0, 0, 0], + 'ellipsis_mask': np.array([0, 0, 0])}, + 'data_2': {'shape': np.array([1, 2, 3, 4]), 'value': None}, + }) + + slice_node = Node(graph, 'strided_slice') + permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'begin_mask') + self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 0, 1, 0]))) + + permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'end_mask') + self.assertTrue(np.array_equal(slice_node.end_mask, np.array([0, 0, 1, 0]))) + + def test_permute_begin_end_short(self): + # Testing constant path case + graph = build_graph(nodes_attributes, + [('data_1', 'strided_slice'), + ('begin', 'strided_slice'), + ('end', 'strided_slice'), + ('stride', 'strided_slice'), + ('strided_slice', 'data_2')], + {'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None}, + 'strided_slice': {'begin_mask': np.array([1, 0, 0]), 'end_mask': np.array([0, 1, 0]), + 'new_axis_mask': np.array([0, 0, 0]), 'shrink_axis_mask': [0, 0, 0], + 'ellipsis_mask': np.array([0, 0, 0])}, + 'data_2': {'shape': np.array([1, 2, 3, 4]), 'value': None}, + }) + + slice_node = Node(graph, 'strided_slice') + permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'begin_mask') + self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 1, 0, 0]))) + + permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'end_mask') + self.assertTrue(np.array_equal(slice_node.end_mask, np.array([0, 1, 1, 0]))) + + def 
test_permute_begin_end_long(self): + # Testing constant path case + graph = build_graph(nodes_attributes, + [('data_1', 'strided_slice'), + ('begin', 'strided_slice'), + ('end', 'strided_slice'), + ('stride', 'strided_slice'), + ('strided_slice', 'data_2')], + {'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None}, + 'strided_slice': {'begin_mask': np.array([1, 0, 0, 1, 0]), 'end_mask': np.array([0, 1, 0, 1, 1]), + 'new_axis_mask': np.array([0, 0, 0]), 'shrink_axis_mask': [0, 0, 0], + 'ellipsis_mask': np.array([0, 0, 0])}, + 'data_2': {'shape': np.array([1, 2, 3, 4]), 'value': None}, + }) + + slice_node = Node(graph, 'strided_slice') + permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'begin_mask') + self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 1, 0, 0, 0]))) + + permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'end_mask') + self.assertTrue(np.array_equal(slice_node.end_mask, np.array([0, 1, 1, 0, 1]))) + + def test_permute_begin_end_new(self): + # Testing constant path case + graph = build_graph(nodes_attributes, + [('data_1', 'strided_slice'), + ('begin', 'strided_slice'), + ('end', 'strided_slice'), + ('stride', 'strided_slice'), + ('strided_slice', 'data_2')], + {'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None}, + 'strided_slice': {'begin_mask': np.array([1, 0, 0, 1, 0]), 'end_mask': np.array([0, 1, 0, 1, 1]), + 'new_axis_mask': np.array([1, 0, 0]), 'shrink_axis_mask': [0, 0, 0], + 'ellipsis_mask': np.array([0, 0, 0])}, + 'data_2': {'shape': np.array([1, 1, 2, 3, 4]), 'value': None}, + }) + + slice_node = Node(graph, 'strided_slice') + permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 4, 1, 2, 3], inv=[0, 2, 3, 4, 1]), 'begin_mask') + self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 0, 0, 0, 1]))) + + permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 4, 1, 2, 3], inv=[0, 2, 3, 4, 1]), 'end_mask') + self.assertTrue(np.array_equal(slice_node.end_mask, np.array([0, 1, 1, 0, 1]))) + + def test_permute_begin_end_new_short(self): + # Testing constant path case + graph = build_graph(nodes_attributes, + [('data_1', 'strided_slice'), + ('begin', 'strided_slice'), + ('end', 'strided_slice'), + ('stride', 'strided_slice'), + ('strided_slice', 'data_2')], + {'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None}, + 'strided_slice': {'begin_mask': np.array([1, 0, 0]), 'end_mask': np.array([0, 1, 0]), + 'new_axis_mask': np.array([1, 0, 0]), 'shrink_axis_mask': [0, 0, 0], + 'ellipsis_mask': np.array([0, 0, 0])}, + 'data_2': {'shape': np.array([1, 1, 2, 3, 4]), 'value': None}, + }) + + slice_node = Node(graph, 'strided_slice') + permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 4, 1, 2, 3], inv=[0, 2, 3, 4, 1]), 'begin_mask') + self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 1, 0, 0, 1]))) + + permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 4, 1, 2, 3], inv=[0, 2, 3, 4, 1]), 'end_mask') + self.assertTrue(np.array_equal(slice_node.end_mask, np.array([0, 1, 1, 0, 1]))) + + def test_permute_begin_end_shrink(self): + # Testing constant path case + graph = build_graph(nodes_attributes, + [('data_1', 'strided_slice'), + ('begin', 'strided_slice'), + ('end', 'strided_slice'), + ('stride', 'strided_slice'), + ('strided_slice', 'data_2')], + {'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None}, + 'strided_slice': {'begin_mask': np.array([1, 0, 0, 1]), 'end_mask': np.array([0, 1, 0, 1]), + 'new_axis_mask': 
np.array([0, 0, 0]), 'shrink_axis_mask': [1, 0, 0], + 'ellipsis_mask': np.array([0, 0, 0])}, + 'data_2': {'shape': np.array([2, 3, 4]), 'value': None}, + }) + + slice_node = Node(graph, 'strided_slice') + permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'begin_mask') + + self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 1, 0, 0]))) + + permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'end_mask') + self.assertTrue(np.array_equal(slice_node.end_mask, np.array([0, 1, 1, 0]))) + + def test_permute_begin_end_shrink_short(self): + # Testing constant path case + graph = build_graph(nodes_attributes, + [('data_1', 'strided_slice'), + ('begin', 'strided_slice'), + ('end', 'strided_slice'), + ('stride', 'strided_slice'), + ('strided_slice', 'data_2')], + {'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None}, + 'strided_slice': {'begin_mask': np.array([1, 0, 0]), 'end_mask': np.array([0, 1, 0]), + 'new_axis_mask': np.array([0, 0, 0]), 'shrink_axis_mask': [1, 0, 0], + 'ellipsis_mask': np.array([0, 0, 0])}, + 'data_2': {'shape': np.array([2, 3, 4]), 'value': None}, + }) + + slice_node = Node(graph, 'strided_slice') + permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'begin_mask') + self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 1, 0, 0]))) + + permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'end_mask') + self.assertTrue(np.array_equal(slice_node.end_mask, np.array([0, 1, 1, 0]))) + + def test_permute_begin_end_ellipsis(self): + # Testing constant path case + graph = build_graph(nodes_attributes, + [('data_1', 'strided_slice'), + ('begin', 'strided_slice'), + ('end', 'strided_slice'), + ('stride', 'strided_slice'), + ('strided_slice', 'data_2')], + {'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None}, + 'strided_slice': {'begin_mask': np.array([0, 0]), 'end_mask': np.array([1, 0]), + 'new_axis_mask': np.array([0]), 'shrink_axis_mask': [0], + 'ellipsis_mask': np.array([1, 0])}, + 'data_2': {'shape': np.array([1, 2, 3, 4]), 'value': None}, + }) + + slice_node = Node(graph, 'strided_slice') + permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'begin_mask') + self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([0, 0, 1, 1]))) + + permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'end_mask') + self.assertTrue(np.array_equal(slice_node.end_mask, np.array([1, 0, 1, 1]))) + + def test_permute_begin_end_ellipsis_new(self): + # Testing constant path case + graph = build_graph(nodes_attributes, + [('data_1', 'strided_slice'), + ('begin', 'strided_slice'), + ('end', 'strided_slice'), + ('stride', 'strided_slice'), + ('strided_slice', 'data_2')], + {'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None}, + 'strided_slice': {'begin_mask': np.array([0, 0, 0]), 'end_mask': np.array([1, 0, 0]), + 'new_axis_mask': np.array([1, 0, 0]), 'shrink_axis_mask': [0], + 'ellipsis_mask': np.array([0, 1, 0])}, + 'data_2': {'shape': np.array([1, 1, 2, 3, 4]), 'value': None}, + }) + + slice_node = Node(graph, 'strided_slice') + permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 4, 1, 2, 3], inv=[0, 2, 3, 4, 1]), 'begin_mask') + self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([0, 0, 0, 1, 1]))) + + permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 4, 1, 2, 3], inv=[0, 2, 3, 4, 1]), 'end_mask') + 
self.assertTrue(np.array_equal(slice_node.end_mask, np.array([1, 0, 0, 1, 1]))) + + def test_permute_begin_end_ellipsis_new_inputs(self): + # Testing constant path case + graph = build_graph(nodes_attributes, + [('data_1', 'strided_slice'), + ('begin', 'strided_slice'), + ('end', 'strided_slice'), + ('stride', 'strided_slice'), + ('strided_slice', 'data_2')], + {'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None}, + 'strided_slice': {'begin_mask': np.array([0, 0, 0]), 'end_mask': np.array([1, 0, 0]), + 'new_axis_mask': np.array([1, 0, 0]), 'shrink_axis_mask': [0], + 'ellipsis_mask': np.array([0, 1, 0])}, + 'begin': {'value': np.array([0, 1, 2])}, + 'end': {'value': np.array([1, 2, 3])}, + 'stride': {'value': np.array([1, 1, 1])}, + 'data_2': {'shape': np.array([1, 1, 2, 3, 4]), 'value': None}, + }) + + slice_node = Node(graph, 'strided_slice') + slice_node.in_node(1).value = permute_array_with_ellipsis(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), + slice_node.in_node(1).value, 0) + self.assertTrue(np.array_equal(slice_node.in_node(1).value, np.array([0, 2, 1, 0, 0]))) + + slice_node.in_node(2).value = permute_array_with_ellipsis(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), + slice_node.in_node(2).value, 0) + self.assertTrue(np.array_equal(slice_node.in_node(2).value, np.array([1, 3, 2, 0, 0]))) diff --git a/model-optimizer/mo/ops/tile.py b/model-optimizer/mo/ops/tile.py index 146978f..21f45c9 100644 --- a/model-optimizer/mo/ops/tile.py +++ b/model-optimizer/mo/ops/tile.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,10 +15,9 @@ """ import logging as log -import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.ops.op import Op, PermuteAttrs @@ -26,11 +25,13 @@ class Tile(Op): op = 'Tile' enabled = True - def __init__(self, graph: nx.MultiDiGraph, attrs: dict): + def __init__(self, graph: Graph, attrs: dict): super().__init__(graph, { 'kind': 'op', 'type': __class__.op, 'op': __class__.op, + 'in_ports_count': 1, + 'out_ports_count': 1, 'infer': Tile.infer }, attrs) diff --git a/model-optimizer/mo/ops/tile_test.py b/model-optimizer/mo/ops/tile_test.py index af0d189..0b708b9 100644 --- a/model-optimizer/mo/ops/tile_test.py +++ b/model-optimizer/mo/ops/tile_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/ops/unsqueeze.py b/model-optimizer/mo/ops/unsqueeze.py index 99195a3..2fce222 100644 --- a/model-optimizer/mo/ops/unsqueeze.py +++ b/model-optimizer/mo/ops/unsqueeze.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
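All of the StridedSlice tests above reduce to the same core step: reindexing a mask with an NHWC-to-NCHW permutation (plus extension when the mask is short). Stripped to plain NumPy, with the values from test_permute_begin_end:

import numpy as np

# perm=[0, 3, 1, 2] moves the channel entry of a 4-element NHWC mask to
# position 1, matching the expected results in test_permute_begin_end.
perm = np.array([0, 3, 1, 2])
begin_mask = np.array([1, 1, 0, 0])
end_mask = np.array([0, 1, 0, 0])
print(begin_mask[perm])  # [1 0 1 0]
print(end_mask[perm])    # [0 0 1 0]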
@@ -16,7 +16,6 @@ import numpy as np -from mo.front.common.partial_infer.squeeze import tf_squeeze_infer from mo.ops.op import Op, PermuteAttrs @@ -29,12 +28,11 @@ class Unsqueeze(Op): 'kind': 'op', 'type': 'Reshape', 'op': __class__.op, + 'in_ports_count': 2, + 'out_ports_count': 1, 'infer': __class__.infer }, attrs) - def supported_attrs(self): - return [('dim', lambda node: ', '.join(map(str, node['dim'])))] - @staticmethod def infer(node): unsqueeze_dims = np.array(node.unsqueeze_dims) diff --git a/model-optimizer/mo/ops/unsqueeze_test.py b/model-optimizer/mo/ops/unsqueeze_test.py index 06d25b0..f618502 100644 --- a/model-optimizer/mo/ops/unsqueeze_test.py +++ b/model-optimizer/mo/ops/unsqueeze_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/pipeline/caffe.py b/model-optimizer/mo/pipeline/caffe.py index d334396..e1e8dad 100644 --- a/model-optimizer/mo/pipeline/caffe.py +++ b/model-optimizer/mo/pipeline/caffe.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,52 +16,37 @@ import argparse import logging as log -import numpy as np - -from extensions.front.freeze_placeholder_value import FreezePlaceholderValue -from extensions.middle.FusePermutesSequence import FusePermutesSequence +from extensions.back.CreateConstNodes import CreateConstNodesReplacement from mo.front.caffe import custom_layers_mapping, loader -from mo.front.caffe.extractor import caffe_extractor, common_caffe_fields, caffe_type_extractors -from mo.front.common.register_custom_ops import check_for_duplicates -from mo.front.common.register_custom_ops import update_extractors_with_extensions -from mo.front.common.replacement import FrontReplacementSubgraph -from mo.front.extractor import extract_node_attrs, add_output_ops, create_tensor_nodes, remove_output_ops, \ - add_input_ops, user_data_repack -from mo.graph.graph import print_graph_stat, check_empty_graph +from mo.front.caffe.extractor import caffe_type_extractors, caffe_extractor +from mo.front.common.register_custom_ops import update_extractors_with_extensions, check_for_duplicates +from mo.front.extractor import extract_node_attrs, remove_output_ops +from mo.middle.passes.conv import convert_add_or_mul_to_scaleshift from mo.middle.passes.conv import convert_muladd_to_scaleshift_or_power, \ - convert_matmul_to_fully_connected, batch_norm_fuse, convert_add_to_scaleshift, \ - convert_mul_to_scaleshift, \ - convert_multi_input_conv -from mo.middle.passes.eliminate import graph_clean_up, remove_op_nodes + convert_matmul_to_fully_connected, batch_norm_fuse +from mo.middle.passes.eliminate import graph_clean_up +from mo.middle.passes.eliminate import remove_const_ops from mo.middle.passes.fusing.decomposition import convert_bn_to_mul_add, convert_scale_shift_to_mul_add from mo.middle.passes.fusing.fuse_linear_ops import fuse_linear_ops from mo.middle.passes.fusing.fuse_linear_seq import fuse_mul_add_sequence from mo.middle.passes.fusing.mark_unfused_nodes import mark_unfused_nodes from mo.middle.passes.fusing.resnet_optimization import stride_optimization -from mo.middle.passes.infer import add_mean_scale_values, scale_input, override_placeholder_shapes, 
mark_outputs, \ - partial_infer, convert_mul_add_to_power, override_batch +from mo.middle.passes.infer import convert_mul_add_to_power from mo.middle.passes.mean_scale_values import move_scaleshift_to_preprocess -from mo.middle.passes.pool import mean_to_avgpool from mo.middle.passes.shape import reverse_input_channels, fuse_sequence_of_reshapes -from mo.middle.passes.shared_weights_duplication import duplicate_shared_weights from mo.pipeline.common import prepare_emit_ir from mo.utils import class_registration +from mo.utils.cli_parser import get_meta_info from mo.utils.error import Error from mo.utils.find_inputs import find_inputs from mo.utils.utils import refer_to_faq_msg -from mo.utils.cli_parser import get_meta_info -def driver(argv: argparse.Namespace, proto_file_name: str, model_file_name: str, output_model_name: str, outputs: list, - output_dir: str, - scale: float, - user_shapes: [None, list, np.array] = None, mean_scale_values: [dict, list] = (), mean_file: str = "", - mean_file_offsets: tuple = None, - custom_layers_mapping_path: str = None): +def driver(argv: argparse.Namespace, proto_file_name: str, model_file_name: str, output_model_name: str, + output_dir: str, mean_file: str = "", + mean_file_offsets: tuple = None, custom_layers_mapping_path: str = None): meta_info = get_meta_info(argv) - FusePermutesSequence.enabled = False - proto, model = loader.load_caffe_proto_model(proto_file_name, model_file_name) update_extractors_with_extensions( @@ -77,8 +62,8 @@ def driver(argv: argparse.Namespace, proto_file_name: str, model_file_name: str, refer_to_faq_msg(11), str(e)) from e log.debug("After caffe_pb_to_nx") - print_graph_stat(graph) - check_empty_graph(graph, 'load_caffe_proto_model') + graph.print_graph_stat() + graph.check_empty_graph('load_caffe_proto_model') graph.__setattr__('proto_path', proto_file_name) graph.__setattr__('caffemodel_path', model_file_name) @@ -86,12 +71,7 @@ def driver(argv: argparse.Namespace, proto_file_name: str, model_file_name: str, graph.graph['layout'] = 'NCHW' graph.graph['cmd_params'] = argv graph.graph['fw'] = 'caffe' - graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 4 - - extract_node_attrs(graph, lambda node: (True, common_caffe_fields(node))) - - log.debug("After adding specific nodes for outputs") - print_graph_stat(graph) + graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 5 custom_layers_map = custom_layers_mapping.load_layers_xml(custom_layers_mapping_path) custom_layers_mapping.update_extractors( @@ -100,76 +80,16 @@ def driver(argv: argparse.Namespace, proto_file_name: str, model_file_name: str, argv.disable_omitting_optional if hasattr(argv, 'disable_omitting_optional') else False, argv.enable_flattening_nested_params if hasattr(argv, 'enable_flattening_nested_params') else False ) - extract_node_attrs(graph, lambda node: caffe_extractor(node, check_for_duplicates(caffe_type_extractors))) - log.debug("After extract_node_attr") - print_graph_stat(graph) - - packed_user_shapes, packed_outputs, freeze_placeholder = user_data_repack(graph, user_shapes, outputs, argv.freeze_placeholder_with_value) - if argv.freeze_placeholder_with_value is not None: - FreezePlaceholderValue.enabled = True - FreezePlaceholderValue.replacement_dict = freeze_placeholder - class_registration.update_registration([FrontReplacementSubgraph]) - output_op_nodes = add_output_ops(graph, packed_outputs) - input_op_nodes = add_input_ops(graph, packed_user_shapes, True) - override_placeholder_shapes(graph, packed_user_shapes) 
- override_batch(graph, argv.batch) - graph_clean_up(graph) - check_empty_graph(graph, 'add_output_ops and add_input_ops') + # --------------------------------- LOAD END ------------------------------------------------------ class_registration.apply_replacements(graph, class_registration.ClassType.FRONT_REPLACER) - - graph = create_tensor_nodes(graph) - - log.debug("After create_tensor_nodes") - print_graph_stat(graph) - - remove_op_nodes(graph, {'op': 'Identity'}) - remove_output_ops(graph) - graph_clean_up(graph) - - log.debug("After removing specific nodes for output") - print_graph_stat(graph) - - # you need to pass required network outputs here - # but we don't have a way yet, so just passing all discovered sinks - mark_outputs(graph) - graph_clean_up(graph) - log.debug("After graph_cleanup") - print_graph_stat(graph) - - graph = partial_infer(graph) - log.debug("After partial_infer") - print_graph_stat(graph) - check_empty_graph(graph, 'partial_infer') - duplicate_shared_weights(graph) - - input_op_nodes = add_input_ops(graph, packed_user_shapes, False) - graph_clean_up(graph) - check_empty_graph(graph, 'add_input_ops') - scale_input(graph, scale) - - add_mean_scale_values(graph, mean_scale_values) - - log.debug("Split multi input convolutions") - convert_multi_input_conv(graph) - - graph_clean_up(graph) - log.debug("After graph_cleanup") - print_graph_stat(graph) - - remove_op_nodes(graph, {'op': 'Dropout'}) - remove_op_nodes(graph, {'phase': 0}) - graph_clean_up(graph) - class_registration.apply_replacements(graph, class_registration.ClassType.MIDDLE_REPLACER) - mean_to_avgpool(graph) - # Mark nodes with attr 'can_be_fused': False to disable fusing for specified nodes mark_unfused_nodes(graph, argv.finegrain_fusing) - #need this pass even without fusing to convert scale with 2 inputs + # need this pass even without fusing to convert scale with 2 inputs convert_scale_shift_to_mul_add(graph) graph_clean_up(graph) @@ -190,12 +110,12 @@ def driver(argv: argparse.Namespace, proto_file_name: str, model_file_name: str, convert_matmul_to_fully_connected(graph) batch_norm_fuse(graph) convert_mul_add_to_power(graph) - convert_add_to_scaleshift(graph) # scale = 1 - convert_mul_to_scaleshift(graph) # biases = 0 - graph_clean_up(graph) + convert_add_or_mul_to_scaleshift(graph) # scale = 1 + graph_clean_up(graph) + log.debug("After graph_cleanup") - print_graph_stat(graph) + graph.print_graph_stat() if argv.reverse_input_channels: reverse_input_channels(graph) @@ -220,6 +140,11 @@ def driver(argv: argparse.Namespace, proto_file_name: str, model_file_name: str, class_registration.apply_replacements(graph, class_registration.ClassType.BACK_REPLACER) + remove_const_ops(graph) + CreateConstNodesReplacement().find_and_replace_pattern(graph) + + remove_output_ops(graph) + prepare_emit_ir(graph=graph, data_type=argv.data_type, output_dir=output_dir, output_model_name=output_model_name, mean_data=mf, input_names=input_names, diff --git a/model-optimizer/mo/pipeline/common.py b/model-optimizer/mo/pipeline/common.py index 7c21c90..6d4b94c 100644 --- a/model-optimizer/mo/pipeline/common.py +++ b/model-optimizer/mo/pipeline/common.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,15 +14,14 @@ limitations under the License. 
""" +import logging as log import os from operator import itemgetter -import logging as log import networkx as nx -from mo.back.ie_ir_ver_2.emitter import port_renumber, serialize_constants, generate_ie_ir, serialize_mean_image, \ - create_const_nodes -from mo.graph.graph import Node, unique_id +from mo.back.ie_ir_ver_2.emitter import port_renumber, serialize_constants, generate_ie_ir, serialize_mean_image +from mo.graph.graph import Node, Graph from mo.middle.passes import tensor_names, convert_data_type from mo.utils.error import Error @@ -62,7 +61,7 @@ def get_fw_tensor_debug_info(node: Node): return node.soft_get('fw_tensor_debug_info') -def get_sorted_outputs(graph: nx.MultiDiGraph): +def get_sorted_outputs(graph: Graph): outputs = [] outputs_for_sort = {} for node in graph.nodes(): @@ -85,7 +84,7 @@ def get_sorted_outputs(graph: nx.MultiDiGraph): return [Node(graph, key) for key, value in sorted(outputs_for_sort.items(), key=itemgetter(1))] -def collect_sub_graphs(graph: nx.MultiDiGraph): +def collect_sub_graphs(graph: Graph): ''' Go over all nodes and sub_graphs in the graph recursively; returns all found sub-graphs. ''' result = [] for node in graph.nodes(): @@ -97,14 +96,14 @@ def collect_sub_graphs(graph: nx.MultiDiGraph): return result -def relabel_nodes_inplace_safe(graph: nx.MultiDiGraph, new_labels: dict): +def relabel_nodes_inplace_safe(graph: Graph, new_labels: dict): ''' Safely relabels graph in-place without graph copy. Safity in this place means that it is guarantied that there won't be collisions during relabiling process. ''' # Relabel nodes in two stages - intermediate_map = {node: unique_id(graph, '__relabel__{}__'.format(str(i))) for i, node in enumerate(graph.nodes())} + intermediate_map = {node: graph.unique_id('__relabel__{}__'.format(str(i))) for i, node in enumerate(graph.nodes())} final_map = {dst: new_labels[src] for src, dst in intermediate_map.items()} assert len(set(intermediate_map.keys()).intersection(set(intermediate_map.values()))) == 0 assert len(set(final_map.keys()).intersection(set(final_map.values()))) == 0 @@ -112,11 +111,9 @@ def relabel_nodes_inplace_safe(graph: nx.MultiDiGraph, new_labels: dict): nx.relabel_nodes(graph, final_map, copy=False) -def prepare_emit_ir(graph: nx.MultiDiGraph, data_type: str, output_dir: str, output_model_name: str, +def prepare_emit_ir(graph: Graph, data_type: str, output_dir: str, output_model_name: str, mean_data: [list, None] = None, input_names: list = [], meta_info: dict = dict()): - for sub_graph in [graph] + collect_sub_graphs(graph): - create_const_nodes(sub_graph, start_data_nodes_are_not_allowed=(sub_graph == graph)) op_order, data_order = determined_sort(get_sorted_outputs(sub_graph)) mapping = {v: u for u, v in enumerate(op_order)} mapping.update({v: u for u, v in enumerate(data_order, start=len(sub_graph))}) diff --git a/model-optimizer/mo/pipeline/common_test.py b/model-optimizer/mo/pipeline/common_test.py index a877700..8ee313f 100644 --- a/model-optimizer/mo/pipeline/common_test.py +++ b/model-optimizer/mo/pipeline/common_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo/pipeline/kaldi.py b/model-optimizer/mo/pipeline/kaldi.py index fcb3faa..e86b794 100644 --- a/model-optimizer/mo/pipeline/kaldi.py +++ b/model-optimizer/mo/pipeline/kaldi.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,25 +14,27 @@ limitations under the License. """ import logging as log + import numpy as np +from extensions.back.CreateConstNodes import CreateConstNodesReplacement from extensions.back.kaldi_remove_memory_output import KaldiRemoveMemoryOutputBackReplacementPattern from extensions.back.remove_last_softmax_pattern import RemoveLastSoftMaxPattern from extensions.front.kaldi.eliminate_redundant_reshape import EliminateRedundantReshape from extensions.front.kaldi.fuse_repeated_reshape import FuseRepeatedReshapes from extensions.middle.EltwiseChecker import EltwiseChecker from mo.front.common.register_custom_ops import update_extractors_with_extensions -from mo.front.extractor import create_tensor_nodes, extract_node_attrs, add_output_ops, remove_output_ops +from mo.front.extractor import extract_node_attrs, remove_output_ops from mo.front.kaldi.extractor import kaldi_extractor, kaldi_type_extractors from mo.front.kaldi.loader.loader import load_kaldi_model, read_counts_file +from mo.graph.graph import Node +from mo.middle.passes.eliminate import graph_clean_up, remove_const_ops +from mo.middle.passes.infer import partial_infer +from mo.pipeline.common import prepare_emit_ir from mo.utils import class_registration from mo.utils.cli_parser import get_meta_info from mo.utils.error import Error from mo.utils.find_inputs import find_outputs -from mo.graph.graph import print_graph_stat, Node, check_empty_graph -from mo.middle.passes.eliminate import graph_clean_up -from mo.middle.passes.infer import override_placeholder_shapes, partial_infer, mark_outputs, override_batch -from mo.pipeline.common import prepare_emit_ir from mo.utils.utils import refer_to_faq_msg @@ -92,14 +94,13 @@ def apply_biases_to_last_layer(graph, counts): biases_node = target_node.in_nodes()[2] # first - input, second - weights, third - biases if biases_node.value is not None: - biases_node.value = np.subtract(biases_node.value, counts) + biases_node.value = np.subtract(biases_node.value, counts) # pylint: disable=assignment-from-no-return else: biases_node.value = counts * -1 biases_node.shape = counts.shape -def driver(argv, input_model, output_model_name, outputs, output_dir, scale, placeholder_shapes=None, - mean_scale_values=()): +def driver(argv, input_model, output_model_name, output_dir): meta_info = get_meta_info(argv) EltwiseChecker.enabled = False @@ -109,51 +110,22 @@ def driver(argv, input_model, output_model_name, outputs, output_dir, scale, pla except Exception as e: raise Error('Model Optimizer is not able to read Kaldi model {}. 
'.format(input_model) + refer_to_faq_msg(91)) from e - check_empty_graph(graph, 'load_kaldi_nnet_model') + graph.check_empty_graph('load_kaldi_nnet_model') graph.graph['cmd_params'] = argv graph.graph['fw'] = 'kaldi' - graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 4 - + graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 5 update_extractors_with_extensions(kaldi_type_extractors) - extract_node_attrs(graph, lambda node: kaldi_extractor(node)) + # --------------------------------- LOAD END ------------------------------------------------------ class_registration.apply_replacements(graph, class_registration.ClassType.FRONT_REPLACER) - output_op_nodes = add_output_ops(graph, outputs) # TODO pass real outputs instead of None - log.debug("After adding specific nodes for outputs") - print_graph_stat(graph) - - check_empty_graph(graph, 'add_output_ops') - create_tensor_nodes(graph) - - graph_clean_up(graph) - log.debug("After removing specific nodes for output") - print_graph_stat(graph) - - override_placeholder_shapes(graph, placeholder_shapes) - override_batch(graph, argv.batch) - - graph_clean_up(graph) - log.debug("After setting input shapes") - print_graph_stat(graph) - graph_clean_up(graph) - remove_output_ops(graph) - log.debug("After removing specific nodes for output") - print_graph_stat(graph) - - # You need to pass required network outputs here - # but we don't have a way yet, so just passing all discovered sinks - mark_outputs(graph) - graph_clean_up(graph) - log.debug("After graph_cleanup") - print_graph_stat(graph) graph = partial_infer(graph) # The order is intentional, firstly eliminate repeated, then remove redundant FuseRepeatedReshapes().find_and_replace_pattern(graph) EliminateRedundantReshape().find_and_replace_pattern(graph) - check_empty_graph(graph, 'partial_infer') + graph.check_empty_graph('partial_infer') if argv.counts: try: counts = read_counts_file(argv.counts) @@ -167,9 +139,15 @@ def driver(argv, input_model, output_model_name, outputs, output_dir, scale, pla RemoveLastSoftMaxPattern().find_and_replace_pattern(graph) graph_clean_up(graph) log.debug("After removing softmax") - print_graph_stat(graph) + graph.print_graph_stat() # Intentionally after all transformations KaldiRemoveMemoryOutputBackReplacementPattern().find_and_replace_pattern(graph) + + remove_const_ops(graph) + CreateConstNodesReplacement().find_and_replace_pattern(graph) + + remove_output_ops(graph) + prepare_emit_ir(graph, argv.data_type, output_dir, output_model_name, meta_info=meta_info) return 0 diff --git a/model-optimizer/mo/pipeline/kaldi_test.py b/model-optimizer/mo/pipeline/kaldi_test.py index 2fe1833..3e08bf3 100644 --- a/model-optimizer/mo/pipeline/kaldi_test.py +++ b/model-optimizer/mo/pipeline/kaldi_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
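The Kaldi driver above now ends with the same four-step epilogue that this patch adds to the Caffe driver (and, below, to the MXNet and ONNX drivers). A hypothetical helper naming the shared sequence, using only imports that appear in this patch; the real drivers inline these calls, and the Caffe one additionally passes mean data and input names to prepare_emit_ir:

from extensions.back.CreateConstNodes import CreateConstNodesReplacement
from mo.front.extractor import remove_output_ops
from mo.middle.passes.eliminate import remove_const_ops
from mo.pipeline.common import prepare_emit_ir


def emit_ir_epilogue(graph, argv, output_dir, output_model_name, meta_info):
    # Replace Const ops by IR-ready constant nodes, drop the artificial
    # output ops, then serialize the IR.
    remove_const_ops(graph)
    CreateConstNodesReplacement().find_and_replace_pattern(graph)
    remove_output_ops(graph)
    prepare_emit_ir(graph=graph, data_type=argv.data_type, output_dir=output_dir,
                    output_model_name=output_model_name, meta_info=meta_info)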
@@ -29,20 +29,17 @@ class TestKaldiPipeline(unittest.TestCase): 'weights': {'value': None, 'kind': 'data'}, 'biases': {'value': np.zeros(10), 'kind': 'data'}, 'sc': {'op': 'ScaleShift', 'kind': 'op'}, - 'output': {'kind': 'data'} + 'output': {'kind': 'data'}, + 'op_output': {'op': 'OpOutput', 'kind': 'op'} } graph = build_graph(nodes, [ ('input', 'sc'), ('weights', 'sc'), ('biases', 'sc'), - ('sc', 'output') - ], - { - 'output': { - 'is_output': True - } - }) + ('sc', 'output'), + ('output', 'op_output') + ]) counts = -0.5 * np.ones(10) apply_biases_to_last_layer(graph, counts) sc_node = Node(graph, 'sc') @@ -53,20 +50,17 @@ class TestKaldiPipeline(unittest.TestCase): 'weights': {'kind': 'data'}, 'biases': {'value': None, 'shape': None, 'kind': 'data'}, 'fc': {'op': 'FullyConnected', 'kind': 'op'}, - 'output': {'kind': 'data'} + 'output': {'kind': 'data'}, + 'op_output': {'op': 'OpOutput', 'kind': 'op'} } graph = build_graph(nodes, [ ('input', 'fc'), ('weights', 'fc'), ('biases', 'fc'), - ('fc', 'output') - ], - { - 'output': { - 'is_output': True - } - }) + ('fc', 'output'), + ('output', 'op_output') + ]) counts = -0.5 * np.ones(10) apply_biases_to_last_layer(graph, counts) fc_node = Node(graph, 'fc') @@ -79,7 +73,8 @@ class TestKaldiPipeline(unittest.TestCase): 'fc': {'op': 'FullyConnected', 'kind': 'op'}, 'data': {'kind': 'data'}, 'softmax': {'op': 'SoftMax', 'kind': 'op'}, - 'output': {'kind': 'data'} + 'output': {'kind': 'data'}, + 'op_output': {'op': 'OpOutput', 'kind': 'op'} } graph = build_graph(nodes, [ @@ -88,13 +83,9 @@ class TestKaldiPipeline(unittest.TestCase): ('biases', 'fc'), ('fc', 'data'), ('data', 'softmax'), - ('softmax', 'output') - ], - { - 'output': { - 'is_output': True - } - }) + ('softmax', 'output'), + ('output', 'op_output') + ]) counts = -0.5 * np.ones(10) apply_biases_to_last_layer(graph, counts) fc_node = Node(graph, 'fc') diff --git a/model-optimizer/mo/pipeline/mx.py b/model-optimizer/mo/pipeline/mx.py index 03ac18f..e382cd6 100644 --- a/model-optimizer/mo/pipeline/mx.py +++ b/model-optimizer/mo/pipeline/mx.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. """ +from extensions.back.CreateConstNodes import CreateConstNodesReplacement +from extensions.front.restore_ports import RestorePorts +from mo.middle.pattern_match import for_graph_and_each_sub_graph_recursively from mo.utils.error import Error, FrameworkError from mo.utils.utils import refer_to_faq_msg @@ -22,31 +25,23 @@ except ImportError: raise Error('Module mxnet was not found. Please install appropriate version of mxnet via install_prerequisites ' 'script.' 
+ refer_to_faq_msg(52)) -import logging as log - -import numpy as np import argparse -import networkx as nx -from mo.front.extractor import add_output_ops, extract_node_attrs, create_tensor_nodes, \ - add_input_ops, remove_output_ops, user_data_repack +from mo.front.extractor import extract_node_attrs, remove_output_ops from mo.front.mxnet.extractor import mxnet_op_extractor from mo.front.mxnet.loader import symbol2nx, load_symbol_def from mo.middle.passes.fusing.decomposition import convert_batch_norm, convert_scale_shift_to_mul_add from mo.middle.passes.conv import convert_muladd_to_scaleshift_or_power, \ - convert_add_to_scaleshift, convert_mul_to_scaleshift, fuse_pad -from mo.middle.passes.eliminate import graph_clean_up, remove_op_nodes + convert_add_or_mul_to_scaleshift, fuse_pad +from mo.middle.passes.eliminate import graph_clean_up, remove_const_ops from mo.middle.passes.fusing.fuse_linear_ops import fuse_linear_ops from mo.middle.passes.fusing.fuse_linear_seq import fuse_mul_add_sequence from mo.middle.passes.fusing.mark_unfused_nodes import mark_unfused_nodes -from mo.middle.passes.shared_weights_duplication import duplicate_shared_weights from mo.middle.passes.fusing.resnet_optimization import stride_optimization -from mo.middle.passes.infer import mark_outputs, override_placeholder_shapes, partial_infer, add_mean_scale_values, \ - scale_input, convert_mul_add_to_power +from mo.middle.passes.infer import convert_mul_add_to_power from mo.middle.passes.mean_scale_values import move_scaleshift_to_preprocess from mo.middle.passes.shape import reverse_input_channels from mo.pipeline.common import prepare_emit_ir -from mo.graph.graph import create_edge, Node, print_graph_stat, check_empty_graph from mo.front.mxnet.nd_to_params import save_params_file from mo.front.common.register_custom_ops import update_extractors_with_extensions from mo.front.mxnet.extractor import mxnet_op_extractors @@ -55,48 +50,7 @@ from mo.utils.cli_parser import get_meta_info from extensions.middle.EltwiseInputNormalization import EltwiseInputNormalize -def add_input_data_to_prior_boxes(graph: nx.MultiDiGraph, input_names: str = ''): - """ - PriorBox layer has data input unlike mxnet. - Need to add data input to _contrib_MultiBoxPrior for - for correct conversion to PriorBox layer. - - Parameters - ---------- - graph : nx.MultiDiGraph - Graph with loaded model. - """ - if not input_names: - input_names = ('data',) - else: - input_names = input_names.split(',') - - input_nodes = {} - for node in graph.nodes(): - node = Node(graph, node) - if node.has_valid('op') and node.name in input_names: - input_nodes.update({node.id: node}) - - if len(input_nodes) > 0: - for node in graph.nodes(): - node = Node(graph, node) - if node.has_valid('op') and node.op == '_contrib_MultiBoxPrior': - create_edge(list(input_nodes.values())[0], node, out_port=0, in_port=1) - - -#TODO Remove the func after 'add_output_ops' will be moved to front replacer. 
-def check_softmax_node_inputs(graph: nx.MultiDiGraph): - for i, attrs in list(graph.nodes(data=True)): - if 'op' in attrs and attrs['op'] == 'SoftMax': - node = Node(graph, i) - if len(node.in_nodes()) > 1: - graph.remove_node(node.in_node(1).id) - - -def driver(argv: argparse.Namespace, input_model: str, output_model_name: str, outputs: list, output_dir: str, - scale: float, - placeholder_shapes: [None, list, np.array] = None, - mean_scale_values: [dict, list] = ()): +def driver(argv: argparse.Namespace, input_model: str, output_model_name: str, output_dir: str): meta_info = get_meta_info(argv) try: @@ -118,61 +72,20 @@ def driver(argv: argparse.Namespace, input_model: str, output_model_name: str, o update_extractors_with_extensions(mxnet_op_extractors) graph = symbol2nx(model_nodes, model_params, argv.input) - check_empty_graph(graph, 'symbol2nx. It may happen due to problems with loaded model') + graph.check_empty_graph('symbol2nx. It may happen due to problems with loaded model') graph.__setattr__('name', output_model_name) graph.graph['layout'] = 'NCHW' graph.graph['cmd_params'] = argv graph.graph['fw'] = 'mxnet' graph.graph['feature_dim'] = 1 if graph.graph['layout'] == 'NCHW' else 3 - graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 4 - graph = extract_node_attrs(graph, mxnet_op_extractor) - check_softmax_node_inputs(graph) - - user_shapes, packed_outputs, _ = user_data_repack(graph, placeholder_shapes, outputs, None) - output_op_nodes = add_output_ops(graph, packed_outputs) - input_op_nodes = add_input_ops(graph, user_shapes, True) - - try: - override_placeholder_shapes(graph, user_shapes, argv.batch) - except ValueError as err: - raise Error( - 'The following error happened while processing input shapes: {}. ' + - refer_to_faq_msg(54), - str(err) - ) from err - check_empty_graph(graph, 'add_output_ops and add_input_ops') + graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 5 + extract_node_attrs(graph, mxnet_op_extractor) + # --------------------------------- LOAD END ------------------------------------------------------ class_registration.apply_replacements(graph, class_registration.ClassType.FRONT_REPLACER) - add_input_data_to_prior_boxes(graph, argv.input) - - graph = create_tensor_nodes(graph) - - graph_clean_up(graph) - remove_output_ops(graph) - mark_outputs(graph) - remove_output_ops(graph) - - graph_clean_up(graph) - - log.debug("After removing specific nodes for output") - - print_graph_stat(graph) - - graph = partial_infer(graph) - graph_clean_up(graph) - check_empty_graph(graph, 'partial_infer') - - duplicate_shared_weights(graph) - - scale_input(graph, scale) - add_mean_scale_values(graph, mean_scale_values) - - remove_op_nodes(graph, {'identity': True}) - - graph_clean_up(graph) - class_registration.apply_replacements(graph, class_registration.ClassType.MIDDLE_REPLACER) + fuse_pad(graph) # Mark nodes with attr 'can_be_fused': False to disable fusing for specified nodes @@ -205,8 +118,9 @@ def driver(argv: argparse.Namespace, input_model: str, output_model_name: str, o graph_clean_up(graph) convert_mul_add_to_power(graph) - convert_add_to_scaleshift(graph) # scale = 1 - convert_mul_to_scaleshift(graph) # biases = 0 + graph_clean_up(graph) + convert_add_or_mul_to_scaleshift(graph) # scale = 1 + graph_clean_up(graph) if argv.reverse_input_channels: reverse_input_channels(graph) @@ -220,6 +134,11 @@ def driver(argv: argparse.Namespace, input_model: str, output_model_name: str, o class_registration.apply_replacements(graph, 
class_registration.ClassType.BACK_REPLACER) + for_graph_and_each_sub_graph_recursively(graph, remove_const_ops) + CreateConstNodesReplacement().find_and_replace_pattern(graph) + + for_graph_and_each_sub_graph_recursively(graph, remove_output_ops) + prepare_emit_ir(graph=graph, data_type=argv.data_type, output_dir=output_dir, output_model_name=output_model_name, meta_info=meta_info) return 0 diff --git a/model-optimizer/mo/pipeline/onnx.py b/model-optimizer/mo/pipeline/onnx.py index 88fd356..d41ea4d 100644 --- a/model-optimizer/mo/pipeline/onnx.py +++ b/model-optimizer/mo/pipeline/onnx.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. """ - from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -22,48 +21,40 @@ from __future__ import unicode_literals import argparse import logging as log -import numpy as np - +from extensions.back.CreateConstNodes import CreateConstNodesReplacement +from extensions.middle.AddQuantizeFuse import AddQuantizeFuse from extensions.middle.EltwiseInputNormalization import EltwiseInputNormalize -from extensions.middle.NormalizeFullyConnected import NormalizeFullyConnected -from mo.front.common.register_custom_ops import check_for_duplicates -from mo.front.common.register_custom_ops import update_extractors_with_extensions -from mo.front.extractor import add_output_ops, add_input_ops, \ - extract_node_attrs, create_tensor_nodes, remove_output_ops, user_data_repack -from mo.front.onnx.extractor import common_onnx_fields, onnx_op_extractor, onnx_op_extractors +from extensions.middle.MulQuantizeFuse import MulQuantizeFuse +from mo.front.common.register_custom_ops import update_extractors_with_extensions, check_for_duplicates +from mo.front.extractor import extract_node_attrs, remove_output_ops +from mo.front.onnx.extractor import onnx_op_extractor, onnx_op_extractors from mo.front.onnx.loader import load_onnx_model, protobuf2nx -from mo.middle.passes.conv import convert_add_to_scaleshift, convert_gemm_to_fully_connected, \ - convert_muladd_to_scaleshift_or_power, fuse_pad, convert_dilated_convolution, convert_mul_to_scaleshift -from mo.middle.passes.eliminate import graph_clean_up, remove_op_nodes, remove_useless_split +from mo.middle.passes.conv import convert_add_or_mul_to_scaleshift, convert_muladd_to_scaleshift_or_power, fuse_pad +from mo.middle.passes.eliminate import graph_clean_up_onnx, remove_const_ops from mo.middle.passes.fusing.decomposition import convert_batch_norm, convert_scale_shift_to_mul_add from mo.middle.passes.fusing.fuse_grouped_conv import grouped_convolutions_fusing from mo.middle.passes.fusing.fuse_linear_ops import fuse_linear_ops from mo.middle.passes.fusing.fuse_linear_seq import fuse_mul_add_sequence from mo.middle.passes.fusing.mark_unfused_nodes import mark_unfused_nodes -from mo.middle.passes.infer import scale_input, override_placeholder_shapes, partial_infer, convert_mul_add_to_power, \ - update_fully_connected_shapes, add_mean_scale_values, override_batch +from mo.middle.passes.infer import convert_mul_add_to_power from mo.middle.passes.mean_scale_values import move_scaleshift_to_preprocess from mo.middle.passes.shape import convert_reshape, reverse_input_channels, \ 
fuse_sequence_of_reshapes, merge_nodes_permutations, permute_data_nodes_attrs, permute_op_nodes_attrs +from mo.middle.pattern_match import for_graph_and_each_sub_graph_recursively from mo.pipeline.common import prepare_emit_ir from mo.utils import class_registration from mo.utils.cli_parser import get_meta_info from mo.utils.error import Error from mo.utils.utils import refer_to_faq_msg -from mo.graph.graph import check_empty_graph - -def driver(argv: argparse.Namespace, model_file_name: str, output_model_name: str, outputs: list, output_dir: str, - scale: float, - user_shapes: [None, list, np.array] = None, - mean_scale_values: [dict, list] = ()): +def driver(argv: argparse.Namespace, model_file_name: str, output_model_name: str, output_dir: str): meta_info = get_meta_info(argv) model_proto = load_onnx_model(model_file_name) model_graph = model_proto.graph # pylint: disable=no-member - #print(model_graph) - #assert len(model_graph) == 1, "An ONNX model contains more than 1 graph: unsupported" + # print(model_graph) + # assert len(model_graph) == 1, "An ONNX model contains more than 1 graph: unsupported" log.debug("Number of nodes in graph_def: {}".format(len(model_graph.node))) log.debug("Number of all input ports (not true inputs) in graph_def: {}".format(len(model_graph.input))) log.debug("Number of initializers in graph_def: {}".format(len(model_graph.initializer))) @@ -73,15 +64,13 @@ def driver(argv: argparse.Namespace, model_file_name: str, output_model_name: st try: graph = protobuf2nx(model_proto) log.debug("Number of nodes in NX graph: {}".format(graph.number_of_nodes())) - graph.__setattr__('name', output_model_name if output_model_name else model_proto.graph.name) # pylint: disable=no-member + graph.__setattr__('name', + output_model_name if output_model_name else model_proto.graph.name) # pylint: disable=no-member graph.graph['layout'] = 'NCHW' graph.graph['cmd_params'] = argv graph.graph['fw'] = 'onnx' graph.graph['feature_dim'] = 1 if graph.graph['layout'] == 'NCHW' else 3 - graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 4 - # extract basic attributes earlier to enable some passes that relies on them before full attribute - # extractor is called - extract_node_attrs(graph, lambda node: (True, common_onnx_fields(node))) + graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 5 except Exception as e: raise Error( 'Cannot pre-process ONNX graph after reading from model file "{}". ' \ @@ -90,59 +79,15 @@ def driver(argv: argparse.Namespace, model_file_name: str, output_model_name: st model_file_name, str(e) ) from e - check_empty_graph(graph, 'protobuf2nx. It may happen due to problems with loaded model') - packed_user_shapes, packed_outputs, _ = user_data_repack(graph, user_shapes, outputs, None) - - output_op_nodes = add_output_ops(graph, packed_outputs) - input_op_nodes = add_input_ops(graph, packed_user_shapes, True) - - # this call of 'graph_clean_up' removes child nodes of outputs which is useful when custom output is specified - graph_clean_up(graph) - check_empty_graph(graph, 'add_output_ops and add_input_ops') + graph.check_empty_graph('protobuf2nx. 
It may happen due to problems with loaded model') extract_node_attrs(graph, lambda node: onnx_op_extractor(node, check_for_duplicates(onnx_op_extractors))) + # --------------------------------- LOAD END ------------------------------------------------------ class_registration.apply_replacements(graph, class_registration.ClassType.FRONT_REPLACER) - - create_tensor_nodes(graph) - graph_clean_up(graph) - - override_placeholder_shapes(graph, packed_user_shapes) - override_batch(graph, argv.batch) - - graph_clean_up(graph) - remove_op_nodes(graph, {'op': 'Identity'}) - - graph_clean_up(graph) - - remove_output_ops(graph) - - partial_infer(graph) - graph_clean_up(graph) - check_empty_graph(graph, 'partial_infer') - - input_op_nodes = add_input_ops(graph, packed_user_shapes, False) - graph_clean_up(graph) - check_empty_graph(graph, 'add_input_ops') - #change_placeholders_types_to_FP32(graph) - - scale_input(graph, scale) - add_mean_scale_values(graph, mean_scale_values) - - convert_dilated_convolution(graph) - graph_clean_up(graph) - - graph_clean_up(graph) - - remove_op_nodes(graph, {'op': 'Identity'}) - remove_useless_split(graph) - class_registration.apply_replacements(graph, class_registration.ClassType.MIDDLE_REPLACER) - convert_gemm_to_fully_connected(graph) - NormalizeFullyConnected().find_and_replace_pattern(graph) - fuse_pad(graph) - graph_clean_up(graph) + graph_clean_up_onnx(graph) # Mark nodes with attr 'can_be_fused': False to disable fusing for specified nodes mark_unfused_nodes(graph, argv.finegrain_fusing) @@ -150,50 +95,54 @@ def driver(argv: argparse.Namespace, model_file_name: str, output_model_name: st # Converting FusedBatchNorm layer to Mul->Add->Mul->Add sequence # IE doesn't support BN with 4 inputs, so we have to split it to two ScaleShift convert_batch_norm(graph) - graph_clean_up(graph) + graph_clean_up_onnx(graph) if not argv.disable_fusing: # Converting ScaleShift layer to Mul->Add convert_scale_shift_to_mul_add(graph) - graph_clean_up(graph) + graph_clean_up_onnx(graph) # Fusing the sequences of Mul/Add operations fuse_mul_add_sequence(graph) - graph_clean_up(graph) + graph_clean_up_onnx(graph) # Fusing linear operation to Convolution fuse_linear_ops(graph) - graph_clean_up(graph) + graph_clean_up_onnx(graph) if not argv.disable_gfusing: grouped_convolutions_fusing(graph) - graph_clean_up(graph) + graph_clean_up_onnx(graph) if not argv.disable_fusing: fuse_linear_ops(graph) - graph_clean_up(graph) + graph_clean_up_onnx(graph) + + AddQuantizeFuse().find_and_replace_pattern(graph) + MulQuantizeFuse().find_and_replace_pattern(graph) convert_muladd_to_scaleshift_or_power(graph) - graph_clean_up(graph) + graph_clean_up_onnx(graph) convert_mul_add_to_power(graph) - graph_clean_up(graph) + graph_clean_up_onnx(graph) convert_reshape(graph) - convert_add_to_scaleshift(graph) # scale = 1 - convert_mul_to_scaleshift(graph) # biases = 0 + graph_clean_up_onnx(graph) + convert_add_or_mul_to_scaleshift(graph) # scale = 1 + graph_clean_up_onnx(graph) fuse_pad(graph) - graph_clean_up(graph) + graph_clean_up_onnx(graph) if argv.reverse_input_channels: reverse_input_channels(graph) if argv.move_to_preprocess: move_scaleshift_to_preprocess(graph) - graph_clean_up(graph) + graph_clean_up_onnx(graph) fuse_sequence_of_reshapes(graph) - graph_clean_up(graph) + graph_clean_up_onnx(graph) pattern = EltwiseInputNormalize() pattern.find_and_replace_pattern(graph) @@ -204,6 +153,12 @@ def driver(argv: argparse.Namespace, model_file_name: str, output_model_name: st 
class_registration.apply_replacements(graph, class_registration.ClassType.BACK_REPLACER) + for_graph_and_each_sub_graph_recursively(graph, remove_const_ops) + + CreateConstNodesReplacement().find_and_replace_pattern(graph) + + for_graph_and_each_sub_graph_recursively(graph, remove_output_ops) + prepare_emit_ir(graph=graph, data_type=argv.data_type, output_dir=output_dir, output_model_name=output_model_name, meta_info=meta_info) diff --git a/model-optimizer/mo/pipeline/tf.py b/model-optimizer/mo/pipeline/tf.py index f6e1503..07dc07f 100644 --- a/model-optimizer/mo/pipeline/tf.py +++ b/model-optimizer/mo/pipeline/tf.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,96 +15,50 @@ """ import argparse -import copy import logging as log -import networkx as nx -import numpy as np import tensorflow as tf +from extensions.back.CreateConstNodes import CreateConstNodesReplacement +from extensions.middle.LayoutChangeForConstantShapePaths import LayoutChangeForConstantShapePaths +from extensions.middle.ConcatOptimization import ConcatOptimization + try: import tensorflow.contrib except: pass # we try to import contrib for loading models that use contrib operations -import mo.front.tf.custom_subgraph_call as csc -from extensions.front.freeze_placeholder_value import FreezePlaceholderValue -from extensions.front.tf.basic_lstm_cell import BasicLSTMCell -from extensions.middle.AddIsCyclicAttribute import AddIsCyclicAttribute from extensions.middle.EltwiseInputNormalization import EltwiseInputNormalize -from extensions.middle.GemmResolver import GemmResolver -from extensions.middle.TensorIteratorBackEdge import BackEdgesMatching -from extensions.middle.TensorIteratorCondition import LoopConditionMatcher, \ - SimpleConditionMather # SimpleConditionMather -from extensions.middle.TensorIteratorConditionChecker import ConditionChecks -from extensions.middle.TensorIteratorInput import SmartInputMatcher, SimpleInputMatcher, BackEdgeSimpleInputMatcher -from extensions.middle.TensorIteratorMerge import TensorIteratorMerge -from extensions.middle.TensorIteratorOutput import SmartOutputMatcher -from extensions.middle.TensorIterator_utils import DeleteSelect -from mo.front.common.custom_replacement_registry import CustomReplacementRegistry -from mo.front.common.find_unsupported_ops import find_unsupported_ops +from mo.middle.passes.eliminate import remove_const_ops from mo.front.common.register_custom_ops import check_for_duplicates from mo.front.common.register_custom_ops import update_extractors_with_extensions -from mo.front.extractor import restore_edges, add_output_ops, add_input_ops, \ - extract_node_attrs, create_tensor_nodes, remove_output_ops, user_data_repack, remove_control_dependency_inputs -from mo.front.tf.change_placeholder_type import change_placeholders_types_to_FP32 -from mo.front.tf.extractor import get_tf_edges, common_tf_fields, tf_op_extractor, tf_op_extractors -from mo.front.tf.loader import load_tf_graph_def, protobuf2nx, variables_to_constants -from mo.front.tf.register_custom_ops import update_registration -from mo.front.tf.replacement import FrontReplacementFromConfigFileOp -from mo.graph.graph import check_empty_graph -from mo.middle.passes.conv import convert_add_to_scaleshift, convert_matmul_to_fully_connected, \ - convert_muladd_to_scaleshift_or_power, fuse_pad, transpose_fully_connected_weights, \ - 
convert_dilated_convolution, convert_mul_to_scaleshift, convert_nasnet -from mo.middle.passes.eliminate import remove_op_nodes, remove_useless_split, graph_clean_up_tf +from mo.front.extractor import restore_edges, extract_node_attrs, remove_output_ops, remove_control_dependency_inputs +from mo.front.tf.extractor import get_tf_edges, tf_op_extractor, tf_op_extractors +from mo.front.tf.loader import load_tf_graph_def, protobuf2nx +from mo.middle.passes.conv import convert_add_or_mul_to_scaleshift, convert_matmul_to_fully_connected, \ + convert_muladd_to_scaleshift_or_power, fuse_pad, transpose_fully_connected_weights +from mo.middle.passes.eliminate import graph_clean_up_tf from mo.middle.passes.fusing.decomposition import convert_batch_norm, convert_scale_shift_to_mul_add from mo.middle.passes.fusing.fuse_grouped_conv import grouped_convolutions_fusing from mo.middle.passes.fusing.fuse_linear_ops import fuse_linear_ops from mo.middle.passes.fusing.fuse_linear_seq import fuse_mul_add_sequence from mo.middle.passes.fusing.mark_unfused_nodes import mark_unfused_nodes -from mo.middle.passes.infer import scale_input, override_placeholder_shapes, partial_infer, convert_mul_add_to_power, \ - update_fully_connected_shapes, add_mean_scale_values, override_batch, check_for_cycle, delete_not_executable, \ - delete_control_flow_edges -from mo.middle.passes.l2normalization import l2_norm_to_norm +from mo.middle.passes.infer import convert_mul_add_to_power, update_fully_connected_shapes from mo.middle.passes.leaky_relu import convert_mul_eltwise_to_leaky_relu from mo.middle.passes.mean_scale_values import move_scaleshift_to_preprocess -from mo.middle.passes.pool import mean_to_avgpool from mo.middle.passes.shape import convert_squeeze, convert_reshape, reverse_input_channels, \ conv_flatten_concat, fuse_sequence_of_reshapes, repack_fully_connected_weights_nhwc_to_nchw, \ apply_nhwc_to_nchw_permutation, permute_data_nodes_attrs, permute_op_nodes_attrs, merge_nodes_permutations -from mo.middle.passes.shared_weights_duplication import duplicate_shared_weights -from mo.middle.pattern_match import for_each_sub_graph, for_graph_and_each_sub_graph_recursively +from mo.middle.pattern_match import for_graph_and_each_sub_graph_recursively from mo.pipeline.common import prepare_emit_ir from mo.utils import class_registration, tensorboard from mo.utils.cli_parser import get_meta_info -from mo.utils.custom_replacement_config import update_custom_replacement_config_file from mo.utils.error import Error from mo.utils.utils import refer_to_faq_msg -def need_to_repeat_conversion(graph: nx.MultiDiGraph): - """ Detects if another round of conversion is required for the entire graph. - - It traverses a given `graph` and all sub-graphs recursively and searches for - 'repeat_conversion' graph attribute. If at least one is found and its value is True, - this function returns True. 
- """ - result = False - - def check_for_repeat(graph: nx.MultiDiGraph): - if 'repeat_conversion' in graph.graph and graph.graph['repeat_conversion']: - nonlocal result - result = True - - for_graph_and_each_sub_graph_recursively(graph, check_for_repeat) - - return result - - -def tf2nx(argv: argparse.Namespace, model_file_name: str, output_model_name: str, outputs: list, output_dir: str, - scale: float, is_binary: bool, - user_shapes: [None, list, np.array] = None, - mean_scale_values: [dict, list] = ()): +def tf2nx(argv: argparse.Namespace, model_file_name: str, output_model_name: str, output_dir: str, + is_binary: bool): """ Convert TF GraphDef object to NetworkX representation. The resulting graph is still TF-specific and needs normalization passes to be applied. @@ -121,7 +75,7 @@ def tf2nx(argv: argparse.Namespace, model_file_name: str, output_model_name: str graph_def, variables_values = load_tf_graph_def(graph_file_name=model_file_name, is_binary=is_binary, checkpoint=argv.input_checkpoint, - user_output_node_names_list=outputs, + user_output_node_names_list=argv.output, model_dir=argv.saved_model_dir, meta_graph_file=argv.input_meta_graph, saved_model_tags=argv.saved_model_tags) @@ -150,25 +104,13 @@ def tf2nx(argv: argparse.Namespace, model_file_name: str, output_model_name: str graph.graph['layout'] = 'NCHW' if argv.disable_nhwc_to_nchw else 'NHWC' graph.graph['cmd_params'] = argv graph.graph['fw'] = 'tf' - graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 4 - - if graph.graph['ir_version'] == 2: - # When the deprecated IR version was requested, - # we configure only those phases that can lead to - # functional regressions in the version 2. - # BasicLSTMCell is one such transformation; when it is turned off, - # the body of TF basic_lstm_cell is converted as-is in a decomposed form, - # and should work in version 2. - BasicLSTMCell.enabled = False + graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 5 - # placeholder for request from a transformation pass to repeat the entire conversion - graph.graph['repeat_conversion'] = False + graph.graph['variables_values'] = variables_values + del variables_values graph = restore_edges(graph, get_tf_edges) graph = remove_control_dependency_inputs(graph) - # extract basic attributes earlier to enable some passes that relies on them before full attribute - # extractor is called - extract_node_attrs(graph, lambda node: (True, common_tf_fields(node))) except Exception as e: raise Error( 'Cannot pre-process TensorFlow graph after reading from model file "{}". ' \ @@ -178,257 +120,109 @@ def tf2nx(argv: argparse.Namespace, model_file_name: str, output_model_name: str str(e) ) from e - check_empty_graph(graph, 'protobuf2nx. It may happen due to problems with loaded model') + graph.check_empty_graph('protobuf2nx. 
It may happen due to problems with loaded model') + extract_node_attrs(graph, lambda node: tf_op_extractor(node, check_for_duplicates(tf_op_extractors))) - packed_user_shapes, packed_outputs, freeze_placeholder = user_data_repack(graph, user_shapes, outputs, - argv.freeze_placeholder_with_value) - if freeze_placeholder is not None: - FreezePlaceholderValue.enabled = True - FreezePlaceholderValue.replacement_dict = freeze_placeholder - update_registration() + # --------------------------------- LOAD END ------------------------------------------------------ + class_registration.apply_replacements(graph, class_registration.ClassType.FRONT_REPLACER) + class_registration.apply_replacements(graph, class_registration.ClassType.MIDDLE_REPLACER) - GemmResolver.enabled = False - - inputs = list(packed_user_shapes.keys()) if packed_user_shapes is not None and isinstance(packed_user_shapes, - dict) else None - graph.graph['inputs'] = inputs # save user defined inputs for other extensions - - output_op_nodes = add_output_ops(graph, packed_outputs, inputs=packed_user_shapes) - input_op_nodes = add_input_ops(graph, packed_user_shapes, True) - - # this call of 'graph_clean_up' removes child nodes of outputs which is useful when custom output is specified + fuse_pad(graph) graph_clean_up_tf(graph) - check_empty_graph(graph, 'add_output_ops and add_input_ops. It may happen due to absence of \'Placeholder\' layer ' - 'in the model') - - variables_to_constants(graph, variables_values) - del variables_values - graph_clean_up_tf(graph) - - if argv.tensorflow_custom_operations_config_update: - if update_custom_replacement_config_file(graph, argv.tensorflow_custom_operations_config_update): - return 0 - else: - return 1 - - unsupported_ops_to_offload_to_tf = list() - - MAX_ITERATIONS = 5 - cur_iteration = 0 - while cur_iteration < MAX_ITERATIONS: - graph_copy = copy.deepcopy(graph) # create a copy of graph for the case when some ops are unsupported - - if argv.tensorflow_subgraph_patterns is not None: - csc.replace_subgraph_calls(graph, argv.tensorflow_subgraph_patterns) - - if argv.tensorflow_operation_patterns is not None: - csc.offload_operations_to_tf(graph, argv.tensorflow_operation_patterns) - - if argv.offload_unsupported_operations_to_tf and len(unsupported_ops_to_offload_to_tf): - csc.offload_unsupported_operations_to_tf(graph, unsupported_ops_to_offload_to_tf) - - extract_node_attrs(graph, lambda node: tf_op_extractor(node, check_for_duplicates(tf_op_extractors))) - - if argv.tensorflow_use_custom_operations_config is not None: - registry = CustomReplacementRegistry() - registry.add_custom_replacement_description_from_config(argv.tensorflow_use_custom_operations_config) - - # automatically generate sub-classes for custom replacements that replace sub-graph with a single node - for replacement_desc in registry.get_all_replacements_descriptions(): - if replacement_desc.has('op'): - type('FrontReplacementFromConfigFileOp' + replacement_desc.op, (FrontReplacementFromConfigFileOp,), - {'replacement_id': replacement_desc.id}) - update_registration() - - override_placeholder_shapes(graph, packed_user_shapes) - - # the user shapes are used to convert TensorFlow Object Detection API models - graph.graph['user_shapes'] = packed_user_shapes - class_registration.apply_replacements(graph, class_registration.ClassType.FRONT_REPLACER) - - override_batch(graph, argv.batch) - - create_tensor_nodes(graph) - graph_clean_up_tf(graph) - - remove_output_ops(graph) - partial_infer(graph) - delete_control_flow_edges(graph) - 
- replacer = AddIsCyclicAttribute() - replacer.find_and_replace_pattern(graph) - - # TENSOR ITERATOR CREATING BEGINS - if graph.graph['is_cyclic']: - replacer = DeleteSelect() - replacer.find_and_replace_pattern(graph) - - replacer = SmartInputMatcher() - replacer.find_and_replace_pattern(graph) - - replacer = SmartOutputMatcher() - replacer.find_and_replace_pattern(graph) - - replacer = LoopConditionMatcher() - replacer.find_and_replace_pattern(graph) - - replacer = SimpleConditionMather() - replacer.find_and_replace_pattern(graph) - - replacer = BackEdgesMatching() - replacer.find_and_replace_pattern(graph) + convert_matmul_to_fully_connected(graph) - replacer = ConditionChecks() - replacer.find_and_replace_pattern(graph) + # Mark nodes with attr 'can_be_fused': False to disable fusing for specified nodes + for_graph_and_each_sub_graph_recursively(graph, lambda graph: mark_unfused_nodes(graph, argv.finegrain_fusing)) - delete_not_executable(graph) - graph_clean_up_tf(graph) - if graph.graph['is_cyclic']: - replacer = SimpleInputMatcher() - replacer.find_and_replace_pattern(graph) - - replacer = BackEdgeSimpleInputMatcher() - replacer.find_and_replace_pattern(graph) - - # Here will be optimizing path (ops after Enter and before body take out of body) - - replacer = TensorIteratorMerge() - replacer.find_and_replace_pattern(graph) - # TENSOR ITERATOR CREATING ENDS - - check_for_cycle(graph) + # Converting FusedBatchNorm layer to Mul->Add->Mul->Add sequence + # IE doesn't support BN with 4 inputs, so we have to split it to two ScaleShift + convert_batch_norm(graph) + graph_clean_up_tf(graph) + if not argv.disable_fusing: + # Converting ScaleShift layer to Mul->Add + for_graph_and_each_sub_graph_recursively(graph, convert_scale_shift_to_mul_add) for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf) - check_empty_graph(graph, 'partial_infer') - - csc.prepare_tf_call_nodes(graph) - graph_clean_up_tf(graph) - - duplicate_shared_weights(graph) - input_op_nodes = add_input_ops(graph, packed_user_shapes, False) - graph_clean_up_tf(graph) - check_empty_graph(graph, 'add_input_ops') - - change_placeholders_types_to_FP32(graph) - - scale_input(graph, scale) - add_mean_scale_values(graph, mean_scale_values) - - convert_dilated_convolution(graph) + # Fusing the sequences of Mul/Add operations + for_graph_and_each_sub_graph_recursively(graph, fuse_mul_add_sequence) for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf) - l2_norm_to_norm(graph) - graph_clean_up_tf(graph) - - remove_op_nodes(graph, {'identity': True}) - remove_useless_split(graph) - - class_registration.apply_replacements(graph, class_registration.ClassType.MIDDLE_REPLACER) - - mean_to_avgpool(graph) - convert_nasnet(graph) + # Fusing linear operation to Convolution + for_graph_and_each_sub_graph_recursively(graph, fuse_linear_ops) + for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf) - fuse_pad(graph) + if not argv.disable_gfusing: + grouped_convolutions_fusing(graph) graph_clean_up_tf(graph) + if not argv.disable_fusing: + fuse_linear_ops(graph) + graph_clean_up_tf(graph) - convert_matmul_to_fully_connected(graph) + # Converting Mul->Add to ScaleShift node + for_graph_and_each_sub_graph_recursively(graph, convert_muladd_to_scaleshift_or_power) + for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf) - # Mark nodes with attr 'can_be_fused': False to disable fusing for specified nodes - for_graph_and_each_sub_graph_recursively(graph, lambda graph: mark_unfused_nodes(graph, 
argv.finegrain_fusing)) + for_graph_and_each_sub_graph_recursively(graph, convert_mul_add_to_power) - # Converting FusedBatchNorm layer to Mul->Add->Mul->Add sequence - # IE doesn't support BN with 4 inputs, so we have to split it to two ScaleShift - convert_batch_norm(graph) - graph_clean_up_tf(graph) - - if not argv.disable_fusing: - # Converting ScaleShift layer to Mul->Add - for_graph_and_each_sub_graph_recursively(graph, convert_scale_shift_to_mul_add) - for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf) + # Need to eliminate dead nodes before doing update_fully_connected_shapes + # because update_fully_connected_shapes does partial inference and dead + # nodes will lead to sporadic failures. + for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf) + for_graph_and_each_sub_graph_recursively(graph, update_fully_connected_shapes) - # Fusing the sequences of Mul/Add operations - for_graph_and_each_sub_graph_recursively(graph, fuse_mul_add_sequence) - for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf) + for_graph_and_each_sub_graph_recursively(graph, convert_mul_eltwise_to_leaky_relu) + graph_clean_up_tf(graph) + for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf) - # Fusing linear operation to Convolution - for_graph_and_each_sub_graph_recursively(graph, fuse_linear_ops) - for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf) + for_graph_and_each_sub_graph_recursively(graph, fuse_pad) + for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf) - if not argv.disable_gfusing: - grouped_convolutions_fusing(graph) - graph_clean_up_tf(graph) - if not argv.disable_fusing: - fuse_linear_ops(graph) - graph_clean_up_tf(graph) + for_graph_and_each_sub_graph_recursively(graph, convert_reshape) + for_graph_and_each_sub_graph_recursively(graph, convert_squeeze) - # Converting Mul->Add to ScaleShift node - for_graph_and_each_sub_graph_recursively(graph, convert_muladd_to_scaleshift_or_power) - for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf) + for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf) - for_graph_and_each_sub_graph_recursively(graph, convert_mul_add_to_power) + for_graph_and_each_sub_graph_recursively(graph, convert_add_or_mul_to_scaleshift) # scale = 1 + for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf) - # Need to eliminate dead nodes before doing update_fully_connected_shapes - # because update_fully_connected_shapes does partial inference and dead - # nodes will lead to sporadic failures. 
- for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf) - for_graph_and_each_sub_graph_recursively(graph, update_fully_connected_shapes) + if argv.reverse_input_channels: + reverse_input_channels(graph) - for_graph_and_each_sub_graph_recursively(graph, convert_mul_eltwise_to_leaky_relu) + if argv.move_to_preprocess: + move_scaleshift_to_preprocess(graph) graph_clean_up_tf(graph) - for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf) - - for_graph_and_each_sub_graph_recursively(graph, fuse_pad) - for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf) - - for_graph_and_each_sub_graph_recursively(graph, convert_reshape) - for_graph_and_each_sub_graph_recursively(graph, convert_squeeze) - for_graph_and_each_sub_graph_recursively(graph, convert_add_to_scaleshift) # scale = 1 - for_graph_and_each_sub_graph_recursively(graph, convert_mul_to_scaleshift) # biases = 0 + fuse_sequence_of_reshapes(graph) - if argv.reverse_input_channels: - reverse_input_channels(graph) + pattern = EltwiseInputNormalize() + pattern.find_and_replace_pattern(graph) - if argv.move_to_preprocess: - move_scaleshift_to_preprocess(graph) - graph_clean_up_tf(graph) + conv_flatten_concat(graph) - for_graph_and_each_sub_graph_recursively(graph, fuse_sequence_of_reshapes) + if argv.enable_concat_optimization: + ConcatOptimization().find_and_replace_pattern(graph) - pattern = EltwiseInputNormalize() - pattern.find_and_replace_pattern(graph) + LayoutChangeForConstantShapePaths().find_and_replace_pattern(graph) + for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf) - conv_flatten_concat(graph) + for_graph_and_each_sub_graph_recursively(graph, apply_nhwc_to_nchw_permutation) + for_graph_and_each_sub_graph_recursively(graph, merge_nodes_permutations) + for_graph_and_each_sub_graph_recursively(graph, permute_data_nodes_attrs) + for_graph_and_each_sub_graph_recursively(graph, permute_op_nodes_attrs) - for_graph_and_each_sub_graph_recursively(graph, apply_nhwc_to_nchw_permutation) - for_graph_and_each_sub_graph_recursively(graph, merge_nodes_permutations) - for_graph_and_each_sub_graph_recursively(graph, permute_data_nodes_attrs) - for_graph_and_each_sub_graph_recursively(graph, permute_op_nodes_attrs) + for_graph_and_each_sub_graph_recursively(graph, repack_fully_connected_weights_nhwc_to_nchw) + for_graph_and_each_sub_graph_recursively(graph, transpose_fully_connected_weights) - for_graph_and_each_sub_graph_recursively(graph, repack_fully_connected_weights_nhwc_to_nchw) - for_graph_and_each_sub_graph_recursively(graph, transpose_fully_connected_weights) + for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf) - for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf) + class_registration.apply_replacements(graph, class_registration.ClassType.BACK_REPLACER) - if argv.offload_unsupported_operations_to_tf: - unsupported_ops_to_offload_to_tf = find_unsupported_ops(graph) - if len(unsupported_ops_to_offload_to_tf) == 0: - log.info('All operations are supported! 
Exit from the loop.') - if not need_to_repeat_conversion(graph): - break - else: - print('After {} iteration there are {} unsupported ops'.format(cur_iteration + 1, - len(unsupported_ops_to_offload_to_tf))) - else: - if not need_to_repeat_conversion(graph): - break - - graph = graph_copy - cur_iteration += 1 + for_graph_and_each_sub_graph_recursively(graph, remove_const_ops) + CreateConstNodesReplacement().find_and_replace_pattern(graph) - class_registration.apply_replacements(graph, class_registration.ClassType.BACK_REPLACER) + for_graph_and_each_sub_graph_recursively(graph, remove_output_ops) prepare_emit_ir(graph=graph, data_type=argv.data_type, output_dir=output_dir, output_model_name=output_model_name, meta_info=meta_info) diff --git a/model-optimizer/mo/utils/class_registration.py b/model-optimizer/mo/utils/class_registration.py index 8d4c834..0296c1d 100644 --- a/model-optimizer/mo/utils/class_registration.py +++ b/model-optimizer/mo/utils/class_registration.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,17 +15,55 @@ """ import logging as log +import os from enum import Enum import networkx as nx +from mo.graph.graph import Graph +from mo.middle.passes.eliminate import graph_clean_up_tf, graph_clean_up_onnx, graph_clean_up +from mo.middle.pattern_match import for_graph_and_each_sub_graph_recursively from mo.utils.error import Error from mo.utils.utils import refer_to_faq_msg -from mo.graph.graph import check_empty_graph _registered_classes_dict = {} +def _check_unique_ids(): + """ + Check that ids are unique for all registered replacements. + """ + unique_idxs = set() + for class_type, classes_set in _registered_classes_dict.items(): + for cls in classes_set: + replacers = [c for c in cls.registered_cls if not hasattr(c, 'op')] + \ + [c for op, c in cls.registered_ops.items() if c] + for replacer_cls in replacers: + if hasattr(replacer_cls, 'id'): + id_cls = getattr(replacer_cls, 'id') + + if id_cls in unique_idxs: + raise Error('Found replacer {} with a non-unique id!'.format(replacer_cls)) + unique_idxs.add(id_cls) + log.debug("All replacers have unique ids.") + + +def get_enabled_and_disabled_transforms(): + """ + :return: tuple of lists with the ids of force-enabled and force-disabled transformations.
+ """ + disabled_transforms = os.environ['MO_DISABLED_TRANSFORMS'] if 'MO_DISABLED_TRANSFORMS' in os.environ else '' + enabled_transforms = os.environ['MO_ENABLED_TRANSFORMS'] if 'MO_ENABLED_TRANSFORMS' in os.environ else '' + + assert isinstance(enabled_transforms, str) + assert isinstance(disabled_transforms, str) + + disabled_transforms = disabled_transforms.split(',') + enabled_transforms = enabled_transforms.split(',') + + return enabled_transforms, disabled_transforms + + class ClassType(Enum): EXTRACTOR = 0 OP = 1 @@ -34,11 +72,20 @@ class ClassType(Enum): BACK_REPLACER = 4 -def _update(cls, registered_list: list, registered_dict: dict, key: str): +def _update(cls, registered_list: list, registered_dict: dict, key: str, enabled_transforms: list, disabled_transforms: list): new_keys = {} # maps a custom name to class new_keys_lower = {} # translates lowered custom name to its original form # print('Registering new subclasses for', cls) + for c in cls.__subclasses__(): + # Force enabling operations + if hasattr(c, 'id') and c.id in enabled_transforms: + setattr(c, 'enabled', True) + + # Force disabling operations + if hasattr(c, 'id') and c.id in disabled_transforms: + setattr(c, 'enabled', False) + if c not in registered_list and (not hasattr(c, 'enabled') or c.enabled): if hasattr(cls, 'excluded_classes') and c in cls.excluded_classes: continue @@ -60,19 +107,19 @@ def _update(cls, registered_list: list, registered_dict: dict, key: str): registered_dict.update(new_keys) -def update_registration(classes: list): +def update_registration(classes: list, enabled_transforms: list, disabled_transforms: list): for cls in classes: - _update(cls, cls.registered_cls, cls.registered_ops, 'op') + _update(cls, cls.registered_cls, cls.registered_ops, 'op', enabled_transforms, disabled_transforms) _registered_classes_dict.setdefault(cls.class_type(), set()).add(cls) -def apply_replacements(graph: nx.MultiDiGraph, replacements_type): +def apply_replacements(graph: Graph, replacements_type): """ Apply all patterns that do not have 'op' first, then apply patterns from registered_ops. If two or more classes replaces the same op (both have op class attribute and values match), such pattern is not applied (while registration it will warn user that we have a conflict). """ - dependency_graph = nx.DiGraph() + dependency_graph = Graph() for class_type, classes_set in _registered_classes_dict.items(): if class_type == replacements_type: for cls in classes_set: @@ -92,7 +139,7 @@ def apply_replacements(graph: nx.MultiDiGraph, replacements_type): dependency_graph.add_edge(cls_before, replacer_cls) try: - replacers_order = nx.topological_sort(dependency_graph) + replacers_order = list(nx.topological_sort(dependency_graph)) except nx.NetworkXUnfeasible as exception: cycles = nx.simple_cycles(dependency_graph) raise Error('There is(are) cyclic dependency(ies) between replacers. 
One of the cycles is the following: {}', @@ -100,6 +147,7 @@ for replacer_cls in replacers_order: replacer = replacer_cls() + replacement_id = 'REPLACEMENT_ID' if hasattr(replacer, 'replacement_id'): replacement_id = replacer.replacement_id @@ -108,11 +156,26 @@ log.info("Skip replacer {} (enabled = False)".format(replacer_cls)) continue + if hasattr(replacer, 'graph_condition') and \ + not all([condition(graph) for condition in replacer.graph_condition]): + log.info("Skip replacer {} (graph_condition not satisfied)".format(replacer_cls)) + continue + log.debug("Run replacer {}".format(replacer_cls)) try: replacer.find_and_replace_pattern(graph) - check_empty_graph(graph, replacer_cls) + + if hasattr(replacer, 'force_clean_up') and replacer.force_clean_up: + for_graph_and_each_sub_graph_recursively( + graph, + graph_clean_up_tf if graph.graph['fw'] == 'tf' else + graph_clean_up_onnx if graph.graph['fw'] == 'onnx' else + graph_clean_up) + + for_graph_and_each_sub_graph_recursively(graph, lambda _: graph.check_empty_graph(replacer_cls)) + for_graph_and_each_sub_graph_recursively(graph, lambda _: graph.check_shapes_consistency()) + except Error as err: raise Error('Exception occurred during running replacer "{}" ({}): {}'.format( replacement_id, diff --git a/model-optimizer/mo/utils/cli_parser.py b/model-optimizer/mo/utils/cli_parser.py index 48558b2..942e5a7 100644 --- a/model-optimizer/mo/utils/cli_parser.py +++ b/model-optimizer/mo/utils/cli_parser.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -141,6 +141,7 @@ def writable_dir(path: str): else: raise Error('The directory "{}" is not writable'.format(cur_path)) + def get_common_cli_parser(parser: argparse.ArgumentParser = None): if not parser: parser = argparse.ArgumentParser() @@ -236,6 +237,9 @@ def get_common_cli_parser(parser: argparse.ArgumentParser = None): common_group.add_argument('--disable_gfusing', help='Turn off fusing of grouped convolutions', action='store_true') + common_group.add_argument('--enable_concat_optimization', + help='Turn on concat optimization', + action='store_true') common_group.add_argument('--move_to_preprocess', help='Move mean values to IR preprocess section', action='store_true') @@ -272,6 +276,10 @@ def get_common_cli_parser(parser: argparse.ArgumentParser = None): ' deployment scenarios. Use it at your own discretion. By default, without this' ' option, the Model Optimizer generates IR V3.', action='store_true') + common_group.add_argument('--keep_shape_ops', + help='[ Experimental feature ] Keep `Shape` operations and all their children in the graph. ' + 'This feature makes the model reshapable in the Inference Engine', + action='store_true', default=False) return parser @@ -311,7 +319,6 @@ def get_caffe_cli_options(): def get_tf_cli_options(): d = { 'input_model_is_text': '- Input model in text protobuf format', - 'offload_unsupported_operations_to_tf': '- Offload unsupported operations', 'tensorflow_subgraph_patterns': '- Patterns to offload', 'tensorflow_operation_patterns': '- Operations to offload', 'tensorflow_custom_operations_config_update': '- Update the configuration file with input/output node names', @@ -435,9 +442,6 @@ def get_tf_cli_parser(parser: argparse.ArgumentParser = None): tf_group.add_argument('--saved_model_tags', type=str, default=None, help="Group of tag(s) of the MetaGraphDef to load, in string format, separated by ','. " "If the tag-set contains multiple tags, all of them must be passed in.") - tf_group.add_argument('--offload_unsupported_operations_to_tf', - help='TensorFlow*: automatically offload unsupported operations to TensorFlow*', - action='store_true') tf_group.add_argument('--tensorflow_subgraph_patterns', help='TensorFlow*: a list of comma separated patterns that will be applied to ' + 'TensorFlow* node names to ' + diff --git a/model-optimizer/mo/utils/cli_parser_test.py b/model-optimizer/mo/utils/cli_parser_test.py index 1646273..ab68f19 100644 --- a/model-optimizer/mo/utils/cli_parser_test.py +++ b/model-optimizer/mo/utils/cli_parser_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/utils/convert.py b/model-optimizer/mo/utils/convert.py index edec06f..48ccbd3 100644 --- a/model-optimizer/mo/utils/convert.py +++ b/model-optimizer/mo/utils/convert.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/utils/custom_replacement_config.py b/model-optimizer/mo/utils/custom_replacement_config.py index 8709e19..63dc551 100644 --- a/model-optimizer/mo/utils/custom_replacement_config.py +++ b/model-optimizer/mo/utils/custom_replacement_config.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ from re import compile, match import networkx as nx -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.utils.error import Error from mo.utils.graph import nodes_matching_name_pattern, sub_graph_between_nodes from mo.utils.utils import refer_to_faq_msg @@ -126,7 +126,7 @@ class CustomReplacementDescriptor(object): return None return [(out['node'], out['port']) for out in self._replacement_desc['outputs']] - def update_custom_replacement_attributes(self, graph: nx.MultiDiGraph): + def update_custom_replacement_attributes(self, graph: Graph): """ The function runs specific functions to update attributes of the custom replacement description. Currently it updates information about input/output nodes.
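Note on the class_registration.py changes above: the new MO_ENABLED_TRANSFORMS / MO_DISABLED_TRANSFORMS environment variables carry comma-separated transform ids, and _update() flips the 'enabled' class attribute accordingly before registration. A minimal sketch of that behaviour, with a hypothetical DummyTransform standing in for a real replacer:

import os

def get_enabled_and_disabled_transforms():
    # A missing variable behaves like an empty list of ids.
    enabled = os.environ.get('MO_ENABLED_TRANSFORMS', '').split(',')
    disabled = os.environ.get('MO_DISABLED_TRANSFORMS', '').split(',')
    return enabled, disabled

class DummyTransform:
    id = 'DummyTransform'
    enabled = False

enabled_ids, disabled_ids = get_enabled_and_disabled_transforms()
if DummyTransform.id in enabled_ids:
    DummyTransform.enabled = True   # force-enable, as _update() does
if DummyTransform.id in disabled_ids:
    DummyTransform.enabled = False  # the disable check runs last, so it wins

Running with MO_DISABLED_TRANSFORMS=DummyTransform therefore suppresses the replacer even if it is enabled by default.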
@@ -179,7 +179,7 @@ class CustomReplacementDescriptorPoints(CustomReplacementDescriptor): def get_outputs_description(self): return [('^' + node_name + '$', 0) for node_name in self.instances['end_points']] - def get_internal_input_nodes(self, graph: nx.MultiDiGraph): + def get_internal_input_nodes(self, graph: Graph): """ Gets list of node names getting input from outside of the sub-graph. This function checks whether input nodes specified in the configuration file should be added to the sub-graph or not. If they should not be added to the @@ -199,7 +199,7 @@ class CustomReplacementDescriptorPoints(CustomReplacementDescriptor): else: return self.instances['start_points'] - def get_internal_output_nodes(self, graph: nx.MultiDiGraph): + def get_internal_output_nodes(self, graph: Graph): """ Gets list of node names producing output outside of the sub-graph. This function checks whether output nodes specified in the configuration file should be added to the sub-graph or not. If they should not be added to the @@ -219,7 +219,7 @@ class CustomReplacementDescriptorPoints(CustomReplacementDescriptor): else: return self.instances['end_points'] - def update_custom_replacement_attributes(self, graph: nx.MultiDiGraph): + def update_custom_replacement_attributes(self, graph: Graph): if not self.has('instances'): raise Error("No instance(s) is(are) defined for the custom replacement '{}'. ".format(self.replacement_id) + refer_to_faq_msg(66)) @@ -278,7 +278,7 @@ class CustomReplacementDescriptorScope(CustomReplacementDescriptor): def __init__(self, replacement_id: str, attrs: dict = None): super().__init__(replacement_id, attrs) - def update_custom_replacement_attributes(self, graph: nx.MultiDiGraph): + def update_custom_replacement_attributes(self, graph: Graph): if not self.has('instances') or len(self.instances) == 0: raise Error("No instances are defined for replacement with id '{}'. ".format(self.replacement_id) + refer_to_faq_msg(68)) @@ -384,35 +384,7 @@ def parse_custom_replacement_config_file(file_name: str): return result -def update_custom_replacement_config_file(graph: nx.MultiDiGraph, file_name: str): - data = parse_custom_replacement_config_file(file_name) - if data is None: - raise Error("Cannot update the file '{}' because it is broken. ".format(file_name) + - refer_to_faq_msg(73)) - - for replacement_desc in data: - replacement_desc.update_custom_replacement_attributes(graph) - - return save_custom_replacement_config_file(data, file_name) - - -def save_custom_replacement_config_file(descriptions: list, file_name: str): - """ - Save custom layer(s) description(s) to the file. - :param file_name: file to save description information to. - :param descriptions: list with instances of the CustomLayerDescriptor classes. - :return: True if operation is successful. 
- """ - try: - json.dump([replacement_desc.get_config_file_representation() for replacement_desc in descriptions], - open(file_name, "w"), indent=4, sort_keys=True) - except Exception as ex: - log.error("failed to update configuration file {}: {}".format(file_name, str(ex))) - return False - return True - - -def generate_pattern_for_node(graph: nx.MultiDiGraph, sub_graph_pattern: str, node_name: str): +def generate_pattern_for_node(graph: Graph, sub_graph_pattern: str, node_name: str): if sub_graph_pattern == '': return node_name node_name_components = node_name.split("/") diff --git a/model-optimizer/mo/utils/dsu.py b/model-optimizer/mo/utils/dsu.py index 849db90..9bde494 100644 --- a/model-optimizer/mo/utils/dsu.py +++ b/model-optimizer/mo/utils/dsu.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/utils/error.py b/model-optimizer/mo/utils/error.py index 4b18866..d7d28e7 100644 --- a/model-optimizer/mo/utils/error.py +++ b/model-optimizer/mo/utils/error.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/utils/find_inputs.py b/model-optimizer/mo/utils/find_inputs.py index 87ab7bb..633859b 100644 --- a/model-optimizer/mo/utils/find_inputs.py +++ b/model-optimizer/mo/utils/find_inputs.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,28 +16,22 @@ import networkx as nx -from mo.graph.graph import NodeWrap +from mo.graph.graph import Node, Graph -def find_nodes_by_type(graph: nx.MultiDiGraph, t_name: str): - nodes = nx.topological_sort(graph) - inputs = [] - for n in nodes: - node = NodeWrap(graph, n) - if node.has('type') and node.type == t_name: - inputs.append(node.id) - return inputs +def find_nodes_by_attribute_value(graph: Graph, attr: str, attr_name: str): + return [id for id, v in nx.get_node_attributes(graph, attr).items() if v == attr_name] -def find_inputs(graph: nx.MultiDiGraph): - return find_nodes_by_type(graph, 'Input') +def find_inputs(graph: Graph): + return find_nodes_by_attribute_value(graph, 'type', 'Input') -def find_outputs(graph): - nodes = nx.topological_sort(graph) +def find_outputs(graph: Graph): outputs = [] - for n in nodes: - node = NodeWrap(graph, n) - if node.has('is_output') and node['is_output']: - outputs.append(node.id) - return outputs + for node_id in find_nodes_by_attribute_value(graph, 'op', 'OpOutput'): + parents = Node(graph, node_id).in_nodes() + assert len(parents) == 1, 'OpOutput node should have exactly one input' + parent = parents[0].id + outputs.append(parent) + return list(set(outputs)) diff --git a/model-optimizer/mo/utils/graph.py b/model-optimizer/mo/utils/graph.py index b651228..cf2d136 100644 --- a/model-optimizer/mo/utils/graph.py +++ b/model-optimizer/mo/utils/graph.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
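Aside on the find_inputs.py rewrite above: instead of walking a topological order and wrapping every node, the new helpers filter nodes by attribute value and then map each OpOutput sink to its single producer. A self-contained sketch of the same idea on a plain networkx graph (node names invented for illustration):

import networkx as nx

def find_nodes_by_attribute_value(graph, attr, value):
    # nx.get_node_attributes returns {node_id: value} only for nodes
    # that actually carry the attribute, so others are skipped for free.
    return [n for n, v in nx.get_node_attributes(graph, attr).items() if v == value]

g = nx.MultiDiGraph()
g.add_node('in0', type='Input')
g.add_node('conv1', type='Convolution')
g.add_node('out0', op='OpOutput')
g.add_edge('conv1', 'out0')

assert find_nodes_by_attribute_value(g, 'type', 'Input') == ['in0']
# find_outputs() then resolves each 'OpOutput' node to its one parent:
assert [u for u, _ in g.in_edges('out0')] == ['conv1']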
@@ -20,7 +20,7 @@ from re import match, compile import logging as log import networkx as nx -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.utils.error import Error from mo.utils.utils import refer_to_faq_msg @@ -52,7 +52,7 @@ def backward_bfs_for_operation(start_node: Node, op_names: list): return [Node(start_node.graph, x) for x in ret] -def bfs_search(graph: nx.MultiDiGraph, start_nodes: list = list()): +def bfs_search(graph: Graph, start_nodes: list = list()): """ Performs breadth-first search over a graph and returns a list of nodes in the BFS order. :param graph: networkx graph to traverse. @@ -77,7 +77,7 @@ def bfs_search(graph: nx.MultiDiGraph, start_nodes: list = list()): return result -def dfs(graph: nx.MultiDiGraph, node_name: str, visited: set): +def dfs(graph: Graph, node_name: str, visited: set): """ Implementation of the depth-first search algorithm starting from the specific node. :param graph: networkx graph to operate on. @@ -103,7 +103,7 @@ def dfs(graph: nx.MultiDiGraph, node_name: str, visited: set): return order -def pseudo_topological_sort(graph: nx.MultiDiGraph, reverse: bool = False): +def pseudo_topological_sort(graph: Graph, reverse: bool = False): """ The function performs topological sort but doesn't check for cycle existence. So it may produce wrong nodes order for some applications. @@ -127,7 +127,7 @@ def pseudo_topological_sort(graph: nx.MultiDiGraph, reverse: bool = False): return list(reversed(order)) -def nodes_matching_name_pattern(graph: nx.MultiDiGraph, pattern: str): +def nodes_matching_name_pattern(graph: Graph, pattern: str): """ Returns list of node names of the graph that match regular expression. :param graph: graph to operate on. @@ -138,7 +138,7 @@ def nodes_matching_name_pattern(graph: nx.MultiDiGraph, pattern: str): return [node_name for node_name in list(graph.nodes()) if match(compiled_pattern, node_name)] -def is_connected_component(graph: nx.MultiDiGraph, node_names: list): +def is_connected_component(graph: Graph, node_names: list): """ Checks that specified list of nodes forms a connected sub-graph. It ignores edges direction. The algorithm is the following. Run BFS from one of the nodes from the node_names list ignoring edges order and @@ -167,7 +167,7 @@ def is_connected_component(graph: nx.MultiDiGraph, node_names: list): return set(node_names).issubset(visited) -def sub_graph_between_nodes(graph: nx.MultiDiGraph, start_nodes: list, end_nodes: list, detect_extra_start_node: callable=None): +def sub_graph_between_nodes(graph: Graph, start_nodes: list, end_nodes: list, detect_extra_start_node: callable=None): """ Finds nodes of the sub-graph between 'start_nodes' and 'end_nodes'. Input nodes for the sub-graph nodes are also added to the sub-graph. Constant inputs of the 'start_nodes' are also added to the sub-graph. @@ -251,7 +251,7 @@ def node_neighbourhood(node_name: str, depth: int, next_node_fn): return list(dist.keys()) -def node_incoming_neighbourhood(graph: nx.MultiDiGraph, node_name: str, depth: int): +def node_incoming_neighbourhood(graph: Graph, node_name: str, depth: int): """ Find input neighbourhood of the node. :param graph: graph to operate on. 
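Most helpers in mo/utils/graph.py change only their annotation from nx.MultiDiGraph to the new Graph subclass; the logic is untouched. For instance, nodes_matching_name_pattern stays a plain regex filter over node names, roughly:

import networkx as nx
from re import compile, match

def nodes_matching_name_pattern(graph, pattern):
    compiled_pattern = compile(pattern)
    return [n for n in list(graph.nodes()) if match(compiled_pattern, n)]

g = nx.MultiDiGraph()
g.add_nodes_from(['ssd/conv1', 'ssd/conv2', 'head/cls'])
assert nodes_matching_name_pattern(g, 'ssd/.*') == ['ssd/conv1', 'ssd/conv2']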
@@ -262,7 +262,7 @@ def node_incoming_neighbourhood(graph: nx.MultiDiGraph, node_name: str, depth: i return node_neighbourhood(node_name, depth, lambda node_name: [u for u, v in graph.in_edges([node_name])]) -def node_outcoming_neighbourhood(graph: nx.MultiDiGraph, node_name: str, depth: int): +def node_outcoming_neighbourhood(graph: Graph, node_name: str, depth: int): """ Find output neighbourhood of the node. :param graph: graph to operate on. @@ -273,7 +273,7 @@ def node_outcoming_neighbourhood(graph: nx.MultiDiGraph, node_name: str, depth: return node_neighbourhood(node_name, depth, lambda node_name: [v for u, v in graph.out_edges([node_name])]) -def scope_output_nodes(graph: nx.MultiDiGraph, scope: str, scope_delimiter: str='/'): +def scope_output_nodes(graph: Graph, scope: str, scope_delimiter: str='/'): """ The function returns nodes producing output of the sub-graph defined by scope (name prefix). The node is considered output of the scope if it is in this scope and its output is outside of the scope. diff --git a/model-optimizer/mo/utils/graph_test.py b/model-optimizer/mo/utils/graph_test.py index 5d4ed57..21bf45d 100644 --- a/model-optimizer/mo/utils/graph_test.py +++ b/model-optimizer/mo/utils/graph_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,11 +20,11 @@ import networkx as nx from mo.utils.error import Error from mo.utils.graph import dfs, bfs_search, is_connected_component, sub_graph_between_nodes - +from mo.graph.graph import Graph class TestGraphUtils(unittest.TestCase): def test_simple_dfs(self): - graph = nx.MultiDiGraph() + graph = Graph() graph.add_nodes_from(list(range(1, 5))) graph.add_edges_from([(1, 2), (1, 3), (3, 4)]) @@ -36,7 +36,7 @@ class TestGraphUtils(unittest.TestCase): """ Check that BFS automatically determines input nodes and starts searching from them. """ - graph = nx.MultiDiGraph() + graph = Graph() graph.add_nodes_from(list(range(1, 6))) graph.add_edges_from([(1, 3), (2, 3), (3, 4), (4, 5)]) @@ -47,7 +47,7 @@ class TestGraphUtils(unittest.TestCase): """ Check that BFS starts from the user-defined nodes and doesn't go in the backward edge direction. """ - graph = nx.MultiDiGraph() + graph = Graph() graph.add_nodes_from(list(range(1, 7))) graph.add_edges_from([(1, 3), (2, 3), (3, 4), (4, 5), (6, 1)]) @@ -58,7 +58,7 @@ class TestGraphUtils(unittest.TestCase): """ Check that if there are two separate sub-graphs the function returns False. """ - graph = nx.MultiDiGraph() + graph = Graph() graph.add_nodes_from(list(range(1, 7))) graph.add_edges_from([(1, 2), (2, 3), (4, 5), (5, 6)]) self.assertFalse(is_connected_component(graph, list(range(1, 7)))) @@ -71,7 +71,7 @@ class TestGraphUtils(unittest.TestCase): Check that if there are two separate sub-graphs connected by an edge going through the ignored node, the function returns False. """ - graph = nx.MultiDiGraph() + graph = Graph() node_names = list(range(1, 8)) graph.add_nodes_from(node_names) graph.add_edges_from([(1, 2), (2, 3), (4, 5), (5, 6), (1, 7), (7, 4)]) @@ -81,7 +81,7 @@ class TestGraphUtils(unittest.TestCase): """ Check that the sub-graph is connected. 
""" - graph = nx.MultiDiGraph() + graph = Graph() node_names = list(range(1, 8)) graph.add_nodes_from(node_names) graph.add_edges_from([(1, 2), (2, 3), (4, 5), (5, 6), (1, 7), (7, 4)]) @@ -91,7 +91,7 @@ class TestGraphUtils(unittest.TestCase): """ Check that edges direction is ignored when checking for the connectivity. """ - graph = nx.MultiDiGraph() + graph = Graph() node_names = list(range(1, 5)) graph.add_nodes_from(node_names) graph.add_edges_from([(2, 1), (2, 3), (4, 3)]) @@ -104,7 +104,7 @@ class TestGraphUtils(unittest.TestCase): Check that edges direction is ignored when checking for the connectivity. In this case the graph is not connected. """ - graph = nx.MultiDiGraph() + graph = Graph() graph.add_nodes_from(list(range(1, 5))) graph.add_edges_from([(2, 1), (2, 3), (4, 3)]) self.assertFalse(is_connected_component(graph, [1, 2, 4])) @@ -121,7 +121,7 @@ class TestGraphUtils(unittest.TestCase): 1 -> 2 -> 3 -> 4 :return: """ - graph = nx.MultiDiGraph() + graph = Graph() graph.add_nodes_from(list(range(1, 7))) graph.add_edges_from([(1, 2), (2, 3), (3, 4), (5, 2), (6, 5)]) sub_graph_nodes = sub_graph_between_nodes(graph, [1], [4]) @@ -140,7 +140,7 @@ class TestGraphUtils(unittest.TestCase): \ 1 -> 2 -> 3 -> 4 """ - graph = nx.MultiDiGraph() + graph = Graph() graph.add_nodes_from(list(range(1, 6))) graph.add_edges_from([(1, 2), (2, 3), (3, 4), (5, 2)]) sub_graph_nodes = sub_graph_between_nodes(graph, [2], [4]) @@ -154,7 +154,7 @@ class TestGraphUtils(unittest.TestCase): \ 1 -> 2 -> 3 -> 4 """ - graph = nx.MultiDiGraph() + graph = Graph() graph.add_nodes_from(list(range(1, 6))) graph.node[5]['op'] = 'Placeholder' graph.add_edges_from([(1, 2), (2, 3), (3, 4), (5, 2)]) @@ -168,7 +168,7 @@ class TestGraphUtils(unittest.TestCase): \ 1 -> 2 -> 3 -> 4 """ - graph = nx.MultiDiGraph() + graph = Graph() graph.add_nodes_from(list(range(1, 6))) graph.node[5]['op'] = 'Placeholder' graph.add_edges_from([(1, 2), (2, 3), (3, 4), (5, 2)]) @@ -183,7 +183,7 @@ class TestGraphUtils(unittest.TestCase): \ 1 -> 2 -> 3 -> 4 """ - graph = nx.MultiDiGraph() + graph = Graph() graph.add_nodes_from(list(range(1, 6))) graph.add_edges_from([(1, 2), (2, 3), (3, 4), (5, 2)]) sub_graph_nodes = sub_graph_between_nodes(graph, [2, 5], [4]) @@ -199,7 +199,7 @@ class TestGraphUtils(unittest.TestCase): / \ 9 -> -> 7 -> 8 """ - graph = nx.MultiDiGraph() + graph = Graph() node_names = list(range(1, 10)) graph.add_nodes_from(node_names) graph.add_edges_from([(1, 2), (2, 3), (3, 4), (2, 5), (5, 6), (5, 7), (7, 8), (9, 5)]) diff --git a/model-optimizer/mo/utils/guess_framework.py b/model-optimizer/mo/utils/guess_framework.py index 1149c71..c19d34d 100644 --- a/model-optimizer/mo/utils/guess_framework.py +++ b/model-optimizer/mo/utils/guess_framework.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -22,6 +22,8 @@ def guess_framework_by_ext(input_model_path: str) -> int: return 'caffe' elif re.match('^.*\.pb$', input_model_path): return 'tf' + elif re.match('^.*\.pbtxt$', input_model_path): + return 'tf' elif re.match('^.*\.params$', input_model_path): return 'mxnet' elif re.match('^.*\.nnet$', input_model_path): diff --git a/model-optimizer/mo/utils/import_extensions.py b/model-optimizer/mo/utils/import_extensions.py index 317bef6..0ed0ce6 100644 --- a/model-optimizer/mo/utils/import_extensions.py +++ b/model-optimizer/mo/utils/import_extensions.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,13 +20,23 @@ import os import pkgutil import sys +from mo.back.replacement import BackReplacementPattern +from mo.middle.replacement import MiddleReplacementPattern +from mo.ops.op import Op +from mo.utils.class_registration import _check_unique_ids, update_registration, get_enabled_and_disabled_transforms + def import_by_path(path: str, middle_names: list = ()): for module_loader, name, ispkg in pkgutil.iter_modules([path]): importlib.import_module('{}.{}'.format('.'.join(middle_names), name)) -def load_dir(framework: str, path: str, update_registration: callable): +def default_path(): + EXT_DIR_NAME = 'extensions' + return os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, EXT_DIR_NAME)) + + +def load_dir(framework: str, path: str, get_front_classes: callable): """ Assuming the following sub-directory structure for path: @@ -57,27 +67,36 @@ def load_dir(framework: str, path: str, update_registration: callable): log.info("Importing extensions from: {}".format(path)) root_dir, ext = os.path.split(path) sys.path.insert(0, root_dir) - internal_dirs = [['ops', ], ['front', ], ['front', framework], ['middle', ], ['back', ]] + + enabled_transforms, disabled_transforms = get_enabled_and_disabled_transforms() + + front_classes = get_front_classes() + internal_dirs = { + ('ops', ): [Op], + ('front', ): front_classes, + ('front', framework): front_classes, + ('middle', ): [MiddleReplacementPattern], + ('back', ): [BackReplacementPattern]} + if ext == 'mo': - internal_dirs.append(['front', framework, 'extractors']) - for p in internal_dirs: + internal_dirs[('front', framework, 'extractors')] = front_classes + + for p in internal_dirs.keys(): import_by_path(os.path.join(path, *p), [ext, *p]) - update_registration() + update_registration(internal_dirs[p], enabled_transforms, disabled_transforms) sys.path.remove(root_dir) -def default_path(): - EXT_DIR_NAME = 'extensions' - return os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, EXT_DIR_NAME)) - - -def load_dirs(framework: str, dirs: list, update_registration: callable): +def load_dirs(framework: str, dirs: list, get_front_classes: callable): if dirs is None: return + mo_inner_extensions = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, 'mo')) dirs.insert(0, mo_inner_extensions) dirs = [os.path.abspath(e) for e in dirs] if default_path() not in dirs: dirs.insert(0, default_path()) for path in dirs: - load_dir(framework, path, update_registration) + load_dir(framework, path, get_front_classes) + + _check_unique_ids() diff --git a/model-optimizer/mo/utils/logger.py b/model-optimizer/mo/utils/logger.py index 26b7c2f..51bc390 100644 --- a/model-optimizer/mo/utils/logger.py +++ 
b/model-optimizer/mo/utils/logger.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/utils/pipeline_config.py b/model-optimizer/mo/utils/pipeline_config.py index 901bf45..5352db3 100644 --- a/model-optimizer/mo/utils/pipeline_config.py +++ b/model-optimizer/mo/utils/pipeline_config.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -57,6 +57,10 @@ mapping_rules = [ ('first_stage_nms_score_threshold', '.*_nms_score_threshold'), ('first_stage_nms_iou_threshold', '.*_nms_iou_threshold'), ('first_stage_max_proposals', '.*_max_proposals'), + ('num_spatial_bins_height', '.*/rfcn_box_predictor/num_spatial_bins_height'), + ('num_spatial_bins_width', '.*/rfcn_box_predictor/num_spatial_bins_width'), + ('crop_height', '.*/rfcn_box_predictor/crop_height'), + ('crop_width', '.*/rfcn_box_predictor/crop_width'), 'initial_crop_size', # Detection Output layer attributes ('postprocessing_score_converter', '.*/score_converter'), diff --git a/model-optimizer/mo/utils/pipeline_config_test.py b/model-optimizer/mo/utils/pipeline_config_test.py index 596a714..6c8e19b 100644 --- a/model-optimizer/mo/utils/pipeline_config_test.py +++ b/model-optimizer/mo/utils/pipeline_config_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/utils/replacement_pattern.py b/model-optimizer/mo/utils/replacement_pattern.py index d77f7ce..4aa0a18 100644 --- a/model-optimizer/mo/utils/replacement_pattern.py +++ b/model-optimizer/mo/utils/replacement_pattern.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ import networkx as nx +from mo.graph.graph import Graph from mo.middle.pattern_match import apply_pattern @@ -24,7 +25,7 @@ class ReplacementPattern(object): # All intermediate infrastructure classes should be here excluded_replacers = [] - def find_and_replace_pattern(self, graph: nx.MultiDiGraph): + def find_and_replace_pattern(self, graph: Graph): apply_pattern(graph, **self.pattern(), action=self.replace_pattern) # pylint: disable=no-member def run_before(self): diff --git a/model-optimizer/mo/utils/simple_proto_parser.py b/model-optimizer/mo/utils/simple_proto_parser.py index cfdbf28..4975dcd 100644 --- a/model-optimizer/mo/utils/simple_proto_parser.py +++ b/model-optimizer/mo/utils/simple_proto_parser.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
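Aside on the pipeline_config.py additions above: every entry of mapping_rules pairs a Model Optimizer parameter name with a regex over flattened pipeline.config paths, and a bare string means the name doubles as its own pattern. A sketch of how such a rule table could be evaluated (apply_rules and the sample config are illustrative, not MO code):

import re

mapping_rules = [
    ('crop_height', r'.*/rfcn_box_predictor/crop_height'),
    'initial_crop_size',
]

def apply_rules(flattened_config: dict, rules: list) -> dict:
    result = {}
    for rule in rules:
        # A plain string rule is shorthand for (name, name).
        name, pattern = rule if isinstance(rule, tuple) else (rule, rule)
        for path, value in flattened_config.items():
            if re.match(pattern, path):
                result[name] = value
    return result

config = {'model/rfcn_box_predictor/crop_height': 14, 'initial_crop_size': 17}
assert apply_rules(config, mapping_rules) == {'crop_height': 14, 'initial_crop_size': 17}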
diff --git a/model-optimizer/mo/utils/simple_proto_parser_test.py b/model-optimizer/mo/utils/simple_proto_parser_test.py index 2f601ce..1b1af16 100644 --- a/model-optimizer/mo/utils/simple_proto_parser_test.py +++ b/model-optimizer/mo/utils/simple_proto_parser_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/utils/str_to.py b/model-optimizer/mo/utils/str_to.py index c27a581..9c5a15a 100644 --- a/model-optimizer/mo/utils/str_to.py +++ b/model-optimizer/mo/utils/str_to.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/utils/summarize_graph.py b/model-optimizer/mo/utils/summarize_graph.py index 8d69718..fbb7906 100644 --- a/model-optimizer/mo/utils/summarize_graph.py +++ b/model-optimizer/mo/utils/summarize_graph.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/utils/summarize_graph_test.py b/model-optimizer/mo/utils/summarize_graph_test.py index fbed0eb..41a897a 100644 --- a/model-optimizer/mo/utils/summarize_graph_test.py +++ b/model-optimizer/mo/utils/summarize_graph_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/utils/tensorboard.py b/model-optimizer/mo/utils/tensorboard.py index 9ca78ec..98ff1c7 100644 --- a/model-optimizer/mo/utils/tensorboard.py +++ b/model-optimizer/mo/utils/tensorboard.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/utils/unittest/extractors.py b/model-optimizer/mo/utils/unittest/extractors.py index e58534c..68e251f 100644 --- a/model-optimizer/mo/utils/unittest/extractors.py +++ b/model-optimizer/mo/utils/unittest/extractors.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/utils/unittest/graph.py b/model-optimizer/mo/utils/unittest/graph.py index 64a0f30..2c36d61 100644 --- a/model-optimizer/mo/utils/unittest/graph.py +++ b/model-optimizer/mo/utils/unittest/graph.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
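The `replacement_pattern.py` hunk above narrows `find_and_replace_pattern` from a bare `nx.MultiDiGraph` to Model Optimizer's own `Graph` class, which the rest of this patch threads through the codebase. Here is a hedged sketch of the contract that method drives: `apply_pattern` matches `pattern()` against the graph and invokes `replace_pattern` per match. The `DropIdentity` class, its node name, and its attributes are illustrative only, not taken from the patch:

```python
from mo.graph.graph import Graph
from mo.utils.replacement_pattern import ReplacementPattern


class DropIdentity(ReplacementPattern):
    def pattern(self):
        # Sub-graph to search for: one node whose 'op' attribute is
        # 'Identity'; no edges are constrained.
        return dict(
            nodes=[('identity', dict(op='Identity'))],
            edges=[],
        )

    def replace_pattern(self, graph: Graph, match: dict):
        # 'match' maps pattern node names to the matched Node objects.
        node = match['identity']
        graph.remove_node(node.id)  # simplified; a real pass re-wires edges
```

`find_and_replace_pattern(graph)` then runs the match-and-replace loop without the subclass having to call `apply_pattern` directly.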
@@ -16,11 +16,12 @@ from collections import deque from copy import deepcopy +from numbers import Number import networkx as nx import numpy as np -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph from mo.middle.pattern_match import all_edges_in_nodes from mo.utils.error import Error @@ -51,7 +52,7 @@ def build_graph_with_attrs(nodes_with_attrs: list, edges_with_attrs: list, new_n update_nodes_attributes: dict = None, nodes_with_edges_only: bool = False, add_nodes_from_edges: bool = False): """ - Build the nx.MultiDiGraph with specific nodes and edges. Also update of edge and node parameters is supported. + Build the Graph with specific nodes and edges. Also update of edge and node parameters is supported. :param nodes_with_attrs: list of tuples ('node_name', {node_attrs}) :param edges_with_attrs: list of tuples like (start node, end node, (optional) {attrs of the edge}). :param new_nodes_with_attrs: analogically nodes_with_attrs @@ -78,7 +79,7 @@ def build_graph_with_attrs(nodes_with_attrs: list, edges_with_attrs: list, new_n if not add_nodes_from_edges and not all_edges_in_nodes(nodes=all_nodes_names, edges=all_edges): raise Error("Some nodes from list of edges is not in nodes. Please, add all necessary nodes.") - graph = nx.MultiDiGraph() + graph = Graph() # Create dict for nodes with attrs nodes_attrs = {} @@ -129,7 +130,7 @@ def build_graph_with_attrs(nodes_with_attrs: list, edges_with_attrs: list, new_n def build_graph(nodes_attrs: dict, edges: list, update_attributes: dict = None, nodes_with_edges_only: bool = False): """ - Build the nx.MultiDiGraph with specific nodes and edges. + Build the Graph with specific nodes and edges. :param nodes_attrs: dictionary where key is the node name and the value is the dictionary with node attributes. :param edges: list of pairs with start and end node names of the edge. :param update_attributes: optional dictionary which specifies nodes names and their attributes to be updated. The @@ -137,7 +138,7 @@ def build_graph(nodes_attrs: dict, edges: list, update_attributes: dict = None, :param nodes_with_edges_only: add nodes which has at least one incoming or outcoming edge. :return: generated graph. """ - graph = nx.MultiDiGraph() + graph = Graph() for node_name, attrs in nodes_attrs.items(): if 'name' not in attrs: @@ -180,19 +181,30 @@ def build_graph(nodes_attrs: dict, edges: list, update_attributes: dict = None, for attr, value in new_attrs.items(): graph.node[node_name][attr] = value + for node in graph.get_op_nodes(): + # Add in_ports attribute + in_edges = node.in_edges() + for i in range(len(in_edges)): + node.add_input_port(idx=i) + + # Add out_ports attribute + out_edges = node.out_edges() + for i in range(len(out_edges)): + node.add_output_port(idx=i) + return graph def build_graph_with_edge_attrs(nodes_attrs: dict, edges: list, update_attributes: dict = None): """ - Build the nx.MultiDiGraph with specific nodes and edges. + Build the Graph with specific nodes and edges. :param nodes_attrs: dictionary where key is the node name and the value is the dictionary with node attributes. :param edges: list of pairs with start and end node names of the edge. :param update_attributes: optional dictionary which specifies nodes names and their attributes to be updated. The key is a node name to update attribute and the value is a dictionary with attribute name and its value. :return: generated graph. 
""" - graph = nx.MultiDiGraph() + graph = Graph() for node_1, node_2, attr in edges: if node_1 not in graph.nodes(): graph.add_node(node_1, **deepcopy(nodes_attrs[node_1])) @@ -207,7 +219,7 @@ def build_graph_with_edge_attrs(nodes_attrs: dict, edges: list, update_attribute return graph -def compare_graphs(graph: nx.MultiDiGraph, graph_ref: nx.MultiDiGraph, last_node: str, last_node_ref=None, +def compare_graphs(graph: Graph, graph_ref: Graph, last_node: str, last_node_ref=None, check_op_attrs=False): if last_node_ref is None: last_node_ref = last_node @@ -249,7 +261,7 @@ def compare_graphs(graph: nx.MultiDiGraph, graph_ref: nx.MultiDiGraph, last_node # Check that nodes has same operation if check_op_attrs: for attr in graph_ref.node[node_ref.id]: - if graph_ref.node[node_ref.id][attr] is None or attr in ['name', 'id']: + if graph_ref.node[node_ref.id][attr] is None or attr in ['name', 'id', '_in_ports', '_out_ports', 'infer', 'IE']: continue if attr not in graph.node[node.id]: return False, 'Node {} has missing attribute {}'.format(node.id, attr) @@ -259,11 +271,16 @@ def compare_graphs(graph: nx.MultiDiGraph, graph_ref: nx.MultiDiGraph, last_node return False, '{} and {} has different attr {} : {} and {}'.format( node.id, node_ref.id, attr, graph.node[node.id][attr], graph_ref.node[node_ref.id][attr]) - else: - if graph.node[node.id][attr] != graph_ref.node[node_ref.id][attr]: + elif isinstance(graph.node[node.id][attr], Number): + if abs(graph.node[node.id][attr] - graph_ref.node[node_ref.id][attr]) > 1e-4: return False, '{} and {} has different attr {} : {} and {}'.format( node.id, node_ref.id, attr, graph.node[node.id][attr], graph_ref.node[node_ref.id][attr]) + elif graph.node[node.id][attr] != graph_ref.node[node_ref.id][attr]: + return False, '{} and {} has different attr {} : {} and {}'.format( + node.id, node_ref.id, attr, graph.node[node.id][attr], + graph_ref.node[node_ref.id][attr]) + else: if node_ref.has_valid('shape') and not node.has_valid('shape'): return False, '{} has None shape'.format(node.id) diff --git a/model-optimizer/mo/utils/unsupported_ops.py b/model-optimizer/mo/utils/unsupported_ops.py index e5187e3..09cdb04 100644 --- a/model-optimizer/mo/utils/unsupported_ops.py +++ b/model-optimizer/mo/utils/unsupported_ops.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,11 +18,11 @@ import collections import networkx as nx -from mo.graph.graph import Node +from mo.graph.graph import Node, Graph class UnsupportedOps(object): - def __init__(self, graph: nx.Graph): + def __init__(self, graph: Graph): self.graph = graph # map op to a list of node names self.unsupported = collections.defaultdict(list) diff --git a/model-optimizer/mo/utils/utils.py b/model-optimizer/mo/utils/utils.py index c4f089c..4c1871f 100644 --- a/model-optimizer/mo/utils/utils.py +++ b/model-optimizer/mo/utils/utils.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,8 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
""" - - +import functools +import warnings +import logging as log import numpy as np @@ -45,3 +46,34 @@ def symm_match_shapes(shape1: np.array, shape2: np.array): # Elements with values -1 and 0 in both shapes are just ignored. # Other elements should match. Undefined elements can be one side only. return match_shapes(shape1, shape2) or match_shapes(shape2, shape1) + + +def deprecated_api(class_name=None): + def deprecated(func): + @functools.wraps(func) + def deprecation_message(*args, **kwargs): + warnings.simplefilter('always', DeprecationWarning) # turn on filter + dep_msg = "Call to deprecated function {}. ".format(func.__name__) + if class_name is not None: + dep_msg += "Please use {}.{} method".format(class_name.__name__, func.__name__) + warnings.warn(dep_msg, DeprecationWarning, stacklevel=2) + warnings.simplefilter('default', DeprecationWarning) # reset filter + return func(*args, **kwargs) + + return deprecation_message + + return deprecated + + +def array_to_str(node, attr): + if not node.has_valid(attr): + return None + else: + return ','.join(map(str, node[attr])) + + +def shrink_str_value(value: np.array, max_symbols=100): + value = str(value) + if len(value) > max_symbols: + value = value.strip('\n')[:max_symbols - 3] + '...' + return value diff --git a/model-optimizer/mo/utils/utils_test.py b/model-optimizer/mo/utils/utils_test.py index 7ebae7f..368dc31 100644 --- a/model-optimizer/mo/utils/utils_test.py +++ b/model-optimizer/mo/utils/utils_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/utils/version.py b/model-optimizer/mo/utils/version.py index 8c512fc..30d1646 100644 --- a/model-optimizer/mo/utils/version.py +++ b/model-optimizer/mo/utils/version.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/utils/version_test.py b/model-optimizer/mo/utils/version_test.py index 909e742..8f40375 100644 --- a/model-optimizer/mo/utils/version_test.py +++ b/model-optimizer/mo/utils/version_test.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo/utils/versions_checker.py b/model-optimizer/mo/utils/versions_checker.py index 09f3105..b9ff081 100644 --- a/model-optimizer/mo/utils/versions_checker.py +++ b/model-optimizer/mo/utils/versions_checker.py @@ -1,5 +1,5 @@ """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,13 +14,17 @@ limitations under the License. 
""" + import logging as log import os import re import sys from distutils.version import LooseVersion -modules = {"protobuf": "google.protobuf"} +modules = { + "protobuf": "google.protobuf", + "test-generator": "generator", +} critical_modules = ["networkx"] message = "\nDetected not satisfied dependencies:\n" \ @@ -100,7 +104,19 @@ def version_check(name, installed_v, required_v, sign, not_satisfied_v, exit_cod """ if sign is not None: req_ver = LooseVersion(required_v) - satisfied = eval('installed_v{}req_ver'.format(sign)) + satisfied = False + if sign == '>': + satisfied = installed_v > req_ver + elif sign == '>=': + satisfied = installed_v >= req_ver + elif sign == '<=': + satisfied = installed_v <= req_ver + elif sign == '<': + satisfied = installed_v < req_ver + elif sign == '==': + satisfied = installed_v == req_ver + else: + log.error("Error during version comparison") else: satisfied = True if not satisfied: @@ -110,7 +126,7 @@ def version_check(name, installed_v, required_v, sign, not_satisfied_v, exit_cod return exit_code -def check_requirements(framework = None): +def check_requirements(framework=None): """ Please do not add parameter type annotations (param:type). Because we import this file while checking Python version. @@ -133,10 +149,11 @@ def check_requirements(framework = None): exit_code = 0 for name, key, required_version in requirements_list: try: - exec("import {}".format(modules[name] if name in modules else name)) - installed_version = eval("{}.__version__".format(modules[name] if name in modules else name)) + importable_name = modules.get(name, name) + exec("import {}".format(importable_name)) + installed_version = sys.modules[importable_name].__version__ exit_code = version_check(name, installed_version, required_version, key, not_satisfied_versions, exit_code) - exec("del {}".format(modules[name] if name in modules else name)) + exec("del {}".format(importable_name)) except (AttributeError, ImportError): not_satisfied_versions.append((name, 'not installed', 'required: {}'.format(required_version))) exit_code = 1 diff --git a/model-optimizer/mo_caffe.py b/model-optimizer/mo_caffe.py index d16e457..36b01f1 100755 --- a/model-optimizer/mo_caffe.py +++ b/model-optimizer/mo_caffe.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo_kaldi.py b/model-optimizer/mo_kaldi.py index 1d64d7d..b3ff3a6 100755 --- a/model-optimizer/mo_kaldi.py +++ b/model-optimizer/mo_kaldi.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo_mxnet.py b/model-optimizer/mo_mxnet.py index 5338db9..3495941 100755 --- a/model-optimizer/mo_mxnet.py +++ b/model-optimizer/mo_mxnet.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/model-optimizer/mo_onnx.py b/model-optimizer/mo_onnx.py index 87f9c7d..1fa724d 100755 --- a/model-optimizer/mo_onnx.py +++ b/model-optimizer/mo_onnx.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/mo_tf.py b/model-optimizer/mo_tf.py index 954d09d..4763a2a 100755 --- a/model-optimizer/mo_tf.py +++ b/model-optimizer/mo_tf.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ - Copyright (c) 2018 Intel Corporation + Copyright (c) 2018-2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/model-optimizer/requirements.txt b/model-optimizer/requirements.txt index 7583c33..8ee99c8 100644 --- a/model-optimizer/requirements.txt +++ b/model-optimizer/requirements.txt @@ -4,3 +4,5 @@ networkx>=1.11 numpy>=1.12.0 protobuf==3.6.1 onnx>=1.1.2 +test-generator==0.1.1 +defusedxml>=0.5.0 diff --git a/model-optimizer/requirements_caffe.txt b/model-optimizer/requirements_caffe.txt index 2acb120..eb74892 100644 --- a/model-optimizer/requirements_caffe.txt +++ b/model-optimizer/requirements_caffe.txt @@ -1,3 +1,5 @@ networkx>=1.11 numpy>=1.12.0 protobuf==3.6.1 +test-generator==0.1.1 +defusedxml>=0.5.0 \ No newline at end of file diff --git a/model-optimizer/requirements_kaldi.txt b/model-optimizer/requirements_kaldi.txt index 74772f3..24caaf4 100644 --- a/model-optimizer/requirements_kaldi.txt +++ b/model-optimizer/requirements_kaldi.txt @@ -1,2 +1,4 @@ networkx>=1.11 numpy==1.13.0 +test-generator==0.1.1 +defusedxml>=0.5.0 diff --git a/model-optimizer/requirements_mxnet.txt b/model-optimizer/requirements_mxnet.txt index ae4ec3c..1e2f557 100644 --- a/model-optimizer/requirements_mxnet.txt +++ b/model-optimizer/requirements_mxnet.txt @@ -1,3 +1,5 @@ mxnet>=1.0.0,<=1.3.1 networkx>=1.11 numpy>=1.12.0 +test-generator==0.1.1 +defusedxml>=0.5.0 \ No newline at end of file diff --git a/model-optimizer/requirements_onnx.txt b/model-optimizer/requirements_onnx.txt index 05e8d70..e196da4 100644 --- a/model-optimizer/requirements_onnx.txt +++ b/model-optimizer/requirements_onnx.txt @@ -1,3 +1,5 @@ onnx>=1.1.2 networkx>=1.11 numpy>=1.12.0 +test-generator==0.1.1 +defusedxml>=0.5.0 \ No newline at end of file diff --git a/model-optimizer/requirements_tf.txt b/model-optimizer/requirements_tf.txt index 2ee5784..3864030 100644 --- a/model-optimizer/requirements_tf.txt +++ b/model-optimizer/requirements_tf.txt @@ -1,3 +1,5 @@ tensorflow>=1.2.0 networkx>=1.11 numpy>=1.12.0 +test-generator==0.1.1 +defusedxml>=0.5.0 \ No newline at end of file diff --git a/model-optimizer/tf_call_ie_layer/build.sh b/model-optimizer/tf_call_ie_layer/build.sh index 3188767..6518c31 100644 --- a/model-optimizer/tf_call_ie_layer/build.sh +++ b/model-optimizer/tf_call_ie_layer/build.sh @@ -59,9 +59,9 @@ else fi set -e # exit if something goes wrong -if [ "x$INTEL_CVSDK_DIR" = "x" ]; then - echo "ERROR: INTEL_CVSDK_DIR environment variable is not set" - echo "Please, run the 'source /bin/setupvars.sh'" +if [ "x$INTEL_OPENVINO_DIR" = "x" ]; then + echo "ERROR: INTEL_OPENVINO_DIR environment variable is not set" + echo "Please, run the 'source /bin/setupvars.sh'" exit 1 fi @@ -71,7 +71,7 @@ if [ "x$TF_ROOT_DIR" == 'x' ]; then exit 1 fi -IE_HEADERS_SRC_DIR=$INTEL_CVSDK_DIR/inference_engine/include 
+IE_HEADERS_SRC_DIR=$INTEL_OPENVINO_DIR/inference_engine/include if [ ! -e $IE_HEADERS_SRC_DIR ]; then echo "ERROR: Inference Engine headers files '$IE_HEADERS_SRC_DIR' doesn't exist" exit 1 diff --git a/model-optimizer/version.txt b/model-optimizer/version.txt deleted file mode 100644 index c700465..0000000 --- a/model-optimizer/version.txt +++ /dev/null @@ -1,3 +0,0 @@ -06:46PM December 13, 2018 -1.5.12.49d067a0 -49d067a07dedf8e95920e9649e890a76451ca648 diff --git a/tools/README.md b/tools/README.md new file mode 100644 index 0000000..e618415 --- /dev/null +++ b/tools/README.md @@ -0,0 +1,69 @@ +# OpenVINO™ Python* openvino.tools package + +## General +`openvino.tools` package includes: +* openvino.tools.accuracy_checker +* openvino.tools.benchmark +* openvino.tools.calibration +* openvino.tools.utils + +Please refer to https://docs.openvinotoolkit.org for details. + +## Installation +Choose the necessary Python\* version and define the `PYTHONPATH` environment variable. + +### Prerequisites + +Install prerequisites first: + +#### 1. Python + +**openvino.tools** is a **Python 3** library. Install it first: + +- [Python3][python3] +- [setuptools][setuptools] + +```bash +sudo apt-get install python3 python3-dev python3-setuptools python3-pip +``` + +Python setuptools and the Python package manager (pip) install packages into the system directory by default. There are several options: + +- work inside a [virtual environment][virtualenv] (best solution). +- use the `--user` option for all `pip` commands. +- install all dependencies with *sudo* permissions. + +In order to use a virtual environment, you should install it first: + +```bash +python3 -m pip install virtualenv +python3 -m virtualenv -p `which python3` +``` + +Before starting to work inside the virtual environment, it should be activated: + +```bash +source /bin/activate +``` + +The virtual environment can be deactivated using the command + +```bash +deactivate +``` + +#### 2. Install package prerequisites + +The next step is installing the package prerequisites. + +```bash +python3 -m pip install -r accuracy_checker/requirements.txt benchmark/requirements.txt calibration/requirements.txt +``` + +### Configuration + +Each subpackage has its own configuration. Please refer to the specific subpackage documentation for details. + +[python3]: https://www.python.org/downloads/ +[setuptools]: https://pypi.python.org/pypi/setuptools +[virtualenv]: https://virtualenv.pypa.io/en/stable + diff --git a/tools/__init__.py b/tools/__init__.py new file mode 100644 index 0000000..e8cc80e --- /dev/null +++ b/tools/__init__.py @@ -0,0 +1,17 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+""" + +__version__ = "0.0.1" diff --git a/tools/accuracy_checker/.pylintrc b/tools/accuracy_checker/.pylintrc new file mode 100644 index 0000000..7c903ac --- /dev/null +++ b/tools/accuracy_checker/.pylintrc @@ -0,0 +1,31 @@ +[MASTER] +disable = C0103, + C0111, + too-many-locals, + too-many-arguments, + unused-argument, + too-many-instance-attributes, + too-few-public-methods, + unsubscriptable-object, + unbalanced-tuple-unpacking, + arguments-differ, + E1101, + E1111, + C0204, + W0201, + W0107, + R0401 + +max-line-length = 120 +ignore-docstrings = yes +extension-pkg-whitelist=inference_engine,cv2,numpy +ignored-modules = numpy,cv2,openvino.inference_engine,caffe +load-plugins = pylint_checkers +ignored-classes = pathlib.PurePath +jobs=0 + +[SIMILARITIES] +ignore-imports = yes + +[BASIC] +bad-functions=print,as_posix,absolute diff --git a/tools/accuracy_checker/README.md b/tools/accuracy_checker/README.md new file mode 100644 index 0000000..ceee153 --- /dev/null +++ b/tools/accuracy_checker/README.md @@ -0,0 +1,60 @@ +# Deep Learning accuracy validation framework + +## Installation + +### Prerequisites + +Install prerequisites first: + +#### 1. Python + +**accuracy checker** uses **Python 3**. Install it first: + +- [Python3][python3], [setuptools][setuptools]: + +```bash +sudo apt-get install python3 python3-dev python3-setuptools python3-pip +``` + +Python setuptools and python package manager (pip) install packages into system directory by default. Installation of accuracy checker tested only via [virtual environment][virtualenv]. + +In order to use virtual environment you should install it first: + +```bash +python3 -m pip install virtualenv +python3 -m virtualenv -p `which python3` +``` + +Before starting to work inside virtual environment, it should be activated: + +```bash +source /bin/activate +``` + +Virtual environment can be deactivated using command + +```bash +deactivate +``` + +#### 2. Frameworks + +The next step is installing backend frameworks for Accuracy Checker. + +In order to evaluate some models required frameworks have to be installed. Accuracy-Checker supports these frameworks: + +- [OpenVINO][openvino-get-started]. +- [Caffe][caffe-get-started]. + +You can use any of them or several at a time. + +#### 3. 
Requirements installation +```bash +pip3 install -r requirements.txt +``` + +[python3]: https://www.python.org/downloads/ +[setuptools]: https://pypi.python.org/pypi/setuptools +[caffe-get-started]: accuracy_checker/launcher/caffe_installation_readme.md +[virtual-environment]: https://docs.python.org/3/tutorial/venv.html +[virtualenv]: https://virtualenv.pypa.io/en/stable +[openvino-get-started]: https://software.intel.com/en-us/openvino-toolkit/documentation/get-started \ No newline at end of file diff --git a/tools/accuracy_checker/__init__.py b/tools/accuracy_checker/__init__.py new file mode 100644 index 0000000..e4f37bf --- /dev/null +++ b/tools/accuracy_checker/__init__.py @@ -0,0 +1,39 @@ +from .accuracy_checker import ( + annotation_converters, + adapters, + config, + data_readers, + launcher, + metrics, + postprocessor, + preprocessor, + representation, + dataset, + dependency, + logging, + main, + model_evaluator, + presenters, + progress_reporters, + utils +) + +__all__ = [ + 'annotation_converters', + 'adapters', + 'config', + 'data_readers', + 'launcher', + 'metrics', + 'postprocessor', + 'preprocessor', + 'representation', + 'dataset', + 'dependency', + 'logging', + 'main', + 'model_evaluator', + 'presenters', + 'progress_reporters', + 'utils' +] diff --git a/tools/accuracy_checker/accuracy_checker/__init__.py b/tools/accuracy_checker/accuracy_checker/__init__.py new file mode 100644 index 0000000..ede2917 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/__init__.py @@ -0,0 +1,17 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +__version__ = "0.6.8" diff --git a/tools/accuracy_checker/accuracy_checker/adapters/README.md b/tools/accuracy_checker/accuracy_checker/adapters/README.md new file mode 100644 index 0000000..40cec31 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/adapters/README.md @@ -0,0 +1,73 @@ +# Adapters + +An adapter is a function for converting network inference output to a metric-specific format. +There are two ways to set an adapter for a topology: +* Define the adapter as a string. + +```yml +adapter: classification +``` + +* Define the adapter as a dictionary, using `type:` to set the adapter name. This approach gives the opportunity to set additional parameters for the adapter if required. + +```yml +adapter: + type: reid + grn_workaround: False +``` + +AccuracyChecker supports the following set of adapters: +* `classification` - converting output of classification model to `ClassificationPrediction` representation. +* `segmentation` - converting output of semantic segmentation model to `SegmentationPrediction` representation. +* `tiny_yolo_v1` - converting output of Tiny YOLO v1 model to `DetectionPrediction` representation. +* `reid` - converting output of reidentification model to `ReIdentificationPrediction` representation. + * `grn_workaround` - enables processing of the output with an added Global Region Normalization layer. +* `yolo_v2` - converting output of YOLO v2 family models to `DetectionPrediction` representation.
+ * `classes` - number of detection classes (default 20). + * `anchors` - anchor values provided as a comma-separated list or one of the precomputed sets: `yolo_v2` and `tiny_yolo_v2`. + * `coords` - number of bbox coordinates (default 4). + * `num` - num parameter from DarkNet configuration file (default 5). +* `yolo_v3` - converting output of YOLO v3 family models to `DetectionPrediction` representation. + * `classes` - number of detection classes (default 80). + * `anchors` - anchor values provided as a comma-separated list or the precomputed set `yolo_v3`. + * `coords` - number of bbox coordinates (default 4). + * `num` - num parameter from DarkNet configuration file (default 3). + * `threshold` - minimal objectness score value for valid detections (default 0.001). + * `input_width` and `input_height` - network input width and height, respectively (default 416). + * `outputs` - the list of output layer names (optional); if specified, exactly 3 output layers should be provided. +* `lpr` - converting output of license plate recognition model to `CharacterRecognitionPrediction` representation. +* `ssd` - converting output of SSD model to `DetectionPrediction` representation. +* `face_person_detection` - converting face person detection model output with 2 detection outputs to `ContainerPrediction`, where the values of the `face_out` and `person_out` parameters are used to identify each `DetectionPrediction` in the container. + * `face_out` - face detection output layer name. + * `person_out` - person detection output layer name. +* `attributes_recognition` - converting vehicle attributes recognition model output to `ContainerPrediction`, where the values of the `color_out` and `type_out` parameters are used to identify each `ClassificationPrediction` in the container. + * `color_out` - vehicle color attribute output layer name. + * `type_out` - vehicle type attribute output layer name. +* `head_pose` - converting head pose estimation model output to `ContainerPrediction`, where the names of the `angle_pitch`, `angle_yaw` and `angle_roll` parameters are used to identify each `RegressionPrediction` in the container. + * `angle_pitch` - output layer name for pitch angle. + * `angle_yaw` - output layer name for yaw angle. + * `angle_roll` - output layer name for roll angle. +* `age_gender` - converting age gender recognition model output to `ContainerPrediction` with `ClassificationPrediction` named `gender` for gender recognition, `ClassificationPrediction` named `age_classification` and `RegressionPrediction` named `age_error` for age recognition. + * `age_out` - output layer name for age recognition. + * `gender_out` - output layer name for gender recognition. +* `action_detection` - converting output of model for person detection and action recognition tasks to `ContainerPrediction` with `DetectionPrediction` for class-agnostic metric calculation and `DetectionPrediction` for action recognition. The representations in the container have the names `class_agnostic_prediction` and `action_prediction` respectively. + * `priorbox_out` - name of layer containing prior boxes in SSD format. + * `loc_out` - name of layer containing box coordinates in SSD format. + * `main_conf_out` - name of layer containing detection confidences. + * `add_conf_out_prefix` - prefix for generating the names of the layers containing action confidences if the topology has several such layers, or the layer name otherwise. + * `add_conf_out_count` - number of layers with action confidences (optional; you may omit this argument if the action confidences are contained in one layer).
+ * `num_action_classes` - number of classes for action recognition. + * `detection_threshold` - minimal detection confidence level for valid detections. +* `super_resolution` - converting output of single image super resolution network to `SuperResolutionPrediction`. +* `landmarks_regression` - converting output of model for landmarks regression to `FacialLandmarksPrediction`. +* `text_detection` - converting output of model for text detection to `TextDetectionPrediction`. + * `pixel_class_out` - name of layer containing information related to text/no-text classification for each pixel. + * `pixel_link_out` - name of layer containing information related to linkage between pixels and their neighbors. +* `human_pose_estimation` - converting output of model for human pose estimation to `PoseEstimationPrediction`. + * `part_affinity_fields_out` - name of output layer with keypoints pairwise relations (part affinity fields). + * `keypoints_heatmap_out` - name of output layer with keypoints heatmaps. +* `beam_search_decoder` - implementation of a CTC Beam Search decoder for symbol sequence recognition, converting model output to `CharacterRecognitionPrediction`. + * `beam_size` - size of the beam to use during decoding (default 10). + * `blank_label` - index of the CTC blank label. + * `softmaxed_probabilities` - indicator that the model uses softmax for the output layer (default False). +* `gaze_estimation` - converting output of gaze estimation model to `GazeVectorPrediction`. diff --git a/tools/accuracy_checker/accuracy_checker/adapters/__init__.py b/tools/accuracy_checker/accuracy_checker/adapters/__init__.py new file mode 100644 index 0000000..d52b162 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/adapters/__init__.py @@ -0,0 +1,79 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from .adapter import Adapter, AdapterField + +from .action_recognition import ActionDetection +from .text_detection import TextDetectionAdapter, LPRAdapter, BeamSearchDecoder +from .image_processing import SuperResolutionAdapter +from .attributes_recognition import ( + HeadPoseEstimatorAdapter, + VehicleAttributesRecognitionAdapter, + PersonAttributesAdapter, + AgeGenderAdapter, + LandmarksRegressionAdapter, + GazeEstimationAdapter +) + +from .reidentification import ReidAdapter +from .detection import TinyYOLOv1Adapter, SSDAdapter, FacePersonAdapter, YoloV2Adapter, YoloV3Adapter +from .classification import ClassificationAdapter +from .segmentation import SegmentationAdapter, BrainTumorSegmentationAdapter +from .pose_estimation import HumanPoseAdapter + +from .dummy_adapters import XML2DetectionAdapter + +from .hit_ratio import HitRatioAdapter + +__all__ = [ + 'Adapter', + 'AdapterField', + + 'XML2DetectionAdapter', + + 'ClassificationAdapter', + + 'SSDAdapter', + 'TinyYOLOv1Adapter', + 'YoloV2Adapter', + 'YoloV3Adapter', + 'FacePersonAdapter', + + 'SegmentationAdapter', + 'BrainTumorSegmentationAdapter', + + 'ReidAdapter', + + 'SuperResolutionAdapter', + + 'HeadPoseEstimatorAdapter', + 'VehicleAttributesRecognitionAdapter', + 'PersonAttributesAdapter', + 'AgeGenderAdapter', + 'LandmarksRegressionAdapter', + 'GazeEstimationAdapter', + + 'TextDetectionAdapter', + + 'BeamSearchDecoder', + 'LPRAdapter', + + 'HumanPoseAdapter', + + 'ActionDetection', + + 'HitRatioAdapter' +] diff --git a/tools/accuracy_checker/accuracy_checker/adapters/action_recognition.py b/tools/accuracy_checker/accuracy_checker/adapters/action_recognition.py new file mode 100644 index 0000000..113eb9d --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/adapters/action_recognition.py @@ -0,0 +1,119 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import numpy as np + +from ..adapters import Adapter +from ..config import ConfigValidator, StringField, NumberField +from ..representation import DetectionPrediction, ContainerPrediction + + +class ActionDetectorConfig(ConfigValidator): + type = StringField() + priorbox_out = StringField() + loc_out = StringField() + main_conf_out = StringField() + add_conf_out_prefix = StringField() + add_conf_out_count = NumberField(optional=True, min_value=1) + num_action_classes = NumberField() + detection_threshold = NumberField(optional=True, floats=True, min_value=0, max_value=1) + + +class ActionDetection(Adapter): + __provider__ = 'action_detection' + + def validate_config(self): + action_detector_adapter_config = ActionDetectorConfig('ActionDetector_Config') + action_detector_adapter_config.validate(self.launcher_config) + + def configure(self): + self.priorbox_out = self.launcher_config['priorbox_out'] + self.loc_out = self.launcher_config['loc_out'] + self.main_conf_out = self.launcher_config['main_conf_out'] + self.num_action_classes = self.launcher_config['num_action_classes'] + self.detection_threshold = self.launcher_config.get('detection_threshold', 0) + add_conf_out_count = self.launcher_config.get('add_conf_out_count') + add_conf_out_prefix = self.launcher_config['add_conf_out_prefix'] + if add_conf_out_count is None: + self.add_conf_outs = [add_conf_out_prefix] + else: + self.add_conf_outs = [] + for num in np.arange(start=1, stop=add_conf_out_count + 1): + self.add_conf_outs.append('{}{}'.format(add_conf_out_prefix, num)) + + def process(self, raw, identifiers=None, frame_meta=None): + result = [] + raw_outputs = self._extract_predictions(raw, frame_meta) + prior_boxes = raw_outputs[self.priorbox_out][0][0].reshape(-1, 4) + prior_variances = raw_outputs[self.priorbox_out][0][1].reshape(-1, 4) + for batch_id, identifier in enumerate(identifiers): + labels, class_scores, x_mins, y_mins, x_maxs, y_maxs, main_scores = self.prepare_detection_for_id( + batch_id, raw_outputs, prior_boxes, prior_variances + ) + action_prediction = DetectionPrediction(identifier, labels, class_scores, x_mins, y_mins, x_maxs, y_maxs) + person_prediction = DetectionPrediction( + identifier, [1] * len(labels), main_scores, x_mins, y_mins, x_maxs, y_maxs + ) + result.append(ContainerPrediction({ + 'action_prediction': action_prediction, 'class_agnostic_prediction': person_prediction + })) + + return result + + def prepare_detection_for_id(self, batch_id, raw_outputs, prior_boxes, prior_variances): + num_detections = raw_outputs[self.loc_out][batch_id].size // 4 + locs = raw_outputs[self.loc_out][batch_id].reshape(-1, 4) + main_conf = raw_outputs[self.main_conf_out][batch_id].reshape(num_detections, -1) + add_confs = list(map( + lambda layer: raw_outputs[layer][batch_id].reshape(-1, self.num_action_classes), self.add_conf_outs + )) + anchors_num = len(add_confs) + labels, class_scores, x_mins, y_mins, x_maxs, y_maxs, main_scores = [], [], [], [], [], [], [] + for index in range(num_detections): + if main_conf[index, 1] < self.detection_threshold: + continue + + x_min, y_min, x_max, y_max = self.decode_box(prior_boxes[index], prior_variances[index], locs[index]) + action_confs = add_confs[index % anchors_num][index // anchors_num] + action_label = np.argmax(action_confs) + labels.append(action_label) + class_scores.append(action_confs[action_label]) + x_mins.append(x_min) + y_mins.append(y_min) + x_maxs.append(x_max) + y_maxs.append(y_max) + main_scores.append(main_conf[index, 1]) + + return labels, 
class_scores, x_mins, y_mins, x_maxs, y_maxs, main_scores + + @staticmethod + def decode_box(prior, var, deltas): + prior_width = prior[2] - prior[0] + prior_height = prior[3] - prior[1] + prior_center_x = (prior[0] + prior[2]) / 2 + prior_center_y = (prior[1] + prior[3]) / 2 + + decoded_box_center_x = var[0] * deltas[0] * prior_width + prior_center_x + decoded_box_center_y = var[1] * deltas[1] * prior_height + prior_center_y + decoded_box_width = np.exp(var[2] * deltas[2]) * prior_width + decoded_box_height = np.exp(var[3] * deltas[3]) * prior_height + + decoded_xmin = decoded_box_center_x - decoded_box_width / 2 + decoded_ymin = decoded_box_center_y - decoded_box_height / 2 + decoded_xmax = decoded_box_center_x + decoded_box_width / 2 + decoded_ymax = decoded_box_center_y + decoded_box_height / 2 + + return decoded_xmin, decoded_ymin, decoded_xmax, decoded_ymax diff --git a/tools/accuracy_checker/accuracy_checker/adapters/adapter.py b/tools/accuracy_checker/accuracy_checker/adapters/adapter.py new file mode 100644 index 0000000..2358dcc --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/adapters/adapter.py @@ -0,0 +1,71 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from ..config import BaseField, ConfigValidator, StringField +from ..dependency import ClassProvider + + +class Adapter(ClassProvider): + """ + Interface that describes converting raw output to appropriate representation. 
+ """ + + __provider_type__ = 'adapter' + + def __init__(self, launcher_config, label_map=None, output_blob=None): + self.launcher_config = launcher_config + self.output_blob = output_blob + self.label_map = label_map + + self.validate_config() + self.configure() + + def __call__(self, *args, **kwargs): + return self.process(*args, **kwargs) + + def process(self, raw, identifiers=None, frame_meta=None): + raise NotImplementedError + + def configure(self): + pass + + def validate_config(self): + pass + + @staticmethod + def _extract_predictions(outputs_list, meta): + return outputs_list[0] + + +class AdapterField(BaseField): + def validate(self, entry, field_uri_=None): + super().validate(entry, field_uri_) + + if entry is None: + return + + field_uri_ = field_uri_ or self.field_uri + if isinstance(entry, str): + StringField(choices=Adapter.providers).validate(entry, 'adapter') + elif isinstance(entry, dict): + class DictAdapterValidator(ConfigValidator): + type = StringField(choices=Adapter.providers) + dict_adapter_validator = DictAdapterValidator( + 'adapter', on_extra_argument=DictAdapterValidator.IGNORE_ON_EXTRA_ARGUMENT + ) + dict_adapter_validator.validate(entry) + else: + self.raise_error(entry, field_uri_, 'adapter must be either string or dictionary') diff --git a/tools/accuracy_checker/accuracy_checker/adapters/attributes_recognition.py b/tools/accuracy_checker/accuracy_checker/adapters/attributes_recognition.py new file mode 100644 index 0000000..b43040d --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/adapters/attributes_recognition.py @@ -0,0 +1,210 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import numpy as np + +from ..adapters import Adapter +from ..config import ConfigValidator, StringField +from ..representation import ( + ContainerPrediction, + RegressionPrediction, + ClassificationPrediction, + FacialLandmarksPrediction, + MultiLabelRecognitionPrediction, + GazeVectorPrediction +) + + +class HeadPoseEstimatorAdapterConfig(ConfigValidator): + type = StringField() + angle_yaw = StringField() + angle_pitch = StringField() + angle_roll = StringField() + + +class HeadPoseEstimatorAdapter(Adapter): + """ + Class for converting output of HeadPoseEstimator to HeadPosePrediction representation + """ + __provider__ = 'head_pose' + + def validate_config(self): + head_pose_estimator_adapter_config = HeadPoseEstimatorAdapterConfig( + 'HeadPoseEstimator_Config', on_extra_argument=HeadPoseEstimatorAdapterConfig.ERROR_ON_EXTRA_ARGUMENT) + head_pose_estimator_adapter_config.validate(self.launcher_config) + + def configure(self): + """ + Specifies parameters of config entry + """ + self.angle_yaw = self.launcher_config['angle_yaw'] + self.angle_pitch = self.launcher_config['angle_pitch'] + self.angle_roll = self.launcher_config['angle_roll'] + + def process(self, raw, identifiers=None, frame_meta=None): + """ + Args: + identifiers: list of input data identifiers + raw: output of model + frame_meta: list of meta information about each frame + Returns: + list of ContainerPrediction objects + """ + result = [] + raw_output = self._extract_predictions(raw, frame_meta) + for identifier, yaw, pitch, roll in zip( + identifiers, + raw_output[self.angle_yaw], + raw_output[self.angle_pitch], + raw_output[self.angle_roll] + ): + prediction = ContainerPrediction({'angle_yaw': RegressionPrediction(identifier, yaw[0]), + 'angle_pitch': RegressionPrediction(identifier, pitch[0]), + 'angle_roll': RegressionPrediction(identifier, roll[0])}) + result.append(prediction) + + return result + + +class VehicleAttributesRecognitionAdapterConfig(ConfigValidator): + type = StringField() + color_out = StringField() + type_out = StringField() + + +class VehicleAttributesRecognitionAdapter(Adapter): + __provider__ = 'vehicle_attributes' + + def validate_config(self): + attributes_recognition_adapter_config = VehicleAttributesRecognitionAdapterConfig( + 'VehicleAttributesRecognition_Config', + on_extra_argument=VehicleAttributesRecognitionAdapterConfig.ERROR_ON_EXTRA_ARGUMENT) + attributes_recognition_adapter_config.validate(self.launcher_config) + + def configure(self): + """ + Specifies parameters of config entry + """ + self.color_out = self.launcher_config['color_out'] + self.type_out = self.launcher_config['type_out'] + + def process(self, raw, identifiers=None, frame_meta=None): + res = [] + raw_output = self._extract_predictions(raw, frame_meta) + for identifier, colors, types in zip(identifiers, raw_output[self.color_out], raw_output[self.type_out]): + res.append(ContainerPrediction({'color': ClassificationPrediction(identifier, colors.reshape(-1)), + 'type': ClassificationPrediction(identifier, types.reshape(-1))})) + return res + + +class AgeGenderAdapterConfig(ConfigValidator): + type = StringField() + age_out = StringField() + gender_out = StringField() + + +class AgeGenderAdapter(Adapter): + __provider__ = 'age_gender' + + def configure(self): + self.age_out = self.launcher_config['age_out'] + self.gender_out = self.launcher_config['gender_out'] + + def validate_config(self): + age_gender_adapter_config = AgeGenderAdapterConfig( + 'AgeGender_Config', 
on_extra_argument=AgeGenderAdapterConfig.ERROR_ON_EXTRA_ARGUMENT) + age_gender_adapter_config.validate(self.launcher_config) + + @staticmethod + def get_age_scores(age): + age_scores = np.zeros(4) + if age < 19: + age_scores[0] = 1 + return age_scores + if age < 36: + age_scores[1] = 1 + return age_scores + if age < 66: + age_scores[2] = 1 + return age_scores + age_scores[3] = 1 + return age_scores + + def process(self, raw, identifiers=None, frame_meta=None): + result = [] + raw_output = self._extract_predictions(raw, frame_meta) + for identifier, age, gender in zip(identifiers, raw_output[self.age_out], raw_output[self.gender_out]): + gender = gender.reshape(-1) + age = age.reshape(-1)[0]*100 + gender_rep = ClassificationPrediction(identifier, gender) + age_class_rep = ClassificationPrediction(identifier, self.get_age_scores(age)) + age_error_rep = RegressionPrediction(identifier, age) + result.append(ContainerPrediction({'gender': gender_rep, 'age_classification': age_class_rep, + 'age_error': age_error_rep})) + return result + + +class LandmarksRegressionAdapter(Adapter): + __provider__ = 'landmarks_regression' + + def process(self, raw, identifiers=None, frame_meta=None): + res = [] + raw_output = self._extract_predictions(raw, frame_meta) + for identifier, values in zip(identifiers, raw_output[self.output_blob]): + x_values, y_values = values[::2], values[1::2] + res.append(FacialLandmarksPrediction(identifier, x_values.reshape(-1), y_values.reshape(-1))) + return res + + +class PersonAttributesConfig(ConfigValidator): + attributes_recognition_out = StringField(optional=True) + + +class PersonAttributesAdapter(Adapter): + __provider__ = 'person_attributes' + + def validate_config(self): + person_attributes_adapter_config = PersonAttributesConfig( + 'PersonAttributes_Config', + PersonAttributesConfig.IGNORE_ON_EXTRA_ARGUMENT + ) + person_attributes_adapter_config.validate(self.launcher_config) + + def configure(self): + self.attributes_recognition_out = self.launcher_config.get('attributes_recognition_out', self.output_blob) + + def process(self, raw, identifiers=None, frame_meta=None): + result = [] + raw_output = self._extract_predictions(raw, frame_meta) + for identifier, multi_label in zip(identifiers, raw_output[self.attributes_recognition_out or self.output_blob]): + multi_label[multi_label > 0.5] = 1. + multi_label[multi_label <= 0.5] = 0. + + result.append(MultiLabelRecognitionPrediction(identifier, multi_label.reshape(-1))) + + return result + + +class GazeEstimationAdapter(Adapter): + __provider__ = 'gaze_estimation' + + def process(self, raw, identifiers=None, frame_meta=None): + result = [] + raw_output = self._extract_predictions(raw, frame_meta) + for identifier, output in zip(identifiers, raw_output[self.output_blob]): + result.append(GazeVectorPrediction(identifier, output)) + + return result diff --git a/tools/accuracy_checker/accuracy_checker/adapters/classification.py b/tools/accuracy_checker/accuracy_checker/adapters/classification.py new file mode 100644 index 0000000..ddcf267 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/adapters/classification.py @@ -0,0 +1,45 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np + +from ..adapters import Adapter +from ..representation import ClassificationPrediction + + +class ClassificationAdapter(Adapter): + """ + Class for converting output of classification model to ClassificationPrediction representation + """ + __provider__ = 'classification' + + def process(self, raw, identifiers=None, frame_meta=None): + """ + Args: + identifiers: list of input data identifiers + raw: output of model + frame_meta: list of meta information about each frame + Returns: + list of ClassificationPrediction objects + """ + prediction = self._extract_predictions(raw, frame_meta)[self.output_blob] + prediction = np.reshape(prediction, (prediction.shape[0], -1)) + + result = [] + for identifier, output in zip(identifiers, prediction): + result.append(ClassificationPrediction(identifier, output)) + + return result diff --git a/tools/accuracy_checker/accuracy_checker/adapters/detection.py b/tools/accuracy_checker/accuracy_checker/adapters/detection.py new file mode 100644 index 0000000..4ff1355 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/adapters/detection.py @@ -0,0 +1,344 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import math + +import numpy as np + +from ..adapters import Adapter +from ..config import ConfigValidator, NumberField, StringField, ListField +from ..representation import DetectionPrediction, ContainerPrediction +from ..utils import get_or_parse_value + + +class TinyYOLOv1Adapter(Adapter): + """ + Class for converting output of Tiny YOLO v1 model to DetectionPrediction representation + """ + __provider__ = 'tiny_yolo_v1' + + def process(self, raw, identifiers=None, frame_meta=None): + """ + Args: + identifiers: list of input data identifiers + raw: output of model + Returns: + list of DetectionPrediction objects + """ + prediction = self._extract_predictions(raw, frame_meta)[self.output_blob] + + PROBABILITY_SIZE = 980 + CONFIDENCE_SIZE = 98 + BOXES_SIZE = 392 + + CELLS_X, CELLS_Y = 7, 7 + CLASSES = 20 + OBJECTS_PER_CELL = 2 + + result = [] + for identifier, output in zip(identifiers, prediction): + assert PROBABILITY_SIZE + CONFIDENCE_SIZE + BOXES_SIZE == output.shape[0] + + probability, scale, boxes = np.split(output, [PROBABILITY_SIZE, PROBABILITY_SIZE + CONFIDENCE_SIZE]) + + probability = np.reshape(probability, (CELLS_Y, CELLS_X, CLASSES)) + scale = np.reshape(scale, (CELLS_Y, CELLS_X, OBJECTS_PER_CELL)) + boxes = np.reshape(boxes, (CELLS_Y, CELLS_X, OBJECTS_PER_CELL, 4)) + + confidence = np.zeros((CELLS_Y, CELLS_X, OBJECTS_PER_CELL, CLASSES + 4)) + for cls in range(CLASSES): + confidence[:, :, 0, cls] = np.multiply(probability[:, :, cls], scale[:, :, 0]) + confidence[:, :, 1, cls] = np.multiply(probability[:, :, cls], scale[:, :, 1]) + + labels, scores, x_mins, y_mins, x_maxs, y_maxs = [], [], [], [], [], [] + for i, j, k in np.ndindex((CELLS_X, CELLS_Y, OBJECTS_PER_CELL)): + box = boxes[j, i, k] + box = [(box[0] + i) / float(CELLS_X), (box[1] + j) / float(CELLS_Y), box[2] ** 2, box[3] ** 2] + + label = np.argmax(confidence[j, i, k, :CLASSES]) + score = confidence[j, i, k, label] + + labels.append(label) + scores.append(score) + x_mins.append(box[0] - box[2] / 2.0) + y_mins.append(box[1] - box[3] / 2.0) + x_maxs.append(box[0] + box[2] / 2.0) + y_maxs.append(box[1] + box[3] / 2.0) + + result.append(DetectionPrediction(identifier, labels, scores, x_mins, y_mins, x_maxs, y_maxs)) + + return result + + +PRECOMPUTED_ANCHORS = { + 'yolo_v2': [1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071], + 'tiny_yolo_v2': [1.08, 1.19, 3.42, 4.41, 6.63, 11.38, 9.42, 5.11, 16.62, 10.52], + 'yolo_v3': [ + 10.0, 13.0, 16.0, 30.0, 33.0, 23.0, 30.0, 61.0, 62.0, 45.0, 59.0, 119.0, 116.0, 90.0, 156.0, 198.0, 373.0, 326.0 + ], + 'tiny_yolo_v3': [10.0, 14.0, 23.0, 27.0, 37.0, 58.0, 81.0, 82.0, 135.0, 169.0, 344.0, 319.0] +} + + +def entry_index(w, h, n_coords, n_classes, pos, entry): + row = pos // (w * h) + col = pos % (w * h) + return row * w * h * (n_classes + n_coords + 1) + entry * w * h + col + + +class BaseYoloAdapterConfig(ConfigValidator): + classes = NumberField(floats=False, optional=True, min_value=1) + coords = NumberField(floats=False, optional=True, min_value=1) + num = NumberField(floats=False, optional=True, min_value=1) + anchors = StringField(optional=True) + + +class YoloV2Adapter(Adapter): + """ + Class for converting output of YOLO v2 family models to DetectionPrediction representation + """ + __provider__ = 'yolo_v2' + + def validate_config(self): + yolo_v2_adapter_config = BaseYoloAdapterConfig('BaseYoloAdapter_Config') + yolo_v2_adapter_config.validate(self.launcher_config) + + def configure(self): + self.classes = 
self.launcher_config.get('classes', 20) + self.coords = self.launcher_config.get('coords', 4) + self.num = self.launcher_config.get('num', 5) + self.anchors = get_or_parse_value(self.launcher_config.get('anchors', 'yolo_v2'), PRECOMPUTED_ANCHORS) + + def process(self, raw, identifiers=None, frame_meta=None): + """ + Args: + identifiers: list of input data identifiers + raw: output of model + Returns: + list of DetectionPrediction objects + """ + predictions = self._extract_predictions(raw, frame_meta)[self.output_blob] + + cells_x, cells_y = 13, 13 + + result = [] + for identifier, prediction in zip(identifiers, predictions): + labels, scores, x_mins, y_mins, x_maxs, y_maxs = [], [], [], [], [], [] + for y, x, n in np.ndindex((cells_y, cells_x, self.num)): + index = n * cells_y * cells_x + y * cells_x + x + + box_index = entry_index(cells_x, cells_y, self.coords, self.classes, index, 0) + obj_index = entry_index(cells_x, cells_y, self.coords, self.classes, index, self.coords) + + scale = prediction[obj_index] + + box = [ + (x + prediction[box_index + 0 * (cells_y * cells_x)]) / cells_x, + (y + prediction[box_index + 1 * (cells_y * cells_x)]) / cells_y, + np.exp(prediction[box_index + 2 * (cells_y * cells_x)]) * self.anchors[2 * n + 0] / cells_x, + np.exp(prediction[box_index + 3 * (cells_y * cells_x)]) * self.anchors[2 * n + 1] / cells_y + ] + + classes_prob = np.empty(self.classes) + for cls in range(self.classes): + cls_index = entry_index(cells_x, cells_y, self.coords, self.classes, index, self.coords + 1 + cls) + classes_prob[cls] = prediction[cls_index] + + classes_prob = classes_prob * scale + + label = np.argmax(classes_prob) + + labels.append(label) + scores.append(classes_prob[label]) + x_mins.append(box[0] - box[2] / 2.0) + y_mins.append(box[1] - box[3] / 2.0) + x_maxs.append(box[0] + box[2] / 2.0) + y_maxs.append(box[1] + box[3] / 2.0) + + result.append(DetectionPrediction(identifier, labels, scores, x_mins, y_mins, x_maxs, y_maxs)) + + return result + + +class YoloV3AdapterConfig(BaseYoloAdapterConfig): + threshold = NumberField(floats=True, optional=True, min_value=0) + outputs = ListField(optional=True) + + +class YoloV3Adapter(Adapter): + """ + Class for converting output of YOLO v3 family models to DetectionPrediction representation + """ + __provider__ = 'yolo_v3' + + def validate_config(self): + yolo_v3_adapter_config = YoloV3AdapterConfig('YoloV3Adapter_Config') + yolo_v3_adapter_config.validate(self.launcher_config) + + def configure(self): + self.classes = self.launcher_config.get('classes', 80) + self.coords = self.launcher_config.get('coords', 4) + self.num = self.launcher_config.get('num', 3) + self.anchors = get_or_parse_value(self.launcher_config.get('anchors', 'yolo_v3'), PRECOMPUTED_ANCHORS) + self.threshold = self.launcher_config.get('threshold', 0.001) + self.outputs = self.launcher_config.get('outputs', []) + + def process(self, raw, identifiers=None, frame_meta=None): + """ + Args: + identifiers: list of input data identifiers + raw: output of model + Returns: + list of DetectionPrediction objects + """ + + def get_anchors_offset(x): + return int((self.num * 2) * (len(self.anchors) / (self.num * 2) - 1 - math.log2(x / 13))) + + def parse_yolo_v3_results(prediction, threshold, w, h, det): + cells_x, cells_y = prediction.shape[1:] + prediction = prediction.flatten() + for y, x, n in np.ndindex((cells_y, cells_x, self.num)): + index = n * cells_y * cells_x + y * cells_x + x + anchors_offset = get_anchors_offset(cells_x) + + box_index = entry_index(cells_x, 
cells_y, self.coords, self.classes, index, 0)
+                obj_index = entry_index(cells_x, cells_y, self.coords, self.classes, index, self.coords)
+
+                scale = prediction[obj_index]
+                if scale < threshold:
+                    continue
+
+                box = [
+                    (x + prediction[box_index + 0 * (cells_y * cells_x)]) / cells_x,
+                    (y + prediction[box_index + 1 * (cells_y * cells_x)]) / cells_y,
+                    np.exp(prediction[box_index + 2 * (cells_y * cells_x)]) * self.anchors[anchors_offset + 2 * n + 0] / w,
+                    np.exp(prediction[box_index + 3 * (cells_y * cells_x)]) * self.anchors[anchors_offset + 2 * n + 1] / h
+                ]
+
+                classes_prob = np.empty(self.classes)
+                for cls in range(self.classes):
+                    cls_index = entry_index(cells_x, cells_y, self.coords, self.classes, index, self.coords + 1 + cls)
+                    classes_prob[cls] = prediction[cls_index] * scale
+
+                    det['labels'].append(cls)
+                    det['scores'].append(classes_prob[cls])
+                    det['x_mins'].append(box[0] - box[2] / 2.0)
+                    det['y_mins'].append(box[1] - box[3] / 2.0)
+                    det['x_maxs'].append(box[0] + box[2] / 2.0)
+                    det['y_maxs'].append(box[1] + box[3] / 2.0)
+
+            return det
+
+        result = []
+
+        raw_outputs = self._extract_predictions(raw, frame_meta)
+
+        if self.outputs:
+            outputs = self.outputs
+        else:
+            outputs = raw_outputs.keys()
+
+        batch = len(identifiers)
+        predictions = [[] for _ in range(batch)]
+        for blob in outputs:
+            for b in range(batch):
+                # index the merged outputs, not the raw argument, which may be a list of infer results
+                predictions[b].append(raw_outputs[blob][b])
+
+        for identifier, prediction, meta in zip(identifiers, predictions, frame_meta):
+            detections = {'labels': [], 'scores': [], 'x_mins': [], 'y_mins': [], 'x_maxs': [], 'y_maxs': []}
+            input_shape = list(meta.get('input_shape', {'data': (3, 416, 416)}).values())[0]
+            self.input_width = input_shape[2]
+            self.input_height = input_shape[1]
+
+            for p in prediction:
+                parse_yolo_v3_results(p, self.threshold, self.input_width, self.input_height, detections)
+
+            result.append(DetectionPrediction(
+                identifier, detections['labels'], detections['scores'], detections['x_mins'], detections['y_mins'],
+                detections['x_maxs'], detections['y_maxs']
+            ))
+
+        return result
+
+
+class SSDAdapter(Adapter):
+    """
+    Class for converting output of SSD model to DetectionPrediction representation
+    """
+    __provider__ = 'ssd'
+
+    def process(self, raw, identifiers=None, frame_meta=None):
+        """
+        Args:
+            identifiers: list of input data identifiers
+            raw: output of model
+        Returns:
+            list of DetectionPrediction objects
+        """
+        raw_outputs = self._extract_predictions(raw, frame_meta)
+        prediction_batch = raw_outputs[self.output_blob]
+        prediction_count = prediction_batch.shape[2]
+        prediction_batch = prediction_batch.reshape(prediction_count, -1)
+        prediction_batch = self.remove_empty_detections(prediction_batch)
+
+        result = []
+        for batch_index, identifier in enumerate(identifiers):
+            prediction_mask = np.where(prediction_batch[:, 0] == batch_index)
+            detections = prediction_batch[prediction_mask]
+            detections = detections[:, 1::]
+            result.append(DetectionPrediction(identifier, *zip(*detections)))
+
+        return result
+
+    @staticmethod
+    def remove_empty_detections(prediction_blob):
+        ind = prediction_blob[:, 0]
+        ind_ = np.where(ind == -1)[0]
+        m = ind_[0] if ind_.size else prediction_blob.shape[0]
+        return prediction_blob[:m, :]
+
+
+class FacePersonDetectionAdapterConfig(ConfigValidator):
+    type = StringField()
+    face_out = StringField()
+    person_out = StringField()
+
+
+class FacePersonAdapter(Adapter):
+    __provider__ = 'face_person_detection'
+
+    def validate_config(self):
+        face_person_detection_adapter_config = 
FacePersonDetectionAdapterConfig( + 'FacePersonDetection_Config', on_extra_argument=FacePersonDetectionAdapterConfig.ERROR_ON_EXTRA_ARGUMENT) + face_person_detection_adapter_config.validate(self.launcher_config) + + def configure(self): + self.face_detection_out = self.launcher_config['face_out'] + self.person_detection_out = self.launcher_config['person_out'] + self.face_adapter = SSDAdapter(self.launcher_config, self.label_map, self.face_detection_out) + self.person_adapter = SSDAdapter(self.launcher_config, self.label_map, self.person_detection_out) + + def process(self, raw, identifiers=None, frame_meta=None): + face_batch_result = self.face_adapter(raw, identifiers) + person_batch_result = self.person_adapter(raw, identifiers) + result = [ContainerPrediction({self.face_detection_out: face_result, self.person_detection_out: person_result}) + for face_result, person_result in zip(face_batch_result, person_batch_result)] + + return result diff --git a/tools/accuracy_checker/accuracy_checker/adapters/dummy_adapters.py b/tools/accuracy_checker/accuracy_checker/adapters/dummy_adapters.py new file mode 100644 index 0000000..300dec9 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/adapters/dummy_adapters.py @@ -0,0 +1,64 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from ..representation import DetectionPrediction +from ..adapters import Adapter + + +class XML2DetectionAdapter(Adapter): + """ + Class for converting xml detection results in OpenCV FileStorage format to DetectionPrediction representation. 
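+    Judging from the parsing code below, the expected tree holds groups of frames;
+    each frame's tag becomes the "<tag>.png" identifier, and every prediction node
+    carries "type", "confidence" and "roi" ("x y width height") children, plus an
+    optional "is_ignored" flag.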
+ """ + + __provider__ = 'xml_detection' + + def process(self, tree, identifiers=None, frame_meta=None): + class_to_ind = dict(zip(self.label_map.values(), range(len(self.label_map.values())))) + + result = {} + for frames in tree.getroot(): + for frame in frames: + identifier = frame.tag + '.png' + labels, scores, x_mins, y_mins, x_maxs, y_maxs = [], [], [], [], [], [] + for prediction in frame: + if prediction.find('is_ignored'): + continue + + label = prediction.find('type') + if not label: + raise ValueError('Detection predictions contains detection without "{}"'.format('type')) + label = class_to_ind[label.text] + + confidence = prediction.find('confidence') + if confidence is None: + raise ValueError('Detection predictions contains detection without "{}"'.format('confidence')) + confidence = float(confidence.text) + + box = prediction.find('roi') + if not box: + raise ValueError('Detection predictions contains detection without "{}"'.format('roi')) + box = list(map(float, box.text.split())) + + labels.append(label) + scores.append(confidence) + x_mins.append(box[0]) + y_mins.append(box[1]) + x_maxs.append(box[0] + box[2]) + y_maxs.append(box[1] + box[3]) + + result[identifier] = DetectionPrediction(identifier, labels, scores, x_mins, y_mins, x_maxs, y_maxs) + + return result diff --git a/tools/accuracy_checker/accuracy_checker/adapters/hit_ratio.py b/tools/accuracy_checker/accuracy_checker/adapters/hit_ratio.py new file mode 100644 index 0000000..f28b84f --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/adapters/hit_ratio.py @@ -0,0 +1,47 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np + +from ..adapters import Adapter +from ..representation import HitRatioPrediction + + +class HitRatioAdapter(Adapter): + """ + Class for converting output of NCF model to HitRatioPrediction representation. + """ + + __provider__ = 'hit_ratio_adapter' + + def process(self, raw, identifiers=None, frame_meta=None): + """ + Args: + raw: output of model. + identifiers: list of input data identifiers. + frame_meta: metadata for frame. + Returns: + list of HitRatioPrediction objects. + """ + + prediction = self._extract_predictions(raw, frame_meta)[self.output_blob] + prediction = np.reshape(prediction, -1) + + result = [] + for identifier, output in zip(identifiers, prediction): + result.append(HitRatioPrediction(identifier, output)) + + return result diff --git a/tools/accuracy_checker/accuracy_checker/adapters/image_processing.py b/tools/accuracy_checker/accuracy_checker/adapters/image_processing.py new file mode 100644 index 0000000..21ecec3 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/adapters/image_processing.py @@ -0,0 +1,35 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np + +from ..adapters import Adapter +from ..representation import SuperResolutionPrediction + + +class SuperResolutionAdapter(Adapter): + __provider__ = 'super_resolution' + + def process(self, raw, identifiers=None, frame_meta=None): + result = [] + raw_outputs = self._extract_predictions(raw, frame_meta) + for identifier, img_sr in zip(identifiers, raw_outputs[self.output_blob]): + img_sr *= 255 + img_sr = np.clip(img_sr, 0., 255.) + img_sr = img_sr.transpose((1, 2, 0)).astype(np.uint8) + result.append(SuperResolutionPrediction(identifier, img_sr)) + + return result diff --git a/tools/accuracy_checker/accuracy_checker/adapters/pose_estimation.py b/tools/accuracy_checker/accuracy_checker/adapters/pose_estimation.py new file mode 100644 index 0000000..25350f5 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/adapters/pose_estimation.py @@ -0,0 +1,331 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
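The `SuperResolutionAdapter` above performs the usual float-CHW to displayable-image conversion: scale to the 0-255 range, clip, transpose to HWC, and cast to `uint8`. A standalone equivalent, assuming random data in place of a real network output:

```python
import numpy as np

chw = np.random.rand(3, 128, 128).astype(np.float32)  # fake network output in [0, 1]

img = np.clip(chw * 255, 0., 255.)                # scale and clamp to the valid pixel range
img = img.transpose((1, 2, 0)).astype(np.uint8)   # CHW -> HWC, float -> uint8

assert img.shape == (128, 128, 3) and img.dtype == np.uint8
```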
+""" + +import math +from operator import itemgetter + +import cv2 +import numpy as np + +from ..adapters import Adapter +from ..config import ConfigValidator, StringField +from ..representation import PoseEstimationPrediction + + +class HumanPoseAdapterConfig(ConfigValidator): + type = StringField() + part_affinity_fields_out = StringField() + keypoints_heatmap_out = StringField() + + +class HumanPoseAdapter(Adapter): + __provider__ = 'human_pose_estimation' + + limb_seq = [ + [2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], [10, 11], [2, 12], [12, 13], + [13, 14], [2, 1], [1, 15], [15, 17], [1, 16], [16, 18], [3, 17], [6, 18] + ] + map_idx = [ + [31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22], [23, 24], [25, 26], + [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52], [55, 56], [37, 38], [45, 46] + ] + + def validate_config(self): + human_pose_estimation_config = HumanPoseAdapterConfig('HumanPose_Config') + human_pose_estimation_config.validate(self.launcher_config) + + def configure(self): + self.part_affinity_fields = self.launcher_config['part_affinity_fields_out'] + self.keypoints_heatmap = self.launcher_config['keypoints_heatmap_out'] + + def process(self, raw, identifiers=None, frame_meta=None): + result = [] + raw_outputs = self._extract_predictions(raw, frame_meta) + raw_output = zip( + identifiers, raw_outputs[self.keypoints_heatmap], + raw_outputs[self.part_affinity_fields], frame_meta + ) + for identifier, heatmap, paf, meta in raw_output: + height, width, _ = meta['image_size'] + heatmap_avg = np.zeros((height, width, 19), dtype=np.float32) + paf_avg = np.zeros((height, width, 38), dtype=np.float32) + pad = meta.get('padding', [0, 0, 0, 0]) + heatmap = np.transpose(np.squeeze(heatmap), (1, 2, 0)) + heatmap = cv2.resize(heatmap, (0, 0), fx=8, fy=8, interpolation=cv2.INTER_CUBIC) + heatmap = heatmap[pad[0]:heatmap.shape[0] - pad[2], pad[1]:heatmap.shape[1] - pad[3]:, :] + heatmap = cv2.resize(heatmap, (width, height), interpolation=cv2.INTER_CUBIC) + heatmap_avg = heatmap_avg + heatmap + + paf = np.transpose(np.squeeze(paf), (1, 2, 0)) + paf = cv2.resize(paf, (0, 0), fx=8, fy=8, interpolation=cv2.INTER_CUBIC) + paf = paf[pad[0]:paf.shape[0] - pad[2], pad[1]:paf.shape[1] - pad[3], :] + paf = cv2.resize(paf, (width, height), interpolation=cv2.INTER_CUBIC) + paf_avg = paf_avg + paf + + peak_counter = 0 + all_peaks = [] + for part in range(0, 18): # 19th for bg + peak_counter += self.find_peaks(heatmap_avg[:, :, part], all_peaks, peak_counter) + + subset, candidate = self.group_peaks(all_peaks, paf_avg) + result.append(PoseEstimationPrediction(identifier, *self.get_poses(subset, candidate))) + + return result + + @staticmethod + def find_peaks(heatmap, all_peaks, prev_peak_counter): + heatmap[heatmap < 0.1] = 0 + map_aug = np.zeros((heatmap.shape[0] + 2, heatmap.shape[1] + 2)) + map_left = np.zeros(map_aug.shape) + map_right = np.zeros(map_aug.shape) + map_up = np.zeros(map_aug.shape) + map_down = np.zeros(map_aug.shape) + + map_aug[1:map_aug.shape[0] - 1, 1:map_aug.shape[1] - 1] = heatmap + map_left[1:map_aug.shape[0] - 1, :map_aug.shape[1] - 2] = heatmap + map_right[1:map_aug.shape[0] - 1, 2:map_aug.shape[1]] = heatmap + map_up[:map_aug.shape[0] - 2, 1:map_aug.shape[1] - 1] = heatmap + map_down[2:map_aug.shape[0], 1:map_aug.shape[1] - 1] = heatmap + + peaks_binary = (map_aug > map_left) & (map_aug > map_right) & (map_aug > map_up) & (map_aug > map_down) + peaks_binary = peaks_binary[1:map_aug.shape[0] - 1, 1:map_aug.shape[1] - 
1] + peaks = list(zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0])) + peaks = sorted(peaks, key=itemgetter(0)) # same order with matlab + + flag = np.ones(len(peaks), np.uint8) + peaks_with_score_and_id = [] + peak_counter = 0 + for i, _ in enumerate(peaks): + if flag[i] != 1: + continue + for j in range(i + 1, len(peaks)): + if math.sqrt((peaks[i][0] - peaks[j][0]) ** 2 + (peaks[i][1] - peaks[j][1]) ** 2) < 6: + flag[j] = 0 + peak_id = peak_counter + prev_peak_counter + peak_counter += 1 + peaks_with_score_and_id.append([peaks[i][0], peaks[i][1], heatmap[peaks[i][1], peaks[i][0]], peak_id]) + all_peaks.append(peaks_with_score_and_id) + + return peak_counter + + @staticmethod + def _add_pose_single_candidate(subset, candidate, idx_joint, kpt_num=20): + for joint in candidate: + num = 0 + for subset_j in subset: # check if already in some pose, was added as a part of another limb + if subset_j[idx_joint] == joint[3]: + num += 1 + continue + if num == 0: + person_keypoints = np.ones(kpt_num) * -1 + person_keypoints[idx_joint] = joint[3] # joint idx + person_keypoints[-1] = 1 # n joints in pose + person_keypoints[-2] = joint[2] # pose score + subset.append(person_keypoints) + + return subset + + @staticmethod + def _filter_subset(subset): + filtered_subset = [] + for subset_element in subset: + if subset_element[-1] < 3 or (subset_element[-2] / subset_element[-1] < 0.2): + continue + filtered_subset.append(subset_element) + + return np.asarray(filtered_subset) + + @staticmethod + def _add_pose_both_candidates(subset, temp, index_a, index_b, candidates, kpt_num=20): + for i, temp_i in enumerate(temp): + num = 0 + for j, subset_j in enumerate(subset): + if subset_j[index_a] == temp_i[0]: + subset[j][index_b] = temp[i][1] + num += 1 + subset[j][-1] += 1 + subset[j][-2] += candidates[temp_i[1], 2] + temp_i[2] + if num == 0: + person_keypoints = np.ones(kpt_num) * -1 + person_keypoints[index_a] = temp[i][0] + person_keypoints[index_b] = temp[i][1] + person_keypoints[-1] = 2 + person_keypoints[-2] = np.sum(candidates[temp_i[0:2], 2]) + temp_i[2] + subset.append(person_keypoints) + + return subset + + @staticmethod + def _copy_temperature_to_subset(subset, temp, index_a, index_b): + for _, temp_i in enumerate(temp): + for j, subset_j in enumerate(subset): + check_subset_a = subset_j[index_a] == temp_i[0] and subset_j[index_b] == -1 + check_subset_b = subset_j[index_b] == temp_i[1] and subset_j[index_a] == -1 + if check_subset_a: + subset[j][index_b] = temp_i[1] + continue + if check_subset_b: + subset[j][index_a] = temp_i[0] + + return subset + + @staticmethod + def _get_temperature(cand_a_, cand_b_, score_mid, pafs, threshold=0.05): + temp_ = [] + for index_a_, cand_a_element in enumerate(cand_a_): + for index_b_, cand_b_element in enumerate(cand_b_): + mid_point = [( + int(round((cand_a_element[0] + cand_b_element[0]) * 0.5)), + int(round((cand_a_element[1] + cand_b_element[1]) * 0.5)) + )] * 2 + vec = [cand_b_element[0] - cand_a_element[0], cand_b_element[1] - cand_a_element[1]] + norm_vec = math.sqrt(vec[0] ** 2 + vec[1] ** 2) + if norm_vec == 0: + continue + vec[0] /= norm_vec + vec[1] /= norm_vec + score_mid_a = score_mid[mid_point[0][1], mid_point[0][0], 0] + score_mid_b = score_mid[mid_point[1][1], mid_point[1][0], 1] + score = vec[0] * score_mid_a + vec[1] * score_mid_b + + height_n = pafs.shape[0] // 2 + suc_ratio = 0 + mid_score = 0 + mid_num = 10 # n points for integral over paf + + if score > -100: + p_sum = 0 + p_count = 0 + + x = np.linspace(cand_a_element[0], 
cand_b_element[0], mid_num) + y = np.linspace(cand_a_element[1], cand_b_element[1], mid_num) + for point_idx in range(0, mid_num): + px = int(round(x[point_idx])) + py = int(round(y[point_idx])) + pred = score_mid[py, px, 0:2] + score = vec[0] * pred[0] + vec[1] * pred[1] + if score > threshold: + p_sum += score + p_count += 1 + suc_ratio = p_count / mid_num + ratio = 0 + if p_count > 0: + ratio = p_sum / p_count + mid_score = ratio + min(height_n / norm_vec - 1, 0) + if mid_score > 0 and suc_ratio > 0.8: + score = mid_score + score_all = score + cand_a_element[2] + cand_b_element[2] + temp_.append([index_a_, index_b_, score, score_all]) + if temp_: + temp_ = sorted(temp_, key=itemgetter(2), reverse=True) + + return temp_ + + def _get_connections(self, cand_a, cand_b, score_mid, pafs, thresh): + temp_ = self._get_temperature(cand_a, cand_b, score_mid, pafs, thresh) + num_limbs = min(len(cand_a), len(cand_b)) + cnt = 0 + occur_a = np.zeros(len(cand_a), dtype=np.int32) + occur_b = np.zeros(len(cand_b), dtype=np.int32) + connections = [] + for row_temp in temp_: + if cnt == num_limbs: + break + i, j, score = row_temp[0:3] + if occur_a[i] == 0 and occur_b[j] == 0: + connections.append([cand_a[i][3], cand_b[j][3], score]) + cnt += 1 + occur_a[i] = 1 + occur_b[j] = 1 + return connections + + def group_peaks(self, peaks, pafs, kpt_num=20, threshold=0.05): + subset = [] + candidates = np.array([item for sublist in peaks for item in sublist]) + for keypoint_id, maped_keypoints in enumerate(self.map_idx): + score_mid = pafs[:, :, [x - 19 for x in maped_keypoints]] + candidate_a = peaks[self.limb_seq[keypoint_id][0] - 1] + candidate_b = peaks[self.limb_seq[keypoint_id][1] - 1] + idx_joint_a = self.limb_seq[keypoint_id][0] - 1 + idx_joint_b = self.limb_seq[keypoint_id][1] - 1 + + if not candidate_a and not candidate_b: # no such limb + continue + if not candidate_a: # limb has just B joint + subset = self._add_pose_single_candidate(subset, candidate_b, idx_joint_b, kpt_num) + continue + if not candidate_b: # limb has just A joint + subset = self._add_pose_single_candidate(subset, candidate_a, idx_joint_a, kpt_num) + continue + + temp = self._get_connections(candidate_a, candidate_b, score_mid, pafs, threshold) + if not temp: + continue + + if keypoint_id == 0: + subset = [np.ones(kpt_num) * -1 for _ in temp] + for i, temp_i in enumerate(temp): + subset[i][self.limb_seq[0][0] - 1] = temp_i[0] + subset[i][self.limb_seq[0][1] - 1] = temp_i[1] + subset[i][-1] = 2 + subset[i][-2] = np.sum(candidates[temp_i[0:2], 2]) + temp_i[2] + else: + index_a = self.limb_seq[keypoint_id][0] - 1 + index_b = self.limb_seq[keypoint_id][1] - 1 + if keypoint_id in (17, 18): + subset = self._copy_temperature_to_subset(subset, temp, index_a, index_b) + continue + subset = self._add_pose_both_candidates(subset, temp, index_a, index_b, candidates, kpt_num) + + return self._filter_subset(subset), candidates + + @staticmethod + def get_poses(subset, candidate): + persons_keypoints_x, persons_keypoints_y, persons_keypoints_v = [], [], [] + scores = [] + for subset_element in subset: + if subset_element.size == 0: + continue + keypoints_x, keypoints_y, keypoints_v = [0] * 17, [0] * 17, [0] * 17 + to_coco_map = [0, -1, 6, 8, 10, 5, 7, 9, 12, 14, 16, 11, 13, 15, 2, 1, 4, 3] + person_score = subset_element[-2] + position_id = -1 + for keypoint_id in subset_element[:-2]: + position_id += 1 + if position_id == 1: # No 'Neck' in COCO + continue + + cx, cy, visibility = 0, 0, 0 # Keypoint not found + if keypoint_id != -1: + cx, cy = 
candidate[keypoint_id.astype(int), 0:2] + cx = cx - 0.5 + 1 # +1 for matlab consistency, coords start from 1 + cy = cy - 0.5 + 1 + visibility = 1 + keypoints_x[to_coco_map[position_id]] = cx + keypoints_y[to_coco_map[position_id]] = cy + keypoints_v[to_coco_map[position_id]] = visibility + + scores.append(person_score * max(0, (subset_element[-1] - 1))) # -1 for Neck + persons_keypoints_x.append(keypoints_x) + persons_keypoints_y.append(keypoints_y) + persons_keypoints_v.append(keypoints_v) + + persons_keypoints_x = np.array(persons_keypoints_x) + persons_keypoints_y = np.array(persons_keypoints_y) + persons_keypoints_v = np.array(persons_keypoints_v) + scores = np.array(scores) + + return persons_keypoints_x, persons_keypoints_y, persons_keypoints_v, scores diff --git a/tools/accuracy_checker/accuracy_checker/adapters/reidentification.py b/tools/accuracy_checker/accuracy_checker/adapters/reidentification.py new file mode 100644 index 0000000..f2fed25 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/adapters/reidentification.py @@ -0,0 +1,58 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np + +from ..adapters import Adapter +from ..representation import ReIdentificationPrediction + + +class ReidAdapter(Adapter): + """ + Class for converting output of Reid model to ReIdentificationPrediction representation + """ + __provider__ = 'reid' + + def configure(self): + """ + Specifies parameters of config entry + """ + self.grn_workaround = self.launcher_config.get("grn_workaround", True) + + def process(self, raw, identifiers=None, frame_meta=None): + """ + Args: + identifiers: list of input data identifiers + raw: output of model + Returns: + list of ReIdentificationPrediction objects + """ + prediction = self._extract_predictions(raw, frame_meta)[self.output_blob] + + if self.grn_workaround: + # workaround: GRN layer + prediction = self._grn_layer(prediction) + + return [ReIdentificationPrediction(identifier, embedding.reshape(-1)) + for identifier, embedding in zip(identifiers, prediction)] + + @staticmethod + def _grn_layer(prediction): + GRN_BIAS = 0.000001 + sum_ = np.sum(prediction ** 2, axis=1) + prediction = prediction / np.sqrt(sum_[:, np.newaxis] + GRN_BIAS) + + return prediction diff --git a/tools/accuracy_checker/accuracy_checker/adapters/segmentation.py b/tools/accuracy_checker/accuracy_checker/adapters/segmentation.py new file mode 100644 index 0000000..1654c89 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/adapters/segmentation.py @@ -0,0 +1,83 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
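The GRN workaround in `ReidAdapter._grn_layer` above is an L2 normalization of each embedding row with a small bias for numerical stability. A standalone sanity check on random embeddings:

```python
import numpy as np

GRN_BIAS = 0.000001
embeddings = np.random.rand(4, 256).astype(np.float32)

# Divide each row by its (biased) L2 norm, as the adapter does.
norm = np.sqrt(np.sum(embeddings ** 2, axis=1)[:, np.newaxis] + GRN_BIAS)
normalized = embeddings / norm

# Each row now has (almost exactly) unit L2 norm.
assert np.allclose(np.linalg.norm(normalized, axis=1), 1.0, atol=1e-3)
```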
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import numpy as np
+from ..adapters import Adapter
+from ..representation import SegmentationPrediction, BrainTumorSegmentationPrediction
+
+
+class SegmentationAdapter(Adapter):
+    __provider__ = 'segmentation'
+
+    def process(self, raw, identifiers=None, frame_meta=None):
+        result = []
+        # [] * n is always the empty list; provide one empty meta dict per identifier instead
+        frame_meta = frame_meta or [{}] * len(identifiers)
+        raw_outputs = self._extract_predictions(raw, frame_meta)
+        for identifier, output in zip(identifiers, raw_outputs[self.output_blob]):
+            result.append(SegmentationPrediction(identifier, output))
+
+        return result
+
+    def _extract_predictions(self, outputs_list, meta):
+        if 'tiles_shape' not in (meta[-1] or {}):
+            new_raw = {}
+            for out in outputs_list:
+                for key, val in out.items():
+                    out_previous = new_raw.get(key, [])
+                    out_previous.append(val)
+                    new_raw[key] = out_previous
+
+            for k in new_raw:
+                new_raw[k] = [new_raw[k]]
+            return new_raw
+        tiles_shapes = [image_meta['tiles_shape'] for image_meta in meta]
+        restore_output = []
+        offset = 0
+        for _, image_tiles_shape in enumerate(tiles_shapes):
+            next_offset = offset + image_tiles_shape[0] * image_tiles_shape[1]
+            image_tiles = [network_output[self.output_blob] for network_output in outputs_list[offset:next_offset]]
+            tiles_columns = image_tiles[::image_tiles_shape[0]]
+            image = tiles_columns[0]
+            for tile_column in tiles_columns[1:]:
+                image = np.concatenate((image, tile_column), axis=3)
+            restore_output.append(image.squeeze())
+            offset = next_offset
+
+        return {self.output_blob: restore_output}
+
+
+class BrainTumorSegmentationAdapter(Adapter):
+    __provider__ = 'brain_tumor_segmentation'
+
+    def process(self, raw, identifiers=None, frame_meta=None):
+        result = []
+        frame_meta = frame_meta or [{}] * len(identifiers)
+        raw_outputs = self._extract_predictions(raw, frame_meta)
+        for identifier, output in zip(identifiers, raw_outputs[self.output_blob]):
+            result.append(BrainTumorSegmentationPrediction(identifier, output))
+
+        return result
+
+    def _extract_predictions(self, outputs_list, meta):
+        if not (meta[-1] or {}).get('multi_infer', False):
+            return outputs_list[0]
+
+        output_keys = list(outputs_list[0].keys())
+        output_map = {}
+        for output_key in output_keys:
+            output_data = [[output[output_key] for output in outputs_list]]
+            output_map[output_key] = output_data
+
+        return output_map
diff --git a/tools/accuracy_checker/accuracy_checker/adapters/text_detection.py b/tools/accuracy_checker/accuracy_checker/adapters/text_detection.py
new file mode 100644
index 0000000..d90ebfc
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/adapters/text_detection.py
@@ -0,0 +1,309 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
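To make the tile-reassembly branch in `SegmentationAdapter._extract_predictions` above concrete, here is a toy reconstruction for one image split into tiles along the width. Note the reading of `tiles_shape` as (tiles per column, tiles per row) is an assumption drawn from the indexing, not stated in the source:

```python
import numpy as np

# Two fake NCHW network outputs for one image split into 1 x 2 tiles.
tile_a = np.zeros((1, 3, 4, 4))
tile_b = np.ones((1, 3, 4, 4))
image_tiles = [tile_a, tile_b]

tiles_shape = (1, 2)                          # assumed (tiles per column, tiles per row)
tiles_columns = image_tiles[::tiles_shape[0]]  # every tiles_shape[0]-th tile
image = tiles_columns[0]
for tile_column in tiles_columns[1:]:
    image = np.concatenate((image, tile_column), axis=3)  # stitch along the width axis

assert image.squeeze().shape == (3, 4, 8)
```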
+See the License for the specific language governing permissions and +limitations under the License. +""" + +from collections import defaultdict + +import cv2 +import numpy as np + + +from ..adapters import Adapter +from ..config import ConfigValidator, StringField, NumberField, BoolField, ConfigError +from ..representation import TextDetectionPrediction, CharacterRecognitionPrediction + + +class TextDetectionAdapterConfig(ConfigValidator): + type = StringField() + pixel_link_out = StringField() + pixel_class_out = StringField() + + +class TextDetectionAdapter(Adapter): + __provider__ = 'text_detection' + + def validate_config(self): + text_detection_adapter_config = TextDetectionAdapterConfig('TextDetectionAdapter_Config') + text_detection_adapter_config.validate(self.launcher_config) + + def configure(self): + self.pixel_link_out = self.launcher_config['pixel_link_out'] + self.pixel_class_out = self.launcher_config['pixel_class_out'] + + def process(self, raw, identifiers=None, frame_meta=None): + results = [] + predictions = self._extract_predictions(raw, frame_meta) + raw_output = zip(identifiers, frame_meta, predictions[self.pixel_link_out], predictions[self.pixel_class_out]) + for identifier, current_frame_meta, link_data, cls_data in raw_output: + link_data = link_data.reshape((1, *link_data.shape)) + cls_data = cls_data.reshape((1, *cls_data.shape)) + link_data_shape = link_data.shape + new_link_data_shape = (link_data_shape[0], link_data_shape[2], link_data_shape[3], link_data_shape[1] / 2) + cls_data_shape = cls_data.shape + new_cls_data_shape = (cls_data_shape[0], cls_data_shape[2], cls_data_shape[3], cls_data_shape[1] / 2) + link_data = self.softmax(link_data.transpose((0, 2, 3, 1)).reshape(-1))[1::2] + cls_data = self.softmax(cls_data.transpose((0, 2, 3, 1)).reshape(-1))[1::2] + mask = self.decode_image_by_join(cls_data, new_cls_data_shape, link_data, new_link_data_shape) + rects = self.mask_to_boxes(mask, current_frame_meta['image_size']) + results.append(TextDetectionPrediction(identifier, rects)) + + return results + + @staticmethod + def softmax(data): + for i in np.arange(start=0, stop=data.size, step=2, dtype=int): + maximum = max(data[i], data[i + 1]) + data[i] = np.exp(data[i] - maximum) + data[i + 1] = np.exp(data[i + 1] - maximum) + sum_data = data[i] + data[i + 1] + data[i] /= sum_data + data[i + 1] /= sum_data + + return data + + def decode_image_by_join(self, cls_data, cls_data_shape, link_data, link_data_shape): + k_cls_conf_threshold = 0.7 + k_link_conf_threshold = 0.7 + height = cls_data_shape[1] + width = cls_data_shape[2] + id_pixel_mask = np.argwhere(cls_data >= k_cls_conf_threshold).reshape(-1) + pixel_mask = cls_data >= k_cls_conf_threshold + group_mask = {} + pixel_mask[id_pixel_mask] = True + points = [] + for i in id_pixel_mask: + points.append((i % width, i // width)) + group_mask[i] = -1 + link_mask = link_data >= k_link_conf_threshold + neighbours = link_data_shape[3] + for point in points: + neighbour = 0 + point_x, point_y = point + x_neighbours = [point_x - 1, point_x, point_x + 1] + y_neighbours = [point_y - 1, point_y, point_y + 1] + for neighbour_y in y_neighbours: + for neighbour_x in x_neighbours: + if neighbour_x == point_x and neighbour_y == point_y: + continue + + if neighbour_x < 0 or neighbour_x >= width or neighbour_y < 0 or neighbour_y >= height: + continue + + pixel_value = np.uint8(pixel_mask[neighbour_y * width + neighbour_x]) + link_value = np.uint8( + link_mask[int(point_y * width * neighbours + point_x * neighbours + neighbour)] 
+ ) + + if pixel_value and link_value: + group_mask = self.join(point_x + point_y * width, neighbour_x + neighbour_y * width, group_mask) + + neighbour += 1 + + return self.get_all(points, width, height, group_mask) + + def join(self, point1, point2, group_mask): + root1 = self.find_root(point1, group_mask) + root2 = self.find_root(point2, group_mask) + if root1 != root2: + group_mask[root1] = root2 + + return group_mask + + def get_all(self, points, width, height, group_mask): + root_map = {} + mask = np.zeros((height, width)) + + for point in points: + point_x, point_y = point + point_root = self.find_root(point_x + point_y * width, group_mask) + if not root_map.get(point_root): + root_map[point_root] = int(len(root_map) + 1) + mask[point_y, point_x] = root_map[point_root] + + return mask + + @staticmethod + def find_root(point, group_mask): + root = point + update_parent = False + while group_mask[root] != -1: + root = group_mask[root] + update_parent = True + + if update_parent: + group_mask[point] = root + + return root + + @staticmethod + def mask_to_boxes(mask, image_size): + max_val = np.max(mask).astype(int) + resized_mask = cv2.resize( + mask.astype(np.float32), (image_size[1], image_size[0]), interpolation=cv2.INTER_NEAREST + ) + bboxes = [] + for i in range(int(max_val + 1)): + bbox_mask = resized_mask == i + contours_tuple = cv2.findContours(bbox_mask.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + contours = contours_tuple[1] if len(contours_tuple) > 2 else contours_tuple[0] + if not contours: + continue + rect = cv2.minAreaRect(contours[0]) + _, hw, _ = rect + ignored_height = hw[0] >= image_size[0] - 1 + ignored_width = hw[1] >= image_size[1] - 1 + if ignored_height or ignored_width: + continue + box = cv2.boxPoints(rect) + bboxes.append(box) + + return bboxes + + +class LPRAdapter(Adapter): + __provider__ = 'lpr' + + def configure(self): + if not self.label_map: + raise ConfigError('LPR adapter requires dataset label map for correct decoding.') + + def process(self, raw, identifiers=None, frame_meta=None): + raw_output = self._extract_predictions(raw, frame_meta) + predictions = raw_output[self.output_blob] + result = [] + for identifier, output in zip(identifiers, predictions): + decoded_out = self.decode(output.reshape(-1)) + result.append(CharacterRecognitionPrediction(identifier, decoded_out)) + + return result + + def decode(self, outputs): + decode_out = str() + for output in outputs: + if output == -1: + break + decode_out += str(self.label_map[output]) + + return decode_out + + +class BeamSearchDecoderConfig(ConfigValidator): + beam_size = NumberField(optional=True, floats=False, min_value=1) + blank_label = NumberField(optional=True, floats=False, min_value=0) + softmaxed_probabilities = BoolField(optional=True) + + +class BeamSearchDecoder(Adapter): + __provider__ = 'beam_search_decoder' + + def validate_config(self): + beam_search_decoder_config = BeamSearchDecoderConfig( + 'BeamSearchDecoder_Config', + BeamSearchDecoderConfig.IGNORE_ON_EXTRA_ARGUMENT + ) + beam_search_decoder_config.validate(self.launcher_config) + + def configure(self): + if not self.label_map: + raise ConfigError('Beam Search Decoder requires dataset label map for correct decoding.') + + self.beam_size = self.launcher_config.get('beam_size', 10) + self.blank_label = self.launcher_config.get('blank_label', len(self.label_map)) + self.softmaxed_probabilities = self.launcher_config.get('softmaxed_probabilities', False) + + def process(self, raw, identifiers=None, frame_meta=None): + 
raw_output = self._extract_predictions(raw, frame_meta) + output = raw_output[self.output_blob] + output = np.swapaxes(output, 0, 1) + + result = [] + for identifier, data in zip(identifiers, output): + if self.softmaxed_probabilities: + data = np.log(data) + seq = self.decode(data, self.beam_size, self.blank_label) + decoded = ''.join(str(self.label_map[char]) for char in seq) + result.append(CharacterRecognitionPrediction(identifier, decoded)) + return result + + @staticmethod + def decode(probabilities, beam_size=10, blank_id=None): + """ + Decode given output probabilities to sequence of labels. + Arguments: + probabilities: The output log probabilities for each time step. + Should be an array of shape (time x output dim). + beam_size (int): Size of the beam to use during decoding. + blank_id (int): Index of the CTC blank label. + Returns the output label sequence. + """ + def make_new_beam(): + return defaultdict(lambda: (-np.inf, -np.inf)) + + def log_sum_exp(*args): + if all(a == -np.inf for a in args): + return -np.inf + a_max = np.max(args) + lsp = np.log(np.sum(np.exp(a - a_max) for a in args)) + + return a_max + lsp + + times, symbols = probabilities.shape + # Initialize the beam with the empty sequence, a probability of 1 for ending in blank + # and zero for ending in non-blank (in log space). + beam = [(tuple(), (0.0, -np.inf))] + + for time in range(times): + # A default dictionary to store the next step candidates. + next_beam = make_new_beam() + + for symbol_id in range(symbols): + current_prob = probabilities[time, symbol_id] + + for prefix, (prob_blank, prob_non_blank) in beam: + # If propose a blank the prefix doesn't change. + # Only the probability of ending in blank gets updated. + if symbol_id == blank_id: + next_prob_blank, next_prob_non_blank = next_beam[prefix] + next_prob_blank = log_sum_exp( + next_prob_blank, prob_blank + current_prob, prob_non_blank + current_prob + ) + next_beam[prefix] = (next_prob_blank, next_prob_non_blank) + continue + # Extend the prefix by the new character symbol and add it to the beam. + # Only the probability of not ending in blank gets updated. + end_t = prefix[-1] if prefix else None + next_prefix = prefix + (symbol_id,) + next_prob_blank, next_prob_non_blank = next_beam[next_prefix] + if symbol_id != end_t: + next_prob_non_blank = log_sum_exp( + next_prob_non_blank, prob_blank + current_prob, prob_non_blank + current_prob + ) + else: + # Don't include the previous probability of not ending in blank (prob_non_blank) if symbol + # is repeated at the end. The CTC algorithm merges characters not separated by a blank. + next_prob_non_blank = log_sum_exp(next_prob_non_blank, prob_blank + current_prob) + + next_beam[next_prefix] = (next_prob_blank, next_prob_non_blank) + # If symbol is repeated at the end also update the unchanged prefix. This is the merging case. 
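+                    # Example: for prefix (a,) seeing symbol a again, the path through a
+                    # blank was already credited to the extended prefix (a, a) above;
+                    # here the non-blank path keeps contributing to the unchanged (a,),
+                    # which is how CTC's collapse of repeats is reflected in the beam.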
+                    if symbol_id == end_t:
+                        next_prob_blank, next_prob_non_blank = next_beam[prefix]
+                        next_prob_non_blank = log_sum_exp(next_prob_non_blank, prob_non_blank + current_prob)
+                        next_beam[prefix] = (next_prob_blank, next_prob_non_blank)
+
+            beam = sorted(next_beam.items(), key=lambda x: log_sum_exp(*x[1]), reverse=True)[:beam_size]
+
+        best = beam[0]
+
+        return best[0]
diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/README.md b/tools/accuracy_checker/accuracy_checker/annotation_converters/README.md
new file mode 100644
index 0000000..d5dcefe
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/README.md
@@ -0,0 +1,98 @@
+# Annotation Converters
+
+An annotation converter is a function that converts an annotation file into the format suitable for metric evaluation.
+Each annotation converter expects a specific annotation file format or data structure, which depends on the original dataset.
+If a converter for your data format is not supported by Accuracy Checker, you can provide your own annotation converter.
+Each annotation converter has parameters available for configuration.
+
+The conversion process can be run in two ways:
+* via configuration file
+* via command line
+
+### Describing annotation conversion in the configuration file
+
+Annotation conversion can be provided in the `dataset` section of your configuration file to convert the annotation in place before every evaluation.
+Each conversion configuration should contain a `converter` field filled with the selected converter name and provide converter-specific parameters (more details in the supported converters section). All paths can be prefixed via command line with the `-s, --source` argument.
+
+You can additionally use optional parameters like:
+* `subsample_size` - dataset subsample size. You can specify the number of ground truth objects or a dataset ratio in percent. Please be careful with this option; some datasets do not support subsampling.
+* `annotation` - path to store the converted annotation pickle file. You can use this parameter if you need to reuse the converted annotation and avoid subsequent conversions.
+* `meta` - path to store meta information about the converted annotation, if it is provided.
+
+Example of usage:
+```yaml
+  annotation_conversion:
+    converter: sample
+    data_dir: sample/sample_dataset
+```
+
+
+### Converting via the command line
+
+The command line for annotation conversion looks like:
+
+```bash
+python3 convert_annotation.py <converter_name> <converter-specific options>
+```
+All converter-specific options have the format `--<parameter_name> <parameter_value>`.
+You may refer to `-h, --help` for the full list of command line options. Some optional arguments are:
+
+* `-o, --output_dir` - directory to save converted annotation and meta info.
+* `-a, --annotation_name` - annotation file name.
+* `-m, --meta_name` - meta info file name.
+
+### Supported converters
+
+Accuracy Checker supports the following annotation converters and their specific parameters:
+* `wider` - converts from Wider Face dataset to `DetectionAnnotation`.
+    * `annotation_file` - path to txt file, which contains ground truth data in WiderFace dataset format.
+    * `label_start` - specifies face label index in label map. Default value is 1. You can provide another value if you want to use this dataset for separate label validation, in case your network predicts another class for faces.
+* `sample` - converts annotation for SampleNet to `ClassificationAnnotation`.
+    * `data_dir` - path to sample dataset root directory.
+* `voc07` - converts Pascal VOC 2007 annotation for detection task to `DetectionAnnotation`.
+    * `image_set_file` - path to file with validation image list.
+    * `annotations_dir` - path to directory with annotation files.
+    * `images_dir` - path to directory with images related to devkit root (default JPEGImages).
+    * `has_background` - allows converting the dataset with or without adding a background label. Accepted values are True and False (default is True).
+* `voc_segmentation` - converts Pascal VOC annotation for semantic segmentation task to `SegmentationAnnotation`.
+    * `image_set_file` - path to file with validation image list.
+    * `images_dir` - path to directory with images related to devkit root (default JPEGImages).
+    * `mask_dir` - path to directory with ground truth segmentation masks related to devkit root (default SegmentationClass).
+* `mars` - converts MARS person reidentification dataset to `ReidentificationAnnotation`.
+    * `data_dir` - path to data directory, where gallery (`bbox_test`) and `query` subdirectories are located.
+* `market1501` - converts Market1501 person reidentification dataset to `ReidentificationAnnotation`.
+    * `data_dir` - path to data directory, where gallery (`bounding_box_test`) and `query` subdirectories are located.
+* `detection_opencv_storage` - converts detection annotation stored in Detection OpenCV storage format to `DetectionAnnotation`.
+    * `annotation_file` - path to annotation in xml format.
+    * `image_names_file` - path to txt file, which contains image name list for dataset.
+    * `label_start` - specifies label index start in label map. Default value is 1. You can provide another value if you want to use this dataset for separate label validation.
+    * `background_label` - specifies which index is used for the background label. Do not provide this parameter if your dataset has no background label.
+* `face_reid_pairwise` - converts Labeled Faces in the Wild dataset for face reidentification to `ReidentificationClassificationAnnotation`.
+    * `pairs_file` - path to file with annotated positive and negative pairs.
+    * `train_file` - path to file with annotated positive and negative pairs used for network training (optional parameter).
+    * `landmarks_file` - path to file with facial landmarks coordinates for annotation images (optional parameter).
+* `landmarks_regression` - converts VGG Face 2 dataset for facial landmarks regression task to `FacialLandmarksAnnotation`.
+    * `landmarks_csv_file` - path to csv file with coordinates of landmarks points.
+    * `bbox_csv_file` - path to csv file which contains bounding box coordinates for faces (optional parameter).
+* `cityscapes` - converts CityScapes Dataset to `SegmentationAnnotation`.
+    * `dataset_root_dir` - path to dataset root.
+    * `images_subfolder` - path from dataset root to directory with validation images (optional, default `imgsFine/leftImg8bit/val`).
+    * `masks_subfolder` - path from dataset root to directory with ground truth masks (optional, default `gtFine/val`).
+    * `masks_suffix` - suffix for mask file names (optional, default `_gtFine_labelTrainIds`).
+    * `images_suffix` - suffix for image file names (optional, default `_leftImg8bit`).
+    * `use_full_label_map` - allows using the full label map with 33 classes instead of the train label map with 18 classes (optional, default `False`).
+* `icdar15_detection` - converts ICDAR15 dataset for text detection task to `TextDetectionAnnotation`.
+    * `data_dir` - path to folder with annotations in txt format.
+* `icdar13_recognition` - converts ICDAR13 dataset for text recognition task to `CharecterRecognitionAnnotation`.
+    * `annotation_file` - path to annotation file in txt format.
+* `mscoco_detection` - converts MS COCO dataset for object detection task to `DetectionAnnotation`.
+    * `annotation_file` - path to annotation file in json format.
+    * `has_background` - allows converting the dataset with or without adding a background label. Accepted values are True and False (default is False).
+    * `use_full_label_map` - allows using the original label map from the paper (91 object categories) instead of the publicly available one (80 categories).
+* `mscoco_keypoints` - converts MS COCO dataset for keypoints localization task to `PoseEstimationAnnotation`.
+    * `annotation_file` - path to annotation file in json format.
+* `imagenet` - converts ImageNet dataset for image classification task to `ClassificationAnnotation`.
+    * `annotation_file` - path to annotation in txt format.
+    * `labels_file` - path to file with word descriptions of labels (synset words).
+    * `has_background` - allows adding a background label to the original labels, converting the dataset to 1001 classes instead of 1000 (default is False).
diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/__init__.py b/tools/accuracy_checker/accuracy_checker/annotation_converters/__init__.py
new file mode 100644
index 0000000..f037422
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/__init__.py
@@ -0,0 +1,55 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+""" +from .format_converter import BaseFormatConverter +from .convert import make_subset, save_annotation +from .market1501 import Market1501Converter +from .mars import MARSConverter +from .pascal_voc import PascalVOCDetectionConverter +from .sample_converter import SampleConverter +from .wider import WiderFormatConverter +from .detection_opencv_storage import DetectionOpenCVStorageFormatConverter +from .lfw import FaceReidPairwiseConverter +from .vgg_face_regression import LandmarksRegression +from .super_resolution_converter import SRConverter +from .imagenet import ImageNetFormatConverter +from .icdar import ICDAR13RecognitionDatasetConverter, ICDAR15DetectionDatasetConverter +from .ms_coco import MSCocoDetectionConverter, MSCocoKeypointsConverter +from .cityscapes import CityscapesConverter +from .ncf_converter import NCFConverter +from .brats import BratsConverter + +__all__ = [ + 'BaseFormatConverter', + 'make_subset', + 'save_annotation', + + 'ImageNetFormatConverter', + 'Market1501Converter', + 'SampleConverter', + 'PascalVOCDetectionConverter', + 'WiderFormatConverter', + 'MARSConverter', + 'DetectionOpenCVStorageFormatConverter', + 'FaceReidPairwiseConverter', + 'SRConverter', + 'ICDAR13RecognitionDatasetConverter', + 'ICDAR15DetectionDatasetConverter', + 'MSCocoKeypointsConverter', + 'MSCocoDetectionConverter', + 'CityscapesConverter', + 'NCFConverter', + 'BratsConverter' +] diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/_reid_common.py b/tools/accuracy_checker/accuracy_checker/annotation_converters/_reid_common.py new file mode 100644 index 0000000..8bcce97 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/_reid_common.py @@ -0,0 +1,45 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from pathlib import Path + +from ..representation import ReIdentificationAnnotation + + +def read_directory(directory, query, image_pattern): + pids = set() + images = [] + for image in directory.glob("*.jpg"): + pid, camid = map(int, image_pattern.search(image.name).groups()) + if pid == -1: + continue + + camid -= 1 + pids.add(pid) + + identifier = str(Path(directory.name) / image.name) + images.append(ReIdentificationAnnotation(identifier, camid, pid, query)) + + return images, pids + + +def check_dirs(dirs, parent_dir, arg_name='data_dir'): + for directory in dirs: + if directory.is_dir(): + continue + + message_pattern = "{directory} not found in {parent_dir}. 
Check that {arg_name} points to the correct directory"
+    raise FileNotFoundError(message_pattern.format(directory=directory, parent_dir=parent_dir, arg_name=arg_name))
diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/brats.py b/tools/accuracy_checker/accuracy_checker/annotation_converters/brats.py
new file mode 100644
index 0000000..327398b
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/brats.py
@@ -0,0 +1,53 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from pathlib import Path
+
+from ..representation import BrainTumorSegmentationAnnotation
+from ..utils import get_path
+from ..config import StringField
+from .format_converter import BaseFormatConverter, DirectoryBasedAnnotationConverterConfig
+
+
+class BratsConverterConfig(DirectoryBasedAnnotationConverterConfig):
+    image_folder = StringField(optional=True)
+    mask_folder = StringField(optional=True)
+
+
+class BratsConverter(BaseFormatConverter):
+    __provider__ = 'brats'
+
+    _config_validator_type = BratsConverterConfig
+
+    def configure(self):
+        self.data_dir = self.config['data_dir']
+        self.image_folder = self.config.get('image_folder', 'imagesTr')
+        self.mask_folder = self.config.get('mask_folder', 'labelsTr')
+
+    def convert(self):
+        mask_folder = Path(self.mask_folder)
+        image_folder = Path(self.image_folder)
+        image_dir = get_path(self.data_dir / image_folder, is_directory=True)
+
+        annotations = []
+        for file_in_dir in image_dir.iterdir():
+            annotation = BrainTumorSegmentationAnnotation(
+                str(image_folder / file_in_dir.parts[-1]),
+                str(mask_folder / file_in_dir.parts[-1]),
+            )
+
+            annotations.append(annotation)
+
+        return annotations, None
diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/cityscapes.py b/tools/accuracy_checker/accuracy_checker/annotation_converters/cityscapes.py
new file mode 100644
index 0000000..3bda89a
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/cityscapes.py
@@ -0,0 +1,73 @@
+from pathlib import Path
+from ..representation import SegmentationAnnotation
+from ..representation.segmentation_representation import GTMaskLoader
+from ..config import PathField, StringField, BoolField
+from .format_converter import BaseFormatConverter, BaseFormatConverterConfig
+
+
+train_meta = {
+    'label_map': {
+        0: 'road', 1: 'sidewalk', 2: 'building', 3: 'wall', 4: 'fence', 5: 'pole', 6: 'traffic light',
+        7: 'traffic sign', 8: 'vegetation', 9: 'terrain', 10: 'sky', 11: 'person', 12: 'rider', 13: 'car',
+        14: 'truck', 15: 'bus', 16: 'train', 17: 'motorcycle', 18: 'bicycle'
+    },
+    'segmentation_colors': (
+        (128, 64, 128), (244, 35, 232), (70, 70, 70), (102, 102, 156), (190, 153, 153), (153, 153, 153),
+        (250, 170, 30), (220, 220, 0), (107, 142, 35), (152, 251, 152), (70, 130, 180), (220, 20, 60), (255, 0, 0),
+        (0, 0, 142), (0, 0, 70), (0, 60, 100), (0, 80, 100), (0, 0, 230), (119, 11, 32)
+    ),
+}
+
+full_dataset_meta = {
+    'segmentation_colors': (
+        (0, 0, 0), (0, 0, 0), (0, 0, 0), (0, 0, 0), (0, 0, 0), (111, 74, 0), (81, 0, 81), (128, 64, 128),
+        (244, 35, 232), (250, 170, 160), (230, 150, 140), (70, 70, 70), (102, 102, 156), (190, 153, 153),
+        (180, 165, 180), (150, 100, 100), (150, 120, 90), (153, 153, 153), (153, 153, 153), (250, 170, 30),
+        (220, 220, 0), (107, 142, 35), (152, 251, 152), (70, 130, 180), (220, 20, 60), (255, 0, 0), (0, 0, 142),
+        (0, 0, 70), (0, 60, 100), (0, 0, 90), (0, 0, 110), (0, 80, 100), (0, 0, 230), (119, 11, 32)
+    ),
+    'label_map': {
+        0: 'unlabeled', 1: 'ego vehicle', 2: 'rectification border', 3: 'out of roi', 4: 'static', 5: 'dynamic',
+        6: 'ground', 7: 'road', 8: 'sidewalk', 9: 'parking', 10: 'rail track', 11: 'building', 12: 'wall',
+        13: 'fence', 14: 'guard rail', 15: 'bridge', 16: 'tunnel', 17: 'pole', 18: 'polegroup', 19: 'traffic light',
+        20: 'traffic sign', 21: 'vegetation', 22: 'terrain', 23: 'sky', 24: 'person', 25: 'rider', 26: 'car',
+        27: 'truck', 28: 'bus', 29: 'caravan', 30: 'trailer', 31: 'train', 32: 'motorcycle', 33: 'bicycle',
+        -1: 'license plate'
+    }
+}
+
+
+class CityscapesConverterConfig(BaseFormatConverterConfig):
+    dataset_root_dir = PathField(is_directory=True)
+    images_subfolder = StringField(optional=True)
+    masks_subfolder = StringField(optional=True)
+    masks_suffix = StringField(optional=True)
+    images_suffix = StringField(optional=True)
+    use_full_label_map = BoolField(optional=True)
+
+
+class CityscapesConverter(BaseFormatConverter):
+    __provider__ = 'cityscapes'
+
+    _config_validator_type = CityscapesConverterConfig
+
+    def configure(self):
+        self.dataset_root = self.config['dataset_root_dir']
+        self.images_dir = self.config.get('images_subfolder', 'imgsFine/leftImg8bit/val')
+        self.masks_dir = self.config.get('masks_subfolder', 'gtFine/val')
+        self.masks_suffix = self.config.get('masks_suffix', '_gtFine_labelTrainIds')
+        self.images_suffix = self.config.get('images_suffix', '_leftImg8bit')
+        self.use_full_label_map = self.config.get('use_full_label_map', False)
+
+    def convert(self):
+        images = list(self.dataset_root.rglob(r'{}/*/*{}.png'.format(self.images_dir, self.images_suffix)))
+        annotations = []
+        for image in images:
+            identifier = str(Path(self.images_dir).joinpath(*image.parts[-2:]))
+            mask = Path(self.masks_dir) / image.parts[-2] / self.masks_suffix.join(
+                str(image.name).split(self.images_suffix)
+            )
+            annotations.append(SegmentationAnnotation(identifier, mask, mask_loader=GTMaskLoader.PILLOW))
+
+        return annotations, full_dataset_meta if self.use_full_label_map else train_meta
diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/convert.py b/tools/accuracy_checker/accuracy_checker/annotation_converters/convert.py
new file mode 100644
index 0000000..ba9ee8a
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/convert.py
@@ -0,0 +1,126 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
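The mask path in CityscapesConverter.convert is derived purely by string surgery: the image suffix is cut out of the file name and the mask suffix is spliced in. A minimal sketch of that rewriting, assuming the default layout from configure() (the frankfurt file name is invented for illustration):

    from pathlib import Path

    images_suffix, masks_suffix = '_leftImg8bit', '_gtFine_labelTrainIds'
    image = Path('imgsFine/leftImg8bit/val/frankfurt/frankfurt_000000_000294_leftImg8bit.png')

    # split() removes the image suffix, join() re-inserts the mask suffix in its place
    mask_name = masks_suffix.join(image.name.split(images_suffix))
    mask = Path('gtFine/val') / image.parts[-2] / mask_name
    print(mask)  # gtFine/val/frankfurt/frankfurt_000000_000294_gtFine_labelTrainIds.png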
+""" +import warnings +import json +from pathlib import Path +from argparse import ArgumentParser +from functools import partial + +import numpy as np + +from ..utils import get_path + +from .format_converter import BaseFormatConverter + + +def build_argparser(): + parser = ArgumentParser( + description="Converts annotation form a arbitrary format to accuracy-checker specific format", add_help=False + ) + parser.add_argument( + "converter", + help="Specific converter to run", + choices=list(BaseFormatConverter.providers.keys()) + ) + parser.add_argument( + "-o", "--output_dir", + help="Directory to save converted annotation and meta info", + required=False, + type=partial(get_path, is_directory=True) + ) + parser.add_argument("-m", "--meta_name", help="Meta info file name", required=False) + parser.add_argument("-a", "--annotation_name", help="Annotation file name", required=False) + parser.add_argument("-ss", "--subsample", help="Dataset subsample size", required=False) + parser.add_argument("--subsample_seed", help="Seed for generation dataset subsample", type=int, required=False) + + return parser + + +def make_subset(annotation, size, seed=666): + dataset_size = len(annotation) + if dataset_size < size: + warnings.warn('dataset size - {} less than subsample size - {}'.format(dataste_size, size)) + return annotation + np.random.seed(seed) + return list(np.random.choice(annotation, size=size, replace=False)) + + +def main(): + main_argparser = build_argparser() + args, _ = main_argparser.parse_known_args() + converter, converter_argparser, converter_args = get_converter_arguments(args) + + main_argparser = ArgumentParser(parents=[main_argparser, converter_argparser]) + args = main_argparser.parse_args() + + converter = configure_converter(converter_args, args, converter) + out_dir = args.output_dir or Path.cwd() + + result, meta = converter.convert() + + subsample = args.subsample + if subsample: + if subsample.endswith('%'): + subsample_ratio = float(subsample[:-1]) / 100 + subsample_size = int(len(result) * subsample_ratio) + else: + subsample_size = int(args.subsample) + + result = make_subset(result, subsample_size) + + converter_name = converter.get_name() + annotation_name = args.annotation_name or "{}.pickle".format(converter_name) + meta_name = args.meta_name or "{}.json".format(converter_name) + + annotation_file = out_dir / annotation_name + meta_file = out_dir / meta_name + + save_annotation(result, meta, annotation_file, meta_file) + + +def save_annotation(annotation, meta, annotation_file, meta_file): + if annotation_file: + with annotation_file.open('wb') as file: + for representation in annotation: + representation.dump(file) + if meta_file and meta: + with meta_file.open('wt') as file: + json.dump(meta, file) + + +def configure_converter(converter_options, args, converter): + args_dict, converter_options_dict = vars(args), vars(converter_options) + converter_config = { + option_name: option_value for option_name, option_value in args_dict.items() + if option_name in converter_options_dict and option_value is not None + } + converter_config['converter'] = args.converter + converter.config = converter_config + converter.validate_config() + converter.configure() + + return converter + + +def get_converter_arguments(arguments): + converter = BaseFormatConverter.provide(arguments.converter) + converter_argparser = converter.get_argparser() + converter_options, _ = converter_argparser.parse_known_args() + return converter, converter_argparser, converter_options + + +if __name__ 
== '__main__': + main() diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/detection_opencv_storage.py b/tools/accuracy_checker/accuracy_checker/annotation_converters/detection_opencv_storage.py new file mode 100644 index 0000000..dfe461a --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/detection_opencv_storage.py @@ -0,0 +1,114 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +from ..config import PathField, NumberField +from ..representation import DetectionAnnotation +from ..utils import convert_bboxes_xywh_to_x1y1x2y2, read_xml, read_txt + +from .format_converter import BaseFormatConverter, BaseFormatConverterConfig + + +class DetectionOpenCVConverterConfig(BaseFormatConverterConfig): + annotation_file = PathField() + image_names_file = PathField(optional=True) + label_start = NumberField(floats=False, optional=True) + background_label = NumberField(floats=False, optional=True) + + +class DetectionOpenCVStorageFormatConverter(BaseFormatConverter): + __provider__ = 'detection_opencv_storage' + + _config_validator_type = DetectionOpenCVConverterConfig + + def configure(self): + self.annotation_file = self.config['annotation_file'] + self.image_names_file = self.config.get('image_names_file') + self.label_start = self.config.get('label_start', 1) + self.background_label = self.config.get('background_label') + + def convert(self): + root = read_xml(self.annotation_file) + + labels_set = self.get_label_set(root) + + labels_set = sorted(labels_set) + class_to_ind = dict(zip(labels_set, list(range(self.label_start, len(labels_set) + self.label_start + 1)))) + label_map = {} + for class_label, ind in class_to_ind.items(): + label_map[ind] = class_label + + annotations = [] + for frames in root: + for frame in frames: + identifier = '{}.png'.format(frame.tag) + labels, x_mins, y_mins, x_maxs, y_maxs = [], [], [], [], [] + difficult_indices = [] + for annotation in frame: + label = annotation.findtext('type') + if not label: + raise ValueError('"{}" contains detection without "{}"'.format(self.annotation_file, 'type')) + + box = annotation.findtext('roi') + if not box: + raise ValueError('"{}" contains detection without "{}"'.format(self.annotation_file, 'roi')) + box = list(map(float, box.split())) + + is_ignored = annotation.findtext('is_ignored', 0) + if int(is_ignored) == 1: + difficult_indices.append(len(labels)) + + labels.append(class_to_ind[label]) + x_min, y_min, x_max, y_max = convert_bboxes_xywh_to_x1y1x2y2(*box) + x_mins.append(x_min) + y_mins.append(y_min) + x_maxs.append(x_max) + y_maxs.append(y_max) + + detection_annotation = DetectionAnnotation(identifier, labels, x_mins, y_mins, x_maxs, y_maxs) + detection_annotation.metadata['difficult_boxes'] = difficult_indices + annotations.append(detection_annotation) + + if self.image_names_file: + self.rename_identifiers(annotations, self.image_names_file) + + meta = {} + if self.background_label: + label_map[self.background_label] = '__background__' + 
meta['background_label'] = self.background_label + meta['label_map'] = label_map + + return annotations, meta + + @staticmethod + def rename_identifiers(annotation_list, images_file): + for annotation, image in zip(annotation_list, read_txt(images_file)): + annotation.identifier = image + + return annotation_list + + + @staticmethod + def get_label_set(xml_root): + labels_set = set() + for frames in xml_root: + for frame in frames: + for annotation in frame: + label = annotation.findtext('type') + if not label: + raise ValueError('annotation contains detection without label') + + labels_set.add(label) + + return labels_set diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/format_converter.py b/tools/accuracy_checker/accuracy_checker/annotation_converters/format_converter.py new file mode 100644 index 0000000..7927867 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/format_converter.py @@ -0,0 +1,108 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +from argparse import ArgumentParser + +from ..config import ConfigValidator, StringField, PathField +from ..dependency import ClassProvider +from ..utils import format_key + + +class BaseFormatConverterConfig(ConfigValidator): + converter = StringField() + + +class BaseFormatConverter(ClassProvider): + __provider_type__ = 'converter' + + _config_validator_type = BaseFormatConverterConfig + + @property + def config_validator(self): + return self._config_validator_type( + '{}_converter_config'.format(self.get_name()), + on_extra_argument=self._config_validator_type.ERROR_ON_EXTRA_ARGUMENT + ) + + def __init__(self, config=None): + self.config = config + if config: + self.validate_config() + self.configure() + + def convert(self, *args, **kwargs): + """ + Converts specific annotation format to the ResultRepresentation specific for current dataset/task. + + Returns: + annotation: list of ResultRepresentations. + meta: meta-data map for the current dataset. + """ + raise NotImplementedError + + @classmethod + def get_name(cls): + return cls.__provider__ + + def get_argparser(self): + parser = ArgumentParser(add_help=False) + config_validator = self.config_validator + fields = config_validator.fields + for field_name, field in fields.items(): + if field_name == 'converter': + # it is base argument. Main argparser already use it to get argparser from specific converter. + # Converter argparser should contain only converter specific arguments. 
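convert.py is the CLI entry point, but the same flow can be driven programmatically: obtain a converter instance from the provider registry, attach a config, then validate, configure, and convert. A hedged sketch of that sequence (the 'sample' provider name and the data_dir value are assumptions for illustration):

    from accuracy_checker.annotation_converters.format_converter import BaseFormatConverter

    converter = BaseFormatConverter.provide('sample')                  # instance from the registry
    converter.config = {'converter': 'sample', 'data_dir': '/datasets/sample'}
    converter.validate_config()                                        # checked against the converter's config class
    converter.configure()
    annotations, meta = converter.convert()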
+ continue + + required = not field.optional + parser.add_argument( + format_key(field_name), required=required, type=field.type + ) + + return parser + + def validate_config(self): + self.config_validator.validate(self.config) + + def configure(self): + pass + + +class FileBasedAnnotationConverterConfig(BaseFormatConverterConfig): + annotation_file = PathField() + + +class FileBasedAnnotationConverter(BaseFormatConverter): + _config_validator_type = FileBasedAnnotationConverterConfig + + def configure(self): + self.annotation_file = self.config['annotation_file'] + + def convert(self, *args, **kwargs): + pass + + +class DirectoryBasedAnnotationConverterConfig(BaseFormatConverterConfig): + data_dir = PathField(is_directory=True) + + +class DirectoryBasedAnnotationConverter(BaseFormatConverter): + _config_validator_type = DirectoryBasedAnnotationConverterConfig + + def configure(self): + self.data_dir = self.config['data_dir'] + + def convert(self, *args, **kwargs): + pass diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/icdar.py b/tools/accuracy_checker/accuracy_checker/annotation_converters/icdar.py new file mode 100644 index 0000000..184ade3 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/icdar.py @@ -0,0 +1,63 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
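Registering a new converter only requires subclassing one of these bases and setting __provider__; the ClassProvider registry then exposes it to the CLI choices list. A minimal sketch that would be dropped into this annotation_converters package, with an invented 'my_txt' format whose annotation file holds 'identifier label' pairs:

    from ..representation import ClassificationAnnotation
    from ..utils import read_txt
    from .format_converter import FileBasedAnnotationConverter


    class MyTxtConverter(FileBasedAnnotationConverter):
        __provider__ = 'my_txt'

        def convert(self):
            annotations = []
            for line in read_txt(self.annotation_file):
                identifier, label = line.split()
                annotations.append(ClassificationAnnotation(identifier, int(label)))

            return annotations, None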
+""" + +import numpy as np +from ..representation import TextDetectionAnnotation, CharacterRecognitionAnnotation +from ..utils import read_txt +from .format_converter import FileBasedAnnotationConverter, DirectoryBasedAnnotationConverter + + +class ICDAR15DetectionDatasetConverter(DirectoryBasedAnnotationConverter): + __provider__ = 'icdar15_detection' + + def convert(self): + annotations = [] + + for gt_file in self.data_dir.iterdir(): + gt_file_name = str(gt_file.parts[-1]) + identifier = '{}.jpg'.format(gt_file_name.split('gt_')[-1].split('.txt')[0]) + all_points, transcriptions, difficult = [], [], [] + + for text_area in read_txt(gt_file): + text_annotation = text_area.split(',') + transcription = text_annotation[-1] + points = np.reshape(list(map(float, text_annotation[:8])), (-1, 2)) + if transcription == '###': + difficult.append(len(transcriptions)) + all_points.append(points) + transcriptions.append(transcription) + annotation = TextDetectionAnnotation(identifier, all_points, transcriptions) + annotation.metadata['difficult_boxes'] = difficult + annotations.append(annotation) + + return annotations, None + + +class ICDAR13RecognitionDatasetConverter(FileBasedAnnotationConverter): + __provider__ = 'icdar13_recognition' + + supported_symbols = '0123456789abcdefghijklmnopqrstuvwxyz' + + def convert(self): + annotations = [] + + for line in read_txt(self.annotation_file): + identifier, text = line.strip().split(' ') + annotations.append(CharacterRecognitionAnnotation(identifier, text)) + + label_map = {ind: str(key) for ind, key in enumerate(self.supported_symbols)} + + return annotations, {'label_map': label_map, 'blank_label': len(label_map)} diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/imagenet.py b/tools/accuracy_checker/accuracy_checker/annotation_converters/imagenet.py new file mode 100644 index 0000000..88df08a --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/imagenet.py @@ -0,0 +1,52 @@ +import numpy as np + +from ..config import PathField, BoolField +from ..representation import ClassificationAnnotation +from ..utils import read_txt, get_path + +from .format_converter import BaseFormatConverter, BaseFormatConverterConfig + + +class ImageNetFormatConverterConfig(BaseFormatConverterConfig): + annotation_file = PathField() + labels_file = PathField(optional=True) + has_background = BoolField(optional=True) + + +class ImageNetFormatConverter(BaseFormatConverter): + __provider__ = 'imagenet' + + _config_validator_type = ImageNetFormatConverterConfig + + def configure(self): + self.annotation_file = self.config['annotation_file'] + self.labels_file = self.config.get('labels_file') + self.has_background = self.config.get('has_background', False) + + def convert(self): + annotation = [] + for image in read_txt(get_path(self.annotation_file)): + image_name, label = image.split() + label = np.int64(label) if not self.has_background else np.int64(label) + 1 + annotation.append(ClassificationAnnotation(image_name, label)) + meta = self._create_meta(self.labels_file, self.has_background) if self.labels_file else None + + return annotation, meta + + @staticmethod + def _create_meta(labels_file, has_background=False): + meta = {} + labels = {} + for i, line in enumerate(read_txt(get_path(labels_file))): + index_for_label = i if not has_background else i + 1 + line = line.strip() + label = line[line.find(' ') + 1:] + labels[index_for_label] = label + + if has_background: + labels[0] = 'background' + meta['backgound_label'] = 0 + 
+ meta['label_map'] = labels + + return meta diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/lfw.py b/tools/accuracy_checker/accuracy_checker/annotation_converters/lfw.py new file mode 100644 index 0000000..1002daf --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/lfw.py @@ -0,0 +1,111 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from collections import defaultdict +from pathlib import Path + +from ..config import PathField +from ..representation import ReIdentificationClassificationAnnotation +from ..utils import read_txt + +from .format_converter import BaseFormatConverter, BaseFormatConverterConfig + + +class FaceReidPairwiseConverterConfig(BaseFormatConverterConfig): + pairs_file = PathField() + train_file = PathField(optional=True) + landmarks_file = PathField(optional=True) + + +class FaceReidPairwiseConverter(BaseFormatConverter): + __provider__ = 'face_reid_pairwise' + + _config_validator_type = FaceReidPairwiseConverterConfig + + def configure(self): + self.pairs_file = self.config['pairs_file'] + self.train_file = self.config.get('train_file') + self.landmarks_file = self.config.get('landmarks_file') + + def convert(self): + landmarks_map = {} + if self.landmarks_file: + for landmark_line in read_txt(self.landmarks_file): + landmark_line = landmark_line.split('\t') + landmarks_map[landmark_line[0]] = [int(point) for point in landmark_line[1:]] + + test_annotations = self.prepare_annotation(self.pairs_file, True, landmarks_map) + if self.train_file: + train_annotations = self.prepare_annotation(self.train_file, True, landmarks_map) + test_annotations += train_annotations + + return test_annotations, None + + @staticmethod + def get_image_name(person, image_id): + image_path_pattern = '{}/{}_{}{}.jpg' + return image_path_pattern.format(person, person, '0' * (4 - len(image_id)), image_id) + + def convert_positive(self, pairs, all_images): + positives = defaultdict(set) + for data in pairs: + image1 = self.get_image_name(data[0], data[1]) + image2 = self.get_image_name(data[0], data[2]) + positives[image1].add(image2) + all_images.add(image1) + all_images.add(image2) + + return positives, all_images + + def convert_negative(self, pairs, all_images): + negatives = defaultdict(set) + for data in pairs: + image1 = self.get_image_name(data[0], data[1]) + image2 = self.get_image_name(data[2], data[3]) + negatives[image1].add(image2) + all_images.add(image1) + all_images.add(image2) + + return negatives, all_images + + def prepare_annotation(self, ann_file: Path, train=False, landmarks_map=None): + positive_pairs, negative_pairs = [], [] + ann_lines = read_txt(ann_file) + for line in ann_lines[1:]: # skip header + pair = line.strip().split() + if len(pair) == 3: + positive_pairs.append(pair) + elif len(pair) == 4: + negative_pairs.append(pair) + + all_images = set() + positive_data, all_images = self.convert_positive(positive_pairs, all_images) + negative_data, all_images = 
self.convert_negative(negative_pairs, all_images) + + annotations = [] + for image in all_images: + annotation = ReIdentificationClassificationAnnotation(image, positive_data[image], negative_data[image]) + + if landmarks_map: + image_landmarks = landmarks_map.get(image) + annotation.metadata['keypoints'] = image_landmarks + + if train: + annotation.metadata['train'] = True + + annotations.append(annotation) + + return annotations diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/market1501.py b/tools/accuracy_checker/accuracy_checker/annotation_converters/market1501.py new file mode 100644 index 0000000..8c1e39e --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/market1501.py @@ -0,0 +1,41 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from __future__ import absolute_import, print_function + +import re + +from ._reid_common import check_dirs, read_directory +from .format_converter import DirectoryBasedAnnotationConverter + +MARKET_IMAGE_PATTERN = re.compile(r'([-\d]+)_c(\d)') + + +class Market1501Converter(DirectoryBasedAnnotationConverter): + __provider__ = "market1501" + + def convert(self): + gallery = self.data_dir / 'bounding_box_test' + query = self.data_dir / 'query' + + check_dirs((gallery, query), self.data_dir) + gallery_images, gallery_pids = read_directory(gallery, query=False, image_pattern=MARKET_IMAGE_PATTERN) + query_images, query_pids = read_directory(query, query=True, image_pattern=MARKET_IMAGE_PATTERN) + annotation = gallery_images + query_images + + meta = {'num_identities': len(gallery_pids | query_pids)} + + return annotation, meta diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/mars.py b/tools/accuracy_checker/accuracy_checker/annotation_converters/mars.py new file mode 100644 index 0000000..bb8de49 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/mars.py @@ -0,0 +1,38 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
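The Market-1501 regex above encodes the dataset's file naming: a (possibly signed) person id, then the camera index after '_c'. Traced with an invented but convention-shaped file name:

    import re

    MARKET_IMAGE_PATTERN = re.compile(r'([-\d]+)_c(\d)')

    match = MARKET_IMAGE_PATTERN.match('0002_c1s1_000451_03.jpg')
    person_id, camera = match.group(1), match.group(2)
    print(person_id, camera)  # 0002 1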
+""" + +from __future__ import absolute_import, print_function + +import re + +from ._reid_common import check_dirs, read_directory +from .format_converter import DirectoryBasedAnnotationConverter + +MARS_IMAGE_PATTERN = re.compile(r'([\d]+)C(\d)') + + +class MARSConverter(DirectoryBasedAnnotationConverter): + __provider__ = 'mars' + + def convert(self): + gallery = self.data_dir / 'bbox_test' + query = self.data_dir / 'query' + + check_dirs((gallery, query), self.data_dir) + gallery_images, gallery_pids = read_directory(gallery, query=False, image_pattern=MARS_IMAGE_PATTERN) + query_images, query_pids = read_directory(query, query=True, image_pattern=MARS_IMAGE_PATTERN) + + return gallery_images + query_images, {'num_identities': len(gallery_pids | query_pids)} diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/ms_coco.py b/tools/accuracy_checker/accuracy_checker/annotation_converters/ms_coco.py new file mode 100644 index 0000000..f1e41be --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/ms_coco.py @@ -0,0 +1,129 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from tqdm import tqdm +import numpy as np + +from ..config import BoolField +from ..utils import read_json, convert_bboxes_xywh_to_x1y1x2y2 +from ..representation import DetectionAnnotation, PoseEstimationAnnotation +from .format_converter import BaseFormatConverter, FileBasedAnnotationConverter, FileBasedAnnotationConverterConfig + + +def get_image_annotation(image_id, annotations_): + return list(filter(lambda x: x['image_id'] == image_id, annotations_)) + + +def get_label_map(full_annotation, use_full_label_map=False, has_background=False): + labels = full_annotation['categories'] + + if not use_full_label_map: + label_offset = 1 if has_background else 0 + label_id_to_label = {label['id']: label_id + label_offset for label_id, label in enumerate(labels)} + label_map = {label_id + label_offset: label['name'] for label_id, label in enumerate(labels)} + else: + label_id_to_label = {label['id']: label['id'] for label in labels} + label_map = {label['id']: label['name'] for label in labels} + + return label_map, label_id_to_label + + +class MSCocoDetectionConverterConfig(FileBasedAnnotationConverterConfig): + has_background = BoolField(optional=True) + use_full_label_map = BoolField(optional=True) + + +class MSCocoDetectionConverter(BaseFormatConverter): + __provider__ = 'mscoco_detection' + + _config_validator_type = MSCocoDetectionConverterConfig + + def configure(self): + self.annotation_file = self.config['annotation_file'] + self.has_background = self.config.get('has_background', False) + self.use_full_label_map = self.config.get('use_full_label_map', False) + + def convert(self): + detection_annotations = [] + full_annotation = read_json(self.annotation_file) + image_info = full_annotation['images'] + annotations = full_annotation['annotations'] + + label_map, label_id_to_label = get_label_map(full_annotation, self.use_full_label_map, 
self.has_background) + + meta = {} + if self.has_background: + label_map[0] = 'background' + meta['background_label'] = 0 + + meta.update({'label_map': label_map}) + + for image in tqdm(image_info): + identifier = image['file_name'] + image_annotation = get_image_annotation(image['id'], annotations) + image_labels = [label_id_to_label[annotation['category_id']] for annotation in image_annotation] + xmins = [annotation['bbox'][0] for annotation in image_annotation] + ymins = [annotation['bbox'][1] for annotation in image_annotation] + widths = [annotation['bbox'][2] for annotation in image_annotation] + heights = [annotation['bbox'][3] for annotation in image_annotation] + xmaxs = np.add(xmins, widths) + ymaxs = np.add(ymins, heights) + is_crowd = [annotation['iscrowd'] for annotation in image_annotation] + detection_annotation = DetectionAnnotation(identifier, image_labels, xmins, ymins, xmaxs, ymaxs) + detection_annotation.metadata['iscrowd'] = is_crowd + detection_annotations.append(detection_annotation) + + return detection_annotations, meta + + +class MSCocoKeypointsConverter(FileBasedAnnotationConverter): + __provider__ = 'mscoco_keypoints' + + def convert(self): + keypoints_annotations = [] + + full_annotation = read_json(self.annotation_file) + image_info = full_annotation['images'] + annotations = full_annotation['annotations'] + label_map, _ = get_label_map(full_annotation, True) + for image in image_info: + identifier = image['file_name'] + image_annotation = get_image_annotation(image['id'], annotations) + if not image_annotation: + continue + x_vals, y_vals, visibility, labels, areas, is_crowd, bboxes, difficult = [], [], [], [], [], [], [], [] + for target in image_annotation: + if target['num_keypoints'] == 0: + difficult.append(len(x_vals)) + labels.append(target['category_id']) + keypoints = target['keypoints'] + x_vals.append(keypoints[::3]) + y_vals.append(keypoints[1::3]) + visibility.append(keypoints[2::3]) + areas.append(target['area']) + bboxes.append(convert_bboxes_xywh_to_x1y1x2y2(*target['bbox'])) + is_crowd.append(target['iscrowd']) + keypoints_annotation = PoseEstimationAnnotation( + identifier, np.array(x_vals), np.array(y_vals), np.array(visibility), np.array(labels) + ) + keypoints_annotation.metadata['areas'] = areas + keypoints_annotation.metadata['rects'] = bboxes + keypoints_annotation.metadata['iscrowd'] = is_crowd + keypoints_annotation.metadata['difficult_boxes'] = difficult + + keypoints_annotations.append(keypoints_annotation) + + return keypoints_annotations, {'label_map': label_map} diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/ncf_converter.py b/tools/accuracy_checker/accuracy_checker/annotation_converters/ncf_converter.py new file mode 100644 index 0000000..5e7ac59 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/ncf_converter.py @@ -0,0 +1,74 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
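get_label_map supports two modes, and the difference is easiest to see on a toy categories list: with use_full_label_map the sparse COCO category ids are kept as-is, otherwise they are remapped to a contiguous range (shifted by one when a background class is reserved). A hand-traced sketch of the contiguous mode:

    labels = [{'id': 1, 'name': 'person'}, {'id': 3, 'name': 'car'}]
    has_background = True

    label_offset = 1 if has_background else 0
    label_id_to_label = {label['id']: label_id + label_offset for label_id, label in enumerate(labels)}
    label_map = {label_id + label_offset: label['name'] for label_id, label in enumerate(labels)}
    print(label_id_to_label)  # {1: 1, 3: 2}
    print(label_map)          # {1: 'person', 2: 'car'}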
+""" + + +from ..representation import HitRatioAnnotation +from ..utils import read_txt +from ..config import PathField, NumberField + +from .format_converter import BaseFormatConverter, BaseFormatConverterConfig + + +class NCFDatasetConverterConfig(BaseFormatConverterConfig): + raiting_file = PathField() + negative_file = PathField() + users_max_number = NumberField(optional=True) + + +class NCFConverter(BaseFormatConverter): + __provider__ = "ncf_converter" + + _config_validator_type = NCFDatasetConverterConfig + + def configure(self): + self.raiting_file = self.config['raiting_file'] + self.negative_file = self.config['negative_file'] + if 'users_max_number' in self.config: + self.users_max_number = self.config['users_max_number'] + else: + self.users_max_number = -1 + + def convert(self): + annotations = [] + users = [] + + for file_row in read_txt(self.raiting_file): + user_id, item_id, _ = file_row.split() + users.append(user_id) + identifier = ['u:'+user_id, 'i:'+item_id] + annotations.append(HitRatioAnnotation(identifier)) + if self.users_max_number > 0 and len(users) >= self.users_max_number: + break; + + item_numbers = 1 + + items_neg = [] + for file_row in read_txt(self.negative_file): + items = file_row.split() + items_neg.append(items) + if self.users_max_number > 0 and len(items_neg) >= self.users_max_number: + break; + + if items_neg: + iterations = len(items_neg[0]) + item_numbers += iterations + for i in range(iterations): + for user in users: + item = items_neg[int(user)][i] + identifier = ['u:' + user, 'i:'+ item] + annotations.append(HitRatioAnnotation(identifier, False)) + + return annotations, {'users_number': len(users), 'item_numbers': item_numbers} diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/pascal_voc.py b/tools/accuracy_checker/accuracy_checker/annotation_converters/pascal_voc.py new file mode 100644 index 0000000..651c525 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/pascal_voc.py @@ -0,0 +1,157 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from tqdm import tqdm + +from ..config import PathField, BoolField +from ..representation import DetectionAnnotation, SegmentationAnnotation +from ..representation.segmentation_representation import GTMaskLoader +from ..utils import get_path, read_txt, read_xml +from .format_converter import BaseFormatConverter, BaseFormatConverterConfig + +_VOC_CLASSES_DETECTION = ( + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor' +) + +_VOC_CLASSES_SEGMENTATION = tuple(['__background__']) + _VOC_CLASSES_DETECTION +_SEGMENTATION_COLORS = (( + (0, 0, 0), (128, 0, 0), (0, 128, 0), (128, 128, 0), + (0, 0, 128), (128, 0, 128), (0, 128, 128), (128, 128, 128), + (64, 0, 0), (192, 0, 0), (64, 128, 0), (192, 128, 0), + (64, 0, 128), (192, 0, 128), (64, 128, 128), (192, 128, 128), + (0, 64, 0), (128, 64, 0), (0, 192, 0), (128, 192, 0), + (0, 64, 128) +)) + + +def prepare_detection_labels(has_background=True): + num_classes = len(_VOC_CLASSES_DETECTION) + labels_shift = 1 if has_background else 0 + reversed_label_map = dict(zip(_VOC_CLASSES_DETECTION, list(range(labels_shift, num_classes + labels_shift)))) + if has_background: + reversed_label_map['__background__'] = 0 + + return reversed_label_map + + +def reverse_label_map(label_map): + return {value: key for key, value in label_map.items()} + + +class PascalVOCSegmentationConverterConfig(BaseFormatConverterConfig): + image_set_file = PathField() + images_dir = PathField(optional=True, is_directory=True) + mask_dir = PathField(optional=True, is_directory=True) + + +class PascalVOCSegmentationConverter(BaseFormatConverter): + __provider__ = 'voc_segmentation' + + _config_validator_type = PascalVOCSegmentationConverterConfig + + def configure(self): + self.image_set_file = self.config['image_set_file'] + self.image_dir = self.config.get('images_dir') + if not self.image_dir: + self.image_dir = get_path(self.image_set_file.parent / 'JPEGImages') + + self.mask_dir = self.config.get('mask_dir') + if not self.mask_dir: + self.mask_dir = get_path(self.image_set_file.parent / 'SegmentationClass') + + def convert(self): + + annotations = [] + for image in read_txt(self.image_set_file): + annotation = SegmentationAnnotation( + str(self.image_dir.name / '{}.jpg'.format(image)), + str(self.mask_dir.name / '{}.png'.format(image)), + mask_loader=GTMaskLoader.SCIPY + ) + + annotations.append(annotation) + + meta = { + 'label_map': dict(enumerate(_VOC_CLASSES_SEGMENTATION)), + 'background_label': 0, + 'segmentation_colors': _SEGMENTATION_COLORS + } + + return annotations, meta + + +class PascalVOCDetectionConverterConfig(BaseFormatConverterConfig): + image_set_file = PathField() + annotations_dir = PathField(is_directory=True) + images_dir = PathField(optional=True, is_directory=True) + has_background = BoolField(optional=True) + + +class PascalVOCDetectionConverter(BaseFormatConverter): + __provider__ = 'voc07' + + _config_validator_type = PascalVOCDetectionConverterConfig + + def configure(self): + self.image_set_file = self.config['image_set_file'] + self.image_dir = self.config.get('images_dir') + if not self.image_dir: + self.image_dir = get_path(self.image_set_file.parent / 'JPEGImages') + self.annotations_dir = self.config['annotations_dir'] + self.has_background = self.config.get('has_background', True) + + def convert(self): + class_to_ind = prepare_detection_labels(self.has_background) + + detections = [] + 
for image in tqdm(read_txt(self.image_set_file, sep=None)): + root = read_xml(self.annotations_dir / '{}.xml'.format(image)) + + identifier = root.find('.//filename').text + get_path(self.image_dir / identifier) + + labels, x_mins, y_mins, x_maxs, y_maxs = [], [], [], [], [] + difficult_indices = [] + for entry in root: + if not entry.tag.startswith('object'): + continue + + bbox = entry.find('bndbox') + difficult = int(entry.find('difficult').text) + + if difficult == 1: + difficult_indices.append(len(labels)) + + labels.append(class_to_ind[entry.find('name').text]) + x_mins.append(float(bbox.find('xmin').text) - 1) + y_mins.append(float(bbox.find('ymin').text) - 1) + x_maxs.append(float(bbox.find('xmax').text) - 1) + y_maxs.append(float(bbox.find('ymax').text) - 1) + + image_annotation = DetectionAnnotation(identifier, labels, x_mins, y_mins, x_maxs, y_maxs) + image_annotation.metadata['difficult_boxes'] = difficult_indices + + detections.append(image_annotation) + + meta = {'label_map': reverse_label_map(class_to_ind)} + if self.has_background: + meta['background_label'] = 0 + + return detections, meta diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/sample_converter.py b/tools/accuracy_checker/accuracy_checker/annotation_converters/sample_converter.py new file mode 100644 index 0000000..88fb713 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/sample_converter.py @@ -0,0 +1,100 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import re + +from ..config import PathField +from ..representation import ClassificationAnnotation +from ..utils import get_path, read_txt + +from .format_converter import BaseFormatConverter, BaseFormatConverterConfig + + +class SampleConverterConfig(BaseFormatConverterConfig): + data_dir = PathField(is_directory=True) + + +class SampleConverter(BaseFormatConverter): + """ + Sample dataset converter. All annotation converters should be derived from BaseFormatConverter class. + """ + + # register name for this converter + # this name will be used for converter class look up + __provider__ = 'sample' + + _config_validator_type = SampleConverterConfig + + def configure(self): + self.data_dir = self.config['data_dir'] + + def convert(self): + """ + This method is executed automatically when convert.py is started. + All arguments are automatically forwarded from command line arguments. + + Returns: + annotations: list of annotation representation objects. + meta: dictionary with additional dataset level metadata. 
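prepare_detection_labels above produces the two label layouts used by VOC models, depending on whether index 0 is reserved for '__background__'. Traced on an abridged class list:

    classes = ('aeroplane', 'bicycle', 'bird')  # abridged for the sketch

    with_background = dict(zip(classes, range(1, len(classes) + 1)), __background__=0)
    without_background = dict(zip(classes, range(len(classes))))
    print(with_background)     # {'aeroplane': 1, 'bicycle': 2, 'bird': 3, '__background__': 0}
    print(without_background)  # {'aeroplane': 0, 'bicycle': 1, 'bird': 2}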
+ """ + + dataset_directory = get_path(self.data_dir, is_directory=True) + + # read and convert annotation + labels = self._read_labels(dataset_directory / 'labels.txt') + annotations = self._convert_annotations(dataset_directory / 'test', labels) + + # convert label list to label map + label_map = {i: labels[i] for i in range(len(labels))} + metadata = {'label_map': label_map} + + return annotations, metadata + + @staticmethod + def _read_labels(labels_file): + """ + Extract label names from labels.txt file. + """ + + return read_txt(labels_file) + + @staticmethod + def _convert_annotations(test_dir, labels): + """ + Create annotation representations list. + """ + + # test directory contains files with names XXXX_class.png + # we use regular expression to extract class names + file_pattern_regex = re.compile(r'\d+_(\w+)\.png') + + annotations = [] + # iterate over all png images in test directory + for image in test_dir.glob('*.png'): + # get file name (e.g. from /foo/bar/image.png we get image.png) + image_base = str(image.parts[-1]) + + # extract class name from file name + regex_match = re.match(file_pattern_regex, image_base) + image_label = regex_match.group(1) + + # look up class index in label list + class_id = labels.index(image_label) + + # create annotation representation object + annotations.append(ClassificationAnnotation(image_base, class_id)) + + return annotations diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/super_resolution_converter.py b/tools/accuracy_checker/accuracy_checker/annotation_converters/super_resolution_converter.py new file mode 100644 index 0000000..4c053f9 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/super_resolution_converter.py @@ -0,0 +1,52 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +from ..config import PathField, StringField, BoolField +from ..representation import SuperResolutionAnnotation +from .format_converter import BaseFormatConverter, BaseFormatConverterConfig + + +class SRConverterConfig(BaseFormatConverterConfig): + data_dir = PathField(is_directory=True) + lr_suffix = StringField(optional=True) + hr_suffix = StringField(optional=True) + two_streams = BoolField(optional=True) + + +class SRConverter(BaseFormatConverter): + __provider__ = 'super_resolution' + + _config_validator_type = SRConverterConfig + + def configure(self): + self.data_dir = self.config['data_dir'] + self.lr_suffix = self.config.get('lr_suffix', 'lr') + self.hr_suffix = self.config.get('hr_suffix', 'hr') + self.two_streams = self.config.get('two_streams', False) + + def convert(self): + file_list_lr = [] + for file_in_dir in self.data_dir.iterdir(): + if self.lr_suffix in file_in_dir.parts[-1]: + file_list_lr.append(file_in_dir) + + annotation = [] + for lr_file in file_list_lr: + lr_file_name = lr_file.parts[-1] + hr_file_name = self.hr_suffix.join(lr_file_name.split(self.lr_suffix)) + identifier = [lr_file_name, hr_file_name] if self.two_streams else lr_file_name + annotation.append(SuperResolutionAnnotation(identifier, hr_file_name)) + + return annotation, None diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/vgg_face_regression.py b/tools/accuracy_checker/accuracy_checker/annotation_converters/vgg_face_regression.py new file mode 100644 index 0000000..53c7c57 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/vgg_face_regression.py @@ -0,0 +1,64 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import numpy as np + +from ..config import PathField +from ..representation import FacialLandmarksAnnotation +from ..utils import convert_bboxes_xywh_to_x1y1x2y2, read_csv +from .format_converter import BaseFormatConverter, BaseFormatConverterConfig + + +class LandmarksRegressionConfig(BaseFormatConverterConfig): + landmarks_csv_file = PathField() + bbox_csv_file = PathField(optional=True) + + +class LandmarksRegression(BaseFormatConverter): + __provider__ = 'landmarks_regression' + + _config_validator_type = LandmarksRegressionConfig + + def configure(self): + self.landmarks_csv = self.config['landmarks_csv_file'] + self.bbox_csv = self.config.get('bbox_csv_file') + + def convert(self): + annotations = [] + for row in read_csv(self.landmarks_csv): + identifier = row['NAME_ID'] + '.jpg' + x_values = np.array( + [float(row["P1X"]), float(row["P2X"]), float(row["P3X"]), float(row["P4X"]), float(row["P5X"])] + ) + y_values = np.array( + [float(row["P1Y"]), float(row["P2Y"]), float(row["P3Y"]), float(row["P4Y"]), float(row["P5Y"])] + ) + + annotation = FacialLandmarksAnnotation(identifier, x_values, y_values) + annotation.metadata['left_eye'] = 0 + annotation.metadata['right_eye'] = 1 + annotations.append(annotation) + + if self.bbox_csv: + for index, row in enumerate(read_csv(self.bbox_csv)): + annotations[index].metadata['rect'] = convert_bboxes_xywh_to_x1y1x2y2( + int(row["X"]), int(row["Y"]), int(row["W"]), int(row["H"]) + ) + + meta = { + 'label_map': {0: 'Left Eye', 1: 'Right Eye', 2: 'Nose', 3: 'Left Mouth Corner', 4: 'Right Mouth Corner'} + } + return annotations, meta diff --git a/tools/accuracy_checker/accuracy_checker/annotation_converters/wider.py b/tools/accuracy_checker/accuracy_checker/annotation_converters/wider.py new file mode 100644 index 0000000..3b5876f --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/annotation_converters/wider.py @@ -0,0 +1,64 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + + +from ..config import NumberField +from ..representation import DetectionAnnotation +from ..utils import convert_bboxes_xywh_to_x1y1x2y2, read_txt + +from .format_converter import BaseFormatConverter, FileBasedAnnotationConverterConfig + + +class WiderConverterConfig(FileBasedAnnotationConverterConfig): + label_start = NumberField(floats=False, optional=True) + + +class WiderFormatConverter(BaseFormatConverter): + __provider__ = 'wider' + + _config_validator_type = WiderConverterConfig + + def configure(self): + self.annotation_file = self.config['annotation_file'] + self.label_start = self.config.get('label_start', 1) + + def convert(self): + image_annotations = read_txt(self.annotation_file) + image_ids = [] + for image_id, line in enumerate(image_annotations): + if '.jpg' in line: + image_ids.append(image_id) + + annotations = [] + for image_id in image_ids: + identifier = image_annotations[image_id] + bbox_count = image_annotations[image_id + 1] + bbox_lines = image_annotations[image_id + 2:image_id + 2 + int(bbox_count)] + + x_mins, y_mins, x_maxs, y_maxs = [], [], [], [] + for bbox in bbox_lines: + x_min, y_min, x_max, y_max = convert_bboxes_xywh_to_x1y1x2y2(*(map(float, (bbox.split(' ')[0:4])))) + x_mins.append(x_min) + y_mins.append(y_min) + x_maxs.append(x_max) + y_maxs.append(y_max) + + annotations.append(DetectionAnnotation( + identifier, [self.label_start] * len(x_mins), + x_mins, y_mins, x_maxs, y_maxs + )) + + return annotations, {'label_map': {0: '__background__', self.label_start: 'face'}, 'background_label': 0} diff --git a/tools/accuracy_checker/accuracy_checker/config/__init__.py b/tools/accuracy_checker/accuracy_checker/config/__init__.py new file mode 100644 index 0000000..a32b29a --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/config/__init__.py @@ -0,0 +1,48 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from .config_validator import ( + BaseField, + StringField, + ListField, + BoolField, + PathField, + NumberField, + DictField, + + BaseValidator, + ConfigError, + ConfigValidator +) + + +from .config_reader import ConfigReader + +__all__ = [ + 'BaseField', + 'StringField', + 'ListField', + 'BoolField', + 'PathField', + 'NumberField', + 'DictField', + + 'BaseValidator', + 'ConfigError', + 'ConfigValidator', + + 'ConfigReader' +] diff --git a/tools/accuracy_checker/accuracy_checker/config/config_reader.py b/tools/accuracy_checker/accuracy_checker/config/config_reader.py new file mode 100644 index 0000000..3430090 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/config/config_reader.py @@ -0,0 +1,281 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import copy +from pathlib import Path + +import warnings + +from ..utils import read_yaml, to_lower_register, contains_any +from .config_validator import ConfigError + + +class ConfigReader: + """ + Class for parsing input config. + """ + + @staticmethod + def merge(arguments): + """ + Args: + arguments: command-line arguments. + Returns: + dictionary containing configuration. + """ + + global_config, local_config = ConfigReader._read_configs(arguments) + if not local_config: + raise ConfigError('Missing local config') + + ConfigReader._check_local_config(local_config) + ConfigReader._prepare_global_configs(global_config) + + config = ConfigReader._merge_configs(global_config, local_config) + + ConfigReader._provide_cmd_arguments(arguments, config) + ConfigReader._merge_paths_with_prefixes(arguments, config) + ConfigReader._filter_launchers(config, arguments) + + return config + + @staticmethod + def _read_configs(arguments): + global_config = read_yaml(arguments.definitions) if arguments.definitions else None + local_config = read_yaml(arguments.config) + + return global_config, local_config + + @staticmethod + def _check_local_config(config): + models = config.get('models') + if not models: + raise ConfigError('Missed "{}" in local config'.format('models')) + + def _is_requirements_missed(target, requirements): + return list(filter(lambda entry: not target.get(entry), requirements)) + + required_model_entries = ['name', 'launchers', 'datasets'] + required_dataset_entries = ['name'] + required_dataset_error = 'Model {} must specify {} for each dataset' + for model in models: + if _is_requirements_missed(model, required_model_entries): + raise ConfigError('Each model must specify {}'.format(required_model_entries)) + + if list(filter(lambda entry: _is_requirements_missed(entry, required_dataset_entries), model['datasets'])): + raise ConfigError(required_dataset_error.format(model['name'], ','.join(required_dataset_entries))) + + @staticmethod + def _prepare_global_configs(global_configs): + if not global_configs or 'datasets' not in global_configs: + return + + datasets = global_configs['datasets'] + + def merge(local_entries, global_entries, identifier): + if not local_entries or not global_entries: + return + + for i, local in enumerate(local_entries): + local_identifier = local.get(identifier) + if not local_identifier: + continue + + local_entries[i] = ConfigReader._merge_configs_by_identifier(global_entries, local, identifier) + + for dataset in datasets: + merge(dataset.get('preprocessing'), global_configs.get('preprocessing'), 'type') + merge(dataset.get('metrics'), global_configs.get('metrics'), 'type') + merge(dataset.get('postprocessing'), global_configs.get('postprocessing'), 'type') + + @staticmethod + def _merge_configs(global_configs, local_config): + config = copy.deepcopy(local_config) + if not global_configs: + return config + + models = config.get('models') + for model in models: + for i, launcher_entry in enumerate(model['launchers']): + model['launchers'][i] = ConfigReader._merge_configs_by_identifier( + global_configs['launchers'], launcher_entry, 'framework' + ) + + 
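_merge_configs_by_identifier boils down to "copy the matching global entry, then let local keys win". A hand-traced sketch for a launcher entry (the values are invented):

    import copy

    global_launchers = [{'framework': 'dlsdk', 'device': 'CPU', 'batch': 1}]
    local_launcher = {'framework': 'dlsdk', 'device': 'GPU'}

    merged = copy.deepcopy(global_launchers[0])  # matched by the 'framework' identifier
    merged.update(local_launcher)                # local values override the global defaults
    print(merged)  # {'framework': 'dlsdk', 'device': 'GPU', 'batch': 1}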
for i, dataset in enumerate(model['datasets']): + model['datasets'][i] = ConfigReader._merge_configs_by_identifier( + global_configs['datasets'], dataset, 'name' + ) + + return config + + @staticmethod + def _merge_configs_by_identifier(global_config, local_config, identifier): + local_identifier = local_config.get(identifier) + if local_identifier is None: + return local_config + + matched = [] + for config in global_config: + global_identifier = config.get(identifier) + if global_identifier is None: + continue + + if global_identifier != local_identifier: + continue + + matched.append(config) + + config = copy.deepcopy(matched[0] if matched else {}) + for key, value in local_config.items(): + config[key] = value + + return config + + @staticmethod + def _merge_paths_with_prefixes(arguments, config): + args = arguments if isinstance(arguments, dict) else vars(arguments) + entries_paths = { + 'launchers': { + 'model': 'models', + 'weights': 'models', + 'caffe_model': 'models', + 'caffe_weights': 'models', + 'tf_model': 'models', + 'mxnet_weights': 'models', + 'onnx_model': 'models', + 'kaldi_model': 'models', + 'cpu_extensions': 'extensions', + 'gpu_extensions': 'extensions', + 'bitstream': 'bitstreams', + 'affinity_map' : 'affinity_map' + }, + 'datasets': { + 'data_source': 'source', + 'segmentation_masks_source': 'source', + 'annotation': 'annotations', + 'dataset_meta': 'annotations' + } + } + + def merge_entry_paths(keys, value): + for field, argument in keys.items(): + if field not in value: + continue + + config_path = Path(value[field]) + if config_path.is_absolute(): + value[field] = Path(value[field]) + continue + + if not args[argument]: + continue + + value[field] = args[argument] / config_path + + def create_command_line_for_conversion(config): + mapping = {} + value = 'source' + for key in config: + if key.endswith('file') or key.endswith('dir'): + mapping[key] = value + return mapping + + for model in config['models']: + for entry, command_line_arg in entries_paths.items(): + if entry not in model: + continue + + for config_entry in model[entry]: + if entry == 'datasets': + annotation_conversion_config = config_entry.get('annotation_conversion') + if annotation_conversion_config: + command_line_conversion = (create_command_line_for_conversion(annotation_conversion_config)) + merge_entry_paths(command_line_conversion, annotation_conversion_config) + merge_entry_paths(command_line_arg, config_entry) + + @staticmethod + def _provide_cmd_arguments(arguments, config): + def merge_converted_model_path(converted_models_dir, mo_output_dir): + if mo_output_dir: + mo_output_dir = Path(mo_output_dir) + if mo_output_dir.is_absolute(): + return mo_output_dir + return converted_models_dir / mo_output_dir + return converted_models_dir + + additional_keys = [ + 'model_optimizer', 'tf_custom_op_config_dir', + 'tf_obj_detection_api_pipeline_config_path', + 'cpu_extensions_mode' + ] + arguments_dict = arguments if isinstance(arguments, dict) else vars(arguments) + update_launcher_entry = {} + + for key in additional_keys: + value = arguments_dict.get(key) + if value: + update_launcher_entry['_{}'.format(key)] = value + + for model in config['models']: + for launcher_entry in model['launchers']: + if launcher_entry['framework'].lower() != 'dlsdk': + continue + + launcher_entry.update(update_launcher_entry) + models_prefix = arguments.models + if models_prefix: + launcher_entry['_models_prefix'] = models_prefix + + if not arguments.converted_models: + continue + + mo_params = 
launcher_entry.get('mo_params', {}) + + mo_params.update({ + 'output_dir': merge_converted_model_path(arguments.converted_models, mo_params.get('output_dir')) + }) + + launcher_entry['mo_params'] = mo_params + + if arguments.aocl: + launcher_entry['_aocl'] = arguments.aocl + + @staticmethod + def _filter_launchers(config, arguments): + def filtered(launcher, targets): + target_tags = args.get('target_tags') or [] + if target_tags: + if not contains_any(target_tags, launcher.get('tags', [])): + return True + + config_framework = launcher['framework'].lower() + target_framework = (args.get('target_framework') or config_framework).lower() + if config_framework != target_framework: + return True + + return targets and launcher.get('device', '').lower() not in targets + + args = arguments if isinstance(arguments, dict) else vars(arguments) + target_devices = to_lower_register(args.get('target_devices') or []) + + for model in config['models']: + launchers = model['launchers'] + launchers = [launcher for launcher in launchers if not filtered(launcher, target_devices)] + + if not launchers: + warnings.warn('Model "{}" has no launchers'.format(model['name'])) + + model['launchers'] = launchers diff --git a/tools/accuracy_checker/accuracy_checker/config/config_validator.py b/tools/accuracy_checker/accuracy_checker/config/config_validator.py new file mode 100644 index 0000000..edb1e24 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/config/config_validator.py @@ -0,0 +1,339 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import enum +import math +import re +import warnings +from collections import OrderedDict +from copy import copy +from functools import partial +from pathlib import Path + +from ..utils import get_path, string_to_bool + + +class ConfigError(ValueError): + pass + + +class BaseValidator: + def __init__(self, on_error=None, additional_validator=None): + self.on_error = on_error + self.additional_validator = additional_validator + + self.field_uri = None + + def validate(self, entry, field_uri=None): + field_uri = field_uri or self.field_uri + if self.additional_validator and not self.additional_validator(entry, field_uri): + self.raise_error(entry, field_uri) + + def raise_error(self, value, field_uri, reason=None): + if self.on_error: + self.on_error(value, field_uri, reason) + + error_message = 'Invalid value "{value}" for {field_uri}'.format(value=value, field_uri=field_uri) + if reason: + error_message = '{error_message}: {reason}'.format(error_message=error_message, reason=reason) + + raise ConfigError(error_message.format(value, field_uri)) + + +class _ExtraArgumentBehaviour(enum.Enum): + WARN = 'warn' + IGNORE = 'ignore' + ERROR = 'error' + + +def _is_dict_like(entry): + return hasattr(entry, '__iter__') and hasattr(entry, '__getitem__') + + +class ConfigValidator(BaseValidator): + WARN_ON_EXTRA_ARGUMENT = _ExtraArgumentBehaviour.WARN + ERROR_ON_EXTRA_ARGUMENT = _ExtraArgumentBehaviour.ERROR + IGNORE_ON_EXTRA_ARGUMENT = _ExtraArgumentBehaviour.IGNORE + + def __init__(self, config_uri, on_extra_argument=WARN_ON_EXTRA_ARGUMENT, **kwargs): + super().__init__(**kwargs) + self.on_extra_argument = on_extra_argument + + self.fields = OrderedDict() + self.field_uri = config_uri + for name in dir(self): + value = getattr(self, name) + if not isinstance(value, BaseField): + continue + + field_copy = copy(value) + field_copy.field_uri = "{}.{}".format(config_uri, name) + self.fields[name] = field_copy + + def validate(self, entry, field_uri=None): + super().validate(entry, field_uri) + field_uri = field_uri or self.field_uri + if not _is_dict_like(entry): + raise ConfigError("{} is expected to be dict-like".format(field_uri)) + + extra_arguments = [] + for key in entry: + if key not in self.fields: + extra_arguments.append(key) + continue + + self.fields[key].validate(entry[key]) + + required_fields = set(name for name, value in self.fields.items() if not value.optional) + missing_arguments = required_fields.difference(entry) + + if missing_arguments: + arguments = ', '.join(map(str, missing_arguments)) + self.raise_error( + entry, field_uri, "Invalid config for {}: missing required fields: {}".format(field_uri, arguments) + ) + + if extra_arguments: + unknown_options_error = "specifies unknown options: {}".format(extra_arguments) + message = "{} {}".format(field_uri, unknown_options_error) + + if self.on_extra_argument == _ExtraArgumentBehaviour.WARN: + warnings.warn(message) + if self.on_extra_argument == _ExtraArgumentBehaviour.ERROR: + self.raise_error(entry, field_uri, message) + + @property + def known_fields(self): + return set(self.fields) + + def raise_error(self, value, field_uri, reason=None): + if self.on_error: + self.on_error(value, field_uri, reason) + else: + raise ConfigError(reason) + + +class BaseField(BaseValidator): + def __init__(self, optional=False, allow_none=False, **kwargs): + super().__init__(**kwargs) + self.optional = optional + self.allow_none = allow_none + + def validate(self, entry, field_uri=None): + super().validate(entry, field_uri) + field_uri = 
field_uri or self.field_uri
+        if not self.allow_none and entry is None:
+            raise ConfigError("{} is not allowed to be None".format(field_uri))
+
+    @property
+    def type(self):
+        return str
+
+
+class StringField(BaseField):
+    def __init__(self, choices=None, regex=None, case_sensitive=False, **kwargs):
+        super().__init__(**kwargs)
+        self.choices = choices if case_sensitive or not choices else list(map(str.lower, choices))
+        self.regex = re.compile(regex, flags=re.IGNORECASE if not case_sensitive else 0) if regex else None
+        self.case_sensitive = case_sensitive
+
+    def validate(self, entry, field_uri=None):
+        super().validate(entry, field_uri)
+        if entry is None:
+            return
+
+        field_uri = field_uri or self.field_uri
+        source_entry = entry
+
+        if not isinstance(entry, str):
+            raise ConfigError("{} is expected to be str".format(field_uri))
+
+        if not self.case_sensitive:
+            entry = entry.lower()
+
+        if self.choices and entry not in self.choices:
+            reason = "unsupported option, expected one of: {}".format(', '.join(map(str, self.choices)))
+            self.raise_error(source_entry, field_uri, reason)
+
+        if self.regex and not self.regex.match(entry):
+            self.raise_error(source_entry, field_uri, reason=None)
+
+    @property
+    def type(self):
+        return str
+
+
+class DictField(BaseField):
+    def __init__(self, key_type=None, value_type=None, validate_keys=True, validate_values=True, allow_empty=True,
+                 **kwargs):
+        super().__init__(**kwargs)
+        self.validate_keys = validate_keys if key_type else False
+        self.validate_values = validate_values if value_type else False
+        self.key_type = _get_field_type(key_type)
+        self.value_type = _get_field_type(value_type)
+
+        self.allow_empty = allow_empty
+
+    def validate(self, entry, field_uri=None):
+        super().validate(entry, field_uri)
+        if entry is None:
+            return
+
+        field_uri = field_uri or self.field_uri
+        if not isinstance(entry, dict):
+            raise ConfigError("{} is expected to be dict".format(field_uri))
+
+        if not entry and not self.allow_empty:
+            self.raise_error(entry, field_uri, "value is empty")
+
+        for k, v in entry.items():
+            if self.validate_keys:
+                uri = "{}.keys.{}".format(field_uri, k)
+                self.key_type.validate(k, uri)
+
+            if self.validate_values:
+                uri = "{}.{}".format(field_uri, k)
+                self.value_type.validate(v, uri)
+
+    @property
+    def type(self):
+        return dict
+
+
+class ListField(BaseField):
+    def __init__(self, value_type=None, validate_values=True, allow_empty=True, **kwargs):
+        super().__init__(**kwargs)
+        self.validate_values = validate_values if value_type else False
+        self.value_type = _get_field_type(value_type)
+        self.allow_empty = allow_empty
+
+    def validate(self, entry, field_uri=None):
+        super().validate(entry, field_uri)
+        if entry is None:
+            return
+
+        field_uri = field_uri or self.field_uri
+        if not isinstance(entry, list):
+            raise ConfigError("{} is expected to be list".format(field_uri))
+
+        if not entry and not self.allow_empty:
+            self.raise_error(entry, field_uri, "value is empty")
+
+        if self.validate_values:
+            for i, val in enumerate(entry):
+                self.value_type.validate(val, "{}[{}]".format(field_uri, i))
+
+    @property
+    def type(self):
+        return list
+
+
+class NumberField(BaseField):
+    def __init__(self, floats=True, min_value=None, max_value=None, allow_inf=False, allow_nan=False, **kwargs):
+        super().__init__(**kwargs)
+        self.floats = floats
+        self.min = min_value
+        self.max = max_value
+        self.allow_inf = allow_inf
+        self.allow_nan = allow_nan
+
+    def validate(self, entry, field_uri=None):
+        super().validate(entry, field_uri)
+        if entry is None:
+            return
+
+        field_uri = field_uri or 
self.field_uri + if not self.floats and isinstance(entry, float): + raise ConfigError("{} is expected to be int".format(field_uri)) + if not isinstance(entry, int) and not isinstance(entry, float): + raise ConfigError("{} is expected to be number".format(field_uri)) + + if self.min is not None and entry < self.min: + reason = "value is less than minimal allowed - {}".format(self.min) + self.raise_error(entry, field_uri, reason) + if self.max is not None and entry > self.max: + reason = "value is greater than maximal allowed - {}".format(self.max) + self.raise_error(entry, field_uri, reason) + + if math.isinf(entry) and not self.allow_inf: + self.raise_error(entry, field_uri, "value is infinity") + if math.isnan(entry) and not self.allow_nan: + self.raise_error(entry, field_uri, "value is NaN") + + @property + def type(self): + return float if self.floats else int + + +class PathField(BaseField): + def __init__(self, is_directory=False, **kwargs): + super().__init__(**kwargs) + self.is_directory = is_directory + + def validate(self, entry, field_uri=None): + super().validate(entry, field_uri) + if entry is None: + return + + field_uri = field_uri or self.field_uri + try: + get_path(entry, self.is_directory) + except TypeError: + self.raise_error(entry, field_uri, "values is expected to be path-like") + except FileNotFoundError: + self.raise_error(entry, field_uri, "path does not exist") + except NotADirectoryError: + self.raise_error(entry, field_uri, "path is not a directory") + except IsADirectoryError: + self.raise_error(entry, field_uri, "path is a directory, regular file expected") + + @property + def type(self): + return Path + + +class BoolField(BaseField): + def validate(self, entry, field_uri=None): + super().validate(entry, field_uri) + if entry is None: + return + + field_uri = field_uri or self.field_uri + if not isinstance(entry, bool): + raise ConfigError("{} is expected to be bool".format(field_uri)) + + @property + def type(self): + return string_to_bool + + +def _get_field_type(key_type): + if not isinstance(key_type, BaseField): + type_ = _TYPE_TO_FIELD_CLASS.get(key_type) + if callable(type_): + return type_() + + return key_type + + +_TYPE_TO_FIELD_CLASS = { + int: partial(NumberField, floats=False), + float: partial(NumberField, floats=True), + dict: partial(DictField, validate_keys=False, validate_values=False), + list: partial(ListField, validate_values=False), + Path: PathField, + str: StringField, + bool: BoolField, +} diff --git a/tools/accuracy_checker/accuracy_checker/data_readers/__init__.py b/tools/accuracy_checker/accuracy_checker/data_readers/__init__.py new file mode 100644 index 0000000..73e1bc7 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/data_readers/__init__.py @@ -0,0 +1,40 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from .data_reader import ( + BaseReader, + DataReaderField, + ReaderCombiner, + JSONReaderConfig, + OpenCVFrameReader, + OpenCVImageReader, + PillowImageReader, + ScipyImageReader, + NiftiImageReader + +) + +__all__ = [ + 'BaseReader', + 'DataReaderField', + 'ReaderCombiner', + 'JSONReaderConfig', + 'OpenCVFrameReader', + 'OpenCVImageReader', + 'PillowImageReader', + 'ScipyImageReader', + 'NiftiImageReader' +] diff --git a/tools/accuracy_checker/accuracy_checker/data_readers/data_reader.py b/tools/accuracy_checker/accuracy_checker/data_readers/data_reader.py new file mode 100644 index 0000000..0aaa6fc --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/data_readers/data_reader.py @@ -0,0 +1,216 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from functools import singledispatch +from collections import OrderedDict +import re +import cv2 +from PIL import Image +import scipy.misc +import numpy as np +import nibabel as nib + +from ..utils import get_path, read_json +from ..dependency import ClassProvider +from ..config import BaseField, StringField, ConfigValidator, ConfigError, DictField + + +class DataReaderField(BaseField): + def validate(self, entry_, field_uri=None): + super().validate(entry_, field_uri) + + if entry_ is None: + return + + field_uri = field_uri or self.field_uri + if isinstance(entry_, str): + StringField(choices=BaseReader.providers).validate(entry_, 'reader') + elif isinstance(entry_, dict): + class DictReaderValidator(ConfigValidator): + type = StringField(choices=BaseReader.providers) + dict_reader_validator = DictReaderValidator( + 'reader', on_extra_argument=DictReaderValidator.IGNORE_ON_EXTRA_ARGUMENT + ) + dict_reader_validator.validate(entry_) + else: + self.raise_error(entry_, field_uri, 'reader must be either string or dictionary') + + +class BaseReader(ClassProvider): + __provider_type__ = 'reader' + + def __init__(self, config=None): + self.config = config + self.data_source_is_dir = True + self.data_source_optional = False + self.read_dispatcher = singledispatch(self.read) + self.read_dispatcher.register(list, self._read_list) + + self.validate_config() + self.configure() + + def __call__(self, *args, **kwargs): + return self.read_dispatcher(*args, **kwargs) + + def configure(self): + pass + + def validate_config(self): + pass + + def read(self, data_id, data_dir): + raise NotImplementedError + + def _read_list(self, data_id, data_dir): + return [self.read(identifier, data_dir) for identifier in data_id] + + +class ReaderCombinerConfig(ConfigValidator): + type = StringField() + scheme = DictField( + value_type=DataReaderField(), key_type=StringField(), allow_empty=False + ) + + +class ReaderCombiner(BaseReader): + __provider__ = 'combine_reader' + + def validate_config(self): + config_validator = ReaderCombinerConfig('reader_combiner_config') + config_validator.validate(self.config) + + def configure(self): + scheme = self.config['scheme'] + reading_scheme = OrderedDict() + for pattern, reader_config in 
scheme.items():
+            reader = BaseReader.provide(
+                reader_config['type'] if isinstance(reader_config, dict) else reader_config, reader_config
+            )
+            pattern = re.compile(pattern)
+            reading_scheme[pattern] = reader
+
+        self.reading_scheme = reading_scheme
+
+    def read(self, data_id, data_dir):
+        for pattern, reader in self.reading_scheme.items():
+            if pattern.match(str(data_id)):
+                return reader.read(data_id, data_dir)
+
+        raise ConfigError('suitable data reader for {} not found'.format(data_id))
+
+
+class OpenCVImageReader(BaseReader):
+    __provider__ = 'opencv_imread'
+
+    def read(self, data_id, data_dir):
+        return cv2.imread(str(get_path(data_dir / data_id)))
+
+
+class PillowImageReader(BaseReader):
+    __provider__ = 'pillow_imread'
+
+    def read(self, data_id, data_dir):
+        return np.array(Image.open(str(get_path(data_dir / data_id))))
+
+
+class ScipyImageReader(BaseReader):
+    __provider__ = 'scipy_imread'
+
+    def read(self, data_id, data_dir):
+        return np.array(scipy.misc.imread(str(get_path(data_dir / data_id))))
+
+
+class OpenCVFrameReader(BaseReader):
+    __provider__ = 'opencv_capture'
+
+    def __init__(self, config=None):
+        super().__init__(config)
+        self.data_source_is_dir = False
+        self.source = None
+        self.videocap = None
+        self.current = -1
+
+    def read(self, data_id, data_dir):
+        # source video changed, reinitialize the capture
+        if data_dir != self.source:
+            self.source = data_dir
+            self.videocap = cv2.VideoCapture(str(self.source))
+            self.current = -1
+
+        if data_id < 0:
+            raise IndexError('frame with index {} can not be grabbed, non-negative index is expected'.format(data_id))
+        if data_id < self.current:
+            self.videocap.set(cv2.CAP_PROP_POS_FRAMES, data_id)
+            self.current = data_id - 1
+
+        return self._read_sequence(data_id)
+
+    def _read_sequence(self, data_id):
+        frame = None
+        while self.current != data_id:
+            success, frame = self.videocap.read()
+            self.current += 1
+            if not success:
+                raise EOFError('frame with index {} does not exist in {}'.format(self.current, self.source))
+        return frame
+
+
+class JSONReaderConfig(ConfigValidator):
+    type = StringField()
+    key = StringField(optional=True, case_sensitive=True)
+
+
+class JSONReader(BaseReader):
+    __provider__ = 'json_reader'
+
+    def validate_config(self):
+        config_validator = JSONReaderConfig('json_reader_config')
+        config_validator.validate(self.config)
+
+    def configure(self):
+        self.key = self.config.get('key')
+
+    def read(self, data_id, data_dir):
+        data = read_json(str(data_dir / data_id))
+        if self.key:
+            data = data.get(self.key)
+
+            if not data:
+                raise ConfigError('{} does not contain {}'.format(data_id, self.key))
+
+        return np.array(data).astype(np.float32)
+
+
+class NCF_DataReader(BaseReader):
+    __provider__ = 'ncf_data_reader'
+
+    def __init__(self, config=None):
+        super().__init__(config)
+        self.data_source_optional = True
+
+    def read(self, data_id, data_dir):
+        if not isinstance(data_id, str):
+            raise IndexError('Data identifier must be a string')
+
+        return float(data_id.split(":")[1])
+
+
+class NiftiImageReader(BaseReader):
+    __provider__ = 'nifti_reader'
+
+    def read(self, data_id, data_dir):
+        nib_image = nib.load(str(get_path(data_dir / data_id)))
+        image = np.array(nib_image.dataobj)
+        if len(image.shape) != 4:  # make sure the image is 4D
+            image = np.expand_dims(image, -1)
+        image = np.swapaxes(np.array(image), 0, -2)
+        return image
diff --git a/tools/accuracy_checker/accuracy_checker/dataset.py b/tools/accuracy_checker/accuracy_checker/dataset.py
new file mode 100644
index 0000000..f4ee1cb
--- /dev/null
+++ 
b/tools/accuracy_checker/accuracy_checker/dataset.py @@ -0,0 +1,190 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from pathlib import Path +import numpy as np + +from .annotation_converters import BaseFormatConverter, save_annotation, make_subset +from .data_readers import BaseReader, DataReaderField +from .config import ConfigValidator, StringField, PathField, ListField, DictField, BaseField, NumberField, ConfigError +from .utils import JSONDecoderWithAutoConversion, read_json, get_path, contains_all +from .representation import BaseRepresentation + + +class DataRepresentation: + def __init__(self, data, meta=None, identifier=''): + self.identifier = identifier + self.data = data + self.metadata = meta or {} + if np.isscalar(data): + self.metadata['image_size'] = 1 + elif isinstance(data, list) and np.isscalar(data[0]): + self.metadata['image_size'] = len(data) + else: + self.metadata['image_size'] = data.shape if not isinstance(data, list) else data[0].shape + + +class DatasetConfig(ConfigValidator): + """ + Specifies configuration structure for dataset + """ + name = StringField() + annotation = BaseField(optional=True) + data_source = PathField() + dataset_meta = BaseField(optional=True) + metrics = ListField(allow_empty=False) + postprocessing = ListField(allow_empty=False, optional=True) + preprocessing = ListField(allow_empty=False, optional=True) + reader = DataReaderField(optional=True) + annotation_conversion = DictField(optional=True) + subsample_size = BaseField(optional=True) + subsample_seed = NumberField(floats=False, min_value=0, optional=True) + + +class Dataset: + def __init__(self, config_entry, preprocessor): + self._config = config_entry + self._preprocessor = preprocessor + + self.batch = 1 + + dataset_config = DatasetConfig('Dataset') + data_reader_config = self._config.get('reader', 'opencv_imread') + if isinstance(data_reader_config, str): + self.read_image_fn = BaseReader.provide(data_reader_config) + elif isinstance(data_reader_config, dict): + self.read_image_fn = BaseReader.provide(data_reader_config['type'], data_reader_config) + else: + raise ConfigError('reader should be dict or string') + + dataset_config.fields['data_source'].is_directory = self.read_image_fn.data_source_is_dir + dataset_config.fields['data_source'].optional = self.read_image_fn.data_source_optional + dataset_config.validate(self._config) + annotation, meta = None, None + self._images_dir = Path(self._config.get('data_source', '')) + if 'annotation_conversion' in self._config: + annotation, meta = self._convert_annotation() + else: + stored_annotation = self._config.get('annotation') + if stored_annotation: + annotation = read_annotation(get_path(stored_annotation)) + meta = self._load_meta() + + if not annotation: + raise ConfigError('path to converted annotation or data for conversion should be specified') + + subsample_size = self._config.get('subsample_size') + if subsample_size: + subsample_seed = self._config.get('subsample_seed', 666) + if 
isinstance(subsample_size, str): + if subsample_size.endswith('%'): + subsample_size = float(subsample_size[:-1]) / 100 * len(annotation) + subsample_size = int(subsample_size) + annotation = make_subset(annotation, subsample_size, subsample_seed) + + if contains_all(self._config, ['annotation', 'annotation_conversion']): + annotation_name = self._config['annotation'] + meta_name = self._config.get('dataset_meta') + if meta_name: + meta_name = Path(meta_name) + save_annotation(annotation, meta, Path(annotation_name), meta_name) + + self._annotation = annotation + self._meta = meta + self.size = len(self._annotation) + self.name = self._config.get('name') + + @property + def annotation(self): + return self._annotation + + def __len__(self): + return self.size + + @property + def metadata(self): + return self._meta + + @property + def labels(self): + return self._meta.get('label_map', {}) + + def __getitem__(self, item): + if self.size <= item * self.batch: + raise IndexError + + batch_start = item * self.batch + batch_end = min(self.size, batch_start + self.batch) + batch_annotation = self._annotation[batch_start:batch_end] + + identifiers = [annotation.identifier for annotation in batch_annotation] + images = self._read_images(identifiers) + + for image, annotation in zip(images, batch_annotation): + self.set_annotation_metadata(annotation, image) + + preprocessed = self._preprocessor.process(images, batch_annotation) + + return batch_annotation, preprocessed + + @staticmethod + def set_image_metadata(annotation, images): + image_sizes = [] + if not isinstance(images, list): + images = [images] + for image in images: + if np.isscalar(image): + image_sizes.append((1,)) + else: + image_sizes.append(image.shape) + annotation.set_image_size(image_sizes) + + def set_annotation_metadata(self, annotation, image): + self.set_image_metadata(annotation, image.data) + annotation.set_data_source(self._images_dir) + + def _read_images(self, identifiers): + images = [] + for identifier in identifiers: + images.append(DataRepresentation(self.read_image_fn(identifier, self._images_dir), identifier=identifier)) + + return images + + def _load_meta(self): + meta_data_file = self._config.get('dataset_meta') + return read_json(meta_data_file, cls=JSONDecoderWithAutoConversion) if meta_data_file else None + + def _convert_annotation(self): + conversion_params = self._config.get('annotation_conversion') + converter = conversion_params['converter'] + annotation_converter = BaseFormatConverter.provide(converter, conversion_params) + annotation, meta = annotation_converter.convert() + + return annotation, meta + + +def read_annotation(annotation_file: Path): + annotation_file = get_path(annotation_file) + + result = [] + with annotation_file.open('rb') as file: + while True: + try: + result.append(BaseRepresentation.load(file)) + except EOFError: + break + + return result diff --git a/tools/accuracy_checker/accuracy_checker/dependency.py b/tools/accuracy_checker/accuracy_checker/dependency.py new file mode 100644 index 0000000..947a3ec --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/dependency.py @@ -0,0 +1,108 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +# pylint: disable=protected-access + + +class ProvidedWrapper: + def __init__(self, provided): + self.provided = provided + + +class UnresolvedDependencyException(ValueError): + + def __init__(self, provider, missing_dependencies) -> None: + super().__init__() + self.provider = provider + self.missing_dependencies = missing_dependencies + self.message = "Unresolved dependencies ({}) for provider {}".format( + ", ".join(self.missing_dependencies), self.provider + ) + + +def get_opts(options): + """ + Args: + options: options object. + Returns: + args (tuple): positional options. + kwargs (map): keyword arguments. + """ + + if isinstance(options, tuple): + if len(options) == 2 and isinstance(options[-1], dict): + args, kwargs = options + else: + args = options + kwargs = {} + elif isinstance(options, dict): + args, kwargs = (), options + else: + raise ValueError("Options object expected to be either pair of (args, kwargs) or only args/kwargs") + + return args, kwargs + + +class BaseProvider: + providers = {} + __provider_type__ = None + __provider__ = None + + @classmethod + def provide(cls, provider, *args, **kwargs): + root_provider = cls.resolve(provider) + return root_provider(*args, **kwargs) + + @classmethod + def resolve(cls, name): + if name not in cls.providers: + raise ValueError("Requested provider not registered") + return cls.providers[name] + + +class ClassProviderMeta(type): + def __new__(mcs, name, bases, attrs, **kwargs): + cls = super().__new__(mcs, name, bases, attrs) + # do not create container for abstract provider + if '_is_base_provider' in attrs: + return cls + + assert issubclass(cls, ClassProvider), "Do not use metaclass directly" + if '__provider_type__' in attrs: + cls.providers = {} + else: + cls.register_provider(cls) + + return cls + + +class ClassProvider(BaseProvider, metaclass=ClassProviderMeta): + _is_base_provider = True + + @classmethod + def get_provider_name(cls): + return getattr(cls, '__provider__', cls.__name__) + + @classmethod + def register_provider(cls, provider): + provider_name = cls.get_provider_name() + if not provider_name: + return + cls.providers[provider_name] = provider + + +def provide(service): + return ProvidedWrapper(service) diff --git a/tools/accuracy_checker/accuracy_checker/launcher/__init__.py b/tools/accuracy_checker/accuracy_checker/launcher/__init__.py new file mode 100644 index 0000000..af21a91 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/launcher/__init__.py @@ -0,0 +1,34 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from .dummy_launcher import DummyLauncher +from .launcher import Launcher, create_launcher, unsupported_launcher + +try: + from .caffe_launcher import CaffeLauncher +except ImportError as import_error: + CaffeLauncher = unsupported_launcher( + 'caffe', "Caffe isn't installed. Please, install it before using. \n{}".format(import_error.msg) + ) + +try: + from .dlsdk_launcher import DLSDKLauncher +except ImportError as import_error: + DLSDKLauncher = unsupported_launcher( + 'dlsdk', "IE Python isn't installed. Please, install it before using. \n{}".format(import_error.msg) + ) + +__all__ = ['create_launcher', 'Launcher', 'CaffeLauncher', 'DLSDKLauncher', 'DummyLauncher'] diff --git a/tools/accuracy_checker/accuracy_checker/launcher/caffe_installation_readme.md b/tools/accuracy_checker/accuracy_checker/launcher/caffe_installation_readme.md new file mode 100644 index 0000000..8118dcd --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/launcher/caffe_installation_readme.md @@ -0,0 +1,56 @@ +# Caffe Installation Tips + +## Install OpenCV 3.3 or later with Python3 bindings + +Accuracy Checker uses OpenCV library for image processing. You can miss this step if you are using OpenCV from [OpenVINO toolkit][openvino-get-started]. + +```bash +sudo apt-get install libopencv-dev +pip install opencv-python +``` + +## Install Caffe with Python3 bindings + +* Clone repository: + +```bash +git clone https://github.com/BVLC/caffe.git +cd caffe +``` + +* Install Caffe dependencies: + +```bash +sudo apt-get install libprotobuf-dev libleveldb-dev libsnappy-dev libhdf5-serial-dev protobuf-compiler libgflags-dev libgoogle-glog-dev liblmdb-dev +sudo apt-get install --no-install-recommends libboost-all-dev +pip install -r python/requirements.txt +pip install matplotlib +``` + +* Build + +If you need CPU only version of caffe add `-DCPU_ONLY=ON` to cmake command. + +```bash +mkdir build && cd build +cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX= -Dpython_version=3 -DBLAS=open .. +make +sudo make install +``` + +* Copy Python library to your python installation. + +```bash +cp -r ../python/caffe $VIRTUAL_ENV/lib/python3.5/site-packages +cp --remove-destination lib/_caffe.so $VIRTUAL_ENV/lib/python3.5/site-packages/caffe +``` + +## Check your installation + +You can test prerequisites with the following command. If it does not fail, then you are installed prerequisites correctly: + +```bash +python3 -c 'import caffe, cv2' +``` + +[openvino-get-started]: https://software.intel.com/en-us/openvino-toolkit/documentation/get-started diff --git a/tools/accuracy_checker/accuracy_checker/launcher/caffe_launcher.py b/tools/accuracy_checker/accuracy_checker/launcher/caffe_launcher.py new file mode 100644 index 0000000..df3d98a --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/launcher/caffe_launcher.py @@ -0,0 +1,141 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import re + +import caffe + +from ..utils import extract_image_representations +from ..config import PathField, StringField, NumberField, BoolField +from .launcher import Launcher, LauncherConfig +from .input_feeder import InputFeeder + +DEVICE_REGEX = r'(?Pcpu$|gpu)(_(?P\d+))?' + + +class CaffeLauncherConfig(LauncherConfig): + """ + Specifies configuration structure for Caffe launcher. + """ + + model = PathField() + weights = PathField() + device = StringField(regex=DEVICE_REGEX) + batch = NumberField(floats=False, min_value=1, optional=True) + output_name = StringField(optional=True) + allow_reshape_input = BoolField(optional=True) + + +class CaffeLauncher(Launcher): + """ + Class for infer model using Caffe framework. + """ + + __provider__ = 'caffe' + + def __init__(self, config_entry: dict, adapter, *args, **kwargs): + super().__init__(config_entry, adapter, *args, **kwargs) + + caffe_launcher_config = CaffeLauncherConfig('Caffe_Launcher') + caffe_launcher_config.validate(self._config) + + self.model = str(self._config['model']) + self.weights = str(self._config['weights']) + + self.network = caffe.Net(self.model, self.weights, caffe.TEST) + self.allow_reshape_input = self._config.get('allow_reshape_input', False) + + match = re.match(DEVICE_REGEX, self._config['device'].lower()) + if match.group('device') == 'gpu': + caffe.set_mode_gpu() + identifier = match.group('identifier') or 0 + caffe.set_device(int(identifier)) + elif match.group('device') == 'cpu': + caffe.set_mode_cpu() + + self._batch = self._config.get('batch', 1) + + inputs_map = {} + for input_blob in self.network.inputs: + inputs_map[input_blob] = self.network.blobs[input_blob] + + self.input_feeder = InputFeeder(self._config.get('inputs') or [], inputs_map) + + if self.adapter: + self.adapter.output_blob = self.adapter.output_blob or next(iter(self.network.outputs)) + + @property + def inputs(self): + """ + Returns: + inputs in NCHW format. + """ + + self._inputs_shapes = {} + + for input_blob in self.network.inputs: + if input_blob in self.input_feeder.const_inputs: + continue + + channels, height, width = self.network.blobs[input_blob].data.shape[1:] + self.network.blobs[input_blob].reshape(self._batch, channels, height, width) + self._inputs_shapes[input_blob] = channels, height, width + + return self._inputs_shapes + + @property + def batch(self): + return self._batch + + def predict(self, identifiers, data_representation, *args, **kwargs): + """ + Args: + identifiers: list of input data identifiers. + data_representation: list of input data representations, which contain preprocessed data and its metadata. + Returns: + output of model converted to appropriate representation. 
+        """
+        _, meta = extract_image_representations(data_representation)
+        dataset_inputs = self.input_feeder.fill_non_constant_inputs(data_representation)
+        results = []
+        for infer_input in dataset_inputs:
+            for input_blob in self.network.inputs:
+                if input_blob in self.input_feeder.const_inputs:
+                    continue
+
+                data = infer_input[input_blob]
+
+                if self.allow_reshape_input:
+                    self.network.blobs[input_blob].reshape(*data.shape)
+
+                if data.shape[0] != self._batch:
+                    self.network.blobs[input_blob].reshape(
+                        data.shape[0], *self.network.blobs[input_blob].data.shape[1:]
+                    )
+
+            results.append(self.network.forward(**self.input_feeder.const_inputs, **infer_input))
+
+        if self.adapter:
+            results = self.adapter(results, identifiers, [self._provide_inputs_info_to_meta(meta_) for meta_ in meta])
+
+        return results
+
+    def release(self):
+        """
+        Releases launcher.
+        """
+
+        del self.network
diff --git a/tools/accuracy_checker/accuracy_checker/launcher/caffe_launcher_readme.md b/tools/accuracy_checker/accuracy_checker/launcher/caffe_launcher_readme.md
new file mode 100644
index 0000000..2ff6013
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/launcher/caffe_launcher_readme.md
@@ -0,0 +1,24 @@
+# How to configure Caffe launcher
+
+To enable the Caffe launcher, add `framework: caffe` to the launchers section of your configuration file and provide the following parameters:
+
+* `device` - specifies which device will be used for inference (`cpu`, `gpu_0` and so on).
+* `model` - path to the prototxt file with the Caffe model for your topology.
+* `weights` - path to the caffemodel file with weights for your topology.
+* `adapter` - the approach used to convert raw network output into the representation expected by the dataset problem; some adapters are framework-specific. You can find detailed instructions on how to use adapters [here][adapters].
+
+You can also specify the batch size for your model using `batch`, and allow reshaping the input layer to the data shape using `allow_reshape_input` (default: False).
+
+Caffe launcher config example:
+
+```yml
+launchers:
+  - framework: caffe
+    device: CPU
+    model: path_to_model/alexnet.prototxt
+    weights: path_to_weights/alexnet.caffemodel
+    adapter: classification
+    batch: 4
+```
+
+[adapters]: ./tools/accuracy_checker/accuracy_checker/adapters/README.md
diff --git a/tools/accuracy_checker/accuracy_checker/launcher/dlsdk_launcher.py b/tools/accuracy_checker/accuracy_checker/launcher/dlsdk_launcher.py
new file mode 100644
index 0000000..6378b8d
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/launcher/dlsdk_launcher.py
@@ -0,0 +1,430 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+""" + +import subprocess +from pathlib import Path +import os +import platform +import numpy as np +from cpuinfo import get_cpu_info +import openvino.inference_engine as ie + +from ..config import ConfigError, NumberField, PathField, StringField, DictField, ListField, BoolField +from ..logging import warning +from ..utils import read_yaml, contains_all, extract_image_representations, get_path +from .launcher import Launcher, LauncherConfig +from .input_feeder import InputFeeder +from .model_conversion import convert_model +from ..logging import print_info + +HETERO_KEYWORD = 'HETERO:' +FPGA_COMPILER_MODE_VAR = 'CL_CONTEXT_COMPILER_MODE_INTELFPGA' +DEVICE_REGEX = r"(?:^{hetero}(?P(?:{devices})(?:,(?:{devices}))*)$)|(?:^(?P{devices})$)".format( + hetero=HETERO_KEYWORD, devices="|".join(plugin for plugin in ie.known_plugins) +) + + +class CPUExtensionPathField(PathField): + def __init__(self, **kwargs): + super().__init__(is_directory=False, **kwargs) + + def validate(self, entry, field_uri=None): + if entry is None: + return + + field_uri = field_uri or self.field_uri + validation_entry = '' + try: + validation_entry = Path(entry) + except TypeError: + self.raise_error(entry, field_uri, "values is expected to be path-like") + is_directory = False + if validation_entry.parts[-1] == 'AUTO': + validation_entry = validation_entry.parent + is_directory = True + try: + get_path(validation_entry, is_directory) + except FileNotFoundError: + self.raise_error(validation_entry, field_uri, "path does not exist") + except NotADirectoryError: + self.raise_error(validation_entry, field_uri, "path is not a directory") + except IsADirectoryError: + self.raise_error(validation_entry, field_uri, "path is a directory, regular file expected") + + +class DLSDKLauncherConfig(LauncherConfig): + """ + Specifies configuration structure for DLSDK launcher. + """ + + device = StringField(regex=DEVICE_REGEX) + model = PathField(optional=True) + weights = PathField(optional=True) + caffe_model = PathField(optional=True) + caffe_weights = PathField(optional=True) + mxnet_weights = PathField(optional=True) + tf_model = PathField(optional=True) + onnx_model = PathField(optional=True) + kaldi_model = PathField(optional=True) + cpu_extensions = CPUExtensionPathField(optional=True) + gpu_extensions = PathField(optional=True) + bitstream = PathField(optional=True) + mo_params = DictField(optional=True) + mo_flags = ListField(optional=True) + outputs = ListField(optional=True) + allow_reshape_input = BoolField(optional=True) + affinity_map = PathField(optional=True) + batch = NumberField(floats=False, min_value=1, optional=True) + + _models_prefix = PathField(is_directory=True, optional=True) + _model_optimizer = PathField(optional=True, allow_none=True, is_directory=True) + _tf_obj_detection_api_config_dir = PathField(optional=True, allow_none=True, is_directory=True) + _tf_custom_op_config_dir = PathField(optional=True, allow_none=True, is_directory=True) + _cpu_extensions_mode = StringField(optional=True, allow_none=True) + _aocl = PathField(optional=True) + + def __init__(self, config_uri, **kwargs): + super().__init__(config_uri, **kwargs) + self.need_conversion = None + + def validate(self, entry, field_uri=None): + """ + Validate that launcher entry meets all configuration structure requirements. + + Args: + entry: launcher configuration file entry. + field_uri: id of launcher entry. 
+ """ + + dlsdk_model_options = ['model', 'weights'] + caffe_model_options = ['caffe_model', 'caffe_weights'] + mxnet_model_options = ['mxnet_weights'] + tf_model_options = ['tf_model'] + onnx_model_options = ['onnx_model'] + kaldi_model_options = ['kaldi_model'] + + multiple_model_sources_err = ( + 'Either model and weights or caffe_model and caffe_weights ' + 'or mxnet_weights or tf_model should be specified.' + ) + sources = { + 'dlsdk': dlsdk_model_options, + 'caffe': caffe_model_options, + 'tf': tf_model_options, + 'mxnet': mxnet_model_options, + 'onnx': onnx_model_options, + 'kaldi': kaldi_model_options + } + + specified = [] + for mo_source_option in sources: + if contains_all(entry, sources[mo_source_option]): + specified.append(mo_source_option) + + if not specified: + raise ConfigError('{} None provided'.format(multiple_model_sources_err)) + if len(specified) > 1: + raise ConfigError('{} Several provided'.format(multiple_model_sources_err)) + + self._set_model_source(specified[0]) + super().validate(entry, field_uri) + + def _set_model_source(self, framework): + self.need_conversion = framework != 'dlsdk' + self.framework = framework + self.fields['model'].optional = self.need_conversion + self.fields['weights'].optional = self.need_conversion + self.fields['caffe_model'].optional = framework != 'caffe' + self.fields['caffe_weights'].optional = framework != 'caffe' + self.fields['mxnet_weights'].optional = framework != 'mxnet' + self.fields['tf_model'].optional = framework != 'tf' + self.fields['onnx_model'].optional = framework != 'onnx' + self.fields['kaldi_model'].optional = framework != 'kaldi' + + +class DLSDKLauncher(Launcher): + """ + Class for infer model using DLSDK framework. + """ + + __provider__ = 'dlsdk' + + def __init__(self, config_entry, adapter): + super().__init__(config_entry, adapter) + + def fit_to_input(data, input_layer): + shape_len = len(input_layer.shape) + if shape_len == 4: + return np.transpose(data, [0, 3, 1, 2]) + if shape_len == 2: + if len(np.shape(data)) == 1: + return np.transpose([data]) + return np.array(data) + + dlsdk_launcher_config = DLSDKLauncherConfig('DLSDK_Launcher') + dlsdk_launcher_config.validate(self._config) + + self._device = self._config['device'].upper() + self._set_variable = False + self._prepare_bitstream_firmware(self._config) + + if dlsdk_launcher_config.need_conversion: + self._model, self._weights = DLSDKLauncher.convert_model(self._config, dlsdk_launcher_config.framework) + else: + self._model = self._config['model'] + self._weights = self._config['weights'] + + self._create_ie_plugin() + self.network = ie.IENetwork(model=str(self._model), weights=str(self._weights)) + self.original_outputs = self.network.outputs + outputs = self._config.get('outputs') + if outputs: + self.network.add_outputs(outputs) + self.input_feeder = InputFeeder( + self._config.get('inputs') or [], + self.network.inputs, + prepare_input_data=fit_to_input + ) + self._batch = self._config.get('batch', self.network.batch_size) + if self._batch != self.network.batch_size: + self._set_batch_size(self._batch) + affinity_map_path = self._config.get('affinity_map') + if affinity_map_path and self._is_hetero(): + self._set_affinity(affinity_map_path) + elif affinity_map_path: + warning('affinity_map config is applicable only for HETERO device') + self.exec_network = self.plugin.load(network=self.network) + self.allow_reshape_input = self._config.get('allow_reshape_input', False) + + @property + def inputs(self): + """ + Returns: + inputs in NCHW format. 
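+            Constant inputs configured in the input feeder are excluded.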
+ """ + + # reverse and omit N + return {k: v.shape[1:] for k, v in self.network.inputs.items() if k in self.input_feeder.non_constant_inputs} + + @property + def batch(self): + return self._batch + + def predict(self, identifiers, data_representation, *args, **kwargs): + """ + Args: + identifiers: list of input data identifiers. + data_representation: list of input data representations, which contain preprocessed data and its metadata. + Returns: + output of model converted to appropriate representation. + """ + _, metadata = extract_image_representations(data_representation) + non_constant_inputs = self.input_feeder.fill_non_constant_inputs(data_representation) + results = [] + for infer_inputs in non_constant_inputs: + input_shapes = {} + do_reshape = False + for input_blob in self.network.inputs: + if input_blob in self.input_feeder.const_inputs: + input_shapes[input_blob] = self.network.inputs[input_blob].shape + continue + + data = infer_inputs[input_blob] + input_shapes[input_blob] = data.shape + if self.allow_reshape_input: + if tuple(self.network.inputs[input_blob].shape) != data.shape: + do_reshape = True + + if do_reshape: + self._reshape_input(input_shapes) + + for input_blob, data in infer_inputs.items(): + infer_inputs[input_blob] = self._align_data_shape(data, input_blob) + + network_inputs_data = {**infer_inputs, **self.input_feeder.const_inputs} + + benchmark = kwargs.get('benchmark') + if benchmark: + benchmark(network_inputs_data) + + result = self.exec_network.infer(network_inputs_data) + + raw_outputs_callback = kwargs.get('output_callback') + if raw_outputs_callback: + raw_outputs_callback(result) + + results.append(result) + + if self.adapter: + self.adapter.output_blob = self.adapter.output_blob or next(iter(self.original_outputs)) + results = self.adapter(results, identifiers, [self._provide_inputs_info_to_meta(meta) for meta in metadata]) + + return results + + def _is_hetero(self): + return self._device.startswith(HETERO_KEYWORD) + + def _devices_list(self): + device = self._device + if HETERO_KEYWORD in self._device: + device = self._device[len(HETERO_KEYWORD):] + + return [platform_.upper().strip() for platform_ in device.split(',')] + + def _set_affinity(self, affinity_map_path): + self.plugin.set_initial_affinity(self.network) + layers = self.network.layers + for layer, device in read_yaml(affinity_map_path).items(): + if layer not in layers: + raise ConfigError('Layer \'{layer}\' is not present in network'.format(layer=layer)) + if device not in self._devices_list(): + raise ConfigError( + 'Device \'{device}\' set for \'{layer}\' layer is not present in ' + 'provided configuration \'{configuration}\''.format( + device=device, layer=layer, configuration=self._device + ) + ) + layers[layer].affinity = device + + def _is_fpga(self): + return 'FPGA' in self._devices_list() + + def _prepare_bitstream_firmware(self, config): + if not self._is_fpga(): + return + + compiler_mode = os.environ.get(FPGA_COMPILER_MODE_VAR) + if compiler_mode == '3': + return + + bitstream = config.get('bitstream') + if bitstream: + print_info('programming bitstream: {}'.format(bitstream.name)) + aocl_executable = config.get('_aocl') + if aocl_executable: + subprocess.run([str(aocl_executable), 'program', 'acl0', str(bitstream)]) + os.environ[FPGA_COMPILER_MODE_VAR] = '3' + self._set_variable = True + else: + aocx_variable = 'DLA_AOCX' + previous_bitstream = os.environ.get(aocx_variable) + if previous_bitstream == str(bitstream): + return + os.environ[aocx_variable] = str(bitstream) + if 
not os.environ.get(aocx_variable): + warning('Warning: {} has not been set'.format(aocx_variable)) + + @staticmethod + def get_cpu_extension(cpu_extensions, selection_mode): + cpu_extensions_name = cpu_extensions.parts[-1] + if cpu_extensions_name != 'AUTO': + return cpu_extensions + extensions_path = cpu_extensions.parent + file_format = '{}.dll' if platform.system() == 'Windows' else 'lib{}.so' + if not selection_mode: + default_cpu_extension = file_format.format('cpu_extension') + extension_list = list(extensions_path.glob(default_cpu_extension)) + + if extension_list: + return extension_list[0] + + cpu_info_flags = get_cpu_info()['flags'] + selection_mode = 'avx2' if 'avx2' in cpu_info_flags else 'sse4' + extension_list = list(extensions_path.glob(file_format.format('cpu_extension_{}'.format(selection_mode)))) + + if not extension_list: + raise ConfigError('suitable CPU extension lib not found in {}'.format(extensions_path)) + + return extension_list[0] + + @staticmethod + def convert_model(config, framework='caffe'): + config_model = config.get(framework + '_model', '') + config_weights = config.get(framework + '_weights', '') + + mo_search_paths = [] + model_optimizer = config.get('_model_optimizer') + if model_optimizer: + mo_search_paths.append(model_optimizer) + + model_optimizer_directory_env = os.environ.get('MO_DIR') + if model_optimizer_directory_env: + mo_search_paths.append(model_optimizer_directory_env) + + return convert_model( + Path(config_model).name.split('.')[0] or Path(config_weights).name.split('.')[0], + config_model, config_weights, framework, + mo_search_paths, config.get('mo_params'), + config.get('mo_flags'), + config.get('_tf_custom_op_config_dir'), + config.get('_tf_obj_detection_api_pipeline_config_path') + ) + + def _reshape_input(self, shapes): + self.network.reshape(shapes) + del self.exec_network + self._create_ie_plugin(log=False) + self.exec_network = self.plugin.load(network=self.network) + + def _set_batch_size(self, batch_size): + # in some cases we can not use explicit property for setting batch size, so we need to use reshape instead + # save const inputs without changes + const_inputs_shapes = { + input_name: self.network.inputs[input_name].shape for input_name in self.input_feeder.const_inputs + } + new_non_const_input_shapes = {} + for layer_name in self.input_feeder.non_constant_inputs: + layer = self.network.inputs[layer_name] + layer_shape = layer.shape + ind_batch = layer.layout.find('N') + if ind_batch != -1: + layer_shape[ind_batch] = batch_size + new_non_const_input_shapes[layer_name] = layer_shape + + self.network.reshape({**const_inputs_shapes, **new_non_const_input_shapes}) + + def _align_data_shape(self, data, input_blob): + input_shape = self.network.inputs[input_blob].shape + + if data.shape[0] != input_shape[0]: + input_shape[0] = data.shape[0] + if len(data.shape) > 1 and len(input_shape) > 1 and data.shape[1] != input_shape[1]: + data = data[:, :input_shape[1]] + + return data.reshape(input_shape) + + def _create_ie_plugin(self, log=True): + if hasattr(self, 'plugin'): + del self.plugin + self.plugin = ie.IEPlugin(self._device) + if log: + print_info('IE version: {}'.format(ie.get_version())) + print_info('Loaded {} plugin version: {}'.format(self.plugin.device, self.plugin.version)) + + cpu_extensions = self._config.get('cpu_extensions') + if cpu_extensions and 'CPU' in self._device: + selection_mode = self._config.get('_cpu_extensions_mode') + cpu_extensions = DLSDKLauncher.get_cpu_extension(cpu_extensions, selection_mode) + 
self.plugin.add_cpu_extension(str(cpu_extensions))
+        if self._config.get('gpu_extensions') and 'GPU' in self._device:
+            self.plugin.set_config({'CONFIG_FILE': str(self._config.get('gpu_extensions'))})
+
+    def release(self):
+        if self._set_variable:
+            del os.environ[FPGA_COMPILER_MODE_VAR]
+        del self.network
+        del self.exec_network
+        del self.plugin
diff --git a/tools/accuracy_checker/accuracy_checker/launcher/dlsdk_launcher_readme.md b/tools/accuracy_checker/accuracy_checker/launcher/dlsdk_launcher_readme.md
new file mode 100644
index 0000000..e04415e
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/launcher/dlsdk_launcher_readme.md
@@ -0,0 +1,54 @@
+# How to configure OpenVINO™ launcher
+
+To enable the OpenVINO™ launcher, add `framework: dlsdk` to the launchers section of your configuration file and provide the following parameters:
+
+* `device` - specifies which device will be used for inference. Supported: `CPU`, `GPU`, `FPGA`, `MYRIAD` and the Heterogeneous plugin as `HETERO:target_device,fallback_device`.
+* `model` - path to the xml file with the Inference Engine IR of your topology.
+* `weights` - path to the bin file with weights for your topology.
+
+The launcher may optionally accept model parameters in a source framework format, which will be converted to Inference Engine IR using the Model Optimizer.
+If you want to use the Model Optimizer for model conversion, please see the [Model Optimizer Developer Guide][openvino-mo].
+You can provide:
+
+* `caffe_model` and `caffe_weights` for a Caffe model and weights (*.prototxt and *.caffemodel).
+* `tf_model` for a TensorFlow model (*.pb, *.pb.frozen, *.pbtxt).
+* `mxnet_weights` for MXNet params (*.params).
+* `onnx_model` for an ONNX model (*.onnx).
+* `kaldi_model` for a Kaldi model (*.nnet).
+
+If you need to specify additional parameters for model conversion (data_type, input_shape and so on), use `mo_params` for arguments with values and `mo_flags` for flag-like arguments such as `legacy_mxnet_model`.
+The full list of supported parameters can be found in the Model Optimizer Developer Guide.
+
+The model is converted before every evaluation.
+You can provide `converted_model_dir` to save the converted model in a specific folder; otherwise, converted models are saved in the path provided via the `-C` command-line argument or in the source model directory.
+
+* `adapter` - the approach used to convert raw network output into the representation expected by the dataset problem; some adapters are framework-specific. You can find detailed instructions on how to use adapters [here][adapters].
+
+The launcher reads the batch size from the model intermediate representation (IR). If you want to use batching for inference, please provide a model with the required batch size or convert it using the corresponding parameter in `mo_params`.
+
+* `allow_reshape_input` - parameter that allows reshaping the input layer to the data shape (default: False).
+
+Additionally, you can provide device-specific parameters:
+
+* `cpu_extensions` (path to an extension *.so file with custom layers for CPU).
+* `gpu_extensions` (path to an extension *.xml file with OpenCL kernel descriptions for GPU).
+* `bitstream` for running on FPGA.
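+
+The `cpu_extensions` value may also be a path whose last component is `AUTO`. In that case the launcher first looks for a default `cpu_extension` library in that directory and otherwise picks the one matching the host CPU instruction set (AVX2 or SSE4). A minimal sketch of such a config (the extensions directory name here is an assumption):
+
+```yml
+launchers:
+  - framework: dlsdk
+    device: CPU
+    model: path_to_model/alexnet.xml
+    weights: path_to_weights/alexnet.bin
+    adapter: classification
+    cpu_extensions: extensions_dir/AUTO
+```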
+ +OpenVINO™ launcher config example: + +```yml +launchers: + - framework: dlsdk + device: HETERO:FPGA,CPU + caffe_model: path_to_model/alexnet.prototxt + caffe_weights: path_to_weights/alexnet.caffemodel + adapter: classification + mo_params: + batch: 4 + mo_flags: + - reverse_input_channels + cpu_extensions: cpu_extentions_avx512.so +``` + +[adapters]: ./tools/accuracy_checker/accuracy_checker/adapters/README.md +[openvino-mo]: https://software.intel.com/en-us/articles/OpenVINO-ModelOptimizer diff --git a/tools/accuracy_checker/accuracy_checker/launcher/dummy_launcher.py b/tools/accuracy_checker/accuracy_checker/launcher/dummy_launcher.py new file mode 100644 index 0000000..7714f2e --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/launcher/dummy_launcher.py @@ -0,0 +1,69 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from ..utils import get_path +from ..logging import print_info +from ..adapters import Adapter +from ..config import PathField, StringField +from .loaders import Loader +from .launcher import Launcher, LauncherConfig + + +class DummyLauncherConfig(LauncherConfig): + """ + Specifies configuration structure for Dummy launcher. + """ + + loader = StringField(choices=Loader.providers) + data_path = PathField() + adapter = StringField(choices=Adapter.providers, optional=True) + + +class DummyLauncher(Launcher): + """ + Class for using predictions from another tool. + """ + + __provider__ = 'dummy' + + def __init__(self, config_entry: dict, adapter, *args, **kwargs): + super().__init__(config_entry, adapter, *args, **kwargs) + + dummy_launcher_config = DummyLauncherConfig('Dummy_Launcher') + dummy_launcher_config.validate(self._config) + + self.data_path = get_path(self._config['data_path']) + + self._loader = Loader.provide(self._config['loader'], self.data_path) + if self.adapter: + self.adapter.output_blob = self.adapter.output_blob or self.data_path + self._loader.data = self.adapter(self._loader.data) + + print_info("{} predictions objects loaded from {}".format(len(self._loader), self.data_path)) + + def predict(self, identifiers, *args, **kwargs): + return [self._loader[identifier] for identifier in identifiers] + + def release(self): + pass + + @property + def batch(self): + return 1 + + @property + def inputs(self): + return None diff --git a/tools/accuracy_checker/accuracy_checker/launcher/input_feeder.py b/tools/accuracy_checker/accuracy_checker/launcher/input_feeder.py new file mode 100644 index 0000000..202409b --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/launcher/input_feeder.py @@ -0,0 +1,138 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
diff --git a/tools/accuracy_checker/accuracy_checker/launcher/input_feeder.py b/tools/accuracy_checker/accuracy_checker/launcher/input_feeder.py
new file mode 100644
index 0000000..202409b
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/launcher/input_feeder.py
@@ -0,0 +1,138 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import re
+import numpy as np
+
+from ..config import ConfigError
+from ..utils import extract_image_representations
+
+
+class InputFeeder:
+    def __init__(self, inputs_config, network_inputs, prepare_input_data=None):
+        def fit_to_input(data, input_layer):
+            # default input transform: NHWC image batches are transposed to NCHW
+            if len(np.shape(data)) == 4:
+                return np.transpose(data, [0, 3, 1, 2])
+            return np.array(data)
+
+        self.input_transform_func = prepare_input_data or fit_to_input
+        self.network_inputs = network_inputs
+        self.configure(inputs_config)
+
+    def configure(self, inputs_config):
+        self.const_inputs, self.non_constant_inputs, self.inputs_mapping = self._parse_inputs_config(inputs_config)
+        if not self.non_constant_inputs:
+            raise ConfigError('Network should contain at least one layer for setting variable data.')
+
+    def fill_non_constant_inputs(self, data_representation_batch):
+        filled_inputs = {}
+        for input_layer in self.non_constant_inputs:
+            input_regex = None
+            input_batch = []
+            if self.inputs_mapping:
+                input_regex = self.inputs_mapping[input_layer]
+            for data_representation in data_representation_batch:
+                input_data = None
+                identifiers = data_representation.identifier
+                data = data_representation.data
+                if not isinstance(identifiers, list) and not input_regex:
+                    input_data = data
+                    input_batch.append(input_data)
+                    continue
+
+                if not input_regex:
+                    raise ConfigError('Impossible to choose correct data for layer {}. '
+                                      'Please provide a regular expression for matching in config.'.format(input_layer))
+                data = [data] if np.isscalar(identifiers) else data
+                identifiers = [identifiers] if np.isscalar(identifiers) else identifiers
+                for identifier, data_value in zip(identifiers, data):
+                    if input_regex.match(identifier):
+                        input_data = data_value
+                        break
+                if input_data is None:
+                    raise ConfigError('Suitable data for filling layer {} not found'.format(input_layer))
+                input_batch.append(input_data)
+
+            filled_inputs[input_layer] = input_batch
+
+        return self._transform_batch(filled_inputs, extract_image_representations(data_representation_batch)[1])
+
+    def _parse_inputs_config(self, inputs_entry):
+        constant_inputs = {}
+        non_constant_inputs_mapping = {}
+        non_constant_inputs = []
+        for input_ in inputs_entry:
+            name = input_['name']
+            if name not in self.network_inputs:
+                raise ConfigError('network does not contain input "{}"'.format(name))
+            value = input_['value']
+
+            if input_['type'] == 'CONST_INPUT':
+                if isinstance(value, list):
+                    value = np.array(value)
+                constant_inputs[name] = value
+            else:
+                value = re.compile(value)
+                non_constant_inputs_mapping[name] = value
+
+        non_constant_inputs = list(non_constant_inputs_mapping.keys())
+        not_config_inputs = list(filter(
+            lambda input_layer: input_layer not in non_constant_inputs + list(constant_inputs.keys()),
+            self.network_inputs.keys()
+        ))
+        if non_constant_inputs and not_config_inputs:
+            raise ConfigError('Input values for {} are not presented in config.'.format(','.join(not_config_inputs)))
+        non_constant_inputs += not_config_inputs
+
+        return constant_inputs, non_constant_inputs, non_constant_inputs_mapping or None
+
+    def _transform_batch(self, batch_data, meta):
+        def calculate_num_splits(layers_data, batch_size):
+            max_split_num = 1
+            for _, data in layers_data.items():
+                total_tiles_num = 0
+                for tiles in data:
+                    total_tiles_num += len(tiles)
+
+                offset = 0 if total_tiles_num % batch_size == 0 else 1
+                splits_for_layer = (total_tiles_num // batch_size) + offset
+                if max_split_num < splits_for_layer:
+                    max_split_num = splits_for_layer
+
+            return max_split_num
+
+        def separate_data(data, num_splits):
+            grouped_data = [[] for _ in range(num_splits)]
+            for data_part in data:
+                for split_id, data_split in enumerate(data_part):
+                    grouped_data[split_id % num_splits].append(data_split)
+            return grouped_data
+
+        batch_size = len(meta)
+        if meta[0].get('multi_infer', False):
+            num_splits = calculate_num_splits(batch_data, batch_size)
+            infers_data = [{} for _ in range(num_splits)]
+            for layer_name, layer_data in batch_data.items():
+                batch_for_all_infers = separate_data(layer_data, num_splits)
+                for infer_id, on_infer_batch in enumerate(batch_for_all_infers):
+                    infers_data[infer_id][layer_name] = self.input_transform_func(
+                        on_infer_batch, self.network_inputs[layer_name]
+                    )
+            return infers_data
+
+        for layer_name, layer_data in batch_data.items():
+            batch_data[layer_name] = self.input_transform_func(layer_data, self.network_inputs[layer_name])
+
+        return [batch_data]
diff --git a/tools/accuracy_checker/accuracy_checker/launcher/launcher.py b/tools/accuracy_checker/accuracy_checker/launcher/launcher.py
new file mode 100644
index 0000000..8aa4436
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/launcher/launcher.py
@@ -0,0 +1,149 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from ..config import BaseField, ConfigError
+from ..adapters import Adapter, AdapterField
+from ..config import ConfigValidator, StringField, ListField
+from ..dependency import ClassProvider, provide
+
+
+class Launcher(ClassProvider):
+    """
+    Interface for model inference.
+    """
+
+    __provider_type__ = 'launcher'
+
+    adapter = provide(Adapter)
+
+    def __init__(self, config_entry, adapter, *args, **kwargs):
+        self.adapter = adapter
+        self._config = config_entry
+
+    def predict(self, identifiers, data_representation, *args, **kwargs):
+        """
+        Args:
+            identifiers: list of input data identifiers.
+            data_representation: list of input data representations, which contain preprocessed data and its metadata.
+        Returns:
+            raw data from network.
+        """
+
+        raise NotImplementedError
+
+    def release(self):
+        raise NotImplementedError
+
+    @property
+    def batch(self):
+        raise NotImplementedError
+
+    @property
+    def inputs(self):
+        raise NotImplementedError
+
+    def _provide_inputs_info_to_meta(self, meta):
+        meta['input_shape'] = self.inputs
+
+        return meta
+
+
+class InputValidator(ConfigValidator):
+    name = StringField()
+    type = StringField(choices=('CONST_INPUT', 'INPUT'))
+    value = BaseField()
+
+
+class ListInputsField(ListField):
+    def __init__(self, **kwargs):
+        super().__init__(allow_empty=False, value_type=InputValidator('Inputs'), **kwargs)
+
+    def validate(self, entry, field_uri=None):
+        super().validate(entry, field_uri)
+        names_set = set()
+        for input_layer in entry:
+            input_name = input_layer['name']
+            if input_name not in names_set:
+                names_set.add(input_name)
+            else:
+                self.raise_error(entry, field_uri, '{} repeated name'.format(input_name))
+
+
+class LauncherConfig(ConfigValidator):
+    """
+    Specifies common part of configuration structure for launchers.
+    """
+
+    framework = StringField(choices=Launcher.providers)
+    tags = ListField(allow_empty=False, optional=True)
+    inputs = ListInputsField(optional=True)
+    adapter = AdapterField()
+
+
+def unsupported_launcher(name, error_message=None):
+    class UnsupportedLauncher(Launcher):
+        __provider__ = name
+
+        def __init__(self, config_entry, adapter, *args, **kwargs):
+            super().__init__(config_entry, adapter, *args, **kwargs)
+
+            msg = "{launcher} launcher is disabled. Please install {launcher} to enable it.".format(launcher=name)
+            raise ValueError(error_message or msg)
+
+        def predict(self, identifiers, data, *args, **kwargs):
+            raise NotImplementedError
+
+        def release(self):
+            raise NotImplementedError
+
+        @property
+        def batch(self):
+            raise NotImplementedError
+
+    return UnsupportedLauncher
+
+
+def create_launcher(launcher_config, dataset_meta=None):
+    """
+    Args:
+        launcher_config: launcher configuration file entry.
+        dataset_meta: metadata dictionary for dataset annotation.
+    Returns:
+        framework-specific launcher object.
+    """
+
+    launcher_config_validator = LauncherConfig(
+        'Launcher_validator',
+        on_extra_argument=ConfigValidator.IGNORE_ON_EXTRA_ARGUMENT
+    )
+    launcher_config_validator.validate(launcher_config)
+
+    label_map = None
+    if dataset_meta:
+        label_map = dataset_meta.get('label_map')
+
+    config_framework = launcher_config['framework']
+    config_adapter = launcher_config.get('adapter')
+    if not config_adapter:
+        adapter = None
+    elif isinstance(config_adapter, str):
+        adapter = Adapter.provide(config_adapter, launcher_config, label_map=label_map)
+    elif isinstance(config_adapter, dict):
+        adapter = Adapter.provide(config_adapter['type'], config_adapter, label_map=label_map)
+    else:
+        raise ConfigError('adapter entry must be either a string or a dictionary')
+
+    return Launcher.provide(config_framework, launcher_config, adapter=adapter)
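+
+# Usage sketch (hypothetical config entry; adapter resolution follows the logic
+# above, and extra launcher-specific keys are passed through):
+#
+#   launcher = create_launcher({'framework': 'dlsdk', 'device': 'CPU',
+#                               'model': 'model.xml', 'weights': 'model.bin',
+#                               'adapter': 'classification'})
+#   raw_outputs = launcher.predict(identifiers, data_representation_batch)
+#   launcher.release()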
+ """ + + launcher_config_validator = LauncherConfig( + 'Launcher_validator', + on_extra_argument=ConfigValidator.IGNORE_ON_EXTRA_ARGUMENT + ) + launcher_config_validator.validate(launcher_config) + + label_map = None + if dataset_meta: + label_map = dataset_meta.get('label_map') + + config_framework = launcher_config['framework'] + config_adapter = launcher_config.get('adapter') + if not config_adapter: + adapter = None + elif isinstance(config_adapter, str): + adapter = Adapter.provide(config_adapter, launcher_config, label_map=label_map) + elif isinstance(config_adapter, dict): + adapter = Adapter.provide(config_adapter['type'], config_adapter, label_map=label_map) + else: + raise ConfigError + + return Launcher.provide(config_framework, launcher_config, adapter=adapter) diff --git a/tools/accuracy_checker/accuracy_checker/launcher/loaders/__init__.py b/tools/accuracy_checker/accuracy_checker/launcher/loaders/__init__.py new file mode 100644 index 0000000..98217dd --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/launcher/loaders/__init__.py @@ -0,0 +1,26 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from .loader import Loader + +from .pickle_loader import PickleLoader +from .xml_loader import XMLLoader + +__all__ = [ + 'Loader', + 'PickleLoader', + 'XMLLoader', +] diff --git a/tools/accuracy_checker/accuracy_checker/launcher/loaders/loader.py b/tools/accuracy_checker/accuracy_checker/launcher/loaders/loader.py new file mode 100644 index 0000000..7c07394 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/launcher/loaders/loader.py @@ -0,0 +1,54 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from pathlib import Path + +from ...dependency import ClassProvider + + +class Loader(ClassProvider): + """ + Interface that describes loading output from another tool. 
+ """ + + __provider_type__ = 'loader' + + def __init__(self, data_path: Path): + self._data_path = data_path + + def __len__(self): + raise NotImplementedError + + def __getitem__(self, item): + raise NotImplementedError + + +class DictLoaderMixin: + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.data = self.load() + + def __len__(self): + return len(self.data) + + def __getitem__(self, item): + if item not in self.data: + raise IndexError('There is no prediction object for "{}" input data'.format(item)) + + return self.data[item] + + def load(self): + raise NotImplementedError diff --git a/tools/accuracy_checker/accuracy_checker/launcher/loaders/pickle_loader.py b/tools/accuracy_checker/accuracy_checker/launcher/loaders/pickle_loader.py new file mode 100644 index 0000000..ba3578b --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/launcher/loaders/pickle_loader.py @@ -0,0 +1,34 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from ...utils import read_pickle +from .loader import Loader, DictLoaderMixin + + +class PickleLoader(DictLoaderMixin, Loader): + """ + Class for loading output from another tool in .pickle format. + """ + + __provider__ = 'pickle' + + def load(self): + data = read_pickle(self._data_path) + + if isinstance(data, list) and all(hasattr(entry, 'identifier') for entry in data): + return dict(zip([representation.identifier for representation in data], data)) + + return data diff --git a/tools/accuracy_checker/accuracy_checker/launcher/loaders/xml_loader.py b/tools/accuracy_checker/accuracy_checker/launcher/loaders/xml_loader.py new file mode 100644 index 0000000..13c0de9 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/launcher/loaders/xml_loader.py @@ -0,0 +1,29 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from ...utils import read_xml +from .loader import Loader, DictLoaderMixin + + +class XMLLoader(DictLoaderMixin, Loader): + """ + Class for loading output from another tool in .xml format. 
+ """ + + __provider__ = 'xml' + + def load(self): + return read_xml(self._data_path) diff --git a/tools/accuracy_checker/accuracy_checker/launcher/model_conversion.py b/tools/accuracy_checker/accuracy_checker/launcher/model_conversion.py new file mode 100644 index 0000000..d87f4ab --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/launcher/model_conversion.py @@ -0,0 +1,196 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import sys +import subprocess +from pathlib import Path +from typing import Union +from ..utils import get_path, format_key + + +def convert_model(topology_name, model=None, weights=None, + framework='caffe', mo_search_paths=None, mo_params=None, mo_flags=None, + tf_custom_op_config_dir=None, tf_object_detection_api_config_dir=None): + """ + Args: + topology_name: name for converted model files. + model: path to the topology file. + weights: path to the weights file. + framework: framework name for original model. + mo_search_paths: paths where ModelOptimizer may be found. If None only default paths is used. + mo_params: value parameters for ModelOptimizer execution. + mo_flags: flags parameters for ModelOptimizer execution. + tf_custom_op_config_dir: path to Tensor Flow custom operations directory. + tf_object_detection_api_config_dir: path to Tensor Flow directory with config for object detection API. + Returns: + paths to converted to IE IR model and weights. + """ + + mo_params = mo_params or {} + mo_flags = mo_flags or [] + + set_topology_name(mo_params, topology_name) + + model_optimizer_executable = find_mo(mo_search_paths) + if not model_optimizer_executable: + raise EnvironmentError( + 'Model optimizer not found. 
diff --git a/tools/accuracy_checker/accuracy_checker/launcher/model_conversion.py b/tools/accuracy_checker/accuracy_checker/launcher/model_conversion.py
new file mode 100644
index 0000000..d87f4ab
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/launcher/model_conversion.py
@@ -0,0 +1,196 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import sys
+import subprocess
+from pathlib import Path
+from typing import Union
+from ..utils import get_path, format_key
+
+
+def convert_model(topology_name, model=None, weights=None,
+                  framework='caffe', mo_search_paths=None, mo_params=None, mo_flags=None,
+                  tf_custom_op_config_dir=None, tf_object_detection_api_config_dir=None):
+    """
+    Args:
+        topology_name: name for converted model files.
+        model: path to the topology file.
+        weights: path to the weights file.
+        framework: framework name for original model.
+        mo_search_paths: paths where Model Optimizer may be found. If None, only default paths are used.
+        mo_params: value parameters for Model Optimizer execution.
+        mo_flags: flag parameters for Model Optimizer execution.
+        tf_custom_op_config_dir: path to TensorFlow custom operations directory.
+        tf_object_detection_api_config_dir: path to TensorFlow directory with config for object detection API.
+    Returns:
+        paths to the model and weights converted to IE IR.
+    """
+
+    mo_params = mo_params or {}
+    mo_flags = mo_flags or []
+
+    set_topology_name(mo_params, topology_name)
+
+    model_optimizer_executable = find_mo(mo_search_paths)
+    if not model_optimizer_executable:
+        raise EnvironmentError(
+            'Model Optimizer not found. Please set the MO_DIR environment variable to the Model Optimizer '
+            'installation folder or refer to the command line options help for providing Model Optimizer'
+        )
+
+    framework_specific_options = {
+        'caffe': {'input_model': weights, 'input_proto': model},
+        'mxnet': {'input_model': weights},
+        'tf': {'input_model': model},
+        'onnx': {'input_model': model},
+        'kaldi': {'input_model': model}
+    }
+
+    mo_params['framework'] = framework
+    mo_params.update(framework_specific_options.get(framework, {}))
+
+    set_path_to_custom_operation_configs(mo_params, framework, tf_custom_op_config_dir, model_optimizer_executable)
+    set_path_to_object_detection_api_pipeline_config(mo_params, framework, tf_object_detection_api_config_dir)
+    args = prepare_args(str(model_optimizer_executable), flag_options=mo_flags, value_options=mo_params)
+
+    code = exec_mo_binary(args)
+
+    if code.returncode != 0:
+        raise RuntimeError("Model Optimizer conversion failed: Model Optimizer returned non-zero code")
+
+    model_file, bin_file = find_dlsdk_ir(
+        get_path(mo_params.get('output_dir', Path.cwd()), is_directory=True), mo_params['model_name']
+    )
+    if not bin_file or not model_file:
+        raise RuntimeError("Model Optimizer finished correctly, but the converted model was not found.")
+
+    return model_file, bin_file
+
+
+def find_dlsdk_ir(search_path: Path, model_name):
+    """
+    Args:
+        search_path: path with IE IR of model.
+        model_name: name of the model.
+    Returns:
+        paths to IE IR of model.
+    """
+
+    xml_file = search_path / '{}.xml'.format(model_name)
+    bin_file = search_path / '{}.bin'.format(model_name)
+
+    return get_path(xml_file), get_path(bin_file)
+
+
+def find_mo(search_paths=None) -> Union[Path, None]:
+    """
+    Args:
+        search_paths: paths where Model Optimizer may be found. If None, only default paths are used.
+    Returns:
+        path to the Model Optimizer or None if it wasn't found.
+    """
+
+    default_mo_path = ('intel', 'computer_vision_sdk', 'deployment_tools', 'model_optimizer')
+    default_paths = [Path.home().joinpath(*default_mo_path), Path('/opt').joinpath(*default_mo_path)]
+
+    executable = 'mo.py'
+    for path in search_paths or default_paths:
+        path = Path(path)
+        if not path.is_dir():
+            continue
+
+        mo = path / executable
+        if not mo.is_file():
+            continue
+
+        return mo
+
+    return None
+
+
+def prepare_args(executable, flag_options=None, value_options=None):
+    """
+    Args:
+        executable: path to the executable.
+        flag_options: positional arguments for executable.
+        value_options: keyword arguments for executable.
+    Returns:
+        list with command-line entries.
+    """
+
+    result = [sys.executable, executable]
+
+    for flag_option in flag_options or []:
+        result.append(str(format_key(flag_option)))
+
+    for key, value in (value_options or {}).items():
+        result.append(str(format_key(key)))
+        result.append(str(value))
+
+    return result
+
+
+def exec_mo_binary(args, timeout=None):
+    """
+    Args:
+        args: command-line entries.
+        timeout: timeout for execution.
+    Returns:
+        result of execution.
+ """ + + return subprocess.run(args, check=False, timeout=timeout) + + +def set_path_to_custom_operation_configs(mo_params, framework, tf_custom_op_config_dir, mo_path): + if framework != 'tf': + return mo_params + + config_path = mo_params.get('tensorflow_use_custom_operations_config') + if not config_path: + return mo_params + + if tf_custom_op_config_dir: + tf_custom_op_config_dir = Path(tf_custom_op_config_dir) + else: + tf_custom_op_config_dir = Path('/').joinpath(*mo_path.parts[:-1]) / 'extensions' / 'front' / 'tf' + + config_path = Path(config_path) + if not config_path.is_absolute(): + config_path = tf_custom_op_config_dir / config_path + + mo_params['tensorflow_use_custom_operations_config'] = str(get_path(config_path)) + + return mo_params + + +def set_path_to_object_detection_api_pipeline_config(mo_params, framework, object_detection_api_config_dir=None): + object_detection_api_config = mo_params.get('tensorflow_object_detection_api_pipeline_config') + if framework != 'tf' or not object_detection_api_config: + return mo_params + + object_detection_api_config_dir = Path(object_detection_api_config_dir or get_path(mo_params['input_model']).parent) + config_path = object_detection_api_config_dir / object_detection_api_config + mo_params['tensorflow_object_detection_api_pipeline_config'] = str(get_path(config_path)) + + return mo_params + + +def set_topology_name(mo_params, topology_name): + if not mo_params.get('model_name'): + mo_params['model_name'] = topology_name + + return mo_params diff --git a/tools/accuracy_checker/accuracy_checker/logging.py b/tools/accuracy_checker/accuracy_checker/logging.py new file mode 100644 index 0000000..cf25579 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/logging.py @@ -0,0 +1,134 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import logging +import logging.config +import os +import sys +import warnings + +_DEFAULT_LOGGER_NAME = 'accuracy_checker' +_DEFAULT_LOG_FILE = 'accuracy_checker.log' + +PRINT_INFO = logging.INFO + 5 +logging.addLevelName(PRINT_INFO, "PRINT_INFO") + +_LOG_LEVEL_ENVIRON = "ACCURACY_CHECKER_LOG_LEVEL" +_LOGGING_LEVEL = logging.getLevelName(os.environ.get(_LOG_LEVEL_ENVIRON, PRINT_INFO)) + + +class LoggingFormatter(logging.Formatter): + def format(self, record: logging.LogRecord): + if record.levelno == PRINT_INFO: + return record.msg + return super().format(record) + + +class ConsoleHandler(logging.StreamHandler): + def __init__(self, default_stream=sys.stdout): + super().__init__(default_stream) + self.default_stream = default_stream + self.err_stream = sys.stderr + + def emit(self, record): + if record.levelno >= logging.WARNING: + self.stream = self.err_stream + else: + self.stream = self.default_stream + super().emit(record) + + +_LOGGING_CONFIGURATION = { + 'loggers': { + _DEFAULT_LOGGER_NAME: { + 'handlers': ['console'], + 'level': _LOGGING_LEVEL, + 'propagate': False + } + }, + 'version': 1, + 'disable_existing_loggers': False, + 'formatters': { + 'default': { + '()': LoggingFormatter, + 'format': '%(asctime)s %(name)s %(levelname)s: %(message)s', + 'datefmt': '%H:%M:%S' + }, + 'detailed': { + 'format': '%(asctime)s %(name)s %(levelname)s: %(message)s' + } + }, + 'handlers': { + 'console': { + 'level': 'DEBUG', + '()': ConsoleHandler, + 'formatter': 'default', + } + } +} + +logging.config.dictConfig(_LOGGING_CONFIGURATION) + +_default_logger = logging.getLogger(_DEFAULT_LOGGER_NAME) + + +def _warning_handler(message, category, filename, line_number): + s = warnings.formatwarning(message, category, filename, line_number) + _default_logger.warning(s) + + +warnings.showwarning = _warning_handler + + +def get_logger(logger_name: str): + if logger_name.startswith(_DEFAULT_LOGGER_NAME): + return _default_logger.getChild(logger_name) + return logging.getLogger(logger_name) + + +def error(msg, *args, **kwargs): + _default_logger.error(msg, *args, **kwargs) + + +def warning(msg, *args, raise_warning=True, **kwargs): + if raise_warning: + warnings.warn(msg) + else: + _default_logger.warning(msg, *args, **kwargs) + + +def info(msg, *args, **kwargs): + _default_logger.info(msg, *args, **kwargs) + + +def debug(msg, *args, **kwargs): + _default_logger.debug(msg, *args, **kwargs) + + +def print_info(msg, *args, **kwargs): + _default_logger.log(PRINT_INFO, msg, *args, **kwargs) + + +def add_file_handler(file_name): + file_info_handler_config = { + 'level': 'PRINT_INFO', + 'class': 'logging.handlers.WatchedFileHandler', + 'formatter': 'default', + 'filename': file_name + } + _LOGGING_CONFIGURATION['handlers']['file_info'] = file_info_handler_config + _LOGGING_CONFIGURATION['loggers'][_DEFAULT_LOGGER_NAME]['handlers'].append('file_info') + logging.config.dictConfig(_LOGGING_CONFIGURATION) diff --git a/tools/accuracy_checker/accuracy_checker/main.py b/tools/accuracy_checker/accuracy_checker/main.py new file mode 100644 index 0000000..61fe524 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/main.py @@ -0,0 +1,216 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from pathlib import Path
+from argparse import ArgumentParser
+from functools import partial
+
+from .config import ConfigReader
+from .logging import print_info, add_file_handler
+from .model_evaluator import ModelEvaluator
+from .progress_reporters import ProgressReporter
+from .utils import get_path
+
+
+def build_arguments_parser():
+    parser = ArgumentParser(description='NN Validation on Caffe and IE', allow_abbrev=False)
+    parser.add_argument(
+        '-d', '--definitions',
+        help='path to the yml file with definitions',
+        type=get_path,
+        required=False
+    )
+    parser.add_argument(
+        '-c', '--config',
+        help='path to the yml file with local configuration',
+        type=get_path,
+        required=True
+    )
+    parser.add_argument(
+        '-m', '--models',
+        help='prefix path to the models and weights',
+        type=partial(get_path, is_directory=True),
+        default=Path.cwd(),
+        required=False
+    )
+    parser.add_argument(
+        '-s', '--source',
+        help='prefix path to the data source',
+        type=partial(get_path, is_directory=True),
+        default=Path.cwd(),
+        required=False
+    )
+    parser.add_argument(
+        '-a', '--annotations',
+        help='prefix path to the converted annotations and dataset meta data',
+        type=partial(get_path, is_directory=True),
+        default=Path.cwd(),
+        required=False
+    )
+    parser.add_argument(
+        '-e', '--extensions',
+        help='prefix path to extensions folder',
+        type=partial(get_path, is_directory=True),
+        default=Path.cwd(),
+        required=False
+    )
+    parser.add_argument(
+        '--cpu_extensions_mode',
+        help='preferred set of processor instructions used when automatically searching for the cpu extensions lib',
+        required=False,
+        choices=['avx2', 'sse4']
+    )
+    parser.add_argument(
+        '-b', '--bitstreams',
+        help='prefix path to bitstreams folder',
+        type=partial(get_path, is_directory=True),
+        default=Path.cwd(),
+        required=False
+    )
+    parser.add_argument(
+        '--stored_predictions',
+        help='path to file with saved predictions. Used for development',
+        # the file does not exist on the first run and is created later,
+        # so we cannot always check its existence
+        required=False
+    )
+    parser.add_argument(
+        '-C', '--converted_models',
+        help='directory to store Model Optimizer converted models. Used for DLSDK launcher only',
+        type=partial(get_path, is_directory=True),
+        default=Path.cwd(),
+        required=False
+    )
+    parser.add_argument(
+        '-M', '--model_optimizer',
+        help='path to model optimizer directory',
+        type=partial(get_path, is_directory=True),
+        # there is no default value because if the user did not specify it we use specific locations
+        # defined in model_conversion.py
+        required=False
+    )
+    parser.add_argument(
+        '--tf_custom_op_config_dir',
+        help='path to directory with tensorflow custom operation configuration files for model optimizer',
+        type=partial(get_path, is_directory=True),
+        # there is no default value because if the user did not specify it we use a specific location
+        # defined in model_conversion.py
+        required=False
+    )
+    parser.add_argument(
+        '--tf_obj_detection_api_pipeline_config_path',
+        help='path to directory with tensorflow object detection api pipeline configuration files for model optimizer',
+        type=partial(get_path, is_directory=True),
+        # there is no default value because if the user did not specify it we use a specific location
+        # defined in model_conversion.py
+        required=False
+    )
+    parser.add_argument(
+        '--progress',
+        help='progress reporter',
+        required=False,
+        default='bar'
+    )
+    parser.add_argument(
+        '-tf', '--target_framework',
+        help='framework for inference',
+        required=False
+    )
+    parser.add_argument(
+        '-td', '--target_devices',
+        help='space-separated list of devices for inference',
+        required=False,
+        nargs='+'
+    )
+
+    parser.add_argument(
+        '-tt', '--target_tags',
+        help='space-separated list of launcher tags for inference',
+        required=False,
+        nargs='+'
+    )
+
+    parser.add_argument(
+        '-l', '--log_file',
+        help='file for additional logging results',
+        required=False
+    )
+
+    parser.add_argument(
+        '--ignore_result_formatting',
+        help='allows getting raw metric results without data formatting',
+        required=False,
+        default=False
+    )
+
+    parser.add_argument(
+        '-am', '--affinity_map',
+        help='prefix path to the affinity maps',
+        type=partial(get_path, is_directory=True),
+        default=Path.cwd(),
+        required=False
+    )
+
+    parser.add_argument(
+        '--aocl',
+        help='aocl executable path for FPGA bitstream programming',
+        type=get_path,
+        required=False
+    )
+
+    return parser
+
+
+def main():
+    args = build_arguments_parser().parse_args()
+    progress_reporter = ProgressReporter.provide((
+        args.progress if ':' not in args.progress
+        else args.progress.split(':')[0]
+    ))
+    if args.log_file:
+        add_file_handler(args.log_file)
+
+    config = ConfigReader.merge(args)
+
+    for model in config['models']:
+        for launcher_config in model['launchers']:
+            for dataset_config in model['datasets']:
+                print_processing_info(
+                    model['name'],
+                    launcher_config['framework'],
+                    launcher_config['device'],
+                    launcher_config.get('tags'),
+                    dataset_config['name']
+                )
+                model_evaluator = ModelEvaluator.from_configs(launcher_config, dataset_config)
+                progress_reporter.reset(len(model_evaluator.dataset))
+                model_evaluator.process_dataset(args.stored_predictions, progress_reporter=progress_reporter)
+                model_evaluator.compute_metrics(ignore_results_formatting=args.ignore_result_formatting)
+
+                model_evaluator.release()
+
+
+def print_processing_info(model, launcher, device, tags, dataset):
+    print_info('Processing info:')
+    print_info('model: {}'.format(model))
+    print_info('launcher: {}'.format(launcher))
+    if tags:
+        print_info('launcher tags: {}'.format(' '.join(tags)))
+    print_info('device: {}'.format(device))
+    print_info('dataset: {}'.format(dataset))
+
+
+if __name__ == '__main__':
+    main()
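+
+# Typical invocation (a sketch; paths are placeholders and assume the package
+# is installed or importable):
+#
+#   python -m accuracy_checker.main -c config.yml -m /models -s /datasets \
+#       -a /annotations -td CPU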
diff --git a/tools/accuracy_checker/accuracy_checker/metrics/README.md b/tools/accuracy_checker/accuracy_checker/metrics/README.md
new file mode 100644
index 0000000..c1381b2
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/metrics/README.md
@@ -0,0 +1,127 @@
+# Metrics
+
+Metrics require a specific representation format to work correctly
+(e.g. `map` expects a detection annotation and a detection prediction for evaluation).
+
+If you use a complex representation located in a representation container, you need to add the options `annotation_source` and `prediction_source` to the configuration file to
+select a specific representation; otherwise, metric calculation is possible only if the container has exactly one suitable representation, which is then resolved automatically.
+`annotation_source` and `prediction_source` should contain only one annotation identifier and output layer name, respectively.
+You may optionally provide a `reference` field for a metric if you want the calculated value tested against a specific value (e.g. one reported in the canonical paper), together with an acceptable `threshold` for the metric's deviation from the reference value.
+
+Every metric has parameters available for configuration.
+
+Accuracy Checker supports the following set of metrics:
+
+* `accuracy` - classification accuracy metric, defined as the number of correct predictions divided by the total number of predictions.
+Supported representation: `ClassificationAnnotation`, `ClassificationPrediction`
+  * `top_k` - the number of classes with the highest probability which will be used to decide whether the prediction is correct.
+* `accuracy_per_class` - classification accuracy metric which represents results for each class. Supported representation: `ClassificationAnnotation`, `ClassificationPrediction`.
+  * `top_k` - the number of classes with the highest probability which will be used to decide whether the prediction is correct.
+  * `label_map` - the field in annotation metadata which contains the dataset label map.
+* `character_recognition_accuracy` - accuracy metric for character recognition task. Supported representation: `CharacterRecognitionAnnotation`, `CharacterRecognitionPrediction`.
+* `map` - mean average precision. Supported representations: `DetectionAnnotation`, `DetectionPrediction`.
+  * `overlap_threshold` - minimal value of intersection over union for deciding that a prediction bounding box is a true positive.
+  * `overlap_method` - method for calculating bbox overlap. You can choose between intersection over union (`iou`), defined as area of intersection divided by union of annotation and prediction box areas, and intersection over area (`ioa`), defined as area of intersection divided by area of the prediction box.
+  * `include_boundaries` - allows including boundaries in the overlap calculation process. If True, box width and height are calculated as max - min + 1.
+  * `ignore_difficult` - allows ignoring difficult annotation boxes in metric calculation. In this case, difficult boxes are annotations filtered at the postprocessing stage.
+  * `distinct_conf` - select only values for distinct confidences.
+  * `allow_multiple_matches_per_ignored` - allows multiple matches per ignored.
+  * `label_map` - the field in annotation metadata which contains the dataset label map.
+  * `integral` - integral type for average precision calculation. Pascal VOC `11point` and `max` approaches are available.
+* `miss_rate` - miss rate metric of detection models. Supported representations: `DetectionAnnotation`, `DetectionPrediction`.
+  * `overlap_threshold` - minimal value of intersection over union for deciding that a prediction bounding box is a true positive.
+  * `overlap_method` - method for calculating bbox overlap. You can choose between intersection over union (`iou`), defined as area of intersection divided by union of annotation and prediction box areas, and intersection over area (`ioa`), defined as area of intersection divided by area of the prediction box.
+  * `include_boundaries` - allows including boundaries in the overlap calculation process. If True, box width and height are calculated as max - min + 1.
+  * `ignore_difficult` - allows ignoring difficult annotation boxes in metric calculation. In this case, difficult boxes are annotations filtered at the postprocessing stage.
+  * `distinct_conf` - select only values for distinct confidences.
+  * `allow_multiple_matches_per_ignored` - allows multiple matches per ignored.
+  * `label_map` - the field in annotation metadata which contains the dataset label map.
+  * `fppi_level` - false positives per image level.
+* `recall` - recall metric of detection models. Supported representations: `DetectionAnnotation`, `DetectionPrediction`.
+  * `overlap_threshold` - minimal value of intersection over union for deciding that a prediction bounding box is a true positive.
+  * `overlap_method` - method for calculating bbox overlap. You can choose between intersection over union (`iou`), defined as area of intersection divided by union of annotation and prediction box areas, and intersection over area (`ioa`), defined as area of intersection divided by area of the prediction box.
+  * `include_boundaries` - allows including boundaries in the overlap calculation process. If True, box width and height are calculated as max - min + 1.
+  * `ignore_difficult` - allows ignoring difficult annotation boxes in metric calculation. In this case, difficult boxes are annotations filtered at the postprocessing stage.
+  * `distinct_conf` - select only values for distinct confidences.
+  * `allow_multiple_matches_per_ignored` - allows multiple matches per ignored.
+  * `label_map` - the field in annotation metadata which contains the dataset label map.
+* `detection_accuracy` - accuracy for detection models. Supported representations: `DetectionAnnotation`, `DetectionPrediction`.
+  * `overlap_threshold` - minimal value of intersection over union for deciding that a prediction bounding box is a true positive.
+  * `overlap_method` - method for calculating bbox overlap. You can choose between intersection over union (`iou`), defined as area of intersection divided by union of annotation and prediction box areas, and intersection over area (`ioa`), defined as area of intersection divided by area of the prediction box.
+  * `include_boundaries` - allows including boundaries in the overlap calculation process. If True, box width and height are calculated as max - min + 1.
+  * `label_map` - the field in annotation metadata which contains the dataset label map.
+  * `use_normalization` - allows normalizing the confusion matrix for metric calculation.
+* `segmentation_accuracy` - pixel accuracy for semantic segmentation models. Supported representations: `SegmentationAnnotation`, `SegmentationPrediction`.
+  * `use_argmax` - allows using argmax for the prediction mask.
+* `mean_iou` - mean intersection over union for semantic segmentation models. Supported representations: `SegmentationAnnotation`, `SegmentationPrediction`.
+  * `use_argmax` - allows using argmax for the prediction mask.
+* `mean_accuracy` - mean accuracy for semantic segmentation models. Supported representations: `SegmentationAnnotation`, `SegmentationPrediction`.
+  * `use_argmax` - allows using argmax for the prediction mask.
+* `frequency_weighted_accuracy` - frequency weighted accuracy for semantic segmentation models. Supported representations: `SegmentationAnnotation`, `SegmentationPrediction`.
+  * `use_argmax` - allows using argmax for the prediction mask.
+You can find more detailed information about segmentation metric calculation [here][segmentation_article].
+* `cmc` - Cumulative Matching Characteristics (CMC) score. Supported representations: `ReIdentificationAnnotation`, `ReIdentificationPrediction`.
+  * `top_k` - number of k highest ranked samples to consider when matching.
+  * `separate_camera_set` - should identities from the same camera view be filtered out.
+  * `single_gallery_shot` - each identity has only one instance in the gallery.
+  * `number_single_shot_repeats` - number of repeats for the single_gallery_shot setting (required for CUHK).
+  * `first_match_break` - break on the first matched gallery sample.
+* `reid_map` - Mean Average Precision score for object reidentification. Supported representations: `ReIdentificationAnnotation`, `ReIdentificationPrediction`.
+  * `uninterpolated_auc` - should the area under the precision-recall curve be computed using the trapezoidal rule or directly.
+* `pairwise_accuracy` - pairwise accuracy for object reidentification. Supported representations: `ReIdentificationClassificationAnnotation`, `ReIdentificationPrediction`.
+  * `min_score` - min score for determining that objects are different. You can provide a value or use `train_median`, which will be calculated if the annotation has a training subset.
+* `pairwise_accuracy_subsets` - object reidentification pairwise accuracy with the dataset divided into test and train subsets for calculating the mean score. Supported representations: `ReIdentificationClassificationAnnotation`, `ReIdentificationPrediction`.
+  * `subset_number` - number of subsets for separation.
+* `mae` - [Mean Absolute Error][mae]. Supported representations: `RegressionAnnotation`, `RegressionPrediction`.
+* `mae_on_intervals` - Mean Absolute Error estimated for a specific value range. Supported representations: `RegressionAnnotation`, `RegressionPrediction`.
+  * `intervals` - comma-separated list of interval boundaries.
+  * `ignore_values_not_in_interval` - allows creating additional intervals for values less than the minimal interval boundary and greater than the maximal one.
+  * `start`, `step`, `end` - way to generate a range of intervals from `start` to `end` with step `step`.
+* `mse` - [Mean Squared Error][mse]. Supported representations: `RegressionAnnotation`, `RegressionPrediction`.
+* `mse_on_intervals` - Mean Squared Error estimated for a specific value range. Supported representations: `RegressionAnnotation`, `RegressionPrediction`.
+  * `intervals` - comma-separated list of interval boundaries.
+  * `ignore_values_not_in_interval` - allows creating additional intervals for values less than the minimal interval boundary and greater than the maximal one.
+  * `start`, `step`, `end` - generate a range of intervals from `start` to `end` with step `step`.
+* `rmse` - [Root Mean Squared Error][rmse]. Supported representations: `RegressionAnnotation`, `RegressionPrediction`.
+* `rmse_on_intervals` - Root Mean Squared Error estimated for a specific value range. Supported representations: `RegressionAnnotation`, `RegressionPrediction`.
+  * `intervals` - comma-separated list of interval boundaries.
+  * `ignore_values_not_in_interval` - allows creating additional intervals for values less than the minimal interval boundary and greater than the maximal one.
+  * `start`, `step`, `end` - generate a range of intervals from `start` to `end` with step `step`.
+* `per_point_normed_error` - Normed Error for measuring the quality of landmark positions. Results are estimated for each point independently. Supported representations: `FacialLandmarksAnnotation`, `FacialLandmarksPrediction`.
+* `normed_error` - Normed Error for measuring the quality of landmark positions. Supported representations: `FacialLandmarksAnnotation`, `FacialLandmarksPrediction`.
+  * `calculate_std` - allows calculation of standard deviation (default value: `False`)
+  * `percentile` - calculate error rate for the given percentile.
+* `per_point_regression` - Root Mean Squared Error for 2D points, with results estimated for each point independently. Supported representations: `PointRegressionAnnotation`, `PointRegressionPrediction`.
+  * `scaling_distance` - comma-separated list of 2 point indexes; the distance between these points will be used for scaling regression distances.
+* `average point error` - Root Mean Squared Error for 2D points, with results averaged over all points. Supported representations: `PointRegressionAnnotation`, `PointRegressionPrediction`.
+  * `scaling_distance` - comma-separated list of 2 point indexes; the distance between these points will be used for scaling regression distances.
+* `multi_accuracy` - accuracy for multilabel recognition task. Supported representations: `MultiLabelRecognitionAnnotation`, `MultiLabelRecognitionPrediction`.
+  * `label_map` - the field in annotation metadata which contains the dataset label map.
+  * `calculate_average` - allows calculation of average accuracy (default value: `True`).
+* `multi_precision` - precision metric for multilabel recognition. Supported representations: `MultiLabelRecognitionAnnotation`, `MultiLabelRecognitionPrediction`.
+  * `label_map` - the field in annotation metadata which contains the dataset label map.
+  * `calculate_average` - allows calculation of average precision (default value: `True`).
+* `multi_recall` - recall metric for multilabel recognition. Supported representations: `MultiLabelRecognitionAnnotation`, `MultiLabelRecognitionPrediction`.
+  * `label_map` - the field in annotation metadata which contains the dataset label map.
+  * `calculate_average` - allows calculation of average recall (default value: `True`).
+* `f1_score` - [F score][f_score] metric for multilabel recognition. Supported representations: `MultiLabelRecognitionAnnotation`, `MultiLabelRecognitionPrediction`.
+  * `label_map` - the field in annotation metadata which contains the dataset label map.
+  * `calculate_average` - allows calculation of average f-score (default value: `True`).
+* `text_detection` - Harmonic mean of precision and recall for text detection task. Supported representations: `TextDetectionAnnotation`, `TextDetectionPrediction`.
+  * `iou_constrain` - minimal value of intersection over union for deciding that a prediction polygon is a true positive.
+  * `ignore_difficult` - allows ignoring difficult ground truth text polygons in metric calculation.
+  * `area_precision_constrain` - minimal value of intersection over union for deciding that a prediction polygon matches an ignored annotation.
+* `coco_precision` - MS COCO Average Precision metric for keypoint recognition and object detection tasks. Supported representations: `PoseEstimationAnnotation`, `PoseEstimationPrediction`, `DetectionAnnotation`, `DetectionPrediction`.
+  * `max_detections` - max number of predicted results per image. If there are more predictions, the results with minimal confidence will be ignored.
+  * `threshold` - intersection over union threshold. You can specify one value or a comma-separated range of values. This parameter supports precomputed values for standard COCO thresholds (`.5`, `.75`, `.5:.05:.95`).
+* `coco_recall` - MS COCO Average Recall metric for keypoint recognition and object detection tasks. Supported representations: `PoseEstimationAnnotation`, `PoseEstimationPrediction`, `DetectionAnnotation`, `DetectionPrediction`.
+  * `max_detections` - max number of predicted results per image. If there are more predictions, the results with minimal confidence will be ignored.
+  * `threshold` - intersection over union threshold. You can specify one value or a comma-separated range of values. This parameter supports precomputed values for standard COCO thresholds (`.5`, `.75`, `.5:.05:.95`).
+* `angle_error` - Mean angle error and standard deviation of angle error for gaze estimation. Supported representations: `GazeVectorAnnotation`, `GazeVectorPrediction`.
+
+[segmentation_article]: https://arxiv.org/pdf/1411.4038v2.pdf
+[mae]: https://en.wikipedia.org/wiki/Mean_absolute_error
+[mse]: https://en.wikipedia.org/wiki/Mean_squared_error
+[rmse]: https://en.wikipedia.org/wiki/Root-mean-square_deviation
+[f_score]: https://en.wikipedia.org/wiki/F1_score
+[psnr]: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio
diff --git a/tools/accuracy_checker/accuracy_checker/metrics/__init__.py b/tools/accuracy_checker/accuracy_checker/metrics/__init__.py
new file mode 100644
index 0000000..8fec449
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/metrics/__init__.py
@@ -0,0 +1,92 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+""" + +from .metric_executor import MetricsExecutor + +from .classification import ClassificationAccuracy, ClassificationAccuracyClasses +from .detection import (DetectionMAP, MissRate, Recall, DetectionAccuracyMetric) +from .reid import CMCScore, ReidMAP, PairwiseAccuracy, PairwiseAccuracySubsets +from .semantic_segmentation import SegmentationAccuracy, SegmentationIOU, SegmentationMeanAccuracy, SegmentationFWAcc +from .character_recognition import CharacterRecognitionAccuracy +from .regression import ( + MeanAbsoluteErrorOnInterval, + MeanSquaredErrorOnInterval, + + MeanAbsoluteError, + MeanSquaredError, + + RootMeanSquaredErrorOnInterval, + RootMeanSquaredError, + + FacialLandmarksPerPointNormedError, + FacialLandmarksNormedError, + + PeakSignalToNoiseRatio, + + AngleError +) +from .multilabel_recognition import MultiLabelRecall, MultiLabelPrecision, MultiLabelAccuracy, F1Score +from .text_detection import TextDetectionMetric +from .coco_metrics import MSCOCOAveragePresicion +from .hit_ratio import HitRatioMetric, NDSGMetric + + +__all__ = [ + 'MetricsExecutor', + + 'ClassificationAccuracy', + 'ClassificationAccuracyClasses', + + 'DetectionMAP', + 'MissRate', + 'Recall', + 'DetectionAccuracyMetric', + + 'CMCScore', + 'ReidMAP', + 'PairwiseAccuracy', + 'PairwiseAccuracySubsets', + + 'SegmentationAccuracy', + 'SegmentationIOU', + 'SegmentationMeanAccuracy', + 'SegmentationFWAcc', + + 'CharacterRecognitionAccuracy', + + 'MeanAbsoluteError', + 'MeanSquaredError', + 'MeanAbsoluteErrorOnInterval', + 'MeanSquaredErrorOnInterval', + 'RootMeanSquaredError', + 'RootMeanSquaredErrorOnInterval', + 'FacialLandmarksPerPointNormedError', + 'FacialLandmarksNormedError', + 'PeakSignalToNoiseRatio', + 'AngleError', + + 'MultiLabelAccuracy', + 'MultiLabelRecall', + 'MultiLabelPrecision', + 'F1Score', + + 'TextDetectionMetric', + + 'MSCOCOAveragePresicion', + + 'HitRatioMetric', + 'NDSGMetric' +] diff --git a/tools/accuracy_checker/accuracy_checker/metrics/average_meter.py b/tools/accuracy_checker/accuracy_checker/metrics/average_meter.py new file mode 100644 index 0000000..3c2e37a --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/metrics/average_meter.py @@ -0,0 +1,46 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import numpy as np + + +class AverageMeter: + def __init__(self, loss=None, counter=None): + self.loss = loss or (lambda x, y: int(x == y)) + self.counter = counter or (lambda x: 1) + self.accumulator = None + self.total_count = None + + def update(self, annotation_val, prediction_val): + loss = self.loss(annotation_val, prediction_val) + increment = self.counter(annotation_val) + + if self.accumulator is None and self.total_count is None: + # wrap in array for using numpy.divide with where attribute + # and support cases when loss function returns list-like object + self.accumulator = np.array(loss, dtype=float) + self.total_count = np.array(increment, dtype=float) + else: + self.accumulator += loss + self.total_count += increment + + def evaluate(self): + if self.total_count is None: + return 0.0 + + return np.divide( + self.accumulator, self.total_count, out=np.zeros_like(self.accumulator), where=self.total_count != 0 + ) diff --git a/tools/accuracy_checker/accuracy_checker/metrics/character_recognition.py b/tools/accuracy_checker/accuracy_checker/metrics/character_recognition.py new file mode 100644 index 0000000..fbb11c8 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/metrics/character_recognition.py @@ -0,0 +1,36 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from ..representation import CharacterRecognitionAnnotation, CharacterRecognitionPrediction +from .metric import PerImageEvaluationMetric +from .average_meter import AverageMeter + + +class CharacterRecognitionAccuracy(PerImageEvaluationMetric): + __provider__ = 'character_recognition_accuracy' + + annotation_types = (CharacterRecognitionAnnotation, ) + prediction_types = (CharacterRecognitionPrediction, ) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.accuracy = AverageMeter(lambda annotation, prediction: int(annotation == prediction)) + + def update(self, annotation, prediction): + self.accuracy.update(annotation.label, prediction.label) + + def evaluate(self, annotations, predictions): + return self.accuracy.evaluate() diff --git a/tools/accuracy_checker/accuracy_checker/metrics/classification.py b/tools/accuracy_checker/accuracy_checker/metrics/classification.py new file mode 100644 index 0000000..7213c71 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/metrics/classification.py @@ -0,0 +1,107 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import numpy as np + +from ..representation import ClassificationAnnotation, ClassificationPrediction +from ..config import NumberField, StringField +from .metric import BaseMetricConfig, PerImageEvaluationMetric +from .average_meter import AverageMeter + + +class ClassificationAccuracy(PerImageEvaluationMetric): + """ + Class for evaluating accuracy metric of classification models. + """ + + __provider__ = 'accuracy' + + annotation_types = (ClassificationAnnotation, ) + prediction_types = (ClassificationPrediction, ) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def loss(annotation_label, prediction_top_k_labels): + return int(annotation_label in prediction_top_k_labels) + self.accuracy = AverageMeter(loss) + + def validate_config(self): + class _AccuracyValidator(BaseMetricConfig): + top_k = NumberField(floats=False, min_value=1, optional=True) + + accuracy_validator = _AccuracyValidator( + 'accuracy', + on_extra_argument=_AccuracyValidator.ERROR_ON_EXTRA_ARGUMENT + ) + accuracy_validator.validate(self.config) + + def configure(self): + self.top_k = self.config.get('top_k', 1) + + def update(self, annotation, prediction): + self.accuracy.update(annotation.label, prediction.top_k(self.top_k)) + + def evaluate(self, annotations, predictions): + return self.accuracy.evaluate() + + +class ClassificationAccuracyClasses(PerImageEvaluationMetric): + """ + Class for evaluating accuracy for each class of classification models. + """ + + __provider__ = 'accuracy_per_class' + + annotation_types = (ClassificationAnnotation, ) + prediction_types = (ClassificationPrediction, ) + + def validate_config(self): + class _AccuracyValidator(BaseMetricConfig): + top_k = NumberField(floats=False, min_value=1, optional=True) + label_map = StringField(optional=True) + + accuracy_validator = _AccuracyValidator( + 'accuracy', + on_extra_argument=_AccuracyValidator.ERROR_ON_EXTRA_ARGUMENT + ) + accuracy_validator.validate(self.config) + + def configure(self): + self.top_k = self.config.get('top_k', 1) + label_map = self.config.get('label_map', 'label_map') + self.labels = self.dataset.metadata.get(label_map) + self.meta['names'] = list(self.labels.values()) + + def loss(annotation_label, prediction_top_k_labels): + result = np.zeros_like(list(self.labels.keys())) + if annotation_label in prediction_top_k_labels: + result[annotation_label] = 1 + + return result + + def counter(annotation_label): + result = np.zeros_like(list(self.labels.keys())) + result[annotation_label] = 1 + return result + + self.accuracy = AverageMeter(loss, counter) + + def update(self, annotation, prediction): + self.accuracy.update(annotation.label, prediction.top_k(self.top_k)) + + def evaluate(self, annotations, predictions): + return self.accuracy.evaluate() diff --git a/tools/accuracy_checker/accuracy_checker/metrics/coco_metrics.py b/tools/accuracy_checker/accuracy_checker/metrics/coco_metrics.py new file mode 100644 index 0000000..8ed2237 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/metrics/coco_metrics.py @@ -0,0 +1,322 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from functools import singledispatch +from typing import Union +import numpy as np +from ..config import NumberField, BaseField +from ..representation import ( + DetectionPrediction, + DetectionAnnotation, + PoseEstimationPrediction, + PoseEstimationAnnotation +) +from ..utils import get_or_parse_value +from .overlap import Overlap +from .metric import BaseMetricConfig, PerImageEvaluationMetric + +COCO_THRESHOLDS = { + '.50': [0.5], + '.75': [0.75], + '.50:.05:.95': np.linspace(.5, 0.95, np.round((0.95 - .5) / .05).astype(int) + 1, endpoint=True) +} + + +class MSCOCOAveragePresicionMetricConfig(BaseMetricConfig): + max_detections = NumberField(optional=True) + threshold = BaseField(optional=True) + + +class MSCOCOBaseMetric(PerImageEvaluationMetric): + annotation_types = (PoseEstimationAnnotation, DetectionAnnotation) + prediction_types = (PoseEstimationPrediction, DetectionPrediction) + + def validate_config(self): + coco_config_validator = MSCOCOAveragePresicionMetricConfig( + 'coco_metric', on_extra_argument=MSCOCOAveragePresicionMetricConfig.ERROR_ON_EXTRA_ARGUMENT + ) + coco_config_validator.validate(self.config) + + def configure(self): + self.max_detections = self.config.get('max_detections', 20) + self.thresholds = get_or_parse_value(self.config.get('threshold', '.50:.05:.95'), COCO_THRESHOLDS) + label_map = self.dataset.metadata.get('label_map', []) + self.labels = [ + label for label in label_map + if label != self.dataset.metadata.get('background_label') + ] + self.meta['names'] = [label_map[label] for label in self.labels] + self.matching_results = [[] for _ in self.labels] + + def update(self, annotation, prediction): + compute_iou, create_boxes = select_specific_parameters(annotation) + + for label_id, label in enumerate(self.labels): + detections, scores, dt_difficult = prepare_predictions(prediction, label, self.max_detections) + ground_truth, gt_difficult, iscrowd, boxes, areas = prepare_annotations(annotation, label, create_boxes) + iou = compute_iou(ground_truth, detections, boxes, areas) + self.matching_results[label_id].append( + evaluate_image( + ground_truth, + gt_difficult, + iscrowd, + detections, + dt_difficult, + scores, + iou, + self.thresholds + )) + + def evaluate(self, annotations, predictions): + pass + + +class MSCOCOAveragePresicion(MSCOCOBaseMetric): + __provider__ = 'coco_precision' + + def evaluate(self, annotations, predictions): + precision = [ + compute_precision_recall(self.thresholds, self.matching_results[i])[0] + for i, _ in enumerate(self.labels) + ] + + return precision + + +class MSCOCORecall(MSCOCOBaseMetric): + __provider__ = 'coco_recall' + + def evaluate(self, annotations, predictions): + recalls = [ + compute_precision_recall(self.thresholds, self.matching_results[i])[1] + for i, _ in enumerate(self.labels) + ] + + return recalls +@singledispatch +def select_specific_parameters(annotation): + return compute_iou_boxes, False + +@select_specific_parameters.register(PoseEstimationAnnotation) +def pose_estimation_params(annotation): + return compute_oks, True + +@singledispatch +def prepare(entry, order): + return np.c_[entry.x_mins[order], 
entry.y_mins[order], entry.x_maxs[order], entry.y_maxs[order]] + + +@prepare.register(Union[PoseEstimationPrediction, PoseEstimationAnnotation]) +def prepare_keypoints(entry, order): + if entry.size == 0: + return [] + + if np.size(entry.x_values[order]) == 0: + return [] + + return np.concatenate((entry.x_values[order], entry.y_values[order], entry.visibility[order]), axis=-1) + + +def prepare_predictions(prediction, label, max_detections): + if prediction.size == 0: + return [], [], [] + prediction_ids = prediction.labels == label + scores = prediction.scores[prediction_ids] + if np.size(scores) == 0: + return [], [], [] + scores_ids = np.argsort(- scores, kind='mergesort') + difficult_box_mask = np.full(prediction.size, False) + difficult_box_mask[prediction.metadata.get('difficult_boxes', [])] = True + difficult_for_label = difficult_box_mask[prediction_ids] + if len(scores_ids) > max_detections: + scores_ids = scores_ids[:max_detections] + detections = prepare(prediction, prediction_ids) + detections = detections[scores_ids] + + return detections, scores[scores_ids], difficult_for_label[scores_ids] + + +def prepare_annotations(annotation, label, create_boxes=False): + annotation_ids = annotation.labels == label + difficult_box_mask = np.full(annotation.size, False) + difficult_box_indices = annotation.metadata.get("difficult_boxes", []) + iscrowd = np.array(annotation.metadata.get('iscrowd', [0]*annotation.size)) + difficult_box_mask[difficult_box_indices] = True + difficult_box_mask[iscrowd > 0] = True + difficult_label = difficult_box_mask[annotation_ids] + not_difficult_box_indices = np.argwhere(~difficult_label).reshape(-1) + difficult_box_indices = np.argwhere(difficult_label).reshape(-1) + iscrowd_label = iscrowd[annotation_ids] + order = np.hstack((not_difficult_box_indices, difficult_box_indices)).astype(int) + boxes = None + areas = None + if create_boxes: + boxes = np.array(annotation.bboxes) + boxes = boxes[annotation_ids] + areas = np.array(annotation.areas) + areas = areas[annotation_ids] if np.size(areas) > 0 else np.array([]) + boxes = boxes[order] + areas = areas[order] + + return prepare(annotation, annotation_ids)[order], difficult_label[order], iscrowd_label[order], boxes, areas + + +def compute_precision_recall(thresholds, matching_results): + num_thresholds = len(thresholds) + rectangle_thresholds = np.linspace(.0, 1.00, np.round((1.00 - .0) / .01) + 1, endpoint=True) + num_rec_thresholds = len(rectangle_thresholds) + precision = -np.ones((num_thresholds, num_rec_thresholds)) # -1 for the precision of absent categories + recall = -np.ones(num_thresholds) + dt_scores = np.concatenate([e['scores'] for e in matching_results]) + inds = np.argsort(-dt_scores, kind='mergesort') + dtm = np.concatenate([e['dt_matches'] for e in matching_results], axis=1)[:, inds] + dt_ignored = np.concatenate([e['dt_ignore'] for e in matching_results], axis=1)[:, inds] + gt_ignored = np.concatenate([e['gt_ignore'] for e in matching_results]) + npig = np.count_nonzero(gt_ignored == 0) + tps = np.logical_and(dtm, np.logical_not(dt_ignored)) + fps = np.logical_and(np.logical_not(dtm), np.logical_not(dt_ignored)) + tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float) + fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float) + for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)): + tp = np.array(tp) + fp = np.array(fp) + num_detections = len(tp) + rc = tp / npig + pr = tp / (fp + tp + np.spacing(1)) + q = np.zeros(num_rec_thresholds) + + if num_detections: + recall[t] = rc[-1] + else: + 
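+            # no detections remain for this threshold, so recall is reported as zero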
recall[t] = 0 + + # numpy is slow without cython optimization for accessing elements + # use python array gets significant speed improvement + pr = pr.tolist() + q = q.tolist() + + for i in range(num_detections - 1, 0, -1): + if pr[i] > pr[i - 1]: + pr[i - 1] = pr[i] + + inds = np.searchsorted(rc, rectangle_thresholds, side='left') + try: + for ri, pi in enumerate(inds): + q[ri] = pr[pi] + except IndexError: + pass + precision[t] = np.array(q) + + mean_precision = 0 if np.size(precision[precision > -1]) == 0 else np.mean(precision[precision > -1]) + mean_recall = 0 if np.size(recall[recall > -1]) == 0 else np.mean(recall[recall > -1]) + + return mean_precision, mean_recall + + +def compute_iou_boxes(annotation, prediction, *args, **kwargs): + if np.size(annotation) == 0 or np.size(prediction) == 0: + return [] + overlap = Overlap.provide('iou') + iou = np.zeros((prediction.size // 4, annotation.size // 4), dtype=np.float32) + for i, box_a in enumerate(annotation): + for j, box_b in enumerate(prediction): + iou[j, i] = overlap(box_a, box_b) + + return iou + + +def compute_oks(annotation_points, prediction_points, annotation_boxes, annotation_areas): + if np.size(prediction_points) == 0 or np.size(annotation_points) == 0: + return [] + oks = np.zeros((len(prediction_points), len(annotation_points))) + sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89])/10.0 + variance = (sigmas * 2)**2 + # compute oks between each detection and ground truth object + for gt_idx, gt_points in enumerate(annotation_points): + # create bounds for ignore regions(double the gt bbox) + xgt = gt_points[:17] + ygt = gt_points[17:34] + vgt = gt_points[34:] + k1 = np.count_nonzero(vgt > 0) + x0_bbox, y0_bbox, x1_bbox, y1_bbox = annotation_boxes[gt_idx] + area_gt = annotation_areas[gt_idx] + w_bbox = x1_bbox - x0_bbox + h_bbox = y1_bbox - y0_bbox + x0 = x0_bbox - w_bbox + x1 = x0_bbox + w_bbox * 2 + y0 = y0_bbox - h_bbox + y1 = y0_bbox + h_bbox * 2 + for dt_idx, dt_points in enumerate(prediction_points): + xdt = dt_points[:17] + ydt = dt_points[17:34] + if k1 > 0: + # measure the per-keypoint distance if keypoints visible + x_diff = xdt - xgt + y_diff = ydt - ygt + else: + # measure minimum distance to keypoints in (x0,y0) & (x1,y1) + zeros = np.zeros(len(sigmas)) + x_diff = np.max((zeros, x0 - xdt), axis=0) + np.max((zeros, xdt - x1), axis=0) + y_diff = np.max((zeros, y0 - ydt), axis=0) + np.max((zeros, ydt - y1), axis=0) + evaluation = (x_diff ** 2 + y_diff ** 2) / variance / (area_gt + np.spacing(1)) / 2 + if k1 > 0: + evaluation = evaluation[vgt > 0] + oks[dt_idx, gt_idx] = np.sum(np.exp(- evaluation)) / evaluation.shape[0] + + return oks + + +def evaluate_image(ground_truth, gt_difficult, iscrowd, detections, dt_difficult, scores, iou, thresholds): + thresholds_num = len(thresholds) + gt_num = len(ground_truth) + dt_num = len(detections) + gt_matched = np.zeros((thresholds_num, gt_num)) + dt_matched = np.zeros((thresholds_num, dt_num)) + gt_ignored = gt_difficult + dt_ignored = np.zeros((thresholds_num, dt_num)) + if np.size(iou): + for tind, t in enumerate(thresholds): + for dtind, _ in enumerate(detections): + # information about best match so far (matched_id = -1 -> unmatched) + iou_current = min([t, 1-1e-10]) + matched_id = -1 + for gtind, _ in enumerate(ground_truth): + # if this gt already matched, and not a crowd, continue + if gt_matched[tind, gtind] > 0 and not iscrowd[gtind]: + continue + # if dt matched to reg gt, and on ignore gt, stop + if 
matched_id > -1 and not gt_ignored[matched_id] and gt_ignored[gtind]: + break + # continue to next gt unless better match made + if iou[dtind, gtind] < iou_current: + continue + # if match successful and best so far, store appropriately + iou_current = iou[dtind, gtind] + matched_id = gtind + # if match made store id of match for both dt and gt + if matched_id == -1: + continue + dt_ignored[tind, dtind] = gt_ignored[matched_id] + dt_matched[tind, dtind] = 1 + gt_matched[tind, matched_id] = dtind + # store results for given image + return { + 'dt_matches': dt_matched, + 'gt_matches': gt_matched, + 'gt_ignore': gt_ignored, + 'dt_ignore': np.logical_or(dt_ignored, dt_difficult), + 'scores': scores + } diff --git a/tools/accuracy_checker/accuracy_checker/metrics/detection.py b/tools/accuracy_checker/accuracy_checker/metrics/detection.py new file mode 100644 index 0000000..97ce961 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/metrics/detection.py @@ -0,0 +1,487 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import bisect +import enum +import warnings +from typing import List + +import numpy as np + +from ..utils import finalize_metric_result +from .overlap import Overlap, IOA +from ..config import BoolField, NumberField, StringField +from ..representation import DetectionAnnotation, DetectionPrediction +from .metric import BaseMetricConfig, FullDatasetEvaluationMetric + + +class APIntegralType(enum.Enum): + voc_11_point = '11point' + voc_max = 'max' + + +class BaseDetectionMetricConfig(BaseMetricConfig): + overlap_threshold = NumberField(min_value=0, max_value=1, optional=True) + ignore_difficult = BoolField(optional=True) + include_boundaries = BoolField(optional=True) + distinct_conf = BoolField(optional=True) + allow_multiple_matches_per_ignored = BoolField(optional=True) + overlap_method = StringField(optional=True, choices=Overlap.providers) + use_filtered_tp = BoolField(optional=True) + + +class BaseDetectionMetricMixin: + def configure(self): + self.overlap_threshold = self.config.get('overlap_threshold', 0.5) + self.ignore_difficult = self.config.get('ignore_difficult', True) + self.include_boundaries = self.config.get('include_boundaries', True) + self.distinct_conf = self.config.get('distinct_conf', False) + self.allow_multiple_matches_per_ignored = self.config.get('allow_multiple_matches_per_ignored', False) + self.overlap_method = Overlap.provide(self.config.get('overlap', 'iou'), self.include_boundaries) + self.use_filtered_tp = self.config.get('use_filtered_tp', False) + + label_map = self.config.get('label_map', 'label_map') + labels = self.dataset.metadata.get(label_map, {}) + self.labels = labels.keys() + valid_labels = list(filter(lambda x: x != self.dataset.metadata.get('background_label'), self.labels)) + self.meta['names'] = [labels[name] for name in valid_labels] + + def per_class_detection_statistics(self, annotations, predictions, labels): + labels_stat = {} + for label in labels: + tp, fp, conf, n = bbox_match( + annotations, 
predictions, int(label), + self.overlap_method, self.overlap_threshold, + self.ignore_difficult, self.allow_multiple_matches_per_ignored, self.include_boundaries, + self.use_filtered_tp + ) + + if not tp.size: + labels_stat[label] = { + 'precision': np.array([]), + 'recall': np.array([]), + 'thresholds': conf, + 'fppi': np.array([]) + } + continue + + # select only values for distinct confidences + if self.distinct_conf: + distinct_value_indices = np.where(np.diff(conf))[0] + threshold_indexes = np.r_[distinct_value_indices, tp.size - 1] + else: + threshold_indexes = np.arange(conf.size) + + tp, fp = np.cumsum(tp)[threshold_indexes], np.cumsum(fp)[threshold_indexes] + + labels_stat[label] = { + 'precision': tp / np.maximum(tp + fp, np.finfo(np.float64).eps), + 'recall': tp / np.maximum(n, np.finfo(np.float64).eps), + 'thresholds': conf[threshold_indexes], + 'fppi': fp / len(annotations) + } + + return labels_stat + + +class DetectionMAP(BaseDetectionMetricMixin, FullDatasetEvaluationMetric): + """ + Class for evaluating mAP metric of detection models. + """ + + __provider__ = 'map' + + annotation_types = (DetectionAnnotation, ) + prediction_types = (DetectionPrediction, ) + + def validate_config(self): + class _MAPConfigValidator(BaseDetectionMetricConfig): + integral = StringField(choices=[e.value for e in APIntegralType], optional=True) + + map_config_validator = _MAPConfigValidator( + self.__provider__, on_extra_argument=_MAPConfigValidator.ERROR_ON_EXTRA_ARGUMENT + ) + map_config_validator.validate(self.config) + + def configure(self): + super().configure() + self.integral = APIntegralType(self.config.get('integral', APIntegralType.voc_max)) + + def evaluate(self, annotations, predictions): + valid_labels = get_valid_labels(self.labels, self.dataset.metadata.get('background_label')) + labels_stat = self.per_class_detection_statistics(annotations, predictions, valid_labels) + + average_precisions = [] + for label in labels_stat: + label_precision = labels_stat[label]['precision'] + label_recall = labels_stat[label]['recall'] + if label_recall.size: + ap = average_precision(label_precision, label_recall, self.integral) + average_precisions.append(ap) + else: + average_precisions.append(np.nan) + + average_precisions, self.meta['names'] = finalize_metric_result(average_precisions, self.meta['names']) + if not average_precisions: + warnings.warn("No detections to compute mAP") + average_precisions.append(0) + + return average_precisions + + +class MissRate(BaseDetectionMetricMixin, FullDatasetEvaluationMetric): + """ + Class for evaluating Miss Rate metric of detection models. 
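+
+    The miss rate for each class is 1 - recall at the configured fppi_level
+    (false positives per image); the reported value is the average of the two
+    nearest points on the miss-rate/FPPI curve.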
+    """
+
+    __provider__ = 'miss_rate'
+
+    annotation_types = (DetectionAnnotation, )
+    prediction_types = (DetectionPrediction, )
+
+    def validate_config(self):
+        class _MRConfigValidator(BaseDetectionMetricConfig):
+            fppi_level = NumberField(min_value=0, max_value=1)
+
+        mr_config_validator = _MRConfigValidator(
+            self.__provider__, on_extra_argument=_MRConfigValidator.ERROR_ON_EXTRA_ARGUMENT
+        )
+        mr_config_validator.validate(self.config)
+
+    def configure(self):
+        super().configure()
+        self.fppi_level = self.config.get('fppi_level')
+
+    def evaluate(self, annotations, predictions):
+        valid_labels = get_valid_labels(self.labels, self.dataset.metadata.get('background_label'))
+        labels_stat = self.per_class_detection_statistics(annotations, predictions, valid_labels)
+
+        miss_rates = []
+        for label in labels_stat:
+            label_miss_rate = 1.0 - labels_stat[label]['recall']
+            label_fppi = labels_stat[label]['fppi']
+
+            position = bisect.bisect_left(label_fppi, self.fppi_level)
+            m0 = max(0, position - 1)
+            m1 = position if position < len(label_miss_rate) else m0
+            miss_rates.append(0.5 * (label_miss_rate[m0] + label_miss_rate[m1]))
+
+        return miss_rates
+
+
+class Recall(BaseDetectionMetricMixin, FullDatasetEvaluationMetric):
+    """
+    Class for evaluating recall metric of detection models.
+    """
+
+    __provider__ = 'recall'
+
+    annotation_types = (DetectionAnnotation, )
+    prediction_types = (DetectionPrediction, )
+
+    def validate_config(self):
+        recall_config_validator = BaseDetectionMetricConfig(
+            self.__provider__, on_extra_argument=BaseDetectionMetricConfig.ERROR_ON_EXTRA_ARGUMENT
+        )
+        recall_config_validator.validate(self.config)
+
+    def evaluate(self, annotations, predictions):
+        valid_labels = get_valid_labels(self.labels, self.dataset.metadata.get('background_label'))
+        labels_stat = self.per_class_detection_statistics(annotations, predictions, valid_labels)
+
+        recalls = []
+        for label in labels_stat:
+            label_recall = labels_stat[label]['recall']
+            if label_recall.size:
+                max_recall = label_recall[-1]
+                recalls.append(max_recall)
+            else:
+                recalls.append(np.nan)
+
+        recalls, self.meta['names'] = finalize_metric_result(recalls, self.meta['names'])
+        if not recalls:
+            warnings.warn("No detections to compute recall")
+            recalls.append(0)
+
+        return recalls
+
+
+class DetectionAccuracyMetric(BaseDetectionMetricMixin, FullDatasetEvaluationMetric):
+    __provider__ = 'detection_accuracy'
+
+    annotation_types = (DetectionAnnotation, )
+    prediction_types = (DetectionPrediction, )
+
+    def validate_config(self):
+        class _DAConfigValidator(BaseDetectionMetricConfig):
+            use_normalization = BoolField(optional=True)
+
+        da_config_validator = _DAConfigValidator(
+            self.__provider__, on_extra_argument=_DAConfigValidator.ERROR_ON_EXTRA_ARGUMENT
+        )
+        da_config_validator.validate(self.config)
+
+    def configure(self):
+        super().configure()
+        self.use_normalization = self.config.get('use_normalization', False)
+
+    def evaluate(self, annotations, predictions):
+        all_matches, _, _ = match_detections_class_agnostic(
+            predictions, annotations, self.overlap_threshold, self.overlap_method
+        )
+        cm = confusion_matrix(all_matches, predictions, annotations, len(self.labels))
+        if self.use_normalization:
+            return np.mean(normalize_confusion_matrix(cm).diagonal())
+
+        return float(np.sum(cm.diagonal())) / float(np.maximum(1, np.sum(cm)))
+
+
+def confusion_matrix(all_matched_ids, predicted_data, gt_data, num_classes):
+    out_cm = np.zeros([num_classes, num_classes], dtype=np.int32)
+    for gt, prediction in zip(gt_data, predicted_data):
+        for match_pair in all_matched_ids[gt.identifier]:
+            gt_label = int(gt.labels[match_pair[0]])
+            pred_label = int(prediction.labels[match_pair[1]])
+            out_cm[gt_label, pred_label] += 1
+
+    return out_cm
+
+
+def normalize_confusion_matrix(cm):
+    row_sums = np.maximum(1, np.sum(cm, axis=1, keepdims=True)).astype(np.float32)
+    return cm.astype(np.float32) / row_sums
+
+
+def match_detections_class_agnostic(predicted_data, gt_data, min_iou, overlap_method):
+    all_matches = {}
+    total_gt_bbox_num = 0
+    matched_gt_bbox_num = 0
+
+    for gt, prediction in zip(gt_data, predicted_data):
+        gt_bboxes = np.stack((gt.x_mins, gt.y_mins, gt.x_maxs, gt.y_maxs), axis=-1)
+        predicted_bboxes = np.stack(
+            (prediction.x_mins, prediction.y_mins, prediction.x_maxs, prediction.y_maxs), axis=-1
+        )
+
+        total_gt_bbox_num += len(gt_bboxes)
+
+        similarity_matrix = calculate_similarity_matrix(gt_bboxes, predicted_bboxes, overlap_method)
+
+        matches = []
+        for _ in gt_bboxes:
+            best_match_pos = np.unravel_index(similarity_matrix.argmax(), similarity_matrix.shape)
+            best_match_value = similarity_matrix[best_match_pos]
+
+            if best_match_value <= min_iou:
+                break
+
+            gt_id = best_match_pos[0]
+            predicted_id = best_match_pos[1]
+
+            similarity_matrix[gt_id, :] = 0.0
+            similarity_matrix[:, predicted_id] = 0.0
+
+            matches.append((gt_id, predicted_id))
+            matched_gt_bbox_num += 1
+
+        all_matches[gt.identifier] = matches
+
+    return all_matches, total_gt_bbox_num, matched_gt_bbox_num
+
+
+def calculate_similarity_matrix(set_a, set_b, overlap):
+    similarity = np.zeros([len(set_a), len(set_b)], dtype=np.float32)
+    for i, box_a in enumerate(set_a):
+        for j, box_b in enumerate(set_b):
+            similarity[i, j] = overlap(box_a, box_b)
+
+    return similarity
+
+
+def average_precision(precision, recall, integral):
+    if integral == APIntegralType.voc_11_point:
+        result = 0.
+        for point in np.arange(0., 1.1, 0.1):
+            accumulator = 0 if np.sum(recall >= point) == 0 else np.max(precision[recall >= point])
+            result = result + accumulator / 11.
+
+        return result
+
+    if integral != APIntegralType.voc_max:
+        raise NotImplementedError("Integral type not implemented")
+
+    # first append sentinel values at the end
+    recall = np.concatenate(([0.], recall, [1.]))
+    precision = np.concatenate(([0.], precision, [0.]))
+
+    # compute the precision envelope
+    for i in range(precision.size - 1, 0, -1):
+        precision[i - 1] = np.maximum(precision[i - 1], precision[i])
+
+    # to calculate area under PR curve, look for points
+    # where X axis (recall) changes value
+    change_point = np.where(recall[1:] != recall[:-1])[0]
+    # and sum (\Delta recall) * precision
+    return np.sum((recall[change_point + 1] - recall[change_point]) * precision[change_point + 1])
+
+
+def bbox_match(annotation: List[DetectionAnnotation], prediction: List[DetectionPrediction], label, overlap_evaluator,
+               overlap_thresh=0.5, ignore_difficult=True, allow_multiple_matches_per_ignored=True,
+               include_boundaries=True, use_filtered_tp=False):
+    """
+    Args:
+        annotation: ground truth bounding boxes.
+        prediction: predicted bounding boxes.
+        label: class for which bounding boxes are matched.
+        overlap_evaluator: evaluator of overlap.
+        overlap_thresh: bounding box IoU threshold.
+        ignore_difficult: ignore difficult bounding boxes (see Pascal VOC).
+        allow_multiple_matches_per_ignored: allow multiple detections to match the same ignored ground truth box.
+        include_boundaries: if True, box width and height are calculated as max - min + 1.
+        use_filtered_tp: if True, ignored objects are counted during evaluation.
+    Returns:
+        tp: tp[i] == 1 if detection with i-th highest score is true positive.
+        fp: fp[i] == 1 if detection with i-th highest score is false positive.
+        thresholds: array of confidence thresholds.
+        number_ground_truth: number of ground truth boxes (difficult boxes are excluded when ignore_difficult is set).
+    """
+
+    used_boxes, number_ground_truth, difficult_boxes_annotation = _prepare_annotation_boxes(
+        annotation, ignore_difficult, label
+    )
+    prediction_boxes, prediction_images, difficult_boxes_prediction = _prepare_prediction_boxes(
+        label, prediction, ignore_difficult
+    )
+
+    tp = np.zeros_like(prediction_images)
+    fp = np.zeros_like(prediction_images)
+
+    for image in range(prediction_images.shape[0]):
+        gt_img = annotation[prediction_images[image]]
+        annotation_difficult = difficult_boxes_annotation[gt_img.identifier]
+        used = used_boxes[gt_img.identifier]
+
+        idx = gt_img.labels == label
+        if not np.array(idx).any():
+            fp[image] = 1
+            continue
+
+        prediction_box = prediction_boxes[image][1:]
+        annotation_boxes = gt_img.x_mins[idx], gt_img.y_mins[idx], gt_img.x_maxs[idx], gt_img.y_maxs[idx]
+
+        overlaps = overlap_evaluator(prediction_box, annotation_boxes)
+        if ignore_difficult and allow_multiple_matches_per_ignored:
+            ioa = IOA(include_boundaries)
+            ignored = np.where(annotation_difficult == 1)[0]
+            ignored_annotation_boxes = (
+                annotation_boxes[0][ignored], annotation_boxes[1][ignored],
+                annotation_boxes[2][ignored], annotation_boxes[3][ignored]
+            )
+            overlaps[ignored] = ioa.evaluate(prediction_box, ignored_annotation_boxes)
+
+        max_overlap = -np.inf
+
+        not_ignored_overlaps = overlaps[np.where(annotation_difficult == 0)[0]]
+        ignored_overlaps = overlaps[np.where(annotation_difficult == 1)[0]]
+        if not_ignored_overlaps.size:
+            max_overlap = np.max(not_ignored_overlaps)
+
+        if max_overlap < overlap_thresh and ignored_overlaps.size:
+            max_overlap = np.max(ignored_overlaps)
+        max_overlapped = np.where(overlaps == max_overlap)[0]
+
+        def set_false_positive(box_index):
+            is_box_difficult = difficult_boxes_prediction[box_index].any()
+            return int(not ignore_difficult or not is_box_difficult)
+
+        if max_overlap < overlap_thresh:
+            fp[image] = set_false_positive(image)
+            continue
+
+        if not annotation_difficult[max_overlapped].any():
+            if not used[max_overlapped].any():
+                if not ignore_difficult or use_filtered_tp or not difficult_boxes_prediction[image].any():
+                    tp[image] = 1
+                    used[max_overlapped] = True
+            else:
+                fp[image] = set_false_positive(image)
+        elif not allow_multiple_matches_per_ignored:
+            if used[max_overlapped].any():
+                fp[image] = set_false_positive(image)
+            used[max_overlapped] = True
+
+    return tp, fp, prediction_boxes[:, 0], number_ground_truth
+
+
+def _prepare_annotation_boxes(annotation, ignore_difficult, label):
+    used_boxes = {}
+    difficult_boxes = {}
+    num_ground_truth = 0
+
+    for ground_truth in annotation:
+        idx_for_label = ground_truth.labels == label
+        filtered_label = ground_truth.labels[idx_for_label]
+        used_ = np.zeros_like(filtered_label)
+        used_boxes[ground_truth.identifier] = used_
+        num_ground_truth += used_.shape[0]
+
+        difficult_box_mask = np.full_like(ground_truth.labels, False)
+        difficult_box_indices = ground_truth.metadata.get("difficult_boxes", [])
+        if ignore_difficult:
+            difficult_box_mask[difficult_box_indices] = True
+        difficult_box_mask = difficult_box_mask[idx_for_label]
+
+        difficult_boxes[ground_truth.identifier] = difficult_box_mask
+        if ignore_difficult:
+            num_ground_truth -= np.sum(difficult_box_mask)
+
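+    # with ignore_difficult set, difficult boxes were subtracted from the ground
+    # truth count above, so recall is normalized over non-difficult objects only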
return used_boxes, num_ground_truth, difficult_boxes + + +def _prepare_prediction_boxes(label, predictions, ignore_difficult): + prediction_images = [] + prediction_boxes = [] + indexes = [] + difficult_boxes = [] + for i, prediction in enumerate(predictions): + idx = prediction.labels == label + + prediction_images.append(np.full(prediction.labels[idx].shape, i)) + prediction_boxes.append(np.c_[ + prediction.scores[idx], + prediction.x_mins[idx], prediction.y_mins[idx], prediction.x_maxs[idx], prediction.y_maxs[idx] + ]) + + difficult_box_mask = np.full_like(prediction.labels, False) + difficult_box_indices = prediction.metadata.get("difficult_boxes", []) + if ignore_difficult: + difficult_box_mask[difficult_box_indices] = True + + difficult_boxes.append(difficult_box_mask) + indexes.append(np.argwhere(idx)) + + prediction_boxes = np.concatenate(prediction_boxes) + difficult_boxes = np.concatenate(difficult_boxes) + sorted_order = np.argsort(-prediction_boxes[:, 0]) + prediction_boxes = prediction_boxes[sorted_order] + prediction_images = np.concatenate(prediction_images)[sorted_order] + difficult_boxes = difficult_boxes[sorted_order] + + return prediction_boxes, prediction_images, difficult_boxes + + +def get_valid_labels(labels, background): + return list(filter(lambda label: label != background, labels)) diff --git a/tools/accuracy_checker/accuracy_checker/metrics/hit_ratio.py b/tools/accuracy_checker/accuracy_checker/metrics/hit_ratio.py new file mode 100644 index 0000000..6d5d7a1 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/metrics/hit_ratio.py @@ -0,0 +1,100 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import heapq +import math + +import numpy as np + +from ..representation import HitRatioAnnotation, HitRatioPrediction +from .metric import FullDatasetEvaluationMetric, BaseMetricConfig +from ..config import NumberField + +class BaseRecommenderMetric(FullDatasetEvaluationMetric): + annotation_types = (HitRatioAnnotation, ) + prediction_types = (HitRatioPrediction, ) + + def __init__(self, discounter, *args, **kwargs): + super().__init__(*args, **kwargs) + self.discounter = discounter or (lambda item, rank: int(item in rank)) + + + def validate_config(self): + class _RecommenderValidator(BaseMetricConfig): + top_k = NumberField(floats=False, min_value=1, optional=True) + + recommender_validator = _RecommenderValidator( + 'recommend', + on_extra_argument=_RecommenderValidator.ERROR_ON_EXTRA_ARGUMENT + ) + recommender_validator.validate(self.config) + + def configure(self): + self.top_k = self.config.get('top_k', 10) + self.users_num = self.dataset.metadata.get('users_number') + self.pred_per_user = {i: [] for i in range(self.users_num)} + self.gt_items = {} + + def update(self, annotation, prediction): + self.pred_per_user[prediction.user].append((prediction.item, prediction.scores)) + if annotation.positive: + self.gt_items[annotation.user] = annotation.item + + def evaluate(self, annotations, predictions): + iter_num = len(self.pred_per_user[0]) + + measure = [] + for user in range(self.users_num): + map_item_score = {} + for j in range(iter_num): + item = self.pred_per_user[user][j][0] + score = self.pred_per_user[user][j][1] + map_item_score[item] = score + ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get) + measure.append(self.discounter(self.gt_items[user], ranklist)) + + return np.mean(measure) + +def hit_ratio_discounter(item, rank): + return int(item in rank) + +def ndcg_discunter(item, rank): + if item in rank: + return math.log(2) / math.log(rank.index(item) + 2) + + return 0 + + +class HitRatioMetric(BaseRecommenderMetric): + """ + Class for evaluating Hit Ratio metric + """ + + __provider__ = 'hit_ratio' + + def __init__(self, *args, **kwargs): + super().__init__(hit_ratio_discounter, *args, **kwargs) + + +class NDSGMetric(BaseRecommenderMetric): + """ + Class for evaluating Normalized Discounted Cumulative Gain metric + """ + + __provider__ = 'ndcg' + + def __init__(self, *args, **kwargs): + super().__init__(ndcg_discunter, *args, **kwargs) diff --git a/tools/accuracy_checker/accuracy_checker/metrics/metric.py b/tools/accuracy_checker/accuracy_checker/metrics/metric.py new file mode 100644 index 0000000..0cb6189 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/metrics/metric.py @@ -0,0 +1,159 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from ..representation import ContainerRepresentation +from ..config import ConfigError +from ..utils import is_single_metric_source, get_supported_representations +from ..presenters import BasePresenter +from ..config import ConfigValidator, NumberField, StringField +from ..dependency import ClassProvider +from ..utils import zipped_transform + + +class BaseMetricConfig(ConfigValidator): + type = StringField() + name = StringField(optional=True) + reference = NumberField(optional=True) + threshold = NumberField(min_value=0, optional=True) + presenter = StringField(choices=BasePresenter.providers, optional=True) + label_map = StringField(optional=True) + prediction_source = StringField(optional=True) + annotation_source = StringField(optional=True) + + +class Metric(ClassProvider): + """ + Interface for evaluating metrics. + """ + + __provider_type__ = 'metric' + + annotation_types = () + prediction_types = () + + def __init__(self, config, dataset, name=None, state=None): + self.config = config + self.name = name + self.dataset = dataset + self.state = state + self._update_iter = 0 + self.meta = {} + + self.validate_config() + self.configure() + message_unsupported_multi_source = 'metric {} does not support several {} sources' + self.annotation_source = self.config.get('annotation_source') + + if self.annotation_source and not is_single_metric_source(self.annotation_source): + raise ConfigError(message_unsupported_multi_source.format(self.name, 'annotation')) + + self.prediction_source = self.config.get('prediction_source') + if self.prediction_source and not is_single_metric_source(self.prediction_source): + raise ConfigError(message_unsupported_multi_source.format(self.name, 'prediction')) + + def __call__(self, *args, **kwargs): + return self.submit_all(*args, **kwargs) + + def submit(self, annotation, prediction): + self.update(annotation, prediction) + + def submit_all(self, annotations, predictions): + return self.evaluate(annotations, predictions) + + def update(self, annotation, prediction): + pass + + def evaluate(self, annotations, predictions): + raise NotImplementedError + + def configure(self): + """ + Specifies configuration structure for metric entry. + """ + + pass + + def validate_config(self): + """ + Validate that metric entry meets all configuration structure requirements. 
+ """ + + BaseMetricConfig(self.name, on_extra_argument=BaseMetricConfig.ERROR_ON_EXTRA_ARGUMENT).validate(self.config) + + def _update_state(self, fn, state_key, default_factory=None): + iter_key = "{}_global_it".format(state_key) + if state_key not in self.state: + default = default_factory() if default_factory else None + self.state[state_key] = default + self.state[iter_key] = 0 + + self._update_iter += 1 + if self.state[iter_key] < self._update_iter: + self.state[iter_key] += 1 + self.state[state_key] = fn(self.state[state_key]) + + def _resolve_representation_containers(self, annotation, prediction): + def get_resolve_subject(representation, source=None): + if not isinstance(representation, ContainerRepresentation): + return representation + + if not source: + return representation.values() + + representation = representation.get(source) + if not representation: + raise ConfigError('{} not found'.format(source)) + + return representation + + annotation = get_resolve_subject(annotation, self.annotation_source) + prediction = get_resolve_subject(prediction, self.prediction_source) + + def resolve(representation, supported_types, representation_name): + message_not_found = 'suitable {} for metric {} not found' + message_need_source = 'you need specify {} source for metric {}' + + representation = get_supported_representations(representation, supported_types) + if not representation: + raise ConfigError(message_not_found.format(representation_name, self.name)) + + if len(representation) > 1: + raise ConfigError(message_need_source.format(representation_name, self.name)) + + return representation[0] + + resolved_annotation = resolve(annotation, self.annotation_types, 'annotation') + resolved_prediction = resolve(prediction, self.prediction_types, 'prediction') + + return resolved_annotation, resolved_prediction + + +class PerImageEvaluationMetric(Metric): + def submit(self, annotation, prediction): + annotation_, prediction_ = self._resolve_representation_containers(annotation, prediction) + self.update(annotation_, prediction_) + + def evaluate(self, annotations, predictions): + raise NotImplementedError + + +class FullDatasetEvaluationMetric(Metric): + def submit_all(self, annotations, predictions): + annotations_, predictions_ = zipped_transform(self._resolve_representation_containers, annotations, predictions) + return self.evaluate(annotations_, predictions_) + + def evaluate(self, annotations, predictions): + raise NotImplementedError diff --git a/tools/accuracy_checker/accuracy_checker/metrics/metric_executor.py b/tools/accuracy_checker/accuracy_checker/metrics/metric_executor.py new file mode 100644 index 0000000..cd24e9a --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/metrics/metric_executor.py @@ -0,0 +1,106 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from collections import namedtuple + +from ..presenters import BasePresenter, EvaluationResult +from ..config import StringField +from ..utils import zipped_transform +from .metric import BaseMetricConfig, Metric +from ..config import ConfigError + +MetricInstance = namedtuple('MetricInstance', ['name', 'metric_fn', 'reference', 'threshold', 'presenter']) + + +class MetricConfig(BaseMetricConfig): + type = StringField(choices=Metric.providers) + + +class MetricsExecutor: + """ + Class for evaluating metrics according to dataset configuration entry. + """ + + def __init__(self, dataset_config, dataset, state=None): + dataset_name = dataset_config.get('name', '') + message_prefix = '{}'.format(dataset_name) + + self.state = state or {} + self._token = 'metrics' + + dataset_metrics = dataset_config.get(self._token) + if not dataset_metrics: + raise ConfigError('{} dataset config must specify "{}"'.format(message_prefix, self._token)) + + self.dataset = dataset + + self.metrics = [] + type_ = 'type' + identifier = 'name' + reference = 'reference' + threshold = 'threshold' + presenter = 'presenter' + + for metric_config_entry in dataset_metrics: + metric_config = MetricConfig( + "{}.metrics".format(dataset_name), on_extra_argument=MetricConfig.IGNORE_ON_EXTRA_ARGUMENT + ) + metric_type = metric_config_entry.get(type_) + metric_config.validate(metric_config_entry, type_) + + metric_identifier = metric_config_entry.get(identifier, metric_type) + + metric_fn = Metric.provide( + metric_type, metric_config_entry, self.dataset, metric_identifier, state=self.state + ) + metric_presenter = BasePresenter.provide(metric_config_entry.get(presenter, 'print_scalar')) + + self.metrics.append(MetricInstance( + metric_identifier, + metric_fn, + metric_config_entry.get(reference), + metric_config_entry.get(threshold), + metric_presenter + )) + + def update_metrics_on_object(self, annotation, prediction): + """ + Updates metric value corresponding given annotation and prediction objects. + """ + + for metric in self.metrics: + metric.metric_fn.submit(annotation, prediction) + + def update_metrics_on_batch(self, annotation, prediction): + """ + Updates metric value corresponding given batch. + + Args: + annotation: list of batch number of annotation objects. + prediction: list of batch number of prediction objects. + """ + + zipped_transform(self.update_metrics_on_object, annotation, prediction) + + def iterate_metrics(self, annotations, predictions): + for name, functor, reference, threshold, presenter in self.metrics: + yield presenter, EvaluationResult( + name=name, + evaluated_value=functor(annotations, predictions), + reference_value=reference, + threshold=threshold, + meta=functor.meta, + ) diff --git a/tools/accuracy_checker/accuracy_checker/metrics/multilabel_recognition.py b/tools/accuracy_checker/accuracy_checker/metrics/multilabel_recognition.py new file mode 100644 index 0000000..14f107e --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/metrics/multilabel_recognition.py @@ -0,0 +1,189 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np +from .metric import PerImageEvaluationMetric, BaseMetricConfig +from ..representation import MultiLabelRecognitionAnnotation, MultiLabelRecognitionPrediction +from ..config import StringField, BoolField + + +class MultiLabelMetric(PerImageEvaluationMetric): + annotation_types = (MultiLabelRecognitionAnnotation,) + prediction_types = (MultiLabelRecognitionPrediction,) + + def validate_config(self): + class _MultiLabelConfigValidator(BaseMetricConfig): + label_map = StringField(optional=True) + calculate_average = BoolField(optional=True) + + config_validator = _MultiLabelConfigValidator( + 'accuracy', on_extra_argument=_MultiLabelConfigValidator.ERROR_ON_EXTRA_ARGUMENT + ) + config_validator.validate(self.config) + + def configure(self): + label_map = self.config.get('label_map', 'label_map') + self.labels = self.dataset.metadata.get(label_map) + self.calculate_average = self.config.get('calculate_average', True) + + self.meta['scale'] = 1 + self.meta['postfix'] = '' + self.meta['calculate_mean'] = False + self.meta['names'] = list(self.labels.values()) + if self.calculate_average: + self.meta['names'].append('average') + self.tp = np.zeros_like(list(self.labels.keys()), dtype=np.float) + self.fp = np.zeros_like(list(self.labels.keys()), dtype=np.float) + self.tn = np.zeros_like(list(self.labels.keys()), dtype=np.float) + self.fn = np.zeros_like(list(self.labels.keys()), dtype=np.float) + + self.counter = np.zeros_like(list(self.labels.keys()), dtype=np.float) + + def update(self, annotation, prediction): + def loss(annotation_labels, prediction_labels): + tp_result = np.zeros_like(list(self.labels.keys()), dtype=np.float) + fp_results = np.zeros_like(list(self.labels.keys()), dtype=np.float) + tn_results = np.zeros_like(list(self.labels.keys()), dtype=np.float) + fn_results = np.zeros_like(list(self.labels.keys()), dtype=np.float) + + for index, label in enumerate(annotation_labels): + if label == 1 and label == prediction_labels[index]: + tp_result[index] = 1. + continue + + if label == 1 and label != prediction_labels[index]: + fn_results[index] = 1. + continue + + if label == 0 and label == prediction_labels[index]: + tn_results[index] = 1. + continue + + if label == 0 and label != prediction_labels[index]: + fp_results[index] = 1. + continue + + return tp_result, fp_results, tn_results, fn_results + + def counter(annotation_label): + count = np.zeros_like(annotation_label, dtype=float) + cond = np.where(np.array(annotation_label) != -1) + count[cond] = 1. 
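+            # labels annotated as -1 are ignored for the image: they stay zero
+            # here and therefore drop out of the per-class denominator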
+            return count
+
+        tp_upd, fp_upd, tn_upd, fn_upd = loss(annotation.multi_label, prediction.multi_label)
+        self.tp = np.add(self.tp, tp_upd)
+        self.fp = np.add(self.fp, fp_upd)
+        self.tn = np.add(self.tn, tn_upd)
+        self.fn = np.add(self.fn, fn_upd)
+
+        self.counter = np.add(self.counter, counter(annotation.multi_label))
+
+    def evaluate(self, annotations, predictions):
+        pass
+
+
+class MultiLabelAccuracy(MultiLabelMetric):
+    __provider__ = 'multi_accuracy'
+
+    def evaluate(self, annotations, predictions):
+        tp_tn = np.add(self.tp, self.tn, dtype=float)
+        per_class = np.divide(tp_tn, self.counter, out=np.zeros_like(tp_tn, dtype=float), where=self.counter != 0)
+        if not self.calculate_average:
+            return per_class
+        average = np.sum(tp_tn) / np.sum(self.counter)
+
+        return [*per_class, average]
+
+
+class MultiLabelPrecision(MultiLabelMetric):
+    __provider__ = 'multi_precision'
+
+    def evaluate(self, annotations, predictions):
+        tp_fp = np.add(self.tp, self.fp, dtype=float)
+        per_class = np.divide(self.tp, tp_fp, out=np.zeros_like(self.tp, dtype=float), where=tp_fp != 0)
+        if not self.calculate_average:
+            return per_class
+        average = np.sum(self.tp) / np.sum(tp_fp)
+
+        return [*per_class, average]
+
+
+class MultiLabelRecall(MultiLabelMetric):
+    __provider__ = 'multi_recall'
+
+    def evaluate(self, annotations, predictions):
+        tp_fn = np.add(self.tp, self.fn, dtype=float)
+        per_class = np.divide(self.tp, tp_fn, out=np.zeros_like(self.tp, dtype=float), where=tp_fn != 0)
+        if not self.calculate_average:
+            return per_class
+        average = np.sum(self.tp) / np.sum(tp_fn)
+
+        return [*per_class, average]
+
+
+class F1Score(PerImageEvaluationMetric):
+    __provider__ = 'f1-score'
+    annotation_types = (MultiLabelRecognitionAnnotation,)
+    prediction_types = (MultiLabelRecognitionPrediction,)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.precision = MultiLabelPrecision(self.config, self.dataset)
+        self.recall = MultiLabelRecall(self.config, self.dataset)
+
+    def validate_config(self):
+        class _F1ScoreValidator(BaseMetricConfig):
+            label_map = StringField(optional=True)
+            calculate_average = BoolField(optional=True)
+
+        f1_score_config_validator = _F1ScoreValidator(
+            'f1_score', on_extra_argument=_F1ScoreValidator.ERROR_ON_EXTRA_ARGUMENT
+        )
+        f1_score_config_validator.validate(self.config)
+
+    def configure(self):
+        label_map = self.config.get('label_map', 'label_map')
+        self.labels = self.dataset.metadata.get(label_map)
+        self.calculate_average = self.config.get('calculate_average', True)
+        self.meta['names'] = list(self.labels.values())
+        if self.calculate_average:
+            self.meta['names'].append('average')
+
+        self.meta['scale'] = 1
+        self.meta['postfix'] = ''
+        self.meta['calculate_mean'] = False
+
+    def update(self, annotation, prediction):
+        self.precision.update(annotation, prediction)
+        self.recall.update(annotation, prediction)
+
+    def evaluate(self, annotations, predictions):
+        precisions = self.precision.evaluate(annotations, predictions)
+        recalls = self.recall.evaluate(annotations, predictions)
+        # the sub-metrics append an average entry only when calculate_average is
+        # set, so strip it from the per-class slices only in that case
+        if self.calculate_average:
+            per_class_precision, per_class_recall = precisions[:-1], recalls[:-1]
+        else:
+            per_class_precision, per_class_recall = precisions, recalls
+
+        precision_add = np.add(per_class_precision, per_class_recall, dtype=float)
+        precision_multiply = np.multiply(per_class_precision, per_class_recall, dtype=float)
+
+        per_class = 2 * np.divide(
+            precision_multiply, precision_add, out=np.zeros_like(precision_multiply, dtype=float),
+            where=precision_add != 0
+        )
+        if not self.calculate_average:
+            return per_class
+
+        average = 2 * (precisions[-1] * recalls[-1]) / (precisions[-1] + recalls[-1])
+
+        return [*per_class,
average] diff --git a/tools/accuracy_checker/accuracy_checker/metrics/overlap.py b/tools/accuracy_checker/accuracy_checker/metrics/overlap.py new file mode 100644 index 0000000..d9fffc7 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/metrics/overlap.py @@ -0,0 +1,71 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np + +from ..dependency import ClassProvider + + +class Overlap(ClassProvider): + __provider_type__ = 'overlap' + + @staticmethod + def intersections(prediction_box, annotation_boxes): + px_min, py_min, px_max, py_max = prediction_box + ax_mins, ay_mins, ax_maxs, ay_maxs = annotation_boxes + + x_mins = np.maximum(ax_mins, px_min) + y_mins = np.maximum(ay_mins, py_min) + x_maxs = np.minimum(ax_maxs, px_max) + y_maxs = np.minimum(ay_maxs, py_max) + + return x_mins, y_mins, np.maximum(x_mins, x_maxs), np.maximum(y_mins, y_maxs) + + def __init__(self, include_boundaries=None): + self.boundary = 1 if include_boundaries else 0 + + def __call__(self, *args, **kwargs): + return self.evaluate(*args, **kwargs) + + def evaluate(self, prediction_box, annotation_boxes): + raise NotImplementedError + + def area(self, box): + x0, y0, x1, y1 = box + return (x1 - x0 + self.boundary) * (y1 - y0 + self.boundary) + + +class IOU(Overlap): + __provider__ = 'iou' + + def evaluate(self, prediction_box, annotation_boxes): + intersections_area = self.area(self.intersections(prediction_box, annotation_boxes)) + unions = self.area(prediction_box) + self.area(annotation_boxes) - intersections_area + return np.divide( + intersections_area, unions, out=np.zeros_like(intersections_area, dtype=float), where=unions != 0 + ) + + +class IOA(Overlap): + __provider__ = 'ioa' + + def evaluate(self, prediction_box, annotation_boxes): + intersections_area = self.area(self.intersections(prediction_box, annotation_boxes)) + prediction_area = self.area(prediction_box) + return np.divide( + intersections_area, prediction_area, out=np.zeros_like(intersections_area, dtype=float), + where=prediction_area != 0 + ) diff --git a/tools/accuracy_checker/accuracy_checker/metrics/regression.py b/tools/accuracy_checker/accuracy_checker/metrics/regression.py new file mode 100644 index 0000000..894acdc --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/metrics/regression.py @@ -0,0 +1,360 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import warnings +import math +import numpy as np + +from ..representation import ( + RegressionAnnotation, + RegressionPrediction, + FacialLandmarksAnnotation, + FacialLandmarksPrediction, + SuperResolutionAnnotation, + SuperResolutionPrediction, + GazeVectorAnnotation, + GazeVectorPrediction +) + +from .metric import PerImageEvaluationMetric, BaseMetricConfig +from ..config import BaseField, NumberField, BoolField, ConfigError +from ..utils import string_to_tuple, finalize_metric_result + + +class BaseRegressionMetric(PerImageEvaluationMetric): + annotation_types = (RegressionAnnotation, ) + prediction_types = (RegressionPrediction, ) + + def __init__(self, value_differ, *args, **kwargs): + super().__init__(*args, **kwargs) + self.value_differ = value_differ + + def configure(self): + self.meta.update({'names': ['mean', 'std'], 'scale': 1, 'postfix': ' ', 'calculate_mean': False}) + self.magnitude = [] + + def update(self, annotation, prediction): + self.magnitude.append(self.value_differ(annotation.value, prediction.value)) + + def evaluate(self, annotations, predictions): + return np.mean(self.magnitude), np.std(self.magnitude) + + +class BaseIntervalRegressionMetricConfig(BaseMetricConfig): + intervals = BaseField(optional=True) + start = NumberField(optional=True) + end = NumberField(optional=True) + step = NumberField(optional=True) + ignore_values_not_in_interval = BoolField(optional=True) + + +class BaseRegressionOnIntervals(PerImageEvaluationMetric): + annotation_types = (RegressionAnnotation, ) + prediction_types = (RegressionPrediction, ) + + def __init__(self, value_differ, *args, **kwargs): + super().__init__(*args, **kwargs) + self.value_differ = value_differ + + def validate_config(self): + validator = BaseIntervalRegressionMetricConfig( + 'regression_on_intervals_config', + on_extra_argument=BaseIntervalRegressionMetricConfig.ERROR_ON_EXTRA_ARGUMENT + ) + validator.validate(self.config) + + def configure(self): + self.meta.update({'scale': 1, 'postfix': ' ', 'calculate_mean': False}) + self.ignore_out_of_range = self.config.get('ignore_values_not_in_interval', True) + + self.intervals = self.config.get('intervals') + if not self.intervals: + stop = self.config.get('end') + if not stop: + raise ConfigError('intervals or start-step-end of interval should be specified for metric') + + start = self.config.get('start', 0.0) + step = self.config.get('step', 1.0) + self.intervals = np.arange(start, stop + step, step) + + if not isinstance(self.intervals, (list, np.ndarray)): + self.intervals = string_to_tuple(self.intervals) + + self.intervals = np.unique(self.intervals) + self.magnitude = [[] for _ in range(len(self.intervals) + 1)] + + self.meta['names'] = ([]) + if not self.ignore_out_of_range: + self.meta['names'] = (['mean: < ' + str(self.intervals[0]), 'std: < ' + str(self.intervals[0])]) + + for index in range(len(self.intervals) - 1): + self.meta['names'].append('mean: <= ' + str(self.intervals[index]) + ' < ' + str(self.intervals[index + 1])) + self.meta['names'].append('std: <= ' + str(self.intervals[index]) + ' < ' + str(self.intervals[index + 1])) + + if not self.ignore_out_of_range: + self.meta['names'].append('mean: > ' + str(self.intervals[-1])) + self.meta['names'].append('std: > ' + str(self.intervals[-1])) + + def update(self, annotation, prediction): + index = find_interval(annotation.value, self.intervals) + self.magnitude[index].append(self.value_differ(annotation.value, prediction.value)) + + def evaluate(self, annotations, predictions): + if 
self.ignore_out_of_range: + self.magnitude = self.magnitude[1:-1] + + result = [[np.mean(values), np.std(values)] if values else [np.nan, np.nan] for values in self.magnitude] + result, self.meta['names'] = finalize_metric_result(np.reshape(result, -1), self.meta['names']) + + if not result: + warnings.warn("No values in given interval") + result.append(0) + + return result + + +class MeanAbsoluteError(BaseRegressionMetric): + __provider__ = 'mae' + + def __init__(self, *args, **kwargs): + super().__init__(mae_differ, *args, **kwargs) + + +class MeanSquaredError(BaseRegressionMetric): + __provider__ = 'mse' + + def __init__(self, *args, **kwargs): + super().__init__(mse_differ, *args, **kwargs) + + +class RootMeanSquaredError(BaseRegressionMetric): + __provider__ = 'rmse' + + def __init__(self, *args, **kwargs): + super().__init__(mse_differ, *args, **kwargs) + + def evaluate(self, annotations, predictions): + return np.sqrt(np.mean(self.magnitude)), np.sqrt(np.std(self.magnitude)) + + +class MeanAbsoluteErrorOnInterval(BaseRegressionOnIntervals): + __provider__ = 'mae_on_interval' + + def __init__(self, *args, **kwargs): + super().__init__(mae_differ, *args, **kwargs) + + +class MeanSquaredErrorOnInterval(BaseRegressionOnIntervals): + __provider__ = 'mse_on_interval' + + def __init__(self, *args, **kwargs): + super().__init__(mse_differ, *args, **kwargs) + + +class RootMeanSquaredErrorOnInterval(BaseRegressionOnIntervals): + __provider__ = 'rmse_on_interval' + + def __init__(self, *args, **kwargs): + super().__init__(mse_differ, *args, **kwargs) + + def evaluate(self, annotations, predictions): + if self.ignore_out_of_range: + self.magnitude = self.magnitude[1:-1] + + result = [] + for values in self.magnitude: + error = [np.sqrt(np.mean(values)), np.sqrt(np.std(values))] if values else [np.nan, np.nan] + result.append(error) + + result, self.meta['names'] = finalize_metric_result(np.reshape(result, -1), self.meta['names']) + + if not result: + warnings.warn("No values in given interval") + result.append(0) + + return result + + +class FacialLandmarksPerPointNormedError(PerImageEvaluationMetric): + __provider__ = 'per_point_normed_error' + + annotation_types = (FacialLandmarksAnnotation, ) + prediction_types = (FacialLandmarksPrediction, ) + + def configure(self): + self.meta.update({'scale': 1, 'postfix': ' ', 'calculate_mean': True, 'data_format': '{:.4f}'}) + self.magnitude = [] + + def update(self, annotation, prediction): + result = point_regression_differ( + annotation.x_values, annotation.y_values, prediction.x_values, prediction.y_values + ) + result /= np.maximum(annotation.interocular_distance, np.finfo(np.float64).eps) + self.magnitude.append(result) + + def evaluate(self, annotations, predictions): + num_points = np.shape(self.magnitude)[1] + point_result_name_pattern = 'point_{}_normed_error' + self.meta['names'] = [point_result_name_pattern.format(point_id) for point_id in range(num_points)] + per_point_rmse = np.mean(self.magnitude, axis=1) + per_point_rmse, self.meta['names'] = finalize_metric_result(per_point_rmse, self.meta['names']) + + return per_point_rmse + + +class NormedErrorMetricConfig(BaseMetricConfig): + calculate_std = BoolField(optional=True) + percentile = NumberField(optional=True, floats=False, min_value=0, max_value=100) + + +class FacialLandmarksNormedError(PerImageEvaluationMetric): + __provider__ = 'normed_error' + + annotation_types = (FacialLandmarksAnnotation, ) + prediction_types = (FacialLandmarksPrediction, ) + + def validate_config(self): + 
config_validator = NormedErrorMetricConfig( + 'normed_error_config', NormedErrorMetricConfig.ERROR_ON_EXTRA_ARGUMENT + ) + config_validator.validate(self.config) + + def configure(self): + self.calculate_std = self.config.get('calculate_std', False) + self.percentile = self.config.get('percentile') + self.meta.update({ + 'scale': 1, + 'postfix': ' ', + 'calculate_mean': not self.calculate_std or not self.percentile, + 'data_format': '{:.4f}', + 'names': ['mean'] + }) + self.magnitude = [] + + def update(self, annotation, prediction): + per_point_result = point_regression_differ( + annotation.x_values, annotation.y_values, prediction.x_values, prediction.y_values + ) + avg_result = np.sum(per_point_result) / len(per_point_result) + avg_result /= np.maximum(annotation.interocular_distance, np.finfo(np.float64).eps) + self.magnitude.append(avg_result) + + def evaluate(self, annotations, predictions): + result = [np.mean(self.magnitude)] + + if self.calculate_std: + result.append(np.std(self.magnitude)) + self.meta['names'].append('std') + + if self.percentile: + sorted_magnitude = np.sort(self.magnitude) + index = len(self.magnitude) / 100 * self.percentile + result.append(sorted_magnitude[int(index)]) + self.meta['names'].append('{}th percentile'.format(self.percentile)) + + return result + + +def calculate_distance(x_coords, y_coords, selected_points): + first_point = [x_coords[selected_points[0]], y_coords[selected_points[0]]] + second_point = [x_coords[selected_points[1]], y_coords[selected_points[1]]] + return np.linalg.norm(np.subtract(first_point, second_point)) + + +def mae_differ(annotation_val, prediction_val): + return np.abs(annotation_val - prediction_val) + + +def mse_differ(annotation_val, prediction_val): + return (annotation_val - prediction_val)**2 + + +def find_interval(value, intervals): + for index, point in enumerate(intervals): + if value < point: + return index + + return len(intervals) + + +def point_regression_differ(annotation_val_x, annotation_val_y, prediction_val_x, prediction_val_y): + loss = np.subtract(list(zip(annotation_val_x, annotation_val_y)), list(zip(prediction_val_x, prediction_val_y))) + return np.linalg.norm(loss, 2, axis=1) + + +class PeakSignalToNoiseRatio(BaseRegressionMetric): + __provider__ = 'psnr' + + annotation_types = (SuperResolutionAnnotation, ) + prediction_types = (SuperResolutionPrediction, ) + + def __init__(self, *args, **kwargs): + super().__init__(self._psnr_differ, *args, **kwargs) + + def validate_config(self): + class _PSNRConfig(BaseMetricConfig): + scale_border = NumberField(optional=True, min_value=0) + + config_validator = _PSNRConfig('psnr', on_extra_argument=_PSNRConfig.ERROR_ON_EXTRA_ARGUMENT) + config_validator.validate(self.config) + + def configure(self): + super().configure() + self.scale_border = self.config.get('scale_border', 4) + + def _psnr_differ(self, annotation_image, prediction_image): + prediction = np.asarray(prediction_image).astype(np.float) + ground_truth = np.asarray(annotation_image).astype(np.float) + + height, width = prediction.shape[:2] + prediction = prediction[ + self.scale_border:height - self.scale_border, + self.scale_border:width - self.scale_border + ] + ground_truth = ground_truth[ + self.scale_border:height - self.scale_border, + self.scale_border:width - self.scale_border + ] + image_difference = (prediction - ground_truth) / 255. 
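# the per-channel weights below are the 8-bit ITU-R BT.601 RGB-to-luma coefficients, so PSNR is effectively computed on the Y channel;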
# rgb color space + + r_channel_diff = image_difference[:, :, 0] + g_channel_diff = image_difference[:, :, 1] + b_channel_diff = image_difference[:, :, 2] + + channels_diff = (r_channel_diff * 65.738 + g_channel_diff * 129.057 + b_channel_diff * 25.064) / 256 + + mse = np.mean(channels_diff ** 2) + if mse == 0: + return np.Infinity + + return -10 * math.log10(mse) + + +def angle_differ(gt_gaze_vector, predicted_gaze_vector): + return np.arccos( + gt_gaze_vector.dot(predicted_gaze_vector) / np.linalg.norm(gt_gaze_vector) + / np.linalg.norm(predicted_gaze_vector) + ) * 180 / np.pi + + +class AngleError(BaseRegressionMetric): + __provider__ = 'angle_error' + + annotation_types = (GazeVectorAnnotation, ) + prediction_types = (GazeVectorPrediction, ) + + def __init__(self, *args, **kwargs): + super().__init__(angle_differ, *args, **kwargs) diff --git a/tools/accuracy_checker/accuracy_checker/metrics/reid.py b/tools/accuracy_checker/accuracy_checker/metrics/reid.py new file mode 100644 index 0000000..2adf069 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/metrics/reid.py @@ -0,0 +1,379 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from collections import defaultdict, namedtuple +from sklearn.metrics import auc, precision_recall_curve +# noinspection PyProtectedMember +from sklearn.metrics.base import _average_binary_score +import numpy as np + +from ..representation import ( + ReIdentificationClassificationAnnotation, + ReIdentificationAnnotation, + ReIdentificationPrediction +) +from ..config import BaseField, BoolField, NumberField +from .metric import BaseMetricConfig, FullDatasetEvaluationMetric + +PairDesc = namedtuple('PairDesc', 'image1 image2 same') + + +class CMCScore(FullDatasetEvaluationMetric): + """ + Cumulative Matching Characteristics (CMC) score. + + Config: + annotation: reid annotation. + prediction: predicted embeddings. + top_k: number of k highest ranked samples to consider when matching. + separate_camera_set: should identities from the same camera view be filtered out. + single_gallery_shot: each identity has only one instance in the gallery. + number_single_shot_repeats: number of repeats for single_gallery_shot setting. + first_match_break: break on first matched gallery sample. 
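+        The reported metric value is the CMC curve evaluated at rank top_k.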
+ """ + + __provider__ = 'cmc' + + annotation_types = (ReIdentificationAnnotation, ) + prediction_types = (ReIdentificationPrediction, ) + + def validate_config(self): + class _CMCConfigValidator(BaseMetricConfig): + top_k = NumberField(floats=False, min_value=1, optional=True) + separate_camera_set = BoolField(optional=True) + single_gallery_shot = BoolField(optional=True) + first_match_break = BoolField(optional=True) + number_single_shot_repeats = NumberField(floats=False, optional=True) + + validator = _CMCConfigValidator('cmc', on_extra_argument=_CMCConfigValidator.ERROR_ON_EXTRA_ARGUMENT) + validator.validate(self.config) + + def configure(self): + self.top_k = self.config.get('top_k', 1) + self.separate_camera_set = self.config.get('separate_camera_set', False) + self.single_gallery_shot = self.config.get('single_gallery_shot', False) + self.first_match_break = self.config.get('first_match_break', True) + self.number_single_shot_repeats = self.config.get('number_single_shot_repeats', 10) + + def evaluate(self, annotations, predictions): + dist_matrix = distance_matrix(annotations, predictions) + gallery_cameras, gallery_pids, query_cameras, query_pids = get_gallery_query_pids(annotations) + + _cmc_score = eval_cmc( + dist_matrix, query_pids, gallery_pids, query_cameras, gallery_cameras, self.separate_camera_set, + self.single_gallery_shot, self.first_match_break, self.number_single_shot_repeats + ) + + return _cmc_score[self.top_k - 1] + + +class ReidMAP(FullDatasetEvaluationMetric): + """ + Mean Average Precision score. + + Config: + annotation: reid annotation. + prediction: predicted embeddings. + interpolated_auc: should area under precision recall curve be computed using trapezoidal rule or directly. + """ + + __provider__ = 'reid_map' + + annotation_types = (ReIdentificationAnnotation, ) + prediction_types = (ReIdentificationPrediction, ) + + def validate_config(self): + class _ReidMapConfig(BaseMetricConfig): + interpolated_auc = BoolField(optional=True) + + validator = _ReidMapConfig('reid_map', on_extra_argument=_ReidMapConfig.ERROR_ON_EXTRA_ARGUMENT) + validator.validate(self.config) + + def configure(self): + self.interpolated_auc = self.config.get('interpolated_auc', True) + + def evaluate(self, annotations, predictions): + dist_matrix = distance_matrix(annotations, predictions) + gallery_cameras, gallery_pids, query_cameras, query_pids = get_gallery_query_pids(annotations) + + return eval_map( + dist_matrix, query_pids, gallery_pids, query_cameras, gallery_cameras, self.interpolated_auc + ) + + +class PairwiseAccuracy(FullDatasetEvaluationMetric): + __provider__ = 'pairwise_accuracy' + + annotation_types = (ReIdentificationClassificationAnnotation, ) + prediction_types = (ReIdentificationPrediction, ) + + def validate_config(self): + class _PWAccConfig(BaseMetricConfig): + min_score = BaseField(optional=True) + + validator = _PWAccConfig('pairwise_accuracy', on_extra_argument=_PWAccConfig.ERROR_ON_EXTRA_ARGUMENT) + validator.validate(self.config) + + def configure(self): + self.min_score = self.config.get('min_score', 'train_median') + + def evaluate(self, annotations, predictions): + embed_distances, pairs = get_embedding_distances(annotations, predictions) + + min_score = self.min_score + if min_score == 'train_median': + train_distances, _train_pairs = get_embedding_distances(annotations, predictions, train=True) + min_score = np.median(train_distances) + + embed_same_class = embed_distances < min_score + + accuracy = 0 + for i, pair in enumerate(pairs): + same_label 
= pair.same
+            out_same = embed_same_class[i]
+
+            # the pair is predicted correctly when the thresholded distance agrees with the ground truth label
+            correct_prediction = same_label == out_same
+
+            if correct_prediction:
+                accuracy += 1
+
+        return float(accuracy) / len(pairs)
+
+
+class PairwiseAccuracySubsets(FullDatasetEvaluationMetric):
+    __provider__ = 'pairwise_accuracy_subsets'
+
+    annotation_types = (ReIdentificationClassificationAnnotation, )
+    prediction_types = (ReIdentificationPrediction, )
+
+    def validate_config(self):
+        class _PWAccConfig(BaseMetricConfig):
+            subset_number = NumberField(optional=True, min_value=1, floats=False)
+
+        validator = _PWAccConfig('pairwise_accuracy', on_extra_argument=_PWAccConfig.ERROR_ON_EXTRA_ARGUMENT)
+        validator.validate(self.config)
+
+    def configure(self):
+        self.meta['scale'] = 1
+        self.meta['postfix'] = ' '
+        self.subset_num = self.config.get('subset_number', 10)
+        self.accuracy_metric = PairwiseAccuracy(self.config, self.dataset)
+
+    def evaluate(self, annotations, predictions):
+        subset_results = []
+        first_images_annotations = list(filter(
+            lambda annotation: (len(annotation.negative_pairs) > 0 or len(annotation.positive_pairs) > 0), annotations
+        ))
+
+        idx_subsets = self.make_subsets(self.subset_num, len(first_images_annotations))
+        for subset in range(self.subset_num):
+            test_subset = self.get_subset(first_images_annotations, idx_subsets[subset]['test'])
+            test_subset = self.mark_subset(test_subset, False)
+
+            train_subset = self.get_subset(first_images_annotations, idx_subsets[subset]['train'])
+            train_subset = self.mark_subset(train_subset)
+
+            subset_result = self.accuracy_metric.evaluate(test_subset + train_subset, predictions)
+            subset_results.append(subset_result)
+
+        return np.mean(subset_results)
+
+    @staticmethod
+    def make_subsets(subset_num, dataset_size):
+        subsets = []
+        if subset_num > dataset_size:
+            raise ValueError('Dataset can not be divided into more subsets than it has annotations.')
+
+        for subset in range(subset_num):
+            lower_bnd = subset * dataset_size // subset_num
+            upper_bnd = (subset + 1) * dataset_size // subset_num
+            subset_test = [(lower_bnd, upper_bnd)]
+
+            subset_train = [(0, lower_bnd), (upper_bnd, dataset_size)]
+            subsets.append({'test': subset_test, 'train': subset_train})
+
+        return subsets
+
+    @staticmethod
+    def mark_subset(subset_annotations, train=True):
+        for annotation in subset_annotations:
+            annotation.metadata['train'] = train
+
+        return subset_annotations
+
+    @staticmethod
+    def get_subset(container, subset_bounds):
+        subset = []
+        for bound in subset_bounds:
+            subset += container[bound[0]: bound[1]]
+
+        return subset
+
+
+def extract_embeddings(annotation, prediction, query):
+    return np.stack([pred.embedding for pred, ann in zip(prediction, annotation) if ann.query == query])
+
+
+def get_gallery_query_pids(annotation):
+    gallery_pids = np.asarray([ann.person_id for ann in annotation if not ann.query])
+    query_pids = np.asarray([ann.person_id for ann in annotation if ann.query])
+    gallery_cameras = np.asarray([ann.camera_id for ann in annotation if not ann.query])
+    query_cameras = np.asarray([ann.camera_id for ann in annotation if ann.query])
+
+    return gallery_cameras, gallery_pids, query_cameras, query_pids
+
+
+def distance_matrix(annotation, prediction):
+    gallery_embeddings = extract_embeddings(annotation, prediction, query=False)
+    query_embeddings = extract_embeddings(annotation, prediction, query=True)
+
+    return 1.
- np.matmul(gallery_embeddings, np.transpose(query_embeddings)).T + + +def unique_sample(ids_dict, num): + mask = np.zeros(num, dtype=np.bool) + for indices in ids_dict.values(): + mask[np.random.choice(indices)] = True + + return mask + + +def eval_map(distance_mat, query_ids, gallery_ids, query_cams, gallery_cams, interpolated_auc=False): + number_queries, _number_gallery = distance_mat.shape + # Sort and find correct matches + indices = np.argsort(distance_mat, axis=1) + matches = (gallery_ids[indices] == query_ids[:, np.newaxis]) # type: np.ndarray + + # Compute AP for each query + average_precisions = [] + for query in range(number_queries): + # Filter out the same id and same camera + valid = (gallery_ids[indices[query]] != query_ids[query]) | (gallery_cams[indices[query]] != query_cams[query]) + + y_true = matches[query, valid] + y_score = -distance_mat[query][indices[query]][valid] + if not np.any(y_true): + continue + + average_precisions.append(binary_average_precision(y_true, y_score, interpolated_auc=interpolated_auc)) + + if not average_precisions: + raise RuntimeError("No valid query") + + return np.mean(average_precisions) + + +def eval_cmc(distance_mat, query_ids, gallery_ids, query_cams, gallery_cams, separate_camera_set=False, + single_gallery_shot=False, first_match_break=False, number_single_shot_repeats=10, top_k=100): + number_queries, _number_gallery = distance_mat.shape + + if not single_gallery_shot: + number_single_shot_repeats = 1 + + # Sort and find correct matches + indices = np.argsort(distance_mat, axis=1) + matches = gallery_ids[indices] == query_ids[:, np.newaxis] # type: np.ndarray + + # Compute CMC for each query + ret = np.zeros(top_k) + num_valid_queries = 0 + for query in range(number_queries): + valid = get_valid_subset( + gallery_cams, gallery_ids, query, indices, query_cams, query_ids, separate_camera_set + ) # type: np.ndarray + + if not np.any(matches[query, valid]): + continue + + ids_dict = defaultdict(list) + if single_gallery_shot: + gallery_indexes = gallery_ids[indices[query][valid]] + for j, x in zip(np.where(valid)[0], gallery_indexes): + ids_dict[x].append(j) + + for _ in range(number_single_shot_repeats): + if single_gallery_shot: + # Randomly choose one instance for each id + # required for correct validation on CUHK datasets + # http://www.ee.cuhk.edu.hk/~xgwang/CUHK_identification.html + sampled = (valid & unique_sample(ids_dict, len(valid))) + index = np.nonzero(matches[query, sampled])[0] + else: + index = np.nonzero(matches[query, valid])[0] + + delta = 1. 
/ (len(index) * number_single_shot_repeats) + for j, k in enumerate(index): + if k - j >= top_k: + break + if first_match_break: + ret[k - j] += 1 + break + ret[k - j] += delta + + num_valid_queries += 1 + + if num_valid_queries == 0: + raise RuntimeError("No valid query") + + return ret.cumsum() / num_valid_queries + + +def get_valid_subset(gallery_cams, gallery_ids, query_index, indices, query_cams, query_ids, separate_camera_set): + # Filter out the same id and same camera + valid = ( + (gallery_ids[indices[query_index]] != query_ids[query_index]) | + (gallery_cams[indices[query_index]] != query_cams[query_index]) + ) + if separate_camera_set: + # Filter out samples from same camera + valid &= (gallery_cams[indices[query_index]] != query_cams[query_index]) + + return valid + + +def get_embedding_distances(annotation, prediction, train=False): + image_indexes = {} + for i, pred in enumerate(prediction): + image_indexes[pred.identifier] = i + + pairs = [] + for image1 in annotation: + if train != image1.metadata.get("train", False): + continue + + for image2 in image1.positive_pairs: + pairs.append(PairDesc(image_indexes[image1.identifier], image_indexes[image2], True)) + for image2 in image1.negative_pairs: + pairs.append(PairDesc(image_indexes[image1.identifier], image_indexes[image2], False)) + + embed1 = np.asarray([prediction[idx].embedding for idx, _, _ in pairs]) + embed2 = np.asarray([prediction[idx].embedding for _, idx, _ in pairs]) + + return 0.5 * (1 - np.sum(embed1 * embed2, axis=1)), pairs + + +def binary_average_precision(y_true, y_score, interpolated_auc=True): + def _average_precision(y_true_, y_score_, sample_weight=None): + precision, recall, _ = precision_recall_curve(y_true_, y_score_, sample_weight) + if not interpolated_auc: + # Return the step function integral + # The following works because the last entry of precision is + # guaranteed to be 1, as returned by precision_recall_curve + return -1 * np.sum(np.diff(recall) * np.array(precision)[:-1]) + + return auc(recall, precision) + + return _average_binary_score(_average_precision, y_true, y_score, average="macro") diff --git a/tools/accuracy_checker/accuracy_checker/metrics/semantic_segmentation.py b/tools/accuracy_checker/accuracy_checker/metrics/semantic_segmentation.py new file mode 100644 index 0000000..d418de0 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/metrics/semantic_segmentation.py @@ -0,0 +1,139 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import numpy as np + +from ..config import BoolField +from ..representation import ( + SegmentationAnnotation, + SegmentationPrediction, + BrainTumorSegmentationAnnotation, + BrainTumorSegmentationPrediction +) +from .metric import PerImageEvaluationMetric, BaseMetricConfig +from ..utils import finalize_metric_result + + +class SegmentationMetricConfig(BaseMetricConfig): + use_argmax = BoolField(optional=True) + + +class SegmentationMetric(PerImageEvaluationMetric): + annotation_types = (SegmentationAnnotation, ) + prediction_types = (SegmentationPrediction, ) + + CONFUSION_MATRIX_KEY = 'segmentation_confusion_matrix' + + def evaluate(self, annotations, predictions): + raise NotImplementedError + + def validate_config(self): + config_validator = SegmentationMetricConfig( + 'SemanticSegmentation_config', SegmentationMetricConfig.ERROR_ON_EXTRA_ARGUMENT + ) + config_validator.validate(self.config) + + def configure(self): + self.use_argmax = self.config.get('use_argmax', True) + + def update(self, annotation, prediction): + n_classes = len(self.dataset.labels) + prediction_mask = np.argmax(prediction.mask, axis=0) if self.use_argmax else prediction.mask.astype('int64') + + def update_confusion_matrix(confusion_matrix): + label_true = annotation.mask.flatten() + label_pred = prediction_mask.flatten() + + mask = (label_true >= 0) & (label_true < n_classes) + hist = np.bincount(n_classes * label_true[mask].astype(int) + label_pred[mask], minlength=n_classes ** 2) + hist = hist.reshape(n_classes, n_classes) + confusion_matrix += hist + + return confusion_matrix + + self._update_state(update_confusion_matrix, self.CONFUSION_MATRIX_KEY, lambda: np.zeros((n_classes, n_classes))) + + +class SegmentationAccuracy(SegmentationMetric): + __provider__ = 'segmentation_accuracy' + + def evaluate(self, annotations, predictions): + confusion_matrix = self.state[self.CONFUSION_MATRIX_KEY] + return np.diag(confusion_matrix).sum() / confusion_matrix.sum() + + +class SegmentationIOU(SegmentationMetric): + __provider__ = 'mean_iou' + + def evaluate(self, annotations, predictions): + confusion_matrix = self.state[self.CONFUSION_MATRIX_KEY] + union = confusion_matrix.sum(axis=1) + confusion_matrix.sum(axis=0) - np.diag(confusion_matrix) + diagonal = np.diag(confusion_matrix) + iou = np.divide(diagonal, union, out=np.zeros_like(diagonal), where=union != 0) + + values, names = finalize_metric_result(iou, list(self.dataset.labels.values())) + self.meta['names'] = names + + return values + + +class SegmentationMeanAccuracy(SegmentationMetric): + __provider__ = 'mean_accuracy' + + def evaluate(self, annotations, predictions): + confusion_matrix = self.state[self.CONFUSION_MATRIX_KEY] + diagonal = np.diag(confusion_matrix) + per_class_count = confusion_matrix.sum(axis=1) + acc_cls = np.divide(diagonal, per_class_count, out=np.zeros_like(diagonal), where=per_class_count != 0) + + values, names = finalize_metric_result(acc_cls, list(self.dataset.labels.values())) + self.meta['names'] = names + + return values + + +class SegmentationFWAcc(SegmentationMetric): + __provider__ = 'frequency_weighted_accuracy' + + def evaluate(self, annotations, predictions): + confusion_matrix = self.state[self.CONFUSION_MATRIX_KEY] + + union = (confusion_matrix.sum(axis=1) + confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)) + diagonal = np.diag(confusion_matrix) + iou = np.divide(diagonal, union, out=np.zeros_like(diagonal), where=union != 0) + freq = confusion_matrix.sum(axis=1) / confusion_matrix.sum() + + return (freq[freq > 
0] * iou[freq > 0]).sum() + + +class SegmentationDSCAcc(PerImageEvaluationMetric): + __provider__ = 'dice' + annotation_types = (BrainTumorSegmentationAnnotation,) + prediction_types = (BrainTumorSegmentationPrediction,) + overall_metric = [] + + def update(self, annotation, prediction): + cnt = 0 + for prediction_mask, annotation_mask in zip(prediction.mask, annotation.mask): + annotation_mask = np.transpose(annotation_mask, (2, 0, 1)) + annotation_mask = np.expand_dims(annotation_mask, 0) + numerator = np.sum(prediction_mask * annotation_mask) * 2.0 + 1.0 + denominator = np.sum(annotation_mask) + np.sum(prediction_mask) + 1.0 + self.overall_metric.append(numerator / denominator) + cnt += 1 + + def evaluate(self, annotations, predictions): + return sum(self.overall_metric) / len(self.overall_metric) diff --git a/tools/accuracy_checker/accuracy_checker/metrics/text_detection.py b/tools/accuracy_checker/accuracy_checker/metrics/text_detection.py new file mode 100644 index 0000000..65f8481 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/metrics/text_detection.py @@ -0,0 +1,124 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np +from .metric import PerImageEvaluationMetric, BaseMetricConfig +from ..config import BoolField, NumberField +from ..representation import TextDetectionPrediction, TextDetectionAnnotation +from ..utils import polygon_from_points + + +def get_union(detection_polygon, annotation_polygon): + area_prediction = detection_polygon.area + area_annotation = annotation_polygon.area + return area_prediction + area_annotation - get_intersection_area(detection_polygon, annotation_polygon) + + +def get_intersection_over_union(detection_polygon, annotation_polygon): + union = get_union(detection_polygon, annotation_polygon) + intersection = get_intersection_area(detection_polygon, annotation_polygon) + return intersection / union if union != 0 else 0.0 + + +def get_intersection_area(detection_polygon, annotation_polygon): + return detection_polygon.intersection(annotation_polygon).area + + +class TextDetectionMetricConfig(BaseMetricConfig): + iou_constrain = NumberField(min_value=0, max_value=1, optional=True) + ignore_difficult = BoolField(optional=True) + area_precision_constrain = NumberField(min_value=0, max_value=1, optional=True) + + +class TextDetectionMetric(PerImageEvaluationMetric): + __provider__ = 'text_detection' + + annotation_types = (TextDetectionAnnotation, ) + prediction_types = (TextDetectionPrediction, ) + + def validate_config(self): + text_detection_metric_config = TextDetectionMetricConfig( + 'TextDetectionMetric_config', TextDetectionMetricConfig.ERROR_ON_EXTRA_ARGUMENT + ) + text_detection_metric_config.validate(self.config) + + def configure(self): + self.iou_constrain = self.config.get('iou_constrain', 0.5) + self.area_precision_constrain = self.config.get('area_precision_constrain', 0.5) + self.ignore_difficult = self.config.get('ignore_difficult', False) + self.number_matched_detections = 0 + 
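# running totals accumulated per image in update(); evaluate() turns them into recall, precision and the resulting F-measure
+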
self.number_valid_annotations = 0 + self.number_valid_detections = 0 + + def update(self, annotation, prediction): + gt_polygons = list(map(polygon_from_points, annotation.points)) + prediction_polygons = list(map(polygon_from_points, prediction.points)) + num_gt = len(gt_polygons) + num_det = len(prediction_polygons) + gt_difficult_mask = np.full(num_gt, False) + prediction_difficult_mask = np.full(num_det, False) + num_det_matched = 0 + if self.ignore_difficult: + gt_difficult_inds = annotation.metadata.get('difficult_boxes', []) + prediction_difficult_inds = prediction.metadata.get('difficult_boxes', []) + gt_difficult_mask[gt_difficult_inds] = True + prediction_difficult_mask[prediction_difficult_inds] = True + for det_id, detection_polygon in enumerate(prediction_polygons): + for gt_difficult_id in gt_difficult_inds: + gt_difficult_polygon = gt_polygons[gt_difficult_id] + intersected_area = get_intersection_area(gt_difficult_polygon, detection_polygon) + pd_dimensions = detection_polygon.area + precision = 0 if pd_dimensions == 0 else intersected_area / pd_dimensions + + if precision >= self.area_precision_constrain: + prediction_difficult_mask[det_id] = True + + if num_gt > 0 and num_det > 0: + iou_matrix = np.empty((num_gt, num_det)) + gt_matched = np.zeros(num_gt, np.int8) + det_matched = np.zeros(num_det, np.int8) + + for gt_id, gt_polygon in enumerate(gt_polygons): + for pred_id, pred_polygon in enumerate(prediction_polygons): + iou_matrix[gt_id, pred_id] = get_intersection_over_union(pred_polygon, gt_polygon) + not_matched_before = gt_matched[gt_id] == 0 and det_matched[pred_id] == 0 + not_difficult = not gt_difficult_mask[gt_id] and not prediction_difficult_mask[pred_id] + if not_matched_before and not_difficult: + if iou_matrix[gt_id, pred_id] >= self.iou_constrain: + gt_matched[gt_id] = 1 + det_matched[pred_id] = 1 + num_det_matched += 1 + + num_ignored_gt = np.sum(gt_difficult_mask) + num_ignored_pred = np.sum(prediction_difficult_mask) + num_valid_gt = num_gt - num_ignored_gt + num_valid_pred = num_det - num_ignored_pred + + self.number_matched_detections += num_det_matched + self.number_valid_annotations += num_valid_gt + self.number_valid_detections += num_valid_pred + + def evaluate(self, annotations, predictions): + recall = ( + 0 if self.number_valid_annotations == 0 + else float(self.number_matched_detections) / self.number_valid_annotations + ) + precision = ( + 0 if self.number_valid_detections == 0 + else float(self.number_matched_detections) / self.number_valid_detections + ) + + return 0 if recall + precision == 0 else 2 * recall * precision / (recall + precision) diff --git a/tools/accuracy_checker/accuracy_checker/model_evaluator.py b/tools/accuracy_checker/accuracy_checker/model_evaluator.py new file mode 100644 index 0000000..65c9815 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/model_evaluator.py @@ -0,0 +1,132 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import copy +import pickle + +from .utils import get_path +from .dataset import Dataset +from .launcher import create_launcher, DummyLauncher +from .launcher.loaders import PickleLoader +from .logging import print_info +from .metrics import MetricsExecutor +from .postprocessor import PostprocessingExecutor +from .preprocessor import PreprocessingExecutor + + +class ModelEvaluator: + def __init__(self, launcher, preprocessor, postprocessor, dataset, metric): + self.launcher = launcher + self.preprocessor = preprocessor + self.postprocessor = postprocessor + self.dataset = dataset + self.metric_executor = metric + + self._annotations = [] + self._predictions = [] + + @classmethod + def from_configs(cls, launcher_config, dataset_config): + dataset_name = dataset_config['name'] + preprocessor = PreprocessingExecutor(dataset_config.get('preprocessing'), dataset_name) + dataset = Dataset(dataset_config, preprocessor) + + launcher = create_launcher(launcher_config, dataset.metadata) + postprocessor = PostprocessingExecutor(dataset_config.get('postprocessing'), dataset_name, dataset.metadata) + metric_dispatcher = MetricsExecutor(dataset_config, dataset) + + return cls(launcher, preprocessor, postprocessor, dataset, metric_dispatcher) + + def process_dataset(self, stored_predictions, progress_reporter, *args, **kwargs): + if self._is_stored(stored_predictions) or isinstance(self.launcher, DummyLauncher): + self._annotations, self._predictions = self.load(stored_predictions, progress_reporter) + self._annotations, self._predictions = self.postprocessor.full_process(self._annotations, self._predictions) + + self.metric_executor.update_metrics_on_batch(self._annotations, self._predictions) + return self._annotations, self._predictions + + self.dataset.batch = self.launcher.batch + predictions_to_store = [] + for batch_id, (batch_annotation, batch_input) in enumerate(self.dataset): + batch_identifiers = [annotation.identifier for annotation in batch_annotation] + batch_predictions = self.launcher.predict(batch_identifiers, batch_input, *args, **kwargs) + + if stored_predictions: + predictions_to_store.extend(copy.deepcopy(batch_predictions)) + + annotations, predictions = self.postprocessor.process_batch(batch_annotation, batch_predictions) + if not self.postprocessor.has_dataset_processors: + self.metric_executor.update_metrics_on_batch(annotations, predictions) + + self._annotations.extend(annotations) + self._predictions.extend(predictions) + + if progress_reporter: + progress_reporter.update(batch_id, len(batch_predictions)) + + if progress_reporter: + progress_reporter.finish() + + if stored_predictions: + self.store_predictions(stored_predictions, predictions_to_store) + + if self.postprocessor.has_dataset_processors: + self.metric_executor.update_metrics_on_batch(self._annotations, self._predictions) + + return self.postprocessor.process_dataset(self._annotations, self._predictions) + + @staticmethod + def _is_stored(stored_predictions=None): + if not stored_predictions: + return False + + try: + get_path(stored_predictions) + return True + except OSError: + return False + + def compute_metrics(self, output_callback=None, ignore_results_formatting=False): + for result_presenter, evaluated_metric in self.metric_executor.iterate_metrics( + self._annotations, self._predictions): + result_presenter.write_result(evaluated_metric, output_callback, ignore_results_formatting) + + def load(self, stored_predictions, progress_reporter): + self._annotations = self.dataset.annotation + launcher = 
self.launcher
+        if not isinstance(launcher, DummyLauncher):
+            launcher = DummyLauncher({
+                'framework': 'dummy',
+                'loader': PickleLoader.__provider__,
+                'data_path': stored_predictions
+            }, adapter=None)
+
+        predictions = launcher.predict([annotation.identifier for annotation in self._annotations])
+
+        if progress_reporter:
+            progress_reporter.finish(False)
+
+        return self._annotations, predictions
+
+    @staticmethod
+    def store_predictions(stored_predictions, predictions):
+        # the file does not exist before the first run, so it can not be validated as a pathlib.Path object here
+        with open(stored_predictions, "wb") as content:
+            pickle.dump(predictions, content)
+        print_info("prediction objects are saved to {}".format(stored_predictions))
+
+    def release(self):
+        self.launcher.release()
diff --git a/tools/accuracy_checker/accuracy_checker/postprocessor/README.md b/tools/accuracy_checker/accuracy_checker/postprocessor/README.md
new file mode 100644
index 0000000..752276a
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/postprocessor/README.md
@@ -0,0 +1,40 @@
+# Postprocessors
+
+A postprocessor is a function that processes prediction and/or annotation data after model inference and before metric calculation. Postprocessors require specific representation formats to work correctly
+(e.g. the clip boxes postprocessor expects a detection annotation and a detection prediction).
+
+If you use a complicated representation located in a representation container and want to process only specific representations, you can add the options `annotation_source` and `prediction_source` to the configuration file;
+otherwise, the postprocessor is applied to all suitable representations. `annotation_source` and `prediction_source` should contain
+comma-separated lists of annotation identifiers and output layer names, respectively.
+
+Every postprocessor has parameters available for configuration.
+
+Accuracy Checker supports the following set of postprocessors:
+
+* `cast_to_int` - casts detection bounding box coordinates given in floating point format to integers. Supported representations: `DetectionAnnotation`, `DetectionPrediction`, `TextDetectionAnnotation`, `TextDetectionPrediction`.
+  * `round_policy` - method for rounding: `nearest`, `greater`, `lower`, `nearest_to_zero`.
+* `clip_boxes` - clips detection bounding box coordinates. Supported representations: `DetectionAnnotation`, `DetectionPrediction`.
+  * `dst_width` and `dst_height` - destination width and height for box clipping, respectively. You can also use `size` instead when the destination sizes are equal.
+  * `apply_to` - option which determines the target boxes for processing (`annotation` for ground truth boxes, `prediction` for detection results, `all` for both).
+  * `boxes_normalized` - flag indicating that the target bounding boxes are in normalized format.
+* `correct_yolo_v2_boxes` - resizes detection prediction bounding box coordinates using the approach specific to YOLO v2. Supported representations: `DetectionAnnotation`, `DetectionPrediction`.
+  * `dst_width` and `dst_height` - destination width and height, respectively. You can also use `size` instead when the destination sizes are equal.
+* `encode_segmentation_mask` - encodes a segmentation label image as a segmentation mask. Supported representations: `SegmentationAnnotation`, `SegmentationPrediction`.
+* `resize_prediction_boxes` - resizes normalized detection prediction boxes according to the image size. Supported representations: `DetectionAnnotation`, `DetectionPrediction`.
+* `resize_segmentation_mask` - resizes a segmentation mask. Supported representations: `SegmentationAnnotation`, `SegmentationPrediction`.
+  * `dst_width` and `dst_height` - destination width and height for resizing, respectively. You can also use `size` instead when the destination sizes are equal.
+    If any of these parameters is not specified, the image size is used by default.
+  * `apply_to` - determines the target masks for processing (`annotation` for ground truth masks, `prediction` for predicted masks, `all` for both).
+* `nms` - non-maximum suppression. Supported representations: `DetectionAnnotation`, `DetectionPrediction`.
+  * `overlap` - overlap threshold for merging detections.
+* `filter` - filters data by different parameters. Supported representations: `DetectionAnnotation`, `DetectionPrediction`.
+  * `apply_to` - determines the target boxes for processing (`annotation` for ground truth boxes, `prediction` for detection results, `all` for both).
+  * `remove_filtered` - removes filtered data. By default, annotations ignore filtered data without removing it; in all other cases filtered data is removed automatically.
+  * Supported parameters for filtering: `labels`, `min_confidence`, `height_range`, `width_range`, `is_empty`, `min_visibility`, `aspect_ratio`, `area_ratio`, `area_range`.
+    Filtering by `height_range` and `width_range` is also available for `TextDetectionAnnotation` and `TextDetectionPrediction`; filtering by `area_range` - for `PoseEstimationAnnotation`, `PoseEstimationPrediction`, `TextDetectionAnnotation` and `TextDetectionPrediction`.
+* `normalize_landmarks_points` - normalizes ground truth landmark points. Supported representations: `FacialLandmarksAnnotation`, `FacialLandmarksPrediction`.
+  * `use_annotation_rect` - allows using the size of the rectangle saved in the annotation metadata for point scaling instead of the source image size.
+* `extend_segmentation_mask` - extends the annotation segmentation mask to the predicted mask size, filling the border with a specific value. Supported representations: `SegmentationAnnotation`, `SegmentationPrediction`.
+  * `filling_label` - value for filling the border (default 255).
+* `zoom_segmentation_mask` - zooms a segmentation mask. Supported representations: `SegmentationAnnotation`, `SegmentationPrediction`.
+  * `zoom` - size for the zoom operation.
diff --git a/tools/accuracy_checker/accuracy_checker/postprocessor/__init__.py b/tools/accuracy_checker/accuracy_checker/postprocessor/__init__.py
new file mode 100644
index 0000000..c3a93bd
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/postprocessor/__init__.py
@@ -0,0 +1,69 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+""" + +from .postprocessing_executor import PostprocessingExecutor + +from .filter import ( + FilterPostprocessor, + + FilterByHeightRange, + FilterByLabels, + FilterByMinConfidence, + FilterEmpty, + FilterByVisibility, + FilterByAspectRatio +) + +from .cast_to_int import CastToInt +from .clip_boxes import ClipBoxes +from .nms import NMS +from .resize_prediction_boxes import ResizePredictionBoxes +from .correct_yolo_v2_boxes import CorrectYoloV2Boxes +from .resize_segmentation_mask import ResizeSegmentationMask +from .encode_segmentation_mask import EncodeSegMask +from .normalize_landmarks_points import NormalizeLandmarksPoints +from .clip_points import ClipPoints +from .extend_segmentation_mask import ExtendSegmentationMask +from .zoom_segmentation_mask import ZoomSegMask +from .crop_segmentation_mask import CropSegmentationMask +from .clip_segmentation_mask import ClipSegmentationMask + +__all__ = [ + 'PostprocessingExecutor', + + 'FilterPostprocessor', + 'FilterByHeightRange', + 'FilterByLabels', + 'FilterByMinConfidence', + 'FilterEmpty', + 'FilterByVisibility', + 'FilterByAspectRatio', + + 'CastToInt', + 'ClipBoxes', + 'NMS', + 'ResizePredictionBoxes', + 'CorrectYoloV2Boxes', + + 'ResizeSegmentationMask', + 'EncodeSegMask', + 'ExtendSegmentationMask', + 'ZoomSegMask', + 'CropSegmentationMask', + 'ClipSegmentationMask', + + 'NormalizeLandmarksPoints' +] diff --git a/tools/accuracy_checker/accuracy_checker/postprocessor/cast_to_int.py b/tools/accuracy_checker/accuracy_checker/postprocessor/cast_to_int.py new file mode 100644 index 0000000..cd6e29a --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/postprocessor/cast_to_int.py @@ -0,0 +1,71 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +from functools import singledispatch +from typing import Union +import numpy as np +from ..config import StringField +from ..representation import DetectionAnnotation, DetectionPrediction, TextDetectionPrediction, TextDetectionAnnotation +from .postprocessor import Postprocessor, BasePostprocessorConfig + + +class CastToInt(Postprocessor): + __provider__ = 'cast_to_int' + annotation_types = (DetectionAnnotation, TextDetectionAnnotation) + prediction_types = (DetectionPrediction, TextDetectionPrediction) + + round_policies_func = { + 'nearest': np.rint, + 'nearest_to_zero': np.trunc, + 'lower': np.floor, + 'greater': np.ceil + } + + def validate_config(self): + class _CastToIntConfigValidator(BasePostprocessorConfig): + round_policy = StringField(optional=True, choices=self.round_policies_func.keys()) + + cast_to_int_config_validator = _CastToIntConfigValidator( + self.__provider__, on_extra_argument=_CastToIntConfigValidator.ERROR_ON_EXTRA_ARGUMENT + ) + cast_to_int_config_validator.validate(self.config) + + def configure(self): + self.round_func = self.round_policies_func[self.config.get('round_policy', 'nearest')] + + def process_image(self, annotation, prediction): + @singledispatch + def cast(entry): + pass + + @cast.register(Union[DetectionAnnotation, DetectionPrediction]) + def _(entry): + entry.x_mins = self.round_func(entry.x_mins) + entry.x_maxs = self.round_func(entry.x_maxs) + entry.y_mins = self.round_func(entry.y_mins) + entry.y_maxs = self.round_func(entry.y_maxs) + + @cast.register(Union[TextDetectionAnnotation, TextDetectionPrediction]) + def _(entry): + entry.points = self.round_func(entry.points) + + + for annotation_ in annotation: + cast(annotation_) + + for prediction_ in prediction: + cast(prediction_) + + return annotation, prediction diff --git a/tools/accuracy_checker/accuracy_checker/postprocessor/clip_boxes.py b/tools/accuracy_checker/accuracy_checker/postprocessor/clip_boxes.py new file mode 100644 index 0000000..dd87f10 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/postprocessor/clip_boxes.py @@ -0,0 +1,68 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from ..config import BoolField, NumberField +from ..representation import DetectionPrediction, DetectionAnnotation +from .postprocessor import PostprocessorWithSpecificTargets, PostprocessorWithTargetsConfigValidator + + +class ClipBoxes(PostprocessorWithSpecificTargets): + __provider__ = 'clip_boxes' + + annotation_types = (DetectionAnnotation, ) + prediction_types = (DetectionPrediction, ) + + def validate_config(self): + class _ClipConfigValidator(PostprocessorWithTargetsConfigValidator): + dst_width = NumberField(floats=False, optional=True, min_value=1) + dst_height = NumberField(floats=False, optional=True, min_value=1) + size = NumberField(floats=False, optional=True, min_value=1) + boxes_normalized = BoolField(optional=True) + + clip_config_validator = _ClipConfigValidator( + self.__provider__, on_extra_argument=_ClipConfigValidator.ERROR_ON_EXTRA_ARGUMENT + ) + clip_config_validator.validate(self.config) + + def configure(self): + size = self.config.get('size') + self.dst_height = size or self.config.get('dst_height') + self.dst_width = size or self.config.get('dst_width') + + self.boxes_normalized = self.config.get('boxes_normalized', False) + + def process_image(self, annotation, prediction): + target_height = self.dst_height or self.image_size[0] + target_width = self.dst_width or self.image_size[1] + + max_width = target_width if not self.boxes_normalized else 1 + max_height = target_height if not self.boxes_normalized else 1 + + for target in annotation: + self._clip_boxes(target, (0, max_width), (0, max_height)) + for target in prediction: + self._clip_boxes(target, (0, max_width), (0, max_height)) + + return annotation, prediction + + @staticmethod + def _clip_boxes(entry, width_range, height_range): + entry.x_mins = entry.x_mins.clip(width_range[0], width_range[1]) + entry.x_maxs = entry.x_maxs.clip(width_range[0], width_range[1]) + entry.y_mins = entry.y_mins.clip(height_range[0], height_range[1]) + entry.y_maxs = entry.y_maxs.clip(height_range[0], height_range[1]) + + return entry diff --git a/tools/accuracy_checker/accuracy_checker/postprocessor/clip_points.py b/tools/accuracy_checker/accuracy_checker/postprocessor/clip_points.py new file mode 100644 index 0000000..3ffd3a5 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/postprocessor/clip_points.py @@ -0,0 +1,68 @@ +"""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import numpy as np +from ..config import BoolField, NumberField +from ..representation import TextDetectionAnnotation, TextDetectionPrediction +from ..utils import get_size_from_config +from .postprocessor import PostprocessorWithSpecificTargets, PostprocessorWithTargetsConfigValidator + + +class ClipPointsConfigValidator(PostprocessorWithTargetsConfigValidator): + dst_width = NumberField(floats=False, optional=True, min_value=1) + dst_height = NumberField(floats=False, optional=True, min_value=1) + size = NumberField(floats=False, optional=True, min_value=1) + points_normalized = BoolField(optional=True) + + +class ClipPoints(PostprocessorWithSpecificTargets): + __provider__ = 'clip_points' + + annotation_types = (TextDetectionAnnotation, ) + prediction_types = (TextDetectionPrediction, ) + + def validate_config(self): + clip_points_config_validator = ClipPointsConfigValidator( + self.__provider__, on_extra_argument=ClipPointsConfigValidator.ERROR_ON_EXTRA_ARGUMENT + ) + clip_points_config_validator.validate(self.config) + + def configure(self): + self.dst_height, self.dst_width = get_size_from_config(self.config, allow_none=True) + self.points_normalized = self.config.get('points_normalized', False) + + def process_image(self, annotation, prediction): + target_width = self.dst_width or self.image_size[1] - 1 + target_height = self.dst_height or self.image_size[0] - 1 + + max_width = target_width if not self.points_normalized else 1 + max_height = target_height if not self.points_normalized else 1 + for target in annotation: + points = [] + for polygon in target.points: + polygon[:, 0] = np.clip(polygon[:, 0], 0, max_width) + polygon[:, 1] = np.clip(polygon[:, 1], 0, max_height) + points.append(polygon) + target.points = points + for target in prediction: + points = [] + for polygon in target.points: + polygon[:, 0] = np.clip(polygon[:, 0], 0, max_width) + polygon[:, 1] = np.clip(polygon[:, 1], 0, max_height) + points.append(polygon) + target.points = points + + return annotation, prediction diff --git a/tools/accuracy_checker/accuracy_checker/postprocessor/clip_segmentation_mask.py b/tools/accuracy_checker/accuracy_checker/postprocessor/clip_segmentation_mask.py new file mode 100644 index 0000000..7a01464 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/postprocessor/clip_segmentation_mask.py @@ -0,0 +1,48 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +import numpy as np +from .postprocessor import PostprocessorWithSpecificTargets, PostprocessorWithTargetsConfigValidator +from ..representation import BrainTumorSegmentationAnnotation, BrainTumorSegmentationPrediction +from ..config import NumberField, ConfigError + + +class ClipSegmentationMask(PostprocessorWithSpecificTargets): + __provider__ = 'clip_segmentation_mask' + + annotation_types = (BrainTumorSegmentationAnnotation,) + prediction_types = (BrainTumorSegmentationPrediction,) + + def validate_config(self): + class _ConfigValidator(PostprocessorWithTargetsConfigValidator): + min_value = NumberField(floats=False, min_value=0, optional=True) + max_value = NumberField(floats=False) + + _ConfigValidator(self.name, on_extra_argument=_ConfigValidator.ERROR_ON_EXTRA_ARGUMENT).validate(self.config) + + def configure(self): + self.min_value = self.config.get('min_value', 0) + self.max_value = self.config['max_value'] + if self.max_value < self.min_value: + raise ConfigError('max_value should be greater than min_value') + + def process_image(self, annotation, prediction): + for target in annotation: + target.mask = np.clip(target.mask, a_min=self.min_value, a_max=self.max_value) + + for target in prediction: + target.mask = np.clip(target.mask, a_min=self.min_value, a_max=self.max_value) + + return annotation, prediction diff --git a/tools/accuracy_checker/accuracy_checker/postprocessor/correct_yolo_v2_boxes.py b/tools/accuracy_checker/accuracy_checker/postprocessor/correct_yolo_v2_boxes.py new file mode 100644 index 0000000..b37be37 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/postprocessor/correct_yolo_v2_boxes.py @@ -0,0 +1,75 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from ..config import NumberField +from .postprocessor import BasePostprocessorConfig, Postprocessor +from ..representation import DetectionPrediction, DetectionAnnotation +from ..utils import get_size_from_config + + +class CorrectYoloV2Boxes(Postprocessor): + __provider__ = 'correct_yolo_v2_boxes' + + prediction_types = (DetectionPrediction, ) + annotation_types = (DetectionAnnotation, ) + + def validate_config(self): + class _CorrectYoloV2BoxesConfigValidator(BasePostprocessorConfig): + dst_width = NumberField(floats=False, optional=True, min_value=1) + dst_height = NumberField(floats=False, optional=True, min_value=1) + size = NumberField(floats=False, optional=True, min_value=1) + + clip_config_validator = _CorrectYoloV2BoxesConfigValidator( + self.__provider__, on_extra_argument=_CorrectYoloV2BoxesConfigValidator.ERROR_ON_EXTRA_ARGUMENT + ) + clip_config_validator.validate(self.config) + + def configure(self): + self.dst_height, self.dst_width = get_size_from_config(self.config) + + def process_image(self, annotation, prediction): + dst_h, dst_w = self.dst_height, self.dst_width + # postprocessor always expects lists of annotations and predictions for the same image + # we do not need to get image sizes in cycle, because they are equal + img_h, img_w, _ = self.image_size + + if (dst_w / img_w) < (dst_h / img_h): + new_w = dst_w + new_h = (img_h * dst_w) // img_w + else: + new_h = dst_h + new_w = (img_w * dst_h) // img_h + + for prediction_ in prediction: + coordinates = zip(prediction_.x_mins, prediction_.y_mins, prediction_.x_maxs, prediction_.y_maxs) + for i, (x0, y0, x1, y1) in enumerate(coordinates): + box = [(x0 + x1) / 2.0, (y0 + y1) / 2.0, x1 - x0, y1 - y0] + box[0] = (box[0] - (dst_w - new_w) / (2.0 * dst_w)) * (dst_w / new_w) + box[1] = (box[1] - (dst_h - new_h) / (2.0 * dst_h)) * (dst_h / new_h) + box[2] *= dst_w / new_w + box[3] *= dst_h / new_h + + box[0] *= img_w + box[1] *= img_h + box[2] *= img_w + box[3] *= img_h + + prediction_.x_mins[i] = box[0] - box[2] / 2.0 + 1 + prediction_.y_mins[i] = box[1] - box[3] / 2.0 + 1 + prediction_.x_maxs[i] = box[0] + box[2] / 2.0 + 1 + prediction_.y_maxs[i] = box[1] + box[3] / 2.0 + 1 + + return annotation, prediction diff --git a/tools/accuracy_checker/accuracy_checker/postprocessor/crop_segmentation_mask.py b/tools/accuracy_checker/accuracy_checker/postprocessor/crop_segmentation_mask.py new file mode 100644 index 0000000..dd814fe --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/postprocessor/crop_segmentation_mask.py @@ -0,0 +1,49 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from .postprocessor import PostprocessorWithSpecificTargets, PostprocessorWithTargetsConfigValidator +from ..representation import BrainTumorSegmentationAnnotation, BrainTumorSegmentationPrediction +from ..config import NumberField +from ..preprocessor import Crop3D +from ..utils import get_size_3d_from_config + + +class CropSegmentationMask(PostprocessorWithSpecificTargets): + __provider__ = 'crop_segmentation_mask' + + annotation_types = (BrainTumorSegmentationAnnotation,) + prediction_types = (BrainTumorSegmentationPrediction,) + + def validate_config(self): + class _ConfigValidator(PostprocessorWithTargetsConfigValidator): + size = NumberField(floats=False, min_value=1) + dst_width = NumberField(floats=False, optional=True, min_value=1) + dst_height = NumberField(floats=False, optional=True, min_value=1) + dst_volume = NumberField(floats=False, optional=True, min_value=1) + + _ConfigValidator(self.name, on_extra_argument=_ConfigValidator.ERROR_ON_EXTRA_ARGUMENT).validate(self.config) + + def configure(self): + self.dst_height, self.dst_width, self.dst_volume = get_size_3d_from_config(self.config) + + def process_image(self, annotation, prediction): + for target in annotation: + target.mask = Crop3D.crop_center(target.mask, self.dst_height, self.dst_width, self.dst_volume) + + for target in prediction: + target.mask = Crop3D.crop_center(target.mask, self.dst_height, self.dst_width, self.dst_volume) + + return annotation, prediction diff --git a/tools/accuracy_checker/accuracy_checker/postprocessor/encode_segmentation_mask.py b/tools/accuracy_checker/accuracy_checker/postprocessor/encode_segmentation_mask.py new file mode 100644 index 0000000..736eb0e --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/postprocessor/encode_segmentation_mask.py @@ -0,0 +1,46 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np + +from .postprocessor import Postprocessor +from ..representation import SegmentationAnnotation, SegmentationPrediction + + +class EncodeSegMask(Postprocessor): + """ + Encode segmentation label image as segmentation mask. 
+ """ + + __provider__ = 'encode_segmentation_mask' + + annotation_types = (SegmentationAnnotation, ) + prediction_types = (SegmentationPrediction, ) + + def process_image(self, annotation, prediction): + segmentation_colors = self.meta.get("segmentation_colors") + + if not segmentation_colors: + raise ValueError("No 'segmentation_colors' in dataset metadata.") + + for annotation_ in annotation: + mask = annotation_.mask.astype(int) + encoded_mask = np.zeros((mask.shape[0], mask.shape[1]), dtype=np.int16) + for label, color in enumerate(segmentation_colors): + encoded_mask[np.where(np.all(mask == color, axis=-1))[:2]] = label + annotation_.mask = encoded_mask + + return annotation, prediction diff --git a/tools/accuracy_checker/accuracy_checker/postprocessor/extend_segmentation_mask.py b/tools/accuracy_checker/accuracy_checker/postprocessor/extend_segmentation_mask.py new file mode 100644 index 0000000..abd83e0 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/postprocessor/extend_segmentation_mask.py @@ -0,0 +1,64 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +import math +import cv2 + +from .postprocessor import Postprocessor, BasePostprocessorConfig +from ..representation import SegmentationAnnotation, SegmentationPrediction +from ..config import NumberField, ConfigError + + +class ExtendSegmentationMask(Postprocessor): + """ + Extend annotation segmentation mask to prediction size filling border with specific label. 
+    """
+
+    __provider__ = 'extend_segmentation_mask'
+
+    annotation_types = (SegmentationAnnotation, )
+    prediction_types = (SegmentationPrediction, )
+
+    def validate_config(self):
+        class _ExtendSegmentationMaskConfigValidator(BasePostprocessorConfig):
+            filling_label = NumberField(optional=True, floats=False)
+
+        extend_mask_config_validator = _ExtendSegmentationMaskConfigValidator(
+            self.__provider__, on_extra_argument=_ExtendSegmentationMaskConfigValidator.ERROR_ON_EXTRA_ARGUMENT
+        )
+        extend_mask_config_validator.validate(self.config)
+
+    def configure(self):
+        self.filling_label = self.config.get('filling_label', 255)
+
+    def process_image(self, annotation, prediction):
+        for annotation_, prediction_ in zip(annotation, prediction):
+            annotation_mask = annotation_.mask
+            dst_height, dst_width = prediction_.mask.shape[-2:]
+            height, width = annotation_mask.shape[-2:]
+            if dst_width < width or dst_height < height:
+                raise ConfigError('the destination size for extending should not be less than the current mask size')
+            pad = []
+            pad.append(int(math.floor((dst_height - height) / 2.0)))
+            pad.append(int(math.floor((dst_width - width) / 2.0)))
+            pad.append(int(dst_height - height - pad[0]))
+            pad.append(int(dst_width - width - pad[1]))
+
+            extended_mask = cv2.copyMakeBorder(
+                annotation_mask, pad[0], pad[2], pad[1], pad[3], cv2.BORDER_CONSTANT, value=self.filling_label
+            )
+            annotation_.mask = extended_mask
+
+        return annotation, prediction
diff --git a/tools/accuracy_checker/accuracy_checker/postprocessor/filter.py b/tools/accuracy_checker/accuracy_checker/postprocessor/filter.py
new file mode 100644
index 0000000..440aec0
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/postprocessor/filter.py
@@ -0,0 +1,319 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
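The symmetric padding computed above (and again by the `padding` preprocessor later in this patch) splits the size difference between the two borders, giving the extra pixel to the far side when the difference is odd. A small standalone illustration of that rule:

```python
import math

def symmetric_pad(dst, src):
    """Return (near, far) border sizes that grow src to dst, as in extend_segmentation_mask."""
    near = int(math.floor((dst - src) / 2.0))
    return near, dst - src - near

print(symmetric_pad(512, 480))  # (16, 16)
print(symmetric_pad(512, 479))  # (16, 17) -- odd difference: the far border gets the extra pixel
```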
+""" +from functools import singledispatch +from typing import Union +import numpy as np + +from ..config import BaseField, BoolField +from ..dependency import ClassProvider +from ..postprocessor.postprocessor import PostprocessorWithSpecificTargets, PostprocessorWithTargetsConfigValidator +from ..representation import (DetectionAnnotation, DetectionPrediction, TextDetectionAnnotation, + TextDetectionPrediction, PoseEstimationPrediction, PoseEstimationAnnotation) +from ..utils import in_interval, polygon_from_points, convert_to_range + + +class FilterConfig(PostprocessorWithTargetsConfigValidator): + remove_filtered = BoolField(optional=True) + + def __init__(self, config_uri, **kwargs): + super().__init__(config_uri, **kwargs) + for functor in BaseFilter.providers: + self.fields[functor] = BaseField(optional=True) + + +class FilterPostprocessor(PostprocessorWithSpecificTargets): + __provider__ = 'filter' + + annotation_types = (DetectionAnnotation, TextDetectionAnnotation) + prediction_types = (DetectionPrediction, TextDetectionPrediction) + + def __init__(self, *args, **kwargs): + self._filters = [] + self.remove_filtered = False + super().__init__(*args, **kwargs) + + def validate_config(self): + filter_config = FilterConfig(self.__provider__, on_extra_argument=FilterConfig.ERROR_ON_EXTRA_ARGUMENT) + filter_config.validate(self.config) + + def configure(self): + config = self.config.copy() + config.pop('type') + self.remove_filtered = config.pop('remove_filtered', False) + config.pop('annotation_source', None) + config.pop('prediction_source', None) + config.pop('apply_to', None) + + for key, value in config.items(): + self._filters.append(BaseFilter.provide(key, value)) + + def process_image(self, annotation, prediction): + for functor in self._filters: + for target in annotation: + self._filter_entry_by(target, functor) + + for target in prediction: + self._filter_entry_by(target, functor) + + return annotation, prediction + + def _filter_entry_by(self, entry, functor): + ignored_key = 'difficult_boxes' + + if not self.remove_filtered and isinstance(entry, (DetectionAnnotation, DetectionPrediction, + TextDetectionAnnotation, TextDetectionPrediction, + PoseEstimationAnnotation, PoseEstimationPrediction)): + ignored = entry.metadata.setdefault(ignored_key, []) + ignored.extend(functor(entry)) + else: + entry.remove(functor(entry)) + + return entry + + +class BaseFilter(ClassProvider): + __provider_type__ = 'filter' + + def __init__(self, filter_arg): + self.filter_arg = filter_arg + + def __call__(self, entry): + return self.apply_filter(entry, self.filter_arg) + + def apply_filter(self, entry, filter_arg): + raise NotImplementedError + + +class FilterByLabels(BaseFilter): + __provider__ = 'labels' + + def apply_filter(self, entry, labels): + filtered = [] + for index, label in enumerate(entry.labels): + if label in labels: + filtered.append(index) + + return filtered + + +class FilterByMinConfidence(BaseFilter): + __provider__ = 'min_confidence' + + def apply_filter(self, entry, min_confidence): + filtered = [] + + if isinstance(entry, DetectionAnnotation): + return filtered + + for index, score in enumerate(entry.scores): + if score < min_confidence: + filtered.append(index) + + return filtered + + +class FilterByHeightRange(BaseFilter): + __provider__ = 'height_range' + + annotation_types = (DetectionAnnotation, TextDetectionAnnotation) + prediction_types = (DetectionPrediction, TextDetectionPrediction) + + def apply_filter(self, entry, height_range): + @singledispatch + def 
filtering(entry_value, height_range_):
+            return []
+
+        # singledispatch dispatches on the concrete class of the first argument,
+        # so every supported representation type is registered explicitly
+        @filtering.register(DetectionAnnotation)
+        @filtering.register(DetectionPrediction)
+        def _(entry_value, height_range_):
+            filtered = []
+            for index, (y_min, y_max) in enumerate(zip(entry_value.y_mins, entry_value.y_maxs)):
+                height = y_max - y_min
+                if not in_interval(height, height_range_):
+                    filtered.append(index)
+
+            return filtered
+
+        @filtering.register(TextDetectionAnnotation)
+        @filtering.register(TextDetectionPrediction)
+        def _(entry_values, height_range_):
+            filtered = []
+            for index, polygon_points in enumerate(entry_values.points):
+                left_bottom_point, left_top_point, right_top_point, right_bottom_point = polygon_points
+                left_side_height = np.linalg.norm(left_bottom_point - left_top_point)
+                right_side_height = np.linalg.norm(right_bottom_point - right_top_point)
+                if not in_interval(np.mean([left_side_height, right_side_height]), height_range_):
+                    filtered.append(index)
+
+            return filtered
+
+        return filtering(entry, convert_to_range(height_range))
+
+
+class FilterByWidthRange(BaseFilter):
+    __provider__ = 'width_range'
+
+    annotation_types = (DetectionAnnotation, TextDetectionAnnotation)
+    prediction_types = (DetectionPrediction, TextDetectionPrediction)
+
+    def apply_filter(self, entry, width_range):
+        @singledispatch
+        def filtering(entry_value, width_range_):
+            return []
+
+        @filtering.register(DetectionAnnotation)
+        @filtering.register(DetectionPrediction)
+        def _(entry_value, width_range_):
+            filtered = []
+            for index, (x_min, x_max) in enumerate(zip(entry_value.x_mins, entry_value.x_maxs)):
+                width = x_max - x_min
+                if not in_interval(width, width_range_):
+                    filtered.append(index)
+
+            return filtered
+
+        @filtering.register(TextDetectionAnnotation)
+        @filtering.register(TextDetectionPrediction)
+        def _(entry_values, width_range_):
+            filtered = []
+            for index, polygon_points in enumerate(entry_values.points):
+                left_bottom_point, left_top_point, right_top_point, right_bottom_point = polygon_points
+                top_width = np.linalg.norm(right_top_point - left_top_point)
+                bottom_width = np.linalg.norm(right_bottom_point - left_bottom_point)
+                if not in_interval(top_width, width_range_) or not in_interval(bottom_width, width_range_):
+                    filtered.append(index)
+
+            return filtered
+
+        return filtering(entry, convert_to_range(width_range))
+
+
+class FilterByAreaRange(BaseFilter):
+    __provider__ = 'area_range'
+
+    annotation_types = (TextDetectionAnnotation, PoseEstimationAnnotation)
+    prediction_types = (TextDetectionPrediction, )
+
+    def apply_filter(self, entry, area_range):
+        area_range = convert_to_range(area_range)
+
+        @singledispatch
+        def filtering(entry_value, area_range_):
+            return []
+
+        @filtering.register(PoseEstimationAnnotation)
+        @filtering.register(PoseEstimationPrediction)
+        def _(entry_value, area_range_):
+            filtered = []
+            for area_id, area in enumerate(entry_value.areas):
+                if not in_interval(area, area_range_):
+                    filtered.append(area_id)
+            return filtered
+
+        @filtering.register(TextDetectionAnnotation)
+        @filtering.register(TextDetectionPrediction)
+        def _(entry_value, area_range_):
+            filtered = []
+            for index, polygon_points in enumerate(entry_value.points):
+                if not in_interval(polygon_from_points(polygon_points).area, area_range_):
+                    filtered.append(index)
+            return filtered
+
+        return filtering(entry, area_range)
+
+
+class FilterEmpty(BaseFilter):
+    __provider__ = 'is_empty'
+
+    def apply_filter(self, entry: DetectionAnnotation, is_empty):
+        return np.where(np.bitwise_or(entry.x_maxs - entry.x_mins <= 0, entry.y_maxs - entry.y_mins <= 0))[0]
+
+
+class FilterByVisibility(BaseFilter):
+    __provider__ = 'min_visibility'
+
+    _VISIBILITY_LEVELS = {
+        'heavy occluded': 0,
+        'partially occluded': 1,
+        'visible': 2
+    }
+
+    def apply_filter(self, entry, min_visibility):
+        filtered = []
+        min_visibility_level = self.visibility_level(min_visibility)
+        for index, visibility in enumerate(entry.metadata.get('visibilities', [])):
+            if self.visibility_level(visibility) < min_visibility_level:
+                filtered.append(index)
+
+        return filtered
+
+    def visibility_level(self, visibility):
+        level = self._VISIBILITY_LEVELS.get(visibility)
+        if level is None:
+            message = 'Unknown visibility level "{}". Supported levels: "{}"'
+            raise ValueError(message.format(visibility, ', '.join(self._VISIBILITY_LEVELS.keys())))
+
+        return level
+
+
+class FilterByAspectRatio(BaseFilter):
+    __provider__ = 'aspect_ratio'
+
+    def apply_filter(self, entry, aspect_ratio):
+        aspect_ratio = convert_to_range(aspect_ratio)
+
+        filtered = []
+        coordinates = zip(entry.x_mins, entry.y_mins, entry.x_maxs, entry.y_maxs)
+        for index, (x_min, y_min, x_max, y_max) in enumerate(coordinates):
+            ratio = (y_max - y_min) / np.maximum(x_max - x_min, np.finfo(np.float64).eps)
+            if not in_interval(ratio, aspect_ratio):
+                filtered.append(index)
+
+        return filtered
+
+
+class FilterByAreaRatio(BaseFilter):
+    __provider__ = 'area_ratio'
+
+    def apply_filter(self, entry, area_ratio):
+        area_ratio = convert_to_range(area_ratio)
+
+        filtered = []
+        if not isinstance(entry, DetectionAnnotation):
+            return filtered
+
+        image_size = entry.metadata.get('image_size')
+        if not image_size:
+            return filtered
+        image_size = image_size[0]
+
+        image_area = image_size[0] * image_size[1]
+
+        occluded_indices = entry.metadata.get('is_occluded', [])
+        coordinates = zip(entry.x_mins, entry.y_mins, entry.x_maxs, entry.y_maxs)
+        for index, (x_min, y_min, x_max, y_max) in enumerate(coordinates):
+            width, height = x_max - x_min, y_max - y_min
+            area = np.sqrt(float(width * height) / np.maximum(image_area, np.finfo(np.float64).eps))
+            if not in_interval(area, area_ratio) or index in occluded_indices:
+                filtered.append(index)
+
+        return filtered
+
+
+class FilterInvalidBoxes(BaseFilter):
+    __provider__ = 'invalid_boxes'
+
+    def apply_filter(self, entry, invalid_boxes):
+        infinite_mask_x = np.logical_or(~np.isfinite(entry.x_mins), ~np.isfinite(entry.x_maxs))
+        infinite_mask_y = np.logical_or(~np.isfinite(entry.y_mins), ~np.isfinite(entry.y_maxs))
+        infinite_mask = np.logical_or(infinite_mask_x, infinite_mask_y)
+
+        return np.argwhere(infinite_mask).reshape(-1).tolist()
diff --git a/tools/accuracy_checker/accuracy_checker/postprocessor/nms.py b/tools/accuracy_checker/accuracy_checker/postprocessor/nms.py
new file mode 100644
index 0000000..8bdbf1a
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/postprocessor/nms.py
@@ -0,0 +1,80 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
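A note on the dispatch pattern used by the range filters above: `functools.singledispatch` dispatches on the concrete class of the first argument, so each supported representation type is registered explicitly (registering a `typing.Union` is not supported on the Python versions this tool targets, which is why the registrations are stacked). A minimal sketch of the pattern with built-in types:

```python
from functools import singledispatch

@singledispatch
def describe(entry):
    return 'unsupported'          # fallback, like the empty list returned by the filters

@describe.register(int)           # register decorators can be stacked to cover several types
@describe.register(float)
def _(entry):
    return 'number'

@describe.register(str)
def _(entry):
    return 'text'

print(describe(3), describe(2.5), describe('abc'), describe([]))
# number number text unsupported
```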
+""" + +import numpy as np + +from ..config import NumberField +from .postprocessor import BasePostprocessorConfig, Postprocessor +from ..representation import DetectionPrediction, DetectionAnnotation + + +class NMS(Postprocessor): + __provider__ = 'nms' + + prediction_types = (DetectionPrediction, ) + annotation_types = (DetectionAnnotation, ) + + def validate_config(self): + class _NMSConfigValidator(BasePostprocessorConfig): + overlap = NumberField(min_value=0, max_value=1, optional=True) + + nms_config_validator = _NMSConfigValidator( + self.__provider__, on_extra_argument=_NMSConfigValidator.ERROR_ON_EXTRA_ARGUMENT + ) + nms_config_validator.validate(self.config) + + def configure(self): + self.overlap = self.config.get('overlap', 0.5) + + def process_image(self, annotations, predictions): + for prediction in predictions: + keep = self._nms( + prediction.x_mins, prediction.y_mins, prediction.x_maxs, prediction.y_maxs, prediction.scores, + self.overlap + ) + prediction.remove([box for box in range(len(prediction.x_mins)) if box not in keep]) + + return annotations, predictions + + @staticmethod + def _nms(x1, y1, x2, y2, scores, thresh): + """ + Pure Python NMS baseline. + """ + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + intersection = w * h + + union = (areas[i] + areas[order[1:]] - intersection) + overlap = np.divide(intersection, union, out=np.zeros_like(intersection, dtype=float), where=union != 0) + + order = order[np.where(overlap <= thresh)[0] + 1] + + return keep diff --git a/tools/accuracy_checker/accuracy_checker/postprocessor/normalize_landmarks_points.py b/tools/accuracy_checker/accuracy_checker/postprocessor/normalize_landmarks_points.py new file mode 100644 index 0000000..7f3fbbc --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/postprocessor/normalize_landmarks_points.py @@ -0,0 +1,59 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import numpy as np + +from ..config import BoolField +from ..postprocessor.postprocessor import Postprocessor, BasePostprocessorConfig +from ..representation import FacialLandmarksAnnotation, FacialLandmarksPrediction + + +class NormalizeLandmarksPoints(Postprocessor): + __provider__ = 'normalize_landmarks_points' + + annotation_types = (FacialLandmarksAnnotation, ) + prediction_types = (FacialLandmarksPrediction, ) + + def validate_config(self): + class _ConfigValidator(BasePostprocessorConfig): + use_annotation_rect = BoolField(optional=True) + + config_validator = _ConfigValidator( + self.__provider__, on_extra_argument=_ConfigValidator.ERROR_ON_EXTRA_ARGUMENT + ) + config_validator.validate(self.config) + + def configure(self): + self.use_annotation_rect = self.config.get('use_annotation_rect', False) + + def process_image(self, annotation, prediction): + for target in annotation: + height, width, _ = self.image_size + x_start, y_start = 0, 0 + if self.use_annotation_rect: + resized_box = annotation[0].metadata.get('rect') + x_start, y_start, x_max, y_max = resized_box + width = x_max - x_start + height = y_max - y_start + + target.x_values = ( + (np.array(target.x_values, dtype=float) - x_start) / np.maximum(width, np.finfo(np.float64).eps) + ) + target.y_values = ( + (np.array(target.y_values, dtype=float) - y_start) / np.maximum(height, np.finfo(np.float64).eps) + ) + + return annotation, prediction diff --git a/tools/accuracy_checker/accuracy_checker/postprocessor/postprocessing_executor.py b/tools/accuracy_checker/accuracy_checker/postprocessor/postprocessing_executor.py new file mode 100644 index 0000000..875a546 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/postprocessor/postprocessing_executor.py @@ -0,0 +1,79 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from ..config import ConfigValidator, StringField +from ..utils import overrides, zipped_transform +from .postprocessor import Postprocessor + + +class PostprocessingExecutor: + def __init__(self, processors=None, dataset_name='custom', dataset_meta=None, state=None): + self._processors = [] + self._image_processors = [] + self._dataset_processors = [] + self.dataset_meta = dataset_meta + + self.state = state or {} + + if not processors: + return + + for config in processors: + postprocessor_config = PostprocessorConfig( + "{}.postprocessing".format(dataset_name), + on_extra_argument=ConfigValidator.IGNORE_ON_EXTRA_ARGUMENT + ) + postprocessor_config.validate(config) + postprocessor = Postprocessor.provide(config['type'], config, config['type'], self.dataset_meta, state) + self._processors.append(postprocessor) + + allow_image_postprocessor = True + for processor in self._processors: + if overrides(processor, 'process_all', Postprocessor): + allow_image_postprocessor = False + self._dataset_processors.append(processor) + else: + if allow_image_postprocessor: + self._image_processors.append(processor) + else: + self._dataset_processors.append(processor) + + def process_dataset(self, annotations, predictions): + for method in self._dataset_processors: + annotations, predictions = method.process_all(annotations, predictions) + + return annotations, predictions + + def process_image(self, annotation, prediction): + for method in self._image_processors: + annotation_entries, prediction_entries = method.get_entries(annotation, prediction) + method.process(annotation_entries, prediction_entries) + + return annotation, prediction + + def process_batch(self, annotations, predictions): + return zipped_transform(self.process_image, annotations, predictions) + + def full_process(self, annotations, predictions): + return self.process_dataset(*self.process_batch(annotations, predictions)) + + @property + def has_dataset_processors(self): + return len(self._dataset_processors) != 0 + + +class PostprocessorConfig(ConfigValidator): + type = StringField(choices=Postprocessor.providers) diff --git a/tools/accuracy_checker/accuracy_checker/postprocessor/postprocessor.py b/tools/accuracy_checker/accuracy_checker/postprocessor/postprocessor.py new file mode 100644 index 0000000..de0c066 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/postprocessor/postprocessor.py @@ -0,0 +1,188 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import warnings +from enum import Enum +from ..representation import ContainerRepresentation +from ..config import ConfigValidator, StringField, ConfigError, BaseField +from ..dependency import ClassProvider +from ..utils import ( + zipped_transform, + string_to_list, + check_representation_type, + get_supported_representations, + enum_values +) + + +class BasePostprocessorConfig(ConfigValidator): + type = StringField() + annotation_source = BaseField(optional=True) + prediction_source = BaseField(optional=True) + + +class Postprocessor(ClassProvider): + __provider_type__ = 'postprocessor' + + annotation_types = () + prediction_types = () + + def __init__(self, config, name=None, meta=None, state=None): + self.config = config + self.name = name + self.meta = meta + self.state = state + self.image_size = None + + self.annotation_source = self.config.get('annotation_source') + if self.annotation_source and not isinstance(self.annotation_source, list): + self.annotation_source = string_to_list(self.annotation_source) + + self.prediction_source = self.config.get('prediction_source') + if self.prediction_source and not isinstance(self.prediction_source, list): + self.prediction_source = string_to_list(self.prediction_source) + + self.validate_config() + self.setup() + + def __call__(self, *args, **kwargs): + return self.process_all(*args, **kwargs) + + def setup(self): + self.configure() + + def process_image(self, annotation, prediction): + raise NotImplementedError + + def process(self, annotation, prediction): + image_size = annotation[0].metadata.get('image_size') if not None in annotation else None + self.image_size = None + if image_size: + self.image_size = image_size[0] + self.process_image(annotation, prediction) + + return annotation, prediction + + def process_all(self, annotations, predictions): + zipped_transform(self.process, zipped_transform(self.get_entries, annotations, predictions)) + return annotations, predictions + + def configure(self): + pass + + def validate_config(self): + BasePostprocessorConfig( + self.name, on_extra_argument=BasePostprocessorConfig.ERROR_ON_EXTRA_ARGUMENT + ).validate(self.config) + + def get_entries(self, annotation, prediction): + message_not_found = '{}: {} is not found in container' + message_incorrect_type = "Incorrect type of {}. Postprocessor {} can work only with {}" + + def resolve_container(container, supported_types, entry_name, sources=None): + if not isinstance(container, ContainerRepresentation): + if sources: + message = 'Warning: {}_source can be applied only to container. 
Default value will be used' + warnings.warn(message.format(entry_name)) + + return [container] + + if not sources: + return get_supported_representations(container.values(), supported_types) + + entries = [] + for source in sources: + representation = container.get(source) + if not representation: + raise ConfigError(message_not_found.format(entry_name, source)) + + if supported_types and not check_representation_type(representation, supported_types): + raise TypeError(message_incorrect_type.format(entry_name, self.name, ','.join(supported_types))) + + entries.append(representation) + + return entries + + annotation_entries = resolve_container(annotation, self.annotation_types, 'annotation', self.annotation_source) + prediction_entries = resolve_container(prediction, self.prediction_types, 'prediction', self.prediction_source) + + return annotation_entries, prediction_entries + + +class ApplyToOption(Enum): + ANNOTATION = 'annotation' + PREDICTION = 'prediction' + ALL = 'all' + + +class PostprocessorWithTargetsConfigValidator(BasePostprocessorConfig): + apply_to = StringField(optional=True, choices=enum_values(ApplyToOption)) + + +class PostprocessorWithSpecificTargets(Postprocessor): + def validate_config(self): + _config_validator = PostprocessorWithTargetsConfigValidator( + self.__provider__, on_extra_argument=PostprocessorWithTargetsConfigValidator.ERROR_ON_EXTRA_ARGUMENT + ) + _config_validator.validate(self.config) + + def setup(self): + apply_to = self.config.get('apply_to') + self.apply_to = ApplyToOption(apply_to) if apply_to else None + + if (self.annotation_source or self.prediction_source) and self.apply_to: + raise ConfigError("apply_to and sources both provided. You need specify only one from them") + + if not self.annotation_source and not self.prediction_source and not self.apply_to: + raise ConfigError("apply_to or annotation_source or prediction_source required for {}".format(self.name)) + + self.configure() + + def process(self, annotation, prediction): + image_size = annotation[0].metadata.get('image_size') if not None in annotation else None + self.image_size = None + if image_size: + self.image_size = image_size[0] + target_annotations, target_predictions = None, None + if self.annotation_source or self.prediction_source: + target_annotations, target_predictions = self._choose_targets_using_sources(annotation, prediction) + + if self.apply_to: + target_annotations, target_predictions = self._choose_targets_using_apply_to(annotation, prediction) + + if not target_annotations and not target_predictions: + raise ValueError("Suitable targets for {} not found".format(self.name)) + + self.process_image(target_annotations, target_predictions) + return annotation, prediction + + def _choose_targets_using_sources(self, annotations, predictions): + target_annotations = annotations if self.annotation_source else [] + target_predictions = predictions if self.prediction_source else [] + + return target_annotations, target_predictions + + def _choose_targets_using_apply_to(self, annotations, predictions): + targets_specification = { + ApplyToOption.ANNOTATION: (annotations, []), + ApplyToOption.PREDICTION: ([], predictions), + ApplyToOption.ALL: (annotations, predictions) + } + + return targets_specification[self.apply_to] + + def process_image(self, annotation, prediction): + raise NotImplementedError diff --git a/tools/accuracy_checker/accuracy_checker/postprocessor/resize_prediction_boxes.py b/tools/accuracy_checker/accuracy_checker/postprocessor/resize_prediction_boxes.py new 
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/postprocessor/resize_prediction_boxes.py
@@ -0,0 +1,40 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from ..representation import DetectionPrediction, DetectionAnnotation
+from ..postprocessor.postprocessor import Postprocessor
+
+
+class ResizePredictionBoxes(Postprocessor):
+    """
+    Resize normalized predicted bounding box coordinates (i.e. in the [0, 1] range) to the input image shape.
+    """
+
+    __provider__ = 'resize_prediction_boxes'
+
+    prediction_types = (DetectionPrediction, )
+    annotation_types = (DetectionAnnotation, )
+
+    def process_image(self, annotations, predictions):
+        h, w, _ = self.image_size
+
+        for prediction in predictions:
+            prediction.x_mins *= w
+            prediction.x_maxs *= w
+            prediction.y_mins *= h
+            prediction.y_maxs *= h
+
+        return annotations, predictions
diff --git a/tools/accuracy_checker/accuracy_checker/postprocessor/resize_segmentation_mask.py b/tools/accuracy_checker/accuracy_checker/postprocessor/resize_segmentation_mask.py
new file mode 100644
index 0000000..6c6b6dd
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/postprocessor/resize_segmentation_mask.py
@@ -0,0 +1,73 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
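The effect of `resize_prediction_boxes` on a single normalized box, for illustration: a detector output of (0.1, 0.2, 0.5, 0.6) on a 640x480 image scales component-wise into pixel coordinates.

```python
h, w = 480, 640                      # image_size as (height, width)
x_min, y_min, x_max, y_max = 0.1, 0.2, 0.5, 0.6
print(x_min * w, y_min * h, x_max * w, y_max * h)  # 64.0 96.0 320.0 288.0
```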
+""" +from functools import singledispatch +import scipy.misc +import numpy as np + +from ..config import NumberField +from ..utils import get_size_from_config +from .postprocessor import PostprocessorWithSpecificTargets, PostprocessorWithTargetsConfigValidator +from ..representation import SegmentationPrediction, SegmentationAnnotation + + +class ResizeSegmentationMask(PostprocessorWithSpecificTargets): + __provider__ = 'resize_segmentation_mask' + + annotation_types = (SegmentationAnnotation, ) + prediction_types = (SegmentationPrediction, ) + + def validate_config(self): + class _ResizeConfigValidator(PostprocessorWithTargetsConfigValidator): + size = NumberField(floats=False, optional=True, min_value=1) + dst_width = NumberField(floats=False, optional=True, min_value=1) + dst_height = NumberField(floats=False, optional=True, min_value=1) + + resize_config_validator = _ResizeConfigValidator(self.__provider__) + resize_config_validator.validate(self.config) + + def configure(self): + self.dst_height, self.dst_width = get_size_from_config(self.config, allow_none=True) + + def process_image(self, annotation, prediction): + target_height = self.dst_height or self.image_size[0] + target_width = self.dst_width or self.image_size[1] + + @singledispatch + def resize_segmentation_mask(entry, height, width): + return entry + + @resize_segmentation_mask.register(SegmentationPrediction) + def _(entry, height, width): + entry_mask = [] + for class_mask in entry.mask: + resized_mask = scipy.misc.imresize(class_mask, (height, width), 'nearest') + entry_mask.append(resized_mask) + entry.mask = np.array(entry_mask) + + return entry + + @resize_segmentation_mask.register(SegmentationAnnotation) + def _(entry, height, width): + entry.mask = scipy.misc.imresize(entry.mask, (height, width), 'nearest') + return entry + + for target in annotation: + resize_segmentation_mask(target, target_height, target_width) + + for target in prediction: + resize_segmentation_mask(target, target_height, target_width) + + return annotation, prediction diff --git a/tools/accuracy_checker/accuracy_checker/postprocessor/zoom_segmentation_mask.py b/tools/accuracy_checker/accuracy_checker/postprocessor/zoom_segmentation_mask.py new file mode 100644 index 0000000..aae7fce --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/postprocessor/zoom_segmentation_mask.py @@ -0,0 +1,65 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np + +from .postprocessor import Postprocessor, BasePostprocessorConfig +from ..representation import SegmentationAnnotation, SegmentationPrediction +from ..config import NumberField + + +class ZoomSegMask(Postprocessor): + """ + Zoom probabilities of segmentation prediction. 
+    """
+
+    __provider__ = 'zoom_segmentation_mask'
+
+    annotation_types = (SegmentationAnnotation, )
+    prediction_types = (SegmentationPrediction, )
+
+    def validate_config(self):
+        class _ZoomSegMaskConfigValidator(BasePostprocessorConfig):
+            zoom = NumberField(floats=False, min_value=1)
+
+        zoom_segmentation_mask_config_validator = _ZoomSegMaskConfigValidator(
+            self.__provider__, on_extra_argument=_ZoomSegMaskConfigValidator.ERROR_ON_EXTRA_ARGUMENT
+        )
+        zoom_segmentation_mask_config_validator.validate(self.config)
+
+    def configure(self):
+        self.zoom = self.config['zoom']
+
+    def process_image(self, annotation, prediction):
+        for annotation_, prediction_ in zip(annotation, prediction):
+            height, width = annotation_.mask.shape[:2]
+            prob = prediction_.mask
+            zoom_prob = np.zeros((prob.shape[0], height, width), dtype=np.float32)
+            # bilinear interpolation of each class probability map, upscaled by the zoom factor
+            for c in range(prob.shape[0]):
+                for h in range(height):
+                    for w in range(width):
+                        r0 = h // self.zoom
+                        r1 = r0 + 1
+                        c0 = w // self.zoom
+                        c1 = c0 + 1
+                        rt = float(h) / self.zoom - r0
+                        ct = float(w) / self.zoom - c0
+                        v0 = rt * prob[c, r1, c0] + (1 - rt) * prob[c, r0, c0]
+                        v1 = rt * prob[c, r1, c1] + (1 - rt) * prob[c, r0, c1]
+                        zoom_prob[c, h, w] = (1 - ct) * v0 + ct * v1
+            prediction_.mask = zoom_prob
+
+        return annotation, prediction
diff --git a/tools/accuracy_checker/accuracy_checker/preprocessor/README.md b/tools/accuracy_checker/accuracy_checker/preprocessor/README.md
new file mode 100644
index 0000000..d5be82b
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/preprocessor/README.md
@@ -0,0 +1,51 @@
+# Preprocessors
+
+A preprocessor is a function which processes input data before model inference.
+Every preprocessor has parameters available for configuration.
+Accuracy Checker supports the following set of preprocessors:
+
+* `resize` - resizing the image to a new width and height.
+  * `dst_width` and `dst_height` are the destination width and height for image resizing respectively.
+    You can also use `size` instead when the destination sizes are equal for both dimensions.
+  * `use_pil` specifies usage of the Pillow library for resizing.
+    Accuracy Checker uses OpenCV as the default image reader.
+  * `interpolation` specifies the interpolation method to use.
+    Possible values depend on the image processing library:
+    * **OpenCV**: Nearest, Linear, Cubic, Area, Max, Lanczos4, Bits, Bits2
+    * **Pillow**: None, Nearest, Cubic, Bicubic, Box, Bilinear, Lanczos, Antialias, Hamming
+  * `aspect_ratio_scale` allows preserving the image aspect ratio using one of the following modes:
+    - `width` - rescale the width only.
+    - `height` - rescale the height only.
+    - `greater` - rescale along the greater of the image dimensions.
+
+* `normalization` - changing the range of pixel intensity values.
+  * `mean` - values which will be subtracted from the image channels.
+    You can specify one value for all channels or a comma-separated list of channel-wise values.
+  * `std` - values by which the pixel values will be divided.
+    You can specify one value for all channels or a comma-separated list of channel-wise values.
+
+  Both parameters also accept precomputed values for frequently used datasets (e.g. `cifar10` or `imagenet`).
+
+* `bgr_to_rgb` - reversing image channels: converts an image in BGR format to RGB.
+* `bgr_to_gray` - converting an image from BGR to grayscale color space.
+* `flip` - image mirroring around a specified axis.
+  * `mode` specifies the axis for flipping (`vertical` or `horizontal`).
+* `crop` - central cropping of the image.
+  * `dst_width` and `dst_height` are the destination width and height for the crop respectively. You can also use `size` instead when the destination sizes are equal.
+* `crop_rect` - cropping a region of interest using coordinates given in the annotation metadata.
+* `extend_around_rect` - scaling a region of interest using the annotation metadata.
+  * `augmentation_param` is the scale factor for augmentation.
+* `point_alignment` - aligning keypoints stored in the annotation metadata.
+  * `draw_points` - allows drawing the aligned points on the image.
+  * `normalize` - allows normalization of the keypoint coordinates.
+  * `dst_width` and `dst_height` are the destination width and height for keypoint resizing respectively. You can also use `size` instead when the destination sizes are equal.
+* `padding` - padding of the image.
+  * `stride` - stride for padding.
+  * `pad_value` - value for filling the space around the original image.
+  * `dst_width` and `dst_height` are the destination width and height of the padded image respectively.
+    You can also use `size` instead when the destination sizes are equal for both dimensions.
+* `tiling` - image tiling.
+  * `margin` - margin for each tiled fragment of the image.
+  * `dst_width` and `dst_height` are the destination width and height of each tiled fragment respectively.
+    You can also use `size` instead when the destination sizes are equal for both dimensions.
+
diff --git a/tools/accuracy_checker/accuracy_checker/preprocessor/__init__.py b/tools/accuracy_checker/accuracy_checker/preprocessor/__init__.py
new file mode 100644
index 0000000..3999b41
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/preprocessor/__init__.py
@@ -0,0 +1,51 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from .preprocessing_executor import PreprocessingExecutor
+from .preprocessors import (
+    Preprocessor,
+
+    Resize,
+    Flip,
+    Normalize,
+    Crop,
+    BgrToRgb,
+    BgrToGray,
+    CropRect,
+    ExtendAroundRect,
+    PointAligner,
+    Tiling,
+    Crop3D,
+    Normalize3d
+)
+
+__all__ = [
+    'PreprocessingExecutor',
+
+    'Preprocessor',
+    'Resize',
+    'Flip',
+    'Normalize',
+    'Crop',
+    'BgrToRgb',
+    'BgrToGray',
+    'CropRect',
+    'ExtendAroundRect',
+    'PointAligner',
+    'Tiling',
+    'Crop3D',
+    'Normalize3d'
+]
diff --git a/tools/accuracy_checker/accuracy_checker/preprocessor/preprocessing_executor.py b/tools/accuracy_checker/accuracy_checker/preprocessor/preprocessing_executor.py
new file mode 100644
index 0000000..aa355b5
--- /dev/null
+++ b/tools/accuracy_checker/accuracy_checker/preprocessor/preprocessing_executor.py
@@ -0,0 +1,52 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. +""" + +from ..config import ConfigValidator, StringField +from ..preprocessor.preprocessors import Preprocessor + + +class PreprocessingExecutor: + def __init__(self, processors=None, dataset_name='custom', dataset_meta=None): + self.processors = [] + self.dataset_meta = dataset_meta + + if not processors: + return + + identifier = 'type' + for processor in processors: + preprocessor_config = PreprocessorConfig( + "{}.preprocessors".format(dataset_name), on_extra_argument=ConfigValidator.IGNORE_ON_EXTRA_ARGUMENT + ) + + type_ = processor.get(identifier) + preprocessor_config.validate(processor, type_) + preprocessor = Preprocessor.provide(processor[identifier], config=processor, name=type_) + + self.processors.append(preprocessor) + + def process(self, images, batch_annotation=None): + for i, _ in enumerate(images): + for processor in self.processors: + images[i] = processor( + image=images[i], annotation_meta=batch_annotation[i].metadata if batch_annotation else None + ) + + return images + + +class PreprocessorConfig(ConfigValidator): + type = StringField(choices=Preprocessor.providers) diff --git a/tools/accuracy_checker/accuracy_checker/preprocessor/preprocessors.py b/tools/accuracy_checker/accuracy_checker/preprocessor/preprocessors.py new file mode 100644 index 0000000..e4c2fb0 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/preprocessor/preprocessors.py @@ -0,0 +1,565 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
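Putting the README and the executor together: a preprocessing pipeline is just a list of configuration dictionaries whose `type` key selects a provider and whose remaining keys must pass that provider's validator. A hypothetical wiring (the dataset name and parameter values are illustrative; in practice these dictionaries come from the `preprocessing` section of an Accuracy Checker dataset config):

```python
processors = [
    {'type': 'resize', 'size': 300, 'interpolation': 'LINEAR'},
    {'type': 'normalization', 'mean': 'imagenet'},
    {'type': 'crop', 'size': 224},
]

executor = PreprocessingExecutor(processors, dataset_name='sample_dataset')
# executor.process(images) then applies resize -> normalization -> crop to every
# image in the batch, where each image wraps its pixel data and metadata.
```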
+""" +import math +import cv2 +import numpy as np +from PIL import Image + +from ..config import BaseField, BoolField, ConfigValidator, NumberField, StringField, ConfigError +from ..dependency import ClassProvider +from ..utils import get_size_from_config, get_or_parse_value, string_to_tuple, get_size_3d_from_config + + +class BasePreprocessorConfig(ConfigValidator): + type = StringField() + + +class Preprocessor(ClassProvider): + __provider_type__ = 'preprocessor' + + def __init__(self, config, name=None): + self.config = config + self.name = name + + self.validate_config() + self.configure() + + def __call__(self, *args, **kwargs): + return self.process(*args, **kwargs) + + def process(self, image, annotation_meta=None): + raise NotImplementedError + + def configure(self): + pass + + def validate_config(self): + config = BasePreprocessorConfig(self.name, on_extra_argument=BasePreprocessorConfig.ERROR_ON_EXTRA_ARGUMENT) + config.validate(self.config) + + +def scale_width(dst_width, dst_height, image_width, image_height,): + return int(dst_width * image_width / image_height), dst_height + + +def scale_height(dst_width, dst_height, image_width, image_height): + return dst_width, int(dst_height * image_height / image_width) + + +def scale_greater(dst_width, dst_height, image_width, image_height): + if image_height > image_width: + return scale_height(dst_width, dst_height, image_width, image_height) + return scale_width(dst_width, dst_height, image_width, image_height) + + +class Resize(Preprocessor): + __provider__ = 'resize' + + PILLOW_INTERPOLATION = { + 'NEAREST': Image.NEAREST, + 'NONE': Image.NONE, + 'BOX': Image.BOX, + 'BILINEAR': Image.BILINEAR, + 'LINEAR': Image.LINEAR, + 'HAMMING': Image.HAMMING, + 'BICUBIC': Image.BICUBIC, + 'CUBIC': Image.CUBIC, + 'LANCZOS': Image.LANCZOS, + 'ANTIALIAS': Image.ANTIALIAS, + } + + OPENCV_INTERPOLATION = { + 'NEAREST': cv2.INTER_NEAREST, + 'LINEAR': cv2.INTER_LINEAR, + 'CUBIC': cv2.INTER_CUBIC, + 'AREA': cv2.INTER_AREA, + 'MAX': cv2.INTER_MAX, + 'BITS': cv2.INTER_BITS, + 'BITS2': cv2.INTER_BITS2, + 'LANCZOS4': cv2.INTER_LANCZOS4, + } + + ASPECT_RATIO_SCALE = { + 'width': scale_width, + 'height': scale_height, + 'greater': scale_greater, + } + + def validate_config(self): + class _ConfigValidator(BasePreprocessorConfig): + size = NumberField(floats=False, optional=True, min_value=1) + dst_width = NumberField(floats=False, optional=True, min_value=1) + dst_height = NumberField(floats=False, optional=True, min_value=1) + aspect_ratio_scale = StringField(choices=set(Resize.ASPECT_RATIO_SCALE), optional=True) + interpolation = StringField( + choices=set(Resize.PILLOW_INTERPOLATION) | set(Resize.OPENCV_INTERPOLATION), optional=True + ) + use_pil = BoolField(optional=True) + + _ConfigValidator(self.name, on_extra_argument=_ConfigValidator.ERROR_ON_EXTRA_ARGUMENT).validate(self.config) + + def configure(self): + self.dst_height, self.dst_width = get_size_from_config(self.config) + self.use_pil = self.config.get('use_pil', False) + + interpolation = self.config.get('interpolation', 'LINEAR') + + self.scaling_func = Resize.ASPECT_RATIO_SCALE.get(self.config.get('aspect_ratio_scale')) + + if self.use_pil and interpolation.upper() not in Resize.PILLOW_INTERPOLATION: + raise ValueError("Incorrect interpolation option: {} for resize preprocessing".format(interpolation)) + if not self.use_pil and interpolation.upper() not in Resize.OPENCV_INTERPOLATION: + raise ValueError("Incorrect interpolation option: {} for resize preprocessing".format(interpolation)) + + if 
self.use_pil: + self.interpolation = Resize.PILLOW_INTERPOLATION[interpolation] + else: + self.interpolation = Resize.OPENCV_INTERPOLATION[interpolation] + + def process(self, image, annotation_meta=None): + data = image.data + new_height, new_width = self.dst_height, self.dst_width + if self.scaling_func: + image_h, image_w = data.shape[:2] + new_width, new_height = self.scaling_func(self.dst_width, self.dst_height, image_w, image_h) + + image.metadata['preferable_width'] = max(new_width, self.dst_width) + image.metadata['preferable_height'] = max(new_height, self.dst_height) + + if self.use_pil: + data = Image.fromarray(data) + data = data.resize((new_width, new_height), self.interpolation) + image.data = np.array(data) + return image + + data = cv2.resize(data, (new_width, new_height), interpolation=self.interpolation).astype(np.float32) + if len(data.shape) == 2: + data = np.expand_dims(data, axis=-1) + image.data = data + + return image + + +class Normalize(Preprocessor): + __provider__ = 'normalization' + + PRECOMPUTED_MEANS = { + 'imagenet': (104.00698793, 116.66876762, 122.67891434), + 'cifar10': (125.307, 122.961, 113.8575), + } + + PRECOMPUTED_STDS = { + 'imagenet': (104.00698793, 116.66876762, 122.67891434), + 'cifar10': (125.307, 122.961, 113.8575), + } + + def validate_config(self): + class _ConfigValidator(BasePreprocessorConfig): + mean = BaseField(optional=True) + std = BaseField(optional=True) + + _ConfigValidator(self.name, on_extra_argument=_ConfigValidator.ERROR_ON_EXTRA_ARGUMENT).validate(self.config) + + def configure(self): + self.mean = get_or_parse_value(self.config.get('mean'), Normalize.PRECOMPUTED_MEANS) + self.std = get_or_parse_value(self.config.get('std'), Normalize.PRECOMPUTED_STDS) + if not self.mean and not self.std: + raise ConfigError('mean or std value should be provided') + + if self.std and 0 in self.std: + raise ConfigError('std value should not contain 0') + + if self.mean and not (len(self.mean) == 3 or len(self.mean) == 1): + raise ConfigError('mean should be one value or comma-separated list channel-wise values') + + if self.std and not (len(self.std) == 3 or len(self.std) == 1): + raise ConfigError('std should be one value or comma-separated list channel-wise values') + + def process(self, image, annotation_meta=None): + if self.mean: + image.data = image.data - self.mean + if self.std: + image.data = image.data / self.std + + return image + + +class BgrToRgb(Preprocessor): + __provider__ = 'bgr_to_rgb' + + def process(self, image, annotation_meta=None): + image.data = cv2.cvtColor(image.data, cv2.COLOR_BGR2RGB) + return image + + +class BgrToGray(Preprocessor): + __provider__ = 'bgr_to_gray' + + def process(self, image, annotation_meta=None): + image.data = np.expand_dims(cv2.cvtColor(image.data, cv2.COLOR_BGR2GRAY).astype(np.float32), -1) + return image + + +class Flip(Preprocessor): + __provider__ = 'flip' + + FLIP_MODES = { + 'horizontal': 0, + 'vertical': 1 + } + + def validate_config(self): + class _ConfigValidator(BasePreprocessorConfig): + mode = StringField(choices=Flip.FLIP_MODES.keys()) + + _ConfigValidator(self.name, on_extra_argument=_ConfigValidator.ERROR_ON_EXTRA_ARGUMENT).validate(self.config) + + def configure(self): + mode = self.config.get('mode', 'horizontal') + if isinstance(mode, str): + self.mode = Flip.FLIP_MODES[mode] + + def process(self, image, annotation_meta=None): + image.data = cv2.flip(image.data, self.mode) + return image + + +class Crop(Preprocessor): + __provider__ = 'crop' + + def validate_config(self): + 
class _ConfigValidator(BasePreprocessorConfig): + size = NumberField(floats=False, optional=True, min_value=1) + dst_width = NumberField(floats=False, optional=True, min_value=1) + dst_height = NumberField(floats=False, optional=True, min_value=1) + + _ConfigValidator(self.name, on_extra_argument=_ConfigValidator.ERROR_ON_EXTRA_ARGUMENT).validate(self.config) + + def configure(self): + self.dst_height, self.dst_width = get_size_from_config(self.config) + + def process(self, image, annotation_meta=None): + data = image.data + height, width, _ = data.shape + if width < self.dst_width or height < self.dst_height: + resized = np.array([width, height]) + if resized[0] < self.dst_width: + resized = resized * self.dst_width / resized[0] + if resized[1] < self.dst_height: + resized = resized * self.dst_height / resized[1] + + data = cv2.resize(data, tuple(np.ceil(resized).astype(int))) + + height, width, _ = data.shape + start_height = (height - self.dst_height) // 2 + start_width = (width - self.dst_width) // 2 + + image.data = data[start_height:start_height + self.dst_height, start_width:start_width + self.dst_width] + return image + + +class CropRect(Preprocessor): + __provider__ = 'crop_rect' + + def process(self, image, annotation_meta=None): + rect = annotation_meta.get('rect') + if not rect: + return image + + rows, cols = image.data.shape[:2] + rect_x_min, rect_y_min, rect_x_max, rect_y_max = rect + start_width, start_height = max(0, rect_x_min), max(0, rect_y_min) + + width = min(start_width + (rect_x_max - rect_x_min), cols) + height = min(start_height + (rect_y_max - rect_y_min), rows) + + image.data = image.data[start_height:height, start_width:width] + return image + + +class ExtendAroundRect(Preprocessor): + __provider__ = 'extend_around_rect' + + def validate_config(self): + class _ConfigValidator(BasePreprocessorConfig): + augmentation_param = NumberField(floats=True, optional=True) + + _ConfigValidator(self.name, on_extra_argument=_ConfigValidator.ERROR_ON_EXTRA_ARGUMENT).validate(self.config) + + def configure(self): + self.augmentation_param = self.config.get('augmentation_param', 0) + + def process(self, image, annotation_meta=None): + rect = annotation_meta.get('rect') + rows, cols = image.data.shape[:2] + + rect_x_left, rect_y_top, rect_x_right, rect_y_bottom = rect or (0, 0, cols, rows) + rect_x_left = max(0, rect_x_left) + rect_y_top = max(0, rect_y_top) + rect_x_right = min(rect_x_right, cols) + rect_y_bottom = min(rect_y_bottom, rows) + + rect_w = rect_x_right - rect_x_left + rect_h = rect_y_bottom - rect_y_top + + width_extent = (rect_x_right - rect_x_left + 1) * self.augmentation_param + height_extent = (rect_y_bottom - rect_y_top + 1) * self.augmentation_param + rect_x_left = rect_x_left - width_extent + border_left = abs(min(0, rect_x_left)) + rect_x_left = int(max(0, rect_x_left)) + + rect_y_top = rect_y_top - height_extent + border_top = abs(min(0, rect_y_top)) + rect_y_top = int(max(0, rect_y_top)) + + rect_y_bottom += border_top + rect_y_bottom = int(rect_y_bottom + height_extent + 0.5) + border_bottom = abs(max(0, rect_y_bottom - rows)) + + rect_x_right += border_left + rect_x_right = int(rect_x_right + width_extent + 0.5) + border_right = abs(max(0, rect_x_right - cols)) + + image.data = cv2.copyMakeBorder( + image.data, int(border_top), int(border_bottom), int(border_left), int(border_right), cv2.BORDER_REPLICATE + ) + + rect = ( + int(rect_x_left), int(rect_y_top), + int(rect_x_left) + int(rect_w + width_extent * 2), int(rect_y_top) + int(rect_h + 
height_extent * 2)
+        )
+        annotation_meta['rect'] = rect
+
+        return image
+
+
+class PointAligner(Preprocessor):
+    __provider__ = 'point_alignment'
+
+    ref_landmarks = np.array([
+        30.2946 / 96, 51.6963 / 112,
+        65.5318 / 96, 51.5014 / 112,
+        48.0252 / 96, 71.7366 / 112,
+        33.5493 / 96, 92.3655 / 112,
+        62.7299 / 96, 92.2041 / 112
+    ], dtype=np.float64).reshape(5, 2)
+
+    def validate_config(self):
+        class _ConfigValidator(BasePreprocessorConfig):
+            draw_points = BoolField(optional=True)
+            normalize = BoolField(optional=True)
+            size = NumberField(floats=False, optional=True, min_value=1)
+            dst_width = NumberField(floats=False, optional=True, min_value=1)
+            dst_height = NumberField(floats=False, optional=True, min_value=1)
+
+        _ConfigValidator(self.name, on_extra_argument=_ConfigValidator.ERROR_ON_EXTRA_ARGUMENT).validate(self.config)
+
+    def configure(self):
+        self.draw_points = self.config.get('draw_points', False)
+        self.normalize = self.config.get('normalize', True)
+        self.dst_height, self.dst_width = get_size_from_config(self.config)
+
+    def process(self, image, annotation_meta=None):
+        keypoints = annotation_meta.get('keypoints')
+        image.data = self.align(image.data, keypoints)
+        return image
+
+    def align(self, img, points):
+        if not points:
+            return img
+
+        points_number = len(points) // 2
+        points = np.array(points).reshape(points_number, 2)
+
+        inp_shape = [1., 1.]
+        if self.normalize:
+            inp_shape = img.shape
+
+        keypoints = points.copy().astype(np.float64)
+        keypoints[:, 0] *= (float(self.dst_width) / inp_shape[1])
+        keypoints[:, 1] *= (float(self.dst_height) / inp_shape[0])
+
+        keypoints_ref = np.zeros((points_number, 2), dtype=np.float64)
+        keypoints_ref[:, 0] = self.ref_landmarks[:, 0] * self.dst_width
+        keypoints_ref[:, 1] = self.ref_landmarks[:, 1] * self.dst_height
+
+        transformation_matrix = self.transformation_from_points(np.array(keypoints_ref), np.array(keypoints))
+        img = cv2.resize(img, (self.dst_width, self.dst_height))
+        if self.draw_points:
+            for point in keypoints:
+                cv2.circle(img, (int(point[0]), int(point[1])), 5, (255, 0, 0), -1)
+
+        return cv2.warpAffine(img, transformation_matrix, (self.dst_width, self.dst_height), flags=cv2.WARP_INVERSE_MAP)
+
+    @staticmethod
+    def transformation_from_points(points1, points2):
+        points1 = np.matrix(points1.astype(np.float64))
+        points2 = np.matrix(points2.astype(np.float64))
+
+        c1 = np.mean(points1, axis=0)
+        c2 = np.mean(points2, axis=0)
+        points1 -= c1
+        points2 -= c2
+        s1 = np.std(points1)
+        s2 = np.std(points2)
+        points1 /= np.maximum(s1, np.finfo(np.float64).eps)
+        # normalize the second point set by its own deviation (s2), not s1
+        points2 /= np.maximum(s2, np.finfo(np.float64).eps)
+        points_std_ratio = s2 / np.maximum(s1, np.finfo(np.float64).eps)
+
+        u, _, vt = np.linalg.svd(points1.T * points2)
+        r = (u * vt).T
+
+        return np.hstack((points_std_ratio * r, c2.T - points_std_ratio * r * c1.T))
+
+
+class Padding(Preprocessor):
+    __provider__ = 'padding'
+
+    def validate_config(self):
+        class _ConfigValidator(BasePreprocessorConfig):
+            stride = NumberField(floats=False, min_value=1, optional=True)
+            pad_value = StringField(optional=True)
+            size = NumberField(floats=False, optional=True, min_value=1)
+            dst_width = NumberField(floats=False, optional=True, min_value=1)
+            dst_height = NumberField(floats=False, optional=True, min_value=1)
+
+        _ConfigValidator(self.name).validate(self.config)
+
+    def configure(self):
+        self.stride = self.config.get('stride', 1)
+        pad_val = self.config.get('pad_value', '0,0,0')
+        if isinstance(pad_val, int):
+            self.pad_value = (pad_val, pad_val, pad_val)
+        if isinstance(pad_val, str):
+            self.pad_value = string_to_tuple(pad_val, int)
+        self.dst_height, self.dst_width = get_size_from_config(self.config, allow_none=True)
+
+    def process(self, image, annotation_meta=None):
+        height, width, _ = image.data.shape
+        pref_height = self.dst_height or image.metadata.get('preferable_height', height)
+        pref_width = self.dst_width or image.metadata.get('preferable_width', width)
+        # the padded size cannot be smaller than the input image in either dimension
+        pref_height = max(pref_height, height)
+        pref_height = math.ceil(pref_height / float(self.stride)) * self.stride
+        pref_width = max(pref_width, width)
+        pref_width = math.ceil(pref_width / float(self.stride)) * self.stride
+        pad = []
+        pad.append(int(math.floor((pref_height - height) / 2.0)))
+        pad.append(int(math.floor((pref_width - width) / 2.0)))
+        pad.append(int(pref_height - height - pad[0]))
+        pad.append(int(pref_width - width - pad[1]))
+        image.metadata['padding'] = pad
+        image.data = cv2.copyMakeBorder(
+            image.data, pad[0], pad[2], pad[1], pad[3], cv2.BORDER_CONSTANT, value=self.pad_value
+        )
+
+        return image
+
+
+class Tiling(Preprocessor):
+    __provider__ = 'tiling'
+
+    def validate_config(self):
+        class _ConfigValidator(BasePreprocessorConfig):
+            margin = NumberField(floats=False, min_value=1)
+            size = NumberField(floats=False, optional=True, min_value=1)
+            dst_width = NumberField(floats=False, optional=True, min_value=1)
+            dst_height = NumberField(floats=False, optional=True, min_value=1)
+
+        _ConfigValidator(self.name, on_extra_argument=_ConfigValidator.ERROR_ON_EXTRA_ARGUMENT).validate(self.config)
+
+    def configure(self):
+        self.dst_height, self.dst_width = get_size_from_config(self.config)
+        self.margin = self.config['margin']
+
+    def process(self, image, annotation_meta=None):
+        data = image.data
+        image_size = data.shape
+        output_height = self.dst_height - 2 * self.margin
+        output_width = self.dst_width - 2 * self.margin
+        data = cv2.copyMakeBorder(data, *np.full(4, self.margin), cv2.BORDER_REFLECT_101)
+        num_tiles_h = image_size[0] // output_height + (1 if image_size[0] % output_height else 0)
+        num_tiles_w = image_size[1] // output_width + (1 if image_size[1] % output_width else 0)
+        tiled_data = []
+        for height in range(num_tiles_h):
+            for width in range(num_tiles_w):
+                offset = [output_height * height, output_width * width]
+                tile = data[offset[0]:offset[0] + self.dst_height, offset[1]:offset[1] + self.dst_width, :]
+                margin = [0, self.dst_height - tile.shape[0], 0, self.dst_width - tile.shape[1]]
+                tile = cv2.copyMakeBorder(tile, *margin, cv2.BORDER_REFLECT_101)
+                tiled_data.append(tile)
+        image.data = tiled_data
+        image.metadata['tiles_shape'] = (num_tiles_h, num_tiles_w)
+        image.metadata['multi_infer'] = True
+
+        return image
+
+
+class Crop3D(Preprocessor):
+    __provider__ = 'crop3d'
+
+    def validate_config(self):
+        class _ConfigValidator(BasePreprocessorConfig):
+            size = NumberField(floats=False, min_value=1)
+            dst_width = NumberField(floats=False, optional=True, min_value=1)
+            dst_height = NumberField(floats=False, optional=True, min_value=1)
+            dst_volume = NumberField(floats=False, optional=True, min_value=1)
+
+        _ConfigValidator(self.name, on_extra_argument=_ConfigValidator.ERROR_ON_EXTRA_ARGUMENT).validate(self.config)
+
+    def configure(self):
+        self.dst_height, self.dst_width, self.dst_volume = get_size_3d_from_config(self.config)
+
+    def process(self, image, annotation_meta=None):
+        image.data = self.crop_center(image.data, self.dst_height, self.dst_width, self.dst_volume)
+        return image
+
+    @staticmethod
+    def crop_center(img, cropx, cropy, cropz):
+
+        z, y,
x, _ = img.shape + + # Make sure starting index is >= 0 + startx = max(x // 2 - (cropx // 2), 0) + starty = max(y // 2 - (cropy // 2), 0) + startz = max(z // 2 - (cropz // 2), 0) + + # Make sure ending index is <= size + endx = min(startx + cropx, x) + endy = min(starty + cropy, y) + endz = min(startz + cropz, z) + + return img[startz:endz, starty:endy, startx:endx, :] + + +class Normalize3d(Preprocessor): + __provider__ = "normalize3d" + + def process(self, image, annotation_meta=None): + data = self.normalize_img(image.data) + image_list = [] + for img in data: + image_list.append(img) + image.data = image_list + image.metadata['multi_infer'] = True + + return image + + @staticmethod + def normalize_img(img): + for channel in range(img.shape[3]): + channel_val = img[:, :, :, channel] - np.mean(img[:, :, :, channel]) + channel_val /= np.std(img[:, :, :, channel]) + img[:, :, :, channel] = channel_val + + return img diff --git a/tools/accuracy_checker/accuracy_checker/presenters.py b/tools/accuracy_checker/accuracy_checker/presenters.py new file mode 100644 index 0000000..9c39e1f --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/presenters.py @@ -0,0 +1,123 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from collections import namedtuple +from enum import Enum +import numpy as np + +from .dependency import ClassProvider +from .logging import print_info + +EvaluationResult = namedtuple('EvaluationResult', ['evaluated_value', 'reference_value', 'name', 'threshold', 'meta']) + + +class Color(Enum): + PASSED = 0 + FAILED = 1 + + +def color_format(s, color=Color.PASSED): + if color == Color.PASSED: + return "\x1b[0;32m{}\x1b[0m".format(s) + return "\x1b[0;31m{}\x1b[0m".format(s) + + +class BasePresenter(ClassProvider): + __provider_type__ = "presenter" + + def write_result(self, evaluation_result, output_callback=None, ignore_results_formatting=False): + raise NotImplementedError + + +class ScalarPrintPresenter(BasePresenter): + __provider__ = "print_scalar" + + def write_result(self, evaluation_result: EvaluationResult, output_callback=None, ignore_results_formatting=False): + value, reference, name, threshold, meta = evaluation_result + value = np.mean(value) + postfix, scale, result_format = get_result_format_parameters(meta, ignore_results_formatting) + write_scalar_result( + value, name, reference, threshold, postfix=postfix, scale=scale, result_format=result_format + ) + + +class VectorPrintPresenter(BasePresenter): + __provider__ = "print_vector" + + def write_result(self, evaluation_result: EvaluationResult, output_callback=None, ignore_results_formatting=False): + value, reference, name, threshold, meta = evaluation_result + if threshold: + threshold = float(threshold) + + value_names = meta.get('names') + postfix, scale, result_format = get_result_format_parameters(meta, ignore_results_formatting) + if np.isscalar(value) or np.size(value) == 1: + value = [value] + + for index, res in enumerate(value): + write_scalar_result( + res, name, 
reference, threshold, + value_name=value_names[index] if value_names else None, + postfix=postfix[index] if not np.isscalar(postfix) else postfix, + scale=scale[index] if not np.isscalar(scale) else scale, + result_format=result_format + ) + + if len(value) > 1 and meta.get('calculate_mean', True): + write_scalar_result( + np.mean(np.multiply(value, scale)), name, reference, threshold, value_name='mean', + postfix=postfix[-1] if not np.isscalar(postfix) else postfix, scale=1, + result_format=result_format + ) + + +def write_scalar_result(res_value, name, reference, threshold, value_name=None, postfix='%', scale=100, + result_format='{:.2f}'): + display_name = "{}@{}".format(name, value_name) if value_name else name + display_result = result_format.format(res_value * scale) + message = '{}: {}{}'.format(display_name, display_result, postfix) + + if reference: + threshold = threshold or 0 + + difference = abs(reference - (res_value * scale)) + if threshold <= difference: + fail_message = "[FAILED: error = {:.4}]".format(difference) + message = "{} {}".format(message, color_format(fail_message, Color.FAILED)) + else: + message = "{} {}".format(message, color_format("[OK]", Color.PASSED)) + + print_info(message) + + +class ReturnValuePresenter(BasePresenter): + __provider__ = "return_value" + + def write_result(self, evaluation_result: EvaluationResult, output_callback=None, ignore_results_formatting=False): + if output_callback: + output_callback(evaluation_result) + + +def get_result_format_parameters(meta, use_default_formatting): + postfix = ' ' + scale = 1 + result_format = '{}' + if not use_default_formatting: + postfix = meta.get('postfix', '%') + scale = meta.get('scale', 100) + result_format = meta.get('data_format', '{:.2f}') + + return postfix, scale, result_format diff --git a/tools/accuracy_checker/accuracy_checker/progress_reporters.py b/tools/accuracy_checker/accuracy_checker/progress_reporters.py new file mode 100644 index 0000000..3938e7d --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/progress_reporters.py @@ -0,0 +1,92 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
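+
+Reviewer sketch (not part of the original patch): a minimal driver loop for the
+progress reporters defined below, kept inside this docstring so the module is
+unchanged at runtime; the dataset and batch sizes are illustrative only.
+
+    reporter = PrintProgressReporter(dataset_size=1000, print_interval=10)
+    for batch_id in range(100):
+        # ... run inference on one batch of 10 inputs ...
+        reporter.update(batch_id, batch_size=10)   # prints every 10th batch
+    reporter.finish()  # prints '1000 objects processed in N.NNN seconds'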
+""" + +import time + +from tqdm import tqdm + +from .dependency import ClassProvider +from .logging import print_info + + +class ProgressReporter(ClassProvider): + __provider_type__ = 'progress_reporter' + + def __init__(self, dataset_size=None): + self.finished = True + self.dataset_size = None + self.start_time = None + self.prev_time = None + if dataset_size is not None: + self.reset(dataset_size) + + def finish(self, objects_processed=True): + self.finished = True + if not objects_processed: + return + + process_time = time.time() - self.start_time + print_info('{} objects processed in {:.3f} seconds'.format(self.dataset_size, process_time)) + + def reset(self, dataset_size): + if not self.finished: + self.finish(objects_processed=False) + + self.dataset_size = dataset_size + self.start_time = time.time() + self.finished = False + + +class PrintProgressReporter(ProgressReporter): + __provider__ = 'print' + + def __init__(self, dataset_size=None, print_interval=1000): + super().__init__(dataset_size) + self.print_interval = print_interval + + def reset(self, dataset_size): + self.dataset_size = dataset_size + print_info('Total dataset size: {}'.format(dataset_size)) + self.start_time = time.time() + self.prev_time = self.start_time + + def update(self, batch_id, batch_size): + if (batch_id + 1) % self.print_interval != 0: + return + + now = time.time() + batch_time = now - self.prev_time + self.prev_time = now + + print_info('{} / {} processed in {:.3f}s'.format((batch_id + 1) * batch_size, self.dataset_size, batch_time)) + + +class TQDMReporter(ProgressReporter): + __provider__ = 'bar' + + def update(self, _batch_id, batch_size): + self.tqdm.update(batch_size) + + def finish(self, objects_processed=True): + self.tqdm.close() + super().finish(objects_processed) + + def reset(self, dataset_size): + super().reset(dataset_size) + self.tqdm = tqdm( + total=self.dataset_size, unit='frames', leave=False, + bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]' + ) diff --git a/tools/accuracy_checker/accuracy_checker/representation/__init__.py b/tools/accuracy_checker/accuracy_checker/representation/__init__.py new file mode 100644 index 0000000..0ceabc3 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/representation/__init__.py @@ -0,0 +1,103 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from .base_representation import BaseRepresentation +from .classification_representation import Classification, ClassificationAnnotation, ClassificationPrediction +from .detection_representation import Detection, DetectionAnnotation, DetectionPrediction +from .reid_representation import ( + ReIdentificationAnnotation, + ReIdentificationClassificationAnnotation, + ReIdentificationPrediction +) +from .segmentation_representation import ( + SegmentationRepresentation, + SegmentationAnnotation, + SegmentationPrediction, + BrainTumorSegmentationAnnotation, + BrainTumorSegmentationPrediction +) +from .character_recognition_representation import ( + CharacterRecognition, + CharacterRecognitionAnnotation, + CharacterRecognitionPrediction +) +from .representaton_container import ContainerRepresentation, ContainerAnnotation, ContainerPrediction +from .regression_representation import ( + RegressionAnnotation, + RegressionPrediction, + FacialLandmarksAnnotation, + FacialLandmarksPrediction, + GazeVectorAnnotation, + GazeVectorPrediction +) +from .multilabel_recognition import MultiLabelRecognitionAnnotation, MultiLabelRecognitionPrediction +from .super_resolution_representation import SuperResolutionAnnotation, SuperResolutionPrediction +from .text_detection_representation import TextDetectionAnnotation, TextDetectionPrediction +from .pose_estimation_representation import PoseEstimationAnnotation, PoseEstimationPrediction +from .hit_ratio_representation import HitRatio, HitRatioAnnotation, HitRatioPrediction + +__all__ = [ + 'BaseRepresentation', + + 'Classification', + 'ClassificationAnnotation', + 'ClassificationPrediction', + + 'Detection', + 'DetectionAnnotation', + 'DetectionPrediction', + + 'ReIdentificationAnnotation', + 'ReIdentificationClassificationAnnotation', + 'ReIdentificationPrediction', + + 'SegmentationRepresentation', + 'SegmentationAnnotation', + 'SegmentationPrediction', + 'BrainTumorSegmentationAnnotation', + 'BrainTumorSegmentationPrediction', + + 'CharacterRecognition', + 'CharacterRecognitionAnnotation', + 'CharacterRecognitionPrediction', + + 'ContainerRepresentation', + 'ContainerAnnotation', + 'ContainerPrediction', + + 'RegressionAnnotation', + 'RegressionPrediction', + 'FacialLandmarksAnnotation', + 'FacialLandmarksPrediction', + 'GazeVectorAnnotation', + 'GazeVectorPrediction', + + 'MultiLabelRecognitionAnnotation', + 'MultiLabelRecognitionPrediction', + + 'SuperResolutionAnnotation', + 'SuperResolutionPrediction', + + 'TextDetectionAnnotation', + 'TextDetectionPrediction', + + 'PoseEstimationAnnotation', + 'PoseEstimationPrediction', + + 'HitRatio', + 'HitRatioAnnotation', + 'HitRatioPrediction' +] diff --git a/tools/accuracy_checker/accuracy_checker/representation/base_representation.py b/tools/accuracy_checker/accuracy_checker/representation/base_representation.py new file mode 100644 index 0000000..05d53b5 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/representation/base_representation.py @@ -0,0 +1,42 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +""" + +import abc +import pickle + + +class BaseRepresentation(abc.ABC): + def __init__(self, identifier, metadata=None): + self.identifier = identifier + self.metadata = metadata or {} + + @classmethod + def load(cls, file): + obj = pickle.load(file) + + if cls != BaseRepresentation: + assert isinstance(obj, cls) + + return obj + + def dump(self, file): + pickle.dump(self, file) + + def set_image_size(self, image_sizes): + self.metadata['image_size'] = image_sizes + + def set_data_source(self, data_source): + self.metadata['data_source'] = data_source diff --git a/tools/accuracy_checker/accuracy_checker/representation/character_recognition_representation.py b/tools/accuracy_checker/accuracy_checker/representation/character_recognition_representation.py new file mode 100644 index 0000000..df6a241 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/representation/character_recognition_representation.py @@ -0,0 +1,31 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from .base_representation import BaseRepresentation + + +class CharacterRecognition(BaseRepresentation): + def __init__(self, identifier='', label=None): + super().__init__(identifier) + self.label = label + + +class CharacterRecognitionAnnotation(CharacterRecognition): + pass + + +class CharacterRecognitionPrediction(CharacterRecognition): + pass diff --git a/tools/accuracy_checker/accuracy_checker/representation/classification_representation.py b/tools/accuracy_checker/accuracy_checker/representation/classification_representation.py new file mode 100644 index 0000000..67f72f6 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/representation/classification_representation.py @@ -0,0 +1,44 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
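+
+Reviewer sketch (not part of the original patch): the prediction type defined
+below derives its label lazily from the raw scores via argmax; note that
+top_k returns the indices of the k largest scores in unspecified order.
+
+    pred = ClassificationPrediction('img.png', scores=[0.05, 0.15, 0.60, 0.20])
+    pred.label             # -> 2, argmax of the scores
+    sorted(pred.top_k(2))  # -> [2, 3]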
+""" + +import numpy as np + +from .base_representation import BaseRepresentation + + +class Classification(BaseRepresentation): + pass + + +class ClassificationAnnotation(Classification): + def __init__(self, identifier='', label=None): + super().__init__(identifier) + + self.label = label + + +class ClassificationPrediction(Classification): + def __init__(self, identifier='', scores=None): + super().__init__(identifier) + + self.scores = np.array(scores) if scores is not None else np.array([]) + + @property + def label(self): + return np.argmax(self.scores) + + def top_k(self, k): + return np.argpartition(self.scores, -k)[-k:] diff --git a/tools/accuracy_checker/accuracy_checker/representation/detection_representation.py b/tools/accuracy_checker/accuracy_checker/representation/detection_representation.py new file mode 100644 index 0000000..1fc2c8b --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/representation/detection_representation.py @@ -0,0 +1,87 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np + +from ..utils import remove_difficult +from .base_representation import BaseRepresentation + + +class Detection(BaseRepresentation): + def __init__(self, identifier='', labels=None, x_mins=None, y_mins=None, x_maxs=None, y_maxs=None, metadata=None): + super().__init__(identifier, metadata) + + self.labels = np.array(labels) if labels is not None else np.array([]) + self.x_mins = np.array(x_mins) if x_mins is not None else np.array([]) + self.y_mins = np.array(y_mins) if y_mins is not None else np.array([]) + self.x_maxs = np.array(x_maxs) if x_maxs is not None else np.array([]) + self.y_maxs = np.array(y_maxs) if y_maxs is not None else np.array([]) + + def remove(self, indexes): + self.labels = np.delete(self.labels, indexes) + self.x_mins = np.delete(self.x_mins, indexes) + self.y_mins = np.delete(self.y_mins, indexes) + self.x_maxs = np.delete(self.x_maxs, indexes) + self.y_maxs = np.delete(self.y_maxs, indexes) + + difficult_boxes = self.metadata.get('difficult_boxes') + if not difficult_boxes: + return + + new_difficult_boxes = remove_difficult(difficult_boxes, indexes) + + self.metadata['difficult_boxes'] = new_difficult_boxes + + @property + def size(self): + return len(self.x_mins) + + def __eq__(self, other): + if not isinstance(other, type(self)): + return False + + def are_bounding_boxes_equal(): + if not np.array_equal(self.labels, other.labels): + return False + if not np.array_equal(self.x_mins, other.x_mins): + return False + if not np.array_equal(self.y_mins, other.y_mins): + return False + if not np.array_equal(self.x_maxs, other.x_maxs): + return False + if not np.array_equal(self.y_maxs, other.y_maxs): + return False + return True + + return self.identifier == other.identifier and are_bounding_boxes_equal() and self.metadata == other.metadata + + +class DetectionAnnotation(Detection): + pass + + +class DetectionPrediction(Detection): + def __init__(self, identifier='', labels=None, scores=None, x_mins=None, 
y_mins=None, x_maxs=None, y_maxs=None, + metadata=None): + super().__init__(identifier, labels, x_mins, y_mins, x_maxs, y_maxs, metadata) + self.scores = np.array(scores) if scores is not None else np.array([]) + + def remove(self, indexes): + super().remove(indexes) + self.scores = np.delete(self.scores, indexes) + + def __eq__(self, other): + return np.array_equal(self.scores, other.scores) if super().__eq__(other) else False diff --git a/tools/accuracy_checker/accuracy_checker/representation/hit_ratio_representation.py b/tools/accuracy_checker/accuracy_checker/representation/hit_ratio_representation.py new file mode 100644 index 0000000..f6cb6c7 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/representation/hit_ratio_representation.py @@ -0,0 +1,40 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np + +from .base_representation import BaseRepresentation + + +class HitRatio(BaseRepresentation): + def __init__(self, identifier=''): + super().__init__(identifier) + self.user = int(identifier[0].split('u:')[-1]) + self.item = int(identifier[1].split('i:')[-1]) + + + +class HitRatioAnnotation(HitRatio): + def __init__(self, identifier='', positive=True): + super().__init__(identifier) + self.positive = positive + + +class HitRatioPrediction(HitRatio): + def __init__(self, identifier='', scores=None): + super().__init__(identifier) + + self.scores = np.array(scores) if scores is not None else np.array([]) diff --git a/tools/accuracy_checker/accuracy_checker/representation/multilabel_recognition.py b/tools/accuracy_checker/accuracy_checker/representation/multilabel_recognition.py new file mode 100644 index 0000000..d5af464 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/representation/multilabel_recognition.py @@ -0,0 +1,32 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
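+
+Reviewer sketch (not part of the original patch): remove() on the detection
+representations defined in detection_representation.py above drops boxes by
+index across all parallel arrays at once, e.g. to discard low-scoring boxes:
+
+    pred = DetectionPrediction('img.png', labels=[1, 1], scores=[0.9, 0.05],
+                               x_mins=[10, 40], y_mins=[10, 40],
+                               x_maxs=[30, 60], y_maxs=[30, 60])
+    low = [i for i, score in enumerate(pred.scores) if score < 0.5]
+    pred.remove(low)  # pred.size is now 1 and pred.scores shrinks in step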
+""" + +import numpy as np +from .base_representation import BaseRepresentation + + +class MultiLabelRecognitionRepresentation(BaseRepresentation): + def __init__(self, identifier='', multi_label=None): + super().__init__(identifier) + self.multi_label = np.array(multi_label) if isinstance(multi_label, list) else multi_label + + +class MultiLabelRecognitionAnnotation(MultiLabelRecognitionRepresentation): + pass + + +class MultiLabelRecognitionPrediction(MultiLabelRecognitionRepresentation): + pass diff --git a/tools/accuracy_checker/accuracy_checker/representation/pose_estimation_representation.py b/tools/accuracy_checker/accuracy_checker/representation/pose_estimation_representation.py new file mode 100644 index 0000000..f765dd8 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/representation/pose_estimation_representation.py @@ -0,0 +1,63 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np +from .base_representation import BaseRepresentation + + +class PoseEstimationRepresentation(BaseRepresentation): + def __init__(self, identifier='', x_values=None, y_values=None, visibility=None, labels=None): + super().__init__(identifier) + self.x_values = x_values if np.size(x_values) > 0 else [] + self.y_values = y_values if np.size(y_values) > 0 else [] + self.visibility = visibility if np.size(visibility) > 0 else [2] * len(x_values) + self.labels = labels if labels is not None else np.array([1]*len(x_values)) + + @property + def areas(self): + areas = self.metadata.get('areas') + if areas: + return areas + x_mins = np.min(self.x_values, axis=1) + x_maxs = np.max(self.x_values, axis=1) + y_mins = np.min(self.y_values, axis=1) + y_maxs = np.max(self.y_values, axis=1) + return (x_maxs - x_mins) * (y_maxs - y_mins) + + @property + def bboxes(self): + rects = self.metadata.get('rects') + if rects: + return rects + x_mins = np.min(self.x_values, axis=1) + x_maxs = np.max(self.x_values, axis=1) + y_mins = np.min(self.y_values, axis=1) + y_maxs = np.max(self.y_values, axis=1) + return [[x_min, y_min, x_max, y_max] for x_min, y_min, x_max, y_max in zip(x_mins, y_mins, x_maxs, y_maxs)] + + @property + def size(self): + return len(self.x_values) + + +class PoseEstimationAnnotation(PoseEstimationRepresentation): + pass + + +class PoseEstimationPrediction(PoseEstimationRepresentation): + def __init__(self, identifier='', x_values=None, y_values=None, visibility=None, scores=None, labels=None): + super().__init__(identifier, x_values, y_values, visibility, labels) + self.scores = scores if scores.any() else [] diff --git a/tools/accuracy_checker/accuracy_checker/representation/regression_representation.py b/tools/accuracy_checker/accuracy_checker/representation/regression_representation.py new file mode 100644 index 0000000..99800d3 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/representation/regression_representation.py @@ -0,0 +1,72 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the 
"License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np +from .base_representation import BaseRepresentation + + +class RegressionRepresentation(BaseRepresentation): + def __init__(self, identifier='', value=None): + super().__init__(identifier) + self.value = value + + +class RegressionAnnotation(RegressionRepresentation): + pass + + +class RegressionPrediction(RegressionRepresentation): + pass + + +class GazeVectorRepresentation(RegressionRepresentation): + def __init__(self, identifier='', value=None): + if value is None: + value = np.array([]) + super().__init__(identifier, value) + +class GazeVectorAnnotation(GazeVectorRepresentation): + pass + +class GazeVectorPrediction(GazeVectorRepresentation): + pass + + + +class FacialLandmarksRepresentation(BaseRepresentation): + def __init__(self, identifier='', x_values=None, y_values=None): + super().__init__(identifier) + self.x_values = x_values if x_values.any() else [] + self.y_values = y_values if y_values.any() else [] + + +class FacialLandmarksAnnotation(FacialLandmarksRepresentation): + @property + def interocular_distance(self): + left_eye = [ + np.mean(self.x_values[self.metadata['left_eye']]), + np.mean(self.y_values[self.metadata['left_eye']]) + ] + right_eye = [ + np.mean(self.x_values[self.metadata['right_eye']]), + np.mean(self.y_values[self.metadata['right_eye']]) + ] + + return np.linalg.norm((np.subtract(left_eye, right_eye))) + + +class FacialLandmarksPrediction(FacialLandmarksRepresentation): + pass diff --git a/tools/accuracy_checker/accuracy_checker/representation/reid_representation.py b/tools/accuracy_checker/accuracy_checker/representation/reid_representation.py new file mode 100644 index 0000000..d212eb7 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/representation/reid_representation.py @@ -0,0 +1,42 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from .base_representation import BaseRepresentation + + +class ReIdentification(BaseRepresentation): + pass + + +class ReIdentificationAnnotation(ReIdentification): + def __init__(self, identifier, camera_id, person_id, query): + super().__init__(identifier) + self.camera_id = camera_id + self.person_id = person_id + self.query = query + + +class ReIdentificationClassificationAnnotation(ReIdentification): + def __init__(self, identifier, positive_pairs=None, negative_pairs=None): + super().__init__(identifier) + self.positive_pairs = set(positive_pairs) + self.negative_pairs = set(negative_pairs) + + +class ReIdentificationPrediction(ReIdentification): + def __init__(self, identifiers, embedding): + super().__init__(identifiers) + self.embedding = embedding.copy() diff --git a/tools/accuracy_checker/accuracy_checker/representation/representaton_container.py b/tools/accuracy_checker/accuracy_checker/representation/representaton_container.py new file mode 100644 index 0000000..add7c69 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/representation/representaton_container.py @@ -0,0 +1,78 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np +from ..representation import BaseRepresentation + + +class ContainerRepresentation(BaseRepresentation): + def __init__(self, representation_map=None): + super().__init__('') + self.representations = representation_map or {} + + def __eq__(self, other): + if not isinstance(other, type(self)): + return False + + if self.identifier != other.identifier: + return False + + if self.metadata != other.metadata: + return False + + if self.representations != other.representations: + return False + + return True + + def __getitem__(self, item): + return self.representations[item] + + def get(self, key): + return self.representations.get(key) + + def values(self): + return list(self.representations.values()) + + @property + def identifier(self): + if self._identifier: + return self._identifier + + values = self.values() + if np.size(values) == 0: + raise ValueError('representation container is empty') + + self._identifier = values[0].identifier + return self._identifier + + @identifier.setter + def identifier(self, identifier): + self._identifier = identifier + + +class ContainerAnnotation(ContainerRepresentation): + def set_image_size(self, image_sizes): + for key in self.representations.keys(): + self.representations[key].metadata['image_size'] = image_sizes + + def set_data_source(self, data_source): + for key in self.representations.keys(): + self.representations[key].metadata['data_source'] = data_source + + +class ContainerPrediction(ContainerRepresentation): + pass diff --git a/tools/accuracy_checker/accuracy_checker/representation/segmentation_representation.py b/tools/accuracy_checker/accuracy_checker/representation/segmentation_representation.py new file mode 100644 index 0000000..c6c78f0 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/representation/segmentation_representation.py @@ 
-0,0 +1,91 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from enum import Enum + +import numpy as np + +from .base_representation import BaseRepresentation +from ..data_readers import BaseReader + + +class GTMaskLoader(Enum): + PILLOW = 0 + OPENCV = 1 + SCIPY = 2 + NIFTI = 3 + + +class SegmentationRepresentation(BaseRepresentation): + pass + + +class SegmentationAnnotation(SegmentationRepresentation): + LOADERS = { + GTMaskLoader.PILLOW: 'pillow_imread', + GTMaskLoader.OPENCV: 'opencv_imread', + GTMaskLoader.SCIPY: 'scipy_imread', + GTMaskLoader.NIFTI: 'nifti_reader' + } + + def __init__(self, identifier, path_to_mask, mask_loader=GTMaskLoader.PILLOW): + """ + Args: + identifier: object identifier (e.g. image name). + path_to_mask: path where segmentation mask should be loaded from. The path is relative to data source. + mask_loader: back-end, used to load segmentation masks. + """ + + super().__init__(identifier) + self._mask_path = path_to_mask + self._mask_loader = mask_loader + self._mask = None + + @property + def mask(self): + return self._mask if self._mask is not None else self._load_mask() + + @mask.setter + def mask(self, value): + self._mask = value + + def _load_mask(self): + loader = BaseReader.provide(self.LOADERS.get(self._mask_loader)) + if self._mask is None: + mask = loader(self._mask_path, self.metadata['data_source']) + return mask.astype(np.uint8) + + return self._mask + + +class SegmentationPrediction(SegmentationRepresentation): + def __init__(self, identifiers, mask): + """ + Args: + identifiers: object identifier (e.g. image name). + mask: array with shape (n_classes, height, width) of probabilities at each location. + """ + + super().__init__(identifiers) + self.mask = mask + + +class BrainTumorSegmentationAnnotation(SegmentationAnnotation): + def __init__(self, identifier, path_to_mask): + super().__init__(identifier, path_to_mask, GTMaskLoader.NIFTI) + +class BrainTumorSegmentationPrediction(SegmentationPrediction): + pass diff --git a/tools/accuracy_checker/accuracy_checker/representation/super_resolution_representation.py b/tools/accuracy_checker/accuracy_checker/representation/super_resolution_representation.py new file mode 100644 index 0000000..8cf989e --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/representation/super_resolution_representation.py @@ -0,0 +1,67 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
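+
+Reviewer sketch (not part of the original patch): the segmentation annotation
+in the previous file resolves its ground truth mask lazily through a reader
+back-end, so nothing is read from disk until the mask is first accessed;
+the paths below are illustrative.
+
+    annotation = SegmentationAnnotation('img_0001.png', 'masks/img_0001.png',
+                                        mask_loader=GTMaskLoader.OPENCV)
+    annotation.set_data_source('/datasets/voc')  # readers resolve paths here
+    mask = annotation.mask                       # triggers the actual read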
+""" + +from enum import Enum +import numpy as np + +from .base_representation import BaseRepresentation +from ..data_readers import BaseReader + + +class GTLoader(Enum): + PILLOW = 0 + OPENCV = 1 + + +class SuperResolutionRepresentation(BaseRepresentation): + pass + + +class SuperResolutionAnnotation(SuperResolutionRepresentation): + LOADERS = { + GTLoader.PILLOW: 'pillow_imread', + GTLoader.OPENCV: 'opencv_imread' + } + + def __init__(self, identifier, path_to_hr, gt_loader=GTLoader.PILLOW): + """ + Args: + identifier: object identifier (e.g. image name). + path_to_hr: path where height resolution image should be loaded from. The path is relative to data source. + gt_loader: back-end, used to load segmentation masks. + """ + + super().__init__(identifier) + self._image_path = path_to_hr + self._gt_loader = self.LOADERS.get(gt_loader) + + @property + def value(self): + loader = BaseReader.provide(self._gt_loader) + gt = loader.read(self._image_path, self.metadata['data_source']) + return gt.astype(np.uint8) + + +class SuperResolutionPrediction(SuperResolutionRepresentation): + def __init__(self, identifiers, prediction): + """ + Args: + identifiers: object identifier (e.g. image name). + prediction: array with shape (height, width) contained result image. + """ + + super().__init__(identifiers) + self.value = prediction diff --git a/tools/accuracy_checker/accuracy_checker/representation/text_detection_representation.py b/tools/accuracy_checker/accuracy_checker/representation/text_detection_representation.py new file mode 100644 index 0000000..38e7a9c --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/representation/text_detection_representation.py @@ -0,0 +1,46 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import numpy as np +from ..utils import remove_difficult +from .base_representation import BaseRepresentation + + +class TextDetectionRepresentation(BaseRepresentation): + def __init__(self, identifier='', points=None): + super().__init__(identifier) + self.points = points or [] + + def remove(self, indexes): + self.points = np.delete(self.points, indexes, axis=0) + difficult = self.metadata.get('difficult_boxes') + if not difficult: + return + self.metadata['difficult_boxes'] = remove_difficult(difficult, indexes) + + +class TextDetectionAnnotation(TextDetectionRepresentation): + def __init__(self, identifier='', points=None, description=''): + super().__init__(identifier, points) + self.description = description + + def remove(self, indexes): + super().remove(indexes) + self.description = np.delete(self.description, indexes) + + +class TextDetectionPrediction(TextDetectionRepresentation): + pass diff --git a/tools/accuracy_checker/accuracy_checker/utils.py b/tools/accuracy_checker/accuracy_checker/utils.py new file mode 100644 index 0000000..f03a0a2 --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/utils.py @@ -0,0 +1,361 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import collections +import csv +import errno +import itertools +import json +import os +import pickle + +from pathlib import Path +from typing import Union +from warnings import warn + +from shapely.geometry.polygon import Polygon +import numpy as np +import yaml + +try: + import lxml.etree as et +except ImportError: + import xml.etree.cElementTree as et + + +def concat_lists(*lists): + return list(itertools.chain(*lists)) + + +def get_path(entry: Union[str, Path], is_directory=False): + try: + path = Path(entry) + except TypeError: + raise TypeError('"{}" is expected to be a path-like'.format(entry)) + + # pathlib.Path.exists throws an exception in case of broken symlink + if not os.path.exists(str(path)): + raise FileNotFoundError('{}: {}'.format(os.strerror(errno.ENOENT), path)) + + if is_directory and not path.is_dir(): + raise NotADirectoryError('{}: {}'.format(os.strerror(errno.ENOTDIR), path)) + + # if it exists it is either file (or valid symlink to file) or directory (or valid symlink to directory) + if not is_directory and not path.is_file(): + raise IsADirectoryError('{}: {}'.format(os.strerror(errno.EISDIR), path)) + + return path + + +def contains_all(container, *args): + sequence = set(container) + + for arg in args: + if len(sequence.intersection(arg)) != len(arg): + return False + + return True + + +def contains_any(container, *args): + sequence = set(container) + + for arg in args: + if sequence.intersection(arg): + return True + + return False + + +def string_to_tuple(string, casting_type=float): + processed = string.replace(' ', '') + processed = processed.replace('(', '') + processed = processed.replace(')', '') + processed = processed.split(',') + + return tuple([casting_type(entry) for entry in processed]) + + +def string_to_list(string): + processed = 
string.replace(' ', '')
+    processed = processed.replace('[', '')
+    processed = processed.replace(']', '')
+    processed = processed.split(',')
+
+    return list(entry for entry in processed)
+
+
+class JSONDecoderWithAutoConversion(json.JSONDecoder):
+    """
+    Custom json decoder to convert all strings into numbers (int, float) while reading a json file.
+    """
+
+    def decode(self, s, _w=json.decoder.WHITESPACE.match):
+        decoded = super().decode(s, _w)
+        return self._decode(decoded)
+
+    def _decode(self, entry):
+        if isinstance(entry, str):
+            try:
+                return int(entry)
+            except ValueError:
+                pass
+            try:
+                return float(entry)
+            except ValueError:
+                pass
+        elif isinstance(entry, dict):
+            return {self._decode(key): self._decode(value) for key, value in entry.items()}
+        elif isinstance(entry, list):
+            return [self._decode(value) for value in entry]
+
+        return entry
+
+
+def dict_subset(dict_, key_subset):
+    return {key: value for key, value in dict_.items() if key in key_subset}
+
+
+def zipped_transform(fn, *iterables, inplace=False):
+    result = (iterables if inplace else tuple([] for _ in range(len(iterables))))
+    updater = (list.__setitem__ if inplace else lambda container, _, entry: container.append(entry))
+
+    for idx, values in enumerate(zip(*iterables)):
+        iter_res = fn(*values)
+        if not iter_res:
+            continue
+
+        for dst, res in zip(result, iter_res):
+            updater(dst, idx, res)
+
+    return result
+
+
+def overrides(obj, attribute_name, base=None):
+    cls = obj if isinstance(obj, type) else obj.__class__
+
+    base = base or cls.__bases__[0]
+    obj_attr = getattr(cls, attribute_name, None)
+    base_attr = getattr(base, attribute_name, None)
+
+    return obj_attr and obj_attr != base_attr
+
+
+def enum_values(enum):
+    return [member.value for member in enum]
+
+
+def get_size_from_config(config, allow_none=False):
+    if contains_all(config, ('size', 'dst_width', 'dst_height')):
+        warn('All parameters: size, dst_width, dst_height are provided. Size will be used. '
+             'You should specify only size or pair values dst_width, dst_height in config.')
+    if 'size' in config:
+        return config['size'], config['size']
+    if contains_all(config, ('dst_width', 'dst_height')):
+        return config['dst_height'], config['dst_width']
+    if not allow_none:
+        raise ValueError('Either size or dst_width and dst_height required')
+
+    return None, None
+
+
+def get_size_3d_from_config(config, allow_none=False):
+    if contains_all(config, ('size', 'dst_width', 'dst_height', 'dst_volume')):
+        warn('All parameters: size, dst_width, dst_height, dst_volume are provided. Size will be used. '
+             'You should specify only size or three values dst_width, dst_height, dst_volume in config.')
+    if 'size' in config:
+        return config['size'], config['size'], config['size']
+    if contains_all(config, ('dst_width', 'dst_height', 'dst_volume')):
+        return config['dst_height'], config['dst_width'], config['dst_volume']
+    if not allow_none:
+        raise ValueError('Either size or dst_width, dst_height and dst_volume required')
+
+    return config.get('dst_height'), config.get('dst_width'), config.get('dst_volume')
+
+
+def in_interval(value, interval):
+    minimum = interval[0]
+    maximum = interval[1] if len(interval) >= 2 else None
+
+    if not maximum:
+        return minimum <= value
+
+    return minimum <= value < maximum
+
+
+def finalize_metric_result(values, names):
+    result_values, result_names = [], []
+    for value, name in zip(values, names):
+        if np.isnan(value):
+            continue
+
+        result_values.append(value)
+        result_names.append(name)
+
+    return result_values, result_names
+
+
+def get_representations(values, representation_source):
+    return np.reshape([value.get(representation_source) for value in values], -1)
+
+
+def get_supported_representations(container, supported_types):
+    if np.shape(container) == ():
+        container = [container]
+
+    return list(filter(lambda rep: check_representation_type(rep, supported_types), container))
+
+
+def check_representation_type(representation, representation_types):
+    for representation_type in representation_types:
+        if type(representation).__name__ == representation_type.__name__:
+            return True
+    return False
+
+
+def is_single_metric_source(source):
+    if not source:
+        return False
+
+    return np.size(source.split(',')) == 1
+
+
+def read_txt(file: Union[str, Path], sep='\n', **kwargs):
+    def is_empty(string):
+        return not string or string.isspace()
+
+    with get_path(file).open() as content:
+        content = content.read(**kwargs).split(sep)
+        content = list(filter(lambda string: not is_empty(string), content))
+
+    return list(map(str.strip, content))
+
+
+def read_xml(file: Union[str, Path], *args, **kwargs):
+    return et.parse(str(get_path(file)), *args, **kwargs).getroot()
+
+
+def read_json(file: Union[str, Path], *args, **kwargs):
+    with get_path(file).open() as content:
+        return json.load(content, *args, **kwargs)
+
+
+def read_pickle(file: Union[str, Path], *args, **kwargs):
+    with get_path(file).open('rb') as content:
+        return pickle.load(content, *args, **kwargs)
+
+
+def read_yaml(file: Union[str, Path], *args, **kwargs):
+    # yaml does not keep the order of dictionary keys by default, but order matters
+    # when reading the pre/post processing pipelines
+    yaml.add_representer(collections.OrderedDict, lambda dumper, data: dumper.represent_dict(data.items()))
+    yaml.add_constructor(
+        yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
+        lambda loader, node: collections.OrderedDict(loader.construct_pairs(node))
+    )
+
+    with get_path(file).open() as content:
+        return yaml.load(content, Loader=yaml.SafeLoader, *args, **kwargs)
+
+
+def read_csv(file: Union[str, Path], *args, **kwargs):
+    with get_path(file).open() as content:
+        return list(csv.DictReader(content, *args, **kwargs))
+
+
+def extract_image_representations(image_representations):
+    images = [rep.data for rep in image_representations]
+    meta = [rep.metadata for rep in image_representations]
+
+    return images, meta
+
+
+def convert_bboxes_xywh_to_x1y1x2y2(x_coord, y_coord, width, height):
+    return x_coord, y_coord, x_coord + width, y_coord + height
+
+
+def get_or_parse_value(item, supported_values, default=None):
+    if isinstance(item, str):
+        item = 
item.lower() + if item in supported_values: + return supported_values[item] + + try: + return string_to_tuple(item) + except ValueError: + message = 'Invalid value "{}", expected one of precomputed: ({}) or list of values'.format( + item, ', '.join(supported_values.keys()) + ) + raise ValueError(message) + + if isinstance(item, (float, int)): + return (item, ) + + return default + + +def string_to_bool(string): + return string.lower() in ['yes', 'true', 't', '1'] + + +def get_key_by_value(container, target): + for key, value in container.items(): + if value == target: + return key + + return None + + +def format_key(key): + return '--{}'.format(key) + + +def to_lower_register(str_list): + return list(map(lambda item: item.lower() if item else None, str_list)) + + +def polygon_from_points(points): + return Polygon(points) + + +def remove_difficult(difficult, indexes): + new_difficult = [] + decrementor = 0 + id_difficult = 0 + id_removed = 0 + while id_difficult < len(difficult) and id_removed < len(indexes): + if difficult[id_difficult] < indexes[id_removed]: + new_difficult.append(difficult[id_difficult] - decrementor) + id_difficult += 1 + else: + decrementor += 1 + id_removed += 1 + + return new_difficult + + +def convert_to_range(entry): + entry_range = entry + if isinstance(entry, str): + entry_range = string_to_tuple(entry_range) + elif not isinstance(entry_range, tuple) and not isinstance(entry_range, list): + entry_range = [entry_range] + + return entry_range + + +def add_input_shape_to_meta(meta, shape): + meta['input_shape'] = shape + return meta diff --git a/tools/accuracy_checker/configs/face-detection-adas-0001.yml b/tools/accuracy_checker/configs/face-detection-adas-0001.yml new file mode 100644 index 0000000..9b573df --- /dev/null +++ b/tools/accuracy_checker/configs/face-detection-adas-0001.yml @@ -0,0 +1,94 @@ +models: + - name: face-detection-adas-0001 + + launchers: + - framework: dlsdk + tags: + - FP32 + device: CPU + model: face-detection-adas-0001/FP32/face-detection-adas-0001.xml + weights: face-detection-adas-0001/FP32/face-detection-adas-0001.bin + adapter: ssd + cpu_extensions: AUTO + + - framework: dlsdk + tags: + - INT8 + device: CPU + model: face-detection-adas-0001/INT8/face-detection-adas-0001.xml + weights: face-detection-adas-0001/INT8/face-detection-adas-0001.bin + adapter: ssd + cpu_extensions: AUTO + + - framework: dlsdk + tags: + - GPU32 + device: GPU + model: face-detection-adas-0001/FP32/face-detection-adas-0001.xml + weights: face-detection-adas-0001/FP32/face-detection-adas-0001.bin + adapter: ssd + + - framework: dlsdk + tags: + - GPU16 + device: GPU + model: face-detection-adas-0001/FP16/face-detection-adas-0001.xml + weights: face-detection-adas-0001/FP16/face-detection-adas-0001.bin + adapter: ssd + + - framework: dlsdk + device: MYRIAD + model: face-detection-adas-0001/FP16/face-detection-adas-0001.xml + weights: face-detection-adas-0001/FP16/face-detection-adas-0001.bin + adapter: ssd + + - framework: dlsdk + device: HDDL + model: face-detection-adas-0001/FP16/face-detection-adas-0001.xml + weights: face-detection-adas-0001/FP16/face-detection-adas-0001.bin + adapter: ssd + + - framework: dlsdk + tags: + - FPGA16 + device: HETERO:FPGA,CPU + model: face-detection-adas-0001/FP32/face-detection-adas-0001.xml + weights: face-detection-adas-0001/FP32/face-detection-adas-0001.bin + adapter: ssd + cpu_extensions: AUTO + bitstream: 2019R1_A10DK_FP16_MobileNet_Clamp.aocx + + - framework: dlsdk + tags: + - FPGA11 + device: HETERO:FPGA,CPU + model: 
face-detection-adas-0001/FP32/face-detection-adas-0001.xml + weights: face-detection-adas-0001/FP32/face-detection-adas-0001.bin + adapter: ssd + cpu_extensions: AUTO + bitstream: 2019R1_A10DK_FP11_ELU.aocx + + datasets: + - name: wider + data_source: WIDER_val/images + annotation_conversion: + converter: wider + annotation_file: wider_face_split/wider_face_val_bbx_gt.txt + + preprocessing: + - type: resize + dst_width: 672 + dst_height: 384 + + postprocessing: + - type: resize_prediction_boxes + - type: filter + height_range: 100 + apply_to: annotation + + metrics: + - type: map + ignore_difficult: True + include_boundaries: False + allow_multiple_matches_per_ignored: True + use_filtered_tp: True diff --git a/tools/accuracy_checker/configs/face-detection-retail-0004.yml b/tools/accuracy_checker/configs/face-detection-retail-0004.yml new file mode 100644 index 0000000..74b7872 --- /dev/null +++ b/tools/accuracy_checker/configs/face-detection-retail-0004.yml @@ -0,0 +1,98 @@ +models: + - name: face-detection-retail-0004 + + launchers: + - framework: dlsdk + tags: + - FP32 + device: CPU + model: face-detection-retail-0004/FP32/face-detection-retail-0004.xml + weights: face-detection-retail-0004/FP32/face-detection-retail-0004.bin + adapter: ssd + cpu_extensions: AUTO + + - framework: dlsdk + tags: + - INT8 + device: CPU + model: face-detection-retail-0004/INT8/face-detection-retail-0004.xml + weights: face-detection-retail-0004/INT8/face-detection-retail-0004.bin + adapter: ssd + cpu_extensions: AUTO + + - framework: dlsdk + tags: + - GPU32 + device: GPU + model: face-detection-retail-0004/FP32/face-detection-retail-0004.xml + weights: face-detection-retail-0004/FP32/face-detection-retail-0004.bin + adapter: ssd + + - framework: dlsdk + tags: + - GPU16 + device: GPU + model: face-detection-retail-0004/FP16/face-detection-retail-0004.xml + weights: face-detection-retail-0004/FP16/face-detection-retail-0004.bin + adapter: ssd + + - framework: dlsdk + device: MYRIAD + model: face-detection-retail-0004/FP16/face-detection-retail-0004.xml + weights: face-detection-retail-0004/FP16/face-detection-retail-0004.bin + adapter: ssd + + - framework: dlsdk + device: HDDL + model: face-detection-retail-0004/FP16/face-detection-retail-0004.xml + weights: face-detection-retail-0004/FP16/face-detection-retail-0004.bin + adapter: ssd + + - framework: dlsdk + tags: + - FPGA16 + device: HETERO:FPGA,CPU + model: face-detection-retail-0004/FP32/face-detection-retail-0004.xml + weights: face-detection-retail-0004/FP32/face-detection-retail-0004.bin + adapter: ssd + cpu_extensions: AUTO + bitstream: 2019R1_A10DK_FP16_TinyYolo.aocx + + - framework: dlsdk + tags: + - FPGA11 + device: HETERO:FPGA,CPU + model: face-detection-retail-0004/FP32/face-detection-retail-0004.xml + weights: face-detection-retail-0004/FP32/face-detection-retail-0004.bin + adapter: ssd + cpu_extensions: AUTO + bitstream: 2019R1_A10DK_FP11_CaffeMobileNet.aocx + + datasets: + - name: wider + data_source: WIDER_val/images + annotation_conversion: + converter: wider + annotation_file: wider_face_split/wider_face_val_bbx_gt.txt + + preprocessing: + - type: resize + size: 300 + + postprocessing: + - type: resize_prediction_boxes + - type: cast_to_int + - type: filter + apply_to: annotation + height_range: 60 + is_empty: True + - type: filter + min_confidence: 0.0 + apply_to: prediction + + metrics: + - type: map + ignore_difficult: True + include_boundaries: False + allow_multiple_matches_per_ignored: False + distinct_conf: False diff --git 
new file mode 100644
index 0000000..de91fc9
--- /dev/null
+++ b/tools/accuracy_checker/configs/face-reidentification-retail-0095.yml
@@ -0,0 +1,74 @@
+models:
+  - name: face-reidentification-retail-0095
+
+    launchers:
+      - framework: dlsdk
+        tags:
+          - FP32
+        device: CPU
+        model: face-reidentification-retail-0095/FP32/face-reidentification-retail-0095.xml
+        weights: face-reidentification-retail-0095/FP32/face-reidentification-retail-0095.bin
+        adapter: reid
+
+      - framework: dlsdk
+        tags:
+          - GPU32
+        device: GPU
+        model: face-reidentification-retail-0095/FP32/face-reidentification-retail-0095.xml
+        weights: face-reidentification-retail-0095/FP32/face-reidentification-retail-0095.bin
+        adapter: reid
+
+      - framework: dlsdk
+        tags:
+          - GPU16
+        device: GPU
+        model: face-reidentification-retail-0095/FP16/face-reidentification-retail-0095.xml
+        weights: face-reidentification-retail-0095/FP16/face-reidentification-retail-0095.bin
+        adapter: reid
+
+      - framework: dlsdk
+        device: MYRIAD
+        model: face-reidentification-retail-0095/FP16/face-reidentification-retail-0095.xml
+        weights: face-reidentification-retail-0095/FP16/face-reidentification-retail-0095.bin
+        adapter: reid
+
+      - framework: dlsdk
+        device: HDDL
+        model: face-reidentification-retail-0095/FP16/face-reidentification-retail-0095.xml
+        weights: face-reidentification-retail-0095/FP16/face-reidentification-retail-0095.bin
+        adapter: reid
+
+      - framework: dlsdk
+        tags:
+          - FPGA16
+        device: HETERO:FPGA,CPU
+        model: face-reidentification-retail-0095/FP32/face-reidentification-retail-0095.xml
+        weights: face-reidentification-retail-0095/FP32/face-reidentification-retail-0095.bin
+        adapter: reid
+        bitstream: 2019R1_A10DK_FP16_SSD300.aocx
+
+      - framework: dlsdk
+        tags:
+          - FPGA11
+        device: HETERO:FPGA,CPU
+        model: face-reidentification-retail-0095/FP32/face-reidentification-retail-0095.xml
+        weights: face-reidentification-retail-0095/FP32/face-reidentification-retail-0095.bin
+        adapter: reid
+        bitstream: 2019R1_A10DK_FP11_CaffeMobileNet.aocx
+
+    datasets:
+      - name: lfw
+        data_source: LFW/lfw
+        annotation_conversion:
+          converter: face_reid_pairwise
+          pairs_file: LFW/annotation/pairs.txt
+          landmarks_file: LFW/annotation/lfw_landmark.txt
+
+        preprocessing:
+          - type: point_alignment
+            size: 400
+          - type: resize
+            size: 128
+
+        metrics:
+          - type: pairwise_accuracy_subsets
diff --git a/tools/accuracy_checker/configs/human-pose-estimation-0001.yml b/tools/accuracy_checker/configs/human-pose-estimation-0001.yml
new file mode 100644
index 0000000..7197115
--- /dev/null
+++ b/tools/accuracy_checker/configs/human-pose-estimation-0001.yml
@@ -0,0 +1,114 @@
+models:
+  - name: human-pose-estimation-0001
+
+    launchers:
+      - framework: dlsdk
+        tags:
+          - FP32
+        device: CPU
+        model: human-pose-estimation-0001/FP32/human-pose-estimation-0001.xml
+        weights: human-pose-estimation-0001/FP32/human-pose-estimation-0001.bin
+        allow_reshape_input: True
+        adapter:
+          type: human_pose_estimation
+          part_affinity_fields_out: Mconv7_stage2_L1
+          keypoints_heatmap_out: Mconv7_stage2_L2
+
+      - framework: dlsdk
+        tags:
+          - GPU32
+        device: GPU
+        model: human-pose-estimation-0001/FP32/human-pose-estimation-0001.xml
+        weights: human-pose-estimation-0001/FP32/human-pose-estimation-0001.bin
+        allow_reshape_input: True
+        adapter:
+          type: human_pose_estimation
+          part_affinity_fields_out: Mconv7_stage2_L1
+          keypoints_heatmap_out: Mconv7_stage2_L2
+
+      - framework: dlsdk
+        tags:
+          - GPU16
+        device: GPU
+        model: human-pose-estimation-0001/FP16/human-pose-estimation-0001.xml
+        weights: human-pose-estimation-0001/FP16/human-pose-estimation-0001.bin
+        allow_reshape_input: True
+        adapter:
+          type: human_pose_estimation
+          part_affinity_fields_out: Mconv7_stage2_L1
+          keypoints_heatmap_out: Mconv7_stage2_L2
+
+      - framework: dlsdk
+        device: MYRIAD
+        model: human-pose-estimation-0001/FP16/human-pose-estimation-0001.xml
+        weights: human-pose-estimation-0001/FP16/human-pose-estimation-0001.bin
+        allow_reshape_input: True
+        adapter:
+          type: human_pose_estimation
+          part_affinity_fields_out: Mconv7_stage2_L1
+          keypoints_heatmap_out: Mconv7_stage2_L2
+
+      - framework: dlsdk
+        device: HDDL
+        model: human-pose-estimation-0001/FP16/human-pose-estimation-0001.xml
+        weights: human-pose-estimation-0001/FP16/human-pose-estimation-0001.bin
+        allow_reshape_input: True
+        adapter:
+          type: human_pose_estimation
+          part_affinity_fields_out: Mconv7_stage2_L1
+          keypoints_heatmap_out: Mconv7_stage2_L2
+
+      - framework: dlsdk
+        tags:
+          - FPGA16
+        device: HETERO:FPGA,CPU
+        model: human-pose-estimation-0001/FP32/human-pose-estimation-0001.xml
+        weights: human-pose-estimation-0001/FP32/human-pose-estimation-0001.bin
+        allow_reshape_input: True
+        adapter:
+          type: human_pose_estimation
+          part_affinity_fields_out: Mconv7_stage2_L1
+          keypoints_heatmap_out: Mconv7_stage2_L2
+        bitstream: 2019R1_A10DK_FP16_ELU.aocx
+
+      - framework: dlsdk
+        tags:
+          - FPGA11
+        device: HETERO:FPGA,CPU
+        model: human-pose-estimation-0001/FP32/human-pose-estimation-0001.xml
+        weights: human-pose-estimation-0001/FP32/human-pose-estimation-0001.bin
+        allow_reshape_input: True
+        adapter:
+          type: human_pose_estimation
+          part_affinity_fields_out: Mconv7_stage2_L1
+          keypoints_heatmap_out: Mconv7_stage2_L2
+        bitstream: 2019R1_A10DK_FP11_ELU.aocx
+
+
+    datasets:
+      - name: ms_coco_keypoints
+        data_source: val2017
+        annotation_conversion:
+          converter: mscoco_keypoints
+          annotation_file: person_keypoints_val2017.json
+
+        preprocessing:
+          - type: resize
+            size: 368
+            interpolation: CUBIC
+            aspect_ratio_scale: width
+          - type: padding
+            stride: 8
+
+        postprocessing:
+          - type: filter
+            apply_to: annotation
+            area_range: 1, 10000000000
+          - type: filter
+            apply_to: prediction
+            area_range: 1, 10000000000
+
+        metrics:
+          - name: AP
+            type: coco_precision
+            max_detections: 20
diff --git a/tools/accuracy_checker/configs/landmarks-regression-retail-0009.yml b/tools/accuracy_checker/configs/landmarks-regression-retail-0009.yml
new file mode 100644
index 0000000..eca538a
--- /dev/null
+++ b/tools/accuracy_checker/configs/landmarks-regression-retail-0009.yml
@@ -0,0 +1,82 @@
+models:
+  - name: landmarks-regression-retail-0009
+
+    launchers:
+      - framework: dlsdk
+        tags:
+          - FP32
+        device: CPU
+        model: landmarks-regression-retail-0009/FP32/landmarks-regression-retail-0009.xml
+        weights: landmarks-regression-retail-0009/FP32/landmarks-regression-retail-0009.bin
+        adapter: landmarks_regression
+        cpu_extensions: AUTO
+
+      - framework: dlsdk
+        tags:
+          - GPU32
+        device: GPU
+        model: landmarks-regression-retail-0009/FP32/landmarks-regression-retail-0009.xml
+        weights: landmarks-regression-retail-0009/FP32/landmarks-regression-retail-0009.bin
+        adapter: landmarks_regression
+
+      - framework: dlsdk
+        tags:
+          - GPU16
+        device: GPU
+        model: landmarks-regression-retail-0009/FP16/landmarks-regression-retail-0009.xml
+        weights: landmarks-regression-retail-0009/FP16/landmarks-regression-retail-0009.bin
+        adapter: landmarks_regression
+
+      - framework: dlsdk
+        device: MYRIAD
+        model: landmarks-regression-retail-0009/FP16/landmarks-regression-retail-0009.xml
+        weights: landmarks-regression-retail-0009/FP16/landmarks-regression-retail-0009.bin
+        adapter: landmarks_regression
+
+      - framework: dlsdk
+        device: HDDL
+        model: landmarks-regression-retail-0009/FP16/landmarks-regression-retail-0009.xml
+        weights: landmarks-regression-retail-0009/FP16/landmarks-regression-retail-0009.bin
+        adapter: landmarks_regression
+
+      - framework: dlsdk
+        tags:
+          - FPGA16
+        device: HETERO:FPGA,CPU
+        model: landmarks-regression-retail-0009/FP32/landmarks-regression-retail-0009.xml
+        weights: landmarks-regression-retail-0009/FP32/landmarks-regression-retail-0009.bin
+        adapter: landmarks_regression
+        cpu_extensions: AUTO
+        bitstream: 2019R1_A10DK_FP16_AlexNet_GoogleNet.aocx
+
+      - framework: dlsdk
+        tags:
+          - FPGA11
+        device: HETERO:FPGA,CPU
+        model: landmarks-regression-retail-0009/FP32/landmarks-regression-retail-0009.xml
+        weights: landmarks-regression-retail-0009/FP32/landmarks-regression-retail-0009.bin
+        adapter: landmarks_regression
+        cpu_extensions: libcpu_extension_avx2.so
+        bitstream: 2019R1_A10DK_FP11_RMNet.aocx
+
+    datasets:
+      - name: vgg2face
+        data_source: VGGFaces2/test
+        annotation_conversion:
+          converter: landmarks_regression
+          landmarks_csv_file: VGGFaces2/bb_landmark/loose_landmark_test.csv
+          bbox_csv_file: VGGFaces2/bb_landmark/loose_bb_test.csv
+
+        preprocessing:
+          - type: crop_rect
+          - type: resize
+            size: 48
+
+        postprocessing:
+          - type: normalize_landmarks_points
+            use_annotation_rect: True
+
+        metrics:
+          - type: per_point_normed_error
+            presenter: print_vector
+          - type: normed_error
diff --git a/tools/accuracy_checker/configs/person-reidentification-retail-0031.yml b/tools/accuracy_checker/configs/person-reidentification-retail-0031.yml
new file mode 100644
index 0000000..d41e250
--- /dev/null
+++ b/tools/accuracy_checker/configs/person-reidentification-retail-0031.yml
@@ -0,0 +1,80 @@
+models:
+  - name: person-reidentification-retail-0031
+
+    launchers:
+      - framework: dlsdk
+        tags:
+          - FP32
+        device: CPU
+        model: person-reidentification-retail-0031/FP32/person-reidentification-retail-0031.xml
+        weights: person-reidentification-retail-0031/FP32/person-reidentification-retail-0031.bin
+        adapter: reid
+
+      - framework: dlsdk
+        tags:
+          - GPU32
+        device: GPU
+        model: person-reidentification-retail-0031/FP32/person-reidentification-retail-0031.xml
+        weights: person-reidentification-retail-0031/FP32/person-reidentification-retail-0031.bin
+        adapter: reid
+
+      - framework: dlsdk
+        tags:
+          - GPU16
+        device: GPU
+        model: person-reidentification-retail-0031/FP16/person-reidentification-retail-0031.xml
+        weights: person-reidentification-retail-0031/FP16/person-reidentification-retail-0031.bin
+        adapter: reid
+
+      - framework: dlsdk
+        device: MYRIAD
+        model: person-reidentification-retail-0031/FP16/person-reidentification-retail-0031.xml
+        weights: person-reidentification-retail-0031/FP16/person-reidentification-retail-0031.bin
+        adapter: reid
+
+      - framework: dlsdk
+        device: HDDL
+        model: person-reidentification-retail-0031/FP16/person-reidentification-retail-0031.xml
+        weights: person-reidentification-retail-0031/FP16/person-reidentification-retail-0031.bin
+        adapter: reid
+
+      - framework: dlsdk
+        tags:
+          - FPGA16
+        device: HETERO:FPGA,CPU
+        model: person-reidentification-retail-0031/FP32/person-reidentification-retail-0031.xml
+        weights: person-reidentification-retail-0031/FP32/person-reidentification-retail-0031.bin
+        adapter: reid
+        bitstream: 2019R1_A10DK_FP16_ELU.aocx
+
+      - framework: dlsdk
+        tags:
+          - FPGA11
+        device: HETERO:FPGA,CPU
+        model: person-reidentification-retail-0031/FP32/person-reidentification-retail-0031.xml
+        weights: person-reidentification-retail-0031/FP32/person-reidentification-retail-0031.bin
+        adapter: reid
+        bitstream: 2019R1_A10DK_FP11_ELU.aocx
+
+    datasets:
+      - name: market1501
+        reader: pillow_imread
+        data_source: Market-1501-v15.09.15
+        annotation_conversion:
+          converter: market1501
+          data_dir: Market-1501-v15.09.15
+
+        preprocessing:
+          - type: bgr_to_rgb
+          - type: resize
+            dst_width: 48
+            dst_height: 96
+            use_pil: True
+            interpolation: ANTIALIAS
+
+        metrics:
+          - name: rank@1
+            type: cmc
+            top_k: 1
+
+          - type: reid_map
diff --git a/tools/accuracy_checker/configs/person-reidentification-retail-0076.yml b/tools/accuracy_checker/configs/person-reidentification-retail-0076.yml
new file mode 100644
index 0000000..09c28e6
--- /dev/null
+++ b/tools/accuracy_checker/configs/person-reidentification-retail-0076.yml
@@ -0,0 +1,76 @@
+models:
+  - name: person-reidentification-retail-0076
+
+    launchers:
+      - framework: dlsdk
+        tags:
+          - FP32
+        device: CPU
+        model: person-reidentification-retail-0076/FP32/person-reidentification-retail-0076.xml
+        weights: person-reidentification-retail-0076/FP32/person-reidentification-retail-0076.bin
+        adapter: reid
+
+      - framework: dlsdk
+        tags:
+          - GPU32
+        device: GPU
+        model: person-reidentification-retail-0076/FP32/person-reidentification-retail-0076.xml
+        weights: person-reidentification-retail-0076/FP32/person-reidentification-retail-0076.bin
+        adapter: reid
+
+      - framework: dlsdk
+        tags:
+          - GPU16
+        device: GPU
+        model: person-reidentification-retail-0076/FP16/person-reidentification-retail-0076.xml
+        weights: person-reidentification-retail-0076/FP16/person-reidentification-retail-0076.bin
+        adapter: reid
+
+      - framework: dlsdk
+        device: MYRIAD
+        model: person-reidentification-retail-0076/FP16/person-reidentification-retail-0076.xml
+        weights: person-reidentification-retail-0076/FP16/person-reidentification-retail-0076.bin
+        adapter: reid
+
+      - framework: dlsdk
+        device: HDDL
+        model: person-reidentification-retail-0076/FP16/person-reidentification-retail-0076.xml
+        weights: person-reidentification-retail-0076/FP16/person-reidentification-retail-0076.bin
+        adapter: reid
+
+      - framework: dlsdk
+        tags:
+          - FPGA16
+        device: HETERO:FPGA,CPU
+        model: person-reidentification-retail-0076/FP32/person-reidentification-retail-0076.xml
+        weights: person-reidentification-retail-0076/FP32/person-reidentification-retail-0076.bin
+        adapter: reid
+        bitstream: 2019R1_A10DK_FP16_ELU.aocx
+
+      - framework: dlsdk
+        tags:
+          - FPGA11
+        device: HETERO:FPGA,CPU
+        model: person-reidentification-retail-0076/FP32/person-reidentification-retail-0076.xml
+        weights: person-reidentification-retail-0076/FP32/person-reidentification-retail-0076.bin
+        adapter: reid
+        bitstream: 2019R1_A10DK_FP11_ELU.aocx
+
+    datasets:
+      - name: market1501
+        data_source: Market-1501-v15.09.15
+        annotation_conversion:
+          converter: market1501
+          data_dir: Market-1501-v15.09.15
+
+        preprocessing:
+          - type: resize
+            dst_width: 128
+            dst_height: 384
+
+        metrics:
+          - name: rank@1
+            type: cmc
+            top_k: 1
+
+          - type: reid_map
diff --git a/tools/accuracy_checker/configs/person-reidentification-retail-0079.yml b/tools/accuracy_checker/configs/person-reidentification-retail-0079.yml
new file mode 100644
index 0000000..417127c
--- /dev/null
+++ b/tools/accuracy_checker/configs/person-reidentification-retail-0079.yml
@@ -0,0 +1,76 @@
+models:
+  - name: person-reidentification-retail-0079
+
+    launchers:
+      - framework: dlsdk
+        tags:
+          - FP32
+        device: CPU
+        model: person-reidentification-retail-0079/FP32/person-reidentification-retail-0079.xml
+        weights: person-reidentification-retail-0079/FP32/person-reidentification-retail-0079.bin
+        adapter: reid
+
+      - framework: dlsdk
+        tags:
+          - GPU32
+        device: GPU
+        model: person-reidentification-retail-0079/FP32/person-reidentification-retail-0079.xml
+        weights: person-reidentification-retail-0079/FP32/person-reidentification-retail-0079.bin
+        adapter: reid
+
+      - framework: dlsdk
+        tags:
+          - GPU16
+        device: GPU
+        model: person-reidentification-retail-0079/FP16/person-reidentification-retail-0079.xml
+        weights: person-reidentification-retail-0079/FP16/person-reidentification-retail-0079.bin
+        adapter: reid
+
+      - framework: dlsdk
+        device: MYRIAD
+        model: person-reidentification-retail-0079/FP16/person-reidentification-retail-0079.xml
+        weights: person-reidentification-retail-0079/FP16/person-reidentification-retail-0079.bin
+        adapter: reid
+
+      - framework: dlsdk
+        device: HDDL
+        model: person-reidentification-retail-0079/FP16/person-reidentification-retail-0079.xml
+        weights: person-reidentification-retail-0079/FP16/person-reidentification-retail-0079.bin
+        adapter: reid
+
+      - framework: dlsdk
+        tags:
+          - FPGA16
+        device: HETERO:FPGA,CPU
+        model: person-reidentification-retail-0079/FP32/person-reidentification-retail-0079.xml
+        weights: person-reidentification-retail-0079/FP32/person-reidentification-retail-0079.bin
+        adapter: reid
+        bitstream: 2019R1_A10DK_FP16_RMNet.aocx
+
+      - framework: dlsdk
+        tags:
+          - FPGA11
+        device: HETERO:FPGA,CPU
+        model: person-reidentification-retail-0079/FP32/person-reidentification-retail-0079.xml
+        weights: person-reidentification-retail-0079/FP32/person-reidentification-retail-0079.bin
+        adapter: reid
+        bitstream: 2019R1_A10DK_FP11_ELU.aocx
+
+    datasets:
+      - name: market1501
+        data_source: Market-1501-v15.09.15
+        annotation_conversion:
+          converter: market1501
+          data_dir: Market-1501-v15.09.15
+
+        preprocessing:
+          - type: resize
+            dst_width: 64
+            dst_height: 160
+
+        metrics:
+          - name: rank@1
+            type: cmc
+            top_k: 1
+
+          - type: reid_map
diff --git a/tools/accuracy_checker/configs/text-detection-0002.yml b/tools/accuracy_checker/configs/text-detection-0002.yml
new file mode 100644
index 0000000..529c264
--- /dev/null
+++ b/tools/accuracy_checker/configs/text-detection-0002.yml
@@ -0,0 +1,110 @@
+models:
+  - name: text-detection-0002
+
+    launchers:
+      - framework: dlsdk
+        tags:
+          - FP32
+        device: CPU
+        model: text-detection-0002/FP32/text-detection-0002.xml
+        weights: text-detection-0002/FP32/text-detection-0002.bin
+        adapter:
+          type: text_detection
+          pixel_link_out: pixel_link/add_2
+          pixel_class_out: pixel_cls/add_2
+        cpu_extensions: AUTO
+
+      - framework: dlsdk
+        tags:
+          - GPU32
+        device: GPU
+        model: text-detection-0002/FP32/text-detection-0002.xml
+        weights: text-detection-0002/FP32/text-detection-0002.bin
+        adapter:
+          type: text_detection
+          pixel_link_out: pixel_link/add_2
+          pixel_class_out: pixel_cls/add_2
+
+      - framework: dlsdk
+        tags:
+          - GPU16
+        device: GPU
+        model: text-detection-0002/FP16/text-detection-0002.xml
+        weights: text-detection-0002/FP16/text-detection-0002.bin
+        adapter:
+          type: text_detection
+          pixel_link_out: pixel_link/add_2
+          pixel_class_out: pixel_cls/add_2
+
+      - framework: dlsdk
+        device: MYRIAD
+        model: text-detection-0002/FP16/text-detection-0002.xml
+        weights: text-detection-0002/FP16/text-detection-0002.bin
+        adapter:
+          type: text_detection
+          pixel_link_out: pixel_link/add_2
+          pixel_class_out: pixel_cls/add_2
+
+      - framework: dlsdk
+        device: HDDL
+        model: text-detection-0002/FP16/text-detection-0002.xml
+        weights: text-detection-0002/FP16/text-detection-0002.bin
+        adapter:
+          type: text_detection
+          pixel_link_out: pixel_link/add_2
+          pixel_class_out: pixel_cls/add_2
+
+      - framework: dlsdk
+        tags:
+          - FPGA16
+        device: HETERO:FPGA,CPU
+        model: text-detection-0002/FP32/text-detection-0002.xml
+        weights: text-detection-0002/FP32/text-detection-0002.bin
+        adapter:
+          type: text_detection
+          pixel_link_out: pixel_link/add_2
+          pixel_class_out: pixel_cls/add_2
+        cpu_extensions: AUTO
+        bitstream: 2019R1_A10DK_FP16_MobileNet_Clamp.aocx
+
+      - framework: dlsdk
+        tags:
+          - FPGA11
+        device: HETERO:FPGA,CPU
+        model: text-detection-0002/FP32/text-detection-0002.xml
+        weights: text-detection-0002/FP32/text-detection-0002.bin
+        adapter:
+          type: text_detection
+          pixel_link_out: pixel_link/add_2
+          pixel_class_out: pixel_cls/add_2
+        cpu_extensions: AUTO
+        bitstream: 2019R1_A10DK_FP11_MobileNet_Clamp.aocx
+
+    datasets:
+      - name: ICDAR2015
+
+        data_source: ICDAR15_DET_validation/ch4_test_images
+        annotation_conversion:
+          converter: icdar15_detection
+          data_dir: ICDAR15_DET_validation/gt
+
+        preprocessing:
+          - type: resize
+            dst_width: 1280
+            dst_height: 768
+
+        postprocessing:
+          - type: cast_to_int
+          - type: filter
+            area_range: 300, 980993
+            height_range: 10
+            width_range: 10
+            apply_to: prediction
+            remove_filtered: True
+          - type: clip_points
+            apply_to: prediction
+
+        metrics:
+          - type: text_detection
+            name: f-measure
+            ignore_difficult: True
diff --git a/tools/accuracy_checker/configs/text-recognition-0012.yml b/tools/accuracy_checker/configs/text-recognition-0012.yml
new file mode 100644
index 0000000..da8e241
--- /dev/null
+++ b/tools/accuracy_checker/configs/text-recognition-0012.yml
@@ -0,0 +1,76 @@
+models:
+  - name: text-recognition-0012
+
+    launchers:
+      - framework: dlsdk
+        tags:
+          - FP32
+        device: CPU
+        model: text-recognition-0012/FP32/text-recognition-0012.xml
+        weights: text-recognition-0012/FP32/text-recognition-0012.bin
+        adapter: beam_search_decoder
+        cpu_extensions: AUTO
+
+      - framework: dlsdk
+        tags:
+          - GPU32
+        device: GPU
+        model: text-recognition-0012/FP32/text-recognition-0012.xml
+        weights: text-recognition-0012/FP32/text-recognition-0012.bin
+        adapter: beam_search_decoder
+
+      - framework: dlsdk
+        tags:
+          - GPU16
+        device: GPU
+        model: text-recognition-0012/FP16/text-recognition-0012.xml
+        weights: text-recognition-0012/FP16/text-recognition-0012.bin
+        adapter: beam_search_decoder
+
+      - framework: dlsdk
+        device: MYRIAD
+        model: text-recognition-0012/FP16/text-recognition-0012.xml
+        weights: text-recognition-0012/FP16/text-recognition-0012.bin
+        adapter: beam_search_decoder
+
+      - framework: dlsdk
+        device: HDDL
+        model: text-recognition-0012/FP16/text-recognition-0012.xml
+        weights: text-recognition-0012/FP16/text-recognition-0012.bin
+        adapter: beam_search_decoder
+
+      - framework: dlsdk
+        tags:
+          - FPGA16
+        device: HETERO:FPGA,CPU
+        model: text-recognition-0012/FP32/text-recognition-0012.xml
+        weights: text-recognition-0012/FP32/text-recognition-0012.bin
+        adapter: beam_search_decoder
+        cpu_extensions: AUTO
+        bitstream: 2019R1_A10DK_FP16_AlexNet_GoogleNet.aocx
+
+      - framework: dlsdk
+        tags:
+          - FPGA11
+        device: HETERO:FPGA,CPU
+        model: text-recognition-0012/FP32/text-recognition-0012.xml
+        weights: text-recognition-0012/FP32/text-recognition-0012.bin
+        adapter: beam_search_decoder
+        cpu_extensions: AUTO
+        bitstream: 2019R1_A10DK_FP11_AlexNet_GoogleNet_SqueezeNet.aocx
+
+    datasets:
+      - name: ICDAR2013
+        data_source: ICDAR13_REC_validation/Challenge2_Test_Task3_Images
+        annotation_conversion:
+          converter: icdar13_recognition
+          annotation_file: ICDAR13_REC_validation/gt/gt.txt.fixed.alfanumeric
+
+        preprocessing:
+          - type: bgr_to_gray
+          - type: resize
+            dst_width: 120
+            dst_height: 32
+
+        metrics:
+          - type: character_recognition_accuracy
diff --git a/tools/accuracy_checker/data/test_data/1.jpg b/tools/accuracy_checker/data/test_data/1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..20edaaee81f192e33998ce97f651f97c9c115093
Binary files /dev/null and b/tools/accuracy_checker/data/test_data/1.jpg differ
zRp+3*Z;qylXg_MGk*Whk>bZ4Nq=SM65I6vX{j85&Xy|)NIO8mP&QI<9s@k0ki^GF( z27UOW{vlm%Hv1jE^s$;nR}|M8YI%}ab|3a`jqzjS zJZya14o6G;FKu04Th&n0)QX3B%uO9tB8-Wm+y2n0A&&$GUUA>oRW$mkbpToThunWD zwHj#Pbq+czk5bVKJeFzg9XhpFIo^LVT1f#A?LeTFW&sB$ZySzy9rY}&1P2P;^!qN2 z#MqJ46IKqF>N<$3C0cf)c)?M!d@8B_68vpAC%!xSYdC2qE-nM-(Oh)!&=+Y-MOV{7 zY3fi%E;j`-d7ln(=yCZ2#;U#}rvzSoZ7FwraI7|3!&MVL;S(otTa)t|k|{R}DzuEw z6|RanD%Brqhf%={MtJoa+J%wJSjx?Lb5Uz;DUPJMbLEaoao^ilqK9kkRFU{cD!j{a zM)KT4v@vipIrlo-Y))w0I9YCUU5=``)K@B6%H%Le@;1@9wnF368PS=*kUFRXNET2@ z;~?(N2e{Q|3|f9uw6!KTUU8oP06i9zjtJdFs^8S~Yfo{7<&8UebIB1I@Ayd1(C1na zkm0nf`J)cy3BAo==o(4Zj=ow8h^KnrxWQ1h6;-jP;wBki_t+Wa5Wp$JU}IOJ(wPX@ zC|zGc^HmRb76GH-c|Rk{fzVzs*?NceT=HM+;9TTtYI->XQr3ClfH6Ep856?`^E z(%Is7==Cr-?rnzhK*z2B07^D$Baxt&4E&A#NLf$X+ryJ;zS8x5zL$L^(D@e;82FZ@ zSK*K^uK3jF`_Ys9T8aMv6nlpw@%?@l_L5D*Yq~CRsHq-a7ywk{A8lrw6|yn~t&73R z^tsriq>?q3HjKe6MMK8>82%u0pM}B4`D#`Zg_}3R`wHC;L4DOFsj$`>Nuzu<1w$Vt z)b`mYbUv7DulI(amq5wvw+Hg2k62o1=*@?%s_nHRj^z}zl?mM?1b%<@AC|Brid{Rl zxoWk|c(#?PSh~$BQAEw=rU1_yhtT@#Lrt!9eB>%q=^G|S3b8PXu9wY){KOsEd#}`Y z8uU?WBba<8W4;%}>Tp4>R1nre@lP7h6oU?u466@Ne=SZxq*&t9VRJ`|o*0gitctpU zWNSR_wL>vu1fSR^xl%b(`F9%m_igrXc7Mr)VDs<5uf2bzJA3~CP>=jgL%-Z_9%K0n zEuMxKjp^iqCxntrY@=y8$@p=M=bx^;c`s|X3>EYe#<{y3&_VKDRUI9&R;r4U$5USM zNL7VYW;jk9u?X1O*z+3$)O6%d`j%{yr-_BuA``dp6S%yaYm9gghm%(7$6d%M&X|4`VBE2NRfRJ3R)V- zsp=9deIHN7BrG77DF>Srgm6e74+nN|eKV7t0(?sBi|B?_%M{`oqW4RkJz0@_m|S^T z0fQh%21D7fJ^ez6NTOA$7PrP*yf+0AU?kTjeTmn##x$oPzaqohl z0nSha-zkcFT?NYGlfh(`syR!_^%bzPqVC+wv^Qb7yAj6jF^zQ*&KERv0Ux$Oa}&!o zbHJ+6s>e+MW)YA{B)9PskTde&4C}2CVG#U8=hKe*2P>&K9Bm&LA1wsRY)E5OPgC)* z9-hNho^3+oWn@+U4&1GseNlMn`^~?jdU~cGCbmi>Z@*4m_sAYucP{2`&A4qG0i9^K zTF&RYnY)PM_#PZA@uk}#a}L&@4o}Sg04o0gWaxVrOvzUzuGt+dwOI3~cX?%$4#2yy zIQ7SD^Q`Hnl5tWS1JPfKHpD4{uhmfeRIP@RmPle%Mt%;_jbzP$wBQBDE1i1EzG`|? 
z_jFlVNEm)c=dUrYX)&F@HR|*md(mizMBP69MgV49odPG4IpqPzDh~=U2gqyO@81nHWRAF4-O-9bTyTKXBXm8wp4(!g zH54H1Eg@erOpnLNBRS4-@2rRRbkmaZ`ab>R3uU^f@Ei~bx_0q=zC{hul_;=6q6P>8 zoRt_nXFa~S)RAc!3y3)?+qFC6w&7YJ>5iD`brRFr#Y>|WM0rG|PI&l80|V!isGp|{ ze&&ll{z)6DmQk>{0oS6Uc!4z)J<>X=gd(;@^9#0ih57bA^4 zjPZ~AD-!BSi&pv3$kI$mGhlI^Pai>#&r^P(O1*3#j)}P|zi81%ADfa7An;Gu-2Bd~ zH*{9o!o1+g_P+9wEiCaeNg)dqoGX^ben*{Jjp5iltohln*f>Wlk|M~j2H7Wx!=MN7 zpON*`NE+r4cp*m)@bKo|3hGM@_Mz;f5Q!u^U*QCOa^3OY=dYbV;WgmBVE`LbdS|a9 znn~rZq*5v6A{h69>5bSvxIOsMFWNk99_W!kt7Tou>#2Ou<^te82gE{=$0TFx-%yZU zf_~~D@R~vHR;cUdjifF~9oTSj*khb}^QYk1Hj7BjEU(1MNbR#lEMSu%V$re2+z#U& zowaXDJI9vvR^xHlNLb8uQQU7VQf6gIgry^F@;L3BV~l5Brh_951K+=z#CyS!#GC~x zvaVYGsWVf=DYlY=IL>yrCjjF*jm>Le+U~yn+ITF#94)SZQ>RVSTp zameG>{<`wATPCLDt6-DL$NILZEtF3rRokMTLWTzzbsUgBV0I_uHRv>~4>egGRTkBo?@C z@wL$O{YwN?RV@UNhN(mg66bg!v(82{$>eM2vow~t1Ifzr&VJFissYq@%A(PEbq@>^ zH}5>N$oIkY&ZE+bX=&(-rLknMe)APwyp-CE8A=@IEIR@8(g5h>vZ&EHD=zArY?O6x zIab^pi83+~$tMK-`;X6Fze^~Um{+2GiYRsJ|a664OQg>2s; z`L|$L%Gf zX=4?X_VNR|1Cj$b3NiugK_CN;G#Lw+)7^ufKsAI6t$VH2eJ}c2WE~k5=7Pz6v|Mg9 z=|0%tl`1@mXDW zo`YL#mo!#Gm?>IqQvKxT2P5CqYjPKE%FdKfMDN!Jq_Yq=JWSXnS0s#nALXr`&JCcm zLjitPQ&rWJQ{3W?nslg)h z&w2`2+LlVxTq*N%RyPs0)(sg28w?W}Q>e{eln2G$JO^=kA1xBXm8}r>-Bk`}ID-Azpsx4eF81}4)zYN^=&Ic!DrFHLXTTN@}Mo^gQWa5KuDet*|j zz#!OJj&P+u>C%zcQw5qPn`dFS4H@;p;Er|DcTWYc$W066X|~~0o9|sUR#E)O;m#IG zdmoV_fPTO9*Ob(0_7Y>^dKjeMJ}yTaEi>usKfvM|)uk(9{81|E#0 zcfrBq+geW9GYw=Ex@|<-Ih!ef`0JKxcU_Fk$UD{eB zqMK+R4@LS@vHMOw*3!0S;P6L*_rmCDF3_yCmix`#DwdHN2c(LSg1-ubv^OLY#CBoG z85q~q_OHtJ9gbJgTccE0py{b5SlN8ai`!vX1Rxb>W;p;T+(!$J_~3Dqg!XwMORKq0 zNYtj~dZo8VB#2fDIJc%Z5wytMC^?i~Q?~EMx=ZBI}=%vWF3O~E7)t2iOB1$Ck;VfOy9#G@qImrC+-yDk^lAL(j zk56s2TxY44lA5mYz)4mEf3TFk;ju{}J|bI;eQ|-K(#TPP;4MoM+--{)sgm7lrreOl z97Cu!`kX z7TRVCK$1hqaT)>%d;rjUArvW6 zA+RmllB3RyFknE=shIaPgIQO^g{KQPv5LNrxYt^qS!yZwq>!0^@|TcTf3G+oA8mQP zR7_*TSTMadnpaI8_nfPRCE}){IpUQPC>j#mk7-*6AoJ_)b)Q9~xbUyx!iOs?>N8hw z>DS(4hH9A$C_J(cRb!4b>WI)O89=%n~WPAy16~#|Mv0 z>eL#UB*7l4S+ubMf}XJ5D{k^QmDtEm(m=r0v!f)63n{+?ZN*rs?v{?3iDkJ?vQ#(w zI%vn4EB^q)=ssryQv%qC4(hYqB-Gns7hgr5c!d{ER@6sPBoohas3D{aTNI)(jE%~r ziv2kHYNoqW-L~Veu2rth(7EraF1%>h*9#%)E}K|9+2L|`oRnjo+Q-xDog=oqEv;W? zh}*fUuT5%c?h&dv5tag9c>Txe*!DWsno%5O{{Thhq&o?&RiCUDB&|sWENM?7AVp!7 z;C5m2$LG$l^iFfyNp^GOt3>^x)vu!_x6c$nTceIR*Aj{~F43P%kHydDsI?{K$8)dO zs`OfE0cXOst8F#9;;N5c^g=ZX7Bpcak78Kk^VTcy;$AP`zN<-+IAX0F6c%b&!oUKw zO_4ONOJ|Pw2e|nh>M}`f3@!wdR-UEmL|w5#c!Td|O@b#+`f)AiErd(AS>pY2^f zX`s|Gz{#|M`SV&2hn^h6)RR!lS4hn4fy-14%*2iW1Y`3W^l`-=mLPWgMPuqU(n{h3 zuokW2mw;A37Q8)c6*Ubl4^>+f)L83OEG+VOYFQi)_ETeTe}VS6#v3z5D|eQ>Rjiik z9~1Z8gUbBdq_x)6-S<2%6rOXV!y|>3DOIq`ASWS|f(}Qftj!}@&hPd#y*`}eOn6;C z&>jVK7m0l*1UA~LdTV@5BAKP)Hhk`68El`4y|Kym)^3}#pQt_c%3I7R-1i&43Cp?OSL+|2>$w)^AjD9!oZ%#UXBeb<2fN@&Dv1N-c>ur zf3tU5{hzviD*NM8)LN@CPV>-9=EW68GOERL8C(Y1PSQvqs*p1tr%oK^k(EA^T-S#( z>mOB-^e0S9*4(yujpBZz7TJ)Ws9?@_fnUYEAms4cQy>u0!A zeb$y^MGBY%USw)QJM)5f@S~G~+f;wVd?raA%6Eb1-_>R^0j!nsxX>i7qp)6Gtn}c< z(8r8Mgb}m=037;`0roxhm8%uYX;zv&oN$j?s#<9jk`KshIA(!VDG0yRee1+7pQn*G zQi^H!f(ZfN9$7xw@|Xbt;k`UQE1yMgY!t((T1E6dfPVG`B(5DHgSjN&gW1z%D~ zE4T9pUfO0k@{j=}CeUh`7%h?1vHQI|an0nqG;)FX8E}6?&a>oBn zNFN8bMm~Aen|raxzu`>R#W47WRm-OO^Y3!UOJ4-E6{T`vkx5)}xT9bO9FL#NO&r8> ztH}j$Rnl98MirH^85BCY9gM!%?ar+_P#SS%S7cvWT zfLDgx2|34p`j|3jgSg$OLzu?{A5S%zTsm!fa`#IWf;uphGt>b}S%hlm2@wsJ$O9nm zIR~9$&!u5@=!ZTX7TkK5IfO`6g4ah?Pha74`*<6~CgCIBasszRuKi6?HYY zX{hNcZWJ{?>T|5^=AjXlmNKA#11o|u4sv^GVPi-wDm`1>yam>sA9RLTA~4l6TWT_? 
z9Onin&NiI(&m$SfVms-Ji$w)c3pYynNf>BAU{${Z1L#L={D{V}C7V1|kCUKx2t`Qty#bMw@ZY8@X4J|?_n4)GVED11k< zGE!})yuzjFmS+4%xr{degY0<+9rb%r-D~18N+|R`e*IRLV|IkSjj0m1F`PJge<4j; zy4hAJV5qm+YFWsridnqr72ky0i~`M!ocj^;*P_Q&;g69)lHo&HJy2%3^$fB=)UD;x zD2?N0_?O`WX$4O$j>P2afbc+5x+&9A)LQOQ3QB7=wwbb#PguaoE>s_claN>9_;Z3W z@1?$l8%k@vGE^_#uk+j{jFuEs5QPWf?eGkoVBqcm?in49o2CNpsZ^9J9aQy8W=dEJ z$vf6vl?;)}&IrK;z5$bfdF{_SUx^R^;C>Vf$a&EqyxndUGC*VV7S~KrsgRJbxZ#dj z2n9o91AtDJvtf2R`}gxi=Z2!?3f5aKh$}qBi_W^!oyIj&i2|`0!Q0LnZR$gJIR-8v16!f6Gd0clK zdX(igc%rt8ELL=r%9f_0Npa>&Q!e&w5=W*6HymTrjc_=^9!QfG!5AcumV)H_RL5_% zPg7GW!w^<^By++>qb$VxU>!tl&H!aWBY?@_a}`%uQgxqS->cH^*3sT>5sGe`p`)jc z5iL8$aFPdNs}Ufbq#TZa7BT4?rZK=6<}i=>5`E=){_sfmIj(l&zyp!zo)*jYezV>z z9RQSed#e9d5@miWd=Eb$muYFb5 zsHgHR(?4!CjP<14E_3P`O=Vn?WRlEz8lv&ywIehUYi+ziZUl}@5Vjp-Hmcq!vek5)&Wq+v2H92j5XX=?2mbif`bU4d z8mEcHt`9v{UbYI)*hmtj(|M!K1CpmF@chU4>uIb6hL6-eRhLTA^Wnm=4PSdwg<5p9$%SAlVR9h(`sH#)@+L>X?frGc7 zQT(-+E~|UDv{_ov>3g>KZaZ7ye*XZKTk5;()<-oxWl=DB04Ls~5)IwSOdKF5fK(1K zk>6CJmVHny-GkHlwWe98(=Sd&uFTd-UGFh05ClBjDzdjSIj;814+nUPgg|Y9Tkz@Z89v19vI`{)ci#E03BlY zZqv#L#rw6AY&aAzr~2BnrD|HHgUe@S3Q}b_IPKVi2+p=7(zbC@aIfk057XUd){D}9 zE8cuX>KkuPQuO7%YmH+Fl38lxi34Dkc3?gfz|T3y&r~Js__XXYI!Xh{TpHps*8K|F zrtnk5IO2(yrl{rHlv!;ZLq{=pI2a2i*TxQc9{lHxXSe)kH+*sjS-|5X#B}vpx}MdX zw`@(T=E299Dv{I{PNMMo?fXvYITaMOLItOpuvxH18a2wJZU7K)NdxPtVlm7_5<6>; zAXSeqd?=vo?(wD1>zlr+M^ay{-D$eD_Q6HPlyzy8aVgN3AF9@h8C1fJjeYs^X7bLu6y%>m#) zb?05)0pPMuy69^a{?B{rdpk}qu=$m)vCq0Wqyv1&$Z`X}Ew~VOVB?dW)@tP&TR#S? zr~UbDn?0SPC?JEwc+g@ zLw*>YLAc|UW_TMT%bqNiroq(}5VDr4GpN8~*Z}kRrcDr#IVg7G1-##q zg|KXpG@nAazRBQitEW1yy76bLYCCJ08kXH$43a^)e+QrZLpi~5oB$Y;$@6pn0H`C? z$F@#D+Ht1;_CJ{L6kpXnj|IBF(?u+UyW_4@P{d2A;+98EKx(G*41lY0<%UkR+TmT;Zwt~gF+hV$ zPOzW539Yp^(F02YOF>q`MUKJ`+z>Rc$r!z%4 z!29|p8>2QbtL@?ChMF0xnvxo6B1V=PqR!IPf*ZR1 zRRf{=sd|pK=~Zv?%@i}3V3sgTv}AlGi3N8YV5sftH5~0N!^v&=SYEDKp@=v@i zUI`NAQ)sE`8hLA@rItYei)uu>le?;&>e&1w{5j8l+IZmD&hY#|cz)vrM9pMcpZ4K! z2g9ml(NAV-y1Gc>yV99_*yMCasEtSga;%^ncHFriA*{VjuXUX}vEjk`;|p>aTIv#t zKV_rz{eAvrMXiz%XCMn`OS8nLC1T_c)Im_Bx?>PDG>?QVGDkWR9) zU+&hr`>#vGP2^2UH|;BGY>3AS#u&6qmC5)?!VHbM^~SwWf$~NSFZimn)nrEa+WoSO zUKKZ?Vv4i(byKjbn``~xu#I6nVMsXq&PKf8le%#ZTQ7{bZfekN7F4aMcxWr(t&FiA zY+!;D969fio-i_XZKbYtH04rrT+wbAQ>%JLFE975DlSzBSUhqthA<8RA5373eDF04 z%xJEzDn=K20Eb#^w-|0looFX809aOSA_+ji$}#WXzspGs4s*T`qLsi5tDUB4rjcTo zDp!I?!5H`T*6a;&A+A|+)e;r1iy{Z7U8l`QUytn_qBkz_E{ZFj#O~ zLVX)%XZbo$(OeuKI{yH*&D1&vcRGV-q30iC_E*oemq8yCKF|G2LkvE3){>d4=#)1m zqMR8Oeqn%gBm3IOh>hD_!N=G8TiR#rcq{WK!0xf1r#wqG*;{2j<~3@4r-4%#EP($2 zzaI7afHg!D8}^m0tB-p_g6m$N>V~t`A1M~1(N;u~K;!WL0BEW17z~VZ4hBcAtjKd= zW;7aGzem->98F1+EOhuWw|&sD$LOT-?UAbtIHePX8&?AtpH2w}c=%h7;vMt|B@k$# zbyRLM@nC%Q3!0~Cv&yJ-FG|66g!!losme&4GM{iqAnKfv2MQrviekg$s*&3&sF=iC zbD`|V0sdh7>p9v$>kYy1=(M}wGI&#IuF^u@RG@}l>f;Q*mZ&+dh|yCDISNx-jV;EW zdsUta&%)y6UsA-j@Yz%(YielB_=tYTmyi-201N~*uwV8jdkTJdakbZG_lmNw%CA>7nZI;E6W(pRGcmW$vEJt z0OayC3D3pCBgICZTSdy_Ql_H5=4xeFN0lKF8|%a?y7GX>fQ_jZw-ekLBAugx}kr|B#eMmoW9x=&mx(5$hhVvTYX z5}cBU;rirqj#hLRO^?IR8DKjLhkQ|s%dB^4~l1AHSNGEX{<6qlT_ z3Yp0`9@^rHAyZdHM^R5rRZ~k2@v#!3R8KVf0LS9`4C=8oZB%gt(dxZx4X|HD-b-~V z-dzL>TJW!)RB^O1=N_0Ee)-|O-t2fYP;PJ9+Js>!`4#!mIV=K!(vSG7i(I zUZi5cYQKM58WKMiajK51%G7C~to4LJYapIN{$ohtf|l~QyT(>G3-w%#p_h44$F{QY zX)v?WaaTE&l5V@~l0NEU? 
zZ&x%Yr|wP}W-PlMzzVoLg#+h~ewwcl0D?c2VYVDCW;W}By$VECq(nQDapwcL*07Pq z-w4mYbgUN9yFqOx?@!lP+9}M&o~RNUCBn*&jf?CU1AuwexuRsyweIPR`xF&zJKVzN zwYdh?r%Mf&qit1BJfc}p7zA+6P^cj6CD{nz<90jtQ|qriKB^}$K-~IY+xS||idi4M z_*rj>w6Is$z$^d z;#P10Dsp#YKA)zxnW&BUA!bPq(}WhbCZ7;XQtQtvl6GT|aqX+|lI+sP)PYrN+G!`V zMfZ6Ck_Y8Vju`&{fz(=!H+b#f&k+-_1Ak{j0pULUdKYqx1;IDSFmi1}X_!fjMnPaKYljTPUgyin_2 zqUhVrEmK?T=;oHybdj9RadeS(`DQg?B!Grc#AhG|0G)k9LD`xeG>)HBy50IsTZiX& zQFyWDs>}XNF)~;PBpM^2{yi39;zx(HT@TdNcIaA5os#M1i0QAC!Wk;eVL3@+C02OL z0l5HRa9TVwCcQ}z&IpM5UtfRHuN=~r10ds?&0oLmSNuMpkEA-Ll3%<-Uqg3KyVB1b zfkdnuI|(~>$fOlv$3EE_@O_{RVeOHGhUAYU)l08omMzip)pfe`G}l|qGFDzHV7DVP z8ftk2G84wYB}QYnzTtpR7(MmnV(nc9k~7?~&Ucs;2j0rU6Fs;MA3xz1j{g8-@al$0 zq7>9{O}iTxX_igAmSfA3FgH0T9=e0P*7n|^uN^VyKcra{nwQZwQ$7;-lO(i~TBc9C zSJ1ZD;oLDSAHwKI48Fun3 z^;ZV?qiatKpL|xhw}*Xcf3dKr{mkUi$no(kE{f<$JPntQMfdM!3Js~hSIlyLP$Yf=cB zo%1+Bw2jO;Vf8us=UaM9Y)uXs2smyyDZ5mwQ&c2i@P>1+Bx8~7f!p-elS3g`grCw; z%av-x_E>HZyQDG9i5*lU%%>&J(iOO6B<>j<`(uqGK%?MP(uTN@GD4Iu^)Xh8O8HH~ zqL=qFG=<3Hka@!qgWEi0<5i!gZcBJR@zFM&@v)(!1S{`Nb(`8KZf4PPlG`kZ5oCD0 z__X8w?_xApxsWX=U7Ku;93 z^Os74yMhY013wWCpywGG)IF)Iba4j4NV>jw@cxzuYX?K8EO@d{Ki`_`&k?IBAH?pU zdSH-K$yJENY-3P`RE#R%56BI9`LwT|a`q4!Pp9Ev>Du`t+=Nvc>P3U@R%?ynUDaMo zTdbx9&*Nn>Z#eIOr|Y6S6H@;G{j+|C{@>DY-N)i^9;%6c@q)I`dZV|=1=g3#k(|hl z6C~?{g(J)bN3TB;mB$*PE|P69du4V-bU6O{EvS1dy%rqby0>&C6;D*uJxvAXk`Fx@ z(pu60+yOfXRmddd0>tyidCsEIGrf5mfIeW?@T+P&K1SixN6lO9JtNeXo~coIJvCh# z)5kEXp`NBGOipsD$si$4ROAN718z>eF4nQ_2Pc#7l}Pqmi+jkhdK*SAmQJCuS)#71 zq>hG~X-I`x{P`XxkSS(GQdE8-1A2XRMokRy{{XtuDBX2l`VCZ4RAk$zd1uLK-4oY# zyS1vCnrJB_h)9twJV>b#QV7g@NZ3H+5y3niY3Msaj~ivH7~+T1qR!Q6Ue{Q&_fu`K z)LO5VbQHHBwZ2=$9J9uy<91W!vnyp^xH#kcyz%YW- z@*x|V-PSfMq*qF7HOG1*t4b38+7V>2)gDC{=A2I8%RMs|D z?ig5RM=;UKuF=@Kj?vUt395v1TLdw*62U%3L!6DL3>yUJj9?6BQb!bwd2u!pNhEIo z4ps+wjX!VP8aX`F`^>u=<+$Jh{I!Fq1HN$z`+rCU7P3tsng&9s74rMnC6@<*?anmO z_^2K*wK@VNfmrj$uC`l{7Gtnfc~i@kv5x-$T?5KJ5t|%@4ZDvA(~Ve&ol%ZVWtX}V0U(3VI^NUkCeql6)qgdaE}fI%;Z{0tr213C z%FB;f+9s>%x+!Vi-C5JIs)&T~uq8rgcbKIz6cV_0BZkJ3eS7r~hD(yu&tKs~9C5-- zjQB?t@2cB6I<0Eyh>Wa==(4A%U_fE|=Tu3s9IwP|(CWV(zSTY*KZKq%^(~^`DWzL| zde+;eh~UKxn4S*!v$3R#|C{#iyd^{{Rf{(Nsl7DswE5MAUM*8<>DNIsWp0^CqpZ z(yHNDTJ^buj4}K`^PcQD{$mI8)c2JbPD>tm#zFqb@1R0pX$!2U2O|KTKth-mTDx0P zPmHbz_5T1ZS94XU-AE->1XHJ#P=VBb2^8Uz<`4LG*Ey`2yJQN>T98c40YPD>w^2JYywgfP@=AOJ z`DAByc;Au;Jf3v^r4f!2Yc7!^NgczfYU^d(SMIjBObkG5V=7BzV4Uw91{fo-9!On` zl>H?HLN%$Ou%uJTI9VsTAKq-+{?w^CW*OVG;O8fi->mdPtEazyicqy(>eppu>&Qj} zF=U*s3EASd~T9qyRZZBovB*T? 
zQv_;?cAy(VMgo96fE;LIk;++I+P0nP#WX>A%&XfE`3igPsu zRCSOmw0>+kDYz0C2F4BuAmg#)UCvPRPqq$OiO1JLGID@<^^py{1H1InIM6}dA=fBl z>YkyR#4XhIP)i0b6>~r&F+$l19C-)hW61u_GmPU^U65550Q0bqQn9u12FF8cyw_Ir zO=bT8sBKey$lPnhvrF zv^T_gN?O~VF~&7P>eDLLC2x$$dPNz?IsCo#SZz61fyPn(rIYUVxsPC|dG53g+F8T8h6 zvNn%qarONml$pNceH{8Sj&KJNXYSwUOg?5Dl@Sy%I zX=dE`9TP@*LI7DVP81vWQ`GI2ID$(~z?p!AJY`Qii1g>z)cb49X`*vUEwS@nnoX1m zRGIpOOtTerXruvIvy!;<9>e-;PrG#nkxILq-a$etDk7RO8}2LV#(4h#Em&x9{VJ2% z3Z>ue@2KI8J#P9>+zt1N+%0n^QOM}zhXZkXuHbp!xAtYTRu z36C+`;wL`6$LKZk?%SEu%6^`UUN{2<1+;XHsy?a)(LK6qfmH**BcP3>npeSK%B8T) z#tNxisLlZyHG3xjra)Q0OlG#aGDTp4)OF8aEP5(Bv)s5nrmMwsZQcu)gPlo1GEGra z(zP{9RRpmR)6i9+3?+fVERHdrFk}M_xPW!-bQ;NZ@HRzU4Y_$%!zqJLt(~rR zi%~*$n`s}V{yrS@<9#h)h8-k=W`biPoDJjC)9AApIyR@NI!^g!>NS!Jf?O5o?(&6- zC#xsmOoN4Y!B$)ja7S%b)OLflFN;wVO}5zaRsg>xJkiGsgP3iHHP*h1IhVw{$NJ?g z)aolJK}qJQmurSnLEj!TxE=`3GwaCIBmFxKi5TyN+yFmyK{kBQ13!SAd+Zih=8Mee5hSMATqJdS`i-+;Q8ru;hUivF3XqUx*MaT>TD8ELKA zcL?Bu49o^w2fB;{*qwM0qJ}wp);y36fOz^Ii_J}}X>~?Es|E49z*@fxZ0D@HO6_v1 zTU?Q~eLT`ElcX*1dFEsa&Vh=f1_Y>ZLu0qT?9EhqmYAFepyTL&WoJ#TbN>MRX#md) zIkNbh)_r4o{efw-)padwkpUV*8&6Xa$0y7H5~|1XNO<)+l5WIwtl+lONBh{ts>Q4aWyoOj9?m=nShZ001B2F#s{{mJ~zW|4L<$yrqc-J zb6VK}9c!PVSzRwrbgi!MPja`)mA1yE+c~aOmO3h(=VGfKNh$FKBXaSL!#YhvC#Jca zUQbW2PgMtK2f3RVP2U3qk2D@GwzT;*_S&AAw>2b=W|C7IM>&~+CPiQXUO^#10D1w8 zYQqG8hrj9X_$ydP6JgnZ5FY$^^-$=o)b=`xxhJ$tBdCzaMOP(EQl&fbg_+fYlZG3Q z1P(y-)Y3KKz+ewi`+rKVpHmoVFa1NH1br_l&loJXzLL67QFR4OcMAG)>XTB`EYQ3I zx+X9eYiBErjDw6alRKg?`~=;3rf#-9Snru&-bGRRCww%rS^9>urn>V-4RMUc3dKq|hPy*xb?f2_7aRXarr(mj;I1W)D@ z7?w3vWgV;Bl#BF}2l^i}3t@4isz>HJ3o$@>YH^3TG?)tl@m}; z4ODZ~Qk8_Hk*+tCW@Q`54e+-f-N4p-urpI0)Ajy>X`LjHNfSt-Dznc{S5I|q!mjUg zhV4&onwFmPPSVgNQ^w6BLZMtEt1>bECHRIx#|MphxBmdAX4E{%1g)cZB#-HM;i8&9 zg~Vg3%y{G$evIs6miby*0dWxGcwc=p?eydUX$ z6WB184bq!EMFI3Y{Z-n#?I8zIM2f&z$RLTQ;Ub z8UT2#*vML1Zpn$CqCyJTyfwI6I)|fZspu`#_NcCqGtDhEGe40Te)%nyKMnxgbIHKY zsOjN(wlX>gngvnRf3qutpLOW1y)V+Y5A>p2mCLA>i|=%*8ag_6oHCDrV_^FTP@wkD z;y${z?E}NZi;u{E(vSSe%zHz4^IEJH=&Tm&h0gnS#h#Wgj^iq9iV9>*D?2-I21xYu z#(i{Kp_QV|k@EilwMgos=s;Zg)HW+Lw;FDy>Bpn}jaex)^%m;=yWcySGYfoMf4#RI zuyy9UM?lDGW>-!PAK~p9+p0#ubGe+^EMy=$s5JMGV z^&S0BwyJ-U>3#`L4~1b&bp65aBll}Wx!fx5Ha3FUcw1MHhFa556`e|wNr|JtaCq+7 z9lLqHrIV4glT}gnm91^9uBflVg zvNcPy;*<*n?I)A(DfN$K@Grn7dglEDeb$sIde~|n37cUh6><;9*@o;9&%V7DzJ0NV zfn{cCqC1Hv1y*cP$U1@N66}l&$UVv9>D!-{K6=27gv+~+6>;^lsccqzgkqF_@gJR~ zv)KB5HS1uRp`5H4V>_@E`-+RLma5oE0;L`^H)fA=mB{*MSD4^3X ztF*&aQ$Vm0Sv=B*VIUw03gB&4&!+(UYd=$?8+XJqIa9+m{4Oe?XK<_oV0v?`)=j=}BoLy40Jrb@QovecG`i zs;H4h)RLPlo@<3W8a(CHu09|ER_WZ}T)Yh}jNSO2eHMq%T}y1ZSD0

`wP)%$py>{lK#y-QBl+0B*|-eWfWD=JBT-*FVBpt zZ`w{b7{ji6;0;lX{8&$|Z^9kmyQ8G;zq9V_iy%=mD(*b(aM_Kx`J8a4NUgiTF*X(o!2N()U9 zE)miq@-QF}*v{Tqwnqnma&ktm)#XslQu-d9fXhuieMLpuvYodjML7QM29tay=ml?>< z1Jg=uuB8s+cB8Wgda3pR#Y%GToxAa{Ww>96@xbKZX)JiAu8I}^0IAl;CA}lJ)?J#d z&g8c*nDIgY-7y0@fZV?b#E8*tt? zvKLL5eVI}iW7H*B%I=C)Cq_x;(A3=T6;<^V>kCV}ZpoyEM|hmE8-ZUiu0{)hH*2{# z9{QukN;H3G^!+^3Nbyj?G|y2al@e@Mkj&$>o}KY|)^@-mWpv$g@_&@bk6kHAR$1jkZzoSkLU z(_3P^)!0&cgnF8q4~A(YUh(eWfJdk~<38qu6bvK9b&or%Vdbp3CrO8Cxh#iSP5n|XL+1AY3tvg42&!>DzBf+F`cZXwr_3ER~KjPD$jFPb8mGb%iXio!jI#RRs2c~ z_XZ$fW4?7w{Mxd`ky<1F0PkhN?me{#6s}lAdfuPRu~e>m#7766ME((BAg)^KiY0nE z0`fDC{OX&7iNexpu&5Q38TfYb{haDOgk*|RwfEIYj_X>KFGnnJ*HaYofT;(1FQ8Wk zAP;VJohF%&cIKKsU&7IgSoRIPtjnovGt-xvJ?_c)l;Ds*EnJQ_jPgAd-}9rCZ6b;h zXX=XkRc$PyrHwpp3o9UTu$(Z*sN0@2u4uE2saj~*I9HpD6jaceX@Yq(qf1QjNt7y2 zB~SKGa6Jzl#;_$Xe53>6Ytd=m3@ubCDwxS4ppOLP^NnaVfV3ARF1{1+RZ~V4=4m1k zo=O~(fu1=YzPc=rbGG=ZvOjA{ReSGHwR=A}4VA+TjC{YAwqSt6&sFDV)Cb`P%+fnL zB!CPZ(Sz{(^>(pJfpI3(>VBoGj*e<)nB=L$DvakG43GQHpzc|A0=CUl>ua*RbEl$` z*JYxLbNj`HF!}Ugi=XG6d~00o^1a~Qlht~NhTsa)JU+hE#TVSt;ZqmhV)CA*mPdzf zSP=UV@>dzhemEfN=VtbVq324+7PZyEIP>>QH(RG~{^0auul-IiY zB$`+%gjI9AfgZ9jO3S%dcvfwvg)NBzIOOZoX{U!vJC3w;AUI#@^YI76m7K{W4QKfH z$j6sZvF{5lS1Vnzs?T|nON3C)^Cc}IM-eosxJoiV12SdPBRdHNTPH$C#jn?Cn>=K- zz>(MS$noK5YD*azF_B}`e7}WA@w2Vls(N;76Dg2T)g@W1si9cvgoQSMCqueO%J$w{ z8$nVJe^%Hcf=$ryIvOYc0Nav1Z1lQRVPk;E$?!P&kK&eNQFlwMOLDkW589H(BdH+B z?Nb_%f!OT>aRd06=V{69tBj4IzX-0rzfKemZr5!GuhCsPOYHSo;|1DyI#Q~VijIm8 z^_shrNU@2scAzM(#hGwO0EFP4<2@O3@x>_hhBzy&sNFyIaq7!Yh&0x&s-d`adz?&%Df`voq@03K^>mA)P^04yBF`siLX1)?4ZBR5ejFQ_=w?OqWzn1fK$=~yu{31=05K2%!juv&ux8>6B9ti zpAHq79_fwAlCO0a+3BT`cQx8pEpp;7Y2~m^@V{;Rr&PfET0K)?Z z1bsC-AFzX8kgLpG@I|YIlJj7_Np_m=ZME0bx_Oi=BQ)|$JGaDNV<0E-93O<^9>Z1K z_}=$9-8lEntj+UBBU>3Y)PLH!*sARh(v!{9w(1Fzi6f_q+?1vztf!MWU}D*_c-bBb7e-s{_b8r@>S}FQ zIuTh-Eb=TYTX{{ABJN;Sv&l>m&Hy}Ak;E3789zm{150-_l9;?(s-x+vvrjkO)=0_& zLo#AlS4HyW&pumVusF_eK;Y=gL6y75j&t@Z(6a-BO?~g>{3;a1yk5CW4(} zT5EhCc$#;>4Y3q&AeJ4@0KquMby^wTYB6N29Zg8*NbCciir>;c81;W$b>sErb>42B zy0xrQ%|SgvRQYixc45Mkxa4k8k)Co#X(a6ZRC4Qgl-d`>+V->5qL?cWNZq zStD#7s^Mqwm#4aepy~Q$jXf1aE@h**Q)OhQlNnISfX^OBw%`s&uDi4U0K|5X=%zGD zJm7-1`!87S4LzqG9Tt85irnnix~pjVc+!!8Z_`HIS zM~Aq#yP@W<9X-{}e;~b4c%Y~o)fa052d5pd-G>_6gIT@19B#aMSo1(wYz|fSTSX=6 z%~eko6*Sco4aq4RHa2*Dz-K=$bL+2B5p4060buy7XE?(q;#P4{HD^g(YOPgKq;;y9 zRgHryh93*$<0Bl8&y9GEcYCCtx{&%WQKw{jh(wIaIcI9A&@41_)W+HP;a$Tcxbz3< zooK-F;Y}ClvnFSEW!PFR4aS1eLdZPXWkmmfn!dIF0JrU;-cq`$TPf+%vX)3?4Dm}FM9QpG zkU>+19D{?w9FF?aVJ#uUkf}LxmDj_&6|bv0kE42??bOuWC23Zvp{@IYsg9kd3%*%4 zBVsliekH=QA>0+Y*R^c&IwtBMV@sXLy1M#(*O`1VwpT#+kkOtmp1ujo7qJ-me$q-UU9ODQ7 z09|NLW8nAGK zd~7+&9&$fWe?3|TjSi{<;-S^V>R9qld*emJ3Bg0AB=SJdsXz6`p;F-ws;BPTakrj% z#-kwgQZ)D~-Kxf3b}OTSP>IIp${go!m7+_EmocB%=SZFb->1B0c*z7P{t;YW+6Z5=mprs>*Vo09gd^z{-N`DT5~ zk0IIy*vJ_pQe;MCTsAOwV?xh99{uuw&OAN%SIJncolLZp($d~)=$`54IwiW*7n;$; z7>LK07Vf)736B&#>6T=D_$gPdp(xfinm30eeMAWJCM1_)I`#X>m;Sy+W3v|=Uh z3$`-SA6y&(z|VaF=C}jNQ;Fc5dh?>0Q~X1x-)Uk!2BLbAt01-oOS+CZhb7K>eBQtGJ5a7y+ch-d*>Aux3sO?}I9 zj1QK5zv-kcE*L6M;&6n^Xn3Y=;Pc2Qxf+q67Lrd27aBBx-ug1d=Rma!BsDcs2%(0S zLb6REI0}dX+{B*1YBz}3H$T_q;;hF!=iwHevvjo-c8beuHBed_vm4S^WXlwX1%}bW zgWnt+^Y5=Wte8GlJ|3f*=lNSYU38G*L2W-}o?dAM4OE6j@*K9{jh)1L5$+DIMQ;_# z-!im~%Asx%Jvbqm8YRPir)nTw1%m!W_X1=BdWr%;f$t0RF)^< z0}Qz2oxt&sSjZe*Umf$(y4dUWP->aEq#u6%YiypXlJ(&QT=dS(Y=*uIPgGl?ryJmx zcKR!fIXEuJ!X`WkaHzyG?jHXDSkWhDvR)&72iHo*;pQ=ZBkH_-MbYXA>N4E@kCDxI z6`5SRRxb*=vXbj$x!Y^*u&fsvTlJJ~5ByOE4xok@<~Ii!Am^QUxlFy0s9=I^;ANzF z_>7E>e2UmI{{WWK9xKFu9>V9p6FP1>)vhg36(o~pVpIr&IgqvzHNZTv_&~rMfCjz4 
zVrm_#Ys`k-&VPiGN@D?|jultJE{A@wxm4LNG|5wCx4Ko&BoZ`rB@RG^FmbWl1+uvt zcx5M_Q6!~}8*}CFg`Xk3oE6z!9c>1VmY$#PHeQ*53ZxXNI?NH@{Cq2+aLt14Y)ZR5HsIYOwB-w#N8wgE`p`{-6jCzd;VUsCjtJyrewvB#ME;ohshhd9 zFYvS8z3R$(DhX?A>QPSX5D@VWR`A0@yaOV_s0@`SL%_h0@os-L_lFdUw_li1oDxyhW;gVHyfRNh*DuPg- za!J(*bmkaC%kADi8RL#Ugt|>o)lB`OhaXRx4}ZLwZP&h=x*}QV>#AasoWin!5=Iz@fJ= zBmxd_Gwbc>PNgIc6=t-&g7by#9F8Zaisx5T6&WLBu*K%=Uy|fxk8J8T$j4xF;H7*I zW48sRbXQp|Ye#IFNfw9(hN7w}1)7?soMU25O4~%ZU|Zw(mwS~t15oNUXG{a5bsw%h zI)u^b9jG@bzt7i|X1mxeRaTgVswL@bhVeWSTI!`X_RA8>H8U4=J+{b<3W2qwZsR$c z_;q&4*|psq+ZFQuJykt#Wk_3sPrn|ke(-+LYUo>kl@F0scbEfLasU>ck3R_n;y!W} za1V2mbEc0}=;SfH*&cSh`O<7)TFUs7OVc!RRop5drJ__RB(*NFAXDGsYz+SZ-#Xi7 zs1J!zbT@_0*T1wKs^QmRdTOa6wZwru*_^4F_jf>f0DJnLa0a}WZALXGypI+4{?t!y z+6!0fuNf(PJ@A6MvZ|!7Q(WPag56Cy^A>k;hLmj~fQd;hoC0=$a!A%UXGGVzuW&n? z^B?w`X8ySHT%8Y#{Y$-LHFWWa#?@qL*8?1PmDGRBkEXpuvbDmCK4>>g#t=b2TUzIX zemC6im-{_}uGdFA5^s!443hooM^r8j2`YFEpZ9x@Se_y{5&TE3m9?bQ%?{QVlHUp+ zo>yDcB}JYX?hw=YkmMO%oA3|qA7S&=d7vP7@r&_W(MTQeIIEKNZAF^g)eT?eqRObr z6hJXU{9(D>?}3gz^`i!?M8e`#N9p^=P->v{ZO)4Acarf%H{R=6H_Xc+JK0AW&pE*P zcGe$jIxSfeu-k|1wsZyXj_KG4=Rx@M1oV*6&u5g(q$z4jkOO3nSs6Ya{G96xN!m|q zh}v(+`~0c3eUvP`JXe@n?FD=nzq`j=lEEZ|C`Fup?K6y#wEB_9&tAhzt9hXD$?;xx zO*>i%Y2g9!E2Qr?i!C%1lwxIuLo@A|X8}mST>M_A-;aHIX&`&!rmS|otvOj8u~lRQ zV*~;ZZDX~iry)-30}B5D(x-_y?scaDR3)@-y>Ns1omp^D1dCecae?Wg&QiaHWoj?J zTk0vq(o-3tbQ45KWF!z65y-}UHFygOo3bfX>bireES)}<+UjLDvno+iReS~l z9Prps!PT*xf`1Xz1@DMa`5%2%UD7eTNYB`!-6zo29*VE2p37#Yv$bn0Pxny4WZROs z1e^d_um=MtT2ks})CP%NbeWJcGF-;dDIaVLX}tZGdj2(C%$7EsW8No@O#cA;KjrDF z&{um$&+u04BwS^FD)kK`R4qhcu_Q1b&sg_Hmh$Z13Oa#rodaO51P?NvG64r5?HjxM z5ZcgTw&h?sv`AV%NnPZm>L-o>NvR>D+A@69%O^g7jmUk&XtexZsibJzxpb#N-vrc< zv`yxH-)TkNoW#JCAN^>4gI3yYs*{FQ*6nXxB}69?I)+y5!PLwMXg}kCIU)o`2!9H43ZJX&{{WR{yCpYF^yO1@#MYjisr%5{^>K(P!-r8RDBDY&pV{TGFnx4{-38gg6FmgI zQ*U@`p6Ppzs){OFfk+-YX(dVg(K^jh<$zloeiS4Q0XPRoZ_D47R~jci>*@GY=z1Zk zVzx}lZ?!{1=L$-Sxum#D7?MKl1hO=7vE9flBqJXBj=(?0KeZ*EREmnbLeRlI!h6m1 zRn06B(p0lBX=5xH8hG|bIT!_$jz_+T$-qi{(KzS2e$1ze({HD-#?2s_DuoKLpdSOO zAW%U(oZydR&X$Tiga-Ld6jbcJN7e;*M$Pw1Kp>3njfVkF;t5t{$p>)wPZ|Ucg3$Fu zt##CQil`dhV3K&&n8yv$u8<@^1qL+*fo-f?oN^9wG{*CIN0g+ms*UMxcW$GgrMKhZ zh7yJb;eiHHek^WYMn4GPZX-jlit0R+PSJCY^=g8(BX|V6zjY}a(UppjamI2zJvGE0 ztFcJ+o0>|pK?5Ln`D(n183nA%#agN^iRjwv^4pT@b*Qe5B$BuS%NRShp&Z8;+N0p- z-1Dilno+8hM|3)@%}Efma284GsC`w{U31bj)_$pLS?F4O%!p+$){3R6R@(ccrkEB* zDoMd+Z3dy}C3i$M zROS~YP7lIK82N%mu_uB0ca}e&TU#;6*;*dX50xgmvY1s`@7q+|6@kjbGvwVO zQZ|rqH43satk;-6^J#FrmO?o_^p4?h!Cc$aG(KHas(l-|{)bf^gUY_cg;2QAvdZeJ z2tB=VLYZ+rKyBWZX0eKh9B(8jF0-~UWPXh@!Bzde}6@7(FQwnxI;SDnx4#9 zU1~i0)RK9XG)_09O3T1GU9|)xfHltsH!QRXo!fSlJ3klvw8*8 zLC30;wS-t!4-PJqbj2yDT5?iavF1zh0LrLvGCP1+k~8Wx2BH?u+?t`}r_xGccooXp zVz@2dIOR!$WCkA>2aR|c*x1}!3ihouETY{fb)9ZAd9>>dwguhyw|%Tg{6lZB9=gfb z!xLj@WA{i;LwN8$YP`?EJS{`R&lyE8g`Hy7YhA|aTh%ob)NLI+L6#~=nar{s;3IDv zR48rANjOu3uScn!x-QLjiPE$^z+m17Tfy{RRKg_GHSWVj=h?rqm1MY6UhFk<*HK+< zx1TiyJv8ePjgu$LX%6kc^1DL&5Zdr&v4C;_+!!3^9nQX-<~TTqVPtO-FgPC-Z1HogY3=tbH7(L+qqa{%B&2!46wPubn*)9sWI;mWS|{8p;|F!i-9hQmPbR}>FDZphJ}xY~Dj_R5o|nkTq_ zwlA&|vBcd4kaaJMHP#DwiVKG0DcrHTf^c};amXJ-rPFB}BZ)t5pW2Ce+e9oZpWvJ957K@vA9Gjpi(bn@pGsw(?)1(=A)*MckUsXu_jJ_ zAaXtZHDtBNJyaj;3^-c6(EX_k+Ps?EZDm!(UxpH-DX;t(9N%8lth7IiOztOtFpG7dgm>S%Q^>H#r2>2YBkqZKv^>27k+-hj87D1qAJ zVcUSipJT^9=Tir_kft%Y$7$p*s;?AqQ-K9iNXSQ*2~rArlelAce=nY%$-?gKtQO<< zPf=6W^&>?=9F;KJWU50IJg0bSg194iP)0y3RB!<7NY0}nzpA{w^h{%W#@nE;HB>3m z{Y}&LmdYqqYN&R3G*fxd)JRELKn^g$h`~Y#CvfLE)RM3|5aVC<)l|?m`f1Ekn?5{p ztv)F9!I~>g_FH|@hRpCoHOZoeDV~l{N+U;?Ct-bxL+vU-UBvJet+-3$2WaMv(zN-U z$;59W^H{6?RuxfIHAT{@dF1nFmS&bmIr!YMEZ98qNj&@NCu7Z9Xar$gdQmNvi>V 
zqM{a9S*g;RHE%L(fU4z)3Yf_(GI#(EaxwJ)6Jct?NEB7N(Qc=5zF(Fc5~QI7f;)_l z>8yQ7k&UrrE9|0-U9DAgPfr3Bsh+K3G|L33A#XDQ6X7RoWZM3s3GtD2F9JR8^E1pPgtohE?1RQ#eNdED*V>UcRe+r}K zbN>KI{43u`)$|1oR4gF6%^@QT8OJ`~5E{bPK36w%qQ|Hn{!21uxPo76X6jE2Z9XH@ z+Ap^Ex*9r)XQoPd<=yvoiNd%=e5eGFN~suZllX?4OxHdZ%M)lGeExk_*JZO$JO_2y zKV&A0?9~%oqw{X@!$w#aN1!5bndfis3X_h<82Nf?!JLRX;}BsVi!)l(smYZzs&EqI!`l6(lGL z@d5xRvB)1SX+x)*P)lJJLd~8*6KuM^R7SJWIFTpf?8dwa)o{{W9)=&ZgsP>ZL5U1vsdnYwzpGn{kxX;1Z0 zrhwF=)ojIq8vI4L)Fitl2nOX+2yNUD4?J_qLL`OVRZcsJ$1UGhI?F> z2&Y(KdGGZJPR|ruyrpPq#E#hBz?OAp_1oJ|)7eW|uD<$v`l}T^3sF|B6;=NL?lskH zq|n*oS|ct1P*{!g5C99koqGY1rMvS@;-JyjLAF`?Z>B0`sXO2oS=AYp)a^T^k={(H z+;GGUu0M!ti6a~#Myx3!-_$kbs_XiO8+OI=qFSP08GsVZTTmep&A zr>WQ!Q#RemJ|VqhL6T4I+v8r@1PKq>Gvdw|0hWgr~jV%Hq6X|4@aJlh4Dt)z;bt#MaWR7R-?M<9ZwkHm}{@eX!@ z$ZZqiPz0~4C@bn9lIeKrg{Oei2Y0u*B~d{P;e!xA5X+*E!ZJJP5CY@khtk$rZFQ)& zZl9>FuXz?PzScnTruhqk8TN2V7(Wl;U~)jlm|4O&;+2NnZtFZ1SBk0%XeUD)G<3`v z8TVuqkbqzzTt6>@6(Y-Bs4b4 z)oI^jt_sLo;Q(?*z&XclYRlT2;-)s9c_ZKDRgXor=e)~$d*kS|{-Wx8Cx@0eUaTz* z6l>*EQd33&Y!AYSFgPB(0qS|zm(^;U563CV`I5Z`lSd}0-P|LOpjp(PXzrh_pYHS9 zYia3Z+6_>3kVY8cS(9k^Pr8%&<5(^2%`@f@o3d$o89&pyn4>ZR4JzMNLH8THgMv5i z1bY1Sws2RTY3Nm!jlTD7Qh5b*q*^Np>&scAyv~~r)?!t^&s9zJR$4`6Qfn!ZluC*i z46Z$~q!cU;s$AL`bCs#>F4X8na}^L~2AC6^vXvgCLBTw6ti5#4h&92$4h{bRD&zr# z{C~nf(subbK|Nx+RKDy|MHx{llYl#(Px))bYu)}#&Dt;NpWQrS?Y+Av98p0 zg(#-;3&lvlyofP?0Pd^Dxjgnd&MuZJ+%w<@?~sO{P+H+(QmMKwtE`fW#T^V5y4ojt ziSMf0k)S=3Zw-~ew8Hp zP;zQdU|0-e)O~dDc9U4{<`F}{4nArKhW8Lw8uSr0ER>A1tTQ4yKJ`E3=+b^Oyg?|Hh!AYYua15aGCKQiZv>I>anAmOaj}>x{#5;Wy52ASJEZ(iuZlRW)*DR~4FZuGq;Qa{lg9v$T_&rugBt0z z5)x0${Yt9#24(KD7f|%ygq=s#R!FY)I*KD3qAb+4vK2mPTse))3@JSAAd#NuPZwmt ztYFk>2xvVV?AY_iKC8`1+YW{0k}#h!UjFMf@q58P5a{d`VyC8|YO0k950!C)Xs&eS8s zX;URMzju*w_;(;=V12>Jz{j??qStA)G+~vbdh)bGqL2_>pMPbv+dM_4NGT|1Fw-SW zn^KOx1#QwO?gODi#&FBY`ho?b@gqT_{d@YVhfeTG0{s2*irjk6uCo!-MOApIm2m|v zzB2N+U;r3Y8ORyn4J+FIPaY*8h0f@)Znp4q#a|Pu=e}L6klN{}=KG8kRh3c1l;gWA zK1zq=7TiNNco@dJaDZJC>HVuQ#1Fy_QBV<>s6?1_L-$?{Zgr?T|tO?~o-UX^O;=4IFcl(GSU$ttnQCz3EY$3697 zh4NFgpLtTeM!-c-_vEU^j71zWU@)0?swg{@XN>WUTxsaD-x=ftiR>*BwRow8g2}Y) zkZ(ipsLa8psX>1jjDj!duYFC{X=At9sBDp&d{m0ED+C24C2&gv!w&rV^Q=uKm%cJf zE=k9q^tJI1!tDMKTc20M)DTo9%A%$~#aP2n8pMJ4_?H9WKbJY@TG44^go4KW`u$V` zW3JqVUn*!Fof@WEpTPeBc$Gpv(xc%Za7e%z8lA0cN5C+vgs(NysrrU3xI;trnS?aCz=@_}Mngjmf{J|6XWwyGFpniqq5AG&U>KUhMNAk}0RBG}Tb#a~(uGm?VmQ zQQa5~%Gfv*<$x1_twTKvDZ39;wY44vcc_^QO0dl)*4x4OGw{Cv{5{V-^#1_kvQ~n9 z{K^MXVSO7~k3#iI->7WuPXZg{x2R&Jsv*KhBVa@tH9J`gk@0Zg;BpAYbM%dFV`s!V z@#}p)$u*jf_8d!&(m4J)s`piOy-l;I;H;WTVbm0Lf|6JoP3<8A4jMoK3!JWVxPK5n zx1nicw?E?B%o z14Mz-KK<~kYNmdv!U+2G3%Wkf)g3R?{Uvs>&`OmSm#J%y?n^PDkZg_JQLyp<00AgR zA^lXYHoHBIi+mOEoc{pQf;P*fe#eS%R%pC9YAfuNm3~aq(mO=05Ck%u5Xw&lTj6h* z#&vDlR+&C5j*E0^NNt62w)j)i(b2s~ zV|JrXAIbDuW$uELtU6t(BxU=|Y$m62IZ0G>*zaFDmdU=@(d$$F0RyoIdwe+xhJG`=_>mn#pxb!@Ze_d$8 z;AGs>VNjfWG)7i{x06dm)pJHf2tu@xkU%8hd?bV4+dam$OuJfK?iN^&d7y_~`hn|f znpkS2rjf=A$BdJX8w3%beJGW#B8@2v2+M`2RQ9oPgd2;b^2j$bu-uG6{{W!XrZUC6 z+Nk%Q8W9<;j3IQ1*gK7*9sZh{`UM8ws<~JLGJthe-UtshBHJDgdmTu@N;fEeCD|^Q z?uNNgS?O)n)y=WemgfkPDO4UpJ2?7P@1i^P)H;PGa{)4ww?~`GJ}=KI`s^Zvrgy5F(3;q ztXG%2SOtbCzuk(jsX6`ELwH+n>uYn|X?oVWhUFwx3MgtQ<3%Pgq%ycgP)0(PJ9C0T zCyjZHS<^Ye+G`#+eErteoK7vW)eboQmZ8_-;^;f5Kn$O=sxSx#pSwy&eh=rYxN{)& z{leZNn!tRQ;U!+@iC#oN%z>Zbz$A>G0l?&*JvBmNT_ca0^k7o9_yN{~r94M8Y{g)^ zB8b%F0aF%PKj)0gALhDPi3cj4t$??1U7>GC&0C~xx(b;5$d7!E{{ZK$3=p!84XY=LnsDOQB?qrINhV?blG+# zf}qGFaKX=GqhzPRG^gC-xKTvEv^s)@#cyXkP*OBf9mom&=1$Tao_jFHNIC|R*a&*v zY!w$8s(LyJtLvVLX=);^59Pdj42kAOL1{x{{h$rYs3d1xH*`eRm1vTJ<0S<$P?K({ 
zM~BabocR$8xbwb9=0uM;!8D`6WyT~~pk&Q~|gZ6a{5-ZkH+Ns)KCwhbfaaHohO392C1+q>KO5k#Z^i0t`s+lWb zwpl7J-+X~#sCZ?C)>wAr11x_F=gn>uo)wfFs2U^@f{7|kJ?`UEW^}vH{*I+ZJaEy{ zQwarJ5^ut7_>MziR4Z&GcsrTu&r(>FAT-=Ytj9Q7y9d zbcoY*>v?r?ML7a$rTj=XcSJlBy9bP8OWB=+Hkwf;zx6cWdHJjTtNx&dSj>7uchza0 zC)wHxm7H=PBLj|pTJ^3JX?P~(Lg^}|l8nSj%KRr9P;4hEo8pa~WCCm*tTwS8p4zP- z>ZD<#g_Ty+P*`H8mVfo~;mnY23-JJP-26uvKHBlUrP^~q{{ZpW!U^kn^;e>6q>wc6 z;;A=p7y4DG;ib3JH0Eg(u>e1pCO9lJg!RWiOljQHLICqIOP!+hCiMM89LX~c37D2E+L+Lp!vJ8Kv_{b!=?I8)+e`F zDsB_7U%OOS#Ztvv;^lWC0Q^87gdAY%#PP=e0QTY*2UFMiNYeKm&s@pX*L&|xbth3- zWSmw=%rI&c6;wDV@#Gw2cOLwMt&JyT!7NPLHj&Sts=K#v54BC{dP7UqR^_aboV(vvGW97H!1twopNU;5J7Lp4=FifF2~Fj&b8<|pxEBe>^`^Q;Fw%mKpx0F|h~ zOK63Y(RAzS}`ghl$Y~hdz90^&nL`;MdvdS_;*2jqoV2i4NP@F!zlZQ-S!E6>&)#z@#)@YNoHpR3 zDJzcH9F1sbbkO#mti!Hm+c#mLKKwFjysPH4+ok^i9hVT`b@%sJe_Zv|!Y-htuIcN| zZSu!F2p%fBW{1m=HeFgh%rn3t$0xTK*RIlObUFt*7+pklAT<6@9#pMDWevBmU#c#FA_X`zI7D>S!e0dV!YBamzlSvTt$Bi{9rmT7#RFG()V&Q zqn4<76vk-H$rai26sa}6dDY!nN#x!-itAai8U>DL44zp4fWsMn{=J87S&vT}MhD#f z3X8vgHV4QkmMaI6R(XhmDFW}E@lV6-TZ!&Z90Ol+A08{H%JY&{7B~+icLwHa;DSc;^_&9sYViTXsTo6O9|R z7JGwI?Tt~go{V@0{6?n)D$adkl!H+#O0%$0xBxJ`_W6A@Z=z7sRus^$SJSrucFDF_ z<|a&KTOm#fJdzKfKc{AC!L(#xu;7VU>=o;3p5T_6m$%cA9MzQ31`)TW;&XwXGsm`d zPkUVLmN$4>MoSEGY%a?B=dz-hiPsP@+&1O59Af~nKu*7tpI&|SD?nhPrh%BldZ<)WzIC{z4d!64+FK`x>;ST3Zjk~5d%*9eS;irVxP z_4fOStbs?~=&DLb5GC6y5d^ZSUB8KQyLWMes%_OYy1}vZS{>MX31Rv2w5x>;H(m6d zp=zlt)(Q$SHnL6z`O%j+p|;h3Vt{=u**_R(8WV)X~lURDq3f!SMKp$W@PJ zc{fWYn^N%OV2&^FPY3K(N2`1buIW0;YdzYsDeA5^hP_eM(bGy(%ko-O10V-hRc*#H z!A~Hz2pgKcGqk%mI33;tkDA%+{Zv}mp7vNRHIhFI3>ECMMC#aR+!b{>E9;F>4BAa~ zQ^_v&Z7SQ)T}36T^-`N>lAg8(rj{6FSc^VHkulr~uvrlpsm4Y?DtXX*Lxw_(7d|IO z4iC@%y;r?^dxNaH&i?>(h7Uce>RxEt1X$Jx$=nlQ3I;hD@5dybPejQG0mp%#f7;4~ z7}y%@c_>{VOWfgsJlC2=2q8t?BOhQrfA!TZLu?aR_wVqbm;fyBwQjKS9=@-`%V&qM zI>ThL)Ja=#l^8NjTI_x@_{yLgQ`nUR`e^CoEQmA=eb3oR*}6uT8Q<5BAh&<9M@#jk zw&!xY3l;JU3RA}{a=iq|v#Bnv|j`UwuL?Vcm z0x3QQ0E6YQJZGQ#&ZE+5Lp>B)m~(4H6H?tN%@t#9iO3gZIC zQM+>NRsDGyw824EamtO&xRZMvFIgrCAuPSJI}JqLL1`{&Djn?FOR^Qqp)2d} zR5aC>3K2m(&dee}*a^GgCwCy@8kLSNZsHWICBRsuSM3RPJv-pPTU-~0q_Nk!o_mUB zLF?|rwuYNB3-KSy-6P{;eAnU~d%OfGjY&lc4i7jQ%y`KwcoI4))qTdQl2~gg6(Xyp zl`3I%Jc$v5FV7hRN(B&CKjUd>9IpBB{{X2W>W+!D-zY+uN~F;!{utZ_3V+=suly%k zQM+);^W~%h(yqxlp}9d;goKlZ0OKmy{{S2x%SA8hZdTXf{m{J9Q?OthdB9_ji-!LI z<^YU)om-6bRUR)ZaG;!A?J!B5&9EaM?+y?6e!7G7xlTSNSAwqH?QI!sXCB0A=sSX| zTTWLv(O#mgxm8qJdTs6`$cVM@aIdu$NqyWaC@j45gxW~R8c}L-`fW+6r@uTYQC)kX zvOpt8e<+67_+7|1k~U`};b2IALI%2rR6Oksv|TOq*2+2;uC3~co_3vTZT@mcB!Tv@ zcLZfoyYJyi!Cp1PHH2&vXw?M0Na&DISZ%gCSY*fDXQ>b7Cp(IW1|#Pf%COS+H)N?wLVwZjo18@= z`G0w;+zT(l!JEu*ahH6cY?fE-NZd`HQ5ec2SvAg@nyIgK?1HCcF-2&mX(gv~AjTb# z@Ny9EW{_yt7OFhA4ny+n2RVk7*G4Ld5J|sBU008;`55uAfaJAUz zrcY?U+~br~-Q%OCkh-Er5lKR{hcUj#*u9ef#zh#;&pS<epYd4WvNW00Dl+s@|r zfMh#!@V6UJ420vckLI3_L}#QZYAly{<*K_>-6oi%ZzSUixLv;p&lx=X?!S9ih@mjvzPw;OTJyIpA3bu5nz8CH&#FuVmJSCf&I91IU$b>%g$fzPNLKytUB5(fVOx@@C8 zWoApYu0$i1tmQ*1Mxb`~$`_>#Fz#O?DAZtG1=e(K7AJI^=QdsNO6h z(z)}fEHKwNSxZGMO_h;UHZXX~upx)Y{<`z}tt5J!yOGzZR=QzOdbgzL`hLw#96=av zDkSpHnYX?bLB|Iq1s#jmECD*$z_HnDI2#KB^Qj0aB;_`lT!qeZQ3mP@IFXH=0*)-`t|y%&s=pi%ATr( znBsFk!5XzvDMRQBIqrS4-&ro~mlJPwl#P}#fpzGveGg-ot2(0B4P?f5p@nMe9x8>A z5?%O6vyMR=p7{Xtq>Di#BoZIeJ=K3JGgqk*vCVH^711$T(kRhUKq@eC$kmBrF5zG| z%~v|ohK8!wS5G-ch19Hw_$uIx71x&5=`3z*sEVStbkp}h61n>KVWp-)Peq8}jBP}W zVL-=V4zr?c<=fnBo~u-Ln+f=pCb|gi;6!9dlntR;JYauKY_ZR~c*?Tb*#7|1pi#+9 zPjHHVJs->LxF8>|^wh6&p2FJYH+<%btSv{Q`p%InBWY!iegcE$dmb+}GjW{Z^-Ux7mDJw8vuVVv<7NyQQwGE?(Iy5CqY({Aw41%DD`njilpNJ1_k~ zs)|nHcYG%Rn>&2@_U%UHP2mZ;7?Wb_2NinmKv0KSqzUSU8x(~Bh-8Z@Zi6 
zmKeYpJZb9ioH&K-l(IZ;QQ1M-0X+8k=%LP0xM?=W0Wzwo0I&d%IPN*ubyhz}$-??I zjwYyAn-DSswMok_9AE-7^~Y@kV};MXhL*Qwx^%}!biG^B(@#A`$U`L!W7o^)+hmGQ z-bo~0Sc+wtOD~tW?b@tLl>xl;`<_U1MW3jzI#eNRCu5Aj6UI4JE1dkHT?Vfq;$q9TM-y4nL$%ErsGc}Gqo(L>hVga46W!# zW5M>+(>J_t#lq2`MG=L0f{yLf1Mf8w)La@=#B8z|0Qm4g9rtf4HuvYYu*-JsV;hgD zT?lL~A(;30T@Cg^@gAGQzZN=1mV14+m#F^$ZwqX-H#>eRsM4Y1r*RvzF4+`3sHQ|a zJl9eRoDGNzBYRj`lgX|}j&k*T4YpG}`?fo}d(*03STXnk% zyN#3XmG2cynTDNW_##-)EUF42jhGUl20^vj61lNCrvw~d=lSthyBVGA<8T{Y&!1fS zbnsZei&jYbm!hnlRY?TVR8c~<%Y#Z38Gt#CP(T?fSfLz{zF(l$HiJ}Nl@{la<^C>OA%!Y*ZTGBctyXwN4c@*lyPC;S z=JFH*UBQYu9Yb)Xl@2m8$})A4EG{jnoC5|){#JgWwl^FB`CX^>VEZ9j`U|P4I*Y3X z754JATc~K9G_*8lCu^`|25EZ`%(zeC1pGrO{inJ>)_#H1_4#xQH&Lo{qT1XKe{auK z`U7D90Pgq_rbduR#5i}8cID&~oaFPzJ+-B#V9$2$qs%iX&hR-qfX4nagWu+RYiu#EAghdVV78kIrfl@`M3K#0%m#nis9fi`$a?0&G@_oseO0Kn+L*H& z$n;X|ofFa>IS`K1()Me5=Xd(O66ZeoY@GgjWKM91=c&aZ__<37wo(jqt6`qw?%1vR2O&QazHyQ1m9EhL0E-|Db^ie0%|4$}?L&*Ii1Wb}=i$ehTYrbOkzY)8aYa)E zZ6hniRRM-oV0qk){YF6tzOx+bVWqF9B1g2W-|g$vg3S#oUIc`l$Za}96jI|2R zKmA;9ri0+z?*M-l`>mGS@Eu%Qq7OQ71EIZ0Ib_&r*Ohb zcJ1qqMn7NVMy-*K6*2I-mVx2*(VZxzX)_`@f%wO9{(DYFo->eC5)4x4ZjQPK^C`*V zKqXA_0mmc${k0^`>Xs)_uN7gcgi0_6BoZ^n^3pNXne4U7mq~A7Ae#Gnr?AsaBEcHc z7I$W0*vde_;l3cl9|<_Y&7emKa7|kFq;&L^wpyB)sxLKFraE{k+6QBpw-GbrXiJsnc=7;m%a5+4{1v-Z~088*FbR%4#ZPV&Ha;P`LQFaB;MZ5;Kv?QsA1Xq@}57 z?Dt!|mx(Lu!Xm=afdeA5tTF{9R1mBPDmVWCWOJ?_03-Ua6Fo95YgrfUj-7gHX9hX0 zbHo<`Kml?=a^DU%w%ir<&U6i-b}+trd+l9JG?$C)6b7kXmX_X(vdD@C{77;!x6p&b zpH&A#)f{zEZJkeRnvOekP~30PBU8K{d;!*I_QuG_5Gmj?zr?x9u^GlS*TL$KRCcYp zlJ7@Gv}J4tXe2c$H8`nyNisG7Lj@s;An*v_f)5%MMGkmC>L_j0^W7@zt{01~RlZ0O z%gME1^N>az4nYfpmvF$1r1xD5>Jj}pL~J*Ey=6ob{{W-QYKEbAR#ll485>~Xe*?A- zMp!;J{K3(s*ys_NTPinlGsaGFrAE;DmXSeV+ejdc4EE%IO(DpnO1IAc0C}K-pbUbb z{#ey|RgMyk4ISDj7P6L}v0;BQAyE)<8v(;8IRO0h?0cAg6G>dgy0;6ZP3rFqIw3FB zU1xiunyR!&lRC7{$uxrt8i;|zF(E)+#N#@&HS@(6a>6U_s=G}Pe|qb`vbtJc=QSCU z>a@_+Nack+Hb{jJWM)5&hp7M$Kse5*bjrBh-||wPF(R#WH|pNEw* z99znXKZNjg4T6>`^HFcSG;2LPiBaXNtFCqwIl%+?y*^`D`k0`V;j)XbGOa&MCT9Nt z4}J?Xe+8z3<@+j8J#`B+7B>|U$x7RZb;=%O&HyTQl_7@*S@Snbp&OVtIouBqH3V($ zP7m2=y*c)Eut##WSJhENEj79oijswf2_VIw6bY+(NYj<2-3Aw(;2O6D62 zo;u&ch^-AOJ;4+*kW!y^TRp~okG86RnD)DE*O&6WEIPnNZB$->xrq(l!5tj6YZQV- zk~msWr0^B7$XtR?rnDu6&JCIMT|qT}%I8ehxR7}Y-)Eg@tsYF(vecNj1S9~x&f$-d z)KO%lw^uOfC4KhwvRIMe) zYREd)=|>k)EmaEE$kdVz>spEwC3s6r(yJGfduL3r(W==!^bS+@S( zq_|HUj`B!jmKVm=0e~Q>7yvhH3?6Z+Y=pG;Y;_evcv~R&x-Bvvw5^)(Hq}ctrfg)# zSTPcPSxH>~0KJW6NgR?mj#VKWnm!SJoc+~)@8Yet=K~sB#VtybMq-stKP>U8zv0U4 zxF4cN?&{L2HJyD*>45u>JSopQwB0G&L}4fdoT{z*`xSoDFi7p`q;{ONT`es?!_`)# zsSex`op6YZ6}HJ2u+HjoGI8_A8s{;!l`tUit+w91yHL=_Pv=Eb6ig(C?@=txPUEvi zPi7wcZ7N2?I(F;F|olRzvV+Hc4B(5BxAb@>I&#Cv<4LDBH;P}i>K1$bri-XXFw0QVK>q+yBuuOi>OMz8CihRbfZ^spiv86yqK>Jb z52yENMP&UU)D@7bSSPo}GAabMOI8RynF)zmNcJUx=ds4Fx$bQktREtO3O8x?u+zRS zhmWpMPPXvpr>&N#>!=6{Sw=7moSY8W#a_Vmp zx`VBH-pv*Arfaor1tVKxnt0T+A^3~#2Q48iS={4uWsX?6GEFR#Y2DvASoy0xj~I?k z(T;P~X*P=#-9zCm`i_PwtCUjFR6pF{i+r+66h)iL1F_sPKsh^j-N4t4?EU`$`lxHL ze{P>;?YmM|KTp5Oa-ex2NZ?bJ^Xi7OZS&rvbu%gB6a7gVp_MN7DU4Ed-R+(<{a*VU$D!OofQsytX$f z9IiM7A6;y_RL0RGm8qtT&dUTURpZBbE%gQpUY=^EAH_9p(TsN?V>tbFQclwh!&pzh zn$wT7!@pCh3Y*9Kl~qxOmWtg--~v)_GIs!+XNFPl>8#m%UR~82N6yN=-pq(Jx#XX> z^0bJ0j@ovP${Icb<|WnS7WS7`U=D=>STa}_!~Ttp4q`2m;yQU8eL2}_K6$F^SSMPX#P@{VL11blb!hfi5&~&w_tLKhcBNE3J*su@A z*v~v*44U+ZC1y? 
zS7}xwp84H}TMU|Z>85D|sRq3d^J_X?G=^^djvj3zgtZ=d~$gA)CI$WqyUva z!XBt@it$p*T{9|ERk63u`M^;jU5;_+vIxofXI7(dyHxdYmooe^zd|8&lH(OX1VvpC zE&l#6f1uWoH_$b3>ZA>WFm9 z-RY8+I=-QvvQqUmHAZ0!0onGN>&Sv@^NOEgrm6!SbkQ zg#wNvA*2zU$=+Z<7#|77b=6#=FND6Sp|P~pwfDQVRZX@z(v;id4Di#-w64Yo8Mn8X zNATxz4&hFP(Xf{tQ#Sjg;vk(f(-T{1W(m5wP-&aXWi93+w$gFC)%PnF-5+p6iK?{V zX9^rvE33sxS9`w1C6Q_2sFsqUqKc#|u>p$!j950@55z!Vz>H{JF7s2VdafyGW%3rL zx`dj_jl?urRmr-=x}NRQlYYavLqhcv%}ormQ>#`?k^bzW zvyKKy1&9RW8U)$3U%cMt{3qX%hb)npvU57#{=eg1HT%~nC~T}zWkRtyQ{@XvDa=asJqdZB#Aip1Y;nNnAAF8 zM@3MY!BZ;br>Fh>M9z^R$x_^FPvBwR_hA6?R!!pn}+6$MGySUDv?MY<+~M^1_F6N&ftJd(8>p%1;g#b=r7K*|FP#jevgH2&?8!JM)M{-z z%$_lq+w$B__T2I<>Qq&*@NGe!)H7mvuR|Go(cVOUP5)R-2+D|x8GbtrbGlPx}ef>2n1E3->Yz0tT1vi8pmUv982t?tAPH~S-SomF+A`Q_> zsH&Q+OVfb6S1gI1KV5KZE{`eN;KM|yvD{?rO9ShHodc(5K(00AsGTKr%^wKvJ@)ge z&xSB)m35ADh0=Mhvn~KhGx});nS|L-xSok}-D+th**tPZCkJGs2LxmuK;VK8&pPk% z7;=XFiC1y-Jc8$swJ{3V&Q_Os*~=sb!Jwa)~Qi z$H27|vcGOiWAz%~*$2b{Ko@mHx^9hXD~JC8cZ!(uM%d)w{KI4V>aLBCxP1oP{jB(6c|nW%6GNG$ty6q-UZ@! znzCn*=Y&}>N9r}A$ekV<7ITRs;Zk9=fk_QjJag1TCP0}}g=}YQ9>Cxp4}5W{2IoB8 z02EDj>uKd!x@=1Y%9`O2F+beLlvW%mlw>mdG4;^YFAstTMI0eENI&End-)q875S z22YwN;HTLp-qduruDX$`<^VXdM%aZ%wDJyQ_t>~m$SmCIZFg*f3pCgsKgWmVeBymW z+;+1cpXqe9P3D)U=_x7HI#Y&PYIy1C{uJ~8DvTob8Q=|haBhpr&Dli^>Q1i~*h<91 z&SBp@w35yu6y8*UwaUwG*Q%?{PnN7@VsU}}PO_wt(qo%abHXWSpnFv?m56EEIAU@J zp^`HbcYnzI=}EK>7LnnXS6r>vWNKkF^7ciHl1qP6^co$WC5KQYX|G&o-$eu1=}2Qm z+O>7JP|ts8Zt|FcNc@LH%7}*S`sdX1pMT3=Sa5fA*AfZx;cIB&5Q5I727tv`7!raQ zc6Iz;udnBz(8r9c*=n!Y>_VHUE>9@mQsq^p=L`wT@#ry)X7@EA-Oq};`OBjKEt9Bg z{^QVnMJ5L1w@LsbBPR+w{LZ}~J0T;}#-CfPEQmv=7Wa65 zR(oR(e(4SeDgK$V$#kB2c_A~>nOn?R3n!XMbw^|q+k=y2W@*^&+OLd%bX5M^SZrR z)DSC=nzb(JIATECwNB&5AY_g+jEzok zEmtqADh=B2VZPhups0?zz8SEl8f$%4{Q6+|U@K?Toj~U_f(nVxV}%_2iVoLmYCH85 zT}wRHp-mb+x|X1jg>X~Mnnd_oL5vVcQJ>FISZN#ZIwG z56?e8eOwvJvK7LSe(69quQC~@&stINkz6Q`QNvG9}JeE$G059O|I3JxK|EA$t^Zm&Ie;T3}F z`TNpBo^ykc80BB|aCM^*v1`t~yKSBd{?FvyY3hLs<^h&G<2-+l^3K z#H;1y67fhhbFG(x{YM!l{rix0l zQxz38REV+2S>;e;_?-?iFg6C-IML&j0iZmT`*r3>5$P-)Cml@%FP8KbAc0O+Tr_}8 zcc@{OCj=?T8P0G;l++3>e5AckSyd$6M@?q30J|hZvY>n?c4Td#bC7TtpMW1yG=Lk* z9S~D;>8c5ue{M#+|4XBw6nYY**8dKBo$UIkBb;a8-lPU$2=V<^YBB>RL!rUtfn~XDwgpx zIz=1~-eQav*s805BLr&8>Z1!mSc9^MnjeH2IS(k9% z3Keg~l_hh-ZUwt(b4b?3iY#U(-+3LXMiArg|!@#%objz^#m{XF0*h!6T61#wF5O8rG{q>|&oi1rKq}3Lkv6vT_|8Xu zef^BR{DG2b>As^sjus?lEv1r@Nboj4JgcwSyQOcJTZI+spQ$S&u-fV*j%!$|nL%PQ zBiWxVl!gq)9sM(^U8O8cVPRn2;m$uit6DZlNOtdpbX`8{k&oxEC&JvdTI!yjON=I1 z#${2Fk6lFI7jUX}nyrPXstHm!9+}m0M+t#dd$r*7{Z+y#Ir7dxVUkpiKV5kJLyQi1 z9S`T<&1h*ycSqUA&ys*5k7L#5ThNBePuN7=c>17B7tOtQcf zD#uTPsz;#582WLk9z~*;s6$rqHMR9CbzR zsieAfC6fx5Y+hR1SvM;{5 zWRZc~STQ&pU}Sgp)G)+i0D+U!!_{c%+awJwc(Qp)zf?tew$sr;Rc`_^P=yC zWsGbgxf{Lv=(IdI^XZ;_4^zx}V77D5LuTDu%Eic_-nkq(x50>m1e9*Xd7*)pZ4hST6 z+}-u2bu0u6!BqyC#1c7F=|fV#mmny9e!7{o@nKbW!fD*5+HJ%8Lr+R0l*!rbGo!*( zg5Yp6bEE3HeHPOo^E0>Q-$Y?@C2gmL)<h6=rc1 zFOsMki6kM)WRh^BB%XD;(&9cbpC!$3aImKIe7AZVG?dn9R#m8A&mQ27>_N{AJN-36 zR&JO`2?PX_X~cv(f)`BqYvJrS#fV%e=~-n#=SKuVRJI0A201?c$@=PM#w9f)Y$tTi zgl=Q97QtcrFu~LHKeg-<)n3U^t7ETqGE-we{uS~!sT`DVh+_b_)Oyav@0s3dsFBd} zeC+-|bw+O0nqT(jG(1NiG(rloBZ$ag%Eb&_nMmM*ybwpR{WbIHU@VZfOc9Ff^i{~l z5gW|Z_RC`zMQ6YvSmy`l`Dx~JVT!3MS$PW?>MP9E)ms(zmM1O;r?A!mCSzrB6lijs zvd2$MJj{*F!9B_R{{Wt@MCLPvFafG>Q}s=sqpe9Jv~z(3dUH=tmsD@im)sQIS07+8oLK(cGqV^-D!1A zc<^vF`~JyG;d$I!UWHeyX0zKQEoQ${)IcQirMJm8YN;HJ$`~*Om6w%p0pkn-!EZ^U zo=3NCrJdY^k;wIF7$fcuE2`q#$jI?hx=94})Z|L_Qq00xWnYa*&LZG&b*39!Dwk-Ts)&!?GLpynYIfjSg~rx1_JQkvCi`xpvGpCs&wsQ`vB?Ah z;E0)mm6Oi=KL{W&U_n#zI`uu7rybLuGXZJq$;X%Qu=Vnmkl5c&2uFyuJ~ecAQ`&A9 
zOVp7~S1^K&V6Ry1uO>2Kk~b?DQZi2b;RTLhGD-UcM@<(re>zbMfKf z@6~j)^wif%NQGS`EmAUx7C9qHqXdEYO9CgO2+~_2Z^JBY`RBf@oZ6_|VwK~hYm9Xvqbx+D zw0@-gKx6BXsaRA`3uNg_n1bk}Ir8M}M+0kPKcUC-)Q=VvS425r9ZMau*G(l&{_k6G zbc_|ZtgItx08@l5xTZ=63hptrh|fBd5HOhMD5JH)(^`;O`eK$wiV#wz_ic^Y8MY+M zOKlDSJ4o6HAx?PgTPP!AeIV1USB_OFNqie$&Z9!q2!aC@y3TK zODL%RwU7m`4&Zg5Hba5?)uJc(X91{eNC6!(rIE{)k;jzk=SlO3w z2ZOHaX%Y>ee*I}kwN2DFE2R}hU9t;R5`0q9Ng~No#iQkNs&*BSbF{AYZg55qeAjxR z)IBk7qODrVZZq8KsK}@bPbb43z|oI|N&X*%ra&yYz$EAc)eaO}HST#~cIn$Z?N2nz zSZZz(`CFEt?`a~yD#|x!V2~fgqYN-b8t_eIVMdg>i0B{KJp+lk>TCJAmI)cou<$U=60Cy-K z+8)23eL&$wLg83mQcd?+_L6cxE;N9pAaz14UJ-Pix{~h=4UQ?2>PuCJLnAmRJ4g0c zCpZ`x=U1Jro#nRfI8_^@h%~mY(VBZ@r^D+Af+}cicB*Whf-@Q^W9^KVC+beBOEaEJ zrc zr9CFvdYG%PW>Ez5(UR*50C;i^)02QPlk*tX2Dy!vH$8xEu6>oWqS4FV2RINs@UmJD z6gOS1w#i{xDXXRr=Cj~NIL<*FlE?o2Jl2v2wj(Q!M^3*bxikUKp~wrT`exSLh5!(fRtTUF_O|9B@2` z@~;*;dP*1;8R+7SGbc9(cZ%svXE3io(OXY&o{vvqx?moJ_4s*gSA+FCJKK}q@&CaPo zgKRAsL5FgAX8B+Y50~@P4CE_OI9WWcJEE^K8VKW(EV3xuyVQ2?uPfROb)O%8@Vy?M zoZ4#E`g*c9jdD)|;Ul;@^P%Cssb!DDg<+nKn!X?2g%9Y%jaoA9Bsi*rbY|MCx4w?2 ztA&AR;1Vuj_{yWe7ywQ>gI>@q;f^&n}fM5e)EG9$E= zbky!5X%;yP66`xhHmSoOOyGNw#<=_?IB9nZbtSwr$4-nxx9Z`v#tZu z`6dTp>Jf_&$K*iIzIC4*lM&uSP8~*Xl!<02c`F^^u zDbYsVj4nSfe}xCY(`2YMw`-3`be(m2RS72+d&@)%Mdf@bz?=M zI;Z2|2l(md;-G8T`2${Pd*y45;KxSwCBEHp>h7I_=XXj(g{tY~sD;>Lc^|rU3_)Xp zMsdcyyfL;ITFD2{bN#Ib$lz)dK^5u$00#O>)iJ8Fy=}H!XUmd*y+$$ee~5hW+Kg#5 zlRI;M6YKn`9jMd+wb=-z)1WT3m7aAi=GQRT6f2n`I0NQBdbTuACfM*)Ls9^u(hyj@ z7Q0s7%}q~G*{JE|K(cIxYKai8Kp+C3@Nh@xu05H&hZ!h!#i>TPG*D?jWqz>q*H3g+ zs&=KXudbE9eNkIps{kAf{^(>zK1I9#06Ru4CY6SbpC5m^gh@W3z&KFw3oP+du7A>+ zkwr-?kWwsWCEC9eA>iZKvHt)KVrq2_m*2cCXc^p#>X&A+Cq>6@x1_SmQB_XWv!a4o z4*i7=eH3H2xz~X07^K&2{{S4=CWrHtIDL}m^0z+?x=+rAD!IY(yP`P82dDY|diqyk z_G2_UaT`aOJ}1R_t!|*b*jjY5r5tO8$DqbF>p9q>5k8YRjsj4J;CRrq_+-D`WLTm7mU`+7nT!&pgpWuzy`l_<&p_qdp%E>xY?8QAy*Lps!u9c`ERff=LD~y2`a~5wc z!3r_3Z0+CMTaxy(TeQFh^|4=tmdyj4;@6TJ)2iEiJ?N@#g6VsIPtZ#=^fY_?r<{V= zXJcPJ_rMyc>}5^@8Cg!jN&OCcohB_w z!qGm;olE|V_}fJkjL99&o$0aLE~k2r&_C;|F_B{{H(K9>JHW2|ikziYbO(jb^&c+( z0MP47tSq8wz^R=))5ftdBza6h0G{CP1N!JOE5KV7s-!hMGawDKu`Q6wc^}B*^3~&! zYOu~0VNr0Yj^R}$n#)No&Y8s|t6fhlbIk(AN=PaO6sF>#Mq4>2f9N&snZNBH! 
z_Mt;rSp~YLNhDd}r^~F=?u?4Af558^AiIpE+aM;wF>({<@9B#>Pi?N?V;&G#t9 zUh>cuC*cve%ws#b4UBgr0fU~MR~(}Bm+FXUtd#vfYKq%WaS}=QYPxwL@(L9vdLd9U z3y|4vK)_&0*In?ssMVxT)7Oiw8w>sJl9G#PX`!cuS|$T$B*>}8^$ZUsckLy((w`N; zbWsGo5o=YFI(fQ=QdN|&X-EJuR*%fs01LfGmz;!UIU@(GrbZUB{{UV|w^(UxVPi^l zsjNgN%iMP?e5eYfZdGLXxFBHg3B1}qY7O_NUK&_G(vG96r=_f#NSX)%NOLqqG8S`% ziE!mePlw?oq$snrSKQu$5Ef!bHvEIlsc9DQc#|d@7Of%TG-r#8A{D z1wLUs>_N1ZJUD3-gWn^a0p_0+r{6Br)|jcyae6^IlBpsA$O>F;&fKoyo_+D16I>;$ zI81bRi8kwZQ$t3RODq*)MKTCq%3z0*uYji<@-jQ~k)$JVrOs`ItX%1V3oJ(?h6Egj z9sBCBJr#ZxTA!sLyu62~Ybq~jU78wWER^)nE=~Z7HUW+?mSRah+7)9eJ)4ijZB=M` zYT4C2M|HPZT6!pLEUy(U!lGK*37;ijF+1f3c=%gA&IfHm+IPe{D#4rFLk}J%!hfoK zaQ^`8-L1C!hepi+0ffy%Qi(>04nJ_)4p{O+IA5vntz8zA{LuZb+FRm3>mFj!mr?#U z(Dw7-Lc=^ozWAkSzc*aXWV;&$M5peaUDi0-DBKYW7{r5M!R@!6dFN5<;nYtL_h83##dL?JI*<3+48q?dh9iutjP4%IuaRo| zAGUPUmbe#JTOZ2$u9LOtSw>K~+U297pqMAFfbGI8+@nci27 zUYeF5UkmyVZ#;sYQAVh;g;*PPVK=$}L{E31oShPtC8uj^J&-f#pQ@trJE zlkpWuG^To|#0r@}m8#w~WZnMIy$8dFDzjTvsZDzL)yh^uX(n&Vs$qYvC zsnrH)PyDi*9Z)K+qNkQOS$33e=&|P`>fQBytZ)?3Ui?*-yztdty7;YTvNEXya&hoP zSdXFA_+XHqfw@*&6W-H*ZV)<;v}LYVYGk%n)WpLK8Z=ZRzkS&n);*sinnZ+A=#c50 z?}f=R_{U4sw%^$lFH>x)kg^4z41jmu%mMZw4}5DXe#??K0_?`#DE*XT6BEHXYN=jn z=&LSbsu<>~6`G3pML)Ug#&cpMnhUFlrC#*Gt|1# z(#us00Fn^=Zh#+)9r2EJaw!WyFCY`|(NU26L4eWwD*aWmc8)hGA|!=PKBeHOXW-B^_nhgZ+HY>jxWR*bOG zYV=KWWgM%7eQ^y9BS||KP;ga1B>b>Q&a1_we1~H~fza8)`lH3WY*7Fbsz=6lq(kf6 zkTs;u1g-d7091?)yb!9LN7k)%X+C1g(6$Qqw)or;(A+4vSl%!RWWH*;C(l@SD9gUU9 z2N@aFsHPH7uut7!4W#@8`V=yv;IAS%5n~{fE=g~Ek&J59k|{~Gw`YX?C1pRF?y|P^ z;Z(jtjOT7WIXZ>yFBDX}SWe(pvEh$Xih8O@M6Mk2Fg_ROky`m zvWn5w^1)%a)QzpiW z{+QqUOHucGZ%$b#NXPFg4E0P0zC6TS{zPk4t#N6mD&iP{>Z(`Y8*bhs#LLvT z_++7hxBcE4rKXN1_wu74ANbV$bxL0GqiAiPOIk5DHk)CuQnz0T`faV0QQhilDyrF# zH8`$0cvrakzd8Qs?a9~A^;2rQLU%!?8(KN(ex6CCdq5Ue?a+M|dV-~H70|L$4~Pf+ zQh)rb&u?Be?E4$B&YgftyzZ}8{{R*(vfEJ*2~SED&j18qK>Yh_+K0sut=U+@8Kn5O z;SpOsMAu1So@#kk5V`xa?+V0ZsbFwOy;$x z8K3a7pe0$3Mhc8DBhZakn@;dcBxwYFl3OQk=DPd{QR(S#kXPKw$wzX%QPNXVM=a%{ zM^-NE?cDA>4oC#AeN+M9)!C)g>+vBG>Dp`m0G%}Vs>n*X3l$|gCdB^pGN{XbWNKy! zF5a2D$+s(&1v;q>jE)He_BxQMhE1UUC3W?`!!C%tbsf>SQCpz%S|C7aobenZBz#WT zQS~R&OvZ0C6zyqygE}wCeyi%cmt1u%)2J?z$5(N<)IC*82H?9K;d9wkVL29j5Y z9AGV$d_N>2FqIj?`FvpZ2m0!%p`KNM3CO}jQIQ$m1E?Q}DJM}E0;g+8d2$H)Td0@DeGg5Vj1ZXo=Hg#%&y?GL)6Er0vMSB7w&%k2Pw?HATvX`mtFk zY0V7Of}`(r^oC9Hvp>S}?p0+!3;zJLLxSA4ai#HH;-QguX{iP8sbIFtP!XQnZ!G1= zU6Kgd0yvDbBV?BEwem1a8YG3>AyQA$)~Zi7)Rg^0S`1En%Cc5f60BxTpzU%UOJ`(& zIWMdV89Ig*NwDb5Ri5r~SAxFlezE1>|EJmx*yN1wlZrpjFv_CBMx$x&^fsaw50 zv`bH0a;8b-1<;tllanr4SQ4ZRt7HS*93xq@s;Dowc`3xTjnlEu8XyIt`|C_&V#~CK z84b0D83&tL*`% zg3S?zl1Ein@!O_4tF7%8+ec2r1eBA=e*V+29|*?5k_ii(lZTqvboI_Wp1$S z0f&@Mrh++R!Hq!m$mdnaQ`+XwAAZRePY^!ycC^H{exPi*tENz^Gk@<;aL4=HH_Yok zeKqBuPZg)8*GUN5$nad_&~(>-3eDrEdX+^S@Jr`IBe>fl5y?a9GCwTq$o7;O1*0m& z@(I9jJRp5-;{-7zwD#$Vm+{ z+uP~#*3bBDBHVHo*oB{{1Z$xB-YPlcj;m~lPTa_Iw^byBgO#1AfFFnjdAjxGx|EPN z)bXDxnLtnYh#$*I6lCJCsK*P*4p^^Pee&mYsHl}vB3zLKc)|~v2N~AX1(EzcR;w6V z5csH!tF|Ci00Cwk{PC&Fz$lSQWhJUQDu`x^Pc8C1$6`nv@^PLGB*QjN$87x zs;8r_PxlGyE1*_omIgvpg~-HgK_p{6kIUO$W3)96Z0rHl54%w6W*aC!m0qdq%3D-L zXjz5BV@T|rp~2&JldsUj0b%z$vf4_}_DO2$0wa9VK}mhNFi zKhl+vqVpBEtAYt^4O1FTgt>;*hUy#$4hZE{=89@_2zP=)L4}m@86<8bbDlInb4}O3 z*g(3+)H_;xql!E6z`xjpcC{D9KE;2S6UQMOh$9H>VB03EbPB{Wi` z;fP}^28^ln+B<>mk)95VNO-N>D$~x!4Z*gfJcj08uC8CaR7uYHK3<j*n|E$S!uC}1`p+D>Kj9F!39=oZCjIK0$m#gxXB)e z=dD=Y_eLI`64(BvKF`us7uo4zsi>7HV_zu=8<;Lh*o)lqN&NK))RJi7Sdooj!r1yZ z! 
z^I39MQrQHS>sL=(X=tT=%^XokvPenJ8TS%#=s5$74PAS+8i#+p>-8SOvn;YpnVn^5 z-8gOiEo+MPa=%nrAreItbkI*#v89o=q-@3zh%?ShW9O?neG8cV+qRydSU-gNa3zp2 ztDx-3YU>+Vw_WL$vW7=iXj}uefdnwmW*Epmhu6NnV@q`cpOC#1!HQ4oFtEW1S zruEY|X=|;Ok}C=5+{Gjy;C}i7#aMtl4Cl~m2V2>iu{V7Kt*7N)qp#&a*`yAjX;r#A z?FG~pzO1-(pImrH3L_!c=8yX<9QJtCNhzc>_BM9Nda21s$2|VQN27 z7{i^j?9HEAGCg3V5bCL>?A^>r<2W=LX-8@r| zkH5mK=$jzi(SBq12sOUV)iw$TxAi5OdsWxU^4i#g3)wn!|3d zmAP7ZYO1T%PJef2rO~Zuo1-qn|X*iZ=^Z?w#=jeeU04=__PZbTc-= zMM+Gss}E4H#sZO!nrqX*BZr^Wl%#Zb2}afz0G! ztpW&(m1Yd244mVRJ8G@tD@1WHh#&$I{kX=;rC$ z9EZ(#oRzAsF@B7{v2wtB%CXQ z;TcLO3SpafzTz7r=yaZ{N&s@1l%#9%?L2Zvz;X2&9~mjW8^W($xm@6|KgMJyx3KT3 z?gJ*&>?kCZ@yG}RU4(l~dzHp9xaAb%cBQe(Ak$JF<wQVsrgrT%~%pGfs$53Jm@Ukx`a&dqMZVM?G(G-^mea1*_ zsXYf%^&2#mQ~mNKRW#zB6(M$Reag}2@hJ!K94;_N4C0pyHR2O?`r9R%siCphYpd(& z%tNIDO7Bewl-DDuthPIaraJj<({%$_p{O!VM(53xsQuy3c5n~?LxZ_V$T`73`6b6H zJ*%neicKF+bkr)XQp8r$lxSHC49g)5tnO5yIF*ZJoSnGP@+jaMQ|m1?Rkw?kI$dL< zf~hI$R-bv6p;a=j1WrN-oPZk#pNp|M(Dhx)2q-#2#Z5_3PeAuqo=~vOPTqUTG%CX( znnh6{Qp1C}!tFcoG^Ydtj|S4k(&r^=Q`;<%Se8>R+N-4`ixaGSFvV2G<$@i6$j;;= z;s-h&NP+cIM^4XC3)0_iOJ4U1VNA4!P^r9bGBg7Vl0gHA_W+>hzPNyLnFwyNwZ}&l zCAK=dqoOLs7*_?*3@ZsF9}zfUGlIn&cN~oJxSPpMTPM9#%{-Sn3R`SA0vMsG^WB&d z31$EQI0bQ>h6IHmu8%w*mrzoUsI^eFtNu$Wf#A%c9PsED8GSf&?O^vbhJ3p17ic+X_j3>IP>0 z>O2YYoCSx<+@ySo-bwcdQ4cj6YjcWvn)7YB$YP?o(bh~e`>1LW+m8PL2suBNbi#^o z;O+`^#T=pVW0Psv+g)30dmLv6C;Rdmo?u<5t@>k+jf0_)Hx{Z7Q{3>MDeamwIb*kR2aZOh(@Y)etj!n#!usuCdqsi@)khv{U^)ChLGP*e zp7&y;a~+O|9VOnnk#QUD^%>7?JK~M}g$C&wj@d3VU81X!Rz+x&7y$F1Z`0RQWsR}W z2bC0)7Z$XIA}ac3jb(Y&06&28OM87jnsKH)+Pv9-004+fbA-|qa7`9s{86gPf;X7pa`ZBGboio4Tb>nyMQ=7hzGdVH0~}tZ8oGej3%jWlwMt$ zfpbdirwC_c_*CO00(c*$vgR#_(HZ*w>&m)Uyx`#y{i#bOEYecT4DSh7nx39tGHAzb z#Z+e;_ZjW)tZCW^#{~HQ0Mf4F=V9C{ZJ(&Be_=IE9Q5-U25DY7T2Ut**lbdm_9Hxg zS`6&>JK~Iy`1JSkSo6jjGEjP&Ij%KT%U^Q0(?;YUHb!|F&IbXog9PV-qtn}4aj}t| zuWZW(ZRW0E_KC}N4$ zNZtVHgX@~QN(KImrD^3 zAl-DneUahyTABbGZaDib3&MNfRCs-0ppUC;*Lz(EQ5x4*(MuF?e`YxymkaC}xb)7V zE|LeIhvD()y(^s)fNNc4bst0Z!c@l;^tCEcz}qBD8C&Q^LH=V}uiN+Te752@iZm9KiVm5 zjIMF2f8qzqA0VkZ6Or=%l&_E$TzPL_6`OdK(>+_^-CQ)+n|$=IP+UVjG>cIT#9+FK zQ`BcATWKBfj&%;AJw#MUzfXUZ(K(LIs;@`=n);{3o1nP2mu9f2{{UIK)UTN?-1dz- zF+Q0%_6^%r^xcy_u1(K7LNe>w6LzyDbO%THSJQqGQF(T1Qq>ZTvs^02nNv}Z?Xd&l z_Eid_+g^q}92z_`pC1*TWs-Bb>RnCMr>j~jP}MbzKiS4IK=(ZH`ROatgI&CishJpBFt0H(B~gtfj4E%R=5t4)rw(@|MH4IN!H)s)OuT4a=Ys7HT; zhW!RK?F|QLDjx7S)u7%gMfaL2N*N?-Xwr3-tIvs*6$uKiNdPbep2Yn#sPzQx+7(&y zD617GRi%A=QQ7Dxxzle6Jy`wCD#y-ByN}-S#?=@;`mop58ZMJ>2My1^y34v-?9)fn zomEj+ajQ$cm7i|!ynl~XklxP%UC*M`vRf^b zc6bC7bdb^0XCf#NBINft)(nz4=X1@KuE^OKrV0vBoE{aGR~aOoMdqc-=O4At+E$;# zt1R6iVVpl$-06stNYDQO(&?OI%s!rLjD=Imf>`5?Fd{MDJLErqK(FbbFZhVU{Qm%j z0V)F`E&vOFf5%meoa2zS8qgQ6LX5E6gZP5u_0a(IDQpY0gHQshuLl_$O5-~1gmuDU zAOjz=QWk%N&XX4I5OP0FS9otUrfZK8KsJGymD7glEB9D99n{Isn#f|!d>t6BPH-BA@)!l$XlQ>6`R$uf}&7j9o>N(E=y zH^>hv0pnLJa1xSSKtw2Rl=tqPubZXXOI37{yuSq+LphMEZQf-~rz`<1OLpY*uFpiS z1bL|Q^xegl=T>@}uGc{%NY#Jqf~}%q2=hz39f^h+%CH+r7#Rmd3gvayIw$T|OGS>Z zh9{z$wsCK0=-G=&68n%5ngm0Bi< z)x{gZBqSV|*;!O$BPp|Y1Of>MU1*l6QLOeCitE%jxp<6BCMwGz9EiK8w;i6nI{RY@O+GbmxkGoe6y zA$O5d8sXHnrE03;q_JC-#DZGwp~OE66m@cW+GJA8<&|~L3is@AS0T6loax(E>(%0E zt1Zt|WJy_4NT~$Oj0?(1WsiC&47dPEw2h}aUoNYK@S^_Pthv#{M`P)F$~ow=)MBz3 z1JO?!g_0KD9SnnUKsTzCY^(cmKk^8JlwQ8ucD-Aor=INvs%`R`xT)hAk_HT^xeSa^ zfWgR<0fz;jp*_iRX3K56TPTco zi-5Q>2(hvFol36P20*GCBnIa!31C?9t_5)^>49ZD0rJxNBS0j z*V_AiB^`%+RW%cBo}pwBc@YK9?f(G7lgGo)wvqO5)dZW!rY|p@JocIzJB*7_)Tsm} zp6yLk%~K_HI}pW!u_cUbH!AyX9=Zfk&k0=JqRxDOm5=q0j2%B^>sf4e4v@9WEiFn@ z!E&m&)Kh-(BulqROA+CbxqYv 
zTYtET$%WvWnN?!j&gp?6hI5Um+pyYSJ^yfO-(raVZ9i@eb!21g$SD}+g1KRN6<A^i;Ofpo$-x0S?#5ep5P7l$!`RaMJ;rQsGhgrAvtd6ppaXm!^Oh(gDg%1Sz zSy{edj^9J7B0lDgU3=kcwr%6#DjmkdRdT6^2H~H|$8=AH+gz)+dJ1}ZqpD#AG#+8|Ab~m0ufK9T z=Z$N@E8NYwF7Z}lHfrqortVa6)6vscQ=l}}0Lp-jW99j3cx8EcY?onM)5&DCoP;H- z)(}XEcOFl#=1JC_3t9rsRBFGyR7vH%IgPzll&cS3$DK^p0hBYqD^BP>o?C3yfK|a& z6)bY%wwr&KC?o*Oy;ZPz;gwIg8qLu}Y}@Iaq<5 zB77}^2d1?);T9bbIJbJKDSo)Qwim1_0q}J@yTSi-{{3w;$@__Vyd789FrNA zHUM_WQ;%(VxFHaKzcWpAyaIhy25pbxE%*0PI#0v+dS?3fNT_0|s)<|5sJT+eDYhJM zD09RA00{#)2k{M9n^fsz3~~5RPfryNt5YM!+n}|3brsI}7?zP~toGTQ$Z8~L>4dC% zu|B`1G#eQagTOvuEO!^Z-wN7150j+23ixaKf}@~sk;sB6TJ1Yj$r}3kN8(gtfuDfq z(5TkeXS_ju@nHGpzKUpE*d4vi4!$UA zthK&sihLL8irUXUhL83|RL3nP6n}SVKDz(}{Leb9?JN=-sy8o*R*ubMQJ;hp2Nz&SAN>h>)9u`=GDLM8E03R%!UK~(S zIk~cey_CG@)0oF)3aSA29PzmP&Z3|!l=fYOI{23At8-FRG*boyAUVph{+xlx_RpZz zN%qR!uK6o)IKu72=9%exO*f0Sw2ru~Z4E6^rK+Kn29YXYAUtl2IFU{X$O8ltaKldn zjIms!*3!)pY`bIN_Ub<%sx~;^kLcnrkOXi={l1 z&hRqFwM|-MD$3~IVqlf`bho+=tk z$fb0O6$$tIOfwpdxl<#VQ+phac9qEo7&+CJ_GXGpHGeaYMN!(x#WmIRT`ka-3mx;O zEVqezcADE?TSXkt8^-R`(lUHu1dPRSazO9fUP4_oameNjfJj@BZIz+*R*Q7RJxN7Y z)OW3yIu%Sqa9P!)1a=LRxPm>wAo`K0qY^aH;WInHY1s28+`_0qEGa-LfIjdagbQ&%eu^Ge4o?YA70{vO(_=NTMa<+nh5Argh0 zu2|%DIAh1Rul3L+qai5Va7X9)POMuF<&g4l$9He#r!E5|CE4Ip!l%jrsL1SCFd6yk zGSYIS4%!n9Alw|eMeV>i`tjRD4*-EDk<~!1s4EjL;NWqV4THyQ>NhmIkJ&hO-cRLK zYax#qB_?3D7dZy;l;ahFHC}flfLkwgeVT|e`;aE}z_@bT^U&lgtuhTbBe5g{a zQ%-ZY5z00aPp~CV<*DLu2Gvbud!jxeewR_OYno(jNz>PiZy)~vGN0F4;b3?Wpj4-k zJ1cTyU_ka6{(ibc%5juV$xG5TQ%?<$+v;z0;iHawq8f^XV2U>tl!=M~<>zkEjAPv0 z0>%_X8r+^qI=bT(9IBApWt1|{MO8jh4W$R}j9ZD{wnD5`$_8EsBTSaEhA&Q7jqZAT z_fY=Sxd3O9f~q*9^MgVV$`fYkA7X#Ci)#!Jah)v+C-LtH-U$e;KV2nE5XE7&+^;AS z5|u2nh{SHE%ap#^BxU$WECFRB4i`Ds1DvBq@>Ki$*9hvQny&3nWVbAky4AgCj!EMO z_sGM`-WZ+(B$&Y&8OS|d3Jsuv#Y5A*J#@6sa=1%gmme~Ts%}(XTFWq$ktIt$Herd3 z61z)2a56LoFc*0gcFl61sCqOltjTAq6zv+rZ&HZ_fI6&d(x%b@1CTpowscR52gGGI z_J}MdEA_^%u9^s8S*xjJX)9rRg%}enHs(^B++i@JGLi<+bPWQU`V|u2)Ll7irkv1H zS5&paN2_{DT7SIC?Ck5i%T^$x7@P)-GMoa(oa>K*?gwDr02 z*#pJ_k87+_;Ynf`rtF+>03{Aad0mP@X?jj7nJAj!eXYJ#Eiy;t+d|f64}9Gb}(Ql_T;*~n~wTMI;gb08J$oAn+?z>7J*i>k1;Sf(rPT62sgd?=Q>x zY9}_bpmTPW^-VOk`GY7p&VGMgC8)VYt#`oT13uCZa&i9vq0wBbelU1-Pt{!;cY8&e zj(hyd3QqBlFpa>z7=y_Gj^4V|(Q2I>tq*s?7Ivpj-G$6=#IU-Lvc>AN!)l86X(;-Y znVl$OU`d2y5#;1=BX$7Cpa-2=?LO2!)RF1Mn!)o;+4^86idoG7ayi1uZdMwvy*&}7 zWRf__n5Kc!R8icOATYC zv=mVVV^J(;EtA<7KHTK~n)Fcieyd#_YF)5+4t+d7E6i&D0MvRoUPh2_K0QCR(zG5K zbeBWXEp=2>%Pk-b*9)VD^3T2-;9r&(KHBs;KFiYSNN%1nyTs(L53KDyR;ZIJ@!&8O zvg1iZ)YOqJL@RWFfh=wUu^!Hhcpp%8pyOtoIQuNR>J?J&&^1(mwId4HNXUv@Zv)sK z$MeV6Oj;LiD=zC_3oQL#N@(u11k1N6l46;8{ub^(nD@@2k}&a}RFO{jNG&T2>L0Uh z)2H3#u~gJa7IIZ)LjM4okQKiyeKkzN@xZ^z(Ppqe^H!^ISbab5UU`tUQ~L%Jo~f`hsR+lb1nTsmqQ z2@0w*G4P#SYouuz_wVGLtZ+A3K`yqDL1q-uK`gGaZVZjWgWo>-(Sgn~ZgJ`0rDowF zCl-cTsFHCbNf2ny0I>c;TP?;_vQb$nja^gjc_fv{21!du-GCV+l5hY6K7$&G@@?T- zjuR$Iv_W?Lk6^2UKQK)yfQ*Wy9C|iLz~u5WbB-~r=QQdZ7~zgqgP0jk9TirJ;af;+ z+8T<0l~9=KY2c8ksZhbfI)jH$=!^l#+@SXW>sgEq09Ok*$WCoqJV9a8z7N=}@;)l- zB@@jbsP65b&6H~*Bi?GB{{RH;!);=Wq>ev5Y;dWh7TPxB0P&D?)|GirPSaGP#Ek4u z{!$Xy#!my}3DGEJTe>XyPgPAQ+k?PAKHr!5>Noi%HOiMaN2U7U`>KU7Ku~d?i?}3u z4OQQ^=}AQmgQu3$S5r+bRYKOJo=Ju$3b_ZIx2Pio5yp$&WNm;e%~q6?&{vw)Y(5}Y zf1>L{TX3nQv(m>Qh8h(Rk25(N*?u{P8QtAT>_OL;n@Z;$jL%&A!p!d7yWt*w&qZeI zuMjRa38-kP1t^WEl1WfD;WwBAvnz~eoO>QKsv~$d z>=e_=_0XDiBhFx>%=J|xf<3nnkQ zfat77&GcMNbn+gYEvu<~MzVD6qJ|rPQCQ|%tddC#H5HM;=Xql>XFXOU4%%Or#`Wk&d!uy_~;Ic#t_ zJ-F5@r`5JkjiBF93M`^A!k=X8G;{@|RU41uH~{+U{92&yc~erkQgv$(#r&phWV5pd zVgB-*`u_mwtyWEFP(wP4uZ=Wggr*8GNMcz0{v)9-r(nBI)&f-{r~2voktA3eN}Af) z8<#qYX;}!rP(CLnkXx$`cr73qMA!hL^YK%6$VHG?_;z8Hu?+!g5H2~ 
zc-4>AelMbx?iyf222WK%uj$mj-B#m#b*`Bkd=x-tnwg-kE+=)6}J0vQsgqmP8OLGeY!lgs zaj!$5*N@RVt+m%HCsU@}-trNf=&7|n1oizyRF}25BVVL#^7++lvq0FuZTKONzBu`J z)$!UoOvO~tsJlt2bx1~6N7x{*rnJ&UbDAoO#D+DNSYrsR;S@2zC%XnYgl~BV!zld`BL?BcbG&Qlfa(;}1@F3()iRCsD{* zH+eL6_*i}7zGAuZqR(%3bHa{KwwbNGlR$EO6!ApqT|s_G_^0CC`^JvHuIh{AS zDoA1xm)|P9om*=!Br48+FjCfeoDvJL&*IuR`g5)=7lfh4jVG`atd8HP8UFyLqw)hO zs*l8)Y1(2{7asiM{4{6*WTg(>`a<;5spKwxWamGYqH~n(1C$owMnkb>D#P3XkLRi{ zAevWO!Vf%5tBfCh4mAM+!BYZARr}Q_dUuU-8I*u>P7k5{wBVbxT+z2H^k>3Ot^WW` z{3+>6{6vZ5rA2%k?2vH4`tc$W{(9MA4sI8eb4x9Up;vptiE1OBFbs2`WEk&}>!;6E zz}T=?u=SN)SD3agmU@W8H}B<^I6-W**d2>3-dlhPLIGr)09PY|a%ff;0s%?t3x`ZYGD~t_+EP?`NgOOp@W@nz zToT?vU?v3aa(1{lItIER0VtJ{=lgb&dRZ+u8znFjA^{uIylM`{AcR2_naKqUf&gIN znqHm=k>F4n}CqVcj4cc8F zR`l0R)Wt<>pmjAS1coYT<=oKA@rPLCULglBh?KA5AOqd3oFmUG>y2Mnbw}Ok<+APM*tZ>6>kISIfIs8rowVqGe=vhA6%? zjS!dE6fwZ$0(m&jiQ#dwyrJqkd9JiHmK#gK0jrTNBlc&OM-Hsp&@$vmk?&(BNOdGO zatN|{L{()&ymXy?+M49IYrt!9O4KWGr;bUWiO0brOA{x;xCdma&I*i3xyGX+*8;tv zw)EAOpg|pFM3wc-k;_K~MBZfk+2mDvedlP;npb%k%MHgk(IpyRFIik~RK|Xv>B(x+ zR1FP@|!3PWhrIcrAR!22x0kY zNWySXktI#Kt~#n&o}Q*7xn48K{{RE@I!`4hB8Xa_N_D*#R>rhdAX8mcd5nxCzcfG8 z9YR`0LagR(QtNGUQc7MRMmaojr3xh?*?TC#B;2HW6VAC6>>Xi0P~57m_F8pl>72z= zbE{I&M-KZ_9myV*hwYs)J0F5n@L|50BNecu6 z1d-3nRb*?Z0z!@Mb456e`6v2*3ztgL)6tt9wdmwKma@@aUwaD|qw67IRHBtGh&~Idn_GJasHwPf(4%+i`LST@w!-cZP zhqO2+s`^@n=on5s)nS$X@6h^vbzUY0U3JQLWmWE;9aK}~(aLHgANf@uiytN*;y*o1 zTWNabXWdKE7OR{6((y!#GV-f3jmz^^AK-M3U^a_hL9nn-5qtrPqM$vlS%_9AWTYSd zVc+Ylm&LG>)ui29tKw9vH-}d)n7K(U#*TSmSpNWbG=t2JG7q@Vah^!jdjaSa=4QB6 zr>f#rM}IOWQIqiHzb!^u7&jFAybxOZT1+xRoDAv1FljhGDy?&~mEX0Bk8pKifL)RA z@}#v~GgMpMA#_KDP6L2PAC|8N;YcW}*H78}`gtniiYl}VhLR~2{GsW&N5rG%Mm3$O zJ=ooU+R=hQ4RC?qsis;9t5Cr7kyQQA;-)V&IZfdCTl~U=1N*?GGw|-+#-!6YI0pgB zo_8InRg><6Qx9yP)-nkB zX*=c&T90#lQq)#jWm!|srgaArz|4#?2smIt91=VBC*MrTTI{r=VDPVfKW13wilU^L zu?mM|8T8;G-PDvJ=U%X7}T6dJi7U0f)2*R8JlbrFuCnL~}BSB}wReAON z@L zh59IOlTTl4nF`4qa5MQVxGLdPhGE)bM{XjoOK*5zQxt|n$?*u zqFbyN7G)FExpJiU9~T8ju*b-1Nuhu=fsMZJ`}?cSoI=vr##PFP$Bwn=`g*HRPW7GU z8W`g+EizR3%T9ygD;ShNmT*oyy`*qP2-Q6_voy8OIO9Lsta2^kXgw45wyEg*MLjQ2 zK|RW7U5wR+d_6>1%7r|F`54IqB$7zfF+n$cTHLh#N9{^5*OE313uLwZt$HH=07V@3 z{-sM~t~jE*K`b&UKf|~wLPxr`HF9X))|$t^ehSXVHv}QQN#mumSWCVBkjJ0J0A?Sr zBl+tg+ppqsaMS3I_!(W-BdBGshQ(6dCs`4YR@@*oMP$z9X#Ca&Wjv3CbMbdL)y*b@ zP%cKak6e8hGI-k|+-x7Rx_F!879&wPndhYj7lK*XG7*o2DL6*K!60V@dTQjl`9KVK zIQO25&}np``3DEzf)n6(j`qHulCnF6lI1NnT0>b)R~(;rQ?y1ly9EOwhI5m+01mF{ z`$9Jxa~wYP^i%6QJh`B68S%ooc$xN}m!~Q%2d5~l*GhTmO-)MHGx>poviVp?1}4Ji zEF19yjCcA6W@w|=!e(=b@Wp<4UpMUuouH6Mw*G5JgT#J{>&lq=XZF1*y4A#w_a$Q> zNlFIU6#IOsBtxDQ^Mk?38mB&jc1B$G9lrBigaq))14$Noe|%c|7rqKc!wWs&Ws)dwlh9DXXGpdgO!>*x>tP)14I5p$h3Ie8=#w z*Ij7Rta#VTta`y3Lg)K^G2%IEMz58{yIDBx~v4M-bfaWn_K zd8uK6(;6Hv$^QTvJaX!P9(vM#qUuVemA)wG=qZr>)`A@8%7ss1z$K3?c|L$tnqB8> zaM9EEK7PXAiXjtdD%Gh=IDN~wALCLns2blDBZ%@=7{fBxGZBR{spp-lHNmzl1;qH2H(j9~((raPGD z=YmJ;s`x5;>V#b2QyJ_Oj(Nb+izEc}7f*eLdbC&c&s21b(t|a=p=+rTk7(t|U=PYv zWB&jiwY27eYnPwZw`|+G71SYCuEI9_CI{dT?4CgWc>J|!fE6~Abe9!ZQD0!1qG)94 zs1_D_R;rWAWSL}TR+D*FF_bF73|n)MPC(VlDlC7;l^X94PSn9`xn8=ChJU+5gs6_H z6^dzMkTc63Sn#@E{z+j66ivPlrS7Diy|ovlAw%h zh4Q-%I7O5EwT(&VlC8 z^HHpQMN(s?rRl2-!r3%1G!=;sAWCz)4$2)^fD~Z~1w#Tx2^t=#_0K70$9t)#Gv4}^ z8X9_dpDBZ(rzG#milX1Ir*{Y^Bh}}g$I$KYm_LQrds9Hcj=`{8+5kIQ!CSLxI$4F0tcFe7-Wc)W&;Z# zR*VdDopH!Ux+w|M)jzPS>3XK`9aSxT7=)-Uu*DEStihisTPHqd&|i#$jA=`3QC(xA z_N3VQcBw7&cByWc5LR@gk8F&FLnEsXHL(zPSq@`ep^qVVXjP-tIP&#f`kf_$2&@Nl z1zPg6OSAZJyiYQB}} zqz-_iDcl`6Dt-RCTLXn6(E2OI2N^tG`7dq7AJ=)6um32h$xZW{RR& z5<9GBd4rYnBz@%e^euso+VlFUj-qbyX~xd!jbgh~Lg^#1AjC-6p)g0kIp-(jG4s`6 
z6~b&Mq=<}eQn};;F_Wg@FA5!Xm1cl_;+?#s{{Xa4grDiI!A6x{=i%KPjhJ^)vvLB0 z6#oFic+fe*TBTRG_A6`;UQ-AD(o(L0xM&>W>O7Hx?~hX=+)Uk3KaqF!}CM zGpHX69fICejFrqb)0LR@7unjTQ1V*uvm&0+Rl<`G`P3=>LDWvuP_R|jYRKd)!>D{4 z>YHZm*{EE9wSu3_U(5rMsGi4;LbfN3%7b_4Yn421)6}}Za1A$?pZT=yvT{B9;YuZG z77z+GudvTiI?S};WB6o9+1Q@MahwC{4<7niBDx@7zq*anJ+uKTM^aTmC2**ZY>`b6 zGI?==75#Y}bMoU=w9&XUgM;+`)wp#|k-r$CfmKwQ;7Fs0q$qeof0_B}<09HiiL#}9 z_kqF@T{LW05-Us2;-|3HiX^Ca+D$4wY?(xMNXAaVk~FWTgrwG1jbO3Ga<<1)DohKn z;*oKZ4<{KMYszZo5Xf2xrI9m;c?u=U%STSca&z4Vc^ijpAeG?!X*w&mNkdCYIQ8l2L!C zFDAIuPXtjuC1})x>PCD0b;1UNxM5TfG#r$QdywJCWA^0AzwU6GdEnJ@8KT z;skJ4({%M?3WOjuv6z-ToM(KP9QV($)soqP?&24fD3PobO5eOX`wGKukXGKVa#Dql zm@Vb1CC9D~+-LpNW9_SBM>NzXa0AGHDi>-mh0s13bYF%&D_3}e$7+sS<%VA>zFVa% zvR6RZC0H5Md@KPf(R?S<+gkDHBhoZKZ#{WcT;lf-;;na1q=M@~4P7*%q6&e!IoCfQ z=iear2a${&M)@ta)jVwkSX}+%h?+__u2@UR8Bm2#e82`q?EV}9_`fcA))uN3%F;-q z=(e=_zwpJJgkQry4z8DbK$fnuvd*=X?1GsH$}3|9hDc-Bk8f=1_J>d1Q;tW5a+g!u zCU%n1N5u3&EWJ6={R!2V%RgA9b>@zjY*1BNRel!-EFD;{Bw%g?fJZ+1`lDYw(6P`^ z@;}n~w5*MjIBE$#o<3+Sam;itGqbuA%PSDlJgIc~sw^~vkSN2jNnx^)n$KU#fxrWVwNH%hRcv~; z0mtj9?vyl1$G^Ik9TEprbly1lN$>Ul0Is0|-C;@p02GC3IL|rc62$4wdZC)(OobJP z1RgoVWBTd?UJ6##@}K-CIR#IkeoxC#*Ht@kX+gQEbnmnSpNOCG)Lbc5da)lFTxaXw zP<%I3r58w5X2OAF*xt&cBpm|L$O+owIaiOe?^Oruw;tip`~8>bRjX=LR#Ot8T~5)TLC^GItz+UAI3m*j0MYx+`q6%* zy*(Th6`78i;6Wj$kFhRFpe#-pWsV8Pduq`ehEyV`c&K$$oiEaLaz|4=Wp#VTsL)ka z%QG}jFbWvEDiP*C;*r^aZdk5`C$i-}%YLf7-KZp|wcIGIoIZ@Z5F1OkyhWB%tN?9TD9+r@^#K5TsA%KjM!5kJx!!7~ZNYKDs zP7um|tLaM}zLK(%j;dTV(<)UYf}$B6ce2K=HvB_(`#s}=tCBY8UxI2UlBPpl)cs)u zk}r`3vZhF-h|eR$@G|*qRV2bl_>m++8XTR&1dcREzkDvLG>jSutv01g-PXJBa#o6% z=a#)-s{v%hUlNrEYTz+ZhUXx$*LWex6ogb=tk&z|3r9>(Uv!9yh&+mEeAya6LrBi0 z*n))P%mcaYr2{xbTC-YTsO776x=~O^O(DXpGP1mB@%|upmnYx^Wk)9*3~1ntrQ~pi z+Po~aTj(IAsd=m6jz2HSbFvaP2;1PvIAsgDcWosBz&aV~i6-q^qg17iD5FG;5kzRK zfDw2A4`Ih|mVvTYFL1g3(H_Gi8Vi$dLiSz99)nzpVd*5FoGRzG2l!|sOV8Qrxh1oW zcPOT%r6oxtgWH_vT~v=KT~z|HAwdWGORhv9mA2;RBY;TYa&)H3JQZ56w_u;*$9(gp zDeuKr?iTJ2Q2hwfk(DK(RNg#Yyd}EQQCw-w)23#Pm`9)WifyEBC`sjZw(Y0;JafjN zE@>b;9%z_D!6TK>HXH4e#7hi!YaL`a3K=_118xBO9Ckk=jA%Kv%A(*~f}mV&Gf}a0 z83)jF&X)lWiXAO#LvW5+W8FG>mOTA^4m~u43q(;?J2lEEY0?FZ35j4NBjG*IJmh<8 zfE7!Xf}Wr^4I(yjR|6U#5V#<{ab0GjvosfZ<&GMxtbcZmo?XlF226G;Kp>Ji(mtyy zDGNZSAk~+9wPbX*d!=dVn!12IyP8I(I?<27t-J_j^(aZt@SQD}d?y7VbB#2b{m^@t zR@gigsCasZsV-L=g5fB)5mf$ z=@k1p^>yC&*1Kx@#V4ewNt#PtUBTX3iE;4ljlo^RmiSwC2W)B%(bLArZIS%IR&qODb=nJH0QKaD;X{YEst zNoueae&gX)Ek;ZgQ$|k!HXVPTKhHvw*;OmYgEh{K15#BW^pF((pp632)e!duo%ILV z#u~lSRCLTQ-Mq?Og#B`)AI}<1qzoxL&v{v`=j_8zG!jKfYVE;^Bn8}{`u-q!{Pd1# zaKe~PwDPF6zhippwk0}_We50LnHYcdF1T(A$N59S4R5h0Qpy0Qy-??$_Vlu@N2g+R zY|`)ZKlrH@Z?b1ybmc^jO`78o0}BO2k;$C;Z;v0(Rp$O4`JPq*?BC4)lo{c=1+Cq(Sg`WyD!YL(> z$amvcqU`d<+ntXkY9i7@NvrZy{)y;Yo%m8gRx6~Sh%i=+WtBio=61xYe>%D z_?78MYz^8nR+lGC&`?<}P}`}J5lKxclo+ujOc0zgI6RMiD_qADoc%wAMccP-cU8|Y z(Mb4Lh>`&K3imkV3};$o;Shq=xZt=ic6Gh=CDEK zGO^xI85zz$Ep?I6E1Qc~+Aj$@imWqRpq{w685HnHcVo9@$Me#V59$j&&Sz4OT7J$R zA=XT-A5H95CCgS%6eFK-Sey^0bsL()no*e};??`4zQw&^SYvyg&fRQ6ysR<3OquzU zv3MU{E1AFHuliNpDCiccWcv;D=?>|0{{X~o&HVK+iMMAzbzHg5HcHhkb|_(%GSPwq0!wqA zdHl67XdMt-+t>ahb;MU1-kG?_BS*~wE7NQe{C@>NlZ-xb@q!d%w;9*5(`g_K60}`s z`t}wa`nfljQmIq*t~+X=v_0M(uQX9bKbWZ7i7P6ea(nvcxz4RLX}dwZU3!H8 zcB4#pyBG6U{{R600BCrVZ0ZWX+AVCj(AK1HTR}{T1$xMZ^DQy`*mcKs+mOQ})cGFM z?B;1@1_M{~TG~mBpq>c#!n@xnE*8{`Uh3_ML-(lCIgOp!%df@eZ1Zu2ARaUF8pI*( z!^rw8a!6ZSin`f4I^|N()PJS562jqRLqSUnF}n8$AQC|b-&b14obUPt87_8reU#3i z@DHWBuK6X}p(m%SoOj$JNky$P8fxHA zv{&2a`PUK4Ves{M!KKT;%Z z=il8vjwDhy5<3vU(S;6h`5v09_?HlqyDl6x-R?}SQT*q5OI#obaoUDO|} zmZpM{;y!F9V6i%FAS|k?v9{pE#GuQ9ISawIFjE3*z*A@IJ2Ky=meXYYvvp@=o~D`= 
z6Bk&Ol~nC6*-~+hzkrTN0QHMX8b>N@H634bxJN&gZlR|QpDs|u&NCp(7E>zZc}=(A zBjN>)m?axRlX8OHXzCh!^}@1YUrk0sQ_{RstQO%?^1@Nvd`=!UQg(o34!fVWiNQjs(deVeO<8;pS$c-+ zZnjiPBY3U!l?at;$h+AZ73|1%km~rtoGR#K{{Sd}2(XRNbPa}%X(VbTt*M(Hbk!qq zn1ZAiPy)B$Q}F}aU0oLvZCL4P;-scpYIu^GnlXfe6%oXv*eC!G(_Mwvl)UUGXd%8s z8tbGN%ek8v-SYMQbPD9}-7$lXPo|Wn9%=DT0+Mol^TvfP!c9uWM@{F{O&u&b{{U-H z;v_#p*auxmge|Ys?^PWwEEP4i6GrmJ-cg1ZRVo;XWeN^A55gP1x~&u+sk+YD()5zl z-@20DVuCpqCyI=D^CPeG`>B|&Z?UIugPqTG-I;0iF7$Z3mgGfK{PG?L8nmp=Jo zf0hT!Tt#6UJyctPmH`T_9c65i%8_|gaS0tm92ab|fQ#LTB!So+bzTssOdbUUTi=KzX2TR<2gzCP?f(fzE*lgF$5eFLeA;^y5?*gpW%q$jAGH z?P2Z6Z(nUeBSmFcjoDnuP2yixco};Y{WEc>q@Iqgsj8tlYD$F90$6eZJYX;ZhIk`Y zn)4sRay%8{#>ze*y7$44w>L-lh9I@|VTYve&Y5SK{{Yk-x#4DG!Nv|44)_I*rW;-Q z4ty4TnpZO|$Xx+hVWzKNG7_#m7a)sdQK7fR8j_ z_8Dwwbw>qAuD9Fzrr~d?t&(H+K-($90TYrz8^%5?5y1U)g`1saMCJ#OPeo&_j*j(7 zEgixLYbjj+nGKQA|K0h*__Qe`jkyT~=Ev61t|`D5*spYO*wh#t9&0 zw-bz?TV!4t3~s(?zLhnZO4RMVlvi5iplwZ~Zsme0ILMwe=lEd$d;gRcUR@8YqpnUb!=*9EA8O6weezp3PfHyn@ug&ru{sG=Wk}EBAZH`CGxOJ- znZ=_D?9ZV?l^3CIwmTn9Q&ZL6?C{pe%{1U#BR>-yjFQ}tFh4CbC;$mYGB%8-{7Jb~ z!PB)cS^9<>CFNBrA*l1C9|$?a2RPcpkDsQr^pS6n7c+Bx_$cR)wZ93<&bmtR;|p}f zW%gS8Me35D%x}BaK@_{yFkcfuP_71eJ+ZAhq0o1NDlI3&`?Xb*YD;(3kH6Uy=pVFw zqpE5nqq|ZIEllJHXNlTMcFr&upN$DVxGUQvYPP4d9}7#^Y!4n(@aq{RHP|I-_R8uj z-6U01lJ1UN?QNw=AdqsW*CSY9gOIL*2(=2n)fH_<#EoRf@iGEFylNKx5L37rkBW?HZ`rWO!|GTQ zp|v4#>P4tAzx&H?FYpR>UulH zqpNMVignGpGO$T5+{CC)#v~czjF8yl0|z?Jn8?S64is-L4!(-jZ@%1bb?}>2tRAuQVbKHMm&1z!(=Wr!H*Q-<<+!dXTCdlBYW`@`A2ZHR76o})|ud_ZK}8@ zCqBnK@Oye|({_HcC?hR;&DFsqjs=S(>kqTPS#-tDl`XI}Sl~vv z`ROkfnyE~;SgLDVbb*+kKr_a>B`GTd@!Rdi(Vh!UQDy1MWa}=ehbbaQAW9lX{{Y^} zgT06L5M+9hraN1i56h~%7M0J-uovgIirzVOpN)5xujH?>YX<3=jMH zX$c(ydnF`=mymJJa$Jw=jZ0;``}I*4gMgY01K=NzBz#HF*SXH6Cl*w7Udr-#*@)fA zBk=G)UIu`9AdrUITQrYE;PZftLh?Vj$aS>?t)Y3T+p<>y>bDte zy)#pOp}C7~HNyFEnc9YWh}Mcx|i7w=0I{YoDwmvAoN{85qx{~P!`L3V;bIij-K63=82@H zj;W;cA}EMq+(biRV&Iqa3vW~0Xw~6&Jra^GmGI(%uANqTk|I=^2MXc*xOXa*AtFL! zOn?9fY4y&zb(f;`k6+atD_>6y3)||XO|S`SO2aCL1xn0_2v$&Ygvy4-52~9e5lRnB z_! zmgl!Q=R_q=tvYVt20pFn`e@ao%6?BOmX)d}c>`uH_UDsk*}7(y=VJ%iy*H8_FTJ(@4asDP|^P8(;trC&?c@ zDWQbG6O~SjuWr}aqgW}5L+b)JGMs_LI0 zJJe=W{mx7zM&AG^S%_`BhEF<8s9E6^ln&>)_4U&(K>C`@mzy1R9JDc9BbGd^Ec3iE z%D{$D&mYF4AY_kEe%e=CM&^r7$89%xPe~KHD@n7=Aa4jsP;tmoK*;*@qI1gbsBLmC zNp48?^!&86yFFEY=T0Q=!w+3@Y6Vidg3h-~bxkCP%ZhYp+tY$Eq;qjefugv}#g3GY zrh3SdRi=9CPc_&MJot%LEHT-Nf_}c5r6YM)<8c}AT_PIT69L+(~3r_KHK6j~L(#sP@%?J>mKMs

sAn+@?{3-ijcz5E{Q&25) zS#Oi3e`j1*W0E_BU`omF@RD)P2BiQ$^qW~y>Ew?!g|5|DV5^M0SSm(A8cGVO!UI!e zsG%*hcHft8(?f)9HBWV@qLbs?0MCAZmX1Q?dWWeMz{8H%`e;{FQB=9ZKHcjath}ip z93Pf@X{_{0S_%POa=LXj_M+2B)mA%IC?9&c`Fm!mB!9D~bHa~z86(pgfZNFf0Ypi0 z+AyK@7lziTx@x@Gth9#iBi#)|iRRAa5Kr$U+)ELWlkcQ_U^LK5_{P#e${XR=P+Kmt zEvD^kw$t6yh)k3b#M3S^K|;JpJKziv&XPAZ_${K6Cp6*1DDPSLJ<}c_SJT#B=nVA` zL?Kp2DTL2%F~>c>rnU6?*>vsO8Ae?!a~;PXi=6E~AKe?o+U|^|rn=iYexkjSaSYO| zZA>$UcFxxLN3Kt%y)o1?{Bb#=;5@lm4QukmhGNm9Ix7DFXq3)#+qnAci~Fz| z1(KQa5r8cfu}yZWv%I%BW23HbFrq4ev93N6*<;(e$@J$Mu^BF{!jT2Vxfxvb(AUov z`p&DU=KaHEE2M8hhiVw3_mBMgIs+k@L)G3_jIYlcGf4IqmOq0`+XRfckhMbB7d>4gP)=8CVX$z(;QaC1R+RR+GsI4zrM-2L zwvMo_uCqOMUFsuo6%zTJ`FZ?7dE_WQ+1IesLSuwKyBn+OuqK$9OLyY_h;N9tD^G?O z`wiEnI;NVcNn~l!>1B$dXb9~tGDpRh;kn5?YF#^>s$-cvb@djbp7i7vKFZhnGpL)d zh^VsMjl%P3rG=_4HB`$Cbx$Kc=p|HD0EJL;2N@mv>&teAi0RxO++E|zXzAxOWE_Qc zr|Vl}a;-JzJv(37pt$@ffB16#2OpR^%52Oyb`P4p-VO?`P0|%Tae1hzxZP7BFOg~t#77GR zJD+#SeKW};JPtiKWk&A%;p3{St=b~|Di=$5e`M>w4wjCJmN>eVSyoDzr-oRR0!~3j zEJ;zFtQ7a>jcmy*x_RyA;aKfta>ww4xfk#Y!fK00h*Q_kS}E&nVdAKuNX%u#$iYZ& z6*N;E$$VWi7t8x&qMov(5`4;djAL#)A96kT)rM~a<5IVEWlw64B$2;p&%Qq` zKdNvPS#MCnd2PI8kNW5)q#nDf&;qgZ>GRPfA{$6kT&pfGwLyK|wbqUiJ7-N)R}dXo zp8G)TJ@NTyfO@61nfRUdxbUmNNnVP3jNMOsOoq2~_m?qppb~+bh3%F9&7RD8ekCqSh(6h-# zycC*hDMO4LW8WPA06j@yr8X6chh%HH&=NQ$2-MAC843db02E@Ftgd{im=Z^G-%nI> z67Jo>M53pV1`mc&l1}Daeq8BnUK5ZRe-MrT0L*GaLCH~p;h&n48Cib_ z4By1;KbIOzrlC~fOU8~wBn+@QIVW}pnH8B&4*a8w82mVP^&exCY; z!ARF$Q2NHf-MavsABYAYmO0gDHyBe6P@2?{0p(eKUHy zP;y8|C2qfBuBBUNR`nf%SIX8$9b7{`z{dXo{{XLAx>wjBg@vzf*R|u(b^TTU0H!)- zd2Cg5My04}p{S;Us$ai6$%D1gW@ci_Ny`9V8DelVt(=}#ZQsRy{qSUlA0A#>9!v_RdkR{ zLV^{8AjZ%*BzEIO@{-MxFo@JyD68k5X{afxCa8!W zY3pz;uvFdi!o3;+3I71}(iM#V0Ki=Qv}tk~)qPb@(i9U_^$p^er=fx$ygfZ6Emu}! z-Lx>=nZaTgD+w9ncdn4vR6w&qRNJoW?~+u%LUht!t~Aw22klGr@)5D3^E}wx@Ttf^ z1d4YPl>lkHKV{TE0hFWHVik(E4El5Ra8vyxe5*d1f6vYw7!VAiByKi z$PF9>QW{`eAE75YTIjn8mdU`z2d)Nxo`;etsbr2YS(kSmyPxHuYk;*9YK-k+$k!uY zDh$lTcf%e}rn<>b!husF4n2Z6{MMJjn*_9H9b2e!M87cFDLpR~VS_(ddXX1vKg zy?FhbU%G01doE7s{{Z1Q`D(*vJUb)bRbEILCo8TzE&l+-N2TxZ#Y@*V`8syEk_}Zd zH1uR=xo^gtae}Aos{pc24SN3oD>h9VhFO);{Xco>uD`V{%EfV{xzR`g6VgnQFFxf- z*A@}Vpjzf&q*rvSemKBk{Pdkr2ci`%Ep-{d$sgHapXZ=#qifhYf~FoMMgVc{4uGSQ zxKb@g)k%OibNXjQ8eK@uKTtz6D==9cb}j}-(DS9C(RErU4bllPGBfHkpXH_q(s?F{ zr6o&chtC)uo&LHWNL6Jz0uje;0ukj?yivT=Sh{syo;aCmCh{hE;tr^#NgQ^`BaK&` z#jW8(GmBe*qFA~fLuR*E(N@-U_0m`tNuz;*gt;7-g#0-?WDF0MgJU29!(W7~j{uAm zE5y!>>n^VF`sHVe<8GR`%M_5BM0#JmBIkTkr+86hG6-I5>Clo8E9BQ z@HfORlIc6G{VHD5S5FGB?8{w4y+uT2fMJ-M7y$0u&T;9lPyT+ISp;=p^*nvnNYl*= z3&uV9C|(Kpdqd)%QCz65^mhtaDc}>Ds#a)l!Gf6BmBB5Nf3?BW>$N9M7%|pTY2gz% zf>%!5F5{-nyQO+|iD$U2Fj!)8L{zY-!n2d#>FfFH!c8%A&1J=U3)xI=RcEX1$E8FS z*LJMCt-?aJCX%GlLb7^p2)TTMak+2^L2V==Qz&@by^3?F`Ew(A6EVH!*sQWu~RZY^WzDrS-YKC_x@*gZa=Oiy-$s;p7d%NL)$e7ddch4>Cn2a}WBYg1F&416HO z?~5NLMjb1}q̋XZkVu93HjD@6=cH06twjD9BfKKk?1OC-+R^JQ#B6JZpFKAQCXz~tl~zfnkh`!Mz{%~&&XQT3EpFf`qJTt63Qam&ZFLoM)4?276)omZ zCP^|kxg?!a3}Xo(WU0eI;UcH0Vt^dUgXy&68k2Hus5M7fH=5)yXviBeoc!~y+!JlC zQ2Ba}uB({AAoj<7C8yiauc__T_anK3bD{!L?*wj}rXr(^)K^U$F-E1DNn9$0^(3A# zrU5QV!o@sN`)_z*;Zw~)Xm{&N+<8+^Yi05xZ~f%t?fjP~>83kbm!IWZk4Yq%I17oq zWcywCmE%X9M{vE#X6g4iTMd!TJvbQu0L*0~Nq^%w`DzDhNW^#_f2!7tPaJ#K7aC`jwYGyWqf$l`9 z=-+W|rbCQwL;nC>Oy>_p5DDanDGlc`fyc$4@Y5S3Auc$yN}jd92GndH?cn3{)MRG= z0A(v*g)Ibi8+R{1P)F;X3oTA4;H4NgftG!pxI6*$+oCd*gwi!Q+OVu_D}W5Y193X4*1J zP4ExV{ImIL#0~-DDweW4gqq08{ZN32WtvD(M?9>pB$13W;9wHNz6S$6hP@q&UUO}5 zr1=jo3~`=MIV9;(s!ydRF~dK|>8jAWPkC!k15YVf;7(dcV3DuYj|1~M>aJ1eg$g_E zH9~`LNv>*pNl2hU{{ZXcDE5^Ms-}Cj^=$O;+@qd2sV6f>Q%Zp17l{@(Ld2wZwDo;xICDL6elYQRQE9ICGEVpS*Sz+e+xDmc%MG|~LRX_)k$vPk! 
z2}$YEUztlU#Ln3KC77IjGu!gfqy;3MsCdtBT`p5nOUbkh4RY#Cyu1<)3H8T)aa^Ox zIS1l!0M~k=6B1Qt)Q?U^l%@ehl2Spz03T2^r5{wo&yMFO*F0;Hu%J}cg$Dq)=l(hg zsSBy9+PN*pGr`vyTmlVSQcmExE!^N^K{k0ns$U?I+4&y&?x}z@v!4>}l3eK@%ZqT9 zCWO8?50J`zhal=Vc%nSlAbp{2GQ4&>x>AoJQK@CAjR$gfAYwn_1M}4>+fKE92VNFE z8P0p^p;wdPNH|=Pp_R#HiuW4j%kMCN2yWiTw!4+cpHFo)&!q0DbFx>@X}BODRZ+?1 zOM)Z@VtdZ>fB_o>Gig2~kUK{@a;se*cGf_oP)N#}GHL^0RGZgY_**g*(9gN>q z8TTFaFqx$MW9X?#qhr5L3hF+G`)&Ab)`Fg%;U!;BR!(-u9mWsczc^e)O77V1#LB4L5dkErCAYJ92G32fI;Algap+IMKDmJ1LaD5Oc$D=!lHxoJZOg6 zTPFc$Jzw^u@Mptt>s8fsw~E-o{{U6At5rx(%V^u*j!vIwY00Hwx_MvHmpyog{{V@t zYx&f@1=6t4#&Q<6{6vytj1`JAfPda^%SdEkz{?ld z?Vq`%rW#@FJfy;$eDkPGuRLL1WK3u23XcG0;O+_rFaW^)H3k&TQzD6CJ9r1a;r%rO zoK8Zp)`7}#p;ACoC;UCXsMi8T7cSd;DG<{7l%2hS8R!0O9m4(p00#|whdu+DulKbh0=IZ=WKDD4dlbBu@=X#53~u6_G<(>LKX zN{!TdBM?xuYye=ZoMUk9>HIpAc%+n6sbWB>w1C{@iwu1`jY!jYDO@^*@9x5t_(>!9 za&ezbkJnF-Z7qX~O^Rb7_=XHFedDj}pxJl>@}LWxc8qyM3 z_zGX*Cm6^%`D2YZ82zU5Z=QZFr2hbphUZ@^i$Ul2 zQEC({e*>o+?ebe6&ri1cap;%y=lMfxoP}bb#t&sh!v6q0D~UNN+CUV6)lVTG5{2jF zNc^*^&K?S3=9rGDz$IKVkDoe(wu+Dy5~4{EZ$is7ZnC4Ja;&EyAhCbJ`rz z4^{gjQaCA|c_c__)0Ub-7B%{@z#o~{s-PE?u8Jbxp{k5Ex}KuyI|*r!M*%5DagR@cK{4HWg)Dq5+%nt03P6C=JTSi(sBV9M()q=xH5=kyMiNn;$ zDz25R^D9(sH31qhg%U!cie!L$V@8fLmvJikBV!1Sjv@q+AMk_kk9{7gLQqf$$>edI zYl>y@aEqP^I!i|>nIt548-9Me8AUMDZuJ=XXyp{!OYr-i_&Ng03N+J$j&gmolcI5i zXr?5ci5S|0B=UJbO)ahgMw)P=;%?pa5#XrOR(IM#I_?nz#YL8*IXjQ#pc)!vW zw;qw|J1jemL}%|PtfZGHe9Up{jl)&t1hX9!(Gb@Hxh53&75f9vZD$p%6oke=$kOnN zQCyAy*CM>+9!Swad4u#mr#b-?l725=&$hbIN{!knR!IKk+1Hey`^qq3oSuT~9im-F{iVBB;&pG6&8@cz+q-@V_0YI^?f__=I0AGWuHeMBQ!`iu5VuZ+iarIHLE_kb(>^(6y-$P_zo_F; zme(n7#X*~*&VC64A3%BIK(`2oA(b5r?oaL@a({`U9Q?S_)np-pp(jUG(>ZY&41coB z2ldbaHUe0-+@Sm+@I5n~8u3R9?=Iq`DUbf0SRJ;;WjcK<<39D zT!?TSrN`6P33)2syquitx+UF7_O3ZqL?UUdyN^V%X zmb3s!0%^{wwTX&z*4%AF048+ohE9DP`SN{{V?!O;N<-r8*Y2X}=1+>R3Ye z?}R7Q+#M-&Y~w0+=#Tu0#(Km501)pMZ_3fyu2lUu0VDx$jdPFrRA=%!Q!Q{6mgysJ zBGv`gJ|lHMh!t&B)V)J;z0BUVwJhpBiIAi68jQ^MV(aX!hK7s^Rm4UkYJeB}AYgu) ziu@B%N?11$@eF$nA@M1LQua6_ZZVE|()y;O(MpX*8yiQ+`u>^(fN4KGCj@TegdqO_ zXn*Uj1aeWyQmEycn4@JuAgL_5$UGhoAD^ax+FWR5EuN};TOALWh*+LTUgz}E?I8Rr zLg7Zy8d?nPKX+12J|XMJry3yHukV zJ5(6JR!~%o76<$^3wf+24yF4U6^4R3nChjA@0F?e zDr;S;WncG9gzf(TfG)i_2C}@Ro)vQY)4gwf>Yv^9EiJR9RpMuYo}GntO8)@n!e$MW z8C5GF_=t8KU}uGso)Wddbjn{$_%~y)`AsX;T&9YqqNyaQlgeGwFlJXswuzK&Pz1(L z!Z^qn;L+7_bBOs;yunP6(??ATtW@!?*x^MB8V_KjfIfg}xJsW@3UC0^?E?cYdGL_jH*s2pGbK3*GFk(+C4uzvsn0G3U$cRv!rb+d}%M0erM|b!Ui@pV&y=WB!ZB{{S)9ZtzOH zdvr0sl`QW($Db&^8~*^+r$4TA3GrM{iWhh48%IqrhOV}T(98=6>SstCk5jN_Onl1d zCjh%Ns8_ET?Qphj^!1lqca}92kdL0}l(T+FKDtf7DMV+k6)xxEHTv?zQ`6AWDD)8h z?2l};q_60sU10E^Ye1rbR7ZEMh%If>vRIqEesNSh=kSa#oi}{%`2qC)6bDNQ{{V{6`g{KX5YG|qsOfcr z(|gMTuJD>xQSF4tKVJU%(}`QuD(=w6e@+#;m;V3~d%!98u8yUCc{KIpar80&08^vh zGtEYH1EZ1JKm1E5w&JUxDhTxvTvI=(0DmnK`1%yT;GQ=lw7>X|^lZ2_9V2_EVn2c4 zlOHd{zpwJq4vd~soeTOnC#L@Z#DkzBP0{p^P*k5m=2J!d%QlAS>z}eCME?N%g7RPd zNO}eu5hPs~)H2p!pSwvlDn<{^K*v9l>xqy5057s5MEm3??7#Stczs($TI0C>-aT#?hgPmnUI)Rg(p*UsY~ z&p;M`<@rbaIXqBHhyEsAD;5P8hxe+lh>g`33TOsB4g$IQjE}CnbYqu&zmm9Uoz$R} zpZK2*q*3^6Ji+*0uK9&O?pktL1rbU2D`=R&((Ro2=VU(L)6vkkId>s=8AKHM|8+YPWeev1W3A z8y(H>-0C9YdZ`nFu1OEXOM2%zQ%I)}nHy^YtbSUI{3(V?d8v5a-~HV#^)GkTedJ(r zgG;<38HMu4xPBab@H9c{xP#!mNZ6dIBOaIi3Nn9NFU3TvU+ecLDKvaSp z1@uwtuCPZ7xJy{2R?ZcEr_)Bk3yrDpi5QH0Bzs_w=5(M7K$mqyDQ)iVH#21Ci~>JR zFcg-`F+)w|8~~%&_<8)ZsR#p*mhz1V%c5aexCbQ0tWWaS0a!!PI~&ND#0cc`yUW{; zo}{k?xg3OAosdKr82~-ilPiyN`e``9!fX}oJaE&<8KP8N^O$f)=aNR4l2>44r&eSV ze(Ez&2skBk>)$$RJy5ZDPo#ve`xyZP2P&)lv#8uJg1erHsnu12q@_-9Rhe^-PEYBl z6nO|7CZ(cg!4bTVobCZZ?cV|BIvK?{)PeqVkcApZ@Ijf{CE#XBLa9FAT@v9+Fw#n8 z;)*w6}BVUIgaqX{D 
z%JO3+I2S&h^bxM9N`Uft(Lf~>R8fg;tOtQ6&%4F{05pIceH7>t5vt`k zBsW>&FEw2~t`GkJwm~C)a?9{%=^lY3pf>ubRnS?j)W*K&F{z3%lUsA2F(=GCrD7lO zXbbUAW!P%0-8^vVIwE6+x=WKUZ`t&cRH*p}a}k!{)Llu``C7ZJSf(c;l7#)ro9nVYyS}ZsodQn@ z+qJo+3X`VRk+>1lmLI~nbNTl_Txd|`65@D8X}WMo%AyvTk0ZDrW2=e40b^)hBAf!$Pp# zAm!y{_bBAMRnJ*lEOiyN4%5#!!wy}JP8C24&!E*p;!TjQHLYl-pp~}Bz|3D0cER}n z0M9y(sNE+tctPf(na;wg#~B@?%DDbFQ>#uMBKkm*?`=aJrYK2||Oph(4eVb%b`5lMx=TyV&Sj6qDru zety}}yy0~P>n;M|5`MZCPHh@W3IW3^srBa?8C(GBkW(Dza34(@D~O_&2vx@;KExey z&PpKCdOC>^g;L%9K+@1XQ*Llm60SDDe}JD&x`?#$6v9E(O7azgmFGU=8PZs9Dd-*- zjRC;m0uOxX*t{V`VLT}Y)jqzb(?B@-rJ>4FQmTL#B~Q;fCEZ9#@7@%c>C2ITF`h>6 z`f0_Jbda2)w6rP^2EcCk1pfe*pKnx_iN#D41D@T;IR2b!KnFynNQxOe@5G}cy5~PH zO-W#21c#JHmYJQl$c(`8&OrWNzdb2EX&TBSC~C@=nn4klg3PJ35(YU%01m@A9r@&) z17y%edlRf`QL=qA8N%{%2Pf;MqtOmIPr{Kh7pv} zYq?RE%XxMyL|;4(Pd`n^2STuI0m^cQ=U>yc=BC$MCBCKNcx5oGWJ?o_GCIf#GMsiG z;DL-CNYZZ9Zw)voWQ-X&V%a4wWZd~>%fc|4qx|4dSl~McwzY>wo=Ew5a z3MDl!+F(Y1z-!d6AXkh+a?CN`2V95*;WH(0T!1t42T8d&VqvNtTx8*Wv)@QbwDbb_ z5C8*@p1Amh!8XtBkiT6oHi%7`698BEJZRV3mXyR?x= zfBdz+F$)p-Z0XVNNh+%7E%jLVnxbH8iTSACIX_hzN&_WHPk#YY z?>;?K3WQrtwx;0`C(N31D8U&19El&lsSEVdmw=`keACS;J>$fltGmm**3;R+$i)O$ zljWIZFDLw^ewtn6@TBhJYO6_PdyQY-Dostot2z6{T#qAYk|&PfHue3lGCo;Zy$r@!;B@{;{OhSL%;Itusv4a#bzqZe2nG z-0G*SryPo6%Nz^;0O(LE57R$AR5Cr?RhG4_=#5a?V5K;YAri0yYJtG^$i|=mg&{Vl zm6$xgHJ`d2q^4x%K`}gx^UriQqqdn;~&M!df|H z8%D)WU**P|qT^{B0U5?m%RTkMwg~F$in0M!v{e5p-BWfvB3?GL&4DmAxTWjfTV2p+y;p^ zlvl2fe}xCok(~<&)uc3~P8d3q+Z^k#a)CZ8mjQDR53HbB(1K5BvAfvDH3P z$c3A6JRElP(w`*MN>GJIWdr6lz|uQMDPlDv1U7NpoCBoYMw5nPE6*i^5=Hzfcm?o;u=rK+ z^FgCv71T&dNSy%6vUA)h(dP+jxlTiv$iucX-0(l9xP&h#wJlmo%8UY&!EOK_GoIgF zS9DmyoJu;P^z;TrV!s|exF5?<@M%kTkf+CM_`?Dz-k#s-vw*WcAJ&k07P%1ADyzq)_*(IXi~(p@HE7!v9}5`4mk z>6HK-5JDL!b!_bsDCn*3ekn2ewxwte$SNm0Zz>y-0Sq?hkaM>`u5{S(OM8Oza+F{< zK1ZklN9G2B&Po8%izHGnDw$E-47R0f$AU+&rjGNHttK()8PDmYCm9IVRC?JZ3zbQm zKA;cr=THy{%CHHps8o$9450JsU%1p{guz0VCRW49!252<{P@yXtS6=WBPGGqZ9$Kx zKd!w=J`2i*juReZNOE%L=5(Z#;j7=6&T)WGZ7D{rdK@Xi|w|^SCQ>PTcM^_bFjE^SPpLy->RIFp_WIBwwt?H$0 zGC!3_y7=#TxNb{bp()`!2d|x*KUm@N$K=lc08^;UhoYQ7!O8dU(N!*eMbvk+Xl@s| z;iqCrp`u|+PlM=Ui*E1uw-fGkyPV}PYg@qm(AplKjui<_Mi3ux5Pplv{#qfomBs!g z6BfmeMMsf?pZur+FZv9(^w9!}k8&qTWHH9l2v!J`Wu}>c0DpM;er={mU1pHVzOm`M zbNNEKu9Y~)-R+T?bNP>l{{WcjcX^^%^U{?EtNze-{+tg0HtRf?$CW!8+Ozbhi-kY$ z@$=HUX0TtsqLHu@;#MKm{{Uz^$59uKnmVf3M`pWAvOlWQ!a{${$L2Lr+OCaOo2L!Q z$D+@xYOh^c7>2U4qNbgB6r&JRFVYoIzo;kYs>W9wIIAsS7-&vW^q-f7Xx1EK0SLhQ zk4+$VRMkl3B_G{!@GGe9Py_t&#)i?+4a6f9aLtdQRHwPg0C*e{5 z0DqQ^HKc!N4DyG5%>~`zB1J>pi32|@5;epgIKL0=Gff;Q6WZaaOf0bpKtBX*jnDk2 zQSN&<7886f&7jlL3Uv|UC1L7hUOuA(Q{~i1KSY($Lz0WiVYG%)cqlSIuA1y2RGD>E zZ(REyPsp7dlo!G*fPNgGK-tjGL|29~2R|3KG#4bWW83x7x?GnKAY^m$(w={$LU|By z(h|J)!N=>aJ_&_#Qd*bJCk&(yQ{U;q*I5Wvrc3N41~~UT{{THQTczrUe@&z6Kgj7$ z0%2t-;Q(Re7&-1h{{SrkoPdpZqzZLL;zB-K`8ql1KFEC2%MotJKRz*_Nf`ke=)5Z6 z48|KC=Szna)kB1~jhR0H{IoY`B{dQ_OGpZpVH%b^^1vN&cUw(fNzfR9m{j2B8@V4o z+72M#qhxeSP}w+y4D*2f{{WVhRb8e8^GUxZS%ySPIp;e`I+FoMMcf3_Q@$|K>^a9F zN7tO|i32~%T0K%D5WI?vNc1CV{{RD~w~M4Ls$cA-832&SoRg?`o}o+K_$4)~JArZs zYyb!NXm^Lf4T9*D%fza?7~rTG+#5g3Qw_%{4FO3|qUE-NIMAd~jj|%{;2}~N^Xzf12F+4ZDSTm)CfcmuAkp_ezMs$Or=YvZdV$}Z zx#J&$AD=vE3IdX)1i|GpbB~DnXx?y&KvoVzMuCrJEKldoqzA<7wDdA|1D)gIndkSy6x) z1Cl?MHQ8ugf^7+GBd{ZI2Lu!+`QuTLPE?^so)H^vP*f0mMi1-8qqE6UaKB%&NS6PyM3YT zlu;OH{JV^b#41_gd1A(R{q){i4m*`e276~w_kma%XA@^0s*if{pQ?I*d2|%e+G8#L z)?@d{ap{G15UKfyv&VfzUR}bRLmPiqef#RE_bOV;^+HR1x>en(P!OiFR+*DL;a8Bq zKmgGAiBbm!{*hm0I*!P2s|6`1w&^msA3@3fT5W|!=LIf0NYTu4RQb~X0PHDKEC;v1 z3H?adP~k2A0Le{;-6cIUPHADFr(Rm3vQVLZxc>ka*I;x&n>^p56I<2xDsSC=_NMnK 
zJ4HRZJVZ~R-NK*wn_Ny)Y;f{beyQ<(K9hrGRQ7(5iN;nsxAW@Qe4$a2fBhNy=?nJx zClT-HO2zuS$NF1~ZKR{P^*rO1sl7R;r&I2*E?EBn`###J{X<4MR@noGm!i$Dy=B$C zQ@T34oY2uf{{WJutt==?qiUK3tK84a04gPd(S`V5cp(_eyIFHIt-LVVD9!*V$NG5qw_MgkjB z;j5k!NewJ(>RUPfx_TnXx@Lsb;u81@JCUf=inY8Nz)RDgrWDcKaJ=5`3#<7q38 z=Zy>jK_4=b0hhyW-L z6mbkPsAl8_as09~rrcbVib*0_o&F|hmyd=Z5B$N0Iu_A`*WU<>DT>XL1Q3E;HUv(; z5d7FT@;Y6iqTR>8L~l4yE*8@)LQRjnaCVj>816tT{WKd_P51d+>*|GzTi`rWqej2N zs?E>L7C6^9kNFR(BI?0It7s$hIA@Mt_$YEmjKRtEyv}?sksE#lqb^(+hh<7?l zHbvzz;1ZdZ0Avh`i=Hxzw?D4B#TCW9Qe}cfIl~Y*&H&MA2LT5H7ldGpw$tmKA5<^F zIFg`KxbGh&AZR(b!eDZfBE>R{Dyi?0>HM|E#|Rp5NmwIcjl>iG0J{hLG;_#ZL&Z40 z>mklZ4H+M=8b@8x1L~X~0-Xhd5s1mpaKLCB`?U6#0FiKZujIu40EUskbtr;HX*lt; zsz8xJBRJT9&s+g%+y~tgWO9-FETe*UIsU*v3I2R&?mUmmU8kyXsdgy;09D_P;Hr=D z&<;7zBq=o1sO+ocNsQ%#E3qEf_;o310*a4tq**PAC=7*(Q~v;* zj+ykq?Vry>Ba*s`=y4E1tpE+uvPYOO6tuW1L0^AZK#AWA*1;KnBX?Aue^XyfT3ZzT}-0HsskrH}o*(z(xOATRa)S{>6)LIP4bMI*gKJ_Lq2pNuMkSdVgZsLKyU zAb9GfZ;E4-05Q)bVUDAxRi-p}UP}tQs_a~m-#z{G!tay_QG{kAjesEMJfG-3nv@$+ zrS4%S4$udv+;BfZuH+(-A}JiXQ;hsejQ)CTl%|peGlvo(1f2XU$LFTpE|S&dI2D;W z8%Md^bU7syz=F$+|Ic~L4z*~sg;Qn6o{lsCpzu-fqBvNLz@54gR zZ{KN4Y)sHtZB>_fm*H32_gDe{0D0AJ$ck~34Z?o z6^S#ITYOoNePok+=jNq;nuN+uMisWf8!=bw>?+)yEVI`d>y=!!b6R4iu^A;Alu&D_GSM>~} zU=Q|GuqR($0 zFmQ6BYJ*@ib^@sF?lbsm3D@S1@bPb4fQsKCxg(R5dz{4CB{JGRlVLelaC-6u^ zl3OQmKma-Q)(3RXk=U0IM+CPvkSBKnrgxte@1I z$)|*<+V0$-FR&0X@0{aNxwdkdriFz98#C2Aw5kyDSSD1H^%&EawYUjd(&J?fsIoK+ zLqp{+AiS)}yX%pIr@QO+{CFic6dId4Yyvb^kc{ForvvgjHtt6sG)*WkNXwN4{(lf% zzs%^6mi!Wa>LuV61N%Y2&**e2vYOI9>#?!s4Y}Zh$MrhuwZqLVDZIC0tF?#m6M!^_ zI3YNoa2KQ&JDGA%s5$|lU?#A62`x+?;s9~-$^M!fwxv13N8Jd?1oP+sAInDV1?I*C zx5RM9Qsj>K4L(gCr{Iq%GF0TT;N=bKZLHW85 z^U~i`XoT|6hhgSyll!coWAo4_ZYel!?o);lPZ(52W&YsL>T{t**|cwNDLoRJmm6u{ zImR-e{-pNOme4sLba!sxCBqPMSI?0MKg1)*1M=4vngoIOTpiN#iA6+Wk2qHMVo5#y za5V{Ov4tk#)lH7t&G?lREX~0U{KKP+%dUa5y!td z{Xo(dQNd0hV&UPoAFaB&^w9N@RHON-~af%JYw10OUw23p>JF3{HQt*~s=e z8j}QJC6GU~_n753?;v+75FUR#YCn>b%|?=SkxopR?U2VGTzVYwq8p1ts*5BUKME@3 zc9WcY`s=ROa)nkx?VeDe@5uyn^U@MIPmqMm0XvQt9r!wooTgppa_cU)>X6GV%vvCj ztLapmQylPrFu@_#)iCd!+2 zi$sy>snjhUC}_uOil*{m`hobH^dDUi+*(VuE|NK`m=&m=ag2F(_>57X_{quo?$fjj z!iCQxpXGU1Uuv>5ed^&sY(A=|?-d8@^ZJ9Q!+exud$OY2B z&xSGdUU>QerQAJJ1`+y=dMa;M_~7<=9t*cqP}pDAhT5h|kUfY%(htvWgQeU0jVELv z{+8dFSsmN$18Rz`b2k8$5R?ap&-l#~IgMqj7mp`~*gL>0Nl z&N1$BqQd8LP5P#A4H!;RP{&glA){qeoFeugo=&C9NT8v?>b{0If^i%y@`LN!`G3<> zV2_DMo{C_qiDog#V?c+JTL(WdF{oP70X(L)CRh>6GR6Qtp#diz>PPj_x;aHPhbSWn zz!}0~HT<~eTumd=laSCU3S`^>$pePR&x~mciN`6bxlGl=CULc-$Fk=i$o~LcJ4ysz zLT;T~b`kzOegZfjlXlQ?BPa}~COd{EjGsW|6b$_dJm>*=3#@Qb;iOoB5z8Ba>98w!t%md@e}VqGCCqmxD}Zpw7M9mIkeKQR zmfzfL;Qa;;qwd|ya+`4B%4Bm#Rs+HD>>n}*=f;0cDFwJrDC8xX8ZcFYo^n(ZlkMMJ z4}v_3DFQ?BFb)HZGHxxM5G?w6B{^O=1yx@WfO0oCltB4+&=tr;n>4(Z+ZH|3QaYn!_K5HyC(LmHqin9pGHwttZN>Rz}h zNF7subY_W_VU9!tI2kzk_Wrtz(l<-t9uZV1k0Y}#0LRbQU9=EWZY$X6)<!bkfM}oXIvK*X^$FUrb<)Un^9CDqG+*1d8XVV8tZ<nEk?-i7 z6Ub9xSvII}q>kog3*Xxv^rfSsAd;j-304M8fd}FdWmtZvM-_`)1)C{mu?rHk#!fbZ z8T)_BqCw*64ZvX~SX6K!Wh_4vyDm@sxZ~LA`dy{4N^%-#%xtnN<+;ReOE1$HJn2a| ztRr42QCyY)zEY%Qd^@)>{PT@5VQ?gyM|c&4r+SLx`@c1ju)hm;`HzPn{dLK0Qntw9MrbDuxr%NERQFT6*XlX? 
z=m&s({>zJPF(tA@CANnd=gdXt^dF|5eh@8R4>kul3^EAJdx$^J8e0VeW0bfq6)`9U zT!EaZk00ZvEo+Vb>w}7yPkxOz`Lw~6M+6W*$6RB-(tjw2_=NN~N>T!c0KQ2KM6H!GRo-SY zG5Lt2AP=E)^wbAX&}ph(<<9*@QZF7L^?z2QMRL7Yec8?pB_R~hW9E_V2mUPTCralH zq^X-5TaXaC8(Pzig?1t13b@D5jDEVVTYM>Z0B+IR3oJY^n`;h0+Belw(N#}9WY$SoEE zl}iekN!}hnLv07j1LcvO5^R%Sx&R8oh#~WQ=lBXU{7W9`5Ar!5pwr(CyrkNE6qcct zoGMeu=tcspgCuv(7-S#MNFE5Ve#q_c2whdR=wvR6=mE=l(uL#ZPyKadKX*T65n*Qw z)U1|QSMv82;4(NQep%9EfQ|Ts^3@Hyq+=fMNfa+XUfSx;5xPkpOp05}50minN}tFb z02~(zDZ@z{F>cc1eTA>LV?hnc6RUo0C?8}xLqzhQeaXTo=G>9 z1G)HsKhOLQrE{@>lmmRyRFkJTE=e2|koWiC@t`lkbEzgelvFz$0{j&-;OFx?1W~ex zHNtSxR80^CcPjS`dP-@=%vpa!G?3o8Hy?=eOXw`~CH z9Q}~mFORxZ*6}P`QkpHJzGed-rbeR$x8i*hk%Tt0rja291u8B;3{HP87yNZ$vJwFx z{uCvIDAgT3R^XsClPYn#FnIp}EfD5)DRwyds6xXe4qMNRU}tG1e@@x@YC`KvwvyV0 zc2;y~=TZ2KV{y;x-#SxA=8jJaA%?9Ta)%?50d*h9k)v0S;R`Fxlb2%gFk$$Qn~Z-g z8XZuwoeaV>A|k~0!>Pdb*B1Gsxg}%VyMFCBB%EO|0r_C}(A!J6QmLSJPy*qP3JU;! zgl9`aiXfhfLcxgvlgb488HRsN4Wx9xk|g0Yu*g0*nkEATs=3J?`6P`s-U=RyL{RzZ zfg*XHqv8HqN*t%piuo<-jsqiLpTrcC`eR1-ew?9I%5|tCNZGu#C(yVfk9=s18eOy| z)^}D_0YfmxN$sOe#hSTge$m<9t!Amcxf?W7DBNZ*R_B}ju6g-ARCag`sT(}vl# z+;m<+P$PV@nsN!{c<1uQi3DW;Y^9kP61#vHBa@JST@mEFJy+g%3x$mv1Gock=yZaQ zRlzJv8nW*4lpYQj8T~cYiwl!1BVDCSmiG?B*S4hu^+5S4Xz@me;UM~eTOX-9QWq~L zs(EBWu2g}5ReN#o>7XfstKZdjQ$+~?urhp^@y~rV`K3~+YZ4u&X53`pj>GaAQV&Gd z6S7swGb=|NVL%(2LUHo#jDETR0egyKbA*;=kpA!^AsIgzEuW|v(W>K=tJH#)RpNZ$ zXKKjDx65$Xd_dqPrq?M&WZK3bI4Bq%TxFN@$Zmf573P<1>X=#2Ex5Fvk!u42poZy{{SCdb#-uDUEo*GT`GPTn2d0) z;1ACSOF?mQ3Bg@9(Un!f;1PrRgRU$CmBG%W^mhb*hH{9I0&P=&pE)lql_t9TMO=pa3bFnA1EW zoD7ZF<3JPQi3t@6h?{Zbaljcr&z(d~oTfmkqf~A2f;Ns%{W;PTk_vA%(H&>vO}^9_ zIRzYne!K;5SE{O-bJ~t0 z5VgOh3Z>~sjTc}nqq6MaeuGhoH*fOWJ^KA9u&?T4IhGHoRLVClDK8NQ#&QqoZf zY!4b{C6Uy6a6$aeg}44fz4;-usZ&{40lpxFj1Xhoex0==2s|4~;YmqkXv%RCZ(u-Z zw&(OZ?d^5}`XE?APD4R6IZ+`#zW^io<4jHTKmszSQPfEm-en3${?{S9`)BpzK!H_{ zq9Mc{RLO1<6%D>@=XTtyVEnVLYXImFErDCQTqDX%TEi0fOEFKBaE_D)l`4rn) za5AHtZ9e_jmIrTm4iC#ANc{7S2E37c=g0d=v~ZfJwo|l)CJb8`$bhP;{{VPof9a=e zhyFj1gO~xQC~wodBUg?70yDZWMn}{gKYg5`-JU{*lGW0%{nXQYlEJ^s>FK78(o1mY zofXGF6(q3&J|nZS{Xd?Tf#KG>cUn^7mRXftJLGzdf%;_s04*lkIsR0aV+ljeDxg9M z0|nd#1fM`xO+lw60&RH`IbDVz0U6!AA77rhkyg61OX*xf+`E=ycoGHA%aNew*t$fT z>bx)T7FCg1xgllRPv~)?HjFN_($kdkGAL4ou-G8V2_N_EsEa{2`zCAXQb?Jr!vsPQ z91vMDPq4wpe!5`Rvy>d-{uF1Xr&clvqmD&5C`iaZJY?yEoqQ!Ac%%UglIB)MlnxzB zNzde?`u>`H?YgA6yp+-tBR7&=m>dU=F&XdMCpsrM{P&c^{sR z(a0-{kAj&Tx&7`EozL(T-N(%T0M}8~isd*xRLE*o+>beAMgxT-kJnK-v}_I(mI+Qy z>PR3I3BW+O$F^`d@1Oy7+HNUDP^nOiBUSmbr^WLGa6dgsK#%+W)O=(qu{1GmF}~8k zmS`e59Cpg{_4?`(%gN}a1P}U2GDA@Y*&XC%ISL6*G)um!Ut(dFvC$6#g#zz zW4Vg`L+3Rxj(V>cGK@*^0OAb+o&*5Od z*G>uOJ^fIjYtOiTB@vfWazTiX&xRvI9zTEScG5Z|t-1-MRr0<*KOAK+eMmjKdyN3{ z(PX2rA7wXJWLW9}e=I9}NzruMZOvjf5j%6IC4X!UJGx;dp!WiQu>4US3{ZQHt zQ_V*m7!87i5r8R{J;(j+x=`TqYp0o9aK374c^_|(lCf+8@+s$^WX3!9(`TRM0zz4} z?T`r(Aq)xPW1r*~U3HLtp%5)ZsH8NiuN^~qWD$`k4bS_>`e}i@B_M_5N`cQlZbnN! 
zFuZ%`kFJRwPzKkL#)_GaqLr8t@E`=A=kKAwHD z7XWahwN?=Mh+%y5v9X_^W&Z#jOSE|3zf_lurjjZ;o#mW|L!2Hn`e13YI$~c*(gvMQRG?7XAvrUZc+&Rh|^;E zK;MFdR#^_$GA}=c6X7~=Xn3ejdB@2mqK%jWg-$*ud=7oN=R(aVp=-lX6-LO?CIsZ~ z1QF7cy=wWXB@DJytrwr122L(9F zp?(pZ^SMT%H_3pgCUHlsf*LYoAWX?3t#5B^*;oI&jq zc%-!R&S5n)^&aUe1N!m)I0F6ouZv*a>f<+ws zl(^amqxfZoB8X56_J|AK`ywuV!=mq3s%?ghq}~-`Sat zKRvu^Pu0M$m-e6{l4GKzntV9g**R$BX*2R9az8PqbDq(5xHY7vr=x;54NoLLH!?6h zdYp`ZGuutL^*@9QK;e3`P56HCOzcO)fyo~H9S2by5{SMNYja50th;bB2~bX-XgLWl z#d_6ox!swzjz|fCoP9MLT1FGvevYOYfsi>y2kpTvfF&Cz7N}S|<{utmD=YM<2-Zr2)4@Rr~oTAdG^OPf*jg<%L1c+&RJgurvKM zN!T&?MYR4>qNR4;T2+f^+k|lc0IuK<^V0W#;0yQkTo`>l{FEuEqqxT1r!k!GgdpAW z!vpp7?WudshKa9!_)!*a;4f*d9#-`M2qXBw2R}R$t}@u%7YL74MND9OdKUL#RO#w%J7nzZi?S(_J%A7_&Kf|7X zmY9*$uFtw&d6E`odS+e!0EyN^hWy7r(^2BAAAdB~c%|W{jgYok0QC&~0q^m9YCz{2 zu2StCR79z%fCC$`KzD3FFh4BmjXJ?m*=ew*Ls;ad=;l8kDyX|o2e%_XKKcX^$L^Z^ zRIe~0mPr8EJ}BXkfwcbso`n;}6Jwr=5v^vZO`dm_r#l*4fIlVY^V4krQT%^t3CQ>R zDd`m@ZWJBTgXUq~@%kP=?~O)FoAD_C-NsXMNRl5mG#+jO?nuky*8mk6)3*YTzq(gZ z;WQ(XL%sg@xB|lvsQqvc>Ha!mPACMFDKIJ9%fiURBMh$2e_Sq!cYw#qaB1k5H7pTi z)JGJA;D!)^`m-K;cjHPN;yx0N*8c#=FF$8RIyk1C)F{qk03!R2G0*3tyxvFG-|UEB ztQ8Tde1hliLCT${F*9?X#Ej$l>BIL`g&7sqA?Y4PC7I+w_=o^6{{S{Wrkit8mXe#6 z03zT4x7_pkE`RLni;C(|kSPZW$^gk^4ah$E#+dn~9!uE6j^&6x@EKQFMW1+JR?AiD9s0&LrS6J z2aHO@e?D=eyTb`czEdKT%4RV}2GfNy@y3>7H~21V}wsu-C~>n2qEh@IwrY>x)CVC_gQ&4PzT_CJVCnwU+!OOE_ z#6>#@T5>koMpo@1`1TeYES`qFKPC&^3VyS)EvDFxesd#yM^)QtKHhlSbUoqGS>aT* zmMT%wvLbE;+4x|t5u^y`V$a7ERvcoz8YX^p*Z>tKiT|q9B_eNis-cU)Ndt2Kk z)N)zfoD$~zoM0Vqu0X*rOvu*V z9@r_!h-95KB>!3rDTncQkow3B_llX2H6HcwqF@G&egNc-x)s=HTidRUEFvGv!Z2MD zid7#)3Izj7XzKlE*t@&(K{F@uyD>l@~%&%{0jpJ2CU z9*mb*Gx~2daiCfRPXG7_dSeqwidQ_gI{K1SZIiUkX0l1wo_2D$b(Zbl#w|kky(?|a zTNtu&-X8M$*l#@CU@m-iQPtKkXAv2-TMm&6BJpPHI5Kmu0IN1VV?9uC$1$SI!AMq0 zxV6-Pz3O*)0^)?j`51qrl;tO%U1!Wuey-3LWJciF56vO*g5yb11 zv6Xg@5%OM^;_j)H=q!=K0g85btieEdpv;9Or6x;#GG9hSs@CF&NH(SUSe{w8&4&11 zv?UIjb8R1GKO_2+XR>-v0o1-}BQyFp+h!eIC)^V~$2NEIc~ba{Pki69Y{RV=3QP6> z;UgX1Wa45d+2HaBC(XY=RxOLd3Ioc-A-#e~Ha9|J=r+>Q;)UNvD&SMOR4qv&y=UCC_R^LG@B{92DM z0NPT}15m4YDluJ1Fvh z;bV=9$&X|^QtD^`_ZM~pUv?wTYY;M8${p}4q>Gfiw6f*qEWs;9ok^GLTev0Ew$&H; zfi1dE3FA5*p>kg(tY&Zw>kmW-cbhJNB~t-=uaQSzI-di|ZDN`G4dOW;RH)d~^+Z?P zn+%I>As$n*A?D|5V&BmRD`h^w6Pe$z>#jFcocav9)+1CV=P+875X0DuR!|1ZmcaZv zL+t%z4XzT2#50G_Jl;M~K)EchgPhtfhCeHx@#~JkiYnFUL8u;EJJ<9siHx642EXdf=RPrZb9oZJTl6?Ivj@@r|q2Q6(%zxWHF&ispfc&4^xMI{1 z>!`(}>#`N7#|lu$+AFwt|8`8>u!2WVbm5G@4J_ZIj|Z~aQO3v}Sd=G4d=99={_E;^ zR+22%4jmUfvpWr)e@bBITn$_Ojgq#(;ZJeva$WrM)k*x=C=Xkt?!@Y+6p6)wP=fq| zz^rj2zTjU^g^D(lZ*zB(VD$<}TiA@Q+p|bqjUNde_{TUslSa#SPeA)p2a>)piTYD^ z8=Qg*!G{w~HU}q=M3YhC9{HR=(*^jVR0kLGCCSgT-^d_D!!xHGfC5HHD6EUpkpxesZu(R@Nr0{+z?BPa}b>8vVZ+{j}%IRku zM4d>&y$N{jREXzYSHrh|RKmN`AI!QPQ?aaS684@IgQ7=Pp?$R>cz22fxwiH*CHLK# zjErBxIyy?&Y-ubKrcU-Sqb%Gq-2uJYqKL22$$IfV zs>6W7^_}S?OkGzP>9k8I8XzE5a)!9!+Gjz*l_=c(Y!k^l=!)~FkHaN%Ir7-MkO)kr zg(p{;Kv+f*(|;Z@0}qO^|1njxOK%!B6ICbXTSu_*Z!_D=LwuZ;Jl*#3D+ZNu-;vX? z&7{}ULKruh2f8nUgpKAv#`j$OUy%WITcRH4_3T17OYe|rm4$>Vt|rH3sxq&K{xUn% z<%E|F(ghn&K7xva!(d;2pR88dgQZmUZ52*_#n zPtId@DxD?1jk;t{uqW{t2(>kwm@14PnktMtc#tD4wojB?5_HvS+$8qLc*Kq9arEs1uILFZ+E#+7hoaIa`%;MC&Z03k{vN%K26Bg7a0Qg<~Ici}QP8oNm2xinH}A=Ip%|#A)+5$Pxdn%9*mnn-doM zgZAwd(2e#Eob&xRXl*Kz-rS>1!~A5U<1^2(4c<-XC@FuSXE~U1&UScl793LL4D@?) 
zL=P?F)Qqz@FY2plz0Go*A6wtiPUn_zJaeo$q}Y(7VIR$@8kom<>ouLTv|oqwsYr{n z{-g!xW9V_t(TDmRGZ_cYwx1_CgC9aTODj)s)VX$?QHw>KBiFZcs>?A4ZwTkas3mYj zL=!o_8`p5cSDoY7F!}V_=#}*W5hXQqKD|7Tmw$^#VH&^QF#u8h;gjH z1kut3n~ar064)CGWKG%#GG6#!IDJ$4BR#(M4IOo1l#bjrjk8LsiG2*Z=~=I~(VLg1 z(Q2ZpId!w%(JKTG=pMy+oP#50>1ZV${ii#JPFhmUwy;#-Y`IuMW6wewN;x#W z!+>L(xRhh$yqqJ{G3Hp!wcxxO7^CGrDshr}9@8$S@|=)PZBC>_4}Hx-gJaB|&Y7`# zJLhD+0!L%vTiW~kW{$%5n>62in64|Vq7M(4b58WM&~t8T)34$W((cL19Mk39^oh{( zwAUeNPTyB4j*4?1-Szw?T|PKU57Z{oVjAx3Cxt8N7}=x3|FBda+bM^a;jY{}CE)ioyqUX{PIH9x|vl zu_vlzojKEt2AWIJ{uwNgk+VjJeLRtDWIRfnb4)OI!F067`zE7Q@d%xWDMR}!MyZfN zWtfqpg`ToG)QNffQCamoB+ZnfrWYSk-RpbQx|Q}+tYr?%S1$&c+Lr!@n8B5bzJ2xB1;iA z-*$!BRV0Es8Z_~n7-@95^CmK>io|+htw>f!1Dz^gf>|07$b#j9Bs{j@lWXVT2h#fZ z<7X*+^F=$lQ*i}}yd6XAQw`|cE-^ve)0vc8{~u=1T%V-Io~Nkh>lC5Bv-YoL;DT=i zjODCD0jnQ@oYQ0IjR}M!b94k^xEPit9);R4C-_z;LK^y8AncMju|G0{$Xo&N+v*A5 zH~)uJL%XTp4HZmhQ6%CyJ*MhjDN)Blnida*ufIC-9kbeEFBH%VlbaRQmLFpB7u zY#`2N{zQ4E21)ABBJSsx6JhIoWD~It8un!grkb>{UZ2>EY;SIX?e|~845ckhz_Pul z;J;b;Xl@HM&Rq}J&K*ROE-E-neHoTg*C3-7sYsDqj7RnMVCj<*xRb*Ly(}~2a6ACN zH*v><=>sT1unt#+=;3Q_diYWfqLDzMAVj3Dpx|V7ay_XGO6!#1oJR?c@ z)Hrf-%!Q=4Oeg)-GU%<&A*fLt0M?<|uw6L8?7DFtoi&rij_C`r%76|&xpzCNSX+hm z2d%~*<|Sj>lSz2Vm^^NGvBeXwx4^d*-@&bL6R6`f#wjZbZwP8alA33r<)s5*xl0q{ zD}K03JOi(;=tJsW{WD)hYH0;)|oG z!)e#4A=@|*lf7EsB6WcY`WZts6rK{wLxc1+G%-6T?BIAp8qB_yNLF7=CEi~1ag}2L z$;?+JaT;yl+jAQ16 zV%~-WSXE*c=KV>-!byxG^{)?qF1&~bqvzp?bPfD*=mN{tZQ3_tPsIy#-EJc^!k5l`q6!7ZSe}bxq#^^9r z$oPExCGfI+jE;pDQAMe5skbZN3Y721Gc)?XQj%Nzp$9#roE|xYx%+I0Xh>q2cMX7Z z#gMfHzg^H8^$nTEsN(+askrE!754BsgH4K~u#SE?6Q4X6{wsM0L!MvJ3tx;Ex!i=S zoKH-u`cxP!yiSc;ufj(jUS`FrPiLMVFF{d$TGT|PE;bOPBGsrDNcou}&eZ>f&j0E| z6)N+HYr_Y~uT~)sq~u7@x1~g2-VWrLAv5iG5%XEm94!Ah!#@YEjn>Jj@N)Jv@X${b z%tlR2zyC5Q_`MM#mnK2=tZqp5UJf6`lb|g|g2}(U1B|Z*!lRKISp6m*FA=a2FD!-{ zzPT8e{Ufy*xeT6O3ETCqwN@9I8DSTS|f-Jn=%=Z$Cto73<^jpL-`jS_ag*+or=dg3bc zmdSX=Q1Y`)D9ijBI=@tg$k#m}cK&gM`}R5--tr2yNokU|LCxgH>0$EihAL#PiA4!I zO61k_VzM-&g#7ZBA%Y|2l%%aI3Af%sBs$@;T8yM3vCEivu zYTs^C=7yftH~(Ai$i2i;su@m=ugPum;wF`@0F(sC^TC>Fbm1YI^{BOFZWl>V?i0cwn}E9;ESi(~_9-1on(&CV+0j*I@T?P;*)igae1>FsZ?9ez_>JMyd3 zEL}Q`d*`t#_puw|0=%!ip0|}Nyiv!USn!eC8UL8OL#(m3@6Zu$rm_uJJ@psYdQgOC z>vo3gHYJBU-MpBq_)L@+G_8Q^?tHh#=|=&VlkCoYXIsaesoBH*5&&~Bb%^Vfm||vr@r8M@@;mdr6^*r;Glpt&OIDg|ChReP*8Rm? 
zw^)%EbG(=9_le<__}}1ud<@*Et#Z8GH8;2yE#7e*Pn2+(U#;A;Hp_TR&1`r}mfG^B zs_ODgvJ82b#n+sJrq`8@uq*(<{;FPf=_zewLDaK8aiQ<)Wb#pO8uT zMkI)3BP6SUmo94&s9j!$FKCBgan>4qD6*0XGZG<#U*zz$%x07-`ic5nqeT_yRDx`j zGBxjtHnA*b2ye}OB7N=y?30J?O8s z5oKO^0Bw$Q#8z*w<7$2eexvyujR|f;o?#8ysL)GR{JBS({Jw)QZ#h|W!9tjk87uU% z+a$buq@HZ@A4c^Tjv~Wk36${c1&UO=$WcuOIEO$oB$D zp1Th>Uyr~qM`z%gtIfJH>2MxhL2|T!bo*J@m0M*PXC%oLQ+Glw;hy+ zIb$M>)BY{I{vL6#uCajI7NShmmLY+u%~=>5&V>dQO{T{`5Cx9EM1@|_D7JDbtH(QE zpt4V#7)Z6F&v`+3pNlKbxK4q%j~#k#ri$+BH!|}WF|vnZtJo8k4jZSu#A#59*Mh2eY|`fRfV^psZqDk zl8_kmT#*6~xoX(7^)l&msUcF!BFNLj@gRFV1A^+8A^)f6(XrGQ)Q9YQ;5Dj15)NlD zGc@l~qvrCE{m+yfNJ(Has>|W$g8LA&v>HyvD#MMP)6n?6PgLdRpVZaq+04-aKcrI< zhcXXavNk=ofzO(HFjG;EDtYn?DHe!>X2CJyx$_K3OUWU}i{$X;CCf;^`7g5R`AgEj z>nwRccK}J&PbXuO6l-&vE7|_jnXG)XA0?#}BKi7o1x4BpM`=Iv$0PZs-$v?C8#FSe=40kLi1sD6Er+YTq zknBKAKKYT^Emcs{Il>w_Js%bfOeL?C_Ywo=Q{;%&ZYY^Q50(~+!}(hPdzR3kz}CfT z86mhYp$MP4bs4WxkYxshCy>sqw>Z)H4gPxJBA${v6T0iJ;2G_4c%t|cE)5Mr0eB`^ zq!)!xi(N#XXF6e7-yVp#Ek;gp7vtuJR;EtE5>mov-%^CC0C6f+B8C%XOc z0m8nI(Ve!%NM`YGN^;- zeMR6IBhBotxk=S}|3hMM$810-RM%V;MQ!WmYT_1+Ca%veH+agdh6C>KFaN zkhbq?=!bb;Ul%0zUv zI8|LYh=LPB@s3m7u<`E+rllwc$r&!di*x6pVxJ6j9o53@6V;6VzSk&K=`a{8gkx5A z4(v{@0*CWnP`W7(RP<8uedTE&T4YB$cG?j=x(YpH7UPApW}xR=_7Ru68YEoJ3(t01 zhA`(A>PQSkXLf1;LF!QV)sAvJ^#I*{;f;!i$CzKj8E|m-7fAkP4)HTZ@lqF>a(!6J z>>zJsNHN>wJGmjb`pwD+!;Oh^6 zpobZgcg^4;rmL|5ef5vU-!tc9hl62^YTGk3fA?N2CEkgje7S<vos-Owr0^?n{%m*>gM3yuWj9YW^ajU|t} zQ{c;-chHlfON{oUk|jq9iOm)}oRCzD+#+2VTB8OeFvAOnM@3M21y^BJ zI|~aIRgg!qhhcX>CslfHaz>Nli#<~=p>EAqs``c!*zYMt#~wGKOs92F?eR)Lr2~@D zFhjb%U#S}!lki8~h{Onk@Gh$+5Zb#^XEDIxEgUlX*9%*S6haT5K^)#w2-wk#Zs(^V zSNm>|_G^Znr5adjsxcD#B89W8)Jaq75>mBN74QBy6Xoo?=E{8SOoPS%2eNU%427K7gnVaeqv=~y1m3?FAgSZ= zlyUfH^wDA_mGVi8avL*eR{tGiF0=*=oUN!9(<0 zvlZ#c6(Co;MrP-K0kFz47g~cqg7f8A#@c!EZq1kuO#>37u(p_#ycfxKO1d{52NI^ zJY@D|S&j;!gx z8%%CsX~!5mA;J*h&2G}{Y9-vLY$p_3JzpqZp$*}>2XU>ABrcTM2`hibkrUTcSa)Wf zgVMlx;MoP*zCZtV*^>SlnoC+De~_~w-CX+`XTbaBczXZ$CwlF1TrMwPXy==X(baQ2}N zE>Cx0?a!{KegyuZW6k$bq7jyhpuLn1SXfPl1*-^;CRL7V3W528Y`irtjSrs2k?Mwf#0kQaTw% z7cGOZBbvlQFPZd+XpxB8)9~0SA9U|05lbgW@^4%kt~}C(%;hqq{ADch$&8>*vmDTU zues=%$rK!{vn1mgd6y|DSrHMr!)LA0<$4j(>cf=_&#L{fq7_^!1Tj%$v@hwtpf z3oAC@CUzZ`UL}QF;$85uZ5Ob_k9h1Ay%&FJwZU%tBk{og_4t0_dK?pIim!T^q5*Mx zh&VhG&Mv=9eZnVDN2n{xxOEjCAGtu;)fv^dMa4nHzzoJDa4q8${1V-aOoR`%L16xD zHg@OEr#8fK!0FK_e13NxeN;8V$2tWpRdG3pwN>O*WW8Wq?&X8~s5rFe{X~14X5hyc zs+e#8rGfR>I#eBW5UI>uF3`KM2weQ$p$Hs}b|f27OR{??-Dy|Q77JS_l`w-x8G*=@ z=ZsBu67di6^v~FjEpZqtp=Tsl_WN z6WQ%3vF#WYon!&Z%mN~?YC^{YKcevAjg#|nU({i}AD*{*VBd$+&{KUvy>bge_fPpS z{iAy5r%wZF=v|5b+z`d$J8q((BZ@ep%#Z|VzktKXl!(Nx>EzcUBFNun2YL@#cxMOHKZ zBD=OTC|#`&trowF++FXXz~tK?ciRi8=iP)TtyHw8EfKNqPlXJ>dB{sE8a#OcXul_$ znv(|%|H@11;0;ljTK>0wt<@;Y;PpDz)nCf!P>TfAP5Z|BwqY8=dcRrkORgiCse#On zf79{I-^XaZnk-tQvw}vrl8!jI$mGnk1bWJb>Fhn>X~sue?$KI)z3eW^!k9PhIvw?@ zfL;)jN?&YsF>X3qRDH_t4b6XdnI1nDO`l6zN=G%s(qT?CU3<@v^WfB^eroWWu3R-i zYi?xIDmoi!vzl01#8;E!5GHP9y+zU3tlQkU`M3`~sCIzP)ctX@}u-eyRCHElv{L9ksj?IqQa@T z?#c%G2Yt!Ak$zT?NT2CUqK!9Y(lQxuC)a#JuYUQMZcY6_TlIZsuW!z!6Q`EY8&`|c zxBg>KYU3;;({p#&BQ0_C^Wy?KcFP~SX~kQ^fF*BE8U3%zFF2HMYW9sRfDIDNci#Q4jz zKKi--M_SJ_k9K{vp02)HPUoeY&==h*XsOlilW~8cw=Mli2Wq^jy7Jn9KI0ck=DN#NWsr_l0u zE137mt;}Zmo9N=vQDAk}L#w3@mGC7I2CHL9=A9Ij%|*y?-6--9&cD^eO)CKYF# z@qwGmvHd4ktT1mBEl;dustwK2(OG@S!Sg&NBY6-F`HWC8!`sk~*wZYuW|kn}>2wtH zBOfK!c2ffXFD!?2L$v!<3R>J5PiZv8p{(pb)RGQQ8>>1Ng7-@JPKX`6dlUhlS9}?j zs4vu(&TAlBvmV>c9YZlG4&ZLu#cW*J&NRjtB9Fcxlsv>n^c!*f{D&>NR9J^t=X8*m zasktvpO0iFw3ugQ9q9X(al|#!#}nu5u#Tude9c`;G<#`6Zyq6Q=ZFa9PG*pj^IDMG za}5+90sL}Xk0N9?qNg5~$W!YE3ZAfJwe`6(Zwr*cD{mRmSuulbzT`zNuoH;)?*B;Z 
z<`h`9WRyAVZ^EjYS&gneX{B^pKeQn~u9 zzj<+tevTTMivHBE-Q-ojKHLnA$oxhZeu7~Ct`pSrmI{0<#X4G9dTgx9m&F1fN?BZvv zujPv$_`@$>GtTS&ZNeLV+-KRhk!`uz{UT3)wio~Fv@+iN5qZnS>K81vt(5t$i)eo4 zdlUYysZl&E|D0#K)|g+}_>n(xwvvCp$)8`Iu!FC@EtfCFYUi)?D&pVraNvvK5WeM( zi~L93@m%<>z(2!j=h?iD;TNfH7`96*J`Ntfq z`A4Tl^V8~|^F$m7Ur=<3zos#c-?BJ^@Agog|L}JTKTWuUKUjZ}KQCUI@4hXR-)H=l ze>SLqe=t|T_xLZDAEy?^Z_9bgzb3rQcU|4ePgNE0zuha~+ax{aAMr`yYu9o4>qoBg zrRE0kMQs!KcO@tP7oE=kx+I$4=(d(W-o1*ydYTJgY?nVjBgc==YO&?7+hc3_Kr+X2 zZgz!5MAmy=P-7Cm&wM67v0aPzD5==u?RqQzg_W!MzlX~C7y2@J0*C(E>e%MTAN7&!0Zc*L9ZWU)#KtA2Y9rkH;?Y+10!F8=@BRH!AJm zFFNeYkE@U6XX|G39h^_{O&J^h-VArXXjTJ1V^s&=%ag}a$o43+sYTYvBjvt2l2uNN${k}MR?+_ca( zMqWVFndQ8erebV;s1zQTd*FGImc%9F8w$5fKquo>m!;R&(Kl^caNp?){@6$o+dHhl zICtrO?cEo#+$y&(!Kp%hQWU^yKWQ9c-zr& zcD*EREIq%($U^FuQFoOxt$Ft>?W12)-RBX)UOSjX@6{&k;{jV~Fph-|?dRZY@f?DW za-sg%OPKgqLdh#i!P>5C%nMmD7^sn?*2RjT6}QBg{i`FG5B&zr?eR6#m%aALc+yjx zzMv3T!J1H+dWfPI&BB*E=AvGZW9kJ@f_?`2>ErBvxSqyU?O~A7hdGe#W3o-{Hn1k|D zp(xS|YJ!e|!Nz1Lw*SjqtBHk6>$2d3r+^7{$z)zHR0pet_8=<&{8 zYHgz@q-8IKQ+}Gv;}b<-yzo1m?7GRoN=uX%%doyLXk@giUBPAJ4v5X&%4Ec=3vP{8 zF^`LuGgoeBFpI72m?{%>RR8A>6VrK(xnaR$J%00@n(^W+Ri`aReYkg?*}Z5EOf|Ko z92F3^%kp0Bp6JwCZ{yzD5RDJDPtM+`b)56I*2r14w$bl>?a`NF+^2K?)gI^e)OKbs ztZDEWt|=Sm)iB3rafd|mYv;zJ+VvGqwN6HMwPH$hxtl({tKHBhR_i_7SbI{wx^|&w zk6@kXQL6a{3;h}4+K9CpQA{5Nt%jb|Mfq-|e0o1Cut`+#IlF`!)Ot#FXn#T*7w96* z@AFYn+cor4yHD`$cQGO&H(CBI;&@A|1wJeLfRQ#h&O9I9MeR;j!6(+Hfu{CnX7klR zM*O=xB&ptk@d0xGqQlyycRb5tb>4qHqc9c zQ+cJ0jEl_|mc62Hz324 zd(hGK)>MCaC$#m9Q-bw}(TUjCXb;v$8{~q}*>`z1=aLiAA;ArF+EpKwv1O1Se>(ap zN|2`6HB=ltNI7~$phwou5J@jW5$l7IlS(pWaH5tn*IW-POgkv4tNYRJ^UKgSZxQtL zVIj;t&@IRsYZvU39%Ks8bFk{^hC9;nj1!!Kp`@#DU1A(QUZ5v+0Xx>tJ|DsKB^^v| zx-UG8`VZn0c+@I87BUSCphk`VFlqYbF!6;5mStO`yVkS6Q@+8>* z{s^3>4b3ln;c8_9)U4eN{4<5re@1-P)&@l?>!vjm|KH(yvC?gn-(h3c z*R^Gorl=9MKB0{{=bpjxJR44pmV}|seWB>SM<3OdCPF`)ahxvLJz=ylxPjex_d9#l zqjyUQ=e>~&?>4*qr88Y0GmqBR4Q0Ps6h?=2XwzF&6;1wpSk3N9DK*a0%Vke{W@U2D z_d5H)tXI{k>r0GNW%G^do)oh$tg$s_9t+t4p&2N8w=zz+C5F!gm!hG|KhQV#cQ8Fr zkILM-mTIqLGm+!RQJG>F+$u<-+Q>y@f7l9f#nz)kT_0e!$OI#|;yJbXiwslt*9P{o z3mCr{lHf7iMDjnd1u=8@I<$^TFgpWX!*BM(rn@ORPL#jy%sK6VU|KgY`IYT zn1zW!sAbQT7nTv1=U5d7_OQ1k1XtH3NL2lO(a!FDrbf#;cN_DaJlW<|qwI$Z(~P@T zGseB!CO7oW)cec3O45{>T# zPBZxur&GP_VtRFMXB^^f-hh5N$)K*<3TnlsB(OMDf%I;cQ9kDbsfoRxSa}zNq0Y1i z=_*B0dh$C3{~qh3OFBQ9{N0f#Nq(MSLr<6BQDC~Y+UY6Gp2&7e`m!5%9g${OS5du3 z#W$+>UNkJI@S`3%ucUg?)T4_*?zO|9WadSM%UUvu3SXFoRes>TArnqx4b)aHU?n%I!S{j~IA}hV$TT*gV!n|1 zCw7neD3D@OIuf9y<|vYsXn-}*0Zgmv119k6NhlH>t*bkg3X)f|sMMqXsEHa13bhx)zVy(P zkn^NIMqiD&{y~Adps7TiPV^GISN}^j793$x&X@rI&Jc6@&t>MxSQFa8p7i`{h*I+o zb}@~P^C4cCiVl>|M1C3(f?V@dW?AAHDEe@r*7H(z?dWp1+TCZIxc#p*x%Sfk)rLPC zuXTLq#Em(+p8KitH1~*%4%er)qc(8#21NX0YB=_TtmE73sSgy;R5bi5~ z5AFt>z#VTNtd$L!&Xs?FxW5}&T*>9NsCk(cW%2bbvsYJ&`Ex{pbv$Wt6)k}5tjuR0v5?ybuOrzdM$ z0An5yCU$9-wuyP;>HDYS;03=)~w?; z@C)5W83z;@Pu>hB!IURRZx)3w%1=<<^54wCoI)l$7pSwVa;Oh)6_C*SH!FBi9OIS! 
ziFy=iijqWbQ=5!=)DO{oX4kzkW~E6Nbwnu~ikIp`fx>!9f!lJ0eBs(K!JYtP^*{-f|?$XIQ^W-+@*%og8RW|2cP+wxg3pTby3rU zX5pJ_zE+Fi9`@#MdyEfdiP95A#`JpFfHit&!cj>d6=D)hcQ(6ef4#ho`0cUA!feLTTD)f;CixgSBf zOLAe?A7j)wO&vZd#WLY)pQz=_HA+e6(JFPz49=A^-w%NU}G6ydN`Y3Tav+Fp>v-9 zpgNHs@LxK=lquyWgq87!$u)lT-c0_k_HzE7Dk1;=T>*b=_DA|}Wd{AhEsj2Zvw;57 zG(bCSAT;x(mbMN^r=P~#(N`DzrGI&RrInr3IO}Y$(n{<%bmGW+8vAUf-?q)*R9@xM zBTm=o;9O}=pwf}INeOa4c^gC#ZOUyr^ zyjeMvo6X<)d-lE5SB-xxnRr#oUtmw&{$52T%%Yel!e5M2?l(b5$>I6}yHIFv&<5qU z{a`C)2nQ=-VZ-a!$ZO;UwU<1mQdRX)f$bk^>V-k-v)+8jxO)&{&*#G$k4wN=9m4oe zVu9qjyP2UKmzk&=x|6xyPHo)WOmXbFEcf|x)aUK3jIMH@&Ck&1RB50n^~JfdUShX7 z%zShmx*FUd{-X}8cz6b8qEOg*X%>X)T&6sduM2#qxG|SLWPrPc?*HFfpnnmz=%TtZ z%9-H~9d^lrl5e8OWNa^rVZ~Bs%8$aM|I~?^oEkZjKaH$bd4|M7;T%)n4Xj_=D!*uca)8SH_u_kx%9o zA^({>%Ve7uCe!A!nP<&)<+$durQ6N>RkY26E7i?)%tp*)*6lO*Z}vjmWDArRxF2nJ zvljhW^9m&;tz_j;BaAJ3DRaYL6sCXrNhz4Sz~Fld);45-gtH&a${k|v`_oLroN;PF z&Pw!Ze=3r4QA67QVi>DOl?;DmIivL76gWOFl#x2Fji|r^B>vR~&G=qMDHwf1E>%~d zqa_m}vI?NEHVeFGK0{vTHbK)k7t}@F;HZoT6JazRHTLhI+82j29v^e5oVE*8$F++# z&a^CCO-_Ta3-W+7yovd>S_bTm!r^;$IwNoE37@9MgQ<2J3>~yZpNsvFxzb7We9{wZ zZQD*=uK3F2Y3H#VmS>>g9fMT$E+v-hmdi}|x>jnjy&p2u5J#e0xzrXP8p5~5!nw(P zdj&|s)e#JbD^*Cq4pZV(ra{X3Wy!mdZ{Vz#M#-P*6X9EwW;aQuTsn;=TBpab!>+&1rBVv zN+UF13h&!>8GVVaL@~-|(Uh(EsKY%I=@3!m_G%+Fy5ua9bCqU_Q#%E+Ati!r<`9JZ zm&)WX?h=^Iod!Eir@-nHW?;wu!lbYND#$N42WCJXtP2Aecqt9{I@iJi`SYN1=06}I zUzlIry42KdMOO$ zi@}4&i`3xmguNDwJnUyYT(YSH{`=7Vdk0X#vlJxi6^b6M-7b)s9mxcTIWsYLlBu?; zR7P*`BlG&u9f8EOos5e4ER?i|UoU!BopxThk@n%8Hx?)^qE|Gg()X9FqW3Nqq5t{Z zXLsW%?7uVH*+#ohv)^;fjC*w)*v#|UbdZ(^9r^4s{n=BE!KN6gR2i|4@lS z7iLYtEemY$Blc9jMGl|W7j>IAyzL$D#O5o!|GL)k)>}+EV~^gpcv75UdCV%((%?4Y zb;kzq780JNz3`ExyLr21f>WfWi_R}g-3#(olA3Ln2R>}EbjuL6I@p7(^5a6SE|t!- z%9{L~STBqR-4>c;m-?Hi8|a!W`<87Y5w*zVj>UvA%VUYjJcEr!DvQ?{HzXyPoVzjI zL}h)q@r&F*6MEHYlLyZuOfJURoBUxiO%iiLOn%>tG~w&!0 z(83hgh!os^j$H#%w+uL)&T%vtbt9GnB?V z{;!g}(u%;rOX{%y)fU+^kHxsu1y);kvAZ8t+2!Q#>{Nj`MWrg#h(!@_=_ zG0?EJ32c%*Ao0~w{M*+)#Qgf9%gfk8P8<*W;Sm z9mcbSF#(#Qxjh~=cl@e3$u~~cRPNCe_G#*fPARaMoH;SHBEbNQA{LW={Y~V+y5V55 za(zn4y%*PSgU_4bc zIFI9|Iik|(Sp0eF7=5v2GuAA-Q0H0|!_3w(#yjmldAs9=lW}hq;n3_#vU1x==JAWV zx?!Ossk{x45o-2ypXK9@{~@uXr||%q3!eHSp#Hr#Xkc!fvq>5owYJ9LaB7Q5 zv)yS>TN7=taKQAMEy#JC%sch+33>cO33?};A=*nvVgIN2+VMH3z{9Q`-tIXBDf5S6 zsqF+nKTZwfUw{gw6XSiV4UR;t^Evu=1 zM0WX6T-b07&rD0f#`+lK%-MqOO%}LZr2t1?KY{F{Y*b#o0rh4^(lZllvF5xLijSLP z9j{(?C(OdorY1~(Hv&)oD941RUc9qC7tMwX(05fnwK)GBeyKl(g+3CHKl}g&%qnS7 zvmUSWhdwc|w}w3t*I{6(Ii8zY;dXY#6Z+Mqlde2=0JfJtp-+!DQjYv&yvj+!#k!;G zhR@K!a`W@lp}ns&rEp5z2 zqaSJbYF-q6syl?@MjI@-zYpom9jN3n5=~1_p^+4DW_cWL`nQEHI~YzkO&o_)%k1#S zr8u0vwt{SL9>HJVtnT0tU+eIx{TMso!Bv*d*rn{c_V(&Y&6?Gp?k;2Ie zopp%6BF~I{bxSn+rs6?%1$n}1+wI48tO#SDKYNUI_vay2@%mZS&-8yduD#g8dXpW= zKI}LQ_9 zM3NQfDw&-4Ml6({C>dF8NT;-vTd9|JkoTOa67)ni7rv!1Ox!T0mfWljC8E!bJ&#J8NSD=_ucVQuuc7Ia0s zvGz?&s`BjAV!gfYDVXIwPf&X@S+M$btN>yh1>-p_j-0;jr1q&R&q}xi$IPw5>d_ha zG35xf`%2*FqG2E(Dvx(H+OYIg3_c%KL+1E3!rdjWKwM!2i5qy*Da|`@Vqp>F`5U^+ zbv_0qZ5K4xZlRlFZF!ypw}{WTO7NfAK}H`_B@3rT;`STY@!_z8bj)mZm>ggUx5BjP z^sM7JBEkUeKgW|qwZps{5p7g*s{ju*m!h!_K#R&)*f?(vF*|QY`J3Wtkgg9ra$FBS z*>${k8pcF&(+Xy7#Ri(`p|P7sc;%_N#o7JGKadD?&k>a30RP$a15LxDYu;%y~2Um5{_f z7k+fzD+)UuCX!|Oxk{C}!gW+4>@%tnj$Y<2>WsL~-DA|s9a_GcTRwk`Nc&5n@Wji9 zT%s=JURpbgTgzoRowz!Kqch~^n!O5nH1oOEh>7_ z%^FjrXH_kw-`$RaX`L~!|H@Z%v@7^0qYVN*vl7eP@!*9MHRJ7qZrwlNW0? 
zFpu4ilJ!9!h{Mk+##2uY-bcH_;;Xx9)^mCK#U>RC9_yn^Ofyf#N5K4P^o0&q4VhW) zKor!xB2%(BpTfH(hWKZi8OfYl zKrina!TV!bNyom=aJ!=MkW?4SxRZ$Y39Rx zVvJC|eg}C!?3(B^g~LAx*T|~a#uHns2pGMn% z498#YZ$uUAPl?a^=822)mx_5ZKJU+(a ziRJWSvOal1T4??1E40<8LFlPCi1+Z{%I4m!!rnA#x;hiEE9yN-fhdhLz8+YTx3^8f>!@>

SLi4V4x@zVzrf?XWXEAt~WZ0V! zqc1$}gp^cn*XTU1x$$|SM~bNCl8+Bp*;tQzSf!Y=U~tpq)}kDt&kN-mhh-U@C-ajz z`3p^jIdN$<1y32_+J-MRa!)>TI!_yMHH_zTeAnf2#^mO3tXKPREA-jiLCJil6I&Ni z+r$sVccv>$Z`ue>Kg&o{Um*w^8+dz6pYt^Dd?IGza!@!o750mN(q`2hS}9}rEhUwV zIOIazpBXRd?K2!)G+iF`nk~rn-4--PX*69ft538BUUaKFu$6QkvqGNb6mm?ljhue4 z4+3KM%bX+eG-pO2*`sq-qCL|P?{zK4zWgvuJsV2b_1^*BroxKfK~x}<;q7B}ki_yj-V~cU zk{U9fG(HW0?#CC%Ws6j|MxH0+*9+mR(=3Qp@PUpo)-e5tE}UFo1N!k+sH^jW?hp>S z=K1WV@jHEJyJfVi%dM&4yu%r)0+OKSkP&>mC55hbZTLqGnI-A*bm;nKnOs2yj(lBB zKNp>+0UMoggZw0-oHGqq9cZJw2O}_HC=YL&)L@CZB^voYqbsDY$ZDF2J06|JTjvgA zS4k@=sDVy8W0e?$zGFo+ zOUn$dF3F&8ZkR*vC=R@rbApc+xkQrollk5>9Uy z{{+tNCUObX`uZpD4${KXs_P zi;tUsucpWH+v(3gMyO($iOVZO@p)+q_1?aLKU{63U`x$f!NlBb$Bve1tjT^B0`1Zm ze*4?M{FLDdtfh}b`Qhuo@l#7*J8t&)&hK+HWABX}$L{~_#lNuk3d?KfSoWBuDeTh& zTUjHo+6oqA-E}O#9Z#?kl1-_W)?>GvFTTM-(mFXqITG_R~-)8B%*s)N}6Bu{}YU zx-XNuoVSN-=EbDqS*?Wsl`By%Q|0afvd*UfyMdI$FZt?Mg z7|Hncwi3rF?vllZT*>!JuB1g$E3r?Bm29a?lXRXhmFx{WBYFOAK+^r3ku2ZYDB1Y3 zT2j-|D^dR*FSv56lYc+MlK=f@is0x$Bi0hmDAwOxKYs50V*aE26M{y!hy02~FZk>8 z_VQCEc=L0YFA_YjvKCxfE6?KpS;bE+8sh6(gb0>Jw(y^=u@zK5oGTd9IOJIHZ;^og z+mZk9QyMBOtRw846_6#(0Mem|b7V}9($VpR)%VWDt^6Dl6={uIdc#Th=P@Kdyp$>~ zT;^K-Er2o|E5TCsjdz?(b4xg~wl071C3-p}wQgflCRuv@1yyqGq!Tt~;l43^nN!Xh z6=XfLcV(Gm_FOkA$jZdY)x)V$a3s%a9d)ygzlr`&W)qFMpGcs*C9StFpFxmnH=GH=sIxb3d; zCj}2O=)W8tlCBs=Kj)pNI+E@5D2_xMIhNbrxG3xvY(cSi3|fAFK<0imB>VOh+B3IMk8!-B5PcX``I>M}xYMuw@@Qr4L!Yb=P^;;)AnidC zo%(z_M$6^m`IEPCUF#?gw#qwwa!{_B62G*DrD;+#>vn_FDo$^Wk9GOv#AdU}&pgwe z+*H@qEDF0)V;r<<@^dD3^72uQm-4*lIPIxeBea;bRQP4Nv9R8T)Kpn!)O-vts&T%d zB)oI%K#fmdbIrbgkMZP-ZRqZ-2RU}>usyI8j{8NUn}aGdxYQ9oZnlCyt1}>C3niP1 zM!}5ts$e#64Y}6o346^8$k#iC#L3PWbaiAslI(AE({e-j*y_)`+Wudi!4n(oefI*F zWk#TY84rpcHH>n{?YjD}sV@62%R`~c0$$fOd(0epjn^dqf){o~39dIUWA3$k(jEF| zc%rygqSuoKavoN2lQRJteO$l>xNut9LBc0!!2RMszZY`v zO~R3%9#NK$I+<}mgE5&o5!P9_GYINNtgZAY}W`^pC)aKpNRCVxx9*g3ISUNi$-KkVQY&ru=lF?!^ktWH;Ymrdt1 z$|HY?Y8^+dhAgdPxvve$!`|-oc)a%>p#}c%PvqhLWAYlZrd!Y5{NFD3C*~IJjy1*b zK57mqUz-JAPR~a3$_Bh|8MEd5PfWY)EU9-(06%KJ zK=b;M7@0GM`f0f_wAYJdih|&hGy)!2zap>aSJBLax2R8Ss_O*3bO`?Vk!N1NoSBqz zk0y`#K(8xy!*W@~(M--2Cmx8xJy(a&&)7uLaP+pAd(%ri`@flDx!P&sz)OdmLq69z z$M0V6+!gQW`~o&Qw-rBkKDU0NOYuOB^Sb@Foj>SmxcnS(-ML;_*~MCJzq2QE$N90B zuM79^J2CfI6=R7sg#Cb>iGUq>X|FJe6GUQv|vK8F`kl|rRbc7mtl;;Y zvt?g;I#TebTf%oKCW7c}ZN9j_fS)jBl;FwGF4l2_f2^Q%MfPwRLYf;rT=1nPpMQH@ zE&qeilC4-JFF2fVoiC2>;otr=4{!DTWHMT0=QNqqsQ8(_tYA4Df2tYr19P`^?@~L_?YrW*7?%%k~+HTQy$!1VFGLYPmm==44EGoOxhMy zK#Y$c<6(b*`7t_&m(clyH=uPHp1%D8VW&2e{%`{-NLUQ(x+);=lonn&AUpFe--r4N zuBgO0j{R>caP9|1sgC-8ct89(Zp%A}Yc>y)?i^5*hRk_~Ix7sN9|OK&>$QEfEh3k0 z(usDn@qfdF-SeZ*$2;JbU4@i<8AG*R&f`VPd`pQNE^}XGD{%!KM%)8i#&MUHuI92b z4|6#kOSz7|`P|{HDct&Xq1-P&1GqM&^SGGxhVwb`95?X+;p&}=b&6(Ib6W-PoL2vy z%pIFz&ONl(l{5XY2KU1oH||<>ZLWWnEOzx;A5K|MgcFPP;J;o|P&b8#&wMagmk%?j}LrjXouJQ>VBecJ{ku$I&^*6ZwL7NH`EvrT__2Za zPcIt|Sc#dT_AF9mHkdijjTRnnP1+H<*n;xp zt8!FU~r8a1iTR$k)&qzuN2Kax!q3t`bP8?c+MOjGB?!SAk1 zro^selMaC9md$eU5sNcyre0Q+?dgM z7rv)@{UQ}AYSuy1#Y`}M@RoP6cDVC4Ubu7bXcOmr z&9TnqU&5R-PREI^P4^XV^zjyN>kSkaK3pY!NP>l1?4O9LU1LO(cPfc@8AXd17VmRj zC70}c@9KT$BdfU1b0V`uPD{kjGpj9JwBu}Cx|b=rlmsx&p?8D$pQ0&WPA-P8bY?UE zPqQ|AWJ>^RmGy~gYNRLF?DW81Jid+JQ4&!-QEzIs>4rsoeQ$gI&j?4>w%AtI<}b4Z z=UR6(W#rxTx+Y*kB;yM}OGL`tfEXHBxgTo*y&GLYL8~`nM4E zH`UYoTbsxy?q}jDmXd~G9+nqyNr+GbqD&?ebS%L{zX>w#@f z7AX1sqvpz{aJOAi`m5G}yrkLg^1>o4S#guxJ@6QnyerVC?1=lHu{zT8X;-MPQy^}H z*Z6DJdw3J@8th+GcbFOd1+u0@cI+&8#$yE%Q((F5HP3euyA*U`<} z0|m0Wd*?2DR3DT><3JB)pc_!FUzL>Y|4V`={HwisM4g$aql51jbuiP?kARVr1Gv(o z^xmfl*b;deRn9$tIg|5X($cNC{ZubKlNN*1S7yQc;xo`)bDo-M#fkct+l%<)*NS?z 
zI3mZ!24VRaU(xK7GepqZE8NM-5-vUeRyYtlAmqih3uBclgu(Nwgz5LXgpVwS)%ZER z7iLPfaMbS~5uS{$7dq~|Rs;3MqU&h|H4@7}AwT53FofRY&CQmFg8eCshRh96hK!#U2Nr*)z!lZ$==Ge7_SyG{>_E4y#$=6M$~6XGI?GUi>f9HnE4_M4$q&2n?4+)YTD&!cp$S@Ppk_Y7I%>|jZcW` zG$A$UsUj~OM?#OC8@M0bhDwqJ7&#Tl!N56iQ^qZXhMvUjj^XI#{hFBeG{D+t|Ck*& zW=iF~c2bQk%l1NPgQ)B*ESpbX)va=4(7d&1%Z&vyxu7 z)~aFT+Ad}K@0<_bby!Z`b}U1SD^ZMi#Wgx@(nXxMJ(m1udI=vi{U9&Y<#5C3FpR#R zD?0(+W0Ydv5S0~ut{4CAqW|S3Q9GuZ{v33Wh4kDs^iNgHy^4WVc)znAQBHX*-&6Lu4id?HddK zO_SBiD(=wIwm@k}i4@8=v%t>fETk_nhGs)g>9335$*-V2P!(xS4K}5sX2>h1@|-&g zzI^1_-J;)}oziDqWLY37`A4>7MiBAXnxnemZul%v14@M777^ z_Py>zd~hSC{`y-hsIDi57>m8}0$$>SX;9H~g0w2uGx@$p=)N;LbgGFTY`&(92F)S3 zae@`5uDC-T)>*u{#b@(Md6{)vK$QKOKE3@A7s#APnUOLtRoQen zYI_`27WU&N-aNQh=_N2*ttB{kB%d!_Ya=+b_$Oca%29qtbsc}I#e06Fp~UfR`zijF zZLb}w&0GW$t;K?g=c*i98w~|xjvwb|b;LV6i%HCzA*QaTj#|k(;kW*`Oxk~9Uh=G1 zdae9=l0Pe^Nq#eSlG<-_&dOSRr#AUb;^X(z#AL8Rs8zvoW{W|bdHbH&%juw4 z%VGnkxIa@|ocTe_x@Icg6H_eGKc*wL3@;HMlcsTMmOOBdT9xR`ls)7o)oBST?5x-+ zVXA_0rZZXJr4!iIo1NL+hsUrly_DnUj}PI8E_}!r{nXWxCR_s!++NGOz%j>>w+rg# zF3}{*&+R4NAGe{uxCG1U6KUtYF&KJY4qYEE#djzrjWUC7h16O)tMV$k%xfZTVbh^) zS`u}C9f}P>4rFhfker;ch)(f#r6vO=7&6j{=zmS5afUIZDe(yTG~uX=XXi=sxtQSS zv>PPR^eJHn-KQ(x&!s`1U(`i>TZe_`-Jn?Jmb-E(o_8cTA3Uz?#fCsg>^BGo)2})(e@+v{9GSOD<-f zbNb4;FT5IU%K6(7#4&BT=H#lDz|n|2=F~3gbz;I)Ih(VN)bK_x=3EWo)r|kUKxqAM zF1uqPPw4+WjU&F*?(}hXx^QXLeWB)gXJO(?1+MwwXHG?HCJB4Knu_vmEQGpA%c;Ty zQ&$+COb+<%fg0I9zhtce(4)?zx!r?Wyx$0Q+40ceyBx+HJ4@1@yO4=a-B9Y>02yO~ zp{6C99_Q&}>G_MmQa6Rz{iWo(K96kv^MtwEJ&}GmIS>9R#=+>l0gx(+fev>s%({68 zca08$g(E`1MvXWhj+5!FfA+ zuv51WyF+%_!ffpc!>=+}b}=>mBJ zoc2vjU2AWW;QOA)SZyV1j+`d%Ke!V^y<5Cr(@c3=?z)4o#!Rq2bc{&9cDvm?;0@8< zt#J3bntL7l0)(wDgxcbpOkS@VId?A(de3&s{#QgpXN&@e(aq4~Ujzj$bKsCH+jMBT zCoxsw!}%4f;bDzBIkg(;8M+#Zx1WVK`8H(zQ(XwVy_u}^ok;Ek|Diq{H#&0u6OvQ8 z#VyR~4J~_9K};12N$aG^csGA3RjW_Mg?EPG_><8X9a4g68)Uw}Z#Qt2tgcmR){Mfg zYJBqT4L0qagHsKX4rXMQxz*uX5<6wHeT% z@f$3!{($8MyC8RaDf6*?CFCy>Ku&Qe@ipT}dz|bw#JlDsbG^sQbp?WhWtsIVbN9efM1iQIwB;SGfJ0CIk zfhHb}&?4@aUBUmX9F%mXV~mFiUW|W0x~^V=DawA393CR`P>#R~;!Z@H3&^q5I9q1p%?vnq(pa1f= zqx`EYeE%^s*mzTpovoO_YEHLf&wnzN9~85gm3+^Uoq5;JaWGrVf3Io5c8R#d&(u|6 zmmSm;Bu={TXxcTLopR?ybC|);`3~b+zqM^Cwh!y$`$BA4d;M zPhQ8;TJkgaxOi?5M{GB~P~>&(oyh3)QL)CrE%D&N;o@y`mU1eeopBl*Gfq6h*HE-) zrKj_T>X+RAvV|hGUL&z)Mw_s}jp06Cr|cwpUm~`Dzet>Ny;0n~y-8eQ(ka#p`X!bV zC`jsTWyOG{x~!F7z4(`Q#PEYMo>W)Zc(dmI4Ckl*nk1O08^gL`?#Al3nZTA!PRGEYheZ5=%WdqD%L!n?IE5+TR4r|oPib%lmr^-@oW>H zFg+X8T~!|K#zjucVU@uudUfGy>iu(o34hu^{sf+r_1sqBtW_Eq7#T}P{l@~gEK6#X zJ|6~lsgb$TCK|WUm$XHhl8fb4Aau`%CkM8Zkl800%?36Z`S%D6zv2o-74hU0yMabc zSOb4wE(I_OrZeY#!=J)(95$l>R&A|i#(X|P7d-N(SMxJ5PB26c5(P~C>PTMhjfOjH z7L0gij@>Oo_}ok%ZC}`d<$|>!ytNm16}DsTirqN-;YqjD1vn%Ue7&5;@>W!l-${(qufZ5L-cTw-Mhsi<*13G z2sJlR(|&K!%>#2p9WOr%r#%l4{g=H|bT%MNRIK|?xco(g(1$fjG^Km2C}GnPk=$Q7 zkyO2#dN+81`tf-(Pm(20*>?#CtDccPX%wusZ-QGK34JtQLpqi}9_q)>goep;L9$hw zu*dzzZ$|5A%Ct-5@p~2EolU21j(jXDjl*426G-#>$K=bmKDwas1t#w*$7xEsm^82q z%#+iAdF29c>rT+S|F)uTpE0$+hvcyd2PXZT$cxzameDJEK~AY0CNB16v`=>tSoj)& z=wq`i->e3lq8MCUl*?OR19Za3N$_IEQL=v~pO@2bN;zHUse)Y(-Bf&pzE~ssE^Nn> zIaA-$!XqyDAnOM2%-&O^E=oYM^*8d^Q}@ureWQqp<|`svISH<{-5|WP3b3b131T<4 z6Av9vl6b$DDbc@6ei^NBb@E;Tj#pK1v1Kye46UVV3-sW0>^a`W&GBe==^Hu!GKu(C$82O;fb1^n0`fzyS_Zm;8)z+b&)COxr_Xzy-< zT>YC6Xzz@9tw}hHCwmv^A7E(|g5H!iEcmD>eQWg@D~^vMR|mA*uRm6k2FvP78f{JuHuZK6i`Z14f;8SS2Dm4MS{SMMB|v5n+_W@zL5seDyIM+JY|t@G^+a zwVpb6VDToPZMA5 zsTA`<_KRKSTo8L_v&1{UD2Zphik9s&_TtD{wPKA~CuF!oyf`6}NDg}kNM;%>mvpQ$ zm8{?@OE!5mRnOg?&mS_J%VKF>X4RWd;YSD)s-NsS>F`3)l&!{jRQ+^)4NGI^4OaJn zJZpEFkY!&`!Rmfl!n(AojK5yPpS7r^m~XmPhy7gen=jXJnN@f$kl(C%iLbQyG5>Yq 
zY5uh5nzT*r5ls8y?{4ht=U$a|7x-(YAXj3_Tb0L$Pwz4as`X++OCJ+IHd=PKUkWiv z!x24t(R-IBEvmF6cJh2kpSqfG=Um3g>AGN@9SR2OC&4oD2n@yErjzv65^j--^pL)p zblAQtsM`X(EAPuuuKxy_JOdQp(jbe&3dwBw@ATY09c<=P=EMp-K3$$wtKqp34+ z)BZ-H8?g+W_ju7Rxghdh$)Bj~j)u^ivcB=1k34<_$6;uq5poY1!h-|H&6YniX4 ziNCDq_qQ9#3z3LC>-)%?Z9kDnc0M60edh?fet;R`}^in05_t3fV~gpTl#=qk=R{p3*`K zxAjvi!#hk)V<@fGT!Q*8Z!wM*p>b&yzML&U>vz*p*X$-n{1xIS8PE9pHleFxd6?ra z#k?DPu=f2BbM>VT9GKKh59Q9ow_D<|@9;&sU|u}YUk&8f%U1f!tC1P@QGs;Qv zQ>W-7dD)s=dl{BXykX$NH)4+|%TX*S=qKAuqgKy4CO}g^;;brr9aFB+2aYu{+mm;Gq32Agh~>smP*`jW#i)MQZhL| zibj`DBplXl!jbtbbUZdwo%S?LuUdlBJZ;cVP6^L=2XT+-Z7r@YTFxb$?Lt&xeJ#&8yG8!9q4z_XVGd=zg{+X`<$uH z&t1z!OFo?!y%xUZjF`NMtF-JLH+*{$R~T*MoOfuDd*1&G*Fo4IN|~Q5*7teNUD=cF zESc274M}tpws+jG+&iEy__EMkFy~o{_-Qqs@q(Y*mtZ>@{@Dr z`6;`nvS!a7!>j+AfO1tQ@zR6aRP@jeB1Na+pm76<3Yx`q$YwDGPj98rwgFz=oCKO< zkCET=#B|mkO&liL4jxk-h_ymG&vs`K9YP=u*N5Qj#t!24DHGRJy+%7cC!0@Xi;lTT zc<#qaYIm%P?${lTN21N^zAW@64TEx2(_fu#Rv1sbW6Fr_w0>~RI)|pw`>|=sDCXC& zi?TUhFJz3Ywe*|wPAaxj!p|Cs;HUTtB7#DsUTG>)-?(i0;xZ3=54g*+cIQEJ?P;jz zbim_GD`4+odR$EXc+-l>@Xt z#EUFY^TyJ=KX82B80iPcN^F~G2NSea8ROj(iGNKB`B1rxsZPI5OD+Y%aNLBd*ZwnIrk6o zx9W}$bmLY{WvQKCJw3W|k|^$uYRx$xWzpt;XN2V=Z_4J;Gzla9^Ef}2;N*S76F6@* zrJR55OE}fzg1Hq1?p%qkJ=dyn3RkiBEoaMZ0XK2$V{T%*BiBuJh-0%monvUFRx>vz zj9c~$guGdd{ao|!OW7R($G1GU?-aB7PO@>{f+Y(<0;Qz3k2pTyGCeB?7P z=mL5JjDL2LO%bZnCBjA|Srbr{s0>s86VhJs6W-;*6Oh-ZNSfa~mwB_7q0s*nN$I%- zlLq<;R6k_Qb|;|D?`hC>yAgiELtgv(8F)9U3YRv_1?$O2;q|37u=sEWCI5D#BrpNk zy=UQv*JE&tI*aB>$uRJ)gsw?S!wqx_dMwhSQ_T$^DWsC_>g}UiHkYaHm2B8ya)C@p zizlDcjNnw@ViLK3I^^FsNB78HUO=olFe+K}zu7D3fl_<)ho8Lm?3wWOcN825$i)fW zlj-2{L6Wuf8?o)YOSiI*(|;$A({)+}#Oa_HD93MQ@;a@k`rZNBbb2&69ehu|hrdOs zRRb^a_XPYMbP<+4O@W1uvN?-S7tll7@8bS;9)$Zw1N-p}EMU$$-qnj_U3}@okNIFN zm|$AU@4fV|+D`ciYvTBKj>&6ytnu3%*qf5-ShuvIuS`K)VC99yc*+3EZ5 z@eK+_2_haE3ZAKbo_$7JDL~mg}V{^+D#)@KjW<{CU zuOnmd3p40w*(`?97u6u($7&pFn*q5kM$S$^URufaT?ShCWAbGa2- z?Tzs63{!Yt&!9sNxbJ2H)u|(`x`tKWm?HF6umD_>U+$f#uZ|9Cz`)fU8?dQMe zA1gBvv;-Ej!V1bAj!u|XUHEklzu4+JUr;xOZBw8n(BG!U`nOcVx)7};h)Q?jAD@=! z*!#GYwP=zt`|}Z1c9`3H?2*mb+#A};w4M4#XZOdWoYONrwCWJGEICfvJcdY)dkuEY z??JyEM{(ZwP_n{w8}Ia+kN9Gep7c-K1#Bv=r)uGcao1QA>E2u;>G2mT(q(O5uv^Jo zIuPwGwcM>wt%6)(vd=^~PP%#5I;xn?y{X{ns13|w7FY{6@RSd%gkHsL+Aqs+DQWv- z)yI8wX2>gEm5DYQzM4*{Zv$2H7)xp$9Z14K7CQb8z(?a9=*Ol*SethZ2M;Q^D`-Z* zF70Qa7^~newv~6M*R+W3s`Y3fH<|JjTAAh%1rVY;4xUJXu&YJBIwWW;;$q^B-L#T}D#>gkRD zN~mYpE~+}!k*Qp9i#{vLpzqb(@L$h9vg_+<`pV}KlWD7uCgo#MA=e5Ph<%wqt)nQM z*PuJnRNc&E`2(E`0$zQ46Af6Wj*cCpP&IQm*UM@lcWeKAZt4BGoK<1ZIQI^Eb5CU+ z;(nT1$EjMsux3V7B`0gsF-}joH+MDLoBKU&?xl6ZwK-R-w$z;Yki_m;|A0fRj&Noq zHE=4oHOr>8Om{k>XvMKwZ^+fV5yd@KiM-j7KD_WzY1AM_T-Q0@l)5YpWK`Wl=!t`? zNc1mXrqUya=Nf;9Sjjj|amsEm-Z4n{7aZYBlo2RDS_b;tPQaFVdLVcd3GM*^dryQx zpOiwP#~pZf^b$-Q^#m?-mcnfvNB2PgR#3UD?fxd{CTULR)3nPjke%5{(5oD-TdpBm zcbAfli++*g2`b=v;Q>7K@Tc?NZlc3}9wz%07Q@lFv9NjGZn!0vS~t(?A@SNe8D?y~ zPD;v>$fu&uM0uVy36Y+rQ}-H64eo3rPQ-$j!7m|B7EgE<`h^&mu7{0qjQ0Fb(U}H9 z`L$tOAv-N3vQtWl8D<7^?xAe$CH^XkO8Y{oRJ5RkvSrDZNDD2NXEICX+@nM(B`ubc zv@g=8MSJhO-{;#i&vVXw-PiTI&MxNbY?_33mL0^z-+I7y{wjM>T0$LMD1Ga9mTtE? 
z3NV!67pq(0k*5aO)p3hnPbnqQf#bk_^-1!_;3}D63oz__HeWZ(pQv1FBxw^@k-OW1 zp?D4DpM$l0ooFT6UUgBn#%n)pN~s0+7hJGUTM2pVOJFb}>~9zj7Z17A>OdarFiRF2yyX9keDL`?O=eK@Kvpt$3~C&a$Q=69 zVAV%YdX$$(KJM*xJ3Q5a%$*tx*EJH5U%69umKek7lBLj_r^?^LZKoTWyzu&|F0{R6 zB3tr}CHr3A#Iygq)>U|HU&R*$oeXSnjv+OK0(=eM=-Tj@3*34vGvC&}f zck*W~>}3m|G>&mJAJN3{#2UgHEeGNERo_{oRGe5F+M<~HN2iz_QR%GSGfAwvc}G}N zpImqB+&rh+b7)TW=PD&u=j(1(R=6ch-1L>6UcCnM{YGHin~8Y%wFPuXYGH*sV0p(s zs#JIs6ZOO3VAoAz;eW}^O=}&0D$fJ`vZ6@2TmC`2Af;3=MPP1FK&%yC3csxug&dLoOV11 zx6kG5Ut8kYImgr4`&%nm|Ds2-=i0lod&8aCS4MfUt4RsV_pS%?Xq#?T6Mq6bH~0Ws zFXc6f-8Ti!tExcI(g(7AOLM8WB2((_q{#WLo=1oL)Wh=Lf3j7VU1`I@o3hg#Q*p$; zAbgp7fli+iM_ z&^XQsieu&ou5R-ZglJWe_BREz_LP=#?;n*BKG+A&D zUBU)oUsxR!>wX5!@N#H>@)UU`c~ms&D1@)v2cuOJNJ9JtECe?FuT9}cSsaLq#)(mA z>`zW@)__8i4vS-_;NbEa%sd=KTnDKmdw2lO@>zkB@0aL5CKjh!RHKsOeBI`jicG*g z`lEL{8qQxr_vi;pi#B)At${_v^mG$BckD3Lv%Mj+=3DVgCm29;m_Bu-ov5hY#WOpj zp(to?ZHdcO(Odm=k>2~cVso~iIKJCQbY<2&(T2vCBJ}GNjh$X48s+s#6f%7x=kc9@ zn&3PhXJ!6mHZvktw8Hl@=fB}IIH7YBxaYa&I0`m+jh2@;hprD4oo&D7wBm}fs7rW2 zjLCBHAk-J=o=$S5_dPLPyN^Hm%58MIGmV5iZ6!fK^=B z<%8)U`w)zK2J_+H_yAJxUdeZ>Nx>-Y0GTpO1DXb1BG+3>=+Mx0u;_XP{CIJb!!R;hQJ!+wCYqLPrN*PL)SMrx`y-KgL+KFCW z9$C4#>Vox)4Uogw&{y_Lp=k6ClG-$vinl+ZyZ_9h`5`~3$7ClQ`ECSi;bcr$wiWLh z4Z(V5Deh9_qh~%#L4)vMtn*f*+ZC*n;Sbltciaa#TH1nc|2p`!(}gY+Jg28_`{SQU zajs+6seu_Opp$3*2Zc)WXm{rjcr_^v;##w?Z)Owiy)+WHFt=#+nKF#H!NHpPP$D^) zMKgOd=zfQrG_GSiD%Gk%(Y?uVG)o@?7Ei*D^~+I{KaZ$YZH9HO*5oKXNd3!>(j{g} zn55H+CSI>FQ^64{I(LjblRhR7P6eQ7L^`UER+i6kiU;yw3`~-2rxpflNf~P)8gfU# z>e=SR>0mp*_>gz$~=TUguc{9#CV@hnbc<{z)rK^cz28~p( zS0)WEq>C0mCru3s-^q7BNPB|Rp{Y}C6Wl|Mv2uwUFo?7zDt&YjQ2RjikBD>#B1(J3*9^wI#&ARl$CUg zMUX_Pa0jhVvp%@NFuAKul5`?2~^=OxxPmW|_dZkc1}Tm$Ap#$8szl^WK=e#(TW z?`4j5K4jW`hY45yu3_@-FJ*Zi&tgS(eXmab8X}Cd7CUO3JW?&&N?B9C=nKnEmau-` zGZ6ON9wT2{SPoa21<=44!8V5rRMn*lQ%VehaFKtG^@&7`2%)ORfhaHyqw$8}B+`8$ zWEgG7?TdcW4M!+#)*evoW3#CDFNIdK+Zb!o72XrkXa4oAR%G)vPgq*>j@tA2nBCMt zZ)}_d+KTt{?>RGEb$=XYzgi?4d#su$)JZ5Es)8+1#t?OW7(FxE61$a3tF9yrE> zRaX+|kB#wo@I)Up{OUkn8DH^D*eknzaw`&nk>F3+Cn}X@;4eKP7OpkHj4RQw{rCdB zFt`9`6tz?D1L7PP+ zq}LIu{~W{4tady-+7{y?Lcm-WEWd27VEPQ*3&yPP3bsoRbe{Tzd1qSPG&=O4cl9*| zM{hH{wl2aiijL@;w)M4;O)z8Qa%d{nz();=8>s1Bd>Wz*>GtD^ z#k8BUIsFV657FU9YHD!X)JAa*)s5tiDhcI|l<9E?U)17`ERojCAGq(_JlomXV_h&X0fGgU`QXiWj&0)d6@tw?OG3QCM#k36e4rHbOkq9zeJ8imDF1M!?VYE zve_%E@dL?2+20e?GRuQLW2e-vES2$FM;K!Kf1WrzDPCq75J@$^osfB5lS0!3S5k8^ zl5BVQf$Y9k%uvyUT_-iEslpJkF+sud)*B@6{_c%Z$fD`bJLtP^6`Jj;>SliKm~2O@ z1~`qerw3(wWq;4tlZ;yz_=VplV%vsnRE%W69KBE5PpG3-$5vvt?+Jg&FdbQK+!p?N zaUOiGJ%sLaEwIlopFE!KMl$|JgU|gS9GWu&+(qNTU|Kqi=~@H*%w4J)GYjgt2gx^` zrzCG_1(eiJCX2K`k#%fC+-OvUkJT4L?Nl*7*s%e%?mw6P-na!qKHEaXvKGqK`X|e8 zxkggW3&^HnCg3*UL2riKkfnwkmi29ZOMYGtqkb0$$Tvmzt*o1oElsba|8)dl?8sF7 z`^pP6o~X(?q9y#lqF5Naw;rbdTn%wkg!FNkF50PQ&T}}WmW1l;M z;W4@Hg?2mh6~6i-xMJ7|j92M}Wtq)%>%zOTB`uzKs%oUX?bt~6+k?hJChVo-_}$YO z>&{fhgHd9xc1&WTmMmekbClUu{Qa!@kY-lbf5z+~y9|ZwdsCRQoYBnBw-wBK!;96w z9w#x;bG6uB6;GMo6}CdF2gbtwP3suneU`$j3%Xf*+P1*F*IH2PdXsjvd!p|SXX4{` z4AOn-N#*OG)bCUhZoDuSS~pdLvHu66ec?87_f4aE0Vhe$xGs2o*$im(NVpn01=rp$ z!zsxT;J-wnT{q^FVafJra$g(kOg>3oR@zH!%4Ud8wN;9vryZ0SG=Ak|RUhH*h*IPA z#LSTt9aEBiHC1?VXAR<}9x@PH`sPc9EJ~8(Fdb{=Hb8F`V*?C3?nn)Y@((v~HxV^D!ft znY z0OTsL*L%ET;N}Ghf1WUs8ZMC%wr(QK3{vobH~gS!+G#Yq_bv_CJ`duHj-W|+9c^Ih zsNNjKK5$bF*KoTozT5anc2`Fo9iC0XmELmtaE}q5?>k7wyj2GGf))HjG415&3niG} zI~zPIMksoZ;>d)v3FKUSKA{=yaFMkdl-@d!*vbVEHz0&foq3=aM!;dJBN+EwgAq<< zP^+oPOpO^w63k68*yJNW-PRZyz8mmk2L;pY&2@C&W=*nI!5ST!X$8Y)ETqw9RkTZY z5Z`-eKi^MrJ_jA0LJw%D(R-z~{6FDK#b#p4L?s$~MfcK|a7JAiAy)S&bzWpX zUvz!3dyPad#M#yNlyh8NJSU=-!|{IkjPv17B&T|`H|Ha_nB#F_lJoo74>@|Hn^-j` 
zPdc|=zFBkEHdj=CrK)CrPlm|$OAu-J_>V9HA&?i$hQB2{$ntbMEY{DZ&aHMdw`&wV z8<2-jhhDcD_ROxnvlAisXVb>E8herM29vzZJDsMA1SuAzgd$4$784Mw^lE@TTfhS}h)+Y4I-jx3U~_)~U$n zn68z!I@M6lr3NY-vH%Z8Y{rX$hv^Z=qf{+K70+vgV0nH&O`g0Q@BCej@AmD-M$<6N zoX^1sjStj1^EJKg+lH;TW*})uM)A&Ays};&S3NzAdrI@seN8r=cU8g>f0S_DCxYw+ zp)}zAW@>CIlof_@sIO%@EqEAAqf{rNUBh|24NB-G-AOxSZ>N2&U7%?% z6MlUu3tPHGX71lb{Z}}V4SgS|Vc>CUpe*6Lq|SlnM}J6^%WLYdlTR|DD@k8cC=K1R z2MXU!B4ZuD$@a9^qUxD!`ja^T^jbb>SB++KqBgO*)_AbTz1hz)TX5geW7IX)etTet?`d)T;9JI; za;%e)AJt&11l6+!_8PO#47joe&mG}7Gro`IwQ~vU=Wt75PH`@a^`(z#9H$}}v~U}= ziv`d*#|m^74)7=RI0#xo#9+oa!Unq~3jgyOVt(N+v>8T2Y5gnyiBHQ(Z%IvUz`<+q zrh2Esb9IA!u1O*ur+xT|t6z}~p*FSJ?IyCFSi~=#7f)!GfU7-Y6puFgNY2J-Nr%rc zl==>?;BN05&YKl6LcGj8OtL6_u6RHH6gQ};iuZ5PCf+WmRlLO}HN4JIL%7w`GkCW< zm+ z^EB&ZSR?Cc$spmMF;`hJ-+LJ8)L|_9Ycj^K>;-JUy#j`MtaQ6;T!5bkw^0k-YEXBa zO9IEu#Rm;BX!RVas)Fxk=9Nfvy`4#Gnt<+$Go<^X2FQE&!LXv~JTwoO!Q~=b{Gxgs zW7q0Z`9v;Mk2yt$G#$W>Ye&h7bNf;HUk;uvKZU8AimA)a0G#Hij4O8~$tKirYZL#O zfN6^WcdmL%oi%4d(n(d=Zt|35em_h@8x|?Pt(|<{&au#Q#1Lk#uBC=|AJbnw8B{yI zk^FfO52n?=Br{W0ZWsO%{R)@Cu+#aFyM2KAov6nJk%81<;saRvbEqIg!Kl8i^9}Jt zDsGx@0BUSKh;+CKUv(52w_#st)4qRX(Xukvc`FA&PK=p6m&Bp0H<=vLxxqhLdzKW1 zl+Y20rTnN_1cTMQ@J7jN5_s7ZAI>#JsqGVNr`T!x*l~`(K=$`-gLKBc)vN zUq1Jb7Kgjj#EW}tM>EI%(lPFe91&;Xn|{%0-8Rm;0Zl>FmWhy@bex)GwbYt=cJOs3 zU4vKl^S~v|26QhzBB@`e!?GcbVBA~BKcVe|p0_^H!gpQ#dt26mWV;`#h_B%GhkNjV zp$$E_V=K&BeO0#FN3YiKKm$=O3Whh6mE~av#Nc4*Cm5Pt0f(M1!JC0U(0ZMWAEk2! z-qz=knzItJuT>Mjzr6q^)u-T7;S0JX`UrKJYzar;I*guoj11e7iR=@$cx22*@N8pX zzpWwbo{$2Y0x;K3svY^kLAGtR1vu!Ilk>_Pw|`kc zj!v|Phg*ta{~v1@ddd=%UE=AaRgtn+^|8SJbB$c$ucDEvI^b%QL9~BF!w8e*0E!H) z@9|sY_UjrlX82)9Y?i>^8;1al1ZZ0DRrV%66Wne+109v|V05;Rt~S^N{+w?Rl=TR% z`wXK|FKl3nA_JhTlqU=3CDBvm&*?2kM{@o2AoMa;mZ$IS#mE)unETBWjL!x0|BRH- zyp}>LLH~Z$BNUwi6b;CJ-~>0pBG*U%^^vA@4GcK}B5)v)}Dk zWa#+t+U5Y?VcTlz%^5=uoHiy?E`P?CDjQucUsi=ZBZmt#Ldqd)RVMzPYD(IZs^CoP zU3l?mt!!aP5`Hq~62HM^ptV&=knDAV?z}UJa`xONv#PXV@`inowfTjtv1T;69c==4 zc6Z9M4+ir6MxYAv+9Yt1gGKFz$`mCU5q z<}pW4=re3rWyhMe%b1mW-?Dn23@~9s#;~VdXERTlEEK)A_71mIidk0;vlxf%+QJp( zHLTOWR9F-Lp21O8Q&GFJ5Lpc&czDG`?A+>%k%=v}^?&PWp=>GEZ`zM$$?Ndht!uIx zaRV`!|3p@j{*T5QDABrC$N0Yv&y>wE+<~|4lVxh@x^7ABiSVYPzg8*iGH(0+f^V20 z<^3&s;oRb@B+A&U#{F>3jF%wW!Rhq%=7oH#;~770;SFoK$vY9H%exX+!Fy44n^&2C zfH&JLop;D-y2Lh`NZd8|@}e@|ie}9yl3c6fNYC3?NDInzq_gJDk?L2+Ngp>qWKBHm zTWud~#*AN?$yl_qgbEIp!}x2X*k__%vodo#nSn#Yg~u)xv%W`OW!n7532j}@GX3+F z*&aQV)fcWLyg04G(N%dbYg0pJb$&u7t8k_>BjguXpU>n9dpQ$XCs!#tN>lDb-|(Mc zXQ3^~+Mq0GIdB^zuFuD}166o+N)rB2Da9Y$L3pQA;F{DIfZ8iZ(lpbZSh?8_X9We) zS29DoB0vXK|3x9^&qltn;{SOmd>I*}lup-|_mK}*n-m_~^Hdbxi;j~TFm?AQd?$+` zliy~O6a#&jAyzK2mSzEwx*g^>`3NcOFS%4_nafp*6D(uFSH?Q)gjqOb`2V2^y0^`GdOzR zIlNP9gUUT;sfKYD@f$JP#VCR1?Q)rZ z{9V4$!(^hq^*bFKsx1w(*Ceh$AUHltM-JyAP50?0TAh#ZU0ry=F zf-$GE1yhHdkf~jk(Os{1llSdx{JAn2QcK-Y?|D7?8J)pE^Jcv9$_;-UKOnml+)VSg zX4B&%12Fo4F&_QI%!Q zNTYS1kci1HB>8L=e?eddfB#=87AqCwY?Ys~9W(s#!T3cOcahS`x(W{YIydAzo{T$9 zRTV7FdYZU4SyreLPLB+GPZv%!!Pqi=&`@IFu+MiQ=uu?vuO^Wt?Ox=eRwz8X6$p2J z#Y4ra62)v+Mea4NCePF@+*+SzlL7ZtFeIQAR_b`d*c}R%f#N?tcwQcP@HG}XttY|Z z=9zd!`v?|{G9cm;B4V(?h1kE}q}coUV5GMp5d^BiKb8+)op#Yc<})=_+ebBrbP{%4 zF0J_+F8iwILBIASL3B_(G@UpKf_w$HU7SX)DYV^(ErALR^)MV?eh0L#`T_Rbp)WCl z*qs%?rT$oODIX7J=HC27jY(KIIFY#9T2Sh=hz{;(!9s^|c<<^I`SG{n4g#li9<*9d$u@;ZIuvkqKu4MP6|ML1QK=&HOy!K)nfMfOg70e7lpq2=rb z`iZBQaZlNRHG42zcN-0xlycmD-mN9Ryt}MFn=RSd7MkoQzlXEu*QGL{K88ZZ?OC<; zbR)KA^fp#Uz-&fzadVaRm?6TmuT6!T?v{+Z{biO$Q8`P${XC=9|3tx7{4*(RWf$`^ zI>^z!bpv7v9X%I z5Avr8ZYPNUvpbyC_j78Loocw3?(E`qDhj@BA{_E?1B8y4~3ZQ2AArX8Xi_txNnv8!qHO*`ls84Fh*og?jQ zPSNRHE~*dvA-lY;jC>g|pq~5ZBX_G7X^d2OyPJdfMW21JedAQTG{aQEw@Su`Uy_is 
[GIT binary patch payload (base85-encoded binary file data) omitted]
zrEfKX&mSN^ev)kk?cbPVCJcD)SsSd4-YoC5+bbSNx z!8>_qxAGEVJr3buyD02_NW%DLPJ_Hl8puhBKF z%N}Bmr{^NgO+j!|_BhyFnF@1)su;KJW6U;%T=ewsT_)kl6nuEWF1)98C#nzIg=fBW z#!*9+_|7>+teowSbNjQf)Q=SW_26~9b7!+S@zZ83o&E`%ezw4A#>2Irr~O5Nm?hpM z8->;{ZX^m21zR+HF)hl#%Vu9j>Y^zqqCyF@ugnDFXA_8vMk%G8gUOiNN3!hRS@P)2 zA|@?JPSRyEN0tsTWczJZd{i94?0mx`LduL@StddY)RA5FzBUzyM znoPBg#a1hlvA1LwnvtD@WKv}Cy8oi_82=lVn`O-#S`~+{tTaT=&c(5+XYH&sj^(h+ zCO>91n&w(f@ET=VG<{)(NBzh0eQ(XSgJ61_Ml=g0O7rdBsVYD=*TUT+1^HVbZq)vdKM{RiHg^-hL<+7GIs7|6_ghsRnaKAn4CiFxe{*1f(>N( z*Ts~6;BKT@97Q=6IC?8kwdkT z{_Gzp37rQU4Xxn9p5x$P<_JexH^AgY-CS|~YMVp$IyS4gQ*5S%?C0|APjfoFIX0d) z#oRTHhlO5-r9!ixESs*Qy%KhvXN{bvj&Nf7SWQvGrJ9AmcGXrLvkN-g80nuC9L~n~|q*ouVW=%BctSp)KfYx6( zm9Kkl3cWWtg5{RWD&GCZ z3Z!BtU+2Bn8iM?;t&5;5#T?d!$iU6>P9S{ih+2l#K>4XBOb=Ehy?4EVYJ85i#q5Gi-Ez{h z!5l9LP~@2#fcBaRi9;eGOr~NS{k8%#|Qr)*8l{8>=C>)^e$4j%Aae zwrt5f_2loI`N4k#(LGmdyi~b0&GWAaSepu~Eu7{GJk~z0IdyhJ%}umGaG|wa@Hxd; zpkcnb=JcOAHLt6^1yt2@>(}D1g0wIp6YqN%&!8Le=M7<`&-kolr`H3++tP@^vJ~{F z;|}Svt^wHJ4r?WM4fF2kkT0_%(ftc@5Tvmk-2TXel<+4hq1;fATs|@q8rN-mCW60m zN}$m98C~-KOez%=iHGhAFt3ticvdM$=EW^|g0wg6K6e;09-EL zv#zlE&P2T9&Q{!5;*JCL!}0Z-Y~*e)`Tx6j;_g3enELT4m2SO=0<#Ky#wZte+JwQS zxL1tAOAp-Gcnn)0rBa6nXyIjM?v{NTX95aTfl7tsl<7U3?BE9M}l| zBy2bP=zLOnq6P2X-HIpt$;V>f|UpRVl3 z2Pc>?QUVOGf9cfKJkP*t-6NjXkd(qKff8Z9k%4f4r(BU3~NZpm++J&OtP~153sLC0y^2G$}4q3~%3> zk-lfz)P(|H=(_ilwB&7pQxAAh71)bE9h%0oK9)!}dgsAb3DbYm2?n1je~%+b6?qii z3uodhNU_}ze&$w>)vnZ2P1Sisv_1+=Zb-q;nR0wEH37!IeFI%9zLEg{Lzvg&gV(zk z(K9@3`2VI9(evvMS$SvQqTl4cw0i1NY32X*4c$95p6_(&A$_7ZnJ$bpWzF_)s_e>z zN(Gk`mZFLS-A8`Yd)RfX|Bj!h-yV>mw;a=B>z`L)4{A1BHD&K)Rn*UA|5Vq9uTGYD z-?4J^!%qs;i$;n5@^t99pF&=Jt|rCoRrq_mBKQR_0%oBD-ZF{Le5l=nP>MdBTh&3d zmPca$lPaj;kRsfjRZQ}$cOtX6sra`{GkR{S4UY;>g8^i7_sb~>U#+hZJ}8Y6K3;N0 z=&J1_Jmo%K^jk1NWc;>AXq?+G6#mB*me`4e<=R(;=QdRcbB|RDA6g9y(+yqORH6a`VacZS5@}1Q489$`ZQSWP=}~*w{2OMr<1~z9JrmX4qaI(#Ou{9 z=FZ|90LiY0Wns5qVA*Y`QhW{>8mS;RX#>bC+6xBz7XU|A7S5hMMEpFWNWx*Qy11z? 
z?Bp-!FeNq_=bsI zp^4WX*TgY-$g7DCZb2L!ooLm%cSNM`9x zXvzFo(iHUo3BsmQpK6j(xL_S>b!bLamwz*r%@WUgffNpXx*e}8i9^DrZshGMAEt2S z947gC0D8DNs`fZH4vu}g0Me6ALYHMPncnr7;VR68Lg!%aWgT@+#r8)W=f}3(ZNK6; zexWBhRg)YzZ10|V-mdDLe3umKKikbX{PR7WEpOGT?ad9ii_~TLaQK zv#kztQq9!3dTmPFx28WihiduO@nstY9UgJkgC=4~>wXJzrx$~rNxmc(VS!C3LpUa1 z47sQI@OWP_DD^48^KrFgXZu-Duzn!vaNL4ahh_*o7zCRe> zaQuN`=0(&F;%r?=G`-Ate|ld)-0eItr@~n$Mqo=$xz93(>rHJGv;4C(P+2~QV` z$pjldN&Z+)-YBg=cY;!g=TV{kzDcFz(!E(Ez`>95Y2L>8-&aN@J_n($IR_?she1kH z5LBrZ5jjma^yqC9IlIdMExbPGB%aq1js*(`(XpxP zp~zj8tP3k8thMssZn^_ppR3m0b`is@WqhC&0WN9QLZ9PpRJnEuvOGH%EzD7sY_nwW zzI-?0bxI8$ea$AZwcE+$+yZiA0T0w0MR4h59o*sE1rt9EG8gxw$%o9y&7o6dceD=d zu?Pi>&T>=yoh?=r#H>WR+wBJ#I=0e&k}LMFQ30rrKLp!6XMe4CO2JTbT;5+;zYdGGmrM&K)VBN?7J6^k(70JnHJwGNZ7APt!!-}a;J}y zZw)_izHd5CYLGDX^*hNV7X_TL)(p=ueveH1%Sd5^IkbeaU`F_NVf-{@QE2aw&~N0A z@N&6|Xq>i&Fi!bgjr*<-HNPjv2!r*$3(v#(>aX~ckRNkVs5*xTBX$1>cl^`l`Z_Q| z?SVR>!-@o9UhyYkr1XZGO|h4Saq5afclEb5!_(Y3@3!`{4wvN8|DQv8${tjlzF13N z9p}P7lNoCj`Zl>zx5bcVBU}1c*iF_+6Kgu_!B+kp(@NHX7k#XXi-xR^{002R^`GhG z%f|6*9Lp=IWs6t_S_W1_Yfo8a&yh$nA0yTh>2h>$TMotAt`GH3YDgg50e3lFSmMK{ z$P8_GsHXz*K_ReOKL%u)3()mVJ?ICa8TQg>a(3)4=oI=AgWS(#uG2L#obZVxtrt?E z?klmY)C%(Ur!vW!VGVY{8RW#c@61isT8S(9H}Z0l1yutt(%#sBntxZ}gDG9;lt{23B-m1P53Q-fEI0cgq7L%QR0%_c*5-Wb~6KaVXOFIGJG&zysO6p z*EA|Z*@{l^&sL~IQ7m%3vs1!pC6XOx25btnCYhY^xOT!}tPrylwAr7T;4E*v^!iRb z+UbE0$ecq1-1X$s;73$&?H9TiFdqM!_Xqt={0c^8d*EEK521>h(V?0DNcD}|c(pr& zrSP2J4Qz<kRfu_zEj9Cf6M5|2 zf)A}!$6r0h;c;UE?9(lvTyFQG38tAa&~phMuQ`RBRzD|?l4b~0nw+cq66yr^M{if} zwVP+H^)O3t@zP4c_#-uf-JE1WSMSg2#(^xs6VrRuY1x0PL;a>#9dH|~b~~#jIBU5< zV66YA+MKymy#t@GmOk23eK$ErV52!nu=yKHAp7@qb#tT=8u=`TEmh{?y1XZ(t;iJz zZFPdEzy{F#Y5@AK25?8Rzj*s+xnwU`03FL5VV%V^P_>REBMX&ClA;_ulDl1|P( zwG+r+AsU}PoJfsHxVLT-cM`Q%|A)^KBHFYNZ>}gn!xNX|)r>LTyVV6=1ifK?eD=e4 zHK*Y<8=m0ZbwlKa#}RDwsZNr)9zaJ9e!_nWAKKq|po4dX{Y0C0xFBn0H&!3_26-Ne z#~n!ySi3J3DSyvo&@CYvMXAheKTj;j;$R2#lXN6;;j*eORUYDxHQ3{@-F^*-YW^be zQ}v;&`ySX>;Dx!ry&>j)8D6&|jj{Vzfc&rO<0R|TM6Wdn2S-0++D|&--E;z8TP}-5 zvs3Yny)Q7{k%P4&bg`R~EG+1mgGR4@M>P{R;BR+dBCo$siS9`iOm{b+u0&~+?@?g? 
zUE+Yau{Z*4mTy4ki?Y$dqbtx1*;w4{_=>zxyG_I$RzOLeA$NZd6Se;eu6vJ)mq9R ziqnBtFJ!?>iiIElXhW&LU!u`#qlgnRK#ZSU!A5bey!cnU@t3|-s5&~YZv3!goz?Es zc+tc}{JWVj@r}kXAN_!yxMBS3av9z(Vct15^iVy^%HYViSy;h+CT!g53y(hz+kcRc zh5tq+jMZ`iMfa{j&9iD~Q$7Qq2TP&zbZlL5sBc|DI6y@NhqvDT6%5LZuTyy@fbJi+ znKK&+y0!QoeDjcWKjiu$rO1~kKXbs;no>A^<0t64^n%cl2U7)p*tA-bD|fAf+!x8f z%Z-6Kpv+tUZ zJ3B8kPhz6rnuMv4zIqMMsh|dv#aGF{_3CvGk9)w$+keRDJ!R1HS_bA#N_c6IBM)u2 zBO{l4sg+l2Vd_j@Sa#8pCoIq-vFdW`l?NmGRq4!LibMZX7YygG&3mL}mZQquU@?-q{Jt5|{K z%&K|o_LgxR@~b!^msgxSGmmhZc+%XV8xh>}Ttn_d!$Tb7cMCaf=>gnF?~ig-uiA3w zoQ|lfnvl*>obiw2cwj2m)D?4|Xp{;jG`* zZX6uFVGNfxQOu&SET-XPn)s@HJh4wvfWaFFNN0MMUHxVu@?lqyrS~Oy%6SUpB= zjDAEu`JV~==w&{(NP*+}8uZTA8hsp?19qO0zVg&HFfl6u+Ai=V@9G11Pr5O-QI4mS zdags6jfgBe_>Gx3pO5rIjL@P05j5?8K>AO1An$&Rb$XN`k(mGkQRO&t;~07!rHv<> zS;K_hy{IMl9L_K)mUw2hk@vAF;@)Mk$duM5TgIP(w~sN_oiHBH-l2nTMeRljdC_1~ zR0lD&AJCI=lZfs$f|e>fW1hPrW8JPrn!d`R#d&V%l>1Sfc25`QUhlIvZb>E60$p&% za5%;T>iFSab8PuX8t*a`;BD6{@$#U@nEAOJX+PCR!GnQh`Hmp+!g(wDY&ZcDKdge2 z8|?v~mqCZDFXN!UH(=rRY8cU?iOZj-=y_o*)>)GRH#Li3{6BkGk^F{ibsk|v5-wU| zcmjS<-G+acmSPz%ZMf!`g)Br%pyS9e!)|+tmi?2(JJL>(1^dDwO7}FZv%N($T2!!# zB$@Nq;SBjSHG&8ZeIZ*XO=ng;%!X@=U14KRCdu&0W!|=OAWOZ4>=$l9GtRc*+nS?SQXyph-zG@{6PSCH({adkUne*&MLOf;MSLnl5AlXn{haC7Jy zj6cqxSRFTXJ2eU-pLYGs9*w9f)T6P@&(Ic*`3^SXD(1Wldj&E?zf>GL=3O zeo+?=rKV7|w_@=EqF8re(QBA^`vho3s{s0ui_38>j8092-OIicW8ECGW;g*4Z!0IG za*Ltm=P6)Mod&rzM%e6xpVfR}F`YA`hn0TWob`S16YDy5VolOj=dbp?MIT*5vE6-V zvp$Y>Sw3h2t5UtSm6Hz$*+1&?BSeIrK);D))Hj}Dj zbs}5DT3h&zwOq6hGV~h2Q^MPd-z0g@e;5$|GW*3$ab^?WnTCLO<&$>(Sujsu8mF1K zp#^jAAbO3gggu-GZ#-sVs{g9Q9qfxl$G79-QzpW8??~9aTZ>4Agu}$AJ0RRv2-`aQ zg<}cQqLR+(A}98LLQw!;xN9IsIA#2Nq1GEOp?u+K;p4nl!jxM_g!(=bu2nuunEItb z_|^E6u=@Gqnwm;1yxkrr-2ddc@KKJG$k4Qndr9*QCr{xt_dxk6PVz$!8}&+ER$y{Y z#ir0KHTq z$yrC-?-fiVP2juKyy|9A_I$cs38kah!;C1)sW070~7-!}~Y||L!6aE9W-9b!(?lTg9 zc`fWDz(kq5A%oD}Bwoh=ijuCuTs?U*>D^QNgjP#tQ->@Z7;GZh&CN{kzC}pG`!RXj zw+as4QX$4i46##v8_AA~hGgSuz+=|J{beQglBPDXK4<{{KKw_Lth>qgB0W5&6NED! 
z4WKVjhLVrhGJZ-k(e~CJaznNqdBiP4Sslqp=+=n5UWJoR;bXM$QVA*;{f!zUS~;P9 zENj$E(bm7Kl7l!RMi%$=d|nNwh9&ONbu z%sNWEv_@YcqdLLNLa3jR3A4$R}G?+fh41+kocKWOqvr z;P-OCzcmN3zFsCyQ{{OJj`%=GuRgr}@{v08QHg{OtjBw*Un9qldN{OQ0q_4Y6@2=P z@U4g-?CWcUvTw{HtADt_r-N~5B9q6M+__8MIj=<9B96k3@U!BOQUWP665+RZul-Z@ z707BeCf5URl2vA=p!<=s|I!!?i)c&yZpa#bT(}21i-1I~=aUcm+W11Dq(?0Y0=M1Y z3)vxes3|48akYevzV?b3-C92tOuS3TyS8N9x@H+3-C05T+lXq<&+&p6+fU*O&qk_e z-%8MVumf&)av{}UnKw9~i?2yoVIoZ(s9Bwc-|x`HzwGzmjeYLa^!Ib9*zY@VcNSn}LJoH^EXbiThE>8KypdSF6={2ZhxKVSjv%gmM~4 z?!0=;`&Wdw7YK3Xn@8kr<34!Pa}F)libD?^!cg{W$u7|+9~@`s!3lM3kh_%+t%{-K zNA)sDFrG(9d>@k{E7ZD#0f7xO9iQYH-pu7utmTZN_ zb>YzQ?IW3Zd^7G=xQ!k>e1_!Jq(S)g7Nyo3ids(JWgZ!xLt8tv0iD@Kiht*eLsGWG zVzpmn%6LWQ!(|dE32%y{rnf}InAZ}Qbt+$u7D*NoyD)siDvnAi0JEwQY=pvr_kz?DZcK` za{640G}~T2mrfp&<_oiwu!_zZs_#ZBwX=t=tr+*57$<0xo}xR31473SXe@zt1?Q|Em%$P6SB=(SGu*!rXJEjR+UoI4Kx9ky)DO?b;^wNc{cfQvc z_^Js{h~0!!!h(g#8J$9Ybd0b>{jl&?xWCYOxr?w_D_5v#Q7C*dZN4zcT1{A0ZCc|j z=gU`)^X1DMZRf92ap!O8J3+52aHPF^J^6d%4e4WD^Y}lGbXvVW(a$n#@1%7$##j35 z=F-s~N&K>4bNgDar|uw>*>-iecDnqmG3Gt=W{~m`{q{IarPmu`RIZVxSJxuw?M{tmIhv_ z{SQCO<>UBERq%6O2iy!!qH@ zViiGjHimX14rn&Y$otno9wNw_=U7%XvYWt{wJ`>U*#IMy-`B^`?cd;i%; z_wP--(NTltzY-U%7PK1| zitklL*(#9ZHVaR2swe|aj^kL4Qsf0;~KbrdjeBK+wJN_JmMj7W_kvEMqU1O=%{Qp z`4sJnyrw9TOATw_w^}@WS$79Hx~Wi;vTqRn#1gXe@Gj6+H$B2RMe8zX_DWv&e zGO$lwsnz;sgF+Y1!EcW_V0ANB{3BV?y&LF91y4)J{{Jc=uj(Qx@he5q1?31~SA3yu ztNn?>TC`CAF!YP2!q;Ybs7n4s`XdTSd{-3)-vwyN18IyCPa&h6RxyQm=@<@~a{vb3D%C}Kyas+~1U|7Z9hVz)GXbu- zV4l%W{3Ul`GP_*y#Y7I3x4}r_1Rf@3QsPf)oNN|u1mAG_f zfbOsHxcku`;yxfE!&%07&c0)4=NSnfrI#V?nMLHJ{bI7LwTe8?93|%Q&JegK0OT*O zgRNiv(U~9_;I!!BrIP;msuk&=VN9T*Fa@8ZRe5&inK3xHALb+C@#CGjkGuT_^>YHw3}F!fW70|E7=n2UMOJOrb5JkMP~}mh;PvmROec zcG8zur_e#3H|V3E`{~e!k6BmcFGw<2ee_3NBdcSlqG_K|hAujKpH|wz<7*X!@q1Dq zSn;+<{I~T}`8u*y^mb}Hz3Es1U73X0 zsjlEuyvwZv%;Od69<*%-zol2n;=D-A%PfJy`zlGBrER8WTAp)CN|J~Yp=d*keUD$XXkV1lLeWM@>Y19EnmKnxghV83B#MwF z`xgGs|8=kW)SNlzzOU>19hit^{w+im%w6`fl{IP>ZpR^g6PultF9_E%yYt{{Lg5{c zC&C8#B4P8>>B5)E^SFJ{lEM=zL>PGeim>0GFO;8_BH$?|3wbT?g|R1SPLkFyPUr9> zhuI$_gr%nwgz9=34t*21JCv2=IP|pDa4${W!Cm@=!&6@7&s(}`D@%FFRMx96s_clM z^YpHe(gx9oX8O@yLf3g&@di2#d7=dg^nc}&JfV&(Tj@bDYi4FOYqz0s!}1aV?LJ$b zrLxZ~!95z@-3O*^4>sD4U+<1ie$@~`2y4sJdGb6BKR}=E`(!qrbFiz2%htUr) ze*bF?E{jRW_hA~Ye)=9wnv_eu>c8te$n?Q(tJ}iw^)_h!KSh#1HbD89FC(8*|B<0m zUvzx%9I*@*k{LIrLUZ&svUdAiEO5Ah;z9+~qUysaMv{xuRVZ=8vt>BWd>sxtqK22n z^PL-7yuoaBGUfix5tBj+AFZkf<&!ko+g*TMa%W-V(OkSC$dc(%$BX^UywO%OU#Dxu zC&V&pKd8QIPejZA7NE3@x%lJUUFbs8G|ckq!^QnqP-R~j_L!UEEMc|?MAmK4`(z&s zE&Pwn?;{{JOTV=vyk9Ka`G8^ahLEGv!XT439lrODpna#(u#R>bhLdKP&(niSkG zlf<iGNXX7plUDo7T@!>9fCNn{d&4<`aZr$iY7vYMf6csg*4n76r;1%#zWz!#_G zpnEWyJhL)@T^q}YAg35Ugs0%vm&16@s0eccm*b#`^RZ5J5@qH1gnaJCV&3^*&SA;= z%)9M3if9-U&7(I_19PCoVmrgp-LB-k^V=foYqgko#Eg?$vdmm(=5iDjQj1myCg2m3 z-BI&fSG4GiAC}N_!4a8TFgGE@e0V`Fawj7`OaDgE@*w zUyQ}(-B|6BKBkA0vHQCra1PrC(&rRgHOvy>#uS6rM}7qix2Bp1kCni_v2v1js1)29 z5)h|&36`F3P2F2-g9md1h;C{+nbuW^?FKI3O**M~y_qVOIZ_HmvUA`-0UjteddCJIyALWJAvn}ttRPYNr3X9?GL zb_dk81>dhMm^_sVo*a)r`9v6O2nAALxp3+>mU9!0-%8Ffi@ilA7gtHBPLh}aN zQ^2aI(_$+{TxI1o8?xq4mSSs{+Oe}P+_#T?85Bi!}!|I6Ii0^ z4VK?pk2cIuMC0d5(B+s7xS_?-McwTdE?cl4d)Q>4k(#sXs`aOb*8)if2 z)Ol#)@r5o66{KLw69sTP^ieDtmvkwyDIhPqddaI&9qelIl&UePBpWA;iPvq6hbPss zaNA}n^vyj`7EJ7_E)d@s2e_nrpNg?LLpI$D6FZFXkh@R=1xM*&8|6IIW6*)z zRbsHl>J;2{Y!fc|orSd5-9p_=HmHa3R-gX58L!nv`1wmeOu46Evnn(Ev3Z%`;d>AM zvdzi-7nL%CKbA>?TbBz3*B{CWij2PSXRKHyD7IQ97&@_8U{ubW!Uy~WpWa0Yesb0d zG;qGa?@FTJx>>5=XF;A|um3!OcZHvTbZQEIHDwF7Chro+R2vF%e)^%!{>O1%lC+D= zw3S#kY94u>yp8&Ez!A&qp2AkUSK{kd>JZlYfh?)vAzkilJowTOtIw+?XSMy{O5Fw0 
z+P?|)s9qz_np5FiQ#l-&pazmh8z`mB9Xqpl`mEaE^Q@hBuB8+F&=C&dDXG zx>JyhUb}c$@RN+WaYyzZL*ZK<5`ok2~ zund9ay@|?r(2SQl`Qf}cjHg#6;6u|Lu-#}H(+By(I2JF6Ha*&bKlH`nrX(htlx2?u zlSE{EdlGj28-wfW33}YA4?ltr!}=q4$w`K{CZ5%dn*8TdleO~E&*vkkX#5pg%Jc_s z)O{j1U%HFa*Diq9Dcvx?Ed~YUUPA{WVzBeyr-BfU_c>{YC)}XMT3vnx{AeJjqVM|C3 zSOot-!^IZ(T2B$;mQ4U*!dKSmtA6y!6UnTLi>+CkWhB_v-ivA7F+29=>`vCEzn%15 zomlo-l-n@s7h=2eSsU$LHk>$iw!>@t{r zx{y3-TSV=$Jw*0eTEXIBce3$x4bt`ZMctlh;tk*dwjwUl{k<9GNuFS`q*>IgJK->= z;hlqgUW3E_U2h#WSr0lyzj^JDs`82({v(m2y{Uv-q^IuSlEZP}B_#=tPtX?T1V{=K z53S{l6}dW0cyU`8a`=i++wh7|>wt|gL|>k3-=ZUw3OUOiE$R{^=#B7av1A;|YKnPL z6J&UQ-z3sMvY*j~(HD3WcO9>>A{G>hf-UZ{oc#s^a}h zt>N9&oJGqfW%7=GpTj%wIi9Ad?Yt=#(`*7RPvcdu_oXGz<@0u*3E}P3^nuLr63{9s zg_gGc;LOZjB;C(K8C(Zhk70l=KKyds59znNATj4Yc(mPtCwK0H>S#aAOD+UnMkCzc zza7ky9Uv0N!%mG@@bfwk5jD;5blp~1@NN#wsj|mAgyyi;Nd}JcB%yFLkgWE&i`$vK zW_j9q%&C8fy++PprjS{oBNA2}(Syy?*F){6Y-kRzCys}I zl0+$eC^6uX!Knt2Ty=~LB=E!KfKmQ`mS=@yMZ7;EfLo!adsN?c??i82jA=6xxjCEb=%D3Rz!^_BypRMG^ z_HOd3L;*igOhR9sJz?JDT<}@La4=k5ze?VC5LkXDE9tS(X>JAukrlgG^w zRB%Lr>fAZ`G2D4mytr2L;<%>4-kioT1+L_(W!x;4)Pe$gU4ve7^xzt~iR`?#w5jy}K~$@+ow8eFBbO$hhvN%Ho!qkK%dz z?V(d~FWDuN0D2icaP-9zY;uo~_sP%D_~I_f%-|;gM5*kyK*UKWbsgi1YR3DM-Dh z47r-8;9aO1FP*VREH@<%Z3$ZLB%{6-f0V5u5^o-okB!CRJI5}QC9V5$TT&&iTPVU= z>AARaO&H#|VlI9j+>a@nVO$&Ah~tzGVX{vCKQ<)ZxyJ}* zbVLy;r%;ODDuos+X`wHINtFB3CRA42ByL^NLx#**_?~(k9{GG6d-RT@a#TzN>y)sT z>opYM;)K>VKBVrMrJ#FNamZ`YZ>p+mp7;#ki@GwY%h}@UCX)Emoa~dlLW*WFT(XsE z#A{~(UXzo7Ut3PcH7%-Wuj&+>e^(VRI<1Bu{!_vqE(C}j^30L#B_p(`&TE=Nwo*34DMiD2VG<;@Z^9=h%i0aK^jWsO)C;`jh^cb`A z&7+ltO?1YBFAYcAjx@}k!QxFm6|H?ZD#$g#4$Zqtuk^XN#w znKtCx8CJm7K&I6gXmB{LU+i$N_lSddafpNRRY^w^ zGd0$$8*1$Ry;{5gg?`pa6>VO0O`Od!KOL6dqRBkPXn(pO_!UcSmo@wH(E5hH{aq|i zr&QV?>5}ax`3_cN$P(M`vI5#Z{EY23*|iO#KWA7@q}{fzY~BTIiR&cX zAFyS9FJ2ediGJEO!BuubOY`qUtiiCfdnzr!Co&ndauUdPlN{(;Bk2+|GXu{KO{6>q zjl}5k+}4EA0xVxDJ=x%S zx8K=8y9jG{ZH3Ulm!Lnm9zMs)!hnx8jvCMbuNh~F;UJehPrr|{_pjS)0WX>I9kZTgBk8TxafC!yj^udzZMP<8{qoVg1U2onS%>rP)MEd5#HiBF4K7R(((K9-6&Aqb9k&0`)d~<&nbUN zxK$2@!n;7eh~CZFz+c8$dEzvu?er_ot>fDKh1(8s zT!hOY;%EYXIAsEk8o5knr__a@>Xot8IR{kd?jP+0>QQ~2=&wPc)O+ne(-A{OjvOa=9r%$CR4qM_Kkcr z@H!UG?cD&g%l`-tEIx&1r(Q>!mIKzT7g&lWB$d`F5an7C7 z_~rL(^l^M9QG(a#R&*IrUbGn+4%k9Wz&0$>R>IW8TO+jzvRMIAxhK(@PlNc;4?YR+{YNgwo1h>|b)HRp4DIy0j2=CBmOi(50^2ZBpT5kw z-w-@|pKa><%XC)qYPwg}hR4e)pzD6kr>WoF^yy^6+Mch&mi0bCr~5jvyl1^+tqHN> zP1%r2_uk{vjw3f&joW_H8Y`}`E_9qg>t9#kRuzWT!g)nT>pj6UYA@_NBP74NBgyUk z#YBS155(TQPV%NtfLE`x$&VCM@rL13LN3H_cMi_8g|0A8^yvMhf9TXl$(UCZFhyn zb5ezKw4Mo++Ugoz8sj(|szRW*Z+i2df@-h>Z^;QXTyUXd4GxKaSd3)&Al|NX5#>$3QL;3XaFj@MKeKtMs)MVx-vb-OG39MLS z1NO{+2U(_?gRE<5(rnYW*Jv;05A~O$vspW|*EKBqBh9|qE60n`j&9Ia%d!2h_oQv# zmn$@#q}`gvcuV6Xr@KfG6yScQhp_td+SZ2kDJ?4lYO%%2Ts&ZN5ucuJ)_MRW@mix* z_*h#IHY>Jv5$$*cF`W?i!Az=C7J8J$j z2S(1gVT)*lOJ~(n_B0R6hg~G^l@iI+#trDfoIF(RF@*IBt^sG09Mrq5hUSVBAZO5s zjSG5ETD+7?sfh-TKcNJ_cD%q-nlxKwPo<*4WpaoeUPDFJ9mcs^m$e>@#z^(o5iA=& z57(V-Lj0y9@anXLi^k|X+*xbQRE*S7fX3NWA1*Axw-?YpvOI1|4MB8YgQrNT+E&IN!Jc4Lkjj!Lvss z^Y~0+Ktib=@C#qsgAWkmGgN5Qia@Ky7=%wDk8zl)>lH%MdhK2*jNQ-Rz4R?^t%h(`!F}}RuC8;tH!8OY zkaA9|rBx>SX80a$zqgWlYd%C~9Pw;@RhBHyXSyy{3fXAQpVL^XLABLkNj&tepM_5B z^vCRJ-MGdw5})xd6VK2z2X6E^qEw?x?z=`H`Rz^&6Mg~ImM2hq0zRUs=qt#?ALArZ zE#&s66MrK!VsZR0Qt#M_J8BK!-9b4BTz*}gzoZoGjrPEi(Ip_7q&@=3|AfmoToMcn>IuKD+$MbK_+1bo(_ugR zoQxoO=X1d~qh|j8`EB-RM8~)#2kM4jgD4lCr` ziIH%aoetcUp*IB{%01louP9!e(Wi!uCZ}nu>|L-v;`9zY(ih7~F8U<}T~Y`UsmL zY|FbmFoC_X*_1U_AIamj<+9dEsCuLyBjI>TzB%6R+W-m<3z`4X$0Yx35H??V6J0y>03rgS;XS_#_{SYV zzIZlK8uNwETlGkDU_M0G7^D4FR+u-INJbW9qfxtRbf{w`P6;%|*w79|Zc>EnH67$% 
zW+qGxegtnKhrn0=Fq};KkI(wffC)y9|@9cKIr!RwxU!29;6MvwvQ4wwt@dmAlz%5Y`h6(YrH za-eR=Bi0{tNX4r<^yE)E^0j659G@iNU3xCeUG`P zK8lt=Dv71EsNcJuLUDZ$c(vQXc(RW8@G&oFsyhq8lKn(HubIlY8AY+C9VAaDXoC@} z1x@+)68CpF!8tt{5cUz6g^3H|F zY9(M*Tmu)QwlR){d2mOo654!HVVdz7Qa1Jx^^YGy7g*-#!=_=B+2xJ1Zkyv}=~7N8 z{|A-R;)wl&r(vZkhIhE=m*~S{H7M$s0hx;G=#&TFIr`%R@$_R>Epv03`^U6orqiN= zUFJLlyZ_z+J}=!Wy+N&YacT$5^xh1sI@3T+buMHbSOfusOb4d!Bsmx32g=1?h-SSO zw2Il}K-E^!ca9~VIO8}jW8XwaUaw)^rBSFP{-Nm2t-_WWOsUsV%M)(3?S_Ts6vaq= z@q&AjE~^6d(Vx_(p!8vpIO#&bOEgW%nDltF~t!wc!Csis|UJsA) zY{e>yjpR7W61VGp2@aNr0Yn{6YiH9Ui^OdTbY3(^2b z-bZoO!)Saw2T4zjMJgd1(E1-Og!?o{eEG^!k@;l-ermcA@8MmAYL!ZucJv{vKcxhN ze}kds_XoW2uaZkj@h_~@c>&wlUBYfsNAbBewKy?V2Y-1zL=^|_#!36%p|?V9=(a0l zieTyRPd5%RiDsl>wwpN5m4L(**5IeFgdaYcLj8R!0SeAxpwNWicz6;S*%iSmdMD5R zFuA_rUb;F@()k!|eg3wsclKe{Uu_fCT(4>Dxr(n?66HVXt(CKEU7BlI(~7U0xA&A_ z-;-0}jjQI-9%D~fQge8$`nQtobOzgUQ*Jw5*QUagJ~Tid9r?vt-c?6E99~UneR3cn zA@|91z9O`jV_fGbPp+*>A}>DfB~cLwDww$(?^P_CFaH(APue8BX(7Yx*H7T3ON}wB zDI9zB<&a5MuRyZMoRqBWKmuzmaj91U)Huqwa_jva>c1Z58(M1$FZPzPe}BKljVM)d z5UWgg*f~9foA}(?p)xqj;nT}{p;rPH+E?%d8|-E}j6I5TxF-EuSQ_RmR34ivJpEf= zxUxW(KYT>GnV+n~4}9CepI-G{knv)lV8??|*5#O4JhZ3ThSdFI<&|EpcV|iREVO3v zX2s8ByJsk~ryq0Tb;PCFW?h-wu=d@z`j#Iop0HyQ`{b`$`kb*8Ps>D`UDlr0@ax1> zcKD5Q)-o!%-t_EO*8Wp7crF^&ytb8+aCV;ww7qOc8Vj27D(h4{r6UBadzhlcu@F$i z?bLx_YbL)G3WYKGpcrlkn!OXmGp9MAU0Cir2o#hV9lbTIR=d zk;wfX+4HCt<+mn*$K4+BsZ$@x@9DMT*f)$r{Fw{Hh+m`7Gd-jj=HS8^5Agf{M+MGIUp{85JKe)q(ljj?Fal|}e5uE4*q=fQJp52$=AgJO1hV$~2; z>|o~um(J?L%c6zkYQ+&!**p~->~E2fS&0{e^B+|zt`*FD^nBri{w1{wd-%&kGL)7(-$o``ff`(86S4>C)-UH@cI=6 zfwRJz##uKxrbU7L;&T=J$I-&3ZaTc_B9#Qyu7{zlV>3K`GavfoGl{uWF40u6fd59; zLEd~f#(NVe=53r#ZMuF!taCPulxZrHFLlZ=S;`rYZuCK&<&sqFs@>wZ4_#qd;}?eM zQj44HcH>GtFPuEh3(9;dkX#>4@=Hso$0|Eeqw5Rj*(YB*>s@ey==Y9bs6FJo|E)ak z`92YbbPS<b(_2^;{u&ed04R zo{&X?rG~KDcZT1;+y$n4~8@cro(dtu&N$2lKv^y{zNsQ=hk?T6Gq0W7)ED?GhFd%6DK%xNz3^WGX42G(t9I``kA5!Zr4%E+Fwh_&%2XB zRsEZ|CPF%=(4*o}%uJkTA} z0<>`7Zz@v};pH=!Ii2DF>N+Wp7SQL>Qv+L9r&8D zJABBGqx=Py;tGX-$he$~Z+ysxHIW}lTETp{yyXrF=#mwiwhqJ0#N!Zlss`rqDnYaI zI&sq}YjC?inP*MEW}U*h4Nvafpuenn*)XmZP0yVypy}LZ+Z1^R+Rx;jZQlM!y7j^$ zTk9wC*6V#OdAGc(ZKfQSVLv?VOOM7G^777KVXb*3M=!jg&1;MbWhIT}u-+|l;svZy zX#MPsApF=1v66U@MaRehMhKLNjC*w0Hh_MqdB7vq?|SUetKi>5AB z7T@=A#E*__A{9Ornw#v++Z#Ukj-zeW>X=b;x?#8L`WW&?wAF>*(%K?uXW}9xISLGDY z8S}?!og_AknpQ|pT6dXV4nkJx=!=Fh#^%r-6lLQy5{PH-3B%mhLiBg4BtDWHK^?48 zK=f6n>#$A@9dyydpR9Ay&dy%On<0mvMJ`8~zPfmD=`^fjMiF`Qx9Fmi8X1^$o0=_| zgftc#WPny^<^ z6RZVp_>UIjpEy1p7QTB49n1`1VZD1AU{CG=XSn6St|GRMWB_71g0suGo6 z8-kb@Q^|%-6PLHoTJeG#1!#EwVsKw@iQH&vglU~e@CL($IAog@99aSkNUl?VI2HL&p1Eb*$s%araaa~Os}(rNq~LK;kf_1H%=H_j2h z9DG7X{tm;{^Z|I@FMt52!=SopAxvdBYj2LN!C#Z-KsA$1eH$)=&+KmDOKoiBAJf$m zJUrLSUn{9@pX(|i=!jg!&oI&yNTNK!kluR!U7bVxF*7&*E3vacS5UJ^TuzgFr=yUw#Aq%@Ued(=z_ec^sp$@}dhY`K1W8oK zT1_I4DkE{%R8qF999&cF;HFFju5JI;a{Rah731g#e>8VN`0qUY)4>vzr^}0v6jd-h zs&lY4B!}EN9O@z|@^HCP*@e4n{*ilOmg4?DAxMq88qY~>M16WGXzMi=h=x+gg|#=S zSRW4dPTIn_QtD9G++D>eCgIu$ouwuHq_dr*elPGYrp5}ptz zN6yzRCiZgo(LnJzqM0c~8g&`)U~3wFs>*OnuGCVEK_XK4{3L`QG;!%!bP2n@TaE^f z{3E8P4iMb0jm@(9k)HQrT*qWhb$4oDJEh&QBHk5@^v%c(%QRBZ-bZAk_k)7UJ#lz% zJ_&j#Me0_jGkN$l@_WV>;_351e6f**)3vzdq@ofG%ZC6tIu*vW%qR<~d=lOkK%}Q= zU~k@5B6L-TW+q1+p}|7$FV4sQHA^4{X~V4^1N=QEP_$h=1an?)0X0oEkW1J@4X`Ev z>XpJxOIS!$590T>_Q?N04@g#N0M9WOi=~ZFS=0pdO7IMA`Q?FwLaLDO)ocT%1N7>8tQ${#)2>kURw#M@#wE4DOM3J`Y|0DWyC_-Np;w=!wKFjpM$S$ zm%{Pv3o92mHI9vut&(DFzAI1=_{BL0j+usP~7Qa&F+MStWYRjw7Ka`HC0yl^7P&wnMJ z;kF+yTB(3TL$qMteh)hF=L}x{8WY|((E<9hRt9UN<1vf9s;HqIX7Y%hGP`%>M%Lms zDc+N~8SJjnOqTuBHMH}_o%H7e@;sZRdTgz;*Xh$wr?7FkhV9qlx$HxF2@Ro*2JA1> 
z!dM*VWLm%BHREn!x;CpQ$OB#P633|tH}Nx zS4h>{>DYFztjqEr8vx%q3r?luXYGp zO1SUWw>Iewck}Px4;4t~Pj(O{XK^1mq;Y+UMErj1GhFYaU)<)=)k4x+E}W#7AhhIf z7K-}R94DkDaY}!b32#|!5&kZH*A(|D$8p09Q^(9h3mpXm>5e*^jyh@?oS^AHx9F!U zqL@d;&W5>P%jk}*L-kE>ZRlDtn^%)q#QQb5lsA2ADsNv8lgGV3nI=Dyc~b|=c>Wm` zbo%?J^q}isdggi?`qT@G_eibAO0%<=HqhkId@E1-z#Ro%n^O<9C2t{q^WP0;gBV$K zVe2EJRZzq@P-y(?KE`>!&STr229*7b>A#x3!ec+_nlvu?tb=R^cENi^Z^$_jK$SaYfwB|=uZ??&PT@0BH!+C>YG$HmqX9&v zcmZO+{X!CZ?I5gH&Y3&x2W?@^C_dJFTt$vammpMa>tLH-a`Bw24 zp+(C%`Mv1ifsJDHPKwEXPH^#A!$DHfA~Ac_B-FS#0Ds+9%jCr4!QYJ;PE;SlQszf8Hm z<~eiwKB#bYz9(>Ic}Z~I_?XxaZe7eN%+TXU-k0LXOyt>bDt^MZc3sZ@&+P|)$2=bn z&)tJ_@1%^qC4Yk8GE>}ds>wQh|bMznd`Kh-~F|q`+>kDAu-ZJ=m@I2f->JG|}3efq` zMbKqaM19z|60M6nOE`G~6n~5_%Anlf^RINqgJlCnAI4zYygSH7n!?H!a@bQ^9_mC= zFi`dr1-9wo!cJz8``eE^tPY2>PMKtwn4$f*M2>I2DB#(j*P+wQUgPCD6}(rf4ks@k z!{q36mo^y2d*{~Tk4M|kqvtVrGxQF<>{s1t#BmaKig4JgPYt9bb65m^zjf2W| z;K%xosO$YyCYR8GfBU?^;qUwDt>uxt8QO1btt<}mQe@8Ce40>8i(W|ZZXeaNO+EFA zKHiqjTJ3d_u8T3{g+!&Y^bOVM2X}AM46u;5d&*P#m~%95CN`#Ts2TD$FS$WKD>dNl z^{T8_%sOk6Ht{t*U%U;!qWB`agFU3w@HDJgunOKVoz58MZ}QxA37FDD@OSSiXpU+X zKeBvD=9fpo-Tc2Gm80Lf@8b@r+7<$`#=S75PqH<&yAG}<4!|!LU>F`pp=pxi*&Pvk$)+n^6m-3sD8(W z?Q4{1-)d)Ge$j50=HgSV!?)dSze&e8+>d`lACerV)7)fu>s#K_qODS_&5m#A&@D;l zSI(1Q!^9^vong*0KBi7DSBa$74Cty-fuxON;Tl6&l=*VJ4`|=m%z$PP7Lol1boz53GcxjF$^oh5e-)`U3U@I zXIFyU?E{d)Nrl>tCxI>Z8Qs%x!0Uz*(24ODBsnh@FA3R)idL;bU8fp|_FsL}?i-B# z#Bn&aT^eRB3f7UV?sSDWUwgQ;{^&ADHI-Kvbl-2TSYR0cF#-qQ8su zu)=#Cob+csj+^up8b=lZxv4}g-#r6)&bd!1j;X@GAT_Y9>L4>LHP9o~OxQKDq}BO> za%-ETT}uIrkAEj@#6xy%l(w-XaGqE~&6pIdHC%u{$IKT|e>OOTM_ZDzk%gpbC|#T( z%ptYHcETFWYAJj%8OC~Y3w}Q~5x7ez2ofDU`O)89I1i-u^Pg>y7UZ8AB&*k#`E<G=@!bTJVgKR`0fDv`vo38Zj*GCY|38!o#Uk)G-v zWJ||Gz<43-ojR$t`%?rdwcLmrSNPy{HP^^l6{FS%6%?BLwT0l=I#Qb3E50Bt38!BM zQ%{>Mpi!U?>mI)*AO5+3Qcw@7*FH`eMmUmYhegC{V+?$aY$ap6it(X=&CIm(2C@v- zhT%n%TK5`z!d4v%Xb9m!MW-PY3RJLprX1Eh?@snj^+Q)y)S&TYzC_pFpNy#gAr_XM zBw+M9d2e!qQt&&1PsEMkjXB0R=3JO_$Ho(oxjci=%RI^RrO}KlvjdK%FvbP739M$N9x8E1OhK0 zk@{-F*|LGW_kKxE1+_D8^mu&tNfvJ0XNfuur!V{m%)6xOs^E@9MZ;b*Cn9x z88TR1y<6=4@DgS#%tUY2Zy|Q<{q>;$}@Rh2}@+{|SR`_%`QX&TQr^O=y11wQKHqW8OS)(4#qU zXm0avy_L@jZs_u0HT^PYOMihmmX@Cq`+;R8U3gc9{W`{u9sk~peQb_BtuQ!1-&r_`_rpn^wYjRiqJxL3d;nG|xIr?uOqd zsSp62UfbcY(@t=^8s>ax{RBcSKTr5R68K5QCm6qT2KZu6X!z&^^IrFon1>}~#KRJQ zGYnzoRv(#;7SmNdTnz)>YS1Cm3Flwb0dce^k{#|YwUZ)TQ z`Y#U~Eh~XTvweu>(j0tF?jo+T=_I~Ol~HY26P9~-754|S;U~`={?d^1xz9<=o zNS}b&zqFuq;We_~do6g+@r30JzwW^frem^nz4(`)mdJkZC#%?&kY@Cfv^Xh{Nq^oV z$Qq_v{$U)D>kib74NN!D5f>gPMQ8ex(do_3;As&GWReza?&udMyr~d7zA(j8br|m4 zuc_#Y69>+kh2lZ+Eqv;p00;ayh(Co+#bfXFaiwA~7Jc1Bmaq(QGgk(matc85)(+Il zoj*jn%Zl-*+2d$jn82~TYQum0vyLCK(?;;Nv5w!EHkEr+cDa2=Uy}Xwuvh#7^H2O0 zmh-uvQcE~r4*7FDz7O(Wx}N4Anc~e2y;Q?(sh8%S`?`q#PyUd=W~`iRV3f{PlwTW$ZON9UQYeLNB{8$T%w{o63za1Smy-%S@Te@9i{df+Tm1Nb_4@))vM~nNoY;q=wlwq}OvZuG5qNjxF?!gxn`qCq!X!9|7sCBmyh%Q`n9#GONfIBeD(bbqhCXH)&pKsS>{;c47e1z7=-dz*7c&eq zewLDH{B&m%hjg4zmk|9IK@_r1lOO*^(JO!45!Sw!yc=UiXOB2Ub#xs`<+51FQqtl5 z+vN^!9y8%|gCj7&C9X2zlG(?y2{}F-HY>K!E7wY?$BQ)L-%vw>Q(qDFNe%R;n=A&2 z8%V>?b`q9-5%hnRLZZS_DDk$#e&1b~RpAMLLykgrs}(t+$-&}+xfn8C4x7EDw0*KV z<{cVFi^C$Rs?1oJmL3nemBF<7g+I+zT!%Lurjgz!{^U|miDXhN+nqex9A#wZ(^(NI z^!+t&GQy1=p^O~P`_MQXS)fpII(jQKESJR_v(J%Ft|@fJje4q6J`p=l2f~c3mt@w* zMb2TG_lTnMIx=@;Cq!?vhMR|EB|Y*0>f2Mufe#ZPnDu2kgm#jEz#fX?LRe#Oy6*y&VDb0y`SPxX2UX6XL+o%{c}O$Y)Ma=&%t#8U#O*{3|wE| zAz3NYN9^si;c@L!2ztie1-t&z35g5H{t0#9)l?6kIMKv?uX)z6rocMdz0<)-J3>%K;-1wntb7;MuO^k$L-v0W=*rjG z_eVINX0m67lC<4)XNVyRTJ{pV=K(yOxc3r)eJZh&86-zyEHTk}6AqVd!<-;5Tpi$q zZWV_pcls1wcZ40P_HpoxrUSUBn9v}XVmdI3&Ad;;(VKP;xna=V!x*~+%?rBxHc^j;-3FBYUh~_i9_G!37)^-CQy9uE~qZgV$9Wc 
zG8Jb&R&qagS9wI)365pnWDZX1v{um3wiXT5S$_zaESOMp$0~;9L_DVZ36!5j2p(=e zBCyIaU^X9F#EILb%ITRG&AfW7!lB1mkFlPw;H0ewm7l4GZ)Tpth!hK+^)pK-|96;o z^U@@2n;}7$f1BBShbA&@lTo#U-824;L&=B%%Aa^1P4)g!yMy7xEMOwwnlB{&tQ600 z7{u=J)_7v;Rig4|3ORaSRhrvdM2>98<7Ewal8_{+^S-sGF=&errm8%kWkJD`;ygcz zcA6ICH#CCV3kM#Yso{Cd*(3>b=Rw|>G$>nOLBAR+qRh8RxZsW+n6DhhpX;FxU2R4% z?MN5zB+H?2!$q*Fua)OG_?2FZ&c-82^3vn7Ca~MFL&BdXtXojJ5GPz*NJ+I4$_f7w zjWZmWChv$oJY~dJv1pPJ4Et8ALCQl+vk)i#HQlo` zaqa>7#&412{qtNBb$O)of4fp)@WFm?b=e60FTRuA<}zUMs0(_8tT$)oB*<~IC70Oz z`(Z&VZ2p!D#!sg}iuPLi=o2f@xL*g3T#Nb(KaZ2IJA=tk)p@|w&YCmsJUcJj-S*jc z-Gm`$x7A!WpJbnuy4k+rc9wl$%DEb$i+hdYrU~{o?>5_aH*U3URL~Y|lvTBhnt!CG zU*(e>KheL&`Sz!pzWhSF+ollvLl;Nbx1X}K%laE3Xuz@5`Kc4&)^NC~tpVDu z*7)dH5sehxBWk&YuycAYd^YgLqN%1RWb;?a+moo(j22RJqKiCU)kT*Ye5NtZHsnu2 z4cT{A5f1Gg1Lg0h5`*w@kR3P@-d~D^kBLT;*u%6X=OAPKWqeEwn(f_X+ot9BU9&7!jE{on#`G4kE9o$Y5 zN4JuMY+CdEkxCa zLk{ODq3#Z2$PWy!tFCOM2g1(rxR>HUZ;h4n>4B>>PrHk{ct_Af0vkd?j?tF~>`DHC z2a=r9QTRzi6~l^FP&<`$N+X?kORim|V?PMm zmNsc0qYYNxth1EewdReV#jAOA2>17xvmPM>ob@psr{9o4#rg60`>+TDY>ZHKyBfB+ z=kxw>^>Fnn5e-eT!KDg@P-nf0#{Qc`#+o|V0D?C={1PBjAEncCz>Vh)LAnVYVs z5irr_C|Yc{<_8CRf##DX@XEiDCZuqs-WKu~ zQ+7#P;GQ19c{omt@-fHpH^&3L`u@>NR?A3Q{1TpBLXD$^X^8_T`Y~-}bO&?Ot#V%mZ!yDpEgV zo5S%Hx9syj-?Qh;*5z8cgm8Dp_K9Z7`wF+zJa=#^@aE=P>v6|+sfrJHIEu8q4IEt3 z9|-UK+blXF=y2HNr7djXmI_!Z=o+y)hBYVC=<-9DEE+<)eXP?>KN26WIfQO$tr))e6vinB zQIm9cyxXCHv7yIt_T5U<1|8gJH4o!w7~{ba=W+B$6X>|^gGFKk%w0SK!!KSWD%t_` z*rtDY=9Q|{QuYEK@Lhxa{4A`RuO*GDwUDZ|Xi5`$*@4-u0g3VEU`*VkAXUHj1wWc@ zMOUsVL|ev5ic5xJ?1og#mVL*|v^1fsLWk(evd#GL_cY}7PeEDdP||Myie|h&iDzU? z@$Hwx$BmpD%Z*xxrr`-sccw?B!cZSCOE=23^Mre0m z3m;dQL!!YE7&}5pq&!u~`j<{J^sf@Ro<`ce@+G-eHxjb$e<0ySdmvzz1&Ew;=zgnc z__RX?)_EGzyj4eV;r`iZVSk=xKXt;{p|vPkkbyJW`si?3Q(72!Q#E{J~f02>e&kL;81&(N^~DAL&LjVfq^m91-D%cW8k`@63kKEE@6}`qYnJ21{TM zzPLBw5wZSiOC|>M!SZ+{xpsOSbkxN1!n~qMeB%aaU(10f-G7M9&o1&q?>DbKppXvw zETO|rD#O0Un{9 zP>rVTT|tYT`IyevAZib+;2YaZ^NQtm#)fgxxOEa33>AQeZ7f(Aq>^bruJYQxU*|ok%2k7vU7CooTob2$@t{;lG^YhvcNIaTEvSljfkPOJ*s zr(p9eG?)>u=;7oE&smLdUCMbsl*k!9N=GpIiIQO7;yC8&vKO4bEe3+nSDH-S-lv?7 z_xmd?!&X=RG#taZvtP)uPLbosZw`eQCk)Bhm!{ymF$4Z=kwb%krTE!S9z7yr>4i_d zWb(qpSYD@qvy)rsxni~la?b=fx-b!g23OOYnkIPT4v#LFypnfbUly;wkcZ>i3nX4| zc1q6fpHC)C&8q!bwY}CchF_~OSHD*Az`@#`9YM9DL)g1Rj$*A$%T@8RtG~nnMN?~c z?no9Ny{l2Xxc8d)j-N{HhN8z}n;u{!;4>DS!a*~elUllh?;i)OQ(B&JHtbfnnO(h+!oop${9HnY=*~vw4p;yT!kC(yO=BRI@bNCqNVZ~zFWa& z-_sK@v{|2kYc{Gcxr}|bVW|CGgD9&N!fNS4Ad2O1*FO_>4XE>FSP%1>UM*BO#UV0R z($MgohIHDW&-BguQ$(g}GMv~|L*FuIaLb$1^!tE2>z%kkplKRilIg;70u{K0?HSo~ z^)?C5c_P^;WayegC0NZL1NL2~;do~{%yl~hb~AP0N;bhisrtxFJ%&$QETvpi2dE7y zfhlWh0E3 zcZGZsmynyw1L1ndSt9&9ia)t@r{w3(7j#+0XwduUD*0U&N;l7BdpRxQ@J^^PE-?tC zzhhD*1yK>W!0S(Z~6M8J`4roSg|mIxT)J_W8JSj? 
zPrYI+(4qICBxdqmy9^sEy9H|dg)e<>3$y0?3P;zQ*o~`J5}qsR6Gmf+aIk2naEsQE zzI38m=FsmJ&CPvL`r-S_~@u|nr0-mi6|R|jSB z)0T78_wFzHWmq~{mUE7@hXs={+GC;mr!5TBG(+}JbGQ?z2KU}im+Uhg4iOQ?((qGJ zDDQd$w?H!5b9*F9$|F%|KN)Ywp2T~PcVPd$G<>&n8E8&ALH$G2@S9x&)irCv0v9d* zeuMcikSrj%PIo2StCMI}aX3sO0Qfc@s@}W=*P94>dO^795y9=_BJrwcCmnCZVCpza zc=E>*PTuQ)*;Z9B_&b**OwWWu`~X^4HTYiE4IpkyfU7N$uw!xooK$@YW+mk?_W1yB zXQIB;KI$HZbZ6uEYAvakr?E8KM~=&`YKo&&4;OJnPBri5N0a7A>3WJ*+< zv-&|NSkCsSFO}a8b^%9t2BijcYvyZuSiu-lJfBP6G#g`L{u;E}mW$_tjw7SY(8%_sZewjgOqyhs-6)C-Xt4_89v%bx1Dg+LNV~JtU{;BXLq8 zrqwU-Mhe7O!Q#dqPE^fL4gSb*7?$MoT_9{62y9(4WBLEbq> z2(!L{ty_N4l=DyN^Q`$~Q{`>2IAF;aS4KdCrXsw3FOTo8y`c7gg!t7fT;l)ZD{soR zLlB_67BwF~ptB!PsMQ^Usp^m6*n}7`Ja&kV?hl0HSJU8Ve<7^OJ`9=4O=OIV0-a@3 zPHxJb=6P(7CGD0$Xp~!q9!?DQ)lHOooVrnftYu2o+L;VP2gbN%yx>ksCL`;249|a>ibt2Nr}4v=QQQ4HaOJ!ojNc|ukP9PD8IdJNVlXbQ0qwj_VfPVp$P9Tz#?FzY z7KNMf-WCR*m)XEn;Yc_ja3&v5^bna?BfK&wgfNp)e2!)+kN^qH{gMZMZv{~HyPHOI z?Ls=im(G(5h02G^!RY=zJY4BPk11b+!*vy8o5th%7x#3~*Ub-qK2hO^i8r9`q$2#0 zmrq}WcH)(sdtkR$U;1O@IvSTX9vU6J*$!4;=>KjaEvifBo%;fOdPjlvvAo2^araQE zb1~Mx*g|?YG*Wff=h#=lLy1Zm-JsG)%QT#+!r3$Ucl9mOSAQD#GD#FI_e#RO6=7`a zd|F`e0^FbNB^moK@RlxB0-b#;*tvBBlppv2q&OPi%dUo4%N=OGZwNFMzQf$7ndH&y z6Hr`IjnhI@`Axrv;KvtRy7NLhL^BVtK)W7-NhmM$UbVjorC4g=eF5_ukFO0E9}o@ z$O}`xi-FVAgXQlpprse)nyKyt{=f;gVJ+Tndzp=X$71j~IiNX2T z?qTz}VUo{(!{E}aN?LSgKjb^_#17MV^j)Pyo;>x%Iu#3YzWt(8*V}t_Gs|O9P3tKs z*D`?*Z{NZg2!_oU*#G0{Y`;RmTi(8N$&mP;IiDQRkuD3glJrg}BlA<6iTu)yFe)O1 zKd~?C>>kQ7R9|l)kOhF;G3-W&}!nSugkD*_&X|w|dNGS=zk*IJjF8PkUGA z)0&T&*u2snPyN^eTK@Txxk1yhTXiXV4$PCxcF2R8TM{^wwT0BLbAtZ3acFy2p6~qK zh<`oX68G+$31&VA>AKVmdQ@{aMueqNnT91Wc*v8Sa(Be+BdoX9WdsDujfN#l>cCWd z4sT}$;LWMs@ThhZtkJjyN90DK+N+zi(`Fd|7FPrXA8*5ea4YS-b(xkueG5^K5@5)D zA$?ZULad#VAzt5%?I1CP=Xxq6@OCB%Te1f&XM|wf!42f$2V1h-`aE%Rc#OWL&#}VA zf?QbMO0K{DK)cRg!TRxy)TJSnEDkvVNt=6#KB>i9AHGtv+%i~L`2m}*UL-jQT2whI zhEA#2j{UCc_+@b<_OdfW&t0FO+rt>I9UcRQ+BK->&-xCEjQKS#x+s5W5Q6@k$G^)v zc<7A#zd5^m{{KHtcD^7tB_Wk09D zdN0JT1`95)f&t3}(9Vj+WVwI!wGV9I=OlBeIO@SK>@kIhM+0EMEE3u}|AIm~2X1F( zN;Lic(u>C%=<{PAaoqFeaHjY)wNX7mAM2#xvh7KzDUt#|dOawrjU#V&->#dnxs(>D zPvGZ0YJte5iBKd8h2WhZq4(cWu)G)!f0|OEJ;xE8^ddlRYY7UWxWP7m0R`ZWjGWs}_BTIxqCjDHdUDfJlB~t>|6j z0a5Vcy)^?r_KD_4T^0FDnng2C92G52ixckq9xZyNF;DcTBu~_AoGAL`WoNhd;3VNa z`6}VSKLb(VK|9e{>2KojOCLktpT@^EU38eMC3Z_Kq45A0&mZc*^?Q?W^}0^Uv|R(d z_MM{o@$KHI^k@oQ_UIhl{&qDTn0t?0T_6XTaFslbZkH^<7CJbzwcgCfkPNd}?7T0S z&1+9T0#13O>hArNl01~suifx;-eftR!PA$;RI?gb&3YNrD}3m<9F|20ioxWsV(clZ!SV$e z?5-mlrtKVp@3jlb-z7t|g5{=D_dg^t#fKni#7O*9IUTQFH=)*#s)@CADZ9h8$1k5( z)7#H-$f7HE$wv3_u)0bHPrQ1A4spUG1-?>Ne{SBCBx?U6BXSOUPM3382FyX1i-Ipw_q?jQ^g4V?L|G1)CHM_G`y& zKbo=Z?q}AY;sBNf%7;Z}g&*=v2tYti1F++{W2K5ljl?uE)s0?fpi}&Wz zavaRDbPoc8o{ob@p$ z7{?j%Hd5oeoHlrB{n%F7rr&tC)zHx_>j|18If@-$svfpJKqteq^wjgmL~WD?9Cw~U z+apCtw))dQy^iF?0S!3%=8Pn)G!K82Jg3ldT9R|Mo_w~9<~`JVgARRwGpug$wvY3N zzsDzGw9Hq`*!2drR$RoKkQ~~2M1&s?D~iU;a=D%JBe<8s1Gu((j&Kw9MRFhf(Qx#* zJkqhUXoTYxf2>^HV#S(dQIxPEVLD&{}E3++8kX6PI96^+9AU7&Fvb zH8Qi4Ngo`+oH|y`OcbRuVMzvpvHw|dvO>3WcJI7f6?kxxfT@xbtRJ4rx#+DR_}o>% zJe5}v%=_EKX+Fyn(Ble>R~*}ICix38J#o;yLyOiZEX0nW4G_P&0tyxWNDNj#MHBlf zG>$w4Z);NF@J4HFtQsqIFFJ>xK28Ifuxnth|B6>;a~+KwzR=bFJ>>GTI4E}4AWK(G zz!lAoB>l}(Nz|5~QpY1vLw5_!KHtn{WpQ%eXA8&i(Kw^zc*Q8}>Zc?c|xT!G!W zGwJUcD9JeEjw{ACgE?;{PbSrnJ>$K?5#3dob7%#c%xdSIsn4bz+Ucm*639xK62n z={a=TM%!B)e_7Ky&~5LTf0SlTeL$tQu^2pN0k*SyiJPfm_+V9y#PdQF8a+*y zfLazP{L04ddvDV2stq(aq6Q|VzJ{;9a?+-yDfszskt8JRHrVW*3uRaSVA;WROdgp_ z?*58|5bHTSU!_WVT_p)Dd?|RXzeV1J#o=Nii(BHgh&t4g=N7$m*sHndv!jU@6)z&o z&i|u=2P|uX15|oY!h3G8mu^0zjlOD;xOZ?0&v2VIx)~CDowgod*&fE<&!XY}u>_Ja 
zs1MJo#)IoJmhqWvr|!B;{Py+nez@9_6#gMH|4jqk%*4^h zqjbpl(UG*<@Efh2Bv0?AZ717Ylc6oIlZ;+w1~>eD$+k_0p@82`Zch`??H_~aQHg|J zRIjJLf3NUN>z>kSbE-+!#jjN9{V<#_${~jBADwGQnUVgqVp>#NMU2`qVWrm;Xk4_8 zE?M!2c23$2KJb`0%qb*m_YEi02Ts$P(xc?s-3$2JCJjlHGo9DlOeRLBlAI40kvvMp zt!|Ap*@MjiY&$|^m6oDud@{Bzl492JMtXFF78V(np{2uCys}P;t2mlyT3LnrxO&n{ zhtA-_`1#a4@-y*TI}i20ZGgkB8%Q~mFX{hu1c!|~jG5UHgx(lK>lX9Kus?HY&iOy} z@$F&IwPhbLBLQAF$-~y(9Kw&E3*J-jknfukXw|Y1^0Q?Yl#P-hGb$43mj&zbf!Si5 zQDjNxA9z8J1f0T3nhKUF?)OW60$3QlJ^-w0y-rG>gr_vV>@?w^ZlMm?L8& zPmeRphGjAVpS7wYCSSBl=1s3^zCK!Dc5o{vV)r9X+B}v^U-_E(7HGtbSp0_(%*kUc zKhNW6muFfVy=vkd{8GUAmE6Oe;aLg1o6mBLUg-<6NA9ocKOn05FJ&4%sB?w>3`wA+ zPa|n>xh|bNT9f{mUq)^j&4A#jSLFPsU0~Ici6<26sJ%fQvAUN)P5-4(lN~MSyU~@N z_P2%+`&%Ws`G0xqV++xXT#!8ZJq_kZ+7P3}v&2gNI(Mz#d#?L=S;yhadbociySQ>O zlN?PC*gIOgjCHJU)Np*CRm*kt)peYE>^@g8wU^toZoK2Y+AglG&kb&v-43qemNc%N z&Kd4U`?C&8JL9>G^66di%HcNudcu+O@Vn_TsC zT?;d|{WP<0_u(pj_Z~(s(^K%F;vpyV%}VxJY)PN)UxD5x2{>ib9jq{HCfg(Zc*Z|0 z@oY^oP0~dgAAFCrE(wyjH9w{cFUycD?*&wgQ%?TDY4WWlU*g=UMV@qht$)$zf}B~l z6l31gHy&H*+T-CwbbJwSJ%`P}Cp@G}4zQjKi<5NNtpl|7O%=_!>5Q89Ph%15k3VzE z2NMlvp?X?6Hht+t!wb)mj@XC;+xBDB+?`nXK!_LIs_^^z>v-8`CeHk3jNc{%vi%>K zxT$3?7&Uz4nS5ABrQ?dI{+{VLtHu?FC0(xH@kt*n(l$d>3Y*__>&8!7zLHjjV(`j2 z2deGE_`QlharjLheX}tR48py@$Z<5k)TkeOlnL_ZJHmgCSun(t=Qo%)&`-IO=~| z5Zw%n5bDJ)s|noNXm@popPh+Hlkj-k1>xDFg~D&|h6}&EL1EyMA>or^f$*5ZQ(@{Z zFJb&URS{2htZ2bYC1L2R_rfc6|AY-*^3~lJKGImfVf4wkyJY*^Pdv>Jrl`MG7IT}n z&;w5vkTttxfZUOxx zQl?4XYV>5BImRyhNF7iM@88~u@8VXGUypUjjiTH1-GnFP%q1nX?r9@@Q!{u`=SqYd z_CZel4^sbL70#VA2T{y4lB~Ic6i(KGz+)A48J^>Kbpu`WonHV-L%iU1@>+8LlNlZV zn}fQ`=A!-uF>mi}9sDnNJvL;j&@Z8 zTrP{&j4qQvXCo=HnC#ra%b`BUTP2}mBFX;ynJ{_8En~Y}-JGma4;@HNB*UL1+3sn{~kK!NPz*)+1w=@ChsEFm|Of03^SZS{BJdr0X4xw=cY zlBlZfcFI|MlJ<_4hq>8rcnezc$?*zR2#|Whjxp=0?)aG)dE%y|C9;QBiV5nz84H17 z+o=DT4R9#@m2-O53;J~1GWZu73xicXaOsyUOxTr3m-aC5?8*ZOvrPsnc}`--d!peJ z4vZ?S0pGq0q_j8&U+(I{vBCmC(@B_pzy+_HJJW4x_24(>8*lOWbr_ojDEa+~w=iua z7^|n#tjk9+&##O2)dZ70)6}5KcnuymQN~>T19a>DZIX+QN~pBd5sAeIp6%EzB=h(L zr$+%*C~>f^{8VXZWAxG7#v@o&@Ig-BX7|%bW|?9kXW#Y;&hPc-Ik#+Mtd3lH#9R+N zX1zCRkYm)nfQfM&FYwS;&D2PKa~kX3Fx+2t%nHhIng`RW`bYG0WZGwOPAI|;j1(2JWL!9UAqEBjGQ_rfoywgz*G}yPL-u=lFiBj$e zqVs(+0dFU|s(KU*W=hBt`;+8wi#z!+X^!M5rDSZeHx_PQiVcY?czr{w#o>=;i)TB4 zSP}?g>yRdKXLyI`XRL-ef8GXhvs;|FGz`TTk~+kXvrmfERtv=n6??>|4*QAMwGFGi zHZobPl$s#kb(kyu{xDEHOZKx^bk9IsS7{_RidrW2md_EFH}W`q^)4p;p08lXm8Hz$ zv~Gs6Nf%tMQsu~rR|`gJlrp_%BLv46hci7Z`~^w5>4Hq=4^yWm?p|FkT6_nI6R3XD*XH5C2GfwR-5wylc-*jx~`b8u9 zL-9>(I(o3rkmngUaeU$%j9$1AEsOMNdH-?rpIL}5i>_k9qDBmjoQ@8~68dlY7&0e6 zkHl_@#qdL)Fw1s1{^sw7qjLT{m4(h2wDlfME3ky1@QLv5gC_3iaE8c>@{q5i0lBI% zFg7|K#F|PNJ~JDmm!yFs29wBm_8d4p60L7X;G`I3Ok&w>r51HM;J00J>FqS0pLsS_ zQ9DX|$S>aRVHvP@{$~gYqww6&65R?@CDLNHJ8nw{yxZ3Y73ME$G@EN`TBCWwanD}V z+`q0{qfm8GIQiYvn#h96>QxG_gjXs>!urM*;ot%}QB=si8mpkw!sK1K!W&oXSl{k_ z;V&I|;YjmJq1;u?nv?52YnlU(*6d%-2uHY25naCgs^-qq6zY3DSTb(|2c0**sekEO z2X7pV@JsCuY?YIj1`Yd)eI1v`>!NPno6` z`#TA9vX!27y+W*BnA3$X_rR{x4e8btoZ$5CczSa}pCm172|CS6NBI&%uv4$5 zIjSDyg>MLNPh=3OGkp&KU7Z3~cC3N$)rQVv*mG~3Wj1bk-^H^T;Yb|Ia&b)O159pQ zj2BwtojZPe(0yJ_2;=3XYg@Hxm=f#P@Xw+3tIyG@Tn#Z^@bS$~4qG8##+dOB1^lrNv=En4dmK+!=O%c0gB}yC4Q_nbmkJ=RDbUU<#h~ z>%)(kPI!Jg@Di0`oa=AP(%iB%$;fk>81>MKni=mQLFXRPDWflfBs~hA^r%Dm4rTbW zBO8=DRbk1IG#HR7@L&3B@fI4gv{Zr&-af}>PA27m@IMO}`(GUW(BF(3WgnrX=4IHZ zd!J-Q3dt~bcj+3t9VLY;Am;TG`tC(3P3T;UhQHlVu;K`fY~-UiH@Uv2RtOW*HTZsG zMUc>!DoM)`5~saG5_~VJw~y|C4&L+Nwc=rFSk)xasP93OhhpeZY2sOb8U@}%Kk1WStr)c56ILl1@wcqW10`IxzAF)rA)g067|EWTqY$?#~U zGJaxG(cA)7yT=k=-v;V4FPPqvvx7S;pTU6x-|68DD|{C}k_yLAlH2);q`$S0{L0S} 
zAJ5q#j_6@IGYbnDtiIan_GVT+QRiy>gxFs5nGl{zFj+q`Plc1kvaK`(<%*TGP;L27j8u|CPRgs?N{aCQ-29I?d+$! zE{%gx*URY`w@19o+kfJWieCDI%XW*0KOpm~oN;p5M_iEjjlO(56=LOFz$a0jI`4ji zHVTHgyd?zIJxl;!i38Xuf2Pt+Zaj^?B)B#)117kOK@@TsA{Tl}v|lnf>dHI3u<;;d zZg=Dt@2MpQ@19|FeG8iBwZiupZT>gw_q=xh`{*%x8~z9{AwQn?K!YxiloynvnW-Zl zXc>+wHyfaSUU$98bRj+~T#ZkI+@%&z>@ev`EcpKmgXX29`Fkt1q?LImsG7+m()q28 z%uBCiJ@w@%Y(Mr~XlN}ripUd9TM@s_n`i5gza`!$FM!YAbLsBxUyIit$+Hk13wu#mV z!l>@^i$rurk+#29qeXT3q;39xlJDELz=K=mJQJTG=W>(b(78xhf47?b&HS~b-@TOU zdB3I!|LLR%lU<((OWNm%=3bQ(@uOWh|29Y158IBy#X5nalZ>nA-S0sAA8QWUPd8p@ zXQyl=nw%;xnzJ_C{_^Ch_W34_)lr6n!hqsf;jyG*`^Ha;?59doYfNP;gju^ZMQ_<| zBfiT+vfuEiEW;Z`S3Kz5d=^qt&Qn#WSg5G9AX2D}3v^F883ZYb6 zAcKpxaj2c|zxvO=C{RulnoswHy&%ss6d>%r0ac7I;T5)u$oQx8BvY3fQMx7` z@28mH{a7npDYU`pz!B6!(VFwM@~gF(zw(l zUUi8r9=vj#%CbAA(%4DxqQDN8G!)|MNLN(7W(p0KuVL6aMfmjQ7A|tzk0XK>&|}p?{NiM(NnBVwlU1@+-k<|zKqRAt4TIh zzGa-{a_!8__xT*vJ1aP+fUi)-X116oKV3mI+X}-X1hb^E<>KtbscOHx6 K6hjLLy@o>CM9z zBp1(~CALjVVbW^#`tquI*m^z{Pjv_-ayQzD@;8<%dTc~B%LXLDm1hZmTZ5!N^(9dl zI75QZuyewcXx@(o&DvR(DPqshD0Y2tL);YLC3ZYBj{Es@m&2qlI@~9{bGf^54Y%cY zsz`0l9C6XWEr*@#5m>sZu14kcMu&+G3BuI5cSStOQ<0h4IC0+_Gp@Mu7+3!JOX0D@ zQR4kKUBpo_^~{FG0D<)sk1E$&s*J412u`I~Q7~_bnP5WQc}`60C_(Q)p>_SmS=M{c zO%pgL9Aj2p(Xu}LI^6oUc?CzEr(@%#GRU-^jbsjnDGS#B)e@XI_LEU6zgM;9+(jnJ z%u29+ixVnww&Cc+Hk>!78OJH?z{XKau@<9By3V^kdpMd@UBWF|>^Up%nuF(j4Td$&JM7ZQ0J4k;Rv>1o&s>K;*TGG9~dZ@Q`7;QOTDJdFN zifL>oiu9N%|Ad<@4k)qnxRQy|8Q<4Q^CsVNZs8f@%28a5d%O*8R;uE@S&4M!vhCC@ z?I3Zr%_6-lQ@c6L1i2?Y$>YP>(D~6FWN#+%+$U|quKC8Kv3I5OmtXs-zV5~P_QW$d z_TL9ygTF6|#z@Fw8#l0#{a`ozHD5F;lP|jWpg>r$yIM#}BkjNOw~1DzFrx12U7|J_ zUD35=YaN_ULcqZ;?BdAGMb<_$(f^FFfOdbZ(FcoB|~w=#|Jaqc|q;En|M zsyb5s{tP|3bs zlNY?9$*)#Ie#l|oeV&8_XGKdae*EUmYI_Jqwe~O&?@y07-jbM|-Gh6+{iW&WF49ol zNKA4`#5+q_r_Z)gC_Q9VZ&uty-el%u+wnpwj%Qt?s(+l%{ZN91${~2{dM3$Rwi{+G zA#fvaHLU9F$Mo5qEF*c5_qQq?rbS5HiYj$AqG2i&Q*NKK6!G>(rX=eWN}((fN+Z{A-j*=_^raWx>$RpYnp z;DY%*L)frfo!;qA!shyT44e7{N65v1g{3pmEdNe_raR*HOQ*3mK^-UDvZQ_!9H^6% zCQfuL#F*s^@l;kZ_4M0=9`5nzD*ubl$nVD1l|Qf{{v$pZvk_O=>G6-=khPimp_=oh zeWoBNkSl2W93fb!q-~R1zl5{aAc*5rCog#Keo$cESIx**j<5+V+$s3#IYUsryspZ; zP{QmoFcyrlm>{^P!x4P=e43-TESWRk>IFwUW4>V7+LhKDW_nl;MjpijQ{_>_W`)Mq z6od5PC(v%okc>7?AkFS;$p0uh6MrbbHVzBfqbx}X5t40&F~f7tSkh*xC|jh`CMv14 zXtS@C5*3w%Y!NfeSe|oFDt@R$DM~5rrF{{Ryz~AK&*wbn-1ql;U5*!z;I!q(@M2?a z&|aqwWn=MVmvtJst>;f&XH=rS&KvRe^g2rK-!|g8zLLC6Q|H>h;Nsw&uF$&YGUy%t z4Z7z0?V3**vws|TVaHx1&(fFvY`68(IQ!M3boS4xtLz|V%y2M9B;h$5Km7}pE;No#mn?_VJ?bNWU9u?!flym zWY6hAu4c~kmd%+EELJmIit<947B5is&v(dA?-3C=79#cwYZNqJd`9;A<34UH>U(Dc zJFLDD4W(StO_b336&`WHu}(qHtw=UayF{)H@gAR!orUt#iG>a&>iti z$zU3{Vooz&Cw8_y*r?vvuv~`x|6g}*GYdXD_k*viH~xItA4}~Bqa>s1h_>!=5L5&q zYT7&WEcXKH&;3rq1qSrUO?f=8EQzeoKi|k2JxbhCmRnzL0&N|srPEOE9R8f%3e3ORn=;>jZIm^y8W)!rfd4? 
z+QZV{ncjyaSmRkGEI$p?+Vr3EYgeTt+srJO&hmBRvg~)q*B04z+VmDE+3q*pVXNDk zg->m2BPcQng6_w_c2`HDSYLwWxeQ2~v=@R(D%uN*_jP8&q%LT8vC+eQn_b_i6i2W6Ys=Kp0&oXXr>9|1s1@O^O5jc<~EdQ7Lx2aXUKD( z+2ncZQnIKcHfSlmVfHMt%ZtuJ=^_$At%J=+Vqn#-ISxN>WKshcn56EW5tMBb!N9vrSka&Z)qOp5Wvds&6y5-r zgwqh;&Vx8}0~qVEq5Iq=;Dp9@I9G8Eh(R69Ul9YPdZwVdIG?yrWkAD7UE@qn9y+q| zC-IFH^VH5?AiG6B_Z_jy8REPq$nca*J7#zZ{FI)giWGZ%rMUs2( z7NV8Tt1;`x6jALbGg6zCgKj<6pnav5fx*`S>s9lo^FxE5@-8(!W=QJaW^~7-^In_{ zwDu@1<#p)~^8#aUGa@nzujrWntZtlrw(7^S0ba+sg^Zsm0gQLIAMz6Sbn==UjjVO7 z`y2RV~{3y@qZXa*LwFi1_?XB~~wD=|{6RPk&;9}0GU`zN^MDI;8+_Cc`7qC_CAH(9toaa=%7&JemcX$z$v zKNom2-wBol4+shp7(!`mA)GA35E|}w5?*v!Ak;bTEFAZugz+zS36r8jg(?&F!Y60U zgpp?{VO;(m;qBQE`1vF5{9URi`FDP$@g;}Y{FCbz@||1i_}5nx{>z~he*Ez)zE79| zf2%3*Wj)gPy-A(?@9`S^u8mjtkB`3N-M4Ar1-VvLD@_~b4easY8`Vc!pPaOs-}k$Q zKVR`O-*nMJIM<&EdkY&NN$Di)df|t>?Q&69ub5LEJWkzosX-g>?ZBS;Q6JyFAxB=_MSsMdiJnKk+-ad+nCgu}XZu{BrcQ;sfH-kotwTY>@gLZ{ zl@qN!oP$DIm!PYu|AANa2e7$5RTPo@mb&oc9Bf|t5ta-ML!X*8)ok|&&G=ys+gfyp z^NcjCL?`3ReTnqAL;S$$+z2QAZ>RSKC!YS?z392O|#o4anbJLtXX#3>&sbc-{!J{ z8S$)P&M~{o#wvEy)Ay`rgOYaK_;yyat)Cs!H-#m+cA(~Q@0Hq@kA(u43#;w4CR?z4 ztT(V;e-j86OxF?I6FPF2+nm5TA+5MF_&Hu4--TqUk}V}+>1}f0bZK=42LQvp(?Aj zxWzRQkL-~o|EyNPRwUk69DWRUrZ_;z?(Kj)X2F-}N%X)|4Us0-mo7cR1Ll`}a^`C* z<*cLtH`Ue9nX;qMr@b0QHs|4T{$+f^c0LqO%z#f%`1oUbJ+?@a#@^BmBw(#F_V1Cw z%kt0C|5XJv&PvHfSw1&NnB;S6`O12$2rhG+82`3wmy_!Tv*>6OP4hxX6nLlaM zxlJx!iy*u|B@QpQe4vesccK$vImk47F}{(woBUVl4VMq?hr3_rLa+F)B6OMyM@puG zt9bzaK4~X@yu1w`^xcAQzNWBBh*PBFb}k*!J}s@!>% zJ~w2H^XA^PmlQ`kl?N+fbb#HOz}) ze_^~IEnpn_Ajvdp_`%?vljcv|#k2mE>&aK$Jc}Q6rI^ul)0AoMs!GmJeNEKEG{~c@ zy~xV{55cid$+b1wWYYUBaOy`e((0NCd*k`2D=CeN^y1)F!+5lG(>iqfy8$eEpM!eL zW?HfEN#WJIIdg?FiY`+JykDl7jmfKpyezP{5?cK4At+wSRJFVpuJ3P>aeM|oh zdpy;MGym!)PN*4!Q+>*S^XZ8(XIfMtXZgDA)@7czczS1x7(-t3_!Ccf)%5&zjQi)i zd9O3uc~?J{@N_rpGoL6OXV~CaM!@IGJSX!bRV}AW8CE};{H5Kqc(A69*VFcix5Qwr zmBaE$49g?Sd9I@hjOQUwc+Fu|j2HW5xhvh|xe_Hq*wnokUp1G(*~k-@i_vo3Gqt#j zS1WO6WL(GpG)s~Gdp6d}mE+>5d0eXvl3c%yt&OECSE1_NxY^)Nm>i! zocsj=sq$cy+l|6=_9CL3jg#;BqPR!xs3^M&EuPwn9@bt$dB3;ea3^tQmb(?}p8JPR z-+hb5AH`yK>9ttmn+@(+&c=pI8nI#12{^j*8M!kY0|j~q(QC(Ed?Wb?X?dRto1bdxCVcumxfc9FB+`l&q@mBk7ogKt2E=im-sG;Pp zY(S5q<`A!Cv*Dh`RX99q4wDa@qaXj1h4rHox&sdcpszRJ#`J+xBbfs+Z@eDEN7+@(o* zpgX8zhb?#=ai`s07J;YMVc&pgI40) z|4SG)7=%BjVy{N$5^zbK0V%8sa9VX8>>@fL+Sv@AEE3Ovhlb!QiiR0AZV<|$?7s&) zfb6@spqv^H<9D0Lr}Zb$_Jv>3v4`$hc36+huM8p*Vz#_P!%k@1ITw#VVdBA*N|fYv zg?w8tB$v*#Bcloh%$_6eQ#@EhbF8kToTXfBqId$mSoVRcRCI+^CI``#jb7L@U$HSV zMh(1M0?2z-D*3Cu5t)d6BXY|x5a}QP2;&rjxQ{g?++Y^!>8+ss9Z$hERYa5$#;KP^ zI%H_<62-{93>(&|5SLLI*im`cL5#zrQacF@t&+s7#&_svr!tXn2l^@HPgYzOclq5 zcTsc1PU9s59_Ukr7wTUX*SJ<c~@Q=0fID2suby3V= zT&=IhjZJH^OgaCHVU^C|+lP7bc77S=jWU(_o(|6$SvCW_0t0itS4#)a&{CecBHNSk zgv;g&(ysG{>z*=7*UR$l78&sW>u~2;HD)txZiO;Z*maCf-_hz`D>Z)IMHQZFjXmG; zW&wI*(}{{Eri$-0Q=#Sh7MRb-CWq}u(S`vTd^ma%d4I|T$b&1?`Kl;fW_0Xc! 
zV&}mkmtb^gbp}fEnTD5FmZ98blc2vh3Fq#Zg^${Az=z8pA@%qmTHj+U$4!>vJU2Jy zEcvR&`8CgxvwHI^&Qt9Qc5&qucEv+4_OhxP_DPpPc9*&j`>e3o&XXHs*WM;#yB)dB zp4`N@J6)_TWGRfZ@+BUzV^7TGO!H1d4GN~z9?m|&R#cGUtEM>d9d1_g*Wxlh zOUwgvOey4P3EX(0qFnx%O%1=ZxQ@4R=n~I7Ig&4{I*DJuDThCP=@4J4wUu|j_BgL% z?nB( z49+S4qyIhmgbZ&Ep$~s$(0SQr^q=T0npvTR)x+oE4R4mBSQ8a|BvT1{Fm-WUs|c;q zc!B109H*2rwvqGy{Xt)#fwG(#fX*kS*vrjtCH-Yeu)kv|nY6N(R{x~}B|~xK*VSTj z<&q|3`;<1`xkA%kAH^PLgQ=)2^#W-qF(wU=hSwDLVdiZ$ZgNQyetlyp{;g7g#~Qlu zSdRjCQePLci%g|wZkvq_-kPB?FIm`a_5cp1yoQC{ir8^SJT=GJ4bJLCz>Sj;@FKbf zd0kV7wS{wF`LQJO(&GbpQksH3y<9-P4ZAp;V~Tyy%Qhg1?_DHtYc%~ZBb&Ic{DSno zTv5pPRlw58g2-bxLGM65gs)GAu%sXK6Uj5Mr0N>Xm~x+d<18R9vz*DH3x&YS9t4wp zQ$bOA23DCMWbwXixN!71yiY9vowj2{b9NXuo(RXv&*Jcg+g|jMzYZ1?g2 z$NMd0wYP{G6mEr#75@lnJP2NOp5RV>C%26J;K#aJxEb09U!yDFsf`B&JxKxO7)z?{ zT_t+-N&)h!1w_~BKhl4DAI@#c#4DcK;Z4ccupYIXqD5j3pz{y1f14k2?BU??txDv| zeE~7tR!G*Zi=|p#m7$k0Gl*iSD~$DefL=~4Z041c3l~CAg2GC2SvVcKTS7o`Wi6Sf zQ0SofV>)~v45GtyIMnp9T~Os{ih5jD5XXXR|_+6z@Yb0t$T!v7_O;$@emu*iHVe&#EQzbXge;&K|>T&cuG2l+TKVH>Vtigy|# zV~AnM#=c^I-Ne@(>atP=DPDUB@6YeUih7%{Xn!#}F;klUb$=6bGBZKhK_WVkfg47T z6%!Xq9X8v9gA}(4o&;vW@0Kg{g-!w37`%n}WiMcd*w?XNA_s==UV}1!B}mV^@57y^cpF9wX??J2XqzooU1L=c-@wDgR zU5(#&%mBA1;_S7-Y$B~1j|v|Zp&7e-=sW9H5}m0PMEP9~l6`DO9k&yZJX#TMQ8`4n zy3pYb=MTAl#vFEEWq{?=DBe|LOa9$VO+F`Ci{HCniWz+-n8)24zz7$VGr}9P8LNGl z@V86n@j@G<`6%&JmEF!aJi`_LtV;zhyuQbh%nLCF{B0|*@X`vT869~>{7jZ9f8FYL zyipy0Yo)GF46k=PNlc9z7&GjM=Qla{(GWllm~n|86=DBv)_ICqs0|m-o*>5kVhlp& zYU<*JYov(l09|bp4%Yb$vf|xr(j4%MIDE?nJNYGjYS$71l;&nh_5JRIj{ zEo6s==h(foueIBKQIX8u_=Yts<>$_`0RNp2atUgasz zBvRGtkVUPvUh*f#_5n4%lBpH{6J^4jX5Yip-(=4JH0>hKZ_OYtpwgN@*J>5x<=r}l zvD78&q}UFg`Qjfuy%`~l>)ZG68rD~;rIQ|U0i2Vd_7W0hyI(YbxvliE+(~8^aLX+^%PaSe( z4^cHYqVdDnPJAHB0ZQBXh{%}W2$lV~cf%i?BJqfjqys2zrYBW4u#diSdmMk!;z65+ zHrTxkBymgY$Z<(~uFlau^18*=-h7n=-Pv+Z%pRT(^0t1ohs-h3I1){!$-N^j3%K}Q zRXllH-AvBRn+Km+=_Ebt3|+V=pT7Tm0h!0WO$Qv?Kx&?iQ0zVp5~(&5E#G~WzHl;; z)Ei$W`#<=z@&;W5e*%(NoUr9A^wfyeRnf{?vhj?ddAggxx7u>exeOh<4-yLm z2J6NI*I2h|`xE}M($%%>>R4@p!@1Uvk-$Wm_y6t*b>FLvr9^M7eT)~!$IgY9C(NIF0pjt&y3@z3aK+aytxXdUc* zSV|1r4B`8aA);q<3%0E)LX#8HF{i1Iy8ZK}eTYOMygDw{3h$8mhaAr!|L zuR+phP7s469+hPA1C@Ps!;?}HQOrOkF>7-MO^x{^Wc(7Ya&|!<_k@D>k#y*p_=+Z% zU&TTFivZn_5Utq#5^CM$5tF~6F=1s84*r&kmMe&JLu!WbOw6>;Q4hxvIa5*Q#%?-2 z<}?^mHPAOTyrBCub~0r@I3 zaP3<}Iqxw>FP_%YHdExO3)|bN^8K^1XOz(4_Nv7Yxmg?K8c%|c($8s|u@S01W;Cd>*W$}zOR48)Mu^gPZOA+>3G2G=;FiqOm@z{ZyIr0H@(Fw4dX_OhB3h5T zuQgEjRk)B@Ukb=<6*B%6hOb;n!BZo5z`?dEI5t&}&-9}B(YrP?UIhj4gy(&E7Ey~C z+Lu}w5#$o@{?@I$+X8u}w9NDB$@-Z0_}@ncBV!t`P)(n)Dq)y0@cRkxY4gk~Yn5D{ zJ>ze6-kH-36X#w=!Ps%evu7)LKfe0#vW{%VD&JVx+GaU!Yk7$}EO*g1<&9{z=1-K} z6^`R`?6KwU2sGo5A>JRur~FEv>JK& zolN?@{jt0}BvzI4(TZ$KJGKX0; z9KrjhXv|ztI*lJLyN>tjWuxWnY2}P^*V)YV+a&ls&u#cCzWk`#==+h^6F!4Us~a!^ znUneU5dpkRD=8@6aTh)N7=bsvs6kUbS)hJ$2F}Qr#wLra(eB_s=>4oQ>XK9%sy!M- z&)FFX&%&g@Z>bh!x#yzLHxJNfUn5#wTZ1fr@Pf9h55y5G%CM4i4xa3khHvS0V7Vko zSYxsY;wxlCv3@4t)~O3J9{qTkdlEjE@DPm`>7sJYZhT8anp>4@%#AnS#H|^e!+p5W zlsjGK68+yY57KC(L7p;$Ny8I;G+{Cu9<1+%_tIt}h4M!*S`|S~dKdt=y%;uCm%)-5 zE1|BU1%#0j#P3@U?W$HqSW>Fc;nEIf5l_kGy?01oW39bf!Cr@Jtux`$yJm;v%aKTB z%M_d@l%&RYJs@6E)SjbgL`?+0+=3FRO#;y~2z1eUAP2KbHAJuK+6!}a@1vpN5)x#p3^L{wRTZ93 z>ca~9Cb(X-5tpk!L#5O2<2ae}Nj zc}tJq^#jKUS@gVBA5`X_0P}q>sp=QU!1`;5cvocwm-M%SiQ6j338^6mKleNQzBWSM zJpV~DLsemATAV}U6Y&|fKmvzX)zUqIxL8lA^8YZE_#d{Dh22O*AS0{rQtKGy||^g6n;TPS!W3k-EQg9H#zmM%kua zXv^Ma5|^q40k0asCv8~lzR`rAAr7!7x&n6(UZXu(GBDjY5vDxJM0bNd@qfYch(6s( zhCSV2q33)o|HK!Lo)1yn9WI>8NSE586y&tncP2j85Z{ft-Z<|akt%Jtm)>DBa~Ed z7dsQnaMHl(NG8a)q(bO@1osDK(czIoT2b8l3eG+R!4|F1ZFCtt4yR$=DtXLJwZy># 
zaoAnb583>z!;f1QAys^U?$S>{N2~4e>4ge)7VGrtCcnNcjO$xhN2Qe1tryw~f0arK z?gzaVSVrC!ZeCYX2PxrpC23RH#s}}%efXFs6rGi-`!3FE1tkpI8Gc{FzGb?FE%~E| z9rZPpJyLa>9r#zD^Zk)8XT{27j%KwipZ&*|&$w{AYFdLk!{@^to?Gpk>W86W)yE!t z@&8jW=fBu+f^n(df}gQ5llQxE8^eiy&inSC6aQyoq4nGp4ns#m+j>EJChyNT1AeiM z4c|BQG4J&Ax%}P_PJG{m6V*#x-h;*QBoK9+!KN?Eu#V9l!nAo0^P)sxzn}*dC@#hh z;->w5>QZq}e3J<`-Chod>?pxo zk{4syy=TBp#sl%cOOt~OY|x@-63Fw!4%oPD2Y!4cym8x;&y9Bt=OU(lGW|pEA6|qN zVXwjws92XspWMbo6YiDxk@kHE3#x;#2N@(9qTrTP1-_d74FU`^P;-wfmKf9&we3qJ zuMa4~gTNdZV`riDFAbosn+0dIXQH|`ZOF6N#8HfNbn?x4IG&e^96FCc)SNEFi2j9U z?#O}(elPZapic&l-M~IivT@QM7g)A^8h7W*3dqsTp$qjZ@a(I-*k?EcLStv+!0j}? z(o;pcH*&emN&wlM#!5Cy9Vr6ROg3r@rd;G6-vCe9a37)@vv3GgE6HN-8y?%=;O zi)6jKM62>Q(a-xU9S$ZQ#|foco7BW|6EeSl=4E5LnJIo+B;CD1gyQdM>!Bh~ZS@5~3`M8L` z8nT|n!Ml~3FgnmneSc}z8uq!McFoUppz{l|>&85MCPoF$jSmoQ!^t>q_zb=J z)ni=fY>C=$ErZn30Aw87LGjB>xHeAp_~utD5Vp4>$KFrWq%BOb6NC#@WwKbjEJt7G zUBki`4X}D?JA5)-gTMCQ#Wm(?-1d6~FzfpyQ9xl6b~*YH`&j64A8v?&$5R?WxmSr> zoG8nEc(WASh_j1lS1LfDp_m1IruBvn)_WxhP(08O_JKw2;UFf0tcsU7^=c? z$GIjb*wzQnyvrcb>@W^pYf4@%`$ufA_kgU_9nw2JiuRki4AOII(1Pi$pw6no8~54Z z8@7s~$ggbl@?{n<)})h+zOO_rqKvxe7z=h>fVtM^@NNwy*mSxD-VVg0BkKadRx1=w zUGR$-JuL?PRed;Rdnu`}Xu%)*1E6qHI2?ZL&ix)>4@p*cu;gM#QONH^@R}6Q-FQff zd8f635#RIPI#@n{$8&qa^9v1QIKCU;O}k&iD4nOzWd6RwJ0L5^^!3@s2(;V77(OP! ze^6}1*I)OEv7%)mW7;iE{;G9WJjjS+>I7UDT!6_kLi9lVZhONeXX7*F6svGf&`h4Vi z>Jyn6c2(@CGO; z8B<8-z?WiezB-@+gXL*4Wx~-5y6Aw=G>67f#YDtdA(K^nG^O>AGuXQ=s zBYoJn4~Mh$FK{`-9hU4n-;(X%eKI>_gvWl-QOX!TEXP;)5XFeuBgxR}zf~P{q>1M- zHitj`asf{PTk`i5=P*1z81j}a>18DNoAS+VYZ+H`-!f)>Xkw&50&gI4s9M%=d9~J# zeY|XmUdHxii40vu%vWnlVN85)U~HLWBjU8nigsqT!=ID0MHlO4iKy-j6g;MYC(G;M zh=d{3yzU)}GBBZi1PE6@4Z_L&p;*Q}1!p8Wk@nLwaESX3Y#A<#JHE=|R)HV+#k@s0 z1%bdp^I@`w4b^G%3Sws-BWjYmfD61yr|tmS)iZ(q9{5S`dU*p)vG&JhYK7SU zw5sUN2Y`B&Y%u(Ci>}$!2d+LIT*tL?-2EDW>En~Ri{rjihi_2aNm{$H`7j5cq+D@E zUJ|)Q_2QFefl$+xO)}RXCFLH~ux7<|6l{5sNCnuz(2)(KQqKs?i$Y2EjzPkaK=zLp zm6KJaCy7V1qS(>-i$*7=QlTpwP;^2)(pr;2Wc;-0ZQ?!PmM&xBp^yz7#wuWLz=U>- zY$*CVS=`_1gRwV1v0920H$eLlwz+r}cT97{0oRV%^}Wru+c^2Woy%NbJ0+7^*7J{- zSi7dS4z?Q z{rN=Z>@jpgemfPbD(*^cw}u9K5v*G~OlSUGOOLethNF7nxFr5IT5Yb5-%erR3!D3i zlW`kfIrS#aj-+w%^hE61k&gSVRJg)_Sy*#(HC{Ps8*Z^E#DDB=aDu#cVr$6>tee5Z zZ;2B3&!yY&?olSFE>su$4>dt~Z98G6ZHI5V^T4opGbo+NCqZk!l0QOSsQl{!-?Lbd zS6Bjbrgg&KxAU+!h`(XQtB8}o0PMAEz$6`ml>!TWsO5>@IyTTR*#hU|M(~!QTR7=^ zIu59&#s2NDn9SwVd2(~`<-a$P$BKBIwR{>9EHwb%)OMmjOY9e`*M;j3%Hj3HtxzC- z7ce!R5sOoC;FVbhz9rh+dsUL$L^l;~c;F|zT1kTo&b=t{i5Q6DcNqQ*XMvAmAI&f> zMQyXhy}~{&q@OJyeSK$0E#8P@cv*1poCJ(oOd=ZJeBgD*MI5=@78ZQ71O5JwsC{)T zP$5$mA+y{vEWOZ9R7Ky1`pE!ih1X!EA{Q@NDJHlz{~(4? 
z3+J{ZK<@1dcyHi|8>jD}l*Jsb;Hr;}L#$#4;W;<#J4FhQZcC-k-oHaV?bX7kKID?q z)k_`bS4@TTld8yQvmyB!q6wjURd9Uq4OE}@g3P|!480-Q#2|AX6o#rpx^5A4DCmka z``Ol(<0A~|`4Y@~I}Mo04%xi*!Ey{8VLW5;=on+%#G7HJ=gzAc|6w&VcUJX#=>^s4 zjvK7g>e$Rx7lwFe3)zh5!4ZaKB4X+tF=m!t`Nl|?`HS&rFo8GK&z5>tva+CW`qMbnAf zH{cwlOuEVQ5NujDolMxyL6QwL(%-B|ex*plB=MePhe;l^nN4QbhL*Aqo_NdFoc)|_ z{O&k=&ceF7FUJnodCom2tWf`9Te<&{aBA+ux(x>Rgj&V*cCvft*zNhET31waw~iWo zXgB5YKKAtcUYXI>HptM6%iwv{ z%kymRon#o?R^>O00^{3}i#+LGX}-im1!h>&G#+n07n((9P-d_Z{xMkrVrBE;!-i() zo_tVNZO|=|6mHi38rKKS^|R{$`kbZxgxX-iLOl zAIA}&e}ir88)B(>h}zl|hnl~N8Q_i6MY~+Tqq6Y|tW7<@HQW91tKEme{L3d|SY!<+ z9x37{+B{sXBxbpXu0t<6&o_RF5zxvD3$dGXI_aIrB9zo2av*jiTKF;-TeYZwala9< zR*688g$a27zbY&rnoL&R*h_MkO$Ozq_3(7E725N%0K9JukgqplLGU6G{13EXaT*3U zn9snjCTcLaa10VBXNVb^B9u3xgj>W?G`ra;P|sWdPn5Lrg_=`j=GCJlu>Uwwd*uqw zxrSVq(qSx)P($u{CsE10owTU+mqYQh1k4vZcKxt`NF7y&FRrCzE=d9-$wySDVL4d` zV%PU9cU&&(kN)H6!TWjy1+zEOTYoRa%$m>W9mgE!7+)Z~Kqpe!ddzNXQUa^bU^7eO zs;%9D4jG%CpU3PvP+V=?fHA9d@>`Y(UtXA&=xpbBWSwwrV!hx@-89zKqx%H+qcsIp zCW^vEet`o04UsI1?wx{tkDu2bd9aM7r?{oo?-ydVAJ3@u-ee8eoS&ldMa$5X=|$w5 zegZ_QJHhAbipGoEb;KQCIqF4x4k#TqB#Dydbo2Ey_-OuGZ16k}CWjnA3+{fPu6K1} zOEiHRQ+GqqT4xgApaPn&a?l~;Uno39iKKZhCM;*Z*p+pM9$nXk&u-1cBde6KLcJuU zbc$z8v5S%E0bR&{I~C9Jmqmf=7a(TdQHNW`55Qw+4%QKkqg=})$i=$S;mt=65N19^ zFSQP%UGl%s>9HKz>#iqQd}8BjWi9-?$P{b8m;^aHmC+kYfv7F3qFweXB7waxzMh$k zr^nC6wlf~1Z5dAJ-FGh}J+8yuJDW?non1@{tEb}6;bq9^)DT{~iG_1#G}3%WeP|AB zMy125+?3^)aeI_AR$fs^tlS@yDZQDfgHeEV#q*l3zWdPZ&LV>UmXb)-3``A$kIix z-7%(QU%DR-ym|ruZdpwvB2_`6au`O{=7^?tWI;lY9r8~KAbJW67~N|Rn-(jG7VBOk zF(doPw0$DtC*@3k^1KX(pU#7o)J&Q&Q$RGE^}*ai@dJ^4bw9&??lOqF5d%8X-hi^i_x#XQywYn8 zKKv^I=)>7y=~D@@{I2Rh(=3_I*RNX-8tCy4m2>!V-tCNk2~3_toB{9JTMyou3#llc(`_ht1=A$NjASvP_Z5dOwT#sK1D@eD5WO?dCy-OAmv8ZBGv`VQn`f zvv>x7Jcs6)br0eOBQbNh3Q+9w7Y^Tl--j((DYzhuiI-0k_u$r7)?eQ<25OCwc+>c8 zx*~(3mgkz{tc91svCXj2!7c#hIL0#O7oSum?=$b2<+?aSF?=Ig1+AI8LEq z9BaALoUw1poRvSnvz;D2V^8DCan3Gq;*`lhVh5>IvJ--%*l|zN*oPAu*_vE<|_>6HKXPzv34dYi%5^us-z-yYF$hbL2f-$?aq&i1060YWHV!>=C zK5ur(LB*dBwzucPxerQ|-lY)nyR8p7yq5sef-+(hkc2``V|>gv7scz8k!urMz~(5A z$W<59ZZ%F2;(Qynn7)Hk6?=h}HGKc=sBYXDcW5JgcZM`B zQP!sRdo0G1!)LL^kz72o-w*SI8r)jOO1wYf5}yA%4A)mL#ch76IR0TXIXv*5Zi<k{o;#?q?K^_x4Y=oc~@$#3c5VJ$P100hi71mqn5;0RLrKt??U1iwy_eI<74UiK|Q9i~Rui3~t|0&pRNs5M@da#V(=>c<=%D`n-`EColS0N`U z_&Pus;!K0`Pl^l}wvg7y6`<-B4o|{QgLF?a82;D}Mv+YDlFo*R)lpD>bunbOpQYRO zwo-;=Ux|{}E_lD^E2%43iwCs$*z}|x_V!vLc6psck2gwlzstDbQ-{p3a_oFm^7%cT zSmPpgch=ylggaQ3U|y~O`<(~Y?FJ|NGJhl!huFDhAeifo-`i2sRMC%;xLqy8@T#~I^*mU!Fa z?tQ9|Jg1h#9ndAikJiB7`)A<9kT!NV48@=Mj#%|zIP7>ajxzh@@usXG+`7Y(GVY#A z5??+eE*Y1owtg|6A!eV$%K83{ekTlwuwgMNchE%B@*g-{x%LLFUvrk&k0pb6F$E7! 
z`H43TozyPWOk(4WKw zx&gKynptt5gw7>J-#;}ZHctn=&6~-_=g09j%eh47_fOI<-fM3tPe2bi zUN~|in?7c*PN;qA4L*zz6kfXpFMCyunb*p2`~1^55BFfd^|jbdI|w|J+f*XfNqa<-mv?cyh-9Y=idWUW|x1fI7Q_NId zfQU!~k_D~AhI0mUCgr1;iY0ir(?Zn!Dhm&m4>AmX65bk-4S)WDL%f{PDn`UN;3-(n zWbQqq#Fx6M&U}1QhxsAqDuX$*(3-j5h`;8~Ag?L=5^w+aJ&c%b^^A%IlldkII~iJ#^K2vZ z>@fxFg_A}6l`eSV?1lQlG$lBB;0VdTT`lgC^-;rmnlSa3H{4RHB6dI8;Gu^tHeECi z$mK*>-rGZtZO$WiGB?*XZ22M_%U&nDtL01ZpE@Jn_VGo0eZQ@|(hX94v+FOd)z_5pO3KH1lT>P~6uN|r znK!TVxMx3A4;FMYRKCUVE~Am3T+A_|v=il1sIV3`TD z$Al9}+JvOadlJd@y=dyfmyP?n6dHGUo8g~{6R7!LJNBuLg!#+{xRmEsrj=`@+T#>s~E+zCVL3edLz)lFDowOdkIM9JZv|eFP>7TfHND)?+a?p663ohUW zr2g$KGHXjCM#^KLYdQ_g|nw@5b~~s2=_f#7yiDs zUiew*m~g&*vatWz3qi7Tf#B6ofnZtOD?tJi3pHlF6x?|t5PVFG7K)z!kD~Jqr1Jmb zxKU&zDIyXYMk!=JpYudnr6@&7sie{uEe)w;lt`hhjEtgTg>bpg=iDNp(lAOTQAE-% z?eTkl|K9&x&wW1UbI$v`UV2p{bEB*BUE`|;|1?!Dzxx$t__Z^?z8B!DyPq&@)N4F% z0r0q?mxl4RzWT#zOq_WMr>0qA_Of=I{VosV7Hq-jNx6(#TO#TdPsY|h*4o$MCHHqU zk^^@}h<3=Qavp|D;Qbg`P@gM7uQ=(!zni@8{Fw{m{b~;^%6m)JY7bjS9sxR5uZzsk zbp`k8baFSNn7rTa&Lm7vBk|d_v}K$uxtx?k_s+a)z3iAY#5((NZJh)mN&NOs=_duRS(qRm#2&aD|_Y{pI)wI>=L z4aI|&dbYKl-WKdW_K+BzswR^bePKN7u5)SGm1y*36%I~G#?JTe(e11x{akRJ_!ZV?>3N=cH7!OmG^Kgx1phgXNL22k@hA7a`sFhu{{z@7l$O_$soY5 z&)%CyPV%Qp+Do}Y@h*%`J5KfVM>1}2JBhAhH&iO#gzAsyA^!yjaaL`RKE@YHCcgqh zl`y#WI+y%(-vOt*u9CZXH^AX$p|zvhO*&4~k8o4o&_{(Ap!cyYIT0L+bAG4O`CUix z*rwyC=l=#Ty)Hq!FJ|~4PZ3pDiQrwg6YLL*s#&&U94y_NO1yp65#`DH==#Rw>UgJy$TgO^C0W%eF#;xfR>8`!mAVSQ7JtiXlq-@Gfstg zf7?NfoqZ8i+%if#jKxeM*NT^C``6X6&a{XtX;*tZb>3Vfj+3WgM12 zseEl{K-t-pKs?~1kBJvevCVxSE`2Uf?!`aB#_k$iXHkH|(~r@AI{7ec+kua=Gw@7E z8~u{gO@3aGXC-U;s6}B2snqKwTaU+q@jjM4>z77Xl#PM~B?b^`r3iVKd%;%1t2(gI zzgn)yyILmFr23-%2SL=YW&v4VCrC;x5~Ren3M|j%3s!yz5FC^<6)e@N7i^oMQoZMq zVfBc?U~~PfNrF#-rv&y==LB*wV%4^b##9|Y;3sgpGTwZp=5+J42`} zmy2oj8hy5~D)SVmm+qJwZtP+2{QsRolg;Nsjb-mQmsPI$T_#G|X ze?S+0I72O^YYM~F!6Q&Ko6i$2 zOQhL+Kk$`|5*pT4(`Mg(92u{Ghws{xv)iUZPF^7#?ytv6$9)*_KoWfys=|dyMbNN$ zhDbj&7wzw-QN!qR2%g&wWBzKhi4y7}E1{F9o9A4;e^zIezHB=b`q`uPjSo1?lR`Nk zCA^<%N!)xZNV?aXypC1E{^K409NCakFUJMIPyTQ1{x5>e$%IM6$ zS3L?2L&n2wvg8|&lx`YD#x(2W)6GZm&Cetv(f3Md-dRlVKGdV~qsP$Uhz24`5`)zD zvmyD85?I@`G37}kN#M#HvVBb=dC1)(G98Pd>01Piy4TAjWk+$PM=EICUw-%Pz~`zF z`|YX}G)t-`UT~{g^ftHZp!9(%mxsG6Q?pB|6xoWZ;R3@dwa%8RLan^2@B9qC{Yy=i zo$I11B@a!3YMgylMPp=D!{^UcHuFzcEju+?aPe_k<>ji$RTCWMR(*9jSs818ipl3@ z;6S<`RlOETn@oSw>!B<0-<2o0d?YfyK{ud?DwFO85A<=VMfWEjv{uU=#K!E!!+KxH z)kpVg#`{^3O~zp)>v{ny&z=gu#hZm@kx67}fDJiRdI6_qMuL)sIh(a)8awX$VY=es zYy7%LosIoDoy{ODJE{Ew>{h5nU)_4B3!BEyx%Ufhd)xAgo>!PD_(i20mDr;#LSlD2 zm8*WV2A_Yqfd6SpL+XklS|#5^pN7kd_Pbod(HE4^Al;H4n0=NiDNi9Cp=St+z2au1 zSm8AtZ@i=|&5S*K1}5Fw2pgAr(w+9*RJ?qUc$n+Lmumy$vi(HR8$E$3Wdcco!%b>+ zyPFeVwgHVpno;#uFK%gCPc;Vu;kW7=;y-yN-TOQZS4Py+{8n)oX=XucEyPguw+S}h zkE27ydBk+tMWT895g9~l*j~IA!zBE0rv7;1{ptx5H}W0%KK?k`syfgZekNN}Q3yKt z5}fEbY}A*+p;;5SlC`NYX7_a{=y-(}xBtOJYd_)Qpak%{DZ%VAxx>(wUUKk8F|iu1r@;*Y=(150d*^(|(@tYVkqX{i>vJWxqC%Y= zw@F3xrfZ7mNd9wXy@4ESt+I$+F#S5(EEV7c->*!a@=Gv|eF!;SzSwdt8pk);5rJX~ z$ZqlmtLKJf*@i(fSiBPrT{fZ1+faTtX>UpE*yo*5$#Adf@2FB6zedpZtiOhqu{8JlHTBhIrS&-%qjR zoc9=1?%a>E`&y_`eFR=<873DmP2pYfTS=hiOl)XZ=V#~I5cF<{UhWOWJ4MlqZP*pY zIXfQKx6P)%-`+yo%TAV-@26Of3$?Vo_t)0)djC4hM$aM((c5(Mr5PX0m-oG~D4nik z=_AOrs8q`}pDi)Xd^ufeQMC4(h3@Yy7CuYGETTg0njc${Yd-%+gGGDtJJWZ9krtEI zE-@=wy4%7c(^o7zB*sSmY8#9v- zH{7vGd^?hza^xtf4StKt89`LLuM}n}A*>$B@7T7~p`~dBxnMMeUax+G*kUnM0B$9xJbJu zMZS}WEe*x<3Ce87R!X+ASMjV_3+$a9jf!%-@8e`OID8Bf-Pn@CIl0efnd(3^%N`{< zN=}m#KIxb+{R7-It)V9G!s)$`G2juK2I)Rk%%6J-EGD%>s@o43n^;A3(`!h~eP3L4 z_ci?_+Xj)lClJxgVZ6fg>$8=YqF}}woHB5XB!ny=>o&aOB1iuRxle-d$2B4C`m_t2 
[... base85-encoded binary patch data omitted ...]

literal 0
HcmV?d00001

diff --git a/tools/accuracy_checker/data/test_models/SampLeNet.caffemodel b/tools/accuracy_checker/data/test_models/SampLeNet.caffemodel
new file mode 100644
index 0000000000000000000000000000000000000000..274a07282bef5e365ece93fd4380bb71cba9f317
GIT binary patch
literal 248617
[... base85-encoded binary patch data omitted ...]
z9miAGxCsiaQkQ&O`()NM;+KbbH&QS2WIv^pv#U@lI-MXr_RNrKvYLnL%iW14zDzy# zxzAhCWy%{ei{n`Z7Vxwdnxg}2e0a&550LJ1K6k~EJhHfxM|iY9DR`zc^(S}|_3wNp zwS8L|Wjj+#%3U#;cYJgq&-3BJl_?O(EGXGttyl8LoN;eOxxo@KMbo+VU&%gn$ z<%|nF?iPDeY~V}T`YRHR3;(FzT0VKU%0+l*w~Fx19id>a#~;STFjYa#>J-MaJ`<+< z!70p&qt49i36_Fiqm_c6=4@vCtw!77pSNu5B_@o7-XsC*HK}w6d0<;`(U*DO-_zFd zLyA$e_lGXH2i zPa!sy+oE@v8rM`T{q}Dh;c~plzjyM~Q#s9=mg_%j&*S^l8*{35jN&$)3Asb{EpDbB z?NF2c9@oa}>l%;tMm&+MHhn62wRv()^In%4i!!eo<3HIFtN_65Zfp#Bsd-!#W=_J6(qSf3koe}2<`}<*#6C( zBm8cqW9#I7O0a$;O7QsnZN|jZw}RVizz2TDmjW= z8BBy@z4Tiopqb3&C6c_-k2r?uMYhZSgYT72u)*my4)1D!TWw}D|yI>QHH_~BV?JX%&{@| zDN$NE4+_Nxp~tL{%skS`i?ZO7hD7%H>bjM^=wdz0U~SBfi-u}p;+bLG_!aH z9%in?mqXRD@%sDK<=Mmb%y)_`#;QP8?Ym{{E9MLq=aN9QBkG9gdh#0k?7da2Bit$M z(M5W!*ZaqcAFOQ>@n??}?^@^2?nt#_>(wN)gIe9$AwkKkl7Dm9x&CX|dhTiL7c(Q- zPY5_Z$A!uQT7W9@rIpjP1z z2TP2gJYzqp=+A*+xl%O7s)}+9QY2ZC&7{x;(Yqbm8Md>R%sy!hhnx)wv86j(gA2id zxf~WNcax}7(<$Mpd2l$y2A<0%qL$*7XiMdFqS$j7%JB%QYaikLi4I0Do}a>F_b$cy z1OIsb^MCNN=f2|QHy*)Jf)2F(-8FJr#~JnB;E{i0GSTwZ0`fl10DIRZNZ*=Gr5=+Z z%5?cAXztL&CA8OfD=!|YYx|R!t6vE#`~ZsT&BP1RbMQ#uI2=xAX}F0ih(k4(teQTZ zD%CF~9iHZpwDk%4WGO>Nh97u0lP)1Cs~E}lcM_jDWmM1!+H>;mH@D&M5c#sZjQ8Zy z0HrW0kDiJvP@vgw-mYgOXsEo1`hBK|s}p5}`?fE}2lFb?%C?K>zh7q9$+7`^e*tJ9p5m zfGPOr@OPjp^>`E3u;aO!!d}&533antv z+kC>fr4RXA>35rcGR@ORkV>-{yHgPMBNXmueItHfPk>p;R(#et z4EsiS;ZY@3#e~_?hD!_KH}0QZnQDC?%j1(ZY>u|xBcdM>DSoX!lmUo22RpChoyP!R5zpJw~x zvpPfUG+DUh)_cLY=jMXlZHB@CU#?)$_g$6Mk7}XW`XSg`B!OG43Ct?+;V~w^Mj;#7 zWN5Y?9^Pq!f7B=9pE|9)Pm2}l41Wyyw$7V~ITrY7<8{2|NHEr#oQ;DFr{kQpiTEw; zvJGqN#JR&y@zfbi{LKF@^7*sdVIxo5;m*J@P8u_hGtF=ohek=UBjwd4eymJ!@34v_ zbHincgCtMPWJa);7A#=z)CiJHTI9|dTiU|@ ze5o;Q;*V+oUKyeZvQ zXKA|tNZ7TDzm8WHSQ-d5d2Gh2=e~k5mQ1F6)F(mBW;;fz+UeXwuuDcs`LgrHLy&-lC(jbG-ALx~USi~oi!UL8PjR~MmGCQPjH z?K&)O%>(aSt+46VC)$H~2oxEJU;3tn-k-d|8XEWUeugdIQf?#Ne|$>*@^r!3;|5gh zcaaTt*x*UF>D2368Pw9?L*ViI1Gsyb6PqbTh?(P!+dTbo)GiqsOrX6wImggEx(Aqf zsvZk;)c9i{k;9A6qUu z={z7JpPpCW`n;0GFn?FA)|P1ZVWddpY};i2iqle6D9yEx3bwZYZ<{kqBV1k7;LyN& zX1AWjb8ol5RJxN@RW%Q`Ma+cL1Mi6Kq8H>#xDb|l%R|t$zqPyaB$UpDA7sEU4{~n5 zs?FO)?`LWs6YJ@8c4_+!67X*YzW0=kGyTt#p#Dkt2zNFn@(}gf%-HG|q-;O8VdytoY z{|Ng)HT5=Jl`lE)4ZoT=i2c?y;)ex4@a3frcrU$M`P6V9y;k9of#dt=yF(=Pu}uy? 
zG_%0nPYO}Kkuw@iYbAE}B1+G~m)i2RjGSp3A;EbI=m{{N4F9<#)eMb6dSA4$qsIw& zOM5*$O3J~=oxZ90OWcM~qI1LaE+;E$-vw#D{92`dX6ZN7ud=@F--Nx} z62ZSIgG{}iig(`KiRauni+5d$K%0Wv(Ok8;M5~D+%^Mw%ZhjN)=~|D2Ht3-+-8y14 zIg1R$oW>fXr|1xYKcPR-aH;?yoCa@OMcr95^Wf$1opcc5M}&JtV>1js)9!>+}1k z-oU}B`B>ky69wf4;ScC6R-5yeYTAo%z^4?hPL26JvYBgcqM2WtH!)} z(2==f;S}3ycNL-kXAR+`?W#=1+WU<7RhtAy4>btn{3kKh{|=K&8`r^1*IfA5um}wE zQc+Z-4qS`V0|A{`^Y)7eoe7i4i{2ulWvmJheioCM!4ot`Sp$AOpABhcTaf+L`S{=` zb*$>&O&b1mqrOeL_>YGK&ob}AZte51uJ&@t$Uc7$b({u2&*wPW&!Ttiv!nqGBlv9f{?G zx$Czg8V`!Uy_;F|lXC(U0GGfdQ+5_ridWEU{GE+9uiJ)W zj?L!T(d?tb)GIjh$O+uHwh4b-sfMb`+_2KKX?&HaWHQZDllR6`o1fEfgLlP@mCbaF zC*^Y*@nZiZ{LkVFo}RJ;$?sW#4IY(a?|f>N)us zttiVm_K|nwP89gIGvqd*?d=OStvCy=hpQ=oc&7e(*pIOU>jG*{WE>^ULkUAC{hi9SdwBU{4&>jNT@W z{uca|pXNgJwzDu}hrH}|lPc}ww&HJ!s;6gyJMe6-7#16KQr5=Fcthh;=zvY|YCjiU zQGSoZCtLDoU5$kPOLjPAn*s^b8-b?H``~T-V?48Bh}{3{i#Pl@2CUXIFs|nXns(_5 zcr@%mzUT((Zc{|&epkrCwWfIY&4ak&?Hjz>qK8 zzIqci&p}}66cD>uC%)O$)$~0;9paM|WGCI+@P?*H{;dtUyy3sWpz$UVw%wY6Z)}~% z*QoX(^6P>iG%*SakIl!WNizKBlmbbfumYHUnPi7;3orEIL3ny61IMp8!rz{lK_zSt zf$68!WclO^!9}k)Xx1VHuzs-tmS+ZmC~rD`K8_F8K<7@}XTy)uQ8fJAmjCQd z9mwp@!=2S91>Y3Rg#9W?!fn~}g^az1LYF}wTdpxnc&utjU^M#3c7OMX;B$wm@VmXA zaP{vwjIP7(!u*m!!NZAL7^Nw;0`uHAm1|AsSK%?+1^i!I z7*oyY?AuiXtf75`Oj|x5TA#hZj4_@3ZM;#%jLF4QL zi07Vxv0aZzWVbqY`nQ|(L}bFc$ujtoatPlXqBDtsxu}1B5FYb!4`n{@GJEmA3!L?G zpE)+~ZgOT%{K@fZujDYKs+p;=?Fyi>AZ`y|QSh9vRYdnbji%oD<&Zh69~ zpPPjB!I{E%;a6c=*+ya9*{8z5vQ*)%_aOYfB0-q)tV$SgJx}PWwpytD4TN(We+px? zqJ{Wiwy;#cPB`Ez5?-i2EhKRkf(c1l!qw@a!p}=z7mPqch}=Y zG=k(VOp#SR6TtJ%(@14`ESbwV3YC)scndxqBx%ofL!U|U0 zxuqA~^V@}YgfFDM&o8J9BqSI5=EL<~6O=h2gYx-}_>9>7{IDfkp?i85a?f5NYq+(5 z|EiwD@11m5_AUJ-q{W*fQ>Bk&d`K?|y}X&s`Q8H>^o!=z&5gY5T^G^PoJn|2a~4)@ zo`4pO93njx*`RTtj2CI%fhT-SMZC5Euvu{hdj9*2E|u*=)84G$yUi>h8dfi$`dKlh z_SG8>4)_sjMi~j*yPE_nJVg&aIzd*oCrL5-K>kzgCYB!t(LH*0T^g4_qFz`-SVbex zuWmfn&kaVC<|~2k?hfAi>KLjqZ7t}ny9$qs7NX$P<^1?XBQ^I!>RKrbAx9;!Pp zVyU`TO&ilEI;R_5y{1RT%1l+P_D@`Ix4T?hY!F~B%5!C~yw)XFQAd}vs{R~ch)jnrYs(Mabg2%y`m#a zZ!Ur7?Q2j-b_xXk>cheLLy&sB5jJug(T1bV*m-Rqsr^$zy^hKOu}TOu(`?0v6(uM? zAs?;WP5`}ChJ7s~1l^aSVEX$vfK2WR8JPFU7qR~H1j*X- zQsiSRiz-TonN2GE99BQsw@N5A%*rMAxj)GM-g@4DgErE;=Ye!|oF>}QN5~J_m%IL3 z5s6EBLNaFQk#9ehARzrKb@!eEgk3p7N?nJ@f|kXwHdKV(%uE1>rYrch+&`k7qXXXb z9HC@76$-6ZVpF~zz8RN^6iTBJMCu^Rek<7D;S66;KD?GYN%Su|K(zft=zG?~dt=2! znGJ{(g&v}Uk7c5(afuo z-+^HC=d(77SZ<1<-+d;9MH!^d`#E~_=`ivBD&b~@c;kqzCGd45hGY(_;N~qpyt(^( zaC^Oi%t*bQ{@pubw;U--Eu79bG0>C^tTdBpMjXU03nyX)qh91{UPK<)y+d-QW6*57qH;IJk5gDs?&ok&or@hNbI}rVaxi%;8awOgzZzXdhgANP+L*Jd65j zQ-f0OrQv%E(qT89l^S#45S+jJ2;D2BI}#i^=aw>rrZr6^(qTUO^XnA)9OQ&&b#J0s zd54K^e;^9>*TpSwS5YB1GsUiQg$y4&1skVqgjMkoRM^}y-r26J zlqJZ^7G!0>>4$-M!yXxRe)+n}-T(Fq$J{(Atb3{=wC))%l(}gNMuR&UPKrCO<$W>{ zK4o?Zx_4FzzWp929MDS@1a2s=H0b3CD?FKdRrA{`i>yMkD(6UzK2`M9JyvHq4plSVElnpEx65n0i9H(TNddaJ&8Q-)`Ir_ z`FMA0prhe5d${R6A8($T3V)L?laRVt=nl^3**gZ{F@n#e!CC=M?!85Rmj1;5HeRUR z=duO9znaKNa2JTtpE8M{Xq`lq7Ad*d*($k4wMsTE)S%xYjV$3)PmU4qx#ULmAIb6~ z=OmxD6-y2z1WII)YU2OWIwf__ylYk{>(wlLt08$1u#nwn`%`jzn`O<#KW>trk5+P) zCan^NyKWb5a0wBnt8j#O-1;g*o~a2nt2}Ls-#ZJxYe*PwAEE`+`T)VcT9)wplthM? 
z;RJzx)qcj`XPj%g%3i|9Ln=ajA3LG$gL1)}4`sHt87~-?Z5tVfV{;jXwPuW@*<&HK zq7GUEia_#Z6Y#$e6HOyUyjOQRc$=?+4g2{pru_j*`{RRk{o)B_a2R0j37D|^EDD^V zg*P>ffu*7g#8y&5?k-E@qWpYn(nnMHRNPN~cpHF&Y!a+Ib&9)c(I$8)dr4Nlm`vh6 z9mZQ{EdmcgBNWm3*N(ITu-hmH)qlVqK!ZGtdkk?dd0vO=K{3>22COnFhyccE@8C7b-URCG6Q4 z2Qyy`lOv%5q<``vKL2R~-moMBPRf@6(NloHf7bYCgE@YadYOdHKTH(*E)nO2DcE~C z4?U>RfvlFp@J!8_{4}b@Zcm+&X2&8_5oS-iW1FZmEBml;$eSeS(4Cn-l%w1AP3Ym1 zD12mzBKymDXPfxVm+iBMb=Vz$EvTnV;Bl_9@MD#uU94ohWEF0g> zBynG^z@v|yFsu;{a(^d-C}u8PoR*RSjeajw9eRHSEk6412kpZYy*Wpi3qwtsdOQ?RP3TSc{fU9sGx@43u?dVW} zV{|4iFKYtGufL8h@~7cVMdf6RV>pQ#9)zOJ8?c-IavZy5IUcaGM|Dzy#@|Shc3Y}a z!wyHt6R&zwqH!HjK{HWskrjz6^x&Fz@F+WvtI}0jDd^IgbY#miKxP$xsB>>lAo<;& zsNW8{@b-5p#qGX~_%A2mjjbH~W^yc=qcer#mB!)Yw~KJgcvpPxMj;ixUIm)2??bGt ziP%U;vjlPnk&dE0&R%7J=ZDgLK$lDC9KEBNHgg+p^mfHDnlAW-b|Xw2oCX~YF{D>z z36jgtKyRle#HM~F*!My-o~vq4^6hHz7SDL>?6(*BhE3%bOxuFbiJqZt zyLa%W(=*n{{q1DIHVyP^QUz!pFM#iybhz!i1s2V|PJ&l_fCKYO;Ntc}kejj;BKv+3 z-)AS`JME^fU0nh3`PneDR|LJ+Rp3`hEmYpIf;%0$#3+iMj}C2sMxB+Y+~pH`V|tdX z`n3vH7006<_A;D1F{Vm_nrT{+`rdV-DP4mP7=tTl6b zZKL2Jqw`wgKSid(?1utFrzpYeO+yUbw0g|(|dxjcW`>BN2=r951&Hss_{4e8Ub{<&eU>VKupGlOD z9K*+HhEH-WeM_7F7M+}9Q1hwamL#&aSWOinWT5>1WBKTzohv6BgsQ!MF&2L=7d%?b3XeMj_1$C9Hl7>9W<5g9JJmn za$uFNckn%S+TmQ(1;MpB35-is`ohVz_O>mI_t%zx(_*Fead zZe2TBT4Gx@TbVgzXDNummO_T1tzb&idQk zY*xWx{~jR^^L42H-36R*XFZyHxtm;B=LYJL3V75h6+O_DN2_%W(bKCb=xxMKtmN5% z3@&NQuFhzK(P@?7F~5lB>F2>Eu9YnGKW*8PeoI-sv9)YttAXs>O%t@GsE;QLZRP!z zPeQw0l1P{OAysJ%RkIE*hLdE=S(xzShmbIP z8ZJ&xA>-1rA#(9`==rdf)*dZ}U2_iA%9~^Lcymc5i)g{D~VP&(+t(oIOM=Rk0HvjcF3q z{Iw9h`f*ZxMtV&=!%AMnM9Vu3tdJA8(Uk5)qmtPYb8zm15;k`fS2TMn7Qa$sTe z5c_UEk5OG3-s3)q>is(KUrPpm+>;;p?OGlFhc;FI!NcSDvD3w1s+veDrj9}NH8aVa zp)|y?`bzmgCgGOPrz}tg5qgBe-3vFU=`9)L!@6E#Vt0}D$%Fzgvxf>|UPP~BwxFgD z6};yROX;NBjl^M+7n=949YyG>;VTcT@gm5&X|E={{)LX+lwq|9m zS$!G3LoMd6&^JQC>}qm3w4Qg?MGf8Cah@oogo1T<6y9rUg${Z82HpCDWJ3^+4>KJ2^RDE(737XHqr zc#r*0!Nrxit+h0%8c7os&9hvT(e7!gBoFNeAzHX2CmQMzbM&#uDv@3nW2t37iVf1l#UXQ2#F< zOjK?_&)GI$d(M!pu^gc~zFk2L$)C}ZeLCpz*#HvuP$y>}lvPxE+uHk^g}JV*H3OV56WRXFo^0#DjpPIh^HCWl_X zg!Yry$e-=*5T-Xu_6LoHm>*2oH<(EJl(fmCS8pBnwD}M_c`fqV^*9bz3_urOu<@R{ zZ9I!5awK%(Dzw;nA;cWD1aZL%*kE^q%y_QFm#R}pxpzG7@nK<)+aGXHs3K$S+e*O{ zk9G!6KDBaB_d3SV@)Ne33flx@&R<}}9p1;dQEnk9kSa2ClHF}hEwqJhTjiPlcCyNc z4RG!GvGJ9q+VPd5Mc#s&2|oRK9aXfYhv&6YwU$0EV;VY4d}0;gbNoJPjExfP(rP2q6b^%HzY|#a zX%X9_dF0>p=`b<5hdlZWBt?t<+kCm3yE)ko-7<8l8S6f?X7Ne=ni*wdYIZPHYLpX0 zCEpY0Ni=g`iqkfDv9+c;v$O{CC9&@5lJd*vB!N5KC8L{HOAJo^WP4vvVmo+uu-AOP z&c18qB7V4Az$x34$gwd?=S06?w{w#^WbV#yG2> z7tKC<2jT4^tf`)c&rO+zyDpaCU5~;i7Jhcql2Sg(5b`k|$ zeLQ3k}>HAQNpQ;IfToOJ?Z6IiZbszi>KRCz-H*cA2uTn0B(1CR`Qo z`FMh5`s-A+vG)_!&EmJ!&7Fl})*K_2UDIwF&2fb#iulO#x%pN+vT?S!H*2NXY3P-R z(YcmoemzC}c+O2x>WjsqO_QobqpzGqQ9*|Mk%K)r`gk`zPe)*lxzDkxfev43<{7-p ziuSBJ-gQ)b40vQ*F|M$^h)>7=!Zw4Z*q-K`Pp#R8BmDNGg}r)+M@^xa(NB?~wh3{x z^+2igO?7fw8wr%1Lvq>0m=$sfji2G`_-LgZ&fo8i+xP4SkKQAY{4Ik(1dsIg8Iz`) z>d-EmORSQmG?y1bY7j!l8)Appva9`yf%5eaY(qvPM;Q%#8URTf zw!@ios^C5Hg<5cZHIn;u8?C>42eSo*cXS@i%;`v zQ#`Qs6fHcchjGrsc65jok-s0t;qC;WoL`T}dUVe|$95RMzIg_JiY>wy-yFivs}%T0 zITx`yee3CQ+KR^*s$gm0bzEB=#nYW_iJ!c}Xwaio`ZPiw)JnMMLBJ}^R&FD{=fcRW z%9*s6dln8~`4Q=)mr>UpU!p(O`MhP4So~&aGm4@gX)hN`$SO}~M>P-&`Ku! zo3j~8CiGLEBJ8nztuN1np-+=!u5w+|c7ds-iEyw`LE!7w zTDd+^Q+Rc%lb~_ouFCI~Cxxokr2>PRgTma?k+zPfa~MJT*|ukoCD~4ArZ6f$?q%e? 
zR^@m8(!uv!Z}WsMndGKg0*pWEf?YMY5%*KBMC;x}Xp9;!%RgR?gDN*;MgK^=c9jWr zrQ{;KIPgQdyx0li)~kc+-4uBL(@1tWn(41* z)m*JuToW^YQO%n)-5Qfo*P5el$7-~BJ8NcN538vVCDk||OQ<<`GN#7Ba!pOmg47zX zjMAF&*3~sZ$K7gR*_s;lo250YC68)yE~VF`Dd*N~T5z?dt@nOS@sXL#@lg}!yfCn5 zcAwT}&P`2V#I=N0POUr1m|Hwga3puwwpw+Xjr-4wg5hUtDu*8YwwY3H&dg?ZGZxJL z&G?MB3uGV7nO}c%8Nt_QF~c5l1VP2~8JDD)*V+cODm5}@G5XuQ_)QZ&;tvmJ@?!2; z$c8k1z+Lhk|8Z!?zj<5Hn(gCd<*r*H?^ZOn9!bO*wYpfV!V_G_%E7T>h9rhDOKh}$pLVf zavd(+J_ccnv}k6}d~CN@9xt054PUhWk~;NIs9~g@JnVRc19m?qnt7&hEAcnkeJ2cN zC;f-b{l*iYA0y<_-+XxT>m$*+G>N}(QB%$B)s5hOL`8Psf&d$YoAJ-=EXBTe52FMQ zoi92$4Gw*}#B-F(<&|5_#U5XVsUFt|bpNs+bpKlnQ4NpLig9)%$}$^WFwKE7#Rz<0 ziZvctc!D&Wm_h$bLz&+iMQA-Y2^Vj4#3e5^a8$%Z*)OFg^h)V64w9b5c1KR**n1+d zN!^ZS?W{#|Z|9)2G0kY~m@;s6EkL_DThY6(QHYzdhjvl@CjBo9NKvysRP_fC%WvDU z^US@HYYqmIUsD*8ccJ4Y$`#MWCf?DK#qy?-y{pY6-^)IV3$i8Rh-=Z3h`r+^9(rQ2 z=b?AvBh!C~odefN6dWFiuOF)uXQcOuXMIqTEEvm`tot`C=6Y0#8~wV)=xK`h&Hf28 z%L(UTkb3~{jemzNl;`4KYa+ll>>Rz8dm{Bz%H;-6_(1-3u0k8^{~`rSRGV}zl7zeD zgE5^s(q_-awrQ%ouV(U~DAJcLI#!B5(>sVFbHuzw?5hWnjI42k2SU zdR&ow7H^xWflW4?!)H>{Nb8wJDCy=XUb^}oj?w=H+M0?u+T#hXhfk2{?=n33 z##q^+&COKgJ8jgq?g2VhS%*W=boe6QLoUgGr!44f`p&vzu&rg0Z1qSEM3*Pvx4}R0 z8)(3F18ms=K?uk^C6;CMy@xo&6N3H+@+<6m|pcllO`dWvlN*b6&9enF~%KDcn}6SARIN*czThFoP1PHA?;KMU=t6)I6Ylk_QYp81fc z&5Obcf%1Iyb&a@Xjh0ON+1*+%#&S62Q%$0_>!Tg2@!+s&F*X~}fCGOOkf^|xn9GiW zWMu{Z9ruJxjyQ@U5AP;+Nx|^smL^})w-ufb#=r@0U6^g|1%-+le5L&b*j_aqHjb|W z2m5m*v$r1K`Xa@T=hIzD#eUeLwT4&yxE)94-o~X<{9w>WLH5QYf)}nFjq_gTx^$qL-eV797xrV)y448oEGmOG-YG$Z;HGVV_-R43hP=@2alK&vNGW6A zUrS+}m%ealjFIr0ZG}yl?PbABp()dL?x1b9W(y-~b>E!8Ty>#_nu73$(@DXaAZ_91 zM+pp{@2-{YCifZELQ>g!Z9e04jV^N}Z#E3b>%+s-9=KO83(q@$z;W|SC-^nv9!%0( z4)cna5VZg?whdK8gT0N=Xcq^I4%9= zHBt2WaXb6LhDDqcmFkk+7ub@CIyc4INKUe+M2+*;%Y>tJrJLn*^_-+xER?)EULk&a z`5@ckxdB`0`FqB_l)B0QVGBdDw}Fw^_=M5DVSeSpJS#!gfdNJ_%44)Dq*ThktP))N z?}xy#q?EC`u~m?NR!6w}316u4O_|}e=#OAfTTSRTuS{^VuR`!+?G9n_TnphsT@OL* z$!ftht1F~QMc3*Pkb0S$X#v0Z&=8~PaXs+xoU6^2I2Wso|;qex0c(q|O%`NxFhjYwv z-WY_(U5G=ImTO`AEh~}xgz=bt=RDpYuZjDE((ui+9Z>alJ34gZ68iCH0;(%8#yzu= zq4c5}l3w;@~|%gwMX`%9RCh( zaPW6tMBWYM4;rod&?b`+3 zm&cLl(y9Asm6rvPzqUzyja(Md?MT2LZoo5{@JxvwEai&9jQO8qMB zC0bFv&-)Mf;KMxEeO>2y90JbeOEDbz6(?=#_9SwWluM}I<9m?8&t9}mtrWGy#W=(i zrW1vzHFTZsAF9GZiju;IuB{v%)<{(&gLgBa>I#}?C>ew5VT_^lX*4i#Mjlv!S|`piKqT5 zn9y|`oDZ@{$fdoYth^2J)p?-w_YNfNz6_VSRb=LLd(?gHI{|e$=_&Da>2yOLl=!WK zm4$q`WEuxQ9$kQU#W#>frXddRnt+r1duaA2A*Gb)h2DtSsOEbRqE+3I?sR3uxy{Cd z3*0I3u?O{P<^Lf~+iN8H=`?Dsp)A}CE3fCxSdI5mgbDFv3~dTZ8e(e0p)zzZL>A7mroo5VKK3;yjyfFnzl{n?iqQE3k5QuF5BhUp zDlU1fKnHoBp%;uDprcn914{{r$G>3MH!cOFs~*CRw$pH=*AYyveF0%fBzbnal-enK z8_Co-&;SaIbo8mk+#b6ctl=-34TJp_2?nuYf)O#bMWL+IXnx6bX-{Ve8~xSj2c1c)twL#3}(CKdT{) zVl$cN%sjB(I2j~4w_so3J;-kBhgjt+*wIRb%FVaMLDTDy)#D;uG^c|)mZ=W6bX1{v z>Mpw1Ne4fY`%Yf&=E1$If#7-}2RX67ploMfVzR~R>js0LigFP z1z15x=u$9MD7v?v^j_J892`%Oi@r~Z)b1EOI~9a09a6AoTrk$~orh;w#^c9h0&(oF z)!1c`E8fz7lHscT#{NzT)DK<)veo;Ie&H|Z-aaOqmwF5@P929Fn7rqdWg=8k>WKPAYe2J?5KFPXO2PIGL2S~;!OpwSedn8Uz8z-5C10>b0eS#*Z`B%>S`AE_a z?vXeh-6B~v)mWVA@kFHmp6RADTyBk~X_9|bJoiR`1$SY57dLch(5CHIj`(WNA~rv* zfVHA!A^Y4$K6}yV3l@`GVNH$H7y5?L*503c1e?BEuq|+rpdwFE81|uvnSi_nkM-(^=;NZ4T0-XV#wb}|pwqyBkmiL4axagfkN3K1E zB28s+_FRU=UX>4aMK9=_(B;&O7G>-GKc;+{K?@hleNz%35SS~U1(ZeG;)kujHBG<;}eZ5 zp|kW7vQe-?Z)Xpq5j_z~mEQugk;kdwiM>ekwI(ifG{+?u-;*UB9xzUbNKpHMdTWzD zTDcl5o8#~?9|`VZ_|KeVF$qjfg;~$1qs$N^ykFG? 
zJtp)KgHHl8ze=2=Sa$8Ju@yIoY;mxm51zTFhg1 zX$evLVcYv@fDNCKHOv1)rwyKw{W2ObKGv7?hqgn2p#hPNG$vKHZRGUQUdo9z0w?8V zVC5@s{C2&dK5m~aK5*7l>RtGbO!BhByQM=YWb+TaVZu$QTQwdX4qb-d-l|2ko+7?8 z^CM>?`oZDPO5r50E#>rm%Doc4vzn7={)zL_!jgMDa0Mq)zDF=L6eM~X+Q&Khu87O| z&2V@F{X~}nUyEYn@@zd8hjM?cKgzlGKB|tm!&o2+SDtZTjnC; zE6&6?uYg<-(PZjBUB)Hnk1JIYk_CbdI#r_zoTYb|RYU zW{ML_FQ8WKuh1bI0%>#8!G7FtYRzFO)_a&jW~SVt&CZJHUr$=egZL2Q*wO(rqg`Rj z&{FIlDn+iXgmfzR!IXn3pk#3eml>Jjts5qS%d327IVc5)uEu#=Ct%mq@s!iK`ykf+ z37?TBAhPAoZ5>NQc@Yj^`n9PVA_N#TpxfUh( z(6oE>1y%yqiFG3dKF>(y)L{Bt|A@o>u`Sp(?J9wV8{q-l2+J=yOG~Mt*ia;cH)4Ky>}x1ugsv#Bm+OQU5Brv zeM9+AR-uCB#dKbR8+Ke>Nb6+#;g1DgIHl79k3{IPRh5p9n59^!+Y~p@Xcmzm=;h> zweg&Rik0CvEpNcz$FtE`=Zj=}sJ?XP`(eS88;LCcK6}=jpkjGuwfaAt|%!Q8Rdv%2S1n`>3F!tUH>skE4~Ja=0NT!VL8 zpUR%fjvXIq{goIASIBo(hX*fbo&BN=PhT0~o2QSVt5uUA?cyYy_F^4+;9C+#Y+XtVCb^7(Ras0?}q;cr?5Ohkm_;=6_aYSS~N<2U~>ox!KmR_~myp zskg{s4|N2^TPjoAa_g8Ir-@`j<0Q%cC2UCv-%GObtE}Wn;i$MXHAymI{TIpfh8oFu z4M&L%b3dQr^gt5q`c(3736WHK7fQ;tx+Tdk)oKfM3MA2;T@ss8iNs!-DH&qbN^BNf zkTm?@)OxtNR~vjc5jyNG7ig?p1CBb;W#K9y6cZoA??`4NXaWaXvl# zoyLYA;=y~*3wSFP5VxbI_}Iuw+&QF3K;Z!gZ_=3Bxt~rd4Dppri%vrw~e)UPyGO-9f%@pTu33swfkOB+P*ae^ap1pA7uh z_7JwnPsE$QUm_3B8gs7a?iCvRtK>{kRpm_7%jI~BXVm;rej&!&P;uhPG zA6q!5SC?_VoR)xc7^BwwVTYOtes7gkHu=Qqw6hsSBZXZmoS@O6(2 zf5?l^_v2CaZMSqZ2+xDzj(HUTfamjh}BM)dgoT-r|k=j=?R{!5W_NW8<|(N=OI!w?PUv{6RMZ%{688a$)8 z&>5bFv@;4&OyCIC*f2<av)vF_AB!PNGdt%TO_ogSIL?L7R87@zEyd*2Vwub~Zxj?_Q-*^QXsG+bUS14{GvlXiL?NH?A*CUR;J zc|8qMJ@!Fa72{nj5W)QeRd9CbI{Yo~gv8`mpi=xFv?m)&um7G6D{Et5U~E48czygCoC8QIrY8%um!Je(JN}=L2~%QAPnL=F<+R#u!seR~*8X zJEmdF*n6Z~Ed|BQQbDcKo_d2vr|S2<8#WBQ@0USrehH=WkY_T2*z-!^72y zpSZ%+mYZ1TV3ZYZBFonQHq**=*FV8VHDzJYuKO(4u|}+r*GE_%Ti>y&%%`yHwbg_l z7h4eV@X<>6|TK^gsfN?h0(z@RHfQMPc@u?ua)lu)rhaCXTB8mvZvx1 zjmL0J<5}1;dnFRAZ$alzzCbU2rJ;_*qul6D6T8^puiTkGRBGpBC)CcWUT=3{%u>6J zbQQ0E_=Lo?M5cCcuaVsW(_veQTasjUP=mzI$U^eI`>-TNXHu=zcAr}3hqG&WVu@s5 zW@4>|QAMrkhwHVE->21{Znmsd+i^gc79+LNZ&0%is2e9p5orpC{1k+xM-K|^r)(A~ zja(2+TIeJEd`(GM5auhqsuCj1R4o!ZWN8Z5r{z{#$Q266#0!Gke^&})+|~%Z57$`N zaxYjdx;Y{^{B?=YyLp=MBtH{mFBnq_dv7qjgI&nQfdfv}iIBxVK@ksYl%A%7QzG7w z1+SXn@im$>T&SVrr5g6OGb^Y&iE`2@DKnsSOffw!`zb1QU}K)B8xr%T19wa^`3hH2 zR*)(-vt0)#TS8#fQCZYKV2PiMq>`Ou3_v+p0V!yD;1e}wxamVFQWzZrXJ3cHAFW_` z9A9Mbe$5BVXysDpdd7ouM?Yu}9EWi?Tamc@IjKu=A*L@%p=BM0bh-i6Ww_$rzHVxk z(K#}6w-YiAcul`{GvFtPba2dkEo?w8pii-4B2A;Q(e)I3S3VXyAK!y_aemR!+1}); zKnt(lvioH>Ym-lq=C7y_~96;bEai9~zAtLET;Y)M#N$ z-J(AYl+8jNQnIxVUE(&PZC92O_2eyN<}`857b9hk)Bb~;yN&HN9%gez#r@49(#xxf zjExjc3ojAb<(G*PdgaBEJ@uku_cRe%Stkljz9MoRm@NL%rzb8;QWH;$8Y@1n^;q;R z-&m|{{7iKH^%yZxYZKXhnkahMxeA}LRKU5++c0>jh<4PvLEj!KN9SgmF&uzm^l~^1 zMSj+Y9S&!R{T6Ax=Cf+*SlJRH=OH0An_Xdn2M@BVWT=GA<5AMz$@s~55zPLuAJnsH zQosHuQhp`FFAz)w!TZ{}al>u2)1eYH)ehrX4iub79S3&5Df;+;IkBB+g{6<6BIlyuYG=oUvR_X1Z~}D1!%u#lD~xoJKys3}_M-Z$66oaiL5Qz#>jucGnV@o~^P zM-SYlj{_T3Mc6gT9Z#=14S}bNslsWg$ooScn%A-#mTWo*%e#h&i5U;YvLf;2u3&o8 zsv@W{I0F?$rwM(>3_s)Z@a;lJ_&0b8@?Ix^F6~MQubjqbCoEt(GG?Gl7Q?QMx@19M z2p;*m0Ia?lL&)^ww7y|6lGgT8Un6{B&^{Q_O4rgGaUA7)IlaE1Z4F#D&V~H9HlV9K zkEpwQlTDBY+J6T~YtK2#CO4fj*zFDZ#(DHmh!J`l{Sh@}%Hu@8=VbBzBw^&tL{`Oo z8P<~5I;@(#RzkAuz97ho7KE$wh04iI0>uGc;ot^8;Zlv+tiR@K1l4zvtPQ7}6zB$B z6=XDZ31;^5gvT!}U^)EwBoM1lVkiC{5Db6FVp$thv0Mw}SSNq{z^Zbcn7uy%kKYx5 zJrzu_ruZ1@h@Fl#X6Mo220XuAuv+$p8omZ7}>K#3yMGJ;1wTqq)%Ncp>D4g z#97pnWyuaWdzGQIXHZoteJlji&u3`=aZFAn?3!e&N@5P~urBx^x&eR8a(J43* zK1#F_!ifLmCD87eMJ{~#i!S>0pt~16L2l7<_*F7MUTz72-W&u*3--Wlk3ZxM!?Ij2 z^Z~CGK5)4%1tujGQ2V3H>f;S1q$0(a?7dllzTOcMhcg<`q-X;BbS7eqR$%^h}JwFII44Oq+JF8%$mIgDh+64mD&)T>M1=@YYr^#l9nk#s4K zq#bES54UC#ubDN}1a(!cY9WJepZbY5`;5gkohew#KZaknPGQ^_{`kdv8EH*%KfL9R 
zmHK5VOP7_s1=iPx@J!WE>V0dRv?K#dKh8S`?Uv2Zc4V>CK4-dAuESocYgGXMR^A2U zr!rFi7Hz4ythV%cw-Rt{o56c(5IlKs9Y*3V|Nrd+hhFL65Nlg3at+0ga_3@sYa#l( zD+o7!EX6bLbMOk!G_0Ml8FR9E_(eK%`!2tXz0cA(x~Tv^SR9D^M+&jG`EmSh`W^ha zHxFn2>p-6wN6&f%z^&SqxN&|K_MNMZiz0GKf2t3Z8x4{5!&^z)-gc6eUPg45heOay zKFsS^n$WE?owcXinmuFYErG$}JA#6*AF8|#{jx5JauNo<*e)!I z%McD{y`~fcWj}RnW;oqtQGiB&=n)*1 zPBb}_$pgPok`?!i-g$mqJ$-3A(uu04Us!Ce*Z$8L$v_WU^I)IE$AK$R8wnG?i)ax4 z9Wx|$8qJZoZ%dNsWS$pmE^ZQ+sar`#@IuM*`Jcq@Cy#K;N}Rc?2ak!I4mEOjX&Lc` z;~aTqtCe_4mY%$+8L_nh-VpS6wF;zE%9te0N+H57SK5a;G^ z;M%$^6^+EdwEbOtftzyw1ozhPGHweUE1uZAh_mSReUA0ZY1|B7Q?BUw4zaUIIj3~- zj+z|@RXLsMo?P7*tHp<-RYc7@Iz_2T&7zl$MK#d57Vo+=2ETr_3N5~}6>W_{IKt=# z3VtU`IlfRv;jib=Yh@jAeV7`a+W(3ey+22cati8AE-%C73+>Two&p^f*Xc=<|OhXm|_Ed=(A*n2truiOX<*MhdJY#i0Fl6YQPu3{M_DCF_4Xf%4P=2%j=Q z7pyym{I-XXxh~sD$A1w>oW7s>@qH{BbzyFvW6#0iki&$tkwds|yGW;}ChX=Q@M+RU zm74za{gT7P=a~)}He&n%mUZ-&IWwtQicg4zSv*lbZU&1s#=!Y$+OW*c23*@Gk`DiU zBs{|khEit2AB`o@w@CwrQf1-BmA9n#RVIE{or)L5+(zH;O@_^J$z)ri3!16YPi;wA zN-CUgleOl|o{f`Fw&t0G^v-57I^vG=6@TJRz4i3)%Ur0W8GBRI^$ z<2UQ!Q^Q-)Ki52boqdPA_E-cqU)+)ApBVgAa~5AW`(NGmeY0_?4GV8oe~zP@hp_mI z8IIvE!exQWabTGSpL&#o{C0`#sRgrfu`(N9RH($f)#=!Is0{r%cnfQMypJoUnIiuO z1;CN91L4R`%3{_Y|-biR<7%K<7ST3Q9StfUMghfl0gf6_5g5F3IA!9VCt~jVD zj9vF1tAA4i3zX9ZIX9WiRrw#5bHGfY#n>O#^1&t5n}dG{wz|y}%vUpEZ9n~qHRgB( ztG&^LZP)e(yQtE*Z~JSsyt*D~HJjp?jwHJzM%kSoD`A&kPLQP^_A>=stUW>GDu>o0-WkN#q$+E z=gt~h&SC7r+yg37F1Kqg?={H~P0-C1qmEGV+1K|t&ra{*{SI6smK!jZ7~3aG8c!HX zBo0q_JdbRyljLU2;rmC$FMe*|x|F`=nT~|=?Bmby)=m@f=I_zAJN4UGFeap->Vr=* zEB5dof&Ida za{ED@y0Dl;96sgXC@vy{5zmm1+bPu7ScR(SBlv210j3qw@ov{Zy!m!H?K|!WPDwh4 zyZkTVW4@d4@xA8Kri09FvY+`M&bbKdbss=u-){UebS&=VjmMrkisa|}owVORM-+Id zgoG6uksW`u;iOS1kzFB&PnPA;@|RLz%BLJKymuB>s4C#Sj%8%A^Ah0i&?i2IW$^U% zN!Wg(9=6V@h8w4AVdaY3uz{OcCwSdR{bb(sreEdp%dz)RzeW`_1~q}PQX4V#zV9%8 zi!$C>PM|Y074(=suV}{)^iO*(R+peaYn?OHC{w_@2Z*OciP51=7LD(<4MtlYn@jN zZeFU{^3{ja-n>?ndr(c}bU4QLy?A%c`RkcAM%<$!_gULT;rwV(=jKw;{u|@*6e~s0 zwu^#jhRwC6B!l>+#)8Vud+_Br(?cma1FYU^IP;xN_9@SS8E~Y2^{sj$mi>WJ$FITv zd_1srND;m>WP*>btHSeGq13yGRNQbviLam2g`Y93!{#*#c=tVD^2Dr_>M*oLMjxWc zp1}q*a8L=oEI-Ql0aRdj+$yN_`;S~}TS)lZroiS;6~xbby2FPBMdZ5D36g9z5!|Mo zrJa|0ljtu~@f?H8w7FUY98HP@@~@w8_q*V8Cd0_9_6FLwDwFOTJwU>Cw^8D%Q>3H8 zldhSShi;}!CKA~UwCiU&-fx+Njl9C~Qq63fS)9c1>Q~{|IdgIKA2zZ$Eky?3T`}9> zBcZSVBD#0h!E^gJC`-u$oo~8Mh9|v8E}Bm$uenU`?KvN(F~T&Bz6A8X6N*?V)aZ0joA;q8t0@U9cC zj}zHhD}!7&DWV9&*CbuL4jP!>726k!>AX`H=&f$84v&AN!oJ!P`tNK-__d%JI#uOi zZ>5B4Zma_tdkz>hU4%`VSINLi4|rOZE~w*`3SzD|v2HKIf|jwH1nJBPVe;KR)?UGX)yHJ21#kH3g4qf;SRs2q3wVp$ z1Q!&OSxMjLvHXToSaaTIRR7)Sf!0{r;RV0+ae%!8Mz%J%?lGb-HY`KmHC^z?;A(W7 z=`@|%5eji`nAV70fm3%hqx8`NyyWvm=-eoSJqAu=S%gdA0Iha-86GZ!&9_@*_c) zV*_@`S3?yYmN3w6N*)*VFx@6coEPzoT;3T0pUlU@oMbP2s_X+k-h)WG(m{9?^@tKj z9K&_aPjO5+2ey}8gplp)NguNxH2yM(U(DbE|J`pm6w8Lq7r1z}Z3yX#Rsg;B5;A<+ z2ANp0p>47%o!#)%(nuUW)ldSXb53`XMvxmP1sW()vtUU zO;`SLBxrOiOxL1`%i*#567OMJgjwf$xa1eC-hbBDzgu)v2u=5=SbYMvv-F+|^+%FnHv*a|on*NWZe%}or zkKYG9kEig#T$i%)T#P4s29RqznTVfJ77nx5!zt#8*z#j0Ryb^jmAg;F{B@5&pZTuo zoeL)K{cn=#YS%#K!DrCrRm0fyAyU2QHziVfNbV%;Ca)OxVyFITwEOTenENh=$%u)e zX+b>fuP!0UonbZWZ%*bK*SXaQe4%Ed^Bl2(|7KC<;vU$0CHzCmv z$x6|*a~rw&FPR&=-~AfX&AB4GI7{*Ef*Q`6g&c1CMhEV@JuI=x%-P~u>(n_hQTw=0 zriXHcd55WiPFeWlkqgb^9+JnJD{$BNJcl^2U)b3Bf9}gVHujCKo>cK#K`EVeiPp~n^`Zi|a zwRlJ)3Ck+9VU@~Dc)zVWKdkxz?maPq?<=zij6CMRb-l&->ds}j%3>T|dLol94c>*s z>pJjSuN>SdScWZwjLS7@Kt^T3Tm^(`j++h z#}5_kXW51JHnBf7(${-&qBbFJpkkQwHxjuBg{4gXEp~3xi8n z!{-yn$oJJ5h|BCi{m-sLyIPOo&#lk#tMVy)X@D}nEzwE1P{vYlgM4OX4YvwPO>eRm zcAK(t&Xri59-Sg=U9BXXw@6ZbaTp7mG!g~d?(7m2;!*3H2H#jd^G2)Gua;Gq)HiL>Zv=_fR2La`=Jaa_q3>40bonfkA&6 
z5W8ET9r5FEH}|Hh^CLyLr}8P!E|2hP8v1xw&K~78HCJ-4RA}3Mb134?a@fiHUKzl1 zx)s3Fywkzc44cQ>Sp1y(ftSMly`$Uqb#xi;^_OJs(eT~e%@Yhn1MN3NyzqS9{slc8 z`L;OjbUdCHO$Yx#6t!f@unD6R)X4)qyFj+Kl_w=K{L~1+bFbZ`qSNxn^ctK4zu-=+s<-3#H-kpracOCGtkP8)nSP1`-HqHgHv;{o|MB#-fzdYaJ$e$0gk^-RjFd=8kFrBRj>KjGkcE1~MbKT6@U zJ+#l&cW7^^L|d1{!&Pq)>DyBR0Yh<6bTkYi9^~S16(zp%;sT65^6)2LJ>0R*99Og* zz(+Rk#q-Kb&=1x;lv+3kJsTFmI@1A|HzgCEuR8`@wcYs5*fg?5B>`-ou>nur3b{@@ zVM;6GoiHS@`|&+E{O=u{YpSl%ez=os)!oHOs$I?rGfoq!e3Xjl`SP3>Yh1WJYBRYF z%ukmxlmGS!DrI*&>2oUOrgO2*`I`946UDdxn<3iUP%b*NVr`>3rMgw!6|N#(u|CwiCaN%8g1v`-?lr~9q|D$}#c z&-DmB?uf&69#M$*Es1;(O~kBP8yq{5gcqN4Kq+}|(KRxNOM!#6CV$3J#)bH<#}bA| z|C`JaDB`a_52N~*E_B%;OnSQVh~<7aEb4ws@g}_@B@x}^hpZPGRUbgI16#3%%yaDU zITnu+ee6)Kg6DahB`dSulF3g`LqWK<^vGfxX{zEUurHE8yU9Dae7?Vq&Y1^QDi_Ix zdEXt}d)DJ^#*NrkaWbChwG<}=sgZ}v&!UQ~D3Hu~4Ez!;>Erzx%q{aCGaDF6s|*>* zRnSIh?HWJnKEfl*)=^if>GHAC4eeiOUA7y2IQ|NPiqO= z{25|Dl%pE>%wA%+0d0ERg2vqyWA$&XIAQH%e)_Smcy96oY_z@*3%x?HLw^ZgR$qX% zPxNDntDAIp)MprVTqRX}Z!3-Oo+rIAmGP9(7ic-167+@joZQjmN{6nO%MVQLGl8i^q&y zMupR>u(`WGp8GEu?@%xzAI#?A?SJHn=B5}bqkJ3+bqRv~t*&s}`#v!}KZ&^hoeq0N z2{@u4v_8zZh!`K^fXVP)SZTq>`?ck%7ynJd^2^TPO!GFfCwzdKe>s`)cDTU6yhc)J znh1UZ8bYMMh-PU&dFFae67l#DT)vx(Idb#y-Oa7|W=Il#yK5LVJTAj3W!A8HzcUbp z1o%DY60%cD#*x7Wpm@ImOmc5Qd-+wOd7>W8Pn1RHQqLl?TNb@;Fvg|U>CDcgoi$%* zD%?_@C#cNQ68_+O2-u-_Schiov4fXov4(OPF1z;)LFBte?S-&)?xlLuGh-lfT@|SP{tWltn@ZmYY4Kf;#^IQ( z6r2*ywhxQGMr_Omu)Rwv=3x!IG|-)x>Sv;gxbwAgUoO>Nh;yjbvb`gDu<4A%Vl-bO zIAT&e=OwRp`N#3KoilW6>FG+6ysp2JgIy7|i?`;~PHv5?EiRc}8@M>J*1Dpl*67`} zT8k5++Qk1(*RI3%dY?HWd(k!WDVS9oacqjf{it?0@SZ4+!o&|So$E0^?P=2wdovFp?uj3>%~n@ z!jRA5g5vwW!oxXj)#|pp1v&21+1oXAz(ZYy&yEP-hnKVYdN4#kzm-qgjvdDQgmOIQ ztu2b0bOOd~nM#F6D1q$5JCJcB3F2hWL#wtaQPiFQ4HMI#LcNlxJp4j#-&TU^zaPl` zqb%5)Yzg_|Sh&&FikH_tMf%-|kZ@%i=>NBdVMd6tiNGFqGVCP2T>+Guo`u33Ym#^H zD11|R4@p~!A-m@Z{GgYR!flIyTA2X%--duT!>QP_RUKw7e^0G`o(EH0(_rkG7hpeQ zA5uK97c6p%p;NXOULMM!meF^qSB?4PRKs;TFzIUjy_3I@myI6ttt+G=SI43&Z^sks zo@7{LXM(k#89UTYT8^t?q*$hp@qB!f32*h;5A#YS<4X=z%llFXG`pO-(p@ zZa;o5&chsqy=09;DJfbZ!d2W*{O0a*yiv%(cV;bPemB~&{+|l`y(A5D67umUjkS1# zI>7^HPUCBw@t82ZWUnD}$h((FCL9pZ16S2y!UhB0pZONzuVeR#mgFAf^jTHbq`6Jt z`G-ytUyzR%wa4mT_$n=*~e~~3|cVX92$yFr0 zhznj_V4t5|;85HgjlEOEsN%MiYG@gxY>z*{77VY$Huwy_Mx~)cvGeiP!de`s!J;e{vg?w;C@L4;zp9~MxTlhL-zLCC4}c=|V%T7I0%CVo zg2tdOjMcEl3{)CRe_p^h_*wLxWFx$B(`g**nucyYV?5qv*3{pTU&!^I9NvghapP}u ztdKv3zrEiEe}duq->-$Jvu*>naGnTFGmYR}^fWvzFP`Z#_~U2(=IC^13N7b50Uu!p zqNj5wVYk$$=<zt<42cxb-?TA02_bFhi+Y zzB}k`Yk@9SJdp1LFxyi`T9v;Mj$3>rAH`a579`~Px_aWXehu8&e2jkEQAinfY@^>! 
zZ6Ig5w?fSAJ;+YR2?#Y|W$8w|-{f#z;>i};x<8Xtt-!=5QVC_=ON6riOC&`( zhFToTIJZKlgKDn{d}y8tX|#~$7n|U(o8BNOK7k%J#1d?TsR(}uynn}9Sj$f%8{?}H z4o$?3eN0!VX2W)vgf~Hzk_Q}5 zmVxt?tI3Xs_egMM9K1IwBi$#4=rh^hskWLgo+EzPvq~S2J9J%; zva*DA$5>HF$~#!6o|*~NG<8`z+Y(u>=e1bqqL6X2W(ms2vIRyqan=%(Hv;?npIIKx zmjq?9cdOr>nkXE%*+O{#QjH+un47@o@mRL#;#}4$&nkga-vxo)QgdtHe~Qwej4a4r z7EccB+fCe(_CwIg*;0)X#&?{zj5I93*wO3|@phr%`uUsGIJ5@0EP8++`zzrs1HJHq zVc>K29L8K}9sc~mff*IMq5g#@PI?7+MN0;r!R*7FI=m&h&pO2K=_BHfhu_3niNBde z(;pt*r)xJ=Z+1hpi^CsFQ@6)$Cbjr}~NGjWI*NrOPYvUib zbt%oW1DHAESLs<^Axi*~vRL%OO2AybU3zaFq;Yx{i?Ty?y} zvYXe&`ge!JE_6|32S%h-yDqI24BVDuUv6;%GxvSaQ(aB>Og@h+s+WPQyaFs_cKyPg~Grp9~;G=tuPD!q(&&$Zr(^njXFMAGxNx6vdXR4!3S6dgIb4t(V<6C1TwP@BYb)mJEBKa1ZA>o^lgl#YZ-yusfIM)*F5>+G`sq@8hJ9y%m4^+hy zw+7n=_sL0SN4tp+->?zS?`;sB}0r?B_$#YRw|?onHZB zlk7|4MkWimK>vfNyP{O=GCxYZWM-{ckJT;?m&AxS8F^u)p!X=)@;NnQ_bDn|?k?5V zycwT)v>!R_Jx#sUcEtgcRbeS*i_W=E>FdY|7Kv#j8Jfvlf?%GRV2$~3A9|0(5`~jgzGy7lEw!>y3-gq zGWY_?uYXBRRL+D+%4cY;#f{XE{RHfN*9vlWn$?e`IqUv=Nt>}0| z9elYA*?e>(g$pMzIq4zNlw^hnjxu}d(W~T@MI4hcUP&CwhH&?N5nib?gw)lO>*eNX zfWu82Ft}q+&d!e~YU>t4$j1(%w=x@*1u=Q?2#G{1N`tcZ%)~zP<|17k6`Wc!4Ts$k zqQsjy_>|=dG#)=huhwQG-+R7TPCAdi{A(gf(Y{O8nkvDlc_-yQx{5?vo8tLD$5K%l z5wwPiDH8On;w|>RSof72sGib<1FgF7VG|9{v;W|2{o6=o$W++5N&*r2=F;&&?)(b= zU#cNU7c$mfhf8XsFfmM?|EJuMu5N257oJCey~R6X8l?;r;z_$-;)DbLkbGo1x*d~)pRVe{PRZ`LJXelhyIl=eit6$8gl%}t zr!-KT_z@T9EA#*ARpM^;6FO<(L(r@G2zIr-B-@3_nI(<`H}9jA!_69Ke>6l)EVn=w zlTqFxRbUoN&7fZA1AQL;;IWT|R(@t2k6-%WKaZ7=?z0wlJ}V?q`nMfeZ?&XXqTCr) zTKJvgKYPfv#ed`5?BsZF>KxcQFP{yP=0L~&%PvOO+w0^b%q(soHM8pqLqr0 z_Cud%4TGQd?^daU|&ILwC2;yu1Q!PVt+X4EV}_O_ik*313abRMke~YoEHpbye8ev ze3|nMYjf{q4C5R9eCN#1oLI*m&Q<%`+M!_!s|e+M+OOEtoP&;1bzZhpG@m#-?!Kp6 zX#3LF(;`%?x#6o1(fYg=bAQ7M?)L#6?P+WpXGpcUPSweoRv4hl)oMZ9!;vQ}O`qA; z1w4DjarsK)oHFSEE72Jj8V zikp6Jtg(egq((2EB$H?Mw?(^$-t}2{mtH3>6rJOV=4QC3Wiyy=`R!0g*TA;J zl~~0zgA9&x@pKhw_KxFYcw>GtZcg4sHoq0^GT$GBMMGYY&YuN2L78x1`7AOVRU>jr zKP0Tg{j8y=R8Z%?fg4O~q_B1)wX1DE^*-MYe)}7MS?MrYvgj<*SZM*NkNqGjhygw7 zXURj+Jcyl^Y5yc*2dkr29B$2$f`I)eVIyw__-@Q0>%PWfiR=HcOpzp;`L7yl(QA=e z!xC)KHi6j3qVS@16*ziiJ=Pi8!;BvAU_Lu!#oXPwoq6V-7c<51HzRnko*|ptS${Qm zl);G@VOYF9#Jo8$z_{S6&d{BD#W+$n#c=UE!_;9=jKuU_hUM5Q=593>bEUZ|bI1Ky zhVaB4#-5x6<}9t}49nX4mqZ|Ik2 zZ~i--ZTW8&+wHg%p0WQAne(;}om=u61+Msw78zOK8gnTqG8#d1x+Zac`xCrOTpWs} zlVE8>0K}-UA@Hjdw7)bE-V>7&>br~XD1{yHF=-Ool?TaC?lF+D*#>L$y2+>B|H!#Z zitM&sxAB4kS@yjq1@?zepRh}+3yw};VRJ;oOeMf?iXw5LaUnjrLWK zZ-y|)x&^{y%kaXDxA3jD`*__}F1EkE4fifOis|$KmfMw3WU- z=Nk0sryL{{|3}g@JFxEV3hdD6j&pVdQ3rl`Luq9=o~YsC3zla1QYM`=IgO&fYz@jx zBNZo}uEG@&qiF0|C;n>GPGoH^vFsND(Up&auxnBvr~CxG?ufX#3bTau@DIcWD&R|V zI{CaZ4qitlfa;JrENJe6?ssR0*o5(loAI`!sIsNpCoJM2pl1=_8$O z%19x32|oDwKcsu^04@ym!m7#3ab3|(bRuylF1>OUuQYeU;Zo@+d}j};xOJS$eLsiV z`!5g{wV#83gEgdJybW@VRfHb|3xt}rwnDtN9?H)y5iFkTBvARSC{V}&{t3rrf}FK{ zL7DH|#_CYTMs2G{0@>vzjpwVy8$D_a8;4Bf8|f;Ig6QGr0{1yCjf-BnH)dtU@*~#2 zW17m;@`v3xg2(SU{1q!s+n&oSvi%}m!WjG1W91{JNxwAGz>#%T<(g^U<`Ah_^au|w zCnoD9Esz+|&P+tpT>r??V^&|M9og5j^QT-_^dv*<-26 zeYK~KwrG@9S8_$0YhLk`#&J#LO!Te7{uVEBd~%*BYnX$RuE!v6w|!VgjDeY(mZ65n zL#RhGiwMF)(N6jsite5XN49PU)2+8)MXVy~G73bASIWtDw~J(X!X0?tk&c`O58+p$ zt<{6@b2v$?67~;ELRGgITIvx-0d_>? 
zfrU+ih|i`E2gPF`zCH(*xko{ebP8DfFd_e)X=If?N`|0aesIHjH!S?L4g|>?$cF7B zXr4kjp_cTp4sQr;c0RoeKZ(B6T)RUXXJl-^otz9DVO5E{-s<9C7v|&h-tJg;Ko%)P zU1N>ycuL*1mcyphX{5%AhwImm!ND>GVb+2I_V8sA@1L;R-ufZSy-cDwtr8tRl}}=m zXA`z#A~FanMZ4yfqSW1PXnVhiFFvq{Q8K7POgN|Zovf@ z@`(o8xx?V4dI$8}%s}ajE*vVo2$k}KFiBgD%fb_I)C&XrSrj_|}yqd@%bzB6iQMdCR{u zC~wwplp0Y8t8)IplQqTQxnv$FFL_H9N_W6)<5l?CDP@>Az87tc7jY(yB0%Msh$P@6 zg>uJO_;zAGy4k5t0*1t4`BFpNM6O_$<bmwXZ#Wm)BWbf<->93mz^N{L)fbLrk~eM5Vj9Q7V+nWBpA63V)9Yz- zbp5%~D{AmaGr`t(LUG{M)A+teI$9v&+@01EM~~I?pee!|W>uMiXLuVr*b#}6<$9=j zE5}JsK{;8bVgZkRmO$3xOgOtu2*z<`pt+-grGNV_F^`CazVvV?{I&?B8lKsR$$aO3 ze6?3F8?6*vyj;U~yI;g;d!5ZY{*KrT2>Y2)UO$*sg&bx_QXTU(nP=-bOt(GdRmxZ{ zV`;nmxsh#vCBP`CYb;MER_$=3{)vy`h4@K~?SsMFZLU7QEp0Tn zn0K_Ts&ZVe>W8`orIOq)JN&GgzxZ-gq^5Jjzs#Z=eppr4(-)2GMS0Q`8#fpf4+AmI z5cTGtHC{7fi0^*(MEk|gpnGd0vDfF_xJRNLeO8Pm@c~2R&tC~_^56#E{AP?aOBDij z+zehMtiXPy-|@85mhe*hHY9zU4I?VU_{8pC#56b;E*GYOx79c)JwHdpjk}A>a?TQm z=VHQ%>oz#@x)9s*-r1P1yUqCxdvO!qr+bG3UUPV*`?+>xVXL*3knrQ zPVW}*=$XY@*L9lgpO6*hk=hAMD~43>jYA0<8c?}ElQ{N0LyD#wu;3h%edB0%@;?a5_zT(#ZG`KWeJ&5ZS{VxrJ= zCMWqLvvZ{#Q%9+XF-KNZ%MC(r#+l?(&};DC)%d**BtYaY4nH6PPDOCI$Sfx6I!po4DEcZE`33t zAzg2i3MX7fjGl0FsLuPvA&84PM5=9!$*q=mWORia<=GSlDT>SB%vdoB_V7ig94tXc zs)t%vH-(%eo}8U+PYi|~$ z`#&MGC8GsT|D*_RFt!Pn(F_Hd*K_z(uaaQz1Qs~_XDLVwJjiz|9p!Jn(Z}B(lPkEr zf+Gk&l^}4OCo=T~cQPCjBL)4Y=LB1WO8KW?hOJ{!2Qz0|HGjj6pNtuQ?QBhIT5aPm ztJ4+CrR&rkx+!f7|Hq;J}BER3>g<=f73M))-}t;2pa?JcU@BE<~amFlCMbK6+>qQNf| z)#?l@`s(4{BSm-@b^`5JbU^!7KP4dwBB$ibwNQ;t;(&)OsO0eq){R{1Nz^_aR@xj$^(4I@neuiS|iFvZ8-#3;ncavu)pm z;Kkt)kaDk%^e&DPK0Q8{eeLWPHuowLUjMTWmh8)Bc^iz84ucO4t-GYyDo-!L^>YYE ztYffl9mMd?za9S)I{~vJ>+yVxK~~f7A9B)eIbKxz7WQZiFuoezI47r zC&7-bj8>9>BQMAc{Rq~rj3VscUyL2Dt|4}Z&w?g<3DFlR?a8YRWLz%?q7xdSsUn}6 zefA}}to(wQ)?E>KuKQuyk#34InjqC*3emCO$>d+v2~a#D1NY{afob(5xqDlmy~jQS z&7j)A(I=as_g0R-zATFuYx9(6$ZX<;^i8w*%$dtmjTW~tY|-RpGopE`{>Cus`*az_ zZ)zFSD&H`I&s}b4?=xe{mzOkDe@VBIX}r!*I3~qYwVuWt9agMgJF3AvcQt|0k&w>& zE4_m}K6sIoWb31_#(z{%NE=F*`QCioJCO35-$ltv{GfI%NW*`AOf=8;b-^}Iq~Tfl z2qm#VogG(fjHJ@uq1^ddaODa^#8g^{C2S1vgT_vzw?xE5PZbm1++u=57CD01@9o^gv9+ z2EJt0yjdv%7H3{2@l~F<(R~pt+jbnkT=oov-?zZjA08fAa~4wUl-MCt7h$u6p3vzw zja?Xe5!t$En%3(mCa!NCvLQclt5vELIj;ovtLcx`_U%N0L^#nsYmO?@FAe6f}Y zm{(X*{o=S-juN@%*5OK5MOapTky=-t#k!>530p*dlviFyAm?}+=}^l}w7GA{aB zr79+J)~D9g$whsqWhuwe0;WrHmp?P6-?3ES>ID_n-D{#~x{aQ+)#uM!483=(`^xpB ziF?{uJqs_i&>B;uC%hP@Nn{Rj60T2i!bBCIn1%#+n{7wxUbe7&hhCE(=bxcU<5psQ zhD+{;>?Ak5{i%MJ7kFmT81bZwFfX$=kX84in_noj<6EEXaBG?_XkJ_hT}>XK-FXJz zx!Hh!&M$$3J58YR%`H^wl8eU`LK?44inc}P7dK9}GaEH@<~RO5t$>G!G zr3HtLw+lMmECpdN)A;9Bo)aYIe-q@IX*cdz9VM8OuNT-5!$z%#>_(=ZY2)|IR=#U* zu%NhAtkM66MC0+cDgjJs&<}TTX@-GZ4o8@8#fzw@dn#c__bFh}i={W!$a^YqZ{BCp zb4<=!d1M>XIr($x+rN*|%nKgVJk~F!A1WW>^!aY&}!*KKL`8bMz6R zSr`X%jF+0w{1@sAJ;7Y4H71%3*Z)D`$_;p9VkmsseG_V31IXB+JiH-86HlC3h+bMG zz;Jmz>OUQg=cisp{^RrTopFjd%)ZMSY*ELn6-;qU?j`gw=?Fxjf7N9)h1Lia5faiEw%hwNy5QRJxgh z!*VWeu^B-RE!s%=U@4itoe5dpa=0)c7U!I?!hwqUIQCILYv-5Ca7s0VbGP|` ztsy!vUDQ|nY50bEqIVKcm04i_#5k5-&j2nsEQ_Dsti@;5L-A2{jE38WQPTUP@Wf#r z-ZMQ5ADb9MPMQbVCsKw;VOI*OF!RQ|O+Gl|MKk30XMk>T9GRhCkJcUdD=79&ruO)4 zgkz5`urj@xS(XDIQRt54IMq#z#eC#M7HTz-Op*5YsjMfy=%NCD*9*zH>H2V5)E#^g zC&%7qIns=#GKk^|aTxI04?jQ0@x=9_c&DU;8%7notd2Anc)BR&ABSB1kN7=1Ad@iE1Et-4}bh2jcelA!i;lofQxb8#S}#NDAjd#Z5XMZxOOay~4oDA{bO3&xX6>0c4;w4=d?pkfAr3MA$xp zwAUU%E4-!H8!S{v!O#|9h<>d_iD2Eo1nbsngWiTHVpgOHcS$fQO?W5D@~$Uc3EF5& z@ecCytu-9D(TvacuV?wzoFLtY){=qeDX_iq3w{+Vy6-W&StAcRsfNp4;2nDeEisJ7 z)O&>U&1YcwzTecY*qty`b`U&z(@1uBJY{2b6Wxw|PJA`L5Z;cE=C-bdko?FQbaqRU zlglK*d8cJ_?BBm6%-}biex%&oioPJMWq|)nXCR-V7U1fr3+K6RfCZBUXig9tnH~9w zo#);fJyv1v^7QfGQd%?hrRGz{ 
zL(YrA<+bk&m1qkBdTS;at(>NZ3+dnHv~WgB`#3?a)^wWLM_K@<7C%uA#=^T2de;g;L=0?V7+@JVtPI z$78`lM>2gP*$C1!24QjWiL**DsAo1OGl6GKkR97S3fZvp4W@fc2(?{UuF z(0q>4p+6jpCt}=v>$JHCkGj*Ob`I2b7>seIZWwT_uSU{R^a?o&In|tpbB=MhKhosx z$J=POF*CVhR@=Bs%%W-(uK94D&n(66+%Lq=Je~+#6Uj`KmBejWh_AiokkNm4$#vBn z;%TFWmL3j5ZROgqeW5E9wJe91gK^+8dmAje+dyiAWx-R_Ei|7k1-ru1(O*m-&x z8Y{R?6yXf6{T`1`ZuN(n$ulUe#RbB~SHbJOt~mQ%Jp^lfL7%g4qv7cmqRenf^CEu{ zD|;Ci`^WHc_xjgV`_?;T&!>1euy-DEKSkjKRXJo2dWT#S>`7bdG4gF13x95TOy>4K z!$+^rWS37&z}p{1;641v?8Y&zbGGrMiOJ!jE5Jg;I05G_&0Ap zin?WhOcLAhPL4lTfBY00TCNd$v)x!{+iS8QcOO}D$x8S20xeAVRwvjkwm1GXnQgQTke?Q&l-vFd|oN^Ub#ZO51`;Qa={sON3gh@ zG1~Ov8B15M9ST%Z;8FN*q}90=e{hk6oPITEzAP@xThc|A6u82kMUC*Lx&dD_-UeBp zJ5aA=0&=c50U!NXR`{9|=%9^1T6n`9``jqNOA^FUS8OoOow*<1&^XT0s$Ga@oc#rx zFSO!ygQDj-#u?^C`@pZfzp!W1O&m};g*R*XlH0pCLH6Q{aJq;EafL0ID$Zib*W`h5 z;8BiOiwS+pI~A+t18J6rpTyDY;yh?pf;n{An(?|e=MkDU+CWcrYUa2O{IX<7oTTOI zdvglLHEHrH4IKNZ3EF(^I9jfgYwcXRCq1*Zs!lIVhP%9DI%ob(clxBn1sW+6^$0xd zfOp9NJB;MA4DaVc`m;q?ZDAPJ?Qh4`HbywD+7yRcza_&DA0wITAt=LgN@Yhfn(H@-Zk*gUQ0L{(=TY3ED@AJ1jgH5Pd>=RSVe|F~Xwpq!Jxb6*2Xng}CJ()!ep&UgLOg0v*h6hN zbgU5Mch-aYdy)I8#1WFpTgbay4WeEfC$fW)H1dEE_w7#o-5oC0TRA$+ z37S5Gd8U|gEj^rZ_>Vgei#IWb2HsjXPd#K*ccwEUY>t5lSw|X<&Ow_{K3U4jfWp1T z(4UQ2YpMmP>GnRd=AjH658sQ;M?|^P6Dj!qXOXArg%Lhov6*@@JAwQh&L_2tY#^e9 zLxwfwK@?yn%df72Rqp8!{dgwU*r_87ZEb^gxiRov@(aF;mBP_w&43I8pw}b={Jv)b z`x6(`3oztqZU-AhMd%ongA=pwK>4>UWRM~T2fo?>GbIGVPQ4@JJ4N%G_FHJ*&jE7z zbP05eX7~-`iNq;#I=B^-kkJr-?0wn~dvN9fr`ZoawM3GSV`BKyA#prBaGl&gmq$sQ zH$t!4X*lmNg?^tKM_NzCkd$O2YF9`>`}NaNcjjEy_mw74IND3{uNgJd{CDB&zbcS~ z#V2a#&y`RTn#NjQ6^sj3rLv|s2v|x!d&vxWMb^n#bK!E@LM%4_D&FOjkF^}%{@{e_K%>flS0v|}~4NVYSy+ocRi!$mg(fmzk6!BBfW2MDKScCz= zUS%hdm|!oi2sw>a4t3+SyhLnLS%qVo7Go!Sef-KN4$n_qhI1=ISb-V`uw}fE*pw)P z*mxo=E__Dp3Pp3~Es-oA`Qy}EQE}-ry%5FAXTYub^^h?B074#R!B=k%6#n-d61uoB zW5-n(4U2~xJARYy_xE7j(gTdo?S;+-DdceGbwpfB;G9+?N!?yU7^}Nj7vA!4^s4E? z^(FG;q(%ikG9HOd4UE_u$rK9gX+|{g#S2ySu%3ez@sk`TFB7MsdD~U-!kO_bt*0Vi zS;9l~{W1fEJ~JkB*X$#E9(|(3JQ`X5jSQkQ%pSB!(FluQn2mZ~{D#OkM~K@*Ia!=1 zP2G{t5xGT))h;!A?yX=GZbFmP?b~HbtJL1ZRhKlNy<3q$ zlU18bI>O^%bXqWcd*uyhbv@x?)lt+RBt=$5ZFE?F`5f~5tB=pqO3_W;5_}?V2mUj* z6p!BlTe5m3QyN{f9bQ<=U6erMj^`+F zrEQ})>MUO_OU0XeZ|zxbZeRsB;n7{r1K$j8>$R^O@#z|MVIu{cs8|8_DXW0{qdAg0 zUv83fMU~5S_UPhNExpWL3te^JeXZ->(5$&noyIu5Q-z$1Z`!HuFDsg#U-v~BH!P4^ zJOf5DtnihLdg7Y?k8<@dr&7Mi5(E2YDo{-x9=M!>3yIF)adIt@-@ztw$0dpS263Wx zcUPv;t@|?vWpxPm%(KV$xlrgB|M+5$BD{_~y29H2=34<@5Up zZn#n0d?M5go80uqMqLbiPh$|Lr%MP6`nrjdtjM?erwA)!4mLS?9lUfTgxXKekaOqQ z=&YxdFrc#)9jaC&qMV`Vy#~}T4HizCOh&S|5jCZ;9=&|mOF6adqb9}WiSs`xC_6W^ zX{pZ%v{Y7ygncWfLVtZF?^c$gjny1h)l$* zd@c*?7qn-0V$Y_pzDe&E0UaIbj|+MCy_dQP$5x%!zDup5x$AwiWB|=V5=P8~DQo z9i)Bru|vCpH`z3`7#_eo__H)h*e1up7Xq00_PiW)etQhrCb9q>3AjzojL9Q6hbGB- zo0F{m`EHOx>BbhUhJKC9`=m(67%|Sy3wTIMhm(%=$(HTC5Rr z^~%IoyWXNpnPT|w$FrEbQys^wj>KjVjZ1gg;mUC)ifmHABk2dQT*)xXmHWa%5~`@; zLj>A+!xL{Ge2j)QOUZMxK0MkwmR? 
z48ki_=i_a2)$xhUO7fIn0)YXKuz|HYz}Po(Y)Lup)(*u17pp}MDXxRv?P@Y0j!8ZL zAU^PWHA<9CLcBg@a%tCT{5-D{eKM*AlUv;$p~_*4!$+UUp{{I8l+CM$yH3rA%9 zpCf*HYm}JrHWMu=Ic|ec#owF2+IDwRbNQk2GARZ^Kynw_0yN=ivF3@Z04BD=m zXj<@Y#FG7JoAq#p?Mw}Upz33*Af&Z|zv`Na;QQYXg3+tRf`Snrfe?E5T3=cO#uDoq zL+9^lbb6%Fn4&T)kPEVGJi7K|e`YN z>BcqPwDegs>0h)JxDET-Xn!6!(<%&Ra*OiXX@83pIr=H$+$|e`wm?gr``tFR?nI#` zx|bmGJ)Jra`*!^xpd>*$x6)Ad@JrIWbt`;$_lFoN7(t%a8uS3xiBxDp2)ln7F8$pP zRj)pf;nTGcD!c=K^d*GtwFI12Wx}Dw86ag`3A08AVcPF|(C|VR(jA>ax@;4?366l; zC0D?__bh9PmLvTAwH7uV%Yw&a)8HmUjGArNPeqh9;M@iI|YO;s6mDC-yfm8Zze*i!A?!@VBmAG^2 zGSWIB!>-t(%HAV6m+jOg&i+(%8|$an;br@!*^}e5*at*;QU6R!eE#?u*6vTixOied z9@c%rn%!xL4X63o44TLAj;iM{e3p(g>KikybD!^HimzP77(Lj+7^z>*EPHv6QKEc> znVs~L5gh5v*pisUyt{c5^SnwQa|J_(xrr9S=sV=X^w_kPG0gB{uCsW-7_r~YP_lbr zeQ{8MPdF^l={hQ6gRY0i%QwRh!vG>xd=wqsx{EcdXgX~E6$1ZFZ3T~^T_j_F3Hp{F zjWV>~kyVRU!XK?%Gv=POYwl=3RoZ#xV z-^4m;DKtI02!5ZJQ3lx_aB13m|XyDe#)X z2eX--@P}|=W_viSHR~g_eEatw}mz9)iE0lA#M=uHZQ^-(*? zI;OwKq3i+&l+xj9$a>hSp#>XN4+H42(~jM6^BP$*!FQPW`iRonja8L@R!U_=a#Uz(0Ve=<_ z&AYnIA1Ad{l^7S%Tkq+~rvO0v5{OvD9raAG5= z(sPAzS{o9SJx2|jtf8Owl04%oQyZEcDUT_+=DniljIOs{#3#Ooa^?*o|BnhVkbM-X zp1S7n`uZ}MHDrNabrzA&Pk*2%|J=yao4(MpdI?Dnm?9d7<>*5DceJltr*oF3@;Prz zJnLd@N3Fga8PR;)J2-lg!<_WQ3szbp8-~S01Mcg4irjCDy6RN_{-zm>NO5Ndc++Sy z+I0EgaoVB3muutbG`f8G4$iAH`5d!&b*|*=>D(=~S1@hfS=^*$h%+jz@v}(}Jmbz{ zByAEvLPjj0uW%+z?TUiS2lAWqG);+=lQPWibB9^)Ho~d4MvxM999Hr+fXv(~SlW0O z-dZGqw8S%5__!Og$GU<2upDwi2AJwJW80xyV?2GVz^|x0z$_^Dvh@r3(UA9XEyMX_ zC3AJkOXiyPx6E6gMj0m$>o>rt34fE&VruF5B#E>4nd3SOu^H}`^-HmOkQ!v z34vCaMB~7WVZq!|4d%t2^EqiNiff%K6KEN^7dYL1OX^b37IAcz%5l9Sk8)DqDO#?M zF6R8BNwK=oGr-yY;sNc@_)P9S|2?$XH?S3ZO*n0xel1PB^FLa2 zy*XEHzZTb1r?K{eV<60Vn1erkQeZ1-Qh3)b3v6)a0_%@(0=;`CY)(FCf~I*QqMAQoO@k#=D81|8hn;wM{9PQ9Hb~SFE_YR5gSdHCUYVdmY zE94{oAH3C;7Md>~fDcCnVEpVg@MCUJZ|*BW#sW<+eJDltN{3=`$>-?B6Pf12K`fZu ztPV%7q@d<_5pPW5Bekbo4>a6Wpljz}l)_SlYgb%g+M^U`t2qN@Dv3mE=5Ukt^(d@( ztDg)Vne%_4Z^XohRo5gHD z)W%mz)M)TsXUbb2^|*eo-U-IHzFbPUVj+ZOABUyJ2sFOP;+f`AWFn@3Jfk0m_b1qgP;PeI|8zUK?ud^1yO<1_6pQ z(8Q^q=t8Yl^Jo1GXm+iF>-wn<8&`Yc|E9R;ce6Gm%JmU%85#KS&xI9ju?$`bn8HUF znOZ)auNl8 zyG*>!9RiU~2PVc%U}1j^_T|C9zL_=0s1Ar4gYYMiRk6K@yN*|sJ`rbbM?;v9KRN+^%GdU;s>O!SRyP|RTaA53xS&TtEr8bf~nkJ?jY$> z4x_`H;hC@xsy-^C-u-^~d+b5-*wYurtWx32-P^1V2@3~@Bp*0t?gpL1lEOBVe305W z3?5W5bff`%l*~YN4fn{4Krv8QFY-Kyc#{hAV=yw+#dCtSK>xo~)(SBb9Kha;UGAsh zZDJ{CLQV>Q`mRFejJe|k@0a-g_tW^7+B~eo&?My}9M-tU6+AJk0&kl06yfnV#9ev~ zjhxVw<$`4ZlzRF=bUB1&3QhXw(?N}C*`;eO}AkM%|XwR zbCb-VTRutTNQ{YbA$N*)`ys`tak^C}N1w-yxjKXHBCAQ8`Ri3(*y=l+k7dS?+P;l= zpSt4k`l>i|Y;B=rQn%x;yZwpZLTOgG(={}|BMCb#EXGH*ronXL2+Q70hZZUr8-7bd zrz3P=&7Zko@X(EF5xwV!M1Jdg)0RQ-_ib=S@@%tdLIJ)~3w(Ani(i#b=NA>K394pR z3tk+Nw_TWwZD)J=GMA02@vr{06@(rZ@^0UM#2*bBZ)lcLw_PWaeHqSs&1AfP#@zHg z!qS1C0 ze5QprWLudR&*FM)?x|ZYW5N~BzE9hES*SU&67Cga==@RTYL96j?>#HaZzTyK@`s_@fXIN=(9-{eP z!hNrmvFCbu5%>Qc8Vq{J+WqDrtE?iEEP!tG(IFiDy0!qE(yUO*P%XNvy%n4NRKXN; z8kWa)xLmRr-JWNJca_QGOMM#H<`|#gu-l~b!&l-f@=W=jJPGG|UGS2CRgn0=9?-RY z=;oF|YMae!(p;qpGrTBPy!>GE;rk+Ano1NNOnQP&CYR%5Jtny2><==rJri%p+lud2 z_T#gGJ-F(%Xg=*SPNv!e;p2)n){~Jq>~C+4FX1M1vEVUT*2zG(&!wZ|Khw#Eh=usr z8y$3MqZEF=Sr^-VSd6zUL+B$X-QmdXt)&0^YLe?NOMT|gLod4xU{^~rnC?FURf}HP zdpGHm0kx%2{<9QPS<$e<{TMmdr;atd=VQ~%W9SRp$LV0x!1VoT3j;_q=bR zfik{rQi@8Y9Sx;^pWh$;yMNq=>wcW`e!pJNXNQv}Q?2bC^WudLOYx6d%xw1>rWt*J zDSi1X^TN+)=GD2bjLz{6CdH>=-n}Gc^=EE6^Qy@rCNsZ{IpLASEc`Ij%4BZ}^NxNT zbH!P4=Dah;OnYi4sylBd%lc52;D+cdyz40e+Soo;$&7Rq@F)~ToNYuE-jc-nxGH?r z{|;BU@8QjhBuG4=2;He+@P=(Dnn@Ojj()raTF+_l>ZK9^loZVb^UXb|mNet#uWw@a z^EvphAsy$Ou4g4Iqd0qAK4{&?DWdmUAClj174C=y(zjrYJZugF){$~>Ur|C#;dK{@?VCW2JKgd6;uw7AD3iqfN+CY( 
zStv@Y0BxvwU~B$^V!%yW;l4k*c=?Ag1Z&@;r4`zc;;xH5Z@xvh_;-XuF8`q(rCM@n zT?(u_Pyn;9K8O4B)4=tkzHlNTf?U4UZF~IA7ql_|sBqEgounc~jzks&k>Q?U+v=^d z_*{52-tpxf@_Wev`;uQ^o_!fsZ~ToCJ$A6xKTs8k3#>#jClo|ana>G(rYbzj+Drb6 zPlNyFK7t$nZG@FS<`CIC2Zj72HbVCA6PRBcf^WNbBboQP*w48WI}48B|1=!&uPy;* z-cV!DS8u~@x+*NEM>!~~aTV+8^go#H-i&9ebfHSgHe{+|AX+W^h}6ByR}0v^*jnD-T5{eQ=q+ADJ^ZoUEkWFApOk zKx(Th;dz{c|ERvH#m#=QEyNSXZhF8zhmWAQ>Ku4dHHFV^&!BIGtVpzV38&Rvh5Nyb zawDCY;M@?{aJ}nU=FTMx>W}T*&6%%zzFx&0ad(c@(mH3xa;&DS(k|#$&=mLd(cHRz zapvCsK$|h%Q}0<?o>aZvF?WU^r`E}5x- z)%Hi@rkgIfa^M|#iUi2^$r94JNS_$JT>%n8#P1A`(68ud?w|Rdvefd9Hl=+Vedo1kd+w+9l#r$2P<@~jGL-@?^ ze*E;`2d(O@j`A9=>hh;_Qu*6vMe;xR7t?Y=9O(`xF4JSPnKT10Y5G5tKQyUxiu7g| zQ<`dWD&4y?nB$OPLa(#Vrt{yPq+f|mp>+linQPXi&}S#Cq)&8<(;NAb^g_uV+OalY z+5z1H+K68keN&{m`A#2W`iJY?AYMNUA1-pjt1B$A#OOmL`1u!ny}j@#F9|DGQ4a9w z*YM%BEg+@w60yd5Y}a+nfb5)bVp7c~_Z_0KD zV(WHzHymxQCL`eo@aCrt$QRX-tMyuV=SV!f{o>jpze~jGl#6BAsYm0@$(zA&`V@)C z^tN4UUnP)=QpTqb+$OcVL@4iU5c(q1LoQCHfD^TcxyX!1i)!ZKDqky5D!&Nc_2t-x zafij+zKO6VGofnW6YSpPj8A_{Kw<}q(DA|LP^;k!@ZVLu;{6(2zpe~*-0x#8=uyS4 z$7S*Er@Qge{gVRx#R<2C-bL>vd%>3}3ERuX;Xkc4Sk<(egc%(J+f_NN-$yisTGeaF z^|Q*hDJ8m)ZplU7y$qH@*2k8&%sOnm18m+O&Om`1=R%QN20Y)+!dEXI#5})U=&nWu zh&>e-{VM1~KTcAPL^%^y^hJAEtS&E_cBv86vebFuBiW3w( zs*>Ayy~Bam_QkSk%{?8S>?8lCjtdiwwJ)!uYmKguA@B#OL03599RVeJ4&=@tkEBOA zA=Qh+w%=x^5x2nI5cg^}{LHh@6| zB}nqlfI2N{Lm9EKFI@yQWIc~nCQjJONfzu z0XpEe(6!bWWTOWma3BVz1zJNxg)!c`w;Ls4ABz1D52sSDqvyRk*z6{CWXOsmmM1*W zkFQy<$s`R1d0p_2pG@T>7r~AqfpDi6PZ;^<66vM(O@G7l;mY)?n?mH(0am)c2-Woni9+Hc_#I#z zH^q?n-C<~1+GJRSEy0>7uD6j0i zz@_FJxi3s3yTxU(@OdMQ=nR0aDo1#9K@MViWaGf=?{HnrXIO2lh0jgwz?1z^c(Fj2 z-Qd&*rY`EN#Gokb_vJk*FPgwRG-u+WMTuyG&l%7tAA?bQn#k@(E)v(y6yAS&7#0L= z0M05|k+a`YQu^v5d9Zd4e3Ljx^q#JV2|lrf&z}WPcfTQXwzZ&;Z;42v_&z$t*97DB zdeC5d5<-e*V6)AR?2(BB*esX8ekQ!9(&aWmLY@8#oPxohEsjthL99f96Iv1iAf zNWuD4r&Ri+n5bAI2S(iV@wwegz@#z?+%8-Pe?wDI#lo2+_E3j)F?0GfS#{du1)JzriBC+g-Pu5YYdwShUg;){CU=qcr}{&^_d$93IZrKG1G}qE zvv((bD03BE!b*W|dcuQtLGveP+5Ra`LZ&=Du3#Qrp*_RAzqp*UH9>}M_|=L2a3(djAh`VF+9PUd!eeShOi#SHS0@^R{OXjyPPv`opwbFj) z>2p`jsG~_8$gKORCd)lJYaY$mrmr?_b}i?zt83klv1RCU@F)uF-%bv^Q9yecg{aG{ z6aOdd!JqaPp@zU{EOunA=jxZqE2aaid;YF@NUOjsqB ziax2UC&B~xKw%zHo6LcsFUMK0Bznjl_EzDd zh(fHOla0%jjj-IeFsvxjB1Zm6cz<6RRvr9|f~4P~!WC?ory&6;g>xXpb2YTD&?lw( z#U%54544?{E}EO!0$1KSl7nJmWG7=93)dH{r z|1R+&bQ!#WZ+d*w?mV7N&Qe|<>pHJ(z=^m2nKb|MwY$9cejl2;Q@nWD^l7~NGmh{^ zOxN>dI;?neuMIYt&x^7w|L4!MUrXaLTNNzvV2x#|o_*7o6v`1SmILi0xzMzWdX~0( z2zxr7lkO`RHu-zOQHo*sa8EMQbY|keGMPA3u^z8pcMvb)1z^s|ZoDmM9TtA5#!jnd z;ONCG2|q3yRepA(GTbo`Wxp7%S`~tGn5;;PAt%g>Q6b{md?K~b7TtNz1Ak?I($kVh zZrKQhzb|`Jdn)SLI#ex$96hN1-ADw%3b5&59F~0e9BFwZ!SCSJV6{7%N zksDydV~PZvJBzK|KftE~0yoXh!Cbr&{EHIcF69mRz}yD{dKOyuNCCepE*56IJ`ej# zHUrwVpZuFtz!6&u5V|T3CjVwZse%J)3W*{I@}8j_Ndva8Wjl@rT{eX+!S`i<;Ldy% zcGM~b_BZq2*h;q#zdLarCxk{KjUiX`>Zm$)F34bIa-9(88g)nGCV_*j7VJ{wLRWMi zB!w=AE#Y6twWI{5Jj$(mF7S>HN!uqRaP`_Or-m$@vy>PS@-=uo2 zts5)xQr$4%gz{rzo3jK$S04ukI}NKzHzQ?f%8^qZfL~7C#Y(R`i0wOl+~oQKpQaVzq_{X- zJa|&LVpb1XQ8p;F;w&b2$Fi{!=!o=d4MpDO(?r_SBt`qztcE2+T+p34Bvjk+m}Nhj zO|GnthiJ|OXM8W${DxdIXUQ=wj%D9Jj>Fa^T1~v`s7zMOD#dt*=Y{(Dk1Q-5);k?bt4 zS!x$j$;m};Ht~pqOPFAWJLM@knTlji%fRyvFVG)~%f9j`HbHTY%{#1b!e`<6w zKXKb<-se7Ve(BRBKKIcWudiiZ^OJvS&F-OU&2yX0o7{kR=4|%LBe}>G_`336iS)x5|agZ@W!@ZS?Ao%nIiiEe} z`M6*3lLl`Z^@sv$Ch`6&z;|RhI9O^3kJJC5S2rSHXe>`iHr12P zz(45vv5h23iVxdU&l2yY3^1Oz6~8!cLFOhNfNQ@`n%d>PY;p>QD$)LCV>^@pfj7kfbxpe zgAf0zO?2d$?o~xT?XM@*OV&eP>s6ASSce0y@zFl>7PRV+ z9V@YX40%S!VmmQa#6DSzDm7zSJ%e}9nzgz3ZE!Wpud`thPXW2)q{+f}8kzR{)-sP8 z9bhgOWH9FZ_k*GAb%I%8evtWYd=<4v&utvY^JGkWTglXKFJVfIZe)fXb+_znaAOL_ 
z%9*bp{$=Dol;Fibt!=WpHO|=Tzn%G@@gw8g@Ou!%BSjT|Q>;mY_1NN#U&27BoRI z9k{pi(3_pX=vwO%!6tVj{L@+<&3uYb>Ub!z8{7?eXQfazR!Ew^B-j?W=ac^wJBZ-J z9JuV73ctm7flT6aa;ek}SZg)0Rag}M?J|MgRo384MmcKxR)A<73V7aHXR7x-8&^#A z3hPAk!PCpw=9bG6LCipwFsbu6(fqQ9@IO|vZuHio?^pkz>E@I_Zt@1Qw5rE#c?7Hd zuEXbzbMPsH0PHp}f$U3XqfD+CT>E>8Onp*9oAb{I_2P6%MfN3@{VQj|8iz*Wm3fHF zNs@>4x3dMon=c4`LzBpXXWNKX?oTql$AHXF*n>tn+sT>wB6RzL4Fzc2NA^~Bllk+; z$mKuU)O>$4ZZfvR_C8VAW$jTY)O{h;ji%uv51esRc>`K^A_Jro&yfLo5_Y+qi+7aX z!WIuHHgr`6>x{JquAtw+Dq5b{aQ-ytxs{EaOtWyX%RO8)F9BKEVDR>J$5qnnagDJa zUVQQ>eqwNdeBG#kCsVV9reRKSAAcZ!V^?DDW97)nI}OF$NF^6t($TfarMSg?5jy!k z0=?eQhgNHwV}2|_t-e-d=EDY-YODlN-ERgyrtQSTKnpr#B;ZWkE%M5JmMEWTOmvI= zA#Jx8JUqWSuR6C}nx_ zJhu;=JQYFoxQxUMErzY(=V+JPgK0RvjTTXImezB}*Sw`;kfyb@l-3TFoULcCbB5av za(a(8a8&(u=$Y9voHuUsxt+eBX@JzYE?u6qv&kmrhkgvv>O!q(gAKRp7Zji282+cg zb+A+8HY|_hbfrEa#`a32Yo0dBk53T3-6V^n#Ut=1VHEZ+rWk79;;`*>U);5J0T#Em z!{dwWv7ekizV&?$=?N)DIVJLVzjZDe^9v;H?vb#k^dEU<_gDC%(*?2*$pXE21H75h z$(OlU+4RcSl99Ff8uPV-l$C_TDt?&GKW2FIImX89&%DRKfADIrPVovtgRLIn)69-B zTYmI}Ow;)|#ioupMXT@yHccn*dGbG=H|C?CW_-rV32tFjF@LM(Yrf`$QFHx=&^psj zn%S3lbDGDBa?Ui{Sk8N+IrP=5vp8&zQS(y`GTa$u>+5sl(l~u{w76xN^SE-6mUXwr zmvP*8S8*h!#OTjomXu+XlRS^(Wl^paIXfrRLLlvq|9A9uR6igD;0t zaSZz`4wkr2Oat?vCbAxSyAtuKzm?eMtS6{ML<8S15bmCt!@jLoi6`x&Z4s5lFnZy~ z@?N6N9-64dLk%9#`>`F?e9nP^#|`MkX$u_kS{r9{p26H5I_yusF2drF9)hX6#_qN< zqNThOKU8jmw<;H5%sUqDKaM3ntCSFH=MQ1!Z%yz~&Vv8COo6}A0ZNl1sP0%J)ORb2 z(q8glioXj)!%|@ByBp07;^SXu?_%GW6L^*4QHXyNiI`Njpm5xPlu+MYn~ZkS?!vb{ z{LUNd&zeJcZ4RXWw*~I?WkE&QHb_WIAW=WhqI8-GFpvWAaEeE}sb^jex{eCch$+Zf&MdS@VS4w9^R@`t%<)^!jQefsOlMzH z26D^f9qpLgG=40ZH(>k4@?n`K(EizH3%u$tMwSpOOX zthFvPD1U$rNUD{Ctx7YAZuo{c?v+SeKAp9HG#p7v2IB%1>UY+&OjwIv6Mo2X6gp~$ z#`JOop*oH5c3fN}dsPX{hvwtWMo|>&+*cTMwho1=Qarcdc(B4zU~KCH78MDI8tLLg z-P6c-SAf<_@8C6~0BwxXz#&lv!bg6?5N$F|w69Vb_l?fO=*54itXvOtgwZhXMGu9jiZ^@1K3t<;a2D4d@u;J=5yh+D`;spI7`=)D%Qt1US)>}um%~K@~$MeYP z&HBQW|B~Qb{Sl#9tOeW)Ahw?KjaUb|rh(j8D(g>(1hkIow7us$qB2jY z1nVJ34`-5oZVqsC4`SDae{m`)#d)8fkqTy-w)wZ`0?w>; z_slncQR9fu5O8L!5yNpc?}b`&_2|!aYR9C-#{LG$=%}VVOWn5H_TnNb$kLXDkMwM; zFS8Wa8auI8Q?B!nBiG?rydiG??1)dqT()hKo`5!p~U!;ZrMV^)mgO zT?-GdCZW@Nd3V%@pdNthU71fHAPD_5AZ-f zQNyV9?hLfAp_Fxh_7ZTcO~$gaI_RzCF?7;h7EfPh zgzMr3FVYBGLXFVp?m@!aop5Q%W^_5}tZkv#Rf^drM3td$!AoMC1R9nJd8uq%F1H+S z2yun9_*cX*crkJ@E<~UggS`UxpxFZU0FI zhZFIoT3?*#E|2ZD{YAGQUMAVQ3UHgj62hC%Ceas8;hS~qh-az--ZV=OR#M-6iB1J1 z9sF4set#9?uGHbCJNt3dA?88G+T9|?9-Q82P@~N(9@cJBx%rSg{o)+PrVevUn;=={ zmLP7^&D4L5i8}?BZhdPQ`dLRAUO%5Vt=V&}>A!>y%R^yRj16<8ESm=oG|_9ccsJz086uiagi}32CIP4GH3~Q%R&Xj3a@b+&uNVHl7kGbU_hx9KHymJDy_%#q8 zYy^>91P|VrPF$K{6c#u^qrF4}m|XZ_v=%&v>712t2&L z12O_6NbaFic(=bM`^Cmgr2l$C_(7geR!tL9$+?eMdGAadyzDm`(HcU(_pL^b-^1XO zg9-lP*@e7JP6*4W_iNPV0D>xQyky2KB>8qOh;2_MH))H&R$j=u>L(5k-J!7Ttuh(b z2q$X{PobJ)+n`@^HaSe+iMs-<@pvTz-Fs$${VhYVuAC8iPH#lxZ>ognO#9I_mBpBr zwwL^h*MR8`I$$DPNxAwgkf!8xh|%mL%MzDCsfz{NIFpW4*K0$s8y)R_5duHHuYtGU z#z^8yb4U}|2;XTIAoZ_bkUr@}qZ_61w+mk?w1CH6 zvJkrdDFl@ifakrXu-#J(FEY*|npAM=R}vd+Pd5k;%GzSHs%Fp_jDgJd60%q22g&rC zk4#PlP#NevVlr}taMK>L{w?nk(B%?A{{2C!6PyBkyEc~W4;BRQG+}u3G1>d*Ao;z& z9@R{9Vo$v&hxXcikS?TD?$8Iz$yH}M67UvNBoKfKqt z0-EoRptv#_cEkfEc#`CbMsgD2<*)TsMmU*QSJ3}nvZmW^|HwI4hs_ufVdnZF37oA{tLsNkf2e!EPMls_eVAsxits!+GsRc`kD=;vn#NalO~@0u0~QSC@y5c1$cZi4AVw4 zu)JPBvYz}vGE{TnCr7-QX`t5JcF3kVGfThutc-HA&FY;zrsNI&Mz1e?=4d8gUE7^c zcf81p^cHItZBlLaDQn}i%)t`|3d_6YY)g&p_8-v^v*x-0j>>IK}!&YK)*?@Z3j>){+zIXUhiy1>y= zPv>^q`N1by8&dqM7zdXo*gEujgOWoGoCtYISgS+egFpaswpSqEGmDHC9YiNji4h~k zX!5VAiHN_*W?j(A!#UN3c!$Y-ysWwl2b`{i;IN%|ye|^hqI}$E-9&7Ul%V%-cEdKa z8n{OFrQ|5b?1g__AiMMcoDxfb#fVR4P_BW1Yms=v-c(#Eb`hs_W|3@_ujHxA 
zV=|v85G8&z{-B+U#TC`rooO%8`bXI;v4eKRs-sDmGiL>nkyOFE7G^+%TnSt^`ABx{ zq}&BZM38(W239B!kjJe}WRpiENn5N<40r+g$zEp zu^0`c=Myi}LP#wiW(>%6GqNbw;RZ`9#=3*g=>zG8%=cv*ne%E~7~%;t7^X*WSpLhs z%t+=;V{+D(GRVy97HOw48OxF)7_%;pG4}WMGnUMK%2*aWgSlq01tXrhgfZt%K4U@o zZN`xAXU0=Md60DOA^`(uK^MEye73sv#)hc~d|R~mb;S(5E# zDa-zKa0XlErYF01r8S$SlZHcjtx)#GH2nFj9(!ak6YpMm5B1M)B1;RtBe%J;@F(h- zwJZ7woV{|3bvufWbe-Z*{33m}_la;kvP207v{kYGJ7NPHlaeWxVjA`vT95y|w!&X$ zC1A#e*|606Ct5>gcAg)4!z#16jBlP|f!$|+^7?=;8O+KQjyG?!wLfnr{Mj{*B+6~D z&aRV)b?QD!%kagQKXOp=lod7;TaUTU8d&SVKa@6)!CD=!D4eCB1{-!hg=@cfu;1?{ z1P#fFTGtqhteA5}7Djra-kI7W4qcltGy5g>={T{`w~@_`QgaF4ydPd zJ=z)n1V6f5gc)PM@VVc&@K{nMKHTttykxy1MJx4S)+-t+_>+%9W|s;a{qGBtXSJ~I zeapc=44iR#_gz%heii%G^x_QH5zK3RL%BA}a4?6Dd(ATO=l6&3-IZDRjn^T(G&Bz@ z7yU+_?N_PJ>pa$yr-8!yp!X>1;&vRfs|8!_Xvg(?5?Ln~d7!>On~_qhI>;F;g+iAD z@a`CurwCGneQ9cFN;5~8mAL@Bc^N^3@@i=N6+~9qUx9d>gJW%-1$VAA5(U#92ulbN zk(HZJ#^G_SwRtuhExUo|WEf*pqZSepw-yv(KP2d;k-goUkjl&LPTn9A)$4q1#?q1lDId} zNt)B#d1mEg5$C*w7uT1i&7E!P&;2D`WlmResh?5e&wV%)Ps>`E$odI z<@(9QaH{|AreVR(!LEhVI}|hxO~=qlzs0EzD9Y7 zB^Oiu)V{xjAxMU$0sG+DnoRg!ores5){s@5(J-nQLB@NNK;d0K&WgN)*~hFY-+evu z-?R#9lE2~$7B{f?7ah!-;estfexc#9LsqMpZdMwG53M*S<*m=W`DJyuwUfUt?F)Yg zj^RHtbK^fqXYosZO>aIYxXTZVBK-GJlT9xRt~SM-oZ?%D8uAl#E^Tj<;V&7v#aUr8@I?Me4_)Td7}KXV?$i_?)| zqWM-eeU8_1q1n!XuQaXe6SVO5FxpmL8fVY5<@9<=h%$e2Bdx^6{Y z2aUV<7|rH5=H@61Xj8X5SrLctf`8h5vd~}z=UB~PJ51LmHqX03ujDj5<4vMaofLdN zbS`{Phy>kX8OR&n$l5{iQvDk?fmgFH{Mn)a*^E!ZR-A(4etaY4P!GzL8Q@SZ$LdzH z!NV~-u+=R!Ec@CHZ(3t++a&l5QZ_QI(tuoa|M-0J;&qqp=c6egJ)h!V%%CBwm@mjL z&IC?qEP(D7542{oHwqZ+e$>j?1wZ1ZlJX z1%|`U4f&vMyJQta_C|xaeG%$i<%s|Bz9W`6jj?0~ZmO1e)70!D#ppP8m@&0PhdH*dqS4;T zw&^Q5!`SOQ%D9uZow;8(xAAbhB5$7MB4$ryKVxHhK6Ao5o2ig}fH8liA+sw;ml^Mu zz_e0fF$1hd7>j@XVtlf%VSFAb#uI;ZSZf3hmb1@dn|l$`dv(4sYD01YwzJtQG4C`8kV6xL=DBHcZcD0}5wv~}BDqAdM^D6#X9MBp)7b?;EjEqF(= zm$kG6I<}+ng{Dwjx{$|JhCKg3ZYb36H z7m113IxM>P4soZhll?m{3JWJINa!^}qF$hig`yx8n=-aroQ&U{cJifajCW)a$Ep zT~7f1(JloGLM*7gHx}G}wn8Xv(J#aX8`xHrGes?SDXb}_)8ur?W%9I9gKeZJCYs+L zNWv*T>K>_3u$(5qhc>r?m6jD=?edmnwZDhCg%PM;NesTWyu>H;70`8y50H1^2=dCC zq}`DGPLsY|Oe^f_GWSiMN&l9pPd^pbUuVD2fxfJmMGsDQr9U!X!ddg@1dSeRO^@vq za?;O5(GO?Op~tUKr>O^Athec3K(E=Cz&Yo@p-toMr7hFkL$`?Ts=uyBdA+riv3c8F z)}v+Wu>IK-ITTQdIWJW)KQa&Vg0ErKpfPv-<7L$9rt3j+san z;bv>BHRB>~)se-Kp{`)-7J*zXBwn)m``1O8a21)wh`@R&^bpcq{#G@>K8SB2g=dJBF%W+kv@@d_YNt{b;ZO*kVhTQoF zCAn9fvuMp{SsZqdAFbxO3r#IUmV0@_PTJAt+4SA-7tr@lpU!Pd7Nu246>#Ru zZ>72C&Y~rDE18RzNO2;UouctO&FJb?d^63vvZ9@NDp0%bw(xau1f2J5faxJz5R4xX z_=rSE`%nU_Aafc#+Sv&C0cLQ&{{UD{%m8LwENH#Z!BuJ__}h9e=JEh%XRN>nUYbG- zTNV#Bm9X+1M@ZX}mE$Tc=WZRBh_vfKhGbyy^ zK{t_fh-cjqt4C%cJ))ca-R9mgE-v`e$|`=%M~~01#y6IklbeY~P|)@9GZ z#7aqY{zV|>)VxAV|N0Q$NmF>8s|)&8J8`?E7otDPBgA!N4(MX=ewa zlf|HE6b=>LlA`553c>nuIa&C32dg5r0V`)IvG*RGNw!el>i4Nyc&o%dtW!em_q+6= z^^iP?MoZ!ATxC&8|8o?f_md^hFJVOpc97}M3*ijS86Uybc+rn+?2+S**X5g%p4Ya> z(;)_1n5yEX^DMEMk|SPge3Es$-k-dj9Ypf{zYyNd)nsxR-7;No1)jirxbBF}c! 
[GIT binary patch data (base85-encoded binary file delta); no human-readable content]
zc^!5~4c?Bu4XxtQQ1dAeOg|S_-P*l~+rP6D4(LSUp`8z~c@u-y8a#5NDHYm#EJ58g z6F*2!FsII#Ky21NEPJpYb3*KJ+>&q3$FlU^M>9E*2L;GCGzknkBIo*i{}4=HDZfo>xy>m&QSh#W+&ojWA=58g?{a zLBk8WOk{*PhTJN^3A1DI*x3pA_nQJe2@7E$Nt4!ET~8l&Be=*TTj zMDNdJjDA&14yf$F7k^bKwhfV32@_G!v4=glDgnb!8j%CJdJx=Nira6Sk|{G!pu;*R zm~dc<ISy^jJ!CUK5}R;Hh?9+{RT72^spt0tP{r8$y2f4iailNA;%SPE5p-*TbKH_*>{EY+wB z$13@aH14pjnJh5i|qNRNP7lfqQV~!FnT+QYIBth z>Wi|Ve}w`Z6E(oh18JClZ#NSQ3$V~nlrObL6~&8%dDHEjnLon)bh)`MHmIwx`KKPE zsZ~6NmuNs!^(jc7yqyV&?!Zf5zT)#gKge~y0zWA_jr3kPLH6{^V`6YDbenZhe@|y< z{8WuLzhB@4vlb}exS9etuFfVQ9K4@gfKJjzZam-4%j_6qjdRX$_mK63`H!o;(-($| z;(jns@;KpL9;X+1mf&d0&8E*iCX-4Ig{%Bl@jdH#Y=!*+UbZ@m+wN# zd2`<8?f$s#x-i_lU4W81@-R+S4K_>8qw-viKUZaW)o^((D#~9YTe(W(Wp@T>(;?BS zTic`HzYXo+v96Jf97-i)rMA%e;2zFOiG&g3MQ}gnEOyv;u`mvg_bVA{J$d zw>Dk@*N`-P<<^e-YozeM>4p&JU5VC8OUXsiYgDJB1XahX(IrWiJr?x=^^fF2^_xT! zkB9+YhmQyPsn-H;68HNKh+s0ip6QUzz*l>6F*huVx8EilDx&V;sk|`KIw*<(x3bVg zCX+eIl)|-Gf1co5A+wM1EfSLzDNr->~_QQ=q?t)8RNfV$#1Q zls(UZvg~q9j}@Wcd-KYI&9#^|WD38_##tQknhT=%J#{%Qok- z*4cA-nj9-)tTqu;A5P>e?_}s}F>m@IW<6@0yG%mP$uNt%{87!|37)qUMcoyv=@O-_ zoNT3%=J+imdGd}`irQB}cGVxcFm40SQ0X+exzikP9{o=4YV=_Kp9I?QyO{QAeFaA) zeURlW(Rb`$l9O+waR0Ixa%rn9&goL2k8|Ttb6yU0n^H}S{xDcxa2B)5&hkFjbNYz# z2XskP2Rdrip>t9{&w?6a%D?-hedS_|TO5Lpw;BoF-$-2B&cK8lF({pU0*=>4;M-(* z{&}NrytiZv(fCuvxE2cVb@a8-veg8yE$c@6xHfVs$rL?i)MBrs5H8})hn$Wp^!~5w zkfj|)qZ|X^*Up!0@bDw(a<9hbkTA@gehhvrnauxtlgs*jxeT+XZNcR#=_Y1Jo2uL= z&c?bR%Wpsaq5Z3oSh8FpQLhdR2cvFGjU9g2de{9Muw4Vz-`<_viTPM+$dzMv? zdt;!I1G*Uq@Fh#?$Q|||ucJx9

)%&VF4)6UWYwS=V@YYN-TVzQW}eRcv5);V+&t z_kW$eHyrHZ3y6=wA6zSyfSHB)IPvKXRuN&Uay$*&;TtN0;ANe(K7G4Y7#cx}J5Z zuS;OglqGmgsDV0`2=imZtm$HzA9!-{SK?M`fN4xE@9UpD?4KflM>u9Kh5MQHBLw(A z51pV9Sqq7m_jKqeTF?~@`xzj*Lm&*2G>Vs!aozi6w04^Uy-+zH59Oc7!D;n$mHs@a z+^~=dx{{5f>2J|Zs0{M933C~PB;17zgktlaO9H!r@UUu>m7z}%ANTz&_h zQaf?amzCIcAe^4vcN|-(BR&dj!ox`%uM$0g4VwrVT-;6zno=NgMg@!upN4Y-&E%`z zG&r@Q7c7S|Fev{W9iN$rgsX^|pniq>EUm+NTt#c@pLm?{W+Cp=naT$Eoj|oaA81lU zCVmUv3!-($AjT&g1M&~js!5qR+82-SH)q25kShQAE+trX*bj{g81mI`0oiAxh0PI~ z#PnMM3f(yN{}>=Oo!-B8$oD(>n+(X+L5&o5`o8WAJ9-8)@!pVtB%!!;Y z(0wNa$+L(mxw9v6k=-5MgZ^SM3 zQLXL3Mg1$Ve|ii(xQGWJC5kU(B$=SKn;`9SK3TQnAyu;QV5|eL(jO69@Ex~fD{(sw zGgr=myPB);%ftydWu-Z9d_yVpH8tX%2_?8Q3emK?jkxNXv9ist$nUFdAU=61IICvD zuZ}J{YuzlE`P!W^cJ>E%w||sfp}}bu&Y{X*D^z^;6st-fqxR(~e4oaC5V|Zz&D*B& zwT+J9*C=^J-i zSv`mzabqE8st7-~PywEd=U}n^On#G2FKq7Rar%ZKB7bZyoMHTkgHa*AOn-zP7h5rX zmm4WCAI7)aj^MpRb~t;+VP0U?i>ktJ0eGoefF!Sa0U`s3z{|Rn23oE)+4lJw-Uyrt z!;3^=eB&Cp^-&(SecMIvJM0H6JAeh*M{)6M1>n@L5H(Q~oxX`f#UceFQ{4rpFPg$& zMG(y9-sQcEAHdy&R7^N|6~AlU#2x)hL6Rx0is-tEXR`v(S&GX5w6mrYIG-2Gir7(6mX})IRVZ*?dw8{aX(4?>}0>3ttqVW5u7KGu`& zuX!}?YYy>Ai-JqD=9A_f&Ll!0j8r~+PZv!7OWGV(Q<26H?ks(tBq!z3!?XT^sgMG$ zGiZTd*QbJe@K>_CLW9j{+gkN`4-czqPLR}s*@W&8!Y^Od`I>6~fs*fW)aG}AwM_}~ zWrX2s{z44P;$w|Q8kX>8E6dMXO_D|-#v;o{|O?qSrobqSXg;hlmAKOC=AvgrqMmq zaev5nI%BQ`HknDn$VDeYzkkD`IgTi^b29Ay?TyUDdyvGbbsIMUDgAT?e}@*BNKW%X zsmm!Szi&Q&<=9eU%rRv_Cc*^!e#2(tK@#VD79?HH;jFV|AatmJd<`sRzdn>AZBtG# zE~0zknUy|c;iU!>k4J&bUB{~LdAeZpcn}jsJ8+GvG0d3!A8FE5LH%k2$ln?PRWr8W z`S^NTl*(l!e1!N9qu${?3ln70O2~utmoa0aD;Q-5)6P`R)DaMZo99U}{XRRXP>BGH zr&>T{fim6tRT?+nmP5h*EwpddBeI4c4z9HhRJLUu*=JqM)i;E~yU3ULZul(qdCHxI zbj2Y^ToE5U3nmFCwDIe-clb#398OwTiFtZKBsfJ8X1nq*q~-wYeOD4YOZrLtcNg@& z9*5;;K9jPw{utXSZS?fwY~vXBIz>c+panK}+= z(K890`rne2C9{ZSGr-G`K6ZNdB)neV2#>X$pafp>91?Y~ZT2&`GjImwkFBlRS$hCq z=WK_IxAoWw#-3zWs|>%lZ;0Fw-hoE!ITTlBQRCGS94UH67VL~6d$oFqnNJ~xW|-jp zT^rC)-WTRM-p0Abe$2HPTjXm7qJl*nZ>a4k`U}m%?EYNzlPAoN#WlDA3^@kr1U)g? z3GX&L(2su(qvO0fqL<_XQza%c7T5j3`N$+zxF?opAh;a14u|0Hh>K`?L!1{>!tp)B zS#ag)TVn4kWsJQ}xf}hlRpAlz*GIw`$IB3{ zoW$yl)Z?jKPfUxxMZGs0;4qU2AI}bO`O+n1kHS4%&*`ZXa=FJ5qb-K-~+Bp-SzV7TU}+0P)4Puvf4S zCsa&8wQbx^(R&5I(IA}3DVYt2s{^q?@GM=qUYAXdzYfQD)uG(zBx;ru2^}}RcpWiw zP@p6mjokfVo24}@IXMZlW^(R}`zCODyc|#H??dSa8RYu9V)P#!#JC$9U?i4-!KX@8 zQ#TNdGlGyr)S}VNi*(b=H6S=Mn&k7iY854YMlLvtk?XqzSC7ZSvtxU}zuOb0GD)oQ zPmb%AdCfC>GX$GFKT@V%+W3%R1GzrEjQU7tK*b>+oMxqf55+=IxoR7(6*z`dRi%Iz z>r0o-kc8`B0r%S$z=IHD2;O9Y%H@?H`)~?8KdHw^EO|%M-&P}8paxf8$@68^xLM7j z9=cDDYeoD_Nq6gbmEep_>ap)7ZNsBf)z<<2ijEM{zL7n%@ETgK3PcI|5c?FGuwIk! zf9#mR98U1ZjHP=_!V?5Z*j*%#kDB7P%_-E)y$xgEn&VNWXLR=yeSW)9G0tZ0qrc^I zy5QaoWUOBCiu^-CAW5HOeK+Fo_jg2_TN~(!-EGh~(H}>D9D$nowyam*1vq!%0uAmE zC3*GjxX{m&DzCf)7bl*@xZ~WcbLs*zc}6vvkbjf6Z;v^itbAcYrCEsoyAjUczKRu6 zis0mV1{y*>5cf~!Aoz3=zfr0Kbvke1lHp9;%JJkmiG{f5xjPyJD}m>9S6)EeM%Z4c z!Le9v82FsiS5+-xb{;$i?QQAsB<&;qcRmmYR-c2_F<-bk&F$DIccndJMy?lgQ!?c_=6^fMW@#aGF{ZbbVjNkp3#V zOmrr5)YA~B+!zBV=Kv^H$pGEnkF;dnErO}4@LtXy7bZh9I`|C#z3Cunf9z4mQ6J{G ze`5XX-I#$5nH-Z^2CvpGq>uY6Kt?VFrf_G$FdU6?8~3s4W})cOC{1R&UIE2#vKIA_gdobRfI1yU(c)C5<`Ehh%%_j%-sG-gSLkcjk1 zbWvu>r+<_9kKc=7o2@90Hzv>*p&PL#-~sWvBY{&IUXj0FEKzjqH-5LR#{KjU8}Qqh zUA%V|s%$Afg85nb)` znV063Nd39bQcr0F{ulEde*4}c&OrgVX67%d>>|R%cxbY&Pt~E^Y!6M^>5ZA0yU@N# z9M_5N#k;N3aYJA@&T$Szf!aVIR}Z0D{U>_x)iL_e;0HbOCmTOS)?$uYHZ~O*!SuPR zIDd{Q2E6vd1DqOFzI6oa`VU})coC=1R-scQVxcL<2bZVuiNWFnsM_`yeSKrGnJ2}2 zuwfTo?YW8CX`06BlW);_?!Kf`B#j}eFX-;QT-IOaIOcKuMjoS%iTc63om|#f#*^T! 
z$U>(2?FhMOJ{cAsrSwPVQ7RIDhi;p9f`si`$G1Hqgj=@<)8~m}*!jqx9x~*Zd)4F6 zmbwt~e;h?yv0qef`vE-c*hizMn-T#o*D9b(@yCk2%s0Vr?3Y*j@L>CXYBp&L%{RS| zKVpW7^5?5`s<|T0u?xU~x4q=~kT@(1JdI7-tH_rk4SwN>2=+*sll1}-)U;(9FX#Sb ze*9K@bQ5nR-S@nRmnqVbaSQx9V8?z?Ur)n~B-x?xl`xR|ob+qOp+UzxdZbGl>u`X0 zM}^_)+*>GTe*vXAhI&{}gjkC@Kp*#AbIP%p{L&J@%)nsu>h+>+qqRiSZ5AqKSCgHI zXQ<=54fLwdS!6Zlpd!x~KHv34>(q7V%)LjRIo!j5#4ut357@f7YiO~#EHj@T!}IfX zi0Uz2c67r-X7+*qc)q2Gw%fMgLy_5#;4hB}XP3^O~H0WJ~dX)2T@xE zCQt1xc5x~}GW3=H4ZcO{ME*g!nh2={fh_xzlnrlKUzBH2tKYtAR{TWs& zltSg$9xSidfd)={x?Vh+v~YJTnN{w%D9nRySh54^4(APR?-O9x?mjHu^DA z0iWL5z&=Yc#@|U&jGROQU7~##axKasx9uM$#PpzRy9J1yj^+InO@!3_;p_^R1;l0a zFo;Z6!~o%YG%lPXRJQS2qvEmP{Uf3<@|+H@xAV9Yy8O8JFTpVL z9u<4H7Uv8XargQVUcsG*^sK{W-tu{wWQ%?{tG_6o$h6nfb5Ad#W6BFKOKQR?l3UQN zQxaq(?~_F*T5y}-8rIPM1ARSRpEyN)CW8avU^zjIs|E|E<#}`XHm76Q=s8chB9S7} z8PaNObvFgy)VI^UJ9gkv-A)*vV#(+>s$fmZJ-leolGAhjaj#tw`^zI3Y#yA3hVC`E zpvW2v@7QA^FC5=|iU!}`!pz|P+fXh&8K-RgP2E2y(P^Fc@V7uC{n77_p(_|nKdKGg zzKL{mtr@)U>w>?xqj1BPFuJ8vlf7rYha1xgdAH;K^QdoBKHJmaKL>Uiz zCgo8YTdCl%{_spo!a0Ne*<>7i!)237t@Jd8N_#kDPG~` z+AHR#KyiT_hHFbOI%^G>=W=&wT!JU3#lMa1O5$+2ED&c)J)th9Ye{mw3_rV55qE3y z$#dpEqOg80x=6Y+pNyhl^RXh1^{uAHxl4E{-4Xbu265GY&*E+J-?!`x`acxMH(go8)zcpjo(LctzHFmT9ejM~{FB`->rNcbJ_1i(KAm}& z;2%#U~!}&xwX;)Jh-f=qwEjZd*usVV-|%D;;$gI`6fPJG6P+T z$Jyf+J|wHv0GW9z)TP~qwl+>;PRJyH^aC&a{k9&bj`@)IY$JL>um#u0*)S@!2)&G- z;`LNn(AX)7`8Tc5E!H1h&GMjT*9feC#MPYl&1Tl6Z- zz&Y-YcPx|i>=>hC8nRe+Xg+`EtHmIk_Kk!Y9>k6vp}1Vg4^&=HN2ig=_^F`-4W6IJ zH@{7Jw_oM4>kpUHfS2o-qhDHR)+S3lDJ9PPbv=k3lN-g=N48M6pOXAnp~vXq08zR$ z#2Y>Su-IDoj0AqJB6WX)nSJfaC~{>JCVpDT8$bFOJtXGg>6@JXoXH_g(=IY0^QK`K z_rLBE2m%dpW3+qy`TyQAvi93N(rRgTgoTLa8Fz>!?` zPgthrB+gJ-2Sw61$;^n2RPtvcX_*j$MXJkj`g(IJwxa+@O)C+(PH1Xy4Bqn7WfCi{ zLE#HEexRul8*#b<-LB3-F|`$(0)7|HY7l^RqDUWoo`=8Q{H2nIn}~b6JeYW@!jx~d z)Zaq|_H}I}6;d;R*pIQ5+>XP4UlNqS;9$m*GO{p{kPmE72}}h}L}MKBCQ8 zq~k^%^^?(Ndf%U;cgCc#$%~uc#u@O%jvHcSe;hXYh=5mUDzVKdgh=goB>dGSSk*Qk z|LqoqJLeHT3YVbVtQDx9JqFPOW_Vut5Yj(-D7V=Sw+dXPDhKxCP*5dnzk4-rboFnO z(HdbCt@%fim~dSAteYrwxPsp=G3?6Cgj~&8aMJEMd*IkzdZBv{8fqH>uZK^qySTI7 zzAq&9oFBM7R%a_6;;?Fe818o7hfOIZm0IRJ1Y1eaGyX_Fbnqc2Hx4vgf0O?LGtl70 zG-xh;K~sd!;=bHz=yP2br+!!m$MVjj_xiik+`5Q&Vg7H3uNTJh-gGJ~yc<{REAq;* z2W7=8F~P5sH$O76DxkZN3Jm^4$5;cVk78IkPI?>a)p9!)SQwlZ$(Oe=yct8bNa>^S4oeJNV}m8v`mtl>aWvQx z$FU*xkW^raRplF~ds-rVPMXiePpBdNmX^3?SP_e5%dukSZQR-#5gt|`O{(=(`Q zzZ=#(SpqCmK;DjQgweU-5b`4uZCm?T_m~O%=~G%D^ja|qRnDRHUoK*TVKst{8V-id zVl)q!^W>JufLqKz;&iW_OnENJ-q}?G&vTMcwDk^%$ORI&l2oGqAP&B4FUE7z!_hMF z7^*GpgP$^b7%q7Un%2IdE2E|P&*HPmi|%CTYT5~7PKLNWJD4csZiI)E19*>Q^KjOZ znegv^CjNfJFeWQ2;JaHb?RZ&9Bz#Tq@n&6;+m?g&dmp07wG}uZ>`~>sb^?69G=jN0 zBJ}i%e)8Qu9(X#l_$(xEbbnc-@+HUX??)SN(m1P<4=&UiezeR=rDftnryC)BLb8=Xd)(W~M{}NmF z`yw2+k_4lq9BSu$8_f@!Vb{VijCfgzwfcw9@N{ogq~{70*_{9(MWf(5b1Q6W7Q>Za zp3)S9XkLKw1laMzxGL&&F2y`$7`5lVpPyPW@yQ(HyzVmgXsMt`dLQPg-UI!ek}yx= z5p3^uhClUHD0lHFRyaMtA4jD4DZEY+t`~?)uZZwZ$BomstzU`S9%Fo15>4;r#qw%( zmNDIFZ|Kn6S$m?&Y$gY{uEKha8oQD z&&eh$4?iM3i)NwgfC%$i`yJaE%IUlR%p$&yS4?7=%}^~nmFb?nk;e)v(_w)G)G z<7y~esbpU`&N?W|R!g_Tlj%+neRLHLU3P=Qr#dM1*ar{he^I{0l`Pjha1 zaePrU{a)t`H==SuuSX7>KlJi$nJ;B@RXwTX@C`_`5k;}^4A?wcPvK!MT~bp?9XzUF z@10w`53d@@q9+;XTA4}b@0o=6HJ*U^(?0qtuY&wg3PQPtV3cgv#ln}ORh<(9$ffa8 ztc<=xHU{$`J2V$(bhoh??GD6zNSP^#OrdsU2v@eh#)H}!KyYI4@EIEw*E9hWjwl zS=j?N$G7oz$tHk2ox@kQXd(AgN8x45LcWUi82+7_L>9f?h^46-m{cB*|IV!^z5C=z z%CYruDIkCyA92M%?wNP@FeCp?oma@!|R|NCKD`5>chZEtH^3Ur%aI8&`Z2L3}cLFjXWkEAxEab5vs*e?D z7UjP$n+bPk=TS?iSl;-9>qx3HG5*&pxO=^U-m)h|^5I!Lxu=a3xXj0O3K?LsO%;C~ z@*?_QxE$tr4Xl_n5i^&bWrHr924j^#sBAI?6*hy4?FBNtdM<8vu7iIbwWu7{gmaB{ 
zL&J&BxX_*RTbZbU(fhBwZPIzLr7((o?@r^{c9s&G^CBkqE+p}E)t17AO`X`Qn?iLW zz2HrA6keI=MgFus#}UzFo}^YQnK}I~^o0m8XXXgv?`zZWdsJUl<-;C~(EUIJ)F<=T z2ixJY*q3CRvmJH>zQ!tM5*Rw&!wdJqjh5}!V7zCUVdl;?_|*6Y6rcqacybx?1MY*# z`vNjB-5pfMKbgF$d{0UXyGViGV_bKDVF%Cd#pB9WwEyH}n*Jn#-uGxBy?-a+xrd#Y zTu8}Sv=+b1-x@Z~?O}H`4uW=oGTh#-L;rkR41+^1V4*OV>5GxW=mL&UXjCSP7CpiX zY#lT&=x2|_oQF7)1kDGO7$x~An#B)=1&3-tC+j`R+FnD`!EAi6%oLm)Jb0R$#33Q0 z6oRcvA$z+7+&(e{IpP;!pKAuBK5&M}=hpN~{%5qenhGAGi{R6~d^E`muF6!|PU9y1 zL_dESQlJw9InCLa>kz}auCG%=g(lwh-)HcKrWP0pl(Na{%i+j{YwQT;Dlsvf4l4gf z>C07V;JYN6F5q?0)-8*0+rd4&GapkYSpSvOi8yi6<(o;HwFIUz1S!JSG%_)gQrsW4D0MoaM=0%E1^K>CA7ZS`)9dEO6-XWHK_M@F2&Fuh)4D_b!a09bN;WJQX~- z`wH6nErHRf|3JaYi3U1#fo)DN<}eR%=lrXv^T3*)^0tip_^}lHRizm5jSAqpM}Ys~ zeli?8BVf$iD~w;&+d%E|d1`9D4&zquB@gf#=Z_XQ^p-hHoMYGHvq4+Zmvt7`xOtH$ zHUU`Ws>w5s`bHOTzJjN%)v4y{`8<|iP1l~C&cD;2P3kOTp}($-UFNMtOm(l&n{}2r zr)4*esydpe?`S1aUb_6O%GoqMLI9=3<&8sZcHsN1A z^ID_>aNFNzdPwROEgN5tsa4;Io6jZqC#Jw0xE?@yWwi1B{&^T1R)x-Lxop{!Hf%eX zL{lpXpZT(#jM~lRODr-X8LRy9xcp>1;8sH|{TuMKt{?Y(A4DE29l*+=&#)og5Ql~| z`9n`n;E<09qw$TRACa2-z!wl1+r2|KY0jGtjD$(>Ri@wNt(WiO1#ITBdJE1F zW$_IVT{H#vINZR9{bEGNq7i?%@8Azj5rPr3NXTFC6-E#FfcK5lc&fvdbRR6F=AA

v!Cf`+;SajU6<7}+P}e(Y}gs_qJ&f0zZY-~52V<%uY1 zxSpT6FA~e06hP|YSCrpB%H?Ne;C!hXb58gX-s3L+4jt|wJ#{@;u3CoYIri>?vMH}# zFaUDm{6S@l8Hz6~#vMCU;P{RUJY}+(NCy8$E{2rTxz0I6QF(kp*jbp-ZPt!wx z>3?KV9uM6`pJ3bGc6Jd?!4=&%QNXa6_S&q(?ca(^ zwekCn7U(f411&Cz!1JGBnAf%wld7ll{XYxRS7*wJ%aNB@A#jf1uG6sT!?_gLfR1;F?kkJ}Z%<--J0|-+TrB`LO5I zO867Uxo6UGj^ErNw**~A6qtWqQP5^u2$e4CTs^B0bDiCW%4_wQqG`?`J9wAmcjaJ3 z^;0-s5KdkQKf#P=Z()I1Dbex_CMSIJNpF7;&R`AbvbmBV<@O$uY>)GHE?dbLr9v#N zZoq+Qi_pK;io7?U0Q+S`$-S@}&~Kp1kM_33$iKG%V{9Q*O@beHHIPaheI|KL+t9h< z8Ob}pjQlKYg6qX&l6t7!a%a%=3bIin+u5&N_>eu zYuSovmGIxB5A4;CihSeqTD-^q#h_oH8Kx_mQ=btb=yXYkn{Q8m(>XKT@<6H?h9OUP?4>5Rc0jF|Hd&xZpk(+ZJ#IS-&60OupY$D~Q2imoRs54pmZLPMhxW$ZNScG@E)I`LoUSVPXQ2N zN5OiZ6F2`@j+=a*&>-n^ymh)y$(!^va!=!Q0eXZxk=va2dgYIFoI33U3OTAneMS;_a4C(KLP%XcVueACKjt=cm8;Tj?@Jl=qQ4@7Lpx9Y1JY#tk~hb(|di z$oWAZtikH3iCD0b$GQ4l*pJUvpk~k-dhm-ra+QAUn`w)gFV)sCiUQ1o$$Z{}-WDvq zIvq{Ee;}In_voVU@1fE&1D34d96kxkc=F5xeEo^i3sWOuTxkZzFHK;U8kLeI6{%>| zP>7mj8MQyB27h&9Xhk`petUjXhw&7w=-q%)?$Xq^=MLGYwF+)*JWoIC2{AH(1Z^JO z#1JDDx|(BJiMI@H&bB1+86VmA(Z^B#c`4@pxlT9ibpi>Wb<7Pn6Ocb5!MtzIC1<;0 z!L1+BF5HQ>mET6OWho|pzlUMMpJ^Z;bQcpePmyn9OZlHAxLL@OKb6&r7iizBOH`um z4?Ikk!kHl{IJPQ;+Lfvy9qpng)pyd0>2)U6qPsZffF;`V7vO5GWb*uv4hS(7n3gHd z7_1+r|JHmbR45U*7w6OFn_Pdwu2Uw1;o%rfRN%%Z0SNyr4s$*nN4+%~^g`Tv#?G?~ z|IN9KSJu2E$E%Z2;j2DMEz`w=CYQ+UNKUrWQOfmIDC3ru1&}GFMst^N6=jQk=BLR(+lzK+Nq3eY#Hv{mCb9pu@h^29e4qv8F)nLE!o34mXb;* zFe@h665;P6Y>{0Reqa2P4wRJP>x28qYGwvMCj8$Ad7gBb~b4y+*Is#XQx(D9sr~O6u9}0NF z^a{-^)P#*IuF!}aN6sg+1pG`KIGB`o4 z^WSRp;22~J5f_a23&i*1#i;RY6}!(pj?u4vPPaeaL2BC`AisVJIi#=woo9Wep6_$m zmc;eU=evioe%>(IV1Ahvxa^>1$sLf$Uxl_6eYmQ2C#kO;6GZ;?8 z@ZnG>84Q8SXft#VbjD>h`h272&+zb%B7aSWGPg6oi2H?)k>a_RAorFvGJK9h6pqB8 z$Nypdny+}sMvztXKaOV??V{_Xg6JGkMU?q97j|9DhZ$NLFnf9d4C+cTNA!JgkA^!& z6*2Y{?|chcz72Mw3y;rfY# zu*oPA2WlVDc~<9HHuf~SahyW5usEI&G@vRQ8i=P-0G`QlC8i;}nO&U-n+nCyv*RlL zl2L&(-?DV!_oow&}<&`3i&6u-?E1lOJz8mc)N`*+Pku`p-c zdtx6(KD$dSCkHS-wpF~I&H<9SDT}?6`h{li=AJ<==XLL~IO!^UMA(Q;xcrSE)^Arv z>&RbtaV2+unsErHtXfYmi)U5-9k>aZk4u;&wK<@D`3kLngWx;%on0VW&TD%Ami%mJ zpcXqi(Md*uKk&-+s%HxkJZsNkR1_DoNwKNjq`lrwMsKR z)|Xv1(>a(ex<3UU3arD=8;bDn+4-b*^;)`Fy8#0i%i$DxNt`pw8E@B~#lniIOs~c= zyq(@f_g2G`UIQTl8_FHA( zJ8mB^s3E|*=vgrx?uyLh-t}bi*e*2CcmOfdFL8M8SMp(G4)QtgKvI)4B1&VI$szdzF9Z8bbmw zJHsgP?^vo3Bkz2O`#yyq@=cI`%3#a{_ef&>t#w>|CE^T)KrC#tz(pMEvQ^QZowD2m z+pc)wxPx~bWZNpRhoj^omJ)5c+%;?5!V;}Do!B&TCJiekGt^`bAU1tdK z7X-A>Qmdn&CpAo?Ne^J!ZtUroVOAGjqDR7aV7b^+`d^wpDh;TjkilwPd^Fa?^7RX< zn{b?5`;v$?gFor+Z_`PMwgcJ-PD5UcAQR&9ir4uhvuawDBsR{?rAMC%pic*v?^tpk zd!EFj)KCR?zEI{HcZ9%M-wa5*-+*>4lNkk#2=*Jtp`OrLPVPJi!05tVsJN2zyZ+?i zyr-tTsS!uJO!pu=*$rOvt7S=l6q;eGqC(93eAy$J17q zDNJ%jFZlI z0lCaM;k`#Ys0-I2P;>PLPkl5CtyfLNKf^U7$$U91>Go$kwYx!l=mYYrg4kxoRrK0N zV{*hX*|_)2IQvQ54>vdz;s^0)e841tG3lgl^o96ao@GK|unFAh`$ij=&H?pLNjUqe z8Vzax%4Nm^Nh>$wX)|&|9qp4OPG^Yf1^HmdNF=NbJr7S7d}1F5tV3%X0a*Q?Ha>dV z2NJGbysz!gVA+=s5VrXnS^G;KW*h3^y|J5c&_NrGh6MOk^LVg{cOL^qZnJ5@YK*I_ zC`R8HHmUM8rz%}<>EanYtbhK2G;T`AeVk86QzsK?=Mxlm5@u>A1kk0cBp~8T7v^Nj^SM|e zmkA1|J0N3+BJ<^}6(}FHCXo*=v+Gjw;byE3);0W{&;?g7lT|tgP5#U%Mu(6Gbg*L&&So#drw?2v*}oq{ z)|Aju7d8Iua9O6QM}hw@`Vbyd&mmS@s>u4y!gOy*4_%V1fE%-|py;a?S3^IYI$g*i zD&-gH(x+2VTHg@!Ha#I8k__9*`IIybq|u>}rB{A?vp+Tm<63+c{a)e@&{-Urd?_PoHT5Z-OqXfz-1z|}oA)Xsy!o0{if)1Fi~ z`A8C{s-#e!fD(A6rQm`O#b9l61D{!c!{Kltyl7TOs=^J}uUbfdOtCO&U(rHUf6nCC zy&MdQ$`O>z730pAbS}0egT&FA}ocR@0?Ep9&lY?W6`+#Xc)MB4t7;RbA|re%GJ~_6r@(U ziP&PFLa(a&P{pWu{JaI_9Ft+hKOe)*e>|j_vsF8>Fi4%-d#a)5w27F!Djee*D)8orsplsJ2``$Gr79#wzB-SD;tA^$q!ID;2*sb>)S__Y$2EQGx z-^eq*MeX!_>~@%~+7G!^XIQUFK5?uZVMEtiqw*nc=kDN)n)WP)S|&i_%{!bj(iNg= 
zZj$H{S2pi(H?=hjh9A1Uv}}0?aawp50=(kUeCs)w@W7F3*scNSuYz^oT|j13O!mqPJMAqjr=T>}7f==DX znxZT`Lq5}Kv0V0fmojb-iGa*@&dJ+b2hxw0;|9qdUPLeF^it&0o#9JyQgRrUbGfYV zg3C}<^YXHYlX0fs9#;r`ft^w`2=2s+$Gw|p^yhfi!c&#fc*tI4@w z%I4tL3qiEN;iAd!5m7#@&&3IIOTcv11*7Q!KGZ(zVAW)Mu7k{~F)stz;PidNwtjeBN+)kO% zm#p}TY4l|%&@*%OsrSOTs_Nk%tndGNI@53}zxQv;JkLXv1}c&=W!!5mO)7sl(2rmsX3BF!nyX|DhMJD%fs-Z;GQ!rtt?*SfCH=RAYO z=)=W_IP7UaV=Z$zg_Gj+b>7t`4beT^t4{{PJF9PUrDhj6=RQgF%HFxrFT~2ut>zgw zUqzEDlo-&w>J)CwtSq}-54|WluF4IJy2S-42hm3pUUTk!uecd;Z-iezj;5!lc58O6wGY#5=cu_(16{s^qEEl_nvleYY*vAsV8+DmNulO z8kW>xS{`?9*+oHv=|cKc%9vJ%Jmw~bYS8DrHwj(J4hRRU9%ob`eg}fe5FWPQ7m_2{t=vzy&X0w-JtG@=jrR& zJ0L}Ws~{$N39OI_0NZzN)Zz7PFkL>LWQSycz)6P~9dCmNTMj^8RS;cUlZH=b%!T`f z2_U{%7VWyH4UPNGLd~8pFd=vn^u$_lO*daaNQ)bw7YE^{hXi@bf1uL=QgHr@CCo0! z$Jg2?ur{eu{AAKI5X;E|uSI>}r_%=ZvC_=*H_!fF*?@nTrU~vW*h1GSe?zTX5ZG`d(dCug>+?@J@)=I6OuPYqs7;x@uRkg*Jz1EDcUi0#YJYY7tCqy>(B7--e#EIW(ngLEkSz;t9F?HtCsT7N^rynCXh{~|CT(`stwD{Ltq^LRQ+5OZz`6vRM$`8!o_qCq2EF^{ zo3QhfC$9Ya2i3GZ#`b0fGM{{e2nwx}kSHZ7ZgNDD9_8dump*lEJ17I_ff~Up>)Eak>Kb!Q`oZd z9?W?^2#tDMv8iPi%s-a}bHr7#*o#p7%lbLUA$~4$=moqzvIvj)QH4^Jt!WI71(jO? zFn`SroWqTQ#9vlGXRm`S{i$@@jcl&`(Rj#!tzi3_&rJF+f#v(>!4~aYtS4R#ovx0! z&wn&|Y5PUk#=nmCyuC_KGe!0?@hXmW{mb*jSAcgl-)$X?r?))#_oh*)P}fx=EZmy` zBd5ctfmSJHy`%7~_+co&JP{5y8PkCz8}NVKh4a7tqW3PY!JDQTQ+ri&Qd;PZU&_?s zz3R@ea_0+}wjdD>v;>erq=Elv@p+%t6X4u@2GwNmMo*HI(4!u4a;Vt}8y(2!nHZ6L z!JFUX%vYkR)Pm1WJjNcX;?T3;rd-h9PDWb~z<4mV3!us?kR332(ilUnZewrGfmoP$GQ@22~ zz7G~zxtcWIPNW`s4!}-{P+KioR(bj+BoEi%g*|n6oy20Aoqw0_LuR8Le78S4>>McU zIgX#YDnVoU9k8%!M`P(3te#Nk_L8PMm#*V?g*DKl zok%mg`xRw?Zi+deL+i< z;pE~4#3^&j99}+JGJ6k3wGhHr$%w zOx3MhfR*>qDf*pIy?hN=m$gHjJI{eWRLrH7tFn7$<8a>Vqo_tY0Q-OFMNTgB=*+|l zco)z|B^?qty~RGDf4*8k`|Ch2cpg0y_Yd+64uV0tFH}CrhWU9KkRu;Qi}^Y4xV>BP z*~%PhC%ql_iERK;!&Gwk1kY)#PlT%uzBKC3d(ibf4j$iJLG!ri|D2W}S-Tll-Sx#j zt|!s$TXRUtlV&LU#) zl-rKg$|bmyE;3+$^fcbst%)wYGKI!JA~Z+uCFp5-!1M;n4O#g@mAWntmpz6%>o*I2 zJTS+P2R*@c*+w*N7Q_3q%y7W2Z>U~;1_{qDLNHbaNmmR(hwDQmxp_ZsSb7+wzU$)y zi8;Vj#=~J2gFRO!fpQoJ5lPG8#vt#XNgouP&Xb|8(_i4itT-&a;win_;Rc&u^K&f` z3D$ghFXZ*B<6qY?c@ma{C!`)l%g{6|{xTN4wse5X_aW}h)&Z2i_cNr3K12Bv4+~vx z908l(qR=aoV_RmQ%jYtWQXkh)IBX?L!bP`|Mu`mI`Ce3Io;SK}V@4I`7~vaet8qe@L8MiYm;E3>uH+g7=ai6y#g)VdTipP zXt(TE1$U3mKoyf^*Ydde+F*L zXSl4Y3pUJY#%C}UluH|86lBtD6g_Sub3ot+2G+Y=!;WFD5gZ-m{9&w<^VJt)e>6Pmt6q2Qa!L}uqs zs2ga+wRcQ#lWh!Ai*2X=$zq(`h9G1%1l*^ew;=!aJ(RKaD;M9d0B%QpX*E@?3^`>S$i54%|8H-qmtAx8VJebUekzg6AD%VQ;L(NnRY)D7iERb}!dLe=p3(;x~*jceM|E z?8oBrA9Ki1mpZNN5EX4>LK zYbmD|5ujSP>;6H+na8x@I`j z3M@}T7n`JwqK5Y+u>n6DmM`e!9E%!Yq3bbtwBrh>J8R=5tCvE=^aY&z+*5espWR6gSLpOLn zr7jguuY(0E`e;t!eVjA69>TgtVS$qft{4zwzL)O^4_b!dIa?LT+KKIOBFP6j?#^aP z!}>6Fb^%;95P_K6s|oBg!K>pWNZwL!uBi7rQaY6af6^8~GG>6Hv0M)}yJf4oc8 z9mmo6xytN-RRV|>GT5@XlWLl;hhB%x^zjyNIJ@>6Ec*l8Hhn|1cC!{L5dBXuQ$qrt zb+{pvGaYKTB!cWSd6s=X2Yfb|k!p4Yf0^rwj~E|ttnmGp!7NP>urkN?8!= z6~Ri{(HKxC1S-Cz`~0GjrLY%8b6K?SM>-8Yn~5uS#^4`Q%pm^4c;r;`0`rXmx^tQl zxTRg@+!Jq5Z_7Q{$>jj;YRCmg4;lRV@&R0AZUATgQw52;59u6ZZBi|AnTi$gc{DL` zcpy6iQi|q7dzl|qxpo?gR6JmXlpawy)*y^E=Fj4mj=1QQzhH}l5#O_N!Y>Ck@zlCQ zq%=Z>DLz8fSMsJ{?(|Z4dGa*2v?+p=-e0uU_7Xj@Tpp4{((&1B1lyghLF+&XJkXed zNB{bTve;|39QnJA4i|#!1M?w)<`76`I_8zq3^BcVUksth9697l0VyV4pIaE{}!-GbQ zdWBSh_D>li-6KhYlSi|QJcHw1*Jk+d)i>;~zJgZWmBqz7pP~Kt4q~Y)L+p|32>1Ku zfYGfF5V3X#Xq_^p@8*0KUgx<82`>_Giw1y;gdN#F*iHKrX3}e4_uvZ;L>bBAc{(*U3|Od+~E<>1c<(RTg;b&n56=paXgb93XmA z7!7k$0oSH;-0LU1U?BF3Fhq7Ec9`-J-xia{c6I)={>n>Ozq}4lR{hC$7EAFznYXm? 
zqzG7j9k#RNKVwg+S3!$E71$J3!6IvE|iR+mbwE9pJ%}S8wg)* z(}&LXSh$t4i>j9%Kudj<$ivn{usG5L27wExwk%*Z&XH4o%|Z;Hoj4uqg4gC}swK?QGIvmdX|*o9p6Z_z`C(y7#* zwVd>FFI=OZ0OcmT(W0bAEbCHSqDW9-qe-=Ek zIK$N$w9^Sb&fr_?!t)L03fh-w^Gc*_E;r>EB?+srmz5#7z38Uxb?3PI-S+V1t1->G zk&Py-EVXuUUn0~^*A=Sl&lWbTjJNG;iD`P7zDW2;rn0GAy+?Q~I7{fc_Hom93l*V( zYm9J1b4ycc_aWQb&Aq~?!>vs-hDEuZLpPh=6rLCE{!%D>!4(SU^ba?Aei7w5L-sY5 zWa$fMD-Jfzym7Nh{nHKM9wsZCkawoZD!W0LD1S!SDRw|8+m+uGFKS{}zJ7!7-kSSD z8c|{`zdxes_ocxm#n^K~*={xAfcXdEABPbeO|v@T;4J;7|K|1DZo8r=)FlSO!_8NP z*ZQJ`g72EbowtI8J6@g_Rw*qK`aXTzw52~sSbpJ1ll!_Q26y}yikDZgqy$E zHHoi2Csb&E&Yxz^2=6YlvbpE_uj#`9jV9?p5#i}$ql6*(On5$UvoK~PziCYTpC*~v zvxP^GVWIULTOl{aLwKpXtw|ENCee+fgkNKOggbwo7h2@c7e4MkNR}u3$J5Ql;SmRR^S?(Zhx4jed1j%dxfy*5{5Ai-_b6Y zNnm?Pi50ih;r`=+aDAyGM0~5{*+H$i3%`Q5qMJ}aYXdsRvv%KFJ3~oiDl$KO7x@$p z(p^WV;sE#koL$UbSiU=rPF`S+_DY7^HQoP(4(rH~Qwnu}#%mpTnB~DlVan7O6xa0B~ z935T>8m0<#NbMwij2VmP+}egWEjria>hKJkzl(v5(KdKc_#a59iwi^wyHL*LEVTbg z37(d%g=5nGp?8qNjjaEUdj1vyE&4_GH*bJ@<1%5)jmKO!>E@YHPatytacn4Ej&wQ~ zL5k%|@X^l}M6phsFTM{?dVK@b8u*^TjbgAUe}|sB1Y;Kke{44G9X*h>QE=^{G1=fQ z!WP_{53@4A(G$m$h)MDc2&xvrUOP78Zu>A?rj$+%%ujLuN|WI6nX|}$jXJa5nsW_1*=PHA#y7m;F z*R>8e<=5l&w;aHs2q5uIJT$-TrUM-ZiOQCV>{?1Gmi^cVrV~W5_AWcfyRi}?Y?c5K z6eFg#jUKL91)B#i&`etyR$*X_H2jZ&_{m^gJ8eC#F5HPmbyTD5Do?UZ=K~c9jmG<4 zpTsNl#K35dBsKb+fv)lMaBulQs*TFP+uju(CqTc3x~A}$DddBP;a<3x3M z37&SA_kC^eqCF-I3uIrwlQkE)5$CxO5E}$9Cs;xFT2quVBNg(O=z_4d7PPxD&#luV z=J6?X?6RNqR-_zEe3WTdzimG%sTmJ-2`Ax#*M3M-I|#cwqw!}M2X61(jd0&ulkQrZ z2mhY`z==f}q}f!QcE(rH!4o+sp+f@vr!0of+uKp|;5;(6)dU;W3y9D2Xly@v7uT6s zk3(Nu!<7}H#BpX8G%qW{yQ?SB(9{4BoxOlGl4QCh`67wquX>S;+WuCc2p> zAWu}rqg=a6yeZ-|JaWCq6+7yJ#03J&w!Xo}Zeld^;3RChPze|F+;$P~6L`^^wOBsI zn%iNij#j4r27%}j{{5#09@uAr$?Jd69&8JJmos46r*(Mjmvm~la2K8I84p+1YJ%%$ zK9BB$N!!i>y4Eip`RtOzX2~@mkvs~1EKk8`+yb1Ya0;G}b)jqCYvV3O3F7^FEHT@f zh@A4}Nmb(rUNv?Lxxmkds_)%LzlzLYOLV5iT;R7{gmLt)q}|7 zK?4M=4F;1Nvq;bTAUY$o3RM}u5O{T_qKzG$^!CnoU|KLNOpYt(UQJvE0g`ogH8bpZ z-mo9f(mw*r&dIVnv&`U}?`rT(ATWKI0=jlaO|b532i|^Ijcx4ifK1C!%pEjj)1r>y z9cBQwbH~u^c2j&%J_t$=KZj8Vt^&7VDfDgIfHH<8n^ycvg7mmU!o#cd!Fr(%p84|> z?K^Z7qQCnQ{g;yXjAJ7h+x>;>nsHQXaWu7by9;CXk0R|3DsbC251OBzMf;NX;;iy2 z;g_`kXwyL*PC|Skep8VIpZ7%It8%iiM(mbg>mGglZg~~V`_%@;5$C|^>H&Cp!-Su2 znV~{%GfwaQLlyP>Q4?8-zYtxhEYTzDvr=Jaj48y$i!s9=)4_kiEzs+aqVh7UAXha} z*wZTnN!2ej%5^gcgZW)k|6UZBUnNd-6U#|fHnI~nHpCPC}b zX|$|o6LehDz(Ms|0;j7g_(WtYmnGkfzNJ%?EcXCQpI!hB()a1vBd_t;KY4KTE}y&n z*-kI|dGoI+BckY4jnt!GqtWw3$SaxY`01WOnCtdipiw5ldtpSOSoa+^=&j?noJpl> zUthtVKiA>ohelLaX$gb-oZ;T`?-+lL2A@k41U}1`VqFUcedb4y+@4+B9iDmTGvJPY zPSeL;XD89R)Kpv>6pE4)Pa}y)j@Iszp^v|7vITrcbx!_O{G#m!&1}AgzqFR1UqKte zUw#Fx(Vs!g`WvBPsx&#W>MV8`cM(M1)tEpy zWqmMkB^x&$R%CK39$=%E7tl7m5t)B%1M%HrIHl_{?-5uJ8Rv?yeXasqBpWHP(h$<; zdcN=^P?UF_H^7>uKJX}SFH9G_gt}|1@wSi@&N8yWE_0Hb@YUgHtn=+RXZfcRAbCEt z*zAS2wN12KVGbsb)WAZ#7Mt2QP_v!dkid7xF0XwIsT+o9Q9&i-1{q+Tlx9%9oDSO* zd#T8_RkZ)(7Q8#|8rBRt3Fl|c!|4T|@TKHooYlG+Ztrx%?&}A6{zy2kc)S^w4kja~ z$Nz9?sReYM2%!pzQs`gzf2gGSH_fn3!h64thKj~7)GPfD_xS8Mn7T3nK7F?)U;bp^ zX*ELn_D?jHiwl5UMV=RBjc}ML&rf+81|g43Aa8*OoIKvowQa8es6B}1z6-;<_}uxC zj-T}CZ6^r*u@Gzd9fADw8*usagD_P!1`fOi`f7^=4y{k)8ja)eM}EGK4s8=Aw_F3ad_@etR9uIg~unH{Ht>o1|8iUb&WY6VT~ z54|~T21Ml`no~bckUgmY%3MVtu=5UxtoVl1wRBOeWd>-K@@&atX|VeC9xk$U6&(&& zV53nFZn>UE)BK(6_P*YQZFS%B&p|Ofq_e@_@C|$n5{Ff7XR(5|I7w@(!VgRxLA!vT z@pV{%R8l=%_h&3NTd@bexq5;B&xO$I_7ox|jQGF%O_Uy@j8$IRaA?eK2s!)={tX7< zs8LpAc=|9`TEO3f{&YhkOhA@^-`P5<#qJ(ICQN#F4pN^a!_JTcf@1+8kUeG< zXr<@SgztNJ7jZCZ&pwaG&S{3ZwPN&zXeajN85b$+vcU2}EF^tj4PQ$_!OP&LV29lc z2=1(d4R`ZFPyQ@!WkI%*>s$p-rT?Sz{qrfOV}oUz-{ITENAac5dx&-%L8BWriB^U^ 
zw#hbzLz`UTKkEZT)uNUDNsNVi{F$^*?HyFn|0^BZY!k(Q` zYIs{Qo_h4hkqx6pFtyjVZweS8#SPD~T1+?D{(b;;-E|-qL7PbWGkG$|&l)FtUO?Xl z`tZ7Iq4tVj_abS-TV(K0D+#>XPGka$@yXTEr0T_Z`d_RY3FiKy@ROI2qi+zIakZXo zRm;I?QZq^Y^zS76K_4Eo_YH2dxJ;(_DiZnGI%I7}77p+(BJU?WB9e;{$)1JD?Aytt z%%p~V41bQl{px_T5%TD)u{qWu4f3c( z6mQ~Q;T5ZF$l~ys-Ctw7+E=6OCAMgj_WcXU=%CMd znO!FFdLKp(TFxSAr82}$Om>vJ>!>m*QEB%xlG?Ho+t-JLS*waKTkqtdBeucMTaW`lXUz- zn@8{ZBCA%2|uPu!A?8=l6-%P(Nzd862*`Vbb@ zwVr9`NE4saR;-$~<=z=m&p1FX5`pAY8NGh1K(FQ#&0?Hgq1q?P?%-D_2HlPO@X0 zXP6V&^WOMLsx2#b$`)3J<-+WkQ(&Py5rQwC#r?}iV0rf?_BveAGPZizPvE}5IXeE&RbZBJLnBnm+#8XgAvJ)D>e5(kTsm%+D zw(@#}qF|U?V8K4yyU;R8Gj>Hvhm9X_fdkLXm^e?`lFFlSEm;~aE&ql~!`85MzJo-6 zVK`ZBl3;iEj~l4p?1LG81;|)Tik*V%^!=t7Haoxst}WTZlIMCbIfEwR8e+$$$kmX< z@v$U+VLH+D)k4%`1EaA0au!ZXJ(@%S)8#FlMG8B<==rpjxgwcn93ll84bPGug2W0g?LJ1@g5MvHL1fqPJxXnjE{3WcdEX z?CAr6P1kvtsBJ|x|0{vI90RhdSBy!U4N&Q&6%DVB%w}@`r4UQUDm)a@$=7{e(B8N_ z=-Tic{Pdqd!CsFlgyJ~ zw_XP@?X{!Gv)?U{zU&^pZ|}(TZU(ZCF*os32@O!4cmk2*o!rG~9uVhPhK{%JI)`CepxWCetyfsonveBx%!fy)MlP`-4986hGr8s4>CNs$qmk*ChSZ#fESA!DqnR!nB|?X94Pe~ca0#W=V78T!7= zPjKq7F5zv+Fm3a0);mL*{g&>8C)dV9t%M|9_kJoKYA2*(!&kU$*#h+G5Lw@x#4@`v zlj{ErsRp4;laCZli+BTJgR7ZHh$=46>7eiTE@iQ5ifrw|CB$Af7wYy~k)9K}WVS~; z>HaYv_8yXF_rz^^#mX(xeo>4S>Bup!)r@@WJx&S?E`oIV8(84;0j8XbWdg}#FgHAq z#H^Z%N6!f(jq7a4XR~!k7Ba-ArcM5)RNrP?meS!Skf#ys%qj>gZtQ`6j zO#Lq4*}tbSH&jZunW_+(r=jF%)m6B^N|I&T#5dS1PCyS+4>8|g#kgcvGi)#@CR3)A z!+skrqPFfE%(Qz>QeJ<+8V~H)pZ=NT{^VZzUUWQr*yD;*BuCw1vOy-FTj(7(mtI|5U$9*o2c$myGhb`G#y>Ik(ga&CDcn)uh z?!wHwvP`OL8a{7%oPD&Ws8s$P9KMnRFZx>{K2(JrTYi^&akUz@Wt}2}qrFj2%_qEP zeJ1>7ZFq}SJS>d;j$f}FLr=<8kmj@J$o#H5aHTRDI{q-?xxNmb@)#YJY(-|@`<>S| z909+>ZZQ5~gK#2`nGM!D3#A*B@!gJYy8EyIsYFMSo7r+`!sj+>@YamH8ugKMQayJwfhru$ow1CMT?+Din@k2YZE zo+z?`k{=*8@gkWyJBn>xH=hRo6|=WVok~WdOp^0_HjC|_Vg1&=f#inqXPa(&VqU`| zVD47Jaf^H;_n`pCKG$Fx#BQ!q`2yS4*ig~udx`VW<>XkwTjUcy06+Pd z%A!-MY*lJE+zxXfg*m@b=+Hs_V6za~`PQ_`;B~aS7e*;5oXY$%zo~; zN80>d@vZAqU}gPXh^V>-9%*?zUMU8C1skzdU5YI9(ONR4C!d^i5)w_fO4_(`n0_ql zfO(5ffU>6Blo3XElz+IEOJQzE`njViF#6nqO_~U(CcKcS6k_C06S`hJ-IZMWWBvkwxLVi2|=q zaw*l~@QiES*L#C8}x1Npr7$fjfth@8$R$Fi4_=cm<}@?{0Aa$AczJsf7DU)Gbd!xGGC zq7K_}ZYo>q7EP8(+OWhb0V{6mgl44)`1hu6n4~$8*?FEJN1mQTO{!~$IkS0ogeM0+&5|40FiiZy2*HvZ%%iU-Y@B=X<16xhyxHXlD2 z#Xf1qvDK1Kgr-c0W3CYRsj$yZ=$1u{wuF$M!D)EcKn*OD>c>+ri8AhPIs}~+3jcSN zm36l>F*2-ZU%?7=ESeRaH*zz)r0hppRPFUrC*l2L6xsSKM(oYPa=U;g9(c)L&}9GL I1x@z<0BZ1F)c^nh literal 0 HcmV?d00001 diff --git a/tools/accuracy_checker/data/test_models/SampLeNet.prototxt b/tools/accuracy_checker/data/test_models/SampLeNet.prototxt new file mode 100644 index 0000000..d6b158f --- /dev/null +++ b/tools/accuracy_checker/data/test_models/SampLeNet.prototxt @@ -0,0 +1,116 @@ +name: "SampLeNet" + +layer { + name: "data" + type: "Input" + top: "data" + input_param { shape: { dim: 1 dim: 3 dim: 32 dim: 32 } } +} + +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + + convolution_param { + num_output: 6 + kernel_size: 5 + stride: 1 + } +} +layer { + name: "relu_conv1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} + +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + + convolution_param { + num_output: 16 + kernel_size: 5 + stride: 1 + } +} + +layer { + name: "relu_conv2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2" + top: "pool2" + + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} 
diff --git a/tools/accuracy_checker/data/test_models/SampLeNet.xml b/tools/accuracy_checker/data/test_models/SampLeNet.xml
new file mode 100644
index 0000000..f3d55ee
--- /dev/null
+++ b/tools/accuracy_checker/data/test_models/SampLeNet.xml
@@ -0,0 +1,239 @@
+[The 239 added lines of IR XML were garbled in extraction: the markup was stripped and only the blob dimensions survived. The file is the Inference Engine IR counterpart of SampLeNet.prototxt above, describing the same topology: 1x3x32x32 input -> Convolution to 1x6x28x28 -> ReLU -> Pooling to 1x6x14x14 -> Convolution to 1x16x10x10 -> ReLU -> Pooling to 1x16x5x5 -> FullyConnected to 1x120 -> ReLU -> FullyConnected to 1x84 -> ReLU -> FullyConnected to 1x10.]
diff --git a/tools/accuracy_checker/pylint_checkers.py b/tools/accuracy_checker/pylint_checkers.py
new file mode 100644
index 0000000..a42ccd6
--- /dev/null
+++ b/tools/accuracy_checker/pylint_checkers.py
@@ -0,0 +1,144 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import astroid
+from pylint.checkers import BaseChecker
+from pylint.interfaces import IAstroidChecker, IRawChecker
+
+
+class BackslashChecker(BaseChecker):
+    """
+    Checks for line continuations with '\' instead of using triple quoted string or parenthesis.
+    """
+
+    __implements__ = IRawChecker
+
+    name = 'backslash'
+    msgs = {
+        'W9901': (
+            'use of \\ for line continuation', 'backslash-line-continuation',
+            'Used when a \\ is used for a line continuation instead of using triple quoted string or parenthesis.'
+        ),
+    }
+    options = ()
+
+    def process_module(self, node):
+        with node.stream() as stream:
+            for (line_number, line) in enumerate(stream):
+                if not line.decode().rstrip().endswith('\\'):
+                    continue
+
+                self.add_message('backslash-line-continuation', line=line_number)
+
+
+class AbsoluteImportsChecker(BaseChecker):
+    """
+    Checks for absolute imports from the same package.
+    """
+
+    __implements__ = IAstroidChecker
+
+    name = 'absolute-imports'
+    priority = -1
+    msgs = {
+        'W9902': (
+            'absolute import from same package', 'package-absolute-imports',
+            'Used when a module of the same package is imported using an absolute import'
+        )
+    }
+
+    def visit_importfrom(self, node):
+        node_package = self._node_package(node)
+        import_name = node.modname
+        if import_name.startswith(node_package):
+            self.add_message('package-absolute-imports', node=node)
+
+    @staticmethod
+    def _node_package(node):
+        return node.scope().name.split('.')[0]
+
+
+class StringFormatChecker(BaseChecker):
+    """
+    Checks for use of the '%' operator for string formatting.
+    """
+
+    __implements__ = IAstroidChecker
+
+    name = 'string-format'
+    priority = -1
+    msgs = {
+        'W9903': (
+            'use of "%" for string formatting', 'deprecated-string-format',
+            '"%" operator is used for string formatting instead of str.format method'
+        )
+    }
+
+    def visit_binop(self, node):
+        if node.op != '%':
+            return
+
+        left = node.left
+        if not (isinstance(left, astroid.Const) and isinstance(left.value, str)):
+            return
+
+        self.add_message('deprecated-string-format', node=node)
+
+
+class BadFunctionChecker(BaseChecker):
+    """
+    Checks for calls to prohibited functions.
+    """
+
+    __implements__ = IAstroidChecker
+
+    name = 'bad-function'
+    priority = -1
+    msgs = {'W9904': ('using prohibited function', 'bad-function-call', '')}
+
+    options = (
+        (
+            'bad-functions',
+            {
+                'default': '',
+                'help': 'List of prohibited functions',
+            },
+        ),
+    )
+
+    def visit_call(self, node):
+        bad_functions = set(f.strip() for f in self.config.bad_functions.split(','))
+        if self._function_name(node) in bad_functions:
+            self.add_message('bad-function-call', node=node)
+
+    @staticmethod
+    def _function_name(node):
+        func = node.func
+        if hasattr(func, 'attrname'):
+            return func.attrname
+        elif hasattr(func, 'name'):
+            return func.name
+
+
+def register(linter):
+    """
+    Required method to auto-register these checkers.
+    """
+
+    linter.register_checker(BackslashChecker(linter))
+    linter.register_checker(AbsoluteImportsChecker(linter))
+    linter.register_checker(StringFormatChecker(linter))
+    linter.register_checker(BadFunctionChecker(linter))
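Editor's note: these checkers are loaded through pylint's plugin mechanism. A minimal sketch of invoking them programmatically, assuming it is run from tools/accuracy_checker so that pylint_checkers.py is importable; the target module name is illustrative:

# Run() lints and then exits the interpreter, so a CLI invocation
# (pylint --load-plugins=pylint_checkers ...) is the more usual form
from pylint.lint import Run

Run([
    '--load-plugins=pylint_checkers',  # calls register() above
    '--bad-functions=print',           # option consumed by BadFunctionChecker
    'accuracy_checker',
])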
diff --git a/tools/accuracy_checker/requirements.txt b/tools/accuracy_checker/requirements.txt
new file mode 100644
index 0000000..16cc457
--- /dev/null
+++ b/tools/accuracy_checker/requirements.txt
@@ -0,0 +1,9 @@
+numpy
+tqdm
+PyYAML
+pillow
+scikit-learn
+scipy<=0.19
+py-cpuinfo
+shapely
+nibabel
diff --git a/tools/accuracy_checker/setup.cfg b/tools/accuracy_checker/setup.cfg
new file mode 100644
index 0000000..5d5a13c
--- /dev/null
+++ b/tools/accuracy_checker/setup.cfg
@@ -0,0 +1,8 @@
+[flake8]
+max-line-length = 120
+ignore = F401
+
+[isort]
+line_length = 120
+use_parentheses = True
+known_third_party = openvino.inference_engine,caffe,cv2
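Editor's note: the [flake8] section above is picked up automatically when flake8 runs from tools/accuracy_checker. A sketch of driving it through flake8's documented legacy API, with an illustrative target path:

# returns a report object; setup.cfg in the working directory supplies
# max-line-length and the ignore list
from flake8.api import legacy as flake8

style_guide = flake8.get_style_guide()
report = style_guide.check_files(['accuracy_checker'])
print(report.total_errors)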
diff --git a/tools/accuracy_checker/tests/__init__.py b/tools/accuracy_checker/tests/__init__.py
new file mode 100644
index 0000000..43d061d
--- /dev/null
+++ b/tools/accuracy_checker/tests/__init__.py
@@ -0,0 +1,16 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
diff --git a/tools/accuracy_checker/tests/common.py b/tools/accuracy_checker/tests/common.py
new file mode 100644
index 0000000..7a85f91
--- /dev/null
+++ b/tools/accuracy_checker/tests/common.py
@@ -0,0 +1,139 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from contextlib import contextmanager
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import List
+
+import numpy as np
+
+from accuracy_checker.representation import DetectionAnnotation, DetectionPrediction, SegmentationPrediction, SegmentationAnnotation
+from accuracy_checker.utils import get_path
+
+
+@contextmanager
+# pathlib.Path cannot preserve a trailing '/', so entries are accepted as strings
+# in POSIX format; a trailing '/' marks a directory
+def mock_filesystem(hierarchy: List[str]):
+    with TemporaryDirectory() as prefix:
+        for entry in hierarchy:
+            path = Path(prefix) / entry
+            if entry.endswith("/"):
+                path.mkdir(parents=True, exist_ok=True)
+            else:
+                parent = path.parent
+                if parent != Path("."):
+                    parent.mkdir(parents=True, exist_ok=True)
+                # create an empty file
+                path.open('w').close()
+
+        yield get_path(prefix, is_directory=True)
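Editor's note: a minimal usage sketch of the helper above (names are illustrative). Entries ending in '/' become directories and everything else becomes an empty file under a temporary prefix; the yielded prefix is a pathlib.Path, as get_path's use here suggests.

with mock_filesystem(['models/', 'models/net.xml', 'readme.txt']) as prefix:
    assert (prefix / 'models').is_dir()
    assert (prefix / 'models' / 'net.xml').is_file()
    assert (prefix / 'readme.txt').is_file()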
+
+
+def make_representation(bounding_boxes, is_ground_truth=False, score=None, meta=None):
+    """
+    Args:
+        bounding_boxes: string or list of strings; boxes are separated by `;`. Each ground truth
+            box is `label x0 y0 x1 y1`; each prediction box is `score label x0 y0 x1 y1`,
+            or `label x0 y0 x1 y1` when the `score` argument is given.
+        is_ground_truth: True if the boxes are annotation boxes.
+        score: value in [0, 1]; if not None, all prediction boxes are considered with the given score.
+        meta: metadata for representation.
+    """
+
+    if not isinstance(bounding_boxes, list):
+        bounding_boxes = [bounding_boxes]
+
+    result = []
+    for idx, box in enumerate(bounding_boxes):
+        arr = np.array(np.mat(box))
+
+        if box == "":
+            arr = np.array([]).reshape((0, 5))
+
+        if is_ground_truth or score:
+            assert arr.shape[1] == 5
+        elif not is_ground_truth and not score:
+            assert arr.shape[1] == 6
+
+        if not is_ground_truth and score:
+            score_ = score
+            if np.isscalar(score_) or len(score_) == 1:
+                score_ = np.full(arr.shape[0], score_)
+            arr = np.c_[score_, arr]
+
+        if is_ground_truth:
+            detection = DetectionAnnotation(str(idx), arr[:, 0], arr[:, 1], arr[:, 2], arr[:, 3], arr[:, 4])
+        else:
+            detection = DetectionPrediction(str(idx), arr[:, 1], arr[:, 0], arr[:, 2], arr[:, 3], arr[:, 4], arr[:, 5])
+
+        if meta:
+            detection.metadata = meta[idx]
+
+        result.append(detection)
+
+    return result
+
+
+def make_segmentation_representation(mask, ground_truth=False):
+    if ground_truth:
+        representation = SegmentationAnnotation('identifier', None)
+        representation.mask = mask
+        return [representation]
+
+    return [SegmentationPrediction('identifier', mask)]
+
+
+def update_dict(dictionary, **kwargs):
+    copied = dictionary.copy()
+    copied.update(**kwargs)
+
+    return copied
+
+
+class DummyDataset:
+    def __init__(self, label_map, bg=-1):
+        self.label_map = label_map
+        self.background = bg
+
+    @property
+    def metadata(self):
+        return {"label_map": self.label_map, "background_label": self.background}
+
+    @property
+    def labels(self):
+        return self.metadata['label_map']
+
+
+# @pytest.fixture(scope="function", params=[
+#     {0: 'dog', -1: 'background'}, {0: 'dog', 1: 'cat', 2: 'human', -1: 'background'}, {0: 'dog', 1: 'cat', 2: 'human'}
+# ], ids=['single class', 'multi class', 'multi_class_without_background'])
+# def dataset(request):
+#     labels = request.param
+#     yield DummyDataset(label_map=labels, bg=-1)
+
+
+def multi_class_dataset():
+    labels = {0: 'dog', 1: 'cat', 2: 'human', -1: 'background'}
+    return DummyDataset(label_map=labels, bg=-1)
+
+
+def multi_class_dataset_without_background():
+    labels = {0: 'dog', 1: 'cat', 2: 'human'}
+    return DummyDataset(label_map=labels)
+
+
+def single_class_dataset():
+    labels = {0: 'dog', -1: 'background'}
+    return DummyDataset(label_map=labels, bg=-1)
diff --git a/tools/accuracy_checker/tests/conftest.py b/tools/accuracy_checker/tests/conftest.py
new file mode 100644
index 0000000..7657240
--- /dev/null
+++ b/tools/accuracy_checker/tests/conftest.py
@@ -0,0 +1,52 @@
+"""
+Copyright (c) 2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+""" + +import os +from pathlib import Path + +import pytest + +test_root = Path(__file__).parent +project_root = test_root.parent + + +def pytest_addoption(parser): + parser.addoption( + "--caffe_logging", action="store_true", default=False, help="Enable Google log" + ) + + +def pytest_configure(config): + if not config.getoption('caffe_logging'): + os.environ['GLOG_minloglevel'] = '2' + + +@pytest.fixture +def data_dir(): + return project_root / 'data' / 'test_data' + + +@pytest.fixture +def models_dir(): + return project_root / 'data' / 'test_models' + + +@pytest.fixture +def mock_path_exists(mocker): + mocker.patch('pathlib.Path.exists', return_value=True) + mocker.patch('pathlib.Path.is_dir', return_value=True) + mocker.patch('pathlib.Path.is_file', return_value=True) + mocker.patch('os.path.exists', return_value=True) diff --git a/tools/accuracy_checker/tests/test_adapters.py b/tools/accuracy_checker/tests/test_adapters.py new file mode 100644 index 0000000..9cb90f5 --- /dev/null +++ b/tools/accuracy_checker/tests/test_adapters.py @@ -0,0 +1,121 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np +import pytest + +from accuracy_checker.adapters import SSDAdapter, Adapter +from accuracy_checker.config import ConfigError +from .common import make_representation + + +def test_detection_adapter(): + raw = { + 'detection_out': np.array([[[[0, 3, 0.2, 0, 0, 1, 1], [0, 2, 0.5, 4, 4, 7, 7], [0, 5, 0.7, 3, 3, 9, 8]]]]) + } + expected = make_representation('0.2,3,0,0,1,1;0.5,2,4,4,7,7;0.7,5,3,3,9,8') + + actual = SSDAdapter({}, output_blob='detection_out')([raw], ['0']) + + assert np.array_equal(actual, expected) + + +def test_detection_adapter_partially_filling_output_blob(): + raw = { + 'detection_out': np.array( + [[[[0, 3, 0.2, 0, 0, 1, 1], [0, 2, 0.5, 4, 4, 7, 7], [0, 5, 0.7, 3, 3, 9, 8], [-1, 0, 0, 0, 0, 0, 0]]]] + ) + } + expected = make_representation('0.2,3,0,0,1,1;0.5,2,4,4,7,7;0.7,5,3,3,9,8') + + actual = SSDAdapter({}, output_blob='detection_out')([raw], ['0']) + + assert np.array_equal(actual, expected) + + +def test_detection_adapter_partially_filling_output_blob_with_zeros_at_the_end(): + raw = { + 'detection_out': np.array([[[ + [0, 3, 0.2, 0, 0, 1, 1], + [0, 2, 0.5, 4, 4, 7, 7], + [0, 5, 0.7, 3, 3, 9, 8], + [-1, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0] + ]]]) + } + expected = make_representation('0.2,3,0,0,1,1;0.5,2,4,4,7,7;0.7,5,3,3,9,8') + + actual = SSDAdapter({}, output_blob='detection_out')([raw], ['0']) + + assert np.array_equal(actual, expected) + + +def test_detection_adapter_batch_2(): + raw = { + 'detection_out': np.array([[[[0, 3, 0.2, 0, 0, 1, 1], [0, 2, 0.5, 4, 4, 7, 7], [1, 5, 0.7, 3, 3, 9, 8]]]]) + } + expected = make_representation(['0.2,3,0,0,1,1;0.5,2,4,4,7,7', '0.7,5,3,3,9,8']) + + actual = SSDAdapter({}, output_blob='detection_out')([raw], ['0', '1']) + + assert np.array_equal(actual, expected) + + +def test_dictionary_adapter_no_raise_warning_on_specific_args(): + adapter_config = {'type': 'age_gender', 
'gender_out': 'gender', 'age_out': 'age'} + with pytest.warns(None) as record: + Adapter.provide('age_gender', adapter_config) + assert len(record) == 0 + + +def test_age_gender_adapter_raise_config_error_on_extra_args(): + adapter_config = {'type': 'age_gender', 'gender_out': 'gender', 'age_out': 'age', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Adapter.provide('age_gender', adapter_config) + + +def test_face_person_detection_adapter_raise_config_error_on_extra_args(): + adapter_config = { + 'type': 'face_person_detection', + 'face_detection_out': 'face', + 'person_detection_out': 'person', + 'something_extra': 'extra' + } + with pytest.raises(ConfigError): + Adapter.provide('face_person_detection', adapter_config) + + +def test_head_pose_adapter_raise_config_error_on_extra_args(): + adapter_config = { + 'type': 'head_pose', + 'angle_yaw': 'yaw', + 'angle_pitch': 'pitch', + 'angle_roll': 'roll', + 'something_extra': 'extra' + } + with pytest.raises(ConfigError): + Adapter.provide('head_pose', adapter_config) + + +def test_vehicle_attributes_adapter_raise_config_error_on_extra_args(): + adapter_config = { + 'type': 'vehicle_attributes', + 'color_out': 'color', + 'type_out': 'type', + 'something_extra': 'extra' + } + with pytest.raises(ConfigError): + Adapter.provide('vehicle_attributes', adapter_config) diff --git a/tools/accuracy_checker/tests/test_caffe_launcher.py b/tools/accuracy_checker/tests/test_caffe_launcher.py new file mode 100644 index 0000000..205fb7b --- /dev/null +++ b/tools/accuracy_checker/tests/test_caffe_launcher.py @@ -0,0 +1,77 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import pytest +pytest.importorskip('accuracy_checker.launcher.caffe_launcher') + +import cv2 +import numpy as np + +from accuracy_checker.launcher.launcher import create_launcher +from accuracy_checker.config import ConfigError +from accuracy_checker.dataset import DataRepresentation + + +def get_caffe_test_model(models_dir): + config = { + "framework": "caffe", + "weights": str(models_dir / "SampLeNet.caffemodel"), + "model": str(models_dir / "SampLeNet.prototxt"), + "adapter": 'classification', + "device": "cpu" + } + + return create_launcher(config) + + +class TestCaffeLauncher: + def test_launcher_creates(self, models_dir): + assert get_caffe_test_model(models_dir).inputs['data'] == (3, 32, 32) + + def test_infer(self, data_dir, models_dir): + caffe_test_model = get_caffe_test_model(models_dir) + c, h, w = caffe_test_model.inputs['data'] + img_raw = cv2.imread(str(data_dir / '1.jpg')) + img_resized = cv2.resize(img_raw, (w, h)) + res = caffe_test_model.predict(['1.jpg'], [DataRepresentation(img_resized)]) + + assert res[0].label == 6 + + def test_caffe_launcher_provide_input_shape_to_adapter(self, mocker, models_dir): + mocker.patch('caffe.Net.forward', return_value={'fc3': 0}) + adapter_mock = mocker.patch('accuracy_checker.adapters.ClassificationAdapter.process') + launcher = get_caffe_test_model(models_dir) + launcher.predict(['1.png'], [DataRepresentation(np.zeros((32, 32, 3)))]) + adapter_mock.assert_called_once_with([{'fc3': 0}], ['1.png'], [{'input_shape': {'data': (3, 32, 32)}, 'image_size': (32, 32, 3)}]) + + + +def test_missed_model_in_create_caffe_launcher_raises_config_error_exception(): + launcher = {'framework': 'caffe', 'weights': 'custom', 'adapter': 'classification'} + + with pytest.raises(ConfigError): + create_launcher(launcher) + + +def test_missed_weights_in_create_caffe_launcher_raises_config_error_exception(): + launcher = {'framework': 'caffe', 'model': 'custom', 'adapter': 'ssd'} + + with pytest.raises(ConfigError): + create_launcher(launcher) + + +def dummy_adapter(): + pass diff --git a/tools/accuracy_checker/tests/test_config_reader.py b/tools/accuracy_checker/tests/test_config_reader.py new file mode 100644 index 0000000..9b364d8 --- /dev/null +++ b/tools/accuracy_checker/tests/test_config_reader.py @@ -0,0 +1,1014 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import copy +from pathlib import Path +from argparse import Namespace + +import pytest +from accuracy_checker.config import ConfigReader, ConfigError + + +class TestConfigReader: + def setup_method(self): + self.global_launchers = [ + { + 'framework': 'dlsdk', + 'device': 'fpga', + 'cpu_extensions': 'dlsdk_shared.so', + 'bitstream': 'bitstream' + }, + { + 'framework': 'caffe', + 'device': 'gpu_0' + } + ] + + self.global_datasets = [ + { + 'name': 'global_dataset', + 'annotation': Path('/pascal_voc_2007_annotation.pickle'), + 'data_source': Path('/VOCdevkit/VOC2007/JPEGImages'), + 'preprocessing': [ + { + 'type': 'resize', + 'interpolation': 'mean_image', + }, + { + 'type': 'normalization', + 'mean': 'voc', + } + ], + 'metrics': [{ + 'type': 'fppi', + 'mr_rates': [0.0, 0.1] + }], + 'postprocessing': [ + { + 'type': 'filter', + 'labels': ['dog', 'airplane'], + 'min_confidence': 0.05, + 'min_box_size': 60, + }, + { + 'type': 'nms', + 'overlap': 0.5 + } + ] + } + ] + + self.global_config = { + 'launchers': self.global_launchers, + 'datasets': self.global_datasets + } + + self.module = 'accuracy_checker.config.ConfigReader' + self.arguments = Namespace(**{ + 'models': Path('models'), + 'extensions': Path('extensions'), + 'source': Path('source'), + 'annotations': Path('annotations'), + 'converted_models': Path('converted_models'), + 'model_optimizer': Path('model_optimizer'), + 'bitstreams': Path('bitstreams'), + 'definitions': None, + 'stored_predictions': None, + 'tf_custom_op_config': None, + 'tf_obj_detection_api_pipeline_config_path': None, + 'progress': 'bar', + 'target_framework': None, + 'target_devices': None, + 'log_file': None, + 'target_tags': None, + 'cpu_extensions_mode': None, + 'aocl': None + }) + + def test_read_configs_without_global_config(self, mocker): + config = {'models': [{ + 'name': 'model', + 'launchers': [{'framework': 'dlsdk', 'model': Path('/absolute_path'), 'weights': Path('/absolute_path')}], + 'datasets': [{'name': 'global_dataset'}] + }]} + empty_args = Namespace(**{ + 'models': None, 'extensions': None, 'source': None, 'annotations': None, + 'converted_models': None, 'model_optimizer': None, 'bitstreams': None, + 'definitions': None, 'config': None, 'stored_predictions': None, 'tf_custom_op_config': None, + 'progress': 'bar', 'target_framework': None, 'target_devices': None, 'log_file': None, + 'tf_obj_detection_api_pipeline_config_path': None, 'target_tags': None, 'cpu_extensions_mode': None, + 'aocl': None + }) + mocker.patch('accuracy_checker.utils.get_path', return_value=Path.cwd()) + mocker.patch('yaml.load', return_value=config) + mocker.patch('pathlib.Path.open') + + result = ConfigReader.merge(empty_args) + + assert config == result + + def test_empty_local_config_raises_value_error_exception(self, mocker): + mocker.patch(self.module + '._read_configs', return_value=( + self.global_config, {} + )) + + with pytest.raises(ConfigError) as exception: + ConfigReader.merge(self.arguments) + + error_message = str(exception).split(sep=': ')[-1] + assert error_message == 'Missing local config' + + def test_missed_models_in_local_config_raises_value_error_exception(self, mocker): + mocker.patch(self.module + '._read_configs', return_value=( + self.global_config, {'not_models': 'custom'} + )) + + with pytest.raises(ConfigError) as exception: + ConfigReader.merge(self.arguments) + + error_message = str(exception).split(sep=': ')[-1] + assert error_message == 'Missed "{}" in local config'.format('models') + + def 
test_empty_models_in_local_config_raises_value_error_exception(self, mocker): + mocker.patch(self.module + '._read_configs', return_value=( + self.global_config, {'models': []} + )) + + with pytest.raises(ConfigError) as exception: + ConfigReader.merge(self.arguments) + + error_message = str(exception).split(sep=': ')[-1] + assert error_message == 'Missed "{}" in local config'.format('models') + + def test_missed_name_in_model_raises_value_error_exception(self, mocker): + mocker.patch(self.module + '._read_configs', return_value=( + self.global_config, {'models': [{'launchers': None, 'datasets': None}]} + )) + + with pytest.raises(ConfigError) as exception: + ConfigReader.merge(self.arguments) + + error_message = str(exception).split(sep=': ')[-1] + assert error_message == 'Each model must specify {}'.format(['name', 'launchers', 'datasets']) + + def test_missed_launchers_in_model_raises_value_error_exception(self, mocker): + mocker.patch(self.module + '._read_configs', return_value=( + self.global_config, {'models': [{'name': None, 'datasets': None}]} + )) + + with pytest.raises(ConfigError) as exception: + ConfigReader.merge(self.arguments) + + error_message = str(exception).split(sep=': ')[-1] + assert error_message == 'Each model must specify {}'.format(['name', 'launchers', 'datasets']) + + def test_missed_datasets_in_model_raises_value_error_exception(self, mocker): + mocker.patch(self.module + '._read_configs', return_value=( + self.global_config, {'models': [{'name': None, 'launchers': None}]} + )) + + with pytest.raises(ConfigError) as exception: + ConfigReader.merge(self.arguments) + + error_message = str(exception).split(sep=': ')[-1] + assert error_message == 'Each model must specify {}'.format(['name', 'launchers', 'datasets']) + + def test_invalid_model_raises_value_error_exception(self, mocker): + mocker.patch(self.module + '._read_configs', return_value=( + self.global_config, {'models': [{'name': None, 'launchers': None, 'datasets': None}]} + )) + + with pytest.raises(ConfigError) as exception: + ConfigReader.merge(self.arguments) + + error_message = str(exception).split(sep=': ')[-1] + assert error_message == 'Each model must specify {}'.format(['name', 'launchers', 'datasets']) + + def test_merge_datasets_with_definitions(self, mocker): + local_config = {'models': [{ + 'name': 'model', + 'launchers': [{'framework': 'dlsdk', 'model': '/absolute_path', 'weights': '/absolute_path'}], + 'datasets': [{'name': 'global_dataset'}] + }]} + mocker.patch(self.module + '._read_configs', return_value=( + self.global_config, local_config + )) + arguments = copy.deepcopy(self.arguments) + arguments.model_optimizer = None + + config = ConfigReader.merge(arguments) + + assert config['models'][0]['datasets'][0] == self.global_datasets[0] + + def test_merge_datasets_with_definitions_and_meta_is_not_modified(self, mocker): + local_config = {'models': [{ + 'name': 'model', + 'launchers': [{'framework': 'dlsdk', 'model': '/absolute_path', 'weights': '/absolute_path'}], + 'datasets': [{'name': 'global_dataset', 'dataset_meta': '/absolute_path'}] + }]} + expected = self.global_datasets[0] + expected['dataset_meta'] = Path('/absolute_path') + mocker.patch(self.module + '._read_configs', return_value=( + self.global_config, local_config + )) + + config = ConfigReader.merge(self.arguments) + + assert config['models'][0]['datasets'][0] == expected + + def test_expand_relative_paths_in_datasets_config_using_command_line(self, mocker): + local_config = {'models': [{ + 'name': 'model', + 'launchers': 
[{'framework': 'caffe'}], + 'datasets': [{ + 'name': 'global_dataset', + 'dataset_meta': 'relative_annotation_path', + 'data_source': 'relative_source_path', + 'segmentation_masks_source': 'relative_source_path', + 'annotation': 'relative_annotation_path' + }] + }]} + + mocker.patch(self.module + '._read_configs', return_value=( + None, local_config + )) + expected = copy.deepcopy(local_config['models'][0]['datasets'][0]) + expected['annotation'] = self.arguments.annotations / 'relative_annotation_path' + expected['dataset_meta'] = self.arguments.annotations / 'relative_annotation_path' + expected['segmentation_masks_source'] = self.arguments.source / 'relative_source_path' + expected['data_source'] = self.arguments.source / 'relative_source_path' + + config = ConfigReader.merge(self.arguments) + + assert config['models'][0]['datasets'][0] == expected + + def test_not_modify_absolute_paths_in_datasets_config_using_command_line(self): + local_config = {'models': [{ + 'name': 'model', + 'datasets': [{ + 'name': 'global_dataset', + 'dataset_meta': '/absolute_annotation_meta_path', + 'data_source': '/absolute_source_path', + 'annotation': '/absolute_annotation_path', + }] + }]} + + expected = copy.deepcopy(local_config['models'][0]['datasets'][0]) + expected['annotation'] = Path('/absolute_annotation_path') + expected['dataset_meta'] = Path('/absolute_annotation_meta_path') + expected['data_source'] = Path('/absolute_source_path') + + ConfigReader._merge_paths_with_prefixes(self.arguments, local_config) + + assert local_config['models'][0]['datasets'][0] == expected + + def test_merge_launchers_with_definitions(self, mocker): + local_config = {'models': [{ + 'name': 'model', + 'launchers': [{'framework': 'dlsdk'}], + 'datasets': [{'name': 'global_dataset'}] + }]} + mocker.patch(self.module + '._read_configs', return_value=( + self.global_config, local_config + )) + expected = copy.deepcopy(self.get_global_launcher('dlsdk')) + expected['bitstream'] = self.arguments.bitstreams / expected['bitstream'] + expected['cpu_extensions'] = self.arguments.extensions / expected['cpu_extensions'] + args = copy.deepcopy(self.arguments) + args.model_optimizer = None + args.converted_models = None + args.models = None + + config = ConfigReader.merge(args) + + assert config['models'][0]['launchers'][0] == expected + + def test_merge_launchers_with_model_is_not_modified(self, mocker): + local_config = {'models': [{ + 'name': 'model', + 'launchers': [{'framework': 'dlsdk', 'model': 'custom'}], + 'datasets': [{'name': 'global_dataset'}] + }]} + expected = copy.deepcopy(self.get_global_launcher('dlsdk')) + expected['model'] = 'custom' + expected['bitstream'] = self.arguments.bitstreams / expected['bitstream'] + expected['cpu_extensions'] = self.arguments.extensions / expected['cpu_extensions'] + mocker.patch(self.module + '._read_configs', return_value=( + self.global_config, local_config + )) + args = copy.deepcopy(self.arguments) + args.model_optimizer = None + args.models = None + args.converted_models = None + config = ConfigReader.merge(args) + + assert config['models'][0]['launchers'][0] == expected + + def test_expand_relative_paths_in_launchers_config_using_command_line(self, mocker): + local_config = {'models': [{ + 'name': 'model', + 'launchers': [{ + 'framework': 'dlsdk', + 'model': 'relative_model_path', + 'weights': 'relative_weights_path', + 'cpu_extensions': 'relative_extensions_path', + 'gpu_extensions': 'relative_extensions_path', + 'caffe_model': 'relative_model_path', + 'caffe_weights': 
'relative_weights_path', + 'tf_model': 'relative_model_path', + 'mxnet_weights': 'relative_weights_path', + 'bitstream': 'relative_bitstreams_path' + }], + 'datasets': [{'name': 'dataset'}] + }]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + + expected = copy.deepcopy(local_config['models'][0]['launchers'][0]) + expected['model'] = self.arguments.models / 'relative_model_path' + expected['caffe_model'] = self.arguments.models / 'relative_model_path' + expected['tf_model'] = self.arguments.models / 'relative_model_path' + expected['weights'] = self.arguments.models / 'relative_weights_path' + expected['caffe_weights'] = self.arguments.models / 'relative_weights_path' + expected['mxnet_weights'] = self.arguments.models / 'relative_weights_path' + expected['cpu_extensions'] = self.arguments.extensions / 'relative_extensions_path' + expected['gpu_extensions'] = self.arguments.extensions / 'relative_extensions_path' + expected['bitstream'] = self.arguments.bitstreams / 'relative_bitstreams_path' + expected['_models_prefix'] = self.arguments.models + args = copy.deepcopy(self.arguments) + args.model_optimizer = None + args.converted_models = None + config = ConfigReader.merge(args) + + assert config['models'][0]['launchers'][0] == expected + + def test_both_launchers_are_filtered_by_target_tags_if_tags_not_provided_in_config(self, mocker): + config_launchers = [ + { + 'framework': 'dlsdk', + 'model': '/absolute_path1', + 'weights': '/absolute_path1', + 'adapter': 'classification', + 'device': 'CPU', + }, + { + 'framework': 'dlsdk', + 'model': '/absolute_path2', + 'weights': '/absolute_path2', + 'adapter': 'classification', + 'device': 'GPU', + } + ] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + self.arguments.target_tags = ['some_tag'] + + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + + with pytest.warns(Warning): + config = ConfigReader.merge(self.arguments) + + launchers = config['models'][0]['launchers'] + assert len(launchers) == 0 + + def test_launcher_is_not_filtered_by_the_same_tag(self, mocker): + config_launchers = [{ + 'framework': 'dlsdk', + 'tags': ['some_tag'], + 'model': Path('/absolute_path1'), + 'weights': Path('/absolute_path1'), + 'adapter': 'classification', + 'device': 'CPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + args = copy.deepcopy(self.arguments) + args.model_optimizer = None + args.converted_models = None + args.target_tags = ['some_tag'] + + config = ConfigReader.merge(args) + + launchers = config['models'][0]['launchers'] + assert launchers[0] == config_launchers[0] + + def test_both_launchers_are_not_filtered_by_the_same_tag(self, mocker): + config_launchers = [ + { + 'framework': 'dlsdk', + 'tags': ['some_tag'], + 'model': Path('/absolute_path1'), + 'weights': Path('/absolute_path1'), + 'adapter': 'classification', + 'device': 'CPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }, + { + 'framework': 'dlsdk', + 'tags': ['some_tag'], + 'model': Path('/absolute_path2'), + 'weights': Path('/absolute_path2'), + 'adapter': 'classification', + 'device': 'GPU', + '_model_optimizer': self.arguments.model_optimizer, + 
'_models_prefix': self.arguments.models + } + ] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + args = copy.deepcopy(self.arguments) + args.model_optimizer = None + args.converted_models = None + args.target_tags = ['some_tag'] + + config = ConfigReader.merge(args) + + launchers = config['models'][0]['launchers'] + assert launchers == config_launchers + + def test_both_launchers_are_filtered_by_another_tag(self, mocker): + config_launchers = [ + { + 'framework': 'dlsdk', + 'tags': ['some_tag'], + 'model': '/absolute_path1', + 'weights': '/absolute_path1', + 'adapter': 'classification', + 'device': 'CPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }, + { + 'framework': 'dlsdk', + 'tags': ['some_tag'], + 'model': '/absolute_path2', + 'weights': '/absolute_path2', + 'adapter': 'classification', + 'device': 'GPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + } + ] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + args = copy.deepcopy(self.arguments) + args.model_optimizer = None + args.converted_models = None + args.target_tags = ['other_tag'] + + with pytest.warns(Warning): + config = ConfigReader.merge(args) + + launchers = config['models'][0]['launchers'] + assert len(launchers) == 0 + + def test_only_appropriate_launcher_is_filtered_by_another_tag(self, mocker): + config_launchers = [ + { + 'framework': 'dlsdk', + 'tags': ['tag1'], + 'model': Path('/absolute_path1'), + 'weights': Path('/absolute_path1'), + 'adapter': 'classification', + 'device': 'CPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }, + { + 'framework': 'caffe', + 'tags': ['tag2'], + 'model': Path('/absolute_path2'), + 'weights': Path('/absolute_path2'), + 'adapter': 'classification', + 'device': 'GPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + } + ] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + self.arguments.target_tags = ['tag2'] + + config = ConfigReader.merge(self.arguments) + + launchers = config['models'][0]['launchers'] + assert len(launchers) == 1 + assert launchers[0] == config_launchers[1] + + def test_only_appropriate_launcher_is_filtered_by_another_tag_if_provided_several_target_tags(self, mocker): + config_launchers = [ + { + 'framework': 'dlsdk', + 'tags': ['tag1'], + 'model': Path('/absolute_path1'), + 'weights': Path('/absolute_path1'), + 'adapter': 'classification', + 'device': 'CPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }, + { + 'framework': 'caffe', + 'tags': ['tag2'], + 'model': Path('/absolute_path2'), + 'weights': Path('/absolute_path2'), + 'adapter': 'classification', + 'device': 'GPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + } + ] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + 
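+ # 'tag3' matches no launcher here, so only the 'tag2' launcher is expected to survive the filter below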
self.arguments.target_tags = ['tag2', 'tag3'] + + config = ConfigReader.merge(self.arguments) + + launchers = config['models'][0]['launchers'] + assert len(launchers) == 1 + assert launchers[0] == config_launchers[1] + + def test_launcher_with_several_tags_containing_at_least_one_target_tag_is_not_filtered(self, mocker): + config_launchers = [ + { + 'framework': 'dlsdk', + 'tags': ['tag1', 'tag2'], + 'model': Path('/absolute_path1'), + 'weights': Path('/absolute_path1'), + 'adapter': 'classification', + 'device': 'CPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + } + ] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + args = copy.deepcopy(self.arguments) + args.model_optimizer = None + args.converted_models = None + args.target_tags = ['tag2'] + + config = ConfigReader.merge(args) + + launchers = config['models'][0]['launchers'] + assert len(launchers) == 1 + assert launchers[0] == config_launchers[0] + + def test_both_launchers_with_different_tags_are_not_filtered_by_the_same_tags(self, mocker): + config_launchers = [ + { + 'framework': 'dlsdk', + 'tags': ['tag1'], + 'model': Path('/absolute_path1'), + 'weights': Path('/absolute_path1'), + 'adapter': 'classification', + 'device': 'CPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }, + { + 'framework': 'dlsdk', + 'tags': ['tag2'], + 'model': Path('/absolute_path2'), + 'weights': Path('/absolute_path2'), + 'adapter': 'classification', + 'device': 'GPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + } + ] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + args = copy.deepcopy(self.arguments) + args.model_optimizer = None + args.converted_models = None + args.target_tags = ['tag1', 'tag2'] + + config = ConfigReader.merge(args) + + launchers = config['models'][0]['launchers'] + assert launchers == config_launchers + + def test_launcher_is_not_filtered_by_the_same_framework(self, mocker): + config_launchers = [{ + 'framework': 'dlsdk', + 'model': Path('/absolute_path1'), + 'weights': Path('/absolute_path1'), + 'adapter': 'classification', + 'device': 'CPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + args = copy.deepcopy(self.arguments) + args.model_optimizer = None + args.converted_models = None + args.target_framework = 'dlsdk' + + config = ConfigReader.merge(args) + + launchers = config['models'][0]['launchers'] + assert launchers == config_launchers + + def test_both_launchers_are_not_filtered_by_the_same_framework(self, mocker): + config_launchers = [ + { + 'framework': 'dlsdk', + 'model': Path('/absolute_path1'), + 'weights': Path('/absolute_path1'), + 'adapter': 'classification', + 'device': 'CPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }, + { + 'framework': 'dlsdk', + 'model': Path('/absolute_path2'), + 'weights': Path('/absolute_path2'), + 'adapter': 'classification', +
'device': 'GPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + } + ] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + args = copy.deepcopy(self.arguments) + args.model_optimizer = None + args.converted_models = None + args.target_framework = 'dlsdk' + + config = ConfigReader.merge(args) + + launchers = config['models'][0]['launchers'] + assert launchers == config_launchers + + def test_launcher_is_filtered_by_another_framework(self, mocker): + config_launchers = [{ + 'framework': 'dlsdk', + 'model': Path('/absolute_path'), + 'weights': Path('/absolute_path'), + 'adapter': 'classification', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + self.arguments.target_framework = 'caffe' + + with pytest.warns(Warning): + config = ConfigReader.merge(self.arguments) + + launchers = config['models'][0]['launchers'] + assert len(launchers) == 0 + + def test_both_launchers_are_filtered_by_another_framework(self, mocker): + config_launchers = [ + { + 'framework': 'dlsdk', + 'model': '/absolute_path1', + 'weights': '/absolute_path1', + 'adapter': 'classification', + 'device': 'CPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }, + { + 'framework': 'dlsdk', + 'model': '/absolute_path2', + 'weights': '/absolute_path2', + 'adapter': 'classification', + 'device': 'GPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + } + ] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + self.arguments.target_framework = 'caffe' + + with pytest.warns(Warning): + config = ConfigReader.merge(self.arguments) + + launchers = config['models'][0]['launchers'] + assert len(launchers) == 0 + + def test_only_appropriate_launcher_is_filtered_by_another_framework(self, mocker): + config_launchers = [ + { + 'framework': 'dlsdk', + 'model': Path('/absolute_path1'), + 'weights': Path('/absolute_path1'), + 'adapter': 'classification', + 'device': 'CPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }, + { + 'framework': 'caffe', + 'model': Path('/absolute_path2'), + 'weights': Path('/absolute_path2'), + 'adapter': 'classification', + 'device': 'GPU' + } + ] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + self.arguments.target_framework = 'caffe' + + config = ConfigReader.merge(self.arguments) + + launchers = config['models'][0]['launchers'] + assert len(launchers) == 1 + assert launchers[0] == config_launchers[1] + + def test_launcher_is_not_filtered_by_the_same_device(self, mocker): + config_launchers = [{ + 'framework': 'dlsdk', + 'model': Path('/absolute_path1'), + 'weights': Path('/absolute_path1'), + 'adapter': 'classification', + 'device': 'CPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }] 
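+ # a launcher whose device string equals a requested target device must pass the filter unchanged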
+ local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + args = copy.deepcopy(self.arguments) + args.model_optimizer = None + args.converted_models = None + args.target_devices = ['CPU'] + + config = ConfigReader.merge(args) + + launchers = config['models'][0]['launchers'] + assert launchers == config_launchers + + def test_both_launchers_are_not_filtered_by_the_same_device(self, mocker): + config_launchers = [ + { + 'framework': 'dlsdk', + 'model': Path('/absolute_path1'), + 'weights': Path('/absolute_path1'), + 'adapter': 'classification', + 'device': 'CPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }, + { + 'framework': 'caffe', + 'model': Path('/absolute_path2'), + 'weights': Path('/absolute_path2'), + 'adapter': 'classification', + 'device': 'CPU' + } + ] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + args = copy.deepcopy(self.arguments) + args.converted_models = None + args.target_devices = ['CPU'] + + config = ConfigReader.merge(args) + + launchers = config['models'][0]['launchers'] + assert launchers == config_launchers + + def test_launcher_is_filtered_by_another_device(self, mocker): + config_launchers = [{ + 'framework': 'dlsdk', + 'model': Path('/absolute_path1'), + 'weights': Path('/absolute_path1'), + 'adapter': 'classification', + 'device': 'CPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + args = copy.deepcopy(self.arguments) + args.converted_models = None + args.target_devices = ['GPU'] + + with pytest.warns(Warning): + config = ConfigReader.merge(args) + + launchers = config['models'][0]['launchers'] + assert len(launchers) == 0 + + def test_both_launchers_are_filtered_by_another_device(self, mocker): + config_launchers = [ + { + 'framework': 'dlsdk', + 'model': Path('/absolute_path1'), + 'weights': Path('/absolute_path1'), + 'adapter': 'classification', + 'device': 'CPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }, + { + 'framework': 'caffe', + 'model': Path('/absolute_path2'), + 'weights': Path('/absolute_path2'), + 'adapter': 'classification', + 'device': 'CPU' + } + ] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + self.arguments.target_devices = ['GPU'] + + with pytest.warns(Warning): + config = ConfigReader.merge(self.arguments) + + launchers = config['models'][0]['launchers'] + assert len(launchers) == 0 + + def test_only_appropriate_launcher_is_filtered_by_another_device(self, mocker): + config_launchers = [ + { + 'framework': 'dlsdk', + 'model': Path('/absolute_path1'), + 'weights': Path('/absolute_path1'), + 'adapter': 'classification', + 'device': 'CPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }, + { + 'framework': 'caffe', + 'model': Path('/absolute_path2'), + 'weights': Path('/absolute_path2'), + 'adapter': 
'classification', + 'device': 'GPU' + } + ] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + args = copy.deepcopy(self.arguments) + args.converted_models = None + args.target_devices = ['GPU'] + + config = ConfigReader.merge(args) + + launchers = config['models'][0]['launchers'] + assert len(launchers) == 1 + assert launchers[0] == config_launchers[1] + + def test_only_appropriate_launcher_is_filtered_by_user_input_devices(self, mocker): + config_launchers = [ + { + 'framework': 'dlsdk', + 'model': Path('/absolute_path1'), + 'weights': Path('/absolute_path1'), + 'adapter': 'classification', + 'device': 'CPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }, + { + 'framework': 'dlsdk', + 'model': Path('/absolute_path1'), + 'weights': Path('/absolute_path1'), + 'adapter': 'classification', + 'device': 'HETERO:CPU,GPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }, + { + 'framework': 'caffe', + 'model': Path('/absolute_path2'), + 'weights': Path('/absolute_path2'), + 'adapter': 'classification', + 'device': 'GPU', + } + ] + + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + args = copy.deepcopy(self.arguments) + args.converted_models = None + args.target_devices = ['GPU', 'CPU'] + + config = ConfigReader.merge(args) + + launchers = config['models'][0]['launchers'] + assert launchers == [config_launchers[0], config_launchers[2]] + + def test_both_launchers_are_filtered_by_other_devices(self, mocker): + config_launchers = [ + { + 'framework': 'dlsdk', + 'model': '/absolute_path1', + 'weights': '/absolute_path1', + 'adapter': 'classification', + 'device': 'CPU', + }, + { + 'framework': 'caffe', + 'model': '/absolute_path2', + 'weights': '/absolute_path2', + 'adapter': 'classification', + 'device': 'CPU' + } + ] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + self.arguments.target_devices = ['FPGA', 'MYRIAD'] + + with pytest.warns(Warning): + config = ConfigReader.merge(self.arguments) + + launchers = config['models'][0]['launchers'] + assert len(launchers) == 0 + + def test_both_launchers_are_not_filtered_by_same_devices(self, mocker): + config_launchers = [ + { + 'framework': 'dlsdk', + 'model': Path('/absolute_path1'), + 'weights': Path('/absolute_path1'), + 'adapter': 'classification', + 'device': 'CPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }, + { + 'framework': 'caffe', + 'model': Path('/absolute_path2'), + 'weights': Path('/absolute_path2'), + 'adapter': 'classification', + 'device': 'GPU' + } + ] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + args = copy.deepcopy(self.arguments) + args.converted_models = None + args.target_devices = ['GPU', 'CPU'] + + config = ConfigReader.merge(args) + + launchers = config['models'][0]['launchers'] + assert launchers == config_launchers + + def test_launcher_is_not_filtered_by_device_with_tail(self, 
mocker): + config_launchers = [ + { + 'framework': 'dlsdk', + 'model': Path('/absolute_path1'), + 'weights': Path('/absolute_path1'), + 'adapter': 'classification', + 'device': 'CPU', + '_model_optimizer': self.arguments.model_optimizer, + '_models_prefix': self.arguments.models + }, + { + 'framework': 'caffe', + 'model': Path('/absolute_path2'), + 'weights': Path('/absolute_path2'), + 'adapter': 'classification', + 'device': 'GPU' + } + ] + local_config = {'models': [{'name': 'name', 'launchers': config_launchers, 'datasets': [{'name': 'dataset'}]}]} + mocker.patch(self.module + '._read_configs', return_value=(None, local_config)) + args = copy.deepcopy(self.arguments) + args.converted_models = None + args.target_devices = ['CPU', 'GPU_unexpected_tail'] + + config = ConfigReader.merge(args) + + launchers = config['models'][0]['launchers'] + assert len(launchers) == 1 + assert launchers[0] == config_launchers[0] + + def get_global_launcher(self, framework): + for launcher in self.global_launchers: + if launcher['framework'] == framework: + return launcher + + raise ValueError('Undefined global launcher with framework = "{}"'.format(framework)) + + def get_global_dataset(self, name): + for dataset in self.global_datasets: + if dataset['name'] == name: + return dataset + + raise ValueError('Undefined global dataset with name = "{}"'.format(name)) diff --git a/tools/accuracy_checker/tests/test_config_validator.py b/tools/accuracy_checker/tests/test_config_validator.py new file mode 100644 index 0000000..29f2f6b --- /dev/null +++ b/tools/accuracy_checker/tests/test_config_validator.py @@ -0,0 +1,379 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from math import inf, nan +from pathlib import Path +from unittest.mock import ANY + +import pytest +from accuracy_checker.config.config_validator import ( + ConfigError, + ConfigValidator, + DictField, + ListField, + NumberField, + PathField, + StringField +) +from tests.common import mock_filesystem + + +class TestStringField: + def test_expects_string(self): + string_field = StringField() + + with pytest.raises(ConfigError): + string_field.validate(b"foo") + with pytest.raises(ConfigError): + string_field.validate({}) + with pytest.raises(ConfigError): + string_field.validate(42) + + string_field.validate("foo") + + def test_choices(self): + string_field = StringField(choices=['foo', 'bar']) + + with pytest.raises(ConfigError): + string_field.validate('baz') + + string_field.validate('bar') + + def test_case_sensitive(self): + string_field = StringField(choices=['foo', 'bar'], case_sensitive=False) + + string_field.validate('foo') + string_field.validate('FOO') + + string_field = StringField(choices=['foo', 'bar'], case_sensitive=True) + + string_field.validate('foo') + with pytest.raises(ConfigError): + string_field.validate('FOO') + + def test_regex(self): + string_field = StringField(regex=r'foo\d*') + + string_field.validate('foo') + string_field.validate('foo42') + + with pytest.raises(ConfigError): + string_field.validate('baz') + + def test_custom_exception(self, mocker): + stub = mocker.stub(name='custom_on_error') + string_field = StringField(choices=['foo'], on_error=stub) + + with pytest.raises(ConfigError): + string_field.validate('bar', 'foo') + stub.assert_called_once_with('bar', 'foo', ANY) + + def test_custom_validator(self, mocker): + stub = mocker.stub(name='custom_validator') + string_field = StringField(choices=['foo'], additional_validator=stub) + + string_field.validate('foo', 'baz') + stub.assert_called_once_with('foo', 'baz') + + +class TestNumberField: + def test_expects_number(self): + number_field = NumberField(floats=True) + + number_field.validate(1.0) + with pytest.raises(ConfigError): + number_field.validate("foo") + with pytest.raises(ConfigError): + number_field.validate({}) + with pytest.raises(ConfigError): + number_field.validate([]) + + number_field = NumberField(floats=False) + number_field.validate(1) + with pytest.raises(ConfigError): + number_field.validate(1.0) + + def test_nans(self): + number_field = NumberField(allow_nan=True) + number_field.validate(nan) + + number_field = NumberField(allow_nan=False) + with pytest.raises(ConfigError): + number_field.validate(nan) + + def test_infinity(self): + number_field = NumberField(allow_inf=True) + number_field.validate(inf) + + number_field = NumberField(allow_inf=False) + with pytest.raises(ConfigError): + number_field.validate(inf) + + def test_ranges(self): + number_field = NumberField(min_value=0, max_value=5) + + number_field.validate(0) + number_field.validate(1) + number_field.validate(2) + + with pytest.raises(ConfigError): + number_field.validate(-1) + with pytest.raises(ConfigError): + number_field.validate(7) + + +class TestDictField: + def test_expects_dict(self): + dict_field = DictField() + + dict_field.validate({}) + with pytest.raises(ConfigError): + dict_field.validate("foo") + with pytest.raises(ConfigError): + dict_field.validate(42) + with pytest.raises(ConfigError): + dict_field.validate([]) + + def test_validates_keys(self): + dict_field = DictField() + dict_field.validate({'foo': 42, 1: 'bar'}) + + dict_field = DictField(key_type=str) + dict_field.validate({'foo': 42, 
'bar': 'bar'}) + with pytest.raises(ConfigError): + dict_field.validate({'foo': 42, 1: 'bar'}) + + dict_field = DictField(key_type=StringField(choices=['foo', 'bar'])) + dict_field.validate({'foo': 42, 'bar': 42}) + with pytest.raises(ConfigError): + dict_field.validate({'foo': 42, 1: 'bar'}) + with pytest.raises(ConfigError): + dict_field.validate({'foo': 42, 'baz': 42}) + + def test_validates_values(self): + dict_field = DictField() + dict_field.validate({'foo': 42, 1: 'bar'}) + + dict_field = DictField(value_type=str) + dict_field.validate({'foo': 'foo', 1: 'bar'}) + with pytest.raises(ConfigError): + dict_field.validate({'foo': 42, 1: 2}) + + dict_field = DictField(value_type=StringField(choices=['foo', 'bar'])) + dict_field.validate({1: 'foo', 'bar': 'bar'}) + with pytest.raises(ConfigError): + dict_field.validate({1: 'foo', 2: 3}) + with pytest.raises(ConfigError): + dict_field.validate({1: 'foo', 2: 'baz'}) + + def test_converts_basic_types(self): + dict_field = DictField(value_type=str) + assert isinstance(dict_field.value_type, StringField) + + dict_field = DictField(value_type=int) + assert isinstance(dict_field.value_type, NumberField) + assert dict_field.value_type.floats is False + + dict_field = DictField(value_type=float) + assert isinstance(dict_field.value_type, NumberField) + assert dict_field.value_type.floats is True + + dict_field = DictField(value_type=list) + assert isinstance(dict_field.value_type, ListField) + + dict_field = DictField(value_type=dict) + assert isinstance(dict_field.value_type, DictField) + + dict_field = DictField(value_type=Path) + assert isinstance(dict_field.value_type, PathField) + + def test_empty(self): + dict_field = DictField() + dict_field.validate({}) + + dict_field = DictField(allow_empty=False) + with pytest.raises(ConfigError): + dict_field.validate({}) + + +class TestListField: + def test_expects_list(self): + list_field = ListField() + + list_field.validate([]) + with pytest.raises(ConfigError): + list_field.validate("foo") + with pytest.raises(ConfigError): + list_field.validate(42) + with pytest.raises(ConfigError): + list_field.validate({}) + + def test_validates_values(self): + list_field = ListField() + list_field.validate(['foo', 42]) + + list_field = ListField(value_type=str) + list_field.validate(['foo', 'bar']) + with pytest.raises(ConfigError): + list_field.validate(['foo', 42]) + + list_field = ListField(value_type=StringField(choices=['foo', 'bar'])) + list_field.validate(['foo', 'bar']) + with pytest.raises(ConfigError): + list_field.validate(['foo', 42]) + with pytest.raises(ConfigError): + list_field.validate(['foo', 'bar', 'baz']) + + def test_empty(self): + list_field = ListField() + list_field.validate([]) + + list_field = ListField(allow_empty=False) + with pytest.raises(ConfigError): + list_field.validate([]) + + +class TestPathField: + @pytest.mark.usefixtures('mock_path_exists') + def test_expects_path_like(self): + path_field = PathField() + path_field.validate('foo/bar') + path_field.validate('/home/user') + path_field.validate(Path('foo/bar')) + + with pytest.raises(ConfigError): + path_field.validate(42) + with pytest.raises(ConfigError): + path_field.validate({}) + with pytest.raises(ConfigError): + path_field.validate([]) + + def test_path_is_checked(self): + with mock_filesystem(['foo/bar']) as prefix: + prefix_path = Path(prefix) + file_field = PathField(is_directory=False) + with pytest.raises(ConfigError): + file_field.validate(prefix_path / 'foo') + file_field.validate(prefix_path / 'foo' / 'bar') + + 
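+ # the same mocked tree, now validated as directories: 'foo' passes, while the regular file 'foo/bar' must fail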
dir_field = PathField(is_directory=True) + dir_field.validate(prefix_path / 'foo') + + with pytest.raises(ConfigError): + dir_field.validate(prefix_path / 'foo' / 'bar') + + +class TestConfigValidator: + def test_compound(self): + class SampleValidator(ConfigValidator): + foo = StringField(choices=['foo']) + bar = NumberField() + + sample_validator = SampleValidator('Sample') + sample_validator.validate({'foo': 'foo', 'bar': 1}) + + with pytest.raises(ConfigError): + sample_validator.validate({'foo': 'foo'}) + with pytest.raises(ConfigError): + sample_validator.validate({'foo': 'bar', 'bar': 1}) + + def test_optional_fields(self): + class SampleValidatorNoOptionals(ConfigValidator): + foo = StringField(choices=['foo']) + bar = NumberField(optional=False) + + sample_validator = SampleValidatorNoOptionals('Sample') + sample_validator.validate({'foo': 'foo', 'bar': 1}) + with pytest.raises(ConfigError): + sample_validator.validate({'foo': 'bar'}) + + class SampleValidatorWithOptionals(ConfigValidator): + foo = StringField(choices=['foo']) + bar = NumberField(optional=True) + + sample_validator = SampleValidatorWithOptionals('Sample') + sample_validator.validate({'foo': 'foo', 'bar': 1}) + sample_validator.validate({'foo': 'foo'}) + + def test_extra_fields__warn_on_extra(self): + class SampleValidatorWarnOnExtra(ConfigValidator): + foo = StringField(choices=['foo']) + + sample_validator = SampleValidatorWarnOnExtra( + 'Sample', on_extra_argument=ConfigValidator.WARN_ON_EXTRA_ARGUMENT + ) + + with pytest.warns(UserWarning): + sample_validator.validate({'foo': 'foo', 'bar': 'bar'}) + + def test_extra_fields__error_on_extra(self): + class SampleValidatorErrorOnExtra(ConfigValidator): + foo = StringField(choices=['foo']) + + sample_validator = SampleValidatorErrorOnExtra( + 'Sample', on_extra_argument=ConfigValidator.ERROR_ON_EXTRA_ARGUMENT) + + with pytest.raises(ConfigError): + sample_validator.validate({'foo': 'bar', 'bar': 'bar'}) + + def test_extra_fields__ignore_extra(self): + class SampleValidatorIgnoresExtra(ConfigValidator): + foo = StringField(choices=['foo']) + + sample_validator = SampleValidatorIgnoresExtra( + 'Sample', on_extra_argument=ConfigValidator.IGNORE_ON_EXTRA_ARGUMENT) + + sample_validator.validate({'foo': 'foo', 'bar': 'bar'}) + + def test_custom_exception(self, mocker): + class SampleValidator(ConfigValidator): + foo = StringField(choices=['foo']) + + stub = mocker.stub(name='custom_on_error') + sample_validator = SampleValidator('Sample', on_error=stub) + sample_validator.validate({}) + stub.assert_called_once_with(ANY, 'Sample', ANY) + + def test_custom_validator(self, mocker): + class SampleValidator(ConfigValidator): + foo = StringField(choices=['foo']) + + stub = mocker.stub(name='custom_validator') + sample_validator = SampleValidator('Sample', additional_validator=stub) + entry = {'foo': 'foo'} + sample_validator.validate(entry) + stub.assert_called_once_with(entry, 'Sample') + + def test_nested(self): + class InnerValidator(ConfigValidator): + foo = StringField(choices=['foo']) + + class OuterValidator(ConfigValidator): + bar = ListField(InnerValidator('Inner')) + + outer_validator = OuterValidator('Outer', on_extra_argument=ConfigValidator.ERROR_ON_EXTRA_ARGUMENT) + + outer_validator.validate({'bar': [{'foo': 'foo'}, {'foo': 'foo'}]}) + + def test_inheritance(self): + class ParentValidator(ConfigValidator): + foo = StringField(choices=['foo']) + + class DerivedValidator(ParentValidator): + bar = StringField(choices=['bar']) + + derived_validator = 
DerivedValidator('Derived', on_extra_argument=ConfigValidator.ERROR_ON_EXTRA_ARGUMENT) + derived_validator.validate({'foo': 'foo', 'bar': 'bar'}) diff --git a/tools/accuracy_checker/tests/test_dataset.py b/tools/accuracy_checker/tests/test_dataset.py new file mode 100644 index 0000000..954ded4 --- /dev/null +++ b/tools/accuracy_checker/tests/test_dataset.py @@ -0,0 +1,191 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +import copy +from pathlib import Path +import pytest +from .common import make_representation +from accuracy_checker.config import ConfigError + +from accuracy_checker.dataset import Dataset + + +def copy_dataset_config(config): + new_config = copy.deepcopy(config) + + return new_config + +class MockPreprocessor: + @staticmethod + def process(images): + return images + + +class TestDataset: + dataset_config = { + 'name': 'custom', + 'annotation': 'custom', + 'data_source': 'custom', + 'metrics': [{'type': 'map'}] + } + + def test_missed_name_raises_config_error_exception(self): + local_dataset = copy_dataset_config(self.dataset_config) + local_dataset.pop('name') + + with pytest.raises(ConfigError): + Dataset(local_dataset, MockPreprocessor()) + + def test_setting_custom_dataset_with_missed_annotation_raises_config_error_exception(self): + local_dataset = copy_dataset_config(self.dataset_config) + local_dataset.pop('annotation') + with pytest.raises(ConfigError): + Dataset(local_dataset, MockPreprocessor()) + + @pytest.mark.usefixtures('mock_path_exists') + def test_setting_custom_dataset_with_missed_data_source_raises_config_error_exception(self): + local_dataset = copy_dataset_config(self.dataset_config) + local_dataset.pop('data_source') + with pytest.raises(ConfigError): + Dataset(local_dataset, MockPreprocessor()) + + +@pytest.mark.usefixtures('mock_path_exists') +class TestAnnotationConversion: + dataset_config = { + 'name': 'custom', + 'data_source': 'custom', + 'metrics': [{'type': 'map'}] + } + + def test_annotation_conversion_unknown_converter_raises_value_error(self): + addition_options = {'annotation_conversion': {'converter': 'unknown'}} + config = copy_dataset_config(self.dataset_config) + config.update(addition_options) + with pytest.raises(ValueError): + Dataset(config, MockPreprocessor()) + + def test_annotation_conversion_converter_without_required_options_raises_config_error(self): + addition_options = {'annotation_conversion': {'converter': 'wider'}} + config = copy_dataset_config(self.dataset_config) + config.update(addition_options) + with pytest.raises(ConfigError): + Dataset(config, MockPreprocessor()) + + def test_annotation_conversion_raises_config_error_on_extra_args(self): + addition_options = {'annotation_conversion': {'converter': 'wider', 'annotation_file': 'file', 'something_extra': 'extra'}} + config = copy_dataset_config(self.dataset_config) + config.update(addition_options) + with pytest.raises(ConfigError): + Dataset(config, MockPreprocessor()) + + def test_successful_annotation_conversion(self, mocker): +
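+ # rough sketch of the call chain the mock below stands in for (the constructor signature is hypothetical): + # converter = WiderFormatConverter(config['annotation_conversion']) + # annotations, meta = converter.convert() + # the assertion at the end only verifies that Dataset triggers exactly one such convert() call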
addition_options = {'annotation_conversion': {'converter': 'wider', 'annotation_file': 'file'}} + config = copy_dataset_config(self.dataset_config) + config.update(addition_options) + annotation_converter_mock = mocker.patch( + 'accuracy_checker.annotation_converters.WiderFormatConverter.convert', + return_value=(make_representation("0 0 0 5 5", True), None) + ) + Dataset(config, MockPreprocessor()) + annotation_converter_mock.assert_called_once_with() + + def test_annotation_conversion_with_store_annotation(self, mocker): + addition_options = { + 'annotation_conversion': {'converter': 'wider', 'annotation_file': 'file'}, + 'annotation': 'custom' + } + config = copy_dataset_config(self.dataset_config) + config.update(addition_options) + converted_annotation = make_representation('0 0 0 5 5', True) + mocker.patch( + 'accuracy_checker.annotation_converters.WiderFormatConverter.convert', + return_value=(converted_annotation, None) + ) + annotation_saver_mock = mocker.patch( + 'accuracy_checker.dataset.save_annotation' + ) + Dataset(config, MockPreprocessor()) + + annotation_saver_mock.assert_called_once_with(converted_annotation, None, Path('custom'), None) + + def test_annotation_conversion_subset_size(self, mocker): + addition_options = { + 'annotation_conversion': {'converter': 'wider', 'annotation_file': 'file'}, + 'subsample_size': 1 + } + config = copy_dataset_config(self.dataset_config) + config.update(addition_options) + converted_annotation = make_representation(['0 0 0 5 5', '0 1 1 10 10'], True) + mocker.patch( + 'accuracy_checker.annotation_converters.WiderFormatConverter.convert', + return_value=(converted_annotation, None) + ) + dataset = Dataset(config, MockPreprocessor()) + assert dataset.annotation == [converted_annotation[1]] + + def test_annotation_conversion_subset_ratio(self, mocker): + addition_options = { + 'annotation_conversion': {'converter': 'wider', 'annotation_file': 'file'}, + 'subsample_size': '50%' + } + config = copy_dataset_config(self.dataset_config) + config.update(addition_options) + converted_annotation = make_representation(['0 0 0 5 5', '0 1 1 10 10'], True) + mocker.patch( + 'accuracy_checker.annotation_converters.WiderFormatConverter.convert', + return_value=(converted_annotation, None) + ) + subset_maker_mock = mocker.patch( + 'accuracy_checker.dataset.make_subset' + ) + Dataset(config, MockPreprocessor()) + subset_maker_mock.assert_called_once_with(converted_annotation, 1, 666) + + def test_annotation_conversion_subset_with_seed(self, mocker): + addition_options = { + 'annotation_conversion': {'converter': 'wider', 'annotation_file': 'file'}, + 'subsample_size': 1, + 'subsample_seed': 1 + } + config = copy_dataset_config(self.dataset_config) + config.update(addition_options) + converted_annotation = make_representation(['0 0 0 5 5', '0 1 1 10 10'], True) + mocker.patch( + 'accuracy_checker.annotation_converters.WiderFormatConverter.convert', + return_value=(converted_annotation, None) + ) + dataset = Dataset(config, MockPreprocessor()) + annotation = dataset.annotation + assert annotation == [converted_annotation[0]] + + def test_annotation_conversion_save_subset(self, mocker): + addition_options = { + 'annotation_conversion': {'converter': 'wider', 'annotation_file': 'file'}, + 'annotation': 'custom', + 'subsample_size': 1, + } + config = copy_dataset_config(self.dataset_config) + config.update(addition_options) + converted_annotation = make_representation(['0 0 0 5 5', '0 1 1 10 10'], True) + mocker.patch( + 
'accuracy_checker.annotation_converters.WiderFormatConverter.convert', + return_value=(converted_annotation, None) + ) + annotation_saver_mock = mocker.patch( + 'accuracy_checker.dataset.save_annotation' + ) + Dataset(config, MockPreprocessor()) + annotation_saver_mock.assert_called_once_with([converted_annotation[1]], None, Path('custom'), None) diff --git a/tools/accuracy_checker/tests/test_dependency.py b/tools/accuracy_checker/tests/test_dependency.py new file mode 100644 index 0000000..0f98842 --- /dev/null +++ b/tools/accuracy_checker/tests/test_dependency.py @@ -0,0 +1,89 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from accuracy_checker.dependency import ClassProvider, get_opts + + +def test_get_opts_positional_and_kwargs(): + opts = {'o': ((1,), {'a': 1})} + args, kwargs = get_opts(opts['o']) + + assert args == (1,) + assert kwargs == {'a': 1} + + +def test_get_opts_kwargs_only(): + opts = {'o': {'a': 1}} + args, kwargs = get_opts(opts['o']) + + assert args == () + assert kwargs == {'a': 1} + + +def test_get_opts_positional_only(): + opts = {'o': (1, 2, 3)} + args, kwargs = get_opts(opts['o']) + + assert args == (1, 2, 3) + assert kwargs == {} + + +def test_class_provider(): + class BaseService(ClassProvider): + __provider_type__ = 'Service' + + class ServiceA(BaseService): + __provider__ = 'service_a' + + class ServiceB(BaseService): + __provider__ = 'service_b' + + assert issubclass(ServiceA, BaseService) + assert issubclass(ServiceB, BaseService) + + assert 'service_a' in BaseService.providers + assert 'service_b' in BaseService.providers + + +def test_provide(): + class BaseService(ClassProvider): + __provider_type__ = 'service' + + def __init__(self): + pass + + class ServiceA(BaseService): + __provider__ = 'service_a' + + provided = BaseService.provide('service_a') + + assert isinstance(provided, ServiceA) + + +def test_provide_with_args(): + class BaseService(ClassProvider): + __provider_type__ = 'service' + + def __init__(self, bar): + self.bar = bar + + class ServiceA(BaseService): + __provider__ = 'service_a' + + provided = BaseService.provide('service_a', bar=42) + + assert isinstance(provided, ServiceA) + assert provided.bar == 42 diff --git a/tools/accuracy_checker/tests/test_detection_metrics.py b/tools/accuracy_checker/tests/test_detection_metrics.py new file mode 100644 index 0000000..def1354 --- /dev/null +++ b/tools/accuracy_checker/tests/test_detection_metrics.py @@ -0,0 +1,459 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +""" + +import pytest +import numpy as np +from accuracy_checker.metrics import DetectionMAP +from accuracy_checker.metrics.detection import Recall, bbox_match +from accuracy_checker.metrics.overlap import IOU, IOA +from tests.common import (make_representation, single_class_dataset, multi_class_dataset, + multi_class_dataset_without_background) + + +def _test_metric_wrapper(metric_cls, dataset, **kwargs): + provider = metric_cls.__provider__ + config = {'type': provider, 'name': provider} + config.update(**kwargs) + return metric_cls(config, dataset, provider) + + +class TestBoxMatch: + def test_single(self): + gt = "0 0 0 5 5" + pred = "0 0 0 5 5" + + gt = make_representation(gt, is_ground_truth=True) + pred = make_representation(pred, score=1) + overlap_evaluator = IOU({}) + + tp, fp, _, _ = bbox_match(gt, pred, 0, overlap_evaluator) + assert tp[0] == 1 + assert fp[0] == 0 + + def test_single_with_ignored_tp(self): + gt = "0 0 0 5 5" + pred = "0 0 0 5 5" + + gt = make_representation(gt, is_ground_truth=True) + pred = make_representation(pred, score=1) + pred[0].metadata['difficult_boxes'] = [0] + overlap_evaluator = IOU({}) + + tp, fp, _, _ = bbox_match(gt, pred, 0, overlap_evaluator) + assert tp[0] == 0 + assert fp[0] == 0 + + def test_single_with_use_filtered_tp(self): + gt = "0 0 0 5 5" + pred = "0 0 0 5 5" + + gt = make_representation(gt, is_ground_truth=True) + pred = make_representation(pred, score=1) + pred[0].metadata['difficult_boxes'] = [0] + overlap_evaluator = IOU({}) + + tp, fp, _, _ = bbox_match(gt, pred, 0, overlap_evaluator, use_filtered_tp=True) + assert tp[0] == 1 + assert fp[0] == 0 + + def test_single_non_overlap(self): + gt = make_representation("0 5 5 10 10", is_ground_truth=True) + pred = make_representation("0 0 0 5 5", score=1) + overlap_evaluator = IOU({}) + + tp, fp, _, _ = bbox_match(gt, pred, 0, overlap_evaluator) + assert tp[0] == 0 + assert fp[0] == 1 + + def test_single_non_overlap_ignored(self): + gt = make_representation("0 5 5 10 10", is_ground_truth=True) + pred = make_representation("0 0 0 5 5", score=1) + pred[0].metadata['difficult_boxes'] = [0] + overlap_evaluator = IOU({}) + + tp, fp, _, _ = bbox_match(gt, pred, 0, overlap_evaluator) + assert tp[0] == 0 + assert fp[0] == 0 + + def test_multiple(self): + gt = make_representation("0 0 0 5 5; 0 7 7 8 8", is_ground_truth=True) + pred = make_representation("0 0 0 5 5; 0 7 7 8 8", score=1) + overlap_evaluator = IOU({}) + + tp, fp, _, _ = bbox_match(gt, pred, 0, overlap_evaluator) + assert tp[0] == 1 + assert tp[1] == 1 + assert fp[0] == 0 + assert fp[1] == 0 + + def test_multiple_2(self): + gt = make_representation("0 0 0 5 5; 0 9 9 10 10", is_ground_truth=True) + pred = make_representation("1 0 0 0 5 5; 0.8 0 7 7 8 8") + overlap_evaluator = IOU({}) + + tp, fp, _, _ = bbox_match(gt, pred, 0, overlap_evaluator) + assert tp[0] == 1 + assert tp[1] == 0 + assert fp[0] == 0 + assert fp[1] == 1 + + def test_multi_label(self): + gt = make_representation("1 0 0 5 5; 0 9 9 10 10", is_ground_truth=True) + pred = make_representation("1 1 0 0 5 5; 0.8 0 7 7 8 8") + overlap_evaluator = IOU({}) + + tp, fp, _, _ = bbox_match(gt, pred, 1, overlap_evaluator) + assert tp.shape[0] == 1 + assert tp[0] == 1 + assert fp[0] == 0 + + tp, fp, _, _ = bbox_match(gt, pred, 0, overlap_evaluator) + assert tp.shape[0] == 1 + assert tp[0] == 0 + assert fp[0] == 1 + + def test_multi_image(self): + gt = make_representation(["0 0 0 5 5", 
"0 0 0 5 5"], is_ground_truth=True) + pred = make_representation(["0 0 0 5 5", "0 0 0 5 5"], score=1) + overlap_evaluator = IOU({}) + + tp, fp, _, _ = bbox_match(gt, pred, 0, overlap_evaluator) + assert tp[0] == 1 + assert tp[1] == 1 + assert fp[0] == 0 + assert fp[1] == 0 + + def test_false_negative(self): + gt = make_representation("0 0 0 5 5; 0 1 1 6 6", is_ground_truth=True) + pred = make_representation("0 0 0 5 5", score=1) + overlap_evaluator = IOU({}) + + tp, fp, _, ngt = bbox_match(gt, pred, 0, overlap_evaluator) + assert tp[0] == 1 + assert tp.shape[0] == 1 + assert ngt == 2 + + def test_multiple_detections(self): + gt = make_representation("0 0 0 5 5", is_ground_truth=True) + pred = make_representation("1 0 0 0 5 5; 0.9 0 0 0 5 5") + overlap_evaluator = IOU({}) + + tp, fp, _, _ = bbox_match(gt, pred, 0, overlap_evaluator) + assert tp[0] == 1 + assert tp[1] == 0 + + def test_no_annotations(self): + gt = "1 0 0 5 5" + pred = "0 0 0 5 5" + + gt = make_representation(gt, is_ground_truth=True) + pred = make_representation(pred, score=1) + overlap_evaluator = IOU({}) + + tp, fp, _, _ = bbox_match(gt, pred, 0, overlap_evaluator) + assert tp[0] == 0 + assert fp[0] == 1 + + def test_no_predictions(self): + gt = "0 0 0 5 5" + pred = "1 0 0 5 5" + + gt = make_representation(gt, is_ground_truth=True) + pred = make_representation(pred, score=1) + overlap_evaluator = IOU({}) + + tp, fp, _, n = bbox_match(gt, pred, 0, overlap_evaluator) + assert n == 1 + assert len(tp) == 0 + assert len(fp) == 0 + + def test_iou_empty_prediction_box(self): + gt = "0 0 0 5 5" + pred = "0 0 0 0 0" + + gt = make_representation(gt, is_ground_truth=True) + pred = make_representation(pred, score=1) + overlap_evaluator = IOU({}) + + with pytest.warns(None) as warnings: + tp, fp, _, n = bbox_match(gt, pred, 0, overlap_evaluator) + assert len(warnings) == 0 + assert n == 1 + assert tp[0] == 0 + assert fp[0] == 1 + + def test_ioa_empty_prediction_box(self): + gt = "0 0 0 5 5" + pred = "0 0 0 0 0" + + gt = make_representation(gt, is_ground_truth=True) + pred = make_representation(pred, score=1) + overlap_evaluator = IOA({}) + + with pytest.warns(None) as warnings: + tp, fp, _, n = bbox_match(gt, pred, 0, overlap_evaluator) + assert len(warnings) == 0 + assert n == 1 + assert tp[0] == 0 + assert fp[0] == 1 + + def test_iou_zero_union(self): + gt = "0 0 0 0 0" + pred = "0 0 0 0 0" + + gt = make_representation(gt, is_ground_truth=True) + pred = make_representation(pred, score=1) + overlap_evaluator = IOA({}) + + with pytest.warns(None) as warnings: + tp, fp, _, n = bbox_match(gt, pred, 0, overlap_evaluator) + assert len(warnings) == 0 + assert n == 1 + assert tp[0] == 0 + assert fp[0] == 1 + + def test_single_difficult(self): + gt = "0 0 0 5 5" + pred = "0 0 0 5 5" + + gt = make_representation(gt, is_ground_truth=True) + pred = make_representation(pred, score=1) + gt[0].metadata['difficult_boxes'] = [0] + overlap_evaluator = IOU({}) + + tp, fp, _, n = bbox_match(gt, pred, 0, overlap_evaluator, ignore_difficult=True) + assert n == 0 + assert tp[0] == 0 + assert fp[0] == 0 + + def test_single_with_not_ignore_difficult(self): + gt = "0 0 0 5 5" + pred = "0 0 0 5 5" + + gt = make_representation(gt, is_ground_truth=True) + pred = make_representation(pred, score=1) + gt[0].metadata['difficult_boxes'] = [0] + overlap_evaluator = IOU({}) + + tp, fp, _, n = bbox_match(gt, pred, 0, overlap_evaluator, ignore_difficult=False) + assert n == 1 + assert tp[0] == 1 + assert fp[0] == 0 + + def test_single_difficult_non_overlap(self): + gt = 
make_representation("0 5 5 10 10", is_ground_truth=True) + gt[0].metadata['difficult_boxes'] = [0] + pred = make_representation("0 0 0 5 5", score=1) + overlap_evaluator = IOU({}) + + tp, fp, _, n = bbox_match(gt, pred, 0, overlap_evaluator) + assert n == 0 + assert tp[0] == 0 + assert fp[0] == 1 + + def test_single_difficult_non_overlap_not_ignore_difficult(self): + gt = make_representation("0 5 5 10 10", is_ground_truth=True) + gt[0].metadata['difficult_boxes'] = [0] + pred = make_representation("0 0 0 5 5", score=1) + overlap_evaluator = IOU({}) + + tp, fp, _, n = bbox_match(gt, pred, 0, overlap_evaluator, ignore_difficult=False) + assert n == 1 + assert tp[0] == 0 + assert fp[0] == 1 + + def test_multiple_detections_with_ignore_difficult(self): + gt = make_representation("0 0 0 5 5", is_ground_truth=True) + pred = make_representation("1 0 0 0 5 5; 0.9 0 0 0 5 5") + gt[0].metadata['difficult_boxes'] = [0] + overlap_evaluator = IOU({}) + + tp, fp, _, n = bbox_match(gt, pred, 0, overlap_evaluator, ignore_difficult=True) + assert n == 0 + assert tp[0] == 0 + assert tp[1] == 0 + assert fp[0] == 0 + assert fp[1] == 0 + + def test_multiple_detections_with_not_ignore_difficult(self): + gt = make_representation("0 0 0 5 5", is_ground_truth=True) + pred = make_representation("1 0 0 0 5 5; 0.9 0 0 0 5 5") + gt[0].metadata['difficult_boxes'] = [0] + overlap_evaluator = IOU({}) + + tp, fp, _, n = bbox_match(gt, pred, 0, overlap_evaluator, ignore_difficult=False) + assert n == 1 + assert tp[0] == 1 + assert tp[1] == 0 + assert fp[0] == 0 + assert fp[1] == 1 + + def test_multiple_detections_with_ignore_difficult_and_not_allow_multiple_matches_per_ignored(self): + gt = make_representation("0 0 0 5 5", is_ground_truth=True) + pred = make_representation("1 0 0 0 5 5; 0.9 0 0 0 5 5") + gt[0].metadata['difficult_boxes'] = [0] + overlap_evaluator = IOU({}) + + tp, fp, _, n = bbox_match( + gt, pred, 0, overlap_evaluator, + ignore_difficult=True, allow_multiple_matches_per_ignored=False + ) + + assert n == 0 + assert tp[0] == 0 + assert tp[1] == 0 + assert fp[0] == 0 + assert fp[1] == 1 + + +class TestRecall: + def test_one_object(self): + gt = make_representation(["0 0 0 5 5"], is_ground_truth=True) + pred = make_representation(["0 0 0 5 5"], score=1) + metric = _test_metric_wrapper(Recall, single_class_dataset()) + assert 1 == metric(gt, pred)[0] + assert metric.meta.get('names') == ['dog'] + + def test_two_objects(self): + gt = make_representation(["0 0 0 5 5; 0 10 10 20 20"], is_ground_truth=True) + pred = make_representation(["0 0 0 5 5; 0 10 10 20 20"], score=1) + assert 1 == _test_metric_wrapper(Recall, single_class_dataset())(gt, pred)[0] + + def test_false_positive(self): + gt2 = make_representation(["0 10 10 20 20"], is_ground_truth=True) + pred2 = make_representation(["0 0 0 5 5"], score=1) + metric = _test_metric_wrapper(Recall, single_class_dataset()) + assert 0 == metric(gt2, pred2)[0] + assert metric.meta.get('names') == ['dog'] + + gt1 = make_representation(["0 0 0 5 5"], is_ground_truth=True) + pred1 = make_representation(["0 0 0 5 5; 0 10 10 20 20"], score=1) + assert 1 == metric(gt1, pred1)[0] + assert metric.meta.get('names') == ['dog'] + + def test_false_negative(self): + gt = make_representation(["0 10 10 20 20; 0 0 0 5 5"], is_ground_truth=True) + pred = make_representation(["0 0 0 5 5"], score=1) + metric = _test_metric_wrapper(Recall, single_class_dataset()) + assert 0.5 == metric(gt, pred)[0] + assert metric.meta.get('names') == ['dog'] + + def test_duplicate_detections(self): + gt 
= make_representation(["0 0 0 5 5"], is_ground_truth=True) + pred = make_representation(["0 0 0 5 5; 0 0 0 5 5"], score=1) + + metric = _test_metric_wrapper(Recall, single_class_dataset()) + assert 1 == metric(gt, pred)[0] + assert metric.meta.get('names') == ['dog'] + + def test_no_warnings_in_recall_calculation(self): + gt = make_representation(["0 0 0 5 5; 1 10 10 20 20", "1 0 0 5 5"], is_ground_truth=True) + pred = make_representation(["0 0 0 5 5; 1 10 10 20 20", "1 0 0 5 5"], score=1) + + with pytest.warns(None) as warnings: + _test_metric_wrapper(Recall, multi_class_dataset())(gt, pred) + assert len(warnings) == 0 + + def test_on_dataset_without_background(self): + gt = make_representation(["0 0 0 5 5; 1 10 10 20 20", "1 0 0 5 5"], is_ground_truth=True) + pred = make_representation(["0 0 0 5 5; 1 10 10 20 20", "1 0 0 5 5"], score=1) + + with pytest.warns(None) as warnings: + _test_metric_wrapper(Recall, multi_class_dataset_without_background())(gt, pred) + assert len(warnings) == 0 + + def test_not_gt_boxes_for_matching(self): + gt = make_representation(["0 0 0 5 5"], is_ground_truth=True) + pred = make_representation(["1 0 0 5 5"], score=1) + + metric = _test_metric_wrapper(Recall, multi_class_dataset_without_background()) + assert 0 == metric(gt, pred)[0] + assert metric.meta.get('names') == ['cat'] + + +class TestMAP: + def test_selects_all_detections(self): + gt = make_representation(["0 0 0 5 5"], is_ground_truth=True) + pred = make_representation(["0 0 0 5 5; 0 0 0 5 5"], score=1) + + metric = _test_metric_wrapper(DetectionMAP, single_class_dataset()) + metric(gt, pred) + + assert not metric.distinct_conf + assert metric.overlap_threshold == 0.5 + assert metric.ignore_difficult + assert metric.meta.get('names') == ['dog'] + + def test_no_warnings_in_map_calculation(self): + gt = make_representation(["0 0 0 5 5; 1 10 10 20 20", "1 0 0 5 5"], is_ground_truth=True) + pred = make_representation(["0 0 0 5 5; 1 10 10 20 20", "1 0 0 5 5"], score=1) + + with pytest.warns(None) as warnings: + _test_metric_wrapper(DetectionMAP, multi_class_dataset())(gt, pred) + assert len(warnings) == 0 + + def test_perfect_detection(self): + gt = make_representation(["0 0 0 5 5; 1 10 10 20 20", "1 0 0 5 5"], is_ground_truth=True) + pred = make_representation(["0 0 0 5 5; 1 10 10 20 20", "1 0 0 5 5"], score=1) + + metric = _test_metric_wrapper(DetectionMAP, multi_class_dataset()) + assert metric(gt, pred) == [1.0, 1.0] + assert metric.meta.get('names') == ['dog', 'cat'] + + def test_one_false_alarm(self): + gt = make_representation(["0 0 0 5 5", "1 0 0 5 5"], is_ground_truth=True) + pred = make_representation(["1 10 10 20 20; 0 0 0 5 5", "1 0 0 5 5"], score=1) + metric = _test_metric_wrapper(DetectionMAP, multi_class_dataset()) + values = metric(gt, pred) + assert values == [1.0, 0.5] + map_ = np.mean(values) + assert 0.75 == map_ + assert metric.meta.get('names') == ['dog', 'cat'] + + def test_zero_detection(self): + gt = make_representation(["0 0 0 5 5; 1 10 10 20 20"], is_ground_truth=True) + pred = make_representation(["0 30 30 40 40"], score=1) + + metric = _test_metric_wrapper(DetectionMAP, multi_class_dataset()) + assert metric(gt, pred) == [0.0] + assert metric.meta.get('names') == ['dog'] + + def test_no_detections_warn_user_warning(self): + gt = make_representation(["0 0 0 5 5; 1 10 10 20 20"], is_ground_truth=True) + pred = make_representation("", score=1) + with pytest.warns(UserWarning) as warnings: + map_ = _test_metric_wrapper(DetectionMAP, multi_class_dataset())(gt, pred)[0] + assert 
len(warnings) == 1 + + assert map_ == 0 + + def test_detection_on_dataset_without_background(self): + gt = make_representation(["0 0 0 5 5; 1 10 10 20 20", "1 0 0 5 5"], is_ground_truth=True) + pred = make_representation(["0 0 0 5 5; 1 10 10 20 20", "1 0 0 5 5"], score=1) + + with pytest.warns(None) as warnings: + map_ = _test_metric_wrapper(DetectionMAP, multi_class_dataset_without_background())(gt, pred) + mean = np.mean(map_) + assert 1.0 == mean + assert len(warnings) == 0 + + def test_not_gt_boxes_for_box_matching(self): + gt = make_representation(["0 0 0 5 5"], is_ground_truth=True) + pred = make_representation(["1 0 0 5 5"], score=1) + + metric = _test_metric_wrapper(Recall, multi_class_dataset_without_background()) + assert 0 == metric(gt, pred)[0] + assert metric.meta.get('names') == ['cat'] diff --git a/tools/accuracy_checker/tests/test_dlsdk_launcher.py b/tools/accuracy_checker/tests/test_dlsdk_launcher.py new file mode 100644 index 0000000..599f77a --- /dev/null +++ b/tools/accuracy_checker/tests/test_dlsdk_launcher.py @@ -0,0 +1,980 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import subprocess + +import pytest + +pytest.importorskip('accuracy_checker.launcher.dlsdk_launcher') +import os +import cv2 +import numpy as np + +from pathlib import Path +from unittest.mock import PropertyMock +from accuracy_checker.config import ConfigError +from accuracy_checker.launcher import DLSDKLauncher +from accuracy_checker.launcher.dlsdk_launcher import DLSDKLauncherConfig +from accuracy_checker.launcher.launcher import create_launcher +from tests.common import update_dict +from accuracy_checker.dataset import DataRepresentation +from accuracy_checker.utils import contains_all + + +@pytest.fixture() +def mock_inference_engine(mocker): + try: + mocker.patch('openvino.inference_engine.IEPlugin') + mocker.patch('openvino.inference_engine.IENetwork') + except ImportError: + mocker.patch('inference_engine.IEPlugin') + mocker.patch('inference_engine.IENetwork') + + +@pytest.fixture() +def mock_inputs(mocker): + mocker.patch( + 'accuracy_checker.launcher.input_feeder.InputFeeder._parse_inputs_config', return_value=({}, ['data'], None) + ) + + +def get_dlsdk_test_model(models_dir, config_update=None): + config = { + 'framework': 'dlsdk', + 'weights': str(models_dir / 'SampLeNet.bin'), + 'model': str(models_dir / 'SampLeNet.xml'), + 'device': 'CPU', + 'adapter': 'classification', + '_models_prefix': str(models_dir) + } + if config_update: + config.update(config_update) + + return create_launcher(config) + + +def get_image(image_path, input_shape): + _, h, w = input_shape + img_raw = cv2.imread(str(image_path)) + + return DataRepresentation(cv2.resize(img_raw, (w, h))) + + +class TestDLSDKLauncherInfer: + def test_infer(self, data_dir, models_dir): + dlsdk_test_model = get_dlsdk_test_model(models_dir) + result = dlsdk_test_model.predict(['1.jpg'], [get_image(data_dir / '1.jpg', dlsdk_test_model.inputs['data'])]) + + assert dlsdk_test_model.adapter.output_blob == 'fc3' 
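+ # the adapter should auto-select 'fc3' (presumably SampLeNet's final layer) as its output blob; label 6 is the class expected for 1.jpg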
+ assert result[0].label == 6 + + def test_launcher_creates(self, models_dir): + assert get_dlsdk_test_model(models_dir).inputs['data'] == [3, 32, 32] + + def test_infer_with_additional_outputs(self, data_dir, models_dir): + dlsdk_test_model = get_dlsdk_test_model(models_dir, {'outputs': ['fc1', 'fc2']}) + result = dlsdk_test_model.predict(['1.jpg'], [get_image(data_dir / '1.jpg', dlsdk_test_model.inputs['data'])]) + outputs = list(dlsdk_test_model.network.outputs.keys()) + adapter_output_blob = dlsdk_test_model.adapter.output_blob + + assert contains_all(outputs, ['fc1', 'fc2', 'fc3']) + assert adapter_output_blob == 'fc3' + assert result[0].label == 6 + + def test_dlsdk_launcher_provide_input_shape_to_adapter(self, mocker, models_dir): + raw_results = {} + + def raw_results_callback(outputs): + raw_results.update(outputs) + + launcher = get_dlsdk_test_model(models_dir) + + adapter_mock = mocker.patch('accuracy_checker.adapters.ClassificationAdapter.process') + launcher.predict(['1.png'], [DataRepresentation(np.zeros((32, 32, 3)))], output_callback=raw_results_callback) + adapter_mock.assert_called_once_with([raw_results], ['1.png'], [{'input_shape': {'data': [3, 32, 32]}, 'image_size': (32, 32, 3)}]) + + def test_dlsdk_launcher_set_batch_size(self, models_dir): + dlsdk_test_model = get_dlsdk_test_model(models_dir, {'batch': 2}) + assert dlsdk_test_model.batch == 2 + + +@pytest.mark.usefixtures('mock_path_exists') +class TestDLSDKLauncherAffinity: + def test_dlsdk_launcher_valid_affinity_map(self, mocker, models_dir): + affinity_map = {'conv1': 'GPU'} + + mocker.patch( + 'accuracy_checker.launcher.dlsdk_launcher.read_yaml', return_value=affinity_map + ) + + dlsdk_test_model = get_dlsdk_test_model(models_dir, {'device': 'HETERO:CPU,GPU', 'affinity_map': './affinity_map.yml'}) + layers = dlsdk_test_model.network.layers + for key, value in affinity_map.items(): + assert layers[key].affinity == value + + def test_dlsdk_launcher_affinity_map_invalid_device(self, mocker, models_dir): + affinity_map = {'conv1': 'GPU'} + + mocker.patch( + 'accuracy_checker.launcher.dlsdk_launcher.read_yaml', return_value=affinity_map + ) + + with pytest.raises(ConfigError): + get_dlsdk_test_model(models_dir, {'device': 'HETERO:CPU,CPU', 'affinity_map': './affinity_map.yml'}) + + def test_dlsdk_launcher_affinity_map_invalid_layer(self, mocker, models_dir): + affinity_map = {'non-existing-layer': 'CPU'} + + mocker.patch( + 'accuracy_checker.launcher.dlsdk_launcher.read_yaml', return_value=affinity_map + ) + + with pytest.raises(ConfigError): + get_dlsdk_test_model(models_dir, {'device': 'HETERO:CPU,CPU', 'affinity_map': './affinity_map.yml'}) + + +@pytest.mark.usefixtures('mock_path_exists', 'mock_inference_engine', 'mock_inputs') +class TestDLSDKLauncher: + def test_program_bitstream_when_device_is_fpga(self, mocker): + subprocess_mock = mocker.patch('subprocess.run') + config = { + 'framework': 'dlsdk', + 'weights': 'custom_weights', + 'model': 'custom_model', + 'device': 'fpga', + 'bitstream': Path('custom_bitstream'), + 'adapter': 'classification', + '_models_prefix': 'prefix', + '_aocl': Path('aocl') + } + launcher = create_launcher(config, {'label_map': {}}) + subprocess_mock.assert_called_once_with(['aocl', 'program', 'acl0', 'custom_bitstream']) + launcher.release() + + def test_program_bitstream_when_fpga_in_hetero_device(self, mocker): + subprocess_mock = mocker.patch('subprocess.run') + config = { + 'framework': 'dlsdk', + 'weights': 'custom_weights', + 'model': 'custom_model', + 'device': 
'hetero:fpga,cpu', + 'bitstream': Path('custom_bitstream'), + 'adapter': 'classification', + '_models_prefix': 'prefix', + '_aocl': Path('aocl') + } + launcher = create_launcher(config, {'label_map': {}}) + subprocess_mock.assert_called_once_with(['aocl', 'program', 'acl0', 'custom_bitstream']) + launcher.release() + + def test_does_not_program_bitstream_when_device_is_not_fpga(self, mocker): + subprocess_mock = mocker.patch('subprocess.run') + config = { + 'framework': 'dlsdk', + 'weights': 'custom_weights', + 'model': 'custom_model', + 'device': 'cpu', + 'bitstream': Path('custom_bitstream'), + 'adapter': 'classification', + '_models_prefix': 'prefix', + '_aocl': Path('aocl') + } + create_launcher(config) + subprocess_mock.assert_not_called() + + def test_does_not_program_bitstream_when_hetero_without_fpga(self, mocker): + subprocess_mock = mocker.patch('subprocess.run') + + config = { + 'framework': 'dlsdk', + 'weights': 'custom_weights', + 'model': 'custom_model', + 'device': 'hetero:cpu,cpu', + 'bitstream': Path('custom_bitstream'), + 'adapter': 'classification', + '_models_prefix': 'prefix', + '_aocl': Path('aocl') + } + create_launcher(config) + subprocess_mock.assert_not_called() + + def test_does_not_program_bitstream_if_compiler_mode_3_in_env_when_fpga_in_hetero_device(self, mocker): + subprocess_mock = mocker.patch('subprocess.run') + mocker.patch('os.environ.get', return_value='3') + + config = { + 'framework': 'dlsdk', + 'weights': 'custom_weights', + 'model': 'custom_model', + 'device': 'hetero:fpga,cpu', + 'bitstream': Path('custom_bitstream'), + 'adapter': 'classification', + '_models_prefix': 'prefix', + '_aocl': Path('aocl') + } + create_launcher(config) + + subprocess_mock.assert_not_called() + + def test_does_not_program_bitstream_if_compiler_mode_3_in_env_when_fpga_in_device(self, mocker): + subprocess_mock = mocker.patch('subprocess.run') + mocker.patch('os.environ.get', return_value='3') + + config = { + 'framework': 'dlsdk', + 'weights': 'custom_weights', + 'model': 'custom_model', + 'device': 'fpga', + 'bitstream': Path('custom_bitstream'), + 'adapter': 'classification', + '_models_prefix': 'prefix', + '_aocl': Path('aocl') + } + create_launcher(config) + + subprocess_mock.assert_not_called() + + def test_sets_dla_aocx_when_device_is_fpga(self, mocker): + mocker.patch('os.environ') + + config = { + 'framework': 'dlsdk', + 'weights': 'custom_weights', + 'model': 'custom_model', + 'device': 'fpga', + 'bitstream': Path('custom_bitstream'), + 'adapter': 'classification', + '_models_prefix': 'prefix' + } + create_launcher(config, {'label_map': {}}) + + os.environ.__setitem__.assert_called_once_with('DLA_AOCX', 'custom_bitstream') + + def test_sets_dla_aocx_when_fpga_in_hetero_device(self, mocker): + mocker.patch('os.environ') + + config = { + 'framework': 'dlsdk', + 'weights': 'custom_weights', + 'model': 'custom_model', + 'device': 'hetero:fpga,cpu', + 'bitstream': Path('custom_bitstream'), + 'adapter': 'classification', + '_models_prefix': 'prefix' + } + create_launcher(config, {'label_map': {}}) + os.environ.__setitem__.assert_called_once_with('DLA_AOCX', 'custom_bitstream') + + def test_does_not_set_dla_aocx_when_device_is_not_fpga(self, mocker): + mocker.patch('os.environ') + + config = { + 'framework': 'dlsdk', + 'weights': 'custom_weights', + 'model': 'custom_model', + 'device': 'cpu', + 'bitstream': 'custom_bitstream', + 'adapter': 'classification', + '_models_prefix': 'prefix' + } + create_launcher(config) + + os.environ.__setitem__.assert_not_called() + + def 
test_does_not_set_dla_aocx_when_hetero_without_fpga(self, mocker): + mocker.patch('os.environ') + + config = { + 'framework': 'dlsdk', + 'weights': 'custom_weights', + 'model': 'custom_model', + 'device': 'hetero:cpu,cpu', + 'bitstream': 'custom_bitstream', + 'adapter': 'classification', + '_models_prefix': 'prefix' + } + create_launcher(config) + + os.environ.__setitem__.assert_not_called() + + def test_does_not_set_dla_aocx_if_compiler_mode_3_in_env_when_fpga_in_hetero_device(self, mocker): + mocker.patch('os.environ') + mocker.patch('os.environ.get', return_value='3') + + config = { + 'framework': 'dlsdk', + 'weights': 'custom_weights', + 'model': 'custom_model', + 'device': 'hetero:fpga,cpu', + 'bitstream': 'custom_bitstream', + 'adapter': 'classification', + '_models_prefix': 'prefix' + } + create_launcher(config) + + os.environ.__setitem__.assert_not_called() + + def test_does_not_set_dla_aocx_if_compiler_mode_3_in_env_when_fpga_in_device(self, mocker): + mocker.patch('os.environ') + mocker.patch('os.environ.get', return_value='3') + + config = { + 'framework': 'dlsdk', + 'weights': 'custom_weights', + 'model': 'custom_model', + 'device': 'fpga', + 'bitstream': 'custom_bitstream', + 'adapter': 'classification', + '_models_prefix': 'prefix' + } + create_launcher(config) + + os.environ.__setitem__.assert_not_called() + + def test_model_converted_from_caffe(self, mocker): + mock = mocker.patch( + 'accuracy_checker.launcher.dlsdk_launcher.convert_model', + return_value=('converted_model', 'converted_weights') + ) + + config = { + 'framework': 'dlsdk', + 'caffe_model': '/path/to/source_models/custom_model', + 'caffe_weights': '/path/to/source_models/custom_weights', + "device": 'cpu', + 'bitstream': Path('custom_bitstream'), + '_models_prefix': '/path/to/source_models', + 'adapter': 'classification' + } + DLSDKLauncher(config, dummy_adapter) + + mock.assert_called_once_with( + 'custom_model', '/path/to/source_models/custom_model', '/path/to/source_models/custom_weights', 'caffe', + [], None, None, None, None + ) + + def test_model_converted_with_mo_params(self, mocker): + mock = mocker.patch( + 'accuracy_checker.launcher.dlsdk_launcher.convert_model', + return_value=('converted_model', 'converted_weights') + ) + + config = { + 'framework': "dlsdk", + 'caffe_model': '/path/to/source_models/custom_model', + 'caffe_weights': '/path/to/source_models/custom_weights', + 'device': 'cpu', + 'bitstream': Path('custom_bitstream'), + '_models_prefix': '/path/to/source_models', + 'mo_params': {'data_type': 'FP16'}, + 'adapter': 'classification' + } + DLSDKLauncher(config, dummy_adapter) + + mock.assert_called_once_with( + 'custom_model', '/path/to/source_models/custom_model', '/path/to/source_models/custom_weights', 'caffe', + [], {'data_type': 'FP16'}, None, None, None + ) + + def test_model_converted_with_mo_flags(self, mocker): + mock = mocker.patch( + 'accuracy_checker.launcher.dlsdk_launcher.convert_model', + return_value=('converted_model', 'converted_weights') + ) + + config = { + 'framework': 'dlsdk', + 'caffe_model': '/path/to/source_models/custom_model', + 'caffe_weights': '/path/to/source_models/custom_weights', + 'device': 'cpu', + 'bitstream': Path('custom_bitstream'), + '_models_prefix': '/path/to/source_models', + 'mo_flags': ['reverse_input_channels'], + 'adapter': 'classification' + } + + DLSDKLauncher(config, dummy_adapter) + + mock.assert_called_once_with( + 'custom_model', '/path/to/source_models/custom_model', '/path/to/source_models/custom_weights', 'caffe', + [], None, 
['reverse_input_channels'], None, None + ) + + def test_model_converted_to_output_dir_in_mo_params(self, mocker): + config = { + 'framework': 'dlsdk', + 'tf_model': '/path/to/source_models/custom_model', + 'device': 'cpu', + '_models_prefix': '/path/to', + 'adapter': 'classification', + 'mo_params': {'output_dir': '/path/to/output/models'} + } + mocker.patch('accuracy_checker.launcher.model_conversion.find_mo', return_value='ModelOptimizer') + prepare_args_patch = mocker.patch('accuracy_checker.launcher.model_conversion.prepare_args') + args = { + 'input_model': '/path/to/source_models/custom_model', + 'model_name': 'custom_model', + 'output_dir': '/path/to/output/models', + 'framework': 'tf' + } + + mocker.patch( + 'accuracy_checker.launcher.model_conversion.exec_mo_binary', + return_value=subprocess.CompletedProcess(args, returncode=0) + ) + DLSDKLauncher(config, dummy_adapter) + prepare_args_patch.assert_called_once_with('ModelOptimizer', flag_options=[], value_options=args) + + def test_model_converted_from_tf(self, mocker): + mock = mocker.patch( + 'accuracy_checker.launcher.dlsdk_launcher.convert_model', + return_value=('converted_model', 'converted_weights') + ) + + config = { + 'framework': 'dlsdk', + 'tf_model': '/path/to/source_models/custom_model', + 'device': 'cpu', + '_models_prefix': '/path/to/source_models', + 'adapter': 'classification' + } + DLSDKLauncher(config, dummy_adapter) + + mock.assert_called_once_with( + 'custom_model', '/path/to/source_models/custom_model', '', 'tf', [], None, None, None, None + ) + + def test_model_converted_from_tf_with_arg_path_to_custom_tf_config(self, mocker): + config = { + 'framework': 'dlsdk', + 'tf_model': '/path/to/source_models/custom_model', + 'device': 'cpu', + '_models_prefix': '/path/to', + 'adapter': 'classification', + 'mo_params': {'tensorflow_use_custom_operations_config': 'ssd_v2_support.json'}, + '_tf_custom_op_config_dir': 'config/dir' + } + mocker.patch('accuracy_checker.launcher.model_conversion.find_mo', return_value=Path('/path/ModelOptimizer')) + prepare_args_patch = mocker.patch('accuracy_checker.launcher.model_conversion.prepare_args') + + args = { + 'input_model': '/path/to/source_models/custom_model', + 'model_name': 'custom_model', + 'framework': 'tf', + 'tensorflow_use_custom_operations_config': 'config/dir/ssd_v2_support.json' + } + + mocker.patch( + 'accuracy_checker.launcher.model_conversion.exec_mo_binary', + return_value=subprocess.CompletedProcess(args, returncode=0) + ) + DLSDKLauncher(config, dummy_adapter) + prepare_args_patch.assert_called_once_with('/path/ModelOptimizer', flag_options=[], value_options=args) + + def test_model_converted_from_tf_with_default_path_to_custom_tf_config(self, mocker): + config = { + 'framework': 'dlsdk', + 'tf_model': '/path/to/source_models/custom_model', + 'device': 'cpu', + '_models_prefix': '/path/to', + 'adapter': 'classification', + 'mo_params': {'tensorflow_use_custom_operations_config': 'config.json'} + } + mocker.patch('accuracy_checker.launcher.model_conversion.find_mo', return_value=Path('/path/ModelOptimizer')) + prepare_args_patch = mocker.patch('accuracy_checker.launcher.model_conversion.prepare_args') + + args = { + 'input_model': '/path/to/source_models/custom_model', + 'model_name': 'custom_model', + 'framework': 'tf', + 'tensorflow_use_custom_operations_config': '/path/extensions/front/tf/config.json' + } + + mocker.patch( + 'accuracy_checker.launcher.model_conversion.exec_mo_binary', + return_value=subprocess.CompletedProcess(args, returncode=0) + ) + 
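# constructing the launcher is what triggers the mocked Model Optimizer invocation checked below + 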
DLSDKLauncher(config, dummy_adapter) + prepare_args_patch.assert_called_once_with('/path/ModelOptimizer', flag_options=[], value_options=args) + + def test_model_converted_from_tf_with_default_path_to_obj_detection_api_config(self, mocker): + config = { + 'framework': 'dlsdk', + 'tf_model': '/path/to/source_models/custom_model', + 'device': 'cpu', + '_models_prefix': '/path/to', + 'adapter': 'classification', + 'mo_params': {'tensorflow_object_detection_api_pipeline_config': 'operations.config'}, + '_tf_obj_detection_api_pipeline_config_path': None + } + mocker.patch('accuracy_checker.launcher.model_conversion.find_mo', return_value=Path('/path/ModelOptimizer')) + prepare_args_patch = mocker.patch('accuracy_checker.launcher.model_conversion.prepare_args') + + args = { + 'input_model': '/path/to/source_models/custom_model', + 'model_name': 'custom_model', + 'framework': 'tf', + 'tensorflow_object_detection_api_pipeline_config': '/path/to/source_models/operations.config' + } + + mocker.patch( + 'accuracy_checker.launcher.model_conversion.exec_mo_binary', + return_value=subprocess.CompletedProcess(args, returncode=0) + ) + DLSDKLauncher(config, dummy_adapter) + prepare_args_patch.assert_called_once_with('/path/ModelOptimizer', flag_options=[], value_options=args) + + def test_model_converted_from_tf_with_arg_path_to_obj_detection_api_config(self, mocker): + config = { + 'framework': 'dlsdk', + 'tf_model': '/path/to/source_models/custom_model', + 'device': 'cpu', + '_models_prefix': '/path/to', + 'adapter': 'classification', + 'mo_params': {'tensorflow_object_detection_api_pipeline_config': 'operations.config'}, + '_tf_custom_op_config_dir': 'config/dir', + '_tf_obj_detection_api_pipeline_config_path': 'od_api' + } + mocker.patch('accuracy_checker.launcher.model_conversion.find_mo', return_value=Path('/path/ModelOptimizer')) + prepare_args_patch = mocker.patch('accuracy_checker.launcher.model_conversion.prepare_args') + + args = { + 'input_model': '/path/to/source_models/custom_model', + 'model_name': 'custom_model', + 'framework': 'tf', + 'tensorflow_object_detection_api_pipeline_config': 'od_api/operations.config' + } + + mocker.patch( + 'accuracy_checker.launcher.model_conversion.exec_mo_binary', + return_value=subprocess.CompletedProcess(args, returncode=0) + ) + DLSDKLauncher(config, dummy_adapter) + prepare_args_patch.assert_called_once_with('/path/ModelOptimizer', flag_options=[], value_options=args) + + def test_model_converted_from_mxnet(self, mocker): + mock = mocker.patch( + 'accuracy_checker.launcher.dlsdk_launcher.convert_model', + return_value=('converted_model', 'converted_weights') + ) + + config = { + 'framework': 'dlsdk', + 'mxnet_weights': '/path/to/source_models/custom_weights', + 'device': 'cpu', + '_models_prefix': '/path/to/source_models', + 'adapter': 'classification' + } + DLSDKLauncher(config, dummy_adapter) + + mock.assert_called_once_with( + 'custom_weights', '', '/path/to/source_models/custom_weights', 'mxnet', [], None, None, None, None + ) + + def test_model_converted_from_onnx(self, mocker): + mock = mocker.patch( + 'accuracy_checker.launcher.dlsdk_launcher.convert_model', + return_value=('converted_model', 'converted_weights') + ) + + config = { + 'framework': 'dlsdk', + 'onnx_model': '/path/to/source_models/custom_model', + 'device': 'cpu', + '_models_prefix': '/path/to/source_models', + 'adapter': 'classification' + } + DLSDKLauncher(config, dummy_adapter) + + mock.assert_called_once_with( + 'custom_model', '/path/to/source_models/custom_model', '', 'onnx', 
[], None, None, None, None + ) + + def test_model_converted_from_kaldi(self, mocker): + mock = mocker.patch( + 'accuracy_checker.launcher.dlsdk_launcher.convert_model', + return_value=('converted_model', 'converted_weights') + ) + + config = { + 'framework': 'dlsdk', + 'kaldi_model': '/path/to/source_models/custom_model', + 'device': 'cpu', + '_models_prefix': '/path/to/source_models', + 'adapter': 'classification' + } + DLSDKLauncher(config, dummy_adapter) + + mock.assert_called_once_with( + 'custom_model', '/path/to/source_models/custom_model', '', 'kaldi', [], None, None, None, None + ) + + def test_raises_with_multiple_models_caffe_dlsdk(self): + config = { + 'framework': 'dlsdk', + 'caffe_model': 'caffe_model', + 'caffe_weights': 'caffe_weights', + 'model': 'custom_model', + 'weights': 'custom_weights', + 'device': 'cpu', + '_models_prefix': 'prefix' + } + + with pytest.raises(ConfigError): + DLSDKLauncher(config, dummy_adapter) + + def test_raises_with_multiple_models_tf_dlsdk(self): + config = { + 'framework': 'dlsdk', + 'tf_model': 'tf_model', + 'model': 'custom_model', + 'weights': 'custom_weights', + 'device': 'cpu', + '_models_prefix': 'prefix' + } + + with pytest.raises(ConfigError): + DLSDKLauncher(config, dummy_adapter) + + def test_raises_with_multiple_models_mxnet_dlsdk(self): + config = { + 'framework': 'dlsdk', + 'mxnet_weights': 'mxnet_weights', + 'model': 'custom_model', + 'weights': 'custom_weights', + 'device': 'cpu', + '_models_prefix': 'prefix' + } + + with pytest.raises(ConfigError): + DLSDKLauncher(config, dummy_adapter) + + def test_raises_with_multiple_models_onnx_dlsdk(self): + config = { + 'framework': 'dlsdk', + 'onnx_model': 'onnx_model', + 'model': 'custom_model', + 'weights': 'custom_weights', + 'device': 'cpu', + '_models_prefix': 'prefix' + } + + with pytest.raises(ConfigError): + DLSDKLauncher(config, dummy_adapter) + + def test_raises_with_multiple_models_kaldi_dlsdk(self): + config = { + 'framework': 'dlsdk', + 'kaldi_model': 'kaldi_model', + 'model': 'custom_model', + 'weights': 'custom_weights', + 'device': 'cpu', + '_models_prefix': 'prefix' + } + + with pytest.raises(ConfigError): + DLSDKLauncher(config, dummy_adapter) + + def test_raises_with_multiple_models_mxnet_caffe(self): + config = { + 'framework': 'dlsdk', + 'mxnet_weights': 'mxnet_weights', + 'caffe_model': 'caffe_model', + 'caffe_weights': 'caffe_weights', + 'device': 'cpu', + '_models_prefix': 'prefix' + } + + with pytest.raises(ConfigError): + DLSDKLauncher(config, dummy_adapter) + + def test_raises_with_multiple_models_tf_caffe(self): + config = { + 'framework': 'dlsdk', + 'tf_model': 'tf_model', + 'caffe_model': 'caffe_model', + 'caffe_weights': 'caffe_weights', + 'device': 'cpu', + '_models_prefix': 'prefix' + } + + with pytest.raises(ConfigError): + DLSDKLauncher(config, dummy_adapter) + + def test_raises_with_multiple_models_onnx_caffe(self): + config = { + 'framework': 'dlsdk', + 'onnx_model': 'onnx_model', + 'caffe_model': 'caffe_model', + 'caffe_weights': 'caffe_weights', + 'device': 'cpu', + '_models_prefix': 'prefix' + } + + with pytest.raises(ConfigError): + DLSDKLauncher(config, dummy_adapter) + + def test_raises_with_multiple_models_mxnet_tf(self): + config = { + 'framework': 'dlsdk', + 'mxnet_weights': 'mxnet_weights', + 'tf_model': 'tf_model', + 'device': 'cpu', + '_models_prefix': 'prefix' + } + + with pytest.raises(ConfigError): + DLSDKLauncher(config, dummy_adapter) + + def test_raises_with_multiple_models_onnx_tf(self): + config = { + 'framework': 'dlsdk', + 
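# naming two source frameworks (ONNX and TF here) in one config is ambiguous and must be rejected + 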
'onnx_model': 'onnx_model', + 'tf_model': 'tf_model', + 'device': 'cpu', + '_models_prefix': 'prefix' + } + + with pytest.raises(ConfigError): + DLSDKLauncher(config, dummy_adapter) + + def test_raises_with_multiple_models_mxnet_caffe_tf(self): + config = { + 'framework': 'dlsdk', + 'mxnet_weights': 'mxnet_weights', + 'caffe_model': 'caffe_model', + 'caffe_weights': 'caffe_weights', + 'tf_model': 'tf_model', + 'device': 'cpu', + '_models_prefix': 'prefix' + } + + with pytest.raises(ConfigError): + DLSDKLauncher(config, dummy_adapter) + + def test_raises_with_multiple_models_dlsdk_caffe_tf(self): + config = { + 'framework': 'dlsdk', + 'model': 'custom_model', + 'weights': 'custom_weights', + 'caffe_model': 'caffe_model', + 'caffe_weights': 'caffe_weights', + 'tf_model': 'tf_model', + 'device': 'cpu', + '_models_prefix': 'prefix' + } + + with pytest.raises(ConfigError): + DLSDKLauncher(config, dummy_adapter) + + def test_raises_with_multiple_models_dlsdk_caffe_onnx(self): + config = { + 'framework': 'dlsdk', + 'model': 'custom_model', + 'weights': 'custom_weights', + 'caffe_model': 'caffe_model', + 'caffe_weights': 'caffe_weights', + 'onnx_model': 'onnx_model', + 'device': 'cpu', + '_models_prefix': 'prefix' + } + + with pytest.raises(ConfigError): + DLSDKLauncher(config, dummy_adapter) + + def test_raises_with_multiple_models_dlsdk_caffe_mxnet(self): + config = { + 'framework': 'dlsdk', + 'model': 'custom_model', + 'weights': 'custom_weights', + 'caffe_model': 'caffe_model', + 'caffe_weights': 'caffe_weights', + 'mxnet_weights': 'mxnet_weights', + 'device': 'cpu', + '_models_prefix': 'prefix' + } + + with pytest.raises(ConfigError): + DLSDKLauncher(config, dummy_adapter) + + def test_raises_with_multiple_models_dlsdk_tf_mxnet(self): + config = { + 'framework': 'dlsdk', + 'model': 'custom_model', + 'weights': 'custom_weights', + 'mxnet_weights': 'mxnet_weights', + 'tf_model': 'tf_model', + 'device': 'cpu', + '_models_prefix': 'prefix' + } + + with pytest.raises(ConfigError): + DLSDKLauncher(config, dummy_adapter) + + def test_raises_with_multiple_models_dlsdk_tf_onnx(self): + config = { + 'framework': 'dlsdk', + 'model': 'custom_model', + 'weights': 'custom_weights', + 'onnx_model': 'onnx_model', + 'tf_model': 'tf_model', + 'device': 'cpu', + '_models_prefix': 'prefix' + } + + with pytest.raises(ConfigError): + DLSDKLauncher(config, dummy_adapter) + + def test_raises_with_multiple_models_dlsdk_tf_mxnet_caffe(self): + config = { + 'framework': 'dlsdk', + 'model': 'custom_model', + 'weights': 'custom_weights', + 'caffe_model': 'caffe_model', + 'caffe_weights': 'caffe_weights', + 'mxnet_weights': 'mxnet_weights', + 'tf_model': 'tf_model', + 'device': 'cpu', + '_models_prefix': 'prefix' + } + + with pytest.raises(ConfigError): + DLSDKLauncher(config, dummy_adapter) + + def test_raises_with_multiple_models_dlsdk_tf_mxnet_caffe_onnx(self): + config = { + 'framework': 'dlsdk', + 'model': 'custom_model', + 'weights': 'custom_weights', + 'caffe_model': 'caffe_model', + 'caffe_weights': 'caffe_weights', + 'mxnet_weights': 'mxnet_weights', + 'onnx_model': 'onnx_model', + 'tf_model': 'tf_model', + 'device': 'cpu', + '_models_prefix': 'prefix' + } + + with pytest.raises(ConfigError): + DLSDKLauncher(config, dummy_adapter) + + +@pytest.mark.usefixtures('mock_path_exists', 'mock_inputs', 'mock_inference_engine') +class TestDLSDKLauncherConfig: + def setup(self): + self.launcher = { + 'model': 'foo.xml', + 'weights': 'foo.bin', + 'device': 'CPU', + 'framework': 'dlsdk', + 'adapter': 'classification', + 
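# baseline valid launcher config; individual tests tweak single fields via update_dict + 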
'_models_prefix': 'prefix' + } + self.config = DLSDKLauncherConfig('dlsdk_launcher') + + def test_hetero_correct(self): + self.config.validate(update_dict(self.launcher, device='HETERO:CPU')) + self.config.validate(update_dict(self.launcher, device='HETERO:CPU,FPGA')) + + def test_hetero_endswith_comma(self): + with pytest.raises(ConfigError): + self.config.validate(update_dict(self.launcher, device='HETERO:CPU,FPGA,')) + + def test_normal_multiple_devices(self): + with pytest.raises(ConfigError): + self.config.validate(update_dict(self.launcher, device='CPU,FPGA')) + + def test_hetero_empty(self): + with pytest.raises(ConfigError): + self.config.validate(update_dict(self.launcher, device='HETERO:')) + + def test_normal(self): + self.config.validate(update_dict(self.launcher, device='CPU')) + + def test_missed_model_in_create_dlsdk_launcher_raises_config_error_exception(self): + config = {'framework': 'dlsdk', 'weights': 'custom', 'adapter': 'classification', 'device': 'cpu'} + + with pytest.raises(ConfigError): + create_launcher(config) + + def test_missed_weights_in_create_dlsdk_launcher_raises_config_error_exception(self): + launcher = {'framework': 'dlsdk', 'model': 'custom', 'adapter': 'ssd', 'device': 'cpu'} + + with pytest.raises(ConfigError): + create_launcher(launcher) + + def test_missed_adapter_in_create_dlsdk_launcher_raises_config_error_exception(self): + launcher_config = {'framework': 'dlsdk', 'model': 'custom', 'weights': 'custom'} + + with pytest.raises(ConfigError): + create_launcher(launcher_config) + + def test_undefined_str_adapter_in_create_dlsdk_launcher_raises_config_error_exception(self): + launcher_config = {'framework': 'dlsdk', 'model': 'custom', 'weights': 'custom', 'adapter': 'undefined_str'} + + with pytest.raises(ConfigError): + create_launcher(launcher_config) + + def test_empty_dir_adapter_in_create_dlsdk_launcher_raises_config_error_exception(self): + launcher_config = {'framework': 'dlsdk', 'model': 'custom', 'weights': 'custom', 'adapter': {}} + + with pytest.raises(ConfigError): + create_launcher(launcher_config) + + def test_missed_type_in_dir_adapter_in_create_dlsdk_launcher_raises_config_error_exception(self): + launcher_config = {'framework': 'dlsdk', 'model': 'custom', 'weights': 'custom', 'adapter': {'key': 'val'}} + + with pytest.raises(ConfigError): + create_launcher(launcher_config) + + def test_undefined_type_in_dir_adapter_in_create_dlsdk_launcher_raises_config_error_exception(self): + launcher_config = { + 'framework': 'dlsdk', + 'model': 'custom', + 'weights': 'custom', + 'adapter': {'type': 'undefined'} + } + + with pytest.raises(ConfigError): + create_launcher(launcher_config) + + def test_dlsdk_launcher(self): + launcher = { + 'framework': 'dlsdk', 'model': 'custom', 'weights': 'custom', 'adapter': 'ssd', 'device': 'cpu', + '_models_prefix': 'models' + } + create_launcher(launcher) + + def test_dlsdk_launcher_model_with_several_image_inputs_raise_value_error(self, mocker): + launcher_config = {'framework': 'dlsdk', 'model': 'custom', 'weights': 'custom', 'adapter': {'key': 'val'}} + + with pytest.raises(ValueError): + mocker.patch( + 'accuracy_checker.launcher.dlsdk_launcher.DLSDKLauncher.inputs', + new_callable=PropertyMock(return_value={'data1': [3, 227, 227], 'data2': [3, 227, 227]}) + ) + create_launcher(launcher_config) + + def test_dlsdk_launcher_model_no_image_inputs_raise_value_error(self): + launcher_config = {'framework': 'dlsdk', 'model': 'custom', 'weights': 'custom', 'adapter': {'key': 'val'}} + + with 
pytest.raises(ValueError): + create_launcher(launcher_config) + + +def dummy_adapter(): + pass diff --git a/tools/accuracy_checker/tests/test_input_feeder.py b/tools/accuracy_checker/tests/test_input_feeder.py new file mode 100644 index 0000000..a4b5e14 --- /dev/null +++ b/tools/accuracy_checker/tests/test_input_feeder.py @@ -0,0 +1,255 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import pytest +import re +import numpy as np +from accuracy_checker.config import ConfigError +from accuracy_checker.launcher.input_feeder import InputFeeder +from accuracy_checker.dataset import DataRepresentation + +# InputInfo from openvino is needed here, but there is no appropriate API +# to create an InputInfo with a specific shape, so this analog is used instead +class InputInfo_test: + layout = '' + precision = '' + shape = [] + def __init__(self, layout='', precision='', shape=None): + self.layout = layout + self.precision = precision + self.shape = shape if shape is not None else [] + +class TestInputFeeder: + def test_create_input_feeder_without_inputs_raise_config_error(self): + with pytest.raises(ConfigError): + InputFeeder([], {}) + + def test_create_input_feeder_with_config_inputs_and_empty_network_inputs_raise_config_error(self): + with pytest.raises(ConfigError): + InputFeeder([{'name': 'const_data', 'type': 'CONST_INPUT', 'value': '[1, 1, 1, 1]'}], {}) + + def test_create_input_feeder_with_config_const_inputs_not_in_network_inputs_raise_config_error(self): + with pytest.raises(ConfigError): + InputFeeder([{'name': 'const_data', 'type': 'CONST_INPUT', 'value': '[1, 1, 1, 1]'}], {'data': (1, 3, 10, 10)}) + + def test_create_input_feeder_with_config_inputs_not_in_network_inputs_raise_config_error(self): + with pytest.raises(ConfigError): + InputFeeder([{'name': 'data2', 'type': 'INPUT', 'value': '.'}], {'data': (1, 3, 10, 10)}) + + def test_create_input_feeder_without_config_inputs(self): + input_feeder = InputFeeder([], {'data': (1, 3, 10, 10)}) + assert not input_feeder.const_inputs + assert not input_feeder.inputs_mapping + assert input_feeder.non_constant_inputs == ['data'] + + def test_create_input_feeder_config_inputs_fully_match_to_network_inputs(self): + input_feeder = InputFeeder([{'name': 'data', 'type': 'INPUT', 'value': '.'}], {'data': (1, 3, 10, 10)}) + assert not input_feeder.const_inputs + assert input_feeder.inputs_mapping == {'data': re.compile('.')} + assert input_feeder.non_constant_inputs == ['data'] + + def test_create_input_feeder_config_inputs_contain_only_const_inputs_with_list_value(self): + input_feeder = InputFeeder([{'name': 'const_data', 'type': 'CONST_INPUT', 'value': [1, 1, 1, 1]}], {'data': (1, 3, 10, 10), 'const_data': (1, 4)}) + assert np.array_equal(input_feeder.const_inputs['const_data'], np.ones(4)) + assert not input_feeder.inputs_mapping + assert input_feeder.non_constant_inputs == ['data'] + + def test_create_input_feeder_config_inputs_contain_only_const_inputs_with_not_list_value(self): + input_feeder = InputFeeder( + [{'name': 'const_data', 'type': 
'CONST_INPUT', 'value': 'value'}], + {'data': (1, 3, 10, 10), 'const_data': (1, 4)} + ) + assert input_feeder.const_inputs['const_data'] == 'value' + assert not input_feeder.inputs_mapping + assert input_feeder.non_constant_inputs == ['data'] + + def test_create_input_feeder_not_all_non_constant_inputs_in_config_raise_config_error(self): + with pytest.raises(ConfigError): + InputFeeder( + [{'name': '0', 'type': 'INPUT', 'value': '.'}], + {'0': (1, 3, 10, 10), '1': (1, 3, 10, 10)} + ) + + def test_fill_non_constant_input_with_one_input_without_specific_mapping_batch_1(self): + input_feeder = InputFeeder([], {'input': InputInfo_test(shape=(1, 3, 10, 10))}) + result = input_feeder.fill_non_constant_inputs([DataRepresentation(np.zeros((10, 10, 3)), identifier='0')])[0] + expected_data = np.zeros((1, 3, 10, 10)) + assert 'input' in result + assert np.array_equal(result['input'], expected_data) + + def test_fill_non_constant_input_without_specific_mapping_batch_2(self): + input_feeder = InputFeeder([], {'input': InputInfo_test(shape=(1, 3, 10, 10))}) + result = input_feeder.fill_non_constant_inputs([ + DataRepresentation(np.zeros((10, 10, 3)), identifier='0'), + DataRepresentation(np.zeros((10, 10, 3)), identifier='1') + ])[0] + expected_data = np.zeros((2, 3, 10, 10)) + assert 'input' in result + assert np.array_equal(result['input'], expected_data) + + def test_fill_non_constant_input_with_specific_mapping_batch_1(self): + input_feeder = InputFeeder([{'name': 'input', 'type': 'INPUT', 'value': '.'}], {'input': InputInfo_test(shape=(1, 3, 10, 10))}) + result = input_feeder.fill_non_constant_inputs([DataRepresentation(np.zeros((10, 10, 3)), identifier='0')])[0] + expected_data = np.zeros((1, 3, 10, 10)) + assert 'input' in result + assert np.array_equal(result['input'], expected_data) + + def test_fill_non_constant_input_with_specific_mapping_several_image_matched(self): + input_feeder = InputFeeder([{'name': 'input', 'type': 'INPUT', 'value': '.'}], {'input': InputInfo_test(shape=(1, 3, 10, 10))}) + result = input_feeder.fill_non_constant_inputs([DataRepresentation([np.zeros((10, 10, 3)), np.ones((10, 10, 3))], identifier=['0', '1'])])[0] + expected_data = np.zeros((1, 3, 10, 10)) + assert 'input' in result + assert np.array_equal(result['input'], expected_data) + + def test_fill_non_constant_input_with_specific_mapping_not_match_raise_config_error(self): + input_feeder = InputFeeder([{'name': 'input', 'type': 'INPUT', 'value': '1.'}], {'input': InputInfo_test(shape=(1, 3, 10, 10))}) + with pytest.raises(ConfigError): + input_feeder.fill_non_constant_inputs([DataRepresentation(np.zeros((10, 10, 3)), identifier='0')]) + + def test_fill_non_constant_input_with_specific_mapping_batch_2(self): + input_feeder = InputFeeder([{'name': 'input', 'type': 'INPUT', 'value': '.'}], {'input': InputInfo_test(shape=(1, 3, 10, 10))}) + result = input_feeder.fill_non_constant_inputs([ + DataRepresentation(np.zeros((10, 10, 3)), identifier='0'), + DataRepresentation(np.zeros((10, 10, 3)), identifier='1') + ])[0] + expected_data = np.zeros((2, 3, 10, 10)) + assert 'input' in result + assert np.array_equal(result['input'], expected_data) + + def test_fill_non_constant_input_with_specific_mapping_not_all_image_in_batch_matched_raise_config_error(self): + input_feeder = InputFeeder([{'name': 'input', 'type': 'INPUT', 'value': '0+'}], {'input': InputInfo_test(shape=(1, 3, 10, 10))}) + with pytest.raises(ConfigError): + input_feeder.fill_non_constant_inputs([ + DataRepresentation(np.zeros((10, 10, 3)), 
identifier='0'), + DataRepresentation(np.zeros((10, 10, 3)), identifier='1') + ]) + + def test_fill_non_constant_inputs_without_specific_mapping_batch_1(self): + input_feeder = InputFeeder([], { 'input1': InputInfo_test(shape=(1, 3, 10, 10)), 'input2': InputInfo_test(shape=(1, 3, 10, 10))}) + result = input_feeder.fill_non_constant_inputs([DataRepresentation(np.zeros((10, 10, 3)), identifier='0')])[0] + expected_data = np.zeros((1, 3, 10, 10)) + assert 'input1' in result + assert np.array_equal(result['input1'], expected_data) + assert 'input2' in result + assert np.array_equal(result['input2'], expected_data) + + def test_fill_non_constant_inputs_without_specific_mapping_batch_2(self): + input_feeder = InputFeeder([], {'input1': InputInfo_test(shape=(1, 3, 10, 10)), 'input2': InputInfo_test(shape = (1, 3, 10, 10))}) + result = input_feeder.fill_non_constant_inputs([ + DataRepresentation(np.zeros((10, 10, 3)), identifier='0'), + DataRepresentation(np.zeros((10, 10, 3)), identifier='1') + ])[0] + expected_data = np.zeros((2, 3, 10, 10)) + assert 'input1' in result + assert np.array_equal(result['input1'], expected_data) + assert 'input2' in result + assert np.array_equal(result['input2'], expected_data) + + def test_fill_non_constant_inputs_with_specific_mapping_batch_1(self): + input_feeder = InputFeeder( + [{'name': 'input1', 'type': 'INPUT', 'value': '0'}, {'name': 'input2', 'type': 'INPUT', 'value': '1'}], + {'input1': InputInfo_test(shape=(1, 3, 10, 10)), 'input2': InputInfo_test(shape=(1, 3, 10, 10))} + ) + result = input_feeder.fill_non_constant_inputs( + [DataRepresentation([np.zeros((10, 10, 3)), np.ones((10, 10, 3))],identifier=['0', '1'])] + )[0] + expected_data = [np.zeros((1, 3, 10, 10)), np.ones((1, 3, 10, 10))] + assert 'input1' in result + assert np.array_equal(result['input1'], expected_data[0]) + assert 'input2' in result + assert np.array_equal(result['input2'], expected_data[1]) + + def test_fill_non_constant_inputs_with_specific_mapping_not_match_raise_config_error(self): + input_feeder = InputFeeder( + [{'name': 'input1', 'type': 'INPUT', 'value': '0'}, {'name': 'input2', 'type': 'INPUT', 'value': '1'}], + {'input1': InputInfo_test(shape=(1, 3, 10, 10)), 'input2': InputInfo_test(shape=(1, 3, 10, 10))} + ) + with pytest.raises(ConfigError): + input_feeder.fill_non_constant_inputs([DataRepresentation([np.zeros((10, 10, 3)), np.ones((10, 10, 3))], identifier=['0', '2'])]) + + def test_fill_non_constant_inputs_with_specific_mapping_batch_2(self): + input_feeder = InputFeeder( + [{'name': 'input1', 'type': 'INPUT', 'value': '0'}, {'name': 'input2', 'type': 'INPUT', 'value': '1'}], + { 'input1': InputInfo_test(shape = (1, 3, 10, 10)), 'input2': InputInfo_test(shape=(1, 3, 10, 10))} + ) + result = input_feeder.fill_non_constant_inputs([ + DataRepresentation([np.zeros((10, 10, 3)), np.ones((10, 10, 3))], identifier=['0', '1']), + DataRepresentation([np.zeros((10, 10, 3)), np.ones((10, 10, 3))], identifier=['0', '1']) + ])[0] + expected_data = [np.zeros((2, 3, 10, 10)), np.ones((2, 3, 10, 10))] + assert 'input1' in result + assert np.array_equal(result['input1'], expected_data[0]) + assert 'input2' in result + assert np.array_equal(result['input2'], expected_data[1]) + + def test_fill_non_constant_inputs_with_specific_mapping_not_all_image_in_batch_matched_raise_config_error(self): + input_feeder = InputFeeder( + [{'name': 'input1', 'type': 'INPUT', 'value': '0'}, {'name': 'input2', 'type': 'INPUT', 'value': '1'}], + {'input1': (1, 3, 10, 10), 'input2': (1, 3, 10, 10)} + ) + 
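# identifier '2' in the second batch item matches neither the '0' nor the '1' input pattern, so matching must fail + 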
with pytest.raises(ConfigError): + input_feeder.fill_non_constant_inputs([ + DataRepresentation([np.zeros((10, 10, 3)), np.ones((10, 10, 3))], identifier=['0', '1']), + DataRepresentation([np.zeros((10, 10, 3)), np.ones((10, 10, 3))], identifier=['0', '2']) + ]) + + def test_fill_non_const_input_with_multi_infer_data_batch_1(self): + input_feeder = InputFeeder({}, {'input': (1, 3, 10, 10)}) + result = input_feeder.fill_non_constant_inputs([ + DataRepresentation([np.zeros((10, 10, 3)), np.ones((10, 10, 3))], {'multi_infer': True}, identifier='0') + ]) + expected = [{'input': np.zeros((1, 3, 10, 10))}, {'input': np.ones((1, 3, 10, 10))}] + assert len(result) == len(expected) + assert np.array_equal(result[0]['input'], expected[0]['input']) + assert np.array_equal(result[1]['input'], expected[1]['input']) + + def test_fill_non_const_input_with_multi_infer_data_batch_2(self): + input_feeder = InputFeeder({}, {'input': (2, 3, 10, 10)}) + result = input_feeder.fill_non_constant_inputs([ + DataRepresentation( + [np.zeros((10, 10, 3)), np.ones((10, 10, 3))], + {'multi_infer': True}, + identifier='0' + ), + DataRepresentation( + [np.zeros((10, 10, 3)), np.ones((10, 10, 3))], + {'multi_infer': True}, + identifier='1' + ), + ]) + expected = [{'input': np.zeros((2, 3, 10, 10))}, {'input': np.ones((2, 3, 10, 10))}] + assert len(result) == len(expected) + assert np.array_equal(result[0]['input'], expected[0]['input']) + assert np.array_equal(result[1]['input'], expected[1]['input']) + + def test_fill_non_const_input_with_multi_infer_not_consistent_data_batch_2(self): + input_feeder = InputFeeder({}, {'input': (2, 3, 10, 10)}) + result = input_feeder.fill_non_constant_inputs([ + DataRepresentation( + [np.zeros((10, 10, 3))], + {'multi_infer': True}, + identifier='0' + ), + DataRepresentation( + [np.zeros((10, 10, 3)), np.ones((10, 10, 3))], + {'multi_infer': True}, + identifier='1' + ), + ]) + expected = [{'input': np.zeros((2, 3, 10, 10))}, {'input': np.ones((1, 3, 10, 10))}] + assert len(result) == len(expected) + assert np.array_equal(result[0]['input'], expected[0]['input']) + assert np.array_equal(result[1]['input'], expected[1]['input']) diff --git a/tools/accuracy_checker/tests/test_metric_evaluator.py b/tools/accuracy_checker/tests/test_metric_evaluator.py new file mode 100644 index 0000000..7b4c9e8 --- /dev/null +++ b/tools/accuracy_checker/tests/test_metric_evaluator.py @@ -0,0 +1,549 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import pytest +from accuracy_checker.config import ConfigError +from accuracy_checker.metrics import ClassificationAccuracy, MetricsExecutor +from accuracy_checker.metrics.metric import Metric +from accuracy_checker.representation import ( + ClassificationAnnotation, + ClassificationPrediction, + ContainerAnnotation, + ContainerPrediction, + DetectionAnnotation, + DetectionPrediction +) +from .common import DummyDataset + + +class TestMetric: + def setup_method(self): + self.module = 'accuracy_checker.metrics.metric_evaluator' + + def test_missed_metrics_raises_config_error_exception(self): + config = {'annotation': 'custom'} + + with pytest.raises(ConfigError): + MetricsExecutor(config, None) + + def test_missed_metrics_raises_config_error_exception_with_custom_name(self): + config = {'name': 'some_name', 'annotation': 'custom'} + + with pytest.raises(ConfigError): + MetricsExecutor(config, None) + + def test_empty_metrics_raises_config_error_exception(self): + config = {'annotation': 'custom', 'metrics': []} + + with pytest.raises(ConfigError): + MetricsExecutor(config, None) + + def test_metrics_with_empty_entry_raises_config_error_exception(self): + config = {'annotation': 'custom', 'metrics': [{}]} + + with pytest.raises(ConfigError): + MetricsExecutor(config, None) + + def test_missed_metric_type_raises_config_error_exception(self): + config = {'annotation': 'custom', 'metrics': [{'undefined': ''}]} + + with pytest.raises(ConfigError): + MetricsExecutor(config, None) + + def test_undefined_metric_type_raises_config_error_exception(self): + config = {'annotation': 'custom', 'metrics': [{'type': ''}]} + + with pytest.raises(ConfigError): + MetricsExecutor(config, None) + + def test_accuracy_arguments(self): + config = {'annotation': 'custom', 'metrics': [{'type': 'accuracy', 'top_k': 1}]} + + dispatcher = MetricsExecutor(config, None) + assert len(dispatcher.metrics) == 1 + _, accuracy_metric, _, _, _ = dispatcher.metrics[0] + assert isinstance(accuracy_metric, ClassificationAccuracy) + assert accuracy_metric.top_k == 1 + + def test_accuracy_with_several_annotation_source_raises_config_error_exception(self): + config = { + 'annotation': 'custom', + 'metrics': [{'type': 'accuracy', 'top_k': 1, 'annotation_source': 'annotation1, annotation2'}] + } + with pytest.raises(ConfigError): + MetricsExecutor(config, None) + + def test_accuracy_with_several_prediction_source_raises_value_error_exception(self): + config = { + 'annotation': 'custom', + 'metrics': [{'type': 'accuracy', 'top_k': 1, 'prediction_source': 'prediction1, prediction2'}] + } + with pytest.raises(ConfigError): + MetricsExecutor(config, None) + + def test_accuracy_on_container_with_wrong_annotation_source_name_raise_config_error_exception(self): + annotations = [ContainerAnnotation({'annotation': ClassificationAnnotation('identifier', 3)})] + predictions = [ClassificationPrediction('identifier', [1.0, 1.0, 1.0, 4.0])] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy', 'top_k': 1, 'annotation_source': 'a'}]} + + dispatcher = MetricsExecutor(config, None) + with pytest.raises(ConfigError): + dispatcher.update_metrics_on_batch(annotations, predictions) + + def test_accuracy_with_wrong_annotation_type_raise_config_error_exception(self): + annotations = [DetectionAnnotation('identifier', 3)] + predictions = [ClassificationPrediction('identifier', [1.0, 1.0, 1.0, 4.0])] + config = { + 'annotation': 'mocked', + 'metrics': [{'type': 'accuracy', 'top_k': 1}] + } + + dispatcher = MetricsExecutor(config, None) + 
with pytest.raises(ConfigError): + dispatcher.update_metrics_on_batch(annotations, predictions) + + def test_accuracy_with_unsupported_annotations_in_container_raise_config_error_exception(self): + annotations = [ContainerAnnotation({'annotation': DetectionAnnotation('identifier', 3)})] + predictions = [ClassificationPrediction('identifier', [1.0, 1.0, 1.0, 4.0])] + config = { + 'annotation': 'mocked', + 'metrics': [{'type': 'accuracy', 'top_k': 1}] + } + + dispatcher = MetricsExecutor(config, None) + with pytest.raises(ConfigError): + dispatcher.update_metrics_on_batch(annotations, predictions) + + def test_accuracy_with_unsupported_annotation_type_as_annotation_source_for_container_raises_config_error(self): + annotations = [ContainerAnnotation({'annotation': DetectionAnnotation('identifier', 3)})] + predictions = [ClassificationPrediction('identifier', [1.0, 1.0, 1.0, 4.0])] + config = { + 'annotation': 'mocked', + 'metrics': [{'type': 'accuracy', 'top_k': 1, 'annotation_source': 'annotation'}] + } + + dispatcher = MetricsExecutor(config, None) + with pytest.raises(ConfigError): + dispatcher.update_metrics_on_batch(annotations, predictions) + + def test_accuracy_on_annotation_container_with_several_suitable_representations_config_value_error_exception(self): + annotations = [ContainerAnnotation({ + 'annotation1': ClassificationAnnotation('identifier', 3), + 'annotation2': ClassificationAnnotation('identifier', 3) + })] + predictions = [ClassificationPrediction('identifier', [1.0, 1.0, 1.0, 4.0])] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy', 'top_k': 1}]} + + dispatcher = MetricsExecutor(config, None) + with pytest.raises(ConfigError): + dispatcher.update_metrics_on_batch(annotations, predictions) + + def test_accuracy_with_wrong_prediction_type_raise_config_error_exception(self): + annotations = [ClassificationAnnotation('identifier', 3)] + predictions = [DetectionPrediction('identifier', [1.0, 1.0, 1.0, 4.0])] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy', 'top_k': 1}]} + + dispatcher = MetricsExecutor(config, None) + with pytest.raises(ConfigError): + dispatcher.update_metrics_on_batch(annotations, predictions) + + def test_accuracy_with_unsupported_prediction_in_container_raise_config_error_exception(self): + annotations = [ClassificationAnnotation('identifier', 3)] + predictions = [ContainerPrediction({'prediction': DetectionPrediction('identifier', [1.0, 1.0, 1.0, 4.0])})] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy', 'top_k': 1}]} + + dispatcher = MetricsExecutor(config, None) + with pytest.raises(ConfigError): + dispatcher.update_metrics_on_batch(annotations, predictions) + + def test_accuracy_with_unsupported_prediction_type_as_prediction_source_for_container_raises_config_error(self): + annotations = [ClassificationAnnotation('identifier', 3)] + predictions = [ContainerPrediction({'prediction': DetectionPrediction('identifier', [1.0, 1.0, 1.0, 4.0])})] + config = { + 'annotation': 'mocked', + 'metrics': [{'type': 'accuracy', 'top_k': 1, 'prediction_source': 'prediction'}] + } + + dispatcher = MetricsExecutor(config, None) + with pytest.raises(ConfigError): + dispatcher.update_metrics_on_batch(annotations, predictions) + + def test_accuracy_on_prediction_container_with_several_suitable_representations_raise_config_error_exception(self): + annotations = [ClassificationAnnotation('identifier', 3)] + predictions = [ContainerPrediction({ + 'prediction1': ClassificationPrediction('identifier', [1.0, 1.0, 1.0, 4.0]), 
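+ # two equally suitable predictions in one container are ambiguous without an explicit prediction_source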
+ 'prediction2': ClassificationPrediction('identifier', [1.0, 1.0, 1.0, 4.0]) + })] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy', 'top_k': 1}]} + + dispatcher = MetricsExecutor(config, None) + with pytest.raises(ConfigError): + dispatcher.update_metrics_on_batch(annotations, predictions) + + def test_complete_accuracy(self): + annotations = [ClassificationAnnotation('identifier', 3)] + predictions = [ClassificationPrediction('identifier', [1.0, 1.0, 1.0, 4.0])] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy', 'top_k': 1}]} + + dispatcher = MetricsExecutor(config, None) + dispatcher.update_metrics_on_batch(annotations, predictions) + + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result.name == 'accuracy' + assert evaluation_result.evaluated_value == pytest.approx(1.0) + assert evaluation_result.reference_value is None + assert evaluation_result.threshold is None + + def test_complete_accuracy_with_container_default_sources(self): + annotations = [ContainerAnnotation({'a': ClassificationAnnotation('identifier', 3)})] + predictions = [ContainerPrediction({'p': ClassificationPrediction('identifier', [1.0, 1.0, 1.0, 4.0])})] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy', 'top_k': 1}]} + + dispatcher = MetricsExecutor(config, None) + dispatcher.update_metrics_on_batch(annotations, predictions) + + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result.name == 'accuracy' + assert evaluation_result.evaluated_value == pytest.approx(1.0) + assert evaluation_result.reference_value is None + assert evaluation_result.threshold is None + + def test_complete_accuracy_with_container_sources(self): + annotations = [ContainerAnnotation({'a': ClassificationAnnotation('identifier', 3)})] + predictions = [ContainerPrediction({'p': ClassificationPrediction('identifier', [1.0, 1.0, 1.0, 4.0])})] + config = { + 'annotation': 'mocked', + 'metrics': [{'type': 'accuracy', 'top_k': 1, 'annotation_source': 'a', 'prediction_source': 'p'}] + } + + dispatcher = MetricsExecutor(config, None) + dispatcher.update_metrics_on_batch(annotations, predictions) + + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result.name == 'accuracy' + assert evaluation_result.evaluated_value == pytest.approx(1.0) + assert evaluation_result.reference_value is None + assert evaluation_result.threshold is None + + def test_zero_accuracy(self): + annotation = [ClassificationAnnotation('identifier', 2)] + prediction = [ClassificationPrediction('identifier', [1.0, 1.0, 1.0, 4.0])] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy', 'top_k': 1}]} + + dispatcher = MetricsExecutor(config, None) + + for _, evaluation_result in dispatcher.iterate_metrics([annotation], [prediction]): + assert evaluation_result.name == 'accuracy' + assert evaluation_result.evaluated_value == 0.0 + assert evaluation_result.reference_value is None + assert evaluation_result.threshold is None + + def test_complete_accuracy_top_3(self): + annotations = [ClassificationAnnotation('identifier', 3)] + predictions = [ClassificationPrediction('identifier', [1.0, 3.0, 4.0, 2.0])] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy', 'top_k': 3}]} + + dispatcher = MetricsExecutor(config, None) + dispatcher.update_metrics_on_batch(annotations, predictions) + + for _, evaluation_result in 
dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result.name == 'accuracy' + assert evaluation_result.evaluated_value == pytest.approx(1.0) + assert evaluation_result.reference_value is None + assert evaluation_result.threshold is None + + def test_zero_accuracy_top_3(self): + annotations = [ClassificationAnnotation('identifier', 3)] + predictions = [ClassificationPrediction('identifier', [5.0, 3.0, 4.0, 1.0])] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy', 'top_k': 3}]} + + dispatcher = MetricsExecutor(config, None) + + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result.name == 'accuracy' + assert evaluation_result.evaluated_value == 0.0 + assert evaluation_result.reference_value is None + assert evaluation_result.threshold is None + + def test_reference_is_10_by_config(self): + annotations = [ClassificationAnnotation('identifier', 3)] + predictions = [ClassificationPrediction('identifier', [5.0, 3.0, 4.0, 1.0])] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy', 'top_k': 3, 'reference': 10}]} + + dispatcher = MetricsExecutor(config, None) + + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result.name == 'accuracy' + assert evaluation_result.evaluated_value == 0.0 + assert evaluation_result.reference_value == 10 + assert evaluation_result.threshold is None + + def test_threshold_is_10_by_config(self): + annotations = [ClassificationAnnotation('identifier', 3)] + predictions = [ClassificationPrediction('identifier', [5.0, 3.0, 4.0, 1.0])] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy', 'top_k': 3, 'threshold': 10}]} + + dispatcher = MetricsExecutor(config, None) + + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result.name == 'accuracy' + assert evaluation_result.evaluated_value == 0.0 + assert evaluation_result.reference_value is None + assert evaluation_result.threshold == 10 + + def test_classification_per_class_accuracy_fully_zero_prediction(self): + annotation = ClassificationAnnotation('identifier', 0) + prediction = ClassificationPrediction('identifier', [1.0, 2.0]) + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy_per_class', 'top_k': 1}]} + dataset = DummyDataset(label_map={0: '0', 1: '1'}) + dispatcher = MetricsExecutor(config, dataset) + + dispatcher.update_metrics_on_batch([annotation], [prediction]) + + for _, evaluation_result in dispatcher.iterate_metrics([annotation], [prediction]): + assert evaluation_result.name == 'accuracy_per_class' + assert len(evaluation_result.evaluated_value) == 2 + assert evaluation_result.evaluated_value[0] == pytest.approx(0.0) + assert evaluation_result.evaluated_value[1] == pytest.approx(0.0) + assert evaluation_result.reference_value is None + assert evaluation_result.threshold is None + + def test_classification_per_class_accuracy_partially_zero_prediction(self): + annotation = [ClassificationAnnotation('identifier', 1)] + prediction = [ClassificationPrediction('identifier', [1.0, 2.0])] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy_per_class', 'top_k': 1}]} + dataset = DummyDataset(label_map={0: '0', 1: '1'}) + dispatcher = MetricsExecutor(config, dataset) + + dispatcher.update_metrics_on_batch(annotation, prediction) + + for _, evaluation_result in dispatcher.iterate_metrics(annotation, prediction): + assert evaluation_result.name ==
'accuracy_per_class' + assert len(evaluation_result.evaluated_value) == 2 + assert evaluation_result.evaluated_value[0] == pytest.approx(0.0) + assert evaluation_result.evaluated_value[1] == pytest.approx(1.0) + assert evaluation_result.reference_value is None + assert evaluation_result.threshold is None + + def test_classification_per_class_accuracy_complete_prediction(self): + annotation = [ClassificationAnnotation('identifier_1', 1), ClassificationAnnotation('identifier_2', 0)] + prediction = [ + ClassificationPrediction('identifier_1', [1.0, 2.0]), + ClassificationPrediction('identifier_2', [2.0, 1.0]) + ] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy_per_class', 'top_k': 1}]} + dataset = DummyDataset(label_map={0: '0', 1: '1'}) + dispatcher = MetricsExecutor(config, dataset) + + dispatcher.update_metrics_on_batch(annotation, prediction) + + for _, evaluation_result in dispatcher.iterate_metrics(annotation, prediction): + assert evaluation_result.name == 'accuracy_per_class' + assert len(evaluation_result.evaluated_value) == 2 + assert evaluation_result.evaluated_value[0] == pytest.approx(1.0) + assert evaluation_result.evaluated_value[1] == pytest.approx(1.0) + assert evaluation_result.reference_value is None + assert evaluation_result.threshold is None + + def test_classification_per_class_accuracy_partially_prediction(self): + annotation = [ + ClassificationAnnotation('identifier_1', 1), + ClassificationAnnotation('identifier_2', 0), + ClassificationAnnotation('identifier_3', 0) + ] + prediction = [ + ClassificationPrediction('identifier_1', [1.0, 2.0]), + ClassificationPrediction('identifier_2', [2.0, 1.0]), + ClassificationPrediction('identifier_3', [1.0, 5.0]) + ] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy_per_class', 'top_k': 1}]} + dataset = DummyDataset(label_map={0: '0', 1: '1'}) + dispatcher = MetricsExecutor(config, dataset) + + dispatcher.update_metrics_on_batch(annotation, prediction) + + for _, evaluation_result in dispatcher.iterate_metrics(annotation, prediction): + assert evaluation_result.name == 'accuracy_per_class' + assert len(evaluation_result.evaluated_value) == 2 + assert evaluation_result.evaluated_value[0] == pytest.approx(0.5) + assert evaluation_result.evaluated_value[1] == pytest.approx(1.0) + assert evaluation_result.reference_value is None + assert evaluation_result.threshold is None + + def test_classification_per_class_accuracy_prediction_top3_zero(self): + annotation = [ClassificationAnnotation('identifier_1', 0), ClassificationAnnotation('identifier_2', 1)] + prediction = [ + ClassificationPrediction('identifier_1', [1.0, 2.0, 3.0, 4.0]), + ClassificationPrediction('identifier_2', [2.0, 1.0, 3.0, 4.0]) + ] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy_per_class', 'top_k': 3}]} + dataset = DummyDataset(label_map={0: '0', 1: '1', 2: '2', 3: '3'}) + dispatcher = MetricsExecutor(config, dataset) + + dispatcher.update_metrics_on_batch(annotation, prediction) + + for _, evaluation_result in dispatcher.iterate_metrics(annotation, prediction): + assert evaluation_result.name == 'accuracy_per_class' + assert len(evaluation_result.evaluated_value) == 4 + assert evaluation_result.evaluated_value[0] == pytest.approx(0.0) + assert evaluation_result.evaluated_value[1] == pytest.approx(0.0) + assert evaluation_result.evaluated_value[2] == pytest.approx(0.0) + assert evaluation_result.evaluated_value[3] == pytest.approx(0.0) + assert evaluation_result.reference_value is None + assert 
evaluation_result.threshold is None + + def test_classification_per_class_accuracy_prediction_top3(self): + annotation = [ClassificationAnnotation('identifier_1', 1), ClassificationAnnotation('identifier_2', 1)] + prediction = [ + ClassificationPrediction('identifier_1', [1.0, 2.0, 3.0, 4.0]), + ClassificationPrediction('identifier_2', [2.0, 1.0, 3.0, 4.0]) + ] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy_per_class', 'top_k': 3}]} + dataset = DummyDataset(label_map={0: '0', 1: '1', 2: '2', 3: '3'}) + dispatcher = MetricsExecutor(config, dataset) + + dispatcher.update_metrics_on_batch(annotation, prediction) + + for _, evaluation_result in dispatcher.iterate_metrics(annotation, prediction): + assert evaluation_result.name == 'accuracy_per_class' + assert len(evaluation_result.evaluated_value) == 4 + assert evaluation_result.evaluated_value[0] == pytest.approx(0.0) + assert evaluation_result.evaluated_value[1] == pytest.approx(0.5) + assert evaluation_result.evaluated_value[2] == pytest.approx(0.0) + assert evaluation_result.evaluated_value[3] == pytest.approx(0.0) + assert evaluation_result.reference_value is None + assert evaluation_result.threshold is None + + +class TestMetricExtraArgs: + def test_all_metrics_raise_config_error_on_extra_args(self): + for provider in Metric.providers: + adapter_config = {'type': provider, 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide(provider, adapter_config, None) + + def test_detection_recall_raise_config_error_on_extra_args(self): + adapter_config = {'type': 'recall', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('recall', adapter_config, None) + + def test_detection_miss_rate_raise_config_error_on_extra_args(self): + adapter_config = {'type': 'miss_rate', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('miss_rate', adapter_config, None) + + def test_accuracy_raise_config_error_on_extra_args(self): + adapter_config = {'type': 'accuracy', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('accuracy', adapter_config, None) + + def test_per_class_accuracy_raise_config_error_on_extra_args(self): + adapter_config = {'type': 'accuracy_per_class', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('accuracy_per_class', adapter_config, None) + + def test_character_recognition_accuracy_raise_config_error_on_extra_args(self): + adapter_config = {'type': 'character_recognition_accuracy', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('character_recognition_accuracy', adapter_config, None) + + def test_multi_accuracy_raise_config_error_on_extra_args(self): + metric_config = {'type': 'multi_accuracy', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('multi_accuracy', metric_config, None) + + def test_multi_precision_raise_config_error_on_extra_args(self): + metric_config = {'type': 'multi_precision', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('multi_precision', metric_config, None) + + def test_f1_score_raise_config_error_on_extra_args(self): + metric_config = {'type': 'f1-score', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('f1-score', metric_config, None) + + def test_mae_raise_config_error_on_extra_args(self): + metric_config = {'type': 'mae', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('mae', metric_config, None) + 
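+ # As with the provider sweep in test_all_metrics_raise_config_error_on_extra_args, + # each metric type exercised below must reject unrecognized config keys with ConfigError.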
+ def test_mse_raise_config_error_on_extra_args(self): + metric_config = {'type': 'mse', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('mse', metric_config, None) + + def test_rmse_raise_config_error_on_extra_args(self): + metric_config = {'type': 'rmse', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('rmse', metric_config, None) + + def test_mae_on_interval_raise_config_error_on_extra_args(self): + metric_config = {'type': 'mae_on_interval', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('mae_on_interval', metric_config, None) + + def test_mse_on_interval_raise_config_error_on_extra_args(self): + metric_config = {'type': 'mse_on_interval', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('mse_on_interval', metric_config, None) + + def test_rmse_on_interval_raise_config_error_on_extra_args(self): + metric_config = {'type': 'rmse_on_interval', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('rmse_on_interval', metric_config, None) + + def test_per_point_normed_error_raise_config_error_on_extra_args(self): + metric_config = {'type': 'per_point_normed_error', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('per_point_normed_error', metric_config, None) + + def test_average_point_error_raise_config_error_on_extra_args(self): + metric_config = {'type': 'normed_error', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('normed_error', metric_config, None) + + def test_reid_cmc_raise_config_error_on_extra_args(self): + metric_config = {'type': 'cmc', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('cmc', metric_config, None) + + def test_reid_map_raise_config_error_on_extra_args(self): + adapter_config = {'type': 'reid_map', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('reid_map', adapter_config, None) + + def test_pairwise_accuracy_raise_config_error_on_extra_args(self): + metric_config = {'type': 'pairwise_accuracy', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('pairwise_accuracy', metric_config, None) + + def test_segmentation_accuracy_raise_config_error_on_extra_args(self): + metric_config = {'type': 'segmentation_accuracy', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('segmentation_accuracy', metric_config, None) + + def test_mean_iou_raise_config_error_on_extra_args(self): + metric_config = {'type': 'mean_iou', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('mean_iou', metric_config, None) + + def test_mean_accuracy_raise_config_error_on_extra_args(self): + metric_config = {'type': 'mean_accuracy', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('mean_accuracy', metric_config, None) + + def test_frequency_weighted_accuracy_raise_config_error_on_extra_args(self): + metric_config = {'type': 'frequency_weighted_accuracy', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + Metric.provide('frequency_weighted_accuracy', metric_config, None) diff --git a/tools/accuracy_checker/tests/test_model_conversion.py b/tools/accuracy_checker/tests/test_model_conversion.py new file mode 100644 index 0000000..a5a8c77 --- /dev/null +++ b/tools/accuracy_checker/tests/test_model_conversion.py @@ -0,0 +1,80 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache 
License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import sys +import pytest + +from accuracy_checker.launcher.model_conversion import (exec_mo_binary, find_dlsdk_ir, find_mo, prepare_args) +from tests.common import mock_filesystem + + +def test_mock_file_system(): + with mock_filesystem(['foo/bar', 'foo/baz/']) as prefix: + assert (prefix / 'foo' / 'bar').is_file() + assert (prefix / 'foo' / 'baz').is_dir() + + +def test_find_mo(): + with mock_filesystem(['deployment_tools/model_optimizer/mo.py']) as prefix: + assert find_mo([prefix / 'deployment_tools' / 'model_optimizer']) + + +def test_find_mo_is_none_when_not_exist(): + with mock_filesystem(['deployment_tools/model_optimizer/mo.py']) as prefix: + assert find_mo([prefix / 'deployment_tools']) is None + + +def test_find_mo_list_not_corrupted(): + with mock_filesystem(['deployment_tools/model_optimizer/mo.py']) as prefix: + search_paths = [prefix] + find_mo(search_paths) + assert len(search_paths) == 1 + + +def test_find_ir__in_root(): + with mock_filesystem(['model.xml', 'model.bin']) as root: + model, weights = find_dlsdk_ir(root, 'model') + assert model == root / 'model.xml' + assert weights == root / 'model.bin' + + +def test_find_ir_raises_file_not_found_error_when_ir_not_found(): + with mock_filesystem(['foo/']) as root: + with pytest.raises(FileNotFoundError): + find_dlsdk_ir(root, 'model') + + +def test_prepare_args(): + args = prepare_args('foo', ['a', 'b'], {'bar': 123, 'x': 'baz'}) + assert args[0] == sys.executable + assert args[1] == 'foo' + assert '--a' in args + assert '--b' in args + assert '--bar' in args + assert '--x' in args + + assert args[args.index('--bar') + 1] == '123' + assert args[args.index('--x') + 1] == 'baz' + + +def test_exec_mo_binary(mocker): + subprocess_run = mocker.patch('subprocess.run') + mocker.patch('os.chdir') + + args = prepare_args('ModelOptimizer', value_options={'--foo': 'bar'}) + exec_mo_binary(args) + + subprocess_run.assert_called_once_with(args, check=False, timeout=None) diff --git a/tools/accuracy_checker/tests/test_model_evaluator.py b/tools/accuracy_checker/tests/test_model_evaluator.py new file mode 100644 index 0000000..eeb9a52 --- /dev/null +++ b/tools/accuracy_checker/tests/test_model_evaluator.py @@ -0,0 +1,143 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from unittest.mock import Mock, MagicMock + +from accuracy_checker.model_evaluator import ModelEvaluator + + +class TestModelEvaluator: + def setup_method(self): + self.launcher = Mock() + self.launcher.predict.return_value = [] + + self.preprocessor = Mock() + self.postprocessor = Mock() + + annotation_0 = Mock() + annotation_0.identifier = 0 + annotation_1 = Mock() + annotation_1.identifier = 1 + annotation_container_0 = Mock() + annotation_container_0.values = Mock(return_value=[annotation_0]) + annotation_container_1 = Mock() + annotation_container_1.values = Mock(return_value=([annotation_1])) + self.annotations = [ + ([annotation_container_0], [annotation_container_0]), + ([annotation_container_1], [annotation_container_1]) + ] + + self.dataset = MagicMock() + self.dataset.__iter__.return_value = self.annotations + + self.postprocessor.process_batch = Mock(side_effect=[ + ([annotation_container_0], [annotation_container_0]), ([annotation_container_1], [annotation_container_1]) + ]) + self.postprocessor.process_dataset = Mock(return_value=( + ([annotation_container_0], [annotation_container_0]), ([annotation_container_1], [annotation_container_1]) + )) + self.postprocessor.full_process = Mock(return_value=( + ([annotation_container_0], [annotation_container_0]), ([annotation_container_1], [annotation_container_1]) + )) + + self.metric = Mock() + self.metric.update_metrics_on_batch = Mock() + + self.evaluator = ModelEvaluator(self.launcher, self.preprocessor, self.postprocessor, self.dataset, self.metric) + self.evaluator.store_predictions = Mock() + self.evaluator.load = Mock(return_value=( + ([annotation_container_0], [annotation_container_0]), ([annotation_container_1], [annotation_container_1]) + )) + + def test_process_dataset_without_storing_predictions_and_dataset_processors(self): + self.postprocessor.has_dataset_processors = False + + self.evaluator.process_dataset(None, None) + + assert not self.evaluator.store_predictions.called + assert not self.evaluator.load.called + assert self.launcher.predict.called + assert self.postprocessor.process_batch.called + assert self.metric.update_metrics_on_batch.call_count == len(self.annotations) + assert self.postprocessor.process_dataset.called + assert not self.postprocessor.full_process.called + + def test_process_dataset_without_storing_predictions_and_with_dataset_processors(self): + self.postprocessor.has_dataset_processors = True + + self.evaluator.process_dataset(None, None) + + assert not self.evaluator.store_predictions.called + assert not self.evaluator.load.called + assert self.launcher.predict.called + assert self.postprocessor.process_batch.called + assert self.metric.update_metrics_on_batch.call_count == 1 + assert self.postprocessor.process_dataset.called + assert not self.postprocessor.full_process.called + + def test_process_dataset_with_storing_predictions_and_without_dataset_processors(self): + self.postprocessor.has_dataset_processors = False + + self.evaluator.process_dataset('path', None) + + assert self.evaluator.store_predictions.called + assert not self.evaluator.load.called + assert self.launcher.predict.called + assert self.postprocessor.process_batch.called + assert self.metric.update_metrics_on_batch.call_count == len(self.annotations) + assert self.postprocessor.process_dataset.called + assert not self.postprocessor.full_process.called + + def test_process_dataset_with_storing_predictions_and_with_dataset_processors(self): + self.postprocessor.has_dataset_processors = True + + 
self.evaluator.process_dataset('path', None) + + assert self.evaluator.store_predictions.called + assert not self.evaluator.load.called + assert self.launcher.predict.called + assert self.postprocessor.process_batch.called + assert self.metric.update_metrics_on_batch.call_count == 1 + assert self.postprocessor.process_dataset.called + assert not self.postprocessor.full_process.called + + def test_process_dataset_with_loading_predictions_and_without_dataset_processors(self, mocker): + mocker.patch('accuracy_checker.model_evaluator.get_path') + self.postprocessor.has_dataset_processors = False + + self.evaluator.process_dataset('path', None) + + assert not self.evaluator.store_predictions.called + assert self.evaluator.load.called + assert not self.launcher.predict.called + assert not self.postprocessor.process_batch.called + assert self.metric.update_metrics_on_batch.call_count == 1 + assert not self.postprocessor.process_dataset.called + assert self.postprocessor.full_process.called + + def test_process_dataset_with_loading_predictions_and_with_dataset_processors(self, mocker): + mocker.patch('accuracy_checker.model_evaluator.get_path') + self.postprocessor.has_dataset_processors = True + + self.evaluator.process_dataset('path', None) + + assert not self.evaluator.store_predictions.called + assert self.evaluator.load.called + assert not self.launcher.predict.called + assert not self.postprocessor.process_batch.called + assert self.metric.update_metrics_on_batch.call_count == 1 + assert not self.postprocessor.process_dataset.called + assert self.postprocessor.full_process.called diff --git a/tools/accuracy_checker/tests/test_postprocessor.py b/tools/accuracy_checker/tests/test_postprocessor.py new file mode 100644 index 0000000..81c14c3 --- /dev/null +++ b/tools/accuracy_checker/tests/test_postprocessor.py @@ -0,0 +1,1070 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import numpy as np +import pytest + +from accuracy_checker.config import ConfigError +from accuracy_checker.postprocessor import PostprocessingExecutor + +from accuracy_checker.representation import ( + DetectionAnnotation, + DetectionPrediction, + ContainerAnnotation, + ContainerPrediction, + ClassificationAnnotation +) + +from .common import make_representation, make_segmentation_representation + + +def postprocess_data(executor, annotations, predictions): + return executor.full_process(annotations, predictions) + + +class TestPostprocessor: + def test_without_apply_to_and_sources_filter_raise_config_error_exception(self): + config = [{'type': 'filter', 'labels': [1]}] + + with pytest.raises(ConfigError): + PostprocessingExecutor(config) + + def test_both_provided_apply_to_and_sources_filter_raise_config_error_exception(self): + config = [{ + 'type': 'filter', + 'apply_to': 'prediction', + 'annotation_source': 'annotation', + 'labels': [1] + }] + + with pytest.raises(ConfigError): + PostprocessingExecutor(config) + + def test_filter_annotations_unsupported_source_type_in_container_raise_type_error_exception(self): + config = [{'type': 'filter', 'annotation_source': 'annotation', 'labels': [1]}] + annotation = ContainerAnnotation({'annotation': ClassificationAnnotation()}) + executor = PostprocessingExecutor(config) + + with pytest.raises(TypeError): + postprocess_data(executor, [annotation], [None]) + + def test_filter_annotations_source_not_found_raise_config_error_exception(self): + config = [{'type': 'filter', 'annotation_source': 'ann', 'labels': [1]}] + annotation = ContainerAnnotation({ + 'annotation': make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0] + }) + executor = PostprocessingExecutor(config) + + with pytest.raises(ConfigError): + postprocess_data(executor, [annotation], [None]) + + def test_filter_predictions_unsupported_source_type_raise_type_error_exception(self): + config = [{ + 'type': 'filter', + 'prediction_source': 'detection_out', + 'labels': [1], + 'remove_filtered': False + }] + prediction = ContainerPrediction({'detection_out': ClassificationAnnotation()}) + executor = PostprocessingExecutor(config) + + with pytest.raises(TypeError): + postprocess_data(executor, [None], [prediction]) + + def test_filter_predictions_source_not_found_raise_config_error_exception(self): + config = [{ + 'type': 'filter', 'prediction_source': 'undefined', 'labels': [1] + }] + prediction = ContainerPrediction({'detection_out': make_representation('0 0 0 10 10; 1 0 0 11 11', score=1)[0]}) + executor = PostprocessingExecutor(config) + + with pytest.raises(ConfigError): + postprocess_data(executor, [None], [prediction]) + + def test_filter_container_annotations_by_labels_with_ignore_using_source(self): + config = [{ + 'type': 'filter', 'annotation_source': 'annotation', 'labels': [1], 'remove_filtered': False + }] + annotation = ContainerAnnotation({ + 'annotation': make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0] + }) + expected = ContainerAnnotation({ + 'annotation': make_representation( + '0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True, meta=[{'difficult_boxes': [1]}] + )[0] + }) + + postprocess_data(PostprocessingExecutor(config), [annotation], [None]) + + assert annotation == expected + + def test_filter_container_annotations_by_labels_with_ignore_using_apply_to(self): + config = [{ + 'type': 'filter', + 'apply_to': 'annotation', + 'labels': [1], + 'remove_filtered': False + }] + annotation = ContainerAnnotation({ + 
'annotation': make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0] + }) + expected = ContainerAnnotation({ + 'annotation': make_representation( + '0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True, meta=[{'difficult_boxes': [1]}] + )[0] + }) + + postprocess_data(PostprocessingExecutor(config), [annotation], [None]) + + assert annotation == expected + + def test_filter_regular_annotations_by_labels_with_ignore(self): + config = [{'type': 'filter', 'apply_to': 'annotation', 'labels': [1], 'remove_filtered': False}] + annotation = make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0] + expected = make_representation( + '0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True, meta=[{'difficult_boxes': [1]}] + )[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [None]) + + assert annotation == expected + + def test_filter_multi_source_annotations_by_labels_with_ignore(self): + config = [{ + 'type': 'filter', + 'annotation_source': ['annotation1', 'annotation2'], + 'labels': [1], + 'remove_filtered': False + }] + annotation = ContainerAnnotation({ + 'annotation1': make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0], + 'annotation2': make_representation('1 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0] + }) + expected = ContainerAnnotation({ + 'annotation1': make_representation( + '0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True, meta=[{'difficult_boxes': [1]}] + )[0], + 'annotation2': make_representation( + '1 0 0 10 10; 1 0 0 11 11', is_ground_truth=True, meta=[{'difficult_boxes': [0, 1]}] + )[0] + }) + + postprocess_data(PostprocessingExecutor(config), [annotation], [None]) + + assert annotation == expected + + def test_filter_multi_source_annotations_by_labels_with_ignore_using_apply_to(self): + config = [{ + 'type': 'filter', + 'apply_to': 'annotation', + 'labels': [1], + 'remove_filtered': False + }] + annotation = ContainerAnnotation({ + 'annotation1': make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0], + 'annotation2': make_representation('1 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0] + }) + expected = ContainerAnnotation({ + 'annotation1': make_representation( + '0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True, meta=[{'difficult_boxes': [1]}] + )[0], + 'annotation2': make_representation( + '1 0 0 10 10; 1 0 0 11 11', is_ground_truth=True, meta=[{'difficult_boxes': [0, 1]}] + )[0] + }) + + postprocess_data(PostprocessingExecutor(config), [annotation], [None]) + + assert annotation == expected + + def test_filter_regular_annotations_by_labels_with_remove_using_annotation_source_warns_user_warning(self): + config = [{ + 'type': 'filter', + 'annotation_source': 'annotation', + 'labels': [1], + 'remove_filtered': True + }] + annotation = make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0] + expected = make_representation('0 0 0 10 10', is_ground_truth=True)[0] + + with pytest.warns(UserWarning): + postprocess_data(PostprocessingExecutor(config), [annotation], [None]) + + assert annotation == expected + + def test_filter_regular_annotations_by_labels_with_remove_using_apply_to(self): + config = [{'type': 'filter', 'apply_to': 'annotation', 'labels': [1], 'remove_filtered': True}] + annotation = make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0] + expected = make_representation('0 0 0 10 10', is_ground_truth=True)[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [None]) + + assert annotation == expected + + def 
test_filter_annotations_by_labels_with_remove_on_container(self): + config = [{ + 'type': 'filter', + 'annotation_source': 'annotation', + 'labels': [1], + 'remove_filtered': True + }] + annotation = ContainerAnnotation({ + 'annotation': make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0] + }) + expected = ContainerAnnotation({ + 'annotation': make_representation('0 0 0 10 10', is_ground_truth=True)[0] + }) + + postprocess_data(PostprocessingExecutor(config), [annotation], [None]) + + assert annotation == expected + + def test_filter_annotations_by_labels_with_remove_on_container_using_apply_to(self): + config = [{'type': 'filter', 'apply_to': 'annotation', 'labels': [1], 'remove_filtered': True}] + annotation = ContainerAnnotation({ + 'annotation': make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0] + }) + expected = ContainerAnnotation({ + 'annotation': make_representation('0 0 0 10 10', is_ground_truth=True)[0] + }) + + postprocess_data(PostprocessingExecutor(config), [annotation], [None]) + + assert annotation == expected + + def test_filter_multi_source_annotations_by_labels_with_remove(self): + config = [{ + 'type': 'filter', + 'annotation_source': ['annotation1', 'annotation2'], + 'labels': [1], 'remove_filtered': True + }] + annotation = ContainerAnnotation({ + 'annotation1': make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0], + 'annotation2': make_representation('0 0 0 10 10', is_ground_truth=True)[0] + }) + expected = ContainerAnnotation({ + 'annotation1': make_representation('0 0 0 10 10', is_ground_truth=True)[0], + 'annotation2': make_representation('0 0 0 10 10', is_ground_truth=True)[0] + }) + + postprocess_data(PostprocessingExecutor(config), [annotation], [None]) + + assert annotation == expected + + def test_filter_multi_source_by_labels_with_remove_on_container_using_apply_to(self): + config = [{'type': 'filter', 'apply_to': 'annotation', 'labels': [1], 'remove_filtered': True}] + annotation = ContainerAnnotation({ + 'annotation1': make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0], + 'annotation2': make_representation('0 0 0 10 10', is_ground_truth=True)[0] + }) + expected = ContainerAnnotation({ + 'annotation1': make_representation('0 0 0 10 10', is_ground_truth=True)[0], + 'annotation2': make_representation('0 0 0 10 10', is_ground_truth=True)[0] + }) + + postprocess_data(PostprocessingExecutor(config), [annotation], [None]) + + assert annotation == expected + + def test_filter_predictions_by_labels_with_ignore(self): + config = [{'type': 'filter', 'apply_to': 'prediction', 'labels': ['to_be_filtered'], 'remove_filtered': False}] + prediction = DetectionPrediction(labels=['some_label', 'to_be_filtered']) + expected = DetectionPrediction(labels=['some_label', 'to_be_filtered'], metadata={'difficult_boxes': [1]}) + + postprocess_data(PostprocessingExecutor(config), [None], [prediction]) + + assert prediction == expected + + def test_filter_predictions_by_labels_with_ignore_on_container(self): + config = [{ + 'type': 'filter', + 'prediction_source': 'detection_out', + 'labels': [1], + 'remove_filtered': False + }] + prediction = ContainerPrediction({ + 'detection_out': make_representation('0 0 0 10 10; 1 0 0 11 11', score=1)[0] + }) + expected = ContainerPrediction({'detection_out': make_representation( + '0 0 0 10 10; 1 0 0 11 11', score=1, meta=[{'difficult_boxes': [1]}] + )[0]}) + + postprocess_data(PostprocessingExecutor(config), [None], [prediction]) + + assert prediction == 
expected + + def test_filter_predictions_by_labels_with_ignore_on_container_using_apply_to(self): + config = [{'type': 'filter', 'apply_to': 'prediction', 'labels': [1], 'remove_filtered': False}] + prediction = ContainerPrediction({ + 'detection_out': make_representation('0 0 0 10 10; 1 0 0 11 11', score=1)[0] + }) + expected = ContainerPrediction({'detection_out': make_representation( + '0 0 0 10 10; 1 0 0 11 11', score=1, meta=[{'difficult_boxes': [1]}] + )[0]}) + + postprocess_data(PostprocessingExecutor(config), [None], [prediction]) + + assert prediction == expected + + def test_filter_multi_source_predictions_by_labels_with_ignore(self): + config = [{ + 'type': 'filter', 'prediction_source': ['detection_out1', 'detection_out2'], 'labels': [1], + 'remove_filtered': False + }] + prediction = ContainerPrediction({ + 'detection_out1': make_representation('0 0 0 10 10; 1 0 0 11 11', score=1)[0], + 'detection_out2': make_representation('0 0 0 10 10; 1 0 0 11 11', score=1)[0] + }) + expected = ContainerPrediction({ + 'detection_out1': make_representation( + '0 0 0 10 10; 1 0 0 11 11', score=1, meta=[{'difficult_boxes': [1]}] + )[0], + 'detection_out2': make_representation( + '0 0 0 10 10; 1 0 0 11 11', score=1, meta=[{'difficult_boxes': [1]}] + )[0] + }) + + postprocess_data(PostprocessingExecutor(config), [None], [prediction]) + + assert prediction == expected + + def test_filter_multi_source_predictions_by_labels_with_ignore_using_apply_to(self): + config = [{ + 'type': 'filter', 'apply_to': 'prediction', 'labels': [1], 'remove_filtered': False + }] + prediction = ContainerPrediction({ + 'detection_out1': make_representation('0 0 0 10 10; 1 0 0 11 11', score=1)[0], + 'detection_out2': make_representation('1 0 0 10 10; 1 0 0 11 11', score=1)[0] + }) + expected = ContainerPrediction({ + 'detection_out1': make_representation( + '0 0 0 10 10; 1 0 0 11 11', score=1, meta=[{'difficult_boxes': [1]}] + )[0], + 'detection_out2': make_representation( + '1 0 0 10 10; 1 0 0 11 11', score=1, meta=[{'difficult_boxes': [0, 1]}] + )[0] + }) + + postprocess_data(PostprocessingExecutor(config), [None], [prediction]) + + assert prediction == expected + + def test_filter_predictions_by_labels_with_remove(self): + config = [{'type': 'filter', 'apply_to': 'prediction', 'labels': [1], 'remove_filtered': True}] + prediction = make_representation('0 0 0 10 10; 1 0 0 11 11', score=1) + expected = make_representation('0 0 0 10 10', score=1) + + postprocess_data(PostprocessingExecutor(config), [None], prediction) + + assert prediction == expected + + def test_filter_predictions_by_labels_with_remove_on_container(self): + config = [{ + 'type': 'filter', 'prediction_source': 'detection_out', 'labels': [0], 'remove_filtered': True + }] + prediction = ContainerPrediction({ + 'detection_out': make_representation('0 0 0 10 10; 1 0 0 11 11', score=1)[0] + }) + expected = ContainerPrediction({'detection_out': make_representation('1 0 0 11 11', score=1)[0]}) + + postprocess_data(PostprocessingExecutor(config), [None], [prediction]) + + assert prediction == expected + + def test_filter_predictions_by_labels_with_remove_on_container_using_apply_to(self): + config = [{'type': 'filter', 'apply_to': 'prediction', 'labels': [0], 'remove_filtered': True}] + prediction = ContainerPrediction({ + 'detection_out': make_representation('0 0 0 10 10; 1 0 0 11 11', score=1)[0] + }) + expected = ContainerPrediction({'detection_out': make_representation('1 0 0 11 11', score=1)[0]}) + + postprocess_data(PostprocessingExecutor(config), 
[None], [prediction]) + + assert prediction == expected + + def test_filter_multi_source_predictions_by_labels_with_remove(self): + config = [{ + 'type': 'filter', + 'prediction_source': ['detection_out1', 'detection_out2'], + 'labels': [1], + 'remove_filtered': True + }] + prediction = ContainerPrediction({ + 'detection_out1': make_representation('0 0 0 10 10; 1 0 0 11 11', score=1)[0], + 'detection_out2': make_representation('0 0 0 10 10', score=1)[0] + }) + expected = ContainerPrediction({ + 'detection_out1': make_representation('0 0 0 10 10', score=1)[0], + 'detection_out2': make_representation('0 0 0 10 10', score=1)[0] + }) + + postprocess_data(PostprocessingExecutor(config), [None], [prediction]) + + assert prediction == expected + + def test_filter_multi_source_predictions_by_labels_with_remove_using_apply_to(self): + config = [{'type': 'filter', 'apply_to': 'prediction', 'labels': [1], 'remove_filtered': True}] + prediction = ContainerPrediction({ + 'detection_out1': make_representation('0 0 0 10 10; 1 0 0 11 11', score=1)[0], + 'detection_out2': make_representation('0 0 0 10 10', score=1)[0] + }) + expected = ContainerPrediction({ + 'detection_out1': make_representation('0 0 0 10 10', score=1)[0], + 'detection_out2': make_representation('0 0 0 10 10', score=1)[0] + }) + + postprocess_data(PostprocessingExecutor(config), [None], [prediction]) + + assert prediction == expected + + def test_filter_regular_annotations_and_regular_predictions_by_labels_with_ignore_using_apply_to(self): + config = [{'type': 'filter', 'apply_to': 'all', 'labels': [1], 'remove_filtered': False}] + prediction = make_representation('0 0 0 10 10; 1 0 0 11 11', score=1)[0] + expected_prediction = make_representation( + '0 0 0 10 10; 1 0 0 11 11', score=1, meta=[{'difficult_boxes': [1]}] + )[0] + annotation = make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0] + expected_annotation = make_representation( + '0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True, meta=[{'difficult_boxes': [1]}] + )[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [prediction]) + + assert prediction == expected_prediction and annotation == expected_annotation + + def test_filter_regular_annotations_and_regular_predictions_by_labels_with_remove_using_apply_to(self): + config = [{'type': 'filter', 'apply_to': 'all', 'labels': [1], 'remove_filtered': True}] + prediction = make_representation('0 0 0 10 10; 1 0 0 11 11', score=1) + expected_prediction = make_representation('0 0 0 10 10', score=1) + annotation = make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True) + expected_annotation = make_representation('0 0 0 10 10', is_ground_truth=True) + + postprocess_data(PostprocessingExecutor(config), annotation, prediction) + + assert prediction == expected_prediction and annotation == expected_annotation + + def test_filter_container_annotations_and_regular_predictions_by_labels_with_ignore_using_apply_to(self): + config = [{'type': 'filter', 'apply_to': 'all', 'labels': [1], 'remove_filtered': False}] + prediction = make_representation('0 0 0 10 10; 1 0 0 11 11', score=1)[0] + expected_prediction = make_representation( + '0 0 0 10 10; 1 0 0 11 11', score=1, meta=[{'difficult_boxes': [1]}] + )[0] + annotation = ContainerAnnotation({ + 'annotation': make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0] + }) + expected_annotation = ContainerAnnotation({ + 'annotation': make_representation( + '0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True, meta=[{'difficult_boxes': [1]}] + )[0] + }) + + postprocess_data(PostprocessingExecutor(config), [annotation], 
[prediction]) + + assert prediction == expected_prediction and annotation == expected_annotation + + def test_filter_container_annotations_and_regular_predictions_by_labels_with_remove_using_apply_to(self): + config = [{'type': 'filter', 'apply_to': 'all', 'labels': [1], 'remove_filtered': True}] + prediction = make_representation('0 0 0 10 10; 1 0 0 11 11', score=1)[0] + expected_prediction = make_representation('0 0 0 10 10', score=1)[0] + annotation = ContainerAnnotation({ + 'annotation': make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0] + }) + expected_annotation = ContainerAnnotation({ + 'annotation': make_representation('0 0 0 10 10', is_ground_truth=True)[0] + }) + + postprocess_data(PostprocessingExecutor(config), [annotation], [prediction]) + + assert prediction == expected_prediction and annotation == expected_annotation + + def test_filter_regular_annotations_and_container_predictions_by_labels_with_ignore_using_apply_to(self): + config = [{'type': 'filter', 'apply_to': 'all', 'labels': [1], 'remove_filtered': False}] + prediction = ContainerPrediction({ + 'detection_out': make_representation('0 0 0 10 10; 1 0 0 11 11', score=1)[0] + }) + expected_prediction = ContainerPrediction({ + 'detection_out': make_representation( + '0 0 0 10 10; 1 0 0 11 11', score=1, meta=[{'difficult_boxes': [1]}] + )[0] + }) + annotation = make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0] + expected_annotation = make_representation( + '0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True, meta=[{'difficult_boxes': [1]}] + )[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [prediction]) + + assert prediction == expected_prediction and annotation == expected_annotation + + def test_filter_regular_annotations_and_container_predictions_by_labels_with_remove_using_apply_to(self): + config = [{'type': 'filter', 'apply_to': 'all', 'labels': [1], 'remove_filtered': True}] + prediction = ContainerPrediction({ + 'detection_out': make_representation('0 0 0 10 10; 1 0 0 11 11', score=1)[0] + }) + expected_prediction = ContainerPrediction({'detection_out': make_representation('0 0 0 10 10', score=1)[0]}) + annotation = make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0] + expected_annotation = make_representation('0 0 0 10 10', is_ground_truth=True)[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [prediction]) + + assert prediction == expected_prediction and annotation == expected_annotation + + def test_filter_container_annotations_and_container_predictions_by_labels_with_ignore_using_apply_to(self): + config = [{'type': 'filter', 'apply_to': 'all', 'labels': [1], 'remove_filtered': False}] + prediction = ContainerPrediction({ + 'detection_out': make_representation('0 0 0 10 10; 1 0 0 11 11', score=1)[0] + }) + expected_prediction = ContainerPrediction({ + 'detection_out': make_representation( + '0 0 0 10 10; 1 0 0 11 11', score=1, meta=[{'difficult_boxes': [1]}] + )[0] + }) + annotation = ContainerAnnotation({ + 'annotation': make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0] + }) + expected_annotation = ContainerAnnotation({ + 'annotation': make_representation( + '0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True, meta=[{'difficult_boxes': [1]}] + )[0] + }) + + postprocess_data(PostprocessingExecutor(config), [annotation], [prediction]) + + assert prediction == expected_prediction and annotation == expected_annotation + + def test_filter_container_annotations_and_container_predictions_by_labels_with_remove_using_apply_to(self): + config = [{'type': 'filter', 'apply_to': 
'all', 'labels': [1], 'remove_filtered': True}] + prediction = ContainerPrediction({ + 'prediction': make_representation('0 0 0 10 10; 1 0 0 11 11', score=1)[0] + }) + expected_prediction = ContainerPrediction({'prediction': make_representation('0 0 0 10 10', score=1)[0]}) + annotation = ContainerAnnotation({ + 'annotation': make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0] + }) + expected_annotation = ContainerAnnotation({ + 'annotation': make_representation('0 0 0 10 10', is_ground_truth=True)[0] + }) + + postprocess_data(PostprocessingExecutor(config), [annotation], [prediction]) + + assert prediction == expected_prediction and annotation == expected_annotation + + def test_filter_container_annotations_and_container_predictions_by_labels_with_ignore_using_sources(self): + config = [{'type': 'filter', 'annotation_source': 'annotation', 'prediction_source': 'prediction', + 'labels': [1], 'remove_filtered': False}] + prediction = ContainerPrediction({'prediction': make_representation('0 0 0 10 10; 1 0 0 11 11', score=1)[0]}) + expected_prediction = ContainerPrediction({ + 'prediction': make_representation('0 0 0 10 10; 1 0 0 11 11', score=1, meta=[{'difficult_boxes': [1]}])[0] + }) + annotation = ContainerAnnotation({ + 'annotation': make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0] + }) + expected_annotation = ContainerAnnotation({ + 'annotation': make_representation( + '0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True, meta=[{'difficult_boxes': [1]}] + )[0] + }) + + postprocess_data(PostprocessingExecutor(config), [annotation], [prediction]) + + assert prediction == expected_prediction and annotation == expected_annotation + + def test_filter_container_annotations_and_container_predictions_by_labels_with_remove_using_sources(self): + config = [{'type': 'filter', 'annotation_source': 'annotation', 'prediction_source': 'prediction', + 'labels': [1], 'remove_filtered': True}] + prediction = ContainerPrediction({'prediction': make_representation('0 0 0 10 10; 1 0 0 11 11', score=1)[0]}) + expected_prediction = ContainerPrediction({'prediction': make_representation('0 0 0 10 10', score=1)[0]}) + annotation = ContainerAnnotation( + {'annotation': make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True)[0]}) + expected_annotation = ContainerAnnotation( + {'annotation': make_representation('0 0 0 10 10', is_ground_truth=True)[0]}) + + postprocess_data(PostprocessingExecutor(config), [annotation], [prediction]) + + assert prediction == expected_prediction and annotation == expected_annotation + + def test_filter_annotations_by_min_confidence_do_nothing(self): + config = [{'type': 'filter', 'apply_to': 'annotation', 'min_confidence': 0.5, 'remove_filtered': True}] + annotations = make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True) + expected_annotations = make_representation('0 0 0 10 10; 1 0 0 11 11', is_ground_truth=True) + + postprocess_data(PostprocessingExecutor(config), annotations, [None]) + + assert np.array_equal(annotations, expected_annotations) + + def test_filter_predictions_by_min_confidence_with_ignore(self): + config = [{'type': 'filter', 'apply_to': 'prediction', 'min_confidence': 0.5, 'remove_filtered': False}] + predictions = [ + make_representation('0 0 0 10 10; 1 0 0 11 11', score=[0.3, 0.8])[0], + make_representation('0 0 0 10 10; 1 0 0 11 11', 
score=[0.5, 0.4], meta=[{'difficult_boxes': [1]}])[0] + ] + + executor = PostprocessingExecutor(config) + postprocess_data(executor, [None, None], predictions) + + assert np.array_equal(predictions, expected_predictions) + + def test_filter_predictions_by_min_confidence_with_remove(self): + config = [{'type': 'filter', 'apply_to': 'prediction', 'min_confidence': 0.5, 'remove_filtered': True}] + predictions = [ + make_representation('0 0 0 10 10; 1 0 0 11 11', score=[0.3, 0.8])[0], + make_representation('0 0 0 10 10; 1 0 0 11 11', score=[0.5, 0.4])[0] + ] + expected_predictions = [ + make_representation('1 0 0 11 11', score=0.8)[0], + make_representation('0 0 0 10 10', score=0.5)[0] + ] + + postprocess_data(PostprocessingExecutor(config), [None, None], predictions) + + assert np.array_equal(predictions, expected_predictions) + + def test_filter_annotations_by_height_range_with_ignored(self): + config = [{ + 'type': 'filter', + 'apply_to': 'annotation', + 'height_range': '(10.0, 20.0)', + 'remove_filtered': False + }] + annotations = [ + make_representation('0 0 5 0 15; 1 0 10 0 15', is_ground_truth=True)[0], + make_representation('0 0 5 0 35; 1 0 10 0 40', is_ground_truth=True)[0] + ] + expected = [ + make_representation('0 0 5 0 15; 1 0 10 0 15', is_ground_truth=True, meta=[{'difficult_boxes': [1]}])[0], + make_representation('0 0 5 0 35; 1 0 10 0 40', is_ground_truth=True, meta=[{'difficult_boxes': [0, 1]}])[0] + ] + + postprocess_data(PostprocessingExecutor(config), annotations, [None, None]) + + assert np.array_equal(annotations, expected) + + def test_filter_annotations_by_height_range_with_remove(self): + config = [{'type': 'filter', 'apply_to': 'annotation', 'height_range': '(10.0, 20.0)', 'remove_filtered': True}] + annotations = [ + make_representation('0 0 5 0 15; 1 0 10 0 15', is_ground_truth=True)[0], + make_representation('0 0 5 0 35; 1 0 10 0 40', is_ground_truth=True)[0] + ] + expected = [ + make_representation('0 0 5 0 15', is_ground_truth=True)[0], + make_representation('', is_ground_truth=True)[0] + ] + + postprocess_data(PostprocessingExecutor(config), annotations, [None, None]) + + assert np.array_equal(annotations, expected) + + def test_filter_predictions_by_height_range_with_ignored(self): + config = [{ + 'type': 'filter', + 'apply_to': 'prediction', + 'height_range': '(10.0, 20.0)', + 'remove_filtered': False + }] + predictions = [ + make_representation('0 0 5 0 15; 1 0 10 0 15', score=1)[0], + make_representation('0 0 5 0 35; 1 0 10 0 40', score=1)[0] + ] + expected = [ + make_representation('0 0 5 0 15; 1 0 10 0 15', score=1, meta=[{'difficult_boxes': [1]}])[0], + make_representation('0 0 5 0 35; 1 0 10 0 40', score=1, meta=[{'difficult_boxes': [0, 1]}])[0] + ] + + postprocess_data(PostprocessingExecutor(config), [None, None], predictions) + + assert np.array_equal(predictions, expected) + + def test_filter_predictions_by_height_range_with_remove(self): + config = [{'type': 'filter', 'apply_to': 'prediction', 'height_range': '(10.0, 20.0)', 'remove_filtered': True}] + predictions = [ + make_representation('0 0 5 0 15; 1 0 10 0 15', score=1)[0], + make_representation('0 0 5 0 35; 1 0 10 0 40', score=1)[0] + ] + expected = [ + make_representation('0 0 5 0 15', score=1)[0], + make_representation('', score=1)[0] + ] + + postprocess_data(PostprocessingExecutor(config), [None, None], predictions) + + assert np.array_equal(predictions, expected) + + def test_filter_predictions_by_unknown_min_visibility_raises_value_error_exception(self): + config = [{'type': 'filter', 
'apply_to': 'prediction', 'min_visibility': 'unknown'}] + predictions = [ + make_representation('0 0 5 0 15; 1 0 10 0 15', score=1)[0], + make_representation('0 0 5 0 35; 1 0 10 0 40', score=1)[0] + ] + + with pytest.raises(ValueError): + postprocess_data(PostprocessingExecutor(config), [None], predictions) + + def test_filter_annotations_by_unknown_min_visibility_raises_value_error_exception(self): + config = [{'type': 'filter', 'apply_to': 'annotation', 'min_visibility': 'unknown'}] + annotations = [DetectionAnnotation(y_mins=[5.0, 10.0], y_maxs=[15.0, 40.0])] + + with pytest.raises(ValueError): + postprocess_data(PostprocessingExecutor(config), annotations, [None]) + + def test_filter_predictions_by_visibility_raises_value_error_with_unknown_visibility(self): + config = [{'type': 'filter', 'apply_to': 'prediction', 'min_visibility': 'heavy occluded'}] + predictions = [DetectionPrediction( + y_mins=[5.0, 10.0], y_maxs=[15.0, 40.0], metadata={'visibilities': ['unknown']} + )] + + with pytest.raises(ValueError): + postprocess_data(PostprocessingExecutor(config), [None], predictions) + + def test_filter_annotations_by_visibility_raises_value_error_with_unknown_visibility(self): + config = [{'type': 'filter', 'apply_to': 'annotation', 'min_visibility': 'heavy occluded'}] + annotations = [DetectionAnnotation( + y_mins=[5.0, 10.0], y_maxs=[15.0, 40.0], metadata={'visibilities': ['unknown']} + )] + + with pytest.raises(ValueError): + postprocess_data(PostprocessingExecutor(config), annotations, [None]) + + def test_filter_by_visibility_does_nothing_with_annotations_without_visibility(self): + config = [{'type': 'filter', 'apply_to': 'annotation', 'min_visibility': 'heavy occluded'}] + annotations = [ + make_representation('0 0 5 0 15; 1 0 10 0 15', is_ground_truth=True)[0], + make_representation('0 0 5 0 35; 1 0 10 0 40', is_ground_truth=True)[0] + ] + expected = [ + make_representation('0 0 5 0 15; 1 0 10 0 15', is_ground_truth=True, meta=[{'difficult_boxes': []}])[0], + make_representation('0 0 5 0 35; 1 0 10 0 40', is_ground_truth=True, meta=[{'difficult_boxes': []}])[0] + ] + + postprocess_data(PostprocessingExecutor(config), annotations, [None, None]) + + assert np.array_equal(annotations, expected) + + def test_filter_by_visibility_does_nothing_with_predictions_without_visibility(self): + config = [{'type': 'filter', 'apply_to': 'prediction', 'min_visibility': 'heavy occluded'}] + predictions = [ + DetectionPrediction(y_mins=[5.0, 10.0], y_maxs=[15.0, 40.0]), + DetectionPrediction(y_mins=[5.0, 10.0], y_maxs=[35.0, 50.0]) + ] + expected = [ + DetectionPrediction(y_mins=[5.0, 10.0], y_maxs=[15.0, 40.0], metadata={'difficult_boxes': []}), + DetectionPrediction(y_mins=[5.0, 10.0], y_maxs=[35.0, 50.0], metadata={'difficult_boxes': []}) + ] + + postprocess_data(PostprocessingExecutor(config), [None, None], predictions) + + assert np.array_equal(predictions, expected) + + def test_filter_by_visibility_does_nothing_with_default_visibility_level_and_heavy_occluded(self): + config = [{'type': 'filter', 'apply_to': 'annotation', 'min_visibility': 'heavy occluded'}] + annotation = make_representation('0 0 5 0 15; 1 0 10 0 15', is_ground_truth=True)[0] + expected = make_representation( + '0 0 5 0 15; 1 0 10 0 15', is_ground_truth=True, meta=[{'difficult_boxes': []}] + )[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [None]) + + assert annotation == expected + + def test_filter_by_visibility_does_nothing_with_default_visibility_level_and_partially_occluded(self): + config = 
[{'type': 'filter', 'apply_to': 'annotation', 'min_visibility': 'partially occluded'}] + annotation = make_representation('0 0 5 0 15; 1 0 10 0 15', is_ground_truth=True)[0] + expected = make_representation( + '0 0 5 0 15; 1 0 10 0 15', is_ground_truth=True, meta=[{'difficult_boxes': []}] + )[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [None]) + + assert annotation == expected + + def test_filter_by_visibility_filters_partially_occluded_remove_filtered(self): + config = [{'type': 'filter', 'apply_to': 'annotation', 'min_visibility': 'partially occluded', + 'remove_filtered': True}] + annotation = make_representation( + '0 0 5 0 15; 1 0 10 0 15', is_ground_truth=True, + meta=[{'visibilities': ['heavy occluded', 'partially occluded']}] + )[0] + expected = make_representation( + '1 0 10 0 15', is_ground_truth=True, meta=[{'visibilities': ['heavy occluded', 'partially occluded']}] + )[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [None]) + + assert annotation == expected + + def test_nms(self, mocker): + mock = mocker.patch('accuracy_checker.postprocessor.nms.NMS.process_all', return_value=([], [])) + config = [{'type': 'nms', 'overlap': 0.4}] + postprocess_data(PostprocessingExecutor(config), [], []) + mock.assert_called_once_with([], []) + + def test_resize_prediction_boxes(self): + config = [{'type': 'resize_prediction_boxes'}] + annotation = DetectionAnnotation(metadata={'image_size': [(100, 100, 3)]}) + prediction = make_representation('0 0 0 5 5; 1 7 7 8 8', score=1)[0] + expected = make_representation('0 0 0 500 500; 1 700 700 800 800', score=1)[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [prediction]) + + assert prediction == expected + + def test_clip_annotation_denormalized_boxes(self): + config = [{'type': 'clip_boxes', 'apply_to': 'annotation', 'boxes_normalized': False}] + meta = {'image_size': [(10, 10, 3)]} + annotation = make_representation('0 -1 0 5 5; 1 9 11 10 10', is_ground_truth=True, meta=[meta])[0] + expected = make_representation('0 0 0 5 5; 1 9 10 10 10', is_ground_truth=True, meta=[meta])[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [None]) + + assert annotation == expected + + def test_clip_annotation_normalized_boxes(self): + config = [{'type': 'clip_boxes', 'apply_to': 'annotation', 'boxes_normalized': True}] + meta = {'image_size': [(10, 10, 3)]} + annotation = make_representation('0 -1 0 5 5; 1 9 11 10 10', is_ground_truth=True, meta=[meta])[0] + expected = make_representation('0 0 0 1 1; 1 1 1 1 1', is_ground_truth=True, meta=[meta])[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [None]) + + assert annotation == expected + + def test_clip_annotation_denormalized_boxes_with_size(self): + config = [{'type': 'clip_boxes', 'apply_to': 'annotation', 'boxes_normalized': False, 'size': 10}] + meta = {'image_size': [(10, 10, 3)]} + annotation = make_representation('0 -1 0 5 5; 1 9 11 10 10', is_ground_truth=True, meta=[meta])[0] + expected = make_representation('0 0 0 5 5; 1 9 10 10 10', is_ground_truth=True, meta=[meta])[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [None]) + + assert annotation == expected + + def test_clip_annotation_normalized_boxes_with_size_as_normalized(self): + config = [{'type': 'clip_boxes', 'apply_to': 'annotation', 'boxes_normalized': True, 'size': 10}] + meta = {'image_size': [(10, 10, 3)]} + annotation = make_representation('0 -1 0 5 5; 1 9 11 10 10', is_ground_truth=True, meta=[meta])[0] + 
expected = make_representation('0 0 0 1 1; 1 1 1 1 1', is_ground_truth=True, meta=[meta])[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [None]) + + assert annotation == expected + + def test_clip_prediction_denormalized_boxes(self): + config = [{'type': 'clip_boxes', 'apply_to': 'prediction', 'boxes_normalized': False}] + annotation = DetectionAnnotation(metadata={'image_size': [(10, 10, 3)]}) + prediction = make_representation('0 -1 0 5 5; 1 9 11 10 10', score=1)[0] + expected = make_representation('0 0 0 5 5; 1 9 10 10 10', score=1)[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [prediction]) + + assert prediction == expected + + def test_clip_prediction_normalized_boxes(self): + config = [{'type': 'clip_boxes', 'apply_to': 'prediction', 'boxes_normalized': True}] + annotation = DetectionAnnotation(metadata={'image_size': [(10, 10, 3)]}) + prediction = make_representation('0 -1 0 5 5; 1 9 11 10 10', score=1)[0] + expected = make_representation('0 0 0 1 1; 1 1 1 1 1', score=1)[0] + postprocess_data(PostprocessingExecutor(config), [annotation], [prediction]) + + assert prediction == expected + + def test_clip_predictions_denormalized_boxes_with_size(self): + config = [{'type': 'clip_boxes', 'apply_to': 'prediction', 'boxes_normalized': False, 'size': 10}] + annotation = DetectionAnnotation(metadata={'image_size': [(10, 10, 3)]}) + prediction = make_representation('0 -1 0 5 5; 1 9 11 10 10', score=1)[0] + expected = make_representation('0 0 0 5 5; 1 9 10 10 10', score=1)[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [prediction]) + + assert prediction == expected + + def test_clip_predictions_normalized_boxes_with_size_as_normalized(self): + config = [{'type': 'clip_boxes', 'apply_to': 'prediction', 'boxes_normalized': True, 'size': 10}] + annotation = DetectionAnnotation(metadata={'image_size': [(10, 10, 3)]}) + prediction = make_representation('0 -1 0 5 5; 1 9 11 10 10', score=1)[0] + expected = make_representation('0 0 0 1 1; 1 1 1 1 1', score=1)[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [prediction]) + + assert prediction == expected + + def test_cast_to_int_default(self): + config = [{'type': 'cast_to_int'}] + annotation = make_representation('0 -1 0 5 5; 1 9 11 10 10', is_ground_truth=True)[0] + prediction = make_representation('0 -1.1 0.5 5.9 5.1; 1 -9.9 11.5 10.9 10.1', score=1)[0] + expected_annotation = make_representation('0 -1 0 5 5; 1 9 11 10 10', is_ground_truth=True)[0] + expected_prediction = make_representation('0 -1 0 6 5; 1 -10 12 11 10', score=1)[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [prediction]) + + assert prediction == expected_prediction and annotation == expected_annotation + + def test_cast_to_int_to_nearest(self): + config = [{'type': 'cast_to_int', 'round_policy': 'nearest'}] + annotation = make_representation('0 -1 0 5 5; 1 9 11 10 10', is_ground_truth=True)[0] + prediction = make_representation('0 -1.1 0.5 5.9 5.1; 1 -9.9 11.5 10.9 10.1', score=1)[0] + expected_annotation = make_representation('0 -1 0 5 5; 1 9 11 10 10', is_ground_truth=True)[0] + expected_prediction = make_representation('0 -1 0 6 5; 1 -10 12 11 10', score=1)[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [prediction]) + + assert prediction == expected_prediction and annotation == expected_annotation + + def test_cast_to_int_to_nearest_to_zero(self): + config = [{'type': 'cast_to_int', 'round_policy': 'nearest_to_zero'}] + annotation = 
make_representation('0 -1 0 5 5; 1 9 11 10 10', is_ground_truth=True)[0] + prediction = make_representation('0 -1.1 0.5 5.9 5.1; 1 -9.9 11.5 10.9 10.1', score=1)[0] + expected_annotation = make_representation('0 -1 0 5 5; 1 9 11 10 10', is_ground_truth=True)[0] + expected_prediction = make_representation('0 -1 0 5 5; 1 -9 11 10 10', score=1)[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [prediction]) + + assert prediction == expected_prediction and annotation == expected_annotation + + def test_cast_to_int_to_lower(self): + config = [{'type': 'cast_to_int', 'round_policy': 'lower'}] + annotation = make_representation('0 -1 0 5 5; 1 9 11 10 10', is_ground_truth=True)[0] + prediction = make_representation('0 -1.1 0.5 5.9 5.1; 1 -9.9 11.5 10.9 10.1', score=1)[0] + expected_annotation = make_representation('0 -1 0 5 5; 1 9 11 10 10', is_ground_truth=True)[0] + expected_prediction = make_representation('0 -2 0 5 5; 1 -10 11 10 10', score=1)[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [prediction]) + + assert prediction == expected_prediction and annotation == expected_annotation + + def test_cast_to_int_to_greater(self): + config = [{'type': 'cast_to_int', 'round_policy': 'greater'}] + annotation = make_representation('0 -1 0 5 5; 1 9 11 10 10', is_ground_truth=True)[0] + prediction = make_representation('0 -1.1 0.5 5.9 5.1; 1 -9.9 11.5 10.9 10.1', score=1)[0] + expected_annotation = make_representation('0 -1 0 5 5; 1 9 11 10 10', is_ground_truth=True)[0] + expected_prediction = make_representation('0 -1 1 6 6; 1 -9 12 11 11', score=1)[0] + + postprocess_data(PostprocessingExecutor(config), [annotation], [prediction]) + + assert prediction == expected_prediction and annotation == expected_annotation + + def test_cast_to_int_to_unknown_raise_config_error(self): + config = [{'type': 'cast_to_int', 'round_policy': 'unknown'}] + + with pytest.raises(ConfigError): + postprocess_data(PostprocessingExecutor(config), [None], [None]) + + def test_extend_segmentation_mask_with_float_filling_raise_config_error(self): + config = [{'type': 'extend_segmentation_mask', 'filling_label': 0.5}] + + with pytest.raises(ConfigError): + postprocess_data(PostprocessingExecutor(config), [None], [None]) + + def test_extend_segmentation_mask_default(self): + config = [{'type': 'extend_segmentation_mask'}] + annotation = make_segmentation_representation(np.zeros((5, 5)), ground_truth=True) + prediction = make_segmentation_representation(np.zeros((7, 7)), ground_truth=False) + expected_annotation_mask = np.zeros((7, 7)) + expected_annotation_mask[0, :] = 255 + expected_annotation_mask[:, 0] = 255 + expected_annotation_mask[-1, :] = 255 + expected_annotation_mask[:, -1] = 255 + expected_prediction_mask = np.zeros((7, 7)) + postprocess_data(PostprocessingExecutor(config), annotation, prediction) + assert np.array_equal(prediction[0].mask, expected_prediction_mask) + assert np.array_equal(annotation[0].mask, expected_annotation_mask) + + def test_extend_segmentation_mask_do_nothing(self): + config = [{'type': 'extend_segmentation_mask'}] + annotation = make_segmentation_representation(np.zeros((5, 5)), ground_truth=True) + prediction = make_segmentation_representation(np.zeros((5, 5)), ground_truth=False) + expected_mask = np.zeros((5, 5)) + postprocess_data(PostprocessingExecutor(config), annotation, prediction) + assert np.array_equal(prediction[0].mask, expected_mask) + assert np.array_equal(annotation[0].mask, expected_mask) + + def 
test_extend_segmentation_mask_asymmetrical(self): + config = [{'type': 'extend_segmentation_mask'}] + annotation = make_segmentation_representation(np.zeros((5, 5)), ground_truth=True) + prediction = make_segmentation_representation(np.zeros((6, 7)), ground_truth=False) + expected_annotation_mask = np.zeros((6, 7)) + expected_annotation_mask[:, 0] = 255 + expected_annotation_mask[-1, :] = 255 + expected_annotation_mask[:, -1] = 255 + expected_prediction_mask = np.zeros((6, 7)) + postprocess_data(PostprocessingExecutor(config), annotation, prediction) + assert np.array_equal(prediction[0].mask, expected_prediction_mask) + assert np.array_equal(annotation[0].mask, expected_annotation_mask) + + def test_extend_segmentation_mask_raise_config_error_if_prediction_less_annotation(self): + config = [{'type': 'extend_segmentation_mask'}] + annotation = make_segmentation_representation(np.zeros((5, 5)), ground_truth=True) + prediction = make_segmentation_representation(np.zeros((4, 4)), ground_truth=False) + with pytest.raises(ConfigError): + postprocess_data(PostprocessingExecutor(config), annotation, prediction) + + def test_extend_segmentation_mask_with_filling_label(self): + config = [{'type': 'extend_segmentation_mask', 'filling_label': 1}] + annotation = make_segmentation_representation(np.zeros((5, 5)), ground_truth=True) + prediction = make_segmentation_representation(np.zeros((7, 7)), ground_truth=False) + expected_annotation_mask = np.zeros((7, 7)) + expected_annotation_mask[0, :] = 1 + expected_annotation_mask[:, 0] = 1 + expected_annotation_mask[-1, :] = 1 + expected_annotation_mask[:, -1] = 1 + expected_prediction_mask = np.zeros((7, 7)) + postprocess_data(PostprocessingExecutor(config), annotation, prediction) + assert np.array_equal(prediction[0].mask, expected_prediction_mask) + assert np.array_equal(annotation[0].mask, expected_annotation_mask) + + +class TestPostprocessorExtraArgs: + def test_cast_to_int_raise_config_error_on_extra_args(self): + config = {'type': 'cast_to_int', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + postprocess_data(PostprocessingExecutor(config), [None], [None]) + + def test_clip_boxes_raise_config_error_on_extra_args(self): + config = {'type': 'clip_boxes', 'size': 1, 'something_extra': 'extra'} + with pytest.raises(ConfigError): + postprocess_data(PostprocessingExecutor(config), [None], [None]) + + def test_correct_yolo_v2_boxes_raise_config_error_on_extra_args(self): + config = {'type': 'correct_yolo_v2_boxes', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + postprocess_data(PostprocessingExecutor(config), [None], [None]) + + def test_encode_segmentation_mask_raise_config_error_on_extra_args(self): + config = {'type': 'encode_segmentation_mask', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + postprocess_data(PostprocessingExecutor(config), [None], [None]) + + def test_filter_raise_config_error_on_extra_args(self): + config = {'type': 'filter', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + postprocess_data(PostprocessingExecutor(config), [None], [None]) + + def test_nms_raise_config_error_on_extra_args(self): + config = {'type': 'nms', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + postprocess_data(PostprocessingExecutor(config), [None], [None]) + + def test_normalize_landmarks_points_raise_config_error_on_extra_args(self): + config = {'type': 'normalize_landmarks_points', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + 
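# as in the other extra-args tests in this class, strict config validation is expected + # to reject the unknown 'something_extra' key before any data is processed +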
postprocess_data(PostprocessingExecutor(config), [None], [None]) + + def test_resize_prediction_boxes_raise_config_error_on_extra_args(self): + config = {'type': 'resize_prediction_boxes', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + postprocess_data(PostprocessingExecutor(config), [None], [None]) + + def test_resize_segmentation_mask_raise_config_error_on_extra_args(self): + config = {'type': 'resize_segmentation_mask', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + postprocess_data(PostprocessingExecutor(config), [None], [None]) + + def test_extend_segmentation_mask_raise_config_error_on_extra_args(self): + config = {'type': 'extend_segmentation_mask', 'something_extra': 'extra'} + with pytest.raises(ConfigError): + postprocess_data(PostprocessingExecutor(config), [None], [None]) diff --git a/tools/accuracy_checker/tests/test_preprocessor.py b/tools/accuracy_checker/tests/test_preprocessor.py new file mode 100644 index 0000000..339fb8c --- /dev/null +++ b/tools/accuracy_checker/tests/test_preprocessor.py @@ -0,0 +1,610 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import cv2 +import numpy as np +import pytest + +from accuracy_checker.config import ConfigError +from accuracy_checker.preprocessor import ( + Crop, + Normalize, + Preprocessor, + Resize, + Flip, + BgrToRgb, + CropRect, + ExtendAroundRect, + PointAligner +) +from accuracy_checker.preprocessor.preprocessing_executor import PreprocessingExecutor +from accuracy_checker.dataset import DataRepresentation + + +class TestResize: + def test_default_resize(self, mocker): + cv2_resize_mock = mocker.patch('accuracy_checker.preprocessor.preprocessors.cv2.resize') + resize = Preprocessor.provide('resize', {'type': 'resize', 'size': 200}) + + input_mock = mocker.Mock() + resize(DataRepresentation(input_mock)) + + assert not resize.use_pil + assert resize.dst_width == 200 + assert resize.dst_height == 200 + cv2_resize_mock.assert_called_once_with( + input_mock, (200, 200), interpolation=Resize.OPENCV_INTERPOLATION['LINEAR'] + ) + + def test_custom_resize(self, mocker): + cv2_resize_mock = mocker.patch('accuracy_checker.preprocessor.preprocessors.cv2.resize') + + resize = Preprocessor.provide( + 'resize', {'type': 'resize', 'dst_width': 126, 'dst_height': 128, 'interpolation': 'CUBIC'} + ) + + input_mock = mocker.Mock() + resize(DataRepresentation(input_mock)) + + assert not resize.use_pil + assert resize.dst_width == 126 + assert resize.dst_height == 128 + cv2_resize_mock.assert_called_once_with( + input_mock, (126, 128), + interpolation=Resize.OPENCV_INTERPOLATION['CUBIC'] + ) + + def test_resize_without_save_aspect_ratio(self): + name = 'mock_preprocessor' + config = {'type': 'resize', 'dst_width': 150, 'dst_height': 150} + input_image = np.ones((100, 50, 3)) + resize = Preprocessor.provide('resize', config, name) + + result = resize(DataRepresentation(input_image)).data + + assert result.shape == (150, 150, 3) + + def
test_resize_save_aspect_ratio_unknown_raise_config_error(self): + with pytest.raises(ConfigError): + Preprocessor.provide( + 'resize', {'type': 'resize', 'dst_width': 100, 'dst_height': 150, 'aspect_ratio_scale': 'unknown'} + ) + + def test_resize_save_aspect_ratio_height(self): + input_image = np.ones((100, 50, 3)) + resize = Preprocessor.provide('resize', { + 'type': 'resize', 'dst_width': 100, 'dst_height': 150, + 'interpolation': 'CUBIC', 'aspect_ratio_scale': 'height' + }) + result = resize(DataRepresentation(input_image)).data + + assert result.shape == (300, 100, 3) + + def test_resize_save_aspect_ratio_width(self): + input_image = np.ones((100, 50, 3)) + resize = Preprocessor.provide('resize', { + 'type': 'resize', 'dst_width': 150, 'dst_height': 150, 'aspect_ratio_scale': 'width' + }) + result = resize(DataRepresentation(input_image)).data + + assert result.shape == (150, 75, 3) + + def test_resize_save_aspect_ratio_for_greater_dim(self): + input_image = np.ones((100, 50, 3)) + resize = Preprocessor.provide('resize', { + 'type': 'resize', + 'dst_width': 100, + 'dst_height': 150, + 'aspect_ratio_scale': 'greater' + }) + result = resize(DataRepresentation(input_image)).data + + assert result.shape == (300, 100, 3) + + def test_resize_to_negative_size_raise_config_error(self): + with pytest.raises(ConfigError): + Preprocessor.provide('resize', {'type': 'resize', 'size': -100}) + + def test_resize_to_negative_destination_width_raise_config_error(self): + with pytest.raises(ConfigError): + Preprocessor.provide('resize', {'type': 'resize', 'dst_width': -100, 'dst_height': 100}) + + def test_resize_to_negative_destination_height_raise_config_error(self): + with pytest.raises(ConfigError): + Preprocessor.provide('resize', {'type': 'resize', 'dst_width': 100, 'dst_height': -100}) + + def test_resize_with_both_provided_size_and_dst_height_dst_width_warn(self): + input_image = np.ones((100, 50, 3)) + + with pytest.warns(None) as warnings: + resize = Preprocessor.provide( + 'resize', {'type': 'resize', 'dst_width': 100, 'dst_height': 100, 'size': 200} + ) + assert len(warnings) == 1 + result = resize(DataRepresentation(input_image)).data + assert result.shape == (200, 200, 3) + + def test_resize_provided_only_dst_height_raise_config_error(self): + with pytest.raises(ValueError): + Preprocessor.provide('resize', {'type': 'resize', 'dst_height': 100}) + + def test_resize_provided_only_dst_width_raise_config_error(self): + with pytest.raises(ValueError): + Preprocessor.provide('resize', {'type': 'resize', 'dst_width': 100}) + + +class TestNormalization: + def test_normalization_without_mean_and_std_raise_config_error(self): + with pytest.raises(ConfigError): + Preprocessor.provide('normalization', {'type': 'normalization'}) + + def test_custom_normalization_with_mean(self): + normalization = Preprocessor.provide('normalization', {'type': 'normalization', 'mean': '(1, 2, 3)'}) + source = np.full_like((3, 300, 300), 100) + input_ref = source.copy() - (1, 2, 3) + result = normalization(DataRepresentation(source)) + + assert normalization.mean == (1, 2, 3) + assert normalization.std is None + assert np.all(input_ref == result.data) + assert result.metadata == {'image_size': (3,)} + + def test_custom_normalization_with_precomputed_mean(self): + normalization = Preprocessor.provide('normalization', {'type': 'normalization', 'mean': 'cifar10'}) + + source = np.full_like((3, 300, 300), 100) + input_ref = source.copy() - normalization.PRECOMPUTED_MEANS['cifar10'] + result = 
normalization(DataRepresentation(source)) + + assert normalization.mean == normalization.PRECOMPUTED_MEANS['cifar10'] + assert normalization.std is None + assert np.all(input_ref == result.data) + assert result.metadata == {'image_size': (3,)} + + def test_custom_normalization_with_mean_as_scalar(self): + normalization = Preprocessor.provide('normalization', {'type': 'normalization', 'mean': '1'}) + + source = np.full_like((3, 300, 300), 100) + input_ref = source.copy() - 1 + result = normalization(DataRepresentation(source)) + + assert normalization.mean == (1.0, ) + assert normalization.std is None + assert np.all(input_ref == result.data) + assert result.metadata == {'image_size': (3,)} + + def test_custom_normalization_with_std(self): + normalization = Preprocessor.provide('normalization', {'type': 'normalization', 'std': '(1, 2, 3)'}) + + source = np.full_like((3, 300, 300), 100) + input_ref = source.copy() / (1, 2, 3) + result = normalization(DataRepresentation(source)) + + assert normalization.mean is None + assert normalization.std == (1, 2, 3) + assert np.all(input_ref == result.data) + assert result.metadata == {'image_size': (3,)} + + def test_custom_normalization_with_precomputed_std(self): + normalization = Preprocessor.provide('normalization', {'type': 'normalization', 'std': 'cifar10'}) + + source = np.full_like((3, 300, 300), 100) + input_ref = source.copy() / normalization.PRECOMPUTED_STDS['cifar10'] + result = normalization(DataRepresentation(source)) + + assert normalization.mean is None + assert normalization.std == normalization.PRECOMPUTED_STDS['cifar10'] + assert np.all(input_ref == result.data) + assert result.metadata == {'image_size': (3,)} + + def test_custom_normalization_with_std_as_scalar(self): + normalization = Preprocessor.provide('normalization', {'type': 'normalization', 'std': '2'}) + source = np.full_like((3, 300, 300), 100) + input_ref = source.copy() / 2 + result = normalization(DataRepresentation(source)) + + assert normalization.mean is None + assert normalization.std == (2.0, ) + assert np.all(input_ref == result.data) + assert result.metadata == {'image_size': (3,)} + + def test_custom_normalization_with_mean_and_std(self): + normalization = Preprocessor.provide( + 'normalization', {'type': 'normalization', 'mean': '(1, 2, 3)', 'std': '(4, 5, 6)'} + ) + + input_ = np.full_like((3, 300, 300), 100) + input_ref = (input_ - (1, 2, 3)) / (4, 5, 6) + result = normalization(DataRepresentation(input_)) + + assert normalization.mean == (1, 2, 3) + assert normalization.std == (4, 5, 6) + assert np.all(input_ref == result.data) + assert result.metadata == {'image_size': (3,)} + + def test_custom_normalization_with_mean_and_std_as_scalars(self): + normalization = Preprocessor.provide('normalization', {'type': 'normalization', 'mean': '2', 'std': '5'}) + + input_ = np.full_like((3, 300, 300), 100) + input_ref = (input_ - (2, )) / (5, ) + result = normalization(DataRepresentation(input_)) + + assert normalization.mean == (2, ) + assert normalization.std == (5, ) + assert np.all(input_ref == result.data) + assert result.metadata == {'image_size': (3,)} + + def test_normalization_with_zero_in_std_values_raise_config_error(self): + with pytest.raises(ConfigError): + Preprocessor.provide('normalization', {'type': 'normalization', 'std': '(4, 0, 6)'}) + + def test_normalization_with_zero_as_std_value_raise_config_error(self): + with pytest.raises(ConfigError): + Preprocessor.provide('normalization', {'type': 'normalization', 'std': '0'}) + + def 
test_normalization_with_not_channel_wise_mean_list_raise_config_error(self): + with pytest.raises(ConfigError): + Preprocessor.provide('normalization', {'type': 'normalization', 'mean': '3, 2'}) + + def test_normalization_with_not_channel_wise_std_list_raise_config_error(self): + with pytest.raises(ConfigError): + Preprocessor.provide('normalization', {'type': 'normalization', 'std': '3, 2'}) + + def test_normalization_with_unknown_precomputed_mean_raise_config_error(self): + with pytest.raises(ValueError): + Preprocessor.provide('normalization', {'type': 'normalization', 'mean': 'unknown'}) + + def test_normalization_with_unknown_precomputed_std_raise_config_error(self): + with pytest.raises(ValueError): + Preprocessor.provide('normalization', {'type': 'normalization', 'std': 'unknown'}) + + +class TestPreprocessingEvaluator: + def test_preprocessing_evaluator(self): + config = [{'type': 'normalization', 'mean': '(1, 2, 3)'}, {'type': 'resize', 'size': 200}] + preprocessor = PreprocessingExecutor(config) + + assert 2 == len(preprocessor.processors) + assert isinstance(preprocessor.processors[0], Normalize) + assert isinstance(preprocessor.processors[1], Resize) + assert preprocessor.processors[0].mean == (1, 2, 3) + assert preprocessor.processors[1].dst_width == 200 + + +class TestCrop: + def test_crop_higher(self): + crop = Crop({'dst_width': 50, 'dst_height': 33, 'type': 'crop'}) + image = np.zeros((100, 100, 3)) + image_rep = crop(DataRepresentation(image)) + + assert image_rep.data.shape == (33, 50, 3) + assert image_rep.metadata == {'image_size': (100, 100, 3)} + + def test_crop_to_size(self): + crop = Crop({'size': 50, 'type': 'crop'}) + image = np.zeros((100, 100, 3)) + image_rep = crop(DataRepresentation(image)) + + assert image_rep.data.shape == (50, 50, 3) + assert image_rep.metadata == {'image_size': (100, 100, 3)} + + def test_crop_higher_non_symmetric(self): + crop = Crop({'dst_width': 50, 'dst_height': 12, 'type': 'crop'}) + image = np.zeros((70, 50, 3)) + image_rep = crop(DataRepresentation(image)) + + assert image_rep.data.shape == (12, 50, 3) + assert image_rep.metadata == {'image_size': (70, 50, 3)} + + def test_crop_less(self): + crop = Crop({'dst_width': 151, 'dst_height': 42, 'type': 'crop'}) + image = np.zeros((30, 30, 3)) + image_rep = crop(DataRepresentation(image)) + + assert image_rep.data.shape == (42, 151, 3) + assert image_rep.metadata == {'image_size': (30, 30, 3)} + + def test_crop_less_non_symmetric(self): + crop = Crop({'dst_width': 42, 'dst_height': 151, 'type': 'crop'}) + image = np.zeros((30, 40, 3)) + image_rep = crop(DataRepresentation(image)) + + assert image_rep.data.shape == (151, 42, 3) + assert image_rep.metadata == {'image_size': (30, 40, 3)} + + def test_crop_to_negative_size_raise_config_error(self): + with pytest.raises(ConfigError): + Crop({'size': -151, 'type': 'crop'}) + + def test_crop_to_negative_destination_width_raise_config_error(self): + with pytest.raises(ConfigError): + Crop({'dst_width': -100, 'dst_height': 100, 'type': 'crop'}) + + def test_crop_to_negative_destination_height_raise_config_error(self): + with pytest.raises(ConfigError): + Crop({'dst_width': 100, 'dst_height': -100, 'type': 'crop'}) + + def test_crop_with_both_provided_size_and_dst_height_dst_width_warn(self): + image = np.zeros((30, 40, 3)) + with pytest.warns(None) as warnings: + crop = Crop({'dst_width': 100, 'dst_height': 100, 'size': 200, 'type': 'crop'}) + assert len(warnings) == 1 + result = crop.process(DataRepresentation(image)) + assert result.data.shape 
== (200, 200, 3) + assert result.metadata == {'image_size': (30, 40, 3)} + + +class TestFlip: + def test_horizontal_flip(self): + image = np.random.randint(0, 255, (30, 40, 3)) + expected_image = cv2.flip(image, 0) + flip = Flip({'type': 'flip', 'mode': 'horizontal'}) + assert np.array_equal(expected_image, flip.process(DataRepresentation(image)).data) + + def test_vertical_flip(self): + image = np.random.randint(0, 255, (30, 40, 3)) + expected_image = cv2.flip(image, 1) + flip = Flip({'type': 'flip', 'mode': 'vertical'}) + assert np.array_equal(expected_image, flip.process(DataRepresentation(image)).data) + + def test_flip_raise_config_error_if_mode_not_provided(self): + with pytest.raises(ConfigError): + Flip({'type': 'flip'}) + + def test_flip_raise_config_error_if_mode_unknown(self): + with pytest.raises(ConfigError): + Flip({'type': 'flip', 'mode': 'unknown'}) + + +class TestBGRtoRGB: + def test_bgr_to_rgb(self): + image = np.random.randint(0, 255, (30, 40, 3)).astype(np.uint8) + expected_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + bgr_to_rgb = BgrToRgb({'type': 'bgr_to_rgb'}) + assert np.array_equal(expected_image, bgr_to_rgb.process(DataRepresentation(image)).data) + + +class TestCropRect: + def test_crop_rect_if_rect_not_provided(self): + image = np.zeros((30, 40, 3)) + crop_rect = CropRect({'type': 'crop_rect'}) + assert np.array_equal(image, crop_rect(DataRepresentation(image), {}).data) + + def test_crop_rect_if_rect_equal_image(self): + image = np.zeros((30, 40, 3)) + crop_rect = CropRect({'type': 'crop_rect'}) + assert np.array_equal(image, crop_rect(DataRepresentation(image), {'rect': [0, 0, 40, 30]}).data) + + def test_crop_rect(self): + image = np.zeros((30, 40, 3)) + image[:, 20:, :] = 1 + expected_image = np.ones((30, 20, 3)) + crop_rect = CropRect({'type': 'crop_rect'}) + assert np.array_equal(expected_image, crop_rect(DataRepresentation(image), {'rect': [20, 0, 40, 30]}).data) + + def test_crop_rect_negative_coordinates_of_rect(self): + image = np.zeros((30, 40, 3)) + image[:, 20:, :] = 1 + expected_image = image + crop_rect = CropRect({'type': 'crop_rect'}) + assert np.array_equal(expected_image, crop_rect(DataRepresentation(image), {'rect': [-20, 0, 40, 30]}).data) + + def test_crop_rect_more_image_size_coordinates_of_rect(self): + image = np.zeros((30, 40, 3)) + image[:, 20:, :] = 1 + expected_image = np.ones((30, 20, 3)) + crop_rect = CropRect({'type': 'crop_rect'}) + assert np.array_equal(expected_image, crop_rect(DataRepresentation(image), {'rect': [20, 0, 40, 50]}).data) + + +class TestExtendAroundRect: + def test_default_extend_around_rect_without_rect(self): + image = np.random.randint(0, 255, (30, 40, 3)).astype(np.uint8) + expected_image = image + extend_image_around_rect = ExtendAroundRect({'type': 'extend_around_rect'}) + assert np.array_equal(expected_image, extend_image_around_rect(DataRepresentation(image), {}).data) + + def test_default_extend_around_rect(self): + image = np.random.randint(0, 255, (30, 40, 3)).astype(np.uint8) + expected_image = image + extend_image_around_rect = ExtendAroundRect({'type': 'extend_around_rect'}) + assert np.array_equal( + expected_image, extend_image_around_rect(DataRepresentation(image), {'rect': [20, 0, 40, 30]}).data + ) + + def test_extend_around_rect_with_positive_augmentation(self): + image = np.random.randint(0, 255, (30, 40, 3)).astype(np.uint8) + expected_image = cv2.copyMakeBorder(image, int(15.5), int(31), int(0), int(11), cv2.BORDER_REPLICATE) + extend_image_around_rect = ExtendAroundRect({'type': 'extend_around_rect',
'augmentation_param': 0.5}) + assert np.array_equal( + expected_image, extend_image_around_rect(DataRepresentation(image), {'rect': [20, 0, 40, 30]}).data + ) + + def test_extend_around_rect_with_negative_augmentation(self): + image = np.random.randint(0, 255, (30, 40, 3)).astype(np.uint8) + expected_image = image + extend_image_around_rect = ExtendAroundRect({'type': 'extend_around_rect', 'augmentation_param': -0.5}) + assert np.array_equal( + expected_image, extend_image_around_rect(DataRepresentation(image), {'rect': [20, 0, 40, 30]}).data + ) + + def test_extend_around_rect_with_rect_equal_image(self): + image = np.random.randint(0, 255, (30, 40, 3)).astype(np.uint8) + expected_image = cv2.copyMakeBorder(image, int(15.5), int(31), int(20.5), int(41), cv2.BORDER_REPLICATE) + extend_image_around_rect = ExtendAroundRect({'type': 'extend_around_rect', 'augmentation_param': 0.5}) + assert np.array_equal( + expected_image, extend_image_around_rect(DataRepresentation(image), {'rect': [0, 0, 40, 30]}).data + ) + + def test_extend_around_rect_negative_coordinates_of_rect(self): + image = np.random.randint(0, 255, (30, 40, 3)).astype(np.uint8) + expected_image = cv2.copyMakeBorder(image, int(15.5), int(31), int(20.5), int(41), cv2.BORDER_REPLICATE) + extend_image_around_rect = ExtendAroundRect({'type': 'extend_around_rect', 'augmentation_param': 0.5}) + assert np.array_equal( + expected_image, extend_image_around_rect(DataRepresentation(image), {'rect': [-20, 0, 40, 30]}).data + ) + + def test_extend_around_rect_more_image_size_coordinates_of_rect(self): + image = np.random.randint(0, 255, (30, 40, 3)).astype(np.uint8) + expected_image = cv2.copyMakeBorder(image, int(15.5), int(31), int(0), int(11), cv2.BORDER_REPLICATE) + extend_image_around_rect = ExtendAroundRect({'type': 'extend_around_rect', 'augmentation_param': 0.5}) + assert np.array_equal( + expected_image, extend_image_around_rect(DataRepresentation(image), {'rect': [20, 0, 40, 50]}).data + ) + + +class TestPointAlignment: + def test_point_alignment_width_negative_size_raise_config_error(self): + with pytest.raises(ConfigError): + PointAligner({'type': 'point_alignment', 'size': -100}) + + def test_point_alignment_negative_destination_width_raise_config_error(self): + with pytest.raises(ConfigError): + PointAligner({'type': 'point_alignment', 'dst_width': -100, 'dst_height': 100}) + + def test_point_alignment_to_negative_destination_height_raise_config_error(self): + with pytest.raises(ValueError): + PointAligner({'type': 'point_alignment', 'dst_width': 100, 'dst_height': -100}) + + def test_point_alignment_provided_only_dst_height_raise_config_error(self): + with pytest.raises(ValueError): + PointAligner({'type': 'point_alignment', 'dst_height': 100}) + + def test_point_alignment_provided_only_dst_width_raise_config_error(self): + with pytest.raises(ValueError): + PointAligner({'type': 'point_alignment', 'dst_width': 100}) + + def test_point_alignment_both_provided_size_and_dst_height_dst_width_warn(self): + input_image = np.ones((100, 50, 3)) + + with pytest.warns(None) as warnings: + point_aligner = PointAligner({'type': 'point_alignment', 'dst_width': 100, 'dst_height': 100, 'size': 200}) + assert len(warnings) == 1 + result = point_aligner(DataRepresentation(input_image), {}).data + assert result.shape == (100, 50, 3) + + def test_point_alignment_not_provided_points_im_meta(self): + input_image = np.ones((100, 50, 3)) + + point_aligner = PointAligner({'type': 'point_alignment', 'dst_width': 100, 'dst_height': 100}) + result = 
point_aligner(DataRepresentation(input_image), {}).data + assert result.shape == (100, 50, 3) + + def test_point_alignment_default_use_normalization(self): + image = np.random.randint(0, 255, (40, 40, 3)).astype(np.uint8) + + point_aligner = PointAligner({'type': 'point_alignment', 'dst_width': 40, 'dst_height': 40}) + result = point_aligner( + DataRepresentation(image), {'keypoints': PointAligner.ref_landmarks.reshape(-1).tolist()} + ).data + transformation_matrix = point_aligner.transformation_from_points( + point_aligner.ref_landmarks * 40, point_aligner.ref_landmarks + ) + expected_result = cv2.warpAffine(image, transformation_matrix, (40, 40), flags=cv2.WARP_INVERSE_MAP) + + assert np.array_equal(result, expected_result) + + def test_point_alignment_use_normalization(self): + image = np.random.randint(0, 255, (40, 40, 3)).astype(np.uint8) + + point_aligner = PointAligner({'type': 'point_alignment', 'dst_width': 40, 'dst_height': 40, 'normalize': True}) + result = point_aligner( + DataRepresentation(image), {'keypoints': PointAligner.ref_landmarks.reshape(-1).tolist()} + ).data + transformation_matrix = point_aligner.transformation_from_points( + point_aligner.ref_landmarks * 40, point_aligner.ref_landmarks + ) + expected_result = cv2.warpAffine(image, transformation_matrix, (40, 40), flags=cv2.WARP_INVERSE_MAP) + + assert np.array_equal(result, expected_result) + + def test_point_alignment_without_normalization(self): + image = np.random.randint(0, 255, (40, 40, 3)).astype(np.uint8) + + point_aligner = PointAligner({'type': 'point_alignment', 'dst_width': 40, 'dst_height': 40, 'normalize': False}) + result = point_aligner( + DataRepresentation(image), {'keypoints': PointAligner.ref_landmarks.reshape(-1).tolist()} + ).data + transformation_matrix = point_aligner.transformation_from_points( + point_aligner.ref_landmarks * 40, point_aligner.ref_landmarks * 40 + ) + expected_result = cv2.warpAffine(image, transformation_matrix, (40, 40), flags=cv2.WARP_INVERSE_MAP) + + assert np.array_equal(result, expected_result) + + def test_point_alignment_with_drawing_points(self): + image = np.random.randint(0, 255, (40, 40, 3)).astype(np.uint8) + + point_aligner = PointAligner({ + 'type': 'point_alignment', 'dst_width': 40, 'dst_height': 40, 'draw_points': True + }) + result = point_aligner( + DataRepresentation(image), {'keypoints': PointAligner.ref_landmarks.reshape(-1).tolist()} + ).data + transformation_matrix = point_aligner.transformation_from_points( + point_aligner.ref_landmarks * 40, point_aligner.ref_landmarks + ) + expected_result = image + for point in PointAligner.ref_landmarks: + cv2.circle(expected_result, (int(point[0]), int(point[1])), 5, (255, 0, 0), -1) + expected_result = cv2.warpAffine(expected_result, transformation_matrix, (40, 40), flags=cv2.WARP_INVERSE_MAP) + + assert np.array_equal(result, expected_result) + + def test_point_alignment_with_resizing(self): + image = np.random.randint(0, 255, (80, 80, 3)).astype(np.uint8) + + point_aligner = PointAligner({'type': 'point_alignment', 'size': 40}) + result = point_aligner( + DataRepresentation(image), {'keypoints': PointAligner.ref_landmarks.reshape(-1).tolist()} + ).data + transformation_matrix = point_aligner.transformation_from_points( + point_aligner.ref_landmarks * 40, point_aligner.ref_landmarks * 0.5 + ) + expected_result = cv2.resize(image, (40, 40)) + expected_result = cv2.warpAffine(expected_result, transformation_matrix, (40, 40), flags=cv2.WARP_INVERSE_MAP) + + assert np.array_equal(result, expected_result) + + 
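# Each preprocessor exercised below is expected to validate its configuration strictly, + # so an unrecognized key such as 'something_extra' must raise ConfigError. +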
+class TestPreprocessorExtraArgs: + def test_resize_raise_config_error_on_extra_args(self): + with pytest.raises(ConfigError): + Preprocessor.provide('resize', {'type': 'resize', 'size': 1, 'something_extra': 'extra'}) + + def test_normalization_raise_config_error_on_extra_args(self): + with pytest.raises(ConfigError): + Preprocessor.provide('normalization', {'type': 'normalization', 'mean': 0, 'something_extra': 'extra'}) + + def test_bgr_to_rgb_raise_config_error_on_extra_args(self): + with pytest.raises(ConfigError): + Preprocessor.provide('bgr_to_rgb', {'type': 'bgr_to_rgb', 'something_extra': 'extra'}) + + def test_flip_raise_config_error_on_extra_args(self): + with pytest.raises(ConfigError): + Preprocessor.provide('flip', {'type': 'flip', 'something_extra': 'extra'}) + + def test_crop_accuracy_raise_config_error_on_extra_args(self): + with pytest.raises(ConfigError): + Preprocessor.provide('crop', {'type': 'crop', 'size': 1, 'something_extra': 'extra'}) + + def test_extend_around_rect_raise_config_error_on_extra_args(self): + with pytest.raises(ConfigError): + Preprocessor.provide('extend_around_rect', {'type': 'extend_around_rect', 'something_extra': 'extra'}) + + def test_point_alignment_raise_config_error_on_extra_args(self): + with pytest.raises(ConfigError): + Preprocessor.provide('point_alignment', {'type': 'point_alignment', 'something_extra': 'extra'}) diff --git a/tools/accuracy_checker/tests/test_presenter.py b/tools/accuracy_checker/tests/test_presenter.py new file mode 100644 index 0000000..3980f24 --- /dev/null +++ b/tools/accuracy_checker/tests/test_presenter.py @@ -0,0 +1,348 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import numpy as np +import pytest +from unittest.mock import MagicMock, call +from accuracy_checker.metrics import MetricsExecutor +from accuracy_checker.presenters import ScalarPrintPresenter, VectorPrintPresenter, EvaluationResult +from accuracy_checker.representation import ClassificationAnnotation, ClassificationPrediction + + +class TestPresenter: + def test_config_default_presenter(self): + annotations = [ClassificationAnnotation('identifier', 3)] + predictions = [ClassificationPrediction('identifier', [1.0, 1.0, 1.0, 4.0])] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy', 'top_k': 1}]} + dispatcher = MetricsExecutor(config, None) + dispatcher.update_metrics_on_batch(annotations, predictions) + + for presenter, _ in dispatcher.iterate_metrics(annotations, predictions): + assert isinstance(presenter, ScalarPrintPresenter) + + def test_config_scalar_presenter(self): + annotations = [ClassificationAnnotation('identifier', 3)] + predictions = [ClassificationPrediction('identifier', [1.0, 1.0, 1.0, 4.0])] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy', 'top_k': 1, 'presenter': 'print_scalar'}]} + dispatcher = MetricsExecutor(config, None) + dispatcher.update_metrics_on_batch(annotations, predictions) + + for presenter, _ in dispatcher.iterate_metrics(annotations, predictions): + assert isinstance(presenter, ScalarPrintPresenter) + + def test_config_vector_presenter(self): + annotations = [ClassificationAnnotation('identifier', 3)] + predictions = [ClassificationPrediction('identifier', [1.0, 1.0, 1.0, 4.0])] + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy', 'top_k': 1, 'presenter': 'print_vector'}]} + dispatcher = MetricsExecutor(config, None) + dispatcher.update_metrics_on_batch(annotations, predictions) + + for presenter, _ in dispatcher.iterate_metrics(annotations, predictions): + assert isinstance(presenter, VectorPrintPresenter) + + def test_config_unknown_presenter(self): + config = {'annotation': 'mocked', 'metrics': [{'type': 'accuracy', 'top_k': 1, 'presenter': 'print_somehow'}]} + with pytest.raises(ValueError): + MetricsExecutor(config, None) + + def test_scalar_presenter_with_scalar_data(self, mocker): + mock_write_scalar_result = mocker.patch('accuracy_checker.presenters.write_scalar_result') # type: MagicMock + result = EvaluationResult( + name='scalar_metric', + evaluated_value=0.1, + reference_value=None, + threshold=None, + meta={}, + ) + presenter = ScalarPrintPresenter() + presenter.write_result(result) + mock_write_scalar_result.assert_called_once_with( + result.evaluated_value, + result.name, + result.reference_value, + result.threshold, + postfix='%', + scale=100, + result_format='{:.2f}' + ) + + def test_scalar_presenter_with_vector_data(self, mocker): + mock_write_scalar_res = mocker.patch('accuracy_checker.presenters.write_scalar_result') # type: MagicMock + result = EvaluationResult( + name='vector_metric', + evaluated_value=[0.4, 0.6], + reference_value=None, + threshold=None, + meta={}, + ) + presenter = ScalarPrintPresenter() + presenter.write_result(result) + mock_write_scalar_res.assert_called_once_with( + np.mean(result.evaluated_value), + result.name, + result.reference_value, + result.threshold, + postfix='%', + scale=100, + result_format='{:.2f}' + ) + + def test_default_format_for_scalar_presenter_with_ignore_formatting(self, mocker): + mock_write_scalar_res = mocker.patch('accuracy_checker.presenters.write_scalar_result') # type: MagicMock + result = EvaluationResult( + 
name='vector_metric', + evaluated_value=[0.456], + reference_value=None, + threshold=None, + meta={}, + ) + presenter = ScalarPrintPresenter() + presenter.write_result(result, ignore_results_formatting=True) + mock_write_scalar_res.assert_called_once_with( + np.mean(result.evaluated_value), + result.name, + result.reference_value, + result.threshold, + postfix=' ', + scale=1, + result_format='{}' + ) + + def test_specific_format_for_scalar_presenter_with_ignore_formatting(self, mocker): + mock_write_scalar_res = mocker.patch('accuracy_checker.presenters.write_scalar_result') # type: MagicMock + result = EvaluationResult( + name='vector_metric', + evaluated_value=[0.456], + reference_value=None, + threshold=None, + meta={'scale': 0.5, 'postfix': 'km/h', 'data_format': '{:.4f}'}, + ) + presenter = ScalarPrintPresenter() + presenter.write_result(result, ignore_results_formatting=True) + mock_write_scalar_res.assert_called_once_with( + np.mean(result.evaluated_value), + result.name, + result.reference_value, + result.threshold, + postfix=' ', + scale=1, + result_format='{}' + ) + + def test_vector_presenter_with_scalar_data(self, mocker): + mock_write_scalar_res = mocker.patch('accuracy_checker.presenters.write_scalar_result') # type: MagicMock + result = EvaluationResult( + name='scalar_metric', + evaluated_value=0.4, + reference_value=None, + threshold=None, + meta={}, + ) + presenter = VectorPrintPresenter() + presenter.write_result(result) + mock_write_scalar_res.assert_called_once_with( + result.evaluated_value, + result.name, + result.reference_value, + result.threshold, + postfix='%', + scale=100, + value_name=None, + result_format='{:.2f}' + ) + + def test_vector_presenter_with_vector_data_contain_one_element(self, mocker): + mock_write_scalar_res = mocker.patch('accuracy_checker.presenters.write_scalar_result') # type: MagicMock + result = EvaluationResult( + name='scalar_metric', + evaluated_value=[0.4], + reference_value=None, + threshold=None, + meta={'names': ['prediction']} + ) + presenter = VectorPrintPresenter() + presenter.write_result(result) + mock_write_scalar_res.assert_called_once_with( + result.evaluated_value, + result.name, + result.reference_value, + result.threshold, + postfix='%', + scale=100, + value_name=result.meta['names'][0], + result_format='{:.2f}' + ) + + def test_vector_presenter_with_vector_data_with_default_postfix_and_scale(self, mocker): + mock_write_scalar_res = mocker.patch('accuracy_checker.presenters.write_scalar_result') # type: MagicMock + result = EvaluationResult( + name='scalar_metric', + evaluated_value=[0.4, 0.6], + reference_value=None, + threshold=None, + meta={'names': ['class1', 'class2']} + ) + presenter = VectorPrintPresenter() + presenter.write_result(result) + calls = [ + call( + result.evaluated_value[0], result.name, result.reference_value, result.threshold, + postfix='%', scale=100, value_name=result.meta['names'][0], result_format='{:.2f}' + ), + call( + result.evaluated_value[1], result.name, result.reference_value, result.threshold, + postfix='%', scale=100, value_name=result.meta['names'][1], result_format='{:.2f}' + ), + call( + np.mean(np.multiply(result.evaluated_value, 100)), result.name, result.reference_value, + result.threshold, value_name='mean', postfix='%', scale=1, result_format='{:.2f}' + ) + ] + mock_write_scalar_res.assert_has_calls(calls) + + def test_vector_presenter_with_vector_data_has_default_format_with_ignore_formatting(self, mocker): + mock_write_scalar_res =
mocker.patch('accuracy_checker.presenters.write_scalar_result') # type: MagicMock + result = EvaluationResult( + name='scalar_metric', + evaluated_value=[0.4, 0.6], + reference_value=None, + threshold=None, + meta={'names': ['class1', 'class2']} + ) + presenter = VectorPrintPresenter() + presenter.write_result(result, ignore_results_formatting=True) + calls = [ + call( + result.evaluated_value[0], result.name, result.reference_value, result.threshold, + postfix=' ', scale=1, value_name=result.meta['names'][0], result_format='{}' + ), + call( + result.evaluated_value[1], result.name, result.reference_value, result.threshold, + postfix=' ', scale=1, value_name=result.meta['names'][1], result_format='{}' + ), + call( + np.mean(np.multiply(result.evaluated_value, 1)), result.name, result.reference_value, result.threshold, + value_name='mean', postfix=' ', scale=1, result_format='{}' + ) + ] + mock_write_scalar_res.assert_has_calls(calls) + + def test_vector_presenter_with_vector_data_has_specific_format_with_ignore_formatting(self, mocker): + mock_write_scalar_res = mocker.patch('accuracy_checker.presenters.write_scalar_result') # type: MagicMock + result = EvaluationResult( + name='scalar_metric', + evaluated_value=[0.4, 0.6], + reference_value=None, + threshold=None, + meta={'names': ['class1', 'class2'], 'scale': 0.5, 'postfix': 'km/h', 'data_format': '{:.4f}'} + ) + presenter = VectorPrintPresenter() + presenter.write_result(result, ignore_results_formatting=True) + calls = [ + call( + result.evaluated_value[0], result.name, result.reference_value, result.threshold, + postfix=' ', scale=1, value_name=result.meta['names'][0], result_format='{}' + ), + call( + result.evaluated_value[1], result.name, result.reference_value, result.threshold, + postfix=' ', scale=1, value_name=result.meta['names'][1], result_format='{}' + ), + call( + np.mean(np.multiply(result.evaluated_value, 1)), result.name, result.reference_value, result.threshold, + value_name='mean', postfix=' ', scale=1, result_format='{}' + ) + ] + mock_write_scalar_res.assert_has_calls(calls) + + def test_vector_presenter_with_vector_data_with_scalar_postfix(self, mocker): + mock_write_scalar_res = mocker.patch('accuracy_checker.presenters.write_scalar_result') # type: MagicMock + result = EvaluationResult( + name='scalar_metric', + evaluated_value=[0.4, 0.6], + reference_value=None, + threshold=None, + meta={'names': ['class1', 'class2'], 'postfix': '_'} + ) + presenter = VectorPrintPresenter() + presenter.write_result(result) + calls = [ + call(result.evaluated_value[0], result.name, result.reference_value, result.threshold, + postfix=result.meta['postfix'], scale=100, value_name=result.meta['names'][0], result_format='{:.2f}' + ), + call( + result.evaluated_value[1], result.name, result.reference_value, result.threshold, + postfix=result.meta['postfix'], scale=100, value_name=result.meta['names'][1], result_format='{:.2f}' + ), + call( + np.mean(np.multiply(result.evaluated_value, 100)), result.name, result.reference_value, + result.threshold, value_name='mean', postfix=result.meta['postfix'], scale=1, result_format='{:.2f}' + ) + ] + mock_write_scalar_res.assert_has_calls(calls) + + def test_vector_presenter_with_vector_data_with_scalar_scale(self, mocker): + mock_write_scalar_res = mocker.patch('accuracy_checker.presenters.write_scalar_result') # type: MagicMock + result = EvaluationResult( + name='scalar_metric', + evaluated_value=[0.4, 0.6], + reference_value=None, + threshold=None, + meta={'names': ['class1', 'class2'], 
'scale': 10} + ) + presenter = VectorPrintPresenter() + presenter.write_result(result) + calls = [ + call( + result.evaluated_value[0], result.name, result.reference_value, result.threshold, + postfix='%', scale=result.meta['scale'], value_name=result.meta['names'][0], result_format='{:.2f}' + ), + call( + result.evaluated_value[1], result.name, result.reference_value, result.threshold, + postfix='%', scale=result.meta['scale'], value_name=result.meta['names'][1], result_format='{:.2f}' + ), + call( + np.mean(np.multiply(result.evaluated_value, result.meta['scale'])), result.name, result.reference_value, + result.threshold, value_name='mean', postfix='%', scale=1, result_format='{:.2f}' + ) + ] + mock_write_scalar_res.assert_has_calls(calls) + + def test_vector_presenter_with_vector_data_with_vector_scale(self, mocker): + mock_write_scalar_res = mocker.patch('accuracy_checker.presenters.write_scalar_result') # type: MagicMock + result = EvaluationResult( + name='scalar_metric', + evaluated_value=[0.4, 0.6], + reference_value=None, + threshold=None, + meta={'names': ['class1', 'class2'], 'scale': [1, 2]} + ) + presenter = VectorPrintPresenter() + presenter.write_result(result) + calls = [ + call( + result.evaluated_value[0], result.name, result.reference_value, result.threshold, + postfix='%', scale=result.meta['scale'][0], result_format='{:.2f}', value_name=result.meta['names'][0] + ), + call( + result.evaluated_value[1], result.name, result.reference_value, result.threshold, postfix='%', + scale=result.meta['scale'][1], result_format='{:.2f}', value_name=result.meta['names'][1] + ), + call( + np.mean(np.multiply(result.evaluated_value, result.meta['scale'])), result.name, result.reference_value, + result.threshold, result_format='{:.2f}', value_name='mean', postfix='%', scale=1 + ) + ] + mock_write_scalar_res.assert_has_calls(calls) diff --git a/tools/accuracy_checker/tests/test_regression_metrics.py b/tools/accuracy_checker/tests/test_regression_metrics.py new file mode 100644 index 0000000..3829b5a --- /dev/null +++ b/tools/accuracy_checker/tests/test_regression_metrics.py @@ -0,0 +1,338 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import pytest +from accuracy_checker.metrics import MetricsExecutor +from accuracy_checker.representation import RegressionPrediction, RegressionAnnotation +from accuracy_checker.presenters import EvaluationResult + + +class TestRegressionMetric: + def setup_method(self): + self.module = 'accuracy_checker.metrics.metric_evaluator' + + def test_mae_with_zero_diff_between_annotation_and_prediction(self): + annotations = [RegressionAnnotation('identifier', 3)] + predictions = [RegressionPrediction('identifier', 3)] + config = {'annotation': 'mocked', 'metrics': [{'type': 'mae'}]} + expected = EvaluationResult( + pytest.approx([0.0, 0.0]), + None, + 'mae', + None, + {'postfix': ' ', 'scale': 1, 'names': ['mean', 'std'], 'calculate_mean': False} + ) + dispatcher = MetricsExecutor(config, None) + + dispatcher.update_metrics_on_batch(annotations, predictions) + + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + def test_mae_with_negative_diff_between_annotation_and_prediction(self): + annotations = [RegressionAnnotation('identifier', 3), RegressionAnnotation('identifier2', 1)] + predictions = [RegressionPrediction('identifier', 5), RegressionPrediction('identifier2', 5)] + config = {'annotation': 'mocked', 'metrics': [{'type': 'mae'}]} + expected = EvaluationResult( + pytest.approx([3.0, 1.0]), + None, + 'mae', + None, + {'postfix': ' ', 'scale': 1, 'names': ['mean', 'std'], 'calculate_mean': False} + ) + dispatcher = MetricsExecutor(config, None) + + dispatcher.update_metrics_on_batch(annotations, predictions) + + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + def test_mae_with_positive_diff_between_annotation_and_prediction(self): + annotations = [RegressionAnnotation('identifier', 3), RegressionAnnotation('identifier2', 1)] + predictions = [RegressionPrediction('identifier', 1), RegressionPrediction('identifier2', -3)] + config = {'annotation': 'mocked', 'metrics': [{'type': 'mae'}]} + expected = EvaluationResult( + pytest.approx([3.0, 1.0]), + None, + 'mae', + None, + {'postfix': ' ', 'scale': 1, 'names': ['mean', 'std'], 'calculate_mean': False} + ) + dispatcher = MetricsExecutor(config, None) + + dispatcher.update_metrics_on_batch(annotations, predictions) + + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + def test_mse_with_zero_diff_between_annotation_and_prediction(self): + annotations = [RegressionAnnotation('identifier', 3)] + predictions = [RegressionPrediction('identifier', 3)] + config = {'annotation': 'mocked', 'metrics': [{'type': 'mse'}]} + expected = EvaluationResult( + pytest.approx([0.0, 0.0]), + None, + 'mse', + None, + {'postfix': ' ', 'scale': 1, 'names': ['mean', 'std'], 'calculate_mean': False} + ) + dispatcher = MetricsExecutor(config, None) + + dispatcher.update_metrics_on_batch(annotations, predictions) + + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + def test_mse_with_negative_diff_between_annotation_and_prediction(self): + annotations = [RegressionAnnotation('identifier', 3), RegressionAnnotation('identifier2', 1)] + predictions = [RegressionPrediction('identifier', 5), RegressionPrediction('identifier2', 5)] + config = {'annotation': 'mocked', 'metrics': [{'type': 'mse'}]} + expected = EvaluationResult( + pytest.approx([10.0, 6.0]), + None, + 'mse', + 
None, + {'postfix': ' ', 'scale': 1, 'names': ['mean', 'std'], 'calculate_mean': False} + ) + dispatcher = MetricsExecutor(config, None) + + dispatcher.update_metrics_on_batch(annotations, predictions) + + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + def test_mse_with_positive_diff_between_annotation_and_prediction(self): + annotations = [RegressionAnnotation('identifier', 3), RegressionAnnotation('identifier2', 1)] + predictions = [RegressionPrediction('identifier', 1), RegressionPrediction('identifier2', -3)] + config = {'annotation': 'mocked', 'metrics': [{'type': 'mse'}]} + expected = EvaluationResult( + pytest.approx([10.0, 6.0]), + None, + 'mse', + None, + {'postfix': ' ', 'scale': 1, 'names': ['mean', 'std'], 'calculate_mean': False} + ) + dispatcher = MetricsExecutor(config, None) + + dispatcher.update_metrics_on_batch(annotations, predictions) + + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + def test_missed_interval(self): + config = {'annotation': 'mocked', 'metrics': [{'type': 'mae_on_interval'}]} + with pytest.raises(ValueError): + MetricsExecutor(config, None) + + def test_mae_on_interval_default_all_missed(self): + annotations = [RegressionAnnotation('identifier', -2)] + predictions = [RegressionPrediction('identifier', 1)] + config = {'annotation': 'mocked', 'metrics': [{'type': 'mae_on_interval', 'end': 1}]} + expected = EvaluationResult( + pytest.approx([0.0]), + None, + 'mae_on_interval', + None, + {'postfix': ' ', 'scale': 1, 'names': [], 'calculate_mean': False} + ) + dispatcher = MetricsExecutor(config, None) + + dispatcher.update_metrics_on_batch(annotations, predictions) + + with pytest.warns(UserWarning) as warnings: + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert len(warnings) == 1 + assert evaluation_result == expected + + def test_mae_on_interval_default_all_not_in_range_not_ignore_out_of_range(self): + annotations = [RegressionAnnotation('identifier', -1), RegressionAnnotation('identifier', 2)] + predictions = [RegressionPrediction('identifier', 1), RegressionPrediction('identifier', 2)] + expected = EvaluationResult( + pytest.approx([2.0, 0.0, 0.0, 0.0]), + None, + 'mae_on_interval', + None, + { + 'postfix': ' ', + 'scale': 1, + 'names': ['mean: < 0.0', 'std: < 0.0', 'mean: > 1.0', 'std: > 1.0'], + 'calculate_mean': False + } + ) + config = { + 'annotation': 'mocked', + 'metrics': [{'type': 'mae_on_interval', 'end': 1, 'ignore_values_not_in_interval': False}] + } + dispatcher = MetricsExecutor(config, None) + + dispatcher.update_metrics_on_batch(annotations, predictions) + + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + def test_mae_on_interval_values_in_range(self): + annotations = [RegressionAnnotation('identifier', 0.5), RegressionAnnotation('identifier', 0.5)] + predictions = [RegressionPrediction('identifier', 1), RegressionPrediction('identifier', 0.25)] + config = {'annotation': 'mocked', 'metrics': [{'type': 'mae_on_interval', 'end': 1}]} + expected = EvaluationResult( + pytest.approx([0.375, 0.125]), + None, + 'mae_on_interval', + None, + {'postfix': ' ', 'scale': 1, 'names': ['mean: <= 0.0 < 1.0', 'std: <= 0.0 < 1.0'], 'calculate_mean': False} + ) + dispatcher = MetricsExecutor(config, None) + + dispatcher.update_metrics_on_batch(annotations, predictions) + + for _, 
evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + def test_mae_on_interval_default_not_ignore_out_of_range(self): + annotations = [ + RegressionAnnotation('identifier', -1), + RegressionAnnotation('identifier', 2), + RegressionAnnotation('identifier', 0.5) + ] + predictions = [ + RegressionPrediction('identifier', 1), + RegressionPrediction('identifier', 2), + RegressionPrediction('identifier', 1) + ] + config = { + 'annotation': 'mocked', + 'metrics': [{'type': 'mae_on_interval', 'end': 1, 'ignore_values_not_in_interval': False}] + } + expected = EvaluationResult( + pytest.approx([2.0, 0.0, 0.5, 0.0, 0.0, 0.0]), + None, + 'mae_on_interval', + None, + { + 'postfix': ' ', + 'scale': 1, + 'names': [ + 'mean: < 0.0', + 'std: < 0.0', + 'mean: <= 0.0 < 1.0', + 'std: <= 0.0 < 1.0', + 'mean: > 1.0', + 'std: > 1.0' + ], + 'calculate_mean': False + } + ) + dispatcher = MetricsExecutor(config, None) + + dispatcher.update_metrics_on_batch(annotations, predictions) + + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + def test_mae_on_interval_with_given_interval(self): + annotations = [ + RegressionAnnotation('identifier', -1), + RegressionAnnotation('identifier', 2), + RegressionAnnotation('identifier', 1) + ] + predictions = [ + RegressionPrediction('identifier', 1), + RegressionPrediction('identifier', 3), + RegressionPrediction('identifier', 1) + ] + config = { + 'annotation': 'mocked', + 'metrics': [{'type': 'mae_on_interval', 'intervals': [0.0, 2.0, 4.0]}] + } + expected = EvaluationResult( + pytest.approx([0.0, 0.0, 1.0, 0.0]), + None, + 'mae_on_interval', + None, + { + 'postfix': ' ', + 'scale': 1, + 'names': ['mean: <= 0.0 < 2.0', 'std: <= 0.0 < 2.0', 'mean: <= 2.0 < 4.0', 'std: <= 2.0 < 4.0'], + 'calculate_mean': False + } + ) + dispatcher = MetricsExecutor(config, None) + + dispatcher.update_metrics_on_batch(annotations, predictions) + + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + def test_mae_on_interval_with_repeated_values(self): + annotations = [ + RegressionAnnotation('identifier', -1), + RegressionAnnotation('identifier', 2), + RegressionAnnotation('identifier', 1) + ] + predictions = [ + RegressionPrediction('identifier', 1), + RegressionPrediction('identifier', 3), + RegressionPrediction('identifier', 1) + ] + config = {'annotation': 'mocked', 'metrics': [{'type': 'mae_on_interval', 'intervals': [0.0, 2.0, 2.0, 4.0]}]} + expected = EvaluationResult( + pytest.approx([0.0, 0.0, 1.0, 0.0]), + None, + 'mae_on_interval', + None, + { + 'postfix': ' ', + 'scale': 1, + 'names': ['mean: <= 0.0 < 2.0', 'std: <= 0.0 < 2.0', 'mean: <= 2.0 < 4.0', 'std: <= 2.0 < 4.0'], + 'calculate_mean': False + } + ) + dispatcher = MetricsExecutor(config, None) + + dispatcher.update_metrics_on_batch(annotations, predictions) + + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + def test_mae_on_interval_with_unsorted_values(self): + annotations = [ + RegressionAnnotation('identifier', -1), + RegressionAnnotation('identifier', 2), + RegressionAnnotation('identifier', 1) + ] + predictions = [ + RegressionPrediction('identifier', 1), + RegressionPrediction('identifier', 3), + RegressionPrediction('identifier', 1) + ] + config = {'annotation': 'mocked', 'metrics': [{'type': 'mae_on_interval', 'intervals': [2.0, 0.0, 
4.0]}]} + expected = EvaluationResult( + pytest.approx([0.0, 0.0, 1.0, 0.0]), + None, + 'mae_on_interval', + None, + { + 'postfix': ' ', 'scale': 1, + 'names': ['mean: <= 0.0 < 2.0', 'std: <= 0.0 < 2.0', 'mean: <= 2.0 < 4.0', 'std: <= 2.0 < 4.0'], + 'calculate_mean': False + } + ) + dispatcher = MetricsExecutor(config, None) + + dispatcher.update_metrics_on_batch(annotations, predictions) + + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected diff --git a/tools/accuracy_checker/tests/test_reid_metrics.py b/tools/accuracy_checker/tests/test_reid_metrics.py new file mode 100644 index 0000000..b73008a --- /dev/null +++ b/tools/accuracy_checker/tests/test_reid_metrics.py @@ -0,0 +1,77 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy as np +from accuracy_checker.metrics.reid import eval_cmc + + +class TestCMC: + def test_only_distance_matrix(self): + distance_matrix = np.array([ + [0, 1, 2, 3, 4], + [1, 0, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [1, 2, 3, 4, 0] + ]) + m, n = distance_matrix.shape + + result = eval_cmc( + distance_matrix, + query_ids=np.arange(m), + gallery_ids=np.arange(n), + query_cams=np.zeros(m).astype(np.int32), + gallery_cams=np.ones(n).astype(np.int32) + ) + + assert np.all(result[:5] == [0.6, 0.6, 0.8, 1.0, 1.0]) + + def test_duplicate_ids(self): + distance_matrix = np.array([ + [0, 1, 2, 3], + [0, 1, 2, 3], + [0, 1, 2, 3], + [0, 1, 2, 3] + ]) + + result = eval_cmc( + distance_matrix, + query_ids=np.array([0, 0, 1, 1]), + gallery_ids=np.array([0, 0, 1, 1]), + top_k=4, + gallery_cams=np.ones(distance_matrix.shape[1]).astype(np.int32), + query_cams=np.zeros(distance_matrix.shape[0]).astype(np.int32), + separate_camera_set=False, + single_gallery_shot=False + ) + + assert np.all(result == [0.5, 0.5, 1, 1]) + + def test_duplicate_cams(self): + distance_matrix = np.tile(np.arange(5), (5, 1)) + + result = eval_cmc( + distance_matrix, + query_ids=np.array([0, 0, 0, 1, 1]), + gallery_ids=np.array([0, 0, 0, 1, 1]), + query_cams=np.array([0, 0, 0, 0, 0]), + gallery_cams=np.array([0, 1, 1, 1, 1]), + top_k=5, + separate_camera_set=False, + single_gallery_shot=False + ) + + assert np.all(result == [0.6, 0.6, 0.6, 1, 1]) diff --git a/tools/accuracy_checker/tests/test_segmentation_metrics.py b/tools/accuracy_checker/tests/test_segmentation_metrics.py new file mode 100644 index 0000000..03095fc --- /dev/null +++ b/tools/accuracy_checker/tests/test_segmentation_metrics.py @@ -0,0 +1,164 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +""" + +import pytest +import numpy as np +from accuracy_checker.metrics import MetricsExecutor +from accuracy_checker.presenters import EvaluationResult +from .common import single_class_dataset, multi_class_dataset, make_segmentation_representation + + +def create_config(metric_name, use_argmax=False): + return {'annotation': 'mocked', 'metrics': [{'type': metric_name, 'use_argmax': use_argmax}]} + + +def generate_expected_result(values, metric_name, labels=None): + meta = {'names': list(labels.values())} if labels else {} + + return EvaluationResult(pytest.approx(values), None, metric_name, None, meta) + + +class TestPixelAccuracy: + name = 'segmentation_accuracy' + + def test_one_class(self): + annotations = make_segmentation_representation(np.array([[0, 0], [0, 0]]), True) + predictions = make_segmentation_representation(np.array([[0, 0], [0, 0]]), False) + dispatcher = MetricsExecutor(create_config(self.name), single_class_dataset()) + dispatcher.update_metrics_on_batch(annotations, predictions) + expected = generate_expected_result(1.0, self.name) + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + def test_multi_class_not_matched(self): + annotations = make_segmentation_representation(np.array([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]), True) + predictions = make_segmentation_representation(np.array([[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]), False) + dispatcher = MetricsExecutor(create_config(self.name), multi_class_dataset()) + dispatcher.update_metrics_on_batch(annotations, predictions) + expected = generate_expected_result(0.0, self.name) + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + def test_multi_class(self): + annotations = make_segmentation_representation(np.array([[1, 0, 3, 0, 0], [0, 0, 0, 0, 0]]), True) + predictions = make_segmentation_representation(np.array([[1, 2, 3, 2, 3], [0, 0, 0, 0, 0]]), False) + dispatcher = MetricsExecutor(create_config(self.name), multi_class_dataset()) + dispatcher.update_metrics_on_batch(annotations, predictions) + expected = generate_expected_result((5.0+1.0+1.0)/(8.0+1.0+1.0), self.name) + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + +class TestMeanAccuracy: + name = 'mean_accuracy' + + def test_one_class(self): + annotations = make_segmentation_representation(np.array([[0, 0], [0, 0]]), True) + predictions = make_segmentation_representation(np.array([[0, 0], [0, 0]]), False) + dataset = single_class_dataset() + dispatcher = MetricsExecutor(create_config(self.name), dataset) + dispatcher.update_metrics_on_batch(annotations, predictions) + expected = generate_expected_result([1.0, 0.0], self.name, dataset.labels) + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + def test_multi_class_not_matched(self): + annotations = make_segmentation_representation(np.array([[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]), True) + predictions = make_segmentation_representation(np.array([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]), False) + dataset = multi_class_dataset() + dispatcher = MetricsExecutor(create_config(self.name), dataset) + dispatcher.update_metrics_on_batch(annotations, predictions) + expected = generate_expected_result([0.0, 0.0, 0.0, 0.0], self.name, dataset.labels) + 
for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + def test_multi_class(self): + dataset = multi_class_dataset() + annotations = make_segmentation_representation(np.array([[1, 2, 3, 2, 3], [0, 0, 0, 0, 0]]), True) + predictions = make_segmentation_representation(np.array([[1, 0, 3, 0, 0], [0, 0, 0, 0, 0]]), False) + dispatcher = MetricsExecutor(create_config(self.name), dataset) + dispatcher.update_metrics_on_batch(annotations, predictions) + expected = generate_expected_result([1.0, 1.0, 0.0, 0.5], self.name, dataset.labels) + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + +class TestMeanIOU: + name = 'mean_iou' + + def test_one_class(self): + annotations = make_segmentation_representation(np.array([[0, 0], [0, 0]]), True) + predictions = make_segmentation_representation(np.array([[0, 0], [0, 0]]), False) + dataset = single_class_dataset() + dispatcher = MetricsExecutor(create_config(self.name), dataset) + dispatcher.update_metrics_on_batch(annotations, predictions) + expected = generate_expected_result([1.0, 0.0], self.name, dataset.labels) + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + def test_multi_class_not_matched(self): + annotations = make_segmentation_representation(np.array([[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]), True) + predictions = make_segmentation_representation(np.array([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]), False) + dataset = multi_class_dataset() + dispatcher = MetricsExecutor(create_config(self.name), dataset) + dispatcher.update_metrics_on_batch(annotations, predictions) + expected = generate_expected_result([0.0, 0.0, 0.0, 0.0], self.name, dataset.labels) + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + def test_multi_class(self): + dataset = multi_class_dataset() + annotations = make_segmentation_representation(np.array([[1, 2, 3, 2, 3], [0, 0, 0, 0, 0]]), True) + predictions = make_segmentation_representation(np.array([[1, 0, 3, 0, 0], [0, 0, 0, 0, 0]]), False) + dispatcher = MetricsExecutor(create_config(self.name), dataset) + dispatcher.update_metrics_on_batch(annotations, predictions) + expected = generate_expected_result([0.625, 1.0, 0.0, 0.5], self.name, dataset.labels) + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + +class TestSegmentationFWAcc: + name = 'frequency_weighted_accuracy' + + def test_one_class(self): + annotations = make_segmentation_representation(np.array([[0, 0], [0, 0]]), True) + predictions = make_segmentation_representation(np.array([[0, 0], [0, 0]]), False) + dataset = single_class_dataset() + dispatcher = MetricsExecutor(create_config(self.name), dataset) + dispatcher.update_metrics_on_batch(annotations, predictions) + expected = generate_expected_result(1.0, self.name) + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + def test_multi_class_not_matched(self): + annotations = make_segmentation_representation(np.array([[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]), True) + predictions = make_segmentation_representation(np.array([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]), False) + dataset = multi_class_dataset() + dispatcher = MetricsExecutor(create_config(self.name), dataset) + 
dispatcher.update_metrics_on_batch(annotations, predictions) + expected = generate_expected_result(0.0, self.name) + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected + + def test_multi_class(self): + dataset = multi_class_dataset() + annotations = make_segmentation_representation(np.array([[1, 2, 3, 2, 3], [0, 0, 0, 0, 0]]), True) + predictions = make_segmentation_representation(np.array([[1, 0, 3, 0, 0], [0, 0, 0, 0, 0]]), False) + dispatcher = MetricsExecutor(create_config(self.name), dataset) + dispatcher.update_metrics_on_batch(annotations, predictions) + expected = generate_expected_result(0.5125, self.name) + for _, evaluation_result in dispatcher.iterate_metrics(annotations, predictions): + assert evaluation_result == expected diff --git a/tools/accuracy_checker/tests/test_utils.py b/tools/accuracy_checker/tests/test_utils.py new file mode 100644 index 0000000..4ac9cdf --- /dev/null +++ b/tools/accuracy_checker/tests/test_utils.py @@ -0,0 +1,127 @@ +""" +Copyright (c) 2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from accuracy_checker.utils import concat_lists, contains_all, contains_any, overrides, zipped_transform + + +def test_concat_lists(): + assert ['a', 'b'] == concat_lists(['a'], ['b']) + assert ['a', 'b', 'c'] == concat_lists(['a'], ['b'], ['c']) + assert ['a', 'b', 'c'] == concat_lists(['a', 'b'], ['c']) + assert ['a'] == concat_lists(['a'], []) + assert [] == concat_lists([], [], []) + assert [] == concat_lists([]) + + +def test_contains_all(): + assert contains_all([1, 2, 3], [1, 2]) + assert contains_all([1, 2, 3], [1, 2], [3]) + assert not contains_all([1, 2, 3], [1, 5]) + + +def test_contains_any(): + assert contains_any([1, 2, 3], [1]) + assert contains_any([1, 2, 3], [4, 5, 2]) + assert not contains_any([1, 2, 3], [4, 5]) + + +class TestZippedTransform: + def test_two_iterables(self): + as_ = [2, 3, 5] + bs = [2, 3, 6] + + ras, rbs = zipped_transform(lambda a, b: (a + b, a - b), as_, bs) + + assert ras == [4, 6, 11] + assert rbs == [0, 0, -1] + assert as_ == [2, 3, 5] + assert bs == [2, 3, 6] + + def test_inplace(self): + as_ = [2, 3, 5] + bs = [2, 3, 6] + + zipped_transform(lambda a, b: (a + b, a - b), as_, bs, inplace=True) + + assert as_ == [4, 6, 11] + assert bs == [0, 0, -1] + + def test_three_iterables(self): + as_ = [1, 1, 1] + bs = [2, 2, 2] + cs = [3, 3, 3] + + ras, rbs, rcs = zipped_transform(lambda a, b, c: (a + 1, b + 2, c + 3), as_, bs, cs) + + assert ras == [2, 2, 2] + assert rbs == [4, 4, 4] + assert rcs == [6, 6, 6] + + def test_none_function(self): + xs = [1, 1, 1] + ys = [1, 1, 1] + zipped_transform(lambda a, b: None, xs, ys) + + +class TestOverrides: + def test_negative(self): + class A: + def foo(self): + pass + + class B(A): + pass + + assert not overrides(B, 'foo') + assert not overrides(B(), 'foo') + + def test_positive(self): + class A: + def foo(self): + pass + + class B(A): + def foo(self): + pass + + assert overrides(B, 'foo') + assert overrides(B(), 'foo') + 
+    def test_three_class(self):
+        class A:
+            def foo(self): pass
+
+        class B(A):
+            pass
+
+        class C(B):
+            def foo(self): pass
+
+        assert overrides(C, 'foo')
+        assert not overrides(B, 'foo')
+
+    def test_custom_base(self):
+        class A:
+            def foo(self): pass
+
+        class B:
+            def foo(self): pass
+
+        class C:
+            pass
+
+        assert overrides(B, 'foo', A)
+        assert not overrides(C, 'foo', A)
diff --git a/tools/benchmark/README.md b/tools/benchmark/README.md
new file mode 100644
index 0000000..16dcdc0
--- /dev/null
+++ b/tools/benchmark/README.md
@@ -0,0 +1,41 @@
+# OpenVINO™ Benchmark Python* package
+The Inference Engine `openvino.tools.benchmark` Python\* package includes types to measure synchronous mode latency.
+The package depends on the `openvino.tools.accuracy_checker` package.
+
+Please refer to https://docs.openvinotoolkit.org for details.
+
+## Usage
+You can use the `openvino.tools.benchmark` package in a simple way:
+```Python
+import openvino.tools.benchmark as benchmark
+
+config = benchmark.CommandLineReader.read()
+result = benchmark.Benchmark(config).run()
+print("{0}: {1:.4} ms".format(config.model, result.latency * 1000.0))
+```
+### Explanation
+1. Import `openvino.tools.benchmark` types:
+```Python
+import openvino.tools.benchmark as benchmark
+```
+
+2. Read configuration and execute the benchmark:
+```Python
+config = benchmark.CommandLineReader.read()
+result = benchmark.Benchmark(config).run()
+```
+
+3. Print results:
+```Python
+print("{0}: {1:.4} ms".format(config.model, result.latency * 1000.0))
+```
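+
+The number of benchmark iterations can be overridden per run; a minimal sketch, assuming the configuration produced by `CommandLineReader` (`run()` accepts an optional `iterations_count`, 1000 by default):
+```Python
+import openvino.tools.benchmark as benchmark
+
+config = benchmark.CommandLineReader.read()
+# fewer iterations finish faster at the cost of a noisier latency estimate
+result = benchmark.Benchmark(config).run(iterations_count=100)
+print("{0}: {1:.4} ms".format(config.model, result.latency * 1000.0))
+```
\ No newline at end of file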
+""" + +import openvino.tools.benchmark as benchmark + + +def benchmark(): + + config = benchmark.CommandLineReader.read() + result = benchmark.Benchmark(config).run() + print("{0}: {1:.4} ms".format(config.model, result.latency * 1000.0)) + + +if __name__ == '__main__': + benchmark() diff --git a/tools/benchmark/benchmark.py b/tools/benchmark/benchmark.py new file mode 100644 index 0000000..07cc845 --- /dev/null +++ b/tools/benchmark/benchmark.py @@ -0,0 +1,157 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import numpy +import datetime + +import openvino.inference_engine as ie + +from ..accuracy_checker.accuracy_checker.config import ConfigReader +from ..accuracy_checker.accuracy_checker.model_evaluator import ModelEvaluator +from ..accuracy_checker.accuracy_checker.progress_reporters import PrintProgressReporter, TQDMReporter + +from ..network import Network + +from .configuration import Configuration +from .logging import info + + +class BenchmarkCallback: + def __init__(self, configuration: Configuration, network: Network=None, iterations_count:int=1000): + self._latency = None + self._configuration = configuration + self._network = network + self._iterations_count = iterations_count if iterations_count else 1000 + + def output_callback(self, value, latency = None): + pass + + + def benchmark_callback(self, network_inputs_data): + latencies = list() + + if self._network: + ie_network = self._network.ie_network + else: + ie_network = ie.IENetwork(self._configuration.model, self._configuration.weights) + plugin = ie.IEPlugin(self._configuration.device) + if self._configuration.cpu_extension: + plugin.add_cpu_extension(self._configuration.cpu_extension) + exec_network = plugin.load(ie_network) + + # warming up + exec_network.infer(network_inputs_data) + + for i in range(self._iterations_count): + start = datetime.datetime.now() + exec_network.infer(network_inputs_data) + latencies.append((datetime.datetime.now() - start).microseconds) + self._latency = numpy.mean(latencies) / 1000000.0 + + del ie_network + del exec_network + del plugin + + + @property + def latency(self) -> float: + return self._latency + + +class BenchmarkResult: + def __init__(self, latency): + self._latency = latency + + @property + def latency(self) -> float: + return self._latency + + +class InferOptions: + def __init__(self, iterations_count=1000): + self._iterations_count = iterations_count + + @property + def iterations_count(self) -> int: + return self._iterations_count + + +class Benchmark: + def __init__(self, configuration: Configuration): + if configuration is None: + raise ValueError("configuration is None") + + self._configuration = configuration + pass + + def run( + self, + network: Network = None, + statistics=None, + quantization_levels=None, + iterations_count:int = 1000) -> BenchmarkResult: + + model = self._configuration.config['models'][0] + launcher_config = model['launchers'][0] + dataset_config = model['datasets'][0] + + model_evaluator = 
+        try:
+            if network:
+                del model_evaluator.launcher.network
+                del model_evaluator.launcher.exec_network
+                model_evaluator.launcher.network = network.ie_network
+                model_evaluator.launcher.exec_network = model_evaluator.launcher.plugin.load(network.ie_network)
+
+            ie_network = model_evaluator.launcher.network
+
+            if statistics:
+                network_stats = {}
+                for layer_name, node_statistic in statistics.items():
+                    network_stats[layer_name] = ie.LayerStats(
+                        min=tuple(node_statistic.min_outputs),
+                        max=tuple(node_statistic.max_outputs))
+                ie_network.stats.update(network_stats)
+
+            if quantization_levels:
+                for layer_name, value in quantization_levels.items():
+                    params = ie_network.layers[layer_name].params
+                    params["quantization_level"] = value
+                    ie_network.layers[layer_name].params = params
+
+            if model_evaluator.dataset.size != 1:
+                info("only the first image from the dataset annotation is used for the benchmark")
+                model_evaluator.dataset.size = 1
+
+            process_dataset_callback = BenchmarkCallback(
+                configuration=self._configuration,
+                network=network,
+                iterations_count=iterations_count)
+
+            model_evaluator.process_dataset(
+                None,
+                progress_reporter=None,
+                output_callback=process_dataset_callback.output_callback,
+                benchmark=process_dataset_callback.benchmark_callback)
+
+            if len(model_evaluator.launcher.exec_network.requests) != 1:
+                raise ValueError("unexpected network requests count")
+
+            latency = process_dataset_callback.latency
+        finally:
+            model_evaluator.release()
+
+        return BenchmarkResult(latency)
\ No newline at end of file
diff --git a/tools/benchmark/command_line_reader.py b/tools/benchmark/command_line_reader.py
new file mode 100644
index 0000000..4599b28
--- /dev/null
+++ b/tools/benchmark/command_line_reader.py
@@ -0,0 +1,155 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+""" + +import os +import collections +import errno +import pathlib +from functools import partial +from argparse import ArgumentParser +from typing import Union + +from ..accuracy_checker.accuracy_checker.config import ConfigReader +from ..accuracy_checker.accuracy_checker.utils import get_path +from ..network import Network + +from .configuration import Configuration +from .logging import info + + +class CommandLineReader: + """ + Class for parsing input config + """ + @staticmethod + def read(): + args, unknown_args = CommandLineReader.__build_arguments_parser().parse_known_args() + if unknown_args: + info("unknown command line arguments: {0}".format(unknown_args)) + + args.target_framework = "dlsdk" + args.aocl = None + + merged_config = ConfigReader.merge(args) + launcher = merged_config['models'][0]['launchers'][0] + + batch_size = args.batch_size if args.batch_size else (launcher['batch'] if 'batch' in launcher else None) + if not batch_size: + with Network(str(launcher['model']), str(launcher['weights'])) as network: + batch_size = network.ie_network.batch_size + + return Configuration( + config = merged_config, + model = str(launcher['model']), + weights = str(launcher['weights']), + cpu_extension = (str(launcher['cpu_extensions']) if 'cpu_extensions' in launcher else None), + gpu_extension = (str(launcher['gpu_extensions']) if 'gpu_extensions' in launcher else None), + device = launcher['device'], + benchmark_iterations_count = args.benchmark_iterations_count) + + @staticmethod + def __build_arguments_parser(): + parser = ArgumentParser(description='openvino.tools.benchmark') + + parser.add_argument( + '-d', '--definitions', + help='Optional. Path to the YML file with definitions', + type=str, + required=False) + + parser.add_argument( + '-c', + '--config', + help='Required. Path to the YML file with local configuration', + type=get_path, + required=True) + + parser.add_argument( + '-m', '--models', + help='Optional. Prefix path to the models and weights', + type=partial(get_path, is_directory=True), + default=pathlib.Path.cwd(), + required=False) + + parser.add_argument( + '-s', '--source', + help='Optional. prefix path to the data source', + type=partial(get_path, is_directory=True), + default=pathlib.Path.cwd(), + required=False) + + parser.add_argument( + '-a', '--annotations', + help='Optional. prefix path to the converted annotations and datasets meta data', + type=partial(get_path, is_directory=True), + default=pathlib.Path.cwd(), + required=False) + + parser.add_argument( + '-e', '--extensions', + help='Optional. Prefix path to extensions folder', + type=partial(get_path, is_directory=True), + default=pathlib.Path.cwd(), + required=False) + + parser.add_argument( + '--cpu_extensions_mode', + help='Optional. specified preferable set of processor instruction for automatic searching cpu extension lib', + required=False, + choices=['avx2', 'sse4']) + + parser.add_argument( + '-b', '--bitstreams', + help='Optional. prefix path to bitstreams folder', + type=partial(get_path, is_directory=True), + default=pathlib.Path.cwd(), + required=False) + + parser.add_argument( + '-C', '--converted_models', '--converted-models', + help='Optional. directory to store Model Optimizer converted models. Used for DLSDK launcher only', + type=partial(get_path, is_directory=True), + default=pathlib.Path.cwd(), + required=False) + + parser.add_argument( + '-td', '--target_devices', '--target-devices', + help='Optional. 
+            help='Optional. Space-separated list of devices for inference',
+            required=False,
+            nargs='+',
+            default=["CPU"])
+
+        parser.add_argument(
+            '-tt', '--target_tags', '--target-tags',
+            help='Optional. Space-separated list of launcher tags for inference',
+            required=False,
+            nargs='+')
+
+        parser.add_argument(
+            '--batch-size',
+            help='Optional. Batch size value. If not specified, the batch size value is determined from IR',
+            type=int,
+            required=False)
+
+        parser.add_argument(
+            '-ic',
+            '--benchmark_iterations_count',
+            help='Optional. Benchmark iterations count. (1000 is default)',
+            # the count is used with range(), so it has to be an integer
+            type=int,
+            required=False,
+            default=1000)
+
+        return parser
\ No newline at end of file
diff --git a/tools/benchmark/configuration.py b/tools/benchmark/configuration.py
new file mode 100644
index 0000000..af3d6dc
--- /dev/null
+++ b/tools/benchmark/configuration.py
@@ -0,0 +1,64 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+class Configuration:
+    def __init__(
+            self,
+            config: dict,
+            model: str,
+            weights: str,
+            device: str,
+            cpu_extension: str,
+            gpu_extension: str,
+            benchmark_iterations_count: int
+    ):
+        self._config = config
+        self._model = model
+        self._weights = weights
+        self._device = device
+        self._cpu_extension = cpu_extension
+        self._gpu_extension = gpu_extension
+        self._benchmark_iterations_count = benchmark_iterations_count
+
+    @property
+    def config(self) -> dict:
+        return self._config
+
+    @property
+    def model(self) -> str:
+        return self._model
+
+    @property
+    def weights(self) -> str:
+        return self._weights
+
+    @property
+    def device(self) -> str:
+        return self._device
+
+    @property
+    def cpu_extension(self) -> str:
+        return self._cpu_extension
+
+    @property
+    def gpu_extension(self) -> str:
+        return self._gpu_extension
+
+    @property
+    def benchmark_iterations_count(self):
+        return self._benchmark_iterations_count
\ No newline at end of file
diff --git a/tools/benchmark/logging.py b/tools/benchmark/logging.py
new file mode 100644
index 0000000..f3fec90
--- /dev/null
+++ b/tools/benchmark/logging.py
@@ -0,0 +1,125 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+""" + +import logging +import logging.config +import sys +import warnings + +# TODO: move to utils +_DEFAULT_LOGGER_NAME = 'openvino.tools.benchmark' +_DEFAULT_LOG_FILE = 'openvino.tools.benchmark.log' + +PRINT_INFO = logging.INFO + 5 +logging.addLevelName(PRINT_INFO, "PRINT_INFO") + +_LOG_LEVEL_ENVIRON = "CALIBRATION_TOOL_LOG_LEVEL" +# _LOGGING_LEVEL = logging.getLevelName(os.environ.get(_LOG_LEVEL_ENVIRON, PRINT_INFO)) +# TODO: refactoring: remove, use original line +_LOGGING_LEVEL = "DEBUG" + + +class LoggingFormatter(logging.Formatter): + def format(self, record: logging.LogRecord): + if record.levelno == PRINT_INFO: + return record.msg + return super().format(record) + + +class ConsoleHandler(logging.StreamHandler): + def __init__(self, default_stream=sys.stdout): + super().__init__(default_stream) + self.default_stream = default_stream + self.err_stream = sys.stderr + + def emit(self, record): + if record.levelno >= logging.WARNING: + self.stream = self.err_stream + else: + self.stream = self.default_stream + super().emit(record) + + +_LOGGING_CONFIGURATION = { + 'version': 1, + 'disable_existing_loggers': False, + 'formatters': { + 'default': { + '()': LoggingFormatter, + 'format': '%(asctime)s %(name)s %(levelname)s: %(message)s', + 'datefmt': '%H:%M:%S' + }, + 'detailed': { + 'format': '%(asctime)s %(name)s %(levelname)s: %(message)s' + } + }, + 'handlers': { + 'console': { + 'level': 'DEBUG', + '()': ConsoleHandler, + 'formatter': 'default', + } + }, + + 'loggers': { + _DEFAULT_LOGGER_NAME: { + 'handlers': ['console'], + 'level': _LOGGING_LEVEL, + 'propagate': False + } + } +} + +logging.config.dictConfig(_LOGGING_CONFIGURATION) + +_default_logger = logging.getLogger(_DEFAULT_LOGGER_NAME) + + +def _warning_handler(message, category, filename, lineno): + s = warnings.formatwarning(message, category, filename, lineno) + _default_logger.warning(s) + + +warnings.showwarning = _warning_handler + + +def get_logger(logger_name: str): + if logger_name.startswith(_DEFAULT_LOGGER_NAME): + return _default_logger.getChild(logger_name) + return logging.getLogger(logger_name) + + +def error(msg, *args, **kwargs): + _default_logger.error(msg, *args, **kwargs) + + +def warning(msg, *args, raise_warning=True, **kwargs): + if raise_warning: + warnings.warn(msg) + else: + _default_logger.warning(msg, *args, **kwargs) + + +def info(msg, *args, **kwargs): + _default_logger.info(msg, *args, **kwargs) + + +def debug(msg, *args, **kwargs): + _default_logger.debug(msg, *args, **kwargs) + + +def print_info(msg, *args, **kwargs): + _default_logger.log(PRINT_INFO, msg, *args, **kwargs) diff --git a/tools/benchmark/requirements.txt b/tools/benchmark/requirements.txt new file mode 100644 index 0000000..5e3e8ee --- /dev/null +++ b/tools/benchmark/requirements.txt @@ -0,0 +1,8 @@ +py-cpuinfo +numpy +progress +pyyaml +opencv-python +shapely +sklearn +xmltodict diff --git a/tools/calibration/README.md b/tools/calibration/README.md new file mode 100644 index 0000000..fc55ad8 --- /dev/null +++ b/tools/calibration/README.md @@ -0,0 +1,33 @@ +# OpenVINO™ Calibration Python* package +The Inference Engine `openvino.tools.calibration` Python\* package includes types to calibrate a given FP32 model so that you can run it in low-precision 8-bit integer mode while keeping the input data of this model in the original precision. +The package has the following dependencies: +* `openvino.tools.accuracy_checker` package +* `openvino.tools.benchmark` package. + +Please, refer to https://docs.openvinotoolkit.org for details. 
+
+## Usage
+You can use the `openvino.tools.calibration` package in a simple way:
+```Python
+import openvino.tools.calibration as calibration
+
+with calibration.CommandLineProcessor.process() as config:
+    network = calibration.Calibrator(config).run()
+    if network:
+        network.serialize(config.output_model)
+```
+### Explanation
+1. Import `openvino.tools.calibration` types:
+```Python
+import openvino.tools.calibration as calibration
+```
+
+2. Read the configuration and calibrate the model:
+```Python
+with calibration.CommandLineProcessor.process() as config:
+    network = calibration.Calibrator(config).run()
+```
+
+3. Serialize the resulting model:
+```Python
+if network:
+    network.serialize(config.output_model)
+```
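+
+The package types can also be combined to measure accuracy without calibrating; a minimal sketch that mirrors the `check_accuracy` entry point in `__main__.py`:
+```Python
+import openvino.tools.calibration as calibration
+
+config = calibration.CommandLineReader.read()
+calibrator = calibration.CalibratorFactory.create(config.precision, calibration.CalibratorConfiguration(config))
+result = calibrator.infer()
+print("Accuracy: {0:.4f}%".format(100.0 * result.metrics.accuracy))
+```
\ No newline at end of file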
+""" + +from argparse import ArgumentParser + +import openvino.tools.calibration as calibration +import openvino.tools.utils as utils + + +def calibrate(): + config = calibration.CommandLineReader.read() + network = calibration.Calibrator(config).run() + network.serialize(config.output_model) + + +def check_accuracy(): + config = calibration.CommandLineReader.read() + calibrator = calibration.CalibratorFactory.create(config.precision, calibration.CalibratorConfiguration(config)) + + print("Collecting accuracy for {}...".format(config.model)) + result = calibrator.infer() + print("Accuracy: {0:.4f}%".format(100.0 * result.metrics.accuracy)) + + +def collect_statistics(): + import os + config = calibration.CommandLineReader.read() + calibrator = calibration.CalibratorFactory.create(config.precision, calibration.CalibratorConfiguration(config)) + + print("Collecting FP32 statistics for {}...".format(config.model)) + fp32_result = calibrator.infer(add_outputs=True, collect_aggregated_statistics=True) + print("FP32 accuracy: {0:.4f}%".format(100.0 * fp32_result.metrics.accuracy)) + + output_model_file_path = \ + os.path.splitext(config.model)[0] + ("_{}_statistics_without_ignored.xml".format(config.precision.lower()) if + config.ignore_layer_names else + "_{}_statistics.xml".format(config.precision.lower())) + output_weights_file_path = utils.Path.get_weights(output_model_file_path) + + quantization_levels = \ + calibrator.get_quantization_levels(calibration.CalibrationConfigurationHelper.read_ignore_layer_names(config)) + statistics = fp32_result.aggregated_statistics.get_node_statistics() + calibrator.save(output_model_file_path, output_weights_file_path, quantization_levels, statistics) + print("Network with statistics was written to {}.(xml|bin) IR file".format(os.path.splitext(output_model_file_path)[0])) + + +def __build_arguments_parser(): + parser = ArgumentParser(description='Calibration Tool') + parser.add_argument( + 'action', + help='Optional, possible values: calibrate, collect_statistics or check_accuracy', + nargs='?', + choices=('calibrate', 'collect_statistics', 'check_accuracy')) + return parser + + +if __name__ == '__main__': + parser, unknown_args = __build_arguments_parser().parse_known_args() + if parser.action == 'calibrate': + calibrate() + elif parser.action == 'collect_statistics': + collect_statistics() + elif parser.action == 'check_accuracy': + check_accuracy() + else: + calibrate() diff --git a/tools/calibration/aggregated_statistics.py b/tools/calibration/aggregated_statistics.py new file mode 100644 index 0000000..52072c3 --- /dev/null +++ b/tools/calibration/aggregated_statistics.py @@ -0,0 +1,170 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import json +import numpy +import openvino.inference_engine as ie +from .network_node_stats import NetworkNodeStats +from .shape import Shape + + +class AggregatedStatistics: + INDEX_MIN = 0 + INDEX_MAX = 1 + + def __init__(self, result=None, ignore_layer_names: set=None, iterations_count: int = 1, dataset_size: int = 1): + self._ignore_layer_names = ignore_layer_names + self._registered_layers = None + self._iterations_count = iterations_count + self._dataset_size = dataset_size + self._itteration = 0 + + if result: + for inference_result in result.result: + self.add(network = result.network, exec_network = result.exec_network, inference_result = inference_result) + + def release(self): + if self._registered_layers: + del self._registered_layers + self._registered_layers = None + + def add( + self, + network: ie.IENetwork, + exec_network: ie.ExecutableNetwork, + inference_result + ): + ''' + Add inference result to aggregated statistics instance + ''' + layer_names = network.layers.keys() + + if not self._registered_layers: + self._registered_layers = dict() + initialized = False + else: + initialized = True + + # TODO: can be refactored: we are itterating by all layers (to cover input layers output) to collect statistics + # for inference_result in inference_results: + for out_layer_name in layer_names: + if self._ignore_layer_names and out_layer_name in self._ignore_layer_names: + continue + + if out_layer_name in network.inputs: + output_blob = exec_network.requests[0].inputs[out_layer_name] + shape = Shape.create(network.inputs[out_layer_name].layout, output_blob.shape) + else: + # TODO: can be refactored: we are itterating by all layers (to cover input layers output) to collect statistics + if out_layer_name not in inference_result: + continue + output_blob = inference_result[out_layer_name] + shape = Shape.create(network.outputs[out_layer_name].layout, output_blob.shape) + + if not initialized: + # for const layers N is not equal batch size + # self._registered_layers[out_layer_name] = numpy.empty((shape.c, self._dataset_size, 2)) + self._registered_layers[out_layer_name] = numpy.empty((shape.c, shape.n * self._iterations_count, 2)) + + if shape.layout[0] != 'C' and not (len(shape.layout) >= 2 and shape.layout[0] == 'N' and shape.layout[1] == 'C'): + raise ValueError("unsupported layout '{}'".format(shape.layout)) + + if shape.layout[0] != 'N': + output_blob = [output_blob] + + for sample in range(0, shape.n): + for channel in range(0, shape.c): + self.add_tensor_statistics(out_layer_name, output_blob, shape.n, sample, channel, self._itteration) + + self._itteration += 1 + + def register_layer(self, layer_name: str): + if layer_name in self._registered_layers: + raise ValueError("layer '{}' has been added already".format(layer_name)) + + self._registered_layers[layer_name] = None + + @property + def registered_layers(self): + return self._registered_layers + + def add_tensor_statistics(self, layer_name: str, data, n: int, sample: int, channel: int, itteration: int): + channels = self._registered_layers[layer_name] + + n_index = sample + n * itteration + if n_index >= channels.shape[1]: + channels.resize((channels.shape[0], channels.shape[1] + 1, channels.shape[2]), refcheck=False) + + channels.itemset((channel, n_index, self.INDEX_MIN), data[sample][channel].min()) + channels.itemset((channel, n_index, self.INDEX_MAX), data[sample][channel].max()) + + def get_number_channels(self, layer_name: str): + if layer_name in self._registered_layers: + return 
+    def get_number_channels(self, layer_name: str):
+        if layer_name in self._registered_layers:
+            return len(self._registered_layers[layer_name])
+        return 0
+
+    def get_data_min_max(self, layer_name: str, channel: int, threshold: float = None):
+        # take data by name
+        if layer_name in self._registered_layers:
+            layer = self._registered_layers[layer_name]
+            stats = layer[channel]
+
+            # having absolute min/max values, we can create new statistics
+            max_values = list()
+            min_values = list()
+            for tensor_statistic in stats:
+                max_values.append(tensor_statistic.item(self.INDEX_MAX))
+                min_values.append(tensor_statistic.item(self.INDEX_MIN))
+
+            # threshold is the percentage of samples to keep: the most extreme
+            # values on both ends of the sorted lists are thrown out
+            elements_to_take = int(len(max_values) * threshold / 100) if threshold else len(max_values)
+            elements_to_throw = len(max_values) - elements_to_take if threshold else 0
+
+            max_values.sort()
+            min_values.sort()
+
+            min_value = min_values[elements_to_throw]
+            max_value = max_values[elements_to_take - 1]
+        else:
+            min_value = max_value = 0.0
+
+        return min_value, max_value
+
+    def serialize(self, json_file_path: str):
+        with open(json_file_path, 'w') as out_file:
+            # numpy arrays are not JSON serializable, so convert them to plain lists first
+            json.dump({layer_name: stats.tolist() for layer_name, stats in self._registered_layers.items()}, out_file)
+
+    def get_node_statistics(self, threshold=None):
+        net_nodes_stats = dict()
+        # go over all outputs and get aggregated statistics
+        for layer_name in self.registered_layers:
+            channels_count = self.get_number_channels(layer_name)
+
+            if layer_name not in net_nodes_stats:
+                node_stats = NetworkNodeStats(channels_count)
+                net_nodes_stats[layer_name] = node_stats
+            else:
+                node_stats = net_nodes_stats[layer_name]
+
+            for channel in range(channels_count):
+                node_stats.min_outputs[channel], node_stats.max_outputs[channel] = \
+                    self.get_data_min_max(layer_name, channel, threshold)
+
+        return net_nodes_stats
+
+    def pop(self, ignore_layer_names: set):
+        for ignore_layer_name in ignore_layer_names:
+            self._registered_layers.pop(ignore_layer_name)
\ No newline at end of file
diff --git a/tools/calibration/base_calibrator.py b/tools/calibration/base_calibrator.py
new file mode 100644
index 0000000..6a54fc4
--- /dev/null
+++ b/tools/calibration/base_calibrator.py
@@ -0,0 +1,556 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+""" + +from abc import abstractmethod +import numpy as np +import os +import tempfile +from typing import Dict + +import openvino.inference_engine as ie + +from ..accuracy_checker.accuracy_checker.progress_reporters import TQDMReporter, ProgressReporter +from ..accuracy_checker.accuracy_checker.config import ConfigReader +from ..accuracy_checker.accuracy_checker.model_evaluator import ModelEvaluator + +from ..utils.network_info import NetworkInfo +from ..utils.building.network_builder import NetworkBuilder +from ..utils.building.layer import Layer + +from .logging import info, debug +from .calibrator_configuration import CalibratorConfiguration +from .aggregated_statistics import AggregatedStatistics +from .nrmsd import compare_nrmsd +from .single_layer_network import SingleLayerNetwork +from .inference_result import InferenceResult +from .calibration_metrics import CalibrationMetrics +from .infer_raw_results import InferRawResults + + +class MetricsCallback: + def __init__(self): + self._values = list() + self._latencies = list() + + def callback(self, value, latency = None): + self._values.append(value) + self._latencies.append(latency) + + @property + def values(self): + return self._values + + @property + def latencies(self): + return self._latencies + + +class DatasetCallback: + def __init__( + self, + network: ie.IENetwork, + exec_network: ie.ExecutableNetwork, + collect_resuls: bool = True, + collect_layers: set = None, + collect_aggregated_statistics: bool = True, + iterations_count: int = 1, + dataset_size: int = 1 + ): + + self._network = network + self._exec_network = exec_network + self._aggregated_statistics = None + self._iterations_count = iterations_count + self._dataset_size = dataset_size + self._collect_results = collect_resuls + self._collect_layers = collect_layers + self._collect_aggregated_statistics = collect_aggregated_statistics + self._infer_raw_results = InferRawResults() if collect_resuls else None + self._latencies = list() + + def callback(self, value, latency = None): + if self._collect_aggregated_statistics: + if not self._aggregated_statistics: + self._aggregated_statistics = AggregatedStatistics( + iterations_count = self._iterations_count, + dataset_size = self._dataset_size) + self._aggregated_statistics.add(self._network, self._exec_network, value) + + if self._collect_results: + if self._collect_layers: + collect_value = dict() + for layer_name in value: + if layer_name in self._collect_layers: + collect_value[layer_name] = value[layer_name] + self._infer_raw_results.add(collect_value) + else: + self._infer_raw_results.add(value) + + if latency: + self._latencies.append(latency) + + @property + def aggregated_statistics(self) -> AggregatedStatistics: + return self._aggregated_statistics + + @property + def infer_raw_result(self) -> InferRawResults: + return self._infer_raw_results + + @property + def latencies(self) -> list: + return self._latencies + + def release(self): + if self._aggregated_statistics: + self._aggregated_statistics.release() + if self._infer_raw_results: + self._infer_raw_results.release() + + +class BaseCalibrator: + ''' + Base type for all calibrators + ''' + + def __init__(self, configuration: CalibratorConfiguration): + self._configuration = configuration + + network = self.create_network() + self._input_layer_name = next(iter(network.inputs)) + self._output_layer_name = next(iter(network.outputs)) + + self.plugin = ie.IEPlugin(self._configuration.device) + if self._configuration.cpu_extension and self._configuration.device == 
+        if self._configuration.cpu_extension and self._configuration.device == 'CPU':
+            self.plugin.add_cpu_extension(self._configuration.cpu_extension)
+        if self._configuration.gpu_extension and self._configuration.device == 'GPU':
+            self.plugin.set_config('CONFIG_FILE', self._configuration.gpu_extension)
+
+    def will_be_fused_workaround(self, layer: ie.IENetLayer, network_info: NetworkInfo = None):
+        if layer.type == "Const" or layer.type == "Tile":
+            if not network_info:
+                network_info = NetworkInfo(self._configuration.model)
+            only_expected = network_info.explore_inputs(network_info.get_layer(layer.name), ['Const', 'Tile'])
+            return only_expected, network_info
+        return False, network_info
+
+    def add_outputs(self, network: ie.IENetwork, output_layers: list = None) -> ie.IENetwork:
+        if output_layers is None:
+            output_layers = network.layers.values()
+
+        network_info = None
+        for layer in output_layers:
+            fused, network_info = self.will_be_fused_workaround(layer, network_info)
+            if not fused:
+                network.add_outputs([layer.name])
+        return network
+
+    def create_network(self) -> ie.IENetwork:
+        network = ie.IENetwork(self._configuration.model, self._configuration.weights)
+        if len(network.outputs) == 0:
+            raise ValueError("no outputs")
+        if len(network.inputs) == 0:
+            raise ValueError("no inputs")
+        return network
+
+    def create_network_for_layer(
+        self,
+        weights: str,
+        quantization_layer: ie.IENetLayer,
+        quantization_layer_info: Layer,
+        activation_layer: ie.IENetLayer
+    ):
+        if self.is_quantization_supported(quantization_layer.type):
+            input_layer_info = quantization_layer_info.inputs[0].layer
+
+            layers = [
+                Layer(
+                    0,
+                    "Input",
+                    input_layer_info.name,
+                    {},
+                    [],
+                    input_layer_info.outputs[0].port.dim),
+
+                Layer(
+                    1,
+                    quantization_layer.type,
+                    quantization_layer.name,
+                    quantization_layer.params,
+                    quantization_layer_info.inputs[0].port.dim,
+                    quantization_layer_info.outputs[0].port.dim,
+                    quantization_layer_info.weights,
+                    quantization_layer_info.biases)
+            ]
+
+            if activation_layer:
+                activation_layer_info = quantization_layer_info.outputs[0].layer
+                reference_output_layer_name = activation_layer_info.name
+                outputs = activation_layer_info.outputs
+                output_layer_outputs_dim = \
+                    outputs[0].port.dim if outputs else activation_layer_info.inputs[0].port.dim
+
+                layers.append(Layer(
+                    len(layers),
+                    activation_layer.type,
+                    activation_layer.name,
+                    activation_layer.params,
+                    activation_layer_info.inputs[0].port.dim,
+                    output_layer_outputs_dim))
+            else:
+                reference_output_layer_name = quantization_layer_info.name
+                output_layer_outputs_dim = quantization_layer_info.outputs[0].port.dim
+
+            layers.append(Layer(
+                len(layers),
+                "Power",
+                quantization_layer.name + "_",
+                {'power': 1.0, 'scale': 1.0, 'shift': 0.0},
+                output_layer_outputs_dim,
+                output_layer_outputs_dim))
+
+            builder = NetworkBuilder().sequential(layers)
+        else:
+            raise ValueError("unsupported layer type '{}'".format(quantization_layer.type))
+
+        # serialize the built IR to a temporary file; the weights are reused from the original model
+        temporary_file = tempfile.NamedTemporaryFile(delete=False)
+        try:
+            builder_str = str(builder)
+            network_content = str.encode(builder_str)
+            temporary_file.write(network_content)
+            temporary_file.close()
+
+            network_for_layer_model = temporary_file.name
+            network_for_layer_weights = weights
+            network_for_layer = ie.IENetwork(network_for_layer_model, network_for_layer_weights)
+            network_for_layer.add_outputs([quantization_layer.name + "_"])
+        finally:
+            if os.path.exists(temporary_file.name):
+                temporary_file.close()
+                os.remove(temporary_file.name)
+
+        return network_for_layer, reference_output_layer_name
+
+    def
save(self, model_file_path: str, weights_file_path: str, quantization_level: dict, statistics):
+        '''
+        Save calibration results.
+        '''
+        if not statistics:
+            raise ValueError("statistics is empty")
+
+        network = self.create_network()
+
+        network_stats = {}
+        for layer_name, node_statistic in statistics.items():
+            network_stats[layer_name] = ie.LayerStats(min=tuple(node_statistic.min_outputs),
+                                                      max=tuple(node_statistic.max_outputs))
+        network.stats.update(network_stats)
+
+        for layer in network.layers.values():
+            if self.is_quantization_supported(layer.type) and layer.name in quantization_level:
+                params = layer.params
+                params["quantization_level"] = quantization_level[layer.name]
+                layer.params = params
+
+        network.serialize(model_file_path, weights_file_path)
+
+    @staticmethod
+    def __parse_inputs(inputs_entry):
+        inputs = {}
+        for input_ in inputs_entry:
+            value = input_['value']
+            if isinstance(value, list):
+                value = np.array(value)
+
+            inputs[input_['name']] = value
+
+        return inputs
+
+    @staticmethod
+    def compare_result(result1, result2, output_name: str):
+        if len(result1) != len(result2):
+            return False
+
+        for index in range(len(result1)):
+            result_map1 = result1[index]
+            result_map2 = result2[index]
+
+            compare_result = result_map1[output_name] == result_map2[output_name]
+            if not compare_result.all():
+                debug('\nresult_map1={}\n'.format(result_map1[output_name]))
+                debug('\nresult_map2={}\n'.format(result_map2[output_name]))
+                return False
+        return True
+
+    def get_affected_layers(self, output_layers: list = None):
+        '''
+        CVS-14299: Linux only: IENetwork.add_outputs (Python API) [and ICNNNetwork::addOutputs (C++ API)]
+        for some layers affects network inference result
+        '''
+        affected_layers = []
+        not_affected_layers = []
+
+        layers = self.create_network().layers.values()
+        info("total layers: {}".format(len(layers)))
+
+        network = self.create_network()
+        ref_results = self._infer(network=network)
+        info("ORIGINAL: original accuracy (no additional output layers): {}".format(ref_results.metrics.accuracy))
+
+        index = 1
+        for layer in layers:
+            if layer.type == 'Input':
+                info("SKIPPED ({}/{}): layer {}/{}".format(index, len(layers), layer.name, layer.type))
+            else:
+                network = self.create_network()
+
+                tmp = not_affected_layers.copy()
+                tmp.append(layer)
+
+                self.add_outputs(network, tmp)
+                results = self._infer(network=network)
+                # if results.metrics.accuracy == 0.0:
+                if not BaseCalibrator.compare_result(ref_results.result, results.result, self._output_layer_name):
+                    affected_layers.append(layer)
+                    info("FAILED ({}/{}): output layer {}/{} affects result, accuracy: {}".format(
+                        index,
+                        len(layers),
+                        layer.name,
+                        layer.type,
+                        results.metrics.accuracy))
+                else:
+                    not_affected_layers.append(layer)
+                    info("PASSED ({}/{}): output layer {}/{}, accuracy: {}".format(
+                        index,
+                        len(layers),
+                        layer.name,
+                        layer.type,
+                        results.metrics.accuracy))
+            index += 1
+
+        return affected_layers
+
+    # TODO: add_outputs - remove, not necessary
+    def infer(self,
+              add_outputs=False,
+              statistics=None,
+              quantization_level: dict = None,
+              collect_resuls: bool = False,
+              collect_layers: set = None,
+              collect_aggregated_statistics: bool = False,
+              network: ie.IENetwork = None,
+              collect_performance_counters: bool = False) -> InferenceResult:
+
+        if network is None:
+            network = self.create_network()
+
+        if add_outputs:
+            self.add_outputs(network)
+
+        if quantization_level:
+            for layer_name, value in quantization_level.items():
+                params = network.layers[layer_name].params
+
+    # TODO: add_outputs - remove, not necessary
+    def infer(self,
+              add_outputs=False,
+              statistics=None,
+              quantization_level: dict = None,
+              collect_resuls: bool = False,
+              collect_layers: set = None,
+              collect_aggregated_statistics: bool = False,
+              network: ie.IENetwork = None,
+              collect_performance_counters: bool = False) -> InferenceResult:
+
+        if network is None:
+            network = self.create_network()
+
+        if add_outputs:
+            self.add_outputs(network)
+
+        if quantization_level:
+            for layer_name, value in quantization_level.items():
+                params = network.layers[layer_name].params
+                params["quantization_level"] = value
+                network.layers[layer_name].params = params
+
+        return self._infer(
+            network=network,
+            statistics=statistics,
+            collect_resuls=collect_resuls,
+            collect_layers=collect_layers,
+            collect_aggregated_statistics=collect_aggregated_statistics,
+            collect_performance_counters=collect_performance_counters)
+
+    def infer_single_layer_network(self,
+                                   single_layer_network: SingleLayerNetwork,
+                                   full_network_result: InferenceResult):
+        '''
+        Infer the single-layer network natively and compare the results
+        '''
+        if single_layer_network.input_layer_name in full_network_result:
+            input_layer_data = full_network_result[single_layer_network.input_layer_name]
+        else:
+            raise ValueError("single layer network input '{}' was not found in reference inference".format(
+                single_layer_network.input_layer_name))
+
+        single_layer_network_result = \
+            single_layer_network.exec_network.infer({single_layer_network.input_layer_name: input_layer_data})
+        if single_layer_network.output_layer_name not in single_layer_network_result:
+            raise ValueError("single layer network output layer '{}' was not found in single"
+                             " layer inference result".format(single_layer_network.output_layer_name))
+        actual_result_data = single_layer_network_result[single_layer_network.output_layer_name]
+
+        if single_layer_network.reference_output_layer_name not in full_network_result:
+            raise ValueError("single layer network output layer '{}' was not found in "
+                             "full inference result".format(single_layer_network.reference_output_layer_name))
+        expected_result_data = full_network_result[single_layer_network.reference_output_layer_name]
+
+        accuracy_drop = compare_nrmsd(actual_result_data, expected_result_data)
+        return accuracy_drop
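+
+    # compare_nrmsd above presumably follows the usual normalized-RMSD definition:
+    # sqrt(mean((actual - expected) ** 2)) divided by the range of the expected
+    # values, so 0.0 means the single-layer result matches the reference exactly.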
+
+    def _infer(
+            self,
+            network=None,
+            statistics=None,
+            collect_aggregated_statistics: bool = True,
+            collect_resuls: bool = True,
+            collect_layers: set = None,
+            collect_performance_counters: bool = False
+    ) -> InferenceResult:
+        '''
+        Infer using the accuracy checker and compute metrics
+        '''
+        accuracy = 0.0
+
+        model = self._configuration.config['models'][0]
+        launcher_config = model['launchers'][0]
+        dataset_config = model['datasets'][0]
+
+        process_dataset_callback = None
+        model_evaluator = ModelEvaluator.from_configs(launcher_config, dataset_config)
+        try:
+            if network:
+                del model_evaluator.launcher.network
+                del model_evaluator.launcher.exec_network
+                model_evaluator.launcher.network = network
+                model_evaluator.launcher.exec_network = model_evaluator.launcher.plugin.load(network)
+
+            if collect_performance_counters:
+                model_evaluator.launcher.plugin.set_config({'PERF_COUNT': 'YES'})
+
+            if statistics:
+                network_stats = {}
+                for layer_name, node_statistic in statistics.items():
+                    network_stats[layer_name] = ie.LayerStats(min=tuple(node_statistic.min_outputs),
+                                                              max=tuple(node_statistic.max_outputs))
+                model_evaluator.launcher.network.stats.update(network_stats)
+
+            dataset_size = model_evaluator.dataset.size
+
+            if self._configuration.progress:
+                progress_reporter = ProgressReporter.provide((
+                    self._configuration.progress if ':' not in self._configuration.progress
+                    else self._configuration.progress.split(':')[0]
+                ))
+                progress_reporter.reset(len(model_evaluator.dataset))
+            else:
+                progress_reporter = None
+
+            process_dataset_callback = DatasetCallback(
+                model_evaluator.launcher.network,
+                model_evaluator.launcher.exec_network,
+                collect_resuls=collect_resuls,
+                collect_layers=collect_layers,
+                collect_aggregated_statistics=collect_aggregated_statistics,
+                iterations_count=int(dataset_size / self._configuration.batch_size),
+                dataset_size=dataset_size)
+
+            model_evaluator.process_dataset(None,
+                                            progress_reporter=progress_reporter,
+                                            output_callback=process_dataset_callback.callback)
+            if len(model_evaluator.launcher.exec_network.requests) != 1:
+                raise ValueError("unexpected network requests count")
+
+            inference_result = process_dataset_callback.infer_raw_result
+            inference_latencies = process_dataset_callback.latencies
+
+            performance_counters = \
+                model_evaluator.launcher.exec_network.requests[0].get_perf_counts() if collect_performance_counters else None
+
+            model_evaluator_callback = MetricsCallback()
+            model_evaluator.compute_metrics(output_callback=model_evaluator_callback.callback)
+            presenter_values = model_evaluator_callback.values
+            for presenter_value in presenter_values:
+                value, reference, name, threshold, meta = presenter_value
+                accuracy = np.mean(value)
+        except Exception:
+            if process_dataset_callback:
+                process_dataset_callback.release()
+            raise
+        finally:
+            model_evaluator.release()
+
+        return InferenceResult(
+            inference_result,
+            CalibrationMetrics(accuracy, np.mean(inference_latencies)) if len(inference_latencies) else CalibrationMetrics(accuracy),
+            process_dataset_callback.aggregated_statistics,
+            performance_counters)
+
+    def get_quantization_levels(self, ignore_layer_names=None) -> Dict[str, str]:
+        network = self.create_network()
+        quantization_levels = dict()
+
+        for layer in network.layers.values():
+            if self.is_quantization_supported(layer.type):
+                if ignore_layer_names and (layer.name in ignore_layer_names):
+                    quantization_levels[layer.name] = "FP32"
+                else:
+                    quantization_levels[layer.name] = "I8" if self.precision == "INT8" else self.precision
+
+        return quantization_levels
+
+    @property
+    def precision(self) -> str:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def is_quantization_supported(self, layer_type: str) -> bool:
+        raise NotImplementedError()
+
+    def is_activation_supported(self, layer_type: str) -> bool:
+        return layer_type.lower() in ('relu', 'activation', 'clamp')
+
+    def is_quantization_fusing_supported(self, parent_layer, child_layer):
+        if parent_layer.outputs[0].layer.id != child_layer.id:
+            # fusing is not supported for this pair, ignore it
+            return False
+
+        return self.is_quantization_supported(parent_layer.type) and \
+            len(parent_layer.outputs) == 1 and \
+            len(parent_layer.outputs[0].layer.inputs) == 1 and \
+            self.is_activation_supported(child_layer.type)
+
+    def get_quantization_layers(self) -> set:
+        collect_layers = set()
+
+        network_info = NetworkInfo(self._configuration.model)
+        previous_previous_layer = None
+        previous_layer = None
+        layer_index = 0
+        for layer in network_info.layers.values():
+            if previous_previous_layer:
+                if previous_layer and self.is_quantization_supported(previous_layer.type):
+                    if self.is_quantization_fusing_supported(previous_layer, layer):
+                        collect_layers.add(layer.name)
+                    else:
+                        collect_layers.add(previous_layer.name)
+                    collect_layers.add(previous_previous_layer.name)
+
+                if self.is_quantization_supported(layer.type) and layer_index == (len(network_info.layers) - 1):
+                    collect_layers.add(layer.name)
+                    collect_layers.add(previous_layer.name)
+
+            layer_index += 1
+            previous_previous_layer = previous_layer
+            previous_layer = layer
+
+        return collect_layers
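+
+    # get_quantization_layers returns the set of layer names whose raw outputs are
+    # worth caching during the FP32 run: quantizable layers plus their immediate
+    # neighbours, e.g. (illustrative): {'conv1', 'conv1_relu', 'pool1'}.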
diff --git a/tools/calibration/calibration_configuration.py b/tools/calibration/calibration_configuration.py
new file mode 100644
index 0000000..5f86202
--- /dev/null
+++ b/tools/calibration/calibration_configuration.py
@@ -0,0 +1,150 @@
+import shutil
+from ..utils.network_info import NetworkInfo
+
+
+class CalibrationConfiguration:
+    """
+    Class storing the calibration tool configuration
+    """
+    def __init__(
+            self,
+            config: dict,
+            precision: str,
+            model: str,
+            weights: str,
+            tmp_directory: str,
+            output_model: str,
+            output_weights: str,
+            cpu_extension: str,
+            gpu_extension: str,
+            device: str,
+            batch_size: int,
+            threshold: float,
+            ignore_layer_types: list,
+            ignore_layer_types_path: str,
+            ignore_layer_names: list,
+            ignore_layer_names_path: str,
+            benchmark_iterations_count: int,
+            progress: str):
+
+        self._config = config
+        self._precision = precision.upper()
+        self._model = model
+        self._weights = weights
+        self._tmp_directory = tmp_directory
+        self._output_model = output_model
+        self._output_weights = output_weights
+        self._cpu_extension = cpu_extension
+        self._gpu_extension = gpu_extension
+        self._device = device
+        self._batch_size = batch_size
+        self._threshold = threshold
+        self._ignore_layer_types = ignore_layer_types
+        self._ignore_layer_types_path = ignore_layer_types_path
+        self._ignore_layer_names = ignore_layer_names
+        self._ignore_layer_names_path = ignore_layer_names_path
+        self._benchmark_iterations_count = benchmark_iterations_count
+        self._progress = progress
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self.release()
+
+    def release(self):
+        if self.tmp_directory:
+            shutil.rmtree(self.tmp_directory)
+            self._tmp_directory = None
+
+    @property
+    def config(self) -> dict:
+        return self._config
+
+    @property
+    def precision(self) -> str:
+        return self._precision
+
+    @property
+    def model(self) -> str:
+        return self._model
+
+    @property
+    def weights(self) -> str:
+        return self._weights
+
+    @property
+    def tmp_directory(self) -> str:
+        return self._tmp_directory
+
+    @property
+    def output_model(self) -> str:
+        return self._output_model
+
+    @property
+    def output_weights(self) -> str:
+        return self._output_weights
+
+    @property
+    def cpu_extension(self) -> str:
+        return self._cpu_extension
+
+    @property
+    def gpu_extension(self) -> str:
+        return self._gpu_extension
+
+    @property
+    def device(self) -> str:
+        return self._device
+
+    @property
+    def batch_size(self) -> int:
+        return self._batch_size
+
+    @property
+    def threshold(self) -> float:
+        return self._threshold
+
+    @property
+    def ignore_layer_types(self):
+        return self._ignore_layer_types
+
+    @property
+    def ignore_layer_types_path(self) -> str:
+        return self._ignore_layer_types_path
+
+    @property
+    def ignore_layer_names(self):
+        return self._ignore_layer_names
+
+    @property
+    def ignore_layer_names_path(self) -> str:
+        return self._ignore_layer_names_path
+
+    @property
+    def benchmark_iterations_count(self) -> int:
+        return self._benchmark_iterations_count
+
+    @property
+    def progress(self) -> str:
+        return self._progress
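+
+# CalibrationConfiguration is a context manager: leaving a `with` block calls
+# release(), which removes the temporary directory if one was created, e.g.:
+#
+#     with CommandLineProcessor.process() as configuration:
+#         Calibrator(configuration).run()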
+
+
+class CalibrationConfigurationHelper:
+    @staticmethod
+    def read_ignore_layer_names(configuration: CalibrationConfiguration):
+        ignore_layer_types = configuration.ignore_layer_types
+
+        if configuration.ignore_layer_types_path:
+            with open(configuration.ignore_layer_types_path, 'r') as ignore_layer_types_file:
+                ignore_layer_types_from_file = [line.strip() for line in ignore_layer_types_file.readlines()]
+            ignore_layer_types.extend(ignore_layer_types_from_file)
+
+        ignore_layer_names = NetworkInfo(configuration.model).get_layer_names(layer_types=ignore_layer_types)
+
+        if configuration.ignore_layer_names_path:
+            with open(configuration.ignore_layer_names_path, 'r') as ignore_layer_names_file:
+                ignore_layer_names_from_file = [line.strip() for line in ignore_layer_names_file.readlines()]
+            ignore_layer_names.extend(ignore_layer_names_from_file)
+
+        return ignore_layer_names
diff --git a/tools/calibration/calibration_metrics.py b/tools/calibration/calibration_metrics.py
new file mode 100644
index 0000000..c156e0c
--- /dev/null
+++ b/tools/calibration/calibration_metrics.py
@@ -0,0 +1,30 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+class CalibrationMetrics:
+    def __init__(self, accuracy: float, latency: float = None):
+        self._accuracy = accuracy
+        self._latency = latency
+
+    @property
+    def accuracy(self):
+        return self._accuracy
+
+    # TODO: remove: use benchmark instead
+    @property
+    def latency(self):
+        return self._latency
\ No newline at end of file
+""" + +import json +import os +import platform + +from ..utils.network_info import NetworkInfo + +from ..benchmark.benchmark import Benchmark +from ..network import Network + +from .logging import info, debug, info_performance_counters, info_layer_accuracy_drop +from .calibrator_configuration import CalibratorConfiguration +from .calibrator_factory import CalibratorFactory +from .calibration_configuration import CalibrationConfiguration, CalibrationConfigurationHelper +from .layer_accuracy_drop.collector_by_layer import CollectorByLayer + +class Calibrator: + def __init__(self, configuration: CalibrationConfiguration): + if configuration is None: + raise ValueError("configuration is None") + + self._configuration = configuration + + def run(self) -> Network: + calibrator = CalibratorFactory.create( + self._configuration.precision, + CalibratorConfiguration(self._configuration)) + benchmark = Benchmark(self._configuration) + + info("Processor: {}".format(platform.processor())) + + info("Collecting FP32 statistics for {}...".format(self._configuration.model)) + fp32_result = calibrator.infer( + add_outputs=True, + collect_aggregated_statistics=True, + collect_performance_counters=True) + fp32_accuracy = fp32_result.metrics.accuracy + fp32_latency = benchmark.run(iterations_count=self._configuration.benchmark_iterations_count).latency + info("FP32 accuracy: {0:.4f}%, latency: {1:0.4f} ms".format(100.0 * fp32_accuracy, 1000 * fp32_latency)) + + info("FP32 performance counters:\n") + info_performance_counters(fp32_result.performance_counters) + + ignore_layer_names = CalibrationConfigurationHelper.read_ignore_layer_names(self._configuration) + fp32_result.aggregated_statistics.pop(ignore_layer_names=ignore_layer_names) + fp32_aggregated_statistics = fp32_result.aggregated_statistics + fp32_result = None + + info("Verification of network accuracy if all possible layers converted to {}\n".format( + self._configuration.precision)) + + best_lp_accuracy = None + best_lp_latency = 0.0 + best_lp_threshold = 100.0 + best_lp_statistics = None + best_lp_performance_counters = None + + threshold = 100.0 + threshold_low_boundary = 95.0 + threshold_step = .5 + + quantization_levels = calibrator.get_quantization_levels(ignore_layer_names) + + min_accuracy_drop = None + while threshold >= threshold_low_boundary: + info("Validate {} accuracy, threshold for activation statistics: {}%".format( + self._configuration.precision, + threshold)) + + lp_statistics = fp32_aggregated_statistics.get_node_statistics(threshold) + with Network.reload( + model_path=self._configuration.model, + statistics=lp_statistics, + quantization_levels=quantization_levels, + batch_size=self._configuration.batch_size + ) as reloaded_network: + + with calibrator.infer(network=reloaded_network.ie_network, + collect_performance_counters=True) as lp_result: + lp_accuracy = lp_result.metrics.accuracy + lp_performance_counters = lp_result.performance_counters + lp_latency = benchmark.run( + network=reloaded_network, + iterations_count=self._configuration.benchmark_iterations_count).latency + + if best_lp_accuracy is None or lp_accuracy > best_lp_accuracy: + + best_lp_accuracy = lp_accuracy + best_lp_latency = lp_latency + best_lp_threshold = threshold + if best_lp_statistics: + del best_lp_statistics + best_lp_statistics = lp_statistics + best_lp_performance_counters = lp_performance_counters + else: + del lp_statistics + + min_accuracy_drop = fp32_accuracy - lp_accuracy if min_accuracy_drop is None else min( + min_accuracy_drop, + 
+        while threshold >= threshold_low_boundary:
+            info("Validate {} accuracy, threshold for activation statistics: {}%".format(
+                self._configuration.precision,
+                threshold))
+
+            lp_statistics = fp32_aggregated_statistics.get_node_statistics(threshold)
+            with Network.reload(
+                    model_path=self._configuration.model,
+                    statistics=lp_statistics,
+                    quantization_levels=quantization_levels,
+                    batch_size=self._configuration.batch_size
+            ) as reloaded_network:
+
+                with calibrator.infer(network=reloaded_network.ie_network,
+                                      collect_performance_counters=True) as lp_result:
+                    lp_accuracy = lp_result.metrics.accuracy
+                    lp_performance_counters = lp_result.performance_counters
+                    lp_latency = benchmark.run(
+                        network=reloaded_network,
+                        iterations_count=self._configuration.benchmark_iterations_count).latency
+
+            if best_lp_accuracy is None or lp_accuracy > best_lp_accuracy:
+                best_lp_accuracy = lp_accuracy
+                best_lp_latency = lp_latency
+                best_lp_threshold = threshold
+                if best_lp_statistics:
+                    del best_lp_statistics
+                best_lp_statistics = lp_statistics
+                best_lp_performance_counters = lp_performance_counters
+            else:
+                del lp_statistics
+
+            min_accuracy_drop = fp32_accuracy - lp_accuracy if min_accuracy_drop is None else min(
+                min_accuracy_drop,
+                fp32_accuracy - lp_accuracy)
+
+            info("{0} accuracy is {1:.4f}%, latency: {2:0.4f} ms\n".format(
+                self._configuration.precision,
+                100.0 * lp_accuracy,
+                1000.0 * lp_latency))
+            threshold = threshold - threshold_step
+
+        info("Best {0} accuracy is {1:.4f}%, latency: {2:0.4f} ms for threshold {3}%".format(
+            self._configuration.precision,
+            100.0 * best_lp_accuracy,
+            1000.0 * best_lp_latency,
+            best_lp_threshold))
+
+        info("{} performance counters:\n".format(self._configuration.precision))
+        info_performance_counters(best_lp_performance_counters)
+
+        accuracy_was_satisfied = False
+        if (fp32_accuracy - best_lp_accuracy) > (self._configuration.threshold / 100):
+            info("Accuracy after converting all possible layers does not meet the required threshold")
+            info(("FP32 accuracy: {0:.4f}% (latency: {1:0.4f} ms) vs all low precision layers accuracy: {2:.4f}% "
+                  "(latency: {3:0.4f} ms), threshold for activation statistics: {4}%").format(100.0 * fp32_accuracy,
+                                                                                              1000.0 * fp32_latency,
+                                                                                              100.0 * best_lp_accuracy,
+                                                                                              1000.0 * best_lp_latency,
+                                                                                              best_lp_threshold))
+
+            info("Collecting all raw FP32 results")
+
+            quantization_layers = calibrator.get_quantization_layers()
+            debug("{} layers (total {}) are selected to cache".format(
+                len(quantization_layers),
+                len(NetworkInfo(self._configuration.model).layers)))
+
+            with calibrator.infer(add_outputs=True,
+                                  collect_resuls=True,
+                                  collect_layers=quantization_layers) as fp32_result_with_raw_data:
+                info("Collecting intermediate per-layer accuracy drop")
+                layers_accuracy_drop = CollectorByLayer(
+                    self._configuration,
+                    calibrator.plugin,
+                    calibrator).collect(best_lp_statistics, fp32_result_with_raw_data)
+
+                info("Layer accuracy drop:\n")
+                info_layer_accuracy_drop(layers_accuracy_drop)
+
+            if layers_accuracy_drop:
+                info("Starting to reduce the number of layers converted to Int8")
+
+                for layer_accuracy_drop in layers_accuracy_drop:
+                    info("Returning '{}' to FP32 precision, starting validation".format(layer_accuracy_drop.layer_name))
+                    quantization_levels[layer_accuracy_drop.layer_name] = "FP32"
+
+                    with Network.reload(
+                            self._configuration.model,
+                            statistics=best_lp_statistics,
+                            quantization_levels=quantization_levels,
+                            batch_size=self._configuration.batch_size
+                    ) as reloaded_network:
+
+                        with calibrator.infer(network=reloaded_network.ie_network) as layer_int8_result:
+                            best_lp_accuracy = layer_int8_result.metrics.accuracy
+                            best_lp_latency = benchmark.run(
+                                network=reloaded_network,
+                                iterations_count=self._configuration.benchmark_iterations_count).latency
+
+                    accuracy_drop = fp32_accuracy - best_lp_accuracy
+                    min_accuracy_drop = accuracy_drop if min_accuracy_drop is None else min(min_accuracy_drop,
+                                                                                            accuracy_drop)
+                    if accuracy_drop > (self._configuration.threshold / 100.0):
+                        info("Was not achieved: FP32 accuracy: {0:.4f}% (latency: {1:.4f} ms) VS {2} accuracy: {3:.4f}% "
+                             "(latency {4:.4f} ms), accuracy drop {5:.4f}%".format(100.0 * fp32_accuracy,
+                                                                                   1000.0 * fp32_latency,
+                                                                                   self._configuration.precision,
+                                                                                   100.0 * best_lp_accuracy,
+                                                                                   1000.0 * best_lp_latency,
+                                                                                   100.0 * accuracy_drop))
+                    else:
+                        accuracy_was_satisfied = True
+                        info("Achieved: FP32 accuracy: {0:.4f}% (latency: {1:.4f} ms) VS {2} accuracy: {3:.4f}% "
+                             "(latency: {4:.4f} ms), accuracy drop {5:.4f}%".format(100.0 * fp32_accuracy,
+                                                                                    1000.0 * fp32_latency,
+                                                                                    self._configuration.precision,
+                                                                                    100.0 * best_lp_accuracy,
+                                                                                    1000.0 * best_lp_latency,
+                                                                                    100.0 * accuracy_drop))
+                        break
+            else:
+                info("No layers to return to FP32")
+        else:
+            accuracy_was_satisfied = True
+
+        if accuracy_was_satisfied:
+            info("Achieved an accuracy drop satisfying the required threshold")
+            info("FP32 accuracy: {0:.4f}% (latency: {1:.4f} ms) vs current low precision configuration accuracy: "
+                 "{2:.4f}% (latency: {3:.4f} ms) with threshold for activation statistic: {4}%".format(
+                     100.0 * fp32_accuracy,
+                     1000.0 * fp32_latency,
+                     100.0 * best_lp_accuracy,
+                     1000.0 * best_lp_latency,
+                     best_lp_threshold))
+
+            quantized_layers_count = 0
+            for quantization_level in quantization_levels.values():
+                if quantization_level != "FP32":
+                    quantized_layers_count += 1
+            info("quantized layers (quantized {}, total {} layers):".format(
+                quantized_layers_count,
+                len(quantization_levels)))
+
+            layers_message = "FP32 layers:\n"
+            for layer_name, quantization_level in quantization_levels.items():
+                if quantization_level == "FP32":
+                    layers_message += "\tlayer '{}': {}\n".format(layer_name, quantization_level)
+            info(layers_message)
+
+            layers_message = "{} layers:\n".format(self._configuration.precision)
+            for layer_name, quantization_level in quantization_levels.items():
+                if quantization_level != "FP32":
+                    layers_message += "\tlayer '{}': {}\n".format(layer_name, quantization_level)
+            info(layers_message)
+
+            info("Write calibrated network to {}.(xml|bin) IR file".format(
+                os.path.splitext(self._configuration.output_model)[0]))
+
+            calibrator.save(
+                self._configuration.output_model,
+                self._configuration.output_weights,
+                quantization_levels,
+                best_lp_statistics)
+
+            # TODO: load the network from disk until the underlying issue is fixed
+            output_network = Network(self._configuration.output_model, self._configuration.output_weights)
+            return output_network
+        else:
+            info("The required accuracy-drop threshold cannot be achieved with any {0} quantization. Minimal accuracy "
+                 "drop: {1:0.4%}".format(self._configuration.precision, min_accuracy_drop))
+
+            return None
+""" + +import collections + + +class CalibratorConfiguration: + def __init__(self, configuration): + self._config = configuration.config + self._model = configuration.model + self._weights = configuration.weights + self._device = configuration.device + self._cpu_extension = configuration.cpu_extension + self._gpu_extension = configuration.gpu_extension + self._threshold = configuration.threshold + self._batch_size = configuration.batch_size + self._progress = configuration.progress + + @property + def config(self) -> str: + return self._config + + @property + def model(self) -> str: + return self._model + + @property + def weights(self) -> str: + return self._weights + + @property + def device(self) -> str: + return self._device + + @property + def cpu_extension(self) -> str: + return self._cpu_extension + + @property + def gpu_extension(self) -> str: + return self._gpu_extension + + @property + def threshold(self) -> str: + return self._threshold + + @property + def batch_size(self) -> int: + return self._batch_size + + @property + def progress(self) -> str: + return self._progress diff --git a/tools/calibration/calibrator_factory.py b/tools/calibration/calibrator_factory.py new file mode 100644 index 0000000..5d16cc3 --- /dev/null +++ b/tools/calibration/calibrator_factory.py @@ -0,0 +1,31 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from .calibrator_configuration import CalibratorConfiguration +from .int8_calibrator import Int8Calibrator +from .fp16_calibrator import Fp16Calibrator + + +class CalibratorFactory: + @staticmethod + def create(precision: str, configuration: CalibratorConfiguration): + if precision.lower() == "int8": + return Int8Calibrator(configuration) + + if precision.lower() == "fp16": + return Fp16Calibrator(configuration) + + raise ValueError("not supported precision '{}'".format(precision)) diff --git a/tools/calibration/command_line_processor.py b/tools/calibration/command_line_processor.py new file mode 100644 index 0000000..b300aaa --- /dev/null +++ b/tools/calibration/command_line_processor.py @@ -0,0 +1,142 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import tempfile + +from ..accuracy_checker.accuracy_checker.config import ConfigReader +from ..accuracy_checker.accuracy_checker.launcher.dlsdk_launcher import DLSDKLauncher + +from ..network import Network +from ..utils.path import Path +from ..utils.configuration_filter import ConfigurationFilter +from .calibration_configuration import CalibrationConfiguration +from .logging import info, default_logger +from .command_line_reader import CommandLineReader + + +class CommandLineProcessor: + """ + Class for parsing user input config + """ + @staticmethod + def process() -> CalibrationConfiguration: + args, unknown_args = CommandLineReader.parser().parse_known_args() + if unknown_args: + info("unknown command line arguments: {0}".format(unknown_args)) + + args.target_framework = "dlsdk" + args.aocl = None + + merged_config = ConfigReader.merge(args) + updated_config = ConfigurationFilter.filter(merged_config, args.metric_name, args.metric_type, default_logger) + + if len(updated_config['models']) > 1: + raise ValueError("too much models") + + if len(updated_config['models'][0]['launchers']) > 1: + raise ValueError("too much launchers") + + launcher = updated_config['models'][0]['launchers'][0] + if 'caffe_model' in launcher or 'tf_model' in launcher or 'mxnet_weights' in launcher: + if args.converted_models: + tmp_directory = None + else: + tmp_directory = tempfile.mkdtemp(".converted_models") + launcher['mo_params']['output_dir'] = tmp_directory + + if 'caffe_model' in launcher: + framework = 'caffe' + output_model = Path.get_model( + str(launcher['caffe_model']), + "_i8", + str(args.output_dir) if args.output_dir else None) + output_weights = Path.get_weights( + str(launcher['caffe_weights']), + "_i8", + str(args.output_dir) if args.output_dir else None) + elif 'tf_model' in launcher: + framework = 'tf' + output_model = Path.get_model( + str(launcher['tf_model']), + "_i8", + str(args.output_dir) if args.output_dir else None) + output_weights = Path.get_weights( + str(launcher['tf_model']), + "_i8", + str(args.output_dir) if args.output_dir else None) + elif 'mxnet_weights' in launcher: + framework = 'mxnet' + output_model = Path.get_model( + str(launcher['mxnet_weights']), + "_i8", + str(args.output_dir) if args.output_dir else None) + output_weights = Path.get_weights( + str(launcher['mxnet_weights']), + "_i8", + str(args.output_dir) if args.output_dir else None) + else: + raise ValueError("unknown model framework") + + model, weights = DLSDKLauncher.convert_model(launcher, framework) + launcher['model'] = model + launcher['weights'] = weights + + launcher.pop('caffe_model', None) + launcher.pop('caffe_weights', None) + launcher.pop('tf_model', None) + launcher.pop('mxnet_weights', None) + else: + model = launcher['model'] + output_model = Path.get_model(str(model), "_i8", str(args.output_dir) if args.output_dir else None) + weights = launcher['weights'] + output_weights = Path.get_weights(str(weights), "_i8", str(args.output_dir) if args.output_dir else None) + tmp_directory = None + + batch_size = args.batch_size if args.batch_size else (launcher['batch'] if 'batch' in launcher else None) + if not batch_size: + with Network(str(launcher['model']), str(launcher['weights'])) as network: + batch_size = network.ie_network.batch_size + + if 'cpu_extensions' in launcher: + cpu_extension = DLSDKLauncher.get_cpu_extension(launcher['cpu_extensions'], args.cpu_extensions_mode) + launcher['cpu_extensions'] = cpu_extension + else: + cpu_extension = None + + if not 
+
+        if not args.calibrate_fully_connected:
+            if args.ignore_layer_types is None:
+                args.ignore_layer_types = []
+            args.ignore_layer_types.append("FullyConnected")
+
+        return CalibrationConfiguration(
+            config=updated_config,
+            precision=args.precision,
+            model=str(model),
+            weights=str(weights),
+            tmp_directory=tmp_directory,
+            output_model=output_model,
+            output_weights=output_weights,
+            cpu_extension=str(cpu_extension) if cpu_extension else None,
+            gpu_extension=str(launcher['gpu_extensions']) if 'gpu_extensions' in launcher else None,
+            device=launcher['device'],
+            batch_size=batch_size,
+            threshold=args.threshold,
+            ignore_layer_types=args.ignore_layer_types,
+            ignore_layer_types_path=args.ignore_layer_types_path,
+            ignore_layer_names=args.ignore_layer_names,
+            ignore_layer_names_path=args.ignore_layer_names_path,
+            benchmark_iterations_count=args.benchmark_iterations_count,
+            progress=(None if args.progress == 'None' else args.progress))
\ No newline at end of file
diff --git a/tools/calibration/command_line_reader.py b/tools/calibration/command_line_reader.py
new file mode 100644
index 0000000..e9700c5
--- /dev/null
+++ b/tools/calibration/command_line_reader.py
@@ -0,0 +1,209 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import pathlib
+from functools import partial
+from argparse import ArgumentParser
+
+from ..accuracy_checker.accuracy_checker.utils import get_path
+
+
+class CommandLineReader:
+    @staticmethod
+    def parser():
+        parser = ArgumentParser(description='openvino.tools.calibration')
+
+        parser.add_argument(
+            '-d', '--definitions',
+            help='Optional. Path to the YML file with definitions',
+            type=str,
+            required=False)
+
+        parser.add_argument(
+            '-c', '--config',
+            help='Required. Path to the YML file with local configuration',
+            type=get_path,
+            required=True)
+
+        parser.add_argument(
+            '-m', '--models',
+            help='Optional. Prefix path to the models and weights',
+            type=partial(get_path, is_directory=True),
+            default=pathlib.Path.cwd(),
+            required=False)
+
+        parser.add_argument(
+            '-s', '--source',
+            help='Optional. Prefix path to the data source',
+            type=partial(get_path, is_directory=True),
+            default=pathlib.Path.cwd(),
+            required=False)
+
+        parser.add_argument(
+            '-a', '--annotations',
+            help='Optional. Prefix path to the converted annotations and datasets meta data',
+            type=partial(get_path, is_directory=True),
+            default=pathlib.Path.cwd(),
+            required=False)
+
+        parser.add_argument(
+            '-e', '--extensions',
+            help='Optional. Prefix path to extensions folder',
+            type=partial(get_path, is_directory=True),
+            default=pathlib.Path.cwd(),
+            required=False)
+
+        parser.add_argument(
+            '--cpu_extensions_mode', '--cpu-extensions-mode',
+            help='Optional. Preferred set of processor instructions to use when automatically '
+                 'searching for the CPU extension library',
+            required=False,
+            choices=['avx2', 'sse4'])
+
+        parser.add_argument(
+            '-C', '--converted_models', '--converted-models',
+            help='Optional. Directory to store Model Optimizer converted models. Used for DLSDK launcher only',
+            type=partial(get_path, is_directory=True),
+            required=False
+        )
+
+        parser.add_argument(
+            '-M', '--model_optimizer', '--model-optimizer',
+            help='Optional. Path to model optimizer caffe directory',
+            type=partial(get_path, is_directory=True),
+            # there is no default value because if user did not specify it we use specific locations
+            # defined in model_conversion.py
+            required=False
+        )
+
+        parser.add_argument(
+            '--tf_custom_op_config_dir', '--tf-custom-op-config-dir',
+            help='Optional. Path to directory with tensorflow custom operation configuration files for model optimizer',
+            type=partial(get_path, is_directory=True),
+            # there is no default value because if user did not specify it we use specific location
+            # defined in model_conversion.py
+            required=False
+        )
+
+        parser.add_argument(
+            '--tf_obj_detection_api_pipeline_config_path', '--tf-obj-detection-api-pipeline-config-path',
+            help='Optional. Path to directory with tensorflow object detection api pipeline configuration files for model optimizer',
+            type=partial(get_path, is_directory=True),
+            # there is no default value because if user did not specify it we use specific location
+            # defined in model_conversion.py
+            required=False
+        )
+
+        parser.add_argument(
+            '--progress',
+            help='Optional. Progress reporter',
+            required=False,
+            default='bar')
+
+        parser.add_argument(
+            '-td', '--target_devices', '--target-devices',
+            help='Optional. Space-separated list of devices for inference',
+            required=False,
+            nargs='+',
+            default=["CPU"]
+        )
+
+        parser.add_argument(
+            '-tt', '--target_tags', '--target-tags',
+            help='Optional. Space-separated list of launcher tags for inference',
+            required=False,
+            nargs='+')
+
+        parser.add_argument(
+            '-p',
+            '--precision',
+            help='Optional. Precision to calibrate. Default value is INT8',
+            type=str,
+            required=False,
+            default='INT8')
+
+        parser.add_argument(
+            '--ignore_layer_types', '--ignore-layer-types',
+            help='Optional. List of layer types to skip during quantization',
+            type=str,
+            required=False,
+            nargs='+')
+
+        parser.add_argument(
+            '--ignore_layer_types_path', '--ignore-layer-types-path',
+            help='Optional. Path to a file with layer types to ignore',
+            type=str,
+            required=False)
+
+        parser.add_argument(
+            '--ignore_layer_names', '--ignore-layer-names',
+            help='Optional. List of layer names to skip during quantization',
+            type=str,
+            required=False,
+            nargs='+')
+
+        parser.add_argument(
+            '--ignore_layer_names_path', '--ignore-layer-names-path',
+            help='Optional. Path to a file with layer names to ignore',
+            type=str,
+            required=False)
+
+        parser.add_argument(
+            '--batch_size', '--batch-size',
+            help='Optional. Batch size value. If not specified, the batch size value is determined from IR',
+            type=int,
+            required=False)
+
+        parser.add_argument(
+            '-th', '--threshold',
+            help='Optional. Accuracy drop of quantized model should not exceed this threshold. '
+                 'Specify it in percent, without the percent sign. (1%% is default)',
+            type=float,
+            required=False,
+            default=1.0)
+
+        parser.add_argument(
+            '-ic', '--benchmark_iterations_count', '--benchmark-iterations-count',
+            help='Optional. Benchmark iterations count. (1000 is default)',
+            type=int,
+            required=False,
+            default=1000)
+
+        parser.add_argument(
+            '-mn', '--metric_name', '--metric-name',
+            help='Optional. Metric name used during calibration',
+            type=str,
+            required=False)
+
+        parser.add_argument(
+            '-mt', '--metric_type', '--metric-type',
+            help='Optional. Metric type used during calibration',
+            type=str,
+            required=False)
+
+        parser.add_argument(
+            '-o', '--output_dir', '--output-dir',
+            help='Optional. Directory to store converted models. Original model directory is used if not defined',
+            type=partial(get_path, is_directory=True),
+            required=False)
+
+        parser.add_argument(
+            '-cfc', '--calibrate_fully_connected', '--calibrate-fully-connected',
+            help='Optional. FullyConnected INT8 conversion support (False is default)',
+            action="store_true",
+            required=False)
+
+        return parser
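+
+# Hypothetical invocation of the calibration tool; the module entry point and
+# all paths below are illustrative assumptions, not taken from this patch:
+#
+#     python -m openvino.tools.calibration -c config.yml -p INT8 -th 1.0 \
+#         --batch_size 32 --progress bar -o ./calibrated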
+""" + +import os +import numpy +import pickle +import shutil +import tempfile +from typing import Dict + + +class InferRawResults: + def __init__(self): + self._size = 0 + self._index = 0 + self._dir_path = None + pass + + def release(self): + if self._dir_path: + shutil.rmtree(self._dir_path) + self._dir_path = None + + def __iter__(self): + self._index = 0 + return self + + def __next__(self): + if self._index < self._size: + file_path = os.path.join(self._dir_path, str(self._index)) + self._index += 1 + + f = open(file_path, "rb") + try: + loaded_value = pickle.load(f) + finally: + f.close() + return loaded_value + else: + raise StopIteration + + def size(self): + return self._size + + def add(self, value: Dict[str, numpy.ndarray]): + if self._dir_path is None: + self._dir_path = tempfile.mkdtemp("__infer_raw_results") + if not os.path.exists(self._dir_path): + os.makedirs(self._dir_path) + + file_path = os.path.join(self._dir_path, str(self._size)) + + f = open(file_path, "wb") + try: + pickle.dump(value, f) + finally: + f.close() + + self._size += 1 diff --git a/tools/calibration/inference_result.py b/tools/calibration/inference_result.py new file mode 100644 index 0000000..65d8e94 --- /dev/null +++ b/tools/calibration/inference_result.py @@ -0,0 +1,85 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from .aggregated_statistics import AggregatedStatistics +from .calibration_metrics import CalibrationMetrics +from .infer_raw_results import InferRawResults + + +class InferenceResult: + def __init__(self, + result: InferRawResults, + metrics: CalibrationMetrics, + aggregated_statistics: AggregatedStatistics, + performance_counters: dict): + self._result = result + self._metrics = metrics + self._aggregated_statistics = aggregated_statistics + self._performance_counters = performance_counters + + def __enter__(self): + return self + + def __exit__(self, type, value, tb): + self.release() + + def release(self): + if self._result: + self._result.release() + self._result = None + + @property + def result(self) -> InferRawResults: + return self._result + + @property + def metrics(self) -> CalibrationMetrics: + return self._metrics + + @property + def aggregated_statistics(self) -> AggregatedStatistics: + return self._aggregated_statistics + + @property + def performance_counters(self) -> dict: + return self._performance_counters + + def get_class_ids(self, output_layer_name: str) -> list: + ''' + Return class identifier list for classification networks + ''' + + result_classes_id_list = list() + for layers_result in self._result: + if output_layer_name not in layers_result: + raise KeyError("layer '{}' is not included int results".format(output_layer_name)) + + layer_result = layers_result[output_layer_name] + if layer_result.size == 0: + raise ValueError("result array is empty") + + max_value = layer_result.item(0) + max_class_id = 0 + + for class_id in range(layer_result.size): + value = layer_result.item(class_id) + if value > max_value: + max_value = value + max_class_id = class_id + + result_classes_id_list.append(max_class_id) + + return result_classes_id_list diff --git a/tools/calibration/int8_calibrator.py b/tools/calibration/int8_calibrator.py new file mode 100644 index 0000000..b9e0a16 --- /dev/null +++ b/tools/calibration/int8_calibrator.py @@ -0,0 +1,34 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from .base_calibrator import BaseCalibrator +from .calibrator_configuration import CalibratorConfiguration + + +# TODO: not comlpeted. 
diff --git a/tools/calibration/int8_calibrator.py b/tools/calibration/int8_calibrator.py
new file mode 100644
index 0000000..b9e0a16
--- /dev/null
+++ b/tools/calibration/int8_calibrator.py
@@ -0,0 +1,34 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from .base_calibrator import BaseCalibrator
+from .calibrator_configuration import CalibratorConfiguration
+
+
+# TODO: not completed. Some methods will be moved from Calibrator and customized to INT8
+class Int8Calibrator(BaseCalibrator):
+    '''
+    INT8 calibrator
+    '''
+    def __init__(self, configuration: CalibratorConfiguration):
+        super().__init__(configuration)
+
+    @property
+    def precision(self):
+        return "INT8"
+
+    def is_quantization_supported(self, layer_type: str) -> bool:
+        return layer_type.lower() in ("convolution", "fullyconnected")
diff --git a/tools/calibration/layer_accuracy_drop/__init__.py b/tools/calibration/layer_accuracy_drop/__init__.py
new file mode 100644
index 0000000..9ec5df4
--- /dev/null
+++ b/tools/calibration/layer_accuracy_drop/__init__.py
@@ -0,0 +1,21 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from .collector_by_layer import CollectorByLayer
+
+__version__ = "0.0.1"
+__all__ = [
+    'CollectorByLayer'
+]
diff --git a/tools/calibration/layer_accuracy_drop/collector_by_image.py b/tools/calibration/layer_accuracy_drop/collector_by_image.py
new file mode 100644
index 0000000..787c7bf
--- /dev/null
+++ b/tools/calibration/layer_accuracy_drop/collector_by_image.py
@@ -0,0 +1,128 @@
+import openvino.inference_engine as ie
+
+from ...utils.network_info import NetworkInfo
+from ...network import Network
+
+from ..layer_accuracy_drop_info import LayerAccuracyDropInfo
+from ..logging import debug
+from ..single_layer_network import SingleLayerNetwork
+from ..inference_result import InferenceResult
+
+
+class CollectorByImage:
+    def __init__(self, configuration, plugin, normalizer):
+        self._configuration = configuration
+        self._plugin = plugin
+        self._normalizer = normalizer
+
+    def _create_single_layer_networks(self, stat):
+        '''
+        Gets the layers that can be quantized and affect the final accuracy. A separate network is created for each layer.
+        '''
+        network = ie.IENetwork(self._configuration.model, self._configuration.weights)
+        # if self._configuration.batch_size:
+        #     # need to use reshape API
+        #     network.batch_size = self._configuration.batch_size
+
+        try:
+            network_info = NetworkInfo(self._configuration.model)
+
+            # CVS-14302: IE Network INT8 Normalizer: scale factor calculation is incorrect
+            # for layer_name, layer_statistics in stat.items():
+            #     layer_info = network_info.get_layer(layer_name)
+            #     if layer_info.type == 'Convolution' and \
+            #             layer_info.outputs and \
+            #             layer_info.outputs[0].layer.type == 'ReLU' and \
+            #             layer_info.outputs[0].layer.outputs[0] and \
+            #             len(layer_statistics.max_outputs) > len(stat[layer_info.outputs[0].layer.name].max_outputs):
+
+            #         relu_max_outputs = stat[layer_info.outputs[0].layer.name].max_outputs
+            #         relu_min_outputs = stat[layer_info.outputs[0].layer.name].min_outputs
+
+            #         while len(layer_statistics.max_outputs) > len(relu_max_outputs):
+            #             relu_max_outputs.append(relu_max_outputs[-1])
+            #             relu_min_outputs.append(relu_min_outputs[-1])
+
+            single_layer_networks = dict()
+
+            layer_index = 1
+            for layer_to_clone in network.layers.values():
+                layer_to_clone_info = network_info.get_layer(layer_to_clone.name)
+                if not self._normalizer.is_quantization_supported(layer_to_clone.type) or \
+                        len(layer_to_clone_info.outputs) != 1 or \
+                        len(layer_to_clone_info.outputs[0].layer.inputs) != 1:
+                    continue
+
+                activation_layer = network.layers[layer_to_clone_info.outputs[0].layer.name] if (len(layer_to_clone_info.outputs) == 1 and self._normalizer.is_quantization_fusing_supported(layer_to_clone_info, layer_to_clone_info.outputs[0].layer)) else None
+                if activation_layer:
+                    debug("create network #{} for layer {} ({}) -> {} ({})".format(layer_index, layer_to_clone.name, layer_to_clone.type, activation_layer.name, activation_layer.type))
+                else:
+                    debug("create network #{} for layer {} ({})".format(layer_index, layer_to_clone.name, layer_to_clone.type))
+
+                layer_network, reference_output_layer_name = self._normalizer.create_network_for_layer(
+                    self._configuration.weights,
+                    layer_to_clone,
+                    layer_to_clone_info,
+                    activation_layer)
+
+                Network.reshape(layer_network, self._configuration.batch_size)
+
+                network_stats = {}
+                # TODO: initialize only the necessary statistics
+                for layer_name, node_statistic in stat.items():
+                    network_stats[layer_name] = ie.LayerStats(min=tuple(node_statistic.min_outputs), max=tuple(node_statistic.max_outputs))
+                layer_network.stats.update(network_stats)
+
+                params = layer_network.layers[layer_to_clone.name].params
+                params["quantization_level"] = 'I8' if self._configuration.precision == 'INT8' else self._configuration.precision
+                layer_network.layers[layer_to_clone.name].params = params
+
+                exec_network = self._plugin.load(network=layer_network, config={"EXCLUSIVE_ASYNC_REQUESTS": "YES"})
+
+                if len(layer_network.inputs) != 1:
+                    raise ValueError("created network has several inputs")
+
+                network_input_layer_name = next(iter(layer_network.inputs.keys()))
+
+                single_layer_networks[layer_to_clone.name] = SingleLayerNetwork(
+                    network=layer_network,
+                    exec_network=exec_network,
+                    input_layer_name=network_input_layer_name,
+                    layer_name=layer_to_clone.name,
+                    output_layer_name=layer_to_clone.name + "_",
+                    reference_output_layer_name=reference_output_layer_name)
+
+                layer_index += 1
+
+            return single_layer_networks
+        finally:
+            del network
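+
+    # _create_single_layer_networks returns a dict keyed by the original layer
+    # name, e.g. (illustrative): {'conv1': SingleLayerNetwork(...)}; each entry
+    # wraps a tiny IENetwork of the form Input -> conv1 [-> ReLU] -> conv1_.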
+
+    def collect(self, statistics: dict, full_network_results: InferenceResult) -> list:
+        single_layer_networks = self._create_single_layer_networks(statistics)
+
+        accuracy_drop_list_by_layer_name = dict()
+        image_index = 1
+        for full_network_result in full_network_results.result:
+            debug("image {}/{} handling".format(image_index, full_network_results.result.size()))
+
+            for single_layer_network_name, single_layer_network in single_layer_networks.items():
+                accuracy_drop = self._normalizer.infer_single_layer_network(single_layer_network, full_network_result)
+
+                if single_layer_network_name not in accuracy_drop_list_by_layer_name:
+                    accuracy_drop_list_by_layer_name[single_layer_network_name] = list()
+
+                accuracy_drop_list_by_layer_name[single_layer_network_name].append(accuracy_drop)
+            image_index += 1
+
+        accuracy_drop_by_layer = list()
+        for layer_name, accuracy_drop_list in accuracy_drop_list_by_layer_name.items():
+            accuracy_drop_by_layer.append(LayerAccuracyDropInfo(
+                layer_name=layer_name,
+                value=LayerAccuracyDropInfo.calculate(accuracy_drop_list)))
+
+        for single_layer_network in single_layer_networks.values():
+            single_layer_network.release()
+        single_layer_networks.clear()
+
+        accuracy_drop_by_layer.sort(key=lambda accuracy_drop: accuracy_drop.value, reverse=True)
+        return accuracy_drop_by_layer
diff --git a/tools/calibration/layer_accuracy_drop/collector_by_layer.py b/tools/calibration/layer_accuracy_drop/collector_by_layer.py
new file mode 100644
index 0000000..e888161
--- /dev/null
+++ b/tools/calibration/layer_accuracy_drop/collector_by_layer.py
@@ -0,0 +1,184 @@
+from collections import namedtuple
+import multiprocessing
+import threading
+
+import openvino.inference_engine as ie
+
+from ...utils.network_info import NetworkInfo
+from ...network import Network
+
+from ..layer_accuracy_drop_info import LayerAccuracyDropInfo
+from ..logging import info, debug
+from ..single_layer_network import SingleLayerNetwork
+from ..inference_result import InferenceResult
+
+QuantizationLayer = namedtuple('QuantizationLayer', 'index layer')
+
+
+class SingleLayerNetworkThread(threading.Thread):
+    def __init__(
+            self,
+            base_calibrator,
+            statistics,
+            full_network_result: InferenceResult,
+            network: ie.IENetwork,
+            network_info: NetworkInfo,
+            quantization_layer: QuantizationLayer
+    ):
+        threading.Thread.__init__(self)
+        self.base_calibrator = base_calibrator
+        self.statistics = statistics
+        self.full_network_result = full_network_result
+        self.network = network
+        self.network_info = network_info
+        self.quantization_layer = quantization_layer
+        self.result = None
+
+    def run(self):
+        self.result = self.base_calibrator.collect_in_thread(
+            self.statistics,
+            self.full_network_result,
+            self.network,
+            self.network_info,
+            self.quantization_layer)
+
+
+class CollectorByLayer:
+    def __init__(self, configuration, plugin, normalizer):
+        self._configuration = configuration
+        self._plugin = plugin
+        self._normalizer = normalizer
+
+    def collect(self, statistics: dict, full_network_result: InferenceResult) -> list:
+        '''
+        Gets the layers that can be quantized and affect the final accuracy. A separate network is created for each layer.
+        '''
+        accuracy_drop_by_layer = list()
+
+        network = ie.IENetwork(self._configuration.model, self._configuration.weights)
+        # if self._configuration.batch_size:
+        #     # need to use reshape API
+        #     network.batch_size = self._configuration.batch_size
+
+        try:
+            network_info = NetworkInfo(self._configuration.model)
+
+            # go over all layers which affect accuracy and create a separate network for each of them
+            quantization_layers = list()
+
+            index = 1
+            threads = list()
+            for layer in network.layers.values():
+                if self._normalizer.is_quantization_supported(layer.type):
+                    layer_info = network_info.get_layer(layer.name)
+                    if (len(layer_info.outputs) == 1) and (len(layer_info.outputs[0].layer.inputs) == 1):
+                        quantization_layer = QuantizationLayer(index, layer)
+                        quantization_layers.append(quantization_layer)
+                        threads.append(SingleLayerNetworkThread(self, statistics, full_network_result, network, network_info, quantization_layer))
+                        index += 1
+
+            it = iter(threads)
+            threads_num = multiprocessing.cpu_count() * 2
+            active_threads = list()
+            while True:
+                active_threads.clear()
+                for thread_num in range(threads_num):
+                    active_thread = next(it, None)
+                    if not active_thread:
+                        break
+                    active_threads.append(active_thread)
+                    active_thread.start()
+
+                for active_thread in active_threads:
+                    active_thread.join()
+
+                if not active_thread:
+                    debug("all layer networks were inferred")
+                    break
+
+                debug("all layer networks before #{} were inferred".format(active_thread.quantization_layer.index))
+
+            for thread in threads:
+                thread.join()
+                accuracy_drop_by_layer.append(thread.result)
+
+            accuracy_drop_by_layer.sort(key=lambda accuracy_drop: accuracy_drop.value, reverse=True)
+            return accuracy_drop_by_layer
+        finally:
+            del network
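+
+    # Note on the batching above: threads are started in groups of
+    # multiprocessing.cpu_count() * 2 and each group is joined before the next
+    # one starts, so at most that many single-layer networks run concurrently.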
+
+    def collect_in_thread(
+            self,
+            statistics: dict,
+            full_network_result: InferenceResult,
+            network: ie.IENetwork,
+            network_info: NetworkInfo,
+            quantization_layer: QuantizationLayer
+    ) -> LayerAccuracyDropInfo:
+
+        index = quantization_layer.index
+        layer_to_clone = quantization_layer.layer
+        layer_to_clone_info = network_info.get_layer(layer_to_clone.name)
+
+        activation_layer = network.layers[layer_to_clone_info.outputs[0].layer.name] if (len(layer_to_clone_info.outputs) == 1 and self._normalizer.is_quantization_fusing_supported(layer_to_clone_info, layer_to_clone_info.outputs[0].layer)) else None
+        if activation_layer:
+            debug("create network #{} for layer {} ({}) -> {} ({})".format(index, layer_to_clone.name, layer_to_clone.type, activation_layer.name, activation_layer.type))
+        else:
+            debug("create network #{} for layer {} ({})".format(index, layer_to_clone.name, layer_to_clone.type))
+
+        layer_network, reference_output_layer_name = self._normalizer.create_network_for_layer(
+            self._configuration.weights,
+            layer_to_clone,
+            layer_to_clone_info,
+            activation_layer)
+
+        Network.reshape(layer_network, self._configuration.batch_size)
+
+        network_stats = {}
+        # TODO: initialize only the necessary statistics
+        for layer_name, node_statistic in statistics.items():
+            network_stats[layer_name] = ie.LayerStats(min=tuple(node_statistic.min_outputs), max=tuple(node_statistic.max_outputs))
+        layer_network.stats.update(network_stats)
+
+        params = layer_network.layers[layer_to_clone.name].params
+        params["quantization_level"] = 'I8' if self._configuration.precision == 'INT8' else self._configuration.precision
+        layer_network.layers[layer_to_clone.name].params = params
+
+        exec_network = self._plugin.load(network=layer_network, config={"EXCLUSIVE_ASYNC_REQUESTS": "YES"})
+
+        if len(layer_network.inputs) != 1:
+            raise ValueError("created network has several inputs")
+
+        network_input_layer_name = next(iter(layer_network.inputs.keys()))
+
+        with SingleLayerNetwork(
+                network=layer_network,
+                exec_network=exec_network,
+                input_layer_name=network_input_layer_name,
+                layer_name=layer_to_clone.name,
+                output_layer_name=layer_to_clone.name + "_",
+                reference_output_layer_name=reference_output_layer_name
+        ) as single_layer_network:
+
+            debug("single layer #{} {} network infer".format(index, single_layer_network.layer_name))
+            accuracy_drop_list = self.infer_single_layer_network(single_layer_network, full_network_result)
+
+            return LayerAccuracyDropInfo(
+                layer_name=single_layer_network.layer_name,
+                value=LayerAccuracyDropInfo.calculate(accuracy_drop_list))
+
+    def infer_single_layer_network(self, single_layer_network: SingleLayerNetwork, full_network_results: InferenceResult):
+        '''
+        Infer the single-layer network natively and compare the results
+        '''
+        if full_network_results.result is None:
+            raise ValueError("output inference results are absent")
+
+        accuracy_drop_list = list()
+        for full_network_result in full_network_results.result:
+            difference = self._normalizer.infer_single_layer_network(single_layer_network, full_network_result)
+            accuracy_drop_list.append(difference)
+
+        return accuracy_drop_list
diff --git a/tools/calibration/layer_accuracy_drop_info.py b/tools/calibration/layer_accuracy_drop_info.py
new file mode 100644
index 0000000..2c262f9
--- /dev/null
+++ b/tools/calibration/layer_accuracy_drop_info.py
@@ -0,0 +1,36 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+class LayerAccuracyDropInfo:
+    def __init__(self, layer_name: str, value: float):
+        self._layer_name = layer_name
+        self._value = value
+
+    @property
+    def layer_name(self):
+        return self._layer_name
+
+    @property
+    def value(self):
+        return self._value
+
+    @staticmethod
+    def calculate(accuracy_drop: list) -> float:
+        return sum(accuracy_drop) / len(accuracy_drop)
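+
+# LayerAccuracyDropInfo.calculate is the arithmetic mean of the per-image
+# accuracy-drop (NRMSD) values, e.g. calculate([0.02, 0.04]) == 0.03.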
diff --git a/tools/calibration/logging.py b/tools/calibration/logging.py
new file mode 100644
index 0000000..bc936b4
--- /dev/null
+++ b/tools/calibration/logging.py
@@ -0,0 +1,159 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+import logging.config
+import sys
+import warnings
+import threading
+
+# TODO: move to utils
+_DEFAULT_LOGGER_NAME = 'openvino.tools.calibration'
+_DEFAULT_LOG_FILE = 'openvino.tools.calibration.log'
+
+PRINT_INFO = logging.INFO + 5
+logging.addLevelName(PRINT_INFO, "PRINT_INFO")
+
+_LOG_LEVEL_ENVIRON = "CALIBRATION_TOOL_LOG_LEVEL"
+# _LOGGING_LEVEL = logging.getLevelName(os.environ.get(_LOG_LEVEL_ENVIRON, PRINT_INFO))
+# TODO: refactoring: remove, use original line
+_LOGGING_LEVEL = "DEBUG"
+
+lock = threading.Lock()
+
+
+class LoggingFormatter(logging.Formatter):
+    def format(self, record: logging.LogRecord):
+        if record.levelno == PRINT_INFO:
+            return record.msg
+        return super().format(record)
+
+
+class ConsoleHandler(logging.StreamHandler):
+    def __init__(self, default_stream=sys.stdout):
+        super().__init__(default_stream)
+        self.default_stream = default_stream
+        self.err_stream = sys.stderr
+
+    def emit(self, record):
+        # route warnings and errors to stderr, everything else to stdout
+        if record.levelno >= logging.WARNING:
+            self.stream = self.err_stream
+        else:
+            self.stream = self.default_stream
+        super().emit(record)
+
+
+_LOGGING_CONFIGURATION = {
+    'version': 1,
+    'disable_existing_loggers': False,
+    'formatters': {
+        'default': {
+            '()': LoggingFormatter,
+            'format': '%(asctime)s %(name)s %(levelname)s: %(message)s',
+            'datefmt': '%H:%M:%S'
+        },
+        'detailed': {
+            'format': '%(asctime)s %(name)s %(levelname)s: %(message)s'
+        }
+    },
+    'handlers': {
+        'console': {
+            'level': 'DEBUG',
+            '()': ConsoleHandler,
+            'formatter': 'default',
+        }
+    },
+
+    'loggers': {
+        _DEFAULT_LOGGER_NAME: {
+            'handlers': ['console'],
+            'level': _LOGGING_LEVEL,
+            'propagate': False
+        }
+    }
+}
+
+logging.config.dictConfig(_LOGGING_CONFIGURATION)
+
+default_logger = logging.getLogger(_DEFAULT_LOGGER_NAME)
+
+
+def _warning_handler(message, category, filename, lineno, file=None, line=None):
+    # warnings.showwarning is invoked with file and line arguments as well;
+    # accept them to avoid a TypeError when a warning is emitted
+    s = warnings.formatwarning(message, category, filename, lineno, line)
+    default_logger.warning(s)
+
+
+warnings.showwarning = _warning_handler
+
+
+def get_logger(logger_name: str):
+    if logger_name.startswith(_DEFAULT_LOGGER_NAME):
+        return default_logger.getChild(logger_name)
+    return logging.getLogger(logger_name)
+
+
+def error(msg, *args, **kwargs):
+    with lock:
+        default_logger.error(msg, *args, **kwargs)
+
+
+def warning(msg, *args, raise_warning=True, **kwargs):
+    with lock:
+        if raise_warning:
+            warnings.warn(msg)
+        else:
+            default_logger.warning(msg, *args, **kwargs)
+
+
+def info(msg, *args, **kwargs):
+    with lock:
+        default_logger.info(msg, *args, **kwargs)
+
+
+def info_performance_counters(performance_counters: dict, *args, **kwargs):
+    performance_counters_info = "\n\t{:<80} {:<15} {:<20} {:<15} {:<10}\n".format(
+        'name',
+        'layer_type',
+        'exec_type',
+        'status',
+        'real_time, us')
+
+    for layer_name, stats in performance_counters.items():
+        performance_counters_info += "\t{:<80} {:<15} {:<20} {:<15} {:<10}\n".format(
+            layer_name[0:77] + "..."
if len(layer_name) > 80 else layer_name, + stats['layer_type'], + stats['exec_type'], + stats['status'], + stats['real_time']) + info(performance_counters_info, *args, **kwargs) + + +def info_layer_accuracy_drop(layers_accuracy_drop: list, *args, **kwargs): + layer_accuracy_drop_text = "\n" + for layer_accuracy_drop in layers_accuracy_drop: + layer_accuracy_drop_text += "\t{0}: {1:.4f}%\n".format( + layer_accuracy_drop.layer_name[0:77] + "..." if len(layer_accuracy_drop.layer_name) > 80 else layer_accuracy_drop.layer_name, + layer_accuracy_drop.value * 100.0) + info(layer_accuracy_drop_text, *args, **kwargs) + + +def debug(msg, *args, **kwargs): + with lock: + default_logger.debug(msg, *args, **kwargs) + + +def print_info(msg, *args, **kwargs): + default_logger.log(PRINT_INFO, msg, *args, **kwargs) diff --git a/tools/calibration/network_node_stats.py b/tools/calibration/network_node_stats.py new file mode 100644 index 0000000..0a6c967 --- /dev/null +++ b/tools/calibration/network_node_stats.py @@ -0,0 +1,26 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + + +class NetworkNodeStats: + __slots__ = ['min_outputs', 'max_outputs'] + + def __init__(self, channels_count: int): + self.min_outputs = list() + self.max_outputs = list() + for i in range(channels_count): + self.min_outputs.append(None) + self.max_outputs.append(None) \ No newline at end of file diff --git a/tools/calibration/nrmsd.py b/tools/calibration/nrmsd.py new file mode 100644 index 0000000..bd78fff --- /dev/null +++ b/tools/calibration/nrmsd.py @@ -0,0 +1,38 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import numpy as np + + +def compare_nrmsd(actual_data, expected_data): + if actual_data.size != expected_data.size: + raise ValueError("actual data size {} is not equal expected data size {}".format(actual_data.size, expected_data.size)) + + sum = 0.0 + index = 0 + for expected_item in np.nditer(expected_data): + actual_item = actual_data.item(index) + sum += pow(expected_item - actual_item, 2) + index += 1 + + sum = sum / expected_data.size + sum = pow(sum, 0.5) + + if expected_data.max() - expected_data.min() == 0: + return 1.0 + + sum = sum / (expected_data.max() - expected_data.min()) + return sum diff --git a/tools/calibration/requirements.txt b/tools/calibration/requirements.txt new file mode 100644 index 0000000..5e3e8ee --- /dev/null +++ b/tools/calibration/requirements.txt @@ -0,0 +1,8 @@ +py-cpuinfo +numpy +progress +pyyaml +opencv-python +shapely +sklearn +xmltodict diff --git a/tools/calibration/shape.py b/tools/calibration/shape.py new file mode 100644 index 0000000..67d21b9 --- /dev/null +++ b/tools/calibration/shape.py @@ -0,0 +1,121 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + + +class NchwShape: + def __init__(self, n: int, c: int, h: int, w: int): + self._n = n + self._c = c + self._h = h + self._w = w + + @property + def layout(self) -> str: + return 'NCHW' + + @property + def n(self) -> int: + return self._n + + @property + def c(self) -> int: + return self._c + + @property + def h(self) -> int: + return self._h + + @property + def w(self) -> int: + return self._w + + +class ChwShape: + def __init__(self, c: int, h: int, w: int): + self._c = c + self._h = h + self._w = w + + @property + def n(self) -> int: + return 1 + + @property + def layout(self) -> str: + return 'CHW' + + @property + def c(self) -> int: + return self._c + + @property + def h(self) -> int: + return self._h + + @property + def w(self) -> int: + return self._w + + +class NcShape: + def __init__(self, n: int, c: int): + self._n = n + self._c = c + + @property + def layout(self) -> str: + return 'NC' + + @property + def n(self) -> int: + return self._n + + @property + def c(self) -> int: + return self._c + + +class CShape: + def __init__(self, c: int): + self._n = 1 + self._c = c + + @property + def layout(self) -> str: + return 'C' + + @property + def n(self) -> int: + return self._n + + @property + def c(self) -> int: + return self._c + + +class Shape: + @staticmethod + def create(layout:str, dims): + if layout == 'NCHW': + return NchwShape(dims[0], dims[1], dims[2], dims[3]) + if layout == 'CHW': + return ChwShape(dims[0], dims[1], dims[2]) + elif layout == 'NC': + return NcShape(dims[0], dims[1]) + elif layout == 'C': + return CShape(dims[0]) + else: + raise ValueError("not supported layout '{}'".format(layout)) diff --git a/tools/calibration/single_layer_network.py b/tools/calibration/single_layer_network.py new file mode 100644 index 0000000..fb7c684 --- /dev/null +++ b/tools/calibration/single_layer_network.py @@ -0,0 +1,85 @@ +""" +Copyright (C) 
diff --git a/tools/calibration/single_layer_network.py b/tools/calibration/single_layer_network.py
new file mode 100644
index 0000000..fb7c684
--- /dev/null
+++ b/tools/calibration/single_layer_network.py
@@ -0,0 +1,85 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from openvino.inference_engine import IENetwork, ExecutableNetwork, InferRequest
+
+
+# TODO: network and request are not used
+# TODO: refactor: create network before inference only
+class SingleLayerNetwork:
+    '''
+    Description of a single layer network
+    '''
+
+    def __init__(
+        self,
+        network: IENetwork,
+        exec_network: ExecutableNetwork,
+        input_layer_name: str,
+        layer_name: str,
+        output_layer_name: str,
+        reference_output_layer_name: str):
+
+        self._network = network
+        self._exec_network = exec_network
+        self._input_layer_name = input_layer_name
+        self._layer_name = layer_name
+        self._output_layer_name = output_layer_name
+        self._reference_output_layer_name = reference_output_layer_name
+        self._int8_accuracy_list = list()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_tb):
+        self.release()
+
+    def release(self):
+        if self._network:
+            del self._network
+            self._network = None
+
+        if self._exec_network:
+            del self._exec_network
+            self._exec_network = None
+
+    @property
+    def network(self) -> IENetwork:
+        return self._network
+
+    @property
+    def exec_network(self) -> ExecutableNetwork:
+        return self._exec_network
+
+    @property
+    def input_layer_name(self) -> str:
+        return self._input_layer_name
+
+    @property
+    def layer_name(self) -> str:
+        return self._layer_name
+
+    @property
+    def output_layer_name(self) -> str:
+        return self._output_layer_name
+
+    @property
+    def reference_output_layer_name(self) -> str:
+        return self._reference_output_layer_name
+
+    @property
+    def int8_accuracy_list(self) -> list:
+        return self._int8_accuracy_list
diff --git a/tools/calibration/top_results.py b/tools/calibration/top_results.py
new file mode 100644
index 0000000..6e0ddc0
--- /dev/null
+++ b/tools/calibration/top_results.py
@@ -0,0 +1,37 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+""" + + +class TopResults: + def __init__(self, data, channels_count: int): + self.__results = list() + + samples = int(data.size / channels_count) + for sample in range(samples): + max_value = None + max_value_class_number = None + + for class_number in range(channels_count): + value = data.item(class_number + sample * channels_count) + if (max_value is None) or (max_value < value): + max_value = value + max_value_class_number = class_number + + self.__results.append(max_value_class_number) + + @property + def results(self): + return self.__results diff --git a/tools/network.py b/tools/network.py new file mode 100644 index 0000000..303d3c3 --- /dev/null +++ b/tools/network.py @@ -0,0 +1,111 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import os +import tempfile +import shutil +import ntpath + +import openvino.inference_engine as ie +from .utils.path import Path + + +class Network: + @staticmethod + def reload(model_path: str, statistics = None, quantization_levels: dict() = None, batch_size: int = None): + tmp_model_dir = None + try: + with Network(model_path) as network: + if statistics: + network.set_statistics(statistics) + if quantization_levels: + network.set_quantization_levels(quantization_levels) + + tmp_model_dir = tempfile.mkdtemp(".model") + tmp_model_path = os.path.join(tmp_model_dir, ntpath.basename(model_path)) + network.serialize(tmp_model_path) + + network = Network(tmp_model_path) + Network.reshape(network.ie_network, batch_size) + return network + finally: + if tmp_model_dir: + shutil.rmtree(tmp_model_dir) + + def __init__(self, model_path: str, weights_path: str=None): + if model_path is None: + raise ValueError("model_path is None") + + self._model_path = model_path + self._weights_path = weights_path if weights_path else Path.get_weights(model_path) + self._ie_network = None + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + self.release() + + def release(self): + if self._ie_network: + del self._ie_network + self._ie_network = None + + @staticmethod + def reshape(ie_network: ie.IENetwork, batch_size: int) -> ie.IENetwork: + if batch_size and batch_size != ie_network.batch_size: + new_shapes = {} + for input_layer_name, input_layer in ie_network.inputs.items(): + layout = input_layer.layout + if layout == 'C': + new_shape = (input_layer.shape[0],) + elif layout == 'NC': + new_shape = (batch_size, input_layer.shape[1]) + else: + raise ValueError("not supported layout '{}'".format(layout)) + new_shapes[input_layer_name] = new_shape + ie_network.reshape(new_shapes) + return ie_network + + @property + def model_path(self) -> str: + return self._model_path + + @property + def weights_path(self) -> str: + return self._weights_path + + @property + def ie_network(self) -> ie.IENetwork: + if not self._ie_network: + self._ie_network = ie.IENetwork(self._model_path, self._weights_path) + return self._ie_network + + def set_quantization_levels(self, quantization_level: dict): + for layer_name, value in 
quantization_level.items(): + params = self.ie_network.layers[layer_name].params + params["quantization_level"] = value + self.ie_network.layers[layer_name].params = params + + def set_statistics(self, statistics: dict): + network_stats = {} + for layer_name, node_statistic in statistics.items(): + network_stats[layer_name] = ie.LayerStats(min=tuple(node_statistic.min_outputs), + max=tuple(node_statistic.max_outputs)) + self.ie_network.stats.update(network_stats) + + def serialize(self, model_path: str, weights_path: str=None): + self.ie_network.serialize(model_path, weights_path if weights_path else Path.get_weights(model_path)) diff --git a/tools/utils/__init__.py b/tools/utils/__init__.py new file mode 100644 index 0000000..95b0726 --- /dev/null +++ b/tools/utils/__init__.py @@ -0,0 +1,22 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from .path import Path + +__version__ = "0.0.1" +__all__ = [ + 'Path' +] diff --git a/tools/utils/biases.py b/tools/utils/biases.py new file mode 100644 index 0000000..88b7579 --- /dev/null +++ b/tools/utils/biases.py @@ -0,0 +1,29 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + + +class Biases: + def __init__(self, offset: int, size: int): + self._offset = offset + self._size = size + + @property + def offset(self) -> int: + return self._offset + + @property + def size(self) -> int: + return self._size diff --git a/tools/utils/building/__init__.py b/tools/utils/building/__init__.py new file mode 100644 index 0000000..e8cc80e --- /dev/null +++ b/tools/utils/building/__init__.py @@ -0,0 +1,17 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +__version__ = "0.0.1" diff --git a/tools/utils/building/layer.py b/tools/utils/building/layer.py new file mode 100644 index 0000000..199fc92 --- /dev/null +++ b/tools/utils/building/layer.py @@ -0,0 +1,157 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +from ..biases import Biases +from ..weights import Weights + + +class Layer: + TEMPLATE = ( + '' + '{data}' + '{input}' + '{output}' + '{weights}' + '{biases}' + '') + + def __init__( + self, id: int, + type: str, + name: str, + params: dict, + input_dims: list, + output_dims: list, + weights: Weights = None, + biases: Biases = None): + self._id = id + self._type = type + self._name = name + self._params = params + self._input_dims = input_dims + self._output_dims = output_dims + self._weights = weights + self._biases = biases + + @property + def id(self) -> str: + return self._id + + @property + def type(self) -> str: + return self._type + + @property + def name(self) -> str: + return self._name + + @property + def params(self) -> dict: + return self._params + + @property + def input_dims(self) -> list: + return self._input_dims + + @property + def output_dims(self) -> list: + return self._output_dims + + @property + def weights(self) -> Weights: + return self._weights + + @property + def biases(self) -> Biases: + return self._biases + + def _output_dims_to_xml(self) -> str: + if self._output_dims: + if len(self._output_dims) == 2: + output_xml = ( + '' + '' + '{}' + '{}' + '' + '').format(self._output_dims[0], self._output_dims[1]) + elif len(self._output_dims) == 4: + output_xml = ( + '' + '' + '{}' + '{}' + '{}' + '{}' + '' + '').format(self._output_dims[0], self._output_dims[1], self._output_dims[2], self._output_dims[3]) + else: + raise NotImplementedError("{} dimensions for outputs (layer name '{}', type '{}') are not supported".format( + len(self._output_dims), + self._name, + self._type)) + else: + output_xml = None + return output_xml + + def _input_dims_to_xml(self) -> str: + if self._input_dims: + if len(self._input_dims) == 2: + input_xml = ( + '' + '' + '{}' + '{}' + '' + '').format(self._input_dims[0], self._input_dims[1]) + elif len(self._input_dims) == 4: + input_xml = ( + '' + '' + '{}' + '{}' + '{}' + '{}' + '' + '').format(self._input_dims[0], self._input_dims[1], self._input_dims[2], self._input_dims[3]) + else: + raise NotImplementedError("{} dimensions for inputs (layer name '{}', type '{}') are not supported".format( + len(self._input_dims), + self._name, + self._type)) + else: + input_xml = None + + return input_xml + + def __str__(self) -> str: + if self._params: + data_xml = "'.format(offset=self._weights.offset, size=self._weights.size) if self._weights else ''), + biases=(''.format(offset=self._biases.offset, size=self._biases.size) if self._biases else '') + ) diff --git a/tools/utils/building/network_builder.py b/tools/utils/building/network_builder.py new file mode 100644 index 0000000..fe6334b --- /dev/null +++ b/tools/utils/building/network_builder.py @@ -0,0 +1,51 
diff --git a/tools/utils/building/network_builder.py b/tools/utils/building/network_builder.py
new file mode 100644
index 0000000..fe6334b
--- /dev/null
+++ b/tools/utils/building/network_builder.py
@@ -0,0 +1,51 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+# TODO: limitations:
+# - one input
+# - one output
+# - dims size is 4
+class NetworkBuilder:
+
+    EDGES_TEMPLATE2 = (
+        '<edges>'
+        '<edge from-layer="0" from-port="1" to-layer="1" to-port="0"/>'
+        '<edge from-layer="1" from-port="1" to-layer="2" to-port="0"/>'
+        '</edges>')
+
+    EDGES_TEMPLATE3 = (
+        '<edges>'
+        '<edge from-layer="0" from-port="1" to-layer="1" to-port="0"/>'
+        '<edge from-layer="1" from-port="1" to-layer="2" to-port="0"/>'
+        '<edge from-layer="2" from-port="1" to-layer="3" to-port="0"/>'
+        '</edges>')
+
+    def __init__(self, version: int = 3):
+        self._layers = list()
+
+    def __str__(self):
+        # xml = '<?xml version="1.0" ?>'
+        xml = '<net name="network" version="3" batch="1"><layers>'
+        for layer in self._layers:
+            xml = xml + str(layer)
+
+        xml = xml + "</layers>" + (NetworkBuilder.EDGES_TEMPLATE2 if len(self._layers) == 3 else NetworkBuilder.EDGES_TEMPLATE3) + "</net>"
+        return xml
+
+    def sequential(self, layers):
+        self._layers = layers
+        return self
diff --git a/tools/utils/building/port.py b/tools/utils/building/port.py
new file mode 100644
index 0000000..a9ace63
--- /dev/null
+++ b/tools/utils/building/port.py
@@ -0,0 +1,20 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+class Port:
+    def __init__(self, dims: list):
+        self._dims = dims
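Taken together, a rough sketch of how the building helpers compose a tiny sequential IR (layer names, types, and dims hypothetical; with three layers the two-edge template wires 0 -> 1 -> 2):

    from tools.utils.building.layer import Layer
    from tools.utils.building.network_builder import NetworkBuilder

    ir_xml = str(NetworkBuilder().sequential([
        Layer(id=0, type='Input', name='data', params={}, input_dims=None, output_dims=[1, 16]),
        Layer(id=1, type='FullyConnected', name='fc1', params={'out-size': '8'}, input_dims=[1, 16], output_dims=[1, 8]),
        Layer(id=2, type='ReLU', name='relu1', params={}, input_dims=[1, 8], output_dims=[1, 8]),
    ]))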
+""" + +import copy +import os + + +class ConfigurationFilter: + @staticmethod + def filter(configuration, filter_metric_name: str, filter_metric_type: str, logger = None): + updated_configuration = copy.deepcopy(configuration) + if 'models' not in updated_configuration or len(updated_configuration['models']) == 0: + raise ValueError("'models' key is absent in configuration") + + updated_configuration['models'] = [model for model in updated_configuration['models'] if 'launchers' in model and model['launchers']] + if len(updated_configuration['models']) > 1: + raise ValueError("too many models") + + if not updated_configuration['models']: + raise ValueError("there are no models") + + model = updated_configuration['models'][0] + if 'datasets' not in model or len(model['datasets']) == 0: + raise ValueError("'datasets' key is absent in models") + + if len(model['datasets']) > 1: + raise ValueError("too many datasets in model") + + dataset = model['datasets'][0] + if filter_metric_name: + dataset['metrics'] = [i for i in dataset['metrics'] if i['name'] == filter_metric_name] + + if filter_metric_type: + dataset['metrics'] = [i for i in dataset['metrics'] if i['type'] == filter_metric_type] + + if 'metrics' not in dataset or len(dataset['metrics']) == 0: + raise ValueError("can not find appropriate metric in dataset{}{}".format( + ", filter_metric_name='{}'".format(filter_metric_name) if filter_metric_name else "", + ", filter_metric_type='{}'".format(filter_metric_type) if filter_metric_type else "")) + + if filter_metric_name is None and filter_metric_type is None and len(dataset['metrics']) > 1: + dataset['metrics'] = [dataset['metrics'][0]] + if logger: + logger.warn("too many metrics without filters, first metric '{}' is used".format(str(dataset['metrics'][0]))) + + if len(dataset['metrics']) > 1: + raise ValueError("too many metrics in datasets") + + metric = dataset['metrics'][0] + if 'presenter' in metric and metric['presenter'] != 'return_value': + original_presenter = metric['presenter'] + metric['presenter'] = 'return_value' + if logger: + logger.warn("presenter was changed from '{}' to '{}'".format(original_presenter, metric['presenter'])) + else: + metric['presenter'] = 'return_value' + if logger: + logger.warn("presenter was set to '{}'".format(metric['presenter'])) + + return updated_configuration + diff --git a/tools/utils/connection.py b/tools/utils/connection.py new file mode 100644 index 0000000..cb5ce73 --- /dev/null +++ b/tools/utils/connection.py @@ -0,0 +1,34 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + + +class Connection: + def __init__(self, edge, port, layer): + self._edge = edge + self._port = port + self._layer = layer + + @property + def edge(self): + return self._edge + + @property + def port(self): + return self._port + + @property + def layer(self): + return self._layer diff --git a/tools/utils/edge.py b/tools/utils/edge.py new file mode 100644 index 0000000..5c8d3c7 --- /dev/null +++ b/tools/utils/edge.py @@ -0,0 +1,39 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + + +class Edge: + def __init__(self, data: dict): + self._from_layer = int(data['from-layer']) + self._from_port = int(data['from-port']) + self._to_layer = int(data['to-layer']) + self._to_port = int(data['to-port']) + + @property + def from_layer(self) -> int: + return self._from_layer + + @property + def from_port(self) -> int: + return self._from_port + + @property + def to_layer(self) -> int: + return self._to_layer + + @property + def to_port(self) -> int: + return self._to_port diff --git a/tools/utils/layer.py b/tools/utils/layer.py new file mode 100644 index 0000000..707bb07 --- /dev/null +++ b/tools/utils/layer.py @@ -0,0 +1,99 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import collections + +from .biases import Biases +from .weights import Weights +from .port import Port + + +class Layer: + def __init__(self, data: dict): + self._id = int(data['id']) + self._name = data['name'] + self._precision = data['precision'] + self._type = data['type'] + + self._input_ports = Layer.__init_ports(data, 'input') + self._output_ports = Layer.__init_ports(data, 'output') + + self._inputs = list() + self._outputs = list() + + blobs = data['blobs'] if 'blobs' in data else data + self._weights = Weights(int(blobs['weights']['offset']), int(blobs['weights']['size'])) if 'weights' in blobs else Weights(0, 0) + self._biases = Biases(int(blobs['biases']['offset']), int(blobs['biases']['size'])) if 'biases' in blobs else Biases(0, 0) + + @staticmethod + def __init_ports(data: dict, key: str) -> dict: + result_ports = dict() + if (key in data) and ('port' in data[key]): + ports = data[key]['port'] + if type(ports) is list: + for port_dict in ports: + id = int(port_dict['id']) + result_ports[id] = Port(id, list(map(int, port_dict['dim']))) + elif type(ports) is collections.OrderedDict: + id = int(ports['id']) + result_ports[id] = Port(id, list(map(int, ports['dim']))) + else: + raise ValueError("unexpected ports type '{}'".format(type(ports))) + return result_ports + + def init(self, inputs: list, outputs: list): + self._inputs = inputs + self._outputs = outputs + + @property + def id(self) -> int: + return self._id + + @property + def name(self) -> str: + return self._name + + @property + def precision(self) -> str: + return self._precision + + @property + def type(self) -> str: + return self._type + + @property + def input_ports(self): + return self._input_ports + + @property + def output_ports(self): + return self._output_ports + + @property + def inputs(self) -> list: + return self._inputs + + @property + def outputs(self) -> list: + return self._outputs + + @property + def weights(self): + return self._weights + + @property + def biases(self): + return self._biases diff --git a/tools/utils/network_info.py b/tools/utils/network_info.py new file mode 100644 index 0000000..d318e46 --- /dev/null +++ b/tools/utils/network_info.py @@ -0,0 +1,123 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import xmltodict +from typing import List + +from .layer import Layer +from .edge import Edge +from .connection import Connection + + +# TODO: custom implementation: +# 1. get in/out layers +# 2. 
add_layer
+class NetworkInfo:
+    def __init__(self, model_path: str):
+
+        model_content = None
+        with open(model_path, 'r') as model_file:
+            model_content = model_file.read()
+
+        model_xml = xmltodict.parse(model_content, attr_prefix='')
+        if 'net' not in model_xml:
+            raise ValueError("IR file '{}' format is not correct".format(model_path))
+
+        self._model = model_xml['net']
+
+        # TODO: move to private method
+        ordered_edges = self._model['edges']['edge']
+        self._edges_by_from_layer = dict()
+        self._edges_by_to_layer = dict()
+        for ordered_edge in ordered_edges:
+            from_layer = int(ordered_edge['from-layer'])
+            to_layer = int(ordered_edge['to-layer'])
+
+            edge = Edge(ordered_edge)
+
+            if from_layer not in self._edges_by_from_layer:
+                self._edges_by_from_layer[from_layer] = list()
+            self._edges_by_from_layer[from_layer].append(edge)
+
+            if to_layer not in self._edges_by_to_layer:
+                self._edges_by_to_layer[to_layer] = list()
+            self._edges_by_to_layer[to_layer].append(edge)
+
+        # TODO: move to private method
+        ordered_layers = self._model['layers']['layer']
+        self._layer_by_id = dict()
+        self._layer_by_name = dict()
+        for ordered_layer in ordered_layers:
+            layer = Layer(ordered_layer)
+            self._layer_by_id[int(ordered_layer['id'])] = layer
+            self._layer_by_name[layer.name] = layer
+
+        # TODO: move to private method
+        for layer_id, layer in self._layer_by_id.items():
+            input_edges = self._edges_by_to_layer[layer_id] if layer_id in self._edges_by_to_layer else list()
+            inputs = list()
+            for edge in input_edges:
+                if edge.from_layer not in self._layer_by_id:
+                    raise ValueError("layer with id {} was not found".format(edge.from_layer))
+
+                from_layer = self._layer_by_id[edge.from_layer]
+                inputs.append(Connection(edge=edge, port=layer.input_ports[edge.to_port], layer=from_layer))
+
+            output_edges = self._edges_by_from_layer[layer_id] if layer_id in self._edges_by_from_layer else list()
+            outputs = list()
+            for edge in output_edges:
+                if edge.to_layer not in self._layer_by_id:
+                    raise ValueError("layer with id {} was not found".format(edge.to_layer))
+
+                to_layer = self._layer_by_id[edge.to_layer]
+                outputs.append(Connection(edge=edge, port=layer.output_ports[edge.from_port], layer=to_layer))
+
+            layer.init(inputs, outputs)
+
+    def get_layer_names(self, layer_types: List[str]) -> List[str]:
+        skipped = []
+        if layer_types:
+            for layer in self._layer_by_name.values():
+                if layer.type in layer_types:
+                    skipped.append(layer.name)
+        return skipped
+
+    @property
+    def layers(self) -> dict:
+        return self._layer_by_id
+
+    def get_layer(self, layer_name: str) -> Layer:
+        return self._layer_by_name[layer_name]
+
+    def explore_inputs(self, layer: Layer, expected_input_types: List[str]) -> bool:
+        for layer_input in layer.inputs:
+            if layer_input.layer.type not in expected_input_types:
+                return False
+            if not self.explore_inputs(layer_input.layer, expected_input_types):
+                return False
+        return True
+
+    @property
+    def inputs(self):
+        inputs = dict()
+        for id, layer in self.layers.items():
+            if layer.type == 'Input':
+                inputs[id] = layer
+        return inputs
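A small sketch of the path helpers introduced next (paths hypothetical):

    from tools.utils.path import Path

    Path.get_weights('/models/squeezenet.xml')         # -> '/models/squeezenet.bin'
    Path.update_name('/models/squeezenet.xml', '_i8')  # -> '/models/squeezenet_i8.xml'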
diff --git a/tools/utils/path.py b/tools/utils/path.py
new file mode 100644
index 0000000..ecc5e02
--- /dev/null
+++ b/tools/utils/path.py
@@ -0,0 +1,67 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import ntpath
+
+
+class Path:
+    @staticmethod
+    def get_model(model_file_path: str, addition: str = None, directory: str = None) -> str:
+        # build the .xml path next to the model (or in the given directory)
+        if model_file_path is None:
+            raise ValueError("model_file_path is None")
+
+        file_name = ntpath.basename(model_file_path)
+        model = os.path.splitext(file_name)
+        if len(model) < 2:
+            raise ValueError("model file name '{}' is not correct".format(file_name))
+        if directory:
+            return os.path.join(
+                directory,
+                model[0] + (addition if addition else "") + ".xml")
+        else:
+            return os.path.join(
+                os.path.dirname(model_file_path),
+                model[0] + (addition if addition else "") + ".xml")
+
+    @staticmethod
+    def get_weights(model_file_path: str, addition: str = None, directory: str = None) -> str:
+        # build the .bin path next to the model (or in the given directory)
+        if model_file_path is None:
+            raise ValueError("model_file_path is None")
+
+        file_name = ntpath.basename(model_file_path)
+        model = os.path.splitext(file_name)
+        if len(model) < 2:
+            raise ValueError("model file name '{}' is not correct".format(file_name))
+        if directory:
+            return os.path.join(
+                directory,
+                model[0] + (addition if addition else "") + ".bin")
+        else:
+            return os.path.join(
+                os.path.dirname(model_file_path),
+                model[0] + (addition if addition else "") + ".bin")
+
+    @staticmethod
+    def update_name(file_path: str, addition: str) -> str:
+        file_name = ntpath.basename(file_path)
+        parts = os.path.splitext(file_name)
+
+        name = parts[0]
+        extension = parts[-1] if len(parts) >= 2 else ""
+
+        directory = os.path.dirname(file_path)
+        return os.path.join(directory, name + addition + extension)
diff --git a/tools/utils/port.py b/tools/utils/port.py
new file mode 100644
index 0000000..348cae3
--- /dev/null
+++ b/tools/utils/port.py
@@ -0,0 +1,29 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+class Port:
+    def __init__(self, id: int, dim: list):
+        self._id = id
+        self._dim = dim
+
+    @property
+    def id(self):
+        return self._id
+
+    @property
+    def dim(self):
+        return self._dim
diff --git a/tools/utils/tensor_desc.py b/tools/utils/tensor_desc.py
new file mode 100644
index 0000000..67f1cd7
--- /dev/null
+++ b/tools/utils/tensor_desc.py
@@ -0,0 +1,19 @@
+"""
+Copyright (C) 2018-2019 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +class TensorDesc: + def __init__(self, dims: list): + pass diff --git a/tools/utils/weights.py b/tools/utils/weights.py new file mode 100644 index 0000000..30d890a --- /dev/null +++ b/tools/utils/weights.py @@ -0,0 +1,29 @@ +""" +Copyright (C) 2018-2019 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + + +class Weights: + def __init__(self, offset: int, size: int): + self._offset = offset + self._size = size + + @property + def offset(self) -> int: + return self._offset + + @property + def size(self) -> int: + return self._size -- 2.7.4